From 849369d6c66d3054688672f97d31fceb8e8230fb Mon Sep 17 00:00:00 2001
From: root
Date: Fri, 25 Dec 2015 04:40:36 +0000
Subject: initial_commit

---
 net/802/Kconfig | 7 +
 net/802/Makefile | 14 +
 net/802/fc.c | 131 +
 net/802/fddi.c | 215 +
 net/802/garp.c | 635 ++
 net/802/hippi.c | 235 +
 net/802/p8022.c | 67 +
 net/802/p8023.c | 64 +
 net/802/psnap.c | 167 +
 net/802/stp.c | 103 +
 net/802/tr.c | 677 ++
 net/8021q/Kconfig | 29 +
 net/8021q/Makefile | 10 +
 net/8021q/vlan.c | 722 ++
 net/8021q/vlan.h | 134 +
 net/8021q/vlan_core.c | 181 +
 net/8021q/vlan_dev.c | 705 ++
 net/8021q/vlan_gvrp.c | 66 +
 net/8021q/vlan_netlink.c | 240 +
 net/8021q/vlanproc.c | 327 +
 net/8021q/vlanproc.h | 20 +
 net/9p/Kconfig | 36 +
 net/9p/Makefile | 18 +
 net/9p/client.c | 1958 ++++
 net/9p/error.c | 247 +
 net/9p/mod.c | 174 +
 net/9p/protocol.c | 682 ++
 net/9p/protocol.h | 34 +
 net/9p/trans_common.c | 92 +
 net/9p/trans_common.h | 32 +
 net/9p/trans_fd.c | 1087 +++
 net/9p/trans_rdma.c | 719 ++
 net/9p/trans_virtio.c | 629 ++
 net/9p/util.c | 146 +
 net/Kconfig | 341 +
 net/Makefile | 71 +
 net/TUNABLE | 50 +
 net/activity_stats.c | 115 +
 net/appletalk/Makefile | 9 +
 net/appletalk/aarp.c | 1063 +++
 net/appletalk/atalk_proc.c | 301 +
 net/appletalk/ddp.c | 1977 ++++++
 net/appletalk/dev.c | 44 +
 net/appletalk/sysctl_net_atalk.c | 61 +
 net/atm/Kconfig | 73 +
 net/atm/Makefile | 15 +
 net/atm/addr.c | 161 +
 net/atm/addr.h | 20 +
 net/atm/atm_misc.c | 101 +
 net/atm/atm_sysfs.c | 200 +
 net/atm/br2684.c | 847 +++
 net/atm/clip.c | 1009 +++
 net/atm/common.c | 870 +++
 net/atm/common.h | 55 +
 net/atm/ioctl.c | 371 ++
 net/atm/lec.c | 2409 +++++++
 net/atm/lec.h | 155 +
 net/atm/lec_arpc.h | 96 +
 net/atm/mpc.c | 1537 +++++
 net/atm/mpc.h | 64 +
 net/atm/mpoa_caches.c | 569 ++
 net/atm/mpoa_caches.h | 96 +
 net/atm/mpoa_proc.c | 312 +
 net/atm/pppoatm.c | 362 +
 net/atm/proc.c | 497 ++
 net/atm/protocols.h | 13 +
 net/atm/pvc.c | 160 +
 net/atm/raw.c | 85 +
 net/atm/resources.c | 463 ++
 net/atm/resources.h | 47 +
 net/atm/signaling.c | 269 +
 net/atm/signaling.h | 30 +
 net/atm/svc.c | 691 ++
 net/ax25/Kconfig | 121 +
 net/ax25/Makefile | 11 +
 net/ax25/TODO | 20 +
 net/ax25/af_ax25.c | 2022 ++++++
 net/ax25/ax25_addr.c | 306 +
 net/ax25/ax25_dev.c | 205 +
 net/ax25/ax25_ds_in.c | 303 +
 net/ax25/ax25_ds_subr.c | 211 +
 net/ax25/ax25_ds_timer.c | 238 +
 net/ax25/ax25_iface.c | 219 +
 net/ax25/ax25_in.c | 456 ++
 net/ax25/ax25_ip.c | 243 +
 net/ax25/ax25_out.c | 399 ++
 net/ax25/ax25_route.c | 505 ++
 net/ax25/ax25_std_in.c | 447 ++
 net/ax25/ax25_std_subr.c | 87 +
 net/ax25/ax25_std_timer.c | 176 +
 net/ax25/ax25_subr.c | 290 +
 net/ax25/ax25_timer.c | 227 +
 net/ax25/ax25_uid.c | 218 +
 net/ax25/sysctl_net_ax25.c | 210 +
 net/batman-adv/Kconfig | 25 +
 net/batman-adv/Makefile | 39 +
 net/batman-adv/aggregation.c | 280 +
 net/batman-adv/aggregation.h | 43 +
 net/batman-adv/bat_debugfs.c | 358 +
 net/batman-adv/bat_debugfs.h | 33 +
 net/batman-adv/bat_sysfs.c | 596 ++
 net/batman-adv/bat_sysfs.h | 42 +
 net/batman-adv/bitarray.c | 201 +
 net/batman-adv/bitarray.h | 44 +
 net/batman-adv/gateway_client.c | 561 ++
 net/batman-adv/gateway_client.h | 36 +
 net/batman-adv/gateway_common.c | 177 +
 net/batman-adv/gateway_common.h | 38 +
 net/batman-adv/hard-interface.c | 682 ++
 net/batman-adv/hard-interface.h | 66 +
 net/batman-adv/hash.c | 70 +
 net/batman-adv/hash.h | 149 +
 net/batman-adv/icmp_socket.c | 356 +
 net/batman-adv/icmp_socket.h | 32 +
 net/batman-adv/main.c | 189 +
 net/batman-adv/main.h | 182 +
 net/batman-adv/originator.c | 644 ++
 net/batman-adv/originator.h | 103 +
 net/batman-adv/packet.h | 136 +
net/batman-adv/ring_buffer.c | 52 + net/batman-adv/ring_buffer.h | 28 + net/batman-adv/routing.c | 1535 +++++ net/batman-adv/routing.h | 46 + net/batman-adv/send.c | 611 ++ net/batman-adv/send.h | 39 + net/batman-adv/soft-interface.c | 927 +++ net/batman-adv/soft-interface.h | 36 + net/batman-adv/translation-table.c | 638 ++ net/batman-adv/translation-table.h | 43 + net/batman-adv/types.h | 291 + net/batman-adv/unicast.c | 355 + net/batman-adv/unicast.h | 58 + net/batman-adv/vis.c | 993 +++ net/batman-adv/vis.h | 37 + net/bluetooth/Kconfig | 70 + net/bluetooth/Makefile | 13 + net/bluetooth/af_bluetooth.c | 633 ++ net/bluetooth/bnep/Kconfig | 24 + net/bluetooth/bnep/Makefile | 7 + net/bluetooth/bnep/bnep.h | 180 + net/bluetooth/bnep/core.c | 751 +++ net/bluetooth/bnep/netdev.c | 238 + net/bluetooth/bnep/sock.c | 259 + net/bluetooth/cmtp/Kconfig | 11 + net/bluetooth/cmtp/Makefile | 7 + net/bluetooth/cmtp/capi.c | 623 ++ net/bluetooth/cmtp/cmtp.h | 128 + net/bluetooth/cmtp/core.c | 497 ++ net/bluetooth/cmtp/sock.c | 253 + net/bluetooth/hci_conn.c | 991 +++ net/bluetooth/hci_core.c | 2427 +++++++ net/bluetooth/hci_event.c | 3118 +++++++++ net/bluetooth/hci_sock.c | 817 +++ net/bluetooth/hci_sysfs.c | 607 ++ net/bluetooth/hidp/Kconfig | 12 + net/bluetooth/hidp/Makefile | 7 + net/bluetooth/hidp/core.c | 1220 ++++ net/bluetooth/hidp/hidp.h | 191 + net/bluetooth/hidp/sock.c | 305 + net/bluetooth/l2cap_core.c | 4378 ++++++++++++ net/bluetooth/l2cap_sock.c | 1094 +++ net/bluetooth/lib.c | 171 + net/bluetooth/mgmt.c | 2288 +++++++ net/bluetooth/rfcomm/Kconfig | 17 + net/bluetooth/rfcomm/Makefile | 8 + net/bluetooth/rfcomm/core.c | 2227 +++++++ net/bluetooth/rfcomm/sock.c | 1072 +++ net/bluetooth/rfcomm/tty.c | 1190 ++++ net/bluetooth/sco.c | 1104 +++ net/bluetooth/smp.c | 702 ++ net/bridge/Kconfig | 48 + net/bridge/Makefile | 17 + net/bridge/br.c | 107 + net/bridge/br_device.c | 371 ++ net/bridge/br_fdb.c | 696 ++ net/bridge/br_forward.c | 271 + net/bridge/br_if.c | 454 ++ net/bridge/br_input.c | 206 + net/bridge/br_ioctl.c | 395 ++ net/bridge/br_multicast.c | 1777 +++++ net/bridge/br_netfilter.c | 1037 +++ net/bridge/br_netlink.c | 252 + net/bridge/br_notify.c | 108 + net/bridge/br_private.h | 554 ++ net/bridge/br_private_stp.h | 69 + net/bridge/br_stp.c | 534 ++ net/bridge/br_stp_bpdu.c | 223 + net/bridge/br_stp_if.c | 301 + net/bridge/br_stp_timer.c | 173 + net/bridge/br_sysfs_br.c | 784 +++ net/bridge/br_sysfs_if.c | 283 + net/bridge/netfilter/Kconfig | 221 + net/bridge/netfilter/Makefile | 34 + net/bridge/netfilter/ebt_802_3.c | 72 + net/bridge/netfilter/ebt_among.c | 229 + net/bridge/netfilter/ebt_arp.c | 140 + net/bridge/netfilter/ebt_arpreply.c | 98 + net/bridge/netfilter/ebt_dnat.c | 74 + net/bridge/netfilter/ebt_ip.c | 130 + net/bridge/netfilter/ebt_ip6.c | 155 + net/bridge/netfilter/ebt_limit.c | 127 + net/bridge/netfilter/ebt_log.c | 228 + net/bridge/netfilter/ebt_mark.c | 110 + net/bridge/netfilter/ebt_mark_m.c | 98 + net/bridge/netfilter/ebt_nflog.c | 72 + net/bridge/netfilter/ebt_pkttype.c | 56 + net/bridge/netfilter/ebt_redirect.c | 80 + net/bridge/netfilter/ebt_snat.c | 87 + net/bridge/netfilter/ebt_stp.c | 197 + net/bridge/netfilter/ebt_ulog.c | 342 + net/bridge/netfilter/ebt_vlan.c | 181 + net/bridge/netfilter/ebtable_broute.c | 104 + net/bridge/netfilter/ebtable_filter.c | 139 + net/bridge/netfilter/ebtable_nat.c | 139 + net/bridge/netfilter/ebtables.c | 2416 +++++++ net/caif/Kconfig | 42 + net/caif/Makefile | 14 + net/caif/caif_dev.c | 433 ++ net/caif/caif_socket.c | 1220 ++++ 
net/caif/cfcnfg.c | 631 ++ net/caif/cfctrl.c | 648 ++ net/caif/cfdbgl.c | 56 + net/caif/cfdgml.c | 112 + net/caif/cffrml.c | 199 + net/caif/cfmuxl.c | 273 + net/caif/cfpkt_skbuff.c | 400 ++ net/caif/cfrfml.c | 308 + net/caif/cfserl.c | 192 + net/caif/cfsrvl.c | 225 + net/caif/cfutill.c | 104 + net/caif/cfveil.c | 104 + net/caif/cfvidl.c | 64 + net/caif/chnl_net.c | 545 ++ net/can/Kconfig | 44 + net/can/Makefile | 12 + net/can/af_can.c | 888 +++ net/can/af_can.h | 120 + net/can/bcm.c | 1632 +++++ net/can/proc.c | 540 ++ net/can/raw.c | 803 +++ net/ceph/Kconfig | 29 + net/ceph/Makefile | 15 + net/ceph/armor.c | 105 + net/ceph/auth.c | 259 + net/ceph/auth_none.c | 132 + net/ceph/auth_none.h | 29 + net/ceph/auth_x.c | 690 ++ net/ceph/auth_x.h | 50 + net/ceph/auth_x_protocol.h | 90 + net/ceph/buffer.c | 68 + net/ceph/ceph_common.c | 622 ++ net/ceph/ceph_fs.c | 78 + net/ceph/ceph_hash.c | 121 + net/ceph/ceph_strings.c | 84 + net/ceph/crush/crush.c | 151 + net/ceph/crush/hash.c | 149 + net/ceph/crush/mapper.c | 609 ++ net/ceph/crypto.c | 485 ++ net/ceph/crypto.h | 52 + net/ceph/debugfs.c | 267 + net/ceph/messenger.c | 2489 +++++++ net/ceph/mon_client.c | 1027 +++ net/ceph/msgpool.c | 64 + net/ceph/osd_client.c | 2162 ++++++ net/ceph/osdmap.c | 1135 ++++ net/ceph/pagelist.c | 154 + net/ceph/pagevec.c | 233 + net/compat.c | 867 +++ net/core/Makefile | 21 + net/core/datagram.c | 775 +++ net/core/dev.c | 6522 ++++++++++++++++++ net/core/dev_addr_lists.c | 733 ++ net/core/drop_monitor.c | 385 ++ net/core/dst.c | 426 ++ net/core/ethtool.c | 2166 ++++++ net/core/fib_rules.c | 766 +++ net/core/filter.c | 654 ++ net/core/flow.c | 437 ++ net/core/gen_estimator.c | 322 + net/core/gen_stats.c | 250 + net/core/iovec.c | 264 + net/core/kmap_skb.h | 19 + net/core/link_watch.c | 246 + net/core/neighbour.c | 2934 ++++++++ net/core/net-sysfs.c | 1339 ++++ net/core/net-sysfs.h | 11 + net/core/net-traces.c | 34 + net/core/net_namespace.c | 638 ++ net/core/netevent.c | 69 + net/core/netpoll.c | 954 +++ net/core/pktgen.c | 3796 +++++++++++ net/core/request_sock.c | 131 + net/core/rtnetlink.c | 2030 ++++++ net/core/scm.c | 344 + net/core/secure_seq.c | 184 + net/core/skbuff.c | 3084 +++++++++ net/core/sock.c | 2579 ++++++++ net/core/stream.c | 208 + net/core/sysctl_net_core.c | 259 + net/core/timestamping.c | 136 + net/core/user_dma.c | 130 + net/core/utils.c | 323 + net/dcb/Kconfig | 22 + net/dcb/Makefile | 1 + net/dcb/dcbevent.c | 40 + net/dcb/dcbnl.c | 1835 +++++ net/dccp/Kconfig | 62 + net/dccp/Makefile | 28 + net/dccp/ackvec.c | 408 ++ net/dccp/ackvec.h | 136 + net/dccp/ccid.c | 222 + net/dccp/ccid.h | 265 + net/dccp/ccids/Kconfig | 55 + net/dccp/ccids/ccid2.c | 671 ++ net/dccp/ccids/ccid2.h | 104 + net/dccp/ccids/ccid3.c | 866 +++ net/dccp/ccids/ccid3.h | 160 + net/dccp/ccids/lib/loss_interval.c | 184 + net/dccp/ccids/lib/loss_interval.h | 73 + net/dccp/ccids/lib/packet_history.c | 448 ++ net/dccp/ccids/lib/packet_history.h | 158 + net/dccp/ccids/lib/tfrc.c | 44 + net/dccp/ccids/lib/tfrc.h | 77 + net/dccp/ccids/lib/tfrc_equation.c | 703 ++ net/dccp/dccp.h | 506 ++ net/dccp/diag.c | 74 + net/dccp/feat.c | 1359 ++++ net/dccp/feat.h | 136 + net/dccp/input.c | 732 ++ net/dccp/ipv4.c | 1080 +++ net/dccp/ipv6.c | 1211 ++++ net/dccp/ipv6.h | 36 + net/dccp/minisocks.c | 278 + net/dccp/options.c | 640 ++ net/dccp/output.c | 697 ++ net/dccp/probe.c | 198 + net/dccp/proto.c | 1257 ++++ net/dccp/qpolicy.c | 137 + net/dccp/sysctl.c | 124 + net/dccp/timer.c | 292 + net/decnet/Kconfig | 43 + net/decnet/Makefile | 10 + 
net/decnet/README | 8 + net/decnet/TODO | 41 + net/decnet/af_decnet.c | 2423 +++++++ net/decnet/dn_dev.c | 1445 ++++ net/decnet/dn_fib.c | 770 +++ net/decnet/dn_neigh.c | 612 ++ net/decnet/dn_nsp_in.c | 912 +++ net/decnet/dn_nsp_out.c | 720 ++ net/decnet/dn_route.c | 1861 ++++++ net/decnet/dn_rules.c | 254 + net/decnet/dn_table.c | 908 +++ net/decnet/dn_timer.c | 109 + net/decnet/netfilter/Kconfig | 16 + net/decnet/netfilter/Makefile | 6 + net/decnet/netfilter/dn_rtmsg.c | 161 + net/decnet/sysctl_net_decnet.c | 377 ++ net/dns_resolver/Kconfig | 27 + net/dns_resolver/Makefile | 7 + net/dns_resolver/dns_key.c | 309 + net/dns_resolver/dns_query.c | 165 + net/dns_resolver/internal.h | 44 + net/dsa/Kconfig | 60 + net/dsa/Makefile | 13 + net/dsa/dsa.c | 434 ++ net/dsa/dsa_priv.h | 181 + net/dsa/mv88e6060.c | 288 + net/dsa/mv88e6123_61_65.c | 447 ++ net/dsa/mv88e6131.c | 439 ++ net/dsa/mv88e6xxx.c | 522 ++ net/dsa/mv88e6xxx.h | 95 + net/dsa/slave.c | 407 ++ net/dsa/tag_dsa.c | 205 + net/dsa/tag_edsa.c | 224 + net/dsa/tag_trailer.c | 133 + net/econet/Kconfig | 36 + net/econet/Makefile | 7 + net/econet/af_econet.c | 1174 ++++ net/ethernet/Makefile | 7 + net/ethernet/eth.c | 395 ++ net/ethernet/pe2.c | 37 + net/ieee802154/Kconfig | 12 + net/ieee802154/Makefile | 3 + net/ieee802154/af802154.h | 36 + net/ieee802154/af_ieee802154.c | 373 ++ net/ieee802154/dgram.c | 461 ++ net/ieee802154/ieee802154.h | 53 + net/ieee802154/netlink.c | 139 + net/ieee802154/nl-mac.c | 618 ++ net/ieee802154/nl-phy.c | 346 + net/ieee802154/nl_policy.c | 56 + net/ieee802154/raw.c | 267 + net/ieee802154/wpan-class.c | 226 + net/ipv4/Kconfig | 626 ++ net/ipv4/Makefile | 54 + net/ipv4/af_inet.c | 1823 +++++ net/ipv4/ah4.c | 535 ++ net/ipv4/arp.c | 1490 +++++ net/ipv4/cipso_ipv4.c | 2368 +++++++ net/ipv4/datagram.c | 87 + net/ipv4/devinet.c | 1851 ++++++ net/ipv4/esp4.c | 725 ++ net/ipv4/fib_frontend.c | 1136 ++++ net/ipv4/fib_lookup.h | 57 + net/ipv4/fib_rules.c | 307 + net/ipv4/fib_semantics.c | 1248 ++++ net/ipv4/fib_trie.c | 2636 ++++++++ net/ipv4/gre.c | 143 + net/ipv4/icmp.c | 1207 ++++ net/ipv4/igmp.c | 2661 ++++++++ net/ipv4/inet_connection_sock.c | 775 +++ net/ipv4/inet_diag.c | 951 +++ net/ipv4/inet_fragment.c | 286 + net/ipv4/inet_hashtables.c | 584 ++ net/ipv4/inet_lro.c | 600 ++ net/ipv4/inet_timewait_sock.c | 523 ++ net/ipv4/inetpeer.c | 637 ++ net/ipv4/ip_forward.c | 132 + net/ipv4/ip_fragment.c | 835 +++ net/ipv4/ip_gre.c | 1754 +++++ net/ipv4/ip_input.c | 452 ++ net/ipv4/ip_options.c | 649 ++ net/ipv4/ip_output.c | 1543 +++++ net/ipv4/ip_sockglue.c | 1352 ++++ net/ipv4/ipcomp.c | 185 + net/ipv4/ipconfig.c | 1636 +++++ net/ipv4/ipip.c | 912 +++ net/ipv4/ipmr.c | 2559 +++++++ net/ipv4/netfilter.c | 247 + net/ipv4/netfilter/Kconfig | 395 ++ net/ipv4/netfilter/Makefile | 72 + net/ipv4/netfilter/arp_tables.c | 1914 ++++++ net/ipv4/netfilter/arpt_mangle.c | 91 + net/ipv4/netfilter/arptable_filter.c | 93 + net/ipv4/netfilter/ip_queue.c | 639 ++ net/ipv4/netfilter/ip_tables.c | 2271 +++++++ net/ipv4/netfilter/ipt_CLUSTERIP.c | 746 +++ net/ipv4/netfilter/ipt_LOG.c | 516 ++ net/ipv4/netfilter/ipt_MASQUERADE.c | 173 + net/ipv4/netfilter/ipt_NETMAP.c | 98 + net/ipv4/netfilter/ipt_REDIRECT.c | 110 + net/ipv4/netfilter/ipt_REJECT.c | 220 + net/ipv4/netfilter/ipt_ULOG.c | 442 ++ net/ipv4/netfilter/ipt_ah.c | 91 + net/ipv4/netfilter/ipt_ecn.c | 127 + net/ipv4/netfilter/iptable_filter.c | 121 + net/ipv4/netfilter/iptable_mangle.c | 151 + net/ipv4/netfilter/iptable_raw.c | 96 + net/ipv4/netfilter/iptable_security.c | 113 + 
net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 459 ++ .../netfilter/nf_conntrack_l3proto_ipv4_compat.c | 462 ++ net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 318 + net/ipv4/netfilter/nf_defrag_ipv4.c | 128 + net/ipv4/netfilter/nf_nat_amanda.c | 85 + net/ipv4/netfilter/nf_nat_core.c | 779 +++ net/ipv4/netfilter/nf_nat_ftp.c | 137 + net/ipv4/netfilter/nf_nat_h323.c | 618 ++ net/ipv4/netfilter/nf_nat_helper.c | 451 ++ net/ipv4/netfilter/nf_nat_irc.c | 99 + net/ipv4/netfilter/nf_nat_pptp.c | 308 + net/ipv4/netfilter/nf_nat_proto_common.c | 125 + net/ipv4/netfilter/nf_nat_proto_dccp.c | 108 + net/ipv4/netfilter/nf_nat_proto_gre.c | 149 + net/ipv4/netfilter/nf_nat_proto_icmp.c | 84 + net/ipv4/netfilter/nf_nat_proto_sctp.c | 97 + net/ipv4/netfilter/nf_nat_proto_tcp.c | 92 + net/ipv4/netfilter/nf_nat_proto_udp.c | 83 + net/ipv4/netfilter/nf_nat_proto_udplite.c | 99 + net/ipv4/netfilter/nf_nat_proto_unknown.c | 53 + net/ipv4/netfilter/nf_nat_rule.c | 214 + net/ipv4/netfilter/nf_nat_sip.c | 561 ++ net/ipv4/netfilter/nf_nat_snmp_basic.c | 1334 ++++ net/ipv4/netfilter/nf_nat_standalone.c | 326 + net/ipv4/netfilter/nf_nat_tftp.c | 51 + net/ipv4/ping.c | 931 +++ net/ipv4/proc.c | 495 ++ net/ipv4/protocol.c | 61 + net/ipv4/raw.c | 1064 +++ net/ipv4/route.c | 3471 ++++++++++ net/ipv4/syncookies.c | 377 ++ net/ipv4/sysctl_net_ipv4.c | 840 +++ net/ipv4/sysfs_net_ipv4.c | 88 + net/ipv4/tcp.c | 3431 ++++++++++ net/ipv4/tcp_bic.c | 239 + net/ipv4/tcp_cong.c | 424 ++ net/ipv4/tcp_cubic.c | 492 ++ net/ipv4/tcp_diag.c | 57 + net/ipv4/tcp_highspeed.c | 187 + net/ipv4/tcp_htcp.c | 315 + net/ipv4/tcp_hybla.c | 192 + net/ipv4/tcp_illinois.c | 356 + net/ipv4/tcp_input.c | 5963 +++++++++++++++++ net/ipv4/tcp_ipv4.c | 2657 ++++++++ net/ipv4/tcp_lp.c | 344 + net/ipv4/tcp_minisocks.c | 786 +++ net/ipv4/tcp_output.c | 2853 ++++++++ net/ipv4/tcp_probe.c | 258 + net/ipv4/tcp_scalable.c | 62 + net/ipv4/tcp_timer.c | 602 ++ net/ipv4/tcp_vegas.c | 339 + net/ipv4/tcp_vegas.h | 24 + net/ipv4/tcp_veno.c | 234 + net/ipv4/tcp_westwood.c | 304 + net/ipv4/tcp_yeah.c | 260 + net/ipv4/tunnel4.c | 192 + net/ipv4/udp.c | 2285 +++++++ net/ipv4/udp_impl.h | 34 + net/ipv4/udplite.c | 131 + net/ipv4/xfrm4_input.c | 166 + net/ipv4/xfrm4_mode_beet.c | 156 + net/ipv4/xfrm4_mode_transport.c | 80 + net/ipv4/xfrm4_mode_tunnel.c | 121 + net/ipv4/xfrm4_output.c | 101 + net/ipv4/xfrm4_policy.c | 305 + net/ipv4/xfrm4_state.c | 98 + net/ipv4/xfrm4_tunnel.c | 113 + net/ipv6/Kconfig | 253 + net/ipv6/Makefile | 42 + net/ipv6/addrconf.c | 4783 +++++++++++++ net/ipv6/addrconf_core.c | 79 + net/ipv6/addrlabel.c | 599 ++ net/ipv6/af_inet6.c | 1349 ++++ net/ipv6/ah6.c | 757 +++ net/ipv6/anycast.c | 549 ++ net/ipv6/datagram.c | 868 +++ net/ipv6/esp6.c | 674 ++ net/ipv6/exthdrs.c | 898 +++ net/ipv6/exthdrs_core.c | 106 + net/ipv6/fib6_rules.c | 307 + net/ipv6/icmp.c | 982 +++ net/ipv6/inet6_connection_sock.c | 252 + net/ipv6/inet6_hashtables.c | 304 + net/ipv6/ip6_fib.c | 1606 +++++ net/ipv6/ip6_flowlabel.c | 782 +++ net/ipv6/ip6_input.c | 335 + net/ipv6/ip6_output.c | 1690 +++++ net/ipv6/ip6_tunnel.c | 1582 +++++ net/ipv6/ip6mr.c | 2278 +++++++ net/ipv6/ipcomp6.c | 218 + net/ipv6/ipv6_sockglue.c | 1290 ++++ net/ipv6/mcast.c | 2678 ++++++++ net/ipv6/mip6.c | 525 ++ net/ipv6/ndisc.c | 1896 ++++++ net/ipv6/netfilter.c | 188 + net/ipv6/netfilter/Kconfig | 224 + net/ipv6/netfilter/Makefile | 34 + net/ipv6/netfilter/ip6_queue.c | 640 ++ net/ipv6/netfilter/ip6_tables.c | 2375 +++++++ net/ipv6/netfilter/ip6t_LOG.c | 527 ++ net/ipv6/netfilter/ip6t_REJECT.c | 271 + 
net/ipv6/netfilter/ip6t_ah.c | 121 + net/ipv6/netfilter/ip6t_eui64.c | 74 + net/ipv6/netfilter/ip6t_frag.c | 136 + net/ipv6/netfilter/ip6t_hbh.c | 215 + net/ipv6/netfilter/ip6t_ipv6header.c | 154 + net/ipv6/netfilter/ip6t_mh.c | 94 + net/ipv6/netfilter/ip6t_rt.c | 225 + net/ipv6/netfilter/ip6table_filter.c | 113 + net/ipv6/netfilter/ip6table_mangle.c | 145 + net/ipv6/netfilter/ip6table_raw.c | 88 + net/ipv6/netfilter/ip6table_security.c | 105 + net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 398 ++ net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 308 + net/ipv6/netfilter/nf_conntrack_reasm.c | 651 ++ net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 137 + net/ipv6/proc.c | 342 + net/ipv6/protocol.c | 54 + net/ipv6/raw.c | 1370 ++++ net/ipv6/reassembly.c | 769 +++ net/ipv6/route.c | 2976 +++++++++ net/ipv6/sit.c | 1293 ++++ net/ipv6/syncookies.c | 268 + net/ipv6/sysctl_net_ipv6.c | 176 + net/ipv6/tcp_ipv6.c | 2321 +++++++ net/ipv6/tunnel6.c | 184 + net/ipv6/udp.c | 1517 +++++ net/ipv6/udp_impl.h | 37 + net/ipv6/udplite.c | 132 + net/ipv6/xfrm6_input.c | 146 + net/ipv6/xfrm6_mode_beet.c | 131 + net/ipv6/xfrm6_mode_ro.c | 84 + net/ipv6/xfrm6_mode_transport.c | 85 + net/ipv6/xfrm6_mode_tunnel.c | 117 + net/ipv6/xfrm6_output.c | 109 + net/ipv6/xfrm6_policy.c | 357 + net/ipv6/xfrm6_state.c | 196 + net/ipv6/xfrm6_tunnel.c | 402 ++ net/ipx/Kconfig | 60 + net/ipx/Makefile | 8 + net/ipx/af_ipx.c | 2092 ++++++ net/ipx/ipx_proc.c | 339 + net/ipx/ipx_route.c | 295 + net/ipx/sysctl_net_ipx.c | 46 + net/irda/Kconfig | 96 + net/irda/Makefile | 15 + net/irda/af_irda.c | 2746 ++++++++ net/irda/discovery.c | 422 ++ net/irda/ircomm/Kconfig | 12 + net/irda/ircomm/Makefile | 8 + net/irda/ircomm/ircomm_core.c | 592 ++ net/irda/ircomm/ircomm_event.c | 250 + net/irda/ircomm/ircomm_lmp.c | 370 ++ net/irda/ircomm/ircomm_param.c | 511 ++ net/irda/ircomm/ircomm_ttp.c | 368 + net/irda/ircomm/ircomm_tty.c | 1411 ++++ net/irda/ircomm/ircomm_tty_attach.c | 997 +++ net/irda/ircomm/ircomm_tty_ioctl.c | 427 ++ net/irda/irda_device.c | 324 + net/irda/iriap.c | 1104 +++ net/irda/iriap_event.c | 504 ++ net/irda/irias_object.c | 567 ++ net/irda/irlan/Kconfig | 14 + net/irda/irlan/Makefile | 7 + net/irda/irlan/irlan_client.c | 576 ++ net/irda/irlan/irlan_client_event.c | 533 ++ net/irda/irlan/irlan_common.c | 1214 ++++ net/irda/irlan/irlan_eth.c | 349 + net/irda/irlan/irlan_event.c | 60 + net/irda/irlan/irlan_filter.c | 243 + net/irda/irlan/irlan_provider.c | 417 ++ net/irda/irlan/irlan_provider_event.c | 241 + net/irda/irlap.c | 1237 ++++ net/irda/irlap_event.c | 2330 +++++++ net/irda/irlap_frame.c | 1434 ++++ net/irda/irlmp.c | 2025 ++++++ net/irda/irlmp_event.c | 909 +++ net/irda/irlmp_frame.c | 490 ++ net/irda/irmod.c | 211 + net/irda/irnet/Kconfig | 13 + net/irda/irnet/Makefile | 7 + net/irda/irnet/irnet.h | 528 ++ net/irda/irnet/irnet_irda.c | 1885 ++++++ net/irda/irnet/irnet_irda.h | 178 + net/irda/irnet/irnet_ppp.c | 1189 ++++ net/irda/irnet/irnet_ppp.h | 119 + net/irda/irnetlink.c | 159 + net/irda/irproc.c | 97 + net/irda/irqueue.c | 921 +++ net/irda/irsysctl.c | 273 + net/irda/irttp.c | 1915 ++++++ net/irda/parameters.c | 591 ++ net/irda/qos.c | 774 +++ net/irda/timer.c | 232 + net/irda/wrapper.c | 492 ++ net/iucv/Kconfig | 15 + net/iucv/Makefile | 6 + net/iucv/af_iucv.c | 1795 +++++ net/iucv/iucv.c | 2098 ++++++ net/key/Makefile | 5 + net/key/af_key.c | 3818 +++++++++++ net/l2tp/Kconfig | 107 + net/l2tp/Makefile | 12 + net/l2tp/l2tp_core.c | 1703 +++++ net/l2tp/l2tp_core.h | 271 + net/l2tp/l2tp_debugfs.c | 341 + 
net/l2tp/l2tp_eth.c | 335 + net/l2tp/l2tp_ip.c | 707 ++ net/l2tp/l2tp_netlink.c | 841 +++ net/l2tp/l2tp_ppp.c | 1840 +++++ net/lapb/Kconfig | 22 + net/lapb/Makefile | 7 + net/lapb/lapb_iface.c | 450 ++ net/lapb/lapb_in.c | 724 ++ net/lapb/lapb_out.c | 224 + net/lapb/lapb_subr.c | 313 + net/lapb/lapb_timer.c | 189 + net/llc/Kconfig | 10 + net/llc/Makefile | 25 + net/llc/af_llc.c | 1248 ++++ net/llc/llc_c_ac.c | 1444 ++++ net/llc/llc_c_ev.c | 748 +++ net/llc/llc_c_st.c | 4946 ++++++++++++++ net/llc/llc_conn.c | 1016 +++ net/llc/llc_core.c | 169 + net/llc/llc_if.c | 154 + net/llc/llc_input.c | 212 + net/llc/llc_output.c | 81 + net/llc/llc_pdu.c | 372 ++ net/llc/llc_proc.c | 276 + net/llc/llc_s_ac.c | 208 + net/llc/llc_s_ev.c | 115 + net/llc/llc_s_st.c | 183 + net/llc/llc_sap.c | 444 ++ net/llc/llc_station.c | 722 ++ net/llc/sysctl_net_llc.c | 103 + net/mac80211/Kconfig | 236 + net/mac80211/Makefile | 61 + net/mac80211/aes_ccm.c | 149 + net/mac80211/aes_ccm.h | 26 + net/mac80211/aes_cmac.c | 132 + net/mac80211/aes_cmac.h | 19 + net/mac80211/agg-rx.c | 328 + net/mac80211/agg-tx.c | 842 +++ net/mac80211/cfg.c | 2149 ++++++ net/mac80211/cfg.h | 9 + net/mac80211/chan.c | 130 + net/mac80211/debugfs.c | 526 ++ net/mac80211/debugfs.h | 15 + net/mac80211/debugfs_key.c | 342 + net/mac80211/debugfs_key.h | 33 + net/mac80211/debugfs_netdev.c | 564 ++ net/mac80211/debugfs_netdev.h | 22 + net/mac80211/debugfs_sta.c | 377 ++ net/mac80211/debugfs_sta.h | 14 + net/mac80211/driver-ops.h | 640 ++ net/mac80211/driver-trace.c | 9 + net/mac80211/driver-trace.h | 1348 ++++ net/mac80211/event.c | 27 + net/mac80211/ht.c | 318 + net/mac80211/ibss.c | 1016 +++ net/mac80211/ieee80211_i.h | 1408 ++++ net/mac80211/iface.c | 1417 ++++ net/mac80211/key.c | 535 ++ net/mac80211/key.h | 152 + net/mac80211/led.c | 308 + net/mac80211/led.h | 73 + net/mac80211/main.c | 1102 +++ net/mac80211/mesh.c | 650 ++ net/mac80211/mesh.h | 312 + net/mac80211/mesh_hwmp.c | 1016 +++ net/mac80211/mesh_pathtbl.c | 848 +++ net/mac80211/mesh_plink.c | 809 +++ net/mac80211/michael.c | 86 + net/mac80211/michael.h | 24 + net/mac80211/mlme.c | 2671 ++++++++ net/mac80211/offchannel.c | 299 + net/mac80211/pm.c | 130 + net/mac80211/rate.c | 415 ++ net/mac80211/rate.h | 164 + net/mac80211/rc80211_minstrel.c | 570 ++ net/mac80211/rc80211_minstrel.h | 97 + net/mac80211/rc80211_minstrel_debugfs.c | 143 + net/mac80211/rc80211_minstrel_ht.c | 881 +++ net/mac80211/rc80211_minstrel_ht.h | 130 + net/mac80211/rc80211_minstrel_ht_debugfs.c | 119 + net/mac80211/rc80211_pid.h | 278 + net/mac80211/rc80211_pid_algo.c | 477 ++ net/mac80211/rc80211_pid_debugfs.c | 227 + net/mac80211/rx.c | 2965 +++++++++ net/mac80211/scan.c | 977 +++ net/mac80211/spectmgmt.c | 86 + net/mac80211/sta_info.c | 1005 +++ net/mac80211/sta_info.h | 502 ++ net/mac80211/status.c | 456 ++ net/mac80211/tkip.c | 342 + net/mac80211/tkip.h | 33 + net/mac80211/tx.c | 2560 +++++++ net/mac80211/util.c | 1443 ++++ net/mac80211/wep.c | 342 + net/mac80211/wep.h | 35 + net/mac80211/wme.c | 160 + net/mac80211/wme.h | 30 + net/mac80211/work.c | 1276 ++++ net/mac80211/wpa.c | 630 ++ net/mac80211/wpa.h | 36 + net/netfilter/Kconfig | 1130 ++++ net/netfilter/Makefile | 117 + net/netfilter/core.c | 291 + net/netfilter/ipset/Kconfig | 122 + net/netfilter/ipset/Makefile | 24 + net/netfilter/ipset/ip_set_bitmap_ip.c | 586 ++ net/netfilter/ipset/ip_set_bitmap_ipmac.c | 655 ++ net/netfilter/ipset/ip_set_bitmap_port.c | 514 ++ net/netfilter/ipset/ip_set_core.c | 1708 +++++ net/netfilter/ipset/ip_set_getport.c | 155 + 
net/netfilter/ipset/ip_set_hash_ip.c | 464 ++ net/netfilter/ipset/ip_set_hash_ipport.c | 530 ++ net/netfilter/ipset/ip_set_hash_ipportip.c | 548 ++ net/netfilter/ipset/ip_set_hash_ipportnet.c | 616 ++ net/netfilter/ipset/ip_set_hash_net.c | 462 ++ net/netfilter/ipset/ip_set_hash_netport.c | 566 ++ net/netfilter/ipset/ip_set_list_set.c | 577 ++ net/netfilter/ipset/pfxlen.c | 291 + net/netfilter/ipvs/Kconfig | 267 + net/netfilter/ipvs/Makefile | 40 + net/netfilter/ipvs/ip_vs_app.c | 601 ++ net/netfilter/ipvs/ip_vs_conn.c | 1326 ++++ net/netfilter/ipvs/ip_vs_core.c | 2050 ++++++ net/netfilter/ipvs/ip_vs_ctl.c | 3778 +++++++++++ net/netfilter/ipvs/ip_vs_dh.c | 271 + net/netfilter/ipvs/ip_vs_est.c | 218 + net/netfilter/ipvs/ip_vs_ftp.c | 480 ++ net/netfilter/ipvs/ip_vs_lblc.c | 625 ++ net/netfilter/ipvs/ip_vs_lblcr.c | 821 +++ net/netfilter/ipvs/ip_vs_lc.c | 91 + net/netfilter/ipvs/ip_vs_nfct.c | 294 + net/netfilter/ipvs/ip_vs_nq.c | 140 + net/netfilter/ipvs/ip_vs_pe.c | 140 + net/netfilter/ipvs/ip_vs_pe_sip.c | 171 + net/netfilter/ipvs/ip_vs_proto.c | 395 ++ net/netfilter/ipvs/ip_vs_proto_ah_esp.c | 166 + net/netfilter/ipvs/ip_vs_proto_sctp.c | 1136 ++++ net/netfilter/ipvs/ip_vs_proto_tcp.c | 721 ++ net/netfilter/ipvs/ip_vs_proto_udp.c | 509 ++ net/netfilter/ipvs/ip_vs_rr.c | 113 + net/netfilter/ipvs/ip_vs_sched.c | 260 + net/netfilter/ipvs/ip_vs_sed.c | 141 + net/netfilter/ipvs/ip_vs_sh.c | 269 + net/netfilter/ipvs/ip_vs_sync.c | 1700 +++++ net/netfilter/ipvs/ip_vs_wlc.c | 113 + net/netfilter/ipvs/ip_vs_wrr.c | 232 + net/netfilter/ipvs/ip_vs_xmit.c | 1370 ++++ net/netfilter/nf_conntrack_acct.c | 136 + net/netfilter/nf_conntrack_amanda.c | 237 + net/netfilter/nf_conntrack_broadcast.c | 82 + net/netfilter/nf_conntrack_core.c | 1592 +++++ net/netfilter/nf_conntrack_ecache.c | 266 + net/netfilter/nf_conntrack_expect.c | 677 ++ net/netfilter/nf_conntrack_extend.c | 189 + net/netfilter/nf_conntrack_ftp.c | 589 ++ net/netfilter/nf_conntrack_h323_asn1.c | 888 +++ net/netfilter/nf_conntrack_h323_main.c | 1837 +++++ net/netfilter/nf_conntrack_h323_types.c | 1922 ++++++ net/netfilter/nf_conntrack_helper.c | 288 + net/netfilter/nf_conntrack_irc.c | 291 + net/netfilter/nf_conntrack_l3proto_generic.c | 74 + net/netfilter/nf_conntrack_netbios_ns.c | 71 + net/netfilter/nf_conntrack_netlink.c | 2226 +++++++ net/netfilter/nf_conntrack_pptp.c | 631 ++ net/netfilter/nf_conntrack_proto.c | 384 ++ net/netfilter/nf_conntrack_proto_dccp.c | 899 +++ net/netfilter/nf_conntrack_proto_generic.c | 105 + net/netfilter/nf_conntrack_proto_gre.c | 346 + net/netfilter/nf_conntrack_proto_sctp.c | 750 +++ net/netfilter/nf_conntrack_proto_tcp.c | 1497 +++++ net/netfilter/nf_conntrack_proto_udp.c | 231 + net/netfilter/nf_conntrack_proto_udplite.c | 240 + net/netfilter/nf_conntrack_sane.c | 238 + net/netfilter/nf_conntrack_sip.c | 1610 +++++ net/netfilter/nf_conntrack_snmp.c | 77 + net/netfilter/nf_conntrack_standalone.c | 590 ++ net/netfilter/nf_conntrack_tftp.c | 153 + net/netfilter/nf_conntrack_timestamp.c | 120 + net/netfilter/nf_internals.h | 38 + net/netfilter/nf_log.c | 318 + net/netfilter/nf_queue.c | 395 ++ net/netfilter/nf_sockopt.c | 169 + net/netfilter/nf_tproxy_core.c | 62 + net/netfilter/nfnetlink.c | 225 + net/netfilter/nfnetlink_log.c | 1015 +++ net/netfilter/nfnetlink_queue.c | 950 +++ net/netfilter/x_tables.c | 1395 ++++ net/netfilter/xt_AUDIT.c | 222 + net/netfilter/xt_CHECKSUM.c | 70 + net/netfilter/xt_CLASSIFY.c | 73 + net/netfilter/xt_CONNSECMARK.c | 143 + net/netfilter/xt_CT.c | 171 + 
net/netfilter/xt_IDLETIMER.c | 408 ++ net/netfilter/xt_LED.c | 215 + net/netfilter/xt_NFLOG.c | 72 + net/netfilter/xt_NFQUEUE.c | 160 + net/netfilter/xt_NOTRACK.c | 53 + net/netfilter/xt_SECMARK.c | 147 + net/netfilter/xt_TCPOPTSTRIP.c | 142 + net/netfilter/xt_TEE.c | 310 + net/netfilter/xt_TPROXY.c | 432 ++ net/netfilter/xt_TRACE.c | 40 + net/netfilter/xt_addrtype.c | 243 + net/netfilter/xt_cluster.c | 178 + net/netfilter/xt_comment.c | 45 + net/netfilter/xt_connbytes.c | 155 + net/netfilter/xt_connlimit.c | 317 + net/netfilter/xt_connmark.c | 167 + net/netfilter/xt_conntrack.c | 332 + net/netfilter/xt_cpu.c | 65 + net/netfilter/xt_dccp.c | 188 + net/netfilter/xt_devgroup.c | 82 + net/netfilter/xt_dscp.c | 115 + net/netfilter/xt_esp.c | 107 + net/netfilter/xt_hashlimit.c | 847 +++ net/netfilter/xt_helper.c | 99 + net/netfilter/xt_hl.c | 96 + net/netfilter/xt_iprange.c | 140 + net/netfilter/xt_ipvs.c | 188 + net/netfilter/xt_length.c | 70 + net/netfilter/xt_limit.c | 210 + net/netfilter/xt_mac.c | 66 + net/netfilter/xt_mark.c | 84 + net/netfilter/xt_multiport.c | 165 + net/netfilter/xt_osf.c | 424 ++ net/netfilter/xt_owner.c | 82 + net/netfilter/xt_physdev.c | 128 + net/netfilter/xt_pkttype.c | 65 + net/netfilter/xt_policy.c | 188 + net/netfilter/xt_qtaguid.c | 2785 ++++++++ net/netfilter/xt_qtaguid_internal.h | 330 + net/netfilter/xt_qtaguid_print.c | 556 ++ net/netfilter/xt_qtaguid_print.h | 120 + net/netfilter/xt_quota.c | 89 + net/netfilter/xt_quota2.c | 381 ++ net/netfilter/xt_rateest.c | 158 + net/netfilter/xt_realm.c | 54 + net/netfilter/xt_recent.c | 668 ++ net/netfilter/xt_repldata.h | 35 + net/netfilter/xt_sctp.c | 198 + net/netfilter/xt_set.c | 373 ++ net/netfilter/xt_socket.c | 404 ++ net/netfilter/xt_state.c | 79 + net/netfilter/xt_statistic.c | 100 + net/netfilter/xt_string.c | 96 + net/netfilter/xt_tcpmss.c | 110 + net/netfilter/xt_tcpudp.c | 234 + net/netfilter/xt_time.c | 269 + net/netfilter/xt_u32.c | 123 + net/netlabel/Kconfig | 17 + net/netlabel/Makefile | 17 + net/netlabel/netlabel_addrlist.c | 384 ++ net/netlabel/netlabel_addrlist.h | 209 + net/netlabel/netlabel_cipso_v4.c | 787 +++ net/netlabel/netlabel_cipso_v4.h | 170 + net/netlabel/netlabel_domainhash.c | 729 ++ net/netlabel/netlabel_domainhash.h | 112 + net/netlabel/netlabel_kapi.c | 1090 +++ net/netlabel/netlabel_mgmt.c | 785 +++ net/netlabel/netlabel_mgmt.h | 223 + net/netlabel/netlabel_unlabeled.c | 1563 +++++ net/netlabel/netlabel_unlabeled.h | 246 + net/netlabel/netlabel_user.c | 124 + net/netlabel/netlabel_user.h | 66 + net/netlink/Makefile | 5 + net/netlink/af_netlink.c | 2165 ++++++ net/netlink/genetlink.c | 948 +++ net/netrom/Makefile | 9 + net/netrom/af_netrom.c | 1509 +++++ net/netrom/nr_dev.c | 213 + net/netrom/nr_in.c | 306 + net/netrom/nr_loopback.c | 77 + net/netrom/nr_out.c | 274 + net/netrom/nr_route.c | 1027 +++ net/netrom/nr_subr.c | 282 + net/netrom/nr_timer.c | 249 + net/netrom/sysctl_net_netrom.c | 163 + net/nonet.c | 26 + net/packet/Kconfig | 16 + net/packet/Makefile | 5 + net/packet/af_packet.c | 2808 ++++++++ net/phonet/Kconfig | 16 + net/phonet/Makefile | 11 + net/phonet/af_phonet.c | 548 ++ net/phonet/datagram.c | 214 + net/phonet/pep-gprs.c | 328 + net/phonet/pep.c | 1293 ++++ net/phonet/pn_dev.c | 448 ++ net/phonet/pn_netlink.c | 303 + net/phonet/socket.c | 825 +++ net/phonet/sysctl.c | 111 + net/rds/Kconfig | 28 + net/rds/Makefile | 19 + net/rds/af_rds.c | 599 ++ net/rds/bind.c | 204 + net/rds/cong.c | 405 ++ net/rds/connection.c | 573 ++ net/rds/ib.c | 433 ++ net/rds/ib.h | 373 ++ 
net/rds/ib_cm.c | 832 +++ net/rds/ib_rdma.c | 794 +++ net/rds/ib_recv.c | 1076 +++ net/rds/ib_ring.c | 168 + net/rds/ib_send.c | 1021 +++ net/rds/ib_stats.c | 97 + net/rds/ib_sysctl.c | 128 + net/rds/info.c | 242 + net/rds/info.h | 30 + net/rds/iw.c | 327 + net/rds/iw.h | 395 ++ net/rds/iw_cm.c | 766 +++ net/rds/iw_rdma.c | 878 +++ net/rds/iw_recv.c | 920 +++ net/rds/iw_ring.c | 169 + net/rds/iw_send.c | 974 +++ net/rds/iw_stats.c | 95 + net/rds/iw_sysctl.c | 131 + net/rds/loop.c | 194 + net/rds/loop.h | 9 + net/rds/message.c | 401 ++ net/rds/page.c | 212 + net/rds/rdma.c | 858 +++ net/rds/rdma_transport.c | 250 + net/rds/rdma_transport.h | 24 + net/rds/rds.h | 812 +++ net/rds/recv.c | 546 ++ net/rds/send.c | 1133 ++++ net/rds/stats.c | 150 + net/rds/sysctl.c | 117 + net/rds/tcp.c | 325 + net/rds/tcp.h | 88 + net/rds/tcp_connect.c | 152 + net/rds/tcp_listen.c | 200 + net/rds/tcp_recv.c | 359 + net/rds/tcp_send.c | 215 + net/rds/tcp_stats.c | 74 + net/rds/threads.c | 222 + net/rds/transport.c | 137 + net/rds/xlist.h | 80 + net/rfkill/Kconfig | 49 + net/rfkill/Makefile | 9 + net/rfkill/core.c | 1283 ++++ net/rfkill/input.c | 350 + net/rfkill/rfkill-gpio.c | 227 + net/rfkill/rfkill-regulator.c | 164 + net/rfkill/rfkill.h | 27 + net/rose/Makefile | 9 + net/rose/af_rose.c | 1643 +++++ net/rose/rose_dev.c | 172 + net/rose/rose_in.c | 295 + net/rose/rose_link.c | 299 + net/rose/rose_loopback.c | 124 + net/rose/rose_out.c | 126 + net/rose/rose_route.c | 1371 ++++ net/rose/rose_subr.c | 557 ++ net/rose/rose_timer.c | 217 + net/rose/sysctl_net_rose.c | 135 + net/rxrpc/Kconfig | 44 + net/rxrpc/Makefile | 29 + net/rxrpc/af_rxrpc.c | 889 +++ net/rxrpc/ar-accept.c | 510 ++ net/rxrpc/ar-ack.c | 1307 ++++ net/rxrpc/ar-call.c | 822 +++ net/rxrpc/ar-connection.c | 922 +++ net/rxrpc/ar-connevent.c | 405 ++ net/rxrpc/ar-error.c | 251 + net/rxrpc/ar-input.c | 804 +++ net/rxrpc/ar-internal.h | 786 +++ net/rxrpc/ar-key.c | 1220 ++++ net/rxrpc/ar-local.c | 310 + net/rxrpc/ar-output.c | 738 +++ net/rxrpc/ar-peer.c | 303 + net/rxrpc/ar-proc.c | 192 + net/rxrpc/ar-recvmsg.c | 438 ++ net/rxrpc/ar-security.c | 264 + net/rxrpc/ar-skbuff.c | 132 + net/rxrpc/ar-transport.c | 279 + net/rxrpc/rxkad.c | 1161 ++++ net/sched/Kconfig | 585 ++ net/sched/Makefile | 54 + net/sched/act_api.c | 1125 ++++ net/sched/act_csum.c | 594 ++ net/sched/act_gact.c | 221 + net/sched/act_ipt.c | 317 + net/sched/act_mirred.c | 300 + net/sched/act_nat.c | 328 + net/sched/act_pedit.c | 262 + net/sched/act_police.c | 402 ++ net/sched/act_simple.c | 218 + net/sched/act_skbedit.c | 218 + net/sched/cls_api.c | 621 ++ net/sched/cls_basic.c | 306 + net/sched/cls_cgroup.c | 325 + net/sched/cls_flow.c | 739 +++ net/sched/cls_fw.c | 399 ++ net/sched/cls_route.c | 627 ++ net/sched/cls_rsvp.c | 28 + net/sched/cls_rsvp.h | 671 ++ net/sched/cls_rsvp6.c | 28 + net/sched/cls_tcindex.c | 507 ++ net/sched/cls_u32.c | 817 +++ net/sched/em_cmp.c | 99 + net/sched/em_meta.c | 879 +++ net/sched/em_nbyte.c | 80 + net/sched/em_text.c | 158 + net/sched/em_u32.c | 64 + net/sched/ematch.c | 543 ++ net/sched/sch_api.c | 1805 +++++ net/sched/sch_atm.c | 690 ++ net/sched/sch_blackhole.c | 53 + net/sched/sch_cbq.c | 2075 ++++++ net/sched/sch_choke.c | 687 ++ net/sched/sch_drr.c | 525 ++ net/sched/sch_dsmark.c | 509 ++ net/sched/sch_fifo.c | 179 + net/sched/sch_generic.c | 907 +++ net/sched/sch_gred.c | 605 ++ net/sched/sch_hfsc.c | 1744 +++++ net/sched/sch_htb.c | 1587 +++++ net/sched/sch_ingress.c | 143 + net/sched/sch_mq.c | 240 + net/sched/sch_mqprio.c | 418 ++ 
net/sched/sch_multiq.c | 441 ++ net/sched/sch_netem.c | 974 +++ net/sched/sch_prio.c | 405 ++ net/sched/sch_qfq.c | 1137 ++++ net/sched/sch_red.c | 359 + net/sched/sch_sfb.c | 708 ++ net/sched/sch_sfq.c | 722 ++ net/sched/sch_tbf.c | 464 ++ net/sched/sch_teql.c | 538 ++ net/sctp/Kconfig | 100 + net/sctp/Makefile | 21 + net/sctp/associola.c | 1653 +++++ net/sctp/auth.c | 950 +++ net/sctp/bind_addr.c | 551 ++ net/sctp/chunk.c | 364 + net/sctp/command.c | 75 + net/sctp/debug.c | 183 + net/sctp/endpointola.c | 497 ++ net/sctp/input.c | 1139 ++++ net/sctp/inqueue.c | 246 + net/sctp/ipv6.c | 1079 +++ net/sctp/objcnt.c | 148 + net/sctp/output.c | 751 +++ net/sctp/outqueue.c | 1900 ++++++ net/sctp/primitive.c | 220 + net/sctp/probe.c | 217 + net/sctp/proc.c | 516 ++ net/sctp/protocol.c | 1381 ++++ net/sctp/sm_make_chunk.c | 3421 ++++++++++ net/sctp/sm_sideeffect.c | 1717 +++++ net/sctp/sm_statefuns.c | 6161 +++++++++++++++++ net/sctp/sm_statetable.c | 937 +++ net/sctp/socket.c | 6719 +++++++++++++++++++ net/sctp/ssnmap.c | 133 + net/sctp/sysctl.c | 289 + net/sctp/transport.c | 626 ++ net/sctp/tsnmap.c | 385 ++ net/sctp/ulpevent.c | 1099 +++ net/sctp/ulpqueue.c | 1088 +++ net/socket.c | 3384 ++++++++++ net/sunrpc/Kconfig | 37 + net/sunrpc/Makefile | 18 + net/sunrpc/addr.c | 355 + net/sunrpc/auth.c | 688 ++ net/sunrpc/auth_generic.c | 183 + net/sunrpc/auth_gss/Makefile | 13 + net/sunrpc/auth_gss/auth_gss.c | 1646 +++++ net/sunrpc/auth_gss/gss_generic_token.c | 234 + net/sunrpc/auth_gss/gss_krb5_crypto.c | 990 +++ net/sunrpc/auth_gss/gss_krb5_keys.c | 336 + net/sunrpc/auth_gss/gss_krb5_mech.c | 774 +++ net/sunrpc/auth_gss/gss_krb5_seal.c | 223 + net/sunrpc/auth_gss/gss_krb5_seqnum.c | 166 + net/sunrpc/auth_gss/gss_krb5_unseal.c | 226 + net/sunrpc/auth_gss/gss_krb5_wrap.c | 587 ++ net/sunrpc/auth_gss/gss_mech_switch.c | 381 ++ net/sunrpc/auth_gss/svcauth_gss.c | 1458 ++++ net/sunrpc/auth_null.c | 142 + net/sunrpc/auth_unix.c | 246 + net/sunrpc/backchannel_rqst.c | 282 + net/sunrpc/bc_svc.c | 66 + net/sunrpc/cache.c | 1812 +++++ net/sunrpc/clnt.c | 1876 ++++++ net/sunrpc/netns.h | 19 + net/sunrpc/rpc_pipe.c | 1086 +++ net/sunrpc/rpcb_clnt.c | 1074 +++ net/sunrpc/sched.c | 1016 +++ net/sunrpc/socklib.c | 185 + net/sunrpc/stats.c | 277 + net/sunrpc/sunrpc.h | 51 + net/sunrpc/sunrpc_syms.c | 120 + net/sunrpc/svc.c | 1323 ++++ net/sunrpc/svc_xprt.c | 1270 ++++ net/sunrpc/svcauth.c | 165 + net/sunrpc/svcauth_unix.c | 981 +++ net/sunrpc/svcsock.c | 1663 +++++ net/sunrpc/sysctl.c | 183 + net/sunrpc/timer.c | 122 + net/sunrpc/xdr.c | 1267 ++++ net/sunrpc/xprt.c | 1190 ++++ net/sunrpc/xprtrdma/Makefile | 8 + net/sunrpc/xprtrdma/rpc_rdma.c | 882 +++ net/sunrpc/xprtrdma/svc_rdma.c | 301 + net/sunrpc/xprtrdma/svc_rdma_marshal.c | 412 ++ net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 685 ++ net/sunrpc/xprtrdma/svc_rdma_sendto.c | 743 +++ net/sunrpc/xprtrdma/svc_rdma_transport.c | 1363 ++++ net/sunrpc/xprtrdma/transport.c | 780 +++ net/sunrpc/xprtrdma/verbs.c | 1952 ++++++ net/sunrpc/xprtrdma/xprt_rdma.h | 346 + net/sunrpc/xprtsock.c | 2953 +++++++++ net/sysctl_net.c | 129 + net/tipc/Kconfig | 69 + net/tipc/Makefile | 13 + net/tipc/addr.c | 106 + net/tipc/addr.h | 79 + net/tipc/bcast.c | 914 +++ net/tipc/bcast.h | 106 + net/tipc/bearer.c | 677 ++ net/tipc/bearer.h | 214 + net/tipc/config.c | 502 ++ net/tipc/config.h | 72 + net/tipc/core.c | 211 + net/tipc/core.h | 312 + net/tipc/discover.c | 358 + net/tipc/discover.h | 49 + net/tipc/eth_media.c | 323 + net/tipc/handler.c | 132 + net/tipc/link.c | 3042 +++++++++ 
net/tipc/link.h | 314 + net/tipc/log.c | 351 + net/tipc/log.h | 67 + net/tipc/msg.c | 355 + net/tipc/msg.h | 767 +++ net/tipc/name_distr.c | 319 + net/tipc/name_distr.h | 48 + net/tipc/name_table.c | 1037 +++ net/tipc/name_table.h | 108 + net/tipc/net.c | 228 + net/tipc/net.h | 47 + net/tipc/netlink.c | 108 + net/tipc/node.c | 492 ++ net/tipc/node.h | 134 + net/tipc/node_subscr.c | 97 + net/tipc/node_subscr.h | 64 + net/tipc/port.c | 1355 ++++ net/tipc/port.h | 315 + net/tipc/ref.c | 300 + net/tipc/ref.h | 49 + net/tipc/socket.c | 1904 ++++++ net/tipc/subscr.c | 589 ++ net/tipc/subscr.h | 90 + net/unix/Kconfig | 21 + net/unix/Makefile | 8 + net/unix/af_unix.c | 2385 +++++++ net/unix/garbage.c | 386 ++ net/unix/sysctl_net_unix.c | 63 + net/wanrouter/Kconfig | 27 + net/wanrouter/Makefile | 7 + net/wanrouter/patchlevel | 1 + net/wanrouter/wanmain.c | 787 +++ net/wanrouter/wanproc.c | 382 ++ net/wimax/Kconfig | 39 + net/wimax/Makefile | 14 + net/wimax/debug-levels.h | 43 + net/wimax/debugfs.c | 80 + net/wimax/id-table.c | 145 + net/wimax/op-msg.c | 432 ++ net/wimax/op-reset.c | 140 + net/wimax/op-rfkill.c | 468 ++ net/wimax/op-state-get.c | 83 + net/wimax/stack.c | 633 ++ net/wimax/wimax-internal.h | 91 + net/wireless/.gitignore | 1 + net/wireless/Kconfig | 184 + net/wireless/Makefile | 23 + net/wireless/chan.c | 135 + net/wireless/core.c | 1056 +++ net/wireless/core.h | 464 ++ net/wireless/db.txt | 17 + net/wireless/debugfs.c | 121 + net/wireless/debugfs.h | 11 + net/wireless/ethtool.c | 78 + net/wireless/ethtool.h | 6 + net/wireless/genregdb.awk | 119 + net/wireless/ibss.c | 526 ++ net/wireless/lib80211.c | 286 + net/wireless/lib80211_crypt_ccmp.c | 493 ++ net/wireless/lib80211_crypt_tkip.c | 784 +++ net/wireless/lib80211_crypt_wep.c | 294 + net/wireless/mesh.c | 161 + net/wireless/mlme.c | 1084 +++ net/wireless/nl80211.c | 6994 ++++++++++++++++++++ net/wireless/nl80211.h | 112 + net/wireless/radiotap.c | 357 + net/wireless/reg.c | 2322 +++++++ net/wireless/reg.h | 85 + net/wireless/regdb.h | 7 + net/wireless/scan.c | 1197 ++++ net/wireless/sme.c | 1020 +++ net/wireless/sysfs.c | 151 + net/wireless/sysfs.h | 9 + net/wireless/util.c | 1025 +++ net/wireless/wext-compat.c | 1537 +++++ net/wireless/wext-compat.h | 49 + net/wireless/wext-core.c | 1084 +++ net/wireless/wext-priv.c | 249 + net/wireless/wext-proc.c | 155 + net/wireless/wext-sme.c | 405 ++ net/wireless/wext-spy.c | 231 + net/x25/Kconfig | 36 + net/x25/Makefile | 10 + net/x25/af_x25.c | 1829 +++++ net/x25/sysctl_net_x25.c | 90 + net/x25/x25_dev.c | 224 + net/x25/x25_facilities.c | 346 + net/x25/x25_forward.c | 167 + net/x25/x25_in.c | 384 ++ net/x25/x25_link.c | 407 ++ net/x25/x25_out.c | 231 + net/x25/x25_proc.c | 266 + net/x25/x25_route.c | 226 + net/x25/x25_subr.c | 378 ++ net/x25/x25_timer.c | 174 + net/xfrm/Kconfig | 80 + net/xfrm/Makefile | 10 + net/xfrm/xfrm_algo.c | 754 +++ net/xfrm/xfrm_hash.c | 39 + net/xfrm/xfrm_hash.h | 136 + net/xfrm/xfrm_input.c | 291 + net/xfrm/xfrm_ipcomp.c | 383 ++ net/xfrm/xfrm_output.c | 212 + net/xfrm/xfrm_policy.c | 2978 +++++++++ net/xfrm/xfrm_proc.c | 84 + net/xfrm/xfrm_replay.c | 550 ++ net/xfrm/xfrm_state.c | 2231 +++++++ net/xfrm/xfrm_sysctl.c | 82 + net/xfrm/xfrm_user.c | 2973 +++++++++ 1296 files changed, 694444 insertions(+) create mode 100644 net/802/Kconfig create mode 100644 net/802/Makefile create mode 100644 net/802/fc.c create mode 100644 net/802/fddi.c create mode 100644 net/802/garp.c create mode 100644 net/802/hippi.c create mode 100644 net/802/p8022.c create mode 100644 
net/802/p8023.c create mode 100644 net/802/psnap.c create mode 100644 net/802/stp.c create mode 100644 net/802/tr.c create mode 100644 net/8021q/Kconfig create mode 100644 net/8021q/Makefile create mode 100644 net/8021q/vlan.c create mode 100644 net/8021q/vlan.h create mode 100644 net/8021q/vlan_core.c create mode 100644 net/8021q/vlan_dev.c create mode 100644 net/8021q/vlan_gvrp.c create mode 100644 net/8021q/vlan_netlink.c create mode 100644 net/8021q/vlanproc.c create mode 100644 net/8021q/vlanproc.h create mode 100644 net/9p/Kconfig create mode 100644 net/9p/Makefile create mode 100644 net/9p/client.c create mode 100644 net/9p/error.c create mode 100644 net/9p/mod.c create mode 100644 net/9p/protocol.c create mode 100644 net/9p/protocol.h create mode 100644 net/9p/trans_common.c create mode 100644 net/9p/trans_common.h create mode 100644 net/9p/trans_fd.c create mode 100644 net/9p/trans_rdma.c create mode 100644 net/9p/trans_virtio.c create mode 100644 net/9p/util.c create mode 100644 net/Kconfig create mode 100644 net/Makefile create mode 100644 net/TUNABLE create mode 100644 net/activity_stats.c create mode 100644 net/appletalk/Makefile create mode 100644 net/appletalk/aarp.c create mode 100644 net/appletalk/atalk_proc.c create mode 100644 net/appletalk/ddp.c create mode 100644 net/appletalk/dev.c create mode 100644 net/appletalk/sysctl_net_atalk.c create mode 100644 net/atm/Kconfig create mode 100644 net/atm/Makefile create mode 100644 net/atm/addr.c create mode 100644 net/atm/addr.h create mode 100644 net/atm/atm_misc.c create mode 100644 net/atm/atm_sysfs.c create mode 100644 net/atm/br2684.c create mode 100644 net/atm/clip.c create mode 100644 net/atm/common.c create mode 100644 net/atm/common.h create mode 100644 net/atm/ioctl.c create mode 100644 net/atm/lec.c create mode 100644 net/atm/lec.h create mode 100644 net/atm/lec_arpc.h create mode 100644 net/atm/mpc.c create mode 100644 net/atm/mpc.h create mode 100644 net/atm/mpoa_caches.c create mode 100644 net/atm/mpoa_caches.h create mode 100644 net/atm/mpoa_proc.c create mode 100644 net/atm/pppoatm.c create mode 100644 net/atm/proc.c create mode 100644 net/atm/protocols.h create mode 100644 net/atm/pvc.c create mode 100644 net/atm/raw.c create mode 100644 net/atm/resources.c create mode 100644 net/atm/resources.h create mode 100644 net/atm/signaling.c create mode 100644 net/atm/signaling.h create mode 100644 net/atm/svc.c create mode 100644 net/ax25/Kconfig create mode 100644 net/ax25/Makefile create mode 100644 net/ax25/TODO create mode 100644 net/ax25/af_ax25.c create mode 100644 net/ax25/ax25_addr.c create mode 100644 net/ax25/ax25_dev.c create mode 100644 net/ax25/ax25_ds_in.c create mode 100644 net/ax25/ax25_ds_subr.c create mode 100644 net/ax25/ax25_ds_timer.c create mode 100644 net/ax25/ax25_iface.c create mode 100644 net/ax25/ax25_in.c create mode 100644 net/ax25/ax25_ip.c create mode 100644 net/ax25/ax25_out.c create mode 100644 net/ax25/ax25_route.c create mode 100644 net/ax25/ax25_std_in.c create mode 100644 net/ax25/ax25_std_subr.c create mode 100644 net/ax25/ax25_std_timer.c create mode 100644 net/ax25/ax25_subr.c create mode 100644 net/ax25/ax25_timer.c create mode 100644 net/ax25/ax25_uid.c create mode 100644 net/ax25/sysctl_net_ax25.c create mode 100644 net/batman-adv/Kconfig create mode 100644 net/batman-adv/Makefile create mode 100644 net/batman-adv/aggregation.c create mode 100644 net/batman-adv/aggregation.h create mode 100644 net/batman-adv/bat_debugfs.c create mode 100644 net/batman-adv/bat_debugfs.h create 
mode 100644 net/batman-adv/bat_sysfs.c create mode 100644 net/batman-adv/bat_sysfs.h create mode 100644 net/batman-adv/bitarray.c create mode 100644 net/batman-adv/bitarray.h create mode 100644 net/batman-adv/gateway_client.c create mode 100644 net/batman-adv/gateway_client.h create mode 100644 net/batman-adv/gateway_common.c create mode 100644 net/batman-adv/gateway_common.h create mode 100644 net/batman-adv/hard-interface.c create mode 100644 net/batman-adv/hard-interface.h create mode 100644 net/batman-adv/hash.c create mode 100644 net/batman-adv/hash.h create mode 100644 net/batman-adv/icmp_socket.c create mode 100644 net/batman-adv/icmp_socket.h create mode 100644 net/batman-adv/main.c create mode 100644 net/batman-adv/main.h create mode 100644 net/batman-adv/originator.c create mode 100644 net/batman-adv/originator.h create mode 100644 net/batman-adv/packet.h create mode 100644 net/batman-adv/ring_buffer.c create mode 100644 net/batman-adv/ring_buffer.h create mode 100644 net/batman-adv/routing.c create mode 100644 net/batman-adv/routing.h create mode 100644 net/batman-adv/send.c create mode 100644 net/batman-adv/send.h create mode 100644 net/batman-adv/soft-interface.c create mode 100644 net/batman-adv/soft-interface.h create mode 100644 net/batman-adv/translation-table.c create mode 100644 net/batman-adv/translation-table.h create mode 100644 net/batman-adv/types.h create mode 100644 net/batman-adv/unicast.c create mode 100644 net/batman-adv/unicast.h create mode 100644 net/batman-adv/vis.c create mode 100644 net/batman-adv/vis.h create mode 100644 net/bluetooth/Kconfig create mode 100644 net/bluetooth/Makefile create mode 100644 net/bluetooth/af_bluetooth.c create mode 100644 net/bluetooth/bnep/Kconfig create mode 100644 net/bluetooth/bnep/Makefile create mode 100644 net/bluetooth/bnep/bnep.h create mode 100644 net/bluetooth/bnep/core.c create mode 100644 net/bluetooth/bnep/netdev.c create mode 100644 net/bluetooth/bnep/sock.c create mode 100644 net/bluetooth/cmtp/Kconfig create mode 100644 net/bluetooth/cmtp/Makefile create mode 100644 net/bluetooth/cmtp/capi.c create mode 100644 net/bluetooth/cmtp/cmtp.h create mode 100644 net/bluetooth/cmtp/core.c create mode 100644 net/bluetooth/cmtp/sock.c create mode 100644 net/bluetooth/hci_conn.c create mode 100644 net/bluetooth/hci_core.c create mode 100755 net/bluetooth/hci_event.c create mode 100644 net/bluetooth/hci_sock.c create mode 100644 net/bluetooth/hci_sysfs.c create mode 100644 net/bluetooth/hidp/Kconfig create mode 100644 net/bluetooth/hidp/Makefile create mode 100644 net/bluetooth/hidp/core.c create mode 100644 net/bluetooth/hidp/hidp.h create mode 100644 net/bluetooth/hidp/sock.c create mode 100644 net/bluetooth/l2cap_core.c create mode 100644 net/bluetooth/l2cap_sock.c create mode 100644 net/bluetooth/lib.c create mode 100644 net/bluetooth/mgmt.c create mode 100644 net/bluetooth/rfcomm/Kconfig create mode 100644 net/bluetooth/rfcomm/Makefile create mode 100644 net/bluetooth/rfcomm/core.c create mode 100644 net/bluetooth/rfcomm/sock.c create mode 100644 net/bluetooth/rfcomm/tty.c create mode 100644 net/bluetooth/sco.c create mode 100644 net/bluetooth/smp.c create mode 100644 net/bridge/Kconfig create mode 100644 net/bridge/Makefile create mode 100644 net/bridge/br.c create mode 100644 net/bridge/br_device.c create mode 100644 net/bridge/br_fdb.c create mode 100644 net/bridge/br_forward.c create mode 100644 net/bridge/br_if.c create mode 100644 net/bridge/br_input.c create mode 100644 net/bridge/br_ioctl.c create mode 100644 
net/bridge/br_multicast.c create mode 100644 net/bridge/br_netfilter.c create mode 100644 net/bridge/br_netlink.c create mode 100644 net/bridge/br_notify.c create mode 100644 net/bridge/br_private.h create mode 100644 net/bridge/br_private_stp.h create mode 100644 net/bridge/br_stp.c create mode 100644 net/bridge/br_stp_bpdu.c create mode 100644 net/bridge/br_stp_if.c create mode 100644 net/bridge/br_stp_timer.c create mode 100644 net/bridge/br_sysfs_br.c create mode 100644 net/bridge/br_sysfs_if.c create mode 100644 net/bridge/netfilter/Kconfig create mode 100644 net/bridge/netfilter/Makefile create mode 100644 net/bridge/netfilter/ebt_802_3.c create mode 100644 net/bridge/netfilter/ebt_among.c create mode 100644 net/bridge/netfilter/ebt_arp.c create mode 100644 net/bridge/netfilter/ebt_arpreply.c create mode 100644 net/bridge/netfilter/ebt_dnat.c create mode 100644 net/bridge/netfilter/ebt_ip.c create mode 100644 net/bridge/netfilter/ebt_ip6.c create mode 100644 net/bridge/netfilter/ebt_limit.c create mode 100644 net/bridge/netfilter/ebt_log.c create mode 100644 net/bridge/netfilter/ebt_mark.c create mode 100644 net/bridge/netfilter/ebt_mark_m.c create mode 100644 net/bridge/netfilter/ebt_nflog.c create mode 100644 net/bridge/netfilter/ebt_pkttype.c create mode 100644 net/bridge/netfilter/ebt_redirect.c create mode 100644 net/bridge/netfilter/ebt_snat.c create mode 100644 net/bridge/netfilter/ebt_stp.c create mode 100644 net/bridge/netfilter/ebt_ulog.c create mode 100644 net/bridge/netfilter/ebt_vlan.c create mode 100644 net/bridge/netfilter/ebtable_broute.c create mode 100644 net/bridge/netfilter/ebtable_filter.c create mode 100644 net/bridge/netfilter/ebtable_nat.c create mode 100644 net/bridge/netfilter/ebtables.c create mode 100644 net/caif/Kconfig create mode 100644 net/caif/Makefile create mode 100644 net/caif/caif_dev.c create mode 100644 net/caif/caif_socket.c create mode 100644 net/caif/cfcnfg.c create mode 100644 net/caif/cfctrl.c create mode 100644 net/caif/cfdbgl.c create mode 100644 net/caif/cfdgml.c create mode 100644 net/caif/cffrml.c create mode 100644 net/caif/cfmuxl.c create mode 100644 net/caif/cfpkt_skbuff.c create mode 100644 net/caif/cfrfml.c create mode 100644 net/caif/cfserl.c create mode 100644 net/caif/cfsrvl.c create mode 100644 net/caif/cfutill.c create mode 100644 net/caif/cfveil.c create mode 100644 net/caif/cfvidl.c create mode 100644 net/caif/chnl_net.c create mode 100644 net/can/Kconfig create mode 100644 net/can/Makefile create mode 100644 net/can/af_can.c create mode 100644 net/can/af_can.h create mode 100644 net/can/bcm.c create mode 100644 net/can/proc.c create mode 100644 net/can/raw.c create mode 100644 net/ceph/Kconfig create mode 100644 net/ceph/Makefile create mode 100644 net/ceph/armor.c create mode 100644 net/ceph/auth.c create mode 100644 net/ceph/auth_none.c create mode 100644 net/ceph/auth_none.h create mode 100644 net/ceph/auth_x.c create mode 100644 net/ceph/auth_x.h create mode 100644 net/ceph/auth_x_protocol.h create mode 100644 net/ceph/buffer.c create mode 100644 net/ceph/ceph_common.c create mode 100644 net/ceph/ceph_fs.c create mode 100644 net/ceph/ceph_hash.c create mode 100644 net/ceph/ceph_strings.c create mode 100644 net/ceph/crush/crush.c create mode 100644 net/ceph/crush/hash.c create mode 100644 net/ceph/crush/mapper.c create mode 100644 net/ceph/crypto.c create mode 100644 net/ceph/crypto.h create mode 100644 net/ceph/debugfs.c create mode 100644 net/ceph/messenger.c create mode 100644 net/ceph/mon_client.c create mode 100644 
net/ceph/msgpool.c create mode 100644 net/ceph/osd_client.c create mode 100644 net/ceph/osdmap.c create mode 100644 net/ceph/pagelist.c create mode 100644 net/ceph/pagevec.c create mode 100644 net/compat.c create mode 100644 net/core/Makefile create mode 100644 net/core/datagram.c create mode 100644 net/core/dev.c create mode 100644 net/core/dev_addr_lists.c create mode 100644 net/core/drop_monitor.c create mode 100644 net/core/dst.c create mode 100644 net/core/ethtool.c create mode 100644 net/core/fib_rules.c create mode 100644 net/core/filter.c create mode 100644 net/core/flow.c create mode 100644 net/core/gen_estimator.c create mode 100644 net/core/gen_stats.c create mode 100644 net/core/iovec.c create mode 100644 net/core/kmap_skb.h create mode 100644 net/core/link_watch.c create mode 100644 net/core/neighbour.c create mode 100644 net/core/net-sysfs.c create mode 100644 net/core/net-sysfs.h create mode 100644 net/core/net-traces.c create mode 100644 net/core/net_namespace.c create mode 100644 net/core/netevent.c create mode 100644 net/core/netpoll.c create mode 100644 net/core/pktgen.c create mode 100644 net/core/request_sock.c create mode 100644 net/core/rtnetlink.c create mode 100644 net/core/scm.c create mode 100644 net/core/secure_seq.c create mode 100644 net/core/skbuff.c create mode 100644 net/core/sock.c create mode 100644 net/core/stream.c create mode 100644 net/core/sysctl_net_core.c create mode 100644 net/core/timestamping.c create mode 100644 net/core/user_dma.c create mode 100644 net/core/utils.c create mode 100644 net/dcb/Kconfig create mode 100644 net/dcb/Makefile create mode 100644 net/dcb/dcbevent.c create mode 100644 net/dcb/dcbnl.c create mode 100644 net/dccp/Kconfig create mode 100644 net/dccp/Makefile create mode 100644 net/dccp/ackvec.c create mode 100644 net/dccp/ackvec.h create mode 100644 net/dccp/ccid.c create mode 100644 net/dccp/ccid.h create mode 100644 net/dccp/ccids/Kconfig create mode 100644 net/dccp/ccids/ccid2.c create mode 100644 net/dccp/ccids/ccid2.h create mode 100644 net/dccp/ccids/ccid3.c create mode 100644 net/dccp/ccids/ccid3.h create mode 100644 net/dccp/ccids/lib/loss_interval.c create mode 100644 net/dccp/ccids/lib/loss_interval.h create mode 100644 net/dccp/ccids/lib/packet_history.c create mode 100644 net/dccp/ccids/lib/packet_history.h create mode 100644 net/dccp/ccids/lib/tfrc.c create mode 100644 net/dccp/ccids/lib/tfrc.h create mode 100644 net/dccp/ccids/lib/tfrc_equation.c create mode 100644 net/dccp/dccp.h create mode 100644 net/dccp/diag.c create mode 100644 net/dccp/feat.c create mode 100644 net/dccp/feat.h create mode 100644 net/dccp/input.c create mode 100644 net/dccp/ipv4.c create mode 100644 net/dccp/ipv6.c create mode 100644 net/dccp/ipv6.h create mode 100644 net/dccp/minisocks.c create mode 100644 net/dccp/options.c create mode 100644 net/dccp/output.c create mode 100644 net/dccp/probe.c create mode 100644 net/dccp/proto.c create mode 100644 net/dccp/qpolicy.c create mode 100644 net/dccp/sysctl.c create mode 100644 net/dccp/timer.c create mode 100644 net/decnet/Kconfig create mode 100644 net/decnet/Makefile create mode 100644 net/decnet/README create mode 100644 net/decnet/TODO create mode 100644 net/decnet/af_decnet.c create mode 100644 net/decnet/dn_dev.c create mode 100644 net/decnet/dn_fib.c create mode 100644 net/decnet/dn_neigh.c create mode 100644 net/decnet/dn_nsp_in.c create mode 100644 net/decnet/dn_nsp_out.c create mode 100644 net/decnet/dn_route.c create mode 100644 net/decnet/dn_rules.c create mode 100644 
net/decnet/dn_table.c create mode 100644 net/decnet/dn_timer.c create mode 100644 net/decnet/netfilter/Kconfig create mode 100644 net/decnet/netfilter/Makefile create mode 100644 net/decnet/netfilter/dn_rtmsg.c create mode 100644 net/decnet/sysctl_net_decnet.c create mode 100644 net/dns_resolver/Kconfig create mode 100644 net/dns_resolver/Makefile create mode 100644 net/dns_resolver/dns_key.c create mode 100644 net/dns_resolver/dns_query.c create mode 100644 net/dns_resolver/internal.h create mode 100644 net/dsa/Kconfig create mode 100644 net/dsa/Makefile create mode 100644 net/dsa/dsa.c create mode 100644 net/dsa/dsa_priv.h create mode 100644 net/dsa/mv88e6060.c create mode 100644 net/dsa/mv88e6123_61_65.c create mode 100644 net/dsa/mv88e6131.c create mode 100644 net/dsa/mv88e6xxx.c create mode 100644 net/dsa/mv88e6xxx.h create mode 100644 net/dsa/slave.c create mode 100644 net/dsa/tag_dsa.c create mode 100644 net/dsa/tag_edsa.c create mode 100644 net/dsa/tag_trailer.c create mode 100644 net/econet/Kconfig create mode 100644 net/econet/Makefile create mode 100644 net/econet/af_econet.c create mode 100644 net/ethernet/Makefile create mode 100644 net/ethernet/eth.c create mode 100644 net/ethernet/pe2.c create mode 100644 net/ieee802154/Kconfig create mode 100644 net/ieee802154/Makefile create mode 100644 net/ieee802154/af802154.h create mode 100644 net/ieee802154/af_ieee802154.c create mode 100644 net/ieee802154/dgram.c create mode 100644 net/ieee802154/ieee802154.h create mode 100644 net/ieee802154/netlink.c create mode 100644 net/ieee802154/nl-mac.c create mode 100644 net/ieee802154/nl-phy.c create mode 100644 net/ieee802154/nl_policy.c create mode 100644 net/ieee802154/raw.c create mode 100644 net/ieee802154/wpan-class.c create mode 100644 net/ipv4/Kconfig create mode 100644 net/ipv4/Makefile create mode 100644 net/ipv4/af_inet.c create mode 100644 net/ipv4/ah4.c create mode 100644 net/ipv4/arp.c create mode 100644 net/ipv4/cipso_ipv4.c create mode 100644 net/ipv4/datagram.c create mode 100644 net/ipv4/devinet.c create mode 100644 net/ipv4/esp4.c create mode 100644 net/ipv4/fib_frontend.c create mode 100644 net/ipv4/fib_lookup.h create mode 100644 net/ipv4/fib_rules.c create mode 100644 net/ipv4/fib_semantics.c create mode 100644 net/ipv4/fib_trie.c create mode 100644 net/ipv4/gre.c create mode 100644 net/ipv4/icmp.c create mode 100644 net/ipv4/igmp.c create mode 100644 net/ipv4/inet_connection_sock.c create mode 100644 net/ipv4/inet_diag.c create mode 100644 net/ipv4/inet_fragment.c create mode 100644 net/ipv4/inet_hashtables.c create mode 100644 net/ipv4/inet_lro.c create mode 100644 net/ipv4/inet_timewait_sock.c create mode 100644 net/ipv4/inetpeer.c create mode 100644 net/ipv4/ip_forward.c create mode 100644 net/ipv4/ip_fragment.c create mode 100644 net/ipv4/ip_gre.c create mode 100644 net/ipv4/ip_input.c create mode 100644 net/ipv4/ip_options.c create mode 100644 net/ipv4/ip_output.c create mode 100644 net/ipv4/ip_sockglue.c create mode 100644 net/ipv4/ipcomp.c create mode 100644 net/ipv4/ipconfig.c create mode 100644 net/ipv4/ipip.c create mode 100644 net/ipv4/ipmr.c create mode 100644 net/ipv4/netfilter.c create mode 100644 net/ipv4/netfilter/Kconfig create mode 100644 net/ipv4/netfilter/Makefile create mode 100644 net/ipv4/netfilter/arp_tables.c create mode 100644 net/ipv4/netfilter/arpt_mangle.c create mode 100644 net/ipv4/netfilter/arptable_filter.c create mode 100644 net/ipv4/netfilter/ip_queue.c create mode 100644 net/ipv4/netfilter/ip_tables.c create mode 100644 
net/ipv4/netfilter/ipt_CLUSTERIP.c create mode 100644 net/ipv4/netfilter/ipt_LOG.c create mode 100644 net/ipv4/netfilter/ipt_MASQUERADE.c create mode 100644 net/ipv4/netfilter/ipt_NETMAP.c create mode 100644 net/ipv4/netfilter/ipt_REDIRECT.c create mode 100644 net/ipv4/netfilter/ipt_REJECT.c create mode 100644 net/ipv4/netfilter/ipt_ULOG.c create mode 100644 net/ipv4/netfilter/ipt_ah.c create mode 100644 net/ipv4/netfilter/ipt_ecn.c create mode 100644 net/ipv4/netfilter/iptable_filter.c create mode 100644 net/ipv4/netfilter/iptable_mangle.c create mode 100644 net/ipv4/netfilter/iptable_raw.c create mode 100644 net/ipv4/netfilter/iptable_security.c create mode 100644 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c create mode 100644 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c create mode 100644 net/ipv4/netfilter/nf_conntrack_proto_icmp.c create mode 100644 net/ipv4/netfilter/nf_defrag_ipv4.c create mode 100644 net/ipv4/netfilter/nf_nat_amanda.c create mode 100644 net/ipv4/netfilter/nf_nat_core.c create mode 100644 net/ipv4/netfilter/nf_nat_ftp.c create mode 100644 net/ipv4/netfilter/nf_nat_h323.c create mode 100644 net/ipv4/netfilter/nf_nat_helper.c create mode 100644 net/ipv4/netfilter/nf_nat_irc.c create mode 100644 net/ipv4/netfilter/nf_nat_pptp.c create mode 100644 net/ipv4/netfilter/nf_nat_proto_common.c create mode 100644 net/ipv4/netfilter/nf_nat_proto_dccp.c create mode 100644 net/ipv4/netfilter/nf_nat_proto_gre.c create mode 100644 net/ipv4/netfilter/nf_nat_proto_icmp.c create mode 100644 net/ipv4/netfilter/nf_nat_proto_sctp.c create mode 100644 net/ipv4/netfilter/nf_nat_proto_tcp.c create mode 100644 net/ipv4/netfilter/nf_nat_proto_udp.c create mode 100644 net/ipv4/netfilter/nf_nat_proto_udplite.c create mode 100644 net/ipv4/netfilter/nf_nat_proto_unknown.c create mode 100644 net/ipv4/netfilter/nf_nat_rule.c create mode 100644 net/ipv4/netfilter/nf_nat_sip.c create mode 100644 net/ipv4/netfilter/nf_nat_snmp_basic.c create mode 100644 net/ipv4/netfilter/nf_nat_standalone.c create mode 100644 net/ipv4/netfilter/nf_nat_tftp.c create mode 100644 net/ipv4/ping.c create mode 100644 net/ipv4/proc.c create mode 100644 net/ipv4/protocol.c create mode 100644 net/ipv4/raw.c create mode 100644 net/ipv4/route.c create mode 100644 net/ipv4/syncookies.c create mode 100644 net/ipv4/sysctl_net_ipv4.c create mode 100644 net/ipv4/sysfs_net_ipv4.c create mode 100644 net/ipv4/tcp.c create mode 100644 net/ipv4/tcp_bic.c create mode 100644 net/ipv4/tcp_cong.c create mode 100644 net/ipv4/tcp_cubic.c create mode 100644 net/ipv4/tcp_diag.c create mode 100644 net/ipv4/tcp_highspeed.c create mode 100644 net/ipv4/tcp_htcp.c create mode 100644 net/ipv4/tcp_hybla.c create mode 100644 net/ipv4/tcp_illinois.c create mode 100644 net/ipv4/tcp_input.c create mode 100644 net/ipv4/tcp_ipv4.c create mode 100644 net/ipv4/tcp_lp.c create mode 100644 net/ipv4/tcp_minisocks.c create mode 100644 net/ipv4/tcp_output.c create mode 100644 net/ipv4/tcp_probe.c create mode 100644 net/ipv4/tcp_scalable.c create mode 100644 net/ipv4/tcp_timer.c create mode 100644 net/ipv4/tcp_vegas.c create mode 100644 net/ipv4/tcp_vegas.h create mode 100644 net/ipv4/tcp_veno.c create mode 100644 net/ipv4/tcp_westwood.c create mode 100644 net/ipv4/tcp_yeah.c create mode 100644 net/ipv4/tunnel4.c create mode 100644 net/ipv4/udp.c create mode 100644 net/ipv4/udp_impl.h create mode 100644 net/ipv4/udplite.c create mode 100644 net/ipv4/xfrm4_input.c create mode 100644 net/ipv4/xfrm4_mode_beet.c create mode 100644 
net/ipv4/xfrm4_mode_transport.c create mode 100644 net/ipv4/xfrm4_mode_tunnel.c create mode 100644 net/ipv4/xfrm4_output.c create mode 100644 net/ipv4/xfrm4_policy.c create mode 100644 net/ipv4/xfrm4_state.c create mode 100644 net/ipv4/xfrm4_tunnel.c create mode 100644 net/ipv6/Kconfig create mode 100644 net/ipv6/Makefile create mode 100644 net/ipv6/addrconf.c create mode 100644 net/ipv6/addrconf_core.c create mode 100644 net/ipv6/addrlabel.c create mode 100644 net/ipv6/af_inet6.c create mode 100644 net/ipv6/ah6.c create mode 100644 net/ipv6/anycast.c create mode 100644 net/ipv6/datagram.c create mode 100644 net/ipv6/esp6.c create mode 100644 net/ipv6/exthdrs.c create mode 100644 net/ipv6/exthdrs_core.c create mode 100644 net/ipv6/fib6_rules.c create mode 100644 net/ipv6/icmp.c create mode 100644 net/ipv6/inet6_connection_sock.c create mode 100644 net/ipv6/inet6_hashtables.c create mode 100644 net/ipv6/ip6_fib.c create mode 100644 net/ipv6/ip6_flowlabel.c create mode 100644 net/ipv6/ip6_input.c create mode 100644 net/ipv6/ip6_output.c create mode 100644 net/ipv6/ip6_tunnel.c create mode 100644 net/ipv6/ip6mr.c create mode 100644 net/ipv6/ipcomp6.c create mode 100644 net/ipv6/ipv6_sockglue.c create mode 100644 net/ipv6/mcast.c create mode 100644 net/ipv6/mip6.c create mode 100644 net/ipv6/ndisc.c create mode 100644 net/ipv6/netfilter.c create mode 100644 net/ipv6/netfilter/Kconfig create mode 100644 net/ipv6/netfilter/Makefile create mode 100644 net/ipv6/netfilter/ip6_queue.c create mode 100644 net/ipv6/netfilter/ip6_tables.c create mode 100644 net/ipv6/netfilter/ip6t_LOG.c create mode 100644 net/ipv6/netfilter/ip6t_REJECT.c create mode 100644 net/ipv6/netfilter/ip6t_ah.c create mode 100644 net/ipv6/netfilter/ip6t_eui64.c create mode 100644 net/ipv6/netfilter/ip6t_frag.c create mode 100644 net/ipv6/netfilter/ip6t_hbh.c create mode 100644 net/ipv6/netfilter/ip6t_ipv6header.c create mode 100644 net/ipv6/netfilter/ip6t_mh.c create mode 100644 net/ipv6/netfilter/ip6t_rt.c create mode 100644 net/ipv6/netfilter/ip6table_filter.c create mode 100644 net/ipv6/netfilter/ip6table_mangle.c create mode 100644 net/ipv6/netfilter/ip6table_raw.c create mode 100644 net/ipv6/netfilter/ip6table_security.c create mode 100644 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c create mode 100644 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c create mode 100644 net/ipv6/netfilter/nf_conntrack_reasm.c create mode 100644 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c create mode 100644 net/ipv6/proc.c create mode 100644 net/ipv6/protocol.c create mode 100644 net/ipv6/raw.c create mode 100644 net/ipv6/reassembly.c create mode 100644 net/ipv6/route.c create mode 100644 net/ipv6/sit.c create mode 100644 net/ipv6/syncookies.c create mode 100644 net/ipv6/sysctl_net_ipv6.c create mode 100644 net/ipv6/tcp_ipv6.c create mode 100644 net/ipv6/tunnel6.c create mode 100644 net/ipv6/udp.c create mode 100644 net/ipv6/udp_impl.h create mode 100644 net/ipv6/udplite.c create mode 100644 net/ipv6/xfrm6_input.c create mode 100644 net/ipv6/xfrm6_mode_beet.c create mode 100644 net/ipv6/xfrm6_mode_ro.c create mode 100644 net/ipv6/xfrm6_mode_transport.c create mode 100644 net/ipv6/xfrm6_mode_tunnel.c create mode 100644 net/ipv6/xfrm6_output.c create mode 100644 net/ipv6/xfrm6_policy.c create mode 100644 net/ipv6/xfrm6_state.c create mode 100644 net/ipv6/xfrm6_tunnel.c create mode 100644 net/ipx/Kconfig create mode 100644 net/ipx/Makefile create mode 100644 net/ipx/af_ipx.c create mode 100644 net/ipx/ipx_proc.c create mode 100644 
net/ipx/ipx_route.c create mode 100644 net/ipx/sysctl_net_ipx.c create mode 100644 net/irda/Kconfig create mode 100644 net/irda/Makefile create mode 100644 net/irda/af_irda.c create mode 100644 net/irda/discovery.c create mode 100644 net/irda/ircomm/Kconfig create mode 100644 net/irda/ircomm/Makefile create mode 100644 net/irda/ircomm/ircomm_core.c create mode 100644 net/irda/ircomm/ircomm_event.c create mode 100644 net/irda/ircomm/ircomm_lmp.c create mode 100644 net/irda/ircomm/ircomm_param.c create mode 100644 net/irda/ircomm/ircomm_ttp.c create mode 100644 net/irda/ircomm/ircomm_tty.c create mode 100644 net/irda/ircomm/ircomm_tty_attach.c create mode 100644 net/irda/ircomm/ircomm_tty_ioctl.c create mode 100644 net/irda/irda_device.c create mode 100644 net/irda/iriap.c create mode 100644 net/irda/iriap_event.c create mode 100644 net/irda/irias_object.c create mode 100644 net/irda/irlan/Kconfig create mode 100644 net/irda/irlan/Makefile create mode 100644 net/irda/irlan/irlan_client.c create mode 100644 net/irda/irlan/irlan_client_event.c create mode 100644 net/irda/irlan/irlan_common.c create mode 100644 net/irda/irlan/irlan_eth.c create mode 100644 net/irda/irlan/irlan_event.c create mode 100644 net/irda/irlan/irlan_filter.c create mode 100644 net/irda/irlan/irlan_provider.c create mode 100644 net/irda/irlan/irlan_provider_event.c create mode 100644 net/irda/irlap.c create mode 100644 net/irda/irlap_event.c create mode 100644 net/irda/irlap_frame.c create mode 100644 net/irda/irlmp.c create mode 100644 net/irda/irlmp_event.c create mode 100644 net/irda/irlmp_frame.c create mode 100644 net/irda/irmod.c create mode 100644 net/irda/irnet/Kconfig create mode 100644 net/irda/irnet/Makefile create mode 100644 net/irda/irnet/irnet.h create mode 100644 net/irda/irnet/irnet_irda.c create mode 100644 net/irda/irnet/irnet_irda.h create mode 100644 net/irda/irnet/irnet_ppp.c create mode 100644 net/irda/irnet/irnet_ppp.h create mode 100644 net/irda/irnetlink.c create mode 100644 net/irda/irproc.c create mode 100644 net/irda/irqueue.c create mode 100644 net/irda/irsysctl.c create mode 100644 net/irda/irttp.c create mode 100644 net/irda/parameters.c create mode 100644 net/irda/qos.c create mode 100644 net/irda/timer.c create mode 100644 net/irda/wrapper.c create mode 100644 net/iucv/Kconfig create mode 100644 net/iucv/Makefile create mode 100644 net/iucv/af_iucv.c create mode 100644 net/iucv/iucv.c create mode 100644 net/key/Makefile create mode 100644 net/key/af_key.c create mode 100644 net/l2tp/Kconfig create mode 100644 net/l2tp/Makefile create mode 100644 net/l2tp/l2tp_core.c create mode 100644 net/l2tp/l2tp_core.h create mode 100644 net/l2tp/l2tp_debugfs.c create mode 100644 net/l2tp/l2tp_eth.c create mode 100644 net/l2tp/l2tp_ip.c create mode 100644 net/l2tp/l2tp_netlink.c create mode 100644 net/l2tp/l2tp_ppp.c create mode 100644 net/lapb/Kconfig create mode 100644 net/lapb/Makefile create mode 100644 net/lapb/lapb_iface.c create mode 100644 net/lapb/lapb_in.c create mode 100644 net/lapb/lapb_out.c create mode 100644 net/lapb/lapb_subr.c create mode 100644 net/lapb/lapb_timer.c create mode 100644 net/llc/Kconfig create mode 100644 net/llc/Makefile create mode 100644 net/llc/af_llc.c create mode 100644 net/llc/llc_c_ac.c create mode 100644 net/llc/llc_c_ev.c create mode 100644 net/llc/llc_c_st.c create mode 100644 net/llc/llc_conn.c create mode 100644 net/llc/llc_core.c create mode 100644 net/llc/llc_if.c create mode 100644 net/llc/llc_input.c create mode 100644 net/llc/llc_output.c create mode 
100644 net/llc/llc_pdu.c create mode 100644 net/llc/llc_proc.c create mode 100644 net/llc/llc_s_ac.c create mode 100644 net/llc/llc_s_ev.c create mode 100644 net/llc/llc_s_st.c create mode 100644 net/llc/llc_sap.c create mode 100644 net/llc/llc_station.c create mode 100644 net/llc/sysctl_net_llc.c create mode 100644 net/mac80211/Kconfig create mode 100644 net/mac80211/Makefile create mode 100644 net/mac80211/aes_ccm.c create mode 100644 net/mac80211/aes_ccm.h create mode 100644 net/mac80211/aes_cmac.c create mode 100644 net/mac80211/aes_cmac.h create mode 100644 net/mac80211/agg-rx.c create mode 100644 net/mac80211/agg-tx.c create mode 100644 net/mac80211/cfg.c create mode 100644 net/mac80211/cfg.h create mode 100644 net/mac80211/chan.c create mode 100644 net/mac80211/debugfs.c create mode 100644 net/mac80211/debugfs.h create mode 100644 net/mac80211/debugfs_key.c create mode 100644 net/mac80211/debugfs_key.h create mode 100644 net/mac80211/debugfs_netdev.c create mode 100644 net/mac80211/debugfs_netdev.h create mode 100644 net/mac80211/debugfs_sta.c create mode 100644 net/mac80211/debugfs_sta.h create mode 100644 net/mac80211/driver-ops.h create mode 100644 net/mac80211/driver-trace.c create mode 100644 net/mac80211/driver-trace.h create mode 100644 net/mac80211/event.c create mode 100644 net/mac80211/ht.c create mode 100644 net/mac80211/ibss.c create mode 100644 net/mac80211/ieee80211_i.h create mode 100644 net/mac80211/iface.c create mode 100644 net/mac80211/key.c create mode 100644 net/mac80211/key.h create mode 100644 net/mac80211/led.c create mode 100644 net/mac80211/led.h create mode 100644 net/mac80211/main.c create mode 100644 net/mac80211/mesh.c create mode 100644 net/mac80211/mesh.h create mode 100644 net/mac80211/mesh_hwmp.c create mode 100644 net/mac80211/mesh_pathtbl.c create mode 100644 net/mac80211/mesh_plink.c create mode 100644 net/mac80211/michael.c create mode 100644 net/mac80211/michael.h create mode 100644 net/mac80211/mlme.c create mode 100644 net/mac80211/offchannel.c create mode 100644 net/mac80211/pm.c create mode 100644 net/mac80211/rate.c create mode 100644 net/mac80211/rate.h create mode 100644 net/mac80211/rc80211_minstrel.c create mode 100644 net/mac80211/rc80211_minstrel.h create mode 100644 net/mac80211/rc80211_minstrel_debugfs.c create mode 100644 net/mac80211/rc80211_minstrel_ht.c create mode 100644 net/mac80211/rc80211_minstrel_ht.h create mode 100644 net/mac80211/rc80211_minstrel_ht_debugfs.c create mode 100644 net/mac80211/rc80211_pid.h create mode 100644 net/mac80211/rc80211_pid_algo.c create mode 100644 net/mac80211/rc80211_pid_debugfs.c create mode 100644 net/mac80211/rx.c create mode 100644 net/mac80211/scan.c create mode 100644 net/mac80211/spectmgmt.c create mode 100644 net/mac80211/sta_info.c create mode 100644 net/mac80211/sta_info.h create mode 100644 net/mac80211/status.c create mode 100644 net/mac80211/tkip.c create mode 100644 net/mac80211/tkip.h create mode 100644 net/mac80211/tx.c create mode 100644 net/mac80211/util.c create mode 100644 net/mac80211/wep.c create mode 100644 net/mac80211/wep.h create mode 100644 net/mac80211/wme.c create mode 100644 net/mac80211/wme.h create mode 100644 net/mac80211/work.c create mode 100644 net/mac80211/wpa.c create mode 100644 net/mac80211/wpa.h create mode 100644 net/netfilter/Kconfig create mode 100644 net/netfilter/Makefile create mode 100644 net/netfilter/core.c create mode 100644 net/netfilter/ipset/Kconfig create mode 100644 net/netfilter/ipset/Makefile create mode 100644 
net/netfilter/ipset/ip_set_bitmap_ip.c create mode 100644 net/netfilter/ipset/ip_set_bitmap_ipmac.c create mode 100644 net/netfilter/ipset/ip_set_bitmap_port.c create mode 100644 net/netfilter/ipset/ip_set_core.c create mode 100644 net/netfilter/ipset/ip_set_getport.c create mode 100644 net/netfilter/ipset/ip_set_hash_ip.c create mode 100644 net/netfilter/ipset/ip_set_hash_ipport.c create mode 100644 net/netfilter/ipset/ip_set_hash_ipportip.c create mode 100644 net/netfilter/ipset/ip_set_hash_ipportnet.c create mode 100644 net/netfilter/ipset/ip_set_hash_net.c create mode 100644 net/netfilter/ipset/ip_set_hash_netport.c create mode 100644 net/netfilter/ipset/ip_set_list_set.c create mode 100644 net/netfilter/ipset/pfxlen.c create mode 100644 net/netfilter/ipvs/Kconfig create mode 100644 net/netfilter/ipvs/Makefile create mode 100644 net/netfilter/ipvs/ip_vs_app.c create mode 100644 net/netfilter/ipvs/ip_vs_conn.c create mode 100644 net/netfilter/ipvs/ip_vs_core.c create mode 100644 net/netfilter/ipvs/ip_vs_ctl.c create mode 100644 net/netfilter/ipvs/ip_vs_dh.c create mode 100644 net/netfilter/ipvs/ip_vs_est.c create mode 100644 net/netfilter/ipvs/ip_vs_ftp.c create mode 100644 net/netfilter/ipvs/ip_vs_lblc.c create mode 100644 net/netfilter/ipvs/ip_vs_lblcr.c create mode 100644 net/netfilter/ipvs/ip_vs_lc.c create mode 100644 net/netfilter/ipvs/ip_vs_nfct.c create mode 100644 net/netfilter/ipvs/ip_vs_nq.c create mode 100644 net/netfilter/ipvs/ip_vs_pe.c create mode 100644 net/netfilter/ipvs/ip_vs_pe_sip.c create mode 100644 net/netfilter/ipvs/ip_vs_proto.c create mode 100644 net/netfilter/ipvs/ip_vs_proto_ah_esp.c create mode 100644 net/netfilter/ipvs/ip_vs_proto_sctp.c create mode 100644 net/netfilter/ipvs/ip_vs_proto_tcp.c create mode 100644 net/netfilter/ipvs/ip_vs_proto_udp.c create mode 100644 net/netfilter/ipvs/ip_vs_rr.c create mode 100644 net/netfilter/ipvs/ip_vs_sched.c create mode 100644 net/netfilter/ipvs/ip_vs_sed.c create mode 100644 net/netfilter/ipvs/ip_vs_sh.c create mode 100644 net/netfilter/ipvs/ip_vs_sync.c create mode 100644 net/netfilter/ipvs/ip_vs_wlc.c create mode 100644 net/netfilter/ipvs/ip_vs_wrr.c create mode 100644 net/netfilter/ipvs/ip_vs_xmit.c create mode 100644 net/netfilter/nf_conntrack_acct.c create mode 100644 net/netfilter/nf_conntrack_amanda.c create mode 100644 net/netfilter/nf_conntrack_broadcast.c create mode 100644 net/netfilter/nf_conntrack_core.c create mode 100644 net/netfilter/nf_conntrack_ecache.c create mode 100644 net/netfilter/nf_conntrack_expect.c create mode 100644 net/netfilter/nf_conntrack_extend.c create mode 100644 net/netfilter/nf_conntrack_ftp.c create mode 100644 net/netfilter/nf_conntrack_h323_asn1.c create mode 100644 net/netfilter/nf_conntrack_h323_main.c create mode 100644 net/netfilter/nf_conntrack_h323_types.c create mode 100644 net/netfilter/nf_conntrack_helper.c create mode 100644 net/netfilter/nf_conntrack_irc.c create mode 100644 net/netfilter/nf_conntrack_l3proto_generic.c create mode 100644 net/netfilter/nf_conntrack_netbios_ns.c create mode 100644 net/netfilter/nf_conntrack_netlink.c create mode 100644 net/netfilter/nf_conntrack_pptp.c create mode 100644 net/netfilter/nf_conntrack_proto.c create mode 100644 net/netfilter/nf_conntrack_proto_dccp.c create mode 100644 net/netfilter/nf_conntrack_proto_generic.c create mode 100644 net/netfilter/nf_conntrack_proto_gre.c create mode 100644 net/netfilter/nf_conntrack_proto_sctp.c create mode 100644 net/netfilter/nf_conntrack_proto_tcp.c create mode 100644 
net/netfilter/nf_conntrack_proto_udp.c create mode 100644 net/netfilter/nf_conntrack_proto_udplite.c create mode 100644 net/netfilter/nf_conntrack_sane.c create mode 100644 net/netfilter/nf_conntrack_sip.c create mode 100644 net/netfilter/nf_conntrack_snmp.c create mode 100644 net/netfilter/nf_conntrack_standalone.c create mode 100644 net/netfilter/nf_conntrack_tftp.c create mode 100644 net/netfilter/nf_conntrack_timestamp.c create mode 100644 net/netfilter/nf_internals.h create mode 100644 net/netfilter/nf_log.c create mode 100644 net/netfilter/nf_queue.c create mode 100644 net/netfilter/nf_sockopt.c create mode 100644 net/netfilter/nf_tproxy_core.c create mode 100644 net/netfilter/nfnetlink.c create mode 100644 net/netfilter/nfnetlink_log.c create mode 100644 net/netfilter/nfnetlink_queue.c create mode 100644 net/netfilter/x_tables.c create mode 100644 net/netfilter/xt_AUDIT.c create mode 100644 net/netfilter/xt_CHECKSUM.c create mode 100644 net/netfilter/xt_CLASSIFY.c create mode 100644 net/netfilter/xt_CONNSECMARK.c create mode 100644 net/netfilter/xt_CT.c create mode 100644 net/netfilter/xt_IDLETIMER.c create mode 100644 net/netfilter/xt_LED.c create mode 100644 net/netfilter/xt_NFLOG.c create mode 100644 net/netfilter/xt_NFQUEUE.c create mode 100644 net/netfilter/xt_NOTRACK.c create mode 100644 net/netfilter/xt_SECMARK.c create mode 100644 net/netfilter/xt_TCPOPTSTRIP.c create mode 100644 net/netfilter/xt_TEE.c create mode 100644 net/netfilter/xt_TPROXY.c create mode 100644 net/netfilter/xt_TRACE.c create mode 100644 net/netfilter/xt_addrtype.c create mode 100644 net/netfilter/xt_cluster.c create mode 100644 net/netfilter/xt_comment.c create mode 100644 net/netfilter/xt_connbytes.c create mode 100644 net/netfilter/xt_connlimit.c create mode 100644 net/netfilter/xt_connmark.c create mode 100644 net/netfilter/xt_conntrack.c create mode 100644 net/netfilter/xt_cpu.c create mode 100644 net/netfilter/xt_dccp.c create mode 100644 net/netfilter/xt_devgroup.c create mode 100644 net/netfilter/xt_dscp.c create mode 100644 net/netfilter/xt_esp.c create mode 100644 net/netfilter/xt_hashlimit.c create mode 100644 net/netfilter/xt_helper.c create mode 100644 net/netfilter/xt_hl.c create mode 100644 net/netfilter/xt_iprange.c create mode 100644 net/netfilter/xt_ipvs.c create mode 100644 net/netfilter/xt_length.c create mode 100644 net/netfilter/xt_limit.c create mode 100644 net/netfilter/xt_mac.c create mode 100644 net/netfilter/xt_mark.c create mode 100644 net/netfilter/xt_multiport.c create mode 100644 net/netfilter/xt_osf.c create mode 100644 net/netfilter/xt_owner.c create mode 100644 net/netfilter/xt_physdev.c create mode 100644 net/netfilter/xt_pkttype.c create mode 100644 net/netfilter/xt_policy.c create mode 100644 net/netfilter/xt_qtaguid.c create mode 100644 net/netfilter/xt_qtaguid_internal.h create mode 100644 net/netfilter/xt_qtaguid_print.c create mode 100644 net/netfilter/xt_qtaguid_print.h create mode 100644 net/netfilter/xt_quota.c create mode 100644 net/netfilter/xt_quota2.c create mode 100644 net/netfilter/xt_rateest.c create mode 100644 net/netfilter/xt_realm.c create mode 100644 net/netfilter/xt_recent.c create mode 100644 net/netfilter/xt_repldata.h create mode 100644 net/netfilter/xt_sctp.c create mode 100644 net/netfilter/xt_set.c create mode 100644 net/netfilter/xt_socket.c create mode 100644 net/netfilter/xt_state.c create mode 100644 net/netfilter/xt_statistic.c create mode 100644 net/netfilter/xt_string.c create mode 100644 net/netfilter/xt_tcpmss.c create mode 100644 
net/netfilter/xt_tcpudp.c create mode 100644 net/netfilter/xt_time.c create mode 100644 net/netfilter/xt_u32.c create mode 100644 net/netlabel/Kconfig create mode 100644 net/netlabel/Makefile create mode 100644 net/netlabel/netlabel_addrlist.c create mode 100644 net/netlabel/netlabel_addrlist.h create mode 100644 net/netlabel/netlabel_cipso_v4.c create mode 100644 net/netlabel/netlabel_cipso_v4.h create mode 100644 net/netlabel/netlabel_domainhash.c create mode 100644 net/netlabel/netlabel_domainhash.h create mode 100644 net/netlabel/netlabel_kapi.c create mode 100644 net/netlabel/netlabel_mgmt.c create mode 100644 net/netlabel/netlabel_mgmt.h create mode 100644 net/netlabel/netlabel_unlabeled.c create mode 100644 net/netlabel/netlabel_unlabeled.h create mode 100644 net/netlabel/netlabel_user.c create mode 100644 net/netlabel/netlabel_user.h create mode 100644 net/netlink/Makefile create mode 100644 net/netlink/af_netlink.c create mode 100644 net/netlink/genetlink.c create mode 100644 net/netrom/Makefile create mode 100644 net/netrom/af_netrom.c create mode 100644 net/netrom/nr_dev.c create mode 100644 net/netrom/nr_in.c create mode 100644 net/netrom/nr_loopback.c create mode 100644 net/netrom/nr_out.c create mode 100644 net/netrom/nr_route.c create mode 100644 net/netrom/nr_subr.c create mode 100644 net/netrom/nr_timer.c create mode 100644 net/netrom/sysctl_net_netrom.c create mode 100644 net/nonet.c create mode 100644 net/packet/Kconfig create mode 100644 net/packet/Makefile create mode 100644 net/packet/af_packet.c create mode 100644 net/phonet/Kconfig create mode 100644 net/phonet/Makefile create mode 100644 net/phonet/af_phonet.c create mode 100644 net/phonet/datagram.c create mode 100644 net/phonet/pep-gprs.c create mode 100644 net/phonet/pep.c create mode 100644 net/phonet/pn_dev.c create mode 100644 net/phonet/pn_netlink.c create mode 100644 net/phonet/socket.c create mode 100644 net/phonet/sysctl.c create mode 100644 net/rds/Kconfig create mode 100644 net/rds/Makefile create mode 100644 net/rds/af_rds.c create mode 100644 net/rds/bind.c create mode 100644 net/rds/cong.c create mode 100644 net/rds/connection.c create mode 100644 net/rds/ib.c create mode 100644 net/rds/ib.h create mode 100644 net/rds/ib_cm.c create mode 100644 net/rds/ib_rdma.c create mode 100644 net/rds/ib_recv.c create mode 100644 net/rds/ib_ring.c create mode 100644 net/rds/ib_send.c create mode 100644 net/rds/ib_stats.c create mode 100644 net/rds/ib_sysctl.c create mode 100644 net/rds/info.c create mode 100644 net/rds/info.h create mode 100644 net/rds/iw.c create mode 100644 net/rds/iw.h create mode 100644 net/rds/iw_cm.c create mode 100644 net/rds/iw_rdma.c create mode 100644 net/rds/iw_recv.c create mode 100644 net/rds/iw_ring.c create mode 100644 net/rds/iw_send.c create mode 100644 net/rds/iw_stats.c create mode 100644 net/rds/iw_sysctl.c create mode 100644 net/rds/loop.c create mode 100644 net/rds/loop.h create mode 100644 net/rds/message.c create mode 100644 net/rds/page.c create mode 100644 net/rds/rdma.c create mode 100644 net/rds/rdma_transport.c create mode 100644 net/rds/rdma_transport.h create mode 100644 net/rds/rds.h create mode 100644 net/rds/recv.c create mode 100644 net/rds/send.c create mode 100644 net/rds/stats.c create mode 100644 net/rds/sysctl.c create mode 100644 net/rds/tcp.c create mode 100644 net/rds/tcp.h create mode 100644 net/rds/tcp_connect.c create mode 100644 net/rds/tcp_listen.c create mode 100644 net/rds/tcp_recv.c create mode 100644 net/rds/tcp_send.c create mode 100644 
net/rds/tcp_stats.c create mode 100644 net/rds/threads.c create mode 100644 net/rds/transport.c create mode 100644 net/rds/xlist.h create mode 100644 net/rfkill/Kconfig create mode 100644 net/rfkill/Makefile create mode 100644 net/rfkill/core.c create mode 100644 net/rfkill/input.c create mode 100644 net/rfkill/rfkill-gpio.c create mode 100644 net/rfkill/rfkill-regulator.c create mode 100644 net/rfkill/rfkill.h create mode 100644 net/rose/Makefile create mode 100644 net/rose/af_rose.c create mode 100644 net/rose/rose_dev.c create mode 100644 net/rose/rose_in.c create mode 100644 net/rose/rose_link.c create mode 100644 net/rose/rose_loopback.c create mode 100644 net/rose/rose_out.c create mode 100644 net/rose/rose_route.c create mode 100644 net/rose/rose_subr.c create mode 100644 net/rose/rose_timer.c create mode 100644 net/rose/sysctl_net_rose.c create mode 100644 net/rxrpc/Kconfig create mode 100644 net/rxrpc/Makefile create mode 100644 net/rxrpc/af_rxrpc.c create mode 100644 net/rxrpc/ar-accept.c create mode 100644 net/rxrpc/ar-ack.c create mode 100644 net/rxrpc/ar-call.c create mode 100644 net/rxrpc/ar-connection.c create mode 100644 net/rxrpc/ar-connevent.c create mode 100644 net/rxrpc/ar-error.c create mode 100644 net/rxrpc/ar-input.c create mode 100644 net/rxrpc/ar-internal.h create mode 100644 net/rxrpc/ar-key.c create mode 100644 net/rxrpc/ar-local.c create mode 100644 net/rxrpc/ar-output.c create mode 100644 net/rxrpc/ar-peer.c create mode 100644 net/rxrpc/ar-proc.c create mode 100644 net/rxrpc/ar-recvmsg.c create mode 100644 net/rxrpc/ar-security.c create mode 100644 net/rxrpc/ar-skbuff.c create mode 100644 net/rxrpc/ar-transport.c create mode 100644 net/rxrpc/rxkad.c create mode 100644 net/sched/Kconfig create mode 100644 net/sched/Makefile create mode 100644 net/sched/act_api.c create mode 100644 net/sched/act_csum.c create mode 100644 net/sched/act_gact.c create mode 100644 net/sched/act_ipt.c create mode 100644 net/sched/act_mirred.c create mode 100644 net/sched/act_nat.c create mode 100644 net/sched/act_pedit.c create mode 100644 net/sched/act_police.c create mode 100644 net/sched/act_simple.c create mode 100644 net/sched/act_skbedit.c create mode 100644 net/sched/cls_api.c create mode 100644 net/sched/cls_basic.c create mode 100644 net/sched/cls_cgroup.c create mode 100644 net/sched/cls_flow.c create mode 100644 net/sched/cls_fw.c create mode 100644 net/sched/cls_route.c create mode 100644 net/sched/cls_rsvp.c create mode 100644 net/sched/cls_rsvp.h create mode 100644 net/sched/cls_rsvp6.c create mode 100644 net/sched/cls_tcindex.c create mode 100644 net/sched/cls_u32.c create mode 100644 net/sched/em_cmp.c create mode 100644 net/sched/em_meta.c create mode 100644 net/sched/em_nbyte.c create mode 100644 net/sched/em_text.c create mode 100644 net/sched/em_u32.c create mode 100644 net/sched/ematch.c create mode 100644 net/sched/sch_api.c create mode 100644 net/sched/sch_atm.c create mode 100644 net/sched/sch_blackhole.c create mode 100644 net/sched/sch_cbq.c create mode 100644 net/sched/sch_choke.c create mode 100644 net/sched/sch_drr.c create mode 100644 net/sched/sch_dsmark.c create mode 100644 net/sched/sch_fifo.c create mode 100644 net/sched/sch_generic.c create mode 100644 net/sched/sch_gred.c create mode 100644 net/sched/sch_hfsc.c create mode 100644 net/sched/sch_htb.c create mode 100644 net/sched/sch_ingress.c create mode 100644 net/sched/sch_mq.c create mode 100644 net/sched/sch_mqprio.c create mode 100644 net/sched/sch_multiq.c create mode 100644 
net/sched/sch_netem.c create mode 100644 net/sched/sch_prio.c create mode 100644 net/sched/sch_qfq.c create mode 100644 net/sched/sch_red.c create mode 100644 net/sched/sch_sfb.c create mode 100644 net/sched/sch_sfq.c create mode 100644 net/sched/sch_tbf.c create mode 100644 net/sched/sch_teql.c create mode 100644 net/sctp/Kconfig create mode 100644 net/sctp/Makefile create mode 100644 net/sctp/associola.c create mode 100644 net/sctp/auth.c create mode 100644 net/sctp/bind_addr.c create mode 100644 net/sctp/chunk.c create mode 100644 net/sctp/command.c create mode 100644 net/sctp/debug.c create mode 100644 net/sctp/endpointola.c create mode 100644 net/sctp/input.c create mode 100644 net/sctp/inqueue.c create mode 100644 net/sctp/ipv6.c create mode 100644 net/sctp/objcnt.c create mode 100644 net/sctp/output.c create mode 100644 net/sctp/outqueue.c create mode 100644 net/sctp/primitive.c create mode 100644 net/sctp/probe.c create mode 100644 net/sctp/proc.c create mode 100644 net/sctp/protocol.c create mode 100644 net/sctp/sm_make_chunk.c create mode 100644 net/sctp/sm_sideeffect.c create mode 100644 net/sctp/sm_statefuns.c create mode 100644 net/sctp/sm_statetable.c create mode 100644 net/sctp/socket.c create mode 100644 net/sctp/ssnmap.c create mode 100644 net/sctp/sysctl.c create mode 100644 net/sctp/transport.c create mode 100644 net/sctp/tsnmap.c create mode 100644 net/sctp/ulpevent.c create mode 100644 net/sctp/ulpqueue.c create mode 100644 net/socket.c create mode 100644 net/sunrpc/Kconfig create mode 100644 net/sunrpc/Makefile create mode 100644 net/sunrpc/addr.c create mode 100644 net/sunrpc/auth.c create mode 100644 net/sunrpc/auth_generic.c create mode 100644 net/sunrpc/auth_gss/Makefile create mode 100644 net/sunrpc/auth_gss/auth_gss.c create mode 100644 net/sunrpc/auth_gss/gss_generic_token.c create mode 100644 net/sunrpc/auth_gss/gss_krb5_crypto.c create mode 100644 net/sunrpc/auth_gss/gss_krb5_keys.c create mode 100644 net/sunrpc/auth_gss/gss_krb5_mech.c create mode 100644 net/sunrpc/auth_gss/gss_krb5_seal.c create mode 100644 net/sunrpc/auth_gss/gss_krb5_seqnum.c create mode 100644 net/sunrpc/auth_gss/gss_krb5_unseal.c create mode 100644 net/sunrpc/auth_gss/gss_krb5_wrap.c create mode 100644 net/sunrpc/auth_gss/gss_mech_switch.c create mode 100644 net/sunrpc/auth_gss/svcauth_gss.c create mode 100644 net/sunrpc/auth_null.c create mode 100644 net/sunrpc/auth_unix.c create mode 100644 net/sunrpc/backchannel_rqst.c create mode 100644 net/sunrpc/bc_svc.c create mode 100644 net/sunrpc/cache.c create mode 100644 net/sunrpc/clnt.c create mode 100644 net/sunrpc/netns.h create mode 100644 net/sunrpc/rpc_pipe.c create mode 100644 net/sunrpc/rpcb_clnt.c create mode 100644 net/sunrpc/sched.c create mode 100644 net/sunrpc/socklib.c create mode 100644 net/sunrpc/stats.c create mode 100644 net/sunrpc/sunrpc.h create mode 100644 net/sunrpc/sunrpc_syms.c create mode 100644 net/sunrpc/svc.c create mode 100644 net/sunrpc/svc_xprt.c create mode 100644 net/sunrpc/svcauth.c create mode 100644 net/sunrpc/svcauth_unix.c create mode 100644 net/sunrpc/svcsock.c create mode 100644 net/sunrpc/sysctl.c create mode 100644 net/sunrpc/timer.c create mode 100644 net/sunrpc/xdr.c create mode 100644 net/sunrpc/xprt.c create mode 100644 net/sunrpc/xprtrdma/Makefile create mode 100644 net/sunrpc/xprtrdma/rpc_rdma.c create mode 100644 net/sunrpc/xprtrdma/svc_rdma.c create mode 100644 net/sunrpc/xprtrdma/svc_rdma_marshal.c create mode 100644 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c create mode 100644 
net/sunrpc/xprtrdma/svc_rdma_sendto.c create mode 100644 net/sunrpc/xprtrdma/svc_rdma_transport.c create mode 100644 net/sunrpc/xprtrdma/transport.c create mode 100644 net/sunrpc/xprtrdma/verbs.c create mode 100644 net/sunrpc/xprtrdma/xprt_rdma.h create mode 100644 net/sunrpc/xprtsock.c create mode 100644 net/sysctl_net.c create mode 100644 net/tipc/Kconfig create mode 100644 net/tipc/Makefile create mode 100644 net/tipc/addr.c create mode 100644 net/tipc/addr.h create mode 100644 net/tipc/bcast.c create mode 100644 net/tipc/bcast.h create mode 100644 net/tipc/bearer.c create mode 100644 net/tipc/bearer.h create mode 100644 net/tipc/config.c create mode 100644 net/tipc/config.h create mode 100644 net/tipc/core.c create mode 100644 net/tipc/core.h create mode 100644 net/tipc/discover.c create mode 100644 net/tipc/discover.h create mode 100644 net/tipc/eth_media.c create mode 100644 net/tipc/handler.c create mode 100644 net/tipc/link.c create mode 100644 net/tipc/link.h create mode 100644 net/tipc/log.c create mode 100644 net/tipc/log.h create mode 100644 net/tipc/msg.c create mode 100644 net/tipc/msg.h create mode 100644 net/tipc/name_distr.c create mode 100644 net/tipc/name_distr.h create mode 100644 net/tipc/name_table.c create mode 100644 net/tipc/name_table.h create mode 100644 net/tipc/net.c create mode 100644 net/tipc/net.h create mode 100644 net/tipc/netlink.c create mode 100644 net/tipc/node.c create mode 100644 net/tipc/node.h create mode 100644 net/tipc/node_subscr.c create mode 100644 net/tipc/node_subscr.h create mode 100644 net/tipc/port.c create mode 100644 net/tipc/port.h create mode 100644 net/tipc/ref.c create mode 100644 net/tipc/ref.h create mode 100644 net/tipc/socket.c create mode 100644 net/tipc/subscr.c create mode 100644 net/tipc/subscr.h create mode 100644 net/unix/Kconfig create mode 100644 net/unix/Makefile create mode 100644 net/unix/af_unix.c create mode 100644 net/unix/garbage.c create mode 100644 net/unix/sysctl_net_unix.c create mode 100644 net/wanrouter/Kconfig create mode 100644 net/wanrouter/Makefile create mode 100644 net/wanrouter/patchlevel create mode 100644 net/wanrouter/wanmain.c create mode 100644 net/wanrouter/wanproc.c create mode 100644 net/wimax/Kconfig create mode 100644 net/wimax/Makefile create mode 100644 net/wimax/debug-levels.h create mode 100644 net/wimax/debugfs.c create mode 100644 net/wimax/id-table.c create mode 100644 net/wimax/op-msg.c create mode 100644 net/wimax/op-reset.c create mode 100644 net/wimax/op-rfkill.c create mode 100644 net/wimax/op-state-get.c create mode 100644 net/wimax/stack.c create mode 100644 net/wimax/wimax-internal.h create mode 100644 net/wireless/.gitignore create mode 100644 net/wireless/Kconfig create mode 100644 net/wireless/Makefile create mode 100644 net/wireless/chan.c create mode 100644 net/wireless/core.c create mode 100644 net/wireless/core.h create mode 100644 net/wireless/db.txt create mode 100644 net/wireless/debugfs.c create mode 100644 net/wireless/debugfs.h create mode 100644 net/wireless/ethtool.c create mode 100644 net/wireless/ethtool.h create mode 100644 net/wireless/genregdb.awk create mode 100644 net/wireless/ibss.c create mode 100644 net/wireless/lib80211.c create mode 100644 net/wireless/lib80211_crypt_ccmp.c create mode 100644 net/wireless/lib80211_crypt_tkip.c create mode 100644 net/wireless/lib80211_crypt_wep.c create mode 100644 net/wireless/mesh.c create mode 100644 net/wireless/mlme.c create mode 100644 net/wireless/nl80211.c create mode 100644 net/wireless/nl80211.h create mode 
100644 net/wireless/radiotap.c create mode 100644 net/wireless/reg.c create mode 100644 net/wireless/reg.h create mode 100644 net/wireless/regdb.h create mode 100644 net/wireless/scan.c create mode 100644 net/wireless/sme.c create mode 100644 net/wireless/sysfs.c create mode 100644 net/wireless/sysfs.h create mode 100644 net/wireless/util.c create mode 100644 net/wireless/wext-compat.c create mode 100644 net/wireless/wext-compat.h create mode 100644 net/wireless/wext-core.c create mode 100644 net/wireless/wext-priv.c create mode 100644 net/wireless/wext-proc.c create mode 100644 net/wireless/wext-sme.c create mode 100644 net/wireless/wext-spy.c create mode 100644 net/x25/Kconfig create mode 100644 net/x25/Makefile create mode 100644 net/x25/af_x25.c create mode 100644 net/x25/sysctl_net_x25.c create mode 100644 net/x25/x25_dev.c create mode 100644 net/x25/x25_facilities.c create mode 100644 net/x25/x25_forward.c create mode 100644 net/x25/x25_in.c create mode 100644 net/x25/x25_link.c create mode 100644 net/x25/x25_out.c create mode 100644 net/x25/x25_proc.c create mode 100644 net/x25/x25_route.c create mode 100644 net/x25/x25_subr.c create mode 100644 net/x25/x25_timer.c create mode 100644 net/xfrm/Kconfig create mode 100644 net/xfrm/Makefile create mode 100644 net/xfrm/xfrm_algo.c create mode 100644 net/xfrm/xfrm_hash.c create mode 100644 net/xfrm/xfrm_hash.h create mode 100644 net/xfrm/xfrm_input.c create mode 100644 net/xfrm/xfrm_ipcomp.c create mode 100644 net/xfrm/xfrm_output.c create mode 100644 net/xfrm/xfrm_policy.c create mode 100644 net/xfrm/xfrm_proc.c create mode 100644 net/xfrm/xfrm_replay.c create mode 100644 net/xfrm/xfrm_state.c create mode 100644 net/xfrm/xfrm_sysctl.c create mode 100644 net/xfrm/xfrm_user.c (limited to 'net') diff --git a/net/802/Kconfig b/net/802/Kconfig new file mode 100644 index 00000000..be33d27c --- /dev/null +++ b/net/802/Kconfig @@ -0,0 +1,7 @@ +config STP + tristate + select LLC + +config GARP + tristate + select STP diff --git a/net/802/Makefile b/net/802/Makefile new file mode 100644 index 00000000..7893d679 --- /dev/null +++ b/net/802/Makefile @@ -0,0 +1,14 @@ +# +# Makefile for the Linux 802.x protocol layers. +# + +# Check the p8022 selections against net/core/Makefile. +obj-$(CONFIG_LLC) += p8022.o psnap.o +obj-$(CONFIG_TR) += p8022.o psnap.o tr.o +obj-$(CONFIG_NET_FC) += fc.o +obj-$(CONFIG_FDDI) += fddi.o +obj-$(CONFIG_HIPPI) += hippi.o +obj-$(CONFIG_IPX) += p8022.o psnap.o p8023.o +obj-$(CONFIG_ATALK) += p8022.o psnap.o +obj-$(CONFIG_STP) += stp.o +obj-$(CONFIG_GARP) += garp.o diff --git a/net/802/fc.c b/net/802/fc.c new file mode 100644 index 00000000..1e49f2d4 --- /dev/null +++ b/net/802/fc.c @@ -0,0 +1,131 @@ +/* + * NET3: Fibre Channel device handling subroutines + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Vineet Abraham + * v 1.0 03/22/99 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Put the headers on a Fibre Channel packet. 
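As a rough illustration of the encapsulation that fc_header() below (and fddi_header() later in this patch) prepends for IP and ARP payloads, the standalone userspace sketch that follows builds the standard 8-byte 802.2 LLC/SNAP prefix: DSAP/SSAP 0xAA, UI control 0x03, a zero OUI, then the EtherType. The literal 0xAA/0x03 values are the usual IEEE ones and are assumed here to match the kernel's EXTENDED_SAP and UI_CMD definitions; this is a sketch, not the kernel code path itself.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

/* Illustrative only: the usual 802.2 LLC/SNAP prefix used when carrying
 * IP/ARP over FC or FDDI; assumed equivalent to EXTENDED_SAP/UI_CMD. */
struct snap_hdr {
	uint8_t  dsap;      /* 0xAA */
	uint8_t  ssap;      /* 0xAA */
	uint8_t  ctrl;      /* 0x03, LLC UI command */
	uint8_t  oui[3];    /* 00-00-00: encapsulated Ethernet */
	uint16_t ethertype; /* network byte order */
} __attribute__((packed));

static void snap_fill(struct snap_hdr *s, uint16_t type)
{
	s->dsap = s->ssap = 0xAA;
	s->ctrl = 0x03;
	memset(s->oui, 0, sizeof(s->oui));
	s->ethertype = htons(type);
}

int main(void)
{
	struct snap_hdr s;

	snap_fill(&s, 0x0800);	/* ETH_P_IP */
	for (size_t i = 0; i < sizeof(s); i++)
		printf("%02x ", ((unsigned char *)&s)[i]);
	printf("\n");
	return 0;
}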
+ */ + +static int fc_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, + const void *daddr, const void *saddr, unsigned len) +{ + struct fch_hdr *fch; + int hdr_len; + + /* + * Add the 802.2 SNAP header if IP as the IPv4 code calls + * dev->hard_header directly. + */ + if (type == ETH_P_IP || type == ETH_P_ARP) + { + struct fcllc *fcllc; + + hdr_len = sizeof(struct fch_hdr) + sizeof(struct fcllc); + fch = (struct fch_hdr *)skb_push(skb, hdr_len); + fcllc = (struct fcllc *)(fch+1); + fcllc->dsap = fcllc->ssap = EXTENDED_SAP; + fcllc->llc = UI_CMD; + fcllc->protid[0] = fcllc->protid[1] = fcllc->protid[2] = 0x00; + fcllc->ethertype = htons(type); + } + else + { + hdr_len = sizeof(struct fch_hdr); + fch = (struct fch_hdr *)skb_push(skb, hdr_len); + } + + if(saddr) + memcpy(fch->saddr,saddr,dev->addr_len); + else + memcpy(fch->saddr,dev->dev_addr,dev->addr_len); + + if(daddr) + { + memcpy(fch->daddr,daddr,dev->addr_len); + return hdr_len; + } + return -hdr_len; +} + +/* + * A neighbour discovery of some species (eg arp) has completed. We + * can now send the packet. + */ + +static int fc_rebuild_header(struct sk_buff *skb) +{ +#ifdef CONFIG_INET + struct fch_hdr *fch=(struct fch_hdr *)skb->data; + struct fcllc *fcllc=(struct fcllc *)(skb->data+sizeof(struct fch_hdr)); + if(fcllc->ethertype != htons(ETH_P_IP)) { + printk("fc_rebuild_header: Don't know how to resolve type %04X addresses ?\n", ntohs(fcllc->ethertype)); + return 0; + } + return arp_find(fch->daddr, skb); +#else + return 0; +#endif +} + +static const struct header_ops fc_header_ops = { + .create = fc_header, + .rebuild = fc_rebuild_header, +}; + +static void fc_setup(struct net_device *dev) +{ + dev->header_ops = &fc_header_ops; + dev->type = ARPHRD_IEEE802; + dev->hard_header_len = FC_HLEN; + dev->mtu = 2024; + dev->addr_len = FC_ALEN; + dev->tx_queue_len = 100; /* Long queues on fc */ + dev->flags = IFF_BROADCAST; + + memset(dev->broadcast, 0xFF, FC_ALEN); +} + +/** + * alloc_fcdev - Register fibre channel device + * @sizeof_priv: Size of additional driver-private structure to be allocated + * for this fibre channel device + * + * Fill in the fields of the device structure with fibre channel-generic values. + * + * Constructs a new net device, complete with a private data area of + * size @sizeof_priv. A 32-byte (not bit) alignment is enforced for + * this private data area. + */ +struct net_device *alloc_fcdev(int sizeof_priv) +{ + return alloc_netdev(sizeof_priv, "fc%d", fc_setup); +} +EXPORT_SYMBOL(alloc_fcdev); diff --git a/net/802/fddi.c b/net/802/fddi.c new file mode 100644 index 00000000..94b3ad08 --- /dev/null +++ b/net/802/fddi.c @@ -0,0 +1,215 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * FDDI-type device handling. + * + * Version: @(#)fddi.c 1.0.0 08/12/96 + * + * Authors: Lawrence V. Stefani, + * + * fddi.c is based on previous eth.c and tr.c work by + * Ross Biro + * Fred N. van Kempen, + * Mark Evans, + * Florian La Roche, + * Alan Cox, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes + * Alan Cox : New arp/rebuild header + * Maciej W. 
Rozycki : IPv6 support + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Create the FDDI MAC header for an arbitrary protocol layer + * + * saddr=NULL means use device source address + * daddr=NULL means leave destination address (eg unresolved arp) + */ + +static int fddi_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, + const void *daddr, const void *saddr, unsigned len) +{ + int hl = FDDI_K_SNAP_HLEN; + struct fddihdr *fddi; + + if(type != ETH_P_IP && type != ETH_P_IPV6 && type != ETH_P_ARP) + hl=FDDI_K_8022_HLEN-3; + fddi = (struct fddihdr *)skb_push(skb, hl); + fddi->fc = FDDI_FC_K_ASYNC_LLC_DEF; + if(type == ETH_P_IP || type == ETH_P_IPV6 || type == ETH_P_ARP) + { + fddi->hdr.llc_snap.dsap = FDDI_EXTENDED_SAP; + fddi->hdr.llc_snap.ssap = FDDI_EXTENDED_SAP; + fddi->hdr.llc_snap.ctrl = FDDI_UI_CMD; + fddi->hdr.llc_snap.oui[0] = 0x00; + fddi->hdr.llc_snap.oui[1] = 0x00; + fddi->hdr.llc_snap.oui[2] = 0x00; + fddi->hdr.llc_snap.ethertype = htons(type); + } + + /* Set the source and destination hardware addresses */ + + if (saddr != NULL) + memcpy(fddi->saddr, saddr, dev->addr_len); + else + memcpy(fddi->saddr, dev->dev_addr, dev->addr_len); + + if (daddr != NULL) + { + memcpy(fddi->daddr, daddr, dev->addr_len); + return hl; + } + + return -hl; +} + + +/* + * Rebuild the FDDI MAC header. This is called after an ARP + * (or in future other address resolution) has completed on + * this sk_buff. We now let ARP fill in the other fields. + */ + +static int fddi_rebuild_header(struct sk_buff *skb) +{ + struct fddihdr *fddi = (struct fddihdr *)skb->data; + +#ifdef CONFIG_INET + if (fddi->hdr.llc_snap.ethertype == htons(ETH_P_IP)) + /* Try to get ARP to resolve the header and fill destination address */ + return arp_find(fddi->daddr, skb); + else +#endif + { + printk("%s: Don't know how to resolve type %04X addresses.\n", + skb->dev->name, ntohs(fddi->hdr.llc_snap.ethertype)); + return 0; + } +} + + +/* + * Determine the packet's protocol ID and fill in skb fields. + * This routine is called before an incoming packet is passed + * up. It's used to fill in specific skb fields and to set + * the proper pointer to the start of packet data (skb->data). + */ + +__be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev) +{ + struct fddihdr *fddi = (struct fddihdr *)skb->data; + __be16 type; + + /* + * Set mac.raw field to point to FC byte, set data field to point + * to start of packet data. Assume 802.2 SNAP frames for now. 
+ */ + + skb->dev = dev; + skb_reset_mac_header(skb); /* point to frame control (FC) */ + + if(fddi->hdr.llc_8022_1.dsap==0xe0) + { + skb_pull(skb, FDDI_K_8022_HLEN-3); + type = htons(ETH_P_802_2); + } + else + { + skb_pull(skb, FDDI_K_SNAP_HLEN); /* adjust for 21 byte header */ + type=fddi->hdr.llc_snap.ethertype; + } + + /* Set packet type based on destination address and flag settings */ + + if (*fddi->daddr & 0x01) + { + if (memcmp(fddi->daddr, dev->broadcast, FDDI_K_ALEN) == 0) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + } + + else if (dev->flags & IFF_PROMISC) + { + if (memcmp(fddi->daddr, dev->dev_addr, FDDI_K_ALEN)) + skb->pkt_type = PACKET_OTHERHOST; + } + + /* Assume 802.2 SNAP frames, for now */ + + return type; +} + +EXPORT_SYMBOL(fddi_type_trans); + +int fddi_change_mtu(struct net_device *dev, int new_mtu) +{ + if ((new_mtu < FDDI_K_SNAP_HLEN) || (new_mtu > FDDI_K_SNAP_DLEN)) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} +EXPORT_SYMBOL(fddi_change_mtu); + +static const struct header_ops fddi_header_ops = { + .create = fddi_header, + .rebuild = fddi_rebuild_header, +}; + + +static void fddi_setup(struct net_device *dev) +{ + dev->header_ops = &fddi_header_ops; + dev->type = ARPHRD_FDDI; + dev->hard_header_len = FDDI_K_SNAP_HLEN+3; /* Assume 802.2 SNAP hdr len + 3 pad bytes */ + dev->mtu = FDDI_K_SNAP_DLEN; /* Assume max payload of 802.2 SNAP frame */ + dev->addr_len = FDDI_K_ALEN; + dev->tx_queue_len = 100; /* Long queues on FDDI */ + dev->flags = IFF_BROADCAST | IFF_MULTICAST; + + memset(dev->broadcast, 0xFF, FDDI_K_ALEN); +} + +/** + * alloc_fddidev - Register FDDI device + * @sizeof_priv: Size of additional driver-private structure to be allocated + * for this FDDI device + * + * Fill in the fields of the device structure with FDDI-generic values. + * + * Constructs a new net device, complete with a private data area of + * size @sizeof_priv. A 32-byte (not bit) alignment is enforced for + * this private data area. + */ +struct net_device *alloc_fddidev(int sizeof_priv) +{ + return alloc_netdev(sizeof_priv, "fddi%d", fddi_setup); +} +EXPORT_SYMBOL(alloc_fddidev); + +MODULE_LICENSE("GPL"); diff --git a/net/802/garp.c b/net/802/garp.c new file mode 100644 index 00000000..16102951 --- /dev/null +++ b/net/802/garp.c @@ -0,0 +1,635 @@ +/* + * IEEE 802.1D Generic Attribute Registration Protocol (GARP) + * + * Copyright (c) 2008 Patrick McHardy + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. 
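alloc_fcdev() and alloc_fddidev() above follow the same shape: a thin wrapper around alloc_netdev() that hands over a link-type setup() callback to fill in type, MTU and broadcast address. The userspace sketch below mimics that constructor-plus-setup-callback pattern with a toy device structure, purely to make the shape explicit; the structure and field names are invented for illustration and only loosely mirror struct net_device.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy stand-in for struct net_device: only the fields the setup hook touches. */
struct toy_dev {
	char name[16];
	unsigned short type;
	unsigned int mtu;
	unsigned char broadcast[6];
	void *priv;		/* driver-private area, like netdev_priv() */
};

/* Analogue of alloc_netdev(sizeof_priv, name, setup). */
static struct toy_dev *toy_alloc_dev(size_t sizeof_priv, const char *name,
				     void (*setup)(struct toy_dev *))
{
	struct toy_dev *dev = calloc(1, sizeof(*dev) + sizeof_priv);

	if (!dev)
		return NULL;
	snprintf(dev->name, sizeof(dev->name), "%s", name);
	dev->priv = dev + 1;	/* private area follows the device struct */
	setup(dev);		/* link-type defaults, as fddi_setup() does */
	return dev;
}

static void toy_fddi_setup(struct toy_dev *dev)
{
	dev->type = 774;	/* ARPHRD_FDDI */
	dev->mtu = 4470;	/* roughly FDDI_K_SNAP_DLEN */
	memset(dev->broadcast, 0xFF, sizeof(dev->broadcast));
}

int main(void)
{
	struct toy_dev *dev = toy_alloc_dev(64, "fddi%d", toy_fddi_setup);

	if (dev)
		printf("%s: type %hu mtu %u\n", dev->name, dev->type, dev->mtu);
	free(dev);
	return 0;
}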
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned int garp_join_time __read_mostly = 200; +module_param(garp_join_time, uint, 0644); +MODULE_PARM_DESC(garp_join_time, "Join time in ms (default 200ms)"); +MODULE_LICENSE("GPL"); + +static const struct garp_state_trans { + u8 state; + u8 action; +} garp_applicant_state_table[GARP_APPLICANT_MAX + 1][GARP_EVENT_MAX + 1] = { + [GARP_APPLICANT_VA] = { + [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_AA, + .action = GARP_ACTION_S_JOIN_IN }, + [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_AA }, + [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VA }, + [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VA }, + [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VA }, + [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP }, + [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID }, + [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_LA }, + }, + [GARP_APPLICANT_AA] = { + [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_QA, + .action = GARP_ACTION_S_JOIN_IN }, + [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QA }, + [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VA }, + [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VA }, + [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VA }, + [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP }, + [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID }, + [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_LA }, + }, + [GARP_APPLICANT_QA] = { + [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID }, + [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QA }, + [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VA }, + [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VA }, + [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VP }, + [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP }, + [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID }, + [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_LA }, + }, + [GARP_APPLICANT_LA] = { + [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_VO, + .action = GARP_ACTION_S_LEAVE_EMPTY }, + [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_LA }, + [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VO }, + [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_LA }, + [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_LA }, + [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VO }, + [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_VA }, + [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_INVALID }, + }, + [GARP_APPLICANT_VP] = { + [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_AA, + .action = GARP_ACTION_S_JOIN_IN }, + [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_AP }, + [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VP }, + [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VP }, + [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VP }, + [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP }, + [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID }, + [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_VO }, + }, + [GARP_APPLICANT_AP] = { + [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_QA, + .action = GARP_ACTION_S_JOIN_IN }, + [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QP }, + [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VP }, + [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VP }, + [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VP 
}, + [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP }, + [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID }, + [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_AO }, + }, + [GARP_APPLICANT_QP] = { + [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID }, + [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QP }, + [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VP }, + [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VP }, + [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VP }, + [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP }, + [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID }, + [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_QO }, + }, + [GARP_APPLICANT_VO] = { + [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID }, + [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_AO }, + [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VO }, + [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VO }, + [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VO }, + [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VO }, + [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_VP }, + [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_INVALID }, + }, + [GARP_APPLICANT_AO] = { + [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID }, + [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QO }, + [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VO }, + [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VO }, + [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VO }, + [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VO }, + [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_AP }, + [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_INVALID }, + }, + [GARP_APPLICANT_QO] = { + [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID }, + [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QO }, + [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VO }, + [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VO }, + [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VO }, + [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VO }, + [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_QP }, + [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_INVALID }, + }, +}; + +static int garp_attr_cmp(const struct garp_attr *attr, + const void *data, u8 len, u8 type) +{ + if (attr->type != type) + return attr->type - type; + if (attr->dlen != len) + return attr->dlen - len; + return memcmp(attr->data, data, len); +} + +static struct garp_attr *garp_attr_lookup(const struct garp_applicant *app, + const void *data, u8 len, u8 type) +{ + struct rb_node *parent = app->gid.rb_node; + struct garp_attr *attr; + int d; + + while (parent) { + attr = rb_entry(parent, struct garp_attr, node); + d = garp_attr_cmp(attr, data, len, type); + if (d < 0) + parent = parent->rb_left; + else if (d > 0) + parent = parent->rb_right; + else + return attr; + } + return NULL; +} + +static void garp_attr_insert(struct garp_applicant *app, struct garp_attr *new) +{ + struct rb_node *parent = NULL, **p = &app->gid.rb_node; + struct garp_attr *attr; + int d; + + while (*p) { + parent = *p; + attr = rb_entry(parent, struct garp_attr, node); + d = garp_attr_cmp(attr, new->data, new->dlen, new->type); + if (d < 0) + p = &parent->rb_left; + else if (d > 0) + p = &parent->rb_right; + } + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, &app->gid); +} + +static struct garp_attr *garp_attr_create(struct garp_applicant *app, + const void 
*data, u8 len, u8 type) +{ + struct garp_attr *attr; + + attr = kmalloc(sizeof(*attr) + len, GFP_ATOMIC); + if (!attr) + return attr; + attr->state = GARP_APPLICANT_VO; + attr->type = type; + attr->dlen = len; + memcpy(attr->data, data, len); + garp_attr_insert(app, attr); + return attr; +} + +static void garp_attr_destroy(struct garp_applicant *app, struct garp_attr *attr) +{ + rb_erase(&attr->node, &app->gid); + kfree(attr); +} + +static int garp_pdu_init(struct garp_applicant *app) +{ + struct sk_buff *skb; + struct garp_pdu_hdr *gp; + +#define LLC_RESERVE sizeof(struct llc_pdu_un) + skb = alloc_skb(app->dev->mtu + LL_RESERVED_SPACE(app->dev), + GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + skb->dev = app->dev; + skb->protocol = htons(ETH_P_802_2); + skb_reserve(skb, LL_RESERVED_SPACE(app->dev) + LLC_RESERVE); + + gp = (struct garp_pdu_hdr *)__skb_put(skb, sizeof(*gp)); + put_unaligned(htons(GARP_PROTOCOL_ID), &gp->protocol); + + app->pdu = skb; + return 0; +} + +static int garp_pdu_append_end_mark(struct garp_applicant *app) +{ + if (skb_tailroom(app->pdu) < sizeof(u8)) + return -1; + *(u8 *)__skb_put(app->pdu, sizeof(u8)) = GARP_END_MARK; + return 0; +} + +static void garp_pdu_queue(struct garp_applicant *app) +{ + if (!app->pdu) + return; + + garp_pdu_append_end_mark(app); + garp_pdu_append_end_mark(app); + + llc_pdu_header_init(app->pdu, LLC_PDU_TYPE_U, LLC_SAP_BSPAN, + LLC_SAP_BSPAN, LLC_PDU_CMD); + llc_pdu_init_as_ui_cmd(app->pdu); + llc_mac_hdr_init(app->pdu, app->dev->dev_addr, + app->app->proto.group_address); + + skb_queue_tail(&app->queue, app->pdu); + app->pdu = NULL; +} + +static void garp_queue_xmit(struct garp_applicant *app) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(&app->queue))) + dev_queue_xmit(skb); +} + +static int garp_pdu_append_msg(struct garp_applicant *app, u8 attrtype) +{ + struct garp_msg_hdr *gm; + + if (skb_tailroom(app->pdu) < sizeof(*gm)) + return -1; + gm = (struct garp_msg_hdr *)__skb_put(app->pdu, sizeof(*gm)); + gm->attrtype = attrtype; + garp_cb(app->pdu)->cur_type = attrtype; + return 0; +} + +static int garp_pdu_append_attr(struct garp_applicant *app, + const struct garp_attr *attr, + enum garp_attr_event event) +{ + struct garp_attr_hdr *ga; + unsigned int len; + int err; +again: + if (!app->pdu) { + err = garp_pdu_init(app); + if (err < 0) + return err; + } + + if (garp_cb(app->pdu)->cur_type != attr->type) { + if (garp_cb(app->pdu)->cur_type && + garp_pdu_append_end_mark(app) < 0) + goto queue; + if (garp_pdu_append_msg(app, attr->type) < 0) + goto queue; + } + + len = sizeof(*ga) + attr->dlen; + if (skb_tailroom(app->pdu) < len) + goto queue; + ga = (struct garp_attr_hdr *)__skb_put(app->pdu, len); + ga->len = len; + ga->event = event; + memcpy(ga->data, attr->data, attr->dlen); + return 0; + +queue: + garp_pdu_queue(app); + goto again; +} + +static void garp_attr_event(struct garp_applicant *app, + struct garp_attr *attr, enum garp_event event) +{ + enum garp_applicant_state state; + + state = garp_applicant_state_table[attr->state][event].state; + if (state == GARP_APPLICANT_INVALID) + return; + + switch (garp_applicant_state_table[attr->state][event].action) { + case GARP_ACTION_NONE: + break; + case GARP_ACTION_S_JOIN_IN: + /* When appending the attribute fails, don't update state in + * order to retry on next TRANSMIT_PDU event. 
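garp_attr_event() drives each attribute purely by table lookup: garp_applicant_state_table[current_state][event] yields the next state plus an optional transmit action, and GARP_APPLICANT_INVALID marks events that are ignored in a given state. The self-contained sketch below reproduces that table-driven dispatch with a two-state toy machine; the state, event and action names are made up for illustration and are not GARP's.

#include <stdio.h>

enum toy_state  { TOY_IDLE, TOY_ACTIVE, TOY_STATE_MAX = TOY_ACTIVE };
enum toy_event  { TOY_EV_START, TOY_EV_STOP, TOY_EV_MAX = TOY_EV_STOP };
enum toy_action { TOY_ACT_NONE, TOY_ACT_SEND };
#define TOY_INVALID 0xff	/* like GARP_APPLICANT_INVALID */

/* Same shape as garp_applicant_state_table: [state][event] -> {state, action}. */
static const struct toy_trans {
	unsigned char state;
	unsigned char action;
} toy_table[TOY_STATE_MAX + 1][TOY_EV_MAX + 1] = {
	[TOY_IDLE] = {
		[TOY_EV_START] = { .state = TOY_ACTIVE, .action = TOY_ACT_SEND },
		[TOY_EV_STOP]  = { .state = TOY_INVALID },
	},
	[TOY_ACTIVE] = {
		[TOY_EV_START] = { .state = TOY_INVALID },
		[TOY_EV_STOP]  = { .state = TOY_IDLE },
	},
};

static void toy_event(unsigned char *state, enum toy_event ev)
{
	const struct toy_trans *t = &toy_table[*state][ev];

	if (t->state == TOY_INVALID)	/* event not valid in this state */
		return;
	if (t->action == TOY_ACT_SEND)	/* side effect before entering new state */
		printf("transmit, then enter state %u\n", t->state);
	*state = t->state;
}

int main(void)
{
	unsigned char s = TOY_IDLE;

	toy_event(&s, TOY_EV_START);	/* IDLE -> ACTIVE, with side effect */
	toy_event(&s, TOY_EV_START);	/* invalid in ACTIVE: ignored */
	toy_event(&s, TOY_EV_STOP);	/* ACTIVE -> IDLE */
	printf("final state %u\n", s);
	return 0;
}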
+		 */
+		if (garp_pdu_append_attr(app, attr, GARP_JOIN_IN) < 0)
+			return;
+		break;
+	case GARP_ACTION_S_LEAVE_EMPTY:
+		garp_pdu_append_attr(app, attr, GARP_LEAVE_EMPTY);
+		/* As a pure applicant, sending a leave message implies that
+		 * the attribute was unregistered and can be destroyed. */
+		garp_attr_destroy(app, attr);
+		return;
+	default:
+		WARN_ON(1);
+	}
+
+	attr->state = state;
+}
+
+int garp_request_join(const struct net_device *dev,
+		      const struct garp_application *appl,
+		      const void *data, u8 len, u8 type)
+{
+	struct garp_port *port = rtnl_dereference(dev->garp_port);
+	struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]);
+	struct garp_attr *attr;
+
+	spin_lock_bh(&app->lock);
+	attr = garp_attr_create(app, data, len, type);
+	if (!attr) {
+		spin_unlock_bh(&app->lock);
+		return -ENOMEM;
+	}
+	garp_attr_event(app, attr, GARP_EVENT_REQ_JOIN);
+	spin_unlock_bh(&app->lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(garp_request_join);
+
+void garp_request_leave(const struct net_device *dev,
+			const struct garp_application *appl,
+			const void *data, u8 len, u8 type)
+{
+	struct garp_port *port = rtnl_dereference(dev->garp_port);
+	struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]);
+	struct garp_attr *attr;
+
+	spin_lock_bh(&app->lock);
+	attr = garp_attr_lookup(app, data, len, type);
+	if (!attr) {
+		spin_unlock_bh(&app->lock);
+		return;
+	}
+	garp_attr_event(app, attr, GARP_EVENT_REQ_LEAVE);
+	spin_unlock_bh(&app->lock);
+}
+EXPORT_SYMBOL_GPL(garp_request_leave);
+
+static void garp_gid_event(struct garp_applicant *app, enum garp_event event)
+{
+	struct rb_node *node, *next;
+	struct garp_attr *attr;
+
+	for (node = rb_first(&app->gid);
+	     next = node ? rb_next(node) : NULL, node != NULL;
+	     node = next) {
+		attr = rb_entry(node, struct garp_attr, node);
+		garp_attr_event(app, attr, event);
+	}
+}
+
+static void garp_join_timer_arm(struct garp_applicant *app)
+{
+	unsigned long delay;
+
+	delay = (u64)msecs_to_jiffies(garp_join_time) * net_random() >> 32;
+	mod_timer(&app->join_timer, jiffies + delay);
+}
+
+static void garp_join_timer(unsigned long data)
+{
+	struct garp_applicant *app = (struct garp_applicant *)data;
+
+	spin_lock(&app->lock);
+	garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU);
+	garp_pdu_queue(app);
+	spin_unlock(&app->lock);
+
+	garp_queue_xmit(app);
+	garp_join_timer_arm(app);
+}
+
+static int garp_pdu_parse_end_mark(struct sk_buff *skb)
+{
+	if (!pskb_may_pull(skb, sizeof(u8)))
+		return -1;
+	if (*skb->data == GARP_END_MARK) {
+		skb_pull(skb, sizeof(u8));
+		return -1;
+	}
+	return 0;
+}
+
+static int garp_pdu_parse_attr(struct garp_applicant *app, struct sk_buff *skb,
+			       u8 attrtype)
+{
+	const struct garp_attr_hdr *ga;
+	struct garp_attr *attr;
+	enum garp_event event;
+	unsigned int dlen;
+
+	if (!pskb_may_pull(skb, sizeof(*ga)))
+		return -1;
+	ga = (struct garp_attr_hdr *)skb->data;
+	if (ga->len < sizeof(*ga))
+		return -1;
+
+	if (!pskb_may_pull(skb, ga->len))
+		return -1;
+	skb_pull(skb, ga->len);
+	dlen = ga->len - sizeof(*ga);
+
+	if (attrtype > app->app->maxattr)
+		return 0;
+
+	switch (ga->event) {
+	case GARP_LEAVE_ALL:
+		if (dlen != 0)
+			return -1;
+		garp_gid_event(app, GARP_EVENT_R_LEAVE_EMPTY);
+		return 0;
+	case GARP_JOIN_EMPTY:
+		event = GARP_EVENT_R_JOIN_EMPTY;
+		break;
+	case GARP_JOIN_IN:
+		event = GARP_EVENT_R_JOIN_IN;
+		break;
+	case GARP_LEAVE_EMPTY:
+		event = GARP_EVENT_R_LEAVE_EMPTY;
+		break;
+	case GARP_EMPTY:
+		event = GARP_EVENT_R_EMPTY;
+		break;
+	default:
+		return 0;
+	}
+
+	if (dlen == 0)
+		return
-1; + attr = garp_attr_lookup(app, ga->data, dlen, attrtype); + if (attr == NULL) + return 0; + garp_attr_event(app, attr, event); + return 0; +} + +static int garp_pdu_parse_msg(struct garp_applicant *app, struct sk_buff *skb) +{ + const struct garp_msg_hdr *gm; + + if (!pskb_may_pull(skb, sizeof(*gm))) + return -1; + gm = (struct garp_msg_hdr *)skb->data; + if (gm->attrtype == 0) + return -1; + skb_pull(skb, sizeof(*gm)); + + while (skb->len > 0) { + if (garp_pdu_parse_attr(app, skb, gm->attrtype) < 0) + return -1; + if (garp_pdu_parse_end_mark(skb) < 0) + break; + } + return 0; +} + +static void garp_pdu_rcv(const struct stp_proto *proto, struct sk_buff *skb, + struct net_device *dev) +{ + struct garp_application *appl = proto->data; + struct garp_port *port; + struct garp_applicant *app; + const struct garp_pdu_hdr *gp; + + port = rcu_dereference(dev->garp_port); + if (!port) + goto err; + app = rcu_dereference(port->applicants[appl->type]); + if (!app) + goto err; + + if (!pskb_may_pull(skb, sizeof(*gp))) + goto err; + gp = (struct garp_pdu_hdr *)skb->data; + if (get_unaligned(&gp->protocol) != htons(GARP_PROTOCOL_ID)) + goto err; + skb_pull(skb, sizeof(*gp)); + + spin_lock(&app->lock); + while (skb->len > 0) { + if (garp_pdu_parse_msg(app, skb) < 0) + break; + if (garp_pdu_parse_end_mark(skb) < 0) + break; + } + spin_unlock(&app->lock); +err: + kfree_skb(skb); +} + +static int garp_init_port(struct net_device *dev) +{ + struct garp_port *port; + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return -ENOMEM; + rcu_assign_pointer(dev->garp_port, port); + return 0; +} + +static void garp_release_port(struct net_device *dev) +{ + struct garp_port *port = rtnl_dereference(dev->garp_port); + unsigned int i; + + for (i = 0; i <= GARP_APPLICATION_MAX; i++) { + if (rtnl_dereference(port->applicants[i])) + return; + } + rcu_assign_pointer(dev->garp_port, NULL); + kfree_rcu(port, rcu); +} + +int garp_init_applicant(struct net_device *dev, struct garp_application *appl) +{ + struct garp_applicant *app; + int err; + + ASSERT_RTNL(); + + if (!rtnl_dereference(dev->garp_port)) { + err = garp_init_port(dev); + if (err < 0) + goto err1; + } + + err = -ENOMEM; + app = kzalloc(sizeof(*app), GFP_KERNEL); + if (!app) + goto err2; + + err = dev_mc_add(dev, appl->proto.group_address); + if (err < 0) + goto err3; + + app->dev = dev; + app->app = appl; + app->gid = RB_ROOT; + spin_lock_init(&app->lock); + skb_queue_head_init(&app->queue); + rcu_assign_pointer(dev->garp_port->applicants[appl->type], app); + setup_timer(&app->join_timer, garp_join_timer, (unsigned long)app); + garp_join_timer_arm(app); + return 0; + +err3: + kfree(app); +err2: + garp_release_port(dev); +err1: + return err; +} +EXPORT_SYMBOL_GPL(garp_init_applicant); + +void garp_uninit_applicant(struct net_device *dev, struct garp_application *appl) +{ + struct garp_port *port = rtnl_dereference(dev->garp_port); + struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]); + + ASSERT_RTNL(); + + rcu_assign_pointer(port->applicants[appl->type], NULL); + + /* Delete timer and generate a final TRANSMIT_PDU event to flush out + * all pending messages before the applicant is gone. 
*/ + del_timer_sync(&app->join_timer); + garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU); + garp_pdu_queue(app); + garp_queue_xmit(app); + + dev_mc_del(dev, appl->proto.group_address); + kfree_rcu(app, rcu); + garp_release_port(dev); +} +EXPORT_SYMBOL_GPL(garp_uninit_applicant); + +int garp_register_application(struct garp_application *appl) +{ + appl->proto.rcv = garp_pdu_rcv; + appl->proto.data = appl; + return stp_proto_register(&appl->proto); +} +EXPORT_SYMBOL_GPL(garp_register_application); + +void garp_unregister_application(struct garp_application *appl) +{ + stp_proto_unregister(&appl->proto); +} +EXPORT_SYMBOL_GPL(garp_unregister_application); diff --git a/net/802/hippi.c b/net/802/hippi.c new file mode 100644 index 00000000..91aca878 --- /dev/null +++ b/net/802/hippi.c @@ -0,0 +1,235 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * HIPPI-type device handling. + * + * Version: @(#)hippi.c 1.0.0 05/29/97 + * + * Authors: Ross Biro + * Fred N. van Kempen, + * Mark Evans, + * Florian La Roche, + * Alan Cox, + * Jes Sorensen, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Create the HIPPI MAC header for an arbitrary protocol layer + * + * saddr=NULL means use device source address + * daddr=NULL means leave destination address (eg unresolved arp) + */ + +static int hippi_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, + const void *daddr, const void *saddr, unsigned len) +{ + struct hippi_hdr *hip = (struct hippi_hdr *)skb_push(skb, HIPPI_HLEN); + struct hippi_cb *hcb = (struct hippi_cb *) skb->cb; + + if (!len){ + len = skb->len - HIPPI_HLEN; + printk("hippi_header(): length not supplied\n"); + } + + /* + * Due to the stupidity of the little endian byte-order we + * have to set the fp field this way. + */ + hip->fp.fixed = htonl(0x04800018); + hip->fp.d2_size = htonl(len + 8); + hip->le.fc = 0; + hip->le.double_wide = 0; /* only HIPPI 800 for the time being */ + hip->le.message_type = 0; /* Data PDU */ + + hip->le.dest_addr_type = 2; /* 12 bit SC address */ + hip->le.src_addr_type = 2; /* 12 bit SC address */ + + memcpy(hip->le.src_switch_addr, dev->dev_addr + 3, 3); + memset(&hip->le.reserved, 0, 16); + + hip->snap.dsap = HIPPI_EXTENDED_SAP; + hip->snap.ssap = HIPPI_EXTENDED_SAP; + hip->snap.ctrl = HIPPI_UI_CMD; + hip->snap.oui[0] = 0x00; + hip->snap.oui[1] = 0x00; + hip->snap.oui[2] = 0x00; + hip->snap.ethertype = htons(type); + + if (daddr) + { + memcpy(hip->le.dest_switch_addr, daddr + 3, 3); + memcpy(&hcb->ifield, daddr + 2, 4); + return HIPPI_HLEN; + } + hcb->ifield = 0; + return -((int)HIPPI_HLEN); +} + + +/* + * Rebuild the HIPPI MAC header. This is called after an ARP has + * completed on this sk_buff. We now let ARP fill in the other fields. 
+ */ + +static int hippi_rebuild_header(struct sk_buff *skb) +{ + struct hippi_hdr *hip = (struct hippi_hdr *)skb->data; + + /* + * Only IP is currently supported + */ + + if(hip->snap.ethertype != htons(ETH_P_IP)) + { + printk(KERN_DEBUG "%s: unable to resolve type %X addresses.\n",skb->dev->name,ntohs(hip->snap.ethertype)); + return 0; + } + + /* + * We don't support dynamic ARP on HIPPI, but we use the ARP + * static ARP tables to hold the I-FIELDs. + */ + return arp_find(hip->le.daddr, skb); +} + + +/* + * Determine the packet's protocol ID. + */ + +__be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev) +{ + struct hippi_hdr *hip; + + /* + * This is actually wrong ... question is if we really should + * set the raw address here. + */ + skb->dev = dev; + skb_reset_mac_header(skb); + hip = (struct hippi_hdr *)skb_mac_header(skb); + skb_pull(skb, HIPPI_HLEN); + + /* + * No fancy promisc stuff here now. + */ + + return hip->snap.ethertype; +} + +EXPORT_SYMBOL(hippi_type_trans); + +int hippi_change_mtu(struct net_device *dev, int new_mtu) +{ + /* + * HIPPI's got these nice large MTUs. + */ + if ((new_mtu < 68) || (new_mtu > 65280)) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} +EXPORT_SYMBOL(hippi_change_mtu); + +/* + * For HIPPI we will actually use the lower 4 bytes of the hardware + * address as the I-FIELD rather than the actual hardware address. + */ +int hippi_mac_addr(struct net_device *dev, void *p) +{ + struct sockaddr *addr = p; + if (netif_running(dev)) + return -EBUSY; + memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); + return 0; +} +EXPORT_SYMBOL(hippi_mac_addr); + +int hippi_neigh_setup_dev(struct net_device *dev, struct neigh_parms *p) +{ + /* Never send broadcast/multicast ARP messages */ + p->mcast_probes = 0; + + /* In IPv6 unicast probes are valid even on NBMA, + * because they are encapsulated in normal IPv6 protocol. + * Should be a generic flag. + */ + if (p->tbl->family != AF_INET6) + p->ucast_probes = 0; + return 0; +} +EXPORT_SYMBOL(hippi_neigh_setup_dev); + +static const struct header_ops hippi_header_ops = { + .create = hippi_header, + .rebuild = hippi_rebuild_header, +}; + + +static void hippi_setup(struct net_device *dev) +{ + dev->header_ops = &hippi_header_ops; + + /* + * We don't support HIPPI `ARP' for the time being, and probably + * never will unless someone else implements it. However we + * still need a fake ARPHRD to make ifconfig and friends play ball. + */ + dev->type = ARPHRD_HIPPI; + dev->hard_header_len = HIPPI_HLEN; + dev->mtu = 65280; + dev->addr_len = HIPPI_ALEN; + dev->tx_queue_len = 25 /* 5 */; + memset(dev->broadcast, 0xFF, HIPPI_ALEN); + + + /* + * HIPPI doesn't support broadcast+multicast and we only use + * static ARP tables. ARP is disabled by hippi_neigh_setup_dev. + */ + dev->flags = 0; +} + +/** + * alloc_hippi_dev - Register HIPPI device + * @sizeof_priv: Size of additional driver-private structure to be allocated + * for this HIPPI device + * + * Fill in the fields of the device structure with HIPPI-generic values. + * + * Constructs a new net device, complete with a private data area of + * size @sizeof_priv. A 32-byte (not bit) alignment is enforced for + * this private data area. 
+ */ + +struct net_device *alloc_hippi_dev(int sizeof_priv) +{ + return alloc_netdev(sizeof_priv, "hip%d", hippi_setup); +} + +EXPORT_SYMBOL(alloc_hippi_dev); diff --git a/net/802/p8022.c b/net/802/p8022.c new file mode 100644 index 00000000..7f353c4f --- /dev/null +++ b/net/802/p8022.c @@ -0,0 +1,67 @@ +/* + * NET3: Support for 802.2 demultiplexing off Ethernet (Token ring + * is kept separate see p8022tr.c) + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Demultiplex 802.2 encoded protocols. We match the entry by the + * SSAP/DSAP pair and then deliver to the registered datalink that + * matches. The control byte is ignored and handling of such items + * is up to the routine passed the frame. + * + * Unlike the 802.3 datalink we have a list of 802.2 entries as + * there are multiple protocols to demux. The list is currently + * short (3 or 4 entries at most). The current demux assumes this. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int p8022_request(struct datalink_proto *dl, struct sk_buff *skb, + unsigned char *dest) +{ + llc_build_and_send_ui_pkt(dl->sap, skb, dest, dl->sap->laddr.lsap); + return 0; +} + +struct datalink_proto *register_8022_client(unsigned char type, + int (*func)(struct sk_buff *skb, + struct net_device *dev, + struct packet_type *pt, + struct net_device *orig_dev)) +{ + struct datalink_proto *proto; + + proto = kmalloc(sizeof(*proto), GFP_ATOMIC); + if (proto) { + proto->type[0] = type; + proto->header_length = 3; + proto->request = p8022_request; + proto->sap = llc_sap_open(type, func); + if (!proto->sap) { + kfree(proto); + proto = NULL; + } + } + return proto; +} + +void unregister_8022_client(struct datalink_proto *proto) +{ + llc_sap_put(proto->sap); + kfree(proto); +} + +EXPORT_SYMBOL(register_8022_client); +EXPORT_SYMBOL(unregister_8022_client); + +MODULE_LICENSE("GPL"); diff --git a/net/802/p8023.c b/net/802/p8023.c new file mode 100644 index 00000000..1256a40d --- /dev/null +++ b/net/802/p8023.c @@ -0,0 +1,64 @@ +/* + * NET3: 802.3 data link hooks used for IPX 802.3 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 802.3 isn't really a protocol data link layer. Some old IPX stuff + * uses it however. Note that there is only one 802.3 protocol layer + * in the system. We don't currently support different protocols + * running raw 802.3 on different devices. Thankfully nobody else + * has done anything like the old IPX. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * Place an 802.3 header on a packet. The driver will do the mac + * addresses, we just need to give it the buffer length. + */ +static int p8023_request(struct datalink_proto *dl, + struct sk_buff *skb, unsigned char *dest_node) +{ + struct net_device *dev = skb->dev; + + dev_hard_header(skb, dev, ETH_P_802_3, dest_node, NULL, skb->len); + return dev_queue_xmit(skb); +} + +/* + * Create an 802.3 client. 
Note there can be only one 802.3 client + */ +struct datalink_proto *make_8023_client(void) +{ + struct datalink_proto *proto = kmalloc(sizeof(*proto), GFP_ATOMIC); + + if (proto) { + proto->header_length = 0; + proto->request = p8023_request; + } + return proto; +} + +/* + * Destroy the 802.3 client. + */ +void destroy_8023_client(struct datalink_proto *dl) +{ + kfree(dl); +} + +EXPORT_SYMBOL(destroy_8023_client); +EXPORT_SYMBOL(make_8023_client); + +MODULE_LICENSE("GPL"); diff --git a/net/802/psnap.c b/net/802/psnap.c new file mode 100644 index 00000000..db6baf7c --- /dev/null +++ b/net/802/psnap.c @@ -0,0 +1,167 @@ +/* + * SNAP data link layer. Derived from 802.2 + * + * Alan Cox , + * from the 802.2 layer by Greg Page. + * Merged in additions from Greg Page's psnap.c. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static LIST_HEAD(snap_list); +static DEFINE_SPINLOCK(snap_lock); +static struct llc_sap *snap_sap; + +/* + * Find a snap client by matching the 5 bytes. + */ +static struct datalink_proto *find_snap_client(const unsigned char *desc) +{ + struct datalink_proto *proto = NULL, *p; + + list_for_each_entry_rcu(p, &snap_list, node) { + if (!memcmp(p->type, desc, 5)) { + proto = p; + break; + } + } + return proto; +} + +/* + * A SNAP packet has arrived + */ +static int snap_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + int rc = 1; + struct datalink_proto *proto; + static struct packet_type snap_packet_type = { + .type = cpu_to_be16(ETH_P_SNAP), + }; + + if (unlikely(!pskb_may_pull(skb, 5))) + goto drop; + + rcu_read_lock(); + proto = find_snap_client(skb_transport_header(skb)); + if (proto) { + /* Pass the frame on. */ + skb->transport_header += 5; + skb_pull_rcsum(skb, 5); + rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev); + } + rcu_read_unlock(); + + if (unlikely(!proto)) + goto drop; + +out: + return rc; + +drop: + kfree_skb(skb); + goto out; +} + +/* + * Put a SNAP header on a frame and pass to 802.2 + */ +static int snap_request(struct datalink_proto *dl, + struct sk_buff *skb, u8 *dest) +{ + memcpy(skb_push(skb, 5), dl->type, 5); + llc_build_and_send_ui_pkt(snap_sap, skb, dest, snap_sap->laddr.lsap); + return 0; +} + +/* + * Set up the SNAP layer + */ +EXPORT_SYMBOL(register_snap_client); +EXPORT_SYMBOL(unregister_snap_client); + +static const char snap_err_msg[] __initconst = + KERN_CRIT "SNAP - unable to register with 802.2\n"; + +static int __init snap_init(void) +{ + snap_sap = llc_sap_open(0xAA, snap_rcv); + if (!snap_sap) { + printk(snap_err_msg); + return -EBUSY; + } + + return 0; +} + +module_init(snap_init); + +static void __exit snap_exit(void) +{ + llc_sap_put(snap_sap); +} + +module_exit(snap_exit); + + +/* + * Register SNAP clients. We don't yet use this for IP. 
+ */ +struct datalink_proto *register_snap_client(const unsigned char *desc, + int (*rcvfunc)(struct sk_buff *, + struct net_device *, + struct packet_type *, + struct net_device *)) +{ + struct datalink_proto *proto = NULL; + + spin_lock_bh(&snap_lock); + + if (find_snap_client(desc)) + goto out; + + proto = kmalloc(sizeof(*proto), GFP_ATOMIC); + if (proto) { + memcpy(proto->type, desc, 5); + proto->rcvfunc = rcvfunc; + proto->header_length = 5 + 3; /* snap + 802.2 */ + proto->request = snap_request; + list_add_rcu(&proto->node, &snap_list); + } +out: + spin_unlock_bh(&snap_lock); + + return proto; +} + +/* + * Unregister SNAP clients. Protocols no longer want to play with us ... + */ +void unregister_snap_client(struct datalink_proto *proto) +{ + spin_lock_bh(&snap_lock); + list_del_rcu(&proto->node); + spin_unlock_bh(&snap_lock); + + synchronize_net(); + + kfree(proto); +} + +MODULE_LICENSE("GPL"); diff --git a/net/802/stp.c b/net/802/stp.c new file mode 100644 index 00000000..978c30b1 --- /dev/null +++ b/net/802/stp.c @@ -0,0 +1,103 @@ +/* + * STP SAP demux + * + * Copyright (c) 2008 Patrick McHardy + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +/* 01:80:c2:00:00:20 - 01:80:c2:00:00:2F */ +#define GARP_ADDR_MIN 0x20 +#define GARP_ADDR_MAX 0x2F +#define GARP_ADDR_RANGE (GARP_ADDR_MAX - GARP_ADDR_MIN) + +static const struct stp_proto __rcu *garp_protos[GARP_ADDR_RANGE + 1] __read_mostly; +static const struct stp_proto __rcu *stp_proto __read_mostly; + +static struct llc_sap *sap __read_mostly; +static unsigned int sap_registered; +static DEFINE_MUTEX(stp_proto_mutex); + +/* Called under rcu_read_lock from LLC */ +static int stp_pdu_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + const struct ethhdr *eh = eth_hdr(skb); + const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + const struct stp_proto *proto; + + if (pdu->ssap != LLC_SAP_BSPAN || + pdu->dsap != LLC_SAP_BSPAN || + pdu->ctrl_1 != LLC_PDU_TYPE_U) + goto err; + + if (eh->h_dest[5] >= GARP_ADDR_MIN && eh->h_dest[5] <= GARP_ADDR_MAX) { + proto = rcu_dereference(garp_protos[eh->h_dest[5] - + GARP_ADDR_MIN]); + if (proto && + compare_ether_addr(eh->h_dest, proto->group_address)) + goto err; + } else + proto = rcu_dereference(stp_proto); + + if (!proto) + goto err; + + proto->rcv(proto, skb, dev); + return 0; + +err: + kfree_skb(skb); + return 0; +} + +int stp_proto_register(const struct stp_proto *proto) +{ + int err = 0; + + mutex_lock(&stp_proto_mutex); + if (sap_registered++ == 0) { + sap = llc_sap_open(LLC_SAP_BSPAN, stp_pdu_rcv); + if (!sap) { + err = -ENOMEM; + goto out; + } + } + if (is_zero_ether_addr(proto->group_address)) + rcu_assign_pointer(stp_proto, proto); + else + rcu_assign_pointer(garp_protos[proto->group_address[5] - + GARP_ADDR_MIN], proto); +out: + mutex_unlock(&stp_proto_mutex); + return err; +} +EXPORT_SYMBOL_GPL(stp_proto_register); + +void stp_proto_unregister(const struct stp_proto *proto) +{ + mutex_lock(&stp_proto_mutex); + if (is_zero_ether_addr(proto->group_address)) + rcu_assign_pointer(stp_proto, NULL); + else + rcu_assign_pointer(garp_protos[proto->group_address[5] - + GARP_ADDR_MIN], NULL); + synchronize_rcu(); + + if (--sap_registered == 0) + llc_sap_put(sap); + mutex_unlock(&stp_proto_mutex); +} 
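[Editor's illustration] The GARP code earlier in this patch is one consumer of this SAP demux: garp_register_application() fills a struct stp_proto with a receive callback and a group MAC address in the 01:80:c2:00:00:2X range and hands it to stp_proto_register(). A minimal, hypothetical client along the same lines might look as follows; the example_* names are illustrative and not part of this patch, and an all-zero group_address would instead claim the single bridge-STP slot.

#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/stp.h>

/* Receive callback invoked by stp_pdu_rcv() under rcu_read_lock(). */
static void example_rcv(const struct stp_proto *proto,
			struct sk_buff *skb, struct net_device *dev)
{
	/* proto->data could carry per-protocol state; here we just drop. */
	kfree_skb(skb);
}

static struct stp_proto example_proto = {
	/* Last byte 0x21 falls in the GARP_ADDR_MIN..GARP_ADDR_MAX range,
	 * so this selects one of the garp_protos[] slots above. */
	.group_address	= { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x21 },
	.rcv		= example_rcv,
};

static int __init example_init(void)
{
	/* Opens the LLC_SAP_BSPAN SAP on first registration. */
	return stp_proto_register(&example_proto);
}

static void __exit example_exit(void)
{
	stp_proto_unregister(&example_proto);
}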
+EXPORT_SYMBOL_GPL(stp_proto_unregister); + +MODULE_LICENSE("GPL"); diff --git a/net/802/tr.c b/net/802/tr.c new file mode 100644 index 00000000..5e20cf8a --- /dev/null +++ b/net/802/tr.c @@ -0,0 +1,677 @@ +/* + * NET3: Token ring device handling subroutines + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: 3 Feb 97 Paul Norton Minor routing fixes. + * Added rif table to /proc/net/tr_rif and rif timeout to + * /proc/sys/net/token-ring/rif_timeout. + * 22 Jun 98 Paul Norton Rearranged + * tr_header and tr_type_trans to handle passing IPX SNAP and + * 802.2 through the correct layers. Eliminated tr_reformat. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void tr_add_rif_info(struct trh_hdr *trh, struct net_device *dev); +static void rif_check_expire(unsigned long dummy); + +#define TR_SR_DEBUG 0 + +/* + * Each RIF entry we learn is kept this way + */ + +struct rif_cache { + unsigned char addr[TR_ALEN]; + int iface; + __be16 rcf; + __be16 rseg[8]; + struct rif_cache *next; + unsigned long last_used; + unsigned char local_ring; +}; + +#define RIF_TABLE_SIZE 32 + +/* + * We hash the RIF cache 32 ways. We do after all have to look it + * up a lot. + */ + +static struct rif_cache *rif_table[RIF_TABLE_SIZE]; + +static DEFINE_SPINLOCK(rif_lock); + + +/* + * Garbage disposal timer. + */ + +static struct timer_list rif_timer; + +static int sysctl_tr_rif_timeout = 60*10*HZ; + +static inline unsigned long rif_hash(const unsigned char *addr) +{ + unsigned long x; + + x = addr[0]; + x = (x << 2) ^ addr[1]; + x = (x << 2) ^ addr[2]; + x = (x << 2) ^ addr[3]; + x = (x << 2) ^ addr[4]; + x = (x << 2) ^ addr[5]; + + x ^= x >> 8; + + return x & (RIF_TABLE_SIZE - 1); +} + +/* + * Put the headers on a token ring packet. Token ring source routing + * makes this a little more exciting than on ethernet. + */ + +static int tr_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, + const void *daddr, const void *saddr, unsigned len) +{ + struct trh_hdr *trh; + int hdr_len; + + /* + * Add the 802.2 SNAP header if IP as the IPv4/IPv6 code calls + * dev->hard_header directly. + */ + if (type == ETH_P_IP || type == ETH_P_IPV6 || type == ETH_P_ARP) + { + struct trllc *trllc; + + hdr_len = sizeof(struct trh_hdr) + sizeof(struct trllc); + trh = (struct trh_hdr *)skb_push(skb, hdr_len); + trllc = (struct trllc *)(trh+1); + trllc->dsap = trllc->ssap = EXTENDED_SAP; + trllc->llc = UI_CMD; + trllc->protid[0] = trllc->protid[1] = trllc->protid[2] = 0x00; + trllc->ethertype = htons(type); + } + else + { + hdr_len = sizeof(struct trh_hdr); + trh = (struct trh_hdr *)skb_push(skb, hdr_len); + } + + trh->ac=AC; + trh->fc=LLC_FRAME; + + if(saddr) + memcpy(trh->saddr,saddr,dev->addr_len); + else + memcpy(trh->saddr,dev->dev_addr,dev->addr_len); + + /* + * Build the destination and then source route the frame + */ + + if(daddr) + { + memcpy(trh->daddr,daddr,dev->addr_len); + tr_source_route(skb, trh, dev); + return hdr_len; + } + + return -hdr_len; +} + +/* + * A neighbour discovery of some species (eg arp) has completed. We + * can now send the packet. 
+ */ + +static int tr_rebuild_header(struct sk_buff *skb) +{ + struct trh_hdr *trh=(struct trh_hdr *)skb->data; + struct trllc *trllc=(struct trllc *)(skb->data+sizeof(struct trh_hdr)); + struct net_device *dev = skb->dev; + + /* + * FIXME: We don't yet support IPv6 over token rings + */ + + if(trllc->ethertype != htons(ETH_P_IP)) { + printk("tr_rebuild_header: Don't know how to resolve type %04X addresses ?\n", ntohs(trllc->ethertype)); + return 0; + } + +#ifdef CONFIG_INET + if(arp_find(trh->daddr, skb)) { + return 1; + } + else +#endif + { + tr_source_route(skb,trh,dev); + return 0; + } +} + +/* + * Some of this is a bit hackish. We intercept RIF information + * used for source routing. We also grab IP directly and don't feed + * it via SNAP. + */ + +__be16 tr_type_trans(struct sk_buff *skb, struct net_device *dev) +{ + + struct trh_hdr *trh; + struct trllc *trllc; + unsigned riflen=0; + + skb->dev = dev; + skb_reset_mac_header(skb); + trh = tr_hdr(skb); + + if(trh->saddr[0] & TR_RII) + riflen = (ntohs(trh->rcf) & TR_RCF_LEN_MASK) >> 8; + + trllc = (struct trllc *)(skb->data+sizeof(struct trh_hdr)-TR_MAXRIFLEN+riflen); + + skb_pull(skb,sizeof(struct trh_hdr)-TR_MAXRIFLEN+riflen); + + if(*trh->daddr & 0x80) + { + if(!memcmp(trh->daddr,dev->broadcast,TR_ALEN)) + skb->pkt_type=PACKET_BROADCAST; + else + skb->pkt_type=PACKET_MULTICAST; + } + else if ( (trh->daddr[0] & 0x01) && (trh->daddr[1] & 0x00) && (trh->daddr[2] & 0x5E)) + { + skb->pkt_type=PACKET_MULTICAST; + } + else if(dev->flags & IFF_PROMISC) + { + if(memcmp(trh->daddr, dev->dev_addr, TR_ALEN)) + skb->pkt_type=PACKET_OTHERHOST; + } + + if ((skb->pkt_type != PACKET_BROADCAST) && + (skb->pkt_type != PACKET_MULTICAST)) + tr_add_rif_info(trh,dev) ; + + /* + * Strip the SNAP header from ARP packets since we don't + * pass them through to the 802.2/SNAP layers. + */ + + if (trllc->dsap == EXTENDED_SAP && + (trllc->ethertype == htons(ETH_P_IP) || + trllc->ethertype == htons(ETH_P_IPV6) || + trllc->ethertype == htons(ETH_P_ARP))) + { + skb_pull(skb, sizeof(struct trllc)); + return trllc->ethertype; + } + + return htons(ETH_P_TR_802_2); +} + +/* + * We try to do source routing... + */ + +void tr_source_route(struct sk_buff *skb,struct trh_hdr *trh, + struct net_device *dev) +{ + int slack; + unsigned int hash; + struct rif_cache *entry; + unsigned char *olddata; + unsigned long flags; + static const unsigned char mcast_func_addr[] + = {0xC0,0x00,0x00,0x04,0x00,0x00}; + + spin_lock_irqsave(&rif_lock, flags); + + /* + * Broadcasts are single route as stated in RFC 1042 + */ + if( (!memcmp(&(trh->daddr[0]),&(dev->broadcast[0]),TR_ALEN)) || + (!memcmp(&(trh->daddr[0]),&(mcast_func_addr[0]), TR_ALEN)) ) + { + trh->rcf=htons((((sizeof(trh->rcf)) << 8) & TR_RCF_LEN_MASK) + | TR_RCF_FRAME2K | TR_RCF_LIMITED_BROADCAST); + trh->saddr[0]|=TR_RII; + } + else + { + hash = rif_hash(trh->daddr); + /* + * Walk the hash table and look for an entry + */ + for(entry=rif_table[hash];entry && memcmp(&(entry->addr[0]),&(trh->daddr[0]),TR_ALEN);entry=entry->next); + + /* + * If we found an entry we can route the frame. 
+ */ + if(entry) + { +#if TR_SR_DEBUG +printk("source routing for %pM\n", trh->daddr); +#endif + if(!entry->local_ring && (ntohs(entry->rcf) & TR_RCF_LEN_MASK) >> 8) + { + trh->rcf=entry->rcf; + memcpy(&trh->rseg[0],&entry->rseg[0],8*sizeof(unsigned short)); + trh->rcf^=htons(TR_RCF_DIR_BIT); + trh->rcf&=htons(0x1fff); /* Issam Chehab */ + + trh->saddr[0]|=TR_RII; +#if TR_SR_DEBUG + printk("entry found with rcf %04x\n", entry->rcf); + } + else + { + printk("entry found but without rcf length, local=%02x\n", entry->local_ring); +#endif + } + entry->last_used=jiffies; + } + else + { + /* + * Without the information we simply have to shout + * on the wire. The replies should rapidly clean this + * situation up. + */ + trh->rcf=htons((((sizeof(trh->rcf)) << 8) & TR_RCF_LEN_MASK) + | TR_RCF_FRAME2K | TR_RCF_LIMITED_BROADCAST); + trh->saddr[0]|=TR_RII; +#if TR_SR_DEBUG + printk("no entry in rif table found - broadcasting frame\n"); +#endif + } + } + + /* Compress the RIF here so we don't have to do it in the driver(s) */ + if (!(trh->saddr[0] & 0x80)) + slack = 18; + else + slack = 18 - ((ntohs(trh->rcf) & TR_RCF_LEN_MASK)>>8); + olddata = skb->data; + spin_unlock_irqrestore(&rif_lock, flags); + + skb_pull(skb, slack); + memmove(skb->data, olddata, sizeof(struct trh_hdr) - slack); +} + +/* + * We have learned some new RIF information for our source + * routing. + */ + +static void tr_add_rif_info(struct trh_hdr *trh, struct net_device *dev) +{ + unsigned int hash, rii_p = 0; + unsigned long flags; + struct rif_cache *entry; + unsigned char saddr0; + + spin_lock_irqsave(&rif_lock, flags); + saddr0 = trh->saddr[0]; + + /* + * Firstly see if the entry exists + */ + + if(trh->saddr[0] & TR_RII) + { + trh->saddr[0]&=0x7f; + if (((ntohs(trh->rcf) & TR_RCF_LEN_MASK) >> 8) > 2) + { + rii_p = 1; + } + } + + hash = rif_hash(trh->saddr); + for(entry=rif_table[hash];entry && memcmp(&(entry->addr[0]),&(trh->saddr[0]),TR_ALEN);entry=entry->next); + + if(entry==NULL) + { +#if TR_SR_DEBUG + printk("adding rif_entry: addr:%pM rcf:%04X\n", + trh->saddr, ntohs(trh->rcf)); +#endif + /* + * Allocate our new entry. A failure to allocate loses + * use the information. This is harmless. + * + * FIXME: We ought to keep some kind of cache size + * limiting and adjust the timers to suit. + */ + entry=kmalloc(sizeof(struct rif_cache),GFP_ATOMIC); + + if(!entry) + { + printk(KERN_DEBUG "tr.c: Couldn't malloc rif cache entry !\n"); + spin_unlock_irqrestore(&rif_lock, flags); + return; + } + + memcpy(&(entry->addr[0]),&(trh->saddr[0]),TR_ALEN); + entry->iface = dev->ifindex; + entry->next=rif_table[hash]; + entry->last_used=jiffies; + rif_table[hash]=entry; + + if (rii_p) + { + entry->rcf = trh->rcf & htons((unsigned short)~TR_RCF_BROADCAST_MASK); + memcpy(&(entry->rseg[0]),&(trh->rseg[0]),8*sizeof(unsigned short)); + entry->local_ring = 0; + } + else + { + entry->local_ring = 1; + } + } + else /* Y. 
Tahara added */ + { + /* + * Update existing entries + */ + if (!entry->local_ring) + if (entry->rcf != (trh->rcf & htons((unsigned short)~TR_RCF_BROADCAST_MASK)) && + !(trh->rcf & htons(TR_RCF_BROADCAST_MASK))) + { +#if TR_SR_DEBUG +printk("updating rif_entry: addr:%pM rcf:%04X\n", + trh->saddr, ntohs(trh->rcf)); +#endif + entry->rcf = trh->rcf & htons((unsigned short)~TR_RCF_BROADCAST_MASK); + memcpy(&(entry->rseg[0]),&(trh->rseg[0]),8*sizeof(unsigned short)); + } + entry->last_used=jiffies; + } + trh->saddr[0]=saddr0; /* put the routing indicator back for tcpdump */ + spin_unlock_irqrestore(&rif_lock, flags); +} + +/* + * Scan the cache with a timer and see what we need to throw out. + */ + +static void rif_check_expire(unsigned long dummy) +{ + int i; + unsigned long flags, next_interval = jiffies + sysctl_tr_rif_timeout/2; + + spin_lock_irqsave(&rif_lock, flags); + + for(i =0; i < RIF_TABLE_SIZE; i++) { + struct rif_cache *entry, **pentry; + + pentry = rif_table+i; + while((entry=*pentry) != NULL) { + unsigned long expires + = entry->last_used + sysctl_tr_rif_timeout; + + if (time_before_eq(expires, jiffies)) { + *pentry = entry->next; + kfree(entry); + } else { + pentry = &entry->next; + + if (time_before(expires, next_interval)) + next_interval = expires; + } + } + } + + spin_unlock_irqrestore(&rif_lock, flags); + + mod_timer(&rif_timer, next_interval); + +} + +/* + * Generate the /proc/net information for the token ring RIF + * routing. + */ + +#ifdef CONFIG_PROC_FS + +static struct rif_cache *rif_get_idx(loff_t pos) +{ + int i; + struct rif_cache *entry; + loff_t off = 0; + + for(i = 0; i < RIF_TABLE_SIZE; i++) + for(entry = rif_table[i]; entry; entry = entry->next) { + if (off == pos) + return entry; + ++off; + } + + return NULL; +} + +static void *rif_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(&rif_lock) +{ + spin_lock_irq(&rif_lock); + + return *pos ? 
rif_get_idx(*pos - 1) : SEQ_START_TOKEN; +} + +static void *rif_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + int i; + struct rif_cache *ent = v; + + ++*pos; + + if (v == SEQ_START_TOKEN) { + i = -1; + goto scan; + } + + if (ent->next) + return ent->next; + + i = rif_hash(ent->addr); + scan: + while (++i < RIF_TABLE_SIZE) { + if ((ent = rif_table[i]) != NULL) + return ent; + } + return NULL; +} + +static void rif_seq_stop(struct seq_file *seq, void *v) + __releases(&rif_lock) +{ + spin_unlock_irq(&rif_lock); +} + +static int rif_seq_show(struct seq_file *seq, void *v) +{ + int j, rcf_len, segment, brdgnmb; + struct rif_cache *entry = v; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "if TR address TTL rcf routing segments\n"); + else { + struct net_device *dev = dev_get_by_index(&init_net, entry->iface); + long ttl = (long) (entry->last_used + sysctl_tr_rif_timeout) + - (long) jiffies; + + seq_printf(seq, "%s %pM %7li ", + dev?dev->name:"?", + entry->addr, + ttl/HZ); + + if (entry->local_ring) + seq_puts(seq, "local\n"); + else { + + seq_printf(seq, "%04X", ntohs(entry->rcf)); + rcf_len = ((ntohs(entry->rcf) & TR_RCF_LEN_MASK)>>8)-2; + if (rcf_len) + rcf_len >>= 1; + for(j = 1; j < rcf_len; j++) { + if(j==1) { + segment=ntohs(entry->rseg[j-1])>>4; + seq_printf(seq," %03X",segment); + } + + segment=ntohs(entry->rseg[j])>>4; + brdgnmb=ntohs(entry->rseg[j-1])&0x00f; + seq_printf(seq,"-%01X-%03X",brdgnmb,segment); + } + seq_putc(seq, '\n'); + } + + if (dev) + dev_put(dev); + } + return 0; +} + + +static const struct seq_operations rif_seq_ops = { + .start = rif_seq_start, + .next = rif_seq_next, + .stop = rif_seq_stop, + .show = rif_seq_show, +}; + +static int rif_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rif_seq_ops); +} + +static const struct file_operations rif_seq_fops = { + .owner = THIS_MODULE, + .open = rif_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif + +static const struct header_ops tr_header_ops = { + .create = tr_header, + .rebuild= tr_rebuild_header, +}; + +static void tr_setup(struct net_device *dev) +{ + /* + * Configure and register + */ + + dev->header_ops = &tr_header_ops; + + dev->type = ARPHRD_IEEE802_TR; + dev->hard_header_len = TR_HLEN; + dev->mtu = 2000; + dev->addr_len = TR_ALEN; + dev->tx_queue_len = 100; /* Long queues on tr */ + + memset(dev->broadcast,0xFF, TR_ALEN); + + /* New-style flags. */ + dev->flags = IFF_BROADCAST | IFF_MULTICAST ; +} + +/** + * alloc_trdev - Register token ring device + * @sizeof_priv: Size of additional driver-private structure to be allocated + * for this token ring device + * + * Fill in the fields of the device structure with token ring-generic values. + * + * Constructs a new net device, complete with a private data area of + * size @sizeof_priv. A 32-byte (not bit) alignment is enforced for + * this private data area. + */ +struct net_device *alloc_trdev(int sizeof_priv) +{ + return alloc_netdev(sizeof_priv, "tr%d", tr_setup); +} + +#ifdef CONFIG_SYSCTL +static struct ctl_table tr_table[] = { + { + .procname = "rif_timeout", + .data = &sysctl_tr_rif_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { }, +}; + +static __initdata struct ctl_path tr_path[] = { + { .procname = "net", }, + { .procname = "token-ring", }, + { } +}; +#endif + +/* + * Called during bootup. We don't actually have to initialise + * too much for this. 
+ */ + +static int __init rif_init(void) +{ + rif_timer.expires = jiffies + sysctl_tr_rif_timeout; + setup_timer(&rif_timer, rif_check_expire, 0); + add_timer(&rif_timer); +#ifdef CONFIG_SYSCTL + register_sysctl_paths(tr_path, tr_table); +#endif + proc_net_fops_create(&init_net, "tr_rif", S_IRUGO, &rif_seq_fops); + return 0; +} + +module_init(rif_init); + +EXPORT_SYMBOL(tr_type_trans); +EXPORT_SYMBOL(alloc_trdev); + +MODULE_LICENSE("GPL"); diff --git a/net/8021q/Kconfig b/net/8021q/Kconfig new file mode 100644 index 00000000..fa073a54 --- /dev/null +++ b/net/8021q/Kconfig @@ -0,0 +1,29 @@ +# +# Configuration for 802.1Q VLAN support +# + +config VLAN_8021Q + tristate "802.1Q VLAN Support" + ---help--- + Select this and you will be able to create 802.1Q VLAN interfaces + on your ethernet interfaces. 802.1Q VLAN supports almost + everything a regular ethernet interface does, including + firewalling, bridging, and of course IP traffic. You will need + the 'vconfig' tool from the VLAN project in order to effectively + use VLANs. See the VLAN web page for more information: + + + To compile this code as a module, choose M here: the module + will be called 8021q. + + If unsure, say N. + +config VLAN_8021Q_GVRP + bool "GVRP (GARP VLAN Registration Protocol) support" + depends on VLAN_8021Q + select GARP + help + Select this to enable GVRP end-system support. GVRP is used for + automatic propagation of registered VLANs to switches. + + If unsure, say N. diff --git a/net/8021q/Makefile b/net/8021q/Makefile new file mode 100644 index 00000000..9f4f174e --- /dev/null +++ b/net/8021q/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for the Linux VLAN layer. +# +obj-$(subst m,y,$(CONFIG_VLAN_8021Q)) += vlan_core.o +obj-$(CONFIG_VLAN_8021Q) += 8021q.o + +8021q-y := vlan.o vlan_dev.o vlan_netlink.o +8021q-$(CONFIG_VLAN_8021Q_GVRP) += vlan_gvrp.o +8021q-$(CONFIG_PROC_FS) += vlanproc.o + diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c new file mode 100644 index 00000000..917ecb93 --- /dev/null +++ b/net/8021q/vlan.c @@ -0,0 +1,722 @@ +/* + * INET 802.1Q VLAN + * Ethernet-type device handling. + * + * Authors: Ben Greear + * Please send support related email to: netdev@vger.kernel.org + * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html + * + * Fixes: + * Fix for packet capture - Nick Eggleston ; + * Add HW acceleration hooks - David S. Miller ; + * Correct all the locking - David S. Miller ; + * Use hash table for VLAN groups - David S. Miller + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "vlan.h" +#include "vlanproc.h" + +#define DRV_VERSION "1.8" + +/* Global VLAN variables */ + +int vlan_net_id __read_mostly; + +const char vlan_fullname[] = "802.1Q VLAN Support"; +const char vlan_version[] = DRV_VERSION; + +/* End of global variables definitions. 
*/ + +static void vlan_group_free(struct vlan_group *grp) +{ + int i; + + for (i = 0; i < VLAN_GROUP_ARRAY_SPLIT_PARTS; i++) + kfree(grp->vlan_devices_arrays[i]); + kfree(grp); +} + +static struct vlan_group *vlan_group_alloc(struct net_device *real_dev) +{ + struct vlan_group *grp; + + grp = kzalloc(sizeof(struct vlan_group), GFP_KERNEL); + if (!grp) + return NULL; + + grp->real_dev = real_dev; + return grp; +} + +static int vlan_group_prealloc_vid(struct vlan_group *vg, u16 vlan_id) +{ + struct net_device **array; + unsigned int size; + + ASSERT_RTNL(); + + array = vg->vlan_devices_arrays[vlan_id / VLAN_GROUP_ARRAY_PART_LEN]; + if (array != NULL) + return 0; + + size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN; + array = kzalloc(size, GFP_KERNEL); + if (array == NULL) + return -ENOBUFS; + + vg->vlan_devices_arrays[vlan_id / VLAN_GROUP_ARRAY_PART_LEN] = array; + return 0; +} + +static void vlan_rcu_free(struct rcu_head *rcu) +{ + vlan_group_free(container_of(rcu, struct vlan_group, rcu)); +} + +void unregister_vlan_dev(struct net_device *dev, struct list_head *head) +{ + struct vlan_dev_info *vlan = vlan_dev_info(dev); + struct net_device *real_dev = vlan->real_dev; + const struct net_device_ops *ops = real_dev->netdev_ops; + struct vlan_group *grp; + u16 vlan_id = vlan->vlan_id; + + ASSERT_RTNL(); + + grp = rtnl_dereference(real_dev->vlgrp); + BUG_ON(!grp); + + /* Take it out of our own structures, but be sure to interlock with + * HW accelerating devices or SW vlan input packet processing if + * VLAN is not 0 (leave it there for 802.1p). + */ + if (vlan_id && (real_dev->features & NETIF_F_HW_VLAN_FILTER)) + ops->ndo_vlan_rx_kill_vid(real_dev, vlan_id); + + grp->nr_vlans--; + + if (vlan->flags & VLAN_FLAG_GVRP) + vlan_gvrp_request_leave(dev); + + vlan_group_set_device(grp, vlan_id, NULL); + /* Because unregister_netdevice_queue() makes sure at least one rcu + * grace period is respected before device freeing, + * we dont need to call synchronize_net() here. + */ + unregister_netdevice_queue(dev, head); + + /* If the group is now empty, kill off the group. */ + if (grp->nr_vlans == 0) { + vlan_gvrp_uninit_applicant(real_dev); + + rcu_assign_pointer(real_dev->vlgrp, NULL); + if (ops->ndo_vlan_rx_register) + ops->ndo_vlan_rx_register(real_dev, NULL); + + /* Free the group, after all cpu's are done. 
*/ + call_rcu(&grp->rcu, vlan_rcu_free); + } + + /* Get rid of the vlan's reference to real_dev */ + dev_put(real_dev); +} + +int vlan_check_real_dev(struct net_device *real_dev, u16 vlan_id) +{ + const char *name = real_dev->name; + const struct net_device_ops *ops = real_dev->netdev_ops; + + if (real_dev->features & NETIF_F_VLAN_CHALLENGED) { + pr_info("8021q: VLANs not supported on %s\n", name); + return -EOPNOTSUPP; + } + + if ((real_dev->features & NETIF_F_HW_VLAN_FILTER) && + (!ops->ndo_vlan_rx_add_vid || !ops->ndo_vlan_rx_kill_vid)) { + pr_info("8021q: Device %s has buggy VLAN hw accel\n", name); + return -EOPNOTSUPP; + } + + if (vlan_find_dev(real_dev, vlan_id) != NULL) + return -EEXIST; + + return 0; +} + +int register_vlan_dev(struct net_device *dev) +{ + struct vlan_dev_info *vlan = vlan_dev_info(dev); + struct net_device *real_dev = vlan->real_dev; + const struct net_device_ops *ops = real_dev->netdev_ops; + u16 vlan_id = vlan->vlan_id; + struct vlan_group *grp, *ngrp = NULL; + int err; + + grp = rtnl_dereference(real_dev->vlgrp); + if (!grp) { + ngrp = grp = vlan_group_alloc(real_dev); + if (!grp) + return -ENOBUFS; + err = vlan_gvrp_init_applicant(real_dev); + if (err < 0) + goto out_free_group; + } + + err = vlan_group_prealloc_vid(grp, vlan_id); + if (err < 0) + goto out_uninit_applicant; + + err = register_netdevice(dev); + if (err < 0) + goto out_uninit_applicant; + + /* Account for reference in struct vlan_dev_info */ + dev_hold(real_dev); + + netif_stacked_transfer_operstate(real_dev, dev); + linkwatch_fire_event(dev); /* _MUST_ call rfc2863_policy() */ + + /* So, got the sucker initialized, now lets place + * it into our local structure. + */ + vlan_group_set_device(grp, vlan_id, dev); + grp->nr_vlans++; + + if (ngrp) { + if (ops->ndo_vlan_rx_register && (real_dev->features & NETIF_F_HW_VLAN_RX)) + ops->ndo_vlan_rx_register(real_dev, ngrp); + rcu_assign_pointer(real_dev->vlgrp, ngrp); + } + if (real_dev->features & NETIF_F_HW_VLAN_FILTER) + ops->ndo_vlan_rx_add_vid(real_dev, vlan_id); + + return 0; + +out_uninit_applicant: + if (ngrp) + vlan_gvrp_uninit_applicant(real_dev); +out_free_group: + if (ngrp) { + /* Free the group, after all cpu's are done. */ + call_rcu(&ngrp->rcu, vlan_rcu_free); + } + return err; +} + +/* Attach a VLAN device to a mac address (ie Ethernet Card). + * Returns 0 if the device was created or a negative error code otherwise. + */ +static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) +{ + struct net_device *new_dev; + struct net *net = dev_net(real_dev); + struct vlan_net *vn = net_generic(net, vlan_net_id); + char name[IFNAMSIZ]; + int err; + + if (vlan_id >= VLAN_VID_MASK) + return -ERANGE; + + err = vlan_check_real_dev(real_dev, vlan_id); + if (err < 0) + return err; + + /* Gotta set up the fields for the device. */ + switch (vn->name_type) { + case VLAN_NAME_TYPE_RAW_PLUS_VID: + /* name will look like: eth1.0005 */ + snprintf(name, IFNAMSIZ, "%s.%.4i", real_dev->name, vlan_id); + break; + case VLAN_NAME_TYPE_PLUS_VID_NO_PAD: + /* Put our vlan.VID in the name. + * Name will look like: vlan5 + */ + snprintf(name, IFNAMSIZ, "vlan%i", vlan_id); + break; + case VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD: + /* Put our vlan.VID in the name. + * Name will look like: eth0.5 + */ + snprintf(name, IFNAMSIZ, "%s.%i", real_dev->name, vlan_id); + break; + case VLAN_NAME_TYPE_PLUS_VID: + /* Put our vlan.VID in the name. 
+ * Name will look like: vlan0005 + */ + default: + snprintf(name, IFNAMSIZ, "vlan%.4i", vlan_id); + } + + new_dev = alloc_netdev(sizeof(struct vlan_dev_info), name, vlan_setup); + + if (new_dev == NULL) + return -ENOBUFS; + + dev_net_set(new_dev, net); + /* need 4 bytes for extra VLAN header info, + * hope the underlying device can handle it. + */ + new_dev->mtu = real_dev->mtu; + + vlan_dev_info(new_dev)->vlan_id = vlan_id; + vlan_dev_info(new_dev)->real_dev = real_dev; + vlan_dev_info(new_dev)->dent = NULL; + vlan_dev_info(new_dev)->flags = VLAN_FLAG_REORDER_HDR; + + new_dev->rtnl_link_ops = &vlan_link_ops; + err = register_vlan_dev(new_dev); + if (err < 0) + goto out_free_newdev; + + return 0; + +out_free_newdev: + free_netdev(new_dev); + return err; +} + +static void vlan_sync_address(struct net_device *dev, + struct net_device *vlandev) +{ + struct vlan_dev_info *vlan = vlan_dev_info(vlandev); + + /* May be called without an actual change */ + if (!compare_ether_addr(vlan->real_dev_addr, dev->dev_addr)) + return; + + /* vlan address was different from the old address and is equal to + * the new address */ + if (compare_ether_addr(vlandev->dev_addr, vlan->real_dev_addr) && + !compare_ether_addr(vlandev->dev_addr, dev->dev_addr)) + dev_uc_del(dev, vlandev->dev_addr); + + /* vlan address was equal to the old address and is different from + * the new address */ + if (!compare_ether_addr(vlandev->dev_addr, vlan->real_dev_addr) && + compare_ether_addr(vlandev->dev_addr, dev->dev_addr)) + dev_uc_add(dev, vlandev->dev_addr); + + memcpy(vlan->real_dev_addr, dev->dev_addr, ETH_ALEN); +} + +static void vlan_transfer_features(struct net_device *dev, + struct net_device *vlandev) +{ + vlandev->gso_max_size = dev->gso_max_size; + + if (dev->features & NETIF_F_HW_VLAN_TX) + vlandev->hard_header_len = dev->hard_header_len; + else + vlandev->hard_header_len = dev->hard_header_len + VLAN_HLEN; + +#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) + vlandev->fcoe_ddp_xid = dev->fcoe_ddp_xid; +#endif + + netdev_update_features(vlandev); +} + +static void __vlan_device_event(struct net_device *dev, unsigned long event) +{ + switch (event) { + case NETDEV_CHANGENAME: + vlan_proc_rem_dev(dev); + if (vlan_proc_add_dev(dev) < 0) + pr_warning("8021q: failed to change proc name for %s\n", + dev->name); + break; + case NETDEV_REGISTER: + if (vlan_proc_add_dev(dev) < 0) + pr_warning("8021q: failed to add proc entry for %s\n", + dev->name); + break; + case NETDEV_UNREGISTER: + vlan_proc_rem_dev(dev); + break; + } +} + +static int vlan_device_event(struct notifier_block *unused, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + struct vlan_group *grp; + int i, flgs; + struct net_device *vlandev; + struct vlan_dev_info *vlan; + LIST_HEAD(list); + + if (is_vlan_dev(dev)) + __vlan_device_event(dev, event); + + if ((event == NETDEV_UP) && + (dev->features & NETIF_F_HW_VLAN_FILTER) && + dev->netdev_ops->ndo_vlan_rx_add_vid) { + pr_info("8021q: adding VLAN 0 to HW filter on device %s\n", + dev->name); + dev->netdev_ops->ndo_vlan_rx_add_vid(dev, 0); + } + + grp = rtnl_dereference(dev->vlgrp); + if (!grp) + goto out; + + /* It is OK that we do not hold the group lock right now, + * as we run under the RTNL lock. 
+ */ + + switch (event) { + case NETDEV_CHANGE: + /* Propagate real device state to vlan devices */ + for (i = 0; i < VLAN_N_VID; i++) { + vlandev = vlan_group_get_device(grp, i); + if (!vlandev) + continue; + + netif_stacked_transfer_operstate(dev, vlandev); + } + break; + + case NETDEV_CHANGEADDR: + /* Adjust unicast filters on underlying device */ + for (i = 0; i < VLAN_N_VID; i++) { + vlandev = vlan_group_get_device(grp, i); + if (!vlandev) + continue; + + flgs = vlandev->flags; + if (!(flgs & IFF_UP)) + continue; + + vlan_sync_address(dev, vlandev); + } + break; + + case NETDEV_CHANGEMTU: + for (i = 0; i < VLAN_N_VID; i++) { + vlandev = vlan_group_get_device(grp, i); + if (!vlandev) + continue; + + if (vlandev->mtu <= dev->mtu) + continue; + + dev_set_mtu(vlandev, dev->mtu); + } + break; + + case NETDEV_FEAT_CHANGE: + /* Propagate device features to underlying device */ + for (i = 0; i < VLAN_N_VID; i++) { + vlandev = vlan_group_get_device(grp, i); + if (!vlandev) + continue; + + vlan_transfer_features(dev, vlandev); + } + + break; + + case NETDEV_DOWN: + /* Put all VLANs for this dev in the down state too. */ + for (i = 0; i < VLAN_N_VID; i++) { + vlandev = vlan_group_get_device(grp, i); + if (!vlandev) + continue; + + flgs = vlandev->flags; + if (!(flgs & IFF_UP)) + continue; + + vlan = vlan_dev_info(vlandev); + if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) + dev_change_flags(vlandev, flgs & ~IFF_UP); + netif_stacked_transfer_operstate(dev, vlandev); + } + break; + + case NETDEV_UP: + /* Put all VLANs for this dev in the up state too. */ + for (i = 0; i < VLAN_N_VID; i++) { + vlandev = vlan_group_get_device(grp, i); + if (!vlandev) + continue; + + flgs = vlandev->flags; + if (flgs & IFF_UP) + continue; + + vlan = vlan_dev_info(vlandev); + if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) + dev_change_flags(vlandev, flgs | IFF_UP); + netif_stacked_transfer_operstate(dev, vlandev); + } + break; + + case NETDEV_UNREGISTER: + /* twiddle thumbs on netns device moves */ + if (dev->reg_state != NETREG_UNREGISTERING) + break; + + for (i = 0; i < VLAN_N_VID; i++) { + vlandev = vlan_group_get_device(grp, i); + if (!vlandev) + continue; + + /* unregistration of last vlan destroys group, abort + * afterwards */ + if (grp->nr_vlans == 1) + i = VLAN_N_VID; + + unregister_vlan_dev(vlandev, &list); + } + unregister_netdevice_many(&list); + break; + + case NETDEV_PRE_TYPE_CHANGE: + /* Forbid underlaying device to change its type. */ + return NOTIFY_BAD; + + case NETDEV_NOTIFY_PEERS: + case NETDEV_BONDING_FAILOVER: + /* Propagate to vlan devices */ + for (i = 0; i < VLAN_N_VID; i++) { + vlandev = vlan_group_get_device(grp, i); + if (!vlandev) + continue; + + call_netdevice_notifiers(event, vlandev); + } + break; + } + +out: + return NOTIFY_DONE; +} + +static struct notifier_block vlan_notifier_block __read_mostly = { + .notifier_call = vlan_device_event, +}; + +/* + * VLAN IOCTL handler. + * o execute requested action or pass command to the device driver + * arg is really a struct vlan_ioctl_args __user *. + */ +static int vlan_ioctl_handler(struct net *net, void __user *arg) +{ + int err; + struct vlan_ioctl_args args; + struct net_device *dev = NULL; + + if (copy_from_user(&args, arg, sizeof(struct vlan_ioctl_args))) + return -EFAULT; + + /* Null terminate this sucker, just in case. 
*/ + args.device1[23] = 0; + args.u.device2[23] = 0; + + rtnl_lock(); + + switch (args.cmd) { + case SET_VLAN_INGRESS_PRIORITY_CMD: + case SET_VLAN_EGRESS_PRIORITY_CMD: + case SET_VLAN_FLAG_CMD: + case ADD_VLAN_CMD: + case DEL_VLAN_CMD: + case GET_VLAN_REALDEV_NAME_CMD: + case GET_VLAN_VID_CMD: + err = -ENODEV; + dev = __dev_get_by_name(net, args.device1); + if (!dev) + goto out; + + err = -EINVAL; + if (args.cmd != ADD_VLAN_CMD && !is_vlan_dev(dev)) + goto out; + } + + switch (args.cmd) { + case SET_VLAN_INGRESS_PRIORITY_CMD: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + vlan_dev_set_ingress_priority(dev, + args.u.skb_priority, + args.vlan_qos); + err = 0; + break; + + case SET_VLAN_EGRESS_PRIORITY_CMD: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + err = vlan_dev_set_egress_priority(dev, + args.u.skb_priority, + args.vlan_qos); + break; + + case SET_VLAN_FLAG_CMD: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + err = vlan_dev_change_flags(dev, + args.vlan_qos ? args.u.flag : 0, + args.u.flag); + break; + + case SET_VLAN_NAME_TYPE_CMD: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + if ((args.u.name_type >= 0) && + (args.u.name_type < VLAN_NAME_TYPE_HIGHEST)) { + struct vlan_net *vn; + + vn = net_generic(net, vlan_net_id); + vn->name_type = args.u.name_type; + err = 0; + } else { + err = -EINVAL; + } + break; + + case ADD_VLAN_CMD: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + err = register_vlan_device(dev, args.u.VID); + break; + + case DEL_VLAN_CMD: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + unregister_vlan_dev(dev, NULL); + err = 0; + break; + + case GET_VLAN_REALDEV_NAME_CMD: + err = 0; + vlan_dev_get_realdev_name(dev, args.u.device2); + if (copy_to_user(arg, &args, + sizeof(struct vlan_ioctl_args))) + err = -EFAULT; + break; + + case GET_VLAN_VID_CMD: + err = 0; + args.u.VID = vlan_dev_vlan_id(dev); + if (copy_to_user(arg, &args, + sizeof(struct vlan_ioctl_args))) + err = -EFAULT; + break; + + default: + err = -EOPNOTSUPP; + break; + } +out: + rtnl_unlock(); + return err; +} + +static int __net_init vlan_init_net(struct net *net) +{ + struct vlan_net *vn = net_generic(net, vlan_net_id); + int err; + + vn->name_type = VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD; + + err = vlan_proc_init(net); + + return err; +} + +static void __net_exit vlan_exit_net(struct net *net) +{ + vlan_proc_cleanup(net); +} + +static struct pernet_operations vlan_net_ops = { + .init = vlan_init_net, + .exit = vlan_exit_net, + .id = &vlan_net_id, + .size = sizeof(struct vlan_net), +}; + +static int __init vlan_proto_init(void) +{ + int err; + + pr_info("%s v%s\n", vlan_fullname, vlan_version); + + err = register_pernet_subsys(&vlan_net_ops); + if (err < 0) + goto err0; + + err = register_netdevice_notifier(&vlan_notifier_block); + if (err < 0) + goto err2; + + err = vlan_gvrp_init(); + if (err < 0) + goto err3; + + err = vlan_netlink_init(); + if (err < 0) + goto err4; + + vlan_ioctl_set(vlan_ioctl_handler); + return 0; + +err4: + vlan_gvrp_uninit(); +err3: + unregister_netdevice_notifier(&vlan_notifier_block); +err2: + unregister_pernet_subsys(&vlan_net_ops); +err0: + return err; +} + +static void __exit vlan_cleanup_module(void) +{ + vlan_ioctl_set(NULL); + vlan_netlink_fini(); + + unregister_netdevice_notifier(&vlan_notifier_block); + + unregister_pernet_subsys(&vlan_net_ops); + rcu_barrier(); /* Wait for completion of call_rcu()'s */ + + vlan_gvrp_uninit(); +} + +module_init(vlan_proto_init); +module_exit(vlan_cleanup_module); + 
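[Editor's illustration] The ioctl interface implemented above is the one the userspace vconfig tool talks to. A rough, hypothetical userspace sketch of ADD_VLAN_CMD follows, assuming the exported struct vlan_ioctl_args from <linux/if_vlan.h> and the SIOCSIFVLAN request from <linux/sockios.h>; it needs CAP_NET_ADMIN, and with the default name type set in vlan_init_net() the result would be an eth0.5 device.

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>
#include <linux/if_vlan.h>
#include <linux/sockios.h>

int main(void)
{
	struct vlan_ioctl_args args;
	int fd = socket(AF_INET, SOCK_STREAM, 0);	/* any socket works as an ioctl handle */

	if (fd < 0)
		return 1;

	memset(&args, 0, sizeof(args));
	args.cmd = ADD_VLAN_CMD;			/* dispatched by vlan_ioctl_handler() above */
	strncpy(args.device1, "eth0", sizeof(args.device1) - 1);
	args.u.VID = 5;					/* VLAN id, must be below VLAN_VID_MASK */

	if (ioctl(fd, SIOCSIFVLAN, &args) < 0)
		perror("SIOCSIFVLAN");

	close(fd);
	return 0;
}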
+MODULE_LICENSE("GPL"); +MODULE_VERSION(DRV_VERSION); diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h new file mode 100644 index 00000000..9da07e30 --- /dev/null +++ b/net/8021q/vlan.h @@ -0,0 +1,134 @@ +#ifndef __BEN_VLAN_802_1Q_INC__ +#define __BEN_VLAN_802_1Q_INC__ + +#include +#include + + +/** + * struct vlan_priority_tci_mapping - vlan egress priority mappings + * @priority: skb priority + * @vlan_qos: vlan priority: (skb->priority << 13) & 0xE000 + * @next: pointer to next struct + */ +struct vlan_priority_tci_mapping { + u32 priority; + u16 vlan_qos; + struct vlan_priority_tci_mapping *next; +}; + + +/** + * struct vlan_pcpu_stats - VLAN percpu rx/tx stats + * @rx_packets: number of received packets + * @rx_bytes: number of received bytes + * @rx_multicast: number of received multicast packets + * @tx_packets: number of transmitted packets + * @tx_bytes: number of transmitted bytes + * @syncp: synchronization point for 64bit counters + * @rx_errors: number of rx errors + * @tx_dropped: number of tx drops + */ +struct vlan_pcpu_stats { + u64 rx_packets; + u64 rx_bytes; + u64 rx_multicast; + u64 tx_packets; + u64 tx_bytes; + struct u64_stats_sync syncp; + u32 rx_errors; + u32 tx_dropped; +}; + +/** + * struct vlan_dev_info - VLAN private device data + * @nr_ingress_mappings: number of ingress priority mappings + * @ingress_priority_map: ingress priority mappings + * @nr_egress_mappings: number of egress priority mappings + * @egress_priority_map: hash of egress priority mappings + * @vlan_id: VLAN identifier + * @flags: device flags + * @real_dev: underlying netdevice + * @real_dev_addr: address of underlying netdevice + * @dent: proc dir entry + * @vlan_pcpu_stats: ptr to percpu rx stats + */ +struct vlan_dev_info { + unsigned int nr_ingress_mappings; + u32 ingress_priority_map[8]; + unsigned int nr_egress_mappings; + struct vlan_priority_tci_mapping *egress_priority_map[16]; + + u16 vlan_id; + u16 flags; + + struct net_device *real_dev; + unsigned char real_dev_addr[ETH_ALEN]; + + struct proc_dir_entry *dent; + struct vlan_pcpu_stats __percpu *vlan_pcpu_stats; +}; + +static inline struct vlan_dev_info *vlan_dev_info(const struct net_device *dev) +{ + return netdev_priv(dev); +} + +/* found in vlan_dev.c */ +void vlan_dev_set_ingress_priority(const struct net_device *dev, + u32 skb_prio, u16 vlan_prio); +int vlan_dev_set_egress_priority(const struct net_device *dev, + u32 skb_prio, u16 vlan_prio); +int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask); +void vlan_dev_get_realdev_name(const struct net_device *dev, char *result); + +int vlan_check_real_dev(struct net_device *real_dev, u16 vlan_id); +void vlan_setup(struct net_device *dev); +int register_vlan_dev(struct net_device *dev); +void unregister_vlan_dev(struct net_device *dev, struct list_head *head); + +static inline u32 vlan_get_ingress_priority(struct net_device *dev, + u16 vlan_tci) +{ + struct vlan_dev_info *vip = vlan_dev_info(dev); + + return vip->ingress_priority_map[(vlan_tci >> VLAN_PRIO_SHIFT) & 0x7]; +} + +#ifdef CONFIG_VLAN_8021Q_GVRP +extern int vlan_gvrp_request_join(const struct net_device *dev); +extern void vlan_gvrp_request_leave(const struct net_device *dev); +extern int vlan_gvrp_init_applicant(struct net_device *dev); +extern void vlan_gvrp_uninit_applicant(struct net_device *dev); +extern int vlan_gvrp_init(void); +extern void vlan_gvrp_uninit(void); +#else +static inline int vlan_gvrp_request_join(const struct net_device *dev) { return 0; } +static inline void 
vlan_gvrp_request_leave(const struct net_device *dev) {} +static inline int vlan_gvrp_init_applicant(struct net_device *dev) { return 0; } +static inline void vlan_gvrp_uninit_applicant(struct net_device *dev) {} +static inline int vlan_gvrp_init(void) { return 0; } +static inline void vlan_gvrp_uninit(void) {} +#endif + +extern const char vlan_fullname[]; +extern const char vlan_version[]; +extern int vlan_netlink_init(void); +extern void vlan_netlink_fini(void); + +extern struct rtnl_link_ops vlan_link_ops; + +extern int vlan_net_id; + +struct proc_dir_entry; + +struct vlan_net { + /* /proc/net/vlan */ + struct proc_dir_entry *proc_vlan_dir; + /* /proc/net/vlan/config */ + struct proc_dir_entry *proc_vlan_conf; + /* Determines interface naming scheme. */ + unsigned short name_type; +}; + +#endif /* !(__BEN_VLAN_802_1Q_INC__) */ diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c new file mode 100644 index 00000000..27263fb1 --- /dev/null +++ b/net/8021q/vlan_core.c @@ -0,0 +1,181 @@ +#include +#include +#include +#include +#include "vlan.h" + +bool vlan_do_receive(struct sk_buff **skbp) +{ + struct sk_buff *skb = *skbp; + u16 vlan_id = skb->vlan_tci & VLAN_VID_MASK; + struct net_device *vlan_dev; + struct vlan_pcpu_stats *rx_stats; + + vlan_dev = vlan_find_dev(skb->dev, vlan_id); + if (!vlan_dev) { + if (vlan_id) + skb->pkt_type = PACKET_OTHERHOST; + return false; + } + + skb = *skbp = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(!skb)) + return false; + + skb->dev = vlan_dev; + if (skb->pkt_type == PACKET_OTHERHOST) { + /* Our lower layer thinks this is not local, let's make sure. + * This allows the VLAN to have a different MAC than the + * underlying device, and still route correctly. */ + if (!compare_ether_addr(eth_hdr(skb)->h_dest, + vlan_dev->dev_addr)) + skb->pkt_type = PACKET_HOST; + } + + if (!(vlan_dev_info(vlan_dev)->flags & VLAN_FLAG_REORDER_HDR)) { + unsigned int offset = skb->data - skb_mac_header(skb); + + /* + * vlan_insert_tag expect skb->data pointing to mac header. + * So change skb->data before calling it and change back to + * original position later + */ + skb_push(skb, offset); + skb = *skbp = vlan_insert_tag(skb, skb->vlan_tci); + if (!skb) + return false; + skb_pull(skb, offset + VLAN_HLEN); + skb_reset_mac_len(skb); + } + + skb->priority = vlan_get_ingress_priority(vlan_dev, skb->vlan_tci); + skb->vlan_tci = 0; + + rx_stats = this_cpu_ptr(vlan_dev_info(vlan_dev)->vlan_pcpu_stats); + + u64_stats_update_begin(&rx_stats->syncp); + rx_stats->rx_packets++; + rx_stats->rx_bytes += skb->len; + if (skb->pkt_type == PACKET_MULTICAST) + rx_stats->rx_multicast++; + u64_stats_update_end(&rx_stats->syncp); + + return true; +} + +struct net_device *vlan_dev_real_dev(const struct net_device *dev) +{ + return vlan_dev_info(dev)->real_dev; +} +EXPORT_SYMBOL(vlan_dev_real_dev); + +u16 vlan_dev_vlan_id(const struct net_device *dev) +{ + return vlan_dev_info(dev)->vlan_id; +} +EXPORT_SYMBOL(vlan_dev_vlan_id); + +/* VLAN rx hw acceleration helper. This acts like netif_{rx,receive_skb}(). */ +int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp, + u16 vlan_tci, int polling) +{ + __vlan_hwaccel_put_tag(skb, vlan_tci); + return polling ? 
netif_receive_skb(skb) : netif_rx(skb); +} +EXPORT_SYMBOL(__vlan_hwaccel_rx); + +gro_result_t vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp, + unsigned int vlan_tci, struct sk_buff *skb) +{ + __vlan_hwaccel_put_tag(skb, vlan_tci); + return napi_gro_receive(napi, skb); +} +EXPORT_SYMBOL(vlan_gro_receive); + +gro_result_t vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp, + unsigned int vlan_tci) +{ + __vlan_hwaccel_put_tag(napi->skb, vlan_tci); + return napi_gro_frags(napi); +} +EXPORT_SYMBOL(vlan_gro_frags); + +static struct sk_buff *vlan_reorder_header(struct sk_buff *skb) +{ + if (skb_cow(skb, skb_headroom(skb)) < 0) + return NULL; + memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN); + skb->mac_header += VLAN_HLEN; + skb_reset_mac_len(skb); + return skb; +} + +static void vlan_set_encap_proto(struct sk_buff *skb, struct vlan_hdr *vhdr) +{ + __be16 proto; + unsigned char *rawp; + + /* + * Was a VLAN packet, grab the encapsulated protocol, which the layer + * three protocols care about. + */ + + proto = vhdr->h_vlan_encapsulated_proto; + if (ntohs(proto) >= 1536) { + skb->protocol = proto; + return; + } + + rawp = skb->data; + if (*(unsigned short *) rawp == 0xFFFF) + /* + * This is a magic hack to spot IPX packets. Older Novell + * breaks the protocol design and runs IPX over 802.3 without + * an 802.2 LLC layer. We look for FFFF which isn't a used + * 802.2 SSAP/DSAP. This won't work for fault tolerant netware + * but does for the rest. + */ + skb->protocol = htons(ETH_P_802_3); + else + /* + * Real 802.2 LLC + */ + skb->protocol = htons(ETH_P_802_2); +} + +struct sk_buff *vlan_untag(struct sk_buff *skb) +{ + struct vlan_hdr *vhdr; + u16 vlan_tci; + + if (unlikely(vlan_tx_tag_present(skb))) { + /* vlan_tci is already set-up so leave this for another time */ + return skb; + } + + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(!skb)) + goto err_free; + + if (unlikely(!pskb_may_pull(skb, VLAN_HLEN))) + goto err_free; + + vhdr = (struct vlan_hdr *) skb->data; + vlan_tci = ntohs(vhdr->h_vlan_TCI); + __vlan_hwaccel_put_tag(skb, vlan_tci); + + skb_pull_rcsum(skb, VLAN_HLEN); + vlan_set_encap_proto(skb, vhdr); + + skb = vlan_reorder_header(skb); + if (unlikely(!skb)) + goto err_free; + + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + return skb; + +err_free: + kfree_skb(skb); + return NULL; +} diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c new file mode 100644 index 00000000..d5484561 --- /dev/null +++ b/net/8021q/vlan_dev.c @@ -0,0 +1,705 @@ +/* -*- linux-c -*- + * INET 802.1Q VLAN + * Ethernet-type device handling. + * + * Authors: Ben Greear + * Please send support related email to: netdev@vger.kernel.org + * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html + * + * Fixes: Mar 22 2001: Martin Bokaemper + * - reset skb->pkt_type on incoming packets when MAC was changed + * - see that changed MAC is saddr for outgoing packets + * Oct 20, 2001: Ard van Breeman: + * - Fix MC-list, finally. + * - Flush MC-list on VLAN destroy. + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "vlan.h" +#include "vlanproc.h" +#include + +/* + * Rebuild the Ethernet MAC header. 
This is called after an ARP + * (or in future other address resolution) has completed on this + * sk_buff. We now let ARP fill in the other fields. + * + * This routine CANNOT use cached dst->neigh! + * Really, it is used only when dst->neigh is wrong. + * + * TODO: This needs a checkup, I'm ignorant here. --BLG + */ +static int vlan_dev_rebuild_header(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); + + switch (veth->h_vlan_encapsulated_proto) { +#ifdef CONFIG_INET + case htons(ETH_P_IP): + + /* TODO: Confirm this will work with VLAN headers... */ + return arp_find(veth->h_dest, skb); +#endif + default: + pr_debug("%s: unable to resolve type %X addresses.\n", + dev->name, ntohs(veth->h_vlan_encapsulated_proto)); + + memcpy(veth->h_source, dev->dev_addr, ETH_ALEN); + break; + } + + return 0; +} + +static inline u16 +vlan_dev_get_egress_qos_mask(struct net_device *dev, struct sk_buff *skb) +{ + struct vlan_priority_tci_mapping *mp; + + mp = vlan_dev_info(dev)->egress_priority_map[(skb->priority & 0xF)]; + while (mp) { + if (mp->priority == skb->priority) { + return mp->vlan_qos; /* This should already be shifted + * to mask correctly with the + * VLAN's TCI */ + } + mp = mp->next; + } + return 0; +} + +/* + * Create the VLAN header for an arbitrary protocol layer + * + * saddr=NULL means use device source address + * daddr=NULL means leave destination address (eg unresolved arp) + * + * This is called when the SKB is moving down the stack towards the + * physical devices. + */ +static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, + const void *daddr, const void *saddr, + unsigned int len) +{ + struct vlan_hdr *vhdr; + unsigned int vhdrlen = 0; + u16 vlan_tci = 0; + int rc; + + if (!(vlan_dev_info(dev)->flags & VLAN_FLAG_REORDER_HDR)) { + vhdr = (struct vlan_hdr *) skb_push(skb, VLAN_HLEN); + + vlan_tci = vlan_dev_info(dev)->vlan_id; + vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb); + vhdr->h_vlan_TCI = htons(vlan_tci); + + /* + * Set the protocol type. For a packet of type ETH_P_802_3/2 we + * put the length in here instead. + */ + if (type != ETH_P_802_3 && type != ETH_P_802_2) + vhdr->h_vlan_encapsulated_proto = htons(type); + else + vhdr->h_vlan_encapsulated_proto = htons(len); + + skb->protocol = htons(ETH_P_8021Q); + type = ETH_P_8021Q; + vhdrlen = VLAN_HLEN; + } + + /* Before delegating work to the lower layer, enter our MAC-address */ + if (saddr == NULL) + saddr = dev->dev_addr; + + /* Now make the underlying real hard header */ + dev = vlan_dev_info(dev)->real_dev; + rc = dev_hard_header(skb, dev, type, daddr, saddr, len + vhdrlen); + if (rc > 0) + rc += vhdrlen; + return rc; +} + +static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); + unsigned int len; + int ret; + + /* Handle non-VLAN frames if they are sent to us, for example by DHCP. + * + * NOTE: THIS ASSUMES DIX ETHERNET, SPECIFICALLY NOT SUPPORTING + * OTHER THINGS LIKE FDDI/TokenRing/802.3 SNAPs... 
+ */ + if (veth->h_vlan_proto != htons(ETH_P_8021Q) || + vlan_dev_info(dev)->flags & VLAN_FLAG_REORDER_HDR) { + u16 vlan_tci; + vlan_tci = vlan_dev_info(dev)->vlan_id; + vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb); + skb = __vlan_hwaccel_put_tag(skb, vlan_tci); + } + + skb->dev = vlan_dev_info(dev)->real_dev; + len = skb->len; + ret = dev_queue_xmit(skb); + + if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { + struct vlan_pcpu_stats *stats; + + stats = this_cpu_ptr(vlan_dev_info(dev)->vlan_pcpu_stats); + u64_stats_update_begin(&stats->syncp); + stats->tx_packets++; + stats->tx_bytes += len; + u64_stats_update_end(&stats->syncp); + } else { + this_cpu_inc(vlan_dev_info(dev)->vlan_pcpu_stats->tx_dropped); + } + + return ret; +} + +static int vlan_dev_change_mtu(struct net_device *dev, int new_mtu) +{ + /* TODO: gotta make sure the underlying layer can handle it, + * maybe an IFF_VLAN_CAPABLE flag for devices? + */ + if (vlan_dev_info(dev)->real_dev->mtu < new_mtu) + return -ERANGE; + + dev->mtu = new_mtu; + + return 0; +} + +void vlan_dev_set_ingress_priority(const struct net_device *dev, + u32 skb_prio, u16 vlan_prio) +{ + struct vlan_dev_info *vlan = vlan_dev_info(dev); + + if (vlan->ingress_priority_map[vlan_prio & 0x7] && !skb_prio) + vlan->nr_ingress_mappings--; + else if (!vlan->ingress_priority_map[vlan_prio & 0x7] && skb_prio) + vlan->nr_ingress_mappings++; + + vlan->ingress_priority_map[vlan_prio & 0x7] = skb_prio; +} + +int vlan_dev_set_egress_priority(const struct net_device *dev, + u32 skb_prio, u16 vlan_prio) +{ + struct vlan_dev_info *vlan = vlan_dev_info(dev); + struct vlan_priority_tci_mapping *mp = NULL; + struct vlan_priority_tci_mapping *np; + u32 vlan_qos = (vlan_prio << VLAN_PRIO_SHIFT) & VLAN_PRIO_MASK; + + /* See if a priority mapping exists.. */ + mp = vlan->egress_priority_map[skb_prio & 0xF]; + while (mp) { + if (mp->priority == skb_prio) { + if (mp->vlan_qos && !vlan_qos) + vlan->nr_egress_mappings--; + else if (!mp->vlan_qos && vlan_qos) + vlan->nr_egress_mappings++; + mp->vlan_qos = vlan_qos; + return 0; + } + mp = mp->next; + } + + /* Create a new mapping then. */ + mp = vlan->egress_priority_map[skb_prio & 0xF]; + np = kmalloc(sizeof(struct vlan_priority_tci_mapping), GFP_KERNEL); + if (!np) + return -ENOBUFS; + + np->next = mp; + np->priority = skb_prio; + np->vlan_qos = vlan_qos; + vlan->egress_priority_map[skb_prio & 0xF] = np; + if (vlan_qos) + vlan->nr_egress_mappings++; + return 0; +} + +/* Flags are defined in the vlan_flags enum in include/linux/if_vlan.h file. 
*/ +int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask) +{ + struct vlan_dev_info *vlan = vlan_dev_info(dev); + u32 old_flags = vlan->flags; + + if (mask & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP | + VLAN_FLAG_LOOSE_BINDING)) + return -EINVAL; + + vlan->flags = (old_flags & ~mask) | (flags & mask); + + if (netif_running(dev) && (vlan->flags ^ old_flags) & VLAN_FLAG_GVRP) { + if (vlan->flags & VLAN_FLAG_GVRP) + vlan_gvrp_request_join(dev); + else + vlan_gvrp_request_leave(dev); + } + return 0; +} + +void vlan_dev_get_realdev_name(const struct net_device *dev, char *result) +{ + strncpy(result, vlan_dev_info(dev)->real_dev->name, 23); +} + +static int vlan_dev_open(struct net_device *dev) +{ + struct vlan_dev_info *vlan = vlan_dev_info(dev); + struct net_device *real_dev = vlan->real_dev; + int err; + + if (!(real_dev->flags & IFF_UP) && + !(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) + return -ENETDOWN; + + if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr)) { + err = dev_uc_add(real_dev, dev->dev_addr); + if (err < 0) + goto out; + } + + if (dev->flags & IFF_ALLMULTI) { + err = dev_set_allmulti(real_dev, 1); + if (err < 0) + goto del_unicast; + } + if (dev->flags & IFF_PROMISC) { + err = dev_set_promiscuity(real_dev, 1); + if (err < 0) + goto clear_allmulti; + } + + memcpy(vlan->real_dev_addr, real_dev->dev_addr, ETH_ALEN); + + if (vlan->flags & VLAN_FLAG_GVRP) + vlan_gvrp_request_join(dev); + + if (netif_carrier_ok(real_dev)) + netif_carrier_on(dev); + return 0; + +clear_allmulti: + if (dev->flags & IFF_ALLMULTI) + dev_set_allmulti(real_dev, -1); +del_unicast: + if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr)) + dev_uc_del(real_dev, dev->dev_addr); +out: + netif_carrier_off(dev); + return err; +} + +static int vlan_dev_stop(struct net_device *dev) +{ + struct vlan_dev_info *vlan = vlan_dev_info(dev); + struct net_device *real_dev = vlan->real_dev; + + dev_mc_unsync(real_dev, dev); + dev_uc_unsync(real_dev, dev); + if (dev->flags & IFF_ALLMULTI) + dev_set_allmulti(real_dev, -1); + if (dev->flags & IFF_PROMISC) + dev_set_promiscuity(real_dev, -1); + + if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr)) + dev_uc_del(real_dev, dev->dev_addr); + + netif_carrier_off(dev); + return 0; +} + +static int vlan_dev_set_mac_address(struct net_device *dev, void *p) +{ + struct net_device *real_dev = vlan_dev_info(dev)->real_dev; + struct sockaddr *addr = p; + int err; + + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + + if (!(dev->flags & IFF_UP)) + goto out; + + if (compare_ether_addr(addr->sa_data, real_dev->dev_addr)) { + err = dev_uc_add(real_dev, addr->sa_data); + if (err < 0) + return err; + } + + if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr)) + dev_uc_del(real_dev, dev->dev_addr); + +out: + memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN); + return 0; +} + +static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + struct net_device *real_dev = vlan_dev_info(dev)->real_dev; + const struct net_device_ops *ops = real_dev->netdev_ops; + struct ifreq ifrr; + int err = -EOPNOTSUPP; + + strncpy(ifrr.ifr_name, real_dev->name, IFNAMSIZ); + ifrr.ifr_ifru = ifr->ifr_ifru; + + switch (cmd) { + case SIOCGMIIPHY: + case SIOCGMIIREG: + case SIOCSMIIREG: + if (netif_device_present(real_dev) && ops->ndo_do_ioctl) + err = ops->ndo_do_ioctl(real_dev, &ifrr, cmd); + break; + } + + if (!err) + ifr->ifr_ifru = ifrr.ifr_ifru; + + return err; +} + +static int vlan_dev_neigh_setup(struct net_device *dev, struct 
neigh_parms *pa) +{ + struct net_device *real_dev = vlan_dev_info(dev)->real_dev; + const struct net_device_ops *ops = real_dev->netdev_ops; + int err = 0; + + if (netif_device_present(real_dev) && ops->ndo_neigh_setup) + err = ops->ndo_neigh_setup(real_dev, pa); + + return err; +} + +#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) +static int vlan_dev_fcoe_ddp_setup(struct net_device *dev, u16 xid, + struct scatterlist *sgl, unsigned int sgc) +{ + struct net_device *real_dev = vlan_dev_info(dev)->real_dev; + const struct net_device_ops *ops = real_dev->netdev_ops; + int rc = 0; + + if (ops->ndo_fcoe_ddp_setup) + rc = ops->ndo_fcoe_ddp_setup(real_dev, xid, sgl, sgc); + + return rc; +} + +static int vlan_dev_fcoe_ddp_done(struct net_device *dev, u16 xid) +{ + struct net_device *real_dev = vlan_dev_info(dev)->real_dev; + const struct net_device_ops *ops = real_dev->netdev_ops; + int len = 0; + + if (ops->ndo_fcoe_ddp_done) + len = ops->ndo_fcoe_ddp_done(real_dev, xid); + + return len; +} + +static int vlan_dev_fcoe_enable(struct net_device *dev) +{ + struct net_device *real_dev = vlan_dev_info(dev)->real_dev; + const struct net_device_ops *ops = real_dev->netdev_ops; + int rc = -EINVAL; + + if (ops->ndo_fcoe_enable) + rc = ops->ndo_fcoe_enable(real_dev); + return rc; +} + +static int vlan_dev_fcoe_disable(struct net_device *dev) +{ + struct net_device *real_dev = vlan_dev_info(dev)->real_dev; + const struct net_device_ops *ops = real_dev->netdev_ops; + int rc = -EINVAL; + + if (ops->ndo_fcoe_disable) + rc = ops->ndo_fcoe_disable(real_dev); + return rc; +} + +static int vlan_dev_fcoe_get_wwn(struct net_device *dev, u64 *wwn, int type) +{ + struct net_device *real_dev = vlan_dev_info(dev)->real_dev; + const struct net_device_ops *ops = real_dev->netdev_ops; + int rc = -EINVAL; + + if (ops->ndo_fcoe_get_wwn) + rc = ops->ndo_fcoe_get_wwn(real_dev, wwn, type); + return rc; +} + +static int vlan_dev_fcoe_ddp_target(struct net_device *dev, u16 xid, + struct scatterlist *sgl, unsigned int sgc) +{ + struct net_device *real_dev = vlan_dev_info(dev)->real_dev; + const struct net_device_ops *ops = real_dev->netdev_ops; + int rc = 0; + + if (ops->ndo_fcoe_ddp_target) + rc = ops->ndo_fcoe_ddp_target(real_dev, xid, sgl, sgc); + + return rc; +} +#endif + +static void vlan_dev_change_rx_flags(struct net_device *dev, int change) +{ + struct net_device *real_dev = vlan_dev_info(dev)->real_dev; + + if (change & IFF_ALLMULTI) + dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1); + if (change & IFF_PROMISC) + dev_set_promiscuity(real_dev, dev->flags & IFF_PROMISC ? 1 : -1); +} + +static void vlan_dev_set_rx_mode(struct net_device *vlan_dev) +{ + dev_mc_sync(vlan_dev_info(vlan_dev)->real_dev, vlan_dev); + dev_uc_sync(vlan_dev_info(vlan_dev)->real_dev, vlan_dev); +} + +/* + * vlan network devices have devices nesting below it, and are a special + * "super class" of normal network devices; split their locks off into a + * separate class since they always nest. 
+ */ +static struct lock_class_key vlan_netdev_xmit_lock_key; +static struct lock_class_key vlan_netdev_addr_lock_key; + +static void vlan_dev_set_lockdep_one(struct net_device *dev, + struct netdev_queue *txq, + void *_subclass) +{ + lockdep_set_class_and_subclass(&txq->_xmit_lock, + &vlan_netdev_xmit_lock_key, + *(int *)_subclass); +} + +static void vlan_dev_set_lockdep_class(struct net_device *dev, int subclass) +{ + lockdep_set_class_and_subclass(&dev->addr_list_lock, + &vlan_netdev_addr_lock_key, + subclass); + netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, &subclass); +} + +static const struct header_ops vlan_header_ops = { + .create = vlan_dev_hard_header, + .rebuild = vlan_dev_rebuild_header, + .parse = eth_header_parse, +}; + +static const struct net_device_ops vlan_netdev_ops; + +static int vlan_dev_init(struct net_device *dev) +{ + struct net_device *real_dev = vlan_dev_info(dev)->real_dev; + int subclass = 0; + + netif_carrier_off(dev); + + /* IFF_BROADCAST|IFF_MULTICAST; ??? */ + dev->flags = real_dev->flags & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | + IFF_MASTER | IFF_SLAVE); + dev->iflink = real_dev->ifindex; + dev->state = (real_dev->state & ((1<<__LINK_STATE_NOCARRIER) | + (1<<__LINK_STATE_DORMANT))) | + (1<<__LINK_STATE_PRESENT); + + dev->hw_features = NETIF_F_ALL_CSUM | NETIF_F_SG | + NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | + NETIF_F_HIGHDMA | NETIF_F_SCTP_CSUM | + NETIF_F_ALL_FCOE; + + dev->features |= real_dev->vlan_features | NETIF_F_LLTX; + dev->gso_max_size = real_dev->gso_max_size; + + /* ipv6 shared card related stuff */ + dev->dev_id = real_dev->dev_id; + + if (is_zero_ether_addr(dev->dev_addr)) + memcpy(dev->dev_addr, real_dev->dev_addr, dev->addr_len); + if (is_zero_ether_addr(dev->broadcast)) + memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len); + +#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) + dev->fcoe_ddp_xid = real_dev->fcoe_ddp_xid; +#endif + + dev->needed_headroom = real_dev->needed_headroom; + if (real_dev->features & NETIF_F_HW_VLAN_TX) { + dev->header_ops = real_dev->header_ops; + dev->hard_header_len = real_dev->hard_header_len; + } else { + dev->header_ops = &vlan_header_ops; + dev->hard_header_len = real_dev->hard_header_len + VLAN_HLEN; + } + + dev->netdev_ops = &vlan_netdev_ops; + + if (is_vlan_dev(real_dev)) + subclass = 1; + + vlan_dev_set_lockdep_class(dev, subclass); + + vlan_dev_info(dev)->vlan_pcpu_stats = alloc_percpu(struct vlan_pcpu_stats); + if (!vlan_dev_info(dev)->vlan_pcpu_stats) + return -ENOMEM; + + return 0; +} + +static void vlan_dev_uninit(struct net_device *dev) +{ + struct vlan_priority_tci_mapping *pm; + struct vlan_dev_info *vlan = vlan_dev_info(dev); + int i; + + free_percpu(vlan->vlan_pcpu_stats); + vlan->vlan_pcpu_stats = NULL; + for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) { + while ((pm = vlan->egress_priority_map[i]) != NULL) { + vlan->egress_priority_map[i] = pm->next; + kfree(pm); + } + } +} + +static u32 vlan_dev_fix_features(struct net_device *dev, u32 features) +{ + struct net_device *real_dev = vlan_dev_info(dev)->real_dev; + u32 old_features = features; + + features &= real_dev->features; + features &= real_dev->vlan_features; + + if (old_features & NETIF_F_SOFT_FEATURES) + features |= old_features & NETIF_F_SOFT_FEATURES; + + if (dev_ethtool_get_rx_csum(real_dev)) + features |= NETIF_F_RXCSUM; + features |= NETIF_F_LLTX; + + return features; +} + +static int vlan_ethtool_get_settings(struct net_device *dev, + struct ethtool_cmd *cmd) +{ + const struct vlan_dev_info *vlan = 
vlan_dev_info(dev); + return dev_ethtool_get_settings(vlan->real_dev, cmd); +} + +static void vlan_ethtool_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) +{ + strcpy(info->driver, vlan_fullname); + strcpy(info->version, vlan_version); + strcpy(info->fw_version, "N/A"); +} + +static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +{ + + if (vlan_dev_info(dev)->vlan_pcpu_stats) { + struct vlan_pcpu_stats *p; + u32 rx_errors = 0, tx_dropped = 0; + int i; + + for_each_possible_cpu(i) { + u64 rxpackets, rxbytes, rxmulticast, txpackets, txbytes; + unsigned int start; + + p = per_cpu_ptr(vlan_dev_info(dev)->vlan_pcpu_stats, i); + do { + start = u64_stats_fetch_begin_bh(&p->syncp); + rxpackets = p->rx_packets; + rxbytes = p->rx_bytes; + rxmulticast = p->rx_multicast; + txpackets = p->tx_packets; + txbytes = p->tx_bytes; + } while (u64_stats_fetch_retry_bh(&p->syncp, start)); + + stats->rx_packets += rxpackets; + stats->rx_bytes += rxbytes; + stats->multicast += rxmulticast; + stats->tx_packets += txpackets; + stats->tx_bytes += txbytes; + /* rx_errors & tx_dropped are u32 */ + rx_errors += p->rx_errors; + tx_dropped += p->tx_dropped; + } + stats->rx_errors = rx_errors; + stats->tx_dropped = tx_dropped; + } + return stats; +} + +static const struct ethtool_ops vlan_ethtool_ops = { + .get_settings = vlan_ethtool_get_settings, + .get_drvinfo = vlan_ethtool_get_drvinfo, + .get_link = ethtool_op_get_link, +}; + +static const struct net_device_ops vlan_netdev_ops = { + .ndo_change_mtu = vlan_dev_change_mtu, + .ndo_init = vlan_dev_init, + .ndo_uninit = vlan_dev_uninit, + .ndo_open = vlan_dev_open, + .ndo_stop = vlan_dev_stop, + .ndo_start_xmit = vlan_dev_hard_start_xmit, + .ndo_validate_addr = eth_validate_addr, + .ndo_set_mac_address = vlan_dev_set_mac_address, + .ndo_set_rx_mode = vlan_dev_set_rx_mode, + .ndo_set_multicast_list = vlan_dev_set_rx_mode, + .ndo_change_rx_flags = vlan_dev_change_rx_flags, + .ndo_do_ioctl = vlan_dev_ioctl, + .ndo_neigh_setup = vlan_dev_neigh_setup, + .ndo_get_stats64 = vlan_dev_get_stats64, +#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) + .ndo_fcoe_ddp_setup = vlan_dev_fcoe_ddp_setup, + .ndo_fcoe_ddp_done = vlan_dev_fcoe_ddp_done, + .ndo_fcoe_enable = vlan_dev_fcoe_enable, + .ndo_fcoe_disable = vlan_dev_fcoe_disable, + .ndo_fcoe_get_wwn = vlan_dev_fcoe_get_wwn, + .ndo_fcoe_ddp_target = vlan_dev_fcoe_ddp_target, +#endif + .ndo_fix_features = vlan_dev_fix_features, +}; + +void vlan_setup(struct net_device *dev) +{ + ether_setup(dev); + + dev->priv_flags |= IFF_802_1Q_VLAN; + dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); + dev->tx_queue_len = 0; + + dev->netdev_ops = &vlan_netdev_ops; + dev->destructor = free_netdev; + dev->ethtool_ops = &vlan_ethtool_ops; + + memset(dev->broadcast, 0, ETH_ALEN); +} diff --git a/net/8021q/vlan_gvrp.c b/net/8021q/vlan_gvrp.c new file mode 100644 index 00000000..061cecee --- /dev/null +++ b/net/8021q/vlan_gvrp.c @@ -0,0 +1,66 @@ +/* + * IEEE 802.1Q GARP VLAN Registration Protocol (GVRP) + * + * Copyright (c) 2008 Patrick McHardy + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. 
+ */ +#include +#include +#include +#include "vlan.h" + +#define GARP_GVRP_ADDRESS { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x21 } + +enum gvrp_attributes { + GVRP_ATTR_INVALID, + GVRP_ATTR_VID, + __GVRP_ATTR_MAX +}; +#define GVRP_ATTR_MAX (__GVRP_ATTR_MAX - 1) + +static struct garp_application vlan_gvrp_app __read_mostly = { + .proto.group_address = GARP_GVRP_ADDRESS, + .maxattr = GVRP_ATTR_MAX, + .type = GARP_APPLICATION_GVRP, +}; + +int vlan_gvrp_request_join(const struct net_device *dev) +{ + const struct vlan_dev_info *vlan = vlan_dev_info(dev); + __be16 vlan_id = htons(vlan->vlan_id); + + return garp_request_join(vlan->real_dev, &vlan_gvrp_app, + &vlan_id, sizeof(vlan_id), GVRP_ATTR_VID); +} + +void vlan_gvrp_request_leave(const struct net_device *dev) +{ + const struct vlan_dev_info *vlan = vlan_dev_info(dev); + __be16 vlan_id = htons(vlan->vlan_id); + + garp_request_leave(vlan->real_dev, &vlan_gvrp_app, + &vlan_id, sizeof(vlan_id), GVRP_ATTR_VID); +} + +int vlan_gvrp_init_applicant(struct net_device *dev) +{ + return garp_init_applicant(dev, &vlan_gvrp_app); +} + +void vlan_gvrp_uninit_applicant(struct net_device *dev) +{ + garp_uninit_applicant(dev, &vlan_gvrp_app); +} + +int __init vlan_gvrp_init(void) +{ + return garp_register_application(&vlan_gvrp_app); +} + +void vlan_gvrp_uninit(void) +{ + garp_unregister_application(&vlan_gvrp_app); +} diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c new file mode 100644 index 00000000..be9a5c19 --- /dev/null +++ b/net/8021q/vlan_netlink.c @@ -0,0 +1,240 @@ +/* + * VLAN netlink control interface + * + * Copyright (c) 2007 Patrick McHardy + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "vlan.h" + + +static const struct nla_policy vlan_policy[IFLA_VLAN_MAX + 1] = { + [IFLA_VLAN_ID] = { .type = NLA_U16 }, + [IFLA_VLAN_FLAGS] = { .len = sizeof(struct ifla_vlan_flags) }, + [IFLA_VLAN_EGRESS_QOS] = { .type = NLA_NESTED }, + [IFLA_VLAN_INGRESS_QOS] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy vlan_map_policy[IFLA_VLAN_QOS_MAX + 1] = { + [IFLA_VLAN_QOS_MAPPING] = { .len = sizeof(struct ifla_vlan_qos_mapping) }, +}; + + +static inline int vlan_validate_qos_map(struct nlattr *attr) +{ + if (!attr) + return 0; + return nla_validate_nested(attr, IFLA_VLAN_QOS_MAX, vlan_map_policy); +} + +static int vlan_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + struct ifla_vlan_flags *flags; + u16 id; + int err; + + if (tb[IFLA_ADDRESS]) { + if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) + return -EINVAL; + if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) + return -EADDRNOTAVAIL; + } + + if (!data) + return -EINVAL; + + if (data[IFLA_VLAN_ID]) { + id = nla_get_u16(data[IFLA_VLAN_ID]); + if (id >= VLAN_VID_MASK) + return -ERANGE; + } + if (data[IFLA_VLAN_FLAGS]) { + flags = nla_data(data[IFLA_VLAN_FLAGS]); + if ((flags->flags & flags->mask) & + ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP | + VLAN_FLAG_LOOSE_BINDING)) + return -EINVAL; + } + + err = vlan_validate_qos_map(data[IFLA_VLAN_INGRESS_QOS]); + if (err < 0) + return err; + err = vlan_validate_qos_map(data[IFLA_VLAN_EGRESS_QOS]); + if (err < 0) + return err; + return 0; +} + +static int vlan_changelink(struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct ifla_vlan_flags *flags; + struct ifla_vlan_qos_mapping *m; + struct nlattr *attr; + int rem; + + if (data[IFLA_VLAN_FLAGS]) { + flags = nla_data(data[IFLA_VLAN_FLAGS]); + vlan_dev_change_flags(dev, flags->flags, flags->mask); + } + if (data[IFLA_VLAN_INGRESS_QOS]) { + nla_for_each_nested(attr, data[IFLA_VLAN_INGRESS_QOS], rem) { + m = nla_data(attr); + vlan_dev_set_ingress_priority(dev, m->to, m->from); + } + } + if (data[IFLA_VLAN_EGRESS_QOS]) { + nla_for_each_nested(attr, data[IFLA_VLAN_EGRESS_QOS], rem) { + m = nla_data(attr); + vlan_dev_set_egress_priority(dev, m->from, m->to); + } + } + return 0; +} + +static int vlan_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct vlan_dev_info *vlan = vlan_dev_info(dev); + struct net_device *real_dev; + int err; + + if (!data[IFLA_VLAN_ID]) + return -EINVAL; + + if (!tb[IFLA_LINK]) + return -EINVAL; + real_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); + if (!real_dev) + return -ENODEV; + + vlan->vlan_id = nla_get_u16(data[IFLA_VLAN_ID]); + vlan->real_dev = real_dev; + vlan->flags = VLAN_FLAG_REORDER_HDR; + + err = vlan_check_real_dev(real_dev, vlan->vlan_id); + if (err < 0) + return err; + + if (!tb[IFLA_MTU]) + dev->mtu = real_dev->mtu; + else if (dev->mtu > real_dev->mtu) + return -EINVAL; + + err = vlan_changelink(dev, tb, data); + if (err < 0) + return err; + + return register_vlan_dev(dev); +} + +static inline size_t vlan_qos_map_size(unsigned int n) +{ + if (n == 0) + return 0; + /* IFLA_VLAN_{EGRESS,INGRESS}_QOS + n * IFLA_VLAN_QOS_MAPPING */ + return nla_total_size(sizeof(struct nlattr)) + + nla_total_size(sizeof(struct ifla_vlan_qos_mapping)) * n; +} + +static size_t vlan_get_size(const struct net_device *dev) +{ + struct vlan_dev_info *vlan = vlan_dev_info(dev); + + return nla_total_size(2) + /* IFLA_VLAN_ID */ + sizeof(struct 
ifla_vlan_flags) + /* IFLA_VLAN_FLAGS */ + vlan_qos_map_size(vlan->nr_ingress_mappings) + + vlan_qos_map_size(vlan->nr_egress_mappings); +} + +static int vlan_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct vlan_dev_info *vlan = vlan_dev_info(dev); + struct vlan_priority_tci_mapping *pm; + struct ifla_vlan_flags f; + struct ifla_vlan_qos_mapping m; + struct nlattr *nest; + unsigned int i; + + NLA_PUT_U16(skb, IFLA_VLAN_ID, vlan_dev_info(dev)->vlan_id); + if (vlan->flags) { + f.flags = vlan->flags; + f.mask = ~0; + NLA_PUT(skb, IFLA_VLAN_FLAGS, sizeof(f), &f); + } + if (vlan->nr_ingress_mappings) { + nest = nla_nest_start(skb, IFLA_VLAN_INGRESS_QOS); + if (nest == NULL) + goto nla_put_failure; + + for (i = 0; i < ARRAY_SIZE(vlan->ingress_priority_map); i++) { + if (!vlan->ingress_priority_map[i]) + continue; + + m.from = i; + m.to = vlan->ingress_priority_map[i]; + NLA_PUT(skb, IFLA_VLAN_QOS_MAPPING, + sizeof(m), &m); + } + nla_nest_end(skb, nest); + } + + if (vlan->nr_egress_mappings) { + nest = nla_nest_start(skb, IFLA_VLAN_EGRESS_QOS); + if (nest == NULL) + goto nla_put_failure; + + for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) { + for (pm = vlan->egress_priority_map[i]; pm; + pm = pm->next) { + if (!pm->vlan_qos) + continue; + + m.from = pm->priority; + m.to = (pm->vlan_qos >> 13) & 0x7; + NLA_PUT(skb, IFLA_VLAN_QOS_MAPPING, + sizeof(m), &m); + } + } + nla_nest_end(skb, nest); + } + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +struct rtnl_link_ops vlan_link_ops __read_mostly = { + .kind = "vlan", + .maxtype = IFLA_VLAN_MAX, + .policy = vlan_policy, + .priv_size = sizeof(struct vlan_dev_info), + .setup = vlan_setup, + .validate = vlan_validate, + .newlink = vlan_newlink, + .changelink = vlan_changelink, + .dellink = unregister_vlan_dev, + .get_size = vlan_get_size, + .fill_info = vlan_fill_info, +}; + +int __init vlan_netlink_init(void) +{ + return rtnl_link_register(&vlan_link_ops); +} + +void __exit vlan_netlink_fini(void) +{ + rtnl_link_unregister(&vlan_link_ops); +} + +MODULE_ALIAS_RTNL_LINK("vlan"); diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c new file mode 100644 index 00000000..d940c49d --- /dev/null +++ b/net/8021q/vlanproc.c @@ -0,0 +1,327 @@ +/****************************************************************************** + * vlanproc.c VLAN Module. /proc filesystem interface. + * + * This module is completely hardware-independent and provides + * access to the router using Linux /proc filesystem. + * + * Author: Ben Greear, coppied from wanproc.c + * by: Gene Kozin + * + * Copyright: (c) 1998 Ben Greear + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * ============================================================================ + * Jan 20, 1998 Ben Greear Initial Version + *****************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "vlanproc.h" +#include "vlan.h" + +/****** Function Prototypes *************************************************/ + +/* Methods for preparing data for reading proc entries */ +static int vlan_seq_show(struct seq_file *seq, void *v); +static void *vlan_seq_start(struct seq_file *seq, loff_t *pos); +static void *vlan_seq_next(struct seq_file *seq, void *v, loff_t *pos); +static void vlan_seq_stop(struct seq_file *seq, void *); +static int vlandev_seq_show(struct seq_file *seq, void *v); + +/* + * Global Data + */ + + +/* + * Names of the proc directory entries + */ + +static const char name_root[] = "vlan"; +static const char name_conf[] = "config"; + +/* + * Structures for interfacing with the /proc filesystem. + * VLAN creates its own directory /proc/net/vlan with the following + * entries: + * config device status/configuration + * entry for each device + */ + +/* + * Generic /proc/net/vlan/ file and inode operations + */ + +static const struct seq_operations vlan_seq_ops = { + .start = vlan_seq_start, + .next = vlan_seq_next, + .stop = vlan_seq_stop, + .show = vlan_seq_show, +}; + +static int vlan_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &vlan_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations vlan_fops = { + .owner = THIS_MODULE, + .open = vlan_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +/* + * /proc/net/vlan/ file and inode operations + */ + +static int vlandev_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, vlandev_seq_show, PDE(inode)->data); +} + +static const struct file_operations vlandev_fops = { + .owner = THIS_MODULE, + .open = vlandev_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * Proc filesystem derectory entries. + */ + +/* Strings */ +static const char *const vlan_name_type_str[VLAN_NAME_TYPE_HIGHEST] = { + [VLAN_NAME_TYPE_RAW_PLUS_VID] = "VLAN_NAME_TYPE_RAW_PLUS_VID", + [VLAN_NAME_TYPE_PLUS_VID_NO_PAD] = "VLAN_NAME_TYPE_PLUS_VID_NO_PAD", + [VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD] = "VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD", + [VLAN_NAME_TYPE_PLUS_VID] = "VLAN_NAME_TYPE_PLUS_VID", +}; +/* + * Interface functions + */ + +/* + * Clean up /proc/net/vlan entries + */ + +void vlan_proc_cleanup(struct net *net) +{ + struct vlan_net *vn = net_generic(net, vlan_net_id); + + if (vn->proc_vlan_conf) + remove_proc_entry(name_conf, vn->proc_vlan_dir); + + if (vn->proc_vlan_dir) + proc_net_remove(net, name_root); + + /* Dynamically added entries should be cleaned up as their vlan_device + * is removed, so we should not have to take care of it here... 
+ */ +} + +/* + * Create /proc/net/vlan entries + */ + +int __net_init vlan_proc_init(struct net *net) +{ + struct vlan_net *vn = net_generic(net, vlan_net_id); + + vn->proc_vlan_dir = proc_net_mkdir(net, name_root, net->proc_net); + if (!vn->proc_vlan_dir) + goto err; + + vn->proc_vlan_conf = proc_create(name_conf, S_IFREG|S_IRUSR|S_IWUSR, + vn->proc_vlan_dir, &vlan_fops); + if (!vn->proc_vlan_conf) + goto err; + return 0; + +err: + pr_err("%s: can't create entry in proc filesystem!\n", __func__); + vlan_proc_cleanup(net); + return -ENOBUFS; +} + +/* + * Add directory entry for VLAN device. + */ + +int vlan_proc_add_dev(struct net_device *vlandev) +{ + struct vlan_dev_info *dev_info = vlan_dev_info(vlandev); + struct vlan_net *vn = net_generic(dev_net(vlandev), vlan_net_id); + + dev_info->dent = + proc_create_data(vlandev->name, S_IFREG|S_IRUSR|S_IWUSR, + vn->proc_vlan_dir, &vlandev_fops, vlandev); + if (!dev_info->dent) + return -ENOBUFS; + return 0; +} + +/* + * Delete directory entry for VLAN device. + */ +int vlan_proc_rem_dev(struct net_device *vlandev) +{ + struct vlan_net *vn = net_generic(dev_net(vlandev), vlan_net_id); + + /** NOTE: This will consume the memory pointed to by dent, it seems. */ + if (vlan_dev_info(vlandev)->dent) { + remove_proc_entry(vlan_dev_info(vlandev)->dent->name, + vn->proc_vlan_dir); + vlan_dev_info(vlandev)->dent = NULL; + } + return 0; +} + +/****** Proc filesystem entry points ****************************************/ + +/* + * The following few functions build the content of /proc/net/vlan/config + */ + +/* start read of /proc/net/vlan/config */ +static void *vlan_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(rcu) +{ + struct net_device *dev; + struct net *net = seq_file_net(seq); + loff_t i = 1; + + rcu_read_lock(); + if (*pos == 0) + return SEQ_START_TOKEN; + + for_each_netdev_rcu(net, dev) { + if (!is_vlan_dev(dev)) + continue; + + if (i++ == *pos) + return dev; + } + + return NULL; +} + +static void *vlan_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct net_device *dev; + struct net *net = seq_file_net(seq); + + ++*pos; + + dev = (struct net_device *)v; + if (v == SEQ_START_TOKEN) + dev = net_device_entry(&net->dev_base_head); + + for_each_netdev_continue_rcu(net, dev) { + if (!is_vlan_dev(dev)) + continue; + + return dev; + } + + return NULL; +} + +static void vlan_seq_stop(struct seq_file *seq, void *v) + __releases(rcu) +{ + rcu_read_unlock(); +} + +static int vlan_seq_show(struct seq_file *seq, void *v) +{ + struct net *net = seq_file_net(seq); + struct vlan_net *vn = net_generic(net, vlan_net_id); + + if (v == SEQ_START_TOKEN) { + const char *nmtype = NULL; + + seq_puts(seq, "VLAN Dev name | VLAN ID\n"); + + if (vn->name_type < ARRAY_SIZE(vlan_name_type_str)) + nmtype = vlan_name_type_str[vn->name_type]; + + seq_printf(seq, "Name-Type: %s\n", + nmtype ? 
nmtype : "UNKNOWN"); + } else { + const struct net_device *vlandev = v; + const struct vlan_dev_info *dev_info = vlan_dev_info(vlandev); + + seq_printf(seq, "%-15s| %d | %s\n", vlandev->name, + dev_info->vlan_id, dev_info->real_dev->name); + } + return 0; +} + +static int vlandev_seq_show(struct seq_file *seq, void *offset) +{ + struct net_device *vlandev = (struct net_device *) seq->private; + const struct vlan_dev_info *dev_info = vlan_dev_info(vlandev); + struct rtnl_link_stats64 temp; + const struct rtnl_link_stats64 *stats; + static const char fmt64[] = "%30s %12llu\n"; + int i; + + if (!is_vlan_dev(vlandev)) + return 0; + + stats = dev_get_stats(vlandev, &temp); + seq_printf(seq, + "%s VID: %d REORDER_HDR: %i dev->priv_flags: %hx\n", + vlandev->name, dev_info->vlan_id, + (int)(dev_info->flags & 1), vlandev->priv_flags); + + seq_printf(seq, fmt64, "total frames received", stats->rx_packets); + seq_printf(seq, fmt64, "total bytes received", stats->rx_bytes); + seq_printf(seq, fmt64, "Broadcast/Multicast Rcvd", stats->multicast); + seq_puts(seq, "\n"); + seq_printf(seq, fmt64, "total frames transmitted", stats->tx_packets); + seq_printf(seq, fmt64, "total bytes transmitted", stats->tx_bytes); + seq_printf(seq, "Device: %s", dev_info->real_dev->name); + /* now show all PRIORITY mappings relating to this VLAN */ + seq_printf(seq, "\nINGRESS priority mappings: " + "0:%u 1:%u 2:%u 3:%u 4:%u 5:%u 6:%u 7:%u\n", + dev_info->ingress_priority_map[0], + dev_info->ingress_priority_map[1], + dev_info->ingress_priority_map[2], + dev_info->ingress_priority_map[3], + dev_info->ingress_priority_map[4], + dev_info->ingress_priority_map[5], + dev_info->ingress_priority_map[6], + dev_info->ingress_priority_map[7]); + + seq_printf(seq, " EGRESS priority mappings: "); + for (i = 0; i < 16; i++) { + const struct vlan_priority_tci_mapping *mp + = dev_info->egress_priority_map[i]; + while (mp) { + seq_printf(seq, "%u:%hu ", + mp->priority, ((mp->vlan_qos >> 13) & 0x7)); + mp = mp->next; + } + } + seq_puts(seq, "\n"); + + return 0; +} diff --git a/net/8021q/vlanproc.h b/net/8021q/vlanproc.h new file mode 100644 index 00000000..063f60a3 --- /dev/null +++ b/net/8021q/vlanproc.h @@ -0,0 +1,20 @@ +#ifndef __BEN_VLAN_PROC_INC__ +#define __BEN_VLAN_PROC_INC__ + +#ifdef CONFIG_PROC_FS +struct net; + +int vlan_proc_init(struct net *net); +int vlan_proc_rem_dev(struct net_device *vlandev); +int vlan_proc_add_dev(struct net_device *vlandev); +void vlan_proc_cleanup(struct net *net); + +#else /* No CONFIG_PROC_FS */ + +#define vlan_proc_init(net) (0) +#define vlan_proc_cleanup(net) do {} while (0) +#define vlan_proc_add_dev(dev) ({(void)(dev), 0; }) +#define vlan_proc_rem_dev(dev) ({(void)(dev), 0; }) +#endif + +#endif /* !(__BEN_VLAN_PROC_INC__) */ diff --git a/net/9p/Kconfig b/net/9p/Kconfig new file mode 100644 index 00000000..d9ea09b1 --- /dev/null +++ b/net/9p/Kconfig @@ -0,0 +1,36 @@ +# +# 9P protocol configuration +# + +menuconfig NET_9P + depends on NET + tristate "Plan 9 Resource Sharing Support (9P2000)" + help + If you say Y here, you will get experimental support for + Plan 9 resource sharing via the 9P2000 protocol. + + See for more information. + + If unsure, say N. + +if NET_9P + +config NET_9P_VIRTIO + depends on VIRTIO + tristate "9P Virtio Transport" + help + This builds support for a transports between + guest partitions and a host partition. 
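A brief usage illustration (the mount tag "hostshare" and the mount point are example values, not taken from this patch): with NET_9P_VIRTIO enabled, a guest typically selects this transport through the v9fs mount option string, which is consumed by parse_opts() in net/9p/client.c later in this patch:

	mount -t 9p -o trans=virtio,version=9p2000.L,msize=8192 hostshare /mnt

Here "trans=virtio" picks the transport registered by trans_virtio.c, while "version" and "msize" correspond to the Opt_version and Opt_msize tokens handled by the client option parser.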
+ +config NET_9P_RDMA + depends on INET && INFINIBAND && INFINIBAND_ADDR_TRANS && EXPERIMENTAL + tristate "9P RDMA Transport (Experimental)" + help + This builds support for an RDMA transport. + +config NET_9P_DEBUG + bool "Debug information" + help + Say Y if you want the 9P subsystem to log debug information. + +endif diff --git a/net/9p/Makefile b/net/9p/Makefile new file mode 100644 index 00000000..a0874cc1 --- /dev/null +++ b/net/9p/Makefile @@ -0,0 +1,18 @@ +obj-$(CONFIG_NET_9P) := 9pnet.o +obj-$(CONFIG_NET_9P_VIRTIO) += 9pnet_virtio.o +obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o + +9pnet-objs := \ + mod.o \ + client.o \ + error.o \ + util.o \ + protocol.o \ + trans_fd.o \ + trans_common.o \ + +9pnet_virtio-objs := \ + trans_virtio.o \ + +9pnet_rdma-objs := \ + trans_rdma.o \ diff --git a/net/9p/client.c b/net/9p/client.c new file mode 100644 index 00000000..5532710f --- /dev/null +++ b/net/9p/client.c @@ -0,0 +1,1958 @@ +/* + * net/9p/clnt.c + * + * 9P Client + * + * Copyright (C) 2008 by Eric Van Hensbergen + * Copyright (C) 2007 by Latchesar Ionkov + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "protocol.h" + +/* + * Client Option Parsing (code inspired by NFS code) + * - a little lazy - parse all client options + */ + +enum { + Opt_msize, + Opt_trans, + Opt_legacy, + Opt_version, + Opt_err, +}; + +static const match_table_t tokens = { + {Opt_msize, "msize=%u"}, + {Opt_legacy, "noextend"}, + {Opt_trans, "trans=%s"}, + {Opt_version, "version=%s"}, + {Opt_err, NULL}, +}; + +inline int p9_is_proto_dotl(struct p9_client *clnt) +{ + return clnt->proto_version == p9_proto_2000L; +} +EXPORT_SYMBOL(p9_is_proto_dotl); + +inline int p9_is_proto_dotu(struct p9_client *clnt) +{ + return clnt->proto_version == p9_proto_2000u; +} +EXPORT_SYMBOL(p9_is_proto_dotu); + +/* Interpret mount option for protocol version */ +static int get_protocol_version(const substring_t *name) +{ + int version = -EINVAL; + + if (!strncmp("9p2000", name->from, name->to-name->from)) { + version = p9_proto_legacy; + P9_DPRINTK(P9_DEBUG_9P, "Protocol version: Legacy\n"); + } else if (!strncmp("9p2000.u", name->from, name->to-name->from)) { + version = p9_proto_2000u; + P9_DPRINTK(P9_DEBUG_9P, "Protocol version: 9P2000.u\n"); + } else if (!strncmp("9p2000.L", name->from, name->to-name->from)) { + version = p9_proto_2000L; + P9_DPRINTK(P9_DEBUG_9P, "Protocol version: 9P2000.L\n"); + } else { + P9_DPRINTK(P9_DEBUG_ERROR, "Unknown protocol version %s. 
", + name->from); + } + return version; +} + +/** + * parse_options - parse mount options into client structure + * @opts: options string passed from mount + * @clnt: existing v9fs client information + * + * Return 0 upon success, -ERRNO upon failure + */ + +static int parse_opts(char *opts, struct p9_client *clnt) +{ + char *options, *tmp_options; + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + int ret = 0; + + clnt->proto_version = p9_proto_2000u; + clnt->msize = 8192; + + if (!opts) + return 0; + + tmp_options = kstrdup(opts, GFP_KERNEL); + if (!tmp_options) { + P9_DPRINTK(P9_DEBUG_ERROR, + "failed to allocate copy of option string\n"); + return -ENOMEM; + } + options = tmp_options; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + token = match_token(p, tokens, args); + if (token < Opt_trans) { + int r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } + } + switch (token) { + case Opt_msize: + clnt->msize = option; + break; + case Opt_trans: + clnt->trans_mod = v9fs_get_trans_by_name(&args[0]); + if(clnt->trans_mod == NULL) { + P9_DPRINTK(P9_DEBUG_ERROR, + "Could not find request transport: %s\n", + (char *) &args[0]); + ret = -EINVAL; + goto free_and_return; + } + break; + case Opt_legacy: + clnt->proto_version = p9_proto_legacy; + break; + case Opt_version: + ret = get_protocol_version(&args[0]); + if (ret == -EINVAL) + goto free_and_return; + clnt->proto_version = ret; + break; + default: + continue; + } + } + +free_and_return: + kfree(tmp_options); + return ret; +} + +/** + * p9_tag_alloc - lookup/allocate a request by tag + * @c: client session to lookup tag within + * @tag: numeric id for transaction + * + * this is a simple array lookup, but will grow the + * request_slots as necessary to accommodate transaction + * ids which did not previously have a slot. + * + * this code relies on the client spinlock to manage locks, its + * possible we should switch to something else, but I'd rather + * stick with something low-overhead for the common case. 
+ * + */ + +static struct p9_req_t *p9_tag_alloc(struct p9_client *c, u16 tag) +{ + unsigned long flags; + int row, col; + struct p9_req_t *req; + + /* This looks up the original request by tag so we know which + * buffer to read the data into */ + tag++; + + if (tag >= c->max_tag) { + spin_lock_irqsave(&c->lock, flags); + /* check again since original check was outside of lock */ + while (tag >= c->max_tag) { + row = (tag / P9_ROW_MAXTAG); + c->reqs[row] = kcalloc(P9_ROW_MAXTAG, + sizeof(struct p9_req_t), GFP_ATOMIC); + + if (!c->reqs[row]) { + printk(KERN_ERR "Couldn't grow tag array\n"); + spin_unlock_irqrestore(&c->lock, flags); + return ERR_PTR(-ENOMEM); + } + for (col = 0; col < P9_ROW_MAXTAG; col++) { + c->reqs[row][col].status = REQ_STATUS_IDLE; + c->reqs[row][col].tc = NULL; + } + c->max_tag += P9_ROW_MAXTAG; + } + spin_unlock_irqrestore(&c->lock, flags); + } + row = tag / P9_ROW_MAXTAG; + col = tag % P9_ROW_MAXTAG; + + req = &c->reqs[row][col]; + if (!req->tc) { + req->wq = kmalloc(sizeof(wait_queue_head_t), GFP_NOFS); + if (!req->wq) { + printk(KERN_ERR "Couldn't grow tag array\n"); + return ERR_PTR(-ENOMEM); + } + init_waitqueue_head(req->wq); + if ((c->trans_mod->pref & P9_TRANS_PREF_PAYLOAD_MASK) == + P9_TRANS_PREF_PAYLOAD_SEP) { + int alloc_msize = min(c->msize, 4096); + req->tc = kmalloc(sizeof(struct p9_fcall)+alloc_msize, + GFP_NOFS); + req->tc->capacity = alloc_msize; + req->rc = kmalloc(sizeof(struct p9_fcall)+alloc_msize, + GFP_NOFS); + req->rc->capacity = alloc_msize; + } else { + req->tc = kmalloc(sizeof(struct p9_fcall)+c->msize, + GFP_NOFS); + req->tc->capacity = c->msize; + req->rc = kmalloc(sizeof(struct p9_fcall)+c->msize, + GFP_NOFS); + req->rc->capacity = c->msize; + } + if ((!req->tc) || (!req->rc)) { + printk(KERN_ERR "Couldn't grow tag array\n"); + kfree(req->tc); + kfree(req->rc); + kfree(req->wq); + req->tc = req->rc = NULL; + req->wq = NULL; + return ERR_PTR(-ENOMEM); + } + req->tc->sdata = (char *) req->tc + sizeof(struct p9_fcall); + req->rc->sdata = (char *) req->rc + sizeof(struct p9_fcall); + } + + p9pdu_reset(req->tc); + p9pdu_reset(req->rc); + + req->tc->tag = tag-1; + req->status = REQ_STATUS_ALLOC; + + return &c->reqs[row][col]; +} + +/** + * p9_tag_lookup - lookup a request by tag + * @c: client session to lookup tag within + * @tag: numeric id for transaction + * + */ + +struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag) +{ + int row, col; + + /* This looks up the original request by tag so we know which + * buffer to read the data into */ + tag++; + + if(tag >= c->max_tag) + return NULL; + + row = tag / P9_ROW_MAXTAG; + col = tag % P9_ROW_MAXTAG; + + return &c->reqs[row][col]; +} +EXPORT_SYMBOL(p9_tag_lookup); + +/** + * p9_tag_init - setup tags structure and contents + * @c: v9fs client struct + * + * This initializes the tags structure for each client instance. 
+ * + */ + +static int p9_tag_init(struct p9_client *c) +{ + int err = 0; + + c->tagpool = p9_idpool_create(); + if (IS_ERR(c->tagpool)) { + err = PTR_ERR(c->tagpool); + goto error; + } + err = p9_idpool_get(c->tagpool); /* reserve tag 0 */ + if (err < 0) { + p9_idpool_destroy(c->tagpool); + goto error; + } + c->max_tag = 0; +error: + return err; +} + +/** + * p9_tag_cleanup - cleans up tags structure and reclaims resources + * @c: v9fs client struct + * + * This frees resources associated with the tags structure + * + */ +static void p9_tag_cleanup(struct p9_client *c) +{ + int row, col; + + /* check to insure all requests are idle */ + for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) { + for (col = 0; col < P9_ROW_MAXTAG; col++) { + if (c->reqs[row][col].status != REQ_STATUS_IDLE) { + P9_DPRINTK(P9_DEBUG_MUX, + "Attempting to cleanup non-free tag %d,%d\n", + row, col); + /* TODO: delay execution of cleanup */ + return; + } + } + } + + if (c->tagpool) { + p9_idpool_put(0, c->tagpool); /* free reserved tag 0 */ + p9_idpool_destroy(c->tagpool); + } + + /* free requests associated with tags */ + for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) { + for (col = 0; col < P9_ROW_MAXTAG; col++) { + kfree(c->reqs[row][col].wq); + kfree(c->reqs[row][col].tc); + kfree(c->reqs[row][col].rc); + } + kfree(c->reqs[row]); + } + c->max_tag = 0; +} + +/** + * p9_free_req - free a request and clean-up as necessary + * c: client state + * r: request to release + * + */ + +static void p9_free_req(struct p9_client *c, struct p9_req_t *r) +{ + int tag = r->tc->tag; + P9_DPRINTK(P9_DEBUG_MUX, "clnt %p req %p tag: %d\n", c, r, tag); + + r->status = REQ_STATUS_IDLE; + if (tag != P9_NOTAG && p9_idpool_check(tag, c->tagpool)) + p9_idpool_put(tag, c->tagpool); +} + +/** + * p9_client_cb - call back from transport to client + * c: client state + * req: request received + * + */ +void p9_client_cb(struct p9_client *c, struct p9_req_t *req) +{ + P9_DPRINTK(P9_DEBUG_MUX, " tag %d\n", req->tc->tag); + wake_up(req->wq); + P9_DPRINTK(P9_DEBUG_MUX, "wakeup: %d\n", req->tc->tag); +} +EXPORT_SYMBOL(p9_client_cb); + +/** + * p9_parse_header - parse header arguments out of a packet + * @pdu: packet to parse + * @size: size of packet + * @type: type of request + * @tag: tag of packet + * @rewind: set if we need to rewind offset afterwards + */ + +int +p9_parse_header(struct p9_fcall *pdu, int32_t *size, int8_t *type, int16_t *tag, + int rewind) +{ + int8_t r_type; + int16_t r_tag; + int32_t r_size; + int offset = pdu->offset; + int err; + + pdu->offset = 0; + if (pdu->size == 0) + pdu->size = 7; + + err = p9pdu_readf(pdu, 0, "dbw", &r_size, &r_type, &r_tag); + if (err) + goto rewind_and_exit; + + pdu->size = r_size; + pdu->id = r_type; + pdu->tag = r_tag; + + P9_DPRINTK(P9_DEBUG_9P, "<<< size=%d type: %d tag: %d\n", pdu->size, + pdu->id, pdu->tag); + + if (type) + *type = r_type; + if (tag) + *tag = r_tag; + if (size) + *size = r_size; + + +rewind_and_exit: + if (rewind) + pdu->offset = offset; + return err; +} +EXPORT_SYMBOL(p9_parse_header); + +/** + * p9_check_errors - check 9p packet for error return and process it + * @c: current client instance + * @req: request to parse and check for error conditions + * + * returns error code if one is discovered, otherwise returns 0 + * + * this will have to be more complicated if we have multiple + * error packet types + */ + +static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) +{ + int8_t type; + int err; + int ecode; + + err = p9_parse_header(req->rc, 
NULL, &type, NULL, 0); + if (err) { + P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse header %d\n", err); + return err; + } + + if (type != P9_RERROR && type != P9_RLERROR) + return 0; + + if (!p9_is_proto_dotl(c)) { + char *ename; + + if (req->tc->pbuf_size) { + /* Handle user buffers */ + size_t len = req->rc->size - req->rc->offset; + if (req->tc->pubuf) { + /* User Buffer */ + err = copy_from_user( + &req->rc->sdata[req->rc->offset], + req->tc->pubuf, len); + if (err) { + err = -EFAULT; + goto out_err; + } + } else { + /* Kernel Buffer */ + memmove(&req->rc->sdata[req->rc->offset], + req->tc->pkbuf, len); + } + } + err = p9pdu_readf(req->rc, c->proto_version, "s?d", + &ename, &ecode); + if (err) + goto out_err; + + if (p9_is_proto_dotu(c)) + err = -ecode; + + if (!err || !IS_ERR_VALUE(err)) { + err = p9_errstr2errno(ename, strlen(ename)); + + P9_DPRINTK(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", -ecode, + ename); + + kfree(ename); + } + } else { + err = p9pdu_readf(req->rc, c->proto_version, "d", &ecode); + err = -ecode; + + P9_DPRINTK(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode); + } + + + return err; + +out_err: + P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse error%d\n", err); + + return err; +} + +static struct p9_req_t * +p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...); + +/** + * p9_client_flush - flush (cancel) a request + * @c: client state + * @oldreq: request to cancel + * + * This sents a flush for a particular request and links + * the flush request to the original request. The current + * code only supports a single flush request although the protocol + * allows for multiple flush requests to be sent for a single request. + * + */ + +static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq) +{ + struct p9_req_t *req; + int16_t oldtag; + int err; + + err = p9_parse_header(oldreq->tc, NULL, NULL, &oldtag, 1); + if (err) + return err; + + P9_DPRINTK(P9_DEBUG_9P, ">>> TFLUSH tag %d\n", oldtag); + + req = p9_client_rpc(c, P9_TFLUSH, "w", oldtag); + if (IS_ERR(req)) + return PTR_ERR(req); + + + /* if we haven't received a response for oldreq, + remove it from the list. */ + spin_lock(&c->lock); + if (oldreq->status == REQ_STATUS_FLSH) + list_del(&oldreq->req_list); + spin_unlock(&c->lock); + + p9_free_req(c, req); + return 0; +} + +/** + * p9_client_rpc - issue a request and wait for a response + * @c: client session + * @type: type of request + * @fmt: protocol format string (see protocol.c) + * + * Returns request structure (which client must free using p9_free_req) + */ + +static struct p9_req_t * +p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) 
+{
+ va_list ap;
+ int tag, err;
+ struct p9_req_t *req;
+ unsigned long flags;
+ int sigpending;
+
+ P9_DPRINTK(P9_DEBUG_MUX, "client %p op %d\n", c, type);
+
+ /* we allow for any status other than disconnected */
+ if (c->status == Disconnected)
+ return ERR_PTR(-EIO);
+
+ /* if status is begin_disconnected we allow only clunk request */
+ if ((c->status == BeginDisconnect) && (type != P9_TCLUNK))
+ return ERR_PTR(-EIO);
+
+ if (signal_pending(current)) {
+ sigpending = 1;
+ clear_thread_flag(TIF_SIGPENDING);
+ } else
+ sigpending = 0;
+
+ tag = P9_NOTAG;
+ if (type != P9_TVERSION) {
+ tag = p9_idpool_get(c->tagpool);
+ if (tag < 0)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ req = p9_tag_alloc(c, tag);
+ if (IS_ERR(req))
+ return req;
+
+ /* marshall the data */
+ p9pdu_prepare(req->tc, tag, type);
+ va_start(ap, fmt);
+ err = p9pdu_vwritef(req->tc, c->proto_version, fmt, ap);
+ va_end(ap);
+ if (err)
+ goto reterr;
+ p9pdu_finalize(req->tc);
+
+ err = c->trans_mod->request(c, req);
+ if (err < 0) {
+ if (err != -ERESTARTSYS && err != -EFAULT)
+ c->status = Disconnected;
+ goto reterr;
+ }
+
+ P9_DPRINTK(P9_DEBUG_MUX, "wait %p tag: %d\n", req->wq, tag);
+ err = wait_event_interruptible(*req->wq,
+ req->status >= REQ_STATUS_RCVD);
+ P9_DPRINTK(P9_DEBUG_MUX, "wait %p tag: %d returned %d\n",
+ req->wq, tag, err);
+
+ if (req->status == REQ_STATUS_ERROR) {
+ P9_DPRINTK(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err);
+ err = req->t_err;
+ }
+
+ if ((err == -ERESTARTSYS) && (c->status == Connected)) {
+ P9_DPRINTK(P9_DEBUG_MUX, "flushing\n");
+ sigpending = 1;
+ clear_thread_flag(TIF_SIGPENDING);
+
+ if (c->trans_mod->cancel(c, req))
+ p9_client_flush(c, req);
+
+ /* if we received the response anyway, don't signal error */
+ if (req->status == REQ_STATUS_RCVD)
+ err = 0;
+ }
+
+ if (sigpending) {
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ recalc_sigpending();
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
+ }
+
+ if (err < 0)
+ goto reterr;
+
+ err = p9_check_errors(c, req);
+ if (!err) {
+ P9_DPRINTK(P9_DEBUG_MUX, "exit: client %p op %d\n", c, type);
+ return req;
+ }
+
+reterr:
+ P9_DPRINTK(P9_DEBUG_MUX, "exit: client %p op %d error: %d\n", c, type,
+ err);
+ p9_free_req(c, req);
+ return ERR_PTR(err);
+}
+
+static struct p9_fid *p9_fid_create(struct p9_client *clnt)
+{
+ int ret;
+ struct p9_fid *fid;
+ unsigned long flags;
+
+ P9_DPRINTK(P9_DEBUG_FID, "clnt %p\n", clnt);
+ fid = kmalloc(sizeof(struct p9_fid), GFP_KERNEL);
+ if (!fid)
+ return ERR_PTR(-ENOMEM);
+
+ ret = p9_idpool_get(clnt->fidpool);
+ if (ret < 0) {
+ ret = -ENOSPC;
+ goto error;
+ }
+ fid->fid = ret;
+
+ memset(&fid->qid, 0, sizeof(struct p9_qid));
+ fid->mode = -1;
+ fid->uid = current_fsuid();
+ fid->clnt = clnt;
+ fid->rdir = NULL;
+ spin_lock_irqsave(&clnt->lock, flags);
+ list_add(&fid->flist, &clnt->fidlist);
+ spin_unlock_irqrestore(&clnt->lock, flags);
+
+ return fid;
+
+error:
+ kfree(fid);
+ return ERR_PTR(ret);
+}
+
+static void p9_fid_destroy(struct p9_fid *fid)
+{
+ struct p9_client *clnt;
+ unsigned long flags;
+
+ P9_DPRINTK(P9_DEBUG_FID, "fid %d\n", fid->fid);
+ clnt = fid->clnt;
+ p9_idpool_put(fid->fid, clnt->fidpool);
+ spin_lock_irqsave(&clnt->lock, flags);
+ list_del(&fid->flist);
+ spin_unlock_irqrestore(&clnt->lock, flags);
+ kfree(fid->rdir);
+ kfree(fid);
+}
+
+static int p9_client_version(struct p9_client *c)
+{
+ int err = 0;
+ struct p9_req_t *req;
+ char *version;
+ int msize;
+
+ P9_DPRINTK(P9_DEBUG_9P, ">>> TVERSION msize %d protocol %d\n",
+ c->msize, c->proto_version);
+
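+ /*
+ * Propose the variant this client was configured for; the reply is
+ * parsed below and may negotiate downwards, resetting proto_version
+ * from the returned version string and clamping msize to what the
+ * server granted (e.g., assuming a client asks for "9P2000.L" and the
+ * server answers "9P2000.u", the session falls back to 9P2000.u).
+ */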
switch (c->proto_version) { + case p9_proto_2000L: + req = p9_client_rpc(c, P9_TVERSION, "ds", + c->msize, "9P2000.L"); + break; + case p9_proto_2000u: + req = p9_client_rpc(c, P9_TVERSION, "ds", + c->msize, "9P2000.u"); + break; + case p9_proto_legacy: + req = p9_client_rpc(c, P9_TVERSION, "ds", + c->msize, "9P2000"); + break; + default: + return -EINVAL; + break; + } + + if (IS_ERR(req)) + return PTR_ERR(req); + + err = p9pdu_readf(req->rc, c->proto_version, "ds", &msize, &version); + if (err) { + P9_DPRINTK(P9_DEBUG_9P, "version error %d\n", err); + p9pdu_dump(1, req->rc); + goto error; + } + + P9_DPRINTK(P9_DEBUG_9P, "<<< RVERSION msize %d %s\n", msize, version); + if (!strncmp(version, "9P2000.L", 8)) + c->proto_version = p9_proto_2000L; + else if (!strncmp(version, "9P2000.u", 8)) + c->proto_version = p9_proto_2000u; + else if (!strncmp(version, "9P2000", 6)) + c->proto_version = p9_proto_legacy; + else { + err = -EREMOTEIO; + goto error; + } + + if (msize < c->msize) + c->msize = msize; + +error: + kfree(version); + p9_free_req(c, req); + + return err; +} + +struct p9_client *p9_client_create(const char *dev_name, char *options) +{ + int err; + struct p9_client *clnt; + + err = 0; + clnt = kmalloc(sizeof(struct p9_client), GFP_KERNEL); + if (!clnt) + return ERR_PTR(-ENOMEM); + + clnt->trans_mod = NULL; + clnt->trans = NULL; + spin_lock_init(&clnt->lock); + INIT_LIST_HEAD(&clnt->fidlist); + + err = p9_tag_init(clnt); + if (err < 0) + goto free_client; + + err = parse_opts(options, clnt); + if (err < 0) + goto destroy_tagpool; + + if (!clnt->trans_mod) + clnt->trans_mod = v9fs_get_default_trans(); + + if (clnt->trans_mod == NULL) { + err = -EPROTONOSUPPORT; + P9_DPRINTK(P9_DEBUG_ERROR, + "No transport defined or default transport\n"); + goto destroy_tagpool; + } + + clnt->fidpool = p9_idpool_create(); + if (IS_ERR(clnt->fidpool)) { + err = PTR_ERR(clnt->fidpool); + goto put_trans; + } + + P9_DPRINTK(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n", + clnt, clnt->trans_mod, clnt->msize, clnt->proto_version); + + err = clnt->trans_mod->create(clnt, dev_name, options); + if (err) + goto destroy_fidpool; + + if (clnt->msize > clnt->trans_mod->maxsize) + clnt->msize = clnt->trans_mod->maxsize; + + err = p9_client_version(clnt); + if (err) + goto close_trans; + + return clnt; + +close_trans: + clnt->trans_mod->close(clnt); +destroy_fidpool: + p9_idpool_destroy(clnt->fidpool); +put_trans: + v9fs_put_trans(clnt->trans_mod); +destroy_tagpool: + p9_idpool_destroy(clnt->tagpool); +free_client: + kfree(clnt); + return ERR_PTR(err); +} +EXPORT_SYMBOL(p9_client_create); + +void p9_client_destroy(struct p9_client *clnt) +{ + struct p9_fid *fid, *fidptr; + + P9_DPRINTK(P9_DEBUG_MUX, "clnt %p\n", clnt); + + if (clnt->trans_mod) + clnt->trans_mod->close(clnt); + + v9fs_put_trans(clnt->trans_mod); + + list_for_each_entry_safe(fid, fidptr, &clnt->fidlist, flist) { + printk(KERN_INFO "Found fid %d not clunked\n", fid->fid); + p9_fid_destroy(fid); + } + + if (clnt->fidpool) + p9_idpool_destroy(clnt->fidpool); + + p9_tag_cleanup(clnt); + + kfree(clnt); +} +EXPORT_SYMBOL(p9_client_destroy); + +void p9_client_disconnect(struct p9_client *clnt) +{ + P9_DPRINTK(P9_DEBUG_9P, "clnt %p\n", clnt); + clnt->status = Disconnected; +} +EXPORT_SYMBOL(p9_client_disconnect); + +void p9_client_begin_disconnect(struct p9_client *clnt) +{ + P9_DPRINTK(P9_DEBUG_9P, "clnt %p\n", clnt); + clnt->status = BeginDisconnect; +} +EXPORT_SYMBOL(p9_client_begin_disconnect); + +struct p9_fid *p9_client_attach(struct p9_client 
*clnt, struct p9_fid *afid, + char *uname, u32 n_uname, char *aname) +{ + int err; + struct p9_req_t *req; + struct p9_fid *fid; + struct p9_qid qid; + + P9_DPRINTK(P9_DEBUG_9P, ">>> TATTACH afid %d uname %s aname %s\n", + afid ? afid->fid : -1, uname, aname); + err = 0; + + fid = p9_fid_create(clnt); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + fid = NULL; + goto error; + } + + req = p9_client_rpc(clnt, P9_TATTACH, "ddss?d", fid->fid, + afid ? afid->fid : P9_NOFID, uname, aname, n_uname); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + err = p9pdu_readf(req->rc, clnt->proto_version, "Q", &qid); + if (err) { + p9pdu_dump(1, req->rc); + p9_free_req(clnt, req); + goto error; + } + + P9_DPRINTK(P9_DEBUG_9P, "<<< RATTACH qid %x.%llx.%x\n", + qid.type, + (unsigned long long)qid.path, + qid.version); + + memmove(&fid->qid, &qid, sizeof(struct p9_qid)); + + p9_free_req(clnt, req); + return fid; + +error: + if (fid) + p9_fid_destroy(fid); + return ERR_PTR(err); +} +EXPORT_SYMBOL(p9_client_attach); + +struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname, + char **wnames, int clone) +{ + int err; + struct p9_client *clnt; + struct p9_fid *fid; + struct p9_qid *wqids; + struct p9_req_t *req; + uint16_t nwqids, count; + + err = 0; + wqids = NULL; + clnt = oldfid->clnt; + if (clone) { + fid = p9_fid_create(clnt); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + fid = NULL; + goto error; + } + + fid->uid = oldfid->uid; + } else + fid = oldfid; + + + P9_DPRINTK(P9_DEBUG_9P, ">>> TWALK fids %d,%d nwname %ud wname[0] %s\n", + oldfid->fid, fid->fid, nwname, wnames ? wnames[0] : NULL); + + req = p9_client_rpc(clnt, P9_TWALK, "ddT", oldfid->fid, fid->fid, + nwname, wnames); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + err = p9pdu_readf(req->rc, clnt->proto_version, "R", &nwqids, &wqids); + if (err) { + p9pdu_dump(1, req->rc); + p9_free_req(clnt, req); + goto clunk_fid; + } + p9_free_req(clnt, req); + + P9_DPRINTK(P9_DEBUG_9P, "<<< RWALK nwqid %d:\n", nwqids); + + if (nwqids != nwname) { + err = -ENOENT; + goto clunk_fid; + } + + for (count = 0; count < nwqids; count++) + P9_DPRINTK(P9_DEBUG_9P, "<<< [%d] %x.%llx.%x\n", + count, wqids[count].type, + (unsigned long long)wqids[count].path, + wqids[count].version); + + if (nwname) + memmove(&fid->qid, &wqids[nwqids - 1], sizeof(struct p9_qid)); + else + fid->qid = oldfid->qid; + + kfree(wqids); + return fid; + +clunk_fid: + kfree(wqids); + p9_client_clunk(fid); + fid = NULL; + +error: + if (fid && (fid != oldfid)) + p9_fid_destroy(fid); + + return ERR_PTR(err); +} +EXPORT_SYMBOL(p9_client_walk); + +int p9_client_open(struct p9_fid *fid, int mode) +{ + int err; + struct p9_client *clnt; + struct p9_req_t *req; + struct p9_qid qid; + int iounit; + + clnt = fid->clnt; + P9_DPRINTK(P9_DEBUG_9P, ">>> %s fid %d mode %d\n", + p9_is_proto_dotl(clnt) ? "TLOPEN" : "TOPEN", fid->fid, mode); + err = 0; + + if (fid->mode != -1) + return -EINVAL; + + if (p9_is_proto_dotl(clnt)) + req = p9_client_rpc(clnt, P9_TLOPEN, "dd", fid->fid, mode); + else + req = p9_client_rpc(clnt, P9_TOPEN, "db", fid->fid, mode); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit); + if (err) { + p9pdu_dump(1, req->rc); + goto free_and_error; + } + + P9_DPRINTK(P9_DEBUG_9P, "<<< %s qid %x.%llx.%x iounit %x\n", + p9_is_proto_dotl(clnt) ? 
"RLOPEN" : "ROPEN", qid.type, + (unsigned long long)qid.path, qid.version, iounit); + + fid->mode = mode; + fid->iounit = iounit; + +free_and_error: + p9_free_req(clnt, req); +error: + return err; +} +EXPORT_SYMBOL(p9_client_open); + +int p9_client_create_dotl(struct p9_fid *ofid, char *name, u32 flags, u32 mode, + gid_t gid, struct p9_qid *qid) +{ + int err = 0; + struct p9_client *clnt; + struct p9_req_t *req; + int iounit; + + P9_DPRINTK(P9_DEBUG_9P, + ">>> TLCREATE fid %d name %s flags %d mode %d gid %d\n", + ofid->fid, name, flags, mode, gid); + clnt = ofid->clnt; + + if (ofid->mode != -1) + return -EINVAL; + + req = p9_client_rpc(clnt, P9_TLCREATE, "dsddd", ofid->fid, name, flags, + mode, gid); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", qid, &iounit); + if (err) { + p9pdu_dump(1, req->rc); + goto free_and_error; + } + + P9_DPRINTK(P9_DEBUG_9P, "<<< RLCREATE qid %x.%llx.%x iounit %x\n", + qid->type, + (unsigned long long)qid->path, + qid->version, iounit); + + ofid->mode = mode; + ofid->iounit = iounit; + +free_and_error: + p9_free_req(clnt, req); +error: + return err; +} +EXPORT_SYMBOL(p9_client_create_dotl); + +int p9_client_fcreate(struct p9_fid *fid, char *name, u32 perm, int mode, + char *extension) +{ + int err; + struct p9_client *clnt; + struct p9_req_t *req; + struct p9_qid qid; + int iounit; + + P9_DPRINTK(P9_DEBUG_9P, ">>> TCREATE fid %d name %s perm %d mode %d\n", + fid->fid, name, perm, mode); + err = 0; + clnt = fid->clnt; + + if (fid->mode != -1) + return -EINVAL; + + req = p9_client_rpc(clnt, P9_TCREATE, "dsdb?s", fid->fid, name, perm, + mode, extension); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit); + if (err) { + p9pdu_dump(1, req->rc); + goto free_and_error; + } + + P9_DPRINTK(P9_DEBUG_9P, "<<< RCREATE qid %x.%llx.%x iounit %x\n", + qid.type, + (unsigned long long)qid.path, + qid.version, iounit); + + fid->mode = mode; + fid->iounit = iounit; + +free_and_error: + p9_free_req(clnt, req); +error: + return err; +} +EXPORT_SYMBOL(p9_client_fcreate); + +int p9_client_symlink(struct p9_fid *dfid, char *name, char *symtgt, gid_t gid, + struct p9_qid *qid) +{ + int err = 0; + struct p9_client *clnt; + struct p9_req_t *req; + + P9_DPRINTK(P9_DEBUG_9P, ">>> TSYMLINK dfid %d name %s symtgt %s\n", + dfid->fid, name, symtgt); + clnt = dfid->clnt; + + req = p9_client_rpc(clnt, P9_TSYMLINK, "dssd", dfid->fid, name, symtgt, + gid); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid); + if (err) { + p9pdu_dump(1, req->rc); + goto free_and_error; + } + + P9_DPRINTK(P9_DEBUG_9P, "<<< RSYMLINK qid %x.%llx.%x\n", + qid->type, (unsigned long long)qid->path, qid->version); + +free_and_error: + p9_free_req(clnt, req); +error: + return err; +} +EXPORT_SYMBOL(p9_client_symlink); + +int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, char *newname) +{ + struct p9_client *clnt; + struct p9_req_t *req; + + P9_DPRINTK(P9_DEBUG_9P, ">>> TLINK dfid %d oldfid %d newname %s\n", + dfid->fid, oldfid->fid, newname); + clnt = dfid->clnt; + req = p9_client_rpc(clnt, P9_TLINK, "dds", dfid->fid, oldfid->fid, + newname); + if (IS_ERR(req)) + return PTR_ERR(req); + + P9_DPRINTK(P9_DEBUG_9P, "<<< RLINK\n"); + p9_free_req(clnt, req); + return 0; +} +EXPORT_SYMBOL(p9_client_link); + +int p9_client_fsync(struct p9_fid *fid, int datasync) +{ + int err; + struct p9_client 
*clnt; + struct p9_req_t *req; + + P9_DPRINTK(P9_DEBUG_9P, ">>> TFSYNC fid %d datasync:%d\n", + fid->fid, datasync); + err = 0; + clnt = fid->clnt; + + req = p9_client_rpc(clnt, P9_TFSYNC, "dd", fid->fid, datasync); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + P9_DPRINTK(P9_DEBUG_9P, "<<< RFSYNC fid %d\n", fid->fid); + + p9_free_req(clnt, req); + +error: + return err; +} +EXPORT_SYMBOL(p9_client_fsync); + +int p9_client_clunk(struct p9_fid *fid) +{ + int err; + struct p9_client *clnt; + struct p9_req_t *req; + + if (!fid) { + P9_EPRINTK(KERN_WARNING, "Trying to clunk with NULL fid\n"); + dump_stack(); + return 0; + } + + P9_DPRINTK(P9_DEBUG_9P, ">>> TCLUNK fid %d\n", fid->fid); + err = 0; + clnt = fid->clnt; + + req = p9_client_rpc(clnt, P9_TCLUNK, "d", fid->fid); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + P9_DPRINTK(P9_DEBUG_9P, "<<< RCLUNK fid %d\n", fid->fid); + + p9_free_req(clnt, req); +error: + /* + * Fid is not valid even after a failed clunk + */ + p9_fid_destroy(fid); + return err; +} +EXPORT_SYMBOL(p9_client_clunk); + +int p9_client_remove(struct p9_fid *fid) +{ + int err; + struct p9_client *clnt; + struct p9_req_t *req; + + P9_DPRINTK(P9_DEBUG_9P, ">>> TREMOVE fid %d\n", fid->fid); + err = 0; + clnt = fid->clnt; + + req = p9_client_rpc(clnt, P9_TREMOVE, "d", fid->fid); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + P9_DPRINTK(P9_DEBUG_9P, "<<< RREMOVE fid %d\n", fid->fid); + + p9_free_req(clnt, req); +error: + p9_fid_destroy(fid); + return err; +} +EXPORT_SYMBOL(p9_client_remove); + +int +p9_client_read(struct p9_fid *fid, char *data, char __user *udata, u64 offset, + u32 count) +{ + int err, rsize; + struct p9_client *clnt; + struct p9_req_t *req; + char *dataptr; + + P9_DPRINTK(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %d\n", fid->fid, + (long long unsigned) offset, count); + err = 0; + clnt = fid->clnt; + + rsize = fid->iounit; + if (!rsize || rsize > clnt->msize-P9_IOHDRSZ) + rsize = clnt->msize - P9_IOHDRSZ; + + if (count < rsize) + rsize = count; + + /* Don't bother zerocopy for small IO (< 1024) */ + if (((clnt->trans_mod->pref & P9_TRANS_PREF_PAYLOAD_MASK) == + P9_TRANS_PREF_PAYLOAD_SEP) && (rsize > 1024)) { + req = p9_client_rpc(clnt, P9_TREAD, "dqE", fid->fid, offset, + rsize, data, udata); + } else { + req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset, + rsize); + } + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + err = p9pdu_readf(req->rc, clnt->proto_version, "D", &count, &dataptr); + if (err) { + p9pdu_dump(1, req->rc); + goto free_and_error; + } + + P9_DPRINTK(P9_DEBUG_9P, "<<< RREAD count %d\n", count); + + if (!req->tc->pbuf_size) { + if (data) { + memmove(data, dataptr, count); + } else { + err = copy_to_user(udata, dataptr, count); + if (err) { + err = -EFAULT; + goto free_and_error; + } + } + } + p9_free_req(clnt, req); + return count; + +free_and_error: + p9_free_req(clnt, req); +error: + return err; +} +EXPORT_SYMBOL(p9_client_read); + +int +p9_client_write(struct p9_fid *fid, char *data, const char __user *udata, + u64 offset, u32 count) +{ + int err, rsize; + struct p9_client *clnt; + struct p9_req_t *req; + + P9_DPRINTK(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %d\n", + fid->fid, (long long unsigned) offset, count); + err = 0; + clnt = fid->clnt; + + rsize = fid->iounit; + if (!rsize || rsize > clnt->msize-P9_IOHDRSZ) + rsize = clnt->msize - P9_IOHDRSZ; + + if (count < rsize) + rsize = count; + + /* Don't bother zerocopy form small IO (< 1024) */ + if 
(((clnt->trans_mod->pref & P9_TRANS_PREF_PAYLOAD_MASK) == + P9_TRANS_PREF_PAYLOAD_SEP) && (rsize > 1024)) { + req = p9_client_rpc(clnt, P9_TWRITE, "dqE", fid->fid, offset, + rsize, data, udata); + } else { + + if (data) + req = p9_client_rpc(clnt, P9_TWRITE, "dqD", fid->fid, + offset, rsize, data); + else + req = p9_client_rpc(clnt, P9_TWRITE, "dqU", fid->fid, + offset, rsize, udata); + } + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + err = p9pdu_readf(req->rc, clnt->proto_version, "d", &count); + if (err) { + p9pdu_dump(1, req->rc); + goto free_and_error; + } + + P9_DPRINTK(P9_DEBUG_9P, "<<< RWRITE count %d\n", count); + + p9_free_req(clnt, req); + return count; + +free_and_error: + p9_free_req(clnt, req); +error: + return err; +} +EXPORT_SYMBOL(p9_client_write); + +struct p9_wstat *p9_client_stat(struct p9_fid *fid) +{ + int err; + struct p9_client *clnt; + struct p9_wstat *ret = kmalloc(sizeof(struct p9_wstat), GFP_KERNEL); + struct p9_req_t *req; + u16 ignored; + + P9_DPRINTK(P9_DEBUG_9P, ">>> TSTAT fid %d\n", fid->fid); + + if (!ret) + return ERR_PTR(-ENOMEM); + + err = 0; + clnt = fid->clnt; + + req = p9_client_rpc(clnt, P9_TSTAT, "d", fid->fid); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + err = p9pdu_readf(req->rc, clnt->proto_version, "wS", &ignored, ret); + if (err) { + p9pdu_dump(1, req->rc); + p9_free_req(clnt, req); + goto error; + } + + P9_DPRINTK(P9_DEBUG_9P, + "<<< RSTAT sz=%x type=%x dev=%x qid=%x.%llx.%x\n" + "<<< mode=%8.8x atime=%8.8x mtime=%8.8x length=%llx\n" + "<<< name=%s uid=%s gid=%s muid=%s extension=(%s)\n" + "<<< uid=%d gid=%d n_muid=%d\n", + ret->size, ret->type, ret->dev, ret->qid.type, + (unsigned long long)ret->qid.path, ret->qid.version, ret->mode, + ret->atime, ret->mtime, (unsigned long long)ret->length, + ret->name, ret->uid, ret->gid, ret->muid, ret->extension, + ret->n_uid, ret->n_gid, ret->n_muid); + + p9_free_req(clnt, req); + return ret; + +error: + kfree(ret); + return ERR_PTR(err); +} +EXPORT_SYMBOL(p9_client_stat); + +struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid, + u64 request_mask) +{ + int err; + struct p9_client *clnt; + struct p9_stat_dotl *ret = kmalloc(sizeof(struct p9_stat_dotl), + GFP_KERNEL); + struct p9_req_t *req; + + P9_DPRINTK(P9_DEBUG_9P, ">>> TGETATTR fid %d, request_mask %lld\n", + fid->fid, request_mask); + + if (!ret) + return ERR_PTR(-ENOMEM); + + err = 0; + clnt = fid->clnt; + + req = p9_client_rpc(clnt, P9_TGETATTR, "dq", fid->fid, request_mask); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + err = p9pdu_readf(req->rc, clnt->proto_version, "A", ret); + if (err) { + p9pdu_dump(1, req->rc); + p9_free_req(clnt, req); + goto error; + } + + P9_DPRINTK(P9_DEBUG_9P, + "<<< RGETATTR st_result_mask=%lld\n" + "<<< qid=%x.%llx.%x\n" + "<<< st_mode=%8.8x st_nlink=%llu\n" + "<<< st_uid=%d st_gid=%d\n" + "<<< st_rdev=%llx st_size=%llx st_blksize=%llu st_blocks=%llu\n" + "<<< st_atime_sec=%lld st_atime_nsec=%lld\n" + "<<< st_mtime_sec=%lld st_mtime_nsec=%lld\n" + "<<< st_ctime_sec=%lld st_ctime_nsec=%lld\n" + "<<< st_btime_sec=%lld st_btime_nsec=%lld\n" + "<<< st_gen=%lld st_data_version=%lld", + ret->st_result_mask, ret->qid.type, ret->qid.path, + ret->qid.version, ret->st_mode, ret->st_nlink, ret->st_uid, + ret->st_gid, ret->st_rdev, ret->st_size, ret->st_blksize, + ret->st_blocks, ret->st_atime_sec, ret->st_atime_nsec, + ret->st_mtime_sec, ret->st_mtime_nsec, ret->st_ctime_sec, + ret->st_ctime_nsec, ret->st_btime_sec, ret->st_btime_nsec, + ret->st_gen, 
ret->st_data_version); + + p9_free_req(clnt, req); + return ret; + +error: + kfree(ret); + return ERR_PTR(err); +} +EXPORT_SYMBOL(p9_client_getattr_dotl); + +static int p9_client_statsize(struct p9_wstat *wst, int proto_version) +{ + int ret; + + /* NOTE: size shouldn't include its own length */ + /* size[2] type[2] dev[4] qid[13] */ + /* mode[4] atime[4] mtime[4] length[8]*/ + /* name[s] uid[s] gid[s] muid[s] */ + ret = 2+4+13+4+4+4+8+2+2+2+2; + + if (wst->name) + ret += strlen(wst->name); + if (wst->uid) + ret += strlen(wst->uid); + if (wst->gid) + ret += strlen(wst->gid); + if (wst->muid) + ret += strlen(wst->muid); + + if ((proto_version == p9_proto_2000u) || + (proto_version == p9_proto_2000L)) { + ret += 2+4+4+4; /* extension[s] n_uid[4] n_gid[4] n_muid[4] */ + if (wst->extension) + ret += strlen(wst->extension); + } + + return ret; +} + +int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst) +{ + int err; + struct p9_req_t *req; + struct p9_client *clnt; + + err = 0; + clnt = fid->clnt; + wst->size = p9_client_statsize(wst, clnt->proto_version); + P9_DPRINTK(P9_DEBUG_9P, ">>> TWSTAT fid %d\n", fid->fid); + P9_DPRINTK(P9_DEBUG_9P, + " sz=%x type=%x dev=%x qid=%x.%llx.%x\n" + " mode=%8.8x atime=%8.8x mtime=%8.8x length=%llx\n" + " name=%s uid=%s gid=%s muid=%s extension=(%s)\n" + " uid=%d gid=%d n_muid=%d\n", + wst->size, wst->type, wst->dev, wst->qid.type, + (unsigned long long)wst->qid.path, wst->qid.version, wst->mode, + wst->atime, wst->mtime, (unsigned long long)wst->length, + wst->name, wst->uid, wst->gid, wst->muid, wst->extension, + wst->n_uid, wst->n_gid, wst->n_muid); + + req = p9_client_rpc(clnt, P9_TWSTAT, "dwS", fid->fid, wst->size+2, wst); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + P9_DPRINTK(P9_DEBUG_9P, "<<< RWSTAT fid %d\n", fid->fid); + + p9_free_req(clnt, req); +error: + return err; +} +EXPORT_SYMBOL(p9_client_wstat); + +int p9_client_setattr(struct p9_fid *fid, struct p9_iattr_dotl *p9attr) +{ + int err; + struct p9_req_t *req; + struct p9_client *clnt; + + err = 0; + clnt = fid->clnt; + P9_DPRINTK(P9_DEBUG_9P, ">>> TSETATTR fid %d\n", fid->fid); + P9_DPRINTK(P9_DEBUG_9P, + " valid=%x mode=%x uid=%d gid=%d size=%lld\n" + " atime_sec=%lld atime_nsec=%lld\n" + " mtime_sec=%lld mtime_nsec=%lld\n", + p9attr->valid, p9attr->mode, p9attr->uid, p9attr->gid, + p9attr->size, p9attr->atime_sec, p9attr->atime_nsec, + p9attr->mtime_sec, p9attr->mtime_nsec); + + req = p9_client_rpc(clnt, P9_TSETATTR, "dI", fid->fid, p9attr); + + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + P9_DPRINTK(P9_DEBUG_9P, "<<< RSETATTR fid %d\n", fid->fid); + p9_free_req(clnt, req); +error: + return err; +} +EXPORT_SYMBOL(p9_client_setattr); + +int p9_client_statfs(struct p9_fid *fid, struct p9_rstatfs *sb) +{ + int err; + struct p9_req_t *req; + struct p9_client *clnt; + + err = 0; + clnt = fid->clnt; + + P9_DPRINTK(P9_DEBUG_9P, ">>> TSTATFS fid %d\n", fid->fid); + + req = p9_client_rpc(clnt, P9_TSTATFS, "d", fid->fid); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + err = p9pdu_readf(req->rc, clnt->proto_version, "ddqqqqqqd", &sb->type, + &sb->bsize, &sb->blocks, &sb->bfree, &sb->bavail, + &sb->files, &sb->ffree, &sb->fsid, &sb->namelen); + if (err) { + p9pdu_dump(1, req->rc); + p9_free_req(clnt, req); + goto error; + } + + P9_DPRINTK(P9_DEBUG_9P, "<<< RSTATFS fid %d type 0x%lx bsize %ld " + "blocks %llu bfree %llu bavail %llu files %llu ffree %llu " + "fsid %llu namelen %ld\n", + fid->fid, (long unsigned int)sb->type, (long int)sb->bsize, 
+ sb->blocks, sb->bfree, sb->bavail, sb->files, sb->ffree, + sb->fsid, (long int)sb->namelen); + + p9_free_req(clnt, req); +error: + return err; +} +EXPORT_SYMBOL(p9_client_statfs); + +int p9_client_rename(struct p9_fid *fid, struct p9_fid *newdirfid, char *name) +{ + int err; + struct p9_req_t *req; + struct p9_client *clnt; + + err = 0; + clnt = fid->clnt; + + P9_DPRINTK(P9_DEBUG_9P, ">>> TRENAME fid %d newdirfid %d name %s\n", + fid->fid, newdirfid->fid, name); + + req = p9_client_rpc(clnt, P9_TRENAME, "dds", fid->fid, + newdirfid->fid, name); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + P9_DPRINTK(P9_DEBUG_9P, "<<< RRENAME fid %d\n", fid->fid); + + p9_free_req(clnt, req); +error: + return err; +} +EXPORT_SYMBOL(p9_client_rename); + +/* + * An xattrwalk without @attr_name gives the fid for the lisxattr namespace + */ +struct p9_fid *p9_client_xattrwalk(struct p9_fid *file_fid, + const char *attr_name, u64 *attr_size) +{ + int err; + struct p9_req_t *req; + struct p9_client *clnt; + struct p9_fid *attr_fid; + + err = 0; + clnt = file_fid->clnt; + attr_fid = p9_fid_create(clnt); + if (IS_ERR(attr_fid)) { + err = PTR_ERR(attr_fid); + attr_fid = NULL; + goto error; + } + P9_DPRINTK(P9_DEBUG_9P, + ">>> TXATTRWALK file_fid %d, attr_fid %d name %s\n", + file_fid->fid, attr_fid->fid, attr_name); + + req = p9_client_rpc(clnt, P9_TXATTRWALK, "dds", + file_fid->fid, attr_fid->fid, attr_name); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + err = p9pdu_readf(req->rc, clnt->proto_version, "q", attr_size); + if (err) { + p9pdu_dump(1, req->rc); + p9_free_req(clnt, req); + goto clunk_fid; + } + p9_free_req(clnt, req); + P9_DPRINTK(P9_DEBUG_9P, "<<< RXATTRWALK fid %d size %llu\n", + attr_fid->fid, *attr_size); + return attr_fid; +clunk_fid: + p9_client_clunk(attr_fid); + attr_fid = NULL; +error: + if (attr_fid && (attr_fid != file_fid)) + p9_fid_destroy(attr_fid); + + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(p9_client_xattrwalk); + +int p9_client_xattrcreate(struct p9_fid *fid, const char *name, + u64 attr_size, int flags) +{ + int err; + struct p9_req_t *req; + struct p9_client *clnt; + + P9_DPRINTK(P9_DEBUG_9P, + ">>> TXATTRCREATE fid %d name %s size %lld flag %d\n", + fid->fid, name, (long long)attr_size, flags); + err = 0; + clnt = fid->clnt; + req = p9_client_rpc(clnt, P9_TXATTRCREATE, "dsqd", + fid->fid, name, attr_size, flags); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + P9_DPRINTK(P9_DEBUG_9P, "<<< RXATTRCREATE fid %d\n", fid->fid); + p9_free_req(clnt, req); +error: + return err; +} +EXPORT_SYMBOL_GPL(p9_client_xattrcreate); + +int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) +{ + int err, rsize; + struct p9_client *clnt; + struct p9_req_t *req; + char *dataptr; + + P9_DPRINTK(P9_DEBUG_9P, ">>> TREADDIR fid %d offset %llu count %d\n", + fid->fid, (long long unsigned) offset, count); + + err = 0; + clnt = fid->clnt; + + rsize = fid->iounit; + if (!rsize || rsize > clnt->msize-P9_READDIRHDRSZ) + rsize = clnt->msize - P9_READDIRHDRSZ; + + if (count < rsize) + rsize = count; + + if ((clnt->trans_mod->pref & P9_TRANS_PREF_PAYLOAD_MASK) == + P9_TRANS_PREF_PAYLOAD_SEP) { + req = p9_client_rpc(clnt, P9_TREADDIR, "dqF", fid->fid, + offset, rsize, data); + } else { + req = p9_client_rpc(clnt, P9_TREADDIR, "dqd", fid->fid, + offset, rsize); + } + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + err = p9pdu_readf(req->rc, clnt->proto_version, "D", &count, &dataptr); + if (err) { + p9pdu_dump(1, req->rc); + 
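+ /* the malformed RREADDIR payload was dumped above; release the
+ * request and return the error to the caller */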
goto free_and_error; + } + + P9_DPRINTK(P9_DEBUG_9P, "<<< RREADDIR count %d\n", count); + + if (!req->tc->pbuf_size && data) + memmove(data, dataptr, count); + + p9_free_req(clnt, req); + return count; + +free_and_error: + p9_free_req(clnt, req); +error: + return err; +} +EXPORT_SYMBOL(p9_client_readdir); + +int p9_client_mknod_dotl(struct p9_fid *fid, char *name, int mode, + dev_t rdev, gid_t gid, struct p9_qid *qid) +{ + int err; + struct p9_client *clnt; + struct p9_req_t *req; + + err = 0; + clnt = fid->clnt; + P9_DPRINTK(P9_DEBUG_9P, ">>> TMKNOD fid %d name %s mode %d major %d " + "minor %d\n", fid->fid, name, mode, MAJOR(rdev), MINOR(rdev)); + req = p9_client_rpc(clnt, P9_TMKNOD, "dsdddd", fid->fid, name, mode, + MAJOR(rdev), MINOR(rdev), gid); + if (IS_ERR(req)) + return PTR_ERR(req); + + err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid); + if (err) { + p9pdu_dump(1, req->rc); + goto error; + } + P9_DPRINTK(P9_DEBUG_9P, "<<< RMKNOD qid %x.%llx.%x\n", qid->type, + (unsigned long long)qid->path, qid->version); + +error: + p9_free_req(clnt, req); + return err; + +} +EXPORT_SYMBOL(p9_client_mknod_dotl); + +int p9_client_mkdir_dotl(struct p9_fid *fid, char *name, int mode, + gid_t gid, struct p9_qid *qid) +{ + int err; + struct p9_client *clnt; + struct p9_req_t *req; + + err = 0; + clnt = fid->clnt; + P9_DPRINTK(P9_DEBUG_9P, ">>> TMKDIR fid %d name %s mode %d gid %d\n", + fid->fid, name, mode, gid); + req = p9_client_rpc(clnt, P9_TMKDIR, "dsdd", fid->fid, name, mode, + gid); + if (IS_ERR(req)) + return PTR_ERR(req); + + err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid); + if (err) { + p9pdu_dump(1, req->rc); + goto error; + } + P9_DPRINTK(P9_DEBUG_9P, "<<< RMKDIR qid %x.%llx.%x\n", qid->type, + (unsigned long long)qid->path, qid->version); + +error: + p9_free_req(clnt, req); + return err; + +} +EXPORT_SYMBOL(p9_client_mkdir_dotl); + +int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status) +{ + int err; + struct p9_client *clnt; + struct p9_req_t *req; + + err = 0; + clnt = fid->clnt; + P9_DPRINTK(P9_DEBUG_9P, ">>> TLOCK fid %d type %i flags %d " + "start %lld length %lld proc_id %d client_id %s\n", + fid->fid, flock->type, flock->flags, flock->start, + flock->length, flock->proc_id, flock->client_id); + + req = p9_client_rpc(clnt, P9_TLOCK, "dbdqqds", fid->fid, flock->type, + flock->flags, flock->start, flock->length, + flock->proc_id, flock->client_id); + + if (IS_ERR(req)) + return PTR_ERR(req); + + err = p9pdu_readf(req->rc, clnt->proto_version, "b", status); + if (err) { + p9pdu_dump(1, req->rc); + goto error; + } + P9_DPRINTK(P9_DEBUG_9P, "<<< RLOCK status %i\n", *status); +error: + p9_free_req(clnt, req); + return err; + +} +EXPORT_SYMBOL(p9_client_lock_dotl); + +int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *glock) +{ + int err; + struct p9_client *clnt; + struct p9_req_t *req; + + err = 0; + clnt = fid->clnt; + P9_DPRINTK(P9_DEBUG_9P, ">>> TGETLOCK fid %d, type %i start %lld " + "length %lld proc_id %d client_id %s\n", fid->fid, glock->type, + glock->start, glock->length, glock->proc_id, glock->client_id); + + req = p9_client_rpc(clnt, P9_TGETLOCK, "dbqqds", fid->fid, glock->type, + glock->start, glock->length, glock->proc_id, glock->client_id); + + if (IS_ERR(req)) + return PTR_ERR(req); + + err = p9pdu_readf(req->rc, clnt->proto_version, "bqqds", &glock->type, + &glock->start, &glock->length, &glock->proc_id, + &glock->client_id); + if (err) { + p9pdu_dump(1, req->rc); + goto error; + } + P9_DPRINTK(P9_DEBUG_9P, "<<< 
RGETLOCK type %i start %lld length %lld " + "proc_id %d client_id %s\n", glock->type, glock->start, + glock->length, glock->proc_id, glock->client_id); +error: + p9_free_req(clnt, req); + return err; +} +EXPORT_SYMBOL(p9_client_getlock_dotl); + +int p9_client_readlink(struct p9_fid *fid, char **target) +{ + int err; + struct p9_client *clnt; + struct p9_req_t *req; + + err = 0; + clnt = fid->clnt; + P9_DPRINTK(P9_DEBUG_9P, ">>> TREADLINK fid %d\n", fid->fid); + + req = p9_client_rpc(clnt, P9_TREADLINK, "d", fid->fid); + if (IS_ERR(req)) + return PTR_ERR(req); + + err = p9pdu_readf(req->rc, clnt->proto_version, "s", target); + if (err) { + p9pdu_dump(1, req->rc); + goto error; + } + P9_DPRINTK(P9_DEBUG_9P, "<<< RREADLINK target %s\n", *target); +error: + p9_free_req(clnt, req); + return err; +} +EXPORT_SYMBOL(p9_client_readlink); diff --git a/net/9p/error.c b/net/9p/error.c new file mode 100644 index 00000000..52518512 --- /dev/null +++ b/net/9p/error.c @@ -0,0 +1,247 @@ +/* + * linux/fs/9p/error.c + * + * Error string handling + * + * Plan 9 uses error strings, Unix uses error numbers. These functions + * try to help manage that and provide for dynamically adding error + * mappings. + * + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include + +/** + * struct errormap - map string errors from Plan 9 to Linux numeric ids + * @name: string sent over 9P + * @val: numeric id most closely representing @name + * @namelen: length of string + * @list: hash-table list for string lookup + */ +struct errormap { + char *name; + int val; + + int namelen; + struct hlist_node list; +}; + +#define ERRHASHSZ 32 +static struct hlist_head hash_errmap[ERRHASHSZ]; + +/* FixMe - reduce to a reasonable size */ +static struct errormap errmap[] = { + {"Operation not permitted", EPERM}, + {"wstat prohibited", EPERM}, + {"No such file or directory", ENOENT}, + {"directory entry not found", ENOENT}, + {"file not found", ENOENT}, + {"Interrupted system call", EINTR}, + {"Input/output error", EIO}, + {"No such device or address", ENXIO}, + {"Argument list too long", E2BIG}, + {"Bad file descriptor", EBADF}, + {"Resource temporarily unavailable", EAGAIN}, + {"Cannot allocate memory", ENOMEM}, + {"Permission denied", EACCES}, + {"Bad address", EFAULT}, + {"Block device required", ENOTBLK}, + {"Device or resource busy", EBUSY}, + {"File exists", EEXIST}, + {"Invalid cross-device link", EXDEV}, + {"No such device", ENODEV}, + {"Not a directory", ENOTDIR}, + {"Is a directory", EISDIR}, + {"Invalid argument", EINVAL}, + {"Too many open files in system", ENFILE}, + {"Too many open files", EMFILE}, + {"Text file busy", ETXTBSY}, + {"File too large", EFBIG}, + {"No space left on device", ENOSPC}, + {"Illegal seek", ESPIPE}, + {"Read-only file system", EROFS}, + {"Too many links", EMLINK}, + 
{"Broken pipe", EPIPE}, + {"Numerical argument out of domain", EDOM}, + {"Numerical result out of range", ERANGE}, + {"Resource deadlock avoided", EDEADLK}, + {"File name too long", ENAMETOOLONG}, + {"No locks available", ENOLCK}, + {"Function not implemented", ENOSYS}, + {"Directory not empty", ENOTEMPTY}, + {"Too many levels of symbolic links", ELOOP}, + {"No message of desired type", ENOMSG}, + {"Identifier removed", EIDRM}, + {"No data available", ENODATA}, + {"Machine is not on the network", ENONET}, + {"Package not installed", ENOPKG}, + {"Object is remote", EREMOTE}, + {"Link has been severed", ENOLINK}, + {"Communication error on send", ECOMM}, + {"Protocol error", EPROTO}, + {"Bad message", EBADMSG}, + {"File descriptor in bad state", EBADFD}, + {"Streams pipe error", ESTRPIPE}, + {"Too many users", EUSERS}, + {"Socket operation on non-socket", ENOTSOCK}, + {"Message too long", EMSGSIZE}, + {"Protocol not available", ENOPROTOOPT}, + {"Protocol not supported", EPROTONOSUPPORT}, + {"Socket type not supported", ESOCKTNOSUPPORT}, + {"Operation not supported", EOPNOTSUPP}, + {"Protocol family not supported", EPFNOSUPPORT}, + {"Network is down", ENETDOWN}, + {"Network is unreachable", ENETUNREACH}, + {"Network dropped connection on reset", ENETRESET}, + {"Software caused connection abort", ECONNABORTED}, + {"Connection reset by peer", ECONNRESET}, + {"No buffer space available", ENOBUFS}, + {"Transport endpoint is already connected", EISCONN}, + {"Transport endpoint is not connected", ENOTCONN}, + {"Cannot send after transport endpoint shutdown", ESHUTDOWN}, + {"Connection timed out", ETIMEDOUT}, + {"Connection refused", ECONNREFUSED}, + {"Host is down", EHOSTDOWN}, + {"No route to host", EHOSTUNREACH}, + {"Operation already in progress", EALREADY}, + {"Operation now in progress", EINPROGRESS}, + {"Is a named type file", EISNAM}, + {"Remote I/O error", EREMOTEIO}, + {"Disk quota exceeded", EDQUOT}, +/* errors from fossil, vacfs, and u9fs */ + {"fid unknown or out of range", EBADF}, + {"permission denied", EACCES}, + {"file does not exist", ENOENT}, + {"authentication failed", ECONNREFUSED}, + {"bad offset in directory read", ESPIPE}, + {"bad use of fid", EBADF}, + {"wstat can't convert between files and directories", EPERM}, + {"directory is not empty", ENOTEMPTY}, + {"file exists", EEXIST}, + {"file already exists", EEXIST}, + {"file or directory already exists", EEXIST}, + {"fid already in use", EBADF}, + {"file in use", ETXTBSY}, + {"i/o error", EIO}, + {"file already open for I/O", ETXTBSY}, + {"illegal mode", EINVAL}, + {"illegal name", ENAMETOOLONG}, + {"not a directory", ENOTDIR}, + {"not a member of proposed group", EPERM}, + {"not owner", EACCES}, + {"only owner can change group in wstat", EACCES}, + {"read only file system", EROFS}, + {"no access to special file", EPERM}, + {"i/o count too large", EIO}, + {"unknown group", EINVAL}, + {"unknown user", EINVAL}, + {"bogus wstat buffer", EPROTO}, + {"exclusive use file already open", EAGAIN}, + {"corrupted directory entry", EIO}, + {"corrupted file entry", EIO}, + {"corrupted block label", EIO}, + {"corrupted meta data", EIO}, + {"illegal offset", EINVAL}, + {"illegal path element", ENOENT}, + {"root of file system is corrupted", EIO}, + {"corrupted super block", EIO}, + {"protocol botch", EPROTO}, + {"file system is full", ENOSPC}, + {"file is in use", EAGAIN}, + {"directory entry is not allocated", ENOENT}, + {"file is read only", EROFS}, + {"file has been removed", EIDRM}, + {"only support truncation to zero length", EPERM}, + 
{"cannot remove root", EPERM}, + {"file too big", EFBIG}, + {"venti i/o error", EIO}, + /* these are not errors */ + {"u9fs rhostsauth: no authentication required", 0}, + {"u9fs authnone: no authentication required", 0}, + {NULL, -1} +}; + +/** + * p9_error_init - preload mappings into hash list + * + */ + +int p9_error_init(void) +{ + struct errormap *c; + int bucket; + + /* initialize hash table */ + for (bucket = 0; bucket < ERRHASHSZ; bucket++) + INIT_HLIST_HEAD(&hash_errmap[bucket]); + + /* load initial error map into hash table */ + for (c = errmap; c->name != NULL; c++) { + c->namelen = strlen(c->name); + bucket = jhash(c->name, c->namelen, 0) % ERRHASHSZ; + INIT_HLIST_NODE(&c->list); + hlist_add_head(&c->list, &hash_errmap[bucket]); + } + + return 1; +} +EXPORT_SYMBOL(p9_error_init); + +/** + * errstr2errno - convert error string to error number + * @errstr: error string + * @len: length of error string + * + */ + +int p9_errstr2errno(char *errstr, int len) +{ + int errno; + struct hlist_node *p; + struct errormap *c; + int bucket; + + errno = 0; + p = NULL; + c = NULL; + bucket = jhash(errstr, len, 0) % ERRHASHSZ; + hlist_for_each_entry(c, p, &hash_errmap[bucket], list) { + if (c->namelen == len && !memcmp(c->name, errstr, len)) { + errno = c->val; + break; + } + } + + if (errno == 0) { + /* TODO: if error isn't found, add it dynamically */ + errstr[len] = 0; + printk(KERN_ERR "%s: server reported unknown error %s\n", + __func__, errstr); + errno = ESERVERFAULT; + } + + return -errno; +} +EXPORT_SYMBOL(p9_errstr2errno); diff --git a/net/9p/mod.c b/net/9p/mod.c new file mode 100644 index 00000000..72c39827 --- /dev/null +++ b/net/9p/mod.c @@ -0,0 +1,174 @@ +/* + * net/9p/9p.c + * + * 9P entry point + * + * Copyright (C) 2007 by Latchesar Ionkov + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_NET_9P_DEBUG +unsigned int p9_debug_level = 0; /* feature-rific global debug level */ +EXPORT_SYMBOL(p9_debug_level); +module_param_named(debug, p9_debug_level, uint, 0); +MODULE_PARM_DESC(debug, "9P debugging level"); +#endif + +/* + * Dynamic Transport Registration Routines + * + */ + +static DEFINE_SPINLOCK(v9fs_trans_lock); +static LIST_HEAD(v9fs_trans_list); + +/** + * v9fs_register_trans - register a new transport with 9p + * @m: structure describing the transport module and entry points + * + */ +void v9fs_register_trans(struct p9_trans_module *m) +{ + spin_lock(&v9fs_trans_lock); + list_add_tail(&m->list, &v9fs_trans_list); + spin_unlock(&v9fs_trans_lock); +} +EXPORT_SYMBOL(v9fs_register_trans); + +/** + * v9fs_unregister_trans - unregister a 9p transport + * @m: the transport to remove + * + */ +void v9fs_unregister_trans(struct p9_trans_module *m) +{ + spin_lock(&v9fs_trans_lock); + list_del_init(&m->list); + spin_unlock(&v9fs_trans_lock); +} +EXPORT_SYMBOL(v9fs_unregister_trans); + +/** + * v9fs_get_trans_by_name - get transport with the matching name + * @name: string identifying transport + * + */ +struct p9_trans_module *v9fs_get_trans_by_name(const substring_t *name) +{ + struct p9_trans_module *t, *found = NULL; + + spin_lock(&v9fs_trans_lock); + + list_for_each_entry(t, &v9fs_trans_list, list) + if (strncmp(t->name, name->from, name->to-name->from) == 0 && + try_module_get(t->owner)) { + found = t; + break; + } + + spin_unlock(&v9fs_trans_lock); + return found; +} +EXPORT_SYMBOL(v9fs_get_trans_by_name); + +/** + * v9fs_get_default_trans - get the default transport + * + */ + +struct p9_trans_module *v9fs_get_default_trans(void) +{ + struct p9_trans_module *t, *found = NULL; + + spin_lock(&v9fs_trans_lock); + + list_for_each_entry(t, &v9fs_trans_list, list) + if (t->def && try_module_get(t->owner)) { + found = t; + break; + } + + if (!found) + list_for_each_entry(t, &v9fs_trans_list, list) + if (try_module_get(t->owner)) { + found = t; + break; + } + + spin_unlock(&v9fs_trans_lock); + return found; +} +EXPORT_SYMBOL(v9fs_get_default_trans); + +/** + * v9fs_put_trans - put trans + * @m: transport to put + * + */ +void v9fs_put_trans(struct p9_trans_module *m) +{ + if (m) + module_put(m->owner); +} + +/** + * init_p9 - Initialize module + * + */ +static int __init init_p9(void) +{ + int ret = 0; + + p9_error_init(); + printk(KERN_INFO "Installing 9P2000 support\n"); + p9_trans_fd_init(); + + return ret; +} + +/** + * exit_p9 - shutdown module + * + */ + +static void __exit exit_p9(void) +{ + printk(KERN_INFO "Unloading 9P2000 support\n"); + + p9_trans_fd_exit(); +} + +module_init(init_p9) +module_exit(exit_p9) + +MODULE_AUTHOR("Latchesar Ionkov "); +MODULE_AUTHOR("Eric Van Hensbergen "); +MODULE_AUTHOR("Ron Minnich "); +MODULE_LICENSE("GPL"); diff --git a/net/9p/protocol.c b/net/9p/protocol.c new file mode 100644 index 00000000..a873277c --- /dev/null +++ b/net/9p/protocol.c @@ -0,0 +1,682 @@ +/* + * net/9p/protocol.c + * + * 9P Protocol Support Code + * + * Copyright (C) 2008 by Eric Van Hensbergen + * + * Base on code from Anthony Liguori + * Copyright (C) 2008 by IBM, Corp. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "protocol.h" + +static int +p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...); + +#ifdef CONFIG_NET_9P_DEBUG +void +p9pdu_dump(int way, struct p9_fcall *pdu) +{ + int i, n; + u8 *data = pdu->sdata; + int datalen = pdu->size; + char buf[255]; + int buflen = 255; + + i = n = 0; + if (datalen > (buflen-16)) + datalen = buflen-16; + while (i < datalen) { + n += scnprintf(buf + n, buflen - n, "%02x ", data[i]); + if (i%4 == 3) + n += scnprintf(buf + n, buflen - n, " "); + if (i%32 == 31) + n += scnprintf(buf + n, buflen - n, "\n"); + + i++; + } + n += scnprintf(buf + n, buflen - n, "\n"); + + if (way) + P9_DPRINTK(P9_DEBUG_PKT, "[[[(%d) %s\n", datalen, buf); + else + P9_DPRINTK(P9_DEBUG_PKT, "]]](%d) %s\n", datalen, buf); +} +#else +void +p9pdu_dump(int way, struct p9_fcall *pdu) +{ +} +#endif +EXPORT_SYMBOL(p9pdu_dump); + +void p9stat_free(struct p9_wstat *stbuf) +{ + kfree(stbuf->name); + kfree(stbuf->uid); + kfree(stbuf->gid); + kfree(stbuf->muid); + kfree(stbuf->extension); +} +EXPORT_SYMBOL(p9stat_free); + +static size_t pdu_read(struct p9_fcall *pdu, void *data, size_t size) +{ + size_t len = min(pdu->size - pdu->offset, size); + memcpy(data, &pdu->sdata[pdu->offset], len); + pdu->offset += len; + return size - len; +} + +static size_t pdu_write(struct p9_fcall *pdu, const void *data, size_t size) +{ + size_t len = min(pdu->capacity - pdu->size, size); + memcpy(&pdu->sdata[pdu->size], data, len); + pdu->size += len; + return size - len; +} + +static size_t +pdu_write_u(struct p9_fcall *pdu, const char __user *udata, size_t size) +{ + size_t len = min(pdu->capacity - pdu->size, size); + if (copy_from_user(&pdu->sdata[pdu->size], udata, len)) + len = 0; + + pdu->size += len; + return size - len; +} + +static size_t +pdu_write_urw(struct p9_fcall *pdu, const char *kdata, const char __user *udata, + size_t size) +{ + BUG_ON(pdu->size > P9_IOHDRSZ); + pdu->pubuf = (char __user *)udata; + pdu->pkbuf = (char *)kdata; + pdu->pbuf_size = size; + return 0; +} + +static size_t +pdu_write_readdir(struct p9_fcall *pdu, const char *kdata, size_t size) +{ + BUG_ON(pdu->size > P9_READDIRHDRSZ); + pdu->pkbuf = (char *)kdata; + pdu->pbuf_size = size; + return 0; +} + +/* + b - int8_t + w - int16_t + d - int32_t + q - int64_t + s - string + S - stat + Q - qid + D - data blob (int32_t size followed by void *, results are not freed) + T - array of strings (int16_t count, followed by strings) + R - array of qids (int16_t count, followed by qids) + A - stat for 9p2000.L (p9_stat_dotl) + ? 
- if optional = 1, continue parsing +*/ + +static int +p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, + va_list ap) +{ + const char *ptr; + int errcode = 0; + + for (ptr = fmt; *ptr; ptr++) { + switch (*ptr) { + case 'b':{ + int8_t *val = va_arg(ap, int8_t *); + if (pdu_read(pdu, val, sizeof(*val))) { + errcode = -EFAULT; + break; + } + } + break; + case 'w':{ + int16_t *val = va_arg(ap, int16_t *); + __le16 le_val; + if (pdu_read(pdu, &le_val, sizeof(le_val))) { + errcode = -EFAULT; + break; + } + *val = le16_to_cpu(le_val); + } + break; + case 'd':{ + int32_t *val = va_arg(ap, int32_t *); + __le32 le_val; + if (pdu_read(pdu, &le_val, sizeof(le_val))) { + errcode = -EFAULT; + break; + } + *val = le32_to_cpu(le_val); + } + break; + case 'q':{ + int64_t *val = va_arg(ap, int64_t *); + __le64 le_val; + if (pdu_read(pdu, &le_val, sizeof(le_val))) { + errcode = -EFAULT; + break; + } + *val = le64_to_cpu(le_val); + } + break; + case 's':{ + char **sptr = va_arg(ap, char **); + uint16_t len; + + errcode = p9pdu_readf(pdu, proto_version, + "w", &len); + if (errcode) + break; + + *sptr = kmalloc(len + 1, GFP_NOFS); + if (*sptr == NULL) { + errcode = -EFAULT; + break; + } + if (pdu_read(pdu, *sptr, len)) { + errcode = -EFAULT; + kfree(*sptr); + *sptr = NULL; + } else + (*sptr)[len] = 0; + } + break; + case 'Q':{ + struct p9_qid *qid = + va_arg(ap, struct p9_qid *); + + errcode = p9pdu_readf(pdu, proto_version, "bdq", + &qid->type, &qid->version, + &qid->path); + } + break; + case 'S':{ + struct p9_wstat *stbuf = + va_arg(ap, struct p9_wstat *); + + memset(stbuf, 0, sizeof(struct p9_wstat)); + stbuf->n_uid = stbuf->n_gid = stbuf->n_muid = + -1; + errcode = + p9pdu_readf(pdu, proto_version, + "wwdQdddqssss?sddd", + &stbuf->size, &stbuf->type, + &stbuf->dev, &stbuf->qid, + &stbuf->mode, &stbuf->atime, + &stbuf->mtime, &stbuf->length, + &stbuf->name, &stbuf->uid, + &stbuf->gid, &stbuf->muid, + &stbuf->extension, + &stbuf->n_uid, &stbuf->n_gid, + &stbuf->n_muid); + if (errcode) + p9stat_free(stbuf); + } + break; + case 'D':{ + uint32_t *count = va_arg(ap, uint32_t *); + void **data = va_arg(ap, void **); + + errcode = + p9pdu_readf(pdu, proto_version, "d", count); + if (!errcode) { + *count = + min_t(uint32_t, *count, + pdu->size - pdu->offset); + *data = &pdu->sdata[pdu->offset]; + } + } + break; + case 'T':{ + uint16_t *nwname = va_arg(ap, uint16_t *); + char ***wnames = va_arg(ap, char ***); + + errcode = p9pdu_readf(pdu, proto_version, + "w", nwname); + if (!errcode) { + *wnames = + kmalloc(sizeof(char *) * *nwname, + GFP_NOFS); + if (!*wnames) + errcode = -ENOMEM; + } + + if (!errcode) { + int i; + + for (i = 0; i < *nwname; i++) { + errcode = + p9pdu_readf(pdu, + proto_version, + "s", + &(*wnames)[i]); + if (errcode) + break; + } + } + + if (errcode) { + if (*wnames) { + int i; + + for (i = 0; i < *nwname; i++) + kfree((*wnames)[i]); + } + kfree(*wnames); + *wnames = NULL; + } + } + break; + case 'R':{ + int16_t *nwqid = va_arg(ap, int16_t *); + struct p9_qid **wqids = + va_arg(ap, struct p9_qid **); + + *wqids = NULL; + + errcode = + p9pdu_readf(pdu, proto_version, "w", nwqid); + if (!errcode) { + *wqids = + kmalloc(*nwqid * + sizeof(struct p9_qid), + GFP_NOFS); + if (*wqids == NULL) + errcode = -ENOMEM; + } + + if (!errcode) { + int i; + + for (i = 0; i < *nwqid; i++) { + errcode = + p9pdu_readf(pdu, + proto_version, + "Q", + &(*wqids)[i]); + if (errcode) + break; + } + } + + if (errcode) { + kfree(*wqids); + *wqids = NULL; + } + } + break; + case 'A': { + struct 
p9_stat_dotl *stbuf = + va_arg(ap, struct p9_stat_dotl *); + + memset(stbuf, 0, sizeof(struct p9_stat_dotl)); + errcode = + p9pdu_readf(pdu, proto_version, + "qQdddqqqqqqqqqqqqqqq", + &stbuf->st_result_mask, + &stbuf->qid, + &stbuf->st_mode, + &stbuf->st_uid, &stbuf->st_gid, + &stbuf->st_nlink, + &stbuf->st_rdev, &stbuf->st_size, + &stbuf->st_blksize, &stbuf->st_blocks, + &stbuf->st_atime_sec, + &stbuf->st_atime_nsec, + &stbuf->st_mtime_sec, + &stbuf->st_mtime_nsec, + &stbuf->st_ctime_sec, + &stbuf->st_ctime_nsec, + &stbuf->st_btime_sec, + &stbuf->st_btime_nsec, + &stbuf->st_gen, + &stbuf->st_data_version); + } + break; + case '?': + if ((proto_version != p9_proto_2000u) && + (proto_version != p9_proto_2000L)) + return 0; + break; + default: + BUG(); + break; + } + + if (errcode) + break; + } + + return errcode; +} + +int +p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt, + va_list ap) +{ + const char *ptr; + int errcode = 0; + + for (ptr = fmt; *ptr; ptr++) { + switch (*ptr) { + case 'b':{ + int8_t val = va_arg(ap, int); + if (pdu_write(pdu, &val, sizeof(val))) + errcode = -EFAULT; + } + break; + case 'w':{ + __le16 val = cpu_to_le16(va_arg(ap, int)); + if (pdu_write(pdu, &val, sizeof(val))) + errcode = -EFAULT; + } + break; + case 'd':{ + __le32 val = cpu_to_le32(va_arg(ap, int32_t)); + if (pdu_write(pdu, &val, sizeof(val))) + errcode = -EFAULT; + } + break; + case 'q':{ + __le64 val = cpu_to_le64(va_arg(ap, int64_t)); + if (pdu_write(pdu, &val, sizeof(val))) + errcode = -EFAULT; + } + break; + case 's':{ + const char *sptr = va_arg(ap, const char *); + uint16_t len = 0; + if (sptr) + len = min_t(uint16_t, strlen(sptr), + USHRT_MAX); + + errcode = p9pdu_writef(pdu, proto_version, + "w", len); + if (!errcode && pdu_write(pdu, sptr, len)) + errcode = -EFAULT; + } + break; + case 'Q':{ + const struct p9_qid *qid = + va_arg(ap, const struct p9_qid *); + errcode = + p9pdu_writef(pdu, proto_version, "bdq", + qid->type, qid->version, + qid->path); + } break; + case 'S':{ + const struct p9_wstat *stbuf = + va_arg(ap, const struct p9_wstat *); + errcode = + p9pdu_writef(pdu, proto_version, + "wwdQdddqssss?sddd", + stbuf->size, stbuf->type, + stbuf->dev, &stbuf->qid, + stbuf->mode, stbuf->atime, + stbuf->mtime, stbuf->length, + stbuf->name, stbuf->uid, + stbuf->gid, stbuf->muid, + stbuf->extension, stbuf->n_uid, + stbuf->n_gid, stbuf->n_muid); + } break; + case 'D':{ + uint32_t count = va_arg(ap, uint32_t); + const void *data = va_arg(ap, const void *); + + errcode = p9pdu_writef(pdu, proto_version, "d", + count); + if (!errcode && pdu_write(pdu, data, count)) + errcode = -EFAULT; + } + break; + case 'E':{ + int32_t cnt = va_arg(ap, int32_t); + const char *k = va_arg(ap, const void *); + const char __user *u = va_arg(ap, + const void __user *); + errcode = p9pdu_writef(pdu, proto_version, "d", + cnt); + if (!errcode && pdu_write_urw(pdu, k, u, cnt)) + errcode = -EFAULT; + } + break; + case 'F':{ + int32_t cnt = va_arg(ap, int32_t); + const char *k = va_arg(ap, const void *); + errcode = p9pdu_writef(pdu, proto_version, "d", + cnt); + if (!errcode && pdu_write_readdir(pdu, k, cnt)) + errcode = -EFAULT; + } + break; + case 'U':{ + int32_t count = va_arg(ap, int32_t); + const char __user *udata = + va_arg(ap, const void __user *); + errcode = p9pdu_writef(pdu, proto_version, "d", + count); + if (!errcode && pdu_write_u(pdu, udata, count)) + errcode = -EFAULT; + } + break; + case 'T':{ + uint16_t nwname = va_arg(ap, int); + const char **wnames = va_arg(ap, const char **); + + 
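+ /* Twalk path elements: emit the 16-bit name count first, then each
+ * element as an 's' string, mirroring the 'T' case on the read side */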
errcode = p9pdu_writef(pdu, proto_version, "w", + nwname); + if (!errcode) { + int i; + + for (i = 0; i < nwname; i++) { + errcode = + p9pdu_writef(pdu, + proto_version, + "s", + wnames[i]); + if (errcode) + break; + } + } + } + break; + case 'R':{ + int16_t nwqid = va_arg(ap, int); + struct p9_qid *wqids = + va_arg(ap, struct p9_qid *); + + errcode = p9pdu_writef(pdu, proto_version, "w", + nwqid); + if (!errcode) { + int i; + + for (i = 0; i < nwqid; i++) { + errcode = + p9pdu_writef(pdu, + proto_version, + "Q", + &wqids[i]); + if (errcode) + break; + } + } + } + break; + case 'I':{ + struct p9_iattr_dotl *p9attr = va_arg(ap, + struct p9_iattr_dotl *); + + errcode = p9pdu_writef(pdu, proto_version, + "ddddqqqqq", + p9attr->valid, + p9attr->mode, + p9attr->uid, + p9attr->gid, + p9attr->size, + p9attr->atime_sec, + p9attr->atime_nsec, + p9attr->mtime_sec, + p9attr->mtime_nsec); + } + break; + case '?': + if ((proto_version != p9_proto_2000u) && + (proto_version != p9_proto_2000L)) + return 0; + break; + default: + BUG(); + break; + } + + if (errcode) + break; + } + + return errcode; +} + +int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = p9pdu_vreadf(pdu, proto_version, fmt, ap); + va_end(ap); + + return ret; +} + +static int +p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = p9pdu_vwritef(pdu, proto_version, fmt, ap); + va_end(ap); + + return ret; +} + +int p9stat_read(char *buf, int len, struct p9_wstat *st, int proto_version) +{ + struct p9_fcall fake_pdu; + int ret; + + fake_pdu.size = len; + fake_pdu.capacity = len; + fake_pdu.sdata = buf; + fake_pdu.offset = 0; + + ret = p9pdu_readf(&fake_pdu, proto_version, "S", st); + if (ret) { + P9_DPRINTK(P9_DEBUG_9P, "<<< p9stat_read failed: %d\n", ret); + p9pdu_dump(1, &fake_pdu); + } + + return ret; +} +EXPORT_SYMBOL(p9stat_read); + +int p9pdu_prepare(struct p9_fcall *pdu, int16_t tag, int8_t type) +{ + pdu->id = type; + return p9pdu_writef(pdu, 0, "dbw", 0, type, tag); +} + +int p9pdu_finalize(struct p9_fcall *pdu) +{ + int size = pdu->size; + int err; + + pdu->size = 0; + err = p9pdu_writef(pdu, 0, "d", size); + pdu->size = size; + +#ifdef CONFIG_NET_9P_DEBUG + if ((p9_debug_level & P9_DEBUG_PKT) == P9_DEBUG_PKT) + p9pdu_dump(0, pdu); +#endif + + P9_DPRINTK(P9_DEBUG_9P, ">>> size=%d type: %d tag: %d\n", pdu->size, + pdu->id, pdu->tag); + + return err; +} + +void p9pdu_reset(struct p9_fcall *pdu) +{ + pdu->offset = 0; + pdu->size = 0; + pdu->private = NULL; + pdu->pubuf = NULL; + pdu->pkbuf = NULL; + pdu->pbuf_size = 0; +} + +int p9dirent_read(char *buf, int len, struct p9_dirent *dirent, + int proto_version) +{ + struct p9_fcall fake_pdu; + int ret; + char *nameptr; + + fake_pdu.size = len; + fake_pdu.capacity = len; + fake_pdu.sdata = buf; + fake_pdu.offset = 0; + + ret = p9pdu_readf(&fake_pdu, proto_version, "Qqbs", &dirent->qid, + &dirent->d_off, &dirent->d_type, &nameptr); + if (ret) { + P9_DPRINTK(P9_DEBUG_9P, "<<< p9dirent_read failed: %d\n", ret); + p9pdu_dump(1, &fake_pdu); + goto out; + } + + strcpy(dirent->d_name, nameptr); + kfree(nameptr); + +out: + return fake_pdu.offset; +} +EXPORT_SYMBOL(p9dirent_read); diff --git a/net/9p/protocol.h b/net/9p/protocol.h new file mode 100644 index 00000000..2431c0f3 --- /dev/null +++ b/net/9p/protocol.h @@ -0,0 +1,34 @@ +/* + * net/9p/protocol.h + * + * 9P Protocol Support Code + * + * Copyright (C) 2008 by Eric Van 
Hensbergen + * + * Base on code from Anthony Liguori + * Copyright (C) 2008 by IBM, Corp. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +int p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt, + va_list ap); +int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...); +int p9pdu_prepare(struct p9_fcall *pdu, int16_t tag, int8_t type); +int p9pdu_finalize(struct p9_fcall *pdu); +void p9pdu_dump(int, struct p9_fcall *); +void p9pdu_reset(struct p9_fcall *pdu); diff --git a/net/9p/trans_common.c b/net/9p/trans_common.c new file mode 100644 index 00000000..9a70ebde --- /dev/null +++ b/net/9p/trans_common.c @@ -0,0 +1,92 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Venkateswararao Jujjuri + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#include +#include +#include +#include +#include +#include "trans_common.h" + +/** + * p9_release_req_pages - Release pages after the transaction. + * @*private: PDU's private page of struct trans_rpage_info + */ +void +p9_release_req_pages(struct trans_rpage_info *rpinfo) +{ + int i = 0; + + while (rpinfo->rp_data[i] && rpinfo->rp_nr_pages--) { + put_page(rpinfo->rp_data[i]); + i++; + } +} +EXPORT_SYMBOL(p9_release_req_pages); + +/** + * p9_nr_pages - Return number of pages needed to accommodate the payload. + */ +int +p9_nr_pages(struct p9_req_t *req) +{ + unsigned long start_page, end_page; + start_page = (unsigned long)req->tc->pubuf >> PAGE_SHIFT; + end_page = ((unsigned long)req->tc->pubuf + req->tc->pbuf_size + + PAGE_SIZE - 1) >> PAGE_SHIFT; + return end_page - start_page; +} +EXPORT_SYMBOL(p9_nr_pages); + +/** + * payload_gup - Translates user buffer into kernel pages and + * pins them either for read/write through get_user_pages_fast(). + * @req: Request to be sent to server. + * @pdata_off: data offset into the first page after translation (gup). + * @pdata_len: Total length of the IO. gup may not return requested # of pages. + * @nr_pages: number of pages to accommodate the payload + * @rw: Indicates if the pages are for read or write. 
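+ *
+ * For example (illustrative numbers, assuming 4 KB pages): a user buffer
+ * starting 256 bytes into a page with a pbuf_size of 8192 spans three
+ * pages, so *pdata_off is 256 and, if all three pages are pinned,
+ * *pdata_len becomes 3840 + min(4352, 2 * 4096) = 8192.  If
+ * get_user_pages_fast() pins fewer pages, *pdata_len is capped at what
+ * was actually pinned.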
+ */ +int +p9_payload_gup(struct p9_req_t *req, size_t *pdata_off, int *pdata_len, + int nr_pages, u8 rw) +{ + uint32_t first_page_bytes = 0; + int32_t pdata_mapped_pages; + struct trans_rpage_info *rpinfo; + + *pdata_off = (__force size_t)req->tc->pubuf & (PAGE_SIZE-1); + + if (*pdata_off) + first_page_bytes = min(((size_t)PAGE_SIZE - *pdata_off), + req->tc->pbuf_size); + + rpinfo = req->tc->private; + pdata_mapped_pages = get_user_pages_fast((unsigned long)req->tc->pubuf, + nr_pages, rw, &rpinfo->rp_data[0]); + if (pdata_mapped_pages <= 0) + return pdata_mapped_pages; + + rpinfo->rp_nr_pages = pdata_mapped_pages; + if (*pdata_off) { + *pdata_len = first_page_bytes; + *pdata_len += min((req->tc->pbuf_size - *pdata_len), + ((size_t)pdata_mapped_pages - 1) << PAGE_SHIFT); + } else { + *pdata_len = min(req->tc->pbuf_size, + (size_t)pdata_mapped_pages << PAGE_SHIFT); + } + return 0; +} +EXPORT_SYMBOL(p9_payload_gup); diff --git a/net/9p/trans_common.h b/net/9p/trans_common.h new file mode 100644 index 00000000..76309223 --- /dev/null +++ b/net/9p/trans_common.h @@ -0,0 +1,32 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Venkateswararao Jujjuri + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +/* TRUE if it is user context */ +#define P9_IS_USER_CONTEXT (!segment_eq(get_fs(), KERNEL_DS)) + +/** + * struct trans_rpage_info - To store mapped page information in PDU. + * @rp_alloc:Set if this structure is allocd, not a reuse unused space in pdu. + * @rp_nr_pages: Number of mapped pages + * @rp_data: Array of page pointers + */ +struct trans_rpage_info { + u8 rp_alloc; + int rp_nr_pages; + struct page *rp_data[0]; +}; + +void p9_release_req_pages(struct trans_rpage_info *); +int p9_payload_gup(struct p9_req_t *, size_t *, int *, int, u8); +int p9_nr_pages(struct p9_req_t *); diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c new file mode 100644 index 00000000..fdfdb574 --- /dev/null +++ b/net/9p/trans_fd.c @@ -0,0 +1,1087 @@ +/* + * linux/fs/9p/trans_fd.c + * + * Fd transport layer. Includes deprecated socket layer. + * + * Copyright (C) 2006 by Russ Cox + * Copyright (C) 2004-2005 by Latchesar Ionkov + * Copyright (C) 2004-2008 by Eric Van Hensbergen + * Copyright (C) 1997-2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* killme */ + +#define P9_PORT 564 +#define MAX_SOCK_BUF (64*1024) +#define MAXPOLLWADDR 2 + +/** + * struct p9_fd_opts - per-transport options + * @rfd: file descriptor for reading (trans=fd) + * @wfd: file descriptor for writing (trans=fd) + * @port: port to connect to (trans=tcp) + * + */ + +struct p9_fd_opts { + int rfd; + int wfd; + u16 port; +}; + +/** + * struct p9_trans_fd - transport state + * @rd: reference to file to read from + * @wr: reference of file to write to + * @conn: connection state reference + * + */ + +struct p9_trans_fd { + struct file *rd; + struct file *wr; + struct p9_conn *conn; +}; + +/* + * Option Parsing (code inspired by NFS code) + * - a little lazy - parse all fd-transport options + */ + +enum { + /* Options that take integer arguments */ + Opt_port, Opt_rfdno, Opt_wfdno, Opt_err, +}; + +static const match_table_t tokens = { + {Opt_port, "port=%u"}, + {Opt_rfdno, "rfdno=%u"}, + {Opt_wfdno, "wfdno=%u"}, + {Opt_err, NULL}, +}; + +enum { + Rworksched = 1, /* read work scheduled or running */ + Rpending = 2, /* can read */ + Wworksched = 4, /* write work scheduled or running */ + Wpending = 8, /* can write */ +}; + +struct p9_poll_wait { + struct p9_conn *conn; + wait_queue_t wait; + wait_queue_head_t *wait_addr; +}; + +/** + * struct p9_conn - fd mux connection state information + * @mux_list: list link for mux to manage multiple connections (?) + * @client: reference to client instance for this connection + * @err: error state + * @req_list: accounting for requests which have been sent + * @unsent_req_list: accounting for requests that haven't been sent + * @req: current request being processed (if any) + * @tmp_buf: temporary buffer to read in header + * @rsize: amount to read for current frame + * @rpos: read position in current frame + * @rbuf: current read buffer + * @wpos: write position for current frame + * @wsize: amount of data to write for current frame + * @wbuf: current write buffer + * @poll_pending_link: pending links to be polled per conn + * @poll_wait: array of wait_q's for various worker threads + * @pt: poll state + * @rq: current read work + * @wq: current write work + * @wsched: ???? 
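+ *          (bitmask of the Rworksched/Rpending/Wworksched/Wpending
+ *          scheduling flags defined above)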
+ * + */ + +struct p9_conn { + struct list_head mux_list; + struct p9_client *client; + int err; + struct list_head req_list; + struct list_head unsent_req_list; + struct p9_req_t *req; + char tmp_buf[7]; + int rsize; + int rpos; + char *rbuf; + int wpos; + int wsize; + char *wbuf; + struct list_head poll_pending_link; + struct p9_poll_wait poll_wait[MAXPOLLWADDR]; + poll_table pt; + struct work_struct rq; + struct work_struct wq; + unsigned long wsched; +}; + +static void p9_poll_workfn(struct work_struct *work); + +static DEFINE_SPINLOCK(p9_poll_lock); +static LIST_HEAD(p9_poll_pending_list); +static DECLARE_WORK(p9_poll_work, p9_poll_workfn); + +static void p9_mux_poll_stop(struct p9_conn *m) +{ + unsigned long flags; + int i; + + for (i = 0; i < ARRAY_SIZE(m->poll_wait); i++) { + struct p9_poll_wait *pwait = &m->poll_wait[i]; + + if (pwait->wait_addr) { + remove_wait_queue(pwait->wait_addr, &pwait->wait); + pwait->wait_addr = NULL; + } + } + + spin_lock_irqsave(&p9_poll_lock, flags); + list_del_init(&m->poll_pending_link); + spin_unlock_irqrestore(&p9_poll_lock, flags); +} + +/** + * p9_conn_cancel - cancel all pending requests with error + * @m: mux data + * @err: error code + * + */ + +static void p9_conn_cancel(struct p9_conn *m, int err) +{ + struct p9_req_t *req, *rtmp; + unsigned long flags; + LIST_HEAD(cancel_list); + + P9_DPRINTK(P9_DEBUG_ERROR, "mux %p err %d\n", m, err); + + spin_lock_irqsave(&m->client->lock, flags); + + if (m->err) { + spin_unlock_irqrestore(&m->client->lock, flags); + return; + } + + m->err = err; + + list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) { + req->status = REQ_STATUS_ERROR; + if (!req->t_err) + req->t_err = err; + list_move(&req->req_list, &cancel_list); + } + list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) { + req->status = REQ_STATUS_ERROR; + if (!req->t_err) + req->t_err = err; + list_move(&req->req_list, &cancel_list); + } + spin_unlock_irqrestore(&m->client->lock, flags); + + list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) { + P9_DPRINTK(P9_DEBUG_ERROR, "call back req %p\n", req); + list_del(&req->req_list); + p9_client_cb(m->client, req); + } +} + +static int +p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt) +{ + int ret, n; + struct p9_trans_fd *ts = NULL; + + if (client && client->status == Connected) + ts = client->trans; + + if (!ts) + return -EREMOTEIO; + + if (!ts->rd->f_op || !ts->rd->f_op->poll) + return -EIO; + + if (!ts->wr->f_op || !ts->wr->f_op->poll) + return -EIO; + + ret = ts->rd->f_op->poll(ts->rd, pt); + if (ret < 0) + return ret; + + if (ts->rd != ts->wr) { + n = ts->wr->f_op->poll(ts->wr, pt); + if (n < 0) + return n; + ret = (ret & ~POLLOUT) | (n & ~POLLIN); + } + + return ret; +} + +/** + * p9_fd_read- read from a fd + * @client: client instance + * @v: buffer to receive data into + * @len: size of receive buffer + * + */ + +static int p9_fd_read(struct p9_client *client, void *v, int len) +{ + int ret; + struct p9_trans_fd *ts = NULL; + + if (client && client->status != Disconnected) + ts = client->trans; + + if (!ts) + return -EREMOTEIO; + + if (!(ts->rd->f_flags & O_NONBLOCK)) + P9_DPRINTK(P9_DEBUG_ERROR, "blocking read ...\n"); + + ret = kernel_read(ts->rd, ts->rd->f_pos, v, len); + if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN) + client->status = Disconnected; + return ret; +} + +/** + * p9_read_work - called when there is some data to be read from a transport + * @work: container of work to be done + * + */ + +static void p9_read_work(struct 
work_struct *work) +{ + int n, err; + struct p9_conn *m; + + m = container_of(work, struct p9_conn, rq); + + if (m->err < 0) + return; + + P9_DPRINTK(P9_DEBUG_TRANS, "start mux %p pos %d\n", m, m->rpos); + + if (!m->rbuf) { + m->rbuf = m->tmp_buf; + m->rpos = 0; + m->rsize = 7; /* start by reading header */ + } + + clear_bit(Rpending, &m->wsched); + P9_DPRINTK(P9_DEBUG_TRANS, "read mux %p pos %d size: %d = %d\n", m, + m->rpos, m->rsize, m->rsize-m->rpos); + err = p9_fd_read(m->client, m->rbuf + m->rpos, + m->rsize - m->rpos); + P9_DPRINTK(P9_DEBUG_TRANS, "mux %p got %d bytes\n", m, err); + if (err == -EAGAIN) { + clear_bit(Rworksched, &m->wsched); + return; + } + + if (err <= 0) + goto error; + + m->rpos += err; + + if ((!m->req) && (m->rpos == m->rsize)) { /* header read in */ + u16 tag; + P9_DPRINTK(P9_DEBUG_TRANS, "got new header\n"); + + n = le32_to_cpu(*(__le32 *) m->rbuf); /* read packet size */ + if (n >= m->client->msize) { + P9_DPRINTK(P9_DEBUG_ERROR, + "requested packet size too big: %d\n", n); + err = -EIO; + goto error; + } + + tag = le16_to_cpu(*(__le16 *) (m->rbuf+5)); /* read tag */ + P9_DPRINTK(P9_DEBUG_TRANS, + "mux %p pkt: size: %d bytes tag: %d\n", m, n, tag); + + m->req = p9_tag_lookup(m->client, tag); + if (!m->req || (m->req->status != REQ_STATUS_SENT && + m->req->status != REQ_STATUS_FLSH)) { + P9_DPRINTK(P9_DEBUG_ERROR, "Unexpected packet tag %d\n", + tag); + err = -EIO; + goto error; + } + + if (m->req->rc == NULL) { + m->req->rc = kmalloc(sizeof(struct p9_fcall) + + m->client->msize, GFP_NOFS); + if (!m->req->rc) { + m->req = NULL; + err = -ENOMEM; + goto error; + } + } + m->rbuf = (char *)m->req->rc + sizeof(struct p9_fcall); + memcpy(m->rbuf, m->tmp_buf, m->rsize); + m->rsize = n; + } + + /* not an else because some packets (like clunk) have no payload */ + if ((m->req) && (m->rpos == m->rsize)) { /* packet is read in */ + P9_DPRINTK(P9_DEBUG_TRANS, "got new packet\n"); + spin_lock(&m->client->lock); + if (m->req->status != REQ_STATUS_ERROR) + m->req->status = REQ_STATUS_RCVD; + list_del(&m->req->req_list); + spin_unlock(&m->client->lock); + p9_client_cb(m->client, m->req); + m->rbuf = NULL; + m->rpos = 0; + m->rsize = 0; + m->req = NULL; + } + + if (!list_empty(&m->req_list)) { + if (test_and_clear_bit(Rpending, &m->wsched)) + n = POLLIN; + else + n = p9_fd_poll(m->client, NULL); + + if (n & POLLIN) { + P9_DPRINTK(P9_DEBUG_TRANS, "sched read work %p\n", m); + schedule_work(&m->rq); + } else + clear_bit(Rworksched, &m->wsched); + } else + clear_bit(Rworksched, &m->wsched); + + return; +error: + p9_conn_cancel(m, err); + clear_bit(Rworksched, &m->wsched); +} + +/** + * p9_fd_write - write to a socket + * @client: client instance + * @v: buffer to send data from + * @len: size of send buffer + * + */ + +static int p9_fd_write(struct p9_client *client, void *v, int len) +{ + int ret; + mm_segment_t oldfs; + struct p9_trans_fd *ts = NULL; + + if (client && client->status != Disconnected) + ts = client->trans; + + if (!ts) + return -EREMOTEIO; + + if (!(ts->wr->f_flags & O_NONBLOCK)) + P9_DPRINTK(P9_DEBUG_ERROR, "blocking write ...\n"); + + oldfs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + ret = vfs_write(ts->wr, (__force void __user *)v, len, &ts->wr->f_pos); + set_fs(oldfs); + + if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN) + client->status = Disconnected; + return ret; +} + +/** + * p9_write_work - called when a transport can send some data + * @work: container for work to be done + * + */ + +static 
void p9_write_work(struct work_struct *work) +{ + int n, err; + struct p9_conn *m; + struct p9_req_t *req; + + m = container_of(work, struct p9_conn, wq); + + if (m->err < 0) { + clear_bit(Wworksched, &m->wsched); + return; + } + + if (!m->wsize) { + if (list_empty(&m->unsent_req_list)) { + clear_bit(Wworksched, &m->wsched); + return; + } + + spin_lock(&m->client->lock); + req = list_entry(m->unsent_req_list.next, struct p9_req_t, + req_list); + req->status = REQ_STATUS_SENT; + P9_DPRINTK(P9_DEBUG_TRANS, "move req %p\n", req); + list_move_tail(&req->req_list, &m->req_list); + + m->wbuf = req->tc->sdata; + m->wsize = req->tc->size; + m->wpos = 0; + spin_unlock(&m->client->lock); + } + + P9_DPRINTK(P9_DEBUG_TRANS, "mux %p pos %d size %d\n", m, m->wpos, + m->wsize); + clear_bit(Wpending, &m->wsched); + err = p9_fd_write(m->client, m->wbuf + m->wpos, m->wsize - m->wpos); + P9_DPRINTK(P9_DEBUG_TRANS, "mux %p sent %d bytes\n", m, err); + if (err == -EAGAIN) { + clear_bit(Wworksched, &m->wsched); + return; + } + + if (err < 0) + goto error; + else if (err == 0) { + err = -EREMOTEIO; + goto error; + } + + m->wpos += err; + if (m->wpos == m->wsize) + m->wpos = m->wsize = 0; + + if (m->wsize == 0 && !list_empty(&m->unsent_req_list)) { + if (test_and_clear_bit(Wpending, &m->wsched)) + n = POLLOUT; + else + n = p9_fd_poll(m->client, NULL); + + if (n & POLLOUT) { + P9_DPRINTK(P9_DEBUG_TRANS, "sched write work %p\n", m); + schedule_work(&m->wq); + } else + clear_bit(Wworksched, &m->wsched); + } else + clear_bit(Wworksched, &m->wsched); + + return; + +error: + p9_conn_cancel(m, err); + clear_bit(Wworksched, &m->wsched); +} + +static int p9_pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct p9_poll_wait *pwait = + container_of(wait, struct p9_poll_wait, wait); + struct p9_conn *m = pwait->conn; + unsigned long flags; + + spin_lock_irqsave(&p9_poll_lock, flags); + if (list_empty(&m->poll_pending_link)) + list_add_tail(&m->poll_pending_link, &p9_poll_pending_list); + spin_unlock_irqrestore(&p9_poll_lock, flags); + + schedule_work(&p9_poll_work); + return 1; +} + +/** + * p9_pollwait - add poll task to the wait queue + * @filp: file pointer being polled + * @wait_address: wait_q to block on + * @p: poll state + * + * called by files poll operation to add v9fs-poll task to files wait queue + */ + +static void +p9_pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) +{ + struct p9_conn *m = container_of(p, struct p9_conn, pt); + struct p9_poll_wait *pwait = NULL; + int i; + + for (i = 0; i < ARRAY_SIZE(m->poll_wait); i++) { + if (m->poll_wait[i].wait_addr == NULL) { + pwait = &m->poll_wait[i]; + break; + } + } + + if (!pwait) { + P9_DPRINTK(P9_DEBUG_ERROR, "not enough wait_address slots\n"); + return; + } + + pwait->conn = m; + pwait->wait_addr = wait_address; + init_waitqueue_func_entry(&pwait->wait, p9_pollwake); + add_wait_queue(wait_address, &pwait->wait); +} + +/** + * p9_conn_create - allocate and initialize the per-session mux data + * @client: client instance + * + * Note: Creates the polling task if this is the first session. 
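+ * The returned mux is torn down again with p9_conn_destroy() when the
+ * transport is closed.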
+ */ + +static struct p9_conn *p9_conn_create(struct p9_client *client) +{ + int n; + struct p9_conn *m; + + P9_DPRINTK(P9_DEBUG_TRANS, "client %p msize %d\n", client, + client->msize); + m = kzalloc(sizeof(struct p9_conn), GFP_KERNEL); + if (!m) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&m->mux_list); + m->client = client; + + INIT_LIST_HEAD(&m->req_list); + INIT_LIST_HEAD(&m->unsent_req_list); + INIT_WORK(&m->rq, p9_read_work); + INIT_WORK(&m->wq, p9_write_work); + INIT_LIST_HEAD(&m->poll_pending_link); + init_poll_funcptr(&m->pt, p9_pollwait); + + n = p9_fd_poll(client, &m->pt); + if (n & POLLIN) { + P9_DPRINTK(P9_DEBUG_TRANS, "mux %p can read\n", m); + set_bit(Rpending, &m->wsched); + } + + if (n & POLLOUT) { + P9_DPRINTK(P9_DEBUG_TRANS, "mux %p can write\n", m); + set_bit(Wpending, &m->wsched); + } + + return m; +} + +/** + * p9_poll_mux - polls a mux and schedules read or write works if necessary + * @m: connection to poll + * + */ + +static void p9_poll_mux(struct p9_conn *m) +{ + int n; + + if (m->err < 0) + return; + + n = p9_fd_poll(m->client, NULL); + if (n < 0 || n & (POLLERR | POLLHUP | POLLNVAL)) { + P9_DPRINTK(P9_DEBUG_TRANS, "error mux %p err %d\n", m, n); + if (n >= 0) + n = -ECONNRESET; + p9_conn_cancel(m, n); + } + + if (n & POLLIN) { + set_bit(Rpending, &m->wsched); + P9_DPRINTK(P9_DEBUG_TRANS, "mux %p can read\n", m); + if (!test_and_set_bit(Rworksched, &m->wsched)) { + P9_DPRINTK(P9_DEBUG_TRANS, "sched read work %p\n", m); + schedule_work(&m->rq); + } + } + + if (n & POLLOUT) { + set_bit(Wpending, &m->wsched); + P9_DPRINTK(P9_DEBUG_TRANS, "mux %p can write\n", m); + if ((m->wsize || !list_empty(&m->unsent_req_list)) && + !test_and_set_bit(Wworksched, &m->wsched)) { + P9_DPRINTK(P9_DEBUG_TRANS, "sched write work %p\n", m); + schedule_work(&m->wq); + } + } +} + +/** + * p9_fd_request - send 9P request + * The function can sleep until the request is scheduled for sending. + * The function can be interrupted. Return from the function is not + * a guarantee that the request is sent successfully. 
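+ * The request is only queued on the connection's unsent_req_list here;
+ * the actual write is performed later by p9_write_work().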
+ * + * @client: client instance + * @req: request to be sent + * + */ + +static int p9_fd_request(struct p9_client *client, struct p9_req_t *req) +{ + int n; + struct p9_trans_fd *ts = client->trans; + struct p9_conn *m = ts->conn; + + P9_DPRINTK(P9_DEBUG_TRANS, "mux %p task %p tcall %p id %d\n", m, + current, req->tc, req->tc->id); + if (m->err < 0) + return m->err; + + spin_lock(&client->lock); + req->status = REQ_STATUS_UNSENT; + list_add_tail(&req->req_list, &m->unsent_req_list); + spin_unlock(&client->lock); + + if (test_and_clear_bit(Wpending, &m->wsched)) + n = POLLOUT; + else + n = p9_fd_poll(m->client, NULL); + + if (n & POLLOUT && !test_and_set_bit(Wworksched, &m->wsched)) + schedule_work(&m->wq); + + return 0; +} + +static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req) +{ + int ret = 1; + + P9_DPRINTK(P9_DEBUG_TRANS, "client %p req %p\n", client, req); + + spin_lock(&client->lock); + + if (req->status == REQ_STATUS_UNSENT) { + list_del(&req->req_list); + req->status = REQ_STATUS_FLSHD; + ret = 0; + } else if (req->status == REQ_STATUS_SENT) + req->status = REQ_STATUS_FLSH; + + spin_unlock(&client->lock); + + return ret; +} + +/** + * parse_opts - parse mount options into p9_fd_opts structure + * @params: options string passed from mount + * @opts: fd transport-specific structure to parse options into + * + * Returns 0 upon success, -ERRNO upon failure + */ + +static int parse_opts(char *params, struct p9_fd_opts *opts) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + char *options, *tmp_options; + + opts->port = P9_PORT; + opts->rfd = ~0; + opts->wfd = ~0; + + if (!params) + return 0; + + tmp_options = kstrdup(params, GFP_KERNEL); + if (!tmp_options) { + P9_DPRINTK(P9_DEBUG_ERROR, + "failed to allocate copy of option string\n"); + return -ENOMEM; + } + options = tmp_options; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + int r; + if (!*p) + continue; + token = match_token(p, tokens, args); + if (token != Opt_err) { + r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + continue; + } + } + switch (token) { + case Opt_port: + opts->port = option; + break; + case Opt_rfdno: + opts->rfd = option; + break; + case Opt_wfdno: + opts->wfd = option; + break; + default: + continue; + } + } + + kfree(tmp_options); + return 0; +} + +static int p9_fd_open(struct p9_client *client, int rfd, int wfd) +{ + struct p9_trans_fd *ts = kmalloc(sizeof(struct p9_trans_fd), + GFP_KERNEL); + if (!ts) + return -ENOMEM; + + ts->rd = fget(rfd); + ts->wr = fget(wfd); + if (!ts->rd || !ts->wr) { + if (ts->rd) + fput(ts->rd); + if (ts->wr) + fput(ts->wr); + kfree(ts); + return -EIO; + } + + client->trans = ts; + client->status = Connected; + + return 0; +} + +static int p9_socket_open(struct p9_client *client, struct socket *csocket) +{ + struct p9_trans_fd *p; + int ret, fd; + + p = kmalloc(sizeof(struct p9_trans_fd), GFP_KERNEL); + if (!p) + return -ENOMEM; + + csocket->sk->sk_allocation = GFP_NOIO; + fd = sock_map_fd(csocket, 0); + if (fd < 0) { + P9_EPRINTK(KERN_ERR, "p9_socket_open: failed to map fd\n"); + sock_release(csocket); + kfree(p); + return fd; + } + + get_file(csocket->file); + get_file(csocket->file); + p->wr = p->rd = csocket->file; + client->trans = p; + client->status = Connected; + + sys_close(fd); /* still racy */ + + p->rd->f_flags |= O_NONBLOCK; + + p->conn = p9_conn_create(client); + if (IS_ERR(p->conn)) { + ret = PTR_ERR(p->conn); + p->conn = NULL; + kfree(p); + 
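+		/*
+		 * The two sockfd_put() calls below balance the two
+		 * get_file() references taken on csocket->file above.
+		 */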
sockfd_put(csocket); + sockfd_put(csocket); + return ret; + } + return 0; +} + +/** + * p9_mux_destroy - cancels all pending requests and frees mux resources + * @m: mux to destroy + * + */ + +static void p9_conn_destroy(struct p9_conn *m) +{ + P9_DPRINTK(P9_DEBUG_TRANS, "mux %p prev %p next %p\n", m, + m->mux_list.prev, m->mux_list.next); + + p9_mux_poll_stop(m); + cancel_work_sync(&m->rq); + cancel_work_sync(&m->wq); + + p9_conn_cancel(m, -ECONNRESET); + + m->client = NULL; + kfree(m); +} + +/** + * p9_fd_close - shutdown file descriptor transport + * @client: client instance + * + */ + +static void p9_fd_close(struct p9_client *client) +{ + struct p9_trans_fd *ts; + + if (!client) + return; + + ts = client->trans; + if (!ts) + return; + + client->status = Disconnected; + + p9_conn_destroy(ts->conn); + + if (ts->rd) + fput(ts->rd); + if (ts->wr) + fput(ts->wr); + + kfree(ts); +} + +/* + * stolen from NFS - maybe should be made a generic function? + */ +static inline int valid_ipaddr4(const char *buf) +{ + int rc, count, in[4]; + + rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]); + if (rc != 4) + return -EINVAL; + for (count = 0; count < 4; count++) { + if (in[count] > 255) + return -EINVAL; + } + return 0; +} + +static int +p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) +{ + int err; + struct socket *csocket; + struct sockaddr_in sin_server; + struct p9_fd_opts opts; + + err = parse_opts(args, &opts); + if (err < 0) + return err; + + if (valid_ipaddr4(addr) < 0) + return -EINVAL; + + csocket = NULL; + + sin_server.sin_family = AF_INET; + sin_server.sin_addr.s_addr = in_aton(addr); + sin_server.sin_port = htons(opts.port); + err = __sock_create(read_pnet(¤t->nsproxy->net_ns), PF_INET, + SOCK_STREAM, IPPROTO_TCP, &csocket, 1); + if (err) { + P9_EPRINTK(KERN_ERR, "p9_trans_tcp: problem creating socket\n"); + return err; + } + + err = csocket->ops->connect(csocket, + (struct sockaddr *)&sin_server, + sizeof(struct sockaddr_in), 0); + if (err < 0) { + P9_EPRINTK(KERN_ERR, + "p9_trans_tcp: problem connecting socket to %s\n", + addr); + sock_release(csocket); + return err; + } + + return p9_socket_open(client, csocket); +} + +static int +p9_fd_create_unix(struct p9_client *client, const char *addr, char *args) +{ + int err; + struct socket *csocket; + struct sockaddr_un sun_server; + + csocket = NULL; + + if (strlen(addr) >= UNIX_PATH_MAX) { + P9_EPRINTK(KERN_ERR, "p9_trans_unix: address too long: %s\n", + addr); + return -ENAMETOOLONG; + } + + sun_server.sun_family = PF_UNIX; + strcpy(sun_server.sun_path, addr); + err = __sock_create(read_pnet(¤t->nsproxy->net_ns), PF_UNIX, + SOCK_STREAM, 0, &csocket, 1); + if (err < 0) { + P9_EPRINTK(KERN_ERR, "p9_trans_unix: problem creating socket\n"); + return err; + } + err = csocket->ops->connect(csocket, (struct sockaddr *)&sun_server, + sizeof(struct sockaddr_un) - 1, 0); + if (err < 0) { + P9_EPRINTK(KERN_ERR, + "p9_trans_unix: problem connecting socket: %s: %d\n", + addr, err); + sock_release(csocket); + return err; + } + + return p9_socket_open(client, csocket); +} + +static int +p9_fd_create(struct p9_client *client, const char *addr, char *args) +{ + int err; + struct p9_fd_opts opts; + struct p9_trans_fd *p; + + parse_opts(args, &opts); + + if (opts.rfd == ~0 || opts.wfd == ~0) { + printk(KERN_ERR "v9fs: Insufficient options for proto=fd\n"); + return -ENOPROTOOPT; + } + + err = p9_fd_open(client, opts.rfd, opts.wfd); + if (err < 0) + return err; + + p = (struct p9_trans_fd *) client->trans; + p->conn = 
p9_conn_create(client); + if (IS_ERR(p->conn)) { + err = PTR_ERR(p->conn); + p->conn = NULL; + fput(p->rd); + fput(p->wr); + return err; + } + + return 0; +} + +static struct p9_trans_module p9_tcp_trans = { + .name = "tcp", + .maxsize = MAX_SOCK_BUF, + .def = 1, + .create = p9_fd_create_tcp, + .close = p9_fd_close, + .request = p9_fd_request, + .cancel = p9_fd_cancel, + .owner = THIS_MODULE, +}; + +static struct p9_trans_module p9_unix_trans = { + .name = "unix", + .maxsize = MAX_SOCK_BUF, + .def = 0, + .create = p9_fd_create_unix, + .close = p9_fd_close, + .request = p9_fd_request, + .cancel = p9_fd_cancel, + .owner = THIS_MODULE, +}; + +static struct p9_trans_module p9_fd_trans = { + .name = "fd", + .maxsize = MAX_SOCK_BUF, + .def = 0, + .create = p9_fd_create, + .close = p9_fd_close, + .request = p9_fd_request, + .cancel = p9_fd_cancel, + .owner = THIS_MODULE, +}; + +/** + * p9_poll_proc - poll worker thread + * @a: thread state and arguments + * + * polls all v9fs transports for new events and queues the appropriate + * work to the work queue + * + */ + +static void p9_poll_workfn(struct work_struct *work) +{ + unsigned long flags; + + P9_DPRINTK(P9_DEBUG_TRANS, "start %p\n", current); + + spin_lock_irqsave(&p9_poll_lock, flags); + while (!list_empty(&p9_poll_pending_list)) { + struct p9_conn *conn = list_first_entry(&p9_poll_pending_list, + struct p9_conn, + poll_pending_link); + list_del_init(&conn->poll_pending_link); + spin_unlock_irqrestore(&p9_poll_lock, flags); + + p9_poll_mux(conn); + + spin_lock_irqsave(&p9_poll_lock, flags); + } + spin_unlock_irqrestore(&p9_poll_lock, flags); + + P9_DPRINTK(P9_DEBUG_TRANS, "finish\n"); +} + +int p9_trans_fd_init(void) +{ + v9fs_register_trans(&p9_tcp_trans); + v9fs_register_trans(&p9_unix_trans); + v9fs_register_trans(&p9_fd_trans); + + return 0; +} + +void p9_trans_fd_exit(void) +{ + flush_work_sync(&p9_poll_work); + v9fs_unregister_trans(&p9_tcp_trans); + v9fs_unregister_trans(&p9_unix_trans); + v9fs_unregister_trans(&p9_fd_trans); +} diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c new file mode 100644 index 00000000..159c50f1 --- /dev/null +++ b/net/9p/trans_rdma.c @@ -0,0 +1,719 @@ +/* + * linux/fs/9p/trans_rdma.c + * + * RDMA transport layer based on the trans_fd.c implementation. + * + * Copyright (C) 2008 by Tom Tucker + * Copyright (C) 2006 by Russ Cox + * Copyright (C) 2004-2005 by Latchesar Ionkov + * Copyright (C) 2004-2008 by Eric Van Hensbergen + * Copyright (C) 1997-2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define P9_PORT 5640 +#define P9_RDMA_SQ_DEPTH 32 +#define P9_RDMA_RQ_DEPTH 32 +#define P9_RDMA_SEND_SGE 4 +#define P9_RDMA_RECV_SGE 4 +#define P9_RDMA_IRD 0 +#define P9_RDMA_ORD 0 +#define P9_RDMA_TIMEOUT 30000 /* 30 seconds */ +#define P9_RDMA_MAXSIZE (4*4096) /* Min SGE is 4, so we can + * safely advertise a maxsize + * of 64k */ + +/** + * struct p9_trans_rdma - RDMA transport instance + * + * @state: tracks the transport state machine for connection setup and tear down + * @cm_id: The RDMA CM ID + * @pd: Protection Domain pointer + * @qp: Queue Pair pointer + * @cq: Completion Queue pointer + * @dm_mr: DMA Memory Region pointer + * @lkey: The local access only memory region key + * @timeout: Number of uSecs to wait for connection management events + * @sq_depth: The depth of the Send Queue + * @sq_sem: Semaphore for the SQ + * @rq_depth: The depth of the Receive Queue. + * @rq_count: Count of requests in the Receive Queue. + * @addr: The remote peer's address + * @req_lock: Protects the active request list + * @cm_done: Completion event for connection management tracking + */ +struct p9_trans_rdma { + enum { + P9_RDMA_INIT, + P9_RDMA_ADDR_RESOLVED, + P9_RDMA_ROUTE_RESOLVED, + P9_RDMA_CONNECTED, + P9_RDMA_FLUSHING, + P9_RDMA_CLOSING, + P9_RDMA_CLOSED, + } state; + struct rdma_cm_id *cm_id; + struct ib_pd *pd; + struct ib_qp *qp; + struct ib_cq *cq; + struct ib_mr *dma_mr; + u32 lkey; + long timeout; + int sq_depth; + struct semaphore sq_sem; + int rq_depth; + atomic_t rq_count; + struct sockaddr_in addr; + spinlock_t req_lock; + + struct completion cm_done; +}; + +/** + * p9_rdma_context - Keeps track of in-process WR + * + * @wc_op: The original WR op for when the CQE completes in error. + * @busa: Bus address to unmap when the WR completes + * @req: Keeps track of requests (send) + * @rc: Keepts track of replies (receive) + */ +struct p9_rdma_req; +struct p9_rdma_context { + enum ib_wc_opcode wc_op; + dma_addr_t busa; + union { + struct p9_req_t *req; + struct p9_fcall *rc; + }; +}; + +/** + * p9_rdma_opts - Collection of mount options + * @port: port of connection + * @sq_depth: The requested depth of the SQ. This really doesn't need + * to be any deeper than the number of threads used in the client + * @rq_depth: The depth of the RQ. 
Should be greater than or equal to SQ depth + * @timeout: Time to wait in msecs for CM events + */ +struct p9_rdma_opts { + short port; + int sq_depth; + int rq_depth; + long timeout; +}; + +/* + * Option Parsing (code inspired by NFS code) + */ +enum { + /* Options that take integer arguments */ + Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout, Opt_err, +}; + +static match_table_t tokens = { + {Opt_port, "port=%u"}, + {Opt_sq_depth, "sq=%u"}, + {Opt_rq_depth, "rq=%u"}, + {Opt_timeout, "timeout=%u"}, + {Opt_err, NULL}, +}; + +/** + * parse_opts - parse mount options into rdma options structure + * @params: options string passed from mount + * @opts: rdma transport-specific structure to parse options into + * + * Returns 0 upon success, -ERRNO upon failure + */ +static int parse_opts(char *params, struct p9_rdma_opts *opts) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + char *options, *tmp_options; + + opts->port = P9_PORT; + opts->sq_depth = P9_RDMA_SQ_DEPTH; + opts->rq_depth = P9_RDMA_RQ_DEPTH; + opts->timeout = P9_RDMA_TIMEOUT; + + if (!params) + return 0; + + tmp_options = kstrdup(params, GFP_KERNEL); + if (!tmp_options) { + P9_DPRINTK(P9_DEBUG_ERROR, + "failed to allocate copy of option string\n"); + return -ENOMEM; + } + options = tmp_options; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + int r; + if (!*p) + continue; + token = match_token(p, tokens, args); + r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + continue; + } + switch (token) { + case Opt_port: + opts->port = option; + break; + case Opt_sq_depth: + opts->sq_depth = option; + break; + case Opt_rq_depth: + opts->rq_depth = option; + break; + case Opt_timeout: + opts->timeout = option; + break; + default: + continue; + } + } + /* RQ must be at least as large as the SQ */ + opts->rq_depth = max(opts->rq_depth, opts->sq_depth); + kfree(tmp_options); + return 0; +} + +static int +p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) +{ + struct p9_client *c = id->context; + struct p9_trans_rdma *rdma = c->trans; + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + BUG_ON(rdma->state != P9_RDMA_INIT); + rdma->state = P9_RDMA_ADDR_RESOLVED; + break; + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + BUG_ON(rdma->state != P9_RDMA_ADDR_RESOLVED); + rdma->state = P9_RDMA_ROUTE_RESOLVED; + break; + + case RDMA_CM_EVENT_ESTABLISHED: + BUG_ON(rdma->state != P9_RDMA_ROUTE_RESOLVED); + rdma->state = P9_RDMA_CONNECTED; + break; + + case RDMA_CM_EVENT_DISCONNECTED: + if (rdma) + rdma->state = P9_RDMA_CLOSED; + if (c) + c->status = Disconnected; + break; + + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + break; + + case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_DEVICE_REMOVAL: + case RDMA_CM_EVENT_MULTICAST_JOIN: + case RDMA_CM_EVENT_MULTICAST_ERROR: + case RDMA_CM_EVENT_REJECTED: + case RDMA_CM_EVENT_CONNECT_REQUEST: + case RDMA_CM_EVENT_CONNECT_RESPONSE: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + c->status = Disconnected; + rdma_disconnect(rdma->cm_id); + break; + default: + BUG(); + } + complete(&rdma->cm_done); + return 0; +} + +static void +handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma, + struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len) +{ + struct p9_req_t *req; + int err = 0; + int16_t tag; + + req = NULL; + ib_dma_unmap_single(rdma->cm_id->device, c->busa, client->msize, + 
DMA_FROM_DEVICE); + + if (status != IB_WC_SUCCESS) + goto err_out; + + err = p9_parse_header(c->rc, NULL, NULL, &tag, 1); + if (err) + goto err_out; + + req = p9_tag_lookup(client, tag); + if (!req) + goto err_out; + + req->rc = c->rc; + req->status = REQ_STATUS_RCVD; + p9_client_cb(client, req); + + return; + + err_out: + P9_DPRINTK(P9_DEBUG_ERROR, "req %p err %d status %d\n", + req, err, status); + rdma->state = P9_RDMA_FLUSHING; + client->status = Disconnected; +} + +static void +handle_send(struct p9_client *client, struct p9_trans_rdma *rdma, + struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len) +{ + ib_dma_unmap_single(rdma->cm_id->device, + c->busa, c->req->tc->size, + DMA_TO_DEVICE); +} + +static void qp_event_handler(struct ib_event *event, void *context) +{ + P9_DPRINTK(P9_DEBUG_ERROR, "QP event %d context %p\n", event->event, + context); +} + +static void cq_comp_handler(struct ib_cq *cq, void *cq_context) +{ + struct p9_client *client = cq_context; + struct p9_trans_rdma *rdma = client->trans; + int ret; + struct ib_wc wc; + + ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP); + while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { + struct p9_rdma_context *c = (void *) (unsigned long) wc.wr_id; + + switch (c->wc_op) { + case IB_WC_RECV: + atomic_dec(&rdma->rq_count); + handle_recv(client, rdma, c, wc.status, wc.byte_len); + break; + + case IB_WC_SEND: + handle_send(client, rdma, c, wc.status, wc.byte_len); + up(&rdma->sq_sem); + break; + + default: + printk(KERN_ERR "9prdma: unexpected completion type, " + "c->wc_op=%d, wc.opcode=%d, status=%d\n", + c->wc_op, wc.opcode, wc.status); + break; + } + kfree(c); + } +} + +static void cq_event_handler(struct ib_event *e, void *v) +{ + P9_DPRINTK(P9_DEBUG_ERROR, "CQ event %d context %p\n", e->event, v); +} + +static void rdma_destroy_trans(struct p9_trans_rdma *rdma) +{ + if (!rdma) + return; + + if (rdma->dma_mr && !IS_ERR(rdma->dma_mr)) + ib_dereg_mr(rdma->dma_mr); + + if (rdma->qp && !IS_ERR(rdma->qp)) + ib_destroy_qp(rdma->qp); + + if (rdma->pd && !IS_ERR(rdma->pd)) + ib_dealloc_pd(rdma->pd); + + if (rdma->cq && !IS_ERR(rdma->cq)) + ib_destroy_cq(rdma->cq); + + if (rdma->cm_id && !IS_ERR(rdma->cm_id)) + rdma_destroy_id(rdma->cm_id); + + kfree(rdma); +} + +static int +post_recv(struct p9_client *client, struct p9_rdma_context *c) +{ + struct p9_trans_rdma *rdma = client->trans; + struct ib_recv_wr wr, *bad_wr; + struct ib_sge sge; + + c->busa = ib_dma_map_single(rdma->cm_id->device, + c->rc->sdata, client->msize, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) + goto error; + + sge.addr = c->busa; + sge.length = client->msize; + sge.lkey = rdma->lkey; + + wr.next = NULL; + c->wc_op = IB_WC_RECV; + wr.wr_id = (unsigned long) c; + wr.sg_list = &sge; + wr.num_sge = 1; + return ib_post_recv(rdma->qp, &wr, &bad_wr); + + error: + P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n"); + return -EIO; +} + +static int rdma_request(struct p9_client *client, struct p9_req_t *req) +{ + struct p9_trans_rdma *rdma = client->trans; + struct ib_send_wr wr, *bad_wr; + struct ib_sge sge; + int err = 0; + unsigned long flags; + struct p9_rdma_context *c = NULL; + struct p9_rdma_context *rpl_context = NULL; + + /* Allocate an fcall for the reply */ + rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS); + if (!rpl_context) { + err = -ENOMEM; + goto err_close; + } + + /* + * If the request has a buffer, steal it, otherwise + * allocate a new one. 
Typically, requests should already + * have receive buffers allocated and just swap them around + */ + if (!req->rc) { + req->rc = kmalloc(sizeof(struct p9_fcall)+client->msize, + GFP_NOFS); + if (req->rc) { + req->rc->sdata = (char *) req->rc + + sizeof(struct p9_fcall); + req->rc->capacity = client->msize; + } + } + rpl_context->rc = req->rc; + if (!rpl_context->rc) { + err = -ENOMEM; + goto err_free2; + } + + /* + * Post a receive buffer for this request. We need to ensure + * there is a reply buffer available for every outstanding + * request. A flushed request can result in no reply for an + * outstanding request, so we must keep a count to avoid + * overflowing the RQ. + */ + if (atomic_inc_return(&rdma->rq_count) <= rdma->rq_depth) { + err = post_recv(client, rpl_context); + if (err) + goto err_free1; + } else + atomic_dec(&rdma->rq_count); + + /* remove posted receive buffer from request structure */ + req->rc = NULL; + + /* Post the request */ + c = kmalloc(sizeof *c, GFP_NOFS); + if (!c) { + err = -ENOMEM; + goto err_free1; + } + c->req = req; + + c->busa = ib_dma_map_single(rdma->cm_id->device, + c->req->tc->sdata, c->req->tc->size, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) + goto error; + + sge.addr = c->busa; + sge.length = c->req->tc->size; + sge.lkey = rdma->lkey; + + wr.next = NULL; + c->wc_op = IB_WC_SEND; + wr.wr_id = (unsigned long) c; + wr.opcode = IB_WR_SEND; + wr.send_flags = IB_SEND_SIGNALED; + wr.sg_list = &sge; + wr.num_sge = 1; + + if (down_interruptible(&rdma->sq_sem)) + goto error; + + return ib_post_send(rdma->qp, &wr, &bad_wr); + + error: + kfree(c); + kfree(rpl_context->rc); + kfree(rpl_context); + P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n"); + return -EIO; + err_free1: + kfree(rpl_context->rc); + err_free2: + kfree(rpl_context); + err_close: + spin_lock_irqsave(&rdma->req_lock, flags); + if (rdma->state < P9_RDMA_CLOSING) { + rdma->state = P9_RDMA_CLOSING; + spin_unlock_irqrestore(&rdma->req_lock, flags); + rdma_disconnect(rdma->cm_id); + } else + spin_unlock_irqrestore(&rdma->req_lock, flags); + return err; +} + +static void rdma_close(struct p9_client *client) +{ + struct p9_trans_rdma *rdma; + + if (!client) + return; + + rdma = client->trans; + if (!rdma) + return; + + client->status = Disconnected; + rdma_disconnect(rdma->cm_id); + rdma_destroy_trans(rdma); +} + +/** + * alloc_rdma - Allocate and initialize the rdma transport structure + * @opts: Mount options structure + */ +static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts) +{ + struct p9_trans_rdma *rdma; + + rdma = kzalloc(sizeof(struct p9_trans_rdma), GFP_KERNEL); + if (!rdma) + return NULL; + + rdma->sq_depth = opts->sq_depth; + rdma->rq_depth = opts->rq_depth; + rdma->timeout = opts->timeout; + spin_lock_init(&rdma->req_lock); + init_completion(&rdma->cm_done); + sema_init(&rdma->sq_sem, rdma->sq_depth); + atomic_set(&rdma->rq_count, 0); + + return rdma; +} + +/* its not clear to me we can do anything after send has been posted */ +static int rdma_cancel(struct p9_client *client, struct p9_req_t *req) +{ + return 1; +} + +/** + * trans_create_rdma - Transport method for creating atransport instance + * @client: client instance + * @addr: IP address string + * @args: Mount options string + */ +static int +rdma_create_trans(struct p9_client *client, const char *addr, char *args) +{ + int err; + struct p9_rdma_opts opts; + struct p9_trans_rdma *rdma; + struct rdma_conn_param conn_param; + struct ib_qp_init_attr qp_attr; + struct ib_device_attr devattr; + + 
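+	/*
+	 * Connection setup, in order: parse the options, create the CM ID,
+	 * resolve the peer address and route, build the CQ, PD and QP, and
+	 * finally issue rdma_connect().  Once the transport structure is
+	 * allocated, any failure jumps to the error label, which tears
+	 * everything down via rdma_destroy_trans().
+	 */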
/* Parse the transport specific mount options */ + err = parse_opts(args, &opts); + if (err < 0) + return err; + + /* Create and initialize the RDMA transport structure */ + rdma = alloc_rdma(&opts); + if (!rdma) + return -ENOMEM; + + /* Create the RDMA CM ID */ + rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP, + IB_QPT_RC); + if (IS_ERR(rdma->cm_id)) + goto error; + + /* Associate the client with the transport */ + client->trans = rdma; + + /* Resolve the server's address */ + rdma->addr.sin_family = AF_INET; + rdma->addr.sin_addr.s_addr = in_aton(addr); + rdma->addr.sin_port = htons(opts.port); + err = rdma_resolve_addr(rdma->cm_id, NULL, + (struct sockaddr *)&rdma->addr, + rdma->timeout); + if (err) + goto error; + err = wait_for_completion_interruptible(&rdma->cm_done); + if (err || (rdma->state != P9_RDMA_ADDR_RESOLVED)) + goto error; + + /* Resolve the route to the server */ + err = rdma_resolve_route(rdma->cm_id, rdma->timeout); + if (err) + goto error; + err = wait_for_completion_interruptible(&rdma->cm_done); + if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED)) + goto error; + + /* Query the device attributes */ + err = ib_query_device(rdma->cm_id->device, &devattr); + if (err) + goto error; + + /* Create the Completion Queue */ + rdma->cq = ib_create_cq(rdma->cm_id->device, cq_comp_handler, + cq_event_handler, client, + opts.sq_depth + opts.rq_depth + 1, 0); + if (IS_ERR(rdma->cq)) + goto error; + ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP); + + /* Create the Protection Domain */ + rdma->pd = ib_alloc_pd(rdma->cm_id->device); + if (IS_ERR(rdma->pd)) + goto error; + + /* Cache the DMA lkey in the transport */ + rdma->dma_mr = NULL; + if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) + rdma->lkey = rdma->cm_id->device->local_dma_lkey; + else { + rdma->dma_mr = ib_get_dma_mr(rdma->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(rdma->dma_mr)) + goto error; + rdma->lkey = rdma->dma_mr->lkey; + } + + /* Create the Queue Pair */ + memset(&qp_attr, 0, sizeof qp_attr); + qp_attr.event_handler = qp_event_handler; + qp_attr.qp_context = client; + qp_attr.cap.max_send_wr = opts.sq_depth; + qp_attr.cap.max_recv_wr = opts.rq_depth; + qp_attr.cap.max_send_sge = P9_RDMA_SEND_SGE; + qp_attr.cap.max_recv_sge = P9_RDMA_RECV_SGE; + qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + qp_attr.qp_type = IB_QPT_RC; + qp_attr.send_cq = rdma->cq; + qp_attr.recv_cq = rdma->cq; + err = rdma_create_qp(rdma->cm_id, rdma->pd, &qp_attr); + if (err) + goto error; + rdma->qp = rdma->cm_id->qp; + + /* Request a connection */ + memset(&conn_param, 0, sizeof(conn_param)); + conn_param.private_data = NULL; + conn_param.private_data_len = 0; + conn_param.responder_resources = P9_RDMA_IRD; + conn_param.initiator_depth = P9_RDMA_ORD; + err = rdma_connect(rdma->cm_id, &conn_param); + if (err) + goto error; + err = wait_for_completion_interruptible(&rdma->cm_done); + if (err || (rdma->state != P9_RDMA_CONNECTED)) + goto error; + + client->status = Connected; + + return 0; + +error: + rdma_destroy_trans(rdma); + return -ENOTCONN; +} + +static struct p9_trans_module p9_rdma_trans = { + .name = "rdma", + .maxsize = P9_RDMA_MAXSIZE, + .def = 0, + .owner = THIS_MODULE, + .create = rdma_create_trans, + .close = rdma_close, + .request = rdma_request, + .cancel = rdma_cancel, +}; + +/** + * p9_trans_rdma_init - Register the 9P RDMA transport driver + */ +static int __init p9_trans_rdma_init(void) +{ + v9fs_register_trans(&p9_rdma_trans); + return 0; +} + +static void __exit p9_trans_rdma_exit(void) +{ + 
v9fs_unregister_trans(&p9_rdma_trans); +} + +module_init(p9_trans_rdma_init); +module_exit(p9_trans_rdma_exit); + +MODULE_AUTHOR("Tom Tucker "); +MODULE_DESCRIPTION("RDMA Transport for 9P"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c new file mode 100644 index 00000000..e317583f --- /dev/null +++ b/net/9p/trans_virtio.c @@ -0,0 +1,629 @@ +/* + * The Virtio 9p transport driver + * + * This is a block based transport driver based on the lguest block driver + * code. + * + * Copyright (C) 2007, 2008 Eric Van Hensbergen, IBM Corporation + * + * Based on virtio console driver + * Copyright (C) 2006, 2007 Rusty Russell, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "trans_common.h" + +#define VIRTQUEUE_NUM 128 + +/* a single mutex to manage channel initialization and attachment */ +static DEFINE_MUTEX(virtio_9p_lock); +static DECLARE_WAIT_QUEUE_HEAD(vp_wq); +static atomic_t vp_pinned = ATOMIC_INIT(0); + +/** + * struct virtio_chan - per-instance transport information + * @initialized: whether the channel is initialized + * @inuse: whether the channel is in use + * @lock: protects multiple elements within this structure + * @client: client instance + * @vdev: virtio dev associated with this channel + * @vq: virtio queue associated with this channel + * @sg: scatter gather list which is used to pack a request (protected?) + * + * We keep all per-channel information in a structure. + * This structure is allocated within the devices dev->mem space. + * A pointer to the structure will get put in the transport private. + * + */ + +struct virtio_chan { + bool inuse; + + spinlock_t lock; + + struct p9_client *client; + struct virtio_device *vdev; + struct virtqueue *vq; + int ring_bufs_avail; + wait_queue_head_t *vc_wq; + /* This is global limit. Since we don't have a global structure, + * will be placing it in each channel. + */ + int p9_max_pages; + /* Scatterlist: can be too big for stack. */ + struct scatterlist sg[VIRTQUEUE_NUM]; + + int tag_len; + /* + * tag name to identify a mount Non-null terminated + */ + char *tag; + + struct list_head chan_list; +}; + +static struct list_head virtio_chan_list; + +/* How many bytes left in this page. */ +static unsigned int rest_of_page(void *data) +{ + return PAGE_SIZE - ((unsigned long)data % PAGE_SIZE); +} + +/** + * p9_virtio_close - reclaim resources of a channel + * @client: client instance + * + * This reclaims a channel by freeing its resources and + * reseting its inuse flag. 
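+ * Note that the virtio_chan structure itself stays allocated; only its
+ * inuse flag is cleared here.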
+ * + */ + +static void p9_virtio_close(struct p9_client *client) +{ + struct virtio_chan *chan = client->trans; + + mutex_lock(&virtio_9p_lock); + if (chan) + chan->inuse = false; + mutex_unlock(&virtio_9p_lock); +} + +/** + * req_done - callback which signals activity from the server + * @vq: virtio queue activity was received on + * + * This notifies us that the server has triggered some activity + * on the virtio channel - most likely a response to request we + * sent. Figure out which requests now have responses and wake up + * those threads. + * + * Bugs: could do with some additional sanity checking, but appears to work. + * + */ + +static void req_done(struct virtqueue *vq) +{ + struct virtio_chan *chan = vq->vdev->priv; + struct p9_fcall *rc; + unsigned int len; + struct p9_req_t *req; + unsigned long flags; + + P9_DPRINTK(P9_DEBUG_TRANS, ": request done\n"); + + while (1) { + spin_lock_irqsave(&chan->lock, flags); + rc = virtqueue_get_buf(chan->vq, &len); + + if (rc == NULL) { + spin_unlock_irqrestore(&chan->lock, flags); + break; + } + + chan->ring_bufs_avail = 1; + spin_unlock_irqrestore(&chan->lock, flags); + /* Wakeup if anyone waiting for VirtIO ring space. */ + wake_up(chan->vc_wq); + P9_DPRINTK(P9_DEBUG_TRANS, ": rc %p\n", rc); + P9_DPRINTK(P9_DEBUG_TRANS, ": lookup tag %d\n", rc->tag); + req = p9_tag_lookup(chan->client, rc->tag); + if (req->tc->private) { + struct trans_rpage_info *rp = req->tc->private; + int p = rp->rp_nr_pages; + /*Release pages */ + p9_release_req_pages(rp); + atomic_sub(p, &vp_pinned); + wake_up(&vp_wq); + if (rp->rp_alloc) + kfree(rp); + req->tc->private = NULL; + } + req->status = REQ_STATUS_RCVD; + p9_client_cb(chan->client, req); + } +} + +/** + * pack_sg_list - pack a scatter gather list from a linear buffer + * @sg: scatter/gather list to pack into + * @start: which segment of the sg_list to start at + * @limit: maximum segment to pack data to + * @data: data to pack into scatter/gather list + * @count: amount of data to pack into the scatter/gather list + * + * sg_lists have multiple segments of various sizes. This will pack + * arbitrary data into an existing scatter gather list, segmenting the + * data as necessary within constraints. + * + */ + +static int +pack_sg_list(struct scatterlist *sg, int start, int limit, char *data, + int count) +{ + int s; + int index = start; + + while (count) { + s = rest_of_page(data); + if (s > count) + s = count; + sg_set_buf(&sg[index++], data, s); + count -= s; + data += s; + BUG_ON(index > limit); + } + + return index-start; +} + +/* We don't currently allow canceling of virtio requests */ +static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req) +{ + return 1; +} + +/** + * pack_sg_list_p - Just like pack_sg_list. Instead of taking a buffer, + * this takes a list of pages. + * @sg: scatter/gather list to pack into + * @start: which segment of the sg_list to start at + * @pdata_off: Offset into the first page + * @**pdata: a list of pages to add into sg. 
+ * @count: amount of data to pack into the scatter/gather list + */ +static int +pack_sg_list_p(struct scatterlist *sg, int start, int limit, size_t pdata_off, + struct page **pdata, int count) +{ + int s; + int i = 0; + int index = start; + + if (pdata_off) { + s = min((int)(PAGE_SIZE - pdata_off), count); + sg_set_page(&sg[index++], pdata[i++], s, pdata_off); + count -= s; + } + + while (count) { + BUG_ON(index > limit); + s = min((int)PAGE_SIZE, count); + sg_set_page(&sg[index++], pdata[i++], s, 0); + count -= s; + } + return index-start; +} + +/** + * p9_virtio_request - issue a request + * @client: client instance issuing the request + * @req: request to be issued + * + */ + +static int +p9_virtio_request(struct p9_client *client, struct p9_req_t *req) +{ + int in, out, inp, outp; + struct virtio_chan *chan = client->trans; + unsigned long flags; + size_t pdata_off = 0; + struct trans_rpage_info *rpinfo = NULL; + int err, pdata_len = 0; + + P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request\n"); + + req->status = REQ_STATUS_SENT; + + if (req->tc->pbuf_size && (req->tc->pubuf && P9_IS_USER_CONTEXT)) { + int nr_pages = p9_nr_pages(req); + int rpinfo_size = sizeof(struct trans_rpage_info) + + sizeof(struct page *) * nr_pages; + + if (atomic_read(&vp_pinned) >= chan->p9_max_pages) { + err = wait_event_interruptible(vp_wq, + atomic_read(&vp_pinned) < chan->p9_max_pages); + if (err == -ERESTARTSYS) + return err; + P9_DPRINTK(P9_DEBUG_TRANS, "9p: May gup pages now.\n"); + } + + if (rpinfo_size <= (req->tc->capacity - req->tc->size)) { + /* We can use sdata */ + req->tc->private = req->tc->sdata + req->tc->size; + rpinfo = (struct trans_rpage_info *)req->tc->private; + rpinfo->rp_alloc = 0; + } else { + req->tc->private = kmalloc(rpinfo_size, GFP_NOFS); + if (!req->tc->private) { + P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: " + "private kmalloc returned NULL"); + return -ENOMEM; + } + rpinfo = (struct trans_rpage_info *)req->tc->private; + rpinfo->rp_alloc = 1; + } + + err = p9_payload_gup(req, &pdata_off, &pdata_len, nr_pages, + req->tc->id == P9_TREAD ? 1 : 0); + if (err < 0) { + if (rpinfo->rp_alloc) + kfree(rpinfo); + return err; + } else { + atomic_add(rpinfo->rp_nr_pages, &vp_pinned); + } + } + +req_retry_pinned: + spin_lock_irqsave(&chan->lock, flags); + + /* Handle out VirtIO ring buffers */ + out = pack_sg_list(chan->sg, 0, VIRTQUEUE_NUM, req->tc->sdata, + req->tc->size); + + if (req->tc->pbuf_size && (req->tc->id == P9_TWRITE)) { + /* We have additional write payload buffer to take care */ + if (req->tc->pubuf && P9_IS_USER_CONTEXT) { + outp = pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM, + pdata_off, rpinfo->rp_data, pdata_len); + } else { + char *pbuf; + if (req->tc->pubuf) + pbuf = (__force char *) req->tc->pubuf; + else + pbuf = req->tc->pkbuf; + outp = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM, pbuf, + req->tc->pbuf_size); + } + out += outp; + } + + /* Handle in VirtIO ring buffers */ + if (req->tc->pbuf_size && + ((req->tc->id == P9_TREAD) || (req->tc->id == P9_TREADDIR))) { + /* + * Take care of additional Read payload. + * 11 is the read/write header = PDU Header(7) + IO Size (4). + * Arrange in such a way that server places header in the + * alloced memory and payload onto the user buffer. + */ + inp = pack_sg_list(chan->sg, out, + VIRTQUEUE_NUM, req->rc->sdata, 11); + /* + * Running executables in the filesystem may result in + * a read request with kernel buffer as opposed to user buffer. 
+ */ + if (req->tc->pubuf && P9_IS_USER_CONTEXT) { + in = pack_sg_list_p(chan->sg, out+inp, VIRTQUEUE_NUM, + pdata_off, rpinfo->rp_data, pdata_len); + } else { + char *pbuf; + if (req->tc->pubuf) + pbuf = (__force char *) req->tc->pubuf; + else + pbuf = req->tc->pkbuf; + + in = pack_sg_list(chan->sg, out+inp, VIRTQUEUE_NUM, + pbuf, req->tc->pbuf_size); + } + in += inp; + } else { + in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM, + req->rc->sdata, req->rc->capacity); + } + + err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc); + if (err < 0) { + if (err == -ENOSPC) { + chan->ring_bufs_avail = 0; + spin_unlock_irqrestore(&chan->lock, flags); + err = wait_event_interruptible(*chan->vc_wq, + chan->ring_bufs_avail); + if (err == -ERESTARTSYS) + return err; + + P9_DPRINTK(P9_DEBUG_TRANS, "9p:Retry virtio request\n"); + goto req_retry_pinned; + } else { + spin_unlock_irqrestore(&chan->lock, flags); + P9_DPRINTK(P9_DEBUG_TRANS, + "9p debug: " + "virtio rpc add_buf returned failure"); + if (rpinfo && rpinfo->rp_alloc) + kfree(rpinfo); + return -EIO; + } + } + + virtqueue_kick(chan->vq); + spin_unlock_irqrestore(&chan->lock, flags); + + P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request kicked\n"); + return 0; +} + +static ssize_t p9_mount_tag_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct virtio_chan *chan; + struct virtio_device *vdev; + + vdev = dev_to_virtio(dev); + chan = vdev->priv; + + return snprintf(buf, chan->tag_len + 1, "%s", chan->tag); +} + +static DEVICE_ATTR(mount_tag, 0444, p9_mount_tag_show, NULL); + +/** + * p9_virtio_probe - probe for existence of 9P virtio channels + * @vdev: virtio device to probe + * + * This probes for existing virtio channels. + * + */ + +static int p9_virtio_probe(struct virtio_device *vdev) +{ + __u16 tag_len; + char *tag; + int err; + struct virtio_chan *chan; + + chan = kmalloc(sizeof(struct virtio_chan), GFP_KERNEL); + if (!chan) { + printk(KERN_ERR "9p: Failed to allocate virtio 9P channel\n"); + err = -ENOMEM; + goto fail; + } + + chan->vdev = vdev; + + /* We expect one virtqueue, for requests. 
*/ + chan->vq = virtio_find_single_vq(vdev, req_done, "requests"); + if (IS_ERR(chan->vq)) { + err = PTR_ERR(chan->vq); + goto out_free_vq; + } + chan->vq->vdev->priv = chan; + spin_lock_init(&chan->lock); + + sg_init_table(chan->sg, VIRTQUEUE_NUM); + + chan->inuse = false; + if (virtio_has_feature(vdev, VIRTIO_9P_MOUNT_TAG)) { + vdev->config->get(vdev, + offsetof(struct virtio_9p_config, tag_len), + &tag_len, sizeof(tag_len)); + } else { + err = -EINVAL; + goto out_free_vq; + } + tag = kmalloc(tag_len, GFP_KERNEL); + if (!tag) { + err = -ENOMEM; + goto out_free_vq; + } + vdev->config->get(vdev, offsetof(struct virtio_9p_config, tag), + tag, tag_len); + chan->tag = tag; + chan->tag_len = tag_len; + err = sysfs_create_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); + if (err) { + goto out_free_tag; + } + chan->vc_wq = kmalloc(sizeof(wait_queue_head_t), GFP_KERNEL); + if (!chan->vc_wq) { + err = -ENOMEM; + goto out_free_tag; + } + init_waitqueue_head(chan->vc_wq); + chan->ring_bufs_avail = 1; + /* Ceiling limit to avoid denial of service attacks */ + chan->p9_max_pages = nr_free_buffer_pages()/4; + + mutex_lock(&virtio_9p_lock); + list_add_tail(&chan->chan_list, &virtio_chan_list); + mutex_unlock(&virtio_9p_lock); + return 0; + +out_free_tag: + kfree(tag); +out_free_vq: + vdev->config->del_vqs(vdev); + kfree(chan); +fail: + return err; +} + + +/** + * p9_virtio_create - allocate a new virtio channel + * @client: client instance invoking this transport + * @devname: string identifying the channel to connect to (unused) + * @args: args passed from sys_mount() for per-transport options (unused) + * + * This sets up a transport channel for 9p communication. Right now + * we only match the first available channel, but eventually we couldlook up + * alternate channels by matching devname versus a virtio_config entry. + * We use a simple reference count mechanism to ensure that only a single + * mount has a channel open at a time. 
+ * + */ + +static int +p9_virtio_create(struct p9_client *client, const char *devname, char *args) +{ + struct virtio_chan *chan; + int ret = -ENOENT; + int found = 0; + + mutex_lock(&virtio_9p_lock); + list_for_each_entry(chan, &virtio_chan_list, chan_list) { + if (!strncmp(devname, chan->tag, chan->tag_len) && + strlen(devname) == chan->tag_len) { + if (!chan->inuse) { + chan->inuse = true; + found = 1; + break; + } + ret = -EBUSY; + } + } + mutex_unlock(&virtio_9p_lock); + + if (!found) { + printk(KERN_ERR "9p: no channels available\n"); + return ret; + } + + client->trans = (void *)chan; + client->status = Connected; + chan->client = client; + + return 0; +} + +/** + * p9_virtio_remove - clean up resources associated with a virtio device + * @vdev: virtio device to remove + * + */ + +static void p9_virtio_remove(struct virtio_device *vdev) +{ + struct virtio_chan *chan = vdev->priv; + + BUG_ON(chan->inuse); + vdev->config->del_vqs(vdev); + + mutex_lock(&virtio_9p_lock); + list_del(&chan->chan_list); + mutex_unlock(&virtio_9p_lock); + sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); + kfree(chan->tag); + kfree(chan->vc_wq); + kfree(chan); + +} + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_9P, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static unsigned int features[] = { + VIRTIO_9P_MOUNT_TAG, +}; + +/* The standard "struct lguest_driver": */ +static struct virtio_driver p9_virtio_drv = { + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = p9_virtio_probe, + .remove = p9_virtio_remove, +}; + +static struct p9_trans_module p9_virtio_trans = { + .name = "virtio", + .create = p9_virtio_create, + .close = p9_virtio_close, + .request = p9_virtio_request, + .cancel = p9_virtio_cancel, + + /* + * We leave one entry for input and one entry for response + * headers. We also skip one more entry to accomodate, address + * that are not at page boundary, that can result in an extra + * page in zero copy. + */ + .maxsize = PAGE_SIZE * (VIRTQUEUE_NUM - 3), + .pref = P9_TRANS_PREF_PAYLOAD_SEP, + .def = 0, + .owner = THIS_MODULE, +}; + +/* The standard init function */ +static int __init p9_virtio_init(void) +{ + INIT_LIST_HEAD(&virtio_chan_list); + + v9fs_register_trans(&p9_virtio_trans); + return register_virtio_driver(&p9_virtio_drv); +} + +static void __exit p9_virtio_cleanup(void) +{ + unregister_virtio_driver(&p9_virtio_drv); + v9fs_unregister_trans(&p9_virtio_trans); +} + +module_init(p9_virtio_init); +module_exit(p9_virtio_cleanup); + +MODULE_DEVICE_TABLE(virtio, id_table); +MODULE_AUTHOR("Eric Van Hensbergen "); +MODULE_DESCRIPTION("Virtio 9p Transport"); +MODULE_LICENSE("GPL"); diff --git a/net/9p/util.c b/net/9p/util.c new file mode 100644 index 00000000..9c1c9348 --- /dev/null +++ b/net/9p/util.c @@ -0,0 +1,146 @@ +/* + * net/9p/util.c + * + * This file contains some helper functions + * + * Copyright (C) 2007 by Latchesar Ionkov + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * struct p9_idpool - per-connection accounting for tag idpool + * @lock: protects the pool + * @pool: idr to allocate tag id from + * + */ + +struct p9_idpool { + spinlock_t lock; + struct idr pool; +}; + +/** + * p9_idpool_create - create a new per-connection id pool + * + */ + +struct p9_idpool *p9_idpool_create(void) +{ + struct p9_idpool *p; + + p = kmalloc(sizeof(struct p9_idpool), GFP_KERNEL); + if (!p) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&p->lock); + idr_init(&p->pool); + + return p; +} +EXPORT_SYMBOL(p9_idpool_create); + +/** + * p9_idpool_destroy - create a new per-connection id pool + * @p: idpool to destroy + */ + +void p9_idpool_destroy(struct p9_idpool *p) +{ + idr_destroy(&p->pool); + kfree(p); +} +EXPORT_SYMBOL(p9_idpool_destroy); + +/** + * p9_idpool_get - allocate numeric id from pool + * @p: pool to allocate from + * + * Bugs: This seems to be an awful generic function, should it be in idr.c with + * the lock included in struct idr? + */ + +int p9_idpool_get(struct p9_idpool *p) +{ + int i = 0; + int error; + unsigned long flags; + +retry: + if (idr_pre_get(&p->pool, GFP_NOFS) == 0) + return -1; + + spin_lock_irqsave(&p->lock, flags); + + /* no need to store exactly p, we just need something non-null */ + error = idr_get_new(&p->pool, p, &i); + spin_unlock_irqrestore(&p->lock, flags); + + if (error == -EAGAIN) + goto retry; + else if (error) + return -1; + + P9_DPRINTK(P9_DEBUG_MUX, " id %d pool %p\n", i, p); + return i; +} +EXPORT_SYMBOL(p9_idpool_get); + +/** + * p9_idpool_put - release numeric id from pool + * @id: numeric id which is being released + * @p: pool to release id into + * + * Bugs: This seems to be an awful generic function, should it be in idr.c with + * the lock included in struct idr? + */ + +void p9_idpool_put(int id, struct p9_idpool *p) +{ + unsigned long flags; + + P9_DPRINTK(P9_DEBUG_MUX, " id %d pool %p\n", id, p); + + spin_lock_irqsave(&p->lock, flags); + idr_remove(&p->pool, id); + spin_unlock_irqrestore(&p->lock, flags); +} +EXPORT_SYMBOL(p9_idpool_put); + +/** + * p9_idpool_check - check if the specified id is available + * @id: id to check + * @p: pool to check + */ + +int p9_idpool_check(int id, struct p9_idpool *p) +{ + return idr_find(&p->pool, id) != NULL; +} +EXPORT_SYMBOL(p9_idpool_check); + diff --git a/net/Kconfig b/net/Kconfig new file mode 100644 index 00000000..919cf9a8 --- /dev/null +++ b/net/Kconfig @@ -0,0 +1,341 @@ +# +# Network configuration +# + +menuconfig NET + bool "Networking support" + select NLATTR + ---help--- + Unless you really know what you are doing, you should say Y here. + The reason is that some programs need kernel networking support even + when running on a stand-alone machine that isn't connected to any + other computer. + + If you are upgrading from an older kernel, you + should consider updating your networking tools too because changes + in the kernel and the tools often go hand in hand. The tools are + contained in the package net-tools, the location and version number + of which are given in . + + For a general introduction to Linux networking, it is highly + recommended to read the NET-HOWTO, available from + . 
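For orientation, a minimal sketch (not part of this patch) of how a caller might drive the idpool helpers implemented in net/9p/util.c above; the <net/9p/9p.h> include is assumed here to be the header declaring this API, and the function is invented for illustration:

/*
 * Illustration only -- not from this patch. Exercises the p9_idpool
 * helpers shown above (create/get/check/put/destroy).
 */
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <net/9p/9p.h>	/* assumed to declare the p9_idpool API */

static int idpool_example(void)
{
	struct p9_idpool *pool;
	int tag;

	pool = p9_idpool_create();
	if (IS_ERR(pool))
		return PTR_ERR(pool);

	tag = p9_idpool_get(pool);	/* returns -1 when allocation fails */
	if (tag < 0) {
		p9_idpool_destroy(pool);
		return -ENOMEM;
	}

	if (p9_idpool_check(tag, pool))
		pr_debug("tag %d is currently allocated\n", tag);

	p9_idpool_put(tag, pool);
	p9_idpool_destroy(pool);
	return 0;
}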
+ +if NET + +config WANT_COMPAT_NETLINK_MESSAGES + bool + help + This option can be selected by other options that need compat + netlink messages. + +config COMPAT_NETLINK_MESSAGES + def_bool y + depends on COMPAT + depends on WEXT_CORE || WANT_COMPAT_NETLINK_MESSAGES + help + This option makes it possible to send different netlink messages + to tasks depending on whether the task is a compat task or not. To + achieve this, you need to set skb_shinfo(skb)->frag_list to the + compat skb before sending the skb, the netlink code will sort out + which message to actually pass to the task. + + Newly written code should NEVER need this option but do + compat-independent messages instead! + +menu "Networking options" + +source "net/packet/Kconfig" +source "net/unix/Kconfig" +source "net/xfrm/Kconfig" +source "net/iucv/Kconfig" + +config INET + bool "TCP/IP networking" + ---help--- + These are the protocols used on the Internet and on most local + Ethernets. It is highly recommended to say Y here (this will enlarge + your kernel by about 400 KB), since some programs (e.g. the X window + system) use TCP/IP even if your machine is not connected to any + other computer. You will get the so-called loopback device which + allows you to ping yourself (great fun, that!). + + For an excellent introduction to Linux networking, please read the + Linux Networking HOWTO, available from + . + + If you say Y here and also to "/proc file system support" and + "Sysctl support" below, you can change various aspects of the + behavior of the TCP/IP code by writing to the (virtual) files in + /proc/sys/net/ipv4/*; the options are explained in the file + . + + Short answer: say Y. + +if INET +source "net/ipv4/Kconfig" +source "net/ipv6/Kconfig" +source "net/netlabel/Kconfig" + +endif # if INET + +config ANDROID_PARANOID_NETWORK + bool "Only allow certain groups to create sockets" + default y + help + none + +config NET_ACTIVITY_STATS + bool "Network activity statistics tracking" + default y + help + Network activity statistics are useful for tracking wireless + modem activity on 2G, 3G, 4G wireless networks. Counts number of + transmissions and groups them in specified time buckets. + +config NETWORK_SECMARK + bool "Security Marking" + help + This enables security marking of network packets, similar + to nfmark, but designated for security purposes. + If you are unsure how to answer this question, answer N. + +config NETWORK_PHY_TIMESTAMPING + bool "Timestamping in PHY devices" + depends on EXPERIMENTAL + help + This allows timestamping of network packets by PHYs with + hardware timestamping capabilities. This option adds some + overhead in the transmit and receive paths. + + If you are unsure how to answer this question, answer N. + +menuconfig NETFILTER + bool "Network packet filtering framework (Netfilter)" + ---help--- + Netfilter is a framework for filtering and mangling network packets + that pass through your Linux box. + + The most common use of packet filtering is to run your Linux box as + a firewall protecting a local network from the Internet. The type of + firewall provided by this kernel support is called a "packet + filter", which means that it can reject individual network packets + based on type, source, destination etc. The other kind of firewall, + a "proxy-based" one, is more secure but more intrusive and more + bothersome to set up; it inspects the network traffic much more + closely, modifies it and has knowledge about the higher level + protocols, which a packet filter lacks. 
Moreover, proxy-based + firewalls often require changes to the programs running on the local + clients. Proxy-based firewalls don't need support by the kernel, but + they are often combined with a packet filter, which only works if + you say Y here. + + You should also say Y here if you intend to use your Linux box as + the gateway to the Internet for a local network of machines without + globally valid IP addresses. This is called "masquerading": if one + of the computers on your local network wants to send something to + the outside, your box can "masquerade" as that computer, i.e. it + forwards the traffic to the intended outside destination, but + modifies the packets to make it look like they came from the + firewall box itself. It works both ways: if the outside host + replies, the Linux box will silently forward the traffic to the + correct local computer. This way, the computers on your local net + are completely invisible to the outside world, even though they can + reach the outside and can receive replies. It is even possible to + run globally visible servers from within a masqueraded local network + using a mechanism called portforwarding. Masquerading is also often + called NAT (Network Address Translation). + + Another use of Netfilter is in transparent proxying: if a machine on + the local network tries to connect to an outside host, your Linux + box can transparently forward the traffic to a local server, + typically a caching proxy server. + + Yet another use of Netfilter is building a bridging firewall. Using + a bridge with Network packet filtering enabled makes iptables "see" + the bridged traffic. For filtering on the lower network and Ethernet + protocols over the bridge, use ebtables (under bridge netfilter + configuration). + + Various modules exist for netfilter which replace the previous + masquerading (ipmasqadm), packet filtering (ipchains), transparent + proxying, and portforwarding mechanisms. Please see + under "iptables" for the location of + these packages. + +if NETFILTER + +config NETFILTER_DEBUG + bool "Network packet filtering debugging" + depends on NETFILTER + help + You can say Y here if you want to get additional messages useful in + debugging the netfilter code. + +config NETFILTER_ADVANCED + bool "Advanced netfilter configuration" + depends on NETFILTER + default y + help + If you say Y here you can select between all the netfilter modules. + If you say N the more unusual ones will not be shown and the + basic ones needed by most people will default to 'M'. + + If unsure, say Y. + +config BRIDGE_NETFILTER + bool "Bridged IP/ARP packets filtering" + depends on BRIDGE && NETFILTER && INET + depends on NETFILTER_ADVANCED + default y + ---help--- + Enabling this option will let arptables resp. iptables see bridged + ARP resp. IP traffic. If you want a bridging firewall, you probably + want this option enabled. + Enabling or disabling this option doesn't enable or disable + ebtables. + + If unsure, say N. 
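These NETFILTER* options become CONFIG_* preprocessor defines at build time; a minimal sketch (not part of this patch, with an invented function name and message) of how C code typically gates extra debugging on NETFILTER_DEBUG:

/*
 * Illustration only -- not from this patch. Shows the usual #ifdef
 * pattern for consuming a Kconfig bool such as NETFILTER_DEBUG.
 */
#include <linux/kernel.h>

static void nf_example_log_verdict(int verdict)
{
#ifdef CONFIG_NETFILTER_DEBUG
	/* Compiled in only when "Network packet filtering debugging" is set. */
	pr_debug("nf example: verdict %d\n", verdict);
#endif
}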
+ +source "net/netfilter/Kconfig" +source "net/ipv4/netfilter/Kconfig" +source "net/ipv6/netfilter/Kconfig" +source "net/decnet/netfilter/Kconfig" +source "net/bridge/netfilter/Kconfig" + +endif + +source "net/dccp/Kconfig" +source "net/sctp/Kconfig" +source "net/rds/Kconfig" +source "net/tipc/Kconfig" +source "net/atm/Kconfig" +source "net/l2tp/Kconfig" +source "net/802/Kconfig" +source "net/bridge/Kconfig" +source "net/dsa/Kconfig" +source "net/8021q/Kconfig" +source "net/decnet/Kconfig" +source "net/llc/Kconfig" +source "net/ipx/Kconfig" +source "drivers/net/appletalk/Kconfig" +source "net/x25/Kconfig" +source "net/lapb/Kconfig" +source "net/econet/Kconfig" +source "net/wanrouter/Kconfig" +source "net/phonet/Kconfig" +source "net/ieee802154/Kconfig" +source "net/sched/Kconfig" +source "net/dcb/Kconfig" +source "net/dns_resolver/Kconfig" +source "net/batman-adv/Kconfig" + +config RPS + boolean "RPS" + depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS + default y + +config RFS_ACCEL + boolean + depends on RPS && GENERIC_HARDIRQS + select CPU_RMAP + default y + +config XPS + boolean + depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS + default y + +config HAVE_BPF_JIT + bool + +config BPF_JIT + bool "enable BPF Just In Time compiler" + depends on HAVE_BPF_JIT + depends on MODULES + ---help--- + Berkeley Packet Filter filtering capabilities are normally handled + by an interpreter. This option allows kernel to generate a native + code when filter is loaded in memory. This should speedup + packet sniffing (libpcap/tcpdump). Note : Admin should enable + this feature changing /proc/sys/net/core/bpf_jit_enable + +menu "Network testing" + +config NET_PKTGEN + tristate "Packet Generator (USE WITH CAUTION)" + depends on PROC_FS + ---help--- + This module will inject preconfigured packets, at a configurable + rate, out of a given interface. It is used for network interface + stress testing and performance analysis. If you don't understand + what was just said, you don't need it: say N. + + Documentation on how to use the packet generator can be found + at . + + To compile this code as a module, choose M here: the + module will be called pktgen. + +config NET_TCPPROBE + tristate "TCP connection probing" + depends on INET && EXPERIMENTAL && PROC_FS && KPROBES + ---help--- + This module allows for capturing the changes to TCP connection + state in response to incoming packets. It is used for debugging + TCP congestion avoidance modules. If you don't understand + what was just said, you don't need it: say N. + + Documentation on how to use TCP connection probing can be found + at: + + http://www.linuxfoundation.org/collaborate/workgroups/networking/tcpprobe + + To compile this code as a module, choose M here: the + module will be called tcp_probe. + +config NET_DROP_MONITOR + boolean "Network packet drop alerting service" + depends on INET && EXPERIMENTAL && TRACEPOINTS + ---help--- + This feature provides an alerting service to userspace in the + event that packets are discarded in the network stack. Alerts + are broadcast via netlink socket to any listening user space + process. If you don't need network drop alerts, or if you are ok + just checking the various proc files and other utilities for + drop statistics, say N here. 
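The BPF_JIT help above points at /proc/sys/net/core/bpf_jit_enable; a small userspace sketch (not part of this patch, assuming root and a kernel built with BPF_JIT) that enables the JIT, equivalent to "echo 1 > /proc/sys/net/core/bpf_jit_enable":

/*
 * Illustration only -- not from this patch. Userspace equivalent of
 * "echo 1 > /proc/sys/net/core/bpf_jit_enable".
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/core/bpf_jit_enable", "w");

	if (!f) {
		perror("bpf_jit_enable");
		return 1;
	}
	fputs("1\n", f);
	fclose(f);
	return 0;
}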
+ +endmenu + +endmenu + +source "net/ax25/Kconfig" +source "net/can/Kconfig" +source "net/irda/Kconfig" +source "net/bluetooth/Kconfig" +source "net/rxrpc/Kconfig" + +config FIB_RULES + bool + +menuconfig WIRELESS + bool "Wireless" + depends on !S390 + default y + +if WIRELESS + +source "net/wireless/Kconfig" +source "net/mac80211/Kconfig" + +endif # WIRELESS + +source "net/wimax/Kconfig" + +source "net/rfkill/Kconfig" +source "net/9p/Kconfig" +source "net/caif/Kconfig" +source "net/ceph/Kconfig" + + +endif # if NET diff --git a/net/Makefile b/net/Makefile new file mode 100644 index 00000000..54808aba --- /dev/null +++ b/net/Makefile @@ -0,0 +1,71 @@ +# +# Makefile for the linux networking. +# +# 2 Sep 2000, Christoph Hellwig +# Rewritten to use lists instead of if-statements. +# + +obj-y := nonet.o + +obj-$(CONFIG_NET) := socket.o core/ + +tmp-$(CONFIG_COMPAT) := compat.o +obj-$(CONFIG_NET) += $(tmp-y) + +# LLC has to be linked before the files in net/802/ +obj-$(CONFIG_LLC) += llc/ +obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ +obj-$(CONFIG_NETFILTER) += netfilter/ +obj-$(CONFIG_INET) += ipv4/ +obj-$(CONFIG_XFRM) += xfrm/ +obj-$(CONFIG_UNIX) += unix/ +obj-$(CONFIG_NET) += ipv6/ +obj-$(CONFIG_PACKET) += packet/ +obj-$(CONFIG_NET_KEY) += key/ +obj-$(CONFIG_BRIDGE) += bridge/ +obj-$(CONFIG_NET_DSA) += dsa/ +obj-$(CONFIG_IPX) += ipx/ +obj-$(CONFIG_ATALK) += appletalk/ +obj-$(CONFIG_WAN_ROUTER) += wanrouter/ +obj-$(CONFIG_X25) += x25/ +obj-$(CONFIG_LAPB) += lapb/ +obj-$(CONFIG_NETROM) += netrom/ +obj-$(CONFIG_ROSE) += rose/ +obj-$(CONFIG_AX25) += ax25/ +obj-$(CONFIG_CAN) += can/ +obj-$(CONFIG_IRDA) += irda/ +obj-$(CONFIG_BT) += bluetooth/ +obj-$(CONFIG_SUNRPC) += sunrpc/ +obj-$(CONFIG_AF_RXRPC) += rxrpc/ +obj-$(CONFIG_ATM) += atm/ +obj-$(CONFIG_L2TP) += l2tp/ +obj-$(CONFIG_DECNET) += decnet/ +obj-$(CONFIG_ECONET) += econet/ +obj-$(CONFIG_PHONET) += phonet/ +ifneq ($(CONFIG_VLAN_8021Q),) +obj-y += 8021q/ +endif +obj-$(CONFIG_IP_DCCP) += dccp/ +obj-$(CONFIG_IP_SCTP) += sctp/ +obj-$(CONFIG_RDS) += rds/ +obj-$(CONFIG_WIRELESS) += wireless/ +obj-$(CONFIG_MAC80211) += mac80211/ +obj-$(CONFIG_TIPC) += tipc/ +obj-$(CONFIG_NETLABEL) += netlabel/ +obj-$(CONFIG_IUCV) += iucv/ +obj-$(CONFIG_RFKILL) += rfkill/ +obj-$(CONFIG_NET_9P) += 9p/ +obj-$(CONFIG_CAIF) += caif/ +ifneq ($(CONFIG_DCB),) +obj-y += dcb/ +endif +obj-$(CONFIG_IEEE802154) += ieee802154/ + +ifeq ($(CONFIG_NET),y) +obj-$(CONFIG_SYSCTL) += sysctl_net.o +endif +obj-$(CONFIG_WIMAX) += wimax/ +obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/ +obj-$(CONFIG_CEPH_LIB) += ceph/ +obj-$(CONFIG_BATMAN_ADV) += batman-adv/ +obj-$(CONFIG_NET_ACTIVITY_STATS) += activity_stats.o diff --git a/net/TUNABLE b/net/TUNABLE new file mode 100644 index 00000000..9913211f --- /dev/null +++ b/net/TUNABLE @@ -0,0 +1,50 @@ +The following parameters should be tunable at compile time. Some of them +exist as sysctls too. + +This is far from complete + +Item Description +---------------------------------------------------------------------------- +MAX_LINKS Maximum number of netlink minor devices. (1-32) +RIF_TABLE_SIZE Token ring RIF cache size (tunable) +AARP_HASH_SIZE Size of Appletalk hash table (tunable) +AX25_DEF_T1 AX.25 parameters. 
These are all tunable via +AX25_DEF_T2 SIOCAX25SETPARMS +AX25_DEF_T3 T1-T3,N2 have the meanings in the specification +AX25_DEF_N2 +AX25_DEF_AXDEFMODE 8 = normal 128 is PE1CHL extended +AX25_DEF_IPDEFMODE 'D' - datagram 'V' - virtual connection +AX25_DEF_BACKOFF 'E'xponential 'L'inear +AX25_DEF_NETROM Allow netrom 1=Y +AX25_DF_TEXT Allow PID=Text 1=Y +AX25_DEF_WINDOW Window for normal mode +AX25_DEF_EWINDOW Window for PE1CHL mode +AX25_DEF_DIGI 1 for inband 2 for cross band 3 for both +AX25_DEF_CONMODE Allow connected modes 1=Yes +AX25_ROUTE_MAX AX.25 route cache size - no currently tunable +Unnamed (16) Number of protocol hash slots (tunable) +DEV_NUMBUFFS Number of priority levels (not easily tunable) +Unnamed (300) Maximum packet backlog queue (tunable) +MAX_IOVEC Maximum number of iovecs in a message (tunable) +MIN_WINDOW Offered minimum window (tunable) +MAX_WINDOW Offered maximum window (tunable) +MAX_HEADER Largest physical header (tunable) +MAX_ADDR_LEN Largest physical address (tunable) +SOCK_ARRAY_SIZE IP socket array hash size (tunable) +IP_MAX_MEMBERSHIPS Largest number of groups per socket (BSD style) (tunable) +16 Hard coded constant for amount of room allowed for + cache align and faster forwarding (tunable) +IP_FRAG_TIME Time we hold a fragment for. (tunable) +PORT_MASQ_BEGIN First port reserved for masquerade (tunable) +PORT_MASQ_END Last port used for masquerade (tunable) +MASQUERADE_EXPIRE_TCP_FIN Time we keep a masquerade for after a FIN +MASQUERADE_EXPIRE_UDP Time we keep a UDP masquerade for (tunable) +MAXVIFS Maximum mrouted vifs (1-32) +MFC_LINES Lines in the multicast router cache (tunable) + +NetROM parameters are tunable via an ioctl passing a struct + +4000 Size a Unix domain socket malloc falls back to + (tunable) should be 8K - a bit for 8K machines like + the ALPHA + diff --git a/net/activity_stats.c b/net/activity_stats.c new file mode 100644 index 00000000..8a3e9347 --- /dev/null +++ b/net/activity_stats.c @@ -0,0 +1,115 @@ +/* net/activity_stats.c + * + * Copyright (C) 2010 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Author: Mike Chan (mike@android.com) + */ + +#include +#include +#include + +/* + * Track transmission rates in buckets (power of 2). + * 1,2,4,8...512 seconds. + * + * Buckets represent the count of network transmissions at least + * N seconds apart, where N is 1 << bucket index. + */ +#define BUCKET_MAX 10 + +/* Track network activity frequency */ +static unsigned long activity_stats[BUCKET_MAX]; +static ktime_t last_transmit; +static ktime_t suspend_time; +static DEFINE_SPINLOCK(activity_lock); + +void activity_stats_update(void) +{ + int i; + unsigned long flags; + ktime_t now; + s64 delta; + + spin_lock_irqsave(&activity_lock, flags); + now = ktime_get(); + delta = ktime_to_ns(ktime_sub(now, last_transmit)); + + for (i = BUCKET_MAX - 1; i >= 0; i--) { + /* + * Check if the time delta between network activity is within the + * minimum bucket range. 
+ */ + if (delta < (1000000000ULL << i)) + continue; + + activity_stats[i]++; + last_transmit = now; + break; + } + spin_unlock_irqrestore(&activity_lock, flags); +} + +static int activity_stats_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int i; + int len; + char *p = page; + + /* Only print if offset is 0, or we have enough buffer space */ + if (off || count < (30 * BUCKET_MAX + 22)) + return -ENOMEM; + + len = snprintf(p, count, "Min Bucket(sec) Count\n"); + count -= len; + p += len; + + for (i = 0; i < BUCKET_MAX; i++) { + len = snprintf(p, count, "%15d %lu\n", 1 << i, activity_stats[i]); + count -= len; + p += len; + } + *eof = 1; + + return p - page; +} + +static int activity_stats_notifier(struct notifier_block *nb, + unsigned long event, void *dummy) +{ + switch (event) { + case PM_SUSPEND_PREPARE: + suspend_time = ktime_get_real(); + break; + + case PM_POST_SUSPEND: + suspend_time = ktime_sub(ktime_get_real(), suspend_time); + last_transmit = ktime_sub(last_transmit, suspend_time); + } + + return 0; +} + +static struct notifier_block activity_stats_notifier_block = { + .notifier_call = activity_stats_notifier, +}; + +static int __init activity_stats_init(void) +{ + create_proc_read_entry("activity", S_IRUGO, + init_net.proc_net_stat, activity_stats_read_proc, NULL); + return register_pm_notifier(&activity_stats_notifier_block); +} + +subsys_initcall(activity_stats_init); + diff --git a/net/appletalk/Makefile b/net/appletalk/Makefile new file mode 100644 index 00000000..5cda56ed --- /dev/null +++ b/net/appletalk/Makefile @@ -0,0 +1,9 @@ +# +# Makefile for the Linux AppleTalk layer. +# + +obj-$(CONFIG_ATALK) += appletalk.o + +appletalk-y := aarp.o ddp.o dev.o +appletalk-$(CONFIG_PROC_FS) += atalk_proc.o +appletalk-$(CONFIG_SYSCTL) += sysctl_net_atalk.o diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c new file mode 100644 index 00000000..50dce798 --- /dev/null +++ b/net/appletalk/aarp.c @@ -0,0 +1,1063 @@ +/* + * AARP: An implementation of the AppleTalk AARP protocol for + * Ethernet 'ELAP'. + * + * Alan Cox + * + * This doesn't fit cleanly with the IP arp. Potentially we can use + * the generic neighbour discovery code to clean this up. + * + * FIXME: + * We ought to handle the retransmits with a single list and a + * separate fast timer for when it is needed. + * Use neighbour discovery code. + * Token Ring Support. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * + * References: + * Inside AppleTalk (2nd Ed). + * Fixes: + * Jaume Grau - flush caches on AARP_PROBE + * Rob Newberry - Added proxy AARP and AARP proc fs, + * moved probing from DDP module. + * Arnaldo C. 
Melo - don't mangle rx packets + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int sysctl_aarp_expiry_time = AARP_EXPIRY_TIME; +int sysctl_aarp_tick_time = AARP_TICK_TIME; +int sysctl_aarp_retransmit_limit = AARP_RETRANSMIT_LIMIT; +int sysctl_aarp_resolve_time = AARP_RESOLVE_TIME; + +/* Lists of aarp entries */ +/** + * struct aarp_entry - AARP entry + * @last_sent - Last time we xmitted the aarp request + * @packet_queue - Queue of frames wait for resolution + * @status - Used for proxy AARP + * expires_at - Entry expiry time + * target_addr - DDP Address + * dev - Device to use + * hwaddr - Physical i/f address of target/router + * xmit_count - When this hits 10 we give up + * next - Next entry in chain + */ +struct aarp_entry { + /* These first two are only used for unresolved entries */ + unsigned long last_sent; + struct sk_buff_head packet_queue; + int status; + unsigned long expires_at; + struct atalk_addr target_addr; + struct net_device *dev; + char hwaddr[6]; + unsigned short xmit_count; + struct aarp_entry *next; +}; + +/* Hashed list of resolved, unresolved and proxy entries */ +static struct aarp_entry *resolved[AARP_HASH_SIZE]; +static struct aarp_entry *unresolved[AARP_HASH_SIZE]; +static struct aarp_entry *proxies[AARP_HASH_SIZE]; +static int unresolved_count; + +/* One lock protects it all. */ +static DEFINE_RWLOCK(aarp_lock); + +/* Used to walk the list and purge/kick entries. */ +static struct timer_list aarp_timer; + +/* + * Delete an aarp queue + * + * Must run under aarp_lock. + */ +static void __aarp_expire(struct aarp_entry *a) +{ + skb_queue_purge(&a->packet_queue); + kfree(a); +} + +/* + * Send an aarp queue entry request + * + * Must run under aarp_lock. + */ +static void __aarp_send_query(struct aarp_entry *a) +{ + static unsigned char aarp_eth_multicast[ETH_ALEN] = + { 0x09, 0x00, 0x07, 0xFF, 0xFF, 0xFF }; + struct net_device *dev = a->dev; + struct elapaarp *eah; + int len = dev->hard_header_len + sizeof(*eah) + aarp_dl->header_length; + struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC); + struct atalk_addr *sat = atalk_find_dev_addr(dev); + + if (!skb) + return; + + if (!sat) { + kfree_skb(skb); + return; + } + + /* Set up the buffer */ + skb_reserve(skb, dev->hard_header_len + aarp_dl->header_length); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb_put(skb, sizeof(*eah)); + skb->protocol = htons(ETH_P_ATALK); + skb->dev = dev; + eah = aarp_hdr(skb); + + /* Set up the ARP */ + eah->hw_type = htons(AARP_HW_TYPE_ETHERNET); + eah->pa_type = htons(ETH_P_ATALK); + eah->hw_len = ETH_ALEN; + eah->pa_len = AARP_PA_ALEN; + eah->function = htons(AARP_REQUEST); + + memcpy(eah->hw_src, dev->dev_addr, ETH_ALEN); + + eah->pa_src_zero = 0; + eah->pa_src_net = sat->s_net; + eah->pa_src_node = sat->s_node; + + memset(eah->hw_dst, '\0', ETH_ALEN); + + eah->pa_dst_zero = 0; + eah->pa_dst_net = a->target_addr.s_net; + eah->pa_dst_node = a->target_addr.s_node; + + /* Send it */ + aarp_dl->request(aarp_dl, skb, aarp_eth_multicast); + /* Update the sending count */ + a->xmit_count++; + a->last_sent = jiffies; +} + +/* This runs under aarp_lock and in softint context, so only atomic memory + * allocations can be used. 
*/ +static void aarp_send_reply(struct net_device *dev, struct atalk_addr *us, + struct atalk_addr *them, unsigned char *sha) +{ + struct elapaarp *eah; + int len = dev->hard_header_len + sizeof(*eah) + aarp_dl->header_length; + struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC); + + if (!skb) + return; + + /* Set up the buffer */ + skb_reserve(skb, dev->hard_header_len + aarp_dl->header_length); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb_put(skb, sizeof(*eah)); + skb->protocol = htons(ETH_P_ATALK); + skb->dev = dev; + eah = aarp_hdr(skb); + + /* Set up the ARP */ + eah->hw_type = htons(AARP_HW_TYPE_ETHERNET); + eah->pa_type = htons(ETH_P_ATALK); + eah->hw_len = ETH_ALEN; + eah->pa_len = AARP_PA_ALEN; + eah->function = htons(AARP_REPLY); + + memcpy(eah->hw_src, dev->dev_addr, ETH_ALEN); + + eah->pa_src_zero = 0; + eah->pa_src_net = us->s_net; + eah->pa_src_node = us->s_node; + + if (!sha) + memset(eah->hw_dst, '\0', ETH_ALEN); + else + memcpy(eah->hw_dst, sha, ETH_ALEN); + + eah->pa_dst_zero = 0; + eah->pa_dst_net = them->s_net; + eah->pa_dst_node = them->s_node; + + /* Send it */ + aarp_dl->request(aarp_dl, skb, sha); +} + +/* + * Send probe frames. Called from aarp_probe_network and + * aarp_proxy_probe_network. + */ + +static void aarp_send_probe(struct net_device *dev, struct atalk_addr *us) +{ + struct elapaarp *eah; + int len = dev->hard_header_len + sizeof(*eah) + aarp_dl->header_length; + struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC); + static unsigned char aarp_eth_multicast[ETH_ALEN] = + { 0x09, 0x00, 0x07, 0xFF, 0xFF, 0xFF }; + + if (!skb) + return; + + /* Set up the buffer */ + skb_reserve(skb, dev->hard_header_len + aarp_dl->header_length); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb_put(skb, sizeof(*eah)); + skb->protocol = htons(ETH_P_ATALK); + skb->dev = dev; + eah = aarp_hdr(skb); + + /* Set up the ARP */ + eah->hw_type = htons(AARP_HW_TYPE_ETHERNET); + eah->pa_type = htons(ETH_P_ATALK); + eah->hw_len = ETH_ALEN; + eah->pa_len = AARP_PA_ALEN; + eah->function = htons(AARP_PROBE); + + memcpy(eah->hw_src, dev->dev_addr, ETH_ALEN); + + eah->pa_src_zero = 0; + eah->pa_src_net = us->s_net; + eah->pa_src_node = us->s_node; + + memset(eah->hw_dst, '\0', ETH_ALEN); + + eah->pa_dst_zero = 0; + eah->pa_dst_net = us->s_net; + eah->pa_dst_node = us->s_node; + + /* Send it */ + aarp_dl->request(aarp_dl, skb, aarp_eth_multicast); +} + +/* + * Handle an aarp timer expire + * + * Must run under the aarp_lock. + */ + +static void __aarp_expire_timer(struct aarp_entry **n) +{ + struct aarp_entry *t; + + while (*n) + /* Expired ? */ + if (time_after(jiffies, (*n)->expires_at)) { + t = *n; + *n = (*n)->next; + __aarp_expire(t); + } else + n = &((*n)->next); +} + +/* + * Kick all pending requests 5 times a second. + * + * Must run under the aarp_lock. + */ +static void __aarp_kick(struct aarp_entry **n) +{ + struct aarp_entry *t; + + while (*n) + /* Expired: if this will be the 11th tx, we delete instead. */ + if ((*n)->xmit_count >= sysctl_aarp_retransmit_limit) { + t = *n; + *n = (*n)->next; + __aarp_expire(t); + } else { + __aarp_send_query(*n); + n = &((*n)->next); + } +} + +/* + * A device has gone down. Take all entries referring to the device + * and remove them. + * + * Must run under the aarp_lock. 
+ */ +static void __aarp_expire_device(struct aarp_entry **n, struct net_device *dev) +{ + struct aarp_entry *t; + + while (*n) + if ((*n)->dev == dev) { + t = *n; + *n = (*n)->next; + __aarp_expire(t); + } else + n = &((*n)->next); +} + +/* Handle the timer event */ +static void aarp_expire_timeout(unsigned long unused) +{ + int ct; + + write_lock_bh(&aarp_lock); + + for (ct = 0; ct < AARP_HASH_SIZE; ct++) { + __aarp_expire_timer(&resolved[ct]); + __aarp_kick(&unresolved[ct]); + __aarp_expire_timer(&unresolved[ct]); + __aarp_expire_timer(&proxies[ct]); + } + + write_unlock_bh(&aarp_lock); + mod_timer(&aarp_timer, jiffies + + (unresolved_count ? sysctl_aarp_tick_time : + sysctl_aarp_expiry_time)); +} + +/* Network device notifier chain handler. */ +static int aarp_device_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + int ct; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + if (event == NETDEV_DOWN) { + write_lock_bh(&aarp_lock); + + for (ct = 0; ct < AARP_HASH_SIZE; ct++) { + __aarp_expire_device(&resolved[ct], dev); + __aarp_expire_device(&unresolved[ct], dev); + __aarp_expire_device(&proxies[ct], dev); + } + + write_unlock_bh(&aarp_lock); + } + return NOTIFY_DONE; +} + +/* Expire all entries in a hash chain */ +static void __aarp_expire_all(struct aarp_entry **n) +{ + struct aarp_entry *t; + + while (*n) { + t = *n; + *n = (*n)->next; + __aarp_expire(t); + } +} + +/* Cleanup all hash chains -- module unloading */ +static void aarp_purge(void) +{ + int ct; + + write_lock_bh(&aarp_lock); + for (ct = 0; ct < AARP_HASH_SIZE; ct++) { + __aarp_expire_all(&resolved[ct]); + __aarp_expire_all(&unresolved[ct]); + __aarp_expire_all(&proxies[ct]); + } + write_unlock_bh(&aarp_lock); +} + +/* + * Create a new aarp entry. This must use GFP_ATOMIC because it + * runs while holding spinlocks. + */ +static struct aarp_entry *aarp_alloc(void) +{ + struct aarp_entry *a = kmalloc(sizeof(*a), GFP_ATOMIC); + + if (a) + skb_queue_head_init(&a->packet_queue); + return a; +} + +/* + * Find an entry. We might return an expired but not yet purged entry. We + * don't care as it will do no harm. + * + * This must run under the aarp_lock. + */ +static struct aarp_entry *__aarp_find_entry(struct aarp_entry *list, + struct net_device *dev, + struct atalk_addr *sat) +{ + while (list) { + if (list->target_addr.s_net == sat->s_net && + list->target_addr.s_node == sat->s_node && + list->dev == dev) + break; + list = list->next; + } + + return list; +} + +/* Called from the DDP code, and thus must be exported. */ +void aarp_proxy_remove(struct net_device *dev, struct atalk_addr *sa) +{ + int hash = sa->s_node % (AARP_HASH_SIZE - 1); + struct aarp_entry *a; + + write_lock_bh(&aarp_lock); + + a = __aarp_find_entry(proxies[hash], dev, sa); + if (a) + a->expires_at = jiffies - 1; + + write_unlock_bh(&aarp_lock); +} + +/* This must run under aarp_lock. */ +static struct atalk_addr *__aarp_proxy_find(struct net_device *dev, + struct atalk_addr *sa) +{ + int hash = sa->s_node % (AARP_HASH_SIZE - 1); + struct aarp_entry *a = __aarp_find_entry(proxies[hash], dev, sa); + + return a ? sa : NULL; +} + +/* + * Probe a Phase 1 device or a device that requires its Net:Node to + * be set via an ioctl. 
+ */ +static void aarp_send_probe_phase1(struct atalk_iface *iface) +{ + struct ifreq atreq; + struct sockaddr_at *sa = (struct sockaddr_at *)&atreq.ifr_addr; + const struct net_device_ops *ops = iface->dev->netdev_ops; + + sa->sat_addr.s_node = iface->address.s_node; + sa->sat_addr.s_net = ntohs(iface->address.s_net); + + /* We pass the Net:Node to the drivers/cards by a Device ioctl. */ + if (!(ops->ndo_do_ioctl(iface->dev, &atreq, SIOCSIFADDR))) { + ops->ndo_do_ioctl(iface->dev, &atreq, SIOCGIFADDR); + if (iface->address.s_net != htons(sa->sat_addr.s_net) || + iface->address.s_node != sa->sat_addr.s_node) + iface->status |= ATIF_PROBE_FAIL; + + iface->address.s_net = htons(sa->sat_addr.s_net); + iface->address.s_node = sa->sat_addr.s_node; + } +} + + +void aarp_probe_network(struct atalk_iface *atif) +{ + if (atif->dev->type == ARPHRD_LOCALTLK || + atif->dev->type == ARPHRD_PPP) + aarp_send_probe_phase1(atif); + else { + unsigned int count; + + for (count = 0; count < AARP_RETRANSMIT_LIMIT; count++) { + aarp_send_probe(atif->dev, &atif->address); + + /* Defer 1/10th */ + msleep(100); + + if (atif->status & ATIF_PROBE_FAIL) + break; + } + } +} + +int aarp_proxy_probe_network(struct atalk_iface *atif, struct atalk_addr *sa) +{ + int hash, retval = -EPROTONOSUPPORT; + struct aarp_entry *entry; + unsigned int count; + + /* + * we don't currently support LocalTalk or PPP for proxy AARP; + * if someone wants to try and add it, have fun + */ + if (atif->dev->type == ARPHRD_LOCALTLK || + atif->dev->type == ARPHRD_PPP) + goto out; + + /* + * create a new AARP entry with the flags set to be published -- + * we need this one to hang around even if it's in use + */ + entry = aarp_alloc(); + retval = -ENOMEM; + if (!entry) + goto out; + + entry->expires_at = -1; + entry->status = ATIF_PROBE; + entry->target_addr.s_node = sa->s_node; + entry->target_addr.s_net = sa->s_net; + entry->dev = atif->dev; + + write_lock_bh(&aarp_lock); + + hash = sa->s_node % (AARP_HASH_SIZE - 1); + entry->next = proxies[hash]; + proxies[hash] = entry; + + for (count = 0; count < AARP_RETRANSMIT_LIMIT; count++) { + aarp_send_probe(atif->dev, sa); + + /* Defer 1/10th */ + write_unlock_bh(&aarp_lock); + msleep(100); + write_lock_bh(&aarp_lock); + + if (entry->status & ATIF_PROBE_FAIL) + break; + } + + if (entry->status & ATIF_PROBE_FAIL) { + entry->expires_at = jiffies - 1; /* free the entry */ + retval = -EADDRINUSE; /* return network full */ + } else { /* clear the probing flag */ + entry->status &= ~ATIF_PROBE; + retval = 1; + } + + write_unlock_bh(&aarp_lock); +out: + return retval; +} + +/* Send a DDP frame */ +int aarp_send_ddp(struct net_device *dev, struct sk_buff *skb, + struct atalk_addr *sa, void *hwaddr) +{ + static char ddp_eth_multicast[ETH_ALEN] = + { 0x09, 0x00, 0x07, 0xFF, 0xFF, 0xFF }; + int hash; + struct aarp_entry *a; + + skb_reset_network_header(skb); + + /* Check for LocalTalk first */ + if (dev->type == ARPHRD_LOCALTLK) { + struct atalk_addr *at = atalk_find_dev_addr(dev); + struct ddpehdr *ddp = (struct ddpehdr *)skb->data; + int ft = 2; + + /* + * Compressible ? + * + * IFF: src_net == dest_net == device_net + * (zero matches anything) + */ + + if ((!ddp->deh_snet || at->s_net == ddp->deh_snet) && + (!ddp->deh_dnet || at->s_net == ddp->deh_dnet)) { + skb_pull(skb, sizeof(*ddp) - 4); + + /* + * The upper two remaining bytes are the port + * numbers we just happen to need. Now put the + * length in the lower two. + */ + *((__be16 *)skb->data) = htons(skb->len); + ft = 1; + } + /* + * Nice and easy. 
No AARP type protocols occur here so we can + * just shovel it out with a 3 byte LLAP header + */ + + skb_push(skb, 3); + skb->data[0] = sa->s_node; + skb->data[1] = at->s_node; + skb->data[2] = ft; + skb->dev = dev; + goto sendit; + } + + /* On a PPP link we neither compress nor aarp. */ + if (dev->type == ARPHRD_PPP) { + skb->protocol = htons(ETH_P_PPPTALK); + skb->dev = dev; + goto sendit; + } + + /* Non ELAP we cannot do. */ + if (dev->type != ARPHRD_ETHER) + goto free_it; + + skb->dev = dev; + skb->protocol = htons(ETH_P_ATALK); + hash = sa->s_node % (AARP_HASH_SIZE - 1); + + /* Do we have a resolved entry? */ + if (sa->s_node == ATADDR_BCAST) { + /* Send it */ + ddp_dl->request(ddp_dl, skb, ddp_eth_multicast); + goto sent; + } + + write_lock_bh(&aarp_lock); + a = __aarp_find_entry(resolved[hash], dev, sa); + + if (a) { /* Return 1 and fill in the address */ + a->expires_at = jiffies + (sysctl_aarp_expiry_time * 10); + ddp_dl->request(ddp_dl, skb, a->hwaddr); + write_unlock_bh(&aarp_lock); + goto sent; + } + + /* Do we have an unresolved entry: This is the less common path */ + a = __aarp_find_entry(unresolved[hash], dev, sa); + if (a) { /* Queue onto the unresolved queue */ + skb_queue_tail(&a->packet_queue, skb); + goto out_unlock; + } + + /* Allocate a new entry */ + a = aarp_alloc(); + if (!a) { + /* Whoops slipped... good job it's an unreliable protocol 8) */ + write_unlock_bh(&aarp_lock); + goto free_it; + } + + /* Set up the queue */ + skb_queue_tail(&a->packet_queue, skb); + a->expires_at = jiffies + sysctl_aarp_resolve_time; + a->dev = dev; + a->next = unresolved[hash]; + a->target_addr = *sa; + a->xmit_count = 0; + unresolved[hash] = a; + unresolved_count++; + + /* Send an initial request for the address */ + __aarp_send_query(a); + + /* + * Switch to fast timer if needed (That is if this is the first + * unresolved entry to get added) + */ + + if (unresolved_count == 1) + mod_timer(&aarp_timer, jiffies + sysctl_aarp_tick_time); + + /* Now finally, it is safe to drop the lock. */ +out_unlock: + write_unlock_bh(&aarp_lock); + + /* Tell the ddp layer we have taken over for this frame. */ + goto sent; + +sendit: + if (skb->sk) + skb->priority = skb->sk->sk_priority; + if (dev_queue_xmit(skb)) + goto drop; +sent: + return NET_XMIT_SUCCESS; +free_it: + kfree_skb(skb); +drop: + return NET_XMIT_DROP; +} +EXPORT_SYMBOL(aarp_send_ddp); + +/* + * An entry in the aarp unresolved queue has become resolved. Send + * all the frames queued under it. + * + * Must run under aarp_lock. + */ +static void __aarp_resolved(struct aarp_entry **list, struct aarp_entry *a, + int hash) +{ + struct sk_buff *skb; + + while (*list) + if (*list == a) { + unresolved_count--; + *list = a->next; + + /* Move into the resolved list */ + a->next = resolved[hash]; + resolved[hash] = a; + + /* Kick frames off */ + while ((skb = skb_dequeue(&a->packet_queue)) != NULL) { + a->expires_at = jiffies + + sysctl_aarp_expiry_time * 10; + ddp_dl->request(ddp_dl, skb, a->hwaddr); + } + } else + list = &((*list)->next); +} + +/* + * This is called by the SNAP driver whenever we see an AARP SNAP + * frame. We currently only support Ethernet. + */ +static int aarp_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct elapaarp *ea = aarp_hdr(skb); + int hash, ret = 0; + __u16 function; + struct aarp_entry *a; + struct atalk_addr sa, *ma, da; + struct atalk_iface *ifa; + + if (!net_eq(dev_net(dev), &init_net)) + goto out0; + + /* We only do Ethernet SNAP AARP. 
*/ + if (dev->type != ARPHRD_ETHER) + goto out0; + + /* Frame size ok? */ + if (!skb_pull(skb, sizeof(*ea))) + goto out0; + + function = ntohs(ea->function); + + /* Sanity check fields. */ + if (function < AARP_REQUEST || function > AARP_PROBE || + ea->hw_len != ETH_ALEN || ea->pa_len != AARP_PA_ALEN || + ea->pa_src_zero || ea->pa_dst_zero) + goto out0; + + /* Looks good. */ + hash = ea->pa_src_node % (AARP_HASH_SIZE - 1); + + /* Build an address. */ + sa.s_node = ea->pa_src_node; + sa.s_net = ea->pa_src_net; + + /* Process the packet. Check for replies of me. */ + ifa = atalk_find_dev(dev); + if (!ifa) + goto out1; + + if (ifa->status & ATIF_PROBE && + ifa->address.s_node == ea->pa_dst_node && + ifa->address.s_net == ea->pa_dst_net) { + ifa->status |= ATIF_PROBE_FAIL; /* Fail the probe (in use) */ + goto out1; + } + + /* Check for replies of proxy AARP entries */ + da.s_node = ea->pa_dst_node; + da.s_net = ea->pa_dst_net; + + write_lock_bh(&aarp_lock); + a = __aarp_find_entry(proxies[hash], dev, &da); + + if (a && a->status & ATIF_PROBE) { + a->status |= ATIF_PROBE_FAIL; + /* + * we do not respond to probe or request packets for + * this address while we are probing this address + */ + goto unlock; + } + + switch (function) { + case AARP_REPLY: + if (!unresolved_count) /* Speed up */ + break; + + /* Find the entry. */ + a = __aarp_find_entry(unresolved[hash], dev, &sa); + if (!a || dev != a->dev) + break; + + /* We can fill one in - this is good. */ + memcpy(a->hwaddr, ea->hw_src, ETH_ALEN); + __aarp_resolved(&unresolved[hash], a, hash); + if (!unresolved_count) + mod_timer(&aarp_timer, + jiffies + sysctl_aarp_expiry_time); + break; + + case AARP_REQUEST: + case AARP_PROBE: + + /* + * If it is my address set ma to my address and reply. + * We can treat probe and request the same. Probe + * simply means we shouldn't cache the querying host, + * as in a probe they are proposing an address not + * using one. + * + * Support for proxy-AARP added. We check if the + * address is one of our proxies before we toss the + * packet out. + */ + + sa.s_node = ea->pa_dst_node; + sa.s_net = ea->pa_dst_net; + + /* See if we have a matching proxy. */ + ma = __aarp_proxy_find(dev, &sa); + if (!ma) + ma = &ifa->address; + else { /* We need to make a copy of the entry. */ + da.s_node = sa.s_node; + da.s_net = sa.s_net; + ma = &da; + } + + if (function == AARP_PROBE) { + /* + * A probe implies someone trying to get an + * address. So as a precaution flush any + * entries we have for this address. + */ + a = __aarp_find_entry(resolved[sa.s_node % + (AARP_HASH_SIZE - 1)], + skb->dev, &sa); + + /* + * Make it expire next tick - that avoids us + * getting into a probe/flush/learn/probe/ + * flush/learn cycle during probing of a slow + * to respond host addr. + */ + if (a) { + a->expires_at = jiffies - 1; + mod_timer(&aarp_timer, jiffies + + sysctl_aarp_tick_time); + } + } + + if (sa.s_node != ma->s_node) + break; + + if (sa.s_net && ma->s_net && sa.s_net != ma->s_net) + break; + + sa.s_node = ea->pa_src_node; + sa.s_net = ea->pa_src_net; + + /* aarp_my_address has found the address to use for us. 
+ */ + aarp_send_reply(dev, ma, &sa, ea->hw_src); + break; + } + +unlock: + write_unlock_bh(&aarp_lock); +out1: + ret = 1; +out0: + kfree_skb(skb); + return ret; +} + +static struct notifier_block aarp_notifier = { + .notifier_call = aarp_device_event, +}; + +static unsigned char aarp_snap_id[] = { 0x00, 0x00, 0x00, 0x80, 0xF3 }; + +void __init aarp_proto_init(void) +{ + aarp_dl = register_snap_client(aarp_snap_id, aarp_rcv); + if (!aarp_dl) + printk(KERN_CRIT "Unable to register AARP with SNAP.\n"); + setup_timer(&aarp_timer, aarp_expire_timeout, 0); + aarp_timer.expires = jiffies + sysctl_aarp_expiry_time; + add_timer(&aarp_timer); + register_netdevice_notifier(&aarp_notifier); +} + +/* Remove the AARP entries associated with a device. */ +void aarp_device_down(struct net_device *dev) +{ + int ct; + + write_lock_bh(&aarp_lock); + + for (ct = 0; ct < AARP_HASH_SIZE; ct++) { + __aarp_expire_device(&resolved[ct], dev); + __aarp_expire_device(&unresolved[ct], dev); + __aarp_expire_device(&proxies[ct], dev); + } + + write_unlock_bh(&aarp_lock); +} + +#ifdef CONFIG_PROC_FS +struct aarp_iter_state { + int bucket; + struct aarp_entry **table; +}; + +/* + * Get the aarp entry that is in the chain described + * by the iterator. + * If pos is set then skip till that index. + * pos = 1 is the first entry + */ +static struct aarp_entry *iter_next(struct aarp_iter_state *iter, loff_t *pos) +{ + int ct = iter->bucket; + struct aarp_entry **table = iter->table; + loff_t off = 0; + struct aarp_entry *entry; + + rescan: + while(ct < AARP_HASH_SIZE) { + for (entry = table[ct]; entry; entry = entry->next) { + if (!pos || ++off == *pos) { + iter->table = table; + iter->bucket = ct; + return entry; + } + } + ++ct; + } + + if (table == resolved) { + ct = 0; + table = unresolved; + goto rescan; + } + if (table == unresolved) { + ct = 0; + table = proxies; + goto rescan; + } + return NULL; +} + +static void *aarp_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(aarp_lock) +{ + struct aarp_iter_state *iter = seq->private; + + read_lock_bh(&aarp_lock); + iter->table = resolved; + iter->bucket = 0; + + return *pos ? iter_next(iter, pos) : SEQ_START_TOKEN; +} + +static void *aarp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct aarp_entry *entry = v; + struct aarp_iter_state *iter = seq->private; + + ++*pos; + + /* first line after header */ + if (v == SEQ_START_TOKEN) + entry = iter_next(iter, NULL); + + /* next entry in current bucket */ + else if (entry->next) + entry = entry->next; + + /* next bucket or table */ + else { + ++iter->bucket; + entry = iter_next(iter, NULL); + } + return entry; +} + +static void aarp_seq_stop(struct seq_file *seq, void *v) + __releases(aarp_lock) +{ + read_unlock_bh(&aarp_lock); +} + +static const char *dt2str(unsigned long ticks) +{ + static char buf[32]; + + sprintf(buf, "%ld.%02ld", ticks / HZ, ((ticks % HZ) * 100 ) / HZ); + + return buf; +} + +static int aarp_seq_show(struct seq_file *seq, void *v) +{ + struct aarp_iter_state *iter = seq->private; + struct aarp_entry *entry = v; + unsigned long now = jiffies; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "Address Interface Hardware Address" + " Expires LastSend Retry Status\n"); + else { + seq_printf(seq, "%04X:%02X %-12s", + ntohs(entry->target_addr.s_net), + (unsigned int) entry->target_addr.s_node, + entry->dev ? 
entry->dev->name : "????"); + seq_printf(seq, "%pM", entry->hwaddr); + seq_printf(seq, " %8s", + dt2str((long)entry->expires_at - (long)now)); + if (iter->table == unresolved) + seq_printf(seq, " %8s %6hu", + dt2str(now - entry->last_sent), + entry->xmit_count); + else + seq_puts(seq, " "); + seq_printf(seq, " %s\n", + (iter->table == resolved) ? "resolved" + : (iter->table == unresolved) ? "unresolved" + : (iter->table == proxies) ? "proxies" + : "unknown"); + } + return 0; +} + +static const struct seq_operations aarp_seq_ops = { + .start = aarp_seq_start, + .next = aarp_seq_next, + .stop = aarp_seq_stop, + .show = aarp_seq_show, +}; + +static int aarp_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_private(file, &aarp_seq_ops, + sizeof(struct aarp_iter_state)); +} + +const struct file_operations atalk_seq_arp_fops = { + .owner = THIS_MODULE, + .open = aarp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif + +/* General module cleanup. Called from cleanup_module() in ddp.c. */ +void aarp_cleanup_module(void) +{ + del_timer_sync(&aarp_timer); + unregister_netdevice_notifier(&aarp_notifier); + unregister_snap_client(aarp_dl); + aarp_purge(); +} diff --git a/net/appletalk/atalk_proc.c b/net/appletalk/atalk_proc.c new file mode 100644 index 00000000..6ef0e761 --- /dev/null +++ b/net/appletalk/atalk_proc.c @@ -0,0 +1,301 @@ +/* + * atalk_proc.c - proc support for Appletalk + * + * Copyright(c) Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation, version 2. + */ + +#include +#include +#include +#include +#include +#include + + +static __inline__ struct atalk_iface *atalk_get_interface_idx(loff_t pos) +{ + struct atalk_iface *i; + + for (i = atalk_interfaces; pos && i; i = i->next) + --pos; + + return i; +} + +static void *atalk_seq_interface_start(struct seq_file *seq, loff_t *pos) + __acquires(atalk_interfaces_lock) +{ + loff_t l = *pos; + + read_lock_bh(&atalk_interfaces_lock); + return l ? atalk_get_interface_idx(--l) : SEQ_START_TOKEN; +} + +static void *atalk_seq_interface_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct atalk_iface *i; + + ++*pos; + if (v == SEQ_START_TOKEN) { + i = NULL; + if (atalk_interfaces) + i = atalk_interfaces; + goto out; + } + i = v; + i = i->next; +out: + return i; +} + +static void atalk_seq_interface_stop(struct seq_file *seq, void *v) + __releases(atalk_interfaces_lock) +{ + read_unlock_bh(&atalk_interfaces_lock); +} + +static int atalk_seq_interface_show(struct seq_file *seq, void *v) +{ + struct atalk_iface *iface; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "Interface Address Networks " + "Status\n"); + goto out; + } + + iface = v; + seq_printf(seq, "%-16s %04X:%02X %04X-%04X %d\n", + iface->dev->name, ntohs(iface->address.s_net), + iface->address.s_node, ntohs(iface->nets.nr_firstnet), + ntohs(iface->nets.nr_lastnet), iface->status); +out: + return 0; +} + +static __inline__ struct atalk_route *atalk_get_route_idx(loff_t pos) +{ + struct atalk_route *r; + + for (r = atalk_routes; pos && r; r = r->next) + --pos; + + return r; +} + +static void *atalk_seq_route_start(struct seq_file *seq, loff_t *pos) + __acquires(atalk_routes_lock) +{ + loff_t l = *pos; + + read_lock_bh(&atalk_routes_lock); + return l ? 
atalk_get_route_idx(--l) : SEQ_START_TOKEN; +} + +static void *atalk_seq_route_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct atalk_route *r; + + ++*pos; + if (v == SEQ_START_TOKEN) { + r = NULL; + if (atalk_routes) + r = atalk_routes; + goto out; + } + r = v; + r = r->next; +out: + return r; +} + +static void atalk_seq_route_stop(struct seq_file *seq, void *v) + __releases(atalk_routes_lock) +{ + read_unlock_bh(&atalk_routes_lock); +} + +static int atalk_seq_route_show(struct seq_file *seq, void *v) +{ + struct atalk_route *rt; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "Target Router Flags Dev\n"); + goto out; + } + + if (atrtr_default.dev) { + rt = &atrtr_default; + seq_printf(seq, "Default %04X:%02X %-4d %s\n", + ntohs(rt->gateway.s_net), rt->gateway.s_node, + rt->flags, rt->dev->name); + } + + rt = v; + seq_printf(seq, "%04X:%02X %04X:%02X %-4d %s\n", + ntohs(rt->target.s_net), rt->target.s_node, + ntohs(rt->gateway.s_net), rt->gateway.s_node, + rt->flags, rt->dev->name); +out: + return 0; +} + +static void *atalk_seq_socket_start(struct seq_file *seq, loff_t *pos) + __acquires(atalk_sockets_lock) +{ + read_lock_bh(&atalk_sockets_lock); + return seq_hlist_start_head(&atalk_sockets, *pos); +} + +static void *atalk_seq_socket_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return seq_hlist_next(v, &atalk_sockets, pos); +} + +static void atalk_seq_socket_stop(struct seq_file *seq, void *v) + __releases(atalk_sockets_lock) +{ + read_unlock_bh(&atalk_sockets_lock); +} + +static int atalk_seq_socket_show(struct seq_file *seq, void *v) +{ + struct sock *s; + struct atalk_sock *at; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "Type Local_addr Remote_addr Tx_queue " + "Rx_queue St UID\n"); + goto out; + } + + s = sk_entry(v); + at = at_sk(s); + + seq_printf(seq, "%02X %04X:%02X:%02X %04X:%02X:%02X %08X:%08X " + "%02X %d\n", + s->sk_type, ntohs(at->src_net), at->src_node, at->src_port, + ntohs(at->dest_net), at->dest_node, at->dest_port, + sk_wmem_alloc_get(s), + sk_rmem_alloc_get(s), + s->sk_state, SOCK_INODE(s->sk_socket)->i_uid); +out: + return 0; +} + +static const struct seq_operations atalk_seq_interface_ops = { + .start = atalk_seq_interface_start, + .next = atalk_seq_interface_next, + .stop = atalk_seq_interface_stop, + .show = atalk_seq_interface_show, +}; + +static const struct seq_operations atalk_seq_route_ops = { + .start = atalk_seq_route_start, + .next = atalk_seq_route_next, + .stop = atalk_seq_route_stop, + .show = atalk_seq_route_show, +}; + +static const struct seq_operations atalk_seq_socket_ops = { + .start = atalk_seq_socket_start, + .next = atalk_seq_socket_next, + .stop = atalk_seq_socket_stop, + .show = atalk_seq_socket_show, +}; + +static int atalk_seq_interface_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &atalk_seq_interface_ops); +} + +static int atalk_seq_route_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &atalk_seq_route_ops); +} + +static int atalk_seq_socket_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &atalk_seq_socket_ops); +} + +static const struct file_operations atalk_seq_interface_fops = { + .owner = THIS_MODULE, + .open = atalk_seq_interface_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations atalk_seq_route_fops = { + .owner = THIS_MODULE, + .open = atalk_seq_route_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations 
atalk_seq_socket_fops = { + .owner = THIS_MODULE, + .open = atalk_seq_socket_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct proc_dir_entry *atalk_proc_dir; + +int __init atalk_proc_init(void) +{ + struct proc_dir_entry *p; + int rc = -ENOMEM; + + atalk_proc_dir = proc_mkdir("atalk", init_net.proc_net); + if (!atalk_proc_dir) + goto out; + + p = proc_create("interface", S_IRUGO, atalk_proc_dir, + &atalk_seq_interface_fops); + if (!p) + goto out_interface; + + p = proc_create("route", S_IRUGO, atalk_proc_dir, + &atalk_seq_route_fops); + if (!p) + goto out_route; + + p = proc_create("socket", S_IRUGO, atalk_proc_dir, + &atalk_seq_socket_fops); + if (!p) + goto out_socket; + + p = proc_create("arp", S_IRUGO, atalk_proc_dir, &atalk_seq_arp_fops); + if (!p) + goto out_arp; + + rc = 0; +out: + return rc; +out_arp: + remove_proc_entry("socket", atalk_proc_dir); +out_socket: + remove_proc_entry("route", atalk_proc_dir); +out_route: + remove_proc_entry("interface", atalk_proc_dir); +out_interface: + remove_proc_entry("atalk", init_net.proc_net); + goto out; +} + +void __exit atalk_proc_exit(void) +{ + remove_proc_entry("interface", atalk_proc_dir); + remove_proc_entry("route", atalk_proc_dir); + remove_proc_entry("socket", atalk_proc_dir); + remove_proc_entry("arp", atalk_proc_dir); + remove_proc_entry("atalk", init_net.proc_net); +} diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c new file mode 100644 index 00000000..956a5302 --- /dev/null +++ b/net/appletalk/ddp.c @@ -0,0 +1,1977 @@ +/* + * DDP: An implementation of the AppleTalk DDP protocol for + * Ethernet 'ELAP'. + * + * Alan Cox + * + * With more than a little assistance from + * + * Wesley Craig + * + * Fixes: + * Neil Horman : Added missing device ioctls + * Michael Callahan : Made routing work + * Wesley Craig : Fix probing to listen to a + * passed node id. + * Alan Cox : Added send/recvmsg support + * Alan Cox : Moved at. to protinfo in + * socket. + * Alan Cox : Added firewall hooks. + * Alan Cox : Supports new ARPHRD_LOOPBACK + * Christer Weinigel : Routing and /proc fixes. + * Bradford Johnson : LocalTalk. + * Tom Dyas : Module support. + * Alan Cox : Hooks for PPP (based on the + * LocalTalk hook). + * Alan Cox : Posix bits + * Alan Cox/Mike Freeman : Possible fix to NBP problems + * Bradford Johnson : IP-over-DDP (experimental) + * Jay Schulist : Moved IP-over-DDP to its own + * driver file. (ipddp.c & ipddp.h) + * Jay Schulist : Made work as module with + * AppleTalk drivers, cleaned it. + * Rob Newberry : Added proxy AARP and AARP + * procfs, moved probing to AARP + * module. + * Adrian Sun/ + * Michael Zuelsdorff : fix for net.0 packets. don't + * allow illegal ether/tokentalk + * port assignment. we lose a + * valid localtalk port as a + * result. + * Arnaldo C. de Melo : Cleanup, in preparation for + * shared skb support 8) + * Arnaldo C. de Melo : Move proc stuff to atalk_proc.c, + * use seq_file + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
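An aside on the proc interface registered just above: atalk_proc_init() ends up creating /proc/net/atalk/interface, /proc/net/atalk/route, /proc/net/atalk/socket and /proc/net/atalk/arp, so the tables can be inspected from user space with ordinary file I/O. A minimal reader, offered only as an illustrative sketch (the path is derived from the proc_mkdir()/proc_create() calls above):

#include <stdio.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/net/atalk/arp", "r");    /* AARP table, backed by atalk_seq_arp_fops */

        if (!f) {
                perror("/proc/net/atalk/arp");
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);    /* header line first, then one entry per line */
        fclose(f);
        return 0;
}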
+ * + */ + +#include +#include +#include +#include /* For TIOCOUTQ/INQ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "../core/kmap_skb.h" + +struct datalink_proto *ddp_dl, *aarp_dl; +static const struct proto_ops atalk_dgram_ops; + +/**************************************************************************\ +* * +* Handlers for the socket list. * +* * +\**************************************************************************/ + +HLIST_HEAD(atalk_sockets); +DEFINE_RWLOCK(atalk_sockets_lock); + +static inline void __atalk_insert_socket(struct sock *sk) +{ + sk_add_node(sk, &atalk_sockets); +} + +static inline void atalk_remove_socket(struct sock *sk) +{ + write_lock_bh(&atalk_sockets_lock); + sk_del_node_init(sk); + write_unlock_bh(&atalk_sockets_lock); +} + +static struct sock *atalk_search_socket(struct sockaddr_at *to, + struct atalk_iface *atif) +{ + struct sock *s; + struct hlist_node *node; + + read_lock_bh(&atalk_sockets_lock); + sk_for_each(s, node, &atalk_sockets) { + struct atalk_sock *at = at_sk(s); + + if (to->sat_port != at->src_port) + continue; + + if (to->sat_addr.s_net == ATADDR_ANYNET && + to->sat_addr.s_node == ATADDR_BCAST) + goto found; + + if (to->sat_addr.s_net == at->src_net && + (to->sat_addr.s_node == at->src_node || + to->sat_addr.s_node == ATADDR_BCAST || + to->sat_addr.s_node == ATADDR_ANYNODE)) + goto found; + + /* XXXX.0 -- we got a request for this router. make sure + * that the node is appropriately set. */ + if (to->sat_addr.s_node == ATADDR_ANYNODE && + to->sat_addr.s_net != ATADDR_ANYNET && + atif->address.s_node == at->src_node) { + to->sat_addr.s_node = atif->address.s_node; + goto found; + } + } + s = NULL; +found: + read_unlock_bh(&atalk_sockets_lock); + return s; +} + +/** + * atalk_find_or_insert_socket - Try to find a socket matching ADDR + * @sk - socket to insert in the list if it is not there already + * @sat - address to search for + * + * Try to find a socket matching ADDR in the socket list, if found then return + * it. If not, insert SK into the socket list. + * + * This entire operation must execute atomically. + */ +static struct sock *atalk_find_or_insert_socket(struct sock *sk, + struct sockaddr_at *sat) +{ + struct sock *s; + struct hlist_node *node; + struct atalk_sock *at; + + write_lock_bh(&atalk_sockets_lock); + sk_for_each(s, node, &atalk_sockets) { + at = at_sk(s); + + if (at->src_net == sat->sat_addr.s_net && + at->src_node == sat->sat_addr.s_node && + at->src_port == sat->sat_port) + goto found; + } + s = NULL; + __atalk_insert_socket(sk); /* Wheee, it's free, assign and insert. */ +found: + write_unlock_bh(&atalk_sockets_lock); + return s; +} + +static void atalk_destroy_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + + if (sk_has_allocations(sk)) { + sk->sk_timer.expires = jiffies + SOCK_DESTROY_TIME; + add_timer(&sk->sk_timer); + } else + sock_put(sk); +} + +static inline void atalk_destroy_socket(struct sock *sk) +{ + atalk_remove_socket(sk); + skb_queue_purge(&sk->sk_receive_queue); + + if (sk_has_allocations(sk)) { + setup_timer(&sk->sk_timer, atalk_destroy_timer, + (unsigned long)sk); + sk->sk_timer.expires = jiffies + SOCK_DESTROY_TIME; + add_timer(&sk->sk_timer); + } else + sock_put(sk); +} + +/**************************************************************************\ +* * +* Routing tables for the AppleTalk socket layer. 
* +* * +\**************************************************************************/ + +/* Anti-deadlock ordering is atalk_routes_lock --> iface_lock -DaveM */ +struct atalk_route *atalk_routes; +DEFINE_RWLOCK(atalk_routes_lock); + +struct atalk_iface *atalk_interfaces; +DEFINE_RWLOCK(atalk_interfaces_lock); + +/* For probing devices or in a routerless network */ +struct atalk_route atrtr_default; + +/* AppleTalk interface control */ +/* + * Drop a device. Doesn't drop any of its routes - that is the caller's + * problem. Called when we down the interface or delete the address. + */ +static void atif_drop_device(struct net_device *dev) +{ + struct atalk_iface **iface = &atalk_interfaces; + struct atalk_iface *tmp; + + write_lock_bh(&atalk_interfaces_lock); + while ((tmp = *iface) != NULL) { + if (tmp->dev == dev) { + *iface = tmp->next; + dev_put(dev); + kfree(tmp); + dev->atalk_ptr = NULL; + } else + iface = &tmp->next; + } + write_unlock_bh(&atalk_interfaces_lock); +} + +static struct atalk_iface *atif_add_device(struct net_device *dev, + struct atalk_addr *sa) +{ + struct atalk_iface *iface = kzalloc(sizeof(*iface), GFP_KERNEL); + + if (!iface) + goto out; + + dev_hold(dev); + iface->dev = dev; + dev->atalk_ptr = iface; + iface->address = *sa; + iface->status = 0; + + write_lock_bh(&atalk_interfaces_lock); + iface->next = atalk_interfaces; + atalk_interfaces = iface; + write_unlock_bh(&atalk_interfaces_lock); +out: + return iface; +} + +/* Perform phase 2 AARP probing on our tentative address */ +static int atif_probe_device(struct atalk_iface *atif) +{ + int netrange = ntohs(atif->nets.nr_lastnet) - + ntohs(atif->nets.nr_firstnet) + 1; + int probe_net = ntohs(atif->address.s_net); + int probe_node = atif->address.s_node; + int netct, nodect; + + /* Offset the network we start probing with */ + if (probe_net == ATADDR_ANYNET) { + probe_net = ntohs(atif->nets.nr_firstnet); + if (netrange) + probe_net += jiffies % netrange; + } + if (probe_node == ATADDR_ANYNODE) + probe_node = jiffies & 0xFF; + + /* Scan the networks */ + atif->status |= ATIF_PROBE; + for (netct = 0; netct <= netrange; netct++) { + /* Sweep the available nodes from a given start */ + atif->address.s_net = htons(probe_net); + for (nodect = 0; nodect < 256; nodect++) { + atif->address.s_node = (nodect + probe_node) & 0xFF; + if (atif->address.s_node > 0 && + atif->address.s_node < 254) { + /* Probe a proposed address */ + aarp_probe_network(atif); + + if (!(atif->status & ATIF_PROBE_FAIL)) { + atif->status &= ~ATIF_PROBE; + return 0; + } + } + atif->status &= ~ATIF_PROBE_FAIL; + } + probe_net++; + if (probe_net > ntohs(atif->nets.nr_lastnet)) + probe_net = ntohs(atif->nets.nr_firstnet); + } + atif->status &= ~ATIF_PROBE; + + return -EADDRINUSE; /* Network is full... 
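To make the probing strategy of atif_probe_device() above easier to follow, here is a stand-alone restatement of its inner node scan: start from a pseudo-random node, walk all 256 node numbers, and only ever propose nodes 1..253 (0 is "any node", 254 and 255 are reserved/broadcast). try_node() is a hypothetical stand-in for one AARP probe round; this is an illustrative sketch, not the kernel code.

#include <stdbool.h>

/* Pretend probe: in the real code this is aarp_probe_network() plus the
 * ATIF_PROBE_FAIL check.  Here node 37 happens to be free. */
static bool try_node(int node)
{
        return node == 37;
}

static int pick_node(unsigned int seed)
{
        int start = seed & 0xFF;        /* like "jiffies & 0xFF" above */
        int i;

        for (i = 0; i < 256; i++) {
                int node = (start + i) & 0xFF;

                if (node > 0 && node < 254 && try_node(node))
                        return node;
        }
        return -1;      /* every usable node answered: the network is full */
}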
*/ +} + + +/* Perform AARP probing for a proxy address */ +static int atif_proxy_probe_device(struct atalk_iface *atif, + struct atalk_addr* proxy_addr) +{ + int netrange = ntohs(atif->nets.nr_lastnet) - + ntohs(atif->nets.nr_firstnet) + 1; + /* we probe the interface's network */ + int probe_net = ntohs(atif->address.s_net); + int probe_node = ATADDR_ANYNODE; /* we'll take anything */ + int netct, nodect; + + /* Offset the network we start probing with */ + if (probe_net == ATADDR_ANYNET) { + probe_net = ntohs(atif->nets.nr_firstnet); + if (netrange) + probe_net += jiffies % netrange; + } + + if (probe_node == ATADDR_ANYNODE) + probe_node = jiffies & 0xFF; + + /* Scan the networks */ + for (netct = 0; netct <= netrange; netct++) { + /* Sweep the available nodes from a given start */ + proxy_addr->s_net = htons(probe_net); + for (nodect = 0; nodect < 256; nodect++) { + proxy_addr->s_node = (nodect + probe_node) & 0xFF; + if (proxy_addr->s_node > 0 && + proxy_addr->s_node < 254) { + /* Tell AARP to probe a proposed address */ + int ret = aarp_proxy_probe_network(atif, + proxy_addr); + + if (ret != -EADDRINUSE) + return ret; + } + } + probe_net++; + if (probe_net > ntohs(atif->nets.nr_lastnet)) + probe_net = ntohs(atif->nets.nr_firstnet); + } + + return -EADDRINUSE; /* Network is full... */ +} + + +struct atalk_addr *atalk_find_dev_addr(struct net_device *dev) +{ + struct atalk_iface *iface = dev->atalk_ptr; + return iface ? &iface->address : NULL; +} + +static struct atalk_addr *atalk_find_primary(void) +{ + struct atalk_iface *fiface = NULL; + struct atalk_addr *retval; + struct atalk_iface *iface; + + /* + * Return a point-to-point interface only if + * there is no non-ptp interface available. + */ + read_lock_bh(&atalk_interfaces_lock); + for (iface = atalk_interfaces; iface; iface = iface->next) { + if (!fiface && !(iface->dev->flags & IFF_LOOPBACK)) + fiface = iface; + if (!(iface->dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))) { + retval = &iface->address; + goto out; + } + } + + if (fiface) + retval = &fiface->address; + else if (atalk_interfaces) + retval = &atalk_interfaces->address; + else + retval = NULL; +out: + read_unlock_bh(&atalk_interfaces_lock); + return retval; +} + +/* + * Find a match for 'any network' - ie any of our interfaces with that + * node number will do just nicely. + */ +static struct atalk_iface *atalk_find_anynet(int node, struct net_device *dev) +{ + struct atalk_iface *iface = dev->atalk_ptr; + + if (!iface || iface->status & ATIF_PROBE) + goto out_err; + + if (node != ATADDR_BCAST && + iface->address.s_node != node && + node != ATADDR_ANYNODE) + goto out_err; +out: + return iface; +out_err: + iface = NULL; + goto out; +} + +/* Find a match for a specific network:node pair */ +static struct atalk_iface *atalk_find_interface(__be16 net, int node) +{ + struct atalk_iface *iface; + + read_lock_bh(&atalk_interfaces_lock); + for (iface = atalk_interfaces; iface; iface = iface->next) { + if ((node == ATADDR_BCAST || + node == ATADDR_ANYNODE || + iface->address.s_node == node) && + iface->address.s_net == net && + !(iface->status & ATIF_PROBE)) + break; + + /* XXXX.0 -- net.0 returns the iface associated with net */ + if (node == ATADDR_ANYNODE && net != ATADDR_ANYNET && + ntohs(iface->nets.nr_firstnet) <= ntohs(net) && + ntohs(net) <= ntohs(iface->nets.nr_lastnet)) + break; + } + read_unlock_bh(&atalk_interfaces_lock); + return iface; +} + + +/* + * Find a route for an AppleTalk packet. This ought to get cached in + * the socket (later on...). 
We know about host routes and the fact + * that a route must be direct to broadcast. + */ +static struct atalk_route *atrtr_find(struct atalk_addr *target) +{ + /* + * we must search through all routes unless we find a + * host route, because some host routes might overlap + * network routes + */ + struct atalk_route *net_route = NULL; + struct atalk_route *r; + + read_lock_bh(&atalk_routes_lock); + for (r = atalk_routes; r; r = r->next) { + if (!(r->flags & RTF_UP)) + continue; + + if (r->target.s_net == target->s_net) { + if (r->flags & RTF_HOST) { + /* + * if this host route is for the target, + * the we're done + */ + if (r->target.s_node == target->s_node) + goto out; + } else + /* + * this route will work if there isn't a + * direct host route, so cache it + */ + net_route = r; + } + } + + /* + * if we found a network route but not a direct host + * route, then return it + */ + if (net_route) + r = net_route; + else if (atrtr_default.dev) + r = &atrtr_default; + else /* No route can be found */ + r = NULL; +out: + read_unlock_bh(&atalk_routes_lock); + return r; +} + + +/* + * Given an AppleTalk network, find the device to use. This can be + * a simple lookup. + */ +struct net_device *atrtr_get_dev(struct atalk_addr *sa) +{ + struct atalk_route *atr = atrtr_find(sa); + return atr ? atr->dev : NULL; +} + +/* Set up a default router */ +static void atrtr_set_default(struct net_device *dev) +{ + atrtr_default.dev = dev; + atrtr_default.flags = RTF_UP; + atrtr_default.gateway.s_net = htons(0); + atrtr_default.gateway.s_node = 0; +} + +/* + * Add a router. Basically make sure it looks valid and stuff the + * entry in the list. While it uses netranges we always set them to one + * entry to work like netatalk. + */ +static int atrtr_create(struct rtentry *r, struct net_device *devhint) +{ + struct sockaddr_at *ta = (struct sockaddr_at *)&r->rt_dst; + struct sockaddr_at *ga = (struct sockaddr_at *)&r->rt_gateway; + struct atalk_route *rt; + struct atalk_iface *iface, *riface; + int retval = -EINVAL; + + /* + * Fixme: Raise/Lower a routing change semaphore for these + * operations. 
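The selection rule in atrtr_find() above can be restated outside the kernel: a host route to the exact target wins immediately, a matching network route is remembered as a fallback, and the default route is used only when nothing else matched. The sketch below is a hypothetical, array-based illustration of that policy, not the kernel's linked-list implementation.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct rt {
        uint16_t net;
        uint8_t  node;
        bool     host;  /* RTF_HOST: matches a single node only */
        bool     up;    /* RTF_UP */
};

static const struct rt *lookup(const struct rt *tbl, size_t n,
                               const struct rt *dflt,   /* may be NULL */
                               uint16_t net, uint8_t node)
{
        const struct rt *net_route = NULL;
        size_t i;

        for (i = 0; i < n; i++) {
                if (!tbl[i].up || tbl[i].net != net)
                        continue;
                if (tbl[i].host) {
                        if (tbl[i].node == node)
                                return &tbl[i];  /* exact host route: done */
                } else {
                        net_route = &tbl[i];     /* network route: keep as fallback */
                }
        }
        return net_route ? net_route : dflt;
}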
+ */ + + /* Validate the request */ + if (ta->sat_family != AF_APPLETALK || + (!devhint && ga->sat_family != AF_APPLETALK)) + goto out; + + /* Now walk the routing table and make our decisions */ + write_lock_bh(&atalk_routes_lock); + for (rt = atalk_routes; rt; rt = rt->next) { + if (r->rt_flags != rt->flags) + continue; + + if (ta->sat_addr.s_net == rt->target.s_net) { + if (!(rt->flags & RTF_HOST)) + break; + if (ta->sat_addr.s_node == rt->target.s_node) + break; + } + } + + if (!devhint) { + riface = NULL; + + read_lock_bh(&atalk_interfaces_lock); + for (iface = atalk_interfaces; iface; iface = iface->next) { + if (!riface && + ntohs(ga->sat_addr.s_net) >= + ntohs(iface->nets.nr_firstnet) && + ntohs(ga->sat_addr.s_net) <= + ntohs(iface->nets.nr_lastnet)) + riface = iface; + + if (ga->sat_addr.s_net == iface->address.s_net && + ga->sat_addr.s_node == iface->address.s_node) + riface = iface; + } + read_unlock_bh(&atalk_interfaces_lock); + + retval = -ENETUNREACH; + if (!riface) + goto out_unlock; + + devhint = riface->dev; + } + + if (!rt) { + rt = kzalloc(sizeof(*rt), GFP_ATOMIC); + + retval = -ENOBUFS; + if (!rt) + goto out_unlock; + + rt->next = atalk_routes; + atalk_routes = rt; + } + + /* Fill in the routing entry */ + rt->target = ta->sat_addr; + dev_hold(devhint); + rt->dev = devhint; + rt->flags = r->rt_flags; + rt->gateway = ga->sat_addr; + + retval = 0; +out_unlock: + write_unlock_bh(&atalk_routes_lock); +out: + return retval; +} + +/* Delete a route. Find it and discard it */ +static int atrtr_delete(struct atalk_addr * addr) +{ + struct atalk_route **r = &atalk_routes; + int retval = 0; + struct atalk_route *tmp; + + write_lock_bh(&atalk_routes_lock); + while ((tmp = *r) != NULL) { + if (tmp->target.s_net == addr->s_net && + (!(tmp->flags&RTF_GATEWAY) || + tmp->target.s_node == addr->s_node)) { + *r = tmp->next; + dev_put(tmp->dev); + kfree(tmp); + goto out; + } + r = &tmp->next; + } + retval = -ENOENT; +out: + write_unlock_bh(&atalk_routes_lock); + return retval; +} + +/* + * Called when a device is downed. Just throw away any routes + * via it. + */ +static void atrtr_device_down(struct net_device *dev) +{ + struct atalk_route **r = &atalk_routes; + struct atalk_route *tmp; + + write_lock_bh(&atalk_routes_lock); + while ((tmp = *r) != NULL) { + if (tmp->dev == dev) { + *r = tmp->next; + dev_put(dev); + kfree(tmp); + } else + r = &tmp->next; + } + write_unlock_bh(&atalk_routes_lock); + + if (atrtr_default.dev == dev) + atrtr_set_default(NULL); +} + +/* Actually down the interface */ +static inline void atalk_dev_down(struct net_device *dev) +{ + atrtr_device_down(dev); /* Remove all routes for the device */ + aarp_device_down(dev); /* Remove AARP entries for the device */ + atif_drop_device(dev); /* Remove the device */ +} + +/* + * A device event has occurred. Watch for devices going down and + * delete our use of them (iface and route). + */ +static int ddp_device_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + if (event == NETDEV_DOWN) + /* Discard any use of this */ + atalk_dev_down(dev); + + return NOTIFY_DONE; +} + +/* ioctl calls. 
Shouldn't even need touching */ +/* Device configuration ioctl calls */ +static int atif_ioctl(int cmd, void __user *arg) +{ + static char aarp_mcast[6] = { 0x09, 0x00, 0x00, 0xFF, 0xFF, 0xFF }; + struct ifreq atreq; + struct atalk_netrange *nr; + struct sockaddr_at *sa; + struct net_device *dev; + struct atalk_iface *atif; + int ct; + int limit; + struct rtentry rtdef; + int add_route; + + if (copy_from_user(&atreq, arg, sizeof(atreq))) + return -EFAULT; + + dev = __dev_get_by_name(&init_net, atreq.ifr_name); + if (!dev) + return -ENODEV; + + sa = (struct sockaddr_at *)&atreq.ifr_addr; + atif = atalk_find_dev(dev); + + switch (cmd) { + case SIOCSIFADDR: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (sa->sat_family != AF_APPLETALK) + return -EINVAL; + if (dev->type != ARPHRD_ETHER && + dev->type != ARPHRD_LOOPBACK && + dev->type != ARPHRD_LOCALTLK && + dev->type != ARPHRD_PPP) + return -EPROTONOSUPPORT; + + nr = (struct atalk_netrange *)&sa->sat_zero[0]; + add_route = 1; + + /* + * if this is a point-to-point iface, and we already + * have an iface for this AppleTalk address, then we + * should not add a route + */ + if ((dev->flags & IFF_POINTOPOINT) && + atalk_find_interface(sa->sat_addr.s_net, + sa->sat_addr.s_node)) { + printk(KERN_DEBUG "AppleTalk: point-to-point " + "interface added with " + "existing address\n"); + add_route = 0; + } + + /* + * Phase 1 is fine on LocalTalk but we don't do + * EtherTalk phase 1. Anyone wanting to add it go ahead. + */ + if (dev->type == ARPHRD_ETHER && nr->nr_phase != 2) + return -EPROTONOSUPPORT; + if (sa->sat_addr.s_node == ATADDR_BCAST || + sa->sat_addr.s_node == 254) + return -EINVAL; + if (atif) { + /* Already setting address */ + if (atif->status & ATIF_PROBE) + return -EBUSY; + + atif->address.s_net = sa->sat_addr.s_net; + atif->address.s_node = sa->sat_addr.s_node; + atrtr_device_down(dev); /* Flush old routes */ + } else { + atif = atif_add_device(dev, &sa->sat_addr); + if (!atif) + return -ENOMEM; + } + atif->nets = *nr; + + /* + * Check if the chosen address is used. If so we + * error and atalkd will try another. 
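For context, the SIOCSIFADDR path above is normally exercised by a user-space configurator such as atalkd. The sketch below shows roughly how such a tool could hand the kernel an interface address and net range; the device name "eth0" and the 0..0xFFFE "routerless" range are example values, and the whole snippet is an illustrative assumption about the caller, not part of this patch. It relies on struct sockaddr_at and struct atalk_netrange from the exported <linux/atalk.h> header, with the net range packed into sat_zero exactly as atif_ioctl() reads it.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/atalk.h>

int main(void)
{
        struct ifreq ifr;
        struct sockaddr_at *sat = (struct sockaddr_at *)&ifr.ifr_addr;
        struct atalk_netrange *nr = (struct atalk_netrange *)sat->sat_zero;
        int fd = socket(AF_APPLETALK, SOCK_DGRAM, 0);

        if (fd < 0) {
                perror("socket");
                return 1;
        }

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);

        sat->sat_family = AF_APPLETALK;
        sat->sat_addr.s_net = ATADDR_ANYNET;    /* let atif_probe_device() pick */
        sat->sat_addr.s_node = ATADDR_ANYNODE;
        nr->nr_phase = 2;                       /* EtherTalk is phase 2 only */
        nr->nr_firstnet = htons(0);
        nr->nr_lastnet = htons(0xFFFE);         /* "routerless initial state" above */

        if (ioctl(fd, SIOCSIFADDR, &ifr) < 0)   /* needs CAP_NET_ADMIN */
                perror("SIOCSIFADDR");

        close(fd);
        return 0;
}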
+ */ + + if (!(dev->flags & IFF_LOOPBACK) && + !(dev->flags & IFF_POINTOPOINT) && + atif_probe_device(atif) < 0) { + atif_drop_device(dev); + return -EADDRINUSE; + } + + /* Hey it worked - add the direct routes */ + sa = (struct sockaddr_at *)&rtdef.rt_gateway; + sa->sat_family = AF_APPLETALK; + sa->sat_addr.s_net = atif->address.s_net; + sa->sat_addr.s_node = atif->address.s_node; + sa = (struct sockaddr_at *)&rtdef.rt_dst; + rtdef.rt_flags = RTF_UP; + sa->sat_family = AF_APPLETALK; + sa->sat_addr.s_node = ATADDR_ANYNODE; + if (dev->flags & IFF_LOOPBACK || + dev->flags & IFF_POINTOPOINT) + rtdef.rt_flags |= RTF_HOST; + + /* Routerless initial state */ + if (nr->nr_firstnet == htons(0) && + nr->nr_lastnet == htons(0xFFFE)) { + sa->sat_addr.s_net = atif->address.s_net; + atrtr_create(&rtdef, dev); + atrtr_set_default(dev); + } else { + limit = ntohs(nr->nr_lastnet); + if (limit - ntohs(nr->nr_firstnet) > 4096) { + printk(KERN_WARNING "Too many routes/" + "iface.\n"); + return -EINVAL; + } + if (add_route) + for (ct = ntohs(nr->nr_firstnet); + ct <= limit; ct++) { + sa->sat_addr.s_net = htons(ct); + atrtr_create(&rtdef, dev); + } + } + dev_mc_add_global(dev, aarp_mcast); + return 0; + + case SIOCGIFADDR: + if (!atif) + return -EADDRNOTAVAIL; + + sa->sat_family = AF_APPLETALK; + sa->sat_addr = atif->address; + break; + + case SIOCGIFBRDADDR: + if (!atif) + return -EADDRNOTAVAIL; + + sa->sat_family = AF_APPLETALK; + sa->sat_addr.s_net = atif->address.s_net; + sa->sat_addr.s_node = ATADDR_BCAST; + break; + + case SIOCATALKDIFADDR: + case SIOCDIFADDR: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (sa->sat_family != AF_APPLETALK) + return -EINVAL; + atalk_dev_down(dev); + break; + + case SIOCSARP: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (sa->sat_family != AF_APPLETALK) + return -EINVAL; + /* + * for now, we only support proxy AARP on ELAP; + * we should be able to do it for LocalTalk, too. + */ + if (dev->type != ARPHRD_ETHER) + return -EPROTONOSUPPORT; + + /* + * atif points to the current interface on this network; + * we aren't concerned about its current status (at + * least for now), but it has all the settings about + * the network we're going to probe. Consequently, it + * must exist. + */ + if (!atif) + return -EADDRNOTAVAIL; + + nr = (struct atalk_netrange *)&(atif->nets); + /* + * Phase 1 is fine on Localtalk but we don't do + * Ethertalk phase 1. Anyone wanting to add it go ahead. + */ + if (dev->type == ARPHRD_ETHER && nr->nr_phase != 2) + return -EPROTONOSUPPORT; + + if (sa->sat_addr.s_node == ATADDR_BCAST || + sa->sat_addr.s_node == 254) + return -EINVAL; + + /* + * Check if the chosen address is used. If so we + * error and ATCP will try another. + */ + if (atif_proxy_probe_device(atif, &(sa->sat_addr)) < 0) + return -EADDRINUSE; + + /* + * We now have an address on the local network, and + * the AARP code will defend it for us until we take it + * down. We don't set up any routes right now, because + * ATCP will install them manually via SIOCADDRT. + */ + break; + + case SIOCDARP: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (sa->sat_family != AF_APPLETALK) + return -EINVAL; + if (!atif) + return -EADDRNOTAVAIL; + + /* give to aarp module to remove proxy entry */ + aarp_proxy_remove(atif->dev, &(sa->sat_addr)); + return 0; + } + + return copy_to_user(arg, &atreq, sizeof(atreq)) ? 
-EFAULT : 0; +} + +/* Routing ioctl() calls */ +static int atrtr_ioctl(unsigned int cmd, void __user *arg) +{ + struct rtentry rt; + + if (copy_from_user(&rt, arg, sizeof(rt))) + return -EFAULT; + + switch (cmd) { + case SIOCDELRT: + if (rt.rt_dst.sa_family != AF_APPLETALK) + return -EINVAL; + return atrtr_delete(&((struct sockaddr_at *) + &rt.rt_dst)->sat_addr); + + case SIOCADDRT: { + struct net_device *dev = NULL; + if (rt.rt_dev) { + char name[IFNAMSIZ]; + if (copy_from_user(name, rt.rt_dev, IFNAMSIZ-1)) + return -EFAULT; + name[IFNAMSIZ-1] = '\0'; + dev = __dev_get_by_name(&init_net, name); + if (!dev) + return -ENODEV; + } + return atrtr_create(&rt, dev); + } + } + return -EINVAL; +} + +/**************************************************************************\ +* * +* Handling for system calls applied via the various interfaces to an * +* AppleTalk socket object. * +* * +\**************************************************************************/ + +/* + * Checksum: This is 'optional'. It's quite likely also a good + * candidate for assembler hackery 8) + */ +static unsigned long atalk_sum_partial(const unsigned char *data, + int len, unsigned long sum) +{ + /* This ought to be unwrapped neatly. I'll trust gcc for now */ + while (len--) { + sum += *data++; + sum = rol16(sum, 1); + } + return sum; +} + +/* Checksum skb data -- similar to skb_checksum */ +static unsigned long atalk_sum_skb(const struct sk_buff *skb, int offset, + int len, unsigned long sum) +{ + int start = skb_headlen(skb); + struct sk_buff *frag_iter; + int i, copy; + + /* checksum stuff in header space */ + if ( (copy = start - offset) > 0) { + if (copy > len) + copy = len; + sum = atalk_sum_partial(skb->data + offset, copy, sum); + if ( (len -= copy) == 0) + return sum; + + offset += copy; + } + + /* checksum stuff in frags */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (copy > len) + copy = len; + vaddr = kmap_skb_frag(frag); + sum = atalk_sum_partial(vaddr + frag->page_offset + + offset - start, copy, sum); + kunmap_skb_frag(vaddr); + + if (!(len -= copy)) + return sum; + offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + sum = atalk_sum_skb(frag_iter, offset - start, + copy, sum); + if ((len -= copy) == 0) + return sum; + offset += copy; + } + start = end; + } + + BUG_ON(len > 0); + + return sum; +} + +static __be16 atalk_checksum(const struct sk_buff *skb, int len) +{ + unsigned long sum; + + /* skip header 4 bytes */ + sum = atalk_sum_skb(skb, 4, len-4, 0); + + /* Use 0xFFFF for 0. 0 itself means none */ + return sum ? htons((unsigned short)sum) : htons(0xFFFF); +} + +static struct proto ddp_proto = { + .name = "DDP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct atalk_sock), +}; + +/* + * Create a socket. Initialise the socket, blank the addresses + * set the state. + */ +static int atalk_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + int rc = -ESOCKTNOSUPPORT; + + if (!net_eq(net, &init_net)) + return -EAFNOSUPPORT; + + /* + * We permit SOCK_DGRAM and RAW is an extension. It is trivial to do + * and gives you the full ELAP frame. 
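The checksum helpers above implement the classic DDP checksum: every byte after the first four header bytes is added into a 16-bit accumulator (carry discarded) and the accumulator is then rotated left by one bit; a final value of zero is sent as 0xFFFF because zero on the wire means "no checksum". A user-space restatement of that rule, offered only as an illustrative sketch:

#include <stddef.h>
#include <stdint.h>

static uint16_t ddp_checksum(const uint8_t *pkt, size_t len)
{
        uint16_t sum = 0;
        size_t i;

        /* Skip the 4 bytes covering the hop/length word and the checksum
         * field itself, exactly as atalk_checksum() does with its offset of 4. */
        for (i = 4; i < len; i++) {
                sum += pkt[i];                          /* 16-bit add, carry dropped */
                sum = (sum << 1) | (sum >> 15);         /* rotate left one bit */
        }
        return sum ? sum : 0xFFFF;                      /* 0 means "no checksum" */
}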
Should be handy for CAP 8) + */ + if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) + goto out; + rc = -ENOMEM; + sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto); + if (!sk) + goto out; + rc = 0; + sock->ops = &atalk_dgram_ops; + sock_init_data(sock, sk); + + /* Checksums on by default */ + sock_set_flag(sk, SOCK_ZAPPED); +out: + return rc; +} + +/* Free a socket. No work needed */ +static int atalk_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (sk) { + sock_hold(sk); + lock_sock(sk); + + sock_orphan(sk); + sock->sk = NULL; + atalk_destroy_socket(sk); + + release_sock(sk); + sock_put(sk); + } + return 0; +} + +/** + * atalk_pick_and_bind_port - Pick a source port when one is not given + * @sk - socket to insert into the tables + * @sat - address to search for + * + * Pick a source port when one is not given. If we can find a suitable free + * one, we insert the socket into the tables using it. + * + * This whole operation must be atomic. + */ +static int atalk_pick_and_bind_port(struct sock *sk, struct sockaddr_at *sat) +{ + int retval; + + write_lock_bh(&atalk_sockets_lock); + + for (sat->sat_port = ATPORT_RESERVED; + sat->sat_port < ATPORT_LAST; + sat->sat_port++) { + struct sock *s; + struct hlist_node *node; + + sk_for_each(s, node, &atalk_sockets) { + struct atalk_sock *at = at_sk(s); + + if (at->src_net == sat->sat_addr.s_net && + at->src_node == sat->sat_addr.s_node && + at->src_port == sat->sat_port) + goto try_next_port; + } + + /* Wheee, it's free, assign and insert. */ + __atalk_insert_socket(sk); + at_sk(sk)->src_port = sat->sat_port; + retval = 0; + goto out; + +try_next_port:; + } + + retval = -EBUSY; +out: + write_unlock_bh(&atalk_sockets_lock); + return retval; +} + +static int atalk_autobind(struct sock *sk) +{ + struct atalk_sock *at = at_sk(sk); + struct sockaddr_at sat; + struct atalk_addr *ap = atalk_find_primary(); + int n = -EADDRNOTAVAIL; + + if (!ap || ap->s_net == htons(ATADDR_ANYNET)) + goto out; + + at->src_net = sat.sat_addr.s_net = ap->s_net; + at->src_node = sat.sat_addr.s_node = ap->s_node; + + n = atalk_pick_and_bind_port(sk, &sat); + if (!n) + sock_reset_flag(sk, SOCK_ZAPPED); +out: + return n; +} + +/* Set the address 'our end' of the connection */ +static int atalk_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_at *addr = (struct sockaddr_at *)uaddr; + struct sock *sk = sock->sk; + struct atalk_sock *at = at_sk(sk); + int err; + + if (!sock_flag(sk, SOCK_ZAPPED) || + addr_len != sizeof(struct sockaddr_at)) + return -EINVAL; + + if (addr->sat_family != AF_APPLETALK) + return -EAFNOSUPPORT; + + lock_sock(sk); + if (addr->sat_addr.s_net == htons(ATADDR_ANYNET)) { + struct atalk_addr *ap = atalk_find_primary(); + + err = -EADDRNOTAVAIL; + if (!ap) + goto out; + + at->src_net = addr->sat_addr.s_net = ap->s_net; + at->src_node = addr->sat_addr.s_node= ap->s_node; + } else { + err = -EADDRNOTAVAIL; + if (!atalk_find_interface(addr->sat_addr.s_net, + addr->sat_addr.s_node)) + goto out; + + at->src_net = addr->sat_addr.s_net; + at->src_node = addr->sat_addr.s_node; + } + + if (addr->sat_port == ATADDR_ANYPORT) { + err = atalk_pick_and_bind_port(sk, addr); + + if (err < 0) + goto out; + } else { + at->src_port = addr->sat_port; + + err = -EADDRINUSE; + if (atalk_find_or_insert_socket(sk, addr)) + goto out; + } + + sock_reset_flag(sk, SOCK_ZAPPED); + err = 0; +out: + release_sock(sk); + return err; +} + +/* Set the address we talk to */ +static int atalk_connect(struct socket *sock, struct 
sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + struct atalk_sock *at = at_sk(sk); + struct sockaddr_at *addr; + int err; + + sk->sk_state = TCP_CLOSE; + sock->state = SS_UNCONNECTED; + + if (addr_len != sizeof(*addr)) + return -EINVAL; + + addr = (struct sockaddr_at *)uaddr; + + if (addr->sat_family != AF_APPLETALK) + return -EAFNOSUPPORT; + + if (addr->sat_addr.s_node == ATADDR_BCAST && + !sock_flag(sk, SOCK_BROADCAST)) { +#if 1 + printk(KERN_WARNING "%s is broken and did not set " + "SO_BROADCAST. It will break when 2.2 is " + "released.\n", + current->comm); +#else + return -EACCES; +#endif + } + + lock_sock(sk); + err = -EBUSY; + if (sock_flag(sk, SOCK_ZAPPED)) + if (atalk_autobind(sk) < 0) + goto out; + + err = -ENETUNREACH; + if (!atrtr_get_dev(&addr->sat_addr)) + goto out; + + at->dest_port = addr->sat_port; + at->dest_net = addr->sat_addr.s_net; + at->dest_node = addr->sat_addr.s_node; + + sock->state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; + err = 0; +out: + release_sock(sk); + return err; +} + +/* + * Find the name of an AppleTalk socket. Just copy the right + * fields into the sockaddr. + */ +static int atalk_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sockaddr_at sat; + struct sock *sk = sock->sk; + struct atalk_sock *at = at_sk(sk); + int err; + + lock_sock(sk); + err = -ENOBUFS; + if (sock_flag(sk, SOCK_ZAPPED)) + if (atalk_autobind(sk) < 0) + goto out; + + *uaddr_len = sizeof(struct sockaddr_at); + memset(&sat.sat_zero, 0, sizeof(sat.sat_zero)); + + if (peer) { + err = -ENOTCONN; + if (sk->sk_state != TCP_ESTABLISHED) + goto out; + + sat.sat_addr.s_net = at->dest_net; + sat.sat_addr.s_node = at->dest_node; + sat.sat_port = at->dest_port; + } else { + sat.sat_addr.s_net = at->src_net; + sat.sat_addr.s_node = at->src_node; + sat.sat_port = at->src_port; + } + + err = 0; + sat.sat_family = AF_APPLETALK; + memcpy(uaddr, &sat, sizeof(sat)); + +out: + release_sock(sk); + return err; +} + +#if defined(CONFIG_IPDDP) || defined(CONFIG_IPDDP_MODULE) +static __inline__ int is_ip_over_ddp(struct sk_buff *skb) +{ + return skb->data[12] == 22; +} + +static int handle_ip_over_ddp(struct sk_buff *skb) +{ + struct net_device *dev = __dev_get_by_name(&init_net, "ipddp0"); + struct net_device_stats *stats; + + /* This needs to be able to handle ipddp"N" devices */ + if (!dev) { + kfree_skb(skb); + return NET_RX_DROP; + } + + skb->protocol = htons(ETH_P_IP); + skb_pull(skb, 13); + skb->dev = dev; + skb_reset_transport_header(skb); + + stats = netdev_priv(dev); + stats->rx_packets++; + stats->rx_bytes += skb->len + 13; + return netif_rx(skb); /* Send the SKB up to a higher place. */ +} +#else +/* make it easy for gcc to optimize this test out, i.e. kill the code */ +#define is_ip_over_ddp(skb) 0 +#define handle_ip_over_ddp(skb) 0 +#endif + +static int atalk_route_packet(struct sk_buff *skb, struct net_device *dev, + struct ddpehdr *ddp, __u16 len_hops, int origlen) +{ + struct atalk_route *rt; + struct atalk_addr ta; + + /* + * Don't route multicast, etc., packets, or packets sent to "this + * network" + */ + if (skb->pkt_type != PACKET_HOST || !ddp->deh_dnet) { + /* + * FIXME: + * + * Can it ever happen that a packet is from a PPP iface and + * needs to be broadcast onto the default network? 
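The bind()/connect() handlers above are driven from user space by an ordinary datagram socket API. The following sketch sends a single DDP datagram; the destination (net 1, node 42, DDP socket 129) and the payload are made-up example values, and the snippet assumes the exported <linux/atalk.h> header for struct sockaddr_at and the ATADDR_* constants.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/atalk.h>

int main(void)
{
        struct sockaddr_at local, dest;
        int fd = socket(AF_APPLETALK, SOCK_DGRAM, 0);

        if (fd < 0) {
                perror("socket");
                return 1;
        }

        /* Bind to "anything": atalk_bind()/atalk_pick_and_bind_port() above
         * choose the primary interface address and a free port. */
        memset(&local, 0, sizeof(local));
        local.sat_family = AF_APPLETALK;
        local.sat_addr.s_net = ATADDR_ANYNET;
        local.sat_addr.s_node = ATADDR_ANYNODE;
        local.sat_port = ATADDR_ANYPORT;
        if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) {
                perror("bind");
                return 1;
        }

        memset(&dest, 0, sizeof(dest));
        dest.sat_family = AF_APPLETALK;
        dest.sat_addr.s_net = htons(1);         /* example destination */
        dest.sat_addr.s_node = 42;
        dest.sat_port = 129;
        if (sendto(fd, "hi", 2, 0, (struct sockaddr *)&dest, sizeof(dest)) < 0)
                perror("sendto");

        close(fd);
        return 0;
}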
+ */ + if (dev->type == ARPHRD_PPP) + printk(KERN_DEBUG "AppleTalk: didn't forward broadcast " + "packet received from PPP iface\n"); + goto free_it; + } + + ta.s_net = ddp->deh_dnet; + ta.s_node = ddp->deh_dnode; + + /* Route the packet */ + rt = atrtr_find(&ta); + /* increment hops count */ + len_hops += 1 << 10; + if (!rt || !(len_hops & (15 << 10))) + goto free_it; + + /* FIXME: use skb->cb to be able to use shared skbs */ + + /* + * Route goes through another gateway, so set the target to the + * gateway instead. + */ + + if (rt->flags & RTF_GATEWAY) { + ta.s_net = rt->gateway.s_net; + ta.s_node = rt->gateway.s_node; + } + + /* Fix up skb->len field */ + skb_trim(skb, min_t(unsigned int, origlen, + (rt->dev->hard_header_len + + ddp_dl->header_length + (len_hops & 1023)))); + + /* FIXME: use skb->cb to be able to use shared skbs */ + ddp->deh_len_hops = htons(len_hops); + + /* + * Send the buffer onwards + * + * Now we must always be careful. If it's come from LocalTalk to + * EtherTalk it might not fit + * + * Order matters here: If a packet has to be copied to make a new + * headroom (rare hopefully) then it won't need unsharing. + * + * Note. ddp-> becomes invalid at the realloc. + */ + if (skb_headroom(skb) < 22) { + /* 22 bytes - 12 ether, 2 len, 3 802.2 5 snap */ + struct sk_buff *nskb = skb_realloc_headroom(skb, 32); + kfree_skb(skb); + skb = nskb; + } else + skb = skb_unshare(skb, GFP_ATOMIC); + + /* + * If the buffer didn't vanish into the lack of space bitbucket we can + * send it. + */ + if (skb == NULL) + goto drop; + + if (aarp_send_ddp(rt->dev, skb, &ta, NULL) == NET_XMIT_DROP) + return NET_RX_DROP; + return NET_RX_SUCCESS; +free_it: + kfree_skb(skb); +drop: + return NET_RX_DROP; +} + +/** + * atalk_rcv - Receive a packet (in skb) from device dev + * @skb - packet received + * @dev - network device where the packet comes from + * @pt - packet type + * + * Receive a packet (in skb) from device dev. This has come from the SNAP + * decoder, and on entry skb->transport_header is the DDP header, skb->len + * is the DDP header, skb->len is the DDP length. The physical headers + * have been extracted. PPP should probably pass frames marked as for this + * layer. [ie ARPHRD_ETHERTALK] + */ +static int atalk_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct ddpehdr *ddp; + struct sock *sock; + struct atalk_iface *atif; + struct sockaddr_at tosat; + int origlen; + __u16 len_hops; + + if (!net_eq(dev_net(dev), &init_net)) + goto drop; + + /* Don't mangle buffer if shared */ + if (!(skb = skb_share_check(skb, GFP_ATOMIC))) + goto out; + + /* Size check and make sure header is contiguous */ + if (!pskb_may_pull(skb, sizeof(*ddp))) + goto drop; + + ddp = ddp_hdr(skb); + + len_hops = ntohs(ddp->deh_len_hops); + + /* Trim buffer in case of stray trailing data */ + origlen = skb->len; + skb_trim(skb, min_t(unsigned int, skb->len, len_hops & 1023)); + + /* + * Size check to see if ddp->deh_len was crap + * (Otherwise we'll detonate most spectacularly + * in the middle of atalk_checksum() or recvmsg()). + */ + if (skb->len < sizeof(*ddp) || skb->len < (len_hops & 1023)) { + pr_debug("AppleTalk: dropping corrupted frame (deh_len=%u, " + "skb->len=%u)\n", len_hops & 1023, skb->len); + goto drop; + } + + /* + * Any checksums. Note we don't do htons() on this == is assumed to be + * valid for net byte orders all over the networking code... 
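atalk_rcv() and atalk_route_packet() above both manipulate deh_len_hops, which packs the 10-bit datagram length and the 4-bit hop count into one 16-bit word (big-endian on the wire; the helpers below assume the value has already been converted with ntohs(), as the receive path does). A small illustrative view of that packing:

#include <stdint.h>

#define DDP_LEN(lh)     ((lh) & 1023)           /* low 10 bits: total DDP length */
#define DDP_HOPS(lh)    (((lh) >> 10) & 15)     /* next 4 bits: hop count */

/* Forwarding bumps the hop count; when the 4-bit field wraps to zero the
 * packet is dropped, which is what the "!(len_hops & (15 << 10))" test
 * above checks after the increment. */
static uint16_t ddp_bump_hops(uint16_t len_hops)
{
        return len_hops + (1 << 10);
}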
+ */ + if (ddp->deh_sum && + atalk_checksum(skb, len_hops & 1023) != ddp->deh_sum) + /* Not a valid AppleTalk frame - dustbin time */ + goto drop; + + /* Check the packet is aimed at us */ + if (!ddp->deh_dnet) /* Net 0 is 'this network' */ + atif = atalk_find_anynet(ddp->deh_dnode, dev); + else + atif = atalk_find_interface(ddp->deh_dnet, ddp->deh_dnode); + + if (!atif) { + /* Not ours, so we route the packet via the correct + * AppleTalk iface + */ + return atalk_route_packet(skb, dev, ddp, len_hops, origlen); + } + + /* if IP over DDP is not selected this code will be optimized out */ + if (is_ip_over_ddp(skb)) + return handle_ip_over_ddp(skb); + /* + * Which socket - atalk_search_socket() looks for a *full match* + * of the tuple. + */ + tosat.sat_addr.s_net = ddp->deh_dnet; + tosat.sat_addr.s_node = ddp->deh_dnode; + tosat.sat_port = ddp->deh_dport; + + sock = atalk_search_socket(&tosat, atif); + if (!sock) /* But not one of our sockets */ + goto drop; + + /* Queue packet (standard) */ + skb->sk = sock; + + if (sock_queue_rcv_skb(sock, skb) < 0) + goto drop; + + return NET_RX_SUCCESS; + +drop: + kfree_skb(skb); +out: + return NET_RX_DROP; + +} + +/* + * Receive a LocalTalk frame. We make some demands on the caller here. + * Caller must provide enough headroom on the packet to pull the short + * header and append a long one. + */ +static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + if (!net_eq(dev_net(dev), &init_net)) + goto freeit; + + /* Expand any short form frames */ + if (skb_mac_header(skb)[2] == 1) { + struct ddpehdr *ddp; + /* Find our address */ + struct atalk_addr *ap = atalk_find_dev_addr(dev); + + if (!ap || skb->len < sizeof(__be16) || skb->len > 1023) + goto freeit; + + /* Don't mangle buffer if shared */ + if (!(skb = skb_share_check(skb, GFP_ATOMIC))) + return 0; + + /* + * The push leaves us with a ddephdr not an shdr, and + * handily the port bytes in the right place preset. + */ + ddp = (struct ddpehdr *) skb_push(skb, sizeof(*ddp) - 4); + + /* Now fill in the long header */ + + /* + * These two first. The mac overlays the new source/dest + * network information so we MUST copy these before + * we write the network numbers ! + */ + + ddp->deh_dnode = skb_mac_header(skb)[0]; /* From physical header */ + ddp->deh_snode = skb_mac_header(skb)[1]; /* From physical header */ + + ddp->deh_dnet = ap->s_net; /* Network number */ + ddp->deh_snet = ap->s_net; + ddp->deh_sum = 0; /* No checksum */ + /* + * Not sure about this bit... 
+ */ + /* Non routable, so force a drop if we slip up later */ + ddp->deh_len_hops = htons(skb->len + (DDP_MAXHOPS << 10)); + } + skb_reset_transport_header(skb); + + return atalk_rcv(skb, dev, pt, orig_dev); +freeit: + kfree_skb(skb); + return 0; +} + +static int atalk_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t len) +{ + struct sock *sk = sock->sk; + struct atalk_sock *at = at_sk(sk); + struct sockaddr_at *usat = (struct sockaddr_at *)msg->msg_name; + int flags = msg->msg_flags; + int loopback = 0; + struct sockaddr_at local_satalk, gsat; + struct sk_buff *skb; + struct net_device *dev; + struct ddpehdr *ddp; + int size; + struct atalk_route *rt; + int err; + + if (flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT)) + return -EINVAL; + + if (len > DDP_MAXSZ) + return -EMSGSIZE; + + lock_sock(sk); + if (usat) { + err = -EBUSY; + if (sock_flag(sk, SOCK_ZAPPED)) + if (atalk_autobind(sk) < 0) + goto out; + + err = -EINVAL; + if (msg->msg_namelen < sizeof(*usat) || + usat->sat_family != AF_APPLETALK) + goto out; + + err = -EPERM; + /* netatalk didn't implement this check */ + if (usat->sat_addr.s_node == ATADDR_BCAST && + !sock_flag(sk, SOCK_BROADCAST)) { + goto out; + } + } else { + err = -ENOTCONN; + if (sk->sk_state != TCP_ESTABLISHED) + goto out; + usat = &local_satalk; + usat->sat_family = AF_APPLETALK; + usat->sat_port = at->dest_port; + usat->sat_addr.s_node = at->dest_node; + usat->sat_addr.s_net = at->dest_net; + } + + /* Build a packet */ + SOCK_DEBUG(sk, "SK %p: Got address.\n", sk); + + /* For headers */ + size = sizeof(struct ddpehdr) + len + ddp_dl->header_length; + + if (usat->sat_addr.s_net || usat->sat_addr.s_node == ATADDR_ANYNODE) { + rt = atrtr_find(&usat->sat_addr); + } else { + struct atalk_addr at_hint; + + at_hint.s_node = 0; + at_hint.s_net = at->src_net; + + rt = atrtr_find(&at_hint); + } + err = ENETUNREACH; + if (!rt) + goto out; + + dev = rt->dev; + + SOCK_DEBUG(sk, "SK %p: Size needed %d, device %s\n", + sk, size, dev->name); + + size += dev->hard_header_len; + release_sock(sk); + skb = sock_alloc_send_skb(sk, size, (flags & MSG_DONTWAIT), &err); + lock_sock(sk); + if (!skb) + goto out; + + skb->sk = sk; + skb_reserve(skb, ddp_dl->header_length); + skb_reserve(skb, dev->hard_header_len); + skb->dev = dev; + + SOCK_DEBUG(sk, "SK %p: Begin build.\n", sk); + + ddp = (struct ddpehdr *)skb_put(skb, sizeof(struct ddpehdr)); + ddp->deh_len_hops = htons(len + sizeof(*ddp)); + ddp->deh_dnet = usat->sat_addr.s_net; + ddp->deh_snet = at->src_net; + ddp->deh_dnode = usat->sat_addr.s_node; + ddp->deh_snode = at->src_node; + ddp->deh_dport = usat->sat_port; + ddp->deh_sport = at->src_port; + + SOCK_DEBUG(sk, "SK %p: Copy user data (%Zd bytes).\n", sk, len); + + err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); + if (err) { + kfree_skb(skb); + err = -EFAULT; + goto out; + } + + if (sk->sk_no_check == 1) + ddp->deh_sum = 0; + else + ddp->deh_sum = atalk_checksum(skb, len + sizeof(*ddp)); + + /* + * Loopback broadcast packets to non gateway targets (ie routes + * to group we are in) + */ + if (ddp->deh_dnode == ATADDR_BCAST && + !(rt->flags & RTF_GATEWAY) && !(dev->flags & IFF_LOOPBACK)) { + struct sk_buff *skb2 = skb_copy(skb, GFP_KERNEL); + + if (skb2) { + loopback = 1; + SOCK_DEBUG(sk, "SK %p: send out(copy).\n", sk); + /* + * If it fails it is queued/sent above in the aarp queue + */ + aarp_send_ddp(dev, skb2, &usat->sat_addr, NULL); + } + } + + if (dev->flags & IFF_LOOPBACK || loopback) { + SOCK_DEBUG(sk, "SK %p: Loop back.\n", sk); + /* 
loop back */ + skb_orphan(skb); + if (ddp->deh_dnode == ATADDR_BCAST) { + struct atalk_addr at_lo; + + at_lo.s_node = 0; + at_lo.s_net = 0; + + rt = atrtr_find(&at_lo); + if (!rt) { + kfree_skb(skb); + err = -ENETUNREACH; + goto out; + } + dev = rt->dev; + skb->dev = dev; + } + ddp_dl->request(ddp_dl, skb, dev->dev_addr); + } else { + SOCK_DEBUG(sk, "SK %p: send out.\n", sk); + if (rt->flags & RTF_GATEWAY) { + gsat.sat_addr = rt->gateway; + usat = &gsat; + } + + /* + * If it fails it is queued/sent above in the aarp queue + */ + aarp_send_ddp(dev, skb, &usat->sat_addr, NULL); + } + SOCK_DEBUG(sk, "SK %p: Done write (%Zd).\n", sk, len); + +out: + release_sock(sk); + return err ? : len; +} + +static int atalk_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct sockaddr_at *sat = (struct sockaddr_at *)msg->msg_name; + struct ddpehdr *ddp; + int copied = 0; + int offset = 0; + int err = 0; + struct sk_buff *skb; + + skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, + flags & MSG_DONTWAIT, &err); + lock_sock(sk); + + if (!skb) + goto out; + + /* FIXME: use skb->cb to be able to use shared skbs */ + ddp = ddp_hdr(skb); + copied = ntohs(ddp->deh_len_hops) & 1023; + + if (sk->sk_type != SOCK_RAW) { + offset = sizeof(*ddp); + copied -= offset; + } + + if (copied > size) { + copied = size; + msg->msg_flags |= MSG_TRUNC; + } + err = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copied); + + if (!err) { + if (sat) { + sat->sat_family = AF_APPLETALK; + sat->sat_port = ddp->deh_sport; + sat->sat_addr.s_node = ddp->deh_snode; + sat->sat_addr.s_net = ddp->deh_snet; + } + msg->msg_namelen = sizeof(*sat); + } + + skb_free_datagram(sk, skb); /* Free the datagram. */ + +out: + release_sock(sk); + return err ? : copied; +} + + +/* + * AppleTalk ioctl calls. + */ +static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + int rc = -ENOIOCTLCMD; + struct sock *sk = sock->sk; + void __user *argp = (void __user *)arg; + + switch (cmd) { + /* Protocol layer */ + case TIOCOUTQ: { + long amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); + + if (amount < 0) + amount = 0; + rc = put_user(amount, (int __user *)argp); + break; + } + case TIOCINQ: { + /* + * These two are safe on a single CPU system as only + * user tasks fiddle here + */ + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + long amount = 0; + + if (skb) + amount = skb->len - sizeof(struct ddpehdr); + rc = put_user(amount, (int __user *)argp); + break; + } + case SIOCGSTAMP: + rc = sock_get_timestamp(sk, argp); + break; + case SIOCGSTAMPNS: + rc = sock_get_timestampns(sk, argp); + break; + /* Routing */ + case SIOCADDRT: + case SIOCDELRT: + rc = -EPERM; + if (capable(CAP_NET_ADMIN)) + rc = atrtr_ioctl(cmd, argp); + break; + /* Interface */ + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFBRDADDR: + case SIOCATALKDIFADDR: + case SIOCDIFADDR: + case SIOCSARP: /* proxy AARP */ + case SIOCDARP: /* proxy AARP */ + rtnl_lock(); + rc = atif_ioctl(cmd, argp); + rtnl_unlock(); + break; + } + + return rc; +} + + +#ifdef CONFIG_COMPAT +static int atalk_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + /* + * SIOCATALKDIFADDR is a SIOCPROTOPRIVATE ioctl number, so we + * cannot handle it in common code. The data we access if ifreq + * here is compatible, so we can simply call the native + * handler. 
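The TIOCOUTQ/TIOCINQ branches of atalk_ioctl() above are reachable with plain ioctl() calls on an AppleTalk socket. A hypothetical user-space probe, where fd is assumed to come from socket(AF_APPLETALK, SOCK_DGRAM, 0):

#include <stdio.h>
#include <sys/ioctl.h>

static void show_queues(int fd)
{
        int inq = 0, outq = 0;

        /* Payload bytes of the next queued datagram, minus the DDP header. */
        if (ioctl(fd, TIOCINQ, &inq) == 0)
                printf("next datagram payload: %d bytes\n", inq);

        /* Free space left in the send buffer, as computed above from
         * sk_sndbuf - sk_wmem_alloc_get(). */
        if (ioctl(fd, TIOCOUTQ, &outq) == 0)
                printf("send buffer space: %d bytes\n", outq);
}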
+ */ + if (cmd == SIOCATALKDIFADDR) + return atalk_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); + + return -ENOIOCTLCMD; +} +#endif + + +static const struct net_proto_family atalk_family_ops = { + .family = PF_APPLETALK, + .create = atalk_create, + .owner = THIS_MODULE, +}; + +static const struct proto_ops atalk_dgram_ops = { + .family = PF_APPLETALK, + .owner = THIS_MODULE, + .release = atalk_release, + .bind = atalk_bind, + .connect = atalk_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = atalk_getname, + .poll = datagram_poll, + .ioctl = atalk_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = atalk_compat_ioctl, +#endif + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = atalk_sendmsg, + .recvmsg = atalk_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct notifier_block ddp_notifier = { + .notifier_call = ddp_device_event, +}; + +static struct packet_type ltalk_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_LOCALTALK), + .func = ltalk_rcv, +}; + +static struct packet_type ppptalk_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_PPPTALK), + .func = atalk_rcv, +}; + +static unsigned char ddp_snap_id[] = { 0x08, 0x00, 0x07, 0x80, 0x9B }; + +/* Export symbols for use by drivers when AppleTalk is a module */ +EXPORT_SYMBOL(atrtr_get_dev); +EXPORT_SYMBOL(atalk_find_dev_addr); + +static const char atalk_err_snap[] __initconst = + KERN_CRIT "Unable to register DDP with SNAP.\n"; + +/* Called by proto.c on kernel start up */ +static int __init atalk_init(void) +{ + int rc = proto_register(&ddp_proto, 0); + + if (rc != 0) + goto out; + + (void)sock_register(&atalk_family_ops); + ddp_dl = register_snap_client(ddp_snap_id, atalk_rcv); + if (!ddp_dl) + printk(atalk_err_snap); + + dev_add_pack(<alk_packet_type); + dev_add_pack(&ppptalk_packet_type); + + register_netdevice_notifier(&ddp_notifier); + aarp_proto_init(); + atalk_proc_init(); + atalk_register_sysctl(); +out: + return rc; +} +module_init(atalk_init); + +/* + * No explicit module reference count manipulation is needed in the + * protocol. Socket layer sets module reference count for us + * and interfaces reference counting is done + * by the network device layer. + * + * Ergo, before the AppleTalk module can be removed, all AppleTalk + * sockets be closed from user space. + */ +static void __exit atalk_exit(void) +{ +#ifdef CONFIG_SYSCTL + atalk_unregister_sysctl(); +#endif /* CONFIG_SYSCTL */ + atalk_proc_exit(); + aarp_cleanup_module(); /* General aarp clean-up. */ + unregister_netdevice_notifier(&ddp_notifier); + dev_remove_pack(<alk_packet_type); + dev_remove_pack(&ppptalk_packet_type); + unregister_snap_client(ddp_dl); + sock_unregister(PF_APPLETALK); + proto_unregister(&ddp_proto); +} +module_exit(atalk_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Alan Cox "); +MODULE_DESCRIPTION("AppleTalk 0.20\n"); +MODULE_ALIAS_NETPROTO(PF_APPLETALK); diff --git a/net/appletalk/dev.c b/net/appletalk/dev.c new file mode 100644 index 00000000..6c8016f6 --- /dev/null +++ b/net/appletalk/dev.c @@ -0,0 +1,44 @@ +/* + * Moved here from drivers/net/net_init.c, which is: + * Written 1993,1994,1995 by Donald Becker. + */ + +#include +#include +#include +#include +#include + +static void ltalk_setup(struct net_device *dev) +{ + /* Fill in the fields of the device structure with localtalk-generic values. 
*/ + + dev->type = ARPHRD_LOCALTLK; + dev->hard_header_len = LTALK_HLEN; + dev->mtu = LTALK_MTU; + dev->addr_len = LTALK_ALEN; + dev->tx_queue_len = 10; + + dev->broadcast[0] = 0xFF; + + dev->flags = IFF_BROADCAST|IFF_MULTICAST|IFF_NOARP; +} + +/** + * alloc_ltalkdev - Allocates and sets up an localtalk device + * @sizeof_priv: Size of additional driver-private structure to be allocated + * for this localtalk device + * + * Fill in the fields of the device structure with localtalk-generic + * values. Basically does everything except registering the device. + * + * Constructs a new net device, complete with a private data area of + * size @sizeof_priv. A 32-byte (not bit) alignment is enforced for + * this private data area. + */ + +struct net_device *alloc_ltalkdev(int sizeof_priv) +{ + return alloc_netdev(sizeof_priv, "lt%d", ltalk_setup); +} +EXPORT_SYMBOL(alloc_ltalkdev); diff --git a/net/appletalk/sysctl_net_atalk.c b/net/appletalk/sysctl_net_atalk.c new file mode 100644 index 00000000..04e9c0da --- /dev/null +++ b/net/appletalk/sysctl_net_atalk.c @@ -0,0 +1,61 @@ +/* + * sysctl_net_atalk.c: sysctl interface to net AppleTalk subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/atalk directory entry (empty =) ). [MS] + * Dynamic registration, added aarp entries. (5/30/97 Chris Horn) + */ + +#include +#include +#include + +static struct ctl_table atalk_table[] = { + { + .procname = "aarp-expiry-time", + .data = &sysctl_aarp_expiry_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "aarp-tick-time", + .data = &sysctl_aarp_tick_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "aarp-retransmit-limit", + .data = &sysctl_aarp_retransmit_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "aarp-resolve-time", + .data = &sysctl_aarp_resolve_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { }, +}; + +static struct ctl_path atalk_path[] = { + { .procname = "net", }, + { .procname = "appletalk", }, + { } +}; + +static struct ctl_table_header *atalk_table_header; + +void atalk_register_sysctl(void) +{ + atalk_table_header = register_sysctl_paths(atalk_path, atalk_table); +} + +void atalk_unregister_sysctl(void) +{ + unregister_sysctl_table(atalk_table_header); +} diff --git a/net/atm/Kconfig b/net/atm/Kconfig new file mode 100644 index 00000000..754ea103 --- /dev/null +++ b/net/atm/Kconfig @@ -0,0 +1,73 @@ +# +# Asynchronous Transfer Mode (ATM) +# + +config ATM + tristate "Asynchronous Transfer Mode (ATM)" + ---help--- + ATM is a high-speed networking technology for Local Area Networks + and Wide Area Networks. It uses a fixed packet size and is + connection oriented, allowing for the negotiation of minimum + bandwidth requirements. + + In order to participate in an ATM network, your Linux box needs an + ATM networking card. If you have that, say Y here and to the driver + of your ATM card below. + + Note that you need a set of user-space programs to actually make use + of ATM. See the file for + further details. + +config ATM_CLIP + tristate "Classical IP over ATM" + depends on ATM && INET + help + Classical IP over ATM for PVCs and SVCs, supporting InARP and + ATMARP. If you want to communication with other IP hosts on your ATM + network, you will typically either say Y here or to "LAN Emulation + (LANE)" below. 
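Before the ATM configuration continues below, it may help to make the alloc_ltalkdev() helper from net/appletalk/dev.c concrete. A hypothetical LocalTalk driver could pair it with register_netdev() roughly as follows; ltdrv_priv, lt_start_xmit and lt_netdev_ops are invented names, and alloc_ltalkdev() already applies the LocalTalk defaults from ltalk_setup() together with the "lt%d" name template:

#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/if_ltalk.h>

struct ltdrv_priv {			/* hypothetical private state */
	int dummy;
};

static netdev_tx_t lt_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	/* a real driver would hand the frame to its LocalTalk hardware here */
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}

static const struct net_device_ops lt_netdev_ops = {
	.ndo_start_xmit	= lt_start_xmit,
};

static struct net_device *lt_dev;

static int __init ltdrv_init(void)
{
	int err;

	lt_dev = alloc_ltalkdev(sizeof(struct ltdrv_priv));
	if (!lt_dev)
		return -ENOMEM;

	lt_dev->netdev_ops = &lt_netdev_ops;
	err = register_netdev(lt_dev);
	if (err)
		free_netdev(lt_dev);
	return err;
}

static void __exit ltdrv_exit(void)
{
	unregister_netdev(lt_dev);
	free_netdev(lt_dev);
}

module_init(ltdrv_init);
module_exit(ltdrv_exit);
MODULE_LICENSE("GPL");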
+ +config ATM_CLIP_NO_ICMP + bool "Do NOT send ICMP if no neighbour" + depends on ATM_CLIP + help + Normally, an "ICMP host unreachable" message is sent if a neighbour + cannot be reached because there is no VC to it in the kernel's + ATMARP table. This may cause problems when ATMARP table entries are + briefly removed during revalidation. If you say Y here, packets to + such neighbours are silently discarded instead. + +config ATM_LANE + tristate "LAN Emulation (LANE) support" + depends on ATM + help + LAN Emulation emulates services of existing LANs across an ATM + network. Besides operating as a normal ATM end station client, Linux + LANE client can also act as an proxy client bridging packets between + ELAN and Ethernet segments. You need LANE if you want to try MPOA. + +config ATM_MPOA + tristate "Multi-Protocol Over ATM (MPOA) support" + depends on ATM && INET && ATM_LANE!=n + help + Multi-Protocol Over ATM allows ATM edge devices such as routers, + bridges and ATM attached hosts establish direct ATM VCs across + subnetwork boundaries. These shortcut connections bypass routers + enhancing overall network performance. + +config ATM_BR2684 + tristate "RFC1483/2684 Bridged protocols" + depends on ATM && INET + help + ATM PVCs can carry ethernet PDUs according to RFC2684 (formerly 1483) + This device will act like an ethernet from the kernels point of view, + with the traffic being carried by ATM PVCs (currently 1 PVC/device). + This is sometimes used over DSL lines. If in doubt, say N. + +config ATM_BR2684_IPFILTER + bool "Per-VC IP filter kludge" + depends on ATM_BR2684 + help + This is an experimental mechanism for users who need to terminate a + large number of IP-only vcc's. Do not enable this unless you are sure + you know what you are doing. diff --git a/net/atm/Makefile b/net/atm/Makefile new file mode 100644 index 00000000..cc50bd1f --- /dev/null +++ b/net/atm/Makefile @@ -0,0 +1,15 @@ +# +# Makefile for the ATM Protocol Families. +# + +atm-y := addr.o pvc.o signaling.o svc.o ioctl.o common.o atm_misc.o raw.o resources.o atm_sysfs.o +mpoa-objs := mpc.o mpoa_caches.o mpoa_proc.o + +obj-$(CONFIG_ATM) += atm.o +obj-$(CONFIG_ATM_CLIP) += clip.o +obj-$(CONFIG_ATM_BR2684) += br2684.o +atm-$(CONFIG_PROC_FS) += proc.o + +obj-$(CONFIG_ATM_LANE) += lec.o +obj-$(CONFIG_ATM_MPOA) += mpoa.o +obj-$(CONFIG_PPPOATM) += pppoatm.o diff --git a/net/atm/addr.c b/net/atm/addr.c new file mode 100644 index 00000000..dcda35c6 --- /dev/null +++ b/net/atm/addr.c @@ -0,0 +1,161 @@ +/* net/atm/addr.c - Local ATM address registry */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ + +#include +#include +#include +#include + +#include "signaling.h" +#include "addr.h" + +static int check_addr(const struct sockaddr_atmsvc *addr) +{ + int i; + + if (addr->sas_family != AF_ATMSVC) + return -EAFNOSUPPORT; + if (!*addr->sas_addr.pub) + return *addr->sas_addr.prv ? 
0 : -EINVAL; + for (i = 1; i < ATM_E164_LEN + 1; i++) /* make sure it's \0-terminated */ + if (!addr->sas_addr.pub[i]) + return 0; + return -EINVAL; +} + +static int identical(const struct sockaddr_atmsvc *a, const struct sockaddr_atmsvc *b) +{ + if (*a->sas_addr.prv) + if (memcmp(a->sas_addr.prv, b->sas_addr.prv, ATM_ESA_LEN)) + return 0; + if (!*a->sas_addr.pub) + return !*b->sas_addr.pub; + if (!*b->sas_addr.pub) + return 0; + return !strcmp(a->sas_addr.pub, b->sas_addr.pub); +} + +static void notify_sigd(const struct atm_dev *dev) +{ + struct sockaddr_atmpvc pvc; + + pvc.sap_addr.itf = dev->number; + sigd_enq(NULL, as_itf_notify, NULL, &pvc, NULL); +} + +void atm_reset_addr(struct atm_dev *dev, enum atm_addr_type_t atype) +{ + unsigned long flags; + struct atm_dev_addr *this, *p; + struct list_head *head; + + spin_lock_irqsave(&dev->lock, flags); + if (atype == ATM_ADDR_LECS) + head = &dev->lecs; + else + head = &dev->local; + list_for_each_entry_safe(this, p, head, entry) { + list_del(&this->entry); + kfree(this); + } + spin_unlock_irqrestore(&dev->lock, flags); + if (head == &dev->local) + notify_sigd(dev); +} + +int atm_add_addr(struct atm_dev *dev, const struct sockaddr_atmsvc *addr, + enum atm_addr_type_t atype) +{ + unsigned long flags; + struct atm_dev_addr *this; + struct list_head *head; + int error; + + error = check_addr(addr); + if (error) + return error; + spin_lock_irqsave(&dev->lock, flags); + if (atype == ATM_ADDR_LECS) + head = &dev->lecs; + else + head = &dev->local; + list_for_each_entry(this, head, entry) { + if (identical(&this->addr, addr)) { + spin_unlock_irqrestore(&dev->lock, flags); + return -EEXIST; + } + } + this = kmalloc(sizeof(struct atm_dev_addr), GFP_ATOMIC); + if (!this) { + spin_unlock_irqrestore(&dev->lock, flags); + return -ENOMEM; + } + this->addr = *addr; + list_add(&this->entry, head); + spin_unlock_irqrestore(&dev->lock, flags); + if (head == &dev->local) + notify_sigd(dev); + return 0; +} + +int atm_del_addr(struct atm_dev *dev, const struct sockaddr_atmsvc *addr, + enum atm_addr_type_t atype) +{ + unsigned long flags; + struct atm_dev_addr *this; + struct list_head *head; + int error; + + error = check_addr(addr); + if (error) + return error; + spin_lock_irqsave(&dev->lock, flags); + if (atype == ATM_ADDR_LECS) + head = &dev->lecs; + else + head = &dev->local; + list_for_each_entry(this, head, entry) { + if (identical(&this->addr, addr)) { + list_del(&this->entry); + spin_unlock_irqrestore(&dev->lock, flags); + kfree(this); + if (head == &dev->local) + notify_sigd(dev); + return 0; + } + } + spin_unlock_irqrestore(&dev->lock, flags); + return -ENOENT; +} + +int atm_get_addr(struct atm_dev *dev, struct sockaddr_atmsvc __user * buf, + size_t size, enum atm_addr_type_t atype) +{ + unsigned long flags; + struct atm_dev_addr *this; + struct list_head *head; + int total = 0, error; + struct sockaddr_atmsvc *tmp_buf, *tmp_bufp; + + spin_lock_irqsave(&dev->lock, flags); + if (atype == ATM_ADDR_LECS) + head = &dev->lecs; + else + head = &dev->local; + list_for_each_entry(this, head, entry) + total += sizeof(struct sockaddr_atmsvc); + tmp_buf = tmp_bufp = kmalloc(total, GFP_ATOMIC); + if (!tmp_buf) { + spin_unlock_irqrestore(&dev->lock, flags); + return -ENOMEM; + } + list_for_each_entry(this, head, entry) + memcpy(tmp_bufp++, &this->addr, sizeof(struct sockaddr_atmsvc)); + spin_unlock_irqrestore(&dev->lock, flags); + error = total > size ? -E2BIG : total; + if (copy_to_user(buf, tmp_buf, total < size ? 
total : size)) + error = -EFAULT; + kfree(tmp_buf); + return error; +} diff --git a/net/atm/addr.h b/net/atm/addr.h new file mode 100644 index 00000000..6837e9e7 --- /dev/null +++ b/net/atm/addr.h @@ -0,0 +1,20 @@ +/* net/atm/addr.h - Local ATM address registry */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ + + +#ifndef NET_ATM_ADDR_H +#define NET_ATM_ADDR_H + +#include +#include + +void atm_reset_addr(struct atm_dev *dev, enum atm_addr_type_t type); +int atm_add_addr(struct atm_dev *dev, const struct sockaddr_atmsvc *addr, + enum atm_addr_type_t type); +int atm_del_addr(struct atm_dev *dev, const struct sockaddr_atmsvc *addr, + enum atm_addr_type_t type); +int atm_get_addr(struct atm_dev *dev, struct sockaddr_atmsvc __user *buf, + size_t size, enum atm_addr_type_t type); + +#endif diff --git a/net/atm/atm_misc.c b/net/atm/atm_misc.c new file mode 100644 index 00000000..fc63526d --- /dev/null +++ b/net/atm/atm_misc.c @@ -0,0 +1,101 @@ +/* net/atm/atm_misc.c - Various functions for use by ATM drivers */ + +/* Written 1995-2000 by Werner Almesberger, EPFL ICA */ + +#include +#include +#include +#include +#include +#include +#include +#include + +int atm_charge(struct atm_vcc *vcc, int truesize) +{ + atm_force_charge(vcc, truesize); + if (atomic_read(&sk_atm(vcc)->sk_rmem_alloc) <= sk_atm(vcc)->sk_rcvbuf) + return 1; + atm_return(vcc, truesize); + atomic_inc(&vcc->stats->rx_drop); + return 0; +} +EXPORT_SYMBOL(atm_charge); + +struct sk_buff *atm_alloc_charge(struct atm_vcc *vcc, int pdu_size, + gfp_t gfp_flags) +{ + struct sock *sk = sk_atm(vcc); + int guess = atm_guess_pdu2truesize(pdu_size); + + atm_force_charge(vcc, guess); + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) { + struct sk_buff *skb = alloc_skb(pdu_size, gfp_flags); + + if (skb) { + atomic_add(skb->truesize-guess, + &sk->sk_rmem_alloc); + return skb; + } + } + atm_return(vcc, guess); + atomic_inc(&vcc->stats->rx_drop); + return NULL; +} +EXPORT_SYMBOL(atm_alloc_charge); + + +/* + * atm_pcr_goal returns the positive PCR if it should be rounded up, the + * negative PCR if it should be rounded down, and zero if the maximum available + * bandwidth should be used. 
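As a concrete reading of this convention (the full rule table follows below), a minimal sketch of how a caller might interpret atm_pcr_goal(); the traffic parameter values are arbitrary examples:

#include <linux/string.h>
#include <linux/atmdev.h>

static void pcr_goal_examples(void)
{
	struct atm_trafprm tp;
	int goal;

	memset(&tp, 0, sizeof(tp));	/* min, max and pcr all "absent" */
	goal = atm_pcr_goal(&tp);	/* 0: use the maximum available bandwidth */

	tp.pcr = 100000;		/* an explicit PCR was requested */
	goal = atm_pcr_goal(&tp);	/* -100000: use 100000 or round down */

	tp.pcr = 0;
	tp.min_pcr = 50000;		/* only a lower bound was given */
	goal = atm_pcr_goal(&tp);	/* 50000: use 50000 or round up */
	(void)goal;
}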
+ * + * The rules are as follows (* = maximum, - = absent (0), x = value "x", + * (x+ = x or next value above x, x- = x or next value below): + * + * min max pcr result min max pcr result + * - - - * (UBR only) x - - x+ + * - - * * x - * * + * - - z z- x - z z- + * - * - * x * - x+ + * - * * * x * * * + * - * z z- x * z z- + * - y - y- x y - x+ + * - y * y- x y * y- + * - y z z- x y z z- + * + * All non-error cases can be converted with the following simple set of rules: + * + * if pcr == z then z- + * else if min == x && pcr == - then x+ + * else if max == y then y- + * else * + */ + +int atm_pcr_goal(const struct atm_trafprm *tp) +{ + if (tp->pcr && tp->pcr != ATM_MAX_PCR) + return -tp->pcr; + if (tp->min_pcr && !tp->pcr) + return tp->min_pcr; + if (tp->max_pcr != ATM_MAX_PCR) + return -tp->max_pcr; + return 0; +} +EXPORT_SYMBOL(atm_pcr_goal); + +void sonet_copy_stats(struct k_sonet_stats *from, struct sonet_stats *to) +{ +#define __HANDLE_ITEM(i) to->i = atomic_read(&from->i) + __SONET_ITEMS +#undef __HANDLE_ITEM +} +EXPORT_SYMBOL(sonet_copy_stats); + +void sonet_subtract_stats(struct k_sonet_stats *from, struct sonet_stats *to) +{ +#define __HANDLE_ITEM(i) atomic_sub(to->i, &from->i) + __SONET_ITEMS +#undef __HANDLE_ITEM +} +EXPORT_SYMBOL(sonet_subtract_stats); diff --git a/net/atm/atm_sysfs.c b/net/atm/atm_sysfs.c new file mode 100644 index 00000000..f49da581 --- /dev/null +++ b/net/atm/atm_sysfs.c @@ -0,0 +1,200 @@ +/* ATM driver model support. */ + +#include +#include +#include +#include +#include +#include "common.h" +#include "resources.h" + +#define to_atm_dev(cldev) container_of(cldev, struct atm_dev, class_dev) + +static ssize_t show_type(struct device *cdev, + struct device_attribute *attr, char *buf) +{ + struct atm_dev *adev = to_atm_dev(cdev); + return sprintf(buf, "%s\n", adev->type); +} + +static ssize_t show_address(struct device *cdev, + struct device_attribute *attr, char *buf) +{ + char *pos = buf; + struct atm_dev *adev = to_atm_dev(cdev); + int i; + + for (i = 0; i < (ESI_LEN - 1); i++) + pos += sprintf(pos, "%02x:", adev->esi[i]); + pos += sprintf(pos, "%02x\n", adev->esi[i]); + + return pos - buf; +} + +static ssize_t show_atmaddress(struct device *cdev, + struct device_attribute *attr, char *buf) +{ + unsigned long flags; + char *pos = buf; + struct atm_dev *adev = to_atm_dev(cdev); + struct atm_dev_addr *aaddr; + int bin[] = { 1, 2, 10, 6, 1 }, *fmt = bin; + int i, j; + + spin_lock_irqsave(&adev->lock, flags); + list_for_each_entry(aaddr, &adev->local, entry) { + for (i = 0, j = 0; i < ATM_ESA_LEN; ++i, ++j) { + if (j == *fmt) { + pos += sprintf(pos, "."); + ++fmt; + j = 0; + } + pos += sprintf(pos, "%02x", + aaddr->addr.sas_addr.prv[i]); + } + pos += sprintf(pos, "\n"); + } + spin_unlock_irqrestore(&adev->lock, flags); + + return pos - buf; +} + +static ssize_t show_atmindex(struct device *cdev, + struct device_attribute *attr, char *buf) +{ + struct atm_dev *adev = to_atm_dev(cdev); + + return sprintf(buf, "%d\n", adev->number); +} + +static ssize_t show_carrier(struct device *cdev, + struct device_attribute *attr, char *buf) +{ + char *pos = buf; + struct atm_dev *adev = to_atm_dev(cdev); + + pos += sprintf(pos, "%d\n", + adev->signal == ATM_PHY_SIG_LOST ? 
0 : 1); + + return pos - buf; +} + +static ssize_t show_link_rate(struct device *cdev, + struct device_attribute *attr, char *buf) +{ + char *pos = buf; + struct atm_dev *adev = to_atm_dev(cdev); + int link_rate; + + /* show the link rate, not the data rate */ + switch (adev->link_rate) { + case ATM_OC3_PCR: + link_rate = 155520000; + break; + case ATM_OC12_PCR: + link_rate = 622080000; + break; + case ATM_25_PCR: + link_rate = 25600000; + break; + default: + link_rate = adev->link_rate * 8 * 53; + } + pos += sprintf(pos, "%d\n", link_rate); + + return pos - buf; +} + +static DEVICE_ATTR(address, S_IRUGO, show_address, NULL); +static DEVICE_ATTR(atmaddress, S_IRUGO, show_atmaddress, NULL); +static DEVICE_ATTR(atmindex, S_IRUGO, show_atmindex, NULL); +static DEVICE_ATTR(carrier, S_IRUGO, show_carrier, NULL); +static DEVICE_ATTR(type, S_IRUGO, show_type, NULL); +static DEVICE_ATTR(link_rate, S_IRUGO, show_link_rate, NULL); + +static struct device_attribute *atm_attrs[] = { + &dev_attr_atmaddress, + &dev_attr_address, + &dev_attr_atmindex, + &dev_attr_carrier, + &dev_attr_type, + &dev_attr_link_rate, + NULL +}; + + +static int atm_uevent(struct device *cdev, struct kobj_uevent_env *env) +{ + struct atm_dev *adev; + + if (!cdev) + return -ENODEV; + + adev = to_atm_dev(cdev); + if (!adev) + return -ENODEV; + + if (add_uevent_var(env, "NAME=%s%d", adev->type, adev->number)) + return -ENOMEM; + + return 0; +} + +static void atm_release(struct device *cdev) +{ + struct atm_dev *adev = to_atm_dev(cdev); + + kfree(adev); +} + +static struct class atm_class = { + .name = "atm", + .dev_release = atm_release, + .dev_uevent = atm_uevent, +}; + +int atm_register_sysfs(struct atm_dev *adev, struct device *parent) +{ + struct device *cdev = &adev->class_dev; + int i, j, err; + + cdev->class = &atm_class; + cdev->parent = parent; + dev_set_drvdata(cdev, adev); + + dev_set_name(cdev, "%s%d", adev->type, adev->number); + err = device_register(cdev); + if (err < 0) + return err; + + for (i = 0; atm_attrs[i]; i++) { + err = device_create_file(cdev, atm_attrs[i]); + if (err) + goto err_out; + } + + return 0; + +err_out: + for (j = 0; j < i; j++) + device_remove_file(cdev, atm_attrs[j]); + device_del(cdev); + return err; +} + +void atm_unregister_sysfs(struct atm_dev *adev) +{ + struct device *cdev = &adev->class_dev; + + device_del(cdev); +} + +int __init atm_sysfs_init(void) +{ + return class_register(&atm_class); +} + +void __exit atm_sysfs_exit(void) +{ + class_unregister(&atm_class); +} diff --git a/net/atm/br2684.c b/net/atm/br2684.c new file mode 100644 index 00000000..d07223c8 --- /dev/null +++ b/net/atm/br2684.c @@ -0,0 +1,847 @@ +/* + * Ethernet netdevice using ATM AAL5 as underlying carrier + * (RFC1483 obsoleted by RFC2684) for Linux + * + * Authors: Marcell GAL, 2000, XDSL Ltd, Hungary + * Eric Kinzie, 2006-2007, US Naval Research Laboratory + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "common.h" + +static void skb_debug(const struct sk_buff *skb) +{ +#ifdef SKB_DEBUG +#define NUM2PRINT 50 + print_hex_dump(KERN_DEBUG, "br2684: skb: ", DUMP_OFFSET, + 16, 1, skb->data, min(NUM2PRINT, skb->len), true); +#endif +} + +#define BR2684_ETHERTYPE_LEN 2 +#define BR2684_PAD_LEN 2 + +#define LLC 0xaa, 0xaa, 0x03 +#define SNAP_BRIDGED 0x00, 0x80, 0xc2 +#define SNAP_ROUTED 0x00, 0x00, 0x00 +#define PID_ETHERNET 
0x00, 0x07 +#define ETHERTYPE_IPV4 0x08, 0x00 +#define ETHERTYPE_IPV6 0x86, 0xdd +#define PAD_BRIDGED 0x00, 0x00 + +static const unsigned char ethertype_ipv4[] = { ETHERTYPE_IPV4 }; +static const unsigned char ethertype_ipv6[] = { ETHERTYPE_IPV6 }; +static const unsigned char llc_oui_pid_pad[] = + { LLC, SNAP_BRIDGED, PID_ETHERNET, PAD_BRIDGED }; +static const unsigned char llc_oui_ipv4[] = { LLC, SNAP_ROUTED, ETHERTYPE_IPV4 }; +static const unsigned char llc_oui_ipv6[] = { LLC, SNAP_ROUTED, ETHERTYPE_IPV6 }; + +enum br2684_encaps { + e_vc = BR2684_ENCAPS_VC, + e_llc = BR2684_ENCAPS_LLC, +}; + +struct br2684_vcc { + struct atm_vcc *atmvcc; + struct net_device *device; + /* keep old push, pop functions for chaining */ + void (*old_push)(struct atm_vcc *vcc, struct sk_buff *skb); + void (*old_pop)(struct atm_vcc *vcc, struct sk_buff *skb); + enum br2684_encaps encaps; + struct list_head brvccs; +#ifdef CONFIG_ATM_BR2684_IPFILTER + struct br2684_filter filter; +#endif /* CONFIG_ATM_BR2684_IPFILTER */ + unsigned copies_needed, copies_failed; +}; + +struct br2684_dev { + struct net_device *net_dev; + struct list_head br2684_devs; + int number; + struct list_head brvccs; /* one device <=> one vcc (before xmas) */ + int mac_was_set; + enum br2684_payload payload; +}; + +/* + * This lock should be held for writing any time the list of devices or + * their attached vcc's could be altered. It should be held for reading + * any time these are being queried. Note that we sometimes need to + * do read-locking under interrupt context, so write locking must block + * the current CPU's interrupts + */ +static DEFINE_RWLOCK(devs_lock); + +static LIST_HEAD(br2684_devs); + +static inline struct br2684_dev *BRPRIV(const struct net_device *net_dev) +{ + return netdev_priv(net_dev); +} + +static inline struct net_device *list_entry_brdev(const struct list_head *le) +{ + return list_entry(le, struct br2684_dev, br2684_devs)->net_dev; +} + +static inline struct br2684_vcc *BR2684_VCC(const struct atm_vcc *atmvcc) +{ + return (struct br2684_vcc *)(atmvcc->user_back); +} + +static inline struct br2684_vcc *list_entry_brvcc(const struct list_head *le) +{ + return list_entry(le, struct br2684_vcc, brvccs); +} + +/* Caller should hold read_lock(&devs_lock) */ +static struct net_device *br2684_find_dev(const struct br2684_if_spec *s) +{ + struct list_head *lh; + struct net_device *net_dev; + switch (s->method) { + case BR2684_FIND_BYNUM: + list_for_each(lh, &br2684_devs) { + net_dev = list_entry_brdev(lh); + if (BRPRIV(net_dev)->number == s->spec.devnum) + return net_dev; + } + break; + case BR2684_FIND_BYIFNAME: + list_for_each(lh, &br2684_devs) { + net_dev = list_entry_brdev(lh); + if (!strncmp(net_dev->name, s->spec.ifname, IFNAMSIZ)) + return net_dev; + } + break; + } + return NULL; +} + +static int atm_dev_event(struct notifier_block *this, unsigned long event, + void *arg) +{ + struct atm_dev *atm_dev = arg; + struct list_head *lh; + struct net_device *net_dev; + struct br2684_vcc *brvcc; + struct atm_vcc *atm_vcc; + unsigned long flags; + + pr_debug("event=%ld dev=%p\n", event, atm_dev); + + read_lock_irqsave(&devs_lock, flags); + list_for_each(lh, &br2684_devs) { + net_dev = list_entry_brdev(lh); + + list_for_each_entry(brvcc, &BRPRIV(net_dev)->brvccs, brvccs) { + atm_vcc = brvcc->atmvcc; + if (atm_vcc && brvcc->atmvcc->dev == atm_dev) { + + if (atm_vcc->dev->signal == ATM_PHY_SIG_LOST) + netif_carrier_off(net_dev); + else + netif_carrier_on(net_dev); + + } + } + } + read_unlock_irqrestore(&devs_lock, flags); + 
+ return NOTIFY_DONE; +} + +static struct notifier_block atm_dev_notifier = { + .notifier_call = atm_dev_event, +}; + +/* chained vcc->pop function. Check if we should wake the netif_queue */ +static void br2684_pop(struct atm_vcc *vcc, struct sk_buff *skb) +{ + struct br2684_vcc *brvcc = BR2684_VCC(vcc); + struct net_device *net_dev = skb->dev; + + pr_debug("(vcc %p ; net_dev %p )\n", vcc, net_dev); + brvcc->old_pop(vcc, skb); + + if (!net_dev) + return; + + if (atm_may_send(vcc, 0)) + netif_wake_queue(net_dev); + +} +/* + * Send a packet out a particular vcc. Not to useful right now, but paves + * the way for multiple vcc's per itf. Returns true if we can send, + * otherwise false + */ +static int br2684_xmit_vcc(struct sk_buff *skb, struct net_device *dev, + struct br2684_vcc *brvcc) +{ + struct br2684_dev *brdev = BRPRIV(dev); + struct atm_vcc *atmvcc; + int minheadroom = (brvcc->encaps == e_llc) ? 10 : 2; + + if (skb_headroom(skb) < minheadroom) { + struct sk_buff *skb2 = skb_realloc_headroom(skb, minheadroom); + brvcc->copies_needed++; + dev_kfree_skb(skb); + if (skb2 == NULL) { + brvcc->copies_failed++; + return 0; + } + skb = skb2; + } + + if (brvcc->encaps == e_llc) { + if (brdev->payload == p_bridged) { + skb_push(skb, sizeof(llc_oui_pid_pad)); + skb_copy_to_linear_data(skb, llc_oui_pid_pad, + sizeof(llc_oui_pid_pad)); + } else if (brdev->payload == p_routed) { + unsigned short prot = ntohs(skb->protocol); + + skb_push(skb, sizeof(llc_oui_ipv4)); + switch (prot) { + case ETH_P_IP: + skb_copy_to_linear_data(skb, llc_oui_ipv4, + sizeof(llc_oui_ipv4)); + break; + case ETH_P_IPV6: + skb_copy_to_linear_data(skb, llc_oui_ipv6, + sizeof(llc_oui_ipv6)); + break; + default: + dev_kfree_skb(skb); + return 0; + } + } + } else { /* e_vc */ + if (brdev->payload == p_bridged) { + skb_push(skb, 2); + memset(skb->data, 0, 2); + } + } + skb_debug(skb); + + ATM_SKB(skb)->vcc = atmvcc = brvcc->atmvcc; + pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, atmvcc, atmvcc->dev); + atomic_add(skb->truesize, &sk_atm(atmvcc)->sk_wmem_alloc); + ATM_SKB(skb)->atm_options = atmvcc->atm_options; + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; + atmvcc->send(atmvcc, skb); + + if (!atm_may_send(atmvcc, 0)) { + netif_stop_queue(brvcc->device); + /*check for race with br2684_pop*/ + if (atm_may_send(atmvcc, 0)) + netif_start_queue(brvcc->device); + } + + return 1; +} + +static inline struct br2684_vcc *pick_outgoing_vcc(const struct sk_buff *skb, + const struct br2684_dev *brdev) +{ + return list_empty(&brdev->brvccs) ? NULL : list_entry_brvcc(brdev->brvccs.next); /* 1 vcc/dev right now */ +} + +static netdev_tx_t br2684_start_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct br2684_dev *brdev = BRPRIV(dev); + struct br2684_vcc *brvcc; + + pr_debug("skb_dst(skb)=%p\n", skb_dst(skb)); + read_lock(&devs_lock); + brvcc = pick_outgoing_vcc(skb, brdev); + if (brvcc == NULL) { + pr_debug("no vcc attached to dev %s\n", dev->name); + dev->stats.tx_errors++; + dev->stats.tx_carrier_errors++; + /* netif_stop_queue(dev); */ + dev_kfree_skb(skb); + read_unlock(&devs_lock); + return NETDEV_TX_OK; + } + if (!br2684_xmit_vcc(skb, dev, brvcc)) { + /* + * We should probably use netif_*_queue() here, but that + * involves added complication. We need to walk before + * we can run. + * + * Don't free here! this pointer might be no longer valid! 
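For reference, in the default LLC configuration with a bridged payload the header that br2684_xmit_vcc() prepends is exactly the ten bytes assembled from the LLC, SNAP_BRIDGED, PID_ETHERNET and PAD_BRIDGED macros above. The sketch below only spells out that layout and is not part of the driver:

/*
 * LLC/SNAP bridged-Ethernet encapsulation (RFC 2684) as transmitted by
 * br2684_xmit_vcc(); the original Ethernet frame follows unchanged:
 *
 *	0xaa 0xaa 0x03	LLC: SNAP DSAP/SSAP, Unnumbered Information
 *	0x00 0x80 0xc2	SNAP OUI: IEEE 802.1 organisation code
 *	0x00 0x07	PID: Ethernet without preserved FCS
 *	0x00 0x00	pad
 *	dst MAC, src MAC, EtherType, payload ...
 */
static const unsigned char rfc2684_bridged_llc_hdr[] = {
	0xaa, 0xaa, 0x03, 0x00, 0x80, 0xc2, 0x00, 0x07, 0x00, 0x00,
};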
+ */ + dev->stats.tx_errors++; + dev->stats.tx_fifo_errors++; + } + read_unlock(&devs_lock); + return NETDEV_TX_OK; +} + +/* + * We remember when the MAC gets set, so we don't override it later with + * the ESI of the ATM card of the first VC + */ +static int br2684_mac_addr(struct net_device *dev, void *p) +{ + int err = eth_mac_addr(dev, p); + if (!err) + BRPRIV(dev)->mac_was_set = 1; + return err; +} + +#ifdef CONFIG_ATM_BR2684_IPFILTER +/* this IOCTL is experimental. */ +static int br2684_setfilt(struct atm_vcc *atmvcc, void __user * arg) +{ + struct br2684_vcc *brvcc; + struct br2684_filter_set fs; + + if (copy_from_user(&fs, arg, sizeof fs)) + return -EFAULT; + if (fs.ifspec.method != BR2684_FIND_BYNOTHING) { + /* + * This is really a per-vcc thing, but we can also search + * by device. + */ + struct br2684_dev *brdev; + read_lock(&devs_lock); + brdev = BRPRIV(br2684_find_dev(&fs.ifspec)); + if (brdev == NULL || list_empty(&brdev->brvccs) || + brdev->brvccs.next != brdev->brvccs.prev) /* >1 VCC */ + brvcc = NULL; + else + brvcc = list_entry_brvcc(brdev->brvccs.next); + read_unlock(&devs_lock); + if (brvcc == NULL) + return -ESRCH; + } else + brvcc = BR2684_VCC(atmvcc); + memcpy(&brvcc->filter, &fs.filter, sizeof(brvcc->filter)); + return 0; +} + +/* Returns 1 if packet should be dropped */ +static inline int +packet_fails_filter(__be16 type, struct br2684_vcc *brvcc, struct sk_buff *skb) +{ + if (brvcc->filter.netmask == 0) + return 0; /* no filter in place */ + if (type == htons(ETH_P_IP) && + (((struct iphdr *)(skb->data))->daddr & brvcc->filter. + netmask) == brvcc->filter.prefix) + return 0; + if (type == htons(ETH_P_ARP)) + return 0; + /* + * TODO: we should probably filter ARPs too.. don't want to have + * them returning values that don't make sense, or is that ok? + */ + return 1; /* drop */ +} +#endif /* CONFIG_ATM_BR2684_IPFILTER */ + +static void br2684_close_vcc(struct br2684_vcc *brvcc) +{ + pr_debug("removing VCC %p from dev %p\n", brvcc, brvcc->device); + write_lock_irq(&devs_lock); + list_del(&brvcc->brvccs); + write_unlock_irq(&devs_lock); + brvcc->atmvcc->user_back = NULL; /* what about vcc->recvq ??? 
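The filter installed by br2684_setfilt() above is configured from user space with an ioctl on the ATM socket that carries the VC. A hedged sketch of that call, assuming the definitions exported in <linux/atmbr2684.h>; the 10.0.0.0/24 prefix is a placeholder:

#include <string.h>
#include <arpa/inet.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/atmbr2684.h>	/* struct br2684_filter_set, BR2684_SETFILT */

/*
 * Restrict an already-attached br2684 VC so that only IPv4 packets to
 * 10.0.0.0/24 (plus ARP, which packet_fails_filter() always lets through)
 * are delivered.
 */
static int set_vc_filter(int atm_fd)
{
	struct br2684_filter_set fs;

	memset(&fs, 0, sizeof(fs));
	fs.ifspec.method = BR2684_FIND_BYNOTHING;	/* apply to this VC */
	fs.filter.prefix = inet_addr("10.0.0.0");
	fs.filter.netmask = inet_addr("255.255.255.0");

	return ioctl(atm_fd, BR2684_SETFILT, &fs);
}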
*/ + brvcc->old_push(brvcc->atmvcc, NULL); /* pass on the bad news */ + kfree(brvcc); + module_put(THIS_MODULE); +} + +/* when AAL5 PDU comes in: */ +static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb) +{ + struct br2684_vcc *brvcc = BR2684_VCC(atmvcc); + struct net_device *net_dev = brvcc->device; + struct br2684_dev *brdev = BRPRIV(net_dev); + + pr_debug("\n"); + + if (unlikely(skb == NULL)) { + /* skb==NULL means VCC is being destroyed */ + br2684_close_vcc(brvcc); + if (list_empty(&brdev->brvccs)) { + write_lock_irq(&devs_lock); + list_del(&brdev->br2684_devs); + write_unlock_irq(&devs_lock); + unregister_netdev(net_dev); + free_netdev(net_dev); + } + return; + } + + skb_debug(skb); + atm_return(atmvcc, skb->truesize); + pr_debug("skb from brdev %p\n", brdev); + if (brvcc->encaps == e_llc) { + + if (skb->len > 7 && skb->data[7] == 0x01) + __skb_trim(skb, skb->len - 4); + + /* accept packets that have "ipv[46]" in the snap header */ + if ((skb->len >= (sizeof(llc_oui_ipv4))) && + (memcmp(skb->data, llc_oui_ipv4, + sizeof(llc_oui_ipv4) - BR2684_ETHERTYPE_LEN) == 0)) { + if (memcmp(skb->data + 6, ethertype_ipv6, + sizeof(ethertype_ipv6)) == 0) + skb->protocol = htons(ETH_P_IPV6); + else if (memcmp(skb->data + 6, ethertype_ipv4, + sizeof(ethertype_ipv4)) == 0) + skb->protocol = htons(ETH_P_IP); + else + goto error; + skb_pull(skb, sizeof(llc_oui_ipv4)); + skb_reset_network_header(skb); + skb->pkt_type = PACKET_HOST; + /* + * Let us waste some time for checking the encapsulation. + * Note, that only 7 char is checked so frames with a valid FCS + * are also accepted (but FCS is not checked of course). + */ + } else if ((skb->len >= sizeof(llc_oui_pid_pad)) && + (memcmp(skb->data, llc_oui_pid_pad, 7) == 0)) { + skb_pull(skb, sizeof(llc_oui_pid_pad)); + skb->protocol = eth_type_trans(skb, net_dev); + } else + goto error; + + } else { /* e_vc */ + if (brdev->payload == p_routed) { + struct iphdr *iph; + + skb_reset_network_header(skb); + iph = ip_hdr(skb); + if (iph->version == 4) + skb->protocol = htons(ETH_P_IP); + else if (iph->version == 6) + skb->protocol = htons(ETH_P_IPV6); + else + goto error; + skb->pkt_type = PACKET_HOST; + } else { /* p_bridged */ + /* first 2 chars should be 0 */ + if (*((u16 *) (skb->data)) != 0) + goto error; + skb_pull(skb, BR2684_PAD_LEN); + skb->protocol = eth_type_trans(skb, net_dev); + } + } + +#ifdef CONFIG_ATM_BR2684_IPFILTER + if (unlikely(packet_fails_filter(skb->protocol, brvcc, skb))) + goto dropped; +#endif /* CONFIG_ATM_BR2684_IPFILTER */ + skb->dev = net_dev; + ATM_SKB(skb)->vcc = atmvcc; /* needed ? */ + pr_debug("received packet's protocol: %x\n", ntohs(skb->protocol)); + skb_debug(skb); + /* sigh, interface is down? 
*/ + if (unlikely(!(net_dev->flags & IFF_UP))) + goto dropped; + net_dev->stats.rx_packets++; + net_dev->stats.rx_bytes += skb->len; + memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data)); + netif_rx(skb); + return; + +dropped: + net_dev->stats.rx_dropped++; + goto free_skb; +error: + net_dev->stats.rx_errors++; +free_skb: + dev_kfree_skb(skb); +} + +/* + * Assign a vcc to a dev + * Note: we do not have explicit unassign, but look at _push() + */ +static int br2684_regvcc(struct atm_vcc *atmvcc, void __user * arg) +{ + struct sk_buff_head queue; + int err; + struct br2684_vcc *brvcc; + struct sk_buff *skb, *tmp; + struct sk_buff_head *rq; + struct br2684_dev *brdev; + struct net_device *net_dev; + struct atm_backend_br2684 be; + unsigned long flags; + + if (copy_from_user(&be, arg, sizeof be)) + return -EFAULT; + brvcc = kzalloc(sizeof(struct br2684_vcc), GFP_KERNEL); + if (!brvcc) + return -ENOMEM; + write_lock_irq(&devs_lock); + net_dev = br2684_find_dev(&be.ifspec); + if (net_dev == NULL) { + pr_err("tried to attach to non-existent device\n"); + err = -ENXIO; + goto error; + } + brdev = BRPRIV(net_dev); + if (atmvcc->push == NULL) { + err = -EBADFD; + goto error; + } + if (!list_empty(&brdev->brvccs)) { + /* Only 1 VCC/dev right now */ + err = -EEXIST; + goto error; + } + if (be.fcs_in != BR2684_FCSIN_NO || + be.fcs_out != BR2684_FCSOUT_NO || + be.fcs_auto || be.has_vpiid || be.send_padding || + (be.encaps != BR2684_ENCAPS_VC && + be.encaps != BR2684_ENCAPS_LLC) || + be.min_size != 0) { + err = -EINVAL; + goto error; + } + pr_debug("vcc=%p, encaps=%d, brvcc=%p\n", atmvcc, be.encaps, brvcc); + if (list_empty(&brdev->brvccs) && !brdev->mac_was_set) { + unsigned char *esi = atmvcc->dev->esi; + if (esi[0] | esi[1] | esi[2] | esi[3] | esi[4] | esi[5]) + memcpy(net_dev->dev_addr, esi, net_dev->addr_len); + else + net_dev->dev_addr[2] = 1; + } + list_add(&brvcc->brvccs, &brdev->brvccs); + write_unlock_irq(&devs_lock); + brvcc->device = net_dev; + brvcc->atmvcc = atmvcc; + atmvcc->user_back = brvcc; + brvcc->encaps = (enum br2684_encaps)be.encaps; + brvcc->old_push = atmvcc->push; + brvcc->old_pop = atmvcc->pop; + barrier(); + atmvcc->push = br2684_push; + atmvcc->pop = br2684_pop; + + __skb_queue_head_init(&queue); + rq = &sk_atm(atmvcc)->sk_receive_queue; + + spin_lock_irqsave(&rq->lock, flags); + skb_queue_splice_init(rq, &queue); + spin_unlock_irqrestore(&rq->lock, flags); + + skb_queue_walk_safe(&queue, skb, tmp) { + struct net_device *dev; + + br2684_push(atmvcc, skb); + dev = skb->dev; + + dev->stats.rx_bytes -= skb->len; + dev->stats.rx_packets--; + } + + /* initialize netdev carrier state */ + if (atmvcc->dev->signal == ATM_PHY_SIG_LOST) + netif_carrier_off(net_dev); + else + netif_carrier_on(net_dev); + + __module_get(THIS_MODULE); + return 0; + +error: + write_unlock_irq(&devs_lock); + kfree(brvcc); + return err; +} + +static const struct net_device_ops br2684_netdev_ops = { + .ndo_start_xmit = br2684_start_xmit, + .ndo_set_mac_address = br2684_mac_addr, + .ndo_change_mtu = eth_change_mtu, + .ndo_validate_addr = eth_validate_addr, +}; + +static const struct net_device_ops br2684_netdev_ops_routed = { + .ndo_start_xmit = br2684_start_xmit, + .ndo_set_mac_address = br2684_mac_addr, + .ndo_change_mtu = eth_change_mtu +}; + +static void br2684_setup(struct net_device *netdev) +{ + struct br2684_dev *brdev = BRPRIV(netdev); + + ether_setup(netdev); + brdev->net_dev = netdev; + + netdev->netdev_ops = &br2684_netdev_ops; + + INIT_LIST_HEAD(&brdev->brvccs); +} + +static void 
br2684_setup_routed(struct net_device *netdev) +{ + struct br2684_dev *brdev = BRPRIV(netdev); + + brdev->net_dev = netdev; + netdev->hard_header_len = 0; + netdev->netdev_ops = &br2684_netdev_ops_routed; + netdev->addr_len = 0; + netdev->mtu = 1500; + netdev->type = ARPHRD_PPP; + netdev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; + netdev->tx_queue_len = 100; + INIT_LIST_HEAD(&brdev->brvccs); +} + +static int br2684_create(void __user *arg) +{ + int err; + struct net_device *netdev; + struct br2684_dev *brdev; + struct atm_newif_br2684 ni; + enum br2684_payload payload; + + pr_debug("\n"); + + if (copy_from_user(&ni, arg, sizeof ni)) + return -EFAULT; + + if (ni.media & BR2684_FLAG_ROUTED) + payload = p_routed; + else + payload = p_bridged; + ni.media &= 0xffff; /* strip flags */ + + if (ni.media != BR2684_MEDIA_ETHERNET || ni.mtu != 1500) + return -EINVAL; + + netdev = alloc_netdev(sizeof(struct br2684_dev), + ni.ifname[0] ? ni.ifname : "nas%d", + (payload == p_routed) ? + br2684_setup_routed : br2684_setup); + if (!netdev) + return -ENOMEM; + + brdev = BRPRIV(netdev); + + pr_debug("registered netdev %s\n", netdev->name); + /* open, stop, do_ioctl ? */ + err = register_netdev(netdev); + if (err < 0) { + pr_err("register_netdev failed\n"); + free_netdev(netdev); + return err; + } + + write_lock_irq(&devs_lock); + + brdev->payload = payload; + + if (list_empty(&br2684_devs)) { + /* 1st br2684 device */ + brdev->number = 1; + } else + brdev->number = BRPRIV(list_entry_brdev(br2684_devs.prev))->number + 1; + + list_add_tail(&brdev->br2684_devs, &br2684_devs); + write_unlock_irq(&devs_lock); + return 0; +} + +/* + * This handles ioctls actually performed on our vcc - we must return + * -ENOIOCTLCMD for any unrecognized ioctl + */ +static int br2684_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + struct atm_vcc *atmvcc = ATM_SD(sock); + void __user *argp = (void __user *)arg; + atm_backend_t b; + + int err; + switch (cmd) { + case ATM_SETBACKEND: + case ATM_NEWBACKENDIF: + err = get_user(b, (atm_backend_t __user *) argp); + if (err) + return -EFAULT; + if (b != ATM_BACKEND_BR2684) + return -ENOIOCTLCMD; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (cmd == ATM_SETBACKEND) + return br2684_regvcc(atmvcc, argp); + else + return br2684_create(argp); +#ifdef CONFIG_ATM_BR2684_IPFILTER + case BR2684_SETFILT: + if (atmvcc->push != br2684_push) + return -ENOIOCTLCMD; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + err = br2684_setfilt(atmvcc, argp); + + return err; +#endif /* CONFIG_ATM_BR2684_IPFILTER */ + } + return -ENOIOCTLCMD; +} + +static struct atm_ioctl br2684_ioctl_ops = { + .owner = THIS_MODULE, + .ioctl = br2684_ioctl, +}; + +#ifdef CONFIG_PROC_FS +static void *br2684_seq_start(struct seq_file *seq, loff_t * pos) + __acquires(devs_lock) +{ + read_lock(&devs_lock); + return seq_list_start(&br2684_devs, *pos); +} + +static void *br2684_seq_next(struct seq_file *seq, void *v, loff_t * pos) +{ + return seq_list_next(v, &br2684_devs, pos); +} + +static void br2684_seq_stop(struct seq_file *seq, void *v) + __releases(devs_lock) +{ + read_unlock(&devs_lock); +} + +static int br2684_seq_show(struct seq_file *seq, void *v) +{ + const struct br2684_dev *brdev = list_entry(v, struct br2684_dev, + br2684_devs); + const struct net_device *net_dev = brdev->net_dev; + const struct br2684_vcc *brvcc; + + seq_printf(seq, "dev %.16s: num=%d, mac=%pM (%s)\n", + net_dev->name, + brdev->number, + net_dev->dev_addr, + brdev->mac_was_set ? 
"set" : "auto"); + + list_for_each_entry(brvcc, &brdev->brvccs, brvccs) { + seq_printf(seq, " vcc %d.%d.%d: encaps=%s payload=%s" + ", failed copies %u/%u" + "\n", brvcc->atmvcc->dev->number, + brvcc->atmvcc->vpi, brvcc->atmvcc->vci, + (brvcc->encaps == e_llc) ? "LLC" : "VC", + (brdev->payload == p_bridged) ? "bridged" : "routed", + brvcc->copies_failed, brvcc->copies_needed); +#ifdef CONFIG_ATM_BR2684_IPFILTER +#define b1(var, byte) ((u8 *) &brvcc->filter.var)[byte] +#define bs(var) b1(var, 0), b1(var, 1), b1(var, 2), b1(var, 3) + if (brvcc->filter.netmask != 0) + seq_printf(seq, " filter=%d.%d.%d.%d/" + "%d.%d.%d.%d\n", bs(prefix), bs(netmask)); +#undef bs +#undef b1 +#endif /* CONFIG_ATM_BR2684_IPFILTER */ + } + return 0; +} + +static const struct seq_operations br2684_seq_ops = { + .start = br2684_seq_start, + .next = br2684_seq_next, + .stop = br2684_seq_stop, + .show = br2684_seq_show, +}; + +static int br2684_proc_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &br2684_seq_ops); +} + +static const struct file_operations br2684_proc_ops = { + .owner = THIS_MODULE, + .open = br2684_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +extern struct proc_dir_entry *atm_proc_root; /* from proc.c */ +#endif /* CONFIG_PROC_FS */ + +static int __init br2684_init(void) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *p; + p = proc_create("br2684", 0, atm_proc_root, &br2684_proc_ops); + if (p == NULL) + return -ENOMEM; +#endif + register_atm_ioctl(&br2684_ioctl_ops); + register_atmdevice_notifier(&atm_dev_notifier); + return 0; +} + +static void __exit br2684_exit(void) +{ + struct net_device *net_dev; + struct br2684_dev *brdev; + struct br2684_vcc *brvcc; + deregister_atm_ioctl(&br2684_ioctl_ops); + +#ifdef CONFIG_PROC_FS + remove_proc_entry("br2684", atm_proc_root); +#endif + + + unregister_atmdevice_notifier(&atm_dev_notifier); + + while (!list_empty(&br2684_devs)) { + net_dev = list_entry_brdev(br2684_devs.next); + brdev = BRPRIV(net_dev); + while (!list_empty(&brdev->brvccs)) { + brvcc = list_entry_brvcc(brdev->brvccs.next); + br2684_close_vcc(brvcc); + } + + list_del(&brdev->br2684_devs); + unregister_netdev(net_dev); + free_netdev(net_dev); + } +} + +module_init(br2684_init); +module_exit(br2684_exit); + +MODULE_AUTHOR("Marcell GAL"); +MODULE_DESCRIPTION("RFC2684 bridged protocols over ATM/AAL5"); +MODULE_LICENSE("GPL"); diff --git a/net/atm/clip.c b/net/atm/clip.c new file mode 100644 index 00000000..5889074e --- /dev/null +++ b/net/atm/clip.c @@ -0,0 +1,1009 @@ +/* net/atm/clip.c - RFC1577 Classical IP over ATM */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ + +#include +#include +#include /* for UINT_MAX */ +#include +#include +#include +#include +#include +#include +#include /* for some manifest constants */ +#include +#include +#include +#include +#include +#include +#include /* for net/route.h */ +#include /* for struct sockaddr_in */ +#include /* for IFF_UP */ +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for struct rtable and routing */ +#include /* icmp_send */ +#include /* for HZ */ +#include +#include /* for htons etc. 
*/ +#include /* save/restore_flags */ +#include + +#include "common.h" +#include "resources.h" +#include + +static struct net_device *clip_devs; +static struct atm_vcc *atmarpd; +static struct neigh_table clip_tbl; +static struct timer_list idle_timer; + +static int to_atmarpd(enum atmarp_ctrl_type type, int itf, __be32 ip) +{ + struct sock *sk; + struct atmarp_ctrl *ctrl; + struct sk_buff *skb; + + pr_debug("(%d)\n", type); + if (!atmarpd) + return -EUNATCH; + skb = alloc_skb(sizeof(struct atmarp_ctrl), GFP_ATOMIC); + if (!skb) + return -ENOMEM; + ctrl = (struct atmarp_ctrl *)skb_put(skb, sizeof(struct atmarp_ctrl)); + ctrl->type = type; + ctrl->itf_num = itf; + ctrl->ip = ip; + atm_force_charge(atmarpd, skb->truesize); + + sk = sk_atm(atmarpd); + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); + return 0; +} + +static void link_vcc(struct clip_vcc *clip_vcc, struct atmarp_entry *entry) +{ + pr_debug("%p to entry %p (neigh %p)\n", clip_vcc, entry, entry->neigh); + clip_vcc->entry = entry; + clip_vcc->xoff = 0; /* @@@ may overrun buffer by one packet */ + clip_vcc->next = entry->vccs; + entry->vccs = clip_vcc; + entry->neigh->used = jiffies; +} + +static void unlink_clip_vcc(struct clip_vcc *clip_vcc) +{ + struct atmarp_entry *entry = clip_vcc->entry; + struct clip_vcc **walk; + + if (!entry) { + pr_crit("!clip_vcc->entry (clip_vcc %p)\n", clip_vcc); + return; + } + netif_tx_lock_bh(entry->neigh->dev); /* block clip_start_xmit() */ + entry->neigh->used = jiffies; + for (walk = &entry->vccs; *walk; walk = &(*walk)->next) + if (*walk == clip_vcc) { + int error; + + *walk = clip_vcc->next; /* atomic */ + clip_vcc->entry = NULL; + if (clip_vcc->xoff) + netif_wake_queue(entry->neigh->dev); + if (entry->vccs) + goto out; + entry->expires = jiffies - 1; + /* force resolution or expiration */ + error = neigh_update(entry->neigh, NULL, NUD_NONE, + NEIGH_UPDATE_F_ADMIN); + if (error) + pr_crit("neigh_update failed with %d\n", error); + goto out; + } + pr_crit("ATMARP: failed (entry %p, vcc 0x%p)\n", entry, clip_vcc); +out: + netif_tx_unlock_bh(entry->neigh->dev); +} + +/* The neighbour entry n->lock is held. 
*/ +static int neigh_check_cb(struct neighbour *n) +{ + struct atmarp_entry *entry = NEIGH2ENTRY(n); + struct clip_vcc *cv; + + for (cv = entry->vccs; cv; cv = cv->next) { + unsigned long exp = cv->last_use + cv->idle_timeout; + + if (cv->idle_timeout && time_after(jiffies, exp)) { + pr_debug("releasing vcc %p->%p of entry %p\n", + cv, cv->vcc, entry); + vcc_release_async(cv->vcc, -ETIMEDOUT); + } + } + + if (entry->vccs || time_before(jiffies, entry->expires)) + return 0; + + if (atomic_read(&n->refcnt) > 1) { + struct sk_buff *skb; + + pr_debug("destruction postponed with ref %d\n", + atomic_read(&n->refcnt)); + + while ((skb = skb_dequeue(&n->arp_queue)) != NULL) + dev_kfree_skb(skb); + + return 0; + } + + pr_debug("expired neigh %p\n", n); + return 1; +} + +static void idle_timer_check(unsigned long dummy) +{ + write_lock(&clip_tbl.lock); + __neigh_for_each_release(&clip_tbl, neigh_check_cb); + mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ); + write_unlock(&clip_tbl.lock); +} + +static int clip_arp_rcv(struct sk_buff *skb) +{ + struct atm_vcc *vcc; + + pr_debug("\n"); + vcc = ATM_SKB(skb)->vcc; + if (!vcc || !atm_charge(vcc, skb->truesize)) { + dev_kfree_skb_any(skb); + return 0; + } + pr_debug("pushing to %p\n", vcc); + pr_debug("using %p\n", CLIP_VCC(vcc)->old_push); + CLIP_VCC(vcc)->old_push(vcc, skb); + return 0; +} + +static const unsigned char llc_oui[] = { + 0xaa, /* DSAP: non-ISO */ + 0xaa, /* SSAP: non-ISO */ + 0x03, /* Ctrl: Unnumbered Information Command PDU */ + 0x00, /* OUI: EtherType */ + 0x00, + 0x00 +}; + +static void clip_push(struct atm_vcc *vcc, struct sk_buff *skb) +{ + struct clip_vcc *clip_vcc = CLIP_VCC(vcc); + + pr_debug("\n"); + if (!skb) { + pr_debug("removing VCC %p\n", clip_vcc); + if (clip_vcc->entry) + unlink_clip_vcc(clip_vcc); + clip_vcc->old_push(vcc, NULL); /* pass on the bad news */ + kfree(clip_vcc); + return; + } + atm_return(vcc, skb->truesize); + skb->dev = clip_vcc->entry ? clip_vcc->entry->neigh->dev : clip_devs; + /* clip_vcc->entry == NULL if we don't have an IP address yet */ + if (!skb->dev) { + dev_kfree_skb_any(skb); + return; + } + ATM_SKB(skb)->vcc = vcc; + skb_reset_mac_header(skb); + if (!clip_vcc->encap || + skb->len < RFC1483LLC_LEN || + memcmp(skb->data, llc_oui, sizeof(llc_oui))) + skb->protocol = htons(ETH_P_IP); + else { + skb->protocol = ((__be16 *)skb->data)[3]; + skb_pull(skb, RFC1483LLC_LEN); + if (skb->protocol == htons(ETH_P_ARP)) { + skb->dev->stats.rx_packets++; + skb->dev->stats.rx_bytes += skb->len; + clip_arp_rcv(skb); + return; + } + } + clip_vcc->last_use = jiffies; + skb->dev->stats.rx_packets++; + skb->dev->stats.rx_bytes += skb->len; + memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data)); + netif_rx(skb); +} + +/* + * Note: these spinlocks _must_not_ block on non-SMP. The only goal is that + * clip_pop is atomic with respect to the critical section in clip_start_xmit. 
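The llc_oui[] template above is the routed counterpart of the bridged header used by br2684: the OUI is zero, so the two bytes that follow are a bare EtherType, which is why clip_push() recovers the protocol from ((__be16 *)skb->data)[3] and why clip_start_xmit() later writes skb->protocol into the same slot. Spelled out here for illustration only, with an IPv4 payload:

/*
 * RFC 1483/2684 routed LLC/SNAP encapsulation as used by CLIP; the IP
 * datagram follows immediately after the header:
 *
 *	0xaa 0xaa 0x03	LLC: SNAP DSAP/SSAP, Unnumbered Information
 *	0x00 0x00 0x00	SNAP OUI: zero, EtherType follows
 *	0x08 0x00	EtherType: IPv4 (ETH_P_IP)
 */
static const unsigned char clip_llc_ipv4_hdr[] = {
	0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00, 0x08, 0x00,
};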
+ */ + +static void clip_pop(struct atm_vcc *vcc, struct sk_buff *skb) +{ + struct clip_vcc *clip_vcc = CLIP_VCC(vcc); + struct net_device *dev = skb->dev; + int old; + unsigned long flags; + + pr_debug("(vcc %p)\n", vcc); + clip_vcc->old_pop(vcc, skb); + /* skb->dev == NULL in outbound ARP packets */ + if (!dev) + return; + spin_lock_irqsave(&PRIV(dev)->xoff_lock, flags); + if (atm_may_send(vcc, 0)) { + old = xchg(&clip_vcc->xoff, 0); + if (old) + netif_wake_queue(dev); + } + spin_unlock_irqrestore(&PRIV(dev)->xoff_lock, flags); +} + +static void clip_neigh_solicit(struct neighbour *neigh, struct sk_buff *skb) +{ + pr_debug("(neigh %p, skb %p)\n", neigh, skb); + to_atmarpd(act_need, PRIV(neigh->dev)->number, NEIGH2ENTRY(neigh)->ip); +} + +static void clip_neigh_error(struct neighbour *neigh, struct sk_buff *skb) +{ +#ifndef CONFIG_ATM_CLIP_NO_ICMP + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); +#endif + kfree_skb(skb); +} + +static const struct neigh_ops clip_neigh_ops = { + .family = AF_INET, + .solicit = clip_neigh_solicit, + .error_report = clip_neigh_error, + .output = dev_queue_xmit, + .connected_output = dev_queue_xmit, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + +static int clip_constructor(struct neighbour *neigh) +{ + struct atmarp_entry *entry = NEIGH2ENTRY(neigh); + struct net_device *dev = neigh->dev; + struct in_device *in_dev; + struct neigh_parms *parms; + + pr_debug("(neigh %p, entry %p)\n", neigh, entry); + neigh->type = inet_addr_type(&init_net, entry->ip); + if (neigh->type != RTN_UNICAST) + return -EINVAL; + + rcu_read_lock(); + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) { + rcu_read_unlock(); + return -EINVAL; + } + + parms = in_dev->arp_parms; + __neigh_parms_put(neigh->parms); + neigh->parms = neigh_parms_clone(parms); + rcu_read_unlock(); + + neigh->ops = &clip_neigh_ops; + neigh->output = neigh->nud_state & NUD_VALID ? + neigh->ops->connected_output : neigh->ops->output; + entry->neigh = neigh; + entry->vccs = NULL; + entry->expires = jiffies - 1; + return 0; +} + +static u32 clip_hash(const void *pkey, const struct net_device *dev, __u32 rnd) +{ + return jhash_2words(*(u32 *) pkey, dev->ifindex, rnd); +} + +static struct neigh_table clip_tbl = { + .family = AF_INET, + .entry_size = sizeof(struct neighbour)+sizeof(struct atmarp_entry), + .key_len = 4, + .hash = clip_hash, + .constructor = clip_constructor, + .id = "clip_arp_cache", + + /* parameters are copied from ARP ... */ + .parms = { + .tbl = &clip_tbl, + .base_reachable_time = 30 * HZ, + .retrans_time = 1 * HZ, + .gc_staletime = 60 * HZ, + .reachable_time = 30 * HZ, + .delay_probe_time = 5 * HZ, + .queue_len = 3, + .ucast_probes = 3, + .mcast_probes = 3, + .anycast_delay = 1 * HZ, + .proxy_delay = (8 * HZ) / 10, + .proxy_qlen = 64, + .locktime = 1 * HZ, + }, + .gc_interval = 30 * HZ, + .gc_thresh1 = 128, + .gc_thresh2 = 512, + .gc_thresh3 = 1024, +}; + +/* @@@ copy bh locking from arp.c -- need to bh-enable atm code before */ + +/* + * We play with the resolve flag: 0 and 1 have the usual meaning, but -1 means + * to allocate the neighbour entry but not to ask atmarpd for resolution. Also, + * don't increment the usage count. This is used to create entries in + * clip_setentry. 
+ */ + +static int clip_encap(struct atm_vcc *vcc, int mode) +{ + CLIP_VCC(vcc)->encap = mode; + return 0; +} + +static netdev_tx_t clip_start_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct clip_priv *clip_priv = PRIV(dev); + struct dst_entry *dst = skb_dst(skb); + struct atmarp_entry *entry; + struct neighbour *n; + struct atm_vcc *vcc; + int old; + unsigned long flags; + + pr_debug("(skb %p)\n", skb); + if (!dst) { + pr_err("skb_dst(skb) == NULL\n"); + dev_kfree_skb(skb); + dev->stats.tx_dropped++; + return NETDEV_TX_OK; + } + n = dst_get_neighbour(dst); + if (!n) { +#if 0 + n = clip_find_neighbour(skb_dst(skb), 1); + if (!n) { + dev_kfree_skb(skb); /* lost that one */ + dev->stats.tx_dropped++; + return 0; + } + dst_set_neighbour(dst, n); +#endif + pr_err("NO NEIGHBOUR !\n"); + dev_kfree_skb(skb); + dev->stats.tx_dropped++; + return NETDEV_TX_OK; + } + entry = NEIGH2ENTRY(n); + if (!entry->vccs) { + if (time_after(jiffies, entry->expires)) { + /* should be resolved */ + entry->expires = jiffies + ATMARP_RETRY_DELAY * HZ; + to_atmarpd(act_need, PRIV(dev)->number, entry->ip); + } + if (entry->neigh->arp_queue.qlen < ATMARP_MAX_UNRES_PACKETS) + skb_queue_tail(&entry->neigh->arp_queue, skb); + else { + dev_kfree_skb(skb); + dev->stats.tx_dropped++; + } + return NETDEV_TX_OK; + } + pr_debug("neigh %p, vccs %p\n", entry, entry->vccs); + ATM_SKB(skb)->vcc = vcc = entry->vccs->vcc; + pr_debug("using neighbour %p, vcc %p\n", n, vcc); + if (entry->vccs->encap) { + void *here; + + here = skb_push(skb, RFC1483LLC_LEN); + memcpy(here, llc_oui, sizeof(llc_oui)); + ((__be16 *) here)[3] = skb->protocol; + } + atomic_add(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc); + ATM_SKB(skb)->atm_options = vcc->atm_options; + entry->vccs->last_use = jiffies; + pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, vcc, vcc->dev); + old = xchg(&entry->vccs->xoff, 1); /* assume XOFF ... */ + if (old) { + pr_warning("XOFF->XOFF transition\n"); + return NETDEV_TX_OK; + } + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; + vcc->send(vcc, skb); + if (atm_may_send(vcc, 0)) { + entry->vccs->xoff = 0; + return NETDEV_TX_OK; + } + spin_lock_irqsave(&clip_priv->xoff_lock, flags); + netif_stop_queue(dev); /* XOFF -> throttle immediately */ + barrier(); + if (!entry->vccs->xoff) + netif_start_queue(dev); + /* Oh, we just raced with clip_pop. netif_start_queue should be + good enough, because nothing should really be asleep because + of the brief netif_stop_queue. If this isn't true or if it + changes, use netif_wake_queue instead. 
*/ + spin_unlock_irqrestore(&clip_priv->xoff_lock, flags); + return NETDEV_TX_OK; +} + +static int clip_mkip(struct atm_vcc *vcc, int timeout) +{ + struct sk_buff_head *rq, queue; + struct clip_vcc *clip_vcc; + struct sk_buff *skb, *tmp; + unsigned long flags; + + if (!vcc->push) + return -EBADFD; + clip_vcc = kmalloc(sizeof(struct clip_vcc), GFP_KERNEL); + if (!clip_vcc) + return -ENOMEM; + pr_debug("%p vcc %p\n", clip_vcc, vcc); + clip_vcc->vcc = vcc; + vcc->user_back = clip_vcc; + set_bit(ATM_VF_IS_CLIP, &vcc->flags); + clip_vcc->entry = NULL; + clip_vcc->xoff = 0; + clip_vcc->encap = 1; + clip_vcc->last_use = jiffies; + clip_vcc->idle_timeout = timeout * HZ; + clip_vcc->old_push = vcc->push; + clip_vcc->old_pop = vcc->pop; + vcc->push = clip_push; + vcc->pop = clip_pop; + + __skb_queue_head_init(&queue); + rq = &sk_atm(vcc)->sk_receive_queue; + + spin_lock_irqsave(&rq->lock, flags); + skb_queue_splice_init(rq, &queue); + spin_unlock_irqrestore(&rq->lock, flags); + + /* re-process everything received between connection setup and MKIP */ + skb_queue_walk_safe(&queue, skb, tmp) { + if (!clip_devs) { + atm_return(vcc, skb->truesize); + kfree_skb(skb); + } else { + struct net_device *dev = skb->dev; + unsigned int len = skb->len; + + skb_get(skb); + clip_push(vcc, skb); + dev->stats.rx_packets--; + dev->stats.rx_bytes -= len; + kfree_skb(skb); + } + } + return 0; +} + +static int clip_setentry(struct atm_vcc *vcc, __be32 ip) +{ + struct neighbour *neigh; + struct atmarp_entry *entry; + int error; + struct clip_vcc *clip_vcc; + struct rtable *rt; + + if (vcc->push != clip_push) { + pr_warning("non-CLIP VCC\n"); + return -EBADF; + } + clip_vcc = CLIP_VCC(vcc); + if (!ip) { + if (!clip_vcc->entry) { + pr_err("hiding hidden ATMARP entry\n"); + return 0; + } + pr_debug("remove\n"); + unlink_clip_vcc(clip_vcc); + return 0; + } + rt = ip_route_output(&init_net, ip, 0, 1, 0); + if (IS_ERR(rt)) + return PTR_ERR(rt); + neigh = __neigh_lookup(&clip_tbl, &ip, rt->dst.dev, 1); + ip_rt_put(rt); + if (!neigh) + return -ENOMEM; + entry = NEIGH2ENTRY(neigh); + if (entry != clip_vcc->entry) { + if (!clip_vcc->entry) + pr_debug("add\n"); + else { + pr_debug("update\n"); + unlink_clip_vcc(clip_vcc); + } + link_vcc(clip_vcc, entry); + } + error = neigh_update(neigh, llc_oui, NUD_PERMANENT, + NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN); + neigh_release(neigh); + return error; +} + +static const struct net_device_ops clip_netdev_ops = { + .ndo_start_xmit = clip_start_xmit, +}; + +static void clip_setup(struct net_device *dev) +{ + dev->netdev_ops = &clip_netdev_ops; + dev->type = ARPHRD_ATM; + dev->hard_header_len = RFC1483LLC_LEN; + dev->mtu = RFC1626_MTU; + dev->tx_queue_len = 100; /* "normal" queue (packets) */ + /* When using a "real" qdisc, the qdisc determines the queue */ + /* length. tx_queue_len is only used for the default case, */ + /* without any more elaborate queuing. 100 is a reasonable */ + /* compromise between decent burst-tolerance and protection */ + /* against memory hogs. 
*/ + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; +} + +static int clip_create(int number) +{ + struct net_device *dev; + struct clip_priv *clip_priv; + int error; + + if (number != -1) { + for (dev = clip_devs; dev; dev = PRIV(dev)->next) + if (PRIV(dev)->number == number) + return -EEXIST; + } else { + number = 0; + for (dev = clip_devs; dev; dev = PRIV(dev)->next) + if (PRIV(dev)->number >= number) + number = PRIV(dev)->number + 1; + } + dev = alloc_netdev(sizeof(struct clip_priv), "", clip_setup); + if (!dev) + return -ENOMEM; + clip_priv = PRIV(dev); + sprintf(dev->name, "atm%d", number); + spin_lock_init(&clip_priv->xoff_lock); + clip_priv->number = number; + error = register_netdev(dev); + if (error) { + free_netdev(dev); + return error; + } + clip_priv->next = clip_devs; + clip_devs = dev; + pr_debug("registered (net:%s)\n", dev->name); + return number; +} + +static int clip_device_event(struct notifier_block *this, unsigned long event, + void *arg) +{ + struct net_device *dev = arg; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + if (event == NETDEV_UNREGISTER) { + neigh_ifdown(&clip_tbl, dev); + return NOTIFY_DONE; + } + + /* ignore non-CLIP devices */ + if (dev->type != ARPHRD_ATM || dev->netdev_ops != &clip_netdev_ops) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_UP: + pr_debug("NETDEV_UP\n"); + to_atmarpd(act_up, PRIV(dev)->number, 0); + break; + case NETDEV_GOING_DOWN: + pr_debug("NETDEV_DOWN\n"); + to_atmarpd(act_down, PRIV(dev)->number, 0); + break; + case NETDEV_CHANGE: + case NETDEV_CHANGEMTU: + pr_debug("NETDEV_CHANGE*\n"); + to_atmarpd(act_change, PRIV(dev)->number, 0); + break; + } + return NOTIFY_DONE; +} + +static int clip_inet_event(struct notifier_block *this, unsigned long event, + void *ifa) +{ + struct in_device *in_dev; + + in_dev = ((struct in_ifaddr *)ifa)->ifa_dev; + /* + * Transitions are of the down-change-up type, so it's sufficient to + * handle the change on up. 
+ */ + if (event != NETDEV_UP) + return NOTIFY_DONE; + return clip_device_event(this, NETDEV_CHANGE, in_dev->dev); +} + +static struct notifier_block clip_dev_notifier = { + .notifier_call = clip_device_event, +}; + + + +static struct notifier_block clip_inet_notifier = { + .notifier_call = clip_inet_event, +}; + + + +static void atmarpd_close(struct atm_vcc *vcc) +{ + pr_debug("\n"); + + rtnl_lock(); + atmarpd = NULL; + skb_queue_purge(&sk_atm(vcc)->sk_receive_queue); + rtnl_unlock(); + + pr_debug("(done)\n"); + module_put(THIS_MODULE); +} + +static struct atmdev_ops atmarpd_dev_ops = { + .close = atmarpd_close +}; + + +static struct atm_dev atmarpd_dev = { + .ops = &atmarpd_dev_ops, + .type = "arpd", + .number = 999, + .lock = __SPIN_LOCK_UNLOCKED(atmarpd_dev.lock) +}; + + +static int atm_init_atmarp(struct atm_vcc *vcc) +{ + rtnl_lock(); + if (atmarpd) { + rtnl_unlock(); + return -EADDRINUSE; + } + + mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ); + + atmarpd = vcc; + set_bit(ATM_VF_META, &vcc->flags); + set_bit(ATM_VF_READY, &vcc->flags); + /* allow replies and avoid getting closed if signaling dies */ + vcc->dev = &atmarpd_dev; + vcc_insert_socket(sk_atm(vcc)); + vcc->push = NULL; + vcc->pop = NULL; /* crash */ + vcc->push_oam = NULL; /* crash */ + rtnl_unlock(); + return 0; +} + +static int clip_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct atm_vcc *vcc = ATM_SD(sock); + int err = 0; + + switch (cmd) { + case SIOCMKCLIP: + case ATMARPD_CTRL: + case ATMARP_MKIP: + case ATMARP_SETENTRY: + case ATMARP_ENCAP: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + break; + default: + return -ENOIOCTLCMD; + } + + switch (cmd) { + case SIOCMKCLIP: + err = clip_create(arg); + break; + case ATMARPD_CTRL: + err = atm_init_atmarp(vcc); + if (!err) { + sock->state = SS_CONNECTED; + __module_get(THIS_MODULE); + } + break; + case ATMARP_MKIP: + err = clip_mkip(vcc, arg); + break; + case ATMARP_SETENTRY: + err = clip_setentry(vcc, (__force __be32)arg); + break; + case ATMARP_ENCAP: + err = clip_encap(vcc, arg); + break; + } + return err; +} + +static struct atm_ioctl clip_ioctl_ops = { + .owner = THIS_MODULE, + .ioctl = clip_ioctl, +}; + +#ifdef CONFIG_PROC_FS + +static void svc_addr(struct seq_file *seq, struct sockaddr_atmsvc *addr) +{ + static int code[] = { 1, 2, 10, 6, 1, 0 }; + static int e164[] = { 1, 8, 4, 6, 1, 0 }; + + if (*addr->sas_addr.pub) { + seq_printf(seq, "%s", addr->sas_addr.pub); + if (*addr->sas_addr.prv) + seq_putc(seq, '+'); + } else if (!*addr->sas_addr.prv) { + seq_printf(seq, "%s", "(none)"); + return; + } + if (*addr->sas_addr.prv) { + unsigned char *prv = addr->sas_addr.prv; + int *fields; + int i, j; + + fields = *prv == ATM_AFI_E164 ? e164 : code; + for (i = 0; fields[i]; i++) { + for (j = fields[i]; j; j--) + seq_printf(seq, "%02X", *prv++); + if (fields[i + 1]) + seq_putc(seq, '.'); + } + } +} + +/* This means the neighbour entry has no attached VCC objects. 
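The token defined below is a distinguished pointer value that can never alias a real struct clip_vcc, so the seq_file iterator can still show such an entry exactly once.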
*/ +#define SEQ_NO_VCC_TOKEN ((void *) 2) + +static void atmarp_info(struct seq_file *seq, struct net_device *dev, + struct atmarp_entry *entry, struct clip_vcc *clip_vcc) +{ + unsigned long exp; + char buf[17]; + int svc, llc, off; + + svc = ((clip_vcc == SEQ_NO_VCC_TOKEN) || + (sk_atm(clip_vcc->vcc)->sk_family == AF_ATMSVC)); + + llc = ((clip_vcc == SEQ_NO_VCC_TOKEN) || clip_vcc->encap); + + if (clip_vcc == SEQ_NO_VCC_TOKEN) + exp = entry->neigh->used; + else + exp = clip_vcc->last_use; + + exp = (jiffies - exp) / HZ; + + seq_printf(seq, "%-6s%-4s%-4s%5ld ", + dev->name, svc ? "SVC" : "PVC", llc ? "LLC" : "NULL", exp); + + off = scnprintf(buf, sizeof(buf) - 1, "%pI4", + &entry->ip); + while (off < 16) + buf[off++] = ' '; + buf[off] = '\0'; + seq_printf(seq, "%s", buf); + + if (clip_vcc == SEQ_NO_VCC_TOKEN) { + if (time_before(jiffies, entry->expires)) + seq_printf(seq, "(resolving)\n"); + else + seq_printf(seq, "(expired, ref %d)\n", + atomic_read(&entry->neigh->refcnt)); + } else if (!svc) { + seq_printf(seq, "%d.%d.%d\n", + clip_vcc->vcc->dev->number, + clip_vcc->vcc->vpi, clip_vcc->vcc->vci); + } else { + svc_addr(seq, &clip_vcc->vcc->remote); + seq_putc(seq, '\n'); + } +} + +struct clip_seq_state { + /* This member must be first. */ + struct neigh_seq_state ns; + + /* Local to clip specific iteration. */ + struct clip_vcc *vcc; +}; + +static struct clip_vcc *clip_seq_next_vcc(struct atmarp_entry *e, + struct clip_vcc *curr) +{ + if (!curr) { + curr = e->vccs; + if (!curr) + return SEQ_NO_VCC_TOKEN; + return curr; + } + if (curr == SEQ_NO_VCC_TOKEN) + return NULL; + + curr = curr->next; + + return curr; +} + +static void *clip_seq_vcc_walk(struct clip_seq_state *state, + struct atmarp_entry *e, loff_t * pos) +{ + struct clip_vcc *vcc = state->vcc; + + vcc = clip_seq_next_vcc(e, vcc); + if (vcc && pos != NULL) { + while (*pos) { + vcc = clip_seq_next_vcc(e, vcc); + if (!vcc) + break; + --(*pos); + } + } + state->vcc = vcc; + + return vcc; +} + +static void *clip_seq_sub_iter(struct neigh_seq_state *_state, + struct neighbour *n, loff_t * pos) +{ + struct clip_seq_state *state = (struct clip_seq_state *)_state; + + return clip_seq_vcc_walk(state, NEIGH2ENTRY(n), pos); +} + +static void *clip_seq_start(struct seq_file *seq, loff_t * pos) +{ + struct clip_seq_state *state = seq->private; + state->ns.neigh_sub_iter = clip_seq_sub_iter; + return neigh_seq_start(seq, pos, &clip_tbl, NEIGH_SEQ_NEIGH_ONLY); +} + +static int clip_seq_show(struct seq_file *seq, void *v) +{ + static char atm_arp_banner[] = + "IPitf TypeEncp Idle IP address ATM address\n"; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, atm_arp_banner); + } else { + struct clip_seq_state *state = seq->private; + struct neighbour *n = v; + struct clip_vcc *vcc = state->vcc; + + atmarp_info(seq, n->dev, NEIGH2ENTRY(n), vcc); + } + return 0; +} + +static const struct seq_operations arp_seq_ops = { + .start = clip_seq_start, + .next = neigh_seq_next, + .stop = neigh_seq_stop, + .show = clip_seq_show, +}; + +static int arp_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &arp_seq_ops, + sizeof(struct clip_seq_state)); +} + +static const struct file_operations arp_seq_fops = { + .open = arp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, + .owner = THIS_MODULE +}; +#endif + +static void atm_clip_exit_noproc(void); + +static int __init atm_clip_init(void) +{ + neigh_table_init_no_netlink(&clip_tbl); + + clip_tbl_hook = &clip_tbl; + 
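+	/*
+	 * Everything initialised here is driven from userspace through the
+	 * ioctls registered just below.  A hypothetical atmarpd-style daemon
+	 * would roughly issue (ctrl_fd, vcc_fd and ip are placeholders;
+	 * socket setup and error handling omitted):
+	 *
+	 *	ioctl(ctrl_fd, ATMARPD_CTRL, 0);     // register as the ARP daemon
+	 *	ioctl(ctrl_fd, SIOCMKCLIP, -1);      // create the next free atm%d device
+	 *	ioctl(vcc_fd, ATMARP_MKIP, 30);      // make an open VCC a CLIP VCC, 30 s idle timeout
+	 *	ioctl(vcc_fd, ATMARP_SETENTRY, ip);  // bind that VCC to the ATMARP entry for 'ip'
+	 *
+	 * All of the above require CAP_NET_ADMIN, as checked in clip_ioctl().
+	 */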
register_atm_ioctl(&clip_ioctl_ops); + register_netdevice_notifier(&clip_dev_notifier); + register_inetaddr_notifier(&clip_inet_notifier); + + setup_timer(&idle_timer, idle_timer_check, 0); + +#ifdef CONFIG_PROC_FS + { + struct proc_dir_entry *p; + + p = proc_create("arp", S_IRUGO, atm_proc_root, &arp_seq_fops); + if (!p) { + pr_err("Unable to initialize /proc/net/atm/arp\n"); + atm_clip_exit_noproc(); + return -ENOMEM; + } + } +#endif + + return 0; +} + +static void atm_clip_exit_noproc(void) +{ + struct net_device *dev, *next; + + unregister_inetaddr_notifier(&clip_inet_notifier); + unregister_netdevice_notifier(&clip_dev_notifier); + + deregister_atm_ioctl(&clip_ioctl_ops); + + /* First, stop the idle timer, so it stops banging + * on the table. + */ + del_timer_sync(&idle_timer); + + /* Next, purge the table, so that the device + * unregister loop below does not hang due to + * device references remaining in the table. + */ + neigh_ifdown(&clip_tbl, NULL); + + dev = clip_devs; + while (dev) { + next = PRIV(dev)->next; + unregister_netdev(dev); + free_netdev(dev); + dev = next; + } + + /* Now it is safe to fully shutdown whole table. */ + neigh_table_clear(&clip_tbl); + + clip_tbl_hook = NULL; +} + +static void __exit atm_clip_exit(void) +{ + remove_proc_entry("arp", atm_proc_root); + + atm_clip_exit_noproc(); +} + +module_init(atm_clip_init); +module_exit(atm_clip_exit); +MODULE_AUTHOR("Werner Almesberger"); +MODULE_DESCRIPTION("Classical/IP over ATM interface"); +MODULE_LICENSE("GPL"); diff --git a/net/atm/common.c b/net/atm/common.c new file mode 100644 index 00000000..22b963d0 --- /dev/null +++ b/net/atm/common.c @@ -0,0 +1,870 @@ +/* net/atm/common.c - ATM sockets (common part for PVC and SVC) */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ + +#include +#include +#include /* struct socket, struct proto_ops */ +#include /* ATM stuff */ +#include +#include /* SOL_SOCKET */ +#include /* error codes */ +#include +#include +#include +#include /* struct timeval */ +#include +#include +#include +#include +#include /* struct sock */ +#include +#include + +#include + +#include "resources.h" /* atm_find_dev */ +#include "common.h" /* prototypes */ +#include "protocols.h" /* atm_init_ */ +#include "addr.h" /* address registry */ +#include "signaling.h" /* for WAITING and sigd_attach */ + +struct hlist_head vcc_hash[VCC_HTABLE_SIZE]; +EXPORT_SYMBOL(vcc_hash); + +DEFINE_RWLOCK(vcc_sklist_lock); +EXPORT_SYMBOL(vcc_sklist_lock); + +static ATOMIC_NOTIFIER_HEAD(atm_dev_notify_chain); + +static void __vcc_insert_socket(struct sock *sk) +{ + struct atm_vcc *vcc = atm_sk(sk); + struct hlist_head *head = &vcc_hash[vcc->vci & (VCC_HTABLE_SIZE - 1)]; + sk->sk_hash = vcc->vci & (VCC_HTABLE_SIZE - 1); + sk_add_node(sk, head); +} + +void vcc_insert_socket(struct sock *sk) +{ + write_lock_irq(&vcc_sklist_lock); + __vcc_insert_socket(sk); + write_unlock_irq(&vcc_sklist_lock); +} +EXPORT_SYMBOL(vcc_insert_socket); + +static void vcc_remove_socket(struct sock *sk) +{ + write_lock_irq(&vcc_sklist_lock); + sk_del_node_init(sk); + write_unlock_irq(&vcc_sklist_lock); +} + +static struct sk_buff *alloc_tx(struct atm_vcc *vcc, unsigned int size) +{ + struct sk_buff *skb; + struct sock *sk = sk_atm(vcc); + + if (sk_wmem_alloc_get(sk) && !atm_may_send(vcc, size)) { + pr_debug("Sorry: wmem_alloc = %d, size = %d, sndbuf = %d\n", + sk_wmem_alloc_get(sk), size, sk->sk_sndbuf); + return NULL; + } + while (!(skb = alloc_skb(size, GFP_KERNEL))) + 
schedule(); + pr_debug("%d += %d\n", sk_wmem_alloc_get(sk), skb->truesize); + atomic_add(skb->truesize, &sk->sk_wmem_alloc); + return skb; +} + +static void vcc_sock_destruct(struct sock *sk) +{ + if (atomic_read(&sk->sk_rmem_alloc)) + printk(KERN_DEBUG "%s: rmem leakage (%d bytes) detected.\n", + __func__, atomic_read(&sk->sk_rmem_alloc)); + + if (atomic_read(&sk->sk_wmem_alloc)) + printk(KERN_DEBUG "%s: wmem leakage (%d bytes) detected.\n", + __func__, atomic_read(&sk->sk_wmem_alloc)); +} + +static void vcc_def_wakeup(struct sock *sk) +{ + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up(&wq->wait); + rcu_read_unlock(); +} + +static inline int vcc_writable(struct sock *sk) +{ + struct atm_vcc *vcc = atm_sk(sk); + + return (vcc->qos.txtp.max_sdu + + atomic_read(&sk->sk_wmem_alloc)) <= sk->sk_sndbuf; +} + +static void vcc_write_space(struct sock *sk) +{ + struct socket_wq *wq; + + rcu_read_lock(); + + if (vcc_writable(sk)) { + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible(&wq->wait); + + sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + } + + rcu_read_unlock(); +} + +static struct proto vcc_proto = { + .name = "VCC", + .owner = THIS_MODULE, + .obj_size = sizeof(struct atm_vcc), +}; + +int vcc_create(struct net *net, struct socket *sock, int protocol, int family) +{ + struct sock *sk; + struct atm_vcc *vcc; + + sock->sk = NULL; + if (sock->type == SOCK_STREAM) + return -EINVAL; + sk = sk_alloc(net, family, GFP_KERNEL, &vcc_proto); + if (!sk) + return -ENOMEM; + sock_init_data(sock, sk); + sk->sk_state_change = vcc_def_wakeup; + sk->sk_write_space = vcc_write_space; + + vcc = atm_sk(sk); + vcc->dev = NULL; + memset(&vcc->local, 0, sizeof(struct sockaddr_atmsvc)); + memset(&vcc->remote, 0, sizeof(struct sockaddr_atmsvc)); + vcc->qos.txtp.max_sdu = 1 << 16; /* for meta VCs */ + atomic_set(&sk->sk_wmem_alloc, 1); + atomic_set(&sk->sk_rmem_alloc, 0); + vcc->push = NULL; + vcc->pop = NULL; + vcc->push_oam = NULL; + vcc->vpi = vcc->vci = 0; /* no VCI/VPI yet */ + vcc->atm_options = vcc->aal_options = 0; + sk->sk_destruct = vcc_sock_destruct; + return 0; +} + +static void vcc_destroy_socket(struct sock *sk) +{ + struct atm_vcc *vcc = atm_sk(sk); + struct sk_buff *skb; + + set_bit(ATM_VF_CLOSE, &vcc->flags); + clear_bit(ATM_VF_READY, &vcc->flags); + if (vcc->dev) { + if (vcc->dev->ops->close) + vcc->dev->ops->close(vcc); + if (vcc->push) + vcc->push(vcc, NULL); /* atmarpd has no push */ + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + atm_return(vcc, skb->truesize); + kfree_skb(skb); + } + + module_put(vcc->dev->ops->owner); + atm_dev_put(vcc->dev); + } + + vcc_remove_socket(sk); +} + +int vcc_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (sk) { + lock_sock(sk); + vcc_destroy_socket(sock->sk); + release_sock(sk); + sock_put(sk); + } + + return 0; +} + +void vcc_release_async(struct atm_vcc *vcc, int reply) +{ + struct sock *sk = sk_atm(vcc); + + set_bit(ATM_VF_CLOSE, &vcc->flags); + sk->sk_shutdown |= RCV_SHUTDOWN; + sk->sk_err = -reply; + clear_bit(ATM_VF_WAITING, &vcc->flags); + sk->sk_state_change(sk); +} +EXPORT_SYMBOL(vcc_release_async); + +void atm_dev_signal_change(struct atm_dev *dev, char signal) +{ + pr_debug("%s signal=%d dev=%p number=%d dev->signal=%d\n", + __func__, signal, dev, dev->number, dev->signal); + + /* atm driver sending invalid signal */ + WARN_ON(signal < ATM_PHY_SIG_LOST || signal > ATM_PHY_SIG_FOUND); + + if (dev->signal == signal) + 
return; /* no change */ + + dev->signal = signal; + + atomic_notifier_call_chain(&atm_dev_notify_chain, signal, dev); +} +EXPORT_SYMBOL(atm_dev_signal_change); + +void atm_dev_release_vccs(struct atm_dev *dev) +{ + int i; + + write_lock_irq(&vcc_sklist_lock); + for (i = 0; i < VCC_HTABLE_SIZE; i++) { + struct hlist_head *head = &vcc_hash[i]; + struct hlist_node *node, *tmp; + struct sock *s; + struct atm_vcc *vcc; + + sk_for_each_safe(s, node, tmp, head) { + vcc = atm_sk(s); + if (vcc->dev == dev) { + vcc_release_async(vcc, -EPIPE); + sk_del_node_init(s); + } + } + } + write_unlock_irq(&vcc_sklist_lock); +} +EXPORT_SYMBOL(atm_dev_release_vccs); + +static int adjust_tp(struct atm_trafprm *tp, unsigned char aal) +{ + int max_sdu; + + if (!tp->traffic_class) + return 0; + switch (aal) { + case ATM_AAL0: + max_sdu = ATM_CELL_SIZE-1; + break; + case ATM_AAL34: + max_sdu = ATM_MAX_AAL34_PDU; + break; + default: + pr_warning("AAL problems ... (%d)\n", aal); + /* fall through */ + case ATM_AAL5: + max_sdu = ATM_MAX_AAL5_PDU; + } + if (!tp->max_sdu) + tp->max_sdu = max_sdu; + else if (tp->max_sdu > max_sdu) + return -EINVAL; + if (!tp->max_cdv) + tp->max_cdv = ATM_MAX_CDV; + return 0; +} + +static int check_ci(const struct atm_vcc *vcc, short vpi, int vci) +{ + struct hlist_head *head = &vcc_hash[vci & (VCC_HTABLE_SIZE - 1)]; + struct hlist_node *node; + struct sock *s; + struct atm_vcc *walk; + + sk_for_each(s, node, head) { + walk = atm_sk(s); + if (walk->dev != vcc->dev) + continue; + if (test_bit(ATM_VF_ADDR, &walk->flags) && walk->vpi == vpi && + walk->vci == vci && ((walk->qos.txtp.traffic_class != + ATM_NONE && vcc->qos.txtp.traffic_class != ATM_NONE) || + (walk->qos.rxtp.traffic_class != ATM_NONE && + vcc->qos.rxtp.traffic_class != ATM_NONE))) + return -EADDRINUSE; + } + + /* allow VCCs with same VPI/VCI iff they don't collide on + TX/RX (but we may refuse such sharing for other reasons, + e.g. 
if protocol requires to have both channels) */ + + return 0; +} + +static int find_ci(const struct atm_vcc *vcc, short *vpi, int *vci) +{ + static short p; /* poor man's per-device cache */ + static int c; + short old_p; + int old_c; + int err; + + if (*vpi != ATM_VPI_ANY && *vci != ATM_VCI_ANY) { + err = check_ci(vcc, *vpi, *vci); + return err; + } + /* last scan may have left values out of bounds for current device */ + if (*vpi != ATM_VPI_ANY) + p = *vpi; + else if (p >= 1 << vcc->dev->ci_range.vpi_bits) + p = 0; + if (*vci != ATM_VCI_ANY) + c = *vci; + else if (c < ATM_NOT_RSV_VCI || c >= 1 << vcc->dev->ci_range.vci_bits) + c = ATM_NOT_RSV_VCI; + old_p = p; + old_c = c; + do { + if (!check_ci(vcc, p, c)) { + *vpi = p; + *vci = c; + return 0; + } + if (*vci == ATM_VCI_ANY) { + c++; + if (c >= 1 << vcc->dev->ci_range.vci_bits) + c = ATM_NOT_RSV_VCI; + } + if ((c == ATM_NOT_RSV_VCI || *vci != ATM_VCI_ANY) && + *vpi == ATM_VPI_ANY) { + p++; + if (p >= 1 << vcc->dev->ci_range.vpi_bits) + p = 0; + } + } while (old_p != p || old_c != c); + return -EADDRINUSE; +} + +static int __vcc_connect(struct atm_vcc *vcc, struct atm_dev *dev, short vpi, + int vci) +{ + struct sock *sk = sk_atm(vcc); + int error; + + if ((vpi != ATM_VPI_UNSPEC && vpi != ATM_VPI_ANY && + vpi >> dev->ci_range.vpi_bits) || (vci != ATM_VCI_UNSPEC && + vci != ATM_VCI_ANY && vci >> dev->ci_range.vci_bits)) + return -EINVAL; + if (vci > 0 && vci < ATM_NOT_RSV_VCI && !capable(CAP_NET_BIND_SERVICE)) + return -EPERM; + error = -ENODEV; + if (!try_module_get(dev->ops->owner)) + return error; + vcc->dev = dev; + write_lock_irq(&vcc_sklist_lock); + if (test_bit(ATM_DF_REMOVED, &dev->flags) || + (error = find_ci(vcc, &vpi, &vci))) { + write_unlock_irq(&vcc_sklist_lock); + goto fail_module_put; + } + vcc->vpi = vpi; + vcc->vci = vci; + __vcc_insert_socket(sk); + write_unlock_irq(&vcc_sklist_lock); + switch (vcc->qos.aal) { + case ATM_AAL0: + error = atm_init_aal0(vcc); + vcc->stats = &dev->stats.aal0; + break; + case ATM_AAL34: + error = atm_init_aal34(vcc); + vcc->stats = &dev->stats.aal34; + break; + case ATM_NO_AAL: + /* ATM_AAL5 is also used in the "0 for default" case */ + vcc->qos.aal = ATM_AAL5; + /* fall through */ + case ATM_AAL5: + error = atm_init_aal5(vcc); + vcc->stats = &dev->stats.aal5; + break; + default: + error = -EPROTOTYPE; + } + if (!error) + error = adjust_tp(&vcc->qos.txtp, vcc->qos.aal); + if (!error) + error = adjust_tp(&vcc->qos.rxtp, vcc->qos.aal); + if (error) + goto fail; + pr_debug("VCC %d.%d, AAL %d\n", vpi, vci, vcc->qos.aal); + pr_debug(" TX: %d, PCR %d..%d, SDU %d\n", + vcc->qos.txtp.traffic_class, + vcc->qos.txtp.min_pcr, + vcc->qos.txtp.max_pcr, + vcc->qos.txtp.max_sdu); + pr_debug(" RX: %d, PCR %d..%d, SDU %d\n", + vcc->qos.rxtp.traffic_class, + vcc->qos.rxtp.min_pcr, + vcc->qos.rxtp.max_pcr, + vcc->qos.rxtp.max_sdu); + + if (dev->ops->open) { + error = dev->ops->open(vcc); + if (error) + goto fail; + } + return 0; + +fail: + vcc_remove_socket(sk); +fail_module_put: + module_put(dev->ops->owner); + /* ensure we get dev module ref count correct */ + vcc->dev = NULL; + return error; +} + +int vcc_connect(struct socket *sock, int itf, short vpi, int vci) +{ + struct atm_dev *dev; + struct atm_vcc *vcc = ATM_SD(sock); + int error; + + pr_debug("(vpi %d, vci %d)\n", vpi, vci); + if (sock->state == SS_CONNECTED) + return -EISCONN; + if (sock->state != SS_UNCONNECTED) + return -EINVAL; + if (!(vpi || vci)) + return -EINVAL; + + if (vpi != ATM_VPI_UNSPEC && vci != ATM_VCI_UNSPEC) + clear_bit(ATM_VF_PARTIAL, 
&vcc->flags); + else + if (test_bit(ATM_VF_PARTIAL, &vcc->flags)) + return -EINVAL; + pr_debug("(TX: cl %d,bw %d-%d,sdu %d; " + "RX: cl %d,bw %d-%d,sdu %d,AAL %s%d)\n", + vcc->qos.txtp.traffic_class, vcc->qos.txtp.min_pcr, + vcc->qos.txtp.max_pcr, vcc->qos.txtp.max_sdu, + vcc->qos.rxtp.traffic_class, vcc->qos.rxtp.min_pcr, + vcc->qos.rxtp.max_pcr, vcc->qos.rxtp.max_sdu, + vcc->qos.aal == ATM_AAL5 ? "" : + vcc->qos.aal == ATM_AAL0 ? "" : " ??? code ", + vcc->qos.aal == ATM_AAL0 ? 0 : vcc->qos.aal); + if (!test_bit(ATM_VF_HASQOS, &vcc->flags)) + return -EBADFD; + if (vcc->qos.txtp.traffic_class == ATM_ANYCLASS || + vcc->qos.rxtp.traffic_class == ATM_ANYCLASS) + return -EINVAL; + if (likely(itf != ATM_ITF_ANY)) { + dev = try_then_request_module(atm_dev_lookup(itf), + "atm-device-%d", itf); + } else { + dev = NULL; + mutex_lock(&atm_dev_mutex); + if (!list_empty(&atm_devs)) { + dev = list_entry(atm_devs.next, + struct atm_dev, dev_list); + atm_dev_hold(dev); + } + mutex_unlock(&atm_dev_mutex); + } + if (!dev) + return -ENODEV; + error = __vcc_connect(vcc, dev, vpi, vci); + if (error) { + atm_dev_put(dev); + return error; + } + if (vpi == ATM_VPI_UNSPEC || vci == ATM_VCI_UNSPEC) + set_bit(ATM_VF_PARTIAL, &vcc->flags); + if (test_bit(ATM_VF_READY, &ATM_SD(sock)->flags)) + sock->state = SS_CONNECTED; + return 0; +} + +int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct atm_vcc *vcc; + struct sk_buff *skb; + int copied, error = -EINVAL; + + if (sock->state != SS_CONNECTED) + return -ENOTCONN; + if (flags & ~MSG_DONTWAIT) /* only handle MSG_DONTWAIT */ + return -EOPNOTSUPP; + vcc = ATM_SD(sock); + if (test_bit(ATM_VF_RELEASED, &vcc->flags) || + test_bit(ATM_VF_CLOSE, &vcc->flags) || + !test_bit(ATM_VF_READY, &vcc->flags)) + return 0; + + skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &error); + if (!skb) + return error; + + copied = skb->len; + if (copied > size) { + copied = size; + msg->msg_flags |= MSG_TRUNC; + } + + error = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (error) + return error; + sock_recv_ts_and_drops(msg, sk, skb); + pr_debug("%d -= %d\n", atomic_read(&sk->sk_rmem_alloc), skb->truesize); + atm_return(vcc, skb->truesize); + skb_free_datagram(sk, skb); + return copied; +} + +int vcc_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, + size_t total_len) +{ + struct sock *sk = sock->sk; + DEFINE_WAIT(wait); + struct atm_vcc *vcc; + struct sk_buff *skb; + int eff, error; + const void __user *buff; + int size; + + lock_sock(sk); + if (sock->state != SS_CONNECTED) { + error = -ENOTCONN; + goto out; + } + if (m->msg_name) { + error = -EISCONN; + goto out; + } + if (m->msg_iovlen != 1) { + error = -ENOSYS; /* fix this later @@@ */ + goto out; + } + buff = m->msg_iov->iov_base; + size = m->msg_iov->iov_len; + vcc = ATM_SD(sock); + if (test_bit(ATM_VF_RELEASED, &vcc->flags) || + test_bit(ATM_VF_CLOSE, &vcc->flags) || + !test_bit(ATM_VF_READY, &vcc->flags)) { + error = -EPIPE; + send_sig(SIGPIPE, current, 0); + goto out; + } + if (!size) { + error = 0; + goto out; + } + if (size < 0 || size > vcc->qos.txtp.max_sdu) { + error = -EMSGSIZE; + goto out; + } + + eff = (size+3) & ~3; /* align to word boundary */ + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + error = 0; + while (!(skb = alloc_tx(vcc, eff))) { + if (m->msg_flags & MSG_DONTWAIT) { + error = -EAGAIN; + break; + } + schedule(); + if (signal_pending(current)) { + error = -ERESTARTSYS; + 
break; + } + if (test_bit(ATM_VF_RELEASED, &vcc->flags) || + test_bit(ATM_VF_CLOSE, &vcc->flags) || + !test_bit(ATM_VF_READY, &vcc->flags)) { + error = -EPIPE; + send_sig(SIGPIPE, current, 0); + break; + } + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + } + finish_wait(sk_sleep(sk), &wait); + if (error) + goto out; + skb->dev = NULL; /* for paths shared with net_device interfaces */ + ATM_SKB(skb)->atm_options = vcc->atm_options; + if (copy_from_user(skb_put(skb, size), buff, size)) { + kfree_skb(skb); + error = -EFAULT; + goto out; + } + if (eff != size) + memset(skb->data + size, 0, eff-size); + error = vcc->dev->ops->send(vcc, skb); + error = error ? error : size; +out: + release_sock(sk); + return error; +} + +unsigned int vcc_poll(struct file *file, struct socket *sock, poll_table *wait) +{ + struct sock *sk = sock->sk; + struct atm_vcc *vcc; + unsigned int mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; + + vcc = ATM_SD(sock); + + /* exceptional events */ + if (sk->sk_err) + mask = POLLERR; + + if (test_bit(ATM_VF_RELEASED, &vcc->flags) || + test_bit(ATM_VF_CLOSE, &vcc->flags)) + mask |= POLLHUP; + + /* readable? */ + if (!skb_queue_empty(&sk->sk_receive_queue)) + mask |= POLLIN | POLLRDNORM; + + /* writable? */ + if (sock->state == SS_CONNECTING && + test_bit(ATM_VF_WAITING, &vcc->flags)) + return mask; + + if (vcc->qos.txtp.traffic_class != ATM_NONE && + vcc_writable(sk)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + + return mask; +} + +static int atm_change_qos(struct atm_vcc *vcc, struct atm_qos *qos) +{ + int error; + + /* + * Don't let the QoS change the already connected AAL type nor the + * traffic class. + */ + if (qos->aal != vcc->qos.aal || + qos->rxtp.traffic_class != vcc->qos.rxtp.traffic_class || + qos->txtp.traffic_class != vcc->qos.txtp.traffic_class) + return -EINVAL; + error = adjust_tp(&qos->txtp, qos->aal); + if (!error) + error = adjust_tp(&qos->rxtp, qos->aal); + if (error) + return error; + if (!vcc->dev->ops->change_qos) + return -EOPNOTSUPP; + if (sk_atm(vcc)->sk_family == AF_ATMPVC) + return vcc->dev->ops->change_qos(vcc, qos, ATM_MF_SET); + return svc_change_qos(vcc, qos); +} + +static int check_tp(const struct atm_trafprm *tp) +{ + /* @@@ Should be merged with adjust_tp */ + if (!tp->traffic_class || tp->traffic_class == ATM_ANYCLASS) + return 0; + if (tp->traffic_class != ATM_UBR && !tp->min_pcr && !tp->pcr && + !tp->max_pcr) + return -EINVAL; + if (tp->min_pcr == ATM_MAX_PCR) + return -EINVAL; + if (tp->min_pcr && tp->max_pcr && tp->max_pcr != ATM_MAX_PCR && + tp->min_pcr > tp->max_pcr) + return -EINVAL; + /* + * We allow pcr to be outside [min_pcr,max_pcr], because later + * adjustment may still push it in the valid range. 
+ */ + return 0; +} + +static int check_qos(const struct atm_qos *qos) +{ + int error; + + if (!qos->txtp.traffic_class && !qos->rxtp.traffic_class) + return -EINVAL; + if (qos->txtp.traffic_class != qos->rxtp.traffic_class && + qos->txtp.traffic_class && qos->rxtp.traffic_class && + qos->txtp.traffic_class != ATM_ANYCLASS && + qos->rxtp.traffic_class != ATM_ANYCLASS) + return -EINVAL; + error = check_tp(&qos->txtp); + if (error) + return error; + return check_tp(&qos->rxtp); +} + +int vcc_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct atm_vcc *vcc; + unsigned long value; + int error; + + if (__SO_LEVEL_MATCH(optname, level) && optlen != __SO_SIZE(optname)) + return -EINVAL; + + vcc = ATM_SD(sock); + switch (optname) { + case SO_ATMQOS: + { + struct atm_qos qos; + + if (copy_from_user(&qos, optval, sizeof(qos))) + return -EFAULT; + error = check_qos(&qos); + if (error) + return error; + if (sock->state == SS_CONNECTED) + return atm_change_qos(vcc, &qos); + if (sock->state != SS_UNCONNECTED) + return -EBADFD; + vcc->qos = qos; + set_bit(ATM_VF_HASQOS, &vcc->flags); + return 0; + } + case SO_SETCLP: + if (get_user(value, (unsigned long __user *)optval)) + return -EFAULT; + if (value) + vcc->atm_options |= ATM_ATMOPT_CLP; + else + vcc->atm_options &= ~ATM_ATMOPT_CLP; + return 0; + default: + if (level == SOL_SOCKET) + return -EINVAL; + break; + } + if (!vcc->dev || !vcc->dev->ops->setsockopt) + return -EINVAL; + return vcc->dev->ops->setsockopt(vcc, level, optname, optval, optlen); +} + +int vcc_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct atm_vcc *vcc; + int len; + + if (get_user(len, optlen)) + return -EFAULT; + if (__SO_LEVEL_MATCH(optname, level) && len != __SO_SIZE(optname)) + return -EINVAL; + + vcc = ATM_SD(sock); + switch (optname) { + case SO_ATMQOS: + if (!test_bit(ATM_VF_HASQOS, &vcc->flags)) + return -EINVAL; + return copy_to_user(optval, &vcc->qos, sizeof(vcc->qos)) + ? -EFAULT : 0; + case SO_SETCLP: + return put_user(vcc->atm_options & ATM_ATMOPT_CLP ? 1 : 0, + (unsigned long __user *)optval) ? -EFAULT : 0; + case SO_ATMPVC: + { + struct sockaddr_atmpvc pvc; + + if (!vcc->dev || !test_bit(ATM_VF_ADDR, &vcc->flags)) + return -ENOTCONN; + pvc.sap_family = AF_ATMPVC; + pvc.sap_addr.itf = vcc->dev->number; + pvc.sap_addr.vpi = vcc->vpi; + pvc.sap_addr.vci = vcc->vci; + return copy_to_user(optval, &pvc, sizeof(pvc)) ? 
-EFAULT : 0; + } + default: + if (level == SOL_SOCKET) + return -EINVAL; + break; + } + if (!vcc->dev || !vcc->dev->ops->getsockopt) + return -EINVAL; + return vcc->dev->ops->getsockopt(vcc, level, optname, optval, len); +} + +int register_atmdevice_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&atm_dev_notify_chain, nb); +} +EXPORT_SYMBOL_GPL(register_atmdevice_notifier); + +void unregister_atmdevice_notifier(struct notifier_block *nb) +{ + atomic_notifier_chain_unregister(&atm_dev_notify_chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_atmdevice_notifier); + +static int __init atm_init(void) +{ + int error; + + error = proto_register(&vcc_proto, 0); + if (error < 0) + goto out; + error = atmpvc_init(); + if (error < 0) { + pr_err("atmpvc_init() failed with %d\n", error); + goto out_unregister_vcc_proto; + } + error = atmsvc_init(); + if (error < 0) { + pr_err("atmsvc_init() failed with %d\n", error); + goto out_atmpvc_exit; + } + error = atm_proc_init(); + if (error < 0) { + pr_err("atm_proc_init() failed with %d\n", error); + goto out_atmsvc_exit; + } + error = atm_sysfs_init(); + if (error < 0) { + pr_err("atm_sysfs_init() failed with %d\n", error); + goto out_atmproc_exit; + } +out: + return error; +out_atmproc_exit: + atm_proc_exit(); +out_atmsvc_exit: + atmsvc_exit(); +out_atmpvc_exit: + atmsvc_exit(); +out_unregister_vcc_proto: + proto_unregister(&vcc_proto); + goto out; +} + +static void __exit atm_exit(void) +{ + atm_proc_exit(); + atm_sysfs_exit(); + atmsvc_exit(); + atmpvc_exit(); + proto_unregister(&vcc_proto); +} + +subsys_initcall(atm_init); + +module_exit(atm_exit); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_ATMPVC); +MODULE_ALIAS_NETPROTO(PF_ATMSVC); diff --git a/net/atm/common.h b/net/atm/common.h new file mode 100644 index 00000000..f48a76b6 --- /dev/null +++ b/net/atm/common.h @@ -0,0 +1,55 @@ +/* net/atm/common.h - ATM sockets (common part for PVC and SVC) */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ + + +#ifndef NET_ATM_COMMON_H +#define NET_ATM_COMMON_H + +#include +#include /* for poll_table */ + + +int vcc_create(struct net *net, struct socket *sock, int protocol, int family); +int vcc_release(struct socket *sock); +int vcc_connect(struct socket *sock, int itf, short vpi, int vci); +int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t size, int flags); +int vcc_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, + size_t total_len); +unsigned int vcc_poll(struct file *file, struct socket *sock, poll_table *wait); +int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); +int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); +int vcc_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen); +int vcc_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen); + +int atmpvc_init(void); +void atmpvc_exit(void); +int atmsvc_init(void); +void atmsvc_exit(void); +int atm_sysfs_init(void); +void atm_sysfs_exit(void); + +#ifdef CONFIG_PROC_FS +int atm_proc_init(void); +void atm_proc_exit(void); +#else +static inline int atm_proc_init(void) +{ + return 0; +} + +static inline void atm_proc_exit(void) +{ + /* nothing */ +} +#endif /* CONFIG_PROC_FS */ + +/* SVC */ +int svc_change_qos(struct atm_vcc *vcc,struct atm_qos *qos); + +void atm_dev_release_vccs(struct atm_dev *dev); + +#endif diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c new file mode 
100644 index 00000000..62dc8bfe --- /dev/null +++ b/net/atm/ioctl.c @@ -0,0 +1,371 @@ +/* ATM ioctl handling */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ +/* 2003 John Levon */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ + +#include +#include +#include /* struct socket, struct proto_ops */ +#include /* ATM stuff */ +#include +#include /* CLIP_*ENCAP */ +#include /* manifest constants */ +#include +#include /* for ioctls */ +#include +#include +#include +#include +#include +#include +#include + +#include "resources.h" +#include "signaling.h" /* for WAITING and sigd_attach */ +#include "common.h" + + +static DEFINE_MUTEX(ioctl_mutex); +static LIST_HEAD(ioctl_list); + + +void register_atm_ioctl(struct atm_ioctl *ioctl) +{ + mutex_lock(&ioctl_mutex); + list_add_tail(&ioctl->list, &ioctl_list); + mutex_unlock(&ioctl_mutex); +} +EXPORT_SYMBOL(register_atm_ioctl); + +void deregister_atm_ioctl(struct atm_ioctl *ioctl) +{ + mutex_lock(&ioctl_mutex); + list_del(&ioctl->list); + mutex_unlock(&ioctl_mutex); +} +EXPORT_SYMBOL(deregister_atm_ioctl); + +static int do_vcc_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg, int compat) +{ + struct sock *sk = sock->sk; + struct atm_vcc *vcc; + int error; + struct list_head *pos; + void __user *argp = (void __user *)arg; + + vcc = ATM_SD(sock); + switch (cmd) { + case SIOCOUTQ: + if (sock->state != SS_CONNECTED || + !test_bit(ATM_VF_READY, &vcc->flags)) { + error = -EINVAL; + goto done; + } + error = put_user(sk->sk_sndbuf - sk_wmem_alloc_get(sk), + (int __user *)argp) ? -EFAULT : 0; + goto done; + case SIOCINQ: + { + struct sk_buff *skb; + + if (sock->state != SS_CONNECTED) { + error = -EINVAL; + goto done; + } + skb = skb_peek(&sk->sk_receive_queue); + error = put_user(skb ? skb->len : 0, + (int __user *)argp) ? -EFAULT : 0; + goto done; + } + case SIOCGSTAMP: /* borrowed from IP */ +#ifdef CONFIG_COMPAT + if (compat) + error = compat_sock_get_timestamp(sk, argp); + else +#endif + error = sock_get_timestamp(sk, argp); + goto done; + case SIOCGSTAMPNS: /* borrowed from IP */ +#ifdef CONFIG_COMPAT + if (compat) + error = compat_sock_get_timestampns(sk, argp); + else +#endif + error = sock_get_timestampns(sk, argp); + goto done; + case ATM_SETSC: + if (net_ratelimit()) + pr_warning("ATM_SETSC is obsolete; used by %s:%d\n", + current->comm, task_pid_nr(current)); + error = 0; + goto done; + case ATMSIGD_CTRL: + if (!capable(CAP_NET_ADMIN)) { + error = -EPERM; + goto done; + } + /* + * The user/kernel protocol for exchanging signalling + * info uses kernel pointers as opaque references, + * so the holder of the file descriptor can scribble + * on the kernel... so we should make sure that we + * have the same privileges that /proc/kcore needs + */ + if (!capable(CAP_SYS_RAWIO)) { + error = -EPERM; + goto done; + } +#ifdef CONFIG_COMPAT + /* WTF? I don't even want to _think_ about making this + work for 32-bit userspace. TBH I don't really want + to think about it at all. dwmw2. 
*/ + if (compat) { + if (net_ratelimit()) + pr_warning("32-bit task cannot be atmsigd\n"); + error = -EINVAL; + goto done; + } +#endif + error = sigd_attach(vcc); + if (!error) + sock->state = SS_CONNECTED; + goto done; + case ATM_SETBACKEND: + case ATM_NEWBACKENDIF: + { + atm_backend_t backend; + error = get_user(backend, (atm_backend_t __user *)argp); + if (error) + goto done; + switch (backend) { + case ATM_BACKEND_PPP: + request_module("pppoatm"); + break; + case ATM_BACKEND_BR2684: + request_module("br2684"); + break; + } + break; + } + case ATMMPC_CTRL: + case ATMMPC_DATA: + request_module("mpoa"); + break; + case ATMARPD_CTRL: + request_module("clip"); + break; + case ATMLEC_CTRL: + request_module("lec"); + break; + } + + error = -ENOIOCTLCMD; + + mutex_lock(&ioctl_mutex); + list_for_each(pos, &ioctl_list) { + struct atm_ioctl *ic = list_entry(pos, struct atm_ioctl, list); + if (try_module_get(ic->owner)) { + error = ic->ioctl(sock, cmd, arg); + module_put(ic->owner); + if (error != -ENOIOCTLCMD) + break; + } + } + mutex_unlock(&ioctl_mutex); + + if (error != -ENOIOCTLCMD) + goto done; + + error = atm_dev_ioctl(cmd, argp, compat); + +done: + return error; +} + +int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + return do_vcc_ioctl(sock, cmd, arg, 0); +} + +#ifdef CONFIG_COMPAT +/* + * FIXME: + * The compat_ioctl handling is duplicated, using both these conversion + * routines and the compat argument to the actual handlers. Both + * versions are somewhat incomplete and should be merged, e.g. by + * moving the ioctl number translation into the actual handlers and + * killing the conversion code. + * + * -arnd, November 2009 + */ +#define ATM_GETLINKRATE32 _IOW('a', ATMIOC_ITF+1, struct compat_atmif_sioc) +#define ATM_GETNAMES32 _IOW('a', ATMIOC_ITF+3, struct compat_atm_iobuf) +#define ATM_GETTYPE32 _IOW('a', ATMIOC_ITF+4, struct compat_atmif_sioc) +#define ATM_GETESI32 _IOW('a', ATMIOC_ITF+5, struct compat_atmif_sioc) +#define ATM_GETADDR32 _IOW('a', ATMIOC_ITF+6, struct compat_atmif_sioc) +#define ATM_RSTADDR32 _IOW('a', ATMIOC_ITF+7, struct compat_atmif_sioc) +#define ATM_ADDADDR32 _IOW('a', ATMIOC_ITF+8, struct compat_atmif_sioc) +#define ATM_DELADDR32 _IOW('a', ATMIOC_ITF+9, struct compat_atmif_sioc) +#define ATM_GETCIRANGE32 _IOW('a', ATMIOC_ITF+10, struct compat_atmif_sioc) +#define ATM_SETCIRANGE32 _IOW('a', ATMIOC_ITF+11, struct compat_atmif_sioc) +#define ATM_SETESI32 _IOW('a', ATMIOC_ITF+12, struct compat_atmif_sioc) +#define ATM_SETESIF32 _IOW('a', ATMIOC_ITF+13, struct compat_atmif_sioc) +#define ATM_GETSTAT32 _IOW('a', ATMIOC_SARCOM+0, struct compat_atmif_sioc) +#define ATM_GETSTATZ32 _IOW('a', ATMIOC_SARCOM+1, struct compat_atmif_sioc) +#define ATM_GETLOOP32 _IOW('a', ATMIOC_SARCOM+2, struct compat_atmif_sioc) +#define ATM_SETLOOP32 _IOW('a', ATMIOC_SARCOM+3, struct compat_atmif_sioc) +#define ATM_QUERYLOOP32 _IOW('a', ATMIOC_SARCOM+4, struct compat_atmif_sioc) + +static struct { + unsigned int cmd32; + unsigned int cmd; +} atm_ioctl_map[] = { + { ATM_GETLINKRATE32, ATM_GETLINKRATE }, + { ATM_GETNAMES32, ATM_GETNAMES }, + { ATM_GETTYPE32, ATM_GETTYPE }, + { ATM_GETESI32, ATM_GETESI }, + { ATM_GETADDR32, ATM_GETADDR }, + { ATM_RSTADDR32, ATM_RSTADDR }, + { ATM_ADDADDR32, ATM_ADDADDR }, + { ATM_DELADDR32, ATM_DELADDR }, + { ATM_GETCIRANGE32, ATM_GETCIRANGE }, + { ATM_SETCIRANGE32, ATM_SETCIRANGE }, + { ATM_SETESI32, ATM_SETESI }, + { ATM_SETESIF32, ATM_SETESIF }, + { ATM_GETSTAT32, ATM_GETSTAT }, + { ATM_GETSTATZ32, ATM_GETSTATZ }, + { 
ATM_GETLOOP32, ATM_GETLOOP }, + { ATM_SETLOOP32, ATM_SETLOOP }, + { ATM_QUERYLOOP32, ATM_QUERYLOOP }, +}; + +#define NR_ATM_IOCTL ARRAY_SIZE(atm_ioctl_map) + +static int do_atm_iobuf(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + struct atm_iobuf __user *iobuf; + struct compat_atm_iobuf __user *iobuf32; + u32 data; + void __user *datap; + int len, err; + + iobuf = compat_alloc_user_space(sizeof(*iobuf)); + iobuf32 = compat_ptr(arg); + + if (get_user(len, &iobuf32->length) || + get_user(data, &iobuf32->buffer)) + return -EFAULT; + datap = compat_ptr(data); + if (put_user(len, &iobuf->length) || + put_user(datap, &iobuf->buffer)) + return -EFAULT; + + err = do_vcc_ioctl(sock, cmd, (unsigned long) iobuf, 0); + + if (!err) { + if (copy_in_user(&iobuf32->length, &iobuf->length, + sizeof(int))) + err = -EFAULT; + } + + return err; +} + +static int do_atmif_sioc(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + struct atmif_sioc __user *sioc; + struct compat_atmif_sioc __user *sioc32; + u32 data; + void __user *datap; + int err; + + sioc = compat_alloc_user_space(sizeof(*sioc)); + sioc32 = compat_ptr(arg); + + if (copy_in_user(&sioc->number, &sioc32->number, 2 * sizeof(int)) || + get_user(data, &sioc32->arg)) + return -EFAULT; + datap = compat_ptr(data); + if (put_user(datap, &sioc->arg)) + return -EFAULT; + + err = do_vcc_ioctl(sock, cmd, (unsigned long) sioc, 0); + + if (!err) { + if (copy_in_user(&sioc32->length, &sioc->length, + sizeof(int))) + err = -EFAULT; + } + return err; +} + +static int do_atm_ioctl(struct socket *sock, unsigned int cmd32, + unsigned long arg) +{ + int i; + unsigned int cmd = 0; + + switch (cmd32) { + case SONET_GETSTAT: + case SONET_GETSTATZ: + case SONET_GETDIAG: + case SONET_SETDIAG: + case SONET_CLRDIAG: + case SONET_SETFRAMING: + case SONET_GETFRAMING: + case SONET_GETFRSENSE: + return do_atmif_sioc(sock, cmd32, arg); + } + + for (i = 0; i < NR_ATM_IOCTL; i++) { + if (cmd32 == atm_ioctl_map[i].cmd32) { + cmd = atm_ioctl_map[i].cmd; + break; + } + } + if (i == NR_ATM_IOCTL) + return -EINVAL; + + switch (cmd) { + case ATM_GETNAMES: + return do_atm_iobuf(sock, cmd, arg); + + case ATM_GETLINKRATE: + case ATM_GETTYPE: + case ATM_GETESI: + case ATM_GETADDR: + case ATM_RSTADDR: + case ATM_ADDADDR: + case ATM_DELADDR: + case ATM_GETCIRANGE: + case ATM_SETCIRANGE: + case ATM_SETESI: + case ATM_SETESIF: + case ATM_GETSTAT: + case ATM_GETSTATZ: + case ATM_GETLOOP: + case ATM_SETLOOP: + case ATM_QUERYLOOP: + return do_atmif_sioc(sock, cmd, arg); + } + + return -EINVAL; +} + +int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + int ret; + + ret = do_vcc_ioctl(sock, cmd, arg, 1); + if (ret != -ENOIOCTLCMD) + return ret; + + return do_atm_ioctl(sock, cmd, arg); +} +#endif diff --git a/net/atm/lec.c b/net/atm/lec.c new file mode 100644 index 00000000..ba48daa6 --- /dev/null +++ b/net/atm/lec.c @@ -0,0 +1,2409 @@ +/* + * lec.c: Lan Emulation driver + * + * Marko Kiiskila + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ + +#include +#include +#include +#include + +/* We are ethernet device */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* TokenRing if needed */ +#ifdef CONFIG_TR +#include +#endif + +/* And atm device */ +#include +#include + +/* Proxy LEC knows about bridging */ +#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) +#include "../bridge/br_private.h" + +static unsigned char bridge_ula_lec[] 
= { 0x01, 0x80, 0xc2, 0x00, 0x00 }; +#endif + +/* Modular too */ +#include +#include + +#include "lec.h" +#include "lec_arpc.h" +#include "resources.h" + +#define DUMP_PACKETS 0 /* + * 0 = None, + * 1 = 30 first bytes + * 2 = Whole packet + */ + +#define LEC_UNRES_QUE_LEN 8 /* + * number of tx packets to queue for a + * single destination while waiting for SVC + */ + +static int lec_open(struct net_device *dev); +static netdev_tx_t lec_start_xmit(struct sk_buff *skb, + struct net_device *dev); +static int lec_close(struct net_device *dev); +static struct lec_arp_table *lec_arp_find(struct lec_priv *priv, + const unsigned char *mac_addr); +static int lec_arp_remove(struct lec_priv *priv, + struct lec_arp_table *to_remove); +/* LANE2 functions */ +static void lane2_associate_ind(struct net_device *dev, const u8 *mac_address, + const u8 *tlvs, u32 sizeoftlvs); +static int lane2_resolve(struct net_device *dev, const u8 *dst_mac, int force, + u8 **tlvs, u32 *sizeoftlvs); +static int lane2_associate_req(struct net_device *dev, const u8 *lan_dst, + const u8 *tlvs, u32 sizeoftlvs); + +static int lec_addr_delete(struct lec_priv *priv, const unsigned char *atm_addr, + unsigned long permanent); +static void lec_arp_check_empties(struct lec_priv *priv, + struct atm_vcc *vcc, struct sk_buff *skb); +static void lec_arp_destroy(struct lec_priv *priv); +static void lec_arp_init(struct lec_priv *priv); +static struct atm_vcc *lec_arp_resolve(struct lec_priv *priv, + const unsigned char *mac_to_find, + int is_rdesc, + struct lec_arp_table **ret_entry); +static void lec_arp_update(struct lec_priv *priv, const unsigned char *mac_addr, + const unsigned char *atm_addr, + unsigned long remoteflag, + unsigned int targetless_le_arp); +static void lec_flush_complete(struct lec_priv *priv, unsigned long tran_id); +static int lec_mcast_make(struct lec_priv *priv, struct atm_vcc *vcc); +static void lec_set_flush_tran_id(struct lec_priv *priv, + const unsigned char *atm_addr, + unsigned long tran_id); +static void lec_vcc_added(struct lec_priv *priv, + const struct atmlec_ioc *ioc_data, + struct atm_vcc *vcc, + void (*old_push)(struct atm_vcc *vcc, + struct sk_buff *skb)); +static void lec_vcc_close(struct lec_priv *priv, struct atm_vcc *vcc); + +/* must be done under lec_arp_lock */ +static inline void lec_arp_hold(struct lec_arp_table *entry) +{ + atomic_inc(&entry->usage); +} + +static inline void lec_arp_put(struct lec_arp_table *entry) +{ + if (atomic_dec_and_test(&entry->usage)) + kfree(entry); +} + +static struct lane2_ops lane2_ops = { + lane2_resolve, /* resolve, spec 3.1.3 */ + lane2_associate_req, /* associate_req, spec 3.1.4 */ + NULL /* associate indicator, spec 3.1.5 */ +}; + +static unsigned char bus_mac[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + +/* Device structures */ +static struct net_device *dev_lec[MAX_LEC_ITF]; + +#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) +static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev) +{ + char *buff; + struct lec_priv *priv; + + /* + * Check if this is a BPDU. 
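A BPDU is recognised below by its 802.2 LLC header: DSAP 0x42, SSAP 0x42 and control 0x03 (an unnumbered-information frame).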
If so, ask zeppelin to send + * LE_TOPOLOGY_REQUEST with the same value of Topology Change bit + * as the Config BPDU has + */ + buff = skb->data + skb->dev->hard_header_len; + if (*buff++ == 0x42 && *buff++ == 0x42 && *buff++ == 0x03) { + struct sock *sk; + struct sk_buff *skb2; + struct atmlec_msg *mesg; + + skb2 = alloc_skb(sizeof(struct atmlec_msg), GFP_ATOMIC); + if (skb2 == NULL) + return; + skb2->len = sizeof(struct atmlec_msg); + mesg = (struct atmlec_msg *)skb2->data; + mesg->type = l_topology_change; + buff += 4; + mesg->content.normal.flag = *buff & 0x01; + /* 0x01 is topology change */ + + priv = netdev_priv(dev); + atm_force_charge(priv->lecd, skb2->truesize); + sk = sk_atm(priv->lecd); + skb_queue_tail(&sk->sk_receive_queue, skb2); + sk->sk_data_ready(sk, skb2->len); + } +} +#endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */ + +/* + * Modelled after tr_type_trans + * All multicast and ARE or STE frames go to BUS. + * Non source routed frames go by destination address. + * Last hop source routed frames go by destination address. + * Not last hop source routed frames go by _next_ route descriptor. + * Returns pointer to destination MAC address or fills in rdesc + * and returns NULL. + */ +#ifdef CONFIG_TR +static unsigned char *get_tr_dst(unsigned char *packet, unsigned char *rdesc) +{ + struct trh_hdr *trh; + unsigned int riflen, num_rdsc; + + trh = (struct trh_hdr *)packet; + if (trh->daddr[0] & (uint8_t) 0x80) + return bus_mac; /* multicast */ + + if (trh->saddr[0] & TR_RII) { + riflen = (ntohs(trh->rcf) & TR_RCF_LEN_MASK) >> 8; + if ((ntohs(trh->rcf) >> 13) != 0) + return bus_mac; /* ARE or STE */ + } else + return trh->daddr; /* not source routed */ + + if (riflen < 6) + return trh->daddr; /* last hop, source routed */ + + /* riflen is 6 or more, packet has more than one route descriptor */ + num_rdsc = (riflen / 2) - 1; + memset(rdesc, 0, ETH_ALEN); + /* offset 4 comes from LAN destination field in LE control frames */ + if (trh->rcf & htons((uint16_t) TR_RCF_DIR_BIT)) + memcpy(&rdesc[4], &trh->rseg[num_rdsc - 2], sizeof(__be16)); + else { + memcpy(&rdesc[4], &trh->rseg[1], sizeof(__be16)); + rdesc[5] = ((ntohs(trh->rseg[0]) & 0x000f) | (rdesc[5] & 0xf0)); + } + + return NULL; +} +#endif /* CONFIG_TR */ + +/* + * Open/initialize the netdevice. This is called (in the current kernel) + * sometime after booting when the 'ifconfig' program is run. + * + * This routine should set everything up anew at each open, even + * registers that "should" only need to be set once at boot, so that + * there is non-reboot way to recover if something goes wrong. 
+ */ + +static int lec_open(struct net_device *dev) +{ + netif_start_queue(dev); + + return 0; +} + +static void +lec_send(struct atm_vcc *vcc, struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + + ATM_SKB(skb)->vcc = vcc; + ATM_SKB(skb)->atm_options = vcc->atm_options; + + atomic_add(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc); + if (vcc->send(vcc, skb) < 0) { + dev->stats.tx_dropped++; + return; + } + + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; +} + +static void lec_tx_timeout(struct net_device *dev) +{ + pr_info("%s\n", dev->name); + dev->trans_start = jiffies; + netif_wake_queue(dev); +} + +static netdev_tx_t lec_start_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct sk_buff *skb2; + struct lec_priv *priv = netdev_priv(dev); + struct lecdatahdr_8023 *lec_h; + struct atm_vcc *vcc; + struct lec_arp_table *entry; + unsigned char *dst; + int min_frame_size; +#ifdef CONFIG_TR + unsigned char rdesc[ETH_ALEN]; /* Token Ring route descriptor */ +#endif + int is_rdesc; + + pr_debug("called\n"); + if (!priv->lecd) { + pr_info("%s:No lecd attached\n", dev->name); + dev->stats.tx_errors++; + netif_stop_queue(dev); + kfree_skb(skb); + return NETDEV_TX_OK; + } + + pr_debug("skbuff head:%lx data:%lx tail:%lx end:%lx\n", + (long)skb->head, (long)skb->data, (long)skb_tail_pointer(skb), + (long)skb_end_pointer(skb)); +#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) + if (memcmp(skb->data, bridge_ula_lec, sizeof(bridge_ula_lec)) == 0) + lec_handle_bridge(skb, dev); +#endif + + /* Make sure we have room for lec_id */ + if (skb_headroom(skb) < 2) { + pr_debug("reallocating skb\n"); + skb2 = skb_realloc_headroom(skb, LEC_HEADER_LEN); + kfree_skb(skb); + if (skb2 == NULL) + return NETDEV_TX_OK; + skb = skb2; + } + skb_push(skb, 2); + + /* Put le header to place, works for TokenRing too */ + lec_h = (struct lecdatahdr_8023 *)skb->data; + lec_h->le_header = htons(priv->lecid); + +#ifdef CONFIG_TR + /* + * Ugly. Use this to realign Token Ring packets for + * e.g. PCA-200E driver. + */ + if (priv->is_trdev) { + skb2 = skb_realloc_headroom(skb, LEC_HEADER_LEN); + kfree_skb(skb); + if (skb2 == NULL) + return NETDEV_TX_OK; + skb = skb2; + } +#endif + +#if DUMP_PACKETS >= 2 +#define MAX_DUMP_SKB 99 +#elif DUMP_PACKETS >= 1 +#define MAX_DUMP_SKB 30 +#endif +#if DUMP_PACKETS >= 1 + printk(KERN_DEBUG "%s: send datalen:%ld lecid:%4.4x\n", + dev->name, skb->len, priv->lecid); + print_hex_dump(KERN_DEBUG, "", DUMP_OFFSET, 16, 1, + skb->data, min(skb->len, MAX_DUMP_SKB), true); +#endif /* DUMP_PACKETS >= 1 */ + + /* Minimum ethernet-frame size */ +#ifdef CONFIG_TR + if (priv->is_trdev) + min_frame_size = LEC_MINIMUM_8025_SIZE; + else +#endif + min_frame_size = LEC_MINIMUM_8023_SIZE; + if (skb->len < min_frame_size) { + if ((skb->len + skb_tailroom(skb)) < min_frame_size) { + skb2 = skb_copy_expand(skb, 0, + min_frame_size - skb->truesize, + GFP_ATOMIC); + dev_kfree_skb(skb); + if (skb2 == NULL) { + dev->stats.tx_dropped++; + return NETDEV_TX_OK; + } + skb = skb2; + } + skb_put(skb, min_frame_size - skb->len); + } + + /* Send to right vcc */ + is_rdesc = 0; + dst = lec_h->h_dest; +#ifdef CONFIG_TR + if (priv->is_trdev) { + dst = get_tr_dst(skb->data + 2, rdesc); + if (dst == NULL) { + dst = rdesc; + is_rdesc = 1; + } + } +#endif + entry = NULL; + vcc = lec_arp_resolve(priv, dst, is_rdesc, &entry); + pr_debug("%s:vcc:%p vcc_flags:%lx, entry:%p\n", + dev->name, vcc, vcc ? 
vcc->flags : 0, entry); + if (!vcc || !test_bit(ATM_VF_READY, &vcc->flags)) { + if (entry && (entry->tx_wait.qlen < LEC_UNRES_QUE_LEN)) { + pr_debug("%s:queuing packet, MAC address %pM\n", + dev->name, lec_h->h_dest); + skb_queue_tail(&entry->tx_wait, skb); + } else { + pr_debug("%s:tx queue full or no arp entry, dropping, MAC address: %pM\n", + dev->name, lec_h->h_dest); + dev->stats.tx_dropped++; + dev_kfree_skb(skb); + } + goto out; + } +#if DUMP_PACKETS > 0 + printk(KERN_DEBUG "%s:sending to vpi:%d vci:%d\n", + dev->name, vcc->vpi, vcc->vci); +#endif /* DUMP_PACKETS > 0 */ + + while (entry && (skb2 = skb_dequeue(&entry->tx_wait))) { + pr_debug("emptying tx queue, MAC address %pM\n", lec_h->h_dest); + lec_send(vcc, skb2); + } + + lec_send(vcc, skb); + + if (!atm_may_send(vcc, 0)) { + struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc); + + vpriv->xoff = 1; + netif_stop_queue(dev); + + /* + * vcc->pop() might have occurred in between, making + * the vcc usuable again. Since xmit is serialized, + * this is the only situation we have to re-test. + */ + + if (atm_may_send(vcc, 0)) + netif_wake_queue(dev); + } + +out: + if (entry) + lec_arp_put(entry); + dev->trans_start = jiffies; + return NETDEV_TX_OK; +} + +/* The inverse routine to net_open(). */ +static int lec_close(struct net_device *dev) +{ + netif_stop_queue(dev); + return 0; +} + +static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb) +{ + unsigned long flags; + struct net_device *dev = (struct net_device *)vcc->proto_data; + struct lec_priv *priv = netdev_priv(dev); + struct atmlec_msg *mesg; + struct lec_arp_table *entry; + int i; + char *tmp; /* FIXME */ + + atomic_sub(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc); + mesg = (struct atmlec_msg *)skb->data; + tmp = skb->data; + tmp += sizeof(struct atmlec_msg); + pr_debug("%s: msg from zeppelin:%d\n", dev->name, mesg->type); + switch (mesg->type) { + case l_set_mac_addr: + for (i = 0; i < 6; i++) + dev->dev_addr[i] = mesg->content.normal.mac_addr[i]; + break; + case l_del_mac_addr: + for (i = 0; i < 6; i++) + dev->dev_addr[i] = 0; + break; + case l_addr_delete: + lec_addr_delete(priv, mesg->content.normal.atm_addr, + mesg->content.normal.flag); + break; + case l_topology_change: + priv->topology_change = mesg->content.normal.flag; + break; + case l_flush_complete: + lec_flush_complete(priv, mesg->content.normal.flag); + break; + case l_narp_req: /* LANE2: see 7.1.35 in the lane2 spec */ + spin_lock_irqsave(&priv->lec_arp_lock, flags); + entry = lec_arp_find(priv, mesg->content.normal.mac_addr); + lec_arp_remove(priv, entry); + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); + + if (mesg->content.normal.no_source_le_narp) + break; + /* FALL THROUGH */ + case l_arp_update: + lec_arp_update(priv, mesg->content.normal.mac_addr, + mesg->content.normal.atm_addr, + mesg->content.normal.flag, + mesg->content.normal.targetless_le_arp); + pr_debug("in l_arp_update\n"); + if (mesg->sizeoftlvs != 0) { /* LANE2 3.1.5 */ + pr_debug("LANE2 3.1.5, got tlvs, size %d\n", + mesg->sizeoftlvs); + lane2_associate_ind(dev, mesg->content.normal.mac_addr, + tmp, mesg->sizeoftlvs); + } + break; + case l_config: + priv->maximum_unknown_frame_count = + mesg->content.config.maximum_unknown_frame_count; + priv->max_unknown_frame_time = + (mesg->content.config.max_unknown_frame_time * HZ); + priv->max_retry_count = mesg->content.config.max_retry_count; + priv->aging_time = (mesg->content.config.aging_time * HZ); + priv->forward_delay_time = + (mesg->content.config.forward_delay_time * HZ); + 
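+		/*
+		 * The timer values above and below arrive from the daemon in
+		 * seconds and are stored in jiffies, hence the "* HZ" scaling
+		 * on each assignment.
+		 */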
priv->arp_response_time = + (mesg->content.config.arp_response_time * HZ); + priv->flush_timeout = (mesg->content.config.flush_timeout * HZ); + priv->path_switching_delay = + (mesg->content.config.path_switching_delay * HZ); + priv->lane_version = mesg->content.config.lane_version; + /* LANE2 */ + priv->lane2_ops = NULL; + if (priv->lane_version > 1) + priv->lane2_ops = &lane2_ops; + if (dev_set_mtu(dev, mesg->content.config.mtu)) + pr_info("%s: change_mtu to %d failed\n", + dev->name, mesg->content.config.mtu); + priv->is_proxy = mesg->content.config.is_proxy; + break; + case l_flush_tran_id: + lec_set_flush_tran_id(priv, mesg->content.normal.atm_addr, + mesg->content.normal.flag); + break; + case l_set_lecid: + priv->lecid = + (unsigned short)(0xffff & mesg->content.normal.flag); + break; + case l_should_bridge: +#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) + { + pr_debug("%s: bridge zeppelin asks about %pM\n", + dev->name, mesg->content.proxy.mac_addr); + + if (br_fdb_test_addr_hook == NULL) + break; + + if (br_fdb_test_addr_hook(dev, mesg->content.proxy.mac_addr)) { + /* hit from bridge table, send LE_ARP_RESPONSE */ + struct sk_buff *skb2; + struct sock *sk; + + pr_debug("%s: entry found, responding to zeppelin\n", + dev->name); + skb2 = alloc_skb(sizeof(struct atmlec_msg), GFP_ATOMIC); + if (skb2 == NULL) + break; + skb2->len = sizeof(struct atmlec_msg); + skb_copy_to_linear_data(skb2, mesg, sizeof(*mesg)); + atm_force_charge(priv->lecd, skb2->truesize); + sk = sk_atm(priv->lecd); + skb_queue_tail(&sk->sk_receive_queue, skb2); + sk->sk_data_ready(sk, skb2->len); + } + } +#endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */ + break; + default: + pr_info("%s: Unknown message type %d\n", dev->name, mesg->type); + dev_kfree_skb(skb); + return -EINVAL; + } + dev_kfree_skb(skb); + return 0; +} + +static void lec_atm_close(struct atm_vcc *vcc) +{ + struct sk_buff *skb; + struct net_device *dev = (struct net_device *)vcc->proto_data; + struct lec_priv *priv = netdev_priv(dev); + + priv->lecd = NULL; + /* Do something needful? 
*/ + + netif_stop_queue(dev); + lec_arp_destroy(priv); + + if (skb_peek(&sk_atm(vcc)->sk_receive_queue)) + pr_info("%s closing with messages pending\n", dev->name); + while ((skb = skb_dequeue(&sk_atm(vcc)->sk_receive_queue))) { + atm_return(vcc, skb->truesize); + dev_kfree_skb(skb); + } + + pr_info("%s: Shut down!\n", dev->name); + module_put(THIS_MODULE); +} + +static struct atmdev_ops lecdev_ops = { + .close = lec_atm_close, + .send = lec_atm_send +}; + +static struct atm_dev lecatm_dev = { + .ops = &lecdev_ops, + .type = "lec", + .number = 999, /* dummy device number */ + .lock = __SPIN_LOCK_UNLOCKED(lecatm_dev.lock) +}; + +/* + * LANE2: new argument struct sk_buff *data contains + * the LE_ARP based TLVs introduced in the LANE2 spec + */ +static int +send_to_lecd(struct lec_priv *priv, atmlec_msg_type type, + const unsigned char *mac_addr, const unsigned char *atm_addr, + struct sk_buff *data) +{ + struct sock *sk; + struct sk_buff *skb; + struct atmlec_msg *mesg; + + if (!priv || !priv->lecd) + return -1; + skb = alloc_skb(sizeof(struct atmlec_msg), GFP_ATOMIC); + if (!skb) + return -1; + skb->len = sizeof(struct atmlec_msg); + mesg = (struct atmlec_msg *)skb->data; + memset(mesg, 0, sizeof(struct atmlec_msg)); + mesg->type = type; + if (data != NULL) + mesg->sizeoftlvs = data->len; + if (mac_addr) + memcpy(&mesg->content.normal.mac_addr, mac_addr, ETH_ALEN); + else + mesg->content.normal.targetless_le_arp = 1; + if (atm_addr) + memcpy(&mesg->content.normal.atm_addr, atm_addr, ATM_ESA_LEN); + + atm_force_charge(priv->lecd, skb->truesize); + sk = sk_atm(priv->lecd); + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); + + if (data != NULL) { + pr_debug("about to send %d bytes of data\n", data->len); + atm_force_charge(priv->lecd, data->truesize); + skb_queue_tail(&sk->sk_receive_queue, data); + sk->sk_data_ready(sk, skb->len); + } + + return 0; +} + +/* shamelessly stolen from drivers/net/net_init.c */ +static int lec_change_mtu(struct net_device *dev, int new_mtu) +{ + if ((new_mtu < 68) || (new_mtu > 18190)) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +static void lec_set_multicast_list(struct net_device *dev) +{ + /* + * by default, all multicast frames arrive over the bus. 
+ * eventually support selective multicast service + */ +} + +static const struct net_device_ops lec_netdev_ops = { + .ndo_open = lec_open, + .ndo_stop = lec_close, + .ndo_start_xmit = lec_start_xmit, + .ndo_change_mtu = lec_change_mtu, + .ndo_tx_timeout = lec_tx_timeout, + .ndo_set_multicast_list = lec_set_multicast_list, +}; + +static const unsigned char lec_ctrl_magic[] = { + 0xff, + 0x00, + 0x01, + 0x01 +}; + +#define LEC_DATA_DIRECT_8023 2 +#define LEC_DATA_DIRECT_8025 3 + +static int lec_is_data_direct(struct atm_vcc *vcc) +{ + return ((vcc->sap.blli[0].l3.tr9577.snap[4] == LEC_DATA_DIRECT_8023) || + (vcc->sap.blli[0].l3.tr9577.snap[4] == LEC_DATA_DIRECT_8025)); +} + +static void lec_push(struct atm_vcc *vcc, struct sk_buff *skb) +{ + unsigned long flags; + struct net_device *dev = (struct net_device *)vcc->proto_data; + struct lec_priv *priv = netdev_priv(dev); + +#if DUMP_PACKETS > 0 + printk(KERN_DEBUG "%s: vcc vpi:%d vci:%d\n", + dev->name, vcc->vpi, vcc->vci); +#endif + if (!skb) { + pr_debug("%s: null skb\n", dev->name); + lec_vcc_close(priv, vcc); + return; + } +#if DUMP_PACKETS >= 2 +#define MAX_SKB_DUMP 99 +#elif DUMP_PACKETS >= 1 +#define MAX_SKB_DUMP 30 +#endif +#if DUMP_PACKETS > 0 + printk(KERN_DEBUG "%s: rcv datalen:%ld lecid:%4.4x\n", + dev->name, skb->len, priv->lecid); + print_hex_dump(KERN_DEBUG, "", DUMP_OFFSET, 16, 1, + skb->data, min(MAX_SKB_DUMP, skb->len), true); +#endif /* DUMP_PACKETS > 0 */ + if (memcmp(skb->data, lec_ctrl_magic, 4) == 0) { + /* Control frame, to daemon */ + struct sock *sk = sk_atm(vcc); + + pr_debug("%s: To daemon\n", dev->name); + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); + } else { /* Data frame, queue to protocol handlers */ + struct lec_arp_table *entry; + unsigned char *src, *dst; + + atm_return(vcc, skb->truesize); + if (*(__be16 *) skb->data == htons(priv->lecid) || + !priv->lecd || !(dev->flags & IFF_UP)) { + /* + * Probably looping back, or if lecd is missing, + * lecd has gone down + */ + pr_debug("Ignoring frame...\n"); + dev_kfree_skb(skb); + return; + } +#ifdef CONFIG_TR + if (priv->is_trdev) + dst = ((struct lecdatahdr_8025 *)skb->data)->h_dest; + else +#endif + dst = ((struct lecdatahdr_8023 *)skb->data)->h_dest; + + /* + * If this is a Data Direct VCC, and the VCC does not match + * the LE_ARP cache entry, delete the LE_ARP cache entry. 
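+	 * (A data-direct frame arriving on some other VCC means the cached
+	 * mapping is stale; dropping it here lets a fresh entry be learned
+	 * instead of steering transmissions at the old VCC.)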
+ */ + spin_lock_irqsave(&priv->lec_arp_lock, flags); + if (lec_is_data_direct(vcc)) { +#ifdef CONFIG_TR + if (priv->is_trdev) + src = + ((struct lecdatahdr_8025 *)skb->data)-> + h_source; + else +#endif + src = + ((struct lecdatahdr_8023 *)skb->data)-> + h_source; + entry = lec_arp_find(priv, src); + if (entry && entry->vcc != vcc) { + lec_arp_remove(priv, entry); + lec_arp_put(entry); + } + } + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); + + if (!(dst[0] & 0x01) && /* Never filter Multi/Broadcast */ + !priv->is_proxy && /* Proxy wants all the packets */ + memcmp(dst, dev->dev_addr, dev->addr_len)) { + dev_kfree_skb(skb); + return; + } + if (!hlist_empty(&priv->lec_arp_empty_ones)) + lec_arp_check_empties(priv, vcc, skb); + skb_pull(skb, 2); /* skip lec_id */ +#ifdef CONFIG_TR + if (priv->is_trdev) + skb->protocol = tr_type_trans(skb, dev); + else +#endif + skb->protocol = eth_type_trans(skb, dev); + dev->stats.rx_packets++; + dev->stats.rx_bytes += skb->len; + memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data)); + netif_rx(skb); + } +} + +static void lec_pop(struct atm_vcc *vcc, struct sk_buff *skb) +{ + struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc); + struct net_device *dev = skb->dev; + + if (vpriv == NULL) { + pr_info("vpriv = NULL!?!?!?\n"); + return; + } + + vpriv->old_pop(vcc, skb); + + if (vpriv->xoff && atm_may_send(vcc, 0)) { + vpriv->xoff = 0; + if (netif_running(dev) && netif_queue_stopped(dev)) + netif_wake_queue(dev); + } +} + +static int lec_vcc_attach(struct atm_vcc *vcc, void __user *arg) +{ + struct lec_vcc_priv *vpriv; + int bytes_left; + struct atmlec_ioc ioc_data; + + /* Lecd must be up in this case */ + bytes_left = copy_from_user(&ioc_data, arg, sizeof(struct atmlec_ioc)); + if (bytes_left != 0) + pr_info("copy from user failed for %d bytes\n", bytes_left); + if (ioc_data.dev_num < 0 || ioc_data.dev_num >= MAX_LEC_ITF || + !dev_lec[ioc_data.dev_num]) + return -EINVAL; + vpriv = kmalloc(sizeof(struct lec_vcc_priv), GFP_KERNEL); + if (!vpriv) + return -ENOMEM; + vpriv->xoff = 0; + vpriv->old_pop = vcc->pop; + vcc->user_back = vpriv; + vcc->pop = lec_pop; + lec_vcc_added(netdev_priv(dev_lec[ioc_data.dev_num]), + &ioc_data, vcc, vcc->push); + vcc->proto_data = dev_lec[ioc_data.dev_num]; + vcc->push = lec_push; + return 0; +} + +static int lec_mcast_attach(struct atm_vcc *vcc, int arg) +{ + if (arg < 0 || arg >= MAX_LEC_ITF || !dev_lec[arg]) + return -EINVAL; + vcc->proto_data = dev_lec[arg]; + return lec_mcast_make(netdev_priv(dev_lec[arg]), vcc); +} + +/* Initialize device. 
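+ * Called for ATMLEC_CTRL: finds or registers lec<i> (a token ring
+ * device for the reserved top indices), attaches this VCC as the
+ * daemon socket (priv->lecd), seeds the default LANE parameters and
+ * returns the interface number.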
*/ +static int lecd_attach(struct atm_vcc *vcc, int arg) +{ + int i; + struct lec_priv *priv; + + if (arg < 0) + i = 0; + else + i = arg; +#ifdef CONFIG_TR + if (arg >= MAX_LEC_ITF) + return -EINVAL; +#else /* Reserve the top NUM_TR_DEVS for TR */ + if (arg >= (MAX_LEC_ITF - NUM_TR_DEVS)) + return -EINVAL; +#endif + if (!dev_lec[i]) { + int is_trdev, size; + + is_trdev = 0; + if (i >= (MAX_LEC_ITF - NUM_TR_DEVS)) + is_trdev = 1; + + size = sizeof(struct lec_priv); +#ifdef CONFIG_TR + if (is_trdev) + dev_lec[i] = alloc_trdev(size); + else +#endif + dev_lec[i] = alloc_etherdev(size); + if (!dev_lec[i]) + return -ENOMEM; + dev_lec[i]->netdev_ops = &lec_netdev_ops; + snprintf(dev_lec[i]->name, IFNAMSIZ, "lec%d", i); + if (register_netdev(dev_lec[i])) { + free_netdev(dev_lec[i]); + return -EINVAL; + } + + priv = netdev_priv(dev_lec[i]); + priv->is_trdev = is_trdev; + } else { + priv = netdev_priv(dev_lec[i]); + if (priv->lecd) + return -EADDRINUSE; + } + lec_arp_init(priv); + priv->itfnum = i; /* LANE2 addition */ + priv->lecd = vcc; + vcc->dev = &lecatm_dev; + vcc_insert_socket(sk_atm(vcc)); + + vcc->proto_data = dev_lec[i]; + set_bit(ATM_VF_META, &vcc->flags); + set_bit(ATM_VF_READY, &vcc->flags); + + /* Set default values to these variables */ + priv->maximum_unknown_frame_count = 1; + priv->max_unknown_frame_time = (1 * HZ); + priv->vcc_timeout_period = (1200 * HZ); + priv->max_retry_count = 1; + priv->aging_time = (300 * HZ); + priv->forward_delay_time = (15 * HZ); + priv->topology_change = 0; + priv->arp_response_time = (1 * HZ); + priv->flush_timeout = (4 * HZ); + priv->path_switching_delay = (6 * HZ); + + if (dev_lec[i]->flags & IFF_UP) + netif_start_queue(dev_lec[i]); + __module_get(THIS_MODULE); + return i; +} + +#ifdef CONFIG_PROC_FS +static const char *lec_arp_get_status_string(unsigned char status) +{ + static const char *const lec_arp_status_string[] = { + "ESI_UNKNOWN ", + "ESI_ARP_PENDING ", + "ESI_VC_PENDING ", + " ", + "ESI_FLUSH_PENDING ", + "ESI_FORWARD_DIRECT" + }; + + if (status > ESI_FORWARD_DIRECT) + status = 3; /* ESI_UNDEFINED */ + return lec_arp_status_string[status]; +} + +static void lec_info(struct seq_file *seq, struct lec_arp_table *entry) +{ + int i; + + for (i = 0; i < ETH_ALEN; i++) + seq_printf(seq, "%2.2x", entry->mac_addr[i] & 0xff); + seq_printf(seq, " "); + for (i = 0; i < ATM_ESA_LEN; i++) + seq_printf(seq, "%2.2x", entry->atm_addr[i] & 0xff); + seq_printf(seq, " %s %4.4x", lec_arp_get_status_string(entry->status), + entry->flags & 0xffff); + if (entry->vcc) + seq_printf(seq, "%3d %3d ", entry->vcc->vpi, entry->vcc->vci); + else + seq_printf(seq, " "); + if (entry->recv_vcc) { + seq_printf(seq, " %3d %3d", entry->recv_vcc->vpi, + entry->recv_vcc->vci); + } + seq_putc(seq, '\n'); +} + +struct lec_state { + unsigned long flags; + struct lec_priv *locked; + struct hlist_node *node; + struct net_device *dev; + int itf; + int arp_table; + int misc_table; +}; + +static void *lec_tbl_walk(struct lec_state *state, struct hlist_head *tbl, + loff_t *l) +{ + struct hlist_node *e = state->node; + struct lec_arp_table *tmp; + + if (!e) + e = tbl->first; + if (e == SEQ_START_TOKEN) { + e = tbl->first; + --*l; + } + + hlist_for_each_entry_from(tmp, e, next) { + if (--*l < 0) + break; + } + state->node = e; + + return (*l < 0) ? 
state : NULL; +} + +static void *lec_arp_walk(struct lec_state *state, loff_t *l, + struct lec_priv *priv) +{ + void *v = NULL; + int p; + + for (p = state->arp_table; p < LEC_ARP_TABLE_SIZE; p++) { + v = lec_tbl_walk(state, &priv->lec_arp_tables[p], l); + if (v) + break; + } + state->arp_table = p; + return v; +} + +static void *lec_misc_walk(struct lec_state *state, loff_t *l, + struct lec_priv *priv) +{ + struct hlist_head *lec_misc_tables[] = { + &priv->lec_arp_empty_ones, + &priv->lec_no_forward, + &priv->mcast_fwds + }; + void *v = NULL; + int q; + + for (q = state->misc_table; q < ARRAY_SIZE(lec_misc_tables); q++) { + v = lec_tbl_walk(state, lec_misc_tables[q], l); + if (v) + break; + } + state->misc_table = q; + return v; +} + +static void *lec_priv_walk(struct lec_state *state, loff_t *l, + struct lec_priv *priv) +{ + if (!state->locked) { + state->locked = priv; + spin_lock_irqsave(&priv->lec_arp_lock, state->flags); + } + if (!lec_arp_walk(state, l, priv) && !lec_misc_walk(state, l, priv)) { + spin_unlock_irqrestore(&priv->lec_arp_lock, state->flags); + state->locked = NULL; + /* Partial state reset for the next time we get called */ + state->arp_table = state->misc_table = 0; + } + return state->locked; +} + +static void *lec_itf_walk(struct lec_state *state, loff_t *l) +{ + struct net_device *dev; + void *v; + + dev = state->dev ? state->dev : dev_lec[state->itf]; + v = (dev && netdev_priv(dev)) ? + lec_priv_walk(state, l, netdev_priv(dev)) : NULL; + if (!v && dev) { + dev_put(dev); + /* Partial state reset for the next time we get called */ + dev = NULL; + } + state->dev = dev; + return v; +} + +static void *lec_get_idx(struct lec_state *state, loff_t l) +{ + void *v = NULL; + + for (; state->itf < MAX_LEC_ITF; state->itf++) { + v = lec_itf_walk(state, &l); + if (v) + break; + } + return v; +} + +static void *lec_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct lec_state *state = seq->private; + + state->itf = 0; + state->dev = NULL; + state->locked = NULL; + state->arp_table = 0; + state->misc_table = 0; + state->node = SEQ_START_TOKEN; + + return *pos ? 
lec_get_idx(state, *pos) : SEQ_START_TOKEN; +} + +static void lec_seq_stop(struct seq_file *seq, void *v) +{ + struct lec_state *state = seq->private; + + if (state->dev) { + spin_unlock_irqrestore(&state->locked->lec_arp_lock, + state->flags); + dev_put(state->dev); + } +} + +static void *lec_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct lec_state *state = seq->private; + + v = lec_get_idx(state, 1); + *pos += !!PTR_ERR(v); + return v; +} + +static int lec_seq_show(struct seq_file *seq, void *v) +{ + static const char lec_banner[] = + "Itf MAC ATM destination" + " Status Flags " + "VPI/VCI Recv VPI/VCI\n"; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, lec_banner); + else { + struct lec_state *state = seq->private; + struct net_device *dev = state->dev; + struct lec_arp_table *entry = hlist_entry(state->node, + struct lec_arp_table, + next); + + seq_printf(seq, "%s ", dev->name); + lec_info(seq, entry); + } + return 0; +} + +static const struct seq_operations lec_seq_ops = { + .start = lec_seq_start, + .next = lec_seq_next, + .stop = lec_seq_stop, + .show = lec_seq_show, +}; + +static int lec_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_private(file, &lec_seq_ops, sizeof(struct lec_state)); +} + +static const struct file_operations lec_seq_fops = { + .owner = THIS_MODULE, + .open = lec_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif + +static int lane_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct atm_vcc *vcc = ATM_SD(sock); + int err = 0; + + switch (cmd) { + case ATMLEC_CTRL: + case ATMLEC_MCAST: + case ATMLEC_DATA: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + break; + default: + return -ENOIOCTLCMD; + } + + switch (cmd) { + case ATMLEC_CTRL: + err = lecd_attach(vcc, (int)arg); + if (err >= 0) + sock->state = SS_CONNECTED; + break; + case ATMLEC_MCAST: + err = lec_mcast_attach(vcc, (int)arg); + break; + case ATMLEC_DATA: + err = lec_vcc_attach(vcc, (void __user *)arg); + break; + } + + return err; +} + +static struct atm_ioctl lane_ioctl_ops = { + .owner = THIS_MODULE, + .ioctl = lane_ioctl, +}; + +static int __init lane_module_init(void) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *p; + + p = proc_create("lec", S_IRUGO, atm_proc_root, &lec_seq_fops); + if (!p) { + pr_err("Unable to initialize /proc/net/atm/lec\n"); + return -ENOMEM; + } +#endif + + register_atm_ioctl(&lane_ioctl_ops); + pr_info("lec.c: initialized\n"); + return 0; +} + +static void __exit lane_module_cleanup(void) +{ + int i; + + remove_proc_entry("lec", atm_proc_root); + + deregister_atm_ioctl(&lane_ioctl_ops); + + for (i = 0; i < MAX_LEC_ITF; i++) { + if (dev_lec[i] != NULL) { + unregister_netdev(dev_lec[i]); + free_netdev(dev_lec[i]); + dev_lec[i] = NULL; + } + } +} + +module_init(lane_module_init); +module_exit(lane_module_cleanup); + +/* + * LANE2: 3.1.3, LE_RESOLVE.request + * Non force allocates memory and fills in *tlvs, fills in *sizeoftlvs. + * If sizeoftlvs == NULL the default TLVs associated with with this + * lec will be used. 
+ * If dst_mac == NULL, targetless LE_ARP will be sent + */ +static int lane2_resolve(struct net_device *dev, const u8 *dst_mac, int force, + u8 **tlvs, u32 *sizeoftlvs) +{ + unsigned long flags; + struct lec_priv *priv = netdev_priv(dev); + struct lec_arp_table *table; + struct sk_buff *skb; + int retval; + + if (force == 0) { + spin_lock_irqsave(&priv->lec_arp_lock, flags); + table = lec_arp_find(priv, dst_mac); + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); + if (table == NULL) + return -1; + + *tlvs = kmemdup(table->tlvs, table->sizeoftlvs, GFP_ATOMIC); + if (*tlvs == NULL) + return -1; + + *sizeoftlvs = table->sizeoftlvs; + + return 0; + } + + if (sizeoftlvs == NULL) + retval = send_to_lecd(priv, l_arp_xmt, dst_mac, NULL, NULL); + + else { + skb = alloc_skb(*sizeoftlvs, GFP_ATOMIC); + if (skb == NULL) + return -1; + skb->len = *sizeoftlvs; + skb_copy_to_linear_data(skb, *tlvs, *sizeoftlvs); + retval = send_to_lecd(priv, l_arp_xmt, dst_mac, NULL, skb); + } + return retval; +} + +/* + * LANE2: 3.1.4, LE_ASSOCIATE.request + * Associate the *tlvs with the *lan_dst address. + * Will overwrite any previous association + * Returns 1 for success, 0 for failure (out of memory) + * + */ +static int lane2_associate_req(struct net_device *dev, const u8 *lan_dst, + const u8 *tlvs, u32 sizeoftlvs) +{ + int retval; + struct sk_buff *skb; + struct lec_priv *priv = netdev_priv(dev); + + if (compare_ether_addr(lan_dst, dev->dev_addr)) + return 0; /* not our mac address */ + + kfree(priv->tlvs); /* NULL if there was no previous association */ + + priv->tlvs = kmemdup(tlvs, sizeoftlvs, GFP_KERNEL); + if (priv->tlvs == NULL) + return 0; + priv->sizeoftlvs = sizeoftlvs; + + skb = alloc_skb(sizeoftlvs, GFP_ATOMIC); + if (skb == NULL) + return 0; + skb->len = sizeoftlvs; + skb_copy_to_linear_data(skb, tlvs, sizeoftlvs); + retval = send_to_lecd(priv, l_associate_req, NULL, NULL, skb); + if (retval != 0) + pr_info("lec.c: lane2_associate_req() failed\n"); + /* + * If the previous association has changed we must + * somehow notify other LANE entities about the change + */ + return 1; +} + +/* + * LANE2: 3.1.5, LE_ASSOCIATE.indication + * + */ +static void lane2_associate_ind(struct net_device *dev, const u8 *mac_addr, + const u8 *tlvs, u32 sizeoftlvs) +{ +#if 0 + int i = 0; +#endif + struct lec_priv *priv = netdev_priv(dev); +#if 0 /* + * Why have the TLVs in LE_ARP entries + * since we do not use them? When you + * uncomment this code, make sure the + * TLVs get freed when entry is killed + */ + struct lec_arp_table *entry = lec_arp_find(priv, mac_addr); + + if (entry == NULL) + return; /* should not happen */ + + kfree(entry->tlvs); + + entry->tlvs = kmemdup(tlvs, sizeoftlvs, GFP_KERNEL); + if (entry->tlvs == NULL) + return; + entry->sizeoftlvs = sizeoftlvs; +#endif +#if 0 + pr_info("\n"); + pr_info("dump of tlvs, sizeoftlvs=%d\n", sizeoftlvs); + while (i < sizeoftlvs) + pr_cont("%02x ", tlvs[i++]); + + pr_cont("\n"); +#endif + + /* tell MPOA about the TLVs we saw */ + if (priv->lane2_ops && priv->lane2_ops->associate_indicator) { + priv->lane2_ops->associate_indicator(dev, mac_addr, + tlvs, sizeoftlvs); + } +} + +/* + * Here starts what used to lec_arpc.c + * + * lec_arpc.c was added here when making + * lane client modular. October 1997 + */ + +#include +#include +#include +#include +#include +#include + +#if 0 +#define pr_debug(format, args...) 
+/* + #define pr_debug printk +*/ +#endif +#define DEBUG_ARP_TABLE 0 + +#define LEC_ARP_REFRESH_INTERVAL (3*HZ) + +static void lec_arp_check_expire(struct work_struct *work); +static void lec_arp_expire_arp(unsigned long data); + +/* + * Arp table funcs + */ + +#define HASH(ch) (ch & (LEC_ARP_TABLE_SIZE - 1)) + +/* + * Initialization of arp-cache + */ +static void lec_arp_init(struct lec_priv *priv) +{ + unsigned short i; + + for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) + INIT_HLIST_HEAD(&priv->lec_arp_tables[i]); + INIT_HLIST_HEAD(&priv->lec_arp_empty_ones); + INIT_HLIST_HEAD(&priv->lec_no_forward); + INIT_HLIST_HEAD(&priv->mcast_fwds); + spin_lock_init(&priv->lec_arp_lock); + INIT_DELAYED_WORK(&priv->lec_arp_work, lec_arp_check_expire); + schedule_delayed_work(&priv->lec_arp_work, LEC_ARP_REFRESH_INTERVAL); +} + +static void lec_arp_clear_vccs(struct lec_arp_table *entry) +{ + if (entry->vcc) { + struct atm_vcc *vcc = entry->vcc; + struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc); + struct net_device *dev = (struct net_device *)vcc->proto_data; + + vcc->pop = vpriv->old_pop; + if (vpriv->xoff) + netif_wake_queue(dev); + kfree(vpriv); + vcc->user_back = NULL; + vcc->push = entry->old_push; + vcc_release_async(vcc, -EPIPE); + entry->vcc = NULL; + } + if (entry->recv_vcc) { + entry->recv_vcc->push = entry->old_recv_push; + vcc_release_async(entry->recv_vcc, -EPIPE); + entry->recv_vcc = NULL; + } +} + +/* + * Insert entry to lec_arp_table + * LANE2: Add to the end of the list to satisfy 8.1.13 + */ +static inline void +lec_arp_add(struct lec_priv *priv, struct lec_arp_table *entry) +{ + struct hlist_head *tmp; + + tmp = &priv->lec_arp_tables[HASH(entry->mac_addr[ETH_ALEN - 1])]; + hlist_add_head(&entry->next, tmp); + + pr_debug("Added entry:%pM\n", entry->mac_addr); +} + +/* + * Remove entry from lec_arp_table + */ +static int +lec_arp_remove(struct lec_priv *priv, struct lec_arp_table *to_remove) +{ + struct hlist_node *node; + struct lec_arp_table *entry; + int i, remove_vcc = 1; + + if (!to_remove) + return -1; + + hlist_del(&to_remove->next); + del_timer(&to_remove->timer); + + /* + * If this is the only MAC connected to this VCC, + * also tear down the VCC + */ + if (to_remove->status >= ESI_FLUSH_PENDING) { + /* + * ESI_FLUSH_PENDING, ESI_FORWARD_DIRECT + */ + for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { + hlist_for_each_entry(entry, node, + &priv->lec_arp_tables[i], next) { + if (memcmp(to_remove->atm_addr, + entry->atm_addr, ATM_ESA_LEN) == 0) { + remove_vcc = 0; + break; + } + } + } + if (remove_vcc) + lec_arp_clear_vccs(to_remove); + } + skb_queue_purge(&to_remove->tx_wait); /* FIXME: good place for this? 
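+					       * Frames still parked on tx_wait
+					       * are dropped here rather than
+					       * ever being transmitted.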
*/ + + pr_debug("Removed entry:%pM\n", to_remove->mac_addr); + return 0; +} + +#if DEBUG_ARP_TABLE +static const char *get_status_string(unsigned char st) +{ + switch (st) { + case ESI_UNKNOWN: + return "ESI_UNKNOWN"; + case ESI_ARP_PENDING: + return "ESI_ARP_PENDING"; + case ESI_VC_PENDING: + return "ESI_VC_PENDING"; + case ESI_FLUSH_PENDING: + return "ESI_FLUSH_PENDING"; + case ESI_FORWARD_DIRECT: + return "ESI_FORWARD_DIRECT"; + } + return ""; +} + +static void dump_arp_table(struct lec_priv *priv) +{ + struct hlist_node *node; + struct lec_arp_table *rulla; + char buf[256]; + int i, j, offset; + + pr_info("Dump %p:\n", priv); + for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { + hlist_for_each_entry(rulla, node, + &priv->lec_arp_tables[i], next) { + offset = 0; + offset += sprintf(buf, "%d: %p\n", i, rulla); + offset += sprintf(buf + offset, "Mac: %pM", + rulla->mac_addr); + offset += sprintf(buf + offset, " Atm:"); + for (j = 0; j < ATM_ESA_LEN; j++) { + offset += sprintf(buf + offset, + "%2.2x ", + rulla->atm_addr[j] & 0xff); + } + offset += sprintf(buf + offset, + "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", + rulla->vcc ? rulla->vcc->vpi : 0, + rulla->vcc ? rulla->vcc->vci : 0, + rulla->recv_vcc ? rulla->recv_vcc-> + vpi : 0, + rulla->recv_vcc ? rulla->recv_vcc-> + vci : 0, rulla->last_used, + rulla->timestamp, rulla->no_tries); + offset += + sprintf(buf + offset, + "Flags:%x, Packets_flooded:%x, Status: %s ", + rulla->flags, rulla->packets_flooded, + get_status_string(rulla->status)); + pr_info("%s\n", buf); + } + } + + if (!hlist_empty(&priv->lec_no_forward)) + pr_info("No forward\n"); + hlist_for_each_entry(rulla, node, &priv->lec_no_forward, next) { + offset = 0; + offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr); + offset += sprintf(buf + offset, " Atm:"); + for (j = 0; j < ATM_ESA_LEN; j++) { + offset += sprintf(buf + offset, "%2.2x ", + rulla->atm_addr[j] & 0xff); + } + offset += sprintf(buf + offset, + "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", + rulla->vcc ? rulla->vcc->vpi : 0, + rulla->vcc ? rulla->vcc->vci : 0, + rulla->recv_vcc ? rulla->recv_vcc->vpi : 0, + rulla->recv_vcc ? rulla->recv_vcc->vci : 0, + rulla->last_used, + rulla->timestamp, rulla->no_tries); + offset += sprintf(buf + offset, + "Flags:%x, Packets_flooded:%x, Status: %s ", + rulla->flags, rulla->packets_flooded, + get_status_string(rulla->status)); + pr_info("%s\n", buf); + } + + if (!hlist_empty(&priv->lec_arp_empty_ones)) + pr_info("Empty ones\n"); + hlist_for_each_entry(rulla, node, &priv->lec_arp_empty_ones, next) { + offset = 0; + offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr); + offset += sprintf(buf + offset, " Atm:"); + for (j = 0; j < ATM_ESA_LEN; j++) { + offset += sprintf(buf + offset, "%2.2x ", + rulla->atm_addr[j] & 0xff); + } + offset += sprintf(buf + offset, + "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", + rulla->vcc ? rulla->vcc->vpi : 0, + rulla->vcc ? rulla->vcc->vci : 0, + rulla->recv_vcc ? rulla->recv_vcc->vpi : 0, + rulla->recv_vcc ? 
rulla->recv_vcc->vci : 0, + rulla->last_used, + rulla->timestamp, rulla->no_tries); + offset += sprintf(buf + offset, + "Flags:%x, Packets_flooded:%x, Status: %s ", + rulla->flags, rulla->packets_flooded, + get_status_string(rulla->status)); + pr_info("%s", buf); + } + + if (!hlist_empty(&priv->mcast_fwds)) + pr_info("Multicast Forward VCCs\n"); + hlist_for_each_entry(rulla, node, &priv->mcast_fwds, next) { + offset = 0; + offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr); + offset += sprintf(buf + offset, " Atm:"); + for (j = 0; j < ATM_ESA_LEN; j++) { + offset += sprintf(buf + offset, "%2.2x ", + rulla->atm_addr[j] & 0xff); + } + offset += sprintf(buf + offset, + "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", + rulla->vcc ? rulla->vcc->vpi : 0, + rulla->vcc ? rulla->vcc->vci : 0, + rulla->recv_vcc ? rulla->recv_vcc->vpi : 0, + rulla->recv_vcc ? rulla->recv_vcc->vci : 0, + rulla->last_used, + rulla->timestamp, rulla->no_tries); + offset += sprintf(buf + offset, + "Flags:%x, Packets_flooded:%x, Status: %s ", + rulla->flags, rulla->packets_flooded, + get_status_string(rulla->status)); + pr_info("%s\n", buf); + } + +} +#else +#define dump_arp_table(priv) do { } while (0) +#endif + +/* + * Destruction of arp-cache + */ +static void lec_arp_destroy(struct lec_priv *priv) +{ + unsigned long flags; + struct hlist_node *node, *next; + struct lec_arp_table *entry; + int i; + + cancel_delayed_work_sync(&priv->lec_arp_work); + + /* + * Remove all entries + */ + + spin_lock_irqsave(&priv->lec_arp_lock, flags); + for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { + hlist_for_each_entry_safe(entry, node, next, + &priv->lec_arp_tables[i], next) { + lec_arp_remove(priv, entry); + lec_arp_put(entry); + } + INIT_HLIST_HEAD(&priv->lec_arp_tables[i]); + } + + hlist_for_each_entry_safe(entry, node, next, + &priv->lec_arp_empty_ones, next) { + del_timer_sync(&entry->timer); + lec_arp_clear_vccs(entry); + hlist_del(&entry->next); + lec_arp_put(entry); + } + INIT_HLIST_HEAD(&priv->lec_arp_empty_ones); + + hlist_for_each_entry_safe(entry, node, next, + &priv->lec_no_forward, next) { + del_timer_sync(&entry->timer); + lec_arp_clear_vccs(entry); + hlist_del(&entry->next); + lec_arp_put(entry); + } + INIT_HLIST_HEAD(&priv->lec_no_forward); + + hlist_for_each_entry_safe(entry, node, next, &priv->mcast_fwds, next) { + /* No timer, LANEv2 7.1.20 and 2.3.5.3 */ + lec_arp_clear_vccs(entry); + hlist_del(&entry->next); + lec_arp_put(entry); + } + INIT_HLIST_HEAD(&priv->mcast_fwds); + priv->mcast_vcc = NULL; + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); +} + +/* + * Find entry by mac_address + */ +static struct lec_arp_table *lec_arp_find(struct lec_priv *priv, + const unsigned char *mac_addr) +{ + struct hlist_node *node; + struct hlist_head *head; + struct lec_arp_table *entry; + + pr_debug("%pM\n", mac_addr); + + head = &priv->lec_arp_tables[HASH(mac_addr[ETH_ALEN - 1])]; + hlist_for_each_entry(entry, node, head, next) { + if (!compare_ether_addr(mac_addr, entry->mac_addr)) + return entry; + } + return NULL; +} + +static struct lec_arp_table *make_entry(struct lec_priv *priv, + const unsigned char *mac_addr) +{ + struct lec_arp_table *to_return; + + to_return = kzalloc(sizeof(struct lec_arp_table), GFP_ATOMIC); + if (!to_return) { + pr_info("LEC: Arp entry kmalloc failed\n"); + return NULL; + } + memcpy(to_return->mac_addr, mac_addr, ETH_ALEN); + INIT_HLIST_NODE(&to_return->next); + setup_timer(&to_return->timer, lec_arp_expire_arp, + (unsigned long)to_return); + 
to_return->last_used = jiffies; + to_return->priv = priv; + skb_queue_head_init(&to_return->tx_wait); + atomic_set(&to_return->usage, 1); + return to_return; +} + +/* Arp sent timer expired */ +static void lec_arp_expire_arp(unsigned long data) +{ + struct lec_arp_table *entry; + + entry = (struct lec_arp_table *)data; + + pr_debug("\n"); + if (entry->status == ESI_ARP_PENDING) { + if (entry->no_tries <= entry->priv->max_retry_count) { + if (entry->is_rdesc) + send_to_lecd(entry->priv, l_rdesc_arp_xmt, + entry->mac_addr, NULL, NULL); + else + send_to_lecd(entry->priv, l_arp_xmt, + entry->mac_addr, NULL, NULL); + entry->no_tries++; + } + mod_timer(&entry->timer, jiffies + (1 * HZ)); + } +} + +/* Unknown/unused vcc expire, remove associated entry */ +static void lec_arp_expire_vcc(unsigned long data) +{ + unsigned long flags; + struct lec_arp_table *to_remove = (struct lec_arp_table *)data; + struct lec_priv *priv = (struct lec_priv *)to_remove->priv; + + del_timer(&to_remove->timer); + + pr_debug("%p %p: vpi:%d vci:%d\n", + to_remove, priv, + to_remove->vcc ? to_remove->recv_vcc->vpi : 0, + to_remove->vcc ? to_remove->recv_vcc->vci : 0); + + spin_lock_irqsave(&priv->lec_arp_lock, flags); + hlist_del(&to_remove->next); + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); + + lec_arp_clear_vccs(to_remove); + lec_arp_put(to_remove); +} + +static bool __lec_arp_check_expire(struct lec_arp_table *entry, + unsigned long now, + struct lec_priv *priv) +{ + unsigned long time_to_check; + + if ((entry->flags) & LEC_REMOTE_FLAG && priv->topology_change) + time_to_check = priv->forward_delay_time; + else + time_to_check = priv->aging_time; + + pr_debug("About to expire: %lx - %lx > %lx\n", + now, entry->last_used, time_to_check); + if (time_after(now, entry->last_used + time_to_check) && + !(entry->flags & LEC_PERMANENT_FLAG) && + !(entry->mac_addr[0] & 0x01)) { /* LANE2: 7.1.20 */ + /* Remove entry */ + pr_debug("Entry timed out\n"); + lec_arp_remove(priv, entry); + lec_arp_put(entry); + } else { + /* Something else */ + if ((entry->status == ESI_VC_PENDING || + entry->status == ESI_ARP_PENDING) && + time_after_eq(now, entry->timestamp + + priv->max_unknown_frame_time)) { + entry->timestamp = jiffies; + entry->packets_flooded = 0; + if (entry->status == ESI_VC_PENDING) + send_to_lecd(priv, l_svc_setup, + entry->mac_addr, + entry->atm_addr, + NULL); + } + if (entry->status == ESI_FLUSH_PENDING && + time_after_eq(now, entry->timestamp + + priv->path_switching_delay)) { + lec_arp_hold(entry); + return true; + } + } + + return false; +} +/* + * Expire entries. + * 1. Re-set timer + * 2. For each entry, delete entries that have aged past the age limit. + * 3. For each entry, depending on the status of the entry, perform + * the following maintenance. + * a. If status is ESI_VC_PENDING or ESI_ARP_PENDING then if the + * tick_count is above the max_unknown_frame_time, clear + * the tick_count to zero and clear the packets_flooded counter + * to zero. This supports the packet rate limit per address + * while flooding unknowns. + * b. If the status is ESI_FLUSH_PENDING and the tick_count is greater + * than or equal to the path_switching_delay, change the status + * to ESI_FORWARD_DIRECT. This causes the flush period to end + * regardless of the progress of the flush protocol. 
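+ * In this implementation step 3b is signalled by __lec_arp_check_expire()
+ * returning true; lec_arp_check_expire() then drains entry->tx_wait and
+ * marks the entry ESI_FORWARD_DIRECT.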
+ */ +static void lec_arp_check_expire(struct work_struct *work) +{ + unsigned long flags; + struct lec_priv *priv = + container_of(work, struct lec_priv, lec_arp_work.work); + struct hlist_node *node, *next; + struct lec_arp_table *entry; + unsigned long now; + int i; + + pr_debug("%p\n", priv); + now = jiffies; +restart: + spin_lock_irqsave(&priv->lec_arp_lock, flags); + for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { + hlist_for_each_entry_safe(entry, node, next, + &priv->lec_arp_tables[i], next) { + if (__lec_arp_check_expire(entry, now, priv)) { + struct sk_buff *skb; + struct atm_vcc *vcc = entry->vcc; + + spin_unlock_irqrestore(&priv->lec_arp_lock, + flags); + while ((skb = skb_dequeue(&entry->tx_wait))) + lec_send(vcc, skb); + entry->last_used = jiffies; + entry->status = ESI_FORWARD_DIRECT; + lec_arp_put(entry); + + goto restart; + } + } + } + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); + + schedule_delayed_work(&priv->lec_arp_work, LEC_ARP_REFRESH_INTERVAL); +} + +/* + * Try to find vcc where mac_address is attached. + * + */ +static struct atm_vcc *lec_arp_resolve(struct lec_priv *priv, + const unsigned char *mac_to_find, + int is_rdesc, + struct lec_arp_table **ret_entry) +{ + unsigned long flags; + struct lec_arp_table *entry; + struct atm_vcc *found; + + if (mac_to_find[0] & 0x01) { + switch (priv->lane_version) { + case 1: + return priv->mcast_vcc; + case 2: /* LANE2 wants arp for multicast addresses */ + if (!compare_ether_addr(mac_to_find, bus_mac)) + return priv->mcast_vcc; + break; + default: + break; + } + } + + spin_lock_irqsave(&priv->lec_arp_lock, flags); + entry = lec_arp_find(priv, mac_to_find); + + if (entry) { + if (entry->status == ESI_FORWARD_DIRECT) { + /* Connection Ok */ + entry->last_used = jiffies; + lec_arp_hold(entry); + *ret_entry = entry; + found = entry->vcc; + goto out; + } + /* + * If the LE_ARP cache entry is still pending, reset count to 0 + * so another LE_ARP request can be made for this frame. + */ + if (entry->status == ESI_ARP_PENDING) + entry->no_tries = 0; + /* + * Data direct VC not yet set up, check to see if the unknown + * frame count is greater than the limit. If the limit has + * not been reached, allow the caller to send packet to + * BUS. + */ + if (entry->status != ESI_FLUSH_PENDING && + entry->packets_flooded < + priv->maximum_unknown_frame_count) { + entry->packets_flooded++; + pr_debug("Flooding..\n"); + found = priv->mcast_vcc; + goto out; + } + /* + * We got here because entry->status == ESI_FLUSH_PENDING + * or BUS flood limit was reached for an entry which is + * in ESI_ARP_PENDING or ESI_VC_PENDING state. 
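+		 * Return NULL but pass the held entry back through *ret_entry
+		 * so the caller can hold the frame on entry->tx_wait; that
+		 * queue is drained in lec_arp_check_expire() and
+		 * lec_flush_complete().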
+ */ + lec_arp_hold(entry); + *ret_entry = entry; + pr_debug("entry->status %d entry->vcc %p\n", entry->status, + entry->vcc); + found = NULL; + } else { + /* No matching entry was found */ + entry = make_entry(priv, mac_to_find); + pr_debug("Making entry\n"); + if (!entry) { + found = priv->mcast_vcc; + goto out; + } + lec_arp_add(priv, entry); + /* We want arp-request(s) to be sent */ + entry->packets_flooded = 1; + entry->status = ESI_ARP_PENDING; + entry->no_tries = 1; + entry->last_used = entry->timestamp = jiffies; + entry->is_rdesc = is_rdesc; + if (entry->is_rdesc) + send_to_lecd(priv, l_rdesc_arp_xmt, mac_to_find, NULL, + NULL); + else + send_to_lecd(priv, l_arp_xmt, mac_to_find, NULL, NULL); + entry->timer.expires = jiffies + (1 * HZ); + entry->timer.function = lec_arp_expire_arp; + add_timer(&entry->timer); + found = priv->mcast_vcc; + } + +out: + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); + return found; +} + +static int +lec_addr_delete(struct lec_priv *priv, const unsigned char *atm_addr, + unsigned long permanent) +{ + unsigned long flags; + struct hlist_node *node, *next; + struct lec_arp_table *entry; + int i; + + pr_debug("\n"); + spin_lock_irqsave(&priv->lec_arp_lock, flags); + for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { + hlist_for_each_entry_safe(entry, node, next, + &priv->lec_arp_tables[i], next) { + if (!memcmp(atm_addr, entry->atm_addr, ATM_ESA_LEN) && + (permanent || + !(entry->flags & LEC_PERMANENT_FLAG))) { + lec_arp_remove(priv, entry); + lec_arp_put(entry); + } + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); + return 0; + } + } + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); + return -1; +} + +/* + * Notifies: Response to arp_request (atm_addr != NULL) + */ +static void +lec_arp_update(struct lec_priv *priv, const unsigned char *mac_addr, + const unsigned char *atm_addr, unsigned long remoteflag, + unsigned int targetless_le_arp) +{ + unsigned long flags; + struct hlist_node *node, *next; + struct lec_arp_table *entry, *tmp; + int i; + + pr_debug("%smac:%pM\n", + (targetless_le_arp) ? "targetless " : "", mac_addr); + + spin_lock_irqsave(&priv->lec_arp_lock, flags); + entry = lec_arp_find(priv, mac_addr); + if (entry == NULL && targetless_le_arp) + goto out; /* + * LANE2: ignore targetless LE_ARPs for which + * we have no entry in the cache. 
7.1.30 + */ + if (!hlist_empty(&priv->lec_arp_empty_ones)) { + hlist_for_each_entry_safe(entry, node, next, + &priv->lec_arp_empty_ones, next) { + if (memcmp(entry->atm_addr, atm_addr, ATM_ESA_LEN) == 0) { + hlist_del(&entry->next); + del_timer(&entry->timer); + tmp = lec_arp_find(priv, mac_addr); + if (tmp) { + del_timer(&tmp->timer); + tmp->status = ESI_FORWARD_DIRECT; + memcpy(tmp->atm_addr, atm_addr, ATM_ESA_LEN); + tmp->vcc = entry->vcc; + tmp->old_push = entry->old_push; + tmp->last_used = jiffies; + del_timer(&entry->timer); + lec_arp_put(entry); + entry = tmp; + } else { + entry->status = ESI_FORWARD_DIRECT; + memcpy(entry->mac_addr, mac_addr, ETH_ALEN); + entry->last_used = jiffies; + lec_arp_add(priv, entry); + } + if (remoteflag) + entry->flags |= LEC_REMOTE_FLAG; + else + entry->flags &= ~LEC_REMOTE_FLAG; + pr_debug("After update\n"); + dump_arp_table(priv); + goto out; + } + } + } + + entry = lec_arp_find(priv, mac_addr); + if (!entry) { + entry = make_entry(priv, mac_addr); + if (!entry) + goto out; + entry->status = ESI_UNKNOWN; + lec_arp_add(priv, entry); + /* Temporary, changes before end of function */ + } + memcpy(entry->atm_addr, atm_addr, ATM_ESA_LEN); + del_timer(&entry->timer); + for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { + hlist_for_each_entry(tmp, node, + &priv->lec_arp_tables[i], next) { + if (entry != tmp && + !memcmp(tmp->atm_addr, atm_addr, ATM_ESA_LEN)) { + /* Vcc to this host exists */ + if (tmp->status > ESI_VC_PENDING) { + /* + * ESI_FLUSH_PENDING, + * ESI_FORWARD_DIRECT + */ + entry->vcc = tmp->vcc; + entry->old_push = tmp->old_push; + } + entry->status = tmp->status; + break; + } + } + } + if (remoteflag) + entry->flags |= LEC_REMOTE_FLAG; + else + entry->flags &= ~LEC_REMOTE_FLAG; + if (entry->status == ESI_ARP_PENDING || entry->status == ESI_UNKNOWN) { + entry->status = ESI_VC_PENDING; + send_to_lecd(priv, l_svc_setup, entry->mac_addr, atm_addr, NULL); + } + pr_debug("After update2\n"); + dump_arp_table(priv); +out: + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); +} + +/* + * Notifies: Vcc setup ready + */ +static void +lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, + struct atm_vcc *vcc, + void (*old_push) (struct atm_vcc *vcc, struct sk_buff *skb)) +{ + unsigned long flags; + struct hlist_node *node; + struct lec_arp_table *entry; + int i, found_entry = 0; + + spin_lock_irqsave(&priv->lec_arp_lock, flags); + /* Vcc for Multicast Forward. No timer, LANEv2 7.1.20 and 2.3.5.3 */ + if (ioc_data->receive == 2) { + pr_debug("LEC_ARP: Attaching mcast forward\n"); +#if 0 + entry = lec_arp_find(priv, bus_mac); + if (!entry) { + pr_info("LEC_ARP: Multicast entry not found!\n"); + goto out; + } + memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN); + entry->recv_vcc = vcc; + entry->old_recv_push = old_push; +#endif + entry = make_entry(priv, bus_mac); + if (entry == NULL) + goto out; + del_timer(&entry->timer); + memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN); + entry->recv_vcc = vcc; + entry->old_recv_push = old_push; + hlist_add_head(&entry->next, &priv->mcast_fwds); + goto out; + } else if (ioc_data->receive == 1) { + /* + * Vcc which we don't want to make default vcc, + * attach it anyway. 
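+		 * Such a VCC is parked on the lec_no_forward list and torn
+		 * down by lec_arp_expire_vcc() once vcc_timeout_period has
+		 * elapsed.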
+ */ + pr_debug("LEC_ARP:Attaching data direct, not default: %2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x\n", + ioc_data->atm_addr[0], ioc_data->atm_addr[1], + ioc_data->atm_addr[2], ioc_data->atm_addr[3], + ioc_data->atm_addr[4], ioc_data->atm_addr[5], + ioc_data->atm_addr[6], ioc_data->atm_addr[7], + ioc_data->atm_addr[8], ioc_data->atm_addr[9], + ioc_data->atm_addr[10], ioc_data->atm_addr[11], + ioc_data->atm_addr[12], ioc_data->atm_addr[13], + ioc_data->atm_addr[14], ioc_data->atm_addr[15], + ioc_data->atm_addr[16], ioc_data->atm_addr[17], + ioc_data->atm_addr[18], ioc_data->atm_addr[19]); + entry = make_entry(priv, bus_mac); + if (entry == NULL) + goto out; + memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN); + memset(entry->mac_addr, 0, ETH_ALEN); + entry->recv_vcc = vcc; + entry->old_recv_push = old_push; + entry->status = ESI_UNKNOWN; + entry->timer.expires = jiffies + priv->vcc_timeout_period; + entry->timer.function = lec_arp_expire_vcc; + hlist_add_head(&entry->next, &priv->lec_no_forward); + add_timer(&entry->timer); + dump_arp_table(priv); + goto out; + } + pr_debug("LEC_ARP:Attaching data direct, default: %2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x\n", + ioc_data->atm_addr[0], ioc_data->atm_addr[1], + ioc_data->atm_addr[2], ioc_data->atm_addr[3], + ioc_data->atm_addr[4], ioc_data->atm_addr[5], + ioc_data->atm_addr[6], ioc_data->atm_addr[7], + ioc_data->atm_addr[8], ioc_data->atm_addr[9], + ioc_data->atm_addr[10], ioc_data->atm_addr[11], + ioc_data->atm_addr[12], ioc_data->atm_addr[13], + ioc_data->atm_addr[14], ioc_data->atm_addr[15], + ioc_data->atm_addr[16], ioc_data->atm_addr[17], + ioc_data->atm_addr[18], ioc_data->atm_addr[19]); + for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { + hlist_for_each_entry(entry, node, + &priv->lec_arp_tables[i], next) { + if (memcmp + (ioc_data->atm_addr, entry->atm_addr, + ATM_ESA_LEN) == 0) { + pr_debug("LEC_ARP: Attaching data direct\n"); + pr_debug("Currently -> Vcc: %d, Rvcc:%d\n", + entry->vcc ? entry->vcc->vci : 0, + entry->recv_vcc ? entry->recv_vcc-> + vci : 0); + found_entry = 1; + del_timer(&entry->timer); + entry->vcc = vcc; + entry->old_push = old_push; + if (entry->status == ESI_VC_PENDING) { + if (priv->maximum_unknown_frame_count + == 0) + entry->status = + ESI_FORWARD_DIRECT; + else { + entry->timestamp = jiffies; + entry->status = + ESI_FLUSH_PENDING; +#if 0 + send_to_lecd(priv, l_flush_xmt, + NULL, + entry->atm_addr, + NULL); +#endif + } + } else { + /* + * They were forming a connection + * to us, and we to them. Our + * ATM address is numerically lower + * than theirs, so we make connection + * we formed into default VCC (8.1.11). + * Connection they made gets torn + * down. This might confuse some + * clients. Can be changed if + * someone reports trouble... 
+ */ + ; + } + } + } + } + if (found_entry) { + pr_debug("After vcc was added\n"); + dump_arp_table(priv); + goto out; + } + /* + * Not found, snatch address from first data packet that arrives + * from this vcc + */ + entry = make_entry(priv, bus_mac); + if (!entry) + goto out; + entry->vcc = vcc; + entry->old_push = old_push; + memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN); + memset(entry->mac_addr, 0, ETH_ALEN); + entry->status = ESI_UNKNOWN; + hlist_add_head(&entry->next, &priv->lec_arp_empty_ones); + entry->timer.expires = jiffies + priv->vcc_timeout_period; + entry->timer.function = lec_arp_expire_vcc; + add_timer(&entry->timer); + pr_debug("After vcc was added\n"); + dump_arp_table(priv); +out: + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); +} + +static void lec_flush_complete(struct lec_priv *priv, unsigned long tran_id) +{ + unsigned long flags; + struct hlist_node *node; + struct lec_arp_table *entry; + int i; + + pr_debug("%lx\n", tran_id); +restart: + spin_lock_irqsave(&priv->lec_arp_lock, flags); + for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { + hlist_for_each_entry(entry, node, + &priv->lec_arp_tables[i], next) { + if (entry->flush_tran_id == tran_id && + entry->status == ESI_FLUSH_PENDING) { + struct sk_buff *skb; + struct atm_vcc *vcc = entry->vcc; + + lec_arp_hold(entry); + spin_unlock_irqrestore(&priv->lec_arp_lock, + flags); + while ((skb = skb_dequeue(&entry->tx_wait))) + lec_send(vcc, skb); + entry->last_used = jiffies; + entry->status = ESI_FORWARD_DIRECT; + lec_arp_put(entry); + pr_debug("LEC_ARP: Flushed\n"); + goto restart; + } + } + } + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); + dump_arp_table(priv); +} + +static void +lec_set_flush_tran_id(struct lec_priv *priv, + const unsigned char *atm_addr, unsigned long tran_id) +{ + unsigned long flags; + struct hlist_node *node; + struct lec_arp_table *entry; + int i; + + spin_lock_irqsave(&priv->lec_arp_lock, flags); + for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) + hlist_for_each_entry(entry, node, + &priv->lec_arp_tables[i], next) { + if (!memcmp(atm_addr, entry->atm_addr, ATM_ESA_LEN)) { + entry->flush_tran_id = tran_id; + pr_debug("Set flush transaction id to %lx for %p\n", + tran_id, entry); + } + } + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); +} + +static int lec_mcast_make(struct lec_priv *priv, struct atm_vcc *vcc) +{ + unsigned long flags; + unsigned char mac_addr[] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + }; + struct lec_arp_table *to_add; + struct lec_vcc_priv *vpriv; + int err = 0; + + vpriv = kmalloc(sizeof(struct lec_vcc_priv), GFP_KERNEL); + if (!vpriv) + return -ENOMEM; + vpriv->xoff = 0; + vpriv->old_pop = vcc->pop; + vcc->user_back = vpriv; + vcc->pop = lec_pop; + spin_lock_irqsave(&priv->lec_arp_lock, flags); + to_add = make_entry(priv, mac_addr); + if (!to_add) { + vcc->pop = vpriv->old_pop; + kfree(vpriv); + err = -ENOMEM; + goto out; + } + memcpy(to_add->atm_addr, vcc->remote.sas_addr.prv, ATM_ESA_LEN); + to_add->status = ESI_FORWARD_DIRECT; + to_add->flags |= LEC_PERMANENT_FLAG; + to_add->vcc = vcc; + to_add->old_push = vcc->push; + vcc->push = lec_push; + priv->mcast_vcc = vcc; + lec_arp_add(priv, to_add); +out: + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); + return err; +} + +static void lec_vcc_close(struct lec_priv *priv, struct atm_vcc *vcc) +{ + unsigned long flags; + struct hlist_node *node, *next; + struct lec_arp_table *entry; + int i; + + pr_debug("LEC_ARP: lec_vcc_close vpi:%d vci:%d\n", vcc->vpi, vcc->vci); + dump_arp_table(priv); + + 
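+	/* Walk every table and drop the entries still bound to this VCC. */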
spin_lock_irqsave(&priv->lec_arp_lock, flags); + + for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { + hlist_for_each_entry_safe(entry, node, next, + &priv->lec_arp_tables[i], next) { + if (vcc == entry->vcc) { + lec_arp_remove(priv, entry); + lec_arp_put(entry); + if (priv->mcast_vcc == vcc) + priv->mcast_vcc = NULL; + } + } + } + + hlist_for_each_entry_safe(entry, node, next, + &priv->lec_arp_empty_ones, next) { + if (entry->vcc == vcc) { + lec_arp_clear_vccs(entry); + del_timer(&entry->timer); + hlist_del(&entry->next); + lec_arp_put(entry); + } + } + + hlist_for_each_entry_safe(entry, node, next, + &priv->lec_no_forward, next) { + if (entry->recv_vcc == vcc) { + lec_arp_clear_vccs(entry); + del_timer(&entry->timer); + hlist_del(&entry->next); + lec_arp_put(entry); + } + } + + hlist_for_each_entry_safe(entry, node, next, &priv->mcast_fwds, next) { + if (entry->recv_vcc == vcc) { + lec_arp_clear_vccs(entry); + /* No timer, LANEv2 7.1.20 and 2.3.5.3 */ + hlist_del(&entry->next); + lec_arp_put(entry); + } + } + + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); + dump_arp_table(priv); +} + +static void +lec_arp_check_empties(struct lec_priv *priv, + struct atm_vcc *vcc, struct sk_buff *skb) +{ + unsigned long flags; + struct hlist_node *node, *next; + struct lec_arp_table *entry, *tmp; + struct lecdatahdr_8023 *hdr = (struct lecdatahdr_8023 *)skb->data; + unsigned char *src; +#ifdef CONFIG_TR + struct lecdatahdr_8025 *tr_hdr = (struct lecdatahdr_8025 *)skb->data; + + if (priv->is_trdev) + src = tr_hdr->h_source; + else +#endif + src = hdr->h_source; + + spin_lock_irqsave(&priv->lec_arp_lock, flags); + hlist_for_each_entry_safe(entry, node, next, + &priv->lec_arp_empty_ones, next) { + if (vcc == entry->vcc) { + del_timer(&entry->timer); + memcpy(entry->mac_addr, src, ETH_ALEN); + entry->status = ESI_FORWARD_DIRECT; + entry->last_used = jiffies; + /* We might have got an entry */ + tmp = lec_arp_find(priv, src); + if (tmp) { + lec_arp_remove(priv, tmp); + lec_arp_put(tmp); + } + hlist_del(&entry->next); + lec_arp_add(priv, entry); + goto out; + } + } + pr_debug("LEC_ARP: Arp_check_empties: entry not found!\n"); +out: + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); +} + +MODULE_LICENSE("GPL"); diff --git a/net/atm/lec.h b/net/atm/lec.h new file mode 100644 index 00000000..dfc07196 --- /dev/null +++ b/net/atm/lec.h @@ -0,0 +1,155 @@ +/* + * Lan Emulation client header file + * + * Marko Kiiskila + */ + +#ifndef _LEC_H_ +#define _LEC_H_ + +#include +#include +#include + +#define LEC_HEADER_LEN 16 + +struct lecdatahdr_8023 { + __be16 le_header; + unsigned char h_dest[ETH_ALEN]; + unsigned char h_source[ETH_ALEN]; + __be16 h_type; +}; + +struct lecdatahdr_8025 { + __be16 le_header; + unsigned char ac_pad; + unsigned char fc; + unsigned char h_dest[ETH_ALEN]; + unsigned char h_source[ETH_ALEN]; +}; + +#define LEC_MINIMUM_8023_SIZE 62 +#define LEC_MINIMUM_8025_SIZE 16 + +/* + * Operations that LANE2 capable device can do. Two first functions + * are used to make the device do things. See spec 3.1.3 and 3.1.4. + * + * The third function is intended for the MPOA component sitting on + * top of the LANE device. The MPOA component assigns it's own function + * to (*associate_indicator)() and the LANE device will use that + * function to tell about TLVs it sees floating through. 
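+ * (In this tree the MPOA client supplies lane2_assoc_ind() from
+ * net/atm/mpc.c as that function.)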
+ * + */ +struct lane2_ops { + int (*resolve) (struct net_device *dev, const u8 *dst_mac, int force, + u8 **tlvs, u32 *sizeoftlvs); + int (*associate_req) (struct net_device *dev, const u8 *lan_dst, + const u8 *tlvs, u32 sizeoftlvs); + void (*associate_indicator) (struct net_device *dev, const u8 *mac_addr, + const u8 *tlvs, u32 sizeoftlvs); +}; + +/* + * ATM LAN Emulation supports both LLC & Dix Ethernet EtherType + * frames. + * + * 1. Dix Ethernet EtherType frames are encoded by placing EtherType + * field in h_type field. Data follows immediately after header. + * 2. LLC Data frames whose total length, including LLC field and data, + * but not padding required to meet the minimum data frame length, + * is less than 1536(0x0600) MUST be encoded by placing that length + * in the h_type field. The LLC field follows header immediately. + * 3. LLC data frames longer than this maximum MUST be encoded by placing + * the value 0 in the h_type field. + * + */ + +/* Hash table size */ +#define LEC_ARP_TABLE_SIZE 16 + +struct lec_priv { + unsigned short lecid; /* Lecid of this client */ + struct hlist_head lec_arp_empty_ones; + /* Used for storing VCC's that don't have a MAC address attached yet */ + struct hlist_head lec_arp_tables[LEC_ARP_TABLE_SIZE]; + /* Actual LE ARP table */ + struct hlist_head lec_no_forward; + /* + * Used for storing VCC's (and forward packets from) which are to + * age out by not using them to forward packets. + * This is because for some LE clients there will be 2 VCCs. Only + * one of them gets used. + */ + struct hlist_head mcast_fwds; + /* + * With LANEv2 it is possible that BUS (or a special multicast server) + * establishes multiple Multicast Forward VCCs to us. This list + * collects all those VCCs. LANEv1 client has only one item in this + * list. These entries are not aged out. + */ + spinlock_t lec_arp_lock; + struct atm_vcc *mcast_vcc; /* Default Multicast Send VCC */ + struct atm_vcc *lecd; + struct delayed_work lec_arp_work; /* C10 */ + unsigned int maximum_unknown_frame_count; + /* + * Within the period of time defined by this variable, the client will send + * no more than C10 frames to BUS for a given unicast destination. (C11) + */ + unsigned long max_unknown_frame_time; + /* + * If no traffic has been sent in this vcc for this period of time, + * vcc will be torn down (C12) + */ + unsigned long vcc_timeout_period; + /* + * An LE Client MUST not retry an LE_ARP_REQUEST for a + * given frame's LAN Destination more than maximum retry count times, + * after the first LEC_ARP_REQUEST (C13) + */ + unsigned short max_retry_count; + /* + * Max time the client will maintain an entry in its arp cache in + * absence of a verification of that relationship (C17) + */ + unsigned long aging_time; + /* + * Max time the client will maintain an entry in cache when + * topology change flag is true (C18) + */ + unsigned long forward_delay_time; /* Topology change flag (C19) */ + int topology_change; + /* + * Max time the client expects an LE_ARP_REQUEST/LE_ARP_RESPONSE + * cycle to take (C20) + */ + unsigned long arp_response_time; + /* + * Time limit to wait to receive an LE_FLUSH_RESPONSE after the + * LE_FLUSH_REQUEST has been sent before taking recovery action.
(C21) + */ + unsigned long flush_timeout; + /* The time since sending a frame to the bus after which the + * LE Client may assume that the frame has been either discarded or + * delivered to the recipient (C22) + */ + unsigned long path_switching_delay; + + u8 *tlvs; /* LANE2: TLVs are new */ + u32 sizeoftlvs; /* The size of the tlv array in bytes */ + int lane_version; /* LANE2 */ + int itfnum; /* e.g. 2 for lec2, 5 for lec5 */ + struct lane2_ops *lane2_ops; /* can be NULL for LANE v1 */ + int is_proxy; /* bridge between ATM and Ethernet */ + int is_trdev; /* Device type, 0 = Ethernet, 1 = TokenRing */ +}; + +struct lec_vcc_priv { + void (*old_pop) (struct atm_vcc *vcc, struct sk_buff *skb); + int xoff; +}; + +#define LEC_VCC_PRIV(vcc) ((struct lec_vcc_priv *)((vcc)->user_back)) + +#endif /* _LEC_H_ */ diff --git a/net/atm/lec_arpc.h b/net/atm/lec_arpc.h new file mode 100644 index 00000000..ec67435a --- /dev/null +++ b/net/atm/lec_arpc.h @@ -0,0 +1,96 @@ +/* + * Lec arp cache + * + * Marko Kiiskila + */ +#ifndef _LEC_ARP_H_ +#define _LEC_ARP_H_ +#include +#include +#include +#include + +struct lec_arp_table { + struct hlist_node next; /* Linked entry list */ + unsigned char atm_addr[ATM_ESA_LEN]; /* Atm address */ + unsigned char mac_addr[ETH_ALEN]; /* Mac address */ + int is_rdesc; /* Mac address is a route descriptor */ + struct atm_vcc *vcc; /* Vcc this entry is attached */ + struct atm_vcc *recv_vcc; /* Vcc we receive data from */ + + void (*old_push) (struct atm_vcc *vcc, struct sk_buff *skb); + /* Push that leads to daemon */ + + void (*old_recv_push) (struct atm_vcc *vcc, struct sk_buff *skb); + /* Push that leads to daemon */ + + unsigned long last_used; /* For expiry */ + unsigned long timestamp; /* Used for various timestamping things: + * 1. FLUSH started + * (status=ESI_FLUSH_PENDING) + * 2. Counting to + * max_unknown_frame_time + * (status=ESI_ARP_PENDING|| + * status=ESI_VC_PENDING) + */ + unsigned char no_tries; /* No of times arp retry has been tried */ + unsigned char status; /* Status of this entry */ + unsigned short flags; /* Flags for this entry */ + unsigned short packets_flooded; /* Data packets flooded */ + unsigned long flush_tran_id; /* Transaction id in flush protocol */ + struct timer_list timer; /* Arping timer */ + struct lec_priv *priv; /* Pointer back */ + u8 *tlvs; + u32 sizeoftlvs; /* + * LANE2: Each MAC address can have TLVs + * associated with it. sizeoftlvs tells the + * the length of the tlvs array + */ + struct sk_buff_head tx_wait; /* wait queue for outgoing packets */ + atomic_t usage; /* usage count */ +}; + +/* + * LANE2: Template tlv struct for accessing + * the tlvs in the lec_arp_table->tlvs array + */ +struct tlv { + u32 type; + u8 length; + u8 value[255]; +}; + +/* Status fields */ +#define ESI_UNKNOWN 0 /* + * Next packet sent to this mac address + * causes ARP-request to be sent + */ +#define ESI_ARP_PENDING 1 /* + * There is no ATM address associated with this + * 48-bit address. The LE-ARP protocol is in + * progress. + */ +#define ESI_VC_PENDING 2 /* + * There is a valid ATM address associated with + * this 48-bit address but there is no VC set + * up to that ATM address. The signaling + * protocol is in process. + */ +#define ESI_FLUSH_PENDING 4 /* + * The LEC has been notified of the FLUSH_START + * status and it is assumed that the flush + * protocol is in process. 
+ */ +#define ESI_FORWARD_DIRECT 5 /* + * Either the Path Switching Delay (C22) has + * elapsed or the LEC has notified the Mapping + * that the flush protocol has completed. In + * either case, it is safe to forward packets + * to this address via the data direct VC. + */ + +/* Flag values */ +#define LEC_REMOTE_FLAG 0x0001 +#define LEC_PERMANENT_FLAG 0x0002 + +#endif /* _LEC_ARP_H_ */ diff --git a/net/atm/mpc.c b/net/atm/mpc.c new file mode 100644 index 00000000..3ccca42e --- /dev/null +++ b/net/atm/mpc.c @@ -0,0 +1,1537 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* We are an ethernet device */ +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for ip_fast_csum() */ +#include +#include +#include + +/* And atm device */ +#include +#include +#include +/* Modular too */ +#include + +#include "lec.h" +#include "mpc.h" +#include "resources.h" + +/* + * mpc.c: Implementation of MPOA client kernel part + */ + +#if 0 +#define dprintk(format, args...) \ + printk(KERN_DEBUG "mpoa:%s: " format, __func__, ##args) +#define dprintk_cont(format, args...) printk(KERN_CONT format, ##args) +#else +#define dprintk(format, args...) \ + do { if (0) \ + printk(KERN_DEBUG "mpoa:%s: " format, __func__, ##args);\ + } while (0) +#define dprintk_cont(format, args...) \ + do { if (0) printk(KERN_CONT format, ##args); } while (0) +#endif + +#if 0 +#define ddprintk(format, args...) \ + printk(KERN_DEBUG "mpoa:%s: " format, __func__, ##args) +#define ddprintk_cont(format, args...) printk(KERN_CONT format, ##args) +#else +#define ddprintk(format, args...) \ + do { if (0) \ + printk(KERN_DEBUG "mpoa:%s: " format, __func__, ##args);\ + } while (0) +#define ddprintk_cont(format, args...) 
\ + do { if (0) printk(KERN_CONT format, ##args); } while (0) +#endif + +/* mpc_daemon -> kernel */ +static void MPOA_trigger_rcvd(struct k_message *msg, struct mpoa_client *mpc); +static void MPOA_res_reply_rcvd(struct k_message *msg, struct mpoa_client *mpc); +static void ingress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc); +static void egress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc); +static void mps_death(struct k_message *msg, struct mpoa_client *mpc); +static void clean_up(struct k_message *msg, struct mpoa_client *mpc, + int action); +static void MPOA_cache_impos_rcvd(struct k_message *msg, + struct mpoa_client *mpc); +static void set_mpc_ctrl_addr_rcvd(struct k_message *mesg, + struct mpoa_client *mpc); +static void set_mps_mac_addr_rcvd(struct k_message *mesg, + struct mpoa_client *mpc); + +static const uint8_t *copy_macs(struct mpoa_client *mpc, + const uint8_t *router_mac, + const uint8_t *tlvs, uint8_t mps_macs, + uint8_t device_type); +static void purge_egress_shortcut(struct atm_vcc *vcc, eg_cache_entry *entry); + +static void send_set_mps_ctrl_addr(const char *addr, struct mpoa_client *mpc); +static void mpoad_close(struct atm_vcc *vcc); +static int msg_from_mpoad(struct atm_vcc *vcc, struct sk_buff *skb); + +static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb); +static netdev_tx_t mpc_send_packet(struct sk_buff *skb, + struct net_device *dev); +static int mpoa_event_listener(struct notifier_block *mpoa_notifier, + unsigned long event, void *dev); +static void mpc_timer_refresh(void); +static void mpc_cache_check(unsigned long checking_time); + +static struct llc_snap_hdr llc_snap_mpoa_ctrl = { + 0xaa, 0xaa, 0x03, + {0x00, 0x00, 0x5e}, + {0x00, 0x03} /* For MPOA control PDUs */ +}; +static struct llc_snap_hdr llc_snap_mpoa_data = { + 0xaa, 0xaa, 0x03, + {0x00, 0x00, 0x00}, + {0x08, 0x00} /* This is for IP PDUs only */ +}; +static struct llc_snap_hdr llc_snap_mpoa_data_tagged = { + 0xaa, 0xaa, 0x03, + {0x00, 0x00, 0x00}, + {0x88, 0x4c} /* This is for tagged data PDUs */ +}; + +static struct notifier_block mpoa_notifier = { + mpoa_event_listener, + NULL, + 0 +}; + +struct mpoa_client *mpcs = NULL; /* FIXME */ +static struct atm_mpoa_qos *qos_head = NULL; +static DEFINE_TIMER(mpc_timer, NULL, 0, 0); + + +static struct mpoa_client *find_mpc_by_itfnum(int itf) +{ + struct mpoa_client *mpc; + + mpc = mpcs; /* our global linked list */ + while (mpc != NULL) { + if (mpc->dev_num == itf) + return mpc; + mpc = mpc->next; + } + + return NULL; /* not found */ +} + +static struct mpoa_client *find_mpc_by_vcc(struct atm_vcc *vcc) +{ + struct mpoa_client *mpc; + + mpc = mpcs; /* our global linked list */ + while (mpc != NULL) { + if (mpc->mpoad_vcc == vcc) + return mpc; + mpc = mpc->next; + } + + return NULL; /* not found */ +} + +static struct mpoa_client *find_mpc_by_lec(struct net_device *dev) +{ + struct mpoa_client *mpc; + + mpc = mpcs; /* our global linked list */ + while (mpc != NULL) { + if (mpc->dev == dev) + return mpc; + mpc = mpc->next; + } + + return NULL; /* not found */ +} + +/* + * Functions for managing QoS list + */ + +/* + * Overwrites the old entry or makes a new one. 
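+ * QoS entries live on a plain singly linked list headed by qos_head and
+ * keyed by destination IP; note the later remark that qos_head still
+ * lacks locking.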
+ */ +struct atm_mpoa_qos *atm_mpoa_add_qos(__be32 dst_ip, struct atm_qos *qos) +{ + struct atm_mpoa_qos *entry; + + entry = atm_mpoa_search_qos(dst_ip); + if (entry != NULL) { + entry->qos = *qos; + return entry; + } + + entry = kmalloc(sizeof(struct atm_mpoa_qos), GFP_KERNEL); + if (entry == NULL) { + pr_info("mpoa: out of memory\n"); + return entry; + } + + entry->ipaddr = dst_ip; + entry->qos = *qos; + + entry->next = qos_head; + qos_head = entry; + + return entry; +} + +struct atm_mpoa_qos *atm_mpoa_search_qos(__be32 dst_ip) +{ + struct atm_mpoa_qos *qos; + + qos = qos_head; + while (qos) { + if (qos->ipaddr == dst_ip) + break; + qos = qos->next; + } + + return qos; +} + +/* + * Returns 0 for failure + */ +int atm_mpoa_delete_qos(struct atm_mpoa_qos *entry) +{ + struct atm_mpoa_qos *curr; + + if (entry == NULL) + return 0; + if (entry == qos_head) { + qos_head = qos_head->next; + kfree(entry); + return 1; + } + + curr = qos_head; + while (curr != NULL) { + if (curr->next == entry) { + curr->next = entry->next; + kfree(entry); + return 1; + } + curr = curr->next; + } + + return 0; +} + +/* this is buggered - we need locking for qos_head */ +void atm_mpoa_disp_qos(struct seq_file *m) +{ + struct atm_mpoa_qos *qos; + + qos = qos_head; + seq_printf(m, "QoS entries for shortcuts:\n"); + seq_printf(m, "IP address\n TX:max_pcr pcr min_pcr max_cdv max_sdu\n RX:max_pcr pcr min_pcr max_cdv max_sdu\n"); + + while (qos != NULL) { + seq_printf(m, "%pI4\n %-7d %-7d %-7d %-7d %-7d\n %-7d %-7d %-7d %-7d %-7d\n", + &qos->ipaddr, + qos->qos.txtp.max_pcr, + qos->qos.txtp.pcr, + qos->qos.txtp.min_pcr, + qos->qos.txtp.max_cdv, + qos->qos.txtp.max_sdu, + qos->qos.rxtp.max_pcr, + qos->qos.rxtp.pcr, + qos->qos.rxtp.min_pcr, + qos->qos.rxtp.max_cdv, + qos->qos.rxtp.max_sdu); + qos = qos->next; + } +} + +static struct net_device *find_lec_by_itfnum(int itf) +{ + struct net_device *dev; + char name[IFNAMSIZ]; + + sprintf(name, "lec%d", itf); + dev = dev_get_by_name(&init_net, name); + + return dev; +} + +static struct mpoa_client *alloc_mpc(void) +{ + struct mpoa_client *mpc; + + mpc = kzalloc(sizeof(struct mpoa_client), GFP_KERNEL); + if (mpc == NULL) + return NULL; + rwlock_init(&mpc->ingress_lock); + rwlock_init(&mpc->egress_lock); + mpc->next = mpcs; + atm_mpoa_init_cache(mpc); + + mpc->parameters.mpc_p1 = MPC_P1; + mpc->parameters.mpc_p2 = MPC_P2; + memset(mpc->parameters.mpc_p3, 0, sizeof(mpc->parameters.mpc_p3)); + mpc->parameters.mpc_p4 = MPC_P4; + mpc->parameters.mpc_p5 = MPC_P5; + mpc->parameters.mpc_p6 = MPC_P6; + + mpcs = mpc; + + return mpc; +} + +/* + * + * start_mpc() puts the MPC on line. All the packets destined + * to the lec underneath us are now being monitored and + * shortcuts will be established. + * + */ +static void start_mpc(struct mpoa_client *mpc, struct net_device *dev) +{ + + dprintk("(%s)\n", mpc->dev->name); + if (!dev->netdev_ops) + pr_info("(%s) not starting\n", dev->name); + else { + mpc->old_ops = dev->netdev_ops; + mpc->new_ops = *mpc->old_ops; + mpc->new_ops.ndo_start_xmit = mpc_send_packet; + dev->netdev_ops = &mpc->new_ops; + } +} + +static void stop_mpc(struct mpoa_client *mpc) +{ + struct net_device *dev = mpc->dev; + dprintk("(%s)", mpc->dev->name); + + /* Lets not nullify lec device's dev->hard_start_xmit */ + if (dev->netdev_ops != &mpc->new_ops) { + dprintk_cont(" mpc already stopped, not fatal\n"); + return; + } + dprintk_cont("\n"); + + dev->netdev_ops = mpc->old_ops; + mpc->old_ops = NULL; + + /* close_shortcuts(mpc); ??? 
FIXME */ +} + +static const char *mpoa_device_type_string(char type) __attribute__ ((unused)); + +static const char *mpoa_device_type_string(char type) +{ + switch (type) { + case NON_MPOA: + return "non-MPOA device"; + case MPS: + return "MPS"; + case MPC: + return "MPC"; + case MPS_AND_MPC: + return "both MPS and MPC"; + } + + return "unspecified (non-MPOA) device"; +} + +/* + * lec device calls this via its netdev_priv(dev)->lane2_ops + * ->associate_indicator() when it sees a TLV in LE_ARP packet. + * We fill in the pointer above when we see a LANE2 lec initializing + * See LANE2 spec 3.1.5 + * + * Quite a big and ugly function but when you look at it + * all it does is to try to locate and parse MPOA Device + * Type TLV. + * We give our lec a pointer to this function and when the + * lec sees a TLV it uses the pointer to call this function. + * + */ +static void lane2_assoc_ind(struct net_device *dev, const u8 *mac_addr, + const u8 *tlvs, u32 sizeoftlvs) +{ + uint32_t type; + uint8_t length, mpoa_device_type, number_of_mps_macs; + const uint8_t *end_of_tlvs; + struct mpoa_client *mpc; + + mpoa_device_type = number_of_mps_macs = 0; /* silence gcc */ + dprintk("(%s) received TLV(s), ", dev->name); + dprintk("total length of all TLVs %d\n", sizeoftlvs); + mpc = find_mpc_by_lec(dev); /* Sampo-Fix: moved here from below */ + if (mpc == NULL) { + pr_info("(%s) no mpc\n", dev->name); + return; + } + end_of_tlvs = tlvs + sizeoftlvs; + while (end_of_tlvs - tlvs >= 5) { + type = ((tlvs[0] << 24) | (tlvs[1] << 16) | + (tlvs[2] << 8) | tlvs[3]); + length = tlvs[4]; + tlvs += 5; + dprintk(" type 0x%x length %02x\n", type, length); + if (tlvs + length > end_of_tlvs) { + pr_info("TLV value extends past its buffer, aborting parse\n"); + return; + } + + if (type == 0) { + pr_info("mpoa: (%s) TLV type was 0, returning\n", + dev->name); + return; + } + + if (type != TLV_MPOA_DEVICE_TYPE) { + tlvs += length; + continue; /* skip other TLVs */ + } + mpoa_device_type = *tlvs++; + number_of_mps_macs = *tlvs++; + dprintk("(%s) MPOA device type '%s', ", + dev->name, mpoa_device_type_string(mpoa_device_type)); + if (mpoa_device_type == MPS_AND_MPC && + length < (42 + number_of_mps_macs*ETH_ALEN)) { /* :) */ + pr_info("(%s) short MPOA Device Type TLV\n", + dev->name); + continue; + } + if ((mpoa_device_type == MPS || mpoa_device_type == MPC) && + length < 22 + number_of_mps_macs*ETH_ALEN) { + pr_info("(%s) short MPOA Device Type TLV\n", dev->name); + continue; + } + if (mpoa_device_type != MPS && + mpoa_device_type != MPS_AND_MPC) { + dprintk("ignoring non-MPS device "); + if (mpoa_device_type == MPC) + tlvs += 20; + continue; /* we are only interested in MPSs */ + } + if (number_of_mps_macs == 0 && + mpoa_device_type == MPS_AND_MPC) { + pr_info("(%s) MPS_AND_MPC has zero MACs\n", dev->name); + continue; /* someone should read the spec */ + } + dprintk_cont("this MPS has %d MAC addresses\n", + number_of_mps_macs); + + /* + * ok, now we can go and tell our daemon + * the control address of MPS + */ + send_set_mps_ctrl_addr(tlvs, mpc); + + tlvs = copy_macs(mpc, mac_addr, tlvs, + number_of_mps_macs, mpoa_device_type); + if (tlvs == NULL) + return; + } + if (end_of_tlvs - tlvs != 0) + pr_info("(%s) ignoring %Zd bytes of trailing TLV garbage\n", + dev->name, end_of_tlvs - tlvs); +} + +/* + * Store at least advertizing router's MAC address + * plus the possible MAC address(es) to mpc->mps_macs. + * For a freshly allocated MPOA client mpc->mps_macs == 0. 
+ */ +static const uint8_t *copy_macs(struct mpoa_client *mpc, + const uint8_t *router_mac, + const uint8_t *tlvs, uint8_t mps_macs, + uint8_t device_type) +{ + int num_macs; + num_macs = (mps_macs > 1) ? mps_macs : 1; + + if (mpc->number_of_mps_macs != num_macs) { /* need to reallocate? */ + if (mpc->number_of_mps_macs != 0) + kfree(mpc->mps_macs); + mpc->number_of_mps_macs = 0; + mpc->mps_macs = kmalloc(num_macs * ETH_ALEN, GFP_KERNEL); + if (mpc->mps_macs == NULL) { + pr_info("(%s) out of mem\n", mpc->dev->name); + return NULL; + } + } + memcpy(mpc->mps_macs, router_mac, ETH_ALEN); + tlvs += 20; if (device_type == MPS_AND_MPC) tlvs += 20; + if (mps_macs > 0) + memcpy(mpc->mps_macs, tlvs, mps_macs*ETH_ALEN); + tlvs += mps_macs*ETH_ALEN; + mpc->number_of_mps_macs = num_macs; + + return tlvs; +} + +static int send_via_shortcut(struct sk_buff *skb, struct mpoa_client *mpc) +{ + in_cache_entry *entry; + struct iphdr *iph; + char *buff; + __be32 ipaddr = 0; + + static struct { + struct llc_snap_hdr hdr; + __be32 tag; + } tagged_llc_snap_hdr = { + {0xaa, 0xaa, 0x03, {0x00, 0x00, 0x00}, {0x88, 0x4c}}, + 0 + }; + + buff = skb->data + mpc->dev->hard_header_len; + iph = (struct iphdr *)buff; + ipaddr = iph->daddr; + + ddprintk("(%s) ipaddr 0x%x\n", + mpc->dev->name, ipaddr); + + entry = mpc->in_ops->get(ipaddr, mpc); + if (entry == NULL) { + entry = mpc->in_ops->add_entry(ipaddr, mpc); + if (entry != NULL) + mpc->in_ops->put(entry); + return 1; + } + /* threshold not exceeded or VCC not ready */ + if (mpc->in_ops->cache_hit(entry, mpc) != OPEN) { + ddprintk("(%s) cache_hit: returns != OPEN\n", + mpc->dev->name); + mpc->in_ops->put(entry); + return 1; + } + + ddprintk("(%s) using shortcut\n", + mpc->dev->name); + /* MPOA spec A.1.4, MPOA client must decrement IP ttl at least by one */ + if (iph->ttl <= 1) { + ddprintk("(%s) IP ttl = %u, using LANE\n", + mpc->dev->name, iph->ttl); + mpc->in_ops->put(entry); + return 1; + } + iph->ttl--; + iph->check = 0; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + + if (entry->ctrl_info.tag != 0) { + ddprintk("(%s) adding tag 0x%x\n", + mpc->dev->name, entry->ctrl_info.tag); + tagged_llc_snap_hdr.tag = entry->ctrl_info.tag; + skb_pull(skb, ETH_HLEN); /* get rid of Eth header */ + skb_push(skb, sizeof(tagged_llc_snap_hdr)); + /* add LLC/SNAP header */ + skb_copy_to_linear_data(skb, &tagged_llc_snap_hdr, + sizeof(tagged_llc_snap_hdr)); + } else { + skb_pull(skb, ETH_HLEN); /* get rid of Eth header */ + skb_push(skb, sizeof(struct llc_snap_hdr)); + /* add LLC/SNAP header + tag */ + skb_copy_to_linear_data(skb, &llc_snap_mpoa_data, + sizeof(struct llc_snap_hdr)); + } + + atomic_add(skb->truesize, &sk_atm(entry->shortcut)->sk_wmem_alloc); + ATM_SKB(skb)->atm_options = entry->shortcut->atm_options; + entry->shortcut->send(entry->shortcut, skb); + entry->packets_fwded++; + mpc->in_ops->put(entry); + + return 0; +} + +/* + * Probably needs some error checks and locking, not sure... + */ +static netdev_tx_t mpc_send_packet(struct sk_buff *skb, + struct net_device *dev) +{ + struct mpoa_client *mpc; + struct ethhdr *eth; + int i = 0; + + mpc = find_mpc_by_lec(dev); /* this should NEVER fail */ + if (mpc == NULL) { + pr_info("(%s) no MPC found\n", dev->name); + goto non_ip; + } + + eth = (struct ethhdr *)skb->data; + if (eth->h_proto != htons(ETH_P_IP)) + goto non_ip; /* Multi-Protocol Over ATM :-) */ + + /* Weed out funny packets (e.g., AF_PACKET or raw). 
*/ + if (skb->len < ETH_HLEN + sizeof(struct iphdr)) + goto non_ip; + skb_set_network_header(skb, ETH_HLEN); + if (skb->len < ETH_HLEN + ip_hdr(skb)->ihl * 4 || ip_hdr(skb)->ihl < 5) + goto non_ip; + + while (i < mpc->number_of_mps_macs) { + if (!compare_ether_addr(eth->h_dest, + (mpc->mps_macs + i*ETH_ALEN))) + if (send_via_shortcut(skb, mpc) == 0) /* try shortcut */ + return NETDEV_TX_OK; + i++; + } + +non_ip: + return mpc->old_ops->ndo_start_xmit(skb, dev); +} + +static int atm_mpoa_vcc_attach(struct atm_vcc *vcc, void __user *arg) +{ + int bytes_left; + struct mpoa_client *mpc; + struct atmmpc_ioc ioc_data; + in_cache_entry *in_entry; + __be32 ipaddr; + + bytes_left = copy_from_user(&ioc_data, arg, sizeof(struct atmmpc_ioc)); + if (bytes_left != 0) { + pr_info("mpoa:Short read (missed %d bytes) from userland\n", + bytes_left); + return -EFAULT; + } + ipaddr = ioc_data.ipaddr; + if (ioc_data.dev_num < 0 || ioc_data.dev_num >= MAX_LEC_ITF) + return -EINVAL; + + mpc = find_mpc_by_itfnum(ioc_data.dev_num); + if (mpc == NULL) + return -EINVAL; + + if (ioc_data.type == MPC_SOCKET_INGRESS) { + in_entry = mpc->in_ops->get(ipaddr, mpc); + if (in_entry == NULL || + in_entry->entry_state < INGRESS_RESOLVED) { + pr_info("(%s) did not find RESOLVED entry from ingress cache\n", + mpc->dev->name); + if (in_entry != NULL) + mpc->in_ops->put(in_entry); + return -EINVAL; + } + pr_info("(%s) attaching ingress SVC, entry = %pI4\n", + mpc->dev->name, &in_entry->ctrl_info.in_dst_ip); + in_entry->shortcut = vcc; + mpc->in_ops->put(in_entry); + } else { + pr_info("(%s) attaching egress SVC\n", mpc->dev->name); + } + + vcc->proto_data = mpc->dev; + vcc->push = mpc_push; + + return 0; +} + +/* + * + */ +static void mpc_vcc_close(struct atm_vcc *vcc, struct net_device *dev) +{ + struct mpoa_client *mpc; + in_cache_entry *in_entry; + eg_cache_entry *eg_entry; + + mpc = find_mpc_by_lec(dev); + if (mpc == NULL) { + pr_info("(%s) close for unknown MPC\n", dev->name); + return; + } + + dprintk("(%s)\n", dev->name); + in_entry = mpc->in_ops->get_by_vcc(vcc, mpc); + if (in_entry) { + dprintk("(%s) ingress SVC closed ip = %pI4\n", + mpc->dev->name, &in_entry->ctrl_info.in_dst_ip); + in_entry->shortcut = NULL; + mpc->in_ops->put(in_entry); + } + eg_entry = mpc->eg_ops->get_by_vcc(vcc, mpc); + if (eg_entry) { + dprintk("(%s) egress SVC closed\n", mpc->dev->name); + eg_entry->shortcut = NULL; + mpc->eg_ops->put(eg_entry); + } + + if (in_entry == NULL && eg_entry == NULL) + dprintk("(%s) unused vcc closed\n", dev->name); +} + +static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb) +{ + struct net_device *dev = (struct net_device *)vcc->proto_data; + struct sk_buff *new_skb; + eg_cache_entry *eg; + struct mpoa_client *mpc; + __be32 tag; + char *tmp; + + ddprintk("(%s)\n", dev->name); + if (skb == NULL) { + dprintk("(%s) null skb, closing VCC\n", dev->name); + mpc_vcc_close(vcc, dev); + return; + } + + skb->dev = dev; + if (memcmp(skb->data, &llc_snap_mpoa_ctrl, + sizeof(struct llc_snap_hdr)) == 0) { + struct sock *sk = sk_atm(vcc); + + dprintk("(%s) control packet arrived\n", dev->name); + /* Pass control packets to daemon */ + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); + return; + } + + /* data coming over the shortcut */ + atm_return(vcc, skb->truesize); + + mpc = find_mpc_by_lec(dev); + if (mpc == NULL) { + pr_info("(%s) unknown MPC\n", dev->name); + return; + } + + if (memcmp(skb->data, &llc_snap_mpoa_data_tagged, + sizeof(struct llc_snap_hdr)) == 0) { /* MPOA tagged data */ + 
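
[Note on the receive path that starts here: mpc_push() demultiplexes everything arriving on a shortcut purely by its 8-byte LLC/SNAP prefix -- OUI 00-00-5e / PID 0x0003 is an MPOA control PDU handed to mpoad, OUI 00-00-00 / PID 0x884c is tagged IP data, and plain 0x0800 IP is refused because this client always hands out tags. The self-contained classifier below is only an illustration of that matching; the helper name and enum are not part of the patch.]

	#include <stdint.h>
	#include <string.h>

	enum mpoa_pdu { PDU_CONTROL, PDU_TAGGED_DATA, PDU_UNTAGGED_DATA, PDU_UNKNOWN };

	static enum mpoa_pdu classify_llc_snap(const uint8_t *p, size_t len)
	{
		static const uint8_t ctrl[8]   = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x5e, 0x00, 0x03 };
		static const uint8_t tagged[8] = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00, 0x88, 0x4c };
		static const uint8_t data[8]   = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00, 0x08, 0x00 };

		if (len < 8)
			return PDU_UNKNOWN;
		if (!memcmp(p, ctrl, 8))
			return PDU_CONTROL;        /* queued to the mpoad control socket */
		if (!memcmp(p, tagged, 8))
			return PDU_TAGGED_DATA;    /* 32-bit tag follows, then the IP packet */
		if (!memcmp(p, data, 8))
			return PDU_UNTAGGED_DATA;  /* dropped: "Unsupported non-tagged data" */
		return PDU_UNKNOWN;                /* "garbage arrived, purging" */
	}
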
ddprintk("(%s) tagged data packet arrived\n", dev->name); + + } else if (memcmp(skb->data, &llc_snap_mpoa_data, + sizeof(struct llc_snap_hdr)) == 0) { /* MPOA data */ + pr_info("(%s) Unsupported non-tagged data packet arrived. Purging\n", + dev->name); + dev_kfree_skb_any(skb); + return; + } else { + pr_info("(%s) garbage arrived, purging\n", dev->name); + dev_kfree_skb_any(skb); + return; + } + + tmp = skb->data + sizeof(struct llc_snap_hdr); + tag = *(__be32 *)tmp; + + eg = mpc->eg_ops->get_by_tag(tag, mpc); + if (eg == NULL) { + pr_info("mpoa: (%s) Didn't find egress cache entry, tag = %u\n", + dev->name, tag); + purge_egress_shortcut(vcc, NULL); + dev_kfree_skb_any(skb); + return; + } + + /* + * See if ingress MPC is using shortcut we opened as a return channel. + * This means we have a bi-directional vcc opened by us. + */ + if (eg->shortcut == NULL) { + eg->shortcut = vcc; + pr_info("(%s) egress SVC in use\n", dev->name); + } + + skb_pull(skb, sizeof(struct llc_snap_hdr) + sizeof(tag)); + /* get rid of LLC/SNAP header */ + new_skb = skb_realloc_headroom(skb, eg->ctrl_info.DH_length); + /* LLC/SNAP is shorter than MAC header :( */ + dev_kfree_skb_any(skb); + if (new_skb == NULL) { + mpc->eg_ops->put(eg); + return; + } + skb_push(new_skb, eg->ctrl_info.DH_length); /* add MAC header */ + skb_copy_to_linear_data(new_skb, eg->ctrl_info.DLL_header, + eg->ctrl_info.DH_length); + new_skb->protocol = eth_type_trans(new_skb, dev); + skb_reset_network_header(new_skb); + + eg->latest_ip_addr = ip_hdr(new_skb)->saddr; + eg->packets_rcvd++; + mpc->eg_ops->put(eg); + + memset(ATM_SKB(new_skb), 0, sizeof(struct atm_skb_data)); + netif_rx(new_skb); +} + +static struct atmdev_ops mpc_ops = { /* only send is required */ + .close = mpoad_close, + .send = msg_from_mpoad +}; + +static struct atm_dev mpc_dev = { + .ops = &mpc_ops, + .type = "mpc", + .number = 42, + .lock = __SPIN_LOCK_UNLOCKED(mpc_dev.lock) + /* members not explicitly initialised will be 0 */ +}; + +static int atm_mpoa_mpoad_attach(struct atm_vcc *vcc, int arg) +{ + struct mpoa_client *mpc; + struct lec_priv *priv; + int err; + + if (mpcs == NULL) { + init_timer(&mpc_timer); + mpc_timer_refresh(); + + /* This lets us now how our LECs are doing */ + err = register_netdevice_notifier(&mpoa_notifier); + if (err < 0) { + del_timer(&mpc_timer); + return err; + } + } + + mpc = find_mpc_by_itfnum(arg); + if (mpc == NULL) { + dprintk("allocating new mpc for itf %d\n", arg); + mpc = alloc_mpc(); + if (mpc == NULL) + return -ENOMEM; + mpc->dev_num = arg; + mpc->dev = find_lec_by_itfnum(arg); + /* NULL if there was no lec */ + } + if (mpc->mpoad_vcc) { + pr_info("mpoad is already present for itf %d\n", arg); + return -EADDRINUSE; + } + + if (mpc->dev) { /* check if the lec is LANE2 capable */ + priv = netdev_priv(mpc->dev); + if (priv->lane_version < 2) { + dev_put(mpc->dev); + mpc->dev = NULL; + } else + priv->lane2_ops->associate_indicator = lane2_assoc_ind; + } + + mpc->mpoad_vcc = vcc; + vcc->dev = &mpc_dev; + vcc_insert_socket(sk_atm(vcc)); + set_bit(ATM_VF_META, &vcc->flags); + set_bit(ATM_VF_READY, &vcc->flags); + + if (mpc->dev) { + char empty[ATM_ESA_LEN]; + memset(empty, 0, ATM_ESA_LEN); + + start_mpc(mpc, mpc->dev); + /* set address if mpcd e.g. gets killed and restarted. 
+ * If we do not do it now we have to wait for the next LE_ARP + */ + if (memcmp(mpc->mps_ctrl_addr, empty, ATM_ESA_LEN) != 0) + send_set_mps_ctrl_addr(mpc->mps_ctrl_addr, mpc); + } + + __module_get(THIS_MODULE); + return arg; +} + +static void send_set_mps_ctrl_addr(const char *addr, struct mpoa_client *mpc) +{ + struct k_message mesg; + + memcpy(mpc->mps_ctrl_addr, addr, ATM_ESA_LEN); + + mesg.type = SET_MPS_CTRL_ADDR; + memcpy(mesg.MPS_ctrl, addr, ATM_ESA_LEN); + msg_to_mpoad(&mesg, mpc); +} + +static void mpoad_close(struct atm_vcc *vcc) +{ + struct mpoa_client *mpc; + struct sk_buff *skb; + + mpc = find_mpc_by_vcc(vcc); + if (mpc == NULL) { + pr_info("did not find MPC\n"); + return; + } + if (!mpc->mpoad_vcc) { + pr_info("close for non-present mpoad\n"); + return; + } + + mpc->mpoad_vcc = NULL; + if (mpc->dev) { + struct lec_priv *priv = netdev_priv(mpc->dev); + priv->lane2_ops->associate_indicator = NULL; + stop_mpc(mpc); + dev_put(mpc->dev); + } + + mpc->in_ops->destroy_cache(mpc); + mpc->eg_ops->destroy_cache(mpc); + + while ((skb = skb_dequeue(&sk_atm(vcc)->sk_receive_queue))) { + atm_return(vcc, skb->truesize); + kfree_skb(skb); + } + + pr_info("(%s) going down\n", + (mpc->dev) ? mpc->dev->name : ""); + module_put(THIS_MODULE); +} + +/* + * + */ +static int msg_from_mpoad(struct atm_vcc *vcc, struct sk_buff *skb) +{ + + struct mpoa_client *mpc = find_mpc_by_vcc(vcc); + struct k_message *mesg = (struct k_message *)skb->data; + atomic_sub(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc); + + if (mpc == NULL) { + pr_info("no mpc found\n"); + return 0; + } + dprintk("(%s)", mpc->dev ? mpc->dev->name : ""); + switch (mesg->type) { + case MPOA_RES_REPLY_RCVD: + dprintk_cont("mpoa_res_reply_rcvd\n"); + MPOA_res_reply_rcvd(mesg, mpc); + break; + case MPOA_TRIGGER_RCVD: + dprintk_cont("mpoa_trigger_rcvd\n"); + MPOA_trigger_rcvd(mesg, mpc); + break; + case INGRESS_PURGE_RCVD: + dprintk_cont("nhrp_purge_rcvd\n"); + ingress_purge_rcvd(mesg, mpc); + break; + case EGRESS_PURGE_RCVD: + dprintk_cont("egress_purge_reply_rcvd\n"); + egress_purge_rcvd(mesg, mpc); + break; + case MPS_DEATH: + dprintk_cont("mps_death\n"); + mps_death(mesg, mpc); + break; + case CACHE_IMPOS_RCVD: + dprintk_cont("cache_impos_rcvd\n"); + MPOA_cache_impos_rcvd(mesg, mpc); + break; + case SET_MPC_CTRL_ADDR: + dprintk_cont("set_mpc_ctrl_addr\n"); + set_mpc_ctrl_addr_rcvd(mesg, mpc); + break; + case SET_MPS_MAC_ADDR: + dprintk_cont("set_mps_mac_addr\n"); + set_mps_mac_addr_rcvd(mesg, mpc); + break; + case CLEAN_UP_AND_EXIT: + dprintk_cont("clean_up_and_exit\n"); + clean_up(mesg, mpc, DIE); + break; + case RELOAD: + dprintk_cont("reload\n"); + clean_up(mesg, mpc, RELOAD); + break; + case SET_MPC_PARAMS: + dprintk_cont("set_mpc_params\n"); + mpc->parameters = mesg->content.params; + break; + default: + dprintk_cont("unknown message %d\n", mesg->type); + break; + } + kfree_skb(skb); + + return 0; +} + +/* Remember that this function may not do things that sleep */ +int msg_to_mpoad(struct k_message *mesg, struct mpoa_client *mpc) +{ + struct sk_buff *skb; + struct sock *sk; + + if (mpc == NULL || !mpc->mpoad_vcc) { + pr_info("mesg %d to a non-existent mpoad\n", mesg->type); + return -ENXIO; + } + + skb = alloc_skb(sizeof(struct k_message), GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + skb_put(skb, sizeof(struct k_message)); + skb_copy_to_linear_data(skb, mesg, sizeof(*mesg)); + atm_force_charge(mpc->mpoad_vcc, skb->truesize); + + sk = sk_atm(mpc->mpoad_vcc); + skb_queue_tail(&sk->sk_receive_queue, skb); + 
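
[Note on the control channel used here: both directions carry nothing but fixed-size struct k_message frames -- msg_from_mpoad() above switches on mesg->type for daemon-to-kernel traffic, and msg_to_mpoad() queues one frame per message on the control VCC for the daemon to pick up. Assuming the daemon simply reads whole frames from the socket it attached with ATMMPC_CTRL (a sketch of the idea, not the actual mpcd source), its receive loop reduces to something like:]

	#include <unistd.h>
	#include <linux/atmmpc.h>	/* struct k_message, SND_MPOA_RES_RQST, ... */

	static void mpcd_receive_loop(int mpc_fd)
	{
		struct k_message msg;

		/* one read() per fixed-size frame queued by msg_to_mpoad() */
		while (read(mpc_fd, &msg, sizeof(msg)) == (ssize_t)sizeof(msg)) {
			switch (msg.type) {
			case SND_MPOA_RES_RQST:
				/* ask the MPS to resolve msg.content.in_info.in_dst_ip */
				break;
			case SND_EGRESS_PURGE:
				/* send an NHRP purge for msg.content.eg_info */
				break;
			default:
				break;
			}
		}
	}
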
sk->sk_data_ready(sk, skb->len); + + return 0; +} + +static int mpoa_event_listener(struct notifier_block *mpoa_notifier, + unsigned long event, void *dev_ptr) +{ + struct net_device *dev; + struct mpoa_client *mpc; + struct lec_priv *priv; + + dev = (struct net_device *)dev_ptr; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + if (dev->name == NULL || strncmp(dev->name, "lec", 3)) + return NOTIFY_DONE; /* we are only interested in lec:s */ + + switch (event) { + case NETDEV_REGISTER: /* a new lec device was allocated */ + priv = netdev_priv(dev); + if (priv->lane_version < 2) + break; + priv->lane2_ops->associate_indicator = lane2_assoc_ind; + mpc = find_mpc_by_itfnum(priv->itfnum); + if (mpc == NULL) { + dprintk("allocating new mpc for %s\n", dev->name); + mpc = alloc_mpc(); + if (mpc == NULL) { + pr_info("no new mpc"); + break; + } + } + mpc->dev_num = priv->itfnum; + mpc->dev = dev; + dev_hold(dev); + dprintk("(%s) was initialized\n", dev->name); + break; + case NETDEV_UNREGISTER: + /* the lec device was deallocated */ + mpc = find_mpc_by_lec(dev); + if (mpc == NULL) + break; + dprintk("device (%s) was deallocated\n", dev->name); + stop_mpc(mpc); + dev_put(mpc->dev); + mpc->dev = NULL; + break; + case NETDEV_UP: + /* the dev was ifconfig'ed up */ + mpc = find_mpc_by_lec(dev); + if (mpc == NULL) + break; + if (mpc->mpoad_vcc != NULL) + start_mpc(mpc, dev); + break; + case NETDEV_DOWN: + /* the dev was ifconfig'ed down */ + /* this means that the flow of packets from the + * upper layer stops + */ + mpc = find_mpc_by_lec(dev); + if (mpc == NULL) + break; + if (mpc->mpoad_vcc != NULL) + stop_mpc(mpc); + break; + case NETDEV_REBOOT: + case NETDEV_CHANGE: + case NETDEV_CHANGEMTU: + case NETDEV_CHANGEADDR: + case NETDEV_GOING_DOWN: + break; + default: + break; + } + + return NOTIFY_DONE; +} + +/* + * Functions which are called after a message is received from mpcd. + * Msg is reused on purpose. + */ + + +static void MPOA_trigger_rcvd(struct k_message *msg, struct mpoa_client *mpc) +{ + __be32 dst_ip = msg->content.in_info.in_dst_ip; + in_cache_entry *entry; + + entry = mpc->in_ops->get(dst_ip, mpc); + if (entry == NULL) { + entry = mpc->in_ops->add_entry(dst_ip, mpc); + entry->entry_state = INGRESS_RESOLVING; + msg->type = SND_MPOA_RES_RQST; + msg->content.in_info = entry->ctrl_info; + msg_to_mpoad(msg, mpc); + do_gettimeofday(&(entry->reply_wait)); + mpc->in_ops->put(entry); + return; + } + + if (entry->entry_state == INGRESS_INVALID) { + entry->entry_state = INGRESS_RESOLVING; + msg->type = SND_MPOA_RES_RQST; + msg->content.in_info = entry->ctrl_info; + msg_to_mpoad(msg, mpc); + do_gettimeofday(&(entry->reply_wait)); + mpc->in_ops->put(entry); + return; + } + + pr_info("(%s) entry already in resolving state\n", + (mpc->dev) ? mpc->dev->name : ""); + mpc->in_ops->put(entry); +} + +/* + * Things get complicated because we have to check if there's an egress + * shortcut with suitable traffic parameters we could use. + */ +static void check_qos_and_open_shortcut(struct k_message *msg, + struct mpoa_client *client, + in_cache_entry *entry) +{ + __be32 dst_ip = msg->content.in_info.in_dst_ip; + struct atm_mpoa_qos *qos = atm_mpoa_search_qos(dst_ip); + eg_cache_entry *eg_entry = client->eg_ops->get_by_src_ip(dst_ip, client); + + if (eg_entry && eg_entry->shortcut) { + if (eg_entry->shortcut->qos.txtp.traffic_class & + msg->qos.txtp.traffic_class & + (qos ? 
qos->qos.txtp.traffic_class : ATM_UBR | ATM_CBR)) { + if (eg_entry->shortcut->qos.txtp.traffic_class == ATM_UBR) + entry->shortcut = eg_entry->shortcut; + else if (eg_entry->shortcut->qos.txtp.max_pcr > 0) + entry->shortcut = eg_entry->shortcut; + } + if (entry->shortcut) { + dprintk("(%s) using egress SVC to reach %pI4\n", + client->dev->name, &dst_ip); + client->eg_ops->put(eg_entry); + return; + } + } + if (eg_entry != NULL) + client->eg_ops->put(eg_entry); + + /* No luck in the egress cache we must open an ingress SVC */ + msg->type = OPEN_INGRESS_SVC; + if (qos && + (qos->qos.txtp.traffic_class == msg->qos.txtp.traffic_class)) { + msg->qos = qos->qos; + pr_info("(%s) trying to get a CBR shortcut\n", + client->dev->name); + } else + memset(&msg->qos, 0, sizeof(struct atm_qos)); + msg_to_mpoad(msg, client); +} + +static void MPOA_res_reply_rcvd(struct k_message *msg, struct mpoa_client *mpc) +{ + __be32 dst_ip = msg->content.in_info.in_dst_ip; + in_cache_entry *entry = mpc->in_ops->get(dst_ip, mpc); + + dprintk("(%s) ip %pI4\n", + mpc->dev->name, &dst_ip); + ddprintk("(%s) entry = %p", + mpc->dev->name, entry); + if (entry == NULL) { + pr_info("(%s) ARGH, received res. reply for an entry that doesn't exist.\n", + mpc->dev->name); + return; + } + ddprintk_cont(" entry_state = %d ", entry->entry_state); + + if (entry->entry_state == INGRESS_RESOLVED) { + pr_info("(%s) RESOLVED entry!\n", mpc->dev->name); + mpc->in_ops->put(entry); + return; + } + + entry->ctrl_info = msg->content.in_info; + do_gettimeofday(&(entry->tv)); + do_gettimeofday(&(entry->reply_wait)); /* Used in refreshing func from now on */ + entry->refresh_time = 0; + ddprintk_cont("entry->shortcut = %p\n", entry->shortcut); + + if (entry->entry_state == INGRESS_RESOLVING && + entry->shortcut != NULL) { + entry->entry_state = INGRESS_RESOLVED; + mpc->in_ops->put(entry); + return; /* Shortcut already open... 
*/ + } + + if (entry->shortcut != NULL) { + pr_info("(%s) entry->shortcut != NULL, impossible!\n", + mpc->dev->name); + mpc->in_ops->put(entry); + return; + } + + check_qos_and_open_shortcut(msg, mpc, entry); + entry->entry_state = INGRESS_RESOLVED; + mpc->in_ops->put(entry); + + return; + +} + +static void ingress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc) +{ + __be32 dst_ip = msg->content.in_info.in_dst_ip; + __be32 mask = msg->ip_mask; + in_cache_entry *entry = mpc->in_ops->get_with_mask(dst_ip, mpc, mask); + + if (entry == NULL) { + pr_info("(%s) purge for a non-existing entry, ip = %pI4\n", + mpc->dev->name, &dst_ip); + return; + } + + do { + dprintk("(%s) removing an ingress entry, ip = %pI4\n", + mpc->dev->name, &dst_ip); + write_lock_bh(&mpc->ingress_lock); + mpc->in_ops->remove_entry(entry, mpc); + write_unlock_bh(&mpc->ingress_lock); + mpc->in_ops->put(entry); + entry = mpc->in_ops->get_with_mask(dst_ip, mpc, mask); + } while (entry != NULL); +} + +static void egress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc) +{ + __be32 cache_id = msg->content.eg_info.cache_id; + eg_cache_entry *entry = mpc->eg_ops->get_by_cache_id(cache_id, mpc); + + if (entry == NULL) { + dprintk("(%s) purge for a non-existing entry\n", + mpc->dev->name); + return; + } + + write_lock_irq(&mpc->egress_lock); + mpc->eg_ops->remove_entry(entry, mpc); + write_unlock_irq(&mpc->egress_lock); + + mpc->eg_ops->put(entry); +} + +static void purge_egress_shortcut(struct atm_vcc *vcc, eg_cache_entry *entry) +{ + struct sock *sk; + struct k_message *purge_msg; + struct sk_buff *skb; + + dprintk("entering\n"); + if (vcc == NULL) { + pr_info("vcc == NULL\n"); + return; + } + + skb = alloc_skb(sizeof(struct k_message), GFP_ATOMIC); + if (skb == NULL) { + pr_info("out of memory\n"); + return; + } + + skb_put(skb, sizeof(struct k_message)); + memset(skb->data, 0, sizeof(struct k_message)); + purge_msg = (struct k_message *)skb->data; + purge_msg->type = DATA_PLANE_PURGE; + if (entry != NULL) + purge_msg->content.eg_info = entry->ctrl_info; + + atm_force_charge(vcc, skb->truesize); + + sk = sk_atm(vcc); + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); + dprintk("exiting\n"); +} + +/* + * Our MPS died. Tell our daemon to send NHRP data plane purge to each + * of the egress shortcuts we have. 
+ */ +static void mps_death(struct k_message *msg, struct mpoa_client *mpc) +{ + eg_cache_entry *entry; + + dprintk("(%s)\n", mpc->dev->name); + + if (memcmp(msg->MPS_ctrl, mpc->mps_ctrl_addr, ATM_ESA_LEN)) { + pr_info("(%s) wrong MPS\n", mpc->dev->name); + return; + } + + /* FIXME: This knows too much of the cache structure */ + read_lock_irq(&mpc->egress_lock); + entry = mpc->eg_cache; + while (entry != NULL) { + purge_egress_shortcut(entry->shortcut, entry); + entry = entry->next; + } + read_unlock_irq(&mpc->egress_lock); + + mpc->in_ops->destroy_cache(mpc); + mpc->eg_ops->destroy_cache(mpc); +} + +static void MPOA_cache_impos_rcvd(struct k_message *msg, + struct mpoa_client *mpc) +{ + uint16_t holding_time; + eg_cache_entry *entry = mpc->eg_ops->get_by_cache_id(msg->content.eg_info.cache_id, mpc); + + holding_time = msg->content.eg_info.holding_time; + dprintk("(%s) entry = %p, holding_time = %u\n", + mpc->dev->name, entry, holding_time); + if (entry == NULL && holding_time) { + entry = mpc->eg_ops->add_entry(msg, mpc); + mpc->eg_ops->put(entry); + return; + } + if (holding_time) { + mpc->eg_ops->update(entry, holding_time); + return; + } + + write_lock_irq(&mpc->egress_lock); + mpc->eg_ops->remove_entry(entry, mpc); + write_unlock_irq(&mpc->egress_lock); + + mpc->eg_ops->put(entry); +} + +static void set_mpc_ctrl_addr_rcvd(struct k_message *mesg, + struct mpoa_client *mpc) +{ + struct lec_priv *priv; + int i, retval ; + + uint8_t tlv[4 + 1 + 1 + 1 + ATM_ESA_LEN]; + + tlv[0] = 00; tlv[1] = 0xa0; tlv[2] = 0x3e; tlv[3] = 0x2a; /* type */ + tlv[4] = 1 + 1 + ATM_ESA_LEN; /* length */ + tlv[5] = 0x02; /* MPOA client */ + tlv[6] = 0x00; /* number of MPS MAC addresses */ + + memcpy(&tlv[7], mesg->MPS_ctrl, ATM_ESA_LEN); /* MPC ctrl ATM addr */ + memcpy(mpc->our_ctrl_addr, mesg->MPS_ctrl, ATM_ESA_LEN); + + dprintk("(%s) setting MPC ctrl ATM address to", + mpc->dev ? 
mpc->dev->name : ""); + for (i = 7; i < sizeof(tlv); i++) + dprintk_cont(" %02x", tlv[i]); + dprintk_cont("\n"); + + if (mpc->dev) { + priv = netdev_priv(mpc->dev); + retval = priv->lane2_ops->associate_req(mpc->dev, + mpc->dev->dev_addr, + tlv, sizeof(tlv)); + if (retval == 0) + pr_info("(%s) MPOA device type TLV association failed\n", + mpc->dev->name); + retval = priv->lane2_ops->resolve(mpc->dev, NULL, 1, NULL, NULL); + if (retval < 0) + pr_info("(%s) targetless LE_ARP request failed\n", + mpc->dev->name); + } +} + +static void set_mps_mac_addr_rcvd(struct k_message *msg, + struct mpoa_client *client) +{ + + if (client->number_of_mps_macs) + kfree(client->mps_macs); + client->number_of_mps_macs = 0; + client->mps_macs = kmemdup(msg->MPS_ctrl, ETH_ALEN, GFP_KERNEL); + if (client->mps_macs == NULL) { + pr_info("out of memory\n"); + return; + } + client->number_of_mps_macs = 1; +} + +/* + * purge egress cache and tell daemon to 'action' (DIE, RELOAD) + */ +static void clean_up(struct k_message *msg, struct mpoa_client *mpc, int action) +{ + + eg_cache_entry *entry; + msg->type = SND_EGRESS_PURGE; + + + /* FIXME: This knows too much of the cache structure */ + read_lock_irq(&mpc->egress_lock); + entry = mpc->eg_cache; + while (entry != NULL) { + msg->content.eg_info = entry->ctrl_info; + dprintk("cache_id %u\n", entry->ctrl_info.cache_id); + msg_to_mpoad(msg, mpc); + entry = entry->next; + } + read_unlock_irq(&mpc->egress_lock); + + msg->type = action; + msg_to_mpoad(msg, mpc); +} + +static void mpc_timer_refresh(void) +{ + mpc_timer.expires = jiffies + (MPC_P2 * HZ); + mpc_timer.data = mpc_timer.expires; + mpc_timer.function = mpc_cache_check; + add_timer(&mpc_timer); +} + +static void mpc_cache_check(unsigned long checking_time) +{ + struct mpoa_client *mpc = mpcs; + static unsigned long previous_resolving_check_time; + static unsigned long previous_refresh_time; + + while (mpc != NULL) { + mpc->in_ops->clear_count(mpc); + mpc->eg_ops->clear_expired(mpc); + if (checking_time - previous_resolving_check_time > + mpc->parameters.mpc_p4 * HZ) { + mpc->in_ops->check_resolving(mpc); + previous_resolving_check_time = checking_time; + } + if (checking_time - previous_refresh_time > + mpc->parameters.mpc_p5 * HZ) { + mpc->in_ops->refresh(mpc); + previous_refresh_time = checking_time; + } + mpc = mpc->next; + } + mpc_timer_refresh(); +} + +static int atm_mpoa_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + int err = 0; + struct atm_vcc *vcc = ATM_SD(sock); + + if (cmd != ATMMPC_CTRL && cmd != ATMMPC_DATA) + return -ENOIOCTLCMD; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case ATMMPC_CTRL: + err = atm_mpoa_mpoad_attach(vcc, (int)arg); + if (err >= 0) + sock->state = SS_CONNECTED; + break; + case ATMMPC_DATA: + err = atm_mpoa_vcc_attach(vcc, (void __user *)arg); + break; + default: + break; + } + return err; +} + +static struct atm_ioctl atm_ioctl_ops = { + .owner = THIS_MODULE, + .ioctl = atm_mpoa_ioctl, +}; + +static __init int atm_mpoa_init(void) +{ + register_atm_ioctl(&atm_ioctl_ops); + + if (mpc_proc_init() != 0) + pr_info("failed to initialize /proc/mpoa\n"); + + pr_info("mpc.c: initialized\n"); + + return 0; +} + +static void __exit atm_mpoa_cleanup(void) +{ + struct mpoa_client *mpc, *tmp; + struct atm_mpoa_qos *qos, *nextqos; + struct lec_priv *priv; + + mpc_proc_clean(); + + del_timer(&mpc_timer); + unregister_netdevice_notifier(&mpoa_notifier); + deregister_atm_ioctl(&atm_ioctl_ops); + + mpc = mpcs; + mpcs = NULL; + while (mpc != NULL) { 
+ tmp = mpc->next; + if (mpc->dev != NULL) { + stop_mpc(mpc); + priv = netdev_priv(mpc->dev); + if (priv->lane2_ops != NULL) + priv->lane2_ops->associate_indicator = NULL; + } + ddprintk("about to clear caches\n"); + mpc->in_ops->destroy_cache(mpc); + mpc->eg_ops->destroy_cache(mpc); + ddprintk("caches cleared\n"); + kfree(mpc->mps_macs); + memset(mpc, 0, sizeof(struct mpoa_client)); + ddprintk("about to kfree %p\n", mpc); + kfree(mpc); + ddprintk("next mpc is at %p\n", tmp); + mpc = tmp; + } + + qos = qos_head; + qos_head = NULL; + while (qos != NULL) { + nextqos = qos->next; + dprintk("freeing qos entry %p\n", qos); + kfree(qos); + qos = nextqos; + } +} + +module_init(atm_mpoa_init); +module_exit(atm_mpoa_cleanup); + +MODULE_LICENSE("GPL"); diff --git a/net/atm/mpc.h b/net/atm/mpc.h new file mode 100644 index 00000000..0919a88b --- /dev/null +++ b/net/atm/mpc.h @@ -0,0 +1,64 @@ +#ifndef _MPC_H_ +#define _MPC_H_ + +#include +#include +#include +#include +#include +#include "mpoa_caches.h" + +/* kernel -> mpc-daemon */ +int msg_to_mpoad(struct k_message *msg, struct mpoa_client *mpc); + +struct mpoa_client { + struct mpoa_client *next; + struct net_device *dev; /* lec in question */ + int dev_num; /* e.g. 2 for lec2 */ + + struct atm_vcc *mpoad_vcc; /* control channel to mpoad */ + uint8_t mps_ctrl_addr[ATM_ESA_LEN]; /* MPS control ATM address */ + uint8_t our_ctrl_addr[ATM_ESA_LEN]; /* MPC's control ATM address */ + + rwlock_t ingress_lock; + struct in_cache_ops *in_ops; /* ingress cache operations */ + in_cache_entry *in_cache; /* the ingress cache of this MPC */ + + rwlock_t egress_lock; + struct eg_cache_ops *eg_ops; /* egress cache operations */ + eg_cache_entry *eg_cache; /* the egress cache of this MPC */ + + uint8_t *mps_macs; /* array of MPS MAC addresses, >=1 */ + int number_of_mps_macs; /* number of the above MAC addresses */ + struct mpc_parameters parameters; /* parameters for this client */ + + const struct net_device_ops *old_ops; + struct net_device_ops new_ops; +}; + + +struct atm_mpoa_qos { + struct atm_mpoa_qos *next; + __be32 ipaddr; + struct atm_qos qos; +}; + + +/* MPOA QoS operations */ +struct atm_mpoa_qos *atm_mpoa_add_qos(__be32 dst_ip, struct atm_qos *qos); +struct atm_mpoa_qos *atm_mpoa_search_qos(__be32 dst_ip); +int atm_mpoa_delete_qos(struct atm_mpoa_qos *qos); + +/* Display QoS entries. This is for the procfs */ +struct seq_file; +void atm_mpoa_disp_qos(struct seq_file *m); + +#ifdef CONFIG_PROC_FS +int mpc_proc_init(void); +void mpc_proc_clean(void); +#else +#define mpc_proc_init() (0) +#define mpc_proc_clean() do { } while(0) +#endif + +#endif /* _MPC_H_ */ diff --git a/net/atm/mpoa_caches.c b/net/atm/mpoa_caches.c new file mode 100644 index 00000000..d1b2d9a0 --- /dev/null +++ b/net/atm/mpoa_caches.c @@ -0,0 +1,569 @@ +#include +#include +#include +#include + +#include "mpoa_caches.h" +#include "mpc.h" + +/* + * mpoa_caches.c: Implementation of ingress and egress cache + * handling functions + */ + +#if 0 +#define dprintk(format, args...) \ + printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args) /* debug */ +#else +#define dprintk(format, args...) \ + do { if (0) \ + printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args);\ + } while (0) +#endif + +#if 0 +#define ddprintk(format, args...) \ + printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args) /* debug */ +#else +#define ddprintk(format, args...) 
\ + do { if (0) \ + printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args);\ + } while (0) +#endif + +static in_cache_entry *in_cache_get(__be32 dst_ip, + struct mpoa_client *client) +{ + in_cache_entry *entry; + + read_lock_bh(&client->ingress_lock); + entry = client->in_cache; + while (entry != NULL) { + if (entry->ctrl_info.in_dst_ip == dst_ip) { + atomic_inc(&entry->use); + read_unlock_bh(&client->ingress_lock); + return entry; + } + entry = entry->next; + } + read_unlock_bh(&client->ingress_lock); + + return NULL; +} + +static in_cache_entry *in_cache_get_with_mask(__be32 dst_ip, + struct mpoa_client *client, + __be32 mask) +{ + in_cache_entry *entry; + + read_lock_bh(&client->ingress_lock); + entry = client->in_cache; + while (entry != NULL) { + if ((entry->ctrl_info.in_dst_ip & mask) == (dst_ip & mask)) { + atomic_inc(&entry->use); + read_unlock_bh(&client->ingress_lock); + return entry; + } + entry = entry->next; + } + read_unlock_bh(&client->ingress_lock); + + return NULL; + +} + +static in_cache_entry *in_cache_get_by_vcc(struct atm_vcc *vcc, + struct mpoa_client *client) +{ + in_cache_entry *entry; + + read_lock_bh(&client->ingress_lock); + entry = client->in_cache; + while (entry != NULL) { + if (entry->shortcut == vcc) { + atomic_inc(&entry->use); + read_unlock_bh(&client->ingress_lock); + return entry; + } + entry = entry->next; + } + read_unlock_bh(&client->ingress_lock); + + return NULL; +} + +static in_cache_entry *in_cache_add_entry(__be32 dst_ip, + struct mpoa_client *client) +{ + in_cache_entry *entry = kzalloc(sizeof(in_cache_entry), GFP_KERNEL); + + if (entry == NULL) { + pr_info("mpoa: mpoa_caches.c: new_in_cache_entry: out of memory\n"); + return NULL; + } + + dprintk("adding an ingress entry, ip = %pI4\n", &dst_ip); + + atomic_set(&entry->use, 1); + dprintk("new_in_cache_entry: about to lock\n"); + write_lock_bh(&client->ingress_lock); + entry->next = client->in_cache; + entry->prev = NULL; + if (client->in_cache != NULL) + client->in_cache->prev = entry; + client->in_cache = entry; + + memcpy(entry->MPS_ctrl_ATM_addr, client->mps_ctrl_addr, ATM_ESA_LEN); + entry->ctrl_info.in_dst_ip = dst_ip; + do_gettimeofday(&(entry->tv)); + entry->retry_time = client->parameters.mpc_p4; + entry->count = 1; + entry->entry_state = INGRESS_INVALID; + entry->ctrl_info.holding_time = HOLDING_TIME_DEFAULT; + atomic_inc(&entry->use); + + write_unlock_bh(&client->ingress_lock); + dprintk("new_in_cache_entry: unlocked\n"); + + return entry; +} + +static int cache_hit(in_cache_entry *entry, struct mpoa_client *mpc) +{ + struct atm_mpoa_qos *qos; + struct k_message msg; + + entry->count++; + if (entry->entry_state == INGRESS_RESOLVED && entry->shortcut != NULL) + return OPEN; + + if (entry->entry_state == INGRESS_REFRESHING) { + if (entry->count > mpc->parameters.mpc_p1) { + msg.type = SND_MPOA_RES_RQST; + msg.content.in_info = entry->ctrl_info; + memcpy(msg.MPS_ctrl, mpc->mps_ctrl_addr, ATM_ESA_LEN); + qos = atm_mpoa_search_qos(entry->ctrl_info.in_dst_ip); + if (qos != NULL) + msg.qos = qos->qos; + msg_to_mpoad(&msg, mpc); + do_gettimeofday(&(entry->reply_wait)); + entry->entry_state = INGRESS_RESOLVING; + } + if (entry->shortcut != NULL) + return OPEN; + return CLOSED; + } + + if (entry->entry_state == INGRESS_RESOLVING && entry->shortcut != NULL) + return OPEN; + + if (entry->count > mpc->parameters.mpc_p1 && + entry->entry_state == INGRESS_INVALID) { + dprintk("(%s) threshold exceeded for ip %pI4, sending MPOA res req\n", + mpc->dev->name, &entry->ctrl_info.in_dst_ip); + 
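
[Note on the threshold test made just above in cache_hit(): an ingress entry moves INGRESS_INVALID -> INGRESS_RESOLVING -> INGRESS_RESOLVED, and the transition out of INVALID is driven by the per-entry frame counter, which clear_count_and_expired() (further down in this file) zeroes every MPC-p2 seconds. The helper below is an illustration only, not part of the patch; the "10 frames in 1 second" figure is the usual MPOA-spec default carried by MPC_P1/MPC_P2, quoted here from memory.]

	/* Effective trigger condition for sending SND_MPOA_RES_RQST:
	 * "more than mpc_p1 frames to this IP inside one MPC-p2 window"
	 * (10 frames in 1 second with the usual MPOA defaults). */
	static int should_request_shortcut(unsigned int frames_this_window,
					   unsigned int mpc_p1,
					   int entry_is_invalid)
	{
		return entry_is_invalid && frames_this_window > mpc_p1;
	}
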
entry->entry_state = INGRESS_RESOLVING; + msg.type = SND_MPOA_RES_RQST; + memcpy(msg.MPS_ctrl, mpc->mps_ctrl_addr, ATM_ESA_LEN); + msg.content.in_info = entry->ctrl_info; + qos = atm_mpoa_search_qos(entry->ctrl_info.in_dst_ip); + if (qos != NULL) + msg.qos = qos->qos; + msg_to_mpoad(&msg, mpc); + do_gettimeofday(&(entry->reply_wait)); + } + + return CLOSED; +} + +static void in_cache_put(in_cache_entry *entry) +{ + if (atomic_dec_and_test(&entry->use)) { + memset(entry, 0, sizeof(in_cache_entry)); + kfree(entry); + } +} + +/* + * This should be called with write lock on + */ +static void in_cache_remove_entry(in_cache_entry *entry, + struct mpoa_client *client) +{ + struct atm_vcc *vcc; + struct k_message msg; + + vcc = entry->shortcut; + dprintk("removing an ingress entry, ip = %pI4\n", + &entry->ctrl_info.in_dst_ip); + + if (entry->prev != NULL) + entry->prev->next = entry->next; + else + client->in_cache = entry->next; + if (entry->next != NULL) + entry->next->prev = entry->prev; + client->in_ops->put(entry); + if (client->in_cache == NULL && client->eg_cache == NULL) { + msg.type = STOP_KEEP_ALIVE_SM; + msg_to_mpoad(&msg, client); + } + + /* Check if the egress side still uses this VCC */ + if (vcc != NULL) { + eg_cache_entry *eg_entry = client->eg_ops->get_by_vcc(vcc, + client); + if (eg_entry != NULL) { + client->eg_ops->put(eg_entry); + return; + } + vcc_release_async(vcc, -EPIPE); + } +} + +/* Call this every MPC-p2 seconds... Not exactly correct solution, + but an easy one... */ +static void clear_count_and_expired(struct mpoa_client *client) +{ + in_cache_entry *entry, *next_entry; + struct timeval now; + + do_gettimeofday(&now); + + write_lock_bh(&client->ingress_lock); + entry = client->in_cache; + while (entry != NULL) { + entry->count = 0; + next_entry = entry->next; + if ((now.tv_sec - entry->tv.tv_sec) + > entry->ctrl_info.holding_time) { + dprintk("holding time expired, ip = %pI4\n", + &entry->ctrl_info.in_dst_ip); + client->in_ops->remove_entry(entry, client); + } + entry = next_entry; + } + write_unlock_bh(&client->ingress_lock); +} + +/* Call this every MPC-p4 seconds. */ +static void check_resolving_entries(struct mpoa_client *client) +{ + + struct atm_mpoa_qos *qos; + in_cache_entry *entry; + struct timeval now; + struct k_message msg; + + do_gettimeofday(&now); + + read_lock_bh(&client->ingress_lock); + entry = client->in_cache; + while (entry != NULL) { + if (entry->entry_state == INGRESS_RESOLVING) { + if ((now.tv_sec - entry->hold_down.tv_sec) < + client->parameters.mpc_p6) { + entry = entry->next; /* Entry in hold down */ + continue; + } + if ((now.tv_sec - entry->reply_wait.tv_sec) > + entry->retry_time) { + entry->retry_time = MPC_C1 * (entry->retry_time); + /* + * Retry time maximum exceeded, + * put entry in hold down. + */ + if (entry->retry_time > client->parameters.mpc_p5) { + do_gettimeofday(&(entry->hold_down)); + entry->retry_time = client->parameters.mpc_p4; + entry = entry->next; + continue; + } + /* Ask daemon to send a resolution request. */ + memset(&(entry->hold_down), 0, sizeof(struct timeval)); + msg.type = SND_MPOA_RES_RTRY; + memcpy(msg.MPS_ctrl, client->mps_ctrl_addr, ATM_ESA_LEN); + msg.content.in_info = entry->ctrl_info; + qos = atm_mpoa_search_qos(entry->ctrl_info.in_dst_ip); + if (qos != NULL) + msg.qos = qos->qos; + msg_to_mpoad(&msg, client); + do_gettimeofday(&(entry->reply_wait)); + } + } + entry = entry->next; + } + read_unlock_bh(&client->ingress_lock); +} + +/* Call this every MPC-p5 seconds. 
*/ +static void refresh_entries(struct mpoa_client *client) +{ + struct timeval now; + struct in_cache_entry *entry = client->in_cache; + + ddprintk("refresh_entries\n"); + do_gettimeofday(&now); + + read_lock_bh(&client->ingress_lock); + while (entry != NULL) { + if (entry->entry_state == INGRESS_RESOLVED) { + if (!(entry->refresh_time)) + entry->refresh_time = (2 * (entry->ctrl_info.holding_time))/3; + if ((now.tv_sec - entry->reply_wait.tv_sec) > + entry->refresh_time) { + dprintk("refreshing an entry.\n"); + entry->entry_state = INGRESS_REFRESHING; + + } + } + entry = entry->next; + } + read_unlock_bh(&client->ingress_lock); +} + +static void in_destroy_cache(struct mpoa_client *mpc) +{ + write_lock_irq(&mpc->ingress_lock); + while (mpc->in_cache != NULL) + mpc->in_ops->remove_entry(mpc->in_cache, mpc); + write_unlock_irq(&mpc->ingress_lock); +} + +static eg_cache_entry *eg_cache_get_by_cache_id(__be32 cache_id, + struct mpoa_client *mpc) +{ + eg_cache_entry *entry; + + read_lock_irq(&mpc->egress_lock); + entry = mpc->eg_cache; + while (entry != NULL) { + if (entry->ctrl_info.cache_id == cache_id) { + atomic_inc(&entry->use); + read_unlock_irq(&mpc->egress_lock); + return entry; + } + entry = entry->next; + } + read_unlock_irq(&mpc->egress_lock); + + return NULL; +} + +/* This can be called from any context since it saves CPU flags */ +static eg_cache_entry *eg_cache_get_by_tag(__be32 tag, struct mpoa_client *mpc) +{ + unsigned long flags; + eg_cache_entry *entry; + + read_lock_irqsave(&mpc->egress_lock, flags); + entry = mpc->eg_cache; + while (entry != NULL) { + if (entry->ctrl_info.tag == tag) { + atomic_inc(&entry->use); + read_unlock_irqrestore(&mpc->egress_lock, flags); + return entry; + } + entry = entry->next; + } + read_unlock_irqrestore(&mpc->egress_lock, flags); + + return NULL; +} + +/* This can be called from any context since it saves CPU flags */ +static eg_cache_entry *eg_cache_get_by_vcc(struct atm_vcc *vcc, + struct mpoa_client *mpc) +{ + unsigned long flags; + eg_cache_entry *entry; + + read_lock_irqsave(&mpc->egress_lock, flags); + entry = mpc->eg_cache; + while (entry != NULL) { + if (entry->shortcut == vcc) { + atomic_inc(&entry->use); + read_unlock_irqrestore(&mpc->egress_lock, flags); + return entry; + } + entry = entry->next; + } + read_unlock_irqrestore(&mpc->egress_lock, flags); + + return NULL; +} + +static eg_cache_entry *eg_cache_get_by_src_ip(__be32 ipaddr, + struct mpoa_client *mpc) +{ + eg_cache_entry *entry; + + read_lock_irq(&mpc->egress_lock); + entry = mpc->eg_cache; + while (entry != NULL) { + if (entry->latest_ip_addr == ipaddr) { + atomic_inc(&entry->use); + read_unlock_irq(&mpc->egress_lock); + return entry; + } + entry = entry->next; + } + read_unlock_irq(&mpc->egress_lock); + + return NULL; +} + +static void eg_cache_put(eg_cache_entry *entry) +{ + if (atomic_dec_and_test(&entry->use)) { + memset(entry, 0, sizeof(eg_cache_entry)); + kfree(entry); + } +} + +/* + * This should be called with write lock on + */ +static void eg_cache_remove_entry(eg_cache_entry *entry, + struct mpoa_client *client) +{ + struct atm_vcc *vcc; + struct k_message msg; + + vcc = entry->shortcut; + dprintk("removing an egress entry.\n"); + if (entry->prev != NULL) + entry->prev->next = entry->next; + else + client->eg_cache = entry->next; + if (entry->next != NULL) + entry->next->prev = entry->prev; + client->eg_ops->put(entry); + if (client->in_cache == NULL && client->eg_cache == NULL) { + msg.type = STOP_KEEP_ALIVE_SM; + msg_to_mpoad(&msg, client); + } + + /* Check 
if the ingress side still uses this VCC */ + if (vcc != NULL) { + in_cache_entry *in_entry = client->in_ops->get_by_vcc(vcc, client); + if (in_entry != NULL) { + client->in_ops->put(in_entry); + return; + } + vcc_release_async(vcc, -EPIPE); + } +} + +static eg_cache_entry *eg_cache_add_entry(struct k_message *msg, + struct mpoa_client *client) +{ + eg_cache_entry *entry = kzalloc(sizeof(eg_cache_entry), GFP_KERNEL); + + if (entry == NULL) { + pr_info("out of memory\n"); + return NULL; + } + + dprintk("adding an egress entry, ip = %pI4, this should be our IP\n", + &msg->content.eg_info.eg_dst_ip); + + atomic_set(&entry->use, 1); + dprintk("new_eg_cache_entry: about to lock\n"); + write_lock_irq(&client->egress_lock); + entry->next = client->eg_cache; + entry->prev = NULL; + if (client->eg_cache != NULL) + client->eg_cache->prev = entry; + client->eg_cache = entry; + + memcpy(entry->MPS_ctrl_ATM_addr, client->mps_ctrl_addr, ATM_ESA_LEN); + entry->ctrl_info = msg->content.eg_info; + do_gettimeofday(&(entry->tv)); + entry->entry_state = EGRESS_RESOLVED; + dprintk("new_eg_cache_entry cache_id %u\n", + ntohl(entry->ctrl_info.cache_id)); + dprintk("mps_ip = %pI4\n", &entry->ctrl_info.mps_ip); + atomic_inc(&entry->use); + + write_unlock_irq(&client->egress_lock); + dprintk("new_eg_cache_entry: unlocked\n"); + + return entry; +} + +static void update_eg_cache_entry(eg_cache_entry *entry, uint16_t holding_time) +{ + do_gettimeofday(&(entry->tv)); + entry->entry_state = EGRESS_RESOLVED; + entry->ctrl_info.holding_time = holding_time; +} + +static void clear_expired(struct mpoa_client *client) +{ + eg_cache_entry *entry, *next_entry; + struct timeval now; + struct k_message msg; + + do_gettimeofday(&now); + + write_lock_irq(&client->egress_lock); + entry = client->eg_cache; + while (entry != NULL) { + next_entry = entry->next; + if ((now.tv_sec - entry->tv.tv_sec) + > entry->ctrl_info.holding_time) { + msg.type = SND_EGRESS_PURGE; + msg.content.eg_info = entry->ctrl_info; + dprintk("egress_cache: holding time expired, cache_id = %u.\n", + ntohl(entry->ctrl_info.cache_id)); + msg_to_mpoad(&msg, client); + client->eg_ops->remove_entry(entry, client); + } + entry = next_entry; + } + write_unlock_irq(&client->egress_lock); +} + +static void eg_destroy_cache(struct mpoa_client *mpc) +{ + write_lock_irq(&mpc->egress_lock); + while (mpc->eg_cache != NULL) + mpc->eg_ops->remove_entry(mpc->eg_cache, mpc); + write_unlock_irq(&mpc->egress_lock); +} + + +static struct in_cache_ops ingress_ops = { + in_cache_add_entry, /* add_entry */ + in_cache_get, /* get */ + in_cache_get_with_mask, /* get_with_mask */ + in_cache_get_by_vcc, /* get_by_vcc */ + in_cache_put, /* put */ + in_cache_remove_entry, /* remove_entry */ + cache_hit, /* cache_hit */ + clear_count_and_expired, /* clear_count */ + check_resolving_entries, /* check_resolving */ + refresh_entries, /* refresh */ + in_destroy_cache /* destroy_cache */ +}; + +static struct eg_cache_ops egress_ops = { + eg_cache_add_entry, /* add_entry */ + eg_cache_get_by_cache_id, /* get_by_cache_id */ + eg_cache_get_by_tag, /* get_by_tag */ + eg_cache_get_by_vcc, /* get_by_vcc */ + eg_cache_get_by_src_ip, /* get_by_src_ip */ + eg_cache_put, /* put */ + eg_cache_remove_entry, /* remove_entry */ + update_eg_cache_entry, /* update */ + clear_expired, /* clear_expired */ + eg_destroy_cache /* destroy_cache */ +}; + + +void atm_mpoa_init_cache(struct mpoa_client *mpc) +{ + mpc->in_ops = &ingress_ops; + mpc->eg_ops = &egress_ops; +} diff --git a/net/atm/mpoa_caches.h 
b/net/atm/mpoa_caches.h new file mode 100644 index 00000000..8e5f78cf --- /dev/null +++ b/net/atm/mpoa_caches.h @@ -0,0 +1,96 @@ +#ifndef MPOA_CACHES_H +#define MPOA_CACHES_H + +#include +#include +#include +#include +#include + +struct mpoa_client; + +void atm_mpoa_init_cache(struct mpoa_client *mpc); + +typedef struct in_cache_entry { + struct in_cache_entry *next; + struct in_cache_entry *prev; + struct timeval tv; + struct timeval reply_wait; + struct timeval hold_down; + uint32_t packets_fwded; + uint16_t entry_state; + uint32_t retry_time; + uint32_t refresh_time; + uint32_t count; + struct atm_vcc *shortcut; + uint8_t MPS_ctrl_ATM_addr[ATM_ESA_LEN]; + struct in_ctrl_info ctrl_info; + atomic_t use; +} in_cache_entry; + +struct in_cache_ops{ + in_cache_entry *(*add_entry)(__be32 dst_ip, + struct mpoa_client *client); + in_cache_entry *(*get)(__be32 dst_ip, struct mpoa_client *client); + in_cache_entry *(*get_with_mask)(__be32 dst_ip, + struct mpoa_client *client, + __be32 mask); + in_cache_entry *(*get_by_vcc)(struct atm_vcc *vcc, + struct mpoa_client *client); + void (*put)(in_cache_entry *entry); + void (*remove_entry)(in_cache_entry *delEntry, + struct mpoa_client *client ); + int (*cache_hit)(in_cache_entry *entry, + struct mpoa_client *client); + void (*clear_count)(struct mpoa_client *client); + void (*check_resolving)(struct mpoa_client *client); + void (*refresh)(struct mpoa_client *client); + void (*destroy_cache)(struct mpoa_client *mpc); +}; + +typedef struct eg_cache_entry{ + struct eg_cache_entry *next; + struct eg_cache_entry *prev; + struct timeval tv; + uint8_t MPS_ctrl_ATM_addr[ATM_ESA_LEN]; + struct atm_vcc *shortcut; + uint32_t packets_rcvd; + uint16_t entry_state; + __be32 latest_ip_addr; /* The src IP address of the last packet */ + struct eg_ctrl_info ctrl_info; + atomic_t use; +} eg_cache_entry; + +struct eg_cache_ops{ + eg_cache_entry *(*add_entry)(struct k_message *msg, struct mpoa_client *client); + eg_cache_entry *(*get_by_cache_id)(__be32 cache_id, struct mpoa_client *client); + eg_cache_entry *(*get_by_tag)(__be32 cache_id, struct mpoa_client *client); + eg_cache_entry *(*get_by_vcc)(struct atm_vcc *vcc, struct mpoa_client *client); + eg_cache_entry *(*get_by_src_ip)(__be32 ipaddr, struct mpoa_client *client); + void (*put)(eg_cache_entry *entry); + void (*remove_entry)(eg_cache_entry *entry, struct mpoa_client *client); + void (*update)(eg_cache_entry *entry, uint16_t holding_time); + void (*clear_expired)(struct mpoa_client *client); + void (*destroy_cache)(struct mpoa_client *mpc); +}; + + +/* Ingress cache entry states */ + +#define INGRESS_REFRESHING 3 +#define INGRESS_RESOLVED 2 +#define INGRESS_RESOLVING 1 +#define INGRESS_INVALID 0 + +/* VCC states */ + +#define OPEN 1 +#define CLOSED 0 + +/* Egress cache entry states */ + +#define EGRESS_RESOLVED 2 +#define EGRESS_PURGE 1 +#define EGRESS_INVALID 0 + +#endif diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c new file mode 100644 index 00000000..53e50029 --- /dev/null +++ b/net/atm/mpoa_proc.c @@ -0,0 +1,312 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ + +#ifdef CONFIG_PROC_FS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mpc.h" +#include "mpoa_caches.h" + +/* + * mpoa_proc.c: Implementation MPOA client's proc + * file system statistics + */ + +#if 1 +#define dprintk(format, args...) \ + printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args) /* debug */ +#else +#define dprintk(format, args...) 
\ + do { if (0) \ + printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args);\ + } while (0) +#endif + +#if 0 +#define ddprintk(format, args...) \ + printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args) /* debug */ +#else +#define ddprintk(format, args...) \ + do { if (0) \ + printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args);\ + } while (0) +#endif + +#define STAT_FILE_NAME "mpc" /* Our statistic file's name */ + +extern struct mpoa_client *mpcs; +extern struct proc_dir_entry *atm_proc_root; /* from proc.c. */ + +static int proc_mpc_open(struct inode *inode, struct file *file); +static ssize_t proc_mpc_write(struct file *file, const char __user *buff, + size_t nbytes, loff_t *ppos); + +static int parse_qos(const char *buff); + +/* + * Define allowed FILE OPERATIONS + */ +static const struct file_operations mpc_file_operations = { + .owner = THIS_MODULE, + .open = proc_mpc_open, + .read = seq_read, + .llseek = seq_lseek, + .write = proc_mpc_write, + .release = seq_release, +}; + +/* + * Returns the state of an ingress cache entry as a string + */ +static const char *ingress_state_string(int state) +{ + switch (state) { + case INGRESS_RESOLVING: + return "resolving "; + case INGRESS_RESOLVED: + return "resolved "; + case INGRESS_INVALID: + return "invalid "; + case INGRESS_REFRESHING: + return "refreshing "; + } + + return ""; +} + +/* + * Returns the state of an egress cache entry as a string + */ +static const char *egress_state_string(int state) +{ + switch (state) { + case EGRESS_RESOLVED: + return "resolved "; + case EGRESS_PURGE: + return "purge "; + case EGRESS_INVALID: + return "invalid "; + } + + return ""; +} + +/* + * FIXME: mpcs (and per-mpc lists) have no locking whatsoever. + */ + +static void *mpc_start(struct seq_file *m, loff_t *pos) +{ + loff_t l = *pos; + struct mpoa_client *mpc; + + if (!l--) + return SEQ_START_TOKEN; + for (mpc = mpcs; mpc; mpc = mpc->next) + if (!l--) + return mpc; + return NULL; +} + +static void *mpc_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct mpoa_client *p = v; + (*pos)++; + return v == SEQ_START_TOKEN ? mpcs : p->next; +} + +static void mpc_stop(struct seq_file *m, void *v) +{ +} + +/* + * READING function - called when the /proc/atm/mpoa file is read from. 
+ */ +static int mpc_show(struct seq_file *m, void *v) +{ + struct mpoa_client *mpc = v; + int i; + in_cache_entry *in_entry; + eg_cache_entry *eg_entry; + struct timeval now; + unsigned char ip_string[16]; + + if (v == SEQ_START_TOKEN) { + atm_mpoa_disp_qos(m); + return 0; + } + + seq_printf(m, "\nInterface %d:\n\n", mpc->dev_num); + seq_printf(m, "Ingress Entries:\nIP address State Holding time Packets fwded VPI VCI\n"); + do_gettimeofday(&now); + + for (in_entry = mpc->in_cache; in_entry; in_entry = in_entry->next) { + sprintf(ip_string, "%pI4", &in_entry->ctrl_info.in_dst_ip); + seq_printf(m, "%-16s%s%-14lu%-12u", + ip_string, + ingress_state_string(in_entry->entry_state), + in_entry->ctrl_info.holding_time - + (now.tv_sec-in_entry->tv.tv_sec), + in_entry->packets_fwded); + if (in_entry->shortcut) + seq_printf(m, " %-3d %-3d", + in_entry->shortcut->vpi, + in_entry->shortcut->vci); + seq_printf(m, "\n"); + } + + seq_printf(m, "\n"); + seq_printf(m, "Egress Entries:\nIngress MPC ATM addr\nCache-id State Holding time Packets recvd Latest IP addr VPI VCI\n"); + for (eg_entry = mpc->eg_cache; eg_entry; eg_entry = eg_entry->next) { + unsigned char *p = eg_entry->ctrl_info.in_MPC_data_ATM_addr; + for (i = 0; i < ATM_ESA_LEN; i++) + seq_printf(m, "%02x", p[i]); + seq_printf(m, "\n%-16lu%s%-14lu%-15u", + (unsigned long)ntohl(eg_entry->ctrl_info.cache_id), + egress_state_string(eg_entry->entry_state), + (eg_entry->ctrl_info.holding_time - + (now.tv_sec-eg_entry->tv.tv_sec)), + eg_entry->packets_rcvd); + + /* latest IP address */ + sprintf(ip_string, "%pI4", &eg_entry->latest_ip_addr); + seq_printf(m, "%-16s", ip_string); + + if (eg_entry->shortcut) + seq_printf(m, " %-3d %-3d", + eg_entry->shortcut->vpi, + eg_entry->shortcut->vci); + seq_printf(m, "\n"); + } + seq_printf(m, "\n"); + return 0; +} + +static const struct seq_operations mpc_op = { + .start = mpc_start, + .next = mpc_next, + .stop = mpc_stop, + .show = mpc_show +}; + +static int proc_mpc_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &mpc_op); +} + +static ssize_t proc_mpc_write(struct file *file, const char __user *buff, + size_t nbytes, loff_t *ppos) +{ + char *page, *p; + unsigned len; + + if (nbytes == 0) + return 0; + + if (nbytes >= PAGE_SIZE) + nbytes = PAGE_SIZE-1; + + page = (char *)__get_free_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + for (p = page, len = 0; len < nbytes; p++, len++) { + if (get_user(*p, buff++)) { + free_page((unsigned long)page); + return -EFAULT; + } + if (*p == '\0' || *p == '\n') + break; + } + + *p = '\0'; + + if (!parse_qos(page)) + printk("mpoa: proc_mpc_write: could not parse '%s'\n", page); + + free_page((unsigned long)page); + + return len; +} + +static int parse_qos(const char *buff) +{ + /* possible lines look like this + * add 130.230.54.142 tx=max_pcr,max_sdu rx=max_pcr,max_sdu + */ + unsigned char ip[4]; + int tx_pcr, tx_sdu, rx_pcr, rx_sdu; + __be32 ipaddr; + struct atm_qos qos; + + memset(&qos, 0, sizeof(struct atm_qos)); + + if (sscanf(buff, "del %hhu.%hhu.%hhu.%hhu", + ip, ip+1, ip+2, ip+3) == 4) { + ipaddr = *(__be32 *)ip; + return atm_mpoa_delete_qos(atm_mpoa_search_qos(ipaddr)); + } + + if (sscanf(buff, "add %hhu.%hhu.%hhu.%hhu tx=%d,%d rx=tx", + ip, ip+1, ip+2, ip+3, &tx_pcr, &tx_sdu) == 6) { + rx_pcr = tx_pcr; + rx_sdu = tx_sdu; + } else if (sscanf(buff, "add %hhu.%hhu.%hhu.%hhu tx=%d,%d rx=%d,%d", + ip, ip+1, ip+2, ip+3, &tx_pcr, &tx_sdu, &rx_pcr, &rx_sdu) != 8) + return 0; + + ipaddr = *(__be32 *)ip; + qos.txtp.traffic_class = ATM_CBR; + 
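+	/* apply the parsed PCR/SDU values to both the tx and rx descriptors (CBR over AAL5) */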
qos.txtp.max_pcr = tx_pcr; + qos.txtp.max_sdu = tx_sdu; + qos.rxtp.traffic_class = ATM_CBR; + qos.rxtp.max_pcr = rx_pcr; + qos.rxtp.max_sdu = rx_sdu; + qos.aal = ATM_AAL5; + dprintk("parse_qos(): setting qos paramameters to tx=%d,%d rx=%d,%d\n", + qos.txtp.max_pcr, qos.txtp.max_sdu, + qos.rxtp.max_pcr, qos.rxtp.max_sdu); + + atm_mpoa_add_qos(ipaddr, &qos); + return 1; +} + +/* + * INITIALIZATION function - called when module is initialized/loaded. + */ +int mpc_proc_init(void) +{ + struct proc_dir_entry *p; + + p = proc_create(STAT_FILE_NAME, 0, atm_proc_root, &mpc_file_operations); + if (!p) { + pr_err("Unable to initialize /proc/atm/%s\n", STAT_FILE_NAME); + return -ENOMEM; + } + return 0; +} + +/* + * DELETING function - called when module is removed. + */ +void mpc_proc_clean(void) +{ + remove_proc_entry(STAT_FILE_NAME, atm_proc_root); +} + +#endif /* CONFIG_PROC_FS */ + + + + + + diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c new file mode 100644 index 00000000..e9aced0e --- /dev/null +++ b/net/atm/pppoatm.c @@ -0,0 +1,362 @@ +/* net/atm/pppoatm.c - RFC2364 PPP over ATM/AAL5 */ + +/* Copyright 1999-2000 by Mitchell Blank Jr */ +/* Based on clip.c; 1995-1999 by Werner Almesberger, EPFL LRC/ICA */ +/* And on ppp_async.c; Copyright 1999 Paul Mackerras */ +/* And help from Jens Axboe */ + +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * This driver provides the encapsulation and framing for sending + * and receiving PPP frames in ATM AAL5 PDUs. + */ + +/* + * One shortcoming of this driver is that it does not comply with + * section 8 of RFC2364 - we are supposed to detect a change + * in encapsulation and immediately abort the connection (in order + * to avoid a black-hole being created if our peer loses state + * and changes encapsulation unilaterally. However, since the + * ppp_generic layer actually does the decapsulation, we need + * a way of notifying it when we _think_ there might be a problem) + * There's two cases: + * 1. LLC-encapsulation was missing when it was enabled. In + * this case, we should tell the upper layer "tear down + * this session if this skb looks ok to you" + * 2. LLC-encapsulation was present when it was disabled. 
Then + * we need to tell the upper layer "this packet may be + * ok, but if its in error tear down the session" + * These hooks are not yet available in ppp_generic + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +enum pppoatm_encaps { + e_autodetect = PPPOATM_ENCAPS_AUTODETECT, + e_vc = PPPOATM_ENCAPS_VC, + e_llc = PPPOATM_ENCAPS_LLC, +}; + +struct pppoatm_vcc { + struct atm_vcc *atmvcc; /* VCC descriptor */ + void (*old_push)(struct atm_vcc *, struct sk_buff *); + void (*old_pop)(struct atm_vcc *, struct sk_buff *); + /* keep old push/pop for detaching */ + enum pppoatm_encaps encaps; + int flags; /* SC_COMP_PROT - compress protocol */ + struct ppp_channel chan; /* interface to generic ppp layer */ + struct tasklet_struct wakeup_tasklet; +}; + +/* + * Header used for LLC Encapsulated PPP (4 bytes) followed by the LCP protocol + * ID (0xC021) used in autodetection + */ +static const unsigned char pppllc[6] = { 0xFE, 0xFE, 0x03, 0xCF, 0xC0, 0x21 }; +#define LLC_LEN (4) + +static inline struct pppoatm_vcc *atmvcc_to_pvcc(const struct atm_vcc *atmvcc) +{ + return (struct pppoatm_vcc *) (atmvcc->user_back); +} + +static inline struct pppoatm_vcc *chan_to_pvcc(const struct ppp_channel *chan) +{ + return (struct pppoatm_vcc *) (chan->private); +} + +/* + * We can't do this directly from our _pop handler, since the ppp code + * doesn't want to be called in interrupt context, so we do it from + * a tasklet + */ +static void pppoatm_wakeup_sender(unsigned long arg) +{ + ppp_output_wakeup((struct ppp_channel *) arg); +} + +/* + * This gets called every time the ATM card has finished sending our + * skb. The ->old_pop will take care up normal atm flow control, + * but we also need to wake up the device if we blocked it + */ +static void pppoatm_pop(struct atm_vcc *atmvcc, struct sk_buff *skb) +{ + struct pppoatm_vcc *pvcc = atmvcc_to_pvcc(atmvcc); + pvcc->old_pop(atmvcc, skb); + /* + * We don't really always want to do this since it's + * really inefficient - it would be much better if we could + * test if we had actually throttled the generic layer. + * Unfortunately then there would be a nasty SMP race where + * we could clear that flag just as we refuse another packet. + * For now we do the safe thing. + */ + tasklet_schedule(&pvcc->wakeup_tasklet); +} + +/* + * Unbind from PPP - currently we only do this when closing the socket, + * but we could put this into an ioctl if need be + */ +static void pppoatm_unassign_vcc(struct atm_vcc *atmvcc) +{ + struct pppoatm_vcc *pvcc; + pvcc = atmvcc_to_pvcc(atmvcc); + atmvcc->push = pvcc->old_push; + atmvcc->pop = pvcc->old_pop; + tasklet_kill(&pvcc->wakeup_tasklet); + ppp_unregister_channel(&pvcc->chan); + atmvcc->user_back = NULL; + kfree(pvcc); + /* Gee, I hope we have the big kernel lock here... 
*/ + module_put(THIS_MODULE); +} + +/* Called when an AAL5 PDU comes in */ +static void pppoatm_push(struct atm_vcc *atmvcc, struct sk_buff *skb) +{ + struct pppoatm_vcc *pvcc = atmvcc_to_pvcc(atmvcc); + pr_debug("\n"); + if (skb == NULL) { /* VCC was closed */ + pr_debug("removing ATMPPP VCC %p\n", pvcc); + pppoatm_unassign_vcc(atmvcc); + atmvcc->push(atmvcc, NULL); /* Pass along bad news */ + return; + } + atm_return(atmvcc, skb->truesize); + switch (pvcc->encaps) { + case e_llc: + if (skb->len < LLC_LEN || + memcmp(skb->data, pppllc, LLC_LEN)) + goto error; + skb_pull(skb, LLC_LEN); + break; + case e_autodetect: + if (pvcc->chan.ppp == NULL) { /* Not bound yet! */ + kfree_skb(skb); + return; + } + if (skb->len >= sizeof(pppllc) && + !memcmp(skb->data, pppllc, sizeof(pppllc))) { + pvcc->encaps = e_llc; + skb_pull(skb, LLC_LEN); + break; + } + if (skb->len >= (sizeof(pppllc) - LLC_LEN) && + !memcmp(skb->data, &pppllc[LLC_LEN], + sizeof(pppllc) - LLC_LEN)) { + pvcc->encaps = e_vc; + pvcc->chan.mtu += LLC_LEN; + break; + } + pr_debug("Couldn't autodetect yet (skb: %02X %02X %02X %02X %02X %02X)\n", + skb->data[0], skb->data[1], skb->data[2], + skb->data[3], skb->data[4], skb->data[5]); + goto error; + case e_vc: + break; + } + ppp_input(&pvcc->chan, skb); + return; + +error: + kfree_skb(skb); + ppp_input_error(&pvcc->chan, 0); +} + +/* + * Called by the ppp_generic.c to send a packet - returns true if packet + * was accepted. If we return false, then it's our job to call + * ppp_output_wakeup(chan) when we're feeling more up to it. + * Note that in the ENOMEM case (as opposed to the !atm_may_send case) + * we should really drop the packet, but the generic layer doesn't + * support this yet. We just return 'DROP_PACKET' which we actually define + * as success, just to be clear what we're really doing. + */ +#define DROP_PACKET 1 +static int pppoatm_send(struct ppp_channel *chan, struct sk_buff *skb) +{ + struct pppoatm_vcc *pvcc = chan_to_pvcc(chan); + ATM_SKB(skb)->vcc = pvcc->atmvcc; + pr_debug("(skb=0x%p, vcc=0x%p)\n", skb, pvcc->atmvcc); + if (skb->data[0] == '\0' && (pvcc->flags & SC_COMP_PROT)) + (void) skb_pull(skb, 1); + switch (pvcc->encaps) { /* LLC encapsulation needed */ + case e_llc: + if (skb_headroom(skb) < LLC_LEN) { + struct sk_buff *n; + n = skb_realloc_headroom(skb, LLC_LEN); + if (n != NULL && + !atm_may_send(pvcc->atmvcc, n->truesize)) { + kfree_skb(n); + goto nospace; + } + kfree_skb(skb); + skb = n; + if (skb == NULL) + return DROP_PACKET; + } else if (!atm_may_send(pvcc->atmvcc, skb->truesize)) + goto nospace; + memcpy(skb_push(skb, LLC_LEN), pppllc, LLC_LEN); + break; + case e_vc: + if (!atm_may_send(pvcc->atmvcc, skb->truesize)) + goto nospace; + break; + case e_autodetect: + pr_debug("Trying to send without setting encaps!\n"); + kfree_skb(skb); + return 1; + } + + atomic_add(skb->truesize, &sk_atm(ATM_SKB(skb)->vcc)->sk_wmem_alloc); + ATM_SKB(skb)->atm_options = ATM_SKB(skb)->vcc->atm_options; + pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", + skb, ATM_SKB(skb)->vcc, ATM_SKB(skb)->vcc->dev); + return ATM_SKB(skb)->vcc->send(ATM_SKB(skb)->vcc, skb) + ? 
DROP_PACKET : 1; +nospace: + /* + * We don't have space to send this SKB now, but we might have + * already applied SC_COMP_PROT compression, so may need to undo + */ + if ((pvcc->flags & SC_COMP_PROT) && skb_headroom(skb) > 0 && + skb->data[-1] == '\0') + (void) skb_push(skb, 1); + return 0; +} + +/* This handles ioctls sent to the /dev/ppp interface */ +static int pppoatm_devppp_ioctl(struct ppp_channel *chan, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case PPPIOCGFLAGS: + return put_user(chan_to_pvcc(chan)->flags, (int __user *) arg) + ? -EFAULT : 0; + case PPPIOCSFLAGS: + return get_user(chan_to_pvcc(chan)->flags, (int __user *) arg) + ? -EFAULT : 0; + } + return -ENOTTY; +} + +static const struct ppp_channel_ops pppoatm_ops = { + .start_xmit = pppoatm_send, + .ioctl = pppoatm_devppp_ioctl, +}; + +static int pppoatm_assign_vcc(struct atm_vcc *atmvcc, void __user *arg) +{ + struct atm_backend_ppp be; + struct pppoatm_vcc *pvcc; + int err; + /* + * Each PPPoATM instance has its own tasklet - this is just a + * prototypical one used to initialize them + */ + static const DECLARE_TASKLET(tasklet_proto, pppoatm_wakeup_sender, 0); + if (copy_from_user(&be, arg, sizeof be)) + return -EFAULT; + if (be.encaps != PPPOATM_ENCAPS_AUTODETECT && + be.encaps != PPPOATM_ENCAPS_VC && be.encaps != PPPOATM_ENCAPS_LLC) + return -EINVAL; + pvcc = kzalloc(sizeof(*pvcc), GFP_KERNEL); + if (pvcc == NULL) + return -ENOMEM; + pvcc->atmvcc = atmvcc; + pvcc->old_push = atmvcc->push; + pvcc->old_pop = atmvcc->pop; + pvcc->encaps = (enum pppoatm_encaps) be.encaps; + pvcc->chan.private = pvcc; + pvcc->chan.ops = &pppoatm_ops; + pvcc->chan.mtu = atmvcc->qos.txtp.max_sdu - PPP_HDRLEN - + (be.encaps == e_vc ? 0 : LLC_LEN); + pvcc->wakeup_tasklet = tasklet_proto; + pvcc->wakeup_tasklet.data = (unsigned long) &pvcc->chan; + err = ppp_register_channel(&pvcc->chan); + if (err != 0) { + kfree(pvcc); + return err; + } + atmvcc->user_back = pvcc; + atmvcc->push = pppoatm_push; + atmvcc->pop = pppoatm_pop; + __module_get(THIS_MODULE); + return 0; +} + +/* + * This handles ioctls actually performed on our vcc - we must return + * -ENOIOCTLCMD for any unrecognized ioctl + */ +static int pppoatm_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + struct atm_vcc *atmvcc = ATM_SD(sock); + void __user *argp = (void __user *)arg; + + if (cmd != ATM_SETBACKEND && atmvcc->push != pppoatm_push) + return -ENOIOCTLCMD; + switch (cmd) { + case ATM_SETBACKEND: { + atm_backend_t b; + if (get_user(b, (atm_backend_t __user *) argp)) + return -EFAULT; + if (b != ATM_BACKEND_PPP) + return -ENOIOCTLCMD; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + return pppoatm_assign_vcc(atmvcc, argp); + } + case PPPIOCGCHAN: + return put_user(ppp_channel_index(&atmvcc_to_pvcc(atmvcc)-> + chan), (int __user *) argp) ? -EFAULT : 0; + case PPPIOCGUNIT: + return put_user(ppp_unit_number(&atmvcc_to_pvcc(atmvcc)-> + chan), (int __user *) argp) ? 
-EFAULT : 0; + } + return -ENOIOCTLCMD; +} + +static struct atm_ioctl pppoatm_ioctl_ops = { + .owner = THIS_MODULE, + .ioctl = pppoatm_ioctl, +}; + +static int __init pppoatm_init(void) +{ + register_atm_ioctl(&pppoatm_ioctl_ops); + return 0; +} + +static void __exit pppoatm_exit(void) +{ + deregister_atm_ioctl(&pppoatm_ioctl_ops); +} + +module_init(pppoatm_init); +module_exit(pppoatm_exit); + +MODULE_AUTHOR("Mitchell Blank Jr "); +MODULE_DESCRIPTION("RFC2364 PPP over ATM/AAL5"); +MODULE_LICENSE("GPL"); diff --git a/net/atm/proc.c b/net/atm/proc.c new file mode 100644 index 00000000..be3afdef --- /dev/null +++ b/net/atm/proc.c @@ -0,0 +1,497 @@ +/* net/atm/proc.c - ATM /proc interface + * + * Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA + * + * seq_file api usage by romieu@fr.zoreil.com + * + * Evaluating the efficiency of the whole thing if left as an exercise to + * the reader. + */ + +#include /* for EXPORT_SYMBOL */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for __init */ +#include +#include +#include +#include +#include /* for HZ */ +#include +#include "resources.h" +#include "common.h" /* atm_proc_init prototype */ +#include "signaling.h" /* to get sigd - ugly too */ + +static ssize_t proc_dev_atm_read(struct file *file, char __user *buf, + size_t count, loff_t *pos); + +static const struct file_operations proc_atm_dev_ops = { + .owner = THIS_MODULE, + .read = proc_dev_atm_read, + .llseek = noop_llseek, +}; + +static void add_stats(struct seq_file *seq, const char *aal, + const struct k_atm_aal_stats *stats) +{ + seq_printf(seq, "%s ( %d %d %d %d %d )", aal, + atomic_read(&stats->tx), atomic_read(&stats->tx_err), + atomic_read(&stats->rx), atomic_read(&stats->rx_err), + atomic_read(&stats->rx_drop)); +} + +static void atm_dev_info(struct seq_file *seq, const struct atm_dev *dev) +{ + int i; + + seq_printf(seq, "%3d %-8s", dev->number, dev->type); + for (i = 0; i < ESI_LEN; i++) + seq_printf(seq, "%02x", dev->esi[i]); + seq_puts(seq, " "); + add_stats(seq, "0", &dev->stats.aal0); + seq_puts(seq, " "); + add_stats(seq, "5", &dev->stats.aal5); + seq_printf(seq, "\t[%d]", atomic_read(&dev->refcnt)); + seq_putc(seq, '\n'); +} + +struct vcc_state { + int bucket; + struct sock *sk; + int family; +}; + +static inline int compare_family(struct sock *sk, int family) +{ + return !family || (sk->sk_family == family); +} + +static int __vcc_walk(struct sock **sock, int family, int *bucket, loff_t l) +{ + struct sock *sk = *sock; + + if (sk == SEQ_START_TOKEN) { + for (*bucket = 0; *bucket < VCC_HTABLE_SIZE; ++*bucket) { + struct hlist_head *head = &vcc_hash[*bucket]; + + sk = hlist_empty(head) ? NULL : __sk_head(head); + if (sk) + break; + } + l--; + } +try_again: + for (; sk; sk = sk_next(sk)) { + l -= compare_family(sk, family); + if (l < 0) + goto out; + } + if (!sk && ++*bucket < VCC_HTABLE_SIZE) { + sk = sk_head(&vcc_hash[*bucket]); + goto try_again; + } + sk = SEQ_START_TOKEN; +out: + *sock = sk; + return (l < 0); +} + +static inline void *vcc_walk(struct vcc_state *state, loff_t l) +{ + return __vcc_walk(&state->sk, state->family, &state->bucket, l) ? 
+ state : NULL; +} + +static int __vcc_seq_open(struct inode *inode, struct file *file, + int family, const struct seq_operations *ops) +{ + struct vcc_state *state; + + state = __seq_open_private(file, ops, sizeof(*state)); + if (state == NULL) + return -ENOMEM; + + state->family = family; + return 0; +} + +static void *vcc_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(vcc_sklist_lock) +{ + struct vcc_state *state = seq->private; + loff_t left = *pos; + + read_lock(&vcc_sklist_lock); + state->sk = SEQ_START_TOKEN; + return left ? vcc_walk(state, left) : SEQ_START_TOKEN; +} + +static void vcc_seq_stop(struct seq_file *seq, void *v) + __releases(vcc_sklist_lock) +{ + read_unlock(&vcc_sklist_lock); +} + +static void *vcc_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct vcc_state *state = seq->private; + + v = vcc_walk(state, 1); + *pos += !!PTR_ERR(v); + return v; +} + +static void pvc_info(struct seq_file *seq, struct atm_vcc *vcc) +{ + static const char *const class_name[] = { + "off", "UBR", "CBR", "VBR", "ABR"}; + static const char *const aal_name[] = { + "---", "1", "2", "3/4", /* 0- 3 */ + "???", "5", "???", "???", /* 4- 7 */ + "???", "???", "???", "???", /* 8-11 */ + "???", "0", "???", "???"}; /* 12-15 */ + + seq_printf(seq, "%3d %3d %5d %-3s %7d %-5s %7d %-6s", + vcc->dev->number, vcc->vpi, vcc->vci, + vcc->qos.aal >= ARRAY_SIZE(aal_name) ? "err" : + aal_name[vcc->qos.aal], vcc->qos.rxtp.min_pcr, + class_name[vcc->qos.rxtp.traffic_class], + vcc->qos.txtp.min_pcr, + class_name[vcc->qos.txtp.traffic_class]); + if (test_bit(ATM_VF_IS_CLIP, &vcc->flags)) { + struct clip_vcc *clip_vcc = CLIP_VCC(vcc); + struct net_device *dev; + + dev = clip_vcc->entry ? clip_vcc->entry->neigh->dev : NULL; + seq_printf(seq, "CLIP, Itf:%s, Encap:", + dev ? dev->name : "none?"); + seq_printf(seq, "%s", clip_vcc->encap ? "LLC/SNAP" : "None"); + } + seq_putc(seq, '\n'); +} + +static const char *vcc_state(struct atm_vcc *vcc) +{ + static const char *const map[] = { ATM_VS2TXT_MAP }; + + return map[ATM_VF2VS(vcc->flags)]; +} + +static void vcc_info(struct seq_file *seq, struct atm_vcc *vcc) +{ + struct sock *sk = sk_atm(vcc); + + seq_printf(seq, "%pK ", vcc); + if (!vcc->dev) + seq_printf(seq, "Unassigned "); + else + seq_printf(seq, "%3d %3d %5d ", vcc->dev->number, vcc->vpi, + vcc->vci); + switch (sk->sk_family) { + case AF_ATMPVC: + seq_printf(seq, "PVC"); + break; + case AF_ATMSVC: + seq_printf(seq, "SVC"); + break; + default: + seq_printf(seq, "%3d", sk->sk_family); + } + seq_printf(seq, " %04lx %5d %7d/%7d %7d/%7d [%d]\n", + vcc->flags, sk->sk_err, + sk_wmem_alloc_get(sk), sk->sk_sndbuf, + sk_rmem_alloc_get(sk), sk->sk_rcvbuf, + atomic_read(&sk->sk_refcnt)); +} + +static void svc_info(struct seq_file *seq, struct atm_vcc *vcc) +{ + if (!vcc->dev) + seq_printf(seq, sizeof(void *) == 4 ? + "N/A@%pK%10s" : "N/A@%pK%2s", vcc, ""); + else + seq_printf(seq, "%3d %3d %5d ", + vcc->dev->number, vcc->vpi, vcc->vci); + seq_printf(seq, "%-10s ", vcc_state(vcc)); + seq_printf(seq, "%s%s", vcc->remote.sas_addr.pub, + *vcc->remote.sas_addr.pub && *vcc->remote.sas_addr.prv ? "+" : ""); + if (*vcc->remote.sas_addr.prv) { + int i; + + for (i = 0; i < ATM_ESA_LEN; i++) + seq_printf(seq, "%02x", vcc->remote.sas_addr.prv[i]); + } + seq_putc(seq, '\n'); +} + +static int atm_dev_seq_show(struct seq_file *seq, void *v) +{ + static char atm_dev_banner[] = + "Itf Type ESI/\"MAC\"addr " + "AAL(TX,err,RX,err,drop) ... 
[refcnt]\n"; + + if (v == &atm_devs) + seq_puts(seq, atm_dev_banner); + else { + struct atm_dev *dev = list_entry(v, struct atm_dev, dev_list); + + atm_dev_info(seq, dev); + } + return 0; +} + +static const struct seq_operations atm_dev_seq_ops = { + .start = atm_dev_seq_start, + .next = atm_dev_seq_next, + .stop = atm_dev_seq_stop, + .show = atm_dev_seq_show, +}; + +static int atm_dev_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &atm_dev_seq_ops); +} + +static const struct file_operations devices_seq_fops = { + .open = atm_dev_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int pvc_seq_show(struct seq_file *seq, void *v) +{ + static char atm_pvc_banner[] = + "Itf VPI VCI AAL RX(PCR,Class) TX(PCR,Class)\n"; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, atm_pvc_banner); + else { + struct vcc_state *state = seq->private; + struct atm_vcc *vcc = atm_sk(state->sk); + + pvc_info(seq, vcc); + } + return 0; +} + +static const struct seq_operations pvc_seq_ops = { + .start = vcc_seq_start, + .next = vcc_seq_next, + .stop = vcc_seq_stop, + .show = pvc_seq_show, +}; + +static int pvc_seq_open(struct inode *inode, struct file *file) +{ + return __vcc_seq_open(inode, file, PF_ATMPVC, &pvc_seq_ops); +} + +static const struct file_operations pvc_seq_fops = { + .open = pvc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static int vcc_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) { + seq_printf(seq, sizeof(void *) == 4 ? "%-8s%s" : "%-16s%s", + "Address ", "Itf VPI VCI Fam Flags Reply " + "Send buffer Recv buffer [refcnt]\n"); + } else { + struct vcc_state *state = seq->private; + struct atm_vcc *vcc = atm_sk(state->sk); + + vcc_info(seq, vcc); + } + return 0; +} + +static const struct seq_operations vcc_seq_ops = { + .start = vcc_seq_start, + .next = vcc_seq_next, + .stop = vcc_seq_stop, + .show = vcc_seq_show, +}; + +static int vcc_seq_open(struct inode *inode, struct file *file) +{ + return __vcc_seq_open(inode, file, 0, &vcc_seq_ops); +} + +static const struct file_operations vcc_seq_fops = { + .open = vcc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static int svc_seq_show(struct seq_file *seq, void *v) +{ + static const char atm_svc_banner[] = + "Itf VPI VCI State Remote\n"; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, atm_svc_banner); + else { + struct vcc_state *state = seq->private; + struct atm_vcc *vcc = atm_sk(state->sk); + + svc_info(seq, vcc); + } + return 0; +} + +static const struct seq_operations svc_seq_ops = { + .start = vcc_seq_start, + .next = vcc_seq_next, + .stop = vcc_seq_stop, + .show = svc_seq_show, +}; + +static int svc_seq_open(struct inode *inode, struct file *file) +{ + return __vcc_seq_open(inode, file, PF_ATMSVC, &svc_seq_ops); +} + +static const struct file_operations svc_seq_fops = { + .open = svc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static ssize_t proc_dev_atm_read(struct file *file, char __user *buf, + size_t count, loff_t *pos) +{ + struct atm_dev *dev; + unsigned long page; + int length; + + if (count == 0) + return 0; + page = get_zeroed_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + dev = PDE(file->f_path.dentry->d_inode)->data; + if (!dev->ops->proc_read) + length = -EINVAL; + else { + length = dev->ops->proc_read(dev, pos, (char *)page); + if (length > count) + length = -EINVAL; + } + if (length >= 0) { + if 
(copy_to_user(buf, (char *)page, length)) + length = -EFAULT; + (*pos)++; + } + free_page(page); + return length; +} + +struct proc_dir_entry *atm_proc_root; +EXPORT_SYMBOL(atm_proc_root); + + +int atm_proc_dev_register(struct atm_dev *dev) +{ + int error; + + /* No proc info */ + if (!dev->ops->proc_read) + return 0; + + error = -ENOMEM; + dev->proc_name = kasprintf(GFP_KERNEL, "%s:%d", dev->type, dev->number); + if (!dev->proc_name) + goto err_out; + + dev->proc_entry = proc_create_data(dev->proc_name, 0, atm_proc_root, + &proc_atm_dev_ops, dev); + if (!dev->proc_entry) + goto err_free_name; + return 0; + +err_free_name: + kfree(dev->proc_name); +err_out: + return error; +} + +void atm_proc_dev_deregister(struct atm_dev *dev) +{ + if (!dev->ops->proc_read) + return; + + remove_proc_entry(dev->proc_name, atm_proc_root); + kfree(dev->proc_name); +} + +static struct atm_proc_entry { + char *name; + const struct file_operations *proc_fops; + struct proc_dir_entry *dirent; +} atm_proc_ents[] = { + { .name = "devices", .proc_fops = &devices_seq_fops }, + { .name = "pvc", .proc_fops = &pvc_seq_fops }, + { .name = "svc", .proc_fops = &svc_seq_fops }, + { .name = "vc", .proc_fops = &vcc_seq_fops }, + { .name = NULL, .proc_fops = NULL } +}; + +static void atm_proc_dirs_remove(void) +{ + static struct atm_proc_entry *e; + + for (e = atm_proc_ents; e->name; e++) { + if (e->dirent) + remove_proc_entry(e->name, atm_proc_root); + } + proc_net_remove(&init_net, "atm"); +} + +int __init atm_proc_init(void) +{ + static struct atm_proc_entry *e; + int ret; + + atm_proc_root = proc_net_mkdir(&init_net, "atm", init_net.proc_net); + if (!atm_proc_root) + goto err_out; + for (e = atm_proc_ents; e->name; e++) { + struct proc_dir_entry *dirent; + + dirent = proc_create(e->name, S_IRUGO, + atm_proc_root, e->proc_fops); + if (!dirent) + goto err_out_remove; + e->dirent = dirent; + } + ret = 0; +out: + return ret; + +err_out_remove: + atm_proc_dirs_remove(); +err_out: + ret = -ENOMEM; + goto out; +} + +void atm_proc_exit(void) +{ + atm_proc_dirs_remove(); +} diff --git a/net/atm/protocols.h b/net/atm/protocols.h new file mode 100644 index 00000000..acdfc856 --- /dev/null +++ b/net/atm/protocols.h @@ -0,0 +1,13 @@ +/* net/atm/protocols.h - ATM protocol handler entry points */ + +/* Written 1995-1997 by Werner Almesberger, EPFL LRC */ + + +#ifndef NET_ATM_PROTOCOLS_H +#define NET_ATM_PROTOCOLS_H + +int atm_init_aal0(struct atm_vcc *vcc); /* "raw" AAL0 */ +int atm_init_aal34(struct atm_vcc *vcc);/* "raw" AAL3/4 transport */ +int atm_init_aal5(struct atm_vcc *vcc); /* "raw" AAL5 transport */ + +#endif diff --git a/net/atm/pvc.c b/net/atm/pvc.c new file mode 100644 index 00000000..437ee70c --- /dev/null +++ b/net/atm/pvc.c @@ -0,0 +1,160 @@ +/* net/atm/pvc.c - ATM PVC sockets */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ + + +#include /* struct socket, struct proto_ops */ +#include /* ATM stuff */ +#include /* ATM devices */ +#include /* error codes */ +#include /* printk */ +#include +#include +#include +#include /* for sock_no_* */ + +#include "resources.h" /* devs and vccs */ +#include "common.h" /* common for PVCs and SVCs */ + + +static int pvc_shutdown(struct socket *sock, int how) +{ + return 0; +} + +static int pvc_bind(struct socket *sock, struct sockaddr *sockaddr, + int sockaddr_len) +{ + struct sock *sk = sock->sk; + struct sockaddr_atmpvc *addr; + struct atm_vcc *vcc; + int error; + + if (sockaddr_len != sizeof(struct sockaddr_atmpvc)) + return -EINVAL; + addr = (struct sockaddr_atmpvc 
*)sockaddr; + if (addr->sap_family != AF_ATMPVC) + return -EAFNOSUPPORT; + lock_sock(sk); + vcc = ATM_SD(sock); + if (!test_bit(ATM_VF_HASQOS, &vcc->flags)) { + error = -EBADFD; + goto out; + } + if (test_bit(ATM_VF_PARTIAL, &vcc->flags)) { + if (vcc->vpi != ATM_VPI_UNSPEC) + addr->sap_addr.vpi = vcc->vpi; + if (vcc->vci != ATM_VCI_UNSPEC) + addr->sap_addr.vci = vcc->vci; + } + error = vcc_connect(sock, addr->sap_addr.itf, addr->sap_addr.vpi, + addr->sap_addr.vci); +out: + release_sock(sk); + return error; +} + +static int pvc_connect(struct socket *sock, struct sockaddr *sockaddr, + int sockaddr_len, int flags) +{ + return pvc_bind(sock, sockaddr, sockaddr_len); +} + +static int pvc_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + int error; + + lock_sock(sk); + error = vcc_setsockopt(sock, level, optname, optval, optlen); + release_sock(sk); + return error; +} + +static int pvc_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + int error; + + lock_sock(sk); + error = vcc_getsockopt(sock, level, optname, optval, optlen); + release_sock(sk); + return error; +} + +static int pvc_getname(struct socket *sock, struct sockaddr *sockaddr, + int *sockaddr_len, int peer) +{ + struct sockaddr_atmpvc *addr; + struct atm_vcc *vcc = ATM_SD(sock); + + if (!vcc->dev || !test_bit(ATM_VF_ADDR, &vcc->flags)) + return -ENOTCONN; + *sockaddr_len = sizeof(struct sockaddr_atmpvc); + addr = (struct sockaddr_atmpvc *)sockaddr; + addr->sap_family = AF_ATMPVC; + addr->sap_addr.itf = vcc->dev->number; + addr->sap_addr.vpi = vcc->vpi; + addr->sap_addr.vci = vcc->vci; + return 0; +} + +static const struct proto_ops pvc_proto_ops = { + .family = PF_ATMPVC, + .owner = THIS_MODULE, + + .release = vcc_release, + .bind = pvc_bind, + .connect = pvc_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = pvc_getname, + .poll = vcc_poll, + .ioctl = vcc_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = vcc_compat_ioctl, +#endif + .listen = sock_no_listen, + .shutdown = pvc_shutdown, + .setsockopt = pvc_setsockopt, + .getsockopt = pvc_getsockopt, + .sendmsg = vcc_sendmsg, + .recvmsg = vcc_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + + +static int pvc_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + if (net != &init_net) + return -EAFNOSUPPORT; + + sock->ops = &pvc_proto_ops; + return vcc_create(net, sock, protocol, PF_ATMPVC); +} + +static const struct net_proto_family pvc_family_ops = { + .family = PF_ATMPVC, + .create = pvc_create, + .owner = THIS_MODULE, +}; + + +/* + * Initialize the ATM PVC protocol family + */ + + +int __init atmpvc_init(void) +{ + return sock_register(&pvc_family_ops); +} + +void atmpvc_exit(void) +{ + sock_unregister(PF_ATMPVC); +} diff --git a/net/atm/raw.c b/net/atm/raw.c new file mode 100644 index 00000000..b4f7b9ff --- /dev/null +++ b/net/atm/raw.c @@ -0,0 +1,85 @@ +/* net/atm/raw.c - Raw AAL0 and AAL5 transports */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" +#include "protocols.h" + +/* + * SKB == NULL indicates that the link is being closed + */ + +static void atm_push_raw(struct atm_vcc *vcc, struct sk_buff *skb) +{ + if (skb) { + struct sock *sk = sk_atm(vcc); + + 
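+		/* deliver the PDU straight to the socket: queue it and wake any reader */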
skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); + } +} + +static void atm_pop_raw(struct atm_vcc *vcc, struct sk_buff *skb) +{ + struct sock *sk = sk_atm(vcc); + + pr_debug("(%d) %d -= %d\n", + vcc->vci, sk_wmem_alloc_get(sk), skb->truesize); + atomic_sub(skb->truesize, &sk->sk_wmem_alloc); + dev_kfree_skb_any(skb); + sk->sk_write_space(sk); +} + +static int atm_send_aal0(struct atm_vcc *vcc, struct sk_buff *skb) +{ + /* + * Note that if vpi/vci are _ANY or _UNSPEC the below will + * still work + */ + if (!capable(CAP_NET_ADMIN) && + (((u32 *)skb->data)[0] & (ATM_HDR_VPI_MASK | ATM_HDR_VCI_MASK)) != + ((vcc->vpi << ATM_HDR_VPI_SHIFT) | + (vcc->vci << ATM_HDR_VCI_SHIFT))) { + kfree_skb(skb); + return -EADDRNOTAVAIL; + } + return vcc->dev->ops->send(vcc, skb); +} + +int atm_init_aal0(struct atm_vcc *vcc) +{ + vcc->push = atm_push_raw; + vcc->pop = atm_pop_raw; + vcc->push_oam = NULL; + vcc->send = atm_send_aal0; + return 0; +} + +int atm_init_aal34(struct atm_vcc *vcc) +{ + vcc->push = atm_push_raw; + vcc->pop = atm_pop_raw; + vcc->push_oam = NULL; + vcc->send = vcc->dev->ops->send; + return 0; +} + +int atm_init_aal5(struct atm_vcc *vcc) +{ + vcc->push = atm_push_raw; + vcc->pop = atm_pop_raw; + vcc->push_oam = NULL; + vcc->send = vcc->dev->ops->send; + return 0; +} +EXPORT_SYMBOL(atm_init_aal5); diff --git a/net/atm/resources.c b/net/atm/resources.c new file mode 100644 index 00000000..23f45ce6 --- /dev/null +++ b/net/atm/resources.c @@ -0,0 +1,463 @@ +/* net/atm/resources.c - Statically allocated resources */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ + +/* Fixes + * Arnaldo Carvalho de Melo + * 2002/01 - don't free the whole struct sock on sk->destruct time, + * use the default destruct function initialized by sock_init_data */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ + +#include +#include +#include +#include +#include /* for barrier */ +#include +#include +#include +#include +#include +#include + +#include /* for struct sock */ + +#include "common.h" +#include "resources.h" +#include "addr.h" + + +LIST_HEAD(atm_devs); +DEFINE_MUTEX(atm_dev_mutex); + +static struct atm_dev *__alloc_atm_dev(const char *type) +{ + struct atm_dev *dev; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return NULL; + dev->type = type; + dev->signal = ATM_PHY_SIG_UNKNOWN; + dev->link_rate = ATM_OC3_PCR; + spin_lock_init(&dev->lock); + INIT_LIST_HEAD(&dev->local); + INIT_LIST_HEAD(&dev->lecs); + + return dev; +} + +static struct atm_dev *__atm_dev_lookup(int number) +{ + struct atm_dev *dev; + struct list_head *p; + + list_for_each(p, &atm_devs) { + dev = list_entry(p, struct atm_dev, dev_list); + if (dev->number == number) { + atm_dev_hold(dev); + return dev; + } + } + return NULL; +} + +struct atm_dev *atm_dev_lookup(int number) +{ + struct atm_dev *dev; + + mutex_lock(&atm_dev_mutex); + dev = __atm_dev_lookup(number); + mutex_unlock(&atm_dev_mutex); + return dev; +} +EXPORT_SYMBOL(atm_dev_lookup); + +struct atm_dev *atm_dev_register(const char *type, struct device *parent, + const struct atmdev_ops *ops, int number, + unsigned long *flags) +{ + struct atm_dev *dev, *inuse; + + dev = __alloc_atm_dev(type); + if (!dev) { + pr_err("no space for dev %s\n", type); + return NULL; + } + mutex_lock(&atm_dev_mutex); + if (number != -1) { + inuse = __atm_dev_lookup(number); + if (inuse) { + atm_dev_put(inuse); + mutex_unlock(&atm_dev_mutex); + kfree(dev); + return NULL; + } + dev->number = number; + } else { + dev->number = 0; + while 
((inuse = __atm_dev_lookup(dev->number))) { + atm_dev_put(inuse); + dev->number++; + } + } + + dev->ops = ops; + if (flags) + dev->flags = *flags; + else + memset(&dev->flags, 0, sizeof(dev->flags)); + memset(&dev->stats, 0, sizeof(dev->stats)); + atomic_set(&dev->refcnt, 1); + + if (atm_proc_dev_register(dev) < 0) { + pr_err("atm_proc_dev_register failed for dev %s\n", type); + goto out_fail; + } + + if (atm_register_sysfs(dev, parent) < 0) { + pr_err("atm_register_sysfs failed for dev %s\n", type); + atm_proc_dev_deregister(dev); + goto out_fail; + } + + list_add_tail(&dev->dev_list, &atm_devs); + +out: + mutex_unlock(&atm_dev_mutex); + return dev; + +out_fail: + kfree(dev); + dev = NULL; + goto out; +} +EXPORT_SYMBOL(atm_dev_register); + +void atm_dev_deregister(struct atm_dev *dev) +{ + BUG_ON(test_bit(ATM_DF_REMOVED, &dev->flags)); + set_bit(ATM_DF_REMOVED, &dev->flags); + + /* + * if we remove current device from atm_devs list, new device + * with same number can appear, such we need deregister proc, + * release async all vccs and remove them from vccs list too + */ + mutex_lock(&atm_dev_mutex); + list_del(&dev->dev_list); + mutex_unlock(&atm_dev_mutex); + + atm_dev_release_vccs(dev); + atm_unregister_sysfs(dev); + atm_proc_dev_deregister(dev); + + atm_dev_put(dev); +} +EXPORT_SYMBOL(atm_dev_deregister); + +static void copy_aal_stats(struct k_atm_aal_stats *from, + struct atm_aal_stats *to) +{ +#define __HANDLE_ITEM(i) to->i = atomic_read(&from->i) + __AAL_STAT_ITEMS +#undef __HANDLE_ITEM +} + +static void subtract_aal_stats(struct k_atm_aal_stats *from, + struct atm_aal_stats *to) +{ +#define __HANDLE_ITEM(i) atomic_sub(to->i, &from->i) + __AAL_STAT_ITEMS +#undef __HANDLE_ITEM +} + +static int fetch_stats(struct atm_dev *dev, struct atm_dev_stats __user *arg, + int zero) +{ + struct atm_dev_stats tmp; + int error = 0; + + copy_aal_stats(&dev->stats.aal0, &tmp.aal0); + copy_aal_stats(&dev->stats.aal34, &tmp.aal34); + copy_aal_stats(&dev->stats.aal5, &tmp.aal5); + if (arg) + error = copy_to_user(arg, &tmp, sizeof(tmp)); + if (zero && !error) { + subtract_aal_stats(&dev->stats.aal0, &tmp.aal0); + subtract_aal_stats(&dev->stats.aal34, &tmp.aal34); + subtract_aal_stats(&dev->stats.aal5, &tmp.aal5); + } + return error ? 
-EFAULT : 0; +} + +int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat) +{ + void __user *buf; + int error, len, number, size = 0; + struct atm_dev *dev; + struct list_head *p; + int *tmp_buf, *tmp_p; + int __user *sioc_len; + int __user *iobuf_len; + +#ifndef CONFIG_COMPAT + compat = 0; /* Just so the compiler _knows_ */ +#endif + + switch (cmd) { + case ATM_GETNAMES: + if (compat) { +#ifdef CONFIG_COMPAT + struct compat_atm_iobuf __user *ciobuf = arg; + compat_uptr_t cbuf; + iobuf_len = &ciobuf->length; + if (get_user(cbuf, &ciobuf->buffer)) + return -EFAULT; + buf = compat_ptr(cbuf); +#endif + } else { + struct atm_iobuf __user *iobuf = arg; + iobuf_len = &iobuf->length; + if (get_user(buf, &iobuf->buffer)) + return -EFAULT; + } + if (get_user(len, iobuf_len)) + return -EFAULT; + mutex_lock(&atm_dev_mutex); + list_for_each(p, &atm_devs) + size += sizeof(int); + if (size > len) { + mutex_unlock(&atm_dev_mutex); + return -E2BIG; + } + tmp_buf = kmalloc(size, GFP_ATOMIC); + if (!tmp_buf) { + mutex_unlock(&atm_dev_mutex); + return -ENOMEM; + } + tmp_p = tmp_buf; + list_for_each(p, &atm_devs) { + dev = list_entry(p, struct atm_dev, dev_list); + *tmp_p++ = dev->number; + } + mutex_unlock(&atm_dev_mutex); + error = ((copy_to_user(buf, tmp_buf, size)) || + put_user(size, iobuf_len)) + ? -EFAULT : 0; + kfree(tmp_buf); + return error; + default: + break; + } + + if (compat) { +#ifdef CONFIG_COMPAT + struct compat_atmif_sioc __user *csioc = arg; + compat_uptr_t carg; + + sioc_len = &csioc->length; + if (get_user(carg, &csioc->arg)) + return -EFAULT; + buf = compat_ptr(carg); + + if (get_user(len, &csioc->length)) + return -EFAULT; + if (get_user(number, &csioc->number)) + return -EFAULT; +#endif + } else { + struct atmif_sioc __user *sioc = arg; + + sioc_len = &sioc->length; + if (get_user(buf, &sioc->arg)) + return -EFAULT; + if (get_user(len, &sioc->length)) + return -EFAULT; + if (get_user(number, &sioc->number)) + return -EFAULT; + } + + dev = try_then_request_module(atm_dev_lookup(number), "atm-device-%d", + number); + if (!dev) + return -ENODEV; + + switch (cmd) { + case ATM_GETTYPE: + size = strlen(dev->type) + 1; + if (copy_to_user(buf, dev->type, size)) { + error = -EFAULT; + goto done; + } + break; + case ATM_GETESI: + size = ESI_LEN; + if (copy_to_user(buf, dev->esi, size)) { + error = -EFAULT; + goto done; + } + break; + case ATM_SETESI: + { + int i; + + for (i = 0; i < ESI_LEN; i++) + if (dev->esi[i]) { + error = -EEXIST; + goto done; + } + } + /* fall through */ + case ATM_SETESIF: + { + unsigned char esi[ESI_LEN]; + + if (!capable(CAP_NET_ADMIN)) { + error = -EPERM; + goto done; + } + if (copy_from_user(esi, buf, ESI_LEN)) { + error = -EFAULT; + goto done; + } + memcpy(dev->esi, esi, ESI_LEN); + error = ESI_LEN; + goto done; + } + case ATM_GETSTATZ: + if (!capable(CAP_NET_ADMIN)) { + error = -EPERM; + goto done; + } + /* fall through */ + case ATM_GETSTAT: + size = sizeof(struct atm_dev_stats); + error = fetch_stats(dev, buf, cmd == ATM_GETSTATZ); + if (error) + goto done; + break; + case ATM_GETCIRANGE: + size = sizeof(struct atm_cirange); + if (copy_to_user(buf, &dev->ci_range, size)) { + error = -EFAULT; + goto done; + } + break; + case ATM_GETLINKRATE: + size = sizeof(int); + if (copy_to_user(buf, &dev->link_rate, size)) { + error = -EFAULT; + goto done; + } + break; + case ATM_RSTADDR: + if (!capable(CAP_NET_ADMIN)) { + error = -EPERM; + goto done; + } + atm_reset_addr(dev, ATM_ADDR_LOCAL); + break; + case ATM_ADDADDR: + case ATM_DELADDR: + case ATM_ADDLECSADDR: + 
case ATM_DELLECSADDR: + { + struct sockaddr_atmsvc addr; + + if (!capable(CAP_NET_ADMIN)) { + error = -EPERM; + goto done; + } + + if (copy_from_user(&addr, buf, sizeof(addr))) { + error = -EFAULT; + goto done; + } + if (cmd == ATM_ADDADDR || cmd == ATM_ADDLECSADDR) + error = atm_add_addr(dev, &addr, + (cmd == ATM_ADDADDR ? + ATM_ADDR_LOCAL : ATM_ADDR_LECS)); + else + error = atm_del_addr(dev, &addr, + (cmd == ATM_DELADDR ? + ATM_ADDR_LOCAL : ATM_ADDR_LECS)); + goto done; + } + case ATM_GETADDR: + case ATM_GETLECSADDR: + error = atm_get_addr(dev, buf, len, + (cmd == ATM_GETADDR ? + ATM_ADDR_LOCAL : ATM_ADDR_LECS)); + if (error < 0) + goto done; + size = error; + /* may return 0, but later on size == 0 means "don't + write the length" */ + error = put_user(size, sioc_len) ? -EFAULT : 0; + goto done; + case ATM_SETLOOP: + if (__ATM_LM_XTRMT((int) (unsigned long) buf) && + __ATM_LM_XTLOC((int) (unsigned long) buf) > + __ATM_LM_XTRMT((int) (unsigned long) buf)) { + error = -EINVAL; + goto done; + } + /* fall through */ + case ATM_SETCIRANGE: + case SONET_GETSTATZ: + case SONET_SETDIAG: + case SONET_CLRDIAG: + case SONET_SETFRAMING: + if (!capable(CAP_NET_ADMIN)) { + error = -EPERM; + goto done; + } + /* fall through */ + default: + if (compat) { +#ifdef CONFIG_COMPAT + if (!dev->ops->compat_ioctl) { + error = -EINVAL; + goto done; + } + size = dev->ops->compat_ioctl(dev, cmd, buf); +#endif + } else { + if (!dev->ops->ioctl) { + error = -EINVAL; + goto done; + } + size = dev->ops->ioctl(dev, cmd, buf); + } + if (size < 0) { + error = (size == -ENOIOCTLCMD ? -EINVAL : size); + goto done; + } + } + + if (size) + error = put_user(size, sioc_len) ? -EFAULT : 0; + else + error = 0; +done: + atm_dev_put(dev); + return error; +} + +void *atm_dev_seq_start(struct seq_file *seq, loff_t *pos) +{ + mutex_lock(&atm_dev_mutex); + return seq_list_start_head(&atm_devs, *pos); +} + +void atm_dev_seq_stop(struct seq_file *seq, void *v) +{ + mutex_unlock(&atm_dev_mutex); +} + +void *atm_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return seq_list_next(v, &atm_devs, pos); +} diff --git a/net/atm/resources.h b/net/atm/resources.h new file mode 100644 index 00000000..521431e3 --- /dev/null +++ b/net/atm/resources.h @@ -0,0 +1,47 @@ +/* net/atm/resources.h - ATM-related resources */ + +/* Written 1995-1998 by Werner Almesberger, EPFL LRC/ICA */ + + +#ifndef NET_ATM_RESOURCES_H +#define NET_ATM_RESOURCES_H + +#include +#include + + +extern struct list_head atm_devs; +extern struct mutex atm_dev_mutex; + +int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat); + + +#ifdef CONFIG_PROC_FS + +#include + +void *atm_dev_seq_start(struct seq_file *seq, loff_t *pos); +void atm_dev_seq_stop(struct seq_file *seq, void *v); +void *atm_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos); + + +int atm_proc_dev_register(struct atm_dev *dev); +void atm_proc_dev_deregister(struct atm_dev *dev); + +#else + +static inline int atm_proc_dev_register(struct atm_dev *dev) +{ + return 0; +} + +static inline void atm_proc_dev_deregister(struct atm_dev *dev) +{ + /* nothing */ +} + +#endif /* CONFIG_PROC_FS */ + +int atm_register_sysfs(struct atm_dev *adev, struct device *parent); +void atm_unregister_sysfs(struct atm_dev *adev); +#endif diff --git a/net/atm/signaling.c b/net/atm/signaling.c new file mode 100644 index 00000000..509c8ac0 --- /dev/null +++ b/net/atm/signaling.c @@ -0,0 +1,269 @@ +/* net/atm/signaling.c - ATM signaling */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ + +#define 
pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ + +#include /* error codes */ +#include /* printk */ +#include +#include +#include /* jiffies and HZ */ +#include /* ATM stuff */ +#include +#include +#include +#include +#include + +#include "resources.h" +#include "signaling.h" + +#undef WAIT_FOR_DEMON /* #define this if system calls on SVC sockets + should block until the demon runs. + Danger: may cause nasty hangs if the demon + crashes. */ + +struct atm_vcc *sigd = NULL; +#ifdef WAIT_FOR_DEMON +static DECLARE_WAIT_QUEUE_HEAD(sigd_sleep); +#endif + +static void sigd_put_skb(struct sk_buff *skb) +{ +#ifdef WAIT_FOR_DEMON + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&sigd_sleep, &wait); + while (!sigd) { + set_current_state(TASK_UNINTERRUPTIBLE); + pr_debug("atmsvc: waiting for signaling daemon...\n"); + schedule(); + } + current->state = TASK_RUNNING; + remove_wait_queue(&sigd_sleep, &wait); +#else + if (!sigd) { + pr_debug("atmsvc: no signaling daemon\n"); + kfree_skb(skb); + return; + } +#endif + atm_force_charge(sigd, skb->truesize); + skb_queue_tail(&sk_atm(sigd)->sk_receive_queue, skb); + sk_atm(sigd)->sk_data_ready(sk_atm(sigd), skb->len); +} + +static void modify_qos(struct atm_vcc *vcc, struct atmsvc_msg *msg) +{ + struct sk_buff *skb; + + if (test_bit(ATM_VF_RELEASED, &vcc->flags) || + !test_bit(ATM_VF_READY, &vcc->flags)) + return; + msg->type = as_error; + if (!vcc->dev->ops->change_qos) + msg->reply = -EOPNOTSUPP; + else { + /* should lock VCC */ + msg->reply = vcc->dev->ops->change_qos(vcc, &msg->qos, + msg->reply); + if (!msg->reply) + msg->type = as_okay; + } + /* + * Should probably just turn around the old skb. But the, the buffer + * space accounting needs to follow the change too. Maybe later. + */ + while (!(skb = alloc_skb(sizeof(struct atmsvc_msg), GFP_KERNEL))) + schedule(); + *(struct atmsvc_msg *)skb_put(skb, sizeof(struct atmsvc_msg)) = *msg; + sigd_put_skb(skb); +} + +static int sigd_send(struct atm_vcc *vcc, struct sk_buff *skb) +{ + struct atmsvc_msg *msg; + struct atm_vcc *session_vcc; + struct sock *sk; + + msg = (struct atmsvc_msg *) skb->data; + atomic_sub(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc); + vcc = *(struct atm_vcc **) &msg->vcc; + pr_debug("%d (0x%lx)\n", (int)msg->type, (unsigned long)vcc); + sk = sk_atm(vcc); + + switch (msg->type) { + case as_okay: + sk->sk_err = -msg->reply; + clear_bit(ATM_VF_WAITING, &vcc->flags); + if (!*vcc->local.sas_addr.prv && !*vcc->local.sas_addr.pub) { + vcc->local.sas_family = AF_ATMSVC; + memcpy(vcc->local.sas_addr.prv, + msg->local.sas_addr.prv, ATM_ESA_LEN); + memcpy(vcc->local.sas_addr.pub, + msg->local.sas_addr.pub, ATM_E164_LEN + 1); + } + session_vcc = vcc->session ? 
vcc->session : vcc; + if (session_vcc->vpi || session_vcc->vci) + break; + session_vcc->itf = msg->pvc.sap_addr.itf; + session_vcc->vpi = msg->pvc.sap_addr.vpi; + session_vcc->vci = msg->pvc.sap_addr.vci; + if (session_vcc->vpi || session_vcc->vci) + session_vcc->qos = msg->qos; + break; + case as_error: + clear_bit(ATM_VF_REGIS, &vcc->flags); + clear_bit(ATM_VF_READY, &vcc->flags); + sk->sk_err = -msg->reply; + clear_bit(ATM_VF_WAITING, &vcc->flags); + break; + case as_indicate: + vcc = *(struct atm_vcc **)&msg->listen_vcc; + sk = sk_atm(vcc); + pr_debug("as_indicate!!!\n"); + lock_sock(sk); + if (sk_acceptq_is_full(sk)) { + sigd_enq(NULL, as_reject, vcc, NULL, NULL); + dev_kfree_skb(skb); + goto as_indicate_complete; + } + sk->sk_ack_backlog++; + skb_queue_tail(&sk->sk_receive_queue, skb); + pr_debug("waking sk_sleep(sk) 0x%p\n", sk_sleep(sk)); + sk->sk_state_change(sk); +as_indicate_complete: + release_sock(sk); + return 0; + case as_close: + set_bit(ATM_VF_RELEASED, &vcc->flags); + vcc_release_async(vcc, msg->reply); + goto out; + case as_modify: + modify_qos(vcc, msg); + break; + case as_addparty: + case as_dropparty: + sk->sk_err_soft = msg->reply; + /* < 0 failure, otherwise ep_ref */ + clear_bit(ATM_VF_WAITING, &vcc->flags); + break; + default: + pr_alert("bad message type %d\n", (int)msg->type); + return -EINVAL; + } + sk->sk_state_change(sk); +out: + dev_kfree_skb(skb); + return 0; +} + +void sigd_enq2(struct atm_vcc *vcc, enum atmsvc_msg_type type, + struct atm_vcc *listen_vcc, const struct sockaddr_atmpvc *pvc, + const struct sockaddr_atmsvc *svc, const struct atm_qos *qos, + int reply) +{ + struct sk_buff *skb; + struct atmsvc_msg *msg; + static unsigned session = 0; + + pr_debug("%d (0x%p)\n", (int)type, vcc); + while (!(skb = alloc_skb(sizeof(struct atmsvc_msg), GFP_KERNEL))) + schedule(); + msg = (struct atmsvc_msg *)skb_put(skb, sizeof(struct atmsvc_msg)); + memset(msg, 0, sizeof(*msg)); + msg->type = type; + *(struct atm_vcc **) &msg->vcc = vcc; + *(struct atm_vcc **) &msg->listen_vcc = listen_vcc; + msg->reply = reply; + if (qos) + msg->qos = *qos; + if (vcc) + msg->sap = vcc->sap; + if (svc) + msg->svc = *svc; + if (vcc) + msg->local = vcc->local; + if (pvc) + msg->pvc = *pvc; + if (vcc) { + if (type == as_connect && test_bit(ATM_VF_SESSION, &vcc->flags)) + msg->session = ++session; + /* every new pmp connect gets the next session number */ + } + sigd_put_skb(skb); + if (vcc) + set_bit(ATM_VF_REGIS, &vcc->flags); +} + +void sigd_enq(struct atm_vcc *vcc, enum atmsvc_msg_type type, + struct atm_vcc *listen_vcc, const struct sockaddr_atmpvc *pvc, + const struct sockaddr_atmsvc *svc) +{ + sigd_enq2(vcc, type, listen_vcc, pvc, svc, vcc ? 
&vcc->qos : NULL, 0); + /* other ISP applications may use "reply" */ +} + +static void purge_vcc(struct atm_vcc *vcc) +{ + if (sk_atm(vcc)->sk_family == PF_ATMSVC && + !test_bit(ATM_VF_META, &vcc->flags)) { + set_bit(ATM_VF_RELEASED, &vcc->flags); + clear_bit(ATM_VF_REGIS, &vcc->flags); + vcc_release_async(vcc, -EUNATCH); + } +} + +static void sigd_close(struct atm_vcc *vcc) +{ + struct hlist_node *node; + struct sock *s; + int i; + + pr_debug("\n"); + sigd = NULL; + if (skb_peek(&sk_atm(vcc)->sk_receive_queue)) + pr_err("closing with requests pending\n"); + skb_queue_purge(&sk_atm(vcc)->sk_receive_queue); + + read_lock(&vcc_sklist_lock); + for (i = 0; i < VCC_HTABLE_SIZE; ++i) { + struct hlist_head *head = &vcc_hash[i]; + + sk_for_each(s, node, head) { + vcc = atm_sk(s); + + purge_vcc(vcc); + } + } + read_unlock(&vcc_sklist_lock); +} + +static struct atmdev_ops sigd_dev_ops = { + .close = sigd_close, + .send = sigd_send +}; + +static struct atm_dev sigd_dev = { + .ops = &sigd_dev_ops, + .type = "sig", + .number = 999, + .lock = __SPIN_LOCK_UNLOCKED(sigd_dev.lock) +}; + +int sigd_attach(struct atm_vcc *vcc) +{ + if (sigd) + return -EADDRINUSE; + pr_debug("\n"); + sigd = vcc; + vcc->dev = &sigd_dev; + vcc_insert_socket(sk_atm(vcc)); + set_bit(ATM_VF_META, &vcc->flags); + set_bit(ATM_VF_READY, &vcc->flags); +#ifdef WAIT_FOR_DEMON + wake_up(&sigd_sleep); +#endif + return 0; +} diff --git a/net/atm/signaling.h b/net/atm/signaling.h new file mode 100644 index 00000000..08b2a69c --- /dev/null +++ b/net/atm/signaling.h @@ -0,0 +1,30 @@ +/* net/atm/signaling.h - ATM signaling */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ + + +#ifndef NET_ATM_SIGNALING_H +#define NET_ATM_SIGNALING_H + +#include +#include +#include + + +extern struct atm_vcc *sigd; /* needed in svc_release */ + + +/* + * sigd_enq is a wrapper for sigd_enq2, covering the more common cases, and + * avoiding huge lists of null values. + */ + +void sigd_enq2(struct atm_vcc *vcc,enum atmsvc_msg_type type, + struct atm_vcc *listen_vcc,const struct sockaddr_atmpvc *pvc, + const struct sockaddr_atmsvc *svc,const struct atm_qos *qos,int reply); +void sigd_enq(struct atm_vcc *vcc,enum atmsvc_msg_type type, + struct atm_vcc *listen_vcc,const struct sockaddr_atmpvc *pvc, + const struct sockaddr_atmsvc *svc); +int sigd_attach(struct atm_vcc *vcc); + +#endif diff --git a/net/atm/svc.c b/net/atm/svc.c new file mode 100644 index 00000000..754ee479 --- /dev/null +++ b/net/atm/svc.c @@ -0,0 +1,691 @@ +/* net/atm/svc.c - ATM SVC sockets */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ + +#include +#include /* struct socket, struct proto_ops */ +#include /* error codes */ +#include /* printk */ +#include +#include +#include /* jiffies and HZ */ +#include /* O_NONBLOCK */ +#include +#include /* ATM stuff */ +#include +#include +#include +#include +#include /* for sock_no_* */ +#include + +#include "resources.h" +#include "common.h" /* common for PVCs and SVCs */ +#include "signaling.h" +#include "addr.h" + +static int svc_create(struct net *net, struct socket *sock, int protocol, + int kern); + +/* + * Note: since all this is still nicely synchronized with the signaling demon, + * there's no need to protect sleep loops with clis. If signaling is + * moved into the kernel, that would change. 
+ */ + + +static int svc_shutdown(struct socket *sock, int how) +{ + return 0; +} + +static void svc_disconnect(struct atm_vcc *vcc) +{ + DEFINE_WAIT(wait); + struct sk_buff *skb; + struct sock *sk = sk_atm(vcc); + + pr_debug("%p\n", vcc); + if (test_bit(ATM_VF_REGIS, &vcc->flags)) { + prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); + sigd_enq(vcc, as_close, NULL, NULL, NULL); + while (!test_bit(ATM_VF_RELEASED, &vcc->flags) && sigd) { + schedule(); + prepare_to_wait(sk_sleep(sk), &wait, + TASK_UNINTERRUPTIBLE); + } + finish_wait(sk_sleep(sk), &wait); + } + /* beware - socket is still in use by atmsigd until the last + as_indicate has been answered */ + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + atm_return(vcc, skb->truesize); + pr_debug("LISTEN REL\n"); + sigd_enq2(NULL, as_reject, vcc, NULL, NULL, &vcc->qos, 0); + dev_kfree_skb(skb); + } + clear_bit(ATM_VF_REGIS, &vcc->flags); + /* ... may retry later */ +} + +static int svc_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct atm_vcc *vcc; + + if (sk) { + vcc = ATM_SD(sock); + pr_debug("%p\n", vcc); + clear_bit(ATM_VF_READY, &vcc->flags); + /* + * VCC pointer is used as a reference, + * so we must not free it (thereby subjecting it to re-use) + * before all pending connections are closed + */ + svc_disconnect(vcc); + vcc_release(sock); + } + return 0; +} + +static int svc_bind(struct socket *sock, struct sockaddr *sockaddr, + int sockaddr_len) +{ + DEFINE_WAIT(wait); + struct sock *sk = sock->sk; + struct sockaddr_atmsvc *addr; + struct atm_vcc *vcc; + int error; + + if (sockaddr_len != sizeof(struct sockaddr_atmsvc)) + return -EINVAL; + lock_sock(sk); + if (sock->state == SS_CONNECTED) { + error = -EISCONN; + goto out; + } + if (sock->state != SS_UNCONNECTED) { + error = -EINVAL; + goto out; + } + vcc = ATM_SD(sock); + addr = (struct sockaddr_atmsvc *) sockaddr; + if (addr->sas_family != AF_ATMSVC) { + error = -EAFNOSUPPORT; + goto out; + } + clear_bit(ATM_VF_BOUND, &vcc->flags); + /* failing rebind will kill old binding */ + /* @@@ check memory (de)allocation on rebind */ + if (!test_bit(ATM_VF_HASQOS, &vcc->flags)) { + error = -EBADFD; + goto out; + } + vcc->local = *addr; + set_bit(ATM_VF_WAITING, &vcc->flags); + prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); + sigd_enq(vcc, as_bind, NULL, NULL, &vcc->local); + while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { + schedule(); + prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); + } + finish_wait(sk_sleep(sk), &wait); + clear_bit(ATM_VF_REGIS, &vcc->flags); /* doesn't count */ + if (!sigd) { + error = -EUNATCH; + goto out; + } + if (!sk->sk_err) + set_bit(ATM_VF_BOUND, &vcc->flags); + error = -sk->sk_err; +out: + release_sock(sk); + return error; +} + +static int svc_connect(struct socket *sock, struct sockaddr *sockaddr, + int sockaddr_len, int flags) +{ + DEFINE_WAIT(wait); + struct sock *sk = sock->sk; + struct sockaddr_atmsvc *addr; + struct atm_vcc *vcc = ATM_SD(sock); + int error; + + pr_debug("%p\n", vcc); + lock_sock(sk); + if (sockaddr_len != sizeof(struct sockaddr_atmsvc)) { + error = -EINVAL; + goto out; + } + + switch (sock->state) { + default: + error = -EINVAL; + goto out; + case SS_CONNECTED: + error = -EISCONN; + goto out; + case SS_CONNECTING: + if (test_bit(ATM_VF_WAITING, &vcc->flags)) { + error = -EALREADY; + goto out; + } + sock->state = SS_UNCONNECTED; + if (sk->sk_err) { + error = -sk->sk_err; + goto out; + } + break; + case SS_UNCONNECTED: + addr = (struct sockaddr_atmsvc *) sockaddr; 
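+		/* validate address family, QoS and traffic classes before signalling the connect */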
+ if (addr->sas_family != AF_ATMSVC) { + error = -EAFNOSUPPORT; + goto out; + } + if (!test_bit(ATM_VF_HASQOS, &vcc->flags)) { + error = -EBADFD; + goto out; + } + if (vcc->qos.txtp.traffic_class == ATM_ANYCLASS || + vcc->qos.rxtp.traffic_class == ATM_ANYCLASS) { + error = -EINVAL; + goto out; + } + if (!vcc->qos.txtp.traffic_class && + !vcc->qos.rxtp.traffic_class) { + error = -EINVAL; + goto out; + } + vcc->remote = *addr; + set_bit(ATM_VF_WAITING, &vcc->flags); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + sigd_enq(vcc, as_connect, NULL, NULL, &vcc->remote); + if (flags & O_NONBLOCK) { + finish_wait(sk_sleep(sk), &wait); + sock->state = SS_CONNECTING; + error = -EINPROGRESS; + goto out; + } + error = 0; + while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { + schedule(); + if (!signal_pending(current)) { + prepare_to_wait(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + continue; + } + pr_debug("*ABORT*\n"); + /* + * This is tricky: + * Kernel ---close--> Demon + * Kernel <--close--- Demon + * or + * Kernel ---close--> Demon + * Kernel <--error--- Demon + * or + * Kernel ---close--> Demon + * Kernel <--okay---- Demon + * Kernel <--close--- Demon + */ + sigd_enq(vcc, as_close, NULL, NULL, NULL); + while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { + prepare_to_wait(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + schedule(); + } + if (!sk->sk_err) + while (!test_bit(ATM_VF_RELEASED, &vcc->flags) && + sigd) { + prepare_to_wait(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + schedule(); + } + clear_bit(ATM_VF_REGIS, &vcc->flags); + clear_bit(ATM_VF_RELEASED, &vcc->flags); + clear_bit(ATM_VF_CLOSE, &vcc->flags); + /* we're gone now but may connect later */ + error = -EINTR; + break; + } + finish_wait(sk_sleep(sk), &wait); + if (error) + goto out; + if (!sigd) { + error = -EUNATCH; + goto out; + } + if (sk->sk_err) { + error = -sk->sk_err; + goto out; + } + } +/* + * Not supported yet + * + * #ifndef CONFIG_SINGLE_SIGITF + */ + vcc->qos.txtp.max_pcr = SELECT_TOP_PCR(vcc->qos.txtp); + vcc->qos.txtp.pcr = 0; + vcc->qos.txtp.min_pcr = 0; +/* + * #endif + */ + error = vcc_connect(sock, vcc->itf, vcc->vpi, vcc->vci); + if (!error) + sock->state = SS_CONNECTED; + else + (void)svc_disconnect(vcc); +out: + release_sock(sk); + return error; +} + +static int svc_listen(struct socket *sock, int backlog) +{ + DEFINE_WAIT(wait); + struct sock *sk = sock->sk; + struct atm_vcc *vcc = ATM_SD(sock); + int error; + + pr_debug("%p\n", vcc); + lock_sock(sk); + /* let server handle listen on unbound sockets */ + if (test_bit(ATM_VF_SESSION, &vcc->flags)) { + error = -EINVAL; + goto out; + } + if (test_bit(ATM_VF_LISTEN, &vcc->flags)) { + error = -EADDRINUSE; + goto out; + } + set_bit(ATM_VF_WAITING, &vcc->flags); + prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); + sigd_enq(vcc, as_listen, NULL, NULL, &vcc->local); + while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { + schedule(); + prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); + } + finish_wait(sk_sleep(sk), &wait); + if (!sigd) { + error = -EUNATCH; + goto out; + } + set_bit(ATM_VF_LISTEN, &vcc->flags); + vcc_insert_socket(sk); + sk->sk_max_ack_backlog = backlog > 0 ? 
backlog : ATM_BACKLOG_DEFAULT; + error = -sk->sk_err; +out: + release_sock(sk); + return error; +} + +static int svc_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + struct atmsvc_msg *msg; + struct atm_vcc *old_vcc = ATM_SD(sock); + struct atm_vcc *new_vcc; + int error; + + lock_sock(sk); + + error = svc_create(sock_net(sk), newsock, 0, 0); + if (error) + goto out; + + new_vcc = ATM_SD(newsock); + + pr_debug("%p -> %p\n", old_vcc, new_vcc); + while (1) { + DEFINE_WAIT(wait); + + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + while (!(skb = skb_dequeue(&sk->sk_receive_queue)) && + sigd) { + if (test_bit(ATM_VF_RELEASED, &old_vcc->flags)) + break; + if (test_bit(ATM_VF_CLOSE, &old_vcc->flags)) { + error = -sk->sk_err; + break; + } + if (flags & O_NONBLOCK) { + error = -EAGAIN; + break; + } + release_sock(sk); + schedule(); + lock_sock(sk); + if (signal_pending(current)) { + error = -ERESTARTSYS; + break; + } + prepare_to_wait(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + } + finish_wait(sk_sleep(sk), &wait); + if (error) + goto out; + if (!skb) { + error = -EUNATCH; + goto out; + } + msg = (struct atmsvc_msg *)skb->data; + new_vcc->qos = msg->qos; + set_bit(ATM_VF_HASQOS, &new_vcc->flags); + new_vcc->remote = msg->svc; + new_vcc->local = msg->local; + new_vcc->sap = msg->sap; + error = vcc_connect(newsock, msg->pvc.sap_addr.itf, + msg->pvc.sap_addr.vpi, + msg->pvc.sap_addr.vci); + dev_kfree_skb(skb); + sk->sk_ack_backlog--; + if (error) { + sigd_enq2(NULL, as_reject, old_vcc, NULL, NULL, + &old_vcc->qos, error); + error = error == -EAGAIN ? -EBUSY : error; + goto out; + } + /* wait should be short, so we ignore the non-blocking flag */ + set_bit(ATM_VF_WAITING, &new_vcc->flags); + prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait, + TASK_UNINTERRUPTIBLE); + sigd_enq(new_vcc, as_accept, old_vcc, NULL, NULL); + while (test_bit(ATM_VF_WAITING, &new_vcc->flags) && sigd) { + release_sock(sk); + schedule(); + lock_sock(sk); + prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait, + TASK_UNINTERRUPTIBLE); + } + finish_wait(sk_sleep(sk_atm(new_vcc)), &wait); + if (!sigd) { + error = -EUNATCH; + goto out; + } + if (!sk_atm(new_vcc)->sk_err) + break; + if (sk_atm(new_vcc)->sk_err != ERESTARTSYS) { + error = -sk_atm(new_vcc)->sk_err; + goto out; + } + } + newsock->state = SS_CONNECTED; +out: + release_sock(sk); + return error; +} + +static int svc_getname(struct socket *sock, struct sockaddr *sockaddr, + int *sockaddr_len, int peer) +{ + struct sockaddr_atmsvc *addr; + + *sockaddr_len = sizeof(struct sockaddr_atmsvc); + addr = (struct sockaddr_atmsvc *) sockaddr; + memcpy(addr, peer ? 
&ATM_SD(sock)->remote : &ATM_SD(sock)->local, + sizeof(struct sockaddr_atmsvc)); + return 0; +} + +int svc_change_qos(struct atm_vcc *vcc, struct atm_qos *qos) +{ + struct sock *sk = sk_atm(vcc); + DEFINE_WAIT(wait); + + set_bit(ATM_VF_WAITING, &vcc->flags); + prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); + sigd_enq2(vcc, as_modify, NULL, NULL, &vcc->local, qos, 0); + while (test_bit(ATM_VF_WAITING, &vcc->flags) && + !test_bit(ATM_VF_RELEASED, &vcc->flags) && sigd) { + schedule(); + prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); + } + finish_wait(sk_sleep(sk), &wait); + if (!sigd) + return -EUNATCH; + return -sk->sk_err; +} + +static int svc_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct atm_vcc *vcc = ATM_SD(sock); + int value, error = 0; + + lock_sock(sk); + switch (optname) { + case SO_ATMSAP: + if (level != SOL_ATM || optlen != sizeof(struct atm_sap)) { + error = -EINVAL; + goto out; + } + if (copy_from_user(&vcc->sap, optval, optlen)) { + error = -EFAULT; + goto out; + } + set_bit(ATM_VF_HASSAP, &vcc->flags); + break; + case SO_MULTIPOINT: + if (level != SOL_ATM || optlen != sizeof(int)) { + error = -EINVAL; + goto out; + } + if (get_user(value, (int __user *)optval)) { + error = -EFAULT; + goto out; + } + if (value == 1) + set_bit(ATM_VF_SESSION, &vcc->flags); + else if (value == 0) + clear_bit(ATM_VF_SESSION, &vcc->flags); + else + error = -EINVAL; + break; + default: + error = vcc_setsockopt(sock, level, optname, optval, optlen); + } + +out: + release_sock(sk); + return error; +} + +static int svc_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + int error = 0, len; + + lock_sock(sk); + if (!__SO_LEVEL_MATCH(optname, level) || optname != SO_ATMSAP) { + error = vcc_getsockopt(sock, level, optname, optval, optlen); + goto out; + } + if (get_user(len, optlen)) { + error = -EFAULT; + goto out; + } + if (len != sizeof(struct atm_sap)) { + error = -EINVAL; + goto out; + } + if (copy_to_user(optval, &ATM_SD(sock)->sap, sizeof(struct atm_sap))) { + error = -EFAULT; + goto out; + } +out: + release_sock(sk); + return error; +} + +static int svc_addparty(struct socket *sock, struct sockaddr *sockaddr, + int sockaddr_len, int flags) +{ + DEFINE_WAIT(wait); + struct sock *sk = sock->sk; + struct atm_vcc *vcc = ATM_SD(sock); + int error; + + lock_sock(sk); + set_bit(ATM_VF_WAITING, &vcc->flags); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + sigd_enq(vcc, as_addparty, NULL, NULL, + (struct sockaddr_atmsvc *) sockaddr); + if (flags & O_NONBLOCK) { + finish_wait(sk_sleep(sk), &wait); + error = -EINPROGRESS; + goto out; + } + pr_debug("added wait queue\n"); + while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { + schedule(); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + } + finish_wait(sk_sleep(sk), &wait); + error = xchg(&sk->sk_err_soft, 0); +out: + release_sock(sk); + return error; +} + +static int svc_dropparty(struct socket *sock, int ep_ref) +{ + DEFINE_WAIT(wait); + struct sock *sk = sock->sk; + struct atm_vcc *vcc = ATM_SD(sock); + int error; + + lock_sock(sk); + set_bit(ATM_VF_WAITING, &vcc->flags); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + sigd_enq2(vcc, as_dropparty, NULL, NULL, NULL, NULL, ep_ref); + while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { + schedule(); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + } 
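+ /* Either sigd acknowledged the dropparty request or it has gone away. */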
+ finish_wait(sk_sleep(sk), &wait); + if (!sigd) { + error = -EUNATCH; + goto out; + } + error = xchg(&sk->sk_err_soft, 0); +out: + release_sock(sk); + return error; +} + +static int svc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + int error, ep_ref; + struct sockaddr_atmsvc sa; + struct atm_vcc *vcc = ATM_SD(sock); + + switch (cmd) { + case ATM_ADDPARTY: + if (!test_bit(ATM_VF_SESSION, &vcc->flags)) + return -EINVAL; + if (copy_from_user(&sa, (void __user *) arg, sizeof(sa))) + return -EFAULT; + error = svc_addparty(sock, (struct sockaddr *)&sa, sizeof(sa), + 0); + break; + case ATM_DROPPARTY: + if (!test_bit(ATM_VF_SESSION, &vcc->flags)) + return -EINVAL; + if (copy_from_user(&ep_ref, (void __user *) arg, sizeof(int))) + return -EFAULT; + error = svc_dropparty(sock, ep_ref); + break; + default: + error = vcc_ioctl(sock, cmd, arg); + } + + return error; +} + +#ifdef CONFIG_COMPAT +static int svc_compat_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + /* The definition of ATM_ADDPARTY uses the size of struct atm_iobuf. + But actually it takes a struct sockaddr_atmsvc, which doesn't need + compat handling. So all we have to do is fix up cmd... */ + if (cmd == COMPAT_ATM_ADDPARTY) + cmd = ATM_ADDPARTY; + + if (cmd == ATM_ADDPARTY || cmd == ATM_DROPPARTY) + return svc_ioctl(sock, cmd, arg); + else + return vcc_compat_ioctl(sock, cmd, arg); +} +#endif /* CONFIG_COMPAT */ + +static const struct proto_ops svc_proto_ops = { + .family = PF_ATMSVC, + .owner = THIS_MODULE, + + .release = svc_release, + .bind = svc_bind, + .connect = svc_connect, + .socketpair = sock_no_socketpair, + .accept = svc_accept, + .getname = svc_getname, + .poll = vcc_poll, + .ioctl = svc_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = svc_compat_ioctl, +#endif + .listen = svc_listen, + .shutdown = svc_shutdown, + .setsockopt = svc_setsockopt, + .getsockopt = svc_getsockopt, + .sendmsg = vcc_sendmsg, + .recvmsg = vcc_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + + +static int svc_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + int error; + + if (!net_eq(net, &init_net)) + return -EAFNOSUPPORT; + + sock->ops = &svc_proto_ops; + error = vcc_create(net, sock, protocol, AF_ATMSVC); + if (error) + return error; + ATM_SD(sock)->local.sas_family = AF_ATMSVC; + ATM_SD(sock)->remote.sas_family = AF_ATMSVC; + return 0; +} + +static const struct net_proto_family svc_family_ops = { + .family = PF_ATMSVC, + .create = svc_create, + .owner = THIS_MODULE, +}; + + +/* + * Initialize the ATM SVC protocol family + */ + +int __init atmsvc_init(void) +{ + return sock_register(&svc_family_ops); +} + +void atmsvc_exit(void) +{ + sock_unregister(PF_ATMSVC); +} diff --git a/net/ax25/Kconfig b/net/ax25/Kconfig new file mode 100644 index 00000000..705e53ef --- /dev/null +++ b/net/ax25/Kconfig @@ -0,0 +1,121 @@ +# +# Amateur Radio protocols and AX.25 device configuration +# + +menuconfig HAMRADIO + depends on NET && !S390 + bool "Amateur Radio support" + help + If you want to connect your Linux box to an amateur radio, answer Y + here. You want to read + and more specifically about AX.25 on Linux + . + + Note that the answer to this question won't directly affect the + kernel: saying N will just cause the configurator to skip all + the questions about amateur radio. 
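+# Illustrative example (not part of the original sources): a modular build
+# of the amateur radio stack would typically set the symbols defined in
+# this file in .config along these lines:
+#   CONFIG_HAMRADIO=y
+#   CONFIG_AX25=m
+#   CONFIG_AX25_DAMA_SLAVE=y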
+ +comment "Packet Radio protocols" + depends on HAMRADIO + +config AX25 + tristate "Amateur Radio AX.25 Level 2 protocol" + depends on HAMRADIO + help + This is the protocol used for computer communication over amateur + radio. It is either used by itself for point-to-point links, or to + carry other protocols such as tcp/ip. To use it, you need a device + that connects your Linux box to your amateur radio. You can either + use a low speed TNC (a Terminal Node Controller acts as a kind of + modem connecting your computer's serial port to your radio's + microphone input and speaker output) supporting the KISS protocol or + one of the various SCC cards that are supported by the generic Z8530 + or the DMA SCC driver. Another option are the Baycom modem serial + and parallel port hacks or the sound card modem (supported by their + own drivers). If you say Y here, you also have to say Y to one of + those drivers. + + Information about where to get supporting software for Linux amateur + radio as well as information about how to configure an AX.25 port is + contained in the AX25-HOWTO, available from + . You might also want to + check out the file in the + kernel source. More information about digital amateur radio in + general is on the WWW at + . + + To compile this driver as a module, choose M here: the + module will be called ax25. + +config AX25_DAMA_SLAVE + bool "AX.25 DAMA Slave support" + default y + depends on AX25 + help + DAMA is a mechanism to prevent collisions when doing AX.25 + networking. A DAMA server (called "master") accepts incoming traffic + from clients (called "slaves") and redistributes it to other slaves. + If you say Y here, your Linux box will act as a DAMA slave; this is + transparent in that you don't have to do any special DAMA + configuration. Linux cannot yet act as a DAMA server. This option + only compiles DAMA slave support into the kernel. It still needs to + be enabled at runtime. For more about DAMA see + . If unsure, say Y. + +# placeholder until implemented +config AX25_DAMA_MASTER + bool 'AX.25 DAMA Master support' + depends on AX25_DAMA_SLAVE && BROKEN + help + DAMA is a mechanism to prevent collisions when doing AX.25 + networking. A DAMA server (called "master") accepts incoming traffic + from clients (called "slaves") and redistributes it to other slaves. + If you say Y here, your Linux box will act as a DAMA master; this is + transparent in that you don't have to do any special DAMA + configuration. Linux cannot yet act as a DAMA server. This option + only compiles DAMA slave support into the kernel. It still needs to + be explicitly enabled, so if unsure, say Y. + +config NETROM + tristate "Amateur Radio NET/ROM protocol" + depends on AX25 + help + NET/ROM is a network layer protocol on top of AX.25 useful for + routing. + + A comprehensive listing of all the software for Linux amateur radio + users as well as information about how to configure an AX.25 port is + contained in the Linux Ham Wiki, available from + . You also might want to check out the + file . More information about + digital amateur radio in general is on the WWW at + . + + To compile this driver as a module, choose M here: the + module will be called netrom. + +config ROSE + tristate "Amateur Radio X.25 PLP (Rose)" + depends on AX25 + help + The Packet Layer Protocol (PLP) is a way to route packets over X.25 + connections in general and amateur radio AX.25 connections in + particular, essentially an alternative to NET/ROM. 
+ + A comprehensive listing of all the software for Linux amateur radio + users as well as information about how to configure an AX.25 port is + contained in the Linux Ham Wiki, available from + . You also might want to check out the + file . More information about + digital amateur radio in general is on the WWW at + . + + To compile this driver as a module, choose M here: the + module will be called rose. + +menu "AX.25 network device drivers" + depends on HAMRADIO && AX25 + +source "drivers/net/hamradio/Kconfig" + +endmenu diff --git a/net/ax25/Makefile b/net/ax25/Makefile new file mode 100644 index 00000000..43c46d2c --- /dev/null +++ b/net/ax25/Makefile @@ -0,0 +1,11 @@ +# +# Makefile for the Linux AX.25 layer. +# + +obj-$(CONFIG_AX25) += ax25.o + +ax25-y := ax25_addr.o ax25_dev.o ax25_iface.o ax25_in.o ax25_ip.o ax25_out.o \ + ax25_route.o ax25_std_in.o ax25_std_subr.o ax25_std_timer.o \ + ax25_subr.o ax25_timer.o ax25_uid.o af_ax25.o +ax25-$(CONFIG_AX25_DAMA_SLAVE) += ax25_ds_in.o ax25_ds_subr.o ax25_ds_timer.o +ax25-$(CONFIG_SYSCTL) += sysctl_net_ax25.o diff --git a/net/ax25/TODO b/net/ax25/TODO new file mode 100644 index 00000000..69fb4e36 --- /dev/null +++ b/net/ax25/TODO @@ -0,0 +1,20 @@ +Do the ax25_list_lock, ax25_dev_lock, linkfail_lockreally, ax25_frag_lock and +listen_lock have to be bh-safe? + +Do the netrom and rose locks have to be bh-safe? + +A device might be deleted after lookup in the SIOCADDRT ioctl but before it's +being used. + +Routes to a device being taken down might be deleted by ax25_rt_device_down +but added by somebody else before the device has been deleted fully. + +The ax25_rt_find_route synopsys is pervert but I somehow had to deal with +the race caused by the static variable in it's previous implementation. + +Implement proper socket locking in netrom and rose. + +Check socket locking when ax25_rcv is sending to raw sockets. In particular +ax25_send_to_raw() seems fishy. Heck - ax25_rcv is fishy. + +Handle XID and TEST frames properly. diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c new file mode 100644 index 00000000..b04a6ef4 --- /dev/null +++ b/net/ax25/af_ax25.c @@ -0,0 +1,2022 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Darryl Miles G7LED (dlm@g7led.demon.co.uk) + * Copyright (C) Steven Whitehouse GW7RRM (stevew@acm.org) + * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) + * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de) + * Copyright (C) Hans Alblas PE1AYX (hans@esrac.ele.tue.nl) + * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For TIOCINQ/OUTQ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +HLIST_HEAD(ax25_list); +DEFINE_SPINLOCK(ax25_list_lock); + +static const struct proto_ops ax25_proto_ops; + +static void ax25_free_sock(struct sock *sk) +{ + ax25_cb_put(ax25_sk(sk)); +} + +/* + * Socket removal during an interrupt is now safe. + */ +static void ax25_cb_del(ax25_cb *ax25) +{ + if (!hlist_unhashed(&ax25->ax25_node)) { + spin_lock_bh(&ax25_list_lock); + hlist_del_init(&ax25->ax25_node); + spin_unlock_bh(&ax25_list_lock); + ax25_cb_put(ax25); + } +} + +/* + * Kill all bound sockets on a dropped device. + */ +static void ax25_kill_by_device(struct net_device *dev) +{ + ax25_dev *ax25_dev; + ax25_cb *s; + struct hlist_node *node; + + if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) + return; + + spin_lock_bh(&ax25_list_lock); +again: + ax25_for_each(s, node, &ax25_list) { + if (s->ax25_dev == ax25_dev) { + s->ax25_dev = NULL; + spin_unlock_bh(&ax25_list_lock); + ax25_disconnect(s, ENETUNREACH); + spin_lock_bh(&ax25_list_lock); + + /* The entry could have been deleted from the + * list meanwhile and thus the next pointer is + * no longer valid. Play it safe and restart + * the scan. Forward progress is ensured + * because we set s->ax25_dev to NULL and we + * are never passed a NULL 'dev' argument. + */ + goto again; + } + } + spin_unlock_bh(&ax25_list_lock); +} + +/* + * Handle device status changes. + */ +static int ax25_device_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = (struct net_device *)ptr; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + /* Reject non AX.25 devices */ + if (dev->type != ARPHRD_AX25) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_UP: + ax25_dev_device_up(dev); + break; + case NETDEV_DOWN: + ax25_kill_by_device(dev); + ax25_rt_device_down(dev); + ax25_dev_device_down(dev); + break; + default: + break; + } + + return NOTIFY_DONE; +} + +/* + * Add a socket to the bound sockets list. + */ +void ax25_cb_add(ax25_cb *ax25) +{ + spin_lock_bh(&ax25_list_lock); + ax25_cb_hold(ax25); + hlist_add_head(&ax25->ax25_node, &ax25_list); + spin_unlock_bh(&ax25_list_lock); +} + +/* + * Find a socket that wants to accept the SABM we have just + * received. 
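+ * A listener matches when its digipeater role agrees with the digi flag,
+ * it has the same source address and socket type, it is in TCP_LISTEN
+ * state, and it is either unbound or bound to the receiving device.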
+ */ +struct sock *ax25_find_listener(ax25_address *addr, int digi, + struct net_device *dev, int type) +{ + ax25_cb *s; + struct hlist_node *node; + + spin_lock(&ax25_list_lock); + ax25_for_each(s, node, &ax25_list) { + if ((s->iamdigi && !digi) || (!s->iamdigi && digi)) + continue; + if (s->sk && !ax25cmp(&s->source_addr, addr) && + s->sk->sk_type == type && s->sk->sk_state == TCP_LISTEN) { + /* If device is null we match any device */ + if (s->ax25_dev == NULL || s->ax25_dev->dev == dev) { + sock_hold(s->sk); + spin_unlock(&ax25_list_lock); + return s->sk; + } + } + } + spin_unlock(&ax25_list_lock); + + return NULL; +} + +/* + * Find an AX.25 socket given both ends. + */ +struct sock *ax25_get_socket(ax25_address *my_addr, ax25_address *dest_addr, + int type) +{ + struct sock *sk = NULL; + ax25_cb *s; + struct hlist_node *node; + + spin_lock(&ax25_list_lock); + ax25_for_each(s, node, &ax25_list) { + if (s->sk && !ax25cmp(&s->source_addr, my_addr) && + !ax25cmp(&s->dest_addr, dest_addr) && + s->sk->sk_type == type) { + sk = s->sk; + sock_hold(sk); + break; + } + } + + spin_unlock(&ax25_list_lock); + + return sk; +} + +/* + * Find an AX.25 control block given both ends. It will only pick up + * floating AX.25 control blocks or non Raw socket bound control blocks. + */ +ax25_cb *ax25_find_cb(ax25_address *src_addr, ax25_address *dest_addr, + ax25_digi *digi, struct net_device *dev) +{ + ax25_cb *s; + struct hlist_node *node; + + spin_lock_bh(&ax25_list_lock); + ax25_for_each(s, node, &ax25_list) { + if (s->sk && s->sk->sk_type != SOCK_SEQPACKET) + continue; + if (s->ax25_dev == NULL) + continue; + if (ax25cmp(&s->source_addr, src_addr) == 0 && ax25cmp(&s->dest_addr, dest_addr) == 0 && s->ax25_dev->dev == dev) { + if (digi != NULL && digi->ndigi != 0) { + if (s->digipeat == NULL) + continue; + if (ax25digicmp(s->digipeat, digi) != 0) + continue; + } else { + if (s->digipeat != NULL && s->digipeat->ndigi != 0) + continue; + } + ax25_cb_hold(s); + spin_unlock_bh(&ax25_list_lock); + + return s; + } + } + spin_unlock_bh(&ax25_list_lock); + + return NULL; +} + +EXPORT_SYMBOL(ax25_find_cb); + +void ax25_send_to_raw(ax25_address *addr, struct sk_buff *skb, int proto) +{ + ax25_cb *s; + struct sk_buff *copy; + struct hlist_node *node; + + spin_lock(&ax25_list_lock); + ax25_for_each(s, node, &ax25_list) { + if (s->sk != NULL && ax25cmp(&s->source_addr, addr) == 0 && + s->sk->sk_type == SOCK_RAW && + s->sk->sk_protocol == proto && + s->ax25_dev->dev == skb->dev && + atomic_read(&s->sk->sk_rmem_alloc) <= s->sk->sk_rcvbuf) { + if ((copy = skb_clone(skb, GFP_ATOMIC)) == NULL) + continue; + if (sock_queue_rcv_skb(s->sk, copy) != 0) + kfree_skb(copy); + } + } + spin_unlock(&ax25_list_lock); +} + +/* + * Deferred destroy. + */ +void ax25_destroy_socket(ax25_cb *); + +/* + * Handler for deferred kills. + */ +static void ax25_destroy_timer(unsigned long data) +{ + ax25_cb *ax25=(ax25_cb *)data; + struct sock *sk; + + sk=ax25->sk; + + bh_lock_sock(sk); + sock_hold(sk); + ax25_destroy_socket(ax25); + bh_unlock_sock(sk); + sock_put(sk); +} + +/* + * This is called from user mode and the timers. Thus it protects itself + * against interrupt users but doesn't worry about being called during + * work. Once it is removed from the queue no interrupt or bottom half + * will touch it and we are (fairly 8-) ) safe. 
+ */ +void ax25_destroy_socket(ax25_cb *ax25) +{ + struct sk_buff *skb; + + ax25_cb_del(ax25); + + ax25_stop_heartbeat(ax25); + ax25_stop_t1timer(ax25); + ax25_stop_t2timer(ax25); + ax25_stop_t3timer(ax25); + ax25_stop_idletimer(ax25); + + ax25_clear_queues(ax25); /* Flush the queues */ + + if (ax25->sk != NULL) { + while ((skb = skb_dequeue(&ax25->sk->sk_receive_queue)) != NULL) { + if (skb->sk != ax25->sk) { + /* A pending connection */ + ax25_cb *sax25 = ax25_sk(skb->sk); + + /* Queue the unaccepted socket for death */ + sock_orphan(skb->sk); + + /* 9A4GL: hack to release unaccepted sockets */ + skb->sk->sk_state = TCP_LISTEN; + + ax25_start_heartbeat(sax25); + sax25->state = AX25_STATE_0; + } + + kfree_skb(skb); + } + skb_queue_purge(&ax25->sk->sk_write_queue); + } + + if (ax25->sk != NULL) { + if (sk_has_allocations(ax25->sk)) { + /* Defer: outstanding buffers */ + setup_timer(&ax25->dtimer, ax25_destroy_timer, + (unsigned long)ax25); + ax25->dtimer.expires = jiffies + 2 * HZ; + add_timer(&ax25->dtimer); + } else { + struct sock *sk=ax25->sk; + ax25->sk=NULL; + sock_put(sk); + } + } else { + ax25_cb_put(ax25); + } +} + +/* + * dl1bke 960311: set parameters for existing AX.25 connections, + * includes a KILL command to abort any connection. + * VERY useful for debugging ;-) + */ +static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg) +{ + struct ax25_ctl_struct ax25_ctl; + ax25_digi digi; + ax25_dev *ax25_dev; + ax25_cb *ax25; + unsigned int k; + int ret = 0; + + if (copy_from_user(&ax25_ctl, arg, sizeof(ax25_ctl))) + return -EFAULT; + + if ((ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr)) == NULL) + return -ENODEV; + + if (ax25_ctl.digi_count > AX25_MAX_DIGIS) + return -EINVAL; + + if (ax25_ctl.arg > ULONG_MAX / HZ && ax25_ctl.cmd != AX25_KILL) + return -EINVAL; + + digi.ndigi = ax25_ctl.digi_count; + for (k = 0; k < digi.ndigi; k++) + digi.calls[k] = ax25_ctl.digi_addr[k]; + + if ((ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev)) == NULL) + return -ENOTCONN; + + switch (ax25_ctl.cmd) { + case AX25_KILL: + ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); +#ifdef CONFIG_AX25_DAMA_SLAVE + if (ax25_dev->dama.slave && ax25->ax25_dev->values[AX25_VALUES_PROTOCOL] == AX25_PROTO_DAMA_SLAVE) + ax25_dama_off(ax25); +#endif + ax25_disconnect(ax25, ENETRESET); + break; + + case AX25_WINDOW: + if (ax25->modulus == AX25_MODULUS) { + if (ax25_ctl.arg < 1 || ax25_ctl.arg > 7) + goto einval_put; + } else { + if (ax25_ctl.arg < 1 || ax25_ctl.arg > 63) + goto einval_put; + } + ax25->window = ax25_ctl.arg; + break; + + case AX25_T1: + if (ax25_ctl.arg < 1) + goto einval_put; + ax25->rtt = (ax25_ctl.arg * HZ) / 2; + ax25->t1 = ax25_ctl.arg * HZ; + break; + + case AX25_T2: + if (ax25_ctl.arg < 1) + goto einval_put; + ax25->t2 = ax25_ctl.arg * HZ; + break; + + case AX25_N2: + if (ax25_ctl.arg < 1 || ax25_ctl.arg > 31) + goto einval_put; + ax25->n2count = 0; + ax25->n2 = ax25_ctl.arg; + break; + + case AX25_T3: + ax25->t3 = ax25_ctl.arg * HZ; + break; + + case AX25_IDLE: + ax25->idle = ax25_ctl.arg * 60 * HZ; + break; + + case AX25_PACLEN: + if (ax25_ctl.arg < 16 || ax25_ctl.arg > 65535) + goto einval_put; + ax25->paclen = ax25_ctl.arg; + break; + + default: + goto einval_put; + } + +out_put: + ax25_cb_put(ax25); + return ret; + +einval_put: + ret = -EINVAL; + goto out_put; +} + +static void ax25_fillin_cb_from_dev(ax25_cb *ax25, ax25_dev *ax25_dev) +{ + ax25->rtt = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T1]) / 2; + ax25->t1 = 
msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T1]); + ax25->t2 = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T2]); + ax25->t3 = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T3]); + ax25->n2 = ax25_dev->values[AX25_VALUES_N2]; + ax25->paclen = ax25_dev->values[AX25_VALUES_PACLEN]; + ax25->idle = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_IDLE]); + ax25->backoff = ax25_dev->values[AX25_VALUES_BACKOFF]; + + if (ax25_dev->values[AX25_VALUES_AXDEFMODE]) { + ax25->modulus = AX25_EMODULUS; + ax25->window = ax25_dev->values[AX25_VALUES_EWINDOW]; + } else { + ax25->modulus = AX25_MODULUS; + ax25->window = ax25_dev->values[AX25_VALUES_WINDOW]; + } +} + +/* + * Fill in a created AX.25 created control block with the default + * values for a particular device. + */ +void ax25_fillin_cb(ax25_cb *ax25, ax25_dev *ax25_dev) +{ + ax25->ax25_dev = ax25_dev; + + if (ax25->ax25_dev != NULL) { + ax25_fillin_cb_from_dev(ax25, ax25_dev); + return; + } + + /* + * No device, use kernel / AX.25 spec default values + */ + ax25->rtt = msecs_to_jiffies(AX25_DEF_T1) / 2; + ax25->t1 = msecs_to_jiffies(AX25_DEF_T1); + ax25->t2 = msecs_to_jiffies(AX25_DEF_T2); + ax25->t3 = msecs_to_jiffies(AX25_DEF_T3); + ax25->n2 = AX25_DEF_N2; + ax25->paclen = AX25_DEF_PACLEN; + ax25->idle = msecs_to_jiffies(AX25_DEF_IDLE); + ax25->backoff = AX25_DEF_BACKOFF; + + if (AX25_DEF_AXDEFMODE) { + ax25->modulus = AX25_EMODULUS; + ax25->window = AX25_DEF_EWINDOW; + } else { + ax25->modulus = AX25_MODULUS; + ax25->window = AX25_DEF_WINDOW; + } +} + +/* + * Create an empty AX.25 control block. + */ +ax25_cb *ax25_create_cb(void) +{ + ax25_cb *ax25; + + if ((ax25 = kzalloc(sizeof(*ax25), GFP_ATOMIC)) == NULL) + return NULL; + + atomic_set(&ax25->refcount, 1); + + skb_queue_head_init(&ax25->write_queue); + skb_queue_head_init(&ax25->frag_queue); + skb_queue_head_init(&ax25->ack_queue); + skb_queue_head_init(&ax25->reseq_queue); + + ax25_setup_timers(ax25); + + ax25_fillin_cb(ax25, NULL); + + ax25->state = AX25_STATE_0; + + return ax25; +} + +/* + * Handling for system calls applied via the various interfaces to an + * AX25 socket object + */ + +static int ax25_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + ax25_cb *ax25; + struct net_device *dev; + char devname[IFNAMSIZ]; + int opt, res = 0; + + if (level != SOL_AX25) + return -ENOPROTOOPT; + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(opt, (int __user *)optval)) + return -EFAULT; + + lock_sock(sk); + ax25 = ax25_sk(sk); + + switch (optname) { + case AX25_WINDOW: + if (ax25->modulus == AX25_MODULUS) { + if (opt < 1 || opt > 7) { + res = -EINVAL; + break; + } + } else { + if (opt < 1 || opt > 63) { + res = -EINVAL; + break; + } + } + ax25->window = opt; + break; + + case AX25_T1: + if (opt < 1) { + res = -EINVAL; + break; + } + ax25->rtt = (opt * HZ) >> 1; + ax25->t1 = opt * HZ; + break; + + case AX25_T2: + if (opt < 1) { + res = -EINVAL; + break; + } + ax25->t2 = opt * HZ; + break; + + case AX25_N2: + if (opt < 1 || opt > 31) { + res = -EINVAL; + break; + } + ax25->n2 = opt; + break; + + case AX25_T3: + if (opt < 1) { + res = -EINVAL; + break; + } + ax25->t3 = opt * HZ; + break; + + case AX25_IDLE: + if (opt < 0) { + res = -EINVAL; + break; + } + ax25->idle = opt * 60 * HZ; + break; + + case AX25_BACKOFF: + if (opt < 0 || opt > 2) { + res = -EINVAL; + break; + } + ax25->backoff = opt; + break; + + case AX25_EXTSEQ: + ax25->modulus = opt ? 
AX25_EMODULUS : AX25_MODULUS; + break; + + case AX25_PIDINCL: + ax25->pidincl = opt ? 1 : 0; + break; + + case AX25_IAMDIGI: + ax25->iamdigi = opt ? 1 : 0; + break; + + case AX25_PACLEN: + if (opt < 16 || opt > 65535) { + res = -EINVAL; + break; + } + ax25->paclen = opt; + break; + + case SO_BINDTODEVICE: + if (optlen > IFNAMSIZ) + optlen = IFNAMSIZ; + + if (copy_from_user(devname, optval, optlen)) { + res = -EFAULT; + break; + } + + if (sk->sk_type == SOCK_SEQPACKET && + (sock->state != SS_UNCONNECTED || + sk->sk_state == TCP_LISTEN)) { + res = -EADDRNOTAVAIL; + break; + } + + dev = dev_get_by_name(&init_net, devname); + if (!dev) { + res = -ENODEV; + break; + } + + ax25->ax25_dev = ax25_dev_ax25dev(dev); + ax25_fillin_cb(ax25, ax25->ax25_dev); + dev_put(dev); + break; + + default: + res = -ENOPROTOOPT; + } + release_sock(sk); + + return res; +} + +static int ax25_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + ax25_cb *ax25; + struct ax25_dev *ax25_dev; + char devname[IFNAMSIZ]; + void *valptr; + int val = 0; + int maxlen, length; + + if (level != SOL_AX25) + return -ENOPROTOOPT; + + if (get_user(maxlen, optlen)) + return -EFAULT; + + if (maxlen < 1) + return -EFAULT; + + valptr = (void *) &val; + length = min_t(unsigned int, maxlen, sizeof(int)); + + lock_sock(sk); + ax25 = ax25_sk(sk); + + switch (optname) { + case AX25_WINDOW: + val = ax25->window; + break; + + case AX25_T1: + val = ax25->t1 / HZ; + break; + + case AX25_T2: + val = ax25->t2 / HZ; + break; + + case AX25_N2: + val = ax25->n2; + break; + + case AX25_T3: + val = ax25->t3 / HZ; + break; + + case AX25_IDLE: + val = ax25->idle / (60 * HZ); + break; + + case AX25_BACKOFF: + val = ax25->backoff; + break; + + case AX25_EXTSEQ: + val = (ax25->modulus == AX25_EMODULUS); + break; + + case AX25_PIDINCL: + val = ax25->pidincl; + break; + + case AX25_IAMDIGI: + val = ax25->iamdigi; + break; + + case AX25_PACLEN: + val = ax25->paclen; + break; + + case SO_BINDTODEVICE: + ax25_dev = ax25->ax25_dev; + + if (ax25_dev != NULL && ax25_dev->dev != NULL) { + strlcpy(devname, ax25_dev->dev->name, sizeof(devname)); + length = strlen(devname) + 1; + } else { + *devname = '\0'; + length = 1; + } + + valptr = (void *) devname; + break; + + default: + release_sock(sk); + return -ENOPROTOOPT; + } + release_sock(sk); + + if (put_user(length, optlen)) + return -EFAULT; + + return copy_to_user(optval, valptr, length) ? -EFAULT : 0; +} + +static int ax25_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + int res = 0; + + lock_sock(sk); + if (sk->sk_type == SOCK_SEQPACKET && sk->sk_state != TCP_LISTEN) { + sk->sk_max_ack_backlog = backlog; + sk->sk_state = TCP_LISTEN; + goto out; + } + res = -EOPNOTSUPP; + +out: + release_sock(sk); + + return res; +} + +/* + * XXX: when creating ax25_sock we should update the .obj_size setting + * below. 
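+ * At present the ax25_cb is allocated separately by ax25_create_cb() and
+ * attached to the socket through sk->sk_protinfo.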
+ */ +static struct proto ax25_proto = { + .name = "AX25", + .owner = THIS_MODULE, + .obj_size = sizeof(struct sock), +}; + +static int ax25_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + ax25_cb *ax25; + + if (!net_eq(net, &init_net)) + return -EAFNOSUPPORT; + + switch (sock->type) { + case SOCK_DGRAM: + if (protocol == 0 || protocol == PF_AX25) + protocol = AX25_P_TEXT; + break; + + case SOCK_SEQPACKET: + switch (protocol) { + case 0: + case PF_AX25: /* For CLX */ + protocol = AX25_P_TEXT; + break; + case AX25_P_SEGMENT: +#ifdef CONFIG_INET + case AX25_P_ARP: + case AX25_P_IP: +#endif +#ifdef CONFIG_NETROM + case AX25_P_NETROM: +#endif +#ifdef CONFIG_ROSE + case AX25_P_ROSE: +#endif + return -ESOCKTNOSUPPORT; +#ifdef CONFIG_NETROM_MODULE + case AX25_P_NETROM: + if (ax25_protocol_is_registered(AX25_P_NETROM)) + return -ESOCKTNOSUPPORT; +#endif +#ifdef CONFIG_ROSE_MODULE + case AX25_P_ROSE: + if (ax25_protocol_is_registered(AX25_P_ROSE)) + return -ESOCKTNOSUPPORT; +#endif + default: + break; + } + break; + + case SOCK_RAW: + break; + default: + return -ESOCKTNOSUPPORT; + } + + sk = sk_alloc(net, PF_AX25, GFP_ATOMIC, &ax25_proto); + if (sk == NULL) + return -ENOMEM; + + ax25 = sk->sk_protinfo = ax25_create_cb(); + if (!ax25) { + sk_free(sk); + return -ENOMEM; + } + + sock_init_data(sock, sk); + + sk->sk_destruct = ax25_free_sock; + sock->ops = &ax25_proto_ops; + sk->sk_protocol = protocol; + + ax25->sk = sk; + + return 0; +} + +struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev) +{ + struct sock *sk; + ax25_cb *ax25, *oax25; + + sk = sk_alloc(sock_net(osk), PF_AX25, GFP_ATOMIC, osk->sk_prot); + if (sk == NULL) + return NULL; + + if ((ax25 = ax25_create_cb()) == NULL) { + sk_free(sk); + return NULL; + } + + switch (osk->sk_type) { + case SOCK_DGRAM: + break; + case SOCK_SEQPACKET: + break; + default: + sk_free(sk); + ax25_cb_put(ax25); + return NULL; + } + + sock_init_data(NULL, sk); + + sk->sk_type = osk->sk_type; + sk->sk_priority = osk->sk_priority; + sk->sk_protocol = osk->sk_protocol; + sk->sk_rcvbuf = osk->sk_rcvbuf; + sk->sk_sndbuf = osk->sk_sndbuf; + sk->sk_state = TCP_ESTABLISHED; + sock_copy_flags(sk, osk); + + oax25 = ax25_sk(osk); + + ax25->modulus = oax25->modulus; + ax25->backoff = oax25->backoff; + ax25->pidincl = oax25->pidincl; + ax25->iamdigi = oax25->iamdigi; + ax25->rtt = oax25->rtt; + ax25->t1 = oax25->t1; + ax25->t2 = oax25->t2; + ax25->t3 = oax25->t3; + ax25->n2 = oax25->n2; + ax25->idle = oax25->idle; + ax25->paclen = oax25->paclen; + ax25->window = oax25->window; + + ax25->ax25_dev = ax25_dev; + ax25->source_addr = oax25->source_addr; + + if (oax25->digipeat != NULL) { + ax25->digipeat = kmemdup(oax25->digipeat, sizeof(ax25_digi), + GFP_ATOMIC); + if (ax25->digipeat == NULL) { + sk_free(sk); + ax25_cb_put(ax25); + return NULL; + } + } + + sk->sk_protinfo = ax25; + sk->sk_destruct = ax25_free_sock; + ax25->sk = sk; + + return sk; +} + +static int ax25_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + ax25_cb *ax25; + + if (sk == NULL) + return 0; + + sock_hold(sk); + sock_orphan(sk); + lock_sock(sk); + ax25 = ax25_sk(sk); + + if (sk->sk_type == SOCK_SEQPACKET) { + switch (ax25->state) { + case AX25_STATE_0: + release_sock(sk); + ax25_disconnect(ax25, 0); + lock_sock(sk); + ax25_destroy_socket(ax25); + break; + + case AX25_STATE_1: + case AX25_STATE_2: + ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); + release_sock(sk); + ax25_disconnect(ax25, 0); + lock_sock(sk); + 
ax25_destroy_socket(ax25); + break; + + case AX25_STATE_3: + case AX25_STATE_4: + ax25_clear_queues(ax25); + ax25->n2count = 0; + + switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { + case AX25_PROTO_STD_SIMPLEX: + case AX25_PROTO_STD_DUPLEX: + ax25_send_control(ax25, + AX25_DISC, + AX25_POLLON, + AX25_COMMAND); + ax25_stop_t2timer(ax25); + ax25_stop_t3timer(ax25); + ax25_stop_idletimer(ax25); + break; +#ifdef CONFIG_AX25_DAMA_SLAVE + case AX25_PROTO_DAMA_SLAVE: + ax25_stop_t3timer(ax25); + ax25_stop_idletimer(ax25); + break; +#endif + } + ax25_calculate_t1(ax25); + ax25_start_t1timer(ax25); + ax25->state = AX25_STATE_2; + sk->sk_state = TCP_CLOSE; + sk->sk_shutdown |= SEND_SHUTDOWN; + sk->sk_state_change(sk); + sock_set_flag(sk, SOCK_DESTROY); + break; + + default: + break; + } + } else { + sk->sk_state = TCP_CLOSE; + sk->sk_shutdown |= SEND_SHUTDOWN; + sk->sk_state_change(sk); + ax25_destroy_socket(ax25); + } + + sock->sk = NULL; + release_sock(sk); + sock_put(sk); + + return 0; +} + +/* + * We support a funny extension here so you can (as root) give any callsign + * digipeated via a local address as source. This hack is obsolete now + * that we've implemented support for SO_BINDTODEVICE. It is however small + * and trivially backward compatible. + */ +static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr; + ax25_dev *ax25_dev = NULL; + ax25_uid_assoc *user; + ax25_address call; + ax25_cb *ax25; + int err = 0; + + if (addr_len != sizeof(struct sockaddr_ax25) && + addr_len != sizeof(struct full_sockaddr_ax25)) + /* support for old structure may go away some time + * ax25_bind(): uses old (6 digipeater) socket structure. + */ + if ((addr_len < sizeof(struct sockaddr_ax25) + sizeof(ax25_address) * 6) || + (addr_len > sizeof(struct full_sockaddr_ax25))) + return -EINVAL; + + if (addr->fsa_ax25.sax25_family != AF_AX25) + return -EINVAL; + + user = ax25_findbyuid(current_euid()); + if (user) { + call = user->call; + ax25_uid_put(user); + } else { + if (ax25_uid_policy && !capable(CAP_NET_ADMIN)) + return -EACCES; + + call = addr->fsa_ax25.sax25_call; + } + + lock_sock(sk); + + ax25 = ax25_sk(sk); + if (!sock_flag(sk, SOCK_ZAPPED)) { + err = -EINVAL; + goto out; + } + + ax25->source_addr = call; + + /* + * User already set interface with SO_BINDTODEVICE + */ + if (ax25->ax25_dev != NULL) + goto done; + + if (addr_len > sizeof(struct sockaddr_ax25) && addr->fsa_ax25.sax25_ndigis == 1) { + if (ax25cmp(&addr->fsa_digipeater[0], &null_ax25_address) != 0 && + (ax25_dev = ax25_addr_ax25dev(&addr->fsa_digipeater[0])) == NULL) { + err = -EADDRNOTAVAIL; + goto out; + } + } else { + if ((ax25_dev = ax25_addr_ax25dev(&addr->fsa_ax25.sax25_call)) == NULL) { + err = -EADDRNOTAVAIL; + goto out; + } + } + + if (ax25_dev != NULL) + ax25_fillin_cb(ax25, ax25_dev); + +done: + ax25_cb_add(ax25); + sock_reset_flag(sk, SOCK_ZAPPED); + +out: + release_sock(sk); + + return err; +} + +/* + * FIXME: nonblock behaviour looks like it may have a bug. + */ +static int __must_check ax25_connect(struct socket *sock, + struct sockaddr *uaddr, int addr_len, int flags) +{ + struct sock *sk = sock->sk; + ax25_cb *ax25 = ax25_sk(sk), *ax25t; + struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)uaddr; + ax25_digi *digi = NULL; + int ct = 0, err = 0; + + /* + * some sanity checks. 
code further down depends on this + */ + + if (addr_len == sizeof(struct sockaddr_ax25)) + /* support for this will go away in early 2.5.x + * ax25_connect(): uses obsolete socket structure + */ + ; + else if (addr_len != sizeof(struct full_sockaddr_ax25)) + /* support for old structure may go away some time + * ax25_connect(): uses old (6 digipeater) socket structure. + */ + if ((addr_len < sizeof(struct sockaddr_ax25) + sizeof(ax25_address) * 6) || + (addr_len > sizeof(struct full_sockaddr_ax25))) + return -EINVAL; + + + if (fsa->fsa_ax25.sax25_family != AF_AX25) + return -EINVAL; + + lock_sock(sk); + + /* deal with restarts */ + if (sock->state == SS_CONNECTING) { + switch (sk->sk_state) { + case TCP_SYN_SENT: /* still trying */ + err = -EINPROGRESS; + goto out_release; + + case TCP_ESTABLISHED: /* connection established */ + sock->state = SS_CONNECTED; + goto out_release; + + case TCP_CLOSE: /* connection refused */ + sock->state = SS_UNCONNECTED; + err = -ECONNREFUSED; + goto out_release; + } + } + + if (sk->sk_state == TCP_ESTABLISHED && sk->sk_type == SOCK_SEQPACKET) { + err = -EISCONN; /* No reconnect on a seqpacket socket */ + goto out_release; + } + + sk->sk_state = TCP_CLOSE; + sock->state = SS_UNCONNECTED; + + kfree(ax25->digipeat); + ax25->digipeat = NULL; + + /* + * Handle digi-peaters to be used. + */ + if (addr_len > sizeof(struct sockaddr_ax25) && + fsa->fsa_ax25.sax25_ndigis != 0) { + /* Valid number of digipeaters ? */ + if (fsa->fsa_ax25.sax25_ndigis < 1 || fsa->fsa_ax25.sax25_ndigis > AX25_MAX_DIGIS) { + err = -EINVAL; + goto out_release; + } + + if ((digi = kmalloc(sizeof(ax25_digi), GFP_KERNEL)) == NULL) { + err = -ENOBUFS; + goto out_release; + } + + digi->ndigi = fsa->fsa_ax25.sax25_ndigis; + digi->lastrepeat = -1; + + while (ct < fsa->fsa_ax25.sax25_ndigis) { + if ((fsa->fsa_digipeater[ct].ax25_call[6] & + AX25_HBIT) && ax25->iamdigi) { + digi->repeated[ct] = 1; + digi->lastrepeat = ct; + } else { + digi->repeated[ct] = 0; + } + digi->calls[ct] = fsa->fsa_digipeater[ct]; + ct++; + } + } + + /* + * Must bind first - autobinding in this may or may not work. If + * the socket is already bound, check to see if the device has + * been filled in, error if it hasn't. + */ + if (sock_flag(sk, SOCK_ZAPPED)) { + /* check if we can remove this feature. It is broken. */ + printk(KERN_WARNING "ax25_connect(): %s uses autobind, please contact jreuter@yaina.de\n", + current->comm); + if ((err = ax25_rt_autobind(ax25, &fsa->fsa_ax25.sax25_call)) < 0) { + kfree(digi); + goto out_release; + } + + ax25_fillin_cb(ax25, ax25->ax25_dev); + ax25_cb_add(ax25); + } else { + if (ax25->ax25_dev == NULL) { + kfree(digi); + err = -EHOSTUNREACH; + goto out_release; + } + } + + if (sk->sk_type == SOCK_SEQPACKET && + (ax25t=ax25_find_cb(&ax25->source_addr, &fsa->fsa_ax25.sax25_call, digi, + ax25->ax25_dev->dev))) { + kfree(digi); + err = -EADDRINUSE; /* Already such a connection */ + ax25_cb_put(ax25t); + goto out_release; + } + + ax25->dest_addr = fsa->fsa_ax25.sax25_call; + ax25->digipeat = digi; + + /* First the easy one */ + if (sk->sk_type != SOCK_SEQPACKET) { + sock->state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; + goto out_release; + } + + /* Move to connecting socket, ax.25 lapb WAIT_UA.. 
*/ + sock->state = SS_CONNECTING; + sk->sk_state = TCP_SYN_SENT; + + switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { + case AX25_PROTO_STD_SIMPLEX: + case AX25_PROTO_STD_DUPLEX: + ax25_std_establish_data_link(ax25); + break; + +#ifdef CONFIG_AX25_DAMA_SLAVE + case AX25_PROTO_DAMA_SLAVE: + ax25->modulus = AX25_MODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW]; + if (ax25->ax25_dev->dama.slave) + ax25_ds_establish_data_link(ax25); + else + ax25_std_establish_data_link(ax25); + break; +#endif + } + + ax25->state = AX25_STATE_1; + + ax25_start_heartbeat(ax25); + + /* Now the loop */ + if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) { + err = -EINPROGRESS; + goto out_release; + } + + if (sk->sk_state == TCP_SYN_SENT) { + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + if (sk->sk_state != TCP_SYN_SENT) + break; + if (!signal_pending(current)) { + release_sock(sk); + schedule(); + lock_sock(sk); + continue; + } + err = -ERESTARTSYS; + break; + } + finish_wait(sk_sleep(sk), &wait); + + if (err) + goto out_release; + } + + if (sk->sk_state != TCP_ESTABLISHED) { + /* Not in ABM, not in WAIT_UA -> failed */ + sock->state = SS_UNCONNECTED; + err = sock_error(sk); /* Always set at this point */ + goto out_release; + } + + sock->state = SS_CONNECTED; + + err = 0; +out_release: + release_sock(sk); + + return err; +} + +static int ax25_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct sk_buff *skb; + struct sock *newsk; + DEFINE_WAIT(wait); + struct sock *sk; + int err = 0; + + if (sock->state != SS_UNCONNECTED) + return -EINVAL; + + if ((sk = sock->sk) == NULL) + return -EINVAL; + + lock_sock(sk); + if (sk->sk_type != SOCK_SEQPACKET) { + err = -EOPNOTSUPP; + goto out; + } + + if (sk->sk_state != TCP_LISTEN) { + err = -EINVAL; + goto out; + } + + /* + * The read queue this time is holding sockets ready to use + * hooked into the SABM we saved + */ + for (;;) { + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + skb = skb_dequeue(&sk->sk_receive_queue); + if (skb) + break; + + if (flags & O_NONBLOCK) { + err = -EWOULDBLOCK; + break; + } + if (!signal_pending(current)) { + release_sock(sk); + schedule(); + lock_sock(sk); + continue; + } + err = -ERESTARTSYS; + break; + } + finish_wait(sk_sleep(sk), &wait); + + if (err) + goto out; + + newsk = skb->sk; + sock_graft(newsk, newsock); + + /* Now attach up the new socket */ + kfree_skb(skb); + sk->sk_ack_backlog--; + newsock->state = SS_CONNECTED; + +out: + release_sock(sk); + + return err; +} + +static int ax25_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)uaddr; + struct sock *sk = sock->sk; + unsigned char ndigi, i; + ax25_cb *ax25; + int err = 0; + + memset(fsa, 0, sizeof(*fsa)); + lock_sock(sk); + ax25 = ax25_sk(sk); + + if (peer != 0) { + if (sk->sk_state != TCP_ESTABLISHED) { + err = -ENOTCONN; + goto out; + } + + fsa->fsa_ax25.sax25_family = AF_AX25; + fsa->fsa_ax25.sax25_call = ax25->dest_addr; + + if (ax25->digipeat != NULL) { + ndigi = ax25->digipeat->ndigi; + fsa->fsa_ax25.sax25_ndigis = ndigi; + for (i = 0; i < ndigi; i++) + fsa->fsa_digipeater[i] = + ax25->digipeat->calls[i]; + } + } else { + fsa->fsa_ax25.sax25_family = AF_AX25; + fsa->fsa_ax25.sax25_call = ax25->source_addr; + fsa->fsa_ax25.sax25_ndigis = 1; + if (ax25->ax25_dev != NULL) { + memcpy(&fsa->fsa_digipeater[0], + ax25->ax25_dev->dev->dev_addr, AX25_ADDR_LEN); + 
} else { + fsa->fsa_digipeater[0] = null_ax25_address; + } + } + *uaddr_len = sizeof (struct full_sockaddr_ax25); + +out: + release_sock(sk); + + return err; +} + +static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sockaddr_ax25 *usax = (struct sockaddr_ax25 *)msg->msg_name; + struct sock *sk = sock->sk; + struct sockaddr_ax25 sax; + struct sk_buff *skb; + ax25_digi dtmp, *dp; + ax25_cb *ax25; + size_t size; + int lv, err, addr_len = msg->msg_namelen; + + if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT)) + return -EINVAL; + + lock_sock(sk); + ax25 = ax25_sk(sk); + + if (sock_flag(sk, SOCK_ZAPPED)) { + err = -EADDRNOTAVAIL; + goto out; + } + + if (sk->sk_shutdown & SEND_SHUTDOWN) { + send_sig(SIGPIPE, current, 0); + err = -EPIPE; + goto out; + } + + if (ax25->ax25_dev == NULL) { + err = -ENETUNREACH; + goto out; + } + + if (len > ax25->ax25_dev->dev->mtu) { + err = -EMSGSIZE; + goto out; + } + + if (usax != NULL) { + if (usax->sax25_family != AF_AX25) { + err = -EINVAL; + goto out; + } + + if (addr_len == sizeof(struct sockaddr_ax25)) + /* ax25_sendmsg(): uses obsolete socket structure */ + ; + else if (addr_len != sizeof(struct full_sockaddr_ax25)) + /* support for old structure may go away some time + * ax25_sendmsg(): uses old (6 digipeater) + * socket structure. + */ + if ((addr_len < sizeof(struct sockaddr_ax25) + sizeof(ax25_address) * 6) || + (addr_len > sizeof(struct full_sockaddr_ax25))) { + err = -EINVAL; + goto out; + } + + + if (addr_len > sizeof(struct sockaddr_ax25) && usax->sax25_ndigis != 0) { + int ct = 0; + struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)usax; + + /* Valid number of digipeaters ? */ + if (usax->sax25_ndigis < 1 || usax->sax25_ndigis > AX25_MAX_DIGIS) { + err = -EINVAL; + goto out; + } + + dtmp.ndigi = usax->sax25_ndigis; + + while (ct < usax->sax25_ndigis) { + dtmp.repeated[ct] = 0; + dtmp.calls[ct] = fsa->fsa_digipeater[ct]; + ct++; + } + + dtmp.lastrepeat = 0; + } + + sax = *usax; + if (sk->sk_type == SOCK_SEQPACKET && + ax25cmp(&ax25->dest_addr, &sax.sax25_call)) { + err = -EISCONN; + goto out; + } + if (usax->sax25_ndigis == 0) + dp = NULL; + else + dp = &dtmp; + } else { + /* + * FIXME: 1003.1g - if the socket is like this because + * it has become closed (not started closed) and is VC + * we ought to SIGPIPE, EPIPE + */ + if (sk->sk_state != TCP_ESTABLISHED) { + err = -ENOTCONN; + goto out; + } + sax.sax25_family = AF_AX25; + sax.sax25_call = ax25->dest_addr; + dp = ax25->digipeat; + } + + /* Build a packet */ + /* Assume the worst case */ + size = len + ax25->ax25_dev->dev->hard_header_len; + + skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT, &err); + if (skb == NULL) + goto out; + + skb_reserve(skb, size - len); + + /* User data follows immediately after the AX.25 data */ + if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { + err = -EFAULT; + kfree_skb(skb); + goto out; + } + + skb_reset_network_header(skb); + + /* Add the PID if one is not supplied by the user in the skb */ + if (!ax25->pidincl) + *skb_push(skb, 1) = sk->sk_protocol; + + if (sk->sk_type == SOCK_SEQPACKET) { + /* Connected mode sockets go via the LAPB machine */ + if (sk->sk_state != TCP_ESTABLISHED) { + kfree_skb(skb); + err = -ENOTCONN; + goto out; + } + + /* Shove it onto the queue and kick */ + ax25_output(ax25, ax25->paclen, skb); + + err = len; + goto out; + } + + skb_push(skb, 1 + ax25_addr_size(dp)); + + /* Building AX.25 Header */ + + /* Build an AX.25 header */ 
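+ /*
+ * ax25_addr_build() returns the length of the address field it wrote;
+ * the UI control byte is stored right after it.
+ */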
+ lv = ax25_addr_build(skb->data, &ax25->source_addr, &sax.sax25_call, + dp, AX25_COMMAND, AX25_MODULUS); + + skb_set_transport_header(skb, lv); + + *skb_transport_header(skb) = AX25_UI; + + /* Datagram frames go straight out of the door as UI */ + ax25_queue_xmit(skb, ax25->ax25_dev->dev); + + err = len; + +out: + release_sock(sk); + + return err; +} + +static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + int copied; + int err = 0; + + lock_sock(sk); + /* + * This works for seqpacket too. The receiver has ordered the + * queue for us! We do one quick check first though + */ + if (sk->sk_type == SOCK_SEQPACKET && sk->sk_state != TCP_ESTABLISHED) { + err = -ENOTCONN; + goto out; + } + + /* Now we can treat all alike */ + skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, + flags & MSG_DONTWAIT, &err); + if (skb == NULL) + goto out; + + if (!ax25_sk(sk)->pidincl) + skb_pull(skb, 1); /* Remove PID */ + + skb_reset_transport_header(skb); + copied = skb->len; + + if (copied > size) { + copied = size; + msg->msg_flags |= MSG_TRUNC; + } + + skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + + if (msg->msg_namelen != 0) { + struct sockaddr_ax25 *sax = (struct sockaddr_ax25 *)msg->msg_name; + ax25_digi digi; + ax25_address src; + const unsigned char *mac = skb_mac_header(skb); + + ax25_addr_parse(mac + 1, skb->data - mac - 1, &src, NULL, + &digi, NULL, NULL); + sax->sax25_family = AF_AX25; + /* We set this correctly, even though we may not let the + application know the digi calls further down (because it + did NOT ask to know them). This could get political... **/ + sax->sax25_ndigis = digi.ndigi; + sax->sax25_call = src; + + if (sax->sax25_ndigis != 0) { + int ct; + struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)sax; + + for (ct = 0; ct < digi.ndigi; ct++) + fsa->fsa_digipeater[ct] = digi.calls[ct]; + } + msg->msg_namelen = sizeof(struct full_sockaddr_ax25); + } + + skb_free_datagram(sk, skb); + err = copied; + +out: + release_sock(sk); + + return err; +} + +static int ax25_shutdown(struct socket *sk, int how) +{ + /* FIXME - generate DM and RNR states */ + return -EOPNOTSUPP; +} + +static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + void __user *argp = (void __user *)arg; + int res = 0; + + lock_sock(sk); + switch (cmd) { + case TIOCOUTQ: { + long amount; + + amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); + if (amount < 0) + amount = 0; + res = put_user(amount, (int __user *)argp); + break; + } + + case TIOCINQ: { + struct sk_buff *skb; + long amount = 0L; + /* These two are safe on a single CPU system as only user tasks fiddle here */ + if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) + amount = skb->len; + res = put_user(amount, (int __user *) argp); + break; + } + + case SIOCGSTAMP: + res = sock_get_timestamp(sk, argp); + break; + + case SIOCGSTAMPNS: + res = sock_get_timestampns(sk, argp); + break; + + case SIOCAX25ADDUID: /* Add a uid to the uid/call map table */ + case SIOCAX25DELUID: /* Delete a uid from the uid/call map table */ + case SIOCAX25GETUID: { + struct sockaddr_ax25 sax25; + if (copy_from_user(&sax25, argp, sizeof(sax25))) { + res = -EFAULT; + break; + } + res = ax25_uid_ioctl(cmd, &sax25); + break; + } + + case SIOCAX25NOUID: { /* Set the default policy (default/bar) */ + long amount; + if (!capable(CAP_NET_ADMIN)) { + res = -EPERM; + break; + } + if (get_user(amount, (long 
__user *)argp)) { + res = -EFAULT; + break; + } + if (amount > AX25_NOUID_BLOCK) { + res = -EINVAL; + break; + } + ax25_uid_policy = amount; + res = 0; + break; + } + + case SIOCADDRT: + case SIOCDELRT: + case SIOCAX25OPTRT: + if (!capable(CAP_NET_ADMIN)) { + res = -EPERM; + break; + } + res = ax25_rt_ioctl(cmd, argp); + break; + + case SIOCAX25CTLCON: + if (!capable(CAP_NET_ADMIN)) { + res = -EPERM; + break; + } + res = ax25_ctl_ioctl(cmd, argp); + break; + + case SIOCAX25GETINFO: + case SIOCAX25GETINFOOLD: { + ax25_cb *ax25 = ax25_sk(sk); + struct ax25_info_struct ax25_info; + + ax25_info.t1 = ax25->t1 / HZ; + ax25_info.t2 = ax25->t2 / HZ; + ax25_info.t3 = ax25->t3 / HZ; + ax25_info.idle = ax25->idle / (60 * HZ); + ax25_info.n2 = ax25->n2; + ax25_info.t1timer = ax25_display_timer(&ax25->t1timer) / HZ; + ax25_info.t2timer = ax25_display_timer(&ax25->t2timer) / HZ; + ax25_info.t3timer = ax25_display_timer(&ax25->t3timer) / HZ; + ax25_info.idletimer = ax25_display_timer(&ax25->idletimer) / (60 * HZ); + ax25_info.n2count = ax25->n2count; + ax25_info.state = ax25->state; + ax25_info.rcv_q = sk_rmem_alloc_get(sk); + ax25_info.snd_q = sk_wmem_alloc_get(sk); + ax25_info.vs = ax25->vs; + ax25_info.vr = ax25->vr; + ax25_info.va = ax25->va; + ax25_info.vs_max = ax25->vs; /* reserved */ + ax25_info.paclen = ax25->paclen; + ax25_info.window = ax25->window; + + /* old structure? */ + if (cmd == SIOCAX25GETINFOOLD) { + static int warned = 0; + if (!warned) { + printk(KERN_INFO "%s uses old SIOCAX25GETINFO\n", + current->comm); + warned=1; + } + + if (copy_to_user(argp, &ax25_info, sizeof(struct ax25_info_struct_deprecated))) { + res = -EFAULT; + break; + } + } else { + if (copy_to_user(argp, &ax25_info, sizeof(struct ax25_info_struct))) { + res = -EINVAL; + break; + } + } + res = 0; + break; + } + + case SIOCAX25ADDFWD: + case SIOCAX25DELFWD: { + struct ax25_fwd_struct ax25_fwd; + if (!capable(CAP_NET_ADMIN)) { + res = -EPERM; + break; + } + if (copy_from_user(&ax25_fwd, argp, sizeof(ax25_fwd))) { + res = -EFAULT; + break; + } + res = ax25_fwd_ioctl(cmd, &ax25_fwd); + break; + } + + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFMETRIC: + case SIOCSIFMETRIC: + res = -EINVAL; + break; + + default: + res = -ENOIOCTLCMD; + break; + } + release_sock(sk); + + return res; +} + +#ifdef CONFIG_PROC_FS + +static void *ax25_info_start(struct seq_file *seq, loff_t *pos) + __acquires(ax25_list_lock) +{ + spin_lock_bh(&ax25_list_lock); + return seq_hlist_start(&ax25_list, *pos); +} + +static void *ax25_info_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return seq_hlist_next(v, &ax25_list, pos); +} + +static void ax25_info_stop(struct seq_file *seq, void *v) + __releases(ax25_list_lock) +{ + spin_unlock_bh(&ax25_list_lock); +} + +static int ax25_info_show(struct seq_file *seq, void *v) +{ + ax25_cb *ax25 = hlist_entry(v, struct ax25_cb, ax25_node); + char buf[11]; + int k; + + + /* + * New format: + * magic dev src_addr dest_addr,digi1,digi2,.. st vs vr va t1 t1 t2 t2 t3 t3 idle idle n2 n2 rtt window paclen Snd-Q Rcv-Q inode + */ + + seq_printf(seq, "%8.8lx %s %s%s ", + (long) ax25, + ax25->ax25_dev == NULL? "???" : ax25->ax25_dev->dev->name, + ax2asc(buf, &ax25->source_addr), + ax25->iamdigi? 
"*":""); + seq_printf(seq, "%s", ax2asc(buf, &ax25->dest_addr)); + + for (k=0; (ax25->digipeat != NULL) && (k < ax25->digipeat->ndigi); k++) { + seq_printf(seq, ",%s%s", + ax2asc(buf, &ax25->digipeat->calls[k]), + ax25->digipeat->repeated[k]? "*":""); + } + + seq_printf(seq, " %d %d %d %d %lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %d %d", + ax25->state, + ax25->vs, ax25->vr, ax25->va, + ax25_display_timer(&ax25->t1timer) / HZ, ax25->t1 / HZ, + ax25_display_timer(&ax25->t2timer) / HZ, ax25->t2 / HZ, + ax25_display_timer(&ax25->t3timer) / HZ, ax25->t3 / HZ, + ax25_display_timer(&ax25->idletimer) / (60 * HZ), + ax25->idle / (60 * HZ), + ax25->n2count, ax25->n2, + ax25->rtt / HZ, + ax25->window, + ax25->paclen); + + if (ax25->sk != NULL) { + seq_printf(seq, " %d %d %lu\n", + sk_wmem_alloc_get(ax25->sk), + sk_rmem_alloc_get(ax25->sk), + sock_i_ino(ax25->sk)); + } else { + seq_puts(seq, " * * *\n"); + } + return 0; +} + +static const struct seq_operations ax25_info_seqops = { + .start = ax25_info_start, + .next = ax25_info_next, + .stop = ax25_info_stop, + .show = ax25_info_show, +}; + +static int ax25_info_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ax25_info_seqops); +} + +static const struct file_operations ax25_info_fops = { + .owner = THIS_MODULE, + .open = ax25_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif + +static const struct net_proto_family ax25_family_ops = { + .family = PF_AX25, + .create = ax25_create, + .owner = THIS_MODULE, +}; + +static const struct proto_ops ax25_proto_ops = { + .family = PF_AX25, + .owner = THIS_MODULE, + .release = ax25_release, + .bind = ax25_bind, + .connect = ax25_connect, + .socketpair = sock_no_socketpair, + .accept = ax25_accept, + .getname = ax25_getname, + .poll = datagram_poll, + .ioctl = ax25_ioctl, + .listen = ax25_listen, + .shutdown = ax25_shutdown, + .setsockopt = ax25_setsockopt, + .getsockopt = ax25_getsockopt, + .sendmsg = ax25_sendmsg, + .recvmsg = ax25_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +/* + * Called by socket.c on kernel start up + */ +static struct packet_type ax25_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_AX25), + .func = ax25_kiss_rcv, +}; + +static struct notifier_block ax25_dev_notifier = { + .notifier_call =ax25_device_event, +}; + +static int __init ax25_init(void) +{ + int rc = proto_register(&ax25_proto, 0); + + if (rc != 0) + goto out; + + sock_register(&ax25_family_ops); + dev_add_pack(&ax25_packet_type); + register_netdevice_notifier(&ax25_dev_notifier); + ax25_register_sysctl(); + + proc_net_fops_create(&init_net, "ax25_route", S_IRUGO, &ax25_route_fops); + proc_net_fops_create(&init_net, "ax25", S_IRUGO, &ax25_info_fops); + proc_net_fops_create(&init_net, "ax25_calls", S_IRUGO, &ax25_uid_fops); +out: + return rc; +} +module_init(ax25_init); + + +MODULE_AUTHOR("Jonathan Naylor G4KLX "); +MODULE_DESCRIPTION("The amateur radio AX.25 link layer protocol"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_AX25); + +static void __exit ax25_exit(void) +{ + proc_net_remove(&init_net, "ax25_route"); + proc_net_remove(&init_net, "ax25"); + proc_net_remove(&init_net, "ax25_calls"); + + unregister_netdevice_notifier(&ax25_dev_notifier); + ax25_unregister_sysctl(); + + dev_remove_pack(&ax25_packet_type); + + sock_unregister(PF_AX25); + proto_unregister(&ax25_proto); + + ax25_rt_free(); + ax25_uid_free(); + ax25_dev_free(); +} +module_exit(ax25_exit); diff --git a/net/ax25/ax25_addr.c b/net/ax25/ax25_addr.c new 
file mode 100644 index 00000000..7e7964dd --- /dev/null +++ b/net/ax25/ax25_addr.c @@ -0,0 +1,306 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The default broadcast address of an interface is QST-0; the default address + * is LINUX-1. The null address is defined as a callsign of all spaces with + * an SSID of zero. + */ + +const ax25_address ax25_bcast = + {{'Q' << 1, 'S' << 1, 'T' << 1, ' ' << 1, ' ' << 1, ' ' << 1, 0 << 1}}; +const ax25_address ax25_defaddr = + {{'L' << 1, 'I' << 1, 'N' << 1, 'U' << 1, 'X' << 1, ' ' << 1, 1 << 1}}; +const ax25_address null_ax25_address = + {{' ' << 1, ' ' << 1, ' ' << 1, ' ' << 1, ' ' << 1, ' ' << 1, 0 << 1}}; + +EXPORT_SYMBOL_GPL(ax25_bcast); +EXPORT_SYMBOL_GPL(ax25_defaddr); +EXPORT_SYMBOL(null_ax25_address); + +/* + * ax25 -> ascii conversion + */ +char *ax2asc(char *buf, const ax25_address *a) +{ + char c, *s; + int n; + + for (n = 0, s = buf; n < 6; n++) { + c = (a->ax25_call[n] >> 1) & 0x7F; + + if (c != ' ') *s++ = c; + } + + *s++ = '-'; + + if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) { + *s++ = '1'; + n -= 10; + } + + *s++ = n + '0'; + *s++ = '\0'; + + if (*buf == '\0' || *buf == '-') + return "*"; + + return buf; + +} + +EXPORT_SYMBOL(ax2asc); + +/* + * ascii -> ax25 conversion + */ +void asc2ax(ax25_address *addr, const char *callsign) +{ + const char *s; + int n; + + for (s = callsign, n = 0; n < 6; n++) { + if (*s != '\0' && *s != '-') + addr->ax25_call[n] = *s++; + else + addr->ax25_call[n] = ' '; + addr->ax25_call[n] <<= 1; + addr->ax25_call[n] &= 0xFE; + } + + if (*s++ == '\0') { + addr->ax25_call[6] = 0x00; + return; + } + + addr->ax25_call[6] = *s++ - '0'; + + if (*s != '\0') { + addr->ax25_call[6] *= 10; + addr->ax25_call[6] += *s++ - '0'; + } + + addr->ax25_call[6] <<= 1; + addr->ax25_call[6] &= 0x1E; +} + +EXPORT_SYMBOL(asc2ax); + +/* + * Compare two ax.25 addresses + */ +int ax25cmp(const ax25_address *a, const ax25_address *b) +{ + int ct = 0; + + while (ct < 6) { + if ((a->ax25_call[ct] & 0xFE) != (b->ax25_call[ct] & 0xFE)) /* Clean off repeater bits */ + return 1; + ct++; + } + + if ((a->ax25_call[ct] & 0x1E) == (b->ax25_call[ct] & 0x1E)) /* SSID without control bit */ + return 0; + + return 2; /* Partial match */ +} + +EXPORT_SYMBOL(ax25cmp); + +/* + * Compare two AX.25 digipeater paths. 
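+ * Two paths only match when they hold the same number of digipeaters,
+ * the same last-repeated index and identical callsigns in the same
+ * order; anything else is reported as non-zero.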
+ */ +int ax25digicmp(const ax25_digi *digi1, const ax25_digi *digi2) +{ + int i; + + if (digi1->ndigi != digi2->ndigi) + return 1; + + if (digi1->lastrepeat != digi2->lastrepeat) + return 1; + + for (i = 0; i < digi1->ndigi; i++) + if (ax25cmp(&digi1->calls[i], &digi2->calls[i]) != 0) + return 1; + + return 0; +} + +/* + * Given an AX.25 address pull of to, from, digi list, command/response and the start of data + * + */ +const unsigned char *ax25_addr_parse(const unsigned char *buf, int len, + ax25_address *src, ax25_address *dest, ax25_digi *digi, int *flags, + int *dama) +{ + int d = 0; + + if (len < 14) return NULL; + + if (flags != NULL) { + *flags = 0; + + if (buf[6] & AX25_CBIT) + *flags = AX25_COMMAND; + if (buf[13] & AX25_CBIT) + *flags = AX25_RESPONSE; + } + + if (dama != NULL) + *dama = ~buf[13] & AX25_DAMA_FLAG; + + /* Copy to, from */ + if (dest != NULL) + memcpy(dest, buf + 0, AX25_ADDR_LEN); + if (src != NULL) + memcpy(src, buf + 7, AX25_ADDR_LEN); + + buf += 2 * AX25_ADDR_LEN; + len -= 2 * AX25_ADDR_LEN; + + digi->lastrepeat = -1; + digi->ndigi = 0; + + while (!(buf[-1] & AX25_EBIT)) { + if (d >= AX25_MAX_DIGIS) return NULL; /* Max of 6 digis */ + if (len < 7) return NULL; /* Short packet */ + + memcpy(&digi->calls[d], buf, AX25_ADDR_LEN); + digi->ndigi = d + 1; + + if (buf[6] & AX25_HBIT) { + digi->repeated[d] = 1; + digi->lastrepeat = d; + } else { + digi->repeated[d] = 0; + } + + buf += AX25_ADDR_LEN; + len -= AX25_ADDR_LEN; + d++; + } + + return buf; +} + +/* + * Assemble an AX.25 header from the bits + */ +int ax25_addr_build(unsigned char *buf, const ax25_address *src, + const ax25_address *dest, const ax25_digi *d, int flag, int modulus) +{ + int len = 0; + int ct = 0; + + memcpy(buf, dest, AX25_ADDR_LEN); + buf[6] &= ~(AX25_EBIT | AX25_CBIT); + buf[6] |= AX25_SSSID_SPARE; + + if (flag == AX25_COMMAND) buf[6] |= AX25_CBIT; + + buf += AX25_ADDR_LEN; + len += AX25_ADDR_LEN; + + memcpy(buf, src, AX25_ADDR_LEN); + buf[6] &= ~(AX25_EBIT | AX25_CBIT); + buf[6] &= ~AX25_SSSID_SPARE; + + if (modulus == AX25_MODULUS) + buf[6] |= AX25_SSSID_SPARE; + else + buf[6] |= AX25_ESSID_SPARE; + + if (flag == AX25_RESPONSE) buf[6] |= AX25_CBIT; + + /* + * Fast path the normal digiless path + */ + if (d == NULL || d->ndigi == 0) { + buf[6] |= AX25_EBIT; + return 2 * AX25_ADDR_LEN; + } + + buf += AX25_ADDR_LEN; + len += AX25_ADDR_LEN; + + while (ct < d->ndigi) { + memcpy(buf, &d->calls[ct], AX25_ADDR_LEN); + + if (d->repeated[ct]) + buf[6] |= AX25_HBIT; + else + buf[6] &= ~AX25_HBIT; + + buf[6] &= ~AX25_EBIT; + buf[6] |= AX25_SSSID_SPARE; + + buf += AX25_ADDR_LEN; + len += AX25_ADDR_LEN; + ct++; + } + + buf[-1] |= AX25_EBIT; + + return len; +} + +int ax25_addr_size(const ax25_digi *dp) +{ + if (dp == NULL) + return 2 * AX25_ADDR_LEN; + + return AX25_ADDR_LEN * (2 + dp->ndigi); +} + +/* + * Reverse Digipeat List. 
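+ * The inverted copy is used, for instance, when answering an incoming
+ * SABM: the callsigns come out in the opposite order and the
+ * has-been-repeated bits are set for hops already traversed.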
May not pass both parameters as same struct + */ +void ax25_digi_invert(const ax25_digi *in, ax25_digi *out) +{ + int ct; + + out->ndigi = in->ndigi; + out->lastrepeat = in->ndigi - in->lastrepeat - 2; + + /* Invert the digipeaters */ + for (ct = 0; ct < in->ndigi; ct++) { + out->calls[ct] = in->calls[in->ndigi - ct - 1]; + + if (ct <= out->lastrepeat) { + out->calls[ct].ax25_call[6] |= AX25_HBIT; + out->repeated[ct] = 1; + } else { + out->calls[ct].ax25_call[6] &= ~AX25_HBIT; + out->repeated[ct] = 0; + } + } +} + diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c new file mode 100644 index 00000000..c1cb982f --- /dev/null +++ b/net/ax25/ax25_dev.c @@ -0,0 +1,205 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +ax25_dev *ax25_dev_list; +DEFINE_SPINLOCK(ax25_dev_lock); + +ax25_dev *ax25_addr_ax25dev(ax25_address *addr) +{ + ax25_dev *ax25_dev, *res = NULL; + + spin_lock_bh(&ax25_dev_lock); + for (ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next) + if (ax25cmp(addr, (ax25_address *)ax25_dev->dev->dev_addr) == 0) { + res = ax25_dev; + } + spin_unlock_bh(&ax25_dev_lock); + + return res; +} + +/* + * This is called when an interface is brought up. These are + * reasonable defaults. + */ +void ax25_dev_device_up(struct net_device *dev) +{ + ax25_dev *ax25_dev; + + if ((ax25_dev = kzalloc(sizeof(*ax25_dev), GFP_ATOMIC)) == NULL) { + printk(KERN_ERR "AX.25: ax25_dev_device_up - out of memory\n"); + return; + } + + ax25_unregister_sysctl(); + + dev->ax25_ptr = ax25_dev; + ax25_dev->dev = dev; + dev_hold(dev); + ax25_dev->forward = NULL; + + ax25_dev->values[AX25_VALUES_IPDEFMODE] = AX25_DEF_IPDEFMODE; + ax25_dev->values[AX25_VALUES_AXDEFMODE] = AX25_DEF_AXDEFMODE; + ax25_dev->values[AX25_VALUES_BACKOFF] = AX25_DEF_BACKOFF; + ax25_dev->values[AX25_VALUES_CONMODE] = AX25_DEF_CONMODE; + ax25_dev->values[AX25_VALUES_WINDOW] = AX25_DEF_WINDOW; + ax25_dev->values[AX25_VALUES_EWINDOW] = AX25_DEF_EWINDOW; + ax25_dev->values[AX25_VALUES_T1] = AX25_DEF_T1; + ax25_dev->values[AX25_VALUES_T2] = AX25_DEF_T2; + ax25_dev->values[AX25_VALUES_T3] = AX25_DEF_T3; + ax25_dev->values[AX25_VALUES_IDLE] = AX25_DEF_IDLE; + ax25_dev->values[AX25_VALUES_N2] = AX25_DEF_N2; + ax25_dev->values[AX25_VALUES_PACLEN] = AX25_DEF_PACLEN; + ax25_dev->values[AX25_VALUES_PROTOCOL] = AX25_DEF_PROTOCOL; + ax25_dev->values[AX25_VALUES_DS_TIMEOUT]= AX25_DEF_DS_TIMEOUT; + +#if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER) + ax25_ds_setup_timer(ax25_dev); +#endif + + spin_lock_bh(&ax25_dev_lock); + ax25_dev->next = ax25_dev_list; + ax25_dev_list = ax25_dev; + spin_unlock_bh(&ax25_dev_lock); + + ax25_register_sysctl(); +} + +void ax25_dev_device_down(struct net_device *dev) +{ + ax25_dev *s, *ax25_dev; + + if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) + return; + + ax25_unregister_sysctl(); + + spin_lock_bh(&ax25_dev_lock); + +#ifdef CONFIG_AX25_DAMA_SLAVE + ax25_ds_del_timer(ax25_dev); +#endif + + /* + * Remove any packet forwarding that points to this device. 
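+ * Otherwise ax25_fwd_dev() could keep handing out a net_device that is
+ * about to go away.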
+ */ + for (s = ax25_dev_list; s != NULL; s = s->next) + if (s->forward == dev) + s->forward = NULL; + + if ((s = ax25_dev_list) == ax25_dev) { + ax25_dev_list = s->next; + spin_unlock_bh(&ax25_dev_lock); + dev_put(dev); + kfree(ax25_dev); + ax25_register_sysctl(); + return; + } + + while (s != NULL && s->next != NULL) { + if (s->next == ax25_dev) { + s->next = ax25_dev->next; + spin_unlock_bh(&ax25_dev_lock); + dev_put(dev); + kfree(ax25_dev); + ax25_register_sysctl(); + return; + } + + s = s->next; + } + spin_unlock_bh(&ax25_dev_lock); + dev->ax25_ptr = NULL; + + ax25_register_sysctl(); +} + +int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) +{ + ax25_dev *ax25_dev, *fwd_dev; + + if ((ax25_dev = ax25_addr_ax25dev(&fwd->port_from)) == NULL) + return -EINVAL; + + switch (cmd) { + case SIOCAX25ADDFWD: + if ((fwd_dev = ax25_addr_ax25dev(&fwd->port_to)) == NULL) + return -EINVAL; + if (ax25_dev->forward != NULL) + return -EINVAL; + ax25_dev->forward = fwd_dev->dev; + break; + + case SIOCAX25DELFWD: + if (ax25_dev->forward == NULL) + return -EINVAL; + ax25_dev->forward = NULL; + break; + + default: + return -EINVAL; + } + + return 0; +} + +struct net_device *ax25_fwd_dev(struct net_device *dev) +{ + ax25_dev *ax25_dev; + + if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) + return dev; + + if (ax25_dev->forward == NULL) + return dev; + + return ax25_dev->forward; +} + +/* + * Free all memory associated with device structures. + */ +void __exit ax25_dev_free(void) +{ + ax25_dev *s, *ax25_dev; + + spin_lock_bh(&ax25_dev_lock); + ax25_dev = ax25_dev_list; + while (ax25_dev != NULL) { + s = ax25_dev; + dev_put(ax25_dev->dev); + ax25_dev = ax25_dev->next; + kfree(s); + } + ax25_dev_list = NULL; + spin_unlock_bh(&ax25_dev_lock); +} diff --git a/net/ax25/ax25_ds_in.c b/net/ax25/ax25_ds_in.c new file mode 100644 index 00000000..8273b120 --- /dev/null +++ b/net/ax25/ax25_ds_in.c @@ -0,0 +1,303 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * State machine for state 1, Awaiting Connection State. + * The handling of the timer(s) is in file ax25_ds_timer.c. + * Handling of state 0 and connection release is in ax25.c. 
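+ * A UA in reply to our SABM/SABME completes the link: the state
+ * variables are zeroed, the socket is marked established and DAMA
+ * operation is switched on for this control block.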
+ */ +static int ax25_ds_state1_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int pf, int type) +{ + switch (frametype) { + case AX25_SABM: + ax25->modulus = AX25_MODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW]; + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + break; + + case AX25_SABME: + ax25->modulus = AX25_EMODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_EWINDOW]; + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + break; + + case AX25_DISC: + ax25_send_control(ax25, AX25_DM, pf, AX25_RESPONSE); + break; + + case AX25_UA: + ax25_calculate_rtt(ax25); + ax25_stop_t1timer(ax25); + ax25_start_t3timer(ax25); + ax25_start_idletimer(ax25); + ax25->vs = 0; + ax25->va = 0; + ax25->vr = 0; + ax25->state = AX25_STATE_3; + ax25->n2count = 0; + if (ax25->sk != NULL) { + bh_lock_sock(ax25->sk); + ax25->sk->sk_state = TCP_ESTABLISHED; + /* + * For WAIT_SABM connections we will produce an accept + * ready socket here + */ + if (!sock_flag(ax25->sk, SOCK_DEAD)) + ax25->sk->sk_state_change(ax25->sk); + bh_unlock_sock(ax25->sk); + } + ax25_dama_on(ax25); + + /* according to DK4EG's spec we are required to + * send a RR RESPONSE FINAL NR=0. + */ + + ax25_std_enquiry_response(ax25); + break; + + case AX25_DM: + if (pf) + ax25_disconnect(ax25, ECONNREFUSED); + break; + + default: + if (pf) + ax25_send_control(ax25, AX25_SABM, AX25_POLLON, AX25_COMMAND); + break; + } + + return 0; +} + +/* + * State machine for state 2, Awaiting Release State. + * The handling of the timer(s) is in file ax25_ds_timer.c + * Handling of state 0 and connection release is in ax25.c. + */ +static int ax25_ds_state2_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int pf, int type) +{ + switch (frametype) { + case AX25_SABM: + case AX25_SABME: + ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); + ax25_dama_off(ax25); + break; + + case AX25_DISC: + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + ax25_dama_off(ax25); + ax25_disconnect(ax25, 0); + break; + + case AX25_DM: + case AX25_UA: + if (pf) { + ax25_dama_off(ax25); + ax25_disconnect(ax25, 0); + } + break; + + case AX25_I: + case AX25_REJ: + case AX25_RNR: + case AX25_RR: + if (pf) { + ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); + ax25_dama_off(ax25); + } + break; + + default: + break; + } + + return 0; +} + +/* + * State machine for state 3, Connected State. + * The handling of the timer(s) is in file ax25_timer.c + * Handling of state 0 and connection release is in ax25.c. 
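+ * Received command frames with the poll bit set are answered via
+ * ax25_ds_enquiry_response(); as a DAMA slave we only transmit when
+ * the master polls us.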
+ */ +static int ax25_ds_state3_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int ns, int nr, int pf, int type) +{ + int queued = 0; + + switch (frametype) { + case AX25_SABM: + case AX25_SABME: + if (frametype == AX25_SABM) { + ax25->modulus = AX25_MODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW]; + } else { + ax25->modulus = AX25_EMODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_EWINDOW]; + } + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + ax25_stop_t1timer(ax25); + ax25_start_t3timer(ax25); + ax25_start_idletimer(ax25); + ax25->condition = 0x00; + ax25->vs = 0; + ax25->va = 0; + ax25->vr = 0; + ax25_requeue_frames(ax25); + ax25_dama_on(ax25); + break; + + case AX25_DISC: + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + ax25_dama_off(ax25); + ax25_disconnect(ax25, 0); + break; + + case AX25_DM: + ax25_dama_off(ax25); + ax25_disconnect(ax25, ECONNRESET); + break; + + case AX25_RR: + case AX25_RNR: + if (frametype == AX25_RR) + ax25->condition &= ~AX25_COND_PEER_RX_BUSY; + else + ax25->condition |= AX25_COND_PEER_RX_BUSY; + + if (ax25_validate_nr(ax25, nr)) { + if (ax25_check_iframes_acked(ax25, nr)) + ax25->n2count=0; + if (type == AX25_COMMAND && pf) + ax25_ds_enquiry_response(ax25); + } else { + ax25_ds_nr_error_recovery(ax25); + ax25->state = AX25_STATE_1; + } + break; + + case AX25_REJ: + ax25->condition &= ~AX25_COND_PEER_RX_BUSY; + + if (ax25_validate_nr(ax25, nr)) { + if (ax25->va != nr) + ax25->n2count=0; + + ax25_frames_acked(ax25, nr); + ax25_calculate_rtt(ax25); + ax25_stop_t1timer(ax25); + ax25_start_t3timer(ax25); + ax25_requeue_frames(ax25); + + if (type == AX25_COMMAND && pf) + ax25_ds_enquiry_response(ax25); + } else { + ax25_ds_nr_error_recovery(ax25); + ax25->state = AX25_STATE_1; + } + break; + + case AX25_I: + if (!ax25_validate_nr(ax25, nr)) { + ax25_ds_nr_error_recovery(ax25); + ax25->state = AX25_STATE_1; + break; + } + if (ax25->condition & AX25_COND_PEER_RX_BUSY) { + ax25_frames_acked(ax25, nr); + ax25->n2count = 0; + } else { + if (ax25_check_iframes_acked(ax25, nr)) + ax25->n2count = 0; + } + if (ax25->condition & AX25_COND_OWN_RX_BUSY) { + if (pf) ax25_ds_enquiry_response(ax25); + break; + } + if (ns == ax25->vr) { + ax25->vr = (ax25->vr + 1) % ax25->modulus; + queued = ax25_rx_iframe(ax25, skb); + if (ax25->condition & AX25_COND_OWN_RX_BUSY) + ax25->vr = ns; /* ax25->vr - 1 */ + ax25->condition &= ~AX25_COND_REJECT; + if (pf) { + ax25_ds_enquiry_response(ax25); + } else { + if (!(ax25->condition & AX25_COND_ACK_PENDING)) { + ax25->condition |= AX25_COND_ACK_PENDING; + ax25_start_t2timer(ax25); + } + } + } else { + if (ax25->condition & AX25_COND_REJECT) { + if (pf) ax25_ds_enquiry_response(ax25); + } else { + ax25->condition |= AX25_COND_REJECT; + ax25_ds_enquiry_response(ax25); + ax25->condition &= ~AX25_COND_ACK_PENDING; + } + } + break; + + case AX25_FRMR: + case AX25_ILLEGAL: + ax25_ds_establish_data_link(ax25); + ax25->state = AX25_STATE_1; + break; + + default: + break; + } + + return queued; +} + +/* + * Higher level upcall for a LAPB frame + */ +int ax25_ds_frame_in(ax25_cb *ax25, struct sk_buff *skb, int type) +{ + int queued = 0, frametype, ns, nr, pf; + + frametype = ax25_decode(ax25, skb, &ns, &nr, &pf); + + switch (ax25->state) { + case AX25_STATE_1: + queued = ax25_ds_state1_machine(ax25, skb, frametype, pf, type); + break; + case AX25_STATE_2: + queued = ax25_ds_state2_machine(ax25, skb, frametype, pf, type); + break; + case AX25_STATE_3: + queued = ax25_ds_state3_machine(ax25, skb, 
frametype, ns, nr, pf, type); + break; + } + + return queued; +} + diff --git a/net/ax25/ax25_ds_subr.c b/net/ax25/ax25_ds_subr.c new file mode 100644 index 00000000..85816e61 --- /dev/null +++ b/net/ax25/ax25_ds_subr.c @@ -0,0 +1,211 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void ax25_ds_nr_error_recovery(ax25_cb *ax25) +{ + ax25_ds_establish_data_link(ax25); +} + +/* + * dl1bke 960114: transmit I frames on DAMA poll + */ +void ax25_ds_enquiry_response(ax25_cb *ax25) +{ + ax25_cb *ax25o; + struct hlist_node *node; + + /* Please note that neither DK4EG's nor DG2FEF's + * DAMA spec mention the following behaviour as seen + * with TheFirmware: + * + * DB0ACH->DL1BKE [DAMA] + * DL1BKE->DB0ACH + * DL1BKE-7->DB0PRA-6 DB0ACH + * DL1BKE->DB0ACH + * + * The Flexnet DAMA Master implementation apparently + * insists on the "proper" AX.25 behaviour: + * + * DB0ACH->DL1BKE [DAMA] + * DL1BKE->DB0ACH + * DL1BKE->DB0ACH + * DL1BKE-7->DB0PRA-6 DB0ACH + * + * Flexnet refuses to send us *any* I frame if we send + * a REJ in case AX25_COND_REJECT is set. It is superfluous in + * this mode anyway (a RR or RNR invokes the retransmission). + * Is this a Flexnet bug? + */ + + ax25_std_enquiry_response(ax25); + + if (!(ax25->condition & AX25_COND_PEER_RX_BUSY)) { + ax25_requeue_frames(ax25); + ax25_kick(ax25); + } + + if (ax25->state == AX25_STATE_1 || ax25->state == AX25_STATE_2 || skb_peek(&ax25->ack_queue) != NULL) + ax25_ds_t1_timeout(ax25); + else + ax25->n2count = 0; + + ax25_start_t3timer(ax25); + ax25_ds_set_timer(ax25->ax25_dev); + + spin_lock(&ax25_list_lock); + ax25_for_each(ax25o, node, &ax25_list) { + if (ax25o == ax25) + continue; + + if (ax25o->ax25_dev != ax25->ax25_dev) + continue; + + if (ax25o->state == AX25_STATE_1 || ax25o->state == AX25_STATE_2) { + ax25_ds_t1_timeout(ax25o); + continue; + } + + if (!(ax25o->condition & AX25_COND_PEER_RX_BUSY) && ax25o->state == AX25_STATE_3) { + ax25_requeue_frames(ax25o); + ax25_kick(ax25o); + } + + if (ax25o->state == AX25_STATE_1 || ax25o->state == AX25_STATE_2 || skb_peek(&ax25o->ack_queue) != NULL) + ax25_ds_t1_timeout(ax25o); + + /* do not start T3 for listening sockets (tnx DD8NE) */ + + if (ax25o->state != AX25_STATE_0) + ax25_start_t3timer(ax25o); + } + spin_unlock(&ax25_list_lock); +} + +void ax25_ds_establish_data_link(ax25_cb *ax25) +{ + ax25->condition &= AX25_COND_DAMA_MODE; + ax25->n2count = 0; + ax25_calculate_t1(ax25); + ax25_start_t1timer(ax25); + ax25_stop_t2timer(ax25); + ax25_start_t3timer(ax25); +} + +/* + * :::FIXME::: + * This is a kludge. Not all drivers recognize kiss commands. + * We need a driver level request to switch duplex mode, that does + * either SCC changing, PI config or KISS as required. Currently + * this request isn't reliable. 
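+ * For now the request goes out as a raw two byte KISS frame (command,
+ * parameter) handed straight to the driver with dev_queue_xmit().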
+ */ +static void ax25_kiss_cmd(ax25_dev *ax25_dev, unsigned char cmd, unsigned char param) +{ + struct sk_buff *skb; + unsigned char *p; + + if (ax25_dev->dev == NULL) + return; + + if ((skb = alloc_skb(2, GFP_ATOMIC)) == NULL) + return; + + skb_reset_network_header(skb); + p = skb_put(skb, 2); + + *p++ = cmd; + *p++ = param; + + skb->protocol = ax25_type_trans(skb, ax25_dev->dev); + + dev_queue_xmit(skb); +} + +/* + * A nasty problem arises if we count the number of DAMA connections + * wrong, especially when connections on the device already existed + * and our network node (or the sysop) decides to turn on DAMA Master + * mode. We thus flag the 'real' slave connections with + * ax25->dama_slave=1 and look on every disconnect if still slave + * connections exist. + */ +static int ax25_check_dama_slave(ax25_dev *ax25_dev) +{ + ax25_cb *ax25; + int res = 0; + struct hlist_node *node; + + spin_lock(&ax25_list_lock); + ax25_for_each(ax25, node, &ax25_list) + if (ax25->ax25_dev == ax25_dev && (ax25->condition & AX25_COND_DAMA_MODE) && ax25->state > AX25_STATE_1) { + res = 1; + break; + } + spin_unlock(&ax25_list_lock); + + return res; +} + +static void ax25_dev_dama_on(ax25_dev *ax25_dev) +{ + if (ax25_dev == NULL) + return; + + if (ax25_dev->dama.slave == 0) + ax25_kiss_cmd(ax25_dev, 5, 1); + + ax25_dev->dama.slave = 1; + ax25_ds_set_timer(ax25_dev); +} + +void ax25_dev_dama_off(ax25_dev *ax25_dev) +{ + if (ax25_dev == NULL) + return; + + if (ax25_dev->dama.slave && !ax25_check_dama_slave(ax25_dev)) { + ax25_kiss_cmd(ax25_dev, 5, 0); + ax25_dev->dama.slave = 0; + ax25_ds_del_timer(ax25_dev); + } +} + +void ax25_dama_on(ax25_cb *ax25) +{ + ax25_dev_dama_on(ax25->ax25_dev); + ax25->condition |= AX25_COND_DAMA_MODE; +} + +void ax25_dama_off(ax25_cb *ax25) +{ + ax25->condition &= ~AX25_COND_DAMA_MODE; + ax25_dev_dama_off(ax25->ax25_dev); +} + diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c new file mode 100644 index 00000000..c7d81436 --- /dev/null +++ b/net/ax25/ax25_ds_timer.c @@ -0,0 +1,238 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void ax25_ds_timeout(unsigned long); + +/* + * Add DAMA slave timeout timer to timer list. + * Unlike the connection based timers the timeout function gets + * triggered every second. Please note that NET_AX25_DAMA_SLAVE_TIMEOUT + * (aka /proc/sys/net/ax25/{dev}/dama_slave_timeout) is still in + * 1/10th of a second. 
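+ * The timer fires once a second; when the slave timeout counter runs
+ * out, ax25_ds_timeout() disconnects every DAMA connection on the
+ * device and drops out of slave mode.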
+ */ + +void ax25_ds_setup_timer(ax25_dev *ax25_dev) +{ + setup_timer(&ax25_dev->dama.slave_timer, ax25_ds_timeout, + (unsigned long)ax25_dev); +} + +void ax25_ds_del_timer(ax25_dev *ax25_dev) +{ + if (ax25_dev) + del_timer(&ax25_dev->dama.slave_timer); +} + +void ax25_ds_set_timer(ax25_dev *ax25_dev) +{ + if (ax25_dev == NULL) /* paranoia */ + return; + + ax25_dev->dama.slave_timeout = + msecs_to_jiffies(ax25_dev->values[AX25_VALUES_DS_TIMEOUT]) / 10; + mod_timer(&ax25_dev->dama.slave_timer, jiffies + HZ); +} + +/* + * DAMA Slave Timeout + * Silently discard all (slave) connections in case our master forgot us... + */ + +static void ax25_ds_timeout(unsigned long arg) +{ + ax25_dev *ax25_dev = (struct ax25_dev *) arg; + ax25_cb *ax25; + struct hlist_node *node; + + if (ax25_dev == NULL || !ax25_dev->dama.slave) + return; /* Yikes! */ + + if (!ax25_dev->dama.slave_timeout || --ax25_dev->dama.slave_timeout) { + ax25_ds_set_timer(ax25_dev); + return; + } + + spin_lock(&ax25_list_lock); + ax25_for_each(ax25, node, &ax25_list) { + if (ax25->ax25_dev != ax25_dev || !(ax25->condition & AX25_COND_DAMA_MODE)) + continue; + + ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); + ax25_disconnect(ax25, ETIMEDOUT); + } + spin_unlock(&ax25_list_lock); + + ax25_dev_dama_off(ax25_dev); +} + +void ax25_ds_heartbeat_expiry(ax25_cb *ax25) +{ + struct sock *sk=ax25->sk; + + if (sk) + bh_lock_sock(sk); + + switch (ax25->state) { + + case AX25_STATE_0: + /* Magic here: If we listen() and a new link dies before it + is accepted() it isn't 'dead' so doesn't get removed. */ + if (!sk || sock_flag(sk, SOCK_DESTROY) || + (sk->sk_state == TCP_LISTEN && + sock_flag(sk, SOCK_DEAD))) { + if (sk) { + sock_hold(sk); + ax25_destroy_socket(ax25); + bh_unlock_sock(sk); + sock_put(sk); + } else + ax25_destroy_socket(ax25); + return; + } + break; + + case AX25_STATE_3: + /* + * Check the state of the receive buffer. + */ + if (sk != NULL) { + if (atomic_read(&sk->sk_rmem_alloc) < + (sk->sk_rcvbuf >> 1) && + (ax25->condition & AX25_COND_OWN_RX_BUSY)) { + ax25->condition &= ~AX25_COND_OWN_RX_BUSY; + ax25->condition &= ~AX25_COND_ACK_PENDING; + break; + } + } + break; + } + + if (sk) + bh_unlock_sock(sk); + + ax25_start_heartbeat(ax25); +} + +/* dl1bke 960114: T3 works much like the IDLE timeout, but + * gets reloaded with every frame for this + * connection. + */ +void ax25_ds_t3timer_expiry(ax25_cb *ax25) +{ + ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); + ax25_dama_off(ax25); + ax25_disconnect(ax25, ETIMEDOUT); +} + +/* dl1bke 960228: close the connection when IDLE expires. + * unlike T3 this timer gets reloaded only on + * I frames. + */ +void ax25_ds_idletimer_expiry(ax25_cb *ax25) +{ + ax25_clear_queues(ax25); + + ax25->n2count = 0; + ax25->state = AX25_STATE_2; + + ax25_calculate_t1(ax25); + ax25_start_t1timer(ax25); + ax25_stop_t3timer(ax25); + + if (ax25->sk != NULL) { + bh_lock_sock(ax25->sk); + ax25->sk->sk_state = TCP_CLOSE; + ax25->sk->sk_err = 0; + ax25->sk->sk_shutdown |= SEND_SHUTDOWN; + if (!sock_flag(ax25->sk, SOCK_DEAD)) { + ax25->sk->sk_state_change(ax25->sk); + sock_set_flag(ax25->sk, SOCK_DEAD); + } + bh_unlock_sock(ax25->sk); + } +} + +/* dl1bke 960114: The DAMA protocol requires to send data and SABM/DISC + * within the poll of any connected channel. Remember + * that we are not allowed to send anything unless we + * get polled by the Master. + * + * Thus we'll have to do parts of our T1 handling in + * ax25_enquiry_response(). 
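+ * Note that on the N2th retry in state 1 we first fall back from SABME
+ * (modulo 128) to SABM before finally giving up and disconnecting.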
+ */ +void ax25_ds_t1_timeout(ax25_cb *ax25) +{ + switch (ax25->state) { + case AX25_STATE_1: + if (ax25->n2count == ax25->n2) { + if (ax25->modulus == AX25_MODULUS) { + ax25_disconnect(ax25, ETIMEDOUT); + return; + } else { + ax25->modulus = AX25_MODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW]; + ax25->n2count = 0; + ax25_send_control(ax25, AX25_SABM, AX25_POLLOFF, AX25_COMMAND); + } + } else { + ax25->n2count++; + if (ax25->modulus == AX25_MODULUS) + ax25_send_control(ax25, AX25_SABM, AX25_POLLOFF, AX25_COMMAND); + else + ax25_send_control(ax25, AX25_SABME, AX25_POLLOFF, AX25_COMMAND); + } + break; + + case AX25_STATE_2: + if (ax25->n2count == ax25->n2) { + ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); + ax25_disconnect(ax25, ETIMEDOUT); + return; + } else { + ax25->n2count++; + } + break; + + case AX25_STATE_3: + if (ax25->n2count == ax25->n2) { + ax25_send_control(ax25, AX25_DM, AX25_POLLON, AX25_RESPONSE); + ax25_disconnect(ax25, ETIMEDOUT); + return; + } else { + ax25->n2count++; + } + break; + } + + ax25_calculate_t1(ax25); + ax25_start_t1timer(ax25); +} diff --git a/net/ax25/ax25_iface.c b/net/ax25/ax25_iface.c new file mode 100644 index 00000000..60b545e2 --- /dev/null +++ b/net/ax25/ax25_iface.c @@ -0,0 +1,219 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct ax25_protocol *protocol_list; +static DEFINE_RWLOCK(protocol_list_lock); + +static HLIST_HEAD(ax25_linkfail_list); +static DEFINE_SPINLOCK(linkfail_lock); + +static struct listen_struct { + struct listen_struct *next; + ax25_address callsign; + struct net_device *dev; +} *listen_list = NULL; +static DEFINE_SPINLOCK(listen_lock); + +/* + * Do not register the internal protocols AX25_P_TEXT, AX25_P_SEGMENT, + * AX25_P_IP or AX25_P_ARP ... 
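+ * Those PIDs are already handled internally, by ax25_rx_iframe() and
+ * by the UI frame path in ax25_rcv().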
+ */ +void ax25_register_pid(struct ax25_protocol *ap) +{ + write_lock_bh(&protocol_list_lock); + ap->next = protocol_list; + protocol_list = ap; + write_unlock_bh(&protocol_list_lock); +} + +EXPORT_SYMBOL_GPL(ax25_register_pid); + +void ax25_protocol_release(unsigned int pid) +{ + struct ax25_protocol *protocol; + + write_lock_bh(&protocol_list_lock); + protocol = protocol_list; + if (protocol == NULL) + goto out; + + if (protocol->pid == pid) { + protocol_list = protocol->next; + goto out; + } + + while (protocol != NULL && protocol->next != NULL) { + if (protocol->next->pid == pid) { + protocol->next = protocol->next->next; + goto out; + } + + protocol = protocol->next; + } +out: + write_unlock_bh(&protocol_list_lock); +} + +EXPORT_SYMBOL(ax25_protocol_release); + +void ax25_linkfail_register(struct ax25_linkfail *lf) +{ + spin_lock_bh(&linkfail_lock); + hlist_add_head(&lf->lf_node, &ax25_linkfail_list); + spin_unlock_bh(&linkfail_lock); +} + +EXPORT_SYMBOL(ax25_linkfail_register); + +void ax25_linkfail_release(struct ax25_linkfail *lf) +{ + spin_lock_bh(&linkfail_lock); + hlist_del_init(&lf->lf_node); + spin_unlock_bh(&linkfail_lock); +} + +EXPORT_SYMBOL(ax25_linkfail_release); + +int ax25_listen_register(ax25_address *callsign, struct net_device *dev) +{ + struct listen_struct *listen; + + if (ax25_listen_mine(callsign, dev)) + return 0; + + if ((listen = kmalloc(sizeof(*listen), GFP_ATOMIC)) == NULL) + return -ENOMEM; + + listen->callsign = *callsign; + listen->dev = dev; + + spin_lock_bh(&listen_lock); + listen->next = listen_list; + listen_list = listen; + spin_unlock_bh(&listen_lock); + + return 0; +} + +EXPORT_SYMBOL(ax25_listen_register); + +void ax25_listen_release(ax25_address *callsign, struct net_device *dev) +{ + struct listen_struct *s, *listen; + + spin_lock_bh(&listen_lock); + listen = listen_list; + if (listen == NULL) { + spin_unlock_bh(&listen_lock); + return; + } + + if (ax25cmp(&listen->callsign, callsign) == 0 && listen->dev == dev) { + listen_list = listen->next; + spin_unlock_bh(&listen_lock); + kfree(listen); + return; + } + + while (listen != NULL && listen->next != NULL) { + if (ax25cmp(&listen->next->callsign, callsign) == 0 && listen->next->dev == dev) { + s = listen->next; + listen->next = listen->next->next; + spin_unlock_bh(&listen_lock); + kfree(s); + return; + } + + listen = listen->next; + } + spin_unlock_bh(&listen_lock); +} + +EXPORT_SYMBOL(ax25_listen_release); + +int (*ax25_protocol_function(unsigned int pid))(struct sk_buff *, ax25_cb *) +{ + int (*res)(struct sk_buff *, ax25_cb *) = NULL; + struct ax25_protocol *protocol; + + read_lock(&protocol_list_lock); + for (protocol = protocol_list; protocol != NULL; protocol = protocol->next) + if (protocol->pid == pid) { + res = protocol->func; + break; + } + read_unlock(&protocol_list_lock); + + return res; +} + +int ax25_listen_mine(ax25_address *callsign, struct net_device *dev) +{ + struct listen_struct *listen; + + spin_lock_bh(&listen_lock); + for (listen = listen_list; listen != NULL; listen = listen->next) + if (ax25cmp(&listen->callsign, callsign) == 0 && + (listen->dev == dev || listen->dev == NULL)) { + spin_unlock_bh(&listen_lock); + return 1; + } + spin_unlock_bh(&listen_lock); + + return 0; +} + +void ax25_link_failed(ax25_cb *ax25, int reason) +{ + struct ax25_linkfail *lf; + struct hlist_node *node; + + spin_lock_bh(&linkfail_lock); + hlist_for_each_entry(lf, node, &ax25_linkfail_list, lf_node) + lf->func(ax25, reason); + spin_unlock_bh(&linkfail_lock); +} + +int 
ax25_protocol_is_registered(unsigned int pid) +{ + struct ax25_protocol *protocol; + int res = 0; + + read_lock_bh(&protocol_list_lock); + for (protocol = protocol_list; protocol != NULL; protocol = protocol->next) + if (protocol->pid == pid) { + res = 1; + break; + } + read_unlock_bh(&protocol_list_lock); + + return res; +} diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c new file mode 100644 index 00000000..9bb77654 --- /dev/null +++ b/net/ax25/ax25_in.c @@ -0,0 +1,456 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) + * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Given a fragment, queue it on the fragment queue and if the fragment + * is complete, send it back to ax25_rx_iframe. + */ +static int ax25_rx_fragment(ax25_cb *ax25, struct sk_buff *skb) +{ + struct sk_buff *skbn, *skbo; + + if (ax25->fragno != 0) { + if (!(*skb->data & AX25_SEG_FIRST)) { + if ((ax25->fragno - 1) == (*skb->data & AX25_SEG_REM)) { + /* Enqueue fragment */ + ax25->fragno = *skb->data & AX25_SEG_REM; + skb_pull(skb, 1); /* skip fragno */ + ax25->fraglen += skb->len; + skb_queue_tail(&ax25->frag_queue, skb); + + /* Last fragment received ? */ + if (ax25->fragno == 0) { + skbn = alloc_skb(AX25_MAX_HEADER_LEN + + ax25->fraglen, + GFP_ATOMIC); + if (!skbn) { + skb_queue_purge(&ax25->frag_queue); + return 1; + } + + skb_reserve(skbn, AX25_MAX_HEADER_LEN); + + skbn->dev = ax25->ax25_dev->dev; + skb_reset_network_header(skbn); + skb_reset_transport_header(skbn); + + /* Copy data from the fragments */ + while ((skbo = skb_dequeue(&ax25->frag_queue)) != NULL) { + skb_copy_from_linear_data(skbo, + skb_put(skbn, skbo->len), + skbo->len); + kfree_skb(skbo); + } + + ax25->fraglen = 0; + + if (ax25_rx_iframe(ax25, skbn) == 0) + kfree_skb(skbn); + } + + return 1; + } + } + } else { + /* First fragment received */ + if (*skb->data & AX25_SEG_FIRST) { + skb_queue_purge(&ax25->frag_queue); + ax25->fragno = *skb->data & AX25_SEG_REM; + skb_pull(skb, 1); /* skip fragno */ + ax25->fraglen = skb->len; + skb_queue_tail(&ax25->frag_queue, skb); + return 1; + } + } + + return 0; +} + +/* + * This is where all valid I frames are sent to, to be dispatched to + * whichever protocol requires them. + */ +int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb) +{ + int (*func)(struct sk_buff *, ax25_cb *); + unsigned char pid; + int queued = 0; + + if (skb == NULL) return 0; + + ax25_start_idletimer(ax25); + + pid = *skb->data; + + if (pid == AX25_P_IP) { + /* working around a TCP bug to keep additional listeners + * happy. TCP re-uses the buffer and destroys the original + * content. 
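+ * So take a private copy before handing the frame to netif_rx(); if
+ * the copy fails we carry on with the original buffer.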
+ */ + struct sk_buff *skbn = skb_copy(skb, GFP_ATOMIC); + if (skbn != NULL) { + kfree_skb(skb); + skb = skbn; + } + + skb_pull(skb, 1); /* Remove PID */ + skb->mac_header = skb->network_header; + skb_reset_network_header(skb); + skb->dev = ax25->ax25_dev->dev; + skb->pkt_type = PACKET_HOST; + skb->protocol = htons(ETH_P_IP); + netif_rx(skb); + return 1; + } + if (pid == AX25_P_SEGMENT) { + skb_pull(skb, 1); /* Remove PID */ + return ax25_rx_fragment(ax25, skb); + } + + if ((func = ax25_protocol_function(pid)) != NULL) { + skb_pull(skb, 1); /* Remove PID */ + return (*func)(skb, ax25); + } + + if (ax25->sk != NULL && ax25->ax25_dev->values[AX25_VALUES_CONMODE] == 2) { + if ((!ax25->pidincl && ax25->sk->sk_protocol == pid) || + ax25->pidincl) { + if (sock_queue_rcv_skb(ax25->sk, skb) == 0) + queued = 1; + else + ax25->condition |= AX25_COND_OWN_RX_BUSY; + } + } + + return queued; +} + +/* + * Higher level upcall for a LAPB frame + */ +static int ax25_process_rx_frame(ax25_cb *ax25, struct sk_buff *skb, int type, int dama) +{ + int queued = 0; + + if (ax25->state == AX25_STATE_0) + return 0; + + switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { + case AX25_PROTO_STD_SIMPLEX: + case AX25_PROTO_STD_DUPLEX: + queued = ax25_std_frame_in(ax25, skb, type); + break; + +#ifdef CONFIG_AX25_DAMA_SLAVE + case AX25_PROTO_DAMA_SLAVE: + if (dama || ax25->ax25_dev->dama.slave) + queued = ax25_ds_frame_in(ax25, skb, type); + else + queued = ax25_std_frame_in(ax25, skb, type); + break; +#endif + } + + return queued; +} + +static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, + ax25_address *dev_addr, struct packet_type *ptype) +{ + ax25_address src, dest, *next_digi = NULL; + int type = 0, mine = 0, dama; + struct sock *make, *sk; + ax25_digi dp, reverse_dp; + ax25_cb *ax25; + ax25_dev *ax25_dev; + + /* + * Process the AX.25/LAPB frame. + */ + + skb_reset_transport_header(skb); + + if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) + goto free; + + /* + * Parse the address header. + */ + + if (ax25_addr_parse(skb->data, skb->len, &src, &dest, &dp, &type, &dama) == NULL) + goto free; + + /* + * Ours perhaps ? + */ + if (dp.lastrepeat + 1 < dp.ndigi) /* Not yet digipeated completely */ + next_digi = &dp.calls[dp.lastrepeat + 1]; + + /* + * Pull of the AX.25 headers leaving the CTRL/PID bytes + */ + skb_pull(skb, ax25_addr_size(&dp)); + + /* For our port addresses ? 
*/ + if (ax25cmp(&dest, dev_addr) == 0 && dp.lastrepeat + 1 == dp.ndigi) + mine = 1; + + /* Also match on any registered callsign from L3/4 */ + if (!mine && ax25_listen_mine(&dest, dev) && dp.lastrepeat + 1 == dp.ndigi) + mine = 1; + + /* UI frame - bypass LAPB processing */ + if ((*skb->data & ~0x10) == AX25_UI && dp.lastrepeat + 1 == dp.ndigi) { + skb_set_transport_header(skb, 2); /* skip control and pid */ + + ax25_send_to_raw(&dest, skb, skb->data[1]); + + if (!mine && ax25cmp(&dest, (ax25_address *)dev->broadcast) != 0) + goto free; + + /* Now we are pointing at the pid byte */ + switch (skb->data[1]) { + case AX25_P_IP: + skb_pull(skb,2); /* drop PID/CTRL */ + skb_reset_transport_header(skb); + skb_reset_network_header(skb); + skb->dev = dev; + skb->pkt_type = PACKET_HOST; + skb->protocol = htons(ETH_P_IP); + netif_rx(skb); + break; + + case AX25_P_ARP: + skb_pull(skb,2); + skb_reset_transport_header(skb); + skb_reset_network_header(skb); + skb->dev = dev; + skb->pkt_type = PACKET_HOST; + skb->protocol = htons(ETH_P_ARP); + netif_rx(skb); + break; + case AX25_P_TEXT: + /* Now find a suitable dgram socket */ + sk = ax25_get_socket(&dest, &src, SOCK_DGRAM); + if (sk != NULL) { + bh_lock_sock(sk); + if (atomic_read(&sk->sk_rmem_alloc) >= + sk->sk_rcvbuf) { + kfree_skb(skb); + } else { + /* + * Remove the control and PID. + */ + skb_pull(skb, 2); + if (sock_queue_rcv_skb(sk, skb) != 0) + kfree_skb(skb); + } + bh_unlock_sock(sk); + sock_put(sk); + } else { + kfree_skb(skb); + } + break; + + default: + kfree_skb(skb); /* Will scan SOCK_AX25 RAW sockets */ + break; + } + + return 0; + } + + /* + * Is connected mode supported on this device ? + * If not, should we DM the incoming frame (except DMs) or + * silently ignore them. For now we stay quiet. + */ + if (ax25_dev->values[AX25_VALUES_CONMODE] == 0) + goto free; + + /* LAPB */ + + /* AX.25 state 1-4 */ + + ax25_digi_invert(&dp, &reverse_dp); + + if ((ax25 = ax25_find_cb(&dest, &src, &reverse_dp, dev)) != NULL) { + /* + * Process the frame. If it is queued up internally it + * returns one otherwise we free it immediately. This + * routine itself wakes the user context layers so we do + * no further work + */ + if (ax25_process_rx_frame(ax25, skb, type, dama) == 0) + kfree_skb(skb); + + ax25_cb_put(ax25); + return 0; + } + + /* AX.25 state 0 (disconnected) */ + + /* a) received not a SABM(E) */ + + if ((*skb->data & ~AX25_PF) != AX25_SABM && + (*skb->data & ~AX25_PF) != AX25_SABME) { + /* + * Never reply to a DM. Also ignore any connects for + * addresses that are not our interfaces and not a socket. 
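+ * Otherwise answer with a DM so the far end learns that no link
+ * exists here for this frame.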
+ */ + if ((*skb->data & ~AX25_PF) != AX25_DM && mine) + ax25_return_dm(dev, &src, &dest, &dp); + + goto free; + } + + /* b) received SABM(E) */ + + if (dp.lastrepeat + 1 == dp.ndigi) + sk = ax25_find_listener(&dest, 0, dev, SOCK_SEQPACKET); + else + sk = ax25_find_listener(next_digi, 1, dev, SOCK_SEQPACKET); + + if (sk != NULL) { + bh_lock_sock(sk); + if (sk_acceptq_is_full(sk) || + (make = ax25_make_new(sk, ax25_dev)) == NULL) { + if (mine) + ax25_return_dm(dev, &src, &dest, &dp); + kfree_skb(skb); + bh_unlock_sock(sk); + sock_put(sk); + + return 0; + } + + ax25 = ax25_sk(make); + skb_set_owner_r(skb, make); + skb_queue_head(&sk->sk_receive_queue, skb); + + make->sk_state = TCP_ESTABLISHED; + + sk->sk_ack_backlog++; + bh_unlock_sock(sk); + } else { + if (!mine) + goto free; + + if ((ax25 = ax25_create_cb()) == NULL) { + ax25_return_dm(dev, &src, &dest, &dp); + goto free; + } + + ax25_fillin_cb(ax25, ax25_dev); + } + + ax25->source_addr = dest; + ax25->dest_addr = src; + + /* + * Sort out any digipeated paths. + */ + if (dp.ndigi && !ax25->digipeat && + (ax25->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { + kfree_skb(skb); + ax25_destroy_socket(ax25); + if (sk) + sock_put(sk); + return 0; + } + + if (dp.ndigi == 0) { + kfree(ax25->digipeat); + ax25->digipeat = NULL; + } else { + /* Reverse the source SABM's path */ + memcpy(ax25->digipeat, &reverse_dp, sizeof(ax25_digi)); + } + + if ((*skb->data & ~AX25_PF) == AX25_SABME) { + ax25->modulus = AX25_EMODULUS; + ax25->window = ax25_dev->values[AX25_VALUES_EWINDOW]; + } else { + ax25->modulus = AX25_MODULUS; + ax25->window = ax25_dev->values[AX25_VALUES_WINDOW]; + } + + ax25_send_control(ax25, AX25_UA, AX25_POLLON, AX25_RESPONSE); + +#ifdef CONFIG_AX25_DAMA_SLAVE + if (dama && ax25->ax25_dev->values[AX25_VALUES_PROTOCOL] == AX25_PROTO_DAMA_SLAVE) + ax25_dama_on(ax25); +#endif + + ax25->state = AX25_STATE_3; + + ax25_cb_add(ax25); + + ax25_start_heartbeat(ax25); + ax25_start_t3timer(ax25); + ax25_start_idletimer(ax25); + + if (sk) { + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, skb->len); + sock_put(sk); + } else { +free: + kfree_skb(skb); + } + return 0; +} + +/* + * Receive an AX.25 frame via a SLIP interface. + */ +int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *ptype, struct net_device *orig_dev) +{ + skb_orphan(skb); + + if (!net_eq(dev_net(dev), &init_net)) { + kfree_skb(skb); + return 0; + } + + if ((*skb->data & 0x0F) != 0) { + kfree_skb(skb); /* Not a KISS data frame */ + return 0; + } + + skb_pull(skb, AX25_KISS_HEADER_LEN); /* Remove the KISS byte */ + + return ax25_rcv(skb, dev, (ax25_address *)dev->dev_addr, ptype); +} diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c new file mode 100644 index 00000000..cf0c47a2 --- /dev/null +++ b/net/ax25/ax25_ip.c @@ -0,0 +1,243 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For TIOCINQ/OUTQ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * IP over AX.25 encapsulation. 
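+ * Datagrams normally go out as UI frames; ax25_rebuild_header() below
+ * can instead open a virtual circuit when the route or the device's
+ * IPDEFMODE value asks for it.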
+ */ + +/* + * Shove an AX.25 UI header on an IP packet and handle ARP + */ + +#ifdef CONFIG_INET + +int ax25_hard_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, const void *daddr, + const void *saddr, unsigned len) +{ + unsigned char *buff; + + /* they sometimes come back to us... */ + if (type == ETH_P_AX25) + return 0; + + /* header is an AX.25 UI frame from us to them */ + buff = skb_push(skb, AX25_HEADER_LEN); + *buff++ = 0x00; /* KISS DATA */ + + if (daddr != NULL) + memcpy(buff, daddr, dev->addr_len); /* Address specified */ + + buff[6] &= ~AX25_CBIT; + buff[6] &= ~AX25_EBIT; + buff[6] |= AX25_SSSID_SPARE; + buff += AX25_ADDR_LEN; + + if (saddr != NULL) + memcpy(buff, saddr, dev->addr_len); + else + memcpy(buff, dev->dev_addr, dev->addr_len); + + buff[6] &= ~AX25_CBIT; + buff[6] |= AX25_EBIT; + buff[6] |= AX25_SSSID_SPARE; + buff += AX25_ADDR_LEN; + + *buff++ = AX25_UI; /* UI */ + + /* Append a suitable AX.25 PID */ + switch (type) { + case ETH_P_IP: + *buff++ = AX25_P_IP; + break; + case ETH_P_ARP: + *buff++ = AX25_P_ARP; + break; + default: + printk(KERN_ERR "AX.25: ax25_hard_header - wrong protocol type 0x%2.2x\n", type); + *buff++ = 0; + break; + } + + if (daddr != NULL) + return AX25_HEADER_LEN; + + return -AX25_HEADER_LEN; /* Unfinished header */ +} + +int ax25_rebuild_header(struct sk_buff *skb) +{ + struct sk_buff *ourskb; + unsigned char *bp = skb->data; + ax25_route *route; + struct net_device *dev = NULL; + ax25_address *src, *dst; + ax25_digi *digipeat = NULL; + ax25_dev *ax25_dev; + ax25_cb *ax25; + char ip_mode = ' '; + + dst = (ax25_address *)(bp + 1); + src = (ax25_address *)(bp + 8); + + if (arp_find(bp + 1, skb)) + return 1; + + route = ax25_get_route(dst, NULL); + if (route) { + digipeat = route->digipeat; + dev = route->dev; + ip_mode = route->ip_mode; + } + + if (dev == NULL) + dev = skb->dev; + + if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) { + goto put; + } + + if (bp[16] == AX25_P_IP) { + if (ip_mode == 'V' || (ip_mode == ' ' && ax25_dev->values[AX25_VALUES_IPDEFMODE])) { + /* + * We copy the buffer and release the original thereby + * keeping it straight + * + * Note: we report 1 back so the caller will + * not feed the frame direct to the physical device + * We don't want that to happen. (It won't be upset + * as we have pulled the frame from the queue by + * freeing it). + * + * NB: TCP modifies buffers that are still + * on a device queue, thus we use skb_copy() + * instead of using skb_clone() unless this + * gets fixed. + */ + + ax25_address src_c; + ax25_address dst_c; + + if ((ourskb = skb_copy(skb, GFP_ATOMIC)) == NULL) { + kfree_skb(skb); + goto put; + } + + if (skb->sk != NULL) + skb_set_owner_w(ourskb, skb->sk); + + kfree_skb(skb); + /* dl9sau: bugfix + * after kfree_skb(), dst and src which were pointer + * to bp which is part of skb->data would not be valid + * anymore hope that after skb_pull(ourskb, ..) 
our + * dsc_c and src_c will not become invalid + */ + bp = ourskb->data; + dst_c = *(ax25_address *)(bp + 1); + src_c = *(ax25_address *)(bp + 8); + + skb_pull(ourskb, AX25_HEADER_LEN - 1); /* Keep PID */ + skb_reset_network_header(ourskb); + + ax25=ax25_send_frame( + ourskb, + ax25_dev->values[AX25_VALUES_PACLEN], + &src_c, + &dst_c, digipeat, dev); + if (ax25) { + ax25_cb_put(ax25); + } + goto put; + } + } + + bp[7] &= ~AX25_CBIT; + bp[7] &= ~AX25_EBIT; + bp[7] |= AX25_SSSID_SPARE; + + bp[14] &= ~AX25_CBIT; + bp[14] |= AX25_EBIT; + bp[14] |= AX25_SSSID_SPARE; + + skb_pull(skb, AX25_KISS_HEADER_LEN); + + if (digipeat != NULL) { + if ((ourskb = ax25_rt_build_path(skb, src, dst, route->digipeat)) == NULL) { + kfree_skb(skb); + goto put; + } + + skb = ourskb; + } + + ax25_queue_xmit(skb, dev); + +put: + if (route) + ax25_put_route(route); + + return 1; +} + +#else /* INET */ + +int ax25_hard_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, const void *daddr, + const void *saddr, unsigned len) +{ + return -AX25_HEADER_LEN; +} + +int ax25_rebuild_header(struct sk_buff *skb) +{ + return 1; +} + +#endif + +const struct header_ops ax25_header_ops = { + .create = ax25_hard_header, + .rebuild = ax25_rebuild_header, +}; + +EXPORT_SYMBOL(ax25_hard_header); +EXPORT_SYMBOL(ax25_rebuild_header); +EXPORT_SYMBOL(ax25_header_ops); + diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c new file mode 100644 index 00000000..37507d80 --- /dev/null +++ b/net/ax25/ax25_out.c @@ -0,0 +1,399 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(ax25_frag_lock); + +ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, ax25_address *src, ax25_address *dest, ax25_digi *digi, struct net_device *dev) +{ + ax25_dev *ax25_dev; + ax25_cb *ax25; + + /* + * Take the default packet length for the device if zero is + * specified. + */ + if (paclen == 0) { + if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) + return NULL; + + paclen = ax25_dev->values[AX25_VALUES_PACLEN]; + } + + /* + * Look for an existing connection. 
+ */ + if ((ax25 = ax25_find_cb(src, dest, digi, dev)) != NULL) { + ax25_output(ax25, paclen, skb); + return ax25; /* It already existed */ + } + + if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) + return NULL; + + if ((ax25 = ax25_create_cb()) == NULL) + return NULL; + + ax25_fillin_cb(ax25, ax25_dev); + + ax25->source_addr = *src; + ax25->dest_addr = *dest; + + if (digi != NULL) { + ax25->digipeat = kmemdup(digi, sizeof(*digi), GFP_ATOMIC); + if (ax25->digipeat == NULL) { + ax25_cb_put(ax25); + return NULL; + } + } + + switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { + case AX25_PROTO_STD_SIMPLEX: + case AX25_PROTO_STD_DUPLEX: + ax25_std_establish_data_link(ax25); + break; + +#ifdef CONFIG_AX25_DAMA_SLAVE + case AX25_PROTO_DAMA_SLAVE: + if (ax25_dev->dama.slave) + ax25_ds_establish_data_link(ax25); + else + ax25_std_establish_data_link(ax25); + break; +#endif + } + + /* + * There is one ref for the state machine; a caller needs + * one more to put it back, just like with the existing one. + */ + ax25_cb_hold(ax25); + + ax25_cb_add(ax25); + + ax25->state = AX25_STATE_1; + + ax25_start_heartbeat(ax25); + + ax25_output(ax25, paclen, skb); + + return ax25; /* We had to create it */ +} + +EXPORT_SYMBOL(ax25_send_frame); + +/* + * All outgoing AX.25 I frames pass via this routine. Therefore this is + * where the fragmentation of frames takes place. If fragment is set to + * zero then we are not allowed to do fragmentation, even if the frame + * is too large. + */ +void ax25_output(ax25_cb *ax25, int paclen, struct sk_buff *skb) +{ + struct sk_buff *skbn; + unsigned char *p; + int frontlen, len, fragno, ka9qfrag, first = 1; + + if (paclen < 16) { + WARN_ON_ONCE(1); + kfree_skb(skb); + return; + } + + if ((skb->len - 1) > paclen) { + if (*skb->data == AX25_P_TEXT) { + skb_pull(skb, 1); /* skip PID */ + ka9qfrag = 0; + } else { + paclen -= 2; /* Allow for fragment control info */ + ka9qfrag = 1; + } + + fragno = skb->len / paclen; + if (skb->len % paclen == 0) fragno--; + + frontlen = skb_headroom(skb); /* Address space + CTRL */ + + while (skb->len > 0) { + spin_lock_bh(&ax25_frag_lock); + if ((skbn = alloc_skb(paclen + 2 + frontlen, GFP_ATOMIC)) == NULL) { + spin_unlock_bh(&ax25_frag_lock); + printk(KERN_CRIT "AX.25: ax25_output - out of memory\n"); + return; + } + + if (skb->sk != NULL) + skb_set_owner_w(skbn, skb->sk); + + spin_unlock_bh(&ax25_frag_lock); + + len = (paclen > skb->len) ? skb->len : paclen; + + if (ka9qfrag == 1) { + skb_reserve(skbn, frontlen + 2); + skb_set_network_header(skbn, + skb_network_offset(skb)); + skb_copy_from_linear_data(skb, skb_put(skbn, len), len); + p = skb_push(skbn, 2); + + *p++ = AX25_P_SEGMENT; + + *p = fragno--; + if (first) { + *p |= AX25_SEG_FIRST; + first = 0; + } + } else { + skb_reserve(skbn, frontlen + 1); + skb_set_network_header(skbn, + skb_network_offset(skb)); + skb_copy_from_linear_data(skb, skb_put(skbn, len), len); + p = skb_push(skbn, 1); + *p = AX25_P_TEXT; + } + + skb_pull(skb, len); + skb_queue_tail(&ax25->write_queue, skbn); /* Throw it on the queue */ + } + + kfree_skb(skb); + } else { + skb_queue_tail(&ax25->write_queue, skb); /* Throw it on the queue */ + } + + switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { + case AX25_PROTO_STD_SIMPLEX: + case AX25_PROTO_STD_DUPLEX: + ax25_kick(ax25); + break; + +#ifdef CONFIG_AX25_DAMA_SLAVE + /* + * A DAMA slave is _required_ to work as normal AX.25L2V2 + * if no DAMA master is available. 
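+ * While dama.slave is set, queued frames wait for the master's poll;
+ * otherwise they are kicked out immediately, just as in the standard
+ * modes above.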
+ */ + case AX25_PROTO_DAMA_SLAVE: + if (!ax25->ax25_dev->dama.slave) ax25_kick(ax25); + break; +#endif + } +} + +/* + * This procedure is passed a buffer descriptor for an iframe. It builds + * the rest of the control part of the frame and then writes it out. + */ +static void ax25_send_iframe(ax25_cb *ax25, struct sk_buff *skb, int poll_bit) +{ + unsigned char *frame; + + if (skb == NULL) + return; + + skb_reset_network_header(skb); + + if (ax25->modulus == AX25_MODULUS) { + frame = skb_push(skb, 1); + + *frame = AX25_I; + *frame |= (poll_bit) ? AX25_PF : 0; + *frame |= (ax25->vr << 5); + *frame |= (ax25->vs << 1); + } else { + frame = skb_push(skb, 2); + + frame[0] = AX25_I; + frame[0] |= (ax25->vs << 1); + frame[1] = (poll_bit) ? AX25_EPF : 0; + frame[1] |= (ax25->vr << 1); + } + + ax25_start_idletimer(ax25); + + ax25_transmit_buffer(ax25, skb, AX25_COMMAND); +} + +void ax25_kick(ax25_cb *ax25) +{ + struct sk_buff *skb, *skbn; + int last = 1; + unsigned short start, end, next; + + if (ax25->state != AX25_STATE_3 && ax25->state != AX25_STATE_4) + return; + + if (ax25->condition & AX25_COND_PEER_RX_BUSY) + return; + + if (skb_peek(&ax25->write_queue) == NULL) + return; + + start = (skb_peek(&ax25->ack_queue) == NULL) ? ax25->va : ax25->vs; + end = (ax25->va + ax25->window) % ax25->modulus; + + if (start == end) + return; + + /* + * Transmit data until either we're out of data to send or + * the window is full. Send a poll on the final I frame if + * the window is filled. + */ + + /* + * Dequeue the frame and copy it. + * Check for race with ax25_clear_queues(). + */ + skb = skb_dequeue(&ax25->write_queue); + if (!skb) + return; + + ax25->vs = start; + + do { + if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { + skb_queue_head(&ax25->write_queue, skb); + break; + } + + if (skb->sk != NULL) + skb_set_owner_w(skbn, skb->sk); + + next = (ax25->vs + 1) % ax25->modulus; + last = (next == end); + + /* + * Transmit the frame copy. + * bke 960114: do not set the Poll bit on the last frame + * in DAMA mode. + */ + switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { + case AX25_PROTO_STD_SIMPLEX: + case AX25_PROTO_STD_DUPLEX: + ax25_send_iframe(ax25, skbn, (last) ? AX25_POLLON : AX25_POLLOFF); + break; + +#ifdef CONFIG_AX25_DAMA_SLAVE + case AX25_PROTO_DAMA_SLAVE: + ax25_send_iframe(ax25, skbn, AX25_POLLOFF); + break; +#endif + } + + ax25->vs = next; + + /* + * Requeue the original data frame. 
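+ * Only the clone built above was transmitted; the original skb is
+ * held on the ack_queue until it is acknowledged by the peer
+ * (ax25_frames_acked) or requeued for retransmission
+ * (ax25_requeue_frames).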
+ */ + skb_queue_tail(&ax25->ack_queue, skb); + + } while (!last && (skb = skb_dequeue(&ax25->write_queue)) != NULL); + + ax25->condition &= ~AX25_COND_ACK_PENDING; + + if (!ax25_t1timer_running(ax25)) { + ax25_stop_t3timer(ax25); + ax25_calculate_t1(ax25); + ax25_start_t1timer(ax25); + } +} + +void ax25_transmit_buffer(ax25_cb *ax25, struct sk_buff *skb, int type) +{ + struct sk_buff *skbn; + unsigned char *ptr; + int headroom; + + if (ax25->ax25_dev == NULL) { + ax25_disconnect(ax25, ENETUNREACH); + return; + } + + headroom = ax25_addr_size(ax25->digipeat); + + if (skb_headroom(skb) < headroom) { + if ((skbn = skb_realloc_headroom(skb, headroom)) == NULL) { + printk(KERN_CRIT "AX.25: ax25_transmit_buffer - out of memory\n"); + kfree_skb(skb); + return; + } + + if (skb->sk != NULL) + skb_set_owner_w(skbn, skb->sk); + + kfree_skb(skb); + skb = skbn; + } + + ptr = skb_push(skb, headroom); + + ax25_addr_build(ptr, &ax25->source_addr, &ax25->dest_addr, ax25->digipeat, type, ax25->modulus); + + ax25_queue_xmit(skb, ax25->ax25_dev->dev); +} + +/* + * A small shim to dev_queue_xmit to add the KISS control byte, and do + * any packet forwarding in operation. + */ +void ax25_queue_xmit(struct sk_buff *skb, struct net_device *dev) +{ + unsigned char *ptr; + + skb->protocol = ax25_type_trans(skb, ax25_fwd_dev(dev)); + + ptr = skb_push(skb, 1); + *ptr = 0x00; /* KISS */ + + dev_queue_xmit(skb); +} + +int ax25_check_iframes_acked(ax25_cb *ax25, unsigned short nr) +{ + if (ax25->vs == nr) { + ax25_frames_acked(ax25, nr); + ax25_calculate_rtt(ax25); + ax25_stop_t1timer(ax25); + ax25_start_t3timer(ax25); + return 1; + } else { + if (ax25->va != nr) { + ax25_frames_acked(ax25, nr); + ax25_calculate_t1(ax25); + ax25_start_t1timer(ax25); + return 1; + } + } + return 0; +} + diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c new file mode 100644 index 00000000..a1690845 --- /dev/null +++ b/net/ax25/ax25_route.c @@ -0,0 +1,505 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Steven Whitehouse GW7RRM (stevew@acm.org) + * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) + * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de) + * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static ax25_route *ax25_route_list; +static DEFINE_RWLOCK(ax25_route_lock); + +void ax25_rt_device_down(struct net_device *dev) +{ + ax25_route *s, *t, *ax25_rt; + + write_lock_bh(&ax25_route_lock); + ax25_rt = ax25_route_list; + while (ax25_rt != NULL) { + s = ax25_rt; + ax25_rt = ax25_rt->next; + + if (s->dev == dev) { + if (ax25_route_list == s) { + ax25_route_list = s->next; + kfree(s->digipeat); + kfree(s); + } else { + for (t = ax25_route_list; t != NULL; t = t->next) { + if (t->next == s) { + t->next = s->next; + kfree(s->digipeat); + kfree(s); + break; + } + } + } + } + } + write_unlock_bh(&ax25_route_lock); +} + +static int __must_check ax25_rt_add(struct ax25_routes_struct *route) +{ + ax25_route *ax25_rt; + ax25_dev *ax25_dev; + int i; + + if ((ax25_dev = ax25_addr_ax25dev(&route->port_addr)) == NULL) + return -EINVAL; + if (route->digi_count > AX25_MAX_DIGIS) + return -EINVAL; + + write_lock_bh(&ax25_route_lock); + + ax25_rt = ax25_route_list; + while (ax25_rt != NULL) { + if (ax25cmp(&ax25_rt->callsign, &route->dest_addr) == 0 && + ax25_rt->dev == ax25_dev->dev) { + kfree(ax25_rt->digipeat); + ax25_rt->digipeat = NULL; + if (route->digi_count != 0) { + if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { + write_unlock_bh(&ax25_route_lock); + return -ENOMEM; + } + ax25_rt->digipeat->lastrepeat = -1; + ax25_rt->digipeat->ndigi = route->digi_count; + for (i = 0; i < route->digi_count; i++) { + ax25_rt->digipeat->repeated[i] = 0; + ax25_rt->digipeat->calls[i] = route->digi_addr[i]; + } + } + write_unlock_bh(&ax25_route_lock); + return 0; + } + ax25_rt = ax25_rt->next; + } + + if ((ax25_rt = kmalloc(sizeof(ax25_route), GFP_ATOMIC)) == NULL) { + write_unlock_bh(&ax25_route_lock); + return -ENOMEM; + } + + atomic_set(&ax25_rt->refcount, 1); + ax25_rt->callsign = route->dest_addr; + ax25_rt->dev = ax25_dev->dev; + ax25_rt->digipeat = NULL; + ax25_rt->ip_mode = ' '; + if (route->digi_count != 0) { + if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { + write_unlock_bh(&ax25_route_lock); + kfree(ax25_rt); + return -ENOMEM; + } + ax25_rt->digipeat->lastrepeat = -1; + ax25_rt->digipeat->ndigi = route->digi_count; + for (i = 0; i < route->digi_count; i++) { + ax25_rt->digipeat->repeated[i] = 0; + ax25_rt->digipeat->calls[i] = route->digi_addr[i]; + } + } + ax25_rt->next = ax25_route_list; + ax25_route_list = ax25_rt; + write_unlock_bh(&ax25_route_lock); + + return 0; +} + +void __ax25_put_route(ax25_route *ax25_rt) +{ + kfree(ax25_rt->digipeat); + kfree(ax25_rt); +} + +static int ax25_rt_del(struct ax25_routes_struct *route) +{ + ax25_route *s, *t, *ax25_rt; + ax25_dev *ax25_dev; + + if ((ax25_dev = ax25_addr_ax25dev(&route->port_addr)) == NULL) + return -EINVAL; + + write_lock_bh(&ax25_route_lock); + + ax25_rt = ax25_route_list; + while (ax25_rt != NULL) { + s = ax25_rt; + ax25_rt = ax25_rt->next; + if 
(s->dev == ax25_dev->dev && + ax25cmp(&route->dest_addr, &s->callsign) == 0) { + if (ax25_route_list == s) { + ax25_route_list = s->next; + ax25_put_route(s); + } else { + for (t = ax25_route_list; t != NULL; t = t->next) { + if (t->next == s) { + t->next = s->next; + ax25_put_route(s); + break; + } + } + } + } + } + write_unlock_bh(&ax25_route_lock); + + return 0; +} + +static int ax25_rt_opt(struct ax25_route_opt_struct *rt_option) +{ + ax25_route *ax25_rt; + ax25_dev *ax25_dev; + int err = 0; + + if ((ax25_dev = ax25_addr_ax25dev(&rt_option->port_addr)) == NULL) + return -EINVAL; + + write_lock_bh(&ax25_route_lock); + + ax25_rt = ax25_route_list; + while (ax25_rt != NULL) { + if (ax25_rt->dev == ax25_dev->dev && + ax25cmp(&rt_option->dest_addr, &ax25_rt->callsign) == 0) { + switch (rt_option->cmd) { + case AX25_SET_RT_IPMODE: + switch (rt_option->arg) { + case ' ': + case 'D': + case 'V': + ax25_rt->ip_mode = rt_option->arg; + break; + default: + err = -EINVAL; + goto out; + } + break; + default: + err = -EINVAL; + goto out; + } + } + ax25_rt = ax25_rt->next; + } + +out: + write_unlock_bh(&ax25_route_lock); + return err; +} + +int ax25_rt_ioctl(unsigned int cmd, void __user *arg) +{ + struct ax25_route_opt_struct rt_option; + struct ax25_routes_struct route; + + switch (cmd) { + case SIOCADDRT: + if (copy_from_user(&route, arg, sizeof(route))) + return -EFAULT; + return ax25_rt_add(&route); + + case SIOCDELRT: + if (copy_from_user(&route, arg, sizeof(route))) + return -EFAULT; + return ax25_rt_del(&route); + + case SIOCAX25OPTRT: + if (copy_from_user(&rt_option, arg, sizeof(rt_option))) + return -EFAULT; + return ax25_rt_opt(&rt_option); + + default: + return -EINVAL; + } +} + +#ifdef CONFIG_PROC_FS + +static void *ax25_rt_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(ax25_route_lock) +{ + struct ax25_route *ax25_rt; + int i = 1; + + read_lock(&ax25_route_lock); + if (*pos == 0) + return SEQ_START_TOKEN; + + for (ax25_rt = ax25_route_list; ax25_rt != NULL; ax25_rt = ax25_rt->next) { + if (i == *pos) + return ax25_rt; + ++i; + } + + return NULL; +} + +static void *ax25_rt_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return (v == SEQ_START_TOKEN) ? ax25_route_list : + ((struct ax25_route *) v)->next; +} + +static void ax25_rt_seq_stop(struct seq_file *seq, void *v) + __releases(ax25_route_lock) +{ + read_unlock(&ax25_route_lock); +} + +static int ax25_rt_seq_show(struct seq_file *seq, void *v) +{ + char buf[11]; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, "callsign dev mode digipeaters\n"); + else { + struct ax25_route *ax25_rt = v; + const char *callsign; + int i; + + if (ax25cmp(&ax25_rt->callsign, &null_ax25_address) == 0) + callsign = "default"; + else + callsign = ax2asc(buf, &ax25_rt->callsign); + + seq_printf(seq, "%-9s %-4s", + callsign, + ax25_rt->dev ? 
ax25_rt->dev->name : "???"); + + switch (ax25_rt->ip_mode) { + case 'V': + seq_puts(seq, " vc"); + break; + case 'D': + seq_puts(seq, " dg"); + break; + default: + seq_puts(seq, " *"); + break; + } + + if (ax25_rt->digipeat != NULL) + for (i = 0; i < ax25_rt->digipeat->ndigi; i++) + seq_printf(seq, " %s", + ax2asc(buf, &ax25_rt->digipeat->calls[i])); + + seq_puts(seq, "\n"); + } + return 0; +} + +static const struct seq_operations ax25_rt_seqops = { + .start = ax25_rt_seq_start, + .next = ax25_rt_seq_next, + .stop = ax25_rt_seq_stop, + .show = ax25_rt_seq_show, +}; + +static int ax25_rt_info_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ax25_rt_seqops); +} + +const struct file_operations ax25_route_fops = { + .owner = THIS_MODULE, + .open = ax25_rt_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif + +/* + * Find AX.25 route + * + * Only routes with a reference count of zero can be destroyed. + */ +ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev) +{ + ax25_route *ax25_spe_rt = NULL; + ax25_route *ax25_def_rt = NULL; + ax25_route *ax25_rt; + + read_lock(&ax25_route_lock); + /* + * Bind to the physical interface we heard them on, or the default + * route if none is found; + */ + for (ax25_rt = ax25_route_list; ax25_rt != NULL; ax25_rt = ax25_rt->next) { + if (dev == NULL) { + if (ax25cmp(&ax25_rt->callsign, addr) == 0 && ax25_rt->dev != NULL) + ax25_spe_rt = ax25_rt; + if (ax25cmp(&ax25_rt->callsign, &null_ax25_address) == 0 && ax25_rt->dev != NULL) + ax25_def_rt = ax25_rt; + } else { + if (ax25cmp(&ax25_rt->callsign, addr) == 0 && ax25_rt->dev == dev) + ax25_spe_rt = ax25_rt; + if (ax25cmp(&ax25_rt->callsign, &null_ax25_address) == 0 && ax25_rt->dev == dev) + ax25_def_rt = ax25_rt; + } + } + + ax25_rt = ax25_def_rt; + if (ax25_spe_rt != NULL) + ax25_rt = ax25_spe_rt; + + if (ax25_rt != NULL) + ax25_hold_route(ax25_rt); + + read_unlock(&ax25_route_lock); + + return ax25_rt; +} + +/* + * Adjust path: If you specify a default route and want to connect + * a target on the digipeater path but w/o having a special route + * set before, the path has to be truncated from your target on. + */ +static inline void ax25_adjust_path(ax25_address *addr, ax25_digi *digipeat) +{ + int k; + + for (k = 0; k < digipeat->ndigi; k++) { + if (ax25cmp(addr, &digipeat->calls[k]) == 0) + break; + } + + digipeat->ndigi = k; +} + + +/* + * Find which interface to use. 
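+ * A route for the destination is looked up, the control block is
+ * bound to that device, the source callsign is taken from the
+ * caller's UID mapping (or from the device address), and any
+ * digipeater path on the route is copied and truncated with
+ * ax25_adjust_path().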
+ */ +int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr) +{ + ax25_uid_assoc *user; + ax25_route *ax25_rt; + int err = 0; + + if ((ax25_rt = ax25_get_route(addr, NULL)) == NULL) + return -EHOSTUNREACH; + + if ((ax25->ax25_dev = ax25_dev_ax25dev(ax25_rt->dev)) == NULL) { + err = -EHOSTUNREACH; + goto put; + } + + user = ax25_findbyuid(current_euid()); + if (user) { + ax25->source_addr = user->call; + ax25_uid_put(user); + } else { + if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) { + err = -EPERM; + goto put; + } + ax25->source_addr = *(ax25_address *)ax25->ax25_dev->dev->dev_addr; + } + + if (ax25_rt->digipeat != NULL) { + ax25->digipeat = kmemdup(ax25_rt->digipeat, sizeof(ax25_digi), + GFP_ATOMIC); + if (ax25->digipeat == NULL) { + err = -ENOMEM; + goto put; + } + ax25_adjust_path(addr, ax25->digipeat); + } + + if (ax25->sk != NULL) { + bh_lock_sock(ax25->sk); + sock_reset_flag(ax25->sk, SOCK_ZAPPED); + bh_unlock_sock(ax25->sk); + } + +put: + ax25_put_route(ax25_rt); + + return err; +} + +struct sk_buff *ax25_rt_build_path(struct sk_buff *skb, ax25_address *src, + ax25_address *dest, ax25_digi *digi) +{ + struct sk_buff *skbn; + unsigned char *bp; + int len; + + len = digi->ndigi * AX25_ADDR_LEN; + + if (skb_headroom(skb) < len) { + if ((skbn = skb_realloc_headroom(skb, len)) == NULL) { + printk(KERN_CRIT "AX.25: ax25_dg_build_path - out of memory\n"); + return NULL; + } + + if (skb->sk != NULL) + skb_set_owner_w(skbn, skb->sk); + + kfree_skb(skb); + + skb = skbn; + } + + bp = skb_push(skb, len); + + ax25_addr_build(bp, src, dest, digi, AX25_COMMAND, AX25_MODULUS); + + return skb; +} + +/* + * Free all memory associated with routing structures. + */ +void __exit ax25_rt_free(void) +{ + ax25_route *s, *ax25_rt = ax25_route_list; + + write_lock_bh(&ax25_route_lock); + while (ax25_rt != NULL) { + s = ax25_rt; + ax25_rt = ax25_rt->next; + + kfree(s->digipeat); + kfree(s); + } + write_unlock_bh(&ax25_route_lock); +} diff --git a/net/ax25/ax25_std_in.c b/net/ax25/ax25_std_in.c new file mode 100644 index 00000000..a8eef88d --- /dev/null +++ b/net/ax25/ax25_std_in.c @@ -0,0 +1,447 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) + * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de) + * + * Most of this code is based on the SDL diagrams published in the 7th ARRL + * Computer Networking Conference papers. The diagrams have mistakes in them, + * but are mostly correct. Before you modify the code could you read the SDL + * diagrams as the code is not obvious and probably very easy to break. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * State machine for state 1, Awaiting Connection State. + * The handling of the timer(s) is in file ax25_std_timer.c. + * Handling of state 0 and connection release is in ax25.c. 
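+ * State 1 means we have sent SABM/SABME and are waiting for the
+ * peer's UA; a UA with the F bit set completes the connection,
+ * while a DM either refuses it or drops an SABME attempt back to
+ * standard (modulo 8) operation.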
+ */ +static int ax25_std_state1_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int pf, int type) +{ + switch (frametype) { + case AX25_SABM: + ax25->modulus = AX25_MODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW]; + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + break; + + case AX25_SABME: + ax25->modulus = AX25_EMODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_EWINDOW]; + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + break; + + case AX25_DISC: + ax25_send_control(ax25, AX25_DM, pf, AX25_RESPONSE); + break; + + case AX25_UA: + if (pf) { + ax25_calculate_rtt(ax25); + ax25_stop_t1timer(ax25); + ax25_start_t3timer(ax25); + ax25_start_idletimer(ax25); + ax25->vs = 0; + ax25->va = 0; + ax25->vr = 0; + ax25->state = AX25_STATE_3; + ax25->n2count = 0; + if (ax25->sk != NULL) { + bh_lock_sock(ax25->sk); + ax25->sk->sk_state = TCP_ESTABLISHED; + /* For WAIT_SABM connections we will produce an accept ready socket here */ + if (!sock_flag(ax25->sk, SOCK_DEAD)) + ax25->sk->sk_state_change(ax25->sk); + bh_unlock_sock(ax25->sk); + } + } + break; + + case AX25_DM: + if (pf) { + if (ax25->modulus == AX25_MODULUS) { + ax25_disconnect(ax25, ECONNREFUSED); + } else { + ax25->modulus = AX25_MODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW]; + } + } + break; + + default: + break; + } + + return 0; +} + +/* + * State machine for state 2, Awaiting Release State. + * The handling of the timer(s) is in file ax25_std_timer.c + * Handling of state 0 and connection release is in ax25.c. + */ +static int ax25_std_state2_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int pf, int type) +{ + switch (frametype) { + case AX25_SABM: + case AX25_SABME: + ax25_send_control(ax25, AX25_DM, pf, AX25_RESPONSE); + break; + + case AX25_DISC: + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + ax25_disconnect(ax25, 0); + break; + + case AX25_DM: + case AX25_UA: + if (pf) + ax25_disconnect(ax25, 0); + break; + + case AX25_I: + case AX25_REJ: + case AX25_RNR: + case AX25_RR: + if (pf) ax25_send_control(ax25, AX25_DM, AX25_POLLON, AX25_RESPONSE); + break; + + default: + break; + } + + return 0; +} + +/* + * State machine for state 3, Connected State. + * The handling of the timer(s) is in file ax25_std_timer.c + * Handling of state 0 and connection release is in ax25.c. 
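+ * State 3 is the steady data transfer state: I frames are accepted
+ * in sequence and acknowledged, RR/RNR/REJ update the peer-busy
+ * condition and the acknowledgement state, and an invalid N(R)
+ * forces the link to be re-established.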
+ */ +static int ax25_std_state3_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int ns, int nr, int pf, int type) +{ + int queued = 0; + + switch (frametype) { + case AX25_SABM: + case AX25_SABME: + if (frametype == AX25_SABM) { + ax25->modulus = AX25_MODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW]; + } else { + ax25->modulus = AX25_EMODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_EWINDOW]; + } + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + ax25_stop_t1timer(ax25); + ax25_stop_t2timer(ax25); + ax25_start_t3timer(ax25); + ax25_start_idletimer(ax25); + ax25->condition = 0x00; + ax25->vs = 0; + ax25->va = 0; + ax25->vr = 0; + ax25_requeue_frames(ax25); + break; + + case AX25_DISC: + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + ax25_disconnect(ax25, 0); + break; + + case AX25_DM: + ax25_disconnect(ax25, ECONNRESET); + break; + + case AX25_RR: + case AX25_RNR: + if (frametype == AX25_RR) + ax25->condition &= ~AX25_COND_PEER_RX_BUSY; + else + ax25->condition |= AX25_COND_PEER_RX_BUSY; + if (type == AX25_COMMAND && pf) + ax25_std_enquiry_response(ax25); + if (ax25_validate_nr(ax25, nr)) { + ax25_check_iframes_acked(ax25, nr); + } else { + ax25_std_nr_error_recovery(ax25); + ax25->state = AX25_STATE_1; + } + break; + + case AX25_REJ: + ax25->condition &= ~AX25_COND_PEER_RX_BUSY; + if (type == AX25_COMMAND && pf) + ax25_std_enquiry_response(ax25); + if (ax25_validate_nr(ax25, nr)) { + ax25_frames_acked(ax25, nr); + ax25_calculate_rtt(ax25); + ax25_stop_t1timer(ax25); + ax25_start_t3timer(ax25); + ax25_requeue_frames(ax25); + } else { + ax25_std_nr_error_recovery(ax25); + ax25->state = AX25_STATE_1; + } + break; + + case AX25_I: + if (!ax25_validate_nr(ax25, nr)) { + ax25_std_nr_error_recovery(ax25); + ax25->state = AX25_STATE_1; + break; + } + if (ax25->condition & AX25_COND_PEER_RX_BUSY) { + ax25_frames_acked(ax25, nr); + } else { + ax25_check_iframes_acked(ax25, nr); + } + if (ax25->condition & AX25_COND_OWN_RX_BUSY) { + if (pf) ax25_std_enquiry_response(ax25); + break; + } + if (ns == ax25->vr) { + ax25->vr = (ax25->vr + 1) % ax25->modulus; + queued = ax25_rx_iframe(ax25, skb); + if (ax25->condition & AX25_COND_OWN_RX_BUSY) + ax25->vr = ns; /* ax25->vr - 1 */ + ax25->condition &= ~AX25_COND_REJECT; + if (pf) { + ax25_std_enquiry_response(ax25); + } else { + if (!(ax25->condition & AX25_COND_ACK_PENDING)) { + ax25->condition |= AX25_COND_ACK_PENDING; + ax25_start_t2timer(ax25); + } + } + } else { + if (ax25->condition & AX25_COND_REJECT) { + if (pf) ax25_std_enquiry_response(ax25); + } else { + ax25->condition |= AX25_COND_REJECT; + ax25_send_control(ax25, AX25_REJ, pf, AX25_RESPONSE); + ax25->condition &= ~AX25_COND_ACK_PENDING; + } + } + break; + + case AX25_FRMR: + case AX25_ILLEGAL: + ax25_std_establish_data_link(ax25); + ax25->state = AX25_STATE_1; + break; + + default: + break; + } + + return queued; +} + +/* + * State machine for state 4, Timer Recovery State. + * The handling of the timer(s) is in file ax25_std_timer.c + * Handling of state 0 and connection release is in ax25.c. 
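+ * State 4 is entered when T1 expires in state 3; the peer is polled
+ * with RR/RNR enquiries until a response with the F bit set lets us
+ * return to state 3, or N2 retries are exhausted and the link is
+ * torn down.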
+ */ +static int ax25_std_state4_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int ns, int nr, int pf, int type) +{ + int queued = 0; + + switch (frametype) { + case AX25_SABM: + case AX25_SABME: + if (frametype == AX25_SABM) { + ax25->modulus = AX25_MODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW]; + } else { + ax25->modulus = AX25_EMODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_EWINDOW]; + } + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + ax25_stop_t1timer(ax25); + ax25_stop_t2timer(ax25); + ax25_start_t3timer(ax25); + ax25_start_idletimer(ax25); + ax25->condition = 0x00; + ax25->vs = 0; + ax25->va = 0; + ax25->vr = 0; + ax25->state = AX25_STATE_3; + ax25->n2count = 0; + ax25_requeue_frames(ax25); + break; + + case AX25_DISC: + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + ax25_disconnect(ax25, 0); + break; + + case AX25_DM: + ax25_disconnect(ax25, ECONNRESET); + break; + + case AX25_RR: + case AX25_RNR: + if (frametype == AX25_RR) + ax25->condition &= ~AX25_COND_PEER_RX_BUSY; + else + ax25->condition |= AX25_COND_PEER_RX_BUSY; + if (type == AX25_RESPONSE && pf) { + ax25_stop_t1timer(ax25); + ax25->n2count = 0; + if (ax25_validate_nr(ax25, nr)) { + ax25_frames_acked(ax25, nr); + if (ax25->vs == ax25->va) { + ax25_start_t3timer(ax25); + ax25->state = AX25_STATE_3; + } else { + ax25_requeue_frames(ax25); + } + } else { + ax25_std_nr_error_recovery(ax25); + ax25->state = AX25_STATE_1; + } + break; + } + if (type == AX25_COMMAND && pf) + ax25_std_enquiry_response(ax25); + if (ax25_validate_nr(ax25, nr)) { + ax25_frames_acked(ax25, nr); + } else { + ax25_std_nr_error_recovery(ax25); + ax25->state = AX25_STATE_1; + } + break; + + case AX25_REJ: + ax25->condition &= ~AX25_COND_PEER_RX_BUSY; + if (pf && type == AX25_RESPONSE) { + ax25_stop_t1timer(ax25); + ax25->n2count = 0; + if (ax25_validate_nr(ax25, nr)) { + ax25_frames_acked(ax25, nr); + if (ax25->vs == ax25->va) { + ax25_start_t3timer(ax25); + ax25->state = AX25_STATE_3; + } else { + ax25_requeue_frames(ax25); + } + } else { + ax25_std_nr_error_recovery(ax25); + ax25->state = AX25_STATE_1; + } + break; + } + if (type == AX25_COMMAND && pf) + ax25_std_enquiry_response(ax25); + if (ax25_validate_nr(ax25, nr)) { + ax25_frames_acked(ax25, nr); + ax25_requeue_frames(ax25); + } else { + ax25_std_nr_error_recovery(ax25); + ax25->state = AX25_STATE_1; + } + break; + + case AX25_I: + if (!ax25_validate_nr(ax25, nr)) { + ax25_std_nr_error_recovery(ax25); + ax25->state = AX25_STATE_1; + break; + } + ax25_frames_acked(ax25, nr); + if (ax25->condition & AX25_COND_OWN_RX_BUSY) { + if (pf) + ax25_std_enquiry_response(ax25); + break; + } + if (ns == ax25->vr) { + ax25->vr = (ax25->vr + 1) % ax25->modulus; + queued = ax25_rx_iframe(ax25, skb); + if (ax25->condition & AX25_COND_OWN_RX_BUSY) + ax25->vr = ns; /* ax25->vr - 1 */ + ax25->condition &= ~AX25_COND_REJECT; + if (pf) { + ax25_std_enquiry_response(ax25); + } else { + if (!(ax25->condition & AX25_COND_ACK_PENDING)) { + ax25->condition |= AX25_COND_ACK_PENDING; + ax25_start_t2timer(ax25); + } + } + } else { + if (ax25->condition & AX25_COND_REJECT) { + if (pf) ax25_std_enquiry_response(ax25); + } else { + ax25->condition |= AX25_COND_REJECT; + ax25_send_control(ax25, AX25_REJ, pf, AX25_RESPONSE); + ax25->condition &= ~AX25_COND_ACK_PENDING; + } + } + break; + + case AX25_FRMR: + case AX25_ILLEGAL: + ax25_std_establish_data_link(ax25); + ax25->state = AX25_STATE_1; + break; + + default: + break; + } + + return queued; +} + +/* + * Higher 
level upcall for a LAPB frame + */ +int ax25_std_frame_in(ax25_cb *ax25, struct sk_buff *skb, int type) +{ + int queued = 0, frametype, ns, nr, pf; + + frametype = ax25_decode(ax25, skb, &ns, &nr, &pf); + + switch (ax25->state) { + case AX25_STATE_1: + queued = ax25_std_state1_machine(ax25, skb, frametype, pf, type); + break; + case AX25_STATE_2: + queued = ax25_std_state2_machine(ax25, skb, frametype, pf, type); + break; + case AX25_STATE_3: + queued = ax25_std_state3_machine(ax25, skb, frametype, ns, nr, pf, type); + break; + case AX25_STATE_4: + queued = ax25_std_state4_machine(ax25, skb, frametype, ns, nr, pf, type); + break; + } + + ax25_kick(ax25); + + return queued; +} diff --git a/net/ax25/ax25_std_subr.c b/net/ax25/ax25_std_subr.c new file mode 100644 index 00000000..277f81bb --- /dev/null +++ b/net/ax25/ax25_std_subr.c @@ -0,0 +1,87 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The following routines are taken from page 170 of the 7th ARRL Computer + * Networking Conference paper, as is the whole state machine. + */ + +void ax25_std_nr_error_recovery(ax25_cb *ax25) +{ + ax25_std_establish_data_link(ax25); +} + +void ax25_std_establish_data_link(ax25_cb *ax25) +{ + ax25->condition = 0x00; + ax25->n2count = 0; + + if (ax25->modulus == AX25_MODULUS) + ax25_send_control(ax25, AX25_SABM, AX25_POLLON, AX25_COMMAND); + else + ax25_send_control(ax25, AX25_SABME, AX25_POLLON, AX25_COMMAND); + + ax25_calculate_t1(ax25); + ax25_stop_idletimer(ax25); + ax25_stop_t3timer(ax25); + ax25_stop_t2timer(ax25); + ax25_start_t1timer(ax25); +} + +void ax25_std_transmit_enquiry(ax25_cb *ax25) +{ + if (ax25->condition & AX25_COND_OWN_RX_BUSY) + ax25_send_control(ax25, AX25_RNR, AX25_POLLON, AX25_COMMAND); + else + ax25_send_control(ax25, AX25_RR, AX25_POLLON, AX25_COMMAND); + + ax25->condition &= ~AX25_COND_ACK_PENDING; + + ax25_calculate_t1(ax25); + ax25_start_t1timer(ax25); +} + +void ax25_std_enquiry_response(ax25_cb *ax25) +{ + if (ax25->condition & AX25_COND_OWN_RX_BUSY) + ax25_send_control(ax25, AX25_RNR, AX25_POLLON, AX25_RESPONSE); + else + ax25_send_control(ax25, AX25_RR, AX25_POLLON, AX25_RESPONSE); + + ax25->condition &= ~AX25_COND_ACK_PENDING; +} + +void ax25_std_timeout_response(ax25_cb *ax25) +{ + if (ax25->condition & AX25_COND_OWN_RX_BUSY) + ax25_send_control(ax25, AX25_RNR, AX25_POLLOFF, AX25_RESPONSE); + else + ax25_send_control(ax25, AX25_RR, AX25_POLLOFF, AX25_RESPONSE); + + ax25->condition &= ~AX25_COND_ACK_PENDING; +} diff --git a/net/ax25/ax25_std_timer.c b/net/ax25/ax25_std_timer.c new file mode 100644 index 00000000..96e4b927 --- /dev/null +++ b/net/ax25/ax25_std_timer.c @@ -0,0 +1,176 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) + * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void ax25_std_heartbeat_expiry(ax25_cb *ax25) +{ + struct sock *sk = ax25->sk; + + if (sk) + bh_lock_sock(sk); + + switch (ax25->state) { + case AX25_STATE_0: + /* Magic here: If we listen() and a new link dies before it + is accepted() it isn't 'dead' so doesn't get removed. */ + if (!sk || sock_flag(sk, SOCK_DESTROY) || + (sk->sk_state == TCP_LISTEN && + sock_flag(sk, SOCK_DEAD))) { + if (sk) { + sock_hold(sk); + ax25_destroy_socket(ax25); + bh_unlock_sock(sk); + sock_put(sk); + } else + ax25_destroy_socket(ax25); + return; + } + break; + + case AX25_STATE_3: + case AX25_STATE_4: + /* + * Check the state of the receive buffer. + */ + if (sk != NULL) { + if (atomic_read(&sk->sk_rmem_alloc) < + (sk->sk_rcvbuf >> 1) && + (ax25->condition & AX25_COND_OWN_RX_BUSY)) { + ax25->condition &= ~AX25_COND_OWN_RX_BUSY; + ax25->condition &= ~AX25_COND_ACK_PENDING; + ax25_send_control(ax25, AX25_RR, AX25_POLLOFF, AX25_RESPONSE); + break; + } + } + } + + if (sk) + bh_unlock_sock(sk); + + ax25_start_heartbeat(ax25); +} + +void ax25_std_t2timer_expiry(ax25_cb *ax25) +{ + if (ax25->condition & AX25_COND_ACK_PENDING) { + ax25->condition &= ~AX25_COND_ACK_PENDING; + ax25_std_timeout_response(ax25); + } +} + +void ax25_std_t3timer_expiry(ax25_cb *ax25) +{ + ax25->n2count = 0; + ax25_std_transmit_enquiry(ax25); + ax25->state = AX25_STATE_4; +} + +void ax25_std_idletimer_expiry(ax25_cb *ax25) +{ + ax25_clear_queues(ax25); + + ax25->n2count = 0; + ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); + ax25->state = AX25_STATE_2; + + ax25_calculate_t1(ax25); + ax25_start_t1timer(ax25); + ax25_stop_t2timer(ax25); + ax25_stop_t3timer(ax25); + + if (ax25->sk != NULL) { + bh_lock_sock(ax25->sk); + ax25->sk->sk_state = TCP_CLOSE; + ax25->sk->sk_err = 0; + ax25->sk->sk_shutdown |= SEND_SHUTDOWN; + if (!sock_flag(ax25->sk, SOCK_DEAD)) { + ax25->sk->sk_state_change(ax25->sk); + sock_set_flag(ax25->sk, SOCK_DEAD); + } + bh_unlock_sock(ax25->sk); + } +} + +void ax25_std_t1timer_expiry(ax25_cb *ax25) +{ + switch (ax25->state) { + case AX25_STATE_1: + if (ax25->n2count == ax25->n2) { + if (ax25->modulus == AX25_MODULUS) { + ax25_disconnect(ax25, ETIMEDOUT); + return; + } else { + ax25->modulus = AX25_MODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW]; + ax25->n2count = 0; + ax25_send_control(ax25, AX25_SABM, AX25_POLLON, AX25_COMMAND); + } + } else { + ax25->n2count++; + if (ax25->modulus == AX25_MODULUS) + ax25_send_control(ax25, AX25_SABM, AX25_POLLON, AX25_COMMAND); + else + ax25_send_control(ax25, AX25_SABME, AX25_POLLON, AX25_COMMAND); + } + break; + + case AX25_STATE_2: + if (ax25->n2count == ax25->n2) { + ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); + ax25_disconnect(ax25, ETIMEDOUT); + return; + } else { + ax25->n2count++; + ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); + } + break; + + case AX25_STATE_3: + ax25->n2count = 1; + ax25_std_transmit_enquiry(ax25); + ax25->state = AX25_STATE_4; + break; + + case AX25_STATE_4: + if (ax25->n2count == ax25->n2) { + ax25_send_control(ax25, AX25_DM, AX25_POLLON, 
AX25_RESPONSE); + ax25_disconnect(ax25, ETIMEDOUT); + return; + } else { + ax25->n2count++; + ax25_std_transmit_enquiry(ax25); + } + break; + } + + ax25_calculate_t1(ax25); + ax25_start_t1timer(ax25); +} diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c new file mode 100644 index 00000000..c6715ee4 --- /dev/null +++ b/net/ax25/ax25_subr.c @@ -0,0 +1,290 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) + * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This routine purges all the queues of frames. + */ +void ax25_clear_queues(ax25_cb *ax25) +{ + skb_queue_purge(&ax25->write_queue); + skb_queue_purge(&ax25->ack_queue); + skb_queue_purge(&ax25->reseq_queue); + skb_queue_purge(&ax25->frag_queue); +} + +/* + * This routine purges the input queue of those frames that have been + * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the + * SDL diagram. + */ +void ax25_frames_acked(ax25_cb *ax25, unsigned short nr) +{ + struct sk_buff *skb; + + /* + * Remove all the ack-ed frames from the ack queue. + */ + if (ax25->va != nr) { + while (skb_peek(&ax25->ack_queue) != NULL && ax25->va != nr) { + skb = skb_dequeue(&ax25->ack_queue); + kfree_skb(skb); + ax25->va = (ax25->va + 1) % ax25->modulus; + } + } +} + +void ax25_requeue_frames(ax25_cb *ax25) +{ + struct sk_buff *skb; + + /* + * Requeue all the un-ack-ed frames on the output queue to be picked + * up by ax25_kick called from the timer. This arrangement handles the + * possibility of an empty output queue. + */ + while ((skb = skb_dequeue_tail(&ax25->ack_queue)) != NULL) + skb_queue_head(&ax25->write_queue, skb); +} + +/* + * Validate that the value of nr is between va and vs. Return true or + * false for testing. + */ +int ax25_validate_nr(ax25_cb *ax25, unsigned short nr) +{ + unsigned short vc = ax25->va; + + while (vc != ax25->vs) { + if (nr == vc) return 1; + vc = (vc + 1) % ax25->modulus; + } + + if (nr == ax25->vs) return 1; + + return 0; +} + +/* + * This routine is the centralised routine for parsing the control + * information for the different frame formats. 
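+ *
+ * Standard (modulo 8) control fields are one octet:
+ *   I frame: N(R) in bits 7-5, P/F in bit 4, N(S) in bits 3-1, bit 0 = 0
+ *   S frame: N(R) in bits 7-5, P/F in bit 4, type in the low nibble
+ *   U frame: type spread across the octet, P/F in bit 4
+ * Extended (modulo 128) I and S frames use two octets, with N(S) and
+ * N(R) in bits 7-1 of the first and second octet respectively and
+ * P/F in bit 0 of the second octet; U frames remain one octet.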
+ */ +int ax25_decode(ax25_cb *ax25, struct sk_buff *skb, int *ns, int *nr, int *pf) +{ + unsigned char *frame; + int frametype = AX25_ILLEGAL; + + frame = skb->data; + *ns = *nr = *pf = 0; + + if (ax25->modulus == AX25_MODULUS) { + if ((frame[0] & AX25_S) == 0) { + frametype = AX25_I; /* I frame - carries NR/NS/PF */ + *ns = (frame[0] >> 1) & 0x07; + *nr = (frame[0] >> 5) & 0x07; + *pf = frame[0] & AX25_PF; + } else if ((frame[0] & AX25_U) == 1) { /* S frame - take out PF/NR */ + frametype = frame[0] & 0x0F; + *nr = (frame[0] >> 5) & 0x07; + *pf = frame[0] & AX25_PF; + } else if ((frame[0] & AX25_U) == 3) { /* U frame - take out PF */ + frametype = frame[0] & ~AX25_PF; + *pf = frame[0] & AX25_PF; + } + skb_pull(skb, 1); + } else { + if ((frame[0] & AX25_S) == 0) { + frametype = AX25_I; /* I frame - carries NR/NS/PF */ + *ns = (frame[0] >> 1) & 0x7F; + *nr = (frame[1] >> 1) & 0x7F; + *pf = frame[1] & AX25_EPF; + skb_pull(skb, 2); + } else if ((frame[0] & AX25_U) == 1) { /* S frame - take out PF/NR */ + frametype = frame[0] & 0x0F; + *nr = (frame[1] >> 1) & 0x7F; + *pf = frame[1] & AX25_EPF; + skb_pull(skb, 2); + } else if ((frame[0] & AX25_U) == 3) { /* U frame - take out PF */ + frametype = frame[0] & ~AX25_PF; + *pf = frame[0] & AX25_PF; + skb_pull(skb, 1); + } + } + + return frametype; +} + +/* + * This routine is called when the HDLC layer internally generates a + * command or response for the remote machine ( eg. RR, UA etc. ). + * Only supervisory or unnumbered frames are processed. + */ +void ax25_send_control(ax25_cb *ax25, int frametype, int poll_bit, int type) +{ + struct sk_buff *skb; + unsigned char *dptr; + + if ((skb = alloc_skb(ax25->ax25_dev->dev->hard_header_len + 2, GFP_ATOMIC)) == NULL) + return; + + skb_reserve(skb, ax25->ax25_dev->dev->hard_header_len); + + skb_reset_network_header(skb); + + /* Assume a response - address structure for DTE */ + if (ax25->modulus == AX25_MODULUS) { + dptr = skb_put(skb, 1); + *dptr = frametype; + *dptr |= (poll_bit) ? AX25_PF : 0; + if ((frametype & AX25_U) == AX25_S) /* S frames carry NR */ + *dptr |= (ax25->vr << 5); + } else { + if ((frametype & AX25_U) == AX25_U) { + dptr = skb_put(skb, 1); + *dptr = frametype; + *dptr |= (poll_bit) ? AX25_PF : 0; + } else { + dptr = skb_put(skb, 2); + dptr[0] = frametype; + dptr[1] = (ax25->vr << 1); + dptr[1] |= (poll_bit) ? AX25_EPF : 0; + } + } + + ax25_transmit_buffer(ax25, skb, type); +} + +/* + * Send a 'DM' to an unknown connection attempt, or an invalid caller. 
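+ * The digipeater path of the offending frame is inverted
+ * (ax25_digi_invert) so that the DM retraces the route the frame
+ * arrived on.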
+ * + * Note: src here is the sender, thus it's the target of the DM + */ +void ax25_return_dm(struct net_device *dev, ax25_address *src, ax25_address *dest, ax25_digi *digi) +{ + struct sk_buff *skb; + char *dptr; + ax25_digi retdigi; + + if (dev == NULL) + return; + + if ((skb = alloc_skb(dev->hard_header_len + 1, GFP_ATOMIC)) == NULL) + return; /* Next SABM will get DM'd */ + + skb_reserve(skb, dev->hard_header_len); + skb_reset_network_header(skb); + + ax25_digi_invert(digi, &retdigi); + + dptr = skb_put(skb, 1); + + *dptr = AX25_DM | AX25_PF; + + /* + * Do the address ourselves + */ + dptr = skb_push(skb, ax25_addr_size(digi)); + dptr += ax25_addr_build(dptr, dest, src, &retdigi, AX25_RESPONSE, AX25_MODULUS); + + ax25_queue_xmit(skb, dev); +} + +/* + * Exponential backoff for AX.25 + */ +void ax25_calculate_t1(ax25_cb *ax25) +{ + int n, t = 2; + + switch (ax25->backoff) { + case 0: + break; + + case 1: + t += 2 * ax25->n2count; + break; + + case 2: + for (n = 0; n < ax25->n2count; n++) + t *= 2; + if (t > 8) t = 8; + break; + } + + ax25->t1 = t * ax25->rtt; +} + +/* + * Calculate the Round Trip Time + */ +void ax25_calculate_rtt(ax25_cb *ax25) +{ + if (ax25->backoff == 0) + return; + + if (ax25_t1timer_running(ax25) && ax25->n2count == 0) + ax25->rtt = (9 * ax25->rtt + ax25->t1 - ax25_display_timer(&ax25->t1timer)) / 10; + + if (ax25->rtt < AX25_T1CLAMPLO) + ax25->rtt = AX25_T1CLAMPLO; + + if (ax25->rtt > AX25_T1CLAMPHI) + ax25->rtt = AX25_T1CLAMPHI; +} + +void ax25_disconnect(ax25_cb *ax25, int reason) +{ + ax25_clear_queues(ax25); + + ax25_stop_t1timer(ax25); + ax25_stop_t2timer(ax25); + ax25_stop_t3timer(ax25); + ax25_stop_idletimer(ax25); + + ax25->state = AX25_STATE_0; + + ax25_link_failed(ax25, reason); + + if (ax25->sk != NULL) { + local_bh_disable(); + bh_lock_sock(ax25->sk); + ax25->sk->sk_state = TCP_CLOSE; + ax25->sk->sk_err = reason; + ax25->sk->sk_shutdown |= SEND_SHUTDOWN; + if (!sock_flag(ax25->sk, SOCK_DEAD)) { + ax25->sk->sk_state_change(ax25->sk); + sock_set_flag(ax25->sk, SOCK_DEAD); + } + bh_unlock_sock(ax25->sk); + local_bh_enable(); + } +} diff --git a/net/ax25/ax25_timer.c b/net/ax25/ax25_timer.c new file mode 100644 index 00000000..db29ea71 --- /dev/null +++ b/net/ax25/ax25_timer.c @@ -0,0 +1,227 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Tomi Manninen OH2BNS (oh2bns@sral.fi) + * Copyright (C) Darryl Miles G7LED (dlm@g7led.demon.co.uk) + * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) + * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr) + * Copyright (C) 2002 Ralf Baechle DO1GRB (ralf@gnu.org) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void ax25_heartbeat_expiry(unsigned long); +static void ax25_t1timer_expiry(unsigned long); +static void ax25_t2timer_expiry(unsigned long); +static void ax25_t3timer_expiry(unsigned long); +static void ax25_idletimer_expiry(unsigned long); + +void ax25_setup_timers(ax25_cb *ax25) +{ + setup_timer(&ax25->timer, ax25_heartbeat_expiry, (unsigned long)ax25); + setup_timer(&ax25->t1timer, ax25_t1timer_expiry, (unsigned long)ax25); + setup_timer(&ax25->t2timer, ax25_t2timer_expiry, (unsigned long)ax25); + setup_timer(&ax25->t3timer, ax25_t3timer_expiry, (unsigned long)ax25); + setup_timer(&ax25->idletimer, ax25_idletimer_expiry, + (unsigned long)ax25); +} + +void ax25_start_heartbeat(ax25_cb *ax25) +{ + mod_timer(&ax25->timer, jiffies + 5 * HZ); +} + +void ax25_start_t1timer(ax25_cb *ax25) +{ + mod_timer(&ax25->t1timer, jiffies + ax25->t1); +} + +void ax25_start_t2timer(ax25_cb *ax25) +{ + mod_timer(&ax25->t2timer, jiffies + ax25->t2); +} + +void ax25_start_t3timer(ax25_cb *ax25) +{ + if (ax25->t3 > 0) + mod_timer(&ax25->t3timer, jiffies + ax25->t3); + else + del_timer(&ax25->t3timer); +} + +void ax25_start_idletimer(ax25_cb *ax25) +{ + if (ax25->idle > 0) + mod_timer(&ax25->idletimer, jiffies + ax25->idle); + else + del_timer(&ax25->idletimer); +} + +void ax25_stop_heartbeat(ax25_cb *ax25) +{ + del_timer(&ax25->timer); +} + +void ax25_stop_t1timer(ax25_cb *ax25) +{ + del_timer(&ax25->t1timer); +} + +void ax25_stop_t2timer(ax25_cb *ax25) +{ + del_timer(&ax25->t2timer); +} + +void ax25_stop_t3timer(ax25_cb *ax25) +{ + del_timer(&ax25->t3timer); +} + +void ax25_stop_idletimer(ax25_cb *ax25) +{ + del_timer(&ax25->idletimer); +} + +int ax25_t1timer_running(ax25_cb *ax25) +{ + return timer_pending(&ax25->t1timer); +} + +unsigned long ax25_display_timer(struct timer_list *timer) +{ + if (!timer_pending(timer)) + return 0; + + return timer->expires - jiffies; +} + +EXPORT_SYMBOL(ax25_display_timer); + +static void ax25_heartbeat_expiry(unsigned long param) +{ + int proto = AX25_PROTO_STD_SIMPLEX; + ax25_cb *ax25 = (ax25_cb *)param; + + if (ax25->ax25_dev) + proto = ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]; + + switch (proto) { + case AX25_PROTO_STD_SIMPLEX: + case AX25_PROTO_STD_DUPLEX: + ax25_std_heartbeat_expiry(ax25); + break; + +#ifdef CONFIG_AX25_DAMA_SLAVE + case AX25_PROTO_DAMA_SLAVE: + if (ax25->ax25_dev->dama.slave) + ax25_ds_heartbeat_expiry(ax25); + else + ax25_std_heartbeat_expiry(ax25); + break; +#endif + } +} + +static void ax25_t1timer_expiry(unsigned long param) +{ + ax25_cb *ax25 = (ax25_cb *)param; + + switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { + case AX25_PROTO_STD_SIMPLEX: + case AX25_PROTO_STD_DUPLEX: + ax25_std_t1timer_expiry(ax25); + break; + +#ifdef CONFIG_AX25_DAMA_SLAVE + case AX25_PROTO_DAMA_SLAVE: + if (!ax25->ax25_dev->dama.slave) + ax25_std_t1timer_expiry(ax25); + break; +#endif + } +} + +static void 
ax25_t2timer_expiry(unsigned long param) +{ + ax25_cb *ax25 = (ax25_cb *)param; + + switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { + case AX25_PROTO_STD_SIMPLEX: + case AX25_PROTO_STD_DUPLEX: + ax25_std_t2timer_expiry(ax25); + break; + +#ifdef CONFIG_AX25_DAMA_SLAVE + case AX25_PROTO_DAMA_SLAVE: + if (!ax25->ax25_dev->dama.slave) + ax25_std_t2timer_expiry(ax25); + break; +#endif + } +} + +static void ax25_t3timer_expiry(unsigned long param) +{ + ax25_cb *ax25 = (ax25_cb *)param; + + switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { + case AX25_PROTO_STD_SIMPLEX: + case AX25_PROTO_STD_DUPLEX: + ax25_std_t3timer_expiry(ax25); + break; + +#ifdef CONFIG_AX25_DAMA_SLAVE + case AX25_PROTO_DAMA_SLAVE: + if (ax25->ax25_dev->dama.slave) + ax25_ds_t3timer_expiry(ax25); + else + ax25_std_t3timer_expiry(ax25); + break; +#endif + } +} + +static void ax25_idletimer_expiry(unsigned long param) +{ + ax25_cb *ax25 = (ax25_cb *)param; + + switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { + case AX25_PROTO_STD_SIMPLEX: + case AX25_PROTO_STD_DUPLEX: + ax25_std_idletimer_expiry(ax25); + break; + +#ifdef CONFIG_AX25_DAMA_SLAVE + case AX25_PROTO_DAMA_SLAVE: + if (ax25->ax25_dev->dama.slave) + ax25_ds_idletimer_expiry(ax25); + else + ax25_std_idletimer_expiry(ax25); + break; +#endif + } +} diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c new file mode 100644 index 00000000..d349be95 --- /dev/null +++ b/net/ax25/ax25_uid.c @@ -0,0 +1,218 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Callsign/UID mapper. This is in kernel space for security on multi-amateur machines. 
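+ * Each local uid may be mapped to a callsign; ax25_findbyuid() is
+ * used when autobinding a socket so that a user transmits under the
+ * callsign registered for them, and ax25_uid_policy controls whether
+ * unregistered (unprivileged) users may fall back to the device
+ * callsign instead.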
+ */ + +static HLIST_HEAD(ax25_uid_list); +static DEFINE_RWLOCK(ax25_uid_lock); + +int ax25_uid_policy; + +EXPORT_SYMBOL(ax25_uid_policy); + +ax25_uid_assoc *ax25_findbyuid(uid_t uid) +{ + ax25_uid_assoc *ax25_uid, *res = NULL; + struct hlist_node *node; + + read_lock(&ax25_uid_lock); + ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) { + if (ax25_uid->uid == uid) { + ax25_uid_hold(ax25_uid); + res = ax25_uid; + break; + } + } + read_unlock(&ax25_uid_lock); + + return res; +} + +EXPORT_SYMBOL(ax25_findbyuid); + +int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax) +{ + ax25_uid_assoc *ax25_uid; + struct hlist_node *node; + ax25_uid_assoc *user; + unsigned long res; + + switch (cmd) { + case SIOCAX25GETUID: + res = -ENOENT; + read_lock(&ax25_uid_lock); + ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) { + if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0) { + res = ax25_uid->uid; + break; + } + } + read_unlock(&ax25_uid_lock); + + return res; + + case SIOCAX25ADDUID: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + user = ax25_findbyuid(sax->sax25_uid); + if (user) { + ax25_uid_put(user); + return -EEXIST; + } + if (sax->sax25_uid == 0) + return -EINVAL; + if ((ax25_uid = kmalloc(sizeof(*ax25_uid), GFP_KERNEL)) == NULL) + return -ENOMEM; + + atomic_set(&ax25_uid->refcount, 1); + ax25_uid->uid = sax->sax25_uid; + ax25_uid->call = sax->sax25_call; + + write_lock(&ax25_uid_lock); + hlist_add_head(&ax25_uid->uid_node, &ax25_uid_list); + write_unlock(&ax25_uid_lock); + + return 0; + + case SIOCAX25DELUID: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + ax25_uid = NULL; + write_lock(&ax25_uid_lock); + ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) { + if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0) + break; + } + if (ax25_uid == NULL) { + write_unlock(&ax25_uid_lock); + return -ENOENT; + } + hlist_del_init(&ax25_uid->uid_node); + ax25_uid_put(ax25_uid); + write_unlock(&ax25_uid_lock); + + return 0; + + default: + return -EINVAL; + } + + return -EINVAL; /*NOTREACHED */ +} + +#ifdef CONFIG_PROC_FS + +static void *ax25_uid_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(ax25_uid_lock) +{ + read_lock(&ax25_uid_lock); + return seq_hlist_start_head(&ax25_uid_list, *pos); +} + +static void *ax25_uid_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return seq_hlist_next(v, &ax25_uid_list, pos); +} + +static void ax25_uid_seq_stop(struct seq_file *seq, void *v) + __releases(ax25_uid_lock) +{ + read_unlock(&ax25_uid_lock); +} + +static int ax25_uid_seq_show(struct seq_file *seq, void *v) +{ + char buf[11]; + + if (v == SEQ_START_TOKEN) + seq_printf(seq, "Policy: %d\n", ax25_uid_policy); + else { + struct ax25_uid_assoc *pt; + + pt = hlist_entry(v, struct ax25_uid_assoc, uid_node); + seq_printf(seq, "%6d %s\n", pt->uid, ax2asc(buf, &pt->call)); + } + return 0; +} + +static const struct seq_operations ax25_uid_seqops = { + .start = ax25_uid_seq_start, + .next = ax25_uid_seq_next, + .stop = ax25_uid_seq_stop, + .show = ax25_uid_seq_show, +}; + +static int ax25_uid_info_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ax25_uid_seqops); +} + +const struct file_operations ax25_uid_fops = { + .owner = THIS_MODULE, + .open = ax25_uid_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif + +/* + * Free all memory associated with UID/Callsign structures. 
+ */ +void __exit ax25_uid_free(void) +{ + ax25_uid_assoc *ax25_uid; + struct hlist_node *node; + + write_lock(&ax25_uid_lock); +again: + ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) { + hlist_del_init(&ax25_uid->uid_node); + ax25_uid_put(ax25_uid); + goto again; + } + write_unlock(&ax25_uid_lock); +} diff --git a/net/ax25/sysctl_net_ax25.c b/net/ax25/sysctl_net_ax25.c new file mode 100644 index 00000000..ebe0ef3f --- /dev/null +++ b/net/ax25/sysctl_net_ax25.c @@ -0,0 +1,210 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) 1996 Mike Shaver (shaver@zeroknowledge.com) + */ +#include +#include +#include +#include +#include + +static int min_ipdefmode[1], max_ipdefmode[] = {1}; +static int min_axdefmode[1], max_axdefmode[] = {1}; +static int min_backoff[1], max_backoff[] = {2}; +static int min_conmode[1], max_conmode[] = {2}; +static int min_window[] = {1}, max_window[] = {7}; +static int min_ewindow[] = {1}, max_ewindow[] = {63}; +static int min_t1[] = {1}, max_t1[] = {30000}; +static int min_t2[] = {1}, max_t2[] = {20000}; +static int min_t3[1], max_t3[] = {3600000}; +static int min_idle[1], max_idle[] = {65535000}; +static int min_n2[] = {1}, max_n2[] = {31}; +static int min_paclen[] = {1}, max_paclen[] = {512}; +static int min_proto[1], max_proto[] = { AX25_PROTO_MAX }; +#ifdef CONFIG_AX25_DAMA_SLAVE +static int min_ds_timeout[1], max_ds_timeout[] = {65535000}; +#endif + +static struct ctl_table_header *ax25_table_header; + +static ctl_table *ax25_table; +static int ax25_table_size; + +static struct ctl_path ax25_path[] = { + { .procname = "net", }, + { .procname = "ax25", }, + { } +}; + +static const ctl_table ax25_param_table[] = { + { + .procname = "ip_default_mode", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_ipdefmode, + .extra2 = &max_ipdefmode + }, + { + .procname = "ax25_default_mode", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_axdefmode, + .extra2 = &max_axdefmode + }, + { + .procname = "backoff_type", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_backoff, + .extra2 = &max_backoff + }, + { + .procname = "connect_mode", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_conmode, + .extra2 = &max_conmode + }, + { + .procname = "standard_window_size", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_window, + .extra2 = &max_window + }, + { + .procname = "extended_window_size", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_ewindow, + .extra2 = &max_ewindow + }, + { + .procname = "t1_timeout", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_t1, + .extra2 = &max_t1 + }, + { + .procname = "t2_timeout", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_t2, + .extra2 = &max_t2 + }, + { + .procname = "t3_timeout", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_t3, + .extra2 = &max_t3 + }, + { + .procname = "idle_timeout", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + 
.extra1 = &min_idle, + .extra2 = &max_idle + }, + { + .procname = "maximum_retry_count", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_n2, + .extra2 = &max_n2 + }, + { + .procname = "maximum_packet_length", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_paclen, + .extra2 = &max_paclen + }, + { + .procname = "protocol", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_proto, + .extra2 = &max_proto + }, +#ifdef CONFIG_AX25_DAMA_SLAVE + { + .procname = "dama_slave_timeout", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_ds_timeout, + .extra2 = &max_ds_timeout + }, +#endif + + { } /* that's all, folks! */ +}; + +void ax25_register_sysctl(void) +{ + ax25_dev *ax25_dev; + int n, k; + + spin_lock_bh(&ax25_dev_lock); + for (ax25_table_size = sizeof(ctl_table), ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next) + ax25_table_size += sizeof(ctl_table); + + if ((ax25_table = kzalloc(ax25_table_size, GFP_ATOMIC)) == NULL) { + spin_unlock_bh(&ax25_dev_lock); + return; + } + + for (n = 0, ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next) { + struct ctl_table *child = kmemdup(ax25_param_table, + sizeof(ax25_param_table), + GFP_ATOMIC); + if (!child) { + while (n--) + kfree(ax25_table[n].child); + kfree(ax25_table); + spin_unlock_bh(&ax25_dev_lock); + return; + } + ax25_table[n].child = ax25_dev->systable = child; + ax25_table[n].procname = ax25_dev->dev->name; + ax25_table[n].mode = 0555; + + + for (k = 0; k < AX25_MAX_VALUES; k++) + child[k].data = &ax25_dev->values[k]; + + n++; + } + spin_unlock_bh(&ax25_dev_lock); + + ax25_table_header = register_sysctl_paths(ax25_path, ax25_table); +} + +void ax25_unregister_sysctl(void) +{ + ctl_table *p; + unregister_sysctl_table(ax25_table_header); + + for (p = ax25_table; p->procname; p++) + kfree(p->child); + kfree(ax25_table); +} diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig new file mode 100644 index 00000000..6c051ad8 --- /dev/null +++ b/net/batman-adv/Kconfig @@ -0,0 +1,25 @@ +# +# B.A.T.M.A.N meshing protocol +# + +config BATMAN_ADV + tristate "B.A.T.M.A.N. Advanced Meshing Protocol" + depends on NET + default n + ---help--- + + B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is + a routing protocol for multi-hop ad-hoc mesh networks. The + networks may be wired or wireless. See + http://www.open-mesh.org/ for more information and user space + tools. + +config BATMAN_ADV_DEBUG + bool "B.A.T.M.A.N. debugging" + depends on BATMAN_ADV != n + ---help--- + + This is an option for use by developers; most people should + say N here. This enables compilation of support for + outputting debugging information to the kernel log. The + output is controlled via the module parameter debug. diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile new file mode 100644 index 00000000..2de93d00 --- /dev/null +++ b/net/batman-adv/Makefile @@ -0,0 +1,39 @@ +# +# Copyright (C) 2007-2011 B.A.T.M.A.N. contributors: +# +# Marek Lindner, Simon Wunderlich +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of version 2 of the GNU General Public +# License as published by the Free Software Foundation. 
+# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +# 02110-1301, USA +# + +obj-$(CONFIG_BATMAN_ADV) += batman-adv.o +batman-adv-y += aggregation.o +batman-adv-y += bat_debugfs.o +batman-adv-y += bat_sysfs.o +batman-adv-y += bitarray.o +batman-adv-y += gateway_client.o +batman-adv-y += gateway_common.o +batman-adv-y += hard-interface.o +batman-adv-y += hash.o +batman-adv-y += icmp_socket.o +batman-adv-y += main.o +batman-adv-y += originator.o +batman-adv-y += ring_buffer.o +batman-adv-y += routing.o +batman-adv-y += send.o +batman-adv-y += soft-interface.o +batman-adv-y += translation-table.o +batman-adv-y += unicast.o +batman-adv-y += vis.o diff --git a/net/batman-adv/aggregation.c b/net/batman-adv/aggregation.c new file mode 100644 index 00000000..a8c32030 --- /dev/null +++ b/net/batman-adv/aggregation.c @@ -0,0 +1,280 @@ +/* + * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#include "main.h" +#include "aggregation.h" +#include "send.h" +#include "routing.h" +#include "hard-interface.h" + +/* calculate the size of the tt information for a given packet */ +static int tt_len(struct batman_packet *batman_packet) +{ + return batman_packet->num_tt * ETH_ALEN; +} + +/* return true if new_packet can be aggregated with forw_packet */ +static bool can_aggregate_with(struct batman_packet *new_batman_packet, + int packet_len, + unsigned long send_time, + bool directlink, + struct hard_iface *if_incoming, + struct forw_packet *forw_packet) +{ + struct batman_packet *batman_packet = + (struct batman_packet *)forw_packet->skb->data; + int aggregated_bytes = forw_packet->packet_len + packet_len; + + /** + * we can aggregate the current packet to this aggregated packet + * if: + * + * - the send time is within our MAX_AGGREGATION_MS time + * - the resulting packet wont be bigger than + * MAX_AGGREGATION_BYTES + */ + + if (time_before(send_time, forw_packet->send_time) && + time_after_eq(send_time + msecs_to_jiffies(MAX_AGGREGATION_MS), + forw_packet->send_time) && + (aggregated_bytes <= MAX_AGGREGATION_BYTES)) { + + /** + * check aggregation compatibility + * -> direct link packets are broadcasted on + * their interface only + * -> aggregate packet if the current packet is + * a "global" packet as well as the base + * packet + */ + + /* packets without direct link flag and high TTL + * are flooded through the net */ + if ((!directlink) && + (!(batman_packet->flags & DIRECTLINK)) && + (batman_packet->ttl != 1) && + + /* own packets originating non-primary + * interfaces leave only that interface */ + ((!forw_packet->own) || + (forw_packet->if_incoming->if_num == 0))) + return true; + + /* if the incoming packet is sent via this one + * interface only - we still can aggregate */ + if ((directlink) && + (new_batman_packet->ttl == 1) && + (forw_packet->if_incoming == if_incoming) && + + /* packets from direct neighbors or + * own secondary interface packets + * (= secondary interface packets in general) */ + (batman_packet->flags & DIRECTLINK || + (forw_packet->own && + forw_packet->if_incoming->if_num != 0))) + return true; + } + + return false; +} + +/* create a new aggregated packet and add this packet to it */ +static void new_aggregated_packet(unsigned char *packet_buff, int packet_len, + unsigned long send_time, bool direct_link, + struct hard_iface *if_incoming, + int own_packet) +{ + struct bat_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct forw_packet *forw_packet_aggr; + unsigned char *skb_buff; + + if (!atomic_inc_not_zero(&if_incoming->refcount)) + return; + + /* own packet should always be scheduled */ + if (!own_packet) { + if (!atomic_dec_not_zero(&bat_priv->batman_queue_left)) { + bat_dbg(DBG_BATMAN, bat_priv, + "batman packet queue full\n"); + goto out; + } + } + + forw_packet_aggr = kmalloc(sizeof(struct forw_packet), GFP_ATOMIC); + if (!forw_packet_aggr) { + if (!own_packet) + atomic_inc(&bat_priv->batman_queue_left); + goto out; + } + + if ((atomic_read(&bat_priv->aggregated_ogms)) && + (packet_len < MAX_AGGREGATION_BYTES)) + forw_packet_aggr->skb = dev_alloc_skb(MAX_AGGREGATION_BYTES + + sizeof(struct ethhdr)); + else + forw_packet_aggr->skb = dev_alloc_skb(packet_len + + sizeof(struct ethhdr)); + + if 
(!forw_packet_aggr->skb) { + if (!own_packet) + atomic_inc(&bat_priv->batman_queue_left); + kfree(forw_packet_aggr); + goto out; + } + skb_reserve(forw_packet_aggr->skb, sizeof(struct ethhdr)); + + INIT_HLIST_NODE(&forw_packet_aggr->list); + + skb_buff = skb_put(forw_packet_aggr->skb, packet_len); + forw_packet_aggr->packet_len = packet_len; + memcpy(skb_buff, packet_buff, packet_len); + + forw_packet_aggr->own = own_packet; + forw_packet_aggr->if_incoming = if_incoming; + forw_packet_aggr->num_packets = 0; + forw_packet_aggr->direct_link_flags = 0; + forw_packet_aggr->send_time = send_time; + + /* save packet direct link flag status */ + if (direct_link) + forw_packet_aggr->direct_link_flags |= 1; + + /* add new packet to packet list */ + spin_lock_bh(&bat_priv->forw_bat_list_lock); + hlist_add_head(&forw_packet_aggr->list, &bat_priv->forw_bat_list); + spin_unlock_bh(&bat_priv->forw_bat_list_lock); + + /* start timer for this packet */ + INIT_DELAYED_WORK(&forw_packet_aggr->delayed_work, + send_outstanding_bat_packet); + queue_delayed_work(bat_event_workqueue, + &forw_packet_aggr->delayed_work, + send_time - jiffies); + + return; +out: + hardif_free_ref(if_incoming); +} + +/* aggregate a new packet into the existing aggregation */ +static void aggregate(struct forw_packet *forw_packet_aggr, + unsigned char *packet_buff, + int packet_len, + bool direct_link) +{ + unsigned char *skb_buff; + + skb_buff = skb_put(forw_packet_aggr->skb, packet_len); + memcpy(skb_buff, packet_buff, packet_len); + forw_packet_aggr->packet_len += packet_len; + forw_packet_aggr->num_packets++; + + /* save packet direct link flag status */ + if (direct_link) + forw_packet_aggr->direct_link_flags |= + (1 << forw_packet_aggr->num_packets); +} + +void add_bat_packet_to_list(struct bat_priv *bat_priv, + unsigned char *packet_buff, int packet_len, + struct hard_iface *if_incoming, char own_packet, + unsigned long send_time) +{ + /** + * _aggr -> pointer to the packet we want to aggregate with + * _pos -> pointer to the position in the queue + */ + struct forw_packet *forw_packet_aggr = NULL, *forw_packet_pos = NULL; + struct hlist_node *tmp_node; + struct batman_packet *batman_packet = + (struct batman_packet *)packet_buff; + bool direct_link = batman_packet->flags & DIRECTLINK ? 
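/*
 * Editor's note (illustrative sketch, not part of the patch): the core
 * timing/size test performed by can_aggregate_with() above, restated in
 * standalone user-space C with millisecond integers instead of jiffies.
 * The two constants mirror MAX_AGGREGATION_MS / MAX_AGGREGATION_BYTES,
 * but the values used here are assumptions for the example only.
 */
#include <stdbool.h>

#define EX_MAX_AGGREGATION_MS    100   /* assumed value, for illustration */
#define EX_MAX_AGGREGATION_BYTES 512   /* assumed value, for illustration */

/* true if a packet that wants to go out at new_send_ms and is new_len
 * bytes long may ride along with an aggregate already scheduled for
 * aggr_send_ms that currently holds aggr_len bytes */
static bool ex_fits_aggregate(long new_send_ms, int new_len,
                              long aggr_send_ms, int aggr_len)
{
        /* the aggregate must not leave before the new packet is due ... */
        if (new_send_ms >= aggr_send_ms)
                return false;
        /* ... and must leave within the new packet's aggregation window */
        if (new_send_ms + EX_MAX_AGGREGATION_MS < aggr_send_ms)
                return false;
        /* and the merged frame has to stay below the size cap */
        return aggr_len + new_len <= EX_MAX_AGGREGATION_BYTES;
}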
1 : 0; + + /* find position for the packet in the forward queue */ + spin_lock_bh(&bat_priv->forw_bat_list_lock); + /* own packets are not to be aggregated */ + if ((atomic_read(&bat_priv->aggregated_ogms)) && (!own_packet)) { + hlist_for_each_entry(forw_packet_pos, tmp_node, + &bat_priv->forw_bat_list, list) { + if (can_aggregate_with(batman_packet, + packet_len, + send_time, + direct_link, + if_incoming, + forw_packet_pos)) { + forw_packet_aggr = forw_packet_pos; + break; + } + } + } + + /* nothing to aggregate with - either aggregation disabled or no + * suitable aggregation packet found */ + if (!forw_packet_aggr) { + /* the following section can run without the lock */ + spin_unlock_bh(&bat_priv->forw_bat_list_lock); + + /** + * if we could not aggregate this packet with one of the others + * we hold it back for a while, so that it might be aggregated + * later on + */ + if ((!own_packet) && + (atomic_read(&bat_priv->aggregated_ogms))) + send_time += msecs_to_jiffies(MAX_AGGREGATION_MS); + + new_aggregated_packet(packet_buff, packet_len, + send_time, direct_link, + if_incoming, own_packet); + } else { + aggregate(forw_packet_aggr, + packet_buff, packet_len, + direct_link); + spin_unlock_bh(&bat_priv->forw_bat_list_lock); + } +} + +/* unpack the aggregated packets and process them one by one */ +void receive_aggr_bat_packet(struct ethhdr *ethhdr, unsigned char *packet_buff, + int packet_len, struct hard_iface *if_incoming) +{ + struct batman_packet *batman_packet; + int buff_pos = 0; + unsigned char *tt_buff; + + batman_packet = (struct batman_packet *)packet_buff; + + do { + /* network to host order for our 32bit seqno, and the + orig_interval. */ + batman_packet->seqno = ntohl(batman_packet->seqno); + + tt_buff = packet_buff + buff_pos + BAT_PACKET_LEN; + receive_bat_packet(ethhdr, batman_packet, + tt_buff, tt_len(batman_packet), + if_incoming); + + buff_pos += BAT_PACKET_LEN + tt_len(batman_packet); + batman_packet = (struct batman_packet *) + (packet_buff + buff_pos); + } while (aggregated_packet(buff_pos, packet_len, + batman_packet->num_tt)); +} diff --git a/net/batman-adv/aggregation.h b/net/batman-adv/aggregation.h new file mode 100644 index 00000000..7e6d72fb --- /dev/null +++ b/net/batman-adv/aggregation.h @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#ifndef _NET_BATMAN_ADV_AGGREGATION_H_ +#define _NET_BATMAN_ADV_AGGREGATION_H_ + +#include "main.h" + +/* is there another aggregated packet here? 
*/ +static inline int aggregated_packet(int buff_pos, int packet_len, int num_tt) +{ + int next_buff_pos = buff_pos + BAT_PACKET_LEN + (num_tt * ETH_ALEN); + + return (next_buff_pos <= packet_len) && + (next_buff_pos <= MAX_AGGREGATION_BYTES); +} + +void add_bat_packet_to_list(struct bat_priv *bat_priv, + unsigned char *packet_buff, int packet_len, + struct hard_iface *if_incoming, char own_packet, + unsigned long send_time); +void receive_aggr_bat_packet(struct ethhdr *ethhdr, unsigned char *packet_buff, + int packet_len, struct hard_iface *if_incoming); + +#endif /* _NET_BATMAN_ADV_AGGREGATION_H_ */ diff --git a/net/batman-adv/bat_debugfs.c b/net/batman-adv/bat_debugfs.c new file mode 100644 index 00000000..abaeec5f --- /dev/null +++ b/net/batman-adv/bat_debugfs.c @@ -0,0 +1,358 @@ +/* + * Copyright (C) 2010-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#include "main.h" + +#include + +#include "bat_debugfs.h" +#include "translation-table.h" +#include "originator.h" +#include "hard-interface.h" +#include "gateway_common.h" +#include "gateway_client.h" +#include "soft-interface.h" +#include "vis.h" +#include "icmp_socket.h" + +static struct dentry *bat_debugfs; + +#ifdef CONFIG_BATMAN_ADV_DEBUG +#define LOG_BUFF_MASK (log_buff_len-1) +#define LOG_BUFF(idx) (debug_log->log_buff[(idx) & LOG_BUFF_MASK]) + +static int log_buff_len = LOG_BUF_LEN; + +static void emit_log_char(struct debug_log *debug_log, char c) +{ + LOG_BUFF(debug_log->log_end) = c; + debug_log->log_end++; + + if (debug_log->log_end - debug_log->log_start > log_buff_len) + debug_log->log_start = debug_log->log_end - log_buff_len; +} + +static int fdebug_log(struct debug_log *debug_log, char *fmt, ...) +{ + va_list args; + static char debug_log_buf[256]; + char *p; + + if (!debug_log) + return 0; + + spin_lock_bh(&debug_log->lock); + va_start(args, fmt); + vscnprintf(debug_log_buf, sizeof(debug_log_buf), fmt, args); + va_end(args); + + for (p = debug_log_buf; *p != 0; p++) + emit_log_char(debug_log, *p); + + spin_unlock_bh(&debug_log->lock); + + wake_up(&debug_log->queue_wait); + + return 0; +} + +int debug_log(struct bat_priv *bat_priv, char *fmt, ...) 
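/*
 * Editor's note (illustrative sketch, not part of the patch): how the
 * debug log above wraps its buffer.  log_buff_len is a power of two, so
 * "idx & (len - 1)" is the cheap modulo used by LOG_BUFF(); when the
 * writer gets more than one buffer length ahead of the reader, the
 * oldest characters are dropped by advancing log_start.  Plain C, with
 * an assumed 16-byte buffer just for the demonstration.
 */
#define EX_LOG_LEN  16u                     /* must be a power of two */
#define EX_LOG_MASK (EX_LOG_LEN - 1)

struct ex_log {
        char buf[EX_LOG_LEN];
        unsigned long start;                /* next char the reader takes */
        unsigned long end;                  /* next slot the writer fills */
};

static void ex_log_putc(struct ex_log *log, char c)
{
        log->buf[log->end & EX_LOG_MASK] = c;
        log->end++;

        /* writer lapped the reader: drop the oldest data */
        if (log->end - log->start > EX_LOG_LEN)
                log->start = log->end - EX_LOG_LEN;
}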
+{ + va_list args; + char tmp_log_buf[256]; + + va_start(args, fmt); + vscnprintf(tmp_log_buf, sizeof(tmp_log_buf), fmt, args); + fdebug_log(bat_priv->debug_log, "[%10u] %s", + (jiffies / HZ), tmp_log_buf); + va_end(args); + + return 0; +} + +static int log_open(struct inode *inode, struct file *file) +{ + nonseekable_open(inode, file); + file->private_data = inode->i_private; + inc_module_count(); + return 0; +} + +static int log_release(struct inode *inode, struct file *file) +{ + dec_module_count(); + return 0; +} + +static ssize_t log_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct bat_priv *bat_priv = file->private_data; + struct debug_log *debug_log = bat_priv->debug_log; + int error, i = 0; + char c; + + if ((file->f_flags & O_NONBLOCK) && + !(debug_log->log_end - debug_log->log_start)) + return -EAGAIN; + + if ((!buf) || (count < 0)) + return -EINVAL; + + if (count == 0) + return 0; + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + error = wait_event_interruptible(debug_log->queue_wait, + (debug_log->log_start - debug_log->log_end)); + + if (error) + return error; + + spin_lock_bh(&debug_log->lock); + + while ((!error) && (i < count) && + (debug_log->log_start != debug_log->log_end)) { + c = LOG_BUFF(debug_log->log_start); + + debug_log->log_start++; + + spin_unlock_bh(&debug_log->lock); + + error = __put_user(c, buf); + + spin_lock_bh(&debug_log->lock); + + buf++; + i++; + + } + + spin_unlock_bh(&debug_log->lock); + + if (!error) + return i; + + return error; +} + +static unsigned int log_poll(struct file *file, poll_table *wait) +{ + struct bat_priv *bat_priv = file->private_data; + struct debug_log *debug_log = bat_priv->debug_log; + + poll_wait(file, &debug_log->queue_wait, wait); + + if (debug_log->log_end - debug_log->log_start) + return POLLIN | POLLRDNORM; + + return 0; +} + +static const struct file_operations log_fops = { + .open = log_open, + .release = log_release, + .read = log_read, + .poll = log_poll, + .llseek = no_llseek, +}; + +static int debug_log_setup(struct bat_priv *bat_priv) +{ + struct dentry *d; + + if (!bat_priv->debug_dir) + goto err; + + bat_priv->debug_log = kzalloc(sizeof(struct debug_log), GFP_ATOMIC); + if (!bat_priv->debug_log) + goto err; + + spin_lock_init(&bat_priv->debug_log->lock); + init_waitqueue_head(&bat_priv->debug_log->queue_wait); + + d = debugfs_create_file("log", S_IFREG | S_IRUSR, + bat_priv->debug_dir, bat_priv, &log_fops); + if (d) + goto err; + + return 0; + +err: + return 1; +} + +static void debug_log_cleanup(struct bat_priv *bat_priv) +{ + kfree(bat_priv->debug_log); + bat_priv->debug_log = NULL; +} +#else /* CONFIG_BATMAN_ADV_DEBUG */ +static int debug_log_setup(struct bat_priv *bat_priv) +{ + bat_priv->debug_log = NULL; + return 0; +} + +static void debug_log_cleanup(struct bat_priv *bat_priv) +{ + return; +} +#endif + +static int originators_open(struct inode *inode, struct file *file) +{ + struct net_device *net_dev = (struct net_device *)inode->i_private; + return single_open(file, orig_seq_print_text, net_dev); +} + +static int gateways_open(struct inode *inode, struct file *file) +{ + struct net_device *net_dev = (struct net_device *)inode->i_private; + return single_open(file, gw_client_seq_print_text, net_dev); +} + +static int softif_neigh_open(struct inode *inode, struct file *file) +{ + struct net_device *net_dev = (struct net_device *)inode->i_private; + return single_open(file, softif_neigh_seq_print_text, net_dev); +} + +static int 
transtable_global_open(struct inode *inode, struct file *file) +{ + struct net_device *net_dev = (struct net_device *)inode->i_private; + return single_open(file, tt_global_seq_print_text, net_dev); +} + +static int transtable_local_open(struct inode *inode, struct file *file) +{ + struct net_device *net_dev = (struct net_device *)inode->i_private; + return single_open(file, tt_local_seq_print_text, net_dev); +} + +static int vis_data_open(struct inode *inode, struct file *file) +{ + struct net_device *net_dev = (struct net_device *)inode->i_private; + return single_open(file, vis_seq_print_text, net_dev); +} + +struct bat_debuginfo { + struct attribute attr; + const struct file_operations fops; +}; + +#define BAT_DEBUGINFO(_name, _mode, _open) \ +struct bat_debuginfo bat_debuginfo_##_name = { \ + .attr = { .name = __stringify(_name), \ + .mode = _mode, }, \ + .fops = { .owner = THIS_MODULE, \ + .open = _open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ + } \ +}; + +static BAT_DEBUGINFO(originators, S_IRUGO, originators_open); +static BAT_DEBUGINFO(gateways, S_IRUGO, gateways_open); +static BAT_DEBUGINFO(softif_neigh, S_IRUGO, softif_neigh_open); +static BAT_DEBUGINFO(transtable_global, S_IRUGO, transtable_global_open); +static BAT_DEBUGINFO(transtable_local, S_IRUGO, transtable_local_open); +static BAT_DEBUGINFO(vis_data, S_IRUGO, vis_data_open); + +static struct bat_debuginfo *mesh_debuginfos[] = { + &bat_debuginfo_originators, + &bat_debuginfo_gateways, + &bat_debuginfo_softif_neigh, + &bat_debuginfo_transtable_global, + &bat_debuginfo_transtable_local, + &bat_debuginfo_vis_data, + NULL, +}; + +void debugfs_init(void) +{ + bat_debugfs = debugfs_create_dir(DEBUGFS_BAT_SUBDIR, NULL); + if (bat_debugfs == ERR_PTR(-ENODEV)) + bat_debugfs = NULL; +} + +void debugfs_destroy(void) +{ + if (bat_debugfs) { + debugfs_remove_recursive(bat_debugfs); + bat_debugfs = NULL; + } +} + +int debugfs_add_meshif(struct net_device *dev) +{ + struct bat_priv *bat_priv = netdev_priv(dev); + struct bat_debuginfo **bat_debug; + struct dentry *file; + + if (!bat_debugfs) + goto out; + + bat_priv->debug_dir = debugfs_create_dir(dev->name, bat_debugfs); + if (!bat_priv->debug_dir) + goto out; + + bat_socket_setup(bat_priv); + debug_log_setup(bat_priv); + + for (bat_debug = mesh_debuginfos; *bat_debug; ++bat_debug) { + file = debugfs_create_file(((*bat_debug)->attr).name, + S_IFREG | ((*bat_debug)->attr).mode, + bat_priv->debug_dir, + dev, &(*bat_debug)->fops); + if (!file) { + bat_err(dev, "Can't add debugfs file: %s/%s\n", + dev->name, ((*bat_debug)->attr).name); + goto rem_attr; + } + } + + return 0; +rem_attr: + debugfs_remove_recursive(bat_priv->debug_dir); + bat_priv->debug_dir = NULL; +out: +#ifdef CONFIG_DEBUG_FS + return -ENOMEM; +#else + return 0; +#endif /* CONFIG_DEBUG_FS */ +} + +void debugfs_del_meshif(struct net_device *dev) +{ + struct bat_priv *bat_priv = netdev_priv(dev); + + debug_log_cleanup(bat_priv); + + if (bat_debugfs) { + debugfs_remove_recursive(bat_priv->debug_dir); + bat_priv->debug_dir = NULL; + } +} diff --git a/net/batman-adv/bat_debugfs.h b/net/batman-adv/bat_debugfs.h new file mode 100644 index 00000000..bc9cda3f --- /dev/null +++ b/net/batman-adv/bat_debugfs.h @@ -0,0 +1,33 @@ +/* + * Copyright (C) 2010-2011 B.A.T.M.A.N. 
contributors: + * + * Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + + +#ifndef _NET_BATMAN_ADV_DEBUGFS_H_ +#define _NET_BATMAN_ADV_DEBUGFS_H_ + +#define DEBUGFS_BAT_SUBDIR "batman_adv" + +void debugfs_init(void); +void debugfs_destroy(void); +int debugfs_add_meshif(struct net_device *dev); +void debugfs_del_meshif(struct net_device *dev); + +#endif /* _NET_BATMAN_ADV_DEBUGFS_H_ */ diff --git a/net/batman-adv/bat_sysfs.c b/net/batman-adv/bat_sysfs.c new file mode 100644 index 00000000..497a0700 --- /dev/null +++ b/net/batman-adv/bat_sysfs.c @@ -0,0 +1,596 @@ +/* + * Copyright (C) 2010-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#include "main.h" +#include "bat_sysfs.h" +#include "translation-table.h" +#include "originator.h" +#include "hard-interface.h" +#include "gateway_common.h" +#include "gateway_client.h" +#include "vis.h" + +#define to_dev(obj) container_of(obj, struct device, kobj) +#define kobj_to_netdev(obj) to_net_dev(to_dev(obj->parent)) +#define kobj_to_batpriv(obj) netdev_priv(kobj_to_netdev(obj)) + +/* Use this, if you have customized show and store functions */ +#define BAT_ATTR(_name, _mode, _show, _store) \ +struct bat_attribute bat_attr_##_name = { \ + .attr = {.name = __stringify(_name), \ + .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +}; + +#define BAT_ATTR_STORE_BOOL(_name, _post_func) \ +ssize_t store_##_name(struct kobject *kobj, struct attribute *attr, \ + char *buff, size_t count) \ +{ \ + struct net_device *net_dev = kobj_to_netdev(kobj); \ + struct bat_priv *bat_priv = netdev_priv(net_dev); \ + return __store_bool_attr(buff, count, _post_func, attr, \ + &bat_priv->_name, net_dev); \ +} + +#define BAT_ATTR_SHOW_BOOL(_name) \ +ssize_t show_##_name(struct kobject *kobj, struct attribute *attr, \ + char *buff) \ +{ \ + struct bat_priv *bat_priv = kobj_to_batpriv(kobj); \ + return sprintf(buff, "%s\n", \ + atomic_read(&bat_priv->_name) == 0 ? 
\ + "disabled" : "enabled"); \ +} \ + +/* Use this, if you are going to turn a [name] in bat_priv on or off */ +#define BAT_ATTR_BOOL(_name, _mode, _post_func) \ + static BAT_ATTR_STORE_BOOL(_name, _post_func) \ + static BAT_ATTR_SHOW_BOOL(_name) \ + static BAT_ATTR(_name, _mode, show_##_name, store_##_name) + + +#define BAT_ATTR_STORE_UINT(_name, _min, _max, _post_func) \ +ssize_t store_##_name(struct kobject *kobj, struct attribute *attr, \ + char *buff, size_t count) \ +{ \ + struct net_device *net_dev = kobj_to_netdev(kobj); \ + struct bat_priv *bat_priv = netdev_priv(net_dev); \ + return __store_uint_attr(buff, count, _min, _max, _post_func, \ + attr, &bat_priv->_name, net_dev); \ +} + +#define BAT_ATTR_SHOW_UINT(_name) \ +ssize_t show_##_name(struct kobject *kobj, struct attribute *attr, \ + char *buff) \ +{ \ + struct bat_priv *bat_priv = kobj_to_batpriv(kobj); \ + return sprintf(buff, "%i\n", atomic_read(&bat_priv->_name)); \ +} \ + +/* Use this, if you are going to set [name] in bat_priv to unsigned integer + * values only */ +#define BAT_ATTR_UINT(_name, _mode, _min, _max, _post_func) \ + static BAT_ATTR_STORE_UINT(_name, _min, _max, _post_func) \ + static BAT_ATTR_SHOW_UINT(_name) \ + static BAT_ATTR(_name, _mode, show_##_name, store_##_name) + + +static int store_bool_attr(char *buff, size_t count, + struct net_device *net_dev, + char *attr_name, atomic_t *attr) +{ + int enabled = -1; + + if (buff[count - 1] == '\n') + buff[count - 1] = '\0'; + + if ((strncmp(buff, "1", 2) == 0) || + (strncmp(buff, "enable", 7) == 0) || + (strncmp(buff, "enabled", 8) == 0)) + enabled = 1; + + if ((strncmp(buff, "0", 2) == 0) || + (strncmp(buff, "disable", 8) == 0) || + (strncmp(buff, "disabled", 9) == 0)) + enabled = 0; + + if (enabled < 0) { + bat_info(net_dev, + "%s: Invalid parameter received: %s\n", + attr_name, buff); + return -EINVAL; + } + + if (atomic_read(attr) == enabled) + return count; + + bat_info(net_dev, "%s: Changing from: %s to: %s\n", attr_name, + atomic_read(attr) == 1 ? "enabled" : "disabled", + enabled == 1 ? 
"enabled" : "disabled"); + + atomic_set(attr, (unsigned)enabled); + return count; +} + +static inline ssize_t __store_bool_attr(char *buff, size_t count, + void (*post_func)(struct net_device *), + struct attribute *attr, + atomic_t *attr_store, struct net_device *net_dev) +{ + int ret; + + ret = store_bool_attr(buff, count, net_dev, (char *)attr->name, + attr_store); + if (post_func && ret) + post_func(net_dev); + + return ret; +} + +static int store_uint_attr(char *buff, size_t count, + struct net_device *net_dev, char *attr_name, + unsigned int min, unsigned int max, atomic_t *attr) +{ + unsigned long uint_val; + int ret; + + ret = strict_strtoul(buff, 10, &uint_val); + if (ret) { + bat_info(net_dev, + "%s: Invalid parameter received: %s\n", + attr_name, buff); + return -EINVAL; + } + + if (uint_val < min) { + bat_info(net_dev, "%s: Value is too small: %lu min: %u\n", + attr_name, uint_val, min); + return -EINVAL; + } + + if (uint_val > max) { + bat_info(net_dev, "%s: Value is too big: %lu max: %u\n", + attr_name, uint_val, max); + return -EINVAL; + } + + if (atomic_read(attr) == uint_val) + return count; + + bat_info(net_dev, "%s: Changing from: %i to: %lu\n", + attr_name, atomic_read(attr), uint_val); + + atomic_set(attr, uint_val); + return count; +} + +static inline ssize_t __store_uint_attr(char *buff, size_t count, + int min, int max, + void (*post_func)(struct net_device *), + struct attribute *attr, + atomic_t *attr_store, struct net_device *net_dev) +{ + int ret; + + ret = store_uint_attr(buff, count, net_dev, (char *)attr->name, + min, max, attr_store); + if (post_func && ret) + post_func(net_dev); + + return ret; +} + +static ssize_t show_vis_mode(struct kobject *kobj, struct attribute *attr, + char *buff) +{ + struct bat_priv *bat_priv = kobj_to_batpriv(kobj); + int vis_mode = atomic_read(&bat_priv->vis_mode); + + return sprintf(buff, "%s\n", + vis_mode == VIS_TYPE_CLIENT_UPDATE ? + "client" : "server"); +} + +static ssize_t store_vis_mode(struct kobject *kobj, struct attribute *attr, + char *buff, size_t count) +{ + struct net_device *net_dev = kobj_to_netdev(kobj); + struct bat_priv *bat_priv = netdev_priv(net_dev); + unsigned long val; + int ret, vis_mode_tmp = -1; + + ret = strict_strtoul(buff, 10, &val); + + if (((count == 2) && (!ret) && (val == VIS_TYPE_CLIENT_UPDATE)) || + (strncmp(buff, "client", 6) == 0) || + (strncmp(buff, "off", 3) == 0)) + vis_mode_tmp = VIS_TYPE_CLIENT_UPDATE; + + if (((count == 2) && (!ret) && (val == VIS_TYPE_SERVER_SYNC)) || + (strncmp(buff, "server", 6) == 0)) + vis_mode_tmp = VIS_TYPE_SERVER_SYNC; + + if (vis_mode_tmp < 0) { + if (buff[count - 1] == '\n') + buff[count - 1] = '\0'; + + bat_info(net_dev, + "Invalid parameter for 'vis mode' setting received: " + "%s\n", buff); + return -EINVAL; + } + + if (atomic_read(&bat_priv->vis_mode) == vis_mode_tmp) + return count; + + bat_info(net_dev, "Changing vis mode from: %s to: %s\n", + atomic_read(&bat_priv->vis_mode) == VIS_TYPE_CLIENT_UPDATE ? + "client" : "server", vis_mode_tmp == VIS_TYPE_CLIENT_UPDATE ? 
+ "client" : "server"); + + atomic_set(&bat_priv->vis_mode, (unsigned)vis_mode_tmp); + return count; +} + +static void post_gw_deselect(struct net_device *net_dev) +{ + struct bat_priv *bat_priv = netdev_priv(net_dev); + gw_deselect(bat_priv); +} + +static ssize_t show_gw_mode(struct kobject *kobj, struct attribute *attr, + char *buff) +{ + struct bat_priv *bat_priv = kobj_to_batpriv(kobj); + int bytes_written; + + switch (atomic_read(&bat_priv->gw_mode)) { + case GW_MODE_CLIENT: + bytes_written = sprintf(buff, "%s\n", GW_MODE_CLIENT_NAME); + break; + case GW_MODE_SERVER: + bytes_written = sprintf(buff, "%s\n", GW_MODE_SERVER_NAME); + break; + default: + bytes_written = sprintf(buff, "%s\n", GW_MODE_OFF_NAME); + break; + } + + return bytes_written; +} + +static ssize_t store_gw_mode(struct kobject *kobj, struct attribute *attr, + char *buff, size_t count) +{ + struct net_device *net_dev = kobj_to_netdev(kobj); + struct bat_priv *bat_priv = netdev_priv(net_dev); + char *curr_gw_mode_str; + int gw_mode_tmp = -1; + + if (buff[count - 1] == '\n') + buff[count - 1] = '\0'; + + if (strncmp(buff, GW_MODE_OFF_NAME, strlen(GW_MODE_OFF_NAME)) == 0) + gw_mode_tmp = GW_MODE_OFF; + + if (strncmp(buff, GW_MODE_CLIENT_NAME, + strlen(GW_MODE_CLIENT_NAME)) == 0) + gw_mode_tmp = GW_MODE_CLIENT; + + if (strncmp(buff, GW_MODE_SERVER_NAME, + strlen(GW_MODE_SERVER_NAME)) == 0) + gw_mode_tmp = GW_MODE_SERVER; + + if (gw_mode_tmp < 0) { + bat_info(net_dev, + "Invalid parameter for 'gw mode' setting received: " + "%s\n", buff); + return -EINVAL; + } + + if (atomic_read(&bat_priv->gw_mode) == gw_mode_tmp) + return count; + + switch (atomic_read(&bat_priv->gw_mode)) { + case GW_MODE_CLIENT: + curr_gw_mode_str = GW_MODE_CLIENT_NAME; + break; + case GW_MODE_SERVER: + curr_gw_mode_str = GW_MODE_SERVER_NAME; + break; + default: + curr_gw_mode_str = GW_MODE_OFF_NAME; + break; + } + + bat_info(net_dev, "Changing gw mode from: %s to: %s\n", + curr_gw_mode_str, buff); + + gw_deselect(bat_priv); + atomic_set(&bat_priv->gw_mode, (unsigned)gw_mode_tmp); + return count; +} + +static ssize_t show_gw_bwidth(struct kobject *kobj, struct attribute *attr, + char *buff) +{ + struct bat_priv *bat_priv = kobj_to_batpriv(kobj); + int down, up; + int gw_bandwidth = atomic_read(&bat_priv->gw_bandwidth); + + gw_bandwidth_to_kbit(gw_bandwidth, &down, &up); + return sprintf(buff, "%i%s/%i%s\n", + (down > 2048 ? down / 1024 : down), + (down > 2048 ? "MBit" : "KBit"), + (up > 2048 ? up / 1024 : up), + (up > 2048 ? 
"MBit" : "KBit")); +} + +static ssize_t store_gw_bwidth(struct kobject *kobj, struct attribute *attr, + char *buff, size_t count) +{ + struct net_device *net_dev = kobj_to_netdev(kobj); + + if (buff[count - 1] == '\n') + buff[count - 1] = '\0'; + + return gw_bandwidth_set(net_dev, buff, count); +} + +BAT_ATTR_BOOL(aggregated_ogms, S_IRUGO | S_IWUSR, NULL); +BAT_ATTR_BOOL(bonding, S_IRUGO | S_IWUSR, NULL); +BAT_ATTR_BOOL(fragmentation, S_IRUGO | S_IWUSR, update_min_mtu); +static BAT_ATTR(vis_mode, S_IRUGO | S_IWUSR, show_vis_mode, store_vis_mode); +static BAT_ATTR(gw_mode, S_IRUGO | S_IWUSR, show_gw_mode, store_gw_mode); +BAT_ATTR_UINT(orig_interval, S_IRUGO | S_IWUSR, 2 * JITTER, INT_MAX, NULL); +BAT_ATTR_UINT(hop_penalty, S_IRUGO | S_IWUSR, 0, TQ_MAX_VALUE, NULL); +BAT_ATTR_UINT(gw_sel_class, S_IRUGO | S_IWUSR, 1, TQ_MAX_VALUE, + post_gw_deselect); +static BAT_ATTR(gw_bandwidth, S_IRUGO | S_IWUSR, show_gw_bwidth, + store_gw_bwidth); +#ifdef CONFIG_BATMAN_ADV_DEBUG +BAT_ATTR_UINT(log_level, S_IRUGO | S_IWUSR, 0, 3, NULL); +#endif + +static struct bat_attribute *mesh_attrs[] = { + &bat_attr_aggregated_ogms, + &bat_attr_bonding, + &bat_attr_fragmentation, + &bat_attr_vis_mode, + &bat_attr_gw_mode, + &bat_attr_orig_interval, + &bat_attr_hop_penalty, + &bat_attr_gw_sel_class, + &bat_attr_gw_bandwidth, +#ifdef CONFIG_BATMAN_ADV_DEBUG + &bat_attr_log_level, +#endif + NULL, +}; + +int sysfs_add_meshif(struct net_device *dev) +{ + struct kobject *batif_kobject = &dev->dev.kobj; + struct bat_priv *bat_priv = netdev_priv(dev); + struct bat_attribute **bat_attr; + int err; + + bat_priv->mesh_obj = kobject_create_and_add(SYSFS_IF_MESH_SUBDIR, + batif_kobject); + if (!bat_priv->mesh_obj) { + bat_err(dev, "Can't add sysfs directory: %s/%s\n", dev->name, + SYSFS_IF_MESH_SUBDIR); + goto out; + } + + for (bat_attr = mesh_attrs; *bat_attr; ++bat_attr) { + err = sysfs_create_file(bat_priv->mesh_obj, + &((*bat_attr)->attr)); + if (err) { + bat_err(dev, "Can't add sysfs file: %s/%s/%s\n", + dev->name, SYSFS_IF_MESH_SUBDIR, + ((*bat_attr)->attr).name); + goto rem_attr; + } + } + + return 0; + +rem_attr: + for (bat_attr = mesh_attrs; *bat_attr; ++bat_attr) + sysfs_remove_file(bat_priv->mesh_obj, &((*bat_attr)->attr)); + + kobject_put(bat_priv->mesh_obj); + bat_priv->mesh_obj = NULL; +out: + return -ENOMEM; +} + +void sysfs_del_meshif(struct net_device *dev) +{ + struct bat_priv *bat_priv = netdev_priv(dev); + struct bat_attribute **bat_attr; + + for (bat_attr = mesh_attrs; *bat_attr; ++bat_attr) + sysfs_remove_file(bat_priv->mesh_obj, &((*bat_attr)->attr)); + + kobject_put(bat_priv->mesh_obj); + bat_priv->mesh_obj = NULL; +} + +static ssize_t show_mesh_iface(struct kobject *kobj, struct attribute *attr, + char *buff) +{ + struct net_device *net_dev = kobj_to_netdev(kobj); + struct hard_iface *hard_iface = hardif_get_by_netdev(net_dev); + ssize_t length; + + if (!hard_iface) + return 0; + + length = sprintf(buff, "%s\n", hard_iface->if_status == IF_NOT_IN_USE ? 
+ "none" : hard_iface->soft_iface->name); + + hardif_free_ref(hard_iface); + + return length; +} + +static ssize_t store_mesh_iface(struct kobject *kobj, struct attribute *attr, + char *buff, size_t count) +{ + struct net_device *net_dev = kobj_to_netdev(kobj); + struct hard_iface *hard_iface = hardif_get_by_netdev(net_dev); + int status_tmp = -1; + int ret = count; + + if (!hard_iface) + return count; + + if (buff[count - 1] == '\n') + buff[count - 1] = '\0'; + + if (strlen(buff) >= IFNAMSIZ) { + pr_err("Invalid parameter for 'mesh_iface' setting received: " + "interface name too long '%s'\n", buff); + hardif_free_ref(hard_iface); + return -EINVAL; + } + + if (strncmp(buff, "none", 4) == 0) + status_tmp = IF_NOT_IN_USE; + else + status_tmp = IF_I_WANT_YOU; + + if (hard_iface->if_status == status_tmp) + goto out; + + if ((hard_iface->soft_iface) && + (strncmp(hard_iface->soft_iface->name, buff, IFNAMSIZ) == 0)) + goto out; + + if (!rtnl_trylock()) { + ret = -ERESTARTSYS; + goto out; + } + + if (status_tmp == IF_NOT_IN_USE) { + hardif_disable_interface(hard_iface); + goto unlock; + } + + /* if the interface already is in use */ + if (hard_iface->if_status != IF_NOT_IN_USE) + hardif_disable_interface(hard_iface); + + ret = hardif_enable_interface(hard_iface, buff); + +unlock: + rtnl_unlock(); +out: + hardif_free_ref(hard_iface); + return ret; +} + +static ssize_t show_iface_status(struct kobject *kobj, struct attribute *attr, + char *buff) +{ + struct net_device *net_dev = kobj_to_netdev(kobj); + struct hard_iface *hard_iface = hardif_get_by_netdev(net_dev); + ssize_t length; + + if (!hard_iface) + return 0; + + switch (hard_iface->if_status) { + case IF_TO_BE_REMOVED: + length = sprintf(buff, "disabling\n"); + break; + case IF_INACTIVE: + length = sprintf(buff, "inactive\n"); + break; + case IF_ACTIVE: + length = sprintf(buff, "active\n"); + break; + case IF_TO_BE_ACTIVATED: + length = sprintf(buff, "enabling\n"); + break; + case IF_NOT_IN_USE: + default: + length = sprintf(buff, "not in use\n"); + break; + } + + hardif_free_ref(hard_iface); + + return length; +} + +static BAT_ATTR(mesh_iface, S_IRUGO | S_IWUSR, + show_mesh_iface, store_mesh_iface); +static BAT_ATTR(iface_status, S_IRUGO, show_iface_status, NULL); + +static struct bat_attribute *batman_attrs[] = { + &bat_attr_mesh_iface, + &bat_attr_iface_status, + NULL, +}; + +int sysfs_add_hardif(struct kobject **hardif_obj, struct net_device *dev) +{ + struct kobject *hardif_kobject = &dev->dev.kobj; + struct bat_attribute **bat_attr; + int err; + + *hardif_obj = kobject_create_and_add(SYSFS_IF_BAT_SUBDIR, + hardif_kobject); + + if (!*hardif_obj) { + bat_err(dev, "Can't add sysfs directory: %s/%s\n", dev->name, + SYSFS_IF_BAT_SUBDIR); + goto out; + } + + for (bat_attr = batman_attrs; *bat_attr; ++bat_attr) { + err = sysfs_create_file(*hardif_obj, &((*bat_attr)->attr)); + if (err) { + bat_err(dev, "Can't add sysfs file: %s/%s/%s\n", + dev->name, SYSFS_IF_BAT_SUBDIR, + ((*bat_attr)->attr).name); + goto rem_attr; + } + } + + return 0; + +rem_attr: + for (bat_attr = batman_attrs; *bat_attr; ++bat_attr) + sysfs_remove_file(*hardif_obj, &((*bat_attr)->attr)); +out: + return -ENOMEM; +} + +void sysfs_del_hardif(struct kobject **hardif_obj) +{ + kobject_put(*hardif_obj); + *hardif_obj = NULL; +} diff --git a/net/batman-adv/bat_sysfs.h b/net/batman-adv/bat_sysfs.h new file mode 100644 index 00000000..02f1fa7a --- /dev/null +++ b/net/batman-adv/bat_sysfs.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2010-2011 B.A.T.M.A.N. 
contributors: + * + * Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + + +#ifndef _NET_BATMAN_ADV_SYSFS_H_ +#define _NET_BATMAN_ADV_SYSFS_H_ + +#define SYSFS_IF_MESH_SUBDIR "mesh" +#define SYSFS_IF_BAT_SUBDIR "batman_adv" + +struct bat_attribute { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, struct attribute *attr, + char *buf); + ssize_t (*store)(struct kobject *kobj, struct attribute *attr, + char *buf, size_t count); +}; + +int sysfs_add_meshif(struct net_device *dev); +void sysfs_del_meshif(struct net_device *dev); +int sysfs_add_hardif(struct kobject **hardif_obj, struct net_device *dev); +void sysfs_del_hardif(struct kobject **hardif_obj); + +#endif /* _NET_BATMAN_ADV_SYSFS_H_ */ diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c new file mode 100644 index 00000000..ad2ca925 --- /dev/null +++ b/net/batman-adv/bitarray.c @@ -0,0 +1,201 @@ +/* + * Copyright (C) 2006-2011 B.A.T.M.A.N. contributors: + * + * Simon Wunderlich, Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#include "main.h" +#include "bitarray.h" + +#include + +/* returns true if the corresponding bit in the given seq_bits indicates true + * and curr_seqno is within range of last_seqno */ +uint8_t get_bit_status(unsigned long *seq_bits, uint32_t last_seqno, + uint32_t curr_seqno) +{ + int32_t diff, word_offset, word_num; + + diff = last_seqno - curr_seqno; + if (diff < 0 || diff >= TQ_LOCAL_WINDOW_SIZE) { + return 0; + } else { + /* which word */ + word_num = (last_seqno - curr_seqno) / WORD_BIT_SIZE; + /* which position in the selected word */ + word_offset = (last_seqno - curr_seqno) % WORD_BIT_SIZE; + + if (test_bit(word_offset, &seq_bits[word_num])) + return 1; + else + return 0; + } +} + +/* turn corresponding bit on, so we can remember that we got the packet */ +void bit_mark(unsigned long *seq_bits, int32_t n) +{ + int32_t word_offset, word_num; + + /* if too old, just drop it */ + if (n < 0 || n >= TQ_LOCAL_WINDOW_SIZE) + return; + + /* which word */ + word_num = n / WORD_BIT_SIZE; + /* which position in the selected word */ + word_offset = n % WORD_BIT_SIZE; + + set_bit(word_offset, &seq_bits[word_num]); /* turn the position on */ +} + +/* shift the packet array by n places. 
*/ +static void bit_shift(unsigned long *seq_bits, int32_t n) +{ + int32_t word_offset, word_num; + int32_t i; + + if (n <= 0 || n >= TQ_LOCAL_WINDOW_SIZE) + return; + + word_offset = n % WORD_BIT_SIZE;/* shift how much inside each word */ + word_num = n / WORD_BIT_SIZE; /* shift over how much (full) words */ + + for (i = NUM_WORDS - 1; i > word_num; i--) { + /* going from old to new, so we don't overwrite the data we copy + * from. + * + * left is high, right is low: FEDC BA98 7654 3210 + * ^^ ^^ + * vvvv + * ^^^^ = from, vvvvv =to, we'd have word_num==1 and + * word_offset==WORD_BIT_SIZE/2 ????? in this example. + * (=24 bits) + * + * our desired output would be: 9876 5432 1000 0000 + * */ + + seq_bits[i] = + (seq_bits[i - word_num] << word_offset) + + /* take the lower port from the left half, shift it left + * to its final position */ + (seq_bits[i - word_num - 1] >> + (WORD_BIT_SIZE-word_offset)); + /* and the upper part of the right half and shift it left to + * it's position */ + /* for our example that would be: word[0] = 9800 + 0076 = + * 9876 */ + } + /* now for our last word, i==word_num, we only have the it's "left" + * half. that's the 1000 word in our example.*/ + + seq_bits[i] = (seq_bits[i - word_num] << word_offset); + + /* pad the rest with 0, if there is anything */ + i--; + + for (; i >= 0; i--) + seq_bits[i] = 0; +} + +static void bit_reset_window(unsigned long *seq_bits) +{ + int i; + for (i = 0; i < NUM_WORDS; i++) + seq_bits[i] = 0; +} + + +/* receive and process one packet within the sequence number window. + * + * returns: + * 1 if the window was moved (either new or very old) + * 0 if the window was not moved/shifted. + */ +char bit_get_packet(void *priv, unsigned long *seq_bits, + int32_t seq_num_diff, int8_t set_mark) +{ + struct bat_priv *bat_priv = (struct bat_priv *)priv; + + /* sequence number is slightly older. We already got a sequence number + * higher than this one, so we just mark it. */ + + if ((seq_num_diff <= 0) && (seq_num_diff > -TQ_LOCAL_WINDOW_SIZE)) { + if (set_mark) + bit_mark(seq_bits, -seq_num_diff); + return 0; + } + + /* sequence number is slightly newer, so we shift the window and + * set the mark if required */ + + if ((seq_num_diff > 0) && (seq_num_diff < TQ_LOCAL_WINDOW_SIZE)) { + bit_shift(seq_bits, seq_num_diff); + + if (set_mark) + bit_mark(seq_bits, 0); + return 1; + } + + /* sequence number is much newer, probably missed a lot of packets */ + + if ((seq_num_diff >= TQ_LOCAL_WINDOW_SIZE) + || (seq_num_diff < EXPECTED_SEQNO_RANGE)) { + bat_dbg(DBG_BATMAN, bat_priv, + "We missed a lot of packets (%i) !\n", + seq_num_diff - 1); + bit_reset_window(seq_bits); + if (set_mark) + bit_mark(seq_bits, 0); + return 1; + } + + /* received a much older packet. The other host either restarted + * or the old packet got delayed somewhere in the network. The + * packet should be dropped without calling this function if the + * seqno window is protected. */ + + if ((seq_num_diff <= -TQ_LOCAL_WINDOW_SIZE) + || (seq_num_diff >= EXPECTED_SEQNO_RANGE)) { + + bat_dbg(DBG_BATMAN, bat_priv, + "Other host probably restarted!\n"); + + bit_reset_window(seq_bits); + if (set_mark) + bit_mark(seq_bits, 0); + + return 1; + } + + /* never reached */ + return 0; +} + +/* count the hamming weight, how many good packets did we receive? just count + * the 1's. 
+ */ +int bit_packet_count(unsigned long *seq_bits) +{ + int i, hamming = 0; + + for (i = 0; i < NUM_WORDS; i++) + hamming += hweight_long(seq_bits[i]); + + return hamming; +} diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h new file mode 100644 index 00000000..769c246d --- /dev/null +++ b/net/batman-adv/bitarray.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2006-2011 B.A.T.M.A.N. contributors: + * + * Simon Wunderlich, Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#ifndef _NET_BATMAN_ADV_BITARRAY_H_ +#define _NET_BATMAN_ADV_BITARRAY_H_ + +#define WORD_BIT_SIZE (sizeof(unsigned long) * 8) + +/* returns true if the corresponding bit in the given seq_bits indicates true + * and curr_seqno is within range of last_seqno */ +uint8_t get_bit_status(unsigned long *seq_bits, uint32_t last_seqno, + uint32_t curr_seqno); + +/* turn corresponding bit on, so we can remember that we got the packet */ +void bit_mark(unsigned long *seq_bits, int32_t n); + + +/* receive and process one packet, returns 1 if received seq_num is considered + * new, 0 if old */ +char bit_get_packet(void *priv, unsigned long *seq_bits, + int32_t seq_num_diff, int8_t set_mark); + +/* count the hamming weight, how many good packets did we receive? */ +int bit_packet_count(unsigned long *seq_bits); + +#endif /* _NET_BATMAN_ADV_BITARRAY_H_ */ diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c new file mode 100644 index 00000000..61605a0f --- /dev/null +++ b/net/batman-adv/gateway_client.c @@ -0,0 +1,561 @@ +/* + * Copyright (C) 2009-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#include "main.h" +#include "gateway_client.h" +#include "gateway_common.h" +#include "hard-interface.h" +#include "originator.h" +#include +#include +#include +#include + +static void gw_node_free_ref(struct gw_node *gw_node) +{ + if (atomic_dec_and_test(&gw_node->refcount)) + kfree_rcu(gw_node, rcu); +} + +static struct gw_node *gw_get_selected_gw_node(struct bat_priv *bat_priv) +{ + struct gw_node *gw_node; + + rcu_read_lock(); + gw_node = rcu_dereference(bat_priv->curr_gw); + if (!gw_node) + goto out; + + if (!atomic_inc_not_zero(&gw_node->refcount)) + gw_node = NULL; + +out: + rcu_read_unlock(); + return gw_node; +} + +struct orig_node *gw_get_selected_orig(struct bat_priv *bat_priv) +{ + struct gw_node *gw_node; + struct orig_node *orig_node = NULL; + + gw_node = gw_get_selected_gw_node(bat_priv); + if (!gw_node) + goto out; + + rcu_read_lock(); + orig_node = gw_node->orig_node; + if (!orig_node) + goto unlock; + + if (!atomic_inc_not_zero(&orig_node->refcount)) + orig_node = NULL; + +unlock: + rcu_read_unlock(); +out: + if (gw_node) + gw_node_free_ref(gw_node); + return orig_node; +} + +static void gw_select(struct bat_priv *bat_priv, struct gw_node *new_gw_node) +{ + struct gw_node *curr_gw_node; + + spin_lock_bh(&bat_priv->gw_list_lock); + + if (new_gw_node && !atomic_inc_not_zero(&new_gw_node->refcount)) + new_gw_node = NULL; + + curr_gw_node = bat_priv->curr_gw; + rcu_assign_pointer(bat_priv->curr_gw, new_gw_node); + + if (curr_gw_node) + gw_node_free_ref(curr_gw_node); + + spin_unlock_bh(&bat_priv->gw_list_lock); +} + +void gw_deselect(struct bat_priv *bat_priv) +{ + gw_select(bat_priv, NULL); +} + +void gw_election(struct bat_priv *bat_priv) +{ + struct hlist_node *node; + struct gw_node *gw_node, *curr_gw = NULL, *curr_gw_tmp = NULL; + struct neigh_node *router; + uint8_t max_tq = 0; + uint32_t max_gw_factor = 0, tmp_gw_factor = 0; + int down, up; + + /** + * The batman daemon checks here if we already passed a full originator + * cycle in order to make sure we don't choose the first gateway we + * hear about. This check is based on the daemon's uptime which we + * don't have. 
+ **/ + if (atomic_read(&bat_priv->gw_mode) != GW_MODE_CLIENT) + return; + + curr_gw = gw_get_selected_gw_node(bat_priv); + if (curr_gw) + goto out; + + rcu_read_lock(); + if (hlist_empty(&bat_priv->gw_list)) { + bat_dbg(DBG_BATMAN, bat_priv, + "Removing selected gateway - " + "no gateway in range\n"); + gw_deselect(bat_priv); + goto unlock; + } + + hlist_for_each_entry_rcu(gw_node, node, &bat_priv->gw_list, list) { + if (gw_node->deleted) + continue; + + router = orig_node_get_router(gw_node->orig_node); + if (!router) + continue; + + switch (atomic_read(&bat_priv->gw_sel_class)) { + case 1: /* fast connection */ + gw_bandwidth_to_kbit(gw_node->orig_node->gw_flags, + &down, &up); + + tmp_gw_factor = (router->tq_avg * router->tq_avg * + down * 100 * 100) / + (TQ_LOCAL_WINDOW_SIZE * + TQ_LOCAL_WINDOW_SIZE * 64); + + if ((tmp_gw_factor > max_gw_factor) || + ((tmp_gw_factor == max_gw_factor) && + (router->tq_avg > max_tq))) + curr_gw_tmp = gw_node; + break; + + default: /** + * 2: stable connection (use best statistic) + * 3: fast-switch (use best statistic but change as + * soon as a better gateway appears) + * XX: late-switch (use best statistic but change as + * soon as a better gateway appears which has + * $routing_class more tq points) + **/ + if (router->tq_avg > max_tq) + curr_gw_tmp = gw_node; + break; + } + + if (router->tq_avg > max_tq) + max_tq = router->tq_avg; + + if (tmp_gw_factor > max_gw_factor) + max_gw_factor = tmp_gw_factor; + + neigh_node_free_ref(router); + } + + if (curr_gw != curr_gw_tmp) { + router = orig_node_get_router(curr_gw_tmp->orig_node); + if (!router) + goto unlock; + + if ((curr_gw) && (!curr_gw_tmp)) + bat_dbg(DBG_BATMAN, bat_priv, + "Removing selected gateway - " + "no gateway in range\n"); + else if ((!curr_gw) && (curr_gw_tmp)) + bat_dbg(DBG_BATMAN, bat_priv, + "Adding route to gateway %pM " + "(gw_flags: %i, tq: %i)\n", + curr_gw_tmp->orig_node->orig, + curr_gw_tmp->orig_node->gw_flags, + router->tq_avg); + else + bat_dbg(DBG_BATMAN, bat_priv, + "Changing route to gateway %pM " + "(gw_flags: %i, tq: %i)\n", + curr_gw_tmp->orig_node->orig, + curr_gw_tmp->orig_node->gw_flags, + router->tq_avg); + + neigh_node_free_ref(router); + gw_select(bat_priv, curr_gw_tmp); + } + +unlock: + rcu_read_unlock(); +out: + if (curr_gw) + gw_node_free_ref(curr_gw); +} + +void gw_check_election(struct bat_priv *bat_priv, struct orig_node *orig_node) +{ + struct orig_node *curr_gw_orig; + struct neigh_node *router_gw = NULL, *router_orig = NULL; + uint8_t gw_tq_avg, orig_tq_avg; + + curr_gw_orig = gw_get_selected_orig(bat_priv); + if (!curr_gw_orig) + goto deselect; + + router_gw = orig_node_get_router(curr_gw_orig); + if (!router_gw) + goto deselect; + + /* this node already is the gateway */ + if (curr_gw_orig == orig_node) + goto out; + + router_orig = orig_node_get_router(orig_node); + if (!router_orig) + goto out; + + gw_tq_avg = router_gw->tq_avg; + orig_tq_avg = router_orig->tq_avg; + + /* the TQ value has to be better */ + if (orig_tq_avg < gw_tq_avg) + goto out; + + /** + * if the routing class is greater than 3 the value tells us how much + * greater the TQ value of the new gateway must be + **/ + if ((atomic_read(&bat_priv->gw_sel_class) > 3) && + (orig_tq_avg - gw_tq_avg < atomic_read(&bat_priv->gw_sel_class))) + goto out; + + bat_dbg(DBG_BATMAN, bat_priv, + "Restarting gateway selection: better gateway found (tq curr: " + "%i, tq new: %i)\n", + gw_tq_avg, orig_tq_avg); + +deselect: + gw_deselect(bat_priv); +out: + if (curr_gw_orig) + 
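/*
 * Editor's note (illustrative sketch, not part of the patch): the
 * "fast connection" metric computed in gw_election() above (gw_sel_class
 * case 1).  It weights the squared link quality (tq_avg) by the
 * advertised downlink bandwidth.  TQ_LOCAL_WINDOW_SIZE is taken as 64
 * here purely as an assumption for the example; the in-tree constant is
 * defined elsewhere in this patch.
 */
#include <stdint.h>

#define EX_TQ_LOCAL_WINDOW_SIZE 64   /* assumed value, for illustration */

static uint32_t ex_gw_factor(uint8_t tq_avg, uint32_t down_kbit)
{
        /* 64-bit intermediate so the standalone sketch cannot overflow */
        uint64_t f = (uint64_t)tq_avg * tq_avg * down_kbit * 100 * 100;

        return (uint32_t)(f / (EX_TQ_LOCAL_WINDOW_SIZE *
                               EX_TQ_LOCAL_WINDOW_SIZE * 64));
}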
orig_node_free_ref(curr_gw_orig); + if (router_gw) + neigh_node_free_ref(router_gw); + if (router_orig) + neigh_node_free_ref(router_orig); + + return; +} + +static void gw_node_add(struct bat_priv *bat_priv, + struct orig_node *orig_node, uint8_t new_gwflags) +{ + struct gw_node *gw_node; + int down, up; + + gw_node = kmalloc(sizeof(struct gw_node), GFP_ATOMIC); + if (!gw_node) + return; + + memset(gw_node, 0, sizeof(struct gw_node)); + INIT_HLIST_NODE(&gw_node->list); + gw_node->orig_node = orig_node; + atomic_set(&gw_node->refcount, 1); + + spin_lock_bh(&bat_priv->gw_list_lock); + hlist_add_head_rcu(&gw_node->list, &bat_priv->gw_list); + spin_unlock_bh(&bat_priv->gw_list_lock); + + gw_bandwidth_to_kbit(new_gwflags, &down, &up); + bat_dbg(DBG_BATMAN, bat_priv, + "Found new gateway %pM -> gw_class: %i - %i%s/%i%s\n", + orig_node->orig, new_gwflags, + (down > 2048 ? down / 1024 : down), + (down > 2048 ? "MBit" : "KBit"), + (up > 2048 ? up / 1024 : up), + (up > 2048 ? "MBit" : "KBit")); +} + +void gw_node_update(struct bat_priv *bat_priv, + struct orig_node *orig_node, uint8_t new_gwflags) +{ + struct hlist_node *node; + struct gw_node *gw_node, *curr_gw; + + /** + * Note: We don't need a NULL check here, since curr_gw never gets + * dereferenced. If curr_gw is NULL we also should not exit as we may + * have this gateway in our list (duplication check!) even though we + * have no currently selected gateway. + */ + curr_gw = gw_get_selected_gw_node(bat_priv); + + rcu_read_lock(); + hlist_for_each_entry_rcu(gw_node, node, &bat_priv->gw_list, list) { + if (gw_node->orig_node != orig_node) + continue; + + bat_dbg(DBG_BATMAN, bat_priv, + "Gateway class of originator %pM changed from " + "%i to %i\n", + orig_node->orig, gw_node->orig_node->gw_flags, + new_gwflags); + + gw_node->deleted = 0; + + if (new_gwflags == 0) { + gw_node->deleted = jiffies; + bat_dbg(DBG_BATMAN, bat_priv, + "Gateway %pM removed from gateway list\n", + orig_node->orig); + + if (gw_node == curr_gw) + goto deselect; + } + + goto unlock; + } + + if (new_gwflags == 0) + goto unlock; + + gw_node_add(bat_priv, orig_node, new_gwflags); + goto unlock; + +deselect: + gw_deselect(bat_priv); +unlock: + rcu_read_unlock(); + + if (curr_gw) + gw_node_free_ref(curr_gw); +} + +void gw_node_delete(struct bat_priv *bat_priv, struct orig_node *orig_node) +{ + return gw_node_update(bat_priv, orig_node, 0); +} + +void gw_node_purge(struct bat_priv *bat_priv) +{ + struct gw_node *gw_node, *curr_gw; + struct hlist_node *node, *node_tmp; + unsigned long timeout = 2 * PURGE_TIMEOUT * HZ; + char do_deselect = 0; + + curr_gw = gw_get_selected_gw_node(bat_priv); + + spin_lock_bh(&bat_priv->gw_list_lock); + + hlist_for_each_entry_safe(gw_node, node, node_tmp, + &bat_priv->gw_list, list) { + if (((!gw_node->deleted) || + (time_before(jiffies, gw_node->deleted + timeout))) && + atomic_read(&bat_priv->mesh_state) == MESH_ACTIVE) + continue; + + if (curr_gw == gw_node) + do_deselect = 1; + + hlist_del_rcu(&gw_node->list); + gw_node_free_ref(gw_node); + } + + spin_unlock_bh(&bat_priv->gw_list_lock); + + /* gw_deselect() needs to acquire the gw_list_lock */ + if (do_deselect) + gw_deselect(bat_priv); + + if (curr_gw) + gw_node_free_ref(curr_gw); +} + +/** + * fails if orig_node has no router + */ +static int _write_buffer_text(struct bat_priv *bat_priv, + struct seq_file *seq, struct gw_node *gw_node) +{ + struct gw_node *curr_gw; + struct neigh_node *router; + int down, up, ret = -1; + + gw_bandwidth_to_kbit(gw_node->orig_node->gw_flags, &down, &up); + + 
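/*
 * Editor's note (illustrative sketch, not part of the patch): the
 * reference-counting idiom used throughout this file
 * (gw_node_free_ref(), gw_get_selected_gw_node(), ...): a reference is
 * only taken if the count is still non-zero, and the last put frees the
 * object.  C11 atomics stand in for the kernel's atomic_t, and a plain
 * free() stands in for the deferred kfree_rcu().
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

struct ex_node {
        atomic_int refcount;
        /* payload ... */
};

/* mirrors atomic_inc_not_zero(): grab a reference unless the object is
 * already on its way to being freed */
static bool ex_node_get(struct ex_node *n)
{
        int old = atomic_load(&n->refcount);

        while (old != 0) {
                if (atomic_compare_exchange_weak(&n->refcount, &old, old + 1))
                        return true;
        }
        return false;
}

/* mirrors gw_node_free_ref(): dropping the last reference frees the node */
static void ex_node_put(struct ex_node *n)
{
        if (atomic_fetch_sub(&n->refcount, 1) == 1)
                free(n);
}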
router = orig_node_get_router(gw_node->orig_node); + if (!router) + goto out; + + curr_gw = gw_get_selected_gw_node(bat_priv); + + ret = seq_printf(seq, "%s %pM (%3i) %pM [%10s]: %3i - %i%s/%i%s\n", + (curr_gw == gw_node ? "=>" : " "), + gw_node->orig_node->orig, + router->tq_avg, router->addr, + router->if_incoming->net_dev->name, + gw_node->orig_node->gw_flags, + (down > 2048 ? down / 1024 : down), + (down > 2048 ? "MBit" : "KBit"), + (up > 2048 ? up / 1024 : up), + (up > 2048 ? "MBit" : "KBit")); + + neigh_node_free_ref(router); + if (curr_gw) + gw_node_free_ref(curr_gw); +out: + return ret; +} + +int gw_client_seq_print_text(struct seq_file *seq, void *offset) +{ + struct net_device *net_dev = (struct net_device *)seq->private; + struct bat_priv *bat_priv = netdev_priv(net_dev); + struct hard_iface *primary_if; + struct gw_node *gw_node; + struct hlist_node *node; + int gw_count = 0, ret = 0; + + primary_if = primary_if_get_selected(bat_priv); + if (!primary_if) { + ret = seq_printf(seq, "BATMAN mesh %s disabled - please " + "specify interfaces to enable it\n", + net_dev->name); + goto out; + } + + if (primary_if->if_status != IF_ACTIVE) { + ret = seq_printf(seq, "BATMAN mesh %s disabled - " + "primary interface not active\n", + net_dev->name); + goto out; + } + + seq_printf(seq, " %-12s (%s/%i) %17s [%10s]: gw_class ... " + "[B.A.T.M.A.N. adv %s%s, MainIF/MAC: %s/%pM (%s)]\n", + "Gateway", "#", TQ_MAX_VALUE, "Nexthop", + "outgoingIF", SOURCE_VERSION, REVISION_VERSION_STR, + primary_if->net_dev->name, + primary_if->net_dev->dev_addr, net_dev->name); + + rcu_read_lock(); + hlist_for_each_entry_rcu(gw_node, node, &bat_priv->gw_list, list) { + if (gw_node->deleted) + continue; + + /* fails if orig_node has no router */ + if (_write_buffer_text(bat_priv, seq, gw_node) < 0) + continue; + + gw_count++; + } + rcu_read_unlock(); + + if (gw_count == 0) + seq_printf(seq, "No gateways in range ...\n"); + +out: + if (primary_if) + hardif_free_ref(primary_if); + return ret; +} + +int gw_is_target(struct bat_priv *bat_priv, struct sk_buff *skb) +{ + struct ethhdr *ethhdr; + struct iphdr *iphdr; + struct ipv6hdr *ipv6hdr; + struct udphdr *udphdr; + struct gw_node *curr_gw; + unsigned int header_len = 0; + + if (atomic_read(&bat_priv->gw_mode) == GW_MODE_OFF) + return 0; + + /* check for ethernet header */ + if (!pskb_may_pull(skb, header_len + ETH_HLEN)) + return 0; + ethhdr = (struct ethhdr *)skb->data; + header_len += ETH_HLEN; + + /* check for initial vlan header */ + if (ntohs(ethhdr->h_proto) == ETH_P_8021Q) { + if (!pskb_may_pull(skb, header_len + VLAN_HLEN)) + return 0; + ethhdr = (struct ethhdr *)(skb->data + VLAN_HLEN); + header_len += VLAN_HLEN; + } + + /* check for ip header */ + switch (ntohs(ethhdr->h_proto)) { + case ETH_P_IP: + if (!pskb_may_pull(skb, header_len + sizeof(struct iphdr))) + return 0; + iphdr = (struct iphdr *)(skb->data + header_len); + header_len += iphdr->ihl * 4; + + /* check for udp header */ + if (iphdr->protocol != IPPROTO_UDP) + return 0; + + break; + case ETH_P_IPV6: + if (!pskb_may_pull(skb, header_len + sizeof(struct ipv6hdr))) + return 0; + ipv6hdr = (struct ipv6hdr *)(skb->data + header_len); + header_len += sizeof(struct ipv6hdr); + + /* check for udp header */ + if (ipv6hdr->nexthdr != IPPROTO_UDP) + return 0; + + break; + default: + return 0; + } + + if (!pskb_may_pull(skb, header_len + sizeof(struct udphdr))) + return 0; + udphdr = (struct udphdr *)(skb->data + header_len); + header_len += sizeof(struct udphdr); + + /* check for bootp port */ + if 
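/*
 * Editor's note (illustrative sketch, not part of the patch): once
 * gw_is_target() above has walked Ethernet -> optional VLAN -> IP ->
 * UDP, the decision reduces to the destination port: 67 (bootps) for
 * IPv4 DHCP, 547 for DHCPv6.  Restated here in plain C with host-order
 * values.
 */
#include <stdbool.h>
#include <stdint.h>

#define EX_ETH_P_IP   0x0800
#define EX_ETH_P_IPV6 0x86DD

static bool ex_is_dhcp_request(uint16_t eth_proto, uint16_t udp_dest_port)
{
        if (eth_proto == EX_ETH_P_IP)
                return udp_dest_port == 67;    /* bootps / DHCPv4 */
        if (eth_proto == EX_ETH_P_IPV6)
                return udp_dest_port == 547;   /* DHCPv6 server port */
        return false;
}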
((ntohs(ethhdr->h_proto) == ETH_P_IP) && + (ntohs(udphdr->dest) != 67)) + return 0; + + if ((ntohs(ethhdr->h_proto) == ETH_P_IPV6) && + (ntohs(udphdr->dest) != 547)) + return 0; + + if (atomic_read(&bat_priv->gw_mode) == GW_MODE_SERVER) + return -1; + + curr_gw = gw_get_selected_gw_node(bat_priv); + if (!curr_gw) + return 0; + + if (curr_gw) + gw_node_free_ref(curr_gw); + return 1; +} diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h new file mode 100644 index 00000000..1ce8c606 --- /dev/null +++ b/net/batman-adv/gateway_client.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2009-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#ifndef _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ +#define _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ + +void gw_deselect(struct bat_priv *bat_priv); +void gw_election(struct bat_priv *bat_priv); +struct orig_node *gw_get_selected_orig(struct bat_priv *bat_priv); +void gw_check_election(struct bat_priv *bat_priv, struct orig_node *orig_node); +void gw_node_update(struct bat_priv *bat_priv, + struct orig_node *orig_node, uint8_t new_gwflags); +void gw_node_delete(struct bat_priv *bat_priv, struct orig_node *orig_node); +void gw_node_purge(struct bat_priv *bat_priv); +int gw_client_seq_print_text(struct seq_file *seq, void *offset); +int gw_is_target(struct bat_priv *bat_priv, struct sk_buff *skb); + +#endif /* _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ */ diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c new file mode 100644 index 00000000..50d3a59a --- /dev/null +++ b/net/batman-adv/gateway_common.c @@ -0,0 +1,177 @@ +/* + * Copyright (C) 2009-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#include "main.h" +#include "gateway_common.h" +#include "gateway_client.h" + +/* calculates the gateway class from kbit */ +static void kbit_to_gw_bandwidth(int down, int up, long *gw_srv_class) +{ + int mdown = 0, tdown, tup, difference; + uint8_t sbit, part; + + *gw_srv_class = 0; + difference = 0x0FFFFFFF; + + /* test all downspeeds */ + for (sbit = 0; sbit < 2; sbit++) { + for (part = 0; part < 16; part++) { + tdown = 32 * (sbit + 2) * (1 << part); + + if (abs(tdown - down) < difference) { + *gw_srv_class = (sbit << 7) + (part << 3); + difference = abs(tdown - down); + mdown = tdown; + } + } + } + + /* test all upspeeds */ + difference = 0x0FFFFFFF; + + for (part = 0; part < 8; part++) { + tup = ((part + 1) * (mdown)) / 8; + + if (abs(tup - up) < difference) { + *gw_srv_class = (*gw_srv_class & 0xF8) | part; + difference = abs(tup - up); + } + } +} + +/* returns the up and downspeeds in kbit, calculated from the class */ +void gw_bandwidth_to_kbit(uint8_t gw_srv_class, int *down, int *up) +{ + char sbit = (gw_srv_class & 0x80) >> 7; + char dpart = (gw_srv_class & 0x78) >> 3; + char upart = (gw_srv_class & 0x07); + + if (!gw_srv_class) { + *down = 0; + *up = 0; + return; + } + + *down = 32 * (sbit + 2) * (1 << dpart); + *up = ((upart + 1) * (*down)) / 8; +} + +static bool parse_gw_bandwidth(struct net_device *net_dev, char *buff, + long *up, long *down) +{ + int ret, multi = 1; + char *slash_ptr, *tmp_ptr; + + slash_ptr = strchr(buff, '/'); + if (slash_ptr) + *slash_ptr = 0; + + if (strlen(buff) > 4) { + tmp_ptr = buff + strlen(buff) - 4; + + if (strnicmp(tmp_ptr, "mbit", 4) == 0) + multi = 1024; + + if ((strnicmp(tmp_ptr, "kbit", 4) == 0) || + (multi > 1)) + *tmp_ptr = '\0'; + } + + ret = strict_strtoul(buff, 10, down); + if (ret) { + bat_err(net_dev, + "Download speed of gateway mode invalid: %s\n", + buff); + return false; + } + + *down *= multi; + + /* we also got some upload info */ + if (slash_ptr) { + multi = 1; + + if (strlen(slash_ptr + 1) > 4) { + tmp_ptr = slash_ptr + 1 - 4 + strlen(slash_ptr + 1); + + if (strnicmp(tmp_ptr, "mbit", 4) == 0) + multi = 1024; + + if ((strnicmp(tmp_ptr, "kbit", 4) == 0) || + (multi > 1)) + *tmp_ptr = '\0'; + } + + ret = strict_strtoul(slash_ptr + 1, 10, up); + if (ret) { + bat_err(net_dev, + "Upload speed of gateway mode invalid: " + "%s\n", slash_ptr + 1); + return false; + } + + *up *= multi; + } + + return true; +} + +ssize_t gw_bandwidth_set(struct net_device *net_dev, char *buff, size_t count) +{ + struct bat_priv *bat_priv = netdev_priv(net_dev); + long gw_bandwidth_tmp = 0, up = 0, down = 0; + bool ret; + + ret = parse_gw_bandwidth(net_dev, buff, &up, &down); + if (!ret) + goto end; + + if ((!down) || (down < 256)) + down = 2000; + + if (!up) + up = down / 5; + + kbit_to_gw_bandwidth(down, up, &gw_bandwidth_tmp); + + /** + * the gw bandwidth we guessed above might not match the given + * speeds, hence we need to calculate it back to show the number + * that is going to be propagated + **/ + gw_bandwidth_to_kbit((uint8_t)gw_bandwidth_tmp, + (int *)&down, (int *)&up); + + gw_deselect(bat_priv); + bat_info(net_dev, "Changing gateway bandwidth from: '%i' to: '%ld' " + "(propagating: %ld%s/%ld%s)\n", + atomic_read(&bat_priv->gw_bandwidth), gw_bandwidth_tmp, + (down > 2048 ? down / 1024 : down), + (down > 2048 ? 
"MBit" : "KBit"), + (up > 2048 ? up / 1024 : up), + (up > 2048 ? "MBit" : "KBit")); + + atomic_set(&bat_priv->gw_bandwidth, gw_bandwidth_tmp); + +end: + return count; +} diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h new file mode 100644 index 00000000..55e527a4 --- /dev/null +++ b/net/batman-adv/gateway_common.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2009-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#ifndef _NET_BATMAN_ADV_GATEWAY_COMMON_H_ +#define _NET_BATMAN_ADV_GATEWAY_COMMON_H_ + +enum gw_modes { + GW_MODE_OFF, + GW_MODE_CLIENT, + GW_MODE_SERVER, +}; + +#define GW_MODE_OFF_NAME "off" +#define GW_MODE_CLIENT_NAME "client" +#define GW_MODE_SERVER_NAME "server" + +void gw_bandwidth_to_kbit(uint8_t gw_class, int *down, int *up); +ssize_t gw_bandwidth_set(struct net_device *net_dev, char *buff, size_t count); + +#endif /* _NET_BATMAN_ADV_GATEWAY_COMMON_H_ */ diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c new file mode 100644 index 00000000..dfbfccc9 --- /dev/null +++ b/net/batman-adv/hard-interface.c @@ -0,0 +1,682 @@ +/* + * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#include "main.h" +#include "hard-interface.h" +#include "soft-interface.h" +#include "send.h" +#include "translation-table.h" +#include "routing.h" +#include "bat_sysfs.h" +#include "originator.h" +#include "hash.h" + +#include + + +static int batman_skb_recv(struct sk_buff *skb, + struct net_device *dev, + struct packet_type *ptype, + struct net_device *orig_dev); + +void hardif_free_rcu(struct rcu_head *rcu) +{ + struct hard_iface *hard_iface; + + hard_iface = container_of(rcu, struct hard_iface, rcu); + dev_put(hard_iface->net_dev); + kfree(hard_iface); +} + +struct hard_iface *hardif_get_by_netdev(struct net_device *net_dev) +{ + struct hard_iface *hard_iface; + + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &hardif_list, list) { + if (hard_iface->net_dev == net_dev && + atomic_inc_not_zero(&hard_iface->refcount)) + goto out; + } + + hard_iface = NULL; + +out: + rcu_read_unlock(); + return hard_iface; +} + +static int is_valid_iface(struct net_device *net_dev) +{ + if (net_dev->flags & IFF_LOOPBACK) + return 0; + + if (net_dev->type != ARPHRD_ETHER) + return 0; + + if (net_dev->addr_len != ETH_ALEN) + return 0; + + /* no batman over batman */ + if (softif_is_valid(net_dev)) + return 0; + + /* Device is being bridged */ + /* if (net_dev->priv_flags & IFF_BRIDGE_PORT) + return 0; */ + + return 1; +} + +static struct hard_iface *hardif_get_active(struct net_device *soft_iface) +{ + struct hard_iface *hard_iface; + + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &hardif_list, list) { + if (hard_iface->soft_iface != soft_iface) + continue; + + if (hard_iface->if_status == IF_ACTIVE && + atomic_inc_not_zero(&hard_iface->refcount)) + goto out; + } + + hard_iface = NULL; + +out: + rcu_read_unlock(); + return hard_iface; +} + +static void primary_if_update_addr(struct bat_priv *bat_priv) +{ + struct vis_packet *vis_packet; + struct hard_iface *primary_if; + + primary_if = primary_if_get_selected(bat_priv); + if (!primary_if) + goto out; + + vis_packet = (struct vis_packet *) + bat_priv->my_vis_info->skb_packet->data; + memcpy(vis_packet->vis_orig, primary_if->net_dev->dev_addr, ETH_ALEN); + memcpy(vis_packet->sender_orig, + primary_if->net_dev->dev_addr, ETH_ALEN); + +out: + if (primary_if) + hardif_free_ref(primary_if); +} + +static void primary_if_select(struct bat_priv *bat_priv, + struct hard_iface *new_hard_iface) +{ + struct hard_iface *curr_hard_iface; + struct batman_packet *batman_packet; + + ASSERT_RTNL(); + + if (new_hard_iface && !atomic_inc_not_zero(&new_hard_iface->refcount)) + new_hard_iface = NULL; + + curr_hard_iface = bat_priv->primary_if; + rcu_assign_pointer(bat_priv->primary_if, new_hard_iface); + + if (curr_hard_iface) + hardif_free_ref(curr_hard_iface); + + if (!new_hard_iface) + return; + + batman_packet = (struct batman_packet *)(new_hard_iface->packet_buff); + batman_packet->flags = PRIMARIES_FIRST_HOP; + batman_packet->ttl = TTL; + + primary_if_update_addr(bat_priv); + + /*** + * hacky trick to make sure that we send the TT information via + * our new primary interface + */ + atomic_set(&bat_priv->tt_local_changed, 1); +} + +static bool hardif_is_iface_up(struct hard_iface *hard_iface) +{ + if (hard_iface->net_dev->flags & IFF_UP) + return true; + + return false; +} + +static void update_mac_addresses(struct 
hard_iface *hard_iface) +{ + memcpy(((struct batman_packet *)(hard_iface->packet_buff))->orig, + hard_iface->net_dev->dev_addr, ETH_ALEN); + memcpy(((struct batman_packet *)(hard_iface->packet_buff))->prev_sender, + hard_iface->net_dev->dev_addr, ETH_ALEN); +} + +static void check_known_mac_addr(struct net_device *net_dev) +{ + struct hard_iface *hard_iface; + + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &hardif_list, list) { + if ((hard_iface->if_status != IF_ACTIVE) && + (hard_iface->if_status != IF_TO_BE_ACTIVATED)) + continue; + + if (hard_iface->net_dev == net_dev) + continue; + + if (!compare_eth(hard_iface->net_dev->dev_addr, + net_dev->dev_addr)) + continue; + + pr_warning("The newly added mac address (%pM) already exists " + "on: %s\n", net_dev->dev_addr, + hard_iface->net_dev->name); + pr_warning("It is strongly recommended to keep mac addresses " + "unique to avoid problems!\n"); + } + rcu_read_unlock(); +} + +int hardif_min_mtu(struct net_device *soft_iface) +{ + struct bat_priv *bat_priv = netdev_priv(soft_iface); + struct hard_iface *hard_iface; + /* allow big frames if all devices are capable to do so + * (have MTU > 1500 + BAT_HEADER_LEN) */ + int min_mtu = ETH_DATA_LEN; + + if (atomic_read(&bat_priv->fragmentation)) + goto out; + + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &hardif_list, list) { + if ((hard_iface->if_status != IF_ACTIVE) && + (hard_iface->if_status != IF_TO_BE_ACTIVATED)) + continue; + + if (hard_iface->soft_iface != soft_iface) + continue; + + min_mtu = min_t(int, hard_iface->net_dev->mtu - BAT_HEADER_LEN, + min_mtu); + } + rcu_read_unlock(); +out: + return min_mtu; +} + +/* adjusts the MTU if a new interface with a smaller MTU appeared. */ +void update_min_mtu(struct net_device *soft_iface) +{ + int min_mtu; + + min_mtu = hardif_min_mtu(soft_iface); + if (soft_iface->mtu != min_mtu) + soft_iface->mtu = min_mtu; +} + +static void hardif_activate_interface(struct hard_iface *hard_iface) +{ + struct bat_priv *bat_priv; + struct hard_iface *primary_if = NULL; + + if (hard_iface->if_status != IF_INACTIVE) + goto out; + + bat_priv = netdev_priv(hard_iface->soft_iface); + + update_mac_addresses(hard_iface); + hard_iface->if_status = IF_TO_BE_ACTIVATED; + + /** + * the first active interface becomes our primary interface or + * the next active interface after the old primay interface was removed + */ + primary_if = primary_if_get_selected(bat_priv); + if (!primary_if) + primary_if_select(bat_priv, hard_iface); + + bat_info(hard_iface->soft_iface, "Interface activated: %s\n", + hard_iface->net_dev->name); + + update_min_mtu(hard_iface->soft_iface); + +out: + if (primary_if) + hardif_free_ref(primary_if); +} + +static void hardif_deactivate_interface(struct hard_iface *hard_iface) +{ + if ((hard_iface->if_status != IF_ACTIVE) && + (hard_iface->if_status != IF_TO_BE_ACTIVATED)) + return; + + hard_iface->if_status = IF_INACTIVE; + + bat_info(hard_iface->soft_iface, "Interface deactivated: %s\n", + hard_iface->net_dev->name); + + update_min_mtu(hard_iface->soft_iface); +} + +int hardif_enable_interface(struct hard_iface *hard_iface, char *iface_name) +{ + struct bat_priv *bat_priv; + struct batman_packet *batman_packet; + struct net_device *soft_iface; + int ret; + + if (hard_iface->if_status != IF_NOT_IN_USE) + goto out; + + if (!atomic_inc_not_zero(&hard_iface->refcount)) + goto out; + + soft_iface = dev_get_by_name(&init_net, iface_name); + + if (!soft_iface) { + soft_iface = softif_create(iface_name); + + if (!soft_iface) { + ret = 
-ENOMEM; + goto err; + } + + /* dev_get_by_name() increases the reference counter for us */ + dev_hold(soft_iface); + } + + if (!softif_is_valid(soft_iface)) { + pr_err("Can't create batman mesh interface %s: " + "already exists as regular interface\n", + soft_iface->name); + dev_put(soft_iface); + ret = -EINVAL; + goto err; + } + + hard_iface->soft_iface = soft_iface; + bat_priv = netdev_priv(hard_iface->soft_iface); + hard_iface->packet_len = BAT_PACKET_LEN; + hard_iface->packet_buff = kmalloc(hard_iface->packet_len, GFP_ATOMIC); + + if (!hard_iface->packet_buff) { + bat_err(hard_iface->soft_iface, "Can't add interface packet " + "(%s): out of memory\n", hard_iface->net_dev->name); + ret = -ENOMEM; + goto err; + } + + batman_packet = (struct batman_packet *)(hard_iface->packet_buff); + batman_packet->packet_type = BAT_PACKET; + batman_packet->version = COMPAT_VERSION; + batman_packet->flags = 0; + batman_packet->ttl = 2; + batman_packet->tq = TQ_MAX_VALUE; + batman_packet->num_tt = 0; + + hard_iface->if_num = bat_priv->num_ifaces; + bat_priv->num_ifaces++; + hard_iface->if_status = IF_INACTIVE; + orig_hash_add_if(hard_iface, bat_priv->num_ifaces); + + hard_iface->batman_adv_ptype.type = __constant_htons(ETH_P_BATMAN); + hard_iface->batman_adv_ptype.func = batman_skb_recv; + hard_iface->batman_adv_ptype.dev = hard_iface->net_dev; + dev_add_pack(&hard_iface->batman_adv_ptype); + + atomic_set(&hard_iface->seqno, 1); + atomic_set(&hard_iface->frag_seqno, 1); + bat_info(hard_iface->soft_iface, "Adding interface: %s\n", + hard_iface->net_dev->name); + + if (atomic_read(&bat_priv->fragmentation) && hard_iface->net_dev->mtu < + ETH_DATA_LEN + BAT_HEADER_LEN) + bat_info(hard_iface->soft_iface, + "The MTU of interface %s is too small (%i) to handle " + "the transport of batman-adv packets. Packets going " + "over this interface will be fragmented on layer2 " + "which could impact the performance. Setting the MTU " + "to %zi would solve the problem.\n", + hard_iface->net_dev->name, hard_iface->net_dev->mtu, + ETH_DATA_LEN + BAT_HEADER_LEN); + + if (!atomic_read(&bat_priv->fragmentation) && hard_iface->net_dev->mtu < + ETH_DATA_LEN + BAT_HEADER_LEN) + bat_info(hard_iface->soft_iface, + "The MTU of interface %s is too small (%i) to handle " + "the transport of batman-adv packets. 
If you experience" + " problems getting traffic through try increasing the " + "MTU to %zi.\n", + hard_iface->net_dev->name, hard_iface->net_dev->mtu, + ETH_DATA_LEN + BAT_HEADER_LEN); + + if (hardif_is_iface_up(hard_iface)) + hardif_activate_interface(hard_iface); + else + bat_err(hard_iface->soft_iface, "Not using interface %s " + "(retrying later): interface not active\n", + hard_iface->net_dev->name); + + /* begin scheduling originator messages on that interface */ + schedule_own_packet(hard_iface); + +out: + return 0; + +err: + hardif_free_ref(hard_iface); + return ret; +} + +void hardif_disable_interface(struct hard_iface *hard_iface) +{ + struct bat_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + struct hard_iface *primary_if = NULL; + + if (hard_iface->if_status == IF_ACTIVE) + hardif_deactivate_interface(hard_iface); + + if (hard_iface->if_status != IF_INACTIVE) + goto out; + + bat_info(hard_iface->soft_iface, "Removing interface: %s\n", + hard_iface->net_dev->name); + dev_remove_pack(&hard_iface->batman_adv_ptype); + + bat_priv->num_ifaces--; + orig_hash_del_if(hard_iface, bat_priv->num_ifaces); + + primary_if = primary_if_get_selected(bat_priv); + if (hard_iface == primary_if) { + struct hard_iface *new_if; + + new_if = hardif_get_active(hard_iface->soft_iface); + primary_if_select(bat_priv, new_if); + + if (new_if) + hardif_free_ref(new_if); + } + + kfree(hard_iface->packet_buff); + hard_iface->packet_buff = NULL; + hard_iface->if_status = IF_NOT_IN_USE; + + /* delete all references to this hard_iface */ + purge_orig_ref(bat_priv); + purge_outstanding_packets(bat_priv, hard_iface); + dev_put(hard_iface->soft_iface); + + /* nobody uses this interface anymore */ + if (!bat_priv->num_ifaces) + softif_destroy(hard_iface->soft_iface); + + hard_iface->soft_iface = NULL; + hardif_free_ref(hard_iface); + +out: + if (primary_if) + hardif_free_ref(primary_if); +} + +static struct hard_iface *hardif_add_interface(struct net_device *net_dev) +{ + struct hard_iface *hard_iface; + int ret; + + ASSERT_RTNL(); + + ret = is_valid_iface(net_dev); + if (ret != 1) + goto out; + + dev_hold(net_dev); + + hard_iface = kmalloc(sizeof(struct hard_iface), GFP_ATOMIC); + if (!hard_iface) { + pr_err("Can't add interface (%s): out of memory\n", + net_dev->name); + goto release_dev; + } + + ret = sysfs_add_hardif(&hard_iface->hardif_obj, net_dev); + if (ret) + goto free_if; + + hard_iface->if_num = -1; + hard_iface->net_dev = net_dev; + hard_iface->soft_iface = NULL; + hard_iface->if_status = IF_NOT_IN_USE; + INIT_LIST_HEAD(&hard_iface->list); + /* extra reference for return */ + atomic_set(&hard_iface->refcount, 2); + + check_known_mac_addr(hard_iface->net_dev); + list_add_tail_rcu(&hard_iface->list, &hardif_list); + + return hard_iface; + +free_if: + kfree(hard_iface); +release_dev: + dev_put(net_dev); +out: + return NULL; +} + +static void hardif_remove_interface(struct hard_iface *hard_iface) +{ + ASSERT_RTNL(); + + /* first deactivate interface */ + if (hard_iface->if_status != IF_NOT_IN_USE) + hardif_disable_interface(hard_iface); + + if (hard_iface->if_status != IF_NOT_IN_USE) + return; + + hard_iface->if_status = IF_TO_BE_REMOVED; + sysfs_del_hardif(&hard_iface->hardif_obj); + hardif_free_ref(hard_iface); +} + +void hardif_remove_interfaces(void) +{ + struct hard_iface *hard_iface, *hard_iface_tmp; + + rtnl_lock(); + list_for_each_entry_safe(hard_iface, hard_iface_tmp, + &hardif_list, list) { + list_del_rcu(&hard_iface->list); + hardif_remove_interface(hard_iface); + } + rtnl_unlock(); +} + 
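The hard_if_event() callback that follows hooks into the kernel's netdevice
notifier chain; main.c later registers it with
register_netdevice_notifier(&hard_if_notifier). A minimal sketch of that
registration pattern, written as a hypothetical standalone module (the demo_*
names are illustrative and not part of batman-adv); on kernels of this era the
callback's ptr argument is the struct net_device itself:

#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/notifier.h>

/* hypothetical demo: log interface up/down events */
static int demo_netdev_event(struct notifier_block *nb,
			     unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	switch (event) {
	case NETDEV_UP:
		pr_info("demo: %s is up\n", dev->name);
		break;
	case NETDEV_DOWN:
		pr_info("demo: %s is down\n", dev->name);
		break;
	}

	/* event seen but not consumed */
	return NOTIFY_DONE;
}

static struct notifier_block demo_notifier = {
	.notifier_call = demo_netdev_event,
};

static int __init demo_init(void)
{
	return register_netdevice_notifier(&demo_notifier);
}

static void __exit demo_exit(void)
{
	unregister_netdevice_notifier(&demo_notifier);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Returning NOTIFY_DONE tells the chain the event was observed without being
consumed, which is also what hard_if_event() below returns.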
+static int hard_if_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *net_dev = (struct net_device *)ptr; + struct hard_iface *hard_iface = hardif_get_by_netdev(net_dev); + struct hard_iface *primary_if = NULL; + struct bat_priv *bat_priv; + + if (!hard_iface && event == NETDEV_REGISTER) + hard_iface = hardif_add_interface(net_dev); + + if (!hard_iface) + goto out; + + switch (event) { + case NETDEV_UP: + hardif_activate_interface(hard_iface); + break; + case NETDEV_GOING_DOWN: + case NETDEV_DOWN: + hardif_deactivate_interface(hard_iface); + break; + case NETDEV_UNREGISTER: + list_del_rcu(&hard_iface->list); + + hardif_remove_interface(hard_iface); + break; + case NETDEV_CHANGEMTU: + if (hard_iface->soft_iface) + update_min_mtu(hard_iface->soft_iface); + break; + case NETDEV_CHANGEADDR: + if (hard_iface->if_status == IF_NOT_IN_USE) + goto hardif_put; + + check_known_mac_addr(hard_iface->net_dev); + update_mac_addresses(hard_iface); + + bat_priv = netdev_priv(hard_iface->soft_iface); + primary_if = primary_if_get_selected(bat_priv); + if (!primary_if) + goto hardif_put; + + if (hard_iface == primary_if) + primary_if_update_addr(bat_priv); + break; + default: + break; + }; + +hardif_put: + hardif_free_ref(hard_iface); +out: + if (primary_if) + hardif_free_ref(primary_if); + return NOTIFY_DONE; +} + +/* receive a packet with the batman ethertype coming on a hard + * interface */ +static int batman_skb_recv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *ptype, + struct net_device *orig_dev) +{ + struct bat_priv *bat_priv; + struct batman_packet *batman_packet; + struct hard_iface *hard_iface; + int ret; + + hard_iface = container_of(ptype, struct hard_iface, batman_adv_ptype); + skb = skb_share_check(skb, GFP_ATOMIC); + + /* skb was released by skb_share_check() */ + if (!skb) + goto err_out; + + /* packet should hold at least type and version */ + if (unlikely(!pskb_may_pull(skb, 2))) + goto err_free; + + /* expect a valid ethernet header here. */ + if (unlikely(skb->mac_len != sizeof(struct ethhdr) + || !skb_mac_header(skb))) + goto err_free; + + if (!hard_iface->soft_iface) + goto err_free; + + bat_priv = netdev_priv(hard_iface->soft_iface); + + if (atomic_read(&bat_priv->mesh_state) != MESH_ACTIVE) + goto err_free; + + /* discard frames on not active interfaces */ + if (hard_iface->if_status != IF_ACTIVE) + goto err_free; + + batman_packet = (struct batman_packet *)skb->data; + + if (batman_packet->version != COMPAT_VERSION) { + bat_dbg(DBG_BATMAN, bat_priv, + "Drop packet: incompatible batman version (%i)\n", + batman_packet->version); + goto err_free; + } + + /* all receive handlers return whether they received or reused + * the supplied skb. if not, we have to free the skb. 
*/ + + switch (batman_packet->packet_type) { + /* batman originator packet */ + case BAT_PACKET: + ret = recv_bat_packet(skb, hard_iface); + break; + + /* batman icmp packet */ + case BAT_ICMP: + ret = recv_icmp_packet(skb, hard_iface); + break; + + /* unicast packet */ + case BAT_UNICAST: + ret = recv_unicast_packet(skb, hard_iface); + break; + + /* fragmented unicast packet */ + case BAT_UNICAST_FRAG: + ret = recv_ucast_frag_packet(skb, hard_iface); + break; + + /* broadcast packet */ + case BAT_BCAST: + ret = recv_bcast_packet(skb, hard_iface); + break; + + /* vis packet */ + case BAT_VIS: + ret = recv_vis_packet(skb, hard_iface); + break; + default: + ret = NET_RX_DROP; + } + + if (ret == NET_RX_DROP) + kfree_skb(skb); + + /* return NET_RX_SUCCESS in any case as we + * most probably dropped the packet for + * routing-logical reasons. */ + + return NET_RX_SUCCESS; + +err_free: + kfree_skb(skb); +err_out: + return NET_RX_DROP; +} + +struct notifier_block hard_if_notifier = { + .notifier_call = hard_if_event, +}; diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h new file mode 100644 index 00000000..64265991 --- /dev/null +++ b/net/batman-adv/hard-interface.h @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#ifndef _NET_BATMAN_ADV_HARD_INTERFACE_H_ +#define _NET_BATMAN_ADV_HARD_INTERFACE_H_ + +#define IF_NOT_IN_USE 0 +#define IF_TO_BE_REMOVED 1 +#define IF_INACTIVE 2 +#define IF_ACTIVE 3 +#define IF_TO_BE_ACTIVATED 4 +#define IF_I_WANT_YOU 5 + +extern struct notifier_block hard_if_notifier; + +struct hard_iface *hardif_get_by_netdev(struct net_device *net_dev); +int hardif_enable_interface(struct hard_iface *hard_iface, char *iface_name); +void hardif_disable_interface(struct hard_iface *hard_iface); +void hardif_remove_interfaces(void); +int hardif_min_mtu(struct net_device *soft_iface); +void update_min_mtu(struct net_device *soft_iface); +void hardif_free_rcu(struct rcu_head *rcu); + +static inline void hardif_free_ref(struct hard_iface *hard_iface) +{ + if (atomic_dec_and_test(&hard_iface->refcount)) + call_rcu(&hard_iface->rcu, hardif_free_rcu); +} + +static inline struct hard_iface *primary_if_get_selected( + struct bat_priv *bat_priv) +{ + struct hard_iface *hard_iface; + + rcu_read_lock(); + hard_iface = rcu_dereference(bat_priv->primary_if); + if (!hard_iface) + goto out; + + if (!atomic_inc_not_zero(&hard_iface->refcount)) + hard_iface = NULL; + +out: + rcu_read_unlock(); + return hard_iface; +} + +#endif /* _NET_BATMAN_ADV_HARD_INTERFACE_H_ */ diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c new file mode 100644 index 00000000..c5213d8f --- /dev/null +++ b/net/batman-adv/hash.c @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2006-2011 B.A.T.M.A.N. 
contributors: + * + * Simon Wunderlich, Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#include "main.h" +#include "hash.h" + +/* clears the hash */ +static void hash_init(struct hashtable_t *hash) +{ + int i; + + for (i = 0 ; i < hash->size; i++) { + INIT_HLIST_HEAD(&hash->table[i]); + spin_lock_init(&hash->list_locks[i]); + } +} + +/* free only the hashtable and the hash itself. */ +void hash_destroy(struct hashtable_t *hash) +{ + kfree(hash->list_locks); + kfree(hash->table); + kfree(hash); +} + +/* allocates and clears the hash */ +struct hashtable_t *hash_new(int size) +{ + struct hashtable_t *hash; + + hash = kmalloc(sizeof(struct hashtable_t), GFP_ATOMIC); + if (!hash) + return NULL; + + hash->table = kmalloc(sizeof(struct element_t *) * size, GFP_ATOMIC); + if (!hash->table) + goto free_hash; + + hash->list_locks = kmalloc(sizeof(spinlock_t) * size, GFP_ATOMIC); + if (!hash->list_locks) + goto free_table; + + hash->size = size; + hash_init(hash); + return hash; + +free_table: + kfree(hash->table); +free_hash: + kfree(hash); + return NULL; +} diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h new file mode 100644 index 00000000..434822b2 --- /dev/null +++ b/net/batman-adv/hash.h @@ -0,0 +1,149 @@ +/* + * Copyright (C) 2006-2011 B.A.T.M.A.N. contributors: + * + * Simon Wunderlich, Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#ifndef _NET_BATMAN_ADV_HASH_H_ +#define _NET_BATMAN_ADV_HASH_H_ + +#include + +/* callback to a compare function. should + * compare 2 element datas for their keys, + * return 0 if same and not 0 if not + * same */ +typedef int (*hashdata_compare_cb)(struct hlist_node *, void *); + +/* the hashfunction, should return an index + * based on the key in the data of the first + * argument and the size the second */ +typedef int (*hashdata_choose_cb)(void *, int); +typedef void (*hashdata_free_cb)(struct hlist_node *, void *); + +struct hashtable_t { + struct hlist_head *table; /* the hashtable itself with the buckets */ + spinlock_t *list_locks; /* spinlock for each hash list entry */ + int size; /* size of hashtable */ +}; + +/* allocates and clears the hash */ +struct hashtable_t *hash_new(int size); + +/* free only the hashtable and the hash itself. 
*/ +void hash_destroy(struct hashtable_t *hash); + +/* remove the hash structure. if hashdata_free_cb != NULL, this function will be + * called to remove the elements inside of the hash. if you don't remove the + * elements, memory might be leaked. */ +static inline void hash_delete(struct hashtable_t *hash, + hashdata_free_cb free_cb, void *arg) +{ + struct hlist_head *head; + struct hlist_node *node, *node_tmp; + spinlock_t *list_lock; /* spinlock to protect write access */ + int i; + + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + list_lock = &hash->list_locks[i]; + + spin_lock_bh(list_lock); + hlist_for_each_safe(node, node_tmp, head) { + hlist_del_rcu(node); + + if (free_cb) + free_cb(node, arg); + } + spin_unlock_bh(list_lock); + } + + hash_destroy(hash); +} + +/* adds data to the hashtable. returns 0 on success, -1 on error */ +static inline int hash_add(struct hashtable_t *hash, + hashdata_compare_cb compare, + hashdata_choose_cb choose, + void *data, struct hlist_node *data_node) +{ + int index; + struct hlist_head *head; + struct hlist_node *node; + spinlock_t *list_lock; /* spinlock to protect write access */ + + if (!hash) + goto err; + + index = choose(data, hash->size); + head = &hash->table[index]; + list_lock = &hash->list_locks[index]; + + rcu_read_lock(); + __hlist_for_each_rcu(node, head) { + if (!compare(node, data)) + continue; + + goto err_unlock; + } + rcu_read_unlock(); + + /* no duplicate found in list, add new element */ + spin_lock_bh(list_lock); + hlist_add_head_rcu(data_node, head); + spin_unlock_bh(list_lock); + + return 0; + +err_unlock: + rcu_read_unlock(); +err: + return -1; +} + +/* removes data from hash, if found. returns pointer do data on success, so you + * can remove the used structure yourself, or NULL on error . data could be the + * structure you use with just the key filled, we just need the key for + * comparing. */ +static inline void *hash_remove(struct hashtable_t *hash, + hashdata_compare_cb compare, + hashdata_choose_cb choose, void *data) +{ + size_t index; + struct hlist_node *node; + struct hlist_head *head; + void *data_save = NULL; + + index = choose(data, hash->size); + head = &hash->table[index]; + + spin_lock_bh(&hash->list_locks[index]); + hlist_for_each(node, head) { + if (!compare(node, data)) + continue; + + data_save = node; + hlist_del_rcu(node); + break; + } + spin_unlock_bh(&hash->list_locks[index]); + + return data_save; +} + +#endif /* _NET_BATMAN_ADV_HASH_H_ */ diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c new file mode 100644 index 00000000..fa22ba2b --- /dev/null +++ b/net/batman-adv/icmp_socket.c @@ -0,0 +1,356 @@ +/* + * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#include "main.h" +#include +#include +#include "icmp_socket.h" +#include "send.h" +#include "hash.h" +#include "originator.h" +#include "hard-interface.h" + +static struct socket_client *socket_client_hash[256]; + +static void bat_socket_add_packet(struct socket_client *socket_client, + struct icmp_packet_rr *icmp_packet, + size_t icmp_len); + +void bat_socket_init(void) +{ + memset(socket_client_hash, 0, sizeof(socket_client_hash)); +} + +static int bat_socket_open(struct inode *inode, struct file *file) +{ + unsigned int i; + struct socket_client *socket_client; + + nonseekable_open(inode, file); + + socket_client = kmalloc(sizeof(struct socket_client), GFP_KERNEL); + + if (!socket_client) + return -ENOMEM; + + for (i = 0; i < ARRAY_SIZE(socket_client_hash); i++) { + if (!socket_client_hash[i]) { + socket_client_hash[i] = socket_client; + break; + } + } + + if (i == ARRAY_SIZE(socket_client_hash)) { + pr_err("Error - can't add another packet client: " + "maximum number of clients reached\n"); + kfree(socket_client); + return -EXFULL; + } + + INIT_LIST_HEAD(&socket_client->queue_list); + socket_client->queue_len = 0; + socket_client->index = i; + socket_client->bat_priv = inode->i_private; + spin_lock_init(&socket_client->lock); + init_waitqueue_head(&socket_client->queue_wait); + + file->private_data = socket_client; + + inc_module_count(); + return 0; +} + +static int bat_socket_release(struct inode *inode, struct file *file) +{ + struct socket_client *socket_client = file->private_data; + struct socket_packet *socket_packet; + struct list_head *list_pos, *list_pos_tmp; + + spin_lock_bh(&socket_client->lock); + + /* for all packets in the queue ... 
*/ + list_for_each_safe(list_pos, list_pos_tmp, &socket_client->queue_list) { + socket_packet = list_entry(list_pos, + struct socket_packet, list); + + list_del(list_pos); + kfree(socket_packet); + } + + socket_client_hash[socket_client->index] = NULL; + spin_unlock_bh(&socket_client->lock); + + kfree(socket_client); + dec_module_count(); + + return 0; +} + +static ssize_t bat_socket_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct socket_client *socket_client = file->private_data; + struct socket_packet *socket_packet; + size_t packet_len; + int error; + + if ((file->f_flags & O_NONBLOCK) && (socket_client->queue_len == 0)) + return -EAGAIN; + + if ((!buf) || (count < sizeof(struct icmp_packet))) + return -EINVAL; + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + error = wait_event_interruptible(socket_client->queue_wait, + socket_client->queue_len); + + if (error) + return error; + + spin_lock_bh(&socket_client->lock); + + socket_packet = list_first_entry(&socket_client->queue_list, + struct socket_packet, list); + list_del(&socket_packet->list); + socket_client->queue_len--; + + spin_unlock_bh(&socket_client->lock); + + error = __copy_to_user(buf, &socket_packet->icmp_packet, + socket_packet->icmp_len); + + packet_len = socket_packet->icmp_len; + kfree(socket_packet); + + if (error) + return -EFAULT; + + return packet_len; +} + +static ssize_t bat_socket_write(struct file *file, const char __user *buff, + size_t len, loff_t *off) +{ + struct socket_client *socket_client = file->private_data; + struct bat_priv *bat_priv = socket_client->bat_priv; + struct hard_iface *primary_if = NULL; + struct sk_buff *skb; + struct icmp_packet_rr *icmp_packet; + + struct orig_node *orig_node = NULL; + struct neigh_node *neigh_node = NULL; + size_t packet_len = sizeof(struct icmp_packet); + + if (len < sizeof(struct icmp_packet)) { + bat_dbg(DBG_BATMAN, bat_priv, + "Error - can't send packet from char device: " + "invalid packet size\n"); + return -EINVAL; + } + + primary_if = primary_if_get_selected(bat_priv); + + if (!primary_if) { + len = -EFAULT; + goto out; + } + + if (len >= sizeof(struct icmp_packet_rr)) + packet_len = sizeof(struct icmp_packet_rr); + + skb = dev_alloc_skb(packet_len + sizeof(struct ethhdr)); + if (!skb) { + len = -ENOMEM; + goto out; + } + + skb_reserve(skb, sizeof(struct ethhdr)); + icmp_packet = (struct icmp_packet_rr *)skb_put(skb, packet_len); + + if (!access_ok(VERIFY_READ, buff, packet_len)) { + len = -EFAULT; + goto free_skb; + } + + if (__copy_from_user(icmp_packet, buff, packet_len)) { + len = -EFAULT; + goto free_skb; + } + + if (icmp_packet->packet_type != BAT_ICMP) { + bat_dbg(DBG_BATMAN, bat_priv, + "Error - can't send packet from char device: " + "got bogus packet type (expected: BAT_ICMP)\n"); + len = -EINVAL; + goto free_skb; + } + + if (icmp_packet->msg_type != ECHO_REQUEST) { + bat_dbg(DBG_BATMAN, bat_priv, + "Error - can't send packet from char device: " + "got bogus message type (expected: ECHO_REQUEST)\n"); + len = -EINVAL; + goto free_skb; + } + + icmp_packet->uid = socket_client->index; + + if (icmp_packet->version != COMPAT_VERSION) { + icmp_packet->msg_type = PARAMETER_PROBLEM; + icmp_packet->ttl = COMPAT_VERSION; + bat_socket_add_packet(socket_client, icmp_packet, packet_len); + goto free_skb; + } + + if (atomic_read(&bat_priv->mesh_state) != MESH_ACTIVE) + goto dst_unreach; + + orig_node = orig_hash_find(bat_priv, icmp_packet->dst); + if (!orig_node) + goto dst_unreach; + + neigh_node = 
orig_node_get_router(orig_node); + if (!neigh_node) + goto dst_unreach; + + if (!neigh_node->if_incoming) + goto dst_unreach; + + if (neigh_node->if_incoming->if_status != IF_ACTIVE) + goto dst_unreach; + + memcpy(icmp_packet->orig, + primary_if->net_dev->dev_addr, ETH_ALEN); + + if (packet_len == sizeof(struct icmp_packet_rr)) + memcpy(icmp_packet->rr, + neigh_node->if_incoming->net_dev->dev_addr, ETH_ALEN); + + send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr); + goto out; + +dst_unreach: + icmp_packet->msg_type = DESTINATION_UNREACHABLE; + bat_socket_add_packet(socket_client, icmp_packet, packet_len); +free_skb: + kfree_skb(skb); +out: + if (primary_if) + hardif_free_ref(primary_if); + if (neigh_node) + neigh_node_free_ref(neigh_node); + if (orig_node) + orig_node_free_ref(orig_node); + return len; +} + +static unsigned int bat_socket_poll(struct file *file, poll_table *wait) +{ + struct socket_client *socket_client = file->private_data; + + poll_wait(file, &socket_client->queue_wait, wait); + + if (socket_client->queue_len > 0) + return POLLIN | POLLRDNORM; + + return 0; +} + +static const struct file_operations fops = { + .owner = THIS_MODULE, + .open = bat_socket_open, + .release = bat_socket_release, + .read = bat_socket_read, + .write = bat_socket_write, + .poll = bat_socket_poll, + .llseek = no_llseek, +}; + +int bat_socket_setup(struct bat_priv *bat_priv) +{ + struct dentry *d; + + if (!bat_priv->debug_dir) + goto err; + + d = debugfs_create_file(ICMP_SOCKET, S_IFREG | S_IWUSR | S_IRUSR, + bat_priv->debug_dir, bat_priv, &fops); + if (d) + goto err; + + return 0; + +err: + return 1; +} + +static void bat_socket_add_packet(struct socket_client *socket_client, + struct icmp_packet_rr *icmp_packet, + size_t icmp_len) +{ + struct socket_packet *socket_packet; + + socket_packet = kmalloc(sizeof(struct socket_packet), GFP_ATOMIC); + + if (!socket_packet) + return; + + INIT_LIST_HEAD(&socket_packet->list); + memcpy(&socket_packet->icmp_packet, icmp_packet, icmp_len); + socket_packet->icmp_len = icmp_len; + + spin_lock_bh(&socket_client->lock); + + /* while waiting for the lock the socket_client could have been + * deleted */ + if (!socket_client_hash[icmp_packet->uid]) { + spin_unlock_bh(&socket_client->lock); + kfree(socket_packet); + return; + } + + list_add_tail(&socket_packet->list, &socket_client->queue_list); + socket_client->queue_len++; + + if (socket_client->queue_len > 100) { + socket_packet = list_first_entry(&socket_client->queue_list, + struct socket_packet, list); + + list_del(&socket_packet->list); + kfree(socket_packet); + socket_client->queue_len--; + } + + spin_unlock_bh(&socket_client->lock); + + wake_up(&socket_client->queue_wait); +} + +void bat_socket_receive_packet(struct icmp_packet_rr *icmp_packet, + size_t icmp_len) +{ + struct socket_client *hash = socket_client_hash[icmp_packet->uid]; + + if (hash) + bat_socket_add_packet(hash, icmp_packet, icmp_len); +} diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h new file mode 100644 index 00000000..462b190f --- /dev/null +++ b/net/batman-adv/icmp_socket.h @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#ifndef _NET_BATMAN_ADV_ICMP_SOCKET_H_ +#define _NET_BATMAN_ADV_ICMP_SOCKET_H_ + +#define ICMP_SOCKET "socket" + +void bat_socket_init(void); +int bat_socket_setup(struct bat_priv *bat_priv); +void bat_socket_receive_packet(struct icmp_packet_rr *icmp_packet, + size_t icmp_len); + +#endif /* _NET_BATMAN_ADV_ICMP_SOCKET_H_ */ diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c new file mode 100644 index 00000000..0a7cee00 --- /dev/null +++ b/net/batman-adv/main.c @@ -0,0 +1,189 @@ +/* + * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#include "main.h" +#include "bat_sysfs.h" +#include "bat_debugfs.h" +#include "routing.h" +#include "send.h" +#include "originator.h" +#include "soft-interface.h" +#include "icmp_socket.h" +#include "translation-table.h" +#include "hard-interface.h" +#include "gateway_client.h" +#include "vis.h" +#include "hash.h" + + +/* List manipulations on hardif_list have to be rtnl_lock()'ed, + * list traversals just rcu-locked */ +struct list_head hardif_list; + +unsigned char broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + +struct workqueue_struct *bat_event_workqueue; + +static int __init batman_init(void) +{ + INIT_LIST_HEAD(&hardif_list); + + /* the name should not be longer than 10 chars - see + * http://lwn.net/Articles/23634/ */ + bat_event_workqueue = create_singlethread_workqueue("bat_events"); + + if (!bat_event_workqueue) + return -ENOMEM; + + bat_socket_init(); + debugfs_init(); + + register_netdevice_notifier(&hard_if_notifier); + + pr_info("B.A.T.M.A.N. 
advanced %s%s (compatibility version %i) " + "loaded\n", SOURCE_VERSION, REVISION_VERSION_STR, + COMPAT_VERSION); + + return 0; +} + +static void __exit batman_exit(void) +{ + debugfs_destroy(); + unregister_netdevice_notifier(&hard_if_notifier); + hardif_remove_interfaces(); + + flush_workqueue(bat_event_workqueue); + destroy_workqueue(bat_event_workqueue); + bat_event_workqueue = NULL; + + rcu_barrier(); +} + +int mesh_init(struct net_device *soft_iface) +{ + struct bat_priv *bat_priv = netdev_priv(soft_iface); + + spin_lock_init(&bat_priv->forw_bat_list_lock); + spin_lock_init(&bat_priv->forw_bcast_list_lock); + spin_lock_init(&bat_priv->tt_lhash_lock); + spin_lock_init(&bat_priv->tt_ghash_lock); + spin_lock_init(&bat_priv->gw_list_lock); + spin_lock_init(&bat_priv->vis_hash_lock); + spin_lock_init(&bat_priv->vis_list_lock); + spin_lock_init(&bat_priv->softif_neigh_lock); + spin_lock_init(&bat_priv->softif_neigh_vid_lock); + + INIT_HLIST_HEAD(&bat_priv->forw_bat_list); + INIT_HLIST_HEAD(&bat_priv->forw_bcast_list); + INIT_HLIST_HEAD(&bat_priv->gw_list); + INIT_HLIST_HEAD(&bat_priv->softif_neigh_vids); + + if (originator_init(bat_priv) < 1) + goto err; + + if (tt_local_init(bat_priv) < 1) + goto err; + + if (tt_global_init(bat_priv) < 1) + goto err; + + tt_local_add(soft_iface, soft_iface->dev_addr); + + if (vis_init(bat_priv) < 1) + goto err; + + atomic_set(&bat_priv->mesh_state, MESH_ACTIVE); + goto end; + +err: + pr_err("Unable to allocate memory for mesh information structures: " + "out of mem ?\n"); + mesh_free(soft_iface); + return -1; + +end: + return 0; +} + +void mesh_free(struct net_device *soft_iface) +{ + struct bat_priv *bat_priv = netdev_priv(soft_iface); + + atomic_set(&bat_priv->mesh_state, MESH_DEACTIVATING); + + purge_outstanding_packets(bat_priv, NULL); + + vis_quit(bat_priv); + + gw_node_purge(bat_priv); + originator_free(bat_priv); + + tt_local_free(bat_priv); + tt_global_free(bat_priv); + + softif_neigh_purge(bat_priv); + + atomic_set(&bat_priv->mesh_state, MESH_INACTIVE); +} + +void inc_module_count(void) +{ + try_module_get(THIS_MODULE); +} + +void dec_module_count(void) +{ + module_put(THIS_MODULE); +} + +int is_my_mac(uint8_t *addr) +{ + struct hard_iface *hard_iface; + + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &hardif_list, list) { + if (hard_iface->if_status != IF_ACTIVE) + continue; + + if (compare_eth(hard_iface->net_dev->dev_addr, addr)) { + rcu_read_unlock(); + return 1; + } + } + rcu_read_unlock(); + return 0; + +} + +module_init(batman_init); +module_exit(batman_exit); + +MODULE_LICENSE("GPL"); + +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); +MODULE_SUPPORTED_DEVICE(DRIVER_DEVICE); +#ifdef REVISION_VERSION +MODULE_VERSION(SOURCE_VERSION "-" REVISION_VERSION); +#else +MODULE_VERSION(SOURCE_VERSION); +#endif diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h new file mode 100644 index 00000000..148b49e0 --- /dev/null +++ b/net/batman-adv/main.h @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#ifndef _NET_BATMAN_ADV_MAIN_H_ +#define _NET_BATMAN_ADV_MAIN_H_ + +#define DRIVER_AUTHOR "Marek Lindner , " \ + "Simon Wunderlich " +#define DRIVER_DESC "B.A.T.M.A.N. advanced" +#define DRIVER_DEVICE "batman-adv" + +#define SOURCE_VERSION "next" + + +/* B.A.T.M.A.N. parameters */ + +#define TQ_MAX_VALUE 255 +#define JITTER 20 + + /* Time To Live of broadcast messages */ +#define TTL 50 + +/* purge originators after time in seconds if no valid packet comes in + * -> TODO: check influence on TQ_LOCAL_WINDOW_SIZE */ +#define PURGE_TIMEOUT 200 +#define TT_LOCAL_TIMEOUT 3600 /* in seconds */ + +/* sliding packet range of received originator messages in squence numbers + * (should be a multiple of our word size) */ +#define TQ_LOCAL_WINDOW_SIZE 64 +#define TQ_GLOBAL_WINDOW_SIZE 5 +#define TQ_LOCAL_BIDRECT_SEND_MINIMUM 1 +#define TQ_LOCAL_BIDRECT_RECV_MINIMUM 1 +#define TQ_TOTAL_BIDRECT_LIMIT 1 + +#define NUM_WORDS (TQ_LOCAL_WINDOW_SIZE / WORD_BIT_SIZE) + +#define LOG_BUF_LEN 8192 /* has to be a power of 2 */ + +#define VIS_INTERVAL 5000 /* 5 seconds */ + +/* how much worse secondary interfaces may be to be considered as bonding + * candidates */ +#define BONDING_TQ_THRESHOLD 50 + +/* should not be bigger than 512 bytes or change the size of + * forw_packet->direct_link_flags */ +#define MAX_AGGREGATION_BYTES 512 +#define MAX_AGGREGATION_MS 100 + +#define SOFTIF_NEIGH_TIMEOUT 180000 /* 3 minutes */ + +/* don't reset again within 30 seconds */ +#define RESET_PROTECTION_MS 30000 +#define EXPECTED_SEQNO_RANGE 65536 + +#define MESH_INACTIVE 0 +#define MESH_ACTIVE 1 +#define MESH_DEACTIVATING 2 + +#define BCAST_QUEUE_LEN 256 +#define BATMAN_QUEUE_LEN 256 + +/* + * Debug Messages + */ +#ifdef pr_fmt +#undef pr_fmt +#endif +/* Append 'batman-adv: ' before kernel messages */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +/* all messages related to routing / flooding / broadcasting / etc */ +#define DBG_BATMAN 1 +/* route or tt entry added / changed / deleted */ +#define DBG_ROUTES 2 +#define DBG_ALL 3 + + +/* + * Vis + */ + +/* + * Kernel headers + */ + +#include /* mutex */ +#include /* needed by all modules */ +#include /* netdevice */ +#include /* ethernet address classifaction */ +#include /* ethernet header */ +#include /* poll_table */ +#include /* kernel threads */ +#include /* schedule types */ +#include /* workqueue */ +#include +#include /* struct sock */ +#include +#include +#include "types.h" + +#ifndef REVISION_VERSION +#define REVISION_VERSION_STR "" +#else +#define REVISION_VERSION_STR " "REVISION_VERSION +#endif + +extern struct list_head hardif_list; + +extern unsigned char broadcast_addr[]; +extern struct workqueue_struct *bat_event_workqueue; + +int mesh_init(struct net_device *soft_iface); +void mesh_free(struct net_device *soft_iface); +void inc_module_count(void); +void dec_module_count(void); +int is_my_mac(uint8_t *addr); + +#ifdef CONFIG_BATMAN_ADV_DEBUG +int debug_log(struct bat_priv *bat_priv, char *fmt, ...); + +#define bat_dbg(type, bat_priv, fmt, arg...) \ + do { \ + if (atomic_read(&bat_priv->log_level) & type) \ + debug_log(bat_priv, fmt, ## arg); \ + } \ + while (0) +#else /* !CONFIG_BATMAN_ADV_DEBUG */ +static inline void bat_dbg(char type __always_unused, + struct bat_priv *bat_priv __always_unused, + char *fmt __always_unused, ...) 
+{ +} +#endif + +#define bat_info(net_dev, fmt, arg...) \ + do { \ + struct net_device *_netdev = (net_dev); \ + struct bat_priv *_batpriv = netdev_priv(_netdev); \ + bat_dbg(DBG_ALL, _batpriv, fmt, ## arg); \ + pr_info("%s: " fmt, _netdev->name, ## arg); \ + } while (0) +#define bat_err(net_dev, fmt, arg...) \ + do { \ + struct net_device *_netdev = (net_dev); \ + struct bat_priv *_batpriv = netdev_priv(_netdev); \ + bat_dbg(DBG_ALL, _batpriv, fmt, ## arg); \ + pr_err("%s: " fmt, _netdev->name, ## arg); \ + } while (0) + +/** + * returns 1 if they are the same ethernet addr + * + * note: can't use compare_ether_addr() as it requires aligned memory + */ +static inline int compare_eth(void *data1, void *data2) +{ + return (memcmp(data1, data2, ETH_ALEN) == 0 ? 1 : 0); +} + +#define atomic_dec_not_zero(v) atomic_add_unless((v), -1, 0) + +#endif /* _NET_BATMAN_ADV_MAIN_H_ */ diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c new file mode 100644 index 00000000..40a30bbc --- /dev/null +++ b/net/batman-adv/originator.c @@ -0,0 +1,644 @@ +/* + * Copyright (C) 2009-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#include "main.h" +#include "originator.h" +#include "hash.h" +#include "translation-table.h" +#include "routing.h" +#include "gateway_client.h" +#include "hard-interface.h" +#include "unicast.h" +#include "soft-interface.h" + +static void purge_orig(struct work_struct *work); + +static void start_purge_timer(struct bat_priv *bat_priv) +{ + INIT_DELAYED_WORK(&bat_priv->orig_work, purge_orig); + queue_delayed_work(bat_event_workqueue, &bat_priv->orig_work, 1 * HZ); +} + +int originator_init(struct bat_priv *bat_priv) +{ + if (bat_priv->orig_hash) + return 1; + + bat_priv->orig_hash = hash_new(1024); + + if (!bat_priv->orig_hash) + goto err; + + start_purge_timer(bat_priv); + return 1; + +err: + return 0; +} + +void neigh_node_free_ref(struct neigh_node *neigh_node) +{ + if (atomic_dec_and_test(&neigh_node->refcount)) + kfree_rcu(neigh_node, rcu); +} + +/* increases the refcounter of a found router */ +struct neigh_node *orig_node_get_router(struct orig_node *orig_node) +{ + struct neigh_node *router; + + rcu_read_lock(); + router = rcu_dereference(orig_node->router); + + if (router && !atomic_inc_not_zero(&router->refcount)) + router = NULL; + + rcu_read_unlock(); + return router; +} + +struct neigh_node *create_neighbor(struct orig_node *orig_node, + struct orig_node *orig_neigh_node, + uint8_t *neigh, + struct hard_iface *if_incoming) +{ + struct bat_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct neigh_node *neigh_node; + + bat_dbg(DBG_BATMAN, bat_priv, + "Creating new last-hop neighbor of originator\n"); + + neigh_node = kzalloc(sizeof(struct neigh_node), GFP_ATOMIC); + if (!neigh_node) + return NULL; + + INIT_HLIST_NODE(&neigh_node->list); + 
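	/*
	 * The node is fully initialised before it is published: the
	 * refcount starts at 2 (one reference owned by
	 * orig_node->neigh_list, one handed back to the caller), the entry
	 * is added under neigh_list_lock via hlist_add_head_rcu(), and both
	 * references are eventually dropped through neigh_node_free_ref(),
	 * which frees the structure with kfree_rcu() once the count
	 * reaches zero.
	 */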
INIT_LIST_HEAD(&neigh_node->bonding_list); + spin_lock_init(&neigh_node->tq_lock); + + memcpy(neigh_node->addr, neigh, ETH_ALEN); + neigh_node->orig_node = orig_neigh_node; + neigh_node->if_incoming = if_incoming; + + /* extra reference for return */ + atomic_set(&neigh_node->refcount, 2); + + spin_lock_bh(&orig_node->neigh_list_lock); + hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); + spin_unlock_bh(&orig_node->neigh_list_lock); + return neigh_node; +} + +static void orig_node_free_rcu(struct rcu_head *rcu) +{ + struct hlist_node *node, *node_tmp; + struct neigh_node *neigh_node, *tmp_neigh_node; + struct orig_node *orig_node; + + orig_node = container_of(rcu, struct orig_node, rcu); + + spin_lock_bh(&orig_node->neigh_list_lock); + + /* for all bonding members ... */ + list_for_each_entry_safe(neigh_node, tmp_neigh_node, + &orig_node->bond_list, bonding_list) { + list_del_rcu(&neigh_node->bonding_list); + neigh_node_free_ref(neigh_node); + } + + /* for all neighbors towards this originator ... */ + hlist_for_each_entry_safe(neigh_node, node, node_tmp, + &orig_node->neigh_list, list) { + hlist_del_rcu(&neigh_node->list); + neigh_node_free_ref(neigh_node); + } + + spin_unlock_bh(&orig_node->neigh_list_lock); + + frag_list_free(&orig_node->frag_list); + tt_global_del_orig(orig_node->bat_priv, orig_node, + "originator timed out"); + + kfree(orig_node->bcast_own); + kfree(orig_node->bcast_own_sum); + kfree(orig_node); +} + +void orig_node_free_ref(struct orig_node *orig_node) +{ + if (atomic_dec_and_test(&orig_node->refcount)) + call_rcu(&orig_node->rcu, orig_node_free_rcu); +} + +void originator_free(struct bat_priv *bat_priv) +{ + struct hashtable_t *hash = bat_priv->orig_hash; + struct hlist_node *node, *node_tmp; + struct hlist_head *head; + spinlock_t *list_lock; /* spinlock to protect write access */ + struct orig_node *orig_node; + int i; + + if (!hash) + return; + + cancel_delayed_work_sync(&bat_priv->orig_work); + + bat_priv->orig_hash = NULL; + + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + list_lock = &hash->list_locks[i]; + + spin_lock_bh(list_lock); + hlist_for_each_entry_safe(orig_node, node, node_tmp, + head, hash_entry) { + + hlist_del_rcu(node); + orig_node_free_ref(orig_node); + } + spin_unlock_bh(list_lock); + } + + hash_destroy(hash); +} + +/* this function finds or creates an originator entry for the given + * address if it does not exits */ +struct orig_node *get_orig_node(struct bat_priv *bat_priv, uint8_t *addr) +{ + struct orig_node *orig_node; + int size; + int hash_added; + + orig_node = orig_hash_find(bat_priv, addr); + if (orig_node) + return orig_node; + + bat_dbg(DBG_BATMAN, bat_priv, + "Creating new originator: %pM\n", addr); + + orig_node = kzalloc(sizeof(struct orig_node), GFP_ATOMIC); + if (!orig_node) + return NULL; + + INIT_HLIST_HEAD(&orig_node->neigh_list); + INIT_LIST_HEAD(&orig_node->bond_list); + spin_lock_init(&orig_node->ogm_cnt_lock); + spin_lock_init(&orig_node->bcast_seqno_lock); + spin_lock_init(&orig_node->neigh_list_lock); + + /* extra reference for return */ + atomic_set(&orig_node->refcount, 2); + + orig_node->bat_priv = bat_priv; + memcpy(orig_node->orig, addr, ETH_ALEN); + orig_node->router = NULL; + orig_node->tt_buff = NULL; + orig_node->bcast_seqno_reset = jiffies - 1 + - msecs_to_jiffies(RESET_PROTECTION_MS); + orig_node->batman_seqno_reset = jiffies - 1 + - msecs_to_jiffies(RESET_PROTECTION_MS); + + atomic_set(&orig_node->bond_candidates, 0); + + size = bat_priv->num_ifaces * sizeof(unsigned long) * 
NUM_WORDS; + + orig_node->bcast_own = kzalloc(size, GFP_ATOMIC); + if (!orig_node->bcast_own) + goto free_orig_node; + + size = bat_priv->num_ifaces * sizeof(uint8_t); + orig_node->bcast_own_sum = kzalloc(size, GFP_ATOMIC); + + INIT_LIST_HEAD(&orig_node->frag_list); + orig_node->last_frag_packet = 0; + + if (!orig_node->bcast_own_sum) + goto free_bcast_own; + + hash_added = hash_add(bat_priv->orig_hash, compare_orig, + choose_orig, orig_node, &orig_node->hash_entry); + if (hash_added < 0) + goto free_bcast_own_sum; + + return orig_node; +free_bcast_own_sum: + kfree(orig_node->bcast_own_sum); +free_bcast_own: + kfree(orig_node->bcast_own); +free_orig_node: + kfree(orig_node); + return NULL; +} + +static bool purge_orig_neighbors(struct bat_priv *bat_priv, + struct orig_node *orig_node, + struct neigh_node **best_neigh_node) +{ + struct hlist_node *node, *node_tmp; + struct neigh_node *neigh_node; + bool neigh_purged = false; + + *best_neigh_node = NULL; + + spin_lock_bh(&orig_node->neigh_list_lock); + + /* for all neighbors towards this originator ... */ + hlist_for_each_entry_safe(neigh_node, node, node_tmp, + &orig_node->neigh_list, list) { + + if ((time_after(jiffies, + neigh_node->last_valid + PURGE_TIMEOUT * HZ)) || + (neigh_node->if_incoming->if_status == IF_INACTIVE) || + (neigh_node->if_incoming->if_status == IF_NOT_IN_USE) || + (neigh_node->if_incoming->if_status == IF_TO_BE_REMOVED)) { + + if ((neigh_node->if_incoming->if_status == + IF_INACTIVE) || + (neigh_node->if_incoming->if_status == + IF_NOT_IN_USE) || + (neigh_node->if_incoming->if_status == + IF_TO_BE_REMOVED)) + bat_dbg(DBG_BATMAN, bat_priv, + "neighbor purge: originator %pM, " + "neighbor: %pM, iface: %s\n", + orig_node->orig, neigh_node->addr, + neigh_node->if_incoming->net_dev->name); + else + bat_dbg(DBG_BATMAN, bat_priv, + "neighbor timeout: originator %pM, " + "neighbor: %pM, last_valid: %lu\n", + orig_node->orig, neigh_node->addr, + (neigh_node->last_valid / HZ)); + + neigh_purged = true; + + hlist_del_rcu(&neigh_node->list); + bonding_candidate_del(orig_node, neigh_node); + neigh_node_free_ref(neigh_node); + } else { + if ((!*best_neigh_node) || + (neigh_node->tq_avg > (*best_neigh_node)->tq_avg)) + *best_neigh_node = neigh_node; + } + } + + spin_unlock_bh(&orig_node->neigh_list_lock); + return neigh_purged; +} + +static bool purge_orig_node(struct bat_priv *bat_priv, + struct orig_node *orig_node) +{ + struct neigh_node *best_neigh_node; + + if (time_after(jiffies, + orig_node->last_valid + 2 * PURGE_TIMEOUT * HZ)) { + + bat_dbg(DBG_BATMAN, bat_priv, + "Originator timeout: originator %pM, last_valid %lu\n", + orig_node->orig, (orig_node->last_valid / HZ)); + return true; + } else { + if (purge_orig_neighbors(bat_priv, orig_node, + &best_neigh_node)) { + update_routes(bat_priv, orig_node, + best_neigh_node, + orig_node->tt_buff, + orig_node->tt_buff_len); + } + } + + return false; +} + +static void _purge_orig(struct bat_priv *bat_priv) +{ + struct hashtable_t *hash = bat_priv->orig_hash; + struct hlist_node *node, *node_tmp; + struct hlist_head *head; + spinlock_t *list_lock; /* spinlock to protect write access */ + struct orig_node *orig_node; + int i; + + if (!hash) + return; + + /* for all origins... 
*/ + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + list_lock = &hash->list_locks[i]; + + spin_lock_bh(list_lock); + hlist_for_each_entry_safe(orig_node, node, node_tmp, + head, hash_entry) { + if (purge_orig_node(bat_priv, orig_node)) { + if (orig_node->gw_flags) + gw_node_delete(bat_priv, orig_node); + hlist_del_rcu(node); + orig_node_free_ref(orig_node); + continue; + } + + if (time_after(jiffies, orig_node->last_frag_packet + + msecs_to_jiffies(FRAG_TIMEOUT))) + frag_list_free(&orig_node->frag_list); + } + spin_unlock_bh(list_lock); + } + + gw_node_purge(bat_priv); + gw_election(bat_priv); + + softif_neigh_purge(bat_priv); +} + +static void purge_orig(struct work_struct *work) +{ + struct delayed_work *delayed_work = + container_of(work, struct delayed_work, work); + struct bat_priv *bat_priv = + container_of(delayed_work, struct bat_priv, orig_work); + + _purge_orig(bat_priv); + start_purge_timer(bat_priv); +} + +void purge_orig_ref(struct bat_priv *bat_priv) +{ + _purge_orig(bat_priv); +} + +int orig_seq_print_text(struct seq_file *seq, void *offset) +{ + struct net_device *net_dev = (struct net_device *)seq->private; + struct bat_priv *bat_priv = netdev_priv(net_dev); + struct hashtable_t *hash = bat_priv->orig_hash; + struct hlist_node *node, *node_tmp; + struct hlist_head *head; + struct hard_iface *primary_if; + struct orig_node *orig_node; + struct neigh_node *neigh_node, *neigh_node_tmp; + int batman_count = 0; + int last_seen_secs; + int last_seen_msecs; + int i, ret = 0; + + primary_if = primary_if_get_selected(bat_priv); + + if (!primary_if) { + ret = seq_printf(seq, "BATMAN mesh %s disabled - " + "please specify interfaces to enable it\n", + net_dev->name); + goto out; + } + + if (primary_if->if_status != IF_ACTIVE) { + ret = seq_printf(seq, "BATMAN mesh %s " + "disabled - primary interface not active\n", + net_dev->name); + goto out; + } + + seq_printf(seq, "[B.A.T.M.A.N. 
adv %s%s, MainIF/MAC: %s/%pM (%s)]\n", + SOURCE_VERSION, REVISION_VERSION_STR, + primary_if->net_dev->name, + primary_if->net_dev->dev_addr, net_dev->name); + seq_printf(seq, " %-15s %s (%s/%i) %17s [%10s]: %20s ...\n", + "Originator", "last-seen", "#", TQ_MAX_VALUE, "Nexthop", + "outgoingIF", "Potential nexthops"); + + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) { + neigh_node = orig_node_get_router(orig_node); + if (!neigh_node) + continue; + + if (neigh_node->tq_avg == 0) + goto next; + + last_seen_secs = jiffies_to_msecs(jiffies - + orig_node->last_valid) / 1000; + last_seen_msecs = jiffies_to_msecs(jiffies - + orig_node->last_valid) % 1000; + + seq_printf(seq, "%pM %4i.%03is (%3i) %pM [%10s]:", + orig_node->orig, last_seen_secs, + last_seen_msecs, neigh_node->tq_avg, + neigh_node->addr, + neigh_node->if_incoming->net_dev->name); + + hlist_for_each_entry_rcu(neigh_node_tmp, node_tmp, + &orig_node->neigh_list, list) { + seq_printf(seq, " %pM (%3i)", + neigh_node_tmp->addr, + neigh_node_tmp->tq_avg); + } + + seq_printf(seq, "\n"); + batman_count++; + +next: + neigh_node_free_ref(neigh_node); + } + rcu_read_unlock(); + } + + if (batman_count == 0) + seq_printf(seq, "No batman nodes in range ...\n"); + +out: + if (primary_if) + hardif_free_ref(primary_if); + return ret; +} + +static int orig_node_add_if(struct orig_node *orig_node, int max_if_num) +{ + void *data_ptr; + + data_ptr = kmalloc(max_if_num * sizeof(unsigned long) * NUM_WORDS, + GFP_ATOMIC); + if (!data_ptr) { + pr_err("Can't resize orig: out of memory\n"); + return -1; + } + + memcpy(data_ptr, orig_node->bcast_own, + (max_if_num - 1) * sizeof(unsigned long) * NUM_WORDS); + kfree(orig_node->bcast_own); + orig_node->bcast_own = data_ptr; + + data_ptr = kmalloc(max_if_num * sizeof(uint8_t), GFP_ATOMIC); + if (!data_ptr) { + pr_err("Can't resize orig: out of memory\n"); + return -1; + } + + memcpy(data_ptr, orig_node->bcast_own_sum, + (max_if_num - 1) * sizeof(uint8_t)); + kfree(orig_node->bcast_own_sum); + orig_node->bcast_own_sum = data_ptr; + + return 0; +} + +int orig_hash_add_if(struct hard_iface *hard_iface, int max_if_num) +{ + struct bat_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + struct hashtable_t *hash = bat_priv->orig_hash; + struct hlist_node *node; + struct hlist_head *head; + struct orig_node *orig_node; + int i, ret; + + /* resize all orig nodes because orig_node->bcast_own(_sum) depend on + * if_num */ + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) { + spin_lock_bh(&orig_node->ogm_cnt_lock); + ret = orig_node_add_if(orig_node, max_if_num); + spin_unlock_bh(&orig_node->ogm_cnt_lock); + + if (ret == -1) + goto err; + } + rcu_read_unlock(); + } + + return 0; + +err: + rcu_read_unlock(); + return -ENOMEM; +} + +static int orig_node_del_if(struct orig_node *orig_node, + int max_if_num, int del_if_num) +{ + void *data_ptr = NULL; + int chunk_size; + + /* last interface was removed */ + if (max_if_num == 0) + goto free_bcast_own; + + chunk_size = sizeof(unsigned long) * NUM_WORDS; + data_ptr = kmalloc(max_if_num * chunk_size, GFP_ATOMIC); + if (!data_ptr) { + pr_err("Can't resize orig: out of memory\n"); + return -1; + } + + /* copy first part */ + memcpy(data_ptr, orig_node->bcast_own, del_if_num * chunk_size); + + /* copy second part */ + memcpy(data_ptr + del_if_num * chunk_size, + orig_node->bcast_own + 
((del_if_num + 1) * chunk_size), + (max_if_num - del_if_num) * chunk_size); + +free_bcast_own: + kfree(orig_node->bcast_own); + orig_node->bcast_own = data_ptr; + + if (max_if_num == 0) + goto free_own_sum; + + data_ptr = kmalloc(max_if_num * sizeof(uint8_t), GFP_ATOMIC); + if (!data_ptr) { + pr_err("Can't resize orig: out of memory\n"); + return -1; + } + + memcpy(data_ptr, orig_node->bcast_own_sum, + del_if_num * sizeof(uint8_t)); + + memcpy(data_ptr + del_if_num * sizeof(uint8_t), + orig_node->bcast_own_sum + ((del_if_num + 1) * sizeof(uint8_t)), + (max_if_num - del_if_num) * sizeof(uint8_t)); + +free_own_sum: + kfree(orig_node->bcast_own_sum); + orig_node->bcast_own_sum = data_ptr; + + return 0; +} + +int orig_hash_del_if(struct hard_iface *hard_iface, int max_if_num) +{ + struct bat_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + struct hashtable_t *hash = bat_priv->orig_hash; + struct hlist_node *node; + struct hlist_head *head; + struct hard_iface *hard_iface_tmp; + struct orig_node *orig_node; + int i, ret; + + /* resize all orig nodes because orig_node->bcast_own(_sum) depend on + * if_num */ + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) { + spin_lock_bh(&orig_node->ogm_cnt_lock); + ret = orig_node_del_if(orig_node, max_if_num, + hard_iface->if_num); + spin_unlock_bh(&orig_node->ogm_cnt_lock); + + if (ret == -1) + goto err; + } + rcu_read_unlock(); + } + + /* renumber remaining batman interfaces _inside_ of orig_hash_lock */ + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface_tmp, &hardif_list, list) { + if (hard_iface_tmp->if_status == IF_NOT_IN_USE) + continue; + + if (hard_iface == hard_iface_tmp) + continue; + + if (hard_iface->soft_iface != hard_iface_tmp->soft_iface) + continue; + + if (hard_iface_tmp->if_num > hard_iface->if_num) + hard_iface_tmp->if_num--; + } + rcu_read_unlock(); + + hard_iface->if_num = -1; + return 0; + +err: + rcu_read_unlock(); + return -ENOMEM; +} diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h new file mode 100644 index 00000000..e1d641f2 --- /dev/null +++ b/net/batman-adv/originator.h @@ -0,0 +1,103 @@ +/* + * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#ifndef _NET_BATMAN_ADV_ORIGINATOR_H_ +#define _NET_BATMAN_ADV_ORIGINATOR_H_ + +#include "hash.h" + +int originator_init(struct bat_priv *bat_priv); +void originator_free(struct bat_priv *bat_priv); +void purge_orig_ref(struct bat_priv *bat_priv); +void orig_node_free_ref(struct orig_node *orig_node); +struct orig_node *get_orig_node(struct bat_priv *bat_priv, uint8_t *addr); +struct neigh_node *create_neighbor(struct orig_node *orig_node, + struct orig_node *orig_neigh_node, + uint8_t *neigh, + struct hard_iface *if_incoming); +void neigh_node_free_ref(struct neigh_node *neigh_node); +struct neigh_node *orig_node_get_router(struct orig_node *orig_node); +int orig_seq_print_text(struct seq_file *seq, void *offset); +int orig_hash_add_if(struct hard_iface *hard_iface, int max_if_num); +int orig_hash_del_if(struct hard_iface *hard_iface, int max_if_num); + + +/* returns 1 if they are the same originator */ +static inline int compare_orig(struct hlist_node *node, void *data2) +{ + void *data1 = container_of(node, struct orig_node, hash_entry); + + return (memcmp(data1, data2, ETH_ALEN) == 0 ? 1 : 0); +} + +/* hashfunction to choose an entry in a hash table of given size */ +/* hash algorithm from http://en.wikipedia.org/wiki/Hash_table */ +static inline int choose_orig(void *data, int32_t size) +{ + unsigned char *key = data; + uint32_t hash = 0; + size_t i; + + for (i = 0; i < 6; i++) { + hash += key[i]; + hash += (hash << 10); + hash ^= (hash >> 6); + } + + hash += (hash << 3); + hash ^= (hash >> 11); + hash += (hash << 15); + + return hash % size; +} + +static inline struct orig_node *orig_hash_find(struct bat_priv *bat_priv, + void *data) +{ + struct hashtable_t *hash = bat_priv->orig_hash; + struct hlist_head *head; + struct hlist_node *node; + struct orig_node *orig_node, *orig_node_tmp = NULL; + int index; + + if (!hash) + return NULL; + + index = choose_orig(data, hash->size); + head = &hash->table[index]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) { + if (!compare_eth(orig_node, data)) + continue; + + if (!atomic_inc_not_zero(&orig_node->refcount)) + continue; + + orig_node_tmp = orig_node; + break; + } + rcu_read_unlock(); + + return orig_node_tmp; +} + +#endif /* _NET_BATMAN_ADV_ORIGINATOR_H_ */ diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h new file mode 100644 index 00000000..eda99650 --- /dev/null +++ b/net/batman-adv/packet.h @@ -0,0 +1,136 @@ +/* + * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#ifndef _NET_BATMAN_ADV_PACKET_H_ +#define _NET_BATMAN_ADV_PACKET_H_ + +#define ETH_P_BATMAN 0x4305 /* unofficial/not registered Ethertype */ + +#define BAT_PACKET 0x01 +#define BAT_ICMP 0x02 +#define BAT_UNICAST 0x03 +#define BAT_BCAST 0x04 +#define BAT_VIS 0x05 +#define BAT_UNICAST_FRAG 0x06 + +/* this file is included by batctl which needs these defines */ +#define COMPAT_VERSION 12 +#define DIRECTLINK 0x40 +#define VIS_SERVER 0x20 +#define PRIMARIES_FIRST_HOP 0x10 + +/* ICMP message types */ +#define ECHO_REPLY 0 +#define DESTINATION_UNREACHABLE 3 +#define ECHO_REQUEST 8 +#define TTL_EXCEEDED 11 +#define PARAMETER_PROBLEM 12 + +/* vis defines */ +#define VIS_TYPE_SERVER_SYNC 0 +#define VIS_TYPE_CLIENT_UPDATE 1 + +/* fragmentation defines */ +#define UNI_FRAG_HEAD 0x01 +#define UNI_FRAG_LARGETAIL 0x02 + +struct batman_packet { + uint8_t packet_type; + uint8_t version; /* batman version field */ + uint8_t flags; /* 0x40: DIRECTLINK flag, 0x20 VIS_SERVER flag... */ + uint8_t tq; + uint32_t seqno; + uint8_t orig[6]; + uint8_t prev_sender[6]; + uint8_t ttl; + uint8_t num_tt; + uint8_t gw_flags; /* flags related to gateway class */ + uint8_t align; +} __packed; + +#define BAT_PACKET_LEN sizeof(struct batman_packet) + +struct icmp_packet { + uint8_t packet_type; + uint8_t version; /* batman version field */ + uint8_t msg_type; /* see ICMP message types above */ + uint8_t ttl; + uint8_t dst[6]; + uint8_t orig[6]; + uint16_t seqno; + uint8_t uid; +} __packed; + +#define BAT_RR_LEN 16 + +/* icmp_packet_rr must start with all fields from imcp_packet + * as this is assumed by code that handles ICMP packets */ +struct icmp_packet_rr { + uint8_t packet_type; + uint8_t version; /* batman version field */ + uint8_t msg_type; /* see ICMP message types above */ + uint8_t ttl; + uint8_t dst[6]; + uint8_t orig[6]; + uint16_t seqno; + uint8_t uid; + uint8_t rr_cur; + uint8_t rr[BAT_RR_LEN][ETH_ALEN]; +} __packed; + +struct unicast_packet { + uint8_t packet_type; + uint8_t version; /* batman version field */ + uint8_t dest[6]; + uint8_t ttl; +} __packed; + +struct unicast_frag_packet { + uint8_t packet_type; + uint8_t version; /* batman version field */ + uint8_t dest[6]; + uint8_t ttl; + uint8_t flags; + uint8_t orig[6]; + uint16_t seqno; +} __packed; + +struct bcast_packet { + uint8_t packet_type; + uint8_t version; /* batman version field */ + uint8_t orig[6]; + uint8_t ttl; + uint32_t seqno; +} __packed; + +struct vis_packet { + uint8_t packet_type; + uint8_t version; /* batman version field */ + uint8_t vis_type; /* which type of vis-participant sent this? */ + uint8_t entries; /* number of entries behind this struct */ + uint32_t seqno; /* sequence number */ + uint8_t ttl; /* TTL */ + uint8_t vis_orig[6]; /* originator that announces its neighbors */ + uint8_t target_orig[6]; /* who should receive this packet */ + uint8_t sender_orig[6]; /* who sent or rebroadcasted this packet */ +} __packed; + +#endif /* _NET_BATMAN_ADV_PACKET_H_ */ diff --git a/net/batman-adv/ring_buffer.c b/net/batman-adv/ring_buffer.c new file mode 100644 index 00000000..5bb6a619 --- /dev/null +++ b/net/batman-adv/ring_buffer.c @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2007-2011 B.A.T.M.A.N. 
contributors: + * + * Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#include "main.h" +#include "ring_buffer.h" + +void ring_buffer_set(uint8_t lq_recv[], uint8_t *lq_index, uint8_t value) +{ + lq_recv[*lq_index] = value; + *lq_index = (*lq_index + 1) % TQ_GLOBAL_WINDOW_SIZE; +} + +uint8_t ring_buffer_avg(uint8_t lq_recv[]) +{ + uint8_t *ptr; + uint16_t count = 0, i = 0, sum = 0; + + ptr = lq_recv; + + while (i < TQ_GLOBAL_WINDOW_SIZE) { + if (*ptr != 0) { + count++; + sum += *ptr; + } + + i++; + ptr++; + } + + if (count == 0) + return 0; + + return (uint8_t)(sum / count); +} diff --git a/net/batman-adv/ring_buffer.h b/net/batman-adv/ring_buffer.h new file mode 100644 index 00000000..0395b274 --- /dev/null +++ b/net/batman-adv/ring_buffer.h @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#ifndef _NET_BATMAN_ADV_RING_BUFFER_H_ +#define _NET_BATMAN_ADV_RING_BUFFER_H_ + +void ring_buffer_set(uint8_t lq_recv[], uint8_t *lq_index, uint8_t value); +uint8_t ring_buffer_avg(uint8_t lq_recv[]); + +#endif /* _NET_BATMAN_ADV_RING_BUFFER_H_ */ diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c new file mode 100644 index 00000000..bb1c3ec7 --- /dev/null +++ b/net/batman-adv/routing.c @@ -0,0 +1,1535 @@ +/* + * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#include "main.h" +#include "routing.h" +#include "send.h" +#include "hash.h" +#include "soft-interface.h" +#include "hard-interface.h" +#include "icmp_socket.h" +#include "translation-table.h" +#include "originator.h" +#include "ring_buffer.h" +#include "vis.h" +#include "aggregation.h" +#include "gateway_common.h" +#include "gateway_client.h" +#include "unicast.h" + +void slide_own_bcast_window(struct hard_iface *hard_iface) +{ + struct bat_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + struct hashtable_t *hash = bat_priv->orig_hash; + struct hlist_node *node; + struct hlist_head *head; + struct orig_node *orig_node; + unsigned long *word; + int i; + size_t word_index; + + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) { + spin_lock_bh(&orig_node->ogm_cnt_lock); + word_index = hard_iface->if_num * NUM_WORDS; + word = &(orig_node->bcast_own[word_index]); + + bit_get_packet(bat_priv, word, 1, 0); + orig_node->bcast_own_sum[hard_iface->if_num] = + bit_packet_count(word); + spin_unlock_bh(&orig_node->ogm_cnt_lock); + } + rcu_read_unlock(); + } +} + +static void update_TT(struct bat_priv *bat_priv, struct orig_node *orig_node, + unsigned char *tt_buff, int tt_buff_len) +{ + if ((tt_buff_len != orig_node->tt_buff_len) || + ((tt_buff_len > 0) && + (orig_node->tt_buff_len > 0) && + (memcmp(orig_node->tt_buff, tt_buff, tt_buff_len) != 0))) { + + if (orig_node->tt_buff_len > 0) + tt_global_del_orig(bat_priv, orig_node, + "originator changed tt"); + + if ((tt_buff_len > 0) && (tt_buff)) + tt_global_add_orig(bat_priv, orig_node, + tt_buff, tt_buff_len); + } +} + +static void update_route(struct bat_priv *bat_priv, + struct orig_node *orig_node, + struct neigh_node *neigh_node, + unsigned char *tt_buff, int tt_buff_len) +{ + struct neigh_node *curr_router; + + curr_router = orig_node_get_router(orig_node); + + /* route deleted */ + if ((curr_router) && (!neigh_node)) { + + bat_dbg(DBG_ROUTES, bat_priv, "Deleting route towards: %pM\n", + orig_node->orig); + tt_global_del_orig(bat_priv, orig_node, + "originator timed out"); + + /* route added */ + } else if ((!curr_router) && (neigh_node)) { + + bat_dbg(DBG_ROUTES, bat_priv, + "Adding route towards: %pM (via %pM)\n", + orig_node->orig, neigh_node->addr); + tt_global_add_orig(bat_priv, orig_node, + tt_buff, tt_buff_len); + + /* route changed */ + } else { + bat_dbg(DBG_ROUTES, bat_priv, + "Changing route towards: %pM " + "(now via %pM - was via %pM)\n", + orig_node->orig, neigh_node->addr, + curr_router->addr); + } + + if (curr_router) + neigh_node_free_ref(curr_router); + + /* increase refcount of new best neighbor */ + if (neigh_node && !atomic_inc_not_zero(&neigh_node->refcount)) + neigh_node = NULL; + + spin_lock_bh(&orig_node->neigh_list_lock); + rcu_assign_pointer(orig_node->router, neigh_node); + spin_unlock_bh(&orig_node->neigh_list_lock); + + /* decrease refcount of previous best neighbor */ + if (curr_router) + neigh_node_free_ref(curr_router); +} + + +void update_routes(struct bat_priv *bat_priv, struct orig_node *orig_node, + struct neigh_node *neigh_node, unsigned char *tt_buff, + int tt_buff_len) +{ + struct neigh_node *router = NULL; + + if (!orig_node) + goto out; + + router = 
orig_node_get_router(orig_node); + + if (router != neigh_node) + update_route(bat_priv, orig_node, neigh_node, + tt_buff, tt_buff_len); + /* may be just TT changed */ + else + update_TT(bat_priv, orig_node, tt_buff, tt_buff_len); + +out: + if (router) + neigh_node_free_ref(router); +} + +static int is_bidirectional_neigh(struct orig_node *orig_node, + struct orig_node *orig_neigh_node, + struct batman_packet *batman_packet, + struct hard_iface *if_incoming) +{ + struct bat_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct neigh_node *neigh_node = NULL, *tmp_neigh_node; + struct hlist_node *node; + unsigned char total_count; + uint8_t orig_eq_count, neigh_rq_count, tq_own; + int tq_asym_penalty, ret = 0; + + /* find corresponding one hop neighbor */ + rcu_read_lock(); + hlist_for_each_entry_rcu(tmp_neigh_node, node, + &orig_neigh_node->neigh_list, list) { + + if (!compare_eth(tmp_neigh_node->addr, orig_neigh_node->orig)) + continue; + + if (tmp_neigh_node->if_incoming != if_incoming) + continue; + + if (!atomic_inc_not_zero(&tmp_neigh_node->refcount)) + continue; + + neigh_node = tmp_neigh_node; + break; + } + rcu_read_unlock(); + + if (!neigh_node) + neigh_node = create_neighbor(orig_neigh_node, + orig_neigh_node, + orig_neigh_node->orig, + if_incoming); + + if (!neigh_node) + goto out; + + /* if orig_node is direct neighbour update neigh_node last_valid */ + if (orig_node == orig_neigh_node) + neigh_node->last_valid = jiffies; + + orig_node->last_valid = jiffies; + + /* find packet count of corresponding one hop neighbor */ + spin_lock_bh(&orig_node->ogm_cnt_lock); + orig_eq_count = orig_neigh_node->bcast_own_sum[if_incoming->if_num]; + neigh_rq_count = neigh_node->real_packet_count; + spin_unlock_bh(&orig_node->ogm_cnt_lock); + + /* pay attention to not get a value bigger than 100 % */ + total_count = (orig_eq_count > neigh_rq_count ? + neigh_rq_count : orig_eq_count); + + /* if we have too few packets (too less data) we set tq_own to zero */ + /* if we receive too few packets it is not considered bidirectional */ + if ((total_count < TQ_LOCAL_BIDRECT_SEND_MINIMUM) || + (neigh_rq_count < TQ_LOCAL_BIDRECT_RECV_MINIMUM)) + tq_own = 0; + else + /* neigh_node->real_packet_count is never zero as we + * only purge old information when getting new + * information */ + tq_own = (TQ_MAX_VALUE * total_count) / neigh_rq_count; + + /* + * 1 - ((1-x) ** 3), normalized to TQ_MAX_VALUE this does + * affect the nearly-symmetric links only a little, but + * punishes asymmetric links more. 
This will give a value + * between 0 and TQ_MAX_VALUE + */ + tq_asym_penalty = TQ_MAX_VALUE - (TQ_MAX_VALUE * + (TQ_LOCAL_WINDOW_SIZE - neigh_rq_count) * + (TQ_LOCAL_WINDOW_SIZE - neigh_rq_count) * + (TQ_LOCAL_WINDOW_SIZE - neigh_rq_count)) / + (TQ_LOCAL_WINDOW_SIZE * + TQ_LOCAL_WINDOW_SIZE * + TQ_LOCAL_WINDOW_SIZE); + + batman_packet->tq = ((batman_packet->tq * tq_own * tq_asym_penalty) / + (TQ_MAX_VALUE * TQ_MAX_VALUE)); + + bat_dbg(DBG_BATMAN, bat_priv, + "bidirectional: " + "orig = %-15pM neigh = %-15pM => own_bcast = %2i, " + "real recv = %2i, local tq: %3i, asym_penalty: %3i, " + "total tq: %3i\n", + orig_node->orig, orig_neigh_node->orig, total_count, + neigh_rq_count, tq_own, tq_asym_penalty, batman_packet->tq); + + /* if link has the minimum required transmission quality + * consider it bidirectional */ + if (batman_packet->tq >= TQ_TOTAL_BIDRECT_LIMIT) + ret = 1; + +out: + if (neigh_node) + neigh_node_free_ref(neigh_node); + return ret; +} + +/* caller must hold the neigh_list_lock */ +void bonding_candidate_del(struct orig_node *orig_node, + struct neigh_node *neigh_node) +{ + /* this neighbor is not part of our candidate list */ + if (list_empty(&neigh_node->bonding_list)) + goto out; + + list_del_rcu(&neigh_node->bonding_list); + INIT_LIST_HEAD(&neigh_node->bonding_list); + neigh_node_free_ref(neigh_node); + atomic_dec(&orig_node->bond_candidates); + +out: + return; +} + +static void bonding_candidate_add(struct orig_node *orig_node, + struct neigh_node *neigh_node) +{ + struct hlist_node *node; + struct neigh_node *tmp_neigh_node, *router = NULL; + uint8_t interference_candidate = 0; + + spin_lock_bh(&orig_node->neigh_list_lock); + + /* only consider if it has the same primary address ... */ + if (!compare_eth(orig_node->orig, + neigh_node->orig_node->primary_addr)) + goto candidate_del; + + router = orig_node_get_router(orig_node); + if (!router) + goto candidate_del; + + /* ... and is good enough to be considered */ + if (neigh_node->tq_avg < router->tq_avg - BONDING_TQ_THRESHOLD) + goto candidate_del; + + /** + * check if we have another candidate with the same mac address or + * interface. If we do, we won't select this candidate because of + * possible interference. + */ + hlist_for_each_entry_rcu(tmp_neigh_node, node, + &orig_node->neigh_list, list) { + + if (tmp_neigh_node == neigh_node) + continue; + + /* we only care if the other candidate is even + * considered as candidate. 
*/ + if (list_empty(&tmp_neigh_node->bonding_list)) + continue; + + if ((neigh_node->if_incoming == tmp_neigh_node->if_incoming) || + (compare_eth(neigh_node->addr, tmp_neigh_node->addr))) { + interference_candidate = 1; + break; + } + } + + /* don't care further if it is an interference candidate */ + if (interference_candidate) + goto candidate_del; + + /* this neighbor already is part of our candidate list */ + if (!list_empty(&neigh_node->bonding_list)) + goto out; + + if (!atomic_inc_not_zero(&neigh_node->refcount)) + goto out; + + list_add_rcu(&neigh_node->bonding_list, &orig_node->bond_list); + atomic_inc(&orig_node->bond_candidates); + goto out; + +candidate_del: + bonding_candidate_del(orig_node, neigh_node); + +out: + spin_unlock_bh(&orig_node->neigh_list_lock); + + if (router) + neigh_node_free_ref(router); +} + +/* copy primary address for bonding */ +static void bonding_save_primary(struct orig_node *orig_node, + struct orig_node *orig_neigh_node, + struct batman_packet *batman_packet) +{ + if (!(batman_packet->flags & PRIMARIES_FIRST_HOP)) + return; + + memcpy(orig_neigh_node->primary_addr, orig_node->orig, ETH_ALEN); +} + +static void update_orig(struct bat_priv *bat_priv, + struct orig_node *orig_node, + struct ethhdr *ethhdr, + struct batman_packet *batman_packet, + struct hard_iface *if_incoming, + unsigned char *tt_buff, int tt_buff_len, + char is_duplicate) +{ + struct neigh_node *neigh_node = NULL, *tmp_neigh_node = NULL; + struct neigh_node *router = NULL; + struct orig_node *orig_node_tmp; + struct hlist_node *node; + int tmp_tt_buff_len; + uint8_t bcast_own_sum_orig, bcast_own_sum_neigh; + + bat_dbg(DBG_BATMAN, bat_priv, "update_originator(): " + "Searching and updating originator entry of received packet\n"); + + rcu_read_lock(); + hlist_for_each_entry_rcu(tmp_neigh_node, node, + &orig_node->neigh_list, list) { + if (compare_eth(tmp_neigh_node->addr, ethhdr->h_source) && + (tmp_neigh_node->if_incoming == if_incoming) && + atomic_inc_not_zero(&tmp_neigh_node->refcount)) { + if (neigh_node) + neigh_node_free_ref(neigh_node); + neigh_node = tmp_neigh_node; + continue; + } + + if (is_duplicate) + continue; + + spin_lock_bh(&tmp_neigh_node->tq_lock); + ring_buffer_set(tmp_neigh_node->tq_recv, + &tmp_neigh_node->tq_index, 0); + tmp_neigh_node->tq_avg = + ring_buffer_avg(tmp_neigh_node->tq_recv); + spin_unlock_bh(&tmp_neigh_node->tq_lock); + } + + if (!neigh_node) { + struct orig_node *orig_tmp; + + orig_tmp = get_orig_node(bat_priv, ethhdr->h_source); + if (!orig_tmp) + goto unlock; + + neigh_node = create_neighbor(orig_node, orig_tmp, + ethhdr->h_source, if_incoming); + + orig_node_free_ref(orig_tmp); + if (!neigh_node) + goto unlock; + } else + bat_dbg(DBG_BATMAN, bat_priv, + "Updating existing last-hop neighbor of originator\n"); + + rcu_read_unlock(); + + orig_node->flags = batman_packet->flags; + neigh_node->last_valid = jiffies; + + spin_lock_bh(&neigh_node->tq_lock); + ring_buffer_set(neigh_node->tq_recv, + &neigh_node->tq_index, + batman_packet->tq); + neigh_node->tq_avg = ring_buffer_avg(neigh_node->tq_recv); + spin_unlock_bh(&neigh_node->tq_lock); + + if (!is_duplicate) { + orig_node->last_ttl = batman_packet->ttl; + neigh_node->last_ttl = batman_packet->ttl; + } + + bonding_candidate_add(orig_node, neigh_node); + + tmp_tt_buff_len = (tt_buff_len > batman_packet->num_tt * ETH_ALEN ? 
+ batman_packet->num_tt * ETH_ALEN : tt_buff_len); + + /* if this neighbor already is our next hop there is nothing + * to change */ + router = orig_node_get_router(orig_node); + if (router == neigh_node) + goto update_tt; + + /* if this neighbor does not offer a better TQ we won't consider it */ + if (router && (router->tq_avg > neigh_node->tq_avg)) + goto update_tt; + + /* if the TQ is the same and the link not more symetric we + * won't consider it either */ + if (router && (neigh_node->tq_avg == router->tq_avg)) { + orig_node_tmp = router->orig_node; + spin_lock_bh(&orig_node_tmp->ogm_cnt_lock); + bcast_own_sum_orig = + orig_node_tmp->bcast_own_sum[if_incoming->if_num]; + spin_unlock_bh(&orig_node_tmp->ogm_cnt_lock); + + orig_node_tmp = neigh_node->orig_node; + spin_lock_bh(&orig_node_tmp->ogm_cnt_lock); + bcast_own_sum_neigh = + orig_node_tmp->bcast_own_sum[if_incoming->if_num]; + spin_unlock_bh(&orig_node_tmp->ogm_cnt_lock); + + if (bcast_own_sum_orig >= bcast_own_sum_neigh) + goto update_tt; + } + + update_routes(bat_priv, orig_node, neigh_node, + tt_buff, tmp_tt_buff_len); + goto update_gw; + +update_tt: + update_routes(bat_priv, orig_node, router, + tt_buff, tmp_tt_buff_len); + +update_gw: + if (orig_node->gw_flags != batman_packet->gw_flags) + gw_node_update(bat_priv, orig_node, batman_packet->gw_flags); + + orig_node->gw_flags = batman_packet->gw_flags; + + /* restart gateway selection if fast or late switching was enabled */ + if ((orig_node->gw_flags) && + (atomic_read(&bat_priv->gw_mode) == GW_MODE_CLIENT) && + (atomic_read(&bat_priv->gw_sel_class) > 2)) + gw_check_election(bat_priv, orig_node); + + goto out; + +unlock: + rcu_read_unlock(); +out: + if (neigh_node) + neigh_node_free_ref(neigh_node); + if (router) + neigh_node_free_ref(router); +} + +/* checks whether the host restarted and is in the protection time. + * returns: + * 0 if the packet is to be accepted + * 1 if the packet is to be ignored. + */ +static int window_protected(struct bat_priv *bat_priv, + int32_t seq_num_diff, + unsigned long *last_reset) +{ + if ((seq_num_diff <= -TQ_LOCAL_WINDOW_SIZE) + || (seq_num_diff >= EXPECTED_SEQNO_RANGE)) { + if (time_after(jiffies, *last_reset + + msecs_to_jiffies(RESET_PROTECTION_MS))) { + + *last_reset = jiffies; + bat_dbg(DBG_BATMAN, bat_priv, + "old packet received, start protection\n"); + + return 0; + } else + return 1; + } + return 0; +} + +/* processes a batman packet for all interfaces, adjusts the sequence number and + * finds out whether it is a duplicate. + * returns: + * 1 the packet is a duplicate + * 0 the packet has not yet been received + * -1 the packet is old and has been received while the seqno window + * was protected. Caller should drop it. + */ +static char count_real_packets(struct ethhdr *ethhdr, + struct batman_packet *batman_packet, + struct hard_iface *if_incoming) +{ + struct bat_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct orig_node *orig_node; + struct neigh_node *tmp_neigh_node; + struct hlist_node *node; + char is_duplicate = 0; + int32_t seq_diff; + int need_update = 0; + int set_mark, ret = -1; + + orig_node = get_orig_node(bat_priv, batman_packet->orig); + if (!orig_node) + return 0; + + spin_lock_bh(&orig_node->ogm_cnt_lock); + seq_diff = batman_packet->seqno - orig_node->last_real_seqno; + + /* signalize caller that the packet is to be dropped. 
*/ + if (window_protected(bat_priv, seq_diff, + &orig_node->batman_seqno_reset)) + goto out; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tmp_neigh_node, node, + &orig_node->neigh_list, list) { + + is_duplicate |= get_bit_status(tmp_neigh_node->real_bits, + orig_node->last_real_seqno, + batman_packet->seqno); + + if (compare_eth(tmp_neigh_node->addr, ethhdr->h_source) && + (tmp_neigh_node->if_incoming == if_incoming)) + set_mark = 1; + else + set_mark = 0; + + /* if the window moved, set the update flag. */ + need_update |= bit_get_packet(bat_priv, + tmp_neigh_node->real_bits, + seq_diff, set_mark); + + tmp_neigh_node->real_packet_count = + bit_packet_count(tmp_neigh_node->real_bits); + } + rcu_read_unlock(); + + if (need_update) { + bat_dbg(DBG_BATMAN, bat_priv, + "updating last_seqno: old %d, new %d\n", + orig_node->last_real_seqno, batman_packet->seqno); + orig_node->last_real_seqno = batman_packet->seqno; + } + + ret = is_duplicate; + +out: + spin_unlock_bh(&orig_node->ogm_cnt_lock); + orig_node_free_ref(orig_node); + return ret; +} + +void receive_bat_packet(struct ethhdr *ethhdr, + struct batman_packet *batman_packet, + unsigned char *tt_buff, int tt_buff_len, + struct hard_iface *if_incoming) +{ + struct bat_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct hard_iface *hard_iface; + struct orig_node *orig_neigh_node, *orig_node; + struct neigh_node *router = NULL, *router_router = NULL; + struct neigh_node *orig_neigh_router = NULL; + char has_directlink_flag; + char is_my_addr = 0, is_my_orig = 0, is_my_oldorig = 0; + char is_broadcast = 0, is_bidirectional, is_single_hop_neigh; + char is_duplicate; + uint32_t if_incoming_seqno; + + /* Silently drop when the batman packet is actually not a + * correct packet. + * + * This might happen if a packet is padded (e.g. Ethernet has a + * minimum frame length of 64 byte) and the aggregation interprets + * it as an additional length. + * + * TODO: A more sane solution would be to have a bit in the + * batman_packet to detect whether the packet is the last + * packet in an aggregation. Here we expect that the padding + * is always zero (or not 0x01) + */ + if (batman_packet->packet_type != BAT_PACKET) + return; + + /* could be changed by schedule_own_packet() */ + if_incoming_seqno = atomic_read(&if_incoming->seqno); + + has_directlink_flag = (batman_packet->flags & DIRECTLINK ? 1 : 0); + + is_single_hop_neigh = (compare_eth(ethhdr->h_source, + batman_packet->orig) ? 
1 : 0); + + bat_dbg(DBG_BATMAN, bat_priv, + "Received BATMAN packet via NB: %pM, IF: %s [%pM] " + "(from OG: %pM, via prev OG: %pM, seqno %d, tq %d, " + "TTL %d, V %d, IDF %d)\n", + ethhdr->h_source, if_incoming->net_dev->name, + if_incoming->net_dev->dev_addr, batman_packet->orig, + batman_packet->prev_sender, batman_packet->seqno, + batman_packet->tq, batman_packet->ttl, batman_packet->version, + has_directlink_flag); + + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &hardif_list, list) { + if (hard_iface->if_status != IF_ACTIVE) + continue; + + if (hard_iface->soft_iface != if_incoming->soft_iface) + continue; + + if (compare_eth(ethhdr->h_source, + hard_iface->net_dev->dev_addr)) + is_my_addr = 1; + + if (compare_eth(batman_packet->orig, + hard_iface->net_dev->dev_addr)) + is_my_orig = 1; + + if (compare_eth(batman_packet->prev_sender, + hard_iface->net_dev->dev_addr)) + is_my_oldorig = 1; + + if (compare_eth(ethhdr->h_source, broadcast_addr)) + is_broadcast = 1; + } + rcu_read_unlock(); + + if (batman_packet->version != COMPAT_VERSION) { + bat_dbg(DBG_BATMAN, bat_priv, + "Drop packet: incompatible batman version (%i)\n", + batman_packet->version); + return; + } + + if (is_my_addr) { + bat_dbg(DBG_BATMAN, bat_priv, + "Drop packet: received my own broadcast (sender: %pM" + ")\n", + ethhdr->h_source); + return; + } + + if (is_broadcast) { + bat_dbg(DBG_BATMAN, bat_priv, "Drop packet: " + "ignoring all packets with broadcast source addr (sender: %pM" + ")\n", ethhdr->h_source); + return; + } + + if (is_my_orig) { + unsigned long *word; + int offset; + + orig_neigh_node = get_orig_node(bat_priv, ethhdr->h_source); + if (!orig_neigh_node) + return; + + /* neighbor has to indicate direct link and it has to + * come via the corresponding interface */ + /* if received seqno equals last send seqno save new + * seqno for bidirectional check */ + if (has_directlink_flag && + compare_eth(if_incoming->net_dev->dev_addr, + batman_packet->orig) && + (batman_packet->seqno - if_incoming_seqno + 2 == 0)) { + offset = if_incoming->if_num * NUM_WORDS; + + spin_lock_bh(&orig_neigh_node->ogm_cnt_lock); + word = &(orig_neigh_node->bcast_own[offset]); + bit_mark(word, 0); + orig_neigh_node->bcast_own_sum[if_incoming->if_num] = + bit_packet_count(word); + spin_unlock_bh(&orig_neigh_node->ogm_cnt_lock); + } + + bat_dbg(DBG_BATMAN, bat_priv, "Drop packet: " + "originator packet from myself (via neighbor)\n"); + orig_node_free_ref(orig_neigh_node); + return; + } + + if (is_my_oldorig) { + bat_dbg(DBG_BATMAN, bat_priv, + "Drop packet: ignoring all rebroadcast echos (sender: " + "%pM)\n", ethhdr->h_source); + return; + } + + orig_node = get_orig_node(bat_priv, batman_packet->orig); + if (!orig_node) + return; + + is_duplicate = count_real_packets(ethhdr, batman_packet, if_incoming); + + if (is_duplicate == -1) { + bat_dbg(DBG_BATMAN, bat_priv, + "Drop packet: packet within seqno protection time " + "(sender: %pM)\n", ethhdr->h_source); + goto out; + } + + if (batman_packet->tq == 0) { + bat_dbg(DBG_BATMAN, bat_priv, + "Drop packet: originator packet with tq equal 0\n"); + goto out; + } + + router = orig_node_get_router(orig_node); + if (router) + router_router = orig_node_get_router(router->orig_node); + + /* avoid temporary routing loops */ + if (router && router_router && + (compare_eth(router->addr, batman_packet->prev_sender)) && + !(compare_eth(batman_packet->orig, batman_packet->prev_sender)) && + (compare_eth(router->addr, router_router->addr))) { + bat_dbg(DBG_BATMAN, bat_priv, + "Drop packet: 
ignoring all rebroadcast packets that " + "may make me loop (sender: %pM)\n", ethhdr->h_source); + goto out; + } + + /* if sender is a direct neighbor the sender mac equals + * originator mac */ + orig_neigh_node = (is_single_hop_neigh ? + orig_node : + get_orig_node(bat_priv, ethhdr->h_source)); + if (!orig_neigh_node) + goto out; + + orig_neigh_router = orig_node_get_router(orig_neigh_node); + + /* drop packet if sender is not a direct neighbor and if we + * don't route towards it */ + if (!is_single_hop_neigh && (!orig_neigh_router)) { + bat_dbg(DBG_BATMAN, bat_priv, + "Drop packet: OGM via unknown neighbor!\n"); + goto out_neigh; + } + + is_bidirectional = is_bidirectional_neigh(orig_node, orig_neigh_node, + batman_packet, if_incoming); + + bonding_save_primary(orig_node, orig_neigh_node, batman_packet); + + /* update ranking if it is not a duplicate or has the same + * seqno and similar ttl as the non-duplicate */ + if (is_bidirectional && + (!is_duplicate || + ((orig_node->last_real_seqno == batman_packet->seqno) && + (orig_node->last_ttl - 3 <= batman_packet->ttl)))) + update_orig(bat_priv, orig_node, ethhdr, batman_packet, + if_incoming, tt_buff, tt_buff_len, is_duplicate); + + /* is single hop (direct) neighbor */ + if (is_single_hop_neigh) { + + /* mark direct link on incoming interface */ + schedule_forward_packet(orig_node, ethhdr, batman_packet, + 1, tt_buff_len, if_incoming); + + bat_dbg(DBG_BATMAN, bat_priv, "Forwarding packet: " + "rebroadcast neighbor packet with direct link flag\n"); + goto out_neigh; + } + + /* multihop originator */ + if (!is_bidirectional) { + bat_dbg(DBG_BATMAN, bat_priv, + "Drop packet: not received via bidirectional link\n"); + goto out_neigh; + } + + if (is_duplicate) { + bat_dbg(DBG_BATMAN, bat_priv, + "Drop packet: duplicate packet received\n"); + goto out_neigh; + } + + bat_dbg(DBG_BATMAN, bat_priv, + "Forwarding packet: rebroadcast originator packet\n"); + schedule_forward_packet(orig_node, ethhdr, batman_packet, + 0, tt_buff_len, if_incoming); + +out_neigh: + if ((orig_neigh_node) && (!is_single_hop_neigh)) + orig_node_free_ref(orig_neigh_node); +out: + if (router) + neigh_node_free_ref(router); + if (router_router) + neigh_node_free_ref(router_router); + if (orig_neigh_router) + neigh_node_free_ref(orig_neigh_router); + + orig_node_free_ref(orig_node); +} + +int recv_bat_packet(struct sk_buff *skb, struct hard_iface *hard_iface) +{ + struct ethhdr *ethhdr; + + /* drop packet if it has not necessary minimum size */ + if (unlikely(!pskb_may_pull(skb, sizeof(struct batman_packet)))) + return NET_RX_DROP; + + ethhdr = (struct ethhdr *)skb_mac_header(skb); + + /* packet with broadcast indication but unicast recipient */ + if (!is_broadcast_ether_addr(ethhdr->h_dest)) + return NET_RX_DROP; + + /* packet with broadcast sender address */ + if (is_broadcast_ether_addr(ethhdr->h_source)) + return NET_RX_DROP; + + /* create a copy of the skb, if needed, to modify it. 
*/ + if (skb_cow(skb, 0) < 0) + return NET_RX_DROP; + + /* keep skb linear */ + if (skb_linearize(skb) < 0) + return NET_RX_DROP; + + ethhdr = (struct ethhdr *)skb_mac_header(skb); + + receive_aggr_bat_packet(ethhdr, + skb->data, + skb_headlen(skb), + hard_iface); + + kfree_skb(skb); + return NET_RX_SUCCESS; +} + +static int recv_my_icmp_packet(struct bat_priv *bat_priv, + struct sk_buff *skb, size_t icmp_len) +{ + struct hard_iface *primary_if = NULL; + struct orig_node *orig_node = NULL; + struct neigh_node *router = NULL; + struct icmp_packet_rr *icmp_packet; + int ret = NET_RX_DROP; + + icmp_packet = (struct icmp_packet_rr *)skb->data; + + /* add data to device queue */ + if (icmp_packet->msg_type != ECHO_REQUEST) { + bat_socket_receive_packet(icmp_packet, icmp_len); + goto out; + } + + primary_if = primary_if_get_selected(bat_priv); + if (!primary_if) + goto out; + + /* answer echo request (ping) */ + /* get routing information */ + orig_node = orig_hash_find(bat_priv, icmp_packet->orig); + if (!orig_node) + goto out; + + router = orig_node_get_router(orig_node); + if (!router) + goto out; + + /* create a copy of the skb, if needed, to modify it. */ + if (skb_cow(skb, sizeof(struct ethhdr)) < 0) + goto out; + + icmp_packet = (struct icmp_packet_rr *)skb->data; + + memcpy(icmp_packet->dst, icmp_packet->orig, ETH_ALEN); + memcpy(icmp_packet->orig, primary_if->net_dev->dev_addr, ETH_ALEN); + icmp_packet->msg_type = ECHO_REPLY; + icmp_packet->ttl = TTL; + + send_skb_packet(skb, router->if_incoming, router->addr); + ret = NET_RX_SUCCESS; + +out: + if (primary_if) + hardif_free_ref(primary_if); + if (router) + neigh_node_free_ref(router); + if (orig_node) + orig_node_free_ref(orig_node); + return ret; +} + +static int recv_icmp_ttl_exceeded(struct bat_priv *bat_priv, + struct sk_buff *skb) +{ + struct hard_iface *primary_if = NULL; + struct orig_node *orig_node = NULL; + struct neigh_node *router = NULL; + struct icmp_packet *icmp_packet; + int ret = NET_RX_DROP; + + icmp_packet = (struct icmp_packet *)skb->data; + + /* send TTL exceeded if packet is an echo request (traceroute) */ + if (icmp_packet->msg_type != ECHO_REQUEST) { + pr_debug("Warning - can't forward icmp packet from %pM to " + "%pM: ttl exceeded\n", icmp_packet->orig, + icmp_packet->dst); + goto out; + } + + primary_if = primary_if_get_selected(bat_priv); + if (!primary_if) + goto out; + + /* get routing information */ + orig_node = orig_hash_find(bat_priv, icmp_packet->orig); + if (!orig_node) + goto out; + + router = orig_node_get_router(orig_node); + if (!router) + goto out; + + /* create a copy of the skb, if needed, to modify it. 
*/ + if (skb_cow(skb, sizeof(struct ethhdr)) < 0) + goto out; + + icmp_packet = (struct icmp_packet *)skb->data; + + memcpy(icmp_packet->dst, icmp_packet->orig, ETH_ALEN); + memcpy(icmp_packet->orig, primary_if->net_dev->dev_addr, ETH_ALEN); + icmp_packet->msg_type = TTL_EXCEEDED; + icmp_packet->ttl = TTL; + + send_skb_packet(skb, router->if_incoming, router->addr); + ret = NET_RX_SUCCESS; + +out: + if (primary_if) + hardif_free_ref(primary_if); + if (router) + neigh_node_free_ref(router); + if (orig_node) + orig_node_free_ref(orig_node); + return ret; +} + + +int recv_icmp_packet(struct sk_buff *skb, struct hard_iface *recv_if) +{ + struct bat_priv *bat_priv = netdev_priv(recv_if->soft_iface); + struct icmp_packet_rr *icmp_packet; + struct ethhdr *ethhdr; + struct orig_node *orig_node = NULL; + struct neigh_node *router = NULL; + int hdr_size = sizeof(struct icmp_packet); + int ret = NET_RX_DROP; + + /** + * we truncate all incoming icmp packets if they don't match our size + */ + if (skb->len >= sizeof(struct icmp_packet_rr)) + hdr_size = sizeof(struct icmp_packet_rr); + + /* drop packet if it has not necessary minimum size */ + if (unlikely(!pskb_may_pull(skb, hdr_size))) + goto out; + + ethhdr = (struct ethhdr *)skb_mac_header(skb); + + /* packet with unicast indication but broadcast recipient */ + if (is_broadcast_ether_addr(ethhdr->h_dest)) + goto out; + + /* packet with broadcast sender address */ + if (is_broadcast_ether_addr(ethhdr->h_source)) + goto out; + + /* not for me */ + if (!is_my_mac(ethhdr->h_dest)) + goto out; + + icmp_packet = (struct icmp_packet_rr *)skb->data; + + /* add record route information if not full */ + if ((hdr_size == sizeof(struct icmp_packet_rr)) && + (icmp_packet->rr_cur < BAT_RR_LEN)) { + memcpy(&(icmp_packet->rr[icmp_packet->rr_cur]), + ethhdr->h_dest, ETH_ALEN); + icmp_packet->rr_cur++; + } + + /* packet for me */ + if (is_my_mac(icmp_packet->dst)) + return recv_my_icmp_packet(bat_priv, skb, hdr_size); + + /* TTL exceeded */ + if (icmp_packet->ttl < 2) + return recv_icmp_ttl_exceeded(bat_priv, skb); + + /* get routing information */ + orig_node = orig_hash_find(bat_priv, icmp_packet->dst); + if (!orig_node) + goto out; + + router = orig_node_get_router(orig_node); + if (!router) + goto out; + + /* create a copy of the skb, if needed, to modify it. */ + if (skb_cow(skb, sizeof(struct ethhdr)) < 0) + goto out; + + icmp_packet = (struct icmp_packet_rr *)skb->data; + + /* decrement ttl */ + icmp_packet->ttl--; + + /* route it */ + send_skb_packet(skb, router->if_incoming, router->addr); + ret = NET_RX_SUCCESS; + +out: + if (router) + neigh_node_free_ref(router); + if (orig_node) + orig_node_free_ref(orig_node); + return ret; +} + +/* In the bonding case, send the packets in a round + * robin fashion over the remaining interfaces. + * + * This method rotates the bonding list and increases the + * returned router's refcount. */ +static struct neigh_node *find_bond_router(struct orig_node *primary_orig, + struct hard_iface *recv_if) +{ + struct neigh_node *tmp_neigh_node; + struct neigh_node *router = NULL, *first_candidate = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(tmp_neigh_node, &primary_orig->bond_list, + bonding_list) { + if (!first_candidate) + first_candidate = tmp_neigh_node; + + /* recv_if == NULL on the first node. */ + if (tmp_neigh_node->if_incoming == recv_if) + continue; + + if (!atomic_inc_not_zero(&tmp_neigh_node->refcount)) + continue; + + router = tmp_neigh_node; + break; + } + + /* use the first candidate if nothing was found. 
*/ + if (!router && first_candidate && + atomic_inc_not_zero(&first_candidate->refcount)) + router = first_candidate; + + if (!router) + goto out; + + /* selected should point to the next element + * after the current router */ + spin_lock_bh(&primary_orig->neigh_list_lock); + /* this is a list_move(), which unfortunately + * does not exist as rcu version */ + list_del_rcu(&primary_orig->bond_list); + list_add_rcu(&primary_orig->bond_list, + &router->bonding_list); + spin_unlock_bh(&primary_orig->neigh_list_lock); + +out: + rcu_read_unlock(); + return router; +} + +/* Interface Alternating: Use the best of the + * remaining candidates which are not using + * this interface. + * + * Increases the returned router's refcount */ +static struct neigh_node *find_ifalter_router(struct orig_node *primary_orig, + struct hard_iface *recv_if) +{ + struct neigh_node *tmp_neigh_node; + struct neigh_node *router = NULL, *first_candidate = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(tmp_neigh_node, &primary_orig->bond_list, + bonding_list) { + if (!first_candidate) + first_candidate = tmp_neigh_node; + + /* recv_if == NULL on the first node. */ + if (tmp_neigh_node->if_incoming == recv_if) + continue; + + if (!atomic_inc_not_zero(&tmp_neigh_node->refcount)) + continue; + + /* if we don't have a router yet + * or this one is better, choose it. */ + if ((!router) || + (tmp_neigh_node->tq_avg > router->tq_avg)) { + /* decrement refcount of + * previously selected router */ + if (router) + neigh_node_free_ref(router); + + router = tmp_neigh_node; + atomic_inc_not_zero(&router->refcount); + } + + neigh_node_free_ref(tmp_neigh_node); + } + + /* use the first candidate if nothing was found. */ + if (!router && first_candidate && + atomic_inc_not_zero(&first_candidate->refcount)) + router = first_candidate; + + rcu_read_unlock(); + return router; +} + +/* find a suitable router for this originator, and use + * bonding if possible. increases the found neighbors + * refcount.*/ +struct neigh_node *find_router(struct bat_priv *bat_priv, + struct orig_node *orig_node, + struct hard_iface *recv_if) +{ + struct orig_node *primary_orig_node; + struct orig_node *router_orig; + struct neigh_node *router; + static uint8_t zero_mac[ETH_ALEN] = {0, 0, 0, 0, 0, 0}; + int bonding_enabled; + + if (!orig_node) + return NULL; + + router = orig_node_get_router(orig_node); + if (!router) + goto err; + + /* without bonding, the first node should + * always choose the default router. */ + bonding_enabled = atomic_read(&bat_priv->bonding); + + rcu_read_lock(); + /* select default router to output */ + router_orig = router->orig_node; + if (!router_orig) + goto err_unlock; + + if ((!recv_if) && (!bonding_enabled)) + goto return_router; + + /* if we have something in the primary_addr, we can search + * for a potential bonding candidate. */ + if (compare_eth(router_orig->primary_addr, zero_mac)) + goto return_router; + + /* find the orig_node which has the primary interface. might + * even be the same as our router_orig in many cases */ + + if (compare_eth(router_orig->primary_addr, router_orig->orig)) { + primary_orig_node = router_orig; + } else { + primary_orig_node = orig_hash_find(bat_priv, + router_orig->primary_addr); + if (!primary_orig_node) + goto return_router; + + orig_node_free_ref(primary_orig_node); + } + + /* with less than 2 candidates, we can't do any + * bonding and prefer the original router. 
*/ + if (atomic_read(&primary_orig_node->bond_candidates) < 2) + goto return_router; + + /* all nodes between should choose a candidate which + * is is not on the interface where the packet came + * in. */ + + neigh_node_free_ref(router); + + if (bonding_enabled) + router = find_bond_router(primary_orig_node, recv_if); + else + router = find_ifalter_router(primary_orig_node, recv_if); + +return_router: + rcu_read_unlock(); + return router; +err_unlock: + rcu_read_unlock(); +err: + if (router) + neigh_node_free_ref(router); + return NULL; +} + +static int check_unicast_packet(struct sk_buff *skb, int hdr_size) +{ + struct ethhdr *ethhdr; + + /* drop packet if it has not necessary minimum size */ + if (unlikely(!pskb_may_pull(skb, hdr_size))) + return -1; + + ethhdr = (struct ethhdr *)skb_mac_header(skb); + + /* packet with unicast indication but broadcast recipient */ + if (is_broadcast_ether_addr(ethhdr->h_dest)) + return -1; + + /* packet with broadcast sender address */ + if (is_broadcast_ether_addr(ethhdr->h_source)) + return -1; + + /* not for me */ + if (!is_my_mac(ethhdr->h_dest)) + return -1; + + return 0; +} + +int route_unicast_packet(struct sk_buff *skb, struct hard_iface *recv_if) +{ + struct bat_priv *bat_priv = netdev_priv(recv_if->soft_iface); + struct orig_node *orig_node = NULL; + struct neigh_node *neigh_node = NULL; + struct unicast_packet *unicast_packet; + struct ethhdr *ethhdr = (struct ethhdr *)skb_mac_header(skb); + int ret = NET_RX_DROP; + struct sk_buff *new_skb; + + unicast_packet = (struct unicast_packet *)skb->data; + + /* TTL exceeded */ + if (unicast_packet->ttl < 2) { + pr_debug("Warning - can't forward unicast packet from %pM to " + "%pM: ttl exceeded\n", ethhdr->h_source, + unicast_packet->dest); + goto out; + } + + /* get routing information */ + orig_node = orig_hash_find(bat_priv, unicast_packet->dest); + + if (!orig_node) + goto out; + + /* find_router() increases neigh_nodes refcount if found. */ + neigh_node = find_router(bat_priv, orig_node, recv_if); + + if (!neigh_node) + goto out; + + /* create a copy of the skb, if needed, to modify it. 
*/ + if (skb_cow(skb, sizeof(struct ethhdr)) < 0) + goto out; + + unicast_packet = (struct unicast_packet *)skb->data; + + if (unicast_packet->packet_type == BAT_UNICAST && + atomic_read(&bat_priv->fragmentation) && + skb->len > neigh_node->if_incoming->net_dev->mtu) { + ret = frag_send_skb(skb, bat_priv, + neigh_node->if_incoming, neigh_node->addr); + goto out; + } + + if (unicast_packet->packet_type == BAT_UNICAST_FRAG && + frag_can_reassemble(skb, neigh_node->if_incoming->net_dev->mtu)) { + + ret = frag_reassemble_skb(skb, bat_priv, &new_skb); + + if (ret == NET_RX_DROP) + goto out; + + /* packet was buffered for late merge */ + if (!new_skb) { + ret = NET_RX_SUCCESS; + goto out; + } + + skb = new_skb; + unicast_packet = (struct unicast_packet *)skb->data; + } + + /* decrement ttl */ + unicast_packet->ttl--; + + /* route it */ + send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr); + ret = NET_RX_SUCCESS; + +out: + if (neigh_node) + neigh_node_free_ref(neigh_node); + if (orig_node) + orig_node_free_ref(orig_node); + return ret; +} + +int recv_unicast_packet(struct sk_buff *skb, struct hard_iface *recv_if) +{ + struct unicast_packet *unicast_packet; + int hdr_size = sizeof(struct unicast_packet); + + if (check_unicast_packet(skb, hdr_size) < 0) + return NET_RX_DROP; + + unicast_packet = (struct unicast_packet *)skb->data; + + /* packet for me */ + if (is_my_mac(unicast_packet->dest)) { + interface_rx(recv_if->soft_iface, skb, recv_if, hdr_size); + return NET_RX_SUCCESS; + } + + return route_unicast_packet(skb, recv_if); +} + +int recv_ucast_frag_packet(struct sk_buff *skb, struct hard_iface *recv_if) +{ + struct bat_priv *bat_priv = netdev_priv(recv_if->soft_iface); + struct unicast_frag_packet *unicast_packet; + int hdr_size = sizeof(struct unicast_frag_packet); + struct sk_buff *new_skb = NULL; + int ret; + + if (check_unicast_packet(skb, hdr_size) < 0) + return NET_RX_DROP; + + unicast_packet = (struct unicast_frag_packet *)skb->data; + + /* packet for me */ + if (is_my_mac(unicast_packet->dest)) { + + ret = frag_reassemble_skb(skb, bat_priv, &new_skb); + + if (ret == NET_RX_DROP) + return NET_RX_DROP; + + /* packet was buffered for late merge */ + if (!new_skb) + return NET_RX_SUCCESS; + + interface_rx(recv_if->soft_iface, new_skb, recv_if, + sizeof(struct unicast_packet)); + return NET_RX_SUCCESS; + } + + return route_unicast_packet(skb, recv_if); +} + + +int recv_bcast_packet(struct sk_buff *skb, struct hard_iface *recv_if) +{ + struct bat_priv *bat_priv = netdev_priv(recv_if->soft_iface); + struct orig_node *orig_node = NULL; + struct bcast_packet *bcast_packet; + struct ethhdr *ethhdr; + int hdr_size = sizeof(struct bcast_packet); + int ret = NET_RX_DROP; + int32_t seq_diff; + + /* drop packet if it has not necessary minimum size */ + if (unlikely(!pskb_may_pull(skb, hdr_size))) + goto out; + + ethhdr = (struct ethhdr *)skb_mac_header(skb); + + /* packet with broadcast indication but unicast recipient */ + if (!is_broadcast_ether_addr(ethhdr->h_dest)) + goto out; + + /* packet with broadcast sender address */ + if (is_broadcast_ether_addr(ethhdr->h_source)) + goto out; + + /* ignore broadcasts sent by myself */ + if (is_my_mac(ethhdr->h_source)) + goto out; + + bcast_packet = (struct bcast_packet *)skb->data; + + /* ignore broadcasts originated by myself */ + if (is_my_mac(bcast_packet->orig)) + goto out; + + if (bcast_packet->ttl < 2) + goto out; + + orig_node = orig_hash_find(bat_priv, bcast_packet->orig); + + if (!orig_node) + goto out; + + 
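+ /* Duplicate and replay protection for rebroadcasts: under
+ * bcast_seqno_lock the originator's sliding sequence-number window
+ * (bcast_bits) is consulted - a seqno already marked in the window is
+ * dropped as a duplicate, a very old seqno takes the window-protection
+ * (restarted host) path, and a new seqno is marked in the flood
+ * history and may advance last_bcast_seqno. */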
spin_lock_bh(&orig_node->bcast_seqno_lock); + + /* check whether the packet is a duplicate */ + if (get_bit_status(orig_node->bcast_bits, orig_node->last_bcast_seqno, + ntohl(bcast_packet->seqno))) + goto spin_unlock; + + seq_diff = ntohl(bcast_packet->seqno) - orig_node->last_bcast_seqno; + + /* check whether the packet is old and the host just restarted. */ + if (window_protected(bat_priv, seq_diff, + &orig_node->bcast_seqno_reset)) + goto spin_unlock; + + /* mark broadcast in flood history, update window position + * if required. */ + if (bit_get_packet(bat_priv, orig_node->bcast_bits, seq_diff, 1)) + orig_node->last_bcast_seqno = ntohl(bcast_packet->seqno); + + spin_unlock_bh(&orig_node->bcast_seqno_lock); + + /* rebroadcast packet */ + add_bcast_packet_to_list(bat_priv, skb); + + /* broadcast for me */ + interface_rx(recv_if->soft_iface, skb, recv_if, hdr_size); + ret = NET_RX_SUCCESS; + goto out; + +spin_unlock: + spin_unlock_bh(&orig_node->bcast_seqno_lock); +out: + if (orig_node) + orig_node_free_ref(orig_node); + return ret; +} + +int recv_vis_packet(struct sk_buff *skb, struct hard_iface *recv_if) +{ + struct vis_packet *vis_packet; + struct ethhdr *ethhdr; + struct bat_priv *bat_priv = netdev_priv(recv_if->soft_iface); + int hdr_size = sizeof(struct vis_packet); + + /* keep skb linear */ + if (skb_linearize(skb) < 0) + return NET_RX_DROP; + + if (unlikely(!pskb_may_pull(skb, hdr_size))) + return NET_RX_DROP; + + vis_packet = (struct vis_packet *)skb->data; + ethhdr = (struct ethhdr *)skb_mac_header(skb); + + /* not for me */ + if (!is_my_mac(ethhdr->h_dest)) + return NET_RX_DROP; + + /* ignore own packets */ + if (is_my_mac(vis_packet->vis_orig)) + return NET_RX_DROP; + + if (is_my_mac(vis_packet->sender_orig)) + return NET_RX_DROP; + + switch (vis_packet->vis_type) { + case VIS_TYPE_SERVER_SYNC: + receive_server_sync_packet(bat_priv, vis_packet, + skb_headlen(skb)); + break; + + case VIS_TYPE_CLIENT_UPDATE: + receive_client_update_packet(bat_priv, vis_packet, + skb_headlen(skb)); + break; + + default: /* ignore unknown packet */ + break; + } + + /* We take a copy of the data in the packet, so we should + always free the skbuf. */ + return NET_RX_DROP; +} diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h new file mode 100644 index 00000000..870f2984 --- /dev/null +++ b/net/batman-adv/routing.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#ifndef _NET_BATMAN_ADV_ROUTING_H_ +#define _NET_BATMAN_ADV_ROUTING_H_ + +void slide_own_bcast_window(struct hard_iface *hard_iface); +void receive_bat_packet(struct ethhdr *ethhdr, + struct batman_packet *batman_packet, + unsigned char *tt_buff, int tt_buff_len, + struct hard_iface *if_incoming); +void update_routes(struct bat_priv *bat_priv, struct orig_node *orig_node, + struct neigh_node *neigh_node, unsigned char *tt_buff, + int tt_buff_len); +int route_unicast_packet(struct sk_buff *skb, struct hard_iface *recv_if); +int recv_icmp_packet(struct sk_buff *skb, struct hard_iface *recv_if); +int recv_unicast_packet(struct sk_buff *skb, struct hard_iface *recv_if); +int recv_ucast_frag_packet(struct sk_buff *skb, struct hard_iface *recv_if); +int recv_bcast_packet(struct sk_buff *skb, struct hard_iface *recv_if); +int recv_vis_packet(struct sk_buff *skb, struct hard_iface *recv_if); +int recv_bat_packet(struct sk_buff *skb, struct hard_iface *recv_if); +struct neigh_node *find_router(struct bat_priv *bat_priv, + struct orig_node *orig_node, + struct hard_iface *recv_if); +void bonding_candidate_del(struct orig_node *orig_node, + struct neigh_node *neigh_node); + +#endif /* _NET_BATMAN_ADV_ROUTING_H_ */ diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c new file mode 100644 index 00000000..33779278 --- /dev/null +++ b/net/batman-adv/send.c @@ -0,0 +1,611 @@ +/* + * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#include "main.h" +#include "send.h" +#include "routing.h" +#include "translation-table.h" +#include "soft-interface.h" +#include "hard-interface.h" +#include "vis.h" +#include "aggregation.h" +#include "gateway_common.h" +#include "originator.h" + +static void send_outstanding_bcast_packet(struct work_struct *work); + +/* apply hop penalty for a normal link */ +static uint8_t hop_penalty(const uint8_t tq, struct bat_priv *bat_priv) +{ + int hop_penalty = atomic_read(&bat_priv->hop_penalty); + return (tq * (TQ_MAX_VALUE - hop_penalty)) / (TQ_MAX_VALUE); +} + +/* when do we schedule our own packet to be sent */ +static unsigned long own_send_time(struct bat_priv *bat_priv) +{ + return jiffies + msecs_to_jiffies( + atomic_read(&bat_priv->orig_interval) - + JITTER + (random32() % 2*JITTER)); +} + +/* when do we schedule a forwarded packet to be sent */ +static unsigned long forward_send_time(void) +{ + return jiffies + msecs_to_jiffies(random32() % (JITTER/2)); +} + +/* send out an already prepared packet to the given address via the + * specified batman interface */ +int send_skb_packet(struct sk_buff *skb, + struct hard_iface *hard_iface, + uint8_t *dst_addr) +{ + struct ethhdr *ethhdr; + + if (hard_iface->if_status != IF_ACTIVE) + goto send_skb_err; + + if (unlikely(!hard_iface->net_dev)) + goto send_skb_err; + + if (!(hard_iface->net_dev->flags & IFF_UP)) { + pr_warning("Interface %s is not up - can't send packet via " + "that interface!\n", hard_iface->net_dev->name); + goto send_skb_err; + } + + /* push to the ethernet header. */ + if (my_skb_head_push(skb, sizeof(struct ethhdr)) < 0) + goto send_skb_err; + + skb_reset_mac_header(skb); + + ethhdr = (struct ethhdr *) skb_mac_header(skb); + memcpy(ethhdr->h_source, hard_iface->net_dev->dev_addr, ETH_ALEN); + memcpy(ethhdr->h_dest, dst_addr, ETH_ALEN); + ethhdr->h_proto = __constant_htons(ETH_P_BATMAN); + + skb_set_network_header(skb, ETH_HLEN); + skb->priority = TC_PRIO_CONTROL; + skb->protocol = __constant_htons(ETH_P_BATMAN); + + skb->dev = hard_iface->net_dev; + + /* dev_queue_xmit() returns a negative result on error. However on + * congestion and traffic shaping, it drops and returns NET_XMIT_DROP + * (which is > 0). This will not be treated as an error. */ + + return dev_queue_xmit(skb); +send_skb_err: + kfree_skb(skb); + return NET_XMIT_DROP; +} + +/* Send a packet to a given interface */ +static void send_packet_to_if(struct forw_packet *forw_packet, + struct hard_iface *hard_iface) +{ + struct bat_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + char *fwd_str; + uint8_t packet_num; + int16_t buff_pos; + struct batman_packet *batman_packet; + struct sk_buff *skb; + + if (hard_iface->if_status != IF_ACTIVE) + return; + + packet_num = 0; + buff_pos = 0; + batman_packet = (struct batman_packet *)forw_packet->skb->data; + + /* adjust all flags and log packets */ + while (aggregated_packet(buff_pos, + forw_packet->packet_len, + batman_packet->num_tt)) { + + /* we might have aggregated direct link packets with an + * ordinary base packet */ + if ((forw_packet->direct_link_flags & (1 << packet_num)) && + (forw_packet->if_incoming == hard_iface)) + batman_packet->flags |= DIRECTLINK; + else + batman_packet->flags &= ~DIRECTLINK; + + fwd_str = (packet_num > 0 ? "Forwarding" : (forw_packet->own ? 
+ "Sending own" : + "Forwarding")); + bat_dbg(DBG_BATMAN, bat_priv, + "%s %spacket (originator %pM, seqno %d, TQ %d, TTL %d," + " IDF %s) on interface %s [%pM]\n", + fwd_str, (packet_num > 0 ? "aggregated " : ""), + batman_packet->orig, ntohl(batman_packet->seqno), + batman_packet->tq, batman_packet->ttl, + (batman_packet->flags & DIRECTLINK ? + "on" : "off"), + hard_iface->net_dev->name, + hard_iface->net_dev->dev_addr); + + buff_pos += sizeof(struct batman_packet) + + (batman_packet->num_tt * ETH_ALEN); + packet_num++; + batman_packet = (struct batman_packet *) + (forw_packet->skb->data + buff_pos); + } + + /* create clone because function is called more than once */ + skb = skb_clone(forw_packet->skb, GFP_ATOMIC); + if (skb) + send_skb_packet(skb, hard_iface, broadcast_addr); +} + +/* send a batman packet */ +static void send_packet(struct forw_packet *forw_packet) +{ + struct hard_iface *hard_iface; + struct net_device *soft_iface; + struct bat_priv *bat_priv; + struct batman_packet *batman_packet = + (struct batman_packet *)(forw_packet->skb->data); + unsigned char directlink = (batman_packet->flags & DIRECTLINK ? 1 : 0); + + if (!forw_packet->if_incoming) { + pr_err("Error - can't forward packet: incoming iface not " + "specified\n"); + return; + } + + soft_iface = forw_packet->if_incoming->soft_iface; + bat_priv = netdev_priv(soft_iface); + + if (forw_packet->if_incoming->if_status != IF_ACTIVE) + return; + + /* multihomed peer assumed */ + /* non-primary OGMs are only broadcasted on their interface */ + if ((directlink && (batman_packet->ttl == 1)) || + (forw_packet->own && (forw_packet->if_incoming->if_num > 0))) { + + /* FIXME: what about aggregated packets ? */ + bat_dbg(DBG_BATMAN, bat_priv, + "%s packet (originator %pM, seqno %d, TTL %d) " + "on interface %s [%pM]\n", + (forw_packet->own ? 
"Sending own" : "Forwarding"), + batman_packet->orig, ntohl(batman_packet->seqno), + batman_packet->ttl, + forw_packet->if_incoming->net_dev->name, + forw_packet->if_incoming->net_dev->dev_addr); + + /* skb is only used once and than forw_packet is free'd */ + send_skb_packet(forw_packet->skb, forw_packet->if_incoming, + broadcast_addr); + forw_packet->skb = NULL; + + return; + } + + /* broadcast on every interface */ + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &hardif_list, list) { + if (hard_iface->soft_iface != soft_iface) + continue; + + send_packet_to_if(forw_packet, hard_iface); + } + rcu_read_unlock(); +} + +static void rebuild_batman_packet(struct bat_priv *bat_priv, + struct hard_iface *hard_iface) +{ + int new_len; + unsigned char *new_buff; + struct batman_packet *batman_packet; + + new_len = sizeof(struct batman_packet) + + (bat_priv->num_local_tt * ETH_ALEN); + new_buff = kmalloc(new_len, GFP_ATOMIC); + + /* keep old buffer if kmalloc should fail */ + if (new_buff) { + memcpy(new_buff, hard_iface->packet_buff, + sizeof(struct batman_packet)); + batman_packet = (struct batman_packet *)new_buff; + + batman_packet->num_tt = tt_local_fill_buffer(bat_priv, + new_buff + sizeof(struct batman_packet), + new_len - sizeof(struct batman_packet)); + + kfree(hard_iface->packet_buff); + hard_iface->packet_buff = new_buff; + hard_iface->packet_len = new_len; + } +} + +void schedule_own_packet(struct hard_iface *hard_iface) +{ + struct bat_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + struct hard_iface *primary_if; + unsigned long send_time; + struct batman_packet *batman_packet; + int vis_server; + + if ((hard_iface->if_status == IF_NOT_IN_USE) || + (hard_iface->if_status == IF_TO_BE_REMOVED)) + return; + + vis_server = atomic_read(&bat_priv->vis_mode); + primary_if = primary_if_get_selected(bat_priv); + + /** + * the interface gets activated here to avoid race conditions between + * the moment of activating the interface in + * hardif_activate_interface() where the originator mac is set and + * outdated packets (especially uninitialized mac addresses) in the + * packet queue + */ + if (hard_iface->if_status == IF_TO_BE_ACTIVATED) + hard_iface->if_status = IF_ACTIVE; + + /* if local tt has changed and interface is a primary interface */ + if ((atomic_read(&bat_priv->tt_local_changed)) && + (hard_iface == primary_if)) + rebuild_batman_packet(bat_priv, hard_iface); + + /** + * NOTE: packet_buff might just have been re-allocated in + * rebuild_batman_packet() + */ + batman_packet = (struct batman_packet *)hard_iface->packet_buff; + + /* change sequence number to network order */ + batman_packet->seqno = + htonl((uint32_t)atomic_read(&hard_iface->seqno)); + + if (vis_server == VIS_TYPE_SERVER_SYNC) + batman_packet->flags |= VIS_SERVER; + else + batman_packet->flags &= ~VIS_SERVER; + + if ((hard_iface == primary_if) && + (atomic_read(&bat_priv->gw_mode) == GW_MODE_SERVER)) + batman_packet->gw_flags = + (uint8_t)atomic_read(&bat_priv->gw_bandwidth); + else + batman_packet->gw_flags = 0; + + atomic_inc(&hard_iface->seqno); + + slide_own_bcast_window(hard_iface); + send_time = own_send_time(bat_priv); + add_bat_packet_to_list(bat_priv, + hard_iface->packet_buff, + hard_iface->packet_len, + hard_iface, 1, send_time); + + if (primary_if) + hardif_free_ref(primary_if); +} + +void schedule_forward_packet(struct orig_node *orig_node, + struct ethhdr *ethhdr, + struct batman_packet *batman_packet, + uint8_t directlink, int tt_buff_len, + struct hard_iface *if_incoming) +{ + struct 
bat_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct neigh_node *router; + unsigned char in_tq, in_ttl, tq_avg = 0; + unsigned long send_time; + + if (batman_packet->ttl <= 1) { + bat_dbg(DBG_BATMAN, bat_priv, "ttl exceeded\n"); + return; + } + + router = orig_node_get_router(orig_node); + + in_tq = batman_packet->tq; + in_ttl = batman_packet->ttl; + + batman_packet->ttl--; + memcpy(batman_packet->prev_sender, ethhdr->h_source, ETH_ALEN); + + /* rebroadcast tq of our best ranking neighbor to ensure the rebroadcast + * of our best tq value */ + if (router && router->tq_avg != 0) { + + /* rebroadcast ogm of best ranking neighbor as is */ + if (!compare_eth(router->addr, ethhdr->h_source)) { + batman_packet->tq = router->tq_avg; + + if (router->last_ttl) + batman_packet->ttl = router->last_ttl - 1; + } + + tq_avg = router->tq_avg; + } + + if (router) + neigh_node_free_ref(router); + + /* apply hop penalty */ + batman_packet->tq = hop_penalty(batman_packet->tq, bat_priv); + + bat_dbg(DBG_BATMAN, bat_priv, + "Forwarding packet: tq_orig: %i, tq_avg: %i, " + "tq_forw: %i, ttl_orig: %i, ttl_forw: %i\n", + in_tq, tq_avg, batman_packet->tq, in_ttl - 1, + batman_packet->ttl); + + batman_packet->seqno = htonl(batman_packet->seqno); + + /* switch of primaries first hop flag when forwarding */ + batman_packet->flags &= ~PRIMARIES_FIRST_HOP; + if (directlink) + batman_packet->flags |= DIRECTLINK; + else + batman_packet->flags &= ~DIRECTLINK; + + send_time = forward_send_time(); + add_bat_packet_to_list(bat_priv, + (unsigned char *)batman_packet, + sizeof(struct batman_packet) + tt_buff_len, + if_incoming, 0, send_time); +} + +static void forw_packet_free(struct forw_packet *forw_packet) +{ + if (forw_packet->skb) + kfree_skb(forw_packet->skb); + if (forw_packet->if_incoming) + hardif_free_ref(forw_packet->if_incoming); + kfree(forw_packet); +} + +static void _add_bcast_packet_to_list(struct bat_priv *bat_priv, + struct forw_packet *forw_packet, + unsigned long send_time) +{ + INIT_HLIST_NODE(&forw_packet->list); + + /* add new packet to packet list */ + spin_lock_bh(&bat_priv->forw_bcast_list_lock); + hlist_add_head(&forw_packet->list, &bat_priv->forw_bcast_list); + spin_unlock_bh(&bat_priv->forw_bcast_list_lock); + + /* start timer for this packet */ + INIT_DELAYED_WORK(&forw_packet->delayed_work, + send_outstanding_bcast_packet); + queue_delayed_work(bat_event_workqueue, &forw_packet->delayed_work, + send_time); +} + +/* add a broadcast packet to the queue and setup timers. broadcast packets + * are sent multiple times to increase probability for beeing received. + * + * This function returns NETDEV_TX_OK on success and NETDEV_TX_BUSY on + * errors. + * + * The skb is not consumed, so the caller should make sure that the + * skb is freed. 
*/ +int add_bcast_packet_to_list(struct bat_priv *bat_priv, struct sk_buff *skb) +{ + struct hard_iface *primary_if = NULL; + struct forw_packet *forw_packet; + struct bcast_packet *bcast_packet; + + if (!atomic_dec_not_zero(&bat_priv->bcast_queue_left)) { + bat_dbg(DBG_BATMAN, bat_priv, "bcast packet queue full\n"); + goto out; + } + + primary_if = primary_if_get_selected(bat_priv); + if (!primary_if) + goto out_and_inc; + + forw_packet = kmalloc(sizeof(struct forw_packet), GFP_ATOMIC); + + if (!forw_packet) + goto out_and_inc; + + skb = skb_copy(skb, GFP_ATOMIC); + if (!skb) + goto packet_free; + + /* as we have a copy now, it is safe to decrease the TTL */ + bcast_packet = (struct bcast_packet *)skb->data; + bcast_packet->ttl--; + + skb_reset_mac_header(skb); + + forw_packet->skb = skb; + forw_packet->if_incoming = primary_if; + + /* how often did we send the bcast packet ? */ + forw_packet->num_packets = 0; + + _add_bcast_packet_to_list(bat_priv, forw_packet, 1); + return NETDEV_TX_OK; + +packet_free: + kfree(forw_packet); +out_and_inc: + atomic_inc(&bat_priv->bcast_queue_left); +out: + if (primary_if) + hardif_free_ref(primary_if); + return NETDEV_TX_BUSY; +} + +static void send_outstanding_bcast_packet(struct work_struct *work) +{ + struct hard_iface *hard_iface; + struct delayed_work *delayed_work = + container_of(work, struct delayed_work, work); + struct forw_packet *forw_packet = + container_of(delayed_work, struct forw_packet, delayed_work); + struct sk_buff *skb1; + struct net_device *soft_iface = forw_packet->if_incoming->soft_iface; + struct bat_priv *bat_priv = netdev_priv(soft_iface); + + spin_lock_bh(&bat_priv->forw_bcast_list_lock); + hlist_del(&forw_packet->list); + spin_unlock_bh(&bat_priv->forw_bcast_list_lock); + + if (atomic_read(&bat_priv->mesh_state) == MESH_DEACTIVATING) + goto out; + + /* rebroadcast packet */ + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &hardif_list, list) { + if (hard_iface->soft_iface != soft_iface) + continue; + + /* send a copy of the saved skb */ + skb1 = skb_clone(forw_packet->skb, GFP_ATOMIC); + if (skb1) + send_skb_packet(skb1, hard_iface, broadcast_addr); + } + rcu_read_unlock(); + + forw_packet->num_packets++; + + /* if we still have some more bcasts to send */ + if (forw_packet->num_packets < 3) { + _add_bcast_packet_to_list(bat_priv, forw_packet, + ((5 * HZ) / 1000)); + return; + } + +out: + forw_packet_free(forw_packet); + atomic_inc(&bat_priv->bcast_queue_left); +} + +void send_outstanding_bat_packet(struct work_struct *work) +{ + struct delayed_work *delayed_work = + container_of(work, struct delayed_work, work); + struct forw_packet *forw_packet = + container_of(delayed_work, struct forw_packet, delayed_work); + struct bat_priv *bat_priv; + + bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface); + spin_lock_bh(&bat_priv->forw_bat_list_lock); + hlist_del(&forw_packet->list); + spin_unlock_bh(&bat_priv->forw_bat_list_lock); + + if (atomic_read(&bat_priv->mesh_state) == MESH_DEACTIVATING) + goto out; + + send_packet(forw_packet); + + /** + * we have to have at least one packet in the queue + * to determine the queues wake up time unless we are + * shutting down + */ + if (forw_packet->own) + schedule_own_packet(forw_packet->if_incoming); + +out: + /* don't count own packet */ + if (!forw_packet->own) + atomic_inc(&bat_priv->batman_queue_left); + + forw_packet_free(forw_packet); +} + +void purge_outstanding_packets(struct bat_priv *bat_priv, + struct hard_iface *hard_iface) +{ + struct forw_packet *forw_packet; + 
struct hlist_node *tmp_node, *safe_tmp_node; + bool pending; + + if (hard_iface) + bat_dbg(DBG_BATMAN, bat_priv, + "purge_outstanding_packets(): %s\n", + hard_iface->net_dev->name); + else + bat_dbg(DBG_BATMAN, bat_priv, + "purge_outstanding_packets()\n"); + + /* free bcast list */ + spin_lock_bh(&bat_priv->forw_bcast_list_lock); + hlist_for_each_entry_safe(forw_packet, tmp_node, safe_tmp_node, + &bat_priv->forw_bcast_list, list) { + + /** + * if purge_outstanding_packets() was called with an argmument + * we delete only packets belonging to the given interface + */ + if ((hard_iface) && + (forw_packet->if_incoming != hard_iface)) + continue; + + spin_unlock_bh(&bat_priv->forw_bcast_list_lock); + + /** + * send_outstanding_bcast_packet() will lock the list to + * delete the item from the list + */ + pending = cancel_delayed_work_sync(&forw_packet->delayed_work); + spin_lock_bh(&bat_priv->forw_bcast_list_lock); + + if (pending) { + hlist_del(&forw_packet->list); + forw_packet_free(forw_packet); + } + } + spin_unlock_bh(&bat_priv->forw_bcast_list_lock); + + /* free batman packet list */ + spin_lock_bh(&bat_priv->forw_bat_list_lock); + hlist_for_each_entry_safe(forw_packet, tmp_node, safe_tmp_node, + &bat_priv->forw_bat_list, list) { + + /** + * if purge_outstanding_packets() was called with an argmument + * we delete only packets belonging to the given interface + */ + if ((hard_iface) && + (forw_packet->if_incoming != hard_iface)) + continue; + + spin_unlock_bh(&bat_priv->forw_bat_list_lock); + + /** + * send_outstanding_bat_packet() will lock the list to + * delete the item from the list + */ + pending = cancel_delayed_work_sync(&forw_packet->delayed_work); + spin_lock_bh(&bat_priv->forw_bat_list_lock); + + if (pending) { + hlist_del(&forw_packet->list); + forw_packet_free(forw_packet); + } + } + spin_unlock_bh(&bat_priv->forw_bat_list_lock); +} diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h new file mode 100644 index 00000000..247172d7 --- /dev/null +++ b/net/batman-adv/send.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#ifndef _NET_BATMAN_ADV_SEND_H_
+#define _NET_BATMAN_ADV_SEND_H_
+
+int send_skb_packet(struct sk_buff *skb,
+ struct hard_iface *hard_iface,
+ uint8_t *dst_addr);
+void schedule_own_packet(struct hard_iface *hard_iface);
+void schedule_forward_packet(struct orig_node *orig_node,
+ struct ethhdr *ethhdr,
+ struct batman_packet *batman_packet,
+ uint8_t directlink, int tt_buff_len,
+ struct hard_iface *if_outgoing);
+int add_bcast_packet_to_list(struct bat_priv *bat_priv, struct sk_buff *skb);
+void send_outstanding_bat_packet(struct work_struct *work);
+void purge_outstanding_packets(struct bat_priv *bat_priv,
+ struct hard_iface *hard_iface);
+
+#endif /* _NET_BATMAN_ADV_SEND_H_ */
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
new file mode 100644
index 00000000..d5aa6099
--- /dev/null
+++ b/net/batman-adv/soft-interface.c
@@ -0,0 +1,927 @@
+/*
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#include "main.h"
+#include "soft-interface.h"
+#include "hard-interface.h"
+#include "routing.h"
+#include "send.h"
+#include "bat_debugfs.h"
+#include "translation-table.h"
+#include "hash.h"
+#include "gateway_common.h"
+#include "gateway_client.h"
+#include "bat_sysfs.h"
+#include <linux/slab.h>
+#include <linux/ethtool.h>
+#include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
+#include "unicast.h"
+
+
+static int bat_get_settings(struct net_device *dev, struct ethtool_cmd *cmd);
+static void bat_get_drvinfo(struct net_device *dev,
+ struct ethtool_drvinfo *info);
+static u32 bat_get_msglevel(struct net_device *dev);
+static void bat_set_msglevel(struct net_device *dev, u32 value);
+static u32 bat_get_link(struct net_device *dev);
+
+static const struct ethtool_ops bat_ethtool_ops = {
+ .get_settings = bat_get_settings,
+ .get_drvinfo = bat_get_drvinfo,
+ .get_msglevel = bat_get_msglevel,
+ .set_msglevel = bat_set_msglevel,
+ .get_link = bat_get_link,
+};
+
+int my_skb_head_push(struct sk_buff *skb, unsigned int len)
+{
+ int result;
+
+ /**
+ * TODO: We must check if we can release all references to non-payload
+ * data using skb_header_release in our skbs to allow skb_cow_header to
+ * work optimally. This means that those skbs are not allowed to read
+ * or write any data which is before the current position of skb->data
+ * after that call and thus allow other skbs with the same data buffer
+ * to write freely in that area.
+ */ + result = skb_cow_head(skb, len); + if (result < 0) + return result; + + skb_push(skb, len); + return 0; +} + +static void softif_neigh_free_ref(struct softif_neigh *softif_neigh) +{ + if (atomic_dec_and_test(&softif_neigh->refcount)) + kfree_rcu(softif_neigh, rcu); +} + +static void softif_neigh_vid_free_rcu(struct rcu_head *rcu) +{ + struct softif_neigh_vid *softif_neigh_vid; + struct softif_neigh *softif_neigh; + struct hlist_node *node, *node_tmp; + struct bat_priv *bat_priv; + + softif_neigh_vid = container_of(rcu, struct softif_neigh_vid, rcu); + bat_priv = softif_neigh_vid->bat_priv; + + spin_lock_bh(&bat_priv->softif_neigh_lock); + hlist_for_each_entry_safe(softif_neigh, node, node_tmp, + &softif_neigh_vid->softif_neigh_list, list) { + hlist_del_rcu(&softif_neigh->list); + softif_neigh_free_ref(softif_neigh); + } + spin_unlock_bh(&bat_priv->softif_neigh_lock); + + kfree(softif_neigh_vid); +} + +static void softif_neigh_vid_free_ref(struct softif_neigh_vid *softif_neigh_vid) +{ + if (atomic_dec_and_test(&softif_neigh_vid->refcount)) + call_rcu(&softif_neigh_vid->rcu, softif_neigh_vid_free_rcu); +} + +static struct softif_neigh_vid *softif_neigh_vid_get(struct bat_priv *bat_priv, + short vid) +{ + struct softif_neigh_vid *softif_neigh_vid; + struct hlist_node *node; + + rcu_read_lock(); + hlist_for_each_entry_rcu(softif_neigh_vid, node, + &bat_priv->softif_neigh_vids, list) { + if (softif_neigh_vid->vid != vid) + continue; + + if (!atomic_inc_not_zero(&softif_neigh_vid->refcount)) + continue; + + goto out; + } + + softif_neigh_vid = kzalloc(sizeof(struct softif_neigh_vid), + GFP_ATOMIC); + if (!softif_neigh_vid) + goto out; + + softif_neigh_vid->vid = vid; + softif_neigh_vid->bat_priv = bat_priv; + + /* initialize with 2 - caller decrements counter by one */ + atomic_set(&softif_neigh_vid->refcount, 2); + INIT_HLIST_HEAD(&softif_neigh_vid->softif_neigh_list); + INIT_HLIST_NODE(&softif_neigh_vid->list); + spin_lock_bh(&bat_priv->softif_neigh_vid_lock); + hlist_add_head_rcu(&softif_neigh_vid->list, + &bat_priv->softif_neigh_vids); + spin_unlock_bh(&bat_priv->softif_neigh_vid_lock); + +out: + rcu_read_unlock(); + return softif_neigh_vid; +} + +static struct softif_neigh *softif_neigh_get(struct bat_priv *bat_priv, + uint8_t *addr, short vid) +{ + struct softif_neigh_vid *softif_neigh_vid; + struct softif_neigh *softif_neigh = NULL; + struct hlist_node *node; + + softif_neigh_vid = softif_neigh_vid_get(bat_priv, vid); + if (!softif_neigh_vid) + goto out; + + rcu_read_lock(); + hlist_for_each_entry_rcu(softif_neigh, node, + &softif_neigh_vid->softif_neigh_list, + list) { + if (!compare_eth(softif_neigh->addr, addr)) + continue; + + if (!atomic_inc_not_zero(&softif_neigh->refcount)) + continue; + + softif_neigh->last_seen = jiffies; + goto unlock; + } + + softif_neigh = kzalloc(sizeof(struct softif_neigh), GFP_ATOMIC); + if (!softif_neigh) + goto unlock; + + memcpy(softif_neigh->addr, addr, ETH_ALEN); + softif_neigh->last_seen = jiffies; + /* initialize with 2 - caller decrements counter by one */ + atomic_set(&softif_neigh->refcount, 2); + + INIT_HLIST_NODE(&softif_neigh->list); + spin_lock_bh(&bat_priv->softif_neigh_lock); + hlist_add_head_rcu(&softif_neigh->list, + &softif_neigh_vid->softif_neigh_list); + spin_unlock_bh(&bat_priv->softif_neigh_lock); + +unlock: + rcu_read_unlock(); +out: + if (softif_neigh_vid) + softif_neigh_vid_free_ref(softif_neigh_vid); + return softif_neigh; +} + +static struct softif_neigh *softif_neigh_get_selected( + struct softif_neigh_vid 
*softif_neigh_vid) +{ + struct softif_neigh *softif_neigh; + + rcu_read_lock(); + softif_neigh = rcu_dereference(softif_neigh_vid->softif_neigh); + + if (softif_neigh && !atomic_inc_not_zero(&softif_neigh->refcount)) + softif_neigh = NULL; + + rcu_read_unlock(); + return softif_neigh; +} + +static struct softif_neigh *softif_neigh_vid_get_selected( + struct bat_priv *bat_priv, + short vid) +{ + struct softif_neigh_vid *softif_neigh_vid; + struct softif_neigh *softif_neigh = NULL; + + softif_neigh_vid = softif_neigh_vid_get(bat_priv, vid); + if (!softif_neigh_vid) + goto out; + + softif_neigh = softif_neigh_get_selected(softif_neigh_vid); +out: + if (softif_neigh_vid) + softif_neigh_vid_free_ref(softif_neigh_vid); + return softif_neigh; +} + +static void softif_neigh_vid_select(struct bat_priv *bat_priv, + struct softif_neigh *new_neigh, + short vid) +{ + struct softif_neigh_vid *softif_neigh_vid; + struct softif_neigh *curr_neigh; + + softif_neigh_vid = softif_neigh_vid_get(bat_priv, vid); + if (!softif_neigh_vid) + goto out; + + spin_lock_bh(&bat_priv->softif_neigh_lock); + + if (new_neigh && !atomic_inc_not_zero(&new_neigh->refcount)) + new_neigh = NULL; + + curr_neigh = softif_neigh_vid->softif_neigh; + rcu_assign_pointer(softif_neigh_vid->softif_neigh, new_neigh); + + if ((curr_neigh) && (!new_neigh)) + bat_dbg(DBG_ROUTES, bat_priv, + "Removing mesh exit point on vid: %d (prev: %pM).\n", + vid, curr_neigh->addr); + else if ((curr_neigh) && (new_neigh)) + bat_dbg(DBG_ROUTES, bat_priv, + "Changing mesh exit point on vid: %d from %pM " + "to %pM.\n", vid, curr_neigh->addr, new_neigh->addr); + else if ((!curr_neigh) && (new_neigh)) + bat_dbg(DBG_ROUTES, bat_priv, + "Setting mesh exit point on vid: %d to %pM.\n", + vid, new_neigh->addr); + + if (curr_neigh) + softif_neigh_free_ref(curr_neigh); + + spin_unlock_bh(&bat_priv->softif_neigh_lock); + +out: + if (softif_neigh_vid) + softif_neigh_vid_free_ref(softif_neigh_vid); +} + +static void softif_neigh_vid_deselect(struct bat_priv *bat_priv, + struct softif_neigh_vid *softif_neigh_vid) +{ + struct softif_neigh *curr_neigh; + struct softif_neigh *softif_neigh = NULL, *softif_neigh_tmp; + struct hard_iface *primary_if = NULL; + struct hlist_node *node; + + primary_if = primary_if_get_selected(bat_priv); + if (!primary_if) + goto out; + + /* find new softif_neigh immediately to avoid temporary loops */ + rcu_read_lock(); + curr_neigh = rcu_dereference(softif_neigh_vid->softif_neigh); + + hlist_for_each_entry_rcu(softif_neigh_tmp, node, + &softif_neigh_vid->softif_neigh_list, + list) { + if (softif_neigh_tmp == curr_neigh) + continue; + + /* we got a neighbor but its mac is 'bigger' than ours */ + if (memcmp(primary_if->net_dev->dev_addr, + softif_neigh_tmp->addr, ETH_ALEN) < 0) + continue; + + if (!atomic_inc_not_zero(&softif_neigh_tmp->refcount)) + continue; + + softif_neigh = softif_neigh_tmp; + goto unlock; + } + +unlock: + rcu_read_unlock(); +out: + softif_neigh_vid_select(bat_priv, softif_neigh, softif_neigh_vid->vid); + + if (primary_if) + hardif_free_ref(primary_if); + if (softif_neigh) + softif_neigh_free_ref(softif_neigh); +} + +int softif_neigh_seq_print_text(struct seq_file *seq, void *offset) +{ + struct net_device *net_dev = (struct net_device *)seq->private; + struct bat_priv *bat_priv = netdev_priv(net_dev); + struct softif_neigh_vid *softif_neigh_vid; + struct softif_neigh *softif_neigh; + struct hard_iface *primary_if; + struct hlist_node *node, *node_tmp; + struct softif_neigh *curr_softif_neigh; + int ret = 0, last_seen_secs, 
last_seen_msecs; + + primary_if = primary_if_get_selected(bat_priv); + if (!primary_if) { + ret = seq_printf(seq, "BATMAN mesh %s disabled - " + "please specify interfaces to enable it\n", + net_dev->name); + goto out; + } + + if (primary_if->if_status != IF_ACTIVE) { + ret = seq_printf(seq, "BATMAN mesh %s " + "disabled - primary interface not active\n", + net_dev->name); + goto out; + } + + seq_printf(seq, "Softif neighbor list (%s)\n", net_dev->name); + + rcu_read_lock(); + hlist_for_each_entry_rcu(softif_neigh_vid, node, + &bat_priv->softif_neigh_vids, list) { + seq_printf(seq, " %-15s %s on vid: %d\n", + "Originator", "last-seen", softif_neigh_vid->vid); + + curr_softif_neigh = softif_neigh_get_selected(softif_neigh_vid); + + hlist_for_each_entry_rcu(softif_neigh, node_tmp, + &softif_neigh_vid->softif_neigh_list, + list) { + last_seen_secs = jiffies_to_msecs(jiffies - + softif_neigh->last_seen) / 1000; + last_seen_msecs = jiffies_to_msecs(jiffies - + softif_neigh->last_seen) % 1000; + seq_printf(seq, "%s %pM %3i.%03is\n", + curr_softif_neigh == softif_neigh + ? "=>" : " ", softif_neigh->addr, + last_seen_secs, last_seen_msecs); + } + + if (curr_softif_neigh) + softif_neigh_free_ref(curr_softif_neigh); + + seq_printf(seq, "\n"); + } + rcu_read_unlock(); + +out: + if (primary_if) + hardif_free_ref(primary_if); + return ret; +} + +void softif_neigh_purge(struct bat_priv *bat_priv) +{ + struct softif_neigh *softif_neigh, *curr_softif_neigh; + struct softif_neigh_vid *softif_neigh_vid; + struct hlist_node *node, *node_tmp, *node_tmp2; + char do_deselect; + + rcu_read_lock(); + hlist_for_each_entry_rcu(softif_neigh_vid, node, + &bat_priv->softif_neigh_vids, list) { + if (!atomic_inc_not_zero(&softif_neigh_vid->refcount)) + continue; + + curr_softif_neigh = softif_neigh_get_selected(softif_neigh_vid); + do_deselect = 0; + + spin_lock_bh(&bat_priv->softif_neigh_lock); + hlist_for_each_entry_safe(softif_neigh, node_tmp, node_tmp2, + &softif_neigh_vid->softif_neigh_list, + list) { + if ((!time_after(jiffies, softif_neigh->last_seen + + msecs_to_jiffies(SOFTIF_NEIGH_TIMEOUT))) && + (atomic_read(&bat_priv->mesh_state) == MESH_ACTIVE)) + continue; + + if (curr_softif_neigh == softif_neigh) { + bat_dbg(DBG_ROUTES, bat_priv, + "Current mesh exit point on vid: %d " + "'%pM' vanished.\n", + softif_neigh_vid->vid, + softif_neigh->addr); + do_deselect = 1; + } + + hlist_del_rcu(&softif_neigh->list); + softif_neigh_free_ref(softif_neigh); + } + spin_unlock_bh(&bat_priv->softif_neigh_lock); + + /* soft_neigh_vid_deselect() needs to acquire the + * softif_neigh_lock */ + if (do_deselect) + softif_neigh_vid_deselect(bat_priv, softif_neigh_vid); + + if (curr_softif_neigh) + softif_neigh_free_ref(curr_softif_neigh); + + softif_neigh_vid_free_ref(softif_neigh_vid); + } + rcu_read_unlock(); + + spin_lock_bh(&bat_priv->softif_neigh_vid_lock); + hlist_for_each_entry_safe(softif_neigh_vid, node, node_tmp, + &bat_priv->softif_neigh_vids, list) { + if (!hlist_empty(&softif_neigh_vid->softif_neigh_list)) + continue; + + hlist_del_rcu(&softif_neigh_vid->list); + softif_neigh_vid_free_ref(softif_neigh_vid); + } + spin_unlock_bh(&bat_priv->softif_neigh_vid_lock); + +} + +static void softif_batman_recv(struct sk_buff *skb, struct net_device *dev, + short vid) +{ + struct bat_priv *bat_priv = netdev_priv(dev); + struct ethhdr *ethhdr = (struct ethhdr *)skb->data; + struct batman_packet *batman_packet; + struct softif_neigh *softif_neigh = NULL; + struct hard_iface *primary_if = NULL; + struct softif_neigh 
*curr_softif_neigh = NULL; + + if (ntohs(ethhdr->h_proto) == ETH_P_8021Q) + batman_packet = (struct batman_packet *) + (skb->data + ETH_HLEN + VLAN_HLEN); + else + batman_packet = (struct batman_packet *)(skb->data + ETH_HLEN); + + if (batman_packet->version != COMPAT_VERSION) + goto out; + + if (batman_packet->packet_type != BAT_PACKET) + goto out; + + if (!(batman_packet->flags & PRIMARIES_FIRST_HOP)) + goto out; + + if (is_my_mac(batman_packet->orig)) + goto out; + + softif_neigh = softif_neigh_get(bat_priv, batman_packet->orig, vid); + if (!softif_neigh) + goto out; + + curr_softif_neigh = softif_neigh_vid_get_selected(bat_priv, vid); + if (curr_softif_neigh == softif_neigh) + goto out; + + primary_if = primary_if_get_selected(bat_priv); + if (!primary_if) + goto out; + + /* we got a neighbor but its mac is 'bigger' than ours */ + if (memcmp(primary_if->net_dev->dev_addr, + softif_neigh->addr, ETH_ALEN) < 0) + goto out; + + /* close own batX device and use softif_neigh as exit node */ + if (!curr_softif_neigh) { + softif_neigh_vid_select(bat_priv, softif_neigh, vid); + goto out; + } + + /* switch to new 'smallest neighbor' */ + if (memcmp(softif_neigh->addr, curr_softif_neigh->addr, ETH_ALEN) < 0) + softif_neigh_vid_select(bat_priv, softif_neigh, vid); + +out: + kfree_skb(skb); + if (softif_neigh) + softif_neigh_free_ref(softif_neigh); + if (curr_softif_neigh) + softif_neigh_free_ref(curr_softif_neigh); + if (primary_if) + hardif_free_ref(primary_if); + return; +} + +static int interface_open(struct net_device *dev) +{ + netif_start_queue(dev); + return 0; +} + +static int interface_release(struct net_device *dev) +{ + netif_stop_queue(dev); + return 0; +} + +static struct net_device_stats *interface_stats(struct net_device *dev) +{ + struct bat_priv *bat_priv = netdev_priv(dev); + return &bat_priv->stats; +} + +static int interface_set_mac_addr(struct net_device *dev, void *p) +{ + struct bat_priv *bat_priv = netdev_priv(dev); + struct sockaddr *addr = p; + + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + + /* only modify transtable if it has been initialised before */ + if (atomic_read(&bat_priv->mesh_state) == MESH_ACTIVE) { + tt_local_remove(bat_priv, dev->dev_addr, + "mac address changed"); + tt_local_add(dev, addr->sa_data); + } + + memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN); + return 0; +} + +static int interface_change_mtu(struct net_device *dev, int new_mtu) +{ + /* check ranges */ + if ((new_mtu < 68) || (new_mtu > hardif_min_mtu(dev))) + return -EINVAL; + + dev->mtu = new_mtu; + + return 0; +} + +int interface_tx(struct sk_buff *skb, struct net_device *soft_iface) +{ + struct ethhdr *ethhdr = (struct ethhdr *)skb->data; + struct bat_priv *bat_priv = netdev_priv(soft_iface); + struct hard_iface *primary_if = NULL; + struct bcast_packet *bcast_packet; + struct vlan_ethhdr *vhdr; + struct softif_neigh *curr_softif_neigh = NULL; + int data_len = skb->len, ret; + short vid = -1; + bool do_bcast = false; + + if (atomic_read(&bat_priv->mesh_state) != MESH_ACTIVE) + goto dropped; + + soft_iface->trans_start = jiffies; + + switch (ntohs(ethhdr->h_proto)) { + case ETH_P_8021Q: + vhdr = (struct vlan_ethhdr *)skb->data; + vid = ntohs(vhdr->h_vlan_TCI) & VLAN_VID_MASK; + + if (ntohs(vhdr->h_vlan_encapsulated_proto) != ETH_P_BATMAN) + break; + + /* fall through */ + case ETH_P_BATMAN: + softif_batman_recv(skb, soft_iface, vid); + goto end; + } + + /** + * if we have a another chosen mesh exit node in range + * it will transport the packets to the mesh + */ + 
curr_softif_neigh = softif_neigh_vid_get_selected(bat_priv, vid); + if (curr_softif_neigh) + goto dropped; + + /* TODO: check this for locks */ + tt_local_add(soft_iface, ethhdr->h_source); + + if (is_multicast_ether_addr(ethhdr->h_dest)) { + ret = gw_is_target(bat_priv, skb); + + if (ret < 0) + goto dropped; + + if (ret == 0) + do_bcast = true; + } + + /* ethernet packet should be broadcasted */ + if (do_bcast) { + primary_if = primary_if_get_selected(bat_priv); + if (!primary_if) + goto dropped; + + if (my_skb_head_push(skb, sizeof(struct bcast_packet)) < 0) + goto dropped; + + bcast_packet = (struct bcast_packet *)skb->data; + bcast_packet->version = COMPAT_VERSION; + bcast_packet->ttl = TTL; + + /* batman packet type: broadcast */ + bcast_packet->packet_type = BAT_BCAST; + + /* hw address of first interface is the orig mac because only + * this mac is known throughout the mesh */ + memcpy(bcast_packet->orig, + primary_if->net_dev->dev_addr, ETH_ALEN); + + /* set broadcast sequence number */ + bcast_packet->seqno = + htonl(atomic_inc_return(&bat_priv->bcast_seqno)); + + add_bcast_packet_to_list(bat_priv, skb); + + /* a copy is stored in the bcast list, therefore removing + * the original skb. */ + kfree_skb(skb); + + /* unicast packet */ + } else { + ret = unicast_send_skb(skb, bat_priv); + if (ret != 0) + goto dropped_freed; + } + + bat_priv->stats.tx_packets++; + bat_priv->stats.tx_bytes += data_len; + goto end; + +dropped: + kfree_skb(skb); +dropped_freed: + bat_priv->stats.tx_dropped++; +end: + if (curr_softif_neigh) + softif_neigh_free_ref(curr_softif_neigh); + if (primary_if) + hardif_free_ref(primary_if); + return NETDEV_TX_OK; +} + +void interface_rx(struct net_device *soft_iface, + struct sk_buff *skb, struct hard_iface *recv_if, + int hdr_size) +{ + struct bat_priv *bat_priv = netdev_priv(soft_iface); + struct unicast_packet *unicast_packet; + struct ethhdr *ethhdr; + struct vlan_ethhdr *vhdr; + struct softif_neigh *curr_softif_neigh = NULL; + short vid = -1; + int ret; + + /* check if enough space is available for pulling, and pull */ + if (!pskb_may_pull(skb, hdr_size)) + goto dropped; + + skb_pull_rcsum(skb, hdr_size); + skb_reset_mac_header(skb); + + ethhdr = (struct ethhdr *)skb_mac_header(skb); + + switch (ntohs(ethhdr->h_proto)) { + case ETH_P_8021Q: + vhdr = (struct vlan_ethhdr *)skb->data; + vid = ntohs(vhdr->h_vlan_TCI) & VLAN_VID_MASK; + + if (ntohs(vhdr->h_vlan_encapsulated_proto) != ETH_P_BATMAN) + break; + + /* fall through */ + case ETH_P_BATMAN: + goto dropped; + } + + /** + * if we have a another chosen mesh exit node in range + * it will transport the packets to the non-mesh network + */ + curr_softif_neigh = softif_neigh_vid_get_selected(bat_priv, vid); + if (curr_softif_neigh) { + skb_push(skb, hdr_size); + unicast_packet = (struct unicast_packet *)skb->data; + + if ((unicast_packet->packet_type != BAT_UNICAST) && + (unicast_packet->packet_type != BAT_UNICAST_FRAG)) + goto dropped; + + skb_reset_mac_header(skb); + + memcpy(unicast_packet->dest, + curr_softif_neigh->addr, ETH_ALEN); + ret = route_unicast_packet(skb, recv_if); + if (ret == NET_RX_DROP) + goto dropped; + + goto out; + } + + /* skb->dev & skb->pkt_type are set here */ + if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) + goto dropped; + skb->protocol = eth_type_trans(skb, soft_iface); + + /* should not be necessary anymore as we use skb_pull_rcsum() + * TODO: please verify this and remove this TODO + * -- Dec 21st 2009, Simon Wunderlich */ + +/* skb->ip_summed = CHECKSUM_UNNECESSARY;*/ + + 
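+ /* the frame is delivered locally: account it on the soft interface's
+ * rx counters before handing the skb to the network stack */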
bat_priv->stats.rx_packets++; + bat_priv->stats.rx_bytes += skb->len + sizeof(struct ethhdr); + + soft_iface->last_rx = jiffies; + + netif_rx(skb); + goto out; + +dropped: + kfree_skb(skb); +out: + if (curr_softif_neigh) + softif_neigh_free_ref(curr_softif_neigh); + return; +} + +#ifdef HAVE_NET_DEVICE_OPS +static const struct net_device_ops bat_netdev_ops = { + .ndo_open = interface_open, + .ndo_stop = interface_release, + .ndo_get_stats = interface_stats, + .ndo_set_mac_address = interface_set_mac_addr, + .ndo_change_mtu = interface_change_mtu, + .ndo_start_xmit = interface_tx, + .ndo_validate_addr = eth_validate_addr +}; +#endif + +static void interface_setup(struct net_device *dev) +{ + struct bat_priv *priv = netdev_priv(dev); + char dev_addr[ETH_ALEN]; + + ether_setup(dev); + +#ifdef HAVE_NET_DEVICE_OPS + dev->netdev_ops = &bat_netdev_ops; +#else + dev->open = interface_open; + dev->stop = interface_release; + dev->get_stats = interface_stats; + dev->set_mac_address = interface_set_mac_addr; + dev->change_mtu = interface_change_mtu; + dev->hard_start_xmit = interface_tx; +#endif + dev->destructor = free_netdev; + dev->tx_queue_len = 0; + + /** + * can't call min_mtu, because the needed variables + * have not been initialized yet + */ + dev->mtu = ETH_DATA_LEN; + /* reserve more space in the skbuff for our header */ + dev->hard_header_len = BAT_HEADER_LEN; + + /* generate random address */ + random_ether_addr(dev_addr); + memcpy(dev->dev_addr, dev_addr, ETH_ALEN); + + SET_ETHTOOL_OPS(dev, &bat_ethtool_ops); + + memset(priv, 0, sizeof(struct bat_priv)); +} + +struct net_device *softif_create(char *name) +{ + struct net_device *soft_iface; + struct bat_priv *bat_priv; + int ret; + + soft_iface = alloc_netdev(sizeof(struct bat_priv) , name, + interface_setup); + + if (!soft_iface) { + pr_err("Unable to allocate the batman interface: %s\n", name); + goto out; + } + + ret = register_netdevice(soft_iface); + if (ret < 0) { + pr_err("Unable to register the batman interface '%s': %i\n", + name, ret); + goto free_soft_iface; + } + + bat_priv = netdev_priv(soft_iface); + + atomic_set(&bat_priv->aggregated_ogms, 1); + atomic_set(&bat_priv->bonding, 0); + atomic_set(&bat_priv->vis_mode, VIS_TYPE_CLIENT_UPDATE); + atomic_set(&bat_priv->gw_mode, GW_MODE_OFF); + atomic_set(&bat_priv->gw_sel_class, 20); + atomic_set(&bat_priv->gw_bandwidth, 41); + atomic_set(&bat_priv->orig_interval, 1000); + atomic_set(&bat_priv->hop_penalty, 10); + atomic_set(&bat_priv->log_level, 0); + atomic_set(&bat_priv->fragmentation, 1); + atomic_set(&bat_priv->bcast_queue_left, BCAST_QUEUE_LEN); + atomic_set(&bat_priv->batman_queue_left, BATMAN_QUEUE_LEN); + + atomic_set(&bat_priv->mesh_state, MESH_INACTIVE); + atomic_set(&bat_priv->bcast_seqno, 1); + atomic_set(&bat_priv->tt_local_changed, 0); + + bat_priv->primary_if = NULL; + bat_priv->num_ifaces = 0; + + ret = sysfs_add_meshif(soft_iface); + if (ret < 0) + goto unreg_soft_iface; + + ret = debugfs_add_meshif(soft_iface); + if (ret < 0) + goto unreg_sysfs; + + ret = mesh_init(soft_iface); + if (ret < 0) + goto unreg_debugfs; + + return soft_iface; + +unreg_debugfs: + debugfs_del_meshif(soft_iface); +unreg_sysfs: + sysfs_del_meshif(soft_iface); +unreg_soft_iface: + unregister_netdev(soft_iface); + return NULL; + +free_soft_iface: + free_netdev(soft_iface); +out: + return NULL; +} + +void softif_destroy(struct net_device *soft_iface) +{ + debugfs_del_meshif(soft_iface); + sysfs_del_meshif(soft_iface); + mesh_free(soft_iface); + unregister_netdevice(soft_iface); +} + +int 
softif_is_valid(struct net_device *net_dev) +{ +#ifdef HAVE_NET_DEVICE_OPS + if (net_dev->netdev_ops->ndo_start_xmit == interface_tx) + return 1; +#else + if (net_dev->hard_start_xmit == interface_tx) + return 1; +#endif + + return 0; +} + +/* ethtool */ +static int bat_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) +{ + cmd->supported = 0; + cmd->advertising = 0; + ethtool_cmd_speed_set(cmd, SPEED_10); + cmd->duplex = DUPLEX_FULL; + cmd->port = PORT_TP; + cmd->phy_address = 0; + cmd->transceiver = XCVR_INTERNAL; + cmd->autoneg = AUTONEG_DISABLE; + cmd->maxtxpkt = 0; + cmd->maxrxpkt = 0; + + return 0; +} + +static void bat_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) +{ + strcpy(info->driver, "B.A.T.M.A.N. advanced"); + strcpy(info->version, SOURCE_VERSION); + strcpy(info->fw_version, "N/A"); + strcpy(info->bus_info, "batman"); +} + +static u32 bat_get_msglevel(struct net_device *dev) +{ + return -EOPNOTSUPP; +} + +static void bat_set_msglevel(struct net_device *dev, u32 value) +{ +} + +static u32 bat_get_link(struct net_device *dev) +{ + return 1; +} + diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h new file mode 100644 index 00000000..4789b6f2 --- /dev/null +++ b/net/batman-adv/soft-interface.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#ifndef _NET_BATMAN_ADV_SOFT_INTERFACE_H_ +#define _NET_BATMAN_ADV_SOFT_INTERFACE_H_ + +int my_skb_head_push(struct sk_buff *skb, unsigned int len); +int softif_neigh_seq_print_text(struct seq_file *seq, void *offset); +void softif_neigh_purge(struct bat_priv *bat_priv); +int interface_tx(struct sk_buff *skb, struct net_device *soft_iface); +void interface_rx(struct net_device *soft_iface, + struct sk_buff *skb, struct hard_iface *recv_if, + int hdr_size); +struct net_device *softif_create(char *name); +void softif_destroy(struct net_device *soft_iface); +int softif_is_valid(struct net_device *net_dev); + +#endif /* _NET_BATMAN_ADV_SOFT_INTERFACE_H_ */ diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c new file mode 100644 index 00000000..7b729660 --- /dev/null +++ b/net/batman-adv/translation-table.c @@ -0,0 +1,638 @@ +/* + * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#include "main.h" +#include "translation-table.h" +#include "soft-interface.h" +#include "hard-interface.h" +#include "hash.h" +#include "originator.h" + +static void tt_local_purge(struct work_struct *work); +static void _tt_global_del_orig(struct bat_priv *bat_priv, + struct tt_global_entry *tt_global_entry, + char *message); + +/* returns 1 if they are the same mac addr */ +static int compare_ltt(struct hlist_node *node, void *data2) +{ + void *data1 = container_of(node, struct tt_local_entry, hash_entry); + + return (memcmp(data1, data2, ETH_ALEN) == 0 ? 1 : 0); +} + +/* returns 1 if they are the same mac addr */ +static int compare_gtt(struct hlist_node *node, void *data2) +{ + void *data1 = container_of(node, struct tt_global_entry, hash_entry); + + return (memcmp(data1, data2, ETH_ALEN) == 0 ? 1 : 0); +} + +static void tt_local_start_timer(struct bat_priv *bat_priv) +{ + INIT_DELAYED_WORK(&bat_priv->tt_work, tt_local_purge); + queue_delayed_work(bat_event_workqueue, &bat_priv->tt_work, 10 * HZ); +} + +static struct tt_local_entry *tt_local_hash_find(struct bat_priv *bat_priv, + void *data) +{ + struct hashtable_t *hash = bat_priv->tt_local_hash; + struct hlist_head *head; + struct hlist_node *node; + struct tt_local_entry *tt_local_entry, *tt_local_entry_tmp = NULL; + int index; + + if (!hash) + return NULL; + + index = choose_orig(data, hash->size); + head = &hash->table[index]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tt_local_entry, node, head, hash_entry) { + if (!compare_eth(tt_local_entry, data)) + continue; + + tt_local_entry_tmp = tt_local_entry; + break; + } + rcu_read_unlock(); + + return tt_local_entry_tmp; +} + +static struct tt_global_entry *tt_global_hash_find(struct bat_priv *bat_priv, + void *data) +{ + struct hashtable_t *hash = bat_priv->tt_global_hash; + struct hlist_head *head; + struct hlist_node *node; + struct tt_global_entry *tt_global_entry; + struct tt_global_entry *tt_global_entry_tmp = NULL; + int index; + + if (!hash) + return NULL; + + index = choose_orig(data, hash->size); + head = &hash->table[index]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tt_global_entry, node, head, hash_entry) { + if (!compare_eth(tt_global_entry, data)) + continue; + + tt_global_entry_tmp = tt_global_entry; + break; + } + rcu_read_unlock(); + + return tt_global_entry_tmp; +} + +int tt_local_init(struct bat_priv *bat_priv) +{ + if (bat_priv->tt_local_hash) + return 1; + + bat_priv->tt_local_hash = hash_new(1024); + + if (!bat_priv->tt_local_hash) + return 0; + + atomic_set(&bat_priv->tt_local_changed, 0); + tt_local_start_timer(bat_priv); + + return 1; +} + +void tt_local_add(struct net_device *soft_iface, uint8_t *addr) +{ + struct bat_priv *bat_priv = netdev_priv(soft_iface); + struct tt_local_entry *tt_local_entry; + struct tt_global_entry *tt_global_entry; + int required_bytes; + + spin_lock_bh(&bat_priv->tt_lhash_lock); + tt_local_entry = tt_local_hash_find(bat_priv, addr); + spin_unlock_bh(&bat_priv->tt_lhash_lock); + + if (tt_local_entry) { + tt_local_entry->last_seen = jiffies; + return; + } + + /* only announce as many hosts as possible in the batman-packet and + space in batman_packet->num_tt That also should give a limit to + MAC-flooding. 
*/ + required_bytes = (bat_priv->num_local_tt + 1) * ETH_ALEN; + required_bytes += BAT_PACKET_LEN; + + if ((required_bytes > ETH_DATA_LEN) || + (atomic_read(&bat_priv->aggregated_ogms) && + required_bytes > MAX_AGGREGATION_BYTES) || + (bat_priv->num_local_tt + 1 > 255)) { + bat_dbg(DBG_ROUTES, bat_priv, + "Can't add new local tt entry (%pM): " + "number of local tt entries exceeds packet size\n", + addr); + return; + } + + bat_dbg(DBG_ROUTES, bat_priv, + "Creating new local tt entry: %pM\n", addr); + + tt_local_entry = kmalloc(sizeof(struct tt_local_entry), GFP_ATOMIC); + if (!tt_local_entry) + return; + + memcpy(tt_local_entry->addr, addr, ETH_ALEN); + tt_local_entry->last_seen = jiffies; + + /* the batman interface mac address should never be purged */ + if (compare_eth(addr, soft_iface->dev_addr)) + tt_local_entry->never_purge = 1; + else + tt_local_entry->never_purge = 0; + + spin_lock_bh(&bat_priv->tt_lhash_lock); + + hash_add(bat_priv->tt_local_hash, compare_ltt, choose_orig, + tt_local_entry, &tt_local_entry->hash_entry); + bat_priv->num_local_tt++; + atomic_set(&bat_priv->tt_local_changed, 1); + + spin_unlock_bh(&bat_priv->tt_lhash_lock); + + /* remove address from global hash if present */ + spin_lock_bh(&bat_priv->tt_ghash_lock); + + tt_global_entry = tt_global_hash_find(bat_priv, addr); + + if (tt_global_entry) + _tt_global_del_orig(bat_priv, tt_global_entry, + "local tt received"); + + spin_unlock_bh(&bat_priv->tt_ghash_lock); +} + +int tt_local_fill_buffer(struct bat_priv *bat_priv, + unsigned char *buff, int buff_len) +{ + struct hashtable_t *hash = bat_priv->tt_local_hash; + struct tt_local_entry *tt_local_entry; + struct hlist_node *node; + struct hlist_head *head; + int i, count = 0; + + spin_lock_bh(&bat_priv->tt_lhash_lock); + + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tt_local_entry, node, + head, hash_entry) { + if (buff_len < (count + 1) * ETH_ALEN) + break; + + memcpy(buff + (count * ETH_ALEN), tt_local_entry->addr, + ETH_ALEN); + + count++; + } + rcu_read_unlock(); + } + + /* if we did not get all new local tts see you next time ;-) */ + if (count == bat_priv->num_local_tt) + atomic_set(&bat_priv->tt_local_changed, 0); + + spin_unlock_bh(&bat_priv->tt_lhash_lock); + return count; +} + +int tt_local_seq_print_text(struct seq_file *seq, void *offset) +{ + struct net_device *net_dev = (struct net_device *)seq->private; + struct bat_priv *bat_priv = netdev_priv(net_dev); + struct hashtable_t *hash = bat_priv->tt_local_hash; + struct tt_local_entry *tt_local_entry; + struct hard_iface *primary_if; + struct hlist_node *node; + struct hlist_head *head; + size_t buf_size, pos; + char *buff; + int i, ret = 0; + + primary_if = primary_if_get_selected(bat_priv); + if (!primary_if) { + ret = seq_printf(seq, "BATMAN mesh %s disabled - " + "please specify interfaces to enable it\n", + net_dev->name); + goto out; + } + + if (primary_if->if_status != IF_ACTIVE) { + ret = seq_printf(seq, "BATMAN mesh %s disabled - " + "primary interface not active\n", + net_dev->name); + goto out; + } + + seq_printf(seq, "Locally retrieved addresses (from %s) " + "announced via TT:\n", + net_dev->name); + + spin_lock_bh(&bat_priv->tt_lhash_lock); + + buf_size = 1; + /* Estimate length for: " * xx:xx:xx:xx:xx:xx\n" */ + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + rcu_read_lock(); + __hlist_for_each_rcu(node, head) + buf_size += 21; + rcu_read_unlock(); + } + + buff = kmalloc(buf_size, GFP_ATOMIC); + if (!buff) { 
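+		/* allocation failed: drop the local hash lock before returning -ENOMEM */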
+ spin_unlock_bh(&bat_priv->tt_lhash_lock); + ret = -ENOMEM; + goto out; + } + + buff[0] = '\0'; + pos = 0; + + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tt_local_entry, node, + head, hash_entry) { + pos += snprintf(buff + pos, 22, " * %pM\n", + tt_local_entry->addr); + } + rcu_read_unlock(); + } + + spin_unlock_bh(&bat_priv->tt_lhash_lock); + + seq_printf(seq, "%s", buff); + kfree(buff); +out: + if (primary_if) + hardif_free_ref(primary_if); + return ret; +} + +static void _tt_local_del(struct hlist_node *node, void *arg) +{ + struct bat_priv *bat_priv = (struct bat_priv *)arg; + void *data = container_of(node, struct tt_local_entry, hash_entry); + + kfree(data); + bat_priv->num_local_tt--; + atomic_set(&bat_priv->tt_local_changed, 1); +} + +static void tt_local_del(struct bat_priv *bat_priv, + struct tt_local_entry *tt_local_entry, + char *message) +{ + bat_dbg(DBG_ROUTES, bat_priv, "Deleting local tt entry (%pM): %s\n", + tt_local_entry->addr, message); + + hash_remove(bat_priv->tt_local_hash, compare_ltt, choose_orig, + tt_local_entry->addr); + _tt_local_del(&tt_local_entry->hash_entry, bat_priv); +} + +void tt_local_remove(struct bat_priv *bat_priv, + uint8_t *addr, char *message) +{ + struct tt_local_entry *tt_local_entry; + + spin_lock_bh(&bat_priv->tt_lhash_lock); + + tt_local_entry = tt_local_hash_find(bat_priv, addr); + + if (tt_local_entry) + tt_local_del(bat_priv, tt_local_entry, message); + + spin_unlock_bh(&bat_priv->tt_lhash_lock); +} + +static void tt_local_purge(struct work_struct *work) +{ + struct delayed_work *delayed_work = + container_of(work, struct delayed_work, work); + struct bat_priv *bat_priv = + container_of(delayed_work, struct bat_priv, tt_work); + struct hashtable_t *hash = bat_priv->tt_local_hash; + struct tt_local_entry *tt_local_entry; + struct hlist_node *node, *node_tmp; + struct hlist_head *head; + unsigned long timeout; + int i; + + spin_lock_bh(&bat_priv->tt_lhash_lock); + + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + hlist_for_each_entry_safe(tt_local_entry, node, node_tmp, + head, hash_entry) { + if (tt_local_entry->never_purge) + continue; + + timeout = tt_local_entry->last_seen; + timeout += TT_LOCAL_TIMEOUT * HZ; + + if (time_before(jiffies, timeout)) + continue; + + tt_local_del(bat_priv, tt_local_entry, + "address timed out"); + } + } + + spin_unlock_bh(&bat_priv->tt_lhash_lock); + tt_local_start_timer(bat_priv); +} + +void tt_local_free(struct bat_priv *bat_priv) +{ + if (!bat_priv->tt_local_hash) + return; + + cancel_delayed_work_sync(&bat_priv->tt_work); + hash_delete(bat_priv->tt_local_hash, _tt_local_del, bat_priv); + bat_priv->tt_local_hash = NULL; +} + +int tt_global_init(struct bat_priv *bat_priv) +{ + if (bat_priv->tt_global_hash) + return 1; + + bat_priv->tt_global_hash = hash_new(1024); + + if (!bat_priv->tt_global_hash) + return 0; + + return 1; +} + +void tt_global_add_orig(struct bat_priv *bat_priv, + struct orig_node *orig_node, + unsigned char *tt_buff, int tt_buff_len) +{ + struct tt_global_entry *tt_global_entry; + struct tt_local_entry *tt_local_entry; + int tt_buff_count = 0; + unsigned char *tt_ptr; + + while ((tt_buff_count + 1) * ETH_ALEN <= tt_buff_len) { + spin_lock_bh(&bat_priv->tt_ghash_lock); + + tt_ptr = tt_buff + (tt_buff_count * ETH_ALEN); + tt_global_entry = tt_global_hash_find(bat_priv, tt_ptr); + + if (!tt_global_entry) { + spin_unlock_bh(&bat_priv->tt_ghash_lock); + + tt_global_entry = + kmalloc(sizeof(struct 
tt_global_entry), + GFP_ATOMIC); + + if (!tt_global_entry) + break; + + memcpy(tt_global_entry->addr, tt_ptr, ETH_ALEN); + + bat_dbg(DBG_ROUTES, bat_priv, + "Creating new global tt entry: " + "%pM (via %pM)\n", + tt_global_entry->addr, orig_node->orig); + + spin_lock_bh(&bat_priv->tt_ghash_lock); + hash_add(bat_priv->tt_global_hash, compare_gtt, + choose_orig, tt_global_entry, + &tt_global_entry->hash_entry); + + } + + tt_global_entry->orig_node = orig_node; + spin_unlock_bh(&bat_priv->tt_ghash_lock); + + /* remove address from local hash if present */ + spin_lock_bh(&bat_priv->tt_lhash_lock); + + tt_ptr = tt_buff + (tt_buff_count * ETH_ALEN); + tt_local_entry = tt_local_hash_find(bat_priv, tt_ptr); + + if (tt_local_entry) + tt_local_del(bat_priv, tt_local_entry, + "global tt received"); + + spin_unlock_bh(&bat_priv->tt_lhash_lock); + + tt_buff_count++; + } + + /* initialize, and overwrite if malloc succeeds */ + orig_node->tt_buff = NULL; + orig_node->tt_buff_len = 0; + + if (tt_buff_len > 0) { + orig_node->tt_buff = kmalloc(tt_buff_len, GFP_ATOMIC); + if (orig_node->tt_buff) { + memcpy(orig_node->tt_buff, tt_buff, tt_buff_len); + orig_node->tt_buff_len = tt_buff_len; + } + } +} + +int tt_global_seq_print_text(struct seq_file *seq, void *offset) +{ + struct net_device *net_dev = (struct net_device *)seq->private; + struct bat_priv *bat_priv = netdev_priv(net_dev); + struct hashtable_t *hash = bat_priv->tt_global_hash; + struct tt_global_entry *tt_global_entry; + struct hard_iface *primary_if; + struct hlist_node *node; + struct hlist_head *head; + size_t buf_size, pos; + char *buff; + int i, ret = 0; + + primary_if = primary_if_get_selected(bat_priv); + if (!primary_if) { + ret = seq_printf(seq, "BATMAN mesh %s disabled - please " + "specify interfaces to enable it\n", + net_dev->name); + goto out; + } + + if (primary_if->if_status != IF_ACTIVE) { + ret = seq_printf(seq, "BATMAN mesh %s disabled - " + "primary interface not active\n", + net_dev->name); + goto out; + } + + seq_printf(seq, + "Globally announced TT entries received via the mesh %s\n", + net_dev->name); + + spin_lock_bh(&bat_priv->tt_ghash_lock); + + buf_size = 1; + /* Estimate length for: " * xx:xx:xx:xx:xx:xx via xx:xx:xx:xx:xx:xx\n"*/ + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + rcu_read_lock(); + __hlist_for_each_rcu(node, head) + buf_size += 43; + rcu_read_unlock(); + } + + buff = kmalloc(buf_size, GFP_ATOMIC); + if (!buff) { + spin_unlock_bh(&bat_priv->tt_ghash_lock); + ret = -ENOMEM; + goto out; + } + buff[0] = '\0'; + pos = 0; + + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tt_global_entry, node, + head, hash_entry) { + pos += snprintf(buff + pos, 44, + " * %pM via %pM\n", + tt_global_entry->addr, + tt_global_entry->orig_node->orig); + } + rcu_read_unlock(); + } + + spin_unlock_bh(&bat_priv->tt_ghash_lock); + + seq_printf(seq, "%s", buff); + kfree(buff); +out: + if (primary_if) + hardif_free_ref(primary_if); + return ret; +} + +static void _tt_global_del_orig(struct bat_priv *bat_priv, + struct tt_global_entry *tt_global_entry, + char *message) +{ + bat_dbg(DBG_ROUTES, bat_priv, + "Deleting global tt entry %pM (via %pM): %s\n", + tt_global_entry->addr, tt_global_entry->orig_node->orig, + message); + + hash_remove(bat_priv->tt_global_hash, compare_gtt, choose_orig, + tt_global_entry->addr); + kfree(tt_global_entry); +} + +void tt_global_del_orig(struct bat_priv *bat_priv, + struct orig_node *orig_node, char *message) +{ + struct 
tt_global_entry *tt_global_entry; + int tt_buff_count = 0; + unsigned char *tt_ptr; + + if (orig_node->tt_buff_len == 0) + return; + + spin_lock_bh(&bat_priv->tt_ghash_lock); + + while ((tt_buff_count + 1) * ETH_ALEN <= orig_node->tt_buff_len) { + tt_ptr = orig_node->tt_buff + (tt_buff_count * ETH_ALEN); + tt_global_entry = tt_global_hash_find(bat_priv, tt_ptr); + + if ((tt_global_entry) && + (tt_global_entry->orig_node == orig_node)) + _tt_global_del_orig(bat_priv, tt_global_entry, + message); + + tt_buff_count++; + } + + spin_unlock_bh(&bat_priv->tt_ghash_lock); + + orig_node->tt_buff_len = 0; + kfree(orig_node->tt_buff); + orig_node->tt_buff = NULL; +} + +static void tt_global_del(struct hlist_node *node, void *arg) +{ + void *data = container_of(node, struct tt_global_entry, hash_entry); + + kfree(data); +} + +void tt_global_free(struct bat_priv *bat_priv) +{ + if (!bat_priv->tt_global_hash) + return; + + hash_delete(bat_priv->tt_global_hash, tt_global_del, NULL); + bat_priv->tt_global_hash = NULL; +} + +struct orig_node *transtable_search(struct bat_priv *bat_priv, uint8_t *addr) +{ + struct tt_global_entry *tt_global_entry; + struct orig_node *orig_node = NULL; + + spin_lock_bh(&bat_priv->tt_ghash_lock); + tt_global_entry = tt_global_hash_find(bat_priv, addr); + + if (!tt_global_entry) + goto out; + + if (!atomic_inc_not_zero(&tt_global_entry->orig_node->refcount)) + goto out; + + orig_node = tt_global_entry->orig_node; + +out: + spin_unlock_bh(&bat_priv->tt_ghash_lock); + return orig_node; +} diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h new file mode 100644 index 00000000..46152c38 --- /dev/null +++ b/net/batman-adv/translation-table.h @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#ifndef _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ +#define _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ + +int tt_local_init(struct bat_priv *bat_priv); +void tt_local_add(struct net_device *soft_iface, uint8_t *addr); +void tt_local_remove(struct bat_priv *bat_priv, + uint8_t *addr, char *message); +int tt_local_fill_buffer(struct bat_priv *bat_priv, + unsigned char *buff, int buff_len); +int tt_local_seq_print_text(struct seq_file *seq, void *offset); +void tt_local_free(struct bat_priv *bat_priv); +int tt_global_init(struct bat_priv *bat_priv); +void tt_global_add_orig(struct bat_priv *bat_priv, + struct orig_node *orig_node, + unsigned char *tt_buff, int tt_buff_len); +int tt_global_seq_print_text(struct seq_file *seq, void *offset); +void tt_global_del_orig(struct bat_priv *bat_priv, + struct orig_node *orig_node, char *message); +void tt_global_free(struct bat_priv *bat_priv); +struct orig_node *transtable_search(struct bat_priv *bat_priv, uint8_t *addr); + +#endif /* _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ */ diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h new file mode 100644 index 00000000..fab70e8b --- /dev/null +++ b/net/batman-adv/types.h @@ -0,0 +1,291 @@ +/* + * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors: + * + * Marek Lindner, Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + + + +#ifndef _NET_BATMAN_ADV_TYPES_H_ +#define _NET_BATMAN_ADV_TYPES_H_ + +#include "packet.h" +#include "bitarray.h" + +#define BAT_HEADER_LEN (sizeof(struct ethhdr) + \ + ((sizeof(struct unicast_packet) > sizeof(struct bcast_packet) ? 
\ + sizeof(struct unicast_packet) : \ + sizeof(struct bcast_packet)))) + + +struct hard_iface { + struct list_head list; + int16_t if_num; + char if_status; + struct net_device *net_dev; + atomic_t seqno; + atomic_t frag_seqno; + unsigned char *packet_buff; + int packet_len; + struct kobject *hardif_obj; + atomic_t refcount; + struct packet_type batman_adv_ptype; + struct net_device *soft_iface; + struct rcu_head rcu; +}; + +/** + * orig_node - structure for orig_list maintaining nodes of mesh + * @primary_addr: hosts primary interface address + * @last_valid: when last packet from this node was received + * @bcast_seqno_reset: time when the broadcast seqno window was reset + * @batman_seqno_reset: time when the batman seqno window was reset + * @gw_flags: flags related to gateway class + * @flags: for now only VIS_SERVER flag + * @last_real_seqno: last and best known squence number + * @last_ttl: ttl of last received packet + * @last_bcast_seqno: last broadcast sequence number received by this host + * + * @candidates: how many candidates are available + * @selected: next bonding candidate + */ +struct orig_node { + uint8_t orig[ETH_ALEN]; + uint8_t primary_addr[ETH_ALEN]; + struct neigh_node __rcu *router; /* rcu protected pointer */ + unsigned long *bcast_own; + uint8_t *bcast_own_sum; + unsigned long last_valid; + unsigned long bcast_seqno_reset; + unsigned long batman_seqno_reset; + uint8_t gw_flags; + uint8_t flags; + unsigned char *tt_buff; + int16_t tt_buff_len; + uint32_t last_real_seqno; + uint8_t last_ttl; + unsigned long bcast_bits[NUM_WORDS]; + uint32_t last_bcast_seqno; + struct hlist_head neigh_list; + struct list_head frag_list; + spinlock_t neigh_list_lock; /* protects neigh_list and router */ + atomic_t refcount; + struct rcu_head rcu; + struct hlist_node hash_entry; + struct bat_priv *bat_priv; + unsigned long last_frag_packet; + /* ogm_cnt_lock protects: bcast_own, bcast_own_sum, + * neigh_node->real_bits, neigh_node->real_packet_count */ + spinlock_t ogm_cnt_lock; + /* bcast_seqno_lock protects bcast_bits, last_bcast_seqno */ + spinlock_t bcast_seqno_lock; + atomic_t bond_candidates; + struct list_head bond_list; +}; + +struct gw_node { + struct hlist_node list; + struct orig_node *orig_node; + unsigned long deleted; + atomic_t refcount; + struct rcu_head rcu; +}; + +/** + * neigh_node + * @last_valid: when last packet via this neighbor was received + */ +struct neigh_node { + struct hlist_node list; + uint8_t addr[ETH_ALEN]; + uint8_t real_packet_count; + uint8_t tq_recv[TQ_GLOBAL_WINDOW_SIZE]; + uint8_t tq_index; + uint8_t tq_avg; + uint8_t last_ttl; + struct list_head bonding_list; + unsigned long last_valid; + unsigned long real_bits[NUM_WORDS]; + atomic_t refcount; + struct rcu_head rcu; + struct orig_node *orig_node; + struct hard_iface *if_incoming; + spinlock_t tq_lock; /* protects: tq_recv, tq_index */ +}; + + +struct bat_priv { + atomic_t mesh_state; + struct net_device_stats stats; + atomic_t aggregated_ogms; /* boolean */ + atomic_t bonding; /* boolean */ + atomic_t fragmentation; /* boolean */ + atomic_t vis_mode; /* VIS_TYPE_* */ + atomic_t gw_mode; /* GW_MODE_* */ + atomic_t gw_sel_class; /* uint */ + atomic_t gw_bandwidth; /* gw bandwidth */ + atomic_t orig_interval; /* uint */ + atomic_t hop_penalty; /* uint */ + atomic_t log_level; /* uint */ + atomic_t bcast_seqno; + atomic_t bcast_queue_left; + atomic_t batman_queue_left; + char num_ifaces; + struct debug_log *debug_log; + struct kobject *mesh_obj; + struct dentry *debug_dir; + struct hlist_head 
forw_bat_list; + struct hlist_head forw_bcast_list; + struct hlist_head gw_list; + struct hlist_head softif_neigh_vids; + struct list_head vis_send_list; + struct hashtable_t *orig_hash; + struct hashtable_t *tt_local_hash; + struct hashtable_t *tt_global_hash; + struct hashtable_t *vis_hash; + spinlock_t forw_bat_list_lock; /* protects forw_bat_list */ + spinlock_t forw_bcast_list_lock; /* protects */ + spinlock_t tt_lhash_lock; /* protects tt_local_hash */ + spinlock_t tt_ghash_lock; /* protects tt_global_hash */ + spinlock_t gw_list_lock; /* protects gw_list and curr_gw */ + spinlock_t vis_hash_lock; /* protects vis_hash */ + spinlock_t vis_list_lock; /* protects vis_info::recv_list */ + spinlock_t softif_neigh_lock; /* protects soft-interface neigh list */ + spinlock_t softif_neigh_vid_lock; /* protects soft-interface vid list */ + int16_t num_local_tt; + atomic_t tt_local_changed; + struct delayed_work tt_work; + struct delayed_work orig_work; + struct delayed_work vis_work; + struct gw_node __rcu *curr_gw; /* rcu protected pointer */ + struct hard_iface __rcu *primary_if; /* rcu protected pointer */ + struct vis_info *my_vis_info; +}; + +struct socket_client { + struct list_head queue_list; + unsigned int queue_len; + unsigned char index; + spinlock_t lock; /* protects queue_list, queue_len, index */ + wait_queue_head_t queue_wait; + struct bat_priv *bat_priv; +}; + +struct socket_packet { + struct list_head list; + size_t icmp_len; + struct icmp_packet_rr icmp_packet; +}; + +struct tt_local_entry { + uint8_t addr[ETH_ALEN]; + unsigned long last_seen; + char never_purge; + struct hlist_node hash_entry; +}; + +struct tt_global_entry { + uint8_t addr[ETH_ALEN]; + struct orig_node *orig_node; + struct hlist_node hash_entry; +}; + +/** + * forw_packet - structure for forw_list maintaining packets to be + * send/forwarded + */ +struct forw_packet { + struct hlist_node list; + unsigned long send_time; + uint8_t own; + struct sk_buff *skb; + uint16_t packet_len; + uint32_t direct_link_flags; + uint8_t num_packets; + struct delayed_work delayed_work; + struct hard_iface *if_incoming; +}; + +/* While scanning for vis-entries of a particular vis-originator + * this list collects its interfaces to create a subgraph/cluster + * out of them later + */ +struct if_list_entry { + uint8_t addr[ETH_ALEN]; + bool primary; + struct hlist_node list; +}; + +struct debug_log { + char log_buff[LOG_BUF_LEN]; + unsigned long log_start; + unsigned long log_end; + spinlock_t lock; /* protects log_buff, log_start and log_end */ + wait_queue_head_t queue_wait; +}; + +struct frag_packet_list_entry { + struct list_head list; + uint16_t seqno; + struct sk_buff *skb; +}; + +struct vis_info { + unsigned long first_seen; + struct list_head recv_list; + /* list of server-neighbors we received a vis-packet + * from. we should not reply to them. */ + struct list_head send_list; + struct kref refcount; + struct hlist_node hash_entry; + struct bat_priv *bat_priv; + /* this packet might be part of the vis send queue. 
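+	 * (while queued, send_list_add() holds an extra reference on refcount)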
*/ + struct sk_buff *skb_packet; + /* vis_info may follow here*/ +} __packed; + +struct vis_info_entry { + uint8_t src[ETH_ALEN]; + uint8_t dest[ETH_ALEN]; + uint8_t quality; /* quality = 0 client */ +} __packed; + +struct recvlist_node { + struct list_head list; + uint8_t mac[ETH_ALEN]; +}; + +struct softif_neigh_vid { + struct hlist_node list; + struct bat_priv *bat_priv; + short vid; + atomic_t refcount; + struct softif_neigh __rcu *softif_neigh; + struct rcu_head rcu; + struct hlist_head softif_neigh_list; +}; + +struct softif_neigh { + struct hlist_node list; + uint8_t addr[ETH_ALEN]; + unsigned long last_seen; + atomic_t refcount; + struct rcu_head rcu; +}; + +#endif /* _NET_BATMAN_ADV_TYPES_H_ */ diff --git a/net/batman-adv/unicast.c b/net/batman-adv/unicast.c new file mode 100644 index 00000000..19c3daf3 --- /dev/null +++ b/net/batman-adv/unicast.c @@ -0,0 +1,355 @@ +/* + * Copyright (C) 2010-2011 B.A.T.M.A.N. contributors: + * + * Andreas Langer + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#include "main.h" +#include "unicast.h" +#include "send.h" +#include "soft-interface.h" +#include "gateway_client.h" +#include "originator.h" +#include "hash.h" +#include "translation-table.h" +#include "routing.h" +#include "hard-interface.h" + + +static struct sk_buff *frag_merge_packet(struct list_head *head, + struct frag_packet_list_entry *tfp, + struct sk_buff *skb) +{ + struct unicast_frag_packet *up = + (struct unicast_frag_packet *)skb->data; + struct sk_buff *tmp_skb; + struct unicast_packet *unicast_packet; + int hdr_len = sizeof(struct unicast_packet); + int uni_diff = sizeof(struct unicast_frag_packet) - hdr_len; + + /* set skb to the first part and tmp_skb to the second part */ + if (up->flags & UNI_FRAG_HEAD) { + tmp_skb = tfp->skb; + } else { + tmp_skb = skb; + skb = tfp->skb; + } + + if (skb_linearize(skb) < 0 || skb_linearize(tmp_skb) < 0) + goto err; + + skb_pull(tmp_skb, sizeof(struct unicast_frag_packet)); + if (pskb_expand_head(skb, 0, tmp_skb->len, GFP_ATOMIC) < 0) + goto err; + + /* move free entry to end */ + tfp->skb = NULL; + tfp->seqno = 0; + list_move_tail(&tfp->list, head); + + memcpy(skb_put(skb, tmp_skb->len), tmp_skb->data, tmp_skb->len); + kfree_skb(tmp_skb); + + memmove(skb->data + uni_diff, skb->data, hdr_len); + unicast_packet = (struct unicast_packet *) skb_pull(skb, uni_diff); + unicast_packet->packet_type = BAT_UNICAST; + + return skb; + +err: + /* free buffered skb, skb will be freed later */ + kfree_skb(tfp->skb); + return NULL; +} + +static void frag_create_entry(struct list_head *head, struct sk_buff *skb) +{ + struct frag_packet_list_entry *tfp; + struct unicast_frag_packet *up = + (struct unicast_frag_packet *)skb->data; + + /* free and oldest packets stand at the end */ + tfp = list_entry((head)->prev, typeof(*tfp), list); + kfree_skb(tfp->skb); + + tfp->seqno = ntohs(up->seqno); + tfp->skb = skb; + list_move(&tfp->list, 
head); + return; +} + +static int frag_create_buffer(struct list_head *head) +{ + int i; + struct frag_packet_list_entry *tfp; + + for (i = 0; i < FRAG_BUFFER_SIZE; i++) { + tfp = kmalloc(sizeof(struct frag_packet_list_entry), + GFP_ATOMIC); + if (!tfp) { + frag_list_free(head); + return -ENOMEM; + } + tfp->skb = NULL; + tfp->seqno = 0; + INIT_LIST_HEAD(&tfp->list); + list_add(&tfp->list, head); + } + + return 0; +} + +static struct frag_packet_list_entry *frag_search_packet(struct list_head *head, + struct unicast_frag_packet *up) +{ + struct frag_packet_list_entry *tfp; + struct unicast_frag_packet *tmp_up = NULL; + uint16_t search_seqno; + + if (up->flags & UNI_FRAG_HEAD) + search_seqno = ntohs(up->seqno)+1; + else + search_seqno = ntohs(up->seqno)-1; + + list_for_each_entry(tfp, head, list) { + + if (!tfp->skb) + continue; + + if (tfp->seqno == ntohs(up->seqno)) + goto mov_tail; + + tmp_up = (struct unicast_frag_packet *)tfp->skb->data; + + if (tfp->seqno == search_seqno) { + + if ((tmp_up->flags & UNI_FRAG_HEAD) != + (up->flags & UNI_FRAG_HEAD)) + return tfp; + else + goto mov_tail; + } + } + return NULL; + +mov_tail: + list_move_tail(&tfp->list, head); + return NULL; +} + +void frag_list_free(struct list_head *head) +{ + struct frag_packet_list_entry *pf, *tmp_pf; + + if (!list_empty(head)) { + + list_for_each_entry_safe(pf, tmp_pf, head, list) { + kfree_skb(pf->skb); + list_del(&pf->list); + kfree(pf); + } + } + return; +} + +/* frag_reassemble_skb(): + * returns NET_RX_DROP if the operation failed - skb is left intact + * returns NET_RX_SUCCESS if the fragment was buffered (skb_new will be NULL) + * or the skb could be reassembled (skb_new will point to the new packet and + * skb was freed) + */ +int frag_reassemble_skb(struct sk_buff *skb, struct bat_priv *bat_priv, + struct sk_buff **new_skb) +{ + struct orig_node *orig_node; + struct frag_packet_list_entry *tmp_frag_entry; + int ret = NET_RX_DROP; + struct unicast_frag_packet *unicast_packet = + (struct unicast_frag_packet *)skb->data; + + *new_skb = NULL; + + orig_node = orig_hash_find(bat_priv, unicast_packet->orig); + if (!orig_node) + goto out; + + orig_node->last_frag_packet = jiffies; + + if (list_empty(&orig_node->frag_list) && + frag_create_buffer(&orig_node->frag_list)) { + pr_debug("couldn't create frag buffer\n"); + goto out; + } + + tmp_frag_entry = frag_search_packet(&orig_node->frag_list, + unicast_packet); + + if (!tmp_frag_entry) { + frag_create_entry(&orig_node->frag_list, skb); + ret = NET_RX_SUCCESS; + goto out; + } + + *new_skb = frag_merge_packet(&orig_node->frag_list, tmp_frag_entry, + skb); + /* if not, merge failed */ + if (*new_skb) + ret = NET_RX_SUCCESS; + +out: + if (orig_node) + orig_node_free_ref(orig_node); + return ret; +} + +int frag_send_skb(struct sk_buff *skb, struct bat_priv *bat_priv, + struct hard_iface *hard_iface, uint8_t dstaddr[]) +{ + struct unicast_packet tmp_uc, *unicast_packet; + struct hard_iface *primary_if; + struct sk_buff *frag_skb; + struct unicast_frag_packet *frag1, *frag2; + int uc_hdr_len = sizeof(struct unicast_packet); + int ucf_hdr_len = sizeof(struct unicast_frag_packet); + int data_len = skb->len - uc_hdr_len; + int large_tail = 0, ret = NET_RX_DROP; + uint16_t seqno; + + primary_if = primary_if_get_selected(bat_priv); + if (!primary_if) + goto dropped; + + frag_skb = dev_alloc_skb(data_len - (data_len / 2) + ucf_hdr_len); + if (!frag_skb) + goto dropped; + skb_reserve(frag_skb, ucf_hdr_len); + + unicast_packet = (struct unicast_packet *) skb->data; + memcpy(&tmp_uc, 
unicast_packet, uc_hdr_len); + skb_split(skb, frag_skb, data_len / 2 + uc_hdr_len); + + if (my_skb_head_push(skb, ucf_hdr_len - uc_hdr_len) < 0 || + my_skb_head_push(frag_skb, ucf_hdr_len) < 0) + goto drop_frag; + + frag1 = (struct unicast_frag_packet *)skb->data; + frag2 = (struct unicast_frag_packet *)frag_skb->data; + + memcpy(frag1, &tmp_uc, sizeof(struct unicast_packet)); + + frag1->ttl--; + frag1->version = COMPAT_VERSION; + frag1->packet_type = BAT_UNICAST_FRAG; + + memcpy(frag1->orig, primary_if->net_dev->dev_addr, ETH_ALEN); + memcpy(frag2, frag1, sizeof(struct unicast_frag_packet)); + + if (data_len & 1) + large_tail = UNI_FRAG_LARGETAIL; + + frag1->flags = UNI_FRAG_HEAD | large_tail; + frag2->flags = large_tail; + + seqno = atomic_add_return(2, &hard_iface->frag_seqno); + frag1->seqno = htons(seqno - 1); + frag2->seqno = htons(seqno); + + send_skb_packet(skb, hard_iface, dstaddr); + send_skb_packet(frag_skb, hard_iface, dstaddr); + ret = NET_RX_SUCCESS; + goto out; + +drop_frag: + kfree_skb(frag_skb); +dropped: + kfree_skb(skb); +out: + if (primary_if) + hardif_free_ref(primary_if); + return ret; +} + +int unicast_send_skb(struct sk_buff *skb, struct bat_priv *bat_priv) +{ + struct ethhdr *ethhdr = (struct ethhdr *)skb->data; + struct unicast_packet *unicast_packet; + struct orig_node *orig_node; + struct neigh_node *neigh_node; + int data_len = skb->len; + int ret = 1; + + /* get routing information */ + if (is_multicast_ether_addr(ethhdr->h_dest)) { + orig_node = (struct orig_node *)gw_get_selected_orig(bat_priv); + if (orig_node) + goto find_router; + } + + /* check for tt host - increases orig_node refcount */ + orig_node = transtable_search(bat_priv, ethhdr->h_dest); + +find_router: + /** + * find_router(): + * - if orig_node is NULL it returns NULL + * - increases neigh_nodes refcount if found. + */ + neigh_node = find_router(bat_priv, orig_node, NULL); + + if (!neigh_node) + goto out; + + if (neigh_node->if_incoming->if_status != IF_ACTIVE) + goto out; + + if (my_skb_head_push(skb, sizeof(struct unicast_packet)) < 0) + goto out; + + unicast_packet = (struct unicast_packet *)skb->data; + + unicast_packet->version = COMPAT_VERSION; + /* batman packet type: unicast */ + unicast_packet->packet_type = BAT_UNICAST; + /* set unicast ttl */ + unicast_packet->ttl = TTL; + /* copy the destination for faster routing */ + memcpy(unicast_packet->dest, orig_node->orig, ETH_ALEN); + + if (atomic_read(&bat_priv->fragmentation) && + data_len + sizeof(struct unicast_packet) > + neigh_node->if_incoming->net_dev->mtu) { + /* send frag skb decreases ttl */ + unicast_packet->ttl++; + ret = frag_send_skb(skb, bat_priv, + neigh_node->if_incoming, neigh_node->addr); + goto out; + } + + send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr); + ret = 0; + goto out; + +out: + if (neigh_node) + neigh_node_free_ref(neigh_node); + if (orig_node) + orig_node_free_ref(orig_node); + if (ret == 1) + kfree_skb(skb); + return ret; +} diff --git a/net/batman-adv/unicast.h b/net/batman-adv/unicast.h new file mode 100644 index 00000000..16ad7a92 --- /dev/null +++ b/net/batman-adv/unicast.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2010-2011 B.A.T.M.A.N. contributors: + * + * Andreas Langer + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#ifndef _NET_BATMAN_ADV_UNICAST_H_ +#define _NET_BATMAN_ADV_UNICAST_H_ + +#include "packet.h" + +#define FRAG_TIMEOUT 10000 /* purge frag list entrys after time in ms */ +#define FRAG_BUFFER_SIZE 6 /* number of list elements in buffer */ + +int frag_reassemble_skb(struct sk_buff *skb, struct bat_priv *bat_priv, + struct sk_buff **new_skb); +void frag_list_free(struct list_head *head); +int unicast_send_skb(struct sk_buff *skb, struct bat_priv *bat_priv); +int frag_send_skb(struct sk_buff *skb, struct bat_priv *bat_priv, + struct hard_iface *hard_iface, uint8_t dstaddr[]); + +static inline int frag_can_reassemble(struct sk_buff *skb, int mtu) +{ + struct unicast_frag_packet *unicast_packet; + int uneven_correction = 0; + unsigned int merged_size; + + unicast_packet = (struct unicast_frag_packet *)skb->data; + + if (unicast_packet->flags & UNI_FRAG_LARGETAIL) { + if (unicast_packet->flags & UNI_FRAG_HEAD) + uneven_correction = 1; + else + uneven_correction = -1; + } + + merged_size = (skb->len - sizeof(struct unicast_frag_packet)) * 2; + merged_size += sizeof(struct unicast_packet) + uneven_correction; + + return merged_size <= mtu; +} + +#endif /* _NET_BATMAN_ADV_UNICAST_H_ */ diff --git a/net/batman-adv/vis.c b/net/batman-adv/vis.c new file mode 100644 index 00000000..c39f20cc --- /dev/null +++ b/net/batman-adv/vis.c @@ -0,0 +1,993 @@ +/* + * Copyright (C) 2008-2011 B.A.T.M.A.N. contributors: + * + * Simon Wunderlich + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#include "main.h" +#include "send.h" +#include "translation-table.h" +#include "vis.h" +#include "soft-interface.h" +#include "hard-interface.h" +#include "hash.h" +#include "originator.h" + +#define MAX_VIS_PACKET_SIZE 1000 + +/* Returns the smallest signed integer in two's complement with the sizeof x */ +#define smallest_signed_int(x) (1u << (7u + 8u * (sizeof(x) - 1u))) + +/* Checks if a sequence number x is a predecessor/successor of y. + * they handle overflows/underflows and can correctly check for a + * predecessor/successor unless the variable sequence number has grown by + * more then 2**(bitwidth(x)-1)-1. 
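+ * (i.e. by more than half of the available sequence number space).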
+ * This means that for a uint8_t with the maximum value 255, it would think: + * - when adding nothing - it is neither a predecessor nor a successor + * - before adding more than 127 to the starting value - it is a predecessor, + * - when adding 128 - it is neither a predecessor nor a successor, + * - after adding more than 127 to the starting value - it is a successor */ +#define seq_before(x, y) ({typeof(x) _dummy = (x - y); \ + _dummy > smallest_signed_int(_dummy); }) +#define seq_after(x, y) seq_before(y, x) + +static void start_vis_timer(struct bat_priv *bat_priv); + +/* free the info */ +static void free_info(struct kref *ref) +{ + struct vis_info *info = container_of(ref, struct vis_info, refcount); + struct bat_priv *bat_priv = info->bat_priv; + struct recvlist_node *entry, *tmp; + + list_del_init(&info->send_list); + spin_lock_bh(&bat_priv->vis_list_lock); + list_for_each_entry_safe(entry, tmp, &info->recv_list, list) { + list_del(&entry->list); + kfree(entry); + } + + spin_unlock_bh(&bat_priv->vis_list_lock); + kfree_skb(info->skb_packet); + kfree(info); +} + +/* Compare two vis packets, used by the hashing algorithm */ +static int vis_info_cmp(struct hlist_node *node, void *data2) +{ + struct vis_info *d1, *d2; + struct vis_packet *p1, *p2; + + d1 = container_of(node, struct vis_info, hash_entry); + d2 = data2; + p1 = (struct vis_packet *)d1->skb_packet->data; + p2 = (struct vis_packet *)d2->skb_packet->data; + return compare_eth(p1->vis_orig, p2->vis_orig); +} + +/* hash function to choose an entry in a hash table of given size */ +/* hash algorithm from http://en.wikipedia.org/wiki/Hash_table */ +static int vis_info_choose(void *data, int size) +{ + struct vis_info *vis_info = data; + struct vis_packet *packet; + unsigned char *key; + uint32_t hash = 0; + size_t i; + + packet = (struct vis_packet *)vis_info->skb_packet->data; + key = packet->vis_orig; + for (i = 0; i < ETH_ALEN; i++) { + hash += key[i]; + hash += (hash << 10); + hash ^= (hash >> 6); + } + + hash += (hash << 3); + hash ^= (hash >> 11); + hash += (hash << 15); + + return hash % size; +} + +static struct vis_info *vis_hash_find(struct bat_priv *bat_priv, + void *data) +{ + struct hashtable_t *hash = bat_priv->vis_hash; + struct hlist_head *head; + struct hlist_node *node; + struct vis_info *vis_info, *vis_info_tmp = NULL; + int index; + + if (!hash) + return NULL; + + index = vis_info_choose(data, hash->size); + head = &hash->table[index]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(vis_info, node, head, hash_entry) { + if (!vis_info_cmp(node, data)) + continue; + + vis_info_tmp = vis_info; + break; + } + rcu_read_unlock(); + + return vis_info_tmp; +} + +/* insert interface to the list of interfaces of one originator, if it + * does not already exist in the list */ +static void vis_data_insert_interface(const uint8_t *interface, + struct hlist_head *if_list, + bool primary) +{ + struct if_list_entry *entry; + struct hlist_node *pos; + + hlist_for_each_entry(entry, pos, if_list, list) { + if (compare_eth(entry->addr, (void *)interface)) + return; + } + + /* its a new address, add it to the list */ + entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + if (!entry) + return; + memcpy(entry->addr, interface, ETH_ALEN); + entry->primary = primary; + hlist_add_head(&entry->list, if_list); +} + +static ssize_t vis_data_read_prim_sec(char *buff, struct hlist_head *if_list) +{ + struct if_list_entry *entry; + struct hlist_node *pos; + size_t len = 0; + + hlist_for_each_entry(entry, pos, if_list, list) { + if 
(entry->primary) + len += sprintf(buff + len, "PRIMARY, "); + else + len += sprintf(buff + len, "SEC %pM, ", entry->addr); + } + + return len; +} + +static size_t vis_data_count_prim_sec(struct hlist_head *if_list) +{ + struct if_list_entry *entry; + struct hlist_node *pos; + size_t count = 0; + + hlist_for_each_entry(entry, pos, if_list, list) { + if (entry->primary) + count += 9; + else + count += 23; + } + + return count; +} + +/* read an entry */ +static ssize_t vis_data_read_entry(char *buff, struct vis_info_entry *entry, + uint8_t *src, bool primary) +{ + /* maximal length: max(4+17+2, 3+17+1+3+2) == 26 */ + if (primary && entry->quality == 0) + return sprintf(buff, "TT %pM, ", entry->dest); + else if (compare_eth(entry->src, src)) + return sprintf(buff, "TQ %pM %d, ", entry->dest, + entry->quality); + + return 0; +} + +int vis_seq_print_text(struct seq_file *seq, void *offset) +{ + struct hard_iface *primary_if; + struct hlist_node *node; + struct hlist_head *head; + struct vis_info *info; + struct vis_packet *packet; + struct vis_info_entry *entries; + struct net_device *net_dev = (struct net_device *)seq->private; + struct bat_priv *bat_priv = netdev_priv(net_dev); + struct hashtable_t *hash = bat_priv->vis_hash; + HLIST_HEAD(vis_if_list); + struct if_list_entry *entry; + struct hlist_node *pos, *n; + int i, j, ret = 0; + int vis_server = atomic_read(&bat_priv->vis_mode); + size_t buff_pos, buf_size; + char *buff; + int compare; + + primary_if = primary_if_get_selected(bat_priv); + if (!primary_if) + goto out; + + if (vis_server == VIS_TYPE_CLIENT_UPDATE) + goto out; + + buf_size = 1; + /* Estimate length */ + spin_lock_bh(&bat_priv->vis_hash_lock); + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(info, node, head, hash_entry) { + packet = (struct vis_packet *)info->skb_packet->data; + entries = (struct vis_info_entry *) + ((char *)packet + sizeof(struct vis_packet)); + + for (j = 0; j < packet->entries; j++) { + if (entries[j].quality == 0) + continue; + compare = + compare_eth(entries[j].src, packet->vis_orig); + vis_data_insert_interface(entries[j].src, + &vis_if_list, + compare); + } + + hlist_for_each_entry(entry, pos, &vis_if_list, list) { + buf_size += 18 + 26 * packet->entries; + + /* add primary/secondary records */ + if (compare_eth(entry->addr, packet->vis_orig)) + buf_size += + vis_data_count_prim_sec(&vis_if_list); + + buf_size += 1; + } + + hlist_for_each_entry_safe(entry, pos, n, &vis_if_list, + list) { + hlist_del(&entry->list); + kfree(entry); + } + } + rcu_read_unlock(); + } + + buff = kmalloc(buf_size, GFP_ATOMIC); + if (!buff) { + spin_unlock_bh(&bat_priv->vis_hash_lock); + ret = -ENOMEM; + goto out; + } + buff[0] = '\0'; + buff_pos = 0; + + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(info, node, head, hash_entry) { + packet = (struct vis_packet *)info->skb_packet->data; + entries = (struct vis_info_entry *) + ((char *)packet + sizeof(struct vis_packet)); + + for (j = 0; j < packet->entries; j++) { + if (entries[j].quality == 0) + continue; + compare = + compare_eth(entries[j].src, packet->vis_orig); + vis_data_insert_interface(entries[j].src, + &vis_if_list, + compare); + } + + hlist_for_each_entry(entry, pos, &vis_if_list, list) { + buff_pos += sprintf(buff + buff_pos, "%pM,", + entry->addr); + + for (j = 0; j < packet->entries; j++) + buff_pos += vis_data_read_entry( + buff + buff_pos, + &entries[j], + entry->addr, + entry->primary); 
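+				/* vis_data_read_entry() emits either a "TT ..." or a "TQ ..." record */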
+ + /* add primary/secondary records */ + if (compare_eth(entry->addr, packet->vis_orig)) + buff_pos += + vis_data_read_prim_sec(buff + buff_pos, + &vis_if_list); + + buff_pos += sprintf(buff + buff_pos, "\n"); + } + + hlist_for_each_entry_safe(entry, pos, n, &vis_if_list, + list) { + hlist_del(&entry->list); + kfree(entry); + } + } + rcu_read_unlock(); + } + + spin_unlock_bh(&bat_priv->vis_hash_lock); + + seq_printf(seq, "%s", buff); + kfree(buff); + +out: + if (primary_if) + hardif_free_ref(primary_if); + return ret; +} + +/* add the info packet to the send list, if it was not + * already linked in. */ +static void send_list_add(struct bat_priv *bat_priv, struct vis_info *info) +{ + if (list_empty(&info->send_list)) { + kref_get(&info->refcount); + list_add_tail(&info->send_list, &bat_priv->vis_send_list); + } +} + +/* delete the info packet from the send list, if it was + * linked in. */ +static void send_list_del(struct vis_info *info) +{ + if (!list_empty(&info->send_list)) { + list_del_init(&info->send_list); + kref_put(&info->refcount, free_info); + } +} + +/* tries to add one entry to the receive list. */ +static void recv_list_add(struct bat_priv *bat_priv, + struct list_head *recv_list, char *mac) +{ + struct recvlist_node *entry; + + entry = kmalloc(sizeof(struct recvlist_node), GFP_ATOMIC); + if (!entry) + return; + + memcpy(entry->mac, mac, ETH_ALEN); + spin_lock_bh(&bat_priv->vis_list_lock); + list_add_tail(&entry->list, recv_list); + spin_unlock_bh(&bat_priv->vis_list_lock); +} + +/* returns 1 if this mac is in the recv_list */ +static int recv_list_is_in(struct bat_priv *bat_priv, + struct list_head *recv_list, char *mac) +{ + struct recvlist_node *entry; + + spin_lock_bh(&bat_priv->vis_list_lock); + list_for_each_entry(entry, recv_list, list) { + if (compare_eth(entry->mac, mac)) { + spin_unlock_bh(&bat_priv->vis_list_lock); + return 1; + } + } + spin_unlock_bh(&bat_priv->vis_list_lock); + return 0; +} + +/* try to add the packet to the vis_hash. return NULL if invalid (e.g. too old, + * broken.. ). vis hash must be locked outside. is_new is set when the packet + * is newer than old entries in the hash. */ +static struct vis_info *add_packet(struct bat_priv *bat_priv, + struct vis_packet *vis_packet, + int vis_info_len, int *is_new, + int make_broadcast) +{ + struct vis_info *info, *old_info; + struct vis_packet *search_packet, *old_packet; + struct vis_info search_elem; + struct vis_packet *packet; + int hash_added; + + *is_new = 0; + /* sanity check */ + if (!bat_priv->vis_hash) + return NULL; + + /* see if the packet is already in vis_hash */ + search_elem.skb_packet = dev_alloc_skb(sizeof(struct vis_packet)); + if (!search_elem.skb_packet) + return NULL; + search_packet = (struct vis_packet *)skb_put(search_elem.skb_packet, + sizeof(struct vis_packet)); + + memcpy(search_packet->vis_orig, vis_packet->vis_orig, ETH_ALEN); + old_info = vis_hash_find(bat_priv, &search_elem); + kfree_skb(search_elem.skb_packet); + + if (old_info) { + old_packet = (struct vis_packet *)old_info->skb_packet->data; + if (!seq_after(ntohl(vis_packet->seqno), + ntohl(old_packet->seqno))) { + if (old_packet->seqno == vis_packet->seqno) { + recv_list_add(bat_priv, &old_info->recv_list, + vis_packet->sender_orig); + return old_info; + } else { + /* newer packet is already in hash. 
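+				 * the copy we just received is older and is simply ignored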
*/ + return NULL; + } + } + /* remove old entry */ + hash_remove(bat_priv->vis_hash, vis_info_cmp, vis_info_choose, + old_info); + send_list_del(old_info); + kref_put(&old_info->refcount, free_info); + } + + info = kmalloc(sizeof(struct vis_info), GFP_ATOMIC); + if (!info) + return NULL; + + info->skb_packet = dev_alloc_skb(sizeof(struct vis_packet) + + vis_info_len + sizeof(struct ethhdr)); + if (!info->skb_packet) { + kfree(info); + return NULL; + } + skb_reserve(info->skb_packet, sizeof(struct ethhdr)); + packet = (struct vis_packet *)skb_put(info->skb_packet, + sizeof(struct vis_packet) + + vis_info_len); + + kref_init(&info->refcount); + INIT_LIST_HEAD(&info->send_list); + INIT_LIST_HEAD(&info->recv_list); + info->first_seen = jiffies; + info->bat_priv = bat_priv; + memcpy(packet, vis_packet, sizeof(struct vis_packet) + vis_info_len); + + /* initialize and add new packet. */ + *is_new = 1; + + /* Make it a broadcast packet, if required */ + if (make_broadcast) + memcpy(packet->target_orig, broadcast_addr, ETH_ALEN); + + /* repair if entries is longer than packet. */ + if (packet->entries * sizeof(struct vis_info_entry) > vis_info_len) + packet->entries = vis_info_len / sizeof(struct vis_info_entry); + + recv_list_add(bat_priv, &info->recv_list, packet->sender_orig); + + /* try to add it */ + hash_added = hash_add(bat_priv->vis_hash, vis_info_cmp, vis_info_choose, + info, &info->hash_entry); + if (hash_added < 0) { + /* did not work (for some reason) */ + kref_put(&info->refcount, free_info); + info = NULL; + } + + return info; +} + +/* handle the server sync packet, forward if needed. */ +void receive_server_sync_packet(struct bat_priv *bat_priv, + struct vis_packet *vis_packet, + int vis_info_len) +{ + struct vis_info *info; + int is_new, make_broadcast; + int vis_server = atomic_read(&bat_priv->vis_mode); + + make_broadcast = (vis_server == VIS_TYPE_SERVER_SYNC); + + spin_lock_bh(&bat_priv->vis_hash_lock); + info = add_packet(bat_priv, vis_packet, vis_info_len, + &is_new, make_broadcast); + if (!info) + goto end; + + /* only if we are server ourselves and packet is newer than the one in + * hash.*/ + if (vis_server == VIS_TYPE_SERVER_SYNC && is_new) + send_list_add(bat_priv, info); +end: + spin_unlock_bh(&bat_priv->vis_hash_lock); +} + +/* handle an incoming client update packet and schedule forward if needed. */ +void receive_client_update_packet(struct bat_priv *bat_priv, + struct vis_packet *vis_packet, + int vis_info_len) +{ + struct vis_info *info; + struct vis_packet *packet; + int is_new; + int vis_server = atomic_read(&bat_priv->vis_mode); + int are_target = 0; + + /* clients shall not broadcast. */ + if (is_broadcast_ether_addr(vis_packet->target_orig)) + return; + + /* Are we the target for this VIS packet? */ + if (vis_server == VIS_TYPE_SERVER_SYNC && + is_my_mac(vis_packet->target_orig)) + are_target = 1; + + spin_lock_bh(&bat_priv->vis_hash_lock); + info = add_packet(bat_priv, vis_packet, vis_info_len, + &is_new, are_target); + + if (!info) + goto end; + /* note that outdated packets will be dropped at this point. */ + + packet = (struct vis_packet *)info->skb_packet->data; + + /* send only if we're the target server or ... */ + if (are_target && is_new) { + packet->vis_type = VIS_TYPE_SERVER_SYNC; /* upgrade! */ + send_list_add(bat_priv, info); + + /* ... we're not the recipient (and thus need to forward). 
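+	 * (packet->target_orig is left untouched in that case)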
*/ + } else if (!is_my_mac(packet->target_orig)) { + send_list_add(bat_priv, info); + } + +end: + spin_unlock_bh(&bat_priv->vis_hash_lock); +} + +/* Walk the originators and find the VIS server with the best tq. Set the packet + * address to its address and return the best_tq. + * + * Must be called with the originator hash locked */ +static int find_best_vis_server(struct bat_priv *bat_priv, + struct vis_info *info) +{ + struct hashtable_t *hash = bat_priv->orig_hash; + struct neigh_node *router; + struct hlist_node *node; + struct hlist_head *head; + struct orig_node *orig_node; + struct vis_packet *packet; + int best_tq = -1, i; + + packet = (struct vis_packet *)info->skb_packet->data; + + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) { + router = orig_node_get_router(orig_node); + if (!router) + continue; + + if ((orig_node->flags & VIS_SERVER) && + (router->tq_avg > best_tq)) { + best_tq = router->tq_avg; + memcpy(packet->target_orig, orig_node->orig, + ETH_ALEN); + } + neigh_node_free_ref(router); + } + rcu_read_unlock(); + } + + return best_tq; +} + +/* Return true if the vis packet is full. */ +static bool vis_packet_full(struct vis_info *info) +{ + struct vis_packet *packet; + packet = (struct vis_packet *)info->skb_packet->data; + + if (MAX_VIS_PACKET_SIZE / sizeof(struct vis_info_entry) + < packet->entries + 1) + return true; + return false; +} + +/* generates a packet of own vis data, + * returns 0 on success, -1 if no packet could be generated */ +static int generate_vis_packet(struct bat_priv *bat_priv) +{ + struct hashtable_t *hash = bat_priv->orig_hash; + struct hlist_node *node; + struct hlist_head *head; + struct orig_node *orig_node; + struct neigh_node *router; + struct vis_info *info = (struct vis_info *)bat_priv->my_vis_info; + struct vis_packet *packet = (struct vis_packet *)info->skb_packet->data; + struct vis_info_entry *entry; + struct tt_local_entry *tt_local_entry; + int best_tq = -1, i; + + info->first_seen = jiffies; + packet->vis_type = atomic_read(&bat_priv->vis_mode); + + memcpy(packet->target_orig, broadcast_addr, ETH_ALEN); + packet->ttl = TTL; + packet->seqno = htonl(ntohl(packet->seqno) + 1); + packet->entries = 0; + skb_trim(info->skb_packet, sizeof(struct vis_packet)); + + if (packet->vis_type == VIS_TYPE_CLIENT_UPDATE) { + best_tq = find_best_vis_server(bat_priv, info); + + if (best_tq < 0) + return -1; + } + + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) { + router = orig_node_get_router(orig_node); + if (!router) + continue; + + if (!compare_eth(router->addr, orig_node->orig)) + goto next; + + if (router->if_incoming->if_status != IF_ACTIVE) + goto next; + + if (router->tq_avg < 1) + goto next; + + /* fill one entry into buffer. 
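+			 * (src = local receiving interface, dest = originator, quality = TQ average)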
*/ + entry = (struct vis_info_entry *) + skb_put(info->skb_packet, sizeof(*entry)); + memcpy(entry->src, + router->if_incoming->net_dev->dev_addr, + ETH_ALEN); + memcpy(entry->dest, orig_node->orig, ETH_ALEN); + entry->quality = router->tq_avg; + packet->entries++; + +next: + neigh_node_free_ref(router); + + if (vis_packet_full(info)) + goto unlock; + } + rcu_read_unlock(); + } + + hash = bat_priv->tt_local_hash; + + spin_lock_bh(&bat_priv->tt_lhash_lock); + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + hlist_for_each_entry(tt_local_entry, node, head, hash_entry) { + entry = (struct vis_info_entry *) + skb_put(info->skb_packet, + sizeof(*entry)); + memset(entry->src, 0, ETH_ALEN); + memcpy(entry->dest, tt_local_entry->addr, ETH_ALEN); + entry->quality = 0; /* 0 means TT */ + packet->entries++; + + if (vis_packet_full(info)) { + spin_unlock_bh(&bat_priv->tt_lhash_lock); + return 0; + } + } + } + + spin_unlock_bh(&bat_priv->tt_lhash_lock); + return 0; + +unlock: + rcu_read_unlock(); + return 0; +} + +/* free old vis packets. Must be called with this vis_hash_lock + * held */ +static void purge_vis_packets(struct bat_priv *bat_priv) +{ + int i; + struct hashtable_t *hash = bat_priv->vis_hash; + struct hlist_node *node, *node_tmp; + struct hlist_head *head; + struct vis_info *info; + + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + hlist_for_each_entry_safe(info, node, node_tmp, + head, hash_entry) { + /* never purge own data. */ + if (info == bat_priv->my_vis_info) + continue; + + if (time_after(jiffies, + info->first_seen + VIS_TIMEOUT * HZ)) { + hlist_del(node); + send_list_del(info); + kref_put(&info->refcount, free_info); + } + } + } +} + +static void broadcast_vis_packet(struct bat_priv *bat_priv, + struct vis_info *info) +{ + struct neigh_node *router; + struct hashtable_t *hash = bat_priv->orig_hash; + struct hlist_node *node; + struct hlist_head *head; + struct orig_node *orig_node; + struct vis_packet *packet; + struct sk_buff *skb; + struct hard_iface *hard_iface; + uint8_t dstaddr[ETH_ALEN]; + int i; + + + packet = (struct vis_packet *)info->skb_packet->data; + + /* send to all routers in range. */ + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) { + /* if it's a vis server and reachable, send it. */ + if (!(orig_node->flags & VIS_SERVER)) + continue; + + router = orig_node_get_router(orig_node); + if (!router) + continue; + + /* don't send it if we already received the packet from + * this node. 
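+			 * (its address is on info->recv_list; echoing it back would be redundant)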
*/ + if (recv_list_is_in(bat_priv, &info->recv_list, + orig_node->orig)) { + neigh_node_free_ref(router); + continue; + } + + memcpy(packet->target_orig, orig_node->orig, ETH_ALEN); + hard_iface = router->if_incoming; + memcpy(dstaddr, router->addr, ETH_ALEN); + + neigh_node_free_ref(router); + + skb = skb_clone(info->skb_packet, GFP_ATOMIC); + if (skb) + send_skb_packet(skb, hard_iface, dstaddr); + + } + rcu_read_unlock(); + } +} + +static void unicast_vis_packet(struct bat_priv *bat_priv, + struct vis_info *info) +{ + struct orig_node *orig_node; + struct neigh_node *router = NULL; + struct sk_buff *skb; + struct vis_packet *packet; + + packet = (struct vis_packet *)info->skb_packet->data; + + orig_node = orig_hash_find(bat_priv, packet->target_orig); + if (!orig_node) + goto out; + + router = orig_node_get_router(orig_node); + if (!router) + goto out; + + skb = skb_clone(info->skb_packet, GFP_ATOMIC); + if (skb) + send_skb_packet(skb, router->if_incoming, router->addr); + +out: + if (router) + neigh_node_free_ref(router); + if (orig_node) + orig_node_free_ref(orig_node); +} + +/* only send one vis packet. called from send_vis_packets() */ +static void send_vis_packet(struct bat_priv *bat_priv, struct vis_info *info) +{ + struct hard_iface *primary_if; + struct vis_packet *packet; + + primary_if = primary_if_get_selected(bat_priv); + if (!primary_if) + goto out; + + packet = (struct vis_packet *)info->skb_packet->data; + if (packet->ttl < 2) { + pr_debug("Error - can't send vis packet: ttl exceeded\n"); + goto out; + } + + memcpy(packet->sender_orig, primary_if->net_dev->dev_addr, ETH_ALEN); + packet->ttl--; + + if (is_broadcast_ether_addr(packet->target_orig)) + broadcast_vis_packet(bat_priv, info); + else + unicast_vis_packet(bat_priv, info); + packet->ttl++; /* restore TTL */ + +out: + if (primary_if) + hardif_free_ref(primary_if); +} + +/* called from timer; send (and maybe generate) vis packet. */ +static void send_vis_packets(struct work_struct *work) +{ + struct delayed_work *delayed_work = + container_of(work, struct delayed_work, work); + struct bat_priv *bat_priv = + container_of(delayed_work, struct bat_priv, vis_work); + struct vis_info *info; + + spin_lock_bh(&bat_priv->vis_hash_lock); + purge_vis_packets(bat_priv); + + if (generate_vis_packet(bat_priv) == 0) { + /* schedule if generation was successful */ + send_list_add(bat_priv, bat_priv->my_vis_info); + } + + while (!list_empty(&bat_priv->vis_send_list)) { + info = list_first_entry(&bat_priv->vis_send_list, + typeof(*info), send_list); + + kref_get(&info->refcount); + spin_unlock_bh(&bat_priv->vis_hash_lock); + + send_vis_packet(bat_priv, info); + + spin_lock_bh(&bat_priv->vis_hash_lock); + send_list_del(info); + kref_put(&info->refcount, free_info); + } + spin_unlock_bh(&bat_priv->vis_hash_lock); + start_vis_timer(bat_priv); +} + +/* init the vis server. this may only be called when if_list is already + * initialized (e.g. 
bat0 is initialized, interfaces have been added) */ +int vis_init(struct bat_priv *bat_priv) +{ + struct vis_packet *packet; + int hash_added; + + if (bat_priv->vis_hash) + return 1; + + spin_lock_bh(&bat_priv->vis_hash_lock); + + bat_priv->vis_hash = hash_new(256); + if (!bat_priv->vis_hash) { + pr_err("Can't initialize vis_hash\n"); + goto err; + } + + bat_priv->my_vis_info = kmalloc(MAX_VIS_PACKET_SIZE, GFP_ATOMIC); + if (!bat_priv->my_vis_info) { + pr_err("Can't initialize vis packet\n"); + goto err; + } + + bat_priv->my_vis_info->skb_packet = dev_alloc_skb( + sizeof(struct vis_packet) + + MAX_VIS_PACKET_SIZE + + sizeof(struct ethhdr)); + if (!bat_priv->my_vis_info->skb_packet) + goto free_info; + + skb_reserve(bat_priv->my_vis_info->skb_packet, sizeof(struct ethhdr)); + packet = (struct vis_packet *)skb_put( + bat_priv->my_vis_info->skb_packet, + sizeof(struct vis_packet)); + + /* prefill the vis info */ + bat_priv->my_vis_info->first_seen = jiffies - + msecs_to_jiffies(VIS_INTERVAL); + INIT_LIST_HEAD(&bat_priv->my_vis_info->recv_list); + INIT_LIST_HEAD(&bat_priv->my_vis_info->send_list); + kref_init(&bat_priv->my_vis_info->refcount); + bat_priv->my_vis_info->bat_priv = bat_priv; + packet->version = COMPAT_VERSION; + packet->packet_type = BAT_VIS; + packet->ttl = TTL; + packet->seqno = 0; + packet->entries = 0; + + INIT_LIST_HEAD(&bat_priv->vis_send_list); + + hash_added = hash_add(bat_priv->vis_hash, vis_info_cmp, vis_info_choose, + bat_priv->my_vis_info, + &bat_priv->my_vis_info->hash_entry); + if (hash_added < 0) { + pr_err("Can't add own vis packet into hash\n"); + /* not in hash, need to remove it manually. */ + kref_put(&bat_priv->my_vis_info->refcount, free_info); + goto err; + } + + spin_unlock_bh(&bat_priv->vis_hash_lock); + start_vis_timer(bat_priv); + return 1; + +free_info: + kfree(bat_priv->my_vis_info); + bat_priv->my_vis_info = NULL; +err: + spin_unlock_bh(&bat_priv->vis_hash_lock); + vis_quit(bat_priv); + return 0; +} + +/* Decrease the reference count on a hash item info */ +static void free_info_ref(struct hlist_node *node, void *arg) +{ + struct vis_info *info; + + info = container_of(node, struct vis_info, hash_entry); + send_list_del(info); + kref_put(&info->refcount, free_info); +} + +/* shutdown vis-server */ +void vis_quit(struct bat_priv *bat_priv) +{ + if (!bat_priv->vis_hash) + return; + + cancel_delayed_work_sync(&bat_priv->vis_work); + + spin_lock_bh(&bat_priv->vis_hash_lock); + /* properly remove, kill timers ... */ + hash_delete(bat_priv->vis_hash, free_info_ref, NULL); + bat_priv->vis_hash = NULL; + bat_priv->my_vis_info = NULL; + spin_unlock_bh(&bat_priv->vis_hash_lock); +} + +/* schedule packets for (re)transmission */ +static void start_vis_timer(struct bat_priv *bat_priv) +{ + INIT_DELAYED_WORK(&bat_priv->vis_work, send_vis_packets); + queue_delayed_work(bat_event_workqueue, &bat_priv->vis_work, + msecs_to_jiffies(VIS_INTERVAL)); +} diff --git a/net/batman-adv/vis.h b/net/batman-adv/vis.h new file mode 100644 index 00000000..31b820d0 --- /dev/null +++ b/net/batman-adv/vis.h @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2008-2011 B.A.T.M.A.N. contributors: + * + * Simon Wunderlich, Marek Lindner + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + * + */ + +#ifndef _NET_BATMAN_ADV_VIS_H_ +#define _NET_BATMAN_ADV_VIS_H_ + +#define VIS_TIMEOUT 200 /* timeout of vis packets in seconds */ + +int vis_seq_print_text(struct seq_file *seq, void *offset); +void receive_server_sync_packet(struct bat_priv *bat_priv, + struct vis_packet *vis_packet, + int vis_info_len); +void receive_client_update_packet(struct bat_priv *bat_priv, + struct vis_packet *vis_packet, + int vis_info_len); +int vis_init(struct bat_priv *bat_priv); +void vis_quit(struct bat_priv *bat_priv); + +#endif /* _NET_BATMAN_ADV_VIS_H_ */ diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig new file mode 100644 index 00000000..bfb3dc03 --- /dev/null +++ b/net/bluetooth/Kconfig @@ -0,0 +1,70 @@ +# +# Bluetooth subsystem configuration +# + +menuconfig BT + tristate "Bluetooth subsystem support" + depends on NET && !S390 + depends on RFKILL || !RFKILL + select CRYPTO + help + Bluetooth is low-cost, low-power, short-range wireless technology. + It was designed as a replacement for cables and other short-range + technologies like IrDA. Bluetooth operates in personal area range + that typically extends up to 10 meters. More information about + Bluetooth can be found at . + + Linux Bluetooth subsystem consist of several layers: + Bluetooth Core (HCI device and connection manager, scheduler) + HCI Device drivers (Interface to the hardware) + SCO Module (SCO audio links) + L2CAP Module (Logical Link Control and Adaptation Protocol) + RFCOMM Module (RFCOMM Protocol) + BNEP Module (Bluetooth Network Encapsulation Protocol) + CMTP Module (CAPI Message Transport Protocol) + HIDP Module (Human Interface Device Protocol) + SMP Module (Security Manager Protocol) + + Say Y here to compile Bluetooth support into the kernel or say M to + compile it as module (bluetooth). + + To use Linux Bluetooth subsystem, you will need several user-space + utilities like hciconfig and bluetoothd. These utilities and updates + to Bluetooth kernel modules are provided in the BlueZ packages. For + more information, see . + +if BT != n + +config BT_L2CAP + bool "L2CAP protocol support" + select CRC16 + select CRYPTO + select CRYPTO_BLKCIPHER + select CRYPTO_AES + select CRYPTO_ECB + help + L2CAP (Logical Link Control and Adaptation Protocol) provides + connection oriented and connection-less data transport. L2CAP + support is required for most Bluetooth applications. + + Also included is support for SMP (Security Manager Protocol) which + is the security layer on top of LE (Low Energy) links. + +config BT_SCO + bool "SCO links support" + help + SCO link provides voice transport over Bluetooth. SCO support is + required for voice applications like Headset and Audio. 
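+# Illustrative fragment (an assumption, not part of the original Kconfig): a
+# headset-capable build using the options above would typically end up with a
+# .config fragment along the lines of
+#	CONFIG_BT=m
+#	CONFIG_BT_L2CAP=y
+#	CONFIG_BT_SCO=y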
+ +endif + +source "net/bluetooth/rfcomm/Kconfig" + +source "net/bluetooth/bnep/Kconfig" + +source "net/bluetooth/cmtp/Kconfig" + +source "net/bluetooth/hidp/Kconfig" + +source "drivers/bluetooth/Kconfig" + diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile new file mode 100644 index 00000000..9b67f3d0 --- /dev/null +++ b/net/bluetooth/Makefile @@ -0,0 +1,13 @@ +# +# Makefile for the Linux Bluetooth subsystem. +# + +obj-$(CONFIG_BT) += bluetooth.o +obj-$(CONFIG_BT_RFCOMM) += rfcomm/ +obj-$(CONFIG_BT_BNEP) += bnep/ +obj-$(CONFIG_BT_CMTP) += cmtp/ +obj-$(CONFIG_BT_HIDP) += hidp/ + +bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o hci_sock.o hci_sysfs.o lib.o +bluetooth-$(CONFIG_BT_L2CAP) += l2cap_core.o l2cap_sock.o smp.o +bluetooth-$(CONFIG_BT_SCO) += sco.o diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c new file mode 100644 index 00000000..7c73a10d --- /dev/null +++ b/net/bluetooth/af_bluetooth.c @@ -0,0 +1,633 @@ +/* + BlueZ - Bluetooth protocol stack for Linux + Copyright (C) 2000-2001 Qualcomm Incorporated + + Written 2000,2001 by Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +/* Bluetooth address family and sockets. */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_ANDROID_PARANOID_NETWORK +#include +#endif + +#ifndef CONFIG_BT_SOCK_DEBUG +#undef BT_DBG +#define BT_DBG(D...) 
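+/* With CONFIG_BT_SOCK_DEBUG unset, BT_DBG() expands to nothing and the
+ * debug statements in this file compile away. */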
+#endif + +#define VERSION "2.16" + +/* Bluetooth sockets */ +#define BT_MAX_PROTO 8 +static const struct net_proto_family *bt_proto[BT_MAX_PROTO]; +static DEFINE_RWLOCK(bt_proto_lock); + +static struct lock_class_key bt_lock_key[BT_MAX_PROTO]; +static const char *const bt_key_strings[BT_MAX_PROTO] = { + "sk_lock-AF_BLUETOOTH-BTPROTO_L2CAP", + "sk_lock-AF_BLUETOOTH-BTPROTO_HCI", + "sk_lock-AF_BLUETOOTH-BTPROTO_SCO", + "sk_lock-AF_BLUETOOTH-BTPROTO_RFCOMM", + "sk_lock-AF_BLUETOOTH-BTPROTO_BNEP", + "sk_lock-AF_BLUETOOTH-BTPROTO_CMTP", + "sk_lock-AF_BLUETOOTH-BTPROTO_HIDP", + "sk_lock-AF_BLUETOOTH-BTPROTO_AVDTP", +}; + +static struct lock_class_key bt_slock_key[BT_MAX_PROTO]; +static const char *const bt_slock_key_strings[BT_MAX_PROTO] = { + "slock-AF_BLUETOOTH-BTPROTO_L2CAP", + "slock-AF_BLUETOOTH-BTPROTO_HCI", + "slock-AF_BLUETOOTH-BTPROTO_SCO", + "slock-AF_BLUETOOTH-BTPROTO_RFCOMM", + "slock-AF_BLUETOOTH-BTPROTO_BNEP", + "slock-AF_BLUETOOTH-BTPROTO_CMTP", + "slock-AF_BLUETOOTH-BTPROTO_HIDP", + "slock-AF_BLUETOOTH-BTPROTO_AVDTP", +}; + +static inline void bt_sock_reclassify_lock(struct socket *sock, int proto) +{ + struct sock *sk = sock->sk; + + if (!sk) + return; + + BUG_ON(sock_owned_by_user(sk)); + + sock_lock_init_class_and_name(sk, + bt_slock_key_strings[proto], &bt_slock_key[proto], + bt_key_strings[proto], &bt_lock_key[proto]); +} + +int bt_sock_register(int proto, const struct net_proto_family *ops) +{ + int err = 0; + + if (proto < 0 || proto >= BT_MAX_PROTO) + return -EINVAL; + + write_lock(&bt_proto_lock); + + if (bt_proto[proto]) + err = -EEXIST; + else + bt_proto[proto] = ops; + + write_unlock(&bt_proto_lock); + + return err; +} +EXPORT_SYMBOL(bt_sock_register); + +int bt_sock_unregister(int proto) +{ + int err = 0; + + if (proto < 0 || proto >= BT_MAX_PROTO) + return -EINVAL; + + write_lock(&bt_proto_lock); + + if (!bt_proto[proto]) + err = -ENOENT; + else + bt_proto[proto] = NULL; + + write_unlock(&bt_proto_lock); + + return err; +} +EXPORT_SYMBOL(bt_sock_unregister); + +#ifdef CONFIG_ANDROID_PARANOID_NETWORK +static inline int current_has_bt_admin(void) +{ + return (!current_euid() || in_egroup_p(AID_NET_BT_ADMIN)); +} + +static inline int current_has_bt(void) +{ + return (current_has_bt_admin() || in_egroup_p(AID_NET_BT)); +} +# else +static inline int current_has_bt_admin(void) +{ + return 1; +} + +static inline int current_has_bt(void) +{ + return 1; +} +#endif + +static int bt_sock_create(struct net *net, struct socket *sock, int proto, + int kern) +{ + int err; + + if (proto == BTPROTO_RFCOMM || proto == BTPROTO_SCO || + proto == BTPROTO_L2CAP) { + if (!current_has_bt()) + return -EPERM; + } else if (!current_has_bt_admin()) + return -EPERM; + + if (net != &init_net) + return -EAFNOSUPPORT; + + if (proto < 0 || proto >= BT_MAX_PROTO) + return -EINVAL; + + if (!bt_proto[proto]) + request_module("bt-proto-%d", proto); + + err = -EPROTONOSUPPORT; + + read_lock(&bt_proto_lock); + + if (bt_proto[proto] && try_module_get(bt_proto[proto]->owner)) { + err = bt_proto[proto]->create(net, sock, proto, kern); + bt_sock_reclassify_lock(sock, proto); + module_put(bt_proto[proto]->owner); + } + + read_unlock(&bt_proto_lock); + + return err; +} + +void bt_sock_link(struct bt_sock_list *l, struct sock *sk) +{ + write_lock_bh(&l->lock); + sk_add_node(sk, &l->head); + write_unlock_bh(&l->lock); +} +EXPORT_SYMBOL(bt_sock_link); + +void bt_sock_unlink(struct bt_sock_list *l, struct sock *sk) +{ + write_lock_bh(&l->lock); + sk_del_node_init(sk); + write_unlock_bh(&l->lock); +} 
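+/* Illustrative sketch (not part of the original file) of how a protocol
+ * module plugs into the bt_proto[] table above; the BNEP module further
+ * down in this patch does exactly this:
+ *
+ *	static const struct net_proto_family bnep_sock_family_ops = {
+ *		.family	= PF_BLUETOOTH,
+ *		.owner	= THIS_MODULE,
+ *		.create	= bnep_sock_create,
+ *	};
+ *
+ *	err = bt_sock_register(BTPROTO_BNEP, &bnep_sock_family_ops);
+ *
+ * bt_sock_create() then routes socket(AF_BLUETOOTH, SOCK_RAW, BTPROTO_BNEP)
+ * to that ->create() hook, auto-loading the module on demand through
+ * request_module("bt-proto-%d").
+ */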
+EXPORT_SYMBOL(bt_sock_unlink); + +void bt_accept_enqueue(struct sock *parent, struct sock *sk) +{ + BT_DBG("parent %p, sk %p", parent, sk); + + sock_hold(sk); + list_add_tail(&bt_sk(sk)->accept_q, &bt_sk(parent)->accept_q); + bt_sk(sk)->parent = parent; + parent->sk_ack_backlog++; +} +EXPORT_SYMBOL(bt_accept_enqueue); + +void bt_accept_unlink(struct sock *sk) +{ + BT_DBG("sk %p state %d", sk, sk->sk_state); + + list_del_init(&bt_sk(sk)->accept_q); + bt_sk(sk)->parent->sk_ack_backlog--; + bt_sk(sk)->parent = NULL; + sock_put(sk); +} +EXPORT_SYMBOL(bt_accept_unlink); + +struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock) +{ + struct list_head *p, *n; + struct sock *sk; + + BT_DBG("parent %p", parent); + + local_bh_disable(); + list_for_each_safe(p, n, &bt_sk(parent)->accept_q) { + sk = (struct sock *) list_entry(p, struct bt_sock, accept_q); + + bh_lock_sock(sk); + + /* FIXME: Is this check still needed */ + if (sk->sk_state == BT_CLOSED) { + bh_unlock_sock(sk); + bt_accept_unlink(sk); + continue; + } + + if (sk->sk_state == BT_CONNECTED || !newsock || + bt_sk(parent)->defer_setup) { + bt_accept_unlink(sk); + if (newsock) + sock_graft(sk, newsock); + + bh_unlock_sock(sk); + local_bh_enable(); + return sk; + } + + bh_unlock_sock(sk); + } + local_bh_enable(); + + return NULL; +} +EXPORT_SYMBOL(bt_accept_dequeue); + +int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len, int flags) +{ + int noblock = flags & MSG_DONTWAIT; + struct sock *sk = sock->sk; + struct sk_buff *skb; + size_t copied; + int err; + + BT_DBG("sock %p sk %p len %zu", sock, sk, len); + + if (flags & (MSG_OOB)) + return -EOPNOTSUPP; + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) { + if (sk->sk_shutdown & RCV_SHUTDOWN) + return 0; + return err; + } + + msg->msg_namelen = 0; + + copied = skb->len; + if (len < copied) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + + skb_reset_transport_header(skb); + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err == 0) + sock_recv_ts_and_drops(msg, sk, skb); + + skb_free_datagram(sk, skb); + + return err ? 
: copied; +} +EXPORT_SYMBOL(bt_sock_recvmsg); + +static long bt_sock_data_wait(struct sock *sk, long timeo) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(sk_sleep(sk), &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + if (!skb_queue_empty(&sk->sk_receive_queue)) + break; + + if (sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN)) + break; + + if (signal_pending(current) || !timeo) + break; + + set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + release_sock(sk); + timeo = schedule_timeout(timeo); + lock_sock(sk); + clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + } + + __set_current_state(TASK_RUNNING); + remove_wait_queue(sk_sleep(sk), &wait); + return timeo; +} + +int bt_sock_stream_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + int err = 0; + size_t target, copied = 0; + long timeo; + + if (flags & MSG_OOB) + return -EOPNOTSUPP; + + msg->msg_namelen = 0; + + BT_DBG("sk %p size %zu", sk, size); + + lock_sock(sk); + + target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + do { + struct sk_buff *skb; + int chunk; + + skb = skb_dequeue(&sk->sk_receive_queue); + if (!skb) { + if (copied >= target) + break; + + err = sock_error(sk); + if (err) + break; + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + + err = -EAGAIN; + if (!timeo) + break; + + timeo = bt_sock_data_wait(sk, timeo); + + if (signal_pending(current)) { + err = sock_intr_errno(timeo); + goto out; + } + continue; + } + + chunk = min_t(unsigned int, skb->len, size); + if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) { + skb_queue_head(&sk->sk_receive_queue, skb); + if (!copied) + copied = -EFAULT; + break; + } + copied += chunk; + size -= chunk; + + sock_recv_ts_and_drops(msg, sk, skb); + + if (!(flags & MSG_PEEK)) { + skb_pull(skb, chunk); + if (skb->len) { + skb_queue_head(&sk->sk_receive_queue, skb); + break; + } + kfree_skb(skb); + + } else { + /* put message back and return */ + skb_queue_head(&sk->sk_receive_queue, skb); + break; + } + } while (size); + +out: + release_sock(sk); + return copied ? 
: err; +} +EXPORT_SYMBOL(bt_sock_stream_recvmsg); + +static inline unsigned int bt_accept_poll(struct sock *parent) +{ + struct list_head *p, *n; + struct sock *sk; + + list_for_each_safe(p, n, &bt_sk(parent)->accept_q) { + sk = (struct sock *) list_entry(p, struct bt_sock, accept_q); + if (sk->sk_state == BT_CONNECTED || + (bt_sk(parent)->defer_setup && + sk->sk_state == BT_CONNECT2)) + return POLLIN | POLLRDNORM; + } + + return 0; +} + +unsigned int bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait) +{ + struct sock *sk = sock->sk; + unsigned int mask = 0; + + BT_DBG("sock %p, sk %p", sock, sk); + + poll_wait(file, sk_sleep(sk), wait); + + if (sk->sk_state == BT_LISTEN) + return bt_accept_poll(sk); + + if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + mask |= POLLERR; + + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= POLLRDHUP | POLLIN | POLLRDNORM; + + if (sk->sk_shutdown == SHUTDOWN_MASK) + mask |= POLLHUP; + + if (!skb_queue_empty(&sk->sk_receive_queue)) + mask |= POLLIN | POLLRDNORM; + + if (sk->sk_state == BT_CLOSED) + mask |= POLLHUP; + + if (sk->sk_state == BT_CONNECT || + sk->sk_state == BT_CONNECT2 || + sk->sk_state == BT_CONFIG) + return mask; + + if (sock_writeable(sk)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + else + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + return mask; +} +EXPORT_SYMBOL(bt_sock_poll); + +int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + long amount; + int err; + + BT_DBG("sk %p cmd %x arg %lx", sk, cmd, arg); + + switch (cmd) { + case TIOCOUTQ: + if (sk->sk_state == BT_LISTEN) + return -EINVAL; + + amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); + if (amount < 0) + amount = 0; + err = put_user(amount, (int __user *) arg); + break; + + case TIOCINQ: + if (sk->sk_state == BT_LISTEN) + return -EINVAL; + + lock_sock(sk); + skb = skb_peek(&sk->sk_receive_queue); + amount = skb ? 
skb->len : 0; + release_sock(sk); + err = put_user(amount, (int __user *) arg); + break; + + case SIOCGSTAMP: + err = sock_get_timestamp(sk, (struct timeval __user *) arg); + break; + + case SIOCGSTAMPNS: + err = sock_get_timestampns(sk, (struct timespec __user *) arg); + break; + + default: + err = -ENOIOCTLCMD; + break; + } + + return err; +} +EXPORT_SYMBOL(bt_sock_ioctl); + +int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo) +{ + DECLARE_WAITQUEUE(wait, current); + int err = 0; + + BT_DBG("sk %p", sk); + + add_wait_queue(sk_sleep(sk), &wait); + set_current_state(TASK_INTERRUPTIBLE); + while (sk->sk_state != state) { + if (!timeo) { + err = -EINPROGRESS; + break; + } + + if (signal_pending(current)) { + err = sock_intr_errno(timeo); + break; + } + + release_sock(sk); + timeo = schedule_timeout(timeo); + lock_sock(sk); + set_current_state(TASK_INTERRUPTIBLE); + + err = sock_error(sk); + if (err) + break; + } + __set_current_state(TASK_RUNNING); + remove_wait_queue(sk_sleep(sk), &wait); + return err; +} +EXPORT_SYMBOL(bt_sock_wait_state); + +static struct net_proto_family bt_sock_family_ops = { + .owner = THIS_MODULE, + .family = PF_BLUETOOTH, + .create = bt_sock_create, +}; + +static int __init bt_init(void) +{ + int err; + + BT_INFO("Core ver %s", VERSION); + + err = bt_sysfs_init(); + if (err < 0) + return err; + + err = sock_register(&bt_sock_family_ops); + if (err < 0) { + bt_sysfs_cleanup(); + return err; + } + + BT_INFO("HCI device and connection manager initialized"); + + err = hci_sock_init(); + if (err < 0) + goto error; + + err = l2cap_init(); + if (err < 0) + goto sock_err; + + err = sco_init(); + if (err < 0) { + l2cap_exit(); + goto sock_err; + } + + return 0; + +sock_err: + hci_sock_cleanup(); + +error: + sock_unregister(PF_BLUETOOTH); + bt_sysfs_cleanup(); + + return err; +} + +static void __exit bt_exit(void) +{ + + sco_exit(); + + l2cap_exit(); + + hci_sock_cleanup(); + + sock_unregister(PF_BLUETOOTH); + + bt_sysfs_cleanup(); +} + +subsys_initcall(bt_init); +module_exit(bt_exit); + +MODULE_AUTHOR("Marcel Holtmann "); +MODULE_DESCRIPTION("Bluetooth Core ver " VERSION); +MODULE_VERSION(VERSION); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_BLUETOOTH); diff --git a/net/bluetooth/bnep/Kconfig b/net/bluetooth/bnep/Kconfig new file mode 100644 index 00000000..35158b03 --- /dev/null +++ b/net/bluetooth/bnep/Kconfig @@ -0,0 +1,24 @@ +config BT_BNEP + tristate "BNEP protocol support" + depends on BT && BT_L2CAP + select CRC32 + help + BNEP (Bluetooth Network Encapsulation Protocol) is Ethernet + emulation layer on top of Bluetooth. BNEP is required for + Bluetooth PAN (Personal Area Network). + + Say Y here to compile BNEP support into the kernel or say M to + compile it as module (bnep). + +config BT_BNEP_MC_FILTER + bool "Multicast filter support" + depends on BT_BNEP + help + This option enables the multicast filter support for BNEP. + +config BT_BNEP_PROTO_FILTER + bool "Protocol filter support" + depends on BT_BNEP + help + This option enables the protocol filter support for BNEP. + diff --git a/net/bluetooth/bnep/Makefile b/net/bluetooth/bnep/Makefile new file mode 100644 index 00000000..c7821e76 --- /dev/null +++ b/net/bluetooth/bnep/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux Bluetooth BNEP layer. 
+# + +obj-$(CONFIG_BT_BNEP) += bnep.o + +bnep-objs := core.o sock.o netdev.o diff --git a/net/bluetooth/bnep/bnep.h b/net/bluetooth/bnep/bnep.h new file mode 100644 index 00000000..e7ee5314 --- /dev/null +++ b/net/bluetooth/bnep/bnep.h @@ -0,0 +1,180 @@ +/* + BNEP protocol definition for Linux Bluetooth stack (BlueZ). + Copyright (C) 2002 Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#ifndef _BNEP_H +#define _BNEP_H + +#include +#include +#include + +/* Limits */ +#define BNEP_MAX_PROTO_FILTERS 5 +#define BNEP_MAX_MULTICAST_FILTERS 20 + +/* UUIDs */ +#define BNEP_BASE_UUID 0x0000000000001000800000805F9B34FB +#define BNEP_UUID16 0x02 +#define BNEP_UUID32 0x04 +#define BNEP_UUID128 0x16 + +#define BNEP_SVC_PANU 0x1115 +#define BNEP_SVC_NAP 0x1116 +#define BNEP_SVC_GN 0x1117 + +/* Packet types */ +#define BNEP_GENERAL 0x00 +#define BNEP_CONTROL 0x01 +#define BNEP_COMPRESSED 0x02 +#define BNEP_COMPRESSED_SRC_ONLY 0x03 +#define BNEP_COMPRESSED_DST_ONLY 0x04 + +/* Control types */ +#define BNEP_CMD_NOT_UNDERSTOOD 0x00 +#define BNEP_SETUP_CONN_REQ 0x01 +#define BNEP_SETUP_CONN_RSP 0x02 +#define BNEP_FILTER_NET_TYPE_SET 0x03 +#define BNEP_FILTER_NET_TYPE_RSP 0x04 +#define BNEP_FILTER_MULTI_ADDR_SET 0x05 +#define BNEP_FILTER_MULTI_ADDR_RSP 0x06 + +/* Extension types */ +#define BNEP_EXT_CONTROL 0x00 + +/* Response messages */ +#define BNEP_SUCCESS 0x00 + +#define BNEP_CONN_INVALID_DST 0x01 +#define BNEP_CONN_INVALID_SRC 0x02 +#define BNEP_CONN_INVALID_SVC 0x03 +#define BNEP_CONN_NOT_ALLOWED 0x04 + +#define BNEP_FILTER_UNSUPPORTED_REQ 0x01 +#define BNEP_FILTER_INVALID_RANGE 0x02 +#define BNEP_FILTER_INVALID_MCADDR 0x02 +#define BNEP_FILTER_LIMIT_REACHED 0x03 +#define BNEP_FILTER_DENIED_SECURITY 0x04 + +/* L2CAP settings */ +#define BNEP_MTU 1691 +#define BNEP_PSM 0x0f +#define BNEP_FLUSH_TO 0xffff +#define BNEP_CONNECT_TO 15 +#define BNEP_FILTER_TO 15 + +/* Headers */ +#define BNEP_TYPE_MASK 0x7f +#define BNEP_EXT_HEADER 0x80 + +struct bnep_setup_conn_req { + __u8 type; + __u8 ctrl; + __u8 uuid_size; + __u8 service[0]; +} __packed; + +struct bnep_set_filter_req { + __u8 type; + __u8 ctrl; + __be16 len; + __u8 list[0]; +} __packed; + +struct bnep_control_rsp { + __u8 type; + __u8 ctrl; + __be16 resp; +} __packed; + +struct bnep_ext_hdr { + __u8 type; + __u8 len; + __u8 data[0]; +} __packed; + +/* BNEP ioctl defines */ +#define BNEPCONNADD _IOW('B', 200, int) +#define BNEPCONNDEL _IOW('B', 201, int) +#define BNEPGETCONNLIST _IOR('B', 210, int) +#define BNEPGETCONNINFO _IOR('B', 211, int) + +struct bnep_connadd_req { + int sock; /* Connected socket */ + __u32 flags; + __u16 role; + char device[16]; /* Name of the Ethernet device */ +}; + +struct bnep_conndel_req { + __u32 flags; + __u8 dst[ETH_ALEN]; +}; + +struct bnep_conninfo { + __u32 flags; + __u16 role; + __u16 state; + __u8 dst[ETH_ALEN]; + char device[16]; +}; + +struct bnep_connlist_req { + __u32 cnum; + struct bnep_conninfo __user *ci; +}; 
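+/* Illustrative user-space sketch (an assumption, not part of this header):
+ * how the BNEPCONNADD ioctl defined above is typically driven. l2cap_fd is
+ * assumed to be an already-connected L2CAP socket on PSM 0x0f (BNEP_PSM);
+ * error handling is omitted.
+ *
+ *	struct bnep_connadd_req req = {
+ *		.sock = l2cap_fd,
+ *		.role = BNEP_SVC_PANU,
+ *	};
+ *	int ctl = socket(AF_BLUETOOTH, SOCK_RAW, BTPROTO_BNEP);
+ *
+ *	if (!ioctl(ctl, BNEPCONNADD, &req))
+ *		printf("new interface %s\n", req.device);
+ *
+ * On success the kernel fills req.device with the name of the freshly
+ * registered network device (normally "bnep%d").
+ */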
+ +struct bnep_proto_filter { + __u16 start; + __u16 end; +}; + +int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock); +int bnep_del_connection(struct bnep_conndel_req *req); +int bnep_get_connlist(struct bnep_connlist_req *req); +int bnep_get_conninfo(struct bnep_conninfo *ci); + +/* BNEP sessions */ +struct bnep_session { + struct list_head list; + + unsigned int role; + unsigned long state; + unsigned long flags; + atomic_t terminate; + struct task_struct *task; + + struct ethhdr eh; + struct msghdr msg; + + struct bnep_proto_filter proto_filter[BNEP_MAX_PROTO_FILTERS]; + unsigned long long mc_filter; + + struct socket *sock; + struct net_device *dev; +}; + +void bnep_net_setup(struct net_device *dev); +int bnep_sock_init(void); +void bnep_sock_cleanup(void); + +static inline int bnep_mc_hash(__u8 *addr) +{ + return crc32_be(~0, addr, ETH_ALEN) >> 26; +} + +#endif diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c new file mode 100644 index 00000000..d9edfe8b --- /dev/null +++ b/net/bluetooth/bnep/core.c @@ -0,0 +1,751 @@ +/* + BNEP implementation for Linux Bluetooth stack (BlueZ). + Copyright (C) 2001-2002 Inventel Systemes + Written 2001-2002 by + Clément Moreau + David Libault + + Copyright (C) 2002 Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include + +#include "bnep.h" + +#define VERSION "1.3" + +static int compress_src = 1; +static int compress_dst = 1; + +static LIST_HEAD(bnep_session_list); +static DECLARE_RWSEM(bnep_session_sem); + +static struct bnep_session *__bnep_get_session(u8 *dst) +{ + struct bnep_session *s; + struct list_head *p; + + BT_DBG(""); + + list_for_each(p, &bnep_session_list) { + s = list_entry(p, struct bnep_session, list); + if (!compare_ether_addr(dst, s->eh.h_source)) + return s; + } + return NULL; +} + +static void __bnep_link_session(struct bnep_session *s) +{ + /* It's safe to call __module_get() here because sessions are added + by the socket layer which has to hold the reference to this module. 
+ */ + __module_get(THIS_MODULE); + list_add(&s->list, &bnep_session_list); +} + +static void __bnep_unlink_session(struct bnep_session *s) +{ + list_del(&s->list); + module_put(THIS_MODULE); +} + +static int bnep_send(struct bnep_session *s, void *data, size_t len) +{ + struct socket *sock = s->sock; + struct kvec iv = { data, len }; + + return kernel_sendmsg(sock, &s->msg, &iv, 1, len); +} + +static int bnep_send_rsp(struct bnep_session *s, u8 ctrl, u16 resp) +{ + struct bnep_control_rsp rsp; + rsp.type = BNEP_CONTROL; + rsp.ctrl = ctrl; + rsp.resp = htons(resp); + return bnep_send(s, &rsp, sizeof(rsp)); +} + +#ifdef CONFIG_BT_BNEP_PROTO_FILTER +static inline void bnep_set_default_proto_filter(struct bnep_session *s) +{ + /* (IPv4, ARP) */ + s->proto_filter[0].start = ETH_P_IP; + s->proto_filter[0].end = ETH_P_ARP; + /* (RARP, AppleTalk) */ + s->proto_filter[1].start = ETH_P_RARP; + s->proto_filter[1].end = ETH_P_AARP; + /* (IPX, IPv6) */ + s->proto_filter[2].start = ETH_P_IPX; + s->proto_filter[2].end = ETH_P_IPV6; +} +#endif + +static int bnep_ctrl_set_netfilter(struct bnep_session *s, __be16 *data, int len) +{ + int n; + + if (len < 2) + return -EILSEQ; + + n = get_unaligned_be16(data); + data++; + len -= 2; + + if (len < n) + return -EILSEQ; + + BT_DBG("filter len %d", n); + +#ifdef CONFIG_BT_BNEP_PROTO_FILTER + n /= 4; + if (n <= BNEP_MAX_PROTO_FILTERS) { + struct bnep_proto_filter *f = s->proto_filter; + int i; + + for (i = 0; i < n; i++) { + f[i].start = get_unaligned_be16(data++); + f[i].end = get_unaligned_be16(data++); + + BT_DBG("proto filter start %d end %d", + f[i].start, f[i].end); + } + + if (i < BNEP_MAX_PROTO_FILTERS) + memset(f + i, 0, sizeof(*f)); + + if (n == 0) + bnep_set_default_proto_filter(s); + + bnep_send_rsp(s, BNEP_FILTER_NET_TYPE_RSP, BNEP_SUCCESS); + } else { + bnep_send_rsp(s, BNEP_FILTER_NET_TYPE_RSP, BNEP_FILTER_LIMIT_REACHED); + } +#else + bnep_send_rsp(s, BNEP_FILTER_NET_TYPE_RSP, BNEP_FILTER_UNSUPPORTED_REQ); +#endif + return 0; +} + +static int bnep_ctrl_set_mcfilter(struct bnep_session *s, u8 *data, int len) +{ + int n; + + if (len < 2) + return -EILSEQ; + + n = get_unaligned_be16(data); + data += 2; + len -= 2; + + if (len < n) + return -EILSEQ; + + BT_DBG("filter len %d", n); + +#ifdef CONFIG_BT_BNEP_MC_FILTER + n /= (ETH_ALEN * 2); + + if (n > 0) { + int i; + + s->mc_filter = 0; + + /* Always send broadcast */ + set_bit(bnep_mc_hash(s->dev->broadcast), (ulong *) &s->mc_filter); + + /* Add address ranges to the multicast hash */ + for (; n > 0; n--) { + u8 a1[6], *a2; + + memcpy(a1, data, ETH_ALEN); + data += ETH_ALEN; + a2 = data; + data += ETH_ALEN; + + BT_DBG("mc filter %s -> %s", + batostr((void *) a1), batostr((void *) a2)); + + /* Iterate from a1 to a2 */ + set_bit(bnep_mc_hash(a1), (ulong *) &s->mc_filter); + while (memcmp(a1, a2, 6) < 0 && s->mc_filter != ~0LL) { + /* Increment a1 */ + i = 5; + while (i >= 0 && ++a1[i--] == 0) + ; + + set_bit(bnep_mc_hash(a1), (ulong *) &s->mc_filter); + } + } + } + + BT_DBG("mc filter hash 0x%llx", s->mc_filter); + + bnep_send_rsp(s, BNEP_FILTER_MULTI_ADDR_RSP, BNEP_SUCCESS); +#else + bnep_send_rsp(s, BNEP_FILTER_MULTI_ADDR_RSP, BNEP_FILTER_UNSUPPORTED_REQ); +#endif + return 0; +} + +static int bnep_rx_control(struct bnep_session *s, void *data, int len) +{ + u8 cmd = *(u8 *)data; + int err = 0; + + data++; + len--; + + switch (cmd) { + case BNEP_CMD_NOT_UNDERSTOOD: + case BNEP_SETUP_CONN_RSP: + case BNEP_FILTER_NET_TYPE_RSP: + case BNEP_FILTER_MULTI_ADDR_RSP: + /* Ignore these for now */ + break; + + case 
BNEP_FILTER_NET_TYPE_SET: + err = bnep_ctrl_set_netfilter(s, data, len); + break; + + case BNEP_FILTER_MULTI_ADDR_SET: + err = bnep_ctrl_set_mcfilter(s, data, len); + break; + + case BNEP_SETUP_CONN_REQ: + err = bnep_send_rsp(s, BNEP_SETUP_CONN_RSP, BNEP_CONN_NOT_ALLOWED); + break; + + default: { + u8 pkt[3]; + pkt[0] = BNEP_CONTROL; + pkt[1] = BNEP_CMD_NOT_UNDERSTOOD; + pkt[2] = cmd; + bnep_send(s, pkt, sizeof(pkt)); + } + break; + } + + return err; +} + +static int bnep_rx_extension(struct bnep_session *s, struct sk_buff *skb) +{ + struct bnep_ext_hdr *h; + int err = 0; + + do { + h = (void *) skb->data; + if (!skb_pull(skb, sizeof(*h))) { + err = -EILSEQ; + break; + } + + BT_DBG("type 0x%x len %d", h->type, h->len); + + switch (h->type & BNEP_TYPE_MASK) { + case BNEP_EXT_CONTROL: + bnep_rx_control(s, skb->data, skb->len); + break; + + default: + /* Unknown extension, skip it. */ + break; + } + + if (!skb_pull(skb, h->len)) { + err = -EILSEQ; + break; + } + } while (!err && (h->type & BNEP_EXT_HEADER)); + + return err; +} + +static u8 __bnep_rx_hlen[] = { + ETH_HLEN, /* BNEP_GENERAL */ + 0, /* BNEP_CONTROL */ + 2, /* BNEP_COMPRESSED */ + ETH_ALEN + 2, /* BNEP_COMPRESSED_SRC_ONLY */ + ETH_ALEN + 2 /* BNEP_COMPRESSED_DST_ONLY */ +}; + +static inline int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb) +{ + struct net_device *dev = s->dev; + struct sk_buff *nskb; + u8 type; + + dev->stats.rx_bytes += skb->len; + + type = *(u8 *) skb->data; + skb_pull(skb, 1); + + if ((type & BNEP_TYPE_MASK) >= sizeof(__bnep_rx_hlen)) + goto badframe; + + if ((type & BNEP_TYPE_MASK) == BNEP_CONTROL) { + bnep_rx_control(s, skb->data, skb->len); + kfree_skb(skb); + return 0; + } + + skb_reset_mac_header(skb); + + /* Verify and pull out header */ + if (!skb_pull(skb, __bnep_rx_hlen[type & BNEP_TYPE_MASK])) + goto badframe; + + s->eh.h_proto = get_unaligned((__be16 *) (skb->data - 2)); + + if (type & BNEP_EXT_HEADER) { + if (bnep_rx_extension(s, skb) < 0) + goto badframe; + } + + /* Strip 802.1p header */ + if (ntohs(s->eh.h_proto) == 0x8100) { + if (!skb_pull(skb, 4)) + goto badframe; + s->eh.h_proto = get_unaligned((__be16 *) (skb->data - 2)); + } + + /* We have to alloc new skb and copy data here :(. Because original skb + * may not be modified and because of the alignment requirements. 
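+	 * (the 2-byte skb_reserve() below keeps the IP header 4-byte aligned
+	 * behind the 14-byte ethernet header that is rebuilt here)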
*/ + nskb = alloc_skb(2 + ETH_HLEN + skb->len, GFP_KERNEL); + if (!nskb) { + dev->stats.rx_dropped++; + kfree_skb(skb); + return -ENOMEM; + } + skb_reserve(nskb, 2); + + /* Decompress header and construct ether frame */ + switch (type & BNEP_TYPE_MASK) { + case BNEP_COMPRESSED: + memcpy(__skb_put(nskb, ETH_HLEN), &s->eh, ETH_HLEN); + break; + + case BNEP_COMPRESSED_SRC_ONLY: + memcpy(__skb_put(nskb, ETH_ALEN), s->eh.h_dest, ETH_ALEN); + memcpy(__skb_put(nskb, ETH_ALEN), skb_mac_header(skb), ETH_ALEN); + put_unaligned(s->eh.h_proto, (__be16 *) __skb_put(nskb, 2)); + break; + + case BNEP_COMPRESSED_DST_ONLY: + memcpy(__skb_put(nskb, ETH_ALEN), skb_mac_header(skb), + ETH_ALEN); + memcpy(__skb_put(nskb, ETH_ALEN + 2), s->eh.h_source, + ETH_ALEN + 2); + break; + + case BNEP_GENERAL: + memcpy(__skb_put(nskb, ETH_ALEN * 2), skb_mac_header(skb), + ETH_ALEN * 2); + put_unaligned(s->eh.h_proto, (__be16 *) __skb_put(nskb, 2)); + break; + } + + skb_copy_from_linear_data(skb, __skb_put(nskb, skb->len), skb->len); + kfree_skb(skb); + + dev->stats.rx_packets++; + nskb->ip_summed = CHECKSUM_NONE; + nskb->protocol = eth_type_trans(nskb, dev); + netif_rx_ni(nskb); + return 0; + +badframe: + dev->stats.rx_errors++; + kfree_skb(skb); + return 0; +} + +static u8 __bnep_tx_types[] = { + BNEP_GENERAL, + BNEP_COMPRESSED_SRC_ONLY, + BNEP_COMPRESSED_DST_ONLY, + BNEP_COMPRESSED +}; + +static inline int bnep_tx_frame(struct bnep_session *s, struct sk_buff *skb) +{ + struct ethhdr *eh = (void *) skb->data; + struct socket *sock = s->sock; + struct kvec iv[3]; + int len = 0, il = 0; + u8 type = 0; + + BT_DBG("skb %p dev %p type %d", skb, skb->dev, skb->pkt_type); + + if (!skb->dev) { + /* Control frame sent by us */ + goto send; + } + + iv[il++] = (struct kvec) { &type, 1 }; + len++; + + if (compress_src && !compare_ether_addr(eh->h_dest, s->eh.h_source)) + type |= 0x01; + + if (compress_dst && !compare_ether_addr(eh->h_source, s->eh.h_dest)) + type |= 0x02; + + if (type) + skb_pull(skb, ETH_ALEN * 2); + + type = __bnep_tx_types[type]; + switch (type) { + case BNEP_COMPRESSED_SRC_ONLY: + iv[il++] = (struct kvec) { eh->h_source, ETH_ALEN }; + len += ETH_ALEN; + break; + + case BNEP_COMPRESSED_DST_ONLY: + iv[il++] = (struct kvec) { eh->h_dest, ETH_ALEN }; + len += ETH_ALEN; + break; + } + +send: + iv[il++] = (struct kvec) { skb->data, skb->len }; + len += skb->len; + + /* FIXME: linearize skb */ + { + len = kernel_sendmsg(sock, &s->msg, iv, il, len); + } + kfree_skb(skb); + + if (len > 0) { + s->dev->stats.tx_bytes += len; + s->dev->stats.tx_packets++; + return 0; + } + + return len; +} + +static int bnep_session(void *arg) +{ + struct bnep_session *s = arg; + struct net_device *dev = s->dev; + struct sock *sk = s->sock->sk; + struct sk_buff *skb; + wait_queue_t wait; + + BT_DBG(""); + + set_user_nice(current, -15); + + init_waitqueue_entry(&wait, current); + add_wait_queue(sk_sleep(sk), &wait); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + if (atomic_read(&s->terminate)) + break; + /* RX */ + while ((skb = skb_dequeue(&sk->sk_receive_queue))) { + skb_orphan(skb); + bnep_rx_frame(s, skb); + } + + if (sk->sk_state != BT_CONNECTED) + break; + + /* TX */ + while ((skb = skb_dequeue(&sk->sk_write_queue))) + if (bnep_tx_frame(s, skb)) + break; + netif_wake_queue(dev); + + schedule(); + } + __set_current_state(TASK_RUNNING); + remove_wait_queue(sk_sleep(sk), &wait); + + /* Cleanup session */ + down_write(&bnep_session_sem); + + /* Delete network device */ + unregister_netdev(dev); + + /* Wakeup user-space polling 
for socket errors */ + s->sock->sk->sk_err = EUNATCH; + + wake_up_interruptible(sk_sleep(s->sock->sk)); + + /* Release the socket */ + fput(s->sock->file); + + __bnep_unlink_session(s); + + up_write(&bnep_session_sem); + free_netdev(dev); + return 0; +} + +static struct device *bnep_get_device(struct bnep_session *session) +{ + bdaddr_t *src = &bt_sk(session->sock->sk)->src; + bdaddr_t *dst = &bt_sk(session->sock->sk)->dst; + struct hci_dev *hdev; + struct hci_conn *conn; + + hdev = hci_get_route(dst, src); + if (!hdev) + return NULL; + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst); + + hci_dev_put(hdev); + + return conn ? &conn->dev : NULL; +} + +static struct device_type bnep_type = { + .name = "bluetooth", +}; + +int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock) +{ + struct net_device *dev; + struct bnep_session *s, *ss; + u8 dst[ETH_ALEN], src[ETH_ALEN]; + int err; + + BT_DBG(""); + + baswap((void *) dst, &bt_sk(sock->sk)->dst); + baswap((void *) src, &bt_sk(sock->sk)->src); + + /* session struct allocated as private part of net_device */ + dev = alloc_netdev(sizeof(struct bnep_session), + (*req->device) ? req->device : "bnep%d", + bnep_net_setup); + if (!dev) + return -ENOMEM; + + down_write(&bnep_session_sem); + + ss = __bnep_get_session(dst); + if (ss && ss->state == BT_CONNECTED) { + err = -EEXIST; + goto failed; + } + + s = netdev_priv(dev); + + /* This is rx header therefore addresses are swapped. + * ie. eh.h_dest is our local address. */ + memcpy(s->eh.h_dest, &src, ETH_ALEN); + memcpy(s->eh.h_source, &dst, ETH_ALEN); + memcpy(dev->dev_addr, s->eh.h_dest, ETH_ALEN); + + s->dev = dev; + s->sock = sock; + s->role = req->role; + s->state = BT_CONNECTED; + + s->msg.msg_flags = MSG_NOSIGNAL; + +#ifdef CONFIG_BT_BNEP_MC_FILTER + /* Set default mc filter */ + set_bit(bnep_mc_hash(dev->broadcast), (ulong *) &s->mc_filter); +#endif + +#ifdef CONFIG_BT_BNEP_PROTO_FILTER + /* Set default protocol filter */ + bnep_set_default_proto_filter(s); +#endif + + SET_NETDEV_DEV(dev, bnep_get_device(s)); + SET_NETDEV_DEVTYPE(dev, &bnep_type); + + err = register_netdev(dev); + if (err) + goto failed; + + __bnep_link_session(s); + + s->task = kthread_run(bnep_session, s, "kbnepd %s", dev->name); + if (IS_ERR(s->task)) { + /* Session thread start failed, gotta cleanup. 
*/ + unregister_netdev(dev); + __bnep_unlink_session(s); + err = PTR_ERR(s->task); + goto failed; + } + + up_write(&bnep_session_sem); + strcpy(req->device, dev->name); + return 0; + +failed: + up_write(&bnep_session_sem); + free_netdev(dev); + return err; +} + +int bnep_del_connection(struct bnep_conndel_req *req) +{ + struct bnep_session *s; + int err = 0; + + BT_DBG(""); + + down_read(&bnep_session_sem); + + s = __bnep_get_session(req->dst); + if (s) { + atomic_inc(&s->terminate); + wake_up_process(s->task); + } else + err = -ENOENT; + + up_read(&bnep_session_sem); + return err; +} + +static void __bnep_copy_ci(struct bnep_conninfo *ci, struct bnep_session *s) +{ + memset(ci, 0, sizeof(*ci)); + memcpy(ci->dst, s->eh.h_source, ETH_ALEN); + strcpy(ci->device, s->dev->name); + ci->flags = s->flags; + ci->state = s->state; + ci->role = s->role; +} + +int bnep_get_connlist(struct bnep_connlist_req *req) +{ + struct list_head *p; + int err = 0, n = 0; + + down_read(&bnep_session_sem); + + list_for_each(p, &bnep_session_list) { + struct bnep_session *s; + struct bnep_conninfo ci; + + s = list_entry(p, struct bnep_session, list); + + __bnep_copy_ci(&ci, s); + + if (copy_to_user(req->ci, &ci, sizeof(ci))) { + err = -EFAULT; + break; + } + + if (++n >= req->cnum) + break; + + req->ci++; + } + req->cnum = n; + + up_read(&bnep_session_sem); + return err; +} + +int bnep_get_conninfo(struct bnep_conninfo *ci) +{ + struct bnep_session *s; + int err = 0; + + down_read(&bnep_session_sem); + + s = __bnep_get_session(ci->dst); + if (s) + __bnep_copy_ci(ci, s); + else + err = -ENOENT; + + up_read(&bnep_session_sem); + return err; +} + +static int __init bnep_init(void) +{ + char flt[50] = ""; + +#ifdef CONFIG_BT_BNEP_PROTO_FILTER + strcat(flt, "protocol "); +#endif + +#ifdef CONFIG_BT_BNEP_MC_FILTER + strcat(flt, "multicast"); +#endif + + BT_INFO("BNEP (Ethernet Emulation) ver %s", VERSION); + if (flt[0]) + BT_INFO("BNEP filters: %s", flt); + + bnep_sock_init(); + return 0; +} + +static void __exit bnep_exit(void) +{ + bnep_sock_cleanup(); +} + +module_init(bnep_init); +module_exit(bnep_exit); + +module_param(compress_src, bool, 0644); +MODULE_PARM_DESC(compress_src, "Compress sources headers"); + +module_param(compress_dst, bool, 0644); +MODULE_PARM_DESC(compress_dst, "Compress destination headers"); + +MODULE_AUTHOR("Marcel Holtmann "); +MODULE_DESCRIPTION("Bluetooth BNEP ver " VERSION); +MODULE_VERSION(VERSION); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("bt-proto-4"); diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c new file mode 100644 index 00000000..d4f5dff7 --- /dev/null +++ b/net/bluetooth/bnep/netdev.c @@ -0,0 +1,238 @@ +/* + BNEP implementation for Linux Bluetooth stack (BlueZ). + Copyright (C) 2001-2002 Inventel Systemes + Written 2001-2002 by + Clément Moreau + David Libault + + Copyright (C) 2002 Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. 
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "bnep.h" + +#define BNEP_TX_QUEUE_LEN 20 + +static int bnep_net_open(struct net_device *dev) +{ + netif_start_queue(dev); + return 0; +} + +static int bnep_net_close(struct net_device *dev) +{ + netif_stop_queue(dev); + return 0; +} + +static void bnep_net_set_mc_list(struct net_device *dev) +{ +#ifdef CONFIG_BT_BNEP_MC_FILTER + struct bnep_session *s = netdev_priv(dev); + struct sock *sk = s->sock->sk; + struct bnep_set_filter_req *r; + struct sk_buff *skb; + int size; + + BT_DBG("%s mc_count %d", dev->name, netdev_mc_count(dev)); + + size = sizeof(*r) + (BNEP_MAX_MULTICAST_FILTERS + 1) * ETH_ALEN * 2; + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) { + BT_ERR("%s Multicast list allocation failed", dev->name); + return; + } + + r = (void *) skb->data; + __skb_put(skb, sizeof(*r)); + + r->type = BNEP_CONTROL; + r->ctrl = BNEP_FILTER_MULTI_ADDR_SET; + + if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) { + u8 start[ETH_ALEN] = { 0x01 }; + + /* Request all addresses */ + memcpy(__skb_put(skb, ETH_ALEN), start, ETH_ALEN); + memcpy(__skb_put(skb, ETH_ALEN), dev->broadcast, ETH_ALEN); + r->len = htons(ETH_ALEN * 2); + } else { + struct netdev_hw_addr *ha; + int i, len = skb->len; + + if (dev->flags & IFF_BROADCAST) { + memcpy(__skb_put(skb, ETH_ALEN), dev->broadcast, ETH_ALEN); + memcpy(__skb_put(skb, ETH_ALEN), dev->broadcast, ETH_ALEN); + } + + /* FIXME: We should group addresses here. */ + + i = 0; + netdev_for_each_mc_addr(ha, dev) { + if (i == BNEP_MAX_MULTICAST_FILTERS) + break; + memcpy(__skb_put(skb, ETH_ALEN), ha->addr, ETH_ALEN); + memcpy(__skb_put(skb, ETH_ALEN), ha->addr, ETH_ALEN); + + i++; + } + r->len = htons(skb->len - len); + } + + skb_queue_tail(&sk->sk_write_queue, skb); + wake_up_interruptible(sk_sleep(sk)); +#endif +} + +static int bnep_net_set_mac_addr(struct net_device *dev, void *arg) +{ + BT_DBG("%s", dev->name); + return 0; +} + +static void bnep_net_timeout(struct net_device *dev) +{ + BT_DBG("net_timeout"); + netif_wake_queue(dev); +} + +#ifdef CONFIG_BT_BNEP_MC_FILTER +static inline int bnep_net_mc_filter(struct sk_buff *skb, struct bnep_session *s) +{ + struct ethhdr *eh = (void *) skb->data; + + if ((eh->h_dest[0] & 1) && !test_bit(bnep_mc_hash(eh->h_dest), (ulong *) &s->mc_filter)) + return 1; + return 0; +} +#endif + +#ifdef CONFIG_BT_BNEP_PROTO_FILTER +/* Determine ether protocol. Based on eth_type_trans. 
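+ * (protocol IDs >= 1536 are Ethernet II; below that the field is an 802.3
+ * length, where a 0xFFFF marker denotes raw 802.3 and anything else 802.2 LLC)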
*/ +static inline u16 bnep_net_eth_proto(struct sk_buff *skb) +{ + struct ethhdr *eh = (void *) skb->data; + u16 proto = ntohs(eh->h_proto); + + if (proto >= 1536) + return proto; + + if (get_unaligned((__be16 *) skb->data) == htons(0xFFFF)) + return ETH_P_802_3; + + return ETH_P_802_2; +} + +static inline int bnep_net_proto_filter(struct sk_buff *skb, struct bnep_session *s) +{ + u16 proto = bnep_net_eth_proto(skb); + struct bnep_proto_filter *f = s->proto_filter; + int i; + + for (i = 0; i < BNEP_MAX_PROTO_FILTERS && f[i].end; i++) { + if (proto >= f[i].start && proto <= f[i].end) + return 0; + } + + BT_DBG("BNEP: filtered skb %p, proto 0x%.4x", skb, proto); + return 1; +} +#endif + +static netdev_tx_t bnep_net_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct bnep_session *s = netdev_priv(dev); + struct sock *sk = s->sock->sk; + + BT_DBG("skb %p, dev %p", skb, dev); + +#ifdef CONFIG_BT_BNEP_MC_FILTER + if (bnep_net_mc_filter(skb, s)) { + kfree_skb(skb); + return NETDEV_TX_OK; + } +#endif + +#ifdef CONFIG_BT_BNEP_PROTO_FILTER + if (bnep_net_proto_filter(skb, s)) { + kfree_skb(skb); + return NETDEV_TX_OK; + } +#endif + + /* + * We cannot send L2CAP packets from here as we are potentially in a bh. + * So we have to queue them and wake up session thread which is sleeping + * on the sk_sleep(sk). + */ + dev->trans_start = jiffies; + skb_queue_tail(&sk->sk_write_queue, skb); + wake_up_interruptible(sk_sleep(sk)); + + if (skb_queue_len(&sk->sk_write_queue) >= BNEP_TX_QUEUE_LEN) { + BT_DBG("tx queue is full"); + + /* Stop queuing. + * Session thread will do netif_wake_queue() */ + netif_stop_queue(dev); + } + + return NETDEV_TX_OK; +} + +static const struct net_device_ops bnep_netdev_ops = { + .ndo_open = bnep_net_open, + .ndo_stop = bnep_net_close, + .ndo_start_xmit = bnep_net_xmit, + .ndo_validate_addr = eth_validate_addr, + .ndo_set_multicast_list = bnep_net_set_mc_list, + .ndo_set_mac_address = bnep_net_set_mac_addr, + .ndo_tx_timeout = bnep_net_timeout, + .ndo_change_mtu = eth_change_mtu, + +}; + +void bnep_net_setup(struct net_device *dev) +{ + + memset(dev->broadcast, 0xff, ETH_ALEN); + dev->addr_len = ETH_ALEN; + + ether_setup(dev); + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->netdev_ops = &bnep_netdev_ops; + + dev->watchdog_timeo = HZ * 2; +} diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c new file mode 100644 index 00000000..17800b1d --- /dev/null +++ b/net/bluetooth/bnep/sock.c @@ -0,0 +1,259 @@ +/* + BNEP implementation for Linux Bluetooth stack (BlueZ). + Copyright (C) 2001-2002 Inventel Systemes + Written 2001-2002 by + David Libault + + Copyright (C) 2002 Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "bnep.h" + +static int bnep_sock_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + BT_DBG("sock %p sk %p", sock, sk); + + if (!sk) + return 0; + + sock_orphan(sk); + sock_put(sk); + return 0; +} + +static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct bnep_connlist_req cl; + struct bnep_connadd_req ca; + struct bnep_conndel_req cd; + struct bnep_conninfo ci; + struct socket *nsock; + void __user *argp = (void __user *)arg; + int err; + + BT_DBG("cmd %x arg %lx", cmd, arg); + + switch (cmd) { + case BNEPCONNADD: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + + if (copy_from_user(&ca, argp, sizeof(ca))) + return -EFAULT; + + nsock = sockfd_lookup(ca.sock, &err); + if (!nsock) + return err; + + if (nsock->sk->sk_state != BT_CONNECTED) { + sockfd_put(nsock); + return -EBADFD; + } + ca.device[sizeof(ca.device)-1] = 0; + + err = bnep_add_connection(&ca, nsock); + if (!err) { + if (copy_to_user(argp, &ca, sizeof(ca))) + err = -EFAULT; + } else + sockfd_put(nsock); + + return err; + + case BNEPCONNDEL: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + + if (copy_from_user(&cd, argp, sizeof(cd))) + return -EFAULT; + + return bnep_del_connection(&cd); + + case BNEPGETCONNLIST: + if (copy_from_user(&cl, argp, sizeof(cl))) + return -EFAULT; + + if (cl.cnum <= 0) + return -EINVAL; + + err = bnep_get_connlist(&cl); + if (!err && copy_to_user(argp, &cl, sizeof(cl))) + return -EFAULT; + + return err; + + case BNEPGETCONNINFO: + if (copy_from_user(&ci, argp, sizeof(ci))) + return -EFAULT; + + err = bnep_get_conninfo(&ci); + if (!err && copy_to_user(argp, &ci, sizeof(ci))) + return -EFAULT; + + return err; + + default: + return -EINVAL; + } + + return 0; +} + +#ifdef CONFIG_COMPAT +static int bnep_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + if (cmd == BNEPGETCONNLIST) { + struct bnep_connlist_req cl; + uint32_t uci; + int err; + + if (get_user(cl.cnum, (uint32_t __user *) arg) || + get_user(uci, (u32 __user *) (arg + 4))) + return -EFAULT; + + cl.ci = compat_ptr(uci); + + if (cl.cnum <= 0) + return -EINVAL; + + err = bnep_get_connlist(&cl); + + if (!err && put_user(cl.cnum, (uint32_t __user *) arg)) + err = -EFAULT; + + return err; + } + + return bnep_sock_ioctl(sock, cmd, arg); +} +#endif + +static const struct proto_ops bnep_sock_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .release = bnep_sock_release, + .ioctl = bnep_sock_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = bnep_sock_compat_ioctl, +#endif + .bind = sock_no_bind, + .getname = sock_no_getname, + .sendmsg = sock_no_sendmsg, + .recvmsg = sock_no_recvmsg, + .poll = sock_no_poll, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .mmap = sock_no_mmap +}; + +static struct proto bnep_proto = { + .name = "BNEP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct bt_sock) +}; + +static int bnep_sock_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + + BT_DBG("sock %p", 
sock); + + if (sock->type != SOCK_RAW) + return -ESOCKTNOSUPPORT; + + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &bnep_proto); + if (!sk) + return -ENOMEM; + + sock_init_data(sock, sk); + + sock->ops = &bnep_sock_ops; + + sock->state = SS_UNCONNECTED; + + sock_reset_flag(sk, SOCK_ZAPPED); + + sk->sk_protocol = protocol; + sk->sk_state = BT_OPEN; + + return 0; +} + +static const struct net_proto_family bnep_sock_family_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .create = bnep_sock_create +}; + +int __init bnep_sock_init(void) +{ + int err; + + err = proto_register(&bnep_proto, 0); + if (err < 0) + return err; + + err = bt_sock_register(BTPROTO_BNEP, &bnep_sock_family_ops); + if (err < 0) + goto error; + + return 0; + +error: + BT_ERR("Can't register BNEP socket"); + proto_unregister(&bnep_proto); + return err; +} + +void __exit bnep_sock_cleanup(void) +{ + if (bt_sock_unregister(BTPROTO_BNEP) < 0) + BT_ERR("Can't unregister BNEP socket"); + + proto_unregister(&bnep_proto); +} diff --git a/net/bluetooth/cmtp/Kconfig b/net/bluetooth/cmtp/Kconfig new file mode 100644 index 00000000..d6b0382f --- /dev/null +++ b/net/bluetooth/cmtp/Kconfig @@ -0,0 +1,11 @@ +config BT_CMTP + tristate "CMTP protocol support" + depends on BT && BT_L2CAP && ISDN_CAPI + help + CMTP (CAPI Message Transport Protocol) is a transport layer + for CAPI messages. CMTP is required for the Bluetooth Common + ISDN Access Profile. + + Say Y here to compile CMTP support into the kernel or say M to + compile it as module (cmtp). + diff --git a/net/bluetooth/cmtp/Makefile b/net/bluetooth/cmtp/Makefile new file mode 100644 index 00000000..890a9a5a --- /dev/null +++ b/net/bluetooth/cmtp/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux Bluetooth CMTP layer +# + +obj-$(CONFIG_BT_CMTP) += cmtp.o + +cmtp-objs := core.o sock.o capi.o diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c new file mode 100644 index 00000000..040f67b1 --- /dev/null +++ b/net/bluetooth/cmtp/capi.c @@ -0,0 +1,623 @@ +/* + CMTP implementation for Linux Bluetooth stack (BlueZ). + Copyright (C) 2002-2003 Marcel Holtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. 
+*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "cmtp.h" + +#define CAPI_INTEROPERABILITY 0x20 + +#define CAPI_INTEROPERABILITY_REQ CAPICMD(CAPI_INTEROPERABILITY, CAPI_REQ) +#define CAPI_INTEROPERABILITY_CONF CAPICMD(CAPI_INTEROPERABILITY, CAPI_CONF) +#define CAPI_INTEROPERABILITY_IND CAPICMD(CAPI_INTEROPERABILITY, CAPI_IND) +#define CAPI_INTEROPERABILITY_RESP CAPICMD(CAPI_INTEROPERABILITY, CAPI_RESP) + +#define CAPI_INTEROPERABILITY_REQ_LEN (CAPI_MSG_BASELEN + 2) +#define CAPI_INTEROPERABILITY_CONF_LEN (CAPI_MSG_BASELEN + 4) +#define CAPI_INTEROPERABILITY_IND_LEN (CAPI_MSG_BASELEN + 2) +#define CAPI_INTEROPERABILITY_RESP_LEN (CAPI_MSG_BASELEN + 2) + +#define CAPI_FUNCTION_REGISTER 0 +#define CAPI_FUNCTION_RELEASE 1 +#define CAPI_FUNCTION_GET_PROFILE 2 +#define CAPI_FUNCTION_GET_MANUFACTURER 3 +#define CAPI_FUNCTION_GET_VERSION 4 +#define CAPI_FUNCTION_GET_SERIAL_NUMBER 5 +#define CAPI_FUNCTION_MANUFACTURER 6 +#define CAPI_FUNCTION_LOOPBACK 7 + + +#define CMTP_MSGNUM 1 +#define CMTP_APPLID 2 +#define CMTP_MAPPING 3 + +static struct cmtp_application *cmtp_application_add(struct cmtp_session *session, __u16 appl) +{ + struct cmtp_application *app = kzalloc(sizeof(*app), GFP_KERNEL); + + BT_DBG("session %p application %p appl %d", session, app, appl); + + if (!app) + return NULL; + + app->state = BT_OPEN; + app->appl = appl; + + list_add_tail(&app->list, &session->applications); + + return app; +} + +static void cmtp_application_del(struct cmtp_session *session, struct cmtp_application *app) +{ + BT_DBG("session %p application %p", session, app); + + if (app) { + list_del(&app->list); + kfree(app); + } +} + +static struct cmtp_application *cmtp_application_get(struct cmtp_session *session, int pattern, __u16 value) +{ + struct cmtp_application *app; + struct list_head *p, *n; + + list_for_each_safe(p, n, &session->applications) { + app = list_entry(p, struct cmtp_application, list); + switch (pattern) { + case CMTP_MSGNUM: + if (app->msgnum == value) + return app; + break; + case CMTP_APPLID: + if (app->appl == value) + return app; + break; + case CMTP_MAPPING: + if (app->mapping == value) + return app; + break; + } + } + + return NULL; +} + +static int cmtp_msgnum_get(struct cmtp_session *session) +{ + session->msgnum++; + + if ((session->msgnum & 0xff) > 200) + session->msgnum = CMTP_INITIAL_MSGNUM + 1; + + return session->msgnum; +} + +static void cmtp_send_capimsg(struct cmtp_session *session, struct sk_buff *skb) +{ + struct cmtp_scb *scb = (void *) skb->cb; + + BT_DBG("session %p skb %p len %d", session, skb, skb->len); + + scb->id = -1; + scb->data = (CAPIMSG_COMMAND(skb->data) == CAPI_DATA_B3); + + skb_queue_tail(&session->transmit, skb); + + wake_up_interruptible(sk_sleep(session->sock->sk)); +} + +static void cmtp_send_interopmsg(struct cmtp_session *session, + __u8 subcmd, __u16 appl, __u16 msgnum, + __u16 function, unsigned char *buf, int len) +{ + struct sk_buff *skb; + unsigned char *s; + + BT_DBG("session %p subcmd 0x%02x appl %d msgnum %d", session, subcmd, appl, msgnum); + + skb = alloc_skb(CAPI_MSG_BASELEN + 6 + len, GFP_ATOMIC); + if (!skb) { + BT_ERR("Can't allocate memory for interoperability packet"); + return; + } + + s = skb_put(skb, CAPI_MSG_BASELEN + 6 + len); + + capimsg_setu16(s, 0, CAPI_MSG_BASELEN + 6 + len); + capimsg_setu16(s, 2, appl); + capimsg_setu8 (s, 4, CAPI_INTEROPERABILITY); + 
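/* remainder of the interoperability message: subcommand, message number, interoperability selector, function and parameter block */ +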
capimsg_setu8 (s, 5, subcmd); + capimsg_setu16(s, 6, msgnum); + + /* Interoperability selector (Bluetooth Device Management) */ + capimsg_setu16(s, 8, 0x0001); + + capimsg_setu8 (s, 10, 3 + len); + capimsg_setu16(s, 11, function); + capimsg_setu8 (s, 13, len); + + if (len > 0) + memcpy(s + 14, buf, len); + + cmtp_send_capimsg(session, skb); +} + +static void cmtp_recv_interopmsg(struct cmtp_session *session, struct sk_buff *skb) +{ + struct capi_ctr *ctrl = &session->ctrl; + struct cmtp_application *application; + __u16 appl, msgnum, func, info; + __u32 controller; + + BT_DBG("session %p skb %p len %d", session, skb, skb->len); + + switch (CAPIMSG_SUBCOMMAND(skb->data)) { + case CAPI_CONF: + if (skb->len < CAPI_MSG_BASELEN + 10) + break; + + func = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 5); + info = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 8); + + switch (func) { + case CAPI_FUNCTION_REGISTER: + msgnum = CAPIMSG_MSGID(skb->data); + + application = cmtp_application_get(session, CMTP_MSGNUM, msgnum); + if (application) { + application->state = BT_CONNECTED; + application->msgnum = 0; + application->mapping = CAPIMSG_APPID(skb->data); + wake_up_interruptible(&session->wait); + } + + break; + + case CAPI_FUNCTION_RELEASE: + appl = CAPIMSG_APPID(skb->data); + + application = cmtp_application_get(session, CMTP_MAPPING, appl); + if (application) { + application->state = BT_CLOSED; + application->msgnum = 0; + wake_up_interruptible(&session->wait); + } + + break; + + case CAPI_FUNCTION_GET_PROFILE: + if (skb->len < CAPI_MSG_BASELEN + 11 + sizeof(capi_profile)) + break; + + controller = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 11); + msgnum = CAPIMSG_MSGID(skb->data); + + if (!info && (msgnum == CMTP_INITIAL_MSGNUM)) { + session->ncontroller = controller; + wake_up_interruptible(&session->wait); + break; + } + + if (!info && ctrl) { + memcpy(&ctrl->profile, + skb->data + CAPI_MSG_BASELEN + 11, + sizeof(capi_profile)); + session->state = BT_CONNECTED; + capi_ctr_ready(ctrl); + } + + break; + + case CAPI_FUNCTION_GET_MANUFACTURER: + if (skb->len < CAPI_MSG_BASELEN + 15) + break; + + controller = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 10); + + if (!info && ctrl) { + int len = min_t(uint, CAPI_MANUFACTURER_LEN, + skb->data[CAPI_MSG_BASELEN + 14]); + + memset(ctrl->manu, 0, CAPI_MANUFACTURER_LEN); + strncpy(ctrl->manu, + skb->data + CAPI_MSG_BASELEN + 15, len); + } + + break; + + case CAPI_FUNCTION_GET_VERSION: + if (skb->len < CAPI_MSG_BASELEN + 32) + break; + + controller = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 12); + + if (!info && ctrl) { + ctrl->version.majorversion = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 16); + ctrl->version.minorversion = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 20); + ctrl->version.majormanuversion = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 24); + ctrl->version.minormanuversion = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 28); + } + + break; + + case CAPI_FUNCTION_GET_SERIAL_NUMBER: + if (skb->len < CAPI_MSG_BASELEN + 17) + break; + + controller = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 12); + + if (!info && ctrl) { + int len = min_t(uint, CAPI_SERIAL_LEN, + skb->data[CAPI_MSG_BASELEN + 16]); + + memset(ctrl->serial, 0, CAPI_SERIAL_LEN); + strncpy(ctrl->serial, + skb->data + CAPI_MSG_BASELEN + 17, len); + } + + break; + } + + break; + + case CAPI_IND: + if (skb->len < CAPI_MSG_BASELEN + 6) + break; + + func = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 3); + + if (func == CAPI_FUNCTION_LOOPBACK) { + int len = min_t(uint, skb->len - CAPI_MSG_BASELEN - 6, + 
skb->data[CAPI_MSG_BASELEN + 5]); + appl = CAPIMSG_APPID(skb->data); + msgnum = CAPIMSG_MSGID(skb->data); + cmtp_send_interopmsg(session, CAPI_RESP, appl, msgnum, func, + skb->data + CAPI_MSG_BASELEN + 6, len); + } + + break; + } + + kfree_skb(skb); +} + +void cmtp_recv_capimsg(struct cmtp_session *session, struct sk_buff *skb) +{ + struct capi_ctr *ctrl = &session->ctrl; + struct cmtp_application *application; + __u16 appl; + __u32 contr; + + BT_DBG("session %p skb %p len %d", session, skb, skb->len); + + if (skb->len < CAPI_MSG_BASELEN) + return; + + if (CAPIMSG_COMMAND(skb->data) == CAPI_INTEROPERABILITY) { + cmtp_recv_interopmsg(session, skb); + return; + } + + if (session->flags & (1 << CMTP_LOOPBACK)) { + kfree_skb(skb); + return; + } + + appl = CAPIMSG_APPID(skb->data); + contr = CAPIMSG_CONTROL(skb->data); + + application = cmtp_application_get(session, CMTP_MAPPING, appl); + if (application) { + appl = application->appl; + CAPIMSG_SETAPPID(skb->data, appl); + } else { + BT_ERR("Can't find application with id %d", appl); + kfree_skb(skb); + return; + } + + if ((contr & 0x7f) == 0x01) { + contr = (contr & 0xffffff80) | session->num; + CAPIMSG_SETCONTROL(skb->data, contr); + } + + if (!ctrl) { + BT_ERR("Can't find controller %d for message", session->num); + kfree_skb(skb); + return; + } + + capi_ctr_handle_message(ctrl, appl, skb); +} + +static int cmtp_load_firmware(struct capi_ctr *ctrl, capiloaddata *data) +{ + BT_DBG("ctrl %p data %p", ctrl, data); + + return 0; +} + +static void cmtp_reset_ctr(struct capi_ctr *ctrl) +{ + struct cmtp_session *session = ctrl->driverdata; + + BT_DBG("ctrl %p", ctrl); + + capi_ctr_down(ctrl); + + kthread_stop(session->task); +} + +static void cmtp_register_appl(struct capi_ctr *ctrl, __u16 appl, capi_register_params *rp) +{ + DECLARE_WAITQUEUE(wait, current); + struct cmtp_session *session = ctrl->driverdata; + struct cmtp_application *application; + unsigned long timeo = CMTP_INTEROP_TIMEOUT; + unsigned char buf[8]; + int err = 0, nconn, want = rp->level3cnt; + + BT_DBG("ctrl %p appl %d level3cnt %d datablkcnt %d datablklen %d", + ctrl, appl, rp->level3cnt, rp->datablkcnt, rp->datablklen); + + application = cmtp_application_add(session, appl); + if (!application) { + BT_ERR("Can't allocate memory for new application"); + return; + } + + if (want < 0) + nconn = ctrl->profile.nbchannel * -want; + else + nconn = want; + + if (nconn == 0) + nconn = ctrl->profile.nbchannel; + + capimsg_setu16(buf, 0, nconn); + capimsg_setu16(buf, 2, rp->datablkcnt); + capimsg_setu16(buf, 4, rp->datablklen); + + application->state = BT_CONFIG; + application->msgnum = cmtp_msgnum_get(session); + + cmtp_send_interopmsg(session, CAPI_REQ, 0x0000, application->msgnum, + CAPI_FUNCTION_REGISTER, buf, 6); + + add_wait_queue(&session->wait, &wait); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + if (!timeo) { + err = -EAGAIN; + break; + } + + if (application->state == BT_CLOSED) { + err = -application->err; + break; + } + + if (application->state == BT_CONNECTED) + break; + + if (signal_pending(current)) { + err = -EINTR; + break; + } + + timeo = schedule_timeout(timeo); + } + set_current_state(TASK_RUNNING); + remove_wait_queue(&session->wait, &wait); + + if (err) { + cmtp_application_del(session, application); + return; + } +} + +static void cmtp_release_appl(struct capi_ctr *ctrl, __u16 appl) +{ + struct cmtp_session *session = ctrl->driverdata; + struct cmtp_application *application; + + BT_DBG("ctrl %p appl %d", ctrl, appl); + + application = 
cmtp_application_get(session, CMTP_APPLID, appl); + if (!application) { + BT_ERR("Can't find application"); + return; + } + + application->msgnum = cmtp_msgnum_get(session); + + cmtp_send_interopmsg(session, CAPI_REQ, application->mapping, application->msgnum, + CAPI_FUNCTION_RELEASE, NULL, 0); + + wait_event_interruptible_timeout(session->wait, + (application->state == BT_CLOSED), CMTP_INTEROP_TIMEOUT); + + cmtp_application_del(session, application); +} + +static u16 cmtp_send_message(struct capi_ctr *ctrl, struct sk_buff *skb) +{ + struct cmtp_session *session = ctrl->driverdata; + struct cmtp_application *application; + __u16 appl; + __u32 contr; + + BT_DBG("ctrl %p skb %p", ctrl, skb); + + appl = CAPIMSG_APPID(skb->data); + contr = CAPIMSG_CONTROL(skb->data); + + application = cmtp_application_get(session, CMTP_APPLID, appl); + if ((!application) || (application->state != BT_CONNECTED)) { + BT_ERR("Can't find application with id %d", appl); + return CAPI_ILLAPPNR; + } + + CAPIMSG_SETAPPID(skb->data, application->mapping); + + if ((contr & 0x7f) == session->num) { + contr = (contr & 0xffffff80) | 0x01; + CAPIMSG_SETCONTROL(skb->data, contr); + } + + cmtp_send_capimsg(session, skb); + + return CAPI_NOERROR; +} + +static char *cmtp_procinfo(struct capi_ctr *ctrl) +{ + return "CAPI Message Transport Protocol"; +} + +static int cmtp_proc_show(struct seq_file *m, void *v) +{ + struct capi_ctr *ctrl = m->private; + struct cmtp_session *session = ctrl->driverdata; + struct cmtp_application *app; + struct list_head *p, *n; + + seq_printf(m, "%s\n\n", cmtp_procinfo(ctrl)); + seq_printf(m, "addr %s\n", session->name); + seq_printf(m, "ctrl %d\n", session->num); + + list_for_each_safe(p, n, &session->applications) { + app = list_entry(p, struct cmtp_application, list); + seq_printf(m, "appl %d -> %d\n", app->appl, app->mapping); + } + + return 0; +} + +static int cmtp_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cmtp_proc_show, PDE(inode)->data); +} + +static const struct file_operations cmtp_proc_fops = { + .owner = THIS_MODULE, + .open = cmtp_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +int cmtp_attach_device(struct cmtp_session *session) +{ + unsigned char buf[4]; + long ret; + + BT_DBG("session %p", session); + + capimsg_setu32(buf, 0, 0); + + cmtp_send_interopmsg(session, CAPI_REQ, 0xffff, CMTP_INITIAL_MSGNUM, + CAPI_FUNCTION_GET_PROFILE, buf, 4); + + ret = wait_event_interruptible_timeout(session->wait, + session->ncontroller, CMTP_INTEROP_TIMEOUT); + + BT_INFO("Found %d CAPI controller(s) on device %s", session->ncontroller, session->name); + + if (!ret) + return -ETIMEDOUT; + + if (!session->ncontroller) + return -ENODEV; + + if (session->ncontroller > 1) + BT_INFO("Setting up only CAPI controller 1"); + + session->ctrl.owner = THIS_MODULE; + session->ctrl.driverdata = session; + strcpy(session->ctrl.name, session->name); + + session->ctrl.driver_name = "cmtp"; + session->ctrl.load_firmware = cmtp_load_firmware; + session->ctrl.reset_ctr = cmtp_reset_ctr; + session->ctrl.register_appl = cmtp_register_appl; + session->ctrl.release_appl = cmtp_release_appl; + session->ctrl.send_message = cmtp_send_message; + + session->ctrl.procinfo = cmtp_procinfo; + session->ctrl.proc_fops = &cmtp_proc_fops; + + if (attach_capi_ctr(&session->ctrl) < 0) { + BT_ERR("Can't attach new controller"); + return -EBUSY; + } + + session->num = session->ctrl.cnr; + + BT_DBG("session %p num %d", session, session->num); + + 
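/* controller is attached; query manufacturer, version, serial number and profile of CAPI controller 1 */ +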
capimsg_setu32(buf, 0, 1); + + cmtp_send_interopmsg(session, CAPI_REQ, 0xffff, cmtp_msgnum_get(session), + CAPI_FUNCTION_GET_MANUFACTURER, buf, 4); + + cmtp_send_interopmsg(session, CAPI_REQ, 0xffff, cmtp_msgnum_get(session), + CAPI_FUNCTION_GET_VERSION, buf, 4); + + cmtp_send_interopmsg(session, CAPI_REQ, 0xffff, cmtp_msgnum_get(session), + CAPI_FUNCTION_GET_SERIAL_NUMBER, buf, 4); + + cmtp_send_interopmsg(session, CAPI_REQ, 0xffff, cmtp_msgnum_get(session), + CAPI_FUNCTION_GET_PROFILE, buf, 4); + + return 0; +} + +void cmtp_detach_device(struct cmtp_session *session) +{ + BT_DBG("session %p", session); + + detach_capi_ctr(&session->ctrl); +} diff --git a/net/bluetooth/cmtp/cmtp.h b/net/bluetooth/cmtp/cmtp.h new file mode 100644 index 00000000..db43b54a --- /dev/null +++ b/net/bluetooth/cmtp/cmtp.h @@ -0,0 +1,128 @@ +/* + CMTP implementation for Linux Bluetooth stack (BlueZ). + Copyright (C) 2002-2003 Marcel Holtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. 
+*/ + +#ifndef __CMTP_H +#define __CMTP_H + +#include +#include + +#define BTNAMSIZ 18 + +/* CMTP ioctl defines */ +#define CMTPCONNADD _IOW('C', 200, int) +#define CMTPCONNDEL _IOW('C', 201, int) +#define CMTPGETCONNLIST _IOR('C', 210, int) +#define CMTPGETCONNINFO _IOR('C', 211, int) + +#define CMTP_LOOPBACK 0 + +struct cmtp_connadd_req { + int sock; /* Connected socket */ + __u32 flags; +}; + +struct cmtp_conndel_req { + bdaddr_t bdaddr; + __u32 flags; +}; + +struct cmtp_conninfo { + bdaddr_t bdaddr; + __u32 flags; + __u16 state; + int num; +}; + +struct cmtp_connlist_req { + __u32 cnum; + struct cmtp_conninfo __user *ci; +}; + +int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock); +int cmtp_del_connection(struct cmtp_conndel_req *req); +int cmtp_get_connlist(struct cmtp_connlist_req *req); +int cmtp_get_conninfo(struct cmtp_conninfo *ci); + +/* CMTP session defines */ +#define CMTP_INTEROP_TIMEOUT (HZ * 5) +#define CMTP_INITIAL_MSGNUM 0xff00 + +struct cmtp_session { + struct list_head list; + + struct socket *sock; + + bdaddr_t bdaddr; + + unsigned long state; + unsigned long flags; + + uint mtu; + + char name[BTNAMSIZ]; + + struct task_struct *task; + + wait_queue_head_t wait; + + int ncontroller; + int num; + struct capi_ctr ctrl; + + struct list_head applications; + + unsigned long blockids; + int msgnum; + + struct sk_buff_head transmit; + + struct sk_buff *reassembly[16]; +}; + +struct cmtp_application { + struct list_head list; + + unsigned long state; + int err; + + __u16 appl; + __u16 mapping; + + __u16 msgnum; +}; + +struct cmtp_scb { + int id; + int data; +}; + +int cmtp_attach_device(struct cmtp_session *session); +void cmtp_detach_device(struct cmtp_session *session); + +void cmtp_recv_capimsg(struct cmtp_session *session, struct sk_buff *skb); + +/* CMTP init defines */ +int cmtp_init_sockets(void); +void cmtp_cleanup_sockets(void); + +#endif /* __CMTP_H */ diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c new file mode 100644 index 00000000..c5b11af9 --- /dev/null +++ b/net/bluetooth/cmtp/core.c @@ -0,0 +1,497 @@ +/* + CMTP implementation for Linux Bluetooth stack (BlueZ). + Copyright (C) 2002-2003 Marcel Holtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. 
+*/ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "cmtp.h" + +#define VERSION "1.0" + +static DECLARE_RWSEM(cmtp_session_sem); +static LIST_HEAD(cmtp_session_list); + +static struct cmtp_session *__cmtp_get_session(bdaddr_t *bdaddr) +{ + struct cmtp_session *session; + struct list_head *p; + + BT_DBG(""); + + list_for_each(p, &cmtp_session_list) { + session = list_entry(p, struct cmtp_session, list); + if (!bacmp(bdaddr, &session->bdaddr)) + return session; + } + return NULL; +} + +static void __cmtp_link_session(struct cmtp_session *session) +{ + __module_get(THIS_MODULE); + list_add(&session->list, &cmtp_session_list); +} + +static void __cmtp_unlink_session(struct cmtp_session *session) +{ + list_del(&session->list); + module_put(THIS_MODULE); +} + +static void __cmtp_copy_session(struct cmtp_session *session, struct cmtp_conninfo *ci) +{ + memset(ci, 0, sizeof(*ci)); + bacpy(&ci->bdaddr, &session->bdaddr); + + ci->flags = session->flags; + ci->state = session->state; + + ci->num = session->num; +} + + +static inline int cmtp_alloc_block_id(struct cmtp_session *session) +{ + int i, id = -1; + + for (i = 0; i < 16; i++) + if (!test_and_set_bit(i, &session->blockids)) { + id = i; + break; + } + + return id; +} + +static inline void cmtp_free_block_id(struct cmtp_session *session, int id) +{ + clear_bit(id, &session->blockids); +} + +static inline void cmtp_add_msgpart(struct cmtp_session *session, int id, const unsigned char *buf, int count) +{ + struct sk_buff *skb = session->reassembly[id], *nskb; + int size; + + BT_DBG("session %p buf %p count %d", session, buf, count); + + size = (skb) ? 
skb->len + count : count; + + nskb = alloc_skb(size, GFP_ATOMIC); + if (!nskb) { + BT_ERR("Can't allocate memory for CAPI message"); + return; + } + + if (skb && (skb->len > 0)) + skb_copy_from_linear_data(skb, skb_put(nskb, skb->len), skb->len); + + memcpy(skb_put(nskb, count), buf, count); + + session->reassembly[id] = nskb; + + kfree_skb(skb); +} + +static inline int cmtp_recv_frame(struct cmtp_session *session, struct sk_buff *skb) +{ + __u8 hdr, hdrlen, id; + __u16 len; + + BT_DBG("session %p skb %p len %d", session, skb, skb->len); + + while (skb->len > 0) { + hdr = skb->data[0]; + + switch (hdr & 0xc0) { + case 0x40: + hdrlen = 2; + len = skb->data[1]; + break; + case 0x80: + hdrlen = 3; + len = skb->data[1] | (skb->data[2] << 8); + break; + default: + hdrlen = 1; + len = 0; + break; + } + + id = (hdr & 0x3c) >> 2; + + BT_DBG("hdr 0x%02x hdrlen %d len %d id %d", hdr, hdrlen, len, id); + + if (hdrlen + len > skb->len) { + BT_ERR("Wrong size or header information in CMTP frame"); + break; + } + + if (len == 0) { + skb_pull(skb, hdrlen); + continue; + } + + switch (hdr & 0x03) { + case 0x00: + cmtp_add_msgpart(session, id, skb->data + hdrlen, len); + cmtp_recv_capimsg(session, session->reassembly[id]); + session->reassembly[id] = NULL; + break; + case 0x01: + cmtp_add_msgpart(session, id, skb->data + hdrlen, len); + break; + default: + if (session->reassembly[id] != NULL) + kfree_skb(session->reassembly[id]); + session->reassembly[id] = NULL; + break; + } + + skb_pull(skb, hdrlen + len); + } + + kfree_skb(skb); + return 0; +} + +static int cmtp_send_frame(struct cmtp_session *session, unsigned char *data, int len) +{ + struct socket *sock = session->sock; + struct kvec iv = { data, len }; + struct msghdr msg; + + BT_DBG("session %p data %p len %d", session, data, len); + + if (!len) + return 0; + + memset(&msg, 0, sizeof(msg)); + + return kernel_sendmsg(sock, &msg, &iv, 1, len); +} + +static void cmtp_process_transmit(struct cmtp_session *session) +{ + struct sk_buff *skb, *nskb; + unsigned char *hdr; + unsigned int size, tail; + + BT_DBG("session %p", session); + + nskb = alloc_skb(session->mtu, GFP_ATOMIC); + if (!nskb) { + BT_ERR("Can't allocate memory for new frame"); + return; + } + + while ((skb = skb_dequeue(&session->transmit))) { + struct cmtp_scb *scb = (void *) skb->cb; + + tail = session->mtu - nskb->len; + if (tail < 5) { + cmtp_send_frame(session, nskb->data, nskb->len); + skb_trim(nskb, 0); + tail = session->mtu; + } + + size = min_t(uint, ((tail < 258) ? (tail - 2) : (tail - 3)), skb->len); + + if (scb->id < 0) { + scb->id = cmtp_alloc_block_id(session); + if (scb->id < 0) { + skb_queue_head(&session->transmit, skb); + break; + } + } + + if (size < 256) { + hdr = skb_put(nskb, 2); + hdr[0] = 0x40 + | ((scb->id << 2) & 0x3c) + | ((skb->len == size) ? 0x00 : 0x01); + hdr[1] = size; + } else { + hdr = skb_put(nskb, 3); + hdr[0] = 0x80 + | ((scb->id << 2) & 0x3c) + | ((skb->len == size) ? 
0x00 : 0x01); + hdr[1] = size & 0xff; + hdr[2] = size >> 8; + } + + skb_copy_from_linear_data(skb, skb_put(nskb, size), size); + skb_pull(skb, size); + + if (skb->len > 0) { + skb_queue_head(&session->transmit, skb); + } else { + cmtp_free_block_id(session, scb->id); + if (scb->data) { + cmtp_send_frame(session, nskb->data, nskb->len); + skb_trim(nskb, 0); + } + kfree_skb(skb); + } + } + + cmtp_send_frame(session, nskb->data, nskb->len); + + kfree_skb(nskb); +} + +static int cmtp_session(void *arg) +{ + struct cmtp_session *session = arg; + struct sock *sk = session->sock->sk; + struct sk_buff *skb; + wait_queue_t wait; + + BT_DBG("session %p", session); + + set_user_nice(current, -15); + + init_waitqueue_entry(&wait, current); + add_wait_queue(sk_sleep(sk), &wait); + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + + if (sk->sk_state != BT_CONNECTED) + break; + + while ((skb = skb_dequeue(&sk->sk_receive_queue))) { + skb_orphan(skb); + cmtp_recv_frame(session, skb); + } + + cmtp_process_transmit(session); + + schedule(); + } + set_current_state(TASK_RUNNING); + remove_wait_queue(sk_sleep(sk), &wait); + + down_write(&cmtp_session_sem); + + if (!(session->flags & (1 << CMTP_LOOPBACK))) + cmtp_detach_device(session); + + fput(session->sock->file); + + __cmtp_unlink_session(session); + + up_write(&cmtp_session_sem); + + kfree(session); + return 0; +} + +int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock) +{ + struct cmtp_session *session, *s; + int i, err; + + BT_DBG(""); + + session = kzalloc(sizeof(struct cmtp_session), GFP_KERNEL); + if (!session) + return -ENOMEM; + + down_write(&cmtp_session_sem); + + s = __cmtp_get_session(&bt_sk(sock->sk)->dst); + if (s && s->state == BT_CONNECTED) { + err = -EEXIST; + goto failed; + } + + bacpy(&session->bdaddr, &bt_sk(sock->sk)->dst); + + session->mtu = min_t(uint, l2cap_pi(sock->sk)->chan->omtu, + l2cap_pi(sock->sk)->chan->imtu); + + BT_DBG("mtu %d", session->mtu); + + sprintf(session->name, "%s", batostr(&bt_sk(sock->sk)->dst)); + + session->sock = sock; + session->state = BT_CONFIG; + + init_waitqueue_head(&session->wait); + + session->msgnum = CMTP_INITIAL_MSGNUM; + + INIT_LIST_HEAD(&session->applications); + + skb_queue_head_init(&session->transmit); + + for (i = 0; i < 16; i++) + session->reassembly[i] = NULL; + + session->flags = req->flags; + + __cmtp_link_session(session); + + session->task = kthread_run(cmtp_session, session, "kcmtpd_ctr_%d", + session->num); + if (IS_ERR(session->task)) { + err = PTR_ERR(session->task); + goto unlink; + } + + if (!(session->flags & (1 << CMTP_LOOPBACK))) { + err = cmtp_attach_device(session); + if (err < 0) + goto detach; + } + + up_write(&cmtp_session_sem); + return 0; + +detach: + cmtp_detach_device(session); + +unlink: + __cmtp_unlink_session(session); + +failed: + up_write(&cmtp_session_sem); + kfree(session); + return err; +} + +int cmtp_del_connection(struct cmtp_conndel_req *req) +{ + struct cmtp_session *session; + int err = 0; + + BT_DBG(""); + + down_read(&cmtp_session_sem); + + session = __cmtp_get_session(&req->bdaddr); + if (session) { + /* Flush the transmit queue */ + skb_queue_purge(&session->transmit); + + /* Stop session thread */ + kthread_stop(session->task); + } else + err = -ENOENT; + + up_read(&cmtp_session_sem); + return err; +} + +int cmtp_get_connlist(struct cmtp_connlist_req *req) +{ + struct list_head *p; + int err = 0, n = 0; + + BT_DBG(""); + + down_read(&cmtp_session_sem); + + list_for_each(p, &cmtp_session_list) { + struct 
cmtp_session *session; + struct cmtp_conninfo ci; + + session = list_entry(p, struct cmtp_session, list); + + __cmtp_copy_session(session, &ci); + + if (copy_to_user(req->ci, &ci, sizeof(ci))) { + err = -EFAULT; + break; + } + + if (++n >= req->cnum) + break; + + req->ci++; + } + req->cnum = n; + + up_read(&cmtp_session_sem); + return err; +} + +int cmtp_get_conninfo(struct cmtp_conninfo *ci) +{ + struct cmtp_session *session; + int err = 0; + + down_read(&cmtp_session_sem); + + session = __cmtp_get_session(&ci->bdaddr); + if (session) + __cmtp_copy_session(session, ci); + else + err = -ENOENT; + + up_read(&cmtp_session_sem); + return err; +} + + +static int __init cmtp_init(void) +{ + BT_INFO("CMTP (CAPI Emulation) ver %s", VERSION); + + cmtp_init_sockets(); + + return 0; +} + +static void __exit cmtp_exit(void) +{ + cmtp_cleanup_sockets(); +} + +module_init(cmtp_init); +module_exit(cmtp_exit); + +MODULE_AUTHOR("Marcel Holtmann "); +MODULE_DESCRIPTION("Bluetooth CMTP ver " VERSION); +MODULE_VERSION(VERSION); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("bt-proto-5"); diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c new file mode 100644 index 00000000..3f2dd5c2 --- /dev/null +++ b/net/bluetooth/cmtp/sock.c @@ -0,0 +1,253 @@ +/* + CMTP implementation for Linux Bluetooth stack (BlueZ). + Copyright (C) 2002-2003 Marcel Holtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. 
+*/ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "cmtp.h" + +static int cmtp_sock_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + BT_DBG("sock %p sk %p", sock, sk); + + if (!sk) + return 0; + + sock_orphan(sk); + sock_put(sk); + + return 0; +} + +static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct cmtp_connadd_req ca; + struct cmtp_conndel_req cd; + struct cmtp_connlist_req cl; + struct cmtp_conninfo ci; + struct socket *nsock; + void __user *argp = (void __user *)arg; + int err; + + BT_DBG("cmd %x arg %lx", cmd, arg); + + switch (cmd) { + case CMTPCONNADD: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + + if (copy_from_user(&ca, argp, sizeof(ca))) + return -EFAULT; + + nsock = sockfd_lookup(ca.sock, &err); + if (!nsock) + return err; + + if (nsock->sk->sk_state != BT_CONNECTED) { + sockfd_put(nsock); + return -EBADFD; + } + + err = cmtp_add_connection(&ca, nsock); + if (!err) { + if (copy_to_user(argp, &ca, sizeof(ca))) + err = -EFAULT; + } else + sockfd_put(nsock); + + return err; + + case CMTPCONNDEL: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + + if (copy_from_user(&cd, argp, sizeof(cd))) + return -EFAULT; + + return cmtp_del_connection(&cd); + + case CMTPGETCONNLIST: + if (copy_from_user(&cl, argp, sizeof(cl))) + return -EFAULT; + + if (cl.cnum <= 0) + return -EINVAL; + + err = cmtp_get_connlist(&cl); + if (!err && copy_to_user(argp, &cl, sizeof(cl))) + return -EFAULT; + + return err; + + case CMTPGETCONNINFO: + if (copy_from_user(&ci, argp, sizeof(ci))) + return -EFAULT; + + err = cmtp_get_conninfo(&ci); + if (!err && copy_to_user(argp, &ci, sizeof(ci))) + return -EFAULT; + + return err; + } + + return -EINVAL; +} + +#ifdef CONFIG_COMPAT +static int cmtp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + if (cmd == CMTPGETCONNLIST) { + struct cmtp_connlist_req cl; + uint32_t uci; + int err; + + if (get_user(cl.cnum, (uint32_t __user *) arg) || + get_user(uci, (u32 __user *) (arg + 4))) + return -EFAULT; + + cl.ci = compat_ptr(uci); + + if (cl.cnum <= 0) + return -EINVAL; + + err = cmtp_get_connlist(&cl); + + if (!err && put_user(cl.cnum, (uint32_t __user *) arg)) + err = -EFAULT; + + return err; + } + + return cmtp_sock_ioctl(sock, cmd, arg); +} +#endif + +static const struct proto_ops cmtp_sock_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .release = cmtp_sock_release, + .ioctl = cmtp_sock_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = cmtp_sock_compat_ioctl, +#endif + .bind = sock_no_bind, + .getname = sock_no_getname, + .sendmsg = sock_no_sendmsg, + .recvmsg = sock_no_recvmsg, + .poll = sock_no_poll, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .mmap = sock_no_mmap +}; + +static struct proto cmtp_proto = { + .name = "CMTP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct bt_sock) +}; + +static int cmtp_sock_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + + BT_DBG("sock %p", sock); + + if (sock->type != SOCK_RAW) + return -ESOCKTNOSUPPORT; + + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &cmtp_proto); + if (!sk) + return -ENOMEM; + + sock_init_data(sock, sk); + + sock->ops = &cmtp_sock_ops; + + 
sock->state = SS_UNCONNECTED; + + sock_reset_flag(sk, SOCK_ZAPPED); + + sk->sk_protocol = protocol; + sk->sk_state = BT_OPEN; + + return 0; +} + +static const struct net_proto_family cmtp_sock_family_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .create = cmtp_sock_create +}; + +int cmtp_init_sockets(void) +{ + int err; + + err = proto_register(&cmtp_proto, 0); + if (err < 0) + return err; + + err = bt_sock_register(BTPROTO_CMTP, &cmtp_sock_family_ops); + if (err < 0) + goto error; + + return 0; + +error: + BT_ERR("Can't register CMTP socket"); + proto_unregister(&cmtp_proto); + return err; +} + +void cmtp_cleanup_sockets(void) +{ + if (bt_sock_unregister(BTPROTO_CMTP) < 0) + BT_ERR("Can't unregister CMTP socket"); + + proto_unregister(&cmtp_proto); +} diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c new file mode 100644 index 00000000..33c4e0cd --- /dev/null +++ b/net/bluetooth/hci_conn.c @@ -0,0 +1,991 @@ +/* + BlueZ - Bluetooth protocol stack for Linux + Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved. + + Written 2000,2001 by Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +/* Bluetooth HCI connection handling. 
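Covers creation, authentication/encryption and teardown of ACL, SCO/eSCO and LE links.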
*/ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +static void hci_le_connect(struct hci_conn *conn) +{ + struct hci_dev *hdev = conn->hdev; + struct hci_cp_le_create_conn cp; + + conn->state = BT_CONNECT; + conn->out = 1; + conn->link_mode |= HCI_LM_MASTER; + conn->sec_level = BT_SECURITY_LOW; + + memset(&cp, 0, sizeof(cp)); + cp.scan_interval = cpu_to_le16(0x0004); + cp.scan_window = cpu_to_le16(0x0004); + bacpy(&cp.peer_addr, &conn->dst); + cp.peer_addr_type = conn->dst_type; + cp.conn_interval_min = cpu_to_le16(0x0008); + cp.conn_interval_max = cpu_to_le16(0x0100); + cp.supervision_timeout = cpu_to_le16(0x0064); + cp.min_ce_len = cpu_to_le16(0x0001); + cp.max_ce_len = cpu_to_le16(0x0001); + + hci_send_cmd(hdev, HCI_OP_LE_CREATE_CONN, sizeof(cp), &cp); +} + +static void hci_le_connect_cancel(struct hci_conn *conn) +{ + hci_send_cmd(conn->hdev, HCI_OP_LE_CREATE_CONN_CANCEL, 0, NULL); +} + +void hci_acl_connect(struct hci_conn *conn) +{ + struct hci_dev *hdev = conn->hdev; + struct inquiry_entry *ie; + struct hci_cp_create_conn cp; + + BT_DBG("%p", conn); + + conn->state = BT_CONNECT; + conn->out = 1; + + conn->link_mode = HCI_LM_MASTER; + + conn->attempt++; + + conn->link_policy = hdev->link_policy; + + memset(&cp, 0, sizeof(cp)); + bacpy(&cp.bdaddr, &conn->dst); + cp.pscan_rep_mode = 0x02; + + ie = hci_inquiry_cache_lookup(hdev, &conn->dst); + if (ie) { + if (inquiry_entry_age(ie) <= INQUIRY_ENTRY_AGE_MAX) { + cp.pscan_rep_mode = ie->data.pscan_rep_mode; + cp.pscan_mode = ie->data.pscan_mode; + cp.clock_offset = ie->data.clock_offset | + cpu_to_le16(0x8000); + } + + memcpy(conn->dev_class, ie->data.dev_class, 3); + conn->ssp_mode = ie->data.ssp_mode; + } + + cp.pkt_type = cpu_to_le16(conn->pkt_type); + if (lmp_rswitch_capable(hdev) && !(hdev->link_mode & HCI_LM_MASTER)) + cp.role_switch = 0x01; + else + cp.role_switch = 0x00; + + hci_send_cmd(hdev, HCI_OP_CREATE_CONN, sizeof(cp), &cp); +} + +static void hci_acl_connect_cancel(struct hci_conn *conn) +{ + struct hci_cp_create_conn_cancel cp; + + BT_DBG("%p", conn); + + if (conn->hdev->hci_ver < 2) + return; + + bacpy(&cp.bdaddr, &conn->dst); + hci_send_cmd(conn->hdev, HCI_OP_CREATE_CONN_CANCEL, sizeof(cp), &cp); +} + +void hci_acl_disconn(struct hci_conn *conn, __u8 reason) +{ + struct hci_cp_disconnect cp; + + BT_DBG("%p", conn); + + conn->state = BT_DISCONN; + + cp.handle = cpu_to_le16(conn->handle); + cp.reason = reason; + hci_send_cmd(conn->hdev, HCI_OP_DISCONNECT, sizeof(cp), &cp); +} + +void hci_add_sco(struct hci_conn *conn, __u16 handle) +{ + struct hci_dev *hdev = conn->hdev; + struct hci_cp_add_sco cp; + + BT_DBG("%p", conn); + + conn->state = BT_CONNECT; + conn->out = 1; + + conn->attempt++; + + cp.handle = cpu_to_le16(handle); + cp.pkt_type = cpu_to_le16(conn->pkt_type); + + hci_send_cmd(hdev, HCI_OP_ADD_SCO, sizeof(cp), &cp); +} + +void hci_setup_sync(struct hci_conn *conn, __u16 handle) +{ + struct hci_dev *hdev = conn->hdev; + struct hci_cp_setup_sync_conn cp; + + BT_DBG("%p", conn); + + conn->state = BT_CONNECT; + conn->out = 1; + + conn->attempt++; + + cp.handle = cpu_to_le16(handle); + cp.pkt_type = cpu_to_le16(conn->pkt_type); + + cp.tx_bandwidth = cpu_to_le32(0x00001f40); + cp.rx_bandwidth = cpu_to_le32(0x00001f40); + cp.max_latency = cpu_to_le16(0xffff); + cp.voice_setting = cpu_to_le16(hdev->voice_setting); + cp.retrans_effort = 0xff; + + hci_send_cmd(hdev, HCI_OP_SETUP_SYNC_CONN, 
sizeof(cp), &cp); +} + +void hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, + u16 latency, u16 to_multiplier) +{ + struct hci_cp_le_conn_update cp; + struct hci_dev *hdev = conn->hdev; + + memset(&cp, 0, sizeof(cp)); + + cp.handle = cpu_to_le16(conn->handle); + cp.conn_interval_min = cpu_to_le16(min); + cp.conn_interval_max = cpu_to_le16(max); + cp.conn_latency = cpu_to_le16(latency); + cp.supervision_timeout = cpu_to_le16(to_multiplier); + cp.min_ce_len = cpu_to_le16(0x0001); + cp.max_ce_len = cpu_to_le16(0x0001); + + hci_send_cmd(hdev, HCI_OP_LE_CONN_UPDATE, sizeof(cp), &cp); +} +EXPORT_SYMBOL(hci_le_conn_update); + +void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __u8 rand[8], + __u8 ltk[16]) +{ + struct hci_dev *hdev = conn->hdev; + struct hci_cp_le_start_enc cp; + + BT_DBG("%p", conn); + + memset(&cp, 0, sizeof(cp)); + + cp.handle = cpu_to_le16(conn->handle); + memcpy(cp.ltk, ltk, sizeof(cp.ltk)); + cp.ediv = ediv; + memcpy(cp.rand, rand, sizeof(rand)); + + hci_send_cmd(hdev, HCI_OP_LE_START_ENC, sizeof(cp), &cp); +} +EXPORT_SYMBOL(hci_le_start_enc); + +void hci_le_ltk_reply(struct hci_conn *conn, u8 ltk[16]) +{ + struct hci_dev *hdev = conn->hdev; + struct hci_cp_le_ltk_reply cp; + + BT_DBG("%p", conn); + + memset(&cp, 0, sizeof(cp)); + + cp.handle = cpu_to_le16(conn->handle); + memcpy(cp.ltk, ltk, sizeof(ltk)); + + hci_send_cmd(hdev, HCI_OP_LE_LTK_REPLY, sizeof(cp), &cp); +} +EXPORT_SYMBOL(hci_le_ltk_reply); + +void hci_le_ltk_neg_reply(struct hci_conn *conn) +{ + struct hci_dev *hdev = conn->hdev; + struct hci_cp_le_ltk_neg_reply cp; + + BT_DBG("%p", conn); + + memset(&cp, 0, sizeof(cp)); + + cp.handle = cpu_to_le16(conn->handle); + + hci_send_cmd(hdev, HCI_OP_LE_LTK_NEG_REPLY, sizeof(cp), &cp); +} + +/* Device _must_ be locked */ +void hci_sco_setup(struct hci_conn *conn, __u8 status) +{ + struct hci_conn *sco = conn->link; + + BT_DBG("%p", conn); + + if (!sco) + return; + + if (!status) { + if (lmp_esco_capable(conn->hdev)) + hci_setup_sync(sco, conn->handle); + else + hci_add_sco(sco, conn->handle); + } else { + hci_proto_connect_cfm(sco, status); + hci_conn_del(sco); + } +} + +static void hci_conn_timeout(unsigned long arg) +{ + struct hci_conn *conn = (void *) arg; + struct hci_dev *hdev = conn->hdev; + __u8 reason; + + BT_DBG("conn %p state %d", conn, conn->state); + + if (atomic_read(&conn->refcnt)) + return; + + hci_dev_lock(hdev); + + switch (conn->state) { + case BT_CONNECT: + case BT_CONNECT2: + if (conn->out) { + if (conn->type == ACL_LINK) + hci_acl_connect_cancel(conn); + else if (conn->type == LE_LINK) + hci_le_connect_cancel(conn); + } + break; + case BT_CONFIG: + case BT_CONNECTED: + reason = hci_proto_disconn_ind(conn); + hci_acl_disconn(conn, reason); + break; + default: + conn->state = BT_CLOSED; + break; + } + + hci_dev_unlock(hdev); +} + +static void hci_conn_idle(unsigned long arg) +{ + struct hci_conn *conn = (void *) arg; + + BT_DBG("conn %p mode %d", conn, conn->mode); + + hci_conn_enter_sniff_mode(conn); +} + +static void hci_conn_auto_accept(unsigned long arg) +{ + struct hci_conn *conn = (void *) arg; + struct hci_dev *hdev = conn->hdev; + + hci_dev_lock(hdev); + + hci_send_cmd(hdev, HCI_OP_USER_CONFIRM_REPLY, sizeof(conn->dst), + &conn->dst); + + hci_dev_unlock(hdev); +} + +struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, + __u16 pkt_type, bdaddr_t *dst) +{ + struct hci_conn *conn; + + BT_DBG("%s dst %s", hdev->name, batostr(dst)); + + conn = kzalloc(sizeof(struct hci_conn), GFP_ATOMIC); + if (!conn) + return NULL; + 
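/* fill in peer address and default parameters before the connection is added to the hash */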
+ bacpy(&conn->dst, dst); + conn->hdev = hdev; + conn->type = type; + conn->mode = HCI_CM_ACTIVE; + conn->state = BT_OPEN; + conn->auth_type = HCI_AT_GENERAL_BONDING; + conn->io_capability = hdev->io_capability; + conn->remote_auth = 0xff; + conn->key_type = 0xff; + + conn->power_save = 1; + conn->disc_timeout = HCI_DISCONN_TIMEOUT; + + switch (type) { + case ACL_LINK: + conn->pkt_type = hdev->pkt_type & ACL_PTYPE_MASK; + break; + case SCO_LINK: + if (!pkt_type) + pkt_type = SCO_ESCO_MASK; + case ESCO_LINK: + if (!pkt_type) + pkt_type = ALL_ESCO_MASK; + if (lmp_esco_capable(hdev)) { + /* HCI Setup Synchronous Connection Command uses + reverse logic on the EDR_ESCO_MASK bits */ + conn->pkt_type = (pkt_type ^ EDR_ESCO_MASK) & + hdev->esco_type; + } else { + /* Legacy HCI Add Sco Connection Command uses a + shifted bitmask */ + conn->pkt_type = (pkt_type << 5) & hdev->pkt_type & + SCO_PTYPE_MASK; + } + break; + } + + skb_queue_head_init(&conn->data_q); + + setup_timer(&conn->disc_timer, hci_conn_timeout, (unsigned long)conn); + setup_timer(&conn->idle_timer, hci_conn_idle, (unsigned long)conn); + setup_timer(&conn->auto_accept_timer, hci_conn_auto_accept, + (unsigned long) conn); + + atomic_set(&conn->refcnt, 0); + + hci_dev_hold(hdev); + + tasklet_disable(&hdev->tx_task); + + hci_conn_hash_add(hdev, conn); + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_CONN_ADD); + + atomic_set(&conn->devref, 0); + + hci_conn_init_sysfs(conn); + + tasklet_enable(&hdev->tx_task); + + return conn; +} + +int hci_conn_del(struct hci_conn *conn) +{ + struct hci_dev *hdev = conn->hdev; + + BT_DBG("%s conn %p handle %d", hdev->name, conn, conn->handle); + + del_timer(&conn->idle_timer); + + del_timer(&conn->disc_timer); + + del_timer(&conn->auto_accept_timer); + + if (conn->type == ACL_LINK) { + struct hci_conn *sco = conn->link; + if (sco) + sco->link = NULL; + + /* Unacked frames */ + hdev->acl_cnt += conn->sent; + } else if (conn->type == LE_LINK) { + if (hdev->le_pkts) + hdev->le_cnt += conn->sent; + else + hdev->acl_cnt += conn->sent; + } else { + struct hci_conn *acl = conn->link; + if (acl) { + acl->link = NULL; + hci_conn_put(acl); + } + } + + tasklet_disable(&hdev->tx_task); + + hci_conn_hash_del(hdev, conn); + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_CONN_DEL); + + tasklet_enable(&hdev->tx_task); + + skb_queue_purge(&conn->data_q); + + hci_conn_put_device(conn); + + hci_dev_put(hdev); + + if (conn->handle == 0) + kfree(conn); + + return 0; +} + +struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src) +{ + int use_src = bacmp(src, BDADDR_ANY); + struct hci_dev *hdev = NULL; + struct list_head *p; + + BT_DBG("%s -> %s", batostr(src), batostr(dst)); + + read_lock_bh(&hci_dev_list_lock); + + list_for_each(p, &hci_dev_list) { + struct hci_dev *d = list_entry(p, struct hci_dev, list); + + if (!test_bit(HCI_UP, &d->flags) || test_bit(HCI_RAW, &d->flags)) + continue; + + /* Simple routing: + * No source address - find interface with bdaddr != dst + * Source address - find interface with bdaddr == src + */ + + if (use_src) { + if (!bacmp(&d->bdaddr, src)) { + hdev = d; break; + } + } else { + if (bacmp(&d->bdaddr, dst)) { + hdev = d; break; + } + } + } + + if (hdev) + hdev = hci_dev_hold(hdev); + + read_unlock_bh(&hci_dev_list_lock); + return hdev; +} +EXPORT_SYMBOL(hci_get_route); + +/* Create SCO, ACL or LE connection. 
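+ * For SCO/eSCO the ACL link is set up first and the sync link is attached to it.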
+ * Device _must_ be locked */ +struct hci_conn *hci_connect(struct hci_dev *hdev, int type, + __u16 pkt_type, bdaddr_t *dst, + __u8 sec_level, __u8 auth_type) +{ + struct hci_conn *acl; + struct hci_conn *sco; + struct hci_conn *le; + + BT_DBG("%s dst %s", hdev->name, batostr(dst)); + + if (type == LE_LINK) { + struct adv_entry *entry; + + le = hci_conn_hash_lookup_ba(hdev, LE_LINK, dst); + if (le) + return ERR_PTR(-EBUSY); + + entry = hci_find_adv_entry(hdev, dst); + if (!entry) + return ERR_PTR(-EHOSTUNREACH); + + le = hci_conn_add(hdev, LE_LINK, 0, dst); + if (!le) + return ERR_PTR(-ENOMEM); + + le->dst_type = entry->bdaddr_type; + + hci_le_connect(le); + + hci_conn_hold(le); + + return le; + } + + acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst); + if (!acl) { + acl = hci_conn_add(hdev, ACL_LINK, 0, dst); + if (!acl) + return NULL; + } + + hci_conn_hold(acl); + + if (acl->state == BT_OPEN || acl->state == BT_CLOSED) { + acl->sec_level = BT_SECURITY_LOW; + acl->pending_sec_level = sec_level; + acl->auth_type = auth_type; + hci_acl_connect(acl); + } + + if (type == ACL_LINK) + return acl; + + sco = hci_conn_hash_lookup_ba(hdev, type, dst); + if (!sco) { + sco = hci_conn_add(hdev, type, pkt_type, dst); + if (!sco) { + hci_conn_put(acl); + return NULL; + } + } + + acl->link = sco; + sco->link = acl; + + hci_conn_hold(sco); + + if (acl->state == BT_CONNECTED && + (sco->state == BT_OPEN || sco->state == BT_CLOSED)) { + acl->power_save = 1; + hci_conn_enter_active_mode(acl, BT_POWER_FORCE_ACTIVE_ON); + + if (test_bit(HCI_CONN_MODE_CHANGE_PEND, &acl->pend)) { + /* defer SCO setup until mode change completed */ + set_bit(HCI_CONN_SCO_SETUP_PEND, &acl->pend); + return sco; + } + + hci_sco_setup(acl, 0x00); + } + + return sco; +} +EXPORT_SYMBOL(hci_connect); + +/* Check link security requirement */ +int hci_conn_check_link_mode(struct hci_conn *conn) +{ + BT_DBG("conn %p", conn); + + if (conn->ssp_mode > 0 && conn->hdev->ssp_mode > 0 && + !(conn->link_mode & HCI_LM_ENCRYPT)) + return 0; + + return 1; +} +EXPORT_SYMBOL(hci_conn_check_link_mode); + +/* Authenticate remote device */ +static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) +{ + BT_DBG("conn %p", conn); + + if (conn->pending_sec_level > sec_level) + sec_level = conn->pending_sec_level; + + if (sec_level > conn->sec_level) + conn->pending_sec_level = sec_level; + else if (conn->link_mode & HCI_LM_AUTH) + return 1; + + /* Make sure we preserve an existing MITM requirement*/ + auth_type |= (conn->auth_type & 0x01); + + conn->auth_type = auth_type; + + if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend)) { + struct hci_cp_auth_requested cp; + + /* encrypt must be pending if auth is also pending */ + set_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend); + + cp.handle = cpu_to_le16(conn->handle); + hci_send_cmd(conn->hdev, HCI_OP_AUTH_REQUESTED, + sizeof(cp), &cp); + if (conn->key_type != 0xff) + set_bit(HCI_CONN_REAUTH_PEND, &conn->pend); + } + + return 0; +} + +/* Encrypt the the link */ +static void hci_conn_encrypt(struct hci_conn *conn) +{ + BT_DBG("conn %p", conn); + + if (!test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend)) { + struct hci_cp_set_conn_encrypt cp; + cp.handle = cpu_to_le16(conn->handle); + cp.encrypt = 0x01; + hci_send_cmd(conn->hdev, HCI_OP_SET_CONN_ENCRYPT, sizeof(cp), + &cp); + } +} + +/* Enable security */ +int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) +{ + BT_DBG("conn %p", conn); + + /* For sdp we don't need the link key. 
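A BT_SECURITY_SDP request is satisfied without authentication or encryption.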
*/ + if (sec_level == BT_SECURITY_SDP) + return 1; + + /* For non 2.1 devices and low security level we don't need the link + key. */ + if (sec_level == BT_SECURITY_LOW && + (!conn->ssp_mode || !conn->hdev->ssp_mode)) + return 1; + + /* For other security levels we need the link key. */ + if (!(conn->link_mode & HCI_LM_AUTH)) + goto auth; + + /* An authenticated combination key has sufficient security for any + security level. */ + if (conn->key_type == HCI_LK_AUTH_COMBINATION) + goto encrypt; + + /* An unauthenticated combination key has sufficient security for + security level 1 and 2. */ + if (conn->key_type == HCI_LK_UNAUTH_COMBINATION && + (sec_level == BT_SECURITY_MEDIUM || + sec_level == BT_SECURITY_LOW)) + goto encrypt; + + /* A combination key has always sufficient security for the security + levels 1 or 2. High security level requires the combination key + is generated using maximum PIN code length (16). + For pre 2.1 units. */ + if (conn->key_type == HCI_LK_COMBINATION && + (sec_level != BT_SECURITY_HIGH || + conn->pin_length == 16)) + goto encrypt; + +auth: + if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend)) + return 0; + + if (!hci_conn_auth(conn, sec_level, auth_type)) + return 0; + +encrypt: + if (conn->link_mode & HCI_LM_ENCRYPT) + return 1; + + hci_conn_encrypt(conn); + return 0; +} +EXPORT_SYMBOL(hci_conn_security); + +/* Check secure link requirement */ +int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level) +{ + BT_DBG("conn %p", conn); + + if (sec_level != BT_SECURITY_HIGH) + return 1; /* Accept if non-secure is required */ + + if (conn->sec_level == BT_SECURITY_HIGH) + return 1; + + return 0; /* Reject not secure link */ +} +EXPORT_SYMBOL(hci_conn_check_secure); + +/* Change link key */ +int hci_conn_change_link_key(struct hci_conn *conn) +{ + BT_DBG("conn %p", conn); + + if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend)) { + struct hci_cp_change_conn_link_key cp; + cp.handle = cpu_to_le16(conn->handle); + hci_send_cmd(conn->hdev, HCI_OP_CHANGE_CONN_LINK_KEY, + sizeof(cp), &cp); + } + + return 0; +} +EXPORT_SYMBOL(hci_conn_change_link_key); + +/* Switch role */ +int hci_conn_switch_role(struct hci_conn *conn, __u8 role) +{ + BT_DBG("conn %p", conn); + + if (!role && conn->link_mode & HCI_LM_MASTER) + return 1; + + if (!test_and_set_bit(HCI_CONN_RSWITCH_PEND, &conn->pend)) { + struct hci_cp_switch_role cp; + bacpy(&cp.bdaddr, &conn->dst); + cp.role = role; + hci_send_cmd(conn->hdev, HCI_OP_SWITCH_ROLE, sizeof(cp), &cp); + } + + return 0; +} +EXPORT_SYMBOL(hci_conn_switch_role); + +/* Enter active mode */ +void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active) +{ + struct hci_dev *hdev = conn->hdev; + + BT_DBG("conn %p mode %d", conn, conn->mode); + + if (test_bit(HCI_RAW, &hdev->flags)) + return; + + if (conn->mode != HCI_CM_SNIFF) + goto timer; + + if (!conn->power_save && !force_active) + goto timer; + + if (!test_and_set_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->pend)) { + struct hci_cp_exit_sniff_mode cp; + cp.handle = cpu_to_le16(conn->handle); + hci_send_cmd(hdev, HCI_OP_EXIT_SNIFF_MODE, sizeof(cp), &cp); + } + +timer: + if (hdev->idle_timeout > 0) + mod_timer(&conn->idle_timer, + jiffies + msecs_to_jiffies(hdev->idle_timeout)); +} + +/* Enter sniff mode */ +void hci_conn_enter_sniff_mode(struct hci_conn *conn) +{ + struct hci_dev *hdev = conn->hdev; + + BT_DBG("conn %p mode %d", conn, conn->mode); + + if (test_bit(HCI_RAW, &hdev->flags)) + return; + + if (!lmp_sniff_capable(hdev) || !lmp_sniff_capable(conn)) + return; + + if 
(conn->mode != HCI_CM_ACTIVE || !(conn->link_policy & HCI_LP_SNIFF)) + return; + + if (lmp_sniffsubr_capable(hdev) && lmp_sniffsubr_capable(conn)) { + struct hci_cp_sniff_subrate cp; + cp.handle = cpu_to_le16(conn->handle); + cp.max_latency = cpu_to_le16(0); + cp.min_remote_timeout = cpu_to_le16(0); + cp.min_local_timeout = cpu_to_le16(0); + hci_send_cmd(hdev, HCI_OP_SNIFF_SUBRATE, sizeof(cp), &cp); + } + + if (!test_and_set_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->pend)) { + struct hci_cp_sniff_mode cp; + cp.handle = cpu_to_le16(conn->handle); + cp.max_interval = cpu_to_le16(hdev->sniff_max_interval); + cp.min_interval = cpu_to_le16(hdev->sniff_min_interval); + cp.attempt = cpu_to_le16(4); + cp.timeout = cpu_to_le16(1); + hci_send_cmd(hdev, HCI_OP_SNIFF_MODE, sizeof(cp), &cp); + } +} + +/* Drop all connection on the device */ +void hci_conn_hash_flush(struct hci_dev *hdev) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct list_head *p; + + BT_DBG("hdev %s", hdev->name); + + p = h->list.next; + while (p != &h->list) { + struct hci_conn *c; + + c = list_entry(p, struct hci_conn, list); + p = p->next; + + c->state = BT_CLOSED; + + hci_proto_disconn_cfm(c, 0x16); + hci_conn_del(c); + } +} + +/* Check pending connect attempts */ +void hci_conn_check_pending(struct hci_dev *hdev) +{ + struct hci_conn *conn; + + BT_DBG("hdev %s", hdev->name); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_state(hdev, ACL_LINK, BT_CONNECT2); + if (conn) + hci_acl_connect(conn); + + hci_dev_unlock(hdev); +} + +void hci_conn_hold_device(struct hci_conn *conn) +{ + atomic_inc(&conn->devref); +} +EXPORT_SYMBOL(hci_conn_hold_device); + +void hci_conn_put_device(struct hci_conn *conn) +{ + if (atomic_dec_and_test(&conn->devref)) + hci_conn_del_sysfs(conn); +} +EXPORT_SYMBOL(hci_conn_put_device); + +int hci_get_conn_list(void __user *arg) +{ + struct hci_conn_list_req req, *cl; + struct hci_conn_info *ci; + struct hci_dev *hdev; + struct list_head *p; + int n = 0, size, err; + + if (copy_from_user(&req, arg, sizeof(req))) + return -EFAULT; + + if (!req.conn_num || req.conn_num > (PAGE_SIZE * 2) / sizeof(*ci)) + return -EINVAL; + + size = sizeof(req) + req.conn_num * sizeof(*ci); + + cl = kmalloc(size, GFP_KERNEL); + if (!cl) + return -ENOMEM; + + hdev = hci_dev_get(req.dev_id); + if (!hdev) { + kfree(cl); + return -ENODEV; + } + + ci = cl->conn_info; + + hci_dev_lock_bh(hdev); + list_for_each(p, &hdev->conn_hash.list) { + register struct hci_conn *c; + c = list_entry(p, struct hci_conn, list); + + bacpy(&(ci + n)->bdaddr, &c->dst); + (ci + n)->handle = c->handle; + (ci + n)->type = c->type; + (ci + n)->out = c->out; + (ci + n)->state = c->state; + (ci + n)->link_mode = c->link_mode; + if (c->type == SCO_LINK) { + (ci + n)->mtu = hdev->sco_mtu; + (ci + n)->cnt = hdev->sco_cnt; + (ci + n)->pkts = hdev->sco_pkts; + } else { + (ci + n)->mtu = hdev->acl_mtu; + (ci + n)->cnt = hdev->acl_cnt; + (ci + n)->pkts = hdev->acl_pkts; + } + if (++n >= req.conn_num) + break; + } + hci_dev_unlock_bh(hdev); + + cl->dev_id = hdev->id; + cl->conn_num = n; + size = sizeof(req) + n * sizeof(*ci); + + hci_dev_put(hdev); + + err = copy_to_user(arg, cl, size); + kfree(cl); + + return err ? 
-EFAULT : 0; +} + +int hci_get_conn_info(struct hci_dev *hdev, void __user *arg) +{ + struct hci_conn_info_req req; + struct hci_conn_info ci; + struct hci_conn *conn; + char __user *ptr = arg + sizeof(req); + + if (copy_from_user(&req, arg, sizeof(req))) + return -EFAULT; + + hci_dev_lock_bh(hdev); + conn = hci_conn_hash_lookup_ba(hdev, req.type, &req.bdaddr); + if (conn) { + bacpy(&ci.bdaddr, &conn->dst); + ci.handle = conn->handle; + ci.type = conn->type; + ci.out = conn->out; + ci.state = conn->state; + ci.link_mode = conn->link_mode; + if (req.type == SCO_LINK) { + ci.mtu = hdev->sco_mtu; + ci.cnt = hdev->sco_cnt; + ci.pkts = hdev->sco_pkts; + } else { + ci.mtu = hdev->acl_mtu; + ci.cnt = hdev->acl_cnt; + ci.pkts = hdev->acl_pkts; + } + } + hci_dev_unlock_bh(hdev); + + if (!conn) + return -ENOENT; + + return copy_to_user(ptr, &ci, sizeof(ci)) ? -EFAULT : 0; +} + +int hci_get_auth_info(struct hci_dev *hdev, void __user *arg) +{ + struct hci_auth_info_req req; + struct hci_conn *conn; + + if (copy_from_user(&req, arg, sizeof(req))) + return -EFAULT; + + hci_dev_lock_bh(hdev); + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &req.bdaddr); + if (conn) + req.type = conn->auth_type; + hci_dev_unlock_bh(hdev); + + if (!conn) + return -ENOENT; + + return copy_to_user(arg, &req, sizeof(req)) ? -EFAULT : 0; +} diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c new file mode 100644 index 00000000..f38e633c --- /dev/null +++ b/net/bluetooth/hci_core.c @@ -0,0 +1,2427 @@ +/* + BlueZ - Bluetooth protocol stack for Linux + Copyright (C) 2000-2001 Qualcomm Incorporated + + Written 2000,2001 by Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +/* Bluetooth HCI core. 
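Device request handling and the command, RX and TX tasklets.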
*/ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#define AUTO_OFF_TIMEOUT 2000 + +static void hci_cmd_task(unsigned long arg); +static void hci_rx_task(unsigned long arg); +static void hci_tx_task(unsigned long arg); + +static DEFINE_RWLOCK(hci_task_lock); + +/* HCI device list */ +LIST_HEAD(hci_dev_list); +DEFINE_RWLOCK(hci_dev_list_lock); + +/* HCI callback list */ +LIST_HEAD(hci_cb_list); +DEFINE_RWLOCK(hci_cb_list_lock); + +/* HCI protocols */ +#define HCI_MAX_PROTO 2 +struct hci_proto *hci_proto[HCI_MAX_PROTO]; + +/* HCI notifiers list */ +static ATOMIC_NOTIFIER_HEAD(hci_notifier); + +/* ---- HCI notifications ---- */ + +int hci_register_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&hci_notifier, nb); +} + +int hci_unregister_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&hci_notifier, nb); +} + +static void hci_notify(struct hci_dev *hdev, int event) +{ + atomic_notifier_call_chain(&hci_notifier, event, hdev); +} + +/* ---- HCI requests ---- */ + +void hci_req_complete(struct hci_dev *hdev, __u16 cmd, int result) +{ + BT_DBG("%s command 0x%04x result 0x%2.2x", hdev->name, cmd, result); + + /* If this is the init phase check if the completed command matches + * the last init command, and if not just return. + */ + if (test_bit(HCI_INIT, &hdev->flags) && hdev->init_last_cmd != cmd) + return; + + if (hdev->req_status == HCI_REQ_PEND) { + hdev->req_result = result; + hdev->req_status = HCI_REQ_DONE; + wake_up_interruptible(&hdev->req_wait_q); + } +} + +static void hci_req_cancel(struct hci_dev *hdev, int err) +{ + BT_DBG("%s err 0x%2.2x", hdev->name, err); + + if (hdev->req_status == HCI_REQ_PEND) { + hdev->req_result = err; + hdev->req_status = HCI_REQ_CANCELED; + wake_up_interruptible(&hdev->req_wait_q); + } +} + +/* Execute request and wait for completion. 
*/ +static int __hci_request(struct hci_dev *hdev, void (*req)(struct hci_dev *hdev, unsigned long opt), + unsigned long opt, __u32 timeout) +{ + DECLARE_WAITQUEUE(wait, current); + int err = 0; + + BT_DBG("%s start", hdev->name); + + hdev->req_status = HCI_REQ_PEND; + + add_wait_queue(&hdev->req_wait_q, &wait); + set_current_state(TASK_INTERRUPTIBLE); + + req(hdev, opt); + schedule_timeout(timeout); + + remove_wait_queue(&hdev->req_wait_q, &wait); + + if (signal_pending(current)) + return -EINTR; + + switch (hdev->req_status) { + case HCI_REQ_DONE: + err = -bt_to_errno(hdev->req_result); + break; + + case HCI_REQ_CANCELED: + err = -hdev->req_result; + break; + + default: + err = -ETIMEDOUT; + break; + } + + hdev->req_status = hdev->req_result = 0; + + BT_DBG("%s end: err %d", hdev->name, err); + + return err; +} + +static inline int hci_request(struct hci_dev *hdev, void (*req)(struct hci_dev *hdev, unsigned long opt), + unsigned long opt, __u32 timeout) +{ + int ret; + + if (!test_bit(HCI_UP, &hdev->flags)) + return -ENETDOWN; + + /* Serialize all requests */ + hci_req_lock(hdev); + ret = __hci_request(hdev, req, opt, timeout); + hci_req_unlock(hdev); + + return ret; +} + +static void hci_reset_req(struct hci_dev *hdev, unsigned long opt) +{ + BT_DBG("%s %ld", hdev->name, opt); + + /* Reset device */ + set_bit(HCI_RESET, &hdev->flags); + hci_send_cmd(hdev, HCI_OP_RESET, 0, NULL); +} + +static void hci_init_req(struct hci_dev *hdev, unsigned long opt) +{ + struct hci_cp_delete_stored_link_key cp; + struct sk_buff *skb; + __le16 param; + __u8 flt_type; + + BT_DBG("%s %ld", hdev->name, opt); + + /* Driver initialization */ + + /* Special commands */ + while ((skb = skb_dequeue(&hdev->driver_init))) { + bt_cb(skb)->pkt_type = HCI_COMMAND_PKT; + skb->dev = (void *) hdev; + + skb_queue_tail(&hdev->cmd_q, skb); + tasklet_schedule(&hdev->cmd_task); + } + skb_queue_purge(&hdev->driver_init); + + /* Mandatory initialization */ + + /* Reset */ + if (!test_bit(HCI_QUIRK_NO_RESET, &hdev->quirks)) { + set_bit(HCI_RESET, &hdev->flags); + hci_send_cmd(hdev, HCI_OP_RESET, 0, NULL); + } + + /* Read Local Supported Features */ + hci_send_cmd(hdev, HCI_OP_READ_LOCAL_FEATURES, 0, NULL); + + /* Read Local Version */ + hci_send_cmd(hdev, HCI_OP_READ_LOCAL_VERSION, 0, NULL); + + /* Read Buffer Size (ACL mtu, max pkt, etc.) 
*/ + hci_send_cmd(hdev, HCI_OP_READ_BUFFER_SIZE, 0, NULL); + +#if 0 + /* Host buffer size */ + { + struct hci_cp_host_buffer_size cp; + cp.acl_mtu = cpu_to_le16(HCI_MAX_ACL_SIZE); + cp.sco_mtu = HCI_MAX_SCO_SIZE; + cp.acl_max_pkt = cpu_to_le16(0xffff); + cp.sco_max_pkt = cpu_to_le16(0xffff); + hci_send_cmd(hdev, HCI_OP_HOST_BUFFER_SIZE, sizeof(cp), &cp); + } +#endif + + /* Read BD Address */ + hci_send_cmd(hdev, HCI_OP_READ_BD_ADDR, 0, NULL); + + /* Read Class of Device */ + hci_send_cmd(hdev, HCI_OP_READ_CLASS_OF_DEV, 0, NULL); + + /* Read Local Name */ + hci_send_cmd(hdev, HCI_OP_READ_LOCAL_NAME, 0, NULL); + + /* Read Voice Setting */ + hci_send_cmd(hdev, HCI_OP_READ_VOICE_SETTING, 0, NULL); + + /* Optional initialization */ + + /* Clear Event Filters */ + flt_type = HCI_FLT_CLEAR_ALL; + hci_send_cmd(hdev, HCI_OP_SET_EVENT_FLT, 1, &flt_type); + + /* Connection accept timeout ~20 secs */ + param = cpu_to_le16(0x7d00); + hci_send_cmd(hdev, HCI_OP_WRITE_CA_TIMEOUT, 2, ¶m); + + bacpy(&cp.bdaddr, BDADDR_ANY); + cp.delete_all = 1; + hci_send_cmd(hdev, HCI_OP_DELETE_STORED_LINK_KEY, sizeof(cp), &cp); +} + +static void hci_le_init_req(struct hci_dev *hdev, unsigned long opt) +{ + BT_DBG("%s", hdev->name); + + /* Read LE buffer size */ + hci_send_cmd(hdev, HCI_OP_LE_READ_BUFFER_SIZE, 0, NULL); +} + +static void hci_scan_req(struct hci_dev *hdev, unsigned long opt) +{ + __u8 scan = opt; + + BT_DBG("%s %x", hdev->name, scan); + + /* Inquiry and Page scans */ + hci_send_cmd(hdev, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); +} + +static void hci_auth_req(struct hci_dev *hdev, unsigned long opt) +{ + __u8 auth = opt; + + BT_DBG("%s %x", hdev->name, auth); + + /* Authentication */ + hci_send_cmd(hdev, HCI_OP_WRITE_AUTH_ENABLE, 1, &auth); +} + +static void hci_encrypt_req(struct hci_dev *hdev, unsigned long opt) +{ + __u8 encrypt = opt; + + BT_DBG("%s %x", hdev->name, encrypt); + + /* Encryption */ + hci_send_cmd(hdev, HCI_OP_WRITE_ENCRYPT_MODE, 1, &encrypt); +} + +static void hci_linkpol_req(struct hci_dev *hdev, unsigned long opt) +{ + __le16 policy = cpu_to_le16(opt); + + BT_DBG("%s %x", hdev->name, policy); + + /* Default link policy */ + hci_send_cmd(hdev, HCI_OP_WRITE_DEF_LINK_POLICY, 2, &policy); +} + +/* Get HCI device by index. + * Device is held on return. 
*/ +struct hci_dev *hci_dev_get(int index) +{ + struct hci_dev *hdev = NULL; + struct list_head *p; + + BT_DBG("%d", index); + + if (index < 0) + return NULL; + + read_lock(&hci_dev_list_lock); + list_for_each(p, &hci_dev_list) { + struct hci_dev *d = list_entry(p, struct hci_dev, list); + if (d->id == index) { + hdev = hci_dev_hold(d); + break; + } + } + read_unlock(&hci_dev_list_lock); + return hdev; +} + +/* ---- Inquiry support ---- */ +static void inquiry_cache_flush(struct hci_dev *hdev) +{ + struct inquiry_cache *cache = &hdev->inq_cache; + struct inquiry_entry *next = cache->list, *e; + + BT_DBG("cache %p", cache); + + cache->list = NULL; + while ((e = next)) { + next = e->next; + kfree(e); + } +} + +struct inquiry_entry *hci_inquiry_cache_lookup(struct hci_dev *hdev, bdaddr_t *bdaddr) +{ + struct inquiry_cache *cache = &hdev->inq_cache; + struct inquiry_entry *e; + + BT_DBG("cache %p, %s", cache, batostr(bdaddr)); + + for (e = cache->list; e; e = e->next) + if (!bacmp(&e->data.bdaddr, bdaddr)) + break; + return e; +} + +void hci_inquiry_cache_update(struct hci_dev *hdev, struct inquiry_data *data) +{ + struct inquiry_cache *cache = &hdev->inq_cache; + struct inquiry_entry *ie; + + BT_DBG("cache %p, %s", cache, batostr(&data->bdaddr)); + + ie = hci_inquiry_cache_lookup(hdev, &data->bdaddr); + if (!ie) { + /* Entry not in the cache. Add new one. */ + ie = kzalloc(sizeof(struct inquiry_entry), GFP_ATOMIC); + if (!ie) + return; + + ie->next = cache->list; + cache->list = ie; + } + + memcpy(&ie->data, data, sizeof(*data)); + ie->timestamp = jiffies; + cache->timestamp = jiffies; +} + +static int inquiry_cache_dump(struct hci_dev *hdev, int num, __u8 *buf) +{ + struct inquiry_cache *cache = &hdev->inq_cache; + struct inquiry_info *info = (struct inquiry_info *) buf; + struct inquiry_entry *e; + int copied = 0; + + for (e = cache->list; e && copied < num; e = e->next, copied++) { + struct inquiry_data *data = &e->data; + bacpy(&info->bdaddr, &data->bdaddr); + info->pscan_rep_mode = data->pscan_rep_mode; + info->pscan_period_mode = data->pscan_period_mode; + info->pscan_mode = data->pscan_mode; + memcpy(info->dev_class, data->dev_class, 3); + info->clock_offset = data->clock_offset; + info++; + } + + BT_DBG("cache %p, copied %d", cache, copied); + return copied; +} + +static void hci_inq_req(struct hci_dev *hdev, unsigned long opt) +{ + struct hci_inquiry_req *ir = (struct hci_inquiry_req *) opt; + struct hci_cp_inquiry cp; + + BT_DBG("%s", hdev->name); + + if (test_bit(HCI_INQUIRY, &hdev->flags)) + return; + + /* Start Inquiry */ + memcpy(&cp.lap, &ir->lap, 3); + cp.length = ir->length; + cp.num_rsp = ir->num_rsp; + hci_send_cmd(hdev, HCI_OP_INQUIRY, sizeof(cp), &cp); +} + +int hci_inquiry(void __user *arg) +{ + __u8 __user *ptr = arg; + struct hci_inquiry_req ir; + struct hci_dev *hdev; + int err = 0, do_inquiry = 0, max_rsp; + long timeo; + __u8 *buf; + + if (copy_from_user(&ir, ptr, sizeof(ir))) + return -EFAULT; + + hdev = hci_dev_get(ir.dev_id); + if (!hdev) + return -ENODEV; + + hci_dev_lock_bh(hdev); + if (inquiry_cache_age(hdev) > INQUIRY_CACHE_AGE_MAX || + inquiry_cache_empty(hdev) || + ir.flags & IREQ_CACHE_FLUSH) { + inquiry_cache_flush(hdev); + do_inquiry = 1; + } + hci_dev_unlock_bh(hdev); + + timeo = ir.length * msecs_to_jiffies(2000); + + if (do_inquiry) { + err = hci_request(hdev, hci_inq_req, (unsigned long)&ir, timeo); + if (err < 0) + goto done; + } + + /* for unlimited number of responses we will use buffer with 255 entries */ + max_rsp = (ir.num_rsp == 0) ? 
255 : ir.num_rsp; + + /* cache_dump can't sleep. Therefore we allocate temp buffer and then + * copy it to the user space. + */ + buf = kmalloc(sizeof(struct inquiry_info) * max_rsp, GFP_KERNEL); + if (!buf) { + err = -ENOMEM; + goto done; + } + + hci_dev_lock_bh(hdev); + ir.num_rsp = inquiry_cache_dump(hdev, max_rsp, buf); + hci_dev_unlock_bh(hdev); + + BT_DBG("num_rsp %d", ir.num_rsp); + + if (!copy_to_user(ptr, &ir, sizeof(ir))) { + ptr += sizeof(ir); + if (copy_to_user(ptr, buf, sizeof(struct inquiry_info) * + ir.num_rsp)) + err = -EFAULT; + } else + err = -EFAULT; + + kfree(buf); + +done: + hci_dev_put(hdev); + return err; +} + +/* ---- HCI ioctl helpers ---- */ + +int hci_dev_open(__u16 dev) +{ + struct hci_dev *hdev; + int ret = 0; + + hdev = hci_dev_get(dev); + if (!hdev) + return -ENODEV; + + BT_DBG("%s %p", hdev->name, hdev); + + hci_req_lock(hdev); + + if (test_bit(HCI_UNREGISTER, &hdev->flags)) { + ret = -ENODEV; + goto done; + } + + if (hdev->rfkill && rfkill_blocked(hdev->rfkill)) { + ret = -ERFKILL; + goto done; + } + + if (test_bit(HCI_UP, &hdev->flags)) { + ret = -EALREADY; + goto done; + } + + if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) + set_bit(HCI_RAW, &hdev->flags); + + /* Treat all non BR/EDR controllers as raw devices for now */ + if (hdev->dev_type != HCI_BREDR) + set_bit(HCI_RAW, &hdev->flags); + + if (hdev->open(hdev)) { + ret = -EIO; + goto done; + } + + if (!test_bit(HCI_RAW, &hdev->flags)) { + atomic_set(&hdev->cmd_cnt, 1); + set_bit(HCI_INIT, &hdev->flags); + hdev->init_last_cmd = 0; + + ret = __hci_request(hdev, hci_init_req, 0, + msecs_to_jiffies(HCI_INIT_TIMEOUT)); + + if (lmp_host_le_capable(hdev)) + ret = __hci_request(hdev, hci_le_init_req, 0, + msecs_to_jiffies(HCI_INIT_TIMEOUT)); + + clear_bit(HCI_INIT, &hdev->flags); + } + + if (!ret) { + hci_dev_hold(hdev); + set_bit(HCI_UP, &hdev->flags); + hci_notify(hdev, HCI_DEV_UP); + if (!test_bit(HCI_SETUP, &hdev->flags)) + mgmt_powered(hdev->id, 1); + } else { + /* Init failed, cleanup */ + tasklet_kill(&hdev->rx_task); + tasklet_kill(&hdev->tx_task); + tasklet_kill(&hdev->cmd_task); + + skb_queue_purge(&hdev->cmd_q); + skb_queue_purge(&hdev->rx_q); + + if (hdev->flush) + hdev->flush(hdev); + + if (hdev->sent_cmd) { + kfree_skb(hdev->sent_cmd); + hdev->sent_cmd = NULL; + } + + hdev->close(hdev); + hdev->flags = 0; + } + +done: + hci_req_unlock(hdev); + hci_dev_put(hdev); + return ret; +} + +static int hci_dev_do_close(struct hci_dev *hdev) +{ + BT_DBG("%s %p", hdev->name, hdev); + + hci_req_cancel(hdev, ENODEV); + hci_req_lock(hdev); + + if (!test_and_clear_bit(HCI_UP, &hdev->flags)) { + del_timer_sync(&hdev->cmd_timer); + hci_req_unlock(hdev); + return 0; + } + + /* Kill RX and TX tasks */ + tasklet_kill(&hdev->rx_task); + tasklet_kill(&hdev->tx_task); + + hci_dev_lock_bh(hdev); + inquiry_cache_flush(hdev); + hci_conn_hash_flush(hdev); + hci_dev_unlock_bh(hdev); + + hci_notify(hdev, HCI_DEV_DOWN); + + if (hdev->flush) + hdev->flush(hdev); + + /* Reset device */ + skb_queue_purge(&hdev->cmd_q); + atomic_set(&hdev->cmd_cnt, 1); + if (!test_bit(HCI_RAW, &hdev->flags)) { + set_bit(HCI_INIT, &hdev->flags); + __hci_request(hdev, hci_reset_req, 0, + msecs_to_jiffies(250)); + clear_bit(HCI_INIT, &hdev->flags); + } + + /* Kill cmd task */ + tasklet_kill(&hdev->cmd_task); + + /* Drop queues */ + skb_queue_purge(&hdev->rx_q); + skb_queue_purge(&hdev->cmd_q); + skb_queue_purge(&hdev->raw_q); + + /* Drop last sent command */ + if (hdev->sent_cmd) { + del_timer_sync(&hdev->cmd_timer); + kfree_skb(hdev->sent_cmd); 
+ hdev->sent_cmd = NULL; + } + + /* After this point our queues are empty + * and no tasks are scheduled. */ + hdev->close(hdev); + + mgmt_powered(hdev->id, 0); + + /* Clear flags */ + hdev->flags = 0; + + hci_req_unlock(hdev); + + hci_dev_put(hdev); + return 0; +} + +int hci_dev_close(__u16 dev) +{ + struct hci_dev *hdev; + int err; + + hdev = hci_dev_get(dev); + if (!hdev) + return -ENODEV; + err = hci_dev_do_close(hdev); + hci_dev_put(hdev); + return err; +} + +int hci_dev_reset(__u16 dev) +{ + struct hci_dev *hdev; + int ret = 0; + + hdev = hci_dev_get(dev); + if (!hdev) + return -ENODEV; + + hci_req_lock(hdev); + tasklet_disable(&hdev->tx_task); + + if (!test_bit(HCI_UP, &hdev->flags)) + goto done; + + /* Drop queues */ + skb_queue_purge(&hdev->rx_q); + skb_queue_purge(&hdev->cmd_q); + + hci_dev_lock_bh(hdev); + inquiry_cache_flush(hdev); + hci_conn_hash_flush(hdev); + hci_dev_unlock_bh(hdev); + + if (hdev->flush) + hdev->flush(hdev); + + atomic_set(&hdev->cmd_cnt, 1); + hdev->acl_cnt = 0; hdev->sco_cnt = 0; hdev->le_cnt = 0; + + if (!test_bit(HCI_RAW, &hdev->flags)) + ret = __hci_request(hdev, hci_reset_req, 0, + msecs_to_jiffies(HCI_INIT_TIMEOUT)); + +done: + tasklet_enable(&hdev->tx_task); + hci_req_unlock(hdev); + hci_dev_put(hdev); + return ret; +} + +int hci_dev_reset_stat(__u16 dev) +{ + struct hci_dev *hdev; + int ret = 0; + + hdev = hci_dev_get(dev); + if (!hdev) + return -ENODEV; + + memset(&hdev->stat, 0, sizeof(struct hci_dev_stats)); + + hci_dev_put(hdev); + + return ret; +} + +int hci_dev_cmd(unsigned int cmd, void __user *arg) +{ + struct hci_dev *hdev; + struct hci_dev_req dr; + int err = 0; + + if (copy_from_user(&dr, arg, sizeof(dr))) + return -EFAULT; + + hdev = hci_dev_get(dr.dev_id); + if (!hdev) + return -ENODEV; + + switch (cmd) { + case HCISETAUTH: + err = hci_request(hdev, hci_auth_req, dr.dev_opt, + msecs_to_jiffies(HCI_INIT_TIMEOUT)); + break; + + case HCISETENCRYPT: + if (!lmp_encrypt_capable(hdev)) { + err = -EOPNOTSUPP; + break; + } + + if (!test_bit(HCI_AUTH, &hdev->flags)) { + /* Auth must be enabled first */ + err = hci_request(hdev, hci_auth_req, dr.dev_opt, + msecs_to_jiffies(HCI_INIT_TIMEOUT)); + if (err) + break; + } + + err = hci_request(hdev, hci_encrypt_req, dr.dev_opt, + msecs_to_jiffies(HCI_INIT_TIMEOUT)); + break; + + case HCISETSCAN: + err = hci_request(hdev, hci_scan_req, dr.dev_opt, + msecs_to_jiffies(HCI_INIT_TIMEOUT)); + break; + + case HCISETLINKPOL: + err = hci_request(hdev, hci_linkpol_req, dr.dev_opt, + msecs_to_jiffies(HCI_INIT_TIMEOUT)); + break; + + case HCISETLINKMODE: + hdev->link_mode = ((__u16) dr.dev_opt) & + (HCI_LM_MASTER | HCI_LM_ACCEPT); + break; + + case HCISETPTYPE: + hdev->pkt_type = (__u16) dr.dev_opt; + break; + + case HCISETACLMTU: + hdev->acl_mtu = *((__u16 *) &dr.dev_opt + 1); + hdev->acl_pkts = *((__u16 *) &dr.dev_opt + 0); + break; + + case HCISETSCOMTU: + hdev->sco_mtu = *((__u16 *) &dr.dev_opt + 1); + hdev->sco_pkts = *((__u16 *) &dr.dev_opt + 0); + break; + + default: + err = -EINVAL; + break; + } + + hci_dev_put(hdev); + return err; +} + +int hci_get_dev_list(void __user *arg) +{ + struct hci_dev_list_req *dl; + struct hci_dev_req *dr; + struct list_head *p; + int n = 0, size, err; + __u16 dev_num; + + if (get_user(dev_num, (__u16 __user *) arg)) + return -EFAULT; + + if (!dev_num || dev_num > (PAGE_SIZE * 2) / sizeof(*dr)) + return -EINVAL; + + size = sizeof(*dl) + dev_num * sizeof(*dr); + + dl = kzalloc(size, GFP_KERNEL); + if (!dl) + return -ENOMEM; + + dr = dl->dev_req; + + 
read_lock_bh(&hci_dev_list_lock); + list_for_each(p, &hci_dev_list) { + struct hci_dev *hdev; + + hdev = list_entry(p, struct hci_dev, list); + + hci_del_off_timer(hdev); + + if (!test_bit(HCI_MGMT, &hdev->flags)) + set_bit(HCI_PAIRABLE, &hdev->flags); + + (dr + n)->dev_id = hdev->id; + (dr + n)->dev_opt = hdev->flags; + + if (++n >= dev_num) + break; + } + read_unlock_bh(&hci_dev_list_lock); + + dl->dev_num = n; + size = sizeof(*dl) + n * sizeof(*dr); + + err = copy_to_user(arg, dl, size); + kfree(dl); + + return err ? -EFAULT : 0; +} + +int hci_get_dev_info(void __user *arg) +{ + struct hci_dev *hdev; + struct hci_dev_info di; + int err = 0; + + if (copy_from_user(&di, arg, sizeof(di))) + return -EFAULT; + + hdev = hci_dev_get(di.dev_id); + if (!hdev) + return -ENODEV; + + hci_del_off_timer(hdev); + + if (!test_bit(HCI_MGMT, &hdev->flags)) + set_bit(HCI_PAIRABLE, &hdev->flags); + + strcpy(di.name, hdev->name); + di.bdaddr = hdev->bdaddr; + di.type = (hdev->bus & 0x0f) | (hdev->dev_type << 4); + di.flags = hdev->flags; + di.pkt_type = hdev->pkt_type; + di.acl_mtu = hdev->acl_mtu; + di.acl_pkts = hdev->acl_pkts; + di.sco_mtu = hdev->sco_mtu; + di.sco_pkts = hdev->sco_pkts; + di.link_policy = hdev->link_policy; + di.link_mode = hdev->link_mode; + + memcpy(&di.stat, &hdev->stat, sizeof(di.stat)); + memcpy(&di.features, &hdev->features, sizeof(di.features)); + + if (copy_to_user(arg, &di, sizeof(di))) + err = -EFAULT; + + hci_dev_put(hdev); + + return err; +} + +/* ---- Interface to HCI drivers ---- */ + +static int hci_rfkill_set_block(void *data, bool blocked) +{ + struct hci_dev *hdev = data; + + BT_DBG("%p name %s blocked %d", hdev, hdev->name, blocked); + + if (!blocked) + return 0; + + hci_dev_do_close(hdev); + + return 0; +} + +static const struct rfkill_ops hci_rfkill_ops = { + .set_block = hci_rfkill_set_block, +}; + +/* Alloc HCI device */ +struct hci_dev *hci_alloc_dev(void) +{ + struct hci_dev *hdev; + + hdev = kzalloc(sizeof(struct hci_dev), GFP_KERNEL); + if (!hdev) + return NULL; + + skb_queue_head_init(&hdev->driver_init); + + return hdev; +} +EXPORT_SYMBOL(hci_alloc_dev); + +/* Free HCI device */ +void hci_free_dev(struct hci_dev *hdev) +{ + skb_queue_purge(&hdev->driver_init); + + /* will free via device release */ + put_device(&hdev->dev); +} +EXPORT_SYMBOL(hci_free_dev); + +static void hci_power_on(struct work_struct *work) +{ + struct hci_dev *hdev = container_of(work, struct hci_dev, power_on); + + BT_DBG("%s", hdev->name); + + if (hci_dev_open(hdev->id) < 0) + return; + + if (test_bit(HCI_AUTO_OFF, &hdev->flags)) + mod_timer(&hdev->off_timer, + jiffies + msecs_to_jiffies(AUTO_OFF_TIMEOUT)); + + if (test_and_clear_bit(HCI_SETUP, &hdev->flags)) + mgmt_index_added(hdev->id); +} + +static void hci_power_off(struct work_struct *work) +{ + struct hci_dev *hdev = container_of(work, struct hci_dev, power_off); + + BT_DBG("%s", hdev->name); + + hci_dev_close(hdev->id); +} + +static void hci_auto_off(unsigned long data) +{ + struct hci_dev *hdev = (struct hci_dev *) data; + + BT_DBG("%s", hdev->name); + + clear_bit(HCI_AUTO_OFF, &hdev->flags); + + queue_work(hdev->workqueue, &hdev->power_off); +} + +void hci_del_off_timer(struct hci_dev *hdev) +{ + BT_DBG("%s", hdev->name); + + clear_bit(HCI_AUTO_OFF, &hdev->flags); + del_timer(&hdev->off_timer); +} + +int hci_uuids_clear(struct hci_dev *hdev) +{ + struct list_head *p, *n; + + list_for_each_safe(p, n, &hdev->uuids) { + struct bt_uuid *uuid; + + uuid = list_entry(p, struct bt_uuid, list); + + list_del(p); + kfree(uuid); + } + + 
return 0; +} + +int hci_link_keys_clear(struct hci_dev *hdev) +{ + struct list_head *p, *n; + + list_for_each_safe(p, n, &hdev->link_keys) { + struct link_key *key; + + key = list_entry(p, struct link_key, list); + + list_del(p); + kfree(key); + } + + return 0; +} + +struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr) +{ + struct list_head *p; + + list_for_each(p, &hdev->link_keys) { + struct link_key *k; + + k = list_entry(p, struct link_key, list); + + if (bacmp(bdaddr, &k->bdaddr) == 0) + return k; + } + + return NULL; +} + +static int hci_persistent_key(struct hci_dev *hdev, struct hci_conn *conn, + u8 key_type, u8 old_key_type) +{ + /* Legacy key */ + if (key_type < 0x03) + return 1; + + /* Debug keys are insecure so don't store them persistently */ + if (key_type == HCI_LK_DEBUG_COMBINATION) + return 0; + + /* Changed combination key and there's no previous one */ + if (key_type == HCI_LK_CHANGED_COMBINATION && old_key_type == 0xff) + return 0; + + /* Security mode 3 case */ + if (!conn) + return 1; + + /* Neither local nor remote side had no-bonding as requirement */ + if (conn->auth_type > 0x01 && conn->remote_auth > 0x01) + return 1; + + /* Local side had dedicated bonding as requirement */ + if (conn->auth_type == 0x02 || conn->auth_type == 0x03) + return 1; + + /* Remote side had dedicated bonding as requirement */ + if (conn->remote_auth == 0x02 || conn->remote_auth == 0x03) + return 1; + + /* If none of the above criteria match, then don't store the key + * persistently */ + return 0; +} + +struct link_key *hci_find_ltk(struct hci_dev *hdev, __le16 ediv, u8 rand[8]) +{ + struct link_key *k; + + list_for_each_entry(k, &hdev->link_keys, list) { + struct key_master_id *id; + + if (k->type != HCI_LK_SMP_LTK) + continue; + + if (k->dlen != sizeof(*id)) + continue; + + id = (void *) &k->data; + if (id->ediv == ediv && + (memcmp(rand, id->rand, sizeof(id->rand)) == 0)) + return k; + } + + return NULL; +} +EXPORT_SYMBOL(hci_find_ltk); + +struct link_key *hci_find_link_key_type(struct hci_dev *hdev, + bdaddr_t *bdaddr, u8 type) +{ + struct link_key *k; + + list_for_each_entry(k, &hdev->link_keys, list) + if (k->type == type && bacmp(bdaddr, &k->bdaddr) == 0) + return k; + + return NULL; +} +EXPORT_SYMBOL(hci_find_link_key_type); + +int hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, int new_key, + bdaddr_t *bdaddr, u8 *val, u8 type, u8 pin_len) +{ + struct link_key *key, *old_key; + u8 old_key_type, persistent; + + old_key = hci_find_link_key(hdev, bdaddr); + if (old_key) { + old_key_type = old_key->type; + key = old_key; + } else { + old_key_type = conn ? 
conn->key_type : 0xff; + key = kzalloc(sizeof(*key), GFP_ATOMIC); + if (!key) + return -ENOMEM; + list_add(&key->list, &hdev->link_keys); + } + + BT_DBG("%s key for %s type %u", hdev->name, batostr(bdaddr), type); + + /* Some buggy controller combinations generate a changed + * combination key for legacy pairing even when there's no + * previous key */ + if (type == HCI_LK_CHANGED_COMBINATION && + (!conn || conn->remote_auth == 0xff) && + old_key_type == 0xff) { + type = HCI_LK_COMBINATION; + if (conn) + conn->key_type = type; + } + + bacpy(&key->bdaddr, bdaddr); + memcpy(key->val, val, 16); + key->pin_len = pin_len; + + if (type == HCI_LK_CHANGED_COMBINATION) + key->type = old_key_type; + else + key->type = type; + + if (!new_key) + return 0; + + persistent = hci_persistent_key(hdev, conn, type, old_key_type); + + mgmt_new_key(hdev->id, key, persistent); + + if (!persistent) { + list_del(&key->list); + kfree(key); + } + + return 0; +} + +int hci_add_ltk(struct hci_dev *hdev, int new_key, bdaddr_t *bdaddr, + u8 key_size, __le16 ediv, u8 rand[8], u8 ltk[16]) +{ + struct link_key *key, *old_key; + struct key_master_id *id; + u8 old_key_type; + + BT_DBG("%s addr %s", hdev->name, batostr(bdaddr)); + + old_key = hci_find_link_key_type(hdev, bdaddr, HCI_LK_SMP_LTK); + if (old_key) { + key = old_key; + old_key_type = old_key->type; + } else { + key = kzalloc(sizeof(*key) + sizeof(*id), GFP_ATOMIC); + if (!key) + return -ENOMEM; + list_add(&key->list, &hdev->link_keys); + old_key_type = 0xff; + } + + key->dlen = sizeof(*id); + + bacpy(&key->bdaddr, bdaddr); + memcpy(key->val, ltk, sizeof(key->val)); + key->type = HCI_LK_SMP_LTK; + key->pin_len = key_size; + + id = (void *) &key->data; + id->ediv = ediv; + memcpy(id->rand, rand, sizeof(id->rand)); + + if (new_key) + mgmt_new_key(hdev->id, key, old_key_type); + + return 0; +} + +int hci_remove_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr) +{ + struct link_key *key; + + key = hci_find_link_key(hdev, bdaddr); + if (!key) + return -ENOENT; + + BT_DBG("%s removing %s", hdev->name, batostr(bdaddr)); + + list_del(&key->list); + kfree(key); + + return 0; +} + +/* HCI command timer function */ +static void hci_cmd_timer(unsigned long arg) +{ + struct hci_dev *hdev = (void *) arg; + + BT_ERR("%s command tx timeout", hdev->name); + atomic_set(&hdev->cmd_cnt, 1); + tasklet_schedule(&hdev->cmd_task); +} + +struct oob_data *hci_find_remote_oob_data(struct hci_dev *hdev, + bdaddr_t *bdaddr) +{ + struct oob_data *data; + + list_for_each_entry(data, &hdev->remote_oob_data, list) + if (bacmp(bdaddr, &data->bdaddr) == 0) + return data; + + return NULL; +} + +int hci_remove_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr) +{ + struct oob_data *data; + + data = hci_find_remote_oob_data(hdev, bdaddr); + if (!data) + return -ENOENT; + + BT_DBG("%s removing %s", hdev->name, batostr(bdaddr)); + + list_del(&data->list); + kfree(data); + + return 0; +} + +int hci_remote_oob_data_clear(struct hci_dev *hdev) +{ + struct oob_data *data, *n; + + list_for_each_entry_safe(data, n, &hdev->remote_oob_data, list) { + list_del(&data->list); + kfree(data); + } + + return 0; +} + +int hci_add_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 *hash, + u8 *randomizer) +{ + struct oob_data *data; + + data = hci_find_remote_oob_data(hdev, bdaddr); + + if (!data) { + data = kmalloc(sizeof(*data), GFP_ATOMIC); + if (!data) + return -ENOMEM; + + bacpy(&data->bdaddr, bdaddr); + list_add(&data->list, &hdev->remote_oob_data); + } + + memcpy(data->hash, hash, 
sizeof(data->hash)); + memcpy(data->randomizer, randomizer, sizeof(data->randomizer)); + + BT_DBG("%s for %s", hdev->name, batostr(bdaddr)); + + return 0; +} + +struct bdaddr_list *hci_blacklist_lookup(struct hci_dev *hdev, + bdaddr_t *bdaddr) +{ + struct list_head *p; + + list_for_each(p, &hdev->blacklist) { + struct bdaddr_list *b; + + b = list_entry(p, struct bdaddr_list, list); + + if (bacmp(bdaddr, &b->bdaddr) == 0) + return b; + } + + return NULL; +} + +int hci_blacklist_clear(struct hci_dev *hdev) +{ + struct list_head *p, *n; + + list_for_each_safe(p, n, &hdev->blacklist) { + struct bdaddr_list *b; + + b = list_entry(p, struct bdaddr_list, list); + + list_del(p); + kfree(b); + } + + return 0; +} + +int hci_blacklist_add(struct hci_dev *hdev, bdaddr_t *bdaddr) +{ + struct bdaddr_list *entry; + int err; + + if (bacmp(bdaddr, BDADDR_ANY) == 0) + return -EBADF; + + hci_dev_lock_bh(hdev); + + if (hci_blacklist_lookup(hdev, bdaddr)) { + err = -EEXIST; + goto err; + } + + entry = kzalloc(sizeof(struct bdaddr_list), GFP_KERNEL); + if (!entry) { + return -ENOMEM; + goto err; + } + + bacpy(&entry->bdaddr, bdaddr); + + list_add(&entry->list, &hdev->blacklist); + + err = 0; + +err: + hci_dev_unlock_bh(hdev); + return err; +} + +int hci_blacklist_del(struct hci_dev *hdev, bdaddr_t *bdaddr) +{ + struct bdaddr_list *entry; + int err = 0; + + hci_dev_lock_bh(hdev); + + if (bacmp(bdaddr, BDADDR_ANY) == 0) { + hci_blacklist_clear(hdev); + goto done; + } + + entry = hci_blacklist_lookup(hdev, bdaddr); + if (!entry) { + err = -ENOENT; + goto done; + } + + list_del(&entry->list); + kfree(entry); + +done: + hci_dev_unlock_bh(hdev); + return err; +} + +static void hci_clear_adv_cache(unsigned long arg) +{ + struct hci_dev *hdev = (void *) arg; + + hci_dev_lock(hdev); + + hci_adv_entries_clear(hdev); + + hci_dev_unlock(hdev); +} + +int hci_adv_entries_clear(struct hci_dev *hdev) +{ + struct adv_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, &hdev->adv_entries, list) { + list_del(&entry->list); + kfree(entry); + } + + BT_DBG("%s adv cache cleared", hdev->name); + + return 0; +} + +struct adv_entry *hci_find_adv_entry(struct hci_dev *hdev, bdaddr_t *bdaddr) +{ + struct adv_entry *entry; + + list_for_each_entry(entry, &hdev->adv_entries, list) + if (bacmp(bdaddr, &entry->bdaddr) == 0) + return entry; + + return NULL; +} + +static inline int is_connectable_adv(u8 evt_type) +{ + if (evt_type == ADV_IND || evt_type == ADV_DIRECT_IND) + return 1; + + return 0; +} + +int hci_add_adv_entry(struct hci_dev *hdev, + struct hci_ev_le_advertising_info *ev) +{ + struct adv_entry *entry; + + if (!is_connectable_adv(ev->evt_type)) + return -EINVAL; + + /* Only new entries should be added to adv_entries. So, if + * bdaddr was found, don't add it. 
*/ + if (hci_find_adv_entry(hdev, &ev->bdaddr)) + return 0; + + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (!entry) + return -ENOMEM; + + bacpy(&entry->bdaddr, &ev->bdaddr); + entry->bdaddr_type = ev->bdaddr_type; + + list_add(&entry->list, &hdev->adv_entries); + + BT_DBG("%s adv entry added: address %s type %u", hdev->name, + batostr(&entry->bdaddr), entry->bdaddr_type); + + return 0; +} + +/* Register HCI device */ +int hci_register_dev(struct hci_dev *hdev) +{ + struct list_head *head = &hci_dev_list, *p; + int i, id = 0; + + BT_DBG("%p name %s bus %d owner %p", hdev, hdev->name, + hdev->bus, hdev->owner); + + if (!hdev->open || !hdev->close || !hdev->destruct) + return -EINVAL; + + write_lock_bh(&hci_dev_list_lock); + + /* Find first available device id */ + list_for_each(p, &hci_dev_list) { + if (list_entry(p, struct hci_dev, list)->id != id) + break; + head = p; id++; + } + + sprintf(hdev->name, "hci%d", id); + hdev->id = id; + list_add(&hdev->list, head); + + atomic_set(&hdev->refcnt, 1); + spin_lock_init(&hdev->lock); + + hdev->flags = 0; + hdev->pkt_type = (HCI_DM1 | HCI_DH1 | HCI_HV1); + hdev->esco_type = (ESCO_HV1); + hdev->link_mode = (HCI_LM_ACCEPT); + hdev->io_capability = 0x03; /* No Input No Output */ + + hdev->idle_timeout = 0; + hdev->sniff_max_interval = 800; + hdev->sniff_min_interval = 80; + + tasklet_init(&hdev->cmd_task, hci_cmd_task, (unsigned long) hdev); + tasklet_init(&hdev->rx_task, hci_rx_task, (unsigned long) hdev); + tasklet_init(&hdev->tx_task, hci_tx_task, (unsigned long) hdev); + + skb_queue_head_init(&hdev->rx_q); + skb_queue_head_init(&hdev->cmd_q); + skb_queue_head_init(&hdev->raw_q); + + setup_timer(&hdev->cmd_timer, hci_cmd_timer, (unsigned long) hdev); + + for (i = 0; i < NUM_REASSEMBLY; i++) + hdev->reassembly[i] = NULL; + + init_waitqueue_head(&hdev->req_wait_q); + mutex_init(&hdev->req_lock); + + inquiry_cache_init(hdev); + + hci_conn_hash_init(hdev); + + INIT_LIST_HEAD(&hdev->blacklist); + + INIT_LIST_HEAD(&hdev->uuids); + + INIT_LIST_HEAD(&hdev->link_keys); + + INIT_LIST_HEAD(&hdev->remote_oob_data); + + INIT_LIST_HEAD(&hdev->adv_entries); + setup_timer(&hdev->adv_timer, hci_clear_adv_cache, + (unsigned long) hdev); + + INIT_WORK(&hdev->power_on, hci_power_on); + INIT_WORK(&hdev->power_off, hci_power_off); + setup_timer(&hdev->off_timer, hci_auto_off, (unsigned long) hdev); + + memset(&hdev->stat, 0, sizeof(struct hci_dev_stats)); + + atomic_set(&hdev->promisc, 0); + + write_unlock_bh(&hci_dev_list_lock); + + hdev->workqueue = create_singlethread_workqueue(hdev->name); + if (!hdev->workqueue) + goto nomem; + + hdev->tfm = crypto_alloc_blkcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(hdev->tfm)) + BT_INFO("Failed to load transform for ecb(aes): %ld", + PTR_ERR(hdev->tfm)); + + hci_register_sysfs(hdev); + + hdev->rfkill = rfkill_alloc(hdev->name, &hdev->dev, + RFKILL_TYPE_BLUETOOTH, &hci_rfkill_ops, hdev); + if (hdev->rfkill) { + if (rfkill_register(hdev->rfkill) < 0) { + rfkill_destroy(hdev->rfkill); + hdev->rfkill = NULL; + } + } + + set_bit(HCI_AUTO_OFF, &hdev->flags); + set_bit(HCI_SETUP, &hdev->flags); + queue_work(hdev->workqueue, &hdev->power_on); + + hci_notify(hdev, HCI_DEV_REG); + + return id; + +nomem: + write_lock_bh(&hci_dev_list_lock); + list_del(&hdev->list); + write_unlock_bh(&hci_dev_list_lock); + + return -ENOMEM; +} +EXPORT_SYMBOL(hci_register_dev); + +/* Unregister HCI device */ +int hci_unregister_dev(struct hci_dev *hdev) +{ + int i; + + BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus); + + 
set_bit(HCI_UNREGISTER, &hdev->flags); + + write_lock_bh(&hci_dev_list_lock); + list_del(&hdev->list); + write_unlock_bh(&hci_dev_list_lock); + + hci_dev_do_close(hdev); + + for (i = 0; i < NUM_REASSEMBLY; i++) + kfree_skb(hdev->reassembly[i]); + + if (!test_bit(HCI_INIT, &hdev->flags) && + !test_bit(HCI_SETUP, &hdev->flags)) + mgmt_index_removed(hdev->id); + + if (!IS_ERR(hdev->tfm)) + crypto_free_blkcipher(hdev->tfm); + + hci_notify(hdev, HCI_DEV_UNREG); + + if (hdev->rfkill) { + rfkill_unregister(hdev->rfkill); + rfkill_destroy(hdev->rfkill); + } + + hci_unregister_sysfs(hdev); + + hci_del_off_timer(hdev); + del_timer(&hdev->adv_timer); + + destroy_workqueue(hdev->workqueue); + + hci_dev_lock_bh(hdev); + hci_blacklist_clear(hdev); + hci_uuids_clear(hdev); + hci_link_keys_clear(hdev); + hci_remote_oob_data_clear(hdev); + hci_adv_entries_clear(hdev); + hci_dev_unlock_bh(hdev); + + __hci_dev_put(hdev); + + return 0; +} +EXPORT_SYMBOL(hci_unregister_dev); + +/* Suspend HCI device */ +int hci_suspend_dev(struct hci_dev *hdev) +{ + hci_notify(hdev, HCI_DEV_SUSPEND); + return 0; +} +EXPORT_SYMBOL(hci_suspend_dev); + +/* Resume HCI device */ +int hci_resume_dev(struct hci_dev *hdev) +{ + hci_notify(hdev, HCI_DEV_RESUME); + return 0; +} +EXPORT_SYMBOL(hci_resume_dev); + +/* Receive frame from HCI drivers */ +int hci_recv_frame(struct sk_buff *skb) +{ + struct hci_dev *hdev = (struct hci_dev *) skb->dev; + if (!hdev || (!test_bit(HCI_UP, &hdev->flags) + && !test_bit(HCI_INIT, &hdev->flags))) { + kfree_skb(skb); + return -ENXIO; + } + + /* Incomming skb */ + bt_cb(skb)->incoming = 1; + + /* Time stamp */ + __net_timestamp(skb); + + /* Queue frame for rx task */ + skb_queue_tail(&hdev->rx_q, skb); + tasklet_schedule(&hdev->rx_task); + + return 0; +} +EXPORT_SYMBOL(hci_recv_frame); + +static int hci_reassembly(struct hci_dev *hdev, int type, void *data, + int count, __u8 index) +{ + int len = 0; + int hlen = 0; + int remain = count; + struct sk_buff *skb; + struct bt_skb_cb *scb; + + if ((type < HCI_ACLDATA_PKT || type > HCI_EVENT_PKT) || + index >= NUM_REASSEMBLY) + return -EILSEQ; + + skb = hdev->reassembly[index]; + + if (!skb) { + switch (type) { + case HCI_ACLDATA_PKT: + len = HCI_MAX_FRAME_SIZE; + hlen = HCI_ACL_HDR_SIZE; + break; + case HCI_EVENT_PKT: + len = HCI_MAX_EVENT_SIZE; + hlen = HCI_EVENT_HDR_SIZE; + break; + case HCI_SCODATA_PKT: + len = HCI_MAX_SCO_SIZE; + hlen = HCI_SCO_HDR_SIZE; + break; + } + + skb = bt_skb_alloc(len, GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + scb = (void *) skb->cb; + scb->expect = hlen; + scb->pkt_type = type; + + skb->dev = (void *) hdev; + hdev->reassembly[index] = skb; + } + + while (count) { + scb = (void *) skb->cb; + len = min(scb->expect, (__u16)count); + + memcpy(skb_put(skb, len), data, len); + + count -= len; + data += len; + scb->expect -= len; + remain = count; + + switch (type) { + case HCI_EVENT_PKT: + if (skb->len == HCI_EVENT_HDR_SIZE) { + struct hci_event_hdr *h = hci_event_hdr(skb); + scb->expect = h->plen; + + if (skb_tailroom(skb) < scb->expect) { + kfree_skb(skb); + hdev->reassembly[index] = NULL; + return -ENOMEM; + } + } + break; + + case HCI_ACLDATA_PKT: + if (skb->len == HCI_ACL_HDR_SIZE) { + struct hci_acl_hdr *h = hci_acl_hdr(skb); + scb->expect = __le16_to_cpu(h->dlen); + + if (skb_tailroom(skb) < scb->expect) { + kfree_skb(skb); + hdev->reassembly[index] = NULL; + return -ENOMEM; + } + } + break; + + case HCI_SCODATA_PKT: + if (skb->len == HCI_SCO_HDR_SIZE) { + struct hci_sco_hdr *h = hci_sco_hdr(skb); + scb->expect = h->dlen; 
+ + if (skb_tailroom(skb) < scb->expect) { + kfree_skb(skb); + hdev->reassembly[index] = NULL; + return -ENOMEM; + } + } + break; + } + + if (scb->expect == 0) { + /* Complete frame */ + + bt_cb(skb)->pkt_type = type; + hci_recv_frame(skb); + + hdev->reassembly[index] = NULL; + return remain; + } + } + + return remain; +} + +int hci_recv_fragment(struct hci_dev *hdev, int type, void *data, int count) +{ + int rem = 0; + + if (type < HCI_ACLDATA_PKT || type > HCI_EVENT_PKT) + return -EILSEQ; + + while (count) { + rem = hci_reassembly(hdev, type, data, count, type - 1); + if (rem < 0) + return rem; + + data += (count - rem); + count = rem; + }; + + return rem; +} +EXPORT_SYMBOL(hci_recv_fragment); + +#define STREAM_REASSEMBLY 0 + +int hci_recv_stream_fragment(struct hci_dev *hdev, void *data, int count) +{ + int type; + int rem = 0; + + while (count) { + struct sk_buff *skb = hdev->reassembly[STREAM_REASSEMBLY]; + + if (!skb) { + struct { char type; } *pkt; + + /* Start of the frame */ + pkt = data; + type = pkt->type; + + data++; + count--; + } else + type = bt_cb(skb)->pkt_type; + + rem = hci_reassembly(hdev, type, data, count, + STREAM_REASSEMBLY); + if (rem < 0) + return rem; + + data += (count - rem); + count = rem; + }; + + return rem; +} +EXPORT_SYMBOL(hci_recv_stream_fragment); + +/* ---- Interface to upper protocols ---- */ + +/* Register/Unregister protocols. + * hci_task_lock is used to ensure that no tasks are running. */ +int hci_register_proto(struct hci_proto *hp) +{ + int err = 0; + + BT_DBG("%p name %s id %d", hp, hp->name, hp->id); + + if (hp->id >= HCI_MAX_PROTO) + return -EINVAL; + + write_lock_bh(&hci_task_lock); + + if (!hci_proto[hp->id]) + hci_proto[hp->id] = hp; + else + err = -EEXIST; + + write_unlock_bh(&hci_task_lock); + + return err; +} +EXPORT_SYMBOL(hci_register_proto); + +int hci_unregister_proto(struct hci_proto *hp) +{ + int err = 0; + + BT_DBG("%p name %s id %d", hp, hp->name, hp->id); + + if (hp->id >= HCI_MAX_PROTO) + return -EINVAL; + + write_lock_bh(&hci_task_lock); + + if (hci_proto[hp->id]) + hci_proto[hp->id] = NULL; + else + err = -ENOENT; + + write_unlock_bh(&hci_task_lock); + + return err; +} +EXPORT_SYMBOL(hci_unregister_proto); + +int hci_register_cb(struct hci_cb *cb) +{ + BT_DBG("%p name %s", cb, cb->name); + + write_lock_bh(&hci_cb_list_lock); + list_add(&cb->list, &hci_cb_list); + write_unlock_bh(&hci_cb_list_lock); + + return 0; +} +EXPORT_SYMBOL(hci_register_cb); + +int hci_unregister_cb(struct hci_cb *cb) +{ + BT_DBG("%p name %s", cb, cb->name); + + write_lock_bh(&hci_cb_list_lock); + list_del(&cb->list); + write_unlock_bh(&hci_cb_list_lock); + + return 0; +} +EXPORT_SYMBOL(hci_unregister_cb); + +static int hci_send_frame(struct sk_buff *skb) +{ + struct hci_dev *hdev = (struct hci_dev *) skb->dev; + + if (!hdev) { + kfree_skb(skb); + return -ENODEV; + } + + BT_DBG("%s type %d len %d", hdev->name, bt_cb(skb)->pkt_type, skb->len); + + if (atomic_read(&hdev->promisc)) { + /* Time stamp */ + __net_timestamp(skb); + + hci_send_to_sock(hdev, skb, NULL); + } + + /* Get rid of skb owner, prior to sending to the driver. 
*/ + skb_orphan(skb); + + return hdev->send(skb); +} + +/* Send HCI command */ +int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, void *param) +{ + int len = HCI_COMMAND_HDR_SIZE + plen; + struct hci_command_hdr *hdr; + struct sk_buff *skb; + + BT_DBG("%s opcode 0x%x plen %d", hdev->name, opcode, plen); + + skb = bt_skb_alloc(len, GFP_ATOMIC); + if (!skb) { + BT_ERR("%s no memory for command", hdev->name); + return -ENOMEM; + } + + hdr = (struct hci_command_hdr *) skb_put(skb, HCI_COMMAND_HDR_SIZE); + hdr->opcode = cpu_to_le16(opcode); + hdr->plen = plen; + + if (plen) + memcpy(skb_put(skb, plen), param, plen); + + BT_DBG("skb len %d", skb->len); + + bt_cb(skb)->pkt_type = HCI_COMMAND_PKT; + skb->dev = (void *) hdev; + + if (test_bit(HCI_INIT, &hdev->flags)) + hdev->init_last_cmd = opcode; + + skb_queue_tail(&hdev->cmd_q, skb); + tasklet_schedule(&hdev->cmd_task); + + return 0; +} + +/* Get data from the previously sent command */ +void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode) +{ + struct hci_command_hdr *hdr; + + if (!hdev->sent_cmd) + return NULL; + + hdr = (void *) hdev->sent_cmd->data; + + if (hdr->opcode != cpu_to_le16(opcode)) + return NULL; + + BT_DBG("%s opcode 0x%x", hdev->name, opcode); + + return hdev->sent_cmd->data + HCI_COMMAND_HDR_SIZE; +} + +/* Send ACL data */ +static void hci_add_acl_hdr(struct sk_buff *skb, __u16 handle, __u16 flags) +{ + struct hci_acl_hdr *hdr; + int len = skb->len; + + skb_push(skb, HCI_ACL_HDR_SIZE); + skb_reset_transport_header(skb); + hdr = (struct hci_acl_hdr *)skb_transport_header(skb); + hdr->handle = cpu_to_le16(hci_handle_pack(handle, flags)); + hdr->dlen = cpu_to_le16(len); +} + +void hci_send_acl(struct hci_conn *conn, struct sk_buff *skb, __u16 flags) +{ + struct hci_dev *hdev = conn->hdev; + struct sk_buff *list; + + BT_DBG("%s conn %p flags 0x%x", hdev->name, conn, flags); + + skb->dev = (void *) hdev; + bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT; + hci_add_acl_hdr(skb, conn->handle, flags); + + list = skb_shinfo(skb)->frag_list; + if (!list) { + /* Non fragmented */ + BT_DBG("%s nonfrag skb %p len %d", hdev->name, skb, skb->len); + + skb_queue_tail(&conn->data_q, skb); + } else { + /* Fragmented */ + BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); + + skb_shinfo(skb)->frag_list = NULL; + + /* Queue all fragments atomically */ + spin_lock_bh(&conn->data_q.lock); + + __skb_queue_tail(&conn->data_q, skb); + + flags &= ~ACL_START; + flags |= ACL_CONT; + do { + skb = list; list = list->next; + + skb->dev = (void *) hdev; + bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT; + hci_add_acl_hdr(skb, conn->handle, flags); + + BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); + + __skb_queue_tail(&conn->data_q, skb); + } while (list); + + spin_unlock_bh(&conn->data_q.lock); + } + + tasklet_schedule(&hdev->tx_task); +} +EXPORT_SYMBOL(hci_send_acl); + +/* Send SCO data */ +void hci_send_sco(struct hci_conn *conn, struct sk_buff *skb) +{ + struct hci_dev *hdev = conn->hdev; + struct hci_sco_hdr hdr; + + BT_DBG("%s len %d", hdev->name, skb->len); + + hdr.handle = cpu_to_le16(conn->handle); + hdr.dlen = skb->len; + + skb_push(skb, HCI_SCO_HDR_SIZE); + skb_reset_transport_header(skb); + memcpy(skb_transport_header(skb), &hdr, HCI_SCO_HDR_SIZE); + + skb->dev = (void *) hdev; + bt_cb(skb)->pkt_type = HCI_SCODATA_PKT; + + skb_queue_tail(&conn->data_q, skb); + tasklet_schedule(&hdev->tx_task); +} +EXPORT_SYMBOL(hci_send_sco); + +/* ---- HCI TX task (outgoing data) ---- */ + +/* HCI Connection scheduler */ +static inline struct 
hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type, int *quote) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *conn = NULL; + int num = 0, min = ~0; + struct list_head *p; + + /* We don't have to lock device here. Connections are always + * added and removed with TX task disabled. */ + list_for_each(p, &h->list) { + struct hci_conn *c; + c = list_entry(p, struct hci_conn, list); + + if (c->type != type || skb_queue_empty(&c->data_q)) + continue; + + if (c->state != BT_CONNECTED && c->state != BT_CONFIG) + continue; + + num++; + + if (c->sent < min) { + min = c->sent; + conn = c; + } + } + + if (conn) { + int cnt, q; + + switch (conn->type) { + case ACL_LINK: + cnt = hdev->acl_cnt; + break; + case SCO_LINK: + case ESCO_LINK: + cnt = hdev->sco_cnt; + break; + case LE_LINK: + cnt = hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt; + break; + default: + cnt = 0; + BT_ERR("Unknown link type"); + } + + q = cnt / num; + *quote = q ? q : 1; + } else + *quote = 0; + + BT_DBG("conn %p quote %d", conn, *quote); + return conn; +} + +static inline void hci_link_tx_to(struct hci_dev *hdev, __u8 type) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct list_head *p; + struct hci_conn *c; + + BT_ERR("%s link tx timeout", hdev->name); + + /* Kill stalled connections */ + list_for_each(p, &h->list) { + c = list_entry(p, struct hci_conn, list); + if (c->type == type && c->sent) { + BT_ERR("%s killing stalled connection %s", + hdev->name, batostr(&c->dst)); + hci_acl_disconn(c, 0x13); + } + } +} + +static inline void hci_sched_acl(struct hci_dev *hdev) +{ + struct hci_conn *conn; + struct sk_buff *skb; + int quote; + + BT_DBG("%s", hdev->name); + + if (!test_bit(HCI_RAW, &hdev->flags)) { + /* ACL tx timeout must be longer than maximum + * link supervision timeout (40.9 seconds) */ + if (!hdev->acl_cnt && time_after(jiffies, hdev->acl_last_tx + HZ * 45)) + hci_link_tx_to(hdev, ACL_LINK); + } + + while (hdev->acl_cnt && (conn = hci_low_sent(hdev, ACL_LINK, "e))) { + while (quote-- && (skb = skb_dequeue(&conn->data_q))) { + BT_DBG("skb %p len %d", skb, skb->len); + + hci_conn_enter_active_mode(conn, bt_cb(skb)->force_active); + + hci_send_frame(skb); + hdev->acl_last_tx = jiffies; + + hdev->acl_cnt--; + conn->sent++; + } + } +} + +/* Schedule SCO */ +static inline void hci_sched_sco(struct hci_dev *hdev) +{ + struct hci_conn *conn; + struct sk_buff *skb; + int quote; + + BT_DBG("%s", hdev->name); + + while (hdev->sco_cnt && (conn = hci_low_sent(hdev, SCO_LINK, "e))) { + while (quote-- && (skb = skb_dequeue(&conn->data_q))) { + BT_DBG("skb %p len %d", skb, skb->len); + hci_send_frame(skb); + + conn->sent++; + if (conn->sent == ~0) + conn->sent = 0; + } + } +} + +static inline void hci_sched_esco(struct hci_dev *hdev) +{ + struct hci_conn *conn; + struct sk_buff *skb; + int quote; + + BT_DBG("%s", hdev->name); + + while (hdev->sco_cnt && (conn = hci_low_sent(hdev, ESCO_LINK, "e))) { + while (quote-- && (skb = skb_dequeue(&conn->data_q))) { + BT_DBG("skb %p len %d", skb, skb->len); + hci_send_frame(skb); + + conn->sent++; + if (conn->sent == ~0) + conn->sent = 0; + } + } +} + +static inline void hci_sched_le(struct hci_dev *hdev) +{ + struct hci_conn *conn; + struct sk_buff *skb; + int quote, cnt; + + BT_DBG("%s", hdev->name); + + if (!test_bit(HCI_RAW, &hdev->flags)) { + /* LE tx timeout must be longer than maximum + * link supervision timeout (40.9 seconds) */ + if (!hdev->le_cnt && hdev->le_pkts && + time_after(jiffies, hdev->le_last_tx + HZ * 45)) + hci_link_tx_to(hdev, LE_LINK); + } + 
+ cnt = hdev->le_pkts ? hdev->le_cnt : hdev->acl_cnt; + while (cnt && (conn = hci_low_sent(hdev, LE_LINK, "e))) { + while (quote-- && (skb = skb_dequeue(&conn->data_q))) { + BT_DBG("skb %p len %d", skb, skb->len); + + hci_send_frame(skb); + hdev->le_last_tx = jiffies; + + cnt--; + conn->sent++; + } + } + if (hdev->le_pkts) + hdev->le_cnt = cnt; + else + hdev->acl_cnt = cnt; +} + +static void hci_tx_task(unsigned long arg) +{ + struct hci_dev *hdev = (struct hci_dev *) arg; + struct sk_buff *skb; + + read_lock(&hci_task_lock); + + BT_DBG("%s acl %d sco %d le %d", hdev->name, hdev->acl_cnt, + hdev->sco_cnt, hdev->le_cnt); + + /* Schedule queues and send stuff to HCI driver */ + + hci_sched_acl(hdev); + + hci_sched_sco(hdev); + + hci_sched_esco(hdev); + + hci_sched_le(hdev); + + /* Send next queued raw (unknown type) packet */ + while ((skb = skb_dequeue(&hdev->raw_q))) + hci_send_frame(skb); + + read_unlock(&hci_task_lock); +} + +/* ----- HCI RX task (incoming data processing) ----- */ + +/* ACL data packet */ +static inline void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_acl_hdr *hdr = (void *) skb->data; + struct hci_conn *conn; + __u16 handle, flags; + + skb_pull(skb, HCI_ACL_HDR_SIZE); + + handle = __le16_to_cpu(hdr->handle); + flags = hci_flags(handle); + handle = hci_handle(handle); + + BT_DBG("%s len %d handle 0x%x flags 0x%x", hdev->name, skb->len, handle, flags); + + hdev->stat.acl_rx++; + + hci_dev_lock(hdev); + conn = hci_conn_hash_lookup_handle(hdev, handle); + hci_dev_unlock(hdev); + + if (conn) { + register struct hci_proto *hp; + + hci_conn_enter_active_mode(conn, bt_cb(skb)->force_active); + + /* Send to upper protocol */ + hp = hci_proto[HCI_PROTO_L2CAP]; + if (hp && hp->recv_acldata) { + hp->recv_acldata(conn, skb, flags); + return; + } + } else { + BT_ERR("%s ACL packet for unknown connection handle %d", + hdev->name, handle); + } + + kfree_skb(skb); +} + +/* SCO data packet */ +static inline void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_sco_hdr *hdr = (void *) skb->data; + struct hci_conn *conn; + __u16 handle; + + skb_pull(skb, HCI_SCO_HDR_SIZE); + + handle = __le16_to_cpu(hdr->handle); + + BT_DBG("%s len %d handle 0x%x", hdev->name, skb->len, handle); + + hdev->stat.sco_rx++; + + hci_dev_lock(hdev); + conn = hci_conn_hash_lookup_handle(hdev, handle); + hci_dev_unlock(hdev); + + if (conn) { + register struct hci_proto *hp; + + /* Send to upper protocol */ + hp = hci_proto[HCI_PROTO_SCO]; + if (hp && hp->recv_scodata) { + hp->recv_scodata(conn, skb); + return; + } + } else { + BT_ERR("%s SCO packet for unknown connection handle %d", + hdev->name, handle); + } + + kfree_skb(skb); +} + +static void hci_rx_task(unsigned long arg) +{ + struct hci_dev *hdev = (struct hci_dev *) arg; + struct sk_buff *skb; + + BT_DBG("%s", hdev->name); + + read_lock(&hci_task_lock); + + while ((skb = skb_dequeue(&hdev->rx_q))) { + if (atomic_read(&hdev->promisc)) { + /* Send copy to the sockets */ + hci_send_to_sock(hdev, skb, NULL); + } + + if (test_bit(HCI_RAW, &hdev->flags)) { + kfree_skb(skb); + continue; + } + + if (test_bit(HCI_INIT, &hdev->flags)) { + /* Don't process data packets in this states. 
*/ + switch (bt_cb(skb)->pkt_type) { + case HCI_ACLDATA_PKT: + case HCI_SCODATA_PKT: + kfree_skb(skb); + continue; + } + } + + /* Process frame */ + switch (bt_cb(skb)->pkt_type) { + case HCI_EVENT_PKT: + hci_event_packet(hdev, skb); + break; + + case HCI_ACLDATA_PKT: + BT_DBG("%s ACL data packet", hdev->name); + hci_acldata_packet(hdev, skb); + break; + + case HCI_SCODATA_PKT: + BT_DBG("%s SCO data packet", hdev->name); + hci_scodata_packet(hdev, skb); + break; + + default: + kfree_skb(skb); + break; + } + } + + read_unlock(&hci_task_lock); +} + +static void hci_cmd_task(unsigned long arg) +{ + struct hci_dev *hdev = (struct hci_dev *) arg; + struct sk_buff *skb; + + BT_DBG("%s cmd %d", hdev->name, atomic_read(&hdev->cmd_cnt)); + + /* Send queued commands */ + if (atomic_read(&hdev->cmd_cnt)) { + skb = skb_dequeue(&hdev->cmd_q); + if (!skb) + return; + + kfree_skb(hdev->sent_cmd); + + hdev->sent_cmd = skb_clone(skb, GFP_ATOMIC); + if (hdev->sent_cmd) { + atomic_dec(&hdev->cmd_cnt); + hci_send_frame(skb); + if (test_bit(HCI_RESET, &hdev->flags)) + del_timer(&hdev->cmd_timer); + else + mod_timer(&hdev->cmd_timer, + jiffies + msecs_to_jiffies(HCI_CMD_TIMEOUT)); + } else { + skb_queue_head(&hdev->cmd_q, skb); + tasklet_schedule(&hdev->cmd_task); + } + } +} diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c new file mode 100755 index 00000000..5a7074a7 --- /dev/null +++ b/net/bluetooth/hci_event.c @@ -0,0 +1,3118 @@ +/* + BlueZ - Bluetooth protocol stack for Linux + Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved. + + Written 2000,2001 by Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +/* Bluetooth HCI event handling. 
*/ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +static int enable_le; + +/* Handle HCI Event packets */ + +static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%x", hdev->name, status); + + if (status) + return; + + if (test_and_clear_bit(HCI_INQUIRY, &hdev->flags) && + test_bit(HCI_MGMT, &hdev->flags)) + mgmt_discovering(hdev->id, 0); + + hci_req_complete(hdev, HCI_OP_INQUIRY_CANCEL, status); + + hci_conn_check_pending(hdev); +} + +static void hci_cc_exit_periodic_inq(struct hci_dev *hdev, struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%x", hdev->name, status); + + if (status) + return; + + if (test_and_clear_bit(HCI_INQUIRY, &hdev->flags) && + test_bit(HCI_MGMT, &hdev->flags)) + mgmt_discovering(hdev->id, 0); + + hci_conn_check_pending(hdev); +} + +static void hci_cc_remote_name_req_cancel(struct hci_dev *hdev, struct sk_buff *skb) +{ + BT_DBG("%s", hdev->name); +} + +static void hci_cc_role_discovery(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_rp_role_discovery *rp = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s status 0x%x", hdev->name, rp->status); + + if (rp->status) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle)); + if (conn) { + if (rp->role) + conn->link_mode &= ~HCI_LM_MASTER; + else + conn->link_mode |= HCI_LM_MASTER; + } + + hci_dev_unlock(hdev); +} + +static void hci_cc_read_link_policy(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_rp_read_link_policy *rp = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s status 0x%x", hdev->name, rp->status); + + if (rp->status) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle)); + if (conn) + conn->link_policy = __le16_to_cpu(rp->policy); + + hci_dev_unlock(hdev); +} + +static void hci_cc_write_link_policy(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_rp_write_link_policy *rp = (void *) skb->data; + struct hci_conn *conn; + void *sent; + + BT_DBG("%s status 0x%x", hdev->name, rp->status); + + if (rp->status) + return; + + sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_LINK_POLICY); + if (!sent) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle)); + if (conn) + conn->link_policy = get_unaligned_le16(sent + 2); + + hci_dev_unlock(hdev); +} + +static void hci_cc_read_def_link_policy(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_rp_read_def_link_policy *rp = (void *) skb->data; + + BT_DBG("%s status 0x%x", hdev->name, rp->status); + + if (rp->status) + return; + + hdev->link_policy = __le16_to_cpu(rp->policy); +} + +static void hci_cc_write_def_link_policy(struct hci_dev *hdev, struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + void *sent; + + BT_DBG("%s status 0x%x", hdev->name, status); + + sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_DEF_LINK_POLICY); + if (!sent) + return; + + if (!status) + hdev->link_policy = get_unaligned_le16(sent); + + hci_req_complete(hdev, HCI_OP_WRITE_DEF_LINK_POLICY, status); +} + +static void hci_cc_reset(struct hci_dev *hdev, struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%x", hdev->name, status); + + clear_bit(HCI_RESET, &hdev->flags); + + hci_req_complete(hdev, 
HCI_OP_RESET, status); +} + +static void hci_cc_write_local_name(struct hci_dev *hdev, struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + void *sent; + + BT_DBG("%s status 0x%x", hdev->name, status); + + sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_LOCAL_NAME); + if (!sent) + return; + + if (test_bit(HCI_MGMT, &hdev->flags)) + mgmt_set_local_name_complete(hdev->id, sent, status); + + if (status) + return; + + memcpy(hdev->dev_name, sent, HCI_MAX_NAME_LENGTH); +} + +static void hci_cc_read_local_name(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_rp_read_local_name *rp = (void *) skb->data; + + BT_DBG("%s status 0x%x", hdev->name, rp->status); + + if (rp->status) + return; + + memcpy(hdev->dev_name, rp->name, HCI_MAX_NAME_LENGTH); +} + +static void hci_cc_write_auth_enable(struct hci_dev *hdev, struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + void *sent; + + BT_DBG("%s status 0x%x", hdev->name, status); + + sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_AUTH_ENABLE); + if (!sent) + return; + + if (!status) { + __u8 param = *((__u8 *) sent); + + if (param == AUTH_ENABLED) + set_bit(HCI_AUTH, &hdev->flags); + else + clear_bit(HCI_AUTH, &hdev->flags); + } + + hci_req_complete(hdev, HCI_OP_WRITE_AUTH_ENABLE, status); +} + +static void hci_cc_write_encrypt_mode(struct hci_dev *hdev, struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + void *sent; + + BT_DBG("%s status 0x%x", hdev->name, status); + + sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_ENCRYPT_MODE); + if (!sent) + return; + + if (!status) { + __u8 param = *((__u8 *) sent); + + if (param) + set_bit(HCI_ENCRYPT, &hdev->flags); + else + clear_bit(HCI_ENCRYPT, &hdev->flags); + } + + hci_req_complete(hdev, HCI_OP_WRITE_ENCRYPT_MODE, status); +} + +static void hci_cc_write_scan_enable(struct hci_dev *hdev, struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + void *sent; + + BT_DBG("%s status 0x%x", hdev->name, status); + + sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_SCAN_ENABLE); + if (!sent) + return; + + if (!status) { + __u8 param = *((__u8 *) sent); + int old_pscan, old_iscan; + + old_pscan = test_and_clear_bit(HCI_PSCAN, &hdev->flags); + old_iscan = test_and_clear_bit(HCI_ISCAN, &hdev->flags); + + if (param & SCAN_INQUIRY) { + set_bit(HCI_ISCAN, &hdev->flags); + if (!old_iscan) + mgmt_discoverable(hdev->id, 1); + } else if (old_iscan) + mgmt_discoverable(hdev->id, 0); + + if (param & SCAN_PAGE) { + set_bit(HCI_PSCAN, &hdev->flags); + if (!old_pscan) + mgmt_connectable(hdev->id, 1); + } else if (old_pscan) + mgmt_connectable(hdev->id, 0); + } + + hci_req_complete(hdev, HCI_OP_WRITE_SCAN_ENABLE, status); +} + +static void hci_cc_read_class_of_dev(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_rp_read_class_of_dev *rp = (void *) skb->data; + + BT_DBG("%s status 0x%x", hdev->name, rp->status); + + if (rp->status) + return; + + memcpy(hdev->dev_class, rp->dev_class, 3); + + BT_DBG("%s class 0x%.2x%.2x%.2x", hdev->name, + hdev->dev_class[2], hdev->dev_class[1], hdev->dev_class[0]); +} + +static void hci_cc_write_class_of_dev(struct hci_dev *hdev, struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + void *sent; + + BT_DBG("%s status 0x%x", hdev->name, status); + + if (status) + return; + + sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_CLASS_OF_DEV); + if (!sent) + return; + + memcpy(hdev->dev_class, sent, 3); +} + +static void hci_cc_read_voice_setting(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_rp_read_voice_setting *rp = (void *) skb->data; + 
__u16 setting; + + BT_DBG("%s status 0x%x", hdev->name, rp->status); + + if (rp->status) + return; + + setting = __le16_to_cpu(rp->voice_setting); + + if (hdev->voice_setting == setting) + return; + + hdev->voice_setting = setting; + + BT_DBG("%s voice setting 0x%04x", hdev->name, setting); + + if (hdev->notify) { + tasklet_disable(&hdev->tx_task); + hdev->notify(hdev, HCI_NOTIFY_VOICE_SETTING); + tasklet_enable(&hdev->tx_task); + } +} + +static void hci_cc_write_voice_setting(struct hci_dev *hdev, struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + __u16 setting; + void *sent; + + BT_DBG("%s status 0x%x", hdev->name, status); + + if (status) + return; + + sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_VOICE_SETTING); + if (!sent) + return; + + setting = get_unaligned_le16(sent); + + if (hdev->voice_setting == setting) + return; + + hdev->voice_setting = setting; + + BT_DBG("%s voice setting 0x%04x", hdev->name, setting); + + if (hdev->notify) { + tasklet_disable(&hdev->tx_task); + hdev->notify(hdev, HCI_NOTIFY_VOICE_SETTING); + tasklet_enable(&hdev->tx_task); + } +} + +static void hci_cc_host_buffer_size(struct hci_dev *hdev, struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%x", hdev->name, status); + + hci_req_complete(hdev, HCI_OP_HOST_BUFFER_SIZE, status); +} + +static void hci_cc_read_ssp_mode(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_rp_read_ssp_mode *rp = (void *) skb->data; + + BT_DBG("%s status 0x%x", hdev->name, rp->status); + + if (rp->status) + return; + + hdev->ssp_mode = rp->mode; +} + +static void hci_cc_write_ssp_mode(struct hci_dev *hdev, struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + void *sent; + + BT_DBG("%s status 0x%x", hdev->name, status); + + if (status) + return; + + sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_SSP_MODE); + if (!sent) + return; + + hdev->ssp_mode = *((__u8 *) sent); +} + +static u8 hci_get_inquiry_mode(struct hci_dev *hdev) +{ + if (hdev->features[6] & LMP_EXT_INQ) + return 2; + + if (hdev->features[3] & LMP_RSSI_INQ) + return 1; + + if (hdev->manufacturer == 11 && hdev->hci_rev == 0x00 && + hdev->lmp_subver == 0x0757) + return 1; + + if (hdev->manufacturer == 15) { + if (hdev->hci_rev == 0x03 && hdev->lmp_subver == 0x6963) + return 1; + if (hdev->hci_rev == 0x09 && hdev->lmp_subver == 0x6963) + return 1; + if (hdev->hci_rev == 0x00 && hdev->lmp_subver == 0x6965) + return 1; + } + + if (hdev->manufacturer == 31 && hdev->hci_rev == 0x2005 && + hdev->lmp_subver == 0x1805) + return 1; + + return 0; +} + +static void hci_setup_inquiry_mode(struct hci_dev *hdev) +{ + u8 mode; + + mode = hci_get_inquiry_mode(hdev); + + hci_send_cmd(hdev, HCI_OP_WRITE_INQUIRY_MODE, 1, &mode); +} + +static void hci_setup_event_mask(struct hci_dev *hdev) +{ + /* The second byte is 0xff instead of 0x9f (two reserved bits + * disabled) since a Broadcom 1.2 dongle doesn't respond to the + * command otherwise */ + u8 events[8] = { 0xff, 0xff, 0xfb, 0xff, 0x00, 0x00, 0x00, 0x00 }; + + /* CSR 1.1 dongles does not accept any bitfield so don't try to set + * any event mask for pre 1.2 devices */ + if (hdev->lmp_ver <= 1) + return; + + events[4] |= 0x01; /* Flow Specification Complete */ + events[4] |= 0x02; /* Inquiry Result with RSSI */ + events[4] |= 0x04; /* Read Remote Extended Features Complete */ + events[5] |= 0x08; /* Synchronous Connection Complete */ + events[5] |= 0x10; /* Synchronous Connection Changed */ + + if (hdev->features[3] & LMP_RSSI_INQ) + events[4] |= 0x04; /* Inquiry Result with 
RSSI */ + + if (hdev->features[5] & LMP_SNIFF_SUBR) + events[5] |= 0x20; /* Sniff Subrating */ + + if (hdev->features[5] & LMP_PAUSE_ENC) + events[5] |= 0x80; /* Encryption Key Refresh Complete */ + + if (hdev->features[6] & LMP_EXT_INQ) + events[5] |= 0x40; /* Extended Inquiry Result */ + + if (hdev->features[6] & LMP_NO_FLUSH) + events[7] |= 0x01; /* Enhanced Flush Complete */ + + if (hdev->features[7] & LMP_LSTO) + events[6] |= 0x80; /* Link Supervision Timeout Changed */ + + if (hdev->features[6] & LMP_SIMPLE_PAIR) { + events[6] |= 0x01; /* IO Capability Request */ + events[6] |= 0x02; /* IO Capability Response */ + events[6] |= 0x04; /* User Confirmation Request */ + events[6] |= 0x08; /* User Passkey Request */ + events[6] |= 0x10; /* Remote OOB Data Request */ + events[6] |= 0x20; /* Simple Pairing Complete */ + events[7] |= 0x04; /* User Passkey Notification */ + events[7] |= 0x08; /* Keypress Notification */ + events[7] |= 0x10; /* Remote Host Supported + * Features Notification */ + } + + if (hdev->features[4] & LMP_LE) + events[7] |= 0x20; /* LE Meta-Event */ + + hci_send_cmd(hdev, HCI_OP_SET_EVENT_MASK, sizeof(events), events); +} + +static void hci_set_le_support(struct hci_dev *hdev) +{ + struct hci_cp_write_le_host_supported cp; + + memset(&cp, 0, sizeof(cp)); + + if (enable_le) { + cp.le = 1; + cp.simul = !!(hdev->features[6] & LMP_SIMUL_LE_BR); + } + + hci_send_cmd(hdev, HCI_OP_WRITE_LE_HOST_SUPPORTED, sizeof(cp), &cp); +} + +static void hci_setup(struct hci_dev *hdev) +{ + hci_setup_event_mask(hdev); + + if (hdev->lmp_ver > 1) + hci_send_cmd(hdev, HCI_OP_READ_LOCAL_COMMANDS, 0, NULL); + + if (hdev->features[6] & LMP_SIMPLE_PAIR) { + u8 mode = 0x01; + hci_send_cmd(hdev, HCI_OP_WRITE_SSP_MODE, sizeof(mode), &mode); + } + + if (hdev->features[3] & LMP_RSSI_INQ) + hci_setup_inquiry_mode(hdev); + + if (hdev->features[7] & LMP_INQ_TX_PWR) + hci_send_cmd(hdev, HCI_OP_READ_INQ_RSP_TX_POWER, 0, NULL); + + if (hdev->features[7] & LMP_EXTFEATURES) { + struct hci_cp_read_local_ext_features cp; + + cp.page = 0x01; + hci_send_cmd(hdev, HCI_OP_READ_LOCAL_EXT_FEATURES, + sizeof(cp), &cp); + } + + if (hdev->features[4] & LMP_LE) + hci_set_le_support(hdev); +} + +static void hci_cc_read_local_version(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_rp_read_local_version *rp = (void *) skb->data; + + BT_DBG("%s status 0x%x", hdev->name, rp->status); + + if (rp->status) + return; + + hdev->hci_ver = rp->hci_ver; + hdev->hci_rev = __le16_to_cpu(rp->hci_rev); + hdev->lmp_ver = rp->lmp_ver; + hdev->manufacturer = __le16_to_cpu(rp->manufacturer); + hdev->lmp_subver = __le16_to_cpu(rp->lmp_subver); + + BT_DBG("%s manufacturer %d hci ver %d:%d", hdev->name, + hdev->manufacturer, + hdev->hci_ver, hdev->hci_rev); + + if (test_bit(HCI_INIT, &hdev->flags)) + hci_setup(hdev); +} + +static void hci_setup_link_policy(struct hci_dev *hdev) +{ + u16 link_policy = 0; + + if (hdev->features[0] & LMP_RSWITCH) + link_policy |= HCI_LP_RSWITCH; + if (hdev->features[0] & LMP_HOLD) + link_policy |= HCI_LP_HOLD; + if (hdev->features[0] & LMP_SNIFF) + link_policy |= HCI_LP_SNIFF; + if (hdev->features[1] & LMP_PARK) + link_policy |= HCI_LP_PARK; + + link_policy = cpu_to_le16(link_policy); + hci_send_cmd(hdev, HCI_OP_WRITE_DEF_LINK_POLICY, + sizeof(link_policy), &link_policy); +} + +static void hci_cc_read_local_commands(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_rp_read_local_commands *rp = (void *) skb->data; + + BT_DBG("%s status 0x%x", hdev->name, rp->status); + + if (rp->status) + goto 
done; + + memcpy(hdev->commands, rp->commands, sizeof(hdev->commands)); + + if (test_bit(HCI_INIT, &hdev->flags) && (hdev->commands[5] & 0x10)) + hci_setup_link_policy(hdev); + +done: + hci_req_complete(hdev, HCI_OP_READ_LOCAL_COMMANDS, rp->status); +} + +static void hci_cc_read_local_features(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_rp_read_local_features *rp = (void *) skb->data; + + BT_DBG("%s status 0x%x", hdev->name, rp->status); + + if (rp->status) + return; + + memcpy(hdev->features, rp->features, 8); + + /* Adjust default settings according to features + * supported by device. */ + + if (hdev->features[0] & LMP_3SLOT) + hdev->pkt_type |= (HCI_DM3 | HCI_DH3); + + if (hdev->features[0] & LMP_5SLOT) + hdev->pkt_type |= (HCI_DM5 | HCI_DH5); + + if (hdev->features[1] & LMP_HV2) { + hdev->pkt_type |= (HCI_HV2); + hdev->esco_type |= (ESCO_HV2); + } + + if (hdev->features[1] & LMP_HV3) { + hdev->pkt_type |= (HCI_HV3); + hdev->esco_type |= (ESCO_HV3); + } + + if (hdev->features[3] & LMP_ESCO) + hdev->esco_type |= (ESCO_EV3); + + if (hdev->features[4] & LMP_EV4) + hdev->esco_type |= (ESCO_EV4); + + if (hdev->features[4] & LMP_EV5) + hdev->esco_type |= (ESCO_EV5); + + if (hdev->features[5] & LMP_EDR_ESCO_2M) + hdev->esco_type |= (ESCO_2EV3); + + if (hdev->features[5] & LMP_EDR_ESCO_3M) + hdev->esco_type |= (ESCO_3EV3); + + if (hdev->features[5] & LMP_EDR_3S_ESCO) + hdev->esco_type |= (ESCO_2EV5 | ESCO_3EV5); + + BT_DBG("%s features 0x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x", hdev->name, + hdev->features[0], hdev->features[1], + hdev->features[2], hdev->features[3], + hdev->features[4], hdev->features[5], + hdev->features[6], hdev->features[7]); +} + +static void hci_cc_read_local_ext_features(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_rp_read_local_ext_features *rp = (void *) skb->data; + + BT_DBG("%s status 0x%x", hdev->name, rp->status); + + if (rp->status) + return; + + memcpy(hdev->extfeatures, rp->features, 8); + + hci_req_complete(hdev, HCI_OP_READ_LOCAL_EXT_FEATURES, rp->status); +} + +static void hci_cc_read_buffer_size(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_rp_read_buffer_size *rp = (void *) skb->data; + + BT_DBG("%s status 0x%x", hdev->name, rp->status); + + if (rp->status) + return; + + hdev->acl_mtu = __le16_to_cpu(rp->acl_mtu); + hdev->sco_mtu = rp->sco_mtu; + hdev->acl_pkts = __le16_to_cpu(rp->acl_max_pkt); + hdev->sco_pkts = __le16_to_cpu(rp->sco_max_pkt); + + if (test_bit(HCI_QUIRK_FIXUP_BUFFER_SIZE, &hdev->quirks)) { + hdev->sco_mtu = 64; + hdev->sco_pkts = 8; + } + + hdev->acl_cnt = hdev->acl_pkts; + hdev->sco_cnt = hdev->sco_pkts; + + BT_DBG("%s acl mtu %d:%d sco mtu %d:%d", hdev->name, + hdev->acl_mtu, hdev->acl_pkts, + hdev->sco_mtu, hdev->sco_pkts); +} + +static void hci_cc_read_bd_addr(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_rp_read_bd_addr *rp = (void *) skb->data; + + BT_DBG("%s status 0x%x", hdev->name, rp->status); + + if (!rp->status) + bacpy(&hdev->bdaddr, &rp->bdaddr); + + hci_req_complete(hdev, HCI_OP_READ_BD_ADDR, rp->status); +} + +static void hci_cc_write_ca_timeout(struct hci_dev *hdev, struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%x", hdev->name, status); + + hci_req_complete(hdev, HCI_OP_WRITE_CA_TIMEOUT, status); +} + +static void hci_cc_delete_stored_link_key(struct hci_dev *hdev, + struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%x", hdev->name, status); + + hci_req_complete(hdev, 
HCI_OP_DELETE_STORED_LINK_KEY, status); +} + +static void hci_cc_set_event_mask(struct hci_dev *hdev, struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%x", hdev->name, status); + + hci_req_complete(hdev, HCI_OP_SET_EVENT_MASK, status); +} + +static void hci_cc_write_inquiry_mode(struct hci_dev *hdev, + struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%x", hdev->name, status); + + hci_req_complete(hdev, HCI_OP_WRITE_INQUIRY_MODE, status); +} + +static void hci_cc_read_inq_rsp_tx_power(struct hci_dev *hdev, + struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%x", hdev->name, status); + + hci_req_complete(hdev, HCI_OP_READ_INQ_RSP_TX_POWER, status); +} + +static void hci_cc_set_event_flt(struct hci_dev *hdev, struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%x", hdev->name, status); + + hci_req_complete(hdev, HCI_OP_SET_EVENT_FLT, status); +} + +static void hci_cc_pin_code_reply(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_rp_pin_code_reply *rp = (void *) skb->data; + struct hci_cp_pin_code_reply *cp; + struct hci_conn *conn; + + BT_DBG("%s status 0x%x", hdev->name, rp->status); + + if (test_bit(HCI_MGMT, &hdev->flags)) + mgmt_pin_code_reply_complete(hdev->id, &rp->bdaddr, rp->status); + + if (rp->status != 0) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_PIN_CODE_REPLY); + if (!cp) + return; + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->bdaddr); + if (conn) + conn->pin_length = cp->pin_len; +} + +static void hci_cc_pin_code_neg_reply(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_rp_pin_code_neg_reply *rp = (void *) skb->data; + + BT_DBG("%s status 0x%x", hdev->name, rp->status); + + if (test_bit(HCI_MGMT, &hdev->flags)) + mgmt_pin_code_neg_reply_complete(hdev->id, &rp->bdaddr, + rp->status); +} +static void hci_cc_le_read_buffer_size(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_rp_le_read_buffer_size *rp = (void *) skb->data; + + BT_DBG("%s status 0x%x", hdev->name, rp->status); + + if (rp->status) + return; + + hdev->le_mtu = __le16_to_cpu(rp->le_mtu); + hdev->le_pkts = rp->le_max_pkt; + + hdev->le_cnt = hdev->le_pkts; + + BT_DBG("%s le mtu %d:%d", hdev->name, hdev->le_mtu, hdev->le_pkts); + + hci_req_complete(hdev, HCI_OP_LE_READ_BUFFER_SIZE, rp->status); +} + +static void hci_cc_user_confirm_reply(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_rp_user_confirm_reply *rp = (void *) skb->data; + + BT_DBG("%s status 0x%x", hdev->name, rp->status); + + if (test_bit(HCI_MGMT, &hdev->flags)) + mgmt_user_confirm_reply_complete(hdev->id, &rp->bdaddr, + rp->status); +} + +static void hci_cc_user_confirm_neg_reply(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_rp_user_confirm_reply *rp = (void *) skb->data; + + BT_DBG("%s status 0x%x", hdev->name, rp->status); + + if (test_bit(HCI_MGMT, &hdev->flags)) + mgmt_user_confirm_neg_reply_complete(hdev->id, &rp->bdaddr, + rp->status); +} + +static void hci_cc_read_local_oob_data_reply(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_rp_read_local_oob_data *rp = (void *) skb->data; + + BT_DBG("%s status 0x%x", hdev->name, rp->status); + + mgmt_read_local_oob_data_reply_complete(hdev->id, rp->hash, + rp->randomizer, rp->status); +} + +static void hci_cc_le_set_scan_enable(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_cp_le_set_scan_enable *cp; + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 
0x%x", hdev->name, status); + + if (status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_SCAN_ENABLE); + if (!cp) + return; + + hci_dev_lock(hdev); + + if (cp->enable == 0x01) { + del_timer(&hdev->adv_timer); + hci_adv_entries_clear(hdev); + } else if (cp->enable == 0x00) { + mod_timer(&hdev->adv_timer, jiffies + ADV_CLEAR_TIMEOUT); + } + + hci_dev_unlock(hdev); +} + +static void hci_cc_le_ltk_reply(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_rp_le_ltk_reply *rp = (void *) skb->data; + + BT_DBG("%s status 0x%x", hdev->name, rp->status); + + if (rp->status) + return; + + hci_req_complete(hdev, HCI_OP_LE_LTK_REPLY, rp->status); +} + +static void hci_cc_le_ltk_neg_reply(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_rp_le_ltk_neg_reply *rp = (void *) skb->data; + + BT_DBG("%s status 0x%x", hdev->name, rp->status); + + if (rp->status) + return; + + hci_req_complete(hdev, HCI_OP_LE_LTK_NEG_REPLY, rp->status); +} + +static inline void hci_cc_write_le_host_supported(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_cp_read_local_ext_features cp; + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%x", hdev->name, status); + + if (status) + return; + + cp.page = 0x01; + hci_send_cmd(hdev, HCI_OP_READ_LOCAL_EXT_FEATURES, sizeof(cp), &cp); +} + +static inline void hci_cs_inquiry(struct hci_dev *hdev, __u8 status) +{ + BT_DBG("%s status 0x%x", hdev->name, status); + + if (status) { + hci_req_complete(hdev, HCI_OP_INQUIRY, status); + hci_conn_check_pending(hdev); + return; + } + + if (!test_and_set_bit(HCI_INQUIRY, &hdev->flags) && + test_bit(HCI_MGMT, &hdev->flags)) + mgmt_discovering(hdev->id, 1); +} + +static inline void hci_cs_create_conn(struct hci_dev *hdev, __u8 status) +{ + struct hci_cp_create_conn *cp; + struct hci_conn *conn; + + BT_DBG("%s status 0x%x", hdev->name, status); + + cp = hci_sent_cmd_data(hdev, HCI_OP_CREATE_CONN); + if (!cp) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->bdaddr); + + BT_DBG("%s bdaddr %s conn %p", hdev->name, batostr(&cp->bdaddr), conn); + + if (status) { + if (conn && conn->state == BT_CONNECT) { + if (status != 0x0c || conn->attempt > 2) { + conn->state = BT_CLOSED; + hci_proto_connect_cfm(conn, status); + hci_conn_del(conn); + } else + conn->state = BT_CONNECT2; + } + } else { + if (!conn) { + conn = hci_conn_add(hdev, ACL_LINK, 0, &cp->bdaddr); + if (conn) { + conn->out = 1; + conn->link_mode |= HCI_LM_MASTER; + } else + BT_ERR("No memory for new connection"); + } + } + + hci_dev_unlock(hdev); +} + +static void hci_cs_add_sco(struct hci_dev *hdev, __u8 status) +{ + struct hci_cp_add_sco *cp; + struct hci_conn *acl, *sco; + __u16 handle; + + BT_DBG("%s status 0x%x", hdev->name, status); + + if (!status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_ADD_SCO); + if (!cp) + return; + + handle = __le16_to_cpu(cp->handle); + + BT_DBG("%s handle %d", hdev->name, handle); + + hci_dev_lock(hdev); + + acl = hci_conn_hash_lookup_handle(hdev, handle); + if (acl) { + sco = acl->link; + if (sco) { + sco->state = BT_CLOSED; + + hci_proto_connect_cfm(sco, status); + hci_conn_del(sco); + } + } + + hci_dev_unlock(hdev); +} + +static void hci_cs_auth_requested(struct hci_dev *hdev, __u8 status) +{ + struct hci_cp_auth_requested *cp; + struct hci_conn *conn; + + BT_DBG("%s status 0x%x", hdev->name, status); + + if (!status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_AUTH_REQUESTED); + if (!cp) + return; + + hci_dev_lock(hdev); + + conn = 
hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle)); + if (conn) { + if (conn->state == BT_CONFIG) { + hci_proto_connect_cfm(conn, status); + hci_conn_put(conn); + } + } + + hci_dev_unlock(hdev); +} + +static void hci_cs_set_conn_encrypt(struct hci_dev *hdev, __u8 status) +{ + struct hci_cp_set_conn_encrypt *cp; + struct hci_conn *conn; + + BT_DBG("%s status 0x%x", hdev->name, status); + + if (!status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_SET_CONN_ENCRYPT); + if (!cp) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle)); + if (conn) { + if (conn->state == BT_CONFIG) { + hci_proto_connect_cfm(conn, status); + hci_conn_put(conn); + } + } + + hci_dev_unlock(hdev); +} + +static int hci_outgoing_auth_needed(struct hci_dev *hdev, + struct hci_conn *conn) +{ + if (conn->state != BT_CONFIG || !conn->out) + return 0; + + if (conn->pending_sec_level == BT_SECURITY_SDP) + return 0; + + /* Only request authentication for SSP connections or non-SSP + * devices with sec_level HIGH */ + if (!(hdev->ssp_mode > 0 && conn->ssp_mode > 0) && + conn->pending_sec_level != BT_SECURITY_HIGH) + return 0; + + return 1; +} + +static void hci_cs_remote_name_req(struct hci_dev *hdev, __u8 status) +{ + struct hci_cp_remote_name_req *cp; + struct hci_conn *conn; + + BT_DBG("%s status 0x%x", hdev->name, status); + + /* If successful wait for the name req complete event before + * checking for the need to do authentication */ + if (!status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_REMOTE_NAME_REQ); + if (!cp) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->bdaddr); + if (!conn) + goto unlock; + + if (!hci_outgoing_auth_needed(hdev, conn)) + goto unlock; + + if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend)) { + struct hci_cp_auth_requested cp; + cp.handle = __cpu_to_le16(conn->handle); + hci_send_cmd(hdev, HCI_OP_AUTH_REQUESTED, sizeof(cp), &cp); + } + +unlock: + hci_dev_unlock(hdev); +} + +static void hci_cs_read_remote_features(struct hci_dev *hdev, __u8 status) +{ + struct hci_cp_read_remote_features *cp; + struct hci_conn *conn; + + BT_DBG("%s status 0x%x", hdev->name, status); + + if (!status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_READ_REMOTE_FEATURES); + if (!cp) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle)); + if (conn) { + if (conn->state == BT_CONFIG) { + hci_proto_connect_cfm(conn, status); + hci_conn_put(conn); + } + } + + hci_dev_unlock(hdev); +} + +static void hci_cs_read_remote_ext_features(struct hci_dev *hdev, __u8 status) +{ + struct hci_cp_read_remote_ext_features *cp; + struct hci_conn *conn; + + BT_DBG("%s status 0x%x", hdev->name, status); + + if (!status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_READ_REMOTE_EXT_FEATURES); + if (!cp) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle)); + if (conn) { + if (conn->state == BT_CONFIG) { + hci_proto_connect_cfm(conn, status); + hci_conn_put(conn); + } + } + + hci_dev_unlock(hdev); +} + +static void hci_cs_setup_sync_conn(struct hci_dev *hdev, __u8 status) +{ + struct hci_cp_setup_sync_conn *cp; + struct hci_conn *acl, *sco; + __u16 handle; + + BT_DBG("%s status 0x%x", hdev->name, status); + + if (!status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_SETUP_SYNC_CONN); + if (!cp) + return; + + handle = __le16_to_cpu(cp->handle); + + BT_DBG("%s handle %d", hdev->name, handle); + + 
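+	/* Descriptive note (added): this path runs only when the Setup Synchronous
+	 * Connection command failed; find the ACL connection that owns the pending
+	 * SCO/eSCO link, mark that link closed and tell the protocol layer the
+	 * setup did not succeed before deleting it. */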
hci_dev_lock(hdev); + + acl = hci_conn_hash_lookup_handle(hdev, handle); + if (acl) { + sco = acl->link; + if (sco) { + sco->state = BT_CLOSED; + + hci_proto_connect_cfm(sco, status); + hci_conn_del(sco); + } + } + + hci_dev_unlock(hdev); +} + +static void hci_cs_sniff_mode(struct hci_dev *hdev, __u8 status) +{ + struct hci_cp_sniff_mode *cp; + struct hci_conn *conn; + + BT_DBG("%s status 0x%x", hdev->name, status); + + if (!status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_SNIFF_MODE); + if (!cp) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle)); + if (conn) { + clear_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->pend); + + if (test_and_clear_bit(HCI_CONN_SCO_SETUP_PEND, &conn->pend)) + hci_sco_setup(conn, status); + } + + hci_dev_unlock(hdev); +} + +static void hci_cs_exit_sniff_mode(struct hci_dev *hdev, __u8 status) +{ + struct hci_cp_exit_sniff_mode *cp; + struct hci_conn *conn; + + BT_DBG("%s status 0x%x", hdev->name, status); + + if (!status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_EXIT_SNIFF_MODE); + if (!cp) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle)); + if (conn) { + clear_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->pend); + + if (test_and_clear_bit(HCI_CONN_SCO_SETUP_PEND, &conn->pend)) + hci_sco_setup(conn, status); + } + + hci_dev_unlock(hdev); +} + +static void hci_cs_le_create_conn(struct hci_dev *hdev, __u8 status) +{ + struct hci_cp_le_create_conn *cp; + struct hci_conn *conn; + + BT_DBG("%s status 0x%x", hdev->name, status); + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_CREATE_CONN); + if (!cp) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &cp->peer_addr); + + BT_DBG("%s bdaddr %s conn %p", hdev->name, batostr(&cp->peer_addr), + conn); + + if (status) { + if (conn && conn->state == BT_CONNECT) { + conn->state = BT_CLOSED; + hci_proto_connect_cfm(conn, status); + hci_conn_del(conn); + } + } else { + if (!conn) { + conn = hci_conn_add(hdev, LE_LINK, 0, &cp->peer_addr); + if (conn) { + conn->dst_type = cp->peer_addr_type; + conn->out = 1; + } else { + BT_ERR("No memory for new connection"); + } + } + } + + hci_dev_unlock(hdev); +} + +static void hci_cs_le_start_enc(struct hci_dev *hdev, u8 status) +{ + BT_DBG("%s status 0x%x", hdev->name, status); +} + +static inline void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status %d", hdev->name, status); + + if (test_and_clear_bit(HCI_INQUIRY, &hdev->flags) && + test_bit(HCI_MGMT, &hdev->flags)) + mgmt_discovering(hdev->id, 0); + + hci_req_complete(hdev, HCI_OP_INQUIRY, status); + + hci_conn_check_pending(hdev); +} + +static inline void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct inquiry_data data; + struct inquiry_info *info = (void *) (skb->data + 1); + int num_rsp = *((__u8 *) skb->data); + + BT_DBG("%s num_rsp %d", hdev->name, num_rsp); + + if (!num_rsp) + return; + + hci_dev_lock(hdev); + + if (!test_and_set_bit(HCI_INQUIRY, &hdev->flags)) { + + if (test_bit(HCI_MGMT, &hdev->flags)) + mgmt_discovering(hdev->id, 1); + } + + for (; num_rsp; num_rsp--, info++) { + bacpy(&data.bdaddr, &info->bdaddr); + data.pscan_rep_mode = info->pscan_rep_mode; + data.pscan_period_mode = info->pscan_period_mode; + data.pscan_mode = info->pscan_mode; + memcpy(data.dev_class, info->dev_class, 3); + data.clock_offset = info->clock_offset; + data.rssi = 0x00; + data.ssp_mode = 0x00; 
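+		/* Descriptive note (added): legacy inquiry results carry neither RSSI
+		 * nor extended inquiry data, so those fields were zeroed above; each
+		 * response is then added to the inquiry cache and reported through the
+		 * management interface. */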
+ hci_inquiry_cache_update(hdev, &data); + mgmt_device_found(hdev->id, &info->bdaddr, info->dev_class, 0, + NULL); + } + + hci_dev_unlock(hdev); +} + +static inline void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_conn_complete *ev = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr); + if (!conn) { + if (ev->link_type != SCO_LINK) + goto unlock; + + conn = hci_conn_hash_lookup_ba(hdev, ESCO_LINK, &ev->bdaddr); + if (!conn) + goto unlock; + + conn->type = SCO_LINK; + } + + if (!ev->status) { + conn->handle = __le16_to_cpu(ev->handle); + + if (conn->type == ACL_LINK) { + conn->state = BT_CONFIG; + hci_conn_hold(conn); + conn->disc_timeout = HCI_DISCONN_TIMEOUT; + mgmt_connected(hdev->id, &ev->bdaddr); + } else + conn->state = BT_CONNECTED; + + hci_conn_hold_device(conn); + hci_conn_add_sysfs(conn); + + if (test_bit(HCI_AUTH, &hdev->flags)) + conn->link_mode |= HCI_LM_AUTH; + + if (test_bit(HCI_ENCRYPT, &hdev->flags)) + conn->link_mode |= HCI_LM_ENCRYPT; + + /* Get remote features */ + if (conn->type == ACL_LINK) { + struct hci_cp_read_remote_features cp; + cp.handle = ev->handle; + hci_send_cmd(hdev, HCI_OP_READ_REMOTE_FEATURES, + sizeof(cp), &cp); + } + + /* Set packet type for incoming connection */ + if (!conn->out && hdev->hci_ver < 3) { + struct hci_cp_change_conn_ptype cp; + cp.handle = ev->handle; + cp.pkt_type = cpu_to_le16(conn->pkt_type); + hci_send_cmd(hdev, HCI_OP_CHANGE_CONN_PTYPE, + sizeof(cp), &cp); + } + } else { + conn->state = BT_CLOSED; + if (conn->type == ACL_LINK) + mgmt_connect_failed(hdev->id, &ev->bdaddr, ev->status); + } + + if (conn->type == ACL_LINK) + hci_sco_setup(conn, ev->status); + + if (ev->status) { + hci_proto_connect_cfm(conn, ev->status); + hci_conn_del(conn); + } else if (ev->link_type != ACL_LINK) + hci_proto_connect_cfm(conn, ev->status); + +unlock: + hci_dev_unlock(hdev); + + hci_conn_check_pending(hdev); +} + +static inline bool is_sco_active(struct hci_dev *hdev) +{ + if (hci_conn_hash_lookup_state(hdev, SCO_LINK, BT_CONNECTED) || + (hci_conn_hash_lookup_state(hdev, ESCO_LINK, + BT_CONNECTED))) + return true; + return false; +} + +static inline void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_conn_request *ev = (void *) skb->data; + int mask = hdev->link_mode; + + BT_DBG("%s bdaddr %s type 0x%x", hdev->name, + batostr(&ev->bdaddr), ev->link_type); + + mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, ev->link_type); + + if ((mask & HCI_LM_ACCEPT) && + !hci_blacklist_lookup(hdev, &ev->bdaddr)) { + /* Connection accepted */ + struct inquiry_entry *ie; + struct hci_conn *conn; + + hci_dev_lock(hdev); + + ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr); + if (ie) + memcpy(ie->data.dev_class, ev->dev_class, 3); + + conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr); + if (!conn) { + /* pkt_type not yet used for incoming connections */ + conn = hci_conn_add(hdev, ev->link_type, 0, &ev->bdaddr); + if (!conn) { + BT_ERR("No memory for new connection"); + hci_dev_unlock(hdev); + return; + } + } + + memcpy(conn->dev_class, ev->dev_class, 3); + conn->state = BT_CONNECT; + + hci_dev_unlock(hdev); + + if (ev->link_type == ACL_LINK || !lmp_esco_capable(hdev)) { + struct hci_cp_accept_conn_req cp; + + bacpy(&cp.bdaddr, &ev->bdaddr); + + if (lmp_rswitch_capable(hdev) && ((mask & HCI_LM_MASTER) + || is_sco_active(hdev))) + cp.role = 0x00; /* Become master */ 
+ else + cp.role = 0x01; /* Remain slave */ + + hci_send_cmd(hdev, HCI_OP_ACCEPT_CONN_REQ, + sizeof(cp), &cp); + } else { + struct hci_cp_accept_sync_conn_req cp; + + bacpy(&cp.bdaddr, &ev->bdaddr); + cp.pkt_type = cpu_to_le16(conn->pkt_type); + + cp.tx_bandwidth = cpu_to_le32(0x00001f40); + cp.rx_bandwidth = cpu_to_le32(0x00001f40); + cp.max_latency = cpu_to_le16(0xffff); + cp.content_format = cpu_to_le16(hdev->voice_setting); + cp.retrans_effort = 0xff; + + hci_send_cmd(hdev, HCI_OP_ACCEPT_SYNC_CONN_REQ, + sizeof(cp), &cp); + } + } else { + /* Connection rejected */ + struct hci_cp_reject_conn_req cp; + + bacpy(&cp.bdaddr, &ev->bdaddr); + cp.reason = 0x0f; + hci_send_cmd(hdev, HCI_OP_REJECT_CONN_REQ, sizeof(cp), &cp); + } +} + +static inline void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_disconn_complete *ev = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s status %d", hdev->name, ev->status); + + if (ev->status) { + mgmt_disconnect_failed(hdev->id); + return; + } + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); + if (!conn) + goto unlock; + + conn->state = BT_CLOSED; + + if (conn->type == ACL_LINK || conn->type == LE_LINK) + mgmt_disconnected(hdev->id, &conn->dst); + + hci_proto_disconn_cfm(conn, ev->reason); + hci_conn_del(conn); + +unlock: + hci_dev_unlock(hdev); +} + +static inline void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_auth_complete *ev = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s status %d", hdev->name, ev->status); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); + if (!conn) + goto unlock; + + if (!ev->status) { + if (!(conn->ssp_mode > 0 && hdev->ssp_mode > 0) && + test_bit(HCI_CONN_REAUTH_PEND, &conn->pend)) { + BT_INFO("re-auth of legacy device is not possible."); + } else { + conn->link_mode |= HCI_LM_AUTH; + conn->sec_level = conn->pending_sec_level; + } + } else { + mgmt_auth_failed(hdev->id, &conn->dst, ev->status); + } + + clear_bit(HCI_CONN_AUTH_PEND, &conn->pend); + clear_bit(HCI_CONN_REAUTH_PEND, &conn->pend); + + if (conn->state == BT_CONFIG) { + if (!ev->status && hdev->ssp_mode > 0 && conn->ssp_mode > 0) { + struct hci_cp_set_conn_encrypt cp; + cp.handle = ev->handle; + cp.encrypt = 0x01; + hci_send_cmd(hdev, HCI_OP_SET_CONN_ENCRYPT, sizeof(cp), + &cp); + } else { + conn->state = BT_CONNECTED; + hci_proto_connect_cfm(conn, ev->status); + hci_conn_put(conn); + } + } else { + hci_auth_cfm(conn, ev->status); + + hci_conn_hold(conn); + conn->disc_timeout = HCI_DISCONN_TIMEOUT; + hci_conn_put(conn); + } + + if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend)) { + if (!ev->status) { + struct hci_cp_set_conn_encrypt cp; + cp.handle = ev->handle; + cp.encrypt = 0x01; + hci_send_cmd(hdev, HCI_OP_SET_CONN_ENCRYPT, sizeof(cp), + &cp); + } else { + clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend); + hci_encrypt_cfm(conn, ev->status, 0x00); + } + } + +unlock: + hci_dev_unlock(hdev); +} + +static inline void hci_remote_name_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_remote_name *ev = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s", hdev->name); + + hci_conn_check_pending(hdev); + + hci_dev_lock(hdev); + + if (ev->status == 0 && test_bit(HCI_MGMT, &hdev->flags)) + mgmt_remote_name(hdev->id, &ev->bdaddr, ev->name); + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); + if (!conn) + goto unlock; + + if 
(!hci_outgoing_auth_needed(hdev, conn)) + goto unlock; + + if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend)) { + struct hci_cp_auth_requested cp; + cp.handle = __cpu_to_le16(conn->handle); + hci_send_cmd(hdev, HCI_OP_AUTH_REQUESTED, sizeof(cp), &cp); + } + +unlock: + hci_dev_unlock(hdev); +} + +static inline void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_encrypt_change *ev = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s status %d", hdev->name, ev->status); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); + if (conn) { + if (!ev->status) { + if (ev->encrypt) { + /* Encryption implies authentication */ + conn->link_mode |= HCI_LM_AUTH; + conn->link_mode |= HCI_LM_ENCRYPT; + conn->sec_level = conn->pending_sec_level; + } else + conn->link_mode &= ~HCI_LM_ENCRYPT; + } + + clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend); + + if (conn->state == BT_CONFIG) { + if (!ev->status) + conn->state = BT_CONNECTED; + + hci_proto_connect_cfm(conn, ev->status); + hci_conn_put(conn); + } else + hci_encrypt_cfm(conn, ev->status, ev->encrypt); + } + + hci_dev_unlock(hdev); +} + +static inline void hci_change_link_key_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_change_link_key_complete *ev = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s status %d", hdev->name, ev->status); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); + if (conn) { + if (!ev->status) + conn->link_mode |= HCI_LM_SECURE; + + clear_bit(HCI_CONN_AUTH_PEND, &conn->pend); + + hci_key_change_cfm(conn, ev->status); + } + + hci_dev_unlock(hdev); +} + +static inline void hci_remote_features_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_remote_features *ev = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s status %d", hdev->name, ev->status); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); + if (!conn) + goto unlock; + + if (!ev->status) + memcpy(conn->features, ev->features, 8); + + if (conn->state != BT_CONFIG) + goto unlock; + + if (!ev->status && lmp_ssp_capable(hdev) && lmp_ssp_capable(conn)) { + struct hci_cp_read_remote_ext_features cp; + cp.handle = ev->handle; + cp.page = 0x01; + hci_send_cmd(hdev, HCI_OP_READ_REMOTE_EXT_FEATURES, + sizeof(cp), &cp); + goto unlock; + } + + if (!ev->status) { + struct hci_cp_remote_name_req cp; + memset(&cp, 0, sizeof(cp)); + bacpy(&cp.bdaddr, &conn->dst); + cp.pscan_rep_mode = 0x02; + hci_send_cmd(hdev, HCI_OP_REMOTE_NAME_REQ, sizeof(cp), &cp); + } + + if (!hci_outgoing_auth_needed(hdev, conn)) { + conn->state = BT_CONNECTED; + hci_proto_connect_cfm(conn, ev->status); + hci_conn_put(conn); + } + +unlock: + hci_dev_unlock(hdev); +} + +static inline void hci_remote_version_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + BT_DBG("%s", hdev->name); +} + +static inline void hci_qos_setup_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + BT_DBG("%s", hdev->name); +} + +static inline void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_cmd_complete *ev = (void *) skb->data; + __u16 opcode; + + skb_pull(skb, sizeof(*ev)); + + opcode = __le16_to_cpu(ev->opcode); + + switch (opcode) { + case HCI_OP_INQUIRY_CANCEL: + hci_cc_inquiry_cancel(hdev, skb); + break; + + case HCI_OP_EXIT_PERIODIC_INQ: + hci_cc_exit_periodic_inq(hdev, skb); + break; + + case HCI_OP_REMOTE_NAME_REQ_CANCEL: + 
hci_cc_remote_name_req_cancel(hdev, skb); + break; + + case HCI_OP_ROLE_DISCOVERY: + hci_cc_role_discovery(hdev, skb); + break; + + case HCI_OP_READ_LINK_POLICY: + hci_cc_read_link_policy(hdev, skb); + break; + + case HCI_OP_WRITE_LINK_POLICY: + hci_cc_write_link_policy(hdev, skb); + break; + + case HCI_OP_READ_DEF_LINK_POLICY: + hci_cc_read_def_link_policy(hdev, skb); + break; + + case HCI_OP_WRITE_DEF_LINK_POLICY: + hci_cc_write_def_link_policy(hdev, skb); + break; + + case HCI_OP_RESET: + hci_cc_reset(hdev, skb); + break; + + case HCI_OP_WRITE_LOCAL_NAME: + hci_cc_write_local_name(hdev, skb); + break; + + case HCI_OP_READ_LOCAL_NAME: + hci_cc_read_local_name(hdev, skb); + break; + + case HCI_OP_WRITE_AUTH_ENABLE: + hci_cc_write_auth_enable(hdev, skb); + break; + + case HCI_OP_WRITE_ENCRYPT_MODE: + hci_cc_write_encrypt_mode(hdev, skb); + break; + + case HCI_OP_WRITE_SCAN_ENABLE: + hci_cc_write_scan_enable(hdev, skb); + break; + + case HCI_OP_READ_CLASS_OF_DEV: + hci_cc_read_class_of_dev(hdev, skb); + break; + + case HCI_OP_WRITE_CLASS_OF_DEV: + hci_cc_write_class_of_dev(hdev, skb); + break; + + case HCI_OP_READ_VOICE_SETTING: + hci_cc_read_voice_setting(hdev, skb); + break; + + case HCI_OP_WRITE_VOICE_SETTING: + hci_cc_write_voice_setting(hdev, skb); + break; + + case HCI_OP_HOST_BUFFER_SIZE: + hci_cc_host_buffer_size(hdev, skb); + break; + + case HCI_OP_READ_SSP_MODE: + hci_cc_read_ssp_mode(hdev, skb); + break; + + case HCI_OP_WRITE_SSP_MODE: + hci_cc_write_ssp_mode(hdev, skb); + break; + + case HCI_OP_READ_LOCAL_VERSION: + hci_cc_read_local_version(hdev, skb); + break; + + case HCI_OP_READ_LOCAL_COMMANDS: + hci_cc_read_local_commands(hdev, skb); + break; + + case HCI_OP_READ_LOCAL_FEATURES: + hci_cc_read_local_features(hdev, skb); + break; + + case HCI_OP_READ_LOCAL_EXT_FEATURES: + hci_cc_read_local_ext_features(hdev, skb); + break; + + case HCI_OP_READ_BUFFER_SIZE: + hci_cc_read_buffer_size(hdev, skb); + break; + + case HCI_OP_READ_BD_ADDR: + hci_cc_read_bd_addr(hdev, skb); + break; + + case HCI_OP_WRITE_CA_TIMEOUT: + hci_cc_write_ca_timeout(hdev, skb); + break; + + case HCI_OP_DELETE_STORED_LINK_KEY: + hci_cc_delete_stored_link_key(hdev, skb); + break; + + case HCI_OP_SET_EVENT_MASK: + hci_cc_set_event_mask(hdev, skb); + break; + + case HCI_OP_WRITE_INQUIRY_MODE: + hci_cc_write_inquiry_mode(hdev, skb); + break; + + case HCI_OP_READ_INQ_RSP_TX_POWER: + hci_cc_read_inq_rsp_tx_power(hdev, skb); + break; + + case HCI_OP_SET_EVENT_FLT: + hci_cc_set_event_flt(hdev, skb); + break; + + case HCI_OP_PIN_CODE_REPLY: + hci_cc_pin_code_reply(hdev, skb); + break; + + case HCI_OP_PIN_CODE_NEG_REPLY: + hci_cc_pin_code_neg_reply(hdev, skb); + break; + + case HCI_OP_READ_LOCAL_OOB_DATA: + hci_cc_read_local_oob_data_reply(hdev, skb); + break; + + case HCI_OP_LE_READ_BUFFER_SIZE: + hci_cc_le_read_buffer_size(hdev, skb); + break; + + case HCI_OP_USER_CONFIRM_REPLY: + hci_cc_user_confirm_reply(hdev, skb); + break; + + case HCI_OP_USER_CONFIRM_NEG_REPLY: + hci_cc_user_confirm_neg_reply(hdev, skb); + break; + + case HCI_OP_LE_SET_SCAN_ENABLE: + hci_cc_le_set_scan_enable(hdev, skb); + break; + + case HCI_OP_LE_LTK_REPLY: + hci_cc_le_ltk_reply(hdev, skb); + break; + + case HCI_OP_LE_LTK_NEG_REPLY: + hci_cc_le_ltk_neg_reply(hdev, skb); + break; + + case HCI_OP_WRITE_LE_HOST_SUPPORTED: + hci_cc_write_le_host_supported(hdev, skb); + break; + + default: + BT_DBG("%s opcode 0x%x", hdev->name, opcode); + break; + } + + if (ev->opcode != HCI_OP_NOP) + del_timer(&hdev->cmd_timer); + + if (ev->ncmd) { + 
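+		/* Descriptive note (added): a non-zero ncmd means the controller is
+		 * ready for more commands, so re-arm the command credit (one
+		 * outstanding command at a time) and kick the command tasklet if
+		 * anything is still queued. */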
atomic_set(&hdev->cmd_cnt, 1); + if (!skb_queue_empty(&hdev->cmd_q)) + tasklet_schedule(&hdev->cmd_task); + } +} + +static inline void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_cmd_status *ev = (void *) skb->data; + __u16 opcode; + + skb_pull(skb, sizeof(*ev)); + + opcode = __le16_to_cpu(ev->opcode); + + switch (opcode) { + case HCI_OP_INQUIRY: + hci_cs_inquiry(hdev, ev->status); + break; + + case HCI_OP_CREATE_CONN: + hci_cs_create_conn(hdev, ev->status); + break; + + case HCI_OP_ADD_SCO: + hci_cs_add_sco(hdev, ev->status); + break; + + case HCI_OP_AUTH_REQUESTED: + hci_cs_auth_requested(hdev, ev->status); + break; + + case HCI_OP_SET_CONN_ENCRYPT: + hci_cs_set_conn_encrypt(hdev, ev->status); + break; + + case HCI_OP_REMOTE_NAME_REQ: + hci_cs_remote_name_req(hdev, ev->status); + break; + + case HCI_OP_READ_REMOTE_FEATURES: + hci_cs_read_remote_features(hdev, ev->status); + break; + + case HCI_OP_READ_REMOTE_EXT_FEATURES: + hci_cs_read_remote_ext_features(hdev, ev->status); + break; + + case HCI_OP_SETUP_SYNC_CONN: + hci_cs_setup_sync_conn(hdev, ev->status); + break; + + case HCI_OP_SNIFF_MODE: + hci_cs_sniff_mode(hdev, ev->status); + break; + + case HCI_OP_EXIT_SNIFF_MODE: + hci_cs_exit_sniff_mode(hdev, ev->status); + break; + + case HCI_OP_DISCONNECT: + if (ev->status != 0) + mgmt_disconnect_failed(hdev->id); + break; + + case HCI_OP_LE_CREATE_CONN: + hci_cs_le_create_conn(hdev, ev->status); + break; + + case HCI_OP_LE_START_ENC: + hci_cs_le_start_enc(hdev, ev->status); + break; + + default: + BT_DBG("%s opcode 0x%x", hdev->name, opcode); + break; + } + + if (ev->opcode != HCI_OP_NOP) + del_timer(&hdev->cmd_timer); + + if (ev->ncmd && !test_bit(HCI_RESET, &hdev->flags)) { + atomic_set(&hdev->cmd_cnt, 1); + if (!skb_queue_empty(&hdev->cmd_q)) + tasklet_schedule(&hdev->cmd_task); + } +} + +static inline void hci_role_change_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_role_change *ev = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s status %d", hdev->name, ev->status); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); + if (conn) { + if (!ev->status) { + if (ev->role) + conn->link_mode &= ~HCI_LM_MASTER; + else + conn->link_mode |= HCI_LM_MASTER; + } + + clear_bit(HCI_CONN_RSWITCH_PEND, &conn->pend); + + hci_role_switch_cfm(conn, ev->status, ev->role); + } + + hci_dev_unlock(hdev); +} + +static inline void hci_num_comp_pkts_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_num_comp_pkts *ev = (void *) skb->data; + __le16 *ptr; + int i; + + skb_pull(skb, sizeof(*ev)); + + BT_DBG("%s num_hndl %d", hdev->name, ev->num_hndl); + + if (skb->len < ev->num_hndl * 4) { + BT_DBG("%s bad parameters", hdev->name); + return; + } + + tasklet_disable(&hdev->tx_task); + + for (i = 0, ptr = (__le16 *) skb->data; i < ev->num_hndl; i++) { + struct hci_conn *conn; + __u16 handle, count; + + handle = get_unaligned_le16(ptr++); + count = get_unaligned_le16(ptr++); + + conn = hci_conn_hash_lookup_handle(hdev, handle); + if (conn) { + conn->sent -= count; + + if (conn->type == ACL_LINK) { + hdev->acl_cnt += count; + if (hdev->acl_cnt > hdev->acl_pkts) + hdev->acl_cnt = hdev->acl_pkts; + } else if (conn->type == LE_LINK) { + if (hdev->le_pkts) { + hdev->le_cnt += count; + if (hdev->le_cnt > hdev->le_pkts) + hdev->le_cnt = hdev->le_pkts; + } else { + hdev->acl_cnt += count; + if (hdev->acl_cnt > hdev->acl_pkts) + hdev->acl_cnt = hdev->acl_pkts; + } + } else { + hdev->sco_cnt += count; + if 
(hdev->sco_cnt > hdev->sco_pkts) + hdev->sco_cnt = hdev->sco_pkts; + } + } + } + + tasklet_schedule(&hdev->tx_task); + + tasklet_enable(&hdev->tx_task); +} + +static inline void hci_mode_change_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_mode_change *ev = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s status %d", hdev->name, ev->status); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); + if (conn) { + conn->mode = ev->mode; + conn->interval = __le16_to_cpu(ev->interval); + + if (!test_and_clear_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->pend)) { + if (conn->mode == HCI_CM_ACTIVE) + conn->power_save = 1; + else + conn->power_save = 0; + } + + if (test_and_clear_bit(HCI_CONN_SCO_SETUP_PEND, &conn->pend)) + hci_sco_setup(conn, ev->status); + } + + hci_dev_unlock(hdev); +} + +static inline void hci_pin_code_request_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_pin_code_req *ev = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); + if (conn && conn->state == BT_CONNECTED) { + hci_conn_hold(conn); + conn->disc_timeout = HCI_PAIRING_TIMEOUT; + hci_conn_put(conn); + } + + if (!test_bit(HCI_PAIRABLE, &hdev->flags)) + hci_send_cmd(hdev, HCI_OP_PIN_CODE_NEG_REPLY, + sizeof(ev->bdaddr), &ev->bdaddr); + else if (test_bit(HCI_MGMT, &hdev->flags)) { + u8 secure; + + if (conn->pending_sec_level == BT_SECURITY_HIGH) + secure = 1; + else + secure = 0; + + mgmt_pin_code_request(hdev->id, &ev->bdaddr, secure); + } + + hci_dev_unlock(hdev); +} + +static inline void hci_link_key_request_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_link_key_req *ev = (void *) skb->data; + struct hci_cp_link_key_reply cp; + struct hci_conn *conn; + struct link_key *key; + + BT_DBG("%s", hdev->name); + + if (!test_bit(HCI_LINK_KEYS, &hdev->flags)) + return; + + hci_dev_lock(hdev); + + key = hci_find_link_key(hdev, &ev->bdaddr); + if (!key) { + BT_DBG("%s link key not found for %s", hdev->name, + batostr(&ev->bdaddr)); + goto not_found; + } + + BT_DBG("%s found key type %u for %s", hdev->name, key->type, + batostr(&ev->bdaddr)); + + if (!test_bit(HCI_DEBUG_KEYS, &hdev->flags) && + key->type == HCI_LK_DEBUG_COMBINATION) { + BT_DBG("%s ignoring debug key", hdev->name); + goto not_found; + } + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); + if (conn) { + if (key->type == HCI_LK_UNAUTH_COMBINATION && + conn->auth_type != 0xff && + (conn->auth_type & 0x01)) { + BT_DBG("%s ignoring unauthenticated key", hdev->name); + goto not_found; + } + + if (key->type == HCI_LK_COMBINATION && key->pin_len < 16 && + conn->pending_sec_level == BT_SECURITY_HIGH) { + BT_DBG("%s ignoring key unauthenticated for high \ + security", hdev->name); + goto not_found; + } + + conn->key_type = key->type; + conn->pin_length = key->pin_len; + } + + bacpy(&cp.bdaddr, &ev->bdaddr); + memcpy(cp.link_key, key->val, 16); + + hci_send_cmd(hdev, HCI_OP_LINK_KEY_REPLY, sizeof(cp), &cp); + + hci_dev_unlock(hdev); + + return; + +not_found: + hci_send_cmd(hdev, HCI_OP_LINK_KEY_NEG_REPLY, 6, &ev->bdaddr); + hci_dev_unlock(hdev); +} + +static inline void hci_link_key_notify_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_link_key_notify *ev = (void *) skb->data; + struct hci_conn *conn; + u8 pin_len = 0; + + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, 
&ev->bdaddr); + if (conn) { + hci_conn_hold(conn); + conn->disc_timeout = HCI_DISCONN_TIMEOUT; + pin_len = conn->pin_length; + + if (ev->key_type != HCI_LK_CHANGED_COMBINATION) + conn->key_type = ev->key_type; + + hci_conn_put(conn); + } + + if (test_bit(HCI_LINK_KEYS, &hdev->flags)) + hci_add_link_key(hdev, conn, 1, &ev->bdaddr, ev->link_key, + ev->key_type, pin_len); + + hci_dev_unlock(hdev); +} + +static inline void hci_clock_offset_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_clock_offset *ev = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s status %d", hdev->name, ev->status); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); + if (conn && !ev->status) { + struct inquiry_entry *ie; + + ie = hci_inquiry_cache_lookup(hdev, &conn->dst); + if (ie) { + ie->data.clock_offset = ev->clock_offset; + ie->timestamp = jiffies; + } + } + + hci_dev_unlock(hdev); +} + +static inline void hci_pkt_type_change_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_pkt_type_change *ev = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s status %d", hdev->name, ev->status); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); + if (conn && !ev->status) + conn->pkt_type = __le16_to_cpu(ev->pkt_type); + + hci_dev_unlock(hdev); +} + +static inline void hci_pscan_rep_mode_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_pscan_rep_mode *ev = (void *) skb->data; + struct inquiry_entry *ie; + + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr); + if (ie) { + ie->data.pscan_rep_mode = ev->pscan_rep_mode; + ie->timestamp = jiffies; + } + + hci_dev_unlock(hdev); +} + +static inline void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct inquiry_data data; + int num_rsp = *((__u8 *) skb->data); + + BT_DBG("%s num_rsp %d", hdev->name, num_rsp); + + if (!num_rsp) + return; + + hci_dev_lock(hdev); + + if (!test_and_set_bit(HCI_INQUIRY, &hdev->flags)) { + + if (test_bit(HCI_MGMT, &hdev->flags)) + mgmt_discovering(hdev->id, 1); + } + + if ((skb->len - 1) / num_rsp != sizeof(struct inquiry_info_with_rssi)) { + struct inquiry_info_with_rssi_and_pscan_mode *info; + info = (void *) (skb->data + 1); + + for (; num_rsp; num_rsp--, info++) { + bacpy(&data.bdaddr, &info->bdaddr); + data.pscan_rep_mode = info->pscan_rep_mode; + data.pscan_period_mode = info->pscan_period_mode; + data.pscan_mode = info->pscan_mode; + memcpy(data.dev_class, info->dev_class, 3); + data.clock_offset = info->clock_offset; + data.rssi = info->rssi; + data.ssp_mode = 0x00; + hci_inquiry_cache_update(hdev, &data); + mgmt_device_found(hdev->id, &info->bdaddr, + info->dev_class, info->rssi, + NULL); + } + } else { + struct inquiry_info_with_rssi *info = (void *) (skb->data + 1); + + for (; num_rsp; num_rsp--, info++) { + bacpy(&data.bdaddr, &info->bdaddr); + data.pscan_rep_mode = info->pscan_rep_mode; + data.pscan_period_mode = info->pscan_period_mode; + data.pscan_mode = 0x00; + memcpy(data.dev_class, info->dev_class, 3); + data.clock_offset = info->clock_offset; + data.rssi = info->rssi; + data.ssp_mode = 0x00; + hci_inquiry_cache_update(hdev, &data); + mgmt_device_found(hdev->id, &info->bdaddr, + info->dev_class, info->rssi, + NULL); + } + } + + hci_dev_unlock(hdev); +} + +static inline void hci_remote_ext_features_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_remote_ext_features *ev = 
(void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); + if (!conn) + goto unlock; + + if (!ev->status && ev->page == 0x01) { + struct inquiry_entry *ie; + + ie = hci_inquiry_cache_lookup(hdev, &conn->dst); + if (ie) + ie->data.ssp_mode = (ev->features[0] & 0x01); + + conn->ssp_mode = (ev->features[0] & 0x01); + } + + if (conn->state != BT_CONFIG) + goto unlock; + + if (!ev->status) { + struct hci_cp_remote_name_req cp; + memset(&cp, 0, sizeof(cp)); + bacpy(&cp.bdaddr, &conn->dst); + cp.pscan_rep_mode = 0x02; + hci_send_cmd(hdev, HCI_OP_REMOTE_NAME_REQ, sizeof(cp), &cp); + } + + if (!hci_outgoing_auth_needed(hdev, conn)) { + conn->state = BT_CONNECTED; + hci_proto_connect_cfm(conn, ev->status); + hci_conn_put(conn); + } + +unlock: + hci_dev_unlock(hdev); +} + +static inline void hci_sync_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_sync_conn_complete *ev = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s status %d", hdev->name, ev->status); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr); + if (!conn) { + if (ev->link_type == ESCO_LINK) + goto unlock; + + conn = hci_conn_hash_lookup_ba(hdev, ESCO_LINK, &ev->bdaddr); + if (!conn) + goto unlock; + + conn->type = SCO_LINK; + } + + switch (ev->status) { + case 0x00: + conn->handle = __le16_to_cpu(ev->handle); + conn->state = BT_CONNECTED; + + hci_conn_hold_device(conn); + hci_conn_add_sysfs(conn); + break; + + case 0x10: /* Connection Accept Timeout */ + case 0x11: /* Unsupported Feature or Parameter Value */ + case 0x1c: /* SCO interval rejected */ + case 0x1a: /* Unsupported Remote Feature */ + case 0x1f: /* Unspecified error */ + if (conn->out && conn->attempt < 2) { + conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) | + (hdev->esco_type & EDR_ESCO_MASK); + hci_setup_sync(conn, conn->link->handle); + goto unlock; + } + /* fall through */ + + default: + conn->state = BT_CLOSED; + break; + } + + hci_proto_connect_cfm(conn, ev->status); + if (ev->status) + hci_conn_del(conn); + +unlock: + hci_dev_unlock(hdev); +} + +static inline void hci_sync_conn_changed_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + BT_DBG("%s", hdev->name); +} + +static inline void hci_sniff_subrate_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_sniff_subrate *ev = (void *) skb->data; + + BT_DBG("%s status %d", hdev->name, ev->status); +} + +static inline void hci_extended_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct inquiry_data data; + struct extended_inquiry_info *info = (void *) (skb->data + 1); + int num_rsp = *((__u8 *) skb->data); + + BT_DBG("%s num_rsp %d", hdev->name, num_rsp); + + if (!num_rsp) + return; + + if (!test_and_set_bit(HCI_INQUIRY, &hdev->flags)) { + + if (test_bit(HCI_MGMT, &hdev->flags)) + mgmt_discovering(hdev->id, 1); + } + + hci_dev_lock(hdev); + + for (; num_rsp; num_rsp--, info++) { + bacpy(&data.bdaddr, &info->bdaddr); + data.pscan_rep_mode = info->pscan_rep_mode; + data.pscan_period_mode = info->pscan_period_mode; + data.pscan_mode = 0x00; + memcpy(data.dev_class, info->dev_class, 3); + data.clock_offset = info->clock_offset; + data.rssi = info->rssi; + data.ssp_mode = 0x01; + hci_inquiry_cache_update(hdev, &data); + mgmt_device_found(hdev->id, &info->bdaddr, info->dev_class, + info->rssi, info->data); + } + + hci_dev_unlock(hdev); +} + +static inline u8 hci_get_auth_req(struct 
hci_conn *conn) +{ + /* If remote requests dedicated bonding follow that lead */ + if (conn->remote_auth == 0x02 || conn->remote_auth == 0x03) { + /* If both remote and local IO capabilities allow MITM + * protection then require it, otherwise don't */ + if (conn->remote_cap == 0x03 || conn->io_capability == 0x03) + return 0x02; + else + return 0x03; + } + + /* If remote requests no-bonding follow that lead */ + if (conn->remote_auth == 0x00 || conn->remote_auth == 0x01) + return conn->remote_auth | (conn->auth_type & 0x01); + + return conn->auth_type; +} + +static inline void hci_io_capa_request_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_io_capa_request *ev = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); + if (!conn) + goto unlock; + + hci_conn_hold(conn); + + if (!test_bit(HCI_MGMT, &hdev->flags)) + goto unlock; + + if (test_bit(HCI_PAIRABLE, &hdev->flags) || + (conn->remote_auth & ~0x01) == HCI_AT_NO_BONDING) { + struct hci_cp_io_capability_reply cp; + + bacpy(&cp.bdaddr, &ev->bdaddr); + cp.capability = conn->io_capability; + conn->auth_type = hci_get_auth_req(conn); + cp.authentication = conn->auth_type; + + if ((conn->out == 0x01 || conn->remote_oob == 0x01) && + hci_find_remote_oob_data(hdev, &conn->dst)) + cp.oob_data = 0x01; + else + cp.oob_data = 0x00; + + hci_send_cmd(hdev, HCI_OP_IO_CAPABILITY_REPLY, + sizeof(cp), &cp); + } else { + struct hci_cp_io_capability_neg_reply cp; + + bacpy(&cp.bdaddr, &ev->bdaddr); + cp.reason = 0x18; /* Pairing not allowed */ + + hci_send_cmd(hdev, HCI_OP_IO_CAPABILITY_NEG_REPLY, + sizeof(cp), &cp); + } + +unlock: + hci_dev_unlock(hdev); +} + +static inline void hci_io_capa_reply_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_io_capa_reply *ev = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); + if (!conn) + goto unlock; + + conn->remote_cap = ev->capability; + conn->remote_oob = ev->oob_data; + conn->remote_auth = ev->authentication; + +unlock: + hci_dev_unlock(hdev); +} + +static inline void hci_user_confirm_request_evt(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_ev_user_confirm_req *ev = (void *) skb->data; + int loc_mitm, rem_mitm, confirm_hint = 0; + struct hci_conn *conn; + + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + if (!test_bit(HCI_MGMT, &hdev->flags)) + goto unlock; + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); + if (!conn) + goto unlock; + + loc_mitm = (conn->auth_type & 0x01); + rem_mitm = (conn->remote_auth & 0x01); + + /* If we require MITM but the remote device can't provide that + * (it has NoInputNoOutput) then reject the confirmation + * request. The only exception is when we're dedicated bonding + * initiators (connect_cfm_cb set) since then we always have the MITM + * bit set. */ + if (!conn->connect_cfm_cb && loc_mitm && conn->remote_cap == 0x03) { + BT_DBG("Rejecting request: remote device can't provide MITM"); + hci_send_cmd(hdev, HCI_OP_USER_CONFIRM_NEG_REPLY, + sizeof(ev->bdaddr), &ev->bdaddr); + goto unlock; + } + + /* If no side requires MITM protection; auto-accept */ + if ((!loc_mitm || conn->remote_cap == 0x03) && + (!rem_mitm || conn->io_capability == 0x03)) { + + /* If we're not the initiators request authorization to + * proceed from user space (mgmt_user_confirm with + * confirm_hint set to 1). 
*/ + if (!test_bit(HCI_CONN_AUTH_PEND, &conn->pend)) { + BT_DBG("Confirming auto-accept as acceptor"); + confirm_hint = 1; + goto confirm; + } + + BT_DBG("Auto-accept of user confirmation with %ums delay", + hdev->auto_accept_delay); + + if (hdev->auto_accept_delay > 0) { + int delay = msecs_to_jiffies(hdev->auto_accept_delay); + mod_timer(&conn->auto_accept_timer, jiffies + delay); + goto unlock; + } + + hci_send_cmd(hdev, HCI_OP_USER_CONFIRM_REPLY, + sizeof(ev->bdaddr), &ev->bdaddr); + goto unlock; + } + +confirm: + mgmt_user_confirm_request(hdev->id, &ev->bdaddr, ev->passkey, + confirm_hint); + +unlock: + hci_dev_unlock(hdev); +} + +static inline void hci_simple_pair_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_simple_pair_complete *ev = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); + if (!conn) + goto unlock; + + /* To avoid duplicate auth_failed events to user space we check + * the HCI_CONN_AUTH_PEND flag which will be set if we + * initiated the authentication. A traditional auth_complete + * event gets always produced as initiator and is also mapped to + * the mgmt_auth_failed event */ + if (!test_bit(HCI_CONN_AUTH_PEND, &conn->pend) && ev->status != 0) + mgmt_auth_failed(hdev->id, &conn->dst, ev->status); + + hci_conn_put(conn); + +unlock: + hci_dev_unlock(hdev); +} + +static inline void hci_remote_host_features_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_remote_host_features *ev = (void *) skb->data; + struct inquiry_entry *ie; + + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr); + if (ie) + ie->data.ssp_mode = (ev->features[0] & 0x01); + + hci_dev_unlock(hdev); +} + +static inline void hci_remote_oob_data_request_evt(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_ev_remote_oob_data_request *ev = (void *) skb->data; + struct oob_data *data; + + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + if (!test_bit(HCI_MGMT, &hdev->flags)) + goto unlock; + + data = hci_find_remote_oob_data(hdev, &ev->bdaddr); + if (data) { + struct hci_cp_remote_oob_data_reply cp; + + bacpy(&cp.bdaddr, &ev->bdaddr); + memcpy(cp.hash, data->hash, sizeof(cp.hash)); + memcpy(cp.randomizer, data->randomizer, sizeof(cp.randomizer)); + + hci_send_cmd(hdev, HCI_OP_REMOTE_OOB_DATA_REPLY, sizeof(cp), + &cp); + } else { + struct hci_cp_remote_oob_data_neg_reply cp; + + bacpy(&cp.bdaddr, &ev->bdaddr); + hci_send_cmd(hdev, HCI_OP_REMOTE_OOB_DATA_NEG_REPLY, sizeof(cp), + &cp); + } + +unlock: + hci_dev_unlock(hdev); +} + +static inline void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_le_conn_complete *ev = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s status %d", hdev->name, ev->status); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &ev->bdaddr); + if (!conn) { + conn = hci_conn_add(hdev, LE_LINK, 0, &ev->bdaddr); + if (!conn) { + BT_ERR("No memory for new connection"); + hci_dev_unlock(hdev); + return; + } + + conn->dst_type = ev->bdaddr_type; + } + + if (ev->status) { + mgmt_connect_failed(hdev->id, &ev->bdaddr, ev->status); + hci_proto_connect_cfm(conn, ev->status); + conn->state = BT_CLOSED; + hci_conn_del(conn); + goto unlock; + } + + mgmt_connected(hdev->id, &ev->bdaddr); + + conn->sec_level = BT_SECURITY_LOW; + conn->handle = __le16_to_cpu(ev->handle); + conn->state = BT_CONNECTED; + + 
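+	/* Descriptive note (added): the LE link is now established; hold a device
+	 * reference, expose the connection in sysfs and confirm it to the
+	 * protocol layer. */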
hci_conn_hold_device(conn); + hci_conn_add_sysfs(conn); + + hci_proto_connect_cfm(conn, ev->status); + +unlock: + hci_dev_unlock(hdev); +} + +static inline void hci_le_adv_report_evt(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_ev_le_advertising_info *ev; + u8 num_reports; + + num_reports = skb->data[0]; + ev = (void *) &skb->data[1]; + + hci_dev_lock(hdev); + + hci_add_adv_entry(hdev, ev); + + while (--num_reports) { + ev = (void *) (ev->data + ev->length + 1); + hci_add_adv_entry(hdev, ev); + } + + hci_dev_unlock(hdev); +} + +static inline void hci_le_ltk_request_evt(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_ev_le_ltk_req *ev = (void *) skb->data; + struct hci_cp_le_ltk_reply cp; + struct hci_cp_le_ltk_neg_reply neg; + struct hci_conn *conn; + struct link_key *ltk; + + BT_DBG("%s handle %d", hdev->name, cpu_to_le16(ev->handle)); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); + if (conn == NULL) + goto not_found; + + ltk = hci_find_ltk(hdev, ev->ediv, ev->random); + if (ltk == NULL) + goto not_found; + + memcpy(cp.ltk, ltk->val, sizeof(ltk->val)); + cp.handle = cpu_to_le16(conn->handle); + conn->pin_length = ltk->pin_len; + + hci_send_cmd(hdev, HCI_OP_LE_LTK_REPLY, sizeof(cp), &cp); + + hci_dev_unlock(hdev); + + return; + +not_found: + neg.handle = ev->handle; + hci_send_cmd(hdev, HCI_OP_LE_LTK_NEG_REPLY, sizeof(neg), &neg); + hci_dev_unlock(hdev); +} + +static inline void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_le_meta *le_ev = (void *) skb->data; + + skb_pull(skb, sizeof(*le_ev)); + + switch (le_ev->subevent) { + case HCI_EV_LE_CONN_COMPLETE: + hci_le_conn_complete_evt(hdev, skb); + break; + + case HCI_EV_LE_ADVERTISING_REPORT: + hci_le_adv_report_evt(hdev, skb); + break; + + case HCI_EV_LE_LTK_REQ: + hci_le_ltk_request_evt(hdev, skb); + break; + + default: + break; + } +} + +void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_event_hdr *hdr = (void *) skb->data; + __u8 event = hdr->evt; + + skb_pull(skb, HCI_EVENT_HDR_SIZE); + + switch (event) { + case HCI_EV_INQUIRY_COMPLETE: + hci_inquiry_complete_evt(hdev, skb); + break; + + case HCI_EV_INQUIRY_RESULT: + hci_inquiry_result_evt(hdev, skb); + break; + + case HCI_EV_CONN_COMPLETE: + hci_conn_complete_evt(hdev, skb); + break; + + case HCI_EV_CONN_REQUEST: + hci_conn_request_evt(hdev, skb); + break; + + case HCI_EV_DISCONN_COMPLETE: + hci_disconn_complete_evt(hdev, skb); + break; + + case HCI_EV_AUTH_COMPLETE: + hci_auth_complete_evt(hdev, skb); + break; + + case HCI_EV_REMOTE_NAME: + hci_remote_name_evt(hdev, skb); + break; + + case HCI_EV_ENCRYPT_CHANGE: + hci_encrypt_change_evt(hdev, skb); + break; + + case HCI_EV_CHANGE_LINK_KEY_COMPLETE: + hci_change_link_key_complete_evt(hdev, skb); + break; + + case HCI_EV_REMOTE_FEATURES: + hci_remote_features_evt(hdev, skb); + break; + + case HCI_EV_REMOTE_VERSION: + hci_remote_version_evt(hdev, skb); + break; + + case HCI_EV_QOS_SETUP_COMPLETE: + hci_qos_setup_complete_evt(hdev, skb); + break; + + case HCI_EV_CMD_COMPLETE: + hci_cmd_complete_evt(hdev, skb); + break; + + case HCI_EV_CMD_STATUS: + hci_cmd_status_evt(hdev, skb); + break; + + case HCI_EV_ROLE_CHANGE: + hci_role_change_evt(hdev, skb); + break; + + case HCI_EV_NUM_COMP_PKTS: + hci_num_comp_pkts_evt(hdev, skb); + break; + + case HCI_EV_MODE_CHANGE: + hci_mode_change_evt(hdev, skb); + break; + + case HCI_EV_PIN_CODE_REQ: + hci_pin_code_request_evt(hdev, skb); + break; + + case 
HCI_EV_LINK_KEY_REQ: + hci_link_key_request_evt(hdev, skb); + break; + + case HCI_EV_LINK_KEY_NOTIFY: + hci_link_key_notify_evt(hdev, skb); + break; + + case HCI_EV_CLOCK_OFFSET: + hci_clock_offset_evt(hdev, skb); + break; + + case HCI_EV_PKT_TYPE_CHANGE: + hci_pkt_type_change_evt(hdev, skb); + break; + + case HCI_EV_PSCAN_REP_MODE: + hci_pscan_rep_mode_evt(hdev, skb); + break; + + case HCI_EV_INQUIRY_RESULT_WITH_RSSI: + hci_inquiry_result_with_rssi_evt(hdev, skb); + break; + + case HCI_EV_REMOTE_EXT_FEATURES: + hci_remote_ext_features_evt(hdev, skb); + break; + + case HCI_EV_SYNC_CONN_COMPLETE: + hci_sync_conn_complete_evt(hdev, skb); + break; + + case HCI_EV_SYNC_CONN_CHANGED: + hci_sync_conn_changed_evt(hdev, skb); + break; + + case HCI_EV_SNIFF_SUBRATE: + hci_sniff_subrate_evt(hdev, skb); + break; + + case HCI_EV_EXTENDED_INQUIRY_RESULT: + hci_extended_inquiry_result_evt(hdev, skb); + break; + + case HCI_EV_IO_CAPA_REQUEST: + hci_io_capa_request_evt(hdev, skb); + break; + + case HCI_EV_IO_CAPA_REPLY: + hci_io_capa_reply_evt(hdev, skb); + break; + + case HCI_EV_USER_CONFIRM_REQUEST: + hci_user_confirm_request_evt(hdev, skb); + break; + + case HCI_EV_SIMPLE_PAIR_COMPLETE: + hci_simple_pair_complete_evt(hdev, skb); + break; + + case HCI_EV_REMOTE_HOST_FEATURES: + hci_remote_host_features_evt(hdev, skb); + break; + + case HCI_EV_LE_META: + hci_le_meta_evt(hdev, skb); + break; + + case HCI_EV_REMOTE_OOB_DATA_REQUEST: + hci_remote_oob_data_request_evt(hdev, skb); + break; + + default: + BT_DBG("%s event 0x%x", hdev->name, event); + break; + } + + kfree_skb(skb); + hdev->stat.evt_rx++; +} + +/* Generate internal stack event */ +void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data) +{ + struct hci_event_hdr *hdr; + struct hci_ev_stack_internal *ev; + struct sk_buff *skb; + + skb = bt_skb_alloc(HCI_EVENT_HDR_SIZE + sizeof(*ev) + dlen, GFP_ATOMIC); + if (!skb) + return; + + hdr = (void *) skb_put(skb, HCI_EVENT_HDR_SIZE); + hdr->evt = HCI_EV_STACK_INTERNAL; + hdr->plen = sizeof(*ev) + dlen; + + ev = (void *) skb_put(skb, sizeof(*ev) + dlen); + ev->type = type; + memcpy(ev->data, data, dlen); + + bt_cb(skb)->incoming = 1; + __net_timestamp(skb); + + bt_cb(skb)->pkt_type = HCI_EVENT_PKT; + skb->dev = (void *) hdev; + hci_send_to_sock(hdev, skb, NULL); + kfree_skb(skb); +} + +module_param(enable_le, bool, 0444); +MODULE_PARM_DESC(enable_le, "Enable LE support"); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c new file mode 100644 index 00000000..ff02cf5e --- /dev/null +++ b/net/bluetooth/hci_sock.c @@ -0,0 +1,817 @@ +/* + BlueZ - Bluetooth protocol stack for Linux + Copyright (C) 2000-2001 Qualcomm Incorporated + + Written 2000,2001 by Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +/* Bluetooth HCI sockets. */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +static int enable_mgmt; + +/* ----- HCI socket interface ----- */ + +static inline int hci_test_bit(int nr, void *addr) +{ + return *((__u32 *) addr + (nr >> 5)) & ((__u32) 1 << (nr & 31)); +} + +/* Security filter */ +static struct hci_sec_filter hci_sec_filter = { + /* Packet types */ + 0x10, + /* Events */ + { 0x1000d9fe, 0x0000b00c }, + /* Commands */ + { + { 0x0 }, + /* OGF_LINK_CTL */ + { 0xbe000006, 0x00000001, 0x00000000, 0x00 }, + /* OGF_LINK_POLICY */ + { 0x00005200, 0x00000000, 0x00000000, 0x00 }, + /* OGF_HOST_CTL */ + { 0xaab00200, 0x2b402aaa, 0x05220154, 0x00 }, + /* OGF_INFO_PARAM */ + { 0x000002be, 0x00000000, 0x00000000, 0x00 }, + /* OGF_STATUS_PARAM */ + { 0x000000ea, 0x00000000, 0x00000000, 0x00 } + } +}; + +static struct bt_sock_list hci_sk_list = { + .lock = __RW_LOCK_UNLOCKED(hci_sk_list.lock) +}; + +/* Send frame to RAW socket */ +void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb, + struct sock *skip_sk) +{ + struct sock *sk; + struct hlist_node *node; + + BT_DBG("hdev %p len %d", hdev, skb->len); + + read_lock(&hci_sk_list.lock); + sk_for_each(sk, node, &hci_sk_list.head) { + struct hci_filter *flt; + struct sk_buff *nskb; + + if (sk == skip_sk) + continue; + + if (sk->sk_state != BT_BOUND || hci_pi(sk)->hdev != hdev) + continue; + + /* Don't send frame to the socket it came from */ + if (skb->sk == sk) + continue; + + if (bt_cb(skb)->channel != hci_pi(sk)->channel) + continue; + + if (bt_cb(skb)->channel == HCI_CHANNEL_CONTROL) + goto clone; + + /* Apply filter */ + flt = &hci_pi(sk)->filter; + + if (!test_bit((bt_cb(skb)->pkt_type == HCI_VENDOR_PKT) ? 
+ 0 : (bt_cb(skb)->pkt_type & HCI_FLT_TYPE_BITS), &flt->type_mask)) + continue; + + if (bt_cb(skb)->pkt_type == HCI_EVENT_PKT) { + register int evt = (*(__u8 *)skb->data & HCI_FLT_EVENT_BITS); + + if (!hci_test_bit(evt, &flt->event_mask)) + continue; + + if (flt->opcode && + ((evt == HCI_EV_CMD_COMPLETE && + flt->opcode != + get_unaligned((__le16 *)(skb->data + 3))) || + (evt == HCI_EV_CMD_STATUS && + flt->opcode != + get_unaligned((__le16 *)(skb->data + 4))))) + continue; + } + +clone: + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + continue; + + /* Put type byte before the data */ + if (bt_cb(skb)->channel == HCI_CHANNEL_RAW) + memcpy(skb_push(nskb, 1), &bt_cb(nskb)->pkt_type, 1); + + if (sock_queue_rcv_skb(sk, nskb)) + kfree_skb(nskb); + } + read_unlock(&hci_sk_list.lock); +} + +static int hci_sock_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct hci_dev *hdev; + + BT_DBG("sock %p sk %p", sock, sk); + + if (!sk) + return 0; + + hdev = hci_pi(sk)->hdev; + + bt_sock_unlink(&hci_sk_list, sk); + + if (hdev) { + atomic_dec(&hdev->promisc); + hci_dev_put(hdev); + } + + sock_orphan(sk); + + skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge(&sk->sk_write_queue); + + sock_put(sk); + return 0; +} + +static int hci_sock_blacklist_add(struct hci_dev *hdev, void __user *arg) +{ + bdaddr_t bdaddr; + + if (copy_from_user(&bdaddr, arg, sizeof(bdaddr))) + return -EFAULT; + + return hci_blacklist_add(hdev, &bdaddr); +} + +static int hci_sock_blacklist_del(struct hci_dev *hdev, void __user *arg) +{ + bdaddr_t bdaddr; + + if (copy_from_user(&bdaddr, arg, sizeof(bdaddr))) + return -EFAULT; + + return hci_blacklist_del(hdev, &bdaddr); +} + +/* Ioctls that require bound socket */ +static inline int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) +{ + struct hci_dev *hdev = hci_pi(sk)->hdev; + + if (!hdev) + return -EBADFD; + + switch (cmd) { + case HCISETRAW: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + + if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) + return -EPERM; + + if (arg) + set_bit(HCI_RAW, &hdev->flags); + else + clear_bit(HCI_RAW, &hdev->flags); + + return 0; + + case HCIGETCONNINFO: + return hci_get_conn_info(hdev, (void __user *) arg); + + case HCIGETAUTHINFO: + return hci_get_auth_info(hdev, (void __user *) arg); + + case HCIBLOCKADDR: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + return hci_sock_blacklist_add(hdev, (void __user *) arg); + + case HCIUNBLOCKADDR: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + return hci_sock_blacklist_del(hdev, (void __user *) arg); + + default: + if (hdev->ioctl) + return hdev->ioctl(hdev, cmd, arg); + return -EINVAL; + } +} + +static int hci_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + void __user *argp = (void __user *) arg; + int err; + + BT_DBG("cmd %x arg %lx", cmd, arg); + + switch (cmd) { + case HCIGETDEVLIST: + return hci_get_dev_list(argp); + + case HCIGETDEVINFO: + return hci_get_dev_info(argp); + + case HCIGETCONNLIST: + return hci_get_conn_list(argp); + + case HCIDEVUP: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + return hci_dev_open(arg); + + case HCIDEVDOWN: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + return hci_dev_close(arg); + + case HCIDEVRESET: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + return hci_dev_reset(arg); + + case HCIDEVRESTAT: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + return hci_dev_reset_stat(arg); + + case HCISETSCAN: + case HCISETAUTH: + case HCISETENCRYPT: + case 
HCISETPTYPE: + case HCISETLINKPOL: + case HCISETLINKMODE: + case HCISETACLMTU: + case HCISETSCOMTU: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + return hci_dev_cmd(cmd, argp); + + case HCIINQUIRY: + return hci_inquiry(argp); + + default: + lock_sock(sk); + err = hci_sock_bound_ioctl(sk, cmd, arg); + release_sock(sk); + return err; + } +} + +static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +{ + struct sockaddr_hci haddr; + struct sock *sk = sock->sk; + struct hci_dev *hdev = NULL; + int len, err = 0; + + BT_DBG("sock %p sk %p", sock, sk); + + if (!addr) + return -EINVAL; + + memset(&haddr, 0, sizeof(haddr)); + len = min_t(unsigned int, sizeof(haddr), addr_len); + memcpy(&haddr, addr, len); + + if (haddr.hci_family != AF_BLUETOOTH) + return -EINVAL; + + if (haddr.hci_channel > HCI_CHANNEL_CONTROL) + return -EINVAL; + + if (haddr.hci_channel == HCI_CHANNEL_CONTROL && !enable_mgmt) + return -EINVAL; + + lock_sock(sk); + + if (sk->sk_state == BT_BOUND || hci_pi(sk)->hdev) { + err = -EALREADY; + goto done; + } + + if (haddr.hci_dev != HCI_DEV_NONE) { + hdev = hci_dev_get(haddr.hci_dev); + if (!hdev) { + err = -ENODEV; + goto done; + } + + atomic_inc(&hdev->promisc); + } + + hci_pi(sk)->channel = haddr.hci_channel; + hci_pi(sk)->hdev = hdev; + sk->sk_state = BT_BOUND; + +done: + release_sock(sk); + return err; +} + +static int hci_sock_getname(struct socket *sock, struct sockaddr *addr, int *addr_len, int peer) +{ + struct sockaddr_hci *haddr = (struct sockaddr_hci *) addr; + struct sock *sk = sock->sk; + struct hci_dev *hdev = hci_pi(sk)->hdev; + + BT_DBG("sock %p sk %p", sock, sk); + + if (!hdev) + return -EBADFD; + + lock_sock(sk); + + *addr_len = sizeof(*haddr); + haddr->hci_family = AF_BLUETOOTH; + haddr->hci_dev = hdev->id; + + release_sock(sk); + return 0; +} + +static inline void hci_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) +{ + __u32 mask = hci_pi(sk)->cmsg_mask; + + if (mask & HCI_CMSG_DIR) { + int incoming = bt_cb(skb)->incoming; + put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(incoming), &incoming); + } + + if (mask & HCI_CMSG_TSTAMP) { +#ifdef CONFIG_COMPAT + struct compat_timeval ctv; +#endif + struct timeval tv; + void *data; + int len; + + skb_get_timestamp(skb, &tv); + + data = &tv; + len = sizeof(tv); +#ifdef CONFIG_COMPAT + if (msg->msg_flags & MSG_CMSG_COMPAT) { + ctv.tv_sec = tv.tv_sec; + ctv.tv_usec = tv.tv_usec; + data = &ctv; + len = sizeof(ctv); + } +#endif + + put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, len, data); + } +} + +static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len, int flags) +{ + int noblock = flags & MSG_DONTWAIT; + struct sock *sk = sock->sk; + struct sk_buff *skb; + int copied, err; + + BT_DBG("sock %p, sk %p", sock, sk); + + if (flags & (MSG_OOB)) + return -EOPNOTSUPP; + + if (sk->sk_state == BT_CLOSED) + return 0; + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + return err; + + msg->msg_namelen = 0; + + copied = skb->len; + if (len < copied) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + + skb_reset_transport_header(skb); + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + + hci_sock_cmsg(sk, msg, skb); + + skb_free_datagram(sk, skb); + + return err ? 
: copied; +} + +static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct hci_dev *hdev; + struct sk_buff *skb; + int err; + + BT_DBG("sock %p sk %p", sock, sk); + + if (msg->msg_flags & MSG_OOB) + return -EOPNOTSUPP; + + if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_NOSIGNAL|MSG_ERRQUEUE)) + return -EINVAL; + + if (len < 4 || len > HCI_MAX_FRAME_SIZE) + return -EINVAL; + + lock_sock(sk); + + switch (hci_pi(sk)->channel) { + case HCI_CHANNEL_RAW: + break; + case HCI_CHANNEL_CONTROL: + err = mgmt_control(sk, msg, len); + goto done; + default: + err = -EINVAL; + goto done; + } + + hdev = hci_pi(sk)->hdev; + if (!hdev) { + err = -EBADFD; + goto done; + } + + if (!test_bit(HCI_UP, &hdev->flags)) { + err = -ENETDOWN; + goto done; + } + + skb = bt_skb_send_alloc(sk, len, msg->msg_flags & MSG_DONTWAIT, &err); + if (!skb) + goto done; + + if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { + err = -EFAULT; + goto drop; + } + + bt_cb(skb)->pkt_type = *((unsigned char *) skb->data); + skb_pull(skb, 1); + skb->dev = (void *) hdev; + + if (bt_cb(skb)->pkt_type == HCI_COMMAND_PKT) { + u16 opcode = get_unaligned_le16(skb->data); + u16 ogf = hci_opcode_ogf(opcode); + u16 ocf = hci_opcode_ocf(opcode); + + if (((ogf > HCI_SFLT_MAX_OGF) || + !hci_test_bit(ocf & HCI_FLT_OCF_BITS, &hci_sec_filter.ocf_mask[ogf])) && + !capable(CAP_NET_RAW)) { + err = -EPERM; + goto drop; + } + + if (test_bit(HCI_RAW, &hdev->flags) || (ogf == 0x3f)) { + skb_queue_tail(&hdev->raw_q, skb); + tasklet_schedule(&hdev->tx_task); + } else { + skb_queue_tail(&hdev->cmd_q, skb); + tasklet_schedule(&hdev->cmd_task); + } + } else { + if (!capable(CAP_NET_RAW)) { + err = -EPERM; + goto drop; + } + + skb_queue_tail(&hdev->raw_q, skb); + tasklet_schedule(&hdev->tx_task); + } + + err = len; + +done: + release_sock(sk); + return err; + +drop: + kfree_skb(skb); + goto done; +} + +static int hci_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int len) +{ + struct hci_ufilter uf = { .opcode = 0 }; + struct sock *sk = sock->sk; + int err = 0, opt = 0; + + BT_DBG("sk %p, opt %d", sk, optname); + + lock_sock(sk); + + switch (optname) { + case HCI_DATA_DIR: + if (get_user(opt, (int __user *)optval)) { + err = -EFAULT; + break; + } + + if (opt) + hci_pi(sk)->cmsg_mask |= HCI_CMSG_DIR; + else + hci_pi(sk)->cmsg_mask &= ~HCI_CMSG_DIR; + break; + + case HCI_TIME_STAMP: + if (get_user(opt, (int __user *)optval)) { + err = -EFAULT; + break; + } + + if (opt) + hci_pi(sk)->cmsg_mask |= HCI_CMSG_TSTAMP; + else + hci_pi(sk)->cmsg_mask &= ~HCI_CMSG_TSTAMP; + break; + + case HCI_FILTER: + { + struct hci_filter *f = &hci_pi(sk)->filter; + + uf.type_mask = f->type_mask; + uf.opcode = f->opcode; + uf.event_mask[0] = *((u32 *) f->event_mask + 0); + uf.event_mask[1] = *((u32 *) f->event_mask + 1); + } + + len = min_t(unsigned int, len, sizeof(uf)); + if (copy_from_user(&uf, optval, len)) { + err = -EFAULT; + break; + } + + if (!capable(CAP_NET_RAW)) { + uf.type_mask &= hci_sec_filter.type_mask; + uf.event_mask[0] &= *((u32 *) hci_sec_filter.event_mask + 0); + uf.event_mask[1] &= *((u32 *) hci_sec_filter.event_mask + 1); + } + + { + struct hci_filter *f = &hci_pi(sk)->filter; + + f->type_mask = uf.type_mask; + f->opcode = uf.opcode; + *((u32 *) f->event_mask + 0) = uf.event_mask[0]; + *((u32 *) f->event_mask + 1) = uf.event_mask[1]; + } + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + 
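+/*
+ * Illustrative userspace sketch, not part of this file: one way a raw HCI
+ * socket client might program the per-socket filter that
+ * hci_sock_setsockopt() above stores and hci_send_to_sock() applies.  The
+ * event bit layout mirrors hci_test_bit(): word nr >> 5, bit nr & 31.  It
+ * assumes the exported HCI header provides struct hci_ufilter together with
+ * SOL_HCI, HCI_FILTER and HCI_EVENT_PKT, and that <sys/socket.h>,
+ * <string.h> and <stdint.h> are available.
+ */
+#if 0	/* example only, never built into the kernel */
+static int hci_filter_single_event(int sk, uint8_t evt)
+{
+	struct hci_ufilter uf;
+
+	evt &= 63;					/* same clamp as HCI_FLT_EVENT_BITS */
+
+	memset(&uf, 0, sizeof(uf));
+	uf.type_mask = 1 << HCI_EVENT_PKT;		/* pass only HCI event packets */
+	uf.event_mask[evt >> 5] = 1U << (evt & 31);	/* pass only this event code */
+	uf.opcode = 0;					/* no opcode matching */
+
+	return setsockopt(sk, SOL_HCI, HCI_FILTER, &uf, sizeof(uf));
+}
+#endif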
+static int hci_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) +{ + struct hci_ufilter uf; + struct sock *sk = sock->sk; + int len, opt; + + if (get_user(len, optlen)) + return -EFAULT; + + switch (optname) { + case HCI_DATA_DIR: + if (hci_pi(sk)->cmsg_mask & HCI_CMSG_DIR) + opt = 1; + else + opt = 0; + + if (put_user(opt, optval)) + return -EFAULT; + break; + + case HCI_TIME_STAMP: + if (hci_pi(sk)->cmsg_mask & HCI_CMSG_TSTAMP) + opt = 1; + else + opt = 0; + + if (put_user(opt, optval)) + return -EFAULT; + break; + + case HCI_FILTER: + { + struct hci_filter *f = &hci_pi(sk)->filter; + + uf.type_mask = f->type_mask; + uf.opcode = f->opcode; + uf.event_mask[0] = *((u32 *) f->event_mask + 0); + uf.event_mask[1] = *((u32 *) f->event_mask + 1); + } + + len = min_t(unsigned int, len, sizeof(uf)); + if (copy_to_user(optval, &uf, len)) + return -EFAULT; + break; + + default: + return -ENOPROTOOPT; + break; + } + + return 0; +} + +static const struct proto_ops hci_sock_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .release = hci_sock_release, + .bind = hci_sock_bind, + .getname = hci_sock_getname, + .sendmsg = hci_sock_sendmsg, + .recvmsg = hci_sock_recvmsg, + .ioctl = hci_sock_ioctl, + .poll = datagram_poll, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = hci_sock_setsockopt, + .getsockopt = hci_sock_getsockopt, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .mmap = sock_no_mmap +}; + +static struct proto hci_sk_proto = { + .name = "HCI", + .owner = THIS_MODULE, + .obj_size = sizeof(struct hci_pinfo) +}; + +static int hci_sock_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + + BT_DBG("sock %p", sock); + + if (sock->type != SOCK_RAW) + return -ESOCKTNOSUPPORT; + + sock->ops = &hci_sock_ops; + + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hci_sk_proto); + if (!sk) + return -ENOMEM; + + sock_init_data(sock, sk); + + sock_reset_flag(sk, SOCK_ZAPPED); + + sk->sk_protocol = protocol; + + sock->state = SS_UNCONNECTED; + sk->sk_state = BT_OPEN; + + bt_sock_link(&hci_sk_list, sk); + return 0; +} + +static int hci_sock_dev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct hci_dev *hdev = (struct hci_dev *) ptr; + struct hci_ev_si_device ev; + + BT_DBG("hdev %s event %ld", hdev->name, event); + + /* Send event to sockets */ + ev.event = event; + ev.dev_id = hdev->id; + hci_si_event(NULL, HCI_EV_SI_DEVICE, sizeof(ev), &ev); + + if (event == HCI_DEV_UNREG) { + struct sock *sk; + struct hlist_node *node; + + /* Detach sockets from device */ + read_lock(&hci_sk_list.lock); + sk_for_each(sk, node, &hci_sk_list.head) { + local_bh_disable(); + bh_lock_sock_nested(sk); + if (hci_pi(sk)->hdev == hdev) { + hci_pi(sk)->hdev = NULL; + sk->sk_err = EPIPE; + sk->sk_state = BT_OPEN; + sk->sk_state_change(sk); + + hci_dev_put(hdev); + } + bh_unlock_sock(sk); + local_bh_enable(); + } + read_unlock(&hci_sk_list.lock); + } + + return NOTIFY_DONE; +} + +static const struct net_proto_family hci_sock_family_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .create = hci_sock_create, +}; + +static struct notifier_block hci_sock_nblock = { + .notifier_call = hci_sock_dev_event +}; + +int __init hci_sock_init(void) +{ + int err; + + err = proto_register(&hci_sk_proto, 0); + if (err < 0) + return err; + + err = bt_sock_register(BTPROTO_HCI, &hci_sock_family_ops); + if (err < 0) + goto error; + + 
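+	/* Register the device notifier last; from this point on the HCI core
+	 * reports controller events through hci_sock_dev_event() below. */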
hci_register_notifier(&hci_sock_nblock); + + BT_INFO("HCI socket layer initialized"); + + return 0; + +error: + BT_ERR("HCI socket registration failed"); + proto_unregister(&hci_sk_proto); + return err; +} + +void hci_sock_cleanup(void) +{ + if (bt_sock_unregister(BTPROTO_HCI) < 0) + BT_ERR("HCI socket unregistration failed"); + + hci_unregister_notifier(&hci_sock_nblock); + + proto_unregister(&hci_sk_proto); +} + +module_param(enable_mgmt, bool, 0644); +MODULE_PARM_DESC(enable_mgmt, "Enable Management interface"); diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c new file mode 100644 index 00000000..a6c3aa8b --- /dev/null +++ b/net/bluetooth/hci_sysfs.c @@ -0,0 +1,607 @@ +/* Bluetooth HCI driver model support. */ + +#include +#include +#include +#include +#include + +#include +#include + +static struct class *bt_class; + +struct dentry *bt_debugfs; +EXPORT_SYMBOL_GPL(bt_debugfs); + +static inline char *link_typetostr(int type) +{ + switch (type) { + case ACL_LINK: + return "ACL"; + case SCO_LINK: + return "SCO"; + case ESCO_LINK: + return "eSCO"; + default: + return "UNKNOWN"; + } +} + +static ssize_t show_link_type(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hci_conn *conn = dev_get_drvdata(dev); + return sprintf(buf, "%s\n", link_typetostr(conn->type)); +} + +static ssize_t show_link_address(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hci_conn *conn = dev_get_drvdata(dev); + return sprintf(buf, "%s\n", batostr(&conn->dst)); +} + +static ssize_t show_link_features(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hci_conn *conn = dev_get_drvdata(dev); + + return sprintf(buf, "0x%02x%02x%02x%02x%02x%02x%02x%02x\n", + conn->features[0], conn->features[1], + conn->features[2], conn->features[3], + conn->features[4], conn->features[5], + conn->features[6], conn->features[7]); +} + +#define LINK_ATTR(_name, _mode, _show, _store) \ +struct device_attribute link_attr_##_name = __ATTR(_name, _mode, _show, _store) + +static LINK_ATTR(type, S_IRUGO, show_link_type, NULL); +static LINK_ATTR(address, S_IRUGO, show_link_address, NULL); +static LINK_ATTR(features, S_IRUGO, show_link_features, NULL); + +static struct attribute *bt_link_attrs[] = { + &link_attr_type.attr, + &link_attr_address.attr, + &link_attr_features.attr, + NULL +}; + +static struct attribute_group bt_link_group = { + .attrs = bt_link_attrs, +}; + +static const struct attribute_group *bt_link_groups[] = { + &bt_link_group, + NULL +}; + +static void bt_link_release(struct device *dev) +{ + void *data = dev_get_drvdata(dev); + kfree(data); +} + +static struct device_type bt_link = { + .name = "link", + .groups = bt_link_groups, + .release = bt_link_release, +}; + +static void add_conn(struct work_struct *work) +{ + struct hci_conn *conn = container_of(work, struct hci_conn, work_add); + struct hci_dev *hdev = conn->hdev; + + dev_set_name(&conn->dev, "%s:%d", hdev->name, conn->handle); + + dev_set_drvdata(&conn->dev, conn); + + if (device_add(&conn->dev) < 0) { + BT_ERR("Failed to register connection device"); + return; + } + + hci_dev_hold(hdev); +} + +/* + * The rfcomm tty device will possibly retain even when conn + * is down, and sysfs doesn't support move zombie device, + * so we should move the device before conn device is destroyed. 
+ */ +static int __match_tty(struct device *dev, void *data) +{ + return !strncmp(dev_name(dev), "rfcomm", 6); +} + +static void del_conn(struct work_struct *work) +{ + struct hci_conn *conn = container_of(work, struct hci_conn, work_del); + struct hci_dev *hdev = conn->hdev; + + if (!device_is_registered(&conn->dev)) + return; + + while (1) { + struct device *dev; + + dev = device_find_child(&conn->dev, NULL, __match_tty); + if (!dev) + break; + device_move(dev, NULL, DPM_ORDER_DEV_LAST); + put_device(dev); + } + + device_del(&conn->dev); + put_device(&conn->dev); + + hci_dev_put(hdev); +} + +void hci_conn_init_sysfs(struct hci_conn *conn) +{ + struct hci_dev *hdev = conn->hdev; + + BT_DBG("conn %p", conn); + + conn->dev.type = &bt_link; + conn->dev.class = bt_class; + conn->dev.parent = &hdev->dev; + + device_initialize(&conn->dev); + + INIT_WORK(&conn->work_add, add_conn); + INIT_WORK(&conn->work_del, del_conn); +} + +void hci_conn_add_sysfs(struct hci_conn *conn) +{ + BT_DBG("conn %p", conn); + + queue_work(conn->hdev->workqueue, &conn->work_add); +} + +void hci_conn_del_sysfs(struct hci_conn *conn) +{ + BT_DBG("conn %p", conn); + + queue_work(conn->hdev->workqueue, &conn->work_del); +} + +static inline char *host_bustostr(int bus) +{ + switch (bus) { + case HCI_VIRTUAL: + return "VIRTUAL"; + case HCI_USB: + return "USB"; + case HCI_PCCARD: + return "PCCARD"; + case HCI_UART: + return "UART"; + case HCI_RS232: + return "RS232"; + case HCI_PCI: + return "PCI"; + case HCI_SDIO: + return "SDIO"; + default: + return "UNKNOWN"; + } +} + +static inline char *host_typetostr(int type) +{ + switch (type) { + case HCI_BREDR: + return "BR/EDR"; + case HCI_AMP: + return "AMP"; + default: + return "UNKNOWN"; + } +} + +static ssize_t show_bus(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hci_dev *hdev = dev_get_drvdata(dev); + return sprintf(buf, "%s\n", host_bustostr(hdev->bus)); +} + +static ssize_t show_type(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hci_dev *hdev = dev_get_drvdata(dev); + return sprintf(buf, "%s\n", host_typetostr(hdev->dev_type)); +} + +static ssize_t show_name(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hci_dev *hdev = dev_get_drvdata(dev); + char name[HCI_MAX_NAME_LENGTH + 1]; + int i; + + for (i = 0; i < HCI_MAX_NAME_LENGTH; i++) + name[i] = hdev->dev_name[i]; + + name[HCI_MAX_NAME_LENGTH] = '\0'; + return sprintf(buf, "%s\n", name); +} + +static ssize_t show_class(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hci_dev *hdev = dev_get_drvdata(dev); + return sprintf(buf, "0x%.2x%.2x%.2x\n", + hdev->dev_class[2], hdev->dev_class[1], hdev->dev_class[0]); +} + +static ssize_t show_address(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hci_dev *hdev = dev_get_drvdata(dev); + return sprintf(buf, "%s\n", batostr(&hdev->bdaddr)); +} + +static ssize_t show_features(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hci_dev *hdev = dev_get_drvdata(dev); + + return sprintf(buf, "0x%02x%02x%02x%02x%02x%02x%02x%02x\n", + hdev->features[0], hdev->features[1], + hdev->features[2], hdev->features[3], + hdev->features[4], hdev->features[5], + hdev->features[6], hdev->features[7]); +} + +static ssize_t show_manufacturer(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hci_dev *hdev = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", hdev->manufacturer); +} + +static ssize_t show_hci_version(struct 
device *dev, struct device_attribute *attr, char *buf) +{ + struct hci_dev *hdev = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", hdev->hci_ver); +} + +static ssize_t show_hci_revision(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hci_dev *hdev = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", hdev->hci_rev); +} + +static ssize_t show_idle_timeout(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hci_dev *hdev = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", hdev->idle_timeout); +} + +static ssize_t store_idle_timeout(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) +{ + struct hci_dev *hdev = dev_get_drvdata(dev); + unsigned int val; + int rv; + + rv = kstrtouint(buf, 0, &val); + if (rv < 0) + return rv; + + if (val != 0 && (val < 500 || val > 3600000)) + return -EINVAL; + + hdev->idle_timeout = val; + + return count; +} + +static ssize_t show_sniff_max_interval(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hci_dev *hdev = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", hdev->sniff_max_interval); +} + +static ssize_t store_sniff_max_interval(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) +{ + struct hci_dev *hdev = dev_get_drvdata(dev); + u16 val; + int rv; + + rv = kstrtou16(buf, 0, &val); + if (rv < 0) + return rv; + + if (val == 0 || val % 2 || val < hdev->sniff_min_interval) + return -EINVAL; + + hdev->sniff_max_interval = val; + + return count; +} + +static ssize_t show_sniff_min_interval(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hci_dev *hdev = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", hdev->sniff_min_interval); +} + +static ssize_t store_sniff_min_interval(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) +{ + struct hci_dev *hdev = dev_get_drvdata(dev); + u16 val; + int rv; + + rv = kstrtou16(buf, 0, &val); + if (rv < 0) + return rv; + + if (val == 0 || val % 2 || val > hdev->sniff_max_interval) + return -EINVAL; + + hdev->sniff_min_interval = val; + + return count; +} + +static DEVICE_ATTR(bus, S_IRUGO, show_bus, NULL); +static DEVICE_ATTR(type, S_IRUGO, show_type, NULL); +static DEVICE_ATTR(name, S_IRUGO, show_name, NULL); +static DEVICE_ATTR(class, S_IRUGO, show_class, NULL); +static DEVICE_ATTR(address, S_IRUGO, show_address, NULL); +static DEVICE_ATTR(features, S_IRUGO, show_features, NULL); +static DEVICE_ATTR(manufacturer, S_IRUGO, show_manufacturer, NULL); +static DEVICE_ATTR(hci_version, S_IRUGO, show_hci_version, NULL); +static DEVICE_ATTR(hci_revision, S_IRUGO, show_hci_revision, NULL); + +static DEVICE_ATTR(idle_timeout, S_IRUGO | S_IWUSR, + show_idle_timeout, store_idle_timeout); +static DEVICE_ATTR(sniff_max_interval, S_IRUGO | S_IWUSR, + show_sniff_max_interval, store_sniff_max_interval); +static DEVICE_ATTR(sniff_min_interval, S_IRUGO | S_IWUSR, + show_sniff_min_interval, store_sniff_min_interval); + +static struct attribute *bt_host_attrs[] = { + &dev_attr_bus.attr, + &dev_attr_type.attr, + &dev_attr_name.attr, + &dev_attr_class.attr, + &dev_attr_address.attr, + &dev_attr_features.attr, + &dev_attr_manufacturer.attr, + &dev_attr_hci_version.attr, + &dev_attr_hci_revision.attr, + &dev_attr_idle_timeout.attr, + &dev_attr_sniff_max_interval.attr, + &dev_attr_sniff_min_interval.attr, + NULL +}; + +static struct attribute_group bt_host_group = { + .attrs = bt_host_attrs, +}; + +static const struct attribute_group *bt_host_groups[] = 
{ + &bt_host_group, + NULL +}; + +static void bt_host_release(struct device *dev) +{ + void *data = dev_get_drvdata(dev); + kfree(data); +} + +static struct device_type bt_host = { + .name = "host", + .groups = bt_host_groups, + .release = bt_host_release, +}; + +static int inquiry_cache_show(struct seq_file *f, void *p) +{ + struct hci_dev *hdev = f->private; + struct inquiry_cache *cache = &hdev->inq_cache; + struct inquiry_entry *e; + + hci_dev_lock_bh(hdev); + + for (e = cache->list; e; e = e->next) { + struct inquiry_data *data = &e->data; + seq_printf(f, "%s %d %d %d 0x%.2x%.2x%.2x 0x%.4x %d %d %u\n", + batostr(&data->bdaddr), + data->pscan_rep_mode, data->pscan_period_mode, + data->pscan_mode, data->dev_class[2], + data->dev_class[1], data->dev_class[0], + __le16_to_cpu(data->clock_offset), + data->rssi, data->ssp_mode, e->timestamp); + } + + hci_dev_unlock_bh(hdev); + + return 0; +} + +static int inquiry_cache_open(struct inode *inode, struct file *file) +{ + return single_open(file, inquiry_cache_show, inode->i_private); +} + +static const struct file_operations inquiry_cache_fops = { + .open = inquiry_cache_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int blacklist_show(struct seq_file *f, void *p) +{ + struct hci_dev *hdev = f->private; + struct list_head *l; + + hci_dev_lock_bh(hdev); + + list_for_each(l, &hdev->blacklist) { + struct bdaddr_list *b; + + b = list_entry(l, struct bdaddr_list, list); + + seq_printf(f, "%s\n", batostr(&b->bdaddr)); + } + + hci_dev_unlock_bh(hdev); + + return 0; +} + +static int blacklist_open(struct inode *inode, struct file *file) +{ + return single_open(file, blacklist_show, inode->i_private); +} + +static const struct file_operations blacklist_fops = { + .open = blacklist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void print_bt_uuid(struct seq_file *f, u8 *uuid) +{ + u32 data0, data4; + u16 data1, data2, data3, data5; + + memcpy(&data0, &uuid[0], 4); + memcpy(&data1, &uuid[4], 2); + memcpy(&data2, &uuid[6], 2); + memcpy(&data3, &uuid[8], 2); + memcpy(&data4, &uuid[10], 4); + memcpy(&data5, &uuid[14], 2); + + seq_printf(f, "%.8x-%.4x-%.4x-%.4x-%.8x%.4x\n", + ntohl(data0), ntohs(data1), ntohs(data2), + ntohs(data3), ntohl(data4), ntohs(data5)); +} + +static int uuids_show(struct seq_file *f, void *p) +{ + struct hci_dev *hdev = f->private; + struct list_head *l; + + hci_dev_lock_bh(hdev); + + list_for_each(l, &hdev->uuids) { + struct bt_uuid *uuid; + + uuid = list_entry(l, struct bt_uuid, list); + + print_bt_uuid(f, uuid->uuid); + } + + hci_dev_unlock_bh(hdev); + + return 0; +} + +static int uuids_open(struct inode *inode, struct file *file) +{ + return single_open(file, uuids_show, inode->i_private); +} + +static const struct file_operations uuids_fops = { + .open = uuids_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int auto_accept_delay_set(void *data, u64 val) +{ + struct hci_dev *hdev = data; + + hci_dev_lock_bh(hdev); + + hdev->auto_accept_delay = val; + + hci_dev_unlock_bh(hdev); + + return 0; +} + +static int auto_accept_delay_get(void *data, u64 *val) +{ + struct hci_dev *hdev = data; + + hci_dev_lock_bh(hdev); + + *val = hdev->auto_accept_delay; + + hci_dev_unlock_bh(hdev); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(auto_accept_delay_fops, auto_accept_delay_get, + auto_accept_delay_set, "%llu\n"); + +int hci_register_sysfs(struct hci_dev *hdev) +{ + struct device *dev = &hdev->dev; + int err; + + 
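+	/* The controller is exposed twice: as a class device carrying the
+	 * attribute groups defined above, and (when debugfs is available) as
+	 * a per-controller directory with the inquiry_cache, blacklist, uuids
+	 * and auto_accept_delay entries created below. */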
BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus); + + dev->type = &bt_host; + dev->class = bt_class; + dev->parent = hdev->parent; + + dev_set_name(dev, "%s", hdev->name); + + dev_set_drvdata(dev, hdev); + + err = device_register(dev); + if (err < 0) + return err; + + if (!bt_debugfs) + return 0; + + hdev->debugfs = debugfs_create_dir(hdev->name, bt_debugfs); + if (!hdev->debugfs) + return 0; + + debugfs_create_file("inquiry_cache", 0444, hdev->debugfs, + hdev, &inquiry_cache_fops); + + debugfs_create_file("blacklist", 0444, hdev->debugfs, + hdev, &blacklist_fops); + + debugfs_create_file("uuids", 0444, hdev->debugfs, hdev, &uuids_fops); + + debugfs_create_file("auto_accept_delay", 0444, hdev->debugfs, hdev, + &auto_accept_delay_fops); + return 0; +} + +void hci_unregister_sysfs(struct hci_dev *hdev) +{ + BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus); + + debugfs_remove_recursive(hdev->debugfs); + + device_del(&hdev->dev); +} + +int __init bt_sysfs_init(void) +{ + bt_debugfs = debugfs_create_dir("bluetooth", NULL); + + bt_class = class_create(THIS_MODULE, "bluetooth"); + if (IS_ERR(bt_class)) + return PTR_ERR(bt_class); + + return 0; +} + +void bt_sysfs_cleanup(void) +{ + class_destroy(bt_class); + + debugfs_remove_recursive(bt_debugfs); +} diff --git a/net/bluetooth/hidp/Kconfig b/net/bluetooth/hidp/Kconfig new file mode 100644 index 00000000..86a91543 --- /dev/null +++ b/net/bluetooth/hidp/Kconfig @@ -0,0 +1,12 @@ +config BT_HIDP + tristate "HIDP protocol support" + depends on BT && BT_L2CAP && INPUT && HID_SUPPORT + select HID + help + HIDP (Human Interface Device Protocol) is a transport layer + for HID reports. HIDP is required for the Bluetooth Human + Interface Device Profile. + + Say Y here to compile HIDP support into the kernel or say M to + compile it as module (hidp). + diff --git a/net/bluetooth/hidp/Makefile b/net/bluetooth/hidp/Makefile new file mode 100644 index 00000000..a9ee1156 --- /dev/null +++ b/net/bluetooth/hidp/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux Bluetooth HIDP layer +# + +obj-$(CONFIG_BT_HIDP) += hidp.o + +hidp-objs := core.o sock.o diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c new file mode 100644 index 00000000..fb68f344 --- /dev/null +++ b/net/bluetooth/hidp/core.c @@ -0,0 +1,1220 @@ +/* + HIDP implementation for Linux Bluetooth stack (BlueZ). + Copyright (C) 2003-2004 Marcel Holtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. 
+*/ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include "hidp.h" + +#define VERSION "1.2" + +static DECLARE_RWSEM(hidp_session_sem); +static LIST_HEAD(hidp_session_list); + +static unsigned char hidp_keycode[256] = { + 0, 0, 0, 0, 30, 48, 46, 32, 18, 33, 34, 35, 23, 36, + 37, 38, 50, 49, 24, 25, 16, 19, 31, 20, 22, 47, 17, 45, + 21, 44, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 1, + 14, 15, 57, 12, 13, 26, 27, 43, 43, 39, 40, 41, 51, 52, + 53, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 87, 88, + 99, 70, 119, 110, 102, 104, 111, 107, 109, 106, 105, 108, 103, 69, + 98, 55, 74, 78, 96, 79, 80, 81, 75, 76, 77, 71, 72, 73, + 82, 83, 86, 127, 116, 117, 183, 184, 185, 186, 187, 188, 189, 190, + 191, 192, 193, 194, 134, 138, 130, 132, 128, 129, 131, 137, 133, 135, + 136, 113, 115, 114, 0, 0, 0, 121, 0, 89, 93, 124, 92, 94, + 95, 0, 0, 0, 122, 123, 90, 91, 85, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 29, 42, 56, 125, 97, 54, 100, 126, 164, 166, 165, 163, 161, 115, + 114, 113, 150, 158, 159, 128, 136, 177, 178, 176, 142, 152, 173, 140 +}; + +static unsigned char hidp_mkeyspat[] = { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }; + +static struct hidp_session *__hidp_get_session(bdaddr_t *bdaddr) +{ + struct hidp_session *session; + struct list_head *p; + + BT_DBG(""); + + list_for_each(p, &hidp_session_list) { + session = list_entry(p, struct hidp_session, list); + if (!bacmp(bdaddr, &session->bdaddr)) + return session; + } + return NULL; +} + +static void __hidp_link_session(struct hidp_session *session) +{ + __module_get(THIS_MODULE); + list_add(&session->list, &hidp_session_list); + + hci_conn_hold_device(session->conn); +} + +static void __hidp_unlink_session(struct hidp_session *session) +{ + hci_conn_put_device(session->conn); + + list_del(&session->list); + module_put(THIS_MODULE); +} + +static void __hidp_copy_session(struct hidp_session *session, struct hidp_conninfo *ci) +{ + memset(ci, 0, sizeof(*ci)); + bacpy(&ci->bdaddr, &session->bdaddr); + + ci->flags = session->flags; + ci->state = session->state; + + ci->vendor = 0x0000; + ci->product = 0x0000; + ci->version = 0x0000; + + if (session->input) { + ci->vendor = session->input->id.vendor; + ci->product = session->input->id.product; + ci->version = session->input->id.version; + if (session->input->name) + strncpy(ci->name, session->input->name, 128); + else + strncpy(ci->name, "HID Boot Device", 128); + } + + if (session->hid) { + ci->vendor = session->hid->vendor; + ci->product = session->hid->product; + ci->version = session->hid->version; + strncpy(ci->name, session->hid->name, 128); + } +} + +static int hidp_queue_event(struct hidp_session *session, struct input_dev *dev, + unsigned int type, unsigned int code, int value) +{ + unsigned char newleds; + struct sk_buff *skb; + + BT_DBG("session %p type %d code %d value %d", session, type, code, value); + + if (type != EV_LED) + return -1; + + newleds = (!!test_bit(LED_KANA, dev->led) << 3) | + (!!test_bit(LED_COMPOSE, dev->led) << 3) | + (!!test_bit(LED_SCROLLL, dev->led) << 2) | + (!!test_bit(LED_CAPSL, dev->led) << 1) | + (!!test_bit(LED_NUML, dev->led)); + + if (session->leds == newleds) + return 0; + 
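+	/* The LED state changed: queue a boot protocol output report
+	 * (report id 0x01, one payload byte) on the interrupt channel; in the
+	 * boot protocol bits 0..4 are Num Lock, Caps Lock, Scroll Lock,
+	 * Compose and Kana. */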
+ session->leds = newleds; + + skb = alloc_skb(3, GFP_ATOMIC); + if (!skb) { + BT_ERR("Can't allocate memory for new frame"); + return -ENOMEM; + } + + *skb_put(skb, 1) = HIDP_TRANS_DATA | HIDP_DATA_RTYPE_OUPUT; + *skb_put(skb, 1) = 0x01; + *skb_put(skb, 1) = newleds; + + skb_queue_tail(&session->intr_transmit, skb); + + hidp_schedule(session); + + return 0; +} + +static int hidp_hidinput_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) +{ + struct hid_device *hid = input_get_drvdata(dev); + struct hidp_session *session = hid->driver_data; + + return hidp_queue_event(session, dev, type, code, value); +} + +static int hidp_input_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) +{ + struct hidp_session *session = input_get_drvdata(dev); + + return hidp_queue_event(session, dev, type, code, value); +} + +static void hidp_input_report(struct hidp_session *session, struct sk_buff *skb) +{ + struct input_dev *dev = session->input; + unsigned char *keys = session->keys; + unsigned char *udata = skb->data + 1; + signed char *sdata = skb->data + 1; + int i, size = skb->len - 1; + + switch (skb->data[0]) { + case 0x01: /* Keyboard report */ + for (i = 0; i < 8; i++) + input_report_key(dev, hidp_keycode[i + 224], (udata[0] >> i) & 1); + + /* If all the key codes have been set to 0x01, it means + * too many keys were pressed at the same time. */ + if (!memcmp(udata + 2, hidp_mkeyspat, 6)) + break; + + for (i = 2; i < 8; i++) { + if (keys[i] > 3 && memscan(udata + 2, keys[i], 6) == udata + 8) { + if (hidp_keycode[keys[i]]) + input_report_key(dev, hidp_keycode[keys[i]], 0); + else + BT_ERR("Unknown key (scancode %#x) released.", keys[i]); + } + + if (udata[i] > 3 && memscan(keys + 2, udata[i], 6) == keys + 8) { + if (hidp_keycode[udata[i]]) + input_report_key(dev, hidp_keycode[udata[i]], 1); + else + BT_ERR("Unknown key (scancode %#x) pressed.", udata[i]); + } + } + + memcpy(keys, udata, 8); + break; + + case 0x02: /* Mouse report */ + input_report_key(dev, BTN_LEFT, sdata[0] & 0x01); + input_report_key(dev, BTN_RIGHT, sdata[0] & 0x02); + input_report_key(dev, BTN_MIDDLE, sdata[0] & 0x04); + input_report_key(dev, BTN_SIDE, sdata[0] & 0x08); + input_report_key(dev, BTN_EXTRA, sdata[0] & 0x10); + + input_report_rel(dev, REL_X, sdata[1]); + input_report_rel(dev, REL_Y, sdata[2]); + + if (size > 3) + input_report_rel(dev, REL_WHEEL, sdata[3]); + break; + } + + input_sync(dev); +} + +static int __hidp_send_ctrl_message(struct hidp_session *session, + unsigned char hdr, unsigned char *data, int size) +{ + struct sk_buff *skb; + + BT_DBG("session %p data %p size %d", session, data, size); + + skb = alloc_skb(size + 1, GFP_ATOMIC); + if (!skb) { + BT_ERR("Can't allocate memory for new frame"); + return -ENOMEM; + } + + *skb_put(skb, 1) = hdr; + if (data && size > 0) + memcpy(skb_put(skb, size), data, size); + + skb_queue_tail(&session->ctrl_transmit, skb); + + return 0; +} + +static inline int hidp_send_ctrl_message(struct hidp_session *session, + unsigned char hdr, unsigned char *data, int size) +{ + int err; + + err = __hidp_send_ctrl_message(session, hdr, data, size); + + hidp_schedule(session); + + return err; +} + +static int hidp_queue_report(struct hidp_session *session, + unsigned char *data, int size) +{ + struct sk_buff *skb; + + BT_DBG("session %p hid %p data %p size %d", session, session->hid, data, size); + + skb = alloc_skb(size + 1, GFP_ATOMIC); + if (!skb) { + BT_ERR("Can't allocate memory for new frame"); + return -ENOMEM; + } + + 
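+	/* 0xa2 is HIDP_TRANS_DATA | HIDP_DATA_RTYPE_OUPUT: reports queued
+	 * here are sent as DATA/Output transactions on the interrupt channel. */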
*skb_put(skb, 1) = 0xa2; + if (size > 0) + memcpy(skb_put(skb, size), data, size); + + skb_queue_tail(&session->intr_transmit, skb); + + hidp_schedule(session); + + return 0; +} + +static int hidp_send_report(struct hidp_session *session, struct hid_report *report) +{ + unsigned char buf[32]; + int rsize; + + rsize = ((report->size - 1) >> 3) + 1 + (report->id > 0); + if (rsize > sizeof(buf)) + return -EIO; + + hid_output_report(report, buf); + + return hidp_queue_report(session, buf, rsize); +} + +static int hidp_get_raw_report(struct hid_device *hid, + unsigned char report_number, + unsigned char *data, size_t count, + unsigned char report_type) +{ + struct hidp_session *session = hid->driver_data; + struct sk_buff *skb; + size_t len; + int numbered_reports = hid->report_enum[report_type].numbered; + + switch (report_type) { + case HID_FEATURE_REPORT: + report_type = HIDP_TRANS_GET_REPORT | HIDP_DATA_RTYPE_FEATURE; + break; + case HID_INPUT_REPORT: + report_type = HIDP_TRANS_GET_REPORT | HIDP_DATA_RTYPE_INPUT; + break; + case HID_OUTPUT_REPORT: + report_type = HIDP_TRANS_GET_REPORT | HIDP_DATA_RTYPE_OUPUT; + break; + default: + return -EINVAL; + } + + if (mutex_lock_interruptible(&session->report_mutex)) + return -ERESTARTSYS; + + /* Set up our wait, and send the report request to the device. */ + session->waiting_report_type = report_type & HIDP_DATA_RTYPE_MASK; + session->waiting_report_number = numbered_reports ? report_number : -1; + set_bit(HIDP_WAITING_FOR_RETURN, &session->flags); + data[0] = report_number; + if (hidp_send_ctrl_message(hid->driver_data, report_type, data, 1)) + goto err_eio; + + /* Wait for the return of the report. The returned report + gets put in session->report_return. */ + while (test_bit(HIDP_WAITING_FOR_RETURN, &session->flags)) { + int res; + + res = wait_event_interruptible_timeout(session->report_queue, + !test_bit(HIDP_WAITING_FOR_RETURN, &session->flags), + 5*HZ); + if (res == 0) { + /* timeout */ + goto err_eio; + } + if (res < 0) { + /* signal */ + goto err_restartsys; + } + } + + skb = session->report_return; + if (skb) { + len = skb->len < count ? skb->len : count; + memcpy(data, skb->data, len); + + kfree_skb(skb); + session->report_return = NULL; + } else { + /* Device returned a HANDSHAKE, indicating protocol error. */ + len = -EIO; + } + + clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags); + mutex_unlock(&session->report_mutex); + + return len; + +err_restartsys: + clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags); + mutex_unlock(&session->report_mutex); + return -ERESTARTSYS; +err_eio: + clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags); + mutex_unlock(&session->report_mutex); + return -EIO; +} + +static int hidp_output_raw_report(struct hid_device *hid, unsigned char *data, size_t count, + unsigned char report_type) +{ + struct hidp_session *session = hid->driver_data; + int ret; + + switch (report_type) { + case HID_FEATURE_REPORT: + report_type = HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_FEATURE; + break; + case HID_OUTPUT_REPORT: + report_type = HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_OUPUT; + break; + default: + return -EINVAL; + } + + if (mutex_lock_interruptible(&session->report_mutex)) + return -ERESTARTSYS; + + /* Set up our wait, and send the report request to the device. */ + set_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags); + if (hidp_send_ctrl_message(hid->driver_data, report_type, + data, count)) { + ret = -ENOMEM; + goto err; + } + + /* Wait for the ACK from the device. 
*/ + while (test_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags)) { + int res; + + res = wait_event_interruptible_timeout(session->report_queue, + !test_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags), + 10*HZ); + if (res == 0) { + /* timeout */ + ret = -EIO; + goto err; + } + if (res < 0) { + /* signal */ + ret = -ERESTARTSYS; + goto err; + } + } + + if (!session->output_report_success) { + ret = -EIO; + goto err; + } + + ret = count; + +err: + clear_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags); + mutex_unlock(&session->report_mutex); + return ret; +} + +static void hidp_idle_timeout(unsigned long arg) +{ + struct hidp_session *session = (struct hidp_session *) arg; + + atomic_inc(&session->terminate); + wake_up_process(session->task); +} + +static void hidp_set_timer(struct hidp_session *session) +{ + if (session->idle_to > 0) + mod_timer(&session->timer, jiffies + HZ * session->idle_to); +} + +static inline void hidp_del_timer(struct hidp_session *session) +{ + if (session->idle_to > 0) + del_timer(&session->timer); +} + +static void hidp_process_handshake(struct hidp_session *session, + unsigned char param) +{ + BT_DBG("session %p param 0x%02x", session, param); + session->output_report_success = 0; /* default condition */ + + switch (param) { + case HIDP_HSHK_SUCCESSFUL: + /* FIXME: Call into SET_ GET_ handlers here */ + session->output_report_success = 1; + break; + + case HIDP_HSHK_NOT_READY: + case HIDP_HSHK_ERR_INVALID_REPORT_ID: + case HIDP_HSHK_ERR_UNSUPPORTED_REQUEST: + case HIDP_HSHK_ERR_INVALID_PARAMETER: + if (test_bit(HIDP_WAITING_FOR_RETURN, &session->flags)) { + clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags); + wake_up_interruptible(&session->report_queue); + } + /* FIXME: Call into SET_ GET_ handlers here */ + break; + + case HIDP_HSHK_ERR_UNKNOWN: + break; + + case HIDP_HSHK_ERR_FATAL: + /* Device requests a reboot, as this is the only way this error + * can be recovered. */ + __hidp_send_ctrl_message(session, + HIDP_TRANS_HID_CONTROL | HIDP_CTRL_SOFT_RESET, NULL, 0); + break; + + default: + __hidp_send_ctrl_message(session, + HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_INVALID_PARAMETER, NULL, 0); + break; + } + + /* Wake up the waiting thread. */ + if (test_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags)) { + clear_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags); + wake_up_interruptible(&session->report_queue); + } +} + +static void hidp_process_hid_control(struct hidp_session *session, + unsigned char param) +{ + BT_DBG("session %p param 0x%02x", session, param); + + if (param == HIDP_CTRL_VIRTUAL_CABLE_UNPLUG) { + /* Flush the transmit queues */ + skb_queue_purge(&session->ctrl_transmit); + skb_queue_purge(&session->intr_transmit); + + atomic_inc(&session->terminate); + wake_up_process(current); + } +} + +/* Returns true if the passed-in skb should be freed by the caller. 
*/ +static int hidp_process_data(struct hidp_session *session, struct sk_buff *skb, + unsigned char param) +{ + int done_with_skb = 1; + BT_DBG("session %p skb %p len %d param 0x%02x", session, skb, skb->len, param); + + switch (param) { + case HIDP_DATA_RTYPE_INPUT: + hidp_set_timer(session); + + if (session->input) + hidp_input_report(session, skb); + + if (session->hid) + hid_input_report(session->hid, HID_INPUT_REPORT, skb->data, skb->len, 0); + break; + + case HIDP_DATA_RTYPE_OTHER: + case HIDP_DATA_RTYPE_OUPUT: + case HIDP_DATA_RTYPE_FEATURE: + break; + + default: + __hidp_send_ctrl_message(session, + HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_INVALID_PARAMETER, NULL, 0); + } + + if (test_bit(HIDP_WAITING_FOR_RETURN, &session->flags) && + param == session->waiting_report_type) { + if (session->waiting_report_number < 0 || + session->waiting_report_number == skb->data[0]) { + /* hidp_get_raw_report() is waiting on this report. */ + session->report_return = skb; + done_with_skb = 0; + clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags); + wake_up_interruptible(&session->report_queue); + } + } + + return done_with_skb; +} + +static void hidp_recv_ctrl_frame(struct hidp_session *session, + struct sk_buff *skb) +{ + unsigned char hdr, type, param; + int free_skb = 1; + + BT_DBG("session %p skb %p len %d", session, skb, skb->len); + + hdr = skb->data[0]; + skb_pull(skb, 1); + + type = hdr & HIDP_HEADER_TRANS_MASK; + param = hdr & HIDP_HEADER_PARAM_MASK; + + switch (type) { + case HIDP_TRANS_HANDSHAKE: + hidp_process_handshake(session, param); + break; + + case HIDP_TRANS_HID_CONTROL: + hidp_process_hid_control(session, param); + break; + + case HIDP_TRANS_DATA: + free_skb = hidp_process_data(session, skb, param); + break; + + default: + __hidp_send_ctrl_message(session, + HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_UNSUPPORTED_REQUEST, NULL, 0); + break; + } + + if (free_skb) + kfree_skb(skb); +} + +static void hidp_recv_intr_frame(struct hidp_session *session, + struct sk_buff *skb) +{ + unsigned char hdr; + + BT_DBG("session %p skb %p len %d", session, skb, skb->len); + + hdr = skb->data[0]; + skb_pull(skb, 1); + + if (hdr == (HIDP_TRANS_DATA | HIDP_DATA_RTYPE_INPUT)) { + hidp_set_timer(session); + + if (session->input) + hidp_input_report(session, skb); + + if (session->hid) { + hid_input_report(session->hid, HID_INPUT_REPORT, skb->data, skb->len, 1); + BT_DBG("report len %d", skb->len); + } + } else { + BT_DBG("Unsupported protocol header 0x%02x", hdr); + } + + kfree_skb(skb); +} + +static int hidp_send_frame(struct socket *sock, unsigned char *data, int len) +{ + struct kvec iv = { data, len }; + struct msghdr msg; + + BT_DBG("sock %p data %p len %d", sock, data, len); + + if (!len) + return 0; + + memset(&msg, 0, sizeof(msg)); + + return kernel_sendmsg(sock, &msg, &iv, 1, len); +} + +static void hidp_process_transmit(struct hidp_session *session) +{ + struct sk_buff *skb; + + BT_DBG("session %p", session); + + while ((skb = skb_dequeue(&session->ctrl_transmit))) { + if (hidp_send_frame(session->ctrl_sock, skb->data, skb->len) < 0) { + skb_queue_head(&session->ctrl_transmit, skb); + break; + } + + hidp_set_timer(session); + kfree_skb(skb); + } + + while ((skb = skb_dequeue(&session->intr_transmit))) { + if (hidp_send_frame(session->intr_sock, skb->data, skb->len) < 0) { + skb_queue_head(&session->intr_transmit, skb); + break; + } + + hidp_set_timer(session); + kfree_skb(skb); + } +} + +static int hidp_session(void *arg) +{ + struct hidp_session *session = arg; + struct sock *ctrl_sk = 
session->ctrl_sock->sk; + struct sock *intr_sk = session->intr_sock->sk; + struct sk_buff *skb; + wait_queue_t ctrl_wait, intr_wait; + + BT_DBG("session %p", session); + + set_user_nice(current, -15); + + init_waitqueue_entry(&ctrl_wait, current); + init_waitqueue_entry(&intr_wait, current); + add_wait_queue(sk_sleep(ctrl_sk), &ctrl_wait); + add_wait_queue(sk_sleep(intr_sk), &intr_wait); + session->waiting_for_startup = 0; + wake_up_interruptible(&session->startup_queue); + set_current_state(TASK_INTERRUPTIBLE); + while (!atomic_read(&session->terminate)) { + if (ctrl_sk->sk_state != BT_CONNECTED || + intr_sk->sk_state != BT_CONNECTED) + break; + + while ((skb = skb_dequeue(&ctrl_sk->sk_receive_queue))) { + skb_orphan(skb); + hidp_recv_ctrl_frame(session, skb); + } + + while ((skb = skb_dequeue(&intr_sk->sk_receive_queue))) { + skb_orphan(skb); + hidp_recv_intr_frame(session, skb); + } + + hidp_process_transmit(session); + + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + set_current_state(TASK_RUNNING); + remove_wait_queue(sk_sleep(intr_sk), &intr_wait); + remove_wait_queue(sk_sleep(ctrl_sk), &ctrl_wait); + + down_write(&hidp_session_sem); + + hidp_del_timer(session); + + if (session->input) { + input_unregister_device(session->input); + session->input = NULL; + } + + if (session->hid) { + hid_destroy_device(session->hid); + session->hid = NULL; + } + + /* Wakeup user-space polling for socket errors */ + session->intr_sock->sk->sk_err = EUNATCH; + session->ctrl_sock->sk->sk_err = EUNATCH; + + hidp_schedule(session); + + fput(session->intr_sock->file); + + wait_event_timeout(*(sk_sleep(ctrl_sk)), + (ctrl_sk->sk_state == BT_CLOSED), msecs_to_jiffies(500)); + + fput(session->ctrl_sock->file); + + __hidp_unlink_session(session); + + up_write(&hidp_session_sem); + + kfree(session->rd_data); + kfree(session); + return 0; +} + +static struct device *hidp_get_device(struct hidp_session *session) +{ + bdaddr_t *src = &bt_sk(session->ctrl_sock->sk)->src; + bdaddr_t *dst = &bt_sk(session->ctrl_sock->sk)->dst; + struct device *device = NULL; + struct hci_dev *hdev; + + hdev = hci_get_route(dst, src); + if (!hdev) + return NULL; + + session->conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst); + if (session->conn) + device = &session->conn->dev; + + hci_dev_put(hdev); + + return device; +} + +static int hidp_setup_input(struct hidp_session *session, + struct hidp_connadd_req *req) +{ + struct input_dev *input; + int err, i; + + input = input_allocate_device(); + if (!input) + return -ENOMEM; + + session->input = input; + + input_set_drvdata(input, session); + + input->name = "Bluetooth HID Boot Protocol Device"; + + input->id.bustype = BUS_BLUETOOTH; + input->id.vendor = req->vendor; + input->id.product = req->product; + input->id.version = req->version; + + if (req->subclass & 0x40) { + set_bit(EV_KEY, input->evbit); + set_bit(EV_LED, input->evbit); + set_bit(EV_REP, input->evbit); + + set_bit(LED_NUML, input->ledbit); + set_bit(LED_CAPSL, input->ledbit); + set_bit(LED_SCROLLL, input->ledbit); + set_bit(LED_COMPOSE, input->ledbit); + set_bit(LED_KANA, input->ledbit); + + for (i = 0; i < sizeof(hidp_keycode); i++) + set_bit(hidp_keycode[i], input->keybit); + clear_bit(0, input->keybit); + } + + if (req->subclass & 0x80) { + input->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL); + input->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_RIGHT) | BIT_MASK(BTN_MIDDLE); + input->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); + input->keybit[BIT_WORD(BTN_MOUSE)] |= 
BIT_MASK(BTN_SIDE) | + BIT_MASK(BTN_EXTRA); + input->relbit[0] |= BIT_MASK(REL_WHEEL); + } + + input->dev.parent = hidp_get_device(session); + + input->event = hidp_input_event; + + err = input_register_device(input); + if (err < 0) { + input_free_device(input); + session->input = NULL; + return err; + } + + return 0; +} + +static int hidp_open(struct hid_device *hid) +{ + return 0; +} + +static void hidp_close(struct hid_device *hid) +{ +} + +static int hidp_parse(struct hid_device *hid) +{ + struct hidp_session *session = hid->driver_data; + + return hid_parse_report(session->hid, session->rd_data, + session->rd_size); +} + +static int hidp_start(struct hid_device *hid) +{ + struct hidp_session *session = hid->driver_data; + struct hid_report *report; + + list_for_each_entry(report, &hid->report_enum[HID_INPUT_REPORT]. + report_list, list) + hidp_send_report(session, report); + + list_for_each_entry(report, &hid->report_enum[HID_FEATURE_REPORT]. + report_list, list) + hidp_send_report(session, report); + + return 0; +} + +static void hidp_stop(struct hid_device *hid) +{ + struct hidp_session *session = hid->driver_data; + + skb_queue_purge(&session->ctrl_transmit); + skb_queue_purge(&session->intr_transmit); + + hid->claimed = 0; +} + +static struct hid_ll_driver hidp_hid_driver = { + .parse = hidp_parse, + .start = hidp_start, + .stop = hidp_stop, + .open = hidp_open, + .close = hidp_close, + .hidinput_input_event = hidp_hidinput_event, +}; + +/* This function sets up the hid device. It does not add it + to the HID system. That is done in hidp_add_connection(). */ +static int hidp_setup_hid(struct hidp_session *session, + struct hidp_connadd_req *req) +{ + struct hid_device *hid; + int err; + + session->rd_data = kzalloc(req->rd_size, GFP_KERNEL); + if (!session->rd_data) + return -ENOMEM; + + if (copy_from_user(session->rd_data, req->rd_data, req->rd_size)) { + err = -EFAULT; + goto fault; + } + session->rd_size = req->rd_size; + + hid = hid_allocate_device(); + if (IS_ERR(hid)) { + err = PTR_ERR(hid); + goto fault; + } + + session->hid = hid; + + hid->driver_data = session; + + hid->bus = BUS_BLUETOOTH; + hid->vendor = req->vendor; + hid->product = req->product; + hid->version = req->version; + hid->country = req->country; + + strncpy(hid->name, req->name, 128); + strncpy(hid->phys, batostr(&bt_sk(session->ctrl_sock->sk)->src), 64); + strncpy(hid->uniq, batostr(&bt_sk(session->ctrl_sock->sk)->dst), 64); + + hid->dev.parent = hidp_get_device(session); + hid->ll_driver = &hidp_hid_driver; + + hid->hid_get_raw_report = hidp_get_raw_report; + hid->hid_output_raw_report = hidp_output_raw_report; + + return 0; + +fault: + kfree(session->rd_data); + session->rd_data = NULL; + + return err; +} + +int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock) +{ + struct hidp_session *session, *s; + int vendor, product; + int err; + + BT_DBG(""); + + if (bacmp(&bt_sk(ctrl_sock->sk)->src, &bt_sk(intr_sock->sk)->src) || + bacmp(&bt_sk(ctrl_sock->sk)->dst, &bt_sk(intr_sock->sk)->dst)) + return -ENOTUNIQ; + + session = kzalloc(sizeof(struct hidp_session), GFP_KERNEL); + if (!session) + return -ENOMEM; + + BT_DBG("rd_data %p rd_size %d", req->rd_data, req->rd_size); + + down_write(&hidp_session_sem); + + s = __hidp_get_session(&bt_sk(ctrl_sock->sk)->dst); + if (s && s->state == BT_CONNECTED) { + err = -EEXIST; + goto failed; + } + + bacpy(&session->bdaddr, &bt_sk(ctrl_sock->sk)->dst); + + session->ctrl_mtu = min_t(uint, l2cap_pi(ctrl_sock->sk)->chan->omtu, 
+ l2cap_pi(ctrl_sock->sk)->chan->imtu); + session->intr_mtu = min_t(uint, l2cap_pi(intr_sock->sk)->chan->omtu, + l2cap_pi(intr_sock->sk)->chan->imtu); + + BT_DBG("ctrl mtu %d intr mtu %d", session->ctrl_mtu, session->intr_mtu); + + session->ctrl_sock = ctrl_sock; + session->intr_sock = intr_sock; + session->state = BT_CONNECTED; + + setup_timer(&session->timer, hidp_idle_timeout, (unsigned long)session); + + skb_queue_head_init(&session->ctrl_transmit); + skb_queue_head_init(&session->intr_transmit); + + mutex_init(&session->report_mutex); + init_waitqueue_head(&session->report_queue); + init_waitqueue_head(&session->startup_queue); + session->waiting_for_startup = 1; + session->flags = req->flags & (1 << HIDP_BLUETOOTH_VENDOR_ID); + session->idle_to = req->idle_to; + + if (req->rd_size > 0) { + err = hidp_setup_hid(session, req); + if (err && err != -ENODEV) + goto purge; + } + + if (!session->hid) { + err = hidp_setup_input(session, req); + if (err < 0) + goto purge; + } + + __hidp_link_session(session); + + hidp_set_timer(session); + + if (session->hid) { + vendor = session->hid->vendor; + product = session->hid->product; + } else if (session->input) { + vendor = session->input->id.vendor; + product = session->input->id.product; + } else { + vendor = 0x0000; + product = 0x0000; + } + + session->task = kthread_run(hidp_session, session, "khidpd_%04x%04x", + vendor, product); + if (IS_ERR(session->task)) { + err = PTR_ERR(session->task); + goto unlink; + } + + while (session->waiting_for_startup) { + wait_event_interruptible(session->startup_queue, + !session->waiting_for_startup); + } + + err = hid_add_device(session->hid); + if (err < 0) { + atomic_inc(&session->terminate); + wake_up_process(session->task); + up_write(&hidp_session_sem); + return err; + } + + if (session->input) { + hidp_send_ctrl_message(session, + HIDP_TRANS_SET_PROTOCOL | HIDP_PROTO_BOOT, NULL, 0); + session->flags |= (1 << HIDP_BOOT_PROTOCOL_MODE); + + session->leds = 0xff; + hidp_input_event(session->input, EV_LED, 0, 0); + } + + up_write(&hidp_session_sem); + return 0; + +unlink: + hidp_del_timer(session); + + __hidp_unlink_session(session); + + if (session->input) { + input_unregister_device(session->input); + session->input = NULL; + } + + if (session->hid) { + hid_destroy_device(session->hid); + session->hid = NULL; + } + + kfree(session->rd_data); + session->rd_data = NULL; + +purge: + skb_queue_purge(&session->ctrl_transmit); + skb_queue_purge(&session->intr_transmit); + +failed: + up_write(&hidp_session_sem); + + kfree(session); + return err; +} + +int hidp_del_connection(struct hidp_conndel_req *req) +{ + struct hidp_session *session; + int err = 0; + + BT_DBG(""); + + down_read(&hidp_session_sem); + + session = __hidp_get_session(&req->bdaddr); + if (session) { + if (req->flags & (1 << HIDP_VIRTUAL_CABLE_UNPLUG)) { + hidp_send_ctrl_message(session, + HIDP_TRANS_HID_CONTROL | HIDP_CTRL_VIRTUAL_CABLE_UNPLUG, NULL, 0); + } else { + /* Flush the transmit queues */ + skb_queue_purge(&session->ctrl_transmit); + skb_queue_purge(&session->intr_transmit); + + atomic_inc(&session->terminate); + wake_up_process(session->task); + } + } else + err = -ENOENT; + + up_read(&hidp_session_sem); + return err; +} + +int hidp_get_connlist(struct hidp_connlist_req *req) +{ + struct list_head *p; + int err = 0, n = 0; + + BT_DBG(""); + + down_read(&hidp_session_sem); + + list_for_each(p, &hidp_session_list) { + struct hidp_session *session; + struct hidp_conninfo ci; + + session = list_entry(p, struct hidp_session, list); + + 
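This loop walks hidp_session_list with an open-coded list_for_each()/list_entry() pair. For comparison only, a minimal sketch of the same traversal using the list_for_each_entry() helper (same list head and member names as in the surrounding code) would be:

	struct hidp_session *s;

	list_for_each_entry(s, &hidp_session_list, list) {
		struct hidp_conninfo ci;

		__hidp_copy_session(s, &ci);
		/* copy_to_user(), the cnum accounting and req->ci++ as in this loop */
	}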
__hidp_copy_session(session, &ci); + + if (copy_to_user(req->ci, &ci, sizeof(ci))) { + err = -EFAULT; + break; + } + + if (++n >= req->cnum) + break; + + req->ci++; + } + req->cnum = n; + + up_read(&hidp_session_sem); + return err; +} + +int hidp_get_conninfo(struct hidp_conninfo *ci) +{ + struct hidp_session *session; + int err = 0; + + down_read(&hidp_session_sem); + + session = __hidp_get_session(&ci->bdaddr); + if (session) + __hidp_copy_session(session, ci); + else + err = -ENOENT; + + up_read(&hidp_session_sem); + return err; +} + +static const struct hid_device_id hidp_table[] = { + { HID_BLUETOOTH_DEVICE(HID_ANY_ID, HID_ANY_ID) }, + { } +}; + +static struct hid_driver hidp_driver = { + .name = "generic-bluetooth", + .id_table = hidp_table, +}; + +static int __init hidp_init(void) +{ + int ret; + + BT_INFO("HIDP (Human Interface Emulation) ver %s", VERSION); + + ret = hid_register_driver(&hidp_driver); + if (ret) + goto err; + + ret = hidp_init_sockets(); + if (ret) + goto err_drv; + + return 0; +err_drv: + hid_unregister_driver(&hidp_driver); +err: + return ret; +} + +static void __exit hidp_exit(void) +{ + hidp_cleanup_sockets(); + hid_unregister_driver(&hidp_driver); +} + +module_init(hidp_init); +module_exit(hidp_exit); + +MODULE_AUTHOR("Marcel Holtmann "); +MODULE_DESCRIPTION("Bluetooth HIDP ver " VERSION); +MODULE_VERSION(VERSION); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("bt-proto-6"); diff --git a/net/bluetooth/hidp/hidp.h b/net/bluetooth/hidp/hidp.h new file mode 100644 index 00000000..af1bcc82 --- /dev/null +++ b/net/bluetooth/hidp/hidp.h @@ -0,0 +1,191 @@ +/* + HIDP implementation for Linux Bluetooth stack (BlueZ). + Copyright (C) 2003-2004 Marcel Holtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. 
+*/ + +#ifndef __HIDP_H +#define __HIDP_H + +#include +#include + +/* HIDP header masks */ +#define HIDP_HEADER_TRANS_MASK 0xf0 +#define HIDP_HEADER_PARAM_MASK 0x0f + +/* HIDP transaction types */ +#define HIDP_TRANS_HANDSHAKE 0x00 +#define HIDP_TRANS_HID_CONTROL 0x10 +#define HIDP_TRANS_GET_REPORT 0x40 +#define HIDP_TRANS_SET_REPORT 0x50 +#define HIDP_TRANS_GET_PROTOCOL 0x60 +#define HIDP_TRANS_SET_PROTOCOL 0x70 +#define HIDP_TRANS_GET_IDLE 0x80 +#define HIDP_TRANS_SET_IDLE 0x90 +#define HIDP_TRANS_DATA 0xa0 +#define HIDP_TRANS_DATC 0xb0 + +/* HIDP handshake results */ +#define HIDP_HSHK_SUCCESSFUL 0x00 +#define HIDP_HSHK_NOT_READY 0x01 +#define HIDP_HSHK_ERR_INVALID_REPORT_ID 0x02 +#define HIDP_HSHK_ERR_UNSUPPORTED_REQUEST 0x03 +#define HIDP_HSHK_ERR_INVALID_PARAMETER 0x04 +#define HIDP_HSHK_ERR_UNKNOWN 0x0e +#define HIDP_HSHK_ERR_FATAL 0x0f + +/* HIDP control operation parameters */ +#define HIDP_CTRL_NOP 0x00 +#define HIDP_CTRL_HARD_RESET 0x01 +#define HIDP_CTRL_SOFT_RESET 0x02 +#define HIDP_CTRL_SUSPEND 0x03 +#define HIDP_CTRL_EXIT_SUSPEND 0x04 +#define HIDP_CTRL_VIRTUAL_CABLE_UNPLUG 0x05 + +/* HIDP data transaction headers */ +#define HIDP_DATA_RTYPE_MASK 0x03 +#define HIDP_DATA_RSRVD_MASK 0x0c +#define HIDP_DATA_RTYPE_OTHER 0x00 +#define HIDP_DATA_RTYPE_INPUT 0x01 +#define HIDP_DATA_RTYPE_OUPUT 0x02 +#define HIDP_DATA_RTYPE_FEATURE 0x03 + +/* HIDP protocol header parameters */ +#define HIDP_PROTO_BOOT 0x00 +#define HIDP_PROTO_REPORT 0x01 + +/* HIDP ioctl defines */ +#define HIDPCONNADD _IOW('H', 200, int) +#define HIDPCONNDEL _IOW('H', 201, int) +#define HIDPGETCONNLIST _IOR('H', 210, int) +#define HIDPGETCONNINFO _IOR('H', 211, int) + +#define HIDP_VIRTUAL_CABLE_UNPLUG 0 +#define HIDP_BOOT_PROTOCOL_MODE 1 +#define HIDP_BLUETOOTH_VENDOR_ID 9 +#define HIDP_WAITING_FOR_RETURN 10 +#define HIDP_WAITING_FOR_SEND_ACK 11 + +struct hidp_connadd_req { + int ctrl_sock; /* Connected control socket */ + int intr_sock; /* Connected interrupt socket */ + __u16 parser; + __u16 rd_size; + __u8 __user *rd_data; + __u8 country; + __u8 subclass; + __u16 vendor; + __u16 product; + __u16 version; + __u32 flags; + __u32 idle_to; + char name[128]; +}; + +struct hidp_conndel_req { + bdaddr_t bdaddr; + __u32 flags; +}; + +struct hidp_conninfo { + bdaddr_t bdaddr; + __u32 flags; + __u16 state; + __u16 vendor; + __u16 product; + __u16 version; + char name[128]; +}; + +struct hidp_connlist_req { + __u32 cnum; + struct hidp_conninfo __user *ci; +}; + +int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock); +int hidp_del_connection(struct hidp_conndel_req *req); +int hidp_get_connlist(struct hidp_connlist_req *req); +int hidp_get_conninfo(struct hidp_conninfo *ci); + +/* HIDP session defines */ +struct hidp_session { + struct list_head list; + + struct hci_conn *conn; + + struct socket *ctrl_sock; + struct socket *intr_sock; + + bdaddr_t bdaddr; + + unsigned long state; + unsigned long flags; + unsigned long idle_to; + + uint ctrl_mtu; + uint intr_mtu; + + atomic_t terminate; + struct task_struct *task; + + unsigned char keys[8]; + unsigned char leds; + + struct input_dev *input; + + struct hid_device *hid; + + struct timer_list timer; + + struct sk_buff_head ctrl_transmit; + struct sk_buff_head intr_transmit; + + /* Used in hidp_get_raw_report() */ + int waiting_report_type; /* HIDP_DATA_RTYPE_* */ + int waiting_report_number; /* -1 for not numbered */ + struct mutex report_mutex; + struct sk_buff *report_return; + wait_queue_head_t report_queue; + + /* Used in 
hidp_output_raw_report() */ + int output_report_success; /* boolean */ + + /* Report descriptor */ + __u8 *rd_data; + uint rd_size; + + wait_queue_head_t startup_queue; + int waiting_for_startup; +}; + +static inline void hidp_schedule(struct hidp_session *session) +{ + struct sock *ctrl_sk = session->ctrl_sock->sk; + struct sock *intr_sk = session->intr_sock->sk; + + wake_up_interruptible(sk_sleep(ctrl_sk)); + wake_up_interruptible(sk_sleep(intr_sk)); +} + +/* HIDP init defines */ +extern int __init hidp_init_sockets(void); +extern void __exit hidp_cleanup_sockets(void); + +#endif /* __HIDP_H */ diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c new file mode 100644 index 00000000..178ac7f1 --- /dev/null +++ b/net/bluetooth/hidp/sock.c @@ -0,0 +1,305 @@ +/* + HIDP implementation for Linux Bluetooth stack (BlueZ). + Copyright (C) 2003-2004 Marcel Holtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. 
+*/ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hidp.h" + +static int hidp_sock_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + BT_DBG("sock %p sk %p", sock, sk); + + if (!sk) + return 0; + + sock_orphan(sk); + sock_put(sk); + + return 0; +} + +static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *) arg; + struct hidp_connadd_req ca; + struct hidp_conndel_req cd; + struct hidp_connlist_req cl; + struct hidp_conninfo ci; + struct socket *csock; + struct socket *isock; + int err; + + BT_DBG("cmd %x arg %lx", cmd, arg); + + switch (cmd) { + case HIDPCONNADD: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + + if (copy_from_user(&ca, argp, sizeof(ca))) + return -EFAULT; + + csock = sockfd_lookup(ca.ctrl_sock, &err); + if (!csock) + return err; + + isock = sockfd_lookup(ca.intr_sock, &err); + if (!isock) { + sockfd_put(csock); + return err; + } + + if (csock->sk->sk_state != BT_CONNECTED || + isock->sk->sk_state != BT_CONNECTED) { + sockfd_put(csock); + sockfd_put(isock); + return -EBADFD; + } + + err = hidp_add_connection(&ca, csock, isock); + if (!err) { + if (copy_to_user(argp, &ca, sizeof(ca))) + err = -EFAULT; + } else { + sockfd_put(csock); + sockfd_put(isock); + } + + return err; + + case HIDPCONNDEL: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + + if (copy_from_user(&cd, argp, sizeof(cd))) + return -EFAULT; + + return hidp_del_connection(&cd); + + case HIDPGETCONNLIST: + if (copy_from_user(&cl, argp, sizeof(cl))) + return -EFAULT; + + if (cl.cnum <= 0) + return -EINVAL; + + err = hidp_get_connlist(&cl); + if (!err && copy_to_user(argp, &cl, sizeof(cl))) + return -EFAULT; + + return err; + + case HIDPGETCONNINFO: + if (copy_from_user(&ci, argp, sizeof(ci))) + return -EFAULT; + + err = hidp_get_conninfo(&ci); + if (!err && copy_to_user(argp, &ci, sizeof(ci))) + return -EFAULT; + + return err; + } + + return -EINVAL; +} + +#ifdef CONFIG_COMPAT +struct compat_hidp_connadd_req { + int ctrl_sock; /* Connected control socket */ + int intr_sock; /* Connected interrupt socket */ + __u16 parser; + __u16 rd_size; + compat_uptr_t rd_data; + __u8 country; + __u8 subclass; + __u16 vendor; + __u16 product; + __u16 version; + __u32 flags; + __u32 idle_to; + char name[128]; +}; + +static int hidp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + if (cmd == HIDPGETCONNLIST) { + struct hidp_connlist_req cl; + uint32_t uci; + int err; + + if (get_user(cl.cnum, (uint32_t __user *) arg) || + get_user(uci, (u32 __user *) (arg + 4))) + return -EFAULT; + + cl.ci = compat_ptr(uci); + + if (cl.cnum <= 0) + return -EINVAL; + + err = hidp_get_connlist(&cl); + + if (!err && put_user(cl.cnum, (uint32_t __user *) arg)) + err = -EFAULT; + + return err; + } else if (cmd == HIDPCONNADD) { + struct compat_hidp_connadd_req ca; + struct hidp_connadd_req __user *uca; + + uca = compat_alloc_user_space(sizeof(*uca)); + + if (copy_from_user(&ca, (void __user *) arg, sizeof(ca))) + return -EFAULT; + + if (put_user(ca.ctrl_sock, &uca->ctrl_sock) || + put_user(ca.intr_sock, &uca->intr_sock) || + put_user(ca.parser, &uca->parser) || + put_user(ca.rd_size, &uca->rd_size) || + put_user(compat_ptr(ca.rd_data), &uca->rd_data) || + put_user(ca.country, &uca->country) || + put_user(ca.subclass, &uca->subclass) || + put_user(ca.vendor, &uca->vendor) || + put_user(ca.product, &uca->product) || + 
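The HIDPCONNADD path handled above is driven from userspace by a HID daemon that first establishes the two L2CAP connections itself and then hands the connected sockets to this module. A minimal userspace sketch, not part of this patch: the PSM values 0x11 (HID control) and 0x13 (HID interrupt) come from the Bluetooth HID specification, the L2CAP connect() calls and error handling are omitted, and the HIDP ioctl and struct definitions from hidp.h above are assumed to be available to userspace.

	#include <string.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <bluetooth/bluetooth.h>	/* AF_BLUETOOTH, BTPROTO_HIDP */

	static int hidp_connadd(int ctrl_fd, int intr_fd)
	{
		struct hidp_connadd_req req;	/* layout as defined in hidp.h above */
		int hidp_fd;

		memset(&req, 0, sizeof(req));
		req.ctrl_sock = ctrl_fd;	/* connected L2CAP socket, PSM 0x11 */
		req.intr_sock = intr_fd;	/* connected L2CAP socket, PSM 0x13 */
		req.idle_to = 30 * 60;		/* idle timeout in seconds, see hidp_set_timer() */

		/* CAP_NET_ADMIN is required, see hidp_sock_ioctl() above */
		hidp_fd = socket(AF_BLUETOOTH, SOCK_RAW, BTPROTO_HIDP);
		if (hidp_fd < 0)
			return -1;

		return ioctl(hidp_fd, HIDPCONNADD, &req);
	}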
put_user(ca.version, &uca->version) || + put_user(ca.flags, &uca->flags) || + put_user(ca.idle_to, &uca->idle_to) || + copy_to_user(&uca->name[0], &ca.name[0], 128)) + return -EFAULT; + + arg = (unsigned long) uca; + + /* Fall through. We don't actually write back any _changes_ + to the structure anyway, so there's no need to copy back + into the original compat version */ + } + + return hidp_sock_ioctl(sock, cmd, arg); +} +#endif + +static const struct proto_ops hidp_sock_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .release = hidp_sock_release, + .ioctl = hidp_sock_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = hidp_sock_compat_ioctl, +#endif + .bind = sock_no_bind, + .getname = sock_no_getname, + .sendmsg = sock_no_sendmsg, + .recvmsg = sock_no_recvmsg, + .poll = sock_no_poll, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .mmap = sock_no_mmap +}; + +static struct proto hidp_proto = { + .name = "HIDP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct bt_sock) +}; + +static int hidp_sock_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + + BT_DBG("sock %p", sock); + + if (sock->type != SOCK_RAW) + return -ESOCKTNOSUPPORT; + + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hidp_proto); + if (!sk) + return -ENOMEM; + + sock_init_data(sock, sk); + + sock->ops = &hidp_sock_ops; + + sock->state = SS_UNCONNECTED; + + sock_reset_flag(sk, SOCK_ZAPPED); + + sk->sk_protocol = protocol; + sk->sk_state = BT_OPEN; + + return 0; +} + +static const struct net_proto_family hidp_sock_family_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .create = hidp_sock_create +}; + +int __init hidp_init_sockets(void) +{ + int err; + + err = proto_register(&hidp_proto, 0); + if (err < 0) + return err; + + err = bt_sock_register(BTPROTO_HIDP, &hidp_sock_family_ops); + if (err < 0) + goto error; + + return 0; + +error: + BT_ERR("Can't register HIDP socket"); + proto_unregister(&hidp_proto); + return err; +} + +void __exit hidp_cleanup_sockets(void) +{ + if (bt_sock_unregister(BTPROTO_HIDP) < 0) + BT_ERR("Can't unregister HIDP socket"); + + proto_unregister(&hidp_proto); +} diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c new file mode 100644 index 00000000..5a0ce738 --- /dev/null +++ b/net/bluetooth/l2cap_core.c @@ -0,0 +1,4378 @@ +/* + BlueZ - Bluetooth protocol stack for Linux + Copyright (C) 2000-2001 Qualcomm Incorporated + Copyright (C) 2009-2010 Gustavo F. Padovan + Copyright (C) 2010 Google Inc. + + Written 2000,2001 by Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +/* Bluetooth L2CAP core. */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +int disable_ertm; + +static u32 l2cap_feat_mask = L2CAP_FEAT_FIXED_CHAN; +static u8 l2cap_fixed_chan[8] = { 0x02, }; + +static LIST_HEAD(chan_list); +static DEFINE_RWLOCK(chan_list_lock); + +static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn, + u8 code, u8 ident, u16 dlen, void *data); +static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, + void *data); +static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data); +static void l2cap_send_disconn_req(struct l2cap_conn *conn, + struct l2cap_chan *chan, int err); + +static int l2cap_ertm_data_rcv(struct sock *sk, struct sk_buff *skb); + +/* ---- L2CAP channels ---- */ + +static inline void chan_hold(struct l2cap_chan *c) +{ + atomic_inc(&c->refcnt); +} + +static inline void chan_put(struct l2cap_chan *c) +{ + if (atomic_dec_and_test(&c->refcnt)) + kfree(c); +} + +static struct l2cap_chan *__l2cap_get_chan_by_dcid(struct l2cap_conn *conn, u16 cid) +{ + struct l2cap_chan *c; + + list_for_each_entry(c, &conn->chan_l, list) { + if (c->dcid == cid) + return c; + } + return NULL; + +} + +static struct l2cap_chan *__l2cap_get_chan_by_scid(struct l2cap_conn *conn, u16 cid) +{ + struct l2cap_chan *c; + + list_for_each_entry(c, &conn->chan_l, list) { + if (c->scid == cid) + return c; + } + return NULL; +} + +/* Find channel with given SCID. 
+ * Returns locked socket */ +static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn, u16 cid) +{ + struct l2cap_chan *c; + + read_lock(&conn->chan_lock); + c = __l2cap_get_chan_by_scid(conn, cid); + if (c) + bh_lock_sock(c->sk); + read_unlock(&conn->chan_lock); + return c; +} + +static struct l2cap_chan *__l2cap_get_chan_by_ident(struct l2cap_conn *conn, u8 ident) +{ + struct l2cap_chan *c; + + list_for_each_entry(c, &conn->chan_l, list) { + if (c->ident == ident) + return c; + } + return NULL; +} + +static inline struct l2cap_chan *l2cap_get_chan_by_ident(struct l2cap_conn *conn, u8 ident) +{ + struct l2cap_chan *c; + + read_lock(&conn->chan_lock); + c = __l2cap_get_chan_by_ident(conn, ident); + if (c) + bh_lock_sock(c->sk); + read_unlock(&conn->chan_lock); + return c; +} + +static struct l2cap_chan *__l2cap_global_chan_by_addr(__le16 psm, bdaddr_t *src) +{ + struct l2cap_chan *c; + + list_for_each_entry(c, &chan_list, global_l) { + if (c->sport == psm && !bacmp(&bt_sk(c->sk)->src, src)) + goto found; + } + + c = NULL; +found: + return c; +} + +int l2cap_add_psm(struct l2cap_chan *chan, bdaddr_t *src, __le16 psm) +{ + int err; + + write_lock_bh(&chan_list_lock); + + if (psm && __l2cap_global_chan_by_addr(psm, src)) { + err = -EADDRINUSE; + goto done; + } + + if (psm) { + chan->psm = psm; + chan->sport = psm; + err = 0; + } else { + u16 p; + + err = -EINVAL; + for (p = 0x1001; p < 0x1100; p += 2) + if (!__l2cap_global_chan_by_addr(cpu_to_le16(p), src)) { + chan->psm = cpu_to_le16(p); + chan->sport = cpu_to_le16(p); + err = 0; + break; + } + } + +done: + write_unlock_bh(&chan_list_lock); + return err; +} + +int l2cap_add_scid(struct l2cap_chan *chan, __u16 scid) +{ + write_lock_bh(&chan_list_lock); + + chan->scid = scid; + + write_unlock_bh(&chan_list_lock); + + return 0; +} + +static u16 l2cap_alloc_cid(struct l2cap_conn *conn) +{ + u16 cid = L2CAP_CID_DYN_START; + + for (; cid < L2CAP_CID_DYN_END; cid++) { + if (!__l2cap_get_chan_by_scid(conn, cid)) + return cid; + } + + return 0; +} + +static void l2cap_set_timer(struct l2cap_chan *chan, struct timer_list *timer, long timeout) +{ + BT_DBG("chan %p state %d timeout %ld", chan->sk, chan->state, timeout); + + if (!mod_timer(timer, jiffies + msecs_to_jiffies(timeout))) + chan_hold(chan); +} + +static void l2cap_clear_timer(struct l2cap_chan *chan, struct timer_list *timer) +{ + BT_DBG("chan %p state %d", chan, chan->state); + + if (timer_pending(timer) && del_timer(timer)) + chan_put(chan); +} + +static void l2cap_state_change(struct l2cap_chan *chan, int state) +{ + chan->state = state; + chan->ops->state_change(chan->data, state); +} + +static void l2cap_chan_timeout(unsigned long arg) +{ + struct l2cap_chan *chan = (struct l2cap_chan *) arg; + struct sock *sk = chan->sk; + int reason; + + BT_DBG("chan %p state %d", chan, chan->state); + + bh_lock_sock(sk); + + if (sock_owned_by_user(sk)) { + /* sk is owned by user. 
Try again later */ + __set_chan_timer(chan, HZ / 5); + bh_unlock_sock(sk); + chan_put(chan); + return; + } + + if (chan->state == BT_CONNECTED || chan->state == BT_CONFIG) + reason = ECONNREFUSED; + else if (chan->state == BT_CONNECT && + chan->sec_level != BT_SECURITY_SDP) + reason = ECONNREFUSED; + else + reason = ETIMEDOUT; + + l2cap_chan_close(chan, reason); + + bh_unlock_sock(sk); + + chan->ops->close(chan->data); + chan_put(chan); +} + +struct l2cap_chan *l2cap_chan_create(struct sock *sk) +{ + struct l2cap_chan *chan; + + chan = kzalloc(sizeof(*chan), GFP_ATOMIC); + if (!chan) + return NULL; + + chan->sk = sk; + + write_lock_bh(&chan_list_lock); + list_add(&chan->global_l, &chan_list); + write_unlock_bh(&chan_list_lock); + + setup_timer(&chan->chan_timer, l2cap_chan_timeout, (unsigned long) chan); + + chan->state = BT_OPEN; + + atomic_set(&chan->refcnt, 1); + + return chan; +} + +void l2cap_chan_destroy(struct l2cap_chan *chan) +{ + write_lock_bh(&chan_list_lock); + list_del(&chan->global_l); + write_unlock_bh(&chan_list_lock); + + chan_put(chan); +} + +static void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) +{ + BT_DBG("conn %p, psm 0x%2.2x, dcid 0x%4.4x", conn, + chan->psm, chan->dcid); + + conn->disc_reason = 0x13; + + chan->conn = conn; + + if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED) { + if (conn->hcon->type == LE_LINK) { + /* LE connection */ + chan->omtu = L2CAP_LE_DEFAULT_MTU; + chan->scid = L2CAP_CID_LE_DATA; + chan->dcid = L2CAP_CID_LE_DATA; + } else { + /* Alloc CID for connection-oriented socket */ + chan->scid = l2cap_alloc_cid(conn); + chan->omtu = L2CAP_DEFAULT_MTU; + } + } else if (chan->chan_type == L2CAP_CHAN_CONN_LESS) { + /* Connectionless socket */ + chan->scid = L2CAP_CID_CONN_LESS; + chan->dcid = L2CAP_CID_CONN_LESS; + chan->omtu = L2CAP_DEFAULT_MTU; + } else { + /* Raw socket can send/recv signalling messages only */ + chan->scid = L2CAP_CID_SIGNALING; + chan->dcid = L2CAP_CID_SIGNALING; + chan->omtu = L2CAP_DEFAULT_MTU; + } + + chan_hold(chan); + + list_add(&chan->list, &conn->chan_l); +} + +/* Delete channel. + * Must be called on the locked socket. 
*/ +static void l2cap_chan_del(struct l2cap_chan *chan, int err) +{ + struct sock *sk = chan->sk; + struct l2cap_conn *conn = chan->conn; + struct sock *parent = bt_sk(sk)->parent; + + __clear_chan_timer(chan); + + BT_DBG("chan %p, conn %p, err %d", chan, conn, err); + + if (conn) { + /* Delete from channel list */ + write_lock_bh(&conn->chan_lock); + list_del(&chan->list); + write_unlock_bh(&conn->chan_lock); + chan_put(chan); + + chan->conn = NULL; + hci_conn_put(conn->hcon); + } + + l2cap_state_change(chan, BT_CLOSED); + sock_set_flag(sk, SOCK_ZAPPED); + + if (err) + sk->sk_err = err; + + if (parent) { + bt_accept_unlink(sk); + parent->sk_data_ready(parent, 0); + } else + sk->sk_state_change(sk); + + if (!(test_bit(CONF_OUTPUT_DONE, &chan->conf_state) && + test_bit(CONF_INPUT_DONE, &chan->conf_state))) + return; + + skb_queue_purge(&chan->tx_q); + + if (chan->mode == L2CAP_MODE_ERTM) { + struct srej_list *l, *tmp; + + __clear_retrans_timer(chan); + __clear_monitor_timer(chan); + __clear_ack_timer(chan); + + skb_queue_purge(&chan->srej_q); + + list_for_each_entry_safe(l, tmp, &chan->srej_l, list) { + list_del(&l->list); + kfree(l); + } + } +} + +static void l2cap_chan_cleanup_listen(struct sock *parent) +{ + struct sock *sk; + + BT_DBG("parent %p", parent); + + /* Close not yet accepted channels */ + while ((sk = bt_accept_dequeue(parent, NULL))) { + struct l2cap_chan *chan = l2cap_pi(sk)->chan; + __clear_chan_timer(chan); + lock_sock(sk); + l2cap_chan_close(chan, ECONNRESET); + release_sock(sk); + chan->ops->close(chan->data); + } +} + +void l2cap_chan_close(struct l2cap_chan *chan, int reason) +{ + struct l2cap_conn *conn = chan->conn; + struct sock *sk = chan->sk; + + BT_DBG("chan %p state %d socket %p", chan, chan->state, sk->sk_socket); + + switch (chan->state) { + case BT_LISTEN: + l2cap_chan_cleanup_listen(sk); + + l2cap_state_change(chan, BT_CLOSED); + sock_set_flag(sk, SOCK_ZAPPED); + break; + + case BT_CONNECTED: + case BT_CONFIG: + if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED && + conn->hcon->type == ACL_LINK) { + __clear_chan_timer(chan); + __set_chan_timer(chan, sk->sk_sndtimeo); + l2cap_send_disconn_req(conn, chan, reason); + } else + l2cap_chan_del(chan, reason); + break; + + case BT_CONNECT2: + if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED && + conn->hcon->type == ACL_LINK) { + struct l2cap_conn_rsp rsp; + __u16 result; + + if (bt_sk(sk)->defer_setup) + result = L2CAP_CR_SEC_BLOCK; + else + result = L2CAP_CR_BAD_PSM; + l2cap_state_change(chan, BT_DISCONN); + + rsp.scid = cpu_to_le16(chan->dcid); + rsp.dcid = cpu_to_le16(chan->scid); + rsp.result = cpu_to_le16(result); + rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); + l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_RSP, + sizeof(rsp), &rsp); + } + + l2cap_chan_del(chan, reason); + break; + + case BT_CONNECT: + case BT_DISCONN: + l2cap_chan_del(chan, reason); + break; + + default: + sock_set_flag(sk, SOCK_ZAPPED); + break; + } +} + +static inline u8 l2cap_get_auth_type(struct l2cap_chan *chan) +{ + if (chan->chan_type == L2CAP_CHAN_RAW) { + switch (chan->sec_level) { + case BT_SECURITY_HIGH: + return HCI_AT_DEDICATED_BONDING_MITM; + case BT_SECURITY_MEDIUM: + return HCI_AT_DEDICATED_BONDING; + default: + return HCI_AT_NO_BONDING; + } + } else if (chan->psm == cpu_to_le16(0x0001)) { + if (chan->sec_level == BT_SECURITY_LOW) + chan->sec_level = BT_SECURITY_SDP; + + if (chan->sec_level == BT_SECURITY_HIGH) + return HCI_AT_NO_BONDING_MITM; + else + return HCI_AT_NO_BONDING; + } else { + switch (chan->sec_level) { + case 
BT_SECURITY_HIGH: + return HCI_AT_GENERAL_BONDING_MITM; + case BT_SECURITY_MEDIUM: + return HCI_AT_GENERAL_BONDING; + default: + return HCI_AT_NO_BONDING; + } + } +} + +/* Service level security */ +static inline int l2cap_check_security(struct l2cap_chan *chan) +{ + struct l2cap_conn *conn = chan->conn; + __u8 auth_type; + + auth_type = l2cap_get_auth_type(chan); + + return hci_conn_security(conn->hcon, chan->sec_level, auth_type); +} + +static u8 l2cap_get_ident(struct l2cap_conn *conn) +{ + u8 id; + + /* Get next available identificator. + * 1 - 128 are used by kernel. + * 129 - 199 are reserved. + * 200 - 254 are used by utilities like l2ping, etc. + */ + + spin_lock_bh(&conn->lock); + + if (++conn->tx_ident > 128) + conn->tx_ident = 1; + + id = conn->tx_ident; + + spin_unlock_bh(&conn->lock); + + return id; +} + +static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, void *data) +{ + struct sk_buff *skb = l2cap_build_cmd(conn, code, ident, len, data); + u8 flags; + + BT_DBG("code 0x%2.2x", code); + + if (!skb) + return; + + if (lmp_no_flush_capable(conn->hcon->hdev)) + flags = ACL_START_NO_FLUSH; + else + flags = ACL_START; + + bt_cb(skb)->force_active = BT_POWER_FORCE_ACTIVE_ON; + + hci_send_acl(conn->hcon, skb, flags); +} + +static inline void l2cap_send_sframe(struct l2cap_chan *chan, u16 control) +{ + struct sk_buff *skb; + struct l2cap_hdr *lh; + struct l2cap_conn *conn = chan->conn; + int count, hlen = L2CAP_HDR_SIZE + 2; + u8 flags; + + if (chan->state != BT_CONNECTED) + return; + + if (chan->fcs == L2CAP_FCS_CRC16) + hlen += 2; + + BT_DBG("chan %p, control 0x%2.2x", chan, control); + + count = min_t(unsigned int, conn->mtu, hlen); + control |= L2CAP_CTRL_FRAME_TYPE; + + if (test_and_clear_bit(CONN_SEND_FBIT, &chan->conn_state)) + control |= L2CAP_CTRL_FINAL; + + if (test_and_clear_bit(CONN_SEND_PBIT, &chan->conn_state)) + control |= L2CAP_CTRL_POLL; + + skb = bt_skb_alloc(count, GFP_ATOMIC); + if (!skb) + return; + + lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); + lh->len = cpu_to_le16(hlen - L2CAP_HDR_SIZE); + lh->cid = cpu_to_le16(chan->dcid); + put_unaligned_le16(control, skb_put(skb, 2)); + + if (chan->fcs == L2CAP_FCS_CRC16) { + u16 fcs = crc16(0, (u8 *)lh, count - 2); + put_unaligned_le16(fcs, skb_put(skb, 2)); + } + + if (lmp_no_flush_capable(conn->hcon->hdev)) + flags = ACL_START_NO_FLUSH; + else + flags = ACL_START; + + bt_cb(skb)->force_active = chan->force_active; + + hci_send_acl(chan->conn->hcon, skb, flags); +} + +static inline void l2cap_send_rr_or_rnr(struct l2cap_chan *chan, u16 control) +{ + if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) { + control |= L2CAP_SUPER_RCV_NOT_READY; + set_bit(CONN_RNR_SENT, &chan->conn_state); + } else + control |= L2CAP_SUPER_RCV_READY; + + control |= chan->buffer_seq << L2CAP_CTRL_REQSEQ_SHIFT; + + l2cap_send_sframe(chan, control); +} + +static inline int __l2cap_no_conn_pending(struct l2cap_chan *chan) +{ + return !test_bit(CONF_CONNECT_PEND, &chan->conf_state); +} + +static void l2cap_do_start(struct l2cap_chan *chan) +{ + struct l2cap_conn *conn = chan->conn; + + if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) { + if (!(conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE)) + return; + + if (l2cap_check_security(chan) && + __l2cap_no_conn_pending(chan)) { + struct l2cap_conn_req req; + req.scid = cpu_to_le16(chan->scid); + req.psm = chan->psm; + + chan->ident = l2cap_get_ident(conn); + set_bit(CONF_CONNECT_PEND, &chan->conf_state); + + l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_REQ, 
+ sizeof(req), &req); + } + } else { + struct l2cap_info_req req; + req.type = cpu_to_le16(L2CAP_IT_FEAT_MASK); + + conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_SENT; + conn->info_ident = l2cap_get_ident(conn); + + mod_timer(&conn->info_timer, jiffies + + msecs_to_jiffies(L2CAP_INFO_TIMEOUT)); + + l2cap_send_cmd(conn, conn->info_ident, + L2CAP_INFO_REQ, sizeof(req), &req); + } +} + +static inline int l2cap_mode_supported(__u8 mode, __u32 feat_mask) +{ + u32 local_feat_mask = l2cap_feat_mask; + if (!disable_ertm) + local_feat_mask |= L2CAP_FEAT_ERTM | L2CAP_FEAT_STREAMING; + + switch (mode) { + case L2CAP_MODE_ERTM: + return L2CAP_FEAT_ERTM & feat_mask & local_feat_mask; + case L2CAP_MODE_STREAMING: + return L2CAP_FEAT_STREAMING & feat_mask & local_feat_mask; + default: + return 0x00; + } +} + +static void l2cap_send_disconn_req(struct l2cap_conn *conn, struct l2cap_chan *chan, int err) +{ + struct sock *sk; + struct l2cap_disconn_req req; + + if (!conn) + return; + + sk = chan->sk; + + if (chan->mode == L2CAP_MODE_ERTM) { + __clear_retrans_timer(chan); + __clear_monitor_timer(chan); + __clear_ack_timer(chan); + } + + req.dcid = cpu_to_le16(chan->dcid); + req.scid = cpu_to_le16(chan->scid); + l2cap_send_cmd(conn, l2cap_get_ident(conn), + L2CAP_DISCONN_REQ, sizeof(req), &req); + + l2cap_state_change(chan, BT_DISCONN); + sk->sk_err = err; +} + +/* ---- L2CAP connections ---- */ +static void l2cap_conn_start(struct l2cap_conn *conn) +{ + struct l2cap_chan *chan, *tmp; + + BT_DBG("conn %p", conn); + + read_lock(&conn->chan_lock); + + list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) { + struct sock *sk = chan->sk; + + bh_lock_sock(sk); + + if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { + bh_unlock_sock(sk); + continue; + } + + if (chan->state == BT_CONNECT) { + struct l2cap_conn_req req; + + if (!l2cap_check_security(chan) || + !__l2cap_no_conn_pending(chan)) { + bh_unlock_sock(sk); + continue; + } + + if (!l2cap_mode_supported(chan->mode, conn->feat_mask) + && test_bit(CONF_STATE2_DEVICE, + &chan->conf_state)) { + /* l2cap_chan_close() calls list_del(chan) + * so release the lock */ + read_unlock(&conn->chan_lock); + l2cap_chan_close(chan, ECONNRESET); + read_lock(&conn->chan_lock); + bh_unlock_sock(sk); + continue; + } + + req.scid = cpu_to_le16(chan->scid); + req.psm = chan->psm; + + chan->ident = l2cap_get_ident(conn); + set_bit(CONF_CONNECT_PEND, &chan->conf_state); + + l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_REQ, + sizeof(req), &req); + + } else if (chan->state == BT_CONNECT2) { + struct l2cap_conn_rsp rsp; + char buf[128]; + rsp.scid = cpu_to_le16(chan->dcid); + rsp.dcid = cpu_to_le16(chan->scid); + + if (l2cap_check_security(chan)) { + if (bt_sk(sk)->defer_setup) { + struct sock *parent = bt_sk(sk)->parent; + rsp.result = cpu_to_le16(L2CAP_CR_PEND); + rsp.status = cpu_to_le16(L2CAP_CS_AUTHOR_PEND); + if (parent) + parent->sk_data_ready(parent, 0); + + } else { + l2cap_state_change(chan, BT_CONFIG); + rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS); + rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); + } + } else { + rsp.result = cpu_to_le16(L2CAP_CR_PEND); + rsp.status = cpu_to_le16(L2CAP_CS_AUTHEN_PEND); + } + + l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_RSP, + sizeof(rsp), &rsp); + + if (test_bit(CONF_REQ_SENT, &chan->conf_state) || + rsp.result != L2CAP_CR_SUCCESS) { + bh_unlock_sock(sk); + continue; + } + + set_bit(CONF_REQ_SENT, &chan->conf_state); + l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, + l2cap_build_conf_req(chan, buf), buf); + chan->num_conf_req++; + 
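Every request in this state machine goes out through l2cap_send_cmd(), which delegates to l2cap_build_cmd() further below: a basic L2CAP header on the signalling CID, a four-byte command header (code, ident, length) and the command data. As a worked illustration only (byte values taken from the Bluetooth core specification, not from this patch), a connection request for PSM 0x1001 with source CID 0x0040 occupies twelve bytes, all fields little-endian:

	/*  L2CAP hdr              cmd hdr                      l2cap_conn_req
	 *  len=0x0008 cid=0x0001  code=0x02 ident len=0x0004   psm=0x1001 scid=0x0040
	 *  08 00      01 00       02        xx    04 00        01 10      40 00
	 *  ("xx" is whatever l2cap_get_ident() handed out for this exchange)
	 */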
} + + bh_unlock_sock(sk); + } + + read_unlock(&conn->chan_lock); +} + +/* Find socket with cid and source bdaddr. + * Returns closest match, locked. + */ +static struct l2cap_chan *l2cap_global_chan_by_scid(int state, __le16 cid, bdaddr_t *src) +{ + struct l2cap_chan *c, *c1 = NULL; + + read_lock(&chan_list_lock); + + list_for_each_entry(c, &chan_list, global_l) { + struct sock *sk = c->sk; + + if (state && c->state != state) + continue; + + if (c->scid == cid) { + /* Exact match. */ + if (!bacmp(&bt_sk(sk)->src, src)) { + read_unlock(&chan_list_lock); + return c; + } + + /* Closest match */ + if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY)) + c1 = c; + } + } + + read_unlock(&chan_list_lock); + + return c1; +} + +static void l2cap_le_conn_ready(struct l2cap_conn *conn) +{ + struct sock *parent, *sk; + struct l2cap_chan *chan, *pchan; + + BT_DBG(""); + + /* Check if we have socket listening on cid */ + pchan = l2cap_global_chan_by_scid(BT_LISTEN, L2CAP_CID_LE_DATA, + conn->src); + if (!pchan) + return; + + parent = pchan->sk; + + bh_lock_sock(parent); + + /* Check for backlog size */ + if (sk_acceptq_is_full(parent)) { + BT_DBG("backlog full %d", parent->sk_ack_backlog); + goto clean; + } + + chan = pchan->ops->new_connection(pchan->data); + if (!chan) + goto clean; + + sk = chan->sk; + + write_lock_bh(&conn->chan_lock); + + hci_conn_hold(conn->hcon); + + bacpy(&bt_sk(sk)->src, conn->src); + bacpy(&bt_sk(sk)->dst, conn->dst); + + bt_accept_enqueue(parent, sk); + + __l2cap_chan_add(conn, chan); + + __set_chan_timer(chan, sk->sk_sndtimeo); + + l2cap_state_change(chan, BT_CONNECTED); + parent->sk_data_ready(parent, 0); + + write_unlock_bh(&conn->chan_lock); + +clean: + bh_unlock_sock(parent); +} + +static void l2cap_chan_ready(struct sock *sk) +{ + struct l2cap_chan *chan = l2cap_pi(sk)->chan; + struct sock *parent = bt_sk(sk)->parent; + + BT_DBG("sk %p, parent %p", sk, parent); + + chan->conf_state = 0; + __clear_chan_timer(chan); + + l2cap_state_change(chan, BT_CONNECTED); + sk->sk_state_change(sk); + + if (parent) + parent->sk_data_ready(parent, 0); +} + +static void l2cap_conn_ready(struct l2cap_conn *conn) +{ + struct l2cap_chan *chan; + + BT_DBG("conn %p", conn); + + if (!conn->hcon->out && conn->hcon->type == LE_LINK) + l2cap_le_conn_ready(conn); + + read_lock(&conn->chan_lock); + + list_for_each_entry(chan, &conn->chan_l, list) { + struct sock *sk = chan->sk; + + bh_lock_sock(sk); + + if (conn->hcon->type == LE_LINK) { + if (smp_conn_security(conn, chan->sec_level)) + l2cap_chan_ready(sk); + + } else if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { + __clear_chan_timer(chan); + l2cap_state_change(chan, BT_CONNECTED); + sk->sk_state_change(sk); + + } else if (chan->state == BT_CONNECT) + l2cap_do_start(chan); + + bh_unlock_sock(sk); + } + + read_unlock(&conn->chan_lock); +} + +/* Notify sockets that we cannot guaranty reliability anymore */ +static void l2cap_conn_unreliable(struct l2cap_conn *conn, int err) +{ + struct l2cap_chan *chan; + + BT_DBG("conn %p", conn); + + read_lock(&conn->chan_lock); + + list_for_each_entry(chan, &conn->chan_l, list) { + struct sock *sk = chan->sk; + + if (chan->force_reliable) + sk->sk_err = err; + } + + read_unlock(&conn->chan_lock); +} + +static void l2cap_info_timeout(unsigned long arg) +{ + struct l2cap_conn *conn = (void *) arg; + + conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; + conn->info_ident = 0; + + l2cap_conn_start(conn); +} + +static void l2cap_conn_del(struct hci_conn *hcon, int err) +{ + struct l2cap_conn *conn = hcon->l2cap_data; + struct 
l2cap_chan *chan, *l; + struct sock *sk; + + if (!conn) + return; + + BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); + + kfree_skb(conn->rx_skb); + + /* Kill channels */ + list_for_each_entry_safe(chan, l, &conn->chan_l, list) { + sk = chan->sk; + bh_lock_sock(sk); + l2cap_chan_del(chan, err); + bh_unlock_sock(sk); + chan->ops->close(chan->data); + } + + if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) + del_timer_sync(&conn->info_timer); + + if (test_bit(HCI_CONN_ENCRYPT_PEND, &hcon->pend)) + del_timer(&conn->security_timer); + + hcon->l2cap_data = NULL; + kfree(conn); +} + +static void security_timeout(unsigned long arg) +{ + struct l2cap_conn *conn = (void *) arg; + + l2cap_conn_del(conn->hcon, ETIMEDOUT); +} + +static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon, u8 status) +{ + struct l2cap_conn *conn = hcon->l2cap_data; + + if (conn || status) + return conn; + + conn = kzalloc(sizeof(struct l2cap_conn), GFP_ATOMIC); + if (!conn) + return NULL; + + hcon->l2cap_data = conn; + conn->hcon = hcon; + + BT_DBG("hcon %p conn %p", hcon, conn); + + if (hcon->hdev->le_mtu && hcon->type == LE_LINK) + conn->mtu = hcon->hdev->le_mtu; + else + conn->mtu = hcon->hdev->acl_mtu; + + conn->src = &hcon->hdev->bdaddr; + conn->dst = &hcon->dst; + + conn->feat_mask = 0; + + spin_lock_init(&conn->lock); + rwlock_init(&conn->chan_lock); + + INIT_LIST_HEAD(&conn->chan_l); + + if (hcon->type == LE_LINK) + setup_timer(&conn->security_timer, security_timeout, + (unsigned long) conn); + else + setup_timer(&conn->info_timer, l2cap_info_timeout, + (unsigned long) conn); + + conn->disc_reason = 0x13; + + return conn; +} + +static inline void l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) +{ + write_lock_bh(&conn->chan_lock); + __l2cap_chan_add(conn, chan); + write_unlock_bh(&conn->chan_lock); +} + +/* ---- Socket interface ---- */ + +/* Find socket with psm and source bdaddr. + * Returns closest match. + */ +static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, bdaddr_t *src) +{ + struct l2cap_chan *c, *c1 = NULL; + + read_lock(&chan_list_lock); + + list_for_each_entry(c, &chan_list, global_l) { + struct sock *sk = c->sk; + + if (state && c->state != state) + continue; + + if (c->psm == psm) { + /* Exact match. 
*/ + if (!bacmp(&bt_sk(sk)->src, src)) { + read_unlock(&chan_list_lock); + return c; + } + + /* Closest match */ + if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY)) + c1 = c; + } + } + + read_unlock(&chan_list_lock); + + return c1; +} + +int l2cap_chan_connect(struct l2cap_chan *chan) +{ + struct sock *sk = chan->sk; + bdaddr_t *src = &bt_sk(sk)->src; + bdaddr_t *dst = &bt_sk(sk)->dst; + struct l2cap_conn *conn; + struct hci_conn *hcon; + struct hci_dev *hdev; + __u8 auth_type; + int err; + + BT_DBG("%s -> %s psm 0x%2.2x", batostr(src), batostr(dst), + chan->psm); + + hdev = hci_get_route(dst, src); + if (!hdev) + return -EHOSTUNREACH; + + hci_dev_lock_bh(hdev); + + auth_type = l2cap_get_auth_type(chan); + + if (chan->dcid == L2CAP_CID_LE_DATA) + hcon = hci_connect(hdev, LE_LINK, 0, dst, + chan->sec_level, auth_type); + else + hcon = hci_connect(hdev, ACL_LINK, 0, dst, + chan->sec_level, auth_type); + + if (IS_ERR(hcon)) { + err = PTR_ERR(hcon); + goto done; + } + + conn = l2cap_conn_add(hcon, 0); + if (!conn) { + hci_conn_put(hcon); + err = -ENOMEM; + goto done; + } + + /* Update source addr of the socket */ + bacpy(src, conn->src); + + l2cap_chan_add(conn, chan); + + l2cap_state_change(chan, BT_CONNECT); + __set_chan_timer(chan, sk->sk_sndtimeo); + + if (hcon->state == BT_CONNECTED) { + if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { + __clear_chan_timer(chan); + if (l2cap_check_security(chan)) + l2cap_state_change(chan, BT_CONNECTED); + } else + l2cap_do_start(chan); + } + + err = 0; + +done: + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + return err; +} + +int __l2cap_wait_ack(struct sock *sk) +{ + struct l2cap_chan *chan = l2cap_pi(sk)->chan; + DECLARE_WAITQUEUE(wait, current); + int err = 0; + int timeo = HZ/5; + + add_wait_queue(sk_sleep(sk), &wait); + set_current_state(TASK_INTERRUPTIBLE); + while (chan->unacked_frames > 0 && chan->conn) { + if (!timeo) + timeo = HZ/5; + + if (signal_pending(current)) { + err = sock_intr_errno(timeo); + break; + } + + release_sock(sk); + timeo = schedule_timeout(timeo); + lock_sock(sk); + set_current_state(TASK_INTERRUPTIBLE); + + err = sock_error(sk); + if (err) + break; + } + set_current_state(TASK_RUNNING); + remove_wait_queue(sk_sleep(sk), &wait); + return err; +} + +static void l2cap_monitor_timeout(unsigned long arg) +{ + struct l2cap_chan *chan = (void *) arg; + struct sock *sk = chan->sk; + + BT_DBG("chan %p", chan); + + bh_lock_sock(sk); + if (chan->retry_count >= chan->remote_max_tx) { + l2cap_send_disconn_req(chan->conn, chan, ECONNABORTED); + bh_unlock_sock(sk); + return; + } + + chan->retry_count++; + __set_monitor_timer(chan); + + l2cap_send_rr_or_rnr(chan, L2CAP_CTRL_POLL); + bh_unlock_sock(sk); +} + +static void l2cap_retrans_timeout(unsigned long arg) +{ + struct l2cap_chan *chan = (void *) arg; + struct sock *sk = chan->sk; + + BT_DBG("chan %p", chan); + + bh_lock_sock(sk); + chan->retry_count = 1; + __set_monitor_timer(chan); + + set_bit(CONN_WAIT_F, &chan->conn_state); + + l2cap_send_rr_or_rnr(chan, L2CAP_CTRL_POLL); + bh_unlock_sock(sk); +} + +static void l2cap_drop_acked_frames(struct l2cap_chan *chan) +{ + struct sk_buff *skb; + + while ((skb = skb_peek(&chan->tx_q)) && + chan->unacked_frames) { + if (bt_cb(skb)->tx_seq == chan->expected_ack_seq) + break; + + skb = skb_dequeue(&chan->tx_q); + kfree_skb(skb); + + chan->unacked_frames--; + } + + if (!chan->unacked_frames) + __clear_retrans_timer(chan); +} + +void l2cap_do_send(struct l2cap_chan *chan, struct sk_buff *skb) +{ + struct hci_conn *hcon = chan->conn->hcon; + u16 flags; 
+ + BT_DBG("chan %p, skb %p len %d", chan, skb, skb->len); + + if (!chan->flushable && lmp_no_flush_capable(hcon->hdev)) + flags = ACL_START_NO_FLUSH; + else + flags = ACL_START; + + bt_cb(skb)->force_active = chan->force_active; + hci_send_acl(hcon, skb, flags); +} + +void l2cap_streaming_send(struct l2cap_chan *chan) +{ + struct sk_buff *skb; + u16 control, fcs; + + while ((skb = skb_dequeue(&chan->tx_q))) { + control = get_unaligned_le16(skb->data + L2CAP_HDR_SIZE); + control |= chan->next_tx_seq << L2CAP_CTRL_TXSEQ_SHIFT; + put_unaligned_le16(control, skb->data + L2CAP_HDR_SIZE); + + if (chan->fcs == L2CAP_FCS_CRC16) { + fcs = crc16(0, (u8 *)skb->data, skb->len - 2); + put_unaligned_le16(fcs, skb->data + skb->len - 2); + } + + l2cap_do_send(chan, skb); + + chan->next_tx_seq = (chan->next_tx_seq + 1) % 64; + } +} + +static void l2cap_retransmit_one_frame(struct l2cap_chan *chan, u8 tx_seq) +{ + struct sk_buff *skb, *tx_skb; + u16 control, fcs; + + skb = skb_peek(&chan->tx_q); + if (!skb) + return; + + do { + if (bt_cb(skb)->tx_seq == tx_seq) + break; + + if (skb_queue_is_last(&chan->tx_q, skb)) + return; + + } while ((skb = skb_queue_next(&chan->tx_q, skb))); + + if (chan->remote_max_tx && + bt_cb(skb)->retries == chan->remote_max_tx) { + l2cap_send_disconn_req(chan->conn, chan, ECONNABORTED); + return; + } + + tx_skb = skb_clone(skb, GFP_ATOMIC); + bt_cb(skb)->retries++; + control = get_unaligned_le16(tx_skb->data + L2CAP_HDR_SIZE); + control &= L2CAP_CTRL_SAR; + + if (test_and_clear_bit(CONN_SEND_FBIT, &chan->conn_state)) + control |= L2CAP_CTRL_FINAL; + + control |= (chan->buffer_seq << L2CAP_CTRL_REQSEQ_SHIFT) + | (tx_seq << L2CAP_CTRL_TXSEQ_SHIFT); + + put_unaligned_le16(control, tx_skb->data + L2CAP_HDR_SIZE); + + if (chan->fcs == L2CAP_FCS_CRC16) { + fcs = crc16(0, (u8 *)tx_skb->data, tx_skb->len - 2); + put_unaligned_le16(fcs, tx_skb->data + tx_skb->len - 2); + } + + l2cap_do_send(chan, tx_skb); +} + +int l2cap_ertm_send(struct l2cap_chan *chan) +{ + struct sk_buff *skb, *tx_skb; + u16 control, fcs; + int nsent = 0; + + if (chan->state != BT_CONNECTED) + return -ENOTCONN; + + while ((skb = chan->tx_send_head) && (!l2cap_tx_window_full(chan))) { + + if (chan->remote_max_tx && + bt_cb(skb)->retries == chan->remote_max_tx) { + l2cap_send_disconn_req(chan->conn, chan, ECONNABORTED); + break; + } + + tx_skb = skb_clone(skb, GFP_ATOMIC); + + bt_cb(skb)->retries++; + + control = get_unaligned_le16(tx_skb->data + L2CAP_HDR_SIZE); + control &= L2CAP_CTRL_SAR; + + if (test_and_clear_bit(CONN_SEND_FBIT, &chan->conn_state)) + control |= L2CAP_CTRL_FINAL; + + control |= (chan->buffer_seq << L2CAP_CTRL_REQSEQ_SHIFT) + | (chan->next_tx_seq << L2CAP_CTRL_TXSEQ_SHIFT); + put_unaligned_le16(control, tx_skb->data + L2CAP_HDR_SIZE); + + + if (chan->fcs == L2CAP_FCS_CRC16) { + fcs = crc16(0, (u8 *)skb->data, tx_skb->len - 2); + put_unaligned_le16(fcs, skb->data + tx_skb->len - 2); + } + + l2cap_do_send(chan, tx_skb); + + __set_retrans_timer(chan); + + bt_cb(skb)->tx_seq = chan->next_tx_seq; + chan->next_tx_seq = (chan->next_tx_seq + 1) % 64; + + if (bt_cb(skb)->retries == 1) + chan->unacked_frames++; + + chan->frames_sent++; + + if (skb_queue_is_last(&chan->tx_q, skb)) + chan->tx_send_head = NULL; + else + chan->tx_send_head = skb_queue_next(&chan->tx_q, skb); + + nsent++; + } + + return nsent; +} + +static int l2cap_retransmit_frames(struct l2cap_chan *chan) +{ + int ret; + + if (!skb_queue_empty(&chan->tx_q)) + chan->tx_send_head = chan->tx_q.next; + + chan->next_tx_seq = chan->expected_ack_seq; + 
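The rewind above, together with the "% 64" arithmetic in l2cap_streaming_send() and l2cap_ertm_send(), keeps ERTM TxSeq values in a 6-bit sequence space. A small worked sketch: with expected_ack_seq == 60 and five unacknowledged I-frames queued, the retransmission pass resends TxSeq 60, 61, 62, 63 and then wraps to 0.

	u8 seq = 60;
	unsigned int i;

	for (i = 0; i < 5; i++) {
		pr_info("retransmit txseq %d\n", seq);	/* prints 60 61 62 63 0 */
		seq = (seq + 1) % 64;
	}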
ret = l2cap_ertm_send(chan); + return ret; +} + +static void l2cap_send_ack(struct l2cap_chan *chan) +{ + u16 control = 0; + + control |= chan->buffer_seq << L2CAP_CTRL_REQSEQ_SHIFT; + + if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) { + control |= L2CAP_SUPER_RCV_NOT_READY; + set_bit(CONN_RNR_SENT, &chan->conn_state); + l2cap_send_sframe(chan, control); + return; + } + + if (l2cap_ertm_send(chan) > 0) + return; + + control |= L2CAP_SUPER_RCV_READY; + l2cap_send_sframe(chan, control); +} + +static void l2cap_send_srejtail(struct l2cap_chan *chan) +{ + struct srej_list *tail; + u16 control; + + control = L2CAP_SUPER_SELECT_REJECT; + control |= L2CAP_CTRL_FINAL; + + tail = list_entry((&chan->srej_l)->prev, struct srej_list, list); + control |= tail->tx_seq << L2CAP_CTRL_REQSEQ_SHIFT; + + l2cap_send_sframe(chan, control); +} + +static inline int l2cap_skbuff_fromiovec(struct sock *sk, struct msghdr *msg, int len, int count, struct sk_buff *skb) +{ + struct l2cap_conn *conn = l2cap_pi(sk)->chan->conn; + struct sk_buff **frag; + int err, sent = 0; + + if (memcpy_fromiovec(skb_put(skb, count), msg->msg_iov, count)) + return -EFAULT; + + sent += count; + len -= count; + + /* Continuation fragments (no L2CAP header) */ + frag = &skb_shinfo(skb)->frag_list; + while (len) { + count = min_t(unsigned int, conn->mtu, len); + + *frag = bt_skb_send_alloc(sk, count, msg->msg_flags & MSG_DONTWAIT, &err); + if (!*frag) + return err; + if (memcpy_fromiovec(skb_put(*frag, count), msg->msg_iov, count)) + return -EFAULT; + + sent += count; + len -= count; + + frag = &(*frag)->next; + } + + return sent; +} + +struct sk_buff *l2cap_create_connless_pdu(struct l2cap_chan *chan, struct msghdr *msg, size_t len) +{ + struct sock *sk = chan->sk; + struct l2cap_conn *conn = chan->conn; + struct sk_buff *skb; + int err, count, hlen = L2CAP_HDR_SIZE + 2; + struct l2cap_hdr *lh; + + BT_DBG("sk %p len %d", sk, (int)len); + + count = min_t(unsigned int, (conn->mtu - hlen), len); + skb = bt_skb_send_alloc(sk, count + hlen, + msg->msg_flags & MSG_DONTWAIT, &err); + if (!skb) + return ERR_PTR(err); + + /* Create L2CAP header */ + lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); + lh->cid = cpu_to_le16(chan->dcid); + lh->len = cpu_to_le16(len + (hlen - L2CAP_HDR_SIZE)); + put_unaligned_le16(chan->psm, skb_put(skb, 2)); + + err = l2cap_skbuff_fromiovec(sk, msg, len, count, skb); + if (unlikely(err < 0)) { + kfree_skb(skb); + return ERR_PTR(err); + } + return skb; +} + +struct sk_buff *l2cap_create_basic_pdu(struct l2cap_chan *chan, struct msghdr *msg, size_t len) +{ + struct sock *sk = chan->sk; + struct l2cap_conn *conn = chan->conn; + struct sk_buff *skb; + int err, count, hlen = L2CAP_HDR_SIZE; + struct l2cap_hdr *lh; + + BT_DBG("sk %p len %d", sk, (int)len); + + count = min_t(unsigned int, (conn->mtu - hlen), len); + skb = bt_skb_send_alloc(sk, count + hlen, + msg->msg_flags & MSG_DONTWAIT, &err); + if (!skb) + return ERR_PTR(err); + + /* Create L2CAP header */ + lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); + lh->cid = cpu_to_le16(chan->dcid); + lh->len = cpu_to_le16(len + (hlen - L2CAP_HDR_SIZE)); + + err = l2cap_skbuff_fromiovec(sk, msg, len, count, skb); + if (unlikely(err < 0)) { + kfree_skb(skb); + return ERR_PTR(err); + } + return skb; +} + +struct sk_buff *l2cap_create_iframe_pdu(struct l2cap_chan *chan, struct msghdr *msg, size_t len, u16 control, u16 sdulen) +{ + struct sock *sk = chan->sk; + struct l2cap_conn *conn = chan->conn; + struct sk_buff *skb; + int err, count, hlen = L2CAP_HDR_SIZE + 2; 
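The three PDU builders here differ only in the per-PDU header budget they reserve before copying the payload. A minimal helper that mirrors that arithmetic, as a sketch (assuming L2CAP_HDR_SIZE is the four-byte basic header of 16-bit length plus CID):

	static unsigned int l2cap_pdu_overhead(bool connless, bool iframe,
					       bool sar_start, bool fcs)
	{
		unsigned int hlen = L2CAP_HDR_SIZE;	/* basic header: length + CID */

		if (connless)
			hlen += 2;	/* PSM, connectionless PDUs only */
		if (iframe)
			hlen += 2;	/* ERTM/streaming control field */
		if (sar_start)
			hlen += 2;	/* SDU length, carried in the first SAR fragment */
		if (fcs)
			hlen += 2;	/* trailing CRC16, cf. L2CAP_FCS_CRC16 above */

		return hlen;	/* e.g. a SAR start frame with FCS: 4 + 2 + 2 + 2 = 10 */
	}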
+ struct l2cap_hdr *lh; + + BT_DBG("sk %p len %d", sk, (int)len); + + if (!conn) + return ERR_PTR(-ENOTCONN); + + if (sdulen) + hlen += 2; + + if (chan->fcs == L2CAP_FCS_CRC16) + hlen += 2; + + count = min_t(unsigned int, (conn->mtu - hlen), len); + skb = bt_skb_send_alloc(sk, count + hlen, + msg->msg_flags & MSG_DONTWAIT, &err); + if (!skb) + return ERR_PTR(err); + + /* Create L2CAP header */ + lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); + lh->cid = cpu_to_le16(chan->dcid); + lh->len = cpu_to_le16(len + (hlen - L2CAP_HDR_SIZE)); + put_unaligned_le16(control, skb_put(skb, 2)); + if (sdulen) + put_unaligned_le16(sdulen, skb_put(skb, 2)); + + err = l2cap_skbuff_fromiovec(sk, msg, len, count, skb); + if (unlikely(err < 0)) { + kfree_skb(skb); + return ERR_PTR(err); + } + + if (chan->fcs == L2CAP_FCS_CRC16) + put_unaligned_le16(0, skb_put(skb, 2)); + + bt_cb(skb)->retries = 0; + return skb; +} + +int l2cap_sar_segment_sdu(struct l2cap_chan *chan, struct msghdr *msg, size_t len) +{ + struct sk_buff *skb; + struct sk_buff_head sar_queue; + u16 control; + size_t size = 0; + + skb_queue_head_init(&sar_queue); + control = L2CAP_SDU_START; + skb = l2cap_create_iframe_pdu(chan, msg, chan->remote_mps, control, len); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + __skb_queue_tail(&sar_queue, skb); + len -= chan->remote_mps; + size += chan->remote_mps; + + while (len > 0) { + size_t buflen; + + if (len > chan->remote_mps) { + control = L2CAP_SDU_CONTINUE; + buflen = chan->remote_mps; + } else { + control = L2CAP_SDU_END; + buflen = len; + } + + skb = l2cap_create_iframe_pdu(chan, msg, buflen, control, 0); + if (IS_ERR(skb)) { + skb_queue_purge(&sar_queue); + return PTR_ERR(skb); + } + + __skb_queue_tail(&sar_queue, skb); + len -= buflen; + size += buflen; + } + skb_queue_splice_tail(&sar_queue, &chan->tx_q); + if (chan->tx_send_head == NULL) + chan->tx_send_head = sar_queue.next; + + return size; +} + +int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len) +{ + struct sk_buff *skb; + u16 control; + int err; + + /* Connectionless channel */ + if (chan->chan_type == L2CAP_CHAN_CONN_LESS) { + skb = l2cap_create_connless_pdu(chan, msg, len); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + l2cap_do_send(chan, skb); + return len; + } + + switch (chan->mode) { + case L2CAP_MODE_BASIC: + /* Check outgoing MTU */ + if (len > chan->omtu) + return -EMSGSIZE; + + /* Create a basic PDU */ + skb = l2cap_create_basic_pdu(chan, msg, len); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + l2cap_do_send(chan, skb); + err = len; + break; + + case L2CAP_MODE_ERTM: + case L2CAP_MODE_STREAMING: + /* Entire SDU fits into one PDU */ + if (len <= chan->remote_mps) { + control = L2CAP_SDU_UNSEGMENTED; + skb = l2cap_create_iframe_pdu(chan, msg, len, control, + 0); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + __skb_queue_tail(&chan->tx_q, skb); + + if (chan->tx_send_head == NULL) + chan->tx_send_head = skb; + + } else { + /* Segment SDU into multiples PDUs */ + err = l2cap_sar_segment_sdu(chan, msg, len); + if (err < 0) + return err; + } + + if (chan->mode == L2CAP_MODE_STREAMING) { + l2cap_streaming_send(chan); + err = len; + break; + } + + if (test_bit(CONN_REMOTE_BUSY, &chan->conn_state) && + test_bit(CONN_WAIT_F, &chan->conn_state)) { + err = len; + break; + } + + err = l2cap_ertm_send(chan); + if (err >= 0) + err = len; + + break; + + default: + BT_DBG("bad state %1.1x", chan->mode); + err = -EBADFD; + } + + return err; +} + +/* Copy frame to all raw sockets on that connection */ +static void 
l2cap_raw_recv(struct l2cap_conn *conn, struct sk_buff *skb) +{ + struct sk_buff *nskb; + struct l2cap_chan *chan; + + BT_DBG("conn %p", conn); + + read_lock(&conn->chan_lock); + list_for_each_entry(chan, &conn->chan_l, list) { + struct sock *sk = chan->sk; + if (chan->chan_type != L2CAP_CHAN_RAW) + continue; + + /* Don't send frame to the socket it came from */ + if (skb->sk == sk) + continue; + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + continue; + + if (chan->ops->recv(chan->data, nskb)) + kfree_skb(nskb); + } + read_unlock(&conn->chan_lock); +} + +/* ---- L2CAP signalling commands ---- */ +static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn, + u8 code, u8 ident, u16 dlen, void *data) +{ + struct sk_buff *skb, **frag; + struct l2cap_cmd_hdr *cmd; + struct l2cap_hdr *lh; + int len, count; + + BT_DBG("conn %p, code 0x%2.2x, ident 0x%2.2x, len %d", + conn, code, ident, dlen); + + len = L2CAP_HDR_SIZE + L2CAP_CMD_HDR_SIZE + dlen; + count = min_t(unsigned int, conn->mtu, len); + + skb = bt_skb_alloc(count, GFP_ATOMIC); + if (!skb) + return NULL; + + lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); + lh->len = cpu_to_le16(L2CAP_CMD_HDR_SIZE + dlen); + + if (conn->hcon->type == LE_LINK) + lh->cid = cpu_to_le16(L2CAP_CID_LE_SIGNALING); + else + lh->cid = cpu_to_le16(L2CAP_CID_SIGNALING); + + cmd = (struct l2cap_cmd_hdr *) skb_put(skb, L2CAP_CMD_HDR_SIZE); + cmd->code = code; + cmd->ident = ident; + cmd->len = cpu_to_le16(dlen); + + if (dlen) { + count -= L2CAP_HDR_SIZE + L2CAP_CMD_HDR_SIZE; + memcpy(skb_put(skb, count), data, count); + data += count; + } + + len -= skb->len; + + /* Continuation fragments (no L2CAP header) */ + frag = &skb_shinfo(skb)->frag_list; + while (len) { + count = min_t(unsigned int, conn->mtu, len); + + *frag = bt_skb_alloc(count, GFP_ATOMIC); + if (!*frag) + goto fail; + + memcpy(skb_put(*frag, count), data, count); + + len -= count; + data += count; + + frag = &(*frag)->next; + } + + return skb; + +fail: + kfree_skb(skb); + return NULL; +} + +static inline int l2cap_get_conf_opt(void **ptr, int *type, int *olen, unsigned long *val) +{ + struct l2cap_conf_opt *opt = *ptr; + int len; + + len = L2CAP_CONF_OPT_SIZE + opt->len; + *ptr += len; + + *type = opt->type; + *olen = opt->len; + + switch (opt->len) { + case 1: + *val = *((u8 *) opt->val); + break; + + case 2: + *val = get_unaligned_le16(opt->val); + break; + + case 4: + *val = get_unaligned_le32(opt->val); + break; + + default: + *val = (unsigned long) opt->val; + break; + } + + BT_DBG("type 0x%2.2x len %d val 0x%lx", *type, opt->len, *val); + return len; +} + +static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val) +{ + struct l2cap_conf_opt *opt = *ptr; + + BT_DBG("type 0x%2.2x len %d val 0x%lx", type, len, val); + + opt->type = type; + opt->len = len; + + switch (len) { + case 1: + *((u8 *) opt->val) = val; + break; + + case 2: + put_unaligned_le16(val, opt->val); + break; + + case 4: + put_unaligned_le32(val, opt->val); + break; + + default: + memcpy(opt->val, (void *) val, len); + break; + } + + *ptr += L2CAP_CONF_OPT_SIZE + len; +} + +static void l2cap_ack_timeout(unsigned long arg) +{ + struct l2cap_chan *chan = (void *) arg; + + bh_lock_sock(chan->sk); + l2cap_send_ack(chan); + bh_unlock_sock(chan->sk); +} + +static inline void l2cap_ertm_init(struct l2cap_chan *chan) +{ + struct sock *sk = chan->sk; + + chan->expected_ack_seq = 0; + chan->unacked_frames = 0; + chan->buffer_seq = 0; + chan->num_acked = 0; + chan->frames_sent = 0; + + 
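+ /* Sequence counters were reset above; now initialise the retransmission,
+  * monitor and acknowledgement timers together with the SREJ queue and
+  * list, and point the socket backlog handler at the ERTM receive path.
+  */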
setup_timer(&chan->retrans_timer, l2cap_retrans_timeout, + (unsigned long) chan); + setup_timer(&chan->monitor_timer, l2cap_monitor_timeout, + (unsigned long) chan); + setup_timer(&chan->ack_timer, l2cap_ack_timeout, (unsigned long) chan); + + skb_queue_head_init(&chan->srej_q); + + INIT_LIST_HEAD(&chan->srej_l); + + + sk->sk_backlog_rcv = l2cap_ertm_data_rcv; +} + +static inline __u8 l2cap_select_mode(__u8 mode, __u16 remote_feat_mask) +{ + switch (mode) { + case L2CAP_MODE_STREAMING: + case L2CAP_MODE_ERTM: + if (l2cap_mode_supported(mode, remote_feat_mask)) + return mode; + /* fall through */ + default: + return L2CAP_MODE_BASIC; + } +} + +static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data) +{ + struct l2cap_conf_req *req = data; + struct l2cap_conf_rfc rfc = { .mode = chan->mode }; + void *ptr = req->data; + + BT_DBG("chan %p", chan); + + if (chan->num_conf_req || chan->num_conf_rsp) + goto done; + + switch (chan->mode) { + case L2CAP_MODE_STREAMING: + case L2CAP_MODE_ERTM: + if (test_bit(CONF_STATE2_DEVICE, &chan->conf_state)) + break; + + /* fall through */ + default: + chan->mode = l2cap_select_mode(rfc.mode, chan->conn->feat_mask); + break; + } + +done: + if (chan->imtu != L2CAP_DEFAULT_MTU) + l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu); + + switch (chan->mode) { + case L2CAP_MODE_BASIC: + if (!(chan->conn->feat_mask & L2CAP_FEAT_ERTM) && + !(chan->conn->feat_mask & L2CAP_FEAT_STREAMING)) + break; + + rfc.mode = L2CAP_MODE_BASIC; + rfc.txwin_size = 0; + rfc.max_transmit = 0; + rfc.retrans_timeout = 0; + rfc.monitor_timeout = 0; + rfc.max_pdu_size = 0; + + l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), + (unsigned long) &rfc); + break; + + case L2CAP_MODE_ERTM: + rfc.mode = L2CAP_MODE_ERTM; + rfc.txwin_size = chan->tx_win; + rfc.max_transmit = chan->max_tx; + rfc.retrans_timeout = 0; + rfc.monitor_timeout = 0; + rfc.max_pdu_size = cpu_to_le16(L2CAP_DEFAULT_MAX_PDU_SIZE); + if (L2CAP_DEFAULT_MAX_PDU_SIZE > chan->conn->mtu - 10) + rfc.max_pdu_size = cpu_to_le16(chan->conn->mtu - 10); + + l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), + (unsigned long) &rfc); + + if (!(chan->conn->feat_mask & L2CAP_FEAT_FCS)) + break; + + if (chan->fcs == L2CAP_FCS_NONE || + test_bit(CONF_NO_FCS_RECV, &chan->conf_state)) { + chan->fcs = L2CAP_FCS_NONE; + l2cap_add_conf_opt(&ptr, L2CAP_CONF_FCS, 1, chan->fcs); + } + break; + + case L2CAP_MODE_STREAMING: + rfc.mode = L2CAP_MODE_STREAMING; + rfc.txwin_size = 0; + rfc.max_transmit = 0; + rfc.retrans_timeout = 0; + rfc.monitor_timeout = 0; + rfc.max_pdu_size = cpu_to_le16(L2CAP_DEFAULT_MAX_PDU_SIZE); + if (L2CAP_DEFAULT_MAX_PDU_SIZE > chan->conn->mtu - 10) + rfc.max_pdu_size = cpu_to_le16(chan->conn->mtu - 10); + + l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), + (unsigned long) &rfc); + + if (!(chan->conn->feat_mask & L2CAP_FEAT_FCS)) + break; + + if (chan->fcs == L2CAP_FCS_NONE || + test_bit(CONF_NO_FCS_RECV, &chan->conf_state)) { + chan->fcs = L2CAP_FCS_NONE; + l2cap_add_conf_opt(&ptr, L2CAP_CONF_FCS, 1, chan->fcs); + } + break; + } + + req->dcid = cpu_to_le16(chan->dcid); + req->flags = cpu_to_le16(0); + + return ptr - data; +} + +static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data) +{ + struct l2cap_conf_rsp *rsp = data; + void *ptr = rsp->data; + void *req = chan->conf_req; + int len = chan->conf_len; + int type, hint, olen; + unsigned long val; + struct l2cap_conf_rfc rfc = { .mode = L2CAP_MODE_BASIC }; + u16 mtu = L2CAP_DEFAULT_MTU; + u16 result = L2CAP_CONF_SUCCESS; + + BT_DBG("chan 
%p", chan); + + while (len >= L2CAP_CONF_OPT_SIZE) { + len -= l2cap_get_conf_opt(&req, &type, &olen, &val); + + hint = type & L2CAP_CONF_HINT; + type &= L2CAP_CONF_MASK; + + switch (type) { + case L2CAP_CONF_MTU: + mtu = val; + break; + + case L2CAP_CONF_FLUSH_TO: + chan->flush_to = val; + break; + + case L2CAP_CONF_QOS: + break; + + case L2CAP_CONF_RFC: + if (olen == sizeof(rfc)) + memcpy(&rfc, (void *) val, olen); + break; + + case L2CAP_CONF_FCS: + if (val == L2CAP_FCS_NONE) + set_bit(CONF_NO_FCS_RECV, &chan->conf_state); + + break; + + default: + if (hint) + break; + + result = L2CAP_CONF_UNKNOWN; + *((u8 *) ptr++) = type; + break; + } + } + + if (chan->num_conf_rsp || chan->num_conf_req > 1) + goto done; + + switch (chan->mode) { + case L2CAP_MODE_STREAMING: + case L2CAP_MODE_ERTM: + if (!test_bit(CONF_STATE2_DEVICE, &chan->conf_state)) { + chan->mode = l2cap_select_mode(rfc.mode, + chan->conn->feat_mask); + break; + } + + if (chan->mode != rfc.mode) + return -ECONNREFUSED; + + break; + } + +done: + if (chan->mode != rfc.mode) { + result = L2CAP_CONF_UNACCEPT; + rfc.mode = chan->mode; + + if (chan->num_conf_rsp == 1) + return -ECONNREFUSED; + + l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, + sizeof(rfc), (unsigned long) &rfc); + } + + + if (result == L2CAP_CONF_SUCCESS) { + /* Configure output options and let the other side know + * which ones we don't like. */ + + if (mtu < L2CAP_DEFAULT_MIN_MTU) + result = L2CAP_CONF_UNACCEPT; + else { + chan->omtu = mtu; + set_bit(CONF_MTU_DONE, &chan->conf_state); + } + l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->omtu); + + switch (rfc.mode) { + case L2CAP_MODE_BASIC: + chan->fcs = L2CAP_FCS_NONE; + set_bit(CONF_MODE_DONE, &chan->conf_state); + break; + + case L2CAP_MODE_ERTM: + chan->remote_tx_win = rfc.txwin_size; + chan->remote_max_tx = rfc.max_transmit; + + if (le16_to_cpu(rfc.max_pdu_size) > chan->conn->mtu - 10) + rfc.max_pdu_size = cpu_to_le16(chan->conn->mtu - 10); + + chan->remote_mps = le16_to_cpu(rfc.max_pdu_size); + + rfc.retrans_timeout = + le16_to_cpu(L2CAP_DEFAULT_RETRANS_TO); + rfc.monitor_timeout = + le16_to_cpu(L2CAP_DEFAULT_MONITOR_TO); + + set_bit(CONF_MODE_DONE, &chan->conf_state); + + l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, + sizeof(rfc), (unsigned long) &rfc); + + break; + + case L2CAP_MODE_STREAMING: + if (le16_to_cpu(rfc.max_pdu_size) > chan->conn->mtu - 10) + rfc.max_pdu_size = cpu_to_le16(chan->conn->mtu - 10); + + chan->remote_mps = le16_to_cpu(rfc.max_pdu_size); + + set_bit(CONF_MODE_DONE, &chan->conf_state); + + l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, + sizeof(rfc), (unsigned long) &rfc); + + break; + + default: + result = L2CAP_CONF_UNACCEPT; + + memset(&rfc, 0, sizeof(rfc)); + rfc.mode = chan->mode; + } + + if (result == L2CAP_CONF_SUCCESS) + set_bit(CONF_OUTPUT_DONE, &chan->conf_state); + } + rsp->scid = cpu_to_le16(chan->dcid); + rsp->result = cpu_to_le16(result); + rsp->flags = cpu_to_le16(0x0000); + + return ptr - data; +} + +static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, void *data, u16 *result) +{ + struct l2cap_conf_req *req = data; + void *ptr = req->data; + int type, olen; + unsigned long val; + struct l2cap_conf_rfc rfc; + + BT_DBG("chan %p, rsp %p, len %d, req %p", chan, rsp, len, data); + + while (len >= L2CAP_CONF_OPT_SIZE) { + len -= l2cap_get_conf_opt(&rsp, &type, &olen, &val); + + switch (type) { + case L2CAP_CONF_MTU: + if (val < L2CAP_DEFAULT_MIN_MTU) { + *result = L2CAP_CONF_UNACCEPT; + chan->imtu = L2CAP_DEFAULT_MIN_MTU; + } else + chan->imtu = val; + 
l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu); + break; + + case L2CAP_CONF_FLUSH_TO: + chan->flush_to = val; + l2cap_add_conf_opt(&ptr, L2CAP_CONF_FLUSH_TO, + 2, chan->flush_to); + break; + + case L2CAP_CONF_RFC: + if (olen == sizeof(rfc)) + memcpy(&rfc, (void *)val, olen); + + if (test_bit(CONF_STATE2_DEVICE, &chan->conf_state) && + rfc.mode != chan->mode) + return -ECONNREFUSED; + + chan->fcs = 0; + + l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, + sizeof(rfc), (unsigned long) &rfc); + break; + } + } + + if (chan->mode == L2CAP_MODE_BASIC && chan->mode != rfc.mode) + return -ECONNREFUSED; + + chan->mode = rfc.mode; + + if (*result == L2CAP_CONF_SUCCESS) { + switch (rfc.mode) { + case L2CAP_MODE_ERTM: + chan->retrans_timeout = le16_to_cpu(rfc.retrans_timeout); + chan->monitor_timeout = le16_to_cpu(rfc.monitor_timeout); + chan->mps = le16_to_cpu(rfc.max_pdu_size); + break; + case L2CAP_MODE_STREAMING: + chan->mps = le16_to_cpu(rfc.max_pdu_size); + } + } + + req->dcid = cpu_to_le16(chan->dcid); + req->flags = cpu_to_le16(0x0000); + + return ptr - data; +} + +static int l2cap_build_conf_rsp(struct l2cap_chan *chan, void *data, u16 result, u16 flags) +{ + struct l2cap_conf_rsp *rsp = data; + void *ptr = rsp->data; + + BT_DBG("chan %p", chan); + + rsp->scid = cpu_to_le16(chan->dcid); + rsp->result = cpu_to_le16(result); + rsp->flags = cpu_to_le16(flags); + + return ptr - data; +} + +void __l2cap_connect_rsp_defer(struct l2cap_chan *chan) +{ + struct l2cap_conn_rsp rsp; + struct l2cap_conn *conn = chan->conn; + u8 buf[128]; + + rsp.scid = cpu_to_le16(chan->dcid); + rsp.dcid = cpu_to_le16(chan->scid); + rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS); + rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); + l2cap_send_cmd(conn, chan->ident, + L2CAP_CONN_RSP, sizeof(rsp), &rsp); + + if (test_and_set_bit(CONF_REQ_SENT, &chan->conf_state)) + return; + + l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, + l2cap_build_conf_req(chan, buf), buf); + chan->num_conf_req++; +} + +static void l2cap_conf_rfc_get(struct l2cap_chan *chan, void *rsp, int len) +{ + int type, olen; + unsigned long val; + struct l2cap_conf_rfc rfc; + + BT_DBG("chan %p, rsp %p, len %d", chan, rsp, len); + + if ((chan->mode != L2CAP_MODE_ERTM) && (chan->mode != L2CAP_MODE_STREAMING)) + return; + + while (len >= L2CAP_CONF_OPT_SIZE) { + len -= l2cap_get_conf_opt(&rsp, &type, &olen, &val); + + switch (type) { + case L2CAP_CONF_RFC: + if (olen == sizeof(rfc)) + memcpy(&rfc, (void *)val, olen); + goto done; + } + } + +done: + switch (rfc.mode) { + case L2CAP_MODE_ERTM: + chan->retrans_timeout = le16_to_cpu(rfc.retrans_timeout); + chan->monitor_timeout = le16_to_cpu(rfc.monitor_timeout); + chan->mps = le16_to_cpu(rfc.max_pdu_size); + break; + case L2CAP_MODE_STREAMING: + chan->mps = le16_to_cpu(rfc.max_pdu_size); + } +} + +static inline int l2cap_command_rej(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data) +{ + struct l2cap_cmd_rej *rej = (struct l2cap_cmd_rej *) data; + + if (rej->reason != 0x0000) + return 0; + + if ((conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) && + cmd->ident == conn->info_ident) { + del_timer(&conn->info_timer); + + conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; + conn->info_ident = 0; + + l2cap_conn_start(conn); + } + + return 0; +} + +static inline int l2cap_connect_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data) +{ + struct l2cap_conn_req *req = (struct l2cap_conn_req *) data; + struct l2cap_conn_rsp rsp; + struct l2cap_chan *chan = NULL, *pchan; + struct sock *parent, 
*sk = NULL; + int result, status = L2CAP_CS_NO_INFO; + + u16 dcid = 0, scid = __le16_to_cpu(req->scid); + __le16 psm = req->psm; + + BT_DBG("psm 0x%2.2x scid 0x%4.4x", psm, scid); + + /* Check if we have socket listening on psm */ + pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, conn->src); + if (!pchan) { + result = L2CAP_CR_BAD_PSM; + goto sendresp; + } + + parent = pchan->sk; + + bh_lock_sock(parent); + + /* Check if the ACL is secure enough (if not SDP) */ + if (psm != cpu_to_le16(0x0001) && + !hci_conn_check_link_mode(conn->hcon)) { + conn->disc_reason = 0x05; + result = L2CAP_CR_SEC_BLOCK; + goto response; + } + + result = L2CAP_CR_NO_MEM; + + /* Check for backlog size */ + if (sk_acceptq_is_full(parent)) { + BT_DBG("backlog full %d", parent->sk_ack_backlog); + goto response; + } + + chan = pchan->ops->new_connection(pchan->data); + if (!chan) + goto response; + + sk = chan->sk; + + write_lock_bh(&conn->chan_lock); + + /* Check if we already have channel with that dcid */ + if (__l2cap_get_chan_by_dcid(conn, scid)) { + write_unlock_bh(&conn->chan_lock); + sock_set_flag(sk, SOCK_ZAPPED); + chan->ops->close(chan->data); + goto response; + } + + hci_conn_hold(conn->hcon); + + bacpy(&bt_sk(sk)->src, conn->src); + bacpy(&bt_sk(sk)->dst, conn->dst); + chan->psm = psm; + chan->dcid = scid; + + bt_accept_enqueue(parent, sk); + + __l2cap_chan_add(conn, chan); + + dcid = chan->scid; + + __set_chan_timer(chan, sk->sk_sndtimeo); + + chan->ident = cmd->ident; + + if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE) { + if (l2cap_check_security(chan)) { + if (bt_sk(sk)->defer_setup) { + l2cap_state_change(chan, BT_CONNECT2); + result = L2CAP_CR_PEND; + status = L2CAP_CS_AUTHOR_PEND; + parent->sk_data_ready(parent, 0); + } else { + l2cap_state_change(chan, BT_CONFIG); + result = L2CAP_CR_SUCCESS; + status = L2CAP_CS_NO_INFO; + } + } else { + l2cap_state_change(chan, BT_CONNECT2); + result = L2CAP_CR_PEND; + status = L2CAP_CS_AUTHEN_PEND; + } + } else { + l2cap_state_change(chan, BT_CONNECT2); + result = L2CAP_CR_PEND; + status = L2CAP_CS_NO_INFO; + } + + write_unlock_bh(&conn->chan_lock); + +response: + bh_unlock_sock(parent); + +sendresp: + rsp.scid = cpu_to_le16(scid); + rsp.dcid = cpu_to_le16(dcid); + rsp.result = cpu_to_le16(result); + rsp.status = cpu_to_le16(status); + l2cap_send_cmd(conn, cmd->ident, L2CAP_CONN_RSP, sizeof(rsp), &rsp); + + if (result == L2CAP_CR_PEND && status == L2CAP_CS_NO_INFO) { + struct l2cap_info_req info; + info.type = cpu_to_le16(L2CAP_IT_FEAT_MASK); + + conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_SENT; + conn->info_ident = l2cap_get_ident(conn); + + mod_timer(&conn->info_timer, jiffies + + msecs_to_jiffies(L2CAP_INFO_TIMEOUT)); + + l2cap_send_cmd(conn, conn->info_ident, + L2CAP_INFO_REQ, sizeof(info), &info); + } + + if (chan && !test_bit(CONF_REQ_SENT, &chan->conf_state) && + result == L2CAP_CR_SUCCESS) { + u8 buf[128]; + set_bit(CONF_REQ_SENT, &chan->conf_state); + l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, + l2cap_build_conf_req(chan, buf), buf); + chan->num_conf_req++; + } + + return 0; +} + +static inline int l2cap_connect_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data) +{ + struct l2cap_conn_rsp *rsp = (struct l2cap_conn_rsp *) data; + u16 scid, dcid, result, status; + struct l2cap_chan *chan; + struct sock *sk; + u8 req[128]; + + scid = __le16_to_cpu(rsp->scid); + dcid = __le16_to_cpu(rsp->dcid); + result = __le16_to_cpu(rsp->result); + status = __le16_to_cpu(rsp->status); + + BT_DBG("dcid 0x%4.4x scid 0x%4.4x result 
0x%2.2x status 0x%2.2x", dcid, scid, result, status); + + if (scid) { + chan = l2cap_get_chan_by_scid(conn, scid); + if (!chan) + return -EFAULT; + } else { + chan = l2cap_get_chan_by_ident(conn, cmd->ident); + if (!chan) + return -EFAULT; + } + + sk = chan->sk; + + switch (result) { + case L2CAP_CR_SUCCESS: + l2cap_state_change(chan, BT_CONFIG); + chan->ident = 0; + chan->dcid = dcid; + clear_bit(CONF_CONNECT_PEND, &chan->conf_state); + + if (test_and_set_bit(CONF_REQ_SENT, &chan->conf_state)) + break; + + l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, + l2cap_build_conf_req(chan, req), req); + chan->num_conf_req++; + break; + + case L2CAP_CR_PEND: + set_bit(CONF_CONNECT_PEND, &chan->conf_state); + break; + + default: + /* don't delete l2cap channel if sk is owned by user */ + if (sock_owned_by_user(sk)) { + l2cap_state_change(chan, BT_DISCONN); + __clear_chan_timer(chan); + __set_chan_timer(chan, HZ / 5); + break; + } + + l2cap_chan_del(chan, ECONNREFUSED); + break; + } + + bh_unlock_sock(sk); + return 0; +} + +static inline void set_default_fcs(struct l2cap_chan *chan) +{ + /* FCS is enabled only in ERTM or streaming mode, if one or both + * sides request it. + */ + if (chan->mode != L2CAP_MODE_ERTM && chan->mode != L2CAP_MODE_STREAMING) + chan->fcs = L2CAP_FCS_NONE; + else if (!test_bit(CONF_NO_FCS_RECV, &chan->conf_state)) + chan->fcs = L2CAP_FCS_CRC16; +} + +static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u16 cmd_len, u8 *data) +{ + struct l2cap_conf_req *req = (struct l2cap_conf_req *) data; + u16 dcid, flags; + u8 rsp[64]; + struct l2cap_chan *chan; + struct sock *sk; + int len; + + dcid = __le16_to_cpu(req->dcid); + flags = __le16_to_cpu(req->flags); + + BT_DBG("dcid 0x%4.4x flags 0x%2.2x", dcid, flags); + + chan = l2cap_get_chan_by_scid(conn, dcid); + if (!chan) + return -ENOENT; + + sk = chan->sk; + + if (sk->sk_state != BT_CONFIG && sk->sk_state != BT_CONNECT2) { + struct l2cap_cmd_rej rej; + + rej.reason = cpu_to_le16(0x0002); + l2cap_send_cmd(conn, cmd->ident, L2CAP_COMMAND_REJ, + sizeof(rej), &rej); + goto unlock; + } + + /* Reject if config buffer is too small. */ + len = cmd_len - sizeof(*req); + if (len < 0 || chan->conf_len + len > sizeof(chan->conf_req)) { + l2cap_send_cmd(conn, cmd->ident, L2CAP_CONF_RSP, + l2cap_build_conf_rsp(chan, rsp, + L2CAP_CONF_REJECT, flags), rsp); + goto unlock; + } + + /* Store config. */ + memcpy(chan->conf_req + chan->conf_len, req->data, len); + chan->conf_len += len; + + if (flags & 0x0001) { + /* Incomplete config. Send empty response. */ + l2cap_send_cmd(conn, cmd->ident, L2CAP_CONF_RSP, + l2cap_build_conf_rsp(chan, rsp, + L2CAP_CONF_SUCCESS, 0x0001), rsp); + goto unlock; + } + + /* Complete config. */ + len = l2cap_parse_conf_req(chan, rsp); + if (len < 0) { + l2cap_send_disconn_req(conn, chan, ECONNRESET); + goto unlock; + } + + l2cap_send_cmd(conn, cmd->ident, L2CAP_CONF_RSP, len, rsp); + chan->num_conf_rsp++; + + /* Reset config buffer. 
*/ + chan->conf_len = 0; + + if (!test_bit(CONF_OUTPUT_DONE, &chan->conf_state)) + goto unlock; + + if (test_bit(CONF_INPUT_DONE, &chan->conf_state)) { + set_default_fcs(chan); + + l2cap_state_change(chan, BT_CONNECTED); + + chan->next_tx_seq = 0; + chan->expected_tx_seq = 0; + skb_queue_head_init(&chan->tx_q); + if (chan->mode == L2CAP_MODE_ERTM) + l2cap_ertm_init(chan); + + l2cap_chan_ready(sk); + goto unlock; + } + + if (!test_and_set_bit(CONF_REQ_SENT, &chan->conf_state)) { + u8 buf[64]; + l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, + l2cap_build_conf_req(chan, buf), buf); + chan->num_conf_req++; + } + +unlock: + bh_unlock_sock(sk); + return 0; +} + +static inline int l2cap_config_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data) +{ + struct l2cap_conf_rsp *rsp = (struct l2cap_conf_rsp *)data; + u16 scid, flags, result; + struct l2cap_chan *chan; + struct sock *sk; + int len = cmd->len - sizeof(*rsp); + + scid = __le16_to_cpu(rsp->scid); + flags = __le16_to_cpu(rsp->flags); + result = __le16_to_cpu(rsp->result); + + BT_DBG("scid 0x%4.4x flags 0x%2.2x result 0x%2.2x", + scid, flags, result); + + chan = l2cap_get_chan_by_scid(conn, scid); + if (!chan) + return 0; + + sk = chan->sk; + + switch (result) { + case L2CAP_CONF_SUCCESS: + l2cap_conf_rfc_get(chan, rsp->data, len); + break; + + case L2CAP_CONF_UNACCEPT: + if (chan->num_conf_rsp <= L2CAP_CONF_MAX_CONF_RSP) { + char req[64]; + + if (len > sizeof(req) - sizeof(struct l2cap_conf_req)) { + l2cap_send_disconn_req(conn, chan, ECONNRESET); + goto done; + } + + /* throw out any old stored conf requests */ + result = L2CAP_CONF_SUCCESS; + len = l2cap_parse_conf_rsp(chan, rsp->data, len, + req, &result); + if (len < 0) { + l2cap_send_disconn_req(conn, chan, ECONNRESET); + goto done; + } + + l2cap_send_cmd(conn, l2cap_get_ident(conn), + L2CAP_CONF_REQ, len, req); + chan->num_conf_req++; + if (result != L2CAP_CONF_SUCCESS) + goto done; + break; + } + + default: + sk->sk_err = ECONNRESET; + __set_chan_timer(chan, HZ * 5); + l2cap_send_disconn_req(conn, chan, ECONNRESET); + goto done; + } + + if (flags & 0x01) + goto done; + + set_bit(CONF_INPUT_DONE, &chan->conf_state); + + if (test_bit(CONF_OUTPUT_DONE, &chan->conf_state)) { + set_default_fcs(chan); + + l2cap_state_change(chan, BT_CONNECTED); + chan->next_tx_seq = 0; + chan->expected_tx_seq = 0; + skb_queue_head_init(&chan->tx_q); + if (chan->mode == L2CAP_MODE_ERTM) + l2cap_ertm_init(chan); + + l2cap_chan_ready(sk); + } + +done: + bh_unlock_sock(sk); + return 0; +} + +static inline int l2cap_disconnect_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data) +{ + struct l2cap_disconn_req *req = (struct l2cap_disconn_req *) data; + struct l2cap_disconn_rsp rsp; + u16 dcid, scid; + struct l2cap_chan *chan; + struct sock *sk; + + scid = __le16_to_cpu(req->scid); + dcid = __le16_to_cpu(req->dcid); + + BT_DBG("scid 0x%4.4x dcid 0x%4.4x", scid, dcid); + + chan = l2cap_get_chan_by_scid(conn, dcid); + if (!chan) + return 0; + + sk = chan->sk; + + rsp.dcid = cpu_to_le16(chan->scid); + rsp.scid = cpu_to_le16(chan->dcid); + l2cap_send_cmd(conn, cmd->ident, L2CAP_DISCONN_RSP, sizeof(rsp), &rsp); + + sk->sk_shutdown = SHUTDOWN_MASK; + + /* don't delete l2cap channel if sk is owned by user */ + if (sock_owned_by_user(sk)) { + l2cap_state_change(chan, BT_DISCONN); + __clear_chan_timer(chan); + __set_chan_timer(chan, HZ / 5); + bh_unlock_sock(sk); + return 0; + } + + l2cap_chan_del(chan, ECONNRESET); + bh_unlock_sock(sk); + + chan->ops->close(chan->data); + return 0; 
+} + +static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data) +{ + struct l2cap_disconn_rsp *rsp = (struct l2cap_disconn_rsp *) data; + u16 dcid, scid; + struct l2cap_chan *chan; + struct sock *sk; + + scid = __le16_to_cpu(rsp->scid); + dcid = __le16_to_cpu(rsp->dcid); + + BT_DBG("dcid 0x%4.4x scid 0x%4.4x", dcid, scid); + + chan = l2cap_get_chan_by_scid(conn, scid); + if (!chan) + return 0; + + sk = chan->sk; + + /* don't delete l2cap channel if sk is owned by user */ + if (sock_owned_by_user(sk)) { + l2cap_state_change(chan,BT_DISCONN); + __clear_chan_timer(chan); + __set_chan_timer(chan, HZ / 5); + bh_unlock_sock(sk); + return 0; + } + + l2cap_chan_del(chan, 0); + bh_unlock_sock(sk); + + chan->ops->close(chan->data); + return 0; +} + +static inline int l2cap_information_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data) +{ + struct l2cap_info_req *req = (struct l2cap_info_req *) data; + u16 type; + + type = __le16_to_cpu(req->type); + + BT_DBG("type 0x%4.4x", type); + + if (type == L2CAP_IT_FEAT_MASK) { + u8 buf[8]; + u32 feat_mask = l2cap_feat_mask; + struct l2cap_info_rsp *rsp = (struct l2cap_info_rsp *) buf; + rsp->type = cpu_to_le16(L2CAP_IT_FEAT_MASK); + rsp->result = cpu_to_le16(L2CAP_IR_SUCCESS); + if (!disable_ertm) + feat_mask |= L2CAP_FEAT_ERTM | L2CAP_FEAT_STREAMING + | L2CAP_FEAT_FCS; + put_unaligned_le32(feat_mask, rsp->data); + l2cap_send_cmd(conn, cmd->ident, + L2CAP_INFO_RSP, sizeof(buf), buf); + } else if (type == L2CAP_IT_FIXED_CHAN) { + u8 buf[12]; + struct l2cap_info_rsp *rsp = (struct l2cap_info_rsp *) buf; + rsp->type = cpu_to_le16(L2CAP_IT_FIXED_CHAN); + rsp->result = cpu_to_le16(L2CAP_IR_SUCCESS); + memcpy(buf + 4, l2cap_fixed_chan, 8); + l2cap_send_cmd(conn, cmd->ident, + L2CAP_INFO_RSP, sizeof(buf), buf); + } else { + struct l2cap_info_rsp rsp; + rsp.type = cpu_to_le16(type); + rsp.result = cpu_to_le16(L2CAP_IR_NOTSUPP); + l2cap_send_cmd(conn, cmd->ident, + L2CAP_INFO_RSP, sizeof(rsp), &rsp); + } + + return 0; +} + +static inline int l2cap_information_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data) +{ + struct l2cap_info_rsp *rsp = (struct l2cap_info_rsp *) data; + u16 type, result; + + type = __le16_to_cpu(rsp->type); + result = __le16_to_cpu(rsp->result); + + BT_DBG("type 0x%4.4x result 0x%2.2x", type, result); + + /* L2CAP Info req/rsp are unbound to channels, add extra checks */ + if (cmd->ident != conn->info_ident || + conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE) + return 0; + + del_timer(&conn->info_timer); + + if (result != L2CAP_IR_SUCCESS) { + conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; + conn->info_ident = 0; + + l2cap_conn_start(conn); + + return 0; + } + + if (type == L2CAP_IT_FEAT_MASK) { + conn->feat_mask = get_unaligned_le32(rsp->data); + + if (conn->feat_mask & L2CAP_FEAT_FIXED_CHAN) { + struct l2cap_info_req req; + req.type = cpu_to_le16(L2CAP_IT_FIXED_CHAN); + + conn->info_ident = l2cap_get_ident(conn); + + l2cap_send_cmd(conn, conn->info_ident, + L2CAP_INFO_REQ, sizeof(req), &req); + } else { + conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; + conn->info_ident = 0; + + l2cap_conn_start(conn); + } + } else if (type == L2CAP_IT_FIXED_CHAN) { + conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; + conn->info_ident = 0; + + l2cap_conn_start(conn); + } + + return 0; +} + +static inline int l2cap_check_conn_param(u16 min, u16 max, u16 latency, + u16 to_multiplier) +{ + u16 max_latency; + + if (min > max || min < 6 || max > 3200) + return -EINVAL; + + if 
(to_multiplier < 10 || to_multiplier > 3200) + return -EINVAL; + + if (max >= to_multiplier * 8) + return -EINVAL; + + max_latency = (to_multiplier * 8 / max) - 1; + if (latency > 499 || latency > max_latency) + return -EINVAL; + + return 0; +} + +static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn, + struct l2cap_cmd_hdr *cmd, u8 *data) +{ + struct hci_conn *hcon = conn->hcon; + struct l2cap_conn_param_update_req *req; + struct l2cap_conn_param_update_rsp rsp; + u16 min, max, latency, to_multiplier, cmd_len; + int err; + + if (!(hcon->link_mode & HCI_LM_MASTER)) + return -EINVAL; + + cmd_len = __le16_to_cpu(cmd->len); + if (cmd_len != sizeof(struct l2cap_conn_param_update_req)) + return -EPROTO; + + req = (struct l2cap_conn_param_update_req *) data; + min = __le16_to_cpu(req->min); + max = __le16_to_cpu(req->max); + latency = __le16_to_cpu(req->latency); + to_multiplier = __le16_to_cpu(req->to_multiplier); + + BT_DBG("min 0x%4.4x max 0x%4.4x latency: 0x%4.4x Timeout: 0x%4.4x", + min, max, latency, to_multiplier); + + memset(&rsp, 0, sizeof(rsp)); + + err = l2cap_check_conn_param(min, max, latency, to_multiplier); + if (err) + rsp.result = cpu_to_le16(L2CAP_CONN_PARAM_REJECTED); + else + rsp.result = cpu_to_le16(L2CAP_CONN_PARAM_ACCEPTED); + + l2cap_send_cmd(conn, cmd->ident, L2CAP_CONN_PARAM_UPDATE_RSP, + sizeof(rsp), &rsp); + + if (!err) + hci_le_conn_update(hcon, min, max, latency, to_multiplier); + + return 0; +} + +static inline int l2cap_bredr_sig_cmd(struct l2cap_conn *conn, + struct l2cap_cmd_hdr *cmd, u16 cmd_len, u8 *data) +{ + int err = 0; + + switch (cmd->code) { + case L2CAP_COMMAND_REJ: + l2cap_command_rej(conn, cmd, data); + break; + + case L2CAP_CONN_REQ: + err = l2cap_connect_req(conn, cmd, data); + break; + + case L2CAP_CONN_RSP: + err = l2cap_connect_rsp(conn, cmd, data); + break; + + case L2CAP_CONF_REQ: + err = l2cap_config_req(conn, cmd, cmd_len, data); + break; + + case L2CAP_CONF_RSP: + err = l2cap_config_rsp(conn, cmd, data); + break; + + case L2CAP_DISCONN_REQ: + err = l2cap_disconnect_req(conn, cmd, data); + break; + + case L2CAP_DISCONN_RSP: + err = l2cap_disconnect_rsp(conn, cmd, data); + break; + + case L2CAP_ECHO_REQ: + l2cap_send_cmd(conn, cmd->ident, L2CAP_ECHO_RSP, cmd_len, data); + break; + + case L2CAP_ECHO_RSP: + break; + + case L2CAP_INFO_REQ: + err = l2cap_information_req(conn, cmd, data); + break; + + case L2CAP_INFO_RSP: + err = l2cap_information_rsp(conn, cmd, data); + break; + + default: + BT_ERR("Unknown BR/EDR signaling command 0x%2.2x", cmd->code); + err = -EINVAL; + break; + } + + return err; +} + +static inline int l2cap_le_sig_cmd(struct l2cap_conn *conn, + struct l2cap_cmd_hdr *cmd, u8 *data) +{ + switch (cmd->code) { + case L2CAP_COMMAND_REJ: + return 0; + + case L2CAP_CONN_PARAM_UPDATE_REQ: + return l2cap_conn_param_update_req(conn, cmd, data); + + case L2CAP_CONN_PARAM_UPDATE_RSP: + return 0; + + default: + BT_ERR("Unknown LE signaling command 0x%2.2x", cmd->code); + return -EINVAL; + } +} + +static inline void l2cap_sig_channel(struct l2cap_conn *conn, + struct sk_buff *skb) +{ + u8 *data = skb->data; + int len = skb->len; + struct l2cap_cmd_hdr cmd; + int err; + + l2cap_raw_recv(conn, skb); + + while (len >= L2CAP_CMD_HDR_SIZE) { + u16 cmd_len; + memcpy(&cmd, data, L2CAP_CMD_HDR_SIZE); + data += L2CAP_CMD_HDR_SIZE; + len -= L2CAP_CMD_HDR_SIZE; + + cmd_len = le16_to_cpu(cmd.len); + + BT_DBG("code 0x%2.2x len %d id 0x%2.2x", cmd.code, cmd_len, cmd.ident); + + if (cmd_len > len || !cmd.ident) { + BT_DBG("corrupted 
command"); + break; + } + + if (conn->hcon->type == LE_LINK) + err = l2cap_le_sig_cmd(conn, &cmd, data); + else + err = l2cap_bredr_sig_cmd(conn, &cmd, cmd_len, data); + + if (err) { + struct l2cap_cmd_rej rej; + + BT_ERR("Wrong link type (%d)", err); + + /* FIXME: Map err to a valid reason */ + rej.reason = cpu_to_le16(0); + l2cap_send_cmd(conn, cmd.ident, L2CAP_COMMAND_REJ, sizeof(rej), &rej); + } + + data += cmd_len; + len -= cmd_len; + } + + kfree_skb(skb); +} + +static int l2cap_check_fcs(struct l2cap_chan *chan, struct sk_buff *skb) +{ + u16 our_fcs, rcv_fcs; + int hdr_size = L2CAP_HDR_SIZE + 2; + + if (chan->fcs == L2CAP_FCS_CRC16) { + skb_trim(skb, skb->len - 2); + rcv_fcs = get_unaligned_le16(skb->data + skb->len); + our_fcs = crc16(0, skb->data - hdr_size, skb->len + hdr_size); + + if (our_fcs != rcv_fcs) + return -EBADMSG; + } + return 0; +} + +static inline void l2cap_send_i_or_rr_or_rnr(struct l2cap_chan *chan) +{ + u16 control = 0; + + chan->frames_sent = 0; + + control |= chan->buffer_seq << L2CAP_CTRL_REQSEQ_SHIFT; + + if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) { + control |= L2CAP_SUPER_RCV_NOT_READY; + l2cap_send_sframe(chan, control); + set_bit(CONN_RNR_SENT, &chan->conn_state); + } + + if (test_bit(CONN_REMOTE_BUSY, &chan->conn_state)) + l2cap_retransmit_frames(chan); + + l2cap_ertm_send(chan); + + if (!test_bit(CONN_LOCAL_BUSY, &chan->conn_state) && + chan->frames_sent == 0) { + control |= L2CAP_SUPER_RCV_READY; + l2cap_send_sframe(chan, control); + } +} + +static int l2cap_add_to_srej_queue(struct l2cap_chan *chan, struct sk_buff *skb, u8 tx_seq, u8 sar) +{ + struct sk_buff *next_skb; + int tx_seq_offset, next_tx_seq_offset; + + bt_cb(skb)->tx_seq = tx_seq; + bt_cb(skb)->sar = sar; + + next_skb = skb_peek(&chan->srej_q); + if (!next_skb) { + __skb_queue_tail(&chan->srej_q, skb); + return 0; + } + + tx_seq_offset = (tx_seq - chan->buffer_seq) % 64; + if (tx_seq_offset < 0) + tx_seq_offset += 64; + + do { + if (bt_cb(next_skb)->tx_seq == tx_seq) + return -EINVAL; + + next_tx_seq_offset = (bt_cb(next_skb)->tx_seq - + chan->buffer_seq) % 64; + if (next_tx_seq_offset < 0) + next_tx_seq_offset += 64; + + if (next_tx_seq_offset > tx_seq_offset) { + __skb_queue_before(&chan->srej_q, next_skb, skb); + return 0; + } + + if (skb_queue_is_last(&chan->srej_q, next_skb)) + break; + + } while ((next_skb = skb_queue_next(&chan->srej_q, next_skb))); + + __skb_queue_tail(&chan->srej_q, skb); + + return 0; +} + +static int l2cap_ertm_reassembly_sdu(struct l2cap_chan *chan, struct sk_buff *skb, u16 control) +{ + struct sk_buff *_skb; + int err; + + switch (control & L2CAP_CTRL_SAR) { + case L2CAP_SDU_UNSEGMENTED: + if (test_bit(CONN_SAR_SDU, &chan->conn_state)) + goto drop; + + return chan->ops->recv(chan->data, skb); + + case L2CAP_SDU_START: + if (test_bit(CONN_SAR_SDU, &chan->conn_state)) + goto drop; + + chan->sdu_len = get_unaligned_le16(skb->data); + + if (chan->sdu_len > chan->imtu) + goto disconnect; + + chan->sdu = bt_skb_alloc(chan->sdu_len, GFP_ATOMIC); + if (!chan->sdu) + return -ENOMEM; + + /* pull sdu_len bytes only after alloc, because of Local Busy + * condition we have to be sure that this will be executed + * only once, i.e., when alloc does not fail */ + skb_pull(skb, 2); + + memcpy(skb_put(chan->sdu, skb->len), skb->data, skb->len); + + set_bit(CONN_SAR_SDU, &chan->conn_state); + chan->partial_sdu_len = skb->len; + break; + + case L2CAP_SDU_CONTINUE: + if (!test_bit(CONN_SAR_SDU, &chan->conn_state)) + goto disconnect; + + if (!chan->sdu) + goto disconnect; + + 
chan->partial_sdu_len += skb->len; + if (chan->partial_sdu_len > chan->sdu_len) + goto drop; + + memcpy(skb_put(chan->sdu, skb->len), skb->data, skb->len); + + break; + + case L2CAP_SDU_END: + if (!test_bit(CONN_SAR_SDU, &chan->conn_state)) + goto disconnect; + + if (!chan->sdu) + goto disconnect; + + chan->partial_sdu_len += skb->len; + + if (chan->partial_sdu_len > chan->imtu) + goto drop; + + if (chan->partial_sdu_len != chan->sdu_len) + goto drop; + + memcpy(skb_put(chan->sdu, skb->len), skb->data, skb->len); + + _skb = skb_clone(chan->sdu, GFP_ATOMIC); + if (!_skb) { + return -ENOMEM; + } + + err = chan->ops->recv(chan->data, _skb); + if (err < 0) { + kfree_skb(_skb); + return err; + } + + clear_bit(CONN_SAR_SDU, &chan->conn_state); + + kfree_skb(chan->sdu); + break; + } + + kfree_skb(skb); + return 0; + +drop: + kfree_skb(chan->sdu); + chan->sdu = NULL; + +disconnect: + l2cap_send_disconn_req(chan->conn, chan, ECONNRESET); + kfree_skb(skb); + return 0; +} + +static void l2cap_ertm_enter_local_busy(struct l2cap_chan *chan) +{ + u16 control; + + BT_DBG("chan %p, Enter local busy", chan); + + set_bit(CONN_LOCAL_BUSY, &chan->conn_state); + + control = chan->buffer_seq << L2CAP_CTRL_REQSEQ_SHIFT; + control |= L2CAP_SUPER_RCV_NOT_READY; + l2cap_send_sframe(chan, control); + + set_bit(CONN_RNR_SENT, &chan->conn_state); + + __clear_ack_timer(chan); +} + +static void l2cap_ertm_exit_local_busy(struct l2cap_chan *chan) +{ + u16 control; + + if (!test_bit(CONN_RNR_SENT, &chan->conn_state)) + goto done; + + control = chan->buffer_seq << L2CAP_CTRL_REQSEQ_SHIFT; + control |= L2CAP_SUPER_RCV_READY | L2CAP_CTRL_POLL; + l2cap_send_sframe(chan, control); + chan->retry_count = 1; + + __clear_retrans_timer(chan); + __set_monitor_timer(chan); + + set_bit(CONN_WAIT_F, &chan->conn_state); + +done: + clear_bit(CONN_LOCAL_BUSY, &chan->conn_state); + clear_bit(CONN_RNR_SENT, &chan->conn_state); + + BT_DBG("chan %p, Exit local busy", chan); +} + +void l2cap_chan_busy(struct l2cap_chan *chan, int busy) +{ + if (chan->mode == L2CAP_MODE_ERTM) { + if (busy) + l2cap_ertm_enter_local_busy(chan); + else + l2cap_ertm_exit_local_busy(chan); + } +} + +static int l2cap_streaming_reassembly_sdu(struct l2cap_chan *chan, struct sk_buff *skb, u16 control) +{ + struct sk_buff *_skb; + int err = -EINVAL; + + /* + * TODO: We have to notify the userland if some data is lost with the + * Streaming Mode. 
+ */ + + switch (control & L2CAP_CTRL_SAR) { + case L2CAP_SDU_UNSEGMENTED: + if (test_bit(CONN_SAR_SDU, &chan->conn_state)) { + kfree_skb(chan->sdu); + break; + } + + err = chan->ops->recv(chan->data, skb); + if (!err) + return 0; + + break; + + case L2CAP_SDU_START: + if (test_bit(CONN_SAR_SDU, &chan->conn_state)) { + kfree_skb(chan->sdu); + break; + } + + chan->sdu_len = get_unaligned_le16(skb->data); + skb_pull(skb, 2); + + if (chan->sdu_len > chan->imtu) { + err = -EMSGSIZE; + break; + } + + chan->sdu = bt_skb_alloc(chan->sdu_len, GFP_ATOMIC); + if (!chan->sdu) { + err = -ENOMEM; + break; + } + + memcpy(skb_put(chan->sdu, skb->len), skb->data, skb->len); + + set_bit(CONN_SAR_SDU, &chan->conn_state); + chan->partial_sdu_len = skb->len; + err = 0; + break; + + case L2CAP_SDU_CONTINUE: + if (!test_bit(CONN_SAR_SDU, &chan->conn_state)) + break; + + memcpy(skb_put(chan->sdu, skb->len), skb->data, skb->len); + + chan->partial_sdu_len += skb->len; + if (chan->partial_sdu_len > chan->sdu_len) + kfree_skb(chan->sdu); + else + err = 0; + + break; + + case L2CAP_SDU_END: + if (!test_bit(CONN_SAR_SDU, &chan->conn_state)) + break; + + memcpy(skb_put(chan->sdu, skb->len), skb->data, skb->len); + + clear_bit(CONN_SAR_SDU, &chan->conn_state); + chan->partial_sdu_len += skb->len; + + if (chan->partial_sdu_len > chan->imtu) + goto drop; + + if (chan->partial_sdu_len == chan->sdu_len) { + _skb = skb_clone(chan->sdu, GFP_ATOMIC); + err = chan->ops->recv(chan->data, _skb); + if (err < 0) + kfree_skb(_skb); + } + err = 0; + +drop: + kfree_skb(chan->sdu); + break; + } + + kfree_skb(skb); + return err; +} + +static void l2cap_check_srej_gap(struct l2cap_chan *chan, u8 tx_seq) +{ + struct sk_buff *skb; + u16 control; + + while ((skb = skb_peek(&chan->srej_q)) && + !test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) { + int err; + + if (bt_cb(skb)->tx_seq != tx_seq) + break; + + skb = skb_dequeue(&chan->srej_q); + control = bt_cb(skb)->sar << L2CAP_CTRL_SAR_SHIFT; + err = l2cap_ertm_reassembly_sdu(chan, skb, control); + + if (err < 0) { + l2cap_send_disconn_req(chan->conn, chan, ECONNRESET); + break; + } + + chan->buffer_seq_srej = + (chan->buffer_seq_srej + 1) % 64; + tx_seq = (tx_seq + 1) % 64; + } +} + +static void l2cap_resend_srejframe(struct l2cap_chan *chan, u8 tx_seq) +{ + struct srej_list *l, *tmp; + u16 control; + + list_for_each_entry_safe(l, tmp, &chan->srej_l, list) { + if (l->tx_seq == tx_seq) { + list_del(&l->list); + kfree(l); + return; + } + control = L2CAP_SUPER_SELECT_REJECT; + control |= l->tx_seq << L2CAP_CTRL_REQSEQ_SHIFT; + l2cap_send_sframe(chan, control); + list_del(&l->list); + list_add_tail(&l->list, &chan->srej_l); + } +} + +static void l2cap_send_srejframe(struct l2cap_chan *chan, u8 tx_seq) +{ + struct srej_list *new; + u16 control; + + while (tx_seq != chan->expected_tx_seq) { + control = L2CAP_SUPER_SELECT_REJECT; + control |= chan->expected_tx_seq << L2CAP_CTRL_REQSEQ_SHIFT; + l2cap_send_sframe(chan, control); + + new = kzalloc(sizeof(struct srej_list), GFP_ATOMIC); + new->tx_seq = chan->expected_tx_seq; + chan->expected_tx_seq = (chan->expected_tx_seq + 1) % 64; + list_add_tail(&new->list, &chan->srej_l); + } + chan->expected_tx_seq = (chan->expected_tx_seq + 1) % 64; +} + +static inline int l2cap_data_channel_iframe(struct l2cap_chan *chan, u16 rx_control, struct sk_buff *skb) +{ + u8 tx_seq = __get_txseq(rx_control); + u8 req_seq = __get_reqseq(rx_control); + u8 sar = rx_control >> L2CAP_CTRL_SAR_SHIFT; + int tx_seq_offset, expected_tx_seq_offset; + int num_to_ack = 
(chan->tx_win/6) + 1; + int err = 0; + + BT_DBG("chan %p len %d tx_seq %d rx_control 0x%4.4x", chan, skb->len, + tx_seq, rx_control); + + if (L2CAP_CTRL_FINAL & rx_control && + test_bit(CONN_WAIT_F, &chan->conn_state)) { + __clear_monitor_timer(chan); + if (chan->unacked_frames > 0) + __set_retrans_timer(chan); + clear_bit(CONN_WAIT_F, &chan->conn_state); + } + + chan->expected_ack_seq = req_seq; + l2cap_drop_acked_frames(chan); + + tx_seq_offset = (tx_seq - chan->buffer_seq) % 64; + if (tx_seq_offset < 0) + tx_seq_offset += 64; + + /* invalid tx_seq */ + if (tx_seq_offset >= chan->tx_win) { + l2cap_send_disconn_req(chan->conn, chan, ECONNRESET); + goto drop; + } + + if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) + goto drop; + + if (tx_seq == chan->expected_tx_seq) + goto expected; + + if (test_bit(CONN_SREJ_SENT, &chan->conn_state)) { + struct srej_list *first; + + first = list_first_entry(&chan->srej_l, + struct srej_list, list); + if (tx_seq == first->tx_seq) { + l2cap_add_to_srej_queue(chan, skb, tx_seq, sar); + l2cap_check_srej_gap(chan, tx_seq); + + list_del(&first->list); + kfree(first); + + if (list_empty(&chan->srej_l)) { + chan->buffer_seq = chan->buffer_seq_srej; + clear_bit(CONN_SREJ_SENT, &chan->conn_state); + l2cap_send_ack(chan); + BT_DBG("chan %p, Exit SREJ_SENT", chan); + } + } else { + struct srej_list *l; + + /* duplicated tx_seq */ + if (l2cap_add_to_srej_queue(chan, skb, tx_seq, sar) < 0) + goto drop; + + list_for_each_entry(l, &chan->srej_l, list) { + if (l->tx_seq == tx_seq) { + l2cap_resend_srejframe(chan, tx_seq); + return 0; + } + } + l2cap_send_srejframe(chan, tx_seq); + } + } else { + expected_tx_seq_offset = + (chan->expected_tx_seq - chan->buffer_seq) % 64; + if (expected_tx_seq_offset < 0) + expected_tx_seq_offset += 64; + + /* duplicated tx_seq */ + if (tx_seq_offset < expected_tx_seq_offset) + goto drop; + + set_bit(CONN_SREJ_SENT, &chan->conn_state); + + BT_DBG("chan %p, Enter SREJ", chan); + + INIT_LIST_HEAD(&chan->srej_l); + chan->buffer_seq_srej = chan->buffer_seq; + + __skb_queue_head_init(&chan->srej_q); + l2cap_add_to_srej_queue(chan, skb, tx_seq, sar); + + set_bit(CONN_SEND_PBIT, &chan->conn_state); + + l2cap_send_srejframe(chan, tx_seq); + + __clear_ack_timer(chan); + } + return 0; + +expected: + chan->expected_tx_seq = (chan->expected_tx_seq + 1) % 64; + + if (test_bit(CONN_SREJ_SENT, &chan->conn_state)) { + bt_cb(skb)->tx_seq = tx_seq; + bt_cb(skb)->sar = sar; + __skb_queue_tail(&chan->srej_q, skb); + return 0; + } + + err = l2cap_ertm_reassembly_sdu(chan, skb, rx_control); + chan->buffer_seq = (chan->buffer_seq + 1) % 64; + if (err < 0) { + l2cap_send_disconn_req(chan->conn, chan, ECONNRESET); + return err; + } + + if (rx_control & L2CAP_CTRL_FINAL) { + if (!test_and_clear_bit(CONN_REJ_ACT, &chan->conn_state)) + l2cap_retransmit_frames(chan); + } + + __set_ack_timer(chan); + + chan->num_acked = (chan->num_acked + 1) % num_to_ack; + if (chan->num_acked == num_to_ack - 1) + l2cap_send_ack(chan); + + return 0; + +drop: + kfree_skb(skb); + return 0; +} + +static inline void l2cap_data_channel_rrframe(struct l2cap_chan *chan, u16 rx_control) +{ + BT_DBG("chan %p, req_seq %d ctrl 0x%4.4x", chan, __get_reqseq(rx_control), + rx_control); + + chan->expected_ack_seq = __get_reqseq(rx_control); + l2cap_drop_acked_frames(chan); + + if (rx_control & L2CAP_CTRL_POLL) { + set_bit(CONN_SEND_FBIT, &chan->conn_state); + if (test_bit(CONN_SREJ_SENT, &chan->conn_state)) { + if (test_bit(CONN_REMOTE_BUSY, &chan->conn_state) && + (chan->unacked_frames > 0)) + 
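+ /* The peer was busy and frames are still unacknowledged, so re-arm
+  * the retransmission timer before clearing the remote busy state.
+  */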
__set_retrans_timer(chan); + + clear_bit(CONN_REMOTE_BUSY, &chan->conn_state); + l2cap_send_srejtail(chan); + } else { + l2cap_send_i_or_rr_or_rnr(chan); + } + + } else if (rx_control & L2CAP_CTRL_FINAL) { + clear_bit(CONN_REMOTE_BUSY, &chan->conn_state); + + if (!test_and_clear_bit(CONN_REJ_ACT, &chan->conn_state)) + l2cap_retransmit_frames(chan); + + } else { + if (test_bit(CONN_REMOTE_BUSY, &chan->conn_state) && + (chan->unacked_frames > 0)) + __set_retrans_timer(chan); + + clear_bit(CONN_REMOTE_BUSY, &chan->conn_state); + if (test_bit(CONN_SREJ_SENT, &chan->conn_state)) + l2cap_send_ack(chan); + else + l2cap_ertm_send(chan); + } +} + +static inline void l2cap_data_channel_rejframe(struct l2cap_chan *chan, u16 rx_control) +{ + u8 tx_seq = __get_reqseq(rx_control); + + BT_DBG("chan %p, req_seq %d ctrl 0x%4.4x", chan, tx_seq, rx_control); + + clear_bit(CONN_REMOTE_BUSY, &chan->conn_state); + + chan->expected_ack_seq = tx_seq; + l2cap_drop_acked_frames(chan); + + if (rx_control & L2CAP_CTRL_FINAL) { + if (!test_and_clear_bit(CONN_REJ_ACT, &chan->conn_state)) + l2cap_retransmit_frames(chan); + } else { + l2cap_retransmit_frames(chan); + + if (test_bit(CONN_WAIT_F, &chan->conn_state)) + set_bit(CONN_REJ_ACT, &chan->conn_state); + } +} +static inline void l2cap_data_channel_srejframe(struct l2cap_chan *chan, u16 rx_control) +{ + u8 tx_seq = __get_reqseq(rx_control); + + BT_DBG("chan %p, req_seq %d ctrl 0x%4.4x", chan, tx_seq, rx_control); + + clear_bit(CONN_REMOTE_BUSY, &chan->conn_state); + + if (rx_control & L2CAP_CTRL_POLL) { + chan->expected_ack_seq = tx_seq; + l2cap_drop_acked_frames(chan); + + set_bit(CONN_SEND_FBIT, &chan->conn_state); + l2cap_retransmit_one_frame(chan, tx_seq); + + l2cap_ertm_send(chan); + + if (test_bit(CONN_WAIT_F, &chan->conn_state)) { + chan->srej_save_reqseq = tx_seq; + set_bit(CONN_SREJ_ACT, &chan->conn_state); + } + } else if (rx_control & L2CAP_CTRL_FINAL) { + if (test_bit(CONN_SREJ_ACT, &chan->conn_state) && + chan->srej_save_reqseq == tx_seq) + clear_bit(CONN_SREJ_ACT, &chan->conn_state); + else + l2cap_retransmit_one_frame(chan, tx_seq); + } else { + l2cap_retransmit_one_frame(chan, tx_seq); + if (test_bit(CONN_WAIT_F, &chan->conn_state)) { + chan->srej_save_reqseq = tx_seq; + set_bit(CONN_SREJ_ACT, &chan->conn_state); + } + } +} + +static inline void l2cap_data_channel_rnrframe(struct l2cap_chan *chan, u16 rx_control) +{ + u8 tx_seq = __get_reqseq(rx_control); + + BT_DBG("chan %p, req_seq %d ctrl 0x%4.4x", chan, tx_seq, rx_control); + + set_bit(CONN_REMOTE_BUSY, &chan->conn_state); + chan->expected_ack_seq = tx_seq; + l2cap_drop_acked_frames(chan); + + if (rx_control & L2CAP_CTRL_POLL) + set_bit(CONN_SEND_FBIT, &chan->conn_state); + + if (!test_bit(CONN_SREJ_SENT, &chan->conn_state)) { + __clear_retrans_timer(chan); + if (rx_control & L2CAP_CTRL_POLL) + l2cap_send_rr_or_rnr(chan, L2CAP_CTRL_FINAL); + return; + } + + if (rx_control & L2CAP_CTRL_POLL) + l2cap_send_srejtail(chan); + else + l2cap_send_sframe(chan, L2CAP_SUPER_RCV_READY); +} + +static inline int l2cap_data_channel_sframe(struct l2cap_chan *chan, u16 rx_control, struct sk_buff *skb) +{ + BT_DBG("chan %p rx_control 0x%4.4x len %d", chan, rx_control, skb->len); + + if (L2CAP_CTRL_FINAL & rx_control && + test_bit(CONN_WAIT_F, &chan->conn_state)) { + __clear_monitor_timer(chan); + if (chan->unacked_frames > 0) + __set_retrans_timer(chan); + clear_bit(CONN_WAIT_F, &chan->conn_state); + } + + switch (rx_control & L2CAP_CTRL_SUPERVISE) { + case L2CAP_SUPER_RCV_READY: + l2cap_data_channel_rrframe(chan, 
rx_control); + break; + + case L2CAP_SUPER_REJECT: + l2cap_data_channel_rejframe(chan, rx_control); + break; + + case L2CAP_SUPER_SELECT_REJECT: + l2cap_data_channel_srejframe(chan, rx_control); + break; + + case L2CAP_SUPER_RCV_NOT_READY: + l2cap_data_channel_rnrframe(chan, rx_control); + break; + } + + kfree_skb(skb); + return 0; +} + +static int l2cap_ertm_data_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct l2cap_chan *chan = l2cap_pi(sk)->chan; + u16 control; + u8 req_seq; + int len, next_tx_seq_offset, req_seq_offset; + + control = get_unaligned_le16(skb->data); + skb_pull(skb, 2); + len = skb->len; + + /* + * We can just drop the corrupted I-frame here. + * Receiver will miss it and start proper recovery + * procedures and ask retransmission. + */ + if (l2cap_check_fcs(chan, skb)) + goto drop; + + if (__is_sar_start(control) && __is_iframe(control)) + len -= 2; + + if (chan->fcs == L2CAP_FCS_CRC16) + len -= 2; + + if (len > chan->mps) { + l2cap_send_disconn_req(chan->conn, chan, ECONNRESET); + goto drop; + } + + req_seq = __get_reqseq(control); + req_seq_offset = (req_seq - chan->expected_ack_seq) % 64; + if (req_seq_offset < 0) + req_seq_offset += 64; + + next_tx_seq_offset = + (chan->next_tx_seq - chan->expected_ack_seq) % 64; + if (next_tx_seq_offset < 0) + next_tx_seq_offset += 64; + + /* check for invalid req-seq */ + if (req_seq_offset > next_tx_seq_offset) { + l2cap_send_disconn_req(chan->conn, chan, ECONNRESET); + goto drop; + } + + if (__is_iframe(control)) { + if (len < 0) { + l2cap_send_disconn_req(chan->conn, chan, ECONNRESET); + goto drop; + } + + l2cap_data_channel_iframe(chan, control, skb); + } else { + if (len != 0) { + BT_ERR("%d", len); + l2cap_send_disconn_req(chan->conn, chan, ECONNRESET); + goto drop; + } + + l2cap_data_channel_sframe(chan, control, skb); + } + + return 0; + +drop: + kfree_skb(skb); + return 0; +} + +static inline int l2cap_data_channel(struct l2cap_conn *conn, u16 cid, struct sk_buff *skb) +{ + struct l2cap_chan *chan; + struct sock *sk = NULL; + u16 control; + u8 tx_seq; + int len; + + chan = l2cap_get_chan_by_scid(conn, cid); + if (!chan) { + BT_DBG("unknown cid 0x%4.4x", cid); + goto drop; + } + + sk = chan->sk; + + BT_DBG("chan %p, len %d", chan, skb->len); + + if (chan->state != BT_CONNECTED) + goto drop; + + switch (chan->mode) { + case L2CAP_MODE_BASIC: + /* If socket recv buffers overflows we drop data here + * which is *bad* because L2CAP has to be reliable. + * But we don't have any other choice. L2CAP doesn't + * provide flow control mechanism. 
*/ + + if (chan->imtu < skb->len) + goto drop; + + if (!chan->ops->recv(chan->data, skb)) + goto done; + break; + + case L2CAP_MODE_ERTM: + if (!sock_owned_by_user(sk)) { + l2cap_ertm_data_rcv(sk, skb); + } else { + if (sk_add_backlog(sk, skb)) + goto drop; + } + + goto done; + + case L2CAP_MODE_STREAMING: + control = get_unaligned_le16(skb->data); + skb_pull(skb, 2); + len = skb->len; + + if (l2cap_check_fcs(chan, skb)) + goto drop; + + if (__is_sar_start(control)) + len -= 2; + + if (chan->fcs == L2CAP_FCS_CRC16) + len -= 2; + + if (len > chan->mps || len < 0 || __is_sframe(control)) + goto drop; + + tx_seq = __get_txseq(control); + + if (chan->expected_tx_seq == tx_seq) + chan->expected_tx_seq = (chan->expected_tx_seq + 1) % 64; + else + chan->expected_tx_seq = (tx_seq + 1) % 64; + + l2cap_streaming_reassembly_sdu(chan, skb, control); + + goto done; + + default: + BT_DBG("chan %p: bad mode 0x%2.2x", chan, chan->mode); + break; + } + +drop: + kfree_skb(skb); + +done: + if (sk) + bh_unlock_sock(sk); + + return 0; +} + +static inline int l2cap_conless_channel(struct l2cap_conn *conn, __le16 psm, struct sk_buff *skb) +{ + struct sock *sk = NULL; + struct l2cap_chan *chan; + + chan = l2cap_global_chan_by_psm(0, psm, conn->src); + if (!chan) + goto drop; + + sk = chan->sk; + + bh_lock_sock(sk); + + BT_DBG("sk %p, len %d", sk, skb->len); + + if (chan->state != BT_BOUND && chan->state != BT_CONNECTED) + goto drop; + + if (chan->imtu < skb->len) + goto drop; + + if (!chan->ops->recv(chan->data, skb)) + goto done; + +drop: + kfree_skb(skb); + +done: + if (sk) + bh_unlock_sock(sk); + return 0; +} + +static inline int l2cap_att_channel(struct l2cap_conn *conn, __le16 cid, struct sk_buff *skb) +{ + struct sock *sk = NULL; + struct l2cap_chan *chan; + + chan = l2cap_global_chan_by_scid(0, cid, conn->src); + if (!chan) + goto drop; + + sk = chan->sk; + + bh_lock_sock(sk); + + BT_DBG("sk %p, len %d", sk, skb->len); + + if (chan->state != BT_BOUND && chan->state != BT_CONNECTED) + goto drop; + + if (chan->imtu < skb->len) + goto drop; + + if (!chan->ops->recv(chan->data, skb)) + goto done; + +drop: + kfree_skb(skb); + +done: + if (sk) + bh_unlock_sock(sk); + return 0; +} + +static void l2cap_recv_frame(struct l2cap_conn *conn, struct sk_buff *skb) +{ + struct l2cap_hdr *lh = (void *) skb->data; + u16 cid, len; + __le16 psm; + + skb_pull(skb, L2CAP_HDR_SIZE); + cid = __le16_to_cpu(lh->cid); + len = __le16_to_cpu(lh->len); + + if (len != skb->len) { + kfree_skb(skb); + return; + } + + BT_DBG("len %d, cid 0x%4.4x", len, cid); + + switch (cid) { + case L2CAP_CID_LE_SIGNALING: + case L2CAP_CID_SIGNALING: + l2cap_sig_channel(conn, skb); + break; + + case L2CAP_CID_CONN_LESS: + psm = get_unaligned_le16(skb->data); + skb_pull(skb, 2); + l2cap_conless_channel(conn, psm, skb); + break; + + case L2CAP_CID_LE_DATA: + l2cap_att_channel(conn, cid, skb); + break; + + case L2CAP_CID_SMP: + if (smp_sig_channel(conn, skb)) + l2cap_conn_del(conn->hcon, EACCES); + break; + + default: + l2cap_data_channel(conn, cid, skb); + break; + } +} + +/* ---- L2CAP interface with lower layer (HCI) ---- */ + +static int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type) +{ + int exact = 0, lm1 = 0, lm2 = 0; + struct l2cap_chan *c; + + if (type != ACL_LINK) + return -EINVAL; + + BT_DBG("hdev %s, bdaddr %s", hdev->name, batostr(bdaddr)); + + /* Find listening sockets and check their link_mode */ + read_lock(&chan_list_lock); + list_for_each_entry(c, &chan_list, global_l) { + struct sock *sk = c->sk; + + if (c->state != 
BT_LISTEN) + continue; + + if (!bacmp(&bt_sk(sk)->src, &hdev->bdaddr)) { + lm1 |= HCI_LM_ACCEPT; + if (c->role_switch) + lm1 |= HCI_LM_MASTER; + exact++; + } else if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY)) { + lm2 |= HCI_LM_ACCEPT; + if (c->role_switch) + lm2 |= HCI_LM_MASTER; + } + } + read_unlock(&chan_list_lock); + + return exact ? lm1 : lm2; +} + +static int l2cap_connect_cfm(struct hci_conn *hcon, u8 status) +{ + struct l2cap_conn *conn; + + BT_DBG("hcon %p bdaddr %s status %d", hcon, batostr(&hcon->dst), status); + + if (!(hcon->type == ACL_LINK || hcon->type == LE_LINK)) + return -EINVAL; + + if (!status) { + conn = l2cap_conn_add(hcon, status); + if (conn) + l2cap_conn_ready(conn); + } else + l2cap_conn_del(hcon, bt_to_errno(status)); + + return 0; +} + +static int l2cap_disconn_ind(struct hci_conn *hcon) +{ + struct l2cap_conn *conn = hcon->l2cap_data; + + BT_DBG("hcon %p", hcon); + + if ((hcon->type != ACL_LINK && hcon->type != LE_LINK) || !conn) + return 0x13; + + return conn->disc_reason; +} + +static int l2cap_disconn_cfm(struct hci_conn *hcon, u8 reason) +{ + BT_DBG("hcon %p reason %d", hcon, reason); + + if (!(hcon->type == ACL_LINK || hcon->type == LE_LINK)) + return -EINVAL; + + l2cap_conn_del(hcon, bt_to_errno(reason)); + + return 0; +} + +static inline void l2cap_check_encryption(struct l2cap_chan *chan, u8 encrypt) +{ + if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) + return; + + if (encrypt == 0x00) { + if (chan->sec_level == BT_SECURITY_MEDIUM) { + __clear_chan_timer(chan); + __set_chan_timer(chan, HZ * 5); + } else if (chan->sec_level == BT_SECURITY_HIGH) + l2cap_chan_close(chan, ECONNREFUSED); + } else { + if (chan->sec_level == BT_SECURITY_MEDIUM) + __clear_chan_timer(chan); + } +} + +static int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) +{ + struct l2cap_conn *conn = hcon->l2cap_data; + struct l2cap_chan *chan; + + if (!conn) + return 0; + + BT_DBG("conn %p", conn); + + read_lock(&conn->chan_lock); + + list_for_each_entry(chan, &conn->chan_l, list) { + struct sock *sk = chan->sk; + + bh_lock_sock(sk); + + BT_DBG("chan->scid %d", chan->scid); + + if (chan->scid == L2CAP_CID_LE_DATA) { + if (!status && encrypt) { + chan->sec_level = hcon->sec_level; + del_timer(&conn->security_timer); + l2cap_chan_ready(sk); + smp_distribute_keys(conn, 0); + } + + bh_unlock_sock(sk); + continue; + } + + if (test_bit(CONF_CONNECT_PEND, &chan->conf_state)) { + bh_unlock_sock(sk); + continue; + } + + if (!status && (chan->state == BT_CONNECTED || + chan->state == BT_CONFIG)) { + l2cap_check_encryption(chan, encrypt); + bh_unlock_sock(sk); + continue; + } + + if (chan->state == BT_CONNECT) { + if (!status) { + struct l2cap_conn_req req; + req.scid = cpu_to_le16(chan->scid); + req.psm = chan->psm; + + chan->ident = l2cap_get_ident(conn); + set_bit(CONF_CONNECT_PEND, &chan->conf_state); + + l2cap_send_cmd(conn, chan->ident, + L2CAP_CONN_REQ, sizeof(req), &req); + } else { + __clear_chan_timer(chan); + __set_chan_timer(chan, HZ / 10); + } + } else if (chan->state == BT_CONNECT2) { + struct l2cap_conn_rsp rsp; + __u16 res, stat; + + if (!status) { + if (bt_sk(sk)->defer_setup) { + struct sock *parent = bt_sk(sk)->parent; + res = L2CAP_CR_PEND; + stat = L2CAP_CS_AUTHOR_PEND; + if (parent) + parent->sk_data_ready(parent, 0); + } else { + l2cap_state_change(chan, BT_CONFIG); + res = L2CAP_CR_SUCCESS; + stat = L2CAP_CS_NO_INFO; + } + } else { + l2cap_state_change(chan, BT_DISCONN); + __set_chan_timer(chan, HZ / 10); + res = L2CAP_CR_SEC_BLOCK; + stat = L2CAP_CS_NO_INFO; + 
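/*
 * l2cap_connect_ind() above collects two candidate link-mode masks while
 * scanning the listeners: one from channels bound to this adapter's own
 * address and one from wildcard (BDADDR_ANY) listeners, and the exact
 * matches win whenever at least one exists.  A user-space sketch of that
 * selection, assuming the list has already been reduced to BT_LISTEN
 * channels; the types and names here are illustrative, not kernel structures.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define LM_ACCEPT 0x01
#define LM_MASTER 0x02

struct listener {
    uint8_t src[6];     /* bound address; all zero means wildcard */
    int     role_switch;
};

static const uint8_t any_addr[6];   /* BDADDR_ANY: 00:00:00:00:00:00 */

/* Prefer modes collected from exact-address listeners over wildcards. */
static int accept_link_mode(const struct listener *l, size_t n,
                            const uint8_t local[6])
{
    int lm_exact = 0, lm_any = 0, exact = 0;
    size_t i;

    for (i = 0; i < n; i++) {
        int lm = LM_ACCEPT | (l[i].role_switch ? LM_MASTER : 0);

        if (!memcmp(l[i].src, local, 6)) {
            lm_exact |= lm;
            exact++;
        } else if (!memcmp(l[i].src, any_addr, 6)) {
            lm_any |= lm;
        }
    }

    return exact ? lm_exact : lm_any;
}

int main(void)
{
    const uint8_t local[6] = { 1, 2, 3, 4, 5, 6 };
    const struct listener l[2] = {
        { { 0 }, 1 },                  /* wildcard, allows role switch */
        { { 1, 2, 3, 4, 5, 6 }, 0 },   /* exact, no role switch        */
    };

    /* Exact match wins: ACCEPT only, the wildcard's MASTER bit is ignored. */
    printf("0x%02x\n", accept_link_mode(l, 2, local));
    return 0;
}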
} + + rsp.scid = cpu_to_le16(chan->dcid); + rsp.dcid = cpu_to_le16(chan->scid); + rsp.result = cpu_to_le16(res); + rsp.status = cpu_to_le16(stat); + l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_RSP, + sizeof(rsp), &rsp); + } + + bh_unlock_sock(sk); + } + + read_unlock(&conn->chan_lock); + + return 0; +} + +static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) +{ + struct l2cap_conn *conn = hcon->l2cap_data; + + if (!conn) + conn = l2cap_conn_add(hcon, 0); + + if (!conn) + goto drop; + + BT_DBG("conn %p len %d flags 0x%x", conn, skb->len, flags); + + if (!(flags & ACL_CONT)) { + struct l2cap_hdr *hdr; + struct l2cap_chan *chan; + u16 cid; + int len; + + if (conn->rx_len) { + BT_ERR("Unexpected start frame (len %d)", skb->len); + kfree_skb(conn->rx_skb); + conn->rx_skb = NULL; + conn->rx_len = 0; + l2cap_conn_unreliable(conn, ECOMM); + } + + /* Start fragment always begin with Basic L2CAP header */ + if (skb->len < L2CAP_HDR_SIZE) { + BT_ERR("Frame is too short (len %d)", skb->len); + l2cap_conn_unreliable(conn, ECOMM); + goto drop; + } + + hdr = (struct l2cap_hdr *) skb->data; + len = __le16_to_cpu(hdr->len) + L2CAP_HDR_SIZE; + cid = __le16_to_cpu(hdr->cid); + + if (len == skb->len) { + /* Complete frame received */ + l2cap_recv_frame(conn, skb); + return 0; + } + + BT_DBG("Start: total len %d, frag len %d", len, skb->len); + + if (skb->len > len) { + BT_ERR("Frame is too long (len %d, expected len %d)", + skb->len, len); + l2cap_conn_unreliable(conn, ECOMM); + goto drop; + } + + chan = l2cap_get_chan_by_scid(conn, cid); + + if (chan && chan->sk) { + struct sock *sk = chan->sk; + + if (chan->imtu < len - L2CAP_HDR_SIZE) { + BT_ERR("Frame exceeding recv MTU (len %d, " + "MTU %d)", len, + chan->imtu); + bh_unlock_sock(sk); + l2cap_conn_unreliable(conn, ECOMM); + goto drop; + } + bh_unlock_sock(sk); + } + + /* Allocate skb for the complete frame (with header) */ + conn->rx_skb = bt_skb_alloc(len, GFP_ATOMIC); + if (!conn->rx_skb) + goto drop; + + skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), + skb->len); + conn->rx_len = len - skb->len; + } else { + BT_DBG("Cont: frag len %d (expecting %d)", skb->len, conn->rx_len); + + if (!conn->rx_len) { + BT_ERR("Unexpected continuation frame (len %d)", skb->len); + l2cap_conn_unreliable(conn, ECOMM); + goto drop; + } + + if (skb->len > conn->rx_len) { + BT_ERR("Fragment is too long (len %d, expected %d)", + skb->len, conn->rx_len); + kfree_skb(conn->rx_skb); + conn->rx_skb = NULL; + conn->rx_len = 0; + l2cap_conn_unreliable(conn, ECOMM); + goto drop; + } + + skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), + skb->len); + conn->rx_len -= skb->len; + + if (!conn->rx_len) { + /* Complete frame received */ + l2cap_recv_frame(conn, conn->rx_skb); + conn->rx_skb = NULL; + } + } + +drop: + kfree_skb(skb); + return 0; +} + +static int l2cap_debugfs_show(struct seq_file *f, void *p) +{ + struct l2cap_chan *c; + + read_lock_bh(&chan_list_lock); + + list_for_each_entry(c, &chan_list, global_l) { + struct sock *sk = c->sk; + + seq_printf(f, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d %d %d\n", + batostr(&bt_sk(sk)->src), + batostr(&bt_sk(sk)->dst), + c->state, __le16_to_cpu(c->psm), + c->scid, c->dcid, c->imtu, c->omtu, + c->sec_level, c->mode); + } + + read_unlock_bh(&chan_list_lock); + + return 0; +} + +static int l2cap_debugfs_open(struct inode *inode, struct file *file) +{ + return single_open(file, l2cap_debugfs_show, inode->i_private); +} + +static const struct file_operations l2cap_debugfs_fops = { + .open 
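/*
 * l2cap_recv_acldata() above rebuilds one L2CAP frame from ACL fragments:
 * the start fragment carries the basic header, whose length field says how
 * many payload bytes are still outstanding (conn->rx_len), and continuation
 * fragments are appended until that counter reaches zero.  Below is a
 * simplified user-space sketch of the same bookkeeping; struct reassembly
 * and rx_fragment() are illustrative, and the caller is assumed to
 * zero-initialize the state before the first fragment.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define HDR_SIZE 4  /* basic L2CAP header: u16 len + u16 cid */

struct reassembly {
    uint8_t *buf;   /* frame being rebuilt */
    size_t   have;  /* bytes copied so far */
    size_t   want;  /* bytes still missing */
};

/* Returns 1 when the frame is complete, 0 while waiting, -1 on error. */
static int rx_fragment(struct reassembly *r, int is_start,
                       const uint8_t *data, size_t len)
{
    if (is_start) {
        size_t total;

        if (len < HDR_SIZE)
            return -1;  /* start must hold the whole header */
        /* little-endian payload length + header size */
        total = HDR_SIZE + (size_t) (data[0] | (data[1] << 8));
        if (len > total)
            return -1;  /* start fragment longer than the frame */

        free(r->buf);   /* drop any half-finished frame */
        r->buf = malloc(total);
        if (!r->buf)
            return -1;
        memcpy(r->buf, data, len);
        r->have = len;
        r->want = total - len;
    } else {
        if (!r->want || len > r->want)
            return -1;  /* unexpected or oversized continuation */
        memcpy(r->buf + r->have, data, len);
        r->have += len;
        r->want -= len;
    }

    return r->want == 0;
}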
= l2cap_debugfs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *l2cap_debugfs; + +static struct hci_proto l2cap_hci_proto = { + .name = "L2CAP", + .id = HCI_PROTO_L2CAP, + .connect_ind = l2cap_connect_ind, + .connect_cfm = l2cap_connect_cfm, + .disconn_ind = l2cap_disconn_ind, + .disconn_cfm = l2cap_disconn_cfm, + .security_cfm = l2cap_security_cfm, + .recv_acldata = l2cap_recv_acldata +}; + +int __init l2cap_init(void) +{ + int err; + + err = l2cap_init_sockets(); + if (err < 0) + return err; + + err = hci_register_proto(&l2cap_hci_proto); + if (err < 0) { + BT_ERR("L2CAP protocol registration failed"); + bt_sock_unregister(BTPROTO_L2CAP); + goto error; + } + + if (bt_debugfs) { + l2cap_debugfs = debugfs_create_file("l2cap", 0444, + bt_debugfs, NULL, &l2cap_debugfs_fops); + if (!l2cap_debugfs) + BT_ERR("Failed to create L2CAP debug file"); + } + + return 0; + +error: + l2cap_cleanup_sockets(); + return err; +} + +void l2cap_exit(void) +{ + debugfs_remove(l2cap_debugfs); + + if (hci_unregister_proto(&l2cap_hci_proto) < 0) + BT_ERR("L2CAP protocol unregistration failed"); + + l2cap_cleanup_sockets(); +} + +module_param(disable_ertm, bool, 0644); +MODULE_PARM_DESC(disable_ertm, "Disable enhanced retransmission mode"); diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c new file mode 100644 index 00000000..61f1f623 --- /dev/null +++ b/net/bluetooth/l2cap_sock.c @@ -0,0 +1,1094 @@ +/* + BlueZ - Bluetooth protocol stack for Linux + Copyright (C) 2000-2001 Qualcomm Incorporated + Copyright (C) 2009-2010 Gustavo F. Padovan + Copyright (C) 2010 Google Inc. + + Written 2000,2001 by Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +/* Bluetooth L2CAP sockets. 
*/ + +#include +#include +#include +#include + +static const struct proto_ops l2cap_sock_ops; +static void l2cap_sock_init(struct sock *sk, struct sock *parent); +static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio); + +static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) +{ + struct sock *sk = sock->sk; + struct l2cap_chan *chan = l2cap_pi(sk)->chan; + struct sockaddr_l2 la; + int len, err = 0; + + BT_DBG("sk %p", sk); + + if (!addr || addr->sa_family != AF_BLUETOOTH) + return -EINVAL; + + memset(&la, 0, sizeof(la)); + len = min_t(unsigned int, sizeof(la), alen); + memcpy(&la, addr, len); + + if (la.l2_cid && la.l2_psm) + return -EINVAL; + + lock_sock(sk); + + if (sk->sk_state != BT_OPEN) { + err = -EBADFD; + goto done; + } + + if (la.l2_psm) { + __u16 psm = __le16_to_cpu(la.l2_psm); + + /* PSM must be odd and lsb of upper byte must be 0 */ + if ((psm & 0x0101) != 0x0001) { + err = -EINVAL; + goto done; + } + + /* Restrict usage of well-known PSMs */ + if (psm < 0x1001 && !capable(CAP_NET_BIND_SERVICE)) { + err = -EACCES; + goto done; + } + } + + if (la.l2_cid) + err = l2cap_add_scid(chan, la.l2_cid); + else + err = l2cap_add_psm(chan, &la.l2_bdaddr, la.l2_psm); + + if (err < 0) + goto done; + + if (__le16_to_cpu(la.l2_psm) == 0x0001 || + __le16_to_cpu(la.l2_psm) == 0x0003) + chan->sec_level = BT_SECURITY_SDP; + + bacpy(&bt_sk(sk)->src, &la.l2_bdaddr); + + chan->state = BT_BOUND; + sk->sk_state = BT_BOUND; + +done: + release_sock(sk); + return err; +} + +static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) +{ + struct sock *sk = sock->sk; + struct l2cap_chan *chan = l2cap_pi(sk)->chan; + struct sockaddr_l2 la; + int len, err = 0; + + BT_DBG("sk %p", sk); + + if (!addr || alen < sizeof(addr->sa_family) || + addr->sa_family != AF_BLUETOOTH) + return -EINVAL; + + memset(&la, 0, sizeof(la)); + len = min_t(unsigned int, sizeof(la), alen); + memcpy(&la, addr, len); + + if (la.l2_cid && la.l2_psm) + return -EINVAL; + + lock_sock(sk); + + if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED + && !(la.l2_psm || la.l2_cid)) { + err = -EINVAL; + goto done; + } + + switch (chan->mode) { + case L2CAP_MODE_BASIC: + break; + case L2CAP_MODE_ERTM: + case L2CAP_MODE_STREAMING: + if (!disable_ertm) + break; + /* fall through */ + default: + err = -ENOTSUPP; + goto done; + } + + switch (sk->sk_state) { + case BT_CONNECT: + case BT_CONNECT2: + case BT_CONFIG: + /* Already connecting */ + goto wait; + + case BT_CONNECTED: + /* Already connected */ + err = -EISCONN; + goto done; + + case BT_OPEN: + case BT_BOUND: + /* Can connect */ + break; + + default: + err = -EBADFD; + goto done; + } + + /* PSM must be odd and lsb of upper byte must be 0 */ + if ((__le16_to_cpu(la.l2_psm) & 0x0101) != 0x0001 && !la.l2_cid && + chan->chan_type != L2CAP_CHAN_RAW) { + err = -EINVAL; + goto done; + } + + /* Set destination address and psm */ + bacpy(&bt_sk(sk)->dst, &la.l2_bdaddr); + chan->psm = la.l2_psm; + chan->dcid = la.l2_cid; + + err = l2cap_chan_connect(l2cap_pi(sk)->chan); + if (err) + goto done; + +wait: + err = bt_sock_wait_state(sk, BT_CONNECTED, + sock_sndtimeo(sk, flags & O_NONBLOCK)); +done: + release_sock(sk); + return err; +} + +static int l2cap_sock_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + struct l2cap_chan *chan = l2cap_pi(sk)->chan; + int err = 0; + + BT_DBG("sk %p backlog %d", sk, backlog); + + lock_sock(sk); + + if ((sock->type != SOCK_SEQPACKET && sock->type != 
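/*
 * The PSM sanity checks used by l2cap_sock_bind()/connect() above, in
 * isolation: a valid PSM has bit 0 set and bit 8 clear (odd low octet, even
 * high octet), and values below 0x1001 are well-known PSMs that only a
 * privileged caller may bind.  Minimal sketch; the helper names are
 * illustrative, not kernel API.
 */
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

static bool psm_is_valid(uint16_t psm)
{
    /* same test as "(psm & 0x0101) != 0x0001" above, inverted */
    return (psm & 0x0101) == 0x0001;
}

static bool psm_is_well_known(uint16_t psm)
{
    return psm < 0x1001;    /* reserved range: needs CAP_NET_BIND_SERVICE */
}

int main(void)
{
    printf("0x0001: valid=%d well-known=%d\n",
           psm_is_valid(0x0001), psm_is_well_known(0x0001));  /* 1 1 */
    printf("0x1003: valid=%d well-known=%d\n",
           psm_is_valid(0x1003), psm_is_well_known(0x1003));  /* 1 0 */
    printf("0x0102: valid=%d\n", psm_is_valid(0x0102));       /* 0   */
    return 0;
}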
SOCK_STREAM) + || sk->sk_state != BT_BOUND) { + err = -EBADFD; + goto done; + } + + switch (chan->mode) { + case L2CAP_MODE_BASIC: + break; + case L2CAP_MODE_ERTM: + case L2CAP_MODE_STREAMING: + if (!disable_ertm) + break; + /* fall through */ + default: + err = -ENOTSUPP; + goto done; + } + + sk->sk_max_ack_backlog = backlog; + sk->sk_ack_backlog = 0; + + chan->state = BT_LISTEN; + sk->sk_state = BT_LISTEN; + +done: + release_sock(sk); + return err; +} + +static int l2cap_sock_accept(struct socket *sock, struct socket *newsock, int flags) +{ + DECLARE_WAITQUEUE(wait, current); + struct sock *sk = sock->sk, *nsk; + long timeo; + int err = 0; + + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); + + timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + + BT_DBG("sk %p timeo %ld", sk, timeo); + + /* Wait for an incoming connection. (wake-one). */ + add_wait_queue_exclusive(sk_sleep(sk), &wait); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + if (sk->sk_state != BT_LISTEN) { + err = -EBADFD; + break; + } + + nsk = bt_accept_dequeue(sk, newsock); + if (nsk) + break; + + if (!timeo) { + err = -EAGAIN; + break; + } + + if (signal_pending(current)) { + err = sock_intr_errno(timeo); + break; + } + + release_sock(sk); + timeo = schedule_timeout(timeo); + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); + } + __set_current_state(TASK_RUNNING); + remove_wait_queue(sk_sleep(sk), &wait); + + if (err) + goto done; + + newsock->state = SS_CONNECTED; + + BT_DBG("new socket %p", nsk); + +done: + release_sock(sk); + return err; +} + +static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, int *len, int peer) +{ + struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr; + struct sock *sk = sock->sk; + struct l2cap_chan *chan = l2cap_pi(sk)->chan; + + BT_DBG("sock %p, sk %p", sock, sk); + + addr->sa_family = AF_BLUETOOTH; + *len = sizeof(struct sockaddr_l2); + + if (peer) { + la->l2_psm = chan->psm; + bacpy(&la->l2_bdaddr, &bt_sk(sk)->dst); + la->l2_cid = cpu_to_le16(chan->dcid); + } else { + la->l2_psm = chan->sport; + bacpy(&la->l2_bdaddr, &bt_sk(sk)->src); + la->l2_cid = cpu_to_le16(chan->scid); + } + + return 0; +} + +static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct l2cap_chan *chan = l2cap_pi(sk)->chan; + struct l2cap_options opts; + struct l2cap_conninfo cinfo; + int len, err = 0; + u32 opt; + + BT_DBG("sk %p", sk); + + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + + switch (optname) { + case L2CAP_OPTIONS: + memset(&opts, 0, sizeof(opts)); + opts.imtu = chan->imtu; + opts.omtu = chan->omtu; + opts.flush_to = chan->flush_to; + opts.mode = chan->mode; + opts.fcs = chan->fcs; + opts.max_tx = chan->max_tx; + opts.txwin_size = (__u16)chan->tx_win; + + len = min_t(unsigned int, len, sizeof(opts)); + if (copy_to_user(optval, (char *) &opts, len)) + err = -EFAULT; + + break; + + case L2CAP_LM: + switch (chan->sec_level) { + case BT_SECURITY_LOW: + opt = L2CAP_LM_AUTH; + break; + case BT_SECURITY_MEDIUM: + opt = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT; + break; + case BT_SECURITY_HIGH: + opt = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT | + L2CAP_LM_SECURE; + break; + default: + opt = 0; + break; + } + + if (chan->role_switch) + opt |= L2CAP_LM_MASTER; + + if (chan->force_reliable) + opt |= L2CAP_LM_RELIABLE; + + if (put_user(opt, (u32 __user *) optval)) + err = -EFAULT; + break; + + case L2CAP_CONNINFO: + if (sk->sk_state != BT_CONNECTED && + !(sk->sk_state == BT_CONNECT2 && + 
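/*
 * The L2CAP_LM socket option is a legacy bit-mask view of the per-channel
 * security level: reading it (above) maps LOW/MEDIUM/HIGH onto
 * AUTH/ENCRYPT/SECURE, and writing it (in the setsockopt path below) lets
 * the strongest requested bit decide the level.  A sketch of both
 * directions; the constant values are local to this example, not the
 * kernel's L2CAP_LM_* flags.
 */
#include <stdint.h>
#include <stdio.h>

enum sec_level { SEC_SDP, SEC_LOW, SEC_MEDIUM, SEC_HIGH };

#define LM_AUTH    0x01
#define LM_ENCRYPT 0x02
#define LM_SECURE  0x04

/* getsockopt direction: security level -> link-mode mask */
static uint32_t sec_to_lm(enum sec_level level)
{
    switch (level) {
    case SEC_LOW:    return LM_AUTH;
    case SEC_MEDIUM: return LM_AUTH | LM_ENCRYPT;
    case SEC_HIGH:   return LM_AUTH | LM_ENCRYPT | LM_SECURE;
    default:         return 0;
    }
}

/* setsockopt direction: the strongest requested bit wins */
static enum sec_level lm_to_sec(uint32_t lm)
{
    enum sec_level level = SEC_SDP;

    if (lm & LM_AUTH)
        level = SEC_LOW;
    if (lm & LM_ENCRYPT)
        level = SEC_MEDIUM;
    if (lm & LM_SECURE)
        level = SEC_HIGH;
    return level;
}

int main(void)
{
    printf("0x%x\n", sec_to_lm(SEC_MEDIUM));         /* 0x3              */
    printf("%d\n", lm_to_sec(LM_AUTH | LM_SECURE));  /* 3, i.e. SEC_HIGH */
    return 0;
}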
bt_sk(sk)->defer_setup)) { + err = -ENOTCONN; + break; + } + + memset(&cinfo, 0, sizeof(cinfo)); + cinfo.hci_handle = chan->conn->hcon->handle; + memcpy(cinfo.dev_class, chan->conn->hcon->dev_class, 3); + + len = min_t(unsigned int, len, sizeof(cinfo)); + if (copy_to_user(optval, (char *) &cinfo, len)) + err = -EFAULT; + + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct l2cap_chan *chan = l2cap_pi(sk)->chan; + struct bt_security sec; + struct bt_power pwr; + int len, err = 0; + + BT_DBG("sk %p", sk); + + if (level == SOL_L2CAP) + return l2cap_sock_getsockopt_old(sock, optname, optval, optlen); + + if (level != SOL_BLUETOOTH) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + + switch (optname) { + case BT_SECURITY: + if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED && + chan->chan_type != L2CAP_CHAN_RAW) { + err = -EINVAL; + break; + } + + memset(&sec, 0, sizeof(sec)); + sec.level = chan->sec_level; + + if (sk->sk_state == BT_CONNECTED) + sec.key_size = chan->conn->hcon->enc_key_size; + + len = min_t(unsigned int, len, sizeof(sec)); + if (copy_to_user(optval, (char *) &sec, len)) + err = -EFAULT; + + break; + + case BT_DEFER_SETUP: + if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { + err = -EINVAL; + break; + } + + if (put_user(bt_sk(sk)->defer_setup, (u32 __user *) optval)) + err = -EFAULT; + + break; + + case BT_FLUSHABLE: + if (put_user(chan->flushable, (u32 __user *) optval)) + err = -EFAULT; + + break; + + case BT_POWER: + if (sk->sk_type != SOCK_SEQPACKET && sk->sk_type != SOCK_STREAM + && sk->sk_type != SOCK_RAW) { + err = -EINVAL; + break; + } + + pwr.force_active = chan->force_active; + + len = min_t(unsigned int, len, sizeof(pwr)); + if (copy_to_user(optval, (char *) &pwr, len)) + err = -EFAULT; + + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct l2cap_chan *chan = l2cap_pi(sk)->chan; + struct l2cap_options opts; + int len, err = 0; + u32 opt; + + BT_DBG("sk %p", sk); + + lock_sock(sk); + + switch (optname) { + case L2CAP_OPTIONS: + if (sk->sk_state == BT_CONNECTED) { + err = -EINVAL; + break; + } + + opts.imtu = chan->imtu; + opts.omtu = chan->omtu; + opts.flush_to = chan->flush_to; + opts.mode = chan->mode; + opts.fcs = chan->fcs; + opts.max_tx = chan->max_tx; + opts.txwin_size = (__u16)chan->tx_win; + + len = min_t(unsigned int, sizeof(opts), optlen); + if (copy_from_user((char *) &opts, optval, len)) { + err = -EFAULT; + break; + } + + if (opts.txwin_size > L2CAP_DEFAULT_TX_WINDOW) { + err = -EINVAL; + break; + } + + chan->mode = opts.mode; + switch (chan->mode) { + case L2CAP_MODE_BASIC: + clear_bit(CONF_STATE2_DEVICE, &chan->conf_state); + break; + case L2CAP_MODE_ERTM: + case L2CAP_MODE_STREAMING: + if (!disable_ertm) + break; + /* fall through */ + default: + err = -EINVAL; + break; + } + + chan->imtu = opts.imtu; + chan->omtu = opts.omtu; + chan->fcs = opts.fcs; + chan->max_tx = opts.max_tx; + chan->tx_win = (__u8)opts.txwin_size; + break; + + case L2CAP_LM: + if (get_user(opt, (u32 __user *) optval)) { + err = -EFAULT; + break; + } + + if (opt & L2CAP_LM_AUTH) + chan->sec_level = BT_SECURITY_LOW; + if (opt & 
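/*
 * The L2CAP_OPTIONS setsockopt handler above uses a deliberate pattern for
 * variable-size option structs: seed a local struct with the channel's
 * current values, copy only min(optlen, sizeof(struct)) bytes from the
 * caller, then validate before committing.  Older callers that pass a
 * shorter struct therefore leave the remaining fields unchanged.  A
 * user-space sketch of that pattern; the struct layout and the window
 * limit are illustrative, not the kernel definitions.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define MAX_TX_WINDOW 63    /* stands in for L2CAP_DEFAULT_TX_WINDOW */

struct chan_opts {
    uint16_t imtu;
    uint16_t omtu;
    uint16_t flush_to;
    uint8_t  mode;
    uint8_t  fcs;
    uint8_t  max_tx;
    uint16_t txwin_size;
};

static int set_options(struct chan_opts *cur, const void *optval, size_t optlen)
{
    struct chan_opts opts = *cur;   /* seed with current values */
    size_t len = optlen < sizeof(opts) ? optlen : sizeof(opts);

    memcpy(&opts, optval, len);     /* partial overwrite is fine */

    if (opts.txwin_size > MAX_TX_WINDOW)
        return -1;                  /* validate before committing */

    *cur = opts;
    return 0;
}

int main(void)
{
    struct chan_opts cur = { .imtu = 672, .txwin_size = 63 };
    uint16_t new_imtu = 1024;       /* caller only knows the first field */

    set_options(&cur, &new_imtu, sizeof(new_imtu));
    printf("imtu=%u txwin=%u\n", cur.imtu, cur.txwin_size);  /* 1024, 63 */
    return 0;
}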
L2CAP_LM_ENCRYPT) + chan->sec_level = BT_SECURITY_MEDIUM; + if (opt & L2CAP_LM_SECURE) + chan->sec_level = BT_SECURITY_HIGH; + + chan->role_switch = (opt & L2CAP_LM_MASTER); + chan->force_reliable = (opt & L2CAP_LM_RELIABLE); + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct l2cap_chan *chan = l2cap_pi(sk)->chan; + struct bt_security sec; + struct bt_power pwr; + struct l2cap_conn *conn; + int len, err = 0; + u32 opt; + + BT_DBG("sk %p", sk); + + if (level == SOL_L2CAP) + return l2cap_sock_setsockopt_old(sock, optname, optval, optlen); + + if (level != SOL_BLUETOOTH) + return -ENOPROTOOPT; + + lock_sock(sk); + + switch (optname) { + case BT_SECURITY: + if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED && + chan->chan_type != L2CAP_CHAN_RAW) { + err = -EINVAL; + break; + } + + sec.level = BT_SECURITY_LOW; + + len = min_t(unsigned int, sizeof(sec), optlen); + if (copy_from_user((char *) &sec, optval, len)) { + err = -EFAULT; + break; + } + + if (sec.level < BT_SECURITY_LOW || + sec.level > BT_SECURITY_HIGH) { + err = -EINVAL; + break; + } + + chan->sec_level = sec.level; + + conn = chan->conn; + if (conn && chan->scid == L2CAP_CID_LE_DATA) { + if (!conn->hcon->out) { + err = -EINVAL; + break; + } + + if (smp_conn_security(conn, sec.level)) + break; + + err = 0; + sk->sk_state = BT_CONFIG; + } + break; + + case BT_DEFER_SETUP: + if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { + err = -EINVAL; + break; + } + + if (get_user(opt, (u32 __user *) optval)) { + err = -EFAULT; + break; + } + + bt_sk(sk)->defer_setup = opt; + break; + + case BT_FLUSHABLE: + if (get_user(opt, (u32 __user *) optval)) { + err = -EFAULT; + break; + } + + if (opt > BT_FLUSHABLE_ON) { + err = -EINVAL; + break; + } + + if (opt == BT_FLUSHABLE_OFF) { + struct l2cap_conn *conn = chan->conn; + /* proceed further only when we have l2cap_conn and + No Flush support in the LM */ + if (!conn || !lmp_no_flush_capable(conn->hcon->hdev)) { + err = -EINVAL; + break; + } + } + + chan->flushable = opt; + break; + + case BT_POWER: + if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED && + chan->chan_type != L2CAP_CHAN_RAW) { + err = -EINVAL; + break; + } + + pwr.force_active = BT_POWER_FORCE_ACTIVE_ON; + + len = min_t(unsigned int, sizeof(pwr), optlen); + if (copy_from_user((char *) &pwr, optval, len)) { + err = -EFAULT; + break; + } + chan->force_active = pwr.force_active; + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct l2cap_chan *chan = l2cap_pi(sk)->chan; + int err; + + BT_DBG("sock %p, sk %p", sock, sk); + + err = sock_error(sk); + if (err) + return err; + + if (msg->msg_flags & MSG_OOB) + return -EOPNOTSUPP; + + lock_sock(sk); + + if (sk->sk_state != BT_CONNECTED) { + release_sock(sk); + return -ENOTCONN; + } + + err = l2cap_chan_send(chan, msg, len); + + release_sock(sk); + return err; +} + +static int l2cap_sock_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len, int flags) +{ + struct sock *sk = sock->sk; + struct l2cap_pinfo *pi = l2cap_pi(sk); + int err; + + lock_sock(sk); + + if (sk->sk_state == BT_CONNECT2 && bt_sk(sk)->defer_setup) { + sk->sk_state = BT_CONFIG; + + 
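/*
 * Condensed view of the BT_FLUSHABLE checks just above: only the OFF/ON
 * values are accepted, and switching to OFF additionally requires an
 * existing connection whose controller advertises "No Flush" support.
 * has_conn and no_flush_capable stand in for the chan->conn and
 * lmp_no_flush_capable() lookups; this is a sketch, not kernel code.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

enum { FLUSHABLE_OFF = 0, FLUSHABLE_ON = 1 };

static int validate_flushable(unsigned int opt, bool has_conn,
                              bool no_flush_capable)
{
    if (opt > FLUSHABLE_ON)
        return -EINVAL;     /* unknown value */

    if (opt == FLUSHABLE_OFF && (!has_conn || !no_flush_capable))
        return -EINVAL;     /* controller cannot honour it */

    return 0;
}

int main(void)
{
    printf("%d\n", validate_flushable(FLUSHABLE_ON, false, false));  /* 0       */
    printf("%d\n", validate_flushable(FLUSHABLE_OFF, true, false));  /* -EINVAL */
    return 0;
}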
__l2cap_connect_rsp_defer(pi->chan); + release_sock(sk); + return 0; + } + + release_sock(sk); + + if (sock->type == SOCK_STREAM) + err = bt_sock_stream_recvmsg(iocb, sock, msg, len, flags); + else + err = bt_sock_recvmsg(iocb, sock, msg, len, flags); + + if (pi->chan->mode != L2CAP_MODE_ERTM) + return err; + + /* Attempt to put pending rx data in the socket buffer */ + + lock_sock(sk); + + if (!test_bit(CONN_LOCAL_BUSY, &pi->chan->conn_state)) + goto done; + + if (pi->rx_busy_skb) { + if (!sock_queue_rcv_skb(sk, pi->rx_busy_skb)) + pi->rx_busy_skb = NULL; + else + goto done; + } + + /* Restore data flow when half of the receive buffer is + * available. This avoids resending large numbers of + * frames. + */ + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf >> 1) + l2cap_chan_busy(pi->chan, 0); + +done: + release_sock(sk); + return err; +} + +/* Kill socket (only if zapped and orphan) + * Must be called on unlocked socket. + */ +static void l2cap_sock_kill(struct sock *sk) +{ + if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket) + return; + + BT_DBG("sk %p state %d", sk, sk->sk_state); + + /* Kill poor orphan */ + + l2cap_chan_destroy(l2cap_pi(sk)->chan); + sock_set_flag(sk, SOCK_DEAD); + sock_put(sk); +} + +static int l2cap_sock_shutdown(struct socket *sock, int how) +{ + struct sock *sk = sock->sk; + struct l2cap_chan *chan = l2cap_pi(sk)->chan; + int err = 0; + + BT_DBG("sock %p, sk %p", sock, sk); + + if (!sk) + return 0; + + lock_sock(sk); + if (!sk->sk_shutdown) { + if (chan->mode == L2CAP_MODE_ERTM) + err = __l2cap_wait_ack(sk); + + sk->sk_shutdown = SHUTDOWN_MASK; + l2cap_chan_close(chan, 0); + + if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime) + err = bt_sock_wait_state(sk, BT_CLOSED, + sk->sk_lingertime); + } + + if (!err && sk->sk_err) + err = -sk->sk_err; + + release_sock(sk); + return err; +} + +static int l2cap_sock_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + int err; + + BT_DBG("sock %p, sk %p", sock, sk); + + if (!sk) + return 0; + + err = l2cap_sock_shutdown(sock, 2); + + sock_orphan(sk); + l2cap_sock_kill(sk); + return err; +} + +static struct l2cap_chan *l2cap_sock_new_connection_cb(void *data) +{ + struct sock *sk, *parent = data; + + sk = l2cap_sock_alloc(sock_net(parent), NULL, BTPROTO_L2CAP, + GFP_ATOMIC); + if (!sk) + return NULL; + + l2cap_sock_init(sk, parent); + + return l2cap_pi(sk)->chan; +} + +static int l2cap_sock_recv_cb(void *data, struct sk_buff *skb) +{ + int err; + struct sock *sk = data; + struct l2cap_pinfo *pi = l2cap_pi(sk); + + if (pi->rx_busy_skb) + return -ENOMEM; + + err = sock_queue_rcv_skb(sk, skb); + + /* For ERTM, handle one skb that doesn't fit into the recv + * buffer. This is important to do because the data frames + * have already been acked, so the skb cannot be discarded. + * + * Notify the l2cap core that the buffer is full, so the + * LOCAL_BUSY state is entered and no more frames are + * acked and reassembled until there is buffer space + * available. 
+ */ + if (err < 0 && pi->chan->mode == L2CAP_MODE_ERTM) { + pi->rx_busy_skb = skb; + l2cap_chan_busy(pi->chan, 1); + err = 0; + } + + return err; +} + +static void l2cap_sock_close_cb(void *data) +{ + struct sock *sk = data; + + l2cap_sock_kill(sk); +} + +static void l2cap_sock_state_change_cb(void *data, int state) +{ + struct sock *sk = data; + + sk->sk_state = state; +} + +static struct l2cap_ops l2cap_chan_ops = { + .name = "L2CAP Socket Interface", + .new_connection = l2cap_sock_new_connection_cb, + .recv = l2cap_sock_recv_cb, + .close = l2cap_sock_close_cb, + .state_change = l2cap_sock_state_change_cb, +}; + +static void l2cap_sock_destruct(struct sock *sk) +{ + BT_DBG("sk %p", sk); + + if (l2cap_pi(sk)->rx_busy_skb) { + kfree_skb(l2cap_pi(sk)->rx_busy_skb); + l2cap_pi(sk)->rx_busy_skb = NULL; + } + + skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge(&sk->sk_write_queue); +} + +static void l2cap_sock_init(struct sock *sk, struct sock *parent) +{ + struct l2cap_pinfo *pi = l2cap_pi(sk); + struct l2cap_chan *chan = pi->chan; + + BT_DBG("sk %p", sk); + + if (parent) { + struct l2cap_chan *pchan = l2cap_pi(parent)->chan; + + sk->sk_type = parent->sk_type; + bt_sk(sk)->defer_setup = bt_sk(parent)->defer_setup; + + chan->chan_type = pchan->chan_type; + chan->imtu = pchan->imtu; + chan->omtu = pchan->omtu; + chan->conf_state = pchan->conf_state; + chan->mode = pchan->mode; + chan->fcs = pchan->fcs; + chan->max_tx = pchan->max_tx; + chan->tx_win = pchan->tx_win; + chan->sec_level = pchan->sec_level; + chan->role_switch = pchan->role_switch; + chan->force_reliable = pchan->force_reliable; + chan->flushable = pchan->flushable; + chan->force_active = pchan->force_active; + } else { + + switch (sk->sk_type) { + case SOCK_RAW: + chan->chan_type = L2CAP_CHAN_RAW; + break; + case SOCK_DGRAM: + chan->chan_type = L2CAP_CHAN_CONN_LESS; + break; + case SOCK_SEQPACKET: + case SOCK_STREAM: + chan->chan_type = L2CAP_CHAN_CONN_ORIENTED; + break; + } + + chan->imtu = L2CAP_DEFAULT_MTU; + chan->omtu = 0; + if (!disable_ertm && sk->sk_type == SOCK_STREAM) { + chan->mode = L2CAP_MODE_ERTM; + set_bit(CONF_STATE2_DEVICE, &chan->conf_state); + } else { + chan->mode = L2CAP_MODE_BASIC; + } + chan->max_tx = L2CAP_DEFAULT_MAX_TX; + chan->fcs = L2CAP_FCS_CRC16; + chan->tx_win = L2CAP_DEFAULT_TX_WINDOW; + chan->sec_level = BT_SECURITY_LOW; + chan->role_switch = 0; + chan->force_reliable = 0; + chan->flushable = BT_FLUSHABLE_OFF; + chan->force_active = BT_POWER_FORCE_ACTIVE_ON; + + } + + /* Default config options */ + chan->flush_to = L2CAP_DEFAULT_FLUSH_TO; + + chan->data = sk; + chan->ops = &l2cap_chan_ops; +} + +static struct proto l2cap_proto = { + .name = "L2CAP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct l2cap_pinfo) +}; + +static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio) +{ + struct sock *sk; + struct l2cap_chan *chan; + + sk = sk_alloc(net, PF_BLUETOOTH, prio, &l2cap_proto); + if (!sk) + return NULL; + + sock_init_data(sock, sk); + INIT_LIST_HEAD(&bt_sk(sk)->accept_q); + + sk->sk_destruct = l2cap_sock_destruct; + sk->sk_sndtimeo = L2CAP_CONN_TIMEOUT; + + sock_reset_flag(sk, SOCK_ZAPPED); + + sk->sk_protocol = proto; + sk->sk_state = BT_OPEN; + + chan = l2cap_chan_create(sk); + if (!chan) { + l2cap_sock_kill(sk); + return NULL; + } + + l2cap_pi(sk)->chan = chan; + + return sk; +} + +static int l2cap_sock_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + + BT_DBG("sock %p", sock); + + sock->state = 
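/*
 * The two halves of the ERTM "local busy" handling above: l2cap_sock_recv_cb()
 * parks a frame that no longer fits in the socket receive buffer (it has
 * already been acked, so it cannot simply be dropped) and tells the core to
 * go busy, while recvmsg() only releases the busy state once at least half
 * of the receive buffer is free again, so the peer is not toggled between
 * busy and ready for every single frame.  The watermark test in isolation
 * (names are illustrative):
 */
#include <stdbool.h>
#include <stdio.h>

static bool can_resume_rx(long rmem_alloc, long rcvbuf)
{
    return rmem_alloc <= rcvbuf / 2;    /* same as "<= sk_rcvbuf >> 1" above */
}

int main(void)
{
    printf("%d\n", can_resume_rx(70000, 87380));  /* 0: still too full */
    printf("%d\n", can_resume_rx(30000, 87380));  /* 1: resume RX      */
    return 0;
}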
SS_UNCONNECTED; + + if (sock->type != SOCK_SEQPACKET && sock->type != SOCK_STREAM && + sock->type != SOCK_DGRAM && sock->type != SOCK_RAW) + return -ESOCKTNOSUPPORT; + + if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) + return -EPERM; + + sock->ops = &l2cap_sock_ops; + + sk = l2cap_sock_alloc(net, sock, protocol, GFP_ATOMIC); + if (!sk) + return -ENOMEM; + + l2cap_sock_init(sk, NULL); + return 0; +} + +static const struct proto_ops l2cap_sock_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .release = l2cap_sock_release, + .bind = l2cap_sock_bind, + .connect = l2cap_sock_connect, + .listen = l2cap_sock_listen, + .accept = l2cap_sock_accept, + .getname = l2cap_sock_getname, + .sendmsg = l2cap_sock_sendmsg, + .recvmsg = l2cap_sock_recvmsg, + .poll = bt_sock_poll, + .ioctl = bt_sock_ioctl, + .mmap = sock_no_mmap, + .socketpair = sock_no_socketpair, + .shutdown = l2cap_sock_shutdown, + .setsockopt = l2cap_sock_setsockopt, + .getsockopt = l2cap_sock_getsockopt +}; + +static const struct net_proto_family l2cap_sock_family_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .create = l2cap_sock_create, +}; + +int __init l2cap_init_sockets(void) +{ + int err; + + err = proto_register(&l2cap_proto, 0); + if (err < 0) + return err; + + err = bt_sock_register(BTPROTO_L2CAP, &l2cap_sock_family_ops); + if (err < 0) + goto error; + + BT_INFO("L2CAP socket layer initialized"); + + return 0; + +error: + BT_ERR("L2CAP socket registration failed"); + proto_unregister(&l2cap_proto); + return err; +} + +void l2cap_cleanup_sockets(void) +{ + if (bt_sock_unregister(BTPROTO_L2CAP) < 0) + BT_ERR("L2CAP socket unregistration failed"); + + proto_unregister(&l2cap_proto); +} diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c new file mode 100644 index 00000000..86a6bed2 --- /dev/null +++ b/net/bluetooth/lib.c @@ -0,0 +1,171 @@ +/* + BlueZ - Bluetooth protocol stack for Linux + Copyright (C) 2000-2001 Qualcomm Incorporated + + Written 2000,2001 by Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +/* Bluetooth kernel library. 
*/ + +#include + +#include +#include +#include +#include + +#include + +void baswap(bdaddr_t *dst, bdaddr_t *src) +{ + unsigned char *d = (unsigned char *) dst; + unsigned char *s = (unsigned char *) src; + unsigned int i; + + for (i = 0; i < 6; i++) + d[i] = s[5 - i]; +} +EXPORT_SYMBOL(baswap); + +char *batostr(bdaddr_t *ba) +{ + static char str[2][18]; + static int i = 1; + + i ^= 1; + sprintf(str[i], "%2.2X:%2.2X:%2.2X:%2.2X:%2.2X:%2.2X", + ba->b[5], ba->b[4], ba->b[3], + ba->b[2], ba->b[1], ba->b[0]); + + return str[i]; +} +EXPORT_SYMBOL(batostr); + +/* Bluetooth error codes to Unix errno mapping */ +int bt_to_errno(__u16 code) +{ + switch (code) { + case 0: + return 0; + + case 0x01: + return EBADRQC; + + case 0x02: + return ENOTCONN; + + case 0x03: + return EIO; + + case 0x04: + return EHOSTDOWN; + + case 0x05: + return EACCES; + + case 0x06: + return EBADE; + + case 0x07: + return ENOMEM; + + case 0x08: + return ETIMEDOUT; + + case 0x09: + return EMLINK; + + case 0x0a: + return EMLINK; + + case 0x0b: + return EALREADY; + + case 0x0c: + return EBUSY; + + case 0x0d: + case 0x0e: + case 0x0f: + return ECONNREFUSED; + + case 0x10: + return ETIMEDOUT; + + case 0x11: + case 0x27: + case 0x29: + case 0x20: + return EOPNOTSUPP; + + case 0x12: + return EINVAL; + + case 0x13: + case 0x14: + case 0x15: + return ECONNRESET; + + case 0x16: + return ECONNABORTED; + + case 0x17: + return ELOOP; + + case 0x18: + return EACCES; + + case 0x1a: + return EPROTONOSUPPORT; + + case 0x1b: + return ECONNREFUSED; + + case 0x19: + case 0x1e: + case 0x23: + case 0x24: + case 0x25: + return EPROTO; + + default: + return ENOSYS; + } +} +EXPORT_SYMBOL(bt_to_errno); + +int bt_printk(const char *level, const char *format, ...) +{ + struct va_format vaf; + va_list args; + int r; + + va_start(args, format); + + vaf.fmt = format; + vaf.va = &args; + + r = printk("%sBluetooth: %pV\n", level, &vaf); + + va_end(args); + + return r; +} +EXPORT_SYMBOL(bt_printk); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c new file mode 100644 index 00000000..98327213 --- /dev/null +++ b/net/bluetooth/mgmt.c @@ -0,0 +1,2288 @@ +/* + BlueZ - Bluetooth protocol stack for Linux + Copyright (C) 2010 Nokia Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. 
+*/ + +/* Bluetooth HCI Management interface */ + +#include +#include + +#include +#include +#include + +#define MGMT_VERSION 0 +#define MGMT_REVISION 1 + +struct pending_cmd { + struct list_head list; + __u16 opcode; + int index; + void *param; + struct sock *sk; + void *user_data; +}; + +static LIST_HEAD(cmd_list); + +static int cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status) +{ + struct sk_buff *skb; + struct mgmt_hdr *hdr; + struct mgmt_ev_cmd_status *ev; + + BT_DBG("sock %p, index %u, cmd %u, status %u", sk, index, cmd, status); + + skb = alloc_skb(sizeof(*hdr) + sizeof(*ev), GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + hdr = (void *) skb_put(skb, sizeof(*hdr)); + + hdr->opcode = cpu_to_le16(MGMT_EV_CMD_STATUS); + hdr->index = cpu_to_le16(index); + hdr->len = cpu_to_le16(sizeof(*ev)); + + ev = (void *) skb_put(skb, sizeof(*ev)); + ev->status = status; + put_unaligned_le16(cmd, &ev->opcode); + + if (sock_queue_rcv_skb(sk, skb) < 0) + kfree_skb(skb); + + return 0; +} + +static int cmd_complete(struct sock *sk, u16 index, u16 cmd, void *rp, + size_t rp_len) +{ + struct sk_buff *skb; + struct mgmt_hdr *hdr; + struct mgmt_ev_cmd_complete *ev; + + BT_DBG("sock %p", sk); + + skb = alloc_skb(sizeof(*hdr) + sizeof(*ev) + rp_len, GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + hdr = (void *) skb_put(skb, sizeof(*hdr)); + + hdr->opcode = cpu_to_le16(MGMT_EV_CMD_COMPLETE); + hdr->index = cpu_to_le16(index); + hdr->len = cpu_to_le16(sizeof(*ev) + rp_len); + + ev = (void *) skb_put(skb, sizeof(*ev) + rp_len); + put_unaligned_le16(cmd, &ev->opcode); + + if (rp) + memcpy(ev->data, rp, rp_len); + + if (sock_queue_rcv_skb(sk, skb) < 0) + kfree_skb(skb); + + return 0; +} + +static int read_version(struct sock *sk) +{ + struct mgmt_rp_read_version rp; + + BT_DBG("sock %p", sk); + + rp.version = MGMT_VERSION; + put_unaligned_le16(MGMT_REVISION, &rp.revision); + + return cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_VERSION, &rp, + sizeof(rp)); +} + +static int read_index_list(struct sock *sk) +{ + struct mgmt_rp_read_index_list *rp; + struct list_head *p; + size_t rp_len; + u16 count; + int i, err; + + BT_DBG("sock %p", sk); + + read_lock(&hci_dev_list_lock); + + count = 0; + list_for_each(p, &hci_dev_list) { + count++; + } + + rp_len = sizeof(*rp) + (2 * count); + rp = kmalloc(rp_len, GFP_ATOMIC); + if (!rp) { + read_unlock(&hci_dev_list_lock); + return -ENOMEM; + } + + put_unaligned_le16(count, &rp->num_controllers); + + i = 0; + list_for_each(p, &hci_dev_list) { + struct hci_dev *d = list_entry(p, struct hci_dev, list); + + hci_del_off_timer(d); + + set_bit(HCI_MGMT, &d->flags); + + if (test_bit(HCI_SETUP, &d->flags)) + continue; + + put_unaligned_le16(d->id, &rp->index[i++]); + BT_DBG("Added hci%u", d->id); + } + + read_unlock(&hci_dev_list_lock); + + err = cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_INDEX_LIST, rp, + rp_len); + + kfree(rp); + + return err; +} + +static int read_controller_info(struct sock *sk, u16 index) +{ + struct mgmt_rp_read_info rp; + struct hci_dev *hdev; + + BT_DBG("sock %p hci%u", sk, index); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_READ_INFO, ENODEV); + + hci_del_off_timer(hdev); + + hci_dev_lock_bh(hdev); + + set_bit(HCI_MGMT, &hdev->flags); + + memset(&rp, 0, sizeof(rp)); + + rp.type = hdev->dev_type; + + rp.powered = test_bit(HCI_UP, &hdev->flags); + rp.connectable = test_bit(HCI_PSCAN, &hdev->flags); + rp.discoverable = test_bit(HCI_ISCAN, &hdev->flags); + rp.pairable = test_bit(HCI_PSCAN, &hdev->flags); + + if 
(test_bit(HCI_AUTH, &hdev->flags)) + rp.sec_mode = 3; + else if (hdev->ssp_mode > 0) + rp.sec_mode = 4; + else + rp.sec_mode = 2; + + bacpy(&rp.bdaddr, &hdev->bdaddr); + memcpy(rp.features, hdev->features, 8); + memcpy(rp.dev_class, hdev->dev_class, 3); + put_unaligned_le16(hdev->manufacturer, &rp.manufacturer); + rp.hci_ver = hdev->hci_ver; + put_unaligned_le16(hdev->hci_rev, &rp.hci_rev); + + memcpy(rp.name, hdev->dev_name, sizeof(hdev->dev_name)); + + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + + return cmd_complete(sk, index, MGMT_OP_READ_INFO, &rp, sizeof(rp)); +} + +static void mgmt_pending_free(struct pending_cmd *cmd) +{ + sock_put(cmd->sk); + kfree(cmd->param); + kfree(cmd); +} + +static struct pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode, + u16 index, void *data, u16 len) +{ + struct pending_cmd *cmd; + + cmd = kmalloc(sizeof(*cmd), GFP_ATOMIC); + if (!cmd) + return NULL; + + cmd->opcode = opcode; + cmd->index = index; + + cmd->param = kmalloc(len, GFP_ATOMIC); + if (!cmd->param) { + kfree(cmd); + return NULL; + } + + if (data) + memcpy(cmd->param, data, len); + + cmd->sk = sk; + sock_hold(sk); + + list_add(&cmd->list, &cmd_list); + + return cmd; +} + +static void mgmt_pending_foreach(u16 opcode, int index, + void (*cb)(struct pending_cmd *cmd, void *data), + void *data) +{ + struct list_head *p, *n; + + list_for_each_safe(p, n, &cmd_list) { + struct pending_cmd *cmd; + + cmd = list_entry(p, struct pending_cmd, list); + + if (cmd->opcode != opcode) + continue; + + if (index >= 0 && cmd->index != index) + continue; + + cb(cmd, data); + } +} + +static struct pending_cmd *mgmt_pending_find(u16 opcode, int index) +{ + struct list_head *p; + + list_for_each(p, &cmd_list) { + struct pending_cmd *cmd; + + cmd = list_entry(p, struct pending_cmd, list); + + if (cmd->opcode != opcode) + continue; + + if (index >= 0 && cmd->index != index) + continue; + + return cmd; + } + + return NULL; +} + +static void mgmt_pending_remove(struct pending_cmd *cmd) +{ + list_del(&cmd->list); + mgmt_pending_free(cmd); +} + +static int set_powered(struct sock *sk, u16 index, unsigned char *data, u16 len) +{ + struct mgmt_mode *cp; + struct hci_dev *hdev; + struct pending_cmd *cmd; + int err, up; + + cp = (void *) data; + + BT_DBG("request for hci%u", index); + + if (len != sizeof(*cp)) + return cmd_status(sk, index, MGMT_OP_SET_POWERED, EINVAL); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_SET_POWERED, ENODEV); + + hci_dev_lock_bh(hdev); + + up = test_bit(HCI_UP, &hdev->flags); + if ((cp->val && up) || (!cp->val && !up)) { + err = cmd_status(sk, index, MGMT_OP_SET_POWERED, EALREADY); + goto failed; + } + + if (mgmt_pending_find(MGMT_OP_SET_POWERED, index)) { + err = cmd_status(sk, index, MGMT_OP_SET_POWERED, EBUSY); + goto failed; + } + + cmd = mgmt_pending_add(sk, MGMT_OP_SET_POWERED, index, data, len); + if (!cmd) { + err = -ENOMEM; + goto failed; + } + + if (cp->val) + queue_work(hdev->workqueue, &hdev->power_on); + else + queue_work(hdev->workqueue, &hdev->power_off); + + err = 0; + +failed: + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + return err; +} + +static int set_discoverable(struct sock *sk, u16 index, unsigned char *data, + u16 len) +{ + struct mgmt_mode *cp; + struct hci_dev *hdev; + struct pending_cmd *cmd; + u8 scan; + int err; + + cp = (void *) data; + + BT_DBG("request for hci%u", index); + + if (len != sizeof(*cp)) + return cmd_status(sk, index, MGMT_OP_SET_DISCOVERABLE, EINVAL); + + hdev = hci_dev_get(index); + if (!hdev) + return 
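/*
 * Every reply produced by cmd_status()/cmd_complete() above is a small
 * management packet: a header carrying the event opcode, the controller
 * index and the payload length (all little-endian u16), followed by the
 * event body.  A user-space sketch of that framing; the struct layouts and
 * the EV_CMD_STATUS value are illustrative, and unlike the kernel code it
 * assumes a little-endian host instead of using cpu_to_le16().
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct hdr {
    uint16_t opcode;    /* event code                         */
    uint16_t index;     /* controller index (0xffff for none) */
    uint16_t len;       /* length of the payload that follows */
} __attribute__((packed));

struct cmd_status_ev {
    uint16_t opcode;    /* command this status refers to      */
    uint8_t  status;    /* 0 on success, error code otherwise */
} __attribute__((packed));

#define EV_CMD_STATUS 0x0001    /* illustrative value */

/* Serialize a "command status" event into buf; returns bytes written. */
static size_t build_cmd_status(uint8_t *buf, uint16_t index,
                               uint16_t cmd, uint8_t status)
{
    struct hdr hdr = { EV_CMD_STATUS, index, sizeof(struct cmd_status_ev) };
    struct cmd_status_ev ev = { cmd, status };

    memcpy(buf, &hdr, sizeof(hdr));
    memcpy(buf + sizeof(hdr), &ev, sizeof(ev));
    return sizeof(hdr) + sizeof(ev);
}

int main(void)
{
    uint8_t buf[16];

    printf("%zu bytes\n", build_cmd_status(buf, 0, 0x0005, 0));  /* 9 bytes */
    return 0;
}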
cmd_status(sk, index, MGMT_OP_SET_DISCOVERABLE, ENODEV); + + hci_dev_lock_bh(hdev); + + if (!test_bit(HCI_UP, &hdev->flags)) { + err = cmd_status(sk, index, MGMT_OP_SET_DISCOVERABLE, ENETDOWN); + goto failed; + } + + if (mgmt_pending_find(MGMT_OP_SET_DISCOVERABLE, index) || + mgmt_pending_find(MGMT_OP_SET_CONNECTABLE, index)) { + err = cmd_status(sk, index, MGMT_OP_SET_DISCOVERABLE, EBUSY); + goto failed; + } + + if (cp->val == test_bit(HCI_ISCAN, &hdev->flags) && + test_bit(HCI_PSCAN, &hdev->flags)) { + err = cmd_status(sk, index, MGMT_OP_SET_DISCOVERABLE, EALREADY); + goto failed; + } + + cmd = mgmt_pending_add(sk, MGMT_OP_SET_DISCOVERABLE, index, data, len); + if (!cmd) { + err = -ENOMEM; + goto failed; + } + + scan = SCAN_PAGE; + + if (cp->val) + scan |= SCAN_INQUIRY; + + err = hci_send_cmd(hdev, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); + if (err < 0) + mgmt_pending_remove(cmd); + +failed: + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + + return err; +} + +static int set_connectable(struct sock *sk, u16 index, unsigned char *data, + u16 len) +{ + struct mgmt_mode *cp; + struct hci_dev *hdev; + struct pending_cmd *cmd; + u8 scan; + int err; + + cp = (void *) data; + + BT_DBG("request for hci%u", index); + + if (len != sizeof(*cp)) + return cmd_status(sk, index, MGMT_OP_SET_CONNECTABLE, EINVAL); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_SET_CONNECTABLE, ENODEV); + + hci_dev_lock_bh(hdev); + + if (!test_bit(HCI_UP, &hdev->flags)) { + err = cmd_status(sk, index, MGMT_OP_SET_CONNECTABLE, ENETDOWN); + goto failed; + } + + if (mgmt_pending_find(MGMT_OP_SET_DISCOVERABLE, index) || + mgmt_pending_find(MGMT_OP_SET_CONNECTABLE, index)) { + err = cmd_status(sk, index, MGMT_OP_SET_CONNECTABLE, EBUSY); + goto failed; + } + + if (cp->val == test_bit(HCI_PSCAN, &hdev->flags)) { + err = cmd_status(sk, index, MGMT_OP_SET_CONNECTABLE, EALREADY); + goto failed; + } + + cmd = mgmt_pending_add(sk, MGMT_OP_SET_CONNECTABLE, index, data, len); + if (!cmd) { + err = -ENOMEM; + goto failed; + } + + if (cp->val) + scan = SCAN_PAGE; + else + scan = 0; + + err = hci_send_cmd(hdev, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); + if (err < 0) + mgmt_pending_remove(cmd); + +failed: + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + + return err; +} + +static int mgmt_event(u16 event, u16 index, void *data, u16 data_len, + struct sock *skip_sk) +{ + struct sk_buff *skb; + struct mgmt_hdr *hdr; + + skb = alloc_skb(sizeof(*hdr) + data_len, GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + bt_cb(skb)->channel = HCI_CHANNEL_CONTROL; + + hdr = (void *) skb_put(skb, sizeof(*hdr)); + hdr->opcode = cpu_to_le16(event); + hdr->index = cpu_to_le16(index); + hdr->len = cpu_to_le16(data_len); + + if (data) + memcpy(skb_put(skb, data_len), data, data_len); + + hci_send_to_sock(NULL, skb, skip_sk); + kfree_skb(skb); + + return 0; +} + +static int send_mode_rsp(struct sock *sk, u16 opcode, u16 index, u8 val) +{ + struct mgmt_mode rp; + + rp.val = val; + + return cmd_complete(sk, index, opcode, &rp, sizeof(rp)); +} + +static int set_pairable(struct sock *sk, u16 index, unsigned char *data, + u16 len) +{ + struct mgmt_mode *cp, ev; + struct hci_dev *hdev; + int err; + + cp = (void *) data; + + BT_DBG("request for hci%u", index); + + if (len != sizeof(*cp)) + return cmd_status(sk, index, MGMT_OP_SET_PAIRABLE, EINVAL); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_SET_PAIRABLE, ENODEV); + + hci_dev_lock_bh(hdev); + + if (cp->val) + set_bit(HCI_PAIRABLE, &hdev->flags); + else + 
clear_bit(HCI_PAIRABLE, &hdev->flags); + + err = send_mode_rsp(sk, MGMT_OP_SET_PAIRABLE, index, cp->val); + if (err < 0) + goto failed; + + ev.val = cp->val; + + err = mgmt_event(MGMT_EV_PAIRABLE, index, &ev, sizeof(ev), sk); + +failed: + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + + return err; +} + +#define EIR_FLAGS 0x01 /* flags */ +#define EIR_UUID16_SOME 0x02 /* 16-bit UUID, more available */ +#define EIR_UUID16_ALL 0x03 /* 16-bit UUID, all listed */ +#define EIR_UUID32_SOME 0x04 /* 32-bit UUID, more available */ +#define EIR_UUID32_ALL 0x05 /* 32-bit UUID, all listed */ +#define EIR_UUID128_SOME 0x06 /* 128-bit UUID, more available */ +#define EIR_UUID128_ALL 0x07 /* 128-bit UUID, all listed */ +#define EIR_NAME_SHORT 0x08 /* shortened local name */ +#define EIR_NAME_COMPLETE 0x09 /* complete local name */ +#define EIR_TX_POWER 0x0A /* transmit power level */ +#define EIR_DEVICE_ID 0x10 /* device ID */ + +#define PNP_INFO_SVCLASS_ID 0x1200 + +static u8 bluetooth_base_uuid[] = { + 0xFB, 0x34, 0x9B, 0x5F, 0x80, 0x00, 0x00, 0x80, + 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static u16 get_uuid16(u8 *uuid128) +{ + u32 val; + int i; + + for (i = 0; i < 12; i++) { + if (bluetooth_base_uuid[i] != uuid128[i]) + return 0; + } + + memcpy(&val, &uuid128[12], 4); + + val = le32_to_cpu(val); + if (val > 0xffff) + return 0; + + return (u16) val; +} + +static void create_eir(struct hci_dev *hdev, u8 *data) +{ + u8 *ptr = data; + u16 eir_len = 0; + u16 uuid16_list[HCI_MAX_EIR_LENGTH / sizeof(u16)]; + int i, truncated = 0; + struct list_head *p; + size_t name_len; + + name_len = strlen(hdev->dev_name); + + if (name_len > 0) { + /* EIR Data type */ + if (name_len > 48) { + name_len = 48; + ptr[1] = EIR_NAME_SHORT; + } else + ptr[1] = EIR_NAME_COMPLETE; + + /* EIR Data length */ + ptr[0] = name_len + 1; + + memcpy(ptr + 2, hdev->dev_name, name_len); + + eir_len += (name_len + 2); + ptr += (name_len + 2); + } + + memset(uuid16_list, 0, sizeof(uuid16_list)); + + /* Group all UUID16 types */ + list_for_each(p, &hdev->uuids) { + struct bt_uuid *uuid = list_entry(p, struct bt_uuid, list); + u16 uuid16; + + uuid16 = get_uuid16(uuid->uuid); + if (uuid16 == 0) + return; + + if (uuid16 < 0x1100) + continue; + + if (uuid16 == PNP_INFO_SVCLASS_ID) + continue; + + /* Stop if not enough space to put next UUID */ + if (eir_len + 2 + sizeof(u16) > HCI_MAX_EIR_LENGTH) { + truncated = 1; + break; + } + + /* Check for duplicates */ + for (i = 0; uuid16_list[i] != 0; i++) + if (uuid16_list[i] == uuid16) + break; + + if (uuid16_list[i] == 0) { + uuid16_list[i] = uuid16; + eir_len += sizeof(u16); + } + } + + if (uuid16_list[0] != 0) { + u8 *length = ptr; + + /* EIR Data type */ + ptr[1] = truncated ? 
EIR_UUID16_SOME : EIR_UUID16_ALL; + + ptr += 2; + eir_len += 2; + + for (i = 0; uuid16_list[i] != 0; i++) { + *ptr++ = (uuid16_list[i] & 0x00ff); + *ptr++ = (uuid16_list[i] & 0xff00) >> 8; + } + + /* EIR Data length */ + *length = (i * sizeof(u16)) + 1; + } +} + +static int update_eir(struct hci_dev *hdev) +{ + struct hci_cp_write_eir cp; + + if (!(hdev->features[6] & LMP_EXT_INQ)) + return 0; + + if (hdev->ssp_mode == 0) + return 0; + + if (test_bit(HCI_SERVICE_CACHE, &hdev->flags)) + return 0; + + memset(&cp, 0, sizeof(cp)); + + create_eir(hdev, cp.data); + + if (memcmp(cp.data, hdev->eir, sizeof(cp.data)) == 0) + return 0; + + memcpy(hdev->eir, cp.data, sizeof(cp.data)); + + return hci_send_cmd(hdev, HCI_OP_WRITE_EIR, sizeof(cp), &cp); +} + +static u8 get_service_classes(struct hci_dev *hdev) +{ + struct list_head *p; + u8 val = 0; + + list_for_each(p, &hdev->uuids) { + struct bt_uuid *uuid = list_entry(p, struct bt_uuid, list); + + val |= uuid->svc_hint; + } + + return val; +} + +static int update_class(struct hci_dev *hdev) +{ + u8 cod[3]; + + BT_DBG("%s", hdev->name); + + if (test_bit(HCI_SERVICE_CACHE, &hdev->flags)) + return 0; + + cod[0] = hdev->minor_class; + cod[1] = hdev->major_class; + cod[2] = get_service_classes(hdev); + + if (memcmp(cod, hdev->dev_class, 3) == 0) + return 0; + + return hci_send_cmd(hdev, HCI_OP_WRITE_CLASS_OF_DEV, sizeof(cod), cod); +} + +static int add_uuid(struct sock *sk, u16 index, unsigned char *data, u16 len) +{ + struct mgmt_cp_add_uuid *cp; + struct hci_dev *hdev; + struct bt_uuid *uuid; + int err; + + cp = (void *) data; + + BT_DBG("request for hci%u", index); + + if (len != sizeof(*cp)) + return cmd_status(sk, index, MGMT_OP_ADD_UUID, EINVAL); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_ADD_UUID, ENODEV); + + hci_dev_lock_bh(hdev); + + uuid = kmalloc(sizeof(*uuid), GFP_ATOMIC); + if (!uuid) { + err = -ENOMEM; + goto failed; + } + + memcpy(uuid->uuid, cp->uuid, 16); + uuid->svc_hint = cp->svc_hint; + + list_add(&uuid->list, &hdev->uuids); + + err = update_class(hdev); + if (err < 0) + goto failed; + + err = update_eir(hdev); + if (err < 0) + goto failed; + + err = cmd_complete(sk, index, MGMT_OP_ADD_UUID, NULL, 0); + +failed: + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + + return err; +} + +static int remove_uuid(struct sock *sk, u16 index, unsigned char *data, u16 len) +{ + struct list_head *p, *n; + struct mgmt_cp_remove_uuid *cp; + struct hci_dev *hdev; + u8 bt_uuid_any[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int err, found; + + cp = (void *) data; + + BT_DBG("request for hci%u", index); + + if (len != sizeof(*cp)) + return cmd_status(sk, index, MGMT_OP_REMOVE_UUID, EINVAL); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_REMOVE_UUID, ENODEV); + + hci_dev_lock_bh(hdev); + + if (memcmp(cp->uuid, bt_uuid_any, 16) == 0) { + err = hci_uuids_clear(hdev); + goto unlock; + } + + found = 0; + + list_for_each_safe(p, n, &hdev->uuids) { + struct bt_uuid *match = list_entry(p, struct bt_uuid, list); + + if (memcmp(match->uuid, cp->uuid, 16) != 0) + continue; + + list_del(&match->list); + found++; + } + + if (found == 0) { + err = cmd_status(sk, index, MGMT_OP_REMOVE_UUID, ENOENT); + goto unlock; + } + + err = update_class(hdev); + if (err < 0) + goto unlock; + + err = update_eir(hdev); + if (err < 0) + goto unlock; + + err = cmd_complete(sk, index, MGMT_OP_REMOVE_UUID, NULL, 0); + +unlock: + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + + return err; +} + 
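/*
 * create_eir() above emits extended inquiry response data as a sequence of
 * length/type/value fields: the local name first (complete, or shortened if
 * it exceeds 48 bytes), then the list of 16-bit service class UUIDs.  A
 * minimal sketch of appending such fields, reusing the EIR_* type values
 * defined above; eir_append() is an illustrative helper and the overall
 * length capping done by the kernel code is omitted.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define EIR_UUID16_ALL    0x03    /* as defined above */
#define EIR_NAME_COMPLETE 0x09    /* as defined above */

/* Append one field: length octet (type + data), type octet, data. */
static size_t eir_append(uint8_t *eir, uint8_t type,
                         const uint8_t *data, uint8_t data_len)
{
    eir[0] = data_len + 1;  /* length covers type + data */
    eir[1] = type;
    memcpy(eir + 2, data, data_len);
    return (size_t) data_len + 2;
}

int main(void)
{
    uint8_t eir[64];
    size_t len = 0;
    const char *name = "kernel-bt";
    const uint8_t uuids[] = { 0x00, 0x11, 0x01, 0x11 };  /* 0x1100, 0x1101 LE */

    len += eir_append(eir + len, EIR_NAME_COMPLETE,
                      (const uint8_t *) name, strlen(name));
    len += eir_append(eir + len, EIR_UUID16_ALL, uuids, sizeof(uuids));

    printf("eir length: %zu\n", len);   /* 11 + 6 = 17 bytes */
    return 0;
}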
+static int set_dev_class(struct sock *sk, u16 index, unsigned char *data, + u16 len) +{ + struct hci_dev *hdev; + struct mgmt_cp_set_dev_class *cp; + int err; + + cp = (void *) data; + + BT_DBG("request for hci%u", index); + + if (len != sizeof(*cp)) + return cmd_status(sk, index, MGMT_OP_SET_DEV_CLASS, EINVAL); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_SET_DEV_CLASS, ENODEV); + + hci_dev_lock_bh(hdev); + + hdev->major_class = cp->major; + hdev->minor_class = cp->minor; + + err = update_class(hdev); + + if (err == 0) + err = cmd_complete(sk, index, MGMT_OP_SET_DEV_CLASS, NULL, 0); + + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + + return err; +} + +static int set_service_cache(struct sock *sk, u16 index, unsigned char *data, + u16 len) +{ + struct hci_dev *hdev; + struct mgmt_cp_set_service_cache *cp; + int err; + + cp = (void *) data; + + if (len != sizeof(*cp)) + return cmd_status(sk, index, MGMT_OP_SET_SERVICE_CACHE, EINVAL); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_SET_SERVICE_CACHE, ENODEV); + + hci_dev_lock_bh(hdev); + + BT_DBG("hci%u enable %d", index, cp->enable); + + if (cp->enable) { + set_bit(HCI_SERVICE_CACHE, &hdev->flags); + err = 0; + } else { + clear_bit(HCI_SERVICE_CACHE, &hdev->flags); + err = update_class(hdev); + if (err == 0) + err = update_eir(hdev); + } + + if (err == 0) + err = cmd_complete(sk, index, MGMT_OP_SET_SERVICE_CACHE, NULL, + 0); + + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + + return err; +} + +static int load_keys(struct sock *sk, u16 index, unsigned char *data, u16 len) +{ + struct hci_dev *hdev; + struct mgmt_cp_load_keys *cp; + u16 key_count, expected_len; + int i, err; + + cp = (void *) data; + + if (len < sizeof(*cp)) + return -EINVAL; + + key_count = get_unaligned_le16(&cp->key_count); + + expected_len = sizeof(*cp) + key_count * sizeof(struct mgmt_key_info); + if (expected_len > len) { + BT_ERR("load_keys: expected at least %u bytes, got %u bytes", + expected_len, len); + return -EINVAL; + } + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_LOAD_KEYS, ENODEV); + + BT_DBG("hci%u debug_keys %u key_count %u", index, cp->debug_keys, + key_count); + + hci_dev_lock_bh(hdev); + + hci_link_keys_clear(hdev); + + set_bit(HCI_LINK_KEYS, &hdev->flags); + + if (cp->debug_keys) + set_bit(HCI_DEBUG_KEYS, &hdev->flags); + else + clear_bit(HCI_DEBUG_KEYS, &hdev->flags); + + len -= sizeof(*cp); + i = 0; + + while (i < len) { + struct mgmt_key_info *key = (void *) cp->keys + i; + + i += sizeof(*key) + key->dlen; + + if (key->type == HCI_LK_SMP_LTK) { + struct key_master_id *id = (void *) key->data; + + if (key->dlen != sizeof(struct key_master_id)) + continue; + + hci_add_ltk(hdev, 0, &key->bdaddr, key->pin_len, + id->ediv, id->rand, key->val); + + continue; + } + + hci_add_link_key(hdev, NULL, 0, &key->bdaddr, key->val, key->type, + key->pin_len); + } + + err = cmd_complete(sk, index, MGMT_OP_LOAD_KEYS, NULL, 0); + + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + + return err; +} + +static int remove_key(struct sock *sk, u16 index, unsigned char *data, u16 len) +{ + struct hci_dev *hdev; + struct mgmt_cp_remove_key *cp; + struct hci_conn *conn; + int err; + + cp = (void *) data; + + if (len != sizeof(*cp)) + return cmd_status(sk, index, MGMT_OP_REMOVE_KEY, EINVAL); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_REMOVE_KEY, ENODEV); + + hci_dev_lock_bh(hdev); + + err = hci_remove_link_key(hdev, 
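/*
 * load_keys() above accepts a packed, variable-length list of keys and has
 * to walk it without trusting the caller: the declared key_count is checked
 * against the buffer size up front, and each record is advanced by its own
 * header size plus key->dlen.  The sketch below shows a defensive walk over
 * such a list; struct key_info is illustrative and does not reproduce the
 * real mgmt_key_info layout.
 */
#include <stdint.h>
#include <stdio.h>

struct key_info {
    uint8_t type;
    uint8_t val[16];
    uint8_t dlen;       /* length of type-specific trailing data */
    uint8_t data[];
} __attribute__((packed));

/* Count complete records in buf, or return -1 if the list is malformed. */
static int count_keys(const uint8_t *buf, size_t len)
{
    size_t off = 0;
    int n = 0;

    while (off < len) {
        const struct key_info *key;

        if (len - off < sizeof(*key))
            return -1;  /* truncated fixed header */
        key = (const struct key_info *) (buf + off);

        if (len - off - sizeof(*key) < key->dlen)
            return -1;  /* truncated trailing data */

        off += sizeof(*key) + key->dlen;
        n++;
    }

    return n;
}

int main(void)
{
    uint8_t buf[2 * sizeof(struct key_info)] = { 0 };   /* two keys, dlen 0 */

    printf("%d\n", count_keys(buf, sizeof(buf)));       /* 2 */
    return 0;
}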
&cp->bdaddr); + if (err < 0) { + err = cmd_status(sk, index, MGMT_OP_REMOVE_KEY, -err); + goto unlock; + } + + err = 0; + + if (!test_bit(HCI_UP, &hdev->flags) || !cp->disconnect) + goto unlock; + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->bdaddr); + if (conn) { + struct hci_cp_disconnect dc; + + put_unaligned_le16(conn->handle, &dc.handle); + dc.reason = 0x13; /* Remote User Terminated Connection */ + err = hci_send_cmd(hdev, HCI_OP_DISCONNECT, sizeof(dc), &dc); + } + +unlock: + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + + return err; +} + +static int disconnect(struct sock *sk, u16 index, unsigned char *data, u16 len) +{ + struct hci_dev *hdev; + struct mgmt_cp_disconnect *cp; + struct hci_cp_disconnect dc; + struct pending_cmd *cmd; + struct hci_conn *conn; + int err; + + BT_DBG(""); + + cp = (void *) data; + + if (len != sizeof(*cp)) + return cmd_status(sk, index, MGMT_OP_DISCONNECT, EINVAL); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_DISCONNECT, ENODEV); + + hci_dev_lock_bh(hdev); + + if (!test_bit(HCI_UP, &hdev->flags)) { + err = cmd_status(sk, index, MGMT_OP_DISCONNECT, ENETDOWN); + goto failed; + } + + if (mgmt_pending_find(MGMT_OP_DISCONNECT, index)) { + err = cmd_status(sk, index, MGMT_OP_DISCONNECT, EBUSY); + goto failed; + } + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->bdaddr); + if (!conn) + conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &cp->bdaddr); + + if (!conn) { + err = cmd_status(sk, index, MGMT_OP_DISCONNECT, ENOTCONN); + goto failed; + } + + cmd = mgmt_pending_add(sk, MGMT_OP_DISCONNECT, index, data, len); + if (!cmd) { + err = -ENOMEM; + goto failed; + } + + put_unaligned_le16(conn->handle, &dc.handle); + dc.reason = 0x13; /* Remote User Terminated Connection */ + + err = hci_send_cmd(hdev, HCI_OP_DISCONNECT, sizeof(dc), &dc); + if (err < 0) + mgmt_pending_remove(cmd); + +failed: + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + + return err; +} + +static int get_connections(struct sock *sk, u16 index) +{ + struct mgmt_rp_get_connections *rp; + struct hci_dev *hdev; + struct list_head *p; + size_t rp_len; + u16 count; + int i, err; + + BT_DBG(""); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_GET_CONNECTIONS, ENODEV); + + hci_dev_lock_bh(hdev); + + count = 0; + list_for_each(p, &hdev->conn_hash.list) { + count++; + } + + rp_len = sizeof(*rp) + (count * sizeof(bdaddr_t)); + rp = kmalloc(rp_len, GFP_ATOMIC); + if (!rp) { + err = -ENOMEM; + goto unlock; + } + + put_unaligned_le16(count, &rp->conn_count); + + i = 0; + list_for_each(p, &hdev->conn_hash.list) { + struct hci_conn *c = list_entry(p, struct hci_conn, list); + + bacpy(&rp->conn[i++], &c->dst); + } + + err = cmd_complete(sk, index, MGMT_OP_GET_CONNECTIONS, rp, rp_len); + +unlock: + kfree(rp); + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + return err; +} + +static int send_pin_code_neg_reply(struct sock *sk, u16 index, + struct hci_dev *hdev, struct mgmt_cp_pin_code_neg_reply *cp) +{ + struct pending_cmd *cmd; + int err; + + cmd = mgmt_pending_add(sk, MGMT_OP_PIN_CODE_NEG_REPLY, index, cp, + sizeof(*cp)); + if (!cmd) + return -ENOMEM; + + err = hci_send_cmd(hdev, HCI_OP_PIN_CODE_NEG_REPLY, sizeof(cp->bdaddr), + &cp->bdaddr); + if (err < 0) + mgmt_pending_remove(cmd); + + return err; +} + +static int pin_code_reply(struct sock *sk, u16 index, unsigned char *data, + u16 len) +{ + struct hci_dev *hdev; + struct hci_conn *conn; + struct mgmt_cp_pin_code_reply *cp; + struct mgmt_cp_pin_code_neg_reply ncp; + 
struct hci_cp_pin_code_reply reply; + struct pending_cmd *cmd; + int err; + + BT_DBG(""); + + cp = (void *) data; + + if (len != sizeof(*cp)) + return cmd_status(sk, index, MGMT_OP_PIN_CODE_REPLY, EINVAL); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_PIN_CODE_REPLY, ENODEV); + + hci_dev_lock_bh(hdev); + + if (!test_bit(HCI_UP, &hdev->flags)) { + err = cmd_status(sk, index, MGMT_OP_PIN_CODE_REPLY, ENETDOWN); + goto failed; + } + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->bdaddr); + if (!conn) { + err = cmd_status(sk, index, MGMT_OP_PIN_CODE_REPLY, ENOTCONN); + goto failed; + } + + if (conn->pending_sec_level == BT_SECURITY_HIGH && cp->pin_len != 16) { + bacpy(&ncp.bdaddr, &cp->bdaddr); + + BT_ERR("PIN code is not 16 bytes long"); + + err = send_pin_code_neg_reply(sk, index, hdev, &ncp); + if (err >= 0) + err = cmd_status(sk, index, MGMT_OP_PIN_CODE_REPLY, + EINVAL); + + goto failed; + } + + cmd = mgmt_pending_add(sk, MGMT_OP_PIN_CODE_REPLY, index, data, len); + if (!cmd) { + err = -ENOMEM; + goto failed; + } + + bacpy(&reply.bdaddr, &cp->bdaddr); + reply.pin_len = cp->pin_len; + memcpy(reply.pin_code, cp->pin_code, sizeof(reply.pin_code)); + + err = hci_send_cmd(hdev, HCI_OP_PIN_CODE_REPLY, sizeof(reply), &reply); + if (err < 0) + mgmt_pending_remove(cmd); + +failed: + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + + return err; +} + +static int pin_code_neg_reply(struct sock *sk, u16 index, unsigned char *data, + u16 len) +{ + struct hci_dev *hdev; + struct mgmt_cp_pin_code_neg_reply *cp; + int err; + + BT_DBG(""); + + cp = (void *) data; + + if (len != sizeof(*cp)) + return cmd_status(sk, index, MGMT_OP_PIN_CODE_NEG_REPLY, + EINVAL); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_PIN_CODE_NEG_REPLY, + ENODEV); + + hci_dev_lock_bh(hdev); + + if (!test_bit(HCI_UP, &hdev->flags)) { + err = cmd_status(sk, index, MGMT_OP_PIN_CODE_NEG_REPLY, + ENETDOWN); + goto failed; + } + + err = send_pin_code_neg_reply(sk, index, hdev, cp); + +failed: + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + + return err; +} + +static int set_io_capability(struct sock *sk, u16 index, unsigned char *data, + u16 len) +{ + struct hci_dev *hdev; + struct mgmt_cp_set_io_capability *cp; + + BT_DBG(""); + + cp = (void *) data; + + if (len != sizeof(*cp)) + return cmd_status(sk, index, MGMT_OP_SET_IO_CAPABILITY, EINVAL); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_SET_IO_CAPABILITY, ENODEV); + + hci_dev_lock_bh(hdev); + + hdev->io_capability = cp->io_capability; + + BT_DBG("%s IO capability set to 0x%02x", hdev->name, + hdev->io_capability); + + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + + return cmd_complete(sk, index, MGMT_OP_SET_IO_CAPABILITY, NULL, 0); +} + +static inline struct pending_cmd *find_pairing(struct hci_conn *conn) +{ + struct hci_dev *hdev = conn->hdev; + struct list_head *p; + + list_for_each(p, &cmd_list) { + struct pending_cmd *cmd; + + cmd = list_entry(p, struct pending_cmd, list); + + if (cmd->opcode != MGMT_OP_PAIR_DEVICE) + continue; + + if (cmd->index != hdev->id) + continue; + + if (cmd->user_data != conn) + continue; + + return cmd; + } + + return NULL; +} + +static void pairing_complete(struct pending_cmd *cmd, u8 status) +{ + struct mgmt_rp_pair_device rp; + struct hci_conn *conn = cmd->user_data; + + bacpy(&rp.bdaddr, &conn->dst); + rp.status = status; + + cmd_complete(cmd->sk, cmd->index, MGMT_OP_PAIR_DEVICE, &rp, sizeof(rp)); + + /* So we don't get further 
callbacks for this connection */ + conn->connect_cfm_cb = NULL; + conn->security_cfm_cb = NULL; + conn->disconn_cfm_cb = NULL; + + hci_conn_put(conn); + + mgmt_pending_remove(cmd); +} + +static void pairing_complete_cb(struct hci_conn *conn, u8 status) +{ + struct pending_cmd *cmd; + + BT_DBG("status %u", status); + + cmd = find_pairing(conn); + if (!cmd) { + BT_DBG("Unable to find a pending command"); + return; + } + + pairing_complete(cmd, status); +} + +static int pair_device(struct sock *sk, u16 index, unsigned char *data, u16 len) +{ + struct hci_dev *hdev; + struct mgmt_cp_pair_device *cp; + struct pending_cmd *cmd; + u8 sec_level, auth_type; + struct hci_conn *conn; + int err; + + BT_DBG(""); + + cp = (void *) data; + + if (len != sizeof(*cp)) + return cmd_status(sk, index, MGMT_OP_PAIR_DEVICE, EINVAL); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_PAIR_DEVICE, ENODEV); + + hci_dev_lock_bh(hdev); + + if (cp->io_cap == 0x03) { + sec_level = BT_SECURITY_MEDIUM; + auth_type = HCI_AT_DEDICATED_BONDING; + } else { + sec_level = BT_SECURITY_HIGH; + auth_type = HCI_AT_DEDICATED_BONDING_MITM; + } + + conn = hci_connect(hdev, ACL_LINK, 0, &cp->bdaddr, sec_level, auth_type); + if (IS_ERR(conn)) { + err = PTR_ERR(conn); + goto unlock; + } + + if (conn->connect_cfm_cb) { + hci_conn_put(conn); + err = cmd_status(sk, index, MGMT_OP_PAIR_DEVICE, EBUSY); + goto unlock; + } + + cmd = mgmt_pending_add(sk, MGMT_OP_PAIR_DEVICE, index, data, len); + if (!cmd) { + err = -ENOMEM; + hci_conn_put(conn); + goto unlock; + } + + conn->connect_cfm_cb = pairing_complete_cb; + conn->security_cfm_cb = pairing_complete_cb; + conn->disconn_cfm_cb = pairing_complete_cb; + conn->io_capability = cp->io_cap; + cmd->user_data = conn; + + if (conn->state == BT_CONNECTED && + hci_conn_security(conn, sec_level, auth_type)) + pairing_complete(cmd, 0); + + err = 0; + +unlock: + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + + return err; +} + +static int user_confirm_reply(struct sock *sk, u16 index, unsigned char *data, + u16 len, int success) +{ + struct mgmt_cp_user_confirm_reply *cp = (void *) data; + u16 mgmt_op, hci_op; + struct pending_cmd *cmd; + struct hci_dev *hdev; + int err; + + BT_DBG(""); + + if (success) { + mgmt_op = MGMT_OP_USER_CONFIRM_REPLY; + hci_op = HCI_OP_USER_CONFIRM_REPLY; + } else { + mgmt_op = MGMT_OP_USER_CONFIRM_NEG_REPLY; + hci_op = HCI_OP_USER_CONFIRM_NEG_REPLY; + } + + if (len != sizeof(*cp)) + return cmd_status(sk, index, mgmt_op, EINVAL); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, mgmt_op, ENODEV); + + hci_dev_lock_bh(hdev); + + if (!test_bit(HCI_UP, &hdev->flags)) { + err = cmd_status(sk, index, mgmt_op, ENETDOWN); + goto failed; + } + + cmd = mgmt_pending_add(sk, mgmt_op, index, data, len); + if (!cmd) { + err = -ENOMEM; + goto failed; + } + + err = hci_send_cmd(hdev, hci_op, sizeof(cp->bdaddr), &cp->bdaddr); + if (err < 0) + mgmt_pending_remove(cmd); + +failed: + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + + return err; +} + +static int set_local_name(struct sock *sk, u16 index, unsigned char *data, + u16 len) +{ + struct mgmt_cp_set_local_name *mgmt_cp = (void *) data; + struct hci_cp_write_local_name hci_cp; + struct hci_dev *hdev; + struct pending_cmd *cmd; + int err; + + BT_DBG(""); + + if (len != sizeof(*mgmt_cp)) + return cmd_status(sk, index, MGMT_OP_SET_LOCAL_NAME, EINVAL); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_SET_LOCAL_NAME, ENODEV); + + hci_dev_lock_bh(hdev); + 
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_LOCAL_NAME, index, data, len); + if (!cmd) { + err = -ENOMEM; + goto failed; + } + + memcpy(hci_cp.name, mgmt_cp->name, sizeof(hci_cp.name)); + err = hci_send_cmd(hdev, HCI_OP_WRITE_LOCAL_NAME, sizeof(hci_cp), + &hci_cp); + if (err < 0) + mgmt_pending_remove(cmd); + +failed: + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + + return err; +} + +static int read_local_oob_data(struct sock *sk, u16 index) +{ + struct hci_dev *hdev; + struct pending_cmd *cmd; + int err; + + BT_DBG("hci%u", index); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_READ_LOCAL_OOB_DATA, + ENODEV); + + hci_dev_lock_bh(hdev); + + if (!test_bit(HCI_UP, &hdev->flags)) { + err = cmd_status(sk, index, MGMT_OP_READ_LOCAL_OOB_DATA, + ENETDOWN); + goto unlock; + } + + if (!(hdev->features[6] & LMP_SIMPLE_PAIR)) { + err = cmd_status(sk, index, MGMT_OP_READ_LOCAL_OOB_DATA, + EOPNOTSUPP); + goto unlock; + } + + if (mgmt_pending_find(MGMT_OP_READ_LOCAL_OOB_DATA, index)) { + err = cmd_status(sk, index, MGMT_OP_READ_LOCAL_OOB_DATA, EBUSY); + goto unlock; + } + + cmd = mgmt_pending_add(sk, MGMT_OP_READ_LOCAL_OOB_DATA, index, NULL, 0); + if (!cmd) { + err = -ENOMEM; + goto unlock; + } + + err = hci_send_cmd(hdev, HCI_OP_READ_LOCAL_OOB_DATA, 0, NULL); + if (err < 0) + mgmt_pending_remove(cmd); + +unlock: + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + + return err; +} + +static int add_remote_oob_data(struct sock *sk, u16 index, unsigned char *data, + u16 len) +{ + struct hci_dev *hdev; + struct mgmt_cp_add_remote_oob_data *cp = (void *) data; + int err; + + BT_DBG("hci%u ", index); + + if (len != sizeof(*cp)) + return cmd_status(sk, index, MGMT_OP_ADD_REMOTE_OOB_DATA, + EINVAL); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_ADD_REMOTE_OOB_DATA, + ENODEV); + + hci_dev_lock_bh(hdev); + + err = hci_add_remote_oob_data(hdev, &cp->bdaddr, cp->hash, + cp->randomizer); + if (err < 0) + err = cmd_status(sk, index, MGMT_OP_ADD_REMOTE_OOB_DATA, -err); + else + err = cmd_complete(sk, index, MGMT_OP_ADD_REMOTE_OOB_DATA, NULL, + 0); + + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + + return err; +} + +static int remove_remote_oob_data(struct sock *sk, u16 index, + unsigned char *data, u16 len) +{ + struct hci_dev *hdev; + struct mgmt_cp_remove_remote_oob_data *cp = (void *) data; + int err; + + BT_DBG("hci%u ", index); + + if (len != sizeof(*cp)) + return cmd_status(sk, index, MGMT_OP_REMOVE_REMOTE_OOB_DATA, + EINVAL); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_REMOVE_REMOTE_OOB_DATA, + ENODEV); + + hci_dev_lock_bh(hdev); + + err = hci_remove_remote_oob_data(hdev, &cp->bdaddr); + if (err < 0) + err = cmd_status(sk, index, MGMT_OP_REMOVE_REMOTE_OOB_DATA, + -err); + else + err = cmd_complete(sk, index, MGMT_OP_REMOVE_REMOTE_OOB_DATA, + NULL, 0); + + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + + return err; +} + +static int start_discovery(struct sock *sk, u16 index) +{ + u8 lap[3] = { 0x33, 0x8b, 0x9e }; + struct hci_cp_inquiry cp; + struct pending_cmd *cmd; + struct hci_dev *hdev; + int err; + + BT_DBG("hci%u", index); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_START_DISCOVERY, ENODEV); + + hci_dev_lock_bh(hdev); + + cmd = mgmt_pending_add(sk, MGMT_OP_START_DISCOVERY, index, NULL, 0); + if (!cmd) { + err = -ENOMEM; + goto failed; + } + + memset(&cp, 0, sizeof(cp)); + memcpy(&cp.lap, lap, 3); + cp.length = 0x08; + cp.num_rsp = 0x00; + + err = 
hci_send_cmd(hdev, HCI_OP_INQUIRY, sizeof(cp), &cp); + if (err < 0) + mgmt_pending_remove(cmd); + +failed: + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + + return err; +} + +static int stop_discovery(struct sock *sk, u16 index) +{ + struct hci_dev *hdev; + struct pending_cmd *cmd; + int err; + + BT_DBG("hci%u", index); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_STOP_DISCOVERY, ENODEV); + + hci_dev_lock_bh(hdev); + + cmd = mgmt_pending_add(sk, MGMT_OP_STOP_DISCOVERY, index, NULL, 0); + if (!cmd) { + err = -ENOMEM; + goto failed; + } + + err = hci_send_cmd(hdev, HCI_OP_INQUIRY_CANCEL, 0, NULL); + if (err < 0) + mgmt_pending_remove(cmd); + +failed: + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + + return err; +} + +static int block_device(struct sock *sk, u16 index, unsigned char *data, + u16 len) +{ + struct hci_dev *hdev; + struct mgmt_cp_block_device *cp; + int err; + + BT_DBG("hci%u", index); + + cp = (void *) data; + + if (len != sizeof(*cp)) + return cmd_status(sk, index, MGMT_OP_BLOCK_DEVICE, + EINVAL); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_BLOCK_DEVICE, + ENODEV); + + err = hci_blacklist_add(hdev, &cp->bdaddr); + + if (err < 0) + err = cmd_status(sk, index, MGMT_OP_BLOCK_DEVICE, -err); + else + err = cmd_complete(sk, index, MGMT_OP_BLOCK_DEVICE, + NULL, 0); + hci_dev_put(hdev); + + return err; +} + +static int unblock_device(struct sock *sk, u16 index, unsigned char *data, + u16 len) +{ + struct hci_dev *hdev; + struct mgmt_cp_unblock_device *cp; + int err; + + BT_DBG("hci%u", index); + + cp = (void *) data; + + if (len != sizeof(*cp)) + return cmd_status(sk, index, MGMT_OP_UNBLOCK_DEVICE, + EINVAL); + + hdev = hci_dev_get(index); + if (!hdev) + return cmd_status(sk, index, MGMT_OP_UNBLOCK_DEVICE, + ENODEV); + + err = hci_blacklist_del(hdev, &cp->bdaddr); + + if (err < 0) + err = cmd_status(sk, index, MGMT_OP_UNBLOCK_DEVICE, -err); + else + err = cmd_complete(sk, index, MGMT_OP_UNBLOCK_DEVICE, + NULL, 0); + hci_dev_put(hdev); + + return err; +} + +int mgmt_control(struct sock *sk, struct msghdr *msg, size_t msglen) +{ + unsigned char *buf; + struct mgmt_hdr *hdr; + u16 opcode, index, len; + int err; + + BT_DBG("got %zu bytes", msglen); + + if (msglen < sizeof(*hdr)) + return -EINVAL; + + buf = kmalloc(msglen, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (memcpy_fromiovec(buf, msg->msg_iov, msglen)) { + err = -EFAULT; + goto done; + } + + hdr = (struct mgmt_hdr *) buf; + opcode = get_unaligned_le16(&hdr->opcode); + index = get_unaligned_le16(&hdr->index); + len = get_unaligned_le16(&hdr->len); + + if (len != msglen - sizeof(*hdr)) { + err = -EINVAL; + goto done; + } + + switch (opcode) { + case MGMT_OP_READ_VERSION: + err = read_version(sk); + break; + case MGMT_OP_READ_INDEX_LIST: + err = read_index_list(sk); + break; + case MGMT_OP_READ_INFO: + err = read_controller_info(sk, index); + break; + case MGMT_OP_SET_POWERED: + err = set_powered(sk, index, buf + sizeof(*hdr), len); + break; + case MGMT_OP_SET_DISCOVERABLE: + err = set_discoverable(sk, index, buf + sizeof(*hdr), len); + break; + case MGMT_OP_SET_CONNECTABLE: + err = set_connectable(sk, index, buf + sizeof(*hdr), len); + break; + case MGMT_OP_SET_PAIRABLE: + err = set_pairable(sk, index, buf + sizeof(*hdr), len); + break; + case MGMT_OP_ADD_UUID: + err = add_uuid(sk, index, buf + sizeof(*hdr), len); + break; + case MGMT_OP_REMOVE_UUID: + err = remove_uuid(sk, index, buf + sizeof(*hdr), len); + break; + case MGMT_OP_SET_DEV_CLASS: + 
err = set_dev_class(sk, index, buf + sizeof(*hdr), len); + break; + case MGMT_OP_SET_SERVICE_CACHE: + err = set_service_cache(sk, index, buf + sizeof(*hdr), len); + break; + case MGMT_OP_LOAD_KEYS: + err = load_keys(sk, index, buf + sizeof(*hdr), len); + break; + case MGMT_OP_REMOVE_KEY: + err = remove_key(sk, index, buf + sizeof(*hdr), len); + break; + case MGMT_OP_DISCONNECT: + err = disconnect(sk, index, buf + sizeof(*hdr), len); + break; + case MGMT_OP_GET_CONNECTIONS: + err = get_connections(sk, index); + break; + case MGMT_OP_PIN_CODE_REPLY: + err = pin_code_reply(sk, index, buf + sizeof(*hdr), len); + break; + case MGMT_OP_PIN_CODE_NEG_REPLY: + err = pin_code_neg_reply(sk, index, buf + sizeof(*hdr), len); + break; + case MGMT_OP_SET_IO_CAPABILITY: + err = set_io_capability(sk, index, buf + sizeof(*hdr), len); + break; + case MGMT_OP_PAIR_DEVICE: + err = pair_device(sk, index, buf + sizeof(*hdr), len); + break; + case MGMT_OP_USER_CONFIRM_REPLY: + err = user_confirm_reply(sk, index, buf + sizeof(*hdr), len, 1); + break; + case MGMT_OP_USER_CONFIRM_NEG_REPLY: + err = user_confirm_reply(sk, index, buf + sizeof(*hdr), len, 0); + break; + case MGMT_OP_SET_LOCAL_NAME: + err = set_local_name(sk, index, buf + sizeof(*hdr), len); + break; + case MGMT_OP_READ_LOCAL_OOB_DATA: + err = read_local_oob_data(sk, index); + break; + case MGMT_OP_ADD_REMOTE_OOB_DATA: + err = add_remote_oob_data(sk, index, buf + sizeof(*hdr), len); + break; + case MGMT_OP_REMOVE_REMOTE_OOB_DATA: + err = remove_remote_oob_data(sk, index, buf + sizeof(*hdr), + len); + break; + case MGMT_OP_START_DISCOVERY: + err = start_discovery(sk, index); + break; + case MGMT_OP_STOP_DISCOVERY: + err = stop_discovery(sk, index); + break; + case MGMT_OP_BLOCK_DEVICE: + err = block_device(sk, index, buf + sizeof(*hdr), len); + break; + case MGMT_OP_UNBLOCK_DEVICE: + err = unblock_device(sk, index, buf + sizeof(*hdr), len); + break; + default: + BT_DBG("Unknown op %u", opcode); + err = cmd_status(sk, index, opcode, 0x01); + break; + } + + if (err < 0) + goto done; + + err = msglen; + +done: + kfree(buf); + return err; +} + +int mgmt_index_added(u16 index) +{ + return mgmt_event(MGMT_EV_INDEX_ADDED, index, NULL, 0, NULL); +} + +int mgmt_index_removed(u16 index) +{ + return mgmt_event(MGMT_EV_INDEX_REMOVED, index, NULL, 0, NULL); +} + +struct cmd_lookup { + u8 val; + struct sock *sk; +}; + +static void mode_rsp(struct pending_cmd *cmd, void *data) +{ + struct mgmt_mode *cp = cmd->param; + struct cmd_lookup *match = data; + + if (cp->val != match->val) + return; + + send_mode_rsp(cmd->sk, cmd->opcode, cmd->index, cp->val); + + list_del(&cmd->list); + + if (match->sk == NULL) { + match->sk = cmd->sk; + sock_hold(match->sk); + } + + mgmt_pending_free(cmd); +} + +int mgmt_powered(u16 index, u8 powered) +{ + struct mgmt_mode ev; + struct cmd_lookup match = { powered, NULL }; + int ret; + + mgmt_pending_foreach(MGMT_OP_SET_POWERED, index, mode_rsp, &match); + + ev.val = powered; + + ret = mgmt_event(MGMT_EV_POWERED, index, &ev, sizeof(ev), match.sk); + + if (match.sk) + sock_put(match.sk); + + return ret; +} + +int mgmt_discoverable(u16 index, u8 discoverable) +{ + struct mgmt_mode ev; + struct cmd_lookup match = { discoverable, NULL }; + int ret; + + mgmt_pending_foreach(MGMT_OP_SET_DISCOVERABLE, index, mode_rsp, &match); + + ev.val = discoverable; + + ret = mgmt_event(MGMT_EV_DISCOVERABLE, index, &ev, sizeof(ev), + match.sk); + + if (match.sk) + sock_put(match.sk); + + return ret; +} + +int mgmt_connectable(u16 index, u8 connectable) +{ + 
struct mgmt_mode ev; + struct cmd_lookup match = { connectable, NULL }; + int ret; + + mgmt_pending_foreach(MGMT_OP_SET_CONNECTABLE, index, mode_rsp, &match); + + ev.val = connectable; + + ret = mgmt_event(MGMT_EV_CONNECTABLE, index, &ev, sizeof(ev), match.sk); + + if (match.sk) + sock_put(match.sk); + + return ret; +} + +int mgmt_new_key(u16 index, struct link_key *key, u8 persistent) +{ + struct mgmt_ev_new_key *ev; + int err, total; + + total = sizeof(struct mgmt_ev_new_key) + key->dlen; + ev = kzalloc(total, GFP_ATOMIC); + if (!ev) + return -ENOMEM; + + bacpy(&ev->key.bdaddr, &key->bdaddr); + ev->key.type = key->type; + memcpy(ev->key.val, key->val, 16); + ev->key.pin_len = key->pin_len; + ev->key.dlen = key->dlen; + ev->store_hint = persistent; + + memcpy(ev->key.data, key->data, key->dlen); + + err = mgmt_event(MGMT_EV_NEW_KEY, index, ev, total, NULL); + + kfree(ev); + + return err; +} + +int mgmt_connected(u16 index, bdaddr_t *bdaddr) +{ + struct mgmt_ev_connected ev; + + bacpy(&ev.bdaddr, bdaddr); + + return mgmt_event(MGMT_EV_CONNECTED, index, &ev, sizeof(ev), NULL); +} + +static void disconnect_rsp(struct pending_cmd *cmd, void *data) +{ + struct mgmt_cp_disconnect *cp = cmd->param; + struct sock **sk = data; + struct mgmt_rp_disconnect rp; + + bacpy(&rp.bdaddr, &cp->bdaddr); + + cmd_complete(cmd->sk, cmd->index, MGMT_OP_DISCONNECT, &rp, sizeof(rp)); + + *sk = cmd->sk; + sock_hold(*sk); + + mgmt_pending_remove(cmd); +} + +int mgmt_disconnected(u16 index, bdaddr_t *bdaddr) +{ + struct mgmt_ev_disconnected ev; + struct sock *sk = NULL; + int err; + + mgmt_pending_foreach(MGMT_OP_DISCONNECT, index, disconnect_rsp, &sk); + + bacpy(&ev.bdaddr, bdaddr); + + err = mgmt_event(MGMT_EV_DISCONNECTED, index, &ev, sizeof(ev), sk); + + if (sk) + sock_put(sk); + + return err; +} + +int mgmt_disconnect_failed(u16 index) +{ + struct pending_cmd *cmd; + int err; + + cmd = mgmt_pending_find(MGMT_OP_DISCONNECT, index); + if (!cmd) + return -ENOENT; + + err = cmd_status(cmd->sk, index, MGMT_OP_DISCONNECT, EIO); + + mgmt_pending_remove(cmd); + + return err; +} + +int mgmt_connect_failed(u16 index, bdaddr_t *bdaddr, u8 status) +{ + struct mgmt_ev_connect_failed ev; + + bacpy(&ev.bdaddr, bdaddr); + ev.status = status; + + return mgmt_event(MGMT_EV_CONNECT_FAILED, index, &ev, sizeof(ev), NULL); +} + +int mgmt_pin_code_request(u16 index, bdaddr_t *bdaddr, u8 secure) +{ + struct mgmt_ev_pin_code_request ev; + + bacpy(&ev.bdaddr, bdaddr); + ev.secure = secure; + + return mgmt_event(MGMT_EV_PIN_CODE_REQUEST, index, &ev, sizeof(ev), + NULL); +} + +int mgmt_pin_code_reply_complete(u16 index, bdaddr_t *bdaddr, u8 status) +{ + struct pending_cmd *cmd; + struct mgmt_rp_pin_code_reply rp; + int err; + + cmd = mgmt_pending_find(MGMT_OP_PIN_CODE_REPLY, index); + if (!cmd) + return -ENOENT; + + bacpy(&rp.bdaddr, bdaddr); + rp.status = status; + + err = cmd_complete(cmd->sk, index, MGMT_OP_PIN_CODE_REPLY, &rp, + sizeof(rp)); + + mgmt_pending_remove(cmd); + + return err; +} + +int mgmt_pin_code_neg_reply_complete(u16 index, bdaddr_t *bdaddr, u8 status) +{ + struct pending_cmd *cmd; + struct mgmt_rp_pin_code_reply rp; + int err; + + cmd = mgmt_pending_find(MGMT_OP_PIN_CODE_NEG_REPLY, index); + if (!cmd) + return -ENOENT; + + bacpy(&rp.bdaddr, bdaddr); + rp.status = status; + + err = cmd_complete(cmd->sk, index, MGMT_OP_PIN_CODE_NEG_REPLY, &rp, + sizeof(rp)); + + mgmt_pending_remove(cmd); + + return err; +} + +int mgmt_user_confirm_request(u16 index, bdaddr_t *bdaddr, __le32 value, + u8 confirm_hint) +{ + struct 
mgmt_ev_user_confirm_request ev; + + BT_DBG("hci%u", index); + + bacpy(&ev.bdaddr, bdaddr); + ev.confirm_hint = confirm_hint; + put_unaligned_le32(value, &ev.value); + + return mgmt_event(MGMT_EV_USER_CONFIRM_REQUEST, index, &ev, sizeof(ev), + NULL); +} + +static int confirm_reply_complete(u16 index, bdaddr_t *bdaddr, u8 status, + u8 opcode) +{ + struct pending_cmd *cmd; + struct mgmt_rp_user_confirm_reply rp; + int err; + + cmd = mgmt_pending_find(opcode, index); + if (!cmd) + return -ENOENT; + + bacpy(&rp.bdaddr, bdaddr); + rp.status = status; + err = cmd_complete(cmd->sk, index, opcode, &rp, sizeof(rp)); + + mgmt_pending_remove(cmd); + + return err; +} + +int mgmt_user_confirm_reply_complete(u16 index, bdaddr_t *bdaddr, u8 status) +{ + return confirm_reply_complete(index, bdaddr, status, + MGMT_OP_USER_CONFIRM_REPLY); +} + +int mgmt_user_confirm_neg_reply_complete(u16 index, bdaddr_t *bdaddr, u8 status) +{ + return confirm_reply_complete(index, bdaddr, status, + MGMT_OP_USER_CONFIRM_NEG_REPLY); +} + +int mgmt_auth_failed(u16 index, bdaddr_t *bdaddr, u8 status) +{ + struct mgmt_ev_auth_failed ev; + + bacpy(&ev.bdaddr, bdaddr); + ev.status = status; + + return mgmt_event(MGMT_EV_AUTH_FAILED, index, &ev, sizeof(ev), NULL); +} + +int mgmt_set_local_name_complete(u16 index, u8 *name, u8 status) +{ + struct pending_cmd *cmd; + struct hci_dev *hdev; + struct mgmt_cp_set_local_name ev; + int err; + + memset(&ev, 0, sizeof(ev)); + memcpy(ev.name, name, HCI_MAX_NAME_LENGTH); + + cmd = mgmt_pending_find(MGMT_OP_SET_LOCAL_NAME, index); + if (!cmd) + goto send_event; + + if (status) { + err = cmd_status(cmd->sk, index, MGMT_OP_SET_LOCAL_NAME, EIO); + goto failed; + } + + hdev = hci_dev_get(index); + if (hdev) { + hci_dev_lock_bh(hdev); + update_eir(hdev); + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + } + + err = cmd_complete(cmd->sk, index, MGMT_OP_SET_LOCAL_NAME, &ev, + sizeof(ev)); + if (err < 0) + goto failed; + +send_event: + err = mgmt_event(MGMT_EV_LOCAL_NAME_CHANGED, index, &ev, sizeof(ev), + cmd ? 
cmd->sk : NULL); + +failed: + if (cmd) + mgmt_pending_remove(cmd); + return err; +} + +int mgmt_read_local_oob_data_reply_complete(u16 index, u8 *hash, u8 *randomizer, + u8 status) +{ + struct pending_cmd *cmd; + int err; + + BT_DBG("hci%u status %u", index, status); + + cmd = mgmt_pending_find(MGMT_OP_READ_LOCAL_OOB_DATA, index); + if (!cmd) + return -ENOENT; + + if (status) { + err = cmd_status(cmd->sk, index, MGMT_OP_READ_LOCAL_OOB_DATA, + EIO); + } else { + struct mgmt_rp_read_local_oob_data rp; + + memcpy(rp.hash, hash, sizeof(rp.hash)); + memcpy(rp.randomizer, randomizer, sizeof(rp.randomizer)); + + err = cmd_complete(cmd->sk, index, MGMT_OP_READ_LOCAL_OOB_DATA, + &rp, sizeof(rp)); + } + + mgmt_pending_remove(cmd); + + return err; +} + +int mgmt_device_found(u16 index, bdaddr_t *bdaddr, u8 *dev_class, s8 rssi, + u8 *eir) +{ + struct mgmt_ev_device_found ev; + + memset(&ev, 0, sizeof(ev)); + + bacpy(&ev.bdaddr, bdaddr); + memcpy(ev.dev_class, dev_class, sizeof(ev.dev_class)); + ev.rssi = rssi; + + if (eir) + memcpy(ev.eir, eir, sizeof(ev.eir)); + + return mgmt_event(MGMT_EV_DEVICE_FOUND, index, &ev, sizeof(ev), NULL); +} + +int mgmt_remote_name(u16 index, bdaddr_t *bdaddr, u8 *name) +{ + struct mgmt_ev_remote_name ev; + + memset(&ev, 0, sizeof(ev)); + + bacpy(&ev.bdaddr, bdaddr); + memcpy(ev.name, name, HCI_MAX_NAME_LENGTH); + + return mgmt_event(MGMT_EV_REMOTE_NAME, index, &ev, sizeof(ev), NULL); +} + +int mgmt_discovering(u16 index, u8 discovering) +{ + return mgmt_event(MGMT_EV_DISCOVERING, index, &discovering, + sizeof(discovering), NULL); +} diff --git a/net/bluetooth/rfcomm/Kconfig b/net/bluetooth/rfcomm/Kconfig new file mode 100644 index 00000000..405a0e61 --- /dev/null +++ b/net/bluetooth/rfcomm/Kconfig @@ -0,0 +1,17 @@ +config BT_RFCOMM + tristate "RFCOMM protocol support" + depends on BT && BT_L2CAP + help + RFCOMM provides connection oriented stream transport. RFCOMM + support is required for Dialup Networking, OBEX and other Bluetooth + applications. + + Say Y here to compile RFCOMM support into the kernel or say M to + compile it as module (rfcomm). + +config BT_RFCOMM_TTY + bool "RFCOMM TTY support" + depends on BT_RFCOMM + help + This option enables TTY emulation support for RFCOMM channels. + diff --git a/net/bluetooth/rfcomm/Makefile b/net/bluetooth/rfcomm/Makefile new file mode 100644 index 00000000..fe07988a --- /dev/null +++ b/net/bluetooth/rfcomm/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the Linux Bluetooth RFCOMM layer. +# + +obj-$(CONFIG_BT_RFCOMM) += rfcomm.o + +rfcomm-y := core.o sock.o +rfcomm-$(CONFIG_BT_RFCOMM_TTY) += tty.o diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c new file mode 100644 index 00000000..c2486a53 --- /dev/null +++ b/net/bluetooth/rfcomm/core.c @@ -0,0 +1,2227 @@ +/* + RFCOMM implementation for Linux Bluetooth stack (BlueZ). + Copyright (C) 2002 Maxim Krasnyansky + Copyright (C) 2002 Marcel Holtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. 
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +/* + * Bluetooth RFCOMM core. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#define VERSION "1.11" + +static int disable_cfc; +static int l2cap_ertm; +static int channel_mtu = -1; +static unsigned int l2cap_mtu = RFCOMM_MAX_L2CAP_MTU; + +static struct task_struct *rfcomm_thread; + +static DEFINE_MUTEX(rfcomm_mutex); +#define rfcomm_lock() mutex_lock(&rfcomm_mutex) +#define rfcomm_unlock() mutex_unlock(&rfcomm_mutex) + + +static LIST_HEAD(session_list); + +static int rfcomm_send_frame(struct rfcomm_session *s, u8 *data, int len); +static int rfcomm_send_sabm(struct rfcomm_session *s, u8 dlci); +static int rfcomm_send_disc(struct rfcomm_session *s, u8 dlci); +static int rfcomm_queue_disc(struct rfcomm_dlc *d); +static int rfcomm_send_nsc(struct rfcomm_session *s, int cr, u8 type); +static int rfcomm_send_pn(struct rfcomm_session *s, int cr, struct rfcomm_dlc *d); +static int rfcomm_send_msc(struct rfcomm_session *s, int cr, u8 dlci, u8 v24_sig); +static int rfcomm_send_test(struct rfcomm_session *s, int cr, u8 *pattern, int len); +static int rfcomm_send_credits(struct rfcomm_session *s, u8 addr, u8 credits); +static void rfcomm_make_uih(struct sk_buff *skb, u8 addr); + +static void rfcomm_process_connect(struct rfcomm_session *s); + +static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, + bdaddr_t *dst, + u8 sec_level, + int *err); +static struct rfcomm_session *rfcomm_session_get(bdaddr_t *src, bdaddr_t *dst); +static void rfcomm_session_del(struct rfcomm_session *s); + +/* ---- RFCOMM frame parsing macros ---- */ +#define __get_dlci(b) ((b & 0xfc) >> 2) +#define __get_channel(b) ((b & 0xf8) >> 3) +#define __get_dir(b) ((b & 0x04) >> 2) +#define __get_type(b) ((b & 0xef)) + +#define __test_ea(b) ((b & 0x01)) +#define __test_cr(b) ((b & 0x02)) +#define __test_pf(b) ((b & 0x10)) + +#define __addr(cr, dlci) (((dlci & 0x3f) << 2) | (cr << 1) | 0x01) +#define __ctrl(type, pf) (((type & 0xef) | (pf << 4))) +#define __dlci(dir, chn) (((chn & 0x1f) << 1) | dir) +#define __srv_channel(dlci) (dlci >> 1) +#define __dir(dlci) (dlci & 0x01) + +#define __len8(len) (((len) << 1) | 1) +#define __len16(len) ((len) << 1) + +/* MCC macros */ +#define __mcc_type(cr, type) (((type << 2) | (cr << 1) | 0x01)) +#define __get_mcc_type(b) ((b & 0xfc) >> 2) +#define __get_mcc_len(b) ((b & 0xfe) >> 1) + +/* RPN macros */ +#define __rpn_line_settings(data, stop, parity) ((data & 0x3) | ((stop & 0x1) << 2) | ((parity & 0x7) << 3)) +#define __get_rpn_data_bits(line) ((line) & 0x3) +#define __get_rpn_stop_bits(line) (((line) >> 2) & 0x1) +#define __get_rpn_parity(line) (((line) >> 3) & 0x7) + +static inline void rfcomm_schedule(void) +{ + if (!rfcomm_thread) + return; + wake_up_process(rfcomm_thread); +} + +static inline void rfcomm_session_put(struct rfcomm_session *s) +{ + if (atomic_dec_and_test(&s->refcnt)) + 
rfcomm_session_del(s); +} + +/* ---- RFCOMM FCS computation ---- */ + +/* reversed, 8-bit, poly=0x07 */ +static unsigned char rfcomm_crc_table[256] = { + 0x00, 0x91, 0xe3, 0x72, 0x07, 0x96, 0xe4, 0x75, + 0x0e, 0x9f, 0xed, 0x7c, 0x09, 0x98, 0xea, 0x7b, + 0x1c, 0x8d, 0xff, 0x6e, 0x1b, 0x8a, 0xf8, 0x69, + 0x12, 0x83, 0xf1, 0x60, 0x15, 0x84, 0xf6, 0x67, + + 0x38, 0xa9, 0xdb, 0x4a, 0x3f, 0xae, 0xdc, 0x4d, + 0x36, 0xa7, 0xd5, 0x44, 0x31, 0xa0, 0xd2, 0x43, + 0x24, 0xb5, 0xc7, 0x56, 0x23, 0xb2, 0xc0, 0x51, + 0x2a, 0xbb, 0xc9, 0x58, 0x2d, 0xbc, 0xce, 0x5f, + + 0x70, 0xe1, 0x93, 0x02, 0x77, 0xe6, 0x94, 0x05, + 0x7e, 0xef, 0x9d, 0x0c, 0x79, 0xe8, 0x9a, 0x0b, + 0x6c, 0xfd, 0x8f, 0x1e, 0x6b, 0xfa, 0x88, 0x19, + 0x62, 0xf3, 0x81, 0x10, 0x65, 0xf4, 0x86, 0x17, + + 0x48, 0xd9, 0xab, 0x3a, 0x4f, 0xde, 0xac, 0x3d, + 0x46, 0xd7, 0xa5, 0x34, 0x41, 0xd0, 0xa2, 0x33, + 0x54, 0xc5, 0xb7, 0x26, 0x53, 0xc2, 0xb0, 0x21, + 0x5a, 0xcb, 0xb9, 0x28, 0x5d, 0xcc, 0xbe, 0x2f, + + 0xe0, 0x71, 0x03, 0x92, 0xe7, 0x76, 0x04, 0x95, + 0xee, 0x7f, 0x0d, 0x9c, 0xe9, 0x78, 0x0a, 0x9b, + 0xfc, 0x6d, 0x1f, 0x8e, 0xfb, 0x6a, 0x18, 0x89, + 0xf2, 0x63, 0x11, 0x80, 0xf5, 0x64, 0x16, 0x87, + + 0xd8, 0x49, 0x3b, 0xaa, 0xdf, 0x4e, 0x3c, 0xad, + 0xd6, 0x47, 0x35, 0xa4, 0xd1, 0x40, 0x32, 0xa3, + 0xc4, 0x55, 0x27, 0xb6, 0xc3, 0x52, 0x20, 0xb1, + 0xca, 0x5b, 0x29, 0xb8, 0xcd, 0x5c, 0x2e, 0xbf, + + 0x90, 0x01, 0x73, 0xe2, 0x97, 0x06, 0x74, 0xe5, + 0x9e, 0x0f, 0x7d, 0xec, 0x99, 0x08, 0x7a, 0xeb, + 0x8c, 0x1d, 0x6f, 0xfe, 0x8b, 0x1a, 0x68, 0xf9, + 0x82, 0x13, 0x61, 0xf0, 0x85, 0x14, 0x66, 0xf7, + + 0xa8, 0x39, 0x4b, 0xda, 0xaf, 0x3e, 0x4c, 0xdd, + 0xa6, 0x37, 0x45, 0xd4, 0xa1, 0x30, 0x42, 0xd3, + 0xb4, 0x25, 0x57, 0xc6, 0xb3, 0x22, 0x50, 0xc1, + 0xba, 0x2b, 0x59, 0xc8, 0xbd, 0x2c, 0x5e, 0xcf +}; + +/* CRC on 2 bytes */ +#define __crc(data) (rfcomm_crc_table[rfcomm_crc_table[0xff ^ data[0]] ^ data[1]]) + +/* FCS on 2 bytes */ +static inline u8 __fcs(u8 *data) +{ + return 0xff - __crc(data); +} + +/* FCS on 3 bytes */ +static inline u8 __fcs2(u8 *data) +{ + return 0xff - rfcomm_crc_table[__crc(data) ^ data[2]]; +} + +/* Check FCS */ +static inline int __check_fcs(u8 *data, int type, u8 fcs) +{ + u8 f = __crc(data); + + if (type != RFCOMM_UIH) + f = rfcomm_crc_table[f ^ data[2]]; + + return rfcomm_crc_table[f ^ fcs] != 0xcf; +} + +/* ---- L2CAP callbacks ---- */ +static void rfcomm_l2state_change(struct sock *sk) +{ + BT_DBG("%p state %d", sk, sk->sk_state); + rfcomm_schedule(); +} + +static void rfcomm_l2data_ready(struct sock *sk, int bytes) +{ + BT_DBG("%p bytes %d", sk, bytes); + rfcomm_schedule(); +} + +static int rfcomm_l2sock_create(struct socket **sock) +{ + int err; + + BT_DBG(""); + + err = sock_create_kern(PF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP, sock); + if (!err) { + struct sock *sk = (*sock)->sk; + sk->sk_data_ready = rfcomm_l2data_ready; + sk->sk_state_change = rfcomm_l2state_change; + } + return err; +} + +static inline int rfcomm_check_security(struct rfcomm_dlc *d) +{ + struct sock *sk = d->session->sock->sk; + struct l2cap_conn *conn = l2cap_pi(sk)->chan->conn; + + __u8 auth_type; + + switch (d->sec_level) { + case BT_SECURITY_HIGH: + auth_type = HCI_AT_GENERAL_BONDING_MITM; + break; + case BT_SECURITY_MEDIUM: + auth_type = HCI_AT_GENERAL_BONDING; + break; + default: + auth_type = HCI_AT_NO_BONDING; + break; + } + + return hci_conn_security(conn->hcon, d->sec_level, auth_type); +} + +static void rfcomm_session_timeout(unsigned long arg) +{ + struct rfcomm_session *s = (void *) arg; + + BT_DBG("session %p state %ld", s, 
s->state); + + set_bit(RFCOMM_TIMED_OUT, &s->flags); + rfcomm_schedule(); +} + +static void rfcomm_session_set_timer(struct rfcomm_session *s, long timeout) +{ + BT_DBG("session %p state %ld timeout %ld", s, s->state, timeout); + + if (!mod_timer(&s->timer, jiffies + timeout)) + rfcomm_session_hold(s); +} + +static void rfcomm_session_clear_timer(struct rfcomm_session *s) +{ + BT_DBG("session %p state %ld", s, s->state); + + if (timer_pending(&s->timer) && del_timer(&s->timer)) + rfcomm_session_put(s); +} + +/* ---- RFCOMM DLCs ---- */ +static void rfcomm_dlc_timeout(unsigned long arg) +{ + struct rfcomm_dlc *d = (void *) arg; + + BT_DBG("dlc %p state %ld", d, d->state); + + set_bit(RFCOMM_TIMED_OUT, &d->flags); + rfcomm_dlc_put(d); + rfcomm_schedule(); +} + +static void rfcomm_dlc_set_timer(struct rfcomm_dlc *d, long timeout) +{ + BT_DBG("dlc %p state %ld timeout %ld", d, d->state, timeout); + + if (!mod_timer(&d->timer, jiffies + timeout)) + rfcomm_dlc_hold(d); +} + +static void rfcomm_dlc_clear_timer(struct rfcomm_dlc *d) +{ + BT_DBG("dlc %p state %ld", d, d->state); + + if (timer_pending(&d->timer) && del_timer(&d->timer)) + rfcomm_dlc_put(d); +} + +static void rfcomm_dlc_clear_state(struct rfcomm_dlc *d) +{ + BT_DBG("%p", d); + + d->state = BT_OPEN; + d->flags = 0; + d->mscex = 0; + d->sec_level = BT_SECURITY_LOW; + d->mtu = RFCOMM_DEFAULT_MTU; + d->v24_sig = RFCOMM_V24_RTC | RFCOMM_V24_RTR | RFCOMM_V24_DV; + + d->cfc = RFCOMM_CFC_DISABLED; + d->rx_credits = RFCOMM_DEFAULT_CREDITS; +} + +struct rfcomm_dlc *rfcomm_dlc_alloc(gfp_t prio) +{ + struct rfcomm_dlc *d = kzalloc(sizeof(*d), prio); + + if (!d) + return NULL; + + setup_timer(&d->timer, rfcomm_dlc_timeout, (unsigned long)d); + + skb_queue_head_init(&d->tx_queue); + spin_lock_init(&d->lock); + atomic_set(&d->refcnt, 1); + + rfcomm_dlc_clear_state(d); + + BT_DBG("%p", d); + + return d; +} + +void rfcomm_dlc_free(struct rfcomm_dlc *d) +{ + BT_DBG("%p", d); + + skb_queue_purge(&d->tx_queue); + kfree(d); +} + +static void rfcomm_dlc_link(struct rfcomm_session *s, struct rfcomm_dlc *d) +{ + BT_DBG("dlc %p session %p", d, s); + + rfcomm_session_hold(s); + + rfcomm_session_clear_timer(s); + rfcomm_dlc_hold(d); + list_add(&d->list, &s->dlcs); + d->session = s; +} + +static void rfcomm_dlc_unlink(struct rfcomm_dlc *d) +{ + struct rfcomm_session *s = d->session; + + BT_DBG("dlc %p refcnt %d session %p", d, atomic_read(&d->refcnt), s); + + list_del(&d->list); + d->session = NULL; + rfcomm_dlc_put(d); + + if (list_empty(&s->dlcs)) + rfcomm_session_set_timer(s, RFCOMM_IDLE_TIMEOUT); + + rfcomm_session_put(s); +} + +static struct rfcomm_dlc *rfcomm_dlc_get(struct rfcomm_session *s, u8 dlci) +{ + struct rfcomm_dlc *d; + struct list_head *p; + + list_for_each(p, &s->dlcs) { + d = list_entry(p, struct rfcomm_dlc, list); + if (d->dlci == dlci) + return d; + } + return NULL; +} + +static int __rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst, u8 channel) +{ + struct rfcomm_session *s; + int err = 0; + u8 dlci; + + BT_DBG("dlc %p state %ld %s %s channel %d", + d, d->state, batostr(src), batostr(dst), channel); + + if (channel < 1 || channel > 30) + return -EINVAL; + + if (d->state != BT_OPEN && d->state != BT_CLOSED) + return 0; + + s = rfcomm_session_get(src, dst); + if (!s) { + s = rfcomm_session_create(src, dst, d->sec_level, &err); + if (!s) + return err; + } + + dlci = __dlci(!s->initiator, channel); + + /* Check if DLCI already exists */ + if (rfcomm_dlc_get(s, dlci)) + return -EBUSY; + + rfcomm_dlc_clear_state(d); + + d->dlci 
= dlci; + d->addr = __addr(s->initiator, dlci); + d->priority = 7; + + d->state = BT_CONFIG; + rfcomm_dlc_link(s, d); + + d->out = 1; + + d->mtu = s->mtu; + d->cfc = (s->cfc == RFCOMM_CFC_UNKNOWN) ? 0 : s->cfc; + + if (s->state == BT_CONNECTED) { + if (rfcomm_check_security(d)) + rfcomm_send_pn(s, 1, d); + else + set_bit(RFCOMM_AUTH_PENDING, &d->flags); + } + + rfcomm_dlc_set_timer(d, RFCOMM_CONN_TIMEOUT); + + return 0; +} + +int rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst, u8 channel) +{ + int r; + + rfcomm_lock(); + + r = __rfcomm_dlc_open(d, src, dst, channel); + + rfcomm_unlock(); + return r; +} + +static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err) +{ + struct rfcomm_session *s = d->session; + if (!s) + return 0; + + BT_DBG("dlc %p state %ld dlci %d err %d session %p", + d, d->state, d->dlci, err, s); + + switch (d->state) { + case BT_CONNECT: + if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) { + set_bit(RFCOMM_AUTH_REJECT, &d->flags); + rfcomm_schedule(); + break; + } + /* Fall through */ + + case BT_CONNECTED: + d->state = BT_DISCONN; + if (skb_queue_empty(&d->tx_queue)) { + rfcomm_send_disc(s, d->dlci); + rfcomm_dlc_set_timer(d, RFCOMM_DISC_TIMEOUT); + } else { + rfcomm_queue_disc(d); + rfcomm_dlc_set_timer(d, RFCOMM_DISC_TIMEOUT * 2); + } + break; + + case BT_OPEN: + case BT_CONNECT2: + if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) { + set_bit(RFCOMM_AUTH_REJECT, &d->flags); + rfcomm_schedule(); + break; + } + /* Fall through */ + + default: + rfcomm_dlc_clear_timer(d); + + rfcomm_dlc_lock(d); + d->state = BT_CLOSED; + d->state_change(d, err); + rfcomm_dlc_unlock(d); + + skb_queue_purge(&d->tx_queue); + rfcomm_dlc_unlink(d); + } + + return 0; +} + +int rfcomm_dlc_close(struct rfcomm_dlc *d, int err) +{ + int r; + + rfcomm_lock(); + + r = __rfcomm_dlc_close(d, err); + + rfcomm_unlock(); + return r; +} + +int rfcomm_dlc_send(struct rfcomm_dlc *d, struct sk_buff *skb) +{ + int len = skb->len; + + if (d->state != BT_CONNECTED) + return -ENOTCONN; + + BT_DBG("dlc %p mtu %d len %d", d, d->mtu, len); + + if (len > d->mtu) + return -EINVAL; + + rfcomm_make_uih(skb, d->addr); + skb_queue_tail(&d->tx_queue, skb); + + if (!test_bit(RFCOMM_TX_THROTTLED, &d->flags)) + rfcomm_schedule(); + return len; +} + +void __rfcomm_dlc_throttle(struct rfcomm_dlc *d) +{ + BT_DBG("dlc %p state %ld", d, d->state); + + if (!d->cfc) { + d->v24_sig |= RFCOMM_V24_FC; + set_bit(RFCOMM_MSC_PENDING, &d->flags); + } + rfcomm_schedule(); +} + +void __rfcomm_dlc_unthrottle(struct rfcomm_dlc *d) +{ + BT_DBG("dlc %p state %ld", d, d->state); + + if (!d->cfc) { + d->v24_sig &= ~RFCOMM_V24_FC; + set_bit(RFCOMM_MSC_PENDING, &d->flags); + } + rfcomm_schedule(); +} + +/* + Set/get modem status functions use _local_ status i.e. what we report + to the other side. + Remote status is provided by dlc->modem_status() callback. 
+ */ +int rfcomm_dlc_set_modem_status(struct rfcomm_dlc *d, u8 v24_sig) +{ + BT_DBG("dlc %p state %ld v24_sig 0x%x", + d, d->state, v24_sig); + + if (test_bit(RFCOMM_RX_THROTTLED, &d->flags)) + v24_sig |= RFCOMM_V24_FC; + else + v24_sig &= ~RFCOMM_V24_FC; + + d->v24_sig = v24_sig; + + if (!test_and_set_bit(RFCOMM_MSC_PENDING, &d->flags)) + rfcomm_schedule(); + + return 0; +} + +int rfcomm_dlc_get_modem_status(struct rfcomm_dlc *d, u8 *v24_sig) +{ + BT_DBG("dlc %p state %ld v24_sig 0x%x", + d, d->state, d->v24_sig); + + *v24_sig = d->v24_sig; + return 0; +} + +/* ---- RFCOMM sessions ---- */ +static struct rfcomm_session *rfcomm_session_add(struct socket *sock, int state) +{ + struct rfcomm_session *s = kzalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + return NULL; + + BT_DBG("session %p sock %p", s, sock); + + setup_timer(&s->timer, rfcomm_session_timeout, (unsigned long) s); + + INIT_LIST_HEAD(&s->dlcs); + s->state = state; + s->sock = sock; + + s->mtu = RFCOMM_DEFAULT_MTU; + s->cfc = disable_cfc ? RFCOMM_CFC_DISABLED : RFCOMM_CFC_UNKNOWN; + + /* Do not increment module usage count for listening sessions. + * Otherwise we won't be able to unload the module. */ + if (state != BT_LISTEN) + if (!try_module_get(THIS_MODULE)) { + kfree(s); + return NULL; + } + + list_add(&s->list, &session_list); + + return s; +} + +static void rfcomm_session_del(struct rfcomm_session *s) +{ + int state = s->state; + + BT_DBG("session %p state %ld", s, s->state); + + list_del(&s->list); + + if (state == BT_CONNECTED) + rfcomm_send_disc(s, 0); + + rfcomm_session_clear_timer(s); + sock_release(s->sock); + kfree(s); + + if (state != BT_LISTEN) + module_put(THIS_MODULE); +} + +static struct rfcomm_session *rfcomm_session_get(bdaddr_t *src, bdaddr_t *dst) +{ + struct rfcomm_session *s; + struct list_head *p, *n; + struct bt_sock *sk; + list_for_each_safe(p, n, &session_list) { + s = list_entry(p, struct rfcomm_session, list); + sk = bt_sk(s->sock->sk); + + if ((!bacmp(src, BDADDR_ANY) || !bacmp(&sk->src, src)) && + !bacmp(&sk->dst, dst)) + return s; + } + return NULL; +} + +static void rfcomm_session_close(struct rfcomm_session *s, int err) +{ + struct rfcomm_dlc *d; + struct list_head *p, *n; + + BT_DBG("session %p state %ld err %d", s, s->state, err); + + rfcomm_session_hold(s); + + s->state = BT_CLOSED; + + /* Close all dlcs */ + list_for_each_safe(p, n, &s->dlcs) { + d = list_entry(p, struct rfcomm_dlc, list); + d->state = BT_CLOSED; + __rfcomm_dlc_close(d, err); + } + + rfcomm_session_clear_timer(s); + rfcomm_session_put(s); +} + +static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, + bdaddr_t *dst, + u8 sec_level, + int *err) +{ + struct rfcomm_session *s = NULL; + struct sockaddr_l2 addr; + struct socket *sock; + struct sock *sk; + + BT_DBG("%s %s", batostr(src), batostr(dst)); + + *err = rfcomm_l2sock_create(&sock); + if (*err < 0) + return NULL; + + bacpy(&addr.l2_bdaddr, src); + addr.l2_family = AF_BLUETOOTH; + addr.l2_psm = 0; + addr.l2_cid = 0; + *err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr)); + if (*err < 0) + goto failed; + + /* Set L2CAP options */ + sk = sock->sk; + lock_sock(sk); + l2cap_pi(sk)->chan->imtu = l2cap_mtu; + l2cap_pi(sk)->chan->sec_level = sec_level; + if (l2cap_ertm) + l2cap_pi(sk)->chan->mode = L2CAP_MODE_ERTM; + release_sock(sk); + + s = rfcomm_session_add(sock, BT_BOUND); + if (!s) { + *err = -ENOMEM; + goto failed; + } + + s->initiator = 1; + + bacpy(&addr.l2_bdaddr, dst); + addr.l2_family = AF_BLUETOOTH; + addr.l2_psm = cpu_to_le16(RFCOMM_PSM); + 
addr.l2_cid = 0; + *err = kernel_connect(sock, (struct sockaddr *) &addr, sizeof(addr), O_NONBLOCK); + if (*err == 0 || *err == -EINPROGRESS) + return s; + + rfcomm_session_del(s); + return NULL; + +failed: + sock_release(sock); + return NULL; +} + +void rfcomm_session_getaddr(struct rfcomm_session *s, bdaddr_t *src, bdaddr_t *dst) +{ + struct sock *sk = s->sock->sk; + if (src) + bacpy(src, &bt_sk(sk)->src); + if (dst) + bacpy(dst, &bt_sk(sk)->dst); +} + +/* ---- RFCOMM frame sending ---- */ +static int rfcomm_send_frame(struct rfcomm_session *s, u8 *data, int len) +{ + struct socket *sock = s->sock; + struct kvec iv = { data, len }; + struct msghdr msg; + + BT_DBG("session %p len %d", s, len); + + memset(&msg, 0, sizeof(msg)); + + return kernel_sendmsg(sock, &msg, &iv, 1, len); +} + +static int rfcomm_send_sabm(struct rfcomm_session *s, u8 dlci) +{ + struct rfcomm_cmd cmd; + + BT_DBG("%p dlci %d", s, dlci); + + cmd.addr = __addr(s->initiator, dlci); + cmd.ctrl = __ctrl(RFCOMM_SABM, 1); + cmd.len = __len8(0); + cmd.fcs = __fcs2((u8 *) &cmd); + + return rfcomm_send_frame(s, (void *) &cmd, sizeof(cmd)); +} + +static int rfcomm_send_ua(struct rfcomm_session *s, u8 dlci) +{ + struct rfcomm_cmd cmd; + + BT_DBG("%p dlci %d", s, dlci); + + cmd.addr = __addr(!s->initiator, dlci); + cmd.ctrl = __ctrl(RFCOMM_UA, 1); + cmd.len = __len8(0); + cmd.fcs = __fcs2((u8 *) &cmd); + + return rfcomm_send_frame(s, (void *) &cmd, sizeof(cmd)); +} + +static int rfcomm_send_disc(struct rfcomm_session *s, u8 dlci) +{ + struct rfcomm_cmd cmd; + + BT_DBG("%p dlci %d", s, dlci); + + cmd.addr = __addr(s->initiator, dlci); + cmd.ctrl = __ctrl(RFCOMM_DISC, 1); + cmd.len = __len8(0); + cmd.fcs = __fcs2((u8 *) &cmd); + + return rfcomm_send_frame(s, (void *) &cmd, sizeof(cmd)); +} + +static int rfcomm_queue_disc(struct rfcomm_dlc *d) +{ + struct rfcomm_cmd *cmd; + struct sk_buff *skb; + + BT_DBG("dlc %p dlci %d", d, d->dlci); + + skb = alloc_skb(sizeof(*cmd), GFP_KERNEL); + if (!skb) + return -ENOMEM; + + cmd = (void *) __skb_put(skb, sizeof(*cmd)); + cmd->addr = d->addr; + cmd->ctrl = __ctrl(RFCOMM_DISC, 1); + cmd->len = __len8(0); + cmd->fcs = __fcs2((u8 *) cmd); + + skb_queue_tail(&d->tx_queue, skb); + rfcomm_schedule(); + return 0; +} + +static int rfcomm_send_dm(struct rfcomm_session *s, u8 dlci) +{ + struct rfcomm_cmd cmd; + + BT_DBG("%p dlci %d", s, dlci); + + cmd.addr = __addr(!s->initiator, dlci); + cmd.ctrl = __ctrl(RFCOMM_DM, 1); + cmd.len = __len8(0); + cmd.fcs = __fcs2((u8 *) &cmd); + + return rfcomm_send_frame(s, (void *) &cmd, sizeof(cmd)); +} + +static int rfcomm_send_nsc(struct rfcomm_session *s, int cr, u8 type) +{ + struct rfcomm_hdr *hdr; + struct rfcomm_mcc *mcc; + u8 buf[16], *ptr = buf; + + BT_DBG("%p cr %d type %d", s, cr, type); + + hdr = (void *) ptr; ptr += sizeof(*hdr); + hdr->addr = __addr(s->initiator, 0); + hdr->ctrl = __ctrl(RFCOMM_UIH, 0); + hdr->len = __len8(sizeof(*mcc) + 1); + + mcc = (void *) ptr; ptr += sizeof(*mcc); + mcc->type = __mcc_type(cr, RFCOMM_NSC); + mcc->len = __len8(1); + + /* Type that we didn't like */ + *ptr = __mcc_type(cr, type); ptr++; + + *ptr = __fcs(buf); ptr++; + + return rfcomm_send_frame(s, buf, ptr - buf); +} + +static int rfcomm_send_pn(struct rfcomm_session *s, int cr, struct rfcomm_dlc *d) +{ + struct rfcomm_hdr *hdr; + struct rfcomm_mcc *mcc; + struct rfcomm_pn *pn; + u8 buf[16], *ptr = buf; + + BT_DBG("%p cr %d dlci %d mtu %d", s, cr, d->dlci, d->mtu); + + hdr = (void *) ptr; ptr += sizeof(*hdr); + hdr->addr = __addr(s->initiator, 0); + hdr->ctrl = 
__ctrl(RFCOMM_UIH, 0); + hdr->len = __len8(sizeof(*mcc) + sizeof(*pn)); + + mcc = (void *) ptr; ptr += sizeof(*mcc); + mcc->type = __mcc_type(cr, RFCOMM_PN); + mcc->len = __len8(sizeof(*pn)); + + pn = (void *) ptr; ptr += sizeof(*pn); + pn->dlci = d->dlci; + pn->priority = d->priority; + pn->ack_timer = 0; + pn->max_retrans = 0; + + if (s->cfc) { + pn->flow_ctrl = cr ? 0xf0 : 0xe0; + pn->credits = RFCOMM_DEFAULT_CREDITS; + } else { + pn->flow_ctrl = 0; + pn->credits = 0; + } + + if (cr && channel_mtu >= 0) + pn->mtu = cpu_to_le16(channel_mtu); + else + pn->mtu = cpu_to_le16(d->mtu); + + *ptr = __fcs(buf); ptr++; + + return rfcomm_send_frame(s, buf, ptr - buf); +} + +int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci, + u8 bit_rate, u8 data_bits, u8 stop_bits, + u8 parity, u8 flow_ctrl_settings, + u8 xon_char, u8 xoff_char, u16 param_mask) +{ + struct rfcomm_hdr *hdr; + struct rfcomm_mcc *mcc; + struct rfcomm_rpn *rpn; + u8 buf[16], *ptr = buf; + + BT_DBG("%p cr %d dlci %d bit_r 0x%x data_b 0x%x stop_b 0x%x parity 0x%x" + " flwc_s 0x%x xon_c 0x%x xoff_c 0x%x p_mask 0x%x", + s, cr, dlci, bit_rate, data_bits, stop_bits, parity, + flow_ctrl_settings, xon_char, xoff_char, param_mask); + + hdr = (void *) ptr; ptr += sizeof(*hdr); + hdr->addr = __addr(s->initiator, 0); + hdr->ctrl = __ctrl(RFCOMM_UIH, 0); + hdr->len = __len8(sizeof(*mcc) + sizeof(*rpn)); + + mcc = (void *) ptr; ptr += sizeof(*mcc); + mcc->type = __mcc_type(cr, RFCOMM_RPN); + mcc->len = __len8(sizeof(*rpn)); + + rpn = (void *) ptr; ptr += sizeof(*rpn); + rpn->dlci = __addr(1, dlci); + rpn->bit_rate = bit_rate; + rpn->line_settings = __rpn_line_settings(data_bits, stop_bits, parity); + rpn->flow_ctrl = flow_ctrl_settings; + rpn->xon_char = xon_char; + rpn->xoff_char = xoff_char; + rpn->param_mask = cpu_to_le16(param_mask); + + *ptr = __fcs(buf); ptr++; + + return rfcomm_send_frame(s, buf, ptr - buf); +} + +static int rfcomm_send_rls(struct rfcomm_session *s, int cr, u8 dlci, u8 status) +{ + struct rfcomm_hdr *hdr; + struct rfcomm_mcc *mcc; + struct rfcomm_rls *rls; + u8 buf[16], *ptr = buf; + + BT_DBG("%p cr %d status 0x%x", s, cr, status); + + hdr = (void *) ptr; ptr += sizeof(*hdr); + hdr->addr = __addr(s->initiator, 0); + hdr->ctrl = __ctrl(RFCOMM_UIH, 0); + hdr->len = __len8(sizeof(*mcc) + sizeof(*rls)); + + mcc = (void *) ptr; ptr += sizeof(*mcc); + mcc->type = __mcc_type(cr, RFCOMM_RLS); + mcc->len = __len8(sizeof(*rls)); + + rls = (void *) ptr; ptr += sizeof(*rls); + rls->dlci = __addr(1, dlci); + rls->status = status; + + *ptr = __fcs(buf); ptr++; + + return rfcomm_send_frame(s, buf, ptr - buf); +} + +static int rfcomm_send_msc(struct rfcomm_session *s, int cr, u8 dlci, u8 v24_sig) +{ + struct rfcomm_hdr *hdr; + struct rfcomm_mcc *mcc; + struct rfcomm_msc *msc; + u8 buf[16], *ptr = buf; + + BT_DBG("%p cr %d v24 0x%x", s, cr, v24_sig); + + hdr = (void *) ptr; ptr += sizeof(*hdr); + hdr->addr = __addr(s->initiator, 0); + hdr->ctrl = __ctrl(RFCOMM_UIH, 0); + hdr->len = __len8(sizeof(*mcc) + sizeof(*msc)); + + mcc = (void *) ptr; ptr += sizeof(*mcc); + mcc->type = __mcc_type(cr, RFCOMM_MSC); + mcc->len = __len8(sizeof(*msc)); + + msc = (void *) ptr; ptr += sizeof(*msc); + msc->dlci = __addr(1, dlci); + msc->v24_sig = v24_sig | 0x01; + + *ptr = __fcs(buf); ptr++; + + return rfcomm_send_frame(s, buf, ptr - buf); +} + +static int rfcomm_send_fcoff(struct rfcomm_session *s, int cr) +{ + struct rfcomm_hdr *hdr; + struct rfcomm_mcc *mcc; + u8 buf[16], *ptr = buf; + + BT_DBG("%p cr %d", s, cr); + + hdr = (void *) ptr; ptr 
+= sizeof(*hdr); + hdr->addr = __addr(s->initiator, 0); + hdr->ctrl = __ctrl(RFCOMM_UIH, 0); + hdr->len = __len8(sizeof(*mcc)); + + mcc = (void *) ptr; ptr += sizeof(*mcc); + mcc->type = __mcc_type(cr, RFCOMM_FCOFF); + mcc->len = __len8(0); + + *ptr = __fcs(buf); ptr++; + + return rfcomm_send_frame(s, buf, ptr - buf); +} + +static int rfcomm_send_fcon(struct rfcomm_session *s, int cr) +{ + struct rfcomm_hdr *hdr; + struct rfcomm_mcc *mcc; + u8 buf[16], *ptr = buf; + + BT_DBG("%p cr %d", s, cr); + + hdr = (void *) ptr; ptr += sizeof(*hdr); + hdr->addr = __addr(s->initiator, 0); + hdr->ctrl = __ctrl(RFCOMM_UIH, 0); + hdr->len = __len8(sizeof(*mcc)); + + mcc = (void *) ptr; ptr += sizeof(*mcc); + mcc->type = __mcc_type(cr, RFCOMM_FCON); + mcc->len = __len8(0); + + *ptr = __fcs(buf); ptr++; + + return rfcomm_send_frame(s, buf, ptr - buf); +} + +static int rfcomm_send_test(struct rfcomm_session *s, int cr, u8 *pattern, int len) +{ + struct socket *sock = s->sock; + struct kvec iv[3]; + struct msghdr msg; + unsigned char hdr[5], crc[1]; + + if (len > 125) + return -EINVAL; + + BT_DBG("%p cr %d", s, cr); + + hdr[0] = __addr(s->initiator, 0); + hdr[1] = __ctrl(RFCOMM_UIH, 0); + hdr[2] = 0x01 | ((len + 2) << 1); + hdr[3] = 0x01 | ((cr & 0x01) << 1) | (RFCOMM_TEST << 2); + hdr[4] = 0x01 | (len << 1); + + crc[0] = __fcs(hdr); + + iv[0].iov_base = hdr; + iv[0].iov_len = 5; + iv[1].iov_base = pattern; + iv[1].iov_len = len; + iv[2].iov_base = crc; + iv[2].iov_len = 1; + + memset(&msg, 0, sizeof(msg)); + + return kernel_sendmsg(sock, &msg, iv, 3, 6 + len); +} + +static int rfcomm_send_credits(struct rfcomm_session *s, u8 addr, u8 credits) +{ + struct rfcomm_hdr *hdr; + u8 buf[16], *ptr = buf; + + BT_DBG("%p addr %d credits %d", s, addr, credits); + + hdr = (void *) ptr; ptr += sizeof(*hdr); + hdr->addr = addr; + hdr->ctrl = __ctrl(RFCOMM_UIH, 1); + hdr->len = __len8(0); + + *ptr = credits; ptr++; + + *ptr = __fcs(buf); ptr++; + + return rfcomm_send_frame(s, buf, ptr - buf); +} + +static void rfcomm_make_uih(struct sk_buff *skb, u8 addr) +{ + struct rfcomm_hdr *hdr; + int len = skb->len; + u8 *crc; + + if (len > 127) { + hdr = (void *) skb_push(skb, 4); + put_unaligned(cpu_to_le16(__len16(len)), (__le16 *) &hdr->len); + } else { + hdr = (void *) skb_push(skb, 3); + hdr->len = __len8(len); + } + hdr->addr = addr; + hdr->ctrl = __ctrl(RFCOMM_UIH, 0); + + crc = skb_put(skb, 1); + *crc = __fcs((void *) hdr); +} + +/* ---- RFCOMM frame reception ---- */ +static int rfcomm_recv_ua(struct rfcomm_session *s, u8 dlci) +{ + BT_DBG("session %p state %ld dlci %d", s, s->state, dlci); + + if (dlci) { + /* Data channel */ + struct rfcomm_dlc *d = rfcomm_dlc_get(s, dlci); + if (!d) { + rfcomm_send_dm(s, dlci); + return 0; + } + + switch (d->state) { + case BT_CONNECT: + rfcomm_dlc_clear_timer(d); + + rfcomm_dlc_lock(d); + d->state = BT_CONNECTED; + d->state_change(d, 0); + rfcomm_dlc_unlock(d); + + rfcomm_send_msc(s, 1, dlci, d->v24_sig); + break; + + case BT_DISCONN: + d->state = BT_CLOSED; + __rfcomm_dlc_close(d, 0); + + if (list_empty(&s->dlcs)) { + s->state = BT_DISCONN; + rfcomm_send_disc(s, 0); + } + + break; + } + } else { + /* Control channel */ + switch (s->state) { + case BT_CONNECT: + s->state = BT_CONNECTED; + rfcomm_process_connect(s); + break; + + case BT_DISCONN: + /* When socket is closed and we are not RFCOMM + * initiator rfcomm_process_rx already calls + * rfcomm_session_put() */ + if (s->sock->sk->sk_state != BT_CLOSED) + if (list_empty(&s->dlcs)) + rfcomm_session_put(s); + break; + } + } + return 
0; +} + +static int rfcomm_recv_dm(struct rfcomm_session *s, u8 dlci) +{ + int err = 0; + + BT_DBG("session %p state %ld dlci %d", s, s->state, dlci); + + if (dlci) { + /* Data DLC */ + struct rfcomm_dlc *d = rfcomm_dlc_get(s, dlci); + if (d) { + if (d->state == BT_CONNECT || d->state == BT_CONFIG) + err = ECONNREFUSED; + else + err = ECONNRESET; + + d->state = BT_CLOSED; + __rfcomm_dlc_close(d, err); + } + } else { + if (s->state == BT_CONNECT) + err = ECONNREFUSED; + else + err = ECONNRESET; + + s->state = BT_CLOSED; + rfcomm_session_close(s, err); + } + return 0; +} + +static int rfcomm_recv_disc(struct rfcomm_session *s, u8 dlci) +{ + int err = 0; + + BT_DBG("session %p state %ld dlci %d", s, s->state, dlci); + + if (dlci) { + struct rfcomm_dlc *d = rfcomm_dlc_get(s, dlci); + if (d) { + rfcomm_send_ua(s, dlci); + + if (d->state == BT_CONNECT || d->state == BT_CONFIG) + err = ECONNREFUSED; + else + err = ECONNRESET; + + d->state = BT_CLOSED; + __rfcomm_dlc_close(d, err); + } else + rfcomm_send_dm(s, dlci); + + } else { + rfcomm_send_ua(s, 0); + + if (s->state == BT_CONNECT) + err = ECONNREFUSED; + else + err = ECONNRESET; + + s->state = BT_CLOSED; + rfcomm_session_close(s, err); + } + + return 0; +} + +void rfcomm_dlc_accept(struct rfcomm_dlc *d) +{ + struct sock *sk = d->session->sock->sk; + struct l2cap_conn *conn = l2cap_pi(sk)->chan->conn; + + BT_DBG("dlc %p", d); + + rfcomm_send_ua(d->session, d->dlci); + + rfcomm_dlc_clear_timer(d); + + rfcomm_dlc_lock(d); + d->state = BT_CONNECTED; + d->state_change(d, 0); + rfcomm_dlc_unlock(d); + + if (d->role_switch) + hci_conn_switch_role(conn->hcon, 0x00); + + rfcomm_send_msc(d->session, 1, d->dlci, d->v24_sig); +} + +static void rfcomm_check_accept(struct rfcomm_dlc *d) +{ + if (rfcomm_check_security(d)) { + if (d->defer_setup) { + set_bit(RFCOMM_DEFER_SETUP, &d->flags); + rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); + + rfcomm_dlc_lock(d); + d->state = BT_CONNECT2; + d->state_change(d, 0); + rfcomm_dlc_unlock(d); + } else + rfcomm_dlc_accept(d); + } else { + set_bit(RFCOMM_AUTH_PENDING, &d->flags); + rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); + } +} + +static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci) +{ + struct rfcomm_dlc *d; + u8 channel; + + BT_DBG("session %p state %ld dlci %d", s, s->state, dlci); + + if (!dlci) { + rfcomm_send_ua(s, 0); + + if (s->state == BT_OPEN) { + s->state = BT_CONNECTED; + rfcomm_process_connect(s); + } + return 0; + } + + /* Check if DLC exists */ + d = rfcomm_dlc_get(s, dlci); + if (d) { + if (d->state == BT_OPEN) { + /* DLC was previously opened by PN request */ + rfcomm_check_accept(d); + } + return 0; + } + + /* Notify socket layer about incoming connection */ + channel = __srv_channel(dlci); + if (rfcomm_connect_ind(s, channel, &d)) { + d->dlci = dlci; + d->addr = __addr(s->initiator, dlci); + rfcomm_dlc_link(s, d); + + rfcomm_check_accept(d); + } else { + rfcomm_send_dm(s, dlci); + } + + return 0; +} + +static int rfcomm_apply_pn(struct rfcomm_dlc *d, int cr, struct rfcomm_pn *pn) +{ + struct rfcomm_session *s = d->session; + + BT_DBG("dlc %p state %ld dlci %d mtu %d fc 0x%x credits %d", + d, d->state, d->dlci, pn->mtu, pn->flow_ctrl, pn->credits); + + if ((pn->flow_ctrl == 0xf0 && s->cfc != RFCOMM_CFC_DISABLED) || + pn->flow_ctrl == 0xe0) { + d->cfc = RFCOMM_CFC_ENABLED; + d->tx_credits = pn->credits; + } else { + d->cfc = RFCOMM_CFC_DISABLED; + set_bit(RFCOMM_TX_THROTTLED, &d->flags); + } + + if (s->cfc == RFCOMM_CFC_UNKNOWN) + s->cfc = d->cfc; + + d->priority = pn->priority; + + 
d->mtu = __le16_to_cpu(pn->mtu); + + if (cr && d->mtu > s->mtu) + d->mtu = s->mtu; + + return 0; +} + +static int rfcomm_recv_pn(struct rfcomm_session *s, int cr, struct sk_buff *skb) +{ + struct rfcomm_pn *pn = (void *) skb->data; + struct rfcomm_dlc *d; + u8 dlci = pn->dlci; + + BT_DBG("session %p state %ld dlci %d", s, s->state, dlci); + + if (!dlci) + return 0; + + d = rfcomm_dlc_get(s, dlci); + if (d) { + if (cr) { + /* PN request */ + rfcomm_apply_pn(d, cr, pn); + rfcomm_send_pn(s, 0, d); + } else { + /* PN response */ + switch (d->state) { + case BT_CONFIG: + rfcomm_apply_pn(d, cr, pn); + + d->state = BT_CONNECT; + rfcomm_send_sabm(s, d->dlci); + break; + } + } + } else { + u8 channel = __srv_channel(dlci); + + if (!cr) + return 0; + + /* PN request for non existing DLC. + * Assume incoming connection. */ + if (rfcomm_connect_ind(s, channel, &d)) { + d->dlci = dlci; + d->addr = __addr(s->initiator, dlci); + rfcomm_dlc_link(s, d); + + rfcomm_apply_pn(d, cr, pn); + + d->state = BT_OPEN; + rfcomm_send_pn(s, 0, d); + } else { + rfcomm_send_dm(s, dlci); + } + } + return 0; +} + +static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_buff *skb) +{ + struct rfcomm_rpn *rpn = (void *) skb->data; + u8 dlci = __get_dlci(rpn->dlci); + + u8 bit_rate = 0; + u8 data_bits = 0; + u8 stop_bits = 0; + u8 parity = 0; + u8 flow_ctrl = 0; + u8 xon_char = 0; + u8 xoff_char = 0; + u16 rpn_mask = RFCOMM_RPN_PM_ALL; + + BT_DBG("dlci %d cr %d len 0x%x bitr 0x%x line 0x%x flow 0x%x xonc 0x%x xoffc 0x%x pm 0x%x", + dlci, cr, len, rpn->bit_rate, rpn->line_settings, rpn->flow_ctrl, + rpn->xon_char, rpn->xoff_char, rpn->param_mask); + + if (!cr) + return 0; + + if (len == 1) { + /* This is a request, return default (according to ETSI TS 07.10) settings */ + bit_rate = RFCOMM_RPN_BR_9600; + data_bits = RFCOMM_RPN_DATA_8; + stop_bits = RFCOMM_RPN_STOP_1; + parity = RFCOMM_RPN_PARITY_NONE; + flow_ctrl = RFCOMM_RPN_FLOW_NONE; + xon_char = RFCOMM_RPN_XON_CHAR; + xoff_char = RFCOMM_RPN_XOFF_CHAR; + goto rpn_out; + } + + /* Check for sane values, ignore/accept bit_rate, 8 bits, 1 stop bit, + * no parity, no flow control lines, normal XON/XOFF chars */ + + if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_BITRATE)) { + bit_rate = rpn->bit_rate; + if (bit_rate > RFCOMM_RPN_BR_230400) { + BT_DBG("RPN bit rate mismatch 0x%x", bit_rate); + bit_rate = RFCOMM_RPN_BR_9600; + rpn_mask ^= RFCOMM_RPN_PM_BITRATE; + } + } + + if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_DATA)) { + data_bits = __get_rpn_data_bits(rpn->line_settings); + if (data_bits != RFCOMM_RPN_DATA_8) { + BT_DBG("RPN data bits mismatch 0x%x", data_bits); + data_bits = RFCOMM_RPN_DATA_8; + rpn_mask ^= RFCOMM_RPN_PM_DATA; + } + } + + if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_STOP)) { + stop_bits = __get_rpn_stop_bits(rpn->line_settings); + if (stop_bits != RFCOMM_RPN_STOP_1) { + BT_DBG("RPN stop bits mismatch 0x%x", stop_bits); + stop_bits = RFCOMM_RPN_STOP_1; + rpn_mask ^= RFCOMM_RPN_PM_STOP; + } + } + + if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_PARITY)) { + parity = __get_rpn_parity(rpn->line_settings); + if (parity != RFCOMM_RPN_PARITY_NONE) { + BT_DBG("RPN parity mismatch 0x%x", parity); + parity = RFCOMM_RPN_PARITY_NONE; + rpn_mask ^= RFCOMM_RPN_PM_PARITY; + } + } + + if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_FLOW)) { + flow_ctrl = rpn->flow_ctrl; + if (flow_ctrl != RFCOMM_RPN_FLOW_NONE) { + BT_DBG("RPN flow ctrl mismatch 0x%x", flow_ctrl); + flow_ctrl = RFCOMM_RPN_FLOW_NONE; + rpn_mask ^= RFCOMM_RPN_PM_FLOW; + } + } 
+ + if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_XON)) { + xon_char = rpn->xon_char; + if (xon_char != RFCOMM_RPN_XON_CHAR) { + BT_DBG("RPN XON char mismatch 0x%x", xon_char); + xon_char = RFCOMM_RPN_XON_CHAR; + rpn_mask ^= RFCOMM_RPN_PM_XON; + } + } + + if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_XOFF)) { + xoff_char = rpn->xoff_char; + if (xoff_char != RFCOMM_RPN_XOFF_CHAR) { + BT_DBG("RPN XOFF char mismatch 0x%x", xoff_char); + xoff_char = RFCOMM_RPN_XOFF_CHAR; + rpn_mask ^= RFCOMM_RPN_PM_XOFF; + } + } + +rpn_out: + rfcomm_send_rpn(s, 0, dlci, bit_rate, data_bits, stop_bits, + parity, flow_ctrl, xon_char, xoff_char, rpn_mask); + + return 0; +} + +static int rfcomm_recv_rls(struct rfcomm_session *s, int cr, struct sk_buff *skb) +{ + struct rfcomm_rls *rls = (void *) skb->data; + u8 dlci = __get_dlci(rls->dlci); + + BT_DBG("dlci %d cr %d status 0x%x", dlci, cr, rls->status); + + if (!cr) + return 0; + + /* We should probably do something with this information here. But + * for now it's sufficient just to reply -- Bluetooth 1.1 says it's + * mandatory to recognise and respond to RLS */ + + rfcomm_send_rls(s, 0, dlci, rls->status); + + return 0; +} + +static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb) +{ + struct rfcomm_msc *msc = (void *) skb->data; + struct rfcomm_dlc *d; + u8 dlci = __get_dlci(msc->dlci); + + BT_DBG("dlci %d cr %d v24 0x%x", dlci, cr, msc->v24_sig); + + d = rfcomm_dlc_get(s, dlci); + if (!d) + return 0; + + if (cr) { + if (msc->v24_sig & RFCOMM_V24_FC && !d->cfc) + set_bit(RFCOMM_TX_THROTTLED, &d->flags); + else + clear_bit(RFCOMM_TX_THROTTLED, &d->flags); + + rfcomm_dlc_lock(d); + + d->remote_v24_sig = msc->v24_sig; + + if (d->modem_status) + d->modem_status(d, msc->v24_sig); + + rfcomm_dlc_unlock(d); + + rfcomm_send_msc(s, 0, dlci, msc->v24_sig); + + d->mscex |= RFCOMM_MSCEX_RX; + } else + d->mscex |= RFCOMM_MSCEX_TX; + + return 0; +} + +static int rfcomm_recv_mcc(struct rfcomm_session *s, struct sk_buff *skb) +{ + struct rfcomm_mcc *mcc = (void *) skb->data; + u8 type, cr, len; + + cr = __test_cr(mcc->type); + type = __get_mcc_type(mcc->type); + len = __get_mcc_len(mcc->len); + + BT_DBG("%p type 0x%x cr %d", s, type, cr); + + skb_pull(skb, 2); + + switch (type) { + case RFCOMM_PN: + rfcomm_recv_pn(s, cr, skb); + break; + + case RFCOMM_RPN: + rfcomm_recv_rpn(s, cr, len, skb); + break; + + case RFCOMM_RLS: + rfcomm_recv_rls(s, cr, skb); + break; + + case RFCOMM_MSC: + rfcomm_recv_msc(s, cr, skb); + break; + + case RFCOMM_FCOFF: + if (cr) { + set_bit(RFCOMM_TX_THROTTLED, &s->flags); + rfcomm_send_fcoff(s, 0); + } + break; + + case RFCOMM_FCON: + if (cr) { + clear_bit(RFCOMM_TX_THROTTLED, &s->flags); + rfcomm_send_fcon(s, 0); + } + break; + + case RFCOMM_TEST: + if (cr) + rfcomm_send_test(s, 0, skb->data, skb->len); + break; + + case RFCOMM_NSC: + break; + + default: + BT_ERR("Unknown control type 0x%02x", type); + rfcomm_send_nsc(s, cr, type); + break; + } + return 0; +} + +static int rfcomm_recv_data(struct rfcomm_session *s, u8 dlci, int pf, struct sk_buff *skb) +{ + struct rfcomm_dlc *d; + + BT_DBG("session %p state %ld dlci %d pf %d", s, s->state, dlci, pf); + + d = rfcomm_dlc_get(s, dlci); + if (!d) { + rfcomm_send_dm(s, dlci); + goto drop; + } + + if (pf && d->cfc) { + u8 credits = *(u8 *) skb->data; skb_pull(skb, 1); + + d->tx_credits += credits; + if (d->tx_credits) + clear_bit(RFCOMM_TX_THROTTLED, &d->flags); + } + + if (skb->len && d->state == BT_CONNECTED) { + rfcomm_dlc_lock(d); + d->rx_credits--; + d->data_ready(d, skb); 
+ rfcomm_dlc_unlock(d); + return 0; + } + +drop: + kfree_skb(skb); + return 0; +} + +static int rfcomm_recv_frame(struct rfcomm_session *s, struct sk_buff *skb) +{ + struct rfcomm_hdr *hdr = (void *) skb->data; + u8 type, dlci, fcs; + + dlci = __get_dlci(hdr->addr); + type = __get_type(hdr->ctrl); + + /* Trim FCS */ + skb->len--; skb->tail--; + fcs = *(u8 *)skb_tail_pointer(skb); + + if (__check_fcs(skb->data, type, fcs)) { + BT_ERR("bad checksum in packet"); + kfree_skb(skb); + return -EILSEQ; + } + + if (__test_ea(hdr->len)) + skb_pull(skb, 3); + else + skb_pull(skb, 4); + + switch (type) { + case RFCOMM_SABM: + if (__test_pf(hdr->ctrl)) + rfcomm_recv_sabm(s, dlci); + break; + + case RFCOMM_DISC: + if (__test_pf(hdr->ctrl)) + rfcomm_recv_disc(s, dlci); + break; + + case RFCOMM_UA: + if (__test_pf(hdr->ctrl)) + rfcomm_recv_ua(s, dlci); + break; + + case RFCOMM_DM: + rfcomm_recv_dm(s, dlci); + break; + + case RFCOMM_UIH: + if (dlci) + return rfcomm_recv_data(s, dlci, __test_pf(hdr->ctrl), skb); + + rfcomm_recv_mcc(s, skb); + break; + + default: + BT_ERR("Unknown packet type 0x%02x", type); + break; + } + kfree_skb(skb); + return 0; +} + +/* ---- Connection and data processing ---- */ + +static void rfcomm_process_connect(struct rfcomm_session *s) +{ + struct rfcomm_dlc *d; + struct list_head *p, *n; + + BT_DBG("session %p state %ld", s, s->state); + + list_for_each_safe(p, n, &s->dlcs) { + d = list_entry(p, struct rfcomm_dlc, list); + if (d->state == BT_CONFIG) { + d->mtu = s->mtu; + if (rfcomm_check_security(d)) { + rfcomm_send_pn(s, 1, d); + } else { + set_bit(RFCOMM_AUTH_PENDING, &d->flags); + rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); + } + } + } +} + +/* Send data queued for the DLC. + * Return number of frames left in the queue. + */ +static inline int rfcomm_process_tx(struct rfcomm_dlc *d) +{ + struct sk_buff *skb; + int err; + + BT_DBG("dlc %p state %ld cfc %d rx_credits %d tx_credits %d", + d, d->state, d->cfc, d->rx_credits, d->tx_credits); + + /* Send pending MSC */ + if (test_and_clear_bit(RFCOMM_MSC_PENDING, &d->flags)) + rfcomm_send_msc(d->session, 1, d->dlci, d->v24_sig); + + if (d->cfc) { + /* CFC enabled. + * Give them some credits */ + if (!test_bit(RFCOMM_RX_THROTTLED, &d->flags) && + d->rx_credits <= (d->cfc >> 2)) { + rfcomm_send_credits(d->session, d->addr, d->cfc - d->rx_credits); + d->rx_credits = d->cfc; + } + } else { + /* CFC disabled. + * Give ourselves some credits */ + d->tx_credits = 5; + } + + if (test_bit(RFCOMM_TX_THROTTLED, &d->flags)) + return skb_queue_len(&d->tx_queue); + + while (d->tx_credits && (skb = skb_dequeue(&d->tx_queue))) { + err = rfcomm_send_frame(d->session, skb->data, skb->len); + if (err < 0) { + skb_queue_head(&d->tx_queue, skb); + break; + } + kfree_skb(skb); + d->tx_credits--; + } + + if (d->cfc && !d->tx_credits) { + /* We're out of TX credits. + * Set TX_THROTTLED flag to avoid unnecessary wakeups by dlc_send.
*/ + set_bit(RFCOMM_TX_THROTTLED, &d->flags); + } + + return skb_queue_len(&d->tx_queue); +} + +static inline void rfcomm_process_dlcs(struct rfcomm_session *s) +{ + struct rfcomm_dlc *d; + struct list_head *p, *n; + + BT_DBG("session %p state %ld", s, s->state); + + list_for_each_safe(p, n, &s->dlcs) { + d = list_entry(p, struct rfcomm_dlc, list); + + if (test_bit(RFCOMM_TIMED_OUT, &d->flags)) { + __rfcomm_dlc_close(d, ETIMEDOUT); + continue; + } + + if (test_and_clear_bit(RFCOMM_AUTH_ACCEPT, &d->flags)) { + rfcomm_dlc_clear_timer(d); + if (d->out) { + rfcomm_send_pn(s, 1, d); + rfcomm_dlc_set_timer(d, RFCOMM_CONN_TIMEOUT); + } else { + if (d->defer_setup) { + set_bit(RFCOMM_DEFER_SETUP, &d->flags); + rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); + + rfcomm_dlc_lock(d); + d->state = BT_CONNECT2; + d->state_change(d, 0); + rfcomm_dlc_unlock(d); + } else + rfcomm_dlc_accept(d); + } + continue; + } else if (test_and_clear_bit(RFCOMM_AUTH_REJECT, &d->flags)) { + rfcomm_dlc_clear_timer(d); + if (!d->out) + rfcomm_send_dm(s, d->dlci); + else + d->state = BT_CLOSED; + __rfcomm_dlc_close(d, ECONNREFUSED); + continue; + } + + if (test_bit(RFCOMM_SEC_PENDING, &d->flags)) + continue; + + if (test_bit(RFCOMM_TX_THROTTLED, &s->flags)) + continue; + + if ((d->state == BT_CONNECTED || d->state == BT_DISCONN) && + d->mscex == RFCOMM_MSCEX_OK) + rfcomm_process_tx(d); + } +} + +static inline void rfcomm_process_rx(struct rfcomm_session *s) +{ + struct socket *sock = s->sock; + struct sock *sk = sock->sk; + struct sk_buff *skb; + + BT_DBG("session %p state %ld qlen %d", s, s->state, skb_queue_len(&sk->sk_receive_queue)); + + /* Get data directly from socket receive queue without copying it. */ + while ((skb = skb_dequeue(&sk->sk_receive_queue))) { + skb_orphan(skb); + rfcomm_recv_frame(s, skb); + } + + if (sk->sk_state == BT_CLOSED) { + if (!s->initiator) + rfcomm_session_put(s); + + rfcomm_session_close(s, sk->sk_err); + } +} + +static inline void rfcomm_accept_connection(struct rfcomm_session *s) +{ + struct socket *sock = s->sock, *nsock; + int err; + + /* Fast check for a new connection. + * Avoids unnecessary socket allocations. */ + if (list_empty(&bt_sk(sock->sk)->accept_q)) + return; + + BT_DBG("session %p", s); + + err = kernel_accept(sock, &nsock, O_NONBLOCK); + if (err < 0) + return; + + /* Set our callbacks */ + nsock->sk->sk_data_ready = rfcomm_l2data_ready; + nsock->sk->sk_state_change = rfcomm_l2state_change; + + s = rfcomm_session_add(nsock, BT_OPEN); + if (s) { + rfcomm_session_hold(s); + + /* We should adjust MTU on incoming sessions. + * L2CAP MTU minus UIH header and FCS. */ + s->mtu = min(l2cap_pi(nsock->sk)->chan->omtu, + l2cap_pi(nsock->sk)->chan->imtu) - 5; + + rfcomm_schedule(); + } else + sock_release(nsock); +} + +static inline void rfcomm_check_connection(struct rfcomm_session *s) +{ + struct sock *sk = s->sock->sk; + + BT_DBG("%p state %ld", s, s->state); + + switch (sk->sk_state) { + case BT_CONNECTED: + s->state = BT_CONNECT; + + /* We can adjust MTU on outgoing sessions. + * L2CAP MTU minus UIH header and FCS. 
*/ + s->mtu = min(l2cap_pi(sk)->chan->omtu, l2cap_pi(sk)->chan->imtu) - 5; + + rfcomm_send_sabm(s, 0); + break; + + case BT_CLOSED: + s->state = BT_CLOSED; + rfcomm_session_close(s, sk->sk_err); + break; + } +} + +static inline void rfcomm_process_sessions(void) +{ + struct list_head *p, *n; + + rfcomm_lock(); + + list_for_each_safe(p, n, &session_list) { + struct rfcomm_session *s; + s = list_entry(p, struct rfcomm_session, list); + + if (test_and_clear_bit(RFCOMM_TIMED_OUT, &s->flags)) { + s->state = BT_DISCONN; + rfcomm_send_disc(s, 0); + rfcomm_session_put(s); + continue; + } + + if (s->state == BT_LISTEN) { + rfcomm_accept_connection(s); + continue; + } + + rfcomm_session_hold(s); + + switch (s->state) { + case BT_BOUND: + rfcomm_check_connection(s); + break; + + default: + rfcomm_process_rx(s); + break; + } + + rfcomm_process_dlcs(s); + + rfcomm_session_put(s); + } + + rfcomm_unlock(); +} + +static int rfcomm_add_listener(bdaddr_t *ba) +{ + struct sockaddr_l2 addr; + struct socket *sock; + struct sock *sk; + struct rfcomm_session *s; + int err = 0; + + /* Create socket */ + err = rfcomm_l2sock_create(&sock); + if (err < 0) { + BT_ERR("Create socket failed %d", err); + return err; + } + + /* Bind socket */ + bacpy(&addr.l2_bdaddr, ba); + addr.l2_family = AF_BLUETOOTH; + addr.l2_psm = cpu_to_le16(RFCOMM_PSM); + addr.l2_cid = 0; + err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr)); + if (err < 0) { + BT_ERR("Bind failed %d", err); + goto failed; + } + + /* Set L2CAP options */ + sk = sock->sk; + lock_sock(sk); + l2cap_pi(sk)->chan->imtu = l2cap_mtu; + release_sock(sk); + + /* Start listening on the socket */ + err = kernel_listen(sock, 10); + if (err) { + BT_ERR("Listen failed %d", err); + goto failed; + } + + /* Add listening session */ + s = rfcomm_session_add(sock, BT_LISTEN); + if (!s) + goto failed; + + rfcomm_session_hold(s); + return 0; +failed: + sock_release(sock); + return err; +} + +static void rfcomm_kill_listener(void) +{ + struct rfcomm_session *s; + struct list_head *p, *n; + + BT_DBG(""); + + list_for_each_safe(p, n, &session_list) { + s = list_entry(p, struct rfcomm_session, list); + rfcomm_session_del(s); + } +} + +static int rfcomm_run(void *unused) +{ + BT_DBG(""); + + set_user_nice(current, -10); + + rfcomm_add_listener(BDADDR_ANY); + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + if (kthread_should_stop()) + break; + + /* Process stuff */ + rfcomm_process_sessions(); + + schedule(); + } + __set_current_state(TASK_RUNNING); + + rfcomm_kill_listener(); + + return 0; +} + +static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt) +{ + struct rfcomm_session *s; + struct rfcomm_dlc *d; + struct list_head *p, *n; + + BT_DBG("conn %p status 0x%02x encrypt 0x%02x", conn, status, encrypt); + + s = rfcomm_session_get(&conn->hdev->bdaddr, &conn->dst); + if (!s) + return; + + rfcomm_session_hold(s); + + list_for_each_safe(p, n, &s->dlcs) { + d = list_entry(p, struct rfcomm_dlc, list); + + if (test_and_clear_bit(RFCOMM_SEC_PENDING, &d->flags)) { + rfcomm_dlc_clear_timer(d); + if (status || encrypt == 0x00) { + __rfcomm_dlc_close(d, ECONNREFUSED); + continue; + } + } + + if (d->state == BT_CONNECTED && !status && encrypt == 0x00) { + if (d->sec_level == BT_SECURITY_MEDIUM) { + set_bit(RFCOMM_SEC_PENDING, &d->flags); + rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); + continue; + } else if (d->sec_level == BT_SECURITY_HIGH) { + __rfcomm_dlc_close(d, ECONNREFUSED); + continue; + } + } + + if (!test_and_clear_bit(RFCOMM_AUTH_PENDING, 
&d->flags)) + continue; + + if (!status && hci_conn_check_secure(conn, d->sec_level)) + set_bit(RFCOMM_AUTH_ACCEPT, &d->flags); + else + set_bit(RFCOMM_AUTH_REJECT, &d->flags); + } + + rfcomm_session_put(s); + + rfcomm_schedule(); +} + +static struct hci_cb rfcomm_cb = { + .name = "RFCOMM", + .security_cfm = rfcomm_security_cfm +}; + +static int rfcomm_dlc_debugfs_show(struct seq_file *f, void *x) +{ + struct rfcomm_session *s; + struct list_head *pp, *p; + + rfcomm_lock(); + + list_for_each(p, &session_list) { + s = list_entry(p, struct rfcomm_session, list); + list_for_each(pp, &s->dlcs) { + struct sock *sk = s->sock->sk; + struct rfcomm_dlc *d = list_entry(pp, struct rfcomm_dlc, list); + + seq_printf(f, "%s %s %ld %d %d %d %d\n", + batostr(&bt_sk(sk)->src), + batostr(&bt_sk(sk)->dst), + d->state, d->dlci, d->mtu, + d->rx_credits, d->tx_credits); + } + } + + rfcomm_unlock(); + + return 0; +} + +static int rfcomm_dlc_debugfs_open(struct inode *inode, struct file *file) +{ + return single_open(file, rfcomm_dlc_debugfs_show, inode->i_private); +} + +static const struct file_operations rfcomm_dlc_debugfs_fops = { + .open = rfcomm_dlc_debugfs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *rfcomm_dlc_debugfs; + +/* ---- Initialization ---- */ +static int __init rfcomm_init(void) +{ + int err; + + hci_register_cb(&rfcomm_cb); + + rfcomm_thread = kthread_run(rfcomm_run, NULL, "krfcommd"); + if (IS_ERR(rfcomm_thread)) { + err = PTR_ERR(rfcomm_thread); + goto unregister; + } + + if (bt_debugfs) { + rfcomm_dlc_debugfs = debugfs_create_file("rfcomm_dlc", 0444, + bt_debugfs, NULL, &rfcomm_dlc_debugfs_fops); + if (!rfcomm_dlc_debugfs) + BT_ERR("Failed to create RFCOMM debug file"); + } + + err = rfcomm_init_ttys(); + if (err < 0) + goto stop; + + err = rfcomm_init_sockets(); + if (err < 0) + goto cleanup; + + BT_INFO("RFCOMM ver %s", VERSION); + + return 0; + +cleanup: + rfcomm_cleanup_ttys(); + +stop: + kthread_stop(rfcomm_thread); + +unregister: + hci_unregister_cb(&rfcomm_cb); + + return err; +} + +static void __exit rfcomm_exit(void) +{ + debugfs_remove(rfcomm_dlc_debugfs); + + hci_unregister_cb(&rfcomm_cb); + + kthread_stop(rfcomm_thread); + + rfcomm_cleanup_ttys(); + + rfcomm_cleanup_sockets(); +} + +module_init(rfcomm_init); +module_exit(rfcomm_exit); + +module_param(disable_cfc, bool, 0644); +MODULE_PARM_DESC(disable_cfc, "Disable credit based flow control"); + +module_param(channel_mtu, int, 0644); +MODULE_PARM_DESC(channel_mtu, "Default MTU for the RFCOMM channel"); + +module_param(l2cap_mtu, uint, 0644); +MODULE_PARM_DESC(l2cap_mtu, "Default MTU for the L2CAP connection"); + +module_param(l2cap_ertm, bool, 0644); +MODULE_PARM_DESC(l2cap_ertm, "Use L2CAP ERTM mode for connection"); + +MODULE_AUTHOR("Marcel Holtmann "); +MODULE_DESCRIPTION("Bluetooth RFCOMM ver " VERSION); +MODULE_VERSION(VERSION); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("bt-proto-3"); diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c new file mode 100644 index 00000000..b02f0d47 --- /dev/null +++ b/net/bluetooth/rfcomm/sock.c @@ -0,0 +1,1072 @@ +/* + RFCOMM implementation for Linux Bluetooth stack (BlueZ). 
+ Copyright (C) 2002 Maxim Krasnyansky + Copyright (C) 2002 Marcel Holtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +/* + * RFCOMM sockets. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +static const struct proto_ops rfcomm_sock_ops; + +static struct bt_sock_list rfcomm_sk_list = { + .lock = __RW_LOCK_UNLOCKED(rfcomm_sk_list.lock) +}; + +static void rfcomm_sock_close(struct sock *sk); +static void rfcomm_sock_kill(struct sock *sk); + +/* ---- DLC callbacks ---- + * + * called under rfcomm_dlc_lock() + */ +static void rfcomm_sk_data_ready(struct rfcomm_dlc *d, struct sk_buff *skb) +{ + struct sock *sk = d->owner; + if (!sk) + return; + + atomic_add(skb->len, &sk->sk_rmem_alloc); + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); + + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + rfcomm_dlc_throttle(d); +} + +static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err) +{ + struct sock *sk = d->owner, *parent; + unsigned long flags; + + if (!sk) + return; + + BT_DBG("dlc %p state %ld err %d", d, d->state, err); + + local_irq_save(flags); + bh_lock_sock(sk); + + if (err) + sk->sk_err = err; + + sk->sk_state = d->state; + + parent = bt_sk(sk)->parent; + if (parent) { + if (d->state == BT_CLOSED) { + sock_set_flag(sk, SOCK_ZAPPED); + bt_accept_unlink(sk); + } + parent->sk_data_ready(parent, 0); + } else { + if (d->state == BT_CONNECTED) + rfcomm_session_getaddr(d->session, &bt_sk(sk)->src, NULL); + sk->sk_state_change(sk); + } + + bh_unlock_sock(sk); + local_irq_restore(flags); + + if (parent && sock_flag(sk, SOCK_ZAPPED)) { + /* We have to drop DLC lock here, otherwise + * rfcomm_sock_destruct() will dead lock. */ + rfcomm_dlc_unlock(d); + rfcomm_sock_kill(sk); + rfcomm_dlc_lock(d); + } +} + +/* ---- Socket functions ---- */ +static struct sock *__rfcomm_get_sock_by_addr(u8 channel, bdaddr_t *src) +{ + struct sock *sk = NULL; + struct hlist_node *node; + + sk_for_each(sk, node, &rfcomm_sk_list.head) { + if (rfcomm_pi(sk)->channel == channel && + !bacmp(&bt_sk(sk)->src, src)) + break; + } + + return node ? sk : NULL; +} + +/* Find socket with channel and source bdaddr. + * Returns closest match. 
+ */ +static struct sock *rfcomm_get_sock_by_channel(int state, u8 channel, bdaddr_t *src) +{ + struct sock *sk = NULL, *sk1 = NULL; + struct hlist_node *node; + + read_lock(&rfcomm_sk_list.lock); + + sk_for_each(sk, node, &rfcomm_sk_list.head) { + if (state && sk->sk_state != state) + continue; + + if (rfcomm_pi(sk)->channel == channel) { + /* Exact match. */ + if (!bacmp(&bt_sk(sk)->src, src)) + break; + + /* Closest match */ + if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY)) + sk1 = sk; + } + } + + read_unlock(&rfcomm_sk_list.lock); + + return node ? sk : sk1; +} + +static void rfcomm_sock_destruct(struct sock *sk) +{ + struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc; + + BT_DBG("sk %p dlc %p", sk, d); + + skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge(&sk->sk_write_queue); + + rfcomm_dlc_lock(d); + rfcomm_pi(sk)->dlc = NULL; + + /* Detach DLC if it's owned by this socket */ + if (d->owner == sk) + d->owner = NULL; + rfcomm_dlc_unlock(d); + + rfcomm_dlc_put(d); +} + +static void rfcomm_sock_cleanup_listen(struct sock *parent) +{ + struct sock *sk; + + BT_DBG("parent %p", parent); + + /* Close not yet accepted dlcs */ + while ((sk = bt_accept_dequeue(parent, NULL))) { + rfcomm_sock_close(sk); + rfcomm_sock_kill(sk); + } + + parent->sk_state = BT_CLOSED; + sock_set_flag(parent, SOCK_ZAPPED); +} + +/* Kill socket (only if zapped and orphan) + * Must be called on unlocked socket. + */ +static void rfcomm_sock_kill(struct sock *sk) +{ + if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket) + return; + + BT_DBG("sk %p state %d refcnt %d", sk, sk->sk_state, atomic_read(&sk->sk_refcnt)); + + /* Kill poor orphan */ + bt_sock_unlink(&rfcomm_sk_list, sk); + sock_set_flag(sk, SOCK_DEAD); + sock_put(sk); +} + +static void __rfcomm_sock_close(struct sock *sk) +{ + struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc; + + BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket); + + switch (sk->sk_state) { + case BT_LISTEN: + rfcomm_sock_cleanup_listen(sk); + break; + + case BT_CONNECT: + case BT_CONNECT2: + case BT_CONFIG: + case BT_CONNECTED: + rfcomm_dlc_close(d, 0); + + default: + sock_set_flag(sk, SOCK_ZAPPED); + break; + } +} + +/* Close socket. + * Must be called on unlocked socket. 
+ */ +static void rfcomm_sock_close(struct sock *sk) +{ + lock_sock(sk); + __rfcomm_sock_close(sk); + release_sock(sk); +} + +static void rfcomm_sock_init(struct sock *sk, struct sock *parent) +{ + struct rfcomm_pinfo *pi = rfcomm_pi(sk); + + BT_DBG("sk %p", sk); + + if (parent) { + sk->sk_type = parent->sk_type; + pi->dlc->defer_setup = bt_sk(parent)->defer_setup; + + pi->sec_level = rfcomm_pi(parent)->sec_level; + pi->role_switch = rfcomm_pi(parent)->role_switch; + } else { + pi->dlc->defer_setup = 0; + + pi->sec_level = BT_SECURITY_LOW; + pi->role_switch = 0; + } + + pi->dlc->sec_level = pi->sec_level; + pi->dlc->role_switch = pi->role_switch; +} + +static struct proto rfcomm_proto = { + .name = "RFCOMM", + .owner = THIS_MODULE, + .obj_size = sizeof(struct rfcomm_pinfo) +}; + +static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio) +{ + struct rfcomm_dlc *d; + struct sock *sk; + + sk = sk_alloc(net, PF_BLUETOOTH, prio, &rfcomm_proto); + if (!sk) + return NULL; + + sock_init_data(sock, sk); + INIT_LIST_HEAD(&bt_sk(sk)->accept_q); + + d = rfcomm_dlc_alloc(prio); + if (!d) { + sk_free(sk); + return NULL; + } + + d->data_ready = rfcomm_sk_data_ready; + d->state_change = rfcomm_sk_state_change; + + rfcomm_pi(sk)->dlc = d; + d->owner = sk; + + sk->sk_destruct = rfcomm_sock_destruct; + sk->sk_sndtimeo = RFCOMM_CONN_TIMEOUT; + + sk->sk_sndbuf = RFCOMM_MAX_CREDITS * RFCOMM_DEFAULT_MTU * 10; + sk->sk_rcvbuf = RFCOMM_MAX_CREDITS * RFCOMM_DEFAULT_MTU * 10; + + sock_reset_flag(sk, SOCK_ZAPPED); + + sk->sk_protocol = proto; + sk->sk_state = BT_OPEN; + + bt_sock_link(&rfcomm_sk_list, sk); + + BT_DBG("sk %p", sk); + return sk; +} + +static int rfcomm_sock_create(struct net *net, struct socket *sock, + int protocol, int kern) +{ + struct sock *sk; + + BT_DBG("sock %p", sock); + + sock->state = SS_UNCONNECTED; + + if (sock->type != SOCK_STREAM && sock->type != SOCK_RAW) + return -ESOCKTNOSUPPORT; + + sock->ops = &rfcomm_sock_ops; + + sk = rfcomm_sock_alloc(net, sock, protocol, GFP_ATOMIC); + if (!sk) + return -ENOMEM; + + rfcomm_sock_init(sk, NULL); + return 0; +} + +static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +{ + struct sockaddr_rc *sa = (struct sockaddr_rc *) addr; + struct sock *sk = sock->sk; + int err = 0; + + BT_DBG("sk %p %s", sk, batostr(&sa->rc_bdaddr)); + + if (!addr || addr->sa_family != AF_BLUETOOTH) + return -EINVAL; + + lock_sock(sk); + + if (sk->sk_state != BT_OPEN) { + err = -EBADFD; + goto done; + } + + if (sk->sk_type != SOCK_STREAM) { + err = -EINVAL; + goto done; + } + + write_lock_bh(&rfcomm_sk_list.lock); + + if (sa->rc_channel && __rfcomm_get_sock_by_addr(sa->rc_channel, &sa->rc_bdaddr)) { + err = -EADDRINUSE; + } else { + /* Save source address */ + bacpy(&bt_sk(sk)->src, &sa->rc_bdaddr); + rfcomm_pi(sk)->channel = sa->rc_channel; + sk->sk_state = BT_BOUND; + } + + write_unlock_bh(&rfcomm_sk_list.lock); + +done: + release_sock(sk); + return err; +} + +static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) +{ + struct sockaddr_rc *sa = (struct sockaddr_rc *) addr; + struct sock *sk = sock->sk; + struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc; + int err = 0; + + BT_DBG("sk %p", sk); + + if (alen < sizeof(struct sockaddr_rc) || + addr->sa_family != AF_BLUETOOTH) + return -EINVAL; + + lock_sock(sk); + + if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) { + err = -EBADFD; + goto done; + } + + if (sk->sk_type != SOCK_STREAM) { + err = -EINVAL; + goto done; + } 
+ + sk->sk_state = BT_CONNECT; + bacpy(&bt_sk(sk)->dst, &sa->rc_bdaddr); + rfcomm_pi(sk)->channel = sa->rc_channel; + + d->sec_level = rfcomm_pi(sk)->sec_level; + d->role_switch = rfcomm_pi(sk)->role_switch; + + err = rfcomm_dlc_open(d, &bt_sk(sk)->src, &sa->rc_bdaddr, sa->rc_channel); + if (!err) + err = bt_sock_wait_state(sk, BT_CONNECTED, + sock_sndtimeo(sk, flags & O_NONBLOCK)); + +done: + release_sock(sk); + return err; +} + +static int rfcomm_sock_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + int err = 0; + + BT_DBG("sk %p backlog %d", sk, backlog); + + lock_sock(sk); + + if (sk->sk_state != BT_BOUND) { + err = -EBADFD; + goto done; + } + + if (sk->sk_type != SOCK_STREAM) { + err = -EINVAL; + goto done; + } + + if (!rfcomm_pi(sk)->channel) { + bdaddr_t *src = &bt_sk(sk)->src; + u8 channel; + + err = -EINVAL; + + write_lock_bh(&rfcomm_sk_list.lock); + + for (channel = 1; channel < 31; channel++) + if (!__rfcomm_get_sock_by_addr(channel, src)) { + rfcomm_pi(sk)->channel = channel; + err = 0; + break; + } + + write_unlock_bh(&rfcomm_sk_list.lock); + + if (err < 0) + goto done; + } + + sk->sk_max_ack_backlog = backlog; + sk->sk_ack_backlog = 0; + sk->sk_state = BT_LISTEN; + +done: + release_sock(sk); + return err; +} + +static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int flags) +{ + DECLARE_WAITQUEUE(wait, current); + struct sock *sk = sock->sk, *nsk; + long timeo; + int err = 0; + + lock_sock(sk); + + if (sk->sk_type != SOCK_STREAM) { + err = -EINVAL; + goto done; + } + + timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + + BT_DBG("sk %p timeo %ld", sk, timeo); + + /* Wait for an incoming connection. (wake-one). */ + add_wait_queue_exclusive(sk_sleep(sk), &wait); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + if (sk->sk_state != BT_LISTEN) { + err = -EBADFD; + break; + } + + nsk = bt_accept_dequeue(sk, newsock); + if (nsk) + break; + + if (!timeo) { + err = -EAGAIN; + break; + } + + if (signal_pending(current)) { + err = sock_intr_errno(timeo); + break; + } + + release_sock(sk); + timeo = schedule_timeout(timeo); + lock_sock(sk); + } + __set_current_state(TASK_RUNNING); + remove_wait_queue(sk_sleep(sk), &wait); + + if (err) + goto done; + + newsock->state = SS_CONNECTED; + + BT_DBG("new socket %p", nsk); + +done: + release_sock(sk); + return err; +} + +static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int *len, int peer) +{ + struct sockaddr_rc *sa = (struct sockaddr_rc *) addr; + struct sock *sk = sock->sk; + + BT_DBG("sock %p, sk %p", sock, sk); + + sa->rc_family = AF_BLUETOOTH; + sa->rc_channel = rfcomm_pi(sk)->channel; + if (peer) + bacpy(&sa->rc_bdaddr, &bt_sk(sk)->dst); + else + bacpy(&sa->rc_bdaddr, &bt_sk(sk)->src); + + *len = sizeof(struct sockaddr_rc); + return 0; +} + +static int rfcomm_sock_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc; + struct sk_buff *skb; + int sent = 0; + + if (test_bit(RFCOMM_DEFER_SETUP, &d->flags)) + return -ENOTCONN; + + if (msg->msg_flags & MSG_OOB) + return -EOPNOTSUPP; + + if (sk->sk_shutdown & SEND_SHUTDOWN) + return -EPIPE; + + BT_DBG("sock %p, sk %p", sock, sk); + + lock_sock(sk); + + while (len) { + size_t size = min_t(size_t, len, d->mtu); + int err; + + skb = sock_alloc_send_skb(sk, size + RFCOMM_SKB_RESERVE, + msg->msg_flags & MSG_DONTWAIT, &err); + if (!skb) { + if (sent == 0) + sent = err; + break; + } + skb_reserve(skb, 
RFCOMM_SKB_HEAD_RESERVE); + + err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); + if (err) { + kfree_skb(skb); + if (sent == 0) + sent = err; + break; + } + + err = rfcomm_dlc_send(d, skb); + if (err < 0) { + kfree_skb(skb); + if (sent == 0) + sent = err; + break; + } + + sent += size; + len -= size; + } + + release_sock(sk); + + return sent; +} + +static int rfcomm_sock_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc; + int len; + + if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) { + rfcomm_dlc_accept(d); + return 0; + } + + len = bt_sock_stream_recvmsg(iocb, sock, msg, size, flags); + + lock_sock(sk); + if (!(flags & MSG_PEEK) && len > 0) + atomic_sub(len, &sk->sk_rmem_alloc); + + if (atomic_read(&sk->sk_rmem_alloc) <= (sk->sk_rcvbuf >> 2)) + rfcomm_dlc_unthrottle(rfcomm_pi(sk)->dlc); + release_sock(sk); + + return len; +} + +static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + int err = 0; + u32 opt; + + BT_DBG("sk %p", sk); + + lock_sock(sk); + + switch (optname) { + case RFCOMM_LM: + if (get_user(opt, (u32 __user *) optval)) { + err = -EFAULT; + break; + } + + if (opt & RFCOMM_LM_AUTH) + rfcomm_pi(sk)->sec_level = BT_SECURITY_LOW; + if (opt & RFCOMM_LM_ENCRYPT) + rfcomm_pi(sk)->sec_level = BT_SECURITY_MEDIUM; + if (opt & RFCOMM_LM_SECURE) + rfcomm_pi(sk)->sec_level = BT_SECURITY_HIGH; + + rfcomm_pi(sk)->role_switch = (opt & RFCOMM_LM_MASTER); + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct bt_security sec; + int err = 0; + size_t len; + u32 opt; + + BT_DBG("sk %p", sk); + + if (level == SOL_RFCOMM) + return rfcomm_sock_setsockopt_old(sock, optname, optval, optlen); + + if (level != SOL_BLUETOOTH) + return -ENOPROTOOPT; + + lock_sock(sk); + + switch (optname) { + case BT_SECURITY: + if (sk->sk_type != SOCK_STREAM) { + err = -EINVAL; + break; + } + + sec.level = BT_SECURITY_LOW; + + len = min_t(unsigned int, sizeof(sec), optlen); + if (copy_from_user((char *) &sec, optval, len)) { + err = -EFAULT; + break; + } + + if (sec.level > BT_SECURITY_HIGH) { + err = -EINVAL; + break; + } + + rfcomm_pi(sk)->sec_level = sec.level; + break; + + case BT_DEFER_SETUP: + if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { + err = -EINVAL; + break; + } + + if (get_user(opt, (u32 __user *) optval)) { + err = -EFAULT; + break; + } + + bt_sk(sk)->defer_setup = opt; + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +static int rfcomm_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct rfcomm_conninfo cinfo; + struct l2cap_conn *conn = l2cap_pi(sk)->chan->conn; + int len, err = 0; + u32 opt; + + BT_DBG("sk %p", sk); + + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + + switch (optname) { + case RFCOMM_LM: + switch (rfcomm_pi(sk)->sec_level) { + case BT_SECURITY_LOW: + opt = RFCOMM_LM_AUTH; + break; + case BT_SECURITY_MEDIUM: + opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT; + break; + case BT_SECURITY_HIGH: + opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT | + RFCOMM_LM_SECURE; + break; + default: + opt = 0; + break; 
+ } + + if (rfcomm_pi(sk)->role_switch) + opt |= RFCOMM_LM_MASTER; + + if (put_user(opt, (u32 __user *) optval)) + err = -EFAULT; + break; + + case RFCOMM_CONNINFO: + if (sk->sk_state != BT_CONNECTED && + !rfcomm_pi(sk)->dlc->defer_setup) { + err = -ENOTCONN; + break; + } + + + memset(&cinfo, 0, sizeof(cinfo)); + cinfo.hci_handle = conn->hcon->handle; + memcpy(cinfo.dev_class, conn->hcon->dev_class, 3); + + len = min_t(unsigned int, len, sizeof(cinfo)); + if (copy_to_user(optval, (char *) &cinfo, len)) + err = -EFAULT; + + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct bt_security sec; + int len, err = 0; + + BT_DBG("sk %p", sk); + + if (level == SOL_RFCOMM) + return rfcomm_sock_getsockopt_old(sock, optname, optval, optlen); + + if (level != SOL_BLUETOOTH) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + + switch (optname) { + case BT_SECURITY: + if (sk->sk_type != SOCK_STREAM) { + err = -EINVAL; + break; + } + + sec.level = rfcomm_pi(sk)->sec_level; + + len = min_t(unsigned int, len, sizeof(sec)); + if (copy_to_user(optval, (char *) &sec, len)) + err = -EFAULT; + + break; + + case BT_DEFER_SETUP: + if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { + err = -EINVAL; + break; + } + + if (put_user(bt_sk(sk)->defer_setup, (u32 __user *) optval)) + err = -EFAULT; + + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +static int rfcomm_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk __maybe_unused = sock->sk; + int err; + + BT_DBG("sk %p cmd %x arg %lx", sk, cmd, arg); + + err = bt_sock_ioctl(sock, cmd, arg); + + if (err == -ENOIOCTLCMD) { +#ifdef CONFIG_BT_RFCOMM_TTY + lock_sock(sk); + err = rfcomm_dev_ioctl(sk, cmd, (void __user *) arg); + release_sock(sk); +#else + err = -EOPNOTSUPP; +#endif + } + + return err; +} + +static int rfcomm_sock_shutdown(struct socket *sock, int how) +{ + struct sock *sk = sock->sk; + int err = 0; + + BT_DBG("sock %p, sk %p", sock, sk); + + if (!sk) + return 0; + + lock_sock(sk); + if (!sk->sk_shutdown) { + sk->sk_shutdown = SHUTDOWN_MASK; + __rfcomm_sock_close(sk); + + if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime) + err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); + } + release_sock(sk); + return err; +} + +static int rfcomm_sock_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + int err; + + BT_DBG("sock %p, sk %p", sock, sk); + + if (!sk) + return 0; + + err = rfcomm_sock_shutdown(sock, 2); + + sock_orphan(sk); + rfcomm_sock_kill(sk); + return err; +} + +/* ---- RFCOMM core layer callbacks ---- + * + * called under rfcomm_lock() + */ +int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc **d) +{ + struct sock *sk, *parent; + bdaddr_t src, dst; + int result = 0; + + BT_DBG("session %p channel %d", s, channel); + + rfcomm_session_getaddr(s, &src, &dst); + + /* Check if we have socket listening on channel */ + parent = rfcomm_get_sock_by_channel(BT_LISTEN, channel, &src); + if (!parent) + return 0; + + bh_lock_sock(parent); + + /* Check for backlog size */ + if (sk_acceptq_is_full(parent)) { + BT_DBG("backlog full %d", parent->sk_ack_backlog); + goto done; + } + + sk = rfcomm_sock_alloc(sock_net(parent), NULL, BTPROTO_RFCOMM, GFP_ATOMIC); + if (!sk) + goto done; 
+ + rfcomm_sock_init(sk, parent); + bacpy(&bt_sk(sk)->src, &src); + bacpy(&bt_sk(sk)->dst, &dst); + rfcomm_pi(sk)->channel = channel; + + sk->sk_state = BT_CONFIG; + bt_accept_enqueue(parent, sk); + + /* Accept connection and return socket DLC */ + *d = rfcomm_pi(sk)->dlc; + result = 1; + +done: + bh_unlock_sock(parent); + + if (bt_sk(parent)->defer_setup) + parent->sk_state_change(parent); + + return result; +} + +static int rfcomm_sock_debugfs_show(struct seq_file *f, void *p) +{ + struct sock *sk; + struct hlist_node *node; + + read_lock_bh(&rfcomm_sk_list.lock); + + sk_for_each(sk, node, &rfcomm_sk_list.head) { + seq_printf(f, "%s %s %d %d\n", + batostr(&bt_sk(sk)->src), + batostr(&bt_sk(sk)->dst), + sk->sk_state, rfcomm_pi(sk)->channel); + } + + read_unlock_bh(&rfcomm_sk_list.lock); + + return 0; +} + +static int rfcomm_sock_debugfs_open(struct inode *inode, struct file *file) +{ + return single_open(file, rfcomm_sock_debugfs_show, inode->i_private); +} + +static const struct file_operations rfcomm_sock_debugfs_fops = { + .open = rfcomm_sock_debugfs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *rfcomm_sock_debugfs; + +static const struct proto_ops rfcomm_sock_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .release = rfcomm_sock_release, + .bind = rfcomm_sock_bind, + .connect = rfcomm_sock_connect, + .listen = rfcomm_sock_listen, + .accept = rfcomm_sock_accept, + .getname = rfcomm_sock_getname, + .sendmsg = rfcomm_sock_sendmsg, + .recvmsg = rfcomm_sock_recvmsg, + .shutdown = rfcomm_sock_shutdown, + .setsockopt = rfcomm_sock_setsockopt, + .getsockopt = rfcomm_sock_getsockopt, + .ioctl = rfcomm_sock_ioctl, + .poll = bt_sock_poll, + .socketpair = sock_no_socketpair, + .mmap = sock_no_mmap +}; + +static const struct net_proto_family rfcomm_sock_family_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .create = rfcomm_sock_create +}; + +int __init rfcomm_init_sockets(void) +{ + int err; + + err = proto_register(&rfcomm_proto, 0); + if (err < 0) + return err; + + err = bt_sock_register(BTPROTO_RFCOMM, &rfcomm_sock_family_ops); + if (err < 0) + goto error; + + if (bt_debugfs) { + rfcomm_sock_debugfs = debugfs_create_file("rfcomm", 0444, + bt_debugfs, NULL, &rfcomm_sock_debugfs_fops); + if (!rfcomm_sock_debugfs) + BT_ERR("Failed to create RFCOMM debug file"); + } + + BT_INFO("RFCOMM socket layer initialized"); + + return 0; + +error: + BT_ERR("RFCOMM socket layer registration failed"); + proto_unregister(&rfcomm_proto); + return err; +} + +void __exit rfcomm_cleanup_sockets(void) +{ + debugfs_remove(rfcomm_sock_debugfs); + + if (bt_sock_unregister(BTPROTO_RFCOMM) < 0) + BT_ERR("RFCOMM socket layer unregistration failed"); + + proto_unregister(&rfcomm_proto); +} diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c new file mode 100644 index 00000000..c2587963 --- /dev/null +++ b/net/bluetooth/rfcomm/tty.c @@ -0,0 +1,1190 @@ +/* + RFCOMM implementation for Linux Bluetooth stack (BlueZ). + Copyright (C) 2002 Maxim Krasnyansky + Copyright (C) 2002 Marcel Holtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. 
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +/* + * RFCOMM TTY. + */ + +#include + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#define RFCOMM_TTY_MAGIC 0x6d02 /* magic number for rfcomm struct */ +#define RFCOMM_TTY_PORTS RFCOMM_MAX_DEV /* whole lotta rfcomm devices */ +#define RFCOMM_TTY_MAJOR 216 /* device node major id of the usb/bluetooth.c driver */ +#define RFCOMM_TTY_MINOR 0 + +static struct tty_driver *rfcomm_tty_driver; + +struct rfcomm_dev { + struct list_head list; + atomic_t refcnt; + + char name[12]; + int id; + unsigned long flags; + atomic_t opened; + int err; + + bdaddr_t src; + bdaddr_t dst; + u8 channel; + + uint modem_status; + + struct rfcomm_dlc *dlc; + struct tty_struct *tty; + wait_queue_head_t wait; + struct tasklet_struct wakeup_task; + + struct device *tty_dev; + + atomic_t wmem_alloc; + + struct sk_buff_head pending; +}; + +static LIST_HEAD(rfcomm_dev_list); +static DEFINE_RWLOCK(rfcomm_dev_lock); + +static void rfcomm_dev_data_ready(struct rfcomm_dlc *dlc, struct sk_buff *skb); +static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err); +static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig); + +static void rfcomm_tty_wakeup(unsigned long arg); + +/* ---- Device functions ---- */ +static void rfcomm_dev_destruct(struct rfcomm_dev *dev) +{ + struct rfcomm_dlc *dlc = dev->dlc; + + BT_DBG("dev %p dlc %p", dev, dlc); + + /* Refcount should only hit zero when called from rfcomm_dev_del() + which will have taken us off the list. Everything else are + refcounting bugs. */ + BUG_ON(!list_empty(&dev->list)); + + rfcomm_dlc_lock(dlc); + /* Detach DLC if it's owned by this dev */ + if (dlc->owner == dev) + dlc->owner = NULL; + rfcomm_dlc_unlock(dlc); + + rfcomm_dlc_put(dlc); + + tty_unregister_device(rfcomm_tty_driver, dev->id); + + kfree(dev); + + /* It's safe to call module_put() here because socket still + holds reference to this module. */ + module_put(THIS_MODULE); +} + +static inline void rfcomm_dev_hold(struct rfcomm_dev *dev) +{ + atomic_inc(&dev->refcnt); +} + +static inline void rfcomm_dev_put(struct rfcomm_dev *dev) +{ + /* The reason this isn't actually a race, as you no + doubt have a little voice screaming at you in your + head, is that the refcount should never actually + reach zero unless the device has already been taken + off the list, in rfcomm_dev_del(). And if that's not + true, we'll hit the BUG() in rfcomm_dev_destruct() + anyway. 
*/ + if (atomic_dec_and_test(&dev->refcnt)) + rfcomm_dev_destruct(dev); +} + +static struct rfcomm_dev *__rfcomm_dev_get(int id) +{ + struct rfcomm_dev *dev; + struct list_head *p; + + list_for_each(p, &rfcomm_dev_list) { + dev = list_entry(p, struct rfcomm_dev, list); + if (dev->id == id) + return dev; + } + + return NULL; +} + +static inline struct rfcomm_dev *rfcomm_dev_get(int id) +{ + struct rfcomm_dev *dev; + + read_lock(&rfcomm_dev_lock); + + dev = __rfcomm_dev_get(id); + + if (dev) { + if (test_bit(RFCOMM_TTY_RELEASED, &dev->flags)) + dev = NULL; + else + rfcomm_dev_hold(dev); + } + + read_unlock(&rfcomm_dev_lock); + + return dev; +} + +static struct device *rfcomm_get_device(struct rfcomm_dev *dev) +{ + struct hci_dev *hdev; + struct hci_conn *conn; + + hdev = hci_get_route(&dev->dst, &dev->src); + if (!hdev) + return NULL; + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &dev->dst); + + hci_dev_put(hdev); + + return conn ? &conn->dev : NULL; +} + +static ssize_t show_address(struct device *tty_dev, struct device_attribute *attr, char *buf) +{ + struct rfcomm_dev *dev = dev_get_drvdata(tty_dev); + return sprintf(buf, "%s\n", batostr(&dev->dst)); +} + +static ssize_t show_channel(struct device *tty_dev, struct device_attribute *attr, char *buf) +{ + struct rfcomm_dev *dev = dev_get_drvdata(tty_dev); + return sprintf(buf, "%d\n", dev->channel); +} + +static DEVICE_ATTR(address, S_IRUGO, show_address, NULL); +static DEVICE_ATTR(channel, S_IRUGO, show_channel, NULL); + +static int rfcomm_dev_add(struct rfcomm_dev_req *req, struct rfcomm_dlc *dlc) +{ + struct rfcomm_dev *dev; + struct list_head *head = &rfcomm_dev_list, *p; + int err = 0; + + BT_DBG("id %d channel %d", req->dev_id, req->channel); + + dev = kzalloc(sizeof(struct rfcomm_dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + + write_lock_bh(&rfcomm_dev_lock); + + if (req->dev_id < 0) { + dev->id = 0; + + list_for_each(p, &rfcomm_dev_list) { + if (list_entry(p, struct rfcomm_dev, list)->id != dev->id) + break; + + dev->id++; + head = p; + } + } else { + dev->id = req->dev_id; + + list_for_each(p, &rfcomm_dev_list) { + struct rfcomm_dev *entry = list_entry(p, struct rfcomm_dev, list); + + if (entry->id == dev->id) { + err = -EADDRINUSE; + goto out; + } + + if (entry->id > dev->id - 1) + break; + + head = p; + } + } + + if ((dev->id < 0) || (dev->id > RFCOMM_MAX_DEV - 1)) { + err = -ENFILE; + goto out; + } + + sprintf(dev->name, "rfcomm%d", dev->id); + + list_add(&dev->list, head); + atomic_set(&dev->refcnt, 1); + + bacpy(&dev->src, &req->src); + bacpy(&dev->dst, &req->dst); + dev->channel = req->channel; + + dev->flags = req->flags & + ((1 << RFCOMM_RELEASE_ONHUP) | (1 << RFCOMM_REUSE_DLC)); + + atomic_set(&dev->opened, 0); + + init_waitqueue_head(&dev->wait); + tasklet_init(&dev->wakeup_task, rfcomm_tty_wakeup, (unsigned long) dev); + + skb_queue_head_init(&dev->pending); + + rfcomm_dlc_lock(dlc); + + if (req->flags & (1 << RFCOMM_REUSE_DLC)) { + struct sock *sk = dlc->owner; + struct sk_buff *skb; + + BUG_ON(!sk); + + rfcomm_dlc_throttle(dlc); + + while ((skb = skb_dequeue(&sk->sk_receive_queue))) { + skb_orphan(skb); + skb_queue_tail(&dev->pending, skb); + atomic_sub(skb->len, &sk->sk_rmem_alloc); + } + } + + dlc->data_ready = rfcomm_dev_data_ready; + dlc->state_change = rfcomm_dev_state_change; + dlc->modem_status = rfcomm_dev_modem_status; + + dlc->owner = dev; + dev->dlc = dlc; + + rfcomm_dev_modem_status(dlc, dlc->remote_v24_sig); + + rfcomm_dlc_unlock(dlc); + + /* It's safe to call __module_get() here because socket 
already + holds reference to this module. */ + __module_get(THIS_MODULE); + +out: + write_unlock_bh(&rfcomm_dev_lock); + + if (err < 0) + goto free; + + dev->tty_dev = tty_register_device(rfcomm_tty_driver, dev->id, NULL); + + if (IS_ERR(dev->tty_dev)) { + err = PTR_ERR(dev->tty_dev); + list_del(&dev->list); + goto free; + } + + dev_set_drvdata(dev->tty_dev, dev); + + if (device_create_file(dev->tty_dev, &dev_attr_address) < 0) + BT_ERR("Failed to create address attribute"); + + if (device_create_file(dev->tty_dev, &dev_attr_channel) < 0) + BT_ERR("Failed to create channel attribute"); + + return dev->id; + +free: + kfree(dev); + return err; +} + +static void rfcomm_dev_del(struct rfcomm_dev *dev) +{ + BT_DBG("dev %p", dev); + + BUG_ON(test_and_set_bit(RFCOMM_TTY_RELEASED, &dev->flags)); + + if (atomic_read(&dev->opened) > 0) + return; + + write_lock_bh(&rfcomm_dev_lock); + list_del_init(&dev->list); + write_unlock_bh(&rfcomm_dev_lock); + + rfcomm_dev_put(dev); +} + +/* ---- Send buffer ---- */ +static inline unsigned int rfcomm_room(struct rfcomm_dlc *dlc) +{ + /* We can't let it be zero, because we don't get a callback + when tx_credits becomes nonzero, hence we'd never wake up */ + return dlc->mtu * (dlc->tx_credits?:1); +} + +static void rfcomm_wfree(struct sk_buff *skb) +{ + struct rfcomm_dev *dev = (void *) skb->sk; + atomic_sub(skb->truesize, &dev->wmem_alloc); + if (test_bit(RFCOMM_TTY_ATTACHED, &dev->flags)) + tasklet_schedule(&dev->wakeup_task); + rfcomm_dev_put(dev); +} + +static inline void rfcomm_set_owner_w(struct sk_buff *skb, struct rfcomm_dev *dev) +{ + rfcomm_dev_hold(dev); + atomic_add(skb->truesize, &dev->wmem_alloc); + skb->sk = (void *) dev; + skb->destructor = rfcomm_wfree; +} + +static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, gfp_t priority) +{ + if (atomic_read(&dev->wmem_alloc) < rfcomm_room(dev->dlc)) { + struct sk_buff *skb = alloc_skb(size, priority); + if (skb) { + rfcomm_set_owner_w(skb, dev); + return skb; + } + } + return NULL; +} + +/* ---- Device IOCTLs ---- */ + +#define NOCAP_FLAGS ((1 << RFCOMM_REUSE_DLC) | (1 << RFCOMM_RELEASE_ONHUP)) + +static int rfcomm_create_dev(struct sock *sk, void __user *arg) +{ + struct rfcomm_dev_req req; + struct rfcomm_dlc *dlc; + int id; + + if (copy_from_user(&req, arg, sizeof(req))) + return -EFAULT; + + BT_DBG("sk %p dev_id %d flags 0x%x", sk, req.dev_id, req.flags); + + if (req.flags != NOCAP_FLAGS && !capable(CAP_NET_ADMIN)) + return -EPERM; + + if (req.flags & (1 << RFCOMM_REUSE_DLC)) { + /* Socket must be connected */ + if (sk->sk_state != BT_CONNECTED) + return -EBADFD; + + dlc = rfcomm_pi(sk)->dlc; + rfcomm_dlc_hold(dlc); + } else { + dlc = rfcomm_dlc_alloc(GFP_KERNEL); + if (!dlc) + return -ENOMEM; + } + + id = rfcomm_dev_add(&req, dlc); + if (id < 0) { + rfcomm_dlc_put(dlc); + return id; + } + + if (req.flags & (1 << RFCOMM_REUSE_DLC)) { + /* DLC is now used by device. 
+ * Socket must be disconnected */ + sk->sk_state = BT_CLOSED; + } + + return id; +} + +static int rfcomm_release_dev(void __user *arg) +{ + struct rfcomm_dev_req req; + struct rfcomm_dev *dev; + + if (copy_from_user(&req, arg, sizeof(req))) + return -EFAULT; + + BT_DBG("dev_id %d flags 0x%x", req.dev_id, req.flags); + + dev = rfcomm_dev_get(req.dev_id); + if (!dev) + return -ENODEV; + + if (dev->flags != NOCAP_FLAGS && !capable(CAP_NET_ADMIN)) { + rfcomm_dev_put(dev); + return -EPERM; + } + + if (req.flags & (1 << RFCOMM_HANGUP_NOW)) + rfcomm_dlc_close(dev->dlc, 0); + + /* Shut down TTY synchronously before freeing rfcomm_dev */ + if (dev->tty) + tty_vhangup(dev->tty); + + if (!test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags)) + rfcomm_dev_del(dev); + rfcomm_dev_put(dev); + return 0; +} + +static int rfcomm_get_dev_list(void __user *arg) +{ + struct rfcomm_dev_list_req *dl; + struct rfcomm_dev_info *di; + struct list_head *p; + int n = 0, size, err; + u16 dev_num; + + BT_DBG(""); + + if (get_user(dev_num, (u16 __user *) arg)) + return -EFAULT; + + if (!dev_num || dev_num > (PAGE_SIZE * 4) / sizeof(*di)) + return -EINVAL; + + size = sizeof(*dl) + dev_num * sizeof(*di); + + dl = kmalloc(size, GFP_KERNEL); + if (!dl) + return -ENOMEM; + + di = dl->dev_info; + + read_lock_bh(&rfcomm_dev_lock); + + list_for_each(p, &rfcomm_dev_list) { + struct rfcomm_dev *dev = list_entry(p, struct rfcomm_dev, list); + if (test_bit(RFCOMM_TTY_RELEASED, &dev->flags)) + continue; + (di + n)->id = dev->id; + (di + n)->flags = dev->flags; + (di + n)->state = dev->dlc->state; + (di + n)->channel = dev->channel; + bacpy(&(di + n)->src, &dev->src); + bacpy(&(di + n)->dst, &dev->dst); + if (++n >= dev_num) + break; + } + + read_unlock_bh(&rfcomm_dev_lock); + + dl->dev_num = n; + size = sizeof(*dl) + n * sizeof(*di); + + err = copy_to_user(arg, dl, size); + kfree(dl); + + return err ? 
-EFAULT : 0; +} + +static int rfcomm_get_dev_info(void __user *arg) +{ + struct rfcomm_dev *dev; + struct rfcomm_dev_info di; + int err = 0; + + BT_DBG(""); + + if (copy_from_user(&di, arg, sizeof(di))) + return -EFAULT; + + dev = rfcomm_dev_get(di.id); + if (!dev) + return -ENODEV; + + di.flags = dev->flags; + di.channel = dev->channel; + di.state = dev->dlc->state; + bacpy(&di.src, &dev->src); + bacpy(&di.dst, &dev->dst); + + if (copy_to_user(arg, &di, sizeof(di))) + err = -EFAULT; + + rfcomm_dev_put(dev); + return err; +} + +int rfcomm_dev_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) +{ + BT_DBG("cmd %d arg %p", cmd, arg); + + switch (cmd) { + case RFCOMMCREATEDEV: + return rfcomm_create_dev(sk, arg); + + case RFCOMMRELEASEDEV: + return rfcomm_release_dev(arg); + + case RFCOMMGETDEVLIST: + return rfcomm_get_dev_list(arg); + + case RFCOMMGETDEVINFO: + return rfcomm_get_dev_info(arg); + } + + return -EINVAL; +} + +/* ---- DLC callbacks ---- */ +static void rfcomm_dev_data_ready(struct rfcomm_dlc *dlc, struct sk_buff *skb) +{ + struct rfcomm_dev *dev = dlc->owner; + struct tty_struct *tty; + + if (!dev) { + kfree_skb(skb); + return; + } + + tty = dev->tty; + if (!tty || !skb_queue_empty(&dev->pending)) { + skb_queue_tail(&dev->pending, skb); + return; + } + + BT_DBG("dlc %p tty %p len %d", dlc, tty, skb->len); + + tty_insert_flip_string(tty, skb->data, skb->len); + tty_flip_buffer_push(tty); + + kfree_skb(skb); +} + +static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err) +{ + struct rfcomm_dev *dev = dlc->owner; + if (!dev) + return; + + BT_DBG("dlc %p dev %p err %d", dlc, dev, err); + + dev->err = err; + wake_up_interruptible(&dev->wait); + + if (dlc->state == BT_CLOSED) { + if (!dev->tty) { + if (test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags)) { + /* Drop DLC lock here to avoid deadlock + * 1. rfcomm_dev_get will take rfcomm_dev_lock + * but in rfcomm_dev_add there's lock order: + * rfcomm_dev_lock -> dlc lock + * 2. rfcomm_dev_put will deadlock if it's + * the last reference + */ + rfcomm_dlc_unlock(dlc); + if (rfcomm_dev_get(dev->id) == NULL) { + rfcomm_dlc_lock(dlc); + return; + } + + rfcomm_dev_del(dev); + rfcomm_dev_put(dev); + rfcomm_dlc_lock(dlc); + } + } else + tty_hangup(dev->tty); + } +} + +static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig) +{ + struct rfcomm_dev *dev = dlc->owner; + if (!dev) + return; + + BT_DBG("dlc %p dev %p v24_sig 0x%02x", dlc, dev, v24_sig); + + if ((dev->modem_status & TIOCM_CD) && !(v24_sig & RFCOMM_V24_DV)) { + if (dev->tty && !C_CLOCAL(dev->tty)) + tty_hangup(dev->tty); + } + + dev->modem_status = + ((v24_sig & RFCOMM_V24_RTC) ? (TIOCM_DSR | TIOCM_DTR) : 0) | + ((v24_sig & RFCOMM_V24_RTR) ? (TIOCM_RTS | TIOCM_CTS) : 0) | + ((v24_sig & RFCOMM_V24_IC) ? TIOCM_RI : 0) | + ((v24_sig & RFCOMM_V24_DV) ? 
TIOCM_CD : 0); +} + +/* ---- TTY functions ---- */ +static void rfcomm_tty_wakeup(unsigned long arg) +{ + struct rfcomm_dev *dev = (void *) arg; + struct tty_struct *tty = dev->tty; + if (!tty) + return; + + BT_DBG("dev %p tty %p", dev, tty); + tty_wakeup(tty); +} + +static void rfcomm_tty_copy_pending(struct rfcomm_dev *dev) +{ + struct tty_struct *tty = dev->tty; + struct sk_buff *skb; + int inserted = 0; + + if (!tty) + return; + + BT_DBG("dev %p tty %p", dev, tty); + + rfcomm_dlc_lock(dev->dlc); + + while ((skb = skb_dequeue(&dev->pending))) { + inserted += tty_insert_flip_string(tty, skb->data, skb->len); + kfree_skb(skb); + } + + rfcomm_dlc_unlock(dev->dlc); + + if (inserted > 0) + tty_flip_buffer_push(tty); +} + +static int rfcomm_tty_open(struct tty_struct *tty, struct file *filp) +{ + DECLARE_WAITQUEUE(wait, current); + struct rfcomm_dev *dev; + struct rfcomm_dlc *dlc; + int err, id; + + id = tty->index; + + BT_DBG("tty %p id %d", tty, id); + + /* We don't leak this refcount. For reasons which are not entirely + clear, the TTY layer will call our ->close() method even if the + open fails. We decrease the refcount there, and decreasing it + here too would cause breakage. */ + dev = rfcomm_dev_get(id); + if (!dev) + return -ENODEV; + + BT_DBG("dev %p dst %s channel %d opened %d", dev, batostr(&dev->dst), + dev->channel, atomic_read(&dev->opened)); + + if (atomic_inc_return(&dev->opened) > 1) + return 0; + + dlc = dev->dlc; + + /* Attach TTY and open DLC */ + + rfcomm_dlc_lock(dlc); + tty->driver_data = dev; + dev->tty = tty; + rfcomm_dlc_unlock(dlc); + set_bit(RFCOMM_TTY_ATTACHED, &dev->flags); + + err = rfcomm_dlc_open(dlc, &dev->src, &dev->dst, dev->channel); + if (err < 0) + return err; + + /* Wait for DLC to connect */ + add_wait_queue(&dev->wait, &wait); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + if (dlc->state == BT_CLOSED) { + err = -dev->err; + break; + } + + if (dlc->state == BT_CONNECTED) + break; + + if (signal_pending(current)) { + err = -EINTR; + break; + } + + tty_unlock(); + schedule(); + tty_lock(); + } + set_current_state(TASK_RUNNING); + remove_wait_queue(&dev->wait, &wait); + + if (err == 0) + device_move(dev->tty_dev, rfcomm_get_device(dev), + DPM_ORDER_DEV_AFTER_PARENT); + + rfcomm_tty_copy_pending(dev); + + rfcomm_dlc_unthrottle(dev->dlc); + + return err; +} + +static void rfcomm_tty_close(struct tty_struct *tty, struct file *filp) +{ + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + if (!dev) + return; + + BT_DBG("tty %p dev %p dlc %p opened %d", tty, dev, dev->dlc, + atomic_read(&dev->opened)); + + if (atomic_dec_and_test(&dev->opened)) { + if (dev->tty_dev->parent) + device_move(dev->tty_dev, NULL, DPM_ORDER_DEV_LAST); + + /* Close DLC and detach TTY */ + rfcomm_dlc_close(dev->dlc, 0); + + clear_bit(RFCOMM_TTY_ATTACHED, &dev->flags); + tasklet_kill(&dev->wakeup_task); + + rfcomm_dlc_lock(dev->dlc); + tty->driver_data = NULL; + dev->tty = NULL; + rfcomm_dlc_unlock(dev->dlc); + + if (test_bit(RFCOMM_TTY_RELEASED, &dev->flags)) { + write_lock_bh(&rfcomm_dev_lock); + list_del_init(&dev->list); + write_unlock_bh(&rfcomm_dev_lock); + + rfcomm_dev_put(dev); + } + } + + rfcomm_dev_put(dev); +} + +static int rfcomm_tty_write(struct tty_struct *tty, const unsigned char *buf, int count) +{ + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + struct rfcomm_dlc *dlc = dev->dlc; + struct sk_buff *skb; + int err = 0, sent = 0, size; + + BT_DBG("tty %p count %d", tty, count); + + while (count) { + size = min_t(uint, count,
dlc->mtu); + + skb = rfcomm_wmalloc(dev, size + RFCOMM_SKB_RESERVE, GFP_ATOMIC); + + if (!skb) + break; + + skb_reserve(skb, RFCOMM_SKB_HEAD_RESERVE); + + memcpy(skb_put(skb, size), buf + sent, size); + + err = rfcomm_dlc_send(dlc, skb); + if (err < 0) { + kfree_skb(skb); + break; + } + + sent += size; + count -= size; + } + + return sent ? sent : err; +} + +static int rfcomm_tty_write_room(struct tty_struct *tty) +{ + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + int room; + + BT_DBG("tty %p", tty); + + if (!dev || !dev->dlc) + return 0; + + room = rfcomm_room(dev->dlc) - atomic_read(&dev->wmem_alloc); + if (room < 0) + room = 0; + + return room; +} + +static int rfcomm_tty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) +{ + BT_DBG("tty %p cmd 0x%02x", tty, cmd); + + switch (cmd) { + case TCGETS: + BT_DBG("TCGETS is not supported"); + return -ENOIOCTLCMD; + + case TCSETS: + BT_DBG("TCSETS is not supported"); + return -ENOIOCTLCMD; + + case TIOCMIWAIT: + BT_DBG("TIOCMIWAIT"); + break; + + case TIOCGSERIAL: + BT_ERR("TIOCGSERIAL is not supported"); + return -ENOIOCTLCMD; + + case TIOCSSERIAL: + BT_ERR("TIOCSSERIAL is not supported"); + return -ENOIOCTLCMD; + + case TIOCSERGSTRUCT: + BT_ERR("TIOCSERGSTRUCT is not supported"); + return -ENOIOCTLCMD; + + case TIOCSERGETLSR: + BT_ERR("TIOCSERGETLSR is not supported"); + return -ENOIOCTLCMD; + + case TIOCSERCONFIG: + BT_ERR("TIOCSERCONFIG is not supported"); + return -ENOIOCTLCMD; + + default: + return -ENOIOCTLCMD; /* ioctls which we must ignore */ + + } + + return -ENOIOCTLCMD; +} + +static void rfcomm_tty_set_termios(struct tty_struct *tty, struct ktermios *old) +{ + struct ktermios *new = tty->termios; + int old_baud_rate = tty_termios_baud_rate(old); + int new_baud_rate = tty_termios_baud_rate(new); + + u8 baud, data_bits, stop_bits, parity, x_on, x_off; + u16 changes = 0; + + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + + BT_DBG("tty %p termios %p", tty, old); + + if (!dev || !dev->dlc || !dev->dlc->session) + return; + + /* Handle turning off CRTSCTS */ + if ((old->c_cflag & CRTSCTS) && !(new->c_cflag & CRTSCTS)) + BT_DBG("Turning off CRTSCTS unsupported"); + + /* Parity on/off and when on, odd/even */ + if (((old->c_cflag & PARENB) != (new->c_cflag & PARENB)) || + ((old->c_cflag & PARODD) != (new->c_cflag & PARODD))) { + changes |= RFCOMM_RPN_PM_PARITY; + BT_DBG("Parity change detected."); + } + + /* Mark and space parity are not supported! */ + if (new->c_cflag & PARENB) { + if (new->c_cflag & PARODD) { + BT_DBG("Parity is ODD"); + parity = RFCOMM_RPN_PARITY_ODD; + } else { + BT_DBG("Parity is EVEN"); + parity = RFCOMM_RPN_PARITY_EVEN; + } + } else { + BT_DBG("Parity is OFF"); + parity = RFCOMM_RPN_PARITY_NONE; + } + + /* Setting the x_on / x_off characters */ + if (old->c_cc[VSTOP] != new->c_cc[VSTOP]) { + BT_DBG("XOFF custom"); + x_on = new->c_cc[VSTOP]; + changes |= RFCOMM_RPN_PM_XON; + } else { + BT_DBG("XOFF default"); + x_on = RFCOMM_RPN_XON_CHAR; + } + + if (old->c_cc[VSTART] != new->c_cc[VSTART]) { + BT_DBG("XON custom"); + x_off = new->c_cc[VSTART]; + changes |= RFCOMM_RPN_PM_XOFF; + } else { + BT_DBG("XON default"); + x_off = RFCOMM_RPN_XOFF_CHAR; + } + + /* Handle setting of stop bits */ + if ((old->c_cflag & CSTOPB) != (new->c_cflag & CSTOPB)) + changes |= RFCOMM_RPN_PM_STOP; + + /* POSIX does not support 1.5 stop bits and RFCOMM does not + * support 2 stop bits. 
So a request for 2 stop bits gets + * translated to 1.5 stop bits */ + if (new->c_cflag & CSTOPB) + stop_bits = RFCOMM_RPN_STOP_15; + else + stop_bits = RFCOMM_RPN_STOP_1; + + /* Handle number of data bits [5-8] */ + if ((old->c_cflag & CSIZE) != (new->c_cflag & CSIZE)) + changes |= RFCOMM_RPN_PM_DATA; + + switch (new->c_cflag & CSIZE) { + case CS5: + data_bits = RFCOMM_RPN_DATA_5; + break; + case CS6: + data_bits = RFCOMM_RPN_DATA_6; + break; + case CS7: + data_bits = RFCOMM_RPN_DATA_7; + break; + case CS8: + data_bits = RFCOMM_RPN_DATA_8; + break; + default: + data_bits = RFCOMM_RPN_DATA_8; + break; + } + + /* Handle baudrate settings */ + if (old_baud_rate != new_baud_rate) + changes |= RFCOMM_RPN_PM_BITRATE; + + switch (new_baud_rate) { + case 2400: + baud = RFCOMM_RPN_BR_2400; + break; + case 4800: + baud = RFCOMM_RPN_BR_4800; + break; + case 7200: + baud = RFCOMM_RPN_BR_7200; + break; + case 9600: + baud = RFCOMM_RPN_BR_9600; + break; + case 19200: + baud = RFCOMM_RPN_BR_19200; + break; + case 38400: + baud = RFCOMM_RPN_BR_38400; + break; + case 57600: + baud = RFCOMM_RPN_BR_57600; + break; + case 115200: + baud = RFCOMM_RPN_BR_115200; + break; + case 230400: + baud = RFCOMM_RPN_BR_230400; + break; + default: + /* 9600 is standard according to the RFCOMM specification */ + baud = RFCOMM_RPN_BR_9600; + break; + + } + + if (changes) + rfcomm_send_rpn(dev->dlc->session, 1, dev->dlc->dlci, baud, + data_bits, stop_bits, parity, + RFCOMM_RPN_FLOW_NONE, x_on, x_off, changes); +} + +static void rfcomm_tty_throttle(struct tty_struct *tty) +{ + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + + BT_DBG("tty %p dev %p", tty, dev); + + rfcomm_dlc_throttle(dev->dlc); +} + +static void rfcomm_tty_unthrottle(struct tty_struct *tty) +{ + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + + BT_DBG("tty %p dev %p", tty, dev); + + rfcomm_dlc_unthrottle(dev->dlc); +} + +static int rfcomm_tty_chars_in_buffer(struct tty_struct *tty) +{ + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + + BT_DBG("tty %p dev %p", tty, dev); + + if (!dev || !dev->dlc) + return 0; + + if (!skb_queue_empty(&dev->dlc->tx_queue)) + return dev->dlc->mtu; + + return 0; +} + +static void rfcomm_tty_flush_buffer(struct tty_struct *tty) +{ + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + + BT_DBG("tty %p dev %p", tty, dev); + + if (!dev || !dev->dlc) + return; + + skb_queue_purge(&dev->dlc->tx_queue); + tty_wakeup(tty); +} + +static void rfcomm_tty_send_xchar(struct tty_struct *tty, char ch) +{ + BT_DBG("tty %p ch %c", tty, ch); +} + +static void rfcomm_tty_wait_until_sent(struct tty_struct *tty, int timeout) +{ + BT_DBG("tty %p timeout %d", tty, timeout); +} + +static void rfcomm_tty_hangup(struct tty_struct *tty) +{ + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + + BT_DBG("tty %p dev %p", tty, dev); + + if (!dev) + return; + + rfcomm_tty_flush_buffer(tty); + + if (test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags)) { + if (rfcomm_dev_get(dev->id) == NULL) + return; + rfcomm_dev_del(dev); + rfcomm_dev_put(dev); + } +} + +static int rfcomm_tty_tiocmget(struct tty_struct *tty) +{ + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + + BT_DBG("tty %p dev %p", tty, dev); + + return dev->modem_status; +} + +static int rfcomm_tty_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) +{ + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + struct rfcomm_dlc *dlc = dev->dlc; + u8 v24_sig; + + BT_DBG("tty %p
dev %p set 0x%02x clear 0x%02x", tty, dev, set, clear); + + rfcomm_dlc_get_modem_status(dlc, &v24_sig); + + if (set & TIOCM_DSR || set & TIOCM_DTR) + v24_sig |= RFCOMM_V24_RTC; + if (set & TIOCM_RTS || set & TIOCM_CTS) + v24_sig |= RFCOMM_V24_RTR; + if (set & TIOCM_RI) + v24_sig |= RFCOMM_V24_IC; + if (set & TIOCM_CD) + v24_sig |= RFCOMM_V24_DV; + + if (clear & TIOCM_DSR || clear & TIOCM_DTR) + v24_sig &= ~RFCOMM_V24_RTC; + if (clear & TIOCM_RTS || clear & TIOCM_CTS) + v24_sig &= ~RFCOMM_V24_RTR; + if (clear & TIOCM_RI) + v24_sig &= ~RFCOMM_V24_IC; + if (clear & TIOCM_CD) + v24_sig &= ~RFCOMM_V24_DV; + + rfcomm_dlc_set_modem_status(dlc, v24_sig); + + return 0; +} + +/* ---- TTY structure ---- */ + +static const struct tty_operations rfcomm_ops = { + .open = rfcomm_tty_open, + .close = rfcomm_tty_close, + .write = rfcomm_tty_write, + .write_room = rfcomm_tty_write_room, + .chars_in_buffer = rfcomm_tty_chars_in_buffer, + .flush_buffer = rfcomm_tty_flush_buffer, + .ioctl = rfcomm_tty_ioctl, + .throttle = rfcomm_tty_throttle, + .unthrottle = rfcomm_tty_unthrottle, + .set_termios = rfcomm_tty_set_termios, + .send_xchar = rfcomm_tty_send_xchar, + .hangup = rfcomm_tty_hangup, + .wait_until_sent = rfcomm_tty_wait_until_sent, + .tiocmget = rfcomm_tty_tiocmget, + .tiocmset = rfcomm_tty_tiocmset, +}; + +int __init rfcomm_init_ttys(void) +{ + rfcomm_tty_driver = alloc_tty_driver(RFCOMM_TTY_PORTS); + if (!rfcomm_tty_driver) + return -1; + + rfcomm_tty_driver->owner = THIS_MODULE; + rfcomm_tty_driver->driver_name = "rfcomm"; + rfcomm_tty_driver->name = "rfcomm"; + rfcomm_tty_driver->major = RFCOMM_TTY_MAJOR; + rfcomm_tty_driver->minor_start = RFCOMM_TTY_MINOR; + rfcomm_tty_driver->type = TTY_DRIVER_TYPE_SERIAL; + rfcomm_tty_driver->subtype = SERIAL_TYPE_NORMAL; + rfcomm_tty_driver->flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV; + rfcomm_tty_driver->init_termios = tty_std_termios; + rfcomm_tty_driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL; + rfcomm_tty_driver->init_termios.c_lflag &= ~ICANON; + tty_set_operations(rfcomm_tty_driver, &rfcomm_ops); + + if (tty_register_driver(rfcomm_tty_driver)) { + BT_ERR("Can't register RFCOMM TTY driver"); + put_tty_driver(rfcomm_tty_driver); + return -1; + } + + BT_INFO("RFCOMM TTY layer initialized"); + + return 0; +} + +void rfcomm_cleanup_ttys(void) +{ + tty_unregister_driver(rfcomm_tty_driver); + put_tty_driver(rfcomm_tty_driver); +} diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c new file mode 100644 index 00000000..d3d48b5b --- /dev/null +++ b/net/bluetooth/sco.c @@ -0,0 +1,1104 @@ +/* + BlueZ - Bluetooth protocol stack for Linux + Copyright (C) 2000-2001 Qualcomm Incorporated + + Written 2000,2001 by Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +/* Bluetooth SCO sockets. */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +static int disable_esco; + +static const struct proto_ops sco_sock_ops; + +static struct bt_sock_list sco_sk_list = { + .lock = __RW_LOCK_UNLOCKED(sco_sk_list.lock) +}; + +static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent); +static void sco_chan_del(struct sock *sk, int err); + +static int sco_conn_del(struct hci_conn *conn, int err); + +static void sco_sock_close(struct sock *sk); +static void sco_sock_kill(struct sock *sk); + +/* ---- SCO timers ---- */ +static void sco_sock_timeout(unsigned long arg) +{ + struct sock *sk = (struct sock *) arg; + + BT_DBG("sock %p state %d", sk, sk->sk_state); + + bh_lock_sock(sk); + sk->sk_err = ETIMEDOUT; + sk->sk_state_change(sk); + bh_unlock_sock(sk); + + sco_sock_kill(sk); + sock_put(sk); +} + +static void sco_sock_set_timer(struct sock *sk, long timeout) +{ + BT_DBG("sock %p state %d timeout %ld", sk, sk->sk_state, timeout); + sk_reset_timer(sk, &sk->sk_timer, jiffies + timeout); +} + +static void sco_sock_clear_timer(struct sock *sk) +{ + BT_DBG("sock %p state %d", sk, sk->sk_state); + sk_stop_timer(sk, &sk->sk_timer); +} + +/* ---- SCO connections ---- */ +static struct sco_conn *sco_conn_add(struct hci_conn *hcon, __u8 status) +{ + struct hci_dev *hdev = hcon->hdev; + struct sco_conn *conn = hcon->sco_data; + + if (conn || status) + return conn; + + conn = kzalloc(sizeof(struct sco_conn), GFP_ATOMIC); + if (!conn) + return NULL; + + spin_lock_init(&conn->lock); + + hcon->sco_data = conn; + conn->hcon = hcon; + + conn->src = &hdev->bdaddr; + conn->dst = &hcon->dst; + + if (hdev->sco_mtu > 0) + conn->mtu = hdev->sco_mtu; + else + conn->mtu = 60; + + BT_DBG("hcon %p conn %p", hcon, conn); + + return conn; +} + +static inline struct sock *sco_chan_get(struct sco_conn *conn) +{ + struct sock *sk = NULL; + sco_conn_lock(conn); + sk = conn->sk; + sco_conn_unlock(conn); + return sk; +} + +static int sco_conn_del(struct hci_conn *hcon, int err) +{ + struct sco_conn *conn = hcon->sco_data; + struct sock *sk; + + if (!conn) + return 0; + + BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); + + /* Kill socket */ + sk = sco_chan_get(conn); + if (sk) { + bh_lock_sock(sk); + sco_sock_clear_timer(sk); + sco_chan_del(sk, err); + bh_unlock_sock(sk); + sco_sock_kill(sk); + } + + hcon->sco_data = NULL; + kfree(conn); + return 0; +} + +static inline int sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent) +{ + int err = 0; + + sco_conn_lock(conn); + if (conn->sk) + err = -EBUSY; + else + __sco_chan_add(conn, sk, parent); + + sco_conn_unlock(conn); + return err; +} + +static int sco_connect(struct sock *sk) +{ + bdaddr_t *src = &bt_sk(sk)->src; + bdaddr_t *dst = &bt_sk(sk)->dst; + __u16 pkt_type = sco_pi(sk)->pkt_type; + struct sco_conn *conn; + struct hci_conn *hcon; + struct hci_dev *hdev; + int err, type; + + BT_DBG("%s -> %s", batostr(src), batostr(dst)); + + hdev = hci_get_route(dst, src); + if (!hdev) + return -EHOSTUNREACH; + + hci_dev_lock_bh(hdev); + + if (lmp_esco_capable(hdev) && !disable_esco) + type = ESCO_LINK; + else { + type = SCO_LINK; + pkt_type &= SCO_ESCO_MASK; + } + 
+ hcon = hci_connect(hdev, type, pkt_type, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING); + if (IS_ERR(hcon)) { + err = PTR_ERR(hcon); + goto done; + } + + conn = sco_conn_add(hcon, 0); + if (!conn) { + hci_conn_put(hcon); + err = -ENOMEM; + goto done; + } + + /* Update source addr of the socket */ + bacpy(src, conn->src); + + err = sco_chan_add(conn, sk, NULL); + if (err) + goto done; + + if (hcon->state == BT_CONNECTED) { + sco_sock_clear_timer(sk); + sk->sk_state = BT_CONNECTED; + } else { + sk->sk_state = BT_CONNECT; + sco_sock_set_timer(sk, sk->sk_sndtimeo); + } + +done: + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + return err; +} + +static inline int sco_send_frame(struct sock *sk, struct msghdr *msg, int len) +{ + struct sco_conn *conn = sco_pi(sk)->conn; + struct sk_buff *skb; + int err, count; + + /* Check outgoing MTU */ + if (len > conn->mtu) + return -EINVAL; + + BT_DBG("sk %p len %d", sk, len); + + count = min_t(unsigned int, conn->mtu, len); + skb = bt_skb_send_alloc(sk, count, + msg->msg_flags & MSG_DONTWAIT, &err); + if (!skb) + return err; + + if (memcpy_fromiovec(skb_put(skb, count), msg->msg_iov, count)) { + kfree_skb(skb); + return -EFAULT; + } + + hci_send_sco(conn->hcon, skb); + + return count; +} + +static inline void sco_recv_frame(struct sco_conn *conn, struct sk_buff *skb) +{ + struct sock *sk = sco_chan_get(conn); + + if (!sk) + goto drop; + + BT_DBG("sk %p len %d", sk, skb->len); + + if (sk->sk_state != BT_CONNECTED) + goto drop; + + if (!sock_queue_rcv_skb(sk, skb)) + return; + +drop: + kfree_skb(skb); +} + +/* -------- Socket interface ---------- */ +static struct sock *__sco_get_sock_by_addr(bdaddr_t *ba) +{ + struct sock *sk; + struct hlist_node *node; + + sk_for_each(sk, node, &sco_sk_list.head) + if (!bacmp(&bt_sk(sk)->src, ba)) + goto found; + sk = NULL; +found: + return sk; +} + +/* Find socket listening on source bdaddr. + * Returns closest match. + */ +static struct sock *sco_get_sock_listen(bdaddr_t *src) +{ + struct sock *sk = NULL, *sk1 = NULL; + struct hlist_node *node; + + read_lock(&sco_sk_list.lock); + + sk_for_each(sk, node, &sco_sk_list.head) { + if (sk->sk_state != BT_LISTEN) + continue; + + /* Exact match. */ + if (!bacmp(&bt_sk(sk)->src, src)) + break; + + /* Closest match */ + if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY)) + sk1 = sk; + } + + read_unlock(&sco_sk_list.lock); + + return node ? sk : sk1; +} + +static void sco_sock_destruct(struct sock *sk) +{ + BT_DBG("sk %p", sk); + + skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge(&sk->sk_write_queue); +} + +static void sco_sock_cleanup_listen(struct sock *parent) +{ + struct sock *sk; + + BT_DBG("parent %p", parent); + + /* Close not yet accepted channels */ + while ((sk = bt_accept_dequeue(parent, NULL))) { + sco_sock_close(sk); + sco_sock_kill(sk); + } + + parent->sk_state = BT_CLOSED; + sock_set_flag(parent, SOCK_ZAPPED); +} + +/* Kill socket (only if zapped and orphan) + * Must be called on unlocked socket. 
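+ * It unlinks the socket from sco_sk_list, marks it SOCK_DEAD and drops a reference.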
+ */ +static void sco_sock_kill(struct sock *sk) +{ + if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket) + return; + + BT_DBG("sk %p state %d", sk, sk->sk_state); + + /* Kill poor orphan */ + bt_sock_unlink(&sco_sk_list, sk); + sock_set_flag(sk, SOCK_DEAD); + sock_put(sk); +} + +static void __sco_sock_close(struct sock *sk) +{ + BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket); + + switch (sk->sk_state) { + case BT_LISTEN: + sco_sock_cleanup_listen(sk); + break; + + case BT_CONNECTED: + case BT_CONFIG: + if (sco_pi(sk)->conn) { + sk->sk_state = BT_DISCONN; + sco_sock_set_timer(sk, SCO_DISCONN_TIMEOUT); + hci_conn_put(sco_pi(sk)->conn->hcon); + sco_pi(sk)->conn->hcon = NULL; + } else + sco_chan_del(sk, ECONNRESET); + break; + + case BT_CONNECT: + case BT_DISCONN: + sco_chan_del(sk, ECONNRESET); + break; + + default: + sock_set_flag(sk, SOCK_ZAPPED); + break; + } +} + +/* Must be called on unlocked socket. */ +static void sco_sock_close(struct sock *sk) +{ + sco_sock_clear_timer(sk); + lock_sock(sk); + __sco_sock_close(sk); + release_sock(sk); + sco_sock_kill(sk); +} + +static void sco_sock_init(struct sock *sk, struct sock *parent) +{ + BT_DBG("sk %p", sk); + + if (parent) + sk->sk_type = parent->sk_type; +} + +static struct proto sco_proto = { + .name = "SCO", + .owner = THIS_MODULE, + .obj_size = sizeof(struct sco_pinfo) +}; + +static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio) +{ + struct sock *sk; + + sk = sk_alloc(net, PF_BLUETOOTH, prio, &sco_proto); + if (!sk) + return NULL; + + sock_init_data(sock, sk); + INIT_LIST_HEAD(&bt_sk(sk)->accept_q); + + sk->sk_destruct = sco_sock_destruct; + sk->sk_sndtimeo = SCO_CONN_TIMEOUT; + + sock_reset_flag(sk, SOCK_ZAPPED); + + sk->sk_protocol = proto; + sk->sk_state = BT_OPEN; + + setup_timer(&sk->sk_timer, sco_sock_timeout, (unsigned long)sk); + + bt_sock_link(&sco_sk_list, sk); + return sk; +} + +static int sco_sock_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + + BT_DBG("sock %p", sock); + + sock->state = SS_UNCONNECTED; + + if (sock->type != SOCK_SEQPACKET) + return -ESOCKTNOSUPPORT; + + sock->ops = &sco_sock_ops; + + sk = sco_sock_alloc(net, sock, protocol, GFP_ATOMIC); + if (!sk) + return -ENOMEM; + + sco_sock_init(sk, NULL); + return 0; +} + +static int sco_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) +{ + struct sockaddr_sco sa; + struct sock *sk = sock->sk; + bdaddr_t *src = &sa.sco_bdaddr; + int len, err = 0; + + BT_DBG("sk %p %s", sk, batostr(&sa.sco_bdaddr)); + + if (!addr || addr->sa_family != AF_BLUETOOTH) + return -EINVAL; + + memset(&sa, 0, sizeof(sa)); + len = min_t(unsigned int, sizeof(sa), alen); + memcpy(&sa, addr, len); + + lock_sock(sk); + + if (sk->sk_state != BT_OPEN) { + err = -EBADFD; + goto done; + } + + write_lock_bh(&sco_sk_list.lock); + + if (bacmp(src, BDADDR_ANY) && __sco_get_sock_by_addr(src)) { + err = -EADDRINUSE; + } else { + /* Save source address */ + bacpy(&bt_sk(sk)->src, &sa.sco_bdaddr); + sco_pi(sk)->pkt_type = sa.sco_pkt_type; + sk->sk_state = BT_BOUND; + } + + write_unlock_bh(&sco_sk_list.lock); + +done: + release_sock(sk); + return err; +} + +static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) +{ + struct sock *sk = sock->sk; + struct sockaddr_sco sa; + int len, err = 0; + + BT_DBG("sk %p", sk); + + if (!addr || addr->sa_family != AF_BLUETOOTH) + return -EINVAL; + + memset(&sa, 0, sizeof(sa)); + len = min_t(unsigned int, sizeof(sa), 
alen); + memcpy(&sa, addr, len); + + lock_sock(sk); + + if (sk->sk_type != SOCK_SEQPACKET) { + err = -EINVAL; + goto done; + } + + if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) { + err = -EBADFD; + goto done; + } + + /* Set destination address and psm */ + bacpy(&bt_sk(sk)->dst, &sa.sco_bdaddr); + sco_pi(sk)->pkt_type = sa.sco_pkt_type; + + err = sco_connect(sk); + if (err) + goto done; + + err = bt_sock_wait_state(sk, BT_CONNECTED, + sock_sndtimeo(sk, flags & O_NONBLOCK)); + +done: + release_sock(sk); + return err; +} + +static int sco_sock_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + int err = 0; + + BT_DBG("sk %p backlog %d", sk, backlog); + + lock_sock(sk); + + if (sk->sk_state != BT_BOUND || sock->type != SOCK_SEQPACKET) { + err = -EBADFD; + goto done; + } + + sk->sk_max_ack_backlog = backlog; + sk->sk_ack_backlog = 0; + sk->sk_state = BT_LISTEN; + +done: + release_sock(sk); + return err; +} + +static int sco_sock_accept(struct socket *sock, struct socket *newsock, int flags) +{ + DECLARE_WAITQUEUE(wait, current); + struct sock *sk = sock->sk, *ch; + long timeo; + int err = 0; + + lock_sock(sk); + + timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + + BT_DBG("sk %p timeo %ld", sk, timeo); + + /* Wait for an incoming connection. (wake-one). */ + add_wait_queue_exclusive(sk_sleep(sk), &wait); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + if (sk->sk_state != BT_LISTEN) { + err = -EBADFD; + break; + } + + ch = bt_accept_dequeue(sk, newsock); + if (ch) + break; + + if (!timeo) { + err = -EAGAIN; + break; + } + + if (signal_pending(current)) { + err = sock_intr_errno(timeo); + break; + } + + release_sock(sk); + timeo = schedule_timeout(timeo); + lock_sock(sk); + } + __set_current_state(TASK_RUNNING); + remove_wait_queue(sk_sleep(sk), &wait); + + if (err) + goto done; + + newsock->state = SS_CONNECTED; + + BT_DBG("new socket %p", ch); + +done: + release_sock(sk); + return err; +} + +static int sco_sock_getname(struct socket *sock, struct sockaddr *addr, int *len, int peer) +{ + struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; + struct sock *sk = sock->sk; + + BT_DBG("sock %p, sk %p", sock, sk); + + addr->sa_family = AF_BLUETOOTH; + *len = sizeof(struct sockaddr_sco); + + if (peer) + bacpy(&sa->sco_bdaddr, &bt_sk(sk)->dst); + else + bacpy(&sa->sco_bdaddr, &bt_sk(sk)->src); + sa->sco_pkt_type = sco_pi(sk)->pkt_type; + + return 0; +} + +static int sco_sock_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + int err; + + BT_DBG("sock %p, sk %p", sock, sk); + + err = sock_error(sk); + if (err) + return err; + + if (msg->msg_flags & MSG_OOB) + return -EOPNOTSUPP; + + lock_sock(sk); + + if (sk->sk_state == BT_CONNECTED) + err = sco_send_frame(sk, msg, len); + else + err = -ENOTCONN; + + release_sock(sk); + return err; +} + +static int sco_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + int err = 0; + + BT_DBG("sk %p", sk); + + lock_sock(sk); + + switch (optname) { + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +static int sco_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct sco_options opts; + struct sco_conninfo cinfo; + int len, err = 0; + + BT_DBG("sk %p", sk); + + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + + switch (optname) { + case 
SCO_OPTIONS: + if (sk->sk_state != BT_CONNECTED) { + err = -ENOTCONN; + break; + } + + opts.mtu = sco_pi(sk)->conn->mtu; + + BT_DBG("mtu %d", opts.mtu); + + len = min_t(unsigned int, len, sizeof(opts)); + if (copy_to_user(optval, (char *)&opts, len)) + err = -EFAULT; + + break; + + case SCO_CONNINFO: + if (sk->sk_state != BT_CONNECTED) { + err = -ENOTCONN; + break; + } + + memset(&cinfo, 0, sizeof(cinfo)); + cinfo.hci_handle = sco_pi(sk)->conn->hcon->handle; + memcpy(cinfo.dev_class, sco_pi(sk)->conn->hcon->dev_class, 3); + + len = min_t(unsigned int, len, sizeof(cinfo)); + if (copy_to_user(optval, (char *)&cinfo, len)) + err = -EFAULT; + + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + int len, err = 0; + + BT_DBG("sk %p", sk); + + if (level == SOL_SCO) + return sco_sock_getsockopt_old(sock, optname, optval, optlen); + + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + + switch (optname) { + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +static int sco_sock_shutdown(struct socket *sock, int how) +{ + struct sock *sk = sock->sk; + int err = 0; + + BT_DBG("sock %p, sk %p", sock, sk); + + if (!sk) + return 0; + + lock_sock(sk); + if (!sk->sk_shutdown) { + sk->sk_shutdown = SHUTDOWN_MASK; + sco_sock_clear_timer(sk); + __sco_sock_close(sk); + + if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime) + err = bt_sock_wait_state(sk, BT_CLOSED, + sk->sk_lingertime); + } + release_sock(sk); + return err; +} + +static int sco_sock_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + int err = 0; + + BT_DBG("sock %p, sk %p", sock, sk); + + if (!sk) + return 0; + + sco_sock_close(sk); + + if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime) { + lock_sock(sk); + err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); + release_sock(sk); + } + + sock_orphan(sk); + sco_sock_kill(sk); + return err; +} + +static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent) +{ + BT_DBG("conn %p", conn); + + sco_pi(sk)->conn = conn; + conn->sk = sk; + + if (parent) + bt_accept_enqueue(parent, sk); +} + +/* Delete channel. + * Must be called on the locked socket. 
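+ * Clears the conn <-> sk links, drops the hci_conn reference and zaps the socket.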
*/ +static void sco_chan_del(struct sock *sk, int err) +{ + struct sco_conn *conn; + + conn = sco_pi(sk)->conn; + + BT_DBG("sk %p, conn %p, err %d", sk, conn, err); + + if (conn) { + sco_conn_lock(conn); + conn->sk = NULL; + sco_pi(sk)->conn = NULL; + sco_conn_unlock(conn); + + if (conn->hcon) + hci_conn_put(conn->hcon); + } + + sk->sk_state = BT_CLOSED; + sk->sk_err = err; + sk->sk_state_change(sk); + + sock_set_flag(sk, SOCK_ZAPPED); +} + +static void sco_conn_ready(struct sco_conn *conn) +{ + struct sock *parent; + struct sock *sk = conn->sk; + + BT_DBG("conn %p", conn); + + sco_conn_lock(conn); + + if (sk) { + sco_sock_clear_timer(sk); + bh_lock_sock(sk); + sk->sk_state = BT_CONNECTED; + sk->sk_state_change(sk); + bh_unlock_sock(sk); + } else { + parent = sco_get_sock_listen(conn->src); + if (!parent) + goto done; + + bh_lock_sock(parent); + + sk = sco_sock_alloc(sock_net(parent), NULL, + BTPROTO_SCO, GFP_ATOMIC); + if (!sk) { + bh_unlock_sock(parent); + goto done; + } + + sco_sock_init(sk, parent); + + bacpy(&bt_sk(sk)->src, conn->src); + bacpy(&bt_sk(sk)->dst, conn->dst); + + hci_conn_hold(conn->hcon); + __sco_chan_add(conn, sk, parent); + + sk->sk_state = BT_CONNECTED; + + /* Wake up parent */ + parent->sk_data_ready(parent, 1); + + bh_unlock_sock(parent); + } + +done: + sco_conn_unlock(conn); +} + +/* ----- SCO interface with lower layer (HCI) ----- */ +static int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 type) +{ + register struct sock *sk; + struct hlist_node *node; + int lm = 0; + + if (type != SCO_LINK && type != ESCO_LINK) + return -EINVAL; + + BT_DBG("hdev %s, bdaddr %s", hdev->name, batostr(bdaddr)); + + /* Find listening sockets */ + read_lock(&sco_sk_list.lock); + sk_for_each(sk, node, &sco_sk_list.head) { + if (sk->sk_state != BT_LISTEN) + continue; + + if (!bacmp(&bt_sk(sk)->src, &hdev->bdaddr) || + !bacmp(&bt_sk(sk)->src, BDADDR_ANY)) { + lm |= HCI_LM_ACCEPT; + break; + } + } + read_unlock(&sco_sk_list.lock); + + return lm; +} + +static int sco_connect_cfm(struct hci_conn *hcon, __u8 status) +{ + BT_DBG("hcon %p bdaddr %s status %d", hcon, batostr(&hcon->dst), status); + + if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK) + return -EINVAL; + + if (!status) { + struct sco_conn *conn; + + conn = sco_conn_add(hcon, status); + if (conn) + sco_conn_ready(conn); + } else + sco_conn_del(hcon, bt_to_errno(status)); + + return 0; +} + +static int sco_disconn_cfm(struct hci_conn *hcon, __u8 reason) +{ + BT_DBG("hcon %p reason %d", hcon, reason); + + if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK) + return -EINVAL; + + sco_conn_del(hcon, bt_to_errno(reason)); + + return 0; +} + +static int sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) +{ + struct sco_conn *conn = hcon->sco_data; + + if (!conn) + goto drop; + + BT_DBG("conn %p len %d", conn, skb->len); + + if (skb->len) { + sco_recv_frame(conn, skb); + return 0; + } + +drop: + kfree_skb(skb); + return 0; +} + +static int sco_debugfs_show(struct seq_file *f, void *p) +{ + struct sock *sk; + struct hlist_node *node; + + read_lock_bh(&sco_sk_list.lock); + + sk_for_each(sk, node, &sco_sk_list.head) { + seq_printf(f, "%s %s %d\n", batostr(&bt_sk(sk)->src), + batostr(&bt_sk(sk)->dst), sk->sk_state); + } + + read_unlock_bh(&sco_sk_list.lock); + + return 0; +} + +static int sco_debugfs_open(struct inode *inode, struct file *file) +{ + return single_open(file, sco_debugfs_show, inode->i_private); +} + +static const struct file_operations sco_debugfs_fops = { + .open = sco_debugfs_open, + .read = 
seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *sco_debugfs; + +static const struct proto_ops sco_sock_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .release = sco_sock_release, + .bind = sco_sock_bind, + .connect = sco_sock_connect, + .listen = sco_sock_listen, + .accept = sco_sock_accept, + .getname = sco_sock_getname, + .sendmsg = sco_sock_sendmsg, + .recvmsg = bt_sock_recvmsg, + .poll = bt_sock_poll, + .ioctl = bt_sock_ioctl, + .mmap = sock_no_mmap, + .socketpair = sock_no_socketpair, + .shutdown = sco_sock_shutdown, + .setsockopt = sco_sock_setsockopt, + .getsockopt = sco_sock_getsockopt +}; + +static const struct net_proto_family sco_sock_family_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .create = sco_sock_create, +}; + +static struct hci_proto sco_hci_proto = { + .name = "SCO", + .id = HCI_PROTO_SCO, + .connect_ind = sco_connect_ind, + .connect_cfm = sco_connect_cfm, + .disconn_cfm = sco_disconn_cfm, + .recv_scodata = sco_recv_scodata +}; + +int __init sco_init(void) +{ + int err; + + err = proto_register(&sco_proto, 0); + if (err < 0) + return err; + + err = bt_sock_register(BTPROTO_SCO, &sco_sock_family_ops); + if (err < 0) { + BT_ERR("SCO socket registration failed"); + goto error; + } + + err = hci_register_proto(&sco_hci_proto); + if (err < 0) { + BT_ERR("SCO protocol registration failed"); + bt_sock_unregister(BTPROTO_SCO); + goto error; + } + + if (bt_debugfs) { + sco_debugfs = debugfs_create_file("sco", 0444, + bt_debugfs, NULL, &sco_debugfs_fops); + if (!sco_debugfs) + BT_ERR("Failed to create SCO debug file"); + } + + BT_INFO("SCO socket layer initialized"); + + return 0; + +error: + proto_unregister(&sco_proto); + return err; +} + +void __exit sco_exit(void) +{ + debugfs_remove(sco_debugfs); + + if (bt_sock_unregister(BTPROTO_SCO) < 0) + BT_ERR("SCO socket unregistration failed"); + + if (hci_unregister_proto(&sco_hci_proto) < 0) + BT_ERR("SCO protocol unregistration failed"); + + proto_unregister(&sco_proto); +} + +module_param(disable_esco, bool, 0644); +MODULE_PARM_DESC(disable_esco, "Disable eSCO connection creation"); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c new file mode 100644 index 00000000..391888b8 --- /dev/null +++ b/net/bluetooth/smp.c @@ -0,0 +1,702 @@ +/* + BlueZ - Bluetooth protocol stack for Linux + Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies). + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. 
+*/ + +#include +#include +#include +#include +#include +#include +#include + +#define SMP_TIMEOUT 30000 /* 30 seconds */ + +static inline void swap128(u8 src[16], u8 dst[16]) +{ + int i; + for (i = 0; i < 16; i++) + dst[15 - i] = src[i]; +} + +static inline void swap56(u8 src[7], u8 dst[7]) +{ + int i; + for (i = 0; i < 7; i++) + dst[6 - i] = src[i]; +} + +static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r) +{ + struct blkcipher_desc desc; + struct scatterlist sg; + int err, iv_len; + unsigned char iv[128]; + + if (tfm == NULL) { + BT_ERR("tfm %p", tfm); + return -EINVAL; + } + + desc.tfm = tfm; + desc.flags = 0; + + err = crypto_blkcipher_setkey(tfm, k, 16); + if (err) { + BT_ERR("cipher setkey failed: %d", err); + return err; + } + + sg_init_one(&sg, r, 16); + + iv_len = crypto_blkcipher_ivsize(tfm); + if (iv_len) { + memset(&iv, 0xff, iv_len); + crypto_blkcipher_set_iv(tfm, iv, iv_len); + } + + err = crypto_blkcipher_encrypt(&desc, &sg, &sg, 16); + if (err) + BT_ERR("Encrypt data error %d", err); + + return err; +} + +static int smp_c1(struct crypto_blkcipher *tfm, u8 k[16], u8 r[16], + u8 preq[7], u8 pres[7], u8 _iat, bdaddr_t *ia, + u8 _rat, bdaddr_t *ra, u8 res[16]) +{ + u8 p1[16], p2[16]; + int err; + + memset(p1, 0, 16); + + /* p1 = pres || preq || _rat || _iat */ + swap56(pres, p1); + swap56(preq, p1 + 7); + p1[14] = _rat; + p1[15] = _iat; + + memset(p2, 0, 16); + + /* p2 = padding || ia || ra */ + baswap((bdaddr_t *) (p2 + 4), ia); + baswap((bdaddr_t *) (p2 + 10), ra); + + /* res = r XOR p1 */ + u128_xor((u128 *) res, (u128 *) r, (u128 *) p1); + + /* res = e(k, res) */ + err = smp_e(tfm, k, res); + if (err) { + BT_ERR("Encrypt data error"); + return err; + } + + /* res = res XOR p2 */ + u128_xor((u128 *) res, (u128 *) res, (u128 *) p2); + + /* res = e(k, res) */ + err = smp_e(tfm, k, res); + if (err) + BT_ERR("Encrypt data error"); + + return err; +} + +static int smp_s1(struct crypto_blkcipher *tfm, u8 k[16], + u8 r1[16], u8 r2[16], u8 _r[16]) +{ + int err; + + /* Just least significant octets from r1 and r2 are considered */ + memcpy(_r, r1 + 8, 8); + memcpy(_r + 8, r2 + 8, 8); + + err = smp_e(tfm, k, _r); + if (err) + BT_ERR("Encrypt data error"); + + return err; +} + +static int smp_rand(u8 *buf) +{ + get_random_bytes(buf, 16); + + return 0; +} + +static struct sk_buff *smp_build_cmd(struct l2cap_conn *conn, u8 code, + u16 dlen, void *data) +{ + struct sk_buff *skb; + struct l2cap_hdr *lh; + int len; + + len = L2CAP_HDR_SIZE + sizeof(code) + dlen; + + if (len > conn->mtu) + return NULL; + + skb = bt_skb_alloc(len, GFP_ATOMIC); + if (!skb) + return NULL; + + lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); + lh->len = cpu_to_le16(sizeof(code) + dlen); + lh->cid = cpu_to_le16(L2CAP_CID_SMP); + + memcpy(skb_put(skb, sizeof(code)), &code, sizeof(code)); + + memcpy(skb_put(skb, dlen), data, dlen); + + return skb; +} + +static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data) +{ + struct sk_buff *skb = smp_build_cmd(conn, code, len, data); + + BT_DBG("code 0x%2.2x", code); + + if (!skb) + return; + + hci_send_acl(conn->hcon, skb, 0); +} + +static __u8 seclevel_to_authreq(__u8 level) +{ + switch (level) { + case BT_SECURITY_HIGH: + /* Right now we don't support bonding */ + return SMP_AUTH_MITM; + + default: + return SMP_AUTH_NONE; + } +} + +static void build_pairing_cmd(struct l2cap_conn *conn, + struct smp_cmd_pairing *req, + struct smp_cmd_pairing *rsp, + __u8 authreq) +{ + u8 dist_keys; + + dist_keys = 0; + if (test_bit(HCI_PAIRABLE, 
&conn->hcon->hdev->flags)) { + dist_keys = SMP_DIST_ENC_KEY | SMP_DIST_ID_KEY | SMP_DIST_SIGN; + authreq |= SMP_AUTH_BONDING; + } + + if (rsp == NULL) { + req->io_capability = conn->hcon->io_capability; + req->oob_flag = SMP_OOB_NOT_PRESENT; + req->max_key_size = SMP_MAX_ENC_KEY_SIZE; + req->init_key_dist = dist_keys; + req->resp_key_dist = dist_keys; + req->auth_req = authreq; + return; + } + + rsp->io_capability = conn->hcon->io_capability; + rsp->oob_flag = SMP_OOB_NOT_PRESENT; + rsp->max_key_size = SMP_MAX_ENC_KEY_SIZE; + rsp->init_key_dist = req->init_key_dist & dist_keys; + rsp->resp_key_dist = req->resp_key_dist & dist_keys; + rsp->auth_req = authreq; +} + +static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size) +{ + if ((max_key_size > SMP_MAX_ENC_KEY_SIZE) || + (max_key_size < SMP_MIN_ENC_KEY_SIZE)) + return SMP_ENC_KEY_SIZE; + + conn->smp_key_size = max_key_size; + + return 0; +} + +static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) +{ + struct smp_cmd_pairing rsp, *req = (void *) skb->data; + u8 key_size; + + BT_DBG("conn %p", conn); + + conn->preq[0] = SMP_CMD_PAIRING_REQ; + memcpy(&conn->preq[1], req, sizeof(*req)); + skb_pull(skb, sizeof(*req)); + + if (req->oob_flag) + return SMP_OOB_NOT_AVAIL; + + /* We didn't start the pairing, so no requirements */ + build_pairing_cmd(conn, req, &rsp, SMP_AUTH_NONE); + + key_size = min(req->max_key_size, rsp.max_key_size); + if (check_enc_key_size(conn, key_size)) + return SMP_ENC_KEY_SIZE; + + /* Just works */ + memset(conn->tk, 0, sizeof(conn->tk)); + + conn->prsp[0] = SMP_CMD_PAIRING_RSP; + memcpy(&conn->prsp[1], &rsp, sizeof(rsp)); + + smp_send_cmd(conn, SMP_CMD_PAIRING_RSP, sizeof(rsp), &rsp); + + mod_timer(&conn->security_timer, jiffies + + msecs_to_jiffies(SMP_TIMEOUT)); + + return 0; +} + +static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) +{ + struct smp_cmd_pairing *req, *rsp = (void *) skb->data; + struct smp_cmd_pairing_confirm cp; + struct crypto_blkcipher *tfm = conn->hcon->hdev->tfm; + int ret; + u8 res[16], key_size; + + BT_DBG("conn %p", conn); + + skb_pull(skb, sizeof(*rsp)); + + req = (void *) &conn->preq[1]; + + key_size = min(req->max_key_size, rsp->max_key_size); + if (check_enc_key_size(conn, key_size)) + return SMP_ENC_KEY_SIZE; + + if (rsp->oob_flag) + return SMP_OOB_NOT_AVAIL; + + /* Just works */ + memset(conn->tk, 0, sizeof(conn->tk)); + + conn->prsp[0] = SMP_CMD_PAIRING_RSP; + memcpy(&conn->prsp[1], rsp, sizeof(*rsp)); + + ret = smp_rand(conn->prnd); + if (ret) + return SMP_UNSPECIFIED; + + ret = smp_c1(tfm, conn->tk, conn->prnd, conn->preq, conn->prsp, 0, + conn->src, conn->hcon->dst_type, conn->dst, res); + if (ret) + return SMP_UNSPECIFIED; + + swap128(res, cp.confirm_val); + + smp_send_cmd(conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cp), &cp); + + return 0; +} + +static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) +{ + struct crypto_blkcipher *tfm = conn->hcon->hdev->tfm; + + BT_DBG("conn %p %s", conn, conn->hcon->out ? 
"master" : "slave"); + + memcpy(conn->pcnf, skb->data, sizeof(conn->pcnf)); + skb_pull(skb, sizeof(conn->pcnf)); + + if (conn->hcon->out) { + u8 random[16]; + + swap128(conn->prnd, random); + smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(random), + random); + } else { + struct smp_cmd_pairing_confirm cp; + int ret; + u8 res[16]; + + ret = smp_rand(conn->prnd); + if (ret) + return SMP_UNSPECIFIED; + + ret = smp_c1(tfm, conn->tk, conn->prnd, conn->preq, conn->prsp, + conn->hcon->dst_type, conn->dst, + 0, conn->src, res); + if (ret) + return SMP_CONFIRM_FAILED; + + swap128(res, cp.confirm_val); + + smp_send_cmd(conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cp), &cp); + } + + mod_timer(&conn->security_timer, jiffies + + msecs_to_jiffies(SMP_TIMEOUT)); + + return 0; +} + +static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) +{ + struct hci_conn *hcon = conn->hcon; + struct crypto_blkcipher *tfm = hcon->hdev->tfm; + int ret; + u8 key[16], res[16], random[16], confirm[16]; + + swap128(skb->data, random); + skb_pull(skb, sizeof(random)); + + if (conn->hcon->out) + ret = smp_c1(tfm, conn->tk, random, conn->preq, conn->prsp, 0, + conn->src, conn->hcon->dst_type, conn->dst, + res); + else + ret = smp_c1(tfm, conn->tk, random, conn->preq, conn->prsp, + conn->hcon->dst_type, conn->dst, 0, conn->src, + res); + if (ret) + return SMP_UNSPECIFIED; + + BT_DBG("conn %p %s", conn, conn->hcon->out ? "master" : "slave"); + + swap128(res, confirm); + + if (memcmp(conn->pcnf, confirm, sizeof(conn->pcnf)) != 0) { + BT_ERR("Pairing failed (confirmation values mismatch)"); + return SMP_CONFIRM_FAILED; + } + + if (conn->hcon->out) { + u8 stk[16], rand[8]; + __le16 ediv; + + memset(rand, 0, sizeof(rand)); + ediv = 0; + + smp_s1(tfm, conn->tk, random, conn->prnd, key); + swap128(key, stk); + + memset(stk + conn->smp_key_size, 0, + SMP_MAX_ENC_KEY_SIZE - conn->smp_key_size); + + hci_le_start_enc(hcon, ediv, rand, stk); + hcon->enc_key_size = conn->smp_key_size; + } else { + u8 stk[16], r[16], rand[8]; + __le16 ediv; + + memset(rand, 0, sizeof(rand)); + ediv = 0; + + swap128(conn->prnd, r); + smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(r), r); + + smp_s1(tfm, conn->tk, conn->prnd, random, key); + swap128(key, stk); + + memset(stk + conn->smp_key_size, 0, + SMP_MAX_ENC_KEY_SIZE - conn->smp_key_size); + + hci_add_ltk(conn->hcon->hdev, 0, conn->dst, conn->smp_key_size, + ediv, rand, stk); + } + + return 0; +} + +static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) +{ + struct smp_cmd_security_req *rp = (void *) skb->data; + struct smp_cmd_pairing cp; + struct hci_conn *hcon = conn->hcon; + + BT_DBG("conn %p", conn); + + if (test_bit(HCI_CONN_ENCRYPT_PEND, &hcon->pend)) + return 0; + + skb_pull(skb, sizeof(*rp)); + + memset(&cp, 0, sizeof(cp)); + build_pairing_cmd(conn, &cp, NULL, rp->auth_req); + + conn->preq[0] = SMP_CMD_PAIRING_REQ; + memcpy(&conn->preq[1], &cp, sizeof(cp)); + + smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp); + + mod_timer(&conn->security_timer, jiffies + + msecs_to_jiffies(SMP_TIMEOUT)); + + set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->pend); + + return 0; +} + +int smp_conn_security(struct l2cap_conn *conn, __u8 sec_level) +{ + struct hci_conn *hcon = conn->hcon; + __u8 authreq; + + BT_DBG("conn %p hcon %p level 0x%2.2x", conn, hcon, sec_level); + + if (!lmp_host_le_capable(hcon->hdev)) + return 1; + + if (IS_ERR(hcon->hdev->tfm)) + return 1; + + if (test_bit(HCI_CONN_ENCRYPT_PEND, &hcon->pend)) + return 0; + + if (sec_level == BT_SECURITY_LOW) + 
return 1; + + if (hcon->sec_level >= sec_level) + return 1; + + authreq = seclevel_to_authreq(sec_level); + + if (hcon->link_mode & HCI_LM_MASTER) { + struct smp_cmd_pairing cp; + struct link_key *key; + + key = hci_find_link_key_type(hcon->hdev, conn->dst, + HCI_LK_SMP_LTK); + if (key) { + struct key_master_id *master = (void *) key->data; + + hci_le_start_enc(hcon, master->ediv, master->rand, + key->val); + hcon->enc_key_size = key->pin_len; + + goto done; + } + + build_pairing_cmd(conn, &cp, NULL, authreq); + conn->preq[0] = SMP_CMD_PAIRING_REQ; + memcpy(&conn->preq[1], &cp, sizeof(cp)); + + mod_timer(&conn->security_timer, jiffies + + msecs_to_jiffies(SMP_TIMEOUT)); + + smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp); + } else { + struct smp_cmd_security_req cp; + cp.auth_req = authreq; + smp_send_cmd(conn, SMP_CMD_SECURITY_REQ, sizeof(cp), &cp); + } + +done: + hcon->pending_sec_level = sec_level; + set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->pend); + + return 0; +} + +static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) +{ + struct smp_cmd_encrypt_info *rp = (void *) skb->data; + + skb_pull(skb, sizeof(*rp)); + + memcpy(conn->tk, rp->ltk, sizeof(conn->tk)); + + return 0; +} + +static int smp_cmd_master_ident(struct l2cap_conn *conn, struct sk_buff *skb) +{ + struct smp_cmd_master_ident *rp = (void *) skb->data; + + skb_pull(skb, sizeof(*rp)); + + hci_add_ltk(conn->hcon->hdev, 1, conn->src, conn->smp_key_size, + rp->ediv, rp->rand, conn->tk); + + smp_distribute_keys(conn, 1); + + return 0; +} + +int smp_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb) +{ + __u8 code = skb->data[0]; + __u8 reason; + int err = 0; + + if (!lmp_host_le_capable(conn->hcon->hdev)) { + err = -ENOTSUPP; + reason = SMP_PAIRING_NOTSUPP; + goto done; + } + + if (IS_ERR(conn->hcon->hdev->tfm)) { + err = PTR_ERR(conn->hcon->hdev->tfm); + reason = SMP_PAIRING_NOTSUPP; + goto done; + } + + skb_pull(skb, sizeof(code)); + + switch (code) { + case SMP_CMD_PAIRING_REQ: + reason = smp_cmd_pairing_req(conn, skb); + break; + + case SMP_CMD_PAIRING_FAIL: + reason = 0; + err = -EPERM; + break; + + case SMP_CMD_PAIRING_RSP: + reason = smp_cmd_pairing_rsp(conn, skb); + break; + + case SMP_CMD_SECURITY_REQ: + reason = smp_cmd_security_req(conn, skb); + break; + + case SMP_CMD_PAIRING_CONFIRM: + reason = smp_cmd_pairing_confirm(conn, skb); + break; + + case SMP_CMD_PAIRING_RANDOM: + reason = smp_cmd_pairing_random(conn, skb); + break; + + case SMP_CMD_ENCRYPT_INFO: + reason = smp_cmd_encrypt_info(conn, skb); + break; + + case SMP_CMD_MASTER_IDENT: + reason = smp_cmd_master_ident(conn, skb); + break; + + case SMP_CMD_IDENT_INFO: + case SMP_CMD_IDENT_ADDR_INFO: + case SMP_CMD_SIGN_INFO: + /* Just ignored */ + reason = 0; + break; + + default: + BT_DBG("Unknown command code 0x%2.2x", code); + + reason = SMP_CMD_NOTSUPP; + err = -EOPNOTSUPP; + goto done; + } + +done: + if (reason) + smp_send_cmd(conn, SMP_CMD_PAIRING_FAIL, sizeof(reason), + &reason); + + kfree_skb(skb); + return err; +} + +int smp_distribute_keys(struct l2cap_conn *conn, __u8 force) +{ + struct smp_cmd_pairing *req, *rsp; + __u8 *keydist; + + BT_DBG("conn %p force %d", conn, force); + + if (IS_ERR(conn->hcon->hdev->tfm)) + return PTR_ERR(conn->hcon->hdev->tfm); + + rsp = (void *) &conn->prsp[1]; + + /* The responder sends its keys first */ + if (!force && conn->hcon->out && (rsp->resp_key_dist & 0x07)) + return 0; + + req = (void *) &conn->preq[1]; + + if (conn->hcon->out) { + keydist = &rsp->init_key_dist; + *keydist &= 
req->init_key_dist; + } else { + keydist = &rsp->resp_key_dist; + *keydist &= req->resp_key_dist; + } + + + BT_DBG("keydist 0x%x", *keydist); + + if (*keydist & SMP_DIST_ENC_KEY) { + struct smp_cmd_encrypt_info enc; + struct smp_cmd_master_ident ident; + __le16 ediv; + + get_random_bytes(enc.ltk, sizeof(enc.ltk)); + get_random_bytes(&ediv, sizeof(ediv)); + get_random_bytes(ident.rand, sizeof(ident.rand)); + + smp_send_cmd(conn, SMP_CMD_ENCRYPT_INFO, sizeof(enc), &enc); + + hci_add_ltk(conn->hcon->hdev, 1, conn->dst, conn->smp_key_size, + ediv, ident.rand, enc.ltk); + + ident.ediv = cpu_to_le16(ediv); + + smp_send_cmd(conn, SMP_CMD_MASTER_IDENT, sizeof(ident), &ident); + + *keydist &= ~SMP_DIST_ENC_KEY; + } + + if (*keydist & SMP_DIST_ID_KEY) { + struct smp_cmd_ident_addr_info addrinfo; + struct smp_cmd_ident_info idinfo; + + /* Send a dummy key */ + get_random_bytes(idinfo.irk, sizeof(idinfo.irk)); + + smp_send_cmd(conn, SMP_CMD_IDENT_INFO, sizeof(idinfo), &idinfo); + + /* Just public address */ + memset(&addrinfo, 0, sizeof(addrinfo)); + bacpy(&addrinfo.bdaddr, conn->src); + + smp_send_cmd(conn, SMP_CMD_IDENT_ADDR_INFO, sizeof(addrinfo), + &addrinfo); + + *keydist &= ~SMP_DIST_ID_KEY; + } + + if (*keydist & SMP_DIST_SIGN) { + struct smp_cmd_sign_info sign; + + /* Send a dummy key */ + get_random_bytes(sign.csrk, sizeof(sign.csrk)); + + smp_send_cmd(conn, SMP_CMD_SIGN_INFO, sizeof(sign), &sign); + + *keydist &= ~SMP_DIST_SIGN; + } + + return 0; +} diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig new file mode 100644 index 00000000..6dee7bf6 --- /dev/null +++ b/net/bridge/Kconfig @@ -0,0 +1,48 @@ +# +# 802.1d Ethernet Bridging +# + +config BRIDGE + tristate "802.1d Ethernet Bridging" + select LLC + select STP + depends on IPV6 || IPV6=n + ---help--- + If you say Y here, then your Linux box will be able to act as an + Ethernet bridge, which means that the different Ethernet segments it + is connected to will appear as one Ethernet to the participants. + Several such bridges can work together to create even larger + networks of Ethernets using the IEEE 802.1 spanning tree algorithm. + As this is a standard, Linux bridges will cooperate properly with + other third party bridge products. + + In order to use the Ethernet bridge, you'll need the bridge + configuration tools; see + for location. Please read the Bridge mini-HOWTO for more + information. + + If you enable iptables support along with the bridge support then you + turn your bridge into a bridging IP firewall. + iptables will then see the IP packets being bridged, so you need to + take this into account when setting up your firewall rules. + Enabling arptables support when bridging will let arptables see + bridged ARP traffic in the arptables FORWARD chain. + + To compile this code as a module, choose M here: the module + will be called bridge. + + If unsure, say N. + +config BRIDGE_IGMP_SNOOPING + bool "IGMP/MLD snooping" + depends on BRIDGE + depends on INET + default y + ---help--- + If you say Y here, then the Ethernet bridge will be able selectively + forward multicast traffic based on IGMP/MLD traffic received from + each port. + + Say N to exclude this support and reduce the binary size. + + If unsure, say Y. diff --git a/net/bridge/Makefile b/net/bridge/Makefile new file mode 100644 index 00000000..d0359ea8 --- /dev/null +++ b/net/bridge/Makefile @@ -0,0 +1,17 @@ +# +# Makefile for the IEEE 802.1d ethernet bridging layer. 
+# + +obj-$(CONFIG_BRIDGE) += bridge.o + +bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ + br_ioctl.o br_notify.o br_stp.o br_stp_bpdu.o \ + br_stp_if.o br_stp_timer.o br_netlink.o + +bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o + +bridge-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o + +bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o + +obj-$(CONFIG_BRIDGE_NF_EBTABLES) += netfilter/ diff --git a/net/bridge/br.c b/net/bridge/br.c new file mode 100644 index 00000000..f20c4fd9 --- /dev/null +++ b/net/bridge/br.c @@ -0,0 +1,107 @@ +/* + * Generic parts + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "br_private.h" + +static const struct stp_proto br_stp_proto = { + .rcv = br_stp_rcv, +}; + +static struct pernet_operations br_net_ops = { + .exit = br_net_exit, +}; + +static int __init br_init(void) +{ + int err; + + err = stp_proto_register(&br_stp_proto); + if (err < 0) { + pr_err("bridge: can't register sap for STP\n"); + return err; + } + + err = br_fdb_init(); + if (err) + goto err_out; + + err = register_pernet_subsys(&br_net_ops); + if (err) + goto err_out1; + + err = br_netfilter_init(); + if (err) + goto err_out2; + + err = register_netdevice_notifier(&br_device_notifier); + if (err) + goto err_out3; + + err = br_netlink_init(); + if (err) + goto err_out4; + + brioctl_set(br_ioctl_deviceless_stub); + +#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE) + br_fdb_test_addr_hook = br_fdb_test_addr; +#endif + + return 0; +err_out4: + unregister_netdevice_notifier(&br_device_notifier); +err_out3: + br_netfilter_fini(); +err_out2: + unregister_pernet_subsys(&br_net_ops); +err_out1: + br_fdb_fini(); +err_out: + stp_proto_unregister(&br_stp_proto); + return err; +} + +static void __exit br_deinit(void) +{ + stp_proto_unregister(&br_stp_proto); + + br_netlink_fini(); + unregister_netdevice_notifier(&br_device_notifier); + brioctl_set(NULL); + + unregister_pernet_subsys(&br_net_ops); + + rcu_barrier(); /* Wait for completion of call_rcu()'s */ + + br_netfilter_fini(); +#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE) + br_fdb_test_addr_hook = NULL; +#endif + + br_fdb_fini(); +} + +module_init(br_init) +module_exit(br_deinit) +MODULE_LICENSE("GPL"); +MODULE_VERSION(BR_VERSION); +MODULE_ALIAS_RTNL_LINK("bridge"); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c new file mode 100644 index 00000000..dac6a214 --- /dev/null +++ b/net/bridge/br_device.c @@ -0,0 +1,371 @@ +/* + * Device handling code + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "br_private.h" + +/* net device transmit always called with BH disabled */ +netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct net_bridge *br = netdev_priv(dev); + const unsigned char *dest = skb->data; + struct net_bridge_fdb_entry *dst; + struct net_bridge_mdb_entry *mdst; + struct br_cpu_netstats *brstats = this_cpu_ptr(br->stats); + +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) { + br_nf_pre_routing_finish_bridge_slow(skb); + return NETDEV_TX_OK; + } +#endif + + BR_INPUT_SKB_CB(skb)->brdev = dev; + + skb_reset_mac_header(skb); + skb_pull(skb, ETH_HLEN); + + u64_stats_update_begin(&brstats->syncp); + brstats->tx_packets++; + /* Exclude ETH_HLEN from byte stats for consistency with Rx chain */ + brstats->tx_bytes += skb->len; + u64_stats_update_end(&brstats->syncp); + + rcu_read_lock(); + if (is_broadcast_ether_addr(dest)) + br_flood_deliver(br, skb); + else if (is_multicast_ether_addr(dest)) { + if (unlikely(netpoll_tx_running(dev))) { + br_flood_deliver(br, skb); + goto out; + } + if (br_multicast_rcv(br, NULL, skb)) { + kfree_skb(skb); + goto out; + } + + mdst = br_mdb_get(br, skb); + if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) + br_multicast_deliver(mdst, skb); + else + br_flood_deliver(br, skb); + } else if ((dst = __br_fdb_get(br, dest)) != NULL) + br_deliver(dst->dst, skb); + else + br_flood_deliver(br, skb); + +out: + rcu_read_unlock(); + return NETDEV_TX_OK; +} + +static int br_dev_init(struct net_device *dev) +{ + struct net_bridge *br = netdev_priv(dev); + + br->stats = alloc_percpu(struct br_cpu_netstats); + if (!br->stats) + return -ENOMEM; + + return 0; +} + +static int br_dev_open(struct net_device *dev) +{ + struct net_bridge *br = netdev_priv(dev); + + netdev_update_features(dev); + netif_start_queue(dev); + br_stp_enable_bridge(br); + br_multicast_open(br); + + return 0; +} + +static void br_dev_set_multicast_list(struct net_device *dev) +{ +} + +static int br_dev_stop(struct net_device *dev) +{ + struct net_bridge *br = netdev_priv(dev); + + br_stp_disable_bridge(br); + br_multicast_stop(br); + + netif_stop_queue(dev); + + return 0; +} + +static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *stats) +{ + struct net_bridge *br = netdev_priv(dev); + struct br_cpu_netstats tmp, sum = { 0 }; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + unsigned int start; + const struct br_cpu_netstats *bstats + = per_cpu_ptr(br->stats, cpu); + do { + start = u64_stats_fetch_begin(&bstats->syncp); + memcpy(&tmp, bstats, sizeof(tmp)); + } while (u64_stats_fetch_retry(&bstats->syncp, start)); + sum.tx_bytes += tmp.tx_bytes; + sum.tx_packets += tmp.tx_packets; + sum.rx_bytes += tmp.rx_bytes; + sum.rx_packets += tmp.rx_packets; + } + + stats->tx_bytes = sum.tx_bytes; + stats->tx_packets = sum.tx_packets; + stats->rx_bytes = sum.rx_bytes; + stats->rx_packets = sum.rx_packets; + + return stats; +} + +static int br_change_mtu(struct net_device *dev, int new_mtu) +{ + struct net_bridge *br = netdev_priv(dev); + if (new_mtu < 68 || new_mtu > br_min_mtu(br)) + return -EINVAL; + + dev->mtu = new_mtu; + +#ifdef CONFIG_BRIDGE_NETFILTER + /* remember the MTU in the rtable for PMTU */ + dst_metric_set(&br->fake_rtable.dst, RTAX_MTU, new_mtu); +#endif + + return 0; +} + +/* Allow setting mac address to any valid ethernet address. 
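+ * Doing so also sets BR_SET_MAC_ADDR, marking the address as
+ * administrator-assigned so it is not recalculated from the ports.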
*/ +static int br_set_mac_address(struct net_device *dev, void *p) +{ + struct net_bridge *br = netdev_priv(dev); + struct sockaddr *addr = p; + + if (!is_valid_ether_addr(addr->sa_data)) + return -EINVAL; + + spin_lock_bh(&br->lock); + memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN); + br_stp_change_bridge_id(br, addr->sa_data); + br->flags |= BR_SET_MAC_ADDR; + spin_unlock_bh(&br->lock); + + return 0; +} + +static void br_getinfo(struct net_device *dev, struct ethtool_drvinfo *info) +{ + strcpy(info->driver, "bridge"); + strcpy(info->version, BR_VERSION); + strcpy(info->fw_version, "N/A"); + strcpy(info->bus_info, "N/A"); +} + +static u32 br_fix_features(struct net_device *dev, u32 features) +{ + struct net_bridge *br = netdev_priv(dev); + + return br_features_recompute(br, features); +} + +#ifdef CONFIG_NET_POLL_CONTROLLER +static void br_poll_controller(struct net_device *br_dev) +{ +} + +static void br_netpoll_cleanup(struct net_device *dev) +{ + struct net_bridge *br = netdev_priv(dev); + struct net_bridge_port *p, *n; + + list_for_each_entry_safe(p, n, &br->port_list, list) { + br_netpoll_disable(p); + } +} + +static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) +{ + struct net_bridge *br = netdev_priv(dev); + struct net_bridge_port *p, *n; + int err = 0; + + list_for_each_entry_safe(p, n, &br->port_list, list) { + if (!p->dev) + continue; + + err = br_netpoll_enable(p); + if (err) + goto fail; + } + +out: + return err; + +fail: + br_netpoll_cleanup(dev); + goto out; +} + +int br_netpoll_enable(struct net_bridge_port *p) +{ + struct netpoll *np; + int err = 0; + + np = kzalloc(sizeof(*p->np), GFP_KERNEL); + err = -ENOMEM; + if (!np) + goto out; + + np->dev = p->dev; + strlcpy(np->dev_name, p->dev->name, IFNAMSIZ); + + err = __netpoll_setup(np); + if (err) { + kfree(np); + goto out; + } + + p->np = np; + +out: + return err; +} + +void br_netpoll_disable(struct net_bridge_port *p) +{ + struct netpoll *np = p->np; + + if (!np) + return; + + p->np = NULL; + + /* Wait for transmitting packets to finish before freeing. 
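+ * The bridge transmit paths run with BH disabled (see br_dev_xmit),
+ * so the synchronize_rcu_bh() below lets any in-flight senders drain.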
*/ + synchronize_rcu_bh(); + + __netpoll_cleanup(np); + kfree(np); +} + +#endif + +static int br_add_slave(struct net_device *dev, struct net_device *slave_dev) + +{ + struct net_bridge *br = netdev_priv(dev); + + return br_add_if(br, slave_dev); +} + +static int br_del_slave(struct net_device *dev, struct net_device *slave_dev) +{ + struct net_bridge *br = netdev_priv(dev); + + return br_del_if(br, slave_dev); +} + +static const struct ethtool_ops br_ethtool_ops = { + .get_drvinfo = br_getinfo, + .get_link = ethtool_op_get_link, +}; + +static const struct net_device_ops br_netdev_ops = { + .ndo_open = br_dev_open, + .ndo_stop = br_dev_stop, + .ndo_init = br_dev_init, + .ndo_start_xmit = br_dev_xmit, + .ndo_get_stats64 = br_get_stats64, + .ndo_set_mac_address = br_set_mac_address, + .ndo_set_multicast_list = br_dev_set_multicast_list, + .ndo_change_mtu = br_change_mtu, + .ndo_do_ioctl = br_dev_ioctl, +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_netpoll_setup = br_netpoll_setup, + .ndo_netpoll_cleanup = br_netpoll_cleanup, + .ndo_poll_controller = br_poll_controller, +#endif + .ndo_add_slave = br_add_slave, + .ndo_del_slave = br_del_slave, + .ndo_fix_features = br_fix_features, +}; + +static void br_dev_free(struct net_device *dev) +{ + struct net_bridge *br = netdev_priv(dev); + + free_percpu(br->stats); + free_netdev(dev); +} + +static struct device_type br_type = { + .name = "bridge", +}; + +void br_dev_setup(struct net_device *dev) +{ + struct net_bridge *br = netdev_priv(dev); + + random_ether_addr(dev->dev_addr); + ether_setup(dev); + + dev->netdev_ops = &br_netdev_ops; + dev->destructor = br_dev_free; + SET_ETHTOOL_OPS(dev, &br_ethtool_ops); + SET_NETDEV_DEVTYPE(dev, &br_type); + dev->tx_queue_len = 0; + dev->priv_flags = IFF_EBRIDGE; + + dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | + NETIF_F_GSO_MASK | NETIF_F_NO_CSUM | NETIF_F_LLTX | + NETIF_F_NETNS_LOCAL | NETIF_F_HW_VLAN_TX; + dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | + NETIF_F_GSO_MASK | NETIF_F_NO_CSUM | + NETIF_F_HW_VLAN_TX; + + br->dev = dev; + spin_lock_init(&br->lock); + INIT_LIST_HEAD(&br->port_list); + spin_lock_init(&br->hash_lock); + + br->bridge_id.prio[0] = 0x80; + br->bridge_id.prio[1] = 0x00; + + memcpy(br->group_addr, br_group_address, ETH_ALEN); + + br->stp_enabled = BR_NO_STP; + br->designated_root = br->bridge_id; + br->bridge_max_age = br->max_age = 20 * HZ; + br->bridge_hello_time = br->hello_time = 2 * HZ; + br->bridge_forward_delay = br->forward_delay = 15 * HZ; + br->ageing_time = 300 * HZ; + + br_netfilter_rtable_init(br); + br_stp_timer_init(br); + br_multicast_init(br); +} diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c new file mode 100644 index 00000000..e0dfbc15 --- /dev/null +++ b/net/bridge/br_fdb.c @@ -0,0 +1,696 @@ +/* + * Forwarding database + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "br_private.h" + +static struct kmem_cache *br_fdb_cache __read_mostly; +static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, + const unsigned char *addr); +static void fdb_notify(const struct net_bridge_fdb_entry *, int); + +static u32 fdb_salt __read_mostly; + +int __init br_fdb_init(void) +{ + br_fdb_cache = kmem_cache_create("bridge_fdb_cache", + sizeof(struct net_bridge_fdb_entry), + 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!br_fdb_cache) + return -ENOMEM; + + get_random_bytes(&fdb_salt, sizeof(fdb_salt)); + return 0; +} + +void br_fdb_fini(void) +{ + kmem_cache_destroy(br_fdb_cache); +} + + +/* if topology_changing then use forward_delay (default 15 sec) + * otherwise keep longer (default 5 minutes) + */ +static inline unsigned long hold_time(const struct net_bridge *br) +{ + return br->topology_change ? br->forward_delay : br->ageing_time; +} + +static inline int has_expired(const struct net_bridge *br, + const struct net_bridge_fdb_entry *fdb) +{ + return !fdb->is_static && + time_before_eq(fdb->updated + hold_time(br), jiffies); +} + +static inline int br_mac_hash(const unsigned char *mac) +{ + /* use 1 byte of OUI cnd 3 bytes of NIC */ + u32 key = get_unaligned((u32 *)(mac + 2)); + return jhash_1word(key, fdb_salt) & (BR_HASH_SIZE - 1); +} + +static void fdb_rcu_free(struct rcu_head *head) +{ + struct net_bridge_fdb_entry *ent + = container_of(head, struct net_bridge_fdb_entry, rcu); + kmem_cache_free(br_fdb_cache, ent); +} + +static inline void fdb_delete(struct net_bridge_fdb_entry *f) +{ + fdb_notify(f, RTM_DELNEIGH); + hlist_del_rcu(&f->hlist); + call_rcu(&f->rcu, fdb_rcu_free); +} + +void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) +{ + struct net_bridge *br = p->br; + int i; + + spin_lock_bh(&br->hash_lock); + + /* Search all chains since old address/hash is unknown */ + for (i = 0; i < BR_HASH_SIZE; i++) { + struct hlist_node *h; + hlist_for_each(h, &br->hash[i]) { + struct net_bridge_fdb_entry *f; + + f = hlist_entry(h, struct net_bridge_fdb_entry, hlist); + if (f->dst == p && f->is_local) { + /* maybe another port has same hw addr? */ + struct net_bridge_port *op; + list_for_each_entry(op, &br->port_list, list) { + if (op != p && + !compare_ether_addr(op->dev->dev_addr, + f->addr.addr)) { + f->dst = op; + goto insert; + } + } + + /* delete old one */ + fdb_delete(f); + goto insert; + } + } + } + insert: + /* insert new address, may fail if invalid address or dup. 
*/ + fdb_insert(br, p, newaddr); + + spin_unlock_bh(&br->hash_lock); +} + +void br_fdb_cleanup(unsigned long _data) +{ + struct net_bridge *br = (struct net_bridge *)_data; + unsigned long delay = hold_time(br); + unsigned long next_timer = jiffies + br->ageing_time; + int i; + + spin_lock_bh(&br->hash_lock); + for (i = 0; i < BR_HASH_SIZE; i++) { + struct net_bridge_fdb_entry *f; + struct hlist_node *h, *n; + + hlist_for_each_entry_safe(f, h, n, &br->hash[i], hlist) { + unsigned long this_timer; + if (f->is_static) + continue; + this_timer = f->updated + delay; + if (time_before_eq(this_timer, jiffies)) + fdb_delete(f); + else if (time_before(this_timer, next_timer)) + next_timer = this_timer; + } + } + spin_unlock_bh(&br->hash_lock); + + mod_timer(&br->gc_timer, round_jiffies_up(next_timer)); +} + +/* Completely flush all dynamic entries in forwarding database.*/ +void br_fdb_flush(struct net_bridge *br) +{ + int i; + + spin_lock_bh(&br->hash_lock); + for (i = 0; i < BR_HASH_SIZE; i++) { + struct net_bridge_fdb_entry *f; + struct hlist_node *h, *n; + hlist_for_each_entry_safe(f, h, n, &br->hash[i], hlist) { + if (!f->is_static) + fdb_delete(f); + } + } + spin_unlock_bh(&br->hash_lock); +} + +/* Flush all entries referring to a specific port. + * if do_all is set also flush static entries + */ +void br_fdb_delete_by_port(struct net_bridge *br, + const struct net_bridge_port *p, + int do_all) +{ + int i; + + spin_lock_bh(&br->hash_lock); + for (i = 0; i < BR_HASH_SIZE; i++) { + struct hlist_node *h, *g; + + hlist_for_each_safe(h, g, &br->hash[i]) { + struct net_bridge_fdb_entry *f + = hlist_entry(h, struct net_bridge_fdb_entry, hlist); + if (f->dst != p) + continue; + + if (f->is_static && !do_all) + continue; + /* + * if multiple ports all have the same device address + * then when one port is deleted, assign + * the local entry to other port + */ + if (f->is_local) { + struct net_bridge_port *op; + list_for_each_entry(op, &br->port_list, list) { + if (op != p && + !compare_ether_addr(op->dev->dev_addr, + f->addr.addr)) { + f->dst = op; + goto skip_delete; + } + } + } + + fdb_delete(f); + skip_delete: ; + } + } + spin_unlock_bh(&br->hash_lock); +} + +/* No locking or refcounting, assumes caller has rcu_read_lock */ +struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br, + const unsigned char *addr) +{ + struct hlist_node *h; + struct net_bridge_fdb_entry *fdb; + + hlist_for_each_entry_rcu(fdb, h, &br->hash[br_mac_hash(addr)], hlist) { + if (!compare_ether_addr(fdb->addr.addr, addr)) { + if (unlikely(has_expired(br, fdb))) + break; + return fdb; + } + } + + return NULL; +} + +#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE) +/* Interface used by ATM LANE hook to test + * if an addr is on some other bridge port */ +int br_fdb_test_addr(struct net_device *dev, unsigned char *addr) +{ + struct net_bridge_fdb_entry *fdb; + struct net_bridge_port *port; + int ret; + + rcu_read_lock(); + port = br_port_get_rcu(dev); + if (!port) + ret = 0; + else { + fdb = __br_fdb_get(port->br, addr); + ret = fdb && fdb->dst->dev != dev && + fdb->dst->state == BR_STATE_FORWARDING; + } + rcu_read_unlock(); + + return ret; +} +#endif /* CONFIG_ATM_LANE */ + +/* + * Fill buffer with forwarding table records in + * the API format. 
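+ * One __fdb_entry is emitted per record; the port number is split
+ * into port_no/port_hi to stay within the old 8-bit ABI field.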
+ */ +int br_fdb_fillbuf(struct net_bridge *br, void *buf, + unsigned long maxnum, unsigned long skip) +{ + struct __fdb_entry *fe = buf; + int i, num = 0; + struct hlist_node *h; + struct net_bridge_fdb_entry *f; + + memset(buf, 0, maxnum*sizeof(struct __fdb_entry)); + + rcu_read_lock(); + for (i = 0; i < BR_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(f, h, &br->hash[i], hlist) { + if (num >= maxnum) + goto out; + + if (has_expired(br, f)) + continue; + + if (skip) { + --skip; + continue; + } + + /* convert from internal format to API */ + memcpy(fe->mac_addr, f->addr.addr, ETH_ALEN); + + /* due to ABI compat need to split into hi/lo */ + fe->port_no = f->dst->port_no; + fe->port_hi = f->dst->port_no >> 8; + + fe->is_local = f->is_local; + if (!f->is_static) + fe->ageing_timer_value = jiffies_to_clock_t(jiffies - f->updated); + ++fe; + ++num; + } + } + + out: + rcu_read_unlock(); + + return num; +} + +static struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head, + const unsigned char *addr) +{ + struct hlist_node *h; + struct net_bridge_fdb_entry *fdb; + + hlist_for_each_entry(fdb, h, head, hlist) { + if (!compare_ether_addr(fdb->addr.addr, addr)) + return fdb; + } + return NULL; +} + +static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head, + const unsigned char *addr) +{ + struct hlist_node *h; + struct net_bridge_fdb_entry *fdb; + + hlist_for_each_entry_rcu(fdb, h, head, hlist) { + if (!compare_ether_addr(fdb->addr.addr, addr)) + return fdb; + } + return NULL; +} + +static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, + struct net_bridge_port *source, + const unsigned char *addr) +{ + struct net_bridge_fdb_entry *fdb; + + fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC); + if (fdb) { + memcpy(fdb->addr.addr, addr, ETH_ALEN); + fdb->dst = source; + fdb->is_local = 0; + fdb->is_static = 0; + fdb->updated = fdb->used = jiffies; + hlist_add_head_rcu(&fdb->hlist, head); + fdb_notify(fdb, RTM_NEWNEIGH); + } + return fdb; +} + +static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, + const unsigned char *addr) +{ + struct hlist_head *head = &br->hash[br_mac_hash(addr)]; + struct net_bridge_fdb_entry *fdb; + + if (!is_valid_ether_addr(addr)) + return -EINVAL; + + fdb = fdb_find(head, addr); + if (fdb) { + /* it is okay to have multiple ports with same + * address, just use the first one. + */ + if (fdb->is_local) + return 0; + br_warn(br, "adding interface %s with same address " + "as a received packet\n", + source->dev->name); + fdb_delete(fdb); + } + + fdb = fdb_create(head, source, addr); + if (!fdb) + return -ENOMEM; + + fdb->is_local = fdb->is_static = 1; + return 0; +} + +/* Add entry for local address of interface */ +int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, + const unsigned char *addr) +{ + int ret; + + spin_lock_bh(&br->hash_lock); + ret = fdb_insert(br, source, addr); + spin_unlock_bh(&br->hash_lock); + return ret; +} + +void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, + const unsigned char *addr) +{ + struct hlist_head *head = &br->hash[br_mac_hash(addr)]; + struct net_bridge_fdb_entry *fdb; + + /* some users want to always flood. 
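+ * A zero ageing time makes hold_time() return 0 here, so nothing is
+ * learned and every frame ends up being flooded.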
*/ + if (hold_time(br) == 0) + return; + + /* ignore packets unless we are using this port */ + if (!(source->state == BR_STATE_LEARNING || + source->state == BR_STATE_FORWARDING)) + return; + + fdb = fdb_find_rcu(head, addr); + if (likely(fdb)) { + /* attempt to update an entry for a local interface */ + if (unlikely(fdb->is_local)) { + if (net_ratelimit()) + br_warn(br, "received packet on %s with " + "own address as source address\n", + source->dev->name); + } else { + /* fastpath: update of existing entry */ + fdb->dst = source; + fdb->updated = jiffies; + } + } else { + spin_lock(&br->hash_lock); + if (likely(!fdb_find(head, addr))) + fdb_create(head, source, addr); + + /* else we lose race and someone else inserts + * it first, don't bother updating + */ + spin_unlock(&br->hash_lock); + } +} + +static int fdb_to_nud(const struct net_bridge_fdb_entry *fdb) +{ + if (fdb->is_local) + return NUD_PERMANENT; + else if (fdb->is_static) + return NUD_NOARP; + else if (has_expired(fdb->dst->br, fdb)) + return NUD_STALE; + else + return NUD_REACHABLE; +} + +static int fdb_fill_info(struct sk_buff *skb, + const struct net_bridge_fdb_entry *fdb, + u32 pid, u32 seq, int type, unsigned int flags) +{ + unsigned long now = jiffies; + struct nda_cacheinfo ci; + struct nlmsghdr *nlh; + struct ndmsg *ndm; + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); + if (nlh == NULL) + return -EMSGSIZE; + + + ndm = nlmsg_data(nlh); + ndm->ndm_family = AF_BRIDGE; + ndm->ndm_pad1 = 0; + ndm->ndm_pad2 = 0; + ndm->ndm_flags = 0; + ndm->ndm_type = 0; + ndm->ndm_ifindex = fdb->dst->dev->ifindex; + ndm->ndm_state = fdb_to_nud(fdb); + + NLA_PUT(skb, NDA_LLADDR, ETH_ALEN, &fdb->addr); + + ci.ndm_used = jiffies_to_clock_t(now - fdb->used); + ci.ndm_confirmed = 0; + ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); + ci.ndm_refcnt = 0; + NLA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static inline size_t fdb_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ndmsg)) + + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ + + nla_total_size(sizeof(struct nda_cacheinfo)); +} + +static void fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) +{ + struct net *net = dev_net(fdb->dst->dev); + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(fdb_nlmsg_size(), GFP_ATOMIC); + if (skb == NULL) + goto errout; + + err = fdb_fill_info(skb, fdb, 0, 0, type, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in fdb_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); +} + +/* Dump information about entries, in response to GETNEIGH */ +int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct net_device *dev; + int idx = 0; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + struct net_bridge *br = netdev_priv(dev); + int i; + + if (!(dev->priv_flags & IFF_EBRIDGE)) + continue; + + for (i = 0; i < BR_HASH_SIZE; i++) { + struct hlist_node *h; + struct net_bridge_fdb_entry *f; + + hlist_for_each_entry_rcu(f, h, &br->hash[i], hlist) { + if (idx < cb->args[0]) + goto skip; + + if (fdb_fill_info(skb, f, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, + NLM_F_MULTI) < 0) + break; +skip: + ++idx; + } + } + } + rcu_read_unlock(); + + cb->args[0] = idx; + + return 
skb->len; +} + +/* Create new static fdb entry */ +static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr, + __u16 state) +{ + struct net_bridge *br = source->br; + struct hlist_head *head = &br->hash[br_mac_hash(addr)]; + struct net_bridge_fdb_entry *fdb; + + fdb = fdb_find(head, addr); + if (fdb) + return -EEXIST; + + fdb = fdb_create(head, source, addr); + if (!fdb) + return -ENOMEM; + + if (state & NUD_PERMANENT) + fdb->is_local = fdb->is_static = 1; + else if (state & NUD_NOARP) + fdb->is_static = 1; + return 0; +} + +/* Add new permanent fdb entry with RTM_NEWNEIGH */ +int br_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct ndmsg *ndm; + struct nlattr *tb[NDA_MAX+1]; + struct net_device *dev; + struct net_bridge_port *p; + const __u8 *addr; + int err; + + ASSERT_RTNL(); + err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); + if (err < 0) + return err; + + ndm = nlmsg_data(nlh); + if (ndm->ndm_ifindex == 0) { + pr_info("bridge: RTM_NEWNEIGH with invalid ifindex\n"); + return -EINVAL; + } + + dev = __dev_get_by_index(net, ndm->ndm_ifindex); + if (dev == NULL) { + pr_info("bridge: RTM_NEWNEIGH with unknown ifindex\n"); + return -ENODEV; + } + + if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { + pr_info("bridge: RTM_NEWNEIGH with invalid address\n"); + return -EINVAL; + } + + addr = nla_data(tb[NDA_LLADDR]); + if (!is_valid_ether_addr(addr)) { + pr_info("bridge: RTM_NEWNEIGH with invalid ether address\n"); + return -EINVAL; + } + + p = br_port_get_rtnl(dev); + if (p == NULL) { + pr_info("bridge: RTM_NEWNEIGH %s not a bridge port\n", + dev->name); + return -EINVAL; + } + + spin_lock_bh(&p->br->hash_lock); + err = fdb_add_entry(p, addr, ndm->ndm_state); + spin_unlock_bh(&p->br->hash_lock); + + return err; +} + +static int fdb_delete_by_addr(struct net_bridge_port *p, const u8 *addr) +{ + struct net_bridge *br = p->br; + struct hlist_head *head = &br->hash[br_mac_hash(addr)]; + struct net_bridge_fdb_entry *fdb; + + fdb = fdb_find(head, addr); + if (!fdb) + return -ENOENT; + + fdb_delete(fdb); + return 0; +} + +/* Remove neighbor entry with RTM_DELNEIGH */ +int br_fdb_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct ndmsg *ndm; + struct net_bridge_port *p; + struct nlattr *llattr; + const __u8 *addr; + struct net_device *dev; + int err; + + ASSERT_RTNL(); + if (nlmsg_len(nlh) < sizeof(*ndm)) + return -EINVAL; + + ndm = nlmsg_data(nlh); + if (ndm->ndm_ifindex == 0) { + pr_info("bridge: RTM_DELNEIGH with invalid ifindex\n"); + return -EINVAL; + } + + dev = __dev_get_by_index(net, ndm->ndm_ifindex); + if (dev == NULL) { + pr_info("bridge: RTM_DELNEIGH with unknown ifindex\n"); + return -ENODEV; + } + + llattr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_LLADDR); + if (llattr == NULL || nla_len(llattr) != ETH_ALEN) { + pr_info("bridge: RTM_DELNEIGH with invalid address\n"); + return -EINVAL; + } + + addr = nla_data(llattr); + + p = br_port_get_rtnl(dev); + if (p == NULL) { + pr_info("bridge: RTM_DELNEIGH %s not a bridge port\n", + dev->name); + return -EINVAL; + } + + spin_lock_bh(&p->br->hash_lock); + err = fdb_delete_by_addr(p, addr); + spin_unlock_bh(&p->br->hash_lock); + + return err; +} diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c new file mode 100644 index 00000000..ee64287f --- /dev/null +++ b/net/bridge/br_forward.c @@ -0,0 +1,271 @@ +/* + * Forwarding decision + * Linux ethernet bridge + * + * Authors: + * 
Lennert Buytenhek + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "br_private.h" + +static int deliver_clone(const struct net_bridge_port *prev, + struct sk_buff *skb, + void (*__packet_hook)(const struct net_bridge_port *p, + struct sk_buff *skb)); + +/* Don't forward packets to originating port or forwarding diasabled */ +static inline int should_deliver(const struct net_bridge_port *p, + const struct sk_buff *skb) +{ + return (((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && + p->state == BR_STATE_FORWARDING); +} + +static inline unsigned packet_length(const struct sk_buff *skb) +{ + return skb->len - (skb->protocol == htons(ETH_P_8021Q) ? VLAN_HLEN : 0); +} + +int br_dev_queue_push_xmit(struct sk_buff *skb) +{ + /* ip_fragment doesn't copy the MAC header */ + if (nf_bridge_maybe_copy_header(skb) || + (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb))) { + kfree_skb(skb); + } else { + skb_push(skb, ETH_HLEN); + dev_queue_xmit(skb); + } + + return 0; +} + +int br_forward_finish(struct sk_buff *skb) +{ + return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev, + br_dev_queue_push_xmit); + +} + +static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) +{ + skb->dev = to->dev; + + if (unlikely(netpoll_tx_running(to->dev))) { + if (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb)) + kfree_skb(skb); + else { + skb_push(skb, ETH_HLEN); + br_netpoll_send_skb(to, skb); + } + return; + } + + NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev, + br_forward_finish); +} + +static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb) +{ + struct net_device *indev; + + if (skb_warn_if_lro(skb)) { + kfree_skb(skb); + return; + } + + indev = skb->dev; + skb->dev = to->dev; + skb_forward_csum(skb); + + NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, skb, indev, skb->dev, + br_forward_finish); +} + +/* called with rcu_read_lock */ +void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) +{ + if (should_deliver(to, skb)) { + __br_deliver(to, skb); + return; + } + + kfree_skb(skb); +} + +/* called with rcu_read_lock */ +void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, struct sk_buff *skb0) +{ + if (should_deliver(to, skb)) { + if (skb0) + deliver_clone(to, skb, __br_forward); + else + __br_forward(to, skb); + return; + } + + if (!skb0) + kfree_skb(skb); +} + +static int deliver_clone(const struct net_bridge_port *prev, + struct sk_buff *skb, + void (*__packet_hook)(const struct net_bridge_port *p, + struct sk_buff *skb)) +{ + struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev; + + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) { + dev->stats.tx_dropped++; + return -ENOMEM; + } + + __packet_hook(prev, skb); + return 0; +} + +static struct net_bridge_port *maybe_deliver( + struct net_bridge_port *prev, struct net_bridge_port *p, + struct sk_buff *skb, + void (*__packet_hook)(const struct net_bridge_port *p, + struct sk_buff *skb)) +{ + int err; + + if (!should_deliver(p, skb)) + return prev; + + if (!prev) + goto out; + + err = deliver_clone(prev, skb, __packet_hook); + if (err) + return ERR_PTR(err); + +out: + return p; +} + +/* called under bridge lock */ +static void br_flood(struct 
net_bridge *br, struct sk_buff *skb, + struct sk_buff *skb0, + void (*__packet_hook)(const struct net_bridge_port *p, + struct sk_buff *skb)) +{ + struct net_bridge_port *p; + struct net_bridge_port *prev; + + prev = NULL; + + list_for_each_entry_rcu(p, &br->port_list, list) { + prev = maybe_deliver(prev, p, skb, __packet_hook); + if (IS_ERR(prev)) + goto out; + } + + if (!prev) + goto out; + + if (skb0) + deliver_clone(prev, skb, __packet_hook); + else + __packet_hook(prev, skb); + return; + +out: + if (!skb0) + kfree_skb(skb); +} + + +/* called with rcu_read_lock */ +void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb) +{ + br_flood(br, skb, NULL, __br_deliver); +} + +/* called under bridge lock */ +void br_flood_forward(struct net_bridge *br, struct sk_buff *skb, + struct sk_buff *skb2) +{ + br_flood(br, skb, skb2, __br_forward); +} + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING +/* called with rcu_read_lock */ +static void br_multicast_flood(struct net_bridge_mdb_entry *mdst, + struct sk_buff *skb, struct sk_buff *skb0, + void (*__packet_hook)( + const struct net_bridge_port *p, + struct sk_buff *skb)) +{ + struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev; + struct net_bridge *br = netdev_priv(dev); + struct net_bridge_port *prev = NULL; + struct net_bridge_port_group *p; + struct hlist_node *rp; + + rp = rcu_dereference(hlist_first_rcu(&br->router_list)); + p = mdst ? rcu_dereference(mdst->ports) : NULL; + while (p || rp) { + struct net_bridge_port *port, *lport, *rport; + + lport = p ? p->port : NULL; + rport = rp ? hlist_entry(rp, struct net_bridge_port, rlist) : + NULL; + + port = (unsigned long)lport > (unsigned long)rport ? + lport : rport; + + prev = maybe_deliver(prev, port, skb, __packet_hook); + if (IS_ERR(prev)) + goto out; + + if ((unsigned long)lport >= (unsigned long)port) + p = rcu_dereference(p->next); + if ((unsigned long)rport >= (unsigned long)port) + rp = rcu_dereference(hlist_next_rcu(rp)); + } + + if (!prev) + goto out; + + if (skb0) + deliver_clone(prev, skb, __packet_hook); + else + __packet_hook(prev, skb); + return; + +out: + if (!skb0) + kfree_skb(skb); +} + +/* called with rcu_read_lock */ +void br_multicast_deliver(struct net_bridge_mdb_entry *mdst, + struct sk_buff *skb) +{ + br_multicast_flood(mdst, skb, NULL, __br_deliver); +} + +/* called with rcu_read_lock */ +void br_multicast_forward(struct net_bridge_mdb_entry *mdst, + struct sk_buff *skb, struct sk_buff *skb2) +{ + br_multicast_flood(mdst, skb, skb2, __br_forward); +} +#endif diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c new file mode 100644 index 00000000..44908737 --- /dev/null +++ b/net/bridge/br_if.c @@ -0,0 +1,454 @@ +/* + * Userspace interface + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "br_private.h" + +/* + * Determine initial path cost based on speed. + * using recommendations from 802.1d standard + * + * Since driver might sleep need to not be holding any locks. 
+ */ +static int port_cost(struct net_device *dev) +{ + if (dev->ethtool_ops && dev->ethtool_ops->get_settings) { + struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET, }; + + if (!dev_ethtool_get_settings(dev, &ecmd)) { + switch (ethtool_cmd_speed(&ecmd)) { + case SPEED_10000: + return 2; + case SPEED_1000: + return 4; + case SPEED_100: + return 19; + case SPEED_10: + return 100; + } + } + } + + /* Old silly heuristics based on name */ + if (!strncmp(dev->name, "lec", 3)) + return 7; + + if (!strncmp(dev->name, "plip", 4)) + return 2500; + + return 100; /* assume old 10Mbps */ +} + + +/* Check for port carrier transistions. */ +void br_port_carrier_check(struct net_bridge_port *p) +{ + struct net_device *dev = p->dev; + struct net_bridge *br = p->br; + + if (netif_running(dev) && netif_carrier_ok(dev)) + p->path_cost = port_cost(dev); + + if (!netif_running(br->dev)) + return; + + spin_lock_bh(&br->lock); + if (netif_running(dev) && netif_carrier_ok(dev)) { + if (p->state == BR_STATE_DISABLED) + br_stp_enable_port(p); + } else { + if (p->state != BR_STATE_DISABLED) + br_stp_disable_port(p); + } + spin_unlock_bh(&br->lock); +} + +static void release_nbp(struct kobject *kobj) +{ + struct net_bridge_port *p + = container_of(kobj, struct net_bridge_port, kobj); + kfree(p); +} + +static struct kobj_type brport_ktype = { +#ifdef CONFIG_SYSFS + .sysfs_ops = &brport_sysfs_ops, +#endif + .release = release_nbp, +}; + +static void destroy_nbp(struct net_bridge_port *p) +{ + struct net_device *dev = p->dev; + + p->br = NULL; + p->dev = NULL; + dev_put(dev); + + kobject_put(&p->kobj); +} + +static void destroy_nbp_rcu(struct rcu_head *head) +{ + struct net_bridge_port *p = + container_of(head, struct net_bridge_port, rcu); + destroy_nbp(p); +} + +/* Delete port(interface) from bridge is done in two steps. + * via RCU. First step, marks device as down. That deletes + * all the timers and stops new packets from flowing through. + * + * Final cleanup doesn't occur until after all CPU's finished + * processing packets. 
+ * + * Protected from multiple admin operations by RTNL mutex + */ +static void del_nbp(struct net_bridge_port *p) +{ + struct net_bridge *br = p->br; + struct net_device *dev = p->dev; + + sysfs_remove_link(br->ifobj, p->dev->name); + + dev_set_promiscuity(dev, -1); + + spin_lock_bh(&br->lock); + br_stp_disable_port(p); + spin_unlock_bh(&br->lock); + + br_ifinfo_notify(RTM_DELLINK, p); + + br_fdb_delete_by_port(br, p, 1); + + list_del_rcu(&p->list); + + dev->priv_flags &= ~IFF_BRIDGE_PORT; + + netdev_rx_handler_unregister(dev); + synchronize_net(); + + netdev_set_master(dev, NULL); + + br_multicast_del_port(p); + + kobject_uevent(&p->kobj, KOBJ_REMOVE); + kobject_del(&p->kobj); + + br_netpoll_disable(p); + + call_rcu(&p->rcu, destroy_nbp_rcu); +} + +/* Delete bridge device */ +void br_dev_delete(struct net_device *dev, struct list_head *head) +{ + struct net_bridge *br = netdev_priv(dev); + struct net_bridge_port *p, *n; + + list_for_each_entry_safe(p, n, &br->port_list, list) { + del_nbp(p); + } + + del_timer_sync(&br->gc_timer); + + br_sysfs_delbr(br->dev); + unregister_netdevice_queue(br->dev, head); +} + +/* find an available port number */ +static int find_portno(struct net_bridge *br) +{ + int index; + struct net_bridge_port *p; + unsigned long *inuse; + + inuse = kcalloc(BITS_TO_LONGS(BR_MAX_PORTS), sizeof(unsigned long), + GFP_KERNEL); + if (!inuse) + return -ENOMEM; + + set_bit(0, inuse); /* zero is reserved */ + list_for_each_entry(p, &br->port_list, list) { + set_bit(p->port_no, inuse); + } + index = find_first_zero_bit(inuse, BR_MAX_PORTS); + kfree(inuse); + + return (index >= BR_MAX_PORTS) ? -EXFULL : index; +} + +/* called with RTNL but without bridge lock */ +static struct net_bridge_port *new_nbp(struct net_bridge *br, + struct net_device *dev) +{ + int index; + struct net_bridge_port *p; + + index = find_portno(br); + if (index < 0) + return ERR_PTR(index); + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (p == NULL) + return ERR_PTR(-ENOMEM); + + p->br = br; + dev_hold(dev); + p->dev = dev; + p->path_cost = port_cost(dev); + p->priority = 0x8000 >> BR_PORT_BITS; + p->port_no = index; + p->flags = 0; + br_init_port(p); + p->state = BR_STATE_DISABLED; + br_stp_port_timer_init(p); + br_multicast_add_port(p); + + return p; +} + +int br_add_bridge(struct net *net, const char *name) +{ + struct net_device *dev; + int res; + + dev = alloc_netdev(sizeof(struct net_bridge), name, + br_dev_setup); + + if (!dev) + return -ENOMEM; + + dev_net_set(dev, net); + + res = register_netdev(dev); + if (res) + free_netdev(dev); + return res; +} + +int br_del_bridge(struct net *net, const char *name) +{ + struct net_device *dev; + int ret = 0; + + rtnl_lock(); + dev = __dev_get_by_name(net, name); + if (dev == NULL) + ret = -ENXIO; /* Could not find device */ + + else if (!(dev->priv_flags & IFF_EBRIDGE)) { + /* Attempt to delete non bridge device! */ + ret = -EPERM; + } + + else if (dev->flags & IFF_UP) { + /* Not shutdown yet. 
*/ + ret = -EBUSY; + } + + else + br_dev_delete(dev, NULL); + + rtnl_unlock(); + return ret; +} + +/* MTU of the bridge pseudo-device: ETH_DATA_LEN or the minimum of the ports */ +int br_min_mtu(const struct net_bridge *br) +{ + const struct net_bridge_port *p; + int mtu = 0; + + ASSERT_RTNL(); + + if (list_empty(&br->port_list)) + mtu = ETH_DATA_LEN; + else { + list_for_each_entry(p, &br->port_list, list) { + if (!mtu || p->dev->mtu < mtu) + mtu = p->dev->mtu; + } + } + return mtu; +} + +/* + * Recomputes features using slave's features + */ +u32 br_features_recompute(struct net_bridge *br, u32 features) +{ + struct net_bridge_port *p; + u32 mask; + + if (list_empty(&br->port_list)) + return features; + + mask = features; + features &= ~NETIF_F_ONE_FOR_ALL; + + list_for_each_entry(p, &br->port_list, list) { + features = netdev_increment_features(features, + p->dev->features, mask); + } + + return features; +} + +/* called with RTNL */ +int br_add_if(struct net_bridge *br, struct net_device *dev) +{ + struct net_bridge_port *p; + int err = 0; + bool changed_addr; + + /* Don't allow bridging non-ethernet like devices */ + if ((dev->flags & IFF_LOOPBACK) || + dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN) + return -EINVAL; + + /* No bridging of bridges */ + if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) + return -ELOOP; + + /* Device is already being bridged */ + if (br_port_exists(dev)) + return -EBUSY; + + /* No bridging devices that dislike that (e.g. wireless) */ + if (dev->priv_flags & IFF_DONT_BRIDGE) + return -EOPNOTSUPP; + + p = new_nbp(br, dev); + if (IS_ERR(p)) + return PTR_ERR(p); + + call_netdevice_notifiers(NETDEV_JOIN, dev); + + err = dev_set_promiscuity(dev, 1); + if (err) + goto put_back; + + err = kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj), + SYSFS_BRIDGE_PORT_ATTR); + if (err) + goto err0; + + err = br_fdb_insert(br, p, dev->dev_addr); + if (err) + goto err1; + + err = br_sysfs_addif(p); + if (err) + goto err2; + + if (br_netpoll_info(br) && ((err = br_netpoll_enable(p)))) + goto err3; + + err = netdev_set_master(dev, br->dev); + if (err) + goto err3; + + err = netdev_rx_handler_register(dev, br_handle_frame, p); + if (err) + goto err4; + + dev->priv_flags |= IFF_BRIDGE_PORT; + + dev_disable_lro(dev); + + list_add_rcu(&p->list, &br->port_list); + + netdev_update_features(br->dev); + + spin_lock_bh(&br->lock); + changed_addr = br_stp_recalculate_bridge_id(br); + + if ((dev->flags & IFF_UP) && netif_carrier_ok(dev) && + (br->dev->flags & IFF_UP)) + br_stp_enable_port(p); + spin_unlock_bh(&br->lock); + + br_ifinfo_notify(RTM_NEWLINK, p); + + if (changed_addr) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + + dev_set_mtu(br->dev, br_min_mtu(br)); + + kobject_uevent(&p->kobj, KOBJ_ADD); + + return 0; + +err4: + netdev_set_master(dev, NULL); +err3: + sysfs_remove_link(br->ifobj, p->dev->name); +err2: + br_fdb_delete_by_port(br, p, 1); +err1: + kobject_put(&p->kobj); + p = NULL; /* kobject_put frees */ +err0: + dev_set_promiscuity(dev, -1); +put_back: + dev_put(dev); + kfree(p); + return err; +} + +/* called with RTNL */ +int br_del_if(struct net_bridge *br, struct net_device *dev) +{ + struct net_bridge_port *p; + + p = br_port_get_rtnl(dev); + if (!p || p->br != br) + return -EINVAL; + + del_nbp(p); + + spin_lock_bh(&br->lock); + br_stp_recalculate_bridge_id(br); + spin_unlock_bh(&br->lock); + + netdev_update_features(br->dev); + + return 0; +} + +void __net_exit br_net_exit(struct net *net) +{ + struct net_device *dev; + LIST_HEAD(list); 
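+ /* namespace teardown: collect all bridge devices under RTNL and
+ * unregister them in a single batch
+ */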
+ + rtnl_lock(); + for_each_netdev(net, dev) + if (dev->priv_flags & IFF_EBRIDGE) + br_dev_delete(dev, &list); + + unregister_netdevice_many(&list); + rtnl_unlock(); + +} diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c new file mode 100644 index 00000000..f06ee39c --- /dev/null +++ b/net/bridge/br_input.c @@ -0,0 +1,206 @@ +/* + * Handle incoming frames + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include "br_private.h" + +/* Bridge group multicast address 802.1d (pg 51). */ +const u8 br_group_address[ETH_ALEN] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; + +/* Hook for brouter */ +br_should_route_hook_t __rcu *br_should_route_hook __read_mostly; +EXPORT_SYMBOL(br_should_route_hook); + +static int br_pass_frame_up(struct sk_buff *skb) +{ + struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev; + struct net_bridge *br = netdev_priv(brdev); + struct br_cpu_netstats *brstats = this_cpu_ptr(br->stats); + + u64_stats_update_begin(&brstats->syncp); + brstats->rx_packets++; + brstats->rx_bytes += skb->len; + u64_stats_update_end(&brstats->syncp); + + indev = skb->dev; + skb->dev = brdev; + + return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, indev, NULL, + netif_receive_skb); +} + +/* note: already called with rcu_read_lock */ +int br_handle_frame_finish(struct sk_buff *skb) +{ + const unsigned char *dest = eth_hdr(skb)->h_dest; + struct net_bridge_port *p = br_port_get_rcu(skb->dev); + struct net_bridge *br; + struct net_bridge_fdb_entry *dst; + struct net_bridge_mdb_entry *mdst; + struct sk_buff *skb2; + + if (!p || p->state == BR_STATE_DISABLED) + goto drop; + + /* insert into forwarding database after filtering to avoid spoofing */ + br = p->br; + br_fdb_update(br, p, eth_hdr(skb)->h_source); + + if (!is_broadcast_ether_addr(dest) && is_multicast_ether_addr(dest) && + br_multicast_rcv(br, p, skb)) + goto drop; + + if (p->state == BR_STATE_LEARNING) + goto drop; + + BR_INPUT_SKB_CB(skb)->brdev = br->dev; + + /* The packet skb2 goes to the local host (NULL to skip). */ + skb2 = NULL; + + if (br->dev->flags & IFF_PROMISC) + skb2 = skb; + + dst = NULL; + + if (is_broadcast_ether_addr(dest)) + skb2 = skb; + else if (is_multicast_ether_addr(dest)) { + mdst = br_mdb_get(br, skb); + if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) { + if ((mdst && mdst->mglist) || + br_multicast_is_router(br)) + skb2 = skb; + br_multicast_forward(mdst, skb, skb2); + skb = NULL; + if (!skb2) + goto out; + } else + skb2 = skb; + + br->dev->stats.multicast++; + } else if ((dst = __br_fdb_get(br, dest)) && dst->is_local) { + skb2 = skb; + /* Do not forward the packet since it's local. */ + skb = NULL; + } + + if (skb) { + if (dst) { + dst->used = jiffies; + br_forward(dst->dst, skb, skb2); + } else + br_flood_forward(br, skb, skb2); + } + + if (skb2) + return br_pass_frame_up(skb2); + +out: + return 0; +drop: + kfree_skb(skb); + goto out; +} + +/* note: already called with rcu_read_lock */ +static int br_handle_local_finish(struct sk_buff *skb) +{ + struct net_bridge_port *p = br_port_get_rcu(skb->dev); + + br_fdb_update(p->br, p, eth_hdr(skb)->h_source); + return 0; /* process further */ +} + +/* Does address match the link local multicast address. 
+ * 01:80:c2:00:00:0X + */ +static inline int is_link_local(const unsigned char *dest) +{ + __be16 *a = (__be16 *)dest; + static const __be16 *b = (const __be16 *)br_group_address; + static const __be16 m = cpu_to_be16(0xfff0); + + return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0; +} + +/* + * Return NULL if skb is handled + * note: already called with rcu_read_lock + */ +rx_handler_result_t br_handle_frame(struct sk_buff **pskb) +{ + struct net_bridge_port *p; + struct sk_buff *skb = *pskb; + const unsigned char *dest = eth_hdr(skb)->h_dest; + br_should_route_hook_t *rhook; + + if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) + return RX_HANDLER_PASS; + + if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) + goto drop; + + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + return RX_HANDLER_CONSUMED; + + p = br_port_get_rcu(skb->dev); + + if (unlikely(is_link_local(dest))) { + /* Pause frames shouldn't be passed up by driver anyway */ + if (skb->protocol == htons(ETH_P_PAUSE)) + goto drop; + + /* If STP is turned off, then forward */ + if (p->br->stp_enabled == BR_NO_STP && dest[5] == 0) + goto forward; + + if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev, + NULL, br_handle_local_finish)) { + return RX_HANDLER_CONSUMED; /* consumed by filter */ + } else { + *pskb = skb; + return RX_HANDLER_PASS; /* continue processing */ + } + } + +forward: + switch (p->state) { + case BR_STATE_FORWARDING: + rhook = rcu_dereference(br_should_route_hook); + if (rhook) { + if ((*rhook)(skb)) { + *pskb = skb; + return RX_HANDLER_PASS; + } + dest = eth_hdr(skb)->h_dest; + } + /* fall through */ + case BR_STATE_LEARNING: + if (!compare_ether_addr(p->br->dev->dev_addr, dest)) + skb->pkt_type = PACKET_HOST; + + NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL, + br_handle_frame_finish); + break; + default: +drop: + kfree_skb(skb); + } + return RX_HANDLER_CONSUMED; +} diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c new file mode 100644 index 00000000..7222fe1d --- /dev/null +++ b/net/bridge/br_ioctl.c @@ -0,0 +1,395 @@ +/* + * Ioctl handler + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "br_private.h" + +/* called with RTNL */ +static int get_bridge_ifindices(struct net *net, int *indices, int num) +{ + struct net_device *dev; + int i = 0; + + for_each_netdev(net, dev) { + if (i >= num) + break; + if (dev->priv_flags & IFF_EBRIDGE) + indices[i++] = dev->ifindex; + } + + return i; +} + +/* called with RTNL */ +static void get_port_ifindices(struct net_bridge *br, int *ifindices, int num) +{ + struct net_bridge_port *p; + + list_for_each_entry(p, &br->port_list, list) { + if (p->port_no < num) + ifindices[p->port_no] = p->dev->ifindex; + } +} + +/* + * Format up to a page worth of forwarding table entries + * userbuf -- where to copy result + * maxnum -- maximum number of entries desired + * (limited to a page for sanity) + * offset -- number of records to skip + */ +static int get_fdb_entries(struct net_bridge *br, void __user *userbuf, + unsigned long maxnum, unsigned long offset) +{ + int num; + void *buf; + size_t size; + + /* Clamp size to PAGE_SIZE, test maxnum to avoid overflow */ + if (maxnum > PAGE_SIZE/sizeof(struct __fdb_entry)) + maxnum = PAGE_SIZE/sizeof(struct __fdb_entry); + + size = maxnum * sizeof(struct __fdb_entry); + + buf = kmalloc(size, GFP_USER); + if (!buf) + return -ENOMEM; + + num = br_fdb_fillbuf(br, buf, maxnum, offset); + if (num > 0) { + if (copy_to_user(userbuf, buf, num*sizeof(struct __fdb_entry))) + num = -EFAULT; + } + kfree(buf); + + return num; +} + +/* called with RTNL */ +static int add_del_if(struct net_bridge *br, int ifindex, int isadd) +{ + struct net_device *dev; + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + dev = __dev_get_by_index(dev_net(br->dev), ifindex); + if (dev == NULL) + return -EINVAL; + + if (isadd) + ret = br_add_if(br, dev); + else + ret = br_del_if(br, dev); + + return ret; +} + +/* + * Legacy ioctl's through SIOCDEVPRIVATE + * This interface is deprecated because it was too difficult to + * to do the translation for 32/64bit ioctl compatibility. 
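+ * Userspace passes an unsigned long args[4] through ifr_data:
+ * args[0] selects the BRCTL_* operation and args[1]-args[3] carry
+ * its operation-specific arguments.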
+ */ +static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) +{ + struct net_bridge *br = netdev_priv(dev); + unsigned long args[4]; + + if (copy_from_user(args, rq->ifr_data, sizeof(args))) + return -EFAULT; + + switch (args[0]) { + case BRCTL_ADD_IF: + case BRCTL_DEL_IF: + return add_del_if(br, args[1], args[0] == BRCTL_ADD_IF); + + case BRCTL_GET_BRIDGE_INFO: + { + struct __bridge_info b; + + memset(&b, 0, sizeof(struct __bridge_info)); + rcu_read_lock(); + memcpy(&b.designated_root, &br->designated_root, 8); + memcpy(&b.bridge_id, &br->bridge_id, 8); + b.root_path_cost = br->root_path_cost; + b.max_age = jiffies_to_clock_t(br->max_age); + b.hello_time = jiffies_to_clock_t(br->hello_time); + b.forward_delay = br->forward_delay; + b.bridge_max_age = br->bridge_max_age; + b.bridge_hello_time = br->bridge_hello_time; + b.bridge_forward_delay = jiffies_to_clock_t(br->bridge_forward_delay); + b.topology_change = br->topology_change; + b.topology_change_detected = br->topology_change_detected; + b.root_port = br->root_port; + + b.stp_enabled = (br->stp_enabled != BR_NO_STP); + b.ageing_time = jiffies_to_clock_t(br->ageing_time); + b.hello_timer_value = br_timer_value(&br->hello_timer); + b.tcn_timer_value = br_timer_value(&br->tcn_timer); + b.topology_change_timer_value = br_timer_value(&br->topology_change_timer); + b.gc_timer_value = br_timer_value(&br->gc_timer); + rcu_read_unlock(); + + if (copy_to_user((void __user *)args[1], &b, sizeof(b))) + return -EFAULT; + + return 0; + } + + case BRCTL_GET_PORT_LIST: + { + int num, *indices; + + num = args[2]; + if (num < 0) + return -EINVAL; + if (num == 0) + num = 256; + if (num > BR_MAX_PORTS) + num = BR_MAX_PORTS; + + indices = kcalloc(num, sizeof(int), GFP_KERNEL); + if (indices == NULL) + return -ENOMEM; + + get_port_ifindices(br, indices, num); + if (copy_to_user((void __user *)args[1], indices, num*sizeof(int))) + num = -EFAULT; + kfree(indices); + return num; + } + + case BRCTL_SET_BRIDGE_FORWARD_DELAY: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + return br_set_forward_delay(br, args[1]); + + case BRCTL_SET_BRIDGE_HELLO_TIME: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + return br_set_hello_time(br, args[1]); + + case BRCTL_SET_BRIDGE_MAX_AGE: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + return br_set_max_age(br, args[1]); + + case BRCTL_SET_AGEING_TIME: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + br->ageing_time = clock_t_to_jiffies(args[1]); + return 0; + + case BRCTL_GET_PORT_INFO: + { + struct __port_info p; + struct net_bridge_port *pt; + + rcu_read_lock(); + if ((pt = br_get_port(br, args[2])) == NULL) { + rcu_read_unlock(); + return -EINVAL; + } + + memset(&p, 0, sizeof(struct __port_info)); + memcpy(&p.designated_root, &pt->designated_root, 8); + memcpy(&p.designated_bridge, &pt->designated_bridge, 8); + p.port_id = pt->port_id; + p.designated_port = pt->designated_port; + p.path_cost = pt->path_cost; + p.designated_cost = pt->designated_cost; + p.state = pt->state; + p.top_change_ack = pt->topology_change_ack; + p.config_pending = pt->config_pending; + p.message_age_timer_value = br_timer_value(&pt->message_age_timer); + p.forward_delay_timer_value = br_timer_value(&pt->forward_delay_timer); + p.hold_timer_value = br_timer_value(&pt->hold_timer); + + rcu_read_unlock(); + + if (copy_to_user((void __user *)args[1], &p, sizeof(p))) + return -EFAULT; + + return 0; + } + + case BRCTL_SET_BRIDGE_STP_STATE: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + br_stp_set_enabled(br, 
args[1]); + return 0; + + case BRCTL_SET_BRIDGE_PRIORITY: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + spin_lock_bh(&br->lock); + br_stp_set_bridge_priority(br, args[1]); + spin_unlock_bh(&br->lock); + return 0; + + case BRCTL_SET_PORT_PRIORITY: + { + struct net_bridge_port *p; + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + spin_lock_bh(&br->lock); + if ((p = br_get_port(br, args[1])) == NULL) + ret = -EINVAL; + else + ret = br_stp_set_port_priority(p, args[2]); + spin_unlock_bh(&br->lock); + return ret; + } + + case BRCTL_SET_PATH_COST: + { + struct net_bridge_port *p; + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + spin_lock_bh(&br->lock); + if ((p = br_get_port(br, args[1])) == NULL) + ret = -EINVAL; + else + ret = br_stp_set_path_cost(p, args[2]); + spin_unlock_bh(&br->lock); + + return ret; + } + + case BRCTL_GET_FDB_ENTRIES: + return get_fdb_entries(br, (void __user *)args[1], + args[2], args[3]); + } + + return -EOPNOTSUPP; +} + +static int old_deviceless(struct net *net, void __user *uarg) +{ + unsigned long args[3]; + + if (copy_from_user(args, uarg, sizeof(args))) + return -EFAULT; + + switch (args[0]) { + case BRCTL_GET_VERSION: + return BRCTL_VERSION; + + case BRCTL_GET_BRIDGES: + { + int *indices; + int ret = 0; + + if (args[2] >= 2048) + return -ENOMEM; + indices = kcalloc(args[2], sizeof(int), GFP_KERNEL); + if (indices == NULL) + return -ENOMEM; + + args[2] = get_bridge_ifindices(net, indices, args[2]); + + ret = copy_to_user((void __user *)args[1], indices, args[2]*sizeof(int)) + ? -EFAULT : args[2]; + + kfree(indices); + return ret; + } + + case BRCTL_ADD_BRIDGE: + case BRCTL_DEL_BRIDGE: + { + char buf[IFNAMSIZ]; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (copy_from_user(buf, (void __user *)args[1], IFNAMSIZ)) + return -EFAULT; + + buf[IFNAMSIZ-1] = 0; + + if (args[0] == BRCTL_ADD_BRIDGE) + return br_add_bridge(net, buf); + + return br_del_bridge(net, buf); + } + } + + return -EOPNOTSUPP; +} + +int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg) +{ + switch (cmd) { + case SIOCGIFBR: + case SIOCSIFBR: + return old_deviceless(net, uarg); + + case SIOCBRADDBR: + case SIOCBRDELBR: + { + char buf[IFNAMSIZ]; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (copy_from_user(buf, uarg, IFNAMSIZ)) + return -EFAULT; + + buf[IFNAMSIZ-1] = 0; + if (cmd == SIOCBRADDBR) + return br_add_bridge(net, buf); + + return br_del_bridge(net, buf); + } + } + return -EOPNOTSUPP; +} + +int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) +{ + struct net_bridge *br = netdev_priv(dev); + + switch(cmd) { + case SIOCDEVPRIVATE: + return old_dev_ioctl(dev, rq, cmd); + + case SIOCBRADDIF: + case SIOCBRDELIF: + return add_del_if(br, rq->ifr_ifindex, cmd == SIOCBRADDIF); + + } + + br_debug(br, "Bridge does not support ioctl 0x%x\n", cmd); + return -EOPNOTSUPP; +} diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c new file mode 100644 index 00000000..e78269d7 --- /dev/null +++ b/net/bridge/br_multicast.c @@ -0,0 +1,1777 @@ +/* + * Bridge multicast support. + * + * Copyright (c) 2010 Herbert Xu + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#include +#include +#include +#include +#endif + +#include "br_private.h" + +#define mlock_dereference(X, br) \ + rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static inline int ipv6_is_transient_multicast(const struct in6_addr *addr) +{ + if (ipv6_addr_is_multicast(addr) && IPV6_ADDR_MC_FLAG_TRANSIENT(addr)) + return 1; + return 0; +} +#endif + +static inline int br_ip_equal(const struct br_ip *a, const struct br_ip *b) +{ + if (a->proto != b->proto) + return 0; + switch (a->proto) { + case htons(ETH_P_IP): + return a->u.ip4 == b->u.ip4; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case htons(ETH_P_IPV6): + return ipv6_addr_equal(&a->u.ip6, &b->u.ip6); +#endif + } + return 0; +} + +static inline int __br_ip4_hash(struct net_bridge_mdb_htable *mdb, __be32 ip) +{ + return jhash_1word(mdb->secret, (__force u32)ip) & (mdb->max - 1); +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static inline int __br_ip6_hash(struct net_bridge_mdb_htable *mdb, + const struct in6_addr *ip) +{ + return jhash2((__force u32 *)ip->s6_addr32, 4, mdb->secret) & (mdb->max - 1); +} +#endif + +static inline int br_ip_hash(struct net_bridge_mdb_htable *mdb, + struct br_ip *ip) +{ + switch (ip->proto) { + case htons(ETH_P_IP): + return __br_ip4_hash(mdb, ip->u.ip4); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case htons(ETH_P_IPV6): + return __br_ip6_hash(mdb, &ip->u.ip6); +#endif + } + return 0; +} + +static struct net_bridge_mdb_entry *__br_mdb_ip_get( + struct net_bridge_mdb_htable *mdb, struct br_ip *dst, int hash) +{ + struct net_bridge_mdb_entry *mp; + struct hlist_node *p; + + hlist_for_each_entry_rcu(mp, p, &mdb->mhash[hash], hlist[mdb->ver]) { + if (br_ip_equal(&mp->addr, dst)) + return mp; + } + + return NULL; +} + +static struct net_bridge_mdb_entry *br_mdb_ip_get( + struct net_bridge_mdb_htable *mdb, struct br_ip *dst) +{ + if (!mdb) + return NULL; + + return __br_mdb_ip_get(mdb, dst, br_ip_hash(mdb, dst)); +} + +static struct net_bridge_mdb_entry *br_mdb_ip4_get( + struct net_bridge_mdb_htable *mdb, __be32 dst) +{ + struct br_ip br_dst; + + br_dst.u.ip4 = dst; + br_dst.proto = htons(ETH_P_IP); + + return br_mdb_ip_get(mdb, &br_dst); +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static struct net_bridge_mdb_entry *br_mdb_ip6_get( + struct net_bridge_mdb_htable *mdb, const struct in6_addr *dst) +{ + struct br_ip br_dst; + + ipv6_addr_copy(&br_dst.u.ip6, dst); + br_dst.proto = htons(ETH_P_IPV6); + + return br_mdb_ip_get(mdb, &br_dst); +} +#endif + +struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, + struct sk_buff *skb) +{ + struct net_bridge_mdb_htable *mdb = rcu_dereference(br->mdb); + struct br_ip ip; + + if (br->multicast_disabled) + return NULL; + + if (BR_INPUT_SKB_CB(skb)->igmp) + return NULL; + + ip.proto = skb->protocol; + + switch (skb->protocol) { + case htons(ETH_P_IP): + ip.u.ip4 = ip_hdr(skb)->daddr; + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case htons(ETH_P_IPV6): + ipv6_addr_copy(&ip.u.ip6, &ipv6_hdr(skb)->daddr); + break; +#endif + default: + return NULL; + } + + return br_mdb_ip_get(mdb, &ip); +} + +static void br_mdb_free(struct rcu_head *head) +{ + struct net_bridge_mdb_htable *mdb = + container_of(head, 
struct net_bridge_mdb_htable, rcu); + struct net_bridge_mdb_htable *old = mdb->old; + + mdb->old = NULL; + kfree(old->mhash); + kfree(old); +} + +static int br_mdb_copy(struct net_bridge_mdb_htable *new, + struct net_bridge_mdb_htable *old, + int elasticity) +{ + struct net_bridge_mdb_entry *mp; + struct hlist_node *p; + int maxlen; + int len; + int i; + + for (i = 0; i < old->max; i++) + hlist_for_each_entry(mp, p, &old->mhash[i], hlist[old->ver]) + hlist_add_head(&mp->hlist[new->ver], + &new->mhash[br_ip_hash(new, &mp->addr)]); + + if (!elasticity) + return 0; + + maxlen = 0; + for (i = 0; i < new->max; i++) { + len = 0; + hlist_for_each_entry(mp, p, &new->mhash[i], hlist[new->ver]) + len++; + if (len > maxlen) + maxlen = len; + } + + return maxlen > elasticity ? -EINVAL : 0; +} + +static void br_multicast_free_pg(struct rcu_head *head) +{ + struct net_bridge_port_group *p = + container_of(head, struct net_bridge_port_group, rcu); + + kfree(p); +} + +static void br_multicast_free_group(struct rcu_head *head) +{ + struct net_bridge_mdb_entry *mp = + container_of(head, struct net_bridge_mdb_entry, rcu); + + kfree(mp); +} + +static void br_multicast_group_expired(unsigned long data) +{ + struct net_bridge_mdb_entry *mp = (void *)data; + struct net_bridge *br = mp->br; + struct net_bridge_mdb_htable *mdb; + + spin_lock(&br->multicast_lock); + if (!netif_running(br->dev) || timer_pending(&mp->timer)) + goto out; + + mp->mglist = false; + + if (mp->ports) + goto out; + + mdb = mlock_dereference(br->mdb, br); + + hlist_del_rcu(&mp->hlist[mdb->ver]); + mdb->size--; + + call_rcu_bh(&mp->rcu, br_multicast_free_group); + +out: + spin_unlock(&br->multicast_lock); +} + +static void br_multicast_del_pg(struct net_bridge *br, + struct net_bridge_port_group *pg) +{ + struct net_bridge_mdb_htable *mdb; + struct net_bridge_mdb_entry *mp; + struct net_bridge_port_group *p; + struct net_bridge_port_group __rcu **pp; + + mdb = mlock_dereference(br->mdb, br); + + mp = br_mdb_ip_get(mdb, &pg->addr); + if (WARN_ON(!mp)) + return; + + for (pp = &mp->ports; + (p = mlock_dereference(*pp, br)) != NULL; + pp = &p->next) { + if (p != pg) + continue; + + rcu_assign_pointer(*pp, p->next); + hlist_del_init(&p->mglist); + del_timer(&p->timer); + call_rcu_bh(&p->rcu, br_multicast_free_pg); + + if (!mp->ports && !mp->mglist && + netif_running(br->dev)) + mod_timer(&mp->timer, jiffies); + + return; + } + + WARN_ON(1); +} + +static void br_multicast_port_group_expired(unsigned long data) +{ + struct net_bridge_port_group *pg = (void *)data; + struct net_bridge *br = pg->port->br; + + spin_lock(&br->multicast_lock); + if (!netif_running(br->dev) || timer_pending(&pg->timer) || + hlist_unhashed(&pg->mglist)) + goto out; + + br_multicast_del_pg(br, pg); + +out: + spin_unlock(&br->multicast_lock); +} + +static int br_mdb_rehash(struct net_bridge_mdb_htable __rcu **mdbp, int max, + int elasticity) +{ + struct net_bridge_mdb_htable *old = rcu_dereference_protected(*mdbp, 1); + struct net_bridge_mdb_htable *mdb; + int err; + + mdb = kmalloc(sizeof(*mdb), GFP_ATOMIC); + if (!mdb) + return -ENOMEM; + + mdb->max = max; + mdb->old = old; + + mdb->mhash = kzalloc(max * sizeof(*mdb->mhash), GFP_ATOMIC); + if (!mdb->mhash) { + kfree(mdb); + return -ENOMEM; + } + + mdb->size = old ? old->size : 0; + mdb->ver = old ? 
old->ver ^ 1 : 0; + + if (!old || elasticity) + get_random_bytes(&mdb->secret, sizeof(mdb->secret)); + else + mdb->secret = old->secret; + + if (!old) + goto out; + + err = br_mdb_copy(mdb, old, elasticity); + if (err) { + kfree(mdb->mhash); + kfree(mdb); + return err; + } + + call_rcu_bh(&mdb->rcu, br_mdb_free); + +out: + rcu_assign_pointer(*mdbp, mdb); + + return 0; +} + +static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, + __be32 group) +{ + struct sk_buff *skb; + struct igmphdr *ih; + struct ethhdr *eth; + struct iphdr *iph; + + skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*iph) + + sizeof(*ih) + 4); + if (!skb) + goto out; + + skb->protocol = htons(ETH_P_IP); + + skb_reset_mac_header(skb); + eth = eth_hdr(skb); + + memcpy(eth->h_source, br->dev->dev_addr, 6); + eth->h_dest[0] = 1; + eth->h_dest[1] = 0; + eth->h_dest[2] = 0x5e; + eth->h_dest[3] = 0; + eth->h_dest[4] = 0; + eth->h_dest[5] = 1; + eth->h_proto = htons(ETH_P_IP); + skb_put(skb, sizeof(*eth)); + + skb_set_network_header(skb, skb->len); + iph = ip_hdr(skb); + + iph->version = 4; + iph->ihl = 6; + iph->tos = 0xc0; + iph->tot_len = htons(sizeof(*iph) + sizeof(*ih) + 4); + iph->id = 0; + iph->frag_off = htons(IP_DF); + iph->ttl = 1; + iph->protocol = IPPROTO_IGMP; + iph->saddr = 0; + iph->daddr = htonl(INADDR_ALLHOSTS_GROUP); + ((u8 *)&iph[1])[0] = IPOPT_RA; + ((u8 *)&iph[1])[1] = 4; + ((u8 *)&iph[1])[2] = 0; + ((u8 *)&iph[1])[3] = 0; + ip_send_check(iph); + skb_put(skb, 24); + + skb_set_transport_header(skb, skb->len); + ih = igmp_hdr(skb); + ih->type = IGMP_HOST_MEMBERSHIP_QUERY; + ih->code = (group ? br->multicast_last_member_interval : + br->multicast_query_response_interval) / + (HZ / IGMP_TIMER_SCALE); + ih->group = group; + ih->csum = 0; + ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr)); + skb_put(skb, sizeof(*ih)); + + __skb_pull(skb, sizeof(*eth)); + +out: + return skb; +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, + const struct in6_addr *group) +{ + struct sk_buff *skb; + struct ipv6hdr *ip6h; + struct mld_msg *mldq; + struct ethhdr *eth; + u8 *hopopt; + unsigned long interval; + + skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*ip6h) + + 8 + sizeof(*mldq)); + if (!skb) + goto out; + + skb->protocol = htons(ETH_P_IPV6); + + /* Ethernet header */ + skb_reset_mac_header(skb); + eth = eth_hdr(skb); + + memcpy(eth->h_source, br->dev->dev_addr, 6); + eth->h_proto = htons(ETH_P_IPV6); + skb_put(skb, sizeof(*eth)); + + /* IPv6 header + HbH option */ + skb_set_network_header(skb, skb->len); + ip6h = ipv6_hdr(skb); + + *(__force __be32 *)ip6h = htonl(0x60000000); + ip6h->payload_len = htons(8 + sizeof(*mldq)); + ip6h->nexthdr = IPPROTO_HOPOPTS; + ip6h->hop_limit = 1; + ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1)); + if (ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0, + &ip6h->saddr)) { + kfree_skb(skb); + return NULL; + } + ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest); + + hopopt = (u8 *)(ip6h + 1); + hopopt[0] = IPPROTO_ICMPV6; /* next hdr */ + hopopt[1] = 0; /* length of HbH */ + hopopt[2] = IPV6_TLV_ROUTERALERT; /* Router Alert */ + hopopt[3] = 2; /* Length of RA Option */ + hopopt[4] = 0; /* Type = 0x0000 (MLD) */ + hopopt[5] = 0; + hopopt[6] = IPV6_TLV_PAD0; /* Pad0 */ + hopopt[7] = IPV6_TLV_PAD0; /* Pad0 */ + + skb_put(skb, sizeof(*ip6h) + 8); + + /* ICMPv6 */ + skb_set_transport_header(skb, skb->len); + mldq = (struct mld_msg 
*) icmp6_hdr(skb); + + interval = ipv6_addr_any(group) ? br->multicast_last_member_interval : + br->multicast_query_response_interval; + + mldq->mld_type = ICMPV6_MGM_QUERY; + mldq->mld_code = 0; + mldq->mld_cksum = 0; + mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval)); + mldq->mld_reserved = 0; + ipv6_addr_copy(&mldq->mld_mca, group); + + /* checksum */ + mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + sizeof(*mldq), IPPROTO_ICMPV6, + csum_partial(mldq, + sizeof(*mldq), 0)); + skb_put(skb, sizeof(*mldq)); + + __skb_pull(skb, sizeof(*eth)); + +out: + return skb; +} +#endif + +static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br, + struct br_ip *addr) +{ + switch (addr->proto) { + case htons(ETH_P_IP): + return br_ip4_multicast_alloc_query(br, addr->u.ip4); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case htons(ETH_P_IPV6): + return br_ip6_multicast_alloc_query(br, &addr->u.ip6); +#endif + } + return NULL; +} + +static struct net_bridge_mdb_entry *br_multicast_get_group( + struct net_bridge *br, struct net_bridge_port *port, + struct br_ip *group, int hash) +{ + struct net_bridge_mdb_htable *mdb; + struct net_bridge_mdb_entry *mp; + struct hlist_node *p; + unsigned count = 0; + unsigned max; + int elasticity; + int err; + + mdb = rcu_dereference_protected(br->mdb, 1); + hlist_for_each_entry(mp, p, &mdb->mhash[hash], hlist[mdb->ver]) { + count++; + if (unlikely(br_ip_equal(group, &mp->addr))) + return mp; + } + + elasticity = 0; + max = mdb->max; + + if (unlikely(count > br->hash_elasticity && count)) { + if (net_ratelimit()) + br_info(br, "Multicast hash table " + "chain limit reached: %s\n", + port ? port->dev->name : br->dev->name); + + elasticity = br->hash_elasticity; + } + + if (mdb->size >= max) { + max *= 2; + if (unlikely(max >= br->hash_max)) { + br_warn(br, "Multicast hash table maximum " + "reached, disabling snooping: %s, %d\n", + port ? port->dev->name : br->dev->name, max); + err = -E2BIG; +disable: + br->multicast_disabled = 1; + goto err; + } + } + + if (max > mdb->max || elasticity) { + if (mdb->old) { + if (net_ratelimit()) + br_info(br, "Multicast hash table " + "on fire: %s\n", + port ? port->dev->name : br->dev->name); + err = -EEXIST; + goto err; + } + + err = br_mdb_rehash(&br->mdb, max, elasticity); + if (err) { + br_warn(br, "Cannot rehash multicast " + "hash table, disabling snooping: %s, %d, %d\n", + port ? 
port->dev->name : br->dev->name, + mdb->size, err); + goto disable; + } + + err = -EAGAIN; + goto err; + } + + return NULL; + +err: + mp = ERR_PTR(err); + return mp; +} + +static struct net_bridge_mdb_entry *br_multicast_new_group( + struct net_bridge *br, struct net_bridge_port *port, + struct br_ip *group) +{ + struct net_bridge_mdb_htable *mdb; + struct net_bridge_mdb_entry *mp; + int hash; + int err; + + mdb = rcu_dereference_protected(br->mdb, 1); + if (!mdb) { + err = br_mdb_rehash(&br->mdb, BR_HASH_SIZE, 0); + if (err) + return ERR_PTR(err); + goto rehash; + } + + hash = br_ip_hash(mdb, group); + mp = br_multicast_get_group(br, port, group, hash); + switch (PTR_ERR(mp)) { + case 0: + break; + + case -EAGAIN: +rehash: + mdb = rcu_dereference_protected(br->mdb, 1); + hash = br_ip_hash(mdb, group); + break; + + default: + goto out; + } + + mp = kzalloc(sizeof(*mp), GFP_ATOMIC); + if (unlikely(!mp)) + return ERR_PTR(-ENOMEM); + + mp->br = br; + mp->addr = *group; + setup_timer(&mp->timer, br_multicast_group_expired, + (unsigned long)mp); + + hlist_add_head_rcu(&mp->hlist[mdb->ver], &mdb->mhash[hash]); + mdb->size++; + +out: + return mp; +} + +static int br_multicast_add_group(struct net_bridge *br, + struct net_bridge_port *port, + struct br_ip *group) +{ + struct net_bridge_mdb_entry *mp; + struct net_bridge_port_group *p; + struct net_bridge_port_group __rcu **pp; + unsigned long now = jiffies; + int err; + + spin_lock(&br->multicast_lock); + if (!netif_running(br->dev) || + (port && port->state == BR_STATE_DISABLED)) + goto out; + + mp = br_multicast_new_group(br, port, group); + err = PTR_ERR(mp); + if (IS_ERR(mp)) + goto err; + + if (!port) { + mp->mglist = true; + mod_timer(&mp->timer, now + br->multicast_membership_interval); + goto out; + } + + for (pp = &mp->ports; + (p = mlock_dereference(*pp, br)) != NULL; + pp = &p->next) { + if (p->port == port) + goto found; + if ((unsigned long)p->port < (unsigned long)port) + break; + } + + p = kzalloc(sizeof(*p), GFP_ATOMIC); + err = -ENOMEM; + if (unlikely(!p)) + goto err; + + p->addr = *group; + p->port = port; + p->next = *pp; + hlist_add_head(&p->mglist, &port->mglist); + setup_timer(&p->timer, br_multicast_port_group_expired, + (unsigned long)p); + + rcu_assign_pointer(*pp, p); + +found: + mod_timer(&p->timer, now + br->multicast_membership_interval); +out: + err = 0; + +err: + spin_unlock(&br->multicast_lock); + return err; +} + +static int br_ip4_multicast_add_group(struct net_bridge *br, + struct net_bridge_port *port, + __be32 group) +{ + struct br_ip br_group; + + if (ipv4_is_local_multicast(group)) + return 0; + + br_group.u.ip4 = group; + br_group.proto = htons(ETH_P_IP); + + return br_multicast_add_group(br, port, &br_group); +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static int br_ip6_multicast_add_group(struct net_bridge *br, + struct net_bridge_port *port, + const struct in6_addr *group) +{ + struct br_ip br_group; + + if (!ipv6_is_transient_multicast(group)) + return 0; + + ipv6_addr_copy(&br_group.u.ip6, group); + br_group.proto = htons(ETH_P_IPV6); + + return br_multicast_add_group(br, port, &br_group); +} +#endif + +static void br_multicast_router_expired(unsigned long data) +{ + struct net_bridge_port *port = (void *)data; + struct net_bridge *br = port->br; + + spin_lock(&br->multicast_lock); + if (port->multicast_router != 1 || + timer_pending(&port->multicast_router_timer) || + hlist_unhashed(&port->rlist)) + goto out; + + hlist_del_init_rcu(&port->rlist); + +out: + 
spin_unlock(&br->multicast_lock); +} + +static void br_multicast_local_router_expired(unsigned long data) +{ +} + +static void __br_multicast_send_query(struct net_bridge *br, + struct net_bridge_port *port, + struct br_ip *ip) +{ + struct sk_buff *skb; + + skb = br_multicast_alloc_query(br, ip); + if (!skb) + return; + + if (port) { + __skb_push(skb, sizeof(struct ethhdr)); + skb->dev = port->dev; + NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev, + dev_queue_xmit); + } else + netif_rx(skb); +} + +static void br_multicast_send_query(struct net_bridge *br, + struct net_bridge_port *port, u32 sent) +{ + unsigned long time; + struct br_ip br_group; + + if (!netif_running(br->dev) || br->multicast_disabled || + timer_pending(&br->multicast_querier_timer)) + return; + + memset(&br_group.u, 0, sizeof(br_group.u)); + + br_group.proto = htons(ETH_P_IP); + __br_multicast_send_query(br, port, &br_group); + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + br_group.proto = htons(ETH_P_IPV6); + __br_multicast_send_query(br, port, &br_group); +#endif + + time = jiffies; + time += sent < br->multicast_startup_query_count ? + br->multicast_startup_query_interval : + br->multicast_query_interval; + mod_timer(port ? &port->multicast_query_timer : + &br->multicast_query_timer, time); +} + +static void br_multicast_port_query_expired(unsigned long data) +{ + struct net_bridge_port *port = (void *)data; + struct net_bridge *br = port->br; + + spin_lock(&br->multicast_lock); + if (port->state == BR_STATE_DISABLED || + port->state == BR_STATE_BLOCKING) + goto out; + + if (port->multicast_startup_queries_sent < + br->multicast_startup_query_count) + port->multicast_startup_queries_sent++; + + br_multicast_send_query(port->br, port, + port->multicast_startup_queries_sent); + +out: + spin_unlock(&br->multicast_lock); +} + +void br_multicast_add_port(struct net_bridge_port *port) +{ + port->multicast_router = 1; + + setup_timer(&port->multicast_router_timer, br_multicast_router_expired, + (unsigned long)port); + setup_timer(&port->multicast_query_timer, + br_multicast_port_query_expired, (unsigned long)port); +} + +void br_multicast_del_port(struct net_bridge_port *port) +{ + del_timer_sync(&port->multicast_router_timer); +} + +static void __br_multicast_enable_port(struct net_bridge_port *port) +{ + port->multicast_startup_queries_sent = 0; + + if (try_to_del_timer_sync(&port->multicast_query_timer) >= 0 || + del_timer(&port->multicast_query_timer)) + mod_timer(&port->multicast_query_timer, jiffies); +} + +void br_multicast_enable_port(struct net_bridge_port *port) +{ + struct net_bridge *br = port->br; + + spin_lock(&br->multicast_lock); + if (br->multicast_disabled || !netif_running(br->dev)) + goto out; + + __br_multicast_enable_port(port); + +out: + spin_unlock(&br->multicast_lock); +} + +void br_multicast_disable_port(struct net_bridge_port *port) +{ + struct net_bridge *br = port->br; + struct net_bridge_port_group *pg; + struct hlist_node *p, *n; + + spin_lock(&br->multicast_lock); + hlist_for_each_entry_safe(pg, p, n, &port->mglist, mglist) + br_multicast_del_pg(br, pg); + + if (!hlist_unhashed(&port->rlist)) + hlist_del_init_rcu(&port->rlist); + del_timer(&port->multicast_router_timer); + del_timer(&port->multicast_query_timer); + spin_unlock(&br->multicast_lock); +} + +static int br_ip4_multicast_igmp3_report(struct net_bridge *br, + struct net_bridge_port *port, + struct sk_buff *skb) +{ + struct igmpv3_report *ih; + struct igmpv3_grec *grec; + int i; + int len; + int num; + int 
type; + int err = 0; + __be32 group; + + if (!pskb_may_pull(skb, sizeof(*ih))) + return -EINVAL; + + ih = igmpv3_report_hdr(skb); + num = ntohs(ih->ngrec); + len = sizeof(*ih); + + for (i = 0; i < num; i++) { + len += sizeof(*grec); + if (!pskb_may_pull(skb, len)) + return -EINVAL; + + grec = (void *)(skb->data + len - sizeof(*grec)); + group = grec->grec_mca; + type = grec->grec_type; + + len += ntohs(grec->grec_nsrcs) * 4; + if (!pskb_may_pull(skb, len)) + return -EINVAL; + + /* We treat this as an IGMPv2 report for now. */ + switch (type) { + case IGMPV3_MODE_IS_INCLUDE: + case IGMPV3_MODE_IS_EXCLUDE: + case IGMPV3_CHANGE_TO_INCLUDE: + case IGMPV3_CHANGE_TO_EXCLUDE: + case IGMPV3_ALLOW_NEW_SOURCES: + case IGMPV3_BLOCK_OLD_SOURCES: + break; + + default: + continue; + } + + err = br_ip4_multicast_add_group(br, port, group); + if (err) + break; + } + + return err; +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static int br_ip6_multicast_mld2_report(struct net_bridge *br, + struct net_bridge_port *port, + struct sk_buff *skb) +{ + struct icmp6hdr *icmp6h; + struct mld2_grec *grec; + int i; + int len; + int num; + int err = 0; + + if (!pskb_may_pull(skb, sizeof(*icmp6h))) + return -EINVAL; + + icmp6h = icmp6_hdr(skb); + num = ntohs(icmp6h->icmp6_dataun.un_data16[1]); + len = sizeof(*icmp6h); + + for (i = 0; i < num; i++) { + __be16 *nsrcs, _nsrcs; + + nsrcs = skb_header_pointer(skb, + len + offsetof(struct mld2_grec, + grec_nsrcs), + sizeof(_nsrcs), &_nsrcs); + if (!nsrcs) + return -EINVAL; + + if (!pskb_may_pull(skb, + len + sizeof(*grec) + + sizeof(struct in6_addr) * ntohs(*nsrcs))) + return -EINVAL; + + grec = (struct mld2_grec *)(skb->data + len); + len += sizeof(*grec) + + sizeof(struct in6_addr) * ntohs(*nsrcs); + + /* We treat these as MLDv1 reports for now. 
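+ * An MLDv2 group record also carries a source list, but the snooping code + * only tracks group membership, so each recognized record type below is + * handled as a plain join for grec->grec_mca and the source addresses + * are ignored.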
*/ + switch (grec->grec_type) { + case MLD2_MODE_IS_INCLUDE: + case MLD2_MODE_IS_EXCLUDE: + case MLD2_CHANGE_TO_INCLUDE: + case MLD2_CHANGE_TO_EXCLUDE: + case MLD2_ALLOW_NEW_SOURCES: + case MLD2_BLOCK_OLD_SOURCES: + break; + + default: + continue; + } + + err = br_ip6_multicast_add_group(br, port, &grec->grec_mca); + if (err) + break; + } + + return err; +} +#endif + +/* + * Add port to router_list + * list is maintained ordered by pointer value + * and locked by br->multicast_lock and RCU + */ +static void br_multicast_add_router(struct net_bridge *br, + struct net_bridge_port *port) +{ + struct net_bridge_port *p; + struct hlist_node *n, *slot = NULL; + + hlist_for_each_entry(p, n, &br->router_list, rlist) { + if ((unsigned long) port >= (unsigned long) p) + break; + slot = n; + } + + if (slot) + hlist_add_after_rcu(slot, &port->rlist); + else + hlist_add_head_rcu(&port->rlist, &br->router_list); +} + +static void br_multicast_mark_router(struct net_bridge *br, + struct net_bridge_port *port) +{ + unsigned long now = jiffies; + + if (!port) { + if (br->multicast_router == 1) + mod_timer(&br->multicast_router_timer, + now + br->multicast_querier_interval); + return; + } + + if (port->multicast_router != 1) + return; + + if (!hlist_unhashed(&port->rlist)) + goto timer; + + br_multicast_add_router(br, port); + +timer: + mod_timer(&port->multicast_router_timer, + now + br->multicast_querier_interval); +} + +static void br_multicast_query_received(struct net_bridge *br, + struct net_bridge_port *port, + int saddr) +{ + if (saddr) + mod_timer(&br->multicast_querier_timer, + jiffies + br->multicast_querier_interval); + else if (timer_pending(&br->multicast_querier_timer)) + return; + + br_multicast_mark_router(br, port); +} + +static int br_ip4_multicast_query(struct net_bridge *br, + struct net_bridge_port *port, + struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + struct igmphdr *ih = igmp_hdr(skb); + struct net_bridge_mdb_entry *mp; + struct igmpv3_query *ih3; + struct net_bridge_port_group *p; + struct net_bridge_port_group __rcu **pp; + unsigned long max_delay; + unsigned long now = jiffies; + __be32 group; + int err = 0; + + spin_lock(&br->multicast_lock); + if (!netif_running(br->dev) || + (port && port->state == BR_STATE_DISABLED)) + goto out; + + br_multicast_query_received(br, port, !!iph->saddr); + + group = ih->group; + + if (skb->len == sizeof(*ih)) { + max_delay = ih->code * (HZ / IGMP_TIMER_SCALE); + + if (!max_delay) { + max_delay = 10 * HZ; + group = 0; + } + } else { + if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) { + err = -EINVAL; + goto out; + } + + ih3 = igmpv3_query_hdr(skb); + if (ih3->nsrcs) + goto out; + + max_delay = ih3->code ? + IGMPV3_MRC(ih3->code) * (HZ / IGMP_TIMER_SCALE) : 1; + } + + if (!group) + goto out; + + mp = br_mdb_ip4_get(mlock_dereference(br->mdb, br), group); + if (!mp) + goto out; + + max_delay *= br->multicast_last_member_count; + + if (mp->mglist && + (timer_pending(&mp->timer) ? + time_after(mp->timer.expires, now + max_delay) : + try_to_del_timer_sync(&mp->timer) >= 0)) + mod_timer(&mp->timer, now + max_delay); + + for (pp = &mp->ports; + (p = mlock_dereference(*pp, br)) != NULL; + pp = &p->next) { + if (timer_pending(&p->timer) ?
+ time_after(p->timer.expires, now + max_delay) : + try_to_del_timer_sync(&p->timer) >= 0) + mod_timer(&p->timer, now + max_delay); + } + +out: + spin_unlock(&br->multicast_lock); + return err; +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static int br_ip6_multicast_query(struct net_bridge *br, + struct net_bridge_port *port, + struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct mld_msg *mld = (struct mld_msg *) icmp6_hdr(skb); + struct net_bridge_mdb_entry *mp; + struct mld2_query *mld2q; + struct net_bridge_port_group *p; + struct net_bridge_port_group __rcu **pp; + unsigned long max_delay; + unsigned long now = jiffies; + const struct in6_addr *group = NULL; + int err = 0; + + spin_lock(&br->multicast_lock); + if (!netif_running(br->dev) || + (port && port->state == BR_STATE_DISABLED)) + goto out; + + br_multicast_query_received(br, port, !ipv6_addr_any(&ip6h->saddr)); + + if (skb->len == sizeof(*mld)) { + if (!pskb_may_pull(skb, sizeof(*mld))) { + err = -EINVAL; + goto out; + } + mld = (struct mld_msg *) icmp6_hdr(skb); + max_delay = msecs_to_jiffies(htons(mld->mld_maxdelay)); + if (max_delay) + group = &mld->mld_mca; + } else if (skb->len >= sizeof(*mld2q)) { + if (!pskb_may_pull(skb, sizeof(*mld2q))) { + err = -EINVAL; + goto out; + } + mld2q = (struct mld2_query *)icmp6_hdr(skb); + if (!mld2q->mld2q_nsrcs) + group = &mld2q->mld2q_mca; + max_delay = mld2q->mld2q_mrc ? MLDV2_MRC(mld2q->mld2q_mrc) : 1; + } + + if (!group) + goto out; + + mp = br_mdb_ip6_get(mlock_dereference(br->mdb, br), group); + if (!mp) + goto out; + + max_delay *= br->multicast_last_member_count; + if (mp->mglist && + (timer_pending(&mp->timer) ? + time_after(mp->timer.expires, now + max_delay) : + try_to_del_timer_sync(&mp->timer) >= 0)) + mod_timer(&mp->timer, now + max_delay); + + for (pp = &mp->ports; + (p = mlock_dereference(*pp, br)) != NULL; + pp = &p->next) { + if (timer_pending(&p->timer) ? + time_after(p->timer.expires, now + max_delay) : + try_to_del_timer_sync(&p->timer) >= 0) + mod_timer(&p->timer, now + max_delay); + } + +out: + spin_unlock(&br->multicast_lock); + return err; +} +#endif + +static void br_multicast_leave_group(struct net_bridge *br, + struct net_bridge_port *port, + struct br_ip *group) +{ + struct net_bridge_mdb_htable *mdb; + struct net_bridge_mdb_entry *mp; + struct net_bridge_port_group *p; + unsigned long now; + unsigned long time; + + spin_lock(&br->multicast_lock); + if (!netif_running(br->dev) || + (port && port->state == BR_STATE_DISABLED) || + timer_pending(&br->multicast_querier_timer)) + goto out; + + mdb = mlock_dereference(br->mdb, br); + mp = br_mdb_ip_get(mdb, group); + if (!mp) + goto out; + + now = jiffies; + time = now + br->multicast_last_member_count * + br->multicast_last_member_interval; + + if (!port) { + if (mp->mglist && + (timer_pending(&mp->timer) ? + time_after(mp->timer.expires, time) : + try_to_del_timer_sync(&mp->timer) >= 0)) { + mod_timer(&mp->timer, time); + } + + goto out; + } + + for (p = mlock_dereference(mp->ports, br); + p != NULL; + p = mlock_dereference(p->next, br)) { + if (p->port != port) + continue; + + if (!hlist_unhashed(&p->mglist) && + (timer_pending(&p->timer) ? 
+ time_after(p->timer.expires, time) : + try_to_del_timer_sync(&p->timer) >= 0)) { + mod_timer(&p->timer, time); + } + + break; + } + +out: + spin_unlock(&br->multicast_lock); +} + +static void br_ip4_multicast_leave_group(struct net_bridge *br, + struct net_bridge_port *port, + __be32 group) +{ + struct br_ip br_group; + + if (ipv4_is_local_multicast(group)) + return; + + br_group.u.ip4 = group; + br_group.proto = htons(ETH_P_IP); + + br_multicast_leave_group(br, port, &br_group); +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static void br_ip6_multicast_leave_group(struct net_bridge *br, + struct net_bridge_port *port, + const struct in6_addr *group) +{ + struct br_ip br_group; + + if (!ipv6_is_transient_multicast(group)) + return; + + ipv6_addr_copy(&br_group.u.ip6, group); + br_group.proto = htons(ETH_P_IPV6); + + br_multicast_leave_group(br, port, &br_group); +} +#endif + +static int br_multicast_ipv4_rcv(struct net_bridge *br, + struct net_bridge_port *port, + struct sk_buff *skb) +{ + struct sk_buff *skb2 = skb; + const struct iphdr *iph; + struct igmphdr *ih; + unsigned len; + unsigned offset; + int err; + + /* We treat OOM as packet loss for now. */ + if (!pskb_may_pull(skb, sizeof(*iph))) + return -EINVAL; + + iph = ip_hdr(skb); + + if (iph->ihl < 5 || iph->version != 4) + return -EINVAL; + + if (!pskb_may_pull(skb, ip_hdrlen(skb))) + return -EINVAL; + + iph = ip_hdr(skb); + + if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) + return -EINVAL; + + if (iph->protocol != IPPROTO_IGMP) { + if ((iph->daddr & IGMP_LOCAL_GROUP_MASK) != IGMP_LOCAL_GROUP) + BR_INPUT_SKB_CB(skb)->mrouters_only = 1; + return 0; + } + + len = ntohs(iph->tot_len); + if (skb->len < len || len < ip_hdrlen(skb)) + return -EINVAL; + + if (skb->len > len) { + skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) + return -ENOMEM; + + err = pskb_trim_rcsum(skb2, len); + if (err) + goto err_out; + } + + len -= ip_hdrlen(skb2); + offset = skb_network_offset(skb2) + ip_hdrlen(skb2); + __skb_pull(skb2, offset); + skb_reset_transport_header(skb2); + + err = -EINVAL; + if (!pskb_may_pull(skb2, sizeof(*ih))) + goto out; + + switch (skb2->ip_summed) { + case CHECKSUM_COMPLETE: + if (!csum_fold(skb2->csum)) + break; + /* fall through */ + case CHECKSUM_NONE: + skb2->csum = 0; + if (skb_checksum_complete(skb2)) + goto out; + } + + err = 0; + + BR_INPUT_SKB_CB(skb)->igmp = 1; + ih = igmp_hdr(skb2); + + switch (ih->type) { + case IGMP_HOST_MEMBERSHIP_REPORT: + case IGMPV2_HOST_MEMBERSHIP_REPORT: + BR_INPUT_SKB_CB(skb)->mrouters_only = 1; + err = br_ip4_multicast_add_group(br, port, ih->group); + break; + case IGMPV3_HOST_MEMBERSHIP_REPORT: + err = br_ip4_multicast_igmp3_report(br, port, skb2); + break; + case IGMP_HOST_MEMBERSHIP_QUERY: + err = br_ip4_multicast_query(br, port, skb2); + break; + case IGMP_HOST_LEAVE_MESSAGE: + br_ip4_multicast_leave_group(br, port, ih->group); + break; + } + +out: + __skb_push(skb2, offset); +err_out: + if (skb2 != skb) + kfree_skb(skb2); + return err; +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static int br_multicast_ipv6_rcv(struct net_bridge *br, + struct net_bridge_port *port, + struct sk_buff *skb) +{ + struct sk_buff *skb2; + const struct ipv6hdr *ip6h; + u8 icmp6_type; + u8 nexthdr; + unsigned len; + int offset; + int err; + + if (!pskb_may_pull(skb, sizeof(*ip6h))) + return -EINVAL; + + ip6h = ipv6_hdr(skb); + + /* + * We're interested in MLD messages only. 
+ * - Version is 6 + * - MLD always has a Router Alert hop-by-hop option + * - But we do not support jumbograms. + */ + if (ip6h->version != 6 || + ip6h->nexthdr != IPPROTO_HOPOPTS || + ip6h->payload_len == 0) + return 0; + + len = ntohs(ip6h->payload_len) + sizeof(*ip6h); + if (skb->len < len) + return -EINVAL; + + nexthdr = ip6h->nexthdr; + offset = ipv6_skip_exthdr(skb, sizeof(*ip6h), &nexthdr); + + if (offset < 0 || nexthdr != IPPROTO_ICMPV6) + return 0; + + /* Okay, we found ICMPv6 header */ + skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) + return -ENOMEM; + + err = -EINVAL; + if (!pskb_may_pull(skb2, offset + sizeof(struct icmp6hdr))) + goto out; + + len -= offset - skb_network_offset(skb2); + + __skb_pull(skb2, offset); + skb_reset_transport_header(skb2); + + icmp6_type = icmp6_hdr(skb2)->icmp6_type; + + switch (icmp6_type) { + case ICMPV6_MGM_QUERY: + case ICMPV6_MGM_REPORT: + case ICMPV6_MGM_REDUCTION: + case ICMPV6_MLD2_REPORT: + break; + default: + err = 0; + goto out; + } + + /* Okay, we found MLD message. Check further. */ + if (skb2->len > len) { + err = pskb_trim_rcsum(skb2, len); + if (err) + goto out; + err = -EINVAL; + } + + ip6h = ipv6_hdr(skb2); + + switch (skb2->ip_summed) { + case CHECKSUM_COMPLETE: + if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, skb2->len, + IPPROTO_ICMPV6, skb2->csum)) + break; + /*FALLTHROUGH*/ + case CHECKSUM_NONE: + skb2->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr, + &ip6h->daddr, + skb2->len, + IPPROTO_ICMPV6, 0)); + if (__skb_checksum_complete(skb2)) + goto out; + } + + err = 0; + + BR_INPUT_SKB_CB(skb)->igmp = 1; + + switch (icmp6_type) { + case ICMPV6_MGM_REPORT: + { + struct mld_msg *mld; + if (!pskb_may_pull(skb2, sizeof(*mld))) { + err = -EINVAL; + goto out; + } + mld = (struct mld_msg *)skb_transport_header(skb2); + BR_INPUT_SKB_CB(skb)->mrouters_only = 1; + err = br_ip6_multicast_add_group(br, port, &mld->mld_mca); + break; + } + case ICMPV6_MLD2_REPORT: + err = br_ip6_multicast_mld2_report(br, port, skb2); + break; + case ICMPV6_MGM_QUERY: + err = br_ip6_multicast_query(br, port, skb2); + break; + case ICMPV6_MGM_REDUCTION: + { + struct mld_msg *mld; + if (!pskb_may_pull(skb2, sizeof(*mld))) { + err = -EINVAL; + goto out; + } + mld = (struct mld_msg *)skb_transport_header(skb2); + br_ip6_multicast_leave_group(br, port, &mld->mld_mca); + } + } + +out: + kfree_skb(skb2); + return err; +} +#endif + +int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, + struct sk_buff *skb) +{ + BR_INPUT_SKB_CB(skb)->igmp = 0; + BR_INPUT_SKB_CB(skb)->mrouters_only = 0; + + if (br->multicast_disabled) + return 0; + + switch (skb->protocol) { + case htons(ETH_P_IP): + return br_multicast_ipv4_rcv(br, port, skb); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case htons(ETH_P_IPV6): + return br_multicast_ipv6_rcv(br, port, skb); +#endif + } + + return 0; +} + +static void br_multicast_query_expired(unsigned long data) +{ + struct net_bridge *br = (void *)data; + + spin_lock(&br->multicast_lock); + if (br->multicast_startup_queries_sent < + br->multicast_startup_query_count) + br->multicast_startup_queries_sent++; + + br_multicast_send_query(br, NULL, br->multicast_startup_queries_sent); + + spin_unlock(&br->multicast_lock); +} + +void br_multicast_init(struct net_bridge *br) +{ + br->hash_elasticity = 4; + br->hash_max = 512; + + br->multicast_router = 1; + br->multicast_last_member_count = 2; + br->multicast_startup_query_count = 2; + + br->multicast_last_member_interval = HZ; + br->multicast_query_response_interval
= 10 * HZ; + br->multicast_startup_query_interval = 125 * HZ / 4; + br->multicast_query_interval = 125 * HZ; + br->multicast_querier_interval = 255 * HZ; + br->multicast_membership_interval = 260 * HZ; + + spin_lock_init(&br->multicast_lock); + setup_timer(&br->multicast_router_timer, + br_multicast_local_router_expired, 0); + setup_timer(&br->multicast_querier_timer, + br_multicast_local_router_expired, 0); + setup_timer(&br->multicast_query_timer, br_multicast_query_expired, + (unsigned long)br); +} + +void br_multicast_open(struct net_bridge *br) +{ + br->multicast_startup_queries_sent = 0; + + if (br->multicast_disabled) + return; + + mod_timer(&br->multicast_query_timer, jiffies); +} + +void br_multicast_stop(struct net_bridge *br) +{ + struct net_bridge_mdb_htable *mdb; + struct net_bridge_mdb_entry *mp; + struct hlist_node *p, *n; + u32 ver; + int i; + + del_timer_sync(&br->multicast_router_timer); + del_timer_sync(&br->multicast_querier_timer); + del_timer_sync(&br->multicast_query_timer); + + spin_lock_bh(&br->multicast_lock); + mdb = mlock_dereference(br->mdb, br); + if (!mdb) + goto out; + + br->mdb = NULL; + + ver = mdb->ver; + for (i = 0; i < mdb->max; i++) { + hlist_for_each_entry_safe(mp, p, n, &mdb->mhash[i], + hlist[ver]) { + del_timer(&mp->timer); + call_rcu_bh(&mp->rcu, br_multicast_free_group); + } + } + + if (mdb->old) { + spin_unlock_bh(&br->multicast_lock); + rcu_barrier_bh(); + spin_lock_bh(&br->multicast_lock); + WARN_ON(mdb->old); + } + + mdb->old = mdb; + call_rcu_bh(&mdb->rcu, br_mdb_free); + +out: + spin_unlock_bh(&br->multicast_lock); +} + +int br_multicast_set_router(struct net_bridge *br, unsigned long val) +{ + int err = -ENOENT; + + spin_lock_bh(&br->multicast_lock); + if (!netif_running(br->dev)) + goto unlock; + + switch (val) { + case 0: + case 2: + del_timer(&br->multicast_router_timer); + /* fall through */ + case 1: + br->multicast_router = val; + err = 0; + break; + + default: + err = -EINVAL; + break; + } + +unlock: + spin_unlock_bh(&br->multicast_lock); + + return err; +} + +int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) +{ + struct net_bridge *br = p->br; + int err = -ENOENT; + + spin_lock(&br->multicast_lock); + if (!netif_running(br->dev) || p->state == BR_STATE_DISABLED) + goto unlock; + + switch (val) { + case 0: + case 1: + case 2: + p->multicast_router = val; + err = 0; + + if (val < 2 && !hlist_unhashed(&p->rlist)) + hlist_del_init_rcu(&p->rlist); + + if (val == 1) + break; + + del_timer(&p->multicast_router_timer); + + if (val == 0) + break; + + br_multicast_add_router(br, p); + break; + + default: + err = -EINVAL; + break; + } + +unlock: + spin_unlock(&br->multicast_lock); + + return err; +} + +int br_multicast_toggle(struct net_bridge *br, unsigned long val) +{ + struct net_bridge_port *port; + int err = 0; + struct net_bridge_mdb_htable *mdb; + + spin_lock(&br->multicast_lock); + if (br->multicast_disabled == !val) + goto unlock; + + br->multicast_disabled = !val; + if (br->multicast_disabled) + goto unlock; + + if (!netif_running(br->dev)) + goto unlock; + + mdb = mlock_dereference(br->mdb, br); + if (mdb) { + if (mdb->old) { + err = -EEXIST; +rollback: + br->multicast_disabled = !!val; + goto unlock; + } + + err = br_mdb_rehash(&br->mdb, mdb->max, + br->hash_elasticity); + if (err) + goto rollback; + } + + br_multicast_open(br); + list_for_each_entry(port, &br->port_list, list) { + if (port->state == BR_STATE_DISABLED || + port->state == BR_STATE_BLOCKING) + continue; + + 
__br_multicast_enable_port(port); + } + +unlock: + spin_unlock(&br->multicast_lock); + + return err; +} + +int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val) +{ + int err = -ENOENT; + u32 old; + struct net_bridge_mdb_htable *mdb; + + spin_lock(&br->multicast_lock); + if (!netif_running(br->dev)) + goto unlock; + + err = -EINVAL; + if (!is_power_of_2(val)) + goto unlock; + + mdb = mlock_dereference(br->mdb, br); + if (mdb && val < mdb->size) + goto unlock; + + err = 0; + + old = br->hash_max; + br->hash_max = val; + + if (mdb) { + if (mdb->old) { + err = -EEXIST; +rollback: + br->hash_max = old; + goto unlock; + } + + err = br_mdb_rehash(&br->mdb, br->hash_max, + br->hash_elasticity); + if (err) + goto rollback; + } + +unlock: + spin_unlock(&br->multicast_lock); + + return err; +} diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c new file mode 100644 index 00000000..3dc7f544 --- /dev/null +++ b/net/bridge/br_netfilter.c @@ -0,0 +1,1037 @@ +/* + * Handle firewalling + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * Bart De Schuymer + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Lennert dedicates this file to Kerstin Wurdinger. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include "br_private.h" +#ifdef CONFIG_SYSCTL +#include +#endif + +#define skb_origaddr(skb) (((struct bridge_skb_cb *) \ + (skb->nf_bridge->data))->daddr.ipv4) +#define store_orig_dstaddr(skb) (skb_origaddr(skb) = ip_hdr(skb)->daddr) +#define dnat_took_place(skb) (skb_origaddr(skb) != ip_hdr(skb)->daddr) + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *brnf_sysctl_header; +static int brnf_call_iptables __read_mostly = 1; +static int brnf_call_ip6tables __read_mostly = 1; +static int brnf_call_arptables __read_mostly = 1; +static int brnf_filter_vlan_tagged __read_mostly = 0; +static int brnf_filter_pppoe_tagged __read_mostly = 0; +#else +#define brnf_call_iptables 1 +#define brnf_call_ip6tables 1 +#define brnf_call_arptables 1 +#define brnf_filter_vlan_tagged 0 +#define brnf_filter_pppoe_tagged 0 +#endif + +static inline __be16 vlan_proto(const struct sk_buff *skb) +{ + if (vlan_tx_tag_present(skb)) + return skb->protocol; + else if (skb->protocol == htons(ETH_P_8021Q)) + return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; + else + return 0; +} + +#define IS_VLAN_IP(skb) \ + (vlan_proto(skb) == htons(ETH_P_IP) && \ + brnf_filter_vlan_tagged) + +#define IS_VLAN_IPV6(skb) \ + (vlan_proto(skb) == htons(ETH_P_IPV6) && \ + brnf_filter_vlan_tagged) + +#define IS_VLAN_ARP(skb) \ + (vlan_proto(skb) == htons(ETH_P_ARP) && \ + brnf_filter_vlan_tagged) + +static inline __be16 pppoe_proto(const struct sk_buff *skb) +{ + return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN + + sizeof(struct pppoe_hdr))); +} + +#define IS_PPPOE_IP(skb) \ + (skb->protocol == htons(ETH_P_PPP_SES) && \ + pppoe_proto(skb) == htons(PPP_IP) && \ + brnf_filter_pppoe_tagged) + +#define IS_PPPOE_IPV6(skb) \ + (skb->protocol == htons(ETH_P_PPP_SES) && \ + pppoe_proto(skb) == htons(PPP_IPV6) && \ + brnf_filter_pppoe_tagged) + +static void fake_update_pmtu(struct dst_entry *dst, u32 mtu) +{ +} + +static u32 
*fake_cow_metrics(struct dst_entry *dst, unsigned long old) +{ + return NULL; +} + +static struct dst_ops fake_dst_ops = { + .family = AF_INET, + .protocol = cpu_to_be16(ETH_P_IP), + .update_pmtu = fake_update_pmtu, + .cow_metrics = fake_cow_metrics, +}; + +/* + * Initialize bogus route table used to keep netfilter happy. + * Currently, we fill in the PMTU entry because netfilter + * refragmentation needs it, and the rt_flags entry because + * ipt_REJECT needs it. Future netfilter modules might + * require us to fill additional fields. + */ +static const u32 br_dst_default_metrics[RTAX_MAX] = { + [RTAX_MTU - 1] = 1500, +}; + +void br_netfilter_rtable_init(struct net_bridge *br) +{ + struct rtable *rt = &br->fake_rtable; + + atomic_set(&rt->dst.__refcnt, 1); + rt->dst.dev = br->dev; + rt->dst.path = &rt->dst; + dst_init_metrics(&rt->dst, br_dst_default_metrics, true); + rt->dst.flags = DST_NOXFRM; + rt->dst.ops = &fake_dst_ops; +} + +static inline struct rtable *bridge_parent_rtable(const struct net_device *dev) +{ + struct net_bridge_port *port; + + port = br_port_get_rcu(dev); + return port ? &port->br->fake_rtable : NULL; +} + +static inline struct net_device *bridge_parent(const struct net_device *dev) +{ + struct net_bridge_port *port; + + port = br_port_get_rcu(dev); + return port ? port->br->dev : NULL; +} + +static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb) +{ + skb->nf_bridge = kzalloc(sizeof(struct nf_bridge_info), GFP_ATOMIC); + if (likely(skb->nf_bridge)) + atomic_set(&(skb->nf_bridge->use), 1); + + return skb->nf_bridge; +} + +static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge = skb->nf_bridge; + + if (atomic_read(&nf_bridge->use) > 1) { + struct nf_bridge_info *tmp = nf_bridge_alloc(skb); + + if (tmp) { + memcpy(tmp, nf_bridge, sizeof(struct nf_bridge_info)); + atomic_set(&tmp->use, 1); + } + nf_bridge_put(nf_bridge); + nf_bridge = tmp; + } + return nf_bridge; +} + +static inline void nf_bridge_push_encap_header(struct sk_buff *skb) +{ + unsigned int len = nf_bridge_encap_header_len(skb); + + skb_push(skb, len); + skb->network_header -= len; +} + +static inline void nf_bridge_pull_encap_header(struct sk_buff *skb) +{ + unsigned int len = nf_bridge_encap_header_len(skb); + + skb_pull(skb, len); + skb->network_header += len; +} + +static inline void nf_bridge_pull_encap_header_rcsum(struct sk_buff *skb) +{ + unsigned int len = nf_bridge_encap_header_len(skb); + + skb_pull_rcsum(skb, len); + skb->network_header += len; +} + +static inline void nf_bridge_save_header(struct sk_buff *skb) +{ + int header_size = ETH_HLEN + nf_bridge_encap_header_len(skb); + + skb_copy_from_linear_data_offset(skb, -header_size, + skb->nf_bridge->data, header_size); +} + +static inline void nf_bridge_update_protocol(struct sk_buff *skb) +{ + if (skb->nf_bridge->mask & BRNF_8021Q) + skb->protocol = htons(ETH_P_8021Q); + else if (skb->nf_bridge->mask & BRNF_PPPoE) + skb->protocol = htons(ETH_P_PPP_SES); +} + +/* When handing a packet over to the IP layer + * check whether we have a skb that is in the + * expected format + */ + +static int br_parse_ip_options(struct sk_buff *skb) +{ + struct ip_options *opt; + const struct iphdr *iph; + struct net_device *dev = skb->dev; + u32 len; + + iph = ip_hdr(skb); + opt = &(IPCB(skb)->opt); + + /* Basic sanity checks */ + if (iph->ihl < 5 || iph->version != 4) + goto inhdr_error; + + if (!pskb_may_pull(skb, iph->ihl*4)) + goto inhdr_error; + + iph = ip_hdr(skb); + if 
(unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) + goto inhdr_error; + + len = ntohs(iph->tot_len); + if (skb->len < len) { + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS); + goto drop; + } else if (len < (iph->ihl*4)) + goto inhdr_error; + + if (pskb_trim_rcsum(skb, len)) { + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); + goto drop; + } + + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + if (iph->ihl == 5) + return 0; + + opt->optlen = iph->ihl*4 - sizeof(struct iphdr); + if (ip_options_compile(dev_net(dev), opt, skb)) + goto inhdr_error; + + /* Check correct handling of SRR option */ + if (unlikely(opt->srr)) { + struct in_device *in_dev = __in_dev_get_rcu(dev); + if (in_dev && !IN_DEV_SOURCE_ROUTE(in_dev)) + goto drop; + + if (ip_options_rcv_srr(skb)) + goto drop; + } + + return 0; + +inhdr_error: + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); +drop: + return -1; +} + +/* Fill in the header for fragmented IP packets handled by + * the IPv4 connection tracking code. + */ +int nf_bridge_copy_header(struct sk_buff *skb) +{ + int err; + unsigned int header_size; + + nf_bridge_update_protocol(skb); + header_size = ETH_HLEN + nf_bridge_encap_header_len(skb); + err = skb_cow_head(skb, header_size); + if (err) + return err; + + skb_copy_to_linear_data_offset(skb, -header_size, + skb->nf_bridge->data, header_size); + __skb_push(skb, nf_bridge_encap_header_len(skb)); + return 0; +} + +/* PF_BRIDGE/PRE_ROUTING *********************************************/ +/* Undo the changes made for ip6tables PREROUTING and continue the + * bridge PRE_ROUTING hook. */ +static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge = skb->nf_bridge; + struct rtable *rt; + + if (nf_bridge->mask & BRNF_PKT_TYPE) { + skb->pkt_type = PACKET_OTHERHOST; + nf_bridge->mask ^= BRNF_PKT_TYPE; + } + nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING; + + rt = bridge_parent_rtable(nf_bridge->physindev); + if (!rt) { + kfree_skb(skb); + return 0; + } + skb_dst_set_noref(skb, &rt->dst); + + skb->dev = nf_bridge->physindev; + nf_bridge_update_protocol(skb); + nf_bridge_push_encap_header(skb); + NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL, + br_handle_frame_finish, 1); + + return 0; +} + +/* Obtain the correct destination MAC address, while preserving the original + * source MAC address. If we already know this address, we just copy it. If we + * don't, we use the neighbour framework to find out. In both cases, we make + * sure that br_handle_frame_finish() is called afterwards. + */ +static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge = skb->nf_bridge; + struct neighbour *neigh; + struct dst_entry *dst; + + skb->dev = bridge_parent(skb->dev); + if (!skb->dev) + goto free_skb; + dst = skb_dst(skb); + neigh = dst_get_neighbour(dst); + if (dst->hh) { + neigh_hh_bridge(dst->hh, skb); + skb->dev = nf_bridge->physindev; + return br_handle_frame_finish(skb); + } else if (neigh) { + /* the neighbour function below overwrites the complete + * MAC header, so we save the Ethernet source address and + * protocol number. */ + skb_copy_from_linear_data_offset(skb, -(ETH_HLEN-ETH_ALEN), skb->nf_bridge->data, ETH_HLEN-ETH_ALEN); + /* tell br_dev_xmit to continue with forwarding */ + nf_bridge->mask |= BRNF_BRIDGED_DNAT; + return neigh->output(skb); + } +free_skb: + kfree_skb(skb); + return 0; +} + +/* This requires some explaining. 
If DNAT has taken place, + * we will need to fix up the destination Ethernet address. + * + * There are two cases to consider: + * 1. The packet was DNAT'ed to a device in the same bridge + * port group as it was received on. We can still bridge + * the packet. + * 2. The packet was DNAT'ed to a different device, either + * a non-bridged device or another bridge port group. + * The packet will need to be routed. + * + * The correct way of distinguishing between these two cases is to + * call ip_route_input() and to look at skb->dst->dev, which is + * changed to the destination device if ip_route_input() succeeds. + * + * Let's first consider the case that ip_route_input() succeeds: + * + * If the output device equals the logical bridge device the packet + * came in on, we can consider this bridging. The corresponding MAC + * address will be obtained in br_nf_pre_routing_finish_bridge. + * Otherwise, the packet is considered to be routed and we just + * change the destination MAC address so that the packet will + * later be passed up to the IP stack to be routed. For a redirected + * packet, ip_route_input() will give back the localhost as output device, + * which differs from the bridge device. + * + * Let's now consider the case that ip_route_input() fails: + * + * This can be because the destination address is martian, in which case + * the packet will be dropped. + * If IP forwarding is disabled, ip_route_input() will fail, while + * ip_route_output_key() can return success. The source + * address for ip_route_output_key() is set to zero, so ip_route_output_key() + * thinks we're handling a locally generated packet and won't care + * if IP forwarding is enabled. If the output device equals the logical bridge + * device, we proceed as if ip_route_input() succeeded. If it differs from the + * logical bridge port or if ip_route_output_key() fails we drop the packet. + */ +static int br_nf_pre_routing_finish(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct iphdr *iph = ip_hdr(skb); + struct nf_bridge_info *nf_bridge = skb->nf_bridge; + struct rtable *rt; + int err; + + if (nf_bridge->mask & BRNF_PKT_TYPE) { + skb->pkt_type = PACKET_OTHERHOST; + nf_bridge->mask ^= BRNF_PKT_TYPE; + } + nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING; + if (dnat_took_place(skb)) { + if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { + struct in_device *in_dev = __in_dev_get_rcu(dev); + + /* If err equals -EHOSTUNREACH the error is due to a + * martian destination or due to the fact that + * forwarding is disabled. For most martian packets, + * ip_route_output_key() will fail. It won't fail for 2 types of + * martian destinations: loopback destinations and destination + * 0.0.0.0. In both cases the packet will be dropped because the + * destination is the loopback device and not the bridge. */ + if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev)) + goto free_skb; + + rt = ip_route_output(dev_net(dev), iph->daddr, 0, + RT_TOS(iph->tos), 0); + if (!IS_ERR(rt)) { + /* - Bridged-and-DNAT'ed traffic doesn't + * require ip_forwarding. 
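+ * - If the route returned by ip_route_output() leaves via the + * bridge device itself, keep that dst and continue bridging; + * otherwise the route is released and the packet is dropped + * below.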
*/ + if (rt->dst.dev == dev) { + skb_dst_set(skb, &rt->dst); + goto bridged_dnat; + } + ip_rt_put(rt); + } +free_skb: + kfree_skb(skb); + return 0; + } else { + if (skb_dst(skb)->dev == dev) { +bridged_dnat: + skb->dev = nf_bridge->physindev; + nf_bridge_update_protocol(skb); + nf_bridge_push_encap_header(skb); + NF_HOOK_THRESH(NFPROTO_BRIDGE, + NF_BR_PRE_ROUTING, + skb, skb->dev, NULL, + br_nf_pre_routing_finish_bridge, + 1); + return 0; + } + memcpy(eth_hdr(skb)->h_dest, dev->dev_addr, ETH_ALEN); + skb->pkt_type = PACKET_HOST; + } + } else { + rt = bridge_parent_rtable(nf_bridge->physindev); + if (!rt) { + kfree_skb(skb); + return 0; + } + skb_dst_set_noref(skb, &rt->dst); + } + + skb->dev = nf_bridge->physindev; + nf_bridge_update_protocol(skb); + nf_bridge_push_encap_header(skb); + NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL, + br_handle_frame_finish, 1); + + return 0; +} + +/* Some common code for IPv4/IPv6 */ +static struct net_device *setup_pre_routing(struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge = skb->nf_bridge; + + if (skb->pkt_type == PACKET_OTHERHOST) { + skb->pkt_type = PACKET_HOST; + nf_bridge->mask |= BRNF_PKT_TYPE; + } + + nf_bridge->mask |= BRNF_NF_BRIDGE_PREROUTING; + nf_bridge->physindev = skb->dev; + skb->dev = bridge_parent(skb->dev); + if (skb->protocol == htons(ETH_P_8021Q)) + nf_bridge->mask |= BRNF_8021Q; + else if (skb->protocol == htons(ETH_P_PPP_SES)) + nf_bridge->mask |= BRNF_PPPoE; + + return skb->dev; +} + +/* We only check the length. A bridge shouldn't do any hop-by-hop stuff anyway */ +static int check_hbh_len(struct sk_buff *skb) +{ + unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1); + u32 pkt_len; + const unsigned char *nh = skb_network_header(skb); + int off = raw - nh; + int len = (raw[1] + 1) << 3; + + if ((raw + len) - skb->data > skb_headlen(skb)) + goto bad; + + off += 2; + len -= 2; + + while (len > 0) { + int optlen = nh[off + 1] + 2; + + switch (nh[off]) { + case IPV6_TLV_PAD0: + optlen = 1; + break; + + case IPV6_TLV_PADN: + break; + + case IPV6_TLV_JUMBO: + if (nh[off + 1] != 4 || (off & 3) != 2) + goto bad; + pkt_len = ntohl(*(__be32 *) (nh + off + 2)); + if (pkt_len <= IPV6_MAXPLEN || + ipv6_hdr(skb)->payload_len) + goto bad; + if (pkt_len > skb->len - sizeof(struct ipv6hdr)) + goto bad; + if (pskb_trim_rcsum(skb, + pkt_len + sizeof(struct ipv6hdr))) + goto bad; + nh = skb_network_header(skb); + break; + default: + if (optlen > len) + goto bad; + break; + } + off += optlen; + len -= optlen; + } + if (len == 0) + return 0; +bad: + return -1; + +} + +/* Replicate the checks that IPv6 does on packet reception and pass the packet + * to ip6tables, which doesn't support NAT, so things are fairly simple. 
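+ * The checks below only cover what the bridge needs: the version field + * must be 6, the payload length must fit inside the skb (which is then + * trimmed to it), and a leading hop-by-hop options header is sanity + * checked by check_hbh_len() before the packet is handed to the + * NF_INET_PRE_ROUTING hook of ip6tables.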
*/ +static unsigned int br_nf_pre_routing_ipv6(unsigned int hook, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + const struct ipv6hdr *hdr; + u32 pkt_len; + + if (skb->len < sizeof(struct ipv6hdr)) + return NF_DROP; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + return NF_DROP; + + hdr = ipv6_hdr(skb); + + if (hdr->version != 6) + return NF_DROP; + + pkt_len = ntohs(hdr->payload_len); + + if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { + if (pkt_len + sizeof(struct ipv6hdr) > skb->len) + return NF_DROP; + if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) + return NF_DROP; + } + if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb)) + return NF_DROP; + + nf_bridge_put(skb->nf_bridge); + if (!nf_bridge_alloc(skb)) + return NF_DROP; + if (!setup_pre_routing(skb)) + return NF_DROP; + + skb->protocol = htons(ETH_P_IPV6); + NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, + br_nf_pre_routing_finish_ipv6); + + return NF_STOLEN; +} + +/* Direct IPv6 traffic to br_nf_pre_routing_ipv6. + * Replicate the checks that IPv4 does on packet reception. + * Set skb->dev to the bridge device (i.e. parent of the + * receiving device) to make netfilter happy, the REDIRECT + * target in particular. Save the original destination IP + * address to be able to detect DNAT afterwards. */ +static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct net_bridge_port *p; + struct net_bridge *br; + __u32 len = nf_bridge_encap_header_len(skb); + + if (unlikely(!pskb_may_pull(skb, len))) + return NF_DROP; + + p = br_port_get_rcu(in); + if (p == NULL) + return NF_DROP; + br = p->br; + + if (skb->protocol == htons(ETH_P_IPV6) || IS_VLAN_IPV6(skb) || + IS_PPPOE_IPV6(skb)) { + if (!brnf_call_ip6tables && !br->nf_call_ip6tables) + return NF_ACCEPT; + + nf_bridge_pull_encap_header_rcsum(skb); + return br_nf_pre_routing_ipv6(hook, skb, in, out, okfn); + } + + if (!brnf_call_iptables && !br->nf_call_iptables) + return NF_ACCEPT; + + if (skb->protocol != htons(ETH_P_IP) && !IS_VLAN_IP(skb) && + !IS_PPPOE_IP(skb)) + return NF_ACCEPT; + + nf_bridge_pull_encap_header_rcsum(skb); + + if (br_parse_ip_options(skb)) + return NF_DROP; + + nf_bridge_put(skb->nf_bridge); + if (!nf_bridge_alloc(skb)) + return NF_DROP; + if (!setup_pre_routing(skb)) + return NF_DROP; + store_orig_dstaddr(skb); + skb->protocol = htons(ETH_P_IP); + + NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, + br_nf_pre_routing_finish); + + return NF_STOLEN; +} + + +/* PF_BRIDGE/LOCAL_IN ************************************************/ +/* The packet is locally destined, which requires a real + * dst_entry, so detach the fake one. On the way up, the + * packet would pass through PRE_ROUTING again (which already + * took place when the packet entered the bridge), but we + * register an IPv4 PRE_ROUTING 'sabotage' hook that will + * prevent this from happening. 
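+ * br_nf_local_in() below only drops the fake rtable attached in + * PRE_ROUTING; the IP stack then performs a real route lookup for the + * packet.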
*/ +static unsigned int br_nf_local_in(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct rtable *rt = skb_rtable(skb); + + if (rt && rt == bridge_parent_rtable(in)) + skb_dst_drop(skb); + + return NF_ACCEPT; +} + +/* PF_BRIDGE/FORWARD *************************************************/ +static int br_nf_forward_finish(struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge = skb->nf_bridge; + struct net_device *in; + + if (skb->protocol != htons(ETH_P_ARP) && !IS_VLAN_ARP(skb)) { + in = nf_bridge->physindev; + if (nf_bridge->mask & BRNF_PKT_TYPE) { + skb->pkt_type = PACKET_OTHERHOST; + nf_bridge->mask ^= BRNF_PKT_TYPE; + } + nf_bridge_update_protocol(skb); + } else { + in = *((struct net_device **)(skb->cb)); + } + nf_bridge_push_encap_header(skb); + + NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_FORWARD, skb, in, + skb->dev, br_forward_finish, 1); + return 0; +} + +/* This is the 'purely bridged' case. For IP, we pass the packet to + * netfilter with indev and outdev set to the bridge device, + * but we are still able to filter on the 'real' indev/outdev + * because of the physdev module. For ARP, indev and outdev are the + * bridge ports. */ +static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nf_bridge_info *nf_bridge; + struct net_device *parent; + u_int8_t pf; + + if (!skb->nf_bridge) + return NF_ACCEPT; + + /* Need exclusive nf_bridge_info since we might have multiple + * different physoutdevs. */ + if (!nf_bridge_unshare(skb)) + return NF_DROP; + + parent = bridge_parent(out); + if (!parent) + return NF_DROP; + + if (skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb) || + IS_PPPOE_IP(skb)) + pf = PF_INET; + else if (skb->protocol == htons(ETH_P_IPV6) || IS_VLAN_IPV6(skb) || + IS_PPPOE_IPV6(skb)) + pf = PF_INET6; + else + return NF_ACCEPT; + + nf_bridge_pull_encap_header(skb); + + nf_bridge = skb->nf_bridge; + if (skb->pkt_type == PACKET_OTHERHOST) { + skb->pkt_type = PACKET_HOST; + nf_bridge->mask |= BRNF_PKT_TYPE; + } + + if (pf == PF_INET && br_parse_ip_options(skb)) + return NF_DROP; + + /* The physdev module checks on this */ + nf_bridge->mask |= BRNF_BRIDGED; + nf_bridge->physoutdev = skb->dev; + if (pf == PF_INET) + skb->protocol = htons(ETH_P_IP); + else + skb->protocol = htons(ETH_P_IPV6); + + NF_HOOK(pf, NF_INET_FORWARD, skb, bridge_parent(in), parent, + br_nf_forward_finish); + + return NF_STOLEN; +} + +static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct net_bridge_port *p; + struct net_bridge *br; + struct net_device **d = (struct net_device **)(skb->cb); + + p = br_port_get_rcu(out); + if (p == NULL) + return NF_ACCEPT; + br = p->br; + + if (!brnf_call_arptables && !br->nf_call_arptables) + return NF_ACCEPT; + + if (skb->protocol != htons(ETH_P_ARP)) { + if (!IS_VLAN_ARP(skb)) + return NF_ACCEPT; + nf_bridge_pull_encap_header(skb); + } + + if (arp_hdr(skb)->ar_pln != 4) { + if (IS_VLAN_ARP(skb)) + nf_bridge_push_encap_header(skb); + return NF_ACCEPT; + } + *d = (struct net_device *)in; + NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, skb, (struct net_device *)in, + (struct net_device *)out, br_nf_forward_finish); + + return NF_STOLEN; +} + +#if defined(CONFIG_NF_CONNTRACK_IPV4) || defined(CONFIG_NF_CONNTRACK_IPV4_MODULE) 
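+/* Illustrative note (not in the original patch): when IPv4 conntrack is
+ * built in, defragmentation in PRE_ROUTING can reassemble a bridged packet,
+ * so by POST_ROUTING the skb may exceed the outgoing port's MTU
+ * (nf_bridge_mtu_reduction() additionally accounts for an encapsulation
+ * header such as PPPoE).  br_nf_dev_queue_xmit() below therefore
+ * re-fragments oversized, non-GSO IPv4 packets; the #else variant simply
+ * transmits, since without conntrack the bridge never grows a frame.
+ */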
+static int br_nf_dev_queue_xmit(struct sk_buff *skb) +{ + int ret; + + if (skb->nfct != NULL && skb->protocol == htons(ETH_P_IP) && + skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu && + !skb_is_gso(skb)) { + if (br_parse_ip_options(skb)) + /* Drop invalid packet */ + return NF_DROP; + ret = ip_fragment(skb, br_dev_queue_push_xmit); + } else + ret = br_dev_queue_push_xmit(skb); + + return ret; +} +#else +static int br_nf_dev_queue_xmit(struct sk_buff *skb) +{ + return br_dev_queue_push_xmit(skb); +} +#endif + +/* PF_BRIDGE/POST_ROUTING ********************************************/ +static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nf_bridge_info *nf_bridge = skb->nf_bridge; + struct net_device *realoutdev = bridge_parent(skb->dev); + u_int8_t pf; + + if (!nf_bridge || !(nf_bridge->mask & BRNF_BRIDGED)) + return NF_ACCEPT; + + if (!realoutdev) + return NF_DROP; + + if (skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb) || + IS_PPPOE_IP(skb)) + pf = PF_INET; + else if (skb->protocol == htons(ETH_P_IPV6) || IS_VLAN_IPV6(skb) || + IS_PPPOE_IPV6(skb)) + pf = PF_INET6; + else + return NF_ACCEPT; + + /* We assume any code from br_dev_queue_push_xmit onwards doesn't care + * about the value of skb->pkt_type. */ + if (skb->pkt_type == PACKET_OTHERHOST) { + skb->pkt_type = PACKET_HOST; + nf_bridge->mask |= BRNF_PKT_TYPE; + } + + nf_bridge_pull_encap_header(skb); + nf_bridge_save_header(skb); + if (pf == PF_INET) + skb->protocol = htons(ETH_P_IP); + else + skb->protocol = htons(ETH_P_IPV6); + + NF_HOOK(pf, NF_INET_POST_ROUTING, skb, NULL, realoutdev, + br_nf_dev_queue_xmit); + + return NF_STOLEN; +} + +/* IP/SABOTAGE *****************************************************/ +/* Don't hand locally destined packets to PF_INET(6)/PRE_ROUTING + * for the second time. 
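+ * Illustrative note (not in the original patch): such packets already
+ * traversed the IPv4/IPv6 PRE_ROUTING hooks while bridge netfilter
+ * processed them.  ip_sabotage_in() below is registered first (at
+ * NF_IP_PRI_FIRST / NF_IP6_PRI_FIRST, see br_nf_ops) and returns NF_STOP
+ * for skbs whose nf_bridge state shows that bridge PRE_ROUTING handling
+ * has already completed, so the rules are not evaluated a second time.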
*/ +static unsigned int ip_sabotage_in(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + if (skb->nf_bridge && + !(skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)) { + return NF_STOP; + } + + return NF_ACCEPT; +} + +/* For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because + * br_dev_queue_push_xmit is called afterwards */ +static struct nf_hook_ops br_nf_ops[] __read_mostly = { + { + .hook = br_nf_pre_routing, + .owner = THIS_MODULE, + .pf = PF_BRIDGE, + .hooknum = NF_BR_PRE_ROUTING, + .priority = NF_BR_PRI_BRNF, + }, + { + .hook = br_nf_local_in, + .owner = THIS_MODULE, + .pf = PF_BRIDGE, + .hooknum = NF_BR_LOCAL_IN, + .priority = NF_BR_PRI_BRNF, + }, + { + .hook = br_nf_forward_ip, + .owner = THIS_MODULE, + .pf = PF_BRIDGE, + .hooknum = NF_BR_FORWARD, + .priority = NF_BR_PRI_BRNF - 1, + }, + { + .hook = br_nf_forward_arp, + .owner = THIS_MODULE, + .pf = PF_BRIDGE, + .hooknum = NF_BR_FORWARD, + .priority = NF_BR_PRI_BRNF, + }, + { + .hook = br_nf_post_routing, + .owner = THIS_MODULE, + .pf = PF_BRIDGE, + .hooknum = NF_BR_POST_ROUTING, + .priority = NF_BR_PRI_LAST, + }, + { + .hook = ip_sabotage_in, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP_PRI_FIRST, + }, + { + .hook = ip_sabotage_in, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP6_PRI_FIRST, + }, +}; + +#ifdef CONFIG_SYSCTL +static +int brnf_sysctl_call_tables(ctl_table * ctl, int write, + void __user * buffer, size_t * lenp, loff_t * ppos) +{ + int ret; + + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + + if (write && *(int *)(ctl->data)) + *(int *)(ctl->data) = 1; + return ret; +} + +static ctl_table brnf_table[] = { + { + .procname = "bridge-nf-call-arptables", + .data = &brnf_call_arptables, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = brnf_sysctl_call_tables, + }, + { + .procname = "bridge-nf-call-iptables", + .data = &brnf_call_iptables, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = brnf_sysctl_call_tables, + }, + { + .procname = "bridge-nf-call-ip6tables", + .data = &brnf_call_ip6tables, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = brnf_sysctl_call_tables, + }, + { + .procname = "bridge-nf-filter-vlan-tagged", + .data = &brnf_filter_vlan_tagged, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = brnf_sysctl_call_tables, + }, + { + .procname = "bridge-nf-filter-pppoe-tagged", + .data = &brnf_filter_pppoe_tagged, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = brnf_sysctl_call_tables, + }, + { } +}; + +static struct ctl_path brnf_path[] = { + { .procname = "net", }, + { .procname = "bridge", }, + { } +}; +#endif + +int __init br_netfilter_init(void) +{ + int ret; + + ret = dst_entries_init(&fake_dst_ops); + if (ret < 0) + return ret; + + ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + if (ret < 0) { + dst_entries_destroy(&fake_dst_ops); + return ret; + } +#ifdef CONFIG_SYSCTL + brnf_sysctl_header = register_sysctl_paths(brnf_path, brnf_table); + if (brnf_sysctl_header == NULL) { + printk(KERN_WARNING + "br_netfilter: can't register to sysctl.\n"); + nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + dst_entries_destroy(&fake_dst_ops); + return -ENOMEM; + } +#endif + printk(KERN_NOTICE "Bridge firewalling registered\n"); + return 0; +} + +void br_netfilter_fini(void) +{ + nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); +#ifdef 
CONFIG_SYSCTL + unregister_sysctl_table(brnf_sysctl_header); +#endif + dst_entries_destroy(&fake_dst_ops); +} diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c new file mode 100644 index 00000000..2c160552 --- /dev/null +++ b/net/bridge/br_netlink.c @@ -0,0 +1,252 @@ +/* + * Bridge netlink control interface + * + * Authors: + * Stephen Hemminger + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +#include "br_private.h" + +static inline size_t br_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + + nla_total_size(4) /* IFLA_MASTER */ + + nla_total_size(4) /* IFLA_MTU */ + + nla_total_size(4) /* IFLA_LINK */ + + nla_total_size(1) /* IFLA_OPERSTATE */ + + nla_total_size(1); /* IFLA_PROTINFO */ +} + +/* + * Create one netlink message for one interface + * Contains port and master info as well as carrier and bridge state. + */ +static int br_fill_ifinfo(struct sk_buff *skb, const struct net_bridge_port *port, + u32 pid, u32 seq, int event, unsigned int flags) +{ + const struct net_bridge *br = port->br; + const struct net_device *dev = port->dev; + struct ifinfomsg *hdr; + struct nlmsghdr *nlh; + u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; + + br_debug(br, "br_fill_info event %d port %s master %s\n", + event, dev->name, br->dev->name); + + nlh = nlmsg_put(skb, pid, seq, event, sizeof(*hdr), flags); + if (nlh == NULL) + return -EMSGSIZE; + + hdr = nlmsg_data(nlh); + hdr->ifi_family = AF_BRIDGE; + hdr->__ifi_pad = 0; + hdr->ifi_type = dev->type; + hdr->ifi_index = dev->ifindex; + hdr->ifi_flags = dev_get_flags(dev); + hdr->ifi_change = 0; + + NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name); + NLA_PUT_U32(skb, IFLA_MASTER, br->dev->ifindex); + NLA_PUT_U32(skb, IFLA_MTU, dev->mtu); + NLA_PUT_U8(skb, IFLA_OPERSTATE, operstate); + + if (dev->addr_len) + NLA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); + + if (dev->ifindex != dev->iflink) + NLA_PUT_U32(skb, IFLA_LINK, dev->iflink); + + if (event == RTM_NEWLINK) + NLA_PUT_U8(skb, IFLA_PROTINFO, port->state); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +/* + * Notify listeners of a change in port information + */ +void br_ifinfo_notify(int event, struct net_bridge_port *port) +{ + struct net *net = dev_net(port->dev); + struct sk_buff *skb; + int err = -ENOBUFS; + + br_debug(port->br, "port %u(%s) event %d\n", + (unsigned)port->port_no, port->dev->name, event); + + skb = nlmsg_new(br_nlmsg_size(), GFP_ATOMIC); + if (skb == NULL) + goto errout; + + err = br_fill_ifinfo(skb, port, 0, 0, event, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in br_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_LINK, err); +} + +/* + * Dump information about all ports, in response to GETLINK + */ +static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct net_device *dev; + int idx; + + idx = 0; + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + struct 
net_bridge_port *port = br_port_get_rcu(dev); + + /* not a bridge port */ + if (!port || idx < cb->args[0]) + goto skip; + + if (br_fill_ifinfo(skb, port, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWLINK, + NLM_F_MULTI) < 0) + break; +skip: + ++idx; + } + rcu_read_unlock(); + cb->args[0] = idx; + + return skb->len; +} + +/* + * Change state of port (ie from forwarding to blocking etc) + * Used by spanning tree in user space. + */ +static int br_rtm_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct ifinfomsg *ifm; + struct nlattr *protinfo; + struct net_device *dev; + struct net_bridge_port *p; + u8 new_state; + + if (nlmsg_len(nlh) < sizeof(*ifm)) + return -EINVAL; + + ifm = nlmsg_data(nlh); + if (ifm->ifi_family != AF_BRIDGE) + return -EPFNOSUPPORT; + + protinfo = nlmsg_find_attr(nlh, sizeof(*ifm), IFLA_PROTINFO); + if (!protinfo || nla_len(protinfo) < sizeof(u8)) + return -EINVAL; + + new_state = nla_get_u8(protinfo); + if (new_state > BR_STATE_BLOCKING) + return -EINVAL; + + dev = __dev_get_by_index(net, ifm->ifi_index); + if (!dev) + return -ENODEV; + + p = br_port_get_rtnl(dev); + if (!p) + return -EINVAL; + + /* if kernel STP is running, don't allow changes */ + if (p->br->stp_enabled == BR_KERNEL_STP) + return -EBUSY; + + if (!netif_running(dev) || + (!netif_carrier_ok(dev) && new_state != BR_STATE_DISABLED)) + return -ENETDOWN; + + p->state = new_state; + br_log_state(p); + return 0; +} + +static int br_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + if (tb[IFLA_ADDRESS]) { + if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) + return -EINVAL; + if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) + return -EADDRNOTAVAIL; + } + + return 0; +} + +static struct rtnl_link_ops br_link_ops __read_mostly = { + .kind = "bridge", + .priv_size = sizeof(struct net_bridge), + .setup = br_dev_setup, + .validate = br_validate, + .dellink = br_dev_delete, +}; + +int __init br_netlink_init(void) +{ + int err; + + err = rtnl_link_register(&br_link_ops); + if (err < 0) + goto err1; + + err = __rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, br_dump_ifinfo); + if (err) + goto err2; + err = __rtnl_register(PF_BRIDGE, RTM_SETLINK, br_rtm_setlink, NULL); + if (err) + goto err3; + err = __rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, br_fdb_add, NULL); + if (err) + goto err3; + err = __rtnl_register(PF_BRIDGE, RTM_DELNEIGH, br_fdb_delete, NULL); + if (err) + goto err3; + err = __rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, br_fdb_dump); + if (err) + goto err3; + + return 0; + +err3: + rtnl_unregister_all(PF_BRIDGE); +err2: + rtnl_link_unregister(&br_link_ops); +err1: + return err; +} + +void __exit br_netlink_fini(void) +{ + rtnl_link_unregister(&br_link_ops); + rtnl_unregister_all(PF_BRIDGE); +} diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c new file mode 100644 index 00000000..6545ee95 --- /dev/null +++ b/net/bridge/br_notify.c @@ -0,0 +1,108 @@ +/* + * Device event handling + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include + +#include "br_private.h" + +static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr); + +struct notifier_block br_device_notifier = { + .notifier_call = br_device_event +}; + +/* + * Handle changes in state of network devices enslaved to a bridge. + * + * Note: don't care about up/down if bridge itself is down, because + * port state is checked when bridge is brought up. + */ +static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct net_bridge_port *p; + struct net_bridge *br; + int err; + + /* register of bridge completed, add sysfs entries */ + if ((dev->priv_flags & IFF_EBRIDGE) && event == NETDEV_REGISTER) { + br_sysfs_addbr(dev); + return NOTIFY_DONE; + } + + /* not a port of a bridge */ + p = br_port_get_rtnl(dev); + if (!p) + return NOTIFY_DONE; + + br = p->br; + + switch (event) { + case NETDEV_CHANGEMTU: + dev_set_mtu(br->dev, br_min_mtu(br)); + break; + + case NETDEV_CHANGEADDR: + spin_lock_bh(&br->lock); + br_fdb_changeaddr(p, dev->dev_addr); + br_stp_recalculate_bridge_id(br); + spin_unlock_bh(&br->lock); + break; + + case NETDEV_CHANGE: + br_port_carrier_check(p); + break; + + case NETDEV_FEAT_CHANGE: + netdev_update_features(br->dev); + break; + + case NETDEV_DOWN: + spin_lock_bh(&br->lock); + if (br->dev->flags & IFF_UP) + br_stp_disable_port(p); + spin_unlock_bh(&br->lock); + break; + + case NETDEV_UP: + if (netif_carrier_ok(dev) && (br->dev->flags & IFF_UP)) { + spin_lock_bh(&br->lock); + br_stp_enable_port(p); + spin_unlock_bh(&br->lock); + } + break; + + case NETDEV_UNREGISTER: + br_del_if(br, dev); + break; + + case NETDEV_CHANGENAME: + err = br_sysfs_renameif(p); + if (err) + return notifier_from_errno(err); + break; + + case NETDEV_PRE_TYPE_CHANGE: + /* Forbid underlaying device to change its type. */ + return NOTIFY_BAD; + } + + /* Events that may cause spanning tree to refresh */ + if (event == NETDEV_CHANGEADDR || event == NETDEV_UP || + event == NETDEV_CHANGE || event == NETDEV_DOWN) + br_ifinfo_notify(RTM_NEWLINK, p); + + return NOTIFY_DONE; +} diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h new file mode 100644 index 00000000..1ca1b1c7 --- /dev/null +++ b/net/bridge/br_private.h @@ -0,0 +1,554 @@ +/* + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _BR_PRIVATE_H +#define _BR_PRIVATE_H + +#include +#include +#include +#include +#include + +#define BR_HASH_BITS 8 +#define BR_HASH_SIZE (1 << BR_HASH_BITS) + +#define BR_HOLD_TIME (1*HZ) + +#define BR_PORT_BITS 10 +#define BR_MAX_PORTS (1<priv_flags & IFF_BRIDGE_PORT) + +static inline struct net_bridge_port *br_port_get_rcu(const struct net_device *dev) +{ + struct net_bridge_port *port = rcu_dereference(dev->rx_handler_data); + return br_port_exists(dev) ? port : NULL; +} + +static inline struct net_bridge_port *br_port_get_rtnl(struct net_device *dev) +{ + return br_port_exists(dev) ? 
+ rtnl_dereference(dev->rx_handler_data) : NULL; +} + +struct br_cpu_netstats { + u64 rx_packets; + u64 rx_bytes; + u64 tx_packets; + u64 tx_bytes; + struct u64_stats_sync syncp; +}; + +struct net_bridge +{ + spinlock_t lock; + struct list_head port_list; + struct net_device *dev; + + struct br_cpu_netstats __percpu *stats; + spinlock_t hash_lock; + struct hlist_head hash[BR_HASH_SIZE]; +#ifdef CONFIG_BRIDGE_NETFILTER + struct rtable fake_rtable; + bool nf_call_iptables; + bool nf_call_ip6tables; + bool nf_call_arptables; +#endif + unsigned long flags; +#define BR_SET_MAC_ADDR 0x00000001 + + /* STP */ + bridge_id designated_root; + bridge_id bridge_id; + u32 root_path_cost; + unsigned long max_age; + unsigned long hello_time; + unsigned long forward_delay; + unsigned long bridge_max_age; + unsigned long ageing_time; + unsigned long bridge_hello_time; + unsigned long bridge_forward_delay; + + u8 group_addr[ETH_ALEN]; + u16 root_port; + + enum { + BR_NO_STP, /* no spanning tree */ + BR_KERNEL_STP, /* old STP in kernel */ + BR_USER_STP, /* new RSTP in userspace */ + } stp_enabled; + + unsigned char topology_change; + unsigned char topology_change_detected; + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + unsigned char multicast_router; + + u8 multicast_disabled:1; + + u32 hash_elasticity; + u32 hash_max; + + u32 multicast_last_member_count; + u32 multicast_startup_queries_sent; + u32 multicast_startup_query_count; + + unsigned long multicast_last_member_interval; + unsigned long multicast_membership_interval; + unsigned long multicast_querier_interval; + unsigned long multicast_query_interval; + unsigned long multicast_query_response_interval; + unsigned long multicast_startup_query_interval; + + spinlock_t multicast_lock; + struct net_bridge_mdb_htable __rcu *mdb; + struct hlist_head router_list; + + struct timer_list multicast_router_timer; + struct timer_list multicast_querier_timer; + struct timer_list multicast_query_timer; +#endif + + struct timer_list hello_timer; + struct timer_list tcn_timer; + struct timer_list topology_change_timer; + struct timer_list gc_timer; + struct kobject *ifobj; +}; + +struct br_input_skb_cb { + struct net_device *brdev; +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + int igmp; + int mrouters_only; +#endif +}; + +#define BR_INPUT_SKB_CB(__skb) ((struct br_input_skb_cb *)(__skb)->cb) + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING +# define BR_INPUT_SKB_CB_MROUTERS_ONLY(__skb) (BR_INPUT_SKB_CB(__skb)->mrouters_only) +#else +# define BR_INPUT_SKB_CB_MROUTERS_ONLY(__skb) (0) +#endif + +#define br_printk(level, br, format, args...) \ + printk(level "%s: " format, (br)->dev->name, ##args) + +#define br_err(__br, format, args...) \ + br_printk(KERN_ERR, __br, format, ##args) +#define br_warn(__br, format, args...) \ + br_printk(KERN_WARNING, __br, format, ##args) +#define br_notice(__br, format, args...) \ + br_printk(KERN_NOTICE, __br, format, ##args) +#define br_info(__br, format, args...) \ + br_printk(KERN_INFO, __br, format, ##args) + +#define br_debug(br, format, args...) 
\ + pr_debug("%s: " format, (br)->dev->name, ##args) + +extern struct notifier_block br_device_notifier; +extern const u8 br_group_address[ETH_ALEN]; + +/* called under bridge lock */ +static inline int br_is_root_bridge(const struct net_bridge *br) +{ + return !memcmp(&br->bridge_id, &br->designated_root, 8); +} + +/* br_device.c */ +extern void br_dev_setup(struct net_device *dev); +extern void br_dev_delete(struct net_device *dev, struct list_head *list); +extern netdev_tx_t br_dev_xmit(struct sk_buff *skb, + struct net_device *dev); +#ifdef CONFIG_NET_POLL_CONTROLLER +static inline struct netpoll_info *br_netpoll_info(struct net_bridge *br) +{ + return br->dev->npinfo; +} + +static inline void br_netpoll_send_skb(const struct net_bridge_port *p, + struct sk_buff *skb) +{ + struct netpoll *np = p->np; + + if (np) + netpoll_send_skb(np, skb); +} + +extern int br_netpoll_enable(struct net_bridge_port *p); +extern void br_netpoll_disable(struct net_bridge_port *p); +#else +static inline struct netpoll_info *br_netpoll_info(struct net_bridge *br) +{ + return NULL; +} + +static inline void br_netpoll_send_skb(const struct net_bridge_port *p, + struct sk_buff *skb) +{ +} + +static inline int br_netpoll_enable(struct net_bridge_port *p) +{ + return 0; +} + +static inline void br_netpoll_disable(struct net_bridge_port *p) +{ +} +#endif + +/* br_fdb.c */ +extern int br_fdb_init(void); +extern void br_fdb_fini(void); +extern void br_fdb_flush(struct net_bridge *br); +extern void br_fdb_changeaddr(struct net_bridge_port *p, + const unsigned char *newaddr); +extern void br_fdb_cleanup(unsigned long arg); +extern void br_fdb_delete_by_port(struct net_bridge *br, + const struct net_bridge_port *p, int do_all); +extern struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br, + const unsigned char *addr); +extern int br_fdb_test_addr(struct net_device *dev, unsigned char *addr); +extern int br_fdb_fillbuf(struct net_bridge *br, void *buf, + unsigned long count, unsigned long off); +extern int br_fdb_insert(struct net_bridge *br, + struct net_bridge_port *source, + const unsigned char *addr); +extern void br_fdb_update(struct net_bridge *br, + struct net_bridge_port *source, + const unsigned char *addr); +extern int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb); +extern int br_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg); +extern int br_fdb_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg); + +/* br_forward.c */ +extern void br_deliver(const struct net_bridge_port *to, + struct sk_buff *skb); +extern int br_dev_queue_push_xmit(struct sk_buff *skb); +extern void br_forward(const struct net_bridge_port *to, + struct sk_buff *skb, struct sk_buff *skb0); +extern int br_forward_finish(struct sk_buff *skb); +extern void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb); +extern void br_flood_forward(struct net_bridge *br, struct sk_buff *skb, + struct sk_buff *skb2); + +/* br_if.c */ +extern void br_port_carrier_check(struct net_bridge_port *p); +extern int br_add_bridge(struct net *net, const char *name); +extern int br_del_bridge(struct net *net, const char *name); +extern void br_net_exit(struct net *net); +extern int br_add_if(struct net_bridge *br, + struct net_device *dev); +extern int br_del_if(struct net_bridge *br, + struct net_device *dev); +extern int br_min_mtu(const struct net_bridge *br); +extern u32 br_features_recompute(struct net_bridge *br, u32 features); + +/* br_input.c */ +extern int br_handle_frame_finish(struct sk_buff 
*skb); +extern rx_handler_result_t br_handle_frame(struct sk_buff **pskb); + +/* br_ioctl.c */ +extern int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); +extern int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *arg); + +/* br_multicast.c */ +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING +extern int br_multicast_rcv(struct net_bridge *br, + struct net_bridge_port *port, + struct sk_buff *skb); +extern struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, + struct sk_buff *skb); +extern void br_multicast_add_port(struct net_bridge_port *port); +extern void br_multicast_del_port(struct net_bridge_port *port); +extern void br_multicast_enable_port(struct net_bridge_port *port); +extern void br_multicast_disable_port(struct net_bridge_port *port); +extern void br_multicast_init(struct net_bridge *br); +extern void br_multicast_open(struct net_bridge *br); +extern void br_multicast_stop(struct net_bridge *br); +extern void br_multicast_deliver(struct net_bridge_mdb_entry *mdst, + struct sk_buff *skb); +extern void br_multicast_forward(struct net_bridge_mdb_entry *mdst, + struct sk_buff *skb, struct sk_buff *skb2); +extern int br_multicast_set_router(struct net_bridge *br, unsigned long val); +extern int br_multicast_set_port_router(struct net_bridge_port *p, + unsigned long val); +extern int br_multicast_toggle(struct net_bridge *br, unsigned long val); +extern int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val); + +static inline bool br_multicast_is_router(struct net_bridge *br) +{ + return br->multicast_router == 2 || + (br->multicast_router == 1 && + timer_pending(&br->multicast_router_timer)); +} +#else +static inline int br_multicast_rcv(struct net_bridge *br, + struct net_bridge_port *port, + struct sk_buff *skb) +{ + return 0; +} + +static inline struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, + struct sk_buff *skb) +{ + return NULL; +} + +static inline void br_multicast_add_port(struct net_bridge_port *port) +{ +} + +static inline void br_multicast_del_port(struct net_bridge_port *port) +{ +} + +static inline void br_multicast_enable_port(struct net_bridge_port *port) +{ +} + +static inline void br_multicast_disable_port(struct net_bridge_port *port) +{ +} + +static inline void br_multicast_init(struct net_bridge *br) +{ +} + +static inline void br_multicast_open(struct net_bridge *br) +{ +} + +static inline void br_multicast_stop(struct net_bridge *br) +{ +} + +static inline void br_multicast_deliver(struct net_bridge_mdb_entry *mdst, + struct sk_buff *skb) +{ +} + +static inline void br_multicast_forward(struct net_bridge_mdb_entry *mdst, + struct sk_buff *skb, + struct sk_buff *skb2) +{ +} +static inline bool br_multicast_is_router(struct net_bridge *br) +{ + return 0; +} +#endif + +/* br_netfilter.c */ +#ifdef CONFIG_BRIDGE_NETFILTER +extern int br_netfilter_init(void); +extern void br_netfilter_fini(void); +extern void br_netfilter_rtable_init(struct net_bridge *); +#else +#define br_netfilter_init() (0) +#define br_netfilter_fini() do { } while(0) +#define br_netfilter_rtable_init(x) +#endif + +/* br_stp.c */ +extern void br_log_state(const struct net_bridge_port *p); +extern struct net_bridge_port *br_get_port(struct net_bridge *br, + u16 port_no); +extern void br_init_port(struct net_bridge_port *p); +extern void br_become_designated_port(struct net_bridge_port *p); + +extern int br_set_forward_delay(struct net_bridge *br, unsigned long x); +extern int br_set_hello_time(struct net_bridge *br, unsigned 
long x); +extern int br_set_max_age(struct net_bridge *br, unsigned long x); + + +/* br_stp_if.c */ +extern void br_stp_enable_bridge(struct net_bridge *br); +extern void br_stp_disable_bridge(struct net_bridge *br); +extern void br_stp_set_enabled(struct net_bridge *br, unsigned long val); +extern void br_stp_enable_port(struct net_bridge_port *p); +extern void br_stp_disable_port(struct net_bridge_port *p); +extern bool br_stp_recalculate_bridge_id(struct net_bridge *br); +extern void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *a); +extern void br_stp_set_bridge_priority(struct net_bridge *br, + u16 newprio); +extern int br_stp_set_port_priority(struct net_bridge_port *p, + unsigned long newprio); +extern int br_stp_set_path_cost(struct net_bridge_port *p, + unsigned long path_cost); +extern ssize_t br_show_bridge_id(char *buf, const struct bridge_id *id); + +/* br_stp_bpdu.c */ +struct stp_proto; +extern void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb, + struct net_device *dev); + +/* br_stp_timer.c */ +extern void br_stp_timer_init(struct net_bridge *br); +extern void br_stp_port_timer_init(struct net_bridge_port *p); +extern unsigned long br_timer_value(const struct timer_list *timer); + +/* br.c */ +#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE) +extern int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr); +#endif + +/* br_netlink.c */ +extern int br_netlink_init(void); +extern void br_netlink_fini(void); +extern void br_ifinfo_notify(int event, struct net_bridge_port *port); + +#ifdef CONFIG_SYSFS +/* br_sysfs_if.c */ +extern const struct sysfs_ops brport_sysfs_ops; +extern int br_sysfs_addif(struct net_bridge_port *p); +extern int br_sysfs_renameif(struct net_bridge_port *p); + +/* br_sysfs_br.c */ +extern int br_sysfs_addbr(struct net_device *dev); +extern void br_sysfs_delbr(struct net_device *dev); + +#else + +#define br_sysfs_addif(p) (0) +#define br_sysfs_renameif(p) (0) +#define br_sysfs_addbr(dev) (0) +#define br_sysfs_delbr(dev) do { } while(0) +#endif /* CONFIG_SYSFS */ + +#endif diff --git a/net/bridge/br_private_stp.h b/net/bridge/br_private_stp.h new file mode 100644 index 00000000..642ef47a --- /dev/null +++ b/net/bridge/br_private_stp.h @@ -0,0 +1,69 @@ +/* + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef _BR_PRIVATE_STP_H +#define _BR_PRIVATE_STP_H + +#define BPDU_TYPE_CONFIG 0 +#define BPDU_TYPE_TCN 0x80 + +/* IEEE 802.1D-1998 timer values */ +#define BR_MIN_HELLO_TIME (1*HZ) +#define BR_MAX_HELLO_TIME (10*HZ) + +#define BR_MIN_FORWARD_DELAY (2*HZ) +#define BR_MAX_FORWARD_DELAY (30*HZ) + +#define BR_MIN_MAX_AGE (6*HZ) +#define BR_MAX_MAX_AGE (40*HZ) + +#define BR_MIN_PATH_COST 1 +#define BR_MAX_PATH_COST 65535 + +struct br_config_bpdu +{ + unsigned topology_change:1; + unsigned topology_change_ack:1; + bridge_id root; + int root_path_cost; + bridge_id bridge_id; + port_id port_id; + int message_age; + int max_age; + int hello_time; + int forward_delay; +}; + +/* called under bridge lock */ +static inline int br_is_designated_port(const struct net_bridge_port *p) +{ + return !memcmp(&p->designated_bridge, &p->br->bridge_id, 8) && + (p->designated_port == p->port_id); +} + + +/* br_stp.c */ +extern void br_become_root_bridge(struct net_bridge *br); +extern void br_config_bpdu_generation(struct net_bridge *); +extern void br_configuration_update(struct net_bridge *); +extern void br_port_state_selection(struct net_bridge *); +extern void br_received_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu); +extern void br_received_tcn_bpdu(struct net_bridge_port *p); +extern void br_transmit_config(struct net_bridge_port *p); +extern void br_transmit_tcn(struct net_bridge *br); +extern void br_topology_change_detection(struct net_bridge *br); + +/* br_stp_bpdu.c */ +extern void br_send_config_bpdu(struct net_bridge_port *, struct br_config_bpdu *); +extern void br_send_tcn_bpdu(struct net_bridge_port *); + +#endif diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c new file mode 100644 index 00000000..fcff6225 --- /dev/null +++ b/net/bridge/br_stp.c @@ -0,0 +1,534 @@ +/* + * Spanning tree protocol; generic parts + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include + +#include "br_private.h" +#include "br_private_stp.h" + +/* since time values in bpdu are in jiffies and then scaled (1/256) + * before sending, make sure that is at least one. + */ +#define MESSAGE_AGE_INCR ((HZ < 256) ? 
1 : (HZ/256)) + +static const char *const br_port_state_names[] = { + [BR_STATE_DISABLED] = "disabled", + [BR_STATE_LISTENING] = "listening", + [BR_STATE_LEARNING] = "learning", + [BR_STATE_FORWARDING] = "forwarding", + [BR_STATE_BLOCKING] = "blocking", +}; + +void br_log_state(const struct net_bridge_port *p) +{ + br_info(p->br, "port %u(%s) entering %s state\n", + (unsigned) p->port_no, p->dev->name, + br_port_state_names[p->state]); +} + +/* called under bridge lock */ +struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no) +{ + struct net_bridge_port *p; + + list_for_each_entry_rcu(p, &br->port_list, list) { + if (p->port_no == port_no) + return p; + } + + return NULL; +} + +/* called under bridge lock */ +static int br_should_become_root_port(const struct net_bridge_port *p, + u16 root_port) +{ + struct net_bridge *br; + struct net_bridge_port *rp; + int t; + + br = p->br; + if (p->state == BR_STATE_DISABLED || + br_is_designated_port(p)) + return 0; + + if (memcmp(&br->bridge_id, &p->designated_root, 8) <= 0) + return 0; + + if (!root_port) + return 1; + + rp = br_get_port(br, root_port); + + t = memcmp(&p->designated_root, &rp->designated_root, 8); + if (t < 0) + return 1; + else if (t > 0) + return 0; + + if (p->designated_cost + p->path_cost < + rp->designated_cost + rp->path_cost) + return 1; + else if (p->designated_cost + p->path_cost > + rp->designated_cost + rp->path_cost) + return 0; + + t = memcmp(&p->designated_bridge, &rp->designated_bridge, 8); + if (t < 0) + return 1; + else if (t > 0) + return 0; + + if (p->designated_port < rp->designated_port) + return 1; + else if (p->designated_port > rp->designated_port) + return 0; + + if (p->port_id < rp->port_id) + return 1; + + return 0; +} + +/* called under bridge lock */ +static void br_root_selection(struct net_bridge *br) +{ + struct net_bridge_port *p; + u16 root_port = 0; + + list_for_each_entry(p, &br->port_list, list) { + if (br_should_become_root_port(p, root_port)) + root_port = p->port_no; + + } + + br->root_port = root_port; + + if (!root_port) { + br->designated_root = br->bridge_id; + br->root_path_cost = 0; + } else { + p = br_get_port(br, root_port); + br->designated_root = p->designated_root; + br->root_path_cost = p->designated_cost + p->path_cost; + } +} + +/* called under bridge lock */ +void br_become_root_bridge(struct net_bridge *br) +{ + br->max_age = br->bridge_max_age; + br->hello_time = br->bridge_hello_time; + br->forward_delay = br->bridge_forward_delay; + br_topology_change_detection(br); + del_timer(&br->tcn_timer); + + if (br->dev->flags & IFF_UP) { + br_config_bpdu_generation(br); + mod_timer(&br->hello_timer, jiffies + br->hello_time); + } +} + +/* called under bridge lock */ +void br_transmit_config(struct net_bridge_port *p) +{ + struct br_config_bpdu bpdu; + struct net_bridge *br; + + + if (timer_pending(&p->hold_timer)) { + p->config_pending = 1; + return; + } + + br = p->br; + + bpdu.topology_change = br->topology_change; + bpdu.topology_change_ack = p->topology_change_ack; + bpdu.root = br->designated_root; + bpdu.root_path_cost = br->root_path_cost; + bpdu.bridge_id = br->bridge_id; + bpdu.port_id = p->port_id; + if (br_is_root_bridge(br)) + bpdu.message_age = 0; + else { + struct net_bridge_port *root + = br_get_port(br, br->root_port); + bpdu.message_age = (jiffies - root->designated_age) + + MESSAGE_AGE_INCR; + } + bpdu.max_age = br->max_age; + bpdu.hello_time = br->hello_time; + bpdu.forward_delay = br->forward_delay; + + if (bpdu.message_age < br->max_age) { + 
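+		/* Illustrative note (not in the original patch): information
+		 * that has aged to max_age must not be propagated further,
+		 * hence the guard above.  The jiffies values filled in above
+		 * are encoded in 1/256 s units by br_set_ticks(); e.g. a 2 s
+		 * hello time (2*HZ jiffies) goes on the wire as 512 ticks,
+		 * independent of HZ.
+		 */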
br_send_config_bpdu(p, &bpdu); + p->topology_change_ack = 0; + p->config_pending = 0; + mod_timer(&p->hold_timer, + round_jiffies(jiffies + BR_HOLD_TIME)); + } +} + +/* called under bridge lock */ +static inline void br_record_config_information(struct net_bridge_port *p, + const struct br_config_bpdu *bpdu) +{ + p->designated_root = bpdu->root; + p->designated_cost = bpdu->root_path_cost; + p->designated_bridge = bpdu->bridge_id; + p->designated_port = bpdu->port_id; + p->designated_age = jiffies + bpdu->message_age; + + mod_timer(&p->message_age_timer, jiffies + + (p->br->max_age - bpdu->message_age)); +} + +/* called under bridge lock */ +static inline void br_record_config_timeout_values(struct net_bridge *br, + const struct br_config_bpdu *bpdu) +{ + br->max_age = bpdu->max_age; + br->hello_time = bpdu->hello_time; + br->forward_delay = bpdu->forward_delay; + br->topology_change = bpdu->topology_change; +} + +/* called under bridge lock */ +void br_transmit_tcn(struct net_bridge *br) +{ + br_send_tcn_bpdu(br_get_port(br, br->root_port)); +} + +/* called under bridge lock */ +static int br_should_become_designated_port(const struct net_bridge_port *p) +{ + struct net_bridge *br; + int t; + + br = p->br; + if (br_is_designated_port(p)) + return 1; + + if (memcmp(&p->designated_root, &br->designated_root, 8)) + return 1; + + if (br->root_path_cost < p->designated_cost) + return 1; + else if (br->root_path_cost > p->designated_cost) + return 0; + + t = memcmp(&br->bridge_id, &p->designated_bridge, 8); + if (t < 0) + return 1; + else if (t > 0) + return 0; + + if (p->port_id < p->designated_port) + return 1; + + return 0; +} + +/* called under bridge lock */ +static void br_designated_port_selection(struct net_bridge *br) +{ + struct net_bridge_port *p; + + list_for_each_entry(p, &br->port_list, list) { + if (p->state != BR_STATE_DISABLED && + br_should_become_designated_port(p)) + br_become_designated_port(p); + + } +} + +/* called under bridge lock */ +static int br_supersedes_port_info(struct net_bridge_port *p, struct br_config_bpdu *bpdu) +{ + int t; + + t = memcmp(&bpdu->root, &p->designated_root, 8); + if (t < 0) + return 1; + else if (t > 0) + return 0; + + if (bpdu->root_path_cost < p->designated_cost) + return 1; + else if (bpdu->root_path_cost > p->designated_cost) + return 0; + + t = memcmp(&bpdu->bridge_id, &p->designated_bridge, 8); + if (t < 0) + return 1; + else if (t > 0) + return 0; + + if (memcmp(&bpdu->bridge_id, &p->br->bridge_id, 8)) + return 1; + + if (bpdu->port_id <= p->designated_port) + return 1; + + return 0; +} + +/* called under bridge lock */ +static inline void br_topology_change_acknowledged(struct net_bridge *br) +{ + br->topology_change_detected = 0; + del_timer(&br->tcn_timer); +} + +/* called under bridge lock */ +void br_topology_change_detection(struct net_bridge *br) +{ + int isroot = br_is_root_bridge(br); + + if (br->stp_enabled != BR_KERNEL_STP) + return; + + br_info(br, "topology change detected, %s\n", + isroot ? 
"propagating" : "sending tcn bpdu"); + + if (isroot) { + br->topology_change = 1; + mod_timer(&br->topology_change_timer, jiffies + + br->bridge_forward_delay + br->bridge_max_age); + } else if (!br->topology_change_detected) { + br_transmit_tcn(br); + mod_timer(&br->tcn_timer, jiffies + br->bridge_hello_time); + } + + br->topology_change_detected = 1; +} + +/* called under bridge lock */ +void br_config_bpdu_generation(struct net_bridge *br) +{ + struct net_bridge_port *p; + + list_for_each_entry(p, &br->port_list, list) { + if (p->state != BR_STATE_DISABLED && + br_is_designated_port(p)) + br_transmit_config(p); + } +} + +/* called under bridge lock */ +static inline void br_reply(struct net_bridge_port *p) +{ + br_transmit_config(p); +} + +/* called under bridge lock */ +void br_configuration_update(struct net_bridge *br) +{ + br_root_selection(br); + br_designated_port_selection(br); +} + +/* called under bridge lock */ +void br_become_designated_port(struct net_bridge_port *p) +{ + struct net_bridge *br; + + br = p->br; + p->designated_root = br->designated_root; + p->designated_cost = br->root_path_cost; + p->designated_bridge = br->bridge_id; + p->designated_port = p->port_id; +} + + +/* called under bridge lock */ +static void br_make_blocking(struct net_bridge_port *p) +{ + if (p->state != BR_STATE_DISABLED && + p->state != BR_STATE_BLOCKING) { + if (p->state == BR_STATE_FORWARDING || + p->state == BR_STATE_LEARNING) + br_topology_change_detection(p->br); + + p->state = BR_STATE_BLOCKING; + br_log_state(p); + del_timer(&p->forward_delay_timer); + } +} + +/* called under bridge lock */ +static void br_make_forwarding(struct net_bridge_port *p) +{ + struct net_bridge *br = p->br; + + if (p->state != BR_STATE_BLOCKING) + return; + + if (br->stp_enabled == BR_NO_STP || br->forward_delay == 0) { + p->state = BR_STATE_FORWARDING; + br_topology_change_detection(br); + del_timer(&p->forward_delay_timer); + } + else if (br->stp_enabled == BR_KERNEL_STP) + p->state = BR_STATE_LISTENING; + else + p->state = BR_STATE_LEARNING; + + br_multicast_enable_port(p); + + br_log_state(p); + + if (br->forward_delay != 0) + mod_timer(&p->forward_delay_timer, jiffies + br->forward_delay); +} + +/* called under bridge lock */ +void br_port_state_selection(struct net_bridge *br) +{ + struct net_bridge_port *p; + unsigned int liveports = 0; + + /* Don't change port states if userspace is handling STP */ + if (br->stp_enabled == BR_USER_STP) + return; + + list_for_each_entry(p, &br->port_list, list) { + if (p->state == BR_STATE_DISABLED) + continue; + + if (p->port_no == br->root_port) { + p->config_pending = 0; + p->topology_change_ack = 0; + br_make_forwarding(p); + } else if (br_is_designated_port(p)) { + del_timer(&p->message_age_timer); + br_make_forwarding(p); + } else { + p->config_pending = 0; + p->topology_change_ack = 0; + br_make_blocking(p); + } + + if (p->state == BR_STATE_FORWARDING) + ++liveports; + } + + if (liveports == 0) + netif_carrier_off(br->dev); + else + netif_carrier_on(br->dev); +} + +/* called under bridge lock */ +static inline void br_topology_change_acknowledge(struct net_bridge_port *p) +{ + p->topology_change_ack = 1; + br_transmit_config(p); +} + +/* called under bridge lock */ +void br_received_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu) +{ + struct net_bridge *br; + int was_root; + + br = p->br; + was_root = br_is_root_bridge(br); + + if (br_supersedes_port_info(p, bpdu)) { + br_record_config_information(p, bpdu); + br_configuration_update(br); + 
br_port_state_selection(br); + + if (!br_is_root_bridge(br) && was_root) { + del_timer(&br->hello_timer); + if (br->topology_change_detected) { + del_timer(&br->topology_change_timer); + br_transmit_tcn(br); + + mod_timer(&br->tcn_timer, + jiffies + br->bridge_hello_time); + } + } + + if (p->port_no == br->root_port) { + br_record_config_timeout_values(br, bpdu); + br_config_bpdu_generation(br); + if (bpdu->topology_change_ack) + br_topology_change_acknowledged(br); + } + } else if (br_is_designated_port(p)) { + br_reply(p); + } +} + +/* called under bridge lock */ +void br_received_tcn_bpdu(struct net_bridge_port *p) +{ + if (br_is_designated_port(p)) { + br_info(p->br, "port %u(%s) received tcn bpdu\n", + (unsigned) p->port_no, p->dev->name); + + br_topology_change_detection(p->br); + br_topology_change_acknowledge(p); + } +} + +/* Change bridge STP parameter */ +int br_set_hello_time(struct net_bridge *br, unsigned long val) +{ + unsigned long t = clock_t_to_jiffies(val); + + if (t < BR_MIN_HELLO_TIME || t > BR_MAX_HELLO_TIME) + return -ERANGE; + + spin_lock_bh(&br->lock); + br->bridge_hello_time = t; + if (br_is_root_bridge(br)) + br->hello_time = br->bridge_hello_time; + spin_unlock_bh(&br->lock); + return 0; +} + +int br_set_max_age(struct net_bridge *br, unsigned long val) +{ + unsigned long t = clock_t_to_jiffies(val); + + if (t < BR_MIN_MAX_AGE || t > BR_MAX_MAX_AGE) + return -ERANGE; + + spin_lock_bh(&br->lock); + br->bridge_max_age = t; + if (br_is_root_bridge(br)) + br->max_age = br->bridge_max_age; + spin_unlock_bh(&br->lock); + return 0; + +} + +int br_set_forward_delay(struct net_bridge *br, unsigned long val) +{ + unsigned long t = clock_t_to_jiffies(val); + + if (br->stp_enabled != BR_NO_STP && + (t < BR_MIN_FORWARD_DELAY || t > BR_MAX_FORWARD_DELAY)) + return -ERANGE; + + spin_lock_bh(&br->lock); + br->bridge_forward_delay = t; + if (br_is_root_bridge(br)) + br->forward_delay = br->bridge_forward_delay; + spin_unlock_bh(&br->lock); + return 0; +} diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c new file mode 100644 index 00000000..289646ec --- /dev/null +++ b/net/bridge/br_stp_bpdu.c @@ -0,0 +1,223 @@ +/* + * Spanning tree protocol; BPDU handling + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "br_private.h" +#include "br_private_stp.h" + +#define STP_HZ 256 + +#define LLC_RESERVE sizeof(struct llc_pdu_un) + +static void br_send_bpdu(struct net_bridge_port *p, + const unsigned char *data, int length) +{ + struct sk_buff *skb; + + skb = dev_alloc_skb(length+LLC_RESERVE); + if (!skb) + return; + + skb->dev = p->dev; + skb->protocol = htons(ETH_P_802_2); + + skb_reserve(skb, LLC_RESERVE); + memcpy(__skb_put(skb, length), data, length); + + llc_pdu_header_init(skb, LLC_PDU_TYPE_U, LLC_SAP_BSPAN, + LLC_SAP_BSPAN, LLC_PDU_CMD); + llc_pdu_init_as_ui_cmd(skb); + + llc_mac_hdr_init(skb, p->dev->dev_addr, p->br->group_addr); + + skb_reset_mac_header(skb); + + NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev, + dev_queue_xmit); +} + +static inline void br_set_ticks(unsigned char *dest, int j) +{ + unsigned long ticks = (STP_HZ * j)/ HZ; + + put_unaligned_be16(ticks, dest); +} + +static inline int br_get_ticks(const unsigned char *src) +{ + unsigned long ticks = get_unaligned_be16(src); + + return DIV_ROUND_UP(ticks * HZ, STP_HZ); +} + +/* called under bridge lock */ +void br_send_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu) +{ + unsigned char buf[35]; + + if (p->br->stp_enabled != BR_KERNEL_STP) + return; + + buf[0] = 0; + buf[1] = 0; + buf[2] = 0; + buf[3] = BPDU_TYPE_CONFIG; + buf[4] = (bpdu->topology_change ? 0x01 : 0) | + (bpdu->topology_change_ack ? 0x80 : 0); + buf[5] = bpdu->root.prio[0]; + buf[6] = bpdu->root.prio[1]; + buf[7] = bpdu->root.addr[0]; + buf[8] = bpdu->root.addr[1]; + buf[9] = bpdu->root.addr[2]; + buf[10] = bpdu->root.addr[3]; + buf[11] = bpdu->root.addr[4]; + buf[12] = bpdu->root.addr[5]; + buf[13] = (bpdu->root_path_cost >> 24) & 0xFF; + buf[14] = (bpdu->root_path_cost >> 16) & 0xFF; + buf[15] = (bpdu->root_path_cost >> 8) & 0xFF; + buf[16] = bpdu->root_path_cost & 0xFF; + buf[17] = bpdu->bridge_id.prio[0]; + buf[18] = bpdu->bridge_id.prio[1]; + buf[19] = bpdu->bridge_id.addr[0]; + buf[20] = bpdu->bridge_id.addr[1]; + buf[21] = bpdu->bridge_id.addr[2]; + buf[22] = bpdu->bridge_id.addr[3]; + buf[23] = bpdu->bridge_id.addr[4]; + buf[24] = bpdu->bridge_id.addr[5]; + buf[25] = (bpdu->port_id >> 8) & 0xFF; + buf[26] = bpdu->port_id & 0xFF; + + br_set_ticks(buf+27, bpdu->message_age); + br_set_ticks(buf+29, bpdu->max_age); + br_set_ticks(buf+31, bpdu->hello_time); + br_set_ticks(buf+33, bpdu->forward_delay); + + br_send_bpdu(p, buf, 35); +} + +/* called under bridge lock */ +void br_send_tcn_bpdu(struct net_bridge_port *p) +{ + unsigned char buf[4]; + + if (p->br->stp_enabled != BR_KERNEL_STP) + return; + + buf[0] = 0; + buf[1] = 0; + buf[2] = 0; + buf[3] = BPDU_TYPE_TCN; + br_send_bpdu(p, buf, 4); +} + +/* + * Called from llc. 
+ * + * NO locks, but rcu_read_lock + */ +void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb, + struct net_device *dev) +{ + const unsigned char *dest = eth_hdr(skb)->h_dest; + struct net_bridge_port *p; + struct net_bridge *br; + const unsigned char *buf; + + if (!pskb_may_pull(skb, 4)) + goto err; + + /* compare of protocol id and version */ + buf = skb->data; + if (buf[0] != 0 || buf[1] != 0 || buf[2] != 0) + goto err; + + p = br_port_get_rcu(dev); + if (!p) + goto err; + + br = p->br; + spin_lock(&br->lock); + + if (br->stp_enabled != BR_KERNEL_STP) + goto out; + + if (!(br->dev->flags & IFF_UP)) + goto out; + + if (p->state == BR_STATE_DISABLED) + goto out; + + if (compare_ether_addr(dest, br->group_addr) != 0) + goto out; + + buf = skb_pull(skb, 3); + + if (buf[0] == BPDU_TYPE_CONFIG) { + struct br_config_bpdu bpdu; + + if (!pskb_may_pull(skb, 32)) + goto out; + + buf = skb->data; + bpdu.topology_change = (buf[1] & 0x01) ? 1 : 0; + bpdu.topology_change_ack = (buf[1] & 0x80) ? 1 : 0; + + bpdu.root.prio[0] = buf[2]; + bpdu.root.prio[1] = buf[3]; + bpdu.root.addr[0] = buf[4]; + bpdu.root.addr[1] = buf[5]; + bpdu.root.addr[2] = buf[6]; + bpdu.root.addr[3] = buf[7]; + bpdu.root.addr[4] = buf[8]; + bpdu.root.addr[5] = buf[9]; + bpdu.root_path_cost = + (buf[10] << 24) | + (buf[11] << 16) | + (buf[12] << 8) | + buf[13]; + bpdu.bridge_id.prio[0] = buf[14]; + bpdu.bridge_id.prio[1] = buf[15]; + bpdu.bridge_id.addr[0] = buf[16]; + bpdu.bridge_id.addr[1] = buf[17]; + bpdu.bridge_id.addr[2] = buf[18]; + bpdu.bridge_id.addr[3] = buf[19]; + bpdu.bridge_id.addr[4] = buf[20]; + bpdu.bridge_id.addr[5] = buf[21]; + bpdu.port_id = (buf[22] << 8) | buf[23]; + + bpdu.message_age = br_get_ticks(buf+24); + bpdu.max_age = br_get_ticks(buf+26); + bpdu.hello_time = br_get_ticks(buf+28); + bpdu.forward_delay = br_get_ticks(buf+30); + + br_received_config_bpdu(p, &bpdu); + } + + else if (buf[0] == BPDU_TYPE_TCN) { + br_received_tcn_bpdu(p); + } + out: + spin_unlock(&br->lock); + err: + kfree_skb(skb); +} diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c new file mode 100644 index 00000000..6f615b81 --- /dev/null +++ b/net/bridge/br_stp_if.c @@ -0,0 +1,301 @@ +/* + * Spanning tree protocol; interface code + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include "br_private.h" +#include "br_private_stp.h" + + +/* Port id is composed of priority and port number. + * NB: some bits of priority are dropped to + * make room for more ports. 
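+ *
+ * Illustrative example (not in the original patch): with BR_PORT_BITS = 10
+ * the upper 6 bits of the 16-bit port id carry the priority, so valid
+ * priorities are 0..63; a priority of 32 (commonly the default) on port
+ * number 5 yields port id (32 << 10) | 5 = 0x8005.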
+ */ +static inline port_id br_make_port_id(__u8 priority, __u16 port_no) +{ + return ((u16)priority << BR_PORT_BITS) + | (port_no & ((1<> BR_PORT_BITS) + +/* called under bridge lock */ +void br_init_port(struct net_bridge_port *p) +{ + p->port_id = br_make_port_id(p->priority, p->port_no); + br_become_designated_port(p); + p->state = BR_STATE_BLOCKING; + p->topology_change_ack = 0; + p->config_pending = 0; +} + +/* called under bridge lock */ +void br_stp_enable_bridge(struct net_bridge *br) +{ + struct net_bridge_port *p; + + spin_lock_bh(&br->lock); + mod_timer(&br->hello_timer, jiffies + br->hello_time); + mod_timer(&br->gc_timer, jiffies + HZ/10); + + br_config_bpdu_generation(br); + + list_for_each_entry(p, &br->port_list, list) { + if ((p->dev->flags & IFF_UP) && netif_carrier_ok(p->dev)) + br_stp_enable_port(p); + + } + spin_unlock_bh(&br->lock); +} + +/* NO locks held */ +void br_stp_disable_bridge(struct net_bridge *br) +{ + struct net_bridge_port *p; + + spin_lock_bh(&br->lock); + list_for_each_entry(p, &br->port_list, list) { + if (p->state != BR_STATE_DISABLED) + br_stp_disable_port(p); + + } + + br->topology_change = 0; + br->topology_change_detected = 0; + spin_unlock_bh(&br->lock); + + del_timer_sync(&br->hello_timer); + del_timer_sync(&br->topology_change_timer); + del_timer_sync(&br->tcn_timer); + del_timer_sync(&br->gc_timer); +} + +/* called under bridge lock */ +void br_stp_enable_port(struct net_bridge_port *p) +{ + br_init_port(p); + br_port_state_selection(p->br); + br_log_state(p); +} + +/* called under bridge lock */ +void br_stp_disable_port(struct net_bridge_port *p) +{ + struct net_bridge *br = p->br; + int wasroot; + + br_log_state(p); + + wasroot = br_is_root_bridge(br); + br_become_designated_port(p); + p->state = BR_STATE_DISABLED; + p->topology_change_ack = 0; + p->config_pending = 0; + + del_timer(&p->message_age_timer); + del_timer(&p->forward_delay_timer); + del_timer(&p->hold_timer); + + br_fdb_delete_by_port(br, p, 0); + br_multicast_disable_port(p); + + br_configuration_update(br); + + br_port_state_selection(br); + + if (br_is_root_bridge(br) && !wasroot) + br_become_root_bridge(br); +} + +static void br_stp_start(struct net_bridge *br) +{ + int r; + char *argv[] = { BR_STP_PROG, br->dev->name, "start", NULL }; + char *envp[] = { NULL }; + + r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); + if (r == 0) { + br->stp_enabled = BR_USER_STP; + br_debug(br, "userspace STP started\n"); + } else { + br->stp_enabled = BR_KERNEL_STP; + br_debug(br, "using kernel STP\n"); + + /* To start timers on any ports left in blocking */ + spin_lock_bh(&br->lock); + br_port_state_selection(br); + spin_unlock_bh(&br->lock); + } +} + +static void br_stp_stop(struct net_bridge *br) +{ + int r; + char *argv[] = { BR_STP_PROG, br->dev->name, "stop", NULL }; + char *envp[] = { NULL }; + + if (br->stp_enabled == BR_USER_STP) { + r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); + br_info(br, "userspace STP stopped, return code %d\n", r); + + /* To start timers on any ports left in blocking */ + spin_lock_bh(&br->lock); + br_port_state_selection(br); + spin_unlock_bh(&br->lock); + } + + br->stp_enabled = BR_NO_STP; +} + +void br_stp_set_enabled(struct net_bridge *br, unsigned long val) +{ + ASSERT_RTNL(); + + if (val) { + if (br->stp_enabled == BR_NO_STP) + br_stp_start(br); + } else { + if (br->stp_enabled != BR_NO_STP) + br_stp_stop(br); + } +} + +/* called under bridge lock */ +void br_stp_change_bridge_id(struct net_bridge *br, const unsigned 
char *addr) +{ + /* should be aligned on 2 bytes for compare_ether_addr() */ + unsigned short oldaddr_aligned[ETH_ALEN >> 1]; + unsigned char *oldaddr = (unsigned char *)oldaddr_aligned; + struct net_bridge_port *p; + int wasroot; + + wasroot = br_is_root_bridge(br); + + memcpy(oldaddr, br->bridge_id.addr, ETH_ALEN); + memcpy(br->bridge_id.addr, addr, ETH_ALEN); + memcpy(br->dev->dev_addr, addr, ETH_ALEN); + + list_for_each_entry(p, &br->port_list, list) { + if (!compare_ether_addr(p->designated_bridge.addr, oldaddr)) + memcpy(p->designated_bridge.addr, addr, ETH_ALEN); + + if (!compare_ether_addr(p->designated_root.addr, oldaddr)) + memcpy(p->designated_root.addr, addr, ETH_ALEN); + + } + + br_configuration_update(br); + br_port_state_selection(br); + if (br_is_root_bridge(br) && !wasroot) + br_become_root_bridge(br); +} + +/* should be aligned on 2 bytes for compare_ether_addr() */ +static const unsigned short br_mac_zero_aligned[ETH_ALEN >> 1]; + +/* called under bridge lock */ +bool br_stp_recalculate_bridge_id(struct net_bridge *br) +{ + const unsigned char *br_mac_zero = + (const unsigned char *)br_mac_zero_aligned; + const unsigned char *addr = br_mac_zero; + struct net_bridge_port *p; + + /* user has chosen a value so keep it */ + if (br->flags & BR_SET_MAC_ADDR) + return false; + + list_for_each_entry(p, &br->port_list, list) { + if (addr == br_mac_zero || + memcmp(p->dev->dev_addr, addr, ETH_ALEN) < 0) + addr = p->dev->dev_addr; + + } + + if (compare_ether_addr(br->bridge_id.addr, addr) == 0) + return false; /* no change */ + + br_stp_change_bridge_id(br, addr); + return true; +} + +/* called under bridge lock */ +void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio) +{ + struct net_bridge_port *p; + int wasroot; + + wasroot = br_is_root_bridge(br); + + list_for_each_entry(p, &br->port_list, list) { + if (p->state != BR_STATE_DISABLED && + br_is_designated_port(p)) { + p->designated_bridge.prio[0] = (newprio >> 8) & 0xFF; + p->designated_bridge.prio[1] = newprio & 0xFF; + } + + } + + br->bridge_id.prio[0] = (newprio >> 8) & 0xFF; + br->bridge_id.prio[1] = newprio & 0xFF; + br_configuration_update(br); + br_port_state_selection(br); + if (br_is_root_bridge(br) && !wasroot) + br_become_root_bridge(br); +} + +/* called under bridge lock */ +int br_stp_set_port_priority(struct net_bridge_port *p, unsigned long newprio) +{ + port_id new_port_id; + + if (newprio > BR_MAX_PORT_PRIORITY) + return -ERANGE; + + new_port_id = br_make_port_id(newprio, p->port_no); + if (br_is_designated_port(p)) + p->designated_port = new_port_id; + + p->port_id = new_port_id; + p->priority = newprio; + if (!memcmp(&p->br->bridge_id, &p->designated_bridge, 8) && + p->port_id < p->designated_port) { + br_become_designated_port(p); + br_port_state_selection(p->br); + } + + return 0; +} + +/* called under bridge lock */ +int br_stp_set_path_cost(struct net_bridge_port *p, unsigned long path_cost) +{ + if (path_cost < BR_MIN_PATH_COST || + path_cost > BR_MAX_PATH_COST) + return -ERANGE; + + p->path_cost = path_cost; + br_configuration_update(p->br); + br_port_state_selection(p->br); + return 0; +} + +ssize_t br_show_bridge_id(char *buf, const struct bridge_id *id) +{ + return sprintf(buf, "%.2x%.2x.%.2x%.2x%.2x%.2x%.2x%.2x\n", + id->prio[0], id->prio[1], + id->addr[0], id->addr[1], id->addr[2], + id->addr[3], id->addr[4], id->addr[5]); +} diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c new file mode 100644 index 00000000..3e965140 --- /dev/null +++ b/net/bridge/br_stp_timer.c @@ 
-0,0 +1,173 @@ +/* + * Spanning tree protocol; timer-related code + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include + +#include "br_private.h" +#include "br_private_stp.h" + +/* called under bridge lock */ +static int br_is_designated_for_some_port(const struct net_bridge *br) +{ + struct net_bridge_port *p; + + list_for_each_entry(p, &br->port_list, list) { + if (p->state != BR_STATE_DISABLED && + !memcmp(&p->designated_bridge, &br->bridge_id, 8)) + return 1; + } + + return 0; +} + +static void br_hello_timer_expired(unsigned long arg) +{ + struct net_bridge *br = (struct net_bridge *)arg; + + br_debug(br, "hello timer expired\n"); + spin_lock(&br->lock); + if (br->dev->flags & IFF_UP) { + br_config_bpdu_generation(br); + + mod_timer(&br->hello_timer, round_jiffies(jiffies + br->hello_time)); + } + spin_unlock(&br->lock); +} + +static void br_message_age_timer_expired(unsigned long arg) +{ + struct net_bridge_port *p = (struct net_bridge_port *) arg; + struct net_bridge *br = p->br; + const bridge_id *id = &p->designated_bridge; + int was_root; + + if (p->state == BR_STATE_DISABLED) + return; + + br_info(br, "port %u(%s) neighbor %.2x%.2x.%pM lost\n", + (unsigned) p->port_no, p->dev->name, + id->prio[0], id->prio[1], &id->addr); + + /* + * According to the spec, the message age timer cannot be + * running when we are the root bridge. So.. this was_root + * check is redundant. I'm leaving it in for now, though. + */ + spin_lock(&br->lock); + if (p->state == BR_STATE_DISABLED) + goto unlock; + was_root = br_is_root_bridge(br); + + br_become_designated_port(p); + br_configuration_update(br); + br_port_state_selection(br); + if (br_is_root_bridge(br) && !was_root) + br_become_root_bridge(br); + unlock: + spin_unlock(&br->lock); +} + +static void br_forward_delay_timer_expired(unsigned long arg) +{ + struct net_bridge_port *p = (struct net_bridge_port *) arg; + struct net_bridge *br = p->br; + + br_debug(br, "port %u(%s) forward delay timer\n", + (unsigned) p->port_no, p->dev->name); + spin_lock(&br->lock); + if (p->state == BR_STATE_LISTENING) { + p->state = BR_STATE_LEARNING; + mod_timer(&p->forward_delay_timer, + jiffies + br->forward_delay); + } else if (p->state == BR_STATE_LEARNING) { + p->state = BR_STATE_FORWARDING; + if (br_is_designated_for_some_port(br)) + br_topology_change_detection(br); + netif_carrier_on(br->dev); + } + br_log_state(p); + spin_unlock(&br->lock); +} + +static void br_tcn_timer_expired(unsigned long arg) +{ + struct net_bridge *br = (struct net_bridge *) arg; + + br_debug(br, "tcn timer expired\n"); + spin_lock(&br->lock); + if (br->dev->flags & IFF_UP) { + br_transmit_tcn(br); + + mod_timer(&br->tcn_timer,jiffies + br->bridge_hello_time); + } + spin_unlock(&br->lock); +} + +static void br_topology_change_timer_expired(unsigned long arg) +{ + struct net_bridge *br = (struct net_bridge *) arg; + + br_debug(br, "topo change timer expired\n"); + spin_lock(&br->lock); + br->topology_change_detected = 0; + br->topology_change = 0; + spin_unlock(&br->lock); +} + +static void br_hold_timer_expired(unsigned long arg) +{ + struct net_bridge_port *p = (struct net_bridge_port *) arg; + + br_debug(p->br, "port %u(%s) hold timer expired\n", + (unsigned) p->port_no, p->dev->name); 
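+ /* The hold timer enforces the minimum interval between config BPDUs
+  * on this port; if a BPDU became due while the timer was running
+  * (config_pending was set), transmit it now that the hold period
+  * has elapsed.
+  */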
+ + spin_lock(&p->br->lock); + if (p->config_pending) + br_transmit_config(p); + spin_unlock(&p->br->lock); +} + +void br_stp_timer_init(struct net_bridge *br) +{ + setup_timer(&br->hello_timer, br_hello_timer_expired, + (unsigned long) br); + + setup_timer(&br->tcn_timer, br_tcn_timer_expired, + (unsigned long) br); + + setup_timer(&br->topology_change_timer, + br_topology_change_timer_expired, + (unsigned long) br); + + setup_timer(&br->gc_timer, br_fdb_cleanup, (unsigned long) br); +} + +void br_stp_port_timer_init(struct net_bridge_port *p) +{ + setup_timer(&p->message_age_timer, br_message_age_timer_expired, + (unsigned long) p); + + setup_timer(&p->forward_delay_timer, br_forward_delay_timer_expired, + (unsigned long) p); + + setup_timer(&p->hold_timer, br_hold_timer_expired, + (unsigned long) p); +} + +/* Report ticks left (in USER_HZ) used for API */ +unsigned long br_timer_value(const struct timer_list *timer) +{ + return timer_pending(timer) + ? jiffies_to_clock_t(timer->expires - jiffies) : 0; +} diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c new file mode 100644 index 00000000..68b893ea --- /dev/null +++ b/net/bridge/br_sysfs_br.c @@ -0,0 +1,784 @@ +/* + * Sysfs attributes of bridge ports + * Linux ethernet bridge + * + * Authors: + * Stephen Hemminger + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "br_private.h" + +#define to_dev(obj) container_of(obj, struct device, kobj) +#define to_bridge(cd) ((struct net_bridge *)netdev_priv(to_net_dev(cd))) + +/* + * Common code for storing bridge parameters. + */ +static ssize_t store_bridge_parm(struct device *d, + const char *buf, size_t len, + int (*set)(struct net_bridge *, unsigned long)) +{ + struct net_bridge *br = to_bridge(d); + char *endp; + unsigned long val; + int err; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + val = simple_strtoul(buf, &endp, 0); + if (endp == buf) + return -EINVAL; + + err = (*set)(br, val); + return err ? 
err : len; +} + + +static ssize_t show_forward_delay(struct device *d, + struct device_attribute *attr, char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay)); +} + +static ssize_t store_forward_delay(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, br_set_forward_delay); +} +static DEVICE_ATTR(forward_delay, S_IRUGO | S_IWUSR, + show_forward_delay, store_forward_delay); + +static ssize_t show_hello_time(struct device *d, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", + jiffies_to_clock_t(to_bridge(d)->hello_time)); +} + +static ssize_t store_hello_time(struct device *d, + struct device_attribute *attr, const char *buf, + size_t len) +{ + return store_bridge_parm(d, buf, len, br_set_hello_time); +} +static DEVICE_ATTR(hello_time, S_IRUGO | S_IWUSR, show_hello_time, + store_hello_time); + +static ssize_t show_max_age(struct device *d, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", + jiffies_to_clock_t(to_bridge(d)->max_age)); +} + +static ssize_t store_max_age(struct device *d, struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, br_set_max_age); +} +static DEVICE_ATTR(max_age, S_IRUGO | S_IWUSR, show_max_age, store_max_age); + +static ssize_t show_ageing_time(struct device *d, + struct device_attribute *attr, char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->ageing_time)); +} + +static int set_ageing_time(struct net_bridge *br, unsigned long val) +{ + br->ageing_time = clock_t_to_jiffies(val); + return 0; +} + +static ssize_t store_ageing_time(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, set_ageing_time); +} +static DEVICE_ATTR(ageing_time, S_IRUGO | S_IWUSR, show_ageing_time, + store_ageing_time); + +static ssize_t show_stp_state(struct device *d, + struct device_attribute *attr, char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%d\n", br->stp_enabled); +} + + +static ssize_t store_stp_state(struct device *d, + struct device_attribute *attr, const char *buf, + size_t len) +{ + struct net_bridge *br = to_bridge(d); + char *endp; + unsigned long val; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + val = simple_strtoul(buf, &endp, 0); + if (endp == buf) + return -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + br_stp_set_enabled(br, val); + rtnl_unlock(); + + return len; +} +static DEVICE_ATTR(stp_state, S_IRUGO | S_IWUSR, show_stp_state, + store_stp_state); + +static ssize_t show_priority(struct device *d, struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%d\n", + (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]); +} + +static int set_priority(struct net_bridge *br, unsigned long val) +{ + br_stp_set_bridge_priority(br, (u16) val); + return 0; +} + +static ssize_t store_priority(struct device *d, struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, set_priority); +} +static DEVICE_ATTR(priority, S_IRUGO | S_IWUSR, show_priority, store_priority); + +static ssize_t show_root_id(struct device *d, struct device_attribute *attr, + char *buf) +{ + return br_show_bridge_id(buf, &to_bridge(d)->designated_root); +} +static DEVICE_ATTR(root_id, 
S_IRUGO, show_root_id, NULL); + +static ssize_t show_bridge_id(struct device *d, struct device_attribute *attr, + char *buf) +{ + return br_show_bridge_id(buf, &to_bridge(d)->bridge_id); +} +static DEVICE_ATTR(bridge_id, S_IRUGO, show_bridge_id, NULL); + +static ssize_t show_root_port(struct device *d, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", to_bridge(d)->root_port); +} +static DEVICE_ATTR(root_port, S_IRUGO, show_root_port, NULL); + +static ssize_t show_root_path_cost(struct device *d, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", to_bridge(d)->root_path_cost); +} +static DEVICE_ATTR(root_path_cost, S_IRUGO, show_root_path_cost, NULL); + +static ssize_t show_topology_change(struct device *d, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", to_bridge(d)->topology_change); +} +static DEVICE_ATTR(topology_change, S_IRUGO, show_topology_change, NULL); + +static ssize_t show_topology_change_detected(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%d\n", br->topology_change_detected); +} +static DEVICE_ATTR(topology_change_detected, S_IRUGO, + show_topology_change_detected, NULL); + +static ssize_t show_hello_timer(struct device *d, + struct device_attribute *attr, char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%ld\n", br_timer_value(&br->hello_timer)); +} +static DEVICE_ATTR(hello_timer, S_IRUGO, show_hello_timer, NULL); + +static ssize_t show_tcn_timer(struct device *d, struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%ld\n", br_timer_value(&br->tcn_timer)); +} +static DEVICE_ATTR(tcn_timer, S_IRUGO, show_tcn_timer, NULL); + +static ssize_t show_topology_change_timer(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%ld\n", br_timer_value(&br->topology_change_timer)); +} +static DEVICE_ATTR(topology_change_timer, S_IRUGO, show_topology_change_timer, + NULL); + +static ssize_t show_gc_timer(struct device *d, struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%ld\n", br_timer_value(&br->gc_timer)); +} +static DEVICE_ATTR(gc_timer, S_IRUGO, show_gc_timer, NULL); + +static ssize_t show_group_addr(struct device *d, + struct device_attribute *attr, char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%x:%x:%x:%x:%x:%x\n", + br->group_addr[0], br->group_addr[1], + br->group_addr[2], br->group_addr[3], + br->group_addr[4], br->group_addr[5]); +} + +static ssize_t store_group_addr(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct net_bridge *br = to_bridge(d); + unsigned new_addr[6]; + int i; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (sscanf(buf, "%x:%x:%x:%x:%x:%x", + &new_addr[0], &new_addr[1], &new_addr[2], + &new_addr[3], &new_addr[4], &new_addr[5]) != 6) + return -EINVAL; + + /* Must be 01:80:c2:00:00:0X */ + for (i = 0; i < 5; i++) + if (new_addr[i] != br_group_address[i]) + return -EINVAL; + + if (new_addr[5] & ~0xf) + return -EINVAL; + + if (new_addr[5] == 1 || /* 802.3x Pause address */ + new_addr[5] == 2 || /* 802.3ad Slow protocols */ + new_addr[5] == 3) /* 802.1X PAE address */ + return -EINVAL; + + spin_lock_bh(&br->lock); + for (i = 0; i < 6; i++) + br->group_addr[i] = new_addr[i]; + 
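+ /* At this point new_addr[] has passed the checks above: it is one of
+  * the reserved 01:80:c2:00:00:0X link-local group addresses and not
+  * the 802.3x pause, 802.3ad slow-protocols or 802.1X PAE slot.
+  */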
spin_unlock_bh(&br->lock); + return len; +} + +static DEVICE_ATTR(group_addr, S_IRUGO | S_IWUSR, + show_group_addr, store_group_addr); + +static ssize_t store_flush(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct net_bridge *br = to_bridge(d); + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + br_fdb_flush(br); + return len; +} +static DEVICE_ATTR(flush, S_IWUSR, NULL, store_flush); + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING +static ssize_t show_multicast_router(struct device *d, + struct device_attribute *attr, char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%d\n", br->multicast_router); +} + +static ssize_t store_multicast_router(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, br_multicast_set_router); +} +static DEVICE_ATTR(multicast_router, S_IRUGO | S_IWUSR, show_multicast_router, + store_multicast_router); + +static ssize_t show_multicast_snooping(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%d\n", !br->multicast_disabled); +} + +static ssize_t store_multicast_snooping(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, br_multicast_toggle); +} +static DEVICE_ATTR(multicast_snooping, S_IRUGO | S_IWUSR, + show_multicast_snooping, store_multicast_snooping); + +static ssize_t show_hash_elasticity(struct device *d, + struct device_attribute *attr, char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%u\n", br->hash_elasticity); +} + +static int set_elasticity(struct net_bridge *br, unsigned long val) +{ + br->hash_elasticity = val; + return 0; +} + +static ssize_t store_hash_elasticity(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, set_elasticity); +} +static DEVICE_ATTR(hash_elasticity, S_IRUGO | S_IWUSR, show_hash_elasticity, + store_hash_elasticity); + +static ssize_t show_hash_max(struct device *d, struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%u\n", br->hash_max); +} + +static ssize_t store_hash_max(struct device *d, struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, br_multicast_set_hash_max); +} +static DEVICE_ATTR(hash_max, S_IRUGO | S_IWUSR, show_hash_max, + store_hash_max); + +static ssize_t show_multicast_last_member_count(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%u\n", br->multicast_last_member_count); +} + +static int set_last_member_count(struct net_bridge *br, unsigned long val) +{ + br->multicast_last_member_count = val; + return 0; +} + +static ssize_t store_multicast_last_member_count(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, set_last_member_count); +} +static DEVICE_ATTR(multicast_last_member_count, S_IRUGO | S_IWUSR, + show_multicast_last_member_count, + store_multicast_last_member_count); + +static ssize_t show_multicast_startup_query_count( + struct device *d, struct device_attribute *attr, char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%u\n", br->multicast_startup_query_count); +} + +static int set_startup_query_count(struct net_bridge *br, unsigned 
long val) +{ + br->multicast_startup_query_count = val; + return 0; +} + +static ssize_t store_multicast_startup_query_count( + struct device *d, struct device_attribute *attr, const char *buf, + size_t len) +{ + return store_bridge_parm(d, buf, len, set_startup_query_count); +} +static DEVICE_ATTR(multicast_startup_query_count, S_IRUGO | S_IWUSR, + show_multicast_startup_query_count, + store_multicast_startup_query_count); + +static ssize_t show_multicast_last_member_interval( + struct device *d, struct device_attribute *attr, char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%lu\n", + jiffies_to_clock_t(br->multicast_last_member_interval)); +} + +static int set_last_member_interval(struct net_bridge *br, unsigned long val) +{ + br->multicast_last_member_interval = clock_t_to_jiffies(val); + return 0; +} + +static ssize_t store_multicast_last_member_interval( + struct device *d, struct device_attribute *attr, const char *buf, + size_t len) +{ + return store_bridge_parm(d, buf, len, set_last_member_interval); +} +static DEVICE_ATTR(multicast_last_member_interval, S_IRUGO | S_IWUSR, + show_multicast_last_member_interval, + store_multicast_last_member_interval); + +static ssize_t show_multicast_membership_interval( + struct device *d, struct device_attribute *attr, char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%lu\n", + jiffies_to_clock_t(br->multicast_membership_interval)); +} + +static int set_membership_interval(struct net_bridge *br, unsigned long val) +{ + br->multicast_membership_interval = clock_t_to_jiffies(val); + return 0; +} + +static ssize_t store_multicast_membership_interval( + struct device *d, struct device_attribute *attr, const char *buf, + size_t len) +{ + return store_bridge_parm(d, buf, len, set_membership_interval); +} +static DEVICE_ATTR(multicast_membership_interval, S_IRUGO | S_IWUSR, + show_multicast_membership_interval, + store_multicast_membership_interval); + +static ssize_t show_multicast_querier_interval(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%lu\n", + jiffies_to_clock_t(br->multicast_querier_interval)); +} + +static int set_querier_interval(struct net_bridge *br, unsigned long val) +{ + br->multicast_querier_interval = clock_t_to_jiffies(val); + return 0; +} + +static ssize_t store_multicast_querier_interval(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, set_querier_interval); +} +static DEVICE_ATTR(multicast_querier_interval, S_IRUGO | S_IWUSR, + show_multicast_querier_interval, + store_multicast_querier_interval); + +static ssize_t show_multicast_query_interval(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%lu\n", + jiffies_to_clock_t(br->multicast_query_interval)); +} + +static int set_query_interval(struct net_bridge *br, unsigned long val) +{ + br->multicast_query_interval = clock_t_to_jiffies(val); + return 0; +} + +static ssize_t store_multicast_query_interval(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, set_query_interval); +} +static DEVICE_ATTR(multicast_query_interval, S_IRUGO | S_IWUSR, + show_multicast_query_interval, + store_multicast_query_interval); + +static ssize_t show_multicast_query_response_interval( + struct device *d, struct device_attribute *attr, char 
*buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf( + buf, "%lu\n", + jiffies_to_clock_t(br->multicast_query_response_interval)); +} + +static int set_query_response_interval(struct net_bridge *br, unsigned long val) +{ + br->multicast_query_response_interval = clock_t_to_jiffies(val); + return 0; +} + +static ssize_t store_multicast_query_response_interval( + struct device *d, struct device_attribute *attr, const char *buf, + size_t len) +{ + return store_bridge_parm(d, buf, len, set_query_response_interval); +} +static DEVICE_ATTR(multicast_query_response_interval, S_IRUGO | S_IWUSR, + show_multicast_query_response_interval, + store_multicast_query_response_interval); + +static ssize_t show_multicast_startup_query_interval( + struct device *d, struct device_attribute *attr, char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf( + buf, "%lu\n", + jiffies_to_clock_t(br->multicast_startup_query_interval)); +} + +static int set_startup_query_interval(struct net_bridge *br, unsigned long val) +{ + br->multicast_startup_query_interval = clock_t_to_jiffies(val); + return 0; +} + +static ssize_t store_multicast_startup_query_interval( + struct device *d, struct device_attribute *attr, const char *buf, + size_t len) +{ + return store_bridge_parm(d, buf, len, set_startup_query_interval); +} +static DEVICE_ATTR(multicast_startup_query_interval, S_IRUGO | S_IWUSR, + show_multicast_startup_query_interval, + store_multicast_startup_query_interval); +#endif +#ifdef CONFIG_BRIDGE_NETFILTER +static ssize_t show_nf_call_iptables( + struct device *d, struct device_attribute *attr, char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%u\n", br->nf_call_iptables); +} + +static int set_nf_call_iptables(struct net_bridge *br, unsigned long val) +{ + br->nf_call_iptables = val ? true : false; + return 0; +} + +static ssize_t store_nf_call_iptables( + struct device *d, struct device_attribute *attr, const char *buf, + size_t len) +{ + return store_bridge_parm(d, buf, len, set_nf_call_iptables); +} +static DEVICE_ATTR(nf_call_iptables, S_IRUGO | S_IWUSR, + show_nf_call_iptables, store_nf_call_iptables); + +static ssize_t show_nf_call_ip6tables( + struct device *d, struct device_attribute *attr, char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%u\n", br->nf_call_ip6tables); +} + +static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val) +{ + br->nf_call_ip6tables = val ? true : false; + return 0; +} + +static ssize_t store_nf_call_ip6tables( + struct device *d, struct device_attribute *attr, const char *buf, + size_t len) +{ + return store_bridge_parm(d, buf, len, set_nf_call_ip6tables); +} +static DEVICE_ATTR(nf_call_ip6tables, S_IRUGO | S_IWUSR, + show_nf_call_ip6tables, store_nf_call_ip6tables); + +static ssize_t show_nf_call_arptables( + struct device *d, struct device_attribute *attr, char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%u\n", br->nf_call_arptables); +} + +static int set_nf_call_arptables(struct net_bridge *br, unsigned long val) +{ + br->nf_call_arptables = val ? 
true : false; + return 0; +} + +static ssize_t store_nf_call_arptables( + struct device *d, struct device_attribute *attr, const char *buf, + size_t len) +{ + return store_bridge_parm(d, buf, len, set_nf_call_arptables); +} +static DEVICE_ATTR(nf_call_arptables, S_IRUGO | S_IWUSR, + show_nf_call_arptables, store_nf_call_arptables); +#endif + +static struct attribute *bridge_attrs[] = { + &dev_attr_forward_delay.attr, + &dev_attr_hello_time.attr, + &dev_attr_max_age.attr, + &dev_attr_ageing_time.attr, + &dev_attr_stp_state.attr, + &dev_attr_priority.attr, + &dev_attr_bridge_id.attr, + &dev_attr_root_id.attr, + &dev_attr_root_path_cost.attr, + &dev_attr_root_port.attr, + &dev_attr_topology_change.attr, + &dev_attr_topology_change_detected.attr, + &dev_attr_hello_timer.attr, + &dev_attr_tcn_timer.attr, + &dev_attr_topology_change_timer.attr, + &dev_attr_gc_timer.attr, + &dev_attr_group_addr.attr, + &dev_attr_flush.attr, +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + &dev_attr_multicast_router.attr, + &dev_attr_multicast_snooping.attr, + &dev_attr_hash_elasticity.attr, + &dev_attr_hash_max.attr, + &dev_attr_multicast_last_member_count.attr, + &dev_attr_multicast_startup_query_count.attr, + &dev_attr_multicast_last_member_interval.attr, + &dev_attr_multicast_membership_interval.attr, + &dev_attr_multicast_querier_interval.attr, + &dev_attr_multicast_query_interval.attr, + &dev_attr_multicast_query_response_interval.attr, + &dev_attr_multicast_startup_query_interval.attr, +#endif +#ifdef CONFIG_BRIDGE_NETFILTER + &dev_attr_nf_call_iptables.attr, + &dev_attr_nf_call_ip6tables.attr, + &dev_attr_nf_call_arptables.attr, +#endif + NULL +}; + +static struct attribute_group bridge_group = { + .name = SYSFS_BRIDGE_ATTR, + .attrs = bridge_attrs, +}; + +/* + * Export the forwarding information table as a binary file + * The records are struct __fdb_entry. + * + * Returns the number of bytes read. + */ +static ssize_t brforward_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t count) +{ + struct device *dev = to_dev(kobj); + struct net_bridge *br = to_bridge(dev); + int n; + + /* must read whole records */ + if (off % sizeof(struct __fdb_entry) != 0) + return -EINVAL; + + n = br_fdb_fillbuf(br, buf, + count / sizeof(struct __fdb_entry), + off / sizeof(struct __fdb_entry)); + + if (n > 0) + n *= sizeof(struct __fdb_entry); + + return n; +} + +static struct bin_attribute bridge_forward = { + .attr = { .name = SYSFS_BRIDGE_FDB, + .mode = S_IRUGO, }, + .read = brforward_read, +}; + +/* + * Add entries in sysfs onto the existing network class device + * for the bridge. + * Adds an attribute group "bridge" containing tuning parameters. + * Binary attribute containing the forward table + * Sub directory to hold links to interfaces. + * + * Note: the ifobj exists only to be a subdirectory + * to hold links. The ifobj exists in the same data structure + * as its parent, the bridge, so reference counting works. 
+ */ +int br_sysfs_addbr(struct net_device *dev) +{ + struct kobject *brobj = &dev->dev.kobj; + struct net_bridge *br = netdev_priv(dev); + int err; + + err = sysfs_create_group(brobj, &bridge_group); + if (err) { + pr_info("%s: can't create group %s/%s\n", + __func__, dev->name, bridge_group.name); + goto out1; + } + + err = sysfs_create_bin_file(brobj, &bridge_forward); + if (err) { + pr_info("%s: can't create attribute file %s/%s\n", + __func__, dev->name, bridge_forward.attr.name); + goto out2; + } + + br->ifobj = kobject_create_and_add(SYSFS_BRIDGE_PORT_SUBDIR, brobj); + if (!br->ifobj) { + pr_info("%s: can't add kobject (directory) %s/%s\n", + __func__, dev->name, SYSFS_BRIDGE_PORT_SUBDIR); + goto out3; + } + return 0; + out3: + sysfs_remove_bin_file(&dev->dev.kobj, &bridge_forward); + out2: + sysfs_remove_group(&dev->dev.kobj, &bridge_group); + out1: + return err; + +} + +void br_sysfs_delbr(struct net_device *dev) +{ + struct kobject *kobj = &dev->dev.kobj; + struct net_bridge *br = netdev_priv(dev); + + kobject_put(br->ifobj); + sysfs_remove_bin_file(kobj, &bridge_forward); + sysfs_remove_group(kobj, &bridge_group); +} diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c new file mode 100644 index 00000000..6229b627 --- /dev/null +++ b/net/bridge/br_sysfs_if.c @@ -0,0 +1,283 @@ +/* + * Sysfs attributes of bridge ports + * Linux ethernet bridge + * + * Authors: + * Stephen Hemminger + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +#include "br_private.h" + +struct brport_attribute { + struct attribute attr; + ssize_t (*show)(struct net_bridge_port *, char *); + int (*store)(struct net_bridge_port *, unsigned long); +}; + +#define BRPORT_ATTR(_name,_mode,_show,_store) \ +struct brport_attribute brport_attr_##_name = { \ + .attr = {.name = __stringify(_name), \ + .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +}; + +static ssize_t show_path_cost(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "%d\n", p->path_cost); +} + +static BRPORT_ATTR(path_cost, S_IRUGO | S_IWUSR, + show_path_cost, br_stp_set_path_cost); + +static ssize_t show_priority(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "%d\n", p->priority); +} + +static BRPORT_ATTR(priority, S_IRUGO | S_IWUSR, + show_priority, br_stp_set_port_priority); + +static ssize_t show_designated_root(struct net_bridge_port *p, char *buf) +{ + return br_show_bridge_id(buf, &p->designated_root); +} +static BRPORT_ATTR(designated_root, S_IRUGO, show_designated_root, NULL); + +static ssize_t show_designated_bridge(struct net_bridge_port *p, char *buf) +{ + return br_show_bridge_id(buf, &p->designated_bridge); +} +static BRPORT_ATTR(designated_bridge, S_IRUGO, show_designated_bridge, NULL); + +static ssize_t show_designated_port(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "%d\n", p->designated_port); +} +static BRPORT_ATTR(designated_port, S_IRUGO, show_designated_port, NULL); + +static ssize_t show_designated_cost(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "%d\n", p->designated_cost); +} +static BRPORT_ATTR(designated_cost, S_IRUGO, show_designated_cost, NULL); + +static ssize_t show_port_id(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "0x%x\n", p->port_id); 
+} +static BRPORT_ATTR(port_id, S_IRUGO, show_port_id, NULL); + +static ssize_t show_port_no(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "0x%x\n", p->port_no); +} + +static BRPORT_ATTR(port_no, S_IRUGO, show_port_no, NULL); + +static ssize_t show_change_ack(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "%d\n", p->topology_change_ack); +} +static BRPORT_ATTR(change_ack, S_IRUGO, show_change_ack, NULL); + +static ssize_t show_config_pending(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "%d\n", p->config_pending); +} +static BRPORT_ATTR(config_pending, S_IRUGO, show_config_pending, NULL); + +static ssize_t show_port_state(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "%d\n", p->state); +} +static BRPORT_ATTR(state, S_IRUGO, show_port_state, NULL); + +static ssize_t show_message_age_timer(struct net_bridge_port *p, + char *buf) +{ + return sprintf(buf, "%ld\n", br_timer_value(&p->message_age_timer)); +} +static BRPORT_ATTR(message_age_timer, S_IRUGO, show_message_age_timer, NULL); + +static ssize_t show_forward_delay_timer(struct net_bridge_port *p, + char *buf) +{ + return sprintf(buf, "%ld\n", br_timer_value(&p->forward_delay_timer)); +} +static BRPORT_ATTR(forward_delay_timer, S_IRUGO, show_forward_delay_timer, NULL); + +static ssize_t show_hold_timer(struct net_bridge_port *p, + char *buf) +{ + return sprintf(buf, "%ld\n", br_timer_value(&p->hold_timer)); +} +static BRPORT_ATTR(hold_timer, S_IRUGO, show_hold_timer, NULL); + +static int store_flush(struct net_bridge_port *p, unsigned long v) +{ + br_fdb_delete_by_port(p->br, p, 0); // Don't delete local entry + return 0; +} +static BRPORT_ATTR(flush, S_IWUSR, NULL, store_flush); + +static ssize_t show_hairpin_mode(struct net_bridge_port *p, char *buf) +{ + int hairpin_mode = (p->flags & BR_HAIRPIN_MODE) ? 
1 : 0; + return sprintf(buf, "%d\n", hairpin_mode); +} +static int store_hairpin_mode(struct net_bridge_port *p, unsigned long v) +{ + if (v) + p->flags |= BR_HAIRPIN_MODE; + else + p->flags &= ~BR_HAIRPIN_MODE; + return 0; +} +static BRPORT_ATTR(hairpin_mode, S_IRUGO | S_IWUSR, + show_hairpin_mode, store_hairpin_mode); + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING +static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "%d\n", p->multicast_router); +} + +static int store_multicast_router(struct net_bridge_port *p, + unsigned long v) +{ + return br_multicast_set_port_router(p, v); +} +static BRPORT_ATTR(multicast_router, S_IRUGO | S_IWUSR, show_multicast_router, + store_multicast_router); +#endif + +static struct brport_attribute *brport_attrs[] = { + &brport_attr_path_cost, + &brport_attr_priority, + &brport_attr_port_id, + &brport_attr_port_no, + &brport_attr_designated_root, + &brport_attr_designated_bridge, + &brport_attr_designated_port, + &brport_attr_designated_cost, + &brport_attr_state, + &brport_attr_change_ack, + &brport_attr_config_pending, + &brport_attr_message_age_timer, + &brport_attr_forward_delay_timer, + &brport_attr_hold_timer, + &brport_attr_flush, + &brport_attr_hairpin_mode, +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + &brport_attr_multicast_router, +#endif + NULL +}; + +#define to_brport_attr(_at) container_of(_at, struct brport_attribute, attr) +#define to_brport(obj) container_of(obj, struct net_bridge_port, kobj) + +static ssize_t brport_show(struct kobject * kobj, + struct attribute * attr, char * buf) +{ + struct brport_attribute * brport_attr = to_brport_attr(attr); + struct net_bridge_port * p = to_brport(kobj); + + return brport_attr->show(p, buf); +} + +static ssize_t brport_store(struct kobject * kobj, + struct attribute * attr, + const char * buf, size_t count) +{ + struct brport_attribute * brport_attr = to_brport_attr(attr); + struct net_bridge_port * p = to_brport(kobj); + ssize_t ret = -EINVAL; + char *endp; + unsigned long val; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + val = simple_strtoul(buf, &endp, 0); + if (endp != buf) { + if (!rtnl_trylock()) + return restart_syscall(); + if (p->dev && p->br && brport_attr->store) { + spin_lock_bh(&p->br->lock); + ret = brport_attr->store(p, val); + spin_unlock_bh(&p->br->lock); + if (ret == 0) + ret = count; + } + rtnl_unlock(); + } + return ret; +} + +const struct sysfs_ops brport_sysfs_ops = { + .show = brport_show, + .store = brport_store, +}; + +/* + * Add sysfs entries to ethernet device added to a bridge. + * Creates a brport subdirectory with bridge attributes. + * Puts symlink in bridge's brif subdirectory + */ +int br_sysfs_addif(struct net_bridge_port *p) +{ + struct net_bridge *br = p->br; + struct brport_attribute **a; + int err; + + err = sysfs_create_link(&p->kobj, &br->dev->dev.kobj, + SYSFS_BRIDGE_PORT_LINK); + if (err) + return err; + + for (a = brport_attrs; *a; ++a) { + err = sysfs_create_file(&p->kobj, &((*a)->attr)); + if (err) + return err; + } + + strlcpy(p->sysfs_name, p->dev->name, IFNAMSIZ); + return sysfs_create_link(br->ifobj, &p->kobj, p->sysfs_name); +} + +/* Rename bridge's brif symlink */ +int br_sysfs_renameif(struct net_bridge_port *p) +{ + struct net_bridge *br = p->br; + int err; + + /* If a rename fails, the rollback will cause another + * rename call with the existing name. 
+ */ + if (!strncmp(p->sysfs_name, p->dev->name, IFNAMSIZ)) + return 0; + + err = sysfs_rename_link(br->ifobj, &p->kobj, + p->sysfs_name, p->dev->name); + if (err) + netdev_notice(br->dev, "unable to rename link %s to %s", + p->sysfs_name, p->dev->name); + else + strlcpy(p->sysfs_name, p->dev->name, IFNAMSIZ); + + return err; +} diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig new file mode 100644 index 00000000..ba6f73eb --- /dev/null +++ b/net/bridge/netfilter/Kconfig @@ -0,0 +1,221 @@ +# +# Bridge netfilter configuration +# + +menuconfig BRIDGE_NF_EBTABLES + tristate "Ethernet Bridge tables (ebtables) support" + depends on BRIDGE && BRIDGE_NETFILTER + select NETFILTER_XTABLES + help + ebtables is a general, extensible frame/packet identification + framework. Say 'Y' or 'M' here if you want to do Ethernet + filtering/NAT/brouting on the Ethernet bridge. + +if BRIDGE_NF_EBTABLES + +# +# tables +# +config BRIDGE_EBT_BROUTE + tristate "ebt: broute table support" + help + The ebtables broute table is used to define rules that decide between + bridging and routing frames, giving Linux the functionality of a + brouter. See the man page for ebtables(8) and examples on the ebtables + website. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_T_FILTER + tristate "ebt: filter table support" + help + The ebtables filter table is used to define frame filtering rules at + local input, forwarding and local output. See the man page for + ebtables(8). + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_T_NAT + tristate "ebt: nat table support" + help + The ebtables nat table is used to define rules that alter the MAC + source address (MAC SNAT) or the MAC destination address (MAC DNAT). + See the man page for ebtables(8). + + To compile it as a module, choose M here. If unsure, say N. +# +# matches +# +config BRIDGE_EBT_802_3 + tristate "ebt: 802.3 filter support" + help + This option adds matching support for 802.3 Ethernet frames. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_AMONG + tristate "ebt: among filter support" + help + This option adds the among match, which allows matching the MAC source + and/or destination address on a list of addresses. Optionally, + MAC/IP address pairs can be matched, f.e. for anti-spoofing rules. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_ARP + tristate "ebt: ARP filter support" + help + This option adds the ARP match, which allows ARP and RARP header field + filtering. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_IP + tristate "ebt: IP filter support" + help + This option adds the IP match, which allows basic IP header field + filtering. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_IP6 + tristate "ebt: IP6 filter support" + depends on BRIDGE_NF_EBTABLES && IPV6 + help + This option adds the IP6 match, which allows basic IPV6 header field + filtering. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_LIMIT + tristate "ebt: limit match support" + help + This option adds the limit match, which allows you to control + the rate at which a rule can be matched. This match is the + equivalent of the iptables limit match. + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. 
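
The "limit" match just described is, in effect, a per-rule token bucket: credit refills at the configured rate up to a burst ceiling, and each frame that would match spends one unit, so matches beyond the rate are suppressed. The stand-alone C sketch below illustrates only that general idea; it is not the kernel's ebt_limit code, and the struct, function names and numbers are invented for the example.

#include <stdio.h>

struct bucket {
	double tokens;	/* current credit, in "allowed matches" */
	double rate;	/* refill rate, matches per second */
	double burst;	/* maximum credit that may accumulate */
	double last;	/* time of the previous check, in seconds */
};

/* Return 1 if a match may fire at time 'now' (seconds), 0 if rate-limited. */
static int bucket_allow(struct bucket *b, double now)
{
	b->tokens += (now - b->last) * b->rate;	/* refill since last check */
	if (b->tokens > b->burst)
		b->tokens = b->burst;		/* clamp to the burst size */
	b->last = now;

	if (b->tokens >= 1.0) {
		b->tokens -= 1.0;		/* spend one token */
		return 1;
	}
	return 0;
}

int main(void)
{
	/* 2 matches/second with a burst of 5, probed 10 times per second */
	struct bucket b = { .tokens = 5.0, .rate = 2.0, .burst = 5.0, .last = 0.0 };
	int i;

	for (i = 0; i < 20; i++) {
		double now = i * 0.1;
		printf("t=%.1fs: %s\n", now,
		       bucket_allow(&b, now) ? "match" : "limited");
	}
	return 0;
}

Built with any C compiler, it prints which probes "match" and which are "limited"; the in-kernel match keeps similar per-rule accounting, only in kernel time units.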
+ +config BRIDGE_EBT_MARK + tristate "ebt: mark filter support" + help + This option adds the mark match, which allows matching frames based on + the 'nfmark' value in the frame. This can be set by the mark target. + This value is the same as the one used in the iptables mark match and + target. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_PKTTYPE + tristate "ebt: packet type filter support" + help + This option adds the packet type match, which allows matching on the + type of packet based on its Ethernet "class" (as determined by + the generic networking code): broadcast, multicast, + for this host alone or for another host. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_STP + tristate "ebt: STP filter support" + help + This option adds the Spanning Tree Protocol match, which + allows STP header field filtering. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_VLAN + tristate "ebt: 802.1Q VLAN filter support" + help + This option adds the 802.1Q vlan match, which allows the filtering of + 802.1Q vlan fields. + + To compile it as a module, choose M here. If unsure, say N. +# +# targets +# +config BRIDGE_EBT_ARPREPLY + tristate "ebt: arp reply target support" + depends on BRIDGE_NF_EBTABLES && INET + help + This option adds the arp reply target, which allows + automatically sending arp replies to arp requests. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_DNAT + tristate "ebt: dnat target support" + help + This option adds the MAC DNAT target, which allows altering the MAC + destination address of frames. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_MARK_T + tristate "ebt: mark target support" + help + This option adds the mark target, which allows marking frames by + setting the 'nfmark' value in the frame. + This value is the same as the one used in the iptables mark match and + target. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_REDIRECT + tristate "ebt: redirect target support" + help + This option adds the MAC redirect target, which allows altering the MAC + destination address of a frame to that of the device it arrived on. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_SNAT + tristate "ebt: snat target support" + help + This option adds the MAC SNAT target, which allows altering the MAC + source address of frames. + + To compile it as a module, choose M here. If unsure, say N. +# +# watchers +# +config BRIDGE_EBT_LOG + tristate "ebt: log support" + help + This option adds the log watcher, that you can use in any rule + in any ebtables table. It records info about the frame header + to the syslog. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_ULOG + tristate "ebt: ulog support (OBSOLETE)" + help + This option enables the old bridge-specific "ebt_ulog" implementation + which has been obsoleted by the new "nfnetlink_log" code (see + CONFIG_NETFILTER_NETLINK_LOG). + + This option adds the ulog watcher, that you can use in any rule + in any ebtables table. The packet is passed to a userspace + logging daemon using netlink multicast sockets. This differs + from the log watcher in the sense that the complete packet is + sent to userspace instead of a descriptive text and that + netlink multicast sockets are used instead of the syslog. + + To compile it as a module, choose M here. 
If unsure, say N. + +config BRIDGE_EBT_NFLOG + tristate "ebt: nflog support" + help + This option enables the nflog watcher, which allows to LOG + messages through the netfilter logging API, which can use + either the old LOG target, the old ULOG target or nfnetlink_log + as backend. + + This option adds the nflog watcher, that you can use in any rule + in any ebtables table. + + To compile it as a module, choose M here. If unsure, say N. + +endif # BRIDGE_NF_EBTABLES diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile new file mode 100644 index 00000000..07186995 --- /dev/null +++ b/net/bridge/netfilter/Makefile @@ -0,0 +1,34 @@ +# +# Makefile for the netfilter modules for Link Layer filtering on a bridge. +# + +obj-$(CONFIG_BRIDGE_NF_EBTABLES) += ebtables.o + +# tables +obj-$(CONFIG_BRIDGE_EBT_BROUTE) += ebtable_broute.o +obj-$(CONFIG_BRIDGE_EBT_T_FILTER) += ebtable_filter.o +obj-$(CONFIG_BRIDGE_EBT_T_NAT) += ebtable_nat.o + +#matches +obj-$(CONFIG_BRIDGE_EBT_802_3) += ebt_802_3.o +obj-$(CONFIG_BRIDGE_EBT_AMONG) += ebt_among.o +obj-$(CONFIG_BRIDGE_EBT_ARP) += ebt_arp.o +obj-$(CONFIG_BRIDGE_EBT_IP) += ebt_ip.o +obj-$(CONFIG_BRIDGE_EBT_IP6) += ebt_ip6.o +obj-$(CONFIG_BRIDGE_EBT_LIMIT) += ebt_limit.o +obj-$(CONFIG_BRIDGE_EBT_MARK) += ebt_mark_m.o +obj-$(CONFIG_BRIDGE_EBT_PKTTYPE) += ebt_pkttype.o +obj-$(CONFIG_BRIDGE_EBT_STP) += ebt_stp.o +obj-$(CONFIG_BRIDGE_EBT_VLAN) += ebt_vlan.o + +# targets +obj-$(CONFIG_BRIDGE_EBT_ARPREPLY) += ebt_arpreply.o +obj-$(CONFIG_BRIDGE_EBT_MARK_T) += ebt_mark.o +obj-$(CONFIG_BRIDGE_EBT_DNAT) += ebt_dnat.o +obj-$(CONFIG_BRIDGE_EBT_REDIRECT) += ebt_redirect.o +obj-$(CONFIG_BRIDGE_EBT_SNAT) += ebt_snat.o + +# watchers +obj-$(CONFIG_BRIDGE_EBT_LOG) += ebt_log.o +obj-$(CONFIG_BRIDGE_EBT_ULOG) += ebt_ulog.o +obj-$(CONFIG_BRIDGE_EBT_NFLOG) += ebt_nflog.o diff --git a/net/bridge/netfilter/ebt_802_3.c b/net/bridge/netfilter/ebt_802_3.c new file mode 100644 index 00000000..2a449b7a --- /dev/null +++ b/net/bridge/netfilter/ebt_802_3.c @@ -0,0 +1,72 @@ +/* + * 802_3 + * + * Author: + * Chris Vitale csv@bluetail.com + * + * May 2003 + * + */ +#include +#include +#include +#include + +static bool +ebt_802_3_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ebt_802_3_info *info = par->matchinfo; + const struct ebt_802_3_hdr *hdr = ebt_802_3_hdr(skb); + __be16 type = hdr->llc.ui.ctrl & IS_UI ? 
hdr->llc.ui.type : hdr->llc.ni.type; + + if (info->bitmask & EBT_802_3_SAP) { + if (FWINV(info->sap != hdr->llc.ui.ssap, EBT_802_3_SAP)) + return false; + if (FWINV(info->sap != hdr->llc.ui.dsap, EBT_802_3_SAP)) + return false; + } + + if (info->bitmask & EBT_802_3_TYPE) { + if (!(hdr->llc.ui.dsap == CHECK_TYPE && hdr->llc.ui.ssap == CHECK_TYPE)) + return false; + if (FWINV(info->type != type, EBT_802_3_TYPE)) + return false; + } + + return true; +} + +static int ebt_802_3_mt_check(const struct xt_mtchk_param *par) +{ + const struct ebt_802_3_info *info = par->matchinfo; + + if (info->bitmask & ~EBT_802_3_MASK || info->invflags & ~EBT_802_3_MASK) + return -EINVAL; + + return 0; +} + +static struct xt_match ebt_802_3_mt_reg __read_mostly = { + .name = "802_3", + .revision = 0, + .family = NFPROTO_BRIDGE, + .match = ebt_802_3_mt, + .checkentry = ebt_802_3_mt_check, + .matchsize = sizeof(struct ebt_802_3_info), + .me = THIS_MODULE, +}; + +static int __init ebt_802_3_init(void) +{ + return xt_register_match(&ebt_802_3_mt_reg); +} + +static void __exit ebt_802_3_fini(void) +{ + xt_unregister_match(&ebt_802_3_mt_reg); +} + +module_init(ebt_802_3_init); +module_exit(ebt_802_3_fini); +MODULE_DESCRIPTION("Ebtables: DSAP/SSAP field and SNAP type matching"); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c new file mode 100644 index 00000000..8b84c581 --- /dev/null +++ b/net/bridge/netfilter/ebt_among.c @@ -0,0 +1,229 @@ +/* + * ebt_among + * + * Authors: + * Grzegorz Borowiak + * + * August, 2003 + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include + +static bool ebt_mac_wormhash_contains(const struct ebt_mac_wormhash *wh, + const char *mac, __be32 ip) +{ + /* You may be puzzled as to how this code works. + * Some tricks were used, refer to + * include/linux/netfilter_bridge/ebt_among.h + * as there you can find a solution of this mystery. 
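+ * Roughly: wh->table[] has 257 entries indexed by the last byte of
+ * the MAC address; table[key]..table[key+1] bound the slice of
+ * wh->pool[] that holds every tuple whose MAC ends in that byte.
+ * Each tuple keeps its MAC in bytes 2..7 of an 8-byte field viewed as
+ * two 32-bit words, mirroring how 'cmp' is built below, so a lookup
+ * is just two word compares. A tuple whose ip is zero matches any
+ * packet address; a non-zero tuple ip must match exactly.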
+ */ + const struct ebt_mac_wormhash_tuple *p; + int start, limit, i; + uint32_t cmp[2] = { 0, 0 }; + int key = ((const unsigned char *)mac)[5]; + + memcpy(((char *) cmp) + 2, mac, 6); + start = wh->table[key]; + limit = wh->table[key + 1]; + if (ip) { + for (i = start; i < limit; i++) { + p = &wh->pool[i]; + if (cmp[1] == p->cmp[1] && cmp[0] == p->cmp[0]) + if (p->ip == 0 || p->ip == ip) + return true; + } + } else { + for (i = start; i < limit; i++) { + p = &wh->pool[i]; + if (cmp[1] == p->cmp[1] && cmp[0] == p->cmp[0]) + if (p->ip == 0) + return true; + } + } + return false; +} + +static int ebt_mac_wormhash_check_integrity(const struct ebt_mac_wormhash + *wh) +{ + int i; + + for (i = 0; i < 256; i++) { + if (wh->table[i] > wh->table[i + 1]) + return -0x100 - i; + if (wh->table[i] < 0) + return -0x200 - i; + if (wh->table[i] > wh->poolsize) + return -0x300 - i; + } + if (wh->table[256] > wh->poolsize) + return -0xc00; + return 0; +} + +static int get_ip_dst(const struct sk_buff *skb, __be32 *addr) +{ + if (eth_hdr(skb)->h_proto == htons(ETH_P_IP)) { + const struct iphdr *ih; + struct iphdr _iph; + + ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); + if (ih == NULL) + return -1; + *addr = ih->daddr; + } else if (eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) { + const struct arphdr *ah; + struct arphdr _arph; + const __be32 *bp; + __be32 buf; + + ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); + if (ah == NULL || + ah->ar_pln != sizeof(__be32) || + ah->ar_hln != ETH_ALEN) + return -1; + bp = skb_header_pointer(skb, sizeof(struct arphdr) + + 2 * ETH_ALEN + sizeof(__be32), + sizeof(__be32), &buf); + if (bp == NULL) + return -1; + *addr = *bp; + } + return 0; +} + +static int get_ip_src(const struct sk_buff *skb, __be32 *addr) +{ + if (eth_hdr(skb)->h_proto == htons(ETH_P_IP)) { + const struct iphdr *ih; + struct iphdr _iph; + + ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); + if (ih == NULL) + return -1; + *addr = ih->saddr; + } else if (eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) { + const struct arphdr *ah; + struct arphdr _arph; + const __be32 *bp; + __be32 buf; + + ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); + if (ah == NULL || + ah->ar_pln != sizeof(__be32) || + ah->ar_hln != ETH_ALEN) + return -1; + bp = skb_header_pointer(skb, sizeof(struct arphdr) + + ETH_ALEN, sizeof(__be32), &buf); + if (bp == NULL) + return -1; + *addr = *bp; + } + return 0; +} + +static bool +ebt_among_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ebt_among_info *info = par->matchinfo; + const char *dmac, *smac; + const struct ebt_mac_wormhash *wh_dst, *wh_src; + __be32 dip = 0, sip = 0; + + wh_dst = ebt_among_wh_dst(info); + wh_src = ebt_among_wh_src(info); + + if (wh_src) { + smac = eth_hdr(skb)->h_source; + if (get_ip_src(skb, &sip)) + return false; + if (!(info->bitmask & EBT_AMONG_SRC_NEG)) { + /* we match only if it contains */ + if (!ebt_mac_wormhash_contains(wh_src, smac, sip)) + return false; + } else { + /* we match only if it DOES NOT contain */ + if (ebt_mac_wormhash_contains(wh_src, smac, sip)) + return false; + } + } + + if (wh_dst) { + dmac = eth_hdr(skb)->h_dest; + if (get_ip_dst(skb, &dip)) + return false; + if (!(info->bitmask & EBT_AMONG_DST_NEG)) { + /* we match only if it contains */ + if (!ebt_mac_wormhash_contains(wh_dst, dmac, dip)) + return false; + } else { + /* we match only if it DOES NOT contain */ + if (ebt_mac_wormhash_contains(wh_dst, dmac, dip)) + return false; + } + } + + return true; +} + +static int 
ebt_among_mt_check(const struct xt_mtchk_param *par) +{ + const struct ebt_among_info *info = par->matchinfo; + const struct ebt_entry_match *em = + container_of(par->matchinfo, const struct ebt_entry_match, data); + int expected_length = sizeof(struct ebt_among_info); + const struct ebt_mac_wormhash *wh_dst, *wh_src; + int err; + + wh_dst = ebt_among_wh_dst(info); + wh_src = ebt_among_wh_src(info); + expected_length += ebt_mac_wormhash_size(wh_dst); + expected_length += ebt_mac_wormhash_size(wh_src); + + if (em->match_size != EBT_ALIGN(expected_length)) { + pr_info("wrong size: %d against expected %d, rounded to %Zd\n", + em->match_size, expected_length, + EBT_ALIGN(expected_length)); + return -EINVAL; + } + if (wh_dst && (err = ebt_mac_wormhash_check_integrity(wh_dst))) { + pr_info("dst integrity fail: %x\n", -err); + return -EINVAL; + } + if (wh_src && (err = ebt_mac_wormhash_check_integrity(wh_src))) { + pr_info("src integrity fail: %x\n", -err); + return -EINVAL; + } + return 0; +} + +static struct xt_match ebt_among_mt_reg __read_mostly = { + .name = "among", + .revision = 0, + .family = NFPROTO_BRIDGE, + .match = ebt_among_mt, + .checkentry = ebt_among_mt_check, + .matchsize = -1, /* special case */ + .me = THIS_MODULE, +}; + +static int __init ebt_among_init(void) +{ + return xt_register_match(&ebt_among_mt_reg); +} + +static void __exit ebt_among_fini(void) +{ + xt_unregister_match(&ebt_among_mt_reg); +} + +module_init(ebt_among_init); +module_exit(ebt_among_fini); +MODULE_DESCRIPTION("Ebtables: Combined MAC/IP address list matching"); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c new file mode 100644 index 00000000..cd457b89 --- /dev/null +++ b/net/bridge/netfilter/ebt_arp.c @@ -0,0 +1,140 @@ +/* + * ebt_arp + * + * Authors: + * Bart De Schuymer + * Tim Gardner + * + * April, 2002 + * + */ +#include +#include +#include +#include +#include +#include + +static bool +ebt_arp_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ebt_arp_info *info = par->matchinfo; + const struct arphdr *ah; + struct arphdr _arph; + + ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); + if (ah == NULL) + return false; + if (info->bitmask & EBT_ARP_OPCODE && FWINV(info->opcode != + ah->ar_op, EBT_ARP_OPCODE)) + return false; + if (info->bitmask & EBT_ARP_HTYPE && FWINV(info->htype != + ah->ar_hrd, EBT_ARP_HTYPE)) + return false; + if (info->bitmask & EBT_ARP_PTYPE && FWINV(info->ptype != + ah->ar_pro, EBT_ARP_PTYPE)) + return false; + + if (info->bitmask & (EBT_ARP_SRC_IP | EBT_ARP_DST_IP | EBT_ARP_GRAT)) { + const __be32 *sap, *dap; + __be32 saddr, daddr; + + if (ah->ar_pln != sizeof(__be32) || ah->ar_pro != htons(ETH_P_IP)) + return false; + sap = skb_header_pointer(skb, sizeof(struct arphdr) + + ah->ar_hln, sizeof(saddr), + &saddr); + if (sap == NULL) + return false; + dap = skb_header_pointer(skb, sizeof(struct arphdr) + + 2*ah->ar_hln+sizeof(saddr), + sizeof(daddr), &daddr); + if (dap == NULL) + return false; + if (info->bitmask & EBT_ARP_SRC_IP && + FWINV(info->saddr != (*sap & info->smsk), EBT_ARP_SRC_IP)) + return false; + if (info->bitmask & EBT_ARP_DST_IP && + FWINV(info->daddr != (*dap & info->dmsk), EBT_ARP_DST_IP)) + return false; + if (info->bitmask & EBT_ARP_GRAT && + FWINV(*dap != *sap, EBT_ARP_GRAT)) + return false; + } + + if (info->bitmask & (EBT_ARP_SRC_MAC | EBT_ARP_DST_MAC)) { + const unsigned char *mp; + unsigned char _mac[ETH_ALEN]; + uint8_t verdict, i; + + if (ah->ar_hln != ETH_ALEN || 
ah->ar_hrd != htons(ARPHRD_ETHER)) + return false; + if (info->bitmask & EBT_ARP_SRC_MAC) { + mp = skb_header_pointer(skb, sizeof(struct arphdr), + sizeof(_mac), &_mac); + if (mp == NULL) + return false; + verdict = 0; + for (i = 0; i < 6; i++) + verdict |= (mp[i] ^ info->smaddr[i]) & + info->smmsk[i]; + if (FWINV(verdict != 0, EBT_ARP_SRC_MAC)) + return false; + } + + if (info->bitmask & EBT_ARP_DST_MAC) { + mp = skb_header_pointer(skb, sizeof(struct arphdr) + + ah->ar_hln + ah->ar_pln, + sizeof(_mac), &_mac); + if (mp == NULL) + return false; + verdict = 0; + for (i = 0; i < 6; i++) + verdict |= (mp[i] ^ info->dmaddr[i]) & + info->dmmsk[i]; + if (FWINV(verdict != 0, EBT_ARP_DST_MAC)) + return false; + } + } + + return true; +} + +static int ebt_arp_mt_check(const struct xt_mtchk_param *par) +{ + const struct ebt_arp_info *info = par->matchinfo; + const struct ebt_entry *e = par->entryinfo; + + if ((e->ethproto != htons(ETH_P_ARP) && + e->ethproto != htons(ETH_P_RARP)) || + e->invflags & EBT_IPROTO) + return -EINVAL; + if (info->bitmask & ~EBT_ARP_MASK || info->invflags & ~EBT_ARP_MASK) + return -EINVAL; + return 0; +} + +static struct xt_match ebt_arp_mt_reg __read_mostly = { + .name = "arp", + .revision = 0, + .family = NFPROTO_BRIDGE, + .match = ebt_arp_mt, + .checkentry = ebt_arp_mt_check, + .matchsize = sizeof(struct ebt_arp_info), + .me = THIS_MODULE, +}; + +static int __init ebt_arp_init(void) +{ + return xt_register_match(&ebt_arp_mt_reg); +} + +static void __exit ebt_arp_fini(void) +{ + xt_unregister_match(&ebt_arp_mt_reg); +} + +module_init(ebt_arp_init); +module_exit(ebt_arp_fini); +MODULE_DESCRIPTION("Ebtables: ARP protocol packet match"); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c new file mode 100644 index 00000000..070cf134 --- /dev/null +++ b/net/bridge/netfilter/ebt_arpreply.c @@ -0,0 +1,98 @@ +/* + * ebt_arpreply + * + * Authors: + * Grzegorz Borowiak + * Bart De Schuymer + * + * August, 2003 + * + */ +#include +#include +#include +#include +#include +#include + +static unsigned int +ebt_arpreply_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct ebt_arpreply_info *info = par->targinfo; + const __be32 *siptr, *diptr; + __be32 _sip, _dip; + const struct arphdr *ap; + struct arphdr _ah; + const unsigned char *shp; + unsigned char _sha[ETH_ALEN]; + + ap = skb_header_pointer(skb, 0, sizeof(_ah), &_ah); + if (ap == NULL) + return EBT_DROP; + + if (ap->ar_op != htons(ARPOP_REQUEST) || + ap->ar_hln != ETH_ALEN || + ap->ar_pro != htons(ETH_P_IP) || + ap->ar_pln != 4) + return EBT_CONTINUE; + + shp = skb_header_pointer(skb, sizeof(_ah), ETH_ALEN, &_sha); + if (shp == NULL) + return EBT_DROP; + + siptr = skb_header_pointer(skb, sizeof(_ah) + ETH_ALEN, + sizeof(_sip), &_sip); + if (siptr == NULL) + return EBT_DROP; + + diptr = skb_header_pointer(skb, + sizeof(_ah) + 2 * ETH_ALEN + sizeof(_sip), + sizeof(_dip), &_dip); + if (diptr == NULL) + return EBT_DROP; + + arp_send(ARPOP_REPLY, ETH_P_ARP, *siptr, (struct net_device *)par->in, + *diptr, shp, info->mac, shp); + + return info->target; +} + +static int ebt_arpreply_tg_check(const struct xt_tgchk_param *par) +{ + const struct ebt_arpreply_info *info = par->targinfo; + const struct ebt_entry *e = par->entryinfo; + + if (BASE_CHAIN && info->target == EBT_RETURN) + return -EINVAL; + if (e->ethproto != htons(ETH_P_ARP) || + e->invflags & EBT_IPROTO) + return -EINVAL; + return 0; +} + +static struct xt_target ebt_arpreply_tg_reg __read_mostly = 
{ + .name = "arpreply", + .revision = 0, + .family = NFPROTO_BRIDGE, + .table = "nat", + .hooks = (1 << NF_BR_NUMHOOKS) | (1 << NF_BR_PRE_ROUTING), + .target = ebt_arpreply_tg, + .checkentry = ebt_arpreply_tg_check, + .targetsize = sizeof(struct ebt_arpreply_info), + .me = THIS_MODULE, +}; + +static int __init ebt_arpreply_init(void) +{ + return xt_register_target(&ebt_arpreply_tg_reg); +} + +static void __exit ebt_arpreply_fini(void) +{ + xt_unregister_target(&ebt_arpreply_tg_reg); +} + +module_init(ebt_arpreply_init); +module_exit(ebt_arpreply_fini); +MODULE_DESCRIPTION("Ebtables: ARP reply target"); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c new file mode 100644 index 00000000..c59f7bfa --- /dev/null +++ b/net/bridge/netfilter/ebt_dnat.c @@ -0,0 +1,74 @@ +/* + * ebt_dnat + * + * Authors: + * Bart De Schuymer + * + * June, 2002 + * + */ +#include +#include +#include +#include +#include +#include + +static unsigned int +ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct ebt_nat_info *info = par->targinfo; + + if (!skb_make_writable(skb, 0)) + return EBT_DROP; + + memcpy(eth_hdr(skb)->h_dest, info->mac, ETH_ALEN); + return info->target; +} + +static int ebt_dnat_tg_check(const struct xt_tgchk_param *par) +{ + const struct ebt_nat_info *info = par->targinfo; + unsigned int hook_mask; + + if (BASE_CHAIN && info->target == EBT_RETURN) + return -EINVAL; + + hook_mask = par->hook_mask & ~(1 << NF_BR_NUMHOOKS); + if ((strcmp(par->table, "nat") != 0 || + (hook_mask & ~((1 << NF_BR_PRE_ROUTING) | + (1 << NF_BR_LOCAL_OUT)))) && + (strcmp(par->table, "broute") != 0 || + hook_mask & ~(1 << NF_BR_BROUTING))) + return -EINVAL; + if (INVALID_TARGET) + return -EINVAL; + return 0; +} + +static struct xt_target ebt_dnat_tg_reg __read_mostly = { + .name = "dnat", + .revision = 0, + .family = NFPROTO_BRIDGE, + .hooks = (1 << NF_BR_NUMHOOKS) | (1 << NF_BR_PRE_ROUTING) | + (1 << NF_BR_LOCAL_OUT) | (1 << NF_BR_BROUTING), + .target = ebt_dnat_tg, + .checkentry = ebt_dnat_tg_check, + .targetsize = sizeof(struct ebt_nat_info), + .me = THIS_MODULE, +}; + +static int __init ebt_dnat_init(void) +{ + return xt_register_target(&ebt_dnat_tg_reg); +} + +static void __exit ebt_dnat_fini(void) +{ + xt_unregister_target(&ebt_dnat_tg_reg); +} + +module_init(ebt_dnat_init); +module_exit(ebt_dnat_fini); +MODULE_DESCRIPTION("Ebtables: Destination MAC address translation"); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c new file mode 100644 index 00000000..23bca62d --- /dev/null +++ b/net/bridge/netfilter/ebt_ip.c @@ -0,0 +1,130 @@ +/* + * ebt_ip + * + * Authors: + * Bart De Schuymer + * + * April, 2002 + * + * Changes: + * added ip-sport and ip-dport + * Innominate Security Technologies AG + * September, 2002 + */ +#include +#include +#include +#include +#include +#include +#include + +struct tcpudphdr { + __be16 src; + __be16 dst; +}; + +static bool +ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ebt_ip_info *info = par->matchinfo; + const struct iphdr *ih; + struct iphdr _iph; + const struct tcpudphdr *pptr; + struct tcpudphdr _ports; + + ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); + if (ih == NULL) + return false; + if (info->bitmask & EBT_IP_TOS && + FWINV(info->tos != ih->tos, EBT_IP_TOS)) + return false; + if (info->bitmask & EBT_IP_SOURCE && + FWINV((ih->saddr & info->smsk) != + info->saddr, EBT_IP_SOURCE)) + return false; + if 
((info->bitmask & EBT_IP_DEST) && + FWINV((ih->daddr & info->dmsk) != + info->daddr, EBT_IP_DEST)) + return false; + if (info->bitmask & EBT_IP_PROTO) { + if (FWINV(info->protocol != ih->protocol, EBT_IP_PROTO)) + return false; + if (!(info->bitmask & EBT_IP_DPORT) && + !(info->bitmask & EBT_IP_SPORT)) + return true; + if (ntohs(ih->frag_off) & IP_OFFSET) + return false; + pptr = skb_header_pointer(skb, ih->ihl*4, + sizeof(_ports), &_ports); + if (pptr == NULL) + return false; + if (info->bitmask & EBT_IP_DPORT) { + u32 dst = ntohs(pptr->dst); + if (FWINV(dst < info->dport[0] || + dst > info->dport[1], + EBT_IP_DPORT)) + return false; + } + if (info->bitmask & EBT_IP_SPORT) { + u32 src = ntohs(pptr->src); + if (FWINV(src < info->sport[0] || + src > info->sport[1], + EBT_IP_SPORT)) + return false; + } + } + return true; +} + +static int ebt_ip_mt_check(const struct xt_mtchk_param *par) +{ + const struct ebt_ip_info *info = par->matchinfo; + const struct ebt_entry *e = par->entryinfo; + + if (e->ethproto != htons(ETH_P_IP) || + e->invflags & EBT_IPROTO) + return -EINVAL; + if (info->bitmask & ~EBT_IP_MASK || info->invflags & ~EBT_IP_MASK) + return -EINVAL; + if (info->bitmask & (EBT_IP_DPORT | EBT_IP_SPORT)) { + if (info->invflags & EBT_IP_PROTO) + return -EINVAL; + if (info->protocol != IPPROTO_TCP && + info->protocol != IPPROTO_UDP && + info->protocol != IPPROTO_UDPLITE && + info->protocol != IPPROTO_SCTP && + info->protocol != IPPROTO_DCCP) + return -EINVAL; + } + if (info->bitmask & EBT_IP_DPORT && info->dport[0] > info->dport[1]) + return -EINVAL; + if (info->bitmask & EBT_IP_SPORT && info->sport[0] > info->sport[1]) + return -EINVAL; + return 0; +} + +static struct xt_match ebt_ip_mt_reg __read_mostly = { + .name = "ip", + .revision = 0, + .family = NFPROTO_BRIDGE, + .match = ebt_ip_mt, + .checkentry = ebt_ip_mt_check, + .matchsize = sizeof(struct ebt_ip_info), + .me = THIS_MODULE, +}; + +static int __init ebt_ip_init(void) +{ + return xt_register_match(&ebt_ip_mt_reg); +} + +static void __exit ebt_ip_fini(void) +{ + xt_unregister_match(&ebt_ip_mt_reg); +} + +module_init(ebt_ip_init); +module_exit(ebt_ip_fini); +MODULE_DESCRIPTION("Ebtables: IPv4 protocol packet match"); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c new file mode 100644 index 00000000..2ed0056a --- /dev/null +++ b/net/bridge/netfilter/ebt_ip6.c @@ -0,0 +1,155 @@ +/* + * ebt_ip6 + * + * Authors: + * Manohar Castelino + * Kuo-Lang Tseng + * Jan Engelhardt + * + * Summary: + * This is just a modification of the IPv4 code written by + * Bart De Schuymer + * with the changes required to support IPv6 + * + * Jan, 2008 + */ +#include +#include +#include +#include +#include +#include +#include +#include + +union pkthdr { + struct { + __be16 src; + __be16 dst; + } tcpudphdr; + struct { + u8 type; + u8 code; + } icmphdr; +}; + +static bool +ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ebt_ip6_info *info = par->matchinfo; + const struct ipv6hdr *ih6; + struct ipv6hdr _ip6h; + const union pkthdr *pptr; + union pkthdr _pkthdr; + + ih6 = skb_header_pointer(skb, 0, sizeof(_ip6h), &_ip6h); + if (ih6 == NULL) + return false; + if (info->bitmask & EBT_IP6_TCLASS && + FWINV(info->tclass != ipv6_get_dsfield(ih6), EBT_IP6_TCLASS)) + return false; + if (FWINV(ipv6_masked_addr_cmp(&ih6->saddr, &info->smsk, + &info->saddr), EBT_IP6_SOURCE) || + FWINV(ipv6_masked_addr_cmp(&ih6->daddr, &info->dmsk, + &info->daddr), EBT_IP6_DEST)) + return false; + if 
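/* For IPv6 the transport protocol cannot be read at a fixed offset: the
 * block below first walks any extension headers with ipv6_skip_exthdr()
 * to find the real next header and its offset, and only then applies the
 * port / ICMPv6 type+code range checks.  The 4-byte pkthdr union covers
 * either case, as the in-code comment notes. */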
(info->bitmask & EBT_IP6_PROTO) { + uint8_t nexthdr = ih6->nexthdr; + int offset_ph; + + offset_ph = ipv6_skip_exthdr(skb, sizeof(_ip6h), &nexthdr); + if (offset_ph == -1) + return false; + if (FWINV(info->protocol != nexthdr, EBT_IP6_PROTO)) + return false; + if (!(info->bitmask & ( EBT_IP6_DPORT | + EBT_IP6_SPORT | EBT_IP6_ICMP6))) + return true; + + /* min icmpv6 headersize is 4, so sizeof(_pkthdr) is ok. */ + pptr = skb_header_pointer(skb, offset_ph, sizeof(_pkthdr), + &_pkthdr); + if (pptr == NULL) + return false; + if (info->bitmask & EBT_IP6_DPORT) { + u16 dst = ntohs(pptr->tcpudphdr.dst); + if (FWINV(dst < info->dport[0] || + dst > info->dport[1], EBT_IP6_DPORT)) + return false; + } + if (info->bitmask & EBT_IP6_SPORT) { + u16 src = ntohs(pptr->tcpudphdr.src); + if (FWINV(src < info->sport[0] || + src > info->sport[1], EBT_IP6_SPORT)) + return false; + } + if ((info->bitmask & EBT_IP6_ICMP6) && + FWINV(pptr->icmphdr.type < info->icmpv6_type[0] || + pptr->icmphdr.type > info->icmpv6_type[1] || + pptr->icmphdr.code < info->icmpv6_code[0] || + pptr->icmphdr.code > info->icmpv6_code[1], + EBT_IP6_ICMP6)) + return false; + } + return true; +} + +static int ebt_ip6_mt_check(const struct xt_mtchk_param *par) +{ + const struct ebt_entry *e = par->entryinfo; + struct ebt_ip6_info *info = par->matchinfo; + + if (e->ethproto != htons(ETH_P_IPV6) || e->invflags & EBT_IPROTO) + return -EINVAL; + if (info->bitmask & ~EBT_IP6_MASK || info->invflags & ~EBT_IP6_MASK) + return -EINVAL; + if (info->bitmask & (EBT_IP6_DPORT | EBT_IP6_SPORT)) { + if (info->invflags & EBT_IP6_PROTO) + return -EINVAL; + if (info->protocol != IPPROTO_TCP && + info->protocol != IPPROTO_UDP && + info->protocol != IPPROTO_UDPLITE && + info->protocol != IPPROTO_SCTP && + info->protocol != IPPROTO_DCCP) + return -EINVAL; + } + if (info->bitmask & EBT_IP6_DPORT && info->dport[0] > info->dport[1]) + return -EINVAL; + if (info->bitmask & EBT_IP6_SPORT && info->sport[0] > info->sport[1]) + return -EINVAL; + if (info->bitmask & EBT_IP6_ICMP6) { + if ((info->invflags & EBT_IP6_PROTO) || + info->protocol != IPPROTO_ICMPV6) + return -EINVAL; + if (info->icmpv6_type[0] > info->icmpv6_type[1] || + info->icmpv6_code[0] > info->icmpv6_code[1]) + return -EINVAL; + } + return 0; +} + +static struct xt_match ebt_ip6_mt_reg __read_mostly = { + .name = "ip6", + .revision = 0, + .family = NFPROTO_BRIDGE, + .match = ebt_ip6_mt, + .checkentry = ebt_ip6_mt_check, + .matchsize = sizeof(struct ebt_ip6_info), + .me = THIS_MODULE, +}; + +static int __init ebt_ip6_init(void) +{ + return xt_register_match(&ebt_ip6_mt_reg); +} + +static void __exit ebt_ip6_fini(void) +{ + xt_unregister_match(&ebt_ip6_mt_reg); +} + +module_init(ebt_ip6_init); +module_exit(ebt_ip6_fini); +MODULE_DESCRIPTION("Ebtables: IPv6 protocol packet match"); +MODULE_AUTHOR("Kuo-Lang Tseng "); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c new file mode 100644 index 00000000..517e78be --- /dev/null +++ b/net/bridge/netfilter/ebt_limit.c @@ -0,0 +1,127 @@ +/* + * ebt_limit + * + * Authors: + * Tom Marshall + * + * Mostly copied from netfilter's ipt_limit.c, see that file for + * more explanation + * + * September, 2003 + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(limit_lock); + +#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24)) + +#define _POW2_BELOW2(x) ((x)|((x)>>1)) +#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2)) +#define 
_POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4)) +#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8)) +#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16)) +#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1) + +#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) + +static bool +ebt_limit_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct ebt_limit_info *info = (void *)par->matchinfo; + unsigned long now = jiffies; + + spin_lock_bh(&limit_lock); + info->credit += (now - xchg(&info->prev, now)) * CREDITS_PER_JIFFY; + if (info->credit > info->credit_cap) + info->credit = info->credit_cap; + + if (info->credit >= info->cost) { + /* We're not limited. */ + info->credit -= info->cost; + spin_unlock_bh(&limit_lock); + return true; + } + + spin_unlock_bh(&limit_lock); + return false; +} + +/* Precision saver. */ +static u_int32_t +user2credits(u_int32_t user) +{ + /* If multiplying would overflow... */ + if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) + /* Divide first. */ + return (user / EBT_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY; + + return (user * HZ * CREDITS_PER_JIFFY) / EBT_LIMIT_SCALE; +} + +static int ebt_limit_mt_check(const struct xt_mtchk_param *par) +{ + struct ebt_limit_info *info = par->matchinfo; + + /* Check for overflow. */ + if (info->burst == 0 || + user2credits(info->avg * info->burst) < user2credits(info->avg)) { + pr_info("overflow, try lower: %u/%u\n", + info->avg, info->burst); + return -EINVAL; + } + + /* User avg in seconds * EBT_LIMIT_SCALE: convert to jiffies * 128. */ + info->prev = jiffies; + info->credit = user2credits(info->avg * info->burst); + info->credit_cap = user2credits(info->avg * info->burst); + info->cost = user2credits(info->avg); + return 0; +} + + +#ifdef CONFIG_COMPAT +/* + * no conversion function needed -- + * only avg/burst have meaningful values in userspace. 
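 * (ebt_limit_mt_check() above recomputes prev, credit, credit_cap and
 * cost from avg/burst via user2credits() when the rule is loaded, so
 * those kernel-internal fields never need to be translated between the
 * compat and native layouts.)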
+ */ +struct ebt_compat_limit_info { + compat_uint_t avg, burst; + compat_ulong_t prev; + compat_uint_t credit, credit_cap, cost; +}; +#endif + +static struct xt_match ebt_limit_mt_reg __read_mostly = { + .name = "limit", + .revision = 0, + .family = NFPROTO_BRIDGE, + .match = ebt_limit_mt, + .checkentry = ebt_limit_mt_check, + .matchsize = sizeof(struct ebt_limit_info), +#ifdef CONFIG_COMPAT + .compatsize = sizeof(struct ebt_compat_limit_info), +#endif + .me = THIS_MODULE, +}; + +static int __init ebt_limit_init(void) +{ + return xt_register_match(&ebt_limit_mt_reg); +} + +static void __exit ebt_limit_fini(void) +{ + xt_unregister_match(&ebt_limit_mt_reg); +} + +module_init(ebt_limit_init); +module_exit(ebt_limit_fini); +MODULE_DESCRIPTION("Ebtables: Rate-limit match"); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c new file mode 100644 index 00000000..6e5a8bb9 --- /dev/null +++ b/net/bridge/netfilter/ebt_log.c @@ -0,0 +1,228 @@ +/* + * ebt_log + * + * Authors: + * Bart De Schuymer + * Harald Welte + * + * April, 2002 + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(ebt_log_lock); + +static int ebt_log_tg_check(const struct xt_tgchk_param *par) +{ + struct ebt_log_info *info = par->targinfo; + + if (info->bitmask & ~EBT_LOG_MASK) + return -EINVAL; + if (info->loglevel >= 8) + return -EINVAL; + info->prefix[EBT_LOG_PREFIX_SIZE - 1] = '\0'; + return 0; +} + +struct tcpudphdr +{ + __be16 src; + __be16 dst; +}; + +struct arppayload +{ + unsigned char mac_src[ETH_ALEN]; + unsigned char ip_src[4]; + unsigned char mac_dst[ETH_ALEN]; + unsigned char ip_dst[4]; +}; + +static void +print_ports(const struct sk_buff *skb, uint8_t protocol, int offset) +{ + if (protocol == IPPROTO_TCP || + protocol == IPPROTO_UDP || + protocol == IPPROTO_UDPLITE || + protocol == IPPROTO_SCTP || + protocol == IPPROTO_DCCP) { + const struct tcpudphdr *pptr; + struct tcpudphdr _ports; + + pptr = skb_header_pointer(skb, offset, + sizeof(_ports), &_ports); + if (pptr == NULL) { + printk(" INCOMPLETE TCP/UDP header"); + return; + } + printk(" SPT=%u DPT=%u", ntohs(pptr->src), ntohs(pptr->dst)); + } +} + +static void +ebt_log_packet(u_int8_t pf, unsigned int hooknum, + const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct nf_loginfo *loginfo, + const char *prefix) +{ + unsigned int bitmask; + + spin_lock_bh(&ebt_log_lock); + printk("<%c>%s IN=%s OUT=%s MAC source = %pM MAC dest = %pM proto = 0x%04x", + '0' + loginfo->u.log.level, prefix, + in ? in->name : "", out ? 
out->name : "", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + ntohs(eth_hdr(skb)->h_proto)); + + if (loginfo->type == NF_LOG_TYPE_LOG) + bitmask = loginfo->u.log.logflags; + else + bitmask = NF_LOG_MASK; + + if ((bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto == + htons(ETH_P_IP)){ + const struct iphdr *ih; + struct iphdr _iph; + + ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); + if (ih == NULL) { + printk(" INCOMPLETE IP header"); + goto out; + } + printk(" IP SRC=%pI4 IP DST=%pI4, IP tos=0x%02X, IP proto=%d", + &ih->saddr, &ih->daddr, ih->tos, ih->protocol); + print_ports(skb, ih->protocol, ih->ihl*4); + goto out; + } + +#if defined(CONFIG_BRIDGE_EBT_IP6) || defined(CONFIG_BRIDGE_EBT_IP6_MODULE) + if ((bitmask & EBT_LOG_IP6) && eth_hdr(skb)->h_proto == + htons(ETH_P_IPV6)) { + const struct ipv6hdr *ih; + struct ipv6hdr _iph; + uint8_t nexthdr; + int offset_ph; + + ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); + if (ih == NULL) { + printk(" INCOMPLETE IPv6 header"); + goto out; + } + printk(" IPv6 SRC=%pI6 IPv6 DST=%pI6, IPv6 priority=0x%01X, Next Header=%d", + &ih->saddr, &ih->daddr, ih->priority, ih->nexthdr); + nexthdr = ih->nexthdr; + offset_ph = ipv6_skip_exthdr(skb, sizeof(_iph), &nexthdr); + if (offset_ph == -1) + goto out; + print_ports(skb, nexthdr, offset_ph); + goto out; + } +#endif + + if ((bitmask & EBT_LOG_ARP) && + ((eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) || + (eth_hdr(skb)->h_proto == htons(ETH_P_RARP)))) { + const struct arphdr *ah; + struct arphdr _arph; + + ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); + if (ah == NULL) { + printk(" INCOMPLETE ARP header"); + goto out; + } + printk(" ARP HTYPE=%d, PTYPE=0x%04x, OPCODE=%d", + ntohs(ah->ar_hrd), ntohs(ah->ar_pro), + ntohs(ah->ar_op)); + + /* If it's for Ethernet and the lengths are OK, + * then log the ARP payload */ + if (ah->ar_hrd == htons(1) && + ah->ar_hln == ETH_ALEN && + ah->ar_pln == sizeof(__be32)) { + const struct arppayload *ap; + struct arppayload _arpp; + + ap = skb_header_pointer(skb, sizeof(_arph), + sizeof(_arpp), &_arpp); + if (ap == NULL) { + printk(" INCOMPLETE ARP payload"); + goto out; + } + printk(" ARP MAC SRC=%pM ARP IP SRC=%pI4 ARP MAC DST=%pM ARP IP DST=%pI4", + ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst); + } + } +out: + printk("\n"); + spin_unlock_bh(&ebt_log_lock); + +} + +static unsigned int +ebt_log_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct ebt_log_info *info = par->targinfo; + struct nf_loginfo li; + + li.type = NF_LOG_TYPE_LOG; + li.u.log.level = info->loglevel; + li.u.log.logflags = info->bitmask; + + if (info->bitmask & EBT_LOG_NFLOG) + nf_log_packet(NFPROTO_BRIDGE, par->hooknum, skb, par->in, + par->out, &li, "%s", info->prefix); + else + ebt_log_packet(NFPROTO_BRIDGE, par->hooknum, skb, par->in, + par->out, &li, info->prefix); + return EBT_CONTINUE; +} + +static struct xt_target ebt_log_tg_reg __read_mostly = { + .name = "log", + .revision = 0, + .family = NFPROTO_BRIDGE, + .target = ebt_log_tg, + .checkentry = ebt_log_tg_check, + .targetsize = sizeof(struct ebt_log_info), + .me = THIS_MODULE, +}; + +static struct nf_logger ebt_log_logger __read_mostly = { + .name = "ebt_log", + .logfn = &ebt_log_packet, + .me = THIS_MODULE, +}; + +static int __init ebt_log_init(void) +{ + int ret; + + ret = xt_register_target(&ebt_log_tg_reg); + if (ret < 0) + return ret; + nf_log_register(NFPROTO_BRIDGE, &ebt_log_logger); + return 0; +} + +static void __exit ebt_log_fini(void) +{ + nf_log_unregister(&ebt_log_logger); + 
xt_unregister_target(&ebt_log_tg_reg); +} + +module_init(ebt_log_init); +module_exit(ebt_log_fini); +MODULE_DESCRIPTION("Ebtables: Packet logging to syslog"); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c new file mode 100644 index 00000000..66697cbd --- /dev/null +++ b/net/bridge/netfilter/ebt_mark.c @@ -0,0 +1,110 @@ +/* + * ebt_mark + * + * Authors: + * Bart De Schuymer + * + * July, 2002 + * + */ + +/* The mark target can be used in any chain, + * I believe adding a mangle table just for marking is total overkill. + * Marking a frame doesn't really change anything in the frame anyway. + */ + +#include +#include +#include +#include + +static unsigned int +ebt_mark_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct ebt_mark_t_info *info = par->targinfo; + int action = info->target & -16; + + if (action == MARK_SET_VALUE) + skb->mark = info->mark; + else if (action == MARK_OR_VALUE) + skb->mark |= info->mark; + else if (action == MARK_AND_VALUE) + skb->mark &= info->mark; + else + skb->mark ^= info->mark; + + return info->target | ~EBT_VERDICT_BITS; +} + +static int ebt_mark_tg_check(const struct xt_tgchk_param *par) +{ + const struct ebt_mark_t_info *info = par->targinfo; + int tmp; + + tmp = info->target | ~EBT_VERDICT_BITS; + if (BASE_CHAIN && tmp == EBT_RETURN) + return -EINVAL; + if (tmp < -NUM_STANDARD_TARGETS || tmp >= 0) + return -EINVAL; + tmp = info->target & ~EBT_VERDICT_BITS; + if (tmp != MARK_SET_VALUE && tmp != MARK_OR_VALUE && + tmp != MARK_AND_VALUE && tmp != MARK_XOR_VALUE) + return -EINVAL; + return 0; +} +#ifdef CONFIG_COMPAT +struct compat_ebt_mark_t_info { + compat_ulong_t mark; + compat_uint_t target; +}; + +static void mark_tg_compat_from_user(void *dst, const void *src) +{ + const struct compat_ebt_mark_t_info *user = src; + struct ebt_mark_t_info *kern = dst; + + kern->mark = user->mark; + kern->target = user->target; +} + +static int mark_tg_compat_to_user(void __user *dst, const void *src) +{ + struct compat_ebt_mark_t_info __user *user = dst; + const struct ebt_mark_t_info *kern = src; + + if (put_user(kern->mark, &user->mark) || + put_user(kern->target, &user->target)) + return -EFAULT; + return 0; +} +#endif + +static struct xt_target ebt_mark_tg_reg __read_mostly = { + .name = "mark", + .revision = 0, + .family = NFPROTO_BRIDGE, + .target = ebt_mark_tg, + .checkentry = ebt_mark_tg_check, + .targetsize = sizeof(struct ebt_mark_t_info), +#ifdef CONFIG_COMPAT + .compatsize = sizeof(struct compat_ebt_mark_t_info), + .compat_from_user = mark_tg_compat_from_user, + .compat_to_user = mark_tg_compat_to_user, +#endif + .me = THIS_MODULE, +}; + +static int __init ebt_mark_init(void) +{ + return xt_register_target(&ebt_mark_tg_reg); +} + +static void __exit ebt_mark_fini(void) +{ + xt_unregister_target(&ebt_mark_tg_reg); +} + +module_init(ebt_mark_init); +module_exit(ebt_mark_fini); +MODULE_DESCRIPTION("Ebtables: Packet mark modification"); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_mark_m.c b/net/bridge/netfilter/ebt_mark_m.c new file mode 100644 index 00000000..d98baefc --- /dev/null +++ b/net/bridge/netfilter/ebt_mark_m.c @@ -0,0 +1,98 @@ +/* + * ebt_mark_m + * + * Authors: + * Bart De Schuymer + * + * July, 2002 + * + */ +#include +#include +#include +#include + +static bool +ebt_mark_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ebt_mark_m_info *info = par->matchinfo; + + if (info->bitmask & EBT_MARK_OR) + return !!(skb->mark & 
info->mask) ^ info->invert; + return ((skb->mark & info->mask) == info->mark) ^ info->invert; +} + +static int ebt_mark_mt_check(const struct xt_mtchk_param *par) +{ + const struct ebt_mark_m_info *info = par->matchinfo; + + if (info->bitmask & ~EBT_MARK_MASK) + return -EINVAL; + if ((info->bitmask & EBT_MARK_OR) && (info->bitmask & EBT_MARK_AND)) + return -EINVAL; + if (!info->bitmask) + return -EINVAL; + return 0; +} + + +#ifdef CONFIG_COMPAT +struct compat_ebt_mark_m_info { + compat_ulong_t mark, mask; + uint8_t invert, bitmask; +}; + +static void mark_mt_compat_from_user(void *dst, const void *src) +{ + const struct compat_ebt_mark_m_info *user = src; + struct ebt_mark_m_info *kern = dst; + + kern->mark = user->mark; + kern->mask = user->mask; + kern->invert = user->invert; + kern->bitmask = user->bitmask; +} + +static int mark_mt_compat_to_user(void __user *dst, const void *src) +{ + struct compat_ebt_mark_m_info __user *user = dst; + const struct ebt_mark_m_info *kern = src; + + if (put_user(kern->mark, &user->mark) || + put_user(kern->mask, &user->mask) || + put_user(kern->invert, &user->invert) || + put_user(kern->bitmask, &user->bitmask)) + return -EFAULT; + return 0; +} +#endif + +static struct xt_match ebt_mark_mt_reg __read_mostly = { + .name = "mark_m", + .revision = 0, + .family = NFPROTO_BRIDGE, + .match = ebt_mark_mt, + .checkentry = ebt_mark_mt_check, + .matchsize = sizeof(struct ebt_mark_m_info), +#ifdef CONFIG_COMPAT + .compatsize = sizeof(struct compat_ebt_mark_m_info), + .compat_from_user = mark_mt_compat_from_user, + .compat_to_user = mark_mt_compat_to_user, +#endif + .me = THIS_MODULE, +}; + +static int __init ebt_mark_m_init(void) +{ + return xt_register_match(&ebt_mark_mt_reg); +} + +static void __exit ebt_mark_m_fini(void) +{ + xt_unregister_match(&ebt_mark_mt_reg); +} + +module_init(ebt_mark_m_init); +module_exit(ebt_mark_m_fini); +MODULE_DESCRIPTION("Ebtables: Packet mark match"); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c new file mode 100644 index 00000000..5be68bbc --- /dev/null +++ b/net/bridge/netfilter/ebt_nflog.c @@ -0,0 +1,72 @@ +/* + * ebt_nflog + * + * Author: + * Peter Warasin + * + * February, 2008 + * + * Based on: + * xt_NFLOG.c, (C) 2006 by Patrick McHardy + * ebt_ulog.c, (C) 2004 by Bart De Schuymer + * + */ + +#include +#include +#include +#include +#include +#include + +static unsigned int +ebt_nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct ebt_nflog_info *info = par->targinfo; + struct nf_loginfo li; + + li.type = NF_LOG_TYPE_ULOG; + li.u.ulog.copy_len = info->len; + li.u.ulog.group = info->group; + li.u.ulog.qthreshold = info->threshold; + + nf_log_packet(PF_BRIDGE, par->hooknum, skb, par->in, par->out, + &li, "%s", info->prefix); + return EBT_CONTINUE; +} + +static int ebt_nflog_tg_check(const struct xt_tgchk_param *par) +{ + struct ebt_nflog_info *info = par->targinfo; + + if (info->flags & ~EBT_NFLOG_MASK) + return -EINVAL; + info->prefix[EBT_NFLOG_PREFIX_SIZE - 1] = '\0'; + return 0; +} + +static struct xt_target ebt_nflog_tg_reg __read_mostly = { + .name = "nflog", + .revision = 0, + .family = NFPROTO_BRIDGE, + .target = ebt_nflog_tg, + .checkentry = ebt_nflog_tg_check, + .targetsize = sizeof(struct ebt_nflog_info), + .me = THIS_MODULE, +}; + +static int __init ebt_nflog_init(void) +{ + return xt_register_target(&ebt_nflog_tg_reg); +} + +static void __exit ebt_nflog_fini(void) +{ + xt_unregister_target(&ebt_nflog_tg_reg); +} + 
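/* The mark test in ebt_mark_m.c above reduces to two pure-integer
 * predicates: in EBT_MARK_OR mode the rule matches if any masked bit is
 * set, otherwise it matches if the masked value equals info->mark exactly,
 * with info->invert flipping either result.  Below is a minimal standalone
 * userspace sketch of that arithmetic (plain C, no kernel headers; the
 * helper name and the sample values are illustrative only). */

#include <stdbool.h>
#include <stdio.h>

static bool mark_matches(unsigned long mark, unsigned long mask,
			 unsigned long value, bool or_mode, bool invert)
{
	bool hit;

	if (or_mode)
		hit = (mark & mask) != 0;	/* any masked bit set */
	else
		hit = (mark & mask) == value;	/* exact match under mask */

	return hit ^ invert;
}

int main(void)
{
	/* 0x2a has bits inside mask 0xff, so OR mode matches ... */
	printf("%d\n", mark_matches(0x2a, 0xff, 0, true, false));	/* 1 */
	/* ... while exact mode wants the masked value to equal 0x2a. */
	printf("%d\n", mark_matches(0x2a, 0xff, 0x2a, false, false));	/* 1 */
	printf("%d\n", mark_matches(0x2b, 0xff, 0x2a, false, false));	/* 0 */
	return 0;
}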
+module_init(ebt_nflog_init); +module_exit(ebt_nflog_fini); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Peter Warasin "); +MODULE_DESCRIPTION("ebtables NFLOG netfilter logging module"); diff --git a/net/bridge/netfilter/ebt_pkttype.c b/net/bridge/netfilter/ebt_pkttype.c new file mode 100644 index 00000000..496a5651 --- /dev/null +++ b/net/bridge/netfilter/ebt_pkttype.c @@ -0,0 +1,56 @@ +/* + * ebt_pkttype + * + * Authors: + * Bart De Schuymer + * + * April, 2003 + * + */ +#include +#include +#include +#include + +static bool +ebt_pkttype_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ebt_pkttype_info *info = par->matchinfo; + + return (skb->pkt_type == info->pkt_type) ^ info->invert; +} + +static int ebt_pkttype_mt_check(const struct xt_mtchk_param *par) +{ + const struct ebt_pkttype_info *info = par->matchinfo; + + if (info->invert != 0 && info->invert != 1) + return -EINVAL; + /* Allow any pkt_type value */ + return 0; +} + +static struct xt_match ebt_pkttype_mt_reg __read_mostly = { + .name = "pkttype", + .revision = 0, + .family = NFPROTO_BRIDGE, + .match = ebt_pkttype_mt, + .checkentry = ebt_pkttype_mt_check, + .matchsize = sizeof(struct ebt_pkttype_info), + .me = THIS_MODULE, +}; + +static int __init ebt_pkttype_init(void) +{ + return xt_register_match(&ebt_pkttype_mt_reg); +} + +static void __exit ebt_pkttype_fini(void) +{ + xt_unregister_match(&ebt_pkttype_mt_reg); +} + +module_init(ebt_pkttype_init); +module_exit(ebt_pkttype_fini); +MODULE_DESCRIPTION("Ebtables: Link layer packet type match"); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c new file mode 100644 index 00000000..46624bb6 --- /dev/null +++ b/net/bridge/netfilter/ebt_redirect.c @@ -0,0 +1,80 @@ +/* + * ebt_redirect + * + * Authors: + * Bart De Schuymer + * + * April, 2002 + * + */ +#include +#include +#include "../br_private.h" +#include +#include +#include +#include + +static unsigned int +ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct ebt_redirect_info *info = par->targinfo; + + if (!skb_make_writable(skb, 0)) + return EBT_DROP; + + if (par->hooknum != NF_BR_BROUTING) + /* rcu_read_lock()ed by nf_hook_slow */ + memcpy(eth_hdr(skb)->h_dest, + br_port_get_rcu(par->in)->br->dev->dev_addr, ETH_ALEN); + else + memcpy(eth_hdr(skb)->h_dest, par->in->dev_addr, ETH_ALEN); + skb->pkt_type = PACKET_HOST; + return info->target; +} + +static int ebt_redirect_tg_check(const struct xt_tgchk_param *par) +{ + const struct ebt_redirect_info *info = par->targinfo; + unsigned int hook_mask; + + if (BASE_CHAIN && info->target == EBT_RETURN) + return -EINVAL; + + hook_mask = par->hook_mask & ~(1 << NF_BR_NUMHOOKS); + if ((strcmp(par->table, "nat") != 0 || + hook_mask & ~(1 << NF_BR_PRE_ROUTING)) && + (strcmp(par->table, "broute") != 0 || + hook_mask & ~(1 << NF_BR_BROUTING))) + return -EINVAL; + if (INVALID_TARGET) + return -EINVAL; + return 0; +} + +static struct xt_target ebt_redirect_tg_reg __read_mostly = { + .name = "redirect", + .revision = 0, + .family = NFPROTO_BRIDGE, + .hooks = (1 << NF_BR_NUMHOOKS) | (1 << NF_BR_PRE_ROUTING) | + (1 << NF_BR_BROUTING), + .target = ebt_redirect_tg, + .checkentry = ebt_redirect_tg_check, + .targetsize = sizeof(struct ebt_redirect_info), + .me = THIS_MODULE, +}; + +static int __init ebt_redirect_init(void) +{ + return xt_register_target(&ebt_redirect_tg_reg); +} + +static void __exit ebt_redirect_fini(void) +{ + xt_unregister_target(&ebt_redirect_tg_reg); +} + 
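/* ebt_redirect_tg_check() above (and, with a wider nat hook set,
 * ebt_dnat_tg_check() earlier) uses the same placement test: after masking
 * off the housekeeping bit (1 << NF_BR_NUMHOOKS), the remaining hook bits
 * must all lie inside the set of hooks that make sense for the chosen
 * table.  A standalone sketch of that boolean follows; the function name
 * and the hook bit numbers are illustrative only (the real values come
 * from the netfilter_bridge headers). */

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

enum {				/* illustrative hook numbers */
	BR_PRE_ROUTING	= 0,
	BR_BROUTING	= 5,
	BR_NUMHOOKS	= 6,
};

static bool redirect_placement_ok(const char *table, unsigned int hook_mask)
{
	hook_mask &= ~(1u << BR_NUMHOOKS);

	if (strcmp(table, "nat") == 0)
		return (hook_mask & ~(1u << BR_PRE_ROUTING)) == 0;
	if (strcmp(table, "broute") == 0)
		return (hook_mask & ~(1u << BR_BROUTING)) == 0;
	return false;
}

int main(void)
{
	printf("%d\n", redirect_placement_ok("nat", 1u << BR_PRE_ROUTING));	/* 1 */
	printf("%d\n", redirect_placement_ok("filter", 1u << BR_PRE_ROUTING));	/* 0 */
	return 0;
}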
+module_init(ebt_redirect_init); +module_exit(ebt_redirect_fini); +MODULE_DESCRIPTION("Ebtables: Packet redirection to localhost"); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c new file mode 100644 index 00000000..f8f0bd1a --- /dev/null +++ b/net/bridge/netfilter/ebt_snat.c @@ -0,0 +1,87 @@ +/* + * ebt_snat + * + * Authors: + * Bart De Schuymer + * + * June, 2002 + * + */ +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned int +ebt_snat_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct ebt_nat_info *info = par->targinfo; + + if (!skb_make_writable(skb, 0)) + return EBT_DROP; + + memcpy(eth_hdr(skb)->h_source, info->mac, ETH_ALEN); + if (!(info->target & NAT_ARP_BIT) && + eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) { + const struct arphdr *ap; + struct arphdr _ah; + + ap = skb_header_pointer(skb, 0, sizeof(_ah), &_ah); + if (ap == NULL) + return EBT_DROP; + if (ap->ar_hln != ETH_ALEN) + goto out; + if (skb_store_bits(skb, sizeof(_ah), info->mac,ETH_ALEN)) + return EBT_DROP; + } +out: + return info->target | ~EBT_VERDICT_BITS; +} + +static int ebt_snat_tg_check(const struct xt_tgchk_param *par) +{ + const struct ebt_nat_info *info = par->targinfo; + int tmp; + + tmp = info->target | ~EBT_VERDICT_BITS; + if (BASE_CHAIN && tmp == EBT_RETURN) + return -EINVAL; + + if (tmp < -NUM_STANDARD_TARGETS || tmp >= 0) + return -EINVAL; + tmp = info->target | EBT_VERDICT_BITS; + if ((tmp & ~NAT_ARP_BIT) != ~NAT_ARP_BIT) + return -EINVAL; + return 0; +} + +static struct xt_target ebt_snat_tg_reg __read_mostly = { + .name = "snat", + .revision = 0, + .family = NFPROTO_BRIDGE, + .table = "nat", + .hooks = (1 << NF_BR_NUMHOOKS) | (1 << NF_BR_POST_ROUTING), + .target = ebt_snat_tg, + .checkentry = ebt_snat_tg_check, + .targetsize = sizeof(struct ebt_nat_info), + .me = THIS_MODULE, +}; + +static int __init ebt_snat_init(void) +{ + return xt_register_target(&ebt_snat_tg_reg); +} + +static void __exit ebt_snat_fini(void) +{ + xt_unregister_target(&ebt_snat_tg_reg); +} + +module_init(ebt_snat_init); +module_exit(ebt_snat_fini); +MODULE_DESCRIPTION("Ebtables: Source MAC address translation"); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c new file mode 100644 index 00000000..5b33a2e6 --- /dev/null +++ b/net/bridge/netfilter/ebt_stp.c @@ -0,0 +1,197 @@ +/* + * ebt_stp + * + * Authors: + * Bart De Schuymer + * Stephen Hemminger + * + * July, 2003 + */ +#include +#include +#include +#include +#include + +#define BPDU_TYPE_CONFIG 0 +#define BPDU_TYPE_TCN 0x80 + +struct stp_header { + uint8_t dsap; + uint8_t ssap; + uint8_t ctrl; + uint8_t pid; + uint8_t vers; + uint8_t type; +}; + +struct stp_config_pdu { + uint8_t flags; + uint8_t root[8]; + uint8_t root_cost[4]; + uint8_t sender[8]; + uint8_t port[2]; + uint8_t msg_age[2]; + uint8_t max_age[2]; + uint8_t hello_time[2]; + uint8_t forward_delay[2]; +}; + +#define NR16(p) (p[0] << 8 | p[1]) +#define NR32(p) ((p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]) + +static bool ebt_filter_config(const struct ebt_stp_info *info, + const struct stp_config_pdu *stpc) +{ + const struct ebt_stp_config_info *c; + uint16_t v16; + uint32_t v32; + int verdict, i; + + c = &info->config; + if ((info->bitmask & EBT_STP_FLAGS) && + FWINV(c->flags != stpc->flags, EBT_STP_FLAGS)) + return false; + if (info->bitmask & EBT_STP_ROOTPRIO) { + v16 = NR16(stpc->root); + if (FWINV(v16 < c->root_priol || + v16 > 
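	/* The BPDU fields arrive as raw big-endian byte arrays, so NR16() and
	 * NR32() above assemble them byte-by-byte before each enabled field is
	 * tested against its [lower, upper] bound, with FWINV() applying the
	 * per-field inversion flag as usual. */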
c->root_priou, EBT_STP_ROOTPRIO)) + return false; + } + if (info->bitmask & EBT_STP_ROOTADDR) { + verdict = 0; + for (i = 0; i < 6; i++) + verdict |= (stpc->root[2+i] ^ c->root_addr[i]) & + c->root_addrmsk[i]; + if (FWINV(verdict != 0, EBT_STP_ROOTADDR)) + return false; + } + if (info->bitmask & EBT_STP_ROOTCOST) { + v32 = NR32(stpc->root_cost); + if (FWINV(v32 < c->root_costl || + v32 > c->root_costu, EBT_STP_ROOTCOST)) + return false; + } + if (info->bitmask & EBT_STP_SENDERPRIO) { + v16 = NR16(stpc->sender); + if (FWINV(v16 < c->sender_priol || + v16 > c->sender_priou, EBT_STP_SENDERPRIO)) + return false; + } + if (info->bitmask & EBT_STP_SENDERADDR) { + verdict = 0; + for (i = 0; i < 6; i++) + verdict |= (stpc->sender[2+i] ^ c->sender_addr[i]) & + c->sender_addrmsk[i]; + if (FWINV(verdict != 0, EBT_STP_SENDERADDR)) + return false; + } + if (info->bitmask & EBT_STP_PORT) { + v16 = NR16(stpc->port); + if (FWINV(v16 < c->portl || + v16 > c->portu, EBT_STP_PORT)) + return false; + } + if (info->bitmask & EBT_STP_MSGAGE) { + v16 = NR16(stpc->msg_age); + if (FWINV(v16 < c->msg_agel || + v16 > c->msg_ageu, EBT_STP_MSGAGE)) + return false; + } + if (info->bitmask & EBT_STP_MAXAGE) { + v16 = NR16(stpc->max_age); + if (FWINV(v16 < c->max_agel || + v16 > c->max_ageu, EBT_STP_MAXAGE)) + return false; + } + if (info->bitmask & EBT_STP_HELLOTIME) { + v16 = NR16(stpc->hello_time); + if (FWINV(v16 < c->hello_timel || + v16 > c->hello_timeu, EBT_STP_HELLOTIME)) + return false; + } + if (info->bitmask & EBT_STP_FWDD) { + v16 = NR16(stpc->forward_delay); + if (FWINV(v16 < c->forward_delayl || + v16 > c->forward_delayu, EBT_STP_FWDD)) + return false; + } + return true; +} + +static bool +ebt_stp_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ebt_stp_info *info = par->matchinfo; + const struct stp_header *sp; + struct stp_header _stph; + const uint8_t header[6] = {0x42, 0x42, 0x03, 0x00, 0x00, 0x00}; + + sp = skb_header_pointer(skb, 0, sizeof(_stph), &_stph); + if (sp == NULL) + return false; + + /* The stp code only considers these */ + if (memcmp(sp, header, sizeof(header))) + return false; + + if (info->bitmask & EBT_STP_TYPE && + FWINV(info->type != sp->type, EBT_STP_TYPE)) + return false; + + if (sp->type == BPDU_TYPE_CONFIG && + info->bitmask & EBT_STP_CONFIG_MASK) { + const struct stp_config_pdu *st; + struct stp_config_pdu _stpc; + + st = skb_header_pointer(skb, sizeof(_stph), + sizeof(_stpc), &_stpc); + if (st == NULL) + return false; + return ebt_filter_config(info, st); + } + return true; +} + +static int ebt_stp_mt_check(const struct xt_mtchk_param *par) +{ + const struct ebt_stp_info *info = par->matchinfo; + const uint8_t bridge_ula[6] = {0x01, 0x80, 0xc2, 0x00, 0x00, 0x00}; + const uint8_t msk[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + const struct ebt_entry *e = par->entryinfo; + + if (info->bitmask & ~EBT_STP_MASK || info->invflags & ~EBT_STP_MASK || + !(info->bitmask & EBT_STP_MASK)) + return -EINVAL; + /* Make sure the match only receives stp frames */ + if (compare_ether_addr(e->destmac, bridge_ula) || + compare_ether_addr(e->destmsk, msk) || !(e->bitmask & EBT_DESTMAC)) + return -EINVAL; + + return 0; +} + +static struct xt_match ebt_stp_mt_reg __read_mostly = { + .name = "stp", + .revision = 0, + .family = NFPROTO_BRIDGE, + .match = ebt_stp_mt, + .checkentry = ebt_stp_mt_check, + .matchsize = sizeof(struct ebt_stp_info), + .me = THIS_MODULE, +}; + +static int __init ebt_stp_init(void) +{ + return xt_register_match(&ebt_stp_mt_reg); +} + +static void 
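/* Note that ebt_stp_mt_check() above only accepts rules whose link-layer
 * destination is pinned to 01:80:c2:00:00:00 with a full ff:ff:ff:ff:ff:ff
 * mask (the bridge group address), so the match is never offered frames
 * that cannot be STP BPDUs. */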
__exit ebt_stp_fini(void) +{ + xt_unregister_match(&ebt_stp_mt_reg); +} + +module_init(ebt_stp_init); +module_exit(ebt_stp_fini); +MODULE_DESCRIPTION("Ebtables: Spanning Tree Protocol packet match"); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c new file mode 100644 index 00000000..26377e96 --- /dev/null +++ b/net/bridge/netfilter/ebt_ulog.c @@ -0,0 +1,342 @@ +/* + * netfilter module for userspace bridged Ethernet frames logging daemons + * + * Authors: + * Bart De Schuymer + * Harald Welte + * + * November, 2004 + * + * Based on ipt_ULOG.c, which is + * (C) 2000-2002 by Harald Welte + * + * This module accepts two parameters: + * + * nlbufsiz: + * The parameter specifies how big the buffer for each netlink multicast + * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will + * get accumulated in the kernel until they are sent to userspace. It is + * NOT possible to allocate more than 128kB, and it is strongly discouraged, + * because atomically allocating 128kB inside the network rx softirq is not + * reliable. Please also keep in mind that this buffer size is allocated for + * each nlgroup you are using, so the total kernel memory usage increases + * by that factor. + * + * flushtimeout: + * Specify, after how many hundredths of a second the queue should be + * flushed even if it is not full yet. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../br_private.h" + +static unsigned int nlbufsiz = NLMSG_GOODSIZE; +module_param(nlbufsiz, uint, 0600); +MODULE_PARM_DESC(nlbufsiz, "netlink buffer size (number of bytes) " + "(defaults to 4096)"); + +static unsigned int flushtimeout = 10; +module_param(flushtimeout, uint, 0600); +MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths ofa second) " + "(defaults to 10)"); + +typedef struct { + unsigned int qlen; /* number of nlmsgs' in the skb */ + struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */ + struct sk_buff *skb; /* the pre-allocated skb */ + struct timer_list timer; /* the timer function */ + spinlock_t lock; /* the per-queue lock */ +} ebt_ulog_buff_t; + +static ebt_ulog_buff_t ulog_buffers[EBT_ULOG_MAXNLGROUPS]; +static struct sock *ebtulognl; + +/* send one ulog_buff_t to userspace */ +static void ulog_send(unsigned int nlgroup) +{ + ebt_ulog_buff_t *ub = &ulog_buffers[nlgroup]; + + if (timer_pending(&ub->timer)) + del_timer(&ub->timer); + + if (!ub->skb) + return; + + /* last nlmsg needs NLMSG_DONE */ + if (ub->qlen > 1) + ub->lastnlh->nlmsg_type = NLMSG_DONE; + + NETLINK_CB(ub->skb).dst_group = nlgroup + 1; + netlink_broadcast(ebtulognl, ub->skb, 0, nlgroup + 1, GFP_ATOMIC); + + ub->qlen = 0; + ub->skb = NULL; +} + +/* timer function to flush queue in flushtimeout time */ +static void ulog_timer(unsigned long data) +{ + spin_lock_bh(&ulog_buffers[data].lock); + if (ulog_buffers[data].skb) + ulog_send(data); + spin_unlock_bh(&ulog_buffers[data].lock); +} + +static struct sk_buff *ulog_alloc_skb(unsigned int size) +{ + struct sk_buff *skb; + unsigned int n; + + n = max(size, nlbufsiz); + skb = alloc_skb(n, GFP_ATOMIC); + if (!skb) { + pr_debug("cannot alloc whole buffer of size %ub!\n", n); + if (n > size) { + /* try to allocate only as much as we need for + * current packet */ + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + pr_debug("cannot even allocate " + "buffer of size %ub\n", 
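	/* (ulog batching recap: each netlink group owns one pre-allocated skb
	 * of at most nlbufsiz bytes; messages are appended to it and flushed
	 * to userspace either when qthreshold messages have accumulated or
	 * when the flushtimeout timer fires.) */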
size); + } + } + + return skb; +} + +static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + const struct ebt_ulog_info *uloginfo, const char *prefix) +{ + ebt_ulog_packet_msg_t *pm; + size_t size, copy_len; + struct nlmsghdr *nlh; + unsigned int group = uloginfo->nlgroup; + ebt_ulog_buff_t *ub = &ulog_buffers[group]; + spinlock_t *lock = &ub->lock; + ktime_t kt; + + if ((uloginfo->cprange == 0) || + (uloginfo->cprange > skb->len + ETH_HLEN)) + copy_len = skb->len + ETH_HLEN; + else + copy_len = uloginfo->cprange; + + size = NLMSG_SPACE(sizeof(*pm) + copy_len); + if (size > nlbufsiz) { + pr_debug("Size %Zd needed, but nlbufsiz=%d\n", size, nlbufsiz); + return; + } + + spin_lock_bh(lock); + + if (!ub->skb) { + if (!(ub->skb = ulog_alloc_skb(size))) + goto alloc_failure; + } else if (size > skb_tailroom(ub->skb)) { + ulog_send(group); + + if (!(ub->skb = ulog_alloc_skb(size))) + goto alloc_failure; + } + + nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, 0, + size - NLMSG_ALIGN(sizeof(*nlh))); + ub->qlen++; + + pm = NLMSG_DATA(nlh); + + /* Fill in the ulog data */ + pm->version = EBT_ULOG_VERSION; + kt = ktime_get_real(); + pm->stamp = ktime_to_timeval(kt); + if (ub->qlen == 1) + ub->skb->tstamp = kt; + pm->data_len = copy_len; + pm->mark = skb->mark; + pm->hook = hooknr; + if (uloginfo->prefix != NULL) + strcpy(pm->prefix, uloginfo->prefix); + else + *(pm->prefix) = '\0'; + + if (in) { + strcpy(pm->physindev, in->name); + /* If in isn't a bridge, then physindev==indev */ + if (br_port_exists(in)) + /* rcu_read_lock()ed by nf_hook_slow */ + strcpy(pm->indev, br_port_get_rcu(in)->br->dev->name); + else + strcpy(pm->indev, in->name); + } else + pm->indev[0] = pm->physindev[0] = '\0'; + + if (out) { + /* If out exists, then out is a bridge port */ + strcpy(pm->physoutdev, out->name); + /* rcu_read_lock()ed by nf_hook_slow */ + strcpy(pm->outdev, br_port_get_rcu(out)->br->dev->name); + } else + pm->outdev[0] = pm->physoutdev[0] = '\0'; + + if (skb_copy_bits(skb, -ETH_HLEN, pm->data, copy_len) < 0) + BUG(); + + if (ub->qlen > 1) + ub->lastnlh->nlmsg_flags |= NLM_F_MULTI; + + ub->lastnlh = nlh; + + if (ub->qlen >= uloginfo->qthreshold) + ulog_send(group); + else if (!timer_pending(&ub->timer)) { + ub->timer.expires = jiffies + flushtimeout * HZ / 100; + add_timer(&ub->timer); + } + +unlock: + spin_unlock_bh(lock); + + return; + +nlmsg_failure: + pr_debug("error during NLMSG_PUT. 
This should " + "not happen, please report to author.\n"); + goto unlock; +alloc_failure: + goto unlock; +} + +/* this function is registered with the netfilter core */ +static void ebt_log_packet(u_int8_t pf, unsigned int hooknum, + const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct nf_loginfo *li, + const char *prefix) +{ + struct ebt_ulog_info loginfo; + + if (!li || li->type != NF_LOG_TYPE_ULOG) { + loginfo.nlgroup = EBT_ULOG_DEFAULT_NLGROUP; + loginfo.cprange = 0; + loginfo.qthreshold = EBT_ULOG_DEFAULT_QTHRESHOLD; + loginfo.prefix[0] = '\0'; + } else { + loginfo.nlgroup = li->u.ulog.group; + loginfo.cprange = li->u.ulog.copy_len; + loginfo.qthreshold = li->u.ulog.qthreshold; + strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); + } + + ebt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); +} + +static unsigned int +ebt_ulog_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + ebt_ulog_packet(par->hooknum, skb, par->in, par->out, + par->targinfo, NULL); + return EBT_CONTINUE; +} + +static int ebt_ulog_tg_check(const struct xt_tgchk_param *par) +{ + struct ebt_ulog_info *uloginfo = par->targinfo; + + if (uloginfo->nlgroup > 31) + return -EINVAL; + + uloginfo->prefix[EBT_ULOG_PREFIX_LEN - 1] = '\0'; + + if (uloginfo->qthreshold > EBT_ULOG_MAX_QLEN) + uloginfo->qthreshold = EBT_ULOG_MAX_QLEN; + + return 0; +} + +static struct xt_target ebt_ulog_tg_reg __read_mostly = { + .name = "ulog", + .revision = 0, + .family = NFPROTO_BRIDGE, + .target = ebt_ulog_tg, + .checkentry = ebt_ulog_tg_check, + .targetsize = sizeof(struct ebt_ulog_info), + .me = THIS_MODULE, +}; + +static struct nf_logger ebt_ulog_logger __read_mostly = { + .name = "ebt_ulog", + .logfn = &ebt_log_packet, + .me = THIS_MODULE, +}; + +static int __init ebt_ulog_init(void) +{ + int ret; + int i; + + if (nlbufsiz >= 128*1024) { + pr_warning("Netlink buffer has to be <= 128kB," + " please try a smaller nlbufsiz parameter.\n"); + return -EINVAL; + } + + /* initialize ulog_buffers */ + for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) { + setup_timer(&ulog_buffers[i].timer, ulog_timer, i); + spin_lock_init(&ulog_buffers[i].lock); + } + + ebtulognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, + EBT_ULOG_MAXNLGROUPS, NULL, NULL, + THIS_MODULE); + if (!ebtulognl) + ret = -ENOMEM; + else if ((ret = xt_register_target(&ebt_ulog_tg_reg)) != 0) + netlink_kernel_release(ebtulognl); + + if (ret == 0) + nf_log_register(NFPROTO_BRIDGE, &ebt_ulog_logger); + + return ret; +} + +static void __exit ebt_ulog_fini(void) +{ + ebt_ulog_buff_t *ub; + int i; + + nf_log_unregister(&ebt_ulog_logger); + xt_unregister_target(&ebt_ulog_tg_reg); + for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) { + ub = &ulog_buffers[i]; + if (timer_pending(&ub->timer)) + del_timer(&ub->timer); + spin_lock_bh(&ub->lock); + if (ub->skb) { + kfree_skb(ub->skb); + ub->skb = NULL; + } + spin_unlock_bh(&ub->lock); + } + netlink_kernel_release(ebtulognl); +} + +module_init(ebt_ulog_init); +module_exit(ebt_ulog_fini); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Bart De Schuymer "); +MODULE_DESCRIPTION("Ebtables: Packet logging to netlink using ULOG"); diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c new file mode 100644 index 00000000..eae67bf0 --- /dev/null +++ b/net/bridge/netfilter/ebt_vlan.c @@ -0,0 +1,181 @@ +/* + * Description: EBTables 802.1Q match extension kernelspace module. 
+ * Authors: Nick Fedchik + * Bart De Schuymer + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include + +#define MODULE_VERS "0.6" + +MODULE_AUTHOR("Nick Fedchik "); +MODULE_DESCRIPTION("Ebtables: 802.1Q VLAN tag match"); +MODULE_LICENSE("GPL"); + +#define GET_BITMASK(_BIT_MASK_) info->bitmask & _BIT_MASK_ +#define EXIT_ON_MISMATCH(_MATCH_,_MASK_) {if (!((info->_MATCH_ == _MATCH_)^!!(info->invflags & _MASK_))) return false; } + +static bool +ebt_vlan_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ebt_vlan_info *info = par->matchinfo; + + unsigned short TCI; /* Whole TCI, given from parsed frame */ + unsigned short id; /* VLAN ID, given from frame TCI */ + unsigned char prio; /* user_priority, given from frame TCI */ + /* VLAN encapsulated Type/Length field, given from orig frame */ + __be16 encap; + + if (vlan_tx_tag_present(skb)) { + TCI = vlan_tx_tag_get(skb); + encap = skb->protocol; + } else { + const struct vlan_hdr *fp; + struct vlan_hdr _frame; + + fp = skb_header_pointer(skb, 0, sizeof(_frame), &_frame); + if (fp == NULL) + return false; + + TCI = ntohs(fp->h_vlan_TCI); + encap = fp->h_vlan_encapsulated_proto; + } + + /* Tag Control Information (TCI) consists of the following elements: + * - User_priority. The user_priority field is three bits in length, + * interpreted as a binary number. + * - Canonical Format Indicator (CFI). The Canonical Format Indicator + * (CFI) is a single bit flag value. Currently ignored. + * - VLAN Identifier (VID). The VID is encoded as + * an unsigned binary number. */ + id = TCI & VLAN_VID_MASK; + prio = (TCI >> 13) & 0x7; + + /* Checking VLAN Identifier (VID) */ + if (GET_BITMASK(EBT_VLAN_ID)) + EXIT_ON_MISMATCH(id, EBT_VLAN_ID); + + /* Checking user_priority */ + if (GET_BITMASK(EBT_VLAN_PRIO)) + EXIT_ON_MISMATCH(prio, EBT_VLAN_PRIO); + + /* Checking Encapsulated Proto (Length/Type) field */ + if (GET_BITMASK(EBT_VLAN_ENCAP)) + EXIT_ON_MISMATCH(encap, EBT_VLAN_ENCAP); + + return true; +} + +static int ebt_vlan_mt_check(const struct xt_mtchk_param *par) +{ + struct ebt_vlan_info *info = par->matchinfo; + const struct ebt_entry *e = par->entryinfo; + + /* Is it 802.1Q frame checked? 
*/ + if (e->ethproto != htons(ETH_P_8021Q)) { + pr_debug("passed entry proto %2.4X is not 802.1Q (8100)\n", + ntohs(e->ethproto)); + return -EINVAL; + } + + /* Check for bitmask range + * True if even one bit is out of mask */ + if (info->bitmask & ~EBT_VLAN_MASK) { + pr_debug("bitmask %2X is out of mask (%2X)\n", + info->bitmask, EBT_VLAN_MASK); + return -EINVAL; + } + + /* Check for inversion flags range */ + if (info->invflags & ~EBT_VLAN_MASK) { + pr_debug("inversion flags %2X is out of mask (%2X)\n", + info->invflags, EBT_VLAN_MASK); + return -EINVAL; + } + + /* Reserved VLAN ID (VID) values + * ----------------------------- + * 0 - The null VLAN ID. + * 1 - The default Port VID (PVID) + * 0x0FFF - Reserved for implementation use. + * if_vlan.h: VLAN_N_VID 4096. */ + if (GET_BITMASK(EBT_VLAN_ID)) { + if (!!info->id) { /* if id!=0 => check vid range */ + if (info->id > VLAN_N_VID) { + pr_debug("id %d is out of range (1-4096)\n", + info->id); + return -EINVAL; + } + /* Note: This is valid VLAN-tagged frame point. + * Any value of user_priority are acceptable, + * but should be ignored according to 802.1Q Std. + * So we just drop the prio flag. */ + info->bitmask &= ~EBT_VLAN_PRIO; + } + /* Else, id=0 (null VLAN ID) => user_priority range (any?) */ + } + + if (GET_BITMASK(EBT_VLAN_PRIO)) { + if ((unsigned char) info->prio > 7) { + pr_debug("prio %d is out of range (0-7)\n", + info->prio); + return -EINVAL; + } + } + /* Check for encapsulated proto range - it is possible to be + * any value for u_short range. + * if_ether.h: ETH_ZLEN 60 - Min. octets in frame sans FCS */ + if (GET_BITMASK(EBT_VLAN_ENCAP)) { + if ((unsigned short) ntohs(info->encap) < ETH_ZLEN) { + pr_debug("encap frame length %d is less than " + "minimal\n", ntohs(info->encap)); + return -EINVAL; + } + } + + return 0; +} + +static struct xt_match ebt_vlan_mt_reg __read_mostly = { + .name = "vlan", + .revision = 0, + .family = NFPROTO_BRIDGE, + .match = ebt_vlan_mt, + .checkentry = ebt_vlan_mt_check, + .matchsize = sizeof(struct ebt_vlan_info), + .me = THIS_MODULE, +}; + +static int __init ebt_vlan_init(void) +{ + pr_debug("ebtables 802.1Q extension module v" MODULE_VERS "\n"); + return xt_register_match(&ebt_vlan_mt_reg); +} + +static void __exit ebt_vlan_fini(void) +{ + xt_unregister_match(&ebt_vlan_mt_reg); +} + +module_init(ebt_vlan_init); +module_exit(ebt_vlan_fini); diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c new file mode 100644 index 00000000..1bcaf36a --- /dev/null +++ b/net/bridge/netfilter/ebtable_broute.c @@ -0,0 +1,104 @@ +/* + * ebtable_broute + * + * Authors: + * Bart De Schuymer + * + * April, 2002 + * + * This table lets you choose between routing and bridging for frames + * entering on a bridge enslaved nic. This table is traversed before any + * other ebtables table. See net/bridge/br_input.c. 
+ */ + +#include +#include +#include + +/* EBT_ACCEPT means the frame will be bridged + * EBT_DROP means the frame will be routed + */ +static struct ebt_entries initial_chain = { + .name = "BROUTING", + .policy = EBT_ACCEPT, +}; + +static struct ebt_replace_kernel initial_table = +{ + .name = "broute", + .valid_hooks = 1 << NF_BR_BROUTING, + .entries_size = sizeof(struct ebt_entries), + .hook_entry = { + [NF_BR_BROUTING] = &initial_chain, + }, + .entries = (char *)&initial_chain, +}; + +static int check(const struct ebt_table_info *info, unsigned int valid_hooks) +{ + if (valid_hooks & ~(1 << NF_BR_BROUTING)) + return -EINVAL; + return 0; +} + +static const struct ebt_table broute_table = +{ + .name = "broute", + .table = &initial_table, + .valid_hooks = 1 << NF_BR_BROUTING, + .check = check, + .me = THIS_MODULE, +}; + +static int ebt_broute(struct sk_buff *skb) +{ + int ret; + + ret = ebt_do_table(NF_BR_BROUTING, skb, skb->dev, NULL, + dev_net(skb->dev)->xt.broute_table); + if (ret == NF_DROP) + return 1; /* route it */ + return 0; /* bridge it */ +} + +static int __net_init broute_net_init(struct net *net) +{ + net->xt.broute_table = ebt_register_table(net, &broute_table); + if (IS_ERR(net->xt.broute_table)) + return PTR_ERR(net->xt.broute_table); + return 0; +} + +static void __net_exit broute_net_exit(struct net *net) +{ + ebt_unregister_table(net, net->xt.broute_table); +} + +static struct pernet_operations broute_net_ops = { + .init = broute_net_init, + .exit = broute_net_exit, +}; + +static int __init ebtable_broute_init(void) +{ + int ret; + + ret = register_pernet_subsys(&broute_net_ops); + if (ret < 0) + return ret; + /* see br_input.c */ + rcu_assign_pointer(br_should_route_hook, + (br_should_route_hook_t *)ebt_broute); + return 0; +} + +static void __exit ebtable_broute_fini(void) +{ + rcu_assign_pointer(br_should_route_hook, NULL); + synchronize_net(); + unregister_pernet_subsys(&broute_net_ops); +} + +module_init(ebtable_broute_init); +module_exit(ebtable_broute_fini); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c new file mode 100644 index 00000000..42e6bd09 --- /dev/null +++ b/net/bridge/netfilter/ebtable_filter.c @@ -0,0 +1,139 @@ +/* + * ebtable_filter + * + * Authors: + * Bart De Schuymer + * + * April, 2002 + * + */ + +#include +#include + +#define FILTER_VALID_HOOKS ((1 << NF_BR_LOCAL_IN) | (1 << NF_BR_FORWARD) | \ + (1 << NF_BR_LOCAL_OUT)) + +static struct ebt_entries initial_chains[] = +{ + { + .name = "INPUT", + .policy = EBT_ACCEPT, + }, + { + .name = "FORWARD", + .policy = EBT_ACCEPT, + }, + { + .name = "OUTPUT", + .policy = EBT_ACCEPT, + }, +}; + +static struct ebt_replace_kernel initial_table = +{ + .name = "filter", + .valid_hooks = FILTER_VALID_HOOKS, + .entries_size = 3 * sizeof(struct ebt_entries), + .hook_entry = { + [NF_BR_LOCAL_IN] = &initial_chains[0], + [NF_BR_FORWARD] = &initial_chains[1], + [NF_BR_LOCAL_OUT] = &initial_chains[2], + }, + .entries = (char *)initial_chains, +}; + +static int check(const struct ebt_table_info *info, unsigned int valid_hooks) +{ + if (valid_hooks & ~FILTER_VALID_HOOKS) + return -EINVAL; + return 0; +} + +static const struct ebt_table frame_filter = +{ + .name = "filter", + .table = &initial_table, + .valid_hooks = FILTER_VALID_HOOKS, + .check = check, + .me = THIS_MODULE, +}; + +static unsigned int +ebt_in_hook(unsigned int hook, struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, int (*okfn)(struct sk_buff *)) +{ + return 
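	/* the per-netns filter table is looked up via the device that is
	 * present for this hook: 'in' here for LOCAL_IN/FORWARD, 'out' in
	 * ebt_out_hook() below for LOCAL_OUT */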
ebt_do_table(hook, skb, in, out, dev_net(in)->xt.frame_filter); +} + +static unsigned int +ebt_out_hook(unsigned int hook, struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, int (*okfn)(struct sk_buff *)) +{ + return ebt_do_table(hook, skb, in, out, dev_net(out)->xt.frame_filter); +} + +static struct nf_hook_ops ebt_ops_filter[] __read_mostly = { + { + .hook = ebt_in_hook, + .owner = THIS_MODULE, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_LOCAL_IN, + .priority = NF_BR_PRI_FILTER_BRIDGED, + }, + { + .hook = ebt_in_hook, + .owner = THIS_MODULE, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_FORWARD, + .priority = NF_BR_PRI_FILTER_BRIDGED, + }, + { + .hook = ebt_out_hook, + .owner = THIS_MODULE, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_LOCAL_OUT, + .priority = NF_BR_PRI_FILTER_OTHER, + }, +}; + +static int __net_init frame_filter_net_init(struct net *net) +{ + net->xt.frame_filter = ebt_register_table(net, &frame_filter); + if (IS_ERR(net->xt.frame_filter)) + return PTR_ERR(net->xt.frame_filter); + return 0; +} + +static void __net_exit frame_filter_net_exit(struct net *net) +{ + ebt_unregister_table(net, net->xt.frame_filter); +} + +static struct pernet_operations frame_filter_net_ops = { + .init = frame_filter_net_init, + .exit = frame_filter_net_exit, +}; + +static int __init ebtable_filter_init(void) +{ + int ret; + + ret = register_pernet_subsys(&frame_filter_net_ops); + if (ret < 0) + return ret; + ret = nf_register_hooks(ebt_ops_filter, ARRAY_SIZE(ebt_ops_filter)); + if (ret < 0) + unregister_pernet_subsys(&frame_filter_net_ops); + return ret; +} + +static void __exit ebtable_filter_fini(void) +{ + nf_unregister_hooks(ebt_ops_filter, ARRAY_SIZE(ebt_ops_filter)); + unregister_pernet_subsys(&frame_filter_net_ops); +} + +module_init(ebtable_filter_init); +module_exit(ebtable_filter_fini); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c new file mode 100644 index 00000000..6dc2f878 --- /dev/null +++ b/net/bridge/netfilter/ebtable_nat.c @@ -0,0 +1,139 @@ +/* + * ebtable_nat + * + * Authors: + * Bart De Schuymer + * + * April, 2002 + * + */ + +#include +#include + +#define NAT_VALID_HOOKS ((1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT) | \ + (1 << NF_BR_POST_ROUTING)) + +static struct ebt_entries initial_chains[] = +{ + { + .name = "PREROUTING", + .policy = EBT_ACCEPT, + }, + { + .name = "OUTPUT", + .policy = EBT_ACCEPT, + }, + { + .name = "POSTROUTING", + .policy = EBT_ACCEPT, + } +}; + +static struct ebt_replace_kernel initial_table = +{ + .name = "nat", + .valid_hooks = NAT_VALID_HOOKS, + .entries_size = 3 * sizeof(struct ebt_entries), + .hook_entry = { + [NF_BR_PRE_ROUTING] = &initial_chains[0], + [NF_BR_LOCAL_OUT] = &initial_chains[1], + [NF_BR_POST_ROUTING] = &initial_chains[2], + }, + .entries = (char *)initial_chains, +}; + +static int check(const struct ebt_table_info *info, unsigned int valid_hooks) +{ + if (valid_hooks & ~NAT_VALID_HOOKS) + return -EINVAL; + return 0; +} + +static struct ebt_table frame_nat = +{ + .name = "nat", + .table = &initial_table, + .valid_hooks = NAT_VALID_HOOKS, + .check = check, + .me = THIS_MODULE, +}; + +static unsigned int +ebt_nat_in(unsigned int hook, struct sk_buff *skb, const struct net_device *in + , const struct net_device *out, int (*okfn)(struct sk_buff *)) +{ + return ebt_do_table(hook, skb, in, out, dev_net(in)->xt.frame_nat); +} + +static unsigned int +ebt_nat_out(unsigned int hook, struct sk_buff *skb, const struct net_device *in + , const struct 
net_device *out, int (*okfn)(struct sk_buff *)) +{ + return ebt_do_table(hook, skb, in, out, dev_net(out)->xt.frame_nat); +} + +static struct nf_hook_ops ebt_ops_nat[] __read_mostly = { + { + .hook = ebt_nat_out, + .owner = THIS_MODULE, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_LOCAL_OUT, + .priority = NF_BR_PRI_NAT_DST_OTHER, + }, + { + .hook = ebt_nat_out, + .owner = THIS_MODULE, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_POST_ROUTING, + .priority = NF_BR_PRI_NAT_SRC, + }, + { + .hook = ebt_nat_in, + .owner = THIS_MODULE, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_PRE_ROUTING, + .priority = NF_BR_PRI_NAT_DST_BRIDGED, + }, +}; + +static int __net_init frame_nat_net_init(struct net *net) +{ + net->xt.frame_nat = ebt_register_table(net, &frame_nat); + if (IS_ERR(net->xt.frame_nat)) + return PTR_ERR(net->xt.frame_nat); + return 0; +} + +static void __net_exit frame_nat_net_exit(struct net *net) +{ + ebt_unregister_table(net, net->xt.frame_nat); +} + +static struct pernet_operations frame_nat_net_ops = { + .init = frame_nat_net_init, + .exit = frame_nat_net_exit, +}; + +static int __init ebtable_nat_init(void) +{ + int ret; + + ret = register_pernet_subsys(&frame_nat_net_ops); + if (ret < 0) + return ret; + ret = nf_register_hooks(ebt_ops_nat, ARRAY_SIZE(ebt_ops_nat)); + if (ret < 0) + unregister_pernet_subsys(&frame_nat_net_ops); + return ret; +} + +static void __exit ebtable_nat_fini(void) +{ + nf_unregister_hooks(ebt_ops_nat, ARRAY_SIZE(ebt_ops_nat)); + unregister_pernet_subsys(&frame_nat_net_ops); +} + +module_init(ebtable_nat_init); +module_exit(ebtable_nat_fini); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c new file mode 100644 index 00000000..2b5ca1a0 --- /dev/null +++ b/net/bridge/netfilter/ebtables.c @@ -0,0 +1,2416 @@ +/* + * ebtables + * + * Author: + * Bart De Schuymer + * + * ebtables.c,v 2.0, July, 2002 + * + * This code is stongly inspired on the iptables code which is + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +/* needed for logical [in,out]-dev filtering */ +#include "../br_private.h" + +#define BUGPRINT(format, args...) printk("kernel msg: ebtables bug: please "\ + "report to author: "format, ## args) +/* #define BUGPRINT(format, args...) 
*/ + +/* + * Each cpu has its own set of counters, so there is no need for write_lock in + * the softirq + * For reading or updating the counters, the user context needs to + * get a write_lock + */ + +/* The size of each set of counters is altered to get cache alignment */ +#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) +#define COUNTER_OFFSET(n) (SMP_ALIGN(n * sizeof(struct ebt_counter))) +#define COUNTER_BASE(c, n, cpu) ((struct ebt_counter *)(((char *)c) + \ + COUNTER_OFFSET(n) * cpu)) + + + +static DEFINE_MUTEX(ebt_mutex); + +#ifdef CONFIG_COMPAT +static void ebt_standard_compat_from_user(void *dst, const void *src) +{ + int v = *(compat_int_t *)src; + + if (v >= 0) + v += xt_compat_calc_jump(NFPROTO_BRIDGE, v); + memcpy(dst, &v, sizeof(v)); +} + +static int ebt_standard_compat_to_user(void __user *dst, const void *src) +{ + compat_int_t cv = *(int *)src; + + if (cv >= 0) + cv -= xt_compat_calc_jump(NFPROTO_BRIDGE, cv); + return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; +} +#endif + + +static struct xt_target ebt_standard_target = { + .name = "standard", + .revision = 0, + .family = NFPROTO_BRIDGE, + .targetsize = sizeof(int), +#ifdef CONFIG_COMPAT + .compatsize = sizeof(compat_int_t), + .compat_from_user = ebt_standard_compat_from_user, + .compat_to_user = ebt_standard_compat_to_user, +#endif +}; + +static inline int +ebt_do_watcher(const struct ebt_entry_watcher *w, struct sk_buff *skb, + struct xt_action_param *par) +{ + par->target = w->u.watcher; + par->targinfo = w->data; + w->u.watcher->target(skb, par); + /* watchers don't give a verdict */ + return 0; +} + +static inline int +ebt_do_match(struct ebt_entry_match *m, const struct sk_buff *skb, + struct xt_action_param *par) +{ + par->match = m->u.match; + par->matchinfo = m->data; + return m->u.match->match(skb, par) ? 
EBT_MATCH : EBT_NOMATCH; +} + +static inline int +ebt_dev_check(const char *entry, const struct net_device *device) +{ + int i = 0; + const char *devname; + + if (*entry == '\0') + return 0; + if (!device) + return 1; + devname = device->name; + /* 1 is the wildcard token */ + while (entry[i] != '\0' && entry[i] != 1 && entry[i] == devname[i]) + i++; + return (devname[i] != entry[i] && entry[i] != 1); +} + +#define FWINV2(bool,invflg) ((bool) ^ !!(e->invflags & invflg)) +/* process standard matches */ +static inline int +ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, + const struct net_device *in, const struct net_device *out) +{ + const struct ethhdr *h = eth_hdr(skb); + const struct net_bridge_port *p; + __be16 ethproto; + int verdict, i; + + if (vlan_tx_tag_present(skb)) + ethproto = htons(ETH_P_8021Q); + else + ethproto = h->h_proto; + + if (e->bitmask & EBT_802_3) { + if (FWINV2(ntohs(ethproto) >= 1536, EBT_IPROTO)) + return 1; + } else if (!(e->bitmask & EBT_NOPROTO) && + FWINV2(e->ethproto != ethproto, EBT_IPROTO)) + return 1; + + if (FWINV2(ebt_dev_check(e->in, in), EBT_IIN)) + return 1; + if (FWINV2(ebt_dev_check(e->out, out), EBT_IOUT)) + return 1; + /* rcu_read_lock()ed by nf_hook_slow */ + if (in && (p = br_port_get_rcu(in)) != NULL && + FWINV2(ebt_dev_check(e->logical_in, p->br->dev), EBT_ILOGICALIN)) + return 1; + if (out && (p = br_port_get_rcu(out)) != NULL && + FWINV2(ebt_dev_check(e->logical_out, p->br->dev), EBT_ILOGICALOUT)) + return 1; + + if (e->bitmask & EBT_SOURCEMAC) { + verdict = 0; + for (i = 0; i < 6; i++) + verdict |= (h->h_source[i] ^ e->sourcemac[i]) & + e->sourcemsk[i]; + if (FWINV2(verdict != 0, EBT_ISOURCE) ) + return 1; + } + if (e->bitmask & EBT_DESTMAC) { + verdict = 0; + for (i = 0; i < 6; i++) + verdict |= (h->h_dest[i] ^ e->destmac[i]) & + e->destmsk[i]; + if (FWINV2(verdict != 0, EBT_IDEST) ) + return 1; + } + return 0; +} + +static inline __pure +struct ebt_entry *ebt_next_entry(const struct ebt_entry *entry) +{ + return (void *)entry + entry->next_offset; +} + +/* Do some firewalling */ +unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + struct ebt_table *table) +{ + int i, nentries; + struct ebt_entry *point; + struct ebt_counter *counter_base, *cb_base; + const struct ebt_entry_target *t; + int verdict, sp = 0; + struct ebt_chainstack *cs; + struct ebt_entries *chaininfo; + const char *base; + const struct ebt_table_info *private; + struct xt_action_param acpar; + + acpar.family = NFPROTO_BRIDGE; + acpar.in = in; + acpar.out = out; + acpar.hotdrop = false; + acpar.hooknum = hook; + + read_lock_bh(&table->lock); + private = table->private; + cb_base = COUNTER_BASE(private->counters, private->nentries, + smp_processor_id()); + if (private->chainstack) + cs = private->chainstack[smp_processor_id()]; + else + cs = NULL; + chaininfo = private->hook_entry[hook]; + nentries = private->hook_entry[hook]->nentries; + point = (struct ebt_entry *)(private->hook_entry[hook]->data); + counter_base = cb_base + private->hook_entry[hook]->counter_offset; + /* base for chain jumps */ + base = private->entries; + i = 0; + while (i < nentries) { + if (ebt_basic_match(point, skb, in, out)) + goto letscontinue; + + if (EBT_MATCH_ITERATE(point, ebt_do_match, skb, &acpar) != 0) + goto letscontinue; + if (acpar.hotdrop) { + read_unlock_bh(&table->lock); + return NF_DROP; + } + + /* increase counter */ + (*(counter_base + i)).pcnt++; + (*(counter_base + i)).bcnt += 
skb->len; + + /* these should only watch: not modify, nor tell us + what to do with the packet */ + EBT_WATCHER_ITERATE(point, ebt_do_watcher, skb, &acpar); + + t = (struct ebt_entry_target *) + (((char *)point) + point->target_offset); + /* standard target */ + if (!t->u.target->target) + verdict = ((struct ebt_standard_target *)t)->verdict; + else { + acpar.target = t->u.target; + acpar.targinfo = t->data; + verdict = t->u.target->target(skb, &acpar); + } + if (verdict == EBT_ACCEPT) { + read_unlock_bh(&table->lock); + return NF_ACCEPT; + } + if (verdict == EBT_DROP) { + read_unlock_bh(&table->lock); + return NF_DROP; + } + if (verdict == EBT_RETURN) { +letsreturn: +#ifdef CONFIG_NETFILTER_DEBUG + if (sp == 0) { + BUGPRINT("RETURN on base chain"); + /* act like this is EBT_CONTINUE */ + goto letscontinue; + } +#endif + sp--; + /* put all the local variables right */ + i = cs[sp].n; + chaininfo = cs[sp].chaininfo; + nentries = chaininfo->nentries; + point = cs[sp].e; + counter_base = cb_base + + chaininfo->counter_offset; + continue; + } + if (verdict == EBT_CONTINUE) + goto letscontinue; +#ifdef CONFIG_NETFILTER_DEBUG + if (verdict < 0) { + BUGPRINT("bogus standard verdict\n"); + read_unlock_bh(&table->lock); + return NF_DROP; + } +#endif + /* jump to a udc */ + cs[sp].n = i + 1; + cs[sp].chaininfo = chaininfo; + cs[sp].e = ebt_next_entry(point); + i = 0; + chaininfo = (struct ebt_entries *) (base + verdict); +#ifdef CONFIG_NETFILTER_DEBUG + if (chaininfo->distinguisher) { + BUGPRINT("jump to non-chain\n"); + read_unlock_bh(&table->lock); + return NF_DROP; + } +#endif + nentries = chaininfo->nentries; + point = (struct ebt_entry *)chaininfo->data; + counter_base = cb_base + chaininfo->counter_offset; + sp++; + continue; +letscontinue: + point = ebt_next_entry(point); + i++; + } + + /* I actually like this :) */ + if (chaininfo->policy == EBT_RETURN) + goto letsreturn; + if (chaininfo->policy == EBT_ACCEPT) { + read_unlock_bh(&table->lock); + return NF_ACCEPT; + } + read_unlock_bh(&table->lock); + return NF_DROP; +} + +/* If it succeeds, returns element and locks mutex */ +static inline void * +find_inlist_lock_noload(struct list_head *head, const char *name, int *error, + struct mutex *mutex) +{ + struct { + struct list_head list; + char name[EBT_FUNCTION_MAXNAMELEN]; + } *e; + + *error = mutex_lock_interruptible(mutex); + if (*error != 0) + return NULL; + + list_for_each_entry(e, head, list) { + if (strcmp(e->name, name) == 0) + return e; + } + *error = -ENOENT; + mutex_unlock(mutex); + return NULL; +} + +static void * +find_inlist_lock(struct list_head *head, const char *name, const char *prefix, + int *error, struct mutex *mutex) +{ + return try_then_request_module( + find_inlist_lock_noload(head, name, error, mutex), + "%s%s", prefix, name); +} + +static inline struct ebt_table * +find_table_lock(struct net *net, const char *name, int *error, + struct mutex *mutex) +{ + return find_inlist_lock(&net->xt.tables[NFPROTO_BRIDGE], name, + "ebtable_", error, mutex); +} + +static inline int +ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par, + unsigned int *cnt) +{ + const struct ebt_entry *e = par->entryinfo; + struct xt_match *match; + size_t left = ((char *)e + e->watchers_offset) - (char *)m; + int ret; + + if (left < sizeof(struct ebt_entry_match) || + left - sizeof(struct ebt_entry_match) < m->match_size) + return -EINVAL; + + match = xt_request_find_match(NFPROTO_BRIDGE, m->u.name, 0); + if (IS_ERR(match)) + return PTR_ERR(match); + m->u.match = match; + + 
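+ /* validate the match's config blob via xt_check_match(); if that
+  * validation fails, the module reference taken by
+  * xt_request_find_match() above is dropped again */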
par->match = match; + par->matchinfo = m->data; + ret = xt_check_match(par, m->match_size, + e->ethproto, e->invflags & EBT_IPROTO); + if (ret < 0) { + module_put(match->me); + return ret; + } + + (*cnt)++; + return 0; +} + +static inline int +ebt_check_watcher(struct ebt_entry_watcher *w, struct xt_tgchk_param *par, + unsigned int *cnt) +{ + const struct ebt_entry *e = par->entryinfo; + struct xt_target *watcher; + size_t left = ((char *)e + e->target_offset) - (char *)w; + int ret; + + if (left < sizeof(struct ebt_entry_watcher) || + left - sizeof(struct ebt_entry_watcher) < w->watcher_size) + return -EINVAL; + + watcher = xt_request_find_target(NFPROTO_BRIDGE, w->u.name, 0); + if (IS_ERR(watcher)) + return PTR_ERR(watcher); + w->u.watcher = watcher; + + par->target = watcher; + par->targinfo = w->data; + ret = xt_check_target(par, w->watcher_size, + e->ethproto, e->invflags & EBT_IPROTO); + if (ret < 0) { + module_put(watcher->me); + return ret; + } + + (*cnt)++; + return 0; +} + +static int ebt_verify_pointers(const struct ebt_replace *repl, + struct ebt_table_info *newinfo) +{ + unsigned int limit = repl->entries_size; + unsigned int valid_hooks = repl->valid_hooks; + unsigned int offset = 0; + int i; + + for (i = 0; i < NF_BR_NUMHOOKS; i++) + newinfo->hook_entry[i] = NULL; + + newinfo->entries_size = repl->entries_size; + newinfo->nentries = repl->nentries; + + while (offset < limit) { + size_t left = limit - offset; + struct ebt_entry *e = (void *)newinfo->entries + offset; + + if (left < sizeof(unsigned int)) + break; + + for (i = 0; i < NF_BR_NUMHOOKS; i++) { + if ((valid_hooks & (1 << i)) == 0) + continue; + if ((char __user *)repl->hook_entry[i] == + repl->entries + offset) + break; + } + + if (i != NF_BR_NUMHOOKS || !(e->bitmask & EBT_ENTRY_OR_ENTRIES)) { + if (e->bitmask != 0) { + /* we make userspace set this right, + so there is no misunderstanding */ + BUGPRINT("EBT_ENTRY_OR_ENTRIES shouldn't be set " + "in distinguisher\n"); + return -EINVAL; + } + if (i != NF_BR_NUMHOOKS) + newinfo->hook_entry[i] = (struct ebt_entries *)e; + if (left < sizeof(struct ebt_entries)) + break; + offset += sizeof(struct ebt_entries); + } else { + if (left < sizeof(struct ebt_entry)) + break; + if (left < e->next_offset) + break; + if (e->next_offset < sizeof(struct ebt_entry)) + return -EINVAL; + offset += e->next_offset; + } + } + if (offset != limit) { + BUGPRINT("entries_size too small\n"); + return -EINVAL; + } + + /* check if all valid hooks have a chain */ + for (i = 0; i < NF_BR_NUMHOOKS; i++) { + if (!newinfo->hook_entry[i] && + (valid_hooks & (1 << i))) { + BUGPRINT("Valid hook without chain\n"); + return -EINVAL; + } + } + return 0; +} + +/* + * this one is very careful, as it is the first function + * to parse the userspace data + */ +static inline int +ebt_check_entry_size_and_hooks(const struct ebt_entry *e, + const struct ebt_table_info *newinfo, + unsigned int *n, unsigned int *cnt, + unsigned int *totalcnt, unsigned int *udc_cnt) +{ + int i; + + for (i = 0; i < NF_BR_NUMHOOKS; i++) { + if ((void *)e == (void *)newinfo->hook_entry[i]) + break; + } + /* beginning of a new chain + if i == NF_BR_NUMHOOKS it must be a user defined chain */ + if (i != NF_BR_NUMHOOKS || !e->bitmask) { + /* this checks if the previous chain has as many entries + as it said it has */ + if (*n != *cnt) { + BUGPRINT("nentries does not equal the nr of entries " + "in the chain\n"); + return -EINVAL; + } + if (((struct ebt_entries *)e)->policy != EBT_DROP && + ((struct ebt_entries *)e)->policy != EBT_ACCEPT) 
{ + /* only RETURN from udc */ + if (i != NF_BR_NUMHOOKS || + ((struct ebt_entries *)e)->policy != EBT_RETURN) { + BUGPRINT("bad policy\n"); + return -EINVAL; + } + } + if (i == NF_BR_NUMHOOKS) /* it's a user defined chain */ + (*udc_cnt)++; + if (((struct ebt_entries *)e)->counter_offset != *totalcnt) { + BUGPRINT("counter_offset != totalcnt"); + return -EINVAL; + } + *n = ((struct ebt_entries *)e)->nentries; + *cnt = 0; + return 0; + } + /* a plain old entry, heh */ + if (sizeof(struct ebt_entry) > e->watchers_offset || + e->watchers_offset > e->target_offset || + e->target_offset >= e->next_offset) { + BUGPRINT("entry offsets not in right order\n"); + return -EINVAL; + } + /* this is not checked anywhere else */ + if (e->next_offset - e->target_offset < sizeof(struct ebt_entry_target)) { + BUGPRINT("target size too small\n"); + return -EINVAL; + } + (*cnt)++; + (*totalcnt)++; + return 0; +} + +struct ebt_cl_stack +{ + struct ebt_chainstack cs; + int from; + unsigned int hookmask; +}; + +/* + * we need these positions to check that the jumps to a different part of the + * entries is a jump to the beginning of a new chain. + */ +static inline int +ebt_get_udc_positions(struct ebt_entry *e, struct ebt_table_info *newinfo, + unsigned int *n, struct ebt_cl_stack *udc) +{ + int i; + + /* we're only interested in chain starts */ + if (e->bitmask) + return 0; + for (i = 0; i < NF_BR_NUMHOOKS; i++) { + if (newinfo->hook_entry[i] == (struct ebt_entries *)e) + break; + } + /* only care about udc */ + if (i != NF_BR_NUMHOOKS) + return 0; + + udc[*n].cs.chaininfo = (struct ebt_entries *)e; + /* these initialisations are depended on later in check_chainloops() */ + udc[*n].cs.n = 0; + udc[*n].hookmask = 0; + + (*n)++; + return 0; +} + +static inline int +ebt_cleanup_match(struct ebt_entry_match *m, struct net *net, unsigned int *i) +{ + struct xt_mtdtor_param par; + + if (i && (*i)-- == 0) + return 1; + + par.net = net; + par.match = m->u.match; + par.matchinfo = m->data; + par.family = NFPROTO_BRIDGE; + if (par.match->destroy != NULL) + par.match->destroy(&par); + module_put(par.match->me); + return 0; +} + +static inline int +ebt_cleanup_watcher(struct ebt_entry_watcher *w, struct net *net, unsigned int *i) +{ + struct xt_tgdtor_param par; + + if (i && (*i)-- == 0) + return 1; + + par.net = net; + par.target = w->u.watcher; + par.targinfo = w->data; + par.family = NFPROTO_BRIDGE; + if (par.target->destroy != NULL) + par.target->destroy(&par); + module_put(par.target->me); + return 0; +} + +static inline int +ebt_cleanup_entry(struct ebt_entry *e, struct net *net, unsigned int *cnt) +{ + struct xt_tgdtor_param par; + struct ebt_entry_target *t; + + if (e->bitmask == 0) + return 0; + /* we're done */ + if (cnt && (*cnt)-- == 0) + return 1; + EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, NULL); + EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, NULL); + t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); + + par.net = net; + par.target = t->u.target; + par.targinfo = t->data; + par.family = NFPROTO_BRIDGE; + if (par.target->destroy != NULL) + par.target->destroy(&par); + module_put(par.target->me); + return 0; +} + +static inline int +ebt_check_entry(struct ebt_entry *e, struct net *net, + const struct ebt_table_info *newinfo, + const char *name, unsigned int *cnt, + struct ebt_cl_stack *cl_s, unsigned int udc_cnt) +{ + struct ebt_entry_target *t; + struct xt_target *target; + unsigned int i, j, hook = 0, hookmask = 0; + size_t gap; + int ret; + struct xt_mtchk_param mtpar; + struct 
xt_tgchk_param tgpar; + + /* don't mess with the struct ebt_entries */ + if (e->bitmask == 0) + return 0; + + if (e->bitmask & ~EBT_F_MASK) { + BUGPRINT("Unknown flag for bitmask\n"); + return -EINVAL; + } + if (e->invflags & ~EBT_INV_MASK) { + BUGPRINT("Unknown flag for inv bitmask\n"); + return -EINVAL; + } + if ( (e->bitmask & EBT_NOPROTO) && (e->bitmask & EBT_802_3) ) { + BUGPRINT("NOPROTO & 802_3 not allowed\n"); + return -EINVAL; + } + /* what hook do we belong to? */ + for (i = 0; i < NF_BR_NUMHOOKS; i++) { + if (!newinfo->hook_entry[i]) + continue; + if ((char *)newinfo->hook_entry[i] < (char *)e) + hook = i; + else + break; + } + /* (1 << NF_BR_NUMHOOKS) tells the check functions the rule is on + a base chain */ + if (i < NF_BR_NUMHOOKS) + hookmask = (1 << hook) | (1 << NF_BR_NUMHOOKS); + else { + for (i = 0; i < udc_cnt; i++) + if ((char *)(cl_s[i].cs.chaininfo) > (char *)e) + break; + if (i == 0) + hookmask = (1 << hook) | (1 << NF_BR_NUMHOOKS); + else + hookmask = cl_s[i - 1].hookmask; + } + i = 0; + + mtpar.net = tgpar.net = net; + mtpar.table = tgpar.table = name; + mtpar.entryinfo = tgpar.entryinfo = e; + mtpar.hook_mask = tgpar.hook_mask = hookmask; + mtpar.family = tgpar.family = NFPROTO_BRIDGE; + ret = EBT_MATCH_ITERATE(e, ebt_check_match, &mtpar, &i); + if (ret != 0) + goto cleanup_matches; + j = 0; + ret = EBT_WATCHER_ITERATE(e, ebt_check_watcher, &tgpar, &j); + if (ret != 0) + goto cleanup_watchers; + t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); + gap = e->next_offset - e->target_offset; + + target = xt_request_find_target(NFPROTO_BRIDGE, t->u.name, 0); + if (IS_ERR(target)) { + ret = PTR_ERR(target); + goto cleanup_watchers; + } + + t->u.target = target; + if (t->u.target == &ebt_standard_target) { + if (gap < sizeof(struct ebt_standard_target)) { + BUGPRINT("Standard target size too big\n"); + ret = -EFAULT; + goto cleanup_watchers; + } + if (((struct ebt_standard_target *)t)->verdict < + -NUM_STANDARD_TARGETS) { + BUGPRINT("Invalid standard target\n"); + ret = -EFAULT; + goto cleanup_watchers; + } + } else if (t->target_size > gap - sizeof(struct ebt_entry_target)) { + module_put(t->u.target->me); + ret = -EFAULT; + goto cleanup_watchers; + } + + tgpar.target = target; + tgpar.targinfo = t->data; + ret = xt_check_target(&tgpar, t->target_size, + e->ethproto, e->invflags & EBT_IPROTO); + if (ret < 0) { + module_put(target->me); + goto cleanup_watchers; + } + (*cnt)++; + return 0; +cleanup_watchers: + EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, &j); +cleanup_matches: + EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, &i); + return ret; +} + +/* + * checks for loops and sets the hook mask for udc + * the hook mask for udc tells us from which base chains the udc can be + * accessed. 
This mask is a parameter to the check() functions of the extensions + */ +static int check_chainloops(const struct ebt_entries *chain, struct ebt_cl_stack *cl_s, + unsigned int udc_cnt, unsigned int hooknr, char *base) +{ + int i, chain_nr = -1, pos = 0, nentries = chain->nentries, verdict; + const struct ebt_entry *e = (struct ebt_entry *)chain->data; + const struct ebt_entry_target *t; + + while (pos < nentries || chain_nr != -1) { + /* end of udc, go back one 'recursion' step */ + if (pos == nentries) { + /* put back values of the time when this chain was called */ + e = cl_s[chain_nr].cs.e; + if (cl_s[chain_nr].from != -1) + nentries = + cl_s[cl_s[chain_nr].from].cs.chaininfo->nentries; + else + nentries = chain->nentries; + pos = cl_s[chain_nr].cs.n; + /* make sure we won't see a loop that isn't one */ + cl_s[chain_nr].cs.n = 0; + chain_nr = cl_s[chain_nr].from; + if (pos == nentries) + continue; + } + t = (struct ebt_entry_target *) + (((char *)e) + e->target_offset); + if (strcmp(t->u.name, EBT_STANDARD_TARGET)) + goto letscontinue; + if (e->target_offset + sizeof(struct ebt_standard_target) > + e->next_offset) { + BUGPRINT("Standard target size too big\n"); + return -1; + } + verdict = ((struct ebt_standard_target *)t)->verdict; + if (verdict >= 0) { /* jump to another chain */ + struct ebt_entries *hlp2 = + (struct ebt_entries *)(base + verdict); + for (i = 0; i < udc_cnt; i++) + if (hlp2 == cl_s[i].cs.chaininfo) + break; + /* bad destination or loop */ + if (i == udc_cnt) { + BUGPRINT("bad destination\n"); + return -1; + } + if (cl_s[i].cs.n) { + BUGPRINT("loop\n"); + return -1; + } + if (cl_s[i].hookmask & (1 << hooknr)) + goto letscontinue; + /* this can't be 0, so the loop test is correct */ + cl_s[i].cs.n = pos + 1; + pos = 0; + cl_s[i].cs.e = ebt_next_entry(e); + e = (struct ebt_entry *)(hlp2->data); + nentries = hlp2->nentries; + cl_s[i].from = chain_nr; + chain_nr = i; + /* this udc is accessible from the base chain for hooknr */ + cl_s[i].hookmask |= (1 << hooknr); + continue; + } +letscontinue: + e = ebt_next_entry(e); + pos++; + } + return 0; +} + +/* do the parsing of the table/chains/entries/matches/watchers/targets, heh */ +static int translate_table(struct net *net, const char *name, + struct ebt_table_info *newinfo) +{ + unsigned int i, j, k, udc_cnt; + int ret; + struct ebt_cl_stack *cl_s = NULL; /* used in the checking for chain loops */ + + i = 0; + while (i < NF_BR_NUMHOOKS && !newinfo->hook_entry[i]) + i++; + if (i == NF_BR_NUMHOOKS) { + BUGPRINT("No valid hooks specified\n"); + return -EINVAL; + } + if (newinfo->hook_entry[i] != (struct ebt_entries *)newinfo->entries) { + BUGPRINT("Chains don't start at beginning\n"); + return -EINVAL; + } + /* make sure chains are ordered after each other in same order + as their corresponding hooks */ + for (j = i + 1; j < NF_BR_NUMHOOKS; j++) { + if (!newinfo->hook_entry[j]) + continue; + if (newinfo->hook_entry[j] <= newinfo->hook_entry[i]) { + BUGPRINT("Hook order must be followed\n"); + return -EINVAL; + } + i = j; + } + + /* do some early checkings and initialize some things */ + i = 0; /* holds the expected nr. of entries for the chain */ + j = 0; /* holds the up to now counted entries for the chain */ + k = 0; /* holds the total nr. of entries, should equal + newinfo->nentries afterwards */ + udc_cnt = 0; /* will hold the nr. 
of user defined chains (udc) */ + ret = EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, + ebt_check_entry_size_and_hooks, newinfo, + &i, &j, &k, &udc_cnt); + + if (ret != 0) + return ret; + + if (i != j) { + BUGPRINT("nentries does not equal the nr of entries in the " + "(last) chain\n"); + return -EINVAL; + } + if (k != newinfo->nentries) { + BUGPRINT("Total nentries is wrong\n"); + return -EINVAL; + } + + /* get the location of the udc, put them in an array + while we're at it, allocate the chainstack */ + if (udc_cnt) { + /* this will get free'd in do_replace()/ebt_register_table() + if an error occurs */ + newinfo->chainstack = + vmalloc(nr_cpu_ids * sizeof(*(newinfo->chainstack))); + if (!newinfo->chainstack) + return -ENOMEM; + for_each_possible_cpu(i) { + newinfo->chainstack[i] = + vmalloc(udc_cnt * sizeof(*(newinfo->chainstack[0]))); + if (!newinfo->chainstack[i]) { + while (i) + vfree(newinfo->chainstack[--i]); + vfree(newinfo->chainstack); + newinfo->chainstack = NULL; + return -ENOMEM; + } + } + + cl_s = vmalloc(udc_cnt * sizeof(*cl_s)); + if (!cl_s) + return -ENOMEM; + i = 0; /* the i'th udc */ + EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, + ebt_get_udc_positions, newinfo, &i, cl_s); + /* sanity check */ + if (i != udc_cnt) { + BUGPRINT("i != udc_cnt\n"); + vfree(cl_s); + return -EFAULT; + } + } + + /* Check for loops */ + for (i = 0; i < NF_BR_NUMHOOKS; i++) + if (newinfo->hook_entry[i]) + if (check_chainloops(newinfo->hook_entry[i], + cl_s, udc_cnt, i, newinfo->entries)) { + vfree(cl_s); + return -EINVAL; + } + + /* we now know the following (along with E=mc²): + - the nr of entries in each chain is right + - the size of the allocated space is right + - all valid hooks have a corresponding chain + - there are no loops + - wrong data can still be on the level of a single entry + - could be there are jumps to places that are not the + beginning of a chain. This can only occur in chains that + are not accessible from any base chains, so we don't care. 
*/ + + /* used to know what we need to clean up if something goes wrong */ + i = 0; + ret = EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, + ebt_check_entry, net, newinfo, name, &i, cl_s, udc_cnt); + if (ret != 0) { + EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, + ebt_cleanup_entry, net, &i); + } + vfree(cl_s); + return ret; +} + +/* called under write_lock */ +static void get_counters(const struct ebt_counter *oldcounters, + struct ebt_counter *counters, unsigned int nentries) +{ + int i, cpu; + struct ebt_counter *counter_base; + + /* counters of cpu 0 */ + memcpy(counters, oldcounters, + sizeof(struct ebt_counter) * nentries); + + /* add other counters to those of cpu 0 */ + for_each_possible_cpu(cpu) { + if (cpu == 0) + continue; + counter_base = COUNTER_BASE(oldcounters, nentries, cpu); + for (i = 0; i < nentries; i++) { + counters[i].pcnt += counter_base[i].pcnt; + counters[i].bcnt += counter_base[i].bcnt; + } + } +} + +static int do_replace_finish(struct net *net, struct ebt_replace *repl, + struct ebt_table_info *newinfo) +{ + int ret, i; + struct ebt_counter *counterstmp = NULL; + /* used to be able to unlock earlier */ + struct ebt_table_info *table; + struct ebt_table *t; + + /* the user wants counters back + the check on the size is done later, when we have the lock */ + if (repl->num_counters) { + unsigned long size = repl->num_counters * sizeof(*counterstmp); + counterstmp = vmalloc(size); + if (!counterstmp) + return -ENOMEM; + } + + newinfo->chainstack = NULL; + ret = ebt_verify_pointers(repl, newinfo); + if (ret != 0) + goto free_counterstmp; + + ret = translate_table(net, repl->name, newinfo); + + if (ret != 0) + goto free_counterstmp; + + t = find_table_lock(net, repl->name, &ret, &ebt_mutex); + if (!t) { + ret = -ENOENT; + goto free_iterate; + } + + /* the table doesn't like it */ + if (t->check && (ret = t->check(newinfo, repl->valid_hooks))) + goto free_unlock; + + if (repl->num_counters && repl->num_counters != t->private->nentries) { + BUGPRINT("Wrong nr. of counters requested\n"); + ret = -EINVAL; + goto free_unlock; + } + + /* we have the mutex lock, so no danger in reading this pointer */ + table = t->private; + /* make sure the table can only be rmmod'ed if it contains no rules */ + if (!table->nentries && newinfo->nentries && !try_module_get(t->me)) { + ret = -ENOENT; + goto free_unlock; + } else if (table->nentries && !newinfo->nentries) + module_put(t->me); + /* we need an atomic snapshot of the counters */ + write_lock_bh(&t->lock); + if (repl->num_counters) + get_counters(t->private->counters, counterstmp, + t->private->nentries); + + t->private = newinfo; + write_unlock_bh(&t->lock); + mutex_unlock(&ebt_mutex); + /* so, a user can change the chains while having messed up her counter + allocation. Only reason why this is done is because this way the lock + is held only once, while this doesn't bring the kernel into a + dangerous state. 
*/ + if (repl->num_counters && + copy_to_user(repl->counters, counterstmp, + repl->num_counters * sizeof(struct ebt_counter))) { + ret = -EFAULT; + } + else + ret = 0; + + /* decrease module count and free resources */ + EBT_ENTRY_ITERATE(table->entries, table->entries_size, + ebt_cleanup_entry, net, NULL); + + vfree(table->entries); + if (table->chainstack) { + for_each_possible_cpu(i) + vfree(table->chainstack[i]); + vfree(table->chainstack); + } + vfree(table); + + vfree(counterstmp); + return ret; + +free_unlock: + mutex_unlock(&ebt_mutex); +free_iterate: + EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, + ebt_cleanup_entry, net, NULL); +free_counterstmp: + vfree(counterstmp); + /* can be initialized in translate_table() */ + if (newinfo->chainstack) { + for_each_possible_cpu(i) + vfree(newinfo->chainstack[i]); + vfree(newinfo->chainstack); + } + return ret; +} + +/* replace the table */ +static int do_replace(struct net *net, const void __user *user, + unsigned int len) +{ + int ret, countersize; + struct ebt_table_info *newinfo; + struct ebt_replace tmp; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + if (len != sizeof(tmp) + tmp.entries_size) { + BUGPRINT("Wrong len argument\n"); + return -EINVAL; + } + + if (tmp.entries_size == 0) { + BUGPRINT("Entries_size never zero\n"); + return -EINVAL; + } + /* overflow check */ + if (tmp.nentries >= ((INT_MAX - sizeof(struct ebt_table_info)) / + NR_CPUS - SMP_CACHE_BYTES) / sizeof(struct ebt_counter)) + return -ENOMEM; + if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter)) + return -ENOMEM; + + tmp.name[sizeof(tmp.name) - 1] = 0; + + countersize = COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids; + newinfo = vmalloc(sizeof(*newinfo) + countersize); + if (!newinfo) + return -ENOMEM; + + if (countersize) + memset(newinfo->counters, 0, countersize); + + newinfo->entries = vmalloc(tmp.entries_size); + if (!newinfo->entries) { + ret = -ENOMEM; + goto free_newinfo; + } + if (copy_from_user( + newinfo->entries, tmp.entries, tmp.entries_size) != 0) { + BUGPRINT("Couldn't copy entries from userspace\n"); + ret = -EFAULT; + goto free_entries; + } + + ret = do_replace_finish(net, &tmp, newinfo); + if (ret == 0) + return ret; +free_entries: + vfree(newinfo->entries); +free_newinfo: + vfree(newinfo); + return ret; +} + +struct ebt_table * +ebt_register_table(struct net *net, const struct ebt_table *input_table) +{ + struct ebt_table_info *newinfo; + struct ebt_table *t, *table; + struct ebt_replace_kernel *repl; + int ret, i, countersize; + void *p; + + if (input_table == NULL || (repl = input_table->table) == NULL || + repl->entries == NULL || repl->entries_size == 0 || + repl->counters != NULL || input_table->private != NULL) { + BUGPRINT("Bad table data for ebt_register_table!!!\n"); + return ERR_PTR(-EINVAL); + } + + /* Don't add one table to multiple lists. 
*/ + table = kmemdup(input_table, sizeof(struct ebt_table), GFP_KERNEL); + if (!table) { + ret = -ENOMEM; + goto out; + } + + countersize = COUNTER_OFFSET(repl->nentries) * nr_cpu_ids; + newinfo = vmalloc(sizeof(*newinfo) + countersize); + ret = -ENOMEM; + if (!newinfo) + goto free_table; + + p = vmalloc(repl->entries_size); + if (!p) + goto free_newinfo; + + memcpy(p, repl->entries, repl->entries_size); + newinfo->entries = p; + + newinfo->entries_size = repl->entries_size; + newinfo->nentries = repl->nentries; + + if (countersize) + memset(newinfo->counters, 0, countersize); + + /* fill in newinfo and parse the entries */ + newinfo->chainstack = NULL; + for (i = 0; i < NF_BR_NUMHOOKS; i++) { + if ((repl->valid_hooks & (1 << i)) == 0) + newinfo->hook_entry[i] = NULL; + else + newinfo->hook_entry[i] = p + + ((char *)repl->hook_entry[i] - repl->entries); + } + ret = translate_table(net, repl->name, newinfo); + if (ret != 0) { + BUGPRINT("Translate_table failed\n"); + goto free_chainstack; + } + + if (table->check && table->check(newinfo, table->valid_hooks)) { + BUGPRINT("The table doesn't like its own initial data, lol\n"); + return ERR_PTR(-EINVAL); + } + + table->private = newinfo; + rwlock_init(&table->lock); + ret = mutex_lock_interruptible(&ebt_mutex); + if (ret != 0) + goto free_chainstack; + + list_for_each_entry(t, &net->xt.tables[NFPROTO_BRIDGE], list) { + if (strcmp(t->name, table->name) == 0) { + ret = -EEXIST; + BUGPRINT("Table name already exists\n"); + goto free_unlock; + } + } + + /* Hold a reference count if the chains aren't empty */ + if (newinfo->nentries && !try_module_get(table->me)) { + ret = -ENOENT; + goto free_unlock; + } + list_add(&table->list, &net->xt.tables[NFPROTO_BRIDGE]); + mutex_unlock(&ebt_mutex); + return table; +free_unlock: + mutex_unlock(&ebt_mutex); +free_chainstack: + if (newinfo->chainstack) { + for_each_possible_cpu(i) + vfree(newinfo->chainstack[i]); + vfree(newinfo->chainstack); + } + vfree(newinfo->entries); +free_newinfo: + vfree(newinfo); +free_table: + kfree(table); +out: + return ERR_PTR(ret); +} + +void ebt_unregister_table(struct net *net, struct ebt_table *table) +{ + int i; + + if (!table) { + BUGPRINT("Request to unregister NULL table!!!\n"); + return; + } + mutex_lock(&ebt_mutex); + list_del(&table->list); + mutex_unlock(&ebt_mutex); + EBT_ENTRY_ITERATE(table->private->entries, table->private->entries_size, + ebt_cleanup_entry, net, NULL); + if (table->private->nentries) + module_put(table->me); + vfree(table->private->entries); + if (table->private->chainstack) { + for_each_possible_cpu(i) + vfree(table->private->chainstack[i]); + vfree(table->private->chainstack); + } + vfree(table->private); + kfree(table); +} + +/* userspace just supplied us with counters */ +static int do_update_counters(struct net *net, const char *name, + struct ebt_counter __user *counters, + unsigned int num_counters, + const void __user *user, unsigned int len) +{ + int i, ret; + struct ebt_counter *tmp; + struct ebt_table *t; + + if (num_counters == 0) + return -EINVAL; + + tmp = vmalloc(num_counters * sizeof(*tmp)); + if (!tmp) + return -ENOMEM; + + t = find_table_lock(net, name, &ret, &ebt_mutex); + if (!t) + goto free_tmp; + + if (num_counters != t->private->nentries) { + BUGPRINT("Wrong nr of counters\n"); + ret = -EINVAL; + goto unlock_mutex; + } + + if (copy_from_user(tmp, counters, num_counters * sizeof(*counters))) { + ret = -EFAULT; + goto unlock_mutex; + } + + /* we want an atomic add of the counters */ + write_lock_bh(&t->lock); + + /* we add to 
the counters of the first cpu */ + for (i = 0; i < num_counters; i++) { + t->private->counters[i].pcnt += tmp[i].pcnt; + t->private->counters[i].bcnt += tmp[i].bcnt; + } + + write_unlock_bh(&t->lock); + ret = 0; +unlock_mutex: + mutex_unlock(&ebt_mutex); +free_tmp: + vfree(tmp); + return ret; +} + +static int update_counters(struct net *net, const void __user *user, + unsigned int len) +{ + struct ebt_replace hlp; + + if (copy_from_user(&hlp, user, sizeof(hlp))) + return -EFAULT; + + if (len != sizeof(hlp) + hlp.num_counters * sizeof(struct ebt_counter)) + return -EINVAL; + + return do_update_counters(net, hlp.name, hlp.counters, + hlp.num_counters, user, len); +} + +static inline int ebt_make_matchname(const struct ebt_entry_match *m, + const char *base, char __user *ubase) +{ + char __user *hlp = ubase + ((char *)m - base); + if (copy_to_user(hlp, m->u.match->name, EBT_FUNCTION_MAXNAMELEN)) + return -EFAULT; + return 0; +} + +static inline int ebt_make_watchername(const struct ebt_entry_watcher *w, + const char *base, char __user *ubase) +{ + char __user *hlp = ubase + ((char *)w - base); + if (copy_to_user(hlp , w->u.watcher->name, EBT_FUNCTION_MAXNAMELEN)) + return -EFAULT; + return 0; +} + +static inline int +ebt_make_names(struct ebt_entry *e, const char *base, char __user *ubase) +{ + int ret; + char __user *hlp; + const struct ebt_entry_target *t; + + if (e->bitmask == 0) + return 0; + + hlp = ubase + (((char *)e + e->target_offset) - base); + t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); + + ret = EBT_MATCH_ITERATE(e, ebt_make_matchname, base, ubase); + if (ret != 0) + return ret; + ret = EBT_WATCHER_ITERATE(e, ebt_make_watchername, base, ubase); + if (ret != 0) + return ret; + if (copy_to_user(hlp, t->u.target->name, EBT_FUNCTION_MAXNAMELEN)) + return -EFAULT; + return 0; +} + +static int copy_counters_to_user(struct ebt_table *t, + const struct ebt_counter *oldcounters, + void __user *user, unsigned int num_counters, + unsigned int nentries) +{ + struct ebt_counter *counterstmp; + int ret = 0; + + /* userspace might not need the counters */ + if (num_counters == 0) + return 0; + + if (num_counters != nentries) { + BUGPRINT("Num_counters wrong\n"); + return -EINVAL; + } + + counterstmp = vmalloc(nentries * sizeof(*counterstmp)); + if (!counterstmp) + return -ENOMEM; + + write_lock_bh(&t->lock); + get_counters(oldcounters, counterstmp, nentries); + write_unlock_bh(&t->lock); + + if (copy_to_user(user, counterstmp, + nentries * sizeof(struct ebt_counter))) + ret = -EFAULT; + vfree(counterstmp); + return ret; +} + +/* called with ebt_mutex locked */ +static int copy_everything_to_user(struct ebt_table *t, void __user *user, + const int *len, int cmd) +{ + struct ebt_replace tmp; + const struct ebt_counter *oldcounters; + unsigned int entries_size, nentries; + int ret; + char *entries; + + if (cmd == EBT_SO_GET_ENTRIES) { + entries_size = t->private->entries_size; + nentries = t->private->nentries; + entries = t->private->entries; + oldcounters = t->private->counters; + } else { + entries_size = t->table->entries_size; + nentries = t->table->nentries; + entries = t->table->entries; + oldcounters = t->table->counters; + } + + if (copy_from_user(&tmp, user, sizeof(tmp))) + return -EFAULT; + + if (*len != sizeof(struct ebt_replace) + entries_size + + (tmp.num_counters? 
nentries * sizeof(struct ebt_counter): 0)) + return -EINVAL; + + if (tmp.nentries != nentries) { + BUGPRINT("Nentries wrong\n"); + return -EINVAL; + } + + if (tmp.entries_size != entries_size) { + BUGPRINT("Wrong size\n"); + return -EINVAL; + } + + ret = copy_counters_to_user(t, oldcounters, tmp.counters, + tmp.num_counters, nentries); + if (ret) + return ret; + + if (copy_to_user(tmp.entries, entries, entries_size)) { + BUGPRINT("Couldn't copy entries to userspace\n"); + return -EFAULT; + } + /* set the match/watcher/target names right */ + return EBT_ENTRY_ITERATE(entries, entries_size, + ebt_make_names, entries, tmp.entries); +} + +static int do_ebt_set_ctl(struct sock *sk, + int cmd, void __user *user, unsigned int len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch(cmd) { + case EBT_SO_SET_ENTRIES: + ret = do_replace(sock_net(sk), user, len); + break; + case EBT_SO_SET_COUNTERS: + ret = update_counters(sock_net(sk), user, len); + break; + default: + ret = -EINVAL; + } + return ret; +} + +static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + int ret; + struct ebt_replace tmp; + struct ebt_table *t; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (copy_from_user(&tmp, user, sizeof(tmp))) + return -EFAULT; + + t = find_table_lock(sock_net(sk), tmp.name, &ret, &ebt_mutex); + if (!t) + return ret; + + switch(cmd) { + case EBT_SO_GET_INFO: + case EBT_SO_GET_INIT_INFO: + if (*len != sizeof(struct ebt_replace)){ + ret = -EINVAL; + mutex_unlock(&ebt_mutex); + break; + } + if (cmd == EBT_SO_GET_INFO) { + tmp.nentries = t->private->nentries; + tmp.entries_size = t->private->entries_size; + tmp.valid_hooks = t->valid_hooks; + } else { + tmp.nentries = t->table->nentries; + tmp.entries_size = t->table->entries_size; + tmp.valid_hooks = t->table->valid_hooks; + } + mutex_unlock(&ebt_mutex); + if (copy_to_user(user, &tmp, *len) != 0){ + BUGPRINT("c2u Didn't work\n"); + ret = -EFAULT; + break; + } + ret = 0; + break; + + case EBT_SO_GET_ENTRIES: + case EBT_SO_GET_INIT_ENTRIES: + ret = copy_everything_to_user(t, user, len, cmd); + mutex_unlock(&ebt_mutex); + break; + + default: + mutex_unlock(&ebt_mutex); + ret = -EINVAL; + } + + return ret; +} + +#ifdef CONFIG_COMPAT +/* 32 bit-userspace compatibility definitions. */ +struct compat_ebt_replace { + char name[EBT_TABLE_MAXNAMELEN]; + compat_uint_t valid_hooks; + compat_uint_t nentries; + compat_uint_t entries_size; + /* start of the chains */ + compat_uptr_t hook_entry[NF_BR_NUMHOOKS]; + /* nr of counters userspace expects back */ + compat_uint_t num_counters; + /* where the kernel will put the old counters. */ + compat_uptr_t counters; + compat_uptr_t entries; +}; + +/* struct ebt_entry_match, _target and _watcher have same layout */ +struct compat_ebt_entry_mwt { + union { + char name[EBT_FUNCTION_MAXNAMELEN]; + compat_uptr_t ptr; + } u; + compat_uint_t match_size; + compat_uint_t data[0]; +}; + +/* account for possible padding between match_size and ->data */ +static int ebt_compat_entry_padsize(void) +{ + BUILD_BUG_ON(XT_ALIGN(sizeof(struct ebt_entry_match)) < + COMPAT_XT_ALIGN(sizeof(struct compat_ebt_entry_mwt))); + return (int) XT_ALIGN(sizeof(struct ebt_entry_match)) - + COMPAT_XT_ALIGN(sizeof(struct compat_ebt_entry_mwt)); +} + +static int ebt_compat_match_offset(const struct xt_match *match, + unsigned int userlen) +{ + /* + * ebt_among needs special handling. The kernel .matchsize is + * set to -1 at registration time; at runtime an EBT_ALIGN()ed + * value is expected. 
+ * Example: userspace sends 4500, ebt_among.c wants 4504. + */ + if (unlikely(match->matchsize == -1)) + return XT_ALIGN(userlen) - COMPAT_XT_ALIGN(userlen); + return xt_compat_match_offset(match); +} + +static int compat_match_to_user(struct ebt_entry_match *m, void __user **dstptr, + unsigned int *size) +{ + const struct xt_match *match = m->u.match; + struct compat_ebt_entry_mwt __user *cm = *dstptr; + int off = ebt_compat_match_offset(match, m->match_size); + compat_uint_t msize = m->match_size - off; + + BUG_ON(off >= m->match_size); + + if (copy_to_user(cm->u.name, match->name, + strlen(match->name) + 1) || put_user(msize, &cm->match_size)) + return -EFAULT; + + if (match->compat_to_user) { + if (match->compat_to_user(cm->data, m->data)) + return -EFAULT; + } else if (copy_to_user(cm->data, m->data, msize)) + return -EFAULT; + + *size -= ebt_compat_entry_padsize() + off; + *dstptr = cm->data; + *dstptr += msize; + return 0; +} + +static int compat_target_to_user(struct ebt_entry_target *t, + void __user **dstptr, + unsigned int *size) +{ + const struct xt_target *target = t->u.target; + struct compat_ebt_entry_mwt __user *cm = *dstptr; + int off = xt_compat_target_offset(target); + compat_uint_t tsize = t->target_size - off; + + BUG_ON(off >= t->target_size); + + if (copy_to_user(cm->u.name, target->name, + strlen(target->name) + 1) || put_user(tsize, &cm->match_size)) + return -EFAULT; + + if (target->compat_to_user) { + if (target->compat_to_user(cm->data, t->data)) + return -EFAULT; + } else if (copy_to_user(cm->data, t->data, tsize)) + return -EFAULT; + + *size -= ebt_compat_entry_padsize() + off; + *dstptr = cm->data; + *dstptr += tsize; + return 0; +} + +static int compat_watcher_to_user(struct ebt_entry_watcher *w, + void __user **dstptr, + unsigned int *size) +{ + return compat_target_to_user((struct ebt_entry_target *)w, + dstptr, size); +} + +static int compat_copy_entry_to_user(struct ebt_entry *e, void __user **dstptr, + unsigned int *size) +{ + struct ebt_entry_target *t; + struct ebt_entry __user *ce; + u32 watchers_offset, target_offset, next_offset; + compat_uint_t origsize; + int ret; + + if (e->bitmask == 0) { + if (*size < sizeof(struct ebt_entries)) + return -EINVAL; + if (copy_to_user(*dstptr, e, sizeof(struct ebt_entries))) + return -EFAULT; + + *dstptr += sizeof(struct ebt_entries); + *size -= sizeof(struct ebt_entries); + return 0; + } + + if (*size < sizeof(*ce)) + return -EINVAL; + + ce = (struct ebt_entry __user *)*dstptr; + if (copy_to_user(ce, e, sizeof(*ce))) + return -EFAULT; + + origsize = *size; + *dstptr += sizeof(*ce); + + ret = EBT_MATCH_ITERATE(e, compat_match_to_user, dstptr, size); + if (ret) + return ret; + watchers_offset = e->watchers_offset - (origsize - *size); + + ret = EBT_WATCHER_ITERATE(e, compat_watcher_to_user, dstptr, size); + if (ret) + return ret; + target_offset = e->target_offset - (origsize - *size); + + t = (struct ebt_entry_target *) ((char *) e + e->target_offset); + + ret = compat_target_to_user(t, dstptr, size); + if (ret) + return ret; + next_offset = e->next_offset - (origsize - *size); + + if (put_user(watchers_offset, &ce->watchers_offset) || + put_user(target_offset, &ce->target_offset) || + put_user(next_offset, &ce->next_offset)) + return -EFAULT; + + *size -= sizeof(*ce); + return 0; +} + +static int compat_calc_match(struct ebt_entry_match *m, int *off) +{ + *off += ebt_compat_match_offset(m->u.match, m->match_size); + *off += ebt_compat_entry_padsize(); + return 0; +} + +static int compat_calc_watcher(struct 
ebt_entry_watcher *w, int *off) +{ + *off += xt_compat_target_offset(w->u.watcher); + *off += ebt_compat_entry_padsize(); + return 0; +} + +static int compat_calc_entry(const struct ebt_entry *e, + const struct ebt_table_info *info, + const void *base, + struct compat_ebt_replace *newinfo) +{ + const struct ebt_entry_target *t; + unsigned int entry_offset; + int off, ret, i; + + if (e->bitmask == 0) + return 0; + + off = 0; + entry_offset = (void *)e - base; + + EBT_MATCH_ITERATE(e, compat_calc_match, &off); + EBT_WATCHER_ITERATE(e, compat_calc_watcher, &off); + + t = (const struct ebt_entry_target *) ((char *) e + e->target_offset); + + off += xt_compat_target_offset(t->u.target); + off += ebt_compat_entry_padsize(); + + newinfo->entries_size -= off; + + ret = xt_compat_add_offset(NFPROTO_BRIDGE, entry_offset, off); + if (ret) + return ret; + + for (i = 0; i < NF_BR_NUMHOOKS; i++) { + const void *hookptr = info->hook_entry[i]; + if (info->hook_entry[i] && + (e < (struct ebt_entry *)(base - hookptr))) { + newinfo->hook_entry[i] -= off; + pr_debug("0x%08X -> 0x%08X\n", + newinfo->hook_entry[i] + off, + newinfo->hook_entry[i]); + } + } + + return 0; +} + + +static int compat_table_info(const struct ebt_table_info *info, + struct compat_ebt_replace *newinfo) +{ + unsigned int size = info->entries_size; + const void *entries = info->entries; + + newinfo->entries_size = size; + + xt_compat_init_offsets(NFPROTO_BRIDGE, info->nentries); + return EBT_ENTRY_ITERATE(entries, size, compat_calc_entry, info, + entries, newinfo); +} + +static int compat_copy_everything_to_user(struct ebt_table *t, + void __user *user, int *len, int cmd) +{ + struct compat_ebt_replace repl, tmp; + struct ebt_counter *oldcounters; + struct ebt_table_info tinfo; + int ret; + void __user *pos; + + memset(&tinfo, 0, sizeof(tinfo)); + + if (cmd == EBT_SO_GET_ENTRIES) { + tinfo.entries_size = t->private->entries_size; + tinfo.nentries = t->private->nentries; + tinfo.entries = t->private->entries; + oldcounters = t->private->counters; + } else { + tinfo.entries_size = t->table->entries_size; + tinfo.nentries = t->table->nentries; + tinfo.entries = t->table->entries; + oldcounters = t->table->counters; + } + + if (copy_from_user(&tmp, user, sizeof(tmp))) + return -EFAULT; + + if (tmp.nentries != tinfo.nentries || + (tmp.num_counters && tmp.num_counters != tinfo.nentries)) + return -EINVAL; + + memcpy(&repl, &tmp, sizeof(repl)); + if (cmd == EBT_SO_GET_ENTRIES) + ret = compat_table_info(t->private, &repl); + else + ret = compat_table_info(&tinfo, &repl); + if (ret) + return ret; + + if (*len != sizeof(tmp) + repl.entries_size + + (tmp.num_counters? 
tinfo.nentries * sizeof(struct ebt_counter): 0)) { + pr_err("wrong size: *len %d, entries_size %u, replsz %d\n", + *len, tinfo.entries_size, repl.entries_size); + return -EINVAL; + } + + /* userspace might not need the counters */ + ret = copy_counters_to_user(t, oldcounters, compat_ptr(tmp.counters), + tmp.num_counters, tinfo.nentries); + if (ret) + return ret; + + pos = compat_ptr(tmp.entries); + return EBT_ENTRY_ITERATE(tinfo.entries, tinfo.entries_size, + compat_copy_entry_to_user, &pos, &tmp.entries_size); +} + +struct ebt_entries_buf_state { + char *buf_kern_start; /* kernel buffer to copy (translated) data to */ + u32 buf_kern_len; /* total size of kernel buffer */ + u32 buf_kern_offset; /* amount of data copied so far */ + u32 buf_user_offset; /* read position in userspace buffer */ +}; + +static int ebt_buf_count(struct ebt_entries_buf_state *state, unsigned int sz) +{ + state->buf_kern_offset += sz; + return state->buf_kern_offset >= sz ? 0 : -EINVAL; +} + +static int ebt_buf_add(struct ebt_entries_buf_state *state, + void *data, unsigned int sz) +{ + if (state->buf_kern_start == NULL) + goto count_only; + + BUG_ON(state->buf_kern_offset + sz > state->buf_kern_len); + + memcpy(state->buf_kern_start + state->buf_kern_offset, data, sz); + + count_only: + state->buf_user_offset += sz; + return ebt_buf_count(state, sz); +} + +static int ebt_buf_add_pad(struct ebt_entries_buf_state *state, unsigned int sz) +{ + char *b = state->buf_kern_start; + + BUG_ON(b && state->buf_kern_offset > state->buf_kern_len); + + if (b != NULL && sz > 0) + memset(b + state->buf_kern_offset, 0, sz); + /* do not adjust ->buf_user_offset here, we added kernel-side padding */ + return ebt_buf_count(state, sz); +} + +enum compat_mwt { + EBT_COMPAT_MATCH, + EBT_COMPAT_WATCHER, + EBT_COMPAT_TARGET, +}; + +static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, + enum compat_mwt compat_mwt, + struct ebt_entries_buf_state *state, + const unsigned char *base) +{ + char name[EBT_FUNCTION_MAXNAMELEN]; + struct xt_match *match; + struct xt_target *wt; + void *dst = NULL; + int off, pad = 0; + unsigned int size_kern, match_size = mwt->match_size; + + strlcpy(name, mwt->u.name, sizeof(name)); + + if (state->buf_kern_start) + dst = state->buf_kern_start + state->buf_kern_offset; + + switch (compat_mwt) { + case EBT_COMPAT_MATCH: + match = try_then_request_module(xt_find_match(NFPROTO_BRIDGE, + name, 0), "ebt_%s", name); + if (match == NULL) + return -ENOENT; + if (IS_ERR(match)) + return PTR_ERR(match); + + off = ebt_compat_match_offset(match, match_size); + if (dst) { + if (match->compat_from_user) + match->compat_from_user(dst, mwt->data); + else + memcpy(dst, mwt->data, match_size); + } + + size_kern = match->matchsize; + if (unlikely(size_kern == -1)) + size_kern = match_size; + module_put(match->me); + break; + case EBT_COMPAT_WATCHER: /* fallthrough */ + case EBT_COMPAT_TARGET: + wt = try_then_request_module(xt_find_target(NFPROTO_BRIDGE, + name, 0), "ebt_%s", name); + if (wt == NULL) + return -ENOENT; + if (IS_ERR(wt)) + return PTR_ERR(wt); + off = xt_compat_target_offset(wt); + + if (dst) { + if (wt->compat_from_user) + wt->compat_from_user(dst, mwt->data); + else + memcpy(dst, mwt->data, match_size); + } + + size_kern = wt->targetsize; + module_put(wt->me); + break; + + default: + return -EINVAL; + } + + state->buf_kern_offset += match_size + off; + state->buf_user_offset += match_size; + pad = XT_ALIGN(size_kern) - size_kern; + + if (pad > 0 && dst) { + BUG_ON(state->buf_kern_len <= pad); + 
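+ /* sanity check: the translated (kernel-sized) data plus its XT_ALIGN()
+  * padding must still fit in the kernel buffer; the padding is zeroed below */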
BUG_ON(state->buf_kern_offset - (match_size + off) + size_kern > state->buf_kern_len - pad); + memset(dst + size_kern, 0, pad); + } + return off + match_size; +} + +/* + * return size of all matches, watchers or target, including necessary + * alignment and padding. + */ +static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32, + unsigned int size_left, enum compat_mwt type, + struct ebt_entries_buf_state *state, const void *base) +{ + int growth = 0; + char *buf; + + if (size_left == 0) + return 0; + + buf = (char *) match32; + + while (size_left >= sizeof(*match32)) { + struct ebt_entry_match *match_kern; + int ret; + + match_kern = (struct ebt_entry_match *) state->buf_kern_start; + if (match_kern) { + char *tmp; + tmp = state->buf_kern_start + state->buf_kern_offset; + match_kern = (struct ebt_entry_match *) tmp; + } + ret = ebt_buf_add(state, buf, sizeof(*match32)); + if (ret < 0) + return ret; + size_left -= sizeof(*match32); + + /* add padding before match->data (if any) */ + ret = ebt_buf_add_pad(state, ebt_compat_entry_padsize()); + if (ret < 0) + return ret; + + if (match32->match_size > size_left) + return -EINVAL; + + size_left -= match32->match_size; + + ret = compat_mtw_from_user(match32, type, state, base); + if (ret < 0) + return ret; + + BUG_ON(ret < match32->match_size); + growth += ret - match32->match_size; + growth += ebt_compat_entry_padsize(); + + buf += sizeof(*match32); + buf += match32->match_size; + + if (match_kern) + match_kern->match_size = ret; + + WARN_ON(type == EBT_COMPAT_TARGET && size_left); + match32 = (struct compat_ebt_entry_mwt *) buf; + } + + return growth; +} + +/* called for all ebt_entry structures. */ +static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base, + unsigned int *total, + struct ebt_entries_buf_state *state) +{ + unsigned int i, j, startoff, new_offset = 0; + /* stores match/watchers/targets & offset of next struct ebt_entry: */ + unsigned int offsets[4]; + unsigned int *offsets_update = NULL; + int ret; + char *buf_start; + + if (*total < sizeof(struct ebt_entries)) + return -EINVAL; + + if (!entry->bitmask) { + *total -= sizeof(struct ebt_entries); + return ebt_buf_add(state, entry, sizeof(struct ebt_entries)); + } + if (*total < sizeof(*entry) || entry->next_offset < sizeof(*entry)) + return -EINVAL; + + startoff = state->buf_user_offset; + /* pull in most part of ebt_entry, it does not need to be changed. */ + ret = ebt_buf_add(state, entry, + offsetof(struct ebt_entry, watchers_offset)); + if (ret < 0) + return ret; + + offsets[0] = sizeof(struct ebt_entry); /* matches come first */ + memcpy(&offsets[1], &entry->watchers_offset, + sizeof(offsets) - sizeof(offsets[0])); + + if (state->buf_kern_start) { + buf_start = state->buf_kern_start + state->buf_kern_offset; + offsets_update = (unsigned int *) buf_start; + } + ret = ebt_buf_add(state, &offsets[1], + sizeof(offsets) - sizeof(offsets[0])); + if (ret < 0) + return ret; + buf_start = (char *) entry; + /* + * 0: matches offset, always follows ebt_entry. + * 1: watchers offset, from ebt_entry structure + * 2: target offset, from ebt_entry structure + * 3: next ebt_entry offset, from ebt_entry structure + * + * offsets are relative to beginning of struct ebt_entry (i.e., 0). 
+ */ + for (i = 0, j = 1 ; j < 4 ; j++, i++) { + struct compat_ebt_entry_mwt *match32; + unsigned int size; + char *buf = buf_start; + + buf = buf_start + offsets[i]; + if (offsets[i] > offsets[j]) + return -EINVAL; + + match32 = (struct compat_ebt_entry_mwt *) buf; + size = offsets[j] - offsets[i]; + ret = ebt_size_mwt(match32, size, i, state, base); + if (ret < 0) + return ret; + new_offset += ret; + if (offsets_update && new_offset) { + pr_debug("change offset %d to %d\n", + offsets_update[i], offsets[j] + new_offset); + offsets_update[i] = offsets[j] + new_offset; + } + } + + if (state->buf_kern_start == NULL) { + unsigned int offset = buf_start - (char *) base; + + ret = xt_compat_add_offset(NFPROTO_BRIDGE, offset, new_offset); + if (ret < 0) + return ret; + } + + startoff = state->buf_user_offset - startoff; + + BUG_ON(*total < startoff); + *total -= startoff; + return 0; +} + +/* + * repl->entries_size is the size of the ebt_entry blob in userspace. + * It might need more memory when copied to a 64 bit kernel in case + * userspace is 32-bit. So, first task: find out how much memory is needed. + * + * Called before validation is performed. + */ +static int compat_copy_entries(unsigned char *data, unsigned int size_user, + struct ebt_entries_buf_state *state) +{ + unsigned int size_remaining = size_user; + int ret; + + ret = EBT_ENTRY_ITERATE(data, size_user, size_entry_mwt, data, + &size_remaining, state); + if (ret < 0) + return ret; + + WARN_ON(size_remaining); + return state->buf_kern_offset; +} + + +static int compat_copy_ebt_replace_from_user(struct ebt_replace *repl, + void __user *user, unsigned int len) +{ + struct compat_ebt_replace tmp; + int i; + + if (len < sizeof(tmp)) + return -EINVAL; + + if (copy_from_user(&tmp, user, sizeof(tmp))) + return -EFAULT; + + if (len != sizeof(tmp) + tmp.entries_size) + return -EINVAL; + + if (tmp.entries_size == 0) + return -EINVAL; + + if (tmp.nentries >= ((INT_MAX - sizeof(struct ebt_table_info)) / + NR_CPUS - SMP_CACHE_BYTES) / sizeof(struct ebt_counter)) + return -ENOMEM; + if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter)) + return -ENOMEM; + + memcpy(repl, &tmp, offsetof(struct ebt_replace, hook_entry)); + + /* starting with hook_entry, 32 vs. 
64 bit structures are different */ + for (i = 0; i < NF_BR_NUMHOOKS; i++) + repl->hook_entry[i] = compat_ptr(tmp.hook_entry[i]); + + repl->num_counters = tmp.num_counters; + repl->counters = compat_ptr(tmp.counters); + repl->entries = compat_ptr(tmp.entries); + return 0; +} + +static int compat_do_replace(struct net *net, void __user *user, + unsigned int len) +{ + int ret, i, countersize, size64; + struct ebt_table_info *newinfo; + struct ebt_replace tmp; + struct ebt_entries_buf_state state; + void *entries_tmp; + + ret = compat_copy_ebt_replace_from_user(&tmp, user, len); + if (ret) { + /* try real handler in case userland supplied needed padding */ + if (ret == -EINVAL && do_replace(net, user, len) == 0) + ret = 0; + return ret; + } + + countersize = COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids; + newinfo = vmalloc(sizeof(*newinfo) + countersize); + if (!newinfo) + return -ENOMEM; + + if (countersize) + memset(newinfo->counters, 0, countersize); + + memset(&state, 0, sizeof(state)); + + newinfo->entries = vmalloc(tmp.entries_size); + if (!newinfo->entries) { + ret = -ENOMEM; + goto free_newinfo; + } + if (copy_from_user( + newinfo->entries, tmp.entries, tmp.entries_size) != 0) { + ret = -EFAULT; + goto free_entries; + } + + entries_tmp = newinfo->entries; + + xt_compat_lock(NFPROTO_BRIDGE); + + xt_compat_init_offsets(NFPROTO_BRIDGE, tmp.nentries); + ret = compat_copy_entries(entries_tmp, tmp.entries_size, &state); + if (ret < 0) + goto out_unlock; + + pr_debug("tmp.entries_size %d, kern off %d, user off %d delta %d\n", + tmp.entries_size, state.buf_kern_offset, state.buf_user_offset, + xt_compat_calc_jump(NFPROTO_BRIDGE, tmp.entries_size)); + + size64 = ret; + newinfo->entries = vmalloc(size64); + if (!newinfo->entries) { + vfree(entries_tmp); + ret = -ENOMEM; + goto out_unlock; + } + + memset(&state, 0, sizeof(state)); + state.buf_kern_start = newinfo->entries; + state.buf_kern_len = size64; + + ret = compat_copy_entries(entries_tmp, tmp.entries_size, &state); + BUG_ON(ret < 0); /* parses same data again */ + + vfree(entries_tmp); + tmp.entries_size = size64; + + for (i = 0; i < NF_BR_NUMHOOKS; i++) { + char __user *usrptr; + if (tmp.hook_entry[i]) { + unsigned int delta; + usrptr = (char __user *) tmp.hook_entry[i]; + delta = usrptr - tmp.entries; + usrptr += xt_compat_calc_jump(NFPROTO_BRIDGE, delta); + tmp.hook_entry[i] = (struct ebt_entries __user *)usrptr; + } + } + + xt_compat_flush_offsets(NFPROTO_BRIDGE); + xt_compat_unlock(NFPROTO_BRIDGE); + + ret = do_replace_finish(net, &tmp, newinfo); + if (ret == 0) + return ret; +free_entries: + vfree(newinfo->entries); +free_newinfo: + vfree(newinfo); + return ret; +out_unlock: + xt_compat_flush_offsets(NFPROTO_BRIDGE); + xt_compat_unlock(NFPROTO_BRIDGE); + goto free_entries; +} + +static int compat_update_counters(struct net *net, void __user *user, + unsigned int len) +{ + struct compat_ebt_replace hlp; + + if (copy_from_user(&hlp, user, sizeof(hlp))) + return -EFAULT; + + /* try real handler in case userland supplied needed padding */ + if (len != sizeof(hlp) + hlp.num_counters * sizeof(struct ebt_counter)) + return update_counters(net, user, len); + + return do_update_counters(net, hlp.name, compat_ptr(hlp.counters), + hlp.num_counters, user, len); +} + +static int compat_do_ebt_set_ctl(struct sock *sk, + int cmd, void __user *user, unsigned int len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case EBT_SO_SET_ENTRIES: + ret = compat_do_replace(sock_net(sk), user, len); + break; + case 
EBT_SO_SET_COUNTERS: + ret = compat_update_counters(sock_net(sk), user, len); + break; + default: + ret = -EINVAL; + } + return ret; +} + +static int compat_do_ebt_get_ctl(struct sock *sk, int cmd, + void __user *user, int *len) +{ + int ret; + struct compat_ebt_replace tmp; + struct ebt_table *t; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + /* try real handler in case userland supplied needed padding */ + if ((cmd == EBT_SO_GET_INFO || + cmd == EBT_SO_GET_INIT_INFO) && *len != sizeof(tmp)) + return do_ebt_get_ctl(sk, cmd, user, len); + + if (copy_from_user(&tmp, user, sizeof(tmp))) + return -EFAULT; + + t = find_table_lock(sock_net(sk), tmp.name, &ret, &ebt_mutex); + if (!t) + return ret; + + xt_compat_lock(NFPROTO_BRIDGE); + switch (cmd) { + case EBT_SO_GET_INFO: + tmp.nentries = t->private->nentries; + ret = compat_table_info(t->private, &tmp); + if (ret) + goto out; + tmp.valid_hooks = t->valid_hooks; + + if (copy_to_user(user, &tmp, *len) != 0) { + ret = -EFAULT; + break; + } + ret = 0; + break; + case EBT_SO_GET_INIT_INFO: + tmp.nentries = t->table->nentries; + tmp.entries_size = t->table->entries_size; + tmp.valid_hooks = t->table->valid_hooks; + + if (copy_to_user(user, &tmp, *len) != 0) { + ret = -EFAULT; + break; + } + ret = 0; + break; + case EBT_SO_GET_ENTRIES: + case EBT_SO_GET_INIT_ENTRIES: + /* + * try real handler first in case of userland-side padding. + * in case we are dealing with an 'ordinary' 32 bit binary + * without 64bit compatibility padding, this will fail right + * after copy_from_user when the *len argument is validated. + * + * the compat_ variant needs to do one pass over the kernel + * data set to adjust for size differences before it the check. + */ + if (copy_everything_to_user(t, user, len, cmd) == 0) + ret = 0; + else + ret = compat_copy_everything_to_user(t, user, len, cmd); + break; + default: + ret = -EINVAL; + } + out: + xt_compat_flush_offsets(NFPROTO_BRIDGE); + xt_compat_unlock(NFPROTO_BRIDGE); + mutex_unlock(&ebt_mutex); + return ret; +} +#endif + +static struct nf_sockopt_ops ebt_sockopts = +{ + .pf = PF_INET, + .set_optmin = EBT_BASE_CTL, + .set_optmax = EBT_SO_SET_MAX + 1, + .set = do_ebt_set_ctl, +#ifdef CONFIG_COMPAT + .compat_set = compat_do_ebt_set_ctl, +#endif + .get_optmin = EBT_BASE_CTL, + .get_optmax = EBT_SO_GET_MAX + 1, + .get = do_ebt_get_ctl, +#ifdef CONFIG_COMPAT + .compat_get = compat_do_ebt_get_ctl, +#endif + .owner = THIS_MODULE, +}; + +static int __init ebtables_init(void) +{ + int ret; + + ret = xt_register_target(&ebt_standard_target); + if (ret < 0) + return ret; + ret = nf_register_sockopt(&ebt_sockopts); + if (ret < 0) { + xt_unregister_target(&ebt_standard_target); + return ret; + } + + printk(KERN_INFO "Ebtables v2.0 registered\n"); + return 0; +} + +static void __exit ebtables_fini(void) +{ + nf_unregister_sockopt(&ebt_sockopts); + xt_unregister_target(&ebt_standard_target); + printk(KERN_INFO "Ebtables v2.0 unregistered\n"); +} + +EXPORT_SYMBOL(ebt_register_table); +EXPORT_SYMBOL(ebt_unregister_table); +EXPORT_SYMBOL(ebt_do_table); +module_init(ebtables_init); +module_exit(ebtables_fini); +MODULE_LICENSE("GPL"); diff --git a/net/caif/Kconfig b/net/caif/Kconfig new file mode 100644 index 00000000..529750da --- /dev/null +++ b/net/caif/Kconfig @@ -0,0 +1,42 @@ +# +# CAIF net configurations +# + +menuconfig CAIF + tristate "CAIF support" + select CRC_CCITT + default n + ---help--- + The "Communication CPU to Application CPU Interface" (CAIF) is a packet + based connection-oriented MUX protocol developed by 
ST-Ericsson for use + with its modems. It is accessed from user space as sockets (PF_CAIF). + + Say Y (or M) here if you build for a phone product (e.g. Android or + MeeGo ) that uses CAIF as transport, if unsure say N. + + If you select to build it as module then CAIF_NETDEV also needs to be + built as modules. You will also need to say yes to any CAIF physical + devices that your platform requires. + + See Documentation/networking/caif for a further explanation on how to + use and configure CAIF. + +config CAIF_DEBUG + bool "Enable Debug" + depends on CAIF + default n + --- help --- + Enable the inclusion of debug code in the CAIF stack. + Be aware that doing this will impact performance. + If unsure say N. + +config CAIF_NETDEV + tristate "CAIF GPRS Network device" + depends on CAIF + default CAIF + ---help--- + Say Y if you will be using a CAIF based GPRS network device. + This can be either built-in or a loadable module, + If you select to build it as a built-in then the main CAIF device must + also be a built-in. + If unsure say Y. diff --git a/net/caif/Makefile b/net/caif/Makefile new file mode 100644 index 00000000..ebcd4e7e --- /dev/null +++ b/net/caif/Makefile @@ -0,0 +1,14 @@ +ccflags-$(CONFIG_CAIF_DEBUG) := -DDEBUG + +caif-y := caif_dev.o \ + cfcnfg.o cfmuxl.o cfctrl.o \ + cffrml.o cfveil.o cfdbgl.o\ + cfserl.o cfdgml.o \ + cfrfml.o cfvidl.o cfutill.o \ + cfsrvl.o cfpkt_skbuff.o + +obj-$(CONFIG_CAIF) += caif.o +obj-$(CONFIG_CAIF_NETDEV) += chnl_net.o +obj-$(CONFIG_CAIF) += caif_socket.o + +export-y := caif.o diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c new file mode 100644 index 00000000..5ba4366a --- /dev/null +++ b/net/caif/caif_dev.c @@ -0,0 +1,433 @@ +/* + * CAIF Interface registration. + * Copyright (C) ST-Ericsson AB 2010 + * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * License terms: GNU General Public License (GPL) version 2 + * + * Borrowed heavily from file: pn_dev.c. Thanks to + * Remi Denis-Courmont + * and Sakari Ailus + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); + +/* Used for local tracking of the CAIF net devices */ +struct caif_device_entry { + struct cflayer layer; + struct list_head list; + struct net_device *netdev; + int __percpu *pcpu_refcnt; +}; + +struct caif_device_entry_list { + struct list_head list; + /* Protects simulanous deletes in list */ + struct mutex lock; +}; + +struct caif_net { + struct cfcnfg *cfg; + struct caif_device_entry_list caifdevs; +}; + +static int caif_net_id; + +struct cfcnfg *get_cfcnfg(struct net *net) +{ + struct caif_net *caifn; + BUG_ON(!net); + caifn = net_generic(net, caif_net_id); + return caifn->cfg; +} +EXPORT_SYMBOL(get_cfcnfg); + +static struct caif_device_entry_list *caif_device_list(struct net *net) +{ + struct caif_net *caifn; + BUG_ON(!net); + caifn = net_generic(net, caif_net_id); + return &caifn->caifdevs; +} + +static void caifd_put(struct caif_device_entry *e) +{ + irqsafe_cpu_dec(*e->pcpu_refcnt); +} + +static void caifd_hold(struct caif_device_entry *e) +{ + irqsafe_cpu_inc(*e->pcpu_refcnt); +} + +static int caifd_refcnt_read(struct caif_device_entry *e) +{ + int i, refcnt = 0; + for_each_possible_cpu(i) + refcnt += *per_cpu_ptr(e->pcpu_refcnt, i); + return refcnt; +} + +/* Allocate new CAIF device. 
*/ +static struct caif_device_entry *caif_device_alloc(struct net_device *dev) +{ + struct caif_device_entry_list *caifdevs; + struct caif_device_entry *caifd; + + caifdevs = caif_device_list(dev_net(dev)); + + caifd = kzalloc(sizeof(*caifd), GFP_ATOMIC); + if (!caifd) + return NULL; + caifd->pcpu_refcnt = alloc_percpu(int); + caifd->netdev = dev; + dev_hold(dev); + return caifd; +} + +static struct caif_device_entry *caif_get(struct net_device *dev) +{ + struct caif_device_entry_list *caifdevs = + caif_device_list(dev_net(dev)); + struct caif_device_entry *caifd; + + list_for_each_entry_rcu(caifd, &caifdevs->list, list) { + if (caifd->netdev == dev) + return caifd; + } + return NULL; +} + +static int transmit(struct cflayer *layer, struct cfpkt *pkt) +{ + int err; + struct caif_device_entry *caifd = + container_of(layer, struct caif_device_entry, layer); + struct sk_buff *skb; + + skb = cfpkt_tonative(pkt); + skb->dev = caifd->netdev; + + err = dev_queue_xmit(skb); + if (err > 0) + err = -EIO; + + return err; +} + +/* + * Stuff received packets into the CAIF stack. + * On error, returns non-zero and releases the skb. + */ +static int receive(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pkttype, struct net_device *orig_dev) +{ + struct cfpkt *pkt; + struct caif_device_entry *caifd; + int err; + + pkt = cfpkt_fromnative(CAIF_DIR_IN, skb); + + rcu_read_lock(); + caifd = caif_get(dev); + + if (!caifd || !caifd->layer.up || !caifd->layer.up->receive || + !netif_oper_up(caifd->netdev)) { + rcu_read_unlock(); + kfree_skb(skb); + return NET_RX_DROP; + } + + /* Hold reference to netdevice while using CAIF stack */ + caifd_hold(caifd); + rcu_read_unlock(); + + err = caifd->layer.up->receive(caifd->layer.up, pkt); + + /* For -EILSEQ the packet is not freed so so it now */ + if (err == -EILSEQ) + cfpkt_destroy(pkt); + + /* Release reference to stack upwards */ + caifd_put(caifd); + return 0; +} + +static struct packet_type caif_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_CAIF), + .func = receive, +}; + +static void dev_flowctrl(struct net_device *dev, int on) +{ + struct caif_device_entry *caifd; + + rcu_read_lock(); + + caifd = caif_get(dev); + if (!caifd || !caifd->layer.up || !caifd->layer.up->ctrlcmd) { + rcu_read_unlock(); + return; + } + + caifd_hold(caifd); + rcu_read_unlock(); + + caifd->layer.up->ctrlcmd(caifd->layer.up, + on ? 
+ _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND : + _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND, + caifd->layer.id); + caifd_put(caifd); +} + +/* notify Caif of device events */ +static int caif_device_notify(struct notifier_block *me, unsigned long what, + void *arg) +{ + struct net_device *dev = arg; + struct caif_device_entry *caifd = NULL; + struct caif_dev_common *caifdev; + enum cfcnfg_phy_preference pref; + enum cfcnfg_phy_type phy_type; + struct cfcnfg *cfg; + struct caif_device_entry_list *caifdevs; + + if (dev->type != ARPHRD_CAIF) + return 0; + + cfg = get_cfcnfg(dev_net(dev)); + if (cfg == NULL) + return 0; + + caifdevs = caif_device_list(dev_net(dev)); + + switch (what) { + case NETDEV_REGISTER: + caifd = caif_device_alloc(dev); + if (!caifd) + return 0; + + caifdev = netdev_priv(dev); + caifdev->flowctrl = dev_flowctrl; + + caifd->layer.transmit = transmit; + + if (caifdev->use_frag) + phy_type = CFPHYTYPE_FRAG; + else + phy_type = CFPHYTYPE_CAIF; + + switch (caifdev->link_select) { + case CAIF_LINK_HIGH_BANDW: + pref = CFPHYPREF_HIGH_BW; + break; + case CAIF_LINK_LOW_LATENCY: + pref = CFPHYPREF_LOW_LAT; + break; + default: + pref = CFPHYPREF_HIGH_BW; + break; + } + strncpy(caifd->layer.name, dev->name, + sizeof(caifd->layer.name) - 1); + caifd->layer.name[sizeof(caifd->layer.name) - 1] = 0; + + mutex_lock(&caifdevs->lock); + list_add_rcu(&caifd->list, &caifdevs->list); + + cfcnfg_add_phy_layer(cfg, + phy_type, + dev, + &caifd->layer, + pref, + caifdev->use_fcs, + caifdev->use_stx); + mutex_unlock(&caifdevs->lock); + break; + + case NETDEV_UP: + rcu_read_lock(); + + caifd = caif_get(dev); + if (caifd == NULL) { + rcu_read_unlock(); + break; + } + + cfcnfg_set_phy_state(cfg, &caifd->layer, true); + rcu_read_unlock(); + + break; + + case NETDEV_DOWN: + rcu_read_lock(); + + caifd = caif_get(dev); + if (!caifd || !caifd->layer.up || !caifd->layer.up->ctrlcmd) { + rcu_read_unlock(); + return -EINVAL; + } + + cfcnfg_set_phy_state(cfg, &caifd->layer, false); + caifd_hold(caifd); + rcu_read_unlock(); + + caifd->layer.up->ctrlcmd(caifd->layer.up, + _CAIF_CTRLCMD_PHYIF_DOWN_IND, + caifd->layer.id); + caifd_put(caifd); + break; + + case NETDEV_UNREGISTER: + mutex_lock(&caifdevs->lock); + + caifd = caif_get(dev); + if (caifd == NULL) { + mutex_unlock(&caifdevs->lock); + break; + } + list_del_rcu(&caifd->list); + + /* + * NETDEV_UNREGISTER is called repeatedly until all reference + * counts for the net-device are released. If references to + * caifd is taken, simply ignore NETDEV_UNREGISTER and wait for + * the next call to NETDEV_UNREGISTER. + * + * If any packets are in flight down the CAIF Stack, + * cfcnfg_del_phy_layer will return nonzero. + * If no packets are in flight, the CAIF Stack associated + * with the net-device un-registering is freed. 
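 *
 * The NETDEV_REGISTER case above, in turn, assumes that the link-layer
 * driver has set dev->type to ARPHRD_CAIF and embedded struct
 * caif_dev_common at the start of its netdev private area, roughly as in
 * this hedged sketch (the function name and the chosen flag values are
 * illustrative only):
 *
 *	static void example_caif_setup(struct net_device *dev)
 *	{
 *		struct caif_dev_common *common = netdev_priv(dev);
 *
 *		dev->type = ARPHRD_CAIF;
 *		common->use_frag = false;
 *		common->use_fcs = true;
 *		common->use_stx = true;
 *		common->link_select = CAIF_LINK_LOW_LATENCY;
 *	}
 *
 * common->flowctrl is then filled in by caif_device_notify() itself.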
+ */ + + if (caifd_refcnt_read(caifd) != 0 || + cfcnfg_del_phy_layer(cfg, &caifd->layer) != 0) { + + pr_info("Wait for device inuse\n"); + /* Enrole device if CAIF Stack is still in use */ + list_add_rcu(&caifd->list, &caifdevs->list); + mutex_unlock(&caifdevs->lock); + break; + } + + synchronize_rcu(); + dev_put(caifd->netdev); + free_percpu(caifd->pcpu_refcnt); + kfree(caifd); + + mutex_unlock(&caifdevs->lock); + break; + } + return 0; +} + +static struct notifier_block caif_device_notifier = { + .notifier_call = caif_device_notify, + .priority = 0, +}; + +/* Per-namespace Caif devices handling */ +static int caif_init_net(struct net *net) +{ + struct caif_net *caifn = net_generic(net, caif_net_id); + + INIT_LIST_HEAD(&caifn->caifdevs.list); + mutex_init(&caifn->caifdevs.lock); + + caifn->cfg = cfcnfg_create(); + if (!caifn->cfg) { + pr_warn("can't create cfcnfg\n"); + return -ENOMEM; + } + + return 0; +} + +static void caif_exit_net(struct net *net) +{ + struct caif_device_entry *caifd, *tmp; + struct caif_device_entry_list *caifdevs = + caif_device_list(net); + struct cfcnfg *cfg; + + rtnl_lock(); + mutex_lock(&caifdevs->lock); + + cfg = get_cfcnfg(net); + if (cfg == NULL) { + mutex_unlock(&caifdevs->lock); + return; + } + + list_for_each_entry_safe(caifd, tmp, &caifdevs->list, list) { + int i = 0; + list_del_rcu(&caifd->list); + cfcnfg_set_phy_state(cfg, &caifd->layer, false); + + while (i < 10 && + (caifd_refcnt_read(caifd) != 0 || + cfcnfg_del_phy_layer(cfg, &caifd->layer) != 0)) { + + pr_info("Wait for device inuse\n"); + msleep(250); + i++; + } + synchronize_rcu(); + dev_put(caifd->netdev); + free_percpu(caifd->pcpu_refcnt); + kfree(caifd); + } + cfcnfg_remove(cfg); + + mutex_unlock(&caifdevs->lock); + rtnl_unlock(); +} + +static struct pernet_operations caif_net_ops = { + .init = caif_init_net, + .exit = caif_exit_net, + .id = &caif_net_id, + .size = sizeof(struct caif_net), +}; + +/* Initialize Caif devices list */ +static int __init caif_device_init(void) +{ + int result; + + result = register_pernet_subsys(&caif_net_ops); + + if (result) + return result; + + register_netdevice_notifier(&caif_device_notifier); + dev_add_pack(&caif_packet_type); + + return result; +} + +static void __exit caif_device_exit(void) +{ + unregister_pernet_subsys(&caif_net_ops); + unregister_netdevice_notifier(&caif_device_notifier); + dev_remove_pack(&caif_packet_type); +} + +module_init(caif_device_init); +module_exit(caif_device_exit); diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c new file mode 100644 index 00000000..a9862808 --- /dev/null +++ b/net/caif/caif_socket.c @@ -0,0 +1,1220 @@ +/* + * Copyright (C) ST-Ericsson AB 2010 + * Author: Sjur Brendeland sjur.brandeland@stericsson.com + * License terms: GNU General Public License (GPL) version 2 + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(AF_CAIF); + +/* + * CAIF state is re-using the TCP socket states. + * caif_states stored in sk_state reflect the state as reported by + * the CAIF stack, while sk_socket->state is the state of the socket. 
+ */ +enum caif_states { + CAIF_CONNECTED = TCP_ESTABLISHED, + CAIF_CONNECTING = TCP_SYN_SENT, + CAIF_DISCONNECTED = TCP_CLOSE +}; + +#define TX_FLOW_ON_BIT 1 +#define RX_FLOW_ON_BIT 2 + +static struct dentry *debugfsdir; + +#ifdef CONFIG_DEBUG_FS +struct debug_fs_counter { + atomic_t caif_nr_socks; + atomic_t caif_sock_create; + atomic_t num_connect_req; + atomic_t num_connect_resp; + atomic_t num_connect_fail_resp; + atomic_t num_disconnect; + atomic_t num_remote_shutdown_ind; + atomic_t num_tx_flow_off_ind; + atomic_t num_tx_flow_on_ind; + atomic_t num_rx_flow_off; + atomic_t num_rx_flow_on; +}; +static struct debug_fs_counter cnt; +#define dbfs_atomic_inc(v) atomic_inc_return(v) +#define dbfs_atomic_dec(v) atomic_dec_return(v) +#else +#define dbfs_atomic_inc(v) 0 +#define dbfs_atomic_dec(v) 0 +#endif + +struct caifsock { + struct sock sk; /* must be first member */ + struct cflayer layer; + char name[CAIF_LAYER_NAME_SZ]; /* Used for debugging */ + u32 flow_state; + struct caif_connect_request conn_req; + struct mutex readlock; + struct dentry *debugfs_socket_dir; + int headroom, tailroom, maxframe; +}; + +static int rx_flow_is_on(struct caifsock *cf_sk) +{ + return test_bit(RX_FLOW_ON_BIT, + (void *) &cf_sk->flow_state); +} + +static int tx_flow_is_on(struct caifsock *cf_sk) +{ + return test_bit(TX_FLOW_ON_BIT, + (void *) &cf_sk->flow_state); +} + +static void set_rx_flow_off(struct caifsock *cf_sk) +{ + clear_bit(RX_FLOW_ON_BIT, + (void *) &cf_sk->flow_state); +} + +static void set_rx_flow_on(struct caifsock *cf_sk) +{ + set_bit(RX_FLOW_ON_BIT, + (void *) &cf_sk->flow_state); +} + +static void set_tx_flow_off(struct caifsock *cf_sk) +{ + clear_bit(TX_FLOW_ON_BIT, + (void *) &cf_sk->flow_state); +} + +static void set_tx_flow_on(struct caifsock *cf_sk) +{ + set_bit(TX_FLOW_ON_BIT, + (void *) &cf_sk->flow_state); +} + +static void caif_read_lock(struct sock *sk) +{ + struct caifsock *cf_sk; + cf_sk = container_of(sk, struct caifsock, sk); + mutex_lock(&cf_sk->readlock); +} + +static void caif_read_unlock(struct sock *sk) +{ + struct caifsock *cf_sk; + cf_sk = container_of(sk, struct caifsock, sk); + mutex_unlock(&cf_sk->readlock); +} + +static int sk_rcvbuf_lowwater(struct caifsock *cf_sk) +{ + /* A quarter of full buffer is used a low water mark */ + return cf_sk->sk.sk_rcvbuf / 4; +} + +static void caif_flow_ctrl(struct sock *sk, int mode) +{ + struct caifsock *cf_sk; + cf_sk = container_of(sk, struct caifsock, sk); + if (cf_sk->layer.dn && cf_sk->layer.dn->modemcmd) + cf_sk->layer.dn->modemcmd(cf_sk->layer.dn, mode); +} + +/* + * Copied from sock.c:sock_queue_rcv_skb(), but changed so packets are + * not dropped, but CAIF is sending flow off instead. 
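 *
 * As a numeric illustration (the buffer size is only an example), with
 * sk_rcvbuf == 64 KiB:
 *
 *	- flow OFF is requested once the queued truesize would reach the
 *	  full 64 KiB (sk_rmem_alloc + skb->truesize >= sk_rcvbuf);
 *	- flow ON is requested again by caif_check_flow_release() once the
 *	  reader drains the queue below sk_rcvbuf_lowwater(), i.e.
 *	  sk_rcvbuf / 4 == 16 KiB.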
+ */ +static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + int err; + int skb_len; + unsigned long flags; + struct sk_buff_head *list = &sk->sk_receive_queue; + struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); + + if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= + (unsigned)sk->sk_rcvbuf && rx_flow_is_on(cf_sk)) { + if (net_ratelimit()) + pr_debug("sending flow OFF (queue len = %d %d)\n", + atomic_read(&cf_sk->sk.sk_rmem_alloc), + sk_rcvbuf_lowwater(cf_sk)); + set_rx_flow_off(cf_sk); + dbfs_atomic_inc(&cnt.num_rx_flow_off); + caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ); + } + + err = sk_filter(sk, skb); + if (err) + return err; + if (!sk_rmem_schedule(sk, skb->truesize) && rx_flow_is_on(cf_sk)) { + set_rx_flow_off(cf_sk); + if (net_ratelimit()) + pr_debug("sending flow OFF due to rmem_schedule\n"); + dbfs_atomic_inc(&cnt.num_rx_flow_off); + caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ); + } + skb->dev = NULL; + skb_set_owner_r(skb, sk); + /* Cache the SKB length before we tack it onto the receive + * queue. Once it is added it no longer belongs to us and + * may be freed by other threads of control pulling packets + * from the queue. + */ + skb_len = skb->len; + spin_lock_irqsave(&list->lock, flags); + if (!sock_flag(sk, SOCK_DEAD)) + __skb_queue_tail(list, skb); + spin_unlock_irqrestore(&list->lock, flags); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, skb_len); + else + kfree_skb(skb); + return 0; +} + +/* Packet Receive Callback function called from CAIF Stack */ +static int caif_sktrecv_cb(struct cflayer *layr, struct cfpkt *pkt) +{ + struct caifsock *cf_sk; + struct sk_buff *skb; + + cf_sk = container_of(layr, struct caifsock, layer); + skb = cfpkt_tonative(pkt); + + if (unlikely(cf_sk->sk.sk_state != CAIF_CONNECTED)) { + kfree_skb(skb); + return 0; + } + caif_queue_rcv_skb(&cf_sk->sk, skb); + return 0; +} + +static void cfsk_hold(struct cflayer *layr) +{ + struct caifsock *cf_sk = container_of(layr, struct caifsock, layer); + sock_hold(&cf_sk->sk); +} + +static void cfsk_put(struct cflayer *layr) +{ + struct caifsock *cf_sk = container_of(layr, struct caifsock, layer); + sock_put(&cf_sk->sk); +} + +/* Packet Control Callback function called from CAIF */ +static void caif_ctrl_cb(struct cflayer *layr, + enum caif_ctrlcmd flow, + int phyid) +{ + struct caifsock *cf_sk = container_of(layr, struct caifsock, layer); + switch (flow) { + case CAIF_CTRLCMD_FLOW_ON_IND: + /* OK from modem to start sending again */ + dbfs_atomic_inc(&cnt.num_tx_flow_on_ind); + set_tx_flow_on(cf_sk); + cf_sk->sk.sk_state_change(&cf_sk->sk); + break; + + case CAIF_CTRLCMD_FLOW_OFF_IND: + /* Modem asks us to shut up */ + dbfs_atomic_inc(&cnt.num_tx_flow_off_ind); + set_tx_flow_off(cf_sk); + cf_sk->sk.sk_state_change(&cf_sk->sk); + break; + + case CAIF_CTRLCMD_INIT_RSP: + /* We're now connected */ + caif_client_register_refcnt(&cf_sk->layer, + cfsk_hold, cfsk_put); + dbfs_atomic_inc(&cnt.num_connect_resp); + cf_sk->sk.sk_state = CAIF_CONNECTED; + set_tx_flow_on(cf_sk); + cf_sk->sk.sk_state_change(&cf_sk->sk); + break; + + case CAIF_CTRLCMD_DEINIT_RSP: + /* We're now disconnected */ + cf_sk->sk.sk_state = CAIF_DISCONNECTED; + cf_sk->sk.sk_state_change(&cf_sk->sk); + break; + + case CAIF_CTRLCMD_INIT_FAIL_RSP: + /* Connect request failed */ + dbfs_atomic_inc(&cnt.num_connect_fail_resp); + cf_sk->sk.sk_err = ECONNREFUSED; + cf_sk->sk.sk_state = CAIF_DISCONNECTED; + cf_sk->sk.sk_shutdown = SHUTDOWN_MASK; + /* + * Socket "standards" seems to require POLLOUT to + 
* be set at connect failure. + */ + set_tx_flow_on(cf_sk); + cf_sk->sk.sk_state_change(&cf_sk->sk); + break; + + case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND: + /* Modem has closed this connection, or device is down. */ + dbfs_atomic_inc(&cnt.num_remote_shutdown_ind); + cf_sk->sk.sk_shutdown = SHUTDOWN_MASK; + cf_sk->sk.sk_err = ECONNRESET; + set_rx_flow_on(cf_sk); + cf_sk->sk.sk_error_report(&cf_sk->sk); + break; + + default: + pr_debug("Unexpected flow command %d\n", flow); + } +} + +static void caif_check_flow_release(struct sock *sk) +{ + struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); + + if (rx_flow_is_on(cf_sk)) + return; + + if (atomic_read(&sk->sk_rmem_alloc) <= sk_rcvbuf_lowwater(cf_sk)) { + dbfs_atomic_inc(&cnt.num_rx_flow_on); + set_rx_flow_on(cf_sk); + caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_ON_REQ); + } +} + +/* + * Copied from unix_dgram_recvmsg, but removed credit checks, + * changed locking, address handling and added MSG_TRUNC. + */ +static int caif_seqpkt_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t len, int flags) + +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + int ret; + int copylen; + + ret = -EOPNOTSUPP; + if (m->msg_flags&MSG_OOB) + goto read_error; + + skb = skb_recv_datagram(sk, flags, 0 , &ret); + if (!skb) + goto read_error; + copylen = skb->len; + if (len < copylen) { + m->msg_flags |= MSG_TRUNC; + copylen = len; + } + + ret = skb_copy_datagram_iovec(skb, 0, m->msg_iov, copylen); + if (ret) + goto out_free; + + ret = (flags & MSG_TRUNC) ? skb->len : copylen; +out_free: + skb_free_datagram(sk, skb); + caif_check_flow_release(sk); + return ret; + +read_error: + return ret; +} + + +/* Copied from unix_stream_wait_data, identical except for lock call. */ +static long caif_stream_data_wait(struct sock *sk, long timeo) +{ + DEFINE_WAIT(wait); + lock_sock(sk); + + for (;;) { + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + + if (!skb_queue_empty(&sk->sk_receive_queue) || + sk->sk_err || + sk->sk_state != CAIF_CONNECTED || + sock_flag(sk, SOCK_DEAD) || + (sk->sk_shutdown & RCV_SHUTDOWN) || + signal_pending(current) || + !timeo) + break; + + set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + release_sock(sk); + timeo = schedule_timeout(timeo); + lock_sock(sk); + clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + } + + finish_wait(sk_sleep(sk), &wait); + release_sock(sk); + return timeo; +} + + +/* + * Copied from unix_stream_recvmsg, but removed credit checks, + * changed locking calls, changed address handling. + */ +static int caif_stream_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, + int flags) +{ + struct sock *sk = sock->sk; + int copied = 0; + int target; + int err = 0; + long timeo; + + err = -EOPNOTSUPP; + if (flags&MSG_OOB) + goto out; + + msg->msg_namelen = 0; + + /* + * Lock the socket to prevent queue disordering + * while sleeps in memcpy_tomsg + */ + err = -EAGAIN; + if (sk->sk_state == CAIF_CONNECTING) + goto out; + + caif_read_lock(sk); + target = sock_rcvlowat(sk, flags&MSG_WAITALL, size); + timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT); + + do { + int chunk; + struct sk_buff *skb; + + lock_sock(sk); + skb = skb_dequeue(&sk->sk_receive_queue); + caif_check_flow_release(sk); + + if (skb == NULL) { + if (copied >= target) + goto unlock; + /* + * POSIX 1003.1g mandates this order. 
+ */ + err = sock_error(sk); + if (err) + goto unlock; + err = -ECONNRESET; + if (sk->sk_shutdown & RCV_SHUTDOWN) + goto unlock; + + err = -EPIPE; + if (sk->sk_state != CAIF_CONNECTED) + goto unlock; + if (sock_flag(sk, SOCK_DEAD)) + goto unlock; + + release_sock(sk); + + err = -EAGAIN; + if (!timeo) + break; + + caif_read_unlock(sk); + + timeo = caif_stream_data_wait(sk, timeo); + + if (signal_pending(current)) { + err = sock_intr_errno(timeo); + goto out; + } + caif_read_lock(sk); + continue; +unlock: + release_sock(sk); + break; + } + release_sock(sk); + chunk = min_t(unsigned int, skb->len, size); + if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) { + skb_queue_head(&sk->sk_receive_queue, skb); + if (copied == 0) + copied = -EFAULT; + break; + } + copied += chunk; + size -= chunk; + + /* Mark read part of skb as used */ + if (!(flags & MSG_PEEK)) { + skb_pull(skb, chunk); + + /* put the skb back if we didn't use it up. */ + if (skb->len) { + skb_queue_head(&sk->sk_receive_queue, skb); + break; + } + kfree_skb(skb); + + } else { + /* + * It is questionable, see note in unix_dgram_recvmsg. + */ + /* put message back and return */ + skb_queue_head(&sk->sk_receive_queue, skb); + break; + } + } while (size); + caif_read_unlock(sk); + +out: + return copied ? : err; +} + +/* + * Copied from sock.c:sock_wait_for_wmem, but change to wait for + * CAIF flow-on and sock_writable. + */ +static long caif_wait_for_flow_on(struct caifsock *cf_sk, + int wait_writeable, long timeo, int *err) +{ + struct sock *sk = &cf_sk->sk; + DEFINE_WAIT(wait); + for (;;) { + *err = 0; + if (tx_flow_is_on(cf_sk) && + (!wait_writeable || sock_writeable(&cf_sk->sk))) + break; + *err = -ETIMEDOUT; + if (!timeo) + break; + *err = -ERESTARTSYS; + if (signal_pending(current)) + break; + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + *err = -ECONNRESET; + if (sk->sk_shutdown & SHUTDOWN_MASK) + break; + *err = -sk->sk_err; + if (sk->sk_err) + break; + *err = -EPIPE; + if (cf_sk->sk.sk_state != CAIF_CONNECTED) + break; + timeo = schedule_timeout(timeo); + } + finish_wait(sk_sleep(sk), &wait); + return timeo; +} + +/* + * Transmit a SKB. The device may temporarily request re-transmission + * by returning EAGAIN. 
+ */ +static int transmit_skb(struct sk_buff *skb, struct caifsock *cf_sk, + int noblock, long timeo) +{ + struct cfpkt *pkt; + + pkt = cfpkt_fromnative(CAIF_DIR_OUT, skb); + memset(skb->cb, 0, sizeof(struct caif_payload_info)); + + if (cf_sk->layer.dn == NULL) + return -EINVAL; + + return cf_sk->layer.dn->transmit(cf_sk->layer.dn, pkt); +} + +/* Copied from af_unix:unix_dgram_sendmsg, and adapted to CAIF */ +static int caif_seqpkt_sendmsg(struct kiocb *kiocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); + int buffer_size; + int ret = 0; + struct sk_buff *skb = NULL; + int noblock; + long timeo; + caif_assert(cf_sk); + ret = sock_error(sk); + if (ret) + goto err; + + ret = -EOPNOTSUPP; + if (msg->msg_flags&MSG_OOB) + goto err; + + ret = -EOPNOTSUPP; + if (msg->msg_namelen) + goto err; + + ret = -EINVAL; + if (unlikely(msg->msg_iov->iov_base == NULL)) + goto err; + noblock = msg->msg_flags & MSG_DONTWAIT; + + timeo = sock_sndtimeo(sk, noblock); + timeo = caif_wait_for_flow_on(container_of(sk, struct caifsock, sk), + 1, timeo, &ret); + + if (ret) + goto err; + ret = -EPIPE; + if (cf_sk->sk.sk_state != CAIF_CONNECTED || + sock_flag(sk, SOCK_DEAD) || + (sk->sk_shutdown & RCV_SHUTDOWN)) + goto err; + + /* Error if trying to write more than maximum frame size. */ + ret = -EMSGSIZE; + if (len > cf_sk->maxframe && cf_sk->sk.sk_protocol != CAIFPROTO_RFM) + goto err; + + buffer_size = len + cf_sk->headroom + cf_sk->tailroom; + + ret = -ENOMEM; + skb = sock_alloc_send_skb(sk, buffer_size, noblock, &ret); + + if (!skb || skb_tailroom(skb) < buffer_size) + goto err; + + skb_reserve(skb, cf_sk->headroom); + + ret = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); + + if (ret) + goto err; + ret = transmit_skb(skb, cf_sk, noblock, timeo); + if (ret < 0) + /* skb is already freed */ + return ret; + + return len; +err: + kfree_skb(skb); + return ret; +} + +/* + * Copied from unix_stream_sendmsg and adapted to CAIF: + * Changed removed permission handling and added waiting for flow on + * and other minor adaptations. + */ +static int caif_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); + int err, size; + struct sk_buff *skb; + int sent = 0; + long timeo; + + err = -EOPNOTSUPP; + if (unlikely(msg->msg_flags&MSG_OOB)) + goto out_err; + + if (unlikely(msg->msg_namelen)) + goto out_err; + + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + timeo = caif_wait_for_flow_on(cf_sk, 1, timeo, &err); + + if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN)) + goto pipe_err; + + while (sent < len) { + + size = len-sent; + + if (size > cf_sk->maxframe) + size = cf_sk->maxframe; + + /* If size is more than half of sndbuf, chop up message */ + if (size > ((sk->sk_sndbuf >> 1) - 64)) + size = (sk->sk_sndbuf >> 1) - 64; + + if (size > SKB_MAX_ALLOC) + size = SKB_MAX_ALLOC; + + skb = sock_alloc_send_skb(sk, + size + cf_sk->headroom + + cf_sk->tailroom, + msg->msg_flags&MSG_DONTWAIT, + &err); + if (skb == NULL) + goto out_err; + + skb_reserve(skb, cf_sk->headroom); + /* + * If you pass two values to the sock_alloc_send_skb + * it tries to grab the large buffer with GFP_NOFS + * (which can fail easily), and if it fails grab the + * fallback size buffer which is under a page and will + * succeed. 
[Alan] + */ + size = min_t(int, size, skb_tailroom(skb)); + + err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); + if (err) { + kfree_skb(skb); + goto out_err; + } + err = transmit_skb(skb, cf_sk, + msg->msg_flags&MSG_DONTWAIT, timeo); + if (err < 0) { + kfree_skb(skb); + goto pipe_err; + } + sent += size; + } + + return sent; + +pipe_err: + if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); + err = -EPIPE; +out_err: + return sent ? : err; +} + +static int setsockopt(struct socket *sock, + int lvl, int opt, char __user *ov, unsigned int ol) +{ + struct sock *sk = sock->sk; + struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); + int linksel; + + if (cf_sk->sk.sk_socket->state != SS_UNCONNECTED) + return -ENOPROTOOPT; + + switch (opt) { + case CAIFSO_LINK_SELECT: + if (ol < sizeof(int)) + return -EINVAL; + if (lvl != SOL_CAIF) + goto bad_sol; + if (copy_from_user(&linksel, ov, sizeof(int))) + return -EINVAL; + lock_sock(&(cf_sk->sk)); + cf_sk->conn_req.link_selector = linksel; + release_sock(&cf_sk->sk); + return 0; + + case CAIFSO_REQ_PARAM: + if (lvl != SOL_CAIF) + goto bad_sol; + if (cf_sk->sk.sk_protocol != CAIFPROTO_UTIL) + return -ENOPROTOOPT; + lock_sock(&(cf_sk->sk)); + if (ol > sizeof(cf_sk->conn_req.param.data) || + copy_from_user(&cf_sk->conn_req.param.data, ov, ol)) { + release_sock(&cf_sk->sk); + return -EINVAL; + } + cf_sk->conn_req.param.size = ol; + release_sock(&cf_sk->sk); + return 0; + + default: + return -ENOPROTOOPT; + } + + return 0; +bad_sol: + return -ENOPROTOOPT; + +} + +/* + * caif_connect() - Connect a CAIF Socket + * Copied and modified af_irda.c:irda_connect(). + * + * Note : by consulting "errno", the user space caller may learn the cause + * of the failure. Most of them are visible in the function, others may come + * from subroutines called and are listed here : + * o -EAFNOSUPPORT: bad socket family or type. + * o -ESOCKTNOSUPPORT: bad socket type or protocol + * o -EINVAL: bad socket address, or CAIF link type + * o -ECONNREFUSED: remote end refused the connection. + * o -EINPROGRESS: connect request sent but timed out (or non-blocking) + * o -EISCONN: already connected. + * o -ETIMEDOUT: Connection timed out (send timeout) + * o -ENODEV: No link layer to send request + * o -ECONNRESET: Received Shutdown indication or lost link layer + * o -ENOMEM: Out of memory + * + * State Strategy: + * o sk_state: holds the CAIF_* protocol state, it's updated by + * caif_ctrl_cb. + * o sock->state: holds the SS_* socket state and is updated by connect and + * disconnect. 
+ */ +static int caif_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); + long timeo; + int err; + int ifindex, headroom, tailroom; + unsigned int mtu; + struct net_device *dev; + + lock_sock(sk); + + err = -EAFNOSUPPORT; + if (uaddr->sa_family != AF_CAIF) + goto out; + + switch (sock->state) { + case SS_UNCONNECTED: + /* Normal case, a fresh connect */ + caif_assert(sk->sk_state == CAIF_DISCONNECTED); + break; + case SS_CONNECTING: + switch (sk->sk_state) { + case CAIF_CONNECTED: + sock->state = SS_CONNECTED; + err = -EISCONN; + goto out; + case CAIF_DISCONNECTED: + /* Reconnect allowed */ + break; + case CAIF_CONNECTING: + err = -EALREADY; + if (flags & O_NONBLOCK) + goto out; + goto wait_connect; + } + break; + case SS_CONNECTED: + caif_assert(sk->sk_state == CAIF_CONNECTED || + sk->sk_state == CAIF_DISCONNECTED); + if (sk->sk_shutdown & SHUTDOWN_MASK) { + /* Allow re-connect after SHUTDOWN_IND */ + caif_disconnect_client(sock_net(sk), &cf_sk->layer); + caif_free_client(&cf_sk->layer); + break; + } + /* No reconnect on a seqpacket socket */ + err = -EISCONN; + goto out; + case SS_DISCONNECTING: + case SS_FREE: + caif_assert(1); /*Should never happen */ + break; + } + sk->sk_state = CAIF_DISCONNECTED; + sock->state = SS_UNCONNECTED; + sk_stream_kill_queues(&cf_sk->sk); + + err = -EINVAL; + if (addr_len != sizeof(struct sockaddr_caif)) + goto out; + + memcpy(&cf_sk->conn_req.sockaddr, uaddr, + sizeof(struct sockaddr_caif)); + + /* Move to connecting socket, start sending Connect Requests */ + sock->state = SS_CONNECTING; + sk->sk_state = CAIF_CONNECTING; + + /* Check priority value comming from socket */ + /* if priority value is out of range it will be ajusted */ + if (cf_sk->sk.sk_priority > CAIF_PRIO_MAX) + cf_sk->conn_req.priority = CAIF_PRIO_MAX; + else if (cf_sk->sk.sk_priority < CAIF_PRIO_MIN) + cf_sk->conn_req.priority = CAIF_PRIO_MIN; + else + cf_sk->conn_req.priority = cf_sk->sk.sk_priority; + + /*ifindex = id of the interface.*/ + cf_sk->conn_req.ifindex = cf_sk->sk.sk_bound_dev_if; + + dbfs_atomic_inc(&cnt.num_connect_req); + cf_sk->layer.receive = caif_sktrecv_cb; + + err = caif_connect_client(sock_net(sk), &cf_sk->conn_req, + &cf_sk->layer, &ifindex, &headroom, &tailroom); + + if (err < 0) { + cf_sk->sk.sk_socket->state = SS_UNCONNECTED; + cf_sk->sk.sk_state = CAIF_DISCONNECTED; + goto out; + } + + err = -ENODEV; + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(sk), ifindex); + if (!dev) { + rcu_read_unlock(); + goto out; + } + cf_sk->headroom = LL_RESERVED_SPACE_EXTRA(dev, headroom); + mtu = dev->mtu; + rcu_read_unlock(); + + cf_sk->tailroom = tailroom; + cf_sk->maxframe = mtu - (headroom + tailroom); + if (cf_sk->maxframe < 1) { + pr_warn("CAIF Interface MTU too small (%d)\n", dev->mtu); + err = -ENODEV; + goto out; + } + + err = -EINPROGRESS; +wait_connect: + + if (sk->sk_state != CAIF_CONNECTED && (flags & O_NONBLOCK)) + goto out; + + timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); + + release_sock(sk); + err = -ERESTARTSYS; + timeo = wait_event_interruptible_timeout(*sk_sleep(sk), + sk->sk_state != CAIF_CONNECTING, + timeo); + lock_sock(sk); + if (timeo < 0) + goto out; /* -ERESTARTSYS */ + + err = -ETIMEDOUT; + if (timeo == 0 && sk->sk_state != CAIF_CONNECTED) + goto out; + if (sk->sk_state != CAIF_CONNECTED) { + sock->state = SS_UNCONNECTED; + err = sock_error(sk); + if (!err) + err = -ECONNREFUSED; + goto out; + } + sock->state = 
SS_CONNECTED; + err = 0; +out: + release_sock(sk); + return err; +} + +/* + * caif_release() - Disconnect a CAIF Socket + * Copied and modified af_irda.c:irda_release(). + */ +static int caif_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); + + if (!sk) + return 0; + + set_tx_flow_off(cf_sk); + + /* + * Ensure that packets are not queued after this point in time. + * caif_queue_rcv_skb checks SOCK_DEAD holding the queue lock, + * this ensures no packets when sock is dead. + */ + spin_lock_bh(&sk->sk_receive_queue.lock); + sock_set_flag(sk, SOCK_DEAD); + spin_unlock_bh(&sk->sk_receive_queue.lock); + sock->sk = NULL; + + dbfs_atomic_inc(&cnt.num_disconnect); + + WARN_ON(IS_ERR(cf_sk->debugfs_socket_dir)); + if (cf_sk->debugfs_socket_dir != NULL) + debugfs_remove_recursive(cf_sk->debugfs_socket_dir); + + lock_sock(&(cf_sk->sk)); + sk->sk_state = CAIF_DISCONNECTED; + sk->sk_shutdown = SHUTDOWN_MASK; + + caif_disconnect_client(sock_net(sk), &cf_sk->layer); + cf_sk->sk.sk_socket->state = SS_DISCONNECTING; + wake_up_interruptible_poll(sk_sleep(sk), POLLERR|POLLHUP); + + sock_orphan(sk); + sk_stream_kill_queues(&cf_sk->sk); + release_sock(sk); + sock_put(sk); + return 0; +} + +/* Copied from af_unix.c:unix_poll(), added CAIF tx_flow handling */ +static unsigned int caif_poll(struct file *file, + struct socket *sock, poll_table *wait) +{ + struct sock *sk = sock->sk; + unsigned int mask; + struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; + + /* exceptional events? */ + if (sk->sk_err) + mask |= POLLERR; + if (sk->sk_shutdown == SHUTDOWN_MASK) + mask |= POLLHUP; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= POLLRDHUP; + + /* readable? */ + if (!skb_queue_empty(&sk->sk_receive_queue) || + (sk->sk_shutdown & RCV_SHUTDOWN)) + mask |= POLLIN | POLLRDNORM; + + /* + * we set writable also when the other side has shut down the + * connection. This prevents stuck sockets. + */ + if (sock_writeable(sk) && tx_flow_is_on(cf_sk)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + + return mask; +} + +static const struct proto_ops caif_seqpacket_ops = { + .family = PF_CAIF, + .owner = THIS_MODULE, + .release = caif_release, + .bind = sock_no_bind, + .connect = caif_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = caif_poll, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = caif_seqpkt_sendmsg, + .recvmsg = caif_seqpkt_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static const struct proto_ops caif_stream_ops = { + .family = PF_CAIF, + .owner = THIS_MODULE, + .release = caif_release, + .bind = sock_no_bind, + .connect = caif_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = caif_poll, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = caif_stream_sendmsg, + .recvmsg = caif_stream_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +/* This function is called when a socket is finally destroyed. 
*/ +static void caif_sock_destructor(struct sock *sk) +{ + struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); + caif_assert(!atomic_read(&sk->sk_wmem_alloc)); + caif_assert(sk_unhashed(sk)); + caif_assert(!sk->sk_socket); + if (!sock_flag(sk, SOCK_DEAD)) { + pr_debug("Attempt to release alive CAIF socket: %p\n", sk); + return; + } + sk_stream_kill_queues(&cf_sk->sk); + dbfs_atomic_dec(&cnt.caif_nr_socks); + caif_free_client(&cf_sk->layer); +} + +static int caif_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + int num; + struct sock *sk = NULL; + struct caifsock *cf_sk = NULL; + static struct proto prot = {.name = "PF_CAIF", + .owner = THIS_MODULE, + .obj_size = sizeof(struct caifsock), + }; + + if (!capable(CAP_SYS_ADMIN) && !capable(CAP_NET_ADMIN)) + return -EPERM; + /* + * The sock->type specifies the socket type to use. + * The CAIF socket is a packet stream in the sense + * that it is packet based. CAIF trusts the reliability + * of the link, no resending is implemented. + */ + if (sock->type == SOCK_SEQPACKET) + sock->ops = &caif_seqpacket_ops; + else if (sock->type == SOCK_STREAM) + sock->ops = &caif_stream_ops; + else + return -ESOCKTNOSUPPORT; + + if (protocol < 0 || protocol >= CAIFPROTO_MAX) + return -EPROTONOSUPPORT; + /* + * Set the socket state to unconnected. The socket state + * is really not used at all in the net/core or socket.c but the + * initialization makes sure that sock->state is not uninitialized. + */ + sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot); + if (!sk) + return -ENOMEM; + + cf_sk = container_of(sk, struct caifsock, sk); + + /* Store the protocol */ + sk->sk_protocol = (unsigned char) protocol; + + /* + * Lock in order to try to stop someone from opening the socket + * too early. + */ + lock_sock(&(cf_sk->sk)); + + /* Initialize the nozero default sock structure data. */ + sock_init_data(sock, sk); + sk->sk_destruct = caif_sock_destructor; + + mutex_init(&cf_sk->readlock); /* single task reading lock */ + cf_sk->layer.ctrlcmd = caif_ctrl_cb; + cf_sk->sk.sk_socket->state = SS_UNCONNECTED; + cf_sk->sk.sk_state = CAIF_DISCONNECTED; + + set_tx_flow_off(cf_sk); + set_rx_flow_on(cf_sk); + + /* Set default options on configuration */ + cf_sk->sk.sk_priority = CAIF_PRIO_NORMAL; + cf_sk->conn_req.link_selector = CAIF_LINK_LOW_LATENCY; + cf_sk->conn_req.protocol = protocol; + /* Increase the number of sockets created. */ + dbfs_atomic_inc(&cnt.caif_nr_socks); + num = dbfs_atomic_inc(&cnt.caif_sock_create); +#ifdef CONFIG_DEBUG_FS + if (!IS_ERR(debugfsdir)) { + + /* Fill in some information concerning the misc socket. 
*/ + snprintf(cf_sk->name, sizeof(cf_sk->name), "cfsk%d", num); + + cf_sk->debugfs_socket_dir = + debugfs_create_dir(cf_sk->name, debugfsdir); + + debugfs_create_u32("sk_state", S_IRUSR | S_IWUSR, + cf_sk->debugfs_socket_dir, + (u32 *) &cf_sk->sk.sk_state); + debugfs_create_u32("flow_state", S_IRUSR | S_IWUSR, + cf_sk->debugfs_socket_dir, &cf_sk->flow_state); + debugfs_create_u32("sk_rmem_alloc", S_IRUSR | S_IWUSR, + cf_sk->debugfs_socket_dir, + (u32 *) &cf_sk->sk.sk_rmem_alloc); + debugfs_create_u32("sk_wmem_alloc", S_IRUSR | S_IWUSR, + cf_sk->debugfs_socket_dir, + (u32 *) &cf_sk->sk.sk_wmem_alloc); + debugfs_create_u32("identity", S_IRUSR | S_IWUSR, + cf_sk->debugfs_socket_dir, + (u32 *) &cf_sk->layer.id); + } +#endif + release_sock(&cf_sk->sk); + return 0; +} + + +static struct net_proto_family caif_family_ops = { + .family = PF_CAIF, + .create = caif_create, + .owner = THIS_MODULE, +}; + +static int af_caif_init(void) +{ + int err = sock_register(&caif_family_ops); + if (!err) + return err; + return 0; +} + +static int __init caif_sktinit_module(void) +{ +#ifdef CONFIG_DEBUG_FS + debugfsdir = debugfs_create_dir("caif_sk", NULL); + if (!IS_ERR(debugfsdir)) { + debugfs_create_u32("num_sockets", S_IRUSR | S_IWUSR, + debugfsdir, + (u32 *) &cnt.caif_nr_socks); + debugfs_create_u32("num_create", S_IRUSR | S_IWUSR, + debugfsdir, + (u32 *) &cnt.caif_sock_create); + debugfs_create_u32("num_connect_req", S_IRUSR | S_IWUSR, + debugfsdir, + (u32 *) &cnt.num_connect_req); + debugfs_create_u32("num_connect_resp", S_IRUSR | S_IWUSR, + debugfsdir, + (u32 *) &cnt.num_connect_resp); + debugfs_create_u32("num_connect_fail_resp", S_IRUSR | S_IWUSR, + debugfsdir, + (u32 *) &cnt.num_connect_fail_resp); + debugfs_create_u32("num_disconnect", S_IRUSR | S_IWUSR, + debugfsdir, + (u32 *) &cnt.num_disconnect); + debugfs_create_u32("num_remote_shutdown_ind", + S_IRUSR | S_IWUSR, debugfsdir, + (u32 *) &cnt.num_remote_shutdown_ind); + debugfs_create_u32("num_tx_flow_off_ind", S_IRUSR | S_IWUSR, + debugfsdir, + (u32 *) &cnt.num_tx_flow_off_ind); + debugfs_create_u32("num_tx_flow_on_ind", S_IRUSR | S_IWUSR, + debugfsdir, + (u32 *) &cnt.num_tx_flow_on_ind); + debugfs_create_u32("num_rx_flow_off", S_IRUSR | S_IWUSR, + debugfsdir, + (u32 *) &cnt.num_rx_flow_off); + debugfs_create_u32("num_rx_flow_on", S_IRUSR | S_IWUSR, + debugfsdir, + (u32 *) &cnt.num_rx_flow_on); + } +#endif + return af_caif_init(); +} + +static void __exit caif_sktexit_module(void) +{ + sock_unregister(PF_CAIF); + if (debugfsdir != NULL) + debugfs_remove_recursive(debugfsdir); +} +module_init(caif_sktinit_module); +module_exit(caif_sktexit_module); diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c new file mode 100644 index 00000000..bca32d7c --- /dev/null +++ b/net/caif/cfcnfg.c @@ -0,0 +1,631 @@ +/* + * Copyright (C) ST-Ericsson AB 2010 + * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * License terms: GNU General Public License (GPL) version 2 + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define container_obj(layr) container_of(layr, struct cfcnfg, layer) + +/* Information about CAIF physical interfaces held by Config Module in order + * to manage physical interfaces + */ +struct cfcnfg_phyinfo { + struct list_head node; + bool up; + + /* Pointer to the layer below the MUX (framing layer) */ + struct cflayer *frm_layer; + /* Pointer to the lowest actual physical layer */ + 
struct cflayer *phy_layer; + /* Unique identifier of the physical interface */ + unsigned int id; + /* Preference of the physical in interface */ + enum cfcnfg_phy_preference pref; + + /* Information about the physical device */ + struct dev_info dev_info; + + /* Interface index */ + int ifindex; + + /* Use Start of frame extension */ + bool use_stx; + + /* Use Start of frame checksum */ + bool use_fcs; +}; + +struct cfcnfg { + struct cflayer layer; + struct cflayer *ctrl; + struct cflayer *mux; + struct list_head phys; + struct mutex lock; +}; + +static void cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, + enum cfctrl_srv serv, u8 phyid, + struct cflayer *adapt_layer); +static void cfcnfg_linkdestroy_rsp(struct cflayer *layer, u8 channel_id); +static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id, + struct cflayer *adapt_layer); +static void cfctrl_resp_func(void); +static void cfctrl_enum_resp(void); + +struct cfcnfg *cfcnfg_create(void) +{ + struct cfcnfg *this; + struct cfctrl_rsp *resp; + + might_sleep(); + + /* Initiate this layer */ + this = kzalloc(sizeof(struct cfcnfg), GFP_ATOMIC); + if (!this) { + pr_warn("Out of memory\n"); + return NULL; + } + this->mux = cfmuxl_create(); + if (!this->mux) + goto out_of_mem; + this->ctrl = cfctrl_create(); + if (!this->ctrl) + goto out_of_mem; + /* Initiate response functions */ + resp = cfctrl_get_respfuncs(this->ctrl); + resp->enum_rsp = cfctrl_enum_resp; + resp->linkerror_ind = cfctrl_resp_func; + resp->linkdestroy_rsp = cfcnfg_linkdestroy_rsp; + resp->sleep_rsp = cfctrl_resp_func; + resp->wake_rsp = cfctrl_resp_func; + resp->restart_rsp = cfctrl_resp_func; + resp->radioset_rsp = cfctrl_resp_func; + resp->linksetup_rsp = cfcnfg_linkup_rsp; + resp->reject_rsp = cfcnfg_reject_rsp; + INIT_LIST_HEAD(&this->phys); + + cfmuxl_set_uplayer(this->mux, this->ctrl, 0); + layer_set_dn(this->ctrl, this->mux); + layer_set_up(this->ctrl, this); + mutex_init(&this->lock); + + return this; +out_of_mem: + pr_warn("Out of memory\n"); + + synchronize_rcu(); + + kfree(this->mux); + kfree(this->ctrl); + kfree(this); + return NULL; +} + +void cfcnfg_remove(struct cfcnfg *cfg) +{ + might_sleep(); + if (cfg) { + synchronize_rcu(); + + kfree(cfg->mux); + cfctrl_remove(cfg->ctrl); + kfree(cfg); + } +} + +static void cfctrl_resp_func(void) +{ +} + +static struct cfcnfg_phyinfo *cfcnfg_get_phyinfo_rcu(struct cfcnfg *cnfg, + u8 phyid) +{ + struct cfcnfg_phyinfo *phy; + + list_for_each_entry_rcu(phy, &cnfg->phys, node) + if (phy->id == phyid) + return phy; + return NULL; +} + +static void cfctrl_enum_resp(void) +{ +} + +static struct dev_info *cfcnfg_get_phyid(struct cfcnfg *cnfg, + enum cfcnfg_phy_preference phy_pref) +{ + /* Try to match with specified preference */ + struct cfcnfg_phyinfo *phy; + + list_for_each_entry_rcu(phy, &cnfg->phys, node) { + if (phy->up && phy->pref == phy_pref && + phy->frm_layer != NULL) + + return &phy->dev_info; + } + + /* Otherwise just return something */ + list_for_each_entry_rcu(phy, &cnfg->phys, node) + if (phy->up) + return &phy->dev_info; + + return NULL; +} + +static int cfcnfg_get_id_from_ifi(struct cfcnfg *cnfg, int ifi) +{ + struct cfcnfg_phyinfo *phy; + + list_for_each_entry_rcu(phy, &cnfg->phys, node) + if (phy->ifindex == ifi && phy->up) + return phy->id; + return -ENODEV; +} + +int caif_disconnect_client(struct net *net, struct cflayer *adap_layer) +{ + u8 channel_id; + struct cfcnfg *cfg = get_cfcnfg(net); + + caif_assert(adap_layer != NULL); + cfctrl_cancel_req(cfg->ctrl, adap_layer); + channel_id = 
adap_layer->id; + if (channel_id != 0) { + struct cflayer *servl; + servl = cfmuxl_remove_uplayer(cfg->mux, channel_id); + if (servl != NULL) + layer_set_up(servl, NULL); + } else + pr_debug("nothing to disconnect\n"); + cfctrl_linkdown_req(cfg->ctrl, channel_id, adap_layer); + + /* Do RCU sync before initiating cleanup */ + synchronize_rcu(); + if (adap_layer->ctrlcmd != NULL) + adap_layer->ctrlcmd(adap_layer, CAIF_CTRLCMD_DEINIT_RSP, 0); + return 0; + +} +EXPORT_SYMBOL(caif_disconnect_client); + +static void cfcnfg_linkdestroy_rsp(struct cflayer *layer, u8 channel_id) +{ +} + +static const int protohead[CFCTRL_SRV_MASK] = { + [CFCTRL_SRV_VEI] = 4, + [CFCTRL_SRV_DATAGRAM] = 7, + [CFCTRL_SRV_UTIL] = 4, + [CFCTRL_SRV_RFM] = 3, + [CFCTRL_SRV_DBG] = 3, +}; + + +static int caif_connect_req_to_link_param(struct cfcnfg *cnfg, + struct caif_connect_request *s, + struct cfctrl_link_param *l) +{ + struct dev_info *dev_info; + enum cfcnfg_phy_preference pref; + int res; + + memset(l, 0, sizeof(*l)); + /* In caif protocol low value is high priority */ + l->priority = CAIF_PRIO_MAX - s->priority + 1; + + if (s->ifindex != 0) { + res = cfcnfg_get_id_from_ifi(cnfg, s->ifindex); + if (res < 0) + return res; + l->phyid = res; + } else { + switch (s->link_selector) { + case CAIF_LINK_HIGH_BANDW: + pref = CFPHYPREF_HIGH_BW; + break; + case CAIF_LINK_LOW_LATENCY: + pref = CFPHYPREF_LOW_LAT; + break; + default: + return -EINVAL; + } + dev_info = cfcnfg_get_phyid(cnfg, pref); + if (dev_info == NULL) + return -ENODEV; + l->phyid = dev_info->id; + } + switch (s->protocol) { + case CAIFPROTO_AT: + l->linktype = CFCTRL_SRV_VEI; + l->endpoint = (s->sockaddr.u.at.type >> 2) & 0x3; + l->chtype = s->sockaddr.u.at.type & 0x3; + break; + case CAIFPROTO_DATAGRAM: + l->linktype = CFCTRL_SRV_DATAGRAM; + l->chtype = 0x00; + l->u.datagram.connid = s->sockaddr.u.dgm.connection_id; + break; + case CAIFPROTO_DATAGRAM_LOOP: + l->linktype = CFCTRL_SRV_DATAGRAM; + l->chtype = 0x03; + l->endpoint = 0x00; + l->u.datagram.connid = s->sockaddr.u.dgm.connection_id; + break; + case CAIFPROTO_RFM: + l->linktype = CFCTRL_SRV_RFM; + l->u.datagram.connid = s->sockaddr.u.rfm.connection_id; + strncpy(l->u.rfm.volume, s->sockaddr.u.rfm.volume, + sizeof(l->u.rfm.volume)-1); + l->u.rfm.volume[sizeof(l->u.rfm.volume)-1] = 0; + break; + case CAIFPROTO_UTIL: + l->linktype = CFCTRL_SRV_UTIL; + l->endpoint = 0x00; + l->chtype = 0x00; + strncpy(l->u.utility.name, s->sockaddr.u.util.service, + sizeof(l->u.utility.name)-1); + l->u.utility.name[sizeof(l->u.utility.name)-1] = 0; + caif_assert(sizeof(l->u.utility.name) > 10); + l->u.utility.paramlen = s->param.size; + if (l->u.utility.paramlen > sizeof(l->u.utility.params)) + l->u.utility.paramlen = sizeof(l->u.utility.params); + + memcpy(l->u.utility.params, s->param.data, + l->u.utility.paramlen); + + break; + case CAIFPROTO_DEBUG: + l->linktype = CFCTRL_SRV_DBG; + l->endpoint = s->sockaddr.u.dbg.service; + l->chtype = s->sockaddr.u.dbg.type; + break; + default: + return -EINVAL; + } + return 0; +} + +int caif_connect_client(struct net *net, struct caif_connect_request *conn_req, + struct cflayer *adap_layer, int *ifindex, + int *proto_head, + int *proto_tail) +{ + struct cflayer *frml; + struct cfcnfg_phyinfo *phy; + int err; + struct cfctrl_link_param param; + struct cfcnfg *cfg = get_cfcnfg(net); + + rcu_read_lock(); + err = caif_connect_req_to_link_param(cfg, conn_req, ¶m); + if (err) + goto unlock; + + phy = cfcnfg_get_phyinfo_rcu(cfg, param.phyid); + if (!phy) { + err = -ENODEV; + goto unlock; + } 
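	/*
	 * Worked example of the *proto_head / *proto_tail accounting done
	 * just below (values taken from the protohead[] table above; treat
	 * this purely as an illustration): a CAIFPROTO_DATAGRAM request
	 * maps to CFCTRL_SRV_DATAGRAM, so protohead[param.linktype] == 7.
	 * On a physical interface with use_stx set the caller is told
	 *
	 *	*proto_head = 7 + 1 = 8	bytes of per-packet headroom
	 *	*proto_tail = 2		bytes of per-packet tailroom
	 *
	 * caif_connect() later folds these into cf_sk->headroom,
	 * cf_sk->tailroom and the maxframe calculation.
	 */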
+ err = -EINVAL; + + if (adap_layer == NULL) { + pr_err("adap_layer is zero\n"); + goto unlock; + } + if (adap_layer->receive == NULL) { + pr_err("adap_layer->receive is NULL\n"); + goto unlock; + } + if (adap_layer->ctrlcmd == NULL) { + pr_err("adap_layer->ctrlcmd == NULL\n"); + goto unlock; + } + + err = -ENODEV; + frml = phy->frm_layer; + if (frml == NULL) { + pr_err("Specified PHY type does not exist!\n"); + goto unlock; + } + caif_assert(param.phyid == phy->id); + caif_assert(phy->frm_layer->id == + param.phyid); + caif_assert(phy->phy_layer->id == + param.phyid); + + *ifindex = phy->ifindex; + *proto_tail = 2; + *proto_head = + + protohead[param.linktype] + (phy->use_stx ? 1 : 0); + + rcu_read_unlock(); + + /* FIXME: ENUMERATE INITIALLY WHEN ACTIVATING PHYSICAL INTERFACE */ + cfctrl_enum_req(cfg->ctrl, param.phyid); + return cfctrl_linkup_request(cfg->ctrl, ¶m, adap_layer); + +unlock: + rcu_read_unlock(); + return err; +} +EXPORT_SYMBOL(caif_connect_client); + +static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id, + struct cflayer *adapt_layer) +{ + if (adapt_layer != NULL && adapt_layer->ctrlcmd != NULL) + adapt_layer->ctrlcmd(adapt_layer, + CAIF_CTRLCMD_INIT_FAIL_RSP, 0); +} + +static void +cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv, + u8 phyid, struct cflayer *adapt_layer) +{ + struct cfcnfg *cnfg = container_obj(layer); + struct cflayer *servicel = NULL; + struct cfcnfg_phyinfo *phyinfo; + struct net_device *netdev; + + if (channel_id == 0) { + pr_warn("received channel_id zero\n"); + if (adapt_layer != NULL && adapt_layer->ctrlcmd != NULL) + adapt_layer->ctrlcmd(adapt_layer, + CAIF_CTRLCMD_INIT_FAIL_RSP, 0); + return; + } + + rcu_read_lock(); + + if (adapt_layer == NULL) { + pr_debug("link setup response but no client exist," + "send linkdown back\n"); + cfctrl_linkdown_req(cnfg->ctrl, channel_id, NULL); + goto unlock; + } + + caif_assert(cnfg != NULL); + caif_assert(phyid != 0); + + phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid); + if (phyinfo == NULL) { + pr_err("ERROR: Link Layer Device dissapeared" + "while connecting\n"); + goto unlock; + } + + caif_assert(phyinfo != NULL); + caif_assert(phyinfo->id == phyid); + caif_assert(phyinfo->phy_layer != NULL); + caif_assert(phyinfo->phy_layer->id == phyid); + + adapt_layer->id = channel_id; + + switch (serv) { + case CFCTRL_SRV_VEI: + servicel = cfvei_create(channel_id, &phyinfo->dev_info); + break; + case CFCTRL_SRV_DATAGRAM: + servicel = cfdgml_create(channel_id, + &phyinfo->dev_info); + break; + case CFCTRL_SRV_RFM: + netdev = phyinfo->dev_info.dev; + servicel = cfrfml_create(channel_id, &phyinfo->dev_info, + netdev->mtu); + break; + case CFCTRL_SRV_UTIL: + servicel = cfutill_create(channel_id, &phyinfo->dev_info); + break; + case CFCTRL_SRV_VIDEO: + servicel = cfvidl_create(channel_id, &phyinfo->dev_info); + break; + case CFCTRL_SRV_DBG: + servicel = cfdbgl_create(channel_id, &phyinfo->dev_info); + break; + default: + pr_err("Protocol error. 
Link setup response " + "- unknown channel type\n"); + goto unlock; + } + if (!servicel) { + pr_warn("Out of memory\n"); + goto unlock; + } + layer_set_dn(servicel, cnfg->mux); + cfmuxl_set_uplayer(cnfg->mux, servicel, channel_id); + layer_set_up(servicel, adapt_layer); + layer_set_dn(adapt_layer, servicel); + + rcu_read_unlock(); + + servicel->ctrlcmd(servicel, CAIF_CTRLCMD_INIT_RSP, 0); + return; +unlock: + rcu_read_unlock(); +} + +void +cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type, + struct net_device *dev, struct cflayer *phy_layer, + enum cfcnfg_phy_preference pref, + bool fcs, bool stx) +{ + struct cflayer *frml; + struct cflayer *phy_driver = NULL; + struct cfcnfg_phyinfo *phyinfo; + int i; + u8 phyid; + + mutex_lock(&cnfg->lock); + + /* CAIF protocol allow maximum 6 link-layers */ + for (i = 0; i < 7; i++) { + phyid = (dev->ifindex + i) & 0x7; + if (phyid == 0) + continue; + if (cfcnfg_get_phyinfo_rcu(cnfg, phyid) == NULL) + goto got_phyid; + } + pr_warn("Too many CAIF Link Layers (max 6)\n"); + goto out; + +got_phyid: + phyinfo = kzalloc(sizeof(struct cfcnfg_phyinfo), GFP_ATOMIC); + + switch (phy_type) { + case CFPHYTYPE_FRAG: + phy_driver = + cfserl_create(CFPHYTYPE_FRAG, phyid, stx); + if (!phy_driver) { + pr_warn("Out of memory\n"); + goto out; + } + break; + case CFPHYTYPE_CAIF: + phy_driver = NULL; + break; + default: + goto out; + } + phy_layer->id = phyid; + phyinfo->pref = pref; + phyinfo->id = phyid; + phyinfo->dev_info.id = phyid; + phyinfo->dev_info.dev = dev; + phyinfo->phy_layer = phy_layer; + phyinfo->ifindex = dev->ifindex; + phyinfo->use_stx = stx; + phyinfo->use_fcs = fcs; + + frml = cffrml_create(phyid, fcs); + + if (!frml) { + pr_warn("Out of memory\n"); + kfree(phyinfo); + goto out; + } + phyinfo->frm_layer = frml; + layer_set_up(frml, cnfg->mux); + + if (phy_driver != NULL) { + phy_driver->id = phyid; + layer_set_dn(frml, phy_driver); + layer_set_up(phy_driver, frml); + layer_set_dn(phy_driver, phy_layer); + layer_set_up(phy_layer, phy_driver); + } else { + layer_set_dn(frml, phy_layer); + layer_set_up(phy_layer, frml); + } + + list_add_rcu(&phyinfo->node, &cnfg->phys); +out: + mutex_unlock(&cnfg->lock); +} +EXPORT_SYMBOL(cfcnfg_add_phy_layer); + +int cfcnfg_set_phy_state(struct cfcnfg *cnfg, struct cflayer *phy_layer, + bool up) +{ + struct cfcnfg_phyinfo *phyinfo; + + rcu_read_lock(); + phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phy_layer->id); + if (phyinfo == NULL) { + rcu_read_unlock(); + return -ENODEV; + } + + if (phyinfo->up == up) { + rcu_read_unlock(); + return 0; + } + phyinfo->up = up; + + if (up) { + cffrml_hold(phyinfo->frm_layer); + cfmuxl_set_dnlayer(cnfg->mux, phyinfo->frm_layer, + phy_layer->id); + } else { + cfmuxl_remove_dnlayer(cnfg->mux, phy_layer->id); + cffrml_put(phyinfo->frm_layer); + } + + rcu_read_unlock(); + return 0; +} +EXPORT_SYMBOL(cfcnfg_set_phy_state); + +int cfcnfg_del_phy_layer(struct cfcnfg *cnfg, struct cflayer *phy_layer) +{ + struct cflayer *frml, *frml_dn; + u16 phyid; + struct cfcnfg_phyinfo *phyinfo; + + might_sleep(); + + mutex_lock(&cnfg->lock); + + phyid = phy_layer->id; + phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid); + + if (phyinfo == NULL) { + mutex_unlock(&cnfg->lock); + return 0; + } + caif_assert(phyid == phyinfo->id); + caif_assert(phy_layer == phyinfo->phy_layer); + caif_assert(phy_layer->id == phyid); + caif_assert(phyinfo->frm_layer->id == phyid); + + list_del_rcu(&phyinfo->node); + synchronize_rcu(); + + /* Fail if reference count is not zero */ + if 
(cffrml_refcnt_read(phyinfo->frm_layer) != 0) { + pr_info("Wait for device inuse\n"); + list_add_rcu(&phyinfo->node, &cnfg->phys); + mutex_unlock(&cnfg->lock); + return -EAGAIN; + } + + frml = phyinfo->frm_layer; + frml_dn = frml->dn; + cffrml_set_uplayer(frml, NULL); + cffrml_set_dnlayer(frml, NULL); + if (phy_layer != frml_dn) { + layer_set_up(frml_dn, NULL); + layer_set_dn(frml_dn, NULL); + } + layer_set_up(phy_layer, NULL); + + if (phyinfo->phy_layer != frml_dn) + kfree(frml_dn); + + cffrml_free(frml); + kfree(phyinfo); + mutex_unlock(&cnfg->lock); + + return 0; +} +EXPORT_SYMBOL(cfcnfg_del_phy_layer); diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c new file mode 100644 index 00000000..e22671be --- /dev/null +++ b/net/caif/cfctrl.c @@ -0,0 +1,648 @@ +/* + * Copyright (C) ST-Ericsson AB 2010 + * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * License terms: GNU General Public License (GPL) version 2 + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + +#include +#include +#include +#include +#include +#include + +#define container_obj(layr) container_of(layr, struct cfctrl, serv.layer) +#define UTILITY_NAME_LENGTH 16 +#define CFPKT_CTRL_PKT_LEN 20 + +#ifdef CAIF_NO_LOOP +static int handle_loop(struct cfctrl *ctrl, + int cmd, struct cfpkt *pkt){ + return -1; +} +#else +static int handle_loop(struct cfctrl *ctrl, + int cmd, struct cfpkt *pkt); +#endif +static int cfctrl_recv(struct cflayer *layr, struct cfpkt *pkt); +static void cfctrl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, + int phyid); + + +struct cflayer *cfctrl_create(void) +{ + struct dev_info dev_info; + struct cfctrl *this = + kmalloc(sizeof(struct cfctrl), GFP_ATOMIC); + if (!this) { + pr_warn("Out of memory\n"); + return NULL; + } + caif_assert(offsetof(struct cfctrl, serv.layer) == 0); + memset(&dev_info, 0, sizeof(dev_info)); + dev_info.id = 0xff; + memset(this, 0, sizeof(*this)); + cfsrvl_init(&this->serv, 0, &dev_info, false); + atomic_set(&this->req_seq_no, 1); + atomic_set(&this->rsp_seq_no, 1); + this->serv.layer.receive = cfctrl_recv; + sprintf(this->serv.layer.name, "ctrl"); + this->serv.layer.ctrlcmd = cfctrl_ctrlcmd; +#ifndef CAIF_NO_LOOP + spin_lock_init(&this->loop_linkid_lock); + this->loop_linkid = 1; +#endif + spin_lock_init(&this->info_list_lock); + INIT_LIST_HEAD(&this->list); + return &this->serv.layer; +} + +void cfctrl_remove(struct cflayer *layer) +{ + struct cfctrl_request_info *p, *tmp; + struct cfctrl *ctrl = container_obj(layer); + + spin_lock_bh(&ctrl->info_list_lock); + list_for_each_entry_safe(p, tmp, &ctrl->list, list) { + list_del(&p->list); + kfree(p); + } + spin_unlock_bh(&ctrl->info_list_lock); + kfree(layer); +} + +static bool param_eq(const struct cfctrl_link_param *p1, + const struct cfctrl_link_param *p2) +{ + bool eq = + p1->linktype == p2->linktype && + p1->priority == p2->priority && + p1->phyid == p2->phyid && + p1->endpoint == p2->endpoint && p1->chtype == p2->chtype; + + if (!eq) + return false; + + switch (p1->linktype) { + case CFCTRL_SRV_VEI: + return true; + case CFCTRL_SRV_DATAGRAM: + return p1->u.datagram.connid == p2->u.datagram.connid; + case CFCTRL_SRV_RFM: + return + p1->u.rfm.connid == p2->u.rfm.connid && + strcmp(p1->u.rfm.volume, p2->u.rfm.volume) == 0; + case CFCTRL_SRV_UTIL: + return + p1->u.utility.fifosize_kb == p2->u.utility.fifosize_kb + && p1->u.utility.fifosize_bufs == + p2->u.utility.fifosize_bufs + && strcmp(p1->u.utility.name, p2->u.utility.name) == 0 + && p1->u.utility.paramlen == p2->u.utility.paramlen + && 
memcmp(p1->u.utility.params, p2->u.utility.params, + p1->u.utility.paramlen) == 0; + + case CFCTRL_SRV_VIDEO: + return p1->u.video.connid == p2->u.video.connid; + case CFCTRL_SRV_DBG: + return true; + case CFCTRL_SRV_DECM: + return false; + default: + return false; + } + return false; +} + +static bool cfctrl_req_eq(const struct cfctrl_request_info *r1, + const struct cfctrl_request_info *r2) +{ + if (r1->cmd != r2->cmd) + return false; + if (r1->cmd == CFCTRL_CMD_LINK_SETUP) + return param_eq(&r1->param, &r2->param); + else + return r1->channel_id == r2->channel_id; +} + +/* Insert request at the end */ +static void cfctrl_insert_req(struct cfctrl *ctrl, + struct cfctrl_request_info *req) +{ + spin_lock_bh(&ctrl->info_list_lock); + atomic_inc(&ctrl->req_seq_no); + req->sequence_no = atomic_read(&ctrl->req_seq_no); + list_add_tail(&req->list, &ctrl->list); + spin_unlock_bh(&ctrl->info_list_lock); +} + +/* Compare and remove request */ +static struct cfctrl_request_info *cfctrl_remove_req(struct cfctrl *ctrl, + struct cfctrl_request_info *req) +{ + struct cfctrl_request_info *p, *tmp, *first; + + first = list_first_entry(&ctrl->list, struct cfctrl_request_info, list); + + list_for_each_entry_safe(p, tmp, &ctrl->list, list) { + if (cfctrl_req_eq(req, p)) { + if (p != first) + pr_warn("Requests are not received in order\n"); + + atomic_set(&ctrl->rsp_seq_no, + p->sequence_no); + list_del(&p->list); + goto out; + } + } + p = NULL; +out: + return p; +} + +struct cfctrl_rsp *cfctrl_get_respfuncs(struct cflayer *layer) +{ + struct cfctrl *this = container_obj(layer); + return &this->res; +} + +static void init_info(struct caif_payload_info *info, struct cfctrl *cfctrl) +{ + info->hdr_len = 0; + info->channel_id = cfctrl->serv.layer.id; + info->dev_info = &cfctrl->serv.dev_info; +} + +void cfctrl_enum_req(struct cflayer *layer, u8 physlinkid) +{ + struct cfctrl *cfctrl = container_obj(layer); + struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN); + struct cflayer *dn = cfctrl->serv.layer.dn; + if (!pkt) { + pr_warn("Out of memory\n"); + return; + } + if (!dn) { + pr_debug("not able to send enum request\n"); + return; + } + caif_assert(offsetof(struct cfctrl, serv.layer) == 0); + init_info(cfpkt_info(pkt), cfctrl); + cfpkt_info(pkt)->dev_info->id = physlinkid; + cfctrl->serv.dev_info.id = physlinkid; + cfpkt_addbdy(pkt, CFCTRL_CMD_ENUM); + cfpkt_addbdy(pkt, physlinkid); + dn->transmit(dn, pkt); +} + +int cfctrl_linkup_request(struct cflayer *layer, + struct cfctrl_link_param *param, + struct cflayer *user_layer) +{ + struct cfctrl *cfctrl = container_obj(layer); + u32 tmp32; + u16 tmp16; + u8 tmp8; + struct cfctrl_request_info *req; + int ret; + char utility_name[16]; + struct cfpkt *pkt; + struct cflayer *dn = cfctrl->serv.layer.dn; + + if (!dn) { + pr_debug("not able to send linkup request\n"); + return -ENODEV; + } + + if (cfctrl_cancel_req(layer, user_layer) > 0) { + /* Slight Paranoia, check if already connecting */ + pr_err("Duplicate connect request for same client\n"); + WARN_ON(1); + return -EALREADY; + } + + pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN); + if (!pkt) { + pr_warn("Out of memory\n"); + return -ENOMEM; + } + cfpkt_addbdy(pkt, CFCTRL_CMD_LINK_SETUP); + cfpkt_addbdy(pkt, (param->chtype << 4) | param->linktype); + cfpkt_addbdy(pkt, (param->priority << 3) | param->phyid); + cfpkt_addbdy(pkt, param->endpoint & 0x03); + + switch (param->linktype) { + case CFCTRL_SRV_VEI: + break; + case CFCTRL_SRV_VIDEO: + cfpkt_addbdy(pkt, (u8) param->u.video.connid); + break; + case CFCTRL_SRV_DBG: + 
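+		/* Like VEI, the debug service carries no extra setup parameters. */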
break; + case CFCTRL_SRV_DATAGRAM: + tmp32 = cpu_to_le32(param->u.datagram.connid); + cfpkt_add_body(pkt, &tmp32, 4); + break; + case CFCTRL_SRV_RFM: + /* Construct a frame, convert DatagramConnectionID to network + * format long and copy it out... + */ + tmp32 = cpu_to_le32(param->u.rfm.connid); + cfpkt_add_body(pkt, &tmp32, 4); + /* Add volume name, including zero termination... */ + cfpkt_add_body(pkt, param->u.rfm.volume, + strlen(param->u.rfm.volume) + 1); + break; + case CFCTRL_SRV_UTIL: + tmp16 = cpu_to_le16(param->u.utility.fifosize_kb); + cfpkt_add_body(pkt, &tmp16, 2); + tmp16 = cpu_to_le16(param->u.utility.fifosize_bufs); + cfpkt_add_body(pkt, &tmp16, 2); + memset(utility_name, 0, sizeof(utility_name)); + strncpy(utility_name, param->u.utility.name, + UTILITY_NAME_LENGTH - 1); + cfpkt_add_body(pkt, utility_name, UTILITY_NAME_LENGTH); + tmp8 = param->u.utility.paramlen; + cfpkt_add_body(pkt, &tmp8, 1); + cfpkt_add_body(pkt, param->u.utility.params, + param->u.utility.paramlen); + break; + default: + pr_warn("Request setup of bad link type = %d\n", + param->linktype); + return -EINVAL; + } + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) { + pr_warn("Out of memory\n"); + return -ENOMEM; + } + req->client_layer = user_layer; + req->cmd = CFCTRL_CMD_LINK_SETUP; + req->param = *param; + cfctrl_insert_req(cfctrl, req); + init_info(cfpkt_info(pkt), cfctrl); + /* + * NOTE:Always send linkup and linkdown request on the same + * device as the payload. Otherwise old queued up payload + * might arrive with the newly allocated channel ID. + */ + cfpkt_info(pkt)->dev_info->id = param->phyid; + ret = + dn->transmit(dn, pkt); + if (ret < 0) { + int count; + + count = cfctrl_cancel_req(&cfctrl->serv.layer, + user_layer); + if (count != 1) + pr_err("Could not remove request (%d)", count); + return -ENODEV; + } + return 0; +} + +int cfctrl_linkdown_req(struct cflayer *layer, u8 channelid, + struct cflayer *client) +{ + int ret; + struct cfctrl *cfctrl = container_obj(layer); + struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN); + struct cflayer *dn = cfctrl->serv.layer.dn; + + if (!pkt) { + pr_warn("Out of memory\n"); + return -ENOMEM; + } + + if (!dn) { + pr_debug("not able to send link-down request\n"); + return -ENODEV; + } + + cfpkt_addbdy(pkt, CFCTRL_CMD_LINK_DESTROY); + cfpkt_addbdy(pkt, channelid); + init_info(cfpkt_info(pkt), cfctrl); + ret = + dn->transmit(dn, pkt); +#ifndef CAIF_NO_LOOP + cfctrl->loop_linkused[channelid] = 0; +#endif + return ret; +} + +int cfctrl_cancel_req(struct cflayer *layr, struct cflayer *adap_layer) +{ + struct cfctrl_request_info *p, *tmp; + struct cfctrl *ctrl = container_obj(layr); + int found = 0; + spin_lock_bh(&ctrl->info_list_lock); + + list_for_each_entry_safe(p, tmp, &ctrl->list, list) { + if (p->client_layer == adap_layer) { + list_del(&p->list); + kfree(p); + found++; + } + } + + spin_unlock_bh(&ctrl->info_list_lock); + return found; +} + +static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt) +{ + u8 cmdrsp; + u8 cmd; + int ret = -1; + u16 tmp16; + u8 len; + u8 param[255]; + u8 linkid; + struct cfctrl *cfctrl = container_obj(layer); + struct cfctrl_request_info rsp, *req; + + + cfpkt_extr_head(pkt, &cmdrsp, 1); + cmd = cmdrsp & CFCTRL_CMD_MASK; + if (cmd != CFCTRL_CMD_LINK_ERR + && CFCTRL_RSP_BIT != (CFCTRL_RSP_BIT & cmdrsp) + && CFCTRL_ERR_BIT != (CFCTRL_ERR_BIT & cmdrsp)) { + if (handle_loop(cfctrl, cmd, pkt) != 0) + cmdrsp |= CFCTRL_ERR_BIT; + } + + switch (cmd) { + case CFCTRL_CMD_LINK_SETUP: + { + enum cfctrl_srv serv; + enum 
cfctrl_srv servtype; + u8 endpoint; + u8 physlinkid; + u8 prio; + u8 tmp; + u32 tmp32; + u8 *cp; + int i; + struct cfctrl_link_param linkparam; + memset(&linkparam, 0, sizeof(linkparam)); + + cfpkt_extr_head(pkt, &tmp, 1); + + serv = tmp & CFCTRL_SRV_MASK; + linkparam.linktype = serv; + + servtype = tmp >> 4; + linkparam.chtype = servtype; + + cfpkt_extr_head(pkt, &tmp, 1); + physlinkid = tmp & 0x07; + prio = tmp >> 3; + + linkparam.priority = prio; + linkparam.phyid = physlinkid; + cfpkt_extr_head(pkt, &endpoint, 1); + linkparam.endpoint = endpoint & 0x03; + + switch (serv) { + case CFCTRL_SRV_VEI: + case CFCTRL_SRV_DBG: + if (CFCTRL_ERR_BIT & cmdrsp) + break; + /* Link ID */ + cfpkt_extr_head(pkt, &linkid, 1); + break; + case CFCTRL_SRV_VIDEO: + cfpkt_extr_head(pkt, &tmp, 1); + linkparam.u.video.connid = tmp; + if (CFCTRL_ERR_BIT & cmdrsp) + break; + /* Link ID */ + cfpkt_extr_head(pkt, &linkid, 1); + break; + + case CFCTRL_SRV_DATAGRAM: + cfpkt_extr_head(pkt, &tmp32, 4); + linkparam.u.datagram.connid = + le32_to_cpu(tmp32); + if (CFCTRL_ERR_BIT & cmdrsp) + break; + /* Link ID */ + cfpkt_extr_head(pkt, &linkid, 1); + break; + case CFCTRL_SRV_RFM: + /* Construct a frame, convert + * DatagramConnectionID + * to network format long and copy it out... + */ + cfpkt_extr_head(pkt, &tmp32, 4); + linkparam.u.rfm.connid = + le32_to_cpu(tmp32); + cp = (u8 *) linkparam.u.rfm.volume; + for (cfpkt_extr_head(pkt, &tmp, 1); + cfpkt_more(pkt) && tmp != '\0'; + cfpkt_extr_head(pkt, &tmp, 1)) + *cp++ = tmp; + *cp = '\0'; + + if (CFCTRL_ERR_BIT & cmdrsp) + break; + /* Link ID */ + cfpkt_extr_head(pkt, &linkid, 1); + + break; + case CFCTRL_SRV_UTIL: + /* Construct a frame, convert + * DatagramConnectionID + * to network format long and copy it out... + */ + /* Fifosize KB */ + cfpkt_extr_head(pkt, &tmp16, 2); + linkparam.u.utility.fifosize_kb = + le16_to_cpu(tmp16); + /* Fifosize bufs */ + cfpkt_extr_head(pkt, &tmp16, 2); + linkparam.u.utility.fifosize_bufs = + le16_to_cpu(tmp16); + /* name */ + cp = (u8 *) linkparam.u.utility.name; + caif_assert(sizeof(linkparam.u.utility.name) + >= UTILITY_NAME_LENGTH); + for (i = 0; + i < UTILITY_NAME_LENGTH + && cfpkt_more(pkt); i++) { + cfpkt_extr_head(pkt, &tmp, 1); + *cp++ = tmp; + } + /* Length */ + cfpkt_extr_head(pkt, &len, 1); + linkparam.u.utility.paramlen = len; + /* Param Data */ + cp = linkparam.u.utility.params; + while (cfpkt_more(pkt) && len--) { + cfpkt_extr_head(pkt, &tmp, 1); + *cp++ = tmp; + } + if (CFCTRL_ERR_BIT & cmdrsp) + break; + /* Link ID */ + cfpkt_extr_head(pkt, &linkid, 1); + /* Length */ + cfpkt_extr_head(pkt, &len, 1); + /* Param Data */ + cfpkt_extr_head(pkt, ¶m, len); + break; + default: + pr_warn("Request setup, invalid type (%d)\n", + serv); + goto error; + } + + rsp.cmd = cmd; + rsp.param = linkparam; + spin_lock_bh(&cfctrl->info_list_lock); + req = cfctrl_remove_req(cfctrl, &rsp); + + if (CFCTRL_ERR_BIT == (CFCTRL_ERR_BIT & cmdrsp) || + cfpkt_erroneous(pkt)) { + pr_err("Invalid O/E bit or parse error " + "on CAIF control channel\n"); + cfctrl->res.reject_rsp(cfctrl->serv.layer.up, + 0, + req ? req->client_layer + : NULL); + } else { + cfctrl->res.linksetup_rsp(cfctrl->serv. + layer.up, linkid, + serv, physlinkid, + req ? 
req-> + client_layer : NULL); + } + + if (req != NULL) + kfree(req); + + spin_unlock_bh(&cfctrl->info_list_lock); + } + break; + case CFCTRL_CMD_LINK_DESTROY: + cfpkt_extr_head(pkt, &linkid, 1); + cfctrl->res.linkdestroy_rsp(cfctrl->serv.layer.up, linkid); + break; + case CFCTRL_CMD_LINK_ERR: + pr_err("Frame Error Indication received\n"); + cfctrl->res.linkerror_ind(); + break; + case CFCTRL_CMD_ENUM: + cfctrl->res.enum_rsp(); + break; + case CFCTRL_CMD_SLEEP: + cfctrl->res.sleep_rsp(); + break; + case CFCTRL_CMD_WAKE: + cfctrl->res.wake_rsp(); + break; + case CFCTRL_CMD_LINK_RECONF: + cfctrl->res.restart_rsp(); + break; + case CFCTRL_CMD_RADIO_SET: + cfctrl->res.radioset_rsp(); + break; + default: + pr_err("Unrecognized Control Frame\n"); + goto error; + break; + } + ret = 0; +error: + cfpkt_destroy(pkt); + return ret; +} + +static void cfctrl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, + int phyid) +{ + struct cfctrl *this = container_obj(layr); + switch (ctrl) { + case _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND: + case CAIF_CTRLCMD_FLOW_OFF_IND: + spin_lock_bh(&this->info_list_lock); + if (!list_empty(&this->list)) + pr_debug("Received flow off in control layer\n"); + spin_unlock_bh(&this->info_list_lock); + break; + case _CAIF_CTRLCMD_PHYIF_DOWN_IND: { + struct cfctrl_request_info *p, *tmp; + + /* Find all connect request and report failure */ + spin_lock_bh(&this->info_list_lock); + list_for_each_entry_safe(p, tmp, &this->list, list) { + if (p->param.phyid == phyid) { + list_del(&p->list); + p->client_layer->ctrlcmd(p->client_layer, + CAIF_CTRLCMD_INIT_FAIL_RSP, + phyid); + kfree(p); + } + } + spin_unlock_bh(&this->info_list_lock); + break; + } + default: + break; + } +} + +#ifndef CAIF_NO_LOOP +static int handle_loop(struct cfctrl *ctrl, int cmd, struct cfpkt *pkt) +{ + static int last_linkid; + static int dec; + u8 linkid, linktype, tmp; + switch (cmd) { + case CFCTRL_CMD_LINK_SETUP: + spin_lock_bh(&ctrl->loop_linkid_lock); + if (!dec) { + for (linkid = last_linkid + 1; linkid < 254; linkid++) + if (!ctrl->loop_linkused[linkid]) + goto found; + } + dec = 1; + for (linkid = last_linkid - 1; linkid > 1; linkid--) + if (!ctrl->loop_linkused[linkid]) + goto found; + spin_unlock_bh(&ctrl->loop_linkid_lock); + return -1; +found: + if (linkid < 10) + dec = 0; + + if (!ctrl->loop_linkused[linkid]) + ctrl->loop_linkused[linkid] = 1; + + last_linkid = linkid; + + cfpkt_add_trail(pkt, &linkid, 1); + spin_unlock_bh(&ctrl->loop_linkid_lock); + cfpkt_peek_head(pkt, &linktype, 1); + if (linktype == CFCTRL_SRV_UTIL) { + tmp = 0x01; + cfpkt_add_trail(pkt, &tmp, 1); + cfpkt_add_trail(pkt, &tmp, 1); + } + break; + + case CFCTRL_CMD_LINK_DESTROY: + spin_lock_bh(&ctrl->loop_linkid_lock); + cfpkt_peek_head(pkt, &linkid, 1); + ctrl->loop_linkused[linkid] = 0; + spin_unlock_bh(&ctrl->loop_linkid_lock); + break; + default: + break; + } + return 0; +} +#endif diff --git a/net/caif/cfdbgl.c b/net/caif/cfdbgl.c new file mode 100644 index 00000000..11a2af4c --- /dev/null +++ b/net/caif/cfdbgl.c @@ -0,0 +1,56 @@ +/* + * Copyright (C) ST-Ericsson AB 2010 + * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * License terms: GNU General Public License (GPL) version 2 + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + +#include +#include +#include +#include +#include + +#define container_obj(layr) ((struct cfsrvl *) layr) + +static int cfdbgl_receive(struct cflayer *layr, struct cfpkt *pkt); +static int cfdbgl_transmit(struct cflayer *layr, struct cfpkt *pkt); + +struct cflayer 
*cfdbgl_create(u8 channel_id, struct dev_info *dev_info) +{ + struct cfsrvl *dbg = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC); + if (!dbg) { + pr_warn("Out of memory\n"); + return NULL; + } + caif_assert(offsetof(struct cfsrvl, layer) == 0); + memset(dbg, 0, sizeof(struct cfsrvl)); + cfsrvl_init(dbg, channel_id, dev_info, false); + dbg->layer.receive = cfdbgl_receive; + dbg->layer.transmit = cfdbgl_transmit; + snprintf(dbg->layer.name, CAIF_LAYER_NAME_SZ - 1, "dbg%d", channel_id); + return &dbg->layer; +} + +static int cfdbgl_receive(struct cflayer *layr, struct cfpkt *pkt) +{ + return layr->up->receive(layr->up, pkt); +} + +static int cfdbgl_transmit(struct cflayer *layr, struct cfpkt *pkt) +{ + struct cfsrvl *service = container_obj(layr); + struct caif_payload_info *info; + int ret; + + if (!cfsrvl_ready(service, &ret)) + return ret; + + /* Add info for MUX-layer to route the packet out */ + info = cfpkt_info(pkt); + info->channel_id = service->layer.id; + info->dev_info = &service->dev_info; + + return layr->dn->transmit(layr->dn, pkt); +} diff --git a/net/caif/cfdgml.c b/net/caif/cfdgml.c new file mode 100644 index 00000000..0382dec8 --- /dev/null +++ b/net/caif/cfdgml.c @@ -0,0 +1,112 @@ +/* + * Copyright (C) ST-Ericsson AB 2010 + * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * License terms: GNU General Public License (GPL) version 2 + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + +#include +#include +#include +#include +#include +#include + + +#define container_obj(layr) ((struct cfsrvl *) layr) + +#define DGM_CMD_BIT 0x80 +#define DGM_FLOW_OFF 0x81 +#define DGM_FLOW_ON 0x80 +#define DGM_MTU 1500 + +static int cfdgml_receive(struct cflayer *layr, struct cfpkt *pkt); +static int cfdgml_transmit(struct cflayer *layr, struct cfpkt *pkt); + +struct cflayer *cfdgml_create(u8 channel_id, struct dev_info *dev_info) +{ + struct cfsrvl *dgm = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC); + if (!dgm) { + pr_warn("Out of memory\n"); + return NULL; + } + caif_assert(offsetof(struct cfsrvl, layer) == 0); + memset(dgm, 0, sizeof(struct cfsrvl)); + cfsrvl_init(dgm, channel_id, dev_info, true); + dgm->layer.receive = cfdgml_receive; + dgm->layer.transmit = cfdgml_transmit; + snprintf(dgm->layer.name, CAIF_LAYER_NAME_SZ - 1, "dgm%d", channel_id); + dgm->layer.name[CAIF_LAYER_NAME_SZ - 1] = '\0'; + return &dgm->layer; +} + +static int cfdgml_receive(struct cflayer *layr, struct cfpkt *pkt) +{ + u8 cmd = -1; + u8 dgmhdr[3]; + int ret; + caif_assert(layr->up != NULL); + caif_assert(layr->receive != NULL); + caif_assert(layr->ctrlcmd != NULL); + + if (cfpkt_extr_head(pkt, &cmd, 1) < 0) { + pr_err("Packet is erroneous!\n"); + cfpkt_destroy(pkt); + return -EPROTO; + } + + if ((cmd & DGM_CMD_BIT) == 0) { + if (cfpkt_extr_head(pkt, &dgmhdr, 3) < 0) { + pr_err("Packet is erroneous!\n"); + cfpkt_destroy(pkt); + return -EPROTO; + } + ret = layr->up->receive(layr->up, pkt); + return ret; + } + + switch (cmd) { + case DGM_FLOW_OFF: /* FLOW OFF */ + layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_OFF_IND, 0); + cfpkt_destroy(pkt); + return 0; + case DGM_FLOW_ON: /* FLOW ON */ + layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_ON_IND, 0); + cfpkt_destroy(pkt); + return 0; + default: + cfpkt_destroy(pkt); + pr_info("Unknown datagram control %d (0x%x)\n", cmd, cmd); + return -EPROTO; + } +} + +static int cfdgml_transmit(struct cflayer *layr, struct cfpkt *pkt) +{ + u8 packet_type; + u32 zero = 0; + struct caif_payload_info *info; + struct cfsrvl *service = container_obj(layr); + int ret; + if 
(!cfsrvl_ready(service, &ret)) + return ret; + + /* STE Modem cannot handle more than 1500 bytes datagrams */ + if (cfpkt_getlen(pkt) > DGM_MTU) + return -EMSGSIZE; + + cfpkt_add_head(pkt, &zero, 3); + packet_type = 0x08; /* B9 set - UNCLASSIFIED */ + cfpkt_add_head(pkt, &packet_type, 1); + + /* Add info for MUX-layer to route the packet out. */ + info = cfpkt_info(pkt); + info->channel_id = service->layer.id; + /* To optimize alignment, we add up the size of CAIF header + * before payload. + */ + info->hdr_len = 4; + info->dev_info = &service->dev_info; + return layr->dn->transmit(layr->dn, pkt); +} diff --git a/net/caif/cffrml.c b/net/caif/cffrml.c new file mode 100644 index 00000000..04204b20 --- /dev/null +++ b/net/caif/cffrml.c @@ -0,0 +1,199 @@ +/* + * CAIF Framing Layer. + * + * Copyright (C) ST-Ericsson AB 2010 + * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * License terms: GNU General Public License (GPL) version 2 + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define container_obj(layr) container_of(layr, struct cffrml, layer) + +struct cffrml { + struct cflayer layer; + bool dofcs; /* !< FCS active */ + int __percpu *pcpu_refcnt; +}; + +static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt); +static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt); +static void cffrml_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, + int phyid); + +static u32 cffrml_rcv_error; +static u32 cffrml_rcv_checsum_error; +struct cflayer *cffrml_create(u16 phyid, bool use_fcs) +{ + struct cffrml *this = kmalloc(sizeof(struct cffrml), GFP_ATOMIC); + if (!this) { + pr_warn("Out of memory\n"); + return NULL; + } + this->pcpu_refcnt = alloc_percpu(int); + if (this->pcpu_refcnt == NULL) { + kfree(this); + return NULL; + } + + caif_assert(offsetof(struct cffrml, layer) == 0); + + memset(this, 0, sizeof(struct cflayer)); + this->layer.receive = cffrml_receive; + this->layer.transmit = cffrml_transmit; + this->layer.ctrlcmd = cffrml_ctrlcmd; + snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "frm%d", phyid); + this->dofcs = use_fcs; + this->layer.id = phyid; + return (struct cflayer *) this; +} + +void cffrml_free(struct cflayer *layer) +{ + struct cffrml *this = container_obj(layer); + free_percpu(this->pcpu_refcnt); + kfree(layer); +} + +void cffrml_set_uplayer(struct cflayer *this, struct cflayer *up) +{ + this->up = up; +} + +void cffrml_set_dnlayer(struct cflayer *this, struct cflayer *dn) +{ + this->dn = dn; +} + +static u16 cffrml_checksum(u16 chks, void *buf, u16 len) +{ + /* FIXME: FCS should be moved to glue in order to use OS-Specific + * solutions + */ + return crc_ccitt(chks, buf, len); +} + +static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt) +{ + u16 tmp; + u16 len; + u16 hdrchks; + u16 pktchks; + struct cffrml *this; + this = container_obj(layr); + + cfpkt_extr_head(pkt, &tmp, 2); + len = le16_to_cpu(tmp); + + /* Subtract for FCS on length if FCS is not used. */ + if (!this->dofcs) + len -= 2; + + if (cfpkt_setlen(pkt, len) < 0) { + ++cffrml_rcv_error; + pr_err("Framing length error (%d)\n", len); + cfpkt_destroy(pkt); + return -EPROTO; + } + /* + * Don't do extract if FCS is false, rather do setlen - then we don't + * get a cache-miss. 
+ */ + if (this->dofcs) { + cfpkt_extr_trail(pkt, &tmp, 2); + hdrchks = le16_to_cpu(tmp); + pktchks = cfpkt_iterate(pkt, cffrml_checksum, 0xffff); + if (pktchks != hdrchks) { + cfpkt_add_trail(pkt, &tmp, 2); + ++cffrml_rcv_error; + ++cffrml_rcv_checsum_error; + pr_info("Frame checksum error (0x%x != 0x%x)\n", + hdrchks, pktchks); + return -EILSEQ; + } + } + if (cfpkt_erroneous(pkt)) { + ++cffrml_rcv_error; + pr_err("Packet is erroneous!\n"); + cfpkt_destroy(pkt); + return -EPROTO; + } + + if (layr->up == NULL) { + pr_err("Layr up is missing!\n"); + cfpkt_destroy(pkt); + return -EINVAL; + } + + return layr->up->receive(layr->up, pkt); +} + +static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt) +{ + int tmp; + u16 chks; + u16 len; + struct cffrml *this = container_obj(layr); + if (this->dofcs) { + chks = cfpkt_iterate(pkt, cffrml_checksum, 0xffff); + tmp = cpu_to_le16(chks); + cfpkt_add_trail(pkt, &tmp, 2); + } else { + cfpkt_pad_trail(pkt, 2); + } + len = cfpkt_getlen(pkt); + tmp = cpu_to_le16(len); + cfpkt_add_head(pkt, &tmp, 2); + cfpkt_info(pkt)->hdr_len += 2; + if (cfpkt_erroneous(pkt)) { + pr_err("Packet is erroneous!\n"); + cfpkt_destroy(pkt); + return -EPROTO; + } + + if (layr->dn == NULL) { + cfpkt_destroy(pkt); + return -ENODEV; + + } + return layr->dn->transmit(layr->dn, pkt); +} + +static void cffrml_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, + int phyid) +{ + if (layr->up && layr->up->ctrlcmd) + layr->up->ctrlcmd(layr->up, ctrl, layr->id); +} + +void cffrml_put(struct cflayer *layr) +{ + struct cffrml *this = container_obj(layr); + if (layr != NULL && this->pcpu_refcnt != NULL) + irqsafe_cpu_dec(*this->pcpu_refcnt); +} + +void cffrml_hold(struct cflayer *layr) +{ + struct cffrml *this = container_obj(layr); + if (layr != NULL && this->pcpu_refcnt != NULL) + irqsafe_cpu_inc(*this->pcpu_refcnt); +} + +int cffrml_refcnt_read(struct cflayer *layr) +{ + int i, refcnt = 0; + struct cffrml *this = container_obj(layr); + for_each_possible_cpu(i) + refcnt += *per_cpu_ptr(this->pcpu_refcnt, i); + return refcnt; +} diff --git a/net/caif/cfmuxl.c b/net/caif/cfmuxl.c new file mode 100644 index 00000000..c23979e7 --- /dev/null +++ b/net/caif/cfmuxl.c @@ -0,0 +1,273 @@ +/* + * Copyright (C) ST-Ericsson AB 2010 + * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * License terms: GNU General Public License (GPL) version 2 + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define container_obj(layr) container_of(layr, struct cfmuxl, layer) + +#define CAIF_CTRL_CHANNEL 0 +#define UP_CACHE_SIZE 8 +#define DN_CACHE_SIZE 8 + +struct cfmuxl { + struct cflayer layer; + struct list_head srvl_list; + struct list_head frml_list; + struct cflayer *up_cache[UP_CACHE_SIZE]; + struct cflayer *dn_cache[DN_CACHE_SIZE]; + /* + * Set when inserting or removing downwards layers. + */ + spinlock_t transmit_lock; + + /* + * Set when inserting or removing upwards layers. 
+ */ + spinlock_t receive_lock; + +}; + +static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt); +static int cfmuxl_transmit(struct cflayer *layr, struct cfpkt *pkt); +static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, + int phyid); +static struct cflayer *get_up(struct cfmuxl *muxl, u16 id); + +struct cflayer *cfmuxl_create(void) +{ + struct cfmuxl *this = kmalloc(sizeof(struct cfmuxl), GFP_ATOMIC); + if (!this) + return NULL; + memset(this, 0, sizeof(*this)); + this->layer.receive = cfmuxl_receive; + this->layer.transmit = cfmuxl_transmit; + this->layer.ctrlcmd = cfmuxl_ctrlcmd; + INIT_LIST_HEAD(&this->srvl_list); + INIT_LIST_HEAD(&this->frml_list); + spin_lock_init(&this->transmit_lock); + spin_lock_init(&this->receive_lock); + snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "mux"); + return &this->layer; +} + +int cfmuxl_set_dnlayer(struct cflayer *layr, struct cflayer *dn, u8 phyid) +{ + struct cfmuxl *muxl = (struct cfmuxl *) layr; + + spin_lock_bh(&muxl->transmit_lock); + list_add_rcu(&dn->node, &muxl->frml_list); + spin_unlock_bh(&muxl->transmit_lock); + return 0; +} + +static struct cflayer *get_from_id(struct list_head *list, u16 id) +{ + struct cflayer *lyr; + list_for_each_entry_rcu(lyr, list, node) { + if (lyr->id == id) + return lyr; + } + + return NULL; +} + +int cfmuxl_set_uplayer(struct cflayer *layr, struct cflayer *up, u8 linkid) +{ + struct cfmuxl *muxl = container_obj(layr); + struct cflayer *old; + + spin_lock_bh(&muxl->receive_lock); + + /* Two entries with same id is wrong, so remove old layer from mux */ + old = get_from_id(&muxl->srvl_list, linkid); + if (old != NULL) + list_del_rcu(&old->node); + + list_add_rcu(&up->node, &muxl->srvl_list); + spin_unlock_bh(&muxl->receive_lock); + + return 0; +} + +struct cflayer *cfmuxl_remove_dnlayer(struct cflayer *layr, u8 phyid) +{ + struct cfmuxl *muxl = container_obj(layr); + struct cflayer *dn; + int idx = phyid % DN_CACHE_SIZE; + + spin_lock_bh(&muxl->transmit_lock); + rcu_assign_pointer(muxl->dn_cache[idx], NULL); + dn = get_from_id(&muxl->frml_list, phyid); + if (dn == NULL) + goto out; + + list_del_rcu(&dn->node); + caif_assert(dn != NULL); +out: + spin_unlock_bh(&muxl->transmit_lock); + return dn; +} + +static struct cflayer *get_up(struct cfmuxl *muxl, u16 id) +{ + struct cflayer *up; + int idx = id % UP_CACHE_SIZE; + up = rcu_dereference(muxl->up_cache[idx]); + if (up == NULL || up->id != id) { + spin_lock_bh(&muxl->receive_lock); + up = get_from_id(&muxl->srvl_list, id); + rcu_assign_pointer(muxl->up_cache[idx], up); + spin_unlock_bh(&muxl->receive_lock); + } + return up; +} + +static struct cflayer *get_dn(struct cfmuxl *muxl, struct dev_info *dev_info) +{ + struct cflayer *dn; + int idx = dev_info->id % DN_CACHE_SIZE; + dn = rcu_dereference(muxl->dn_cache[idx]); + if (dn == NULL || dn->id != dev_info->id) { + spin_lock_bh(&muxl->transmit_lock); + dn = get_from_id(&muxl->frml_list, dev_info->id); + rcu_assign_pointer(muxl->dn_cache[idx], dn); + spin_unlock_bh(&muxl->transmit_lock); + } + return dn; +} + +struct cflayer *cfmuxl_remove_uplayer(struct cflayer *layr, u8 id) +{ + struct cflayer *up; + struct cfmuxl *muxl = container_obj(layr); + int idx = id % UP_CACHE_SIZE; + + if (id == 0) { + pr_warn("Trying to remove control layer\n"); + return NULL; + } + + spin_lock_bh(&muxl->receive_lock); + up = get_from_id(&muxl->srvl_list, id); + if (up == NULL) + goto out; + + rcu_assign_pointer(muxl->up_cache[idx], NULL); + list_del_rcu(&up->node); +out: + 
spin_unlock_bh(&muxl->receive_lock); + return up; +} + +static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt) +{ + int ret; + struct cfmuxl *muxl = container_obj(layr); + u8 id; + struct cflayer *up; + if (cfpkt_extr_head(pkt, &id, 1) < 0) { + pr_err("erroneous Caif Packet\n"); + cfpkt_destroy(pkt); + return -EPROTO; + } + rcu_read_lock(); + up = get_up(muxl, id); + + if (up == NULL) { + pr_debug("Received data on unknown link ID = %d (0x%x)" + " up == NULL", id, id); + cfpkt_destroy(pkt); + /* + * Don't return ERROR, since modem misbehaves and sends out + * flow on before linksetup response. + */ + + rcu_read_unlock(); + return /* CFGLU_EPROT; */ 0; + } + + /* We can't hold rcu_lock during receive, so take a ref count instead */ + cfsrvl_get(up); + rcu_read_unlock(); + + ret = up->receive(up, pkt); + + cfsrvl_put(up); + return ret; +} + +static int cfmuxl_transmit(struct cflayer *layr, struct cfpkt *pkt) +{ + struct cfmuxl *muxl = container_obj(layr); + int err; + u8 linkid; + struct cflayer *dn; + struct caif_payload_info *info = cfpkt_info(pkt); + BUG_ON(!info); + + rcu_read_lock(); + + dn = get_dn(muxl, info->dev_info); + if (dn == NULL) { + pr_debug("Send data on unknown phy ID = %d (0x%x)\n", + info->dev_info->id, info->dev_info->id); + rcu_read_unlock(); + cfpkt_destroy(pkt); + return -ENOTCONN; + } + + info->hdr_len += 1; + linkid = info->channel_id; + cfpkt_add_head(pkt, &linkid, 1); + + /* We can't hold rcu_lock during receive, so take a ref count instead */ + cffrml_hold(dn); + + rcu_read_unlock(); + + err = dn->transmit(dn, pkt); + + cffrml_put(dn); + return err; +} + +static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, + int phyid) +{ + struct cfmuxl *muxl = container_obj(layr); + struct cflayer *layer; + int idx; + + rcu_read_lock(); + list_for_each_entry_rcu(layer, &muxl->srvl_list, node) { + + if (cfsrvl_phyid_match(layer, phyid) && layer->ctrlcmd) { + + if ((ctrl == _CAIF_CTRLCMD_PHYIF_DOWN_IND || + ctrl == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND) && + layer->id != 0) { + + idx = layer->id % UP_CACHE_SIZE; + spin_lock_bh(&muxl->receive_lock); + rcu_assign_pointer(muxl->up_cache[idx], NULL); + list_del_rcu(&layer->node); + spin_unlock_bh(&muxl->receive_lock); + } + /* NOTE: ctrlcmd is not allowed to block */ + layer->ctrlcmd(layer, ctrl, phyid); + } + } + rcu_read_unlock(); +} diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c new file mode 100644 index 00000000..75d4bfae --- /dev/null +++ b/net/caif/cfpkt_skbuff.c @@ -0,0 +1,400 @@ +/* + * Copyright (C) ST-Ericsson AB 2010 + * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * License terms: GNU General Public License (GPL) version 2 + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + +#include +#include +#include +#include + +#define PKT_PREFIX 48 +#define PKT_POSTFIX 2 +#define PKT_LEN_WHEN_EXTENDING 128 +#define PKT_ERROR(pkt, errmsg) \ +do { \ + cfpkt_priv(pkt)->erronous = true; \ + skb_reset_tail_pointer(&pkt->skb); \ + pr_warn(errmsg); \ +} while (0) + +struct cfpktq { + struct sk_buff_head head; + atomic_t count; + /* Lock protects count updates */ + spinlock_t lock; +}; + +/* + * net/caif/ is generic and does not + * understand SKB, so we do this typecast + */ +struct cfpkt { + struct sk_buff skb; +}; + +/* Private data inside SKB */ +struct cfpkt_priv_data { + struct dev_info dev_info; + bool erronous; +}; + +static inline struct cfpkt_priv_data *cfpkt_priv(struct cfpkt *pkt) +{ + return (struct cfpkt_priv_data *) pkt->skb.cb; +} + +static inline bool 
is_erronous(struct cfpkt *pkt) +{ + return cfpkt_priv(pkt)->erronous; +} + +static inline struct sk_buff *pkt_to_skb(struct cfpkt *pkt) +{ + return &pkt->skb; +} + +static inline struct cfpkt *skb_to_pkt(struct sk_buff *skb) +{ + return (struct cfpkt *) skb; +} + + +struct cfpkt *cfpkt_fromnative(enum caif_direction dir, void *nativepkt) +{ + struct cfpkt *pkt = skb_to_pkt(nativepkt); + cfpkt_priv(pkt)->erronous = false; + return pkt; +} +EXPORT_SYMBOL(cfpkt_fromnative); + +void *cfpkt_tonative(struct cfpkt *pkt) +{ + return (void *) pkt; +} +EXPORT_SYMBOL(cfpkt_tonative); + +static struct cfpkt *cfpkt_create_pfx(u16 len, u16 pfx) +{ + struct sk_buff *skb; + + if (likely(in_interrupt())) + skb = alloc_skb(len + pfx, GFP_ATOMIC); + else + skb = alloc_skb(len + pfx, GFP_KERNEL); + + if (unlikely(skb == NULL)) + return NULL; + + skb_reserve(skb, pfx); + return skb_to_pkt(skb); +} + +inline struct cfpkt *cfpkt_create(u16 len) +{ + return cfpkt_create_pfx(len + PKT_POSTFIX, PKT_PREFIX); +} + +void cfpkt_destroy(struct cfpkt *pkt) +{ + struct sk_buff *skb = pkt_to_skb(pkt); + kfree_skb(skb); +} + + +inline bool cfpkt_more(struct cfpkt *pkt) +{ + struct sk_buff *skb = pkt_to_skb(pkt); + return skb->len > 0; +} + + +int cfpkt_peek_head(struct cfpkt *pkt, void *data, u16 len) +{ + struct sk_buff *skb = pkt_to_skb(pkt); + if (skb_headlen(skb) >= len) { + memcpy(data, skb->data, len); + return 0; + } + return !cfpkt_extr_head(pkt, data, len) && + !cfpkt_add_head(pkt, data, len); +} + +int cfpkt_extr_head(struct cfpkt *pkt, void *data, u16 len) +{ + struct sk_buff *skb = pkt_to_skb(pkt); + u8 *from; + if (unlikely(is_erronous(pkt))) + return -EPROTO; + + if (unlikely(len > skb->len)) { + PKT_ERROR(pkt, "read beyond end of packet\n"); + return -EPROTO; + } + + if (unlikely(len > skb_headlen(skb))) { + if (unlikely(skb_linearize(skb) != 0)) { + PKT_ERROR(pkt, "linearize failed\n"); + return -EPROTO; + } + } + from = skb_pull(skb, len); + from -= len; + memcpy(data, from, len); + return 0; +} + +int cfpkt_extr_trail(struct cfpkt *pkt, void *dta, u16 len) +{ + struct sk_buff *skb = pkt_to_skb(pkt); + u8 *data = dta; + u8 *from; + if (unlikely(is_erronous(pkt))) + return -EPROTO; + + if (unlikely(skb_linearize(skb) != 0)) { + PKT_ERROR(pkt, "linearize failed\n"); + return -EPROTO; + } + if (unlikely(skb->data + len > skb_tail_pointer(skb))) { + PKT_ERROR(pkt, "read beyond end of packet\n"); + return -EPROTO; + } + from = skb_tail_pointer(skb) - len; + skb_trim(skb, skb->len - len); + memcpy(data, from, len); + return 0; +} + + +int cfpkt_pad_trail(struct cfpkt *pkt, u16 len) +{ + return cfpkt_add_body(pkt, NULL, len); +} + + +int cfpkt_add_body(struct cfpkt *pkt, const void *data, u16 len) +{ + struct sk_buff *skb = pkt_to_skb(pkt); + struct sk_buff *lastskb; + u8 *to; + u16 addlen = 0; + + + if (unlikely(is_erronous(pkt))) + return -EPROTO; + + lastskb = skb; + + /* Check whether we need to add space at the tail */ + if (unlikely(skb_tailroom(skb) < len)) { + if (likely(len < PKT_LEN_WHEN_EXTENDING)) + addlen = PKT_LEN_WHEN_EXTENDING; + else + addlen = len; + } + + /* Check whether we need to change the SKB before writing to the tail */ + if (unlikely((addlen > 0) || skb_cloned(skb) || skb_shared(skb))) { + + /* Make sure data is writable */ + if (unlikely(skb_cow_data(skb, addlen, &lastskb) < 0)) { + PKT_ERROR(pkt, "cow failed\n"); + return -EPROTO; + } + /* + * Is the SKB non-linear after skb_cow_data()? 
If so, we are + * going to add data to the last SKB, so we need to adjust + * lengths of the top SKB. + */ + if (lastskb != skb) { + pr_warn("Packet is non-linear\n"); + skb->len += len; + skb->data_len += len; + } + } + + /* All set to put the last SKB and optionally write data there. */ + to = skb_put(lastskb, len); + if (likely(data)) + memcpy(to, data, len); + return 0; +} + +inline int cfpkt_addbdy(struct cfpkt *pkt, u8 data) +{ + return cfpkt_add_body(pkt, &data, 1); +} + +int cfpkt_add_head(struct cfpkt *pkt, const void *data2, u16 len) +{ + struct sk_buff *skb = pkt_to_skb(pkt); + struct sk_buff *lastskb; + u8 *to; + const u8 *data = data2; + int ret; + if (unlikely(is_erronous(pkt))) + return -EPROTO; + if (unlikely(skb_headroom(skb) < len)) { + PKT_ERROR(pkt, "no headroom\n"); + return -EPROTO; + } + + /* Make sure data is writable */ + ret = skb_cow_data(skb, 0, &lastskb); + if (unlikely(ret < 0)) { + PKT_ERROR(pkt, "cow failed\n"); + return ret; + } + + to = skb_push(skb, len); + memcpy(to, data, len); + return 0; +} + + +inline int cfpkt_add_trail(struct cfpkt *pkt, const void *data, u16 len) +{ + return cfpkt_add_body(pkt, data, len); +} + + +inline u16 cfpkt_getlen(struct cfpkt *pkt) +{ + struct sk_buff *skb = pkt_to_skb(pkt); + return skb->len; +} + + +inline u16 cfpkt_iterate(struct cfpkt *pkt, + u16 (*iter_func)(u16, void *, u16), + u16 data) +{ + /* + * Don't care about the performance hit of linearizing, + * Checksum should not be used on high-speed interfaces anyway. + */ + if (unlikely(is_erronous(pkt))) + return -EPROTO; + if (unlikely(skb_linearize(&pkt->skb) != 0)) { + PKT_ERROR(pkt, "linearize failed\n"); + return -EPROTO; + } + return iter_func(data, pkt->skb.data, cfpkt_getlen(pkt)); +} + + +int cfpkt_setlen(struct cfpkt *pkt, u16 len) +{ + struct sk_buff *skb = pkt_to_skb(pkt); + + + if (unlikely(is_erronous(pkt))) + return -EPROTO; + + if (likely(len <= skb->len)) { + if (unlikely(skb->data_len)) + ___pskb_trim(skb, len); + else + skb_trim(skb, len); + + return cfpkt_getlen(pkt); + } + + /* Need to expand SKB */ + if (unlikely(!cfpkt_pad_trail(pkt, len - skb->len))) + PKT_ERROR(pkt, "skb_pad_trail failed\n"); + + return cfpkt_getlen(pkt); +} + +struct cfpkt *cfpkt_append(struct cfpkt *dstpkt, + struct cfpkt *addpkt, + u16 expectlen) +{ + struct sk_buff *dst = pkt_to_skb(dstpkt); + struct sk_buff *add = pkt_to_skb(addpkt); + u16 addlen = skb_headlen(add); + u16 neededtailspace; + struct sk_buff *tmp; + u16 dstlen; + u16 createlen; + if (unlikely(is_erronous(dstpkt) || is_erronous(addpkt))) { + return dstpkt; + } + if (expectlen > addlen) + neededtailspace = expectlen; + else + neededtailspace = addlen; + + if (dst->tail + neededtailspace > dst->end) { + /* Create a dumplicate of 'dst' with more tail space */ + struct cfpkt *tmppkt; + dstlen = skb_headlen(dst); + createlen = dstlen + neededtailspace; + tmppkt = cfpkt_create(createlen + PKT_PREFIX + PKT_POSTFIX); + if (tmppkt == NULL) + return NULL; + tmp = pkt_to_skb(tmppkt); + skb_set_tail_pointer(tmp, dstlen); + tmp->len = dstlen; + memcpy(tmp->data, dst->data, dstlen); + cfpkt_destroy(dstpkt); + dst = tmp; + } + memcpy(skb_tail_pointer(dst), add->data, skb_headlen(add)); + cfpkt_destroy(addpkt); + dst->tail += addlen; + dst->len += addlen; + return skb_to_pkt(dst); +} + +struct cfpkt *cfpkt_split(struct cfpkt *pkt, u16 pos) +{ + struct sk_buff *skb2; + struct sk_buff *skb = pkt_to_skb(pkt); + struct cfpkt *tmppkt; + u8 *split = skb->data + pos; + u16 len2nd = skb_tail_pointer(skb) - split; + + if 
(unlikely(is_erronous(pkt))) + return NULL; + + if (skb->data + pos > skb_tail_pointer(skb)) { + PKT_ERROR(pkt, "trying to split beyond end of packet\n"); + return NULL; + } + + /* Create a new packet for the second part of the data */ + tmppkt = cfpkt_create_pfx(len2nd + PKT_PREFIX + PKT_POSTFIX, + PKT_PREFIX); + if (tmppkt == NULL) + return NULL; + skb2 = pkt_to_skb(tmppkt); + + + if (skb2 == NULL) + return NULL; + + /* Reduce the length of the original packet */ + skb_set_tail_pointer(skb, pos); + skb->len = pos; + + memcpy(skb2->data, split, len2nd); + skb2->tail += len2nd; + skb2->len += len2nd; + return skb_to_pkt(skb2); +} + +bool cfpkt_erroneous(struct cfpkt *pkt) +{ + return cfpkt_priv(pkt)->erronous; +} + +struct caif_payload_info *cfpkt_info(struct cfpkt *pkt) +{ + return (struct caif_payload_info *)&pkt_to_skb(pkt)->cb; +} diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c new file mode 100644 index 00000000..0deabb44 --- /dev/null +++ b/net/caif/cfrfml.c @@ -0,0 +1,308 @@ +/* + * Copyright (C) ST-Ericsson AB 2010 + * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * License terms: GNU General Public License (GPL) version 2 + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include + +#define container_obj(layr) container_of(layr, struct cfrfml, serv.layer) +#define RFM_SEGMENTATION_BIT 0x01 +#define RFM_HEAD_SIZE 7 + +static int cfrfml_receive(struct cflayer *layr, struct cfpkt *pkt); +static int cfrfml_transmit(struct cflayer *layr, struct cfpkt *pkt); + +struct cfrfml { + struct cfsrvl serv; + struct cfpkt *incomplete_frm; + int fragment_size; + u8 seghead[6]; + u16 pdu_size; + /* Protects serialized processing of packets */ + spinlock_t sync; +}; + +static void cfrfml_release(struct cflayer *layer) +{ + struct cfsrvl *srvl = container_of(layer, struct cfsrvl, layer); + struct cfrfml *rfml = container_obj(&srvl->layer); + + if (rfml->incomplete_frm) + cfpkt_destroy(rfml->incomplete_frm); + + kfree(srvl); +} + +struct cflayer *cfrfml_create(u8 channel_id, struct dev_info *dev_info, + int mtu_size) +{ + int tmp; + struct cfrfml *this = + kzalloc(sizeof(struct cfrfml), GFP_ATOMIC); + + if (!this) { + pr_warn("Out of memory\n"); + return NULL; + } + + cfsrvl_init(&this->serv, channel_id, dev_info, false); + this->serv.release = cfrfml_release; + this->serv.layer.receive = cfrfml_receive; + this->serv.layer.transmit = cfrfml_transmit; + + /* Round down to closest multiple of 16 */ + tmp = (mtu_size - RFM_HEAD_SIZE - 6) / 16; + tmp *= 16; + + this->fragment_size = tmp; + spin_lock_init(&this->sync); + snprintf(this->serv.layer.name, CAIF_LAYER_NAME_SZ, + "rfm%d", channel_id); + + return &this->serv.layer; +} + +static struct cfpkt *rfm_append(struct cfrfml *rfml, char *seghead, + struct cfpkt *pkt, int *err) +{ + struct cfpkt *tmppkt; + *err = -EPROTO; + /* n-th but not last segment */ + + if (cfpkt_extr_head(pkt, seghead, 6) < 0) + return NULL; + + /* Verify correct header */ + if (memcmp(seghead, rfml->seghead, 6) != 0) + return NULL; + + tmppkt = cfpkt_append(rfml->incomplete_frm, pkt, + rfml->pdu_size + RFM_HEAD_SIZE); + + /* If cfpkt_append failes input pkts are not freed */ + *err = -ENOMEM; + if (tmppkt == NULL) + return NULL; + + *err = 0; + return tmppkt; +} + +static int cfrfml_receive(struct cflayer *layr, struct cfpkt *pkt) +{ + u8 tmp; + bool segmented; + int err; + u8 seghead[6]; + struct cfrfml *rfml; + struct cfpkt *tmppkt = NULL; + + caif_assert(layr->up != NULL); + 
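+	/*
+	 * RFM reassembly: the first payload byte carries the segmentation
+	 * bit. A segmented PDU starts with a 6-byte segmentation header
+	 * (seghead) whose last two bytes hold the little-endian PDU size;
+	 * subsequent segments are appended to incomplete_frm until a frame
+	 * with the segmentation bit cleared arrives, at which point the
+	 * accumulated length must equal pdu_size + RFM_HEAD_SIZE - 1.
+	 */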
caif_assert(layr->receive != NULL); + rfml = container_obj(layr); + spin_lock(&rfml->sync); + + err = -EPROTO; + if (cfpkt_extr_head(pkt, &tmp, 1) < 0) + goto out; + segmented = tmp & RFM_SEGMENTATION_BIT; + + if (segmented) { + if (rfml->incomplete_frm == NULL) { + /* Initial Segment */ + if (cfpkt_peek_head(pkt, rfml->seghead, 6) < 0) + goto out; + + rfml->pdu_size = get_unaligned_le16(rfml->seghead+4); + + if (cfpkt_erroneous(pkt)) + goto out; + rfml->incomplete_frm = pkt; + pkt = NULL; + } else { + + tmppkt = rfm_append(rfml, seghead, pkt, &err); + if (tmppkt == NULL) + goto out; + + if (cfpkt_erroneous(tmppkt)) + goto out; + + rfml->incomplete_frm = tmppkt; + + + if (cfpkt_erroneous(tmppkt)) + goto out; + } + err = 0; + goto out; + } + + if (rfml->incomplete_frm) { + + /* Last Segment */ + tmppkt = rfm_append(rfml, seghead, pkt, &err); + if (tmppkt == NULL) + goto out; + + if (cfpkt_erroneous(tmppkt)) + goto out; + + rfml->incomplete_frm = NULL; + pkt = tmppkt; + tmppkt = NULL; + + /* Verify that length is correct */ + err = EPROTO; + if (rfml->pdu_size != cfpkt_getlen(pkt) - RFM_HEAD_SIZE + 1) + goto out; + } + + err = rfml->serv.layer.up->receive(rfml->serv.layer.up, pkt); + +out: + + if (err != 0) { + if (tmppkt) + cfpkt_destroy(tmppkt); + if (pkt) + cfpkt_destroy(pkt); + if (rfml->incomplete_frm) + cfpkt_destroy(rfml->incomplete_frm); + rfml->incomplete_frm = NULL; + + pr_info("Connection error %d triggered on RFM link\n", err); + + /* Trigger connection error upon failure.*/ + layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, + rfml->serv.dev_info.id); + } + spin_unlock(&rfml->sync); + return err; +} + + +static int cfrfml_transmit_segment(struct cfrfml *rfml, struct cfpkt *pkt) +{ + caif_assert(cfpkt_getlen(pkt) < rfml->fragment_size); + + /* Add info for MUX-layer to route the packet out. */ + cfpkt_info(pkt)->channel_id = rfml->serv.layer.id; + + /* + * To optimize alignment, we add up the size of CAIF header before + * payload. + */ + cfpkt_info(pkt)->hdr_len = RFM_HEAD_SIZE; + cfpkt_info(pkt)->dev_info = &rfml->serv.dev_info; + + return rfml->serv.layer.dn->transmit(rfml->serv.layer.dn, pkt); +} + +static int cfrfml_transmit(struct cflayer *layr, struct cfpkt *pkt) +{ + int err; + u8 seg; + u8 head[6]; + struct cfpkt *rearpkt = NULL; + struct cfpkt *frontpkt = pkt; + struct cfrfml *rfml = container_obj(layr); + + caif_assert(layr->dn != NULL); + caif_assert(layr->dn->transmit != NULL); + + if (!cfsrvl_ready(&rfml->serv, &err)) + return err; + + err = -EPROTO; + if (cfpkt_getlen(pkt) <= RFM_HEAD_SIZE-1) + goto out; + + err = 0; + if (cfpkt_getlen(pkt) > rfml->fragment_size + RFM_HEAD_SIZE) + err = cfpkt_peek_head(pkt, head, 6); + + if (err < 0) + goto out; + + while (cfpkt_getlen(frontpkt) > rfml->fragment_size + RFM_HEAD_SIZE) { + + seg = 1; + err = -EPROTO; + + if (cfpkt_add_head(frontpkt, &seg, 1) < 0) + goto out; + /* + * On OOM error cfpkt_split returns NULL. + * + * NOTE: Segmented pdu is not correctly aligned. + * This has negative performance impact. 
+ */ + + rearpkt = cfpkt_split(frontpkt, rfml->fragment_size); + if (rearpkt == NULL) + goto out; + + err = cfrfml_transmit_segment(rfml, frontpkt); + + if (err != 0) + goto out; + frontpkt = rearpkt; + rearpkt = NULL; + + err = -ENOMEM; + if (frontpkt == NULL) + goto out; + err = -EPROTO; + if (cfpkt_add_head(frontpkt, head, 6) < 0) + goto out; + + } + + seg = 0; + err = -EPROTO; + + if (cfpkt_add_head(frontpkt, &seg, 1) < 0) + goto out; + + err = cfrfml_transmit_segment(rfml, frontpkt); + + frontpkt = NULL; +out: + + if (err != 0) { + pr_info("Connection error %d triggered on RFM link\n", err); + /* Trigger connection error upon failure.*/ + + layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, + rfml->serv.dev_info.id); + + if (rearpkt) + cfpkt_destroy(rearpkt); + + if (frontpkt && frontpkt != pkt) { + + cfpkt_destroy(frontpkt); + /* + * Socket layer will free the original packet, + * but this packet may already be sent and + * freed. So we have to return 0 in this case + * to avoid socket layer to re-free this packet. + * The return of shutdown indication will + * cause connection to be invalidated anyhow. + */ + err = 0; + } + } + + return err; +} diff --git a/net/caif/cfserl.c b/net/caif/cfserl.c new file mode 100644 index 00000000..2715c84c --- /dev/null +++ b/net/caif/cfserl.c @@ -0,0 +1,192 @@ +/* + * Copyright (C) ST-Ericsson AB 2010 + * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * License terms: GNU General Public License (GPL) version 2 + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + +#include +#include +#include +#include +#include +#include + +#define container_obj(layr) ((struct cfserl *) layr) + +#define CFSERL_STX 0x02 +#define SERIAL_MINIUM_PACKET_SIZE 4 +#define SERIAL_MAX_FRAMESIZE 4096 +struct cfserl { + struct cflayer layer; + struct cfpkt *incomplete_frm; + /* Protects parallel processing of incoming packets */ + spinlock_t sync; + bool usestx; +}; + +static int cfserl_receive(struct cflayer *layr, struct cfpkt *pkt); +static int cfserl_transmit(struct cflayer *layr, struct cfpkt *pkt); +static void cfserl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, + int phyid); + +struct cflayer *cfserl_create(int type, int instance, bool use_stx) +{ + struct cfserl *this = kmalloc(sizeof(struct cfserl), GFP_ATOMIC); + if (!this) { + pr_warn("Out of memory\n"); + return NULL; + } + caif_assert(offsetof(struct cfserl, layer) == 0); + memset(this, 0, sizeof(struct cfserl)); + this->layer.receive = cfserl_receive; + this->layer.transmit = cfserl_transmit; + this->layer.ctrlcmd = cfserl_ctrlcmd; + this->layer.type = type; + this->usestx = use_stx; + spin_lock_init(&this->sync); + snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "ser1"); + return &this->layer; +} + +static int cfserl_receive(struct cflayer *l, struct cfpkt *newpkt) +{ + struct cfserl *layr = container_obj(l); + u16 pkt_len; + struct cfpkt *pkt = NULL; + struct cfpkt *tail_pkt = NULL; + u8 tmp8; + u16 tmp; + u8 stx = CFSERL_STX; + int ret; + u16 expectlen = 0; + + caif_assert(newpkt != NULL); + spin_lock(&layr->sync); + + if (layr->incomplete_frm != NULL) { + layr->incomplete_frm = + cfpkt_append(layr->incomplete_frm, newpkt, expectlen); + pkt = layr->incomplete_frm; + if (pkt == NULL) { + spin_unlock(&layr->sync); + return -ENOMEM; + } + } else { + pkt = newpkt; + } + layr->incomplete_frm = NULL; + + do { + /* Search for STX at start of pkt if STX is used */ + if (layr->usestx) { + cfpkt_extr_head(pkt, &tmp8, 1); + if (tmp8 != CFSERL_STX) { + while (cfpkt_more(pkt) + 
&& tmp8 != CFSERL_STX) { + cfpkt_extr_head(pkt, &tmp8, 1); + } + if (!cfpkt_more(pkt)) { + cfpkt_destroy(pkt); + layr->incomplete_frm = NULL; + spin_unlock(&layr->sync); + return -EPROTO; + } + } + } + + pkt_len = cfpkt_getlen(pkt); + + /* + * pkt_len is the accumulated length of the packet data + * we have received so far. + * Exit if frame doesn't hold length. + */ + + if (pkt_len < 2) { + if (layr->usestx) + cfpkt_add_head(pkt, &stx, 1); + layr->incomplete_frm = pkt; + spin_unlock(&layr->sync); + return 0; + } + + /* + * Find length of frame. + * expectlen is the length we need for a full frame. + */ + cfpkt_peek_head(pkt, &tmp, 2); + expectlen = le16_to_cpu(tmp) + 2; + /* + * Frame error handling + */ + if (expectlen < SERIAL_MINIUM_PACKET_SIZE + || expectlen > SERIAL_MAX_FRAMESIZE) { + if (!layr->usestx) { + if (pkt != NULL) + cfpkt_destroy(pkt); + layr->incomplete_frm = NULL; + expectlen = 0; + spin_unlock(&layr->sync); + return -EPROTO; + } + continue; + } + + if (pkt_len < expectlen) { + /* Too little received data */ + if (layr->usestx) + cfpkt_add_head(pkt, &stx, 1); + layr->incomplete_frm = pkt; + spin_unlock(&layr->sync); + return 0; + } + + /* + * Enough data for at least one frame. + * Split the frame, if too long + */ + if (pkt_len > expectlen) + tail_pkt = cfpkt_split(pkt, expectlen); + else + tail_pkt = NULL; + + /* Send the first part of packet upwards.*/ + spin_unlock(&layr->sync); + ret = layr->layer.up->receive(layr->layer.up, pkt); + spin_lock(&layr->sync); + if (ret == -EILSEQ) { + if (layr->usestx) { + if (tail_pkt != NULL) + pkt = cfpkt_append(pkt, tail_pkt, 0); + /* Start search for next STX if frame failed */ + continue; + } else { + cfpkt_destroy(pkt); + pkt = NULL; + } + } + + pkt = tail_pkt; + + } while (pkt != NULL); + + spin_unlock(&layr->sync); + return 0; +} + +static int cfserl_transmit(struct cflayer *layer, struct cfpkt *newpkt) +{ + struct cfserl *layr = container_obj(layer); + u8 tmp8 = CFSERL_STX; + if (layr->usestx) + cfpkt_add_head(newpkt, &tmp8, 1); + return layer->dn->transmit(layer->dn, newpkt); +} + +static void cfserl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, + int phyid) +{ + layr->up->ctrlcmd(layr->up, ctrl, phyid); +} diff --git a/net/caif/cfsrvl.c b/net/caif/cfsrvl.c new file mode 100644 index 00000000..535a1e72 --- /dev/null +++ b/net/caif/cfsrvl.c @@ -0,0 +1,225 @@ +/* + * Copyright (C) ST-Ericsson AB 2010 + * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * License terms: GNU General Public License (GPL) version 2 + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define SRVL_CTRL_PKT_SIZE 1 +#define SRVL_FLOW_OFF 0x81 +#define SRVL_FLOW_ON 0x80 +#define SRVL_SET_PIN 0x82 +#define SRVL_CTRL_PKT_SIZE 1 + +#define container_obj(layr) container_of(layr, struct cfsrvl, layer) + +static void cfservl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, + int phyid) +{ + struct cfsrvl *service = container_obj(layr); + + if (layr->up == NULL || layr->up->ctrlcmd == NULL) + return; + + switch (ctrl) { + case CAIF_CTRLCMD_INIT_RSP: + service->open = true; + layr->up->ctrlcmd(layr->up, ctrl, phyid); + break; + case CAIF_CTRLCMD_DEINIT_RSP: + case CAIF_CTRLCMD_INIT_FAIL_RSP: + service->open = false; + layr->up->ctrlcmd(layr->up, ctrl, phyid); + break; + case _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND: + if (phyid != service->dev_info.id) + break; + if (service->modem_flow_on) + layr->up->ctrlcmd(layr->up, + CAIF_CTRLCMD_FLOW_OFF_IND, 
phyid); + service->phy_flow_on = false; + break; + case _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND: + if (phyid != service->dev_info.id) + return; + if (service->modem_flow_on) { + layr->up->ctrlcmd(layr->up, + CAIF_CTRLCMD_FLOW_ON_IND, + phyid); + } + service->phy_flow_on = true; + break; + case CAIF_CTRLCMD_FLOW_OFF_IND: + if (service->phy_flow_on) { + layr->up->ctrlcmd(layr->up, + CAIF_CTRLCMD_FLOW_OFF_IND, phyid); + } + service->modem_flow_on = false; + break; + case CAIF_CTRLCMD_FLOW_ON_IND: + if (service->phy_flow_on) { + layr->up->ctrlcmd(layr->up, + CAIF_CTRLCMD_FLOW_ON_IND, phyid); + } + service->modem_flow_on = true; + break; + case _CAIF_CTRLCMD_PHYIF_DOWN_IND: + /* In case interface is down, let's fake a remove shutdown */ + layr->up->ctrlcmd(layr->up, + CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, phyid); + break; + case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND: + layr->up->ctrlcmd(layr->up, ctrl, phyid); + break; + default: + pr_warn("Unexpected ctrl in cfsrvl (%d)\n", ctrl); + /* We have both modem and phy flow on, send flow on */ + layr->up->ctrlcmd(layr->up, ctrl, phyid); + service->phy_flow_on = true; + break; + } +} + +static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl) +{ + struct cfsrvl *service = container_obj(layr); + + caif_assert(layr != NULL); + caif_assert(layr->dn != NULL); + caif_assert(layr->dn->transmit != NULL); + + if (!service->supports_flowctrl) + return 0; + + switch (ctrl) { + case CAIF_MODEMCMD_FLOW_ON_REQ: + { + struct cfpkt *pkt; + struct caif_payload_info *info; + u8 flow_on = SRVL_FLOW_ON; + pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE); + if (!pkt) { + pr_warn("Out of memory\n"); + return -ENOMEM; + } + + if (cfpkt_add_head(pkt, &flow_on, 1) < 0) { + pr_err("Packet is erroneous!\n"); + cfpkt_destroy(pkt); + return -EPROTO; + } + info = cfpkt_info(pkt); + info->channel_id = service->layer.id; + info->hdr_len = 1; + info->dev_info = &service->dev_info; + return layr->dn->transmit(layr->dn, pkt); + } + case CAIF_MODEMCMD_FLOW_OFF_REQ: + { + struct cfpkt *pkt; + struct caif_payload_info *info; + u8 flow_off = SRVL_FLOW_OFF; + pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE); + if (!pkt) { + pr_warn("Out of memory\n"); + return -ENOMEM; + } + + if (cfpkt_add_head(pkt, &flow_off, 1) < 0) { + pr_err("Packet is erroneous!\n"); + cfpkt_destroy(pkt); + return -EPROTO; + } + info = cfpkt_info(pkt); + info->channel_id = service->layer.id; + info->hdr_len = 1; + info->dev_info = &service->dev_info; + return layr->dn->transmit(layr->dn, pkt); + } + default: + break; + } + return -EINVAL; +} + +static void cfsrvl_release(struct cflayer *layer) +{ + struct cfsrvl *service = container_of(layer, struct cfsrvl, layer); + kfree(service); +} + +void cfsrvl_init(struct cfsrvl *service, + u8 channel_id, + struct dev_info *dev_info, + bool supports_flowctrl + ) +{ + caif_assert(offsetof(struct cfsrvl, layer) == 0); + service->open = false; + service->modem_flow_on = true; + service->phy_flow_on = true; + service->layer.id = channel_id; + service->layer.ctrlcmd = cfservl_ctrlcmd; + service->layer.modemcmd = cfservl_modemcmd; + service->dev_info = *dev_info; + service->supports_flowctrl = supports_flowctrl; + service->release = cfsrvl_release; +} + +bool cfsrvl_ready(struct cfsrvl *service, int *err) +{ + if (service->open && service->modem_flow_on && service->phy_flow_on) + return true; + if (!service->open) { + *err = -ENOTCONN; + return false; + } + caif_assert(!(service->modem_flow_on && service->phy_flow_on)); + *err = -EAGAIN; + return false; +} + +u8 cfsrvl_getphyid(struct cflayer *layer) 
+{ + struct cfsrvl *servl = container_obj(layer); + return servl->dev_info.id; +} + +bool cfsrvl_phyid_match(struct cflayer *layer, int phyid) +{ + struct cfsrvl *servl = container_obj(layer); + return servl->dev_info.id == phyid; +} + +void caif_free_client(struct cflayer *adap_layer) +{ + struct cfsrvl *servl; + if (adap_layer == NULL || adap_layer->dn == NULL) + return; + servl = container_obj(adap_layer->dn); + servl->release(&servl->layer); +} +EXPORT_SYMBOL(caif_free_client); + +void caif_client_register_refcnt(struct cflayer *adapt_layer, + void (*hold)(struct cflayer *lyr), + void (*put)(struct cflayer *lyr)) +{ + struct cfsrvl *service; + service = container_of(adapt_layer->dn, struct cfsrvl, layer); + + WARN_ON(adapt_layer == NULL || adapt_layer->dn == NULL); + service->hold = hold; + service->put = put; +} +EXPORT_SYMBOL(caif_client_register_refcnt); diff --git a/net/caif/cfutill.c b/net/caif/cfutill.c new file mode 100644 index 00000000..98e027db --- /dev/null +++ b/net/caif/cfutill.c @@ -0,0 +1,104 @@ +/* + * Copyright (C) ST-Ericsson AB 2010 + * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * License terms: GNU General Public License (GPL) version 2 + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include + +#define container_obj(layr) ((struct cfsrvl *) layr) +#define UTIL_PAYLOAD 0x00 +#define UTIL_CMD_BIT 0x80 +#define UTIL_REMOTE_SHUTDOWN 0x82 +#define UTIL_FLOW_OFF 0x81 +#define UTIL_FLOW_ON 0x80 + +static int cfutill_receive(struct cflayer *layr, struct cfpkt *pkt); +static int cfutill_transmit(struct cflayer *layr, struct cfpkt *pkt); + +struct cflayer *cfutill_create(u8 channel_id, struct dev_info *dev_info) +{ + struct cfsrvl *util = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC); + if (!util) { + pr_warn("Out of memory\n"); + return NULL; + } + caif_assert(offsetof(struct cfsrvl, layer) == 0); + memset(util, 0, sizeof(struct cfsrvl)); + cfsrvl_init(util, channel_id, dev_info, true); + util->layer.receive = cfutill_receive; + util->layer.transmit = cfutill_transmit; + snprintf(util->layer.name, CAIF_LAYER_NAME_SZ - 1, "util1"); + return &util->layer; +} + +static int cfutill_receive(struct cflayer *layr, struct cfpkt *pkt) +{ + u8 cmd = -1; + struct cfsrvl *service = container_obj(layr); + caif_assert(layr != NULL); + caif_assert(layr->up != NULL); + caif_assert(layr->up->receive != NULL); + caif_assert(layr->up->ctrlcmd != NULL); + if (cfpkt_extr_head(pkt, &cmd, 1) < 0) { + pr_err("Packet is erroneous!\n"); + cfpkt_destroy(pkt); + return -EPROTO; + } + + switch (cmd) { + case UTIL_PAYLOAD: + return layr->up->receive(layr->up, pkt); + case UTIL_FLOW_OFF: + layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_OFF_IND, 0); + cfpkt_destroy(pkt); + return 0; + case UTIL_FLOW_ON: + layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_ON_IND, 0); + cfpkt_destroy(pkt); + return 0; + case UTIL_REMOTE_SHUTDOWN: /* Remote Shutdown Request */ + pr_err("REMOTE SHUTDOWN REQUEST RECEIVED\n"); + layr->ctrlcmd(layr, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, 0); + service->open = false; + cfpkt_destroy(pkt); + return 0; + default: + cfpkt_destroy(pkt); + pr_warn("Unknown service control %d (0x%x)\n", cmd, cmd); + return -EPROTO; + } +} + +static int cfutill_transmit(struct cflayer *layr, struct cfpkt *pkt) +{ + u8 zero = 0; + struct caif_payload_info *info; + int ret; + struct cfsrvl *service = container_obj(layr); + caif_assert(layr != NULL); + caif_assert(layr->dn != NULL); + caif_assert(layr->dn->transmit != NULL); + if 
(!cfsrvl_ready(service, &ret)) + return ret; + + cfpkt_add_head(pkt, &zero, 1); + /* Add info for MUX-layer to route the packet out. */ + info = cfpkt_info(pkt); + info->channel_id = service->layer.id; + /* + * To optimize alignment, we add up the size of CAIF header before + * payload. + */ + info->hdr_len = 1; + info->dev_info = &service->dev_info; + return layr->dn->transmit(layr->dn, pkt); +} diff --git a/net/caif/cfveil.c b/net/caif/cfveil.c new file mode 100644 index 00000000..3ec83fbc --- /dev/null +++ b/net/caif/cfveil.c @@ -0,0 +1,104 @@ +/* + * Copyright (C) ST-Ericsson AB 2010 + * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * License terms: GNU General Public License (GPL) version 2 + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + +#include +#include +#include +#include +#include + +#define VEI_PAYLOAD 0x00 +#define VEI_CMD_BIT 0x80 +#define VEI_FLOW_OFF 0x81 +#define VEI_FLOW_ON 0x80 +#define VEI_SET_PIN 0x82 + +#define container_obj(layr) container_of(layr, struct cfsrvl, layer) + +static int cfvei_receive(struct cflayer *layr, struct cfpkt *pkt); +static int cfvei_transmit(struct cflayer *layr, struct cfpkt *pkt); + +struct cflayer *cfvei_create(u8 channel_id, struct dev_info *dev_info) +{ + struct cfsrvl *vei = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC); + if (!vei) { + pr_warn("Out of memory\n"); + return NULL; + } + caif_assert(offsetof(struct cfsrvl, layer) == 0); + memset(vei, 0, sizeof(struct cfsrvl)); + cfsrvl_init(vei, channel_id, dev_info, true); + vei->layer.receive = cfvei_receive; + vei->layer.transmit = cfvei_transmit; + snprintf(vei->layer.name, CAIF_LAYER_NAME_SZ - 1, "vei%d", channel_id); + return &vei->layer; +} + +static int cfvei_receive(struct cflayer *layr, struct cfpkt *pkt) +{ + u8 cmd; + int ret; + caif_assert(layr->up != NULL); + caif_assert(layr->receive != NULL); + caif_assert(layr->ctrlcmd != NULL); + + + if (cfpkt_extr_head(pkt, &cmd, 1) < 0) { + pr_err("Packet is erroneous!\n"); + cfpkt_destroy(pkt); + return -EPROTO; + } + switch (cmd) { + case VEI_PAYLOAD: + ret = layr->up->receive(layr->up, pkt); + return ret; + case VEI_FLOW_OFF: + layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_OFF_IND, 0); + cfpkt_destroy(pkt); + return 0; + case VEI_FLOW_ON: + layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_ON_IND, 0); + cfpkt_destroy(pkt); + return 0; + case VEI_SET_PIN: /* SET RS232 PIN */ + cfpkt_destroy(pkt); + return 0; + default: /* SET RS232 PIN */ + pr_warn("Unknown VEI control packet %d (0x%x)!\n", cmd, cmd); + cfpkt_destroy(pkt); + return -EPROTO; + } +} + +static int cfvei_transmit(struct cflayer *layr, struct cfpkt *pkt) +{ + u8 tmp = 0; + struct caif_payload_info *info; + int ret; + struct cfsrvl *service = container_obj(layr); + if (!cfsrvl_ready(service, &ret)) + goto err; + caif_assert(layr->dn != NULL); + caif_assert(layr->dn->transmit != NULL); + + if (cfpkt_add_head(pkt, &tmp, 1) < 0) { + pr_err("Packet is erroneous!\n"); + ret = -EPROTO; + goto err; + } + + /* Add info-> for MUX-layer to route the packet out. 
*/ + info = cfpkt_info(pkt); + info->channel_id = service->layer.id; + info->hdr_len = 1; + info->dev_info = &service->dev_info; + return layr->dn->transmit(layr->dn, pkt); +err: + cfpkt_destroy(pkt); + return ret; +} diff --git a/net/caif/cfvidl.c b/net/caif/cfvidl.c new file mode 100644 index 00000000..b2f5989a --- /dev/null +++ b/net/caif/cfvidl.c @@ -0,0 +1,64 @@ +/* + * Copyright (C) ST-Ericsson AB 2010 + * Author: Sjur Brendeland/sjur.brandeland@stericsson.com + * License terms: GNU General Public License (GPL) version 2 + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include + +#define container_obj(layr) ((struct cfsrvl *) layr) + +static int cfvidl_receive(struct cflayer *layr, struct cfpkt *pkt); +static int cfvidl_transmit(struct cflayer *layr, struct cfpkt *pkt); + +struct cflayer *cfvidl_create(u8 channel_id, struct dev_info *dev_info) +{ + struct cfsrvl *vid = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC); + if (!vid) { + pr_warn("Out of memory\n"); + return NULL; + } + caif_assert(offsetof(struct cfsrvl, layer) == 0); + + memset(vid, 0, sizeof(struct cfsrvl)); + cfsrvl_init(vid, channel_id, dev_info, false); + vid->layer.receive = cfvidl_receive; + vid->layer.transmit = cfvidl_transmit; + snprintf(vid->layer.name, CAIF_LAYER_NAME_SZ - 1, "vid1"); + return &vid->layer; +} + +static int cfvidl_receive(struct cflayer *layr, struct cfpkt *pkt) +{ + u32 videoheader; + if (cfpkt_extr_head(pkt, &videoheader, 4) < 0) { + pr_err("Packet is erroneous!\n"); + cfpkt_destroy(pkt); + return -EPROTO; + } + return layr->up->receive(layr->up, pkt); +} + +static int cfvidl_transmit(struct cflayer *layr, struct cfpkt *pkt) +{ + struct cfsrvl *service = container_obj(layr); + struct caif_payload_info *info; + u32 videoheader = 0; + int ret; + if (!cfsrvl_ready(service, &ret)) + return ret; + cfpkt_add_head(pkt, &videoheader, 4); + /* Add info for MUX-layer to route the packet out */ + info = cfpkt_info(pkt); + info->channel_id = service->layer.id; + info->dev_info = &service->dev_info; + return layr->dn->transmit(layr->dn, pkt); +} diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c new file mode 100644 index 00000000..adbb4244 --- /dev/null +++ b/net/caif/chnl_net.c @@ -0,0 +1,545 @@ +/* + * Copyright (C) ST-Ericsson AB 2010 + * Authors: Sjur Brendeland/sjur.brandeland@stericsson.com + * Daniel Martensson / Daniel.Martensson@stericsson.com + * License terms: GNU General Public License (GPL) version 2 + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* GPRS PDP connection has MTU to 1500 */ +#define GPRS_PDP_MTU 1500 +/* 5 sec. connect timeout */ +#define CONNECT_TIMEOUT (5 * HZ) +#define CAIF_NET_DEFAULT_QUEUE_LEN 500 + +/*This list is protected by the rtnl lock. */ +static LIST_HEAD(chnl_net_list); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("caif"); + +enum caif_states { + CAIF_CONNECTED = 1, + CAIF_CONNECTING, + CAIF_DISCONNECTED, + CAIF_SHUTDOWN +}; + +struct chnl_net { + struct cflayer chnl; + struct net_device_stats stats; + struct caif_connect_request conn_req; + struct list_head list_field; + struct net_device *netdev; + char name[256]; + wait_queue_head_t netmgmt_wq; + /* Flow status to remember and control the transmission. 
*/ + bool flowenabled; + enum caif_states state; +}; + +static void robust_list_del(struct list_head *delete_node) +{ + struct list_head *list_node; + struct list_head *n; + ASSERT_RTNL(); + list_for_each_safe(list_node, n, &chnl_net_list) { + if (list_node == delete_node) { + list_del(list_node); + return; + } + } + WARN_ON(1); +} + +static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt) +{ + struct sk_buff *skb; + struct chnl_net *priv = container_of(layr, struct chnl_net, chnl); + int pktlen; + int err = 0; + const u8 *ip_version; + u8 buf; + + priv = container_of(layr, struct chnl_net, chnl); + + if (!priv) + return -EINVAL; + + skb = (struct sk_buff *) cfpkt_tonative(pkt); + + /* Get length of CAIF packet. */ + pktlen = skb->len; + + /* Pass some minimum information and + * send the packet to the net stack. + */ + skb->dev = priv->netdev; + + /* check the version of IP */ + ip_version = skb_header_pointer(skb, 0, 1, &buf); + if (!ip_version) + return -EINVAL; + switch (*ip_version >> 4) { + case 4: + skb->protocol = htons(ETH_P_IP); + break; + case 6: + skb->protocol = htons(ETH_P_IPV6); + break; + default: + return -EINVAL; + } + + /* If we change the header in loop mode, the checksum is corrupted. */ + if (priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP) + skb->ip_summed = CHECKSUM_UNNECESSARY; + else + skb->ip_summed = CHECKSUM_NONE; + + if (in_interrupt()) + netif_rx(skb); + else + netif_rx_ni(skb); + + /* Update statistics. */ + priv->netdev->stats.rx_packets++; + priv->netdev->stats.rx_bytes += pktlen; + + return err; +} + +static int delete_device(struct chnl_net *dev) +{ + ASSERT_RTNL(); + if (dev->netdev) + unregister_netdevice(dev->netdev); + return 0; +} + +static void close_work(struct work_struct *work) +{ + struct chnl_net *dev = NULL; + struct list_head *list_node; + struct list_head *_tmp; + + rtnl_lock(); + list_for_each_safe(list_node, _tmp, &chnl_net_list) { + dev = list_entry(list_node, struct chnl_net, list_field); + if (dev->state == CAIF_SHUTDOWN) + dev_close(dev->netdev); + } + rtnl_unlock(); +} +static DECLARE_WORK(close_worker, close_work); + +static void chnl_hold(struct cflayer *lyr) +{ + struct chnl_net *priv = container_of(lyr, struct chnl_net, chnl); + dev_hold(priv->netdev); +} + +static void chnl_put(struct cflayer *lyr) +{ + struct chnl_net *priv = container_of(lyr, struct chnl_net, chnl); + dev_put(priv->netdev); +} + +static void chnl_flowctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow, + int phyid) +{ + struct chnl_net *priv = container_of(layr, struct chnl_net, chnl); + pr_debug("NET flowctrl func called flow: %s\n", + flow == CAIF_CTRLCMD_FLOW_ON_IND ? "ON" : + flow == CAIF_CTRLCMD_INIT_RSP ? "INIT" : + flow == CAIF_CTRLCMD_FLOW_OFF_IND ? "OFF" : + flow == CAIF_CTRLCMD_DEINIT_RSP ? "CLOSE/DEINIT" : + flow == CAIF_CTRLCMD_INIT_FAIL_RSP ? "OPEN_FAIL" : + flow == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND ? 
+ "REMOTE_SHUTDOWN" : "UKNOWN CTRL COMMAND"); + + + + switch (flow) { + case CAIF_CTRLCMD_FLOW_OFF_IND: + priv->flowenabled = false; + netif_stop_queue(priv->netdev); + break; + case CAIF_CTRLCMD_DEINIT_RSP: + priv->state = CAIF_DISCONNECTED; + break; + case CAIF_CTRLCMD_INIT_FAIL_RSP: + priv->state = CAIF_DISCONNECTED; + wake_up_interruptible(&priv->netmgmt_wq); + break; + case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND: + priv->state = CAIF_SHUTDOWN; + netif_tx_disable(priv->netdev); + schedule_work(&close_worker); + break; + case CAIF_CTRLCMD_FLOW_ON_IND: + priv->flowenabled = true; + netif_wake_queue(priv->netdev); + break; + case CAIF_CTRLCMD_INIT_RSP: + caif_client_register_refcnt(&priv->chnl, chnl_hold, chnl_put); + priv->state = CAIF_CONNECTED; + priv->flowenabled = true; + netif_wake_queue(priv->netdev); + wake_up_interruptible(&priv->netmgmt_wq); + break; + default: + break; + } +} + +static int chnl_net_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct chnl_net *priv; + struct cfpkt *pkt = NULL; + int len; + int result = -1; + /* Get our private data. */ + priv = netdev_priv(dev); + + if (skb->len > priv->netdev->mtu) { + pr_warn("Size of skb exceeded MTU\n"); + return -ENOSPC; + } + + if (!priv->flowenabled) { + pr_debug("dropping packets flow off\n"); + return NETDEV_TX_BUSY; + } + + if (priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP) + swap(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); + + /* Store original SKB length. */ + len = skb->len; + + pkt = cfpkt_fromnative(CAIF_DIR_OUT, (void *) skb); + + /* Send the packet down the stack. */ + result = priv->chnl.dn->transmit(priv->chnl.dn, pkt); + if (result) { + if (result == -EAGAIN) + result = NETDEV_TX_BUSY; + return result; + } + + /* Update statistics. */ + dev->stats.tx_packets++; + dev->stats.tx_bytes += len; + + return NETDEV_TX_OK; +} + +static int chnl_net_open(struct net_device *dev) +{ + struct chnl_net *priv = NULL; + int result = -1; + int llifindex, headroom, tailroom, mtu; + struct net_device *lldev; + ASSERT_RTNL(); + priv = netdev_priv(dev); + if (!priv) { + pr_debug("chnl_net_open: no priv\n"); + return -ENODEV; + } + + if (priv->state != CAIF_CONNECTING) { + priv->state = CAIF_CONNECTING; + result = caif_connect_client(dev_net(dev), &priv->conn_req, + &priv->chnl, &llifindex, + &headroom, &tailroom); + if (result != 0) { + pr_debug("err: " + "Unable to register and open device," + " Err:%d\n", + result); + goto error; + } + + lldev = dev_get_by_index(dev_net(dev), llifindex); + + if (lldev == NULL) { + pr_debug("no interface?\n"); + result = -ENODEV; + goto error; + } + + dev->needed_tailroom = tailroom + lldev->needed_tailroom; + dev->hard_header_len = headroom + lldev->hard_header_len + + lldev->needed_tailroom; + + /* + * MTU, head-room etc is not know before we have a + * CAIF link layer device available. MTU calculation may + * override initial RTNL configuration. + * MTU is minimum of current mtu, link layer mtu pluss + * CAIF head and tail, and PDP GPRS contexts max MTU. 
+ */ + mtu = min_t(int, dev->mtu, lldev->mtu - (headroom + tailroom)); + mtu = min_t(int, GPRS_PDP_MTU, mtu); + dev_set_mtu(dev, mtu); + dev_put(lldev); + + if (mtu < 100) { + pr_warn("CAIF Interface MTU too small (%d)\n", mtu); + result = -ENODEV; + goto error; + } + } + + rtnl_unlock(); /* Release RTNL lock during connect wait */ + + result = wait_event_interruptible_timeout(priv->netmgmt_wq, + priv->state != CAIF_CONNECTING, + CONNECT_TIMEOUT); + + rtnl_lock(); + + if (result == -ERESTARTSYS) { + pr_debug("wait_event_interruptible woken by a signal\n"); + result = -ERESTARTSYS; + goto error; + } + + if (result == 0) { + pr_debug("connect timeout\n"); + caif_disconnect_client(dev_net(dev), &priv->chnl); + priv->state = CAIF_DISCONNECTED; + pr_debug("state disconnected\n"); + result = -ETIMEDOUT; + goto error; + } + + if (priv->state != CAIF_CONNECTED) { + pr_debug("connect failed\n"); + result = -ECONNREFUSED; + goto error; + } + pr_debug("CAIF Netdevice connected\n"); + return 0; + +error: + caif_disconnect_client(dev_net(dev), &priv->chnl); + priv->state = CAIF_DISCONNECTED; + pr_debug("state disconnected\n"); + return result; + +} + +static int chnl_net_stop(struct net_device *dev) +{ + struct chnl_net *priv; + + ASSERT_RTNL(); + priv = netdev_priv(dev); + priv->state = CAIF_DISCONNECTED; + caif_disconnect_client(dev_net(dev), &priv->chnl); + return 0; +} + +static int chnl_net_init(struct net_device *dev) +{ + struct chnl_net *priv; + ASSERT_RTNL(); + priv = netdev_priv(dev); + strncpy(priv->name, dev->name, sizeof(priv->name)); + return 0; +} + +static void chnl_net_uninit(struct net_device *dev) +{ + struct chnl_net *priv; + ASSERT_RTNL(); + priv = netdev_priv(dev); + robust_list_del(&priv->list_field); +} + +static const struct net_device_ops netdev_ops = { + .ndo_open = chnl_net_open, + .ndo_stop = chnl_net_stop, + .ndo_init = chnl_net_init, + .ndo_uninit = chnl_net_uninit, + .ndo_start_xmit = chnl_net_start_xmit, +}; + +static void chnl_net_destructor(struct net_device *dev) +{ + struct chnl_net *priv = netdev_priv(dev); + caif_free_client(&priv->chnl); + free_netdev(dev); +} + +static void ipcaif_net_setup(struct net_device *dev) +{ + struct chnl_net *priv; + dev->netdev_ops = &netdev_ops; + dev->destructor = chnl_net_destructor; + dev->flags |= IFF_NOARP; + dev->flags |= IFF_POINTOPOINT; + dev->mtu = GPRS_PDP_MTU; + dev->tx_queue_len = CAIF_NET_DEFAULT_QUEUE_LEN; + + priv = netdev_priv(dev); + priv->chnl.receive = chnl_recv_cb; + priv->chnl.ctrlcmd = chnl_flowctrl_cb; + priv->netdev = dev; + priv->conn_req.protocol = CAIFPROTO_DATAGRAM; + priv->conn_req.link_selector = CAIF_LINK_HIGH_BANDW; + priv->conn_req.priority = CAIF_PRIO_LOW; + /* Insert illegal value */ + priv->conn_req.sockaddr.u.dgm.connection_id = 0; + priv->flowenabled = false; + + init_waitqueue_head(&priv->netmgmt_wq); +} + + +static int ipcaif_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct chnl_net *priv; + u8 loop; + priv = netdev_priv(dev); + NLA_PUT_U32(skb, IFLA_CAIF_IPV4_CONNID, + priv->conn_req.sockaddr.u.dgm.connection_id); + NLA_PUT_U32(skb, IFLA_CAIF_IPV6_CONNID, + priv->conn_req.sockaddr.u.dgm.connection_id); + loop = priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP; + NLA_PUT_U8(skb, IFLA_CAIF_LOOPBACK, loop); + + + return 0; +nla_put_failure: + return -EMSGSIZE; + +} + +static void caif_netlink_parms(struct nlattr *data[], + struct caif_connect_request *conn_req) +{ + if (!data) { + pr_warn("no params data found\n"); + return; + } + if (data[IFLA_CAIF_IPV4_CONNID]) + 
conn_req->sockaddr.u.dgm.connection_id = + nla_get_u32(data[IFLA_CAIF_IPV4_CONNID]); + if (data[IFLA_CAIF_IPV6_CONNID]) + conn_req->sockaddr.u.dgm.connection_id = + nla_get_u32(data[IFLA_CAIF_IPV6_CONNID]); + if (data[IFLA_CAIF_LOOPBACK]) { + if (nla_get_u8(data[IFLA_CAIF_LOOPBACK])) + conn_req->protocol = CAIFPROTO_DATAGRAM_LOOP; + else + conn_req->protocol = CAIFPROTO_DATAGRAM; + } +} + +static int ipcaif_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + int ret; + struct chnl_net *caifdev; + ASSERT_RTNL(); + caifdev = netdev_priv(dev); + caif_netlink_parms(data, &caifdev->conn_req); + dev_net_set(caifdev->netdev, src_net); + + ret = register_netdevice(dev); + if (ret) + pr_warn("device rtml registration failed\n"); + else + list_add(&caifdev->list_field, &chnl_net_list); + + /* Take ifindex as connection-id if null */ + if (caifdev->conn_req.sockaddr.u.dgm.connection_id == 0) + caifdev->conn_req.sockaddr.u.dgm.connection_id = dev->ifindex; + return ret; +} + +static int ipcaif_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[]) +{ + struct chnl_net *caifdev; + ASSERT_RTNL(); + caifdev = netdev_priv(dev); + caif_netlink_parms(data, &caifdev->conn_req); + netdev_state_change(dev); + return 0; +} + +static size_t ipcaif_get_size(const struct net_device *dev) +{ + return + /* IFLA_CAIF_IPV4_CONNID */ + nla_total_size(4) + + /* IFLA_CAIF_IPV6_CONNID */ + nla_total_size(4) + + /* IFLA_CAIF_LOOPBACK */ + nla_total_size(2) + + 0; +} + +static const struct nla_policy ipcaif_policy[IFLA_CAIF_MAX + 1] = { + [IFLA_CAIF_IPV4_CONNID] = { .type = NLA_U32 }, + [IFLA_CAIF_IPV6_CONNID] = { .type = NLA_U32 }, + [IFLA_CAIF_LOOPBACK] = { .type = NLA_U8 } +}; + + +static struct rtnl_link_ops ipcaif_link_ops __read_mostly = { + .kind = "caif", + .priv_size = sizeof(struct chnl_net), + .setup = ipcaif_net_setup, + .maxtype = IFLA_CAIF_MAX, + .policy = ipcaif_policy, + .newlink = ipcaif_newlink, + .changelink = ipcaif_changelink, + .get_size = ipcaif_get_size, + .fill_info = ipcaif_fill_info, + +}; + +static int __init chnl_init_module(void) +{ + return rtnl_link_register(&ipcaif_link_ops); +} + +static void __exit chnl_exit_module(void) +{ + struct chnl_net *dev = NULL; + struct list_head *list_node; + struct list_head *_tmp; + rtnl_link_unregister(&ipcaif_link_ops); + rtnl_lock(); + list_for_each_safe(list_node, _tmp, &chnl_net_list) { + dev = list_entry(list_node, struct chnl_net, list_field); + list_del(list_node); + delete_device(dev); + } + rtnl_unlock(); +} + +module_init(chnl_init_module); +module_exit(chnl_exit_module); diff --git a/net/can/Kconfig b/net/can/Kconfig new file mode 100644 index 00000000..89395b2c --- /dev/null +++ b/net/can/Kconfig @@ -0,0 +1,44 @@ +# +# Controller Area Network (CAN) network layer core configuration +# + +menuconfig CAN + depends on NET + tristate "CAN bus subsystem support" + ---help--- + Controller Area Network (CAN) is a slow (up to 1Mbit/s) serial + communications protocol which was developed by Bosch in + 1991, mainly for automotive, but now widely used in marine + (NMEA2000), industrial, and medical applications. + More information on the CAN network protocol family PF_CAN + is contained in . + + If you want CAN support you should say Y here and also to the + specific driver for your controller(s) below. 
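[Editor's illustration, not part of the patch: the help text above introduces PF_CAN as a socket protocol family reached through the BSD socket API, and the CAN_RAW/CAN_BCM options below build the protocol modules behind it. A minimal user-space sketch of the AF_CAN/CAN_RAW usage the help refers to could look like the following; the interface name "can0" and the filter ID 0x123 are placeholders, and error handling is omitted.]

	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <net/if.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <linux/can.h>
	#include <linux/can/raw.h>

	int main(void)
	{
		struct sockaddr_can addr = { 0 };
		/* Matches frames where (can_id & can_mask) == (0x123 & can_mask). */
		struct can_filter rfilter = { .can_id = 0x123, .can_mask = CAN_SFF_MASK };
		struct can_frame frame;
		struct ifreq ifr;

		/* Raw CAN socket: AF_CAN with protocol CAN_RAW, as described above. */
		int s = socket(PF_CAN, SOCK_RAW, CAN_RAW);

		/* Bind to a (hypothetical) CAN interface named "can0". */
		strcpy(ifr.ifr_name, "can0");
		ioctl(s, SIOCGIFINDEX, &ifr);
		addr.can_family = AF_CAN;
		addr.can_ifindex = ifr.ifr_ifindex;
		bind(s, (struct sockaddr *)&addr, sizeof(addr));

		/* Install the single CAN-ID filter on the raw socket. */
		setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, &rfilter, sizeof(rfilter));

		/* Receive one matching frame and echo it back on the same bus. */
		if (read(s, &frame, sizeof(frame)) == sizeof(frame))
			write(s, &frame, sizeof(frame));

		close(s);
		return 0;
	}

[End of editor's illustration; the patch continues below.]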
+ +config CAN_RAW + tristate "Raw CAN Protocol (raw access with CAN-ID filtering)" + depends on CAN + default N + ---help--- + The raw CAN protocol option offers access to the CAN bus via + the BSD socket API. You probably want to use the raw socket in + most cases where no higher level protocol is being used. The raw + socket has several filter options e.g. ID masking / error frames. + To receive/send raw CAN messages, use AF_CAN with protocol CAN_RAW. + +config CAN_BCM + tristate "Broadcast Manager CAN Protocol (with content filtering)" + depends on CAN + default N + ---help--- + The Broadcast Manager offers content filtering, timeout monitoring, + sending of RTR frames, and cyclic CAN messages without permanent user + interaction. The BCM can be 'programmed' via the BSD socket API and + informs you on demand e.g. only on content updates / timeouts. + You probably want to use the bcm socket in most cases where cyclic + CAN messages are used on the bus (e.g. in automotive environments). + To use the Broadcast Manager, use AF_CAN with protocol CAN_BCM. + + +source "drivers/net/can/Kconfig" diff --git a/net/can/Makefile b/net/can/Makefile new file mode 100644 index 00000000..2d3894b3 --- /dev/null +++ b/net/can/Makefile @@ -0,0 +1,12 @@ +# +# Makefile for the Linux Controller Area Network core. +# + +obj-$(CONFIG_CAN) += can.o +can-y := af_can.o proc.o + +obj-$(CONFIG_CAN_RAW) += can-raw.o +can-raw-y := raw.o + +obj-$(CONFIG_CAN_BCM) += can-bcm.o +can-bcm-y := bcm.o diff --git a/net/can/af_can.c b/net/can/af_can.c new file mode 100644 index 00000000..094fc533 --- /dev/null +++ b/net/can/af_can.c @@ -0,0 +1,888 @@ +/* + * af_can.c - Protocol family CAN core module + * (used by different CAN protocol modules) + * + * Copyright (c) 2002-2007 Volkswagen Group Electronic Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of Volkswagen nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * The provided data structures and external interfaces from this code + * are not restricted to be used by modules with a GPL compatible license. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Send feedback to + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "af_can.h" + +static __initdata const char banner[] = KERN_INFO + "can: controller area network core (" CAN_VERSION_STRING ")\n"; + +MODULE_DESCRIPTION("Controller Area Network PF_CAN core"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Urs Thuermann , " + "Oliver Hartkopp "); + +MODULE_ALIAS_NETPROTO(PF_CAN); + +static int stats_timer __read_mostly = 1; +module_param(stats_timer, int, S_IRUGO); +MODULE_PARM_DESC(stats_timer, "enable timer for statistics (default:on)"); + +/* receive filters subscribed for 'all' CAN devices */ +struct dev_rcv_lists can_rx_alldev_list; +static DEFINE_SPINLOCK(can_rcvlists_lock); + +static struct kmem_cache *rcv_cache __read_mostly; + +/* table of registered CAN protocols */ +static const struct can_proto *proto_tab[CAN_NPROTO] __read_mostly; +static DEFINE_MUTEX(proto_tab_lock); + +struct timer_list can_stattimer; /* timer for statistics update */ +struct s_stats can_stats; /* packet statistics */ +struct s_pstats can_pstats; /* receive list statistics */ + +/* + * af_can socket functions + */ + +int can_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + + switch (cmd) { + + case SIOCGSTAMP: + return sock_get_timestamp(sk, (struct timeval __user *)arg); + + default: + return -ENOIOCTLCMD; + } +} +EXPORT_SYMBOL(can_ioctl); + +static void can_sock_destruct(struct sock *sk) +{ + skb_queue_purge(&sk->sk_receive_queue); +} + +static const struct can_proto *can_get_proto(int protocol) +{ + const struct can_proto *cp; + + rcu_read_lock(); + cp = rcu_dereference(proto_tab[protocol]); + if (cp && !try_module_get(cp->prot->owner)) + cp = NULL; + rcu_read_unlock(); + + return cp; +} + +static inline void can_put_proto(const struct can_proto *cp) +{ + module_put(cp->prot->owner); +} + +static int can_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + const struct can_proto *cp; + int err = 0; + + sock->state = SS_UNCONNECTED; + + if (protocol < 0 || protocol >= CAN_NPROTO) + return -EINVAL; + + if (!net_eq(net, &init_net)) + return -EAFNOSUPPORT; + + cp = can_get_proto(protocol); + +#ifdef CONFIG_MODULES + if (!cp) { + /* try to load protocol module if kernel is modular */ + + err = request_module("can-proto-%d", protocol); + + /* + * In case of error we only print a message but don't + * return the error code immediately. 
Below we will + * return -EPROTONOSUPPORT + */ + if (err && printk_ratelimit()) + printk(KERN_ERR "can: request_module " + "(can-proto-%d) failed.\n", protocol); + + cp = can_get_proto(protocol); + } +#endif + + /* check for available protocol and correct usage */ + + if (!cp) + return -EPROTONOSUPPORT; + + if (cp->type != sock->type) { + err = -EPROTOTYPE; + goto errout; + } + + sock->ops = cp->ops; + + sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot); + if (!sk) { + err = -ENOMEM; + goto errout; + } + + sock_init_data(sock, sk); + sk->sk_destruct = can_sock_destruct; + + if (sk->sk_prot->init) + err = sk->sk_prot->init(sk); + + if (err) { + /* release sk on errors */ + sock_orphan(sk); + sock_put(sk); + } + + errout: + can_put_proto(cp); + return err; +} + +/* + * af_can tx path + */ + +/** + * can_send - transmit a CAN frame (optional with local loopback) + * @skb: pointer to socket buffer with CAN frame in data section + * @loop: loopback for listeners on local CAN sockets (recommended default!) + * + * Due to the loopback this routine must not be called from hardirq context. + * + * Return: + * 0 on success + * -ENETDOWN when the selected interface is down + * -ENOBUFS on full driver queue (see net_xmit_errno()) + * -ENOMEM when local loopback failed at calling skb_clone() + * -EPERM when trying to send on a non-CAN interface + * -EINVAL when the skb->data does not contain a valid CAN frame + */ +int can_send(struct sk_buff *skb, int loop) +{ + struct sk_buff *newskb = NULL; + struct can_frame *cf = (struct can_frame *)skb->data; + int err; + + if (skb->len != sizeof(struct can_frame) || cf->can_dlc > 8) { + kfree_skb(skb); + return -EINVAL; + } + + if (skb->dev->type != ARPHRD_CAN) { + kfree_skb(skb); + return -EPERM; + } + + if (!(skb->dev->flags & IFF_UP)) { + kfree_skb(skb); + return -ENETDOWN; + } + + skb->protocol = htons(ETH_P_CAN); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + + if (loop) { + /* local loopback of sent CAN frames */ + + /* indication for the CAN driver: do loopback */ + skb->pkt_type = PACKET_LOOPBACK; + + /* + * The reference to the originating sock may be required + * by the receiving socket to check whether the frame is + * its own. Example: can_raw sockopt CAN_RAW_RECV_OWN_MSGS + * Therefore we have to ensure that skb->sk remains the + * reference to the originating sock by restoring skb->sk + * after each skb_clone() or skb_orphan() usage. + */ + + if (!(skb->dev->flags & IFF_ECHO)) { + /* + * If the interface is not capable to do loopback + * itself, we do it here. 
+ */ + newskb = skb_clone(skb, GFP_ATOMIC); + if (!newskb) { + kfree_skb(skb); + return -ENOMEM; + } + + newskb->sk = skb->sk; + newskb->ip_summed = CHECKSUM_UNNECESSARY; + newskb->pkt_type = PACKET_BROADCAST; + } + } else { + /* indication for the CAN driver: no loopback required */ + skb->pkt_type = PACKET_HOST; + } + + /* send to netdevice */ + err = dev_queue_xmit(skb); + if (err > 0) + err = net_xmit_errno(err); + + if (err) { + kfree_skb(newskb); + return err; + } + + if (newskb) + netif_rx_ni(newskb); + + /* update statistics */ + can_stats.tx_frames++; + can_stats.tx_frames_delta++; + + return 0; +} +EXPORT_SYMBOL(can_send); + +/* + * af_can rx path + */ + +static struct dev_rcv_lists *find_dev_rcv_lists(struct net_device *dev) +{ + if (!dev) + return &can_rx_alldev_list; + else + return (struct dev_rcv_lists *)dev->ml_priv; +} + +/** + * find_rcv_list - determine optimal filterlist inside device filter struct + * @can_id: pointer to CAN identifier of a given can_filter + * @mask: pointer to CAN mask of a given can_filter + * @d: pointer to the device filter struct + * + * Description: + * Returns the optimal filterlist to reduce the filter handling in the + * receive path. This function is called by service functions that need + * to register or unregister a can_filter in the filter lists. + * + * A filter matches in general, when + * + * & mask == can_id & mask + * + * so every bit set in the mask (even CAN_EFF_FLAG, CAN_RTR_FLAG) describe + * relevant bits for the filter. + * + * The filter can be inverted (CAN_INV_FILTER bit set in can_id) or it can + * filter for error frames (CAN_ERR_FLAG bit set in mask). For error frames + * there is a special filterlist and a special rx path filter handling. + * + * Return: + * Pointer to optimal filterlist for the given can_id/mask pair. + * Constistency checked mask. + * Reduced can_id to have a preprocessed filter compare value. + */ +static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask, + struct dev_rcv_lists *d) +{ + canid_t inv = *can_id & CAN_INV_FILTER; /* save flag before masking */ + + /* filter for error frames in extra filterlist */ + if (*mask & CAN_ERR_FLAG) { + /* clear CAN_ERR_FLAG in filter entry */ + *mask &= CAN_ERR_MASK; + return &d->rx[RX_ERR]; + } + + /* with cleared CAN_ERR_FLAG we have a simple mask/value filterpair */ + +#define CAN_EFF_RTR_FLAGS (CAN_EFF_FLAG | CAN_RTR_FLAG) + + /* ensure valid values in can_mask for 'SFF only' frame filtering */ + if ((*mask & CAN_EFF_FLAG) && !(*can_id & CAN_EFF_FLAG)) + *mask &= (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS); + + /* reduce condition testing at receive time */ + *can_id &= *mask; + + /* inverse can_id/can_mask filter */ + if (inv) + return &d->rx[RX_INV]; + + /* mask == 0 => no condition testing at receive time */ + if (!(*mask)) + return &d->rx[RX_ALL]; + + /* extra filterlists for the subscription of a single non-RTR can_id */ + if (((*mask & CAN_EFF_RTR_FLAGS) == CAN_EFF_RTR_FLAGS) && + !(*can_id & CAN_RTR_FLAG)) { + + if (*can_id & CAN_EFF_FLAG) { + if (*mask == (CAN_EFF_MASK | CAN_EFF_RTR_FLAGS)) { + /* RFC: a future use-case for hash-tables? 
*/ + return &d->rx[RX_EFF]; + } + } else { + if (*mask == (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS)) + return &d->rx_sff[*can_id]; + } + } + + /* default: filter via can_id/can_mask */ + return &d->rx[RX_FIL]; +} + +/** + * can_rx_register - subscribe CAN frames from a specific interface + * @dev: pointer to netdevice (NULL => subcribe from 'all' CAN devices list) + * @can_id: CAN identifier (see description) + * @mask: CAN mask (see description) + * @func: callback function on filter match + * @data: returned parameter for callback function + * @ident: string for calling module indentification + * + * Description: + * Invokes the callback function with the received sk_buff and the given + * parameter 'data' on a matching receive filter. A filter matches, when + * + * & mask == can_id & mask + * + * The filter can be inverted (CAN_INV_FILTER bit set in can_id) or it can + * filter for error frames (CAN_ERR_FLAG bit set in mask). + * + * The provided pointer to the sk_buff is guaranteed to be valid as long as + * the callback function is running. The callback function must *not* free + * the given sk_buff while processing it's task. When the given sk_buff is + * needed after the end of the callback function it must be cloned inside + * the callback function with skb_clone(). + * + * Return: + * 0 on success + * -ENOMEM on missing cache mem to create subscription entry + * -ENODEV unknown device + */ +int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask, + void (*func)(struct sk_buff *, void *), void *data, + char *ident) +{ + struct receiver *r; + struct hlist_head *rl; + struct dev_rcv_lists *d; + int err = 0; + + /* insert new receiver (dev,canid,mask) -> (func,data) */ + + if (dev && dev->type != ARPHRD_CAN) + return -ENODEV; + + r = kmem_cache_alloc(rcv_cache, GFP_KERNEL); + if (!r) + return -ENOMEM; + + spin_lock(&can_rcvlists_lock); + + d = find_dev_rcv_lists(dev); + if (d) { + rl = find_rcv_list(&can_id, &mask, d); + + r->can_id = can_id; + r->mask = mask; + r->matches = 0; + r->func = func; + r->data = data; + r->ident = ident; + + hlist_add_head_rcu(&r->list, rl); + d->entries++; + + can_pstats.rcv_entries++; + if (can_pstats.rcv_entries_max < can_pstats.rcv_entries) + can_pstats.rcv_entries_max = can_pstats.rcv_entries; + } else { + kmem_cache_free(rcv_cache, r); + err = -ENODEV; + } + + spin_unlock(&can_rcvlists_lock); + + return err; +} +EXPORT_SYMBOL(can_rx_register); + +/* + * can_rx_delete_receiver - rcu callback for single receiver entry removal + */ +static void can_rx_delete_receiver(struct rcu_head *rp) +{ + struct receiver *r = container_of(rp, struct receiver, rcu); + + kmem_cache_free(rcv_cache, r); +} + +/** + * can_rx_unregister - unsubscribe CAN frames from a specific interface + * @dev: pointer to netdevice (NULL => unsubcribe from 'all' CAN devices list) + * @can_id: CAN identifier + * @mask: CAN mask + * @func: callback function on filter match + * @data: returned parameter for callback function + * + * Description: + * Removes subscription entry depending on given (subscription) values. 
+ */ +void can_rx_unregister(struct net_device *dev, canid_t can_id, canid_t mask, + void (*func)(struct sk_buff *, void *), void *data) +{ + struct receiver *r = NULL; + struct hlist_head *rl; + struct hlist_node *next; + struct dev_rcv_lists *d; + + if (dev && dev->type != ARPHRD_CAN) + return; + + spin_lock(&can_rcvlists_lock); + + d = find_dev_rcv_lists(dev); + if (!d) { + printk(KERN_ERR "BUG: receive list not found for " + "dev %s, id %03X, mask %03X\n", + DNAME(dev), can_id, mask); + goto out; + } + + rl = find_rcv_list(&can_id, &mask, d); + + /* + * Search the receiver list for the item to delete. This should + * exist, since no receiver may be unregistered that hasn't + * been registered before. + */ + + hlist_for_each_entry_rcu(r, next, rl, list) { + if (r->can_id == can_id && r->mask == mask && + r->func == func && r->data == data) + break; + } + + /* + * Check for bugs in CAN protocol implementations: + * If no matching list item was found, the list cursor variable next + * will be NULL, while r will point to the last item of the list. + */ + + if (!next) { + printk(KERN_ERR "BUG: receive list entry not found for " + "dev %s, id %03X, mask %03X\n", + DNAME(dev), can_id, mask); + r = NULL; + goto out; + } + + hlist_del_rcu(&r->list); + d->entries--; + + if (can_pstats.rcv_entries > 0) + can_pstats.rcv_entries--; + + /* remove device structure requested by NETDEV_UNREGISTER */ + if (d->remove_on_zero_entries && !d->entries) { + kfree(d); + dev->ml_priv = NULL; + } + + out: + spin_unlock(&can_rcvlists_lock); + + /* schedule the receiver item for deletion */ + if (r) + call_rcu(&r->rcu, can_rx_delete_receiver); +} +EXPORT_SYMBOL(can_rx_unregister); + +static inline void deliver(struct sk_buff *skb, struct receiver *r) +{ + r->func(skb, r->data); + r->matches++; +} + +static int can_rcv_filter(struct dev_rcv_lists *d, struct sk_buff *skb) +{ + struct receiver *r; + struct hlist_node *n; + int matches = 0; + struct can_frame *cf = (struct can_frame *)skb->data; + canid_t can_id = cf->can_id; + + if (d->entries == 0) + return 0; + + if (can_id & CAN_ERR_FLAG) { + /* check for error frame entries only */ + hlist_for_each_entry_rcu(r, n, &d->rx[RX_ERR], list) { + if (can_id & r->mask) { + deliver(skb, r); + matches++; + } + } + return matches; + } + + /* check for unfiltered entries */ + hlist_for_each_entry_rcu(r, n, &d->rx[RX_ALL], list) { + deliver(skb, r); + matches++; + } + + /* check for can_id/mask entries */ + hlist_for_each_entry_rcu(r, n, &d->rx[RX_FIL], list) { + if ((can_id & r->mask) == r->can_id) { + deliver(skb, r); + matches++; + } + } + + /* check for inverted can_id/mask entries */ + hlist_for_each_entry_rcu(r, n, &d->rx[RX_INV], list) { + if ((can_id & r->mask) != r->can_id) { + deliver(skb, r); + matches++; + } + } + + /* check filterlists for single non-RTR can_ids */ + if (can_id & CAN_RTR_FLAG) + return matches; + + if (can_id & CAN_EFF_FLAG) { + hlist_for_each_entry_rcu(r, n, &d->rx[RX_EFF], list) { + if (r->can_id == can_id) { + deliver(skb, r); + matches++; + } + } + } else { + can_id &= CAN_SFF_MASK; + hlist_for_each_entry_rcu(r, n, &d->rx_sff[can_id], list) { + deliver(skb, r); + matches++; + } + } + + return matches; +} + +static int can_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct dev_rcv_lists *d; + struct can_frame *cf = (struct can_frame *)skb->data; + int matches; + + if (!net_eq(dev_net(dev), &init_net)) + goto drop; + + if (WARN_ONCE(dev->type != ARPHRD_CAN || + skb->len != 
sizeof(struct can_frame) || + cf->can_dlc > 8, + "PF_CAN: dropped non conform skbuf: " + "dev type %d, len %d, can_dlc %d\n", + dev->type, skb->len, cf->can_dlc)) + goto drop; + + /* update statistics */ + can_stats.rx_frames++; + can_stats.rx_frames_delta++; + + rcu_read_lock(); + + /* deliver the packet to sockets listening on all devices */ + matches = can_rcv_filter(&can_rx_alldev_list, skb); + + /* find receive list for this device */ + d = find_dev_rcv_lists(dev); + if (d) + matches += can_rcv_filter(d, skb); + + rcu_read_unlock(); + + /* consume the skbuff allocated by the netdevice driver */ + consume_skb(skb); + + if (matches > 0) { + can_stats.matches++; + can_stats.matches_delta++; + } + + return NET_RX_SUCCESS; + +drop: + kfree_skb(skb); + return NET_RX_DROP; +} + +/* + * af_can protocol functions + */ + +/** + * can_proto_register - register CAN transport protocol + * @cp: pointer to CAN protocol structure + * + * Return: + * 0 on success + * -EINVAL invalid (out of range) protocol number + * -EBUSY protocol already in use + * -ENOBUF if proto_register() fails + */ +int can_proto_register(const struct can_proto *cp) +{ + int proto = cp->protocol; + int err = 0; + + if (proto < 0 || proto >= CAN_NPROTO) { + printk(KERN_ERR "can: protocol number %d out of range\n", + proto); + return -EINVAL; + } + + err = proto_register(cp->prot, 0); + if (err < 0) + return err; + + mutex_lock(&proto_tab_lock); + + if (proto_tab[proto]) { + printk(KERN_ERR "can: protocol %d already registered\n", + proto); + err = -EBUSY; + } else + rcu_assign_pointer(proto_tab[proto], cp); + + mutex_unlock(&proto_tab_lock); + + if (err < 0) + proto_unregister(cp->prot); + + return err; +} +EXPORT_SYMBOL(can_proto_register); + +/** + * can_proto_unregister - unregister CAN transport protocol + * @cp: pointer to CAN protocol structure + */ +void can_proto_unregister(const struct can_proto *cp) +{ + int proto = cp->protocol; + + mutex_lock(&proto_tab_lock); + BUG_ON(proto_tab[proto] != cp); + rcu_assign_pointer(proto_tab[proto], NULL); + mutex_unlock(&proto_tab_lock); + + synchronize_rcu(); + + proto_unregister(cp->prot); +} +EXPORT_SYMBOL(can_proto_unregister); + +/* + * af_can notifier to create/remove CAN netdevice specific structs + */ +static int can_notifier(struct notifier_block *nb, unsigned long msg, + void *data) +{ + struct net_device *dev = (struct net_device *)data; + struct dev_rcv_lists *d; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + if (dev->type != ARPHRD_CAN) + return NOTIFY_DONE; + + switch (msg) { + + case NETDEV_REGISTER: + + /* create new dev_rcv_lists for this device */ + d = kzalloc(sizeof(*d), GFP_KERNEL); + if (!d) { + printk(KERN_ERR + "can: allocation of receive list failed\n"); + return NOTIFY_DONE; + } + BUG_ON(dev->ml_priv); + dev->ml_priv = d; + + break; + + case NETDEV_UNREGISTER: + spin_lock(&can_rcvlists_lock); + + d = dev->ml_priv; + if (d) { + if (d->entries) + d->remove_on_zero_entries = 1; + else { + kfree(d); + dev->ml_priv = NULL; + } + } else + printk(KERN_ERR "can: notifier: receive list not " + "found for dev %s\n", dev->name); + + spin_unlock(&can_rcvlists_lock); + + break; + } + + return NOTIFY_DONE; +} + +/* + * af_can module init/exit functions + */ + +static struct packet_type can_packet __read_mostly = { + .type = cpu_to_be16(ETH_P_CAN), + .dev = NULL, + .func = can_rcv, +}; + +static const struct net_proto_family can_family_ops = { + .family = PF_CAN, + .create = can_create, + .owner = THIS_MODULE, +}; + +/* notifier block for netdevice 
event */ +static struct notifier_block can_netdev_notifier __read_mostly = { + .notifier_call = can_notifier, +}; + +static __init int can_init(void) +{ + printk(banner); + + memset(&can_rx_alldev_list, 0, sizeof(can_rx_alldev_list)); + + rcv_cache = kmem_cache_create("can_receiver", sizeof(struct receiver), + 0, 0, NULL); + if (!rcv_cache) + return -ENOMEM; + + if (stats_timer) { + /* the statistics are updated every second (timer triggered) */ + setup_timer(&can_stattimer, can_stat_update, 0); + mod_timer(&can_stattimer, round_jiffies(jiffies + HZ)); + } else + can_stattimer.function = NULL; + + can_init_proc(); + + /* protocol register */ + sock_register(&can_family_ops); + register_netdevice_notifier(&can_netdev_notifier); + dev_add_pack(&can_packet); + + return 0; +} + +static __exit void can_exit(void) +{ + struct net_device *dev; + + if (stats_timer) + del_timer(&can_stattimer); + + can_remove_proc(); + + /* protocol unregister */ + dev_remove_pack(&can_packet); + unregister_netdevice_notifier(&can_netdev_notifier); + sock_unregister(PF_CAN); + + /* remove created dev_rcv_lists from still registered CAN devices */ + rcu_read_lock(); + for_each_netdev_rcu(&init_net, dev) { + if (dev->type == ARPHRD_CAN && dev->ml_priv){ + + struct dev_rcv_lists *d = dev->ml_priv; + + BUG_ON(d->entries); + kfree(d); + dev->ml_priv = NULL; + } + } + rcu_read_unlock(); + + rcu_barrier(); /* Wait for completion of call_rcu()'s */ + + kmem_cache_destroy(rcv_cache); +} + +module_init(can_init); +module_exit(can_exit); diff --git a/net/can/af_can.h b/net/can/af_can.h new file mode 100644 index 00000000..34253b84 --- /dev/null +++ b/net/can/af_can.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2002-2007 Volkswagen Group Electronic Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of Volkswagen nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * The provided data structures and external interfaces from this code + * are not restricted to be used by modules with a GPL compatible license. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Send feedback to + * + */ + +#ifndef AF_CAN_H +#define AF_CAN_H + +#include +#include +#include +#include +#include + +/* af_can rx dispatcher structures */ + +struct receiver { + struct hlist_node list; + struct rcu_head rcu; + canid_t can_id; + canid_t mask; + unsigned long matches; + void (*func)(struct sk_buff *, void *); + void *data; + char *ident; +}; + +enum { RX_ERR, RX_ALL, RX_FIL, RX_INV, RX_EFF, RX_MAX }; + +/* per device receive filters linked at dev->ml_priv */ +struct dev_rcv_lists { + struct hlist_head rx[RX_MAX]; + struct hlist_head rx_sff[0x800]; + int remove_on_zero_entries; + int entries; +}; + +/* statistic structures */ + +/* can be reset e.g. by can_init_stats() */ +struct s_stats { + unsigned long jiffies_init; + + unsigned long rx_frames; + unsigned long tx_frames; + unsigned long matches; + + unsigned long total_rx_rate; + unsigned long total_tx_rate; + unsigned long total_rx_match_ratio; + + unsigned long current_rx_rate; + unsigned long current_tx_rate; + unsigned long current_rx_match_ratio; + + unsigned long max_rx_rate; + unsigned long max_tx_rate; + unsigned long max_rx_match_ratio; + + unsigned long rx_frames_delta; + unsigned long tx_frames_delta; + unsigned long matches_delta; +}; + +/* persistent statistics */ +struct s_pstats { + unsigned long stats_reset; + unsigned long user_reset; + unsigned long rcv_entries; + unsigned long rcv_entries_max; +}; + +/* function prototypes for the CAN networklayer procfs (proc.c) */ +extern void can_init_proc(void); +extern void can_remove_proc(void); +extern void can_stat_update(unsigned long data); + +/* structures and variables from af_can.c needed in proc.c for reading */ +extern struct timer_list can_stattimer; /* timer for statistics update */ +extern struct s_stats can_stats; /* packet statistics */ +extern struct s_pstats can_pstats; /* receive list statistics */ +extern struct hlist_head can_rx_dev_list; /* rx dispatcher structures */ + +#endif /* AF_CAN_H */ diff --git a/net/can/bcm.c b/net/can/bcm.c new file mode 100644 index 00000000..c6cc66f7 --- /dev/null +++ b/net/can/bcm.c @@ -0,0 +1,1632 @@ +/* + * bcm.c - Broadcast Manager to filter/send (cyclic) CAN content + * + * Copyright (c) 2002-2007 Volkswagen Group Electronic Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of Volkswagen nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * The provided data structures and external interfaces from this code + * are not restricted to be used by modules with a GPL compatible license. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Send feedback to + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * To send multiple CAN frame content within TX_SETUP or to filter + * CAN messages with multiplex index within RX_SETUP, the number of + * different filters is limited to 256 due to the one byte index value. + */ +#define MAX_NFRAMES 256 + +/* use of last_frames[index].can_dlc */ +#define RX_RECV 0x40 /* received data for this element */ +#define RX_THR 0x80 /* element not been sent due to throttle feature */ +#define BCM_CAN_DLC_MASK 0x0F /* clean private flags in can_dlc by masking */ + +/* get best masking value for can_rx_register() for a given single can_id */ +#define REGMASK(id) ((id & CAN_EFF_FLAG) ? 
\ + (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \ + (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG)) + +#define CAN_BCM_VERSION CAN_VERSION +static __initdata const char banner[] = KERN_INFO + "can: broadcast manager protocol (rev " CAN_BCM_VERSION " t)\n"; + +MODULE_DESCRIPTION("PF_CAN broadcast manager protocol"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Oliver Hartkopp "); +MODULE_ALIAS("can-proto-2"); + +/* easy access to can_frame payload */ +static inline u64 GET_U64(const struct can_frame *cp) +{ + return *(u64 *)cp->data; +} + +struct bcm_op { + struct list_head list; + int ifindex; + canid_t can_id; + u32 flags; + unsigned long frames_abs, frames_filtered; + struct timeval ival1, ival2; + struct hrtimer timer, thrtimer; + struct tasklet_struct tsklet, thrtsklet; + ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg; + int rx_ifindex; + u32 count; + u32 nframes; + u32 currframe; + struct can_frame *frames; + struct can_frame *last_frames; + struct can_frame sframe; + struct can_frame last_sframe; + struct sock *sk; + struct net_device *rx_reg_dev; +}; + +static struct proc_dir_entry *proc_dir; + +struct bcm_sock { + struct sock sk; + int bound; + int ifindex; + struct notifier_block notifier; + struct list_head rx_ops; + struct list_head tx_ops; + unsigned long dropped_usr_msgs; + struct proc_dir_entry *bcm_proc_read; + char procname [32]; /* inode number in decimal with \0 */ +}; + +static inline struct bcm_sock *bcm_sk(const struct sock *sk) +{ + return (struct bcm_sock *)sk; +} + +#define CFSIZ sizeof(struct can_frame) +#define OPSIZ sizeof(struct bcm_op) +#define MHSIZ sizeof(struct bcm_msg_head) + +/* + * procfs functions + */ +static char *bcm_proc_getifname(char *result, int ifindex) +{ + struct net_device *dev; + + if (!ifindex) + return "any"; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(&init_net, ifindex); + if (dev) + strcpy(result, dev->name); + else + strcpy(result, "???"); + rcu_read_unlock(); + + return result; +} + +static int bcm_proc_show(struct seq_file *m, void *v) +{ + char ifname[IFNAMSIZ]; + struct sock *sk = (struct sock *)m->private; + struct bcm_sock *bo = bcm_sk(sk); + struct bcm_op *op; + + seq_printf(m, ">>> socket %pK", sk->sk_socket); + seq_printf(m, " / sk %pK", sk); + seq_printf(m, " / bo %pK", bo); + seq_printf(m, " / dropped %lu", bo->dropped_usr_msgs); + seq_printf(m, " / bound %s", bcm_proc_getifname(ifname, bo->ifindex)); + seq_printf(m, " <<<\n"); + + list_for_each_entry(op, &bo->rx_ops, list) { + + unsigned long reduction; + + /* print only active entries & prevent division by zero */ + if (!op->frames_abs) + continue; + + seq_printf(m, "rx_op: %03X %-5s ", + op->can_id, bcm_proc_getifname(ifname, op->ifindex)); + seq_printf(m, "[%u]%c ", op->nframes, + (op->flags & RX_CHECK_DLC)?'d':' '); + if (op->kt_ival1.tv64) + seq_printf(m, "timeo=%lld ", + (long long) + ktime_to_us(op->kt_ival1)); + + if (op->kt_ival2.tv64) + seq_printf(m, "thr=%lld ", + (long long) + ktime_to_us(op->kt_ival2)); + + seq_printf(m, "# recv %ld (%ld) => reduction: ", + op->frames_filtered, op->frames_abs); + + reduction = 100 - (op->frames_filtered * 100) / op->frames_abs; + + seq_printf(m, "%s%ld%%\n", + (reduction == 100)?"near ":"", reduction); + } + + list_for_each_entry(op, &bo->tx_ops, list) { + + seq_printf(m, "tx_op: %03X %s [%u] ", + op->can_id, + bcm_proc_getifname(ifname, op->ifindex), + op->nframes); + + if (op->kt_ival1.tv64) + seq_printf(m, "t1=%lld ", + (long long) ktime_to_us(op->kt_ival1)); + + if (op->kt_ival2.tv64) + seq_printf(m, "t2=%lld ", 
+ (long long) ktime_to_us(op->kt_ival2)); + + seq_printf(m, "# sent %ld\n", op->frames_abs); + } + seq_putc(m, '\n'); + return 0; +} + +static int bcm_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, bcm_proc_show, PDE(inode)->data); +} + +static const struct file_operations bcm_proc_fops = { + .owner = THIS_MODULE, + .open = bcm_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * bcm_can_tx - send the (next) CAN frame to the appropriate CAN interface + * of the given bcm tx op + */ +static void bcm_can_tx(struct bcm_op *op) +{ + struct sk_buff *skb; + struct net_device *dev; + struct can_frame *cf = &op->frames[op->currframe]; + + /* no target device? => exit */ + if (!op->ifindex) + return; + + dev = dev_get_by_index(&init_net, op->ifindex); + if (!dev) { + /* RFC: should this bcm_op remove itself here? */ + return; + } + + skb = alloc_skb(CFSIZ, gfp_any()); + if (!skb) + goto out; + + memcpy(skb_put(skb, CFSIZ), cf, CFSIZ); + + /* send with loopback */ + skb->dev = dev; + skb->sk = op->sk; + can_send(skb, 1); + + /* update statistics */ + op->currframe++; + op->frames_abs++; + + /* reached last frame? */ + if (op->currframe >= op->nframes) + op->currframe = 0; + out: + dev_put(dev); +} + +/* + * bcm_send_to_user - send a BCM message to the userspace + * (consisting of bcm_msg_head + x CAN frames) + */ +static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, + struct can_frame *frames, int has_timestamp) +{ + struct sk_buff *skb; + struct can_frame *firstframe; + struct sockaddr_can *addr; + struct sock *sk = op->sk; + unsigned int datalen = head->nframes * CFSIZ; + int err; + + skb = alloc_skb(sizeof(*head) + datalen, gfp_any()); + if (!skb) + return; + + memcpy(skb_put(skb, sizeof(*head)), head, sizeof(*head)); + + if (head->nframes) { + /* can_frames starting here */ + firstframe = (struct can_frame *)skb_tail_pointer(skb); + + memcpy(skb_put(skb, datalen), frames, datalen); + + /* + * the BCM uses the can_dlc-element of the can_frame + * structure for internal purposes. This is only + * relevant for updates that are generated by the + * BCM, where nframes is 1 + */ + if (head->nframes == 1) + firstframe->can_dlc &= BCM_CAN_DLC_MASK; + } + + if (has_timestamp) { + /* restore rx timestamp */ + skb->tstamp = op->rx_stamp; + } + + /* + * Put the datagram to the queue so that bcm_recvmsg() can + * get it from there. We need to pass the interface index to + * bcm_recvmsg(). We pass a whole struct sockaddr_can in skb->cb + * containing the interface index. 
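/*
 * [Editor's note - illustrative sketch, not part of the original patch]
 * Userspace view of the datagram assembled above: one struct bcm_msg_head
 * directly followed by head.nframes can_frames. 'bcm_fd' is a hypothetical
 * descriptor of an already connect()ed CAN_BCM socket.
 */
#include <unistd.h>
#include <linux/can.h>
#include <linux/can/bcm.h>

static void read_one_bcm_update(int bcm_fd)
{
        struct {
                struct bcm_msg_head msg_head;
                struct can_frame frame; /* RX_CHANGED updates carry one frame */
        } msg;

        if (read(bcm_fd, &msg, sizeof(msg)) < (ssize_t)sizeof(msg.msg_head))
                return;

        if (msg.msg_head.opcode == RX_CHANGED && msg.msg_head.nframes == 1) {
                /* the changed payload for msg.msg_head.can_id is in msg.frame */
        }
}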
+ */ + + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct sockaddr_can)); + addr = (struct sockaddr_can *)skb->cb; + memset(addr, 0, sizeof(*addr)); + addr->can_family = AF_CAN; + addr->can_ifindex = op->rx_ifindex; + + err = sock_queue_rcv_skb(sk, skb); + if (err < 0) { + struct bcm_sock *bo = bcm_sk(sk); + + kfree_skb(skb); + /* don't care about overflows in this statistic */ + bo->dropped_usr_msgs++; + } +} + +static void bcm_tx_start_timer(struct bcm_op *op) +{ + if (op->kt_ival1.tv64 && op->count) + hrtimer_start(&op->timer, + ktime_add(ktime_get(), op->kt_ival1), + HRTIMER_MODE_ABS); + else if (op->kt_ival2.tv64) + hrtimer_start(&op->timer, + ktime_add(ktime_get(), op->kt_ival2), + HRTIMER_MODE_ABS); +} + +static void bcm_tx_timeout_tsklet(unsigned long data) +{ + struct bcm_op *op = (struct bcm_op *)data; + struct bcm_msg_head msg_head; + + if (op->kt_ival1.tv64 && (op->count > 0)) { + + op->count--; + if (!op->count && (op->flags & TX_COUNTEVT)) { + + /* create notification to user */ + msg_head.opcode = TX_EXPIRED; + msg_head.flags = op->flags; + msg_head.count = op->count; + msg_head.ival1 = op->ival1; + msg_head.ival2 = op->ival2; + msg_head.can_id = op->can_id; + msg_head.nframes = 0; + + bcm_send_to_user(op, &msg_head, NULL, 0); + } + bcm_can_tx(op); + + } else if (op->kt_ival2.tv64) + bcm_can_tx(op); + + bcm_tx_start_timer(op); +} + +/* + * bcm_tx_timeout_handler - performs cyclic CAN frame transmissions + */ +static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer) +{ + struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); + + tasklet_schedule(&op->tsklet); + + return HRTIMER_NORESTART; +} + +/* + * bcm_rx_changed - create a RX_CHANGED notification due to changed content + */ +static void bcm_rx_changed(struct bcm_op *op, struct can_frame *data) +{ + struct bcm_msg_head head; + + /* update statistics */ + op->frames_filtered++; + + /* prevent statistics overflow */ + if (op->frames_filtered > ULONG_MAX/100) + op->frames_filtered = op->frames_abs = 0; + + /* this element is not throttled anymore */ + data->can_dlc &= (BCM_CAN_DLC_MASK|RX_RECV); + + head.opcode = RX_CHANGED; + head.flags = op->flags; + head.count = op->count; + head.ival1 = op->ival1; + head.ival2 = op->ival2; + head.can_id = op->can_id; + head.nframes = 1; + + bcm_send_to_user(op, &head, data, 1); +} + +/* + * bcm_rx_update_and_send - process a detected relevant receive content change + * 1. update the last received data + * 2. send a notification to the user (if possible) + */ +static void bcm_rx_update_and_send(struct bcm_op *op, + struct can_frame *lastdata, + const struct can_frame *rxdata) +{ + memcpy(lastdata, rxdata, CFSIZ); + + /* mark as used and throttled by default */ + lastdata->can_dlc |= (RX_RECV|RX_THR); + + /* throtteling mode inactive ? */ + if (!op->kt_ival2.tv64) { + /* send RX_CHANGED to the user immediately */ + bcm_rx_changed(op, lastdata); + return; + } + + /* with active throttling timer we are just done here */ + if (hrtimer_active(&op->thrtimer)) + return; + + /* first receiption with enabled throttling mode */ + if (!op->kt_lastmsg.tv64) + goto rx_changed_settime; + + /* got a second frame inside a potential throttle period? 
*/ + if (ktime_us_delta(ktime_get(), op->kt_lastmsg) < + ktime_to_us(op->kt_ival2)) { + /* do not send the saved data - only start throttle timer */ + hrtimer_start(&op->thrtimer, + ktime_add(op->kt_lastmsg, op->kt_ival2), + HRTIMER_MODE_ABS); + return; + } + + /* the gap was that big, that throttling was not needed here */ +rx_changed_settime: + bcm_rx_changed(op, lastdata); + op->kt_lastmsg = ktime_get(); +} + +/* + * bcm_rx_cmp_to_index - (bit)compares the currently received data to formerly + * received data stored in op->last_frames[] + */ +static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index, + const struct can_frame *rxdata) +{ + /* + * no one uses the MSBs of can_dlc for comparation, + * so we use it here to detect the first time of reception + */ + + if (!(op->last_frames[index].can_dlc & RX_RECV)) { + /* received data for the first time => send update to user */ + bcm_rx_update_and_send(op, &op->last_frames[index], rxdata); + return; + } + + /* do a real check in can_frame data section */ + + if ((GET_U64(&op->frames[index]) & GET_U64(rxdata)) != + (GET_U64(&op->frames[index]) & GET_U64(&op->last_frames[index]))) { + bcm_rx_update_and_send(op, &op->last_frames[index], rxdata); + return; + } + + if (op->flags & RX_CHECK_DLC) { + /* do a real check in can_frame dlc */ + if (rxdata->can_dlc != (op->last_frames[index].can_dlc & + BCM_CAN_DLC_MASK)) { + bcm_rx_update_and_send(op, &op->last_frames[index], + rxdata); + return; + } + } +} + +/* + * bcm_rx_starttimer - enable timeout monitoring for CAN frame receiption + */ +static void bcm_rx_starttimer(struct bcm_op *op) +{ + if (op->flags & RX_NO_AUTOTIMER) + return; + + if (op->kt_ival1.tv64) + hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL); +} + +static void bcm_rx_timeout_tsklet(unsigned long data) +{ + struct bcm_op *op = (struct bcm_op *)data; + struct bcm_msg_head msg_head; + + /* create notification to user */ + msg_head.opcode = RX_TIMEOUT; + msg_head.flags = op->flags; + msg_head.count = op->count; + msg_head.ival1 = op->ival1; + msg_head.ival2 = op->ival2; + msg_head.can_id = op->can_id; + msg_head.nframes = 0; + + bcm_send_to_user(op, &msg_head, NULL, 0); +} + +/* + * bcm_rx_timeout_handler - when the (cyclic) CAN frame receiption timed out + */ +static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) +{ + struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); + + /* schedule before NET_RX_SOFTIRQ */ + tasklet_hi_schedule(&op->tsklet); + + /* no restart of the timer is done here! 
*/ + + /* if user wants to be informed, when cyclic CAN-Messages come back */ + if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) { + /* clear received can_frames to indicate 'nothing received' */ + memset(op->last_frames, 0, op->nframes * CFSIZ); + } + + return HRTIMER_NORESTART; +} + +/* + * bcm_rx_do_flush - helper for bcm_rx_thr_flush + */ +static inline int bcm_rx_do_flush(struct bcm_op *op, int update, + unsigned int index) +{ + if ((op->last_frames) && (op->last_frames[index].can_dlc & RX_THR)) { + if (update) + bcm_rx_changed(op, &op->last_frames[index]); + return 1; + } + return 0; +} + +/* + * bcm_rx_thr_flush - Check for throttled data and send it to the userspace + * + * update == 0 : just check if throttled data is available (any irq context) + * update == 1 : check and send throttled data to userspace (soft_irq context) + */ +static int bcm_rx_thr_flush(struct bcm_op *op, int update) +{ + int updated = 0; + + if (op->nframes > 1) { + unsigned int i; + + /* for MUX filter we start at index 1 */ + for (i = 1; i < op->nframes; i++) + updated += bcm_rx_do_flush(op, update, i); + + } else { + /* for RX_FILTER_ID and simple filter */ + updated += bcm_rx_do_flush(op, update, 0); + } + + return updated; +} + +static void bcm_rx_thr_tsklet(unsigned long data) +{ + struct bcm_op *op = (struct bcm_op *)data; + + /* push the changed data to the userspace */ + bcm_rx_thr_flush(op, 1); +} + +/* + * bcm_rx_thr_handler - the time for blocked content updates is over now: + * Check for throttled data and send it to the userspace + */ +static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer) +{ + struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer); + + tasklet_schedule(&op->thrtsklet); + + if (bcm_rx_thr_flush(op, 0)) { + hrtimer_forward(hrtimer, ktime_get(), op->kt_ival2); + return HRTIMER_RESTART; + } else { + /* rearm throttle handling */ + op->kt_lastmsg = ktime_set(0, 0); + return HRTIMER_NORESTART; + } +} + +/* + * bcm_rx_handler - handle a CAN frame receiption + */ +static void bcm_rx_handler(struct sk_buff *skb, void *data) +{ + struct bcm_op *op = (struct bcm_op *)data; + const struct can_frame *rxframe = (struct can_frame *)skb->data; + unsigned int i; + + /* disable timeout */ + hrtimer_cancel(&op->timer); + + if (op->can_id != rxframe->can_id) + return; + + /* save rx timestamp */ + op->rx_stamp = skb->tstamp; + /* save originator for recvfrom() */ + op->rx_ifindex = skb->dev->ifindex; + /* update statistics */ + op->frames_abs++; + + if (op->flags & RX_RTR_FRAME) { + /* send reply for RTR-request (placed in op->frames[0]) */ + bcm_can_tx(op); + return; + } + + if (op->flags & RX_FILTER_ID) { + /* the easiest case */ + bcm_rx_update_and_send(op, &op->last_frames[0], rxframe); + goto rx_starttimer; + } + + if (op->nframes == 1) { + /* simple compare with index 0 */ + bcm_rx_cmp_to_index(op, 0, rxframe); + goto rx_starttimer; + } + + if (op->nframes > 1) { + /* + * multiplex compare + * + * find the first multiplex mask that fits. 
+ * Remark: The MUX-mask is stored in index 0 + */ + + for (i = 1; i < op->nframes; i++) { + if ((GET_U64(&op->frames[0]) & GET_U64(rxframe)) == + (GET_U64(&op->frames[0]) & + GET_U64(&op->frames[i]))) { + bcm_rx_cmp_to_index(op, i, rxframe); + break; + } + } + } + +rx_starttimer: + bcm_rx_starttimer(op); +} + +/* + * helpers for bcm_op handling: find & delete bcm [rx|tx] op elements + */ +static struct bcm_op *bcm_find_op(struct list_head *ops, canid_t can_id, + int ifindex) +{ + struct bcm_op *op; + + list_for_each_entry(op, ops, list) { + if ((op->can_id == can_id) && (op->ifindex == ifindex)) + return op; + } + + return NULL; +} + +static void bcm_remove_op(struct bcm_op *op) +{ + hrtimer_cancel(&op->timer); + hrtimer_cancel(&op->thrtimer); + + if (op->tsklet.func) + tasklet_kill(&op->tsklet); + + if (op->thrtsklet.func) + tasklet_kill(&op->thrtsklet); + + if ((op->frames) && (op->frames != &op->sframe)) + kfree(op->frames); + + if ((op->last_frames) && (op->last_frames != &op->last_sframe)) + kfree(op->last_frames); + + kfree(op); +} + +static void bcm_rx_unreg(struct net_device *dev, struct bcm_op *op) +{ + if (op->rx_reg_dev == dev) { + can_rx_unregister(dev, op->can_id, REGMASK(op->can_id), + bcm_rx_handler, op); + + /* mark as removed subscription */ + op->rx_reg_dev = NULL; + } else + printk(KERN_ERR "can-bcm: bcm_rx_unreg: registered device " + "mismatch %p %p\n", op->rx_reg_dev, dev); +} + +/* + * bcm_delete_rx_op - find and remove a rx op (returns number of removed ops) + */ +static int bcm_delete_rx_op(struct list_head *ops, canid_t can_id, int ifindex) +{ + struct bcm_op *op, *n; + + list_for_each_entry_safe(op, n, ops, list) { + if ((op->can_id == can_id) && (op->ifindex == ifindex)) { + + /* + * Don't care if we're bound or not (due to netdev + * problems) can_rx_unregister() is always a save + * thing to do here. 
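/*
 * [Editor's note - illustrative sketch, not part of the original patch]
 * The multiplex comparison performed by bcm_rx_handler() in isolation:
 * op->frames[0] acts as the MUX mask, op->frames[1..nframes-1] hold the
 * expected masked data patterns. Entry i matches an incoming frame when
 * both agree on every bit selected by the mask.
 */
static inline int bcm_mux_matches(const struct can_frame *mux_mask,
                                  const struct can_frame *expected,
                                  const struct can_frame *rxframe)
{
        return (GET_U64(mux_mask) & GET_U64(rxframe)) ==
               (GET_U64(mux_mask) & GET_U64(expected));
}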
+ */ + if (op->ifindex) { + /* + * Only remove subscriptions that had not + * been removed due to NETDEV_UNREGISTER + * in bcm_notifier() + */ + if (op->rx_reg_dev) { + struct net_device *dev; + + dev = dev_get_by_index(&init_net, + op->ifindex); + if (dev) { + bcm_rx_unreg(dev, op); + dev_put(dev); + } + } + } else + can_rx_unregister(NULL, op->can_id, + REGMASK(op->can_id), + bcm_rx_handler, op); + + list_del(&op->list); + bcm_remove_op(op); + return 1; /* done */ + } + } + + return 0; /* not found */ +} + +/* + * bcm_delete_tx_op - find and remove a tx op (returns number of removed ops) + */ +static int bcm_delete_tx_op(struct list_head *ops, canid_t can_id, int ifindex) +{ + struct bcm_op *op, *n; + + list_for_each_entry_safe(op, n, ops, list) { + if ((op->can_id == can_id) && (op->ifindex == ifindex)) { + list_del(&op->list); + bcm_remove_op(op); + return 1; /* done */ + } + } + + return 0; /* not found */ +} + +/* + * bcm_read_op - read out a bcm_op and send it to the user (for bcm_sendmsg) + */ +static int bcm_read_op(struct list_head *ops, struct bcm_msg_head *msg_head, + int ifindex) +{ + struct bcm_op *op = bcm_find_op(ops, msg_head->can_id, ifindex); + + if (!op) + return -EINVAL; + + /* put current values into msg_head */ + msg_head->flags = op->flags; + msg_head->count = op->count; + msg_head->ival1 = op->ival1; + msg_head->ival2 = op->ival2; + msg_head->nframes = op->nframes; + + bcm_send_to_user(op, msg_head, op->frames, 0); + + return MHSIZ; +} + +/* + * bcm_tx_setup - create or update a bcm tx op (for bcm_sendmsg) + */ +static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, + int ifindex, struct sock *sk) +{ + struct bcm_sock *bo = bcm_sk(sk); + struct bcm_op *op; + unsigned int i; + int err; + + /* we need a real device to send frames */ + if (!ifindex) + return -ENODEV; + + /* check nframes boundaries - we need at least one can_frame */ + if (msg_head->nframes < 1 || msg_head->nframes > MAX_NFRAMES) + return -EINVAL; + + /* check the given can_id */ + op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex); + + if (op) { + /* update existing BCM operation */ + + /* + * Do we need more space for the can_frames than currently + * allocated? -> This is a _really_ unusual use-case and + * therefore (complexity / locking) it is not supported. 
+ */ + if (msg_head->nframes > op->nframes) + return -E2BIG; + + /* update can_frames content */ + for (i = 0; i < msg_head->nframes; i++) { + err = memcpy_fromiovec((u8 *)&op->frames[i], + msg->msg_iov, CFSIZ); + + if (op->frames[i].can_dlc > 8) + err = -EINVAL; + + if (err < 0) + return err; + + if (msg_head->flags & TX_CP_CAN_ID) { + /* copy can_id into frame */ + op->frames[i].can_id = msg_head->can_id; + } + } + + } else { + /* insert new BCM operation for the given can_id */ + + op = kzalloc(OPSIZ, GFP_KERNEL); + if (!op) + return -ENOMEM; + + op->can_id = msg_head->can_id; + + /* create array for can_frames and copy the data */ + if (msg_head->nframes > 1) { + op->frames = kmalloc(msg_head->nframes * CFSIZ, + GFP_KERNEL); + if (!op->frames) { + kfree(op); + return -ENOMEM; + } + } else + op->frames = &op->sframe; + + for (i = 0; i < msg_head->nframes; i++) { + err = memcpy_fromiovec((u8 *)&op->frames[i], + msg->msg_iov, CFSIZ); + + if (op->frames[i].can_dlc > 8) + err = -EINVAL; + + if (err < 0) { + if (op->frames != &op->sframe) + kfree(op->frames); + kfree(op); + return err; + } + + if (msg_head->flags & TX_CP_CAN_ID) { + /* copy can_id into frame */ + op->frames[i].can_id = msg_head->can_id; + } + } + + /* tx_ops never compare with previous received messages */ + op->last_frames = NULL; + + /* bcm_can_tx / bcm_tx_timeout_handler needs this */ + op->sk = sk; + op->ifindex = ifindex; + + /* initialize uninitialized (kzalloc) structure */ + hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + op->timer.function = bcm_tx_timeout_handler; + + /* initialize tasklet for tx countevent notification */ + tasklet_init(&op->tsklet, bcm_tx_timeout_tsklet, + (unsigned long) op); + + /* currently unused in tx_ops */ + hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + + /* add this bcm_op to the list of the tx_ops */ + list_add(&op->list, &bo->tx_ops); + + } /* if ((op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex))) */ + + if (op->nframes != msg_head->nframes) { + op->nframes = msg_head->nframes; + /* start multiple frame transmission with index 0 */ + op->currframe = 0; + } + + /* check flags */ + + op->flags = msg_head->flags; + + if (op->flags & TX_RESET_MULTI_IDX) { + /* start multiple frame transmission with index 0 */ + op->currframe = 0; + } + + if (op->flags & SETTIMER) { + /* set timer values */ + op->count = msg_head->count; + op->ival1 = msg_head->ival1; + op->ival2 = msg_head->ival2; + op->kt_ival1 = timeval_to_ktime(msg_head->ival1); + op->kt_ival2 = timeval_to_ktime(msg_head->ival2); + + /* disable an active timer due to zero values? */ + if (!op->kt_ival1.tv64 && !op->kt_ival2.tv64) + hrtimer_cancel(&op->timer); + } + + if (op->flags & STARTTIMER) { + hrtimer_cancel(&op->timer); + /* spec: send can_frame when starting timer */ + op->flags |= TX_ANNOUNCE; + } + + if (op->flags & TX_ANNOUNCE) { + bcm_can_tx(op); + if (op->count) + op->count--; + } + + if (op->flags & STARTTIMER) + bcm_tx_start_timer(op); + + return msg_head->nframes * CFSIZ + MHSIZ; +} + +/* + * bcm_rx_setup - create or update a bcm rx op (for bcm_sendmsg) + */ +static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, + int ifindex, struct sock *sk) +{ + struct bcm_sock *bo = bcm_sk(sk); + struct bcm_op *op; + int do_rx_register; + int err = 0; + + if ((msg_head->flags & RX_FILTER_ID) || (!(msg_head->nframes))) { + /* be robust against wrong usage ... 
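/*
 * [Editor's note - illustrative sketch, not part of the original patch]
 * Typical userspace trigger for the TX_SETUP path above: a bcm_msg_head
 * flagged SETTIMER|STARTTIMER plus one can_frame written to a connect()ed
 * CAN_BCM socket. 'bcm_fd', the 100 ms cycle time and CAN ID 0x123 are
 * example values only; count == 0 with only ival2 set yields an endless
 * cyclic transmission.
 */
#include <string.h>
#include <unistd.h>
#include <linux/can.h>
#include <linux/can/bcm.h>

static int start_cyclic_tx(int bcm_fd)
{
        struct {
                struct bcm_msg_head msg_head;
                struct can_frame frame;
        } msg;

        memset(&msg, 0, sizeof(msg));
        msg.msg_head.opcode  = TX_SETUP;
        msg.msg_head.flags   = SETTIMER | STARTTIMER | TX_CP_CAN_ID;
        msg.msg_head.can_id  = 0x123;
        msg.msg_head.nframes = 1;
        msg.msg_head.ival2.tv_usec = 100000;    /* 100 ms cycle */
        msg.frame.can_dlc = 2;
        msg.frame.data[0] = 0xde;
        msg.frame.data[1] = 0xad;

        if (write(bcm_fd, &msg, sizeof(msg)) != (ssize_t)sizeof(msg))
                return -1;

        return 0;
}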
*/ + msg_head->flags |= RX_FILTER_ID; + /* ignore trailing garbage */ + msg_head->nframes = 0; + } + + /* the first element contains the mux-mask => MAX_NFRAMES + 1 */ + if (msg_head->nframes > MAX_NFRAMES + 1) + return -EINVAL; + + if ((msg_head->flags & RX_RTR_FRAME) && + ((msg_head->nframes != 1) || + (!(msg_head->can_id & CAN_RTR_FLAG)))) + return -EINVAL; + + /* check the given can_id */ + op = bcm_find_op(&bo->rx_ops, msg_head->can_id, ifindex); + if (op) { + /* update existing BCM operation */ + + /* + * Do we need more space for the can_frames than currently + * allocated? -> This is a _really_ unusual use-case and + * therefore (complexity / locking) it is not supported. + */ + if (msg_head->nframes > op->nframes) + return -E2BIG; + + if (msg_head->nframes) { + /* update can_frames content */ + err = memcpy_fromiovec((u8 *)op->frames, + msg->msg_iov, + msg_head->nframes * CFSIZ); + if (err < 0) + return err; + + /* clear last_frames to indicate 'nothing received' */ + memset(op->last_frames, 0, msg_head->nframes * CFSIZ); + } + + op->nframes = msg_head->nframes; + + /* Only an update -> do not call can_rx_register() */ + do_rx_register = 0; + + } else { + /* insert new BCM operation for the given can_id */ + op = kzalloc(OPSIZ, GFP_KERNEL); + if (!op) + return -ENOMEM; + + op->can_id = msg_head->can_id; + op->nframes = msg_head->nframes; + + if (msg_head->nframes > 1) { + /* create array for can_frames and copy the data */ + op->frames = kmalloc(msg_head->nframes * CFSIZ, + GFP_KERNEL); + if (!op->frames) { + kfree(op); + return -ENOMEM; + } + + /* create and init array for received can_frames */ + op->last_frames = kzalloc(msg_head->nframes * CFSIZ, + GFP_KERNEL); + if (!op->last_frames) { + kfree(op->frames); + kfree(op); + return -ENOMEM; + } + + } else { + op->frames = &op->sframe; + op->last_frames = &op->last_sframe; + } + + if (msg_head->nframes) { + err = memcpy_fromiovec((u8 *)op->frames, msg->msg_iov, + msg_head->nframes * CFSIZ); + if (err < 0) { + if (op->frames != &op->sframe) + kfree(op->frames); + if (op->last_frames != &op->last_sframe) + kfree(op->last_frames); + kfree(op); + return err; + } + } + + /* bcm_can_tx / bcm_tx_timeout_handler needs this */ + op->sk = sk; + op->ifindex = ifindex; + + /* initialize uninitialized (kzalloc) structure */ + hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + op->timer.function = bcm_rx_timeout_handler; + + /* initialize tasklet for rx timeout notification */ + tasklet_init(&op->tsklet, bcm_rx_timeout_tsklet, + (unsigned long) op); + + hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + op->thrtimer.function = bcm_rx_thr_handler; + + /* initialize tasklet for rx throttle handling */ + tasklet_init(&op->thrtsklet, bcm_rx_thr_tsklet, + (unsigned long) op); + + /* add this bcm_op to the list of the rx_ops */ + list_add(&op->list, &bo->rx_ops); + + /* call can_rx_register() */ + do_rx_register = 1; + + } /* if ((op = bcm_find_op(&bo->rx_ops, msg_head->can_id, ifindex))) */ + + /* check flags */ + op->flags = msg_head->flags; + + if (op->flags & RX_RTR_FRAME) { + + /* no timers in RTR-mode */ + hrtimer_cancel(&op->thrtimer); + hrtimer_cancel(&op->timer); + + /* + * funny feature in RX(!)_SETUP only for RTR-mode: + * copy can_id into frame BUT without RTR-flag to + * prevent a full-load-loopback-test ... 
;-] + */ + if ((op->flags & TX_CP_CAN_ID) || + (op->frames[0].can_id == op->can_id)) + op->frames[0].can_id = op->can_id & ~CAN_RTR_FLAG; + + } else { + if (op->flags & SETTIMER) { + + /* set timer value */ + op->ival1 = msg_head->ival1; + op->ival2 = msg_head->ival2; + op->kt_ival1 = timeval_to_ktime(msg_head->ival1); + op->kt_ival2 = timeval_to_ktime(msg_head->ival2); + + /* disable an active timer due to zero value? */ + if (!op->kt_ival1.tv64) + hrtimer_cancel(&op->timer); + + /* + * In any case cancel the throttle timer, flush + * potentially blocked msgs and reset throttle handling + */ + op->kt_lastmsg = ktime_set(0, 0); + hrtimer_cancel(&op->thrtimer); + bcm_rx_thr_flush(op, 1); + } + + if ((op->flags & STARTTIMER) && op->kt_ival1.tv64) + hrtimer_start(&op->timer, op->kt_ival1, + HRTIMER_MODE_REL); + } + + /* now we can register for can_ids, if we added a new bcm_op */ + if (do_rx_register) { + if (ifindex) { + struct net_device *dev; + + dev = dev_get_by_index(&init_net, ifindex); + if (dev) { + err = can_rx_register(dev, op->can_id, + REGMASK(op->can_id), + bcm_rx_handler, op, + "bcm"); + + op->rx_reg_dev = dev; + dev_put(dev); + } + + } else + err = can_rx_register(NULL, op->can_id, + REGMASK(op->can_id), + bcm_rx_handler, op, "bcm"); + if (err) { + /* this bcm rx op is broken -> remove it */ + list_del(&op->list); + bcm_remove_op(op); + return err; + } + } + + return msg_head->nframes * CFSIZ + MHSIZ; +} + +/* + * bcm_tx_send - send a single CAN frame to the CAN interface (for bcm_sendmsg) + */ +static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk) +{ + struct sk_buff *skb; + struct net_device *dev; + int err; + + /* we need a real device to send frames */ + if (!ifindex) + return -ENODEV; + + skb = alloc_skb(CFSIZ, GFP_KERNEL); + + if (!skb) + return -ENOMEM; + + err = memcpy_fromiovec(skb_put(skb, CFSIZ), msg->msg_iov, CFSIZ); + if (err < 0) { + kfree_skb(skb); + return err; + } + + dev = dev_get_by_index(&init_net, ifindex); + if (!dev) { + kfree_skb(skb); + return -ENODEV; + } + + skb->dev = dev; + skb->sk = sk; + err = can_send(skb, 1); /* send with loopback */ + dev_put(dev); + + if (err) + return err; + + return CFSIZ + MHSIZ; +} + +/* + * bcm_sendmsg - process BCM commands (opcodes) from the userspace + */ +static int bcm_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size) +{ + struct sock *sk = sock->sk; + struct bcm_sock *bo = bcm_sk(sk); + int ifindex = bo->ifindex; /* default ifindex for this bcm_op */ + struct bcm_msg_head msg_head; + int ret; /* read bytes or error codes as return value */ + + if (!bo->bound) + return -ENOTCONN; + + /* check for valid message length from userspace */ + if (size < MHSIZ || (size - MHSIZ) % CFSIZ) + return -EINVAL; + + /* check for alternative ifindex for this bcm_op */ + + if (!ifindex && msg->msg_name) { + /* no bound device as default => check msg_name */ + struct sockaddr_can *addr = + (struct sockaddr_can *)msg->msg_name; + + if (msg->msg_namelen < sizeof(*addr)) + return -EINVAL; + + if (addr->can_family != AF_CAN) + return -EINVAL; + + /* ifindex from sendto() */ + ifindex = addr->can_ifindex; + + if (ifindex) { + struct net_device *dev; + + dev = dev_get_by_index(&init_net, ifindex); + if (!dev) + return -ENODEV; + + if (dev->type != ARPHRD_CAN) { + dev_put(dev); + return -ENODEV; + } + + dev_put(dev); + } + } + + /* read message head information */ + + ret = memcpy_fromiovec((u8 *)&msg_head, msg->msg_iov, MHSIZ); + if (ret < 0) + return ret; + + lock_sock(sk); + + switch 
(msg_head.opcode) { + + case TX_SETUP: + ret = bcm_tx_setup(&msg_head, msg, ifindex, sk); + break; + + case RX_SETUP: + ret = bcm_rx_setup(&msg_head, msg, ifindex, sk); + break; + + case TX_DELETE: + if (bcm_delete_tx_op(&bo->tx_ops, msg_head.can_id, ifindex)) + ret = MHSIZ; + else + ret = -EINVAL; + break; + + case RX_DELETE: + if (bcm_delete_rx_op(&bo->rx_ops, msg_head.can_id, ifindex)) + ret = MHSIZ; + else + ret = -EINVAL; + break; + + case TX_READ: + /* reuse msg_head for the reply to TX_READ */ + msg_head.opcode = TX_STATUS; + ret = bcm_read_op(&bo->tx_ops, &msg_head, ifindex); + break; + + case RX_READ: + /* reuse msg_head for the reply to RX_READ */ + msg_head.opcode = RX_STATUS; + ret = bcm_read_op(&bo->rx_ops, &msg_head, ifindex); + break; + + case TX_SEND: + /* we need exactly one can_frame behind the msg head */ + if ((msg_head.nframes != 1) || (size != CFSIZ + MHSIZ)) + ret = -EINVAL; + else + ret = bcm_tx_send(msg, ifindex, sk); + break; + + default: + ret = -EINVAL; + break; + } + + release_sock(sk); + + return ret; +} + +/* + * notification handler for netdevice status changes + */ +static int bcm_notifier(struct notifier_block *nb, unsigned long msg, + void *data) +{ + struct net_device *dev = (struct net_device *)data; + struct bcm_sock *bo = container_of(nb, struct bcm_sock, notifier); + struct sock *sk = &bo->sk; + struct bcm_op *op; + int notify_enodev = 0; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + if (dev->type != ARPHRD_CAN) + return NOTIFY_DONE; + + switch (msg) { + + case NETDEV_UNREGISTER: + lock_sock(sk); + + /* remove device specific receive entries */ + list_for_each_entry(op, &bo->rx_ops, list) + if (op->rx_reg_dev == dev) + bcm_rx_unreg(dev, op); + + /* remove device reference, if this is our bound device */ + if (bo->bound && bo->ifindex == dev->ifindex) { + bo->bound = 0; + bo->ifindex = 0; + notify_enodev = 1; + } + + release_sock(sk); + + if (notify_enodev) { + sk->sk_err = ENODEV; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + } + break; + + case NETDEV_DOWN: + if (bo->bound && bo->ifindex == dev->ifindex) { + sk->sk_err = ENETDOWN; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + } + } + + return NOTIFY_DONE; +} + +/* + * initial settings for all BCM sockets to be set at socket creation time + */ +static int bcm_init(struct sock *sk) +{ + struct bcm_sock *bo = bcm_sk(sk); + + bo->bound = 0; + bo->ifindex = 0; + bo->dropped_usr_msgs = 0; + bo->bcm_proc_read = NULL; + + INIT_LIST_HEAD(&bo->tx_ops); + INIT_LIST_HEAD(&bo->rx_ops); + + /* set notifier */ + bo->notifier.notifier_call = bcm_notifier; + + register_netdevice_notifier(&bo->notifier); + + return 0; +} + +/* + * standard socket functions + */ +static int bcm_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct bcm_sock *bo; + struct bcm_op *op, *next; + + if (sk == NULL) + return 0; + + bo = bcm_sk(sk); + + /* remove bcm_ops, timer, rx_unregister(), etc. */ + + unregister_netdevice_notifier(&bo->notifier); + + lock_sock(sk); + + list_for_each_entry_safe(op, next, &bo->tx_ops, list) + bcm_remove_op(op); + + list_for_each_entry_safe(op, next, &bo->rx_ops, list) { + /* + * Don't care if we're bound or not (due to netdev problems) + * can_rx_unregister() is always a save thing to do here. 
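/*
 * [Editor's note - illustrative sketch, not part of the original patch]
 * The TX_SEND branch of bcm_sendmsg() above insists on exactly
 * MHSIZ + CFSIZ bytes: one bcm_msg_head immediately followed by a single
 * can_frame. 'bcm_fd' is again a hypothetical connect()ed CAN_BCM socket.
 */
#include <string.h>
#include <unistd.h>
#include <linux/can.h>
#include <linux/can/bcm.h>

static int bcm_send_one(int bcm_fd, const struct can_frame *cf)
{
        struct {
                struct bcm_msg_head msg_head;
                struct can_frame frame;
        } msg;

        memset(&msg, 0, sizeof(msg));
        msg.msg_head.opcode  = TX_SEND;
        msg.msg_head.can_id  = cf->can_id;
        msg.msg_head.nframes = 1;
        msg.frame = *cf;

        if (write(bcm_fd, &msg, sizeof(msg)) != (ssize_t)sizeof(msg))
                return -1;

        return 0;
}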
+ */ + if (op->ifindex) { + /* + * Only remove subscriptions that had not + * been removed due to NETDEV_UNREGISTER + * in bcm_notifier() + */ + if (op->rx_reg_dev) { + struct net_device *dev; + + dev = dev_get_by_index(&init_net, op->ifindex); + if (dev) { + bcm_rx_unreg(dev, op); + dev_put(dev); + } + } + } else + can_rx_unregister(NULL, op->can_id, + REGMASK(op->can_id), + bcm_rx_handler, op); + + bcm_remove_op(op); + } + + /* remove procfs entry */ + if (proc_dir && bo->bcm_proc_read) + remove_proc_entry(bo->procname, proc_dir); + + /* remove device reference */ + if (bo->bound) { + bo->bound = 0; + bo->ifindex = 0; + } + + sock_orphan(sk); + sock->sk = NULL; + + release_sock(sk); + sock_put(sk); + + return 0; +} + +static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len, + int flags) +{ + struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; + struct sock *sk = sock->sk; + struct bcm_sock *bo = bcm_sk(sk); + + if (len < sizeof(*addr)) + return -EINVAL; + + if (bo->bound) + return -EISCONN; + + /* bind a device to this socket */ + if (addr->can_ifindex) { + struct net_device *dev; + + dev = dev_get_by_index(&init_net, addr->can_ifindex); + if (!dev) + return -ENODEV; + + if (dev->type != ARPHRD_CAN) { + dev_put(dev); + return -ENODEV; + } + + bo->ifindex = dev->ifindex; + dev_put(dev); + + } else { + /* no interface reference for ifindex = 0 ('any' CAN device) */ + bo->ifindex = 0; + } + + bo->bound = 1; + + if (proc_dir) { + /* unique socket address as filename */ + sprintf(bo->procname, "%lu", sock_i_ino(sk)); + bo->bcm_proc_read = proc_create_data(bo->procname, 0644, + proc_dir, + &bcm_proc_fops, sk); + } + + return 0; +} + +static int bcm_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + int error = 0; + int noblock; + int err; + + noblock = flags & MSG_DONTWAIT; + flags &= ~MSG_DONTWAIT; + skb = skb_recv_datagram(sk, flags, noblock, &error); + if (!skb) + return error; + + if (skb->len < size) + size = skb->len; + + err = memcpy_toiovec(msg->msg_iov, skb->data, size); + if (err < 0) { + skb_free_datagram(sk, skb); + return err; + } + + sock_recv_ts_and_drops(msg, sk, skb); + + if (msg->msg_name) { + msg->msg_namelen = sizeof(struct sockaddr_can); + memcpy(msg->msg_name, skb->cb, msg->msg_namelen); + } + + skb_free_datagram(sk, skb); + + return size; +} + +static const struct proto_ops bcm_ops = { + .family = PF_CAN, + .release = bcm_release, + .bind = sock_no_bind, + .connect = bcm_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = datagram_poll, + .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = bcm_sendmsg, + .recvmsg = bcm_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct proto bcm_proto __read_mostly = { + .name = "CAN_BCM", + .owner = THIS_MODULE, + .obj_size = sizeof(struct bcm_sock), + .init = bcm_init, +}; + +static const struct can_proto bcm_can_proto = { + .type = SOCK_DGRAM, + .protocol = CAN_BCM, + .ops = &bcm_ops, + .prot = &bcm_proto, +}; + +static int __init bcm_module_init(void) +{ + int err; + + printk(banner); + + err = can_proto_register(&bcm_can_proto); + if (err < 0) { + printk(KERN_ERR "can: registration of bcm protocol failed\n"); + return err; + } + + /* create /proc/net/can-bcm 
directory */ + proc_dir = proc_mkdir("can-bcm", init_net.proc_net); + return 0; +} + +static void __exit bcm_module_exit(void) +{ + can_proto_unregister(&bcm_can_proto); + + if (proc_dir) + proc_net_remove(&init_net, "can-bcm"); +} + +module_init(bcm_module_init); +module_exit(bcm_module_exit); diff --git a/net/can/proc.c b/net/can/proc.c new file mode 100644 index 00000000..0016f733 --- /dev/null +++ b/net/can/proc.c @@ -0,0 +1,540 @@ +/* + * proc.c - procfs support for Protocol family CAN core module + * + * Copyright (c) 2002-2007 Volkswagen Group Electronic Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of Volkswagen nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * The provided data structures and external interfaces from this code + * are not restricted to be used by modules with a GPL compatible license. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ * + * Send feedback to + * + */ + +#include +#include +#include +#include +#include +#include + +#include "af_can.h" + +/* + * proc filenames for the PF_CAN core + */ + +#define CAN_PROC_VERSION "version" +#define CAN_PROC_STATS "stats" +#define CAN_PROC_RESET_STATS "reset_stats" +#define CAN_PROC_RCVLIST_ALL "rcvlist_all" +#define CAN_PROC_RCVLIST_FIL "rcvlist_fil" +#define CAN_PROC_RCVLIST_INV "rcvlist_inv" +#define CAN_PROC_RCVLIST_SFF "rcvlist_sff" +#define CAN_PROC_RCVLIST_EFF "rcvlist_eff" +#define CAN_PROC_RCVLIST_ERR "rcvlist_err" + +static struct proc_dir_entry *can_dir; +static struct proc_dir_entry *pde_version; +static struct proc_dir_entry *pde_stats; +static struct proc_dir_entry *pde_reset_stats; +static struct proc_dir_entry *pde_rcvlist_all; +static struct proc_dir_entry *pde_rcvlist_fil; +static struct proc_dir_entry *pde_rcvlist_inv; +static struct proc_dir_entry *pde_rcvlist_sff; +static struct proc_dir_entry *pde_rcvlist_eff; +static struct proc_dir_entry *pde_rcvlist_err; + +static int user_reset; + +static const char rx_list_name[][8] = { + [RX_ERR] = "rx_err", + [RX_ALL] = "rx_all", + [RX_FIL] = "rx_fil", + [RX_INV] = "rx_inv", + [RX_EFF] = "rx_eff", +}; + +/* receive filters subscribed for 'all' CAN devices */ +extern struct dev_rcv_lists can_rx_alldev_list; + +/* + * af_can statistics stuff + */ + +static void can_init_stats(void) +{ + /* + * This memset function is called from a timer context (when + * can_stattimer is active which is the default) OR in a process + * context (reading the proc_fs when can_stattimer is disabled). + */ + memset(&can_stats, 0, sizeof(can_stats)); + can_stats.jiffies_init = jiffies; + + can_pstats.stats_reset++; + + if (user_reset) { + user_reset = 0; + can_pstats.user_reset++; + } +} + +static unsigned long calc_rate(unsigned long oldjif, unsigned long newjif, + unsigned long count) +{ + unsigned long rate; + + if (oldjif == newjif) + return 0; + + /* see can_stat_update() - this should NEVER happen! */ + if (count > (ULONG_MAX / HZ)) { + printk(KERN_ERR "can: calc_rate: count exceeded! 
%ld\n", + count); + return 99999999; + } + + rate = (count * HZ) / (newjif - oldjif); + + return rate; +} + +void can_stat_update(unsigned long data) +{ + unsigned long j = jiffies; /* snapshot */ + + /* restart counting in timer context on user request */ + if (user_reset) + can_init_stats(); + + /* restart counting on jiffies overflow */ + if (j < can_stats.jiffies_init) + can_init_stats(); + + /* prevent overflow in calc_rate() */ + if (can_stats.rx_frames > (ULONG_MAX / HZ)) + can_init_stats(); + + /* prevent overflow in calc_rate() */ + if (can_stats.tx_frames > (ULONG_MAX / HZ)) + can_init_stats(); + + /* matches overflow - very improbable */ + if (can_stats.matches > (ULONG_MAX / 100)) + can_init_stats(); + + /* calc total values */ + if (can_stats.rx_frames) + can_stats.total_rx_match_ratio = (can_stats.matches * 100) / + can_stats.rx_frames; + + can_stats.total_tx_rate = calc_rate(can_stats.jiffies_init, j, + can_stats.tx_frames); + can_stats.total_rx_rate = calc_rate(can_stats.jiffies_init, j, + can_stats.rx_frames); + + /* calc current values */ + if (can_stats.rx_frames_delta) + can_stats.current_rx_match_ratio = + (can_stats.matches_delta * 100) / + can_stats.rx_frames_delta; + + can_stats.current_tx_rate = calc_rate(0, HZ, can_stats.tx_frames_delta); + can_stats.current_rx_rate = calc_rate(0, HZ, can_stats.rx_frames_delta); + + /* check / update maximum values */ + if (can_stats.max_tx_rate < can_stats.current_tx_rate) + can_stats.max_tx_rate = can_stats.current_tx_rate; + + if (can_stats.max_rx_rate < can_stats.current_rx_rate) + can_stats.max_rx_rate = can_stats.current_rx_rate; + + if (can_stats.max_rx_match_ratio < can_stats.current_rx_match_ratio) + can_stats.max_rx_match_ratio = can_stats.current_rx_match_ratio; + + /* clear values for 'current rate' calculation */ + can_stats.tx_frames_delta = 0; + can_stats.rx_frames_delta = 0; + can_stats.matches_delta = 0; + + /* restart timer (one second) */ + mod_timer(&can_stattimer, round_jiffies(jiffies + HZ)); +} + +/* + * proc read functions + */ + +static void can_print_rcvlist(struct seq_file *m, struct hlist_head *rx_list, + struct net_device *dev) +{ + struct receiver *r; + struct hlist_node *n; + + hlist_for_each_entry_rcu(r, n, rx_list, list) { + char *fmt = (r->can_id & CAN_EFF_FLAG)? + " %-5s %08x %08x %pK %pK %8ld %s\n" : + " %-5s %03x %08x %pK %pK %8ld %s\n"; + + seq_printf(m, fmt, DNAME(dev), r->can_id, r->mask, + r->func, r->data, r->matches, r->ident); + } +} + +static void can_print_recv_banner(struct seq_file *m) +{ + /* + * can1. 00000000 00000000 00000000 + * ....... 
0 tp20 + */ + seq_puts(m, " device can_id can_mask function" + " userdata matches ident\n"); +} + +static int can_stats_proc_show(struct seq_file *m, void *v) +{ + seq_putc(m, '\n'); + seq_printf(m, " %8ld transmitted frames (TXF)\n", can_stats.tx_frames); + seq_printf(m, " %8ld received frames (RXF)\n", can_stats.rx_frames); + seq_printf(m, " %8ld matched frames (RXMF)\n", can_stats.matches); + + seq_putc(m, '\n'); + + if (can_stattimer.function == can_stat_update) { + seq_printf(m, " %8ld %% total match ratio (RXMR)\n", + can_stats.total_rx_match_ratio); + + seq_printf(m, " %8ld frames/s total tx rate (TXR)\n", + can_stats.total_tx_rate); + seq_printf(m, " %8ld frames/s total rx rate (RXR)\n", + can_stats.total_rx_rate); + + seq_putc(m, '\n'); + + seq_printf(m, " %8ld %% current match ratio (CRXMR)\n", + can_stats.current_rx_match_ratio); + + seq_printf(m, " %8ld frames/s current tx rate (CTXR)\n", + can_stats.current_tx_rate); + seq_printf(m, " %8ld frames/s current rx rate (CRXR)\n", + can_stats.current_rx_rate); + + seq_putc(m, '\n'); + + seq_printf(m, " %8ld %% max match ratio (MRXMR)\n", + can_stats.max_rx_match_ratio); + + seq_printf(m, " %8ld frames/s max tx rate (MTXR)\n", + can_stats.max_tx_rate); + seq_printf(m, " %8ld frames/s max rx rate (MRXR)\n", + can_stats.max_rx_rate); + + seq_putc(m, '\n'); + } + + seq_printf(m, " %8ld current receive list entries (CRCV)\n", + can_pstats.rcv_entries); + seq_printf(m, " %8ld maximum receive list entries (MRCV)\n", + can_pstats.rcv_entries_max); + + if (can_pstats.stats_reset) + seq_printf(m, "\n %8ld statistic resets (STR)\n", + can_pstats.stats_reset); + + if (can_pstats.user_reset) + seq_printf(m, " %8ld user statistic resets (USTR)\n", + can_pstats.user_reset); + + seq_putc(m, '\n'); + return 0; +} + +static int can_stats_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, can_stats_proc_show, NULL); +} + +static const struct file_operations can_stats_proc_fops = { + .owner = THIS_MODULE, + .open = can_stats_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int can_reset_stats_proc_show(struct seq_file *m, void *v) +{ + user_reset = 1; + + if (can_stattimer.function == can_stat_update) { + seq_printf(m, "Scheduled statistic reset #%ld.\n", + can_pstats.stats_reset + 1); + + } else { + if (can_stats.jiffies_init != jiffies) + can_init_stats(); + + seq_printf(m, "Performed statistic reset #%ld.\n", + can_pstats.stats_reset); + } + return 0; +} + +static int can_reset_stats_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, can_reset_stats_proc_show, NULL); +} + +static const struct file_operations can_reset_stats_proc_fops = { + .owner = THIS_MODULE, + .open = can_reset_stats_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int can_version_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%s\n", CAN_VERSION_STRING); + return 0; +} + +static int can_version_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, can_version_proc_show, NULL); +} + +static const struct file_operations can_version_proc_fops = { + .owner = THIS_MODULE, + .open = can_version_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static inline void can_rcvlist_proc_show_one(struct seq_file *m, int idx, + struct net_device *dev, + struct dev_rcv_lists *d) +{ + if (!hlist_empty(&d->rx[idx])) { + can_print_recv_banner(m); + can_print_rcvlist(m, 
&d->rx[idx], dev); + } else + seq_printf(m, " (%s: no entry)\n", DNAME(dev)); + +} + +static int can_rcvlist_proc_show(struct seq_file *m, void *v) +{ + /* double cast to prevent GCC warning */ + int idx = (int)(long)m->private; + struct net_device *dev; + struct dev_rcv_lists *d; + + seq_printf(m, "\nreceive list '%s':\n", rx_list_name[idx]); + + rcu_read_lock(); + + /* receive list for 'all' CAN devices (dev == NULL) */ + d = &can_rx_alldev_list; + can_rcvlist_proc_show_one(m, idx, NULL, d); + + /* receive list for registered CAN devices */ + for_each_netdev_rcu(&init_net, dev) { + if (dev->type == ARPHRD_CAN && dev->ml_priv) + can_rcvlist_proc_show_one(m, idx, dev, dev->ml_priv); + } + + rcu_read_unlock(); + + seq_putc(m, '\n'); + return 0; +} + +static int can_rcvlist_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, can_rcvlist_proc_show, PDE(inode)->data); +} + +static const struct file_operations can_rcvlist_proc_fops = { + .owner = THIS_MODULE, + .open = can_rcvlist_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static inline void can_rcvlist_sff_proc_show_one(struct seq_file *m, + struct net_device *dev, + struct dev_rcv_lists *d) +{ + int i; + int all_empty = 1; + + /* check wether at least one list is non-empty */ + for (i = 0; i < 0x800; i++) + if (!hlist_empty(&d->rx_sff[i])) { + all_empty = 0; + break; + } + + if (!all_empty) { + can_print_recv_banner(m); + for (i = 0; i < 0x800; i++) { + if (!hlist_empty(&d->rx_sff[i])) + can_print_rcvlist(m, &d->rx_sff[i], dev); + } + } else + seq_printf(m, " (%s: no entry)\n", DNAME(dev)); +} + +static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v) +{ + struct net_device *dev; + struct dev_rcv_lists *d; + + /* RX_SFF */ + seq_puts(m, "\nreceive list 'rx_sff':\n"); + + rcu_read_lock(); + + /* sff receive list for 'all' CAN devices (dev == NULL) */ + d = &can_rx_alldev_list; + can_rcvlist_sff_proc_show_one(m, NULL, d); + + /* sff receive list for registered CAN devices */ + for_each_netdev_rcu(&init_net, dev) { + if (dev->type == ARPHRD_CAN && dev->ml_priv) + can_rcvlist_sff_proc_show_one(m, dev, dev->ml_priv); + } + + rcu_read_unlock(); + + seq_putc(m, '\n'); + return 0; +} + +static int can_rcvlist_sff_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, can_rcvlist_sff_proc_show, NULL); +} + +static const struct file_operations can_rcvlist_sff_proc_fops = { + .owner = THIS_MODULE, + .open = can_rcvlist_sff_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * proc utility functions + */ + +static void can_remove_proc_readentry(const char *name) +{ + if (can_dir) + remove_proc_entry(name, can_dir); +} + +/* + * can_init_proc - create main CAN proc directory and procfs entries + */ +void can_init_proc(void) +{ + /* create /proc/net/can directory */ + can_dir = proc_mkdir("can", init_net.proc_net); + + if (!can_dir) { + printk(KERN_INFO "can: failed to create /proc/net/can . 
" + "CONFIG_PROC_FS missing?\n"); + return; + } + + /* own procfs entries from the AF_CAN core */ + pde_version = proc_create(CAN_PROC_VERSION, 0644, can_dir, + &can_version_proc_fops); + pde_stats = proc_create(CAN_PROC_STATS, 0644, can_dir, + &can_stats_proc_fops); + pde_reset_stats = proc_create(CAN_PROC_RESET_STATS, 0644, can_dir, + &can_reset_stats_proc_fops); + pde_rcvlist_err = proc_create_data(CAN_PROC_RCVLIST_ERR, 0644, can_dir, + &can_rcvlist_proc_fops, (void *)RX_ERR); + pde_rcvlist_all = proc_create_data(CAN_PROC_RCVLIST_ALL, 0644, can_dir, + &can_rcvlist_proc_fops, (void *)RX_ALL); + pde_rcvlist_fil = proc_create_data(CAN_PROC_RCVLIST_FIL, 0644, can_dir, + &can_rcvlist_proc_fops, (void *)RX_FIL); + pde_rcvlist_inv = proc_create_data(CAN_PROC_RCVLIST_INV, 0644, can_dir, + &can_rcvlist_proc_fops, (void *)RX_INV); + pde_rcvlist_eff = proc_create_data(CAN_PROC_RCVLIST_EFF, 0644, can_dir, + &can_rcvlist_proc_fops, (void *)RX_EFF); + pde_rcvlist_sff = proc_create(CAN_PROC_RCVLIST_SFF, 0644, can_dir, + &can_rcvlist_sff_proc_fops); +} + +/* + * can_remove_proc - remove procfs entries and main CAN proc directory + */ +void can_remove_proc(void) +{ + if (pde_version) + can_remove_proc_readentry(CAN_PROC_VERSION); + + if (pde_stats) + can_remove_proc_readentry(CAN_PROC_STATS); + + if (pde_reset_stats) + can_remove_proc_readentry(CAN_PROC_RESET_STATS); + + if (pde_rcvlist_err) + can_remove_proc_readentry(CAN_PROC_RCVLIST_ERR); + + if (pde_rcvlist_all) + can_remove_proc_readentry(CAN_PROC_RCVLIST_ALL); + + if (pde_rcvlist_fil) + can_remove_proc_readentry(CAN_PROC_RCVLIST_FIL); + + if (pde_rcvlist_inv) + can_remove_proc_readentry(CAN_PROC_RCVLIST_INV); + + if (pde_rcvlist_eff) + can_remove_proc_readentry(CAN_PROC_RCVLIST_EFF); + + if (pde_rcvlist_sff) + can_remove_proc_readentry(CAN_PROC_RCVLIST_SFF); + + if (can_dir) + proc_net_remove(&init_net, "can"); +} diff --git a/net/can/raw.c b/net/can/raw.c new file mode 100644 index 00000000..dea99a6e --- /dev/null +++ b/net/can/raw.c @@ -0,0 +1,803 @@ +/* + * raw.c - Raw sockets for protocol family CAN + * + * Copyright (c) 2002-2007 Volkswagen Group Electronic Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of Volkswagen nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * The provided data structures and external interfaces from this code + * are not restricted to be used by modules with a GPL compatible license. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Send feedback to + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CAN_RAW_VERSION CAN_VERSION +static __initdata const char banner[] = + KERN_INFO "can: raw protocol (rev " CAN_RAW_VERSION ")\n"; + +MODULE_DESCRIPTION("PF_CAN raw protocol"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Urs Thuermann "); +MODULE_ALIAS("can-proto-1"); + +#define MASK_ALL 0 + +/* + * A raw socket has a list of can_filters attached to it, each receiving + * the CAN frames matching that filter. If the filter list is empty, + * no CAN frames will be received by the socket. The default after + * opening the socket, is to have one filter which receives all frames. + * The filter list is allocated dynamically with the exception of the + * list containing only one item. This common case is optimized by + * storing the single filter in dfilter, to avoid using dynamic memory. + */ + +struct raw_sock { + struct sock sk; + int bound; + int ifindex; + struct notifier_block notifier; + int loopback; + int recv_own_msgs; + int count; /* number of active filters */ + struct can_filter dfilter; /* default/single filter */ + struct can_filter *filter; /* pointer to filter(s) */ + can_err_mask_t err_mask; +}; + +/* + * Return pointer to store the extra msg flags for raw_recvmsg(). + * We use the space of one unsigned int beyond the 'struct sockaddr_can' + * in skb->cb. + */ +static inline unsigned int *raw_flags(struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(skb->cb) <= (sizeof(struct sockaddr_can) + + sizeof(unsigned int))); + + /* return pointer after struct sockaddr_can */ + return (unsigned int *)(&((struct sockaddr_can *)skb->cb)[1]); +} + +static inline struct raw_sock *raw_sk(const struct sock *sk) +{ + return (struct raw_sock *)sk; +} + +static void raw_rcv(struct sk_buff *oskb, void *data) +{ + struct sock *sk = (struct sock *)data; + struct raw_sock *ro = raw_sk(sk); + struct sockaddr_can *addr; + struct sk_buff *skb; + unsigned int *pflags; + + /* check the received tx sock reference */ + if (!ro->recv_own_msgs && oskb->sk == sk) + return; + + /* clone the given skb to be able to enqueue it into the rcv queue */ + skb = skb_clone(oskb, GFP_ATOMIC); + if (!skb) + return; + + /* + * Put the datagram to the queue so that raw_recvmsg() can + * get it from there. We need to pass the interface index to + * raw_recvmsg(). We pass a whole struct sockaddr_can in skb->cb + * containing the interface index. 
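/*
 * [Editor's note - illustrative sketch, not part of the original patch]
 * Replacing the default "receive everything" filter described above from
 * userspace: two can_filter entries installed via
 * setsockopt(SOL_CAN_RAW, CAN_RAW_FILTER). The interface name "can0" and
 * the chosen IDs/masks are example values only.
 */
#include <string.h>
#include <unistd.h>
#include <net/if.h>
#include <sys/socket.h>
#include <linux/can.h>
#include <linux/can/raw.h>

static int open_filtered_raw_socket(void)
{
        struct can_filter rfilter[2] = {
                { .can_id = 0x123, .can_mask = CAN_SFF_MASK }, /* only 0x123 */
                { .can_id = 0x200, .can_mask = 0x700 },        /* 0x200 - 0x2ff */
        };
        struct sockaddr_can addr;
        int s;

        s = socket(PF_CAN, SOCK_RAW, CAN_RAW);
        if (s < 0)
                return -1;

        setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, &rfilter, sizeof(rfilter));

        memset(&addr, 0, sizeof(addr));
        addr.can_family  = AF_CAN;
        addr.can_ifindex = if_nametoindex("can0");

        if (bind(s, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
                close(s);
                return -1;
        }

        return s;
}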
+ */ + + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct sockaddr_can)); + addr = (struct sockaddr_can *)skb->cb; + memset(addr, 0, sizeof(*addr)); + addr->can_family = AF_CAN; + addr->can_ifindex = skb->dev->ifindex; + + /* add CAN specific message flags for raw_recvmsg() */ + pflags = raw_flags(skb); + *pflags = 0; + if (oskb->sk) + *pflags |= MSG_DONTROUTE; + if (oskb->sk == sk) + *pflags |= MSG_CONFIRM; + + if (sock_queue_rcv_skb(sk, skb) < 0) + kfree_skb(skb); +} + +static int raw_enable_filters(struct net_device *dev, struct sock *sk, + struct can_filter *filter, int count) +{ + int err = 0; + int i; + + for (i = 0; i < count; i++) { + err = can_rx_register(dev, filter[i].can_id, + filter[i].can_mask, + raw_rcv, sk, "raw"); + if (err) { + /* clean up successfully registered filters */ + while (--i >= 0) + can_rx_unregister(dev, filter[i].can_id, + filter[i].can_mask, + raw_rcv, sk); + break; + } + } + + return err; +} + +static int raw_enable_errfilter(struct net_device *dev, struct sock *sk, + can_err_mask_t err_mask) +{ + int err = 0; + + if (err_mask) + err = can_rx_register(dev, 0, err_mask | CAN_ERR_FLAG, + raw_rcv, sk, "raw"); + + return err; +} + +static void raw_disable_filters(struct net_device *dev, struct sock *sk, + struct can_filter *filter, int count) +{ + int i; + + for (i = 0; i < count; i++) + can_rx_unregister(dev, filter[i].can_id, filter[i].can_mask, + raw_rcv, sk); +} + +static inline void raw_disable_errfilter(struct net_device *dev, + struct sock *sk, + can_err_mask_t err_mask) + +{ + if (err_mask) + can_rx_unregister(dev, 0, err_mask | CAN_ERR_FLAG, + raw_rcv, sk); +} + +static inline void raw_disable_allfilters(struct net_device *dev, + struct sock *sk) +{ + struct raw_sock *ro = raw_sk(sk); + + raw_disable_filters(dev, sk, ro->filter, ro->count); + raw_disable_errfilter(dev, sk, ro->err_mask); +} + +static int raw_enable_allfilters(struct net_device *dev, struct sock *sk) +{ + struct raw_sock *ro = raw_sk(sk); + int err; + + err = raw_enable_filters(dev, sk, ro->filter, ro->count); + if (!err) { + err = raw_enable_errfilter(dev, sk, ro->err_mask); + if (err) + raw_disable_filters(dev, sk, ro->filter, ro->count); + } + + return err; +} + +static int raw_notifier(struct notifier_block *nb, + unsigned long msg, void *data) +{ + struct net_device *dev = (struct net_device *)data; + struct raw_sock *ro = container_of(nb, struct raw_sock, notifier); + struct sock *sk = &ro->sk; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + if (dev->type != ARPHRD_CAN) + return NOTIFY_DONE; + + if (ro->ifindex != dev->ifindex) + return NOTIFY_DONE; + + switch (msg) { + + case NETDEV_UNREGISTER: + lock_sock(sk); + /* remove current filters & unregister */ + if (ro->bound) + raw_disable_allfilters(dev, sk); + + if (ro->count > 1) + kfree(ro->filter); + + ro->ifindex = 0; + ro->bound = 0; + ro->count = 0; + release_sock(sk); + + sk->sk_err = ENODEV; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + break; + + case NETDEV_DOWN: + sk->sk_err = ENETDOWN; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + break; + } + + return NOTIFY_DONE; +} + +static int raw_init(struct sock *sk) +{ + struct raw_sock *ro = raw_sk(sk); + + ro->bound = 0; + ro->ifindex = 0; + + /* set default filter to single entry dfilter */ + ro->dfilter.can_id = 0; + ro->dfilter.can_mask = MASK_ALL; + ro->filter = &ro->dfilter; + ro->count = 1; + + /* set default loopback behaviour */ + ro->loopback = 1; + ro->recv_own_msgs = 0; + + /* set notifier */ + 
ro->notifier.notifier_call = raw_notifier; + + register_netdevice_notifier(&ro->notifier); + + return 0; +} + +static int raw_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct raw_sock *ro; + + if (!sk) + return 0; + + ro = raw_sk(sk); + + unregister_netdevice_notifier(&ro->notifier); + + lock_sock(sk); + + /* remove current filters & unregister */ + if (ro->bound) { + if (ro->ifindex) { + struct net_device *dev; + + dev = dev_get_by_index(&init_net, ro->ifindex); + if (dev) { + raw_disable_allfilters(dev, sk); + dev_put(dev); + } + } else + raw_disable_allfilters(NULL, sk); + } + + if (ro->count > 1) + kfree(ro->filter); + + ro->ifindex = 0; + ro->bound = 0; + ro->count = 0; + + sock_orphan(sk); + sock->sk = NULL; + + release_sock(sk); + sock_put(sk); + + return 0; +} + +static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) +{ + struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; + struct sock *sk = sock->sk; + struct raw_sock *ro = raw_sk(sk); + int ifindex; + int err = 0; + int notify_enetdown = 0; + + if (len < sizeof(*addr)) + return -EINVAL; + + lock_sock(sk); + + if (ro->bound && addr->can_ifindex == ro->ifindex) + goto out; + + if (addr->can_ifindex) { + struct net_device *dev; + + dev = dev_get_by_index(&init_net, addr->can_ifindex); + if (!dev) { + err = -ENODEV; + goto out; + } + if (dev->type != ARPHRD_CAN) { + dev_put(dev); + err = -ENODEV; + goto out; + } + if (!(dev->flags & IFF_UP)) + notify_enetdown = 1; + + ifindex = dev->ifindex; + + /* filters set by default/setsockopt */ + err = raw_enable_allfilters(dev, sk); + dev_put(dev); + } else { + ifindex = 0; + + /* filters set by default/setsockopt */ + err = raw_enable_allfilters(NULL, sk); + } + + if (!err) { + if (ro->bound) { + /* unregister old filters */ + if (ro->ifindex) { + struct net_device *dev; + + dev = dev_get_by_index(&init_net, ro->ifindex); + if (dev) { + raw_disable_allfilters(dev, sk); + dev_put(dev); + } + } else + raw_disable_allfilters(NULL, sk); + } + ro->ifindex = ifindex; + ro->bound = 1; + } + + out: + release_sock(sk); + + if (notify_enetdown) { + sk->sk_err = ENETDOWN; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + } + + return err; +} + +static int raw_getname(struct socket *sock, struct sockaddr *uaddr, + int *len, int peer) +{ + struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; + struct sock *sk = sock->sk; + struct raw_sock *ro = raw_sk(sk); + + if (peer) + return -EOPNOTSUPP; + + memset(addr, 0, sizeof(*addr)); + addr->can_family = AF_CAN; + addr->can_ifindex = ro->ifindex; + + *len = sizeof(*addr); + + return 0; +} + +static int raw_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct raw_sock *ro = raw_sk(sk); + struct can_filter *filter = NULL; /* dyn. 
alloc'ed filters */ + struct can_filter sfilter; /* single filter */ + struct net_device *dev = NULL; + can_err_mask_t err_mask = 0; + int count = 0; + int err = 0; + + if (level != SOL_CAN_RAW) + return -EINVAL; + + switch (optname) { + + case CAN_RAW_FILTER: + if (optlen % sizeof(struct can_filter) != 0) + return -EINVAL; + + count = optlen / sizeof(struct can_filter); + + if (count > 1) { + /* filter does not fit into dfilter => alloc space */ + filter = memdup_user(optval, optlen); + if (IS_ERR(filter)) + return PTR_ERR(filter); + } else if (count == 1) { + if (copy_from_user(&sfilter, optval, sizeof(sfilter))) + return -EFAULT; + } + + lock_sock(sk); + + if (ro->bound && ro->ifindex) + dev = dev_get_by_index(&init_net, ro->ifindex); + + if (ro->bound) { + /* (try to) register the new filters */ + if (count == 1) + err = raw_enable_filters(dev, sk, &sfilter, 1); + else + err = raw_enable_filters(dev, sk, filter, + count); + if (err) { + if (count > 1) + kfree(filter); + goto out_fil; + } + + /* remove old filter registrations */ + raw_disable_filters(dev, sk, ro->filter, ro->count); + } + + /* remove old filter space */ + if (ro->count > 1) + kfree(ro->filter); + + /* link new filters to the socket */ + if (count == 1) { + /* copy filter data for single filter */ + ro->dfilter = sfilter; + filter = &ro->dfilter; + } + ro->filter = filter; + ro->count = count; + + out_fil: + if (dev) + dev_put(dev); + + release_sock(sk); + + break; + + case CAN_RAW_ERR_FILTER: + if (optlen != sizeof(err_mask)) + return -EINVAL; + + if (copy_from_user(&err_mask, optval, optlen)) + return -EFAULT; + + err_mask &= CAN_ERR_MASK; + + lock_sock(sk); + + if (ro->bound && ro->ifindex) + dev = dev_get_by_index(&init_net, ro->ifindex); + + /* remove current error mask */ + if (ro->bound) { + /* (try to) register the new err_mask */ + err = raw_enable_errfilter(dev, sk, err_mask); + + if (err) + goto out_err; + + /* remove old err_mask registration */ + raw_disable_errfilter(dev, sk, ro->err_mask); + } + + /* link new err_mask to the socket */ + ro->err_mask = err_mask; + + out_err: + if (dev) + dev_put(dev); + + release_sock(sk); + + break; + + case CAN_RAW_LOOPBACK: + if (optlen != sizeof(ro->loopback)) + return -EINVAL; + + if (copy_from_user(&ro->loopback, optval, optlen)) + return -EFAULT; + + break; + + case CAN_RAW_RECV_OWN_MSGS: + if (optlen != sizeof(ro->recv_own_msgs)) + return -EINVAL; + + if (copy_from_user(&ro->recv_own_msgs, optval, optlen)) + return -EFAULT; + + break; + + default: + return -ENOPROTOOPT; + } + return err; +} + +static int raw_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct raw_sock *ro = raw_sk(sk); + int len; + void *val; + int err = 0; + + if (level != SOL_CAN_RAW) + return -EINVAL; + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + + switch (optname) { + + case CAN_RAW_FILTER: + lock_sock(sk); + if (ro->count > 0) { + int fsize = ro->count * sizeof(struct can_filter); + if (len > fsize) + len = fsize; + if (copy_to_user(optval, ro->filter, len)) + err = -EFAULT; + } else + len = 0; + release_sock(sk); + + if (!err) + err = put_user(len, optlen); + return err; + + case CAN_RAW_ERR_FILTER: + if (len > sizeof(can_err_mask_t)) + len = sizeof(can_err_mask_t); + val = &ro->err_mask; + break; + + case CAN_RAW_LOOPBACK: + if (len > sizeof(int)) + len = sizeof(int); + val = &ro->loopback; + break; + + case CAN_RAW_RECV_OWN_MSGS: + if (len > sizeof(int)) + len = 
sizeof(int); + val = &ro->recv_own_msgs; + break; + + default: + return -ENOPROTOOPT; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, val, len)) + return -EFAULT; + return 0; +} + +static int raw_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size) +{ + struct sock *sk = sock->sk; + struct raw_sock *ro = raw_sk(sk); + struct sk_buff *skb; + struct net_device *dev; + int ifindex; + int err; + + if (msg->msg_name) { + struct sockaddr_can *addr = + (struct sockaddr_can *)msg->msg_name; + + if (msg->msg_namelen < sizeof(*addr)) + return -EINVAL; + + if (addr->can_family != AF_CAN) + return -EINVAL; + + ifindex = addr->can_ifindex; + } else + ifindex = ro->ifindex; + + if (size != sizeof(struct can_frame)) + return -EINVAL; + + dev = dev_get_by_index(&init_net, ifindex); + if (!dev) + return -ENXIO; + + skb = sock_alloc_send_skb(sk, size, msg->msg_flags & MSG_DONTWAIT, + &err); + if (!skb) + goto put_dev; + + err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); + if (err < 0) + goto free_skb; + err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); + if (err < 0) + goto free_skb; + + /* to be able to check the received tx sock reference in raw_rcv() */ + skb_shinfo(skb)->tx_flags |= SKBTX_DRV_NEEDS_SK_REF; + + skb->dev = dev; + skb->sk = sk; + + err = can_send(skb, ro->loopback); + + dev_put(dev); + + if (err) + goto send_failed; + + return size; + +free_skb: + kfree_skb(skb); +put_dev: + dev_put(dev); +send_failed: + return err; +} + +static int raw_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + int err = 0; + int noblock; + + noblock = flags & MSG_DONTWAIT; + flags &= ~MSG_DONTWAIT; + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + return err; + + if (size < skb->len) + msg->msg_flags |= MSG_TRUNC; + else + size = skb->len; + + err = memcpy_toiovec(msg->msg_iov, skb->data, size); + if (err < 0) { + skb_free_datagram(sk, skb); + return err; + } + + sock_recv_ts_and_drops(msg, sk, skb); + + if (msg->msg_name) { + msg->msg_namelen = sizeof(struct sockaddr_can); + memcpy(msg->msg_name, skb->cb, msg->msg_namelen); + } + + /* assign the flags that have been recorded in raw_rcv() */ + msg->msg_flags |= *(raw_flags(skb)); + + skb_free_datagram(sk, skb); + + return size; +} + +static const struct proto_ops raw_ops = { + .family = PF_CAN, + .release = raw_release, + .bind = raw_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = raw_getname, + .poll = datagram_poll, + .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = raw_setsockopt, + .getsockopt = raw_getsockopt, + .sendmsg = raw_sendmsg, + .recvmsg = raw_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct proto raw_proto __read_mostly = { + .name = "CAN_RAW", + .owner = THIS_MODULE, + .obj_size = sizeof(struct raw_sock), + .init = raw_init, +}; + +static const struct can_proto raw_can_proto = { + .type = SOCK_RAW, + .protocol = CAN_RAW, + .ops = &raw_ops, + .prot = &raw_proto, +}; + +static __init int raw_module_init(void) +{ + int err; + + printk(banner); + + err = can_proto_register(&raw_can_proto); + if (err < 0) + printk(KERN_ERR "can: registration of raw protocol failed\n"); + + return err; +} + +static __exit void raw_module_exit(void) +{ + 
can_proto_unregister(&raw_can_proto); +} + +module_init(raw_module_init); +module_exit(raw_module_exit); diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig new file mode 100644 index 00000000..be683f2d --- /dev/null +++ b/net/ceph/Kconfig @@ -0,0 +1,29 @@ +config CEPH_LIB + tristate "Ceph core library (EXPERIMENTAL)" + depends on INET && EXPERIMENTAL + select LIBCRC32C + select CRYPTO_AES + select CRYPTO + select KEYS + default n + help + Choose Y or M here to include cephlib, which provides the + common functionality to both the Ceph filesystem and + to the rados block device (rbd). + + More information at http://ceph.newdream.net/. + + If unsure, say N. + +config CEPH_LIB_PRETTYDEBUG + bool "Include file:line in ceph debug output" + depends on CEPH_LIB + default n + help + If you say Y here, debug output will include a filename and + line to aid debugging. This increases kernel size and slows + execution slightly when debug call sites are enabled (e.g., + via CONFIG_DYNAMIC_DEBUG). + + If unsure, say N. + diff --git a/net/ceph/Makefile b/net/ceph/Makefile new file mode 100644 index 00000000..e87ef435 --- /dev/null +++ b/net/ceph/Makefile @@ -0,0 +1,15 @@ +# +# Makefile for CEPH filesystem. +# +obj-$(CONFIG_CEPH_LIB) += libceph.o + +libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ + mon_client.o \ + osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \ + debugfs.o \ + auth.o auth_none.o \ + crypto.o armor.o \ + auth_x.o \ + ceph_fs.o ceph_strings.o ceph_hash.o \ + pagevec.o + diff --git a/net/ceph/armor.c b/net/ceph/armor.c new file mode 100644 index 00000000..1fc1ee11 --- /dev/null +++ b/net/ceph/armor.c @@ -0,0 +1,105 @@ + +#include + +int ceph_armor(char *dst, const char *src, const char *end); +int ceph_unarmor(char *dst, const char *src, const char *end); + +/* + * base64 encode/decode. 
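+ * (Format produced by ceph_armor() below: the standard base64
+ * alphabet from pem_key with '=' padding, plus a newline after every
+ * 64 output characters; ceph_unarmor() skips those newlines while
+ * decoding. E.g. the three bytes "abc" encode to "YWJj".)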
+ */ + +static const char *pem_key = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + +static int encode_bits(int c) +{ + return pem_key[c]; +} + +static int decode_bits(char c) +{ + if (c >= 'A' && c <= 'Z') + return c - 'A'; + if (c >= 'a' && c <= 'z') + return c - 'a' + 26; + if (c >= '0' && c <= '9') + return c - '0' + 52; + if (c == '+') + return 62; + if (c == '/') + return 63; + if (c == '=') + return 0; /* just non-negative, please */ + return -EINVAL; +} + +int ceph_armor(char *dst, const char *src, const char *end) +{ + int olen = 0; + int line = 0; + + while (src < end) { + unsigned char a, b, c; + + a = *src++; + *dst++ = encode_bits(a >> 2); + if (src < end) { + b = *src++; + *dst++ = encode_bits(((a & 3) << 4) | (b >> 4)); + if (src < end) { + c = *src++; + *dst++ = encode_bits(((b & 15) << 2) | + (c >> 6)); + *dst++ = encode_bits(c & 63); + } else { + *dst++ = encode_bits((b & 15) << 2); + *dst++ = '='; + } + } else { + *dst++ = encode_bits(((a & 3) << 4)); + *dst++ = '='; + *dst++ = '='; + } + olen += 4; + line += 4; + if (line == 64) { + line = 0; + *(dst++) = '\n'; + olen++; + } + } + return olen; +} + +int ceph_unarmor(char *dst, const char *src, const char *end) +{ + int olen = 0; + + while (src < end) { + int a, b, c, d; + + if (src[0] == '\n') { + src++; + continue; + } + if (src + 4 > end) + return -EINVAL; + a = decode_bits(src[0]); + b = decode_bits(src[1]); + c = decode_bits(src[2]); + d = decode_bits(src[3]); + if (a < 0 || b < 0 || c < 0 || d < 0) + return -EINVAL; + + *dst++ = (a << 2) | (b >> 4); + if (src[2] == '=') + return olen + 1; + *dst++ = ((b & 15) << 4) | (c >> 2); + if (src[3] == '=') + return olen + 2; + *dst++ = ((c & 3) << 6) | d; + olen += 3; + src += 4; + } + return olen; +} diff --git a/net/ceph/auth.c b/net/ceph/auth.c new file mode 100644 index 00000000..b4bf4ac0 --- /dev/null +++ b/net/ceph/auth.c @@ -0,0 +1,259 @@ +#include + +#include +#include +#include + +#include +#include +#include +#include +#include "auth_none.h" +#include "auth_x.h" + + +/* + * get protocol handler + */ +static u32 supported_protocols[] = { + CEPH_AUTH_NONE, + CEPH_AUTH_CEPHX +}; + +static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol) +{ + switch (protocol) { + case CEPH_AUTH_NONE: + return ceph_auth_none_init(ac); + case CEPH_AUTH_CEPHX: + return ceph_x_init(ac); + default: + return -ENOENT; + } +} + +/* + * setup, teardown. + */ +struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_crypto_key *key) +{ + struct ceph_auth_client *ac; + int ret; + + dout("auth_init name '%s'\n", name); + + ret = -ENOMEM; + ac = kzalloc(sizeof(*ac), GFP_NOFS); + if (!ac) + goto out; + + ac->negotiating = true; + if (name) + ac->name = name; + else + ac->name = CEPH_AUTH_NAME_DEFAULT; + dout("auth_init name %s\n", ac->name); + ac->key = key; + return ac; + +out: + return ERR_PTR(ret); +} + +void ceph_auth_destroy(struct ceph_auth_client *ac) +{ + dout("auth_destroy %p\n", ac); + if (ac->ops) + ac->ops->destroy(ac); + kfree(ac); +} + +/* + * Reset occurs when reconnecting to the monitor. 
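+ * If a protocol handler is active and negotiation had already
+ * completed, its reset op is invoked to drop per-protocol state,
+ * and the client is put back into the negotiating phase.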
+ */ +void ceph_auth_reset(struct ceph_auth_client *ac) +{ + dout("auth_reset %p\n", ac); + if (ac->ops && !ac->negotiating) + ac->ops->reset(ac); + ac->negotiating = true; +} + +int ceph_entity_name_encode(const char *name, void **p, void *end) +{ + int len = strlen(name); + + if (*p + 2*sizeof(u32) + len > end) + return -ERANGE; + ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT); + ceph_encode_32(p, len); + ceph_encode_copy(p, name, len); + return 0; +} + +/* + * Initiate protocol negotiation with monitor. Include entity name + * and list supported protocols. + */ +int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len) +{ + struct ceph_mon_request_header *monhdr = buf; + void *p = monhdr + 1, *end = buf + len, *lenp; + int i, num; + int ret; + + dout("auth_build_hello\n"); + monhdr->have_version = 0; + monhdr->session_mon = cpu_to_le16(-1); + monhdr->session_mon_tid = 0; + + ceph_encode_32(&p, 0); /* no protocol, yet */ + + lenp = p; + p += sizeof(u32); + + ceph_decode_need(&p, end, 1 + sizeof(u32), bad); + ceph_encode_8(&p, 1); + num = ARRAY_SIZE(supported_protocols); + ceph_encode_32(&p, num); + ceph_decode_need(&p, end, num * sizeof(u32), bad); + for (i = 0; i < num; i++) + ceph_encode_32(&p, supported_protocols[i]); + + ret = ceph_entity_name_encode(ac->name, &p, end); + if (ret < 0) + return ret; + ceph_decode_need(&p, end, sizeof(u64), bad); + ceph_encode_64(&p, ac->global_id); + + ceph_encode_32(&lenp, p - lenp - sizeof(u32)); + return p - buf; + +bad: + return -ERANGE; +} + +static int ceph_build_auth_request(struct ceph_auth_client *ac, + void *msg_buf, size_t msg_len) +{ + struct ceph_mon_request_header *monhdr = msg_buf; + void *p = monhdr + 1; + void *end = msg_buf + msg_len; + int ret; + + monhdr->have_version = 0; + monhdr->session_mon = cpu_to_le16(-1); + monhdr->session_mon_tid = 0; + + ceph_encode_32(&p, ac->protocol); + + ret = ac->ops->build_request(ac, p + sizeof(u32), end); + if (ret < 0) { + pr_err("error %d building auth method %s request\n", ret, + ac->ops->name); + return ret; + } + dout(" built request %d bytes\n", ret); + ceph_encode_32(&p, ret); + return p + ret - msg_buf; +} + +/* + * Handle auth message from monitor. + */ +int ceph_handle_auth_reply(struct ceph_auth_client *ac, + void *buf, size_t len, + void *reply_buf, size_t reply_len) +{ + void *p = buf; + void *end = buf + len; + int protocol; + s32 result; + u64 global_id; + void *payload, *payload_end; + int payload_len; + char *result_msg; + int result_msg_len; + int ret = -EINVAL; + + dout("handle_auth_reply %p %p\n", p, end); + ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad); + protocol = ceph_decode_32(&p); + result = ceph_decode_32(&p); + global_id = ceph_decode_64(&p); + payload_len = ceph_decode_32(&p); + payload = p; + p += payload_len; + ceph_decode_need(&p, end, sizeof(u32), bad); + result_msg_len = ceph_decode_32(&p); + result_msg = p; + p += result_msg_len; + if (p != end) + goto bad; + + dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len, + result_msg, global_id, payload_len); + + payload_end = payload + payload_len; + + if (global_id && ac->global_id != global_id) { + dout(" set global_id %lld -> %lld\n", ac->global_id, global_id); + ac->global_id = global_id; + } + + if (ac->negotiating) { + /* server does not support our protocols? */ + if (!protocol && result < 0) { + ret = result; + goto out; + } + /* set up (new) protocol handler? 
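+ * (i.e. if the monitor selected a protocol other than the one
+ * currently initialized, tear down the old handler first and
+ * then initialize the newly chosen one)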
*/ + if (ac->protocol && ac->protocol != protocol) { + ac->ops->destroy(ac); + ac->protocol = 0; + ac->ops = NULL; + } + if (ac->protocol != protocol) { + ret = ceph_auth_init_protocol(ac, protocol); + if (ret) { + pr_err("error %d on auth protocol %d init\n", + ret, protocol); + goto out; + } + } + + ac->negotiating = false; + } + + ret = ac->ops->handle_reply(ac, result, payload, payload_end); + if (ret == -EAGAIN) { + return ceph_build_auth_request(ac, reply_buf, reply_len); + } else if (ret) { + pr_err("auth method '%s' error %d\n", ac->ops->name, ret); + return ret; + } + return 0; + +bad: + pr_err("failed to decode auth msg\n"); +out: + return ret; +} + +int ceph_build_auth(struct ceph_auth_client *ac, + void *msg_buf, size_t msg_len) +{ + if (!ac->protocol) + return ceph_auth_build_hello(ac, msg_buf, msg_len); + BUG_ON(!ac->ops); + if (ac->ops->should_authenticate(ac)) + return ceph_build_auth_request(ac, msg_buf, msg_len); + return 0; +} + +int ceph_auth_is_authenticated(struct ceph_auth_client *ac) +{ + if (!ac->ops) + return 0; + return ac->ops->is_authenticated(ac); +} diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c new file mode 100644 index 00000000..214c2bb4 --- /dev/null +++ b/net/ceph/auth_none.c @@ -0,0 +1,132 @@ + +#include + +#include +#include +#include +#include + +#include +#include + +#include "auth_none.h" + +static void reset(struct ceph_auth_client *ac) +{ + struct ceph_auth_none_info *xi = ac->private; + + xi->starting = true; + xi->built_authorizer = false; +} + +static void destroy(struct ceph_auth_client *ac) +{ + kfree(ac->private); + ac->private = NULL; +} + +static int is_authenticated(struct ceph_auth_client *ac) +{ + struct ceph_auth_none_info *xi = ac->private; + + return !xi->starting; +} + +static int should_authenticate(struct ceph_auth_client *ac) +{ + struct ceph_auth_none_info *xi = ac->private; + + return xi->starting; +} + +/* + * the generic auth code decode the global_id, and we carry no actual + * authenticate state, so nothing happens here. + */ +static int handle_reply(struct ceph_auth_client *ac, int result, + void *buf, void *end) +{ + struct ceph_auth_none_info *xi = ac->private; + + xi->starting = false; + return result; +} + +/* + * build an 'authorizer' with our entity_name and global_id. we can + * reuse a single static copy since it is identical for all services + * we connect to. 
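+ * (the encoding, built below, is just a version byte, the encoded
+ * entity name and the 64-bit global id; it is cached in ai->au and
+ * marked via built_authorizer so later calls reuse the same copy)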
+ */ +static int ceph_auth_none_create_authorizer( + struct ceph_auth_client *ac, int peer_type, + struct ceph_authorizer **a, + void **buf, size_t *len, + void **reply_buf, size_t *reply_len) +{ + struct ceph_auth_none_info *ai = ac->private; + struct ceph_none_authorizer *au = &ai->au; + void *p, *end; + int ret; + + if (!ai->built_authorizer) { + p = au->buf; + end = p + sizeof(au->buf); + ceph_encode_8(&p, 1); + ret = ceph_entity_name_encode(ac->name, &p, end - 8); + if (ret < 0) + goto bad; + ceph_decode_need(&p, end, sizeof(u64), bad2); + ceph_encode_64(&p, ac->global_id); + au->buf_len = p - (void *)au->buf; + ai->built_authorizer = true; + dout("built authorizer len %d\n", au->buf_len); + } + + *a = (struct ceph_authorizer *)au; + *buf = au->buf; + *len = au->buf_len; + *reply_buf = au->reply_buf; + *reply_len = sizeof(au->reply_buf); + return 0; + +bad2: + ret = -ERANGE; +bad: + return ret; +} + +static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac, + struct ceph_authorizer *a) +{ + /* nothing to do */ +} + +static const struct ceph_auth_client_ops ceph_auth_none_ops = { + .name = "none", + .reset = reset, + .destroy = destroy, + .is_authenticated = is_authenticated, + .should_authenticate = should_authenticate, + .handle_reply = handle_reply, + .create_authorizer = ceph_auth_none_create_authorizer, + .destroy_authorizer = ceph_auth_none_destroy_authorizer, +}; + +int ceph_auth_none_init(struct ceph_auth_client *ac) +{ + struct ceph_auth_none_info *xi; + + dout("ceph_auth_none_init %p\n", ac); + xi = kzalloc(sizeof(*xi), GFP_NOFS); + if (!xi) + return -ENOMEM; + + xi->starting = true; + xi->built_authorizer = false; + + ac->protocol = CEPH_AUTH_NONE; + ac->private = xi; + ac->ops = &ceph_auth_none_ops; + return 0; +} + diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h new file mode 100644 index 00000000..ed7d088b --- /dev/null +++ b/net/ceph/auth_none.h @@ -0,0 +1,29 @@ +#ifndef _FS_CEPH_AUTH_NONE_H +#define _FS_CEPH_AUTH_NONE_H + +#include +#include + +/* + * null security mode. + * + * we use a single static authorizer that simply encodes our entity name + * and global id. 
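+ * (buf below is assumed large enough for the version byte, encoded
+ * entity name and global id; reply_buf is zero-length since this
+ * scheme provides no verify_authorizer_reply op)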
+ */ + +struct ceph_none_authorizer { + char buf[128]; + int buf_len; + char reply_buf[0]; +}; + +struct ceph_auth_none_info { + bool starting; + bool built_authorizer; + struct ceph_none_authorizer au; /* we only need one; it's static */ +}; + +extern int ceph_auth_none_init(struct ceph_auth_client *ac); + +#endif + diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c new file mode 100644 index 00000000..1587dc60 --- /dev/null +++ b/net/ceph/auth_x.c @@ -0,0 +1,690 @@ + +#include + +#include +#include +#include +#include + +#include +#include + +#include "crypto.h" +#include "auth_x.h" +#include "auth_x_protocol.h" + +#define TEMP_TICKET_BUF_LEN 256 + +static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed); + +static int ceph_x_is_authenticated(struct ceph_auth_client *ac) +{ + struct ceph_x_info *xi = ac->private; + int need; + + ceph_x_validate_tickets(ac, &need); + dout("ceph_x_is_authenticated want=%d need=%d have=%d\n", + ac->want_keys, need, xi->have_keys); + return (ac->want_keys & xi->have_keys) == ac->want_keys; +} + +static int ceph_x_should_authenticate(struct ceph_auth_client *ac) +{ + struct ceph_x_info *xi = ac->private; + int need; + + ceph_x_validate_tickets(ac, &need); + dout("ceph_x_should_authenticate want=%d need=%d have=%d\n", + ac->want_keys, need, xi->have_keys); + return need != 0; +} + +static int ceph_x_encrypt_buflen(int ilen) +{ + return sizeof(struct ceph_x_encrypt_header) + ilen + 16 + + sizeof(u32); +} + +static int ceph_x_encrypt(struct ceph_crypto_key *secret, + void *ibuf, int ilen, void *obuf, size_t olen) +{ + struct ceph_x_encrypt_header head = { + .struct_v = 1, + .magic = cpu_to_le64(CEPHX_ENC_MAGIC) + }; + size_t len = olen - sizeof(u32); + int ret; + + ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len, + &head, sizeof(head), ibuf, ilen); + if (ret) + return ret; + ceph_encode_32(&obuf, len); + return len + sizeof(u32); +} + +static int ceph_x_decrypt(struct ceph_crypto_key *secret, + void **p, void *end, void *obuf, size_t olen) +{ + struct ceph_x_encrypt_header head; + size_t head_len = sizeof(head); + int len, ret; + + len = ceph_decode_32(p); + if (*p + len > end) + return -EINVAL; + + dout("ceph_x_decrypt len %d\n", len); + ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen, + *p, len); + if (ret) + return ret; + if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC) + return -EPERM; + *p += len; + return olen; +} + +/* + * get existing (or insert new) ticket handler + */ +static struct ceph_x_ticket_handler * +get_ticket_handler(struct ceph_auth_client *ac, int service) +{ + struct ceph_x_ticket_handler *th; + struct ceph_x_info *xi = ac->private; + struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node; + + while (*p) { + parent = *p; + th = rb_entry(parent, struct ceph_x_ticket_handler, node); + if (service < th->service) + p = &(*p)->rb_left; + else if (service > th->service) + p = &(*p)->rb_right; + else + return th; + } + + /* add it */ + th = kzalloc(sizeof(*th), GFP_NOFS); + if (!th) + return ERR_PTR(-ENOMEM); + th->service = service; + rb_link_node(&th->node, parent, p); + rb_insert_color(&th->node, &xi->ticket_handlers); + return th; +} + +static void remove_ticket_handler(struct ceph_auth_client *ac, + struct ceph_x_ticket_handler *th) +{ + struct ceph_x_info *xi = ac->private; + + dout("remove_ticket_handler %p %d\n", th, th->service); + rb_erase(&th->node, &xi->ticket_handlers); + ceph_crypto_key_destroy(&th->session_key); + if (th->ticket_blob) + 
ceph_buffer_put(th->ticket_blob); + kfree(th); +} + +static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, + struct ceph_crypto_key *secret, + void *buf, void *end) +{ + struct ceph_x_info *xi = ac->private; + int num; + void *p = buf; + int ret; + char *dbuf; + char *ticket_buf; + u8 reply_struct_v; + + dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); + if (!dbuf) + return -ENOMEM; + + ret = -ENOMEM; + ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); + if (!ticket_buf) + goto out_dbuf; + + ceph_decode_need(&p, end, 1 + sizeof(u32), bad); + reply_struct_v = ceph_decode_8(&p); + if (reply_struct_v != 1) + goto bad; + num = ceph_decode_32(&p); + dout("%d tickets\n", num); + while (num--) { + int type; + u8 tkt_struct_v, blob_struct_v; + struct ceph_x_ticket_handler *th; + void *dp, *dend; + int dlen; + char is_enc; + struct timespec validity; + struct ceph_crypto_key old_key; + void *tp, *tpend; + struct ceph_timespec new_validity; + struct ceph_crypto_key new_session_key; + struct ceph_buffer *new_ticket_blob; + unsigned long new_expires, new_renew_after; + u64 new_secret_id; + + ceph_decode_need(&p, end, sizeof(u32) + 1, bad); + + type = ceph_decode_32(&p); + dout(" ticket type %d %s\n", type, ceph_entity_type_name(type)); + + tkt_struct_v = ceph_decode_8(&p); + if (tkt_struct_v != 1) + goto bad; + + th = get_ticket_handler(ac, type); + if (IS_ERR(th)) { + ret = PTR_ERR(th); + goto out; + } + + /* blob for me */ + dlen = ceph_x_decrypt(secret, &p, end, dbuf, + TEMP_TICKET_BUF_LEN); + if (dlen <= 0) { + ret = dlen; + goto out; + } + dout(" decrypted %d bytes\n", dlen); + dend = dbuf + dlen; + dp = dbuf; + + tkt_struct_v = ceph_decode_8(&dp); + if (tkt_struct_v != 1) + goto bad; + + memcpy(&old_key, &th->session_key, sizeof(old_key)); + ret = ceph_crypto_key_decode(&new_session_key, &dp, dend); + if (ret) + goto out; + + ceph_decode_copy(&dp, &new_validity, sizeof(new_validity)); + ceph_decode_timespec(&validity, &new_validity); + new_expires = get_seconds() + validity.tv_sec; + new_renew_after = new_expires - (validity.tv_sec / 4); + dout(" expires=%lu renew_after=%lu\n", new_expires, + new_renew_after); + + /* ticket blob for service */ + ceph_decode_8_safe(&p, end, is_enc, bad); + tp = ticket_buf; + if (is_enc) { + /* encrypted */ + dout(" encrypted ticket\n"); + dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf, + TEMP_TICKET_BUF_LEN); + if (dlen < 0) { + ret = dlen; + goto out; + } + dlen = ceph_decode_32(&tp); + } else { + /* unencrypted */ + ceph_decode_32_safe(&p, end, dlen, bad); + ceph_decode_need(&p, end, dlen, bad); + ceph_decode_copy(&p, ticket_buf, dlen); + } + tpend = tp + dlen; + dout(" ticket blob is %d bytes\n", dlen); + ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad); + blob_struct_v = ceph_decode_8(&tp); + new_secret_id = ceph_decode_64(&tp); + ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend); + if (ret) + goto out; + + /* all is well, update our ticket */ + ceph_crypto_key_destroy(&th->session_key); + if (th->ticket_blob) + ceph_buffer_put(th->ticket_blob); + th->session_key = new_session_key; + th->ticket_blob = new_ticket_blob; + th->validity = new_validity; + th->secret_id = new_secret_id; + th->expires = new_expires; + th->renew_after = new_renew_after; + dout(" got ticket service %d (%s) secret_id %lld len %d\n", + type, ceph_entity_type_name(type), th->secret_id, + (int)th->ticket_blob->vec.iov_len); + xi->have_keys |= th->service; + } + + ret = 0; +out: + kfree(ticket_buf); +out_dbuf: + kfree(dbuf); + return ret; + +bad: + ret = -EINVAL; + 
goto out; +} + +static int ceph_x_build_authorizer(struct ceph_auth_client *ac, + struct ceph_x_ticket_handler *th, + struct ceph_x_authorizer *au) +{ + int maxlen; + struct ceph_x_authorize_a *msg_a; + struct ceph_x_authorize_b msg_b; + void *p, *end; + int ret; + int ticket_blob_len = + (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0); + + dout("build_authorizer for %s %p\n", + ceph_entity_type_name(th->service), au); + + maxlen = sizeof(*msg_a) + sizeof(msg_b) + + ceph_x_encrypt_buflen(ticket_blob_len); + dout(" need len %d\n", maxlen); + if (au->buf && au->buf->alloc_len < maxlen) { + ceph_buffer_put(au->buf); + au->buf = NULL; + } + if (!au->buf) { + au->buf = ceph_buffer_new(maxlen, GFP_NOFS); + if (!au->buf) + return -ENOMEM; + } + au->service = th->service; + + msg_a = au->buf->vec.iov_base; + msg_a->struct_v = 1; + msg_a->global_id = cpu_to_le64(ac->global_id); + msg_a->service_id = cpu_to_le32(th->service); + msg_a->ticket_blob.struct_v = 1; + msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id); + msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len); + if (ticket_blob_len) { + memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base, + th->ticket_blob->vec.iov_len); + } + dout(" th %p secret_id %lld %lld\n", th, th->secret_id, + le64_to_cpu(msg_a->ticket_blob.secret_id)); + + p = msg_a + 1; + p += ticket_blob_len; + end = au->buf->vec.iov_base + au->buf->vec.iov_len; + + get_random_bytes(&au->nonce, sizeof(au->nonce)); + msg_b.struct_v = 1; + msg_b.nonce = cpu_to_le64(au->nonce); + ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b), + p, end - p); + if (ret < 0) + goto out_buf; + p += ret; + au->buf->vec.iov_len = p - au->buf->vec.iov_base; + dout(" built authorizer nonce %llx len %d\n", au->nonce, + (int)au->buf->vec.iov_len); + BUG_ON(au->buf->vec.iov_len > maxlen); + return 0; + +out_buf: + ceph_buffer_put(au->buf); + au->buf = NULL; + return ret; +} + +static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th, + void **p, void *end) +{ + ceph_decode_need(p, end, 1 + sizeof(u64), bad); + ceph_encode_8(p, 1); + ceph_encode_64(p, th->secret_id); + if (th->ticket_blob) { + const char *buf = th->ticket_blob->vec.iov_base; + u32 len = th->ticket_blob->vec.iov_len; + + ceph_encode_32_safe(p, end, len, bad); + ceph_encode_copy_safe(p, end, buf, len, bad); + } else { + ceph_encode_32_safe(p, end, 0, bad); + } + + return 0; +bad: + return -ERANGE; +} + +static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) +{ + int want = ac->want_keys; + struct ceph_x_info *xi = ac->private; + int service; + + *pneed = ac->want_keys & ~(xi->have_keys); + + for (service = 1; service <= want; service <<= 1) { + struct ceph_x_ticket_handler *th; + + if (!(ac->want_keys & service)) + continue; + + if (*pneed & service) + continue; + + th = get_ticket_handler(ac, service); + + if (IS_ERR(th)) { + *pneed |= service; + continue; + } + + if (get_seconds() >= th->renew_after) + *pneed |= service; + if (get_seconds() >= th->expires) + xi->have_keys &= ~service; + } +} + + +static int ceph_x_build_request(struct ceph_auth_client *ac, + void *buf, void *end) +{ + struct ceph_x_info *xi = ac->private; + int need; + struct ceph_x_request_header *head = buf; + int ret; + struct ceph_x_ticket_handler *th = + get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); + + if (IS_ERR(th)) + return PTR_ERR(th); + + ceph_x_validate_tickets(ac, &need); + + dout("build_request want %x have %x need %x\n", + ac->want_keys, xi->have_keys, need); + + if (need & 
CEPH_ENTITY_TYPE_AUTH) { + struct ceph_x_authenticate *auth = (void *)(head + 1); + void *p = auth + 1; + struct ceph_x_challenge_blob tmp; + char tmp_enc[40]; + u64 *u; + + if (p > end) + return -ERANGE; + + dout(" get_auth_session_key\n"); + head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY); + + /* encrypt and hash */ + get_random_bytes(&auth->client_challenge, sizeof(u64)); + tmp.client_challenge = auth->client_challenge; + tmp.server_challenge = cpu_to_le64(xi->server_challenge); + ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp), + tmp_enc, sizeof(tmp_enc)); + if (ret < 0) + return ret; + + auth->struct_v = 1; + auth->key = 0; + for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++) + auth->key ^= *(__le64 *)u; + dout(" server_challenge %llx client_challenge %llx key %llx\n", + xi->server_challenge, le64_to_cpu(auth->client_challenge), + le64_to_cpu(auth->key)); + + /* now encode the old ticket if exists */ + ret = ceph_x_encode_ticket(th, &p, end); + if (ret < 0) + return ret; + + return p - buf; + } + + if (need) { + void *p = head + 1; + struct ceph_x_service_ticket_request *req; + + if (p > end) + return -ERANGE; + head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY); + + ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer); + if (ret) + return ret; + ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base, + xi->auth_authorizer.buf->vec.iov_len); + + req = p; + req->keys = cpu_to_le32(need); + p += sizeof(*req); + return p - buf; + } + + return 0; +} + +static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, + void *buf, void *end) +{ + struct ceph_x_info *xi = ac->private; + struct ceph_x_reply_header *head = buf; + struct ceph_x_ticket_handler *th; + int len = end - buf; + int op; + int ret; + + if (result) + return result; /* XXX hmm? 
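+ * (a nonzero result from the monitor is simply propagated back
+ * to the caller here)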
*/ + + if (xi->starting) { + /* it's a hello */ + struct ceph_x_server_challenge *sc = buf; + + if (len != sizeof(*sc)) + return -EINVAL; + xi->server_challenge = le64_to_cpu(sc->server_challenge); + dout("handle_reply got server challenge %llx\n", + xi->server_challenge); + xi->starting = false; + xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH; + return -EAGAIN; + } + + op = le16_to_cpu(head->op); + result = le32_to_cpu(head->result); + dout("handle_reply op %d result %d\n", op, result); + switch (op) { + case CEPHX_GET_AUTH_SESSION_KEY: + /* verify auth key */ + ret = ceph_x_proc_ticket_reply(ac, &xi->secret, + buf + sizeof(*head), end); + break; + + case CEPHX_GET_PRINCIPAL_SESSION_KEY: + th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); + if (IS_ERR(th)) + return PTR_ERR(th); + ret = ceph_x_proc_ticket_reply(ac, &th->session_key, + buf + sizeof(*head), end); + break; + + default: + return -EINVAL; + } + if (ret) + return ret; + if (ac->want_keys == xi->have_keys) + return 0; + return -EAGAIN; +} + +static int ceph_x_create_authorizer( + struct ceph_auth_client *ac, int peer_type, + struct ceph_authorizer **a, + void **buf, size_t *len, + void **reply_buf, size_t *reply_len) +{ + struct ceph_x_authorizer *au; + struct ceph_x_ticket_handler *th; + int ret; + + th = get_ticket_handler(ac, peer_type); + if (IS_ERR(th)) + return PTR_ERR(th); + + au = kzalloc(sizeof(*au), GFP_NOFS); + if (!au) + return -ENOMEM; + + ret = ceph_x_build_authorizer(ac, th, au); + if (ret) { + kfree(au); + return ret; + } + + *a = (struct ceph_authorizer *)au; + *buf = au->buf->vec.iov_base; + *len = au->buf->vec.iov_len; + *reply_buf = au->reply_buf; + *reply_len = sizeof(au->reply_buf); + return 0; +} + +static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, + struct ceph_authorizer *a, size_t len) +{ + struct ceph_x_authorizer *au = (void *)a; + struct ceph_x_ticket_handler *th; + int ret = 0; + struct ceph_x_authorize_reply reply; + void *p = au->reply_buf; + void *end = p + sizeof(au->reply_buf); + + th = get_ticket_handler(ac, au->service); + if (IS_ERR(th)) + return PTR_ERR(th); + ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply)); + if (ret < 0) + return ret; + if (ret != sizeof(reply)) + return -EPERM; + + if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one)) + ret = -EPERM; + else + ret = 0; + dout("verify_authorizer_reply nonce %llx got %llx ret %d\n", + au->nonce, le64_to_cpu(reply.nonce_plus_one), ret); + return ret; +} + +static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac, + struct ceph_authorizer *a) +{ + struct ceph_x_authorizer *au = (void *)a; + + ceph_buffer_put(au->buf); + kfree(au); +} + + +static void ceph_x_reset(struct ceph_auth_client *ac) +{ + struct ceph_x_info *xi = ac->private; + + dout("reset\n"); + xi->starting = true; + xi->server_challenge = 0; +} + +static void ceph_x_destroy(struct ceph_auth_client *ac) +{ + struct ceph_x_info *xi = ac->private; + struct rb_node *p; + + dout("ceph_x_destroy %p\n", ac); + ceph_crypto_key_destroy(&xi->secret); + + while ((p = rb_first(&xi->ticket_handlers)) != NULL) { + struct ceph_x_ticket_handler *th = + rb_entry(p, struct ceph_x_ticket_handler, node); + remove_ticket_handler(ac, th); + } + + if (xi->auth_authorizer.buf) + ceph_buffer_put(xi->auth_authorizer.buf); + + kfree(ac->private); + ac->private = NULL; +} + +static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, + int peer_type) +{ + struct ceph_x_ticket_handler *th; + + th = get_ticket_handler(ac, peer_type); + if 
(!IS_ERR(th)) + remove_ticket_handler(ac, th); +} + + +static const struct ceph_auth_client_ops ceph_x_ops = { + .name = "x", + .is_authenticated = ceph_x_is_authenticated, + .should_authenticate = ceph_x_should_authenticate, + .build_request = ceph_x_build_request, + .handle_reply = ceph_x_handle_reply, + .create_authorizer = ceph_x_create_authorizer, + .verify_authorizer_reply = ceph_x_verify_authorizer_reply, + .destroy_authorizer = ceph_x_destroy_authorizer, + .invalidate_authorizer = ceph_x_invalidate_authorizer, + .reset = ceph_x_reset, + .destroy = ceph_x_destroy, +}; + + +int ceph_x_init(struct ceph_auth_client *ac) +{ + struct ceph_x_info *xi; + int ret; + + dout("ceph_x_init %p\n", ac); + ret = -ENOMEM; + xi = kzalloc(sizeof(*xi), GFP_NOFS); + if (!xi) + goto out; + + ret = -EINVAL; + if (!ac->key) { + pr_err("no secret set (for auth_x protocol)\n"); + goto out_nomem; + } + + ret = ceph_crypto_key_clone(&xi->secret, ac->key); + if (ret < 0) { + pr_err("cannot clone key: %d\n", ret); + goto out_nomem; + } + + xi->starting = true; + xi->ticket_handlers = RB_ROOT; + + ac->protocol = CEPH_AUTH_CEPHX; + ac->private = xi; + ac->ops = &ceph_x_ops; + return 0; + +out_nomem: + kfree(xi); +out: + return ret; +} + + diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h new file mode 100644 index 00000000..e02da7a5 --- /dev/null +++ b/net/ceph/auth_x.h @@ -0,0 +1,50 @@ +#ifndef _FS_CEPH_AUTH_X_H +#define _FS_CEPH_AUTH_X_H + +#include + +#include + +#include "crypto.h" +#include "auth_x_protocol.h" + +/* + * Handle ticket for a single service. + */ +struct ceph_x_ticket_handler { + struct rb_node node; + unsigned service; + + struct ceph_crypto_key session_key; + struct ceph_timespec validity; + + u64 secret_id; + struct ceph_buffer *ticket_blob; + + unsigned long renew_after, expires; +}; + + +struct ceph_x_authorizer { + struct ceph_buffer *buf; + unsigned service; + u64 nonce; + char reply_buf[128]; /* big enough for encrypted blob */ +}; + +struct ceph_x_info { + struct ceph_crypto_key secret; + + bool starting; + u64 server_challenge; + + unsigned have_keys; + struct rb_root ticket_handlers; + + struct ceph_x_authorizer auth_authorizer; +}; + +extern int ceph_x_init(struct ceph_auth_client *ac); + +#endif + diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h new file mode 100644 index 00000000..671d3057 --- /dev/null +++ b/net/ceph/auth_x_protocol.h @@ -0,0 +1,90 @@ +#ifndef __FS_CEPH_AUTH_X_PROTOCOL +#define __FS_CEPH_AUTH_X_PROTOCOL + +#define CEPHX_GET_AUTH_SESSION_KEY 0x0100 +#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200 +#define CEPHX_GET_ROTATING_KEY 0x0400 + +/* common bits */ +struct ceph_x_ticket_blob { + __u8 struct_v; + __le64 secret_id; + __le32 blob_len; + char blob[]; +} __attribute__ ((packed)); + + +/* common request/reply headers */ +struct ceph_x_request_header { + __le16 op; +} __attribute__ ((packed)); + +struct ceph_x_reply_header { + __le16 op; + __le32 result; +} __attribute__ ((packed)); + + +/* authenticate handshake */ + +/* initial hello (no reply header) */ +struct ceph_x_server_challenge { + __u8 struct_v; + __le64 server_challenge; +} __attribute__ ((packed)); + +struct ceph_x_authenticate { + __u8 struct_v; + __le64 client_challenge; + __le64 key; + /* ticket blob */ +} __attribute__ ((packed)); + +struct ceph_x_service_ticket_request { + __u8 struct_v; + __le32 keys; +} __attribute__ ((packed)); + +struct ceph_x_challenge_blob { + __le64 server_challenge; + __le64 client_challenge; +} __attribute__ ((packed)); + + + +/* authorize handshake 
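+ * (the reply proves the peer could decrypt the session-key-encrypted
+ * piece: it returns nonce + 1 in struct ceph_x_authorize_reply, which
+ * ceph_x_verify_authorizer_reply() in auth_x.c checks)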
*/ + +/* + * The authorizer consists of two pieces: + * a - service id, ticket blob + * b - encrypted with session key + */ +struct ceph_x_authorize_a { + __u8 struct_v; + __le64 global_id; + __le32 service_id; + struct ceph_x_ticket_blob ticket_blob; +} __attribute__ ((packed)); + +struct ceph_x_authorize_b { + __u8 struct_v; + __le64 nonce; +} __attribute__ ((packed)); + +struct ceph_x_authorize_reply { + __u8 struct_v; + __le64 nonce_plus_one; +} __attribute__ ((packed)); + + +/* + * encyption bundle + */ +#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull + +struct ceph_x_encrypt_header { + __u8 struct_v; + __le64 magic; +} __attribute__ ((packed)); + +#endif diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c new file mode 100644 index 00000000..bf3e6a13 --- /dev/null +++ b/net/ceph/buffer.c @@ -0,0 +1,68 @@ + +#include + +#include +#include + +#include +#include + +struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) +{ + struct ceph_buffer *b; + + b = kmalloc(sizeof(*b), gfp); + if (!b) + return NULL; + + b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN); + if (b->vec.iov_base) { + b->is_vmalloc = false; + } else { + b->vec.iov_base = __vmalloc(len, gfp | __GFP_HIGHMEM, PAGE_KERNEL); + if (!b->vec.iov_base) { + kfree(b); + return NULL; + } + b->is_vmalloc = true; + } + + kref_init(&b->kref); + b->alloc_len = len; + b->vec.iov_len = len; + dout("buffer_new %p\n", b); + return b; +} +EXPORT_SYMBOL(ceph_buffer_new); + +void ceph_buffer_release(struct kref *kref) +{ + struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref); + + dout("buffer_release %p\n", b); + if (b->vec.iov_base) { + if (b->is_vmalloc) + vfree(b->vec.iov_base); + else + kfree(b->vec.iov_base); + } + kfree(b); +} +EXPORT_SYMBOL(ceph_buffer_release); + +int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end) +{ + size_t len; + + ceph_decode_need(p, end, sizeof(u32), bad); + len = ceph_decode_32(p); + dout("decode_buffer len %d\n", (int)len); + ceph_decode_need(p, end, len, bad); + *b = ceph_buffer_new(len, GFP_NOFS); + if (!*b) + return -ENOMEM; + ceph_decode_copy(p, (*b)->vec.iov_base, len); + return 0; +bad: + return -EINVAL; +} diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c new file mode 100644 index 00000000..132963ab --- /dev/null +++ b/net/ceph/ceph_common.c @@ -0,0 +1,622 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include +#include +#include +#include +#include +#include "crypto.h" + + + +/* + * find filename portion of a path (/foo/bar/baz -> baz) + */ +const char *ceph_file_part(const char *s, int len) +{ + const char *e = s + len; + + while (e != s && *(e-1) != '/') + e--; + return e; +} +EXPORT_SYMBOL(ceph_file_part); + +const char *ceph_msg_type_name(int type) +{ + switch (type) { + case CEPH_MSG_SHUTDOWN: return "shutdown"; + case CEPH_MSG_PING: return "ping"; + case CEPH_MSG_AUTH: return "auth"; + case CEPH_MSG_AUTH_REPLY: return "auth_reply"; + case CEPH_MSG_MON_MAP: return "mon_map"; + case CEPH_MSG_MON_GET_MAP: return "mon_get_map"; + case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe"; + case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack"; + case CEPH_MSG_STATFS: return "statfs"; + case CEPH_MSG_STATFS_REPLY: return "statfs_reply"; + case CEPH_MSG_MDS_MAP: return "mds_map"; + case CEPH_MSG_CLIENT_SESSION: return "client_session"; + case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect"; + case CEPH_MSG_CLIENT_REQUEST: return 
"client_request"; + case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward"; + case CEPH_MSG_CLIENT_REPLY: return "client_reply"; + case CEPH_MSG_CLIENT_CAPS: return "client_caps"; + case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release"; + case CEPH_MSG_CLIENT_SNAP: return "client_snap"; + case CEPH_MSG_CLIENT_LEASE: return "client_lease"; + case CEPH_MSG_OSD_MAP: return "osd_map"; + case CEPH_MSG_OSD_OP: return "osd_op"; + case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; + case CEPH_MSG_WATCH_NOTIFY: return "watch_notify"; + default: return "unknown"; + } +} +EXPORT_SYMBOL(ceph_msg_type_name); + +/* + * Initially learn our fsid, or verify an fsid matches. + */ +int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) +{ + if (client->have_fsid) { + if (ceph_fsid_compare(&client->fsid, fsid)) { + pr_err("bad fsid, had %pU got %pU", + &client->fsid, fsid); + return -1; + } + } else { + pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid); + memcpy(&client->fsid, fsid, sizeof(*fsid)); + ceph_debugfs_client_init(client); + client->have_fsid = true; + } + return 0; +} +EXPORT_SYMBOL(ceph_check_fsid); + +static int strcmp_null(const char *s1, const char *s2) +{ + if (!s1 && !s2) + return 0; + if (s1 && !s2) + return -1; + if (!s1 && s2) + return 1; + return strcmp(s1, s2); +} + +int ceph_compare_options(struct ceph_options *new_opt, + struct ceph_client *client) +{ + struct ceph_options *opt1 = new_opt; + struct ceph_options *opt2 = client->options; + int ofs = offsetof(struct ceph_options, mon_addr); + int i; + int ret; + + ret = memcmp(opt1, opt2, ofs); + if (ret) + return ret; + + ret = strcmp_null(opt1->name, opt2->name); + if (ret) + return ret; + + if (opt1->key && !opt2->key) + return -1; + if (!opt1->key && opt2->key) + return 1; + if (opt1->key && opt2->key) { + if (opt1->key->type != opt2->key->type) + return -1; + if (opt1->key->created.tv_sec != opt2->key->created.tv_sec) + return -1; + if (opt1->key->created.tv_nsec != opt2->key->created.tv_nsec) + return -1; + if (opt1->key->len != opt2->key->len) + return -1; + if (opt1->key->key && !opt2->key->key) + return -1; + if (!opt1->key->key && opt2->key->key) + return 1; + if (opt1->key->key && opt2->key->key) { + ret = memcmp(opt1->key->key, opt2->key->key, opt1->key->len); + if (ret) + return ret; + } + } + + /* any matching mon ip implies a match */ + for (i = 0; i < opt1->num_mon; i++) { + if (ceph_monmap_contains(client->monc.monmap, + &opt1->mon_addr[i])) + return 0; + } + return -1; +} +EXPORT_SYMBOL(ceph_compare_options); + + +static int parse_fsid(const char *str, struct ceph_fsid *fsid) +{ + int i = 0; + char tmp[3]; + int err = -EINVAL; + int d; + + dout("parse_fsid '%s'\n", str); + tmp[2] = 0; + while (*str && i < 16) { + if (ispunct(*str)) { + str++; + continue; + } + if (!isxdigit(str[0]) || !isxdigit(str[1])) + break; + tmp[0] = str[0]; + tmp[1] = str[1]; + if (sscanf(tmp, "%x", &d) < 1) + break; + fsid->fsid[i] = d & 0xff; + i++; + str += 2; + } + + if (i == 16) + err = 0; + dout("parse_fsid ret %d got fsid %pU", err, fsid); + return err; +} + +/* + * ceph options + */ +enum { + Opt_osdtimeout, + Opt_osdkeepalivetimeout, + Opt_mount_timeout, + Opt_osd_idle_ttl, + Opt_last_int, + /* int args above */ + Opt_fsid, + Opt_name, + Opt_secret, + Opt_key, + Opt_ip, + Opt_last_string, + /* string args above */ + Opt_noshare, + Opt_nocrc, +}; + +static match_table_t opt_tokens = { + {Opt_osdtimeout, "osdtimeout=%d"}, + {Opt_osdkeepalivetimeout, "osdkeepalive=%d"}, + {Opt_mount_timeout, 
"mount_timeout=%d"}, + {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, + /* int args above */ + {Opt_fsid, "fsid=%s"}, + {Opt_name, "name=%s"}, + {Opt_secret, "secret=%s"}, + {Opt_key, "key=%s"}, + {Opt_ip, "ip=%s"}, + /* string args above */ + {Opt_noshare, "noshare"}, + {Opt_nocrc, "nocrc"}, + {-1, NULL} +}; + +void ceph_destroy_options(struct ceph_options *opt) +{ + dout("destroy_options %p\n", opt); + kfree(opt->name); + if (opt->key) { + ceph_crypto_key_destroy(opt->key); + kfree(opt->key); + } + kfree(opt); +} +EXPORT_SYMBOL(ceph_destroy_options); + +/* get secret from key store */ +static int get_secret(struct ceph_crypto_key *dst, const char *name) { + struct key *ukey; + int key_err; + int err = 0; + struct ceph_crypto_key *ckey; + + ukey = request_key(&key_type_ceph, name, NULL); + if (!ukey || IS_ERR(ukey)) { + /* request_key errors don't map nicely to mount(2) + errors; don't even try, but still printk */ + key_err = PTR_ERR(ukey); + switch (key_err) { + case -ENOKEY: + pr_warning("ceph: Mount failed due to key not found: %s\n", name); + break; + case -EKEYEXPIRED: + pr_warning("ceph: Mount failed due to expired key: %s\n", name); + break; + case -EKEYREVOKED: + pr_warning("ceph: Mount failed due to revoked key: %s\n", name); + break; + default: + pr_warning("ceph: Mount failed due to unknown key error" + " %d: %s\n", key_err, name); + } + err = -EPERM; + goto out; + } + + ckey = ukey->payload.data; + err = ceph_crypto_key_clone(dst, ckey); + if (err) + goto out_key; + /* pass through, err is 0 */ + +out_key: + key_put(ukey); +out: + return err; +} + +int ceph_parse_options(struct ceph_options **popt, char *options, + const char *dev_name, const char *dev_name_end, + int (*parse_extra_token)(char *c, void *private), + void *private) +{ + struct ceph_options *opt; + const char *c; + int err = -ENOMEM; + substring_t argstr[MAX_OPT_ARGS]; + + opt = kzalloc(sizeof(*opt), GFP_KERNEL); + if (!opt) + return err; + opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), + GFP_KERNEL); + if (!opt->mon_addr) + goto out; + + dout("parse_options %p options '%s' dev_name '%s'\n", opt, options, + dev_name); + + /* start with defaults */ + opt->flags = CEPH_OPT_DEFAULT; + opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; + opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; + opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ + opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ + + /* get mon ip(s) */ + /* ip1[:port1][,ip2[:port2]...] */ + err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr, + CEPH_MAX_MON, &opt->num_mon); + if (err < 0) + goto out; + + /* parse mount options */ + while ((c = strsep(&options, ",")) != NULL) { + int token, intval, ret; + if (!*c) + continue; + err = -EINVAL; + token = match_token((char *)c, opt_tokens, argstr); + if (token < 0 && parse_extra_token) { + /* extra? 
*/ + err = parse_extra_token((char *)c, private); + if (err < 0) { + pr_err("bad option at '%s'\n", c); + goto out; + } + continue; + } + if (token < Opt_last_int) { + ret = match_int(&argstr[0], &intval); + if (ret < 0) { + pr_err("bad mount option arg (not int) " + "at '%s'\n", c); + continue; + } + dout("got int token %d val %d\n", token, intval); + } else if (token > Opt_last_int && token < Opt_last_string) { + dout("got string token %d val %s\n", token, + argstr[0].from); + } else { + dout("got token %d\n", token); + } + switch (token) { + case Opt_ip: + err = ceph_parse_ips(argstr[0].from, + argstr[0].to, + &opt->my_addr, + 1, NULL); + if (err < 0) + goto out; + opt->flags |= CEPH_OPT_MYIP; + break; + + case Opt_fsid: + err = parse_fsid(argstr[0].from, &opt->fsid); + if (err == 0) + opt->flags |= CEPH_OPT_FSID; + break; + case Opt_name: + opt->name = kstrndup(argstr[0].from, + argstr[0].to-argstr[0].from, + GFP_KERNEL); + break; + case Opt_secret: + opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); + if (!opt->key) { + err = -ENOMEM; + goto out; + } + err = ceph_crypto_key_unarmor(opt->key, argstr[0].from); + if (err < 0) + goto out; + break; + case Opt_key: + opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); + if (!opt->key) { + err = -ENOMEM; + goto out; + } + err = get_secret(opt->key, argstr[0].from); + if (err < 0) + goto out; + break; + + /* misc */ + case Opt_osdtimeout: + opt->osd_timeout = intval; + break; + case Opt_osdkeepalivetimeout: + opt->osd_keepalive_timeout = intval; + break; + case Opt_osd_idle_ttl: + opt->osd_idle_ttl = intval; + break; + case Opt_mount_timeout: + opt->mount_timeout = intval; + break; + + case Opt_noshare: + opt->flags |= CEPH_OPT_NOSHARE; + break; + + case Opt_nocrc: + opt->flags |= CEPH_OPT_NOCRC; + break; + + default: + BUG_ON(token); + } + } + + /* success */ + *popt = opt; + return 0; + +out: + ceph_destroy_options(opt); + return err; +} +EXPORT_SYMBOL(ceph_parse_options); + +u64 ceph_client_id(struct ceph_client *client) +{ + return client->monc.auth->global_id; +} +EXPORT_SYMBOL(ceph_client_id); + +/* + * create a fresh client instance + */ +struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private) +{ + struct ceph_client *client; + int err = -ENOMEM; + + client = kzalloc(sizeof(*client), GFP_KERNEL); + if (client == NULL) + return ERR_PTR(-ENOMEM); + + client->private = private; + client->options = opt; + + mutex_init(&client->mount_mutex); + init_waitqueue_head(&client->auth_wq); + client->auth_err = 0; + + client->extra_mon_dispatch = NULL; + client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT; + client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT; + + client->msgr = NULL; + + /* subsystems */ + err = ceph_monc_init(&client->monc, client); + if (err < 0) + goto fail; + err = ceph_osdc_init(&client->osdc, client); + if (err < 0) + goto fail_monc; + + return client; + +fail_monc: + ceph_monc_stop(&client->monc); +fail: + kfree(client); + return ERR_PTR(err); +} +EXPORT_SYMBOL(ceph_create_client); + +void ceph_destroy_client(struct ceph_client *client) +{ + dout("destroy_client %p\n", client); + + /* unmount */ + ceph_osdc_stop(&client->osdc); + + /* + * make sure osd connections close out before destroying the + * auth module, which is needed to free those connections' + * ceph_authorizers. 
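+ * (hence ceph_msgr_flush() is called below before ceph_monc_stop();
+ * the auth client lives under client->monc)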
+ */ + ceph_msgr_flush(); + + ceph_monc_stop(&client->monc); + + ceph_debugfs_client_cleanup(client); + + if (client->msgr) + ceph_messenger_destroy(client->msgr); + + ceph_destroy_options(client->options); + + kfree(client); + dout("destroy_client %p done\n", client); +} +EXPORT_SYMBOL(ceph_destroy_client); + +/* + * true if we have the mon map (and have thus joined the cluster) + */ +static int have_mon_and_osd_map(struct ceph_client *client) +{ + return client->monc.monmap && client->monc.monmap->epoch && + client->osdc.osdmap && client->osdc.osdmap->epoch; +} + +/* + * mount: join the ceph cluster, and open root directory. + */ +int __ceph_open_session(struct ceph_client *client, unsigned long started) +{ + struct ceph_entity_addr *myaddr = NULL; + int err; + unsigned long timeout = client->options->mount_timeout * HZ; + + /* initialize the messenger */ + if (client->msgr == NULL) { + if (ceph_test_opt(client, MYIP)) + myaddr = &client->options->my_addr; + client->msgr = ceph_messenger_create(myaddr, + client->supported_features, + client->required_features); + if (IS_ERR(client->msgr)) { + client->msgr = NULL; + return PTR_ERR(client->msgr); + } + client->msgr->nocrc = ceph_test_opt(client, NOCRC); + } + + /* open session, and wait for mon and osd maps */ + err = ceph_monc_open_session(&client->monc); + if (err < 0) + return err; + + while (!have_mon_and_osd_map(client)) { + err = -EIO; + if (timeout && time_after_eq(jiffies, started + timeout)) + return err; + + /* wait */ + dout("mount waiting for mon_map\n"); + err = wait_event_interruptible_timeout(client->auth_wq, + have_mon_and_osd_map(client) || (client->auth_err < 0), + timeout); + if (err == -EINTR || err == -ERESTARTSYS) + return err; + if (client->auth_err < 0) + return client->auth_err; + } + + return 0; +} +EXPORT_SYMBOL(__ceph_open_session); + + +int ceph_open_session(struct ceph_client *client) +{ + int ret; + unsigned long started = jiffies; /* note the start time */ + + dout("open_session start\n"); + mutex_lock(&client->mount_mutex); + + ret = __ceph_open_session(client, started); + + mutex_unlock(&client->mount_mutex); + return ret; +} +EXPORT_SYMBOL(ceph_open_session); + + +static int __init init_ceph_lib(void) +{ + int ret = 0; + + ret = ceph_debugfs_init(); + if (ret < 0) + goto out; + + ret = ceph_crypto_init(); + if (ret < 0) + goto out_debugfs; + + ret = ceph_msgr_init(); + if (ret < 0) + goto out_crypto; + + pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n", + CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL, + CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT, + CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT); + + return 0; + +out_crypto: + ceph_crypto_shutdown(); +out_debugfs: + ceph_debugfs_cleanup(); +out: + return ret; +} + +static void __exit exit_ceph_lib(void) +{ + dout("exit_ceph_lib\n"); + ceph_msgr_exit(); + ceph_crypto_shutdown(); + ceph_debugfs_cleanup(); +} + +module_init(init_ceph_lib); +module_exit(exit_ceph_lib); + +MODULE_AUTHOR("Sage Weil "); +MODULE_AUTHOR("Yehuda Sadeh "); +MODULE_AUTHOR("Patience Warnick "); +MODULE_DESCRIPTION("Ceph filesystem for Linux"); +MODULE_LICENSE("GPL"); diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c new file mode 100644 index 00000000..41466ccb --- /dev/null +++ b/net/ceph/ceph_fs.c @@ -0,0 +1,78 @@ +/* + * Some non-inline ceph helpers + */ +#include +#include + +/* + * return true if @layout appears to be valid + */ +int ceph_file_layout_is_valid(const struct ceph_file_layout *layout) +{ + __u32 su = le32_to_cpu(layout->fl_stripe_unit); + __u32 sc = 
le32_to_cpu(layout->fl_stripe_count); + __u32 os = le32_to_cpu(layout->fl_object_size); + + /* stripe unit, object size must be non-zero, 64k increment */ + if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1))) + return 0; + if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1))) + return 0; + /* object size must be a multiple of stripe unit */ + if (os < su || os % su) + return 0; + /* stripe count must be non-zero */ + if (!sc) + return 0; + return 1; +} + + +int ceph_flags_to_mode(int flags) +{ + int mode; + +#ifdef O_DIRECTORY /* fixme */ + if ((flags & O_DIRECTORY) == O_DIRECTORY) + return CEPH_FILE_MODE_PIN; +#endif + + switch (flags & O_ACCMODE) { + case O_WRONLY: + mode = CEPH_FILE_MODE_WR; + break; + case O_RDONLY: + mode = CEPH_FILE_MODE_RD; + break; + case O_RDWR: + case O_ACCMODE: /* this is what the VFS does */ + mode = CEPH_FILE_MODE_RDWR; + break; + } +#ifdef O_LAZY + if (flags & O_LAZY) + mode |= CEPH_FILE_MODE_LAZY; +#endif + + return mode; +} +EXPORT_SYMBOL(ceph_flags_to_mode); + +int ceph_caps_for_mode(int mode) +{ + int caps = CEPH_CAP_PIN; + + if (mode & CEPH_FILE_MODE_RD) + caps |= CEPH_CAP_FILE_SHARED | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE; + if (mode & CEPH_FILE_MODE_WR) + caps |= CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | + CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL | + CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL; + if (mode & CEPH_FILE_MODE_LAZY) + caps |= CEPH_CAP_FILE_LAZYIO; + + return caps; +} +EXPORT_SYMBOL(ceph_caps_for_mode); diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c new file mode 100644 index 00000000..0a1b53bc --- /dev/null +++ b/net/ceph/ceph_hash.c @@ -0,0 +1,121 @@ + +#include +#include + +/* + * Robert Jenkin's hash function. + * http://burtleburtle.net/bob/hash/evahash.html + * This is in the public domain. 
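ceph_file_layout_is_valid() above boils down to three arithmetic rules: stripe unit and object size must be non-zero multiples of CEPH_MIN_STRIPE_UNIT, the object size must be a whole multiple of the stripe unit, and the stripe count must be non-zero. A standalone sketch with sample values, assuming CEPH_MIN_STRIPE_UNIT is 64 KB (65536):

/* sketch of the layout rules; MIN_STRIPE_UNIT == 65536 is an assumption
 * standing in for CEPH_MIN_STRIPE_UNIT. */
#include <stdio.h>
#include <stdint.h>

#define MIN_STRIPE_UNIT 65536u

static int layout_ok(uint32_t su, uint32_t sc, uint32_t os)
{
	if (!su || (su & (MIN_STRIPE_UNIT - 1)))
		return 0;		/* stripe unit: non-zero, 64k aligned */
	if (!os || (os & (MIN_STRIPE_UNIT - 1)))
		return 0;		/* object size: non-zero, 64k aligned */
	if (os < su || os % su)
		return 0;		/* object size: whole multiple of su */
	return sc != 0;			/* at least one stripe */
}

int main(void)
{
	printf("%d\n", layout_ok(65536, 1, 4u << 20));	/* 1: 64k su, 4M objects */
	printf("%d\n", layout_ok(65536, 1, 65536 * 3));	/* 1: os = 3 * su */
	printf("%d\n", layout_ok(4096, 1, 4u << 20));	/* 0: su not 64k aligned */
	printf("%d\n", layout_ok(131072, 1, 65536));	/* 0: os smaller than su */
	return 0;
}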
+ */ +#define mix(a, b, c) \ + do { \ + a = a - b; a = a - c; a = a ^ (c >> 13); \ + b = b - c; b = b - a; b = b ^ (a << 8); \ + c = c - a; c = c - b; c = c ^ (b >> 13); \ + a = a - b; a = a - c; a = a ^ (c >> 12); \ + b = b - c; b = b - a; b = b ^ (a << 16); \ + c = c - a; c = c - b; c = c ^ (b >> 5); \ + a = a - b; a = a - c; a = a ^ (c >> 3); \ + b = b - c; b = b - a; b = b ^ (a << 10); \ + c = c - a; c = c - b; c = c ^ (b >> 15); \ + } while (0) + +unsigned ceph_str_hash_rjenkins(const char *str, unsigned length) +{ + const unsigned char *k = (const unsigned char *)str; + __u32 a, b, c; /* the internal state */ + __u32 len; /* how many key bytes still need mixing */ + + /* Set up the internal state */ + len = length; + a = 0x9e3779b9; /* the golden ratio; an arbitrary value */ + b = a; + c = 0; /* variable initialization of internal state */ + + /* handle most of the key */ + while (len >= 12) { + a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) + + ((__u32)k[3] << 24)); + b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) + + ((__u32)k[7] << 24)); + c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) + + ((__u32)k[11] << 24)); + mix(a, b, c); + k = k + 12; + len = len - 12; + } + + /* handle the last 11 bytes */ + c = c + length; + switch (len) { /* all the case statements fall through */ + case 11: + c = c + ((__u32)k[10] << 24); + case 10: + c = c + ((__u32)k[9] << 16); + case 9: + c = c + ((__u32)k[8] << 8); + /* the first byte of c is reserved for the length */ + case 8: + b = b + ((__u32)k[7] << 24); + case 7: + b = b + ((__u32)k[6] << 16); + case 6: + b = b + ((__u32)k[5] << 8); + case 5: + b = b + k[4]; + case 4: + a = a + ((__u32)k[3] << 24); + case 3: + a = a + ((__u32)k[2] << 16); + case 2: + a = a + ((__u32)k[1] << 8); + case 1: + a = a + k[0]; + /* case 0: nothing left to add */ + } + mix(a, b, c); + + return c; +} + +/* + * linux dcache hash + */ +unsigned ceph_str_hash_linux(const char *str, unsigned length) +{ + unsigned long hash = 0; + unsigned char c; + + while (length--) { + c = *str++; + hash = (hash + (c << 4) + (c >> 4)) * 11; + } + return hash; +} + + +unsigned ceph_str_hash(int type, const char *s, unsigned len) +{ + switch (type) { + case CEPH_STR_HASH_LINUX: + return ceph_str_hash_linux(s, len); + case CEPH_STR_HASH_RJENKINS: + return ceph_str_hash_rjenkins(s, len); + default: + return -1; + } +} +EXPORT_SYMBOL(ceph_str_hash); + +const char *ceph_str_hash_name(int type) +{ + switch (type) { + case CEPH_STR_HASH_LINUX: + return "linux"; + case CEPH_STR_HASH_RJENKINS: + return "rjenkins"; + default: + return "unknown"; + } +} +EXPORT_SYMBOL(ceph_str_hash_name); diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c new file mode 100644 index 00000000..3fbda04d --- /dev/null +++ b/net/ceph/ceph_strings.c @@ -0,0 +1,84 @@ +/* + * Ceph string constants + */ +#include +#include + +const char *ceph_entity_type_name(int type) +{ + switch (type) { + case CEPH_ENTITY_TYPE_MDS: return "mds"; + case CEPH_ENTITY_TYPE_OSD: return "osd"; + case CEPH_ENTITY_TYPE_MON: return "mon"; + case CEPH_ENTITY_TYPE_CLIENT: return "client"; + case CEPH_ENTITY_TYPE_AUTH: return "auth"; + default: return "unknown"; + } +} + +const char *ceph_osd_op_name(int op) +{ + switch (op) { + case CEPH_OSD_OP_READ: return "read"; + case CEPH_OSD_OP_STAT: return "stat"; + + case CEPH_OSD_OP_MASKTRUNC: return "masktrunc"; + + case CEPH_OSD_OP_WRITE: return "write"; + case CEPH_OSD_OP_DELETE: return "delete"; + case CEPH_OSD_OP_TRUNCATE: return "truncate"; + case 
CEPH_OSD_OP_ZERO: return "zero"; + case CEPH_OSD_OP_WRITEFULL: return "writefull"; + case CEPH_OSD_OP_ROLLBACK: return "rollback"; + + case CEPH_OSD_OP_APPEND: return "append"; + case CEPH_OSD_OP_STARTSYNC: return "startsync"; + case CEPH_OSD_OP_SETTRUNC: return "settrunc"; + case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc"; + + case CEPH_OSD_OP_TMAPUP: return "tmapup"; + case CEPH_OSD_OP_TMAPGET: return "tmapget"; + case CEPH_OSD_OP_TMAPPUT: return "tmapput"; + + case CEPH_OSD_OP_GETXATTR: return "getxattr"; + case CEPH_OSD_OP_GETXATTRS: return "getxattrs"; + case CEPH_OSD_OP_SETXATTR: return "setxattr"; + case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; + case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; + case CEPH_OSD_OP_RMXATTR: return "rmxattr"; + case CEPH_OSD_OP_CMPXATTR: return "cmpxattr"; + + case CEPH_OSD_OP_PULL: return "pull"; + case CEPH_OSD_OP_PUSH: return "push"; + case CEPH_OSD_OP_BALANCEREADS: return "balance-reads"; + case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads"; + case CEPH_OSD_OP_SCRUB: return "scrub"; + + case CEPH_OSD_OP_WRLOCK: return "wrlock"; + case CEPH_OSD_OP_WRUNLOCK: return "wrunlock"; + case CEPH_OSD_OP_RDLOCK: return "rdlock"; + case CEPH_OSD_OP_RDUNLOCK: return "rdunlock"; + case CEPH_OSD_OP_UPLOCK: return "uplock"; + case CEPH_OSD_OP_DNLOCK: return "dnlock"; + + case CEPH_OSD_OP_CALL: return "call"; + + case CEPH_OSD_OP_PGLS: return "pgls"; + } + return "???"; +} + + +const char *ceph_pool_op_name(int op) +{ + switch (op) { + case POOL_OP_CREATE: return "create"; + case POOL_OP_DELETE: return "delete"; + case POOL_OP_AUID_CHANGE: return "auid change"; + case POOL_OP_CREATE_SNAP: return "create snap"; + case POOL_OP_DELETE_SNAP: return "delete snap"; + case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap"; + case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap"; + } + return "???"; +} diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c new file mode 100644 index 00000000..d6ebb13a --- /dev/null +++ b/net/ceph/crush/crush.c @@ -0,0 +1,151 @@ + +#ifdef __KERNEL__ +# include +#else +# include +# include +# define kfree(x) do { if (x) free(x); } while (0) +# define BUG_ON(x) assert(!(x)) +#endif + +#include + +const char *crush_bucket_alg_name(int alg) +{ + switch (alg) { + case CRUSH_BUCKET_UNIFORM: return "uniform"; + case CRUSH_BUCKET_LIST: return "list"; + case CRUSH_BUCKET_TREE: return "tree"; + case CRUSH_BUCKET_STRAW: return "straw"; + default: return "unknown"; + } +} + +/** + * crush_get_bucket_item_weight - Get weight of an item in given bucket + * @b: bucket pointer + * @p: item index in bucket + */ +int crush_get_bucket_item_weight(struct crush_bucket *b, int p) +{ + if (p >= b->size) + return 0; + + switch (b->alg) { + case CRUSH_BUCKET_UNIFORM: + return ((struct crush_bucket_uniform *)b)->item_weight; + case CRUSH_BUCKET_LIST: + return ((struct crush_bucket_list *)b)->item_weights[p]; + case CRUSH_BUCKET_TREE: + if (p & 1) + return ((struct crush_bucket_tree *)b)->node_weights[p]; + return 0; + case CRUSH_BUCKET_STRAW: + return ((struct crush_bucket_straw *)b)->item_weights[p]; + } + return 0; +} + +/** + * crush_calc_parents - Calculate parent vectors for the given crush map. 
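The weights handled by crush_get_bucket_item_weight() above are effectively 16.16 fixed point, so 0x10000 represents a weight of 1.0; that is also why is_out() further down compares against 0x10000 and against 16-bit hash draws. A small conversion sketch (the helper names are mine, not part of the patch):

/* sketch: CRUSH-style 16.16 fixed-point weights, 0x10000 == 1.0 */
#include <stdio.h>
#include <stdint.h>

static uint32_t to_fixed(double w)	{ return (uint32_t)(w * 0x10000); }
static double from_fixed(uint32_t f)	{ return (double)f / 0x10000; }

int main(void)
{
	uint32_t full = to_fixed(1.0);	/* 0x10000: never rejected as "out" */
	uint32_t half = to_fixed(0.5);	/* 0x08000: rejected about half the time */

	printf("1.0 -> 0x%x -> %.2f\n", (unsigned)full, from_fixed(full));
	printf("0.5 -> 0x%x -> %.2f\n", (unsigned)half, from_fixed(half));
	return 0;
}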
+ * @map: crush_map pointer + */ +void crush_calc_parents(struct crush_map *map) +{ + int i, b, c; + + for (b = 0; b < map->max_buckets; b++) { + if (map->buckets[b] == NULL) + continue; + for (i = 0; i < map->buckets[b]->size; i++) { + c = map->buckets[b]->items[i]; + BUG_ON(c >= map->max_devices || + c < -map->max_buckets); + if (c >= 0) + map->device_parents[c] = map->buckets[b]->id; + else + map->bucket_parents[-1-c] = map->buckets[b]->id; + } + } +} + +void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) +{ + kfree(b->h.perm); + kfree(b->h.items); + kfree(b); +} + +void crush_destroy_bucket_list(struct crush_bucket_list *b) +{ + kfree(b->item_weights); + kfree(b->sum_weights); + kfree(b->h.perm); + kfree(b->h.items); + kfree(b); +} + +void crush_destroy_bucket_tree(struct crush_bucket_tree *b) +{ + kfree(b->node_weights); + kfree(b); +} + +void crush_destroy_bucket_straw(struct crush_bucket_straw *b) +{ + kfree(b->straws); + kfree(b->item_weights); + kfree(b->h.perm); + kfree(b->h.items); + kfree(b); +} + +void crush_destroy_bucket(struct crush_bucket *b) +{ + switch (b->alg) { + case CRUSH_BUCKET_UNIFORM: + crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b); + break; + case CRUSH_BUCKET_LIST: + crush_destroy_bucket_list((struct crush_bucket_list *)b); + break; + case CRUSH_BUCKET_TREE: + crush_destroy_bucket_tree((struct crush_bucket_tree *)b); + break; + case CRUSH_BUCKET_STRAW: + crush_destroy_bucket_straw((struct crush_bucket_straw *)b); + break; + } +} + +/** + * crush_destroy - Destroy a crush_map + * @map: crush_map pointer + */ +void crush_destroy(struct crush_map *map) +{ + int b; + + /* buckets */ + if (map->buckets) { + for (b = 0; b < map->max_buckets; b++) { + if (map->buckets[b] == NULL) + continue; + crush_destroy_bucket(map->buckets[b]); + } + kfree(map->buckets); + } + + /* rules */ + if (map->rules) { + for (b = 0; b < map->max_rules; b++) + kfree(map->rules[b]); + kfree(map->rules); + } + + kfree(map->bucket_parents); + kfree(map->device_parents); + kfree(map); +} + + diff --git a/net/ceph/crush/hash.c b/net/ceph/crush/hash.c new file mode 100644 index 00000000..5bb63e37 --- /dev/null +++ b/net/ceph/crush/hash.c @@ -0,0 +1,149 @@ + +#include +#include + +/* + * Robert Jenkins' function for mixing 32-bit values + * http://burtleburtle.net/bob/hash/evahash.html + * a, b = random bits, c = input and output + */ +#define crush_hashmix(a, b, c) do { \ + a = a-b; a = a-c; a = a^(c>>13); \ + b = b-c; b = b-a; b = b^(a<<8); \ + c = c-a; c = c-b; c = c^(b>>13); \ + a = a-b; a = a-c; a = a^(c>>12); \ + b = b-c; b = b-a; b = b^(a<<16); \ + c = c-a; c = c-b; c = c^(b>>5); \ + a = a-b; a = a-c; a = a^(c>>3); \ + b = b-c; b = b-a; b = b^(a<<10); \ + c = c-a; c = c-b; c = c^(b>>15); \ + } while (0) + +#define crush_hash_seed 1315423911 + +static __u32 crush_hash32_rjenkins1(__u32 a) +{ + __u32 hash = crush_hash_seed ^ a; + __u32 b = a; + __u32 x = 231232; + __u32 y = 1232; + crush_hashmix(b, x, hash); + crush_hashmix(y, a, hash); + return hash; +} + +static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b) +{ + __u32 hash = crush_hash_seed ^ a ^ b; + __u32 x = 231232; + __u32 y = 1232; + crush_hashmix(a, b, hash); + crush_hashmix(x, a, hash); + crush_hashmix(b, y, hash); + return hash; +} + +static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c) +{ + __u32 hash = crush_hash_seed ^ a ^ b ^ c; + __u32 x = 231232; + __u32 y = 1232; + crush_hashmix(a, b, hash); + crush_hashmix(c, x, hash); + crush_hashmix(y, a, hash); + crush_hashmix(b, x, hash); + 
crush_hashmix(y, c, hash); + return hash; +} + +static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d) +{ + __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d; + __u32 x = 231232; + __u32 y = 1232; + crush_hashmix(a, b, hash); + crush_hashmix(c, d, hash); + crush_hashmix(a, x, hash); + crush_hashmix(y, b, hash); + crush_hashmix(c, x, hash); + crush_hashmix(y, d, hash); + return hash; +} + +static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d, + __u32 e) +{ + __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e; + __u32 x = 231232; + __u32 y = 1232; + crush_hashmix(a, b, hash); + crush_hashmix(c, d, hash); + crush_hashmix(e, x, hash); + crush_hashmix(y, a, hash); + crush_hashmix(b, x, hash); + crush_hashmix(y, c, hash); + crush_hashmix(d, x, hash); + crush_hashmix(y, e, hash); + return hash; +} + + +__u32 crush_hash32(int type, __u32 a) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return crush_hash32_rjenkins1(a); + default: + return 0; + } +} + +__u32 crush_hash32_2(int type, __u32 a, __u32 b) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return crush_hash32_rjenkins1_2(a, b); + default: + return 0; + } +} + +__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return crush_hash32_rjenkins1_3(a, b, c); + default: + return 0; + } +} + +__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return crush_hash32_rjenkins1_4(a, b, c, d); + default: + return 0; + } +} + +__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return crush_hash32_rjenkins1_5(a, b, c, d, e); + default: + return 0; + } +} + +const char *crush_hash_name(int type) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return "rjenkins1"; + default: + return "unknown"; + } +} diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c new file mode 100644 index 00000000..42599e31 --- /dev/null +++ b/net/ceph/crush/mapper.c @@ -0,0 +1,609 @@ + +#ifdef __KERNEL__ +# include +# include +# include +# include +# ifndef dprintk +# define dprintk(args...) +# endif +#else +# include +# include +# include +# include +# define BUG_ON(x) assert(!(x)) +# define dprintk(args...) /* printf(args) */ +# define kmalloc(x, f) malloc(x) +# define kfree(x) free(x) +#endif + +#include +#include + +/* + * Implement the core CRUSH mapping algorithm. + */ + +/** + * crush_find_rule - find a crush_rule id for a given ruleset, type, and size. + * @map: the crush_map + * @ruleset: the storage ruleset id (user defined) + * @type: storage ruleset type (user defined) + * @size: output set size + */ +int crush_find_rule(struct crush_map *map, int ruleset, int type, int size) +{ + int i; + + for (i = 0; i < map->max_rules; i++) { + if (map->rules[i] && + map->rules[i]->mask.ruleset == ruleset && + map->rules[i]->mask.type == type && + map->rules[i]->mask.min_size <= size && + map->rules[i]->mask.max_size >= size) + return i; + } + return -1; +} + + +/* + * bucket choose methods + * + * For each bucket algorithm, we have a "choose" method that, given a + * crush input @x and replica position (usually, position in output set) @r, + * will produce an item in the bucket. + */ + +/* + * Choose based on a random permutation of the bucket. + * + * We used to use some prime number arithmetic to do this, but it + * wasn't very random, and had some other bad behaviors. 
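Every crush_hash32_*() variant above is a pure function of its integer inputs and the fixed seed, which is what lets every client compute identical placements independently. A userspace restatement of the two-input variant, copied from the macro and constants above (a sketch, not the kernel build itself):

/* sketch: standalone copy of the rjenkins1 two-input hash, showing that
 * the same inputs always map to the same output. */
#include <stdio.h>
#include <stdint.h>

#define hashmix(a, b, c) do {				\
	a = a-b; a = a-c; a = a^(c>>13);		\
	b = b-c; b = b-a; b = b^(a<<8);			\
	c = c-a; c = c-b; c = c^(b>>13);		\
	a = a-b; a = a-c; a = a^(c>>12);		\
	b = b-c; b = b-a; b = b^(a<<16);		\
	c = c-a; c = c-b; c = c^(b>>5);			\
	a = a-b; a = a-c; a = a^(c>>3);			\
	b = b-c; b = b-a; b = b^(a<<10);		\
	c = c-a; c = c-b; c = c^(b>>15);		\
} while (0)

#define HASH_SEED 1315423911u

static uint32_t hash32_2(uint32_t a, uint32_t b)
{
	uint32_t hash = HASH_SEED ^ a ^ b;
	uint32_t x = 231232, y = 1232;

	hashmix(a, b, hash);
	hashmix(x, a, hash);
	hashmix(b, y, hash);
	return hash;
}

int main(void)
{
	/* same (x, item) pair -> same draw, on any host */
	printf("0x%08x\n", (unsigned)hash32_2(12345, 7));
	printf("0x%08x\n", (unsigned)hash32_2(12345, 7));
	return 0;
}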
Instead, we + * calculate an actual random permutation of the bucket members. + * Since this is expensive, we optimize for the r=0 case, which + * captures the vast majority of calls. + */ +static int bucket_perm_choose(struct crush_bucket *bucket, + int x, int r) +{ + unsigned pr = r % bucket->size; + unsigned i, s; + + /* start a new permutation if @x has changed */ + if (bucket->perm_x != x || bucket->perm_n == 0) { + dprintk("bucket %d new x=%d\n", bucket->id, x); + bucket->perm_x = x; + + /* optimize common r=0 case */ + if (pr == 0) { + s = crush_hash32_3(bucket->hash, x, bucket->id, 0) % + bucket->size; + bucket->perm[0] = s; + bucket->perm_n = 0xffff; /* magic value, see below */ + goto out; + } + + for (i = 0; i < bucket->size; i++) + bucket->perm[i] = i; + bucket->perm_n = 0; + } else if (bucket->perm_n == 0xffff) { + /* clean up after the r=0 case above */ + for (i = 1; i < bucket->size; i++) + bucket->perm[i] = i; + bucket->perm[bucket->perm[0]] = 0; + bucket->perm_n = 1; + } + + /* calculate permutation up to pr */ + for (i = 0; i < bucket->perm_n; i++) + dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]); + while (bucket->perm_n <= pr) { + unsigned p = bucket->perm_n; + /* no point in swapping the final entry */ + if (p < bucket->size - 1) { + i = crush_hash32_3(bucket->hash, x, bucket->id, p) % + (bucket->size - p); + if (i) { + unsigned t = bucket->perm[p + i]; + bucket->perm[p + i] = bucket->perm[p]; + bucket->perm[p] = t; + } + dprintk(" perm_choose swap %d with %d\n", p, p+i); + } + bucket->perm_n++; + } + for (i = 0; i < bucket->size; i++) + dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]); + + s = bucket->perm[pr]; +out: + dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id, + bucket->size, x, r, pr, s); + return bucket->items[s]; +} + +/* uniform */ +static int bucket_uniform_choose(struct crush_bucket_uniform *bucket, + int x, int r) +{ + return bucket_perm_choose(&bucket->h, x, r); +} + +/* list */ +static int bucket_list_choose(struct crush_bucket_list *bucket, + int x, int r) +{ + int i; + + for (i = bucket->h.size-1; i >= 0; i--) { + __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i], + r, bucket->h.id); + w &= 0xffff; + dprintk("list_choose i=%d x=%d r=%d item %d weight %x " + "sw %x rand %llx", + i, x, r, bucket->h.items[i], bucket->item_weights[i], + bucket->sum_weights[i], w); + w *= bucket->sum_weights[i]; + w = w >> 16; + /*dprintk(" scaled %llx\n", w);*/ + if (w < bucket->item_weights[i]) + return bucket->h.items[i]; + } + + BUG_ON(1); + return 0; +} + + +/* (binary) tree */ +static int height(int n) +{ + int h = 0; + while ((n & 1) == 0) { + h++; + n = n >> 1; + } + return h; +} + +static int left(int x) +{ + int h = height(x); + return x - (1 << (h-1)); +} + +static int right(int x) +{ + int h = height(x); + return x + (1 << (h-1)); +} + +static int terminal(int x) +{ + return x & 1; +} + +static int bucket_tree_choose(struct crush_bucket_tree *bucket, + int x, int r) +{ + int n, l; + __u32 w; + __u64 t; + + /* start at root */ + n = bucket->num_nodes >> 1; + + while (!terminal(n)) { + /* pick point in [0, w) */ + w = bucket->node_weights[n]; + t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r, + bucket->h.id) * (__u64)w; + t = t >> 32; + + /* descend to the left or right? 
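The tree-bucket helpers above encode the binary tree implicitly in the node index: a node's height is its count of trailing zero bits, leaves sit at the odd indices, and an interior node n has children at n +/- 2^(h-1). A standalone copy of those helpers with a worked example (assuming a 16-slot node array, whose root is node 8):

/* sketch: the node numbering used by bucket_tree_choose(); leaves are
 * odd indices, interior nodes even. */
#include <stdio.h>

static int height(int n)	{ int h = 0; while ((n & 1) == 0) { h++; n >>= 1; } return h; }
static int left(int x)		{ return x - (1 << (height(x) - 1)); }
static int right(int x)		{ return x + (1 << (height(x) - 1)); }
static int terminal(int x)	{ return x & 1; }

int main(void)
{
	int root = 8;	/* num_nodes >> 1 for a 16-slot node array */

	printf("root %d: left %d right %d\n", root, left(root), right(root));
	printf("node 4: left %d right %d\n", left(4), right(4));
	printf("node 1 terminal? %d\n", terminal(1));
	return 0;
}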
*/ + l = left(n); + if (t < bucket->node_weights[l]) + n = l; + else + n = right(n); + } + + return bucket->h.items[n >> 1]; +} + + +/* straw */ + +static int bucket_straw_choose(struct crush_bucket_straw *bucket, + int x, int r) +{ + int i; + int high = 0; + __u64 high_draw = 0; + __u64 draw; + + for (i = 0; i < bucket->h.size; i++) { + draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r); + draw &= 0xffff; + draw *= bucket->straws[i]; + if (i == 0 || draw > high_draw) { + high = i; + high_draw = draw; + } + } + return bucket->h.items[high]; +} + +static int crush_bucket_choose(struct crush_bucket *in, int x, int r) +{ + dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); + switch (in->alg) { + case CRUSH_BUCKET_UNIFORM: + return bucket_uniform_choose((struct crush_bucket_uniform *)in, + x, r); + case CRUSH_BUCKET_LIST: + return bucket_list_choose((struct crush_bucket_list *)in, + x, r); + case CRUSH_BUCKET_TREE: + return bucket_tree_choose((struct crush_bucket_tree *)in, + x, r); + case CRUSH_BUCKET_STRAW: + return bucket_straw_choose((struct crush_bucket_straw *)in, + x, r); + default: + BUG_ON(1); + return in->items[0]; + } +} + +/* + * true if device is marked "out" (failed, fully offloaded) + * of the cluster + */ +static int is_out(struct crush_map *map, __u32 *weight, int item, int x) +{ + if (weight[item] >= 0x10000) + return 0; + if (weight[item] == 0) + return 1; + if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff) + < weight[item]) + return 0; + return 1; +} + +/** + * crush_choose - choose numrep distinct items of given type + * @map: the crush_map + * @bucket: the bucket we are choose an item from + * @x: crush input value + * @numrep: the number of items to choose + * @type: the type of item to choose + * @out: pointer to output vector + * @outpos: our position in that vector + * @firstn: true if choosing "first n" items, false if choosing "indep" + * @recurse_to_leaf: true if we want one device under each item of given type + * @out2: second output vector for leaf items (if @recurse_to_leaf) + */ +static int crush_choose(struct crush_map *map, + struct crush_bucket *bucket, + __u32 *weight, + int x, int numrep, int type, + int *out, int outpos, + int firstn, int recurse_to_leaf, + int *out2) +{ + int rep; + int ftotal, flocal; + int retry_descent, retry_bucket, skip_rep; + struct crush_bucket *in = bucket; + int r; + int i; + int item = 0; + int itemtype; + int collide, reject; + const int orig_tries = 5; /* attempts before we fall back to search */ + + dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? 
"_LEAF" : "", + bucket->id, x, outpos, numrep); + + for (rep = outpos; rep < numrep; rep++) { + /* keep trying until we get a non-out, non-colliding item */ + ftotal = 0; + skip_rep = 0; + do { + retry_descent = 0; + in = bucket; /* initial bucket */ + + /* choose through intervening buckets */ + flocal = 0; + do { + collide = 0; + retry_bucket = 0; + r = rep; + if (in->alg == CRUSH_BUCKET_UNIFORM) { + /* be careful */ + if (firstn || numrep >= in->size) + /* r' = r + f_total */ + r += ftotal; + else if (in->size % numrep == 0) + /* r'=r+(n+1)*f_local */ + r += (numrep+1) * + (flocal+ftotal); + else + /* r' = r + n*f_local */ + r += numrep * (flocal+ftotal); + } else { + if (firstn) + /* r' = r + f_total */ + r += ftotal; + else + /* r' = r + n*f_local */ + r += numrep * (flocal+ftotal); + } + + /* bucket choose */ + if (in->size == 0) { + reject = 1; + goto reject; + } + if (flocal >= (in->size>>1) && + flocal > orig_tries) + item = bucket_perm_choose(in, x, r); + else + item = crush_bucket_choose(in, x, r); + BUG_ON(item >= map->max_devices); + + /* desired type? */ + if (item < 0) + itemtype = map->buckets[-1-item]->type; + else + itemtype = 0; + dprintk(" item %d type %d\n", item, itemtype); + + /* keep going? */ + if (itemtype != type) { + BUG_ON(item >= 0 || + (-1-item) >= map->max_buckets); + in = map->buckets[-1-item]; + retry_bucket = 1; + continue; + } + + /* collision? */ + for (i = 0; i < outpos; i++) { + if (out[i] == item) { + collide = 1; + break; + } + } + + reject = 0; + if (recurse_to_leaf) { + if (item < 0) { + if (crush_choose(map, + map->buckets[-1-item], + weight, + x, outpos+1, 0, + out2, outpos, + firstn, 0, + NULL) <= outpos) + /* didn't get leaf */ + reject = 1; + } else { + /* we already have a leaf! */ + out2[outpos] = item; + } + } + + if (!reject) { + /* out? */ + if (itemtype == 0) + reject = is_out(map, weight, + item, x); + else + reject = 0; + } + +reject: + if (reject || collide) { + ftotal++; + flocal++; + + if (collide && flocal < 3) + /* retry locally a few times */ + retry_bucket = 1; + else if (flocal < in->size + orig_tries) + /* exhaustive bucket search */ + retry_bucket = 1; + else if (ftotal < 20) + /* then retry descent */ + retry_descent = 1; + else + /* else give up */ + skip_rep = 1; + dprintk(" reject %d collide %d " + "ftotal %d flocal %d\n", + reject, collide, ftotal, + flocal); + } + } while (retry_bucket); + } while (retry_descent); + + if (skip_rep) { + dprintk("skip rep\n"); + continue; + } + + dprintk("CHOOSE got %d\n", item); + out[outpos] = item; + outpos++; + } + + dprintk("CHOOSE returns %d\n", outpos); + return outpos; +} + + +/** + * crush_do_rule - calculate a mapping with the given input and rule + * @map: the crush_map + * @ruleno: the rule id + * @x: hash input + * @result: pointer to result vector + * @result_max: maximum result size + * @force: force initial replica choice; -1 for none + */ +int crush_do_rule(struct crush_map *map, + int ruleno, int x, int *result, int result_max, + int force, __u32 *weight) +{ + int result_len; + int force_context[CRUSH_MAX_DEPTH]; + int force_pos = -1; + int a[CRUSH_MAX_SET]; + int b[CRUSH_MAX_SET]; + int c[CRUSH_MAX_SET]; + int recurse_to_leaf; + int *w; + int wsize = 0; + int *o; + int osize; + int *tmp; + struct crush_rule *rule; + int step; + int i, j; + int numrep; + int firstn; + int rc = -1; + + BUG_ON(ruleno >= map->max_rules); + + rule = map->rules[ruleno]; + result_len = 0; + w = a; + o = b; + + /* + * determine hierarchical context of force, if any. 
note + * that this may or may not correspond to the specific types + * referenced by the crush rule. + */ + if (force >= 0) { + if (force >= map->max_devices || + map->device_parents[force] == 0) { + /*dprintk("CRUSH: forcefed device dne\n");*/ + rc = -1; /* force fed device dne */ + goto out; + } + if (!is_out(map, weight, force, x)) { + while (1) { + force_context[++force_pos] = force; + if (force >= 0) + force = map->device_parents[force]; + else + force = map->bucket_parents[-1-force]; + if (force == 0) + break; + } + } + } + + for (step = 0; step < rule->len; step++) { + firstn = 0; + switch (rule->steps[step].op) { + case CRUSH_RULE_TAKE: + w[0] = rule->steps[step].arg1; + if (force_pos >= 0) { + BUG_ON(force_context[force_pos] != w[0]); + force_pos--; + } + wsize = 1; + break; + + case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: + case CRUSH_RULE_CHOOSE_FIRSTN: + firstn = 1; + case CRUSH_RULE_CHOOSE_LEAF_INDEP: + case CRUSH_RULE_CHOOSE_INDEP: + BUG_ON(wsize == 0); + + recurse_to_leaf = + rule->steps[step].op == + CRUSH_RULE_CHOOSE_LEAF_FIRSTN || + rule->steps[step].op == + CRUSH_RULE_CHOOSE_LEAF_INDEP; + + /* reset output */ + osize = 0; + + for (i = 0; i < wsize; i++) { + /* + * see CRUSH_N, CRUSH_N_MINUS macros. + * basically, numrep <= 0 means relative to + * the provided result_max + */ + numrep = rule->steps[step].arg1; + if (numrep <= 0) { + numrep += result_max; + if (numrep <= 0) + continue; + } + j = 0; + if (osize == 0 && force_pos >= 0) { + /* skip any intermediate types */ + while (force_pos && + force_context[force_pos] < 0 && + rule->steps[step].arg2 != + map->buckets[-1 - + force_context[force_pos]]->type) + force_pos--; + o[osize] = force_context[force_pos]; + if (recurse_to_leaf) + c[osize] = force_context[0]; + j++; + force_pos--; + } + osize += crush_choose(map, + map->buckets[-1-w[i]], + weight, + x, numrep, + rule->steps[step].arg2, + o+osize, j, + firstn, + recurse_to_leaf, c+osize); + } + + if (recurse_to_leaf) + /* copy final _leaf_ values to output set */ + memcpy(o, c, osize*sizeof(*o)); + + /* swap t and w arrays */ + tmp = o; + o = w; + w = tmp; + wsize = osize; + break; + + + case CRUSH_RULE_EMIT: + for (i = 0; i < wsize && result_len < result_max; i++) { + result[result_len] = w[i]; + result_len++; + } + wsize = 0; + break; + + default: + BUG_ON(1); + } + } + rc = result_len; + +out: + return rc; +} + + diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c new file mode 100644 index 00000000..5a8009c9 --- /dev/null +++ b/net/ceph/crypto.c @@ -0,0 +1,485 @@ + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include "crypto.h" + +int ceph_crypto_key_clone(struct ceph_crypto_key *dst, + const struct ceph_crypto_key *src) +{ + memcpy(dst, src, sizeof(struct ceph_crypto_key)); + dst->key = kmalloc(src->len, GFP_NOFS); + if (!dst->key) + return -ENOMEM; + memcpy(dst->key, src->key, src->len); + return 0; +} + +int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end) +{ + if (*p + sizeof(u16) + sizeof(key->created) + + sizeof(u16) + key->len > end) + return -ERANGE; + ceph_encode_16(p, key->type); + ceph_encode_copy(p, &key->created, sizeof(key->created)); + ceph_encode_16(p, key->len); + ceph_encode_copy(p, key->key, key->len); + return 0; +} + +int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end) +{ + ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad); + key->type = ceph_decode_16(p); + ceph_decode_copy(p, &key->created, sizeof(key->created)); + key->len = ceph_decode_16(p); + 
ceph_decode_need(p, end, key->len, bad); + key->key = kmalloc(key->len, GFP_NOFS); + if (!key->key) + return -ENOMEM; + ceph_decode_copy(p, key->key, key->len); + return 0; + +bad: + dout("failed to decode crypto key\n"); + return -EINVAL; +} + +int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey) +{ + int inlen = strlen(inkey); + int blen = inlen * 3 / 4; + void *buf, *p; + int ret; + + dout("crypto_key_unarmor %s\n", inkey); + buf = kmalloc(blen, GFP_NOFS); + if (!buf) + return -ENOMEM; + blen = ceph_unarmor(buf, inkey, inkey+inlen); + if (blen < 0) { + kfree(buf); + return blen; + } + + p = buf; + ret = ceph_crypto_key_decode(key, &p, p + blen); + kfree(buf); + if (ret) + return ret; + dout("crypto_key_unarmor key %p type %d len %d\n", key, + key->type, key->len); + return 0; +} + + + +#define AES_KEY_SIZE 16 + +static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void) +{ + return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC); +} + +static const u8 *aes_iv = (u8 *)CEPH_AES_IV; + +static int ceph_aes_encrypt(const void *key, int key_len, + void *dst, size_t *dst_len, + const void *src, size_t src_len) +{ + struct scatterlist sg_in[2], sg_out[1]; + struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); + struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 }; + int ret; + void *iv; + int ivsize; + size_t zero_padding = (0x10 - (src_len & 0x0f)); + char pad[16]; + + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + memset(pad, zero_padding, zero_padding); + + *dst_len = src_len + zero_padding; + + crypto_blkcipher_setkey((void *)tfm, key, key_len); + sg_init_table(sg_in, 2); + sg_set_buf(&sg_in[0], src, src_len); + sg_set_buf(&sg_in[1], pad, zero_padding); + sg_init_table(sg_out, 1); + sg_set_buf(sg_out, dst, *dst_len); + iv = crypto_blkcipher_crt(tfm)->iv; + ivsize = crypto_blkcipher_ivsize(tfm); + + memcpy(iv, aes_iv, ivsize); + /* + print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1, + key, key_len, 1); + print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1, + src, src_len, 1); + print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1, + pad, zero_padding, 1); + */ + ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in, + src_len + zero_padding); + crypto_free_blkcipher(tfm); + if (ret < 0) + pr_err("ceph_aes_crypt failed %d\n", ret); + /* + print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1, + dst, *dst_len, 1); + */ + return 0; +} + +static int ceph_aes_encrypt2(const void *key, int key_len, void *dst, + size_t *dst_len, + const void *src1, size_t src1_len, + const void *src2, size_t src2_len) +{ + struct scatterlist sg_in[3], sg_out[1]; + struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); + struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 }; + int ret; + void *iv; + int ivsize; + size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f)); + char pad[16]; + + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + memset(pad, zero_padding, zero_padding); + + *dst_len = src1_len + src2_len + zero_padding; + + crypto_blkcipher_setkey((void *)tfm, key, key_len); + sg_init_table(sg_in, 3); + sg_set_buf(&sg_in[0], src1, src1_len); + sg_set_buf(&sg_in[1], src2, src2_len); + sg_set_buf(&sg_in[2], pad, zero_padding); + sg_init_table(sg_out, 1); + sg_set_buf(sg_out, dst, *dst_len); + iv = crypto_blkcipher_crt(tfm)->iv; + ivsize = crypto_blkcipher_ivsize(tfm); + + memcpy(iv, aes_iv, ivsize); + /* + print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1, + key, key_len, 1); + print_hex_dump(KERN_ERR, "enc src1: 
", DUMP_PREFIX_NONE, 16, 1, + src1, src1_len, 1); + print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1, + src2, src2_len, 1); + print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1, + pad, zero_padding, 1); + */ + ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in, + src1_len + src2_len + zero_padding); + crypto_free_blkcipher(tfm); + if (ret < 0) + pr_err("ceph_aes_crypt2 failed %d\n", ret); + /* + print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1, + dst, *dst_len, 1); + */ + return 0; +} + +static int ceph_aes_decrypt(const void *key, int key_len, + void *dst, size_t *dst_len, + const void *src, size_t src_len) +{ + struct scatterlist sg_in[1], sg_out[2]; + struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); + struct blkcipher_desc desc = { .tfm = tfm }; + char pad[16]; + void *iv; + int ivsize; + int ret; + int last_byte; + + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + crypto_blkcipher_setkey((void *)tfm, key, key_len); + sg_init_table(sg_in, 1); + sg_init_table(sg_out, 2); + sg_set_buf(sg_in, src, src_len); + sg_set_buf(&sg_out[0], dst, *dst_len); + sg_set_buf(&sg_out[1], pad, sizeof(pad)); + + iv = crypto_blkcipher_crt(tfm)->iv; + ivsize = crypto_blkcipher_ivsize(tfm); + + memcpy(iv, aes_iv, ivsize); + + /* + print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1, + key, key_len, 1); + print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1, + src, src_len, 1); + */ + + ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len); + crypto_free_blkcipher(tfm); + if (ret < 0) { + pr_err("ceph_aes_decrypt failed %d\n", ret); + return ret; + } + + if (src_len <= *dst_len) + last_byte = ((char *)dst)[src_len - 1]; + else + last_byte = pad[src_len - *dst_len - 1]; + if (last_byte <= 16 && src_len >= last_byte) { + *dst_len = src_len - last_byte; + } else { + pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n", + last_byte, (int)src_len); + return -EPERM; /* bad padding */ + } + /* + print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1, + dst, *dst_len, 1); + */ + return 0; +} + +static int ceph_aes_decrypt2(const void *key, int key_len, + void *dst1, size_t *dst1_len, + void *dst2, size_t *dst2_len, + const void *src, size_t src_len) +{ + struct scatterlist sg_in[1], sg_out[3]; + struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); + struct blkcipher_desc desc = { .tfm = tfm }; + char pad[16]; + void *iv; + int ivsize; + int ret; + int last_byte; + + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + sg_init_table(sg_in, 1); + sg_set_buf(sg_in, src, src_len); + sg_init_table(sg_out, 3); + sg_set_buf(&sg_out[0], dst1, *dst1_len); + sg_set_buf(&sg_out[1], dst2, *dst2_len); + sg_set_buf(&sg_out[2], pad, sizeof(pad)); + + crypto_blkcipher_setkey((void *)tfm, key, key_len); + iv = crypto_blkcipher_crt(tfm)->iv; + ivsize = crypto_blkcipher_ivsize(tfm); + + memcpy(iv, aes_iv, ivsize); + + /* + print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1, + key, key_len, 1); + print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1, + src, src_len, 1); + */ + + ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len); + crypto_free_blkcipher(tfm); + if (ret < 0) { + pr_err("ceph_aes_decrypt failed %d\n", ret); + return ret; + } + + if (src_len <= *dst1_len) + last_byte = ((char *)dst1)[src_len - 1]; + else if (src_len <= *dst1_len + *dst2_len) + last_byte = ((char *)dst2)[src_len - *dst1_len - 1]; + else + last_byte = pad[src_len - *dst1_len - *dst2_len - 1]; + if (last_byte <= 16 && src_len >= last_byte) { + src_len 
-= last_byte; + } else { + pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n", + last_byte, (int)src_len); + return -EPERM; /* bad padding */ + } + + if (src_len < *dst1_len) { + *dst1_len = src_len; + *dst2_len = 0; + } else { + *dst2_len = src_len - *dst1_len; + } + /* + print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1, + dst1, *dst1_len, 1); + print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1, + dst2, *dst2_len, 1); + */ + + return 0; +} + + +int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len, + const void *src, size_t src_len) +{ + switch (secret->type) { + case CEPH_CRYPTO_NONE: + if (*dst_len < src_len) + return -ERANGE; + memcpy(dst, src, src_len); + *dst_len = src_len; + return 0; + + case CEPH_CRYPTO_AES: + return ceph_aes_decrypt(secret->key, secret->len, dst, + dst_len, src, src_len); + + default: + return -EINVAL; + } +} + +int ceph_decrypt2(struct ceph_crypto_key *secret, + void *dst1, size_t *dst1_len, + void *dst2, size_t *dst2_len, + const void *src, size_t src_len) +{ + size_t t; + + switch (secret->type) { + case CEPH_CRYPTO_NONE: + if (*dst1_len + *dst2_len < src_len) + return -ERANGE; + t = min(*dst1_len, src_len); + memcpy(dst1, src, t); + *dst1_len = t; + src += t; + src_len -= t; + if (src_len) { + t = min(*dst2_len, src_len); + memcpy(dst2, src, t); + *dst2_len = t; + } + return 0; + + case CEPH_CRYPTO_AES: + return ceph_aes_decrypt2(secret->key, secret->len, + dst1, dst1_len, dst2, dst2_len, + src, src_len); + + default: + return -EINVAL; + } +} + +int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len, + const void *src, size_t src_len) +{ + switch (secret->type) { + case CEPH_CRYPTO_NONE: + if (*dst_len < src_len) + return -ERANGE; + memcpy(dst, src, src_len); + *dst_len = src_len; + return 0; + + case CEPH_CRYPTO_AES: + return ceph_aes_encrypt(secret->key, secret->len, dst, + dst_len, src, src_len); + + default: + return -EINVAL; + } +} + +int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len, + const void *src1, size_t src1_len, + const void *src2, size_t src2_len) +{ + switch (secret->type) { + case CEPH_CRYPTO_NONE: + if (*dst_len < src1_len + src2_len) + return -ERANGE; + memcpy(dst, src1, src1_len); + memcpy(dst + src1_len, src2, src2_len); + *dst_len = src1_len + src2_len; + return 0; + + case CEPH_CRYPTO_AES: + return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len, + src1, src1_len, src2, src2_len); + + default: + return -EINVAL; + } +} + +int ceph_key_instantiate(struct key *key, const void *data, size_t datalen) +{ + struct ceph_crypto_key *ckey; + int ret; + void *p; + + ret = -EINVAL; + if (datalen <= 0 || datalen > 32767 || !data) + goto err; + + ret = key_payload_reserve(key, datalen); + if (ret < 0) + goto err; + + ret = -ENOMEM; + ckey = kmalloc(sizeof(*ckey), GFP_KERNEL); + if (!ckey) + goto err; + + /* TODO ceph_crypto_key_decode should really take const input */ + p = (void*)data; + ret = ceph_crypto_key_decode(ckey, &p, (char*)data+datalen); + if (ret < 0) + goto err_ckey; + + key->payload.data = ckey; + return 0; + +err_ckey: + kfree(ckey); +err: + return ret; +} + +int ceph_key_match(const struct key *key, const void *description) +{ + return strcmp(key->description, description) == 0; +} + +void ceph_key_destroy(struct key *key) { + struct ceph_crypto_key *ckey = key->payload.data; + + ceph_crypto_key_destroy(ckey); +} + +struct key_type key_type_ceph = { + .name = "ceph", + .instantiate = ceph_key_instantiate, + .match = 
ceph_key_match, + .destroy = ceph_key_destroy, +}; + +int ceph_crypto_init(void) { + return register_key_type(&key_type_ceph); +} + +void ceph_crypto_shutdown(void) { + unregister_key_type(&key_type_ceph); +} diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h new file mode 100644 index 00000000..1919d155 --- /dev/null +++ b/net/ceph/crypto.h @@ -0,0 +1,52 @@ +#ifndef _FS_CEPH_CRYPTO_H +#define _FS_CEPH_CRYPTO_H + +#include +#include + +/* + * cryptographic secret + */ +struct ceph_crypto_key { + int type; + struct ceph_timespec created; + int len; + void *key; +}; + +static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key) +{ + kfree(key->key); +} + +extern int ceph_crypto_key_clone(struct ceph_crypto_key *dst, + const struct ceph_crypto_key *src); +extern int ceph_crypto_key_encode(struct ceph_crypto_key *key, + void **p, void *end); +extern int ceph_crypto_key_decode(struct ceph_crypto_key *key, + void **p, void *end); +extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in); + +/* crypto.c */ +extern int ceph_decrypt(struct ceph_crypto_key *secret, + void *dst, size_t *dst_len, + const void *src, size_t src_len); +extern int ceph_encrypt(struct ceph_crypto_key *secret, + void *dst, size_t *dst_len, + const void *src, size_t src_len); +extern int ceph_decrypt2(struct ceph_crypto_key *secret, + void *dst1, size_t *dst1_len, + void *dst2, size_t *dst2_len, + const void *src, size_t src_len); +extern int ceph_encrypt2(struct ceph_crypto_key *secret, + void *dst, size_t *dst_len, + const void *src1, size_t src1_len, + const void *src2, size_t src2_len); +extern int ceph_crypto_init(void); +extern void ceph_crypto_shutdown(void); + +/* armor.c */ +extern int ceph_armor(char *dst, const char *src, const char *end); +extern int ceph_unarmor(char *dst, const char *src, const char *end); + +#endif diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c new file mode 100644 index 00000000..27d4ea31 --- /dev/null +++ b/net/ceph/debugfs.c @@ -0,0 +1,267 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef CONFIG_DEBUG_FS + +/* + * Implement /sys/kernel/debug/ceph fun + * + * /sys/kernel/debug/ceph/client* - an instance of the ceph client + * .../osdmap - current osdmap + * .../monmap - current monmap + * .../osdc - active osd requests + * .../monc - mon client state + * .../dentry_lru - dump contents of dentry lru + * .../caps - expose cap (reservation) stats + * .../bdi - symlink to ../../bdi/something + */ + +static struct dentry *ceph_debugfs_dir; + +static int monmap_show(struct seq_file *s, void *p) +{ + int i; + struct ceph_client *client = s->private; + + if (client->monc.monmap == NULL) + return 0; + + seq_printf(s, "epoch %d\n", client->monc.monmap->epoch); + for (i = 0; i < client->monc.monmap->num_mon; i++) { + struct ceph_entity_inst *inst = + &client->monc.monmap->mon_inst[i]; + + seq_printf(s, "\t%s%lld\t%s\n", + ENTITY_NAME(inst->name), + ceph_pr_addr(&inst->addr.in_addr)); + } + return 0; +} + +static int osdmap_show(struct seq_file *s, void *p) +{ + int i; + struct ceph_client *client = s->private; + struct rb_node *n; + + if (client->osdc.osdmap == NULL) + return 0; + seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch); + seq_printf(s, "flags%s%s\n", + (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ? + " NEARFULL" : "", + (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? 
+ " FULL" : ""); + for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { + struct ceph_pg_pool_info *pool = + rb_entry(n, struct ceph_pg_pool_info, node); + seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n", + pool->id, pool->v.pg_num, pool->pg_num_mask, + pool->v.lpg_num, pool->lpg_num_mask); + } + for (i = 0; i < client->osdc.osdmap->max_osd; i++) { + struct ceph_entity_addr *addr = + &client->osdc.osdmap->osd_addr[i]; + int state = client->osdc.osdmap->osd_state[i]; + char sb[64]; + + seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n", + i, ceph_pr_addr(&addr->in_addr), + ((client->osdc.osdmap->osd_weight[i]*100) >> 16), + ceph_osdmap_state_str(sb, sizeof(sb), state)); + } + return 0; +} + +static int monc_show(struct seq_file *s, void *p) +{ + struct ceph_client *client = s->private; + struct ceph_mon_generic_request *req; + struct ceph_mon_client *monc = &client->monc; + struct rb_node *rp; + + mutex_lock(&monc->mutex); + + if (monc->have_mdsmap) + seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap); + if (monc->have_osdmap) + seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap); + if (monc->want_next_osdmap) + seq_printf(s, "want next osdmap\n"); + + for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { + __u16 op; + req = rb_entry(rp, struct ceph_mon_generic_request, node); + op = le16_to_cpu(req->request->hdr.type); + if (op == CEPH_MSG_STATFS) + seq_printf(s, "%lld statfs\n", req->tid); + else + seq_printf(s, "%lld unknown\n", req->tid); + } + + mutex_unlock(&monc->mutex); + return 0; +} + +static int osdc_show(struct seq_file *s, void *pp) +{ + struct ceph_client *client = s->private; + struct ceph_osd_client *osdc = &client->osdc; + struct rb_node *p; + + mutex_lock(&osdc->request_mutex); + for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { + struct ceph_osd_request *req; + struct ceph_osd_request_head *head; + struct ceph_osd_op *op; + int num_ops; + int opcode, olen; + int i; + + req = rb_entry(p, struct ceph_osd_request, r_node); + + seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid, + req->r_osd ? 
req->r_osd->o_osd : -1, + le32_to_cpu(req->r_pgid.pool), + le16_to_cpu(req->r_pgid.ps)); + + head = req->r_request->front.iov_base; + op = (void *)(head + 1); + + num_ops = le16_to_cpu(head->num_ops); + olen = le32_to_cpu(head->object_len); + seq_printf(s, "%.*s", olen, + (const char *)(head->ops + num_ops)); + + if (req->r_reassert_version.epoch) + seq_printf(s, "\t%u'%llu", + (unsigned)le32_to_cpu(req->r_reassert_version.epoch), + le64_to_cpu(req->r_reassert_version.version)); + else + seq_printf(s, "\t"); + + for (i = 0; i < num_ops; i++) { + opcode = le16_to_cpu(op->op); + seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); + op++; + } + + seq_printf(s, "\n"); + } + mutex_unlock(&osdc->request_mutex); + return 0; +} + +CEPH_DEFINE_SHOW_FUNC(monmap_show) +CEPH_DEFINE_SHOW_FUNC(osdmap_show) +CEPH_DEFINE_SHOW_FUNC(monc_show) +CEPH_DEFINE_SHOW_FUNC(osdc_show) + +int ceph_debugfs_init(void) +{ + ceph_debugfs_dir = debugfs_create_dir("ceph", NULL); + if (!ceph_debugfs_dir) + return -ENOMEM; + return 0; +} + +void ceph_debugfs_cleanup(void) +{ + debugfs_remove(ceph_debugfs_dir); +} + +int ceph_debugfs_client_init(struct ceph_client *client) +{ + int ret = -ENOMEM; + char name[80]; + + snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid, + client->monc.auth->global_id); + + client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); + if (!client->debugfs_dir) + goto out; + + client->monc.debugfs_file = debugfs_create_file("monc", + 0600, + client->debugfs_dir, + client, + &monc_show_fops); + if (!client->monc.debugfs_file) + goto out; + + client->osdc.debugfs_file = debugfs_create_file("osdc", + 0600, + client->debugfs_dir, + client, + &osdc_show_fops); + if (!client->osdc.debugfs_file) + goto out; + + client->debugfs_monmap = debugfs_create_file("monmap", + 0600, + client->debugfs_dir, + client, + &monmap_show_fops); + if (!client->debugfs_monmap) + goto out; + + client->debugfs_osdmap = debugfs_create_file("osdmap", + 0600, + client->debugfs_dir, + client, + &osdmap_show_fops); + if (!client->debugfs_osdmap) + goto out; + + return 0; + +out: + ceph_debugfs_client_cleanup(client); + return ret; +} + +void ceph_debugfs_client_cleanup(struct ceph_client *client) +{ + debugfs_remove(client->debugfs_osdmap); + debugfs_remove(client->debugfs_monmap); + debugfs_remove(client->osdc.debugfs_file); + debugfs_remove(client->monc.debugfs_file); + debugfs_remove(client->debugfs_dir); +} + +#else /* CONFIG_DEBUG_FS */ + +int ceph_debugfs_init(void) +{ + return 0; +} + +void ceph_debugfs_cleanup(void) +{ +} + +int ceph_debugfs_client_init(struct ceph_client *client) +{ + return 0; +} + +void ceph_debugfs_client_cleanup(struct ceph_client *client) +{ +} + +#endif /* CONFIG_DEBUG_FS */ + +EXPORT_SYMBOL(ceph_debugfs_init); +EXPORT_SYMBOL(ceph_debugfs_cleanup); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c new file mode 100644 index 00000000..78b55f49 --- /dev/null +++ b/net/ceph/messenger.c @@ -0,0 +1,2489 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * Ceph uses the messenger to exchange ceph_msg messages with other + * hosts in the system. The messenger provides ordered and reliable + * delivery. We tolerate TCP disconnects by reconnecting (with + * exponential backoff) in the case of a fault (disconnection, bad + * crc, protocol error). Acks allow sent messages to be discarded by + * the sender. 
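CEPH_DEFINE_SHOW_FUNC presumably wraps each *_show() routine above in the usual seq_file boilerplate so debugfs_create_file() can expose it. A sketch of what such an expansion typically looks like (kernel-context code; the wrapper and fops names are illustrative assumptions, not taken from the patch):

/* sketch: single_open() wrapper for one of the show routines above;
 * debugfs stores the ceph_client pointer in i_private, which becomes
 * the seq_file's private data. */
#include <linux/seq_file.h>
#include <linux/fs.h>

static int monc_show(struct seq_file *s, void *p);	/* defined earlier */

static int monc_show_open(struct inode *inode, struct file *file)
{
	return single_open(file, monc_show, inode->i_private);
}

static const struct file_operations monc_show_fops = {
	.open		= monc_show_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};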
+ */ + +/* static tag bytes (protocol control messages) */ +static char tag_msg = CEPH_MSGR_TAG_MSG; +static char tag_ack = CEPH_MSGR_TAG_ACK; +static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; + +#ifdef CONFIG_LOCKDEP +static struct lock_class_key socket_class; +#endif + + +static void queue_con(struct ceph_connection *con); +static void con_work(struct work_struct *); +static void ceph_fault(struct ceph_connection *con); + +/* + * nicely render a sockaddr as a string. + */ +#define MAX_ADDR_STR 20 +#define MAX_ADDR_STR_LEN 60 +static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN]; +static DEFINE_SPINLOCK(addr_str_lock); +static int last_addr_str; + +const char *ceph_pr_addr(const struct sockaddr_storage *ss) +{ + int i; + char *s; + struct sockaddr_in *in4 = (void *)ss; + struct sockaddr_in6 *in6 = (void *)ss; + + spin_lock(&addr_str_lock); + i = last_addr_str++; + if (last_addr_str == MAX_ADDR_STR) + last_addr_str = 0; + spin_unlock(&addr_str_lock); + s = addr_str[i]; + + switch (ss->ss_family) { + case AF_INET: + snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr, + (unsigned int)ntohs(in4->sin_port)); + break; + + case AF_INET6: + snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr, + (unsigned int)ntohs(in6->sin6_port)); + break; + + default: + snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %d)", + (int)ss->ss_family); + } + + return s; +} +EXPORT_SYMBOL(ceph_pr_addr); + +static void encode_my_addr(struct ceph_messenger *msgr) +{ + memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr)); + ceph_encode_addr(&msgr->my_enc_addr); +} + +/* + * work queue for all reading and writing to/from the socket. + */ +struct workqueue_struct *ceph_msgr_wq; + +int ceph_msgr_init(void) +{ + ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0); + if (!ceph_msgr_wq) { + pr_err("msgr_init failed to create workqueue\n"); + return -ENOMEM; + } + return 0; +} +EXPORT_SYMBOL(ceph_msgr_init); + +void ceph_msgr_exit(void) +{ + destroy_workqueue(ceph_msgr_wq); +} +EXPORT_SYMBOL(ceph_msgr_exit); + +void ceph_msgr_flush(void) +{ + flush_workqueue(ceph_msgr_wq); +} +EXPORT_SYMBOL(ceph_msgr_flush); + + +/* + * socket callback functions + */ + +/* data available on socket, or listen socket received a connect */ +static void ceph_data_ready(struct sock *sk, int count_unused) +{ + struct ceph_connection *con = + (struct ceph_connection *)sk->sk_user_data; + if (sk->sk_state != TCP_CLOSE_WAIT) { + dout("ceph_data_ready on %p state = %lu, queueing work\n", + con, con->state); + queue_con(con); + } +} + +/* socket has buffer space for writing */ +static void ceph_write_space(struct sock *sk) +{ + struct ceph_connection *con = + (struct ceph_connection *)sk->sk_user_data; + + /* only queue to workqueue if there is data we want to write. 
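ceph_pr_addr() above hands back pointers into a small ring of static buffers, rotated under a spinlock, so several recently formatted addresses stay printable without the caller supplying storage. A userspace sketch of the same round-robin buffer idea (locking omitted, formatting simplified):

/* sketch: round-robin static buffers in the style of ceph_pr_addr();
 * up to NBUF results remain valid at once. */
#include <stdio.h>

#define NBUF 4
#define BUFLEN 32

static char bufs[NBUF][BUFLEN];
static int next_buf;

static const char *fmt_int(int v)
{
	char *s = bufs[next_buf];

	next_buf = (next_buf + 1) % NBUF;
	snprintf(s, BUFLEN, "value=%d", v);
	return s;
}

int main(void)
{
	printf("%s %s\n", fmt_int(1), fmt_int(2));
	return 0;
}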
*/ + if (test_bit(WRITE_PENDING, &con->state)) { + dout("ceph_write_space %p queueing write work\n", con); + queue_con(con); + } else { + dout("ceph_write_space %p nothing to write\n", con); + } + + /* since we have our own write_space, clear the SOCK_NOSPACE flag */ + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +} + +/* socket's state has changed */ +static void ceph_state_change(struct sock *sk) +{ + struct ceph_connection *con = + (struct ceph_connection *)sk->sk_user_data; + + dout("ceph_state_change %p state = %lu sk_state = %u\n", + con, con->state, sk->sk_state); + + if (test_bit(CLOSED, &con->state)) + return; + + switch (sk->sk_state) { + case TCP_CLOSE: + dout("ceph_state_change TCP_CLOSE\n"); + case TCP_CLOSE_WAIT: + dout("ceph_state_change TCP_CLOSE_WAIT\n"); + if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) { + if (test_bit(CONNECTING, &con->state)) + con->error_msg = "connection failed"; + else + con->error_msg = "socket closed"; + queue_con(con); + } + break; + case TCP_ESTABLISHED: + dout("ceph_state_change TCP_ESTABLISHED\n"); + queue_con(con); + break; + } +} + +/* + * set up socket callbacks + */ +static void set_sock_callbacks(struct socket *sock, + struct ceph_connection *con) +{ + struct sock *sk = sock->sk; + sk->sk_user_data = (void *)con; + sk->sk_data_ready = ceph_data_ready; + sk->sk_write_space = ceph_write_space; + sk->sk_state_change = ceph_state_change; +} + + +/* + * socket helpers + */ + +/* + * initiate connection to a remote socket. + */ +static struct socket *ceph_tcp_connect(struct ceph_connection *con) +{ + struct sockaddr_storage *paddr = &con->peer_addr.in_addr; + struct socket *sock; + int ret; + + BUG_ON(con->sock); + ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM, + IPPROTO_TCP, &sock); + if (ret) + return ERR_PTR(ret); + con->sock = sock; + sock->sk->sk_allocation = GFP_NOFS; + +#ifdef CONFIG_LOCKDEP + lockdep_set_class(&sock->sk->sk_lock, &socket_class); +#endif + + set_sock_callbacks(sock, con); + + dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr)); + + ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), + O_NONBLOCK); + if (ret == -EINPROGRESS) { + dout("connect %s EINPROGRESS sk_state = %u\n", + ceph_pr_addr(&con->peer_addr.in_addr), + sock->sk->sk_state); + ret = 0; + } + if (ret < 0) { + pr_err("connect %s error %d\n", + ceph_pr_addr(&con->peer_addr.in_addr), ret); + sock_release(sock); + con->sock = NULL; + con->error_msg = "connect error"; + } + + if (ret < 0) + return ERR_PTR(ret); + return sock; +} + +static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) +{ + struct kvec iov = {buf, len}; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; + + r = kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags); + if (r == -EAGAIN) + r = 0; + return r; +} + +/* + * write something. @more is true if caller will be sending more data + * shortly. + */ +static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, + size_t kvlen, size_t len, int more) +{ + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; + + if (more) + msg.msg_flags |= MSG_MORE; + else + msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ + + r = kernel_sendmsg(sock, &msg, iov, kvlen, len); + if (r == -EAGAIN) + r = 0; + return r; +} + + +/* + * Shutdown/close the socket for the given connection. 
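ceph_tcp_sendmsg() and ceph_tcp_recvmsg() above translate -EAGAIN into a return of 0, so the worker treats a full (or empty) socket buffer as "made no progress, requeue" rather than an error, and MSG_MORE hints that more data follows shortly. The same convention restated against the userspace send(2) API, as a compile-only sketch:

/* sketch: map EAGAIN/EWOULDBLOCK to "wrote nothing", everything else is
 * either a byte count or a real error. */
#include <errno.h>
#include <sys/types.h>
#include <sys/socket.h>

ssize_t try_send(int fd, const void *buf, size_t len, int more)
{
	int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : 0);
	ssize_t r = send(fd, buf, len, flags);

	if (r < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
		return 0;	/* socket buffer full: requeue, not an error */
	return r;		/* bytes written, or -1 on a real error */
}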
+ */ +static int con_close_socket(struct ceph_connection *con) +{ + int rc; + + dout("con_close_socket on %p sock %p\n", con, con->sock); + if (!con->sock) + return 0; + set_bit(SOCK_CLOSED, &con->state); + rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); + sock_release(con->sock); + con->sock = NULL; + clear_bit(SOCK_CLOSED, &con->state); + return rc; +} + +/* + * Reset a connection. Discard all incoming and outgoing messages + * and clear *_seq state. + */ +static void ceph_msg_remove(struct ceph_msg *msg) +{ + list_del_init(&msg->list_head); + ceph_msg_put(msg); +} +static void ceph_msg_remove_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct ceph_msg *msg = list_first_entry(head, struct ceph_msg, + list_head); + ceph_msg_remove(msg); + } +} + +static void reset_connection(struct ceph_connection *con) +{ + /* reset connection, out_queue, msg_ and connect_seq */ + /* discard existing out_queue and msg_seq */ + ceph_msg_remove_list(&con->out_queue); + ceph_msg_remove_list(&con->out_sent); + + if (con->in_msg) { + ceph_msg_put(con->in_msg); + con->in_msg = NULL; + } + + con->connect_seq = 0; + con->out_seq = 0; + if (con->out_msg) { + ceph_msg_put(con->out_msg); + con->out_msg = NULL; + } + con->in_seq = 0; + con->in_seq_acked = 0; +} + +/* + * mark a peer down. drop any open connections. + */ +void ceph_con_close(struct ceph_connection *con) +{ + dout("con_close %p peer %s\n", con, + ceph_pr_addr(&con->peer_addr.in_addr)); + set_bit(CLOSED, &con->state); /* in case there's queued work */ + clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */ + clear_bit(LOSSYTX, &con->state); /* so we retry next connect */ + clear_bit(KEEPALIVE_PENDING, &con->state); + clear_bit(WRITE_PENDING, &con->state); + mutex_lock(&con->mutex); + reset_connection(con); + con->peer_global_seq = 0; + cancel_delayed_work(&con->work); + mutex_unlock(&con->mutex); + queue_con(con); +} +EXPORT_SYMBOL(ceph_con_close); + +/* + * Reopen a closed connection, with a new peer address. + */ +void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr) +{ + dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr)); + set_bit(OPENING, &con->state); + clear_bit(CLOSED, &con->state); + memcpy(&con->peer_addr, addr, sizeof(*addr)); + con->delay = 0; /* reset backoff memory */ + queue_con(con); +} +EXPORT_SYMBOL(ceph_con_open); + +/* + * return true if this connection ever successfully opened + */ +bool ceph_con_opened(struct ceph_connection *con) +{ + return con->connect_seq > 0; +} + +/* + * generic get/put + */ +struct ceph_connection *ceph_con_get(struct ceph_connection *con) +{ + dout("con_get %p nref = %d -> %d\n", con, + atomic_read(&con->nref), atomic_read(&con->nref) + 1); + if (atomic_inc_not_zero(&con->nref)) + return con; + return NULL; +} + +void ceph_con_put(struct ceph_connection *con) +{ + dout("con_put %p nref = %d -> %d\n", con, + atomic_read(&con->nref), atomic_read(&con->nref) - 1); + BUG_ON(atomic_read(&con->nref) == 0); + if (atomic_dec_and_test(&con->nref)) { + BUG_ON(con->sock); + kfree(con); + } +} + +/* + * initialize a new connection. 
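+ *
+ * A caller typically embeds the ceph_connection in its own session
+ * structure and drives it roughly like this (illustrative sketch
+ * only; my_con_ops is a made-up name for the caller's ops table,
+ * which must at least provide get, put and dispatch):
+ *
+ *	ceph_con_init(msgr, con);
+ *	con->ops = &my_con_ops;
+ *	ceph_con_open(con, &peer_addr);   - queue the initial connect
+ *	ceph_con_send(con, msg);          - queue an outgoing message
+ *	ceph_con_close(con);              - mark the peer down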
+ */ +void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con) +{ + dout("con_init %p\n", con); + memset(con, 0, sizeof(*con)); + atomic_set(&con->nref, 1); + con->msgr = msgr; + mutex_init(&con->mutex); + INIT_LIST_HEAD(&con->out_queue); + INIT_LIST_HEAD(&con->out_sent); + INIT_DELAYED_WORK(&con->work, con_work); +} +EXPORT_SYMBOL(ceph_con_init); + + +/* + * We maintain a global counter to order connection attempts. Get + * a unique seq greater than @gt. + */ +static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) +{ + u32 ret; + + spin_lock(&msgr->global_seq_lock); + if (msgr->global_seq < gt) + msgr->global_seq = gt; + ret = ++msgr->global_seq; + spin_unlock(&msgr->global_seq_lock); + return ret; +} + + +/* + * Prepare footer for currently outgoing message, and finish things + * off. Assumes out_kvec* are already valid.. we just add on to the end. + */ +static void prepare_write_message_footer(struct ceph_connection *con, int v) +{ + struct ceph_msg *m = con->out_msg; + + dout("prepare_write_message_footer %p\n", con); + con->out_kvec_is_msg = true; + con->out_kvec[v].iov_base = &m->footer; + con->out_kvec[v].iov_len = sizeof(m->footer); + con->out_kvec_bytes += sizeof(m->footer); + con->out_kvec_left++; + con->out_more = m->more_to_follow; + con->out_msg_done = true; +} + +/* + * Prepare headers for the next outgoing message. + */ +static void prepare_write_message(struct ceph_connection *con) +{ + struct ceph_msg *m; + int v = 0; + + con->out_kvec_bytes = 0; + con->out_kvec_is_msg = true; + con->out_msg_done = false; + + /* Sneak an ack in there first? If we can get it into the same + * TCP packet that's a good thing. */ + if (con->in_seq > con->in_seq_acked) { + con->in_seq_acked = con->in_seq; + con->out_kvec[v].iov_base = &tag_ack; + con->out_kvec[v++].iov_len = 1; + con->out_temp_ack = cpu_to_le64(con->in_seq_acked); + con->out_kvec[v].iov_base = &con->out_temp_ack; + con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack); + con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack); + } + + m = list_first_entry(&con->out_queue, + struct ceph_msg, list_head); + con->out_msg = m; + if (test_bit(LOSSYTX, &con->state)) { + list_del_init(&m->list_head); + } else { + /* put message on sent list */ + ceph_msg_get(m); + list_move_tail(&m->list_head, &con->out_sent); + } + + /* + * only assign outgoing seq # if we haven't sent this message + * yet. if it is requeued, resend with it's original seq. + */ + if (m->needs_out_seq) { + m->hdr.seq = cpu_to_le64(++con->out_seq); + m->needs_out_seq = false; + } + + dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n", + m, con->out_seq, le16_to_cpu(m->hdr.type), + le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), + le32_to_cpu(m->hdr.data_len), + m->nr_pages); + BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); + + /* tag + hdr + front + middle */ + con->out_kvec[v].iov_base = &tag_msg; + con->out_kvec[v++].iov_len = 1; + con->out_kvec[v].iov_base = &m->hdr; + con->out_kvec[v++].iov_len = sizeof(m->hdr); + con->out_kvec[v++] = m->front; + if (m->middle) + con->out_kvec[v++] = m->middle->vec; + con->out_kvec_left = v; + con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len + + (m->middle ? 
m->middle->vec.iov_len : 0); + con->out_kvec_cur = con->out_kvec; + + /* fill in crc (except data pages), footer */ + con->out_msg->hdr.crc = + cpu_to_le32(crc32c(0, (void *)&m->hdr, + sizeof(m->hdr) - sizeof(m->hdr.crc))); + con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE; + con->out_msg->footer.front_crc = + cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len)); + if (m->middle) + con->out_msg->footer.middle_crc = + cpu_to_le32(crc32c(0, m->middle->vec.iov_base, + m->middle->vec.iov_len)); + else + con->out_msg->footer.middle_crc = 0; + con->out_msg->footer.data_crc = 0; + dout("prepare_write_message front_crc %u data_crc %u\n", + le32_to_cpu(con->out_msg->footer.front_crc), + le32_to_cpu(con->out_msg->footer.middle_crc)); + + /* is there a data payload? */ + if (le32_to_cpu(m->hdr.data_len) > 0) { + /* initialize page iterator */ + con->out_msg_pos.page = 0; + if (m->pages) + con->out_msg_pos.page_pos = m->page_alignment; + else + con->out_msg_pos.page_pos = 0; + con->out_msg_pos.data_pos = 0; + con->out_msg_pos.did_page_crc = 0; + con->out_more = 1; /* data + footer will follow */ + } else { + /* no, queue up footer too and be done */ + prepare_write_message_footer(con, v); + } + + set_bit(WRITE_PENDING, &con->state); +} + +/* + * Prepare an ack. + */ +static void prepare_write_ack(struct ceph_connection *con) +{ + dout("prepare_write_ack %p %llu -> %llu\n", con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + con->out_kvec[0].iov_base = &tag_ack; + con->out_kvec[0].iov_len = 1; + con->out_temp_ack = cpu_to_le64(con->in_seq_acked); + con->out_kvec[1].iov_base = &con->out_temp_ack; + con->out_kvec[1].iov_len = sizeof(con->out_temp_ack); + con->out_kvec_left = 2; + con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack); + con->out_kvec_cur = con->out_kvec; + con->out_more = 1; /* more will follow.. eventually.. */ + set_bit(WRITE_PENDING, &con->state); +} + +/* + * Prepare to write keepalive byte. + */ +static void prepare_write_keepalive(struct ceph_connection *con) +{ + dout("prepare_write_keepalive %p\n", con); + con->out_kvec[0].iov_base = &tag_keepalive; + con->out_kvec[0].iov_len = 1; + con->out_kvec_left = 1; + con->out_kvec_bytes = 1; + con->out_kvec_cur = con->out_kvec; + set_bit(WRITE_PENDING, &con->state); +} + +/* + * Connection negotiation. + */ + +static int prepare_connect_authorizer(struct ceph_connection *con) +{ + void *auth_buf; + int auth_len = 0; + int auth_protocol = 0; + + mutex_unlock(&con->mutex); + if (con->ops->get_authorizer) + con->ops->get_authorizer(con, &auth_buf, &auth_len, + &auth_protocol, &con->auth_reply_buf, + &con->auth_reply_buf_len, + con->auth_retry); + mutex_lock(&con->mutex); + + if (test_bit(CLOSED, &con->state) || + test_bit(OPENING, &con->state)) + return -EAGAIN; + + con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol); + con->out_connect.authorizer_len = cpu_to_le32(auth_len); + + if (auth_len) { + con->out_kvec[con->out_kvec_left].iov_base = auth_buf; + con->out_kvec[con->out_kvec_left].iov_len = auth_len; + con->out_kvec_left++; + con->out_kvec_bytes += auth_len; + } + return 0; +} + +/* + * We connected to a peer and are saying hello. 
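+ * The banner is the fixed CEPH_BANNER string followed by our own
+ * encoded address; the peer answers with its banner, its address and
+ * the address it sees for us, which read_partial_banner() collects.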
+ */ +static void prepare_write_banner(struct ceph_messenger *msgr, + struct ceph_connection *con) +{ + int len = strlen(CEPH_BANNER); + + con->out_kvec[0].iov_base = CEPH_BANNER; + con->out_kvec[0].iov_len = len; + con->out_kvec[1].iov_base = &msgr->my_enc_addr; + con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr); + con->out_kvec_left = 2; + con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr); + con->out_kvec_cur = con->out_kvec; + con->out_more = 0; + set_bit(WRITE_PENDING, &con->state); +} + +static int prepare_write_connect(struct ceph_messenger *msgr, + struct ceph_connection *con, + int after_banner) +{ + unsigned global_seq = get_global_seq(con->msgr, 0); + int proto; + + switch (con->peer_name.type) { + case CEPH_ENTITY_TYPE_MON: + proto = CEPH_MONC_PROTOCOL; + break; + case CEPH_ENTITY_TYPE_OSD: + proto = CEPH_OSDC_PROTOCOL; + break; + case CEPH_ENTITY_TYPE_MDS: + proto = CEPH_MDSC_PROTOCOL; + break; + default: + BUG(); + } + + dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, + con->connect_seq, global_seq, proto); + + con->out_connect.features = cpu_to_le64(msgr->supported_features); + con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); + con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); + con->out_connect.global_seq = cpu_to_le32(global_seq); + con->out_connect.protocol_version = cpu_to_le32(proto); + con->out_connect.flags = 0; + + if (!after_banner) { + con->out_kvec_left = 0; + con->out_kvec_bytes = 0; + } + con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect; + con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect); + con->out_kvec_left++; + con->out_kvec_bytes += sizeof(con->out_connect); + con->out_kvec_cur = con->out_kvec; + con->out_more = 0; + set_bit(WRITE_PENDING, &con->state); + + return prepare_connect_authorizer(con); +} + + +/* + * write as much of pending kvecs to the socket as we can. + * 1 -> done + * 0 -> socket full, but more to do + * <0 -> error + */ +static int write_partial_kvec(struct ceph_connection *con) +{ + int ret; + + dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes); + while (con->out_kvec_bytes > 0) { + ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur, + con->out_kvec_left, con->out_kvec_bytes, + con->out_more); + if (ret <= 0) + goto out; + con->out_kvec_bytes -= ret; + if (con->out_kvec_bytes == 0) + break; /* done */ + while (ret > 0) { + if (ret >= con->out_kvec_cur->iov_len) { + ret -= con->out_kvec_cur->iov_len; + con->out_kvec_cur++; + con->out_kvec_left--; + } else { + con->out_kvec_cur->iov_len -= ret; + con->out_kvec_cur->iov_base += ret; + ret = 0; + break; + } + } + } + con->out_kvec_left = 0; + con->out_kvec_is_msg = false; + ret = 1; +out: + dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, + con->out_kvec_bytes, con->out_kvec_left, ret); + return ret; /* done! */ +} + +#ifdef CONFIG_BLOCK +static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) +{ + if (!bio) { + *iter = NULL; + *seg = 0; + return; + } + *iter = bio; + *seg = bio->bi_idx; +} + +static void iter_bio_next(struct bio **bio_iter, int *seg) +{ + if (*bio_iter == NULL) + return; + + BUG_ON(*seg >= (*bio_iter)->bi_vcnt); + + (*seg)++; + if (*seg == (*bio_iter)->bi_vcnt) + init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); +} +#endif + +/* + * Write as much message data payload as we can. If we finish, queue + * up the footer. + * 1 -> done, footer is now queued in out_kvec[]. 
+ * 0 -> socket full, but more to do + * <0 -> error + */ +static int write_partial_msg_pages(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + unsigned data_len = le32_to_cpu(msg->hdr.data_len); + size_t len; + int crc = con->msgr->nocrc; + int ret; + int total_max_write; + int in_trail = 0; + size_t trail_len = (msg->trail ? msg->trail->length : 0); + + dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", + con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages, + con->out_msg_pos.page_pos); + +#ifdef CONFIG_BLOCK + if (msg->bio && !msg->bio_iter) + init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); +#endif + + while (data_len > con->out_msg_pos.data_pos) { + struct page *page = NULL; + void *kaddr = NULL; + int max_write = PAGE_SIZE; + int page_shift = 0; + + total_max_write = data_len - trail_len - + con->out_msg_pos.data_pos; + + /* + * if we are calculating the data crc (the default), we need + * to map the page. if our pages[] has been revoked, use the + * zero page. + */ + + /* have we reached the trail part of the data? */ + if (con->out_msg_pos.data_pos >= data_len - trail_len) { + in_trail = 1; + + total_max_write = data_len - con->out_msg_pos.data_pos; + + page = list_first_entry(&msg->trail->head, + struct page, lru); + if (crc) + kaddr = kmap(page); + max_write = PAGE_SIZE; + } else if (msg->pages) { + page = msg->pages[con->out_msg_pos.page]; + if (crc) + kaddr = kmap(page); + } else if (msg->pagelist) { + page = list_first_entry(&msg->pagelist->head, + struct page, lru); + if (crc) + kaddr = kmap(page); +#ifdef CONFIG_BLOCK + } else if (msg->bio) { + struct bio_vec *bv; + + bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); + page = bv->bv_page; + page_shift = bv->bv_offset; + if (crc) + kaddr = kmap(page) + page_shift; + max_write = bv->bv_len; +#endif + } else { + page = con->msgr->zero_page; + if (crc) + kaddr = page_address(con->msgr->zero_page); + } + len = min_t(int, max_write - con->out_msg_pos.page_pos, + total_max_write); + + if (crc && !con->out_msg_pos.did_page_crc) { + void *base = kaddr + con->out_msg_pos.page_pos; + u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); + + BUG_ON(kaddr == NULL); + con->out_msg->footer.data_crc = + cpu_to_le32(crc32c(tmpcrc, base, len)); + con->out_msg_pos.did_page_crc = 1; + } + ret = kernel_sendpage(con->sock, page, + con->out_msg_pos.page_pos + page_shift, + len, + MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_MORE); + + if (crc && + (msg->pages || msg->pagelist || msg->bio || in_trail)) + kunmap(page); + + if (ret == -EAGAIN) + ret = 0; + if (ret <= 0) + goto out; + + con->out_msg_pos.data_pos += ret; + con->out_msg_pos.page_pos += ret; + if (ret == len) { + con->out_msg_pos.page_pos = 0; + con->out_msg_pos.page++; + con->out_msg_pos.did_page_crc = 0; + if (in_trail) + list_move_tail(&page->lru, + &msg->trail->head); + else if (msg->pagelist) + list_move_tail(&page->lru, + &msg->pagelist->head); +#ifdef CONFIG_BLOCK + else if (msg->bio) + iter_bio_next(&msg->bio_iter, &msg->bio_seg); +#endif + } + } + + dout("write_partial_msg_pages %p msg %p done\n", con, msg); + + /* prepare and queue up footer, too */ + if (!crc) + con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; + con->out_kvec_bytes = 0; + con->out_kvec_left = 0; + con->out_kvec_cur = con->out_kvec; + prepare_write_message_footer(con, 0); + ret = 1; +out: + return ret; +} + +/* + * write some zeros + */ +static int write_partial_skip(struct ceph_connection *con) +{ + int ret; + + while (con->out_skip > 0) { + struct kvec iov 
= { + .iov_base = page_address(con->msgr->zero_page), + .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE) + }; + + ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1); + if (ret <= 0) + goto out; + con->out_skip -= ret; + } + ret = 1; +out: + return ret; +} + +/* + * Prepare to read connection handshake, or an ack. + */ +static void prepare_read_banner(struct ceph_connection *con) +{ + dout("prepare_read_banner %p\n", con); + con->in_base_pos = 0; +} + +static void prepare_read_connect(struct ceph_connection *con) +{ + dout("prepare_read_connect %p\n", con); + con->in_base_pos = 0; +} + +static void prepare_read_ack(struct ceph_connection *con) +{ + dout("prepare_read_ack %p\n", con); + con->in_base_pos = 0; +} + +static void prepare_read_tag(struct ceph_connection *con) +{ + dout("prepare_read_tag %p\n", con); + con->in_base_pos = 0; + con->in_tag = CEPH_MSGR_TAG_READY; +} + +/* + * Prepare to read a message. + */ +static int prepare_read_message(struct ceph_connection *con) +{ + dout("prepare_read_message %p\n", con); + BUG_ON(con->in_msg != NULL); + con->in_base_pos = 0; + con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; + return 0; +} + + +static int read_partial(struct ceph_connection *con, + int *to, int size, void *object) +{ + *to += size; + while (con->in_base_pos < *to) { + int left = *to - con->in_base_pos; + int have = size - left; + int ret = ceph_tcp_recvmsg(con->sock, object + have, left); + if (ret <= 0) + return ret; + con->in_base_pos += ret; + } + return 1; +} + + +/* + * Read all or part of the connect-side handshake on a new connection + */ +static int read_partial_banner(struct ceph_connection *con) +{ + int ret, to = 0; + + dout("read_partial_banner %p at %d\n", con, con->in_base_pos); + + /* peer's banner */ + ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner); + if (ret <= 0) + goto out; + ret = read_partial(con, &to, sizeof(con->actual_peer_addr), + &con->actual_peer_addr); + if (ret <= 0) + goto out; + ret = read_partial(con, &to, sizeof(con->peer_addr_for_me), + &con->peer_addr_for_me); + if (ret <= 0) + goto out; +out: + return ret; +} + +static int read_partial_connect(struct ceph_connection *con) +{ + int ret, to = 0; + + dout("read_partial_connect %p at %d\n", con, con->in_base_pos); + + ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply); + if (ret <= 0) + goto out; + ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len), + con->auth_reply_buf); + if (ret <= 0) + goto out; + + dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", + con, (int)con->in_reply.tag, + le32_to_cpu(con->in_reply.connect_seq), + le32_to_cpu(con->in_reply.global_seq)); +out: + return ret; + +} + +/* + * Verify the hello banner looks okay. 
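+ * A mismatch usually means the remote end is not speaking the ceph
+ * messenger protocol at all, so the attempt is failed with a
+ * protocol error.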
+ */ +static int verify_hello(struct ceph_connection *con) +{ + if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { + pr_err("connect to %s got bad banner\n", + ceph_pr_addr(&con->peer_addr.in_addr)); + con->error_msg = "protocol error, bad banner"; + return -1; + } + return 0; +} + +static bool addr_is_blank(struct sockaddr_storage *ss) +{ + switch (ss->ss_family) { + case AF_INET: + return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0; + case AF_INET6: + return + ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 && + ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 && + ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 && + ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0; + } + return false; +} + +static int addr_port(struct sockaddr_storage *ss) +{ + switch (ss->ss_family) { + case AF_INET: + return ntohs(((struct sockaddr_in *)ss)->sin_port); + case AF_INET6: + return ntohs(((struct sockaddr_in6 *)ss)->sin6_port); + } + return 0; +} + +static void addr_set_port(struct sockaddr_storage *ss, int p) +{ + switch (ss->ss_family) { + case AF_INET: + ((struct sockaddr_in *)ss)->sin_port = htons(p); + break; + case AF_INET6: + ((struct sockaddr_in6 *)ss)->sin6_port = htons(p); + break; + } +} + +/* + * Parse an ip[:port] list into an addr array. Use the default + * monitor port if a port isn't specified. + */ +int ceph_parse_ips(const char *c, const char *end, + struct ceph_entity_addr *addr, + int max_count, int *count) +{ + int i; + const char *p = c; + + dout("parse_ips on '%.*s'\n", (int)(end-c), c); + for (i = 0; i < max_count; i++) { + const char *ipend; + struct sockaddr_storage *ss = &addr[i].in_addr; + struct sockaddr_in *in4 = (void *)ss; + struct sockaddr_in6 *in6 = (void *)ss; + int port; + char delim = ','; + + if (*p == '[') { + delim = ']'; + p++; + } + + memset(ss, 0, sizeof(*ss)); + if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr, + delim, &ipend)) + ss->ss_family = AF_INET; + else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, + delim, &ipend)) + ss->ss_family = AF_INET6; + else + goto bad; + p = ipend; + + if (delim == ']') { + if (*p != ']') { + dout("missing matching ']'\n"); + goto bad; + } + p++; + } + + /* port? */ + if (p < end && *p == ':') { + port = 0; + p++; + while (p < end && *p >= '0' && *p <= '9') { + port = (port * 10) + (*p - '0'); + p++; + } + if (port > 65535 || port == 0) + goto bad; + } else { + port = CEPH_MON_PORT; + } + + addr_set_port(ss, port); + + dout("parse_ips got %s\n", ceph_pr_addr(ss)); + + if (p == end) + break; + if (*p != ',') + goto bad; + p++; + } + + if (p != end) + goto bad; + + if (count) + *count = i + 1; + return 0; + +bad: + pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c); + return -EINVAL; +} +EXPORT_SYMBOL(ceph_parse_ips); + +static int process_banner(struct ceph_connection *con) +{ + dout("process_banner on %p\n", con); + + if (verify_hello(con) < 0) + return -1; + + ceph_decode_addr(&con->actual_peer_addr); + ceph_decode_addr(&con->peer_addr_for_me); + + /* + * Make sure the other end is who we wanted. note that the other + * end may not yet know their ip address, so if it's 0.0.0.0, give + * them the benefit of the doubt. 
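+ * Even then the nonce still has to match what we expected, so a
+ * blank address alone is not accepted as proof of identity.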
+ */ + if (memcmp(&con->peer_addr, &con->actual_peer_addr, + sizeof(con->peer_addr)) != 0 && + !(addr_is_blank(&con->actual_peer_addr.in_addr) && + con->actual_peer_addr.nonce == con->peer_addr.nonce)) { + pr_warning("wrong peer, want %s/%d, got %s/%d\n", + ceph_pr_addr(&con->peer_addr.in_addr), + (int)le32_to_cpu(con->peer_addr.nonce), + ceph_pr_addr(&con->actual_peer_addr.in_addr), + (int)le32_to_cpu(con->actual_peer_addr.nonce)); + con->error_msg = "wrong peer at address"; + return -1; + } + + /* + * did we learn our address? + */ + if (addr_is_blank(&con->msgr->inst.addr.in_addr)) { + int port = addr_port(&con->msgr->inst.addr.in_addr); + + memcpy(&con->msgr->inst.addr.in_addr, + &con->peer_addr_for_me.in_addr, + sizeof(con->peer_addr_for_me.in_addr)); + addr_set_port(&con->msgr->inst.addr.in_addr, port); + encode_my_addr(con->msgr); + dout("process_banner learned my addr is %s\n", + ceph_pr_addr(&con->msgr->inst.addr.in_addr)); + } + + set_bit(NEGOTIATING, &con->state); + prepare_read_connect(con); + return 0; +} + +static void fail_protocol(struct ceph_connection *con) +{ + reset_connection(con); + set_bit(CLOSED, &con->state); /* in case there's queued work */ + + mutex_unlock(&con->mutex); + if (con->ops->bad_proto) + con->ops->bad_proto(con); + mutex_lock(&con->mutex); +} + +static int process_connect(struct ceph_connection *con) +{ + u64 sup_feat = con->msgr->supported_features; + u64 req_feat = con->msgr->required_features; + u64 server_feat = le64_to_cpu(con->in_reply.features); + int ret; + + dout("process_connect on %p tag %d\n", con, (int)con->in_tag); + + switch (con->in_reply.tag) { + case CEPH_MSGR_TAG_FEATURES: + pr_err("%s%lld %s feature set mismatch," + " my %llx < server's %llx, missing %llx\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr.in_addr), + sup_feat, server_feat, server_feat & ~sup_feat); + con->error_msg = "missing required protocol features"; + fail_protocol(con); + return -1; + + case CEPH_MSGR_TAG_BADPROTOVER: + pr_err("%s%lld %s protocol version mismatch," + " my %d != server's %d\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr.in_addr), + le32_to_cpu(con->out_connect.protocol_version), + le32_to_cpu(con->in_reply.protocol_version)); + con->error_msg = "protocol version mismatch"; + fail_protocol(con); + return -1; + + case CEPH_MSGR_TAG_BADAUTHORIZER: + con->auth_retry++; + dout("process_connect %p got BADAUTHORIZER attempt %d\n", con, + con->auth_retry); + if (con->auth_retry == 2) { + con->error_msg = "connect authorization failure"; + return -1; + } + con->auth_retry = 1; + ret = prepare_write_connect(con->msgr, con, 0); + if (ret < 0) + return ret; + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_RESETSESSION: + /* + * If we connected with a large connect_seq but the peer + * has no record of a session with us (no connection, or + * connect_seq == 0), they will send RESETSESION to indicate + * that they must have reset their session, and may have + * dropped messages. + */ + dout("process_connect got RESET peer seq %u\n", + le32_to_cpu(con->in_connect.connect_seq)); + pr_err("%s%lld %s connection reset\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr.in_addr)); + reset_connection(con); + prepare_write_connect(con->msgr, con, 0); + prepare_read_connect(con); + + /* Tell ceph about it. 
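+ * The peer_reset callback may call back into the messenger (e.g. to
+ * resend requests), so con->mutex is dropped around it and a racing
+ * close or reopen is re-checked afterwards.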
*/ + mutex_unlock(&con->mutex); + pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name)); + if (con->ops->peer_reset) + con->ops->peer_reset(con); + mutex_lock(&con->mutex); + if (test_bit(CLOSED, &con->state) || + test_bit(OPENING, &con->state)) + return -EAGAIN; + break; + + case CEPH_MSGR_TAG_RETRY_SESSION: + /* + * If we sent a smaller connect_seq than the peer has, try + * again with a larger value. + */ + dout("process_connect got RETRY my seq = %u, peer_seq = %u\n", + le32_to_cpu(con->out_connect.connect_seq), + le32_to_cpu(con->in_connect.connect_seq)); + con->connect_seq = le32_to_cpu(con->in_connect.connect_seq); + prepare_write_connect(con->msgr, con, 0); + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_RETRY_GLOBAL: + /* + * If we sent a smaller global_seq than the peer has, try + * again with a larger value. + */ + dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", + con->peer_global_seq, + le32_to_cpu(con->in_connect.global_seq)); + get_global_seq(con->msgr, + le32_to_cpu(con->in_connect.global_seq)); + prepare_write_connect(con->msgr, con, 0); + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_READY: + if (req_feat & ~server_feat) { + pr_err("%s%lld %s protocol feature mismatch," + " my required %llx > server's %llx, need %llx\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr.in_addr), + req_feat, server_feat, req_feat & ~server_feat); + con->error_msg = "missing required protocol features"; + fail_protocol(con); + return -1; + } + clear_bit(CONNECTING, &con->state); + con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); + con->connect_seq++; + con->peer_features = server_feat; + dout("process_connect got READY gseq %d cseq %d (%d)\n", + con->peer_global_seq, + le32_to_cpu(con->in_reply.connect_seq), + con->connect_seq); + WARN_ON(con->connect_seq != + le32_to_cpu(con->in_reply.connect_seq)); + + if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) + set_bit(LOSSYTX, &con->state); + + prepare_read_tag(con); + break; + + case CEPH_MSGR_TAG_WAIT: + /* + * If there is a connection race (we are opening + * connections to each other), one of us may just have + * to WAIT. This shouldn't happen if we are the + * client. + */ + pr_err("process_connect got WAIT as client\n"); + con->error_msg = "protocol error, got WAIT as client"; + return -1; + + default: + pr_err("connect protocol error, will retry\n"); + con->error_msg = "protocol error, garbage tag during connect"; + return -1; + } + return 0; +} + + +/* + * read (part of) an ack + */ +static int read_partial_ack(struct ceph_connection *con) +{ + int to = 0; + + return read_partial(con, &to, sizeof(con->in_temp_ack), + &con->in_temp_ack); +} + + +/* + * We can finally discard anything that's been acked. 
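+ * Messages sit on out_sent until the peer acknowledges them; every
+ * entry with a seq at or below the acked value can now be dropped,
+ * since the peer has taken responsibility for it.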
+ */ +static void process_ack(struct ceph_connection *con) +{ + struct ceph_msg *m; + u64 ack = le64_to_cpu(con->in_temp_ack); + u64 seq; + + while (!list_empty(&con->out_sent)) { + m = list_first_entry(&con->out_sent, struct ceph_msg, + list_head); + seq = le64_to_cpu(m->hdr.seq); + if (seq > ack) + break; + dout("got ack for seq %llu type %d at %p\n", seq, + le16_to_cpu(m->hdr.type), m); + ceph_msg_remove(m); + } + prepare_read_tag(con); +} + + + + +static int read_partial_message_section(struct ceph_connection *con, + struct kvec *section, + unsigned int sec_len, u32 *crc) +{ + int ret, left; + + BUG_ON(!section); + + while (section->iov_len < sec_len) { + BUG_ON(section->iov_base == NULL); + left = sec_len - section->iov_len; + ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base + + section->iov_len, left); + if (ret <= 0) + return ret; + section->iov_len += ret; + if (section->iov_len == sec_len) + *crc = crc32c(0, section->iov_base, + section->iov_len); + } + + return 1; +} + +static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip); + + +static int read_partial_message_pages(struct ceph_connection *con, + struct page **pages, + unsigned data_len, int datacrc) +{ + void *p; + int ret; + int left; + + left = min((int)(data_len - con->in_msg_pos.data_pos), + (int)(PAGE_SIZE - con->in_msg_pos.page_pos)); + /* (page) data */ + BUG_ON(pages == NULL); + p = kmap(pages[con->in_msg_pos.page]); + ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, + left); + if (ret > 0 && datacrc) + con->in_data_crc = + crc32c(con->in_data_crc, + p + con->in_msg_pos.page_pos, ret); + kunmap(pages[con->in_msg_pos.page]); + if (ret <= 0) + return ret; + con->in_msg_pos.data_pos += ret; + con->in_msg_pos.page_pos += ret; + if (con->in_msg_pos.page_pos == PAGE_SIZE) { + con->in_msg_pos.page_pos = 0; + con->in_msg_pos.page++; + } + + return ret; +} + +#ifdef CONFIG_BLOCK +static int read_partial_message_bio(struct ceph_connection *con, + struct bio **bio_iter, int *bio_seg, + unsigned data_len, int datacrc) +{ + struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg); + void *p; + int ret, left; + + if (IS_ERR(bv)) + return PTR_ERR(bv); + + left = min((int)(data_len - con->in_msg_pos.data_pos), + (int)(bv->bv_len - con->in_msg_pos.page_pos)); + + p = kmap(bv->bv_page) + bv->bv_offset; + + ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, + left); + if (ret > 0 && datacrc) + con->in_data_crc = + crc32c(con->in_data_crc, + p + con->in_msg_pos.page_pos, ret); + kunmap(bv->bv_page); + if (ret <= 0) + return ret; + con->in_msg_pos.data_pos += ret; + con->in_msg_pos.page_pos += ret; + if (con->in_msg_pos.page_pos == bv->bv_len) { + con->in_msg_pos.page_pos = 0; + iter_bio_next(bio_iter, bio_seg); + } + + return ret; +} +#endif + +/* + * read (part of) a message. 
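+ * This is restartable: in_base_pos and in_msg_pos record how far we
+ * have gotten, so a short read just returns 0 and the next pass
+ * resumes where it left off.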
+ */ +static int read_partial_message(struct ceph_connection *con) +{ + struct ceph_msg *m = con->in_msg; + int ret; + int to, left; + unsigned front_len, middle_len, data_len; + int datacrc = con->msgr->nocrc; + int skip; + u64 seq; + + dout("read_partial_message con %p msg %p\n", con, m); + + /* header */ + while (con->in_base_pos < sizeof(con->in_hdr)) { + left = sizeof(con->in_hdr) - con->in_base_pos; + ret = ceph_tcp_recvmsg(con->sock, + (char *)&con->in_hdr + con->in_base_pos, + left); + if (ret <= 0) + return ret; + con->in_base_pos += ret; + if (con->in_base_pos == sizeof(con->in_hdr)) { + u32 crc = crc32c(0, (void *)&con->in_hdr, + sizeof(con->in_hdr) - sizeof(con->in_hdr.crc)); + if (crc != le32_to_cpu(con->in_hdr.crc)) { + pr_err("read_partial_message bad hdr " + " crc %u != expected %u\n", + crc, con->in_hdr.crc); + return -EBADMSG; + } + } + } + front_len = le32_to_cpu(con->in_hdr.front_len); + if (front_len > CEPH_MSG_MAX_FRONT_LEN) + return -EIO; + middle_len = le32_to_cpu(con->in_hdr.middle_len); + if (middle_len > CEPH_MSG_MAX_DATA_LEN) + return -EIO; + data_len = le32_to_cpu(con->in_hdr.data_len); + if (data_len > CEPH_MSG_MAX_DATA_LEN) + return -EIO; + + /* verify seq# */ + seq = le64_to_cpu(con->in_hdr.seq); + if ((s64)seq - (s64)con->in_seq < 1) { + pr_info("skipping %s%lld %s seq %lld expected %lld\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr.in_addr), + seq, con->in_seq + 1); + con->in_base_pos = -front_len - middle_len - data_len - + sizeof(m->footer); + con->in_tag = CEPH_MSGR_TAG_READY; + return 0; + } else if ((s64)seq - (s64)con->in_seq > 1) { + pr_err("read_partial_message bad seq %lld expected %lld\n", + seq, con->in_seq + 1); + con->error_msg = "bad message sequence # for incoming message"; + return -EBADMSG; + } + + /* allocate message? 
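+ * (the alloc_msg hook may also ask us to skip this message, in which
+ * case the remaining payload is consumed and discarded by way of a
+ * negative in_base_pos)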
*/ + if (!con->in_msg) { + dout("got hdr type %d front %d data %d\n", con->in_hdr.type, + con->in_hdr.front_len, con->in_hdr.data_len); + skip = 0; + con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); + if (skip) { + /* skip this message */ + dout("alloc_msg said skip message\n"); + BUG_ON(con->in_msg); + con->in_base_pos = -front_len - middle_len - data_len - + sizeof(m->footer); + con->in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; + return 0; + } + if (!con->in_msg) { + con->error_msg = + "error allocating memory for incoming message"; + return -ENOMEM; + } + m = con->in_msg; + m->front.iov_len = 0; /* haven't read it yet */ + if (m->middle) + m->middle->vec.iov_len = 0; + + con->in_msg_pos.page = 0; + if (m->pages) + con->in_msg_pos.page_pos = m->page_alignment; + else + con->in_msg_pos.page_pos = 0; + con->in_msg_pos.data_pos = 0; + } + + /* front */ + ret = read_partial_message_section(con, &m->front, front_len, + &con->in_front_crc); + if (ret <= 0) + return ret; + + /* middle */ + if (m->middle) { + ret = read_partial_message_section(con, &m->middle->vec, + middle_len, + &con->in_middle_crc); + if (ret <= 0) + return ret; + } +#ifdef CONFIG_BLOCK + if (m->bio && !m->bio_iter) + init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg); +#endif + + /* (page) data */ + while (con->in_msg_pos.data_pos < data_len) { + if (m->pages) { + ret = read_partial_message_pages(con, m->pages, + data_len, datacrc); + if (ret <= 0) + return ret; +#ifdef CONFIG_BLOCK + } else if (m->bio) { + + ret = read_partial_message_bio(con, + &m->bio_iter, &m->bio_seg, + data_len, datacrc); + if (ret <= 0) + return ret; +#endif + } else { + BUG_ON(1); + } + } + + /* footer */ + to = sizeof(m->hdr) + sizeof(m->footer); + while (con->in_base_pos < to) { + left = to - con->in_base_pos; + ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer + + (con->in_base_pos - sizeof(m->hdr)), + left); + if (ret <= 0) + return ret; + con->in_base_pos += ret; + } + dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", + m, front_len, m->footer.front_crc, middle_len, + m->footer.middle_crc, data_len, m->footer.data_crc); + + /* crc ok? */ + if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) { + pr_err("read_partial_message %p front crc %u != exp. %u\n", + m, con->in_front_crc, m->footer.front_crc); + return -EBADMSG; + } + if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) { + pr_err("read_partial_message %p middle crc %u != exp %u\n", + m, con->in_middle_crc, m->footer.middle_crc); + return -EBADMSG; + } + if (datacrc && + (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && + con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { + pr_err("read_partial_message %p data crc %u != exp. %u\n", m, + con->in_data_crc, le32_to_cpu(m->footer.data_crc)); + return -EBADMSG; + } + + return 1; /* done! */ +} + +/* + * Process message. This happens in the worker thread. The callback should + * be careful not to do anything that waits on other incoming messages or it + * may deadlock. 
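+ * con->mutex is dropped for the duration of the dispatch callback so
+ * the handler is free to queue replies on this same connection.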
+ */ +static void process_message(struct ceph_connection *con) +{ + struct ceph_msg *msg; + + msg = con->in_msg; + con->in_msg = NULL; + + /* if first message, set peer_name */ + if (con->peer_name.type == 0) + con->peer_name = msg->hdr.src; + + con->in_seq++; + mutex_unlock(&con->mutex); + + dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", + msg, le64_to_cpu(msg->hdr.seq), + ENTITY_NAME(msg->hdr.src), + le16_to_cpu(msg->hdr.type), + ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), + le32_to_cpu(msg->hdr.front_len), + le32_to_cpu(msg->hdr.data_len), + con->in_front_crc, con->in_middle_crc, con->in_data_crc); + con->ops->dispatch(con, msg); + + mutex_lock(&con->mutex); + prepare_read_tag(con); +} + + +/* + * Write something to the socket. Called in a worker thread when the + * socket appears to be writeable and we have something ready to send. + */ +static int try_write(struct ceph_connection *con) +{ + struct ceph_messenger *msgr = con->msgr; + int ret = 1; + + dout("try_write start %p state %lu nref %d\n", con, con->state, + atomic_read(&con->nref)); + +more: + dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); + + /* open the socket first? */ + if (con->sock == NULL) { + prepare_write_banner(msgr, con); + prepare_write_connect(msgr, con, 1); + prepare_read_banner(con); + set_bit(CONNECTING, &con->state); + clear_bit(NEGOTIATING, &con->state); + + BUG_ON(con->in_msg); + con->in_tag = CEPH_MSGR_TAG_READY; + dout("try_write initiating connect on %p new state %lu\n", + con, con->state); + con->sock = ceph_tcp_connect(con); + if (IS_ERR(con->sock)) { + con->sock = NULL; + con->error_msg = "connect error"; + ret = -1; + goto out; + } + } + +more_kvec: + /* kvec data queued? */ + if (con->out_skip) { + ret = write_partial_skip(con); + if (ret <= 0) + goto out; + } + if (con->out_kvec_left) { + ret = write_partial_kvec(con); + if (ret <= 0) + goto out; + } + + /* msg pages? */ + if (con->out_msg) { + if (con->out_msg_done) { + ceph_msg_put(con->out_msg); + con->out_msg = NULL; /* we're done with this one */ + goto do_next; + } + + ret = write_partial_msg_pages(con); + if (ret == 1) + goto more_kvec; /* we need to send the footer, too! */ + if (ret == 0) + goto out; + if (ret < 0) { + dout("try_write write_partial_msg_pages err %d\n", + ret); + goto out; + } + } + +do_next: + if (!test_bit(CONNECTING, &con->state)) { + /* is anything else pending? */ + if (!list_empty(&con->out_queue)) { + prepare_write_message(con); + goto more; + } + if (con->in_seq > con->in_seq_acked) { + prepare_write_ack(con); + goto more; + } + if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) { + prepare_write_keepalive(con); + goto more; + } + } + + /* Nothing to do! */ + clear_bit(WRITE_PENDING, &con->state); + dout("try_write nothing else to write.\n"); + ret = 0; +out: + dout("try_write done on %p ret %d\n", con, ret); + return ret; +} + + + +/* + * Read what we can from the socket. + */ +static int try_read(struct ceph_connection *con) +{ + int ret = -1; + + if (!con->sock) + return 0; + + if (test_bit(STANDBY, &con->state)) + return 0; + + dout("try_read start on %p\n", con); + +more: + dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, + con->in_base_pos); + + /* + * process_connect and process_message drop and re-take + * con->mutex. make sure we handle a racing close or reopen. 
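+ * Returning -EAGAIN makes con_work restart from the top so that the
+ * new state is honoured instead of acting on stale state.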
+ */ + if (test_bit(CLOSED, &con->state) || + test_bit(OPENING, &con->state)) { + ret = -EAGAIN; + goto out; + } + + if (test_bit(CONNECTING, &con->state)) { + if (!test_bit(NEGOTIATING, &con->state)) { + dout("try_read connecting\n"); + ret = read_partial_banner(con); + if (ret <= 0) + goto out; + ret = process_banner(con); + if (ret < 0) + goto out; + } + ret = read_partial_connect(con); + if (ret <= 0) + goto out; + ret = process_connect(con); + if (ret < 0) + goto out; + goto more; + } + + if (con->in_base_pos < 0) { + /* + * skipping + discarding content. + * + * FIXME: there must be a better way to do this! + */ + static char buf[1024]; + int skip = min(1024, -con->in_base_pos); + dout("skipping %d / %d bytes\n", skip, -con->in_base_pos); + ret = ceph_tcp_recvmsg(con->sock, buf, skip); + if (ret <= 0) + goto out; + con->in_base_pos += ret; + if (con->in_base_pos) + goto more; + } + if (con->in_tag == CEPH_MSGR_TAG_READY) { + /* + * what's next? + */ + ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1); + if (ret <= 0) + goto out; + dout("try_read got tag %d\n", (int)con->in_tag); + switch (con->in_tag) { + case CEPH_MSGR_TAG_MSG: + prepare_read_message(con); + break; + case CEPH_MSGR_TAG_ACK: + prepare_read_ack(con); + break; + case CEPH_MSGR_TAG_CLOSE: + set_bit(CLOSED, &con->state); /* fixme */ + goto out; + default: + goto bad_tag; + } + } + if (con->in_tag == CEPH_MSGR_TAG_MSG) { + ret = read_partial_message(con); + if (ret <= 0) { + switch (ret) { + case -EBADMSG: + con->error_msg = "bad crc"; + ret = -EIO; + break; + case -EIO: + con->error_msg = "io error"; + break; + } + goto out; + } + if (con->in_tag == CEPH_MSGR_TAG_READY) + goto more; + process_message(con); + goto more; + } + if (con->in_tag == CEPH_MSGR_TAG_ACK) { + ret = read_partial_ack(con); + if (ret <= 0) + goto out; + process_ack(con); + goto more; + } + +out: + dout("try_read done on %p ret %d\n", con, ret); + return ret; + +bad_tag: + pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag); + con->error_msg = "protocol error, garbage tag"; + ret = -1; + goto out; +} + + +/* + * Atomically queue work on a connection. Bump @con reference to + * avoid races with connection teardown. + */ +static void queue_con(struct ceph_connection *con) +{ + if (test_bit(DEAD, &con->state)) { + dout("queue_con %p ignoring: DEAD\n", + con); + return; + } + + if (!con->ops->get(con)) { + dout("queue_con %p ref count 0\n", con); + return; + } + + if (!queue_delayed_work(ceph_msgr_wq, &con->work, 0)) { + dout("queue_con %p - already queued\n", con); + con->ops->put(con); + } else { + dout("queue_con %p\n", con); + } +} + +/* + * Do some work on a connection. Drop a connection ref when we're done. + */ +static void con_work(struct work_struct *work) +{ + struct ceph_connection *con = container_of(work, struct ceph_connection, + work.work); + int ret; + + mutex_lock(&con->mutex); +restart: + if (test_and_clear_bit(BACKOFF, &con->state)) { + dout("con_work %p backing off\n", con); + if (queue_delayed_work(ceph_msgr_wq, &con->work, + round_jiffies_relative(con->delay))) { + dout("con_work %p backoff %lu\n", con, con->delay); + mutex_unlock(&con->mutex); + return; + } else { + con->ops->put(con); + dout("con_work %p FAILED to back off %lu\n", con, + con->delay); + } + } + + if (test_bit(STANDBY, &con->state)) { + dout("con_work %p STANDBY\n", con); + goto done; + } + if (test_bit(CLOSED, &con->state)) { /* e.g. 
if we are replaced */ + dout("con_work CLOSED\n"); + con_close_socket(con); + goto done; + } + if (test_and_clear_bit(OPENING, &con->state)) { + /* reopen w/ new peer */ + dout("con_work OPENING\n"); + con_close_socket(con); + } + + if (test_and_clear_bit(SOCK_CLOSED, &con->state)) + goto fault; + + ret = try_read(con); + if (ret == -EAGAIN) + goto restart; + if (ret < 0) + goto fault; + + ret = try_write(con); + if (ret == -EAGAIN) + goto restart; + if (ret < 0) + goto fault; + +done: + mutex_unlock(&con->mutex); +done_unlocked: + con->ops->put(con); + return; + +fault: + mutex_unlock(&con->mutex); + ceph_fault(con); /* error/fault path */ + goto done_unlocked; +} + + +/* + * Generic error/fault handler. A retry mechanism is used with + * exponential backoff + */ +static void ceph_fault(struct ceph_connection *con) +{ + pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); + dout("fault %p state %lu to peer %s\n", + con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); + + if (test_bit(LOSSYTX, &con->state)) { + dout("fault on LOSSYTX channel\n"); + goto out; + } + + mutex_lock(&con->mutex); + if (test_bit(CLOSED, &con->state)) + goto out_unlock; + + con_close_socket(con); + + if (con->in_msg) { + ceph_msg_put(con->in_msg); + con->in_msg = NULL; + } + + /* Requeue anything that hasn't been acked */ + list_splice_init(&con->out_sent, &con->out_queue); + + /* If there are no messages queued or keepalive pending, place + * the connection in a STANDBY state */ + if (list_empty(&con->out_queue) && + !test_bit(KEEPALIVE_PENDING, &con->state)) { + dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); + clear_bit(WRITE_PENDING, &con->state); + set_bit(STANDBY, &con->state); + } else { + /* retry after a delay. */ + if (con->delay == 0) + con->delay = BASE_DELAY_INTERVAL; + else if (con->delay < MAX_DELAY_INTERVAL) + con->delay *= 2; + con->ops->get(con); + if (queue_delayed_work(ceph_msgr_wq, &con->work, + round_jiffies_relative(con->delay))) { + dout("fault queued %p delay %lu\n", con, con->delay); + } else { + con->ops->put(con); + dout("fault failed to queue %p delay %lu, backoff\n", + con, con->delay); + /* + * In many cases we see a socket state change + * while con_work is running and end up + * queuing (non-delayed) work, such that we + * can't backoff with a delay. Set a flag so + * that when con_work restarts we schedule the + * delay then. + */ + set_bit(BACKOFF, &con->state); + } + } + +out_unlock: + mutex_unlock(&con->mutex); +out: + /* + * in case we faulted due to authentication, invalidate our + * current tickets so that we can get new ones. 
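+ * (this only applies if we were partway through an auth handshake,
+ * i.e. auth_retry is nonzero and the caller supplied an
+ * invalidate_authorizer op)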
+ */ + if (con->auth_retry && con->ops->invalidate_authorizer) { + dout("calling invalidate_authorizer()\n"); + con->ops->invalidate_authorizer(con); + } + + if (con->ops->fault) + con->ops->fault(con); +} + + + +/* + * create a new messenger instance + */ +struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr, + u32 supported_features, + u32 required_features) +{ + struct ceph_messenger *msgr; + + msgr = kzalloc(sizeof(*msgr), GFP_KERNEL); + if (msgr == NULL) + return ERR_PTR(-ENOMEM); + + msgr->supported_features = supported_features; + msgr->required_features = required_features; + + spin_lock_init(&msgr->global_seq_lock); + + /* the zero page is needed if a request is "canceled" while the message + * is being written over the socket */ + msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO); + if (!msgr->zero_page) { + kfree(msgr); + return ERR_PTR(-ENOMEM); + } + kmap(msgr->zero_page); + + if (myaddr) + msgr->inst.addr = *myaddr; + + /* select a random nonce */ + msgr->inst.addr.type = 0; + get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); + encode_my_addr(msgr); + + dout("messenger_create %p\n", msgr); + return msgr; +} +EXPORT_SYMBOL(ceph_messenger_create); + +void ceph_messenger_destroy(struct ceph_messenger *msgr) +{ + dout("destroy %p\n", msgr); + kunmap(msgr->zero_page); + __free_page(msgr->zero_page); + kfree(msgr); + dout("destroyed messenger %p\n", msgr); +} +EXPORT_SYMBOL(ceph_messenger_destroy); + +static void clear_standby(struct ceph_connection *con) +{ + /* come back from STANDBY? */ + if (test_and_clear_bit(STANDBY, &con->state)) { + mutex_lock(&con->mutex); + dout("clear_standby %p and ++connect_seq\n", con); + con->connect_seq++; + WARN_ON(test_bit(WRITE_PENDING, &con->state)); + WARN_ON(test_bit(KEEPALIVE_PENDING, &con->state)); + mutex_unlock(&con->mutex); + } +} + +/* + * Queue up an outgoing message on the given connection. 
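+ *
+ * Ownership of the caller's message reference passes to the
+ * messenger, which drops it when it is finished with the message (or
+ * immediately if the connection is already closed).  Callers that
+ * want to reuse a message take an extra reference first, as the
+ * mon_client code later in this patch does for m_auth.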
+ */ +void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) +{ + if (test_bit(CLOSED, &con->state)) { + dout("con_send %p closed, dropping %p\n", con, msg); + ceph_msg_put(msg); + return; + } + + /* set src+dst */ + msg->hdr.src = con->msgr->inst.name; + + BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); + + msg->needs_out_seq = true; + + /* queue */ + mutex_lock(&con->mutex); + BUG_ON(!list_empty(&msg->list_head)); + list_add_tail(&msg->list_head, &con->out_queue); + dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg, + ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type), + ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), + le32_to_cpu(msg->hdr.front_len), + le32_to_cpu(msg->hdr.middle_len), + le32_to_cpu(msg->hdr.data_len)); + mutex_unlock(&con->mutex); + + /* if there wasn't anything waiting to send before, queue + * new work */ + clear_standby(con); + if (test_and_set_bit(WRITE_PENDING, &con->state) == 0) + queue_con(con); +} +EXPORT_SYMBOL(ceph_con_send); + +/* + * Revoke a message that was previously queued for send + */ +void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) +{ + mutex_lock(&con->mutex); + if (!list_empty(&msg->list_head)) { + dout("con_revoke %p msg %p - was on queue\n", con, msg); + list_del_init(&msg->list_head); + ceph_msg_put(msg); + msg->hdr.seq = 0; + } + if (con->out_msg == msg) { + dout("con_revoke %p msg %p - was sending\n", con, msg); + con->out_msg = NULL; + if (con->out_kvec_is_msg) { + con->out_skip = con->out_kvec_bytes; + con->out_kvec_is_msg = false; + } + ceph_msg_put(msg); + msg->hdr.seq = 0; + } + mutex_unlock(&con->mutex); +} + +/* + * Revoke a message that we may be reading data into + */ +void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) +{ + mutex_lock(&con->mutex); + if (con->in_msg && con->in_msg == msg) { + unsigned front_len = le32_to_cpu(con->in_hdr.front_len); + unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len); + unsigned data_len = le32_to_cpu(con->in_hdr.data_len); + + /* skip rest of message */ + dout("con_revoke_pages %p msg %p revoked\n", con, msg); + con->in_base_pos = con->in_base_pos - + sizeof(struct ceph_msg_header) - + front_len - + middle_len - + data_len - + sizeof(struct ceph_msg_footer); + ceph_msg_put(con->in_msg); + con->in_msg = NULL; + con->in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; + } else { + dout("con_revoke_pages %p msg %p pages %p no-op\n", + con, con->in_msg, msg); + } + mutex_unlock(&con->mutex); +} + +/* + * Queue a keepalive byte to ensure the tcp connection is alive. + */ +void ceph_con_keepalive(struct ceph_connection *con) +{ + dout("con_keepalive %p\n", con); + clear_standby(con); + if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 && + test_and_set_bit(WRITE_PENDING, &con->state) == 0) + queue_con(con); +} +EXPORT_SYMBOL(ceph_con_keepalive); + + +/* + * construct a new message with given type, size + * the new msg has a ref count of 1. 
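+ * Front payloads larger than a page are vmalloc'ed; callers pass gfp
+ * flags suited to their context (the receive path uses GFP_NOFS).
+ *
+ * Illustrative use (type and front_len are whatever the caller's
+ * protocol requires):
+ *
+ *	msg = ceph_msg_new(type, front_len, GFP_NOFS);
+ *	if (!msg)
+ *		return -ENOMEM;
+ *	(fill msg->front.iov_base, then hand the msg to ceph_con_send)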
+ */ +struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags) +{ + struct ceph_msg *m; + + m = kmalloc(sizeof(*m), flags); + if (m == NULL) + goto out; + kref_init(&m->kref); + INIT_LIST_HEAD(&m->list_head); + + m->hdr.tid = 0; + m->hdr.type = cpu_to_le16(type); + m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); + m->hdr.version = 0; + m->hdr.front_len = cpu_to_le32(front_len); + m->hdr.middle_len = 0; + m->hdr.data_len = 0; + m->hdr.data_off = 0; + m->hdr.reserved = 0; + m->footer.front_crc = 0; + m->footer.middle_crc = 0; + m->footer.data_crc = 0; + m->footer.flags = 0; + m->front_max = front_len; + m->front_is_vmalloc = false; + m->more_to_follow = false; + m->pool = NULL; + + /* middle */ + m->middle = NULL; + + /* data */ + m->nr_pages = 0; + m->page_alignment = 0; + m->pages = NULL; + m->pagelist = NULL; + m->bio = NULL; + m->bio_iter = NULL; + m->bio_seg = 0; + m->trail = NULL; + + /* front */ + if (front_len) { + if (front_len > PAGE_CACHE_SIZE) { + m->front.iov_base = __vmalloc(front_len, flags, + PAGE_KERNEL); + m->front_is_vmalloc = true; + } else { + m->front.iov_base = kmalloc(front_len, flags); + } + if (m->front.iov_base == NULL) { + pr_err("msg_new can't allocate %d bytes\n", + front_len); + goto out2; + } + } else { + m->front.iov_base = NULL; + } + m->front.iov_len = front_len; + + dout("ceph_msg_new %p front %d\n", m, front_len); + return m; + +out2: + ceph_msg_put(m); +out: + pr_err("msg_new can't create type %d front %d\n", type, front_len); + return NULL; +} +EXPORT_SYMBOL(ceph_msg_new); + +/* + * Allocate "middle" portion of a message, if it is needed and wasn't + * allocated by alloc_msg. This allows us to read a small fixed-size + * per-type header in the front and then gracefully fail (i.e., + * propagate the error to the caller based on info in the front) when + * the middle is too large. + */ +static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) +{ + int type = le16_to_cpu(msg->hdr.type); + int middle_len = le32_to_cpu(msg->hdr.middle_len); + + dout("alloc_middle %p type %d %s middle_len %d\n", msg, type, + ceph_msg_type_name(type), middle_len); + BUG_ON(!middle_len); + BUG_ON(msg->middle); + + msg->middle = ceph_buffer_new(middle_len, GFP_NOFS); + if (!msg->middle) + return -ENOMEM; + return 0; +} + +/* + * Generic message allocator, for incoming messages. + */ +static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip) +{ + int type = le16_to_cpu(hdr->type); + int front_len = le32_to_cpu(hdr->front_len); + int middle_len = le32_to_cpu(hdr->middle_len); + struct ceph_msg *msg = NULL; + int ret; + + if (con->ops->alloc_msg) { + mutex_unlock(&con->mutex); + msg = con->ops->alloc_msg(con, hdr, skip); + mutex_lock(&con->mutex); + if (!msg || *skip) + return NULL; + } + if (!msg) { + *skip = 0; + msg = ceph_msg_new(type, front_len, GFP_NOFS); + if (!msg) { + pr_err("unable to allocate msg type %d len %d\n", + type, front_len); + return NULL; + } + msg->page_alignment = le16_to_cpu(hdr->data_off); + } + memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); + + if (middle_len && !msg->middle) { + ret = ceph_alloc_middle(con, msg); + if (ret < 0) { + ceph_msg_put(msg); + return NULL; + } + } + + return msg; +} + + +/* + * Free a generically kmalloc'd message. + */ +void ceph_msg_kfree(struct ceph_msg *m) +{ + dout("msg_kfree %p\n", m); + if (m->front_is_vmalloc) + vfree(m->front.iov_base); + else + kfree(m->front.iov_base); + kfree(m); +} + +/* + * Drop a msg ref. 
Destroy as needed. + */ +void ceph_msg_last_put(struct kref *kref) +{ + struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); + + dout("ceph_msg_put last one on %p\n", m); + WARN_ON(!list_empty(&m->list_head)); + + /* drop middle, data, if any */ + if (m->middle) { + ceph_buffer_put(m->middle); + m->middle = NULL; + } + m->nr_pages = 0; + m->pages = NULL; + + if (m->pagelist) { + ceph_pagelist_release(m->pagelist); + kfree(m->pagelist); + m->pagelist = NULL; + } + + m->trail = NULL; + + if (m->pool) + ceph_msgpool_put(m->pool, m); + else + ceph_msg_kfree(m); +} +EXPORT_SYMBOL(ceph_msg_last_put); + +void ceph_msg_dump(struct ceph_msg *msg) +{ + pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg, + msg->front_max, msg->nr_pages); + print_hex_dump(KERN_DEBUG, "header: ", + DUMP_PREFIX_OFFSET, 16, 1, + &msg->hdr, sizeof(msg->hdr), true); + print_hex_dump(KERN_DEBUG, " front: ", + DUMP_PREFIX_OFFSET, 16, 1, + msg->front.iov_base, msg->front.iov_len, true); + if (msg->middle) + print_hex_dump(KERN_DEBUG, "middle: ", + DUMP_PREFIX_OFFSET, 16, 1, + msg->middle->vec.iov_base, + msg->middle->vec.iov_len, true); + print_hex_dump(KERN_DEBUG, "footer: ", + DUMP_PREFIX_OFFSET, 16, 1, + &msg->footer, sizeof(msg->footer), true); +} +EXPORT_SYMBOL(ceph_msg_dump); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c new file mode 100644 index 00000000..cbe31fa4 --- /dev/null +++ b/net/ceph/mon_client.c @@ -0,0 +1,1027 @@ +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +/* + * Interact with Ceph monitor cluster. Handle requests for new map + * versions, and periodically resend as needed. Also implement + * statfs() and umount(). + * + * A small cluster of Ceph "monitors" are responsible for managing critical + * cluster configuration and state information. An odd number (e.g., 3, 5) + * of cmon daemons use a modified version of the Paxos part-time parliament + * algorithm to manage the MDS map (mds cluster membership), OSD map, and + * list of clients who have mounted the file system. + * + * We maintain an open, active session with a monitor at all times in order to + * receive timely MDSMap updates. We periodically send a keepalive byte on the + * TCP socket to ensure we detect a failure. If the connection does break, we + * randomly hunt for a new monitor. Once the connection is reestablished, we + * resend any outstanding requests. + */ + +static const struct ceph_connection_operations mon_con_ops; + +static int __validate_auth(struct ceph_mon_client *monc); + +/* + * Decode a monmap blob (e.g., during mount). 
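+ * The blob is a length-prefixed chunk holding a version word, the
+ * cluster fsid, the map epoch and then num_mon entity_inst entries;
+ * anything malformed or oversized is rejected with -EINVAL.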
+ */ +struct ceph_monmap *ceph_monmap_decode(void *p, void *end) +{ + struct ceph_monmap *m = NULL; + int i, err = -EINVAL; + struct ceph_fsid fsid; + u32 epoch, num_mon; + u16 version; + u32 len; + + ceph_decode_32_safe(&p, end, len, bad); + ceph_decode_need(&p, end, len, bad); + + dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p)); + + ceph_decode_16_safe(&p, end, version, bad); + + ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad); + ceph_decode_copy(&p, &fsid, sizeof(fsid)); + epoch = ceph_decode_32(&p); + + num_mon = ceph_decode_32(&p); + ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad); + + if (num_mon >= CEPH_MAX_MON) + goto bad; + m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS); + if (m == NULL) + return ERR_PTR(-ENOMEM); + m->fsid = fsid; + m->epoch = epoch; + m->num_mon = num_mon; + ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0])); + for (i = 0; i < num_mon; i++) + ceph_decode_addr(&m->mon_inst[i].addr); + + dout("monmap_decode epoch %d, num_mon %d\n", m->epoch, + m->num_mon); + for (i = 0; i < m->num_mon; i++) + dout("monmap_decode mon%d is %s\n", i, + ceph_pr_addr(&m->mon_inst[i].addr.in_addr)); + return m; + +bad: + dout("monmap_decode failed with %d\n", err); + kfree(m); + return ERR_PTR(err); +} + +/* + * return true if *addr is included in the monmap. + */ +int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr) +{ + int i; + + for (i = 0; i < m->num_mon; i++) + if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0) + return 1; + return 0; +} + +/* + * Send an auth request. + */ +static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) +{ + monc->pending_auth = 1; + monc->m_auth->front.iov_len = len; + monc->m_auth->hdr.front_len = cpu_to_le32(len); + ceph_con_revoke(monc->con, monc->m_auth); + ceph_msg_get(monc->m_auth); /* keep our ref */ + ceph_con_send(monc->con, monc->m_auth); +} + +/* + * Close monitor session, if any. + */ +static void __close_session(struct ceph_mon_client *monc) +{ + if (monc->con) { + dout("__close_session closing mon%d\n", monc->cur_mon); + ceph_con_revoke(monc->con, monc->m_auth); + ceph_con_close(monc->con); + monc->cur_mon = -1; + monc->pending_auth = 0; + ceph_auth_reset(monc->auth); + } +} + +/* + * Open a session with a (new) monitor. + */ +static int __open_session(struct ceph_mon_client *monc) +{ + char r; + int ret; + + if (monc->cur_mon < 0) { + get_random_bytes(&r, 1); + monc->cur_mon = r % monc->monmap->num_mon; + dout("open_session num=%d r=%d -> mon%d\n", + monc->monmap->num_mon, r, monc->cur_mon); + monc->sub_sent = 0; + monc->sub_renew_after = jiffies; /* i.e., expired */ + monc->want_next_osdmap = !!monc->want_next_osdmap; + + dout("open_session mon%d opening\n", monc->cur_mon); + monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON; + monc->con->peer_name.num = cpu_to_le64(monc->cur_mon); + ceph_con_open(monc->con, + &monc->monmap->mon_inst[monc->cur_mon].addr); + + /* initiatiate authentication handshake */ + ret = ceph_auth_build_hello(monc->auth, + monc->m_auth->front.iov_base, + monc->m_auth->front_max); + __send_prepared_auth_request(monc, ret); + } else { + dout("open_session mon%d already open\n", monc->cur_mon); + } + return 0; +} + +static bool __sub_expired(struct ceph_mon_client *monc) +{ + return time_after_eq(jiffies, monc->sub_renew_after); +} + +/* + * Reschedule delayed work timer. 
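+ *
+ * The delay chosen below is deliberately short while we are still hunting
+ * for a monitor or our subscription has lapsed, and longer once the
+ * session is healthy, i.e. (in sketch form):
+ *
+ *	no session yet, or subscription expired   ->  delay = 10 * HZ
+ *	established session, subscription current ->  delay = 20 * HZ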
+ */ +static void __schedule_delayed(struct ceph_mon_client *monc) +{ + unsigned delay; + + if (monc->cur_mon < 0 || __sub_expired(monc)) + delay = 10 * HZ; + else + delay = 20 * HZ; + dout("__schedule_delayed after %u\n", delay); + schedule_delayed_work(&monc->delayed_work, delay); +} + +/* + * Send subscribe request for mdsmap and/or osdmap. + */ +static void __send_subscribe(struct ceph_mon_client *monc) +{ + dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n", + (unsigned)monc->sub_sent, __sub_expired(monc), + monc->want_next_osdmap); + if ((__sub_expired(monc) && !monc->sub_sent) || + monc->want_next_osdmap == 1) { + struct ceph_msg *msg = monc->m_subscribe; + struct ceph_mon_subscribe_item *i; + void *p, *end; + int num; + + p = msg->front.iov_base; + end = p + msg->front_max; + + num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap; + ceph_encode_32(&p, num); + + if (monc->want_next_osdmap) { + dout("__send_subscribe to 'osdmap' %u\n", + (unsigned)monc->have_osdmap); + ceph_encode_string(&p, end, "osdmap", 6); + i = p; + i->have = cpu_to_le64(monc->have_osdmap); + i->onetime = 1; + p += sizeof(*i); + monc->want_next_osdmap = 2; /* requested */ + } + if (monc->want_mdsmap) { + dout("__send_subscribe to 'mdsmap' %u+\n", + (unsigned)monc->have_mdsmap); + ceph_encode_string(&p, end, "mdsmap", 6); + i = p; + i->have = cpu_to_le64(monc->have_mdsmap); + i->onetime = 0; + p += sizeof(*i); + } + ceph_encode_string(&p, end, "monmap", 6); + i = p; + i->have = 0; + i->onetime = 0; + p += sizeof(*i); + + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + ceph_con_revoke(monc->con, msg); + ceph_con_send(monc->con, ceph_msg_get(msg)); + + monc->sub_sent = jiffies | 1; /* never 0 */ + } +} + +static void handle_subscribe_ack(struct ceph_mon_client *monc, + struct ceph_msg *msg) +{ + unsigned seconds; + struct ceph_mon_subscribe_ack *h = msg->front.iov_base; + + if (msg->front.iov_len < sizeof(*h)) + goto bad; + seconds = le32_to_cpu(h->duration); + + mutex_lock(&monc->mutex); + if (monc->hunting) { + pr_info("mon%d %s session established\n", + monc->cur_mon, + ceph_pr_addr(&monc->con->peer_addr.in_addr)); + monc->hunting = false; + } + dout("handle_subscribe_ack after %d seconds\n", seconds); + monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1; + monc->sub_sent = 0; + mutex_unlock(&monc->mutex); + return; +bad: + pr_err("got corrupt subscribe-ack msg\n"); + ceph_msg_dump(msg); +} + +/* + * Keep track of which maps we have + */ +int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got) +{ + mutex_lock(&monc->mutex); + monc->have_mdsmap = got; + mutex_unlock(&monc->mutex); + return 0; +} +EXPORT_SYMBOL(ceph_monc_got_mdsmap); + +int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got) +{ + mutex_lock(&monc->mutex); + monc->have_osdmap = got; + monc->want_next_osdmap = 0; + mutex_unlock(&monc->mutex); + return 0; +} + +/* + * Register interest in the next osdmap + */ +void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) +{ + dout("request_next_osdmap have %u\n", monc->have_osdmap); + mutex_lock(&monc->mutex); + if (!monc->want_next_osdmap) + monc->want_next_osdmap = 1; + if (monc->want_next_osdmap < 2) + __send_subscribe(monc); + mutex_unlock(&monc->mutex); +} + +/* + * + */ +int ceph_monc_open_session(struct ceph_mon_client *monc) +{ + if (!monc->con) { + monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL); + if (!monc->con) + return -ENOMEM; + ceph_con_init(monc->client->msgr, monc->con); + 
monc->con->private = monc; + monc->con->ops = &mon_con_ops; + } + + mutex_lock(&monc->mutex); + __open_session(monc); + __schedule_delayed(monc); + mutex_unlock(&monc->mutex); + return 0; +} +EXPORT_SYMBOL(ceph_monc_open_session); + +/* + * The monitor responds with mount ack indicate mount success. The + * included client ticket allows the client to talk to MDSs and OSDs. + */ +static void ceph_monc_handle_map(struct ceph_mon_client *monc, + struct ceph_msg *msg) +{ + struct ceph_client *client = monc->client; + struct ceph_monmap *monmap = NULL, *old = monc->monmap; + void *p, *end; + + mutex_lock(&monc->mutex); + + dout("handle_monmap\n"); + p = msg->front.iov_base; + end = p + msg->front.iov_len; + + monmap = ceph_monmap_decode(p, end); + if (IS_ERR(monmap)) { + pr_err("problem decoding monmap, %d\n", + (int)PTR_ERR(monmap)); + goto out; + } + + if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) { + kfree(monmap); + goto out; + } + + client->monc.monmap = monmap; + kfree(old); + +out: + mutex_unlock(&monc->mutex); + wake_up_all(&client->auth_wq); +} + +/* + * generic requests (e.g., statfs, poolop) + */ +static struct ceph_mon_generic_request *__lookup_generic_req( + struct ceph_mon_client *monc, u64 tid) +{ + struct ceph_mon_generic_request *req; + struct rb_node *n = monc->generic_request_tree.rb_node; + + while (n) { + req = rb_entry(n, struct ceph_mon_generic_request, node); + if (tid < req->tid) + n = n->rb_left; + else if (tid > req->tid) + n = n->rb_right; + else + return req; + } + return NULL; +} + +static void __insert_generic_request(struct ceph_mon_client *monc, + struct ceph_mon_generic_request *new) +{ + struct rb_node **p = &monc->generic_request_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_mon_generic_request *req = NULL; + + while (*p) { + parent = *p; + req = rb_entry(parent, struct ceph_mon_generic_request, node); + if (new->tid < req->tid) + p = &(*p)->rb_left; + else if (new->tid > req->tid) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, &monc->generic_request_tree); +} + +static void release_generic_request(struct kref *kref) +{ + struct ceph_mon_generic_request *req = + container_of(kref, struct ceph_mon_generic_request, kref); + + if (req->reply) + ceph_msg_put(req->reply); + if (req->request) + ceph_msg_put(req->request); + + kfree(req); +} + +static void put_generic_request(struct ceph_mon_generic_request *req) +{ + kref_put(&req->kref, release_generic_request); +} + +static void get_generic_request(struct ceph_mon_generic_request *req) +{ + kref_get(&req->kref); +} + +static struct ceph_msg *get_generic_reply(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip) +{ + struct ceph_mon_client *monc = con->private; + struct ceph_mon_generic_request *req; + u64 tid = le64_to_cpu(hdr->tid); + struct ceph_msg *m; + + mutex_lock(&monc->mutex); + req = __lookup_generic_req(monc, tid); + if (!req) { + dout("get_generic_reply %lld dne\n", tid); + *skip = 1; + m = NULL; + } else { + dout("get_generic_reply %lld got %p\n", tid, req->reply); + m = ceph_msg_get(req->reply); + /* + * we don't need to track the connection reading into + * this reply because we only have one open connection + * at a time, ever. 
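+ *
+ * In sketch form, with a hypothetical tid of 42, the round trip for a
+ * generic (statfs-style) request is:
+ *
+ *	do_generic_request():   req->tid = ++last_tid (42), send request
+ *	get_generic_reply():    hdr.tid == 42  ->  hand back req->reply
+ *	handle_statfs_reply():  copy result into req->buf, complete()
+ *	do_generic_request():   wakes up, returns req->result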
+ */ + } + mutex_unlock(&monc->mutex); + return m; +} + +static int do_generic_request(struct ceph_mon_client *monc, + struct ceph_mon_generic_request *req) +{ + int err; + + /* register request */ + mutex_lock(&monc->mutex); + req->tid = ++monc->last_tid; + req->request->hdr.tid = cpu_to_le64(req->tid); + __insert_generic_request(monc, req); + monc->num_generic_requests++; + ceph_con_send(monc->con, ceph_msg_get(req->request)); + mutex_unlock(&monc->mutex); + + err = wait_for_completion_interruptible(&req->completion); + + mutex_lock(&monc->mutex); + rb_erase(&req->node, &monc->generic_request_tree); + monc->num_generic_requests--; + mutex_unlock(&monc->mutex); + + if (!err) + err = req->result; + return err; +} + +/* + * statfs + */ +static void handle_statfs_reply(struct ceph_mon_client *monc, + struct ceph_msg *msg) +{ + struct ceph_mon_generic_request *req; + struct ceph_mon_statfs_reply *reply = msg->front.iov_base; + u64 tid = le64_to_cpu(msg->hdr.tid); + + if (msg->front.iov_len != sizeof(*reply)) + goto bad; + dout("handle_statfs_reply %p tid %llu\n", msg, tid); + + mutex_lock(&monc->mutex); + req = __lookup_generic_req(monc, tid); + if (req) { + *(struct ceph_statfs *)req->buf = reply->st; + req->result = 0; + get_generic_request(req); + } + mutex_unlock(&monc->mutex); + if (req) { + complete_all(&req->completion); + put_generic_request(req); + } + return; + +bad: + pr_err("corrupt generic reply, tid %llu\n", tid); + ceph_msg_dump(msg); +} + +/* + * Do a synchronous statfs(). + */ +int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) +{ + struct ceph_mon_generic_request *req; + struct ceph_mon_statfs *h; + int err; + + req = kzalloc(sizeof(*req), GFP_NOFS); + if (!req) + return -ENOMEM; + + kref_init(&req->kref); + req->buf = buf; + req->buf_len = sizeof(*buf); + init_completion(&req->completion); + + err = -ENOMEM; + req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS); + if (!req->request) + goto out; + req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS); + if (!req->reply) + goto out; + + /* fill out request */ + h = req->request->front.iov_base; + h->monhdr.have_version = 0; + h->monhdr.session_mon = cpu_to_le16(-1); + h->monhdr.session_mon_tid = 0; + h->fsid = monc->monmap->fsid; + + err = do_generic_request(monc, req); + +out: + kref_put(&req->kref, release_generic_request); + return err; +} +EXPORT_SYMBOL(ceph_monc_do_statfs); + +/* + * pool ops + */ +static int get_poolop_reply_buf(const char *src, size_t src_len, + char *dst, size_t dst_len) +{ + u32 buf_len; + + if (src_len != sizeof(u32) + dst_len) + return -EINVAL; + + buf_len = le32_to_cpu(*(u32 *)src); + if (buf_len != dst_len) + return -EINVAL; + + memcpy(dst, src + sizeof(u32), dst_len); + return 0; +} + +static void handle_poolop_reply(struct ceph_mon_client *monc, + struct ceph_msg *msg) +{ + struct ceph_mon_generic_request *req; + struct ceph_mon_poolop_reply *reply = msg->front.iov_base; + u64 tid = le64_to_cpu(msg->hdr.tid); + + if (msg->front.iov_len < sizeof(*reply)) + goto bad; + dout("handle_poolop_reply %p tid %llu\n", msg, tid); + + mutex_lock(&monc->mutex); + req = __lookup_generic_req(monc, tid); + if (req) { + if (req->buf_len && + get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply), + msg->front.iov_len - sizeof(*reply), + req->buf, req->buf_len) < 0) { + mutex_unlock(&monc->mutex); + goto bad; + } + req->result = le32_to_cpu(reply->reply_code); + get_generic_request(req); + } + mutex_unlock(&monc->mutex); + if (req) { + 
complete(&req->completion); + put_generic_request(req); + } + return; + +bad: + pr_err("corrupt generic reply, tid %llu\n", tid); + ceph_msg_dump(msg); +} + +/* + * Do a synchronous pool op. + */ +int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op, + u32 pool, u64 snapid, + char *buf, int len) +{ + struct ceph_mon_generic_request *req; + struct ceph_mon_poolop *h; + int err; + + req = kzalloc(sizeof(*req), GFP_NOFS); + if (!req) + return -ENOMEM; + + kref_init(&req->kref); + req->buf = buf; + req->buf_len = len; + init_completion(&req->completion); + + err = -ENOMEM; + req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS); + if (!req->request) + goto out; + req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS); + if (!req->reply) + goto out; + + /* fill out request */ + req->request->hdr.version = cpu_to_le16(2); + h = req->request->front.iov_base; + h->monhdr.have_version = 0; + h->monhdr.session_mon = cpu_to_le16(-1); + h->monhdr.session_mon_tid = 0; + h->fsid = monc->monmap->fsid; + h->pool = cpu_to_le32(pool); + h->op = cpu_to_le32(op); + h->auid = 0; + h->snapid = cpu_to_le64(snapid); + h->name_len = 0; + + err = do_generic_request(monc, req); + +out: + kref_put(&req->kref, release_generic_request); + return err; +} + +int ceph_monc_create_snapid(struct ceph_mon_client *monc, + u32 pool, u64 *snapid) +{ + return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, + pool, 0, (char *)snapid, sizeof(*snapid)); + +} +EXPORT_SYMBOL(ceph_monc_create_snapid); + +int ceph_monc_delete_snapid(struct ceph_mon_client *monc, + u32 pool, u64 snapid) +{ + return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, + pool, snapid, 0, 0); + +} + +/* + * Resend pending generic requests. + */ +static void __resend_generic_request(struct ceph_mon_client *monc) +{ + struct ceph_mon_generic_request *req; + struct rb_node *p; + + for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) { + req = rb_entry(p, struct ceph_mon_generic_request, node); + ceph_con_revoke(monc->con, req->request); + ceph_con_send(monc->con, ceph_msg_get(req->request)); + } +} + +/* + * Delayed work. If we haven't mounted yet, retry. Otherwise, + * renew/retry subscription as needed (in case it is timing out, or we + * got an ENOMEM). And keep the monitor connection alive. + */ +static void delayed_work(struct work_struct *work) +{ + struct ceph_mon_client *monc = + container_of(work, struct ceph_mon_client, delayed_work.work); + + dout("monc delayed_work\n"); + mutex_lock(&monc->mutex); + if (monc->hunting) { + __close_session(monc); + __open_session(monc); /* continue hunting */ + } else { + ceph_con_keepalive(monc->con); + + __validate_auth(monc); + + if (monc->auth->ops->is_authenticated(monc->auth)) + __send_subscribe(monc); + } + __schedule_delayed(monc); + mutex_unlock(&monc->mutex); +} + +/* + * On startup, we build a temporary monmap populated with the IPs + * provided by mount(2). 
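+ *
+ * For example (hypothetical addresses), two monitor addresses supplied at
+ * mount time produce a zeroed map (epoch 0, have_fsid still false) with:
+ *
+ *	mon_inst[0] = { name = {MON, 0}, addr = 192.168.0.1:6789, nonce 0 }
+ *	mon_inst[1] = { name = {MON, 1}, addr = 192.168.0.2:6789, nonce 0 }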
+ */ +static int build_initial_monmap(struct ceph_mon_client *monc) +{ + struct ceph_options *opt = monc->client->options; + struct ceph_entity_addr *mon_addr = opt->mon_addr; + int num_mon = opt->num_mon; + int i; + + /* build initial monmap */ + monc->monmap = kzalloc(sizeof(*monc->monmap) + + num_mon*sizeof(monc->monmap->mon_inst[0]), + GFP_KERNEL); + if (!monc->monmap) + return -ENOMEM; + for (i = 0; i < num_mon; i++) { + monc->monmap->mon_inst[i].addr = mon_addr[i]; + monc->monmap->mon_inst[i].addr.nonce = 0; + monc->monmap->mon_inst[i].name.type = + CEPH_ENTITY_TYPE_MON; + monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); + } + monc->monmap->num_mon = num_mon; + monc->have_fsid = false; + return 0; +} + +int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) +{ + int err = 0; + + dout("init\n"); + memset(monc, 0, sizeof(*monc)); + monc->client = cl; + monc->monmap = NULL; + mutex_init(&monc->mutex); + + err = build_initial_monmap(monc); + if (err) + goto out; + + monc->con = NULL; + + /* authentication */ + monc->auth = ceph_auth_init(cl->options->name, + cl->options->key); + if (IS_ERR(monc->auth)) + return PTR_ERR(monc->auth); + monc->auth->want_keys = + CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | + CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; + + /* msgs */ + err = -ENOMEM; + monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK, + sizeof(struct ceph_mon_subscribe_ack), + GFP_NOFS); + if (!monc->m_subscribe_ack) + goto out_monmap; + + monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS); + if (!monc->m_subscribe) + goto out_subscribe_ack; + + monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS); + if (!monc->m_auth_reply) + goto out_subscribe; + + monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS); + monc->pending_auth = 0; + if (!monc->m_auth) + goto out_auth_reply; + + monc->cur_mon = -1; + monc->hunting = true; + monc->sub_renew_after = jiffies; + monc->sub_sent = 0; + + INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); + monc->generic_request_tree = RB_ROOT; + monc->num_generic_requests = 0; + monc->last_tid = 0; + + monc->have_mdsmap = 0; + monc->have_osdmap = 0; + monc->want_next_osdmap = 1; + return 0; + +out_auth_reply: + ceph_msg_put(monc->m_auth_reply); +out_subscribe: + ceph_msg_put(monc->m_subscribe); +out_subscribe_ack: + ceph_msg_put(monc->m_subscribe_ack); +out_monmap: + kfree(monc->monmap); +out: + return err; +} +EXPORT_SYMBOL(ceph_monc_init); + +void ceph_monc_stop(struct ceph_mon_client *monc) +{ + dout("stop\n"); + cancel_delayed_work_sync(&monc->delayed_work); + + mutex_lock(&monc->mutex); + __close_session(monc); + if (monc->con) { + monc->con->private = NULL; + monc->con->ops->put(monc->con); + monc->con = NULL; + } + mutex_unlock(&monc->mutex); + + ceph_auth_destroy(monc->auth); + + ceph_msg_put(monc->m_auth); + ceph_msg_put(monc->m_auth_reply); + ceph_msg_put(monc->m_subscribe); + ceph_msg_put(monc->m_subscribe_ack); + + kfree(monc->monmap); +} +EXPORT_SYMBOL(ceph_monc_stop); + +static void handle_auth_reply(struct ceph_mon_client *monc, + struct ceph_msg *msg) +{ + int ret; + int was_auth = 0; + + mutex_lock(&monc->mutex); + if (monc->auth->ops) + was_auth = monc->auth->ops->is_authenticated(monc->auth); + monc->pending_auth = 0; + ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, + msg->front.iov_len, + monc->m_auth->front.iov_base, + monc->m_auth->front_max); + if (ret < 0) { + monc->client->auth_err = ret; + wake_up_all(&monc->client->auth_wq); + } else if 
(ret > 0) { + __send_prepared_auth_request(monc, ret); + } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) { + dout("authenticated, starting session\n"); + + monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; + monc->client->msgr->inst.name.num = + cpu_to_le64(monc->auth->global_id); + + __send_subscribe(monc); + __resend_generic_request(monc); + } + mutex_unlock(&monc->mutex); +} + +static int __validate_auth(struct ceph_mon_client *monc) +{ + int ret; + + if (monc->pending_auth) + return 0; + + ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base, + monc->m_auth->front_max); + if (ret <= 0) + return ret; /* either an error, or no need to authenticate */ + __send_prepared_auth_request(monc, ret); + return 0; +} + +int ceph_monc_validate_auth(struct ceph_mon_client *monc) +{ + int ret; + + mutex_lock(&monc->mutex); + ret = __validate_auth(monc); + mutex_unlock(&monc->mutex); + return ret; +} +EXPORT_SYMBOL(ceph_monc_validate_auth); + +/* + * handle incoming message + */ +static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) +{ + struct ceph_mon_client *monc = con->private; + int type = le16_to_cpu(msg->hdr.type); + + if (!monc) + return; + + switch (type) { + case CEPH_MSG_AUTH_REPLY: + handle_auth_reply(monc, msg); + break; + + case CEPH_MSG_MON_SUBSCRIBE_ACK: + handle_subscribe_ack(monc, msg); + break; + + case CEPH_MSG_STATFS_REPLY: + handle_statfs_reply(monc, msg); + break; + + case CEPH_MSG_POOLOP_REPLY: + handle_poolop_reply(monc, msg); + break; + + case CEPH_MSG_MON_MAP: + ceph_monc_handle_map(monc, msg); + break; + + case CEPH_MSG_OSD_MAP: + ceph_osdc_handle_map(&monc->client->osdc, msg); + break; + + default: + /* can the chained handler handle it? */ + if (monc->client->extra_mon_dispatch && + monc->client->extra_mon_dispatch(monc->client, msg) == 0) + break; + + pr_err("received unknown message type %d %s\n", type, + ceph_msg_type_name(type)); + } + ceph_msg_put(msg); +} + +/* + * Allocate memory for incoming message + */ +static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip) +{ + struct ceph_mon_client *monc = con->private; + int type = le16_to_cpu(hdr->type); + int front_len = le32_to_cpu(hdr->front_len); + struct ceph_msg *m = NULL; + + *skip = 0; + + switch (type) { + case CEPH_MSG_MON_SUBSCRIBE_ACK: + m = ceph_msg_get(monc->m_subscribe_ack); + break; + case CEPH_MSG_POOLOP_REPLY: + case CEPH_MSG_STATFS_REPLY: + return get_generic_reply(con, hdr, skip); + case CEPH_MSG_AUTH_REPLY: + m = ceph_msg_get(monc->m_auth_reply); + break; + case CEPH_MSG_MON_MAP: + case CEPH_MSG_MDS_MAP: + case CEPH_MSG_OSD_MAP: + m = ceph_msg_new(type, front_len, GFP_NOFS); + break; + } + + if (!m) { + pr_info("alloc_msg unknown type %d\n", type); + *skip = 1; + } + return m; +} + +/* + * If the monitor connection resets, pick a new monitor and resubmit + * any pending requests. 
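+ *
+ * In sketch form, recovery after a fault looks like:
+ *
+ *	mon_fault():             __close_session(), hunting = true,
+ *	                         __open_session()  (random new monitor)
+ *	handle_auth_reply():     once authenticated, __send_subscribe() and
+ *	                         __resend_generic_request()
+ *	handle_subscribe_ack():  hunting = false, "session established"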
+ */ +static void mon_fault(struct ceph_connection *con) +{ + struct ceph_mon_client *monc = con->private; + + if (!monc) + return; + + dout("mon_fault\n"); + mutex_lock(&monc->mutex); + if (!con->private) + goto out; + + if (monc->con && !monc->hunting) + pr_info("mon%d %s session lost, " + "hunting for new mon\n", monc->cur_mon, + ceph_pr_addr(&monc->con->peer_addr.in_addr)); + + __close_session(monc); + if (!monc->hunting) { + /* start hunting */ + monc->hunting = true; + __open_session(monc); + } else { + /* already hunting, let's wait a bit */ + __schedule_delayed(monc); + } +out: + mutex_unlock(&monc->mutex); +} + +static const struct ceph_connection_operations mon_con_ops = { + .get = ceph_con_get, + .put = ceph_con_put, + .dispatch = dispatch, + .fault = mon_fault, + .alloc_msg = mon_alloc_msg, +}; diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c new file mode 100644 index 00000000..d5f2d97a --- /dev/null +++ b/net/ceph/msgpool.c @@ -0,0 +1,64 @@ +#include + +#include +#include +#include +#include + +#include + +static void *alloc_fn(gfp_t gfp_mask, void *arg) +{ + struct ceph_msgpool *pool = arg; + void *p; + + p = ceph_msg_new(0, pool->front_len, gfp_mask); + if (!p) + pr_err("msgpool %s alloc failed\n", pool->name); + return p; +} + +static void free_fn(void *element, void *arg) +{ + ceph_msg_put(element); +} + +int ceph_msgpool_init(struct ceph_msgpool *pool, + int front_len, int size, bool blocking, const char *name) +{ + pool->front_len = front_len; + pool->pool = mempool_create(size, alloc_fn, free_fn, pool); + if (!pool->pool) + return -ENOMEM; + pool->name = name; + return 0; +} + +void ceph_msgpool_destroy(struct ceph_msgpool *pool) +{ + mempool_destroy(pool->pool); +} + +struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, + int front_len) +{ + if (front_len > pool->front_len) { + pr_err("msgpool_get pool %s need front %d, pool size is %d\n", + pool->name, front_len, pool->front_len); + WARN_ON(1); + + /* try to alloc a fresh message */ + return ceph_msg_new(0, front_len, GFP_NOFS); + } + + return mempool_alloc(pool->pool, GFP_NOFS); +} + +void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg) +{ + /* reset msg front_len; user may have changed it */ + msg->front.iov_len = pool->front_len; + msg->hdr.front_len = cpu_to_le32(pool->front_len); + + kref_init(&msg->kref); /* retake single ref */ +} diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c new file mode 100644 index 00000000..7330c275 --- /dev/null +++ b/net/ceph/osd_client.c @@ -0,0 +1,2162 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_BLOCK +#include +#endif + +#include +#include +#include +#include +#include +#include + +#define OSD_OP_FRONT_LEN 4096 +#define OSD_OPREPLY_FRONT_LEN 512 + +static const struct ceph_connection_operations osd_con_ops; + +static void send_queued(struct ceph_osd_client *osdc); +static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd); +static void __register_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); +static void __unregister_linger_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); +static int __send_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); + +static int op_needs_trail(int op) +{ + switch (op) { + case CEPH_OSD_OP_GETXATTR: + case CEPH_OSD_OP_SETXATTR: + case CEPH_OSD_OP_CMPXATTR: + case CEPH_OSD_OP_CALL: + case CEPH_OSD_OP_NOTIFY: + return 1; + default: + return 0; + } +} + +static int op_has_extent(int 
op) +{ + return (op == CEPH_OSD_OP_READ || + op == CEPH_OSD_OP_WRITE); +} + +void ceph_calc_raw_layout(struct ceph_osd_client *osdc, + struct ceph_file_layout *layout, + u64 snapid, + u64 off, u64 *plen, u64 *bno, + struct ceph_osd_request *req, + struct ceph_osd_req_op *op) +{ + struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; + u64 orig_len = *plen; + u64 objoff, objlen; /* extent in object */ + + reqhead->snapid = cpu_to_le64(snapid); + + /* object extent? */ + ceph_calc_file_object_mapping(layout, off, plen, bno, + &objoff, &objlen); + if (*plen < orig_len) + dout(" skipping last %llu, final file extent %llu~%llu\n", + orig_len - *plen, off, *plen); + + if (op_has_extent(op->op)) { + op->extent.offset = objoff; + op->extent.length = objlen; + } + req->r_num_pages = calc_pages_for(off, *plen); + req->r_page_alignment = off & ~PAGE_MASK; + if (op->op == CEPH_OSD_OP_WRITE) + op->payload_len = *plen; + + dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", + *bno, objoff, objlen, req->r_num_pages); + +} +EXPORT_SYMBOL(ceph_calc_raw_layout); + +/* + * Implement client access to distributed object storage cluster. + * + * All data objects are stored within a cluster/cloud of OSDs, or + * "object storage devices." (Note that Ceph OSDs have _nothing_ to + * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply + * remote daemons serving up and coordinating consistent and safe + * access to storage. + * + * Cluster membership and the mapping of data objects onto storage devices + * are described by the osd map. + * + * We keep track of pending OSD requests (read, write), resubmit + * requests to different OSDs when the cluster topology/data layout + * change, or retry the affected requests when the communications + * channel with an OSD is reset. + */ + +/* + * calculate the mapping of a file extent onto an object, and fill out the + * request accordingly. shorten extent as necessary if it crosses an + * object boundary. + * + * fill osd op in request message. 
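+ *
+ * Worked example (hypothetical layout: 4 MB objects, stripe unit equal to
+ * the object size, stripe count 1): a 1 MB read at file offset 7 MB of
+ * inode 0x100 maps to object "100.00000001" (bno 1) at object offset 3 MB,
+ * length 1 MB.  A 2 MB read at the same offset is shortened to 1 MB so
+ * that it does not cross into object 2.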
+ */ +static void calc_layout(struct ceph_osd_client *osdc, + struct ceph_vino vino, + struct ceph_file_layout *layout, + u64 off, u64 *plen, + struct ceph_osd_request *req, + struct ceph_osd_req_op *op) +{ + u64 bno; + + ceph_calc_raw_layout(osdc, layout, vino.snap, off, + plen, &bno, req, op); + + snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); + req->r_oid_len = strlen(req->r_oid); +} + +/* + * requests + */ +void ceph_osdc_release_request(struct kref *kref) +{ + struct ceph_osd_request *req = container_of(kref, + struct ceph_osd_request, + r_kref); + + if (req->r_request) + ceph_msg_put(req->r_request); + if (req->r_reply) + ceph_msg_put(req->r_reply); + if (req->r_con_filling_msg) { + dout("release_request revoking pages %p from con %p\n", + req->r_pages, req->r_con_filling_msg); + ceph_con_revoke_message(req->r_con_filling_msg, + req->r_reply); + ceph_con_put(req->r_con_filling_msg); + } + if (req->r_own_pages) + ceph_release_page_vector(req->r_pages, + req->r_num_pages); +#ifdef CONFIG_BLOCK + if (req->r_bio) + bio_put(req->r_bio); +#endif + ceph_put_snap_context(req->r_snapc); + if (req->r_trail) { + ceph_pagelist_release(req->r_trail); + kfree(req->r_trail); + } + if (req->r_mempool) + mempool_free(req, req->r_osdc->req_mempool); + else + kfree(req); +} +EXPORT_SYMBOL(ceph_osdc_release_request); + +static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail) +{ + int i = 0; + + if (needs_trail) + *needs_trail = 0; + while (ops[i].op) { + if (needs_trail && op_needs_trail(ops[i].op)) + *needs_trail = 1; + i++; + } + + return i; +} + +struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, + int flags, + struct ceph_snap_context *snapc, + struct ceph_osd_req_op *ops, + bool use_mempool, + gfp_t gfp_flags, + struct page **pages, + struct bio *bio) +{ + struct ceph_osd_request *req; + struct ceph_msg *msg; + int needs_trail; + int num_op = get_num_ops(ops, &needs_trail); + size_t msg_size = sizeof(struct ceph_osd_request_head); + + msg_size += num_op*sizeof(struct ceph_osd_op); + + if (use_mempool) { + req = mempool_alloc(osdc->req_mempool, gfp_flags); + memset(req, 0, sizeof(*req)); + } else { + req = kzalloc(sizeof(*req), gfp_flags); + } + if (req == NULL) + return NULL; + + req->r_osdc = osdc; + req->r_mempool = use_mempool; + + kref_init(&req->r_kref); + init_completion(&req->r_completion); + init_completion(&req->r_safe_completion); + INIT_LIST_HEAD(&req->r_unsafe_item); + INIT_LIST_HEAD(&req->r_linger_item); + INIT_LIST_HEAD(&req->r_linger_osd); + req->r_flags = flags; + + WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); + + /* create reply message */ + if (use_mempool) + msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); + else + msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, + OSD_OPREPLY_FRONT_LEN, gfp_flags); + if (!msg) { + ceph_osdc_put_request(req); + return NULL; + } + req->r_reply = msg; + + /* allocate space for the trailing data */ + if (needs_trail) { + req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags); + if (!req->r_trail) { + ceph_osdc_put_request(req); + return NULL; + } + ceph_pagelist_init(req->r_trail); + } + /* create request message; allow space for oid */ + msg_size += 40; + if (snapc) + msg_size += sizeof(u64) * snapc->num_snaps; + if (use_mempool) + msg = ceph_msgpool_get(&osdc->msgpool_op, 0); + else + msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags); + if (!msg) { + ceph_osdc_put_request(req); + return NULL; + } + + msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); + 
memset(msg->front.iov_base, 0, msg->front.iov_len); + + req->r_request = msg; + req->r_pages = pages; +#ifdef CONFIG_BLOCK + if (bio) { + req->r_bio = bio; + bio_get(req->r_bio); + } +#endif + + return req; +} +EXPORT_SYMBOL(ceph_osdc_alloc_request); + +static void osd_req_encode_op(struct ceph_osd_request *req, + struct ceph_osd_op *dst, + struct ceph_osd_req_op *src) +{ + dst->op = cpu_to_le16(src->op); + + switch (dst->op) { + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_WRITE: + dst->extent.offset = + cpu_to_le64(src->extent.offset); + dst->extent.length = + cpu_to_le64(src->extent.length); + dst->extent.truncate_size = + cpu_to_le64(src->extent.truncate_size); + dst->extent.truncate_seq = + cpu_to_le32(src->extent.truncate_seq); + break; + + case CEPH_OSD_OP_GETXATTR: + case CEPH_OSD_OP_SETXATTR: + case CEPH_OSD_OP_CMPXATTR: + BUG_ON(!req->r_trail); + + dst->xattr.name_len = cpu_to_le32(src->xattr.name_len); + dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); + dst->xattr.cmp_op = src->xattr.cmp_op; + dst->xattr.cmp_mode = src->xattr.cmp_mode; + ceph_pagelist_append(req->r_trail, src->xattr.name, + src->xattr.name_len); + ceph_pagelist_append(req->r_trail, src->xattr.val, + src->xattr.value_len); + break; + case CEPH_OSD_OP_CALL: + BUG_ON(!req->r_trail); + + dst->cls.class_len = src->cls.class_len; + dst->cls.method_len = src->cls.method_len; + dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); + + ceph_pagelist_append(req->r_trail, src->cls.class_name, + src->cls.class_len); + ceph_pagelist_append(req->r_trail, src->cls.method_name, + src->cls.method_len); + ceph_pagelist_append(req->r_trail, src->cls.indata, + src->cls.indata_len); + break; + case CEPH_OSD_OP_ROLLBACK: + dst->snap.snapid = cpu_to_le64(src->snap.snapid); + break; + case CEPH_OSD_OP_STARTSYNC: + break; + case CEPH_OSD_OP_NOTIFY: + { + __le32 prot_ver = cpu_to_le32(src->watch.prot_ver); + __le32 timeout = cpu_to_le32(src->watch.timeout); + + BUG_ON(!req->r_trail); + + ceph_pagelist_append(req->r_trail, + &prot_ver, sizeof(prot_ver)); + ceph_pagelist_append(req->r_trail, + &timeout, sizeof(timeout)); + } + case CEPH_OSD_OP_NOTIFY_ACK: + case CEPH_OSD_OP_WATCH: + dst->watch.cookie = cpu_to_le64(src->watch.cookie); + dst->watch.ver = cpu_to_le64(src->watch.ver); + dst->watch.flag = src->watch.flag; + break; + default: + pr_err("unrecognized osd opcode %d\n", dst->op); + WARN_ON(1); + break; + } + dst->payload_len = cpu_to_le32(src->payload_len); +} + +/* + * build new request AND message + * + */ +void ceph_osdc_build_request(struct ceph_osd_request *req, + u64 off, u64 *plen, + struct ceph_osd_req_op *src_ops, + struct ceph_snap_context *snapc, + struct timespec *mtime, + const char *oid, + int oid_len) +{ + struct ceph_msg *msg = req->r_request; + struct ceph_osd_request_head *head; + struct ceph_osd_req_op *src_op; + struct ceph_osd_op *op; + void *p; + int num_op = get_num_ops(src_ops, NULL); + size_t msg_size = sizeof(*head) + num_op*sizeof(*op); + int flags = req->r_flags; + u64 data_len = 0; + int i; + + head = msg->front.iov_base; + op = (void *)(head + 1); + p = (void *)(op + num_op); + + req->r_snapc = ceph_get_snap_context(snapc); + + head->client_inc = cpu_to_le32(1); /* always, for now. 
*/ + head->flags = cpu_to_le32(flags); + if (flags & CEPH_OSD_FLAG_WRITE) + ceph_encode_timespec(&head->mtime, mtime); + head->num_ops = cpu_to_le16(num_op); + + + /* fill in oid */ + head->object_len = cpu_to_le32(oid_len); + memcpy(p, oid, oid_len); + p += oid_len; + + src_op = src_ops; + while (src_op->op) { + osd_req_encode_op(req, op, src_op); + src_op++; + op++; + } + + if (req->r_trail) + data_len += req->r_trail->length; + + if (snapc) { + head->snap_seq = cpu_to_le64(snapc->seq); + head->num_snaps = cpu_to_le32(snapc->num_snaps); + for (i = 0; i < snapc->num_snaps; i++) { + put_unaligned_le64(snapc->snaps[i], p); + p += sizeof(u64); + } + } + + if (flags & CEPH_OSD_FLAG_WRITE) { + req->r_request->hdr.data_off = cpu_to_le16(off); + req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len); + } else if (data_len) { + req->r_request->hdr.data_off = 0; + req->r_request->hdr.data_len = cpu_to_le32(data_len); + } + + req->r_request->page_alignment = req->r_page_alignment; + + BUG_ON(p > msg->front.iov_base + msg->front.iov_len); + msg_size = p - msg->front.iov_base; + msg->front.iov_len = msg_size; + msg->hdr.front_len = cpu_to_le32(msg_size); + return; +} +EXPORT_SYMBOL(ceph_osdc_build_request); + +/* + * build new request AND message, calculate layout, and adjust file + * extent as needed. + * + * if the file was recently truncated, we include information about its + * old and new size so that the object can be updated appropriately. (we + * avoid synchronously deleting truncated objects because it's slow.) + * + * if @do_sync, include a 'startsync' command so that the osd will flush + * data quickly. + */ +struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, + struct ceph_file_layout *layout, + struct ceph_vino vino, + u64 off, u64 *plen, + int opcode, int flags, + struct ceph_snap_context *snapc, + int do_sync, + u32 truncate_seq, + u64 truncate_size, + struct timespec *mtime, + bool use_mempool, int num_reply, + int page_align) +{ + struct ceph_osd_req_op ops[3]; + struct ceph_osd_request *req; + + ops[0].op = opcode; + ops[0].extent.truncate_seq = truncate_seq; + ops[0].extent.truncate_size = truncate_size; + ops[0].payload_len = 0; + + if (do_sync) { + ops[1].op = CEPH_OSD_OP_STARTSYNC; + ops[1].payload_len = 0; + ops[2].op = 0; + } else + ops[1].op = 0; + + req = ceph_osdc_alloc_request(osdc, flags, + snapc, ops, + use_mempool, + GFP_NOFS, NULL, NULL); + if (!req) + return NULL; + + /* calculate max write size */ + calc_layout(osdc, vino, layout, off, plen, req, ops); + req->r_file_layout = *layout; /* keep a copy */ + + /* in case it differs from natural (file) alignment that + calc_layout filled in for us */ + req->r_num_pages = calc_pages_for(page_align, *plen); + req->r_page_alignment = page_align; + + ceph_osdc_build_request(req, off, plen, ops, + snapc, + mtime, + req->r_oid, req->r_oid_len); + + return req; +} +EXPORT_SYMBOL(ceph_osdc_new_request); + +/* + * We keep osd requests in an rbtree, sorted by ->r_tid. 
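+ *
+ * Tids are taken from ++osdc->last_tid in __register_request(), so an
+ * in-order walk of this tree (rb_first()/rb_next(), as in kick_requests())
+ * visits requests in submission order, e.g. tids 1, 2, 3, ...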
+ */ +static void __insert_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *new) +{ + struct rb_node **p = &osdc->requests.rb_node; + struct rb_node *parent = NULL; + struct ceph_osd_request *req = NULL; + + while (*p) { + parent = *p; + req = rb_entry(parent, struct ceph_osd_request, r_node); + if (new->r_tid < req->r_tid) + p = &(*p)->rb_left; + else if (new->r_tid > req->r_tid) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->r_node, parent, p); + rb_insert_color(&new->r_node, &osdc->requests); +} + +static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc, + u64 tid) +{ + struct ceph_osd_request *req; + struct rb_node *n = osdc->requests.rb_node; + + while (n) { + req = rb_entry(n, struct ceph_osd_request, r_node); + if (tid < req->r_tid) + n = n->rb_left; + else if (tid > req->r_tid) + n = n->rb_right; + else + return req; + } + return NULL; +} + +static struct ceph_osd_request * +__lookup_request_ge(struct ceph_osd_client *osdc, + u64 tid) +{ + struct ceph_osd_request *req; + struct rb_node *n = osdc->requests.rb_node; + + while (n) { + req = rb_entry(n, struct ceph_osd_request, r_node); + if (tid < req->r_tid) { + if (!n->rb_left) + return req; + n = n->rb_left; + } else if (tid > req->r_tid) { + n = n->rb_right; + } else { + return req; + } + } + return NULL; +} + +/* + * Resubmit requests pending on the given osd. + */ +static void __kick_osd_requests(struct ceph_osd_client *osdc, + struct ceph_osd *osd) +{ + struct ceph_osd_request *req, *nreq; + int err; + + dout("__kick_osd_requests osd%d\n", osd->o_osd); + err = __reset_osd(osdc, osd); + if (err == -EAGAIN) + return; + + list_for_each_entry(req, &osd->o_requests, r_osd_item) { + list_move(&req->r_req_lru_item, &osdc->req_unsent); + dout("requeued %p tid %llu osd%d\n", req, req->r_tid, + osd->o_osd); + if (!req->r_linger) + req->r_flags |= CEPH_OSD_FLAG_RETRY; + } + + list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, + r_linger_osd) { + /* + * reregister request prior to unregistering linger so + * that r_osd is preserved. + */ + BUG_ON(!list_empty(&req->r_req_lru_item)); + __register_request(osdc, req); + list_add(&req->r_req_lru_item, &osdc->req_unsent); + list_add(&req->r_osd_item, &req->r_osd->o_requests); + __unregister_linger_request(osdc, req); + dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid, + osd->o_osd); + } +} + +static void kick_osd_requests(struct ceph_osd_client *osdc, + struct ceph_osd *kickosd) +{ + mutex_lock(&osdc->request_mutex); + __kick_osd_requests(osdc, kickosd); + mutex_unlock(&osdc->request_mutex); +} + +/* + * If the osd connection drops, we need to resubmit all requests. + */ +static void osd_reset(struct ceph_connection *con) +{ + struct ceph_osd *osd = con->private; + struct ceph_osd_client *osdc; + + if (!osd) + return; + dout("osd_reset osd%d\n", osd->o_osd); + osdc = osd->o_osdc; + down_read(&osdc->map_sem); + kick_osd_requests(osdc, osd); + send_queued(osdc); + up_read(&osdc->map_sem); +} + +/* + * Track open sessions with osds. 
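+ *
+ * Each ceph_osd is refcounted: create_osd() starts o_ref at 1 (the
+ * reference effectively held by the osdc->osds tree and dropped in
+ * __remove_osd()), get_osd() uses atomic_inc_not_zero() so a session that
+ * is already being torn down is not revived, and the final put_osd()
+ * destroys the authorizer and frees the struct.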
+ */ +static struct ceph_osd *create_osd(struct ceph_osd_client *osdc) +{ + struct ceph_osd *osd; + + osd = kzalloc(sizeof(*osd), GFP_NOFS); + if (!osd) + return NULL; + + atomic_set(&osd->o_ref, 1); + osd->o_osdc = osdc; + INIT_LIST_HEAD(&osd->o_requests); + INIT_LIST_HEAD(&osd->o_linger_requests); + INIT_LIST_HEAD(&osd->o_osd_lru); + osd->o_incarnation = 1; + + ceph_con_init(osdc->client->msgr, &osd->o_con); + osd->o_con.private = osd; + osd->o_con.ops = &osd_con_ops; + osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD; + + INIT_LIST_HEAD(&osd->o_keepalive_item); + return osd; +} + +static struct ceph_osd *get_osd(struct ceph_osd *osd) +{ + if (atomic_inc_not_zero(&osd->o_ref)) { + dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1, + atomic_read(&osd->o_ref)); + return osd; + } else { + dout("get_osd %p FAIL\n", osd); + return NULL; + } +} + +static void put_osd(struct ceph_osd *osd) +{ + dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), + atomic_read(&osd->o_ref) - 1); + if (atomic_dec_and_test(&osd->o_ref)) { + struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; + + if (osd->o_authorizer) + ac->ops->destroy_authorizer(ac, osd->o_authorizer); + kfree(osd); + } +} + +/* + * remove an osd from our map + */ +static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) +{ + dout("__remove_osd %p\n", osd); + BUG_ON(!list_empty(&osd->o_requests)); + rb_erase(&osd->o_node, &osdc->osds); + list_del_init(&osd->o_osd_lru); + ceph_con_close(&osd->o_con); + put_osd(osd); +} + +static void __move_osd_to_lru(struct ceph_osd_client *osdc, + struct ceph_osd *osd) +{ + dout("__move_osd_to_lru %p\n", osd); + BUG_ON(!list_empty(&osd->o_osd_lru)); + list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); + osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ; +} + +static void __remove_osd_from_lru(struct ceph_osd *osd) +{ + dout("__remove_osd_from_lru %p\n", osd); + if (!list_empty(&osd->o_osd_lru)) + list_del_init(&osd->o_osd_lru); +} + +static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all) +{ + struct ceph_osd *osd, *nosd; + + dout("__remove_old_osds %p\n", osdc); + mutex_lock(&osdc->request_mutex); + list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { + if (!remove_all && time_before(jiffies, osd->lru_ttl)) + break; + __remove_osd(osdc, osd); + } + mutex_unlock(&osdc->request_mutex); +} + +/* + * reset osd connect + */ +static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) +{ + struct ceph_osd_request *req; + int ret = 0; + + dout("__reset_osd %p osd%d\n", osd, osd->o_osd); + if (list_empty(&osd->o_requests) && + list_empty(&osd->o_linger_requests)) { + __remove_osd(osdc, osd); + } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd], + &osd->o_con.peer_addr, + sizeof(osd->o_con.peer_addr)) == 0 && + !ceph_con_opened(&osd->o_con)) { + dout(" osd addr hasn't changed and connection never opened," + " letting msgr retry"); + /* touch each r_stamp for handle_timeout()'s benfit */ + list_for_each_entry(req, &osd->o_requests, r_osd_item) + req->r_stamp = jiffies; + ret = -EAGAIN; + } else { + ceph_con_close(&osd->o_con); + ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]); + osd->o_incarnation++; + } + return ret; +} + +static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) +{ + struct rb_node **p = &osdc->osds.rb_node; + struct rb_node *parent = NULL; + struct ceph_osd *osd = NULL; + + while (*p) { + parent = *p; + osd = rb_entry(parent, struct ceph_osd, o_node); + if 
(new->o_osd < osd->o_osd) + p = &(*p)->rb_left; + else if (new->o_osd > osd->o_osd) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->o_node, parent, p); + rb_insert_color(&new->o_node, &osdc->osds); +} + +static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o) +{ + struct ceph_osd *osd; + struct rb_node *n = osdc->osds.rb_node; + + while (n) { + osd = rb_entry(n, struct ceph_osd, o_node); + if (o < osd->o_osd) + n = n->rb_left; + else if (o > osd->o_osd) + n = n->rb_right; + else + return osd; + } + return NULL; +} + +static void __schedule_osd_timeout(struct ceph_osd_client *osdc) +{ + schedule_delayed_work(&osdc->timeout_work, + osdc->client->options->osd_keepalive_timeout * HZ); +} + +static void __cancel_osd_timeout(struct ceph_osd_client *osdc) +{ + cancel_delayed_work(&osdc->timeout_work); +} + +/* + * Register request, assign tid. If this is the first request, set up + * the timeout event. + */ +static void __register_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + req->r_tid = ++osdc->last_tid; + req->r_request->hdr.tid = cpu_to_le64(req->r_tid); + INIT_LIST_HEAD(&req->r_req_lru_item); + + dout("__register_request %p tid %lld\n", req, req->r_tid); + __insert_request(osdc, req); + ceph_osdc_get_request(req); + osdc->num_requests++; + + if (osdc->num_requests == 1) { + dout(" first request, scheduling timeout\n"); + __schedule_osd_timeout(osdc); + } +} + +static void register_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + mutex_lock(&osdc->request_mutex); + __register_request(osdc, req); + mutex_unlock(&osdc->request_mutex); +} + +/* + * called under osdc->request_mutex + */ +static void __unregister_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + dout("__unregister_request %p tid %lld\n", req, req->r_tid); + rb_erase(&req->r_node, &osdc->requests); + osdc->num_requests--; + + if (req->r_osd) { + /* make sure the original request isn't in flight. 
*/ + ceph_con_revoke(&req->r_osd->o_con, req->r_request); + + list_del_init(&req->r_osd_item); + if (list_empty(&req->r_osd->o_requests) && + list_empty(&req->r_osd->o_linger_requests)) { + dout("moving osd to %p lru\n", req->r_osd); + __move_osd_to_lru(osdc, req->r_osd); + } + if (list_empty(&req->r_linger_item)) + req->r_osd = NULL; + } + + ceph_osdc_put_request(req); + + list_del_init(&req->r_req_lru_item); + if (osdc->num_requests == 0) { + dout(" no requests, canceling timeout\n"); + __cancel_osd_timeout(osdc); + } +} + +/* + * Cancel a previously queued request message + */ +static void __cancel_request(struct ceph_osd_request *req) +{ + if (req->r_sent && req->r_osd) { + ceph_con_revoke(&req->r_osd->o_con, req->r_request); + req->r_sent = 0; + } +} + +static void __register_linger_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + dout("__register_linger_request %p\n", req); + list_add_tail(&req->r_linger_item, &osdc->req_linger); + list_add_tail(&req->r_linger_osd, &req->r_osd->o_linger_requests); +} + +static void __unregister_linger_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + dout("__unregister_linger_request %p\n", req); + if (req->r_osd) { + list_del_init(&req->r_linger_item); + list_del_init(&req->r_linger_osd); + + if (list_empty(&req->r_osd->o_requests) && + list_empty(&req->r_osd->o_linger_requests)) { + dout("moving osd to %p lru\n", req->r_osd); + __move_osd_to_lru(osdc, req->r_osd); + } + if (list_empty(&req->r_osd_item)) + req->r_osd = NULL; + } +} + +void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + mutex_lock(&osdc->request_mutex); + if (req->r_linger) { + __unregister_linger_request(osdc, req); + ceph_osdc_put_request(req); + } + mutex_unlock(&osdc->request_mutex); +} +EXPORT_SYMBOL(ceph_osdc_unregister_linger_request); + +void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + if (!req->r_linger) { + dout("set_request_linger %p\n", req); + req->r_linger = 1; + /* + * caller is now responsible for calling + * unregister_linger_request + */ + ceph_osdc_get_request(req); + } +} +EXPORT_SYMBOL(ceph_osdc_set_request_linger); + +/* + * Pick an osd (the first 'up' osd in the pg), allocate the osd struct + * (as needed), and set the request r_osd appropriately. If there is + * no up osd, set r_osd to NULL. Move the request to the appropriate list + * (unsent, homeless) or leave on in-flight lru. + * + * Return 0 if unchanged, 1 if changed, or negative on error. + * + * Caller should hold map_sem for read and request_mutex. 
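+ *
+ * A typical caller therefore looks roughly like (compare kick_requests()):
+ *
+ *	err = __map_request(osdc, req);
+ *	if (err < 0)
+ *		...      (left on req_notarget; retried on the next osdmap)
+ *	else if (req->r_osd == NULL)
+ *		...      (no up osd: ask the monitors for a newer osdmap)
+ *	else if (err > 0)
+ *		...      (mapping changed: req is now on req_unsent, resend it)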
+ */ +static int __map_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; + struct ceph_pg pgid; + int acting[CEPH_PG_MAX_SIZE]; + int o = -1, num = 0; + int err; + + dout("map_request %p tid %lld\n", req, req->r_tid); + err = ceph_calc_object_layout(&reqhead->layout, req->r_oid, + &req->r_file_layout, osdc->osdmap); + if (err) { + list_move(&req->r_req_lru_item, &osdc->req_notarget); + return err; + } + pgid = reqhead->layout.ol_pgid; + req->r_pgid = pgid; + + err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting); + if (err > 0) { + o = acting[0]; + num = err; + } + + if ((req->r_osd && req->r_osd->o_osd == o && + req->r_sent >= req->r_osd->o_incarnation && + req->r_num_pg_osds == num && + memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || + (req->r_osd == NULL && o == -1)) + return 0; /* no change */ + + dout("map_request tid %llu pgid %d.%x osd%d (was osd%d)\n", + req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o, + req->r_osd ? req->r_osd->o_osd : -1); + + /* record full pg acting set */ + memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num); + req->r_num_pg_osds = num; + + if (req->r_osd) { + __cancel_request(req); + list_del_init(&req->r_osd_item); + req->r_osd = NULL; + } + + req->r_osd = __lookup_osd(osdc, o); + if (!req->r_osd && o >= 0) { + err = -ENOMEM; + req->r_osd = create_osd(osdc); + if (!req->r_osd) { + list_move(&req->r_req_lru_item, &osdc->req_notarget); + goto out; + } + + dout("map_request osd %p is osd%d\n", req->r_osd, o); + req->r_osd->o_osd = o; + req->r_osd->o_con.peer_name.num = cpu_to_le64(o); + __insert_osd(osdc, req->r_osd); + + ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]); + } + + if (req->r_osd) { + __remove_osd_from_lru(req->r_osd); + list_add(&req->r_osd_item, &req->r_osd->o_requests); + list_move(&req->r_req_lru_item, &osdc->req_unsent); + } else { + list_move(&req->r_req_lru_item, &osdc->req_notarget); + } + err = 1; /* osd or pg changed */ + +out: + return err; +} + +/* + * caller should hold map_sem (for read) and request_mutex + */ +static int __send_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + struct ceph_osd_request_head *reqhead; + + dout("send_request %p tid %llu to osd%d flags %d\n", + req, req->r_tid, req->r_osd->o_osd, req->r_flags); + + reqhead = req->r_request->front.iov_base; + reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch); + reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */ + reqhead->reassert_version = req->r_reassert_version; + + req->r_stamp = jiffies; + list_move_tail(&req->r_req_lru_item, &osdc->req_lru); + + ceph_msg_get(req->r_request); /* send consumes a ref */ + ceph_con_send(&req->r_osd->o_con, req->r_request); + req->r_sent = req->r_osd->o_incarnation; + return 0; +} + +/* + * Send any requests in the queue (req_unsent). + */ +static void send_queued(struct ceph_osd_client *osdc) +{ + struct ceph_osd_request *req, *tmp; + + dout("send_queued\n"); + mutex_lock(&osdc->request_mutex); + list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) { + __send_request(osdc, req); + } + mutex_unlock(&osdc->request_mutex); +} + +/* + * Timeout callback, called every N seconds when 1 or more osd + * requests has been active for more than N seconds. When this + * happens, we ping all OSDs with requests who have timed out to + * ensure any communications channel reset is detected. 
Reset the + * request timeouts another N seconds in the future as we go. + * Reschedule the timeout event another N seconds in future (unless + * there are no open requests). + */ +static void handle_timeout(struct work_struct *work) +{ + struct ceph_osd_client *osdc = + container_of(work, struct ceph_osd_client, timeout_work.work); + struct ceph_osd_request *req, *last_req = NULL; + struct ceph_osd *osd; + unsigned long timeout = osdc->client->options->osd_timeout * HZ; + unsigned long keepalive = + osdc->client->options->osd_keepalive_timeout * HZ; + unsigned long last_stamp = 0; + struct list_head slow_osds; + dout("timeout\n"); + down_read(&osdc->map_sem); + + ceph_monc_request_next_osdmap(&osdc->client->monc); + + mutex_lock(&osdc->request_mutex); + + /* + * reset osds that appear to be _really_ unresponsive. this + * is a failsafe measure.. we really shouldn't be getting to + * this point if the system is working properly. the monitors + * should mark the osd as failed and we should find out about + * it from an updated osd map. + */ + while (timeout && !list_empty(&osdc->req_lru)) { + req = list_entry(osdc->req_lru.next, struct ceph_osd_request, + r_req_lru_item); + + if (time_before(jiffies, req->r_stamp + timeout)) + break; + + BUG_ON(req == last_req && req->r_stamp == last_stamp); + last_req = req; + last_stamp = req->r_stamp; + + osd = req->r_osd; + BUG_ON(!osd); + pr_warning(" tid %llu timed out on osd%d, will reset osd\n", + req->r_tid, osd->o_osd); + __kick_osd_requests(osdc, osd); + } + + /* + * ping osds that are a bit slow. this ensures that if there + * is a break in the TCP connection we will notice, and reopen + * a connection with that osd (from the fault callback). + */ + INIT_LIST_HEAD(&slow_osds); + list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) { + if (time_before(jiffies, req->r_stamp + keepalive)) + break; + + osd = req->r_osd; + BUG_ON(!osd); + dout(" tid %llu is slow, will send keepalive on osd%d\n", + req->r_tid, osd->o_osd); + list_move_tail(&osd->o_keepalive_item, &slow_osds); + } + while (!list_empty(&slow_osds)) { + osd = list_entry(slow_osds.next, struct ceph_osd, + o_keepalive_item); + list_del_init(&osd->o_keepalive_item); + ceph_con_keepalive(&osd->o_con); + } + + __schedule_osd_timeout(osdc); + mutex_unlock(&osdc->request_mutex); + send_queued(osdc); + up_read(&osdc->map_sem); +} + +static void handle_osds_timeout(struct work_struct *work) +{ + struct ceph_osd_client *osdc = + container_of(work, struct ceph_osd_client, + osds_timeout_work.work); + unsigned long delay = + osdc->client->options->osd_idle_ttl * HZ >> 2; + + dout("osds timeout\n"); + down_read(&osdc->map_sem); + remove_old_osds(osdc, 0); + up_read(&osdc->map_sem); + + schedule_delayed_work(&osdc->osds_timeout_work, + round_jiffies_relative(delay)); +} + +static void complete_request(struct ceph_osd_request *req) +{ + if (req->r_safe_callback) + req->r_safe_callback(req, NULL); + complete_all(&req->r_safe_completion); /* fsync waiter */ +} + +/* + * handle osd op reply. either call the callback if it is specified, + * or do the completion to wake up the waiting thread. 
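+ *
+ * From the submitter's point of view the two completion styles look like
+ * (sketch; hypothetical callback name, error handling omitted):
+ *
+ *	async:	req->r_callback = my_done_fn;
+ *		... start request ...     (handle_reply() invokes the callback)
+ *	sync:	... start request ...
+ *		wait_for_completion(&req->r_completion);
+ *		err = req->r_result;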
+ */ +static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, + struct ceph_connection *con) +{ + struct ceph_osd_reply_head *rhead = msg->front.iov_base; + struct ceph_osd_request *req; + u64 tid; + int numops, object_len, flags; + s32 result; + + tid = le64_to_cpu(msg->hdr.tid); + if (msg->front.iov_len < sizeof(*rhead)) + goto bad; + numops = le32_to_cpu(rhead->num_ops); + object_len = le32_to_cpu(rhead->object_len); + result = le32_to_cpu(rhead->result); + if (msg->front.iov_len != sizeof(*rhead) + object_len + + numops * sizeof(struct ceph_osd_op)) + goto bad; + dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result); + /* lookup */ + mutex_lock(&osdc->request_mutex); + req = __lookup_request(osdc, tid); + if (req == NULL) { + dout("handle_reply tid %llu dne\n", tid); + mutex_unlock(&osdc->request_mutex); + return; + } + ceph_osdc_get_request(req); + flags = le32_to_cpu(rhead->flags); + + /* + * if this connection filled our message, drop our reference now, to + * avoid a (safe but slower) revoke later. + */ + if (req->r_con_filling_msg == con && req->r_reply == msg) { + dout(" dropping con_filling_msg ref %p\n", con); + req->r_con_filling_msg = NULL; + ceph_con_put(con); + } + + if (!req->r_got_reply) { + unsigned bytes; + + req->r_result = le32_to_cpu(rhead->result); + bytes = le32_to_cpu(msg->hdr.data_len); + dout("handle_reply result %d bytes %d\n", req->r_result, + bytes); + if (req->r_result == 0) + req->r_result = bytes; + + /* in case this is a write and we need to replay, */ + req->r_reassert_version = rhead->reassert_version; + + req->r_got_reply = 1; + } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { + dout("handle_reply tid %llu dup ack\n", tid); + mutex_unlock(&osdc->request_mutex); + goto done; + } + + dout("handle_reply tid %llu flags %d\n", tid, flags); + + if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK)) + __register_linger_request(osdc, req); + + /* either this is a read, or we got the safe response */ + if (result < 0 || + (flags & CEPH_OSD_FLAG_ONDISK) || + ((flags & CEPH_OSD_FLAG_WRITE) == 0)) + __unregister_request(osdc, req); + + mutex_unlock(&osdc->request_mutex); + + if (req->r_callback) + req->r_callback(req, msg); + else + complete_all(&req->r_completion); + + if (flags & CEPH_OSD_FLAG_ONDISK) + complete_request(req); + +done: + dout("req=%p req->r_linger=%d\n", req, req->r_linger); + ceph_osdc_put_request(req); + return; + +bad: + pr_err("corrupt osd_op_reply got %d %d expected %d\n", + (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len), + (int)sizeof(*rhead)); + ceph_msg_dump(msg); +} + +static void reset_changed_osds(struct ceph_osd_client *osdc) +{ + struct rb_node *p, *n; + + for (p = rb_first(&osdc->osds); p; p = n) { + struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); + + n = rb_next(p); + if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || + memcmp(&osd->o_con.peer_addr, + ceph_osd_addr(osdc->osdmap, + osd->o_osd), + sizeof(struct ceph_entity_addr)) != 0) + __reset_osd(osdc, osd); + } +} + +/* + * Requeue requests whose mapping to an OSD has changed. If requests map to + * no osd, request a new map. + * + * Caller should hold map_sem for read and request_mutex. 
+ */ +static void kick_requests(struct ceph_osd_client *osdc) +{ + struct ceph_osd_request *req, *nreq; + struct rb_node *p; + int needmap = 0; + int err; + + dout("kick_requests\n"); + mutex_lock(&osdc->request_mutex); + for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { + req = rb_entry(p, struct ceph_osd_request, r_node); + err = __map_request(osdc, req); + if (err < 0) + continue; /* error */ + if (req->r_osd == NULL) { + dout("%p tid %llu maps to no osd\n", req, req->r_tid); + needmap++; /* request a newer map */ + } else if (err > 0) { + dout("%p tid %llu requeued on osd%d\n", req, req->r_tid, + req->r_osd ? req->r_osd->o_osd : -1); + if (!req->r_linger) + req->r_flags |= CEPH_OSD_FLAG_RETRY; + } + } + + list_for_each_entry_safe(req, nreq, &osdc->req_linger, + r_linger_item) { + dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); + + err = __map_request(osdc, req); + if (err == 0) + continue; /* no change and no osd was specified */ + if (err < 0) + continue; /* hrm! */ + if (req->r_osd == NULL) { + dout("tid %llu maps to no valid osd\n", req->r_tid); + needmap++; /* request a newer map */ + continue; + } + + dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid, + req->r_osd ? req->r_osd->o_osd : -1); + __unregister_linger_request(osdc, req); + __register_request(osdc, req); + } + mutex_unlock(&osdc->request_mutex); + + if (needmap) { + dout("%d requests for down osds, need new map\n", needmap); + ceph_monc_request_next_osdmap(&osdc->client->monc); + } +} + + +/* + * Process updated osd map. + * + * The message contains any number of incremental and full maps, normally + * indicating some sort of topology change in the cluster. Kick requests + * off to different OSDs as needed. + */ +void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) +{ + void *p, *end, *next; + u32 nr_maps, maplen; + u32 epoch; + struct ceph_osdmap *newmap = NULL, *oldmap; + int err; + struct ceph_fsid fsid; + + dout("handle_map have %u\n", osdc->osdmap ? 
osdc->osdmap->epoch : 0); + p = msg->front.iov_base; + end = p + msg->front.iov_len; + + /* verify fsid */ + ceph_decode_need(&p, end, sizeof(fsid), bad); + ceph_decode_copy(&p, &fsid, sizeof(fsid)); + if (ceph_check_fsid(osdc->client, &fsid) < 0) + return; + + down_write(&osdc->map_sem); + + /* incremental maps */ + ceph_decode_32_safe(&p, end, nr_maps, bad); + dout(" %d inc maps\n", nr_maps); + while (nr_maps > 0) { + ceph_decode_need(&p, end, 2*sizeof(u32), bad); + epoch = ceph_decode_32(&p); + maplen = ceph_decode_32(&p); + ceph_decode_need(&p, end, maplen, bad); + next = p + maplen; + if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) { + dout("applying incremental map %u len %d\n", + epoch, maplen); + newmap = osdmap_apply_incremental(&p, next, + osdc->osdmap, + osdc->client->msgr); + if (IS_ERR(newmap)) { + err = PTR_ERR(newmap); + goto bad; + } + BUG_ON(!newmap); + if (newmap != osdc->osdmap) { + ceph_osdmap_destroy(osdc->osdmap); + osdc->osdmap = newmap; + } + kick_requests(osdc); + reset_changed_osds(osdc); + } else { + dout("ignoring incremental map %u len %d\n", + epoch, maplen); + } + p = next; + nr_maps--; + } + if (newmap) + goto done; + + /* full maps */ + ceph_decode_32_safe(&p, end, nr_maps, bad); + dout(" %d full maps\n", nr_maps); + while (nr_maps) { + ceph_decode_need(&p, end, 2*sizeof(u32), bad); + epoch = ceph_decode_32(&p); + maplen = ceph_decode_32(&p); + ceph_decode_need(&p, end, maplen, bad); + if (nr_maps > 1) { + dout("skipping non-latest full map %u len %d\n", + epoch, maplen); + } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) { + dout("skipping full map %u len %d, " + "older than our %u\n", epoch, maplen, + osdc->osdmap->epoch); + } else { + dout("taking full map %u len %d\n", epoch, maplen); + newmap = osdmap_decode(&p, p+maplen); + if (IS_ERR(newmap)) { + err = PTR_ERR(newmap); + goto bad; + } + BUG_ON(!newmap); + oldmap = osdc->osdmap; + osdc->osdmap = newmap; + if (oldmap) + ceph_osdmap_destroy(oldmap); + kick_requests(osdc); + } + p += maplen; + nr_maps--; + } + +done: + downgrade_write(&osdc->map_sem); + ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch); + + /* + * subscribe to subsequent osdmap updates if full to ensure + * we find out when we are no longer full and stop returning + * ENOSPC. + */ + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) + ceph_monc_request_next_osdmap(&osdc->client->monc); + + send_queued(osdc); + up_read(&osdc->map_sem); + wake_up_all(&osdc->client->auth_wq); + return; + +bad: + pr_err("osdc handle_map corrupt msg\n"); + ceph_msg_dump(msg); + up_write(&osdc->map_sem); + return; +} + +/* + * watch/notify callback event infrastructure + * + * These callbacks are used both for watch and notify operations. 
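 *
 * A minimal usage sketch of the event API defined below (my_notify_cb and
 * my_data are hypothetical names; the OSD op that actually arms the watch
 * carries event->cookie and is set up elsewhere, so it is elided):
 *
 *	static void my_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 *	{
 *		... react to the notification on the watched object ...
 *	}
 *
 *	struct ceph_osd_event *event;
 *	if (ceph_osdc_create_event(osdc, my_notify_cb, 0, my_data, &event))
 *		return;		... allocation failed ...
 *	... issue the watch/notify request that references event->cookie ...
 *	ceph_osdc_cancel_event(event);	... drops the caller's reference ...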
+ */ +static void __release_event(struct kref *kref) +{ + struct ceph_osd_event *event = + container_of(kref, struct ceph_osd_event, kref); + + dout("__release_event %p\n", event); + kfree(event); +} + +static void get_event(struct ceph_osd_event *event) +{ + kref_get(&event->kref); +} + +void ceph_osdc_put_event(struct ceph_osd_event *event) +{ + kref_put(&event->kref, __release_event); +} +EXPORT_SYMBOL(ceph_osdc_put_event); + +static void __insert_event(struct ceph_osd_client *osdc, + struct ceph_osd_event *new) +{ + struct rb_node **p = &osdc->event_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_osd_event *event = NULL; + + while (*p) { + parent = *p; + event = rb_entry(parent, struct ceph_osd_event, node); + if (new->cookie < event->cookie) + p = &(*p)->rb_left; + else if (new->cookie > event->cookie) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, &osdc->event_tree); +} + +static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc, + u64 cookie) +{ + struct rb_node **p = &osdc->event_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_osd_event *event = NULL; + + while (*p) { + parent = *p; + event = rb_entry(parent, struct ceph_osd_event, node); + if (cookie < event->cookie) + p = &(*p)->rb_left; + else if (cookie > event->cookie) + p = &(*p)->rb_right; + else + return event; + } + return NULL; +} + +static void __remove_event(struct ceph_osd_event *event) +{ + struct ceph_osd_client *osdc = event->osdc; + + if (!RB_EMPTY_NODE(&event->node)) { + dout("__remove_event removed %p\n", event); + rb_erase(&event->node, &osdc->event_tree); + ceph_osdc_put_event(event); + } else { + dout("__remove_event didn't remove %p\n", event); + } +} + +int ceph_osdc_create_event(struct ceph_osd_client *osdc, + void (*event_cb)(u64, u64, u8, void *), + int one_shot, void *data, + struct ceph_osd_event **pevent) +{ + struct ceph_osd_event *event; + + event = kmalloc(sizeof(*event), GFP_NOIO); + if (!event) + return -ENOMEM; + + dout("create_event %p\n", event); + event->cb = event_cb; + event->one_shot = one_shot; + event->data = data; + event->osdc = osdc; + INIT_LIST_HEAD(&event->osd_node); + kref_init(&event->kref); /* one ref for us */ + kref_get(&event->kref); /* one ref for the caller */ + init_completion(&event->completion); + + spin_lock(&osdc->event_lock); + event->cookie = ++osdc->event_count; + __insert_event(osdc, event); + spin_unlock(&osdc->event_lock); + + *pevent = event; + return 0; +} +EXPORT_SYMBOL(ceph_osdc_create_event); + +void ceph_osdc_cancel_event(struct ceph_osd_event *event) +{ + struct ceph_osd_client *osdc = event->osdc; + + dout("cancel_event %p\n", event); + spin_lock(&osdc->event_lock); + __remove_event(event); + spin_unlock(&osdc->event_lock); + ceph_osdc_put_event(event); /* caller's */ +} +EXPORT_SYMBOL(ceph_osdc_cancel_event); + + +static void do_event_work(struct work_struct *work) +{ + struct ceph_osd_event_work *event_work = + container_of(work, struct ceph_osd_event_work, work); + struct ceph_osd_event *event = event_work->event; + u64 ver = event_work->ver; + u64 notify_id = event_work->notify_id; + u8 opcode = event_work->opcode; + + dout("do_event_work completing %p\n", event); + event->cb(ver, notify_id, opcode, event->data); + complete(&event->completion); + dout("do_event_work completed %p\n", event); + ceph_osdc_put_event(event); + kfree(event_work); +} + + +/* + * Process osd watch notifications + */ +void handle_watch_notify(struct ceph_osd_client *osdc, 
struct ceph_msg *msg) +{ + void *p, *end; + u8 proto_ver; + u64 cookie, ver, notify_id; + u8 opcode; + struct ceph_osd_event *event; + struct ceph_osd_event_work *event_work; + + p = msg->front.iov_base; + end = p + msg->front.iov_len; + + ceph_decode_8_safe(&p, end, proto_ver, bad); + ceph_decode_8_safe(&p, end, opcode, bad); + ceph_decode_64_safe(&p, end, cookie, bad); + ceph_decode_64_safe(&p, end, ver, bad); + ceph_decode_64_safe(&p, end, notify_id, bad); + + spin_lock(&osdc->event_lock); + event = __find_event(osdc, cookie); + if (event) { + get_event(event); + if (event->one_shot) + __remove_event(event); + } + spin_unlock(&osdc->event_lock); + dout("handle_watch_notify cookie %lld ver %lld event %p\n", + cookie, ver, event); + if (event) { + event_work = kmalloc(sizeof(*event_work), GFP_NOIO); + if (!event_work) { + dout("ERROR: could not allocate event_work\n"); + goto done_err; + } + INIT_WORK(&event_work->work, do_event_work); + event_work->event = event; + event_work->ver = ver; + event_work->notify_id = notify_id; + event_work->opcode = opcode; + if (!queue_work(osdc->notify_wq, &event_work->work)) { + dout("WARNING: failed to queue notify event work\n"); + goto done_err; + } + } + + return; + +done_err: + complete(&event->completion); + ceph_osdc_put_event(event); + return; + +bad: + pr_err("osdc handle_watch_notify corrupt msg\n"); + return; +} + +int ceph_osdc_wait_event(struct ceph_osd_event *event, unsigned long timeout) +{ + int err; + + dout("wait_event %p\n", event); + err = wait_for_completion_interruptible_timeout(&event->completion, + timeout * HZ); + ceph_osdc_put_event(event); + if (err > 0) + err = 0; + dout("wait_event %p returns %d\n", event, err); + return err; +} +EXPORT_SYMBOL(ceph_osdc_wait_event); + +/* + * Register request, send initial attempt. + */ +int ceph_osdc_start_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req, + bool nofail) +{ + int rc = 0; + + req->r_request->pages = req->r_pages; + req->r_request->nr_pages = req->r_num_pages; +#ifdef CONFIG_BLOCK + req->r_request->bio = req->r_bio; +#endif + req->r_request->trail = req->r_trail; + + register_request(osdc, req); + + down_read(&osdc->map_sem); + mutex_lock(&osdc->request_mutex); + /* + * a racing kick_requests() may have sent the message for us + * while we dropped request_mutex above, so only send now if + * the request still hasn't been touched yet.
+ */ + if (req->r_sent == 0) { + rc = __map_request(osdc, req); + if (rc < 0) { + if (nofail) { + dout("osdc_start_request failed map, " + " will retry %lld\n", req->r_tid); + rc = 0; + } + goto out_unlock; + } + if (req->r_osd == NULL) { + dout("send_request %p no up osds in pg\n", req); + ceph_monc_request_next_osdmap(&osdc->client->monc); + } else { + rc = __send_request(osdc, req); + if (rc) { + if (nofail) { + dout("osdc_start_request failed send, " + " will retry %lld\n", req->r_tid); + rc = 0; + } else { + __unregister_request(osdc, req); + } + } + } + } + +out_unlock: + mutex_unlock(&osdc->request_mutex); + up_read(&osdc->map_sem); + return rc; +} +EXPORT_SYMBOL(ceph_osdc_start_request); + +/* + * wait for a request to complete + */ +int ceph_osdc_wait_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + int rc; + + rc = wait_for_completion_interruptible(&req->r_completion); + if (rc < 0) { + mutex_lock(&osdc->request_mutex); + __cancel_request(req); + __unregister_request(osdc, req); + mutex_unlock(&osdc->request_mutex); + complete_request(req); + dout("wait_request tid %llu canceled/timed out\n", req->r_tid); + return rc; + } + + dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result); + return req->r_result; +} +EXPORT_SYMBOL(ceph_osdc_wait_request); + +/* + * sync - wait for all in-flight requests to flush. avoid starvation. + */ +void ceph_osdc_sync(struct ceph_osd_client *osdc) +{ + struct ceph_osd_request *req; + u64 last_tid, next_tid = 0; + + mutex_lock(&osdc->request_mutex); + last_tid = osdc->last_tid; + while (1) { + req = __lookup_request_ge(osdc, next_tid); + if (!req) + break; + if (req->r_tid > last_tid) + break; + + next_tid = req->r_tid + 1; + if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0) + continue; + + ceph_osdc_get_request(req); + mutex_unlock(&osdc->request_mutex); + dout("sync waiting on tid %llu (last is %llu)\n", + req->r_tid, last_tid); + wait_for_completion(&req->r_safe_completion); + mutex_lock(&osdc->request_mutex); + ceph_osdc_put_request(req); + } + mutex_unlock(&osdc->request_mutex); + dout("sync done (thru tid %llu)\n", last_tid); +} +EXPORT_SYMBOL(ceph_osdc_sync); + +/* + * init, shutdown + */ +int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) +{ + int err; + + dout("init\n"); + osdc->client = client; + osdc->osdmap = NULL; + init_rwsem(&osdc->map_sem); + init_completion(&osdc->map_waiters); + osdc->last_requested_map = 0; + mutex_init(&osdc->request_mutex); + osdc->last_tid = 0; + osdc->osds = RB_ROOT; + INIT_LIST_HEAD(&osdc->osd_lru); + osdc->requests = RB_ROOT; + INIT_LIST_HEAD(&osdc->req_lru); + INIT_LIST_HEAD(&osdc->req_unsent); + INIT_LIST_HEAD(&osdc->req_notarget); + INIT_LIST_HEAD(&osdc->req_linger); + osdc->num_requests = 0; + INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); + INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); + spin_lock_init(&osdc->event_lock); + osdc->event_tree = RB_ROOT; + osdc->event_count = 0; + + schedule_delayed_work(&osdc->osds_timeout_work, + round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ)); + + err = -ENOMEM; + osdc->req_mempool = mempool_create_kmalloc_pool(10, + sizeof(struct ceph_osd_request)); + if (!osdc->req_mempool) + goto out; + + err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true, + "osd_op"); + if (err < 0) + goto out_mempool; + err = ceph_msgpool_init(&osdc->msgpool_op_reply, + OSD_OPREPLY_FRONT_LEN, 10, true, + "osd_op_reply"); + if (err < 0) + goto out_msgpool; + + 
osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify"); + if (IS_ERR(osdc->notify_wq)) { + err = PTR_ERR(osdc->notify_wq); + osdc->notify_wq = NULL; + goto out_msgpool; + } + return 0; + +out_msgpool: + ceph_msgpool_destroy(&osdc->msgpool_op); +out_mempool: + mempool_destroy(osdc->req_mempool); +out: + return err; +} +EXPORT_SYMBOL(ceph_osdc_init); + +void ceph_osdc_stop(struct ceph_osd_client *osdc) +{ + flush_workqueue(osdc->notify_wq); + destroy_workqueue(osdc->notify_wq); + cancel_delayed_work_sync(&osdc->timeout_work); + cancel_delayed_work_sync(&osdc->osds_timeout_work); + if (osdc->osdmap) { + ceph_osdmap_destroy(osdc->osdmap); + osdc->osdmap = NULL; + } + remove_old_osds(osdc, 1); + WARN_ON(!RB_EMPTY_ROOT(&osdc->osds)); + mempool_destroy(osdc->req_mempool); + ceph_msgpool_destroy(&osdc->msgpool_op); + ceph_msgpool_destroy(&osdc->msgpool_op_reply); +} +EXPORT_SYMBOL(ceph_osdc_stop); + +/* + * Read some contiguous pages. If we cross a stripe boundary, shorten + * *plen. Return number of bytes read, or error. + */ +int ceph_osdc_readpages(struct ceph_osd_client *osdc, + struct ceph_vino vino, struct ceph_file_layout *layout, + u64 off, u64 *plen, + u32 truncate_seq, u64 truncate_size, + struct page **pages, int num_pages, int page_align) +{ + struct ceph_osd_request *req; + int rc = 0; + + dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, + vino.snap, off, *plen); + req = ceph_osdc_new_request(osdc, layout, vino, off, plen, + CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, + NULL, 0, truncate_seq, truncate_size, NULL, + false, 1, page_align); + if (!req) + return -ENOMEM; + + /* it may be a short read due to an object boundary */ + req->r_pages = pages; + + dout("readpages final extent is %llu~%llu (%d pages align %d)\n", + off, *plen, req->r_num_pages, page_align); + + rc = ceph_osdc_start_request(osdc, req, false); + if (!rc) + rc = ceph_osdc_wait_request(osdc, req); + + ceph_osdc_put_request(req); + dout("readpages result %d\n", rc); + return rc; +} +EXPORT_SYMBOL(ceph_osdc_readpages); + +/* + * do a synchronous write on N pages + */ +int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, + struct ceph_file_layout *layout, + struct ceph_snap_context *snapc, + u64 off, u64 len, + u32 truncate_seq, u64 truncate_size, + struct timespec *mtime, + struct page **pages, int num_pages, + int flags, int do_sync, bool nofail) +{ + struct ceph_osd_request *req; + int rc = 0; + int page_align = off & ~PAGE_MASK; + + BUG_ON(vino.snap != CEPH_NOSNAP); + req = ceph_osdc_new_request(osdc, layout, vino, off, &len, + CEPH_OSD_OP_WRITE, + flags | CEPH_OSD_FLAG_ONDISK | + CEPH_OSD_FLAG_WRITE, + snapc, do_sync, + truncate_seq, truncate_size, mtime, + nofail, 1, page_align); + if (!req) + return -ENOMEM; + + /* it may be a short write due to an object boundary */ + req->r_pages = pages; + dout("writepages %llu~%llu (%d pages)\n", off, len, + req->r_num_pages); + + rc = ceph_osdc_start_request(osdc, req, nofail); + if (!rc) + rc = ceph_osdc_wait_request(osdc, req); + + ceph_osdc_put_request(req); + if (rc == 0) + rc = len; + dout("writepages result %d\n", rc); + return rc; +} +EXPORT_SYMBOL(ceph_osdc_writepages); + +/* + * handle incoming message + */ +static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) +{ + struct ceph_osd *osd = con->private; + struct ceph_osd_client *osdc; + int type = le16_to_cpu(msg->hdr.type); + + if (!osd) + goto out; + osdc = osd->o_osdc; + + switch (type) { + case CEPH_MSG_OSD_MAP: + ceph_osdc_handle_map(osdc, msg); + 
break; + case CEPH_MSG_OSD_OPREPLY: + handle_reply(osdc, msg, con); + break; + case CEPH_MSG_WATCH_NOTIFY: + handle_watch_notify(osdc, msg); + break; + + default: + pr_err("received unknown message type %d %s\n", type, + ceph_msg_type_name(type)); + } +out: + ceph_msg_put(msg); +} + +/* + * lookup and return message for incoming reply. set up reply message + * pages. + */ +static struct ceph_msg *get_reply(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip) +{ + struct ceph_osd *osd = con->private; + struct ceph_osd_client *osdc = osd->o_osdc; + struct ceph_msg *m; + struct ceph_osd_request *req; + int front = le32_to_cpu(hdr->front_len); + int data_len = le32_to_cpu(hdr->data_len); + u64 tid; + + tid = le64_to_cpu(hdr->tid); + mutex_lock(&osdc->request_mutex); + req = __lookup_request(osdc, tid); + if (!req) { + *skip = 1; + m = NULL; + pr_info("get_reply unknown tid %llu from osd%d\n", tid, + osd->o_osd); + goto out; + } + + if (req->r_con_filling_msg) { + dout("get_reply revoking msg %p from old con %p\n", + req->r_reply, req->r_con_filling_msg); + ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); + ceph_con_put(req->r_con_filling_msg); + req->r_con_filling_msg = NULL; + } + + if (front > req->r_reply->front.iov_len) { + pr_warning("get_reply front %d > preallocated %d\n", + front, (int)req->r_reply->front.iov_len); + m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS); + if (!m) + goto out; + ceph_msg_put(req->r_reply); + req->r_reply = m; + } + m = ceph_msg_get(req->r_reply); + + if (data_len > 0) { + int want = calc_pages_for(req->r_page_alignment, data_len); + + if (unlikely(req->r_num_pages < want)) { + pr_warning("tid %lld reply has %d bytes %d pages, we" + " had only %d pages ready\n", tid, data_len, + want, req->r_num_pages); + *skip = 1; + ceph_msg_put(m); + m = NULL; + goto out; + } + m->pages = req->r_pages; + m->nr_pages = req->r_num_pages; + m->page_alignment = req->r_page_alignment; +#ifdef CONFIG_BLOCK + m->bio = req->r_bio; +#endif + } + *skip = 0; + req->r_con_filling_msg = ceph_con_get(con); + dout("get_reply tid %lld %p\n", tid, m); + +out: + mutex_unlock(&osdc->request_mutex); + return m; + +} + +static struct ceph_msg *alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip) +{ + struct ceph_osd *osd = con->private; + int type = le16_to_cpu(hdr->type); + int front = le32_to_cpu(hdr->front_len); + + switch (type) { + case CEPH_MSG_OSD_MAP: + case CEPH_MSG_WATCH_NOTIFY: + return ceph_msg_new(type, front, GFP_NOFS); + case CEPH_MSG_OSD_OPREPLY: + return get_reply(con, hdr, skip); + default: + pr_info("alloc_msg unexpected msg type %d from osd%d\n", type, + osd->o_osd); + *skip = 1; + return NULL; + } +} + +/* + * Wrappers to refcount containing ceph_osd struct + */ +static struct ceph_connection *get_osd_con(struct ceph_connection *con) +{ + struct ceph_osd *osd = con->private; + if (get_osd(osd)) + return con; + return NULL; +} + +static void put_osd_con(struct ceph_connection *con) +{ + struct ceph_osd *osd = con->private; + put_osd(osd); +} + +/* + * authentication + */ +static int get_authorizer(struct ceph_connection *con, + void **buf, int *len, int *proto, + void **reply_buf, int *reply_len, int force_new) +{ + struct ceph_osd *o = con->private; + struct ceph_osd_client *osdc = o->o_osdc; + struct ceph_auth_client *ac = osdc->client->monc.auth; + int ret = 0; + + if (force_new && o->o_authorizer) { + ac->ops->destroy_authorizer(ac, o->o_authorizer); + o->o_authorizer = NULL; + } + if 
(o->o_authorizer == NULL) { + ret = ac->ops->create_authorizer( + ac, CEPH_ENTITY_TYPE_OSD, + &o->o_authorizer, + &o->o_authorizer_buf, + &o->o_authorizer_buf_len, + &o->o_authorizer_reply_buf, + &o->o_authorizer_reply_buf_len); + if (ret) + return ret; + } + + *proto = ac->protocol; + *buf = o->o_authorizer_buf; + *len = o->o_authorizer_buf_len; + *reply_buf = o->o_authorizer_reply_buf; + *reply_len = o->o_authorizer_reply_buf_len; + return 0; +} + + +static int verify_authorizer_reply(struct ceph_connection *con, int len) +{ + struct ceph_osd *o = con->private; + struct ceph_osd_client *osdc = o->o_osdc; + struct ceph_auth_client *ac = osdc->client->monc.auth; + + return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len); +} + +static int invalidate_authorizer(struct ceph_connection *con) +{ + struct ceph_osd *o = con->private; + struct ceph_osd_client *osdc = o->o_osdc; + struct ceph_auth_client *ac = osdc->client->monc.auth; + + if (ac->ops->invalidate_authorizer) + ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); + + return ceph_monc_validate_auth(&osdc->client->monc); +} + +static const struct ceph_connection_operations osd_con_ops = { + .get = get_osd_con, + .put = put_osd_con, + .dispatch = dispatch, + .get_authorizer = get_authorizer, + .verify_authorizer_reply = verify_authorizer_reply, + .invalidate_authorizer = invalidate_authorizer, + .alloc_msg = alloc_msg, + .fault = osd_reset, +}; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c new file mode 100644 index 00000000..e97c3588 --- /dev/null +++ b/net/ceph/osdmap.c @@ -0,0 +1,1135 @@ + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +char *ceph_osdmap_state_str(char *str, int len, int state) +{ + int flag = 0; + + if (!len) + goto done; + + *str = '\0'; + if (state) { + if (state & CEPH_OSD_EXISTS) { + snprintf(str, len, "exists"); + flag = 1; + } + if (state & CEPH_OSD_UP) { + snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""), + "up"); + flag = 1; + } + } else { + snprintf(str, len, "doesn't exist"); + } +done: + return str; +} + +/* maps */ + +static int calc_bits_of(unsigned t) +{ + int b = 0; + while (t) { + t = t >> 1; + b++; + } + return b; +} + +/* + * the foo_mask is the smallest value 2^n-1 that is >= foo. 
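 *
 * Worked example for the mask computation below: with pg_num = 12,
 * calc_bits_of(11) returns 4, so pg_num_mask = (1 << 4) - 1 = 0xf; with
 * pg_num = 8, calc_bits_of(7) returns 3 and the mask is 0x7, i.e. for an
 * exact power of two the mask is simply pg_num - 1.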
+ */ +static void calc_pg_masks(struct ceph_pg_pool_info *pi) +{ + pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1; + pi->pgp_num_mask = + (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1; + pi->lpg_num_mask = + (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1; + pi->lpgp_num_mask = + (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1; +} + +/* + * decode crush map + */ +static int crush_decode_uniform_bucket(void **p, void *end, + struct crush_bucket_uniform *b) +{ + dout("crush_decode_uniform_bucket %p to %p\n", *p, end); + ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad); + b->item_weight = ceph_decode_32(p); + return 0; +bad: + return -EINVAL; +} + +static int crush_decode_list_bucket(void **p, void *end, + struct crush_bucket_list *b) +{ + int j; + dout("crush_decode_list_bucket %p to %p\n", *p, end); + b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); + if (b->item_weights == NULL) + return -ENOMEM; + b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); + if (b->sum_weights == NULL) + return -ENOMEM; + ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); + for (j = 0; j < b->h.size; j++) { + b->item_weights[j] = ceph_decode_32(p); + b->sum_weights[j] = ceph_decode_32(p); + } + return 0; +bad: + return -EINVAL; +} + +static int crush_decode_tree_bucket(void **p, void *end, + struct crush_bucket_tree *b) +{ + int j; + dout("crush_decode_tree_bucket %p to %p\n", *p, end); + ceph_decode_32_safe(p, end, b->num_nodes, bad); + b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS); + if (b->node_weights == NULL) + return -ENOMEM; + ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad); + for (j = 0; j < b->num_nodes; j++) + b->node_weights[j] = ceph_decode_32(p); + return 0; +bad: + return -EINVAL; +} + +static int crush_decode_straw_bucket(void **p, void *end, + struct crush_bucket_straw *b) +{ + int j; + dout("crush_decode_straw_bucket %p to %p\n", *p, end); + b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); + if (b->item_weights == NULL) + return -ENOMEM; + b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); + if (b->straws == NULL) + return -ENOMEM; + ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); + for (j = 0; j < b->h.size; j++) { + b->item_weights[j] = ceph_decode_32(p); + b->straws[j] = ceph_decode_32(p); + } + return 0; +bad: + return -EINVAL; +} + +static struct crush_map *crush_decode(void *pbyval, void *end) +{ + struct crush_map *c; + int err = -EINVAL; + int i, j; + void **p = &pbyval; + void *start = pbyval; + u32 magic; + + dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); + + c = kzalloc(sizeof(*c), GFP_NOFS); + if (c == NULL) + return ERR_PTR(-ENOMEM); + + ceph_decode_need(p, end, 4*sizeof(u32), bad); + magic = ceph_decode_32(p); + if (magic != CRUSH_MAGIC) { + pr_err("crush_decode magic %x != current %x\n", + (unsigned)magic, (unsigned)CRUSH_MAGIC); + goto bad; + } + c->max_buckets = ceph_decode_32(p); + c->max_rules = ceph_decode_32(p); + c->max_devices = ceph_decode_32(p); + + c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS); + if (c->device_parents == NULL) + goto badmem; + c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS); + if (c->bucket_parents == NULL) + goto badmem; + + c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS); + if (c->buckets == NULL) + goto badmem; + c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS); + if (c->rules == NULL) + goto badmem; + + /* buckets */ + for 
(i = 0; i < c->max_buckets; i++) { + int size = 0; + u32 alg; + struct crush_bucket *b; + + ceph_decode_32_safe(p, end, alg, bad); + if (alg == 0) { + c->buckets[i] = NULL; + continue; + } + dout("crush_decode bucket %d off %x %p to %p\n", + i, (int)(*p-start), *p, end); + + switch (alg) { + case CRUSH_BUCKET_UNIFORM: + size = sizeof(struct crush_bucket_uniform); + break; + case CRUSH_BUCKET_LIST: + size = sizeof(struct crush_bucket_list); + break; + case CRUSH_BUCKET_TREE: + size = sizeof(struct crush_bucket_tree); + break; + case CRUSH_BUCKET_STRAW: + size = sizeof(struct crush_bucket_straw); + break; + default: + err = -EINVAL; + goto bad; + } + BUG_ON(size == 0); + b = c->buckets[i] = kzalloc(size, GFP_NOFS); + if (b == NULL) + goto badmem; + + ceph_decode_need(p, end, 4*sizeof(u32), bad); + b->id = ceph_decode_32(p); + b->type = ceph_decode_16(p); + b->alg = ceph_decode_8(p); + b->hash = ceph_decode_8(p); + b->weight = ceph_decode_32(p); + b->size = ceph_decode_32(p); + + dout("crush_decode bucket size %d off %x %p to %p\n", + b->size, (int)(*p-start), *p, end); + + b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS); + if (b->items == NULL) + goto badmem; + b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS); + if (b->perm == NULL) + goto badmem; + b->perm_n = 0; + + ceph_decode_need(p, end, b->size*sizeof(u32), bad); + for (j = 0; j < b->size; j++) + b->items[j] = ceph_decode_32(p); + + switch (b->alg) { + case CRUSH_BUCKET_UNIFORM: + err = crush_decode_uniform_bucket(p, end, + (struct crush_bucket_uniform *)b); + if (err < 0) + goto bad; + break; + case CRUSH_BUCKET_LIST: + err = crush_decode_list_bucket(p, end, + (struct crush_bucket_list *)b); + if (err < 0) + goto bad; + break; + case CRUSH_BUCKET_TREE: + err = crush_decode_tree_bucket(p, end, + (struct crush_bucket_tree *)b); + if (err < 0) + goto bad; + break; + case CRUSH_BUCKET_STRAW: + err = crush_decode_straw_bucket(p, end, + (struct crush_bucket_straw *)b); + if (err < 0) + goto bad; + break; + } + } + + /* rules */ + dout("rule vec is %p\n", c->rules); + for (i = 0; i < c->max_rules; i++) { + u32 yes; + struct crush_rule *r; + + ceph_decode_32_safe(p, end, yes, bad); + if (!yes) { + dout("crush_decode NO rule %d off %x %p to %p\n", + i, (int)(*p-start), *p, end); + c->rules[i] = NULL; + continue; + } + + dout("crush_decode rule %d off %x %p to %p\n", + i, (int)(*p-start), *p, end); + + /* len */ + ceph_decode_32_safe(p, end, yes, bad); +#if BITS_PER_LONG == 32 + err = -EINVAL; + if (yes > ULONG_MAX / sizeof(struct crush_rule_step)) + goto bad; +#endif + r = c->rules[i] = kmalloc(sizeof(*r) + + yes*sizeof(struct crush_rule_step), + GFP_NOFS); + if (r == NULL) + goto badmem; + dout(" rule %d is at %p\n", i, r); + r->len = yes; + ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */ + ceph_decode_need(p, end, r->len*3*sizeof(u32), bad); + for (j = 0; j < r->len; j++) { + r->steps[j].op = ceph_decode_32(p); + r->steps[j].arg1 = ceph_decode_32(p); + r->steps[j].arg2 = ceph_decode_32(p); + } + } + + /* ignore trailing name maps. 
*/ + + dout("crush_decode success\n"); + return c; + +badmem: + err = -ENOMEM; +bad: + dout("crush_decode fail %d\n", err); + crush_destroy(c); + return ERR_PTR(err); +} + +/* + * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid + * to a set of osds) + */ +static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) +{ + u64 a = *(u64 *)&l; + u64 b = *(u64 *)&r; + + if (a < b) + return -1; + if (a > b) + return 1; + return 0; +} + +static int __insert_pg_mapping(struct ceph_pg_mapping *new, + struct rb_root *root) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct ceph_pg_mapping *pg = NULL; + int c; + + while (*p) { + parent = *p; + pg = rb_entry(parent, struct ceph_pg_mapping, node); + c = pgid_cmp(new->pgid, pg->pgid); + if (c < 0) + p = &(*p)->rb_left; + else if (c > 0) + p = &(*p)->rb_right; + else + return -EEXIST; + } + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, root); + return 0; +} + +static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, + struct ceph_pg pgid) +{ + struct rb_node *n = root->rb_node; + struct ceph_pg_mapping *pg; + int c; + + while (n) { + pg = rb_entry(n, struct ceph_pg_mapping, node); + c = pgid_cmp(pgid, pg->pgid); + if (c < 0) + n = n->rb_left; + else if (c > 0) + n = n->rb_right; + else + return pg; + } + return NULL; +} + +/* + * rbtree of pg pool info + */ +static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct ceph_pg_pool_info *pi = NULL; + + while (*p) { + parent = *p; + pi = rb_entry(parent, struct ceph_pg_pool_info, node); + if (new->id < pi->id) + p = &(*p)->rb_left; + else if (new->id > pi->id) + p = &(*p)->rb_right; + else + return -EEXIST; + } + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, root); + return 0; +} + +static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) +{ + struct ceph_pg_pool_info *pi; + struct rb_node *n = root->rb_node; + + while (n) { + pi = rb_entry(n, struct ceph_pg_pool_info, node); + if (id < pi->id) + n = n->rb_left; + else if (id > pi->id) + n = n->rb_right; + else + return pi; + } + return NULL; +} + +int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name) +{ + struct rb_node *rbp; + + for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) { + struct ceph_pg_pool_info *pi = + rb_entry(rbp, struct ceph_pg_pool_info, node); + if (pi->name && strcmp(pi->name, name) == 0) + return pi->id; + } + return -ENOENT; +} +EXPORT_SYMBOL(ceph_pg_poolid_by_name); + +static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) +{ + rb_erase(&pi->node, root); + kfree(pi->name); + kfree(pi); +} + +static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) +{ + unsigned n, m; + + ceph_decode_copy(p, &pi->v, sizeof(pi->v)); + calc_pg_masks(pi); + + /* num_snaps * snap_info_t */ + n = le32_to_cpu(pi->v.num_snaps); + while (n--) { + ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) + + sizeof(struct ceph_timespec), bad); + *p += sizeof(u64) + /* key */ + 1 + sizeof(u64) + /* u8, snapid */ + sizeof(struct ceph_timespec); + m = ceph_decode_32(p); /* snap name */ + *p += m; + } + + *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; + return 0; + +bad: + return -EINVAL; +} + +static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) +{ + struct ceph_pg_pool_info *pi; + u32 num, len, pool; + + 
ceph_decode_32_safe(p, end, num, bad); + dout(" %d pool names\n", num); + while (num--) { + ceph_decode_32_safe(p, end, pool, bad); + ceph_decode_32_safe(p, end, len, bad); + dout(" pool %d len %d\n", pool, len); + pi = __lookup_pg_pool(&map->pg_pools, pool); + if (pi) { + kfree(pi->name); + pi->name = kmalloc(len + 1, GFP_NOFS); + if (pi->name) { + memcpy(pi->name, *p, len); + pi->name[len] = '\0'; + dout(" name is %s\n", pi->name); + } + } + *p += len; + } + return 0; + +bad: + return -EINVAL; +} + +/* + * osd map + */ +void ceph_osdmap_destroy(struct ceph_osdmap *map) +{ + dout("osdmap_destroy %p\n", map); + if (map->crush) + crush_destroy(map->crush); + while (!RB_EMPTY_ROOT(&map->pg_temp)) { + struct ceph_pg_mapping *pg = + rb_entry(rb_first(&map->pg_temp), + struct ceph_pg_mapping, node); + rb_erase(&pg->node, &map->pg_temp); + kfree(pg); + } + while (!RB_EMPTY_ROOT(&map->pg_pools)) { + struct ceph_pg_pool_info *pi = + rb_entry(rb_first(&map->pg_pools), + struct ceph_pg_pool_info, node); + __remove_pg_pool(&map->pg_pools, pi); + } + kfree(map->osd_state); + kfree(map->osd_weight); + kfree(map->osd_addr); + kfree(map); +} + +/* + * adjust max osd value. reallocate arrays. + */ +static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) +{ + u8 *state; + struct ceph_entity_addr *addr; + u32 *weight; + + state = kcalloc(max, sizeof(*state), GFP_NOFS); + addr = kcalloc(max, sizeof(*addr), GFP_NOFS); + weight = kcalloc(max, sizeof(*weight), GFP_NOFS); + if (state == NULL || addr == NULL || weight == NULL) { + kfree(state); + kfree(addr); + kfree(weight); + return -ENOMEM; + } + + /* copy old? */ + if (map->osd_state) { + memcpy(state, map->osd_state, map->max_osd*sizeof(*state)); + memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr)); + memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight)); + kfree(map->osd_state); + kfree(map->osd_addr); + kfree(map->osd_weight); + } + + map->osd_state = state; + map->osd_weight = weight; + map->osd_addr = addr; + map->max_osd = max; + return 0; +} + +/* + * decode a full map. 
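 *
 * In order, the decoder below consumes: version, fsid, epoch, the
 * created/modified stamps, the pg pool entries (plus pool names from
 * encoding version 5 on), pool_max, flags, max_osd together with the
 * per-osd state/weight/addr arrays, the pg_temp mappings, and finally the
 * embedded crush map; anything after that is ignored.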
+ */ +struct ceph_osdmap *osdmap_decode(void **p, void *end) +{ + struct ceph_osdmap *map; + u16 version; + u32 len, max, i; + u8 ev; + int err = -EINVAL; + void *start = *p; + struct ceph_pg_pool_info *pi; + + dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p)); + + map = kzalloc(sizeof(*map), GFP_NOFS); + if (map == NULL) + return ERR_PTR(-ENOMEM); + map->pg_temp = RB_ROOT; + + ceph_decode_16_safe(p, end, version, bad); + if (version > CEPH_OSDMAP_VERSION) { + pr_warning("got unknown v %d > %d of osdmap\n", version, + CEPH_OSDMAP_VERSION); + goto bad; + } + + ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad); + ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); + map->epoch = ceph_decode_32(p); + ceph_decode_copy(p, &map->created, sizeof(map->created)); + ceph_decode_copy(p, &map->modified, sizeof(map->modified)); + + ceph_decode_32_safe(p, end, max, bad); + while (max--) { + ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); + pi = kzalloc(sizeof(*pi), GFP_NOFS); + if (!pi) + goto bad; + pi->id = ceph_decode_32(p); + ev = ceph_decode_8(p); /* encoding version */ + if (ev > CEPH_PG_POOL_VERSION) { + pr_warning("got unknown v %d > %d of ceph_pg_pool\n", + ev, CEPH_PG_POOL_VERSION); + kfree(pi); + goto bad; + } + err = __decode_pool(p, end, pi); + if (err < 0) { + kfree(pi); + goto bad; + } + __insert_pg_pool(&map->pg_pools, pi); + } + + if (version >= 5 && __decode_pool_names(p, end, map) < 0) + goto bad; + + ceph_decode_32_safe(p, end, map->pool_max, bad); + + ceph_decode_32_safe(p, end, map->flags, bad); + + max = ceph_decode_32(p); + + /* (re)alloc osd arrays */ + err = osdmap_set_max_osd(map, max); + if (err < 0) + goto bad; + dout("osdmap_decode max_osd = %d\n", map->max_osd); + + /* osds */ + err = -EINVAL; + ceph_decode_need(p, end, 3*sizeof(u32) + + map->max_osd*(1 + sizeof(*map->osd_weight) + + sizeof(*map->osd_addr)), bad); + *p += 4; /* skip length field (should match max) */ + ceph_decode_copy(p, map->osd_state, map->max_osd); + + *p += 4; /* skip length field (should match max) */ + for (i = 0; i < map->max_osd; i++) + map->osd_weight[i] = ceph_decode_32(p); + + *p += 4; /* skip length field (should match max) */ + ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr)); + for (i = 0; i < map->max_osd; i++) + ceph_decode_addr(&map->osd_addr[i]); + + /* pg_temp */ + ceph_decode_32_safe(p, end, len, bad); + for (i = 0; i < len; i++) { + int n, j; + struct ceph_pg pgid; + struct ceph_pg_mapping *pg; + + ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); + ceph_decode_copy(p, &pgid, sizeof(pgid)); + n = ceph_decode_32(p); + ceph_decode_need(p, end, n * sizeof(u32), bad); + err = -ENOMEM; + pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); + if (!pg) + goto bad; + pg->pgid = pgid; + pg->len = n; + for (j = 0; j < n; j++) + pg->osds[j] = ceph_decode_32(p); + + err = __insert_pg_mapping(pg, &map->pg_temp); + if (err) + goto bad; + dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len); + } + + /* crush */ + ceph_decode_32_safe(p, end, len, bad); + dout("osdmap_decode crush len %d from off 0x%x\n", len, + (int)(*p - start)); + ceph_decode_need(p, end, len, bad); + map->crush = crush_decode(*p, end); + *p += len; + if (IS_ERR(map->crush)) { + err = PTR_ERR(map->crush); + map->crush = NULL; + goto bad; + } + + /* ignore the rest of the map */ + *p = end; + + dout("osdmap_decode done %p %p\n", *p, end); + return map; + +bad: + dout("osdmap_decode fail\n"); + ceph_osdmap_destroy(map); + return ERR_PTR(err); +} + +/* + * decode and 
apply an incremental map update. + */ +struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, + struct ceph_osdmap *map, + struct ceph_messenger *msgr) +{ + struct crush_map *newcrush = NULL; + struct ceph_fsid fsid; + u32 epoch = 0; + struct ceph_timespec modified; + u32 len, pool; + __s32 new_pool_max, new_flags, max; + void *start = *p; + int err = -EINVAL; + u16 version; + struct rb_node *rbp; + + ceph_decode_16_safe(p, end, version, bad); + if (version > CEPH_OSDMAP_INC_VERSION) { + pr_warning("got unknown v %d > %d of inc osdmap\n", version, + CEPH_OSDMAP_INC_VERSION); + goto bad; + } + + ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32), + bad); + ceph_decode_copy(p, &fsid, sizeof(fsid)); + epoch = ceph_decode_32(p); + BUG_ON(epoch != map->epoch+1); + ceph_decode_copy(p, &modified, sizeof(modified)); + new_pool_max = ceph_decode_32(p); + new_flags = ceph_decode_32(p); + + /* full map? */ + ceph_decode_32_safe(p, end, len, bad); + if (len > 0) { + dout("apply_incremental full map len %d, %p to %p\n", + len, *p, end); + return osdmap_decode(p, min(*p+len, end)); + } + + /* new crush? */ + ceph_decode_32_safe(p, end, len, bad); + if (len > 0) { + dout("apply_incremental new crush map len %d, %p to %p\n", + len, *p, end); + newcrush = crush_decode(*p, min(*p+len, end)); + if (IS_ERR(newcrush)) + return ERR_CAST(newcrush); + *p += len; + } + + /* new flags? */ + if (new_flags >= 0) + map->flags = new_flags; + if (new_pool_max >= 0) + map->pool_max = new_pool_max; + + ceph_decode_need(p, end, 5*sizeof(u32), bad); + + /* new max? */ + max = ceph_decode_32(p); + if (max >= 0) { + err = osdmap_set_max_osd(map, max); + if (err < 0) + goto bad; + } + + map->epoch++; + map->modified = modified; + if (newcrush) { + if (map->crush) + crush_destroy(map->crush); + map->crush = newcrush; + newcrush = NULL; + } + + /* new_pool */ + ceph_decode_32_safe(p, end, len, bad); + while (len--) { + __u8 ev; + struct ceph_pg_pool_info *pi; + + ceph_decode_32_safe(p, end, pool, bad); + ceph_decode_need(p, end, 1 + sizeof(pi->v), bad); + ev = ceph_decode_8(p); /* encoding version */ + if (ev > CEPH_PG_POOL_VERSION) { + pr_warning("got unknown v %d > %d of ceph_pg_pool\n", + ev, CEPH_PG_POOL_VERSION); + goto bad; + } + pi = __lookup_pg_pool(&map->pg_pools, pool); + if (!pi) { + pi = kzalloc(sizeof(*pi), GFP_NOFS); + if (!pi) { + err = -ENOMEM; + goto bad; + } + pi->id = pool; + __insert_pg_pool(&map->pg_pools, pi); + } + err = __decode_pool(p, end, pi); + if (err < 0) + goto bad; + } + if (version >= 5 && __decode_pool_names(p, end, map) < 0) + goto bad; + + /* old_pool */ + ceph_decode_32_safe(p, end, len, bad); + while (len--) { + struct ceph_pg_pool_info *pi; + + ceph_decode_32_safe(p, end, pool, bad); + pi = __lookup_pg_pool(&map->pg_pools, pool); + if (pi) + __remove_pg_pool(&map->pg_pools, pi); + } + + /* new_up */ + err = -EINVAL; + ceph_decode_32_safe(p, end, len, bad); + while (len--) { + u32 osd; + struct ceph_entity_addr addr; + ceph_decode_32_safe(p, end, osd, bad); + ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad); + ceph_decode_addr(&addr); + pr_info("osd%d up\n", osd); + BUG_ON(osd >= map->max_osd); + map->osd_state[osd] |= CEPH_OSD_UP; + map->osd_addr[osd] = addr; + } + + /* new_state */ + ceph_decode_32_safe(p, end, len, bad); + while (len--) { + u32 osd; + u8 xorstate; + ceph_decode_32_safe(p, end, osd, bad); + xorstate = **(u8 **)p; + (*p)++; /* clean flag */ + if (xorstate == 0) + xorstate = CEPH_OSD_UP; + if (xorstate & CEPH_OSD_UP) + pr_info("osd%d 
down\n", osd); + if (osd < map->max_osd) + map->osd_state[osd] ^= xorstate; + } + + /* new_weight */ + ceph_decode_32_safe(p, end, len, bad); + while (len--) { + u32 osd, off; + ceph_decode_need(p, end, sizeof(u32)*2, bad); + osd = ceph_decode_32(p); + off = ceph_decode_32(p); + pr_info("osd%d weight 0x%x %s\n", osd, off, + off == CEPH_OSD_IN ? "(in)" : + (off == CEPH_OSD_OUT ? "(out)" : "")); + if (osd < map->max_osd) + map->osd_weight[osd] = off; + } + + /* new_pg_temp */ + rbp = rb_first(&map->pg_temp); + ceph_decode_32_safe(p, end, len, bad); + while (len--) { + struct ceph_pg_mapping *pg; + int j; + struct ceph_pg pgid; + u32 pglen; + ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); + ceph_decode_copy(p, &pgid, sizeof(pgid)); + pglen = ceph_decode_32(p); + + /* remove any? */ + while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping, + node)->pgid, pgid) <= 0) { + struct ceph_pg_mapping *cur = + rb_entry(rbp, struct ceph_pg_mapping, node); + + rbp = rb_next(rbp); + dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid); + rb_erase(&cur->node, &map->pg_temp); + kfree(cur); + } + + if (pglen) { + /* insert */ + ceph_decode_need(p, end, pglen*sizeof(u32), bad); + pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); + if (!pg) { + err = -ENOMEM; + goto bad; + } + pg->pgid = pgid; + pg->len = pglen; + for (j = 0; j < pglen; j++) + pg->osds[j] = ceph_decode_32(p); + err = __insert_pg_mapping(pg, &map->pg_temp); + if (err) { + kfree(pg); + goto bad; + } + dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, + pglen); + } + } + while (rbp) { + struct ceph_pg_mapping *cur = + rb_entry(rbp, struct ceph_pg_mapping, node); + + rbp = rb_next(rbp); + dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid); + rb_erase(&cur->node, &map->pg_temp); + kfree(cur); + } + + /* ignore the rest */ + *p = end; + return map; + +bad: + pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n", + epoch, (int)(*p - start), *p, start, end); + print_hex_dump(KERN_DEBUG, "osdmap: ", + DUMP_PREFIX_OFFSET, 16, 1, + start, end - start, true); + if (newcrush) + crush_destroy(newcrush); + return ERR_PTR(err); +} + + + + +/* + * calculate file layout from given offset, length. + * fill in correct oid, logical length, and object extent + * offset, length. + * + * for now, we write only a single su, until we can + * pass a stride back to the caller. + */ +void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, + u64 off, u64 *plen, + u64 *ono, + u64 *oxoff, u64 *oxlen) +{ + u32 osize = le32_to_cpu(layout->fl_object_size); + u32 su = le32_to_cpu(layout->fl_stripe_unit); + u32 sc = le32_to_cpu(layout->fl_stripe_count); + u32 bl, stripeno, stripepos, objsetno; + u32 su_per_object; + u64 t, su_offset; + + dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, + osize, su); + su_per_object = osize / su; + dout("osize %u / su %u = su_per_object %u\n", osize, su, + su_per_object); + + BUG_ON((su & ~PAGE_MASK) != 0); + /* bl = *off / su; */ + t = off; + do_div(t, su); + bl = t; + dout("off %llu / su %u = bl %u\n", off, su, bl); + + stripeno = bl / sc; + stripepos = bl % sc; + objsetno = stripeno / su_per_object; + + *ono = objsetno * sc + stripepos; + dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono); + + /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */ + t = off; + su_offset = do_div(t, su); + *oxoff = su_offset + (stripeno % su_per_object) * su; + + /* + * Calculate the length of the extent being written to the selected + * object. 
This is the minimum of the full length requested (plen) or + * the remainder of the current stripe being written to. + */ + *oxlen = min_t(u64, *plen, su - su_offset); + *plen = *oxlen; + + dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); +} +EXPORT_SYMBOL(ceph_calc_file_object_mapping); + +/* + * calculate an object layout (i.e. pgid) from an oid, + * file_layout, and osdmap + */ +int ceph_calc_object_layout(struct ceph_object_layout *ol, + const char *oid, + struct ceph_file_layout *fl, + struct ceph_osdmap *osdmap) +{ + unsigned num, num_mask; + struct ceph_pg pgid; + s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred); + int poolid = le32_to_cpu(fl->fl_pg_pool); + struct ceph_pg_pool_info *pool; + unsigned ps; + + BUG_ON(!osdmap); + + pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); + if (!pool) + return -EIO; + ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); + if (preferred >= 0) { + ps += preferred; + num = le32_to_cpu(pool->v.lpg_num); + num_mask = pool->lpg_num_mask; + } else { + num = le32_to_cpu(pool->v.pg_num); + num_mask = pool->pg_num_mask; + } + + pgid.ps = cpu_to_le16(ps); + pgid.preferred = cpu_to_le16(preferred); + pgid.pool = fl->fl_pg_pool; + if (preferred >= 0) + dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps, + (int)preferred); + else + dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps); + + ol->ol_pgid = pgid; + ol->ol_stripe_unit = fl->fl_object_stripe_unit; + return 0; +} +EXPORT_SYMBOL(ceph_calc_object_layout); + +/* + * Calculate raw osd vector for the given pgid. Return pointer to osd + * array, or NULL on failure. + */ +static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, + int *osds, int *num) +{ + struct ceph_pg_mapping *pg; + struct ceph_pg_pool_info *pool; + int ruleno; + unsigned poolid, ps, pps; + int preferred; + + /* pg_temp? */ + pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); + if (pg) { + *num = pg->len; + return pg->osds; + } + + /* crush */ + poolid = le32_to_cpu(pgid.pool); + ps = le16_to_cpu(pgid.ps); + preferred = (s16)le16_to_cpu(pgid.preferred); + + /* don't forcefeed bad device ids to crush */ + if (preferred >= osdmap->max_osd || + preferred >= osdmap->crush->max_devices) + preferred = -1; + + pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); + if (!pool) + return NULL; + ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, + pool->v.type, pool->v.size); + if (ruleno < 0) { + pr_err("no crush rule pool %d ruleset %d type %d size %d\n", + poolid, pool->v.crush_ruleset, pool->v.type, + pool->v.size); + return NULL; + } + + if (preferred >= 0) + pps = ceph_stable_mod(ps, + le32_to_cpu(pool->v.lpgp_num), + pool->lpgp_num_mask); + else + pps = ceph_stable_mod(ps, + le32_to_cpu(pool->v.pgp_num), + pool->pgp_num_mask); + pps += poolid; + *num = crush_do_rule(osdmap->crush, ruleno, pps, osds, + min_t(int, pool->v.size, *num), + preferred, osdmap->osd_weight); + return osds; +} + +/* + * Return acting set for given pgid. + */ +int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, + int *acting) +{ + int rawosds[CEPH_PG_MAX_SIZE], *osds; + int i, o, num = CEPH_PG_MAX_SIZE; + + osds = calc_pg_raw(osdmap, pgid, rawosds, &num); + if (!osds) + return -1; + + /* primary is first up osd */ + o = 0; + for (i = 0; i < num; i++) + if (ceph_osd_is_up(osdmap, osds[i])) + acting[o++] = osds[i]; + return o; +} + +/* + * Return primary osd for given pgid, or -1 if none. 
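 *
 * Rough usage sketch for ceph_calc_pg_acting() above and
 * ceph_calc_pg_primary() below (pgid would normally come from
 * ceph_calc_object_layout(); error handling elided):
 *
 *	int acting[CEPH_PG_MAX_SIZE];
 *	int num = ceph_calc_pg_acting(osdmap, pgid, acting);
 *	int primary = ceph_calc_pg_primary(osdmap, pgid);
 *
 * where num is the count of up osds written to acting[] and primary is the
 * first of them (or -1 when none of the mapped osds is up).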
+ */ +int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) +{ + int rawosds[CEPH_PG_MAX_SIZE], *osds; + int i, num = CEPH_PG_MAX_SIZE; + + osds = calc_pg_raw(osdmap, pgid, rawosds, &num); + if (!osds) + return -1; + + /* primary is first up osd */ + for (i = 0; i < num; i++) + if (ceph_osd_is_up(osdmap, osds[i])) + return osds[i]; + return -1; +} +EXPORT_SYMBOL(ceph_calc_pg_primary); diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c new file mode 100644 index 00000000..13cb409a --- /dev/null +++ b/net/ceph/pagelist.c @@ -0,0 +1,154 @@ + +#include +#include +#include +#include +#include + +static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl) +{ + if (pl->mapped_tail) { + struct page *page = list_entry(pl->head.prev, struct page, lru); + kunmap(page); + pl->mapped_tail = NULL; + } +} + +int ceph_pagelist_release(struct ceph_pagelist *pl) +{ + ceph_pagelist_unmap_tail(pl); + while (!list_empty(&pl->head)) { + struct page *page = list_first_entry(&pl->head, struct page, + lru); + list_del(&page->lru); + __free_page(page); + } + ceph_pagelist_free_reserve(pl); + return 0; +} +EXPORT_SYMBOL(ceph_pagelist_release); + +static int ceph_pagelist_addpage(struct ceph_pagelist *pl) +{ + struct page *page; + + if (!pl->num_pages_free) { + page = __page_cache_alloc(GFP_NOFS); + } else { + page = list_first_entry(&pl->free_list, struct page, lru); + list_del(&page->lru); + --pl->num_pages_free; + } + if (!page) + return -ENOMEM; + pl->room += PAGE_SIZE; + ceph_pagelist_unmap_tail(pl); + list_add_tail(&page->lru, &pl->head); + pl->mapped_tail = kmap(page); + return 0; +} + +int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len) +{ + while (pl->room < len) { + size_t bit = pl->room; + int ret; + + memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), + buf, bit); + pl->length += bit; + pl->room -= bit; + buf += bit; + len -= bit; + ret = ceph_pagelist_addpage(pl); + if (ret) + return ret; + } + + memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len); + pl->length += len; + pl->room -= len; + return 0; +} +EXPORT_SYMBOL(ceph_pagelist_append); + +/** + * Allocate enough pages for a pagelist to append the given amount + * of data without allocating. + * Returns: 0 on success, -ENOMEM on error. + */ +int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space) +{ + if (space <= pl->room) + return 0; + space -= pl->room; + space = (space + PAGE_SIZE - 1) >> PAGE_SHIFT; /* conv to num pages */ + + while (space > pl->num_pages_free) { + struct page *page = __page_cache_alloc(GFP_NOFS); + if (!page) + return -ENOMEM; + list_add_tail(&page->lru, &pl->free_list); + ++pl->num_pages_free; + } + return 0; +} +EXPORT_SYMBOL(ceph_pagelist_reserve); + +/** + * Free any pages that have been preallocated. + */ +int ceph_pagelist_free_reserve(struct ceph_pagelist *pl) +{ + while (!list_empty(&pl->free_list)) { + struct page *page = list_first_entry(&pl->free_list, + struct page, lru); + list_del(&page->lru); + __free_page(page); + --pl->num_pages_free; + } + BUG_ON(pl->num_pages_free); + return 0; +} +EXPORT_SYMBOL(ceph_pagelist_free_reserve); + +/** + * Create a truncation point. + */ +void ceph_pagelist_set_cursor(struct ceph_pagelist *pl, + struct ceph_pagelist_cursor *c) +{ + c->pl = pl; + c->page_lru = pl->head.prev; + c->room = pl->room; +} +EXPORT_SYMBOL(ceph_pagelist_set_cursor); + +/** + * Truncate a pagelist to the given point. Move extra pages to reserve. + * This won't sleep.
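 *
 * Typical use together with ceph_pagelist_set_cursor() (a sketch;
 * encode_something() is a placeholder for any sequence of
 * ceph_pagelist_append() calls):
 *
 *	struct ceph_pagelist_cursor c;
 *
 *	ceph_pagelist_set_cursor(pl, &c);
 *	if (encode_something(pl) < 0)
 *		ceph_pagelist_truncate(pl, &c);	... roll back the partial append ...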
+ * Returns: 0 on success, + * -EINVAL if the pagelist doesn't match the trunc point pagelist + */ +int ceph_pagelist_truncate(struct ceph_pagelist *pl, + struct ceph_pagelist_cursor *c) +{ + struct page *page; + + if (pl != c->pl) + return -EINVAL; + ceph_pagelist_unmap_tail(pl); + while (pl->head.prev != c->page_lru) { + page = list_entry(pl->head.prev, struct page, lru); + list_del(&page->lru); /* remove from pagelist */ + list_add_tail(&page->lru, &pl->free_list); /* add to reserve */ + ++pl->num_pages_free; + } + pl->room = c->room; + if (!list_empty(&pl->head)) { + page = list_entry(pl->head.prev, struct page, lru); + pl->mapped_tail = kmap(page); + } + return 0; +} +EXPORT_SYMBOL(ceph_pagelist_truncate); diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c new file mode 100644 index 00000000..cd9c21df --- /dev/null +++ b/net/ceph/pagevec.c @@ -0,0 +1,233 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include + +/* + * build a vector of user pages + */ +struct page **ceph_get_direct_page_vector(const char __user *data, + int num_pages, bool write_page) +{ + struct page **pages; + int got = 0; + int rc = 0; + + pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); + if (!pages) + return ERR_PTR(-ENOMEM); + + down_read(&current->mm->mmap_sem); + while (got < num_pages) { + rc = get_user_pages(current, current->mm, + (unsigned long)data + ((unsigned long)got * PAGE_SIZE), + num_pages - got, write_page, 0, pages + got, NULL); + if (rc < 0) + break; + BUG_ON(rc == 0); + got += rc; + } + up_read(&current->mm->mmap_sem); + if (rc < 0) + goto fail; + return pages; + +fail: + ceph_put_page_vector(pages, got, false); + return ERR_PTR(rc); +} +EXPORT_SYMBOL(ceph_get_direct_page_vector); + +void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty) +{ + int i; + + for (i = 0; i < num_pages; i++) { + if (dirty) + set_page_dirty_lock(pages[i]); + put_page(pages[i]); + } + kfree(pages); +} +EXPORT_SYMBOL(ceph_put_page_vector); + +void ceph_release_page_vector(struct page **pages, int num_pages) +{ + int i; + + for (i = 0; i < num_pages; i++) + __free_pages(pages[i], 0); + kfree(pages); +} +EXPORT_SYMBOL(ceph_release_page_vector); + +/* + * allocate a vector of new pages + */ +struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags) +{ + struct page **pages; + int i; + + pages = kmalloc(sizeof(*pages) * num_pages, flags); + if (!pages) + return ERR_PTR(-ENOMEM); + for (i = 0; i < num_pages; i++) { + pages[i] = __page_cache_alloc(flags); + if (pages[i] == NULL) { + ceph_release_page_vector(pages, i); + return ERR_PTR(-ENOMEM); + } + } + return pages; +} +EXPORT_SYMBOL(ceph_alloc_page_vector); + +/* + * copy user data into a page vector + */ +int ceph_copy_user_to_page_vector(struct page **pages, + const char __user *data, + loff_t off, size_t len) +{ + int i = 0; + int po = off & ~PAGE_CACHE_MASK; + int left = len; + int l, bad; + + while (left > 0) { + l = min_t(int, PAGE_CACHE_SIZE-po, left); + bad = copy_from_user(page_address(pages[i]) + po, data, l); + if (bad == l) + return -EFAULT; + data += l - bad; + left -= l - bad; + po += l - bad; + if (po == PAGE_CACHE_SIZE) { + po = 0; + i++; + } + } + return len; +} +EXPORT_SYMBOL(ceph_copy_user_to_page_vector); + +int ceph_copy_to_page_vector(struct page **pages, + const char *data, + loff_t off, size_t len) +{ + int i = 0; + size_t po = off & ~PAGE_CACHE_MASK; + size_t left = len; + size_t l; + + while (left > 0) { + l = min_t(size_t, PAGE_CACHE_SIZE-po, left); + memcpy(page_address(pages[i]) + po, data, l); +
data += l; + left -= l; + po += l; + if (po == PAGE_CACHE_SIZE) { + po = 0; + i++; + } + } + return len; +} +EXPORT_SYMBOL(ceph_copy_to_page_vector); + +int ceph_copy_from_page_vector(struct page **pages, + char *data, + loff_t off, size_t len) +{ + int i = 0; + size_t po = off & ~PAGE_CACHE_MASK; + size_t left = len; + size_t l; + + while (left > 0) { + l = min_t(size_t, PAGE_CACHE_SIZE-po, left); + memcpy(data, page_address(pages[i]) + po, l); + data += l; + left -= l; + po += l; + if (po == PAGE_CACHE_SIZE) { + po = 0; + i++; + } + } + return len; +} +EXPORT_SYMBOL(ceph_copy_from_page_vector); + +/* + * copy user data from a page vector into a user pointer + */ +int ceph_copy_page_vector_to_user(struct page **pages, + char __user *data, + loff_t off, size_t len) +{ + int i = 0; + int po = off & ~PAGE_CACHE_MASK; + int left = len; + int l, bad; + + while (left > 0) { + l = min_t(int, left, PAGE_CACHE_SIZE-po); + bad = copy_to_user(data, page_address(pages[i]) + po, l); + if (bad == l) + return -EFAULT; + data += l - bad; + left -= l - bad; + if (po) { + po += l - bad; + if (po == PAGE_CACHE_SIZE) + po = 0; + } + i++; + } + return len; +} +EXPORT_SYMBOL(ceph_copy_page_vector_to_user); + +/* + * Zero an extent within a page vector. Offset is relative to the + * start of the first page. + */ +void ceph_zero_page_vector_range(int off, int len, struct page **pages) +{ + int i = off >> PAGE_CACHE_SHIFT; + + off &= ~PAGE_CACHE_MASK; + + dout("zero_page_vector_page %u~%u\n", off, len); + + /* leading partial page? */ + if (off) { + int end = min((int)PAGE_CACHE_SIZE, off + len); + dout("zeroing %d %p head from %d\n", i, pages[i], + (int)off); + zero_user_segment(pages[i], off, end); + len -= (end - off); + i++; + } + while (len >= PAGE_CACHE_SIZE) { + dout("zeroing %d %p len=%d\n", i, pages[i], len); + zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE); + len -= PAGE_CACHE_SIZE; + i++; + } + /* trailing partial page? */ + if (len) { + dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len); + zero_user_segment(pages[i], 0, len); + } +} +EXPORT_SYMBOL(ceph_zero_page_vector_range); + diff --git a/net/compat.c b/net/compat.c new file mode 100644 index 00000000..c578d938 --- /dev/null +++ b/net/compat.c @@ -0,0 +1,867 @@ +/* + * 32bit Socket syscall emulation. Based on arch/sparc64/kernel/sys_sparc32.c. + * + * Copyright (C) 2000 VA Linux Co + * Copyright (C) 2000 Don Dugger + * Copyright (C) 1999 Arun Sharma + * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) + * Copyright (C) 2000 Hewlett-Packard Co. 
+ * Copyright (C) 2000 David Mosberger-Tang + * Copyright (C) 2000,2001 Andi Kleen, SuSE Labs + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static inline int iov_from_user_compat_to_kern(struct iovec *kiov, + struct compat_iovec __user *uiov32, + int niov) +{ + int tot_len = 0; + + while (niov > 0) { + compat_uptr_t buf; + compat_size_t len; + + if (get_user(len, &uiov32->iov_len) || + get_user(buf, &uiov32->iov_base)) + return -EFAULT; + + if (len > INT_MAX - tot_len) + len = INT_MAX - tot_len; + + tot_len += len; + kiov->iov_base = compat_ptr(buf); + kiov->iov_len = (__kernel_size_t) len; + uiov32++; + kiov++; + niov--; + } + return tot_len; +} + +int get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg) +{ + compat_uptr_t tmp1, tmp2, tmp3; + + if (!access_ok(VERIFY_READ, umsg, sizeof(*umsg)) || + __get_user(tmp1, &umsg->msg_name) || + __get_user(kmsg->msg_namelen, &umsg->msg_namelen) || + __get_user(tmp2, &umsg->msg_iov) || + __get_user(kmsg->msg_iovlen, &umsg->msg_iovlen) || + __get_user(tmp3, &umsg->msg_control) || + __get_user(kmsg->msg_controllen, &umsg->msg_controllen) || + __get_user(kmsg->msg_flags, &umsg->msg_flags)) + return -EFAULT; + kmsg->msg_name = compat_ptr(tmp1); + kmsg->msg_iov = compat_ptr(tmp2); + kmsg->msg_control = compat_ptr(tmp3); + return 0; +} + +/* I've named the args so it is easy to tell whose space the pointers are in. */ +int verify_compat_iovec(struct msghdr *kern_msg, struct iovec *kern_iov, + struct sockaddr *kern_address, int mode) +{ + int tot_len; + + if (kern_msg->msg_namelen) { + if (mode == VERIFY_READ) { + int err = move_addr_to_kernel(kern_msg->msg_name, + kern_msg->msg_namelen, + kern_address); + if (err < 0) + return err; + } + kern_msg->msg_name = kern_address; + } else + kern_msg->msg_name = NULL; + + tot_len = iov_from_user_compat_to_kern(kern_iov, + (struct compat_iovec __user *)kern_msg->msg_iov, + kern_msg->msg_iovlen); + if (tot_len >= 0) + kern_msg->msg_iov = kern_iov; + + return tot_len; +} + +/* Bleech... */ +#define CMSG_COMPAT_ALIGN(len) ALIGN((len), sizeof(s32)) + +#define CMSG_COMPAT_DATA(cmsg) \ + ((void __user *)((char __user *)(cmsg) + CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)))) +#define CMSG_COMPAT_SPACE(len) \ + (CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + CMSG_COMPAT_ALIGN(len)) +#define CMSG_COMPAT_LEN(len) \ + (CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + (len)) + +#define CMSG_COMPAT_FIRSTHDR(msg) \ + (((msg)->msg_controllen) >= sizeof(struct compat_cmsghdr) ? \ + (struct compat_cmsghdr __user *)((msg)->msg_control) : \ + (struct compat_cmsghdr __user *)NULL) + +#define CMSG_COMPAT_OK(ucmlen, ucmsg, mhdr) \ + ((ucmlen) >= sizeof(struct compat_cmsghdr) && \ + (ucmlen) <= (unsigned long) \ + ((mhdr)->msg_controllen - \ + ((char *)(ucmsg) - (char *)(mhdr)->msg_control))) + +static inline struct compat_cmsghdr __user *cmsg_compat_nxthdr(struct msghdr *msg, + struct compat_cmsghdr __user *cmsg, int cmsg_len) +{ + char __user *ptr = (char __user *)cmsg + CMSG_COMPAT_ALIGN(cmsg_len); + if ((unsigned long)(ptr + 1 - (char __user *)msg->msg_control) > + msg->msg_controllen) + return NULL; + return (struct compat_cmsghdr __user *)ptr; +} + +/* There is a lot of hair here because the alignment rules (and + * thus placement) of cmsg headers and length are different for + * 32-bit apps. 
-DaveM + */ +int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, + unsigned char *stackbuf, int stackbuf_size) +{ + struct compat_cmsghdr __user *ucmsg; + struct cmsghdr *kcmsg, *kcmsg_base; + compat_size_t ucmlen; + __kernel_size_t kcmlen, tmp; + int err = -EFAULT; + + kcmlen = 0; + kcmsg_base = kcmsg = (struct cmsghdr *)stackbuf; + ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg); + while (ucmsg != NULL) { + if (get_user(ucmlen, &ucmsg->cmsg_len)) + return -EFAULT; + + /* Catch bogons. */ + if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg)) + return -EINVAL; + + tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) + + CMSG_ALIGN(sizeof(struct cmsghdr))); + tmp = CMSG_ALIGN(tmp); + kcmlen += tmp; + ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen); + } + if (kcmlen == 0) + return -EINVAL; + + /* The kcmlen holds the 64-bit version of the control length. + * It may not be modified as we do not stick it into the kmsg + * until we have successfully copied over all of the data + * from the user. + */ + if (kcmlen > stackbuf_size) + kcmsg_base = kcmsg = sock_kmalloc(sk, kcmlen, GFP_KERNEL); + if (kcmsg == NULL) + return -ENOBUFS; + + /* Now copy them over neatly. */ + memset(kcmsg, 0, kcmlen); + ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg); + while (ucmsg != NULL) { + if (__get_user(ucmlen, &ucmsg->cmsg_len)) + goto Efault; + if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg)) + goto Einval; + tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) + + CMSG_ALIGN(sizeof(struct cmsghdr))); + if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp)) + goto Einval; + kcmsg->cmsg_len = tmp; + tmp = CMSG_ALIGN(tmp); + if (__get_user(kcmsg->cmsg_level, &ucmsg->cmsg_level) || + __get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type) || + copy_from_user(CMSG_DATA(kcmsg), + CMSG_COMPAT_DATA(ucmsg), + (ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))))) + goto Efault; + + /* Advance. */ + kcmsg = (struct cmsghdr *)((char *)kcmsg + tmp); + ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen); + } + + /* Ok, looks like we made it. Hook it up and return success. */ + kmsg->msg_control = kcmsg_base; + kmsg->msg_controllen = kcmlen; + return 0; + +Einval: + err = -EINVAL; +Efault: + if (kcmsg_base != (struct cmsghdr *)stackbuf) + sock_kfree_s(sk, kcmsg_base, kcmlen); + return err; +} + +int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *data) +{ + struct compat_timeval ctv; + struct compat_timespec cts[3]; + struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) kmsg->msg_control; + struct compat_cmsghdr cmhdr; + int cmlen; + + if (cm == NULL || kmsg->msg_controllen < sizeof(*cm)) { + kmsg->msg_flags |= MSG_CTRUNC; + return 0; /* XXX: return error? check spec. */ + } + + if (level == SOL_SOCKET && type == SCM_TIMESTAMP) { + struct timeval *tv = (struct timeval *)data; + ctv.tv_sec = tv->tv_sec; + ctv.tv_usec = tv->tv_usec; + data = &ctv; + len = sizeof(ctv); + } + if (level == SOL_SOCKET && + (type == SCM_TIMESTAMPNS || type == SCM_TIMESTAMPING)) { + int count = type == SCM_TIMESTAMPNS ? 
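+		/* SCM_TIMESTAMPNS carries one timespec, SCM_TIMESTAMPING carries three */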
1 : 3; + int i; + struct timespec *ts = (struct timespec *)data; + for (i = 0; i < count; i++) { + cts[i].tv_sec = ts[i].tv_sec; + cts[i].tv_nsec = ts[i].tv_nsec; + } + data = &cts; + len = sizeof(cts[0]) * count; + } + + cmlen = CMSG_COMPAT_LEN(len); + if (kmsg->msg_controllen < cmlen) { + kmsg->msg_flags |= MSG_CTRUNC; + cmlen = kmsg->msg_controllen; + } + cmhdr.cmsg_level = level; + cmhdr.cmsg_type = type; + cmhdr.cmsg_len = cmlen; + + if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) + return -EFAULT; + if (copy_to_user(CMSG_COMPAT_DATA(cm), data, cmlen - sizeof(struct compat_cmsghdr))) + return -EFAULT; + cmlen = CMSG_COMPAT_SPACE(len); + if (kmsg->msg_controllen < cmlen) + cmlen = kmsg->msg_controllen; + kmsg->msg_control += cmlen; + kmsg->msg_controllen -= cmlen; + return 0; +} + +void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm) +{ + struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) kmsg->msg_control; + int fdmax = (kmsg->msg_controllen - sizeof(struct compat_cmsghdr)) / sizeof(int); + int fdnum = scm->fp->count; + struct file **fp = scm->fp->fp; + int __user *cmfptr; + int err = 0, i; + + if (fdnum < fdmax) + fdmax = fdnum; + + for (i = 0, cmfptr = (int __user *) CMSG_COMPAT_DATA(cm); i < fdmax; i++, cmfptr++) { + int new_fd; + err = security_file_receive(fp[i]); + if (err) + break; + err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & kmsg->msg_flags + ? O_CLOEXEC : 0); + if (err < 0) + break; + new_fd = err; + err = put_user(new_fd, cmfptr); + if (err) { + put_unused_fd(new_fd); + break; + } + /* Bump the usage count and install the file. */ + get_file(fp[i]); + fd_install(new_fd, fp[i]); + } + + if (i > 0) { + int cmlen = CMSG_COMPAT_LEN(i * sizeof(int)); + err = put_user(SOL_SOCKET, &cm->cmsg_level); + if (!err) + err = put_user(SCM_RIGHTS, &cm->cmsg_type); + if (!err) + err = put_user(cmlen, &cm->cmsg_len); + if (!err) { + cmlen = CMSG_COMPAT_SPACE(i * sizeof(int)); + kmsg->msg_control += cmlen; + kmsg->msg_controllen -= cmlen; + } + } + if (i < fdnum) + kmsg->msg_flags |= MSG_CTRUNC; + + /* + * All of the files that fit in the message have had their + * usage counts incremented, so we just free the list. + */ + __scm_destroy(scm); +} + +/* + * A struct sock_filter is architecture independent. 
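+ * Only the sock_fprog wrapper differs: in compat space its filter
+ * pointer is a 32-bit compat_uptr_t, so do_set_attach_filter() below
+ * repacks it into a native struct sock_fprog before handing it to
+ * sock_setsockopt().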
+ */ +struct compat_sock_fprog { + u16 len; + compat_uptr_t filter; /* struct sock_filter * */ +}; + +static int do_set_attach_filter(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct compat_sock_fprog __user *fprog32 = (struct compat_sock_fprog __user *)optval; + struct sock_fprog __user *kfprog = compat_alloc_user_space(sizeof(struct sock_fprog)); + compat_uptr_t ptr; + u16 len; + + if (!access_ok(VERIFY_READ, fprog32, sizeof(*fprog32)) || + !access_ok(VERIFY_WRITE, kfprog, sizeof(struct sock_fprog)) || + __get_user(len, &fprog32->len) || + __get_user(ptr, &fprog32->filter) || + __put_user(len, &kfprog->len) || + __put_user(compat_ptr(ptr), &kfprog->filter)) + return -EFAULT; + + return sock_setsockopt(sock, level, optname, (char __user *)kfprog, + sizeof(struct sock_fprog)); +} + +static int do_set_sock_timeout(struct socket *sock, int level, + int optname, char __user *optval, unsigned int optlen) +{ + struct compat_timeval __user *up = (struct compat_timeval __user *)optval; + struct timeval ktime; + mm_segment_t old_fs; + int err; + + if (optlen < sizeof(*up)) + return -EINVAL; + if (!access_ok(VERIFY_READ, up, sizeof(*up)) || + __get_user(ktime.tv_sec, &up->tv_sec) || + __get_user(ktime.tv_usec, &up->tv_usec)) + return -EFAULT; + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = sock_setsockopt(sock, level, optname, (char *)&ktime, sizeof(ktime)); + set_fs(old_fs); + + return err; +} + +static int compat_sock_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + if (optname == SO_ATTACH_FILTER) + return do_set_attach_filter(sock, level, optname, + optval, optlen); + if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO) + return do_set_sock_timeout(sock, level, optname, optval, optlen); + + return sock_setsockopt(sock, level, optname, optval, optlen); +} + +asmlinkage long compat_sys_setsockopt(int fd, int level, int optname, + char __user *optval, unsigned int optlen) +{ + int err; + struct socket *sock = sockfd_lookup(fd, &err); + + if (sock) { + err = security_socket_setsockopt(sock, level, optname); + if (err) { + sockfd_put(sock); + return err; + } + + if (level == SOL_SOCKET) + err = compat_sock_setsockopt(sock, level, + optname, optval, optlen); + else if (sock->ops->compat_setsockopt) + err = sock->ops->compat_setsockopt(sock, level, + optname, optval, optlen); + else + err = sock->ops->setsockopt(sock, level, + optname, optval, optlen); + sockfd_put(sock); + } + return err; +} + +static int do_get_sock_timeout(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct compat_timeval __user *up; + struct timeval ktime; + mm_segment_t old_fs; + int len, err; + + up = (struct compat_timeval __user *) optval; + if (get_user(len, optlen)) + return -EFAULT; + if (len < sizeof(*up)) + return -EINVAL; + len = sizeof(ktime); + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = sock_getsockopt(sock, level, optname, (char *) &ktime, &len); + set_fs(old_fs); + + if (!err) { + if (put_user(sizeof(*up), optlen) || + !access_ok(VERIFY_WRITE, up, sizeof(*up)) || + __put_user(ktime.tv_sec, &up->tv_sec) || + __put_user(ktime.tv_usec, &up->tv_usec)) + err = -EFAULT; + } + return err; +} + +static int compat_sock_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO) + return do_get_sock_timeout(sock, level, optname, optval, optlen); + return 
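+		/* other options need no conversion; hand them to the native handler */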
sock_getsockopt(sock, level, optname, optval, optlen); +} + +int compat_sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) +{ + struct compat_timeval __user *ctv = + (struct compat_timeval __user *) userstamp; + int err = -ENOENT; + struct timeval tv; + + if (!sock_flag(sk, SOCK_TIMESTAMP)) + sock_enable_timestamp(sk, SOCK_TIMESTAMP); + tv = ktime_to_timeval(sk->sk_stamp); + if (tv.tv_sec == -1) + return err; + if (tv.tv_sec == 0) { + sk->sk_stamp = ktime_get_real(); + tv = ktime_to_timeval(sk->sk_stamp); + } + err = 0; + if (put_user(tv.tv_sec, &ctv->tv_sec) || + put_user(tv.tv_usec, &ctv->tv_usec)) + err = -EFAULT; + return err; +} +EXPORT_SYMBOL(compat_sock_get_timestamp); + +int compat_sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) +{ + struct compat_timespec __user *ctv = + (struct compat_timespec __user *) userstamp; + int err = -ENOENT; + struct timespec ts; + + if (!sock_flag(sk, SOCK_TIMESTAMP)) + sock_enable_timestamp(sk, SOCK_TIMESTAMP); + ts = ktime_to_timespec(sk->sk_stamp); + if (ts.tv_sec == -1) + return err; + if (ts.tv_sec == 0) { + sk->sk_stamp = ktime_get_real(); + ts = ktime_to_timespec(sk->sk_stamp); + } + err = 0; + if (put_user(ts.tv_sec, &ctv->tv_sec) || + put_user(ts.tv_nsec, &ctv->tv_nsec)) + err = -EFAULT; + return err; +} +EXPORT_SYMBOL(compat_sock_get_timestampns); + +asmlinkage long compat_sys_getsockopt(int fd, int level, int optname, + char __user *optval, int __user *optlen) +{ + int err; + struct socket *sock = sockfd_lookup(fd, &err); + + if (sock) { + err = security_socket_getsockopt(sock, level, optname); + if (err) { + sockfd_put(sock); + return err; + } + + if (level == SOL_SOCKET) + err = compat_sock_getsockopt(sock, level, + optname, optval, optlen); + else if (sock->ops->compat_getsockopt) + err = sock->ops->compat_getsockopt(sock, level, + optname, optval, optlen); + else + err = sock->ops->getsockopt(sock, level, + optname, optval, optlen); + sockfd_put(sock); + } + return err; +} + +struct compat_group_req { + __u32 gr_interface; + struct __kernel_sockaddr_storage gr_group + __attribute__ ((aligned(4))); +} __packed; + +struct compat_group_source_req { + __u32 gsr_interface; + struct __kernel_sockaddr_storage gsr_group + __attribute__ ((aligned(4))); + struct __kernel_sockaddr_storage gsr_source + __attribute__ ((aligned(4))); +} __packed; + +struct compat_group_filter { + __u32 gf_interface; + struct __kernel_sockaddr_storage gf_group + __attribute__ ((aligned(4))); + __u32 gf_fmode; + __u32 gf_numsrc; + struct __kernel_sockaddr_storage gf_slist[1] + __attribute__ ((aligned(4))); +} __packed; + +#define __COMPAT_GF0_SIZE (sizeof(struct compat_group_filter) - \ + sizeof(struct __kernel_sockaddr_storage)) + + +int compat_mc_setsockopt(struct sock *sock, int level, int optname, + char __user *optval, unsigned int optlen, + int (*setsockopt)(struct sock *, int, int, char __user *, unsigned int)) +{ + char __user *koptval = optval; + int koptlen = optlen; + + switch (optname) { + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + { + struct compat_group_req __user *gr32 = (void *)optval; + struct group_req __user *kgr = + compat_alloc_user_space(sizeof(struct group_req)); + u32 interface; + + if (!access_ok(VERIFY_READ, gr32, sizeof(*gr32)) || + !access_ok(VERIFY_WRITE, kgr, sizeof(struct group_req)) || + __get_user(interface, &gr32->gr_interface) || + __put_user(interface, &kgr->gr_interface) || + copy_in_user(&kgr->gr_group, &gr32->gr_group, + sizeof(kgr->gr_group))) + return -EFAULT; + koptval = (char 
__user *)kgr; + koptlen = sizeof(struct group_req); + break; + } + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + { + struct compat_group_source_req __user *gsr32 = (void *)optval; + struct group_source_req __user *kgsr = compat_alloc_user_space( + sizeof(struct group_source_req)); + u32 interface; + + if (!access_ok(VERIFY_READ, gsr32, sizeof(*gsr32)) || + !access_ok(VERIFY_WRITE, kgsr, + sizeof(struct group_source_req)) || + __get_user(interface, &gsr32->gsr_interface) || + __put_user(interface, &kgsr->gsr_interface) || + copy_in_user(&kgsr->gsr_group, &gsr32->gsr_group, + sizeof(kgsr->gsr_group)) || + copy_in_user(&kgsr->gsr_source, &gsr32->gsr_source, + sizeof(kgsr->gsr_source))) + return -EFAULT; + koptval = (char __user *)kgsr; + koptlen = sizeof(struct group_source_req); + break; + } + case MCAST_MSFILTER: + { + struct compat_group_filter __user *gf32 = (void *)optval; + struct group_filter __user *kgf; + u32 interface, fmode, numsrc; + + if (!access_ok(VERIFY_READ, gf32, __COMPAT_GF0_SIZE) || + __get_user(interface, &gf32->gf_interface) || + __get_user(fmode, &gf32->gf_fmode) || + __get_user(numsrc, &gf32->gf_numsrc)) + return -EFAULT; + koptlen = optlen + sizeof(struct group_filter) - + sizeof(struct compat_group_filter); + if (koptlen < GROUP_FILTER_SIZE(numsrc)) + return -EINVAL; + kgf = compat_alloc_user_space(koptlen); + if (!access_ok(VERIFY_WRITE, kgf, koptlen) || + __put_user(interface, &kgf->gf_interface) || + __put_user(fmode, &kgf->gf_fmode) || + __put_user(numsrc, &kgf->gf_numsrc) || + copy_in_user(&kgf->gf_group, &gf32->gf_group, + sizeof(kgf->gf_group)) || + (numsrc && copy_in_user(kgf->gf_slist, gf32->gf_slist, + numsrc * sizeof(kgf->gf_slist[0])))) + return -EFAULT; + koptval = (char __user *)kgf; + break; + } + + default: + break; + } + return setsockopt(sock, level, optname, koptval, koptlen); +} +EXPORT_SYMBOL(compat_mc_setsockopt); + +int compat_mc_getsockopt(struct sock *sock, int level, int optname, + char __user *optval, int __user *optlen, + int (*getsockopt)(struct sock *, int, int, char __user *, int __user *)) +{ + struct compat_group_filter __user *gf32 = (void *)optval; + struct group_filter __user *kgf; + int __user *koptlen; + u32 interface, fmode, numsrc; + int klen, ulen, err; + + if (optname != MCAST_MSFILTER) + return getsockopt(sock, level, optname, optval, optlen); + + koptlen = compat_alloc_user_space(sizeof(*koptlen)); + if (!access_ok(VERIFY_READ, optlen, sizeof(*optlen)) || + __get_user(ulen, optlen)) + return -EFAULT; + + /* adjust len for pad */ + klen = ulen + sizeof(*kgf) - sizeof(*gf32); + + if (klen < GROUP_FILTER_SIZE(0)) + return -EINVAL; + + if (!access_ok(VERIFY_WRITE, koptlen, sizeof(*koptlen)) || + __put_user(klen, koptlen)) + return -EFAULT; + + /* have to allow space for previous compat_alloc_user_space, too */ + kgf = compat_alloc_user_space(klen+sizeof(*optlen)); + + if (!access_ok(VERIFY_READ, gf32, __COMPAT_GF0_SIZE) || + __get_user(interface, &gf32->gf_interface) || + __get_user(fmode, &gf32->gf_fmode) || + __get_user(numsrc, &gf32->gf_numsrc) || + __put_user(interface, &kgf->gf_interface) || + __put_user(fmode, &kgf->gf_fmode) || + __put_user(numsrc, &kgf->gf_numsrc) || + copy_in_user(&kgf->gf_group, &gf32->gf_group, sizeof(kgf->gf_group))) + return -EFAULT; + + err = getsockopt(sock, level, optname, (char __user *)kgf, koptlen); + if (err) + return err; + + if (!access_ok(VERIFY_READ, koptlen, sizeof(*koptlen)) || + __get_user(klen, koptlen)) + 
return -EFAULT; + + ulen = klen - (sizeof(*kgf)-sizeof(*gf32)); + + if (!access_ok(VERIFY_WRITE, optlen, sizeof(*optlen)) || + __put_user(ulen, optlen)) + return -EFAULT; + + if (!access_ok(VERIFY_READ, kgf, klen) || + !access_ok(VERIFY_WRITE, gf32, ulen) || + __get_user(interface, &kgf->gf_interface) || + __get_user(fmode, &kgf->gf_fmode) || + __get_user(numsrc, &kgf->gf_numsrc) || + __put_user(interface, &gf32->gf_interface) || + __put_user(fmode, &gf32->gf_fmode) || + __put_user(numsrc, &gf32->gf_numsrc)) + return -EFAULT; + if (numsrc) { + int copylen; + + klen -= GROUP_FILTER_SIZE(0); + copylen = numsrc * sizeof(gf32->gf_slist[0]); + if (copylen > klen) + copylen = klen; + if (copy_in_user(gf32->gf_slist, kgf->gf_slist, copylen)) + return -EFAULT; + } + return err; +} +EXPORT_SYMBOL(compat_mc_getsockopt); + + +/* Argument list sizes for compat_sys_socketcall */ +#define AL(x) ((x) * sizeof(u32)) +static unsigned char nas[21] = { + AL(0), AL(3), AL(3), AL(3), AL(2), AL(3), + AL(3), AL(3), AL(4), AL(4), AL(4), AL(6), + AL(6), AL(2), AL(5), AL(5), AL(3), AL(3), + AL(4), AL(5), AL(4) +}; +#undef AL + +asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned flags) +{ + return sys_sendmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); +} + +asmlinkage long compat_sys_sendmmsg(int fd, struct compat_mmsghdr __user *mmsg, + unsigned vlen, unsigned int flags) +{ + return __sys_sendmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, + flags | MSG_CMSG_COMPAT); +} + +asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, unsigned int flags) +{ + return sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); +} + +asmlinkage long compat_sys_recv(int fd, void __user *buf, size_t len, unsigned flags) +{ + return sys_recv(fd, buf, len, flags | MSG_CMSG_COMPAT); +} + +asmlinkage long compat_sys_recvfrom(int fd, void __user *buf, size_t len, + unsigned flags, struct sockaddr __user *addr, + int __user *addrlen) +{ + return sys_recvfrom(fd, buf, len, flags | MSG_CMSG_COMPAT, addr, addrlen); +} + +asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, + unsigned vlen, unsigned int flags, + struct compat_timespec __user *timeout) +{ + int datagrams; + struct timespec ktspec; + + if (timeout == NULL) + return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, + flags | MSG_CMSG_COMPAT, NULL); + + if (get_compat_timespec(&ktspec, timeout)) + return -EFAULT; + + datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, + flags | MSG_CMSG_COMPAT, &ktspec); + if (datagrams > 0 && put_compat_timespec(&ktspec, timeout)) + datagrams = -EFAULT; + + return datagrams; +} + +asmlinkage long compat_sys_socketcall(int call, u32 __user *args) +{ + int ret; + u32 a[6]; + u32 a0, a1; + + if (call < SYS_SOCKET || call > SYS_SENDMMSG) + return -EINVAL; + if (copy_from_user(a, args, nas[call])) + return -EFAULT; + a0 = a[0]; + a1 = a[1]; + + switch (call) { + case SYS_SOCKET: + ret = sys_socket(a0, a1, a[2]); + break; + case SYS_BIND: + ret = sys_bind(a0, compat_ptr(a1), a[2]); + break; + case SYS_CONNECT: + ret = sys_connect(a0, compat_ptr(a1), a[2]); + break; + case SYS_LISTEN: + ret = sys_listen(a0, a1); + break; + case SYS_ACCEPT: + ret = sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), 0); + break; + case SYS_GETSOCKNAME: + ret = sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2])); + break; + case SYS_GETPEERNAME: + ret = sys_getpeername(a0, compat_ptr(a1), compat_ptr(a[2])); + break; + case 
SYS_SOCKETPAIR: + ret = sys_socketpair(a0, a1, a[2], compat_ptr(a[3])); + break; + case SYS_SEND: + ret = sys_send(a0, compat_ptr(a1), a[2], a[3]); + break; + case SYS_SENDTO: + ret = sys_sendto(a0, compat_ptr(a1), a[2], a[3], compat_ptr(a[4]), a[5]); + break; + case SYS_RECV: + ret = compat_sys_recv(a0, compat_ptr(a1), a[2], a[3]); + break; + case SYS_RECVFROM: + ret = compat_sys_recvfrom(a0, compat_ptr(a1), a[2], a[3], + compat_ptr(a[4]), compat_ptr(a[5])); + break; + case SYS_SHUTDOWN: + ret = sys_shutdown(a0, a1); + break; + case SYS_SETSOCKOPT: + ret = compat_sys_setsockopt(a0, a1, a[2], + compat_ptr(a[3]), a[4]); + break; + case SYS_GETSOCKOPT: + ret = compat_sys_getsockopt(a0, a1, a[2], + compat_ptr(a[3]), compat_ptr(a[4])); + break; + case SYS_SENDMSG: + ret = compat_sys_sendmsg(a0, compat_ptr(a1), a[2]); + break; + case SYS_SENDMMSG: + ret = compat_sys_sendmmsg(a0, compat_ptr(a1), a[2], a[3]); + break; + case SYS_RECVMSG: + ret = compat_sys_recvmsg(a0, compat_ptr(a1), a[2]); + break; + case SYS_RECVMMSG: + ret = compat_sys_recvmmsg(a0, compat_ptr(a1), a[2], a[3], + compat_ptr(a[4])); + break; + case SYS_ACCEPT4: + ret = sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), a[3]); + break; + default: + ret = -EINVAL; + break; + } + return ret; +} diff --git a/net/core/Makefile b/net/core/Makefile new file mode 100644 index 00000000..0d357b1c --- /dev/null +++ b/net/core/Makefile @@ -0,0 +1,21 @@ +# +# Makefile for the Linux networking core. +# + +obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \ + gen_stats.o gen_estimator.o net_namespace.o secure_seq.o + +obj-$(CONFIG_SYSCTL) += sysctl_net_core.o + +obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ + neighbour.o rtnetlink.o utils.o link_watch.o filter.o + +obj-$(CONFIG_XFRM) += flow.o +obj-y += net-sysfs.o +obj-$(CONFIG_NET_PKTGEN) += pktgen.o +obj-$(CONFIG_NETPOLL) += netpoll.o +obj-$(CONFIG_NET_DMA) += user_dma.o +obj-$(CONFIG_FIB_RULES) += fib_rules.o +obj-$(CONFIG_TRACEPOINTS) += net-traces.o +obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o +obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o diff --git a/net/core/datagram.c b/net/core/datagram.c new file mode 100644 index 00000000..18ac112e --- /dev/null +++ b/net/core/datagram.c @@ -0,0 +1,775 @@ +/* + * SUCS NET3: + * + * Generic datagram handling routines. These are generic for all + * protocols. Possibly a generic IP version on top of these would + * make sense. Not tonight however 8-). + * This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and + * NetROM layer all have identical poll code and mostly + * identical recvmsg() code. So we share it here. The poll was + * shared before but buried in udp.c so I moved it. + * + * Authors: Alan Cox . (datagram_poll() from old + * udp.c code) + * + * Fixes: + * Alan Cox : NULL return from skb_peek_copy() + * understood + * Alan Cox : Rewrote skb_read_datagram to avoid the + * skb_peek_copy stuff. + * Alan Cox : Added support for SOCK_SEQPACKET. + * IPX can no longer use the SO_TYPE hack + * but AX.25 now works right, and SPX is + * feasible. + * Alan Cox : Fixed write poll of non IP protocol + * crash. + * Florian La Roche: Changed for my new skbuff handling. + * Darryl Miles : Fixed non-blocking SOCK_SEQPACKET. + * Linus Torvalds : BSD semantic fixes. + * Alan Cox : Datagram iovec handling + * Darryl Miles : Fixed non-blocking SOCK_STREAM. + * Alan Cox : POSIXisms + * Pete Wyckoff : Unconnected accept() fix. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +/* + * Is a socket 'connection oriented' ? + */ +static inline int connection_based(struct sock *sk) +{ + return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM; +} + +static int receiver_wake_function(wait_queue_t *wait, unsigned mode, int sync, + void *key) +{ + unsigned long bits = (unsigned long)key; + + /* + * Avoid a wakeup if event not interesting for us + */ + if (bits && !(bits & (POLLIN | POLLERR))) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} +/* + * Wait for a packet.. + */ +static int wait_for_packet(struct sock *sk, int *err, long *timeo_p) +{ + int error; + DEFINE_WAIT_FUNC(wait, receiver_wake_function); + + prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + + /* Socket errors? */ + error = sock_error(sk); + if (error) + goto out_err; + + if (!skb_queue_empty(&sk->sk_receive_queue)) + goto out; + + /* Socket shut down? */ + if (sk->sk_shutdown & RCV_SHUTDOWN) + goto out_noerr; + + /* Sequenced packets can come disconnected. + * If so we report the problem + */ + error = -ENOTCONN; + if (connection_based(sk) && + !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN)) + goto out_err; + + /* handle signals */ + if (signal_pending(current)) + goto interrupted; + + error = 0; + *timeo_p = schedule_timeout(*timeo_p); +out: + finish_wait(sk_sleep(sk), &wait); + return error; +interrupted: + error = sock_intr_errno(*timeo_p); +out_err: + *err = error; + goto out; +out_noerr: + *err = 0; + error = 1; + goto out; +} + +/** + * __skb_recv_datagram - Receive a datagram skbuff + * @sk: socket + * @flags: MSG_ flags + * @peeked: returns non-zero if this packet has been seen before + * @err: error code returned + * + * Get a datagram skbuff, understands the peeking, nonblocking wakeups + * and possible races. This replaces identical code in packet, raw and + * udp, as well as the IPX AX.25 and Appletalk. It also finally fixes + * the long standing peek and read race for datagram sockets. If you + * alter this routine remember it must be re-entrant. + * + * This function will lock the socket if a skb is returned, so the caller + * needs to unlock the socket in that case (usually by calling + * skb_free_datagram) + * + * * It does not lock socket since today. This function is + * * free of race conditions. This measure should/can improve + * * significantly datagram socket latencies at high loads, + * * when data copying to user space takes lots of time. + * * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet + * * 8) Great win.) + * * --ANK (980729) + * + * The order of the tests when we find no data waiting are specified + * quite explicitly by POSIX 1003.1g, don't change them without having + * the standard around please. + */ +struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags, + int *peeked, int *err) +{ + struct sk_buff *skb; + long timeo; + /* + * Caller is allowed not to check sk->sk_err before skb_recv_datagram() + */ + int error = sock_error(sk); + + if (error) + goto no_packet; + + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + do { + /* Again only user level code calls this function, so nothing + * interrupt level will suddenly eat the receive_queue. + * + * Look at current nfs client by the way... 
+ * However, this function was correct in any case. 8) + */ + unsigned long cpu_flags; + + spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); + skb = skb_peek(&sk->sk_receive_queue); + if (skb) { + *peeked = skb->peeked; + if (flags & MSG_PEEK) { + skb->peeked = 1; + atomic_inc(&skb->users); + } else + __skb_unlink(skb, &sk->sk_receive_queue); + } + spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); + + if (skb) + return skb; + + /* User doesn't want to wait */ + error = -EAGAIN; + if (!timeo) + goto no_packet; + + } while (!wait_for_packet(sk, err, &timeo)); + + return NULL; + +no_packet: + *err = error; + return NULL; +} +EXPORT_SYMBOL(__skb_recv_datagram); + +struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, + int noblock, int *err) +{ + int peeked; + + return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), + &peeked, err); +} +EXPORT_SYMBOL(skb_recv_datagram); + +void skb_free_datagram(struct sock *sk, struct sk_buff *skb) +{ + consume_skb(skb); + sk_mem_reclaim_partial(sk); +} +EXPORT_SYMBOL(skb_free_datagram); + +void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb) +{ + bool slow; + + if (likely(atomic_read(&skb->users) == 1)) + smp_rmb(); + else if (likely(!atomic_dec_and_test(&skb->users))) + return; + + slow = lock_sock_fast(sk); + skb_orphan(skb); + sk_mem_reclaim_partial(sk); + unlock_sock_fast(sk, slow); + + /* skb is now orphaned, can be freed outside of locked section */ + trace_kfree_skb(skb, skb_free_datagram_locked); + __kfree_skb(skb); +} +EXPORT_SYMBOL(skb_free_datagram_locked); + +/** + * skb_kill_datagram - Free a datagram skbuff forcibly + * @sk: socket + * @skb: datagram skbuff + * @flags: MSG_ flags + * + * This function frees a datagram skbuff that was received by + * skb_recv_datagram. The flags argument must match the one + * used for skb_recv_datagram. + * + * If the MSG_PEEK flag is set, and the packet is still on the + * receive queue of the socket, it will be taken off the queue + * before it is freed. + * + * This function currently only disables BH when acquiring the + * sk_receive_queue lock. Therefore it must not be used in a + * context where that lock is acquired in an IRQ context. + * + * It returns 0 if the packet was removed by us. + */ + +int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) +{ + int err = 0; + + if (flags & MSG_PEEK) { + err = -ENOENT; + spin_lock_bh(&sk->sk_receive_queue.lock); + if (skb == skb_peek(&sk->sk_receive_queue)) { + __skb_unlink(skb, &sk->sk_receive_queue); + atomic_dec(&skb->users); + err = 0; + } + spin_unlock_bh(&sk->sk_receive_queue.lock); + } + + kfree_skb(skb); + atomic_inc(&sk->sk_drops); + sk_mem_reclaim_partial(sk); + + return err; +} +EXPORT_SYMBOL(skb_kill_datagram); + +/** + * skb_copy_datagram_iovec - Copy a datagram to an iovec. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: io vector to copy to + * @len: amount of data to copy from buffer to iovec + * + * Note: the iovec is modified during the copy. + */ +int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, + struct iovec *to, int len) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + + trace_skb_copy_datagram_iovec(skb, len); + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + if (memcpy_toiovec(to, skb->data + offset, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + } + + /* Copy paged appendix. Hmm... 
why does this look so complicated? */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + int err; + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct page *page = frag->page; + + if (copy > len) + copy = len; + vaddr = kmap(page); + err = memcpy_toiovec(to, vaddr + frag->page_offset + + offset - start, copy); + kunmap(page); + if (err) + goto fault; + if (!(len -= copy)) + return 0; + offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_datagram_iovec(frag_iter, + offset - start, + to, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + } + start = end; + } + if (!len) + return 0; + +fault: + return -EFAULT; +} +EXPORT_SYMBOL(skb_copy_datagram_iovec); + +/** + * skb_copy_datagram_const_iovec - Copy a datagram to an iovec. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: io vector to copy to + * @to_offset: offset in the io vector to start copying to + * @len: amount of data to copy from buffer to iovec + * + * Returns 0 or -EFAULT. + * Note: the iovec is not modified during the copy. + */ +int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset, + const struct iovec *to, int to_offset, + int len) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + to_offset += copy; + } + + /* Copy paged appendix. Hmm... why does this look so complicated? */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + int err; + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct page *page = frag->page; + + if (copy > len) + copy = len; + vaddr = kmap(page); + err = memcpy_toiovecend(to, vaddr + frag->page_offset + + offset - start, to_offset, copy); + kunmap(page); + if (err) + goto fault; + if (!(len -= copy)) + return 0; + offset += copy; + to_offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_datagram_const_iovec(frag_iter, + offset - start, + to, to_offset, + copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + to_offset += copy; + } + start = end; + } + if (!len) + return 0; + +fault: + return -EFAULT; +} +EXPORT_SYMBOL(skb_copy_datagram_const_iovec); + +/** + * skb_copy_datagram_from_iovec - Copy a datagram from an iovec. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying to + * @from: io vector to copy to + * @from_offset: offset in the io vector to start copying from + * @len: amount of data to copy to buffer from iovec + * + * Returns 0 or -EFAULT. + * Note: the iovec is not modified during the copy. 
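+ * The skb must already cover @offset + @len bytes; the data is copied
+ * into the existing head and frag pages, the skb is not expanded.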
+ */ +int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, + const struct iovec *from, int from_offset, + int len) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + if (memcpy_fromiovecend(skb->data + offset, from, from_offset, + copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + from_offset += copy; + } + + /* Copy paged appendix. Hmm... why does this look so complicated? */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + int err; + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct page *page = frag->page; + + if (copy > len) + copy = len; + vaddr = kmap(page); + err = memcpy_fromiovecend(vaddr + frag->page_offset + + offset - start, + from, from_offset, copy); + kunmap(page); + if (err) + goto fault; + + if (!(len -= copy)) + return 0; + offset += copy; + from_offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_datagram_from_iovec(frag_iter, + offset - start, + from, + from_offset, + copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + from_offset += copy; + } + start = end; + } + if (!len) + return 0; + +fault: + return -EFAULT; +} +EXPORT_SYMBOL(skb_copy_datagram_from_iovec); + +static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, + u8 __user *to, int len, + __wsum *csump) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + int pos = 0; + + /* Copy header. 
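+	 * The head is copied with csum_and_copy_to_user(); frag page
+	 * checksums are folded into *csump with csum_block_add() below.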
*/ + if (copy > 0) { + int err = 0; + if (copy > len) + copy = len; + *csump = csum_and_copy_to_user(skb->data + offset, to, copy, + *csump, &err); + if (err) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + pos = copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + __wsum csum2; + int err = 0; + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct page *page = frag->page; + + if (copy > len) + copy = len; + vaddr = kmap(page); + csum2 = csum_and_copy_to_user(vaddr + + frag->page_offset + + offset - start, + to, copy, 0, &err); + kunmap(page); + if (err) + goto fault; + *csump = csum_block_add(*csump, csum2, pos); + if (!(len -= copy)) + return 0; + offset += copy; + to += copy; + pos += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + __wsum csum2 = 0; + if (copy > len) + copy = len; + if (skb_copy_and_csum_datagram(frag_iter, + offset - start, + to, copy, + &csum2)) + goto fault; + *csump = csum_block_add(*csump, csum2, pos); + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + pos += copy; + } + start = end; + } + if (!len) + return 0; + +fault: + return -EFAULT; +} + +__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) +{ + __sum16 sum; + + sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) + netdev_rx_csum_fault(skb->dev); + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete_head); + +__sum16 __skb_checksum_complete(struct sk_buff *skb) +{ + return __skb_checksum_complete_head(skb, skb->len); +} +EXPORT_SYMBOL(__skb_checksum_complete); + +/** + * skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec. + * @skb: skbuff + * @hlen: hardware length + * @iov: io vector + * + * Caller _must_ check that skb will fit to this iovec. + * + * Returns: 0 - success. + * -EINVAL - checksum failure. + * -EFAULT - fault during copy. Beware, in this case iovec + * can be modified! + */ +int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, + int hlen, struct iovec *iov) +{ + __wsum csum; + int chunk = skb->len - hlen; + + if (!chunk) + return 0; + + /* Skip filled elements. + * Pretty silly, look at memcpy_toiovec, though 8) + */ + while (!iov->iov_len) + iov++; + + if (iov->iov_len < chunk) { + if (__skb_checksum_complete(skb)) + goto csum_error; + if (skb_copy_datagram_iovec(skb, hlen, iov, chunk)) + goto fault; + } else { + csum = csum_partial(skb->data, hlen, skb->csum); + if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base, + chunk, &csum)) + goto fault; + if (csum_fold(csum)) + goto csum_error; + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) + netdev_rx_csum_fault(skb->dev); + iov->iov_len -= chunk; + iov->iov_base += chunk; + } + return 0; +csum_error: + return -EINVAL; +fault: + return -EFAULT; +} +EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec); + +/** + * datagram_poll - generic datagram poll + * @file: file struct + * @sock: socket + * @wait: poll table + * + * Datagram poll: Again totally generic. This also handles + * sequenced packet sockets providing the socket receive queue + * is only ever holding data ready to receive. 
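+ * Datagram protocols typically point their proto_ops ->poll straight
+ * at this function.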
+ * + * Note: when you _don't_ use this routine for this protocol, + * and you use a different write policy from sock_writeable() + * then please supply your own write_space callback. + */ +unsigned int datagram_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + unsigned int mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; + + /* exceptional events? */ + if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + mask |= POLLERR; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= POLLRDHUP | POLLIN | POLLRDNORM; + if (sk->sk_shutdown == SHUTDOWN_MASK) + mask |= POLLHUP; + + /* readable? */ + if (!skb_queue_empty(&sk->sk_receive_queue)) + mask |= POLLIN | POLLRDNORM; + + /* Connection-based need to check for termination and startup */ + if (connection_based(sk)) { + if (sk->sk_state == TCP_CLOSE) + mask |= POLLHUP; + /* connection hasn't started yet? */ + if (sk->sk_state == TCP_SYN_SENT) + return mask; + } + + /* writable? */ + if (sock_writeable(sk)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + else + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + return mask; +} +EXPORT_SYMBOL(datagram_poll); diff --git a/net/core/dev.c b/net/core/dev.c new file mode 100644 index 00000000..a71eafc3 --- /dev/null +++ b/net/core/dev.c @@ -0,0 +1,6522 @@ +/* + * NET3 Protocol independent device support routines. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Derived from the non IP parts of dev.c 1.0.19 + * Authors: Ross Biro + * Fred N. van Kempen, + * Mark Evans, + * + * Additional Authors: + * Florian la Roche + * Alan Cox + * David Hinds + * Alexey Kuznetsov + * Adam Sulmicki + * Pekka Riikonen + * + * Changes: + * D.J. Barrow : Fixed bug where dev->refcnt gets set + * to 2 if register_netdev gets called + * before net_dev_init & also removed a + * few lines of code in the process. + * Alan Cox : device private ioctl copies fields back. + * Alan Cox : Transmit queue code does relevant + * stunts to keep the queue safe. + * Alan Cox : Fixed double lock. + * Alan Cox : Fixed promisc NULL pointer trap + * ???????? : Support the full private ioctl range + * Alan Cox : Moved ioctl permission check into + * drivers + * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI + * Alan Cox : 100 backlog just doesn't cut it when + * you start doing multicast video 8) + * Alan Cox : Rewrote net_bh and list manager. + * Alan Cox : Fix ETH_P_ALL echoback lengths. + * Alan Cox : Took out transmit every packet pass + * Saved a few bytes in the ioctl handler + * Alan Cox : Network driver sets packet type before + * calling netif_rx. Saves a function + * call a packet. + * Alan Cox : Hashed net_bh() + * Richard Kooijman: Timestamp fixes. + * Alan Cox : Wrong field in SIOCGIFDSTADDR + * Alan Cox : Device lock protection. + * Alan Cox : Fixed nasty side effect of device close + * changes. + * Rudi Cilibrasi : Pass the right thing to + * set_mac_address() + * Dave Miller : 32bit quantity for the device lock to + * make it work out on a Sparc. + * Bjorn Ekwall : Added KERNELD hack. + * Alan Cox : Cleaned up the backlog initialise. + * Craig Metz : SIOCGIFCONF fix if space for under + * 1 device. + * Thomas Bogendoerfer : Return ENODEV for dev_open, if there + * is no device open function. 
+ * Andi Kleen : Fix error reporting for SIOCGIFCONF + * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF + * Cyrus Durgin : Cleaned for KMOD + * Adam Sulmicki : Bug Fix : Network Device Unload + * A network device unload needs to purge + * the backlog queue. + * Paul Rusty Russell : SIOCSIFNAME + * Pekka Riikonen : Netdev boot-time settings code + * Andrew Morton : Make unregister_netdevice wait + * indefinitely on dev->refcnt + * J Hadi Salim : - Backlog queue sampling + * - netif_rx() feedback + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "net-sysfs.h" + +/* Instead of increasing this, you should create a hash table. */ +#define MAX_GRO_SKBS 8 + +/* This should be increased if a protocol with a bigger head is added. */ +#define GRO_MAX_HEAD (MAX_HEADER + 128) + +/* + * The list of packet types we will receive (as opposed to discard) + * and the routines to invoke. + * + * Why 16. Because with 16 the only overlap we get on a hash of the + * low nibble of the protocol value is RARP/SNAP/X.25. + * + * NOTE: That is no longer true with the addition of VLAN tags. Not + * sure which should go first, but I bet it won't make much + * difference if we are running VLANs. The good news is that + * this protocol won't be in the list unless compiled in, so + * the average user (w/out VLANs) will not be adversely affected. + * --BLG + * + * 0800 IP + * 8100 802.1Q VLAN + * 0001 802.3 + * 0002 AX.25 + * 0004 802.2 + * 8035 RARP + * 0005 SNAP + * 0805 X.25 + * 0806 ARP + * 8137 IPX + * 0009 Localtalk + * 86DD IPv6 + */ + +#define PTYPE_HASH_SIZE (16) +#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1) + +static DEFINE_SPINLOCK(ptype_lock); +static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; +static struct list_head ptype_all __read_mostly; /* Taps */ + +/* + * The @dev_base_head list is protected by @dev_base_lock and the rtnl + * semaphore. + * + * Pure readers hold dev_base_lock for reading, or rcu_read_lock() + * + * Writers must hold the rtnl semaphore while they loop through the + * dev_base_head list, and hold dev_base_lock for writing when they do the + * actual updates. This allows pure readers to access the list even + * while a writer is preparing to update it. + * + * To put it another way, dev_base_lock is held for writing only to + * protect against pure readers; the rtnl semaphore provides the + * protection against other writers. + * + * See, for example usages, register_netdevice() and + * unregister_netdevice(), which must be called with the rtnl + * semaphore held. 
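+ *
+ * For example, a pure reader that does not need to hold a reference
+ * past the lookup can do:
+ *
+ *	rcu_read_lock();
+ *	dev = dev_get_by_name_rcu(net, name);
+ *	if (dev)
+ *		... use dev, no refcount is taken ...
+ *	rcu_read_unlock();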
+ */ +DEFINE_RWLOCK(dev_base_lock); +EXPORT_SYMBOL(dev_base_lock); + +static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) +{ + unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); + return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; +} + +static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) +{ + return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; +} + +static inline void rps_lock(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + spin_lock(&sd->input_pkt_queue.lock); +#endif +} + +static inline void rps_unlock(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + spin_unlock(&sd->input_pkt_queue.lock); +#endif +} + +/* Device list insertion */ +static int list_netdevice(struct net_device *dev) +{ + struct net *net = dev_net(dev); + + ASSERT_RTNL(); + + write_lock_bh(&dev_base_lock); + list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); + hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); + hlist_add_head_rcu(&dev->index_hlist, + dev_index_hash(net, dev->ifindex)); + write_unlock_bh(&dev_base_lock); + return 0; +} + +/* Device list removal + * caller must respect a RCU grace period before freeing/reusing dev + */ +static void unlist_netdevice(struct net_device *dev) +{ + ASSERT_RTNL(); + + /* Unlink dev from the device chain */ + write_lock_bh(&dev_base_lock); + list_del_rcu(&dev->dev_list); + hlist_del_rcu(&dev->name_hlist); + hlist_del_rcu(&dev->index_hlist); + write_unlock_bh(&dev_base_lock); +} + +/* + * Our notifier list + */ + +static RAW_NOTIFIER_HEAD(netdev_chain); + +/* + * Device drivers call our routines to queue packets here. We empty the + * queue in the local softnet handler. + */ + +DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); +EXPORT_PER_CPU_SYMBOL(softnet_data); + +#ifdef CONFIG_LOCKDEP +/* + * register_netdevice() inits txq->_xmit_lock and sets lockdep class + * according to dev->type + */ +static const unsigned short netdev_lock_type[] = + {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, + ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, + ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, + ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, + ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, + ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, + ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, + ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, + ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, + ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, + ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, + ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, + ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211, + ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, + ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154, + ARPHRD_VOID, ARPHRD_NONE}; + +static const char *const netdev_lock_name[] = + {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", + "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", + "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", + "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", + "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", + "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", + "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", + "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", + "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", + "_xmit_BIF", 
"_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", + "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", + "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", + "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211", + "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", + "_xmit_PHONET_PIPE", "_xmit_IEEE802154", + "_xmit_VOID", "_xmit_NONE"}; + +static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; +static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; + +static inline unsigned short netdev_lock_pos(unsigned short dev_type) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) + if (netdev_lock_type[i] == dev_type) + return i; + /* the last key is used by default */ + return ARRAY_SIZE(netdev_lock_type) - 1; +} + +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ + int i; + + i = netdev_lock_pos(dev_type); + lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], + netdev_lock_name[i]); +} + +static inline void netdev_set_addr_lockdep_class(struct net_device *dev) +{ + int i; + + i = netdev_lock_pos(dev->type); + lockdep_set_class_and_name(&dev->addr_list_lock, + &netdev_addr_lock_key[i], + netdev_lock_name[i]); +} +#else +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ +} +static inline void netdev_set_addr_lockdep_class(struct net_device *dev) +{ +} +#endif + +/******************************************************************************* + + Protocol management and registration routines + +*******************************************************************************/ + +/* + * Add a protocol ID to the list. Now that the input handler is + * smarter we can dispense with all the messy stuff that used to be + * here. + * + * BEWARE!!! Protocol handlers, mangling input packets, + * MUST BE last in hash buckets and checking protocol handlers + * MUST start from promiscuous ptype_all chain in net_bh. + * It is true now, do not change it. + * Explanation follows: if protocol handler, mangling packet, will + * be the first on list, it is not able to sense, that packet + * is cloned and should be copied-on-write, so that it will + * change it and subsequent readers will get broken packet. + * --ANK (980803) + */ + +static inline struct list_head *ptype_head(const struct packet_type *pt) +{ + if (pt->type == htons(ETH_P_ALL)) + return &ptype_all; + else + return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; +} + +/** + * dev_add_pack - add packet handler + * @pt: packet type declaration + * + * Add a protocol handler to the networking stack. The passed &packet_type + * is linked into kernel lists and may not be freed until it has been + * removed from the kernel lists. + * + * This call does not sleep therefore it can not + * guarantee all CPU's that are in middle of receiving packets + * will see the new packet type (until the next received packet). + */ + +void dev_add_pack(struct packet_type *pt) +{ + struct list_head *head = ptype_head(pt); + + spin_lock(&ptype_lock); + list_add_rcu(&pt->list, head); + spin_unlock(&ptype_lock); +} +EXPORT_SYMBOL(dev_add_pack); + +/** + * __dev_remove_pack - remove packet handler + * @pt: packet type declaration + * + * Remove a protocol handler that was previously added to the kernel + * protocol handlers by dev_add_pack(). The passed &packet_type is removed + * from the kernel lists and can be freed or reused once this function + * returns. 
+ * + * The packet type might still be in use by receivers + * and must not be freed until after all the CPU's have gone + * through a quiescent state. + */ +void __dev_remove_pack(struct packet_type *pt) +{ + struct list_head *head = ptype_head(pt); + struct packet_type *pt1; + + spin_lock(&ptype_lock); + + list_for_each_entry(pt1, head, list) { + if (pt == pt1) { + list_del_rcu(&pt->list); + goto out; + } + } + + printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); +out: + spin_unlock(&ptype_lock); +} +EXPORT_SYMBOL(__dev_remove_pack); + +/** + * dev_remove_pack - remove packet handler + * @pt: packet type declaration + * + * Remove a protocol handler that was previously added to the kernel + * protocol handlers by dev_add_pack(). The passed &packet_type is removed + * from the kernel lists and can be freed or reused once this function + * returns. + * + * This call sleeps to guarantee that no CPU is looking at the packet + * type after return. + */ +void dev_remove_pack(struct packet_type *pt) +{ + __dev_remove_pack(pt); + + synchronize_net(); +} +EXPORT_SYMBOL(dev_remove_pack); + +/****************************************************************************** + + Device Boot-time Settings Routines + +*******************************************************************************/ + +/* Boot time configuration table */ +static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; + +/** + * netdev_boot_setup_add - add new setup entry + * @name: name of the device + * @map: configured settings for the device + * + * Adds new setup entry to the dev_boot_setup list. The function + * returns 0 on error and 1 on success. This is a generic routine to + * all netdevices. + */ +static int netdev_boot_setup_add(char *name, struct ifmap *map) +{ + struct netdev_boot_setup *s; + int i; + + s = dev_boot_setup; + for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { + if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { + memset(s[i].name, 0, sizeof(s[i].name)); + strlcpy(s[i].name, name, IFNAMSIZ); + memcpy(&s[i].map, map, sizeof(s[i].map)); + break; + } + } + + return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1; +} + +/** + * netdev_boot_setup_check - check boot time settings + * @dev: the netdevice + * + * Check boot time settings for the device. + * The found settings are set for the device to be used + * later in the device probing. + * Returns 0 if no settings found, 1 if they are. + */ +int netdev_boot_setup_check(struct net_device *dev) +{ + struct netdev_boot_setup *s = dev_boot_setup; + int i; + + for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { + if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && + !strcmp(dev->name, s[i].name)) { + dev->irq = s[i].map.irq; + dev->base_addr = s[i].map.base_addr; + dev->mem_start = s[i].map.mem_start; + dev->mem_end = s[i].map.mem_end; + return 1; + } + } + return 0; +} +EXPORT_SYMBOL(netdev_boot_setup_check); + + +/** + * netdev_boot_base - get address from boot time settings + * @prefix: prefix for network device + * @unit: id for network device + * + * Check boot time settings for the base address of device. + * The found settings are set for the device to be used + * later in the device probing. + * Returns 0 if no settings found. 
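/*
 * Editor's note: a minimal illustrative sketch, not part of this patch.
 * It shows the intended consumer of the boot-time table above: an old-style
 * probe that picks up a "netdev=<irq>,<base>,<mem_start>,<mem_end>,<name>"
 * command-line entry (parsed by netdev_boot_setup(), below) through
 * netdev_boot_setup_check().  The driver, its defaults and the probe logic
 * are hypothetical.
 */
#include <linux/netdevice.h>

static int example_legacy_probe(struct net_device *dev)
{
	/*
	 * Returns 1 and fills dev->irq, dev->base_addr, dev->mem_start and
	 * dev->mem_end if a matching boot entry was recorded; returns 0
	 * otherwise, in which case the driver falls back to its defaults.
	 */
	if (!netdev_boot_setup_check(dev)) {
		dev->base_addr = 0x300;		/* hypothetical default */
		dev->irq = 10;
	}

	/* ... hardware detection at dev->base_addr would follow here ... */
	return 0;
}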
+ */ +unsigned long netdev_boot_base(const char *prefix, int unit) +{ + const struct netdev_boot_setup *s = dev_boot_setup; + char name[IFNAMSIZ]; + int i; + + sprintf(name, "%s%d", prefix, unit); + + /* + * If device already registered then return base of 1 + * to indicate not to probe for this interface + */ + if (__dev_get_by_name(&init_net, name)) + return 1; + + for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) + if (!strcmp(name, s[i].name)) + return s[i].map.base_addr; + return 0; +} + +/* + * Saves at boot time configured settings for any netdevice. + */ +int __init netdev_boot_setup(char *str) +{ + int ints[5]; + struct ifmap map; + + str = get_options(str, ARRAY_SIZE(ints), ints); + if (!str || !*str) + return 0; + + /* Save settings */ + memset(&map, 0, sizeof(map)); + if (ints[0] > 0) + map.irq = ints[1]; + if (ints[0] > 1) + map.base_addr = ints[2]; + if (ints[0] > 2) + map.mem_start = ints[3]; + if (ints[0] > 3) + map.mem_end = ints[4]; + + /* Add new entry to the list */ + return netdev_boot_setup_add(str, &map); +} + +__setup("netdev=", netdev_boot_setup); + +/******************************************************************************* + + Device Interface Subroutines + +*******************************************************************************/ + +/** + * __dev_get_by_name - find a device by its name + * @net: the applicable net namespace + * @name: name to find + * + * Find an interface by name. Must be called under RTNL semaphore + * or @dev_base_lock. If the name is found a pointer to the device + * is returned. If the name is not found then %NULL is returned. The + * reference counters are not incremented so the caller must be + * careful with locks. + */ + +struct net_device *__dev_get_by_name(struct net *net, const char *name) +{ + struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_name_hash(net, name); + + hlist_for_each_entry(dev, p, head, name_hlist) + if (!strncmp(dev->name, name, IFNAMSIZ)) + return dev; + + return NULL; +} +EXPORT_SYMBOL(__dev_get_by_name); + +/** + * dev_get_by_name_rcu - find a device by its name + * @net: the applicable net namespace + * @name: name to find + * + * Find an interface by name. + * If the name is found a pointer to the device is returned. + * If the name is not found then %NULL is returned. + * The reference counters are not incremented so the caller must be + * careful with locks. The caller must hold RCU lock. + */ + +struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) +{ + struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_name_hash(net, name); + + hlist_for_each_entry_rcu(dev, p, head, name_hlist) + if (!strncmp(dev->name, name, IFNAMSIZ)) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_get_by_name_rcu); + +/** + * dev_get_by_name - find a device by its name + * @net: the applicable net namespace + * @name: name to find + * + * Find an interface by name. This can be called from any + * context and does its own locking. The returned handle has + * the usage count incremented and the caller must use dev_put() to + * release it when it is no longer needed. %NULL is returned if no + * matching device is found. 
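/*
 * Editor's note: a minimal illustrative sketch, not part of this patch.
 * It shows the lockless lookup pattern the RCU-protected name hash above is
 * designed for: the returned pointer is only valid inside the
 * rcu_read_lock() section unless a reference is taken.  "eth0" is just an
 * example name.
 */
#include <linux/netdevice.h>

static bool example_is_up(struct net *net)
{
	struct net_device *dev;
	bool up = false;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, "eth0");
	if (dev)
		up = !!(dev->flags & IFF_UP);	/* no dev_hold(): dev not used after unlock */
	rcu_read_unlock();

	return up;
}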
+ */ + +struct net_device *dev_get_by_name(struct net *net, const char *name) +{ + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, name); + if (dev) + dev_hold(dev); + rcu_read_unlock(); + return dev; +} +EXPORT_SYMBOL(dev_get_by_name); + +/** + * __dev_get_by_index - find a device by its ifindex + * @net: the applicable net namespace + * @ifindex: index of device + * + * Search for an interface by index. Returns %NULL if the device + * is not found or a pointer to the device. The device has not + * had its reference counter increased so the caller must be careful + * about locking. The caller must hold either the RTNL semaphore + * or @dev_base_lock. + */ + +struct net_device *__dev_get_by_index(struct net *net, int ifindex) +{ + struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_index_hash(net, ifindex); + + hlist_for_each_entry(dev, p, head, index_hlist) + if (dev->ifindex == ifindex) + return dev; + + return NULL; +} +EXPORT_SYMBOL(__dev_get_by_index); + +/** + * dev_get_by_index_rcu - find a device by its ifindex + * @net: the applicable net namespace + * @ifindex: index of device + * + * Search for an interface by index. Returns %NULL if the device + * is not found or a pointer to the device. The device has not + * had its reference counter increased so the caller must be careful + * about locking. The caller must hold RCU lock. + */ + +struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) +{ + struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_index_hash(net, ifindex); + + hlist_for_each_entry_rcu(dev, p, head, index_hlist) + if (dev->ifindex == ifindex) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_get_by_index_rcu); + + +/** + * dev_get_by_index - find a device by its ifindex + * @net: the applicable net namespace + * @ifindex: index of device + * + * Search for an interface by index. Returns NULL if the device + * is not found or a pointer to the device. The device returned has + * had a reference added and the pointer is safe until the user calls + * dev_put to indicate they have finished with it. + */ + +struct net_device *dev_get_by_index(struct net *net, int ifindex) +{ + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); + if (dev) + dev_hold(dev); + rcu_read_unlock(); + return dev; +} +EXPORT_SYMBOL(dev_get_by_index); + +/** + * dev_getbyhwaddr_rcu - find a device by its hardware address + * @net: the applicable net namespace + * @type: media type of device + * @ha: hardware address + * + * Search for an interface by MAC address. Returns NULL if the device + * is not found or a pointer to the device. + * The caller must hold RCU or RTNL. 
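/*
 * Editor's note: a minimal illustrative sketch, not part of this patch.
 * It shows the refcounted ifindex lookup: dev_get_by_index() above takes a
 * reference under rcu_read_lock(), so the caller may use the device outside
 * any lock but must drop the reference with dev_put().  The reporting
 * function itself is hypothetical.
 */
#include <linux/kernel.h>
#include <linux/netdevice.h>

static void example_report_mtu(struct net *net, int ifindex)
{
	struct net_device *dev;

	dev = dev_get_by_index(net, ifindex);	/* NULL if no such device */
	if (!dev)
		return;

	pr_info("%s: ifindex %d, mtu %u\n", dev->name, ifindex, dev->mtu);

	dev_put(dev);				/* balance the dev_hold() */
}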
+ * The returned device has not had its ref count increased + * and the caller must therefore be careful about locking + * + */ + +struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, + const char *ha) +{ + struct net_device *dev; + + for_each_netdev_rcu(net, dev) + if (dev->type == type && + !memcmp(dev->dev_addr, ha, dev->addr_len)) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_getbyhwaddr_rcu); + +struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) +{ + struct net_device *dev; + + ASSERT_RTNL(); + for_each_netdev(net, dev) + if (dev->type == type) + return dev; + + return NULL; +} +EXPORT_SYMBOL(__dev_getfirstbyhwtype); + +struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) +{ + struct net_device *dev, *ret = NULL; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) + if (dev->type == type) { + dev_hold(dev); + ret = dev; + break; + } + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(dev_getfirstbyhwtype); + +/** + * dev_get_by_flags_rcu - find any device with given flags + * @net: the applicable net namespace + * @if_flags: IFF_* values + * @mask: bitmask of bits in if_flags to check + * + * Search for any interface with the given flags. Returns NULL if a device + * is not found or a pointer to the device. Must be called inside + * rcu_read_lock(), and result refcount is unchanged. + */ + +struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags, + unsigned short mask) +{ + struct net_device *dev, *ret; + + ret = NULL; + for_each_netdev_rcu(net, dev) { + if (((dev->flags ^ if_flags) & mask) == 0) { + ret = dev; + break; + } + } + return ret; +} +EXPORT_SYMBOL(dev_get_by_flags_rcu); + +/** + * dev_valid_name - check if name is okay for network device + * @name: name string + * + * Network device names need to be valid file names to + * to allow sysfs to work. We also disallow any kind of + * whitespace. + */ +int dev_valid_name(const char *name) +{ + if (*name == '\0') + return 0; + if (strlen(name) >= IFNAMSIZ) + return 0; + if (!strcmp(name, ".") || !strcmp(name, "..")) + return 0; + + while (*name) { + if (*name == '/' || isspace(*name)) + return 0; + name++; + } + return 1; +} +EXPORT_SYMBOL(dev_valid_name); + +/** + * __dev_alloc_name - allocate a name for a device + * @net: network namespace to allocate the device name in + * @name: name format string + * @buf: scratch buffer and result name string + * + * Passed a format string - eg "lt%d" it will try and find a suitable + * id. It scans list of devices to build up a free map, then chooses + * the first empty slot. The caller must hold the dev_base or rtnl lock + * while allocating the name and adding the device in order to avoid + * duplicates. + * Limited to bits_per_byte * page size devices (ie 32K on most platforms). + * Returns the number of the unit assigned or a negative errno code. + */ + +static int __dev_alloc_name(struct net *net, const char *name, char *buf) +{ + int i = 0; + const char *p; + const int max_netdevices = 8*PAGE_SIZE; + unsigned long *inuse; + struct net_device *d; + + p = strnchr(name, IFNAMSIZ-1, '%'); + if (p) { + /* + * Verify the string as this thing may have come from + * the user. There must be either one "%d" and no other "%" + * characters. 
+ */ + if (p[1] != 'd' || strchr(p + 2, '%')) + return -EINVAL; + + /* Use one page as a bit array of possible slots */ + inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); + if (!inuse) + return -ENOMEM; + + for_each_netdev(net, d) { + if (!sscanf(d->name, name, &i)) + continue; + if (i < 0 || i >= max_netdevices) + continue; + + /* avoid cases where sscanf is not exact inverse of printf */ + snprintf(buf, IFNAMSIZ, name, i); + if (!strncmp(buf, d->name, IFNAMSIZ)) + set_bit(i, inuse); + } + + i = find_first_zero_bit(inuse, max_netdevices); + free_page((unsigned long) inuse); + } + + if (buf != name) + snprintf(buf, IFNAMSIZ, name, i); + if (!__dev_get_by_name(net, buf)) + return i; + + /* It is possible to run out of possible slots + * when the name is long and there isn't enough space left + * for the digits, or if all bits are used. + */ + return -ENFILE; +} + +/** + * dev_alloc_name - allocate a name for a device + * @dev: device + * @name: name format string + * + * Passed a format string - eg "lt%d" it will try and find a suitable + * id. It scans list of devices to build up a free map, then chooses + * the first empty slot. The caller must hold the dev_base or rtnl lock + * while allocating the name and adding the device in order to avoid + * duplicates. + * Limited to bits_per_byte * page size devices (ie 32K on most platforms). + * Returns the number of the unit assigned or a negative errno code. + */ + +int dev_alloc_name(struct net_device *dev, const char *name) +{ + char buf[IFNAMSIZ]; + struct net *net; + int ret; + + BUG_ON(!dev_net(dev)); + net = dev_net(dev); + ret = __dev_alloc_name(net, name, buf); + if (ret >= 0) + strlcpy(dev->name, buf, IFNAMSIZ); + return ret; +} +EXPORT_SYMBOL(dev_alloc_name); + +static int dev_get_valid_name(struct net_device *dev, const char *name) +{ + struct net *net; + + BUG_ON(!dev_net(dev)); + net = dev_net(dev); + + if (!dev_valid_name(name)) + return -EINVAL; + + if (strchr(name, '%')) + return dev_alloc_name(dev, name); + else if (__dev_get_by_name(net, name)) + return -EEXIST; + else if (dev->name != name) + strlcpy(dev->name, name, IFNAMSIZ); + + return 0; +} + +/** + * dev_change_name - change name of a device + * @dev: device + * @newname: name (or format string) must be at least IFNAMSIZ + * + * Change name of a device, can pass format strings "eth%d". + * for wildcarding. 
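/*
 * Editor's note: a minimal illustrative sketch, not part of this patch.
 * It shows how a driver leans on the name-allocation helpers above: the
 * device is created with a format string and dev_alloc_name() turns
 * "example%d" into the first free "example0", "example1", ...  All names
 * and the setup callback are hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>

static void example_setup(struct net_device *dev)
{
	ether_setup(dev);			/* hypothetical Ethernet-like device */
}

static struct net_device *example_create(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_netdev(0, "example%d", example_setup);
	if (!dev)
		return NULL;

	rtnl_lock();
	/* Resolve the "%d" now; register_netdevice() would also do it. */
	err = dev_alloc_name(dev, dev->name);
	if (err >= 0)
		err = register_netdevice(dev);
	rtnl_unlock();

	if (err < 0) {
		free_netdev(dev);
		return NULL;
	}
	return dev;
}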
+ */ +int dev_change_name(struct net_device *dev, const char *newname) +{ + char oldname[IFNAMSIZ]; + int err = 0; + int ret; + struct net *net; + + ASSERT_RTNL(); + BUG_ON(!dev_net(dev)); + + net = dev_net(dev); + if (dev->flags & IFF_UP) + return -EBUSY; + + if (strncmp(newname, dev->name, IFNAMSIZ) == 0) + return 0; + + memcpy(oldname, dev->name, IFNAMSIZ); + + err = dev_get_valid_name(dev, newname); + if (err < 0) + return err; + +rollback: + ret = device_rename(&dev->dev, dev->name); + if (ret) { + memcpy(dev->name, oldname, IFNAMSIZ); + return ret; + } + + write_lock_bh(&dev_base_lock); + hlist_del_rcu(&dev->name_hlist); + write_unlock_bh(&dev_base_lock); + + synchronize_rcu(); + + write_lock_bh(&dev_base_lock); + hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); + write_unlock_bh(&dev_base_lock); + + ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); + ret = notifier_to_errno(ret); + + if (ret) { + /* err >= 0 after dev_alloc_name() or stores the first errno */ + if (err >= 0) { + err = ret; + memcpy(dev->name, oldname, IFNAMSIZ); + goto rollback; + } else { + printk(KERN_ERR + "%s: name change rollback failed: %d.\n", + dev->name, ret); + } + } + + return err; +} + +/** + * dev_set_alias - change ifalias of a device + * @dev: device + * @alias: name up to IFALIASZ + * @len: limit of bytes to copy from info + * + * Set ifalias for a device, + */ +int dev_set_alias(struct net_device *dev, const char *alias, size_t len) +{ + ASSERT_RTNL(); + + if (len >= IFALIASZ) + return -EINVAL; + + if (!len) { + if (dev->ifalias) { + kfree(dev->ifalias); + dev->ifalias = NULL; + } + return 0; + } + + dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); + if (!dev->ifalias) + return -ENOMEM; + + strlcpy(dev->ifalias, alias, len+1); + return len; +} + + +/** + * netdev_features_change - device changes features + * @dev: device to cause notification + * + * Called to indicate a device has changed features. + */ +void netdev_features_change(struct net_device *dev) +{ + call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); +} +EXPORT_SYMBOL(netdev_features_change); + +/** + * netdev_state_change - device changes state + * @dev: device to cause notification + * + * Called to indicate a device has changed state. This function calls + * the notifier chains for netdev_chain and sends a NEWLINK message + * to the routing socket. + */ +void netdev_state_change(struct net_device *dev) +{ + if (dev->flags & IFF_UP) { + call_netdevice_notifiers(NETDEV_CHANGE, dev); + rtmsg_ifinfo(RTM_NEWLINK, dev, 0); + } +} +EXPORT_SYMBOL(netdev_state_change); + +int netdev_bonding_change(struct net_device *dev, unsigned long event) +{ + return call_netdevice_notifiers(event, dev); +} +EXPORT_SYMBOL(netdev_bonding_change); + +/** + * dev_load - load a network module + * @net: the applicable net namespace + * @name: name of interface + * + * If a network interface is not present and the process has suitable + * privileges this function loads the module. If module loading is not + * available in this kernel then it becomes a nop. + */ + +void dev_load(struct net *net, const char *name) +{ + struct net_device *dev; + int no_module; + + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, name); + rcu_read_unlock(); + + no_module = !dev; + if (no_module && capable(CAP_NET_ADMIN)) + no_module = request_module("netdev-%s", name); + if (no_module && capable(CAP_SYS_MODULE)) { + if (!request_module("%s", name)) + pr_err("Loading kernel module for a network device " +"with CAP_SYS_MODULE (deprecated). 
Use CAP_NET_ADMIN and alias netdev-%s " +"instead\n", name); + } +} +EXPORT_SYMBOL(dev_load); + +static int __dev_open(struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int ret; + + ASSERT_RTNL(); + + if (!netif_device_present(dev)) + return -ENODEV; + + ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); + ret = notifier_to_errno(ret); + if (ret) + return ret; + + set_bit(__LINK_STATE_START, &dev->state); + + if (ops->ndo_validate_addr) + ret = ops->ndo_validate_addr(dev); + + if (!ret && ops->ndo_open) + ret = ops->ndo_open(dev); + + if (ret) + clear_bit(__LINK_STATE_START, &dev->state); + else { + dev->flags |= IFF_UP; + net_dmaengine_get(); + dev_set_rx_mode(dev); + dev_activate(dev); + } + + return ret; +} + +/** + * dev_open - prepare an interface for use. + * @dev: device to open + * + * Takes a device from down to up state. The device's private open + * function is invoked and then the multicast lists are loaded. Finally + * the device is moved into the up state and a %NETDEV_UP message is + * sent to the netdev notifier chain. + * + * Calling this function on an active interface is a nop. On a failure + * a negative errno code is returned. + */ +int dev_open(struct net_device *dev) +{ + int ret; + + if (dev->flags & IFF_UP) + return 0; + + ret = __dev_open(dev); + if (ret < 0) + return ret; + + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + call_netdevice_notifiers(NETDEV_UP, dev); + + return ret; +} +EXPORT_SYMBOL(dev_open); + +static int __dev_close_many(struct list_head *head) +{ + struct net_device *dev; + + ASSERT_RTNL(); + might_sleep(); + + list_for_each_entry(dev, head, unreg_list) { + call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); + + clear_bit(__LINK_STATE_START, &dev->state); + + /* Synchronize to scheduled poll. We cannot touch poll list, it + * can be even on different cpu. So just clear netif_running(). + * + * dev->stop() will invoke napi_disable() on all of it's + * napi_struct instances on this device. + */ + smp_mb__after_clear_bit(); /* Commit netif_running(). */ + } + + dev_deactivate_many(head); + + list_for_each_entry(dev, head, unreg_list) { + const struct net_device_ops *ops = dev->netdev_ops; + + /* + * Call the device specific close. This cannot fail. + * Only if device is UP + * + * We allow it to be called even after a DETACH hot-plug + * event. + */ + if (ops->ndo_stop) + ops->ndo_stop(dev); + + dev->flags &= ~IFF_UP; + net_dmaengine_put(); + } + + return 0; +} + +static int __dev_close(struct net_device *dev) +{ + int retval; + LIST_HEAD(single); + + list_add(&dev->unreg_list, &single); + retval = __dev_close_many(&single); + list_del(&single); + return retval; +} + +static int dev_close_many(struct list_head *head) +{ + struct net_device *dev, *tmp; + LIST_HEAD(tmp_list); + + list_for_each_entry_safe(dev, tmp, head, unreg_list) + if (!(dev->flags & IFF_UP)) + list_move(&dev->unreg_list, &tmp_list); + + __dev_close_many(head); + + list_for_each_entry(dev, head, unreg_list) { + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + call_netdevice_notifiers(NETDEV_DOWN, dev); + } + + /* rollback_registered_many needs the complete original list */ + list_splice(&tmp_list, head); + return 0; +} + +/** + * dev_close - shutdown an interface. + * @dev: device to shutdown + * + * This function moves an active device into down state. A + * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device + * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier + * chain. 
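/*
 * Editor's note: a minimal illustrative sketch, not part of this patch.
 * dev_open() above and dev_close() (just below) must run under the RTNL
 * lock; this is the minimal calling pattern for toggling an interface from
 * kernel code.  The wrapper itself is hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static int example_set_link_state(struct net_device *dev, bool up)
{
	int err;

	rtnl_lock();
	if (up)
		err = dev_open(dev);	/* nop if already IFF_UP */
	else
		err = dev_close(dev);	/* nop if already down */
	rtnl_unlock();

	return err;
}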
+ */ +int dev_close(struct net_device *dev) +{ + if (dev->flags & IFF_UP) { + LIST_HEAD(single); + + list_add(&dev->unreg_list, &single); + dev_close_many(&single); + list_del(&single); + } + return 0; +} +EXPORT_SYMBOL(dev_close); + + +/** + * dev_disable_lro - disable Large Receive Offload on a device + * @dev: device + * + * Disable Large Receive Offload (LRO) on a net device. Must be + * called under RTNL. This is needed if received packets may be + * forwarded to another interface. + */ +void dev_disable_lro(struct net_device *dev) +{ + u32 flags; + + /* + * If we're trying to disable lro on a vlan device + * use the underlying physical device instead + */ + if (is_vlan_dev(dev)) + dev = vlan_dev_real_dev(dev); + + if (dev->ethtool_ops && dev->ethtool_ops->get_flags) + flags = dev->ethtool_ops->get_flags(dev); + else + flags = ethtool_op_get_flags(dev); + + if (!(flags & ETH_FLAG_LRO)) + return; + + __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO); + if (unlikely(dev->features & NETIF_F_LRO)) + netdev_WARN(dev, "failed to disable LRO!\n"); +} +EXPORT_SYMBOL(dev_disable_lro); + + +static int dev_boot_phase = 1; + +/** + * register_netdevice_notifier - register a network notifier block + * @nb: notifier + * + * Register a notifier to be called when network device events occur. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. + * + * When registered all registration and up events are replayed + * to the new notifier to allow device to have a race free + * view of the network device list. + */ + +int register_netdevice_notifier(struct notifier_block *nb) +{ + struct net_device *dev; + struct net_device *last; + struct net *net; + int err; + + rtnl_lock(); + err = raw_notifier_chain_register(&netdev_chain, nb); + if (err) + goto unlock; + if (dev_boot_phase) + goto unlock; + for_each_net(net) { + for_each_netdev(net, dev) { + err = nb->notifier_call(nb, NETDEV_REGISTER, dev); + err = notifier_to_errno(err); + if (err) + goto rollback; + + if (!(dev->flags & IFF_UP)) + continue; + + nb->notifier_call(nb, NETDEV_UP, dev); + } + } + +unlock: + rtnl_unlock(); + return err; + +rollback: + last = dev; + for_each_net(net) { + for_each_netdev(net, dev) { + if (dev == last) + break; + + if (dev->flags & IFF_UP) { + nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); + nb->notifier_call(nb, NETDEV_DOWN, dev); + } + nb->notifier_call(nb, NETDEV_UNREGISTER, dev); + nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev); + } + } + + raw_notifier_chain_unregister(&netdev_chain, nb); + goto unlock; +} +EXPORT_SYMBOL(register_netdevice_notifier); + +/** + * unregister_netdevice_notifier - unregister a network notifier block + * @nb: notifier + * + * Unregister a notifier previously registered by + * register_netdevice_notifier(). The notifier is unlinked into the + * kernel structures and may then be reused. A negative errno code + * is returned on a failure. + * + * After unregistering unregister and down device events are synthesized + * for all devices on the device list to the removed notifier to remove + * the need for special case cleanup code. 
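/*
 * Editor's note: a minimal illustrative sketch, not part of this patch.
 * It shows a consumer of register_netdevice_notifier() above and
 * unregister_netdevice_notifier() (just below).  In this kernel the
 * notifier's third argument is the struct net_device pointer itself, as
 * passed by call_netdevice_notifiers().  The event handling is hypothetical.
 */
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/notifier.h>

static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	switch (event) {
	case NETDEV_UP:
		pr_info("example: %s is up\n", dev->name);
		break;
	case NETDEV_GOING_DOWN:
		pr_info("example: %s going down\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
	.notifier_call = example_netdev_event,
};

static int __init example_nb_init(void)
{
	/* Registration replays NETDEV_REGISTER/NETDEV_UP for existing devices. */
	return register_netdevice_notifier(&example_netdev_nb);
}

static void __exit example_nb_exit(void)
{
	unregister_netdevice_notifier(&example_netdev_nb);
}

module_init(example_nb_init);
module_exit(example_nb_exit);
MODULE_LICENSE("GPL");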
+ */ + +int unregister_netdevice_notifier(struct notifier_block *nb) +{ + struct net_device *dev; + struct net *net; + int err; + + rtnl_lock(); + err = raw_notifier_chain_unregister(&netdev_chain, nb); + if (err) + goto unlock; + + for_each_net(net) { + for_each_netdev(net, dev) { + if (dev->flags & IFF_UP) { + nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); + nb->notifier_call(nb, NETDEV_DOWN, dev); + } + nb->notifier_call(nb, NETDEV_UNREGISTER, dev); + nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev); + } + } +unlock: + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(unregister_netdevice_notifier); + +/** + * call_netdevice_notifiers - call all network notifier blocks + * @val: value passed unmodified to notifier function + * @dev: net_device pointer passed unmodified to notifier function + * + * Call all network notifier blocks. Parameters and return value + * are as for raw_notifier_call_chain(). + */ + +int call_netdevice_notifiers(unsigned long val, struct net_device *dev) +{ + ASSERT_RTNL(); + return raw_notifier_call_chain(&netdev_chain, val, dev); +} +EXPORT_SYMBOL(call_netdevice_notifiers); + +/* When > 0 there are consumers of rx skb time stamps */ +static atomic_t netstamp_needed = ATOMIC_INIT(0); + +void net_enable_timestamp(void) +{ + atomic_inc(&netstamp_needed); +} +EXPORT_SYMBOL(net_enable_timestamp); + +void net_disable_timestamp(void) +{ + atomic_dec(&netstamp_needed); +} +EXPORT_SYMBOL(net_disable_timestamp); + +static inline void net_timestamp_set(struct sk_buff *skb) +{ + if (atomic_read(&netstamp_needed)) + __net_timestamp(skb); + else + skb->tstamp.tv64 = 0; +} + +static inline void net_timestamp_check(struct sk_buff *skb) +{ + if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed)) + __net_timestamp(skb); +} + +static inline bool is_skb_forwardable(struct net_device *dev, + struct sk_buff *skb) +{ + unsigned int len; + + if (!(dev->flags & IFF_UP)) + return false; + + len = dev->mtu + dev->hard_header_len + VLAN_HLEN; + if (skb->len <= len) + return true; + + /* if TSO is enabled, we don't care about the length as the packet + * could be forwarded without being segmented before + */ + if (skb_is_gso(skb)) + return true; + + return false; +} + +/** + * dev_forward_skb - loopback an skb to another netif + * + * @dev: destination network device + * @skb: buffer to forward + * + * return values: + * NET_RX_SUCCESS (no congestion) + * NET_RX_DROP (packet was dropped, but freed) + * + * dev_forward_skb can be used for injecting an skb from the + * start_xmit function of one device into the receive queue + * of another device. + * + * The receiving device may be in another namespace, so + * we have to clear all information in the skb that could + * impact namespace isolation. + */ +int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ + skb_orphan(skb); + nf_reset(skb); + + if (unlikely(!is_skb_forwardable(dev, skb))) { + atomic_long_inc(&dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; + } + skb->dev = dev; + skb_dst_drop(skb); + skb->tstamp.tv64 = 0; + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, dev); + skb->mark = 0; + secpath_reset(skb); + nf_reset(skb); + return netif_rx(skb); +} +EXPORT_SYMBOL_GPL(dev_forward_skb); + +static inline int deliver_skb(struct sk_buff *skb, + struct packet_type *pt_prev, + struct net_device *orig_dev) +{ + atomic_inc(&skb->users); + return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); +} + +/* + * Support routine. Sends outgoing frames to any network + * taps currently in use. 
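/*
 * Editor's note: a minimal illustrative sketch, not part of this patch.
 * It shows the intended use of dev_forward_skb() above: the transmit path
 * of one virtual device injecting the frame into the receive path of a peer,
 * as a veth-style driver would.  The private structure and peer wiring are
 * hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/skbuff.h>

struct example_priv {
	struct net_device *peer;	/* hypothetical: set when the pair is created */
};

static netdev_tx_t example_pair_xmit(struct sk_buff *skb,
				     struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	/* dev_forward_skb() scrubs the skb and feeds netif_rx() on the peer. */
	if (dev_forward_skb(priv->peer, skb) != NET_RX_SUCCESS)
		dev->stats.tx_dropped++;

	return NETDEV_TX_OK;		/* skb is consumed either way */
}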
+ */ + +static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) +{ + struct packet_type *ptype; + struct sk_buff *skb2 = NULL; + struct packet_type *pt_prev = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &ptype_all, list) { + /* Never send packets back to the socket + * they originated from - MvS (miquels@drinkel.ow.org) + */ + if ((ptype->dev == dev || !ptype->dev) && + (ptype->af_packet_priv == NULL || + (struct sock *)ptype->af_packet_priv != skb->sk)) { + if (pt_prev) { + deliver_skb(skb2, pt_prev, skb->dev); + pt_prev = ptype; + continue; + } + + skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) + break; + + net_timestamp_set(skb2); + + /* skb->nh should be correctly + set by sender, so that the second statement is + just protection against buggy protocols. + */ + skb_reset_mac_header(skb2); + + if (skb_network_header(skb2) < skb2->data || + skb2->network_header > skb2->tail) { + if (net_ratelimit()) + printk(KERN_CRIT "protocol %04x is " + "buggy, dev %s\n", + ntohs(skb2->protocol), + dev->name); + skb_reset_network_header(skb2); + } + + skb2->transport_header = skb2->network_header; + skb2->pkt_type = PACKET_OUTGOING; + pt_prev = ptype; + } + } + if (pt_prev) + pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); + rcu_read_unlock(); +} + +/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change + * @dev: Network device + * @txq: number of queues available + * + * If real_num_tx_queues is changed the tc mappings may no longer be + * valid. To resolve this verify the tc mapping remains valid and if + * not NULL the mapping. With no priorities mapping to this + * offset/count pair it will no longer be used. In the worst case TC0 + * is invalid nothing can be done so disable priority mappings. If is + * expected that drivers will fix this mapping if they can before + * calling netif_set_real_num_tx_queues. + */ +static void netif_setup_tc(struct net_device *dev, unsigned int txq) +{ + int i; + struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; + + /* If TC0 is invalidated disable TC mapping */ + if (tc->offset + tc->count > txq) { + pr_warning("Number of in use tx queues changed " + "invalidating tc mappings. Priority " + "traffic classification disabled!\n"); + dev->num_tc = 0; + return; + } + + /* Invalidated prio to tc mappings set to TC0 */ + for (i = 1; i < TC_BITMASK + 1; i++) { + int q = netdev_get_prio_tc_map(dev, i); + + tc = &dev->tc_to_txq[q]; + if (tc->offset + tc->count > txq) { + pr_warning("Number of in use tx queues " + "changed. Priority %i to tc " + "mapping %i is no longer valid " + "setting map to 0\n", + i, q); + netdev_set_prio_tc_map(dev, i, 0); + } + } +} + +/* + * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues + * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. 
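/*
 * Editor's note: a minimal illustrative sketch, not part of this patch.
 * It shows a multiqueue driver trimming its advertised queue counts with
 * netif_set_real_num_tx_queues() and netif_set_real_num_rx_queues() (both
 * defined just below) once it knows how many queues the hardware actually
 * provides.  The calling context and queue count are hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static int example_set_queue_counts(struct net_device *dev,
				    unsigned int hw_queues)
{
	int err;

	rtnl_lock();			/* required once the device is registered */
	err = netif_set_real_num_tx_queues(dev, hw_queues);
	if (!err)
		err = netif_set_real_num_rx_queues(dev, hw_queues);
	rtnl_unlock();

	return err;
}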
+ */ +int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) +{ + int rc; + + if (txq < 1 || txq > dev->num_tx_queues) + return -EINVAL; + + if (dev->reg_state == NETREG_REGISTERED || + dev->reg_state == NETREG_UNREGISTERING) { + ASSERT_RTNL(); + + rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, + txq); + if (rc) + return rc; + + if (dev->num_tc) + netif_setup_tc(dev, txq); + + if (txq < dev->real_num_tx_queues) + qdisc_reset_all_tx_gt(dev, txq); + } + + dev->real_num_tx_queues = txq; + return 0; +} +EXPORT_SYMBOL(netif_set_real_num_tx_queues); + +#ifdef CONFIG_RPS +/** + * netif_set_real_num_rx_queues - set actual number of RX queues used + * @dev: Network device + * @rxq: Actual number of RX queues + * + * This must be called either with the rtnl_lock held or before + * registration of the net device. Returns 0 on success, or a + * negative error code. If called before registration, it always + * succeeds. + */ +int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) +{ + int rc; + + if (rxq < 1 || rxq > dev->num_rx_queues) + return -EINVAL; + + if (dev->reg_state == NETREG_REGISTERED) { + ASSERT_RTNL(); + + rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, + rxq); + if (rc) + return rc; + } + + dev->real_num_rx_queues = rxq; + return 0; +} +EXPORT_SYMBOL(netif_set_real_num_rx_queues); +#endif + +static inline void __netif_reschedule(struct Qdisc *q) +{ + struct softnet_data *sd; + unsigned long flags; + + local_irq_save(flags); + sd = &__get_cpu_var(softnet_data); + q->next_sched = NULL; + *sd->output_queue_tailp = q; + sd->output_queue_tailp = &q->next_sched; + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_restore(flags); +} + +void __netif_schedule(struct Qdisc *q) +{ + if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) + __netif_reschedule(q); +} +EXPORT_SYMBOL(__netif_schedule); + +void dev_kfree_skb_irq(struct sk_buff *skb) +{ + if (atomic_dec_and_test(&skb->users)) { + struct softnet_data *sd; + unsigned long flags; + + local_irq_save(flags); + sd = &__get_cpu_var(softnet_data); + skb->next = sd->completion_queue; + sd->completion_queue = skb; + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_restore(flags); + } +} +EXPORT_SYMBOL(dev_kfree_skb_irq); + +void dev_kfree_skb_any(struct sk_buff *skb) +{ + if (in_irq() || irqs_disabled()) + dev_kfree_skb_irq(skb); + else + dev_kfree_skb(skb); +} +EXPORT_SYMBOL(dev_kfree_skb_any); + + +/** + * netif_device_detach - mark device as removed + * @dev: network device + * + * Mark device as removed from system and therefore no longer available. + */ +void netif_device_detach(struct net_device *dev) +{ + if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && + netif_running(dev)) { + netif_tx_stop_all_queues(dev); + } +} +EXPORT_SYMBOL(netif_device_detach); + +/** + * netif_device_attach - mark device as attached + * @dev: network device + * + * Mark device as attached from system and restart if needed. + */ +void netif_device_attach(struct net_device *dev) +{ + if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && + netif_running(dev)) { + netif_tx_wake_all_queues(dev); + __netdev_watchdog_up(dev); + } +} +EXPORT_SYMBOL(netif_device_attach); + +/* + * Invalidate hardware checksum when packet is to be mangled, and + * complete checksum manually on outgoing path. 
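/*
 * Editor's note: a minimal illustrative sketch, not part of this patch.
 * It shows the usual pairing of netif_device_detach() and
 * netif_device_attach() (above) in a driver's suspend/resume path, so the
 * stack stops handing packets to hardware that is powered down.  The driver
 * callbacks are hypothetical.
 */
#include <linux/netdevice.h>

static int example_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* stops all TX queues if running */
	/* ... put the hardware to sleep ... */
	return 0;
}

static int example_resume(struct net_device *dev)
{
	/* ... power the hardware back up ... */
	netif_device_attach(dev);	/* restarts queues and the watchdog */
	return 0;
}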
+ */ +int skb_checksum_help(struct sk_buff *skb) +{ + __wsum csum; + int ret = 0, offset; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + goto out_set_summed; + + if (unlikely(skb_shinfo(skb)->gso_size)) { + /* Let GSO fix up the checksum. */ + goto out_set_summed; + } + + offset = skb_checksum_start_offset(skb); + BUG_ON(offset >= skb_headlen(skb)); + csum = skb_checksum(skb, offset, skb->len - offset, 0); + + offset += skb->csum_offset; + BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); + + if (skb_cloned(skb) && + !skb_clone_writable(skb, offset + sizeof(__sum16))) { + ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + if (ret) + goto out; + } + + *(__sum16 *)(skb->data + offset) = csum_fold(csum); +out_set_summed: + skb->ip_summed = CHECKSUM_NONE; +out: + return ret; +} +EXPORT_SYMBOL(skb_checksum_help); + +/** + * skb_gso_segment - Perform segmentation on skb. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * + * This function segments the given skb and returns a list of segments. + * + * It may return NULL if the skb requires no segmentation. This is + * only possible when GSO is used for verifying header integrity. + */ +struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_type *ptype; + __be16 type = skb->protocol; + int vlan_depth = ETH_HLEN; + int err; + + while (type == htons(ETH_P_8021Q)) { + struct vlan_hdr *vh; + + if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) + return ERR_PTR(-EINVAL); + + vh = (struct vlan_hdr *)(skb->data + vlan_depth); + type = vh->h_vlan_encapsulated_proto; + vlan_depth += VLAN_HLEN; + } + + skb_reset_mac_header(skb); + skb->mac_len = skb->network_header - skb->mac_header; + __skb_pull(skb, skb->mac_len); + + if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { + struct net_device *dev = skb->dev; + struct ethtool_drvinfo info = {}; + + if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo) + dev->ethtool_ops->get_drvinfo(dev, &info); + + WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n", + info.driver, dev ? dev->features : 0L, + skb->sk ? skb->sk->sk_route_caps : 0L, + skb->len, skb->data_len, skb->ip_summed); + + if (skb_header_cloned(skb) && + (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + return ERR_PTR(err); + } + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, + &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { + if (ptype->type == type && !ptype->dev && ptype->gso_segment) { + if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { + err = ptype->gso_send_check(skb); + segs = ERR_PTR(err); + if (err || skb_gso_ok(skb, features)) + break; + __skb_push(skb, (skb->data - + skb_network_header(skb))); + } + segs = ptype->gso_segment(skb, features); + break; + } + } + rcu_read_unlock(); + + __skb_push(skb, skb->data - skb_mac_header(skb)); + + return segs; +} +EXPORT_SYMBOL(skb_gso_segment); + +/* Take action when hardware reception checksum errors are detected. */ +#ifdef CONFIG_BUG +void netdev_rx_csum_fault(struct net_device *dev) +{ + if (net_ratelimit()) { + printk(KERN_ERR "%s: hw csum failure.\n", + dev ? dev->name : ""); + dump_stack(); + } +} +EXPORT_SYMBOL(netdev_rx_csum_fault); +#endif + +/* Actually, we should eliminate this check as soon as we know, that: + * 1. IOMMU is present and allows to map all the memory. + * 2. No high memory really exists on this machine. 
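/*
 * Editor's note: a minimal illustrative sketch, not part of this patch.
 * It shows the driver-side fallback that skb_checksum_help() above exists
 * for: hardware that cannot checksum a particular frame finishes the
 * checksum in software before the buffer is handed to DMA.  The capability
 * test (IPv4-only offload) is hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/if_ether.h>

static netdev_tx_t example_csum_xmit(struct sk_buff *skb,
				     struct net_device *dev)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    skb->protocol != htons(ETH_P_IP)) {		/* hypothetical HW limit */
		if (skb_checksum_help(skb))
			goto drop;			/* could not expand header */
	}

	/* ... map the skb and ring the doorbell ... */
	return NETDEV_TX_OK;

drop:
	dev_kfree_skb_any(skb);
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}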
+ */ + +static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) +{ +#ifdef CONFIG_HIGHMEM + int i; + if (!(dev->features & NETIF_F_HIGHDMA)) { + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + if (PageHighMem(skb_shinfo(skb)->frags[i].page)) + return 1; + } + + if (PCI_DMA_BUS_IS_PHYS) { + struct device *pdev = dev->dev.parent; + + if (!pdev) + return 0; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page); + if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) + return 1; + } + } +#endif + return 0; +} + +struct dev_gso_cb { + void (*destructor)(struct sk_buff *skb); +}; + +#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb) + +static void dev_gso_skb_destructor(struct sk_buff *skb) +{ + struct dev_gso_cb *cb; + + do { + struct sk_buff *nskb = skb->next; + + skb->next = nskb->next; + nskb->next = NULL; + kfree_skb(nskb); + } while (skb->next); + + cb = DEV_GSO_CB(skb); + if (cb->destructor) + cb->destructor(skb); +} + +/** + * dev_gso_segment - Perform emulated hardware segmentation on skb. + * @skb: buffer to segment + * @features: device features as applicable to this skb + * + * This function segments the given skb and stores the list of segments + * in skb->next. + */ +static int dev_gso_segment(struct sk_buff *skb, int features) +{ + struct sk_buff *segs; + + segs = skb_gso_segment(skb, features); + + /* Verifying header integrity only. */ + if (!segs) + return 0; + + if (IS_ERR(segs)) + return PTR_ERR(segs); + + skb->next = segs; + DEV_GSO_CB(skb)->destructor = skb->destructor; + skb->destructor = dev_gso_skb_destructor; + + return 0; +} + +/* + * Try to orphan skb early, right before transmission by the device. + * We cannot orphan skb if tx timestamp is requested or the sk-reference + * is needed on driver level for other reasons, e.g. see net/can/raw.c + */ +static inline void skb_orphan_try(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + if (sk && !skb_shinfo(skb)->tx_flags) { + /* skb_tx_hash() wont be able to get sk. 
+ * We copy sk_hash into skb->rxhash + */ + if (!skb->rxhash) + skb->rxhash = sk->sk_hash; + skb_orphan(skb); + } +} + +static bool can_checksum_protocol(unsigned long features, __be16 protocol) +{ + return ((features & NETIF_F_GEN_CSUM) || + ((features & NETIF_F_V4_CSUM) && + protocol == htons(ETH_P_IP)) || + ((features & NETIF_F_V6_CSUM) && + protocol == htons(ETH_P_IPV6)) || + ((features & NETIF_F_FCOE_CRC) && + protocol == htons(ETH_P_FCOE))); +} + +static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features) +{ + if (!can_checksum_protocol(features, protocol)) { + features &= ~NETIF_F_ALL_CSUM; + features &= ~NETIF_F_SG; + } else if (illegal_highdma(skb->dev, skb)) { + features &= ~NETIF_F_SG; + } + + return features; +} + +u32 netif_skb_features(struct sk_buff *skb) +{ + __be16 protocol = skb->protocol; + u32 features = skb->dev->features; + + if (protocol == htons(ETH_P_8021Q)) { + struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; + protocol = veh->h_vlan_encapsulated_proto; + } else if (!vlan_tx_tag_present(skb)) { + return harmonize_features(skb, protocol, features); + } + + features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX); + + if (protocol != htons(ETH_P_8021Q)) { + return harmonize_features(skb, protocol, features); + } else { + features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | + NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX; + return harmonize_features(skb, protocol, features); + } +} +EXPORT_SYMBOL(netif_skb_features); + +/* + * Returns true if either: + * 1. skb has frag_list and the device doesn't support FRAGLIST, or + * 2. skb is fragmented and the device does not support SG, or if + * at least one of fragments is in highmem and device does not + * support DMA from it. + */ +static inline int skb_needs_linearize(struct sk_buff *skb, + int features) +{ + return skb_is_nonlinear(skb) && + ((skb_has_frag_list(skb) && + !(features & NETIF_F_FRAGLIST)) || + (skb_shinfo(skb)->nr_frags && + !(features & NETIF_F_SG))); +} + +int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int rc = NETDEV_TX_OK; + unsigned int skb_len; + + if (likely(!skb->next)) { + u32 features; + + /* + * If device doesn't need skb->dst, release it right now while + * its hot in this cpu cache + */ + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(skb); + + if (!list_empty(&ptype_all)) + dev_queue_xmit_nit(skb, dev); + + skb_orphan_try(skb); + + features = netif_skb_features(skb); + + if (vlan_tx_tag_present(skb) && + !(features & NETIF_F_HW_VLAN_TX)) { + skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb)); + if (unlikely(!skb)) + goto out; + + skb->vlan_tci = 0; + } + + if (netif_needs_gso(skb, features)) { + if (unlikely(dev_gso_segment(skb, features))) + goto out_kfree_skb; + if (skb->next) + goto gso; + } else { + if (skb_needs_linearize(skb, features) && + __skb_linearize(skb)) + goto out_kfree_skb; + + /* If packet is not checksummed and device does not + * support checksumming for this protocol, complete + * checksumming here. 
+ */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + skb_set_transport_header(skb, + skb_checksum_start_offset(skb)); + if (!(features & NETIF_F_ALL_CSUM) && + skb_checksum_help(skb)) + goto out_kfree_skb; + } + } + + skb_len = skb->len; + rc = ops->ndo_start_xmit(skb, dev); + trace_net_dev_xmit(skb, rc, dev, skb_len); + if (rc == NETDEV_TX_OK) + txq_trans_update(txq); + return rc; + } + +gso: + do { + struct sk_buff *nskb = skb->next; + + skb->next = nskb->next; + nskb->next = NULL; + + /* + * If device doesn't need nskb->dst, release it right now while + * its hot in this cpu cache + */ + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(nskb); + + skb_len = nskb->len; + rc = ops->ndo_start_xmit(nskb, dev); + trace_net_dev_xmit(nskb, rc, dev, skb_len); + if (unlikely(rc != NETDEV_TX_OK)) { + if (rc & ~NETDEV_TX_MASK) + goto out_kfree_gso_skb; + nskb->next = skb->next; + skb->next = nskb; + return rc; + } + txq_trans_update(txq); + if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) + return NETDEV_TX_BUSY; + } while (skb->next); + +out_kfree_gso_skb: + if (likely(skb->next == NULL)) + skb->destructor = DEV_GSO_CB(skb)->destructor; +out_kfree_skb: + kfree_skb(skb); +out: + return rc; +} + +static u32 hashrnd __read_mostly; + +/* + * Returns a Tx hash based on the given packet descriptor a Tx queues' number + * to be used as a distribution range. + */ +u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, + unsigned int num_tx_queues) +{ + u32 hash; + u16 qoffset = 0; + u16 qcount = num_tx_queues; + + if (skb_rx_queue_recorded(skb)) { + hash = skb_get_rx_queue(skb); + while (unlikely(hash >= num_tx_queues)) + hash -= num_tx_queues; + return hash; + } + + if (dev->num_tc) { + u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + qoffset = dev->tc_to_txq[tc].offset; + qcount = dev->tc_to_txq[tc].count; + } + + if (skb->sk && skb->sk->sk_hash) + hash = skb->sk->sk_hash; + else + hash = (__force u16) skb->protocol ^ skb->rxhash; + hash = jhash_1word(hash, hashrnd); + + return (u16) (((u64) hash * qcount) >> 32) + qoffset; +} +EXPORT_SYMBOL(__skb_tx_hash); + +static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) +{ + if (unlikely(queue_index >= dev->real_num_tx_queues)) { + if (net_ratelimit()) { + pr_warning("%s selects TX queue %d, but " + "real number of TX queues is %d\n", + dev->name, queue_index, dev->real_num_tx_queues); + } + return 0; + } + return queue_index; +} + +static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +{ +#ifdef CONFIG_XPS + struct xps_dev_maps *dev_maps; + struct xps_map *map; + int queue_index = -1; + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_maps); + if (dev_maps) { + map = rcu_dereference( + dev_maps->cpu_map[raw_smp_processor_id()]); + if (map) { + if (map->len == 1) + queue_index = map->queues[0]; + else { + u32 hash; + if (skb->sk && skb->sk->sk_hash) + hash = skb->sk->sk_hash; + else + hash = (__force u16) skb->protocol ^ + skb->rxhash; + hash = jhash_1word(hash, hashrnd); + queue_index = map->queues[ + ((u64)hash * map->len) >> 32]; + } + if (unlikely(queue_index >= dev->real_num_tx_queues)) + queue_index = -1; + } + } + rcu_read_unlock(); + + return queue_index; +#else + return -1; +#endif +} + +static struct netdev_queue *dev_pick_tx(struct net_device *dev, + struct sk_buff *skb) +{ + int queue_index; + const struct net_device_ops *ops = dev->netdev_ops; + + if (dev->real_num_tx_queues == 1) + queue_index = 0; + else if (ops->ndo_select_queue) { + queue_index = 
ops->ndo_select_queue(dev, skb); + queue_index = dev_cap_txqueue(dev, queue_index); + } else { + struct sock *sk = skb->sk; + queue_index = sk_tx_queue_get(sk); + + if (queue_index < 0 || skb->ooo_okay || + queue_index >= dev->real_num_tx_queues) { + int old_index = queue_index; + + queue_index = get_xps_queue(dev, skb); + if (queue_index < 0) + queue_index = skb_tx_hash(dev, skb); + + if (queue_index != old_index && sk) { + struct dst_entry *dst = + rcu_dereference_check(sk->sk_dst_cache, 1); + + if (dst && skb_dst(skb) == dst) + sk_tx_queue_set(sk, queue_index); + } + } + } + + skb_set_queue_mapping(skb, queue_index); + return netdev_get_tx_queue(dev, queue_index); +} + +static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, + struct net_device *dev, + struct netdev_queue *txq) +{ + spinlock_t *root_lock = qdisc_lock(q); + bool contended; + int rc; + + qdisc_skb_cb(skb)->pkt_len = skb->len; + qdisc_calculate_pkt_len(skb, q); + /* + * Heuristic to force contended enqueues to serialize on a + * separate lock before trying to get qdisc main lock. + * This permits __QDISC_STATE_RUNNING owner to get the lock more often + * and dequeue packets faster. + */ + contended = qdisc_is_running(q); + if (unlikely(contended)) + spin_lock(&q->busylock); + + spin_lock(root_lock); + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { + kfree_skb(skb); + rc = NET_XMIT_DROP; + } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && + qdisc_run_begin(q)) { + /* + * This is a work-conserving queue; there are no old skbs + * waiting to be sent out; and the qdisc is not running - + * xmit the skb directly. + */ + if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) + skb_dst_force(skb); + + qdisc_bstats_update(q, skb); + + if (sch_direct_xmit(skb, q, dev, txq, root_lock)) { + if (unlikely(contended)) { + spin_unlock(&q->busylock); + contended = false; + } + __qdisc_run(q); + } else + qdisc_run_end(q); + + rc = NET_XMIT_SUCCESS; + } else { + skb_dst_force(skb); + rc = q->enqueue(skb, q) & NET_XMIT_MASK; + if (qdisc_run_begin(q)) { + if (unlikely(contended)) { + spin_unlock(&q->busylock); + contended = false; + } + __qdisc_run(q); + } + } + spin_unlock(root_lock); + if (unlikely(contended)) + spin_unlock(&q->busylock); + return rc; +} + +static DEFINE_PER_CPU(int, xmit_recursion); +#define RECURSION_LIMIT 10 + +/** + * dev_queue_xmit - transmit a buffer + * @skb: buffer to transmit + * + * Queue a buffer for transmission to a network device. The caller must + * have set the device and priority and built the buffer before calling + * this function. The function can be called from an interrupt. + * + * A negative errno code is returned on a failure. A success does not + * guarantee the frame will be transmitted as it may be dropped due + * to congestion or traffic shaping. + * + * ----------------------------------------------------------------------------------- + * I notice this method can also return errors from the queue disciplines, + * including NET_XMIT_DROP, which is a positive value. So, errors can also + * be positive. + * + * Regardless of the return value, the skb is consumed, so it is currently + * difficult to retry a send to this method. (You can bump the ref count + * before sending to hold a reference for retry if you are careful.) + * + * When calling this method, interrupts MUST be enabled. This is because + * the BH enable code must have IRQs enabled so that it will not deadlock. 
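/*
 * Editor's note: a minimal illustrative sketch, not part of this patch.
 * dev_pick_tx() above defers to ndo_select_queue() when a driver provides
 * one; this is a small implementation that pins non-IP traffic to queue 0
 * and hashes the rest with skb_tx_hash().  The policy is hypothetical; real
 * drivers apply their own hardware-specific mapping.
 */
#include <linux/netdevice.h>
#include <linux/if_ether.h>

static u16 example_select_queue(struct net_device *dev, struct sk_buff *skb)
{
	if (skb->protocol != htons(ETH_P_IP))
		return 0;			/* hypothetical: control traffic on queue 0 */

	return skb_tx_hash(dev, skb);		/* hash over real_num_tx_queues */
}

static const struct net_device_ops example_mq_netdev_ops = {
	.ndo_select_queue = example_select_queue,
	/* .ndo_open, .ndo_start_xmit, ... omitted */
};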
+ * --BLG + */ +int dev_queue_xmit(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct netdev_queue *txq; + struct Qdisc *q; + int rc = -ENOMEM; + + /* Disable soft irqs for various locks below. Also + * stops preemption for RCU. + */ + rcu_read_lock_bh(); + + txq = dev_pick_tx(dev, skb); + q = rcu_dereference_bh(txq->qdisc); + +#ifdef CONFIG_NET_CLS_ACT + skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); +#endif + trace_net_dev_queue(skb); + if (q->enqueue) { + rc = __dev_xmit_skb(skb, q, dev, txq); + goto out; + } + + /* The device has no queue. Common case for software devices: + loopback, all the sorts of tunnels... + + Really, it is unlikely that netif_tx_lock protection is necessary + here. (f.e. loopback and IP tunnels are clean ignoring statistics + counters.) + However, it is possible, that they rely on protection + made by us here. + + Check this and shot the lock. It is not prone from deadlocks. + Either shot noqueue qdisc, it is even simpler 8) + */ + if (dev->flags & IFF_UP) { + int cpu = smp_processor_id(); /* ok because BHs are off */ + + if (txq->xmit_lock_owner != cpu) { + + if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) + goto recursion_alert; + + HARD_TX_LOCK(dev, txq, cpu); + + if (!netif_tx_queue_stopped(txq)) { + __this_cpu_inc(xmit_recursion); + rc = dev_hard_start_xmit(skb, dev, txq); + __this_cpu_dec(xmit_recursion); + if (dev_xmit_complete(rc)) { + HARD_TX_UNLOCK(dev, txq); + goto out; + } + } + HARD_TX_UNLOCK(dev, txq); + if (net_ratelimit()) + printk(KERN_CRIT "Virtual device %s asks to " + "queue packet!\n", dev->name); + } else { + /* Recursion is detected! It is possible, + * unfortunately + */ +recursion_alert: + if (net_ratelimit()) + printk(KERN_CRIT "Dead loop on virtual device " + "%s, fix it urgently!\n", dev->name); + } + } + + rc = -ENETDOWN; + rcu_read_unlock_bh(); + + kfree_skb(skb); + return rc; +out: + rcu_read_unlock_bh(); + return rc; +} +EXPORT_SYMBOL(dev_queue_xmit); + + +/*======================================================================= + Receiver routines + =======================================================================*/ + +int netdev_max_backlog __read_mostly = 1000; +int netdev_tstamp_prequeue __read_mostly = 1; +int netdev_budget __read_mostly = 300; +int weight_p __read_mostly = 64; /* old backlog weight */ + +/* Called with irq disabled */ +static inline void ____napi_schedule(struct softnet_data *sd, + struct napi_struct *napi) +{ + list_add_tail(&napi->poll_list, &sd->poll_list); + __raise_softirq_irqoff(NET_RX_SOFTIRQ); +} + +/* + * __skb_get_rxhash: calculate a flow hash based on src/dst addresses + * and src/dst port numbers. Returns a non-zero hash number on success + * and 0 on failure. 
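/*
 * Editor's note: a minimal illustrative sketch, not part of this patch.
 * It shows how kernel code hands a fully built frame to dev_queue_xmit()
 * above: the caller sets skb->dev, builds the link-layer header, and gives
 * up ownership of the skb whatever the return value.  The ethertype 0x88b5
 * and the helper itself are hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/etherdevice.h>

static int example_send_frame(struct net_device *dev, const u8 *dst_mac,
			      const void *payload, unsigned int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(ETH_HLEN + len + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb_reset_network_header(skb);
	memcpy(skb_put(skb, len), payload, len);

	skb->dev = dev;
	skb->protocol = htons(0x88b5);		/* hypothetical ethertype */

	/* Prepend the link-layer header via the device's header_ops. */
	if (dev_hard_header(skb, dev, 0x88b5, dst_mac, dev->dev_addr,
			    skb->len) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	/* May return NET_XMIT_* codes; the skb is consumed either way. */
	return dev_queue_xmit(skb);
}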
+ */ +__u32 __skb_get_rxhash(struct sk_buff *skb) +{ + int nhoff, hash = 0, poff; + const struct ipv6hdr *ip6; + const struct iphdr *ip; + u8 ip_proto; + u32 addr1, addr2, ihl; + union { + u32 v32; + u16 v16[2]; + } ports; + + nhoff = skb_network_offset(skb); + + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + if (!pskb_may_pull(skb, sizeof(*ip) + nhoff)) + goto done; + + ip = (const struct iphdr *) (skb->data + nhoff); + if (ip->frag_off & htons(IP_MF | IP_OFFSET)) + ip_proto = 0; + else + ip_proto = ip->protocol; + addr1 = (__force u32) ip->saddr; + addr2 = (__force u32) ip->daddr; + ihl = ip->ihl; + break; + case __constant_htons(ETH_P_IPV6): + if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff)) + goto done; + + ip6 = (const struct ipv6hdr *) (skb->data + nhoff); + ip_proto = ip6->nexthdr; + addr1 = (__force u32) ip6->saddr.s6_addr32[3]; + addr2 = (__force u32) ip6->daddr.s6_addr32[3]; + ihl = (40 >> 2); + break; + default: + goto done; + } + + ports.v32 = 0; + poff = proto_ports_offset(ip_proto); + if (poff >= 0) { + nhoff += ihl * 4 + poff; + if (pskb_may_pull(skb, nhoff + 4)) { + ports.v32 = * (__force u32 *) (skb->data + nhoff); + if (ports.v16[1] < ports.v16[0]) + swap(ports.v16[0], ports.v16[1]); + } + } + + /* get a consistent hash (same value on both flow directions) */ + if (addr2 < addr1) + swap(addr1, addr2); + + hash = jhash_3words(addr1, addr2, ports.v32, hashrnd); + if (!hash) + hash = 1; + +done: + return hash; +} +EXPORT_SYMBOL(__skb_get_rxhash); + +#ifdef CONFIG_RPS + +/* One global table that all flow-based protocols share. */ +struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; +EXPORT_SYMBOL(rps_sock_flow_table); + +static struct rps_dev_flow * +set_rps_cpu(struct net_device *dev, struct sk_buff *skb, + struct rps_dev_flow *rflow, u16 next_cpu) +{ + u16 tcpu; + + tcpu = rflow->cpu = next_cpu; + if (tcpu != RPS_NO_CPU) { +#ifdef CONFIG_RFS_ACCEL + struct netdev_rx_queue *rxqueue; + struct rps_dev_flow_table *flow_table; + struct rps_dev_flow *old_rflow; + u32 flow_id; + u16 rxq_index; + int rc; + + /* Should we steer this flow to a different hardware queue? */ + if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || + !(dev->features & NETIF_F_NTUPLE)) + goto out; + rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); + if (rxq_index == skb_get_rx_queue(skb)) + goto out; + + rxqueue = dev->_rx + rxq_index; + flow_table = rcu_dereference(rxqueue->rps_flow_table); + if (!flow_table) + goto out; + flow_id = skb->rxhash & flow_table->mask; + rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, + rxq_index, flow_id); + if (rc < 0) + goto out; + old_rflow = rflow; + rflow = &flow_table->flows[flow_id]; + rflow->cpu = next_cpu; + rflow->filter = rc; + if (old_rflow->filter == rflow->filter) + old_rflow->filter = RPS_NO_FILTER; + out: +#endif + rflow->last_qtail = + per_cpu(softnet_data, tcpu).input_queue_head; + } + + return rflow; +} + +/* + * get_rps_cpu is called from netif_receive_skb and returns the target + * CPU from the RPS map of the receiving queue for a given skb. + * rcu_read_lock must be held on entry. 
+ */ +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, + struct rps_dev_flow **rflowp) +{ + struct netdev_rx_queue *rxqueue; + struct rps_map *map; + struct rps_dev_flow_table *flow_table; + struct rps_sock_flow_table *sock_flow_table; + int cpu = -1; + u16 tcpu; + + if (skb_rx_queue_recorded(skb)) { + u16 index = skb_get_rx_queue(skb); + if (unlikely(index >= dev->real_num_rx_queues)) { + WARN_ONCE(dev->real_num_rx_queues > 1, + "%s received packet on queue %u, but number " + "of RX queues is %u\n", + dev->name, index, dev->real_num_rx_queues); + goto done; + } + rxqueue = dev->_rx + index; + } else + rxqueue = dev->_rx; + + map = rcu_dereference(rxqueue->rps_map); + if (map) { + if (map->len == 1 && + !rcu_dereference_raw(rxqueue->rps_flow_table)) { + tcpu = map->cpus[0]; + if (cpu_online(tcpu)) + cpu = tcpu; + goto done; + } + } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) { + goto done; + } + + skb_reset_network_header(skb); + if (!skb_get_rxhash(skb)) + goto done; + + flow_table = rcu_dereference(rxqueue->rps_flow_table); + sock_flow_table = rcu_dereference(rps_sock_flow_table); + if (flow_table && sock_flow_table) { + u16 next_cpu; + struct rps_dev_flow *rflow; + + rflow = &flow_table->flows[skb->rxhash & flow_table->mask]; + tcpu = rflow->cpu; + + next_cpu = sock_flow_table->ents[skb->rxhash & + sock_flow_table->mask]; + + /* + * If the desired CPU (where last recvmsg was done) is + * different from current CPU (one in the rx-queue flow + * table entry), switch if one of the following holds: + * - Current CPU is unset (equal to RPS_NO_CPU). + * - Current CPU is offline. + * - The current CPU's queue tail has advanced beyond the + * last packet that was enqueued using this table entry. + * This guarantees that all previous packets for the flow + * have been dequeued, thus preserving in order delivery. + */ + if (unlikely(tcpu != next_cpu) && + (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || + ((int)(per_cpu(softnet_data, tcpu).input_queue_head - + rflow->last_qtail)) >= 0)) + rflow = set_rps_cpu(dev, skb, rflow, next_cpu); + + if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { + *rflowp = rflow; + cpu = tcpu; + goto done; + } + } + + if (map) { + tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; + + if (cpu_online(tcpu)) { + cpu = tcpu; + goto done; + } + } + +done: + return cpu; +} + +#ifdef CONFIG_RFS_ACCEL + +/** + * rps_may_expire_flow - check whether an RFS hardware filter may be removed + * @dev: Device on which the filter was set + * @rxq_index: RX queue index + * @flow_id: Flow ID passed to ndo_rx_flow_steer() + * @filter_id: Filter ID returned by ndo_rx_flow_steer() + * + * Drivers that implement ndo_rx_flow_steer() should periodically call + * this function for each installed filter and remove the filters for + * which it returns %true. 
+ */ +bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, + u32 flow_id, u16 filter_id) +{ + struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; + struct rps_dev_flow_table *flow_table; + struct rps_dev_flow *rflow; + bool expire = true; + int cpu; + + rcu_read_lock(); + flow_table = rcu_dereference(rxqueue->rps_flow_table); + if (flow_table && flow_id <= flow_table->mask) { + rflow = &flow_table->flows[flow_id]; + cpu = ACCESS_ONCE(rflow->cpu); + if (rflow->filter == filter_id && cpu != RPS_NO_CPU && + ((int)(per_cpu(softnet_data, cpu).input_queue_head - + rflow->last_qtail) < + (int)(10 * flow_table->mask))) + expire = false; + } + rcu_read_unlock(); + return expire; +} +EXPORT_SYMBOL(rps_may_expire_flow); + +#endif /* CONFIG_RFS_ACCEL */ + +/* Called from hardirq (IPI) context */ +static void rps_trigger_softirq(void *data) +{ + struct softnet_data *sd = data; + + ____napi_schedule(sd, &sd->backlog); + sd->received_rps++; +} + +#endif /* CONFIG_RPS */ + +/* + * Check if this softnet_data structure is another cpu one + * If yes, queue it to our IPI list and return 1 + * If no, return 0 + */ +static int rps_ipi_queued(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + struct softnet_data *mysd = &__get_cpu_var(softnet_data); + + if (sd != mysd) { + sd->rps_ipi_next = mysd->rps_ipi_list; + mysd->rps_ipi_list = sd; + + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + return 1; + } +#endif /* CONFIG_RPS */ + return 0; +} + +/* + * enqueue_to_backlog is called to queue an skb to a per CPU backlog + * queue (may be a remote CPU queue). + */ +static int enqueue_to_backlog(struct sk_buff *skb, int cpu, + unsigned int *qtail) +{ + struct softnet_data *sd; + unsigned long flags; + + sd = &per_cpu(softnet_data, cpu); + + local_irq_save(flags); + + rps_lock(sd); + if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) { + if (skb_queue_len(&sd->input_pkt_queue)) { +enqueue: + __skb_queue_tail(&sd->input_pkt_queue, skb); + input_queue_tail_incr_save(sd, qtail); + rps_unlock(sd); + local_irq_restore(flags); + return NET_RX_SUCCESS; + } + + /* Schedule NAPI for backlog device + * We can use non atomic operation since we own the queue lock + */ + if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { + if (!rps_ipi_queued(sd)) + ____napi_schedule(sd, &sd->backlog); + } + goto enqueue; + } + + sd->dropped++; + rps_unlock(sd); + + local_irq_restore(flags); + + atomic_long_inc(&skb->dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; +} + +/** + * netif_rx - post buffer to the network code + * @skb: buffer to post + * + * This function receives a packet from a device driver and queues it for + * the upper (protocol) levels to process. It always succeeds. The buffer + * may be dropped during processing for congestion control or by the + * protocol layers. 
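Both get_rps_cpu() and rps_may_expire_flow() above compare the per-CPU input_queue_head against a recorded last_qtail as a signed difference, so the test stays correct when the unsigned counters wrap. A standalone sketch of that idiom:

#include <stdio.h>

/* The (int)(head - tail) >= 0 test used above: valid across wraparound as
 * long as the two counters stay within 2^31 of each other. */
static int head_has_passed(unsigned int head, unsigned int tail)
{
        return (int)(head - tail) >= 0;
}

int main(void)
{
        printf("%d\n", head_has_passed(10u, 5u));               /* 1 */
        printf("%d\n", head_has_passed(5u, 10u));               /* 0 */
        printf("%d\n", head_has_passed(3u, 0xfffffff0u));       /* 1: wrapped but ahead */
        return 0;
}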
+ * + * return values: + * NET_RX_SUCCESS (no congestion) + * NET_RX_DROP (packet was dropped) + * + */ + +int netif_rx(struct sk_buff *skb) +{ + int ret; + + /* if netpoll wants it, pretend we never saw it */ + if (netpoll_rx(skb)) + return NET_RX_DROP; + + if (netdev_tstamp_prequeue) + net_timestamp_check(skb); + + trace_netif_rx(skb); +#ifdef CONFIG_RPS + { + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu; + + preempt_disable(); + rcu_read_lock(); + + cpu = get_rps_cpu(skb->dev, skb, &rflow); + if (cpu < 0) + cpu = smp_processor_id(); + + ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + + rcu_read_unlock(); + preempt_enable(); + } +#else + { + unsigned int qtail; + ret = enqueue_to_backlog(skb, get_cpu(), &qtail); + put_cpu(); + } +#endif + return ret; +} +EXPORT_SYMBOL(netif_rx); + +int netif_rx_ni(struct sk_buff *skb) +{ + int err; + + preempt_disable(); + err = netif_rx(skb); + if (local_softirq_pending()) + do_softirq(); + preempt_enable(); + + return err; +} +EXPORT_SYMBOL(netif_rx_ni); + +static void net_tx_action(struct softirq_action *h) +{ + struct softnet_data *sd = &__get_cpu_var(softnet_data); + + if (sd->completion_queue) { + struct sk_buff *clist; + + local_irq_disable(); + clist = sd->completion_queue; + sd->completion_queue = NULL; + local_irq_enable(); + + while (clist) { + struct sk_buff *skb = clist; + clist = clist->next; + + WARN_ON(atomic_read(&skb->users)); + trace_kfree_skb(skb, net_tx_action); + __kfree_skb(skb); + } + } + + if (sd->output_queue) { + struct Qdisc *head; + + local_irq_disable(); + head = sd->output_queue; + sd->output_queue = NULL; + sd->output_queue_tailp = &sd->output_queue; + local_irq_enable(); + + while (head) { + struct Qdisc *q = head; + spinlock_t *root_lock; + + head = head->next_sched; + + root_lock = qdisc_lock(q); + if (spin_trylock(root_lock)) { + smp_mb__before_clear_bit(); + clear_bit(__QDISC_STATE_SCHED, + &q->state); + qdisc_run(q); + spin_unlock(root_lock); + } else { + if (!test_bit(__QDISC_STATE_DEACTIVATED, + &q->state)) { + __netif_reschedule(q); + } else { + smp_mb__before_clear_bit(); + clear_bit(__QDISC_STATE_SCHED, + &q->state); + } + } + } + } +} + +#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ + (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)) +/* This hook is defined here for ATM LANE */ +int (*br_fdb_test_addr_hook)(struct net_device *dev, + unsigned char *addr) __read_mostly; +EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); +#endif + +#ifdef CONFIG_NET_CLS_ACT +/* TODO: Maybe we should just force sch_ingress to be compiled in + * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions + * a compare and 2 stores extra right now if we dont have it on + * but have CONFIG_NET_CLS_ACT + * NOTE: This doesn't stop any functionality; if you dont have + * the ingress scheduler, you just can't add policies on ingress. 
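netif_rx() above is the entry point for drivers without NAPI: the skb is queued to a per-CPU backlog (possibly a remote CPU's when RPS is enabled) and handled later by process_backlog(). An illustrative fragment of the usual caller; the mydrv_* helpers are hypothetical and not part of this patch, and the usual <linux/netdevice.h>, <linux/etherdevice.h> and <linux/interrupt.h> includes are assumed:

/* Hypothetical non-NAPI driver handing a received frame to netif_rx(). */
static irqreturn_t mydrv_interrupt(int irq, void *dev_id)
{
        struct net_device *dev = dev_id;
        struct sk_buff *skb;
        int len = mydrv_frame_len(dev);                 /* hypothetical helper */

        skb = netdev_alloc_skb_ip_align(dev, len);
        if (!skb) {
                dev->stats.rx_dropped++;
                return IRQ_HANDLED;
        }
        mydrv_read_frame(dev, skb_put(skb, len));       /* hypothetical helper */
        skb->protocol = eth_type_trans(skb, dev);
        netif_rx(skb);                                  /* enqueue_to_backlog() above */
        return IRQ_HANDLED;
}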
+ * + */ +static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) +{ + struct net_device *dev = skb->dev; + u32 ttl = G_TC_RTTL(skb->tc_verd); + int result = TC_ACT_OK; + struct Qdisc *q; + + if (unlikely(MAX_RED_LOOP < ttl++)) { + if (net_ratelimit()) + pr_warning( "Redir loop detected Dropping packet (%d->%d)\n", + skb->skb_iif, dev->ifindex); + return TC_ACT_SHOT; + } + + skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); + skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); + + q = rxq->qdisc; + if (q != &noop_qdisc) { + spin_lock(qdisc_lock(q)); + if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) + result = qdisc_enqueue_root(skb, q); + spin_unlock(qdisc_lock(q)); + } + + return result; +} + +static inline struct sk_buff *handle_ing(struct sk_buff *skb, + struct packet_type **pt_prev, + int *ret, struct net_device *orig_dev) +{ + struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); + + if (!rxq || rxq->qdisc == &noop_qdisc) + goto out; + + if (*pt_prev) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } + + switch (ing_filter(skb, rxq)) { + case TC_ACT_SHOT: + case TC_ACT_STOLEN: + kfree_skb(skb); + return NULL; + } + +out: + skb->tc_verd = 0; + return skb; +} +#endif + +/** + * netdev_rx_handler_register - register receive handler + * @dev: device to register a handler for + * @rx_handler: receive handler to register + * @rx_handler_data: data pointer that is used by rx handler + * + * Register a receive hander for a device. This handler will then be + * called from __netif_receive_skb. A negative errno code is returned + * on a failure. + * + * The caller must hold the rtnl_mutex. + * + * For a general description of rx_handler, see enum rx_handler_result. + */ +int netdev_rx_handler_register(struct net_device *dev, + rx_handler_func_t *rx_handler, + void *rx_handler_data) +{ + ASSERT_RTNL(); + + if (dev->rx_handler) + return -EBUSY; + + rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); + rcu_assign_pointer(dev->rx_handler, rx_handler); + + return 0; +} +EXPORT_SYMBOL_GPL(netdev_rx_handler_register); + +/** + * netdev_rx_handler_unregister - unregister receive handler + * @dev: device to unregister a handler from + * + * Unregister a receive hander from a device. + * + * The caller must hold the rtnl_mutex. 
+ */ +void netdev_rx_handler_unregister(struct net_device *dev) +{ + + ASSERT_RTNL(); + rcu_assign_pointer(dev->rx_handler, NULL); + rcu_assign_pointer(dev->rx_handler_data, NULL); +} +EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); + +static int __netif_receive_skb(struct sk_buff *skb) +{ + struct packet_type *ptype, *pt_prev; + rx_handler_func_t *rx_handler; + struct net_device *orig_dev; + struct net_device *null_or_dev; + bool deliver_exact = false; + int ret = NET_RX_DROP; + __be16 type; + + if (!netdev_tstamp_prequeue) + net_timestamp_check(skb); + + trace_netif_receive_skb(skb); + + /* if we've gotten here through NAPI, check netpoll */ + if (netpoll_receive_skb(skb)) + return NET_RX_DROP; + + if (!skb->skb_iif) + skb->skb_iif = skb->dev->ifindex; + orig_dev = skb->dev; + + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb_reset_mac_len(skb); + + pt_prev = NULL; + + rcu_read_lock(); + +another_round: + + __this_cpu_inc(softnet_data.processed); + + if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) { + skb = vlan_untag(skb); + if (unlikely(!skb)) + goto out; + } + +#ifdef CONFIG_NET_CLS_ACT + if (skb->tc_verd & TC_NCLS) { + skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); + goto ncls; + } +#endif + + list_for_each_entry_rcu(ptype, &ptype_all, list) { + if (!ptype->dev || ptype->dev == skb->dev) { + if (pt_prev) + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; + } + } + +#ifdef CONFIG_NET_CLS_ACT + skb = handle_ing(skb, &pt_prev, &ret, orig_dev); + if (!skb) + goto out; +ncls: +#endif + + rx_handler = rcu_dereference(skb->dev->rx_handler); + if (rx_handler) { + if (pt_prev) { + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = NULL; + } + switch (rx_handler(&skb)) { + case RX_HANDLER_CONSUMED: + goto out; + case RX_HANDLER_ANOTHER: + goto another_round; + case RX_HANDLER_EXACT: + deliver_exact = true; + case RX_HANDLER_PASS: + break; + default: + BUG(); + } + } + + if (vlan_tx_tag_present(skb)) { + if (pt_prev) { + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = NULL; + } + if (vlan_do_receive(&skb)) { + ret = __netif_receive_skb(skb); + goto out; + } else if (unlikely(!skb)) + goto out; + } + + /* deliver only exact match when indicated */ + null_or_dev = deliver_exact ? skb->dev : NULL; + + type = skb->protocol; + list_for_each_entry_rcu(ptype, + &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { + if (ptype->type == type && + (ptype->dev == null_or_dev || ptype->dev == skb->dev || + ptype->dev == orig_dev)) { + if (pt_prev) + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; + } + } + + if (pt_prev) { + ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + } else { + atomic_long_inc(&skb->dev->rx_dropped); + kfree_skb(skb); + /* Jamal, now you will not able to escape explaining + * me how you were going to use this. :-) + */ + ret = NET_RX_DROP; + } + +out: + rcu_read_unlock(); + return ret; +} + +/** + * netif_receive_skb - process receive buffer from network + * @skb: buffer to process + * + * netif_receive_skb() is the main receive data processing function. + * It always succeeds. The buffer may be dropped during processing + * for congestion control or by the protocol layers. + * + * This function may only be called from softirq context and interrupts + * should be enabled. 
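netdev_rx_handler_register() above is how stacking drivers (bridge, macvlan, bonding) hook into __netif_receive_skb(). An illustrative attach/detach fragment for a hypothetical consumer; the mystack_* names and the private pointer are assumptions, not part of this patch:

/* Hypothetical stacking driver hooking a lower device's receive path. */
static rx_handler_result_t mystack_handle_frame(struct sk_buff **pskb)
{
        /* a real handler might retarget *pskb at an upper device and return
         * RX_HANDLER_ANOTHER, or take ownership and return
         * RX_HANDLER_CONSUMED; here delivery simply continues */
        return RX_HANDLER_PASS;
}

static int mystack_attach(struct net_device *lower, void *priv)
{
        int err;

        rtnl_lock();
        err = netdev_rx_handler_register(lower, mystack_handle_frame, priv);
        rtnl_unlock();
        return err;             /* -EBUSY if a handler is already installed */
}

static void mystack_detach(struct net_device *lower)
{
        rtnl_lock();
        netdev_rx_handler_unregister(lower);
        rtnl_unlock();
        synchronize_net();      /* wait out in-flight __netif_receive_skb() readers */
}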
+ * + * Return values (usually ignored): + * NET_RX_SUCCESS: no congestion + * NET_RX_DROP: packet was dropped + */ +int netif_receive_skb(struct sk_buff *skb) +{ + if (netdev_tstamp_prequeue) + net_timestamp_check(skb); + + if (skb_defer_rx_timestamp(skb)) + return NET_RX_SUCCESS; + +#ifdef CONFIG_RPS + { + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu, ret; + + rcu_read_lock(); + + cpu = get_rps_cpu(skb->dev, skb, &rflow); + + if (cpu >= 0) { + ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + rcu_read_unlock(); + } else { + rcu_read_unlock(); + ret = __netif_receive_skb(skb); + } + + return ret; + } +#else + return __netif_receive_skb(skb); +#endif +} +EXPORT_SYMBOL(netif_receive_skb); + +/* Network device is going away, flush any packets still pending + * Called with irqs disabled. + */ +static void flush_backlog(void *arg) +{ + struct net_device *dev = arg; + struct softnet_data *sd = &__get_cpu_var(softnet_data); + struct sk_buff *skb, *tmp; + + rps_lock(sd); + skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { + if (skb->dev == dev) { + __skb_unlink(skb, &sd->input_pkt_queue); + kfree_skb(skb); + input_queue_head_incr(sd); + } + } + rps_unlock(sd); + + skb_queue_walk_safe(&sd->process_queue, skb, tmp) { + if (skb->dev == dev) { + __skb_unlink(skb, &sd->process_queue); + kfree_skb(skb); + input_queue_head_incr(sd); + } + } +} + +static int napi_gro_complete(struct sk_buff *skb) +{ + struct packet_type *ptype; + __be16 type = skb->protocol; + struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; + int err = -ENOENT; + + if (NAPI_GRO_CB(skb)->count == 1) { + skb_shinfo(skb)->gso_size = 0; + goto out; + } + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, head, list) { + if (ptype->type != type || ptype->dev || !ptype->gro_complete) + continue; + + err = ptype->gro_complete(skb); + break; + } + rcu_read_unlock(); + + if (err) { + WARN_ON(&ptype->list == head); + kfree_skb(skb); + return NET_RX_SUCCESS; + } + +out: + return netif_receive_skb(skb); +} + +inline void napi_gro_flush(struct napi_struct *napi) +{ + struct sk_buff *skb, *next; + + for (skb = napi->gro_list; skb; skb = next) { + next = skb->next; + skb->next = NULL; + napi_gro_complete(skb); + } + + napi->gro_count = 0; + napi->gro_list = NULL; +} +EXPORT_SYMBOL(napi_gro_flush); + +enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + struct sk_buff **pp = NULL; + struct packet_type *ptype; + __be16 type = skb->protocol; + struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; + int same_flow; + int mac_len; + enum gro_result ret; + + if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) + goto normal; + + if (skb_is_gso(skb) || skb_has_frag_list(skb)) + goto normal; + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, head, list) { + if (ptype->type != type || ptype->dev || !ptype->gro_receive) + continue; + + skb_set_network_header(skb, skb_gro_offset(skb)); + mac_len = skb->network_header - skb->mac_header; + skb->mac_len = mac_len; + NAPI_GRO_CB(skb)->same_flow = 0; + NAPI_GRO_CB(skb)->flush = 0; + NAPI_GRO_CB(skb)->free = 0; + + pp = ptype->gro_receive(&napi->gro_list, skb); + break; + } + rcu_read_unlock(); + + if (&ptype->list == head) + goto normal; + + same_flow = NAPI_GRO_CB(skb)->same_flow; + ret = NAPI_GRO_CB(skb)->free ? 
GRO_MERGED_FREE : GRO_MERGED; + + if (pp) { + struct sk_buff *nskb = *pp; + + *pp = nskb->next; + nskb->next = NULL; + napi_gro_complete(nskb); + napi->gro_count--; + } + + if (same_flow) + goto ok; + + if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS) + goto normal; + + napi->gro_count++; + NAPI_GRO_CB(skb)->count = 1; + skb_shinfo(skb)->gso_size = skb_gro_len(skb); + skb->next = napi->gro_list; + napi->gro_list = skb; + ret = GRO_HELD; + +pull: + if (skb_headlen(skb) < skb_gro_offset(skb)) { + int grow = skb_gro_offset(skb) - skb_headlen(skb); + + BUG_ON(skb->end - skb->tail < grow); + + memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); + + skb->tail += grow; + skb->data_len -= grow; + + skb_shinfo(skb)->frags[0].page_offset += grow; + skb_shinfo(skb)->frags[0].size -= grow; + + if (unlikely(!skb_shinfo(skb)->frags[0].size)) { + put_page(skb_shinfo(skb)->frags[0].page); + memmove(skb_shinfo(skb)->frags, + skb_shinfo(skb)->frags + 1, + --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); + } + } + +ok: + return ret; + +normal: + ret = GRO_NORMAL; + goto pull; +} +EXPORT_SYMBOL(dev_gro_receive); + +static inline gro_result_t +__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + struct sk_buff *p; + unsigned int maclen = skb->dev->hard_header_len; + + for (p = napi->gro_list; p; p = p->next) { + unsigned long diffs; + + diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; + diffs |= p->vlan_tci ^ skb->vlan_tci; + if (maclen == ETH_HLEN) + diffs |= compare_ether_header(skb_mac_header(p), + skb_gro_mac_header(skb)); + else if (!diffs) + diffs = memcmp(skb_mac_header(p), + skb_gro_mac_header(skb), + maclen); + NAPI_GRO_CB(p)->same_flow = !diffs; + NAPI_GRO_CB(p)->flush = 0; + } + + return dev_gro_receive(napi, skb); +} + +gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) +{ + switch (ret) { + case GRO_NORMAL: + if (netif_receive_skb(skb)) + ret = GRO_DROP; + break; + + case GRO_DROP: + case GRO_MERGED_FREE: + kfree_skb(skb); + break; + + case GRO_HELD: + case GRO_MERGED: + break; + } + + return ret; +} +EXPORT_SYMBOL(napi_skb_finish); + +void skb_gro_reset_offset(struct sk_buff *skb) +{ + NAPI_GRO_CB(skb)->data_offset = 0; + NAPI_GRO_CB(skb)->frag0 = NULL; + NAPI_GRO_CB(skb)->frag0_len = 0; + + if (skb->mac_header == skb->tail && + !PageHighMem(skb_shinfo(skb)->frags[0].page)) { + NAPI_GRO_CB(skb)->frag0 = + page_address(skb_shinfo(skb)->frags[0].page) + + skb_shinfo(skb)->frags[0].page_offset; + NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size; + } +} +EXPORT_SYMBOL(skb_gro_reset_offset); + +gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + skb_gro_reset_offset(skb); + + return napi_skb_finish(__napi_gro_receive(napi, skb), skb); +} +EXPORT_SYMBOL(napi_gro_receive); + +static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) +{ + __skb_pull(skb, skb_headlen(skb)); + /* restore the reserve we had after netdev_alloc_skb_ip_align() */ + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); + skb->vlan_tci = 0; + skb->dev = napi->dev; + skb->skb_iif = 0; + + napi->skb = skb; +} + +struct sk_buff *napi_get_frags(struct napi_struct *napi) +{ + struct sk_buff *skb = napi->skb; + + if (!skb) { + skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); + if (skb) + napi->skb = skb; + } + return skb; +} +EXPORT_SYMBOL(napi_get_frags); + +gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, + gro_result_t ret) +{ + switch (ret) { + case 
GRO_NORMAL: + case GRO_HELD: + skb->protocol = eth_type_trans(skb, skb->dev); + + if (ret == GRO_HELD) + skb_gro_pull(skb, -ETH_HLEN); + else if (netif_receive_skb(skb)) + ret = GRO_DROP; + break; + + case GRO_DROP: + case GRO_MERGED_FREE: + napi_reuse_skb(napi, skb); + break; + + case GRO_MERGED: + break; + } + + return ret; +} +EXPORT_SYMBOL(napi_frags_finish); + +struct sk_buff *napi_frags_skb(struct napi_struct *napi) +{ + struct sk_buff *skb = napi->skb; + struct ethhdr *eth; + unsigned int hlen; + unsigned int off; + + napi->skb = NULL; + + skb_reset_mac_header(skb); + skb_gro_reset_offset(skb); + + off = skb_gro_offset(skb); + hlen = off + sizeof(*eth); + eth = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + eth = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!eth)) { + napi_reuse_skb(napi, skb); + skb = NULL; + goto out; + } + } + + skb_gro_pull(skb, sizeof(*eth)); + + /* + * This works because the only protocols we care about don't require + * special handling. We'll fix it up properly at the end. + */ + skb->protocol = eth->h_proto; + +out: + return skb; +} +EXPORT_SYMBOL(napi_frags_skb); + +gro_result_t napi_gro_frags(struct napi_struct *napi) +{ + struct sk_buff *skb = napi_frags_skb(napi); + + if (!skb) + return GRO_DROP; + + return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb)); +} +EXPORT_SYMBOL(napi_gro_frags); + +/* + * net_rps_action sends any pending IPI's for rps. + * Note: called with local irq disabled, but exits with local irq enabled. + */ +static void net_rps_action_and_irq_enable(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + struct softnet_data *remsd = sd->rps_ipi_list; + + if (remsd) { + sd->rps_ipi_list = NULL; + + local_irq_enable(); + + /* Send pending IPI's to kick RPS processing on remote cpus. */ + while (remsd) { + struct softnet_data *next = remsd->rps_ipi_next; + + if (cpu_online(remsd->cpu)) + __smp_call_function_single(remsd->cpu, + &remsd->csd, 0); + remsd = next; + } + } else +#endif + local_irq_enable(); +} + +static int process_backlog(struct napi_struct *napi, int quota) +{ + int work = 0; + struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); + +#ifdef CONFIG_RPS + /* Check if we have pending ipi, its better to send them now, + * not waiting net_rx_action() end. + */ + if (sd->rps_ipi_list) { + local_irq_disable(); + net_rps_action_and_irq_enable(sd); + } +#endif + napi->weight = weight_p; + local_irq_disable(); + while (work < quota) { + struct sk_buff *skb; + unsigned int qlen; + + while ((skb = __skb_dequeue(&sd->process_queue))) { + local_irq_enable(); + __netif_receive_skb(skb); + local_irq_disable(); + input_queue_head_incr(sd); + if (++work >= quota) { + local_irq_enable(); + return work; + } + } + + rps_lock(sd); + qlen = skb_queue_len(&sd->input_pkt_queue); + if (qlen) + skb_queue_splice_tail_init(&sd->input_pkt_queue, + &sd->process_queue); + + if (qlen < quota - work) { + /* + * Inline a custom version of __napi_complete(). + * only current cpu owns and manipulates this napi, + * and NAPI_STATE_SCHED is the only possible flag set on backlog. + * we can use a plain write instead of clear_bit(), + * and we dont need an smp_mb() memory barrier. 
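napi_gro_receive() above is normally driven from a driver's NAPI poll routine, with napi_complete() handing the context back once the budget is not exhausted. An illustrative fragment of that pattern; mydrv_rx_one(), mydrv_enable_rx_irq() and the private structure layout are assumptions, not part of this patch:

/* Hypothetical NAPI poll routine feeding GRO. */
struct mydrv_priv {
        struct napi_struct napi;
        struct net_device *dev;
};

static int mydrv_poll(struct napi_struct *napi, int budget)
{
        struct mydrv_priv *priv = container_of(napi, struct mydrv_priv, napi);
        struct sk_buff *skb;
        int done = 0;

        while (done < budget && (skb = mydrv_rx_one(priv)) != NULL) {
                skb->protocol = eth_type_trans(skb, priv->dev);
                napi_gro_receive(napi, skb);    /* dev_gro_receive() above */
                done++;
        }
        if (done < budget) {
                napi_complete(napi);
                mydrv_enable_rx_irq(priv);      /* hypothetical helper */
        }
        return done;
}

/* setup in probe:  netif_napi_add(dev, &priv->napi, mydrv_poll, 64);
 * RX interrupt:    disable RX irqs, then napi_schedule(&priv->napi); */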
+ */ + list_del(&napi->poll_list); + napi->state = 0; + + quota = work + qlen; + } + rps_unlock(sd); + } + local_irq_enable(); + + return work; +} + +/** + * __napi_schedule - schedule for receive + * @n: entry to schedule + * + * The entry's receive function will be scheduled to run + */ +void __napi_schedule(struct napi_struct *n) +{ + unsigned long flags; + + local_irq_save(flags); + ____napi_schedule(&__get_cpu_var(softnet_data), n); + local_irq_restore(flags); +} +EXPORT_SYMBOL(__napi_schedule); + +void __napi_complete(struct napi_struct *n) +{ + BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); + BUG_ON(n->gro_list); + + list_del(&n->poll_list); + smp_mb__before_clear_bit(); + clear_bit(NAPI_STATE_SCHED, &n->state); +} +EXPORT_SYMBOL(__napi_complete); + +void napi_complete(struct napi_struct *n) +{ + unsigned long flags; + + /* + * don't let napi dequeue from the cpu poll list + * just in case its running on a different cpu + */ + if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) + return; + + napi_gro_flush(n); + local_irq_save(flags); + __napi_complete(n); + local_irq_restore(flags); +} +EXPORT_SYMBOL(napi_complete); + +void netif_napi_add(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight) +{ + INIT_LIST_HEAD(&napi->poll_list); + napi->gro_count = 0; + napi->gro_list = NULL; + napi->skb = NULL; + napi->poll = poll; + napi->weight = weight; + list_add(&napi->dev_list, &dev->napi_list); + napi->dev = dev; +#ifdef CONFIG_NETPOLL + spin_lock_init(&napi->poll_lock); + napi->poll_owner = -1; +#endif + set_bit(NAPI_STATE_SCHED, &napi->state); +} +EXPORT_SYMBOL(netif_napi_add); + +void netif_napi_del(struct napi_struct *napi) +{ + struct sk_buff *skb, *next; + + list_del_init(&napi->dev_list); + napi_free_frags(napi); + + for (skb = napi->gro_list; skb; skb = next) { + next = skb->next; + skb->next = NULL; + kfree_skb(skb); + } + + napi->gro_list = NULL; + napi->gro_count = 0; +} +EXPORT_SYMBOL(netif_napi_del); + +static void net_rx_action(struct softirq_action *h) +{ + struct softnet_data *sd = &__get_cpu_var(softnet_data); + unsigned long time_limit = jiffies + 2; + int budget = netdev_budget; + void *have; + + local_irq_disable(); + + while (!list_empty(&sd->poll_list)) { + struct napi_struct *n; + int work, weight; + + /* If softirq window is exhuasted then punt. + * Allow this to run for 2 jiffies since which will allow + * an average latency of 1.5/HZ. + */ + if (unlikely(budget <= 0 || time_after(jiffies, time_limit))) + goto softnet_break; + + local_irq_enable(); + + /* Even though interrupts have been re-enabled, this + * access is safe because interrupts can only add new + * entries to the tail of this list, and only ->poll() + * calls can remove this head entry from the list. + */ + n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list); + + have = netpoll_poll_lock(n); + + weight = n->weight; + + /* This NAPI_STATE_SCHED test is for avoiding a race + * with netpoll's poll_napi(). Only the entity which + * obtains the lock and sees NAPI_STATE_SCHED set will + * actually make the ->poll() call. Therefore we avoid + * accidentally calling ->poll() when NAPI is not scheduled. + */ + work = 0; + if (test_bit(NAPI_STATE_SCHED, &n->state)) { + work = n->poll(n, weight); + trace_napi_poll(n); + } + + WARN_ON_ONCE(work > weight); + + budget -= work; + + local_irq_disable(); + + /* Drivers must not modify the NAPI state if they + * consume the entire weight. 
In such cases this code + * still "owns" the NAPI instance and therefore can + * move the instance around on the list at-will. + */ + if (unlikely(work == weight)) { + if (unlikely(napi_disable_pending(n))) { + local_irq_enable(); + napi_complete(n); + local_irq_disable(); + } else + list_move_tail(&n->poll_list, &sd->poll_list); + } + + netpoll_poll_unlock(have); + } +out: + net_rps_action_and_irq_enable(sd); + +#ifdef CONFIG_NET_DMA + /* + * There may not be any more sk_buffs coming right now, so push + * any pending DMA copies to hardware + */ + dma_issue_pending_all(); +#endif + + return; + +softnet_break: + sd->time_squeeze++; + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + goto out; +} + +static gifconf_func_t *gifconf_list[NPROTO]; + +/** + * register_gifconf - register a SIOCGIF handler + * @family: Address family + * @gifconf: Function handler + * + * Register protocol dependent address dumping routines. The handler + * that is passed must not be freed or reused until it has been replaced + * by another handler. + */ +int register_gifconf(unsigned int family, gifconf_func_t *gifconf) +{ + if (family >= NPROTO) + return -EINVAL; + gifconf_list[family] = gifconf; + return 0; +} +EXPORT_SYMBOL(register_gifconf); + + +/* + * Map an interface index to its name (SIOCGIFNAME) + */ + +/* + * We need this ioctl for efficient implementation of the + * if_indextoname() function required by the IPv6 API. Without + * it, we would have to search all the interfaces to find a + * match. --pb + */ + +static int dev_ifname(struct net *net, struct ifreq __user *arg) +{ + struct net_device *dev; + struct ifreq ifr; + + /* + * Fetch the caller's info block. + */ + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return -EFAULT; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex); + if (!dev) { + rcu_read_unlock(); + return -ENODEV; + } + + strcpy(ifr.ifr_name, dev->name); + rcu_read_unlock(); + + if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + return 0; +} + +/* + * Perform a SIOCGIFCONF call. This structure will change + * size eventually, and there is nothing I can do about it. + * Thus we will need a 'compatibility mode'. + */ + +static int dev_ifconf(struct net *net, char __user *arg) +{ + struct ifconf ifc; + struct net_device *dev; + char __user *pos; + int len; + int total; + int i; + + /* + * Fetch the caller's info block. + */ + + if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) + return -EFAULT; + + pos = ifc.ifc_buf; + len = ifc.ifc_len; + + /* + * Loop over the interfaces, and write an info block for each. + */ + + total = 0; + for_each_netdev(net, dev) { + for (i = 0; i < NPROTO; i++) { + if (gifconf_list[i]) { + int done; + if (!pos) + done = gifconf_list[i](dev, NULL, 0); + else + done = gifconf_list[i](dev, pos + total, + len - total); + if (done < 0) + return -EFAULT; + total += done; + } + } + } + + /* + * All done. Write the updated control block back to the caller. + */ + ifc.ifc_len = total; + + /* + * Both BSD and Solaris return 0 here, so we do too. + */ + return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; +} + +#ifdef CONFIG_PROC_FS +/* + * This is invoked by the /proc filesystem handler to display a device + * in detail. 
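dev_ifname() above implements the SIOCGIFNAME ioctl that glibc's if_indextoname() relies on. A minimal userspace sketch exercising it directly; that ifindex 1 is the loopback device is only an assumption:

/* Builds with a normal userspace toolchain; no privileges required. */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <unistd.h>

int main(void)
{
        struct ifreq ifr;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
                return 1;
        memset(&ifr, 0, sizeof(ifr));
        ifr.ifr_ifindex = 1;                    /* usually "lo" */
        if (ioctl(fd, SIOCGIFNAME, &ifr) == 0)
                printf("ifindex 1 is %s\n", ifr.ifr_name);
        else
                perror("SIOCGIFNAME");
        close(fd);
        return 0;
}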
+ */ +void *dev_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + struct net *net = seq_file_net(seq); + loff_t off; + struct net_device *dev; + + rcu_read_lock(); + if (!*pos) + return SEQ_START_TOKEN; + + off = 1; + for_each_netdev_rcu(net, dev) + if (off++ == *pos) + return dev; + + return NULL; +} + +void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct net_device *dev = v; + + if (v == SEQ_START_TOKEN) + dev = first_net_device_rcu(seq_file_net(seq)); + else + dev = next_net_device_rcu(dev); + + ++*pos; + return dev; +} + +void dev_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + +static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) +{ + struct rtnl_link_stats64 temp; + const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); + + seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " + "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n", + dev->name, stats->rx_bytes, stats->rx_packets, + stats->rx_errors, + stats->rx_dropped + stats->rx_missed_errors, + stats->rx_fifo_errors, + stats->rx_length_errors + stats->rx_over_errors + + stats->rx_crc_errors + stats->rx_frame_errors, + stats->rx_compressed, stats->multicast, + stats->tx_bytes, stats->tx_packets, + stats->tx_errors, stats->tx_dropped, + stats->tx_fifo_errors, stats->collisions, + stats->tx_carrier_errors + + stats->tx_aborted_errors + + stats->tx_window_errors + + stats->tx_heartbeat_errors, + stats->tx_compressed); +} + +/* + * Called from the PROCfs module. This now uses the new arbitrary sized + * /proc/net interface to create /proc/net/dev + */ +static int dev_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "Inter-| Receive " + " | Transmit\n" + " face |bytes packets errs drop fifo frame " + "compressed multicast|bytes packets errs " + "drop fifo colls carrier compressed\n"); + else + dev_seq_printf_stats(seq, v); + return 0; +} + +static struct softnet_data *softnet_get_online(loff_t *pos) +{ + struct softnet_data *sd = NULL; + + while (*pos < nr_cpu_ids) + if (cpu_online(*pos)) { + sd = &per_cpu(softnet_data, *pos); + break; + } else + ++*pos; + return sd; +} + +static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) +{ + return softnet_get_online(pos); +} + +static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return softnet_get_online(pos); +} + +static void softnet_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int softnet_seq_show(struct seq_file *seq, void *v) +{ + struct softnet_data *sd = v; + + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", + sd->processed, sd->dropped, sd->time_squeeze, 0, + 0, 0, 0, 0, /* was fastroute */ + sd->cpu_collision, sd->received_rps); + return 0; +} + +static const struct seq_operations dev_seq_ops = { + .start = dev_seq_start, + .next = dev_seq_next, + .stop = dev_seq_stop, + .show = dev_seq_show, +}; + +static int dev_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &dev_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations dev_seq_fops = { + .owner = THIS_MODULE, + .open = dev_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static const struct seq_operations softnet_seq_ops = { + .start = softnet_seq_start, + .next = softnet_seq_next, + .stop = softnet_seq_stop, + .show = softnet_seq_show, +}; + +static int 
softnet_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &softnet_seq_ops); +} + +static const struct file_operations softnet_seq_fops = { + .owner = THIS_MODULE, + .open = softnet_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void *ptype_get_idx(loff_t pos) +{ + struct packet_type *pt = NULL; + loff_t i = 0; + int t; + + list_for_each_entry_rcu(pt, &ptype_all, list) { + if (i == pos) + return pt; + ++i; + } + + for (t = 0; t < PTYPE_HASH_SIZE; t++) { + list_for_each_entry_rcu(pt, &ptype_base[t], list) { + if (i == pos) + return pt; + ++i; + } + } + return NULL; +} + +static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + rcu_read_lock(); + return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; +} + +static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct packet_type *pt; + struct list_head *nxt; + int hash; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ptype_get_idx(0); + + pt = v; + nxt = pt->list.next; + if (pt->type == htons(ETH_P_ALL)) { + if (nxt != &ptype_all) + goto found; + hash = 0; + nxt = ptype_base[0].next; + } else + hash = ntohs(pt->type) & PTYPE_HASH_MASK; + + while (nxt == &ptype_base[hash]) { + if (++hash >= PTYPE_HASH_SIZE) + return NULL; + nxt = ptype_base[hash].next; + } +found: + return list_entry(nxt, struct packet_type, list); +} + +static void ptype_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + +static int ptype_seq_show(struct seq_file *seq, void *v) +{ + struct packet_type *pt = v; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, "Type Device Function\n"); + else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { + if (pt->type == htons(ETH_P_ALL)) + seq_puts(seq, "ALL "); + else + seq_printf(seq, "%04x", ntohs(pt->type)); + + seq_printf(seq, " %-8s %pF\n", + pt->dev ? 
pt->dev->name : "", pt->func); + } + + return 0; +} + +static const struct seq_operations ptype_seq_ops = { + .start = ptype_seq_start, + .next = ptype_seq_next, + .stop = ptype_seq_stop, + .show = ptype_seq_show, +}; + +static int ptype_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ptype_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations ptype_seq_fops = { + .owner = THIS_MODULE, + .open = ptype_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + + +static int __net_init dev_proc_net_init(struct net *net) +{ + int rc = -ENOMEM; + + if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops)) + goto out; + if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops)) + goto out_dev; + if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops)) + goto out_softnet; + + if (wext_proc_init(net)) + goto out_ptype; + rc = 0; +out: + return rc; +out_ptype: + proc_net_remove(net, "ptype"); +out_softnet: + proc_net_remove(net, "softnet_stat"); +out_dev: + proc_net_remove(net, "dev"); + goto out; +} + +static void __net_exit dev_proc_net_exit(struct net *net) +{ + wext_proc_exit(net); + + proc_net_remove(net, "ptype"); + proc_net_remove(net, "softnet_stat"); + proc_net_remove(net, "dev"); +} + +static struct pernet_operations __net_initdata dev_proc_ops = { + .init = dev_proc_net_init, + .exit = dev_proc_net_exit, +}; + +static int __init dev_proc_init(void) +{ + return register_pernet_subsys(&dev_proc_ops); +} +#else +#define dev_proc_init() 0 +#endif /* CONFIG_PROC_FS */ + + +/** + * netdev_set_master - set up master pointer + * @slave: slave device + * @master: new master device + * + * Changes the master device of the slave. Pass %NULL to break the + * bonding. The caller must hold the RTNL semaphore. On a failure + * a negative errno code is returned. On success the reference counts + * are adjusted and the function returns zero. + */ +int netdev_set_master(struct net_device *slave, struct net_device *master) +{ + struct net_device *old = slave->master; + + ASSERT_RTNL(); + + if (master) { + if (old) + return -EBUSY; + dev_hold(master); + } + + slave->master = master; + + if (old) + dev_put(old); + return 0; +} +EXPORT_SYMBOL(netdev_set_master); + +/** + * netdev_set_bond_master - set up bonding master/slave pair + * @slave: slave device + * @master: new master device + * + * Changes the master device of the slave. Pass %NULL to break the + * bonding. The caller must hold the RTNL semaphore. On a failure + * a negative errno code is returned. On success %RTM_NEWLINK is sent + * to the routing socket and the function returns zero. 
+ */ +int netdev_set_bond_master(struct net_device *slave, struct net_device *master) +{ + int err; + + ASSERT_RTNL(); + + err = netdev_set_master(slave, master); + if (err) + return err; + if (master) + slave->flags |= IFF_SLAVE; + else + slave->flags &= ~IFF_SLAVE; + + rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); + return 0; +} +EXPORT_SYMBOL(netdev_set_bond_master); + +static void dev_change_rx_flags(struct net_device *dev, int flags) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags) + ops->ndo_change_rx_flags(dev, flags); +} + +static int __dev_set_promiscuity(struct net_device *dev, int inc) +{ + unsigned short old_flags = dev->flags; + uid_t uid; + gid_t gid; + + ASSERT_RTNL(); + + dev->flags |= IFF_PROMISC; + dev->promiscuity += inc; + if (dev->promiscuity == 0) { + /* + * Avoid overflow. + * If inc causes overflow, untouch promisc and return error. + */ + if (inc < 0) + dev->flags &= ~IFF_PROMISC; + else { + dev->promiscuity -= inc; + printk(KERN_WARNING "%s: promiscuity touches roof, " + "set promiscuity failed, promiscuity feature " + "of device might be broken.\n", dev->name); + return -EOVERFLOW; + } + } + if (dev->flags != old_flags) { + printk(KERN_INFO "device %s %s promiscuous mode\n", + dev->name, (dev->flags & IFF_PROMISC) ? "entered" : + "left"); + if (audit_enabled) { + current_uid_gid(&uid, &gid); + audit_log(current->audit_context, GFP_ATOMIC, + AUDIT_ANOM_PROMISCUOUS, + "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", + dev->name, (dev->flags & IFF_PROMISC), + (old_flags & IFF_PROMISC), + audit_get_loginuid(current), + uid, gid, + audit_get_sessionid(current)); + } + + dev_change_rx_flags(dev, IFF_PROMISC); + } + return 0; +} + +/** + * dev_set_promiscuity - update promiscuity count on a device + * @dev: device + * @inc: modifier + * + * Add or remove promiscuity from a device. While the count in the device + * remains above zero the interface remains promiscuous. Once it hits zero + * the device reverts back to normal filtering operation. A negative inc + * value is used to drop promiscuity on the device. + * Return 0 if successful or a negative errno code on error. + */ +int dev_set_promiscuity(struct net_device *dev, int inc) +{ + unsigned short old_flags = dev->flags; + int err; + + err = __dev_set_promiscuity(dev, inc); + if (err < 0) + return err; + if (dev->flags != old_flags) + dev_set_rx_mode(dev); + return err; +} +EXPORT_SYMBOL(dev_set_promiscuity); + +/** + * dev_set_allmulti - update allmulti count on a device + * @dev: device + * @inc: modifier + * + * Add or remove reception of all multicast frames to a device. While the + * count in the device remains above zero the interface remains listening + * to all interfaces. Once it hits zero the device reverts back to normal + * filtering operation. A negative @inc value is used to drop the counter + * when releasing a resource needing all multicasts. + * Return 0 if successful or a negative errno code on error. + */ + +int dev_set_allmulti(struct net_device *dev, int inc) +{ + unsigned short old_flags = dev->flags; + + ASSERT_RTNL(); + + dev->flags |= IFF_ALLMULTI; + dev->allmulti += inc; + if (dev->allmulti == 0) { + /* + * Avoid overflow. + * If inc causes overflow, untouch allmulti and return error. 
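dev_set_promiscuity() above keeps a counter rather than a flag, so independent in-kernel users can overlap; each user is expected to balance a +1 with a -1. An illustrative fragment of that pairing for a hypothetical consumer; the mytap_* names are assumptions and both calls are made under RTNL:

/* Hypothetical in-kernel consumer of promiscuous mode. */
static int mytap_start_capture(struct net_device *dev)
{
        return dev_set_promiscuity(dev, 1);     /* count++, may set IFF_PROMISC */
}

static void mytap_stop_capture(struct net_device *dev)
{
        dev_set_promiscuity(dev, -1);           /* count--, may clear IFF_PROMISC */
}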
+ */ + if (inc < 0) + dev->flags &= ~IFF_ALLMULTI; + else { + dev->allmulti -= inc; + printk(KERN_WARNING "%s: allmulti touches roof, " + "set allmulti failed, allmulti feature of " + "device might be broken.\n", dev->name); + return -EOVERFLOW; + } + } + if (dev->flags ^ old_flags) { + dev_change_rx_flags(dev, IFF_ALLMULTI); + dev_set_rx_mode(dev); + } + return 0; +} +EXPORT_SYMBOL(dev_set_allmulti); + +/* + * Upload unicast and multicast address lists to device and + * configure RX filtering. When the device doesn't support unicast + * filtering it is put in promiscuous mode while unicast addresses + * are present. + */ +void __dev_set_rx_mode(struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + /* dev_open will call this function so the list will stay sane. */ + if (!(dev->flags&IFF_UP)) + return; + + if (!netif_device_present(dev)) + return; + + if (ops->ndo_set_rx_mode) + ops->ndo_set_rx_mode(dev); + else { + /* Unicast addresses changes may only happen under the rtnl, + * therefore calling __dev_set_promiscuity here is safe. + */ + if (!netdev_uc_empty(dev) && !dev->uc_promisc) { + __dev_set_promiscuity(dev, 1); + dev->uc_promisc = 1; + } else if (netdev_uc_empty(dev) && dev->uc_promisc) { + __dev_set_promiscuity(dev, -1); + dev->uc_promisc = 0; + } + + if (ops->ndo_set_multicast_list) + ops->ndo_set_multicast_list(dev); + } +} + +void dev_set_rx_mode(struct net_device *dev) +{ + netif_addr_lock_bh(dev); + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); +} + +/** + * dev_ethtool_get_settings - call device's ethtool_ops::get_settings() + * @dev: device + * @cmd: memory area for ethtool_ops::get_settings() result + * + * The cmd arg is initialized properly (cleared and + * ethtool_cmd::cmd field set to ETHTOOL_GSET). + * + * Return device's ethtool_ops::get_settings() result value or + * -EOPNOTSUPP when device doesn't expose + * ethtool_ops::get_settings() operation. + */ +int dev_ethtool_get_settings(struct net_device *dev, + struct ethtool_cmd *cmd) +{ + if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings) + return -EOPNOTSUPP; + + memset(cmd, 0, sizeof(struct ethtool_cmd)); + cmd->cmd = ETHTOOL_GSET; + return dev->ethtool_ops->get_settings(dev, cmd); +} +EXPORT_SYMBOL(dev_ethtool_get_settings); + +/** + * dev_get_flags - get flags reported to userspace + * @dev: device + * + * Get the combination of flag bits exported through APIs to userspace. + */ +unsigned dev_get_flags(const struct net_device *dev) +{ + unsigned flags; + + flags = (dev->flags & ~(IFF_PROMISC | + IFF_ALLMULTI | + IFF_RUNNING | + IFF_LOWER_UP | + IFF_DORMANT)) | + (dev->gflags & (IFF_PROMISC | + IFF_ALLMULTI)); + + if (netif_running(dev)) { + if (netif_oper_up(dev)) + flags |= IFF_RUNNING; + if (netif_carrier_ok(dev)) + flags |= IFF_LOWER_UP; + if (netif_dormant(dev)) + flags |= IFF_DORMANT; + } + + return flags; +} +EXPORT_SYMBOL(dev_get_flags); + +int __dev_change_flags(struct net_device *dev, unsigned int flags) +{ + int old_flags = dev->flags; + int ret; + + ASSERT_RTNL(); + + /* + * Set the flags on our device. + */ + + dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | + IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | + IFF_AUTOMEDIA)) | + (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | + IFF_ALLMULTI)); + + /* + * Load in the correct multicast list now the flags have changed. + */ + + if ((old_flags ^ flags) & IFF_MULTICAST) + dev_change_rx_flags(dev, IFF_MULTICAST); + + dev_set_rx_mode(dev); + + /* + * Have we downed the interface. 
We handle IFF_UP ourselves + * according to user attempts to set it, rather than blindly + * setting it. + */ + + ret = 0; + if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ + ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); + + if (!ret) + dev_set_rx_mode(dev); + } + + if ((flags ^ dev->gflags) & IFF_PROMISC) { + int inc = (flags & IFF_PROMISC) ? 1 : -1; + + dev->gflags ^= IFF_PROMISC; + dev_set_promiscuity(dev, inc); + } + + /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI + is important. Some (broken) drivers set IFF_PROMISC, when + IFF_ALLMULTI is requested not asking us and not reporting. + */ + if ((flags ^ dev->gflags) & IFF_ALLMULTI) { + int inc = (flags & IFF_ALLMULTI) ? 1 : -1; + + dev->gflags ^= IFF_ALLMULTI; + dev_set_allmulti(dev, inc); + } + + return ret; +} + +void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) +{ + unsigned int changes = dev->flags ^ old_flags; + + if (changes & IFF_UP) { + if (dev->flags & IFF_UP) + call_netdevice_notifiers(NETDEV_UP, dev); + else + call_netdevice_notifiers(NETDEV_DOWN, dev); + } + + if (dev->flags & IFF_UP && + (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) + call_netdevice_notifiers(NETDEV_CHANGE, dev); +} + +/** + * dev_change_flags - change device settings + * @dev: device + * @flags: device state flags + * + * Change settings on device based state flags. The flags are + * in the userspace exported format. + */ +int dev_change_flags(struct net_device *dev, unsigned flags) +{ + int ret, changes; + int old_flags = dev->flags; + + ret = __dev_change_flags(dev, flags); + if (ret < 0) + return ret; + + changes = old_flags ^ dev->flags; + if (changes) + rtmsg_ifinfo(RTM_NEWLINK, dev, changes); + + __dev_notify_flags(dev, old_flags); + return ret; +} +EXPORT_SYMBOL(dev_change_flags); + +/** + * dev_set_mtu - Change maximum transfer unit + * @dev: device + * @new_mtu: new transfer unit + * + * Change the maximum transfer size of the network device. + */ +int dev_set_mtu(struct net_device *dev, int new_mtu) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int err; + + if (new_mtu == dev->mtu) + return 0; + + /* MTU must be positive. 
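dev_change_flags() above is what the SIOCSIFFLAGS path in dev_ifsioc() below ends up calling; IFF_PROMISC and IFF_ALLMULTI requests are tracked via dev->gflags so userspace and in-kernel users do not fight over the counters. A minimal userspace sketch of the usual get/modify/set sequence (needs CAP_NET_ADMIN; "eth0" is only an example name):

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <unistd.h>

int main(void)
{
        struct ifreq ifr;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
                return 1;
        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);

        if (ioctl(fd, SIOCGIFFLAGS, &ifr) < 0) {        /* read current flags */
                perror("SIOCGIFFLAGS");
                close(fd);
                return 1;
        }
        ifr.ifr_flags |= IFF_PROMISC;                   /* request promiscuous mode */
        if (ioctl(fd, SIOCSIFFLAGS, &ifr) < 0)          /* routed to dev_change_flags() */
                perror("SIOCSIFFLAGS");

        close(fd);
        return 0;
}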
*/ + if (new_mtu < 0) + return -EINVAL; + + if (!netif_device_present(dev)) + return -ENODEV; + + err = 0; + if (ops->ndo_change_mtu) + err = ops->ndo_change_mtu(dev, new_mtu); + else + dev->mtu = new_mtu; + + if (!err && dev->flags & IFF_UP) + call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); + return err; +} +EXPORT_SYMBOL(dev_set_mtu); + +/** + * dev_set_group - Change group this device belongs to + * @dev: device + * @new_group: group this device should belong to + */ +void dev_set_group(struct net_device *dev, int new_group) +{ + dev->group = new_group; +} +EXPORT_SYMBOL(dev_set_group); + +/** + * dev_set_mac_address - Change Media Access Control Address + * @dev: device + * @sa: new address + * + * Change the hardware (MAC) address of the device + */ +int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int err; + + if (!ops->ndo_set_mac_address) + return -EOPNOTSUPP; + if (sa->sa_family != dev->type) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + err = ops->ndo_set_mac_address(dev, sa); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; +} +EXPORT_SYMBOL(dev_set_mac_address); + +/* + * Perform the SIOCxIFxxx calls, inside rcu_read_lock() + */ +static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) +{ + int err; + struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name); + + if (!dev) + return -ENODEV; + + switch (cmd) { + case SIOCGIFFLAGS: /* Get interface flags */ + ifr->ifr_flags = (short) dev_get_flags(dev); + return 0; + + case SIOCGIFMETRIC: /* Get the metric on the interface + (currently unused) */ + ifr->ifr_metric = 0; + return 0; + + case SIOCGIFMTU: /* Get the MTU of a device */ + ifr->ifr_mtu = dev->mtu; + return 0; + + case SIOCGIFHWADDR: + if (!dev->addr_len) + memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); + else + memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, + min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + ifr->ifr_hwaddr.sa_family = dev->type; + return 0; + + case SIOCGIFSLAVE: + err = -EINVAL; + break; + + case SIOCGIFMAP: + ifr->ifr_map.mem_start = dev->mem_start; + ifr->ifr_map.mem_end = dev->mem_end; + ifr->ifr_map.base_addr = dev->base_addr; + ifr->ifr_map.irq = dev->irq; + ifr->ifr_map.dma = dev->dma; + ifr->ifr_map.port = dev->if_port; + return 0; + + case SIOCGIFINDEX: + ifr->ifr_ifindex = dev->ifindex; + return 0; + + case SIOCGIFTXQLEN: + ifr->ifr_qlen = dev->tx_queue_len; + return 0; + + default: + /* dev_ioctl() should ensure this case + * is never reached + */ + WARN_ON(1); + err = -ENOTTY; + break; + + } + return err; +} + +/* + * Perform the SIOCxIFxxx calls, inside rtnl_lock() + */ +static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) +{ + int err; + struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); + const struct net_device_ops *ops; + + if (!dev) + return -ENODEV; + + ops = dev->netdev_ops; + + switch (cmd) { + case SIOCSIFFLAGS: /* Set interface flags */ + return dev_change_flags(dev, ifr->ifr_flags); + + case SIOCSIFMETRIC: /* Set the metric on the interface + (currently unused) */ + return -EOPNOTSUPP; + + case SIOCSIFMTU: /* Set the MTU of a device */ + return dev_set_mtu(dev, ifr->ifr_mtu); + + case SIOCSIFHWADDR: + return dev_set_mac_address(dev, &ifr->ifr_hwaddr); + + case SIOCSIFHWBROADCAST: + if (ifr->ifr_hwaddr.sa_family != dev->type) + return -EINVAL; + memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, + 
min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return 0; + + case SIOCSIFMAP: + if (ops->ndo_set_config) { + if (!netif_device_present(dev)) + return -ENODEV; + return ops->ndo_set_config(dev, &ifr->ifr_map); + } + return -EOPNOTSUPP; + + case SIOCADDMULTI: + if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || + ifr->ifr_hwaddr.sa_family != AF_UNSPEC) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); + + case SIOCDELMULTI: + if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || + ifr->ifr_hwaddr.sa_family != AF_UNSPEC) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); + + case SIOCSIFTXQLEN: + if (ifr->ifr_qlen < 0) + return -EINVAL; + dev->tx_queue_len = ifr->ifr_qlen; + return 0; + + case SIOCSIFNAME: + ifr->ifr_newname[IFNAMSIZ-1] = '\0'; + return dev_change_name(dev, ifr->ifr_newname); + + /* + * Unknown or private ioctl + */ + default: + if ((cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15) || + cmd == SIOCBONDENSLAVE || + cmd == SIOCBONDRELEASE || + cmd == SIOCBONDSETHWADDR || + cmd == SIOCBONDSLAVEINFOQUERY || + cmd == SIOCBONDINFOQUERY || + cmd == SIOCBONDCHANGEACTIVE || + cmd == SIOCGMIIPHY || + cmd == SIOCGMIIREG || + cmd == SIOCSMIIREG || + cmd == SIOCBRADDIF || + cmd == SIOCBRDELIF || + cmd == SIOCSHWTSTAMP || + cmd == SIOCWANDEV) { + err = -EOPNOTSUPP; + if (ops->ndo_do_ioctl) { + if (netif_device_present(dev)) + err = ops->ndo_do_ioctl(dev, ifr, cmd); + else + err = -ENODEV; + } + } else + err = -EINVAL; + + } + return err; +} + +/* + * This function handles all "interface"-type I/O control requests. The actual + * 'doing' part of this is dev_ifsioc above. + */ + +/** + * dev_ioctl - network device ioctl + * @net: the applicable net namespace + * @cmd: command to issue + * @arg: pointer to a struct ifreq in user space + * + * Issue ioctl functions to devices. This is normally called by the + * user space syscall interfaces but can sometimes be useful for + * other purposes. The return value is the return from the syscall if + * positive or a negative errno code on error. + */ + +int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) +{ + struct ifreq ifr; + int ret; + char *colon; + + /* One special case: SIOCGIFCONF takes ifconf argument + and requires shared lock, because it sleeps writing + to user space. + */ + + if (cmd == SIOCGIFCONF) { + rtnl_lock(); + ret = dev_ifconf(net, (char __user *) arg); + rtnl_unlock(); + return ret; + } + if (cmd == SIOCGIFNAME) + return dev_ifname(net, (struct ifreq __user *)arg); + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return -EFAULT; + + ifr.ifr_name[IFNAMSIZ-1] = 0; + + colon = strchr(ifr.ifr_name, ':'); + if (colon) + *colon = 0; + + /* + * See which interface the caller is talking about. + */ + + switch (cmd) { + /* + * These ioctl calls: + * - can be done by all. + * - atomic and do not require locking. 
+ * - return a value + */ + case SIOCGIFFLAGS: + case SIOCGIFMETRIC: + case SIOCGIFMTU: + case SIOCGIFHWADDR: + case SIOCGIFSLAVE: + case SIOCGIFMAP: + case SIOCGIFINDEX: + case SIOCGIFTXQLEN: + dev_load(net, ifr.ifr_name); + rcu_read_lock(); + ret = dev_ifsioc_locked(net, &ifr, cmd); + rcu_read_unlock(); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; + + case SIOCETHTOOL: + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ethtool(net, &ifr); + rtnl_unlock(); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; + + /* + * These ioctl calls: + * - require superuser power. + * - require strict serialization. + * - return a value + */ + case SIOCGMIIPHY: + case SIOCGMIIREG: + case SIOCSIFNAME: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(net, &ifr, cmd); + rtnl_unlock(); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; + + /* + * These ioctl calls: + * - require superuser power. + * - require strict serialization. + * - do not return a value + */ + case SIOCSIFFLAGS: + case SIOCSIFMETRIC: + case SIOCSIFMTU: + case SIOCSIFMAP: + case SIOCSIFHWADDR: + case SIOCSIFSLAVE: + case SIOCADDMULTI: + case SIOCDELMULTI: + case SIOCSIFHWBROADCAST: + case SIOCSIFTXQLEN: + case SIOCSMIIREG: + case SIOCBONDENSLAVE: + case SIOCBONDRELEASE: + case SIOCBONDSETHWADDR: + case SIOCBONDCHANGEACTIVE: + case SIOCBRADDIF: + case SIOCBRDELIF: + case SIOCSHWTSTAMP: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + /* fall through */ + case SIOCBONDSLAVEINFOQUERY: + case SIOCBONDINFOQUERY: + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(net, &ifr, cmd); + rtnl_unlock(); + return ret; + + case SIOCGIFMEM: + /* Get the per device memory space. We can add this but + * currently do not support it */ + case SIOCSIFMEM: + /* Set the per device memory buffer space. + * Not applicable in our case */ + case SIOCSIFLINK: + return -ENOTTY; + + /* + * Unknown or private ioctl. + */ + default: + if (cmd == SIOCWANDEV || + (cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15)) { + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(net, &ifr, cmd); + rtnl_unlock(); + if (!ret && copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + return ret; + } + /* Take care of Wireless Extensions */ + if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) + return wext_handle_ioctl(net, &ifr, cmd, arg); + return -ENOTTY; + } +} + + +/** + * dev_new_index - allocate an ifindex + * @net: the applicable net namespace + * + * Returns a suitable unique value for a new device interface + * number. The caller must hold the rtnl semaphore or the + * dev_base_lock to be sure it remains unique. 
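The "can be done by all" group dispatched above runs lockless under rcu_read_lock() via dev_ifsioc_locked(). A minimal userspace sketch querying two of them, SIOCGIFMTU and SIOCGIFHWADDR ("eth0" is only an example name):

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <unistd.h>

int main(void)
{
        struct ifreq ifr;
        unsigned char *a;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
                return 1;
        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);

        if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
                printf("mtu %d\n", ifr.ifr_mtu);

        if (ioctl(fd, SIOCGIFHWADDR, &ifr) == 0) {
                a = (unsigned char *)ifr.ifr_hwaddr.sa_data;
                printf("hwaddr %02x:%02x:%02x:%02x:%02x:%02x\n",
                       a[0], a[1], a[2], a[3], a[4], a[5]);
        }
        close(fd);
        return 0;
}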
+ */ +static int dev_new_index(struct net *net) +{ + static int ifindex; + for (;;) { + if (++ifindex <= 0) + ifindex = 1; + if (!__dev_get_by_index(net, ifindex)) + return ifindex; + } +} + +/* Delayed registration/unregisteration */ +static LIST_HEAD(net_todo_list); + +static void net_set_todo(struct net_device *dev) +{ + list_add_tail(&dev->todo_list, &net_todo_list); +} + +static void rollback_registered_many(struct list_head *head) +{ + struct net_device *dev, *tmp; + + BUG_ON(dev_boot_phase); + ASSERT_RTNL(); + + list_for_each_entry_safe(dev, tmp, head, unreg_list) { + /* Some devices call without registering + * for initialization unwind. Remove those + * devices and proceed with the remaining. + */ + if (dev->reg_state == NETREG_UNINITIALIZED) { + pr_debug("unregister_netdevice: device %s/%p never " + "was registered\n", dev->name, dev); + + WARN_ON(1); + list_del(&dev->unreg_list); + continue; + } + dev->dismantle = true; + BUG_ON(dev->reg_state != NETREG_REGISTERED); + } + + /* If device is running, close it first. */ + dev_close_many(head); + + list_for_each_entry(dev, head, unreg_list) { + /* And unlink it from device chain. */ + unlist_netdevice(dev); + + dev->reg_state = NETREG_UNREGISTERING; + } + + synchronize_net(); + + list_for_each_entry(dev, head, unreg_list) { + /* Shutdown queueing discipline. */ + dev_shutdown(dev); + + + /* Notify protocols, that we are about to destroy + this device. They should clean all the things. + */ + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); + + /* + * Flush the unicast and multicast chains + */ + dev_uc_flush(dev); + dev_mc_flush(dev); + + if (dev->netdev_ops->ndo_uninit) + dev->netdev_ops->ndo_uninit(dev); + + /* Notifier chain MUST detach us from master device. */ + WARN_ON(dev->master); + + /* Remove entries from kobject tree */ + netdev_unregister_kobject(dev); + } + + /* Process any work delayed until the end of the batch */ + dev = list_first_entry(head, struct net_device, unreg_list); + call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); + + rcu_barrier(); + + list_for_each_entry(dev, head, unreg_list) + dev_put(dev); +} + +static void rollback_registered(struct net_device *dev) +{ + LIST_HEAD(single); + + list_add(&dev->unreg_list, &single); + rollback_registered_many(&single); + list_del(&single); +} + +u32 netdev_fix_features(struct net_device *dev, u32 features) +{ + /* Fix illegal checksum combinations */ + if ((features & NETIF_F_HW_CSUM) && + (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { + netdev_warn(dev, "mixed HW and IP checksum settings.\n"); + features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); + } + + if ((features & NETIF_F_NO_CSUM) && + (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { + netdev_warn(dev, "mixed no checksumming and other settings.\n"); + features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM); + } + + /* Fix illegal SG+CSUM combinations. */ + if ((features & NETIF_F_SG) && + !(features & NETIF_F_ALL_CSUM)) { + netdev_dbg(dev, + "Dropping NETIF_F_SG since no checksum feature.\n"); + features &= ~NETIF_F_SG; + } + + /* TSO requires that SG is present as well. */ + if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { + netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); + features &= ~NETIF_F_ALL_TSO; + } + + /* TSO ECN requires that TSO is present as well. 
*/ + if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) + features &= ~NETIF_F_TSO_ECN; + + /* Software GSO depends on SG. */ + if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { + netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); + features &= ~NETIF_F_GSO; + } + + /* UFO needs SG and checksumming */ + if (features & NETIF_F_UFO) { + /* maybe split UFO into V4 and V6? */ + if (!((features & NETIF_F_GEN_CSUM) || + (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) + == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { + netdev_dbg(dev, + "Dropping NETIF_F_UFO since no checksum offload features.\n"); + features &= ~NETIF_F_UFO; + } + + if (!(features & NETIF_F_SG)) { + netdev_dbg(dev, + "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n"); + features &= ~NETIF_F_UFO; + } + } + + return features; +} +EXPORT_SYMBOL(netdev_fix_features); + +int __netdev_update_features(struct net_device *dev) +{ + u32 features; + int err = 0; + + ASSERT_RTNL(); + + features = netdev_get_wanted_features(dev); + + if (dev->netdev_ops->ndo_fix_features) + features = dev->netdev_ops->ndo_fix_features(dev, features); + + /* driver might be less strict about feature dependencies */ + features = netdev_fix_features(dev, features); + + if (dev->features == features) + return 0; + + netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n", + dev->features, features); + + if (dev->netdev_ops->ndo_set_features) + err = dev->netdev_ops->ndo_set_features(dev, features); + + if (unlikely(err < 0)) { + netdev_err(dev, + "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n", + err, features, dev->features); + return -1; + } + + if (!err) + dev->features = features; + + return 1; +} + +/** + * netdev_update_features - recalculate device features + * @dev: the device to check + * + * Recalculate dev->features set and send notifications if it + * has changed. Should be called after driver or hardware dependent + * conditions might have changed that influence the features. + */ +void netdev_update_features(struct net_device *dev) +{ + if (__netdev_update_features(dev)) + netdev_features_change(dev); +} +EXPORT_SYMBOL(netdev_update_features); + +/** + * netdev_change_features - recalculate device features + * @dev: the device to check + * + * Recalculate dev->features set and send notifications even + * if they have not changed. Should be called instead of + * netdev_update_features() if also dev->vlan_features might + * have changed to allow the changes to be propagated to stacked + * VLAN devices. + */ +void netdev_change_features(struct net_device *dev) +{ + __netdev_update_features(dev); + netdev_features_change(dev); +} +EXPORT_SYMBOL(netdev_change_features); + +/** + * netif_stacked_transfer_operstate - transfer operstate + * @rootdev: the root or lower level device to transfer state from + * @dev: the device to transfer operstate to + * + * Transfer operational state from root to device. This is normally + * called when a stacking relationship exists between the root + * device and the device(a leaf device). 
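+ *
+ * Illustrative use only (device names are placeholders): a stacking
+ * driver such as a VLAN or macvlan driver would typically call this
+ * from its netdevice notifier when the lower device changes state:
+ *
+ *	netif_stacked_transfer_operstate(lowerdev, upperdev);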
+ */ +void netif_stacked_transfer_operstate(const struct net_device *rootdev, + struct net_device *dev) +{ + if (rootdev->operstate == IF_OPER_DORMANT) + netif_dormant_on(dev); + else + netif_dormant_off(dev); + + if (netif_carrier_ok(rootdev)) { + if (!netif_carrier_ok(dev)) + netif_carrier_on(dev); + } else { + if (netif_carrier_ok(dev)) + netif_carrier_off(dev); + } +} +EXPORT_SYMBOL(netif_stacked_transfer_operstate); + +#ifdef CONFIG_RPS +static int netif_alloc_rx_queues(struct net_device *dev) +{ + unsigned int i, count = dev->num_rx_queues; + struct netdev_rx_queue *rx; + + BUG_ON(count < 1); + + rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); + if (!rx) { + pr_err("netdev: Unable to allocate %u rx queues.\n", count); + return -ENOMEM; + } + dev->_rx = rx; + + for (i = 0; i < count; i++) + rx[i].dev = dev; + return 0; +} +#endif + +static void netdev_init_one_queue(struct net_device *dev, + struct netdev_queue *queue, void *_unused) +{ + /* Initialize queue lock */ + spin_lock_init(&queue->_xmit_lock); + netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); + queue->xmit_lock_owner = -1; + netdev_queue_numa_node_write(queue, NUMA_NO_NODE); + queue->dev = dev; +} + +static int netif_alloc_netdev_queues(struct net_device *dev) +{ + unsigned int count = dev->num_tx_queues; + struct netdev_queue *tx; + + BUG_ON(count < 1); + + tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL); + if (!tx) { + pr_err("netdev: Unable to allocate %u tx queues.\n", + count); + return -ENOMEM; + } + dev->_tx = tx; + + netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); + spin_lock_init(&dev->tx_global_lock); + + return 0; +} + +/** + * register_netdevice - register a network device + * @dev: device to register + * + * Take a completed network device structure and add it to the kernel + * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier + * chain. 0 is returned on success. A negative errno code is returned + * on a failure to set up the device, or if the name is a duplicate. + * + * Callers must hold the rtnl semaphore. You may want + * register_netdev() instead of this. + * + * BUGS: + * The locking appears insufficient to guarantee two parallel registers + * will not get the same name. + */ + +int register_netdevice(struct net_device *dev) +{ + int ret; + struct net *net = dev_net(dev); + + BUG_ON(dev_boot_phase); + ASSERT_RTNL(); + + might_sleep(); + + /* When net_device's are persistent, this will be fatal. */ + BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); + BUG_ON(!net); + + spin_lock_init(&dev->addr_list_lock); + netdev_set_addr_lockdep_class(dev); + + dev->iflink = -1; + + ret = dev_get_valid_name(dev, dev->name); + if (ret < 0) + goto out; + + /* Init, if this function is available */ + if (dev->netdev_ops->ndo_init) { + ret = dev->netdev_ops->ndo_init(dev); + if (ret) { + if (ret > 0) + ret = -EIO; + goto out; + } + } + + dev->ifindex = dev_new_index(net); + if (dev->iflink == -1) + dev->iflink = dev->ifindex; + + /* Transfer changeable features to wanted_features and enable + * software offloads (GSO and GRO). 
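+	 * NETIF_F_SOFT_FEATURES is the mask covering those two
+	 * software-implemented offloads (NETIF_F_GSO | NETIF_F_GRO).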
+ */ + dev->hw_features |= NETIF_F_SOFT_FEATURES; + dev->features |= NETIF_F_SOFT_FEATURES; + dev->wanted_features = dev->features & dev->hw_features; + + /* Turn on no cache copy if HW is doing checksum */ + dev->hw_features |= NETIF_F_NOCACHE_COPY; + if ((dev->features & NETIF_F_ALL_CSUM) && + !(dev->features & NETIF_F_NO_CSUM)) { + dev->wanted_features |= NETIF_F_NOCACHE_COPY; + dev->features |= NETIF_F_NOCACHE_COPY; + } + + /* Enable GRO and NETIF_F_HIGHDMA for vlans by default, + * vlan_dev_init() will do the dev->features check, so these features + * are enabled only if supported by underlying device. + */ + dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA); + + ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); + ret = notifier_to_errno(ret); + if (ret) + goto err_uninit; + + ret = netdev_register_kobject(dev); + if (ret) + goto err_uninit; + dev->reg_state = NETREG_REGISTERED; + + __netdev_update_features(dev); + + /* + * Default initial state at registry is that the + * device is present. + */ + + set_bit(__LINK_STATE_PRESENT, &dev->state); + + dev_init_scheduler(dev); + dev_hold(dev); + list_netdevice(dev); + + /* Notify protocols, that a new device appeared. */ + ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); + ret = notifier_to_errno(ret); + if (ret) { + rollback_registered(dev); + dev->reg_state = NETREG_UNREGISTERED; + } + /* + * Prevent userspace races by waiting until the network + * device is fully setup before sending notifications. + */ + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + +out: + return ret; + +err_uninit: + if (dev->netdev_ops->ndo_uninit) + dev->netdev_ops->ndo_uninit(dev); + goto out; +} +EXPORT_SYMBOL(register_netdevice); + +/** + * init_dummy_netdev - init a dummy network device for NAPI + * @dev: device to init + * + * This takes a network device structure and initialize the minimum + * amount of fields so it can be used to schedule NAPI polls without + * registering a full blown interface. This is to be used by drivers + * that need to tie several hardware interfaces to a single NAPI + * poll scheduler due to HW limitations. + */ +int init_dummy_netdev(struct net_device *dev) +{ + /* Clear everything. Note we don't initialize spinlocks + * are they aren't supposed to be taken by any of the + * NAPI code and this dummy netdev is supposed to be + * only ever used for NAPI polls + */ + memset(dev, 0, sizeof(struct net_device)); + + /* make sure we BUG if trying to hit standard + * register/unregister code path + */ + dev->reg_state = NETREG_DUMMY; + + /* NAPI wants this */ + INIT_LIST_HEAD(&dev->napi_list); + + /* a dummy interface is started by default */ + set_bit(__LINK_STATE_PRESENT, &dev->state); + set_bit(__LINK_STATE_START, &dev->state); + + /* Note : We dont allocate pcpu_refcnt for dummy devices, + * because users of this 'device' dont need to change + * its refcount. + */ + + return 0; +} +EXPORT_SYMBOL_GPL(init_dummy_netdev); + + +/** + * register_netdev - register a network device + * @dev: device to register + * + * Take a completed network device structure and add it to the kernel + * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier + * chain. 0 is returned on success. A negative errno code is returned + * on a failure to set up the device, or if the name is a duplicate. + * + * This is a wrapper around register_netdevice that takes the rtnl semaphore + * and expands the device name if you passed a format string to + * alloc_netdev. 
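+ *
+ * Typical usage (illustrative sketch only; "struct my_priv" and
+ * my_setup() are placeholder names, not part of this file):
+ *
+ *	dev = alloc_netdev(sizeof(struct my_priv), "eth%d", my_setup);
+ *	if (!dev)
+ *		return -ENOMEM;
+ *	err = register_netdev(dev);
+ *	if (err)
+ *		free_netdev(dev);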
+ */ +int register_netdev(struct net_device *dev) +{ + int err; + + rtnl_lock(); + err = register_netdevice(dev); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(register_netdev); + +int netdev_refcnt_read(const struct net_device *dev) +{ + int i, refcnt = 0; + + for_each_possible_cpu(i) + refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); + return refcnt; +} +EXPORT_SYMBOL(netdev_refcnt_read); + +/* + * netdev_wait_allrefs - wait until all references are gone. + * + * This is called when unregistering network devices. + * + * Any protocol or device that holds a reference should register + * for netdevice notification, and cleanup and put back the + * reference if they receive an UNREGISTER event. + * We can get stuck here if buggy protocols don't correctly + * call dev_put. + */ +static void netdev_wait_allrefs(struct net_device *dev) +{ + unsigned long rebroadcast_time, warning_time; + int refcnt; + + linkwatch_forget_dev(dev); + + rebroadcast_time = warning_time = jiffies; + refcnt = netdev_refcnt_read(dev); + + while (refcnt != 0) { + if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { + rtnl_lock(); + + /* Rebroadcast unregister notification */ + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users + * should have already handle it the first time */ + + if (test_bit(__LINK_STATE_LINKWATCH_PENDING, + &dev->state)) { + /* We must not have linkwatch events + * pending on unregister. If this + * happens, we simply run the queue + * unscheduled, resulting in a noop + * for this device. + */ + linkwatch_run_queue(); + } + + __rtnl_unlock(); + + rebroadcast_time = jiffies; + } + + msleep(250); + + refcnt = netdev_refcnt_read(dev); + + if (time_after(jiffies, warning_time + 10 * HZ)) { + printk(KERN_EMERG "unregister_netdevice: " + "waiting for %s to become free. Usage " + "count = %d\n", + dev->name, refcnt); + warning_time = jiffies; + } + } +} + +/* The sequence is: + * + * rtnl_lock(); + * ... + * register_netdevice(x1); + * register_netdevice(x2); + * ... + * unregister_netdevice(y1); + * unregister_netdevice(y2); + * ... + * rtnl_unlock(); + * free_netdev(y1); + * free_netdev(y2); + * + * We are invoked by rtnl_unlock(). + * This allows us to deal with problems: + * 1) We can delete sysfs objects which invoke hotplug + * without deadlocking with linkwatch via keventd. + * 2) Since we run with the RTNL semaphore not held, we can sleep + * safely in order to wait for the netdev refcnt to drop to zero. + * + * We must not return until all unregister events added during + * the interval the lock was held have been completed. 
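+ *
+ * Concretely: unregister_netdevice() queues the device on
+ * net_todo_list via net_set_todo(), and rtnl_unlock() drains that
+ * list by calling netdev_run_todo() below.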
+ */ +void netdev_run_todo(void) +{ + struct list_head list; + + /* Snapshot list, allow later requests */ + list_replace_init(&net_todo_list, &list); + + __rtnl_unlock(); + + while (!list_empty(&list)) { + struct net_device *dev + = list_first_entry(&list, struct net_device, todo_list); + list_del(&dev->todo_list); + + if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { + printk(KERN_ERR "network todo '%s' but state %d\n", + dev->name, dev->reg_state); + dump_stack(); + continue; + } + + dev->reg_state = NETREG_UNREGISTERED; + + on_each_cpu(flush_backlog, dev, 1); + + netdev_wait_allrefs(dev); + + /* paranoia */ + BUG_ON(netdev_refcnt_read(dev)); + WARN_ON(rcu_dereference_raw(dev->ip_ptr)); + WARN_ON(rcu_dereference_raw(dev->ip6_ptr)); + WARN_ON(dev->dn_ptr); + + if (dev->destructor) + dev->destructor(dev); + + /* Free network device */ + kobject_put(&dev->dev.kobj); + } +} + +/* Convert net_device_stats to rtnl_link_stats64. They have the same + * fields in the same order, with only the type differing. + */ +static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, + const struct net_device_stats *netdev_stats) +{ +#if BITS_PER_LONG == 64 + BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats)); + memcpy(stats64, netdev_stats, sizeof(*stats64)); +#else + size_t i, n = sizeof(*stats64) / sizeof(u64); + const unsigned long *src = (const unsigned long *)netdev_stats; + u64 *dst = (u64 *)stats64; + + BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) != + sizeof(*stats64) / sizeof(u64)); + for (i = 0; i < n; i++) + dst[i] = src[i]; +#endif +} + +/** + * dev_get_stats - get network device statistics + * @dev: device to get statistics from + * @storage: place to store stats + * + * Get network statistics from device. Return @storage. + * The device driver may provide its own method by setting + * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; + * otherwise the internal statistics structure is used. + */ +struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, + struct rtnl_link_stats64 *storage) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_get_stats64) { + memset(storage, 0, sizeof(*storage)); + ops->ndo_get_stats64(dev, storage); + } else if (ops->ndo_get_stats) { + netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); + } else { + netdev_stats_to_stats64(storage, &dev->stats); + } + storage->rx_dropped += atomic_long_read(&dev->rx_dropped); + return storage; +} +EXPORT_SYMBOL(dev_get_stats); + +struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) +{ + struct netdev_queue *queue = dev_ingress_queue(dev); + +#ifdef CONFIG_NET_CLS_ACT + if (queue) + return queue; + queue = kzalloc(sizeof(*queue), GFP_KERNEL); + if (!queue) + return NULL; + netdev_init_one_queue(dev, queue, NULL); + queue->qdisc = &noop_qdisc; + queue->qdisc_sleeping = &noop_qdisc; + rcu_assign_pointer(dev->ingress_queue, queue); +#endif + return queue; +} + +/** + * alloc_netdev_mqs - allocate network device + * @sizeof_priv: size of private data to allocate space for + * @name: device name format string + * @setup: callback to initialize device + * @txqs: the number of TX subqueues to allocate + * @rxqs: the number of RX subqueues to allocate + * + * Allocates a struct net_device with private data area for driver use + * and performs basic initialization. Also allocates subquue structs + * for each queue on the device. 
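+ *
+ * Illustrative call only (the private struct is a placeholder): an
+ * Ethernet driver wanting four TX and four RX queues could use
+ *
+ *	dev = alloc_netdev_mqs(sizeof(struct my_priv), "eth%d",
+ *			       ether_setup, 4, 4);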
+ */ +struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, + void (*setup)(struct net_device *), + unsigned int txqs, unsigned int rxqs) +{ + struct net_device *dev; + size_t alloc_size; + struct net_device *p; + + BUG_ON(strlen(name) >= sizeof(dev->name)); + + if (txqs < 1) { + pr_err("alloc_netdev: Unable to allocate device " + "with zero queues.\n"); + return NULL; + } + +#ifdef CONFIG_RPS + if (rxqs < 1) { + pr_err("alloc_netdev: Unable to allocate device " + "with zero RX queues.\n"); + return NULL; + } +#endif + + alloc_size = sizeof(struct net_device); + if (sizeof_priv) { + /* ensure 32-byte alignment of private area */ + alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); + alloc_size += sizeof_priv; + } + /* ensure 32-byte alignment of whole construct */ + alloc_size += NETDEV_ALIGN - 1; + + p = kzalloc(alloc_size, GFP_KERNEL); + if (!p) { + printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n"); + return NULL; + } + + dev = PTR_ALIGN(p, NETDEV_ALIGN); + dev->padded = (char *)dev - (char *)p; + + dev->pcpu_refcnt = alloc_percpu(int); + if (!dev->pcpu_refcnt) + goto free_p; + + if (dev_addr_init(dev)) + goto free_pcpu; + + dev_mc_init(dev); + dev_uc_init(dev); + + dev_net_set(dev, &init_net); + + dev->gso_max_size = GSO_MAX_SIZE; + + INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list); + dev->ethtool_ntuple_list.count = 0; + INIT_LIST_HEAD(&dev->napi_list); + INIT_LIST_HEAD(&dev->unreg_list); + INIT_LIST_HEAD(&dev->link_watch_list); + dev->priv_flags = IFF_XMIT_DST_RELEASE; + setup(dev); + + dev->num_tx_queues = txqs; + dev->real_num_tx_queues = txqs; + if (netif_alloc_netdev_queues(dev)) + goto free_all; + +#ifdef CONFIG_RPS + dev->num_rx_queues = rxqs; + dev->real_num_rx_queues = rxqs; + if (netif_alloc_rx_queues(dev)) + goto free_all; +#endif + + strcpy(dev->name, name); + dev->group = INIT_NETDEV_GROUP; + return dev; + +free_all: + free_netdev(dev); + return NULL; + +free_pcpu: + free_percpu(dev->pcpu_refcnt); + kfree(dev->_tx); +#ifdef CONFIG_RPS + kfree(dev->_rx); +#endif + +free_p: + kfree(p); + return NULL; +} +EXPORT_SYMBOL(alloc_netdev_mqs); + +/** + * free_netdev - free network device + * @dev: device + * + * This function does the last stage of destroying an allocated device + * interface. The reference to the device object is released. + * If this is the last reference then it will be freed. + */ +void free_netdev(struct net_device *dev) +{ + struct napi_struct *p, *n; + + release_net(dev_net(dev)); + + kfree(dev->_tx); +#ifdef CONFIG_RPS + kfree(dev->_rx); +#endif + + kfree(rcu_dereference_raw(dev->ingress_queue)); + + /* Flush device addresses */ + dev_addr_flush(dev); + + /* Clear ethtool n-tuple list */ + ethtool_ntuple_flush(dev); + + list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) + netif_napi_del(p); + + free_percpu(dev->pcpu_refcnt); + dev->pcpu_refcnt = NULL; + + /* Compatibility with error handling in drivers */ + if (dev->reg_state == NETREG_UNINITIALIZED) { + kfree((char *)dev - dev->padded); + return; + } + + BUG_ON(dev->reg_state != NETREG_UNREGISTERED); + dev->reg_state = NETREG_RELEASED; + + /* will free via device release */ + put_device(&dev->dev); +} +EXPORT_SYMBOL(free_netdev); + +/** + * synchronize_net - Synchronize with packet receive processing + * + * Wait for packets currently being received to be done. + * Does not block later packets from starting. 
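+ *
+ * A typical caller first unpublishes the object (unlinks it from a
+ * list or clears the pointer that readers follow), then calls
+ * synchronize_net(), and only then frees it, so that receive-path
+ * readers running under rcu_read_lock() have finished.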
+ */ +void synchronize_net(void) +{ + might_sleep(); + if (rtnl_is_locked()) + synchronize_rcu_expedited(); + else + synchronize_rcu(); +} +EXPORT_SYMBOL(synchronize_net); + +/** + * unregister_netdevice_queue - remove device from the kernel + * @dev: device + * @head: list + * + * This function shuts down a device interface and removes it + * from the kernel tables. + * If head not NULL, device is queued to be unregistered later. + * + * Callers must hold the rtnl semaphore. You may want + * unregister_netdev() instead of this. + */ + +void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) +{ + ASSERT_RTNL(); + + if (head) { + list_move_tail(&dev->unreg_list, head); + } else { + rollback_registered(dev); + /* Finish processing unregister after unlock */ + net_set_todo(dev); + } +} +EXPORT_SYMBOL(unregister_netdevice_queue); + +/** + * unregister_netdevice_many - unregister many devices + * @head: list of devices + */ +void unregister_netdevice_many(struct list_head *head) +{ + struct net_device *dev; + + if (!list_empty(head)) { + rollback_registered_many(head); + list_for_each_entry(dev, head, unreg_list) + net_set_todo(dev); + } +} +EXPORT_SYMBOL(unregister_netdevice_many); + +/** + * unregister_netdev - remove device from the kernel + * @dev: device + * + * This function shuts down a device interface and removes it + * from the kernel tables. + * + * This is just a wrapper for unregister_netdevice that takes + * the rtnl semaphore. In general you want to use this and not + * unregister_netdevice. + */ +void unregister_netdev(struct net_device *dev) +{ + rtnl_lock(); + unregister_netdevice(dev); + rtnl_unlock(); +} +EXPORT_SYMBOL(unregister_netdev); + +/** + * dev_change_net_namespace - move device to different nethost namespace + * @dev: device + * @net: network namespace + * @pat: If not NULL name pattern to try if the current device name + * is already taken in the destination network namespace. + * + * This function shuts down a device interface and moves it + * to a new network namespace. On success 0 is returned, on + * a failure a netagive errno code is returned. + * + * Callers must hold the rtnl semaphore. + */ + +int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) +{ + int err; + + ASSERT_RTNL(); + + /* Don't allow namespace local devices to be moved. */ + err = -EINVAL; + if (dev->features & NETIF_F_NETNS_LOCAL) + goto out; + + /* Ensure the device has been registrered */ + err = -EINVAL; + if (dev->reg_state != NETREG_REGISTERED) + goto out; + + /* Get out if there is nothing todo */ + err = 0; + if (net_eq(dev_net(dev), net)) + goto out; + + /* Pick the destination device name, and ensure + * we can use it in the destination network namespace. + */ + err = -EEXIST; + if (__dev_get_by_name(net, dev->name)) { + /* We get here if we can't use the current device name */ + if (!pat) + goto out; + if (dev_get_valid_name(dev, pat) < 0) + goto out; + } + + /* + * And now a mini version of register_netdevice unregister_netdevice. + */ + + /* If device is running close it first. */ + dev_close(dev); + + /* And unlink it from device chain */ + err = -ENODEV; + unlist_netdevice(dev); + + synchronize_net(); + + /* Shutdown queueing discipline. */ + dev_shutdown(dev); + + /* Notify protocols, that we are about to destroy + this device. They should clean all the things. + + Note that dev->reg_state stays at NETREG_REGISTERED. 
+ This is wanted because this way 8021q and macvlan know + the device is just moving and can keep their slaves up. + */ + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); + + /* + * Flush the unicast and multicast chains + */ + dev_uc_flush(dev); + dev_mc_flush(dev); + + /* Actually switch the network namespace */ + dev_net_set(dev, net); + + /* If there is an ifindex conflict assign a new one */ + if (__dev_get_by_index(net, dev->ifindex)) { + int iflink = (dev->iflink == dev->ifindex); + dev->ifindex = dev_new_index(net); + if (iflink) + dev->iflink = dev->ifindex; + } + + /* Fixup kobjects */ + err = device_rename(&dev->dev, dev->name); + WARN_ON(err); + + /* Add the device back in the hashes */ + list_netdevice(dev); + + /* Notify protocols, that a new device appeared. */ + call_netdevice_notifiers(NETDEV_REGISTER, dev); + + /* + * Prevent userspace races by waiting until the network + * device is fully setup before sending notifications. + */ + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + + synchronize_net(); + err = 0; +out: + return err; +} +EXPORT_SYMBOL_GPL(dev_change_net_namespace); + +static int dev_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *ocpu) +{ + struct sk_buff **list_skb; + struct sk_buff *skb; + unsigned int cpu, oldcpu = (unsigned long)ocpu; + struct softnet_data *sd, *oldsd; + + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) + return NOTIFY_OK; + + local_irq_disable(); + cpu = smp_processor_id(); + sd = &per_cpu(softnet_data, cpu); + oldsd = &per_cpu(softnet_data, oldcpu); + + /* Find end of our completion_queue. */ + list_skb = &sd->completion_queue; + while (*list_skb) + list_skb = &(*list_skb)->next; + /* Append completion queue from offline CPU. */ + *list_skb = oldsd->completion_queue; + oldsd->completion_queue = NULL; + + /* Append output queue from offline CPU. */ + if (oldsd->output_queue) { + *sd->output_queue_tailp = oldsd->output_queue; + sd->output_queue_tailp = oldsd->output_queue_tailp; + oldsd->output_queue = NULL; + oldsd->output_queue_tailp = &oldsd->output_queue; + } + /* Append NAPI poll list from offline CPU. */ + if (!list_empty(&oldsd->poll_list)) { + list_splice_init(&oldsd->poll_list, &sd->poll_list); + raise_softirq_irqoff(NET_RX_SOFTIRQ); + } + + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_enable(); + + /* Process offline CPU's input_pkt_queue */ + while ((skb = __skb_dequeue(&oldsd->process_queue))) { + netif_rx(skb); + input_queue_head_incr(oldsd); + } + while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { + netif_rx(skb); + input_queue_head_incr(oldsd); + } + + return NOTIFY_OK; +} + + +/** + * netdev_increment_features - increment feature set by one + * @all: current feature set + * @one: new feature set + * @mask: mask feature set + * + * Computes a new feature set after adding a device with feature set + * @one to the master device with current feature set @all. Will not + * enable anything that is off in @mask. Returns the new feature set. + */ +u32 netdev_increment_features(u32 all, u32 one, u32 mask) +{ + if (mask & NETIF_F_GEN_CSUM) + mask |= NETIF_F_ALL_CSUM; + mask |= NETIF_F_VLAN_CHALLENGED; + + all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask; + all &= one | ~NETIF_F_ALL_FOR_ALL; + + /* If device needs checksumming, downgrade to it. */ + if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM)) + all &= ~NETIF_F_NO_CSUM; + + /* If one device supports hw checksumming, set for all. 
*/ + if (all & NETIF_F_GEN_CSUM) + all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + + return all; +} +EXPORT_SYMBOL(netdev_increment_features); + +static struct hlist_head *netdev_create_hash(void) +{ + int i; + struct hlist_head *hash; + + hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); + if (hash != NULL) + for (i = 0; i < NETDEV_HASHENTRIES; i++) + INIT_HLIST_HEAD(&hash[i]); + + return hash; +} + +/* Initialize per network namespace state */ +static int __net_init netdev_init(struct net *net) +{ + INIT_LIST_HEAD(&net->dev_base_head); + + net->dev_name_head = netdev_create_hash(); + if (net->dev_name_head == NULL) + goto err_name; + + net->dev_index_head = netdev_create_hash(); + if (net->dev_index_head == NULL) + goto err_idx; + + return 0; + +err_idx: + kfree(net->dev_name_head); +err_name: + return -ENOMEM; +} + +/** + * netdev_drivername - network driver for the device + * @dev: network device + * + * Determine network driver for device. + */ +const char *netdev_drivername(const struct net_device *dev) +{ + const struct device_driver *driver; + const struct device *parent; + const char *empty = ""; + + parent = dev->dev.parent; + if (!parent) + return empty; + + driver = parent->driver; + if (driver && driver->name) + return driver->name; + return empty; +} + +static int __netdev_printk(const char *level, const struct net_device *dev, + struct va_format *vaf) +{ + int r; + + if (dev && dev->dev.parent) + r = dev_printk(level, dev->dev.parent, "%s: %pV", + netdev_name(dev), vaf); + else if (dev) + r = printk("%s%s: %pV", level, netdev_name(dev), vaf); + else + r = printk("%s(NULL net_device): %pV", level, vaf); + + return r; +} + +int netdev_printk(const char *level, const struct net_device *dev, + const char *format, ...) +{ + struct va_format vaf; + va_list args; + int r; + + va_start(args, format); + + vaf.fmt = format; + vaf.va = &args; + + r = __netdev_printk(level, dev, &vaf); + va_end(args); + + return r; +} +EXPORT_SYMBOL(netdev_printk); + +#define define_netdev_printk_level(func, level) \ +int func(const struct net_device *dev, const char *fmt, ...) \ +{ \ + int r; \ + struct va_format vaf; \ + va_list args; \ + \ + va_start(args, fmt); \ + \ + vaf.fmt = fmt; \ + vaf.va = &args; \ + \ + r = __netdev_printk(level, dev, &vaf); \ + va_end(args); \ + \ + return r; \ +} \ +EXPORT_SYMBOL(func); + +define_netdev_printk_level(netdev_emerg, KERN_EMERG); +define_netdev_printk_level(netdev_alert, KERN_ALERT); +define_netdev_printk_level(netdev_crit, KERN_CRIT); +define_netdev_printk_level(netdev_err, KERN_ERR); +define_netdev_printk_level(netdev_warn, KERN_WARNING); +define_netdev_printk_level(netdev_notice, KERN_NOTICE); +define_netdev_printk_level(netdev_info, KERN_INFO); + +static void __net_exit netdev_exit(struct net *net) +{ + kfree(net->dev_name_head); + kfree(net->dev_index_head); +} + +static struct pernet_operations __net_initdata netdev_net_ops = { + .init = netdev_init, + .exit = netdev_exit, +}; + +static void __net_exit default_device_exit(struct net *net) +{ + struct net_device *dev, *aux; + /* + * Push all migratable network devices back to the + * initial network namespace + */ + rtnl_lock(); + for_each_netdev_safe(net, dev, aux) { + int err; + char fb_name[IFNAMSIZ]; + + /* Ignore unmoveable devices (i.e. 
loopback) */ + if (dev->features & NETIF_F_NETNS_LOCAL) + continue; + + /* Leave virtual devices for the generic cleanup */ + if (dev->rtnl_link_ops) + continue; + + /* Push remaining network devices to init_net */ + snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); + err = dev_change_net_namespace(dev, &init_net, fb_name); + if (err) { + printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n", + __func__, dev->name, err); + BUG(); + } + } + rtnl_unlock(); +} + +static void __net_exit default_device_exit_batch(struct list_head *net_list) +{ + /* At exit all network devices most be removed from a network + * namespace. Do this in the reverse order of registration. + * Do this across as many network namespaces as possible to + * improve batching efficiency. + */ + struct net_device *dev; + struct net *net; + LIST_HEAD(dev_kill_list); + + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) { + for_each_netdev_reverse(net, dev) { + if (dev->rtnl_link_ops) + dev->rtnl_link_ops->dellink(dev, &dev_kill_list); + else + unregister_netdevice_queue(dev, &dev_kill_list); + } + } + unregister_netdevice_many(&dev_kill_list); + list_del(&dev_kill_list); + rtnl_unlock(); +} + +static struct pernet_operations __net_initdata default_device_ops = { + .exit = default_device_exit, + .exit_batch = default_device_exit_batch, +}; + +/* + * Initialize the DEV module. At boot time this walks the device list and + * unhooks any devices that fail to initialise (normally hardware not + * present) and leaves us with a valid list of present and active devices. + * + */ + +/* + * This is called single threaded during boot, so no need + * to take the rtnl semaphore. + */ +static int __init net_dev_init(void) +{ + int i, rc = -ENOMEM; + + BUG_ON(!dev_boot_phase); + + if (dev_proc_init()) + goto out; + + if (netdev_kobject_init()) + goto out; + + INIT_LIST_HEAD(&ptype_all); + for (i = 0; i < PTYPE_HASH_SIZE; i++) + INIT_LIST_HEAD(&ptype_base[i]); + + if (register_pernet_subsys(&netdev_net_ops)) + goto out; + + /* + * Initialise the packet receive queues. + */ + + for_each_possible_cpu(i) { + struct softnet_data *sd = &per_cpu(softnet_data, i); + + memset(sd, 0, sizeof(*sd)); + skb_queue_head_init(&sd->input_pkt_queue); + skb_queue_head_init(&sd->process_queue); + sd->completion_queue = NULL; + INIT_LIST_HEAD(&sd->poll_list); + sd->output_queue = NULL; + sd->output_queue_tailp = &sd->output_queue; +#ifdef CONFIG_RPS + sd->csd.func = rps_trigger_softirq; + sd->csd.info = sd; + sd->csd.flags = 0; + sd->cpu = i; +#endif + + sd->backlog.poll = process_backlog; + sd->backlog.weight = weight_p; + sd->backlog.gro_list = NULL; + sd->backlog.gro_count = 0; + } + + dev_boot_phase = 0; + + /* The loopback device is special if any other network devices + * is present in a network namespace the loopback device must + * be present. Since we now dynamically allocate and free the + * loopback device ensure this invariant is maintained by + * keeping the loopback device as the first device on the + * list of network devices. Ensuring the loopback devices + * is the first device that appears and the last network device + * that disappears. 
+ */ + if (register_pernet_device(&loopback_net_ops)) + goto out; + + if (register_pernet_device(&default_device_ops)) + goto out; + + open_softirq(NET_TX_SOFTIRQ, net_tx_action); + open_softirq(NET_RX_SOFTIRQ, net_rx_action); + + hotcpu_notifier(dev_cpu_callback, 0); + dst_init(); + dev_mcast_init(); + rc = 0; +out: + return rc; +} + +subsys_initcall(net_dev_init); + +static int __init initialize_hashrnd(void) +{ + get_random_bytes(&hashrnd, sizeof(hashrnd)); + return 0; +} + +late_initcall_sync(initialize_hashrnd); + diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c new file mode 100644 index 00000000..e2e66939 --- /dev/null +++ b/net/core/dev_addr_lists.c @@ -0,0 +1,733 @@ +/* + * net/core/dev_addr_lists.c - Functions for handling net device lists + * Copyright (c) 2010 Jiri Pirko + * + * This file contains functions for working with unicast, multicast and device + * addresses lists. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include + +/* + * General list handling functions + */ + +static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, + unsigned char *addr, int addr_len, + unsigned char addr_type, bool global) +{ + struct netdev_hw_addr *ha; + int alloc_size; + + if (addr_len > MAX_ADDR_LEN) + return -EINVAL; + + list_for_each_entry(ha, &list->list, list) { + if (!memcmp(ha->addr, addr, addr_len) && + ha->type == addr_type) { + if (global) { + /* check if addr is already used as global */ + if (ha->global_use) + return 0; + else + ha->global_use = true; + } + ha->refcount++; + return 0; + } + } + + + alloc_size = sizeof(*ha); + if (alloc_size < L1_CACHE_BYTES) + alloc_size = L1_CACHE_BYTES; + ha = kmalloc(alloc_size, GFP_ATOMIC); + if (!ha) + return -ENOMEM; + memcpy(ha->addr, addr, addr_len); + ha->type = addr_type; + ha->refcount = 1; + ha->global_use = global; + ha->synced = false; + list_add_tail_rcu(&ha->list, &list->list); + list->count++; + return 0; +} + +static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr, + int addr_len, unsigned char addr_type) +{ + return __hw_addr_add_ex(list, addr, addr_len, addr_type, false); +} + +static int __hw_addr_del_ex(struct netdev_hw_addr_list *list, + unsigned char *addr, int addr_len, + unsigned char addr_type, bool global) +{ + struct netdev_hw_addr *ha; + + list_for_each_entry(ha, &list->list, list) { + if (!memcmp(ha->addr, addr, addr_len) && + (ha->type == addr_type || !addr_type)) { + if (global) { + if (!ha->global_use) + break; + else + ha->global_use = false; + } + if (--ha->refcount) + return 0; + list_del_rcu(&ha->list); + kfree_rcu(ha, rcu_head); + list->count--; + return 0; + } + } + return -ENOENT; +} + +static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr, + int addr_len, unsigned char addr_type) +{ + return __hw_addr_del_ex(list, addr, addr_len, addr_type, false); +} + +int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len, unsigned char addr_type) +{ + int err; + struct netdev_hw_addr *ha, *ha2; + unsigned char type; + + list_for_each_entry(ha, &from_list->list, list) { + type = addr_type ? 
addr_type : ha->type; + err = __hw_addr_add(to_list, ha->addr, addr_len, type); + if (err) + goto unroll; + } + return 0; + +unroll: + list_for_each_entry(ha2, &from_list->list, list) { + if (ha2 == ha) + break; + type = addr_type ? addr_type : ha2->type; + __hw_addr_del(to_list, ha2->addr, addr_len, type); + } + return err; +} +EXPORT_SYMBOL(__hw_addr_add_multiple); + +void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len, unsigned char addr_type) +{ + struct netdev_hw_addr *ha; + unsigned char type; + + list_for_each_entry(ha, &from_list->list, list) { + type = addr_type ? addr_type : ha->type; + __hw_addr_del(to_list, ha->addr, addr_len, type); + } +} +EXPORT_SYMBOL(__hw_addr_del_multiple); + +int __hw_addr_sync(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len) +{ + int err = 0; + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &from_list->list, list) { + if (!ha->synced) { + err = __hw_addr_add(to_list, ha->addr, + addr_len, ha->type); + if (err) + break; + ha->synced = true; + ha->refcount++; + } else if (ha->refcount == 1) { + __hw_addr_del(to_list, ha->addr, addr_len, ha->type); + __hw_addr_del(from_list, ha->addr, addr_len, ha->type); + } + } + return err; +} +EXPORT_SYMBOL(__hw_addr_sync); + +void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &from_list->list, list) { + if (ha->synced) { + __hw_addr_del(to_list, ha->addr, + addr_len, ha->type); + ha->synced = false; + __hw_addr_del(from_list, ha->addr, + addr_len, ha->type); + } + } +} +EXPORT_SYMBOL(__hw_addr_unsync); + +void __hw_addr_flush(struct netdev_hw_addr_list *list) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &list->list, list) { + list_del_rcu(&ha->list); + kfree_rcu(ha, rcu_head); + } + list->count = 0; +} +EXPORT_SYMBOL(__hw_addr_flush); + +void __hw_addr_init(struct netdev_hw_addr_list *list) +{ + INIT_LIST_HEAD(&list->list); + list->count = 0; +} +EXPORT_SYMBOL(__hw_addr_init); + +/* + * Device addresses handling functions + */ + +/** + * dev_addr_flush - Flush device address list + * @dev: device + * + * Flush device address list and reset ->dev_addr. + * + * The caller must hold the rtnl_mutex. + */ +void dev_addr_flush(struct net_device *dev) +{ + /* rtnl_mutex must be held here */ + + __hw_addr_flush(&dev->dev_addrs); + dev->dev_addr = NULL; +} +EXPORT_SYMBOL(dev_addr_flush); + +/** + * dev_addr_init - Init device address list + * @dev: device + * + * Init device address list and create the first element, + * used by ->dev_addr. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_init(struct net_device *dev) +{ + unsigned char addr[MAX_ADDR_LEN]; + struct netdev_hw_addr *ha; + int err; + + /* rtnl_mutex must be held here */ + + __hw_addr_init(&dev->dev_addrs); + memset(addr, 0, sizeof(addr)); + err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr), + NETDEV_HW_ADDR_T_LAN); + if (!err) { + /* + * Get the first (previously created) address from the list + * and set dev_addr pointer to this location. 
+ */ + ha = list_first_entry(&dev->dev_addrs.list, + struct netdev_hw_addr, list); + dev->dev_addr = ha->addr; + } + return err; +} +EXPORT_SYMBOL(dev_addr_init); + +/** + * dev_addr_add - Add a device address + * @dev: device + * @addr: address to add + * @addr_type: address type + * + * Add a device address to the device or increase the reference count if + * it already exists. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_add(struct net_device *dev, unsigned char *addr, + unsigned char addr_type) +{ + int err; + + ASSERT_RTNL(); + + err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; +} +EXPORT_SYMBOL(dev_addr_add); + +/** + * dev_addr_del - Release a device address. + * @dev: device + * @addr: address to delete + * @addr_type: address type + * + * Release reference to a device address and remove it from the device + * if the reference count drops to zero. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_del(struct net_device *dev, unsigned char *addr, + unsigned char addr_type) +{ + int err; + struct netdev_hw_addr *ha; + + ASSERT_RTNL(); + + /* + * We can not remove the first address from the list because + * dev->dev_addr points to that. + */ + ha = list_first_entry(&dev->dev_addrs.list, + struct netdev_hw_addr, list); + if (ha->addr == dev->dev_addr && ha->refcount == 1) + return -ENOENT; + + err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len, + addr_type); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; +} +EXPORT_SYMBOL(dev_addr_del); + +/** + * dev_addr_add_multiple - Add device addresses from another device + * @to_dev: device to which addresses will be added + * @from_dev: device from which addresses will be added + * @addr_type: address type - 0 means type will be used from from_dev + * + * Add device addresses of the one device to another. + ** + * The caller must hold the rtnl_mutex. + */ +int dev_addr_add_multiple(struct net_device *to_dev, + struct net_device *from_dev, + unsigned char addr_type) +{ + int err; + + ASSERT_RTNL(); + + if (from_dev->addr_len != to_dev->addr_len) + return -EINVAL; + err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, + to_dev->addr_len, addr_type); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); + return err; +} +EXPORT_SYMBOL(dev_addr_add_multiple); + +/** + * dev_addr_del_multiple - Delete device addresses by another device + * @to_dev: device where the addresses will be deleted + * @from_dev: device supplying the addresses to be deleted + * @addr_type: address type - 0 means type will be used from from_dev + * + * Deletes addresses in to device by the list of addresses in from device. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_del_multiple(struct net_device *to_dev, + struct net_device *from_dev, + unsigned char addr_type) +{ + ASSERT_RTNL(); + + if (from_dev->addr_len != to_dev->addr_len) + return -EINVAL; + __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, + to_dev->addr_len, addr_type); + call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); + return 0; +} +EXPORT_SYMBOL(dev_addr_del_multiple); + +/* + * Unicast list handling functions + */ + +/** + * dev_uc_add - Add a secondary unicast address + * @dev: device + * @addr: address to add + * + * Add a secondary unicast address to the device or increase + * the reference count if it already exists. 
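+ *
+ * For example (illustrative only; a locally administered address):
+ *
+ *	unsigned char extra[ETH_ALEN] = { 0x02, 0, 0, 0, 0, 0x01 };
+ *	int err = dev_uc_add(dev, extra);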
+ */ +int dev_uc_add(struct net_device *dev, unsigned char *addr) +{ + int err; + + netif_addr_lock_bh(dev); + err = __hw_addr_add(&dev->uc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_UNICAST); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} +EXPORT_SYMBOL(dev_uc_add); + +/** + * dev_uc_del - Release secondary unicast address. + * @dev: device + * @addr: address to delete + * + * Release reference to a secondary unicast address and remove it + * from the device if the reference count drops to zero. + */ +int dev_uc_del(struct net_device *dev, unsigned char *addr) +{ + int err; + + netif_addr_lock_bh(dev); + err = __hw_addr_del(&dev->uc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_UNICAST); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} +EXPORT_SYMBOL(dev_uc_del); + +/** + * dev_uc_sync - Synchronize device's unicast list to another device + * @to: destination device + * @from: source device + * + * Add newly added addresses to the destination device and release + * addresses that have no users left. The source device must be + * locked by netif_tx_lock_bh. + * + * This function is intended to be called from the dev->set_rx_mode + * function of layered software devices. + */ +int dev_uc_sync(struct net_device *to, struct net_device *from) +{ + int err = 0; + + if (to->addr_len != from->addr_len) + return -EINVAL; + + netif_addr_lock_bh(to); + err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); + if (!err) + __dev_set_rx_mode(to); + netif_addr_unlock_bh(to); + return err; +} +EXPORT_SYMBOL(dev_uc_sync); + +/** + * dev_uc_unsync - Remove synchronized addresses from the destination device + * @to: destination device + * @from: source device + * + * Remove all addresses that were added to the destination device by + * dev_uc_sync(). This function is intended to be called from the + * dev->stop function of layered software devices. + */ +void dev_uc_unsync(struct net_device *to, struct net_device *from) +{ + if (to->addr_len != from->addr_len) + return; + + netif_addr_lock_bh(from); + netif_addr_lock(to); + __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); + __dev_set_rx_mode(to); + netif_addr_unlock(to); + netif_addr_unlock_bh(from); +} +EXPORT_SYMBOL(dev_uc_unsync); + +/** + * dev_uc_flush - Flush unicast addresses + * @dev: device + * + * Flush unicast addresses. + */ +void dev_uc_flush(struct net_device *dev) +{ + netif_addr_lock_bh(dev); + __hw_addr_flush(&dev->uc); + netif_addr_unlock_bh(dev); +} +EXPORT_SYMBOL(dev_uc_flush); + +/** + * dev_uc_flush - Init unicast address list + * @dev: device + * + * Init unicast address list. + */ +void dev_uc_init(struct net_device *dev) +{ + __hw_addr_init(&dev->uc); +} +EXPORT_SYMBOL(dev_uc_init); + +/* + * Multicast list handling functions + */ + +static int __dev_mc_add(struct net_device *dev, unsigned char *addr, + bool global) +{ + int err; + + netif_addr_lock_bh(dev); + err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_MULTICAST, global); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} +/** + * dev_mc_add - Add a multicast address + * @dev: device + * @addr: address to add + * + * Add a multicast address to the device or increase + * the reference count if it already exists. 
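+ *
+ * For example (illustrative only), joining the all-hosts group
+ * 224.0.0.1 maps to the MAC address 01:00:5e:00:00:01:
+ *
+ *	unsigned char allhosts[ETH_ALEN] =
+ *		{ 0x01, 0x00, 0x5e, 0x00, 0x00, 0x01 };
+ *	int err = dev_mc_add(dev, allhosts);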
+ */ +int dev_mc_add(struct net_device *dev, unsigned char *addr) +{ + return __dev_mc_add(dev, addr, false); +} +EXPORT_SYMBOL(dev_mc_add); + +/** + * dev_mc_add_global - Add a global multicast address + * @dev: device + * @addr: address to add + * + * Add a global multicast address to the device. + */ +int dev_mc_add_global(struct net_device *dev, unsigned char *addr) +{ + return __dev_mc_add(dev, addr, true); +} +EXPORT_SYMBOL(dev_mc_add_global); + +static int __dev_mc_del(struct net_device *dev, unsigned char *addr, + bool global) +{ + int err; + + netif_addr_lock_bh(dev); + err = __hw_addr_del_ex(&dev->mc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_MULTICAST, global); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} + +/** + * dev_mc_del - Delete a multicast address. + * @dev: device + * @addr: address to delete + * + * Release reference to a multicast address and remove it + * from the device if the reference count drops to zero. + */ +int dev_mc_del(struct net_device *dev, unsigned char *addr) +{ + return __dev_mc_del(dev, addr, false); +} +EXPORT_SYMBOL(dev_mc_del); + +/** + * dev_mc_del_global - Delete a global multicast address. + * @dev: device + * @addr: address to delete + * + * Release reference to a multicast address and remove it + * from the device if the reference count drops to zero. + */ +int dev_mc_del_global(struct net_device *dev, unsigned char *addr) +{ + return __dev_mc_del(dev, addr, true); +} +EXPORT_SYMBOL(dev_mc_del_global); + +/** + * dev_mc_sync - Synchronize device's unicast list to another device + * @to: destination device + * @from: source device + * + * Add newly added addresses to the destination device and release + * addresses that have no users left. The source device must be + * locked by netif_tx_lock_bh. + * + * This function is intended to be called from the dev->set_multicast_list + * or dev->set_rx_mode function of layered software devices. + */ +int dev_mc_sync(struct net_device *to, struct net_device *from) +{ + int err = 0; + + if (to->addr_len != from->addr_len) + return -EINVAL; + + netif_addr_lock_bh(to); + err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len); + if (!err) + __dev_set_rx_mode(to); + netif_addr_unlock_bh(to); + return err; +} +EXPORT_SYMBOL(dev_mc_sync); + +/** + * dev_mc_unsync - Remove synchronized addresses from the destination device + * @to: destination device + * @from: source device + * + * Remove all addresses that were added to the destination device by + * dev_mc_sync(). This function is intended to be called from the + * dev->stop function of layered software devices. + */ +void dev_mc_unsync(struct net_device *to, struct net_device *from) +{ + if (to->addr_len != from->addr_len) + return; + + netif_addr_lock_bh(from); + netif_addr_lock(to); + __hw_addr_unsync(&to->mc, &from->mc, to->addr_len); + __dev_set_rx_mode(to); + netif_addr_unlock(to); + netif_addr_unlock_bh(from); +} +EXPORT_SYMBOL(dev_mc_unsync); + +/** + * dev_mc_flush - Flush multicast addresses + * @dev: device + * + * Flush multicast addresses. + */ +void dev_mc_flush(struct net_device *dev) +{ + netif_addr_lock_bh(dev); + __hw_addr_flush(&dev->mc); + netif_addr_unlock_bh(dev); +} +EXPORT_SYMBOL(dev_mc_flush); + +/** + * dev_mc_flush - Init multicast address list + * @dev: device + * + * Init multicast address list. 
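+ *
+ * Called from alloc_netdev_mqs() when the device is allocated, so
+ * drivers normally do not need to call it themselves.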
+ */ +void dev_mc_init(struct net_device *dev) +{ + __hw_addr_init(&dev->mc); +} +EXPORT_SYMBOL(dev_mc_init); + +#ifdef CONFIG_PROC_FS +#include + +static int dev_mc_seq_show(struct seq_file *seq, void *v) +{ + struct netdev_hw_addr *ha; + struct net_device *dev = v; + + if (v == SEQ_START_TOKEN) + return 0; + + netif_addr_lock_bh(dev); + netdev_for_each_mc_addr(ha, dev) { + int i; + + seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex, + dev->name, ha->refcount, ha->global_use); + + for (i = 0; i < dev->addr_len; i++) + seq_printf(seq, "%02x", ha->addr[i]); + + seq_putc(seq, '\n'); + } + netif_addr_unlock_bh(dev); + return 0; +} + +static const struct seq_operations dev_mc_seq_ops = { + .start = dev_seq_start, + .next = dev_seq_next, + .stop = dev_seq_stop, + .show = dev_mc_seq_show, +}; + +static int dev_mc_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &dev_mc_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations dev_mc_seq_fops = { + .owner = THIS_MODULE, + .open = dev_mc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +#endif + +static int __net_init dev_mc_net_init(struct net *net) +{ + if (!proc_net_fops_create(net, "dev_mcast", 0, &dev_mc_seq_fops)) + return -ENOMEM; + return 0; +} + +static void __net_exit dev_mc_net_exit(struct net *net) +{ + proc_net_remove(net, "dev_mcast"); +} + +static struct pernet_operations __net_initdata dev_mc_net_ops = { + .init = dev_mc_net_init, + .exit = dev_mc_net_exit, +}; + +void __init dev_mcast_init(void) +{ + register_pernet_subsys(&dev_mc_net_ops); +} + diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c new file mode 100644 index 00000000..7f36b38e --- /dev/null +++ b/net/core/drop_monitor.c @@ -0,0 +1,385 @@ +/* + * Monitoring code for network dropped packet alerts + * + * Copyright (C) 2009 Neil Horman + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#define TRACE_ON 1 +#define TRACE_OFF 0 + +static void send_dm_alert(struct work_struct *unused); + + +/* + * Globals, our netlink socket pointer + * and the work handle that will send up + * netlink alerts + */ +static int trace_state = TRACE_OFF; +static DEFINE_SPINLOCK(trace_state_lock); + +struct per_cpu_dm_data { + struct work_struct dm_alert_work; + struct sk_buff *skb; + atomic_t dm_hit_count; + struct timer_list send_timer; +}; + +struct dm_hw_stat_delta { + struct net_device *dev; + unsigned long last_rx; + struct list_head list; + struct rcu_head rcu; + unsigned long last_drop_val; +}; + +static struct genl_family net_drop_monitor_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = "NET_DM", + .version = 2, + .maxattr = NET_DM_CMD_MAX, +}; + +static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); + +static int dm_hit_limit = 64; +static int dm_delay = 1; +static unsigned long dm_hw_check_delta = 2*HZ; +static LIST_HEAD(hw_stats_list); + +static void reset_per_cpu_data(struct per_cpu_dm_data *data) +{ + size_t al; + struct net_dm_alert_msg *msg; + struct nlattr *nla; + + al = sizeof(struct net_dm_alert_msg); + al += dm_hit_limit * sizeof(struct net_dm_drop_point); + al += sizeof(struct nlattr); + + data->skb = genlmsg_new(al, GFP_KERNEL); + genlmsg_put(data->skb, 0, 0, &net_drop_monitor_family, + 0, NET_DM_CMD_ALERT); + nla = nla_reserve(data->skb, 
NLA_UNSPEC, sizeof(struct net_dm_alert_msg)); + msg = nla_data(nla); + memset(msg, 0, al); + atomic_set(&data->dm_hit_count, dm_hit_limit); +} + +static void send_dm_alert(struct work_struct *unused) +{ + struct sk_buff *skb; + struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data); + + /* + * Grab the skb we're about to send + */ + skb = data->skb; + + /* + * Replace it with a new one + */ + reset_per_cpu_data(data); + + /* + * Ship it! + */ + genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL); + +} + +/* + * This is the timer function to delay the sending of an alert + * in the event that more drops will arrive during the + * hysteresis period. Note that it operates under the timer interrupt + * so we don't need to disable preemption here + */ +static void sched_send_work(unsigned long unused) +{ + struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data); + + schedule_work(&data->dm_alert_work); +} + +static void trace_drop_common(struct sk_buff *skb, void *location) +{ + struct net_dm_alert_msg *msg; + struct nlmsghdr *nlh; + struct nlattr *nla; + int i; + struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data); + + + if (!atomic_add_unless(&data->dm_hit_count, -1, 0)) { + /* + * we're already at zero, discard this hit + */ + goto out; + } + + nlh = (struct nlmsghdr *)data->skb->data; + nla = genlmsg_data(nlmsg_data(nlh)); + msg = nla_data(nla); + for (i = 0; i < msg->entries; i++) { + if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) { + msg->points[i].count++; + goto out; + } + } + + /* + * We need to create a new entry + */ + __nla_reserve_nohdr(data->skb, sizeof(struct net_dm_drop_point)); + nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point)); + memcpy(msg->points[msg->entries].pc, &location, sizeof(void *)); + msg->points[msg->entries].count = 1; + msg->entries++; + + if (!timer_pending(&data->send_timer)) { + data->send_timer.expires = jiffies + dm_delay * HZ; + add_timer_on(&data->send_timer, smp_processor_id()); + } + +out: + return; +} + +static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location) +{ + trace_drop_common(skb, location); +} + +static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi) +{ + struct dm_hw_stat_delta *new_stat; + + /* + * Don't check napi structures with no associated device + */ + if (!napi->dev) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + /* + * only add a note to our monitor buffer if: + * 1) this is the dev we received on + * 2) its after the last_rx delta + * 3) our rx_dropped count has gone up + */ + if ((new_stat->dev == napi->dev) && + (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) && + (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { + trace_drop_common(NULL, NULL); + new_stat->last_drop_val = napi->dev->stats.rx_dropped; + new_stat->last_rx = jiffies; + break; + } + } + rcu_read_unlock(); +} + +static int set_all_monitor_traces(int state) +{ + int rc = 0; + struct dm_hw_stat_delta *new_stat = NULL; + struct dm_hw_stat_delta *temp; + + spin_lock(&trace_state_lock); + + if (state == trace_state) { + rc = -EAGAIN; + goto out_unlock; + } + + switch (state) { + case TRACE_ON: + rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL); + rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL); + break; + case TRACE_OFF: + rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL); + rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL); + + tracepoint_synchronize_unregister(); + + /* + * Clean the 
device list + */ + list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) { + if (new_stat->dev == NULL) { + list_del_rcu(&new_stat->list); + kfree_rcu(new_stat, rcu); + } + } + break; + default: + rc = 1; + break; + } + + if (!rc) + trace_state = state; + else + rc = -EINPROGRESS; + +out_unlock: + spin_unlock(&trace_state_lock); + + return rc; +} + + +static int net_dm_cmd_config(struct sk_buff *skb, + struct genl_info *info) +{ + return -ENOTSUPP; +} + +static int net_dm_cmd_trace(struct sk_buff *skb, + struct genl_info *info) +{ + switch (info->genlhdr->cmd) { + case NET_DM_CMD_START: + return set_all_monitor_traces(TRACE_ON); + break; + case NET_DM_CMD_STOP: + return set_all_monitor_traces(TRACE_OFF); + break; + } + + return -ENOTSUPP; +} + +static int dropmon_net_event(struct notifier_block *ev_block, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct dm_hw_stat_delta *new_stat = NULL; + struct dm_hw_stat_delta *tmp; + + switch (event) { + case NETDEV_REGISTER: + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); + + if (!new_stat) + goto out; + + new_stat->dev = dev; + new_stat->last_rx = jiffies; + spin_lock(&trace_state_lock); + list_add_rcu(&new_stat->list, &hw_stats_list); + spin_unlock(&trace_state_lock); + break; + case NETDEV_UNREGISTER: + spin_lock(&trace_state_lock); + list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) { + if (new_stat->dev == dev) { + new_stat->dev = NULL; + if (trace_state == TRACE_OFF) { + list_del_rcu(&new_stat->list); + kfree_rcu(new_stat, rcu); + break; + } + } + } + spin_unlock(&trace_state_lock); + break; + } +out: + return NOTIFY_DONE; +} + +static struct genl_ops dropmon_ops[] = { + { + .cmd = NET_DM_CMD_CONFIG, + .doit = net_dm_cmd_config, + }, + { + .cmd = NET_DM_CMD_START, + .doit = net_dm_cmd_trace, + }, + { + .cmd = NET_DM_CMD_STOP, + .doit = net_dm_cmd_trace, + }, +}; + +static struct notifier_block dropmon_net_notifier = { + .notifier_call = dropmon_net_event +}; + +static int __init init_net_drop_monitor(void) +{ + struct per_cpu_dm_data *data; + int cpu, rc; + + printk(KERN_INFO "Initializing network drop monitor service\n"); + + if (sizeof(void *) > 8) { + printk(KERN_ERR "Unable to store program counters on this arch, Drop monitor failed\n"); + return -ENOSPC; + } + + rc = genl_register_family_with_ops(&net_drop_monitor_family, + dropmon_ops, + ARRAY_SIZE(dropmon_ops)); + if (rc) { + printk(KERN_ERR "Could not create drop monitor netlink family\n"); + return rc; + } + + rc = register_netdevice_notifier(&dropmon_net_notifier); + if (rc < 0) { + printk(KERN_CRIT "Failed to register netdevice notifier\n"); + goto out_unreg; + } + + rc = 0; + + for_each_present_cpu(cpu) { + data = &per_cpu(dm_cpu_data, cpu); + reset_per_cpu_data(data); + INIT_WORK(&data->dm_alert_work, send_dm_alert); + init_timer(&data->send_timer); + data->send_timer.data = cpu; + data->send_timer.function = sched_send_work; + } + + goto out; + +out_unreg: + genl_unregister_family(&net_drop_monitor_family); +out: + return rc; +} + +late_initcall(init_net_drop_monitor); diff --git a/net/core/dst.c b/net/core/dst.c new file mode 100644 index 00000000..8246d47a --- /dev/null +++ b/net/core/dst.c @@ -0,0 +1,426 @@ +/* + * net/core/dst.c Protocol independent destination cache. 
+ * + * Authors: Alexey Kuznetsov, + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * Theory of operations: + * 1) We use a list, protected by a spinlock, to add + * new entries from both BH and non-BH context. + * 2) In order to keep the spinlock hold time short, + * we use a second list where long lived entries are + * stored; these are handled by the garbage collect thread + * fired by a workqueue. + * 3) This list is guarded by a mutex, + * so that the gc_task and dst_dev_event() can be synchronized. + */ + +/* + * We want to keep lock & list close together + * to dirty as few cache lines as possible in __dst_free(). + * As this is not a very strong hint, we don't force an alignment on SMP. + */ +static struct { + spinlock_t lock; + struct dst_entry *list; + unsigned long timer_inc; + unsigned long timer_expires; +} dst_garbage = { + .lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock), + .timer_inc = DST_GC_MAX, +}; +static void dst_gc_task(struct work_struct *work); +static void ___dst_free(struct dst_entry *dst); + +static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task); + +static DEFINE_MUTEX(dst_gc_mutex); +/* + * long lived entries are maintained in this list, guarded by dst_gc_mutex + */ +static struct dst_entry *dst_busy_list; + +static void dst_gc_task(struct work_struct *work) +{ + int delayed = 0; + int work_performed = 0; + unsigned long expires = ~0L; + struct dst_entry *dst, *next, head; + struct dst_entry *last = &head; + + mutex_lock(&dst_gc_mutex); + next = dst_busy_list; + +loop: + while ((dst = next) != NULL) { + next = dst->next; + prefetch(&next->next); + cond_resched(); + if (likely(atomic_read(&dst->__refcnt))) { + last->next = dst; + last = dst; + delayed++; + continue; + } + work_performed++; + + dst = dst_destroy(dst); + if (dst) { + /* NOHASH and still referenced. Unless it is already + * on gc list, invalidate it and add to gc list. + * + * Note: this is temporary. Actually, NOHASH dst's + * must be obsoleted when parent is obsoleted. + * But we do not have state "obsoleted, but + * referenced by parent", so it is right. + */ + if (dst->obsolete > 1) + continue; + + ___dst_free(dst); + dst->next = next; + next = dst; + } + } + + spin_lock_bh(&dst_garbage.lock); + next = dst_garbage.list; + if (next) { + dst_garbage.list = NULL; + spin_unlock_bh(&dst_garbage.lock); + goto loop; + } + last->next = NULL; + dst_busy_list = head.next; + if (!dst_busy_list) + dst_garbage.timer_inc = DST_GC_MAX; + else { + /* + * if we freed less than 1/10 of delayed entries, + * we can sleep longer.
+ */ + if (work_performed <= delayed/10) { + dst_garbage.timer_expires += dst_garbage.timer_inc; + if (dst_garbage.timer_expires > DST_GC_MAX) + dst_garbage.timer_expires = DST_GC_MAX; + dst_garbage.timer_inc += DST_GC_INC; + } else { + dst_garbage.timer_inc = DST_GC_INC; + dst_garbage.timer_expires = DST_GC_MIN; + } + expires = dst_garbage.timer_expires; + /* + * if the next desired timer is more than 4 seconds in the + * future then round the timer to whole seconds + */ + if (expires > 4*HZ) + expires = round_jiffies_relative(expires); + schedule_delayed_work(&dst_gc_work, expires); + } + + spin_unlock_bh(&dst_garbage.lock); + mutex_unlock(&dst_gc_mutex); +} + +int dst_discard(struct sk_buff *skb) +{ + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL(dst_discard); + +const u32 dst_default_metrics[RTAX_MAX]; + +void *dst_alloc(struct dst_ops *ops, struct net_device *dev, + int initial_ref, int initial_obsolete, int flags) +{ + struct dst_entry *dst; + + if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { + if (ops->gc(ops)) + return NULL; + } + dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); + if (!dst) + return NULL; + dst->child = NULL; + dst->dev = dev; + if (dev) + dev_hold(dev); + dst->ops = ops; + dst_init_metrics(dst, dst_default_metrics, true); + dst->expires = 0UL; + dst->path = dst; + RCU_INIT_POINTER(dst->_neighbour, NULL); + dst->hh = NULL; +#ifdef CONFIG_XFRM + dst->xfrm = NULL; +#endif + dst->input = dst_discard; + dst->output = dst_discard; + dst->error = 0; + dst->obsolete = initial_obsolete; + dst->header_len = 0; + dst->trailer_len = 0; +#ifdef CONFIG_IP_ROUTE_CLASSID + dst->tclassid = 0; +#endif + atomic_set(&dst->__refcnt, initial_ref); + dst->__use = 0; + dst->lastuse = jiffies; + dst->flags = flags; + dst->next = NULL; + if (!(flags & DST_NOCOUNT)) + dst_entries_add(ops, 1); + return dst; +} +EXPORT_SYMBOL(dst_alloc); + +static void ___dst_free(struct dst_entry *dst) +{ + /* The first case (dev==NULL) is required, when + protocol module is unloaded. + */ + if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) + dst->input = dst->output = dst_discard; + dst->obsolete = 2; +} + +void __dst_free(struct dst_entry *dst) +{ + spin_lock_bh(&dst_garbage.lock); + ___dst_free(dst); + dst->next = dst_garbage.list; + dst_garbage.list = dst; + if (dst_garbage.timer_inc > DST_GC_INC) { + dst_garbage.timer_inc = DST_GC_INC; + dst_garbage.timer_expires = DST_GC_MIN; + cancel_delayed_work(&dst_gc_work); + schedule_delayed_work(&dst_gc_work, dst_garbage.timer_expires); + } + spin_unlock_bh(&dst_garbage.lock); +} +EXPORT_SYMBOL(__dst_free); + +struct dst_entry *dst_destroy(struct dst_entry * dst) +{ + struct dst_entry *child; + struct neighbour *neigh; + struct hh_cache *hh; + + smp_rmb(); + +again: + neigh = rcu_dereference_protected(dst->_neighbour, 1); + hh = dst->hh; + child = dst->child; + + dst->hh = NULL; + if (hh) + hh_cache_put(hh); + + if (neigh) { + RCU_INIT_POINTER(dst->_neighbour, NULL); + neigh_release(neigh); + } + + if (!(dst->flags & DST_NOCOUNT)) + dst_entries_add(dst->ops, -1); + + if (dst->ops->destroy) + dst->ops->destroy(dst); + if (dst->dev) + dev_put(dst->dev); + kmem_cache_free(dst->ops->kmem_cachep, dst); + + dst = child; + if (dst) { + int nohash = dst->flags & DST_NOHASH; + + if (atomic_dec_and_test(&dst->__refcnt)) { + /* We were real parent of this dst, so kill child. */ + if (nohash) + goto again; + } else { + /* Child is still referenced, return it for freeing. 
*/ + if (nohash) + return dst; + /* Child is still in his hash table */ + } + } + return NULL; +} +EXPORT_SYMBOL(dst_destroy); + +void dst_release(struct dst_entry *dst) +{ + if (dst) { + int newrefcnt; + + newrefcnt = atomic_dec_return(&dst->__refcnt); + WARN_ON(newrefcnt < 0); + if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) { + dst = dst_destroy(dst); + if (dst) + __dst_free(dst); + } + } +} +EXPORT_SYMBOL(dst_release); + +u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) +{ + u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC); + + if (p) { + u32 *old_p = __DST_METRICS_PTR(old); + unsigned long prev, new; + + memcpy(p, old_p, sizeof(u32) * RTAX_MAX); + + new = (unsigned long) p; + prev = cmpxchg(&dst->_metrics, old, new); + + if (prev != old) { + kfree(p); + p = __DST_METRICS_PTR(prev); + if (prev & DST_METRICS_READ_ONLY) + p = NULL; + } + } + return p; +} +EXPORT_SYMBOL(dst_cow_metrics_generic); + +/* Caller asserts that dst_metrics_read_only(dst) is false. */ +void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) +{ + unsigned long prev, new; + + new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY; + prev = cmpxchg(&dst->_metrics, old, new); + if (prev == old) + kfree(__DST_METRICS_PTR(old)); +} +EXPORT_SYMBOL(__dst_destroy_metrics_generic); + +/** + * skb_dst_set_noref - sets skb dst, without a reference + * @skb: buffer + * @dst: dst entry + * + * Sets skb dst, assuming a reference was not taken on dst + * skb_dst_drop() should not dst_release() this dst + */ +void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) +{ + WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + /* If dst not in cache, we must take a reference, because + * dst_release() will destroy dst as soon as its refcount becomes zero + */ + if (unlikely(dst->flags & DST_NOCACHE)) { + dst_hold(dst); + skb_dst_set(skb, dst); + } else { + skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; + } +} +EXPORT_SYMBOL(skb_dst_set_noref); + +/* Dirty hack. We did it in 2.2 (in __dst_free), + * we have _very_ good reasons not to repeat + * this mistake in 2.3, but we have no choice + * now. _It_ _is_ _explicit_ _deliberate_ + * _race_ _condition_. + * + * Commented and originally written by Alexey. 
+ */ +static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, + int unregister) +{ + if (dst->ops->ifdown) + dst->ops->ifdown(dst, dev, unregister); + + if (dev != dst->dev) + return; + + if (!unregister) { + dst->input = dst->output = dst_discard; + } else { + struct neighbour *neigh; + + dst->dev = dev_net(dst->dev)->loopback_dev; + dev_hold(dst->dev); + dev_put(dev); + rcu_read_lock(); + neigh = dst_get_neighbour(dst); + if (neigh && neigh->dev == dev) { + neigh->dev = dst->dev; + dev_hold(dst->dev); + dev_put(dev); + } + rcu_read_unlock(); + } +} + +static int dst_dev_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + struct dst_entry *dst, *last = NULL; + + switch (event) { + case NETDEV_UNREGISTER: + case NETDEV_DOWN: + mutex_lock(&dst_gc_mutex); + for (dst = dst_busy_list; dst; dst = dst->next) { + last = dst; + dst_ifdown(dst, dev, event != NETDEV_DOWN); + } + + spin_lock_bh(&dst_garbage.lock); + dst = dst_garbage.list; + dst_garbage.list = NULL; + spin_unlock_bh(&dst_garbage.lock); + + if (last) + last->next = dst; + else + dst_busy_list = dst; + for (; dst; dst = dst->next) + dst_ifdown(dst, dev, event != NETDEV_DOWN); + mutex_unlock(&dst_gc_mutex); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block dst_dev_notifier = { + .notifier_call = dst_dev_event, + .priority = -10, /* must be called after other network notifiers */ +}; + +void __init dst_init(void) +{ + register_netdevice_notifier(&dst_dev_notifier); +} diff --git a/net/core/ethtool.c b/net/core/ethtool.c new file mode 100644 index 00000000..4fb77049 --- /dev/null +++ b/net/core/ethtool.c @@ -0,0 +1,2166 @@ +/* + * net/core/ethtool.c - Ethtool ioctl handler + * Copyright (c) 2003 Matthew Wilcox + * + * This file is where we call all the ethtool_ops commands to get + * the information ethtool needs. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Some useful ethtool_ops methods that're device independent. + * If we find that all drivers want to do the same thing here, + * we can turn these into dev_() function calls. + */ + +u32 ethtool_op_get_link(struct net_device *dev) +{ + return netif_carrier_ok(dev) ? 
1 : 0; +} +EXPORT_SYMBOL(ethtool_op_get_link); + +u32 ethtool_op_get_tx_csum(struct net_device *dev) +{ + return (dev->features & NETIF_F_ALL_CSUM) != 0; +} +EXPORT_SYMBOL(ethtool_op_get_tx_csum); + +int ethtool_op_set_tx_csum(struct net_device *dev, u32 data) +{ + if (data) + dev->features |= NETIF_F_IP_CSUM; + else + dev->features &= ~NETIF_F_IP_CSUM; + + return 0; +} +EXPORT_SYMBOL(ethtool_op_set_tx_csum); + +int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data) +{ + if (data) + dev->features |= NETIF_F_HW_CSUM; + else + dev->features &= ~NETIF_F_HW_CSUM; + + return 0; +} +EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum); + +int ethtool_op_set_tx_ipv6_csum(struct net_device *dev, u32 data) +{ + if (data) + dev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; + else + dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); + + return 0; +} +EXPORT_SYMBOL(ethtool_op_set_tx_ipv6_csum); + +u32 ethtool_op_get_sg(struct net_device *dev) +{ + return (dev->features & NETIF_F_SG) != 0; +} +EXPORT_SYMBOL(ethtool_op_get_sg); + +int ethtool_op_set_sg(struct net_device *dev, u32 data) +{ + if (data) + dev->features |= NETIF_F_SG; + else + dev->features &= ~NETIF_F_SG; + + return 0; +} +EXPORT_SYMBOL(ethtool_op_set_sg); + +u32 ethtool_op_get_tso(struct net_device *dev) +{ + return (dev->features & NETIF_F_TSO) != 0; +} +EXPORT_SYMBOL(ethtool_op_get_tso); + +int ethtool_op_set_tso(struct net_device *dev, u32 data) +{ + if (data) + dev->features |= NETIF_F_TSO; + else + dev->features &= ~NETIF_F_TSO; + + return 0; +} +EXPORT_SYMBOL(ethtool_op_set_tso); + +u32 ethtool_op_get_ufo(struct net_device *dev) +{ + return (dev->features & NETIF_F_UFO) != 0; +} +EXPORT_SYMBOL(ethtool_op_get_ufo); + +int ethtool_op_set_ufo(struct net_device *dev, u32 data) +{ + if (data) + dev->features |= NETIF_F_UFO; + else + dev->features &= ~NETIF_F_UFO; + return 0; +} +EXPORT_SYMBOL(ethtool_op_set_ufo); + +/* the following flags are the same as their associated + * NETIF_F_xxx values in include/linux/netdevice.h + */ +static const u32 flags_dup_features = + (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | ETH_FLAG_NTUPLE | + ETH_FLAG_RXHASH); + +u32 ethtool_op_get_flags(struct net_device *dev) +{ + /* in the future, this function will probably contain additional + * handling for flags which are not so easily handled + * by a simple masking operation + */ + + return dev->features & flags_dup_features; +} +EXPORT_SYMBOL(ethtool_op_get_flags); + +/* Check whether the device can enable (or disable) the particular feature + * coded in the "data" argument. The "supported" flags describe features that + * can be toggled by the device. If a feature cannot be toggled, its state + * (enabled or disabled) must match the hardcoded device feature state, + * otherwise the flags are marked as invalid.
+ */ +bool ethtool_invalid_flags(struct net_device *dev, u32 data, u32 supported) +{ + u32 features = dev->features & flags_dup_features; + /* "data" can contain only flags_dup_features bits, + * see __ethtool_set_flags */ + + return (features & ~supported) != (data & ~supported); +} +EXPORT_SYMBOL(ethtool_invalid_flags); + +int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported) +{ + if (ethtool_invalid_flags(dev, data, supported)) + return -EINVAL; + + dev->features = ((dev->features & ~flags_dup_features) | + (data & flags_dup_features)); + return 0; +} +EXPORT_SYMBOL(ethtool_op_set_flags); + +void ethtool_ntuple_flush(struct net_device *dev) +{ + struct ethtool_rx_ntuple_flow_spec_container *fsc, *f; + + list_for_each_entry_safe(fsc, f, &dev->ethtool_ntuple_list.list, list) { + list_del(&fsc->list); + kfree(fsc); + } + dev->ethtool_ntuple_list.count = 0; +} +EXPORT_SYMBOL(ethtool_ntuple_flush); + +/* Handlers for each ethtool command */ + +#define ETHTOOL_DEV_FEATURE_WORDS 1 + +static void ethtool_get_features_compat(struct net_device *dev, + struct ethtool_get_features_block *features) +{ + if (!dev->ethtool_ops) + return; + + /* getting RX checksum */ + if (dev->ethtool_ops->get_rx_csum) + if (dev->ethtool_ops->get_rx_csum(dev)) + features[0].active |= NETIF_F_RXCSUM; + + /* mark legacy-changeable features */ + if (dev->ethtool_ops->set_sg) + features[0].available |= NETIF_F_SG; + if (dev->ethtool_ops->set_tx_csum) + features[0].available |= NETIF_F_ALL_CSUM; + if (dev->ethtool_ops->set_tso) + features[0].available |= NETIF_F_ALL_TSO; + if (dev->ethtool_ops->set_rx_csum) + features[0].available |= NETIF_F_RXCSUM; + if (dev->ethtool_ops->set_flags) + features[0].available |= flags_dup_features; +} + +static int ethtool_set_feature_compat(struct net_device *dev, + int (*legacy_set)(struct net_device *, u32), + struct ethtool_set_features_block *features, u32 mask) +{ + u32 do_set; + + if (!legacy_set) + return 0; + + if (!(features[0].valid & mask)) + return 0; + + features[0].valid &= ~mask; + + do_set = !!(features[0].requested & mask); + + if (legacy_set(dev, do_set) < 0) + netdev_info(dev, + "Legacy feature change (%s) failed for 0x%08x\n", + do_set ? 
"set" : "clear", mask); + + return 1; +} + +static int ethtool_set_flags_compat(struct net_device *dev, + int (*legacy_set)(struct net_device *, u32), + struct ethtool_set_features_block *features, u32 mask) +{ + u32 value; + + if (!legacy_set) + return 0; + + if (!(features[0].valid & mask)) + return 0; + + value = dev->features & ~features[0].valid; + value |= features[0].requested; + + features[0].valid &= ~mask; + + if (legacy_set(dev, value & mask) < 0) + netdev_info(dev, "Legacy flags change failed\n"); + + return 1; +} + +static int ethtool_set_features_compat(struct net_device *dev, + struct ethtool_set_features_block *features) +{ + int compat; + + if (!dev->ethtool_ops) + return 0; + + compat = ethtool_set_feature_compat(dev, dev->ethtool_ops->set_sg, + features, NETIF_F_SG); + compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_tx_csum, + features, NETIF_F_ALL_CSUM); + compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_tso, + features, NETIF_F_ALL_TSO); + compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_rx_csum, + features, NETIF_F_RXCSUM); + compat |= ethtool_set_flags_compat(dev, dev->ethtool_ops->set_flags, + features, flags_dup_features); + + return compat; +} + +static int ethtool_get_features(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_gfeatures cmd = { + .cmd = ETHTOOL_GFEATURES, + .size = ETHTOOL_DEV_FEATURE_WORDS, + }; + struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS] = { + { + .available = dev->hw_features, + .requested = dev->wanted_features, + .active = dev->features, + .never_changed = NETIF_F_NEVER_CHANGE, + }, + }; + u32 __user *sizeaddr; + u32 copy_size; + + ethtool_get_features_compat(dev, features); + + sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size); + if (get_user(copy_size, sizeaddr)) + return -EFAULT; + + if (copy_size > ETHTOOL_DEV_FEATURE_WORDS) + copy_size = ETHTOOL_DEV_FEATURE_WORDS; + + if (copy_to_user(useraddr, &cmd, sizeof(cmd))) + return -EFAULT; + useraddr += sizeof(cmd); + if (copy_to_user(useraddr, features, copy_size * sizeof(*features))) + return -EFAULT; + + return 0; +} + +static int ethtool_set_features(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_sfeatures cmd; + struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; + int ret = 0; + + if (copy_from_user(&cmd, useraddr, sizeof(cmd))) + return -EFAULT; + useraddr += sizeof(cmd); + + if (cmd.size != ETHTOOL_DEV_FEATURE_WORDS) + return -EINVAL; + + if (copy_from_user(features, useraddr, sizeof(features))) + return -EFAULT; + + if (features[0].valid & ~NETIF_F_ETHTOOL_BITS) + return -EINVAL; + + if (ethtool_set_features_compat(dev, features)) + ret |= ETHTOOL_F_COMPAT; + + if (features[0].valid & ~dev->hw_features) { + features[0].valid &= dev->hw_features; + ret |= ETHTOOL_F_UNSUPPORTED; + } + + dev->wanted_features &= ~features[0].valid; + dev->wanted_features |= features[0].valid & features[0].requested; + __netdev_update_features(dev); + + if ((dev->wanted_features ^ dev->features) & features[0].valid) + ret |= ETHTOOL_F_WISH; + + return ret; +} + +static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GSTRING_LEN] = { + /* NETIF_F_SG */ "tx-scatter-gather", + /* NETIF_F_IP_CSUM */ "tx-checksum-ipv4", + /* NETIF_F_NO_CSUM */ "tx-checksum-unneeded", + /* NETIF_F_HW_CSUM */ "tx-checksum-ip-generic", + /* NETIF_F_IPV6_CSUM */ "tx-checksum-ipv6", + /* NETIF_F_HIGHDMA */ "highdma", + /* NETIF_F_FRAGLIST */ "tx-scatter-gather-fraglist", 
+ /* NETIF_F_HW_VLAN_TX */ "tx-vlan-hw-insert", + + /* NETIF_F_HW_VLAN_RX */ "rx-vlan-hw-parse", + /* NETIF_F_HW_VLAN_FILTER */ "rx-vlan-filter", + /* NETIF_F_VLAN_CHALLENGED */ "vlan-challenged", + /* NETIF_F_GSO */ "tx-generic-segmentation", + /* NETIF_F_LLTX */ "tx-lockless", + /* NETIF_F_NETNS_LOCAL */ "netns-local", + /* NETIF_F_GRO */ "rx-gro", + /* NETIF_F_LRO */ "rx-lro", + + /* NETIF_F_TSO */ "tx-tcp-segmentation", + /* NETIF_F_UFO */ "tx-udp-fragmentation", + /* NETIF_F_GSO_ROBUST */ "tx-gso-robust", + /* NETIF_F_TSO_ECN */ "tx-tcp-ecn-segmentation", + /* NETIF_F_TSO6 */ "tx-tcp6-segmentation", + /* NETIF_F_FSO */ "tx-fcoe-segmentation", + "", + "", + + /* NETIF_F_FCOE_CRC */ "tx-checksum-fcoe-crc", + /* NETIF_F_SCTP_CSUM */ "tx-checksum-sctp", + /* NETIF_F_FCOE_MTU */ "fcoe-mtu", + /* NETIF_F_NTUPLE */ "rx-ntuple-filter", + /* NETIF_F_RXHASH */ "rx-hashing", + /* NETIF_F_RXCSUM */ "rx-checksum", + /* NETIF_F_NOCACHE_COPY */ "tx-nocache-copy", + /* NETIF_F_LOOPBACK */ "loopback", +}; + +static int __ethtool_get_sset_count(struct net_device *dev, int sset) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + + if (sset == ETH_SS_FEATURES) + return ARRAY_SIZE(netdev_features_strings); + + if (ops && ops->get_sset_count && ops->get_strings) + return ops->get_sset_count(dev, sset); + else + return -EOPNOTSUPP; +} + +static void __ethtool_get_strings(struct net_device *dev, + u32 stringset, u8 *data) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + + if (stringset == ETH_SS_FEATURES) + memcpy(data, netdev_features_strings, + sizeof(netdev_features_strings)); + else + /* ops->get_strings is valid because checked earlier */ + ops->get_strings(dev, stringset, data); +} + +static u32 ethtool_get_feature_mask(u32 eth_cmd) +{ + /* feature masks of legacy discrete ethtool ops */ + + switch (eth_cmd) { + case ETHTOOL_GTXCSUM: + case ETHTOOL_STXCSUM: + return NETIF_F_ALL_CSUM | NETIF_F_SCTP_CSUM; + case ETHTOOL_GRXCSUM: + case ETHTOOL_SRXCSUM: + return NETIF_F_RXCSUM; + case ETHTOOL_GSG: + case ETHTOOL_SSG: + return NETIF_F_SG; + case ETHTOOL_GTSO: + case ETHTOOL_STSO: + return NETIF_F_ALL_TSO; + case ETHTOOL_GUFO: + case ETHTOOL_SUFO: + return NETIF_F_UFO; + case ETHTOOL_GGSO: + case ETHTOOL_SGSO: + return NETIF_F_GSO; + case ETHTOOL_GGRO: + case ETHTOOL_SGRO: + return NETIF_F_GRO; + default: + BUG(); + } +} + +static void *__ethtool_get_one_feature_actor(struct net_device *dev, u32 ethcmd) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + + if (!ops) + return NULL; + + switch (ethcmd) { + case ETHTOOL_GTXCSUM: + return ops->get_tx_csum; + case ETHTOOL_GRXCSUM: + return ops->get_rx_csum; + case ETHTOOL_SSG: + return ops->get_sg; + case ETHTOOL_STSO: + return ops->get_tso; + case ETHTOOL_SUFO: + return ops->get_ufo; + default: + return NULL; + } +} + +static u32 __ethtool_get_rx_csum_oldbug(struct net_device *dev) +{ + return !!(dev->features & NETIF_F_ALL_CSUM); +} + +static int ethtool_get_one_feature(struct net_device *dev, + char __user *useraddr, u32 ethcmd) +{ + u32 mask = ethtool_get_feature_mask(ethcmd); + struct ethtool_value edata = { + .cmd = ethcmd, + .data = !!(dev->features & mask), + }; + + /* compatibility with discrete get_ ops */ + if (!(dev->hw_features & mask)) { + u32 (*actor)(struct net_device *); + + actor = __ethtool_get_one_feature_actor(dev, ethcmd); + + /* bug compatibility with old get_rx_csum */ + if (ethcmd == ETHTOOL_GRXCSUM && !actor) + actor = __ethtool_get_rx_csum_oldbug; + + if (actor) + edata.data = actor(dev); + } + + if 
(copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int __ethtool_set_tx_csum(struct net_device *dev, u32 data); +static int __ethtool_set_rx_csum(struct net_device *dev, u32 data); +static int __ethtool_set_sg(struct net_device *dev, u32 data); +static int __ethtool_set_tso(struct net_device *dev, u32 data); +static int __ethtool_set_ufo(struct net_device *dev, u32 data); + +static int ethtool_set_one_feature(struct net_device *dev, + void __user *useraddr, u32 ethcmd) +{ + struct ethtool_value edata; + u32 mask; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + mask = ethtool_get_feature_mask(ethcmd); + mask &= dev->hw_features; + if (mask) { + if (edata.data) + dev->wanted_features |= mask; + else + dev->wanted_features &= ~mask; + + __netdev_update_features(dev); + return 0; + } + + /* Driver is not converted to ndo_fix_features or does not + * support changing this offload. In the latter case it won't + * have corresponding ethtool_ops field set. + * + * Following part is to be removed after all drivers advertise + * their changeable features in netdev->hw_features and stop + * using discrete offload setting ops. + */ + + switch (ethcmd) { + case ETHTOOL_STXCSUM: + return __ethtool_set_tx_csum(dev, edata.data); + case ETHTOOL_SRXCSUM: + return __ethtool_set_rx_csum(dev, edata.data); + case ETHTOOL_SSG: + return __ethtool_set_sg(dev, edata.data); + case ETHTOOL_STSO: + return __ethtool_set_tso(dev, edata.data); + case ETHTOOL_SUFO: + return __ethtool_set_ufo(dev, edata.data); + default: + return -EOPNOTSUPP; + } +} + +int __ethtool_set_flags(struct net_device *dev, u32 data) +{ + u32 changed; + + if (data & ~flags_dup_features) + return -EINVAL; + + /* legacy set_flags() op */ + if (dev->ethtool_ops->set_flags) { + if (unlikely(dev->hw_features & flags_dup_features)) + netdev_warn(dev, + "driver BUG: mixed hw_features and set_flags()\n"); + return dev->ethtool_ops->set_flags(dev, data); + } + + /* allow changing only bits set in hw_features */ + changed = (data ^ dev->features) & flags_dup_features; + if (changed & ~dev->hw_features) + return (changed & dev->hw_features) ? 
-EINVAL : -EOPNOTSUPP; + + dev->wanted_features = + (dev->wanted_features & ~changed) | (data & dev->hw_features); + + __netdev_update_features(dev); + + return 0; +} + +static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_cmd cmd = { .cmd = ETHTOOL_GSET }; + int err; + + if (!dev->ethtool_ops->get_settings) + return -EOPNOTSUPP; + + err = dev->ethtool_ops->get_settings(dev, &cmd); + if (err < 0) + return err; + + if (copy_to_user(useraddr, &cmd, sizeof(cmd))) + return -EFAULT; + return 0; +} + +static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_cmd cmd; + + if (!dev->ethtool_ops->set_settings) + return -EOPNOTSUPP; + + if (copy_from_user(&cmd, useraddr, sizeof(cmd))) + return -EFAULT; + + return dev->ethtool_ops->set_settings(dev, &cmd); +} + +static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_drvinfo info; + const struct ethtool_ops *ops = dev->ethtool_ops; + + memset(&info, 0, sizeof(info)); + info.cmd = ETHTOOL_GDRVINFO; + if (ops && ops->get_drvinfo) { + ops->get_drvinfo(dev, &info); + } else if (dev->dev.parent && dev->dev.parent->driver) { + strlcpy(info.bus_info, dev_name(dev->dev.parent), + sizeof(info.bus_info)); + strlcpy(info.driver, dev->dev.parent->driver->name, + sizeof(info.driver)); + } else { + return -EOPNOTSUPP; + } + + /* + * this method of obtaining string set info is deprecated; + * Use ETHTOOL_GSSET_INFO instead. + */ + if (ops && ops->get_sset_count) { + int rc; + + rc = ops->get_sset_count(dev, ETH_SS_TEST); + if (rc >= 0) + info.testinfo_len = rc; + rc = ops->get_sset_count(dev, ETH_SS_STATS); + if (rc >= 0) + info.n_stats = rc; + rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS); + if (rc >= 0) + info.n_priv_flags = rc; + } + if (ops && ops->get_regs_len) + info.regdump_len = ops->get_regs_len(dev); + if (ops && ops->get_eeprom_len) + info.eedump_len = ops->get_eeprom_len(dev); + + if (copy_to_user(useraddr, &info, sizeof(info))) + return -EFAULT; + return 0; +} + +static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_sset_info info; + u64 sset_mask; + int i, idx = 0, n_bits = 0, ret, rc; + u32 *info_buf = NULL; + + if (copy_from_user(&info, useraddr, sizeof(info))) + return -EFAULT; + + /* store copy of mask, because we zero struct later on */ + sset_mask = info.sset_mask; + if (!sset_mask) + return 0; + + /* calculate size of return buffer */ + n_bits = hweight64(sset_mask); + + memset(&info, 0, sizeof(info)); + info.cmd = ETHTOOL_GSSET_INFO; + + info_buf = kzalloc(n_bits * sizeof(u32), GFP_USER); + if (!info_buf) + return -ENOMEM; + + /* + * fill return buffer based on input bitmask and successful + * get_sset_count return + */ + for (i = 0; i < 64; i++) { + if (!(sset_mask & (1ULL << i))) + continue; + + rc = __ethtool_get_sset_count(dev, i); + if (rc >= 0) { + info.sset_mask |= (1ULL << i); + info_buf[idx++] = rc; + } + } + + ret = -EFAULT; + if (copy_to_user(useraddr, &info, sizeof(info))) + goto out; + + useraddr += offsetof(struct ethtool_sset_info, data); + if (copy_to_user(useraddr, info_buf, idx * sizeof(u32))) + goto out; + + ret = 0; + +out: + kfree(info_buf); + return ret; +} + +static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, + u32 cmd, void __user *useraddr) +{ + struct ethtool_rxnfc info; + size_t info_size = sizeof(info); + + if (!dev->ethtool_ops->set_rxnfc) + return -EOPNOTSUPP; + + /* struct 
ethtool_rxnfc was originally defined for + * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data + * members. User-space might still be using that + * definition. */ + if (cmd == ETHTOOL_SRXFH) + info_size = (offsetof(struct ethtool_rxnfc, data) + + sizeof(info.data)); + + if (copy_from_user(&info, useraddr, info_size)) + return -EFAULT; + + return dev->ethtool_ops->set_rxnfc(dev, &info); +} + +static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, + u32 cmd, void __user *useraddr) +{ + struct ethtool_rxnfc info; + size_t info_size = sizeof(info); + const struct ethtool_ops *ops = dev->ethtool_ops; + int ret; + void *rule_buf = NULL; + + if (!ops->get_rxnfc) + return -EOPNOTSUPP; + + /* struct ethtool_rxnfc was originally defined for + * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data + * members. User-space might still be using that + * definition. */ + if (cmd == ETHTOOL_GRXFH) + info_size = (offsetof(struct ethtool_rxnfc, data) + + sizeof(info.data)); + + if (copy_from_user(&info, useraddr, info_size)) + return -EFAULT; + + if (info.cmd == ETHTOOL_GRXCLSRLALL) { + if (info.rule_cnt > 0) { + if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) + rule_buf = kzalloc(info.rule_cnt * sizeof(u32), + GFP_USER); + if (!rule_buf) + return -ENOMEM; + } + } + + ret = ops->get_rxnfc(dev, &info, rule_buf); + if (ret < 0) + goto err_out; + + ret = -EFAULT; + if (copy_to_user(useraddr, &info, info_size)) + goto err_out; + + if (rule_buf) { + useraddr += offsetof(struct ethtool_rxnfc, rule_locs); + if (copy_to_user(useraddr, rule_buf, + info.rule_cnt * sizeof(u32))) + goto err_out; + } + ret = 0; + +err_out: + kfree(rule_buf); + + return ret; +} + +static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_rxfh_indir *indir; + u32 table_size; + size_t full_size; + int ret; + + if (!dev->ethtool_ops->get_rxfh_indir) + return -EOPNOTSUPP; + + if (copy_from_user(&table_size, + useraddr + offsetof(struct ethtool_rxfh_indir, size), + sizeof(table_size))) + return -EFAULT; + + if (table_size > + (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index)) + return -ENOMEM; + full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size; + indir = kzalloc(full_size, GFP_USER); + if (!indir) + return -ENOMEM; + + indir->cmd = ETHTOOL_GRXFHINDIR; + indir->size = table_size; + ret = dev->ethtool_ops->get_rxfh_indir(dev, indir); + if (ret) + goto out; + + if (copy_to_user(useraddr, indir, full_size)) + ret = -EFAULT; + +out: + kfree(indir); + return ret; +} + +static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_rxfh_indir *indir; + u32 table_size; + size_t full_size; + int ret; + + if (!dev->ethtool_ops->set_rxfh_indir) + return -EOPNOTSUPP; + + if (copy_from_user(&table_size, + useraddr + offsetof(struct ethtool_rxfh_indir, size), + sizeof(table_size))) + return -EFAULT; + + if (table_size > + (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index)) + return -ENOMEM; + full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size; + indir = kmalloc(full_size, GFP_USER); + if (!indir) + return -ENOMEM; + + if (copy_from_user(indir, useraddr, full_size)) { + ret = -EFAULT; + goto out; + } + + ret = dev->ethtool_ops->set_rxfh_indir(dev, indir); + +out: + kfree(indir); + return ret; +} + +static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list, + struct ethtool_rx_ntuple_flow_spec *spec, + struct 
ethtool_rx_ntuple_flow_spec_container *fsc) +{ + + /* don't add filters forever */ + if (list->count >= ETHTOOL_MAX_NTUPLE_LIST_ENTRY) { + /* free the container */ + kfree(fsc); + return; + } + + /* Copy the whole filter over */ + fsc->fs.flow_type = spec->flow_type; + memcpy(&fsc->fs.h_u, &spec->h_u, sizeof(spec->h_u)); + memcpy(&fsc->fs.m_u, &spec->m_u, sizeof(spec->m_u)); + + fsc->fs.vlan_tag = spec->vlan_tag; + fsc->fs.vlan_tag_mask = spec->vlan_tag_mask; + fsc->fs.data = spec->data; + fsc->fs.data_mask = spec->data_mask; + fsc->fs.action = spec->action; + + /* add to the list */ + list_add_tail_rcu(&fsc->list, &list->list); + list->count++; +} + +/* + * ethtool does not (or did not) set masks for flow parameters that are + * not specified, so if both value and mask are 0 then this must be + * treated as equivalent to a mask with all bits set. Implement that + * here rather than in drivers. + */ +static void rx_ntuple_fix_masks(struct ethtool_rx_ntuple_flow_spec *fs) +{ + struct ethtool_tcpip4_spec *entry = &fs->h_u.tcp_ip4_spec; + struct ethtool_tcpip4_spec *mask = &fs->m_u.tcp_ip4_spec; + + if (fs->flow_type != TCP_V4_FLOW && + fs->flow_type != UDP_V4_FLOW && + fs->flow_type != SCTP_V4_FLOW) + return; + + if (!(entry->ip4src | mask->ip4src)) + mask->ip4src = htonl(0xffffffff); + if (!(entry->ip4dst | mask->ip4dst)) + mask->ip4dst = htonl(0xffffffff); + if (!(entry->psrc | mask->psrc)) + mask->psrc = htons(0xffff); + if (!(entry->pdst | mask->pdst)) + mask->pdst = htons(0xffff); + if (!(entry->tos | mask->tos)) + mask->tos = 0xff; + if (!(fs->vlan_tag | fs->vlan_tag_mask)) + fs->vlan_tag_mask = 0xffff; + if (!(fs->data | fs->data_mask)) + fs->data_mask = 0xffffffffffffffffULL; +} + +static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_rx_ntuple cmd; + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rx_ntuple_flow_spec_container *fsc = NULL; + int ret; + + if (!ops->set_rx_ntuple) + return -EOPNOTSUPP; + + if (!(dev->features & NETIF_F_NTUPLE)) + return -EINVAL; + + if (copy_from_user(&cmd, useraddr, sizeof(cmd))) + return -EFAULT; + + rx_ntuple_fix_masks(&cmd.fs); + + /* + * Cache filter in dev struct for GET operation only if + * the underlying driver doesn't have its own GET operation, and + * only if the filter was added successfully. First make sure we + * can allocate the filter, then continue if successful. 
+ */ + if (!ops->get_rx_ntuple) { + fsc = kmalloc(sizeof(*fsc), GFP_ATOMIC); + if (!fsc) + return -ENOMEM; + } + + ret = ops->set_rx_ntuple(dev, &cmd); + if (ret) { + kfree(fsc); + return ret; + } + + if (!ops->get_rx_ntuple) + __rx_ntuple_filter_add(&dev->ethtool_ntuple_list, &cmd.fs, fsc); + + return ret; +} + +static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_gstrings gstrings; + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rx_ntuple_flow_spec_container *fsc; + u8 *data; + char *p; + int ret, i, num_strings = 0; + + if (!ops->get_sset_count) + return -EOPNOTSUPP; + + if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) + return -EFAULT; + + ret = ops->get_sset_count(dev, gstrings.string_set); + if (ret < 0) + return ret; + + gstrings.len = ret; + + data = kzalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); + if (!data) + return -ENOMEM; + + if (ops->get_rx_ntuple) { + /* driver-specific filter grab */ + ret = ops->get_rx_ntuple(dev, gstrings.string_set, data); + goto copy; + } + + /* default ethtool filter grab */ + i = 0; + p = (char *)data; + list_for_each_entry(fsc, &dev->ethtool_ntuple_list.list, list) { + sprintf(p, "Filter %d:\n", i); + p += ETH_GSTRING_LEN; + num_strings++; + + switch (fsc->fs.flow_type) { + case TCP_V4_FLOW: + sprintf(p, "\tFlow Type: TCP\n"); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case UDP_V4_FLOW: + sprintf(p, "\tFlow Type: UDP\n"); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case SCTP_V4_FLOW: + sprintf(p, "\tFlow Type: SCTP\n"); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case AH_ESP_V4_FLOW: + sprintf(p, "\tFlow Type: AH ESP\n"); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case ESP_V4_FLOW: + sprintf(p, "\tFlow Type: ESP\n"); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case IP_USER_FLOW: + sprintf(p, "\tFlow Type: Raw IP\n"); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case IPV4_FLOW: + sprintf(p, "\tFlow Type: IPv4\n"); + p += ETH_GSTRING_LEN; + num_strings++; + break; + default: + sprintf(p, "\tFlow Type: Unknown\n"); + p += ETH_GSTRING_LEN; + num_strings++; + goto unknown_filter; + } + + /* now the rest of the filters */ + switch (fsc->fs.flow_type) { + case TCP_V4_FLOW: + case UDP_V4_FLOW: + case SCTP_V4_FLOW: + sprintf(p, "\tSrc IP addr: 0x%x\n", + fsc->fs.h_u.tcp_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tSrc IP mask: 0x%x\n", + fsc->fs.m_u.tcp_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP addr: 0x%x\n", + fsc->fs.h_u.tcp_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP mask: 0x%x\n", + fsc->fs.m_u.tcp_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tSrc Port: %d, mask: 0x%x\n", + fsc->fs.h_u.tcp_ip4_spec.psrc, + fsc->fs.m_u.tcp_ip4_spec.psrc); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest Port: %d, mask: 0x%x\n", + fsc->fs.h_u.tcp_ip4_spec.pdst, + fsc->fs.m_u.tcp_ip4_spec.pdst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tTOS: %d, mask: 0x%x\n", + fsc->fs.h_u.tcp_ip4_spec.tos, + fsc->fs.m_u.tcp_ip4_spec.tos); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case AH_ESP_V4_FLOW: + case ESP_V4_FLOW: + sprintf(p, "\tSrc IP addr: 0x%x\n", + fsc->fs.h_u.ah_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tSrc IP mask: 0x%x\n", + fsc->fs.m_u.ah_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP addr: 0x%x\n", + 
fsc->fs.h_u.ah_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP mask: 0x%x\n", + fsc->fs.m_u.ah_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tSPI: %d, mask: 0x%x\n", + fsc->fs.h_u.ah_ip4_spec.spi, + fsc->fs.m_u.ah_ip4_spec.spi); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tTOS: %d, mask: 0x%x\n", + fsc->fs.h_u.ah_ip4_spec.tos, + fsc->fs.m_u.ah_ip4_spec.tos); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case IP_USER_FLOW: + sprintf(p, "\tSrc IP addr: 0x%x\n", + fsc->fs.h_u.usr_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tSrc IP mask: 0x%x\n", + fsc->fs.m_u.usr_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP addr: 0x%x\n", + fsc->fs.h_u.usr_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP mask: 0x%x\n", + fsc->fs.m_u.usr_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + break; + case IPV4_FLOW: + sprintf(p, "\tSrc IP addr: 0x%x\n", + fsc->fs.h_u.usr_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tSrc IP mask: 0x%x\n", + fsc->fs.m_u.usr_ip4_spec.ip4src); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP addr: 0x%x\n", + fsc->fs.h_u.usr_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tDest IP mask: 0x%x\n", + fsc->fs.m_u.usr_ip4_spec.ip4dst); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tL4 bytes: 0x%x, mask: 0x%x\n", + fsc->fs.h_u.usr_ip4_spec.l4_4_bytes, + fsc->fs.m_u.usr_ip4_spec.l4_4_bytes); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tTOS: %d, mask: 0x%x\n", + fsc->fs.h_u.usr_ip4_spec.tos, + fsc->fs.m_u.usr_ip4_spec.tos); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tIP Version: %d, mask: 0x%x\n", + fsc->fs.h_u.usr_ip4_spec.ip_ver, + fsc->fs.m_u.usr_ip4_spec.ip_ver); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tProtocol: %d, mask: 0x%x\n", + fsc->fs.h_u.usr_ip4_spec.proto, + fsc->fs.m_u.usr_ip4_spec.proto); + p += ETH_GSTRING_LEN; + num_strings++; + break; + } + sprintf(p, "\tVLAN: %d, mask: 0x%x\n", + fsc->fs.vlan_tag, fsc->fs.vlan_tag_mask); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tUser-defined: 0x%Lx\n", fsc->fs.data); + p += ETH_GSTRING_LEN; + num_strings++; + sprintf(p, "\tUser-defined mask: 0x%Lx\n", fsc->fs.data_mask); + p += ETH_GSTRING_LEN; + num_strings++; + if (fsc->fs.action == ETHTOOL_RXNTUPLE_ACTION_DROP) + sprintf(p, "\tAction: Drop\n"); + else + sprintf(p, "\tAction: Direct to queue %d\n", + fsc->fs.action); + p += ETH_GSTRING_LEN; + num_strings++; +unknown_filter: + i++; + } +copy: + /* indicate to userspace how many strings we actually have */ + gstrings.len = num_strings; + ret = -EFAULT; + if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) + goto out; + useraddr += sizeof(gstrings); + if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) + goto out; + ret = 0; + +out: + kfree(data); + return ret; +} + +static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_regs regs; + const struct ethtool_ops *ops = dev->ethtool_ops; + void *regbuf; + int reglen, ret; + + if (!ops->get_regs || !ops->get_regs_len) + return -EOPNOTSUPP; + + if (copy_from_user(®s, useraddr, sizeof(regs))) + return -EFAULT; + + reglen = ops->get_regs_len(dev); + if (regs.len > reglen) + regs.len = reglen; + + regbuf = vzalloc(reglen); + if (reglen && !regbuf) + return -ENOMEM; + + ops->get_regs(dev, ®s, regbuf); + + ret = -EFAULT; + 
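/* copy the amended ethtool_regs header back to user space, then the register dump itself (regbuf may be NULL when reglen is zero) */ +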
if (copy_to_user(useraddr, ®s, sizeof(regs))) + goto out; + useraddr += offsetof(struct ethtool_regs, data); + if (regbuf && copy_to_user(useraddr, regbuf, regs.len)) + goto out; + ret = 0; + + out: + vfree(regbuf); + return ret; +} + +static int ethtool_reset(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value reset; + int ret; + + if (!dev->ethtool_ops->reset) + return -EOPNOTSUPP; + + if (copy_from_user(&reset, useraddr, sizeof(reset))) + return -EFAULT; + + ret = dev->ethtool_ops->reset(dev, &reset.data); + if (ret) + return ret; + + if (copy_to_user(useraddr, &reset, sizeof(reset))) + return -EFAULT; + return 0; +} + +static int ethtool_get_wol(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; + + if (!dev->ethtool_ops->get_wol) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_wol(dev, &wol); + + if (copy_to_user(useraddr, &wol, sizeof(wol))) + return -EFAULT; + return 0; +} + +static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_wolinfo wol; + + if (!dev->ethtool_ops->set_wol) + return -EOPNOTSUPP; + + if (copy_from_user(&wol, useraddr, sizeof(wol))) + return -EFAULT; + + return dev->ethtool_ops->set_wol(dev, &wol); +} + +static int ethtool_nway_reset(struct net_device *dev) +{ + if (!dev->ethtool_ops->nway_reset) + return -EOPNOTSUPP; + + return dev->ethtool_ops->nway_reset(dev); +} + +static int ethtool_get_link(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata = { .cmd = ETHTOOL_GLINK }; + + if (!dev->ethtool_ops->get_link) + return -EOPNOTSUPP; + + edata.data = netif_running(dev) && dev->ethtool_ops->get_link(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_eeprom eeprom; + const struct ethtool_ops *ops = dev->ethtool_ops; + void __user *userbuf = useraddr + sizeof(eeprom); + u32 bytes_remaining; + u8 *data; + int ret = 0; + + if (!ops->get_eeprom || !ops->get_eeprom_len) + return -EOPNOTSUPP; + + if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) + return -EFAULT; + + /* Check for wrap and zero */ + if (eeprom.offset + eeprom.len <= eeprom.offset) + return -EINVAL; + + /* Check for exceeding total eeprom len */ + if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) + return -EINVAL; + + data = kmalloc(PAGE_SIZE, GFP_USER); + if (!data) + return -ENOMEM; + + bytes_remaining = eeprom.len; + while (bytes_remaining > 0) { + eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE); + + ret = ops->get_eeprom(dev, &eeprom, data); + if (ret) + break; + if (copy_to_user(userbuf, data, eeprom.len)) { + ret = -EFAULT; + break; + } + userbuf += eeprom.len; + eeprom.offset += eeprom.len; + bytes_remaining -= eeprom.len; + } + + eeprom.len = userbuf - (useraddr + sizeof(eeprom)); + eeprom.offset -= eeprom.len; + if (copy_to_user(useraddr, &eeprom, sizeof(eeprom))) + ret = -EFAULT; + + kfree(data); + return ret; +} + +static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_eeprom eeprom; + const struct ethtool_ops *ops = dev->ethtool_ops; + void __user *userbuf = useraddr + sizeof(eeprom); + u32 bytes_remaining; + u8 *data; + int ret = 0; + + if (!ops->set_eeprom || !ops->get_eeprom_len) + return -EOPNOTSUPP; + + if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) + return -EFAULT; + + /* Check for wrap and zero */ + if (eeprom.offset + eeprom.len <= 
eeprom.offset) + return -EINVAL; + + /* Check for exceeding total eeprom len */ + if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) + return -EINVAL; + + data = kmalloc(PAGE_SIZE, GFP_USER); + if (!data) + return -ENOMEM; + + bytes_remaining = eeprom.len; + while (bytes_remaining > 0) { + eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE); + + if (copy_from_user(data, userbuf, eeprom.len)) { + ret = -EFAULT; + break; + } + ret = ops->set_eeprom(dev, &eeprom, data); + if (ret) + break; + userbuf += eeprom.len; + eeprom.offset += eeprom.len; + bytes_remaining -= eeprom.len; + } + + kfree(data); + return ret; +} + +static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; + + if (!dev->ethtool_ops->get_coalesce) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_coalesce(dev, &coalesce); + + if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) + return -EFAULT; + return 0; +} + +static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_coalesce coalesce; + + if (!dev->ethtool_ops->set_coalesce) + return -EOPNOTSUPP; + + if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) + return -EFAULT; + + return dev->ethtool_ops->set_coalesce(dev, &coalesce); +} + +static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_ringparam ringparam = { .cmd = ETHTOOL_GRINGPARAM }; + + if (!dev->ethtool_ops->get_ringparam) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_ringparam(dev, &ringparam); + + if (copy_to_user(useraddr, &ringparam, sizeof(ringparam))) + return -EFAULT; + return 0; +} + +static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_ringparam ringparam; + + if (!dev->ethtool_ops->set_ringparam) + return -EOPNOTSUPP; + + if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) + return -EFAULT; + + return dev->ethtool_ops->set_ringparam(dev, &ringparam); +} + +static noinline_for_stack int ethtool_get_channels(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; + + if (!dev->ethtool_ops->get_channels) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_channels(dev, &channels); + + if (copy_to_user(useraddr, &channels, sizeof(channels))) + return -EFAULT; + return 0; +} + +static noinline_for_stack int ethtool_set_channels(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_channels channels; + + if (!dev->ethtool_ops->set_channels) + return -EOPNOTSUPP; + + if (copy_from_user(&channels, useraddr, sizeof(channels))) + return -EFAULT; + + return dev->ethtool_ops->set_channels(dev, &channels); +} + +static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM }; + + if (!dev->ethtool_ops->get_pauseparam) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_pauseparam(dev, &pauseparam); + + if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam))) + return -EFAULT; + return 0; +} + +static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_pauseparam pauseparam; + + if (!dev->ethtool_ops->set_pauseparam) + return -EOPNOTSUPP; + + if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam))) + return -EFAULT; + + return dev->ethtool_ops->set_pauseparam(dev, &pauseparam); +} + +static int __ethtool_set_sg(struct net_device 
*dev, u32 data) +{ + int err; + + if (!dev->ethtool_ops->set_sg) + return -EOPNOTSUPP; + + if (data && !(dev->features & NETIF_F_ALL_CSUM)) + return -EINVAL; + + if (!data && dev->ethtool_ops->set_tso) { + err = dev->ethtool_ops->set_tso(dev, 0); + if (err) + return err; + } + + if (!data && dev->ethtool_ops->set_ufo) { + err = dev->ethtool_ops->set_ufo(dev, 0); + if (err) + return err; + } + return dev->ethtool_ops->set_sg(dev, data); +} + +static int __ethtool_set_tx_csum(struct net_device *dev, u32 data) +{ + int err; + + if (!dev->ethtool_ops->set_tx_csum) + return -EOPNOTSUPP; + + if (!data && dev->ethtool_ops->set_sg) { + err = __ethtool_set_sg(dev, 0); + if (err) + return err; + } + + return dev->ethtool_ops->set_tx_csum(dev, data); +} + +static int __ethtool_set_rx_csum(struct net_device *dev, u32 data) +{ + if (!dev->ethtool_ops->set_rx_csum) + return -EOPNOTSUPP; + + if (!data) + dev->features &= ~NETIF_F_GRO; + + return dev->ethtool_ops->set_rx_csum(dev, data); +} + +static int __ethtool_set_tso(struct net_device *dev, u32 data) +{ + if (!dev->ethtool_ops->set_tso) + return -EOPNOTSUPP; + + if (data && !(dev->features & NETIF_F_SG)) + return -EINVAL; + + return dev->ethtool_ops->set_tso(dev, data); +} + +static int __ethtool_set_ufo(struct net_device *dev, u32 data) +{ + if (!dev->ethtool_ops->set_ufo) + return -EOPNOTSUPP; + if (data && !(dev->features & NETIF_F_SG)) + return -EINVAL; + if (data && !((dev->features & NETIF_F_GEN_CSUM) || + (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) + == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) + return -EINVAL; + return dev->ethtool_ops->set_ufo(dev, data); +} + +static int ethtool_self_test(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_test test; + const struct ethtool_ops *ops = dev->ethtool_ops; + u64 *data; + int ret, test_len; + + if (!ops->self_test || !ops->get_sset_count) + return -EOPNOTSUPP; + + test_len = ops->get_sset_count(dev, ETH_SS_TEST); + if (test_len < 0) + return test_len; + WARN_ON(test_len == 0); + + if (copy_from_user(&test, useraddr, sizeof(test))) + return -EFAULT; + + test.len = test_len; + data = kmalloc(test_len * sizeof(u64), GFP_USER); + if (!data) + return -ENOMEM; + + ops->self_test(dev, &test, data); + + ret = -EFAULT; + if (copy_to_user(useraddr, &test, sizeof(test))) + goto out; + useraddr += sizeof(test); + if (copy_to_user(useraddr, data, test.len * sizeof(u64))) + goto out; + ret = 0; + + out: + kfree(data); + return ret; +} + +static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_gstrings gstrings; + u8 *data; + int ret; + + if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) + return -EFAULT; + + ret = __ethtool_get_sset_count(dev, gstrings.string_set); + if (ret < 0) + return ret; + + gstrings.len = ret; + + data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); + if (!data) + return -ENOMEM; + + __ethtool_get_strings(dev, gstrings.string_set, data); + + ret = -EFAULT; + if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) + goto out; + useraddr += sizeof(gstrings); + if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) + goto out; + ret = 0; + +out: + kfree(data); + return ret; +} + +static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_value id; + static bool busy; + int rc; + + if (!dev->ethtool_ops->set_phys_id) + return -EOPNOTSUPP; + + if (busy) + return -EBUSY; + + if (copy_from_user(&id, useraddr, sizeof(id))) + return -EFAULT; + + rc = 
dev->ethtool_ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE); + if (rc < 0) + return rc; + + /* Drop the RTNL lock while waiting, but prevent reentry or + * removal of the device. + */ + busy = true; + dev_hold(dev); + rtnl_unlock(); + + if (rc == 0) { + /* Driver will handle this itself */ + schedule_timeout_interruptible( + id.data ? (id.data * HZ) : MAX_SCHEDULE_TIMEOUT); + } else { + /* Driver expects to be called at twice the frequency in rc */ + int n = rc * 2, i, interval = HZ / n; + + /* Count down seconds */ + do { + /* Count down iterations per second */ + i = n; + do { + rtnl_lock(); + rc = dev->ethtool_ops->set_phys_id(dev, + (i & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON); + rtnl_unlock(); + if (rc) + break; + schedule_timeout_interruptible(interval); + } while (!signal_pending(current) && --i != 0); + } while (!signal_pending(current) && + (id.data == 0 || --id.data != 0)); + } + + rtnl_lock(); + dev_put(dev); + busy = false; + + (void)dev->ethtool_ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE); + return rc; +} + +static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_stats stats; + const struct ethtool_ops *ops = dev->ethtool_ops; + u64 *data; + int ret, n_stats; + + if (!ops->get_ethtool_stats || !ops->get_sset_count) + return -EOPNOTSUPP; + + n_stats = ops->get_sset_count(dev, ETH_SS_STATS); + if (n_stats < 0) + return n_stats; + WARN_ON(n_stats == 0); + + if (copy_from_user(&stats, useraddr, sizeof(stats))) + return -EFAULT; + + stats.n_stats = n_stats; + data = kmalloc(n_stats * sizeof(u64), GFP_USER); + if (!data) + return -ENOMEM; + + ops->get_ethtool_stats(dev, &stats, data); + + ret = -EFAULT; + if (copy_to_user(useraddr, &stats, sizeof(stats))) + goto out; + useraddr += sizeof(stats); + if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64))) + goto out; + ret = 0; + + out: + kfree(data); + return ret; +} + +static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_perm_addr epaddr; + + if (copy_from_user(&epaddr, useraddr, sizeof(epaddr))) + return -EFAULT; + + if (epaddr.size < dev->addr_len) + return -ETOOSMALL; + epaddr.size = dev->addr_len; + + if (copy_to_user(useraddr, &epaddr, sizeof(epaddr))) + return -EFAULT; + useraddr += sizeof(epaddr); + if (copy_to_user(useraddr, dev->perm_addr, epaddr.size)) + return -EFAULT; + return 0; +} + +static int ethtool_get_value(struct net_device *dev, char __user *useraddr, + u32 cmd, u32 (*actor)(struct net_device *)) +{ + struct ethtool_value edata = { .cmd = cmd }; + + if (!actor) + return -EOPNOTSUPP; + + edata.data = actor(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_set_value_void(struct net_device *dev, char __user *useraddr, + void (*actor)(struct net_device *, u32)) +{ + struct ethtool_value edata; + + if (!actor) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + actor(dev, edata.data); + return 0; +} + +static int ethtool_set_value(struct net_device *dev, char __user *useraddr, + int (*actor)(struct net_device *, u32)) +{ + struct ethtool_value edata; + + if (!actor) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + return actor(dev, edata.data); +} + +static noinline_for_stack int ethtool_flash_device(struct net_device *dev, + char __user *useraddr) +{ + struct ethtool_flash efl; + + if (copy_from_user(&efl, useraddr, sizeof(efl))) + return -EFAULT; + + if 
(!dev->ethtool_ops->flash_device) + return -EOPNOTSUPP; + + return dev->ethtool_ops->flash_device(dev, &efl); +} + +static int ethtool_set_dump(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_dump dump; + + if (!dev->ethtool_ops->set_dump) + return -EOPNOTSUPP; + + if (copy_from_user(&dump, useraddr, sizeof(dump))) + return -EFAULT; + + return dev->ethtool_ops->set_dump(dev, &dump); +} + +static int ethtool_get_dump_flag(struct net_device *dev, + void __user *useraddr) +{ + int ret; + struct ethtool_dump dump; + const struct ethtool_ops *ops = dev->ethtool_ops; + + if (!dev->ethtool_ops->get_dump_flag) + return -EOPNOTSUPP; + + if (copy_from_user(&dump, useraddr, sizeof(dump))) + return -EFAULT; + + ret = ops->get_dump_flag(dev, &dump); + if (ret) + return ret; + + if (copy_to_user(useraddr, &dump, sizeof(dump))) + return -EFAULT; + return 0; +} + +static int ethtool_get_dump_data(struct net_device *dev, + void __user *useraddr) +{ + int ret; + __u32 len; + struct ethtool_dump dump, tmp; + const struct ethtool_ops *ops = dev->ethtool_ops; + void *data = NULL; + + if (!dev->ethtool_ops->get_dump_data || + !dev->ethtool_ops->get_dump_flag) + return -EOPNOTSUPP; + + if (copy_from_user(&dump, useraddr, sizeof(dump))) + return -EFAULT; + + memset(&tmp, 0, sizeof(tmp)); + tmp.cmd = ETHTOOL_GET_DUMP_FLAG; + ret = ops->get_dump_flag(dev, &tmp); + if (ret) + return ret; + + len = (tmp.len > dump.len) ? dump.len : tmp.len; + if (!len) + return -EFAULT; + + data = vzalloc(tmp.len); + if (!data) + return -ENOMEM; + ret = ops->get_dump_data(dev, &dump, data); + if (ret) + goto out; + + if (copy_to_user(useraddr, &dump, sizeof(dump))) { + ret = -EFAULT; + goto out; + } + useraddr += offsetof(struct ethtool_dump, data); + if (copy_to_user(useraddr, data, len)) + ret = -EFAULT; +out: + vfree(data); + return ret; +} + +/* The main entry point in this file. Called from net/core/dev.c */ + +int dev_ethtool(struct net *net, struct ifreq *ifr) +{ + struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); + void __user *useraddr = ifr->ifr_data; + u32 ethcmd; + int rc; + u32 old_features; + + if (!dev || !netif_device_present(dev)) + return -ENODEV; + + if (copy_from_user(ðcmd, useraddr, sizeof(ethcmd))) + return -EFAULT; + + if (!dev->ethtool_ops) { + /* ETHTOOL_GDRVINFO does not require any driver support. + * It is also unprivileged and does not change anything, + * so we can take a shortcut to it. 
*/ + if (ethcmd == ETHTOOL_GDRVINFO) + return ethtool_get_drvinfo(dev, useraddr); + else + return -EOPNOTSUPP; + } + + /* Allow some commands to be done by anyone */ + switch (ethcmd) { + case ETHTOOL_GSET: + case ETHTOOL_GDRVINFO: + case ETHTOOL_GMSGLVL: + case ETHTOOL_GCOALESCE: + case ETHTOOL_GRINGPARAM: + case ETHTOOL_GPAUSEPARAM: + case ETHTOOL_GRXCSUM: + case ETHTOOL_GTXCSUM: + case ETHTOOL_GSG: + case ETHTOOL_GSTRINGS: + case ETHTOOL_GTSO: + case ETHTOOL_GPERMADDR: + case ETHTOOL_GUFO: + case ETHTOOL_GGSO: + case ETHTOOL_GGRO: + case ETHTOOL_GFLAGS: + case ETHTOOL_GPFLAGS: + case ETHTOOL_GRXFH: + case ETHTOOL_GRXRINGS: + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + case ETHTOOL_GRXCLSRLALL: + case ETHTOOL_GFEATURES: + break; + default: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + } + + if (dev->ethtool_ops->begin) { + rc = dev->ethtool_ops->begin(dev); + if (rc < 0) + return rc; + } + old_features = dev->features; + + switch (ethcmd) { + case ETHTOOL_GSET: + rc = ethtool_get_settings(dev, useraddr); + break; + case ETHTOOL_SSET: + rc = ethtool_set_settings(dev, useraddr); + break; + case ETHTOOL_GDRVINFO: + rc = ethtool_get_drvinfo(dev, useraddr); + break; + case ETHTOOL_GREGS: + rc = ethtool_get_regs(dev, useraddr); + break; + case ETHTOOL_GWOL: + rc = ethtool_get_wol(dev, useraddr); + break; + case ETHTOOL_SWOL: + rc = ethtool_set_wol(dev, useraddr); + break; + case ETHTOOL_GMSGLVL: + rc = ethtool_get_value(dev, useraddr, ethcmd, + dev->ethtool_ops->get_msglevel); + break; + case ETHTOOL_SMSGLVL: + rc = ethtool_set_value_void(dev, useraddr, + dev->ethtool_ops->set_msglevel); + break; + case ETHTOOL_NWAY_RST: + rc = ethtool_nway_reset(dev); + break; + case ETHTOOL_GLINK: + rc = ethtool_get_link(dev, useraddr); + break; + case ETHTOOL_GEEPROM: + rc = ethtool_get_eeprom(dev, useraddr); + break; + case ETHTOOL_SEEPROM: + rc = ethtool_set_eeprom(dev, useraddr); + break; + case ETHTOOL_GCOALESCE: + rc = ethtool_get_coalesce(dev, useraddr); + break; + case ETHTOOL_SCOALESCE: + rc = ethtool_set_coalesce(dev, useraddr); + break; + case ETHTOOL_GRINGPARAM: + rc = ethtool_get_ringparam(dev, useraddr); + break; + case ETHTOOL_SRINGPARAM: + rc = ethtool_set_ringparam(dev, useraddr); + break; + case ETHTOOL_GPAUSEPARAM: + rc = ethtool_get_pauseparam(dev, useraddr); + break; + case ETHTOOL_SPAUSEPARAM: + rc = ethtool_set_pauseparam(dev, useraddr); + break; + case ETHTOOL_TEST: + rc = ethtool_self_test(dev, useraddr); + break; + case ETHTOOL_GSTRINGS: + rc = ethtool_get_strings(dev, useraddr); + break; + case ETHTOOL_PHYS_ID: + rc = ethtool_phys_id(dev, useraddr); + break; + case ETHTOOL_GSTATS: + rc = ethtool_get_stats(dev, useraddr); + break; + case ETHTOOL_GPERMADDR: + rc = ethtool_get_perm_addr(dev, useraddr); + break; + case ETHTOOL_GFLAGS: + rc = ethtool_get_value(dev, useraddr, ethcmd, + (dev->ethtool_ops->get_flags ? 
+ dev->ethtool_ops->get_flags : + ethtool_op_get_flags)); + break; + case ETHTOOL_SFLAGS: + rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags); + break; + case ETHTOOL_GPFLAGS: + rc = ethtool_get_value(dev, useraddr, ethcmd, + dev->ethtool_ops->get_priv_flags); + break; + case ETHTOOL_SPFLAGS: + rc = ethtool_set_value(dev, useraddr, + dev->ethtool_ops->set_priv_flags); + break; + case ETHTOOL_GRXFH: + case ETHTOOL_GRXRINGS: + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + case ETHTOOL_GRXCLSRLALL: + rc = ethtool_get_rxnfc(dev, ethcmd, useraddr); + break; + case ETHTOOL_SRXFH: + case ETHTOOL_SRXCLSRLDEL: + case ETHTOOL_SRXCLSRLINS: + rc = ethtool_set_rxnfc(dev, ethcmd, useraddr); + break; + case ETHTOOL_FLASHDEV: + rc = ethtool_flash_device(dev, useraddr); + break; + case ETHTOOL_RESET: + rc = ethtool_reset(dev, useraddr); + break; + case ETHTOOL_SRXNTUPLE: + rc = ethtool_set_rx_ntuple(dev, useraddr); + break; + case ETHTOOL_GRXNTUPLE: + rc = ethtool_get_rx_ntuple(dev, useraddr); + break; + case ETHTOOL_GSSET_INFO: + rc = ethtool_get_sset_info(dev, useraddr); + break; + case ETHTOOL_GRXFHINDIR: + rc = ethtool_get_rxfh_indir(dev, useraddr); + break; + case ETHTOOL_SRXFHINDIR: + rc = ethtool_set_rxfh_indir(dev, useraddr); + break; + case ETHTOOL_GFEATURES: + rc = ethtool_get_features(dev, useraddr); + break; + case ETHTOOL_SFEATURES: + rc = ethtool_set_features(dev, useraddr); + break; + case ETHTOOL_GTXCSUM: + case ETHTOOL_GRXCSUM: + case ETHTOOL_GSG: + case ETHTOOL_GTSO: + case ETHTOOL_GUFO: + case ETHTOOL_GGSO: + case ETHTOOL_GGRO: + rc = ethtool_get_one_feature(dev, useraddr, ethcmd); + break; + case ETHTOOL_STXCSUM: + case ETHTOOL_SRXCSUM: + case ETHTOOL_SSG: + case ETHTOOL_STSO: + case ETHTOOL_SUFO: + case ETHTOOL_SGSO: + case ETHTOOL_SGRO: + rc = ethtool_set_one_feature(dev, useraddr, ethcmd); + break; + case ETHTOOL_GCHANNELS: + rc = ethtool_get_channels(dev, useraddr); + break; + case ETHTOOL_SCHANNELS: + rc = ethtool_set_channels(dev, useraddr); + break; + case ETHTOOL_SET_DUMP: + rc = ethtool_set_dump(dev, useraddr); + break; + case ETHTOOL_GET_DUMP_FLAG: + rc = ethtool_get_dump_flag(dev, useraddr); + break; + case ETHTOOL_GET_DUMP_DATA: + rc = ethtool_get_dump_data(dev, useraddr); + break; + default: + rc = -EOPNOTSUPP; + } + + if (dev->ethtool_ops->complete) + dev->ethtool_ops->complete(dev); + + if (old_features != dev->features) + netdev_features_change(dev); + + return rc; +} diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c new file mode 100644 index 00000000..f39ef5c6 --- /dev/null +++ b/net/core/fib_rules.c @@ -0,0 +1,766 @@ +/* + * net/core/fib_rules.c Generic Routing Rules + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2. 
+ * + * Authors: Thomas Graf + */ + +#include +#include +#include +#include +#include +#include +#include + +int fib_default_rule_add(struct fib_rules_ops *ops, + u32 pref, u32 table, u32 flags) +{ + struct fib_rule *r; + + r = kzalloc(ops->rule_size, GFP_KERNEL); + if (r == NULL) + return -ENOMEM; + + atomic_set(&r->refcnt, 1); + r->action = FR_ACT_TO_TBL; + r->pref = pref; + r->table = table; + r->flags = flags; + r->fr_net = hold_net(ops->fro_net); + + /* The lock is not required here, the list in unreacheable + * at the moment this function is called */ + list_add_tail(&r->list, &ops->rules_list); + return 0; +} +EXPORT_SYMBOL(fib_default_rule_add); + +u32 fib_default_rule_pref(struct fib_rules_ops *ops) +{ + struct list_head *pos; + struct fib_rule *rule; + + if (!list_empty(&ops->rules_list)) { + pos = ops->rules_list.next; + if (pos->next != &ops->rules_list) { + rule = list_entry(pos->next, struct fib_rule, list); + if (rule->pref) + return rule->pref - 1; + } + } + + return 0; +} +EXPORT_SYMBOL(fib_default_rule_pref); + +static void notify_rule_change(int event, struct fib_rule *rule, + struct fib_rules_ops *ops, struct nlmsghdr *nlh, + u32 pid); + +static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family) +{ + struct fib_rules_ops *ops; + + rcu_read_lock(); + list_for_each_entry_rcu(ops, &net->rules_ops, list) { + if (ops->family == family) { + if (!try_module_get(ops->owner)) + ops = NULL; + rcu_read_unlock(); + return ops; + } + } + rcu_read_unlock(); + + return NULL; +} + +static void rules_ops_put(struct fib_rules_ops *ops) +{ + if (ops) + module_put(ops->owner); +} + +static void flush_route_cache(struct fib_rules_ops *ops) +{ + if (ops->flush_cache) + ops->flush_cache(ops); +} + +static int __fib_rules_register(struct fib_rules_ops *ops) +{ + int err = -EEXIST; + struct fib_rules_ops *o; + struct net *net; + + net = ops->fro_net; + + if (ops->rule_size < sizeof(struct fib_rule)) + return -EINVAL; + + if (ops->match == NULL || ops->configure == NULL || + ops->compare == NULL || ops->fill == NULL || + ops->action == NULL) + return -EINVAL; + + spin_lock(&net->rules_mod_lock); + list_for_each_entry(o, &net->rules_ops, list) + if (ops->family == o->family) + goto errout; + + hold_net(net); + list_add_tail_rcu(&ops->list, &net->rules_ops); + err = 0; +errout: + spin_unlock(&net->rules_mod_lock); + + return err; +} + +struct fib_rules_ops * +fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net) +{ + struct fib_rules_ops *ops; + int err; + + ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL); + if (ops == NULL) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&ops->rules_list); + ops->fro_net = net; + + err = __fib_rules_register(ops); + if (err) { + kfree(ops); + ops = ERR_PTR(err); + } + + return ops; +} +EXPORT_SYMBOL_GPL(fib_rules_register); + +static void fib_rules_cleanup_ops(struct fib_rules_ops *ops) +{ + struct fib_rule *rule, *tmp; + + list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) { + list_del_rcu(&rule->list); + fib_rule_put(rule); + } +} + +static void fib_rules_put_rcu(struct rcu_head *head) +{ + struct fib_rules_ops *ops = container_of(head, struct fib_rules_ops, rcu); + struct net *net = ops->fro_net; + + release_net(net); + kfree(ops); +} + +void fib_rules_unregister(struct fib_rules_ops *ops) +{ + struct net *net = ops->fro_net; + + spin_lock(&net->rules_mod_lock); + list_del_rcu(&ops->list); + fib_rules_cleanup_ops(ops); + spin_unlock(&net->rules_mod_lock); + + call_rcu(&ops->rcu, fib_rules_put_rcu); +} 
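+
+/* Typical usage (sketch only; the ops template name is hypothetical): a
+ * protocol family registers its ops once per network namespace and seeds
+ * the list with default rules, e.g.
+ *
+ *	ops = fib_rules_register(&my_rules_ops_template, net);
+ *	if (!IS_ERR(ops))
+ *		err = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT, 0);
+ *
+ * and calls fib_rules_unregister(ops) again on namespace teardown.
+ */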
+EXPORT_SYMBOL_GPL(fib_rules_unregister); + +static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, + struct flowi *fl, int flags) +{ + int ret = 0; + + if (rule->iifindex && (rule->iifindex != fl->flowi_iif)) + goto out; + + if (rule->oifindex && (rule->oifindex != fl->flowi_oif)) + goto out; + + if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) + goto out; + + ret = ops->match(rule, fl, flags); +out: + return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; +} + +int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl, + int flags, struct fib_lookup_arg *arg) +{ + struct fib_rule *rule; + int err; + + rcu_read_lock(); + + list_for_each_entry_rcu(rule, &ops->rules_list, list) { +jumped: + if (!fib_rule_match(rule, ops, fl, flags)) + continue; + + if (rule->action == FR_ACT_GOTO) { + struct fib_rule *target; + + target = rcu_dereference(rule->ctarget); + if (target == NULL) { + continue; + } else { + rule = target; + goto jumped; + } + } else if (rule->action == FR_ACT_NOP) + continue; + else + err = ops->action(rule, fl, flags, arg); + + if (err != -EAGAIN) { + if ((arg->flags & FIB_LOOKUP_NOREF) || + likely(atomic_inc_not_zero(&rule->refcnt))) { + arg->rule = rule; + goto out; + } + break; + } + } + + err = -ESRCH; +out: + rcu_read_unlock(); + + return err; +} +EXPORT_SYMBOL_GPL(fib_rules_lookup); + +static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb, + struct fib_rules_ops *ops) +{ + int err = -EINVAL; + + if (frh->src_len) + if (tb[FRA_SRC] == NULL || + frh->src_len > (ops->addr_size * 8) || + nla_len(tb[FRA_SRC]) != ops->addr_size) + goto errout; + + if (frh->dst_len) + if (tb[FRA_DST] == NULL || + frh->dst_len > (ops->addr_size * 8) || + nla_len(tb[FRA_DST]) != ops->addr_size) + goto errout; + + err = 0; +errout: + return err; +} + +static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct fib_rule_hdr *frh = nlmsg_data(nlh); + struct fib_rules_ops *ops = NULL; + struct fib_rule *rule, *r, *last = NULL; + struct nlattr *tb[FRA_MAX+1]; + int err = -EINVAL, unresolved = 0; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) + goto errout; + + ops = lookup_rules_ops(net, frh->family); + if (ops == NULL) { + err = -EAFNOSUPPORT; + goto errout; + } + + err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy); + if (err < 0) + goto errout; + + err = validate_rulemsg(frh, tb, ops); + if (err < 0) + goto errout; + + rule = kzalloc(ops->rule_size, GFP_KERNEL); + if (rule == NULL) { + err = -ENOMEM; + goto errout; + } + rule->fr_net = hold_net(net); + + if (tb[FRA_PRIORITY]) + rule->pref = nla_get_u32(tb[FRA_PRIORITY]); + + if (tb[FRA_IIFNAME]) { + struct net_device *dev; + + rule->iifindex = -1; + nla_strlcpy(rule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); + dev = __dev_get_by_name(net, rule->iifname); + if (dev) + rule->iifindex = dev->ifindex; + } + + if (tb[FRA_OIFNAME]) { + struct net_device *dev; + + rule->oifindex = -1; + nla_strlcpy(rule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); + dev = __dev_get_by_name(net, rule->oifname); + if (dev) + rule->oifindex = dev->ifindex; + } + + if (tb[FRA_FWMARK]) { + rule->mark = nla_get_u32(tb[FRA_FWMARK]); + if (rule->mark) + /* compatibility: if the mark value is non-zero all bits + * are compared unless a mask is explicitly specified. 
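+			 * (e.g. FRA_FWMARK = 0x10 with no FRA_FWMASK matches
+			 * only packets whose mark is exactly 0x10).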
+ */ + rule->mark_mask = 0xFFFFFFFF; + } + + if (tb[FRA_FWMASK]) + rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); + + rule->action = frh->action; + rule->flags = frh->flags; + rule->table = frh_get_table(frh, tb); + + if (!tb[FRA_PRIORITY] && ops->default_pref) + rule->pref = ops->default_pref(ops); + + err = -EINVAL; + if (tb[FRA_GOTO]) { + if (rule->action != FR_ACT_GOTO) + goto errout_free; + + rule->target = nla_get_u32(tb[FRA_GOTO]); + /* Backward jumps are prohibited to avoid endless loops */ + if (rule->target <= rule->pref) + goto errout_free; + + list_for_each_entry(r, &ops->rules_list, list) { + if (r->pref == rule->target) { + RCU_INIT_POINTER(rule->ctarget, r); + break; + } + } + + if (rcu_dereference_protected(rule->ctarget, 1) == NULL) + unresolved = 1; + } else if (rule->action == FR_ACT_GOTO) + goto errout_free; + + err = ops->configure(rule, skb, frh, tb); + if (err < 0) + goto errout_free; + + list_for_each_entry(r, &ops->rules_list, list) { + if (r->pref > rule->pref) + break; + last = r; + } + + fib_rule_get(rule); + + if (last) + list_add_rcu(&rule->list, &last->list); + else + list_add_rcu(&rule->list, &ops->rules_list); + + if (ops->unresolved_rules) { + /* + * There are unresolved goto rules in the list, check if + * any of them are pointing to this new rule. + */ + list_for_each_entry(r, &ops->rules_list, list) { + if (r->action == FR_ACT_GOTO && + r->target == rule->pref && + rtnl_dereference(r->ctarget) == NULL) { + rcu_assign_pointer(r->ctarget, rule); + if (--ops->unresolved_rules == 0) + break; + } + } + } + + if (rule->action == FR_ACT_GOTO) + ops->nr_goto_rules++; + + if (unresolved) + ops->unresolved_rules++; + + notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid); + flush_route_cache(ops); + rules_ops_put(ops); + return 0; + +errout_free: + release_net(rule->fr_net); + kfree(rule); +errout: + rules_ops_put(ops); + return err; +} + +static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct fib_rule_hdr *frh = nlmsg_data(nlh); + struct fib_rules_ops *ops = NULL; + struct fib_rule *rule, *tmp; + struct nlattr *tb[FRA_MAX+1]; + int err = -EINVAL; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) + goto errout; + + ops = lookup_rules_ops(net, frh->family); + if (ops == NULL) { + err = -EAFNOSUPPORT; + goto errout; + } + + err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy); + if (err < 0) + goto errout; + + err = validate_rulemsg(frh, tb, ops); + if (err < 0) + goto errout; + + list_for_each_entry(rule, &ops->rules_list, list) { + if (frh->action && (frh->action != rule->action)) + continue; + + if (frh->table && (frh_get_table(frh, tb) != rule->table)) + continue; + + if (tb[FRA_PRIORITY] && + (rule->pref != nla_get_u32(tb[FRA_PRIORITY]))) + continue; + + if (tb[FRA_IIFNAME] && + nla_strcmp(tb[FRA_IIFNAME], rule->iifname)) + continue; + + if (tb[FRA_OIFNAME] && + nla_strcmp(tb[FRA_OIFNAME], rule->oifname)) + continue; + + if (tb[FRA_FWMARK] && + (rule->mark != nla_get_u32(tb[FRA_FWMARK]))) + continue; + + if (tb[FRA_FWMASK] && + (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK]))) + continue; + + if (!ops->compare(rule, frh, tb)) + continue; + + if (rule->flags & FIB_RULE_PERMANENT) { + err = -EPERM; + goto errout; + } + + list_del_rcu(&rule->list); + + if (rule->action == FR_ACT_GOTO) + ops->nr_goto_rules--; + + /* + * Check if this rule is a target to any of them. If so, + * disable them. 
As this operation is eventually very + * expensive, it is only performed if goto rules have + * actually been added. + */ + if (ops->nr_goto_rules > 0) { + list_for_each_entry(tmp, &ops->rules_list, list) { + if (rtnl_dereference(tmp->ctarget) == rule) { + rcu_assign_pointer(tmp->ctarget, NULL); + ops->unresolved_rules++; + } + } + } + + notify_rule_change(RTM_DELRULE, rule, ops, nlh, + NETLINK_CB(skb).pid); + fib_rule_put(rule); + flush_route_cache(ops); + rules_ops_put(ops); + return 0; + } + + err = -ENOENT; +errout: + rules_ops_put(ops); + return err; +} + +static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + struct fib_rule *rule) +{ + size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)) + + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */ + + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */ + + nla_total_size(4) /* FRA_PRIORITY */ + + nla_total_size(4) /* FRA_TABLE */ + + nla_total_size(4) /* FRA_FWMARK */ + + nla_total_size(4); /* FRA_FWMASK */ + + if (ops->nlmsg_payload) + payload += ops->nlmsg_payload(rule); + + return payload; +} + +static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, + u32 pid, u32 seq, int type, int flags, + struct fib_rules_ops *ops) +{ + struct nlmsghdr *nlh; + struct fib_rule_hdr *frh; + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*frh), flags); + if (nlh == NULL) + return -EMSGSIZE; + + frh = nlmsg_data(nlh); + frh->family = ops->family; + frh->table = rule->table; + NLA_PUT_U32(skb, FRA_TABLE, rule->table); + frh->res1 = 0; + frh->res2 = 0; + frh->action = rule->action; + frh->flags = rule->flags; + + if (rule->action == FR_ACT_GOTO && + rcu_dereference_raw(rule->ctarget) == NULL) + frh->flags |= FIB_RULE_UNRESOLVED; + + if (rule->iifname[0]) { + NLA_PUT_STRING(skb, FRA_IIFNAME, rule->iifname); + + if (rule->iifindex == -1) + frh->flags |= FIB_RULE_IIF_DETACHED; + } + + if (rule->oifname[0]) { + NLA_PUT_STRING(skb, FRA_OIFNAME, rule->oifname); + + if (rule->oifindex == -1) + frh->flags |= FIB_RULE_OIF_DETACHED; + } + + if (rule->pref) + NLA_PUT_U32(skb, FRA_PRIORITY, rule->pref); + + if (rule->mark) + NLA_PUT_U32(skb, FRA_FWMARK, rule->mark); + + if (rule->mark_mask || rule->mark) + NLA_PUT_U32(skb, FRA_FWMASK, rule->mark_mask); + + if (rule->target) + NLA_PUT_U32(skb, FRA_GOTO, rule->target); + + if (ops->fill(rule, skb, frh) < 0) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb, + struct fib_rules_ops *ops) +{ + int idx = 0; + struct fib_rule *rule; + + rcu_read_lock(); + list_for_each_entry_rcu(rule, &ops->rules_list, list) { + if (idx < cb->args[1]) + goto skip; + + if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWRULE, + NLM_F_MULTI, ops) < 0) + break; +skip: + idx++; + } + rcu_read_unlock(); + cb->args[1] = idx; + rules_ops_put(ops); + + return skb->len; +} + +static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct fib_rules_ops *ops; + int idx = 0, family; + + family = rtnl_msg_family(cb->nlh); + if (family != AF_UNSPEC) { + /* Protocol specific dump request */ + ops = lookup_rules_ops(net, family); + if (ops == NULL) + return -EAFNOSUPPORT; + + return dump_rules(skb, cb, ops); + } + + rcu_read_lock(); + list_for_each_entry_rcu(ops, &net->rules_ops, list) { + if (idx < cb->args[0] || !try_module_get(ops->owner)) + goto skip; + + if (dump_rules(skb, cb, ops) < 0) + 
break; + + cb->args[1] = 0; +skip: + idx++; + } + rcu_read_unlock(); + cb->args[0] = idx; + + return skb->len; +} + +static void notify_rule_change(int event, struct fib_rule *rule, + struct fib_rules_ops *ops, struct nlmsghdr *nlh, + u32 pid) +{ + struct net *net; + struct sk_buff *skb; + int err = -ENOBUFS; + + net = ops->fro_net; + skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL); + if (skb == NULL) + goto errout; + + err = fib_nl_fill_rule(skb, rule, pid, nlh->nlmsg_seq, event, 0, ops); + if (err < 0) { + /* -EMSGSIZE implies BUG in fib_rule_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, ops->nlgroup, err); +} + +static void attach_rules(struct list_head *rules, struct net_device *dev) +{ + struct fib_rule *rule; + + list_for_each_entry(rule, rules, list) { + if (rule->iifindex == -1 && + strcmp(dev->name, rule->iifname) == 0) + rule->iifindex = dev->ifindex; + if (rule->oifindex == -1 && + strcmp(dev->name, rule->oifname) == 0) + rule->oifindex = dev->ifindex; + } +} + +static void detach_rules(struct list_head *rules, struct net_device *dev) +{ + struct fib_rule *rule; + + list_for_each_entry(rule, rules, list) { + if (rule->iifindex == dev->ifindex) + rule->iifindex = -1; + if (rule->oifindex == dev->ifindex) + rule->oifindex = -1; + } +} + + +static int fib_rules_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + struct net *net = dev_net(dev); + struct fib_rules_ops *ops; + + ASSERT_RTNL(); + + switch (event) { + case NETDEV_REGISTER: + list_for_each_entry(ops, &net->rules_ops, list) + attach_rules(&ops->rules_list, dev); + break; + + case NETDEV_UNREGISTER: + list_for_each_entry(ops, &net->rules_ops, list) + detach_rules(&ops->rules_list, dev); + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block fib_rules_notifier = { + .notifier_call = fib_rules_event, +}; + +static int __net_init fib_rules_net_init(struct net *net) +{ + INIT_LIST_HEAD(&net->rules_ops); + spin_lock_init(&net->rules_mod_lock); + return 0; +} + +static struct pernet_operations fib_rules_net_ops = { + .init = fib_rules_net_init, +}; + +static int __init fib_rules_init(void) +{ + int err; + rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL); + rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL); + rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule); + + err = register_pernet_subsys(&fib_rules_net_ops); + if (err < 0) + goto fail; + + err = register_netdevice_notifier(&fib_rules_notifier); + if (err < 0) + goto fail_unregister; + + return 0; + +fail_unregister: + unregister_pernet_subsys(&fib_rules_net_ops); +fail: + rtnl_unregister(PF_UNSPEC, RTM_NEWRULE); + rtnl_unregister(PF_UNSPEC, RTM_DELRULE); + rtnl_unregister(PF_UNSPEC, RTM_GETRULE); + return err; +} + +subsys_initcall(fib_rules_init); diff --git a/net/core/filter.c b/net/core/filter.c new file mode 100644 index 00000000..36f975fa --- /dev/null +++ b/net/core/filter.c @@ -0,0 +1,654 @@ +/* + * Linux Socket Filter - Kernel level socket filtering + * + * Author: + * Jay Schulist + * + * Based on the design of: + * - The Berkeley Packet Filter + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Andi Kleen - Fix a few bad bugs and races. + * Kris Katterjohn - Added many additional checks in sk_chk_filter() + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* No hurry in this branch */ +static void *__load_pointer(const struct sk_buff *skb, int k, unsigned int size) +{ + u8 *ptr = NULL; + + if (k >= SKF_NET_OFF) + ptr = skb_network_header(skb) + k - SKF_NET_OFF; + else if (k >= SKF_LL_OFF) + ptr = skb_mac_header(skb) + k - SKF_LL_OFF; + + if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) + return ptr; + return NULL; +} + +static inline void *load_pointer(const struct sk_buff *skb, int k, + unsigned int size, void *buffer) +{ + if (k >= 0) + return skb_header_pointer(skb, k, size, buffer); + return __load_pointer(skb, k, size); +} + +/** + * sk_filter - run a packet through a socket filter + * @sk: sock associated with &sk_buff + * @skb: buffer to filter + * + * Run the filter code and then cut skb->data to correct size returned by + * sk_run_filter. If pkt_len is 0 we toss packet. If skb->len is smaller + * than pkt_len we keep whole skb->data. This is the socket level + * wrapper to sk_run_filter. It returns 0 if the packet should + * be accepted or -EPERM if the packet should be tossed. + * + */ +int sk_filter(struct sock *sk, struct sk_buff *skb) +{ + int err; + struct sk_filter *filter; + + err = security_sock_rcv_skb(sk, skb); + if (err) + return err; + + rcu_read_lock(); + filter = rcu_dereference(sk->sk_filter); + if (filter) { + unsigned int pkt_len = SK_RUN_FILTER(filter, skb); + + err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM; + } + rcu_read_unlock(); + + return err; +} +EXPORT_SYMBOL(sk_filter); + +/** + * sk_run_filter - run a filter on a socket + * @skb: buffer to run the filter on + * @fentry: filter to apply + * + * Decode and apply filter instructions to the skb->data. + * Return length to keep, 0 for none. @skb is the data we are + * filtering, @filter is the array of filter instructions. + * Because all jumps are guaranteed to be before last instruction, + * and last instruction guaranteed to be a RET, we dont need to check + * flen. (We used to pass to this function the length of filter) + */ +unsigned int sk_run_filter(const struct sk_buff *skb, + const struct sock_filter *fentry) +{ + void *ptr; + u32 A = 0; /* Accumulator */ + u32 X = 0; /* Index Register */ + u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ + u32 tmp; + int k; + + /* + * Process array of filter instructions. 
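+	 * Each instruction updates A, X or the scratch memory, adjusts
+	 * fentry to take a jump, or returns the number of packet bytes to
+	 * keep (a return of 0 makes sk_filter() drop the packet).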
+ */ + for (;; fentry++) { +#if defined(CONFIG_X86_32) +#define K (fentry->k) +#else + const u32 K = fentry->k; +#endif + + switch (fentry->code) { + case BPF_S_ALU_ADD_X: + A += X; + continue; + case BPF_S_ALU_ADD_K: + A += K; + continue; + case BPF_S_ALU_SUB_X: + A -= X; + continue; + case BPF_S_ALU_SUB_K: + A -= K; + continue; + case BPF_S_ALU_MUL_X: + A *= X; + continue; + case BPF_S_ALU_MUL_K: + A *= K; + continue; + case BPF_S_ALU_DIV_X: + if (X == 0) + return 0; + A /= X; + continue; + case BPF_S_ALU_DIV_K: + A = reciprocal_divide(A, K); + continue; + case BPF_S_ALU_AND_X: + A &= X; + continue; + case BPF_S_ALU_AND_K: + A &= K; + continue; + case BPF_S_ALU_OR_X: + A |= X; + continue; + case BPF_S_ALU_OR_K: + A |= K; + continue; + case BPF_S_ALU_LSH_X: + A <<= X; + continue; + case BPF_S_ALU_LSH_K: + A <<= K; + continue; + case BPF_S_ALU_RSH_X: + A >>= X; + continue; + case BPF_S_ALU_RSH_K: + A >>= K; + continue; + case BPF_S_ALU_NEG: + A = -A; + continue; + case BPF_S_JMP_JA: + fentry += K; + continue; + case BPF_S_JMP_JGT_K: + fentry += (A > K) ? fentry->jt : fentry->jf; + continue; + case BPF_S_JMP_JGE_K: + fentry += (A >= K) ? fentry->jt : fentry->jf; + continue; + case BPF_S_JMP_JEQ_K: + fentry += (A == K) ? fentry->jt : fentry->jf; + continue; + case BPF_S_JMP_JSET_K: + fentry += (A & K) ? fentry->jt : fentry->jf; + continue; + case BPF_S_JMP_JGT_X: + fentry += (A > X) ? fentry->jt : fentry->jf; + continue; + case BPF_S_JMP_JGE_X: + fentry += (A >= X) ? fentry->jt : fentry->jf; + continue; + case BPF_S_JMP_JEQ_X: + fentry += (A == X) ? fentry->jt : fentry->jf; + continue; + case BPF_S_JMP_JSET_X: + fentry += (A & X) ? fentry->jt : fentry->jf; + continue; + case BPF_S_LD_W_ABS: + k = K; +load_w: + ptr = load_pointer(skb, k, 4, &tmp); + if (ptr != NULL) { + A = get_unaligned_be32(ptr); + continue; + } + return 0; + case BPF_S_LD_H_ABS: + k = K; +load_h: + ptr = load_pointer(skb, k, 2, &tmp); + if (ptr != NULL) { + A = get_unaligned_be16(ptr); + continue; + } + return 0; + case BPF_S_LD_B_ABS: + k = K; +load_b: + ptr = load_pointer(skb, k, 1, &tmp); + if (ptr != NULL) { + A = *(u8 *)ptr; + continue; + } + return 0; + case BPF_S_LD_W_LEN: + A = skb->len; + continue; + case BPF_S_LDX_W_LEN: + X = skb->len; + continue; + case BPF_S_LD_W_IND: + k = X + K; + goto load_w; + case BPF_S_LD_H_IND: + k = X + K; + goto load_h; + case BPF_S_LD_B_IND: + k = X + K; + goto load_b; + case BPF_S_LDX_B_MSH: + ptr = load_pointer(skb, K, 1, &tmp); + if (ptr != NULL) { + X = (*(u8 *)ptr & 0xf) << 2; + continue; + } + return 0; + case BPF_S_LD_IMM: + A = K; + continue; + case BPF_S_LDX_IMM: + X = K; + continue; + case BPF_S_LD_MEM: + A = mem[K]; + continue; + case BPF_S_LDX_MEM: + X = mem[K]; + continue; + case BPF_S_MISC_TAX: + X = A; + continue; + case BPF_S_MISC_TXA: + A = X; + continue; + case BPF_S_RET_K: + return K; + case BPF_S_RET_A: + return A; + case BPF_S_ST: + mem[K] = A; + continue; + case BPF_S_STX: + mem[K] = X; + continue; + case BPF_S_ANC_PROTOCOL: + A = ntohs(skb->protocol); + continue; + case BPF_S_ANC_PKTTYPE: + A = skb->pkt_type; + continue; + case BPF_S_ANC_IFINDEX: + if (!skb->dev) + return 0; + A = skb->dev->ifindex; + continue; + case BPF_S_ANC_MARK: + A = skb->mark; + continue; + case BPF_S_ANC_QUEUE: + A = skb->queue_mapping; + continue; + case BPF_S_ANC_HATYPE: + if (!skb->dev) + return 0; + A = skb->dev->type; + continue; + case BPF_S_ANC_RXHASH: + A = skb->rxhash; + continue; + case BPF_S_ANC_CPU: + A = raw_smp_processor_id(); + continue; + case BPF_S_ANC_NLATTR: { + 
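+			/* A holds the offset of the attribute stream inside the
+			 * packet and X the attribute type to find; the offset of
+			 * the matching attribute (or 0 if none) is left in A.
+			 */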
struct nlattr *nla; + + if (skb_is_nonlinear(skb)) + return 0; + if (A > skb->len - sizeof(struct nlattr)) + return 0; + + nla = nla_find((struct nlattr *)&skb->data[A], + skb->len - A, X); + if (nla) + A = (void *)nla - (void *)skb->data; + else + A = 0; + continue; + } + case BPF_S_ANC_NLATTR_NEST: { + struct nlattr *nla; + + if (skb_is_nonlinear(skb)) + return 0; + if (A > skb->len - sizeof(struct nlattr)) + return 0; + + nla = (struct nlattr *)&skb->data[A]; + if (nla->nla_len > A - skb->len) + return 0; + + nla = nla_find_nested(nla, X); + if (nla) + A = (void *)nla - (void *)skb->data; + else + A = 0; + continue; + } + default: + WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n", + fentry->code, fentry->jt, + fentry->jf, fentry->k); + return 0; + } + } + + return 0; +} +EXPORT_SYMBOL(sk_run_filter); + +/* + * Security : + * A BPF program is able to use 16 cells of memory to store intermediate + * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()) + * As we dont want to clear mem[] array for each packet going through + * sk_run_filter(), we check that filter loaded by user never try to read + * a cell if not previously written, and we check all branches to be sure + * a malicious user doesn't try to abuse us. + */ +static int check_load_and_stores(struct sock_filter *filter, int flen) +{ + u16 *masks, memvalid = 0; /* one bit per cell, 16 cells */ + int pc, ret = 0; + + BUILD_BUG_ON(BPF_MEMWORDS > 16); + masks = kmalloc(flen * sizeof(*masks), GFP_KERNEL); + if (!masks) + return -ENOMEM; + memset(masks, 0xff, flen * sizeof(*masks)); + + for (pc = 0; pc < flen; pc++) { + memvalid &= masks[pc]; + + switch (filter[pc].code) { + case BPF_S_ST: + case BPF_S_STX: + memvalid |= (1 << filter[pc].k); + break; + case BPF_S_LD_MEM: + case BPF_S_LDX_MEM: + if (!(memvalid & (1 << filter[pc].k))) { + ret = -EINVAL; + goto error; + } + break; + case BPF_S_JMP_JA: + /* a jump must set masks on target */ + masks[pc + 1 + filter[pc].k] &= memvalid; + memvalid = ~0; + break; + case BPF_S_JMP_JEQ_K: + case BPF_S_JMP_JEQ_X: + case BPF_S_JMP_JGE_K: + case BPF_S_JMP_JGE_X: + case BPF_S_JMP_JGT_K: + case BPF_S_JMP_JGT_X: + case BPF_S_JMP_JSET_X: + case BPF_S_JMP_JSET_K: + /* a jump must set masks on targets */ + masks[pc + 1 + filter[pc].jt] &= memvalid; + masks[pc + 1 + filter[pc].jf] &= memvalid; + memvalid = ~0; + break; + } + } +error: + kfree(masks); + return ret; +} + +/** + * sk_chk_filter - verify socket filter code + * @filter: filter to verify + * @flen: length of filter + * + * Check the user's filter code. If we let some ugly + * filter code slip through kaboom! The filter must contain + * no references or jumps that are out of range, no illegal + * instructions, and must end with a RET instruction. + * + * All jumps are forward as they are not signed. + * + * Returns 0 if the rule set is legal or -EINVAL if not. + */ +int sk_chk_filter(struct sock_filter *filter, int flen) +{ + /* + * Valid instructions are initialized to non-0. + * Invalid instructions are initialized to 0. 
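+	 * The table is indexed by the user-visible BPF opcode and maps it to
+	 * the internal BPF_S_* value that sk_run_filter() executes.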
+ */ + static const u8 codes[] = { + [BPF_ALU|BPF_ADD|BPF_K] = BPF_S_ALU_ADD_K, + [BPF_ALU|BPF_ADD|BPF_X] = BPF_S_ALU_ADD_X, + [BPF_ALU|BPF_SUB|BPF_K] = BPF_S_ALU_SUB_K, + [BPF_ALU|BPF_SUB|BPF_X] = BPF_S_ALU_SUB_X, + [BPF_ALU|BPF_MUL|BPF_K] = BPF_S_ALU_MUL_K, + [BPF_ALU|BPF_MUL|BPF_X] = BPF_S_ALU_MUL_X, + [BPF_ALU|BPF_DIV|BPF_X] = BPF_S_ALU_DIV_X, + [BPF_ALU|BPF_AND|BPF_K] = BPF_S_ALU_AND_K, + [BPF_ALU|BPF_AND|BPF_X] = BPF_S_ALU_AND_X, + [BPF_ALU|BPF_OR|BPF_K] = BPF_S_ALU_OR_K, + [BPF_ALU|BPF_OR|BPF_X] = BPF_S_ALU_OR_X, + [BPF_ALU|BPF_LSH|BPF_K] = BPF_S_ALU_LSH_K, + [BPF_ALU|BPF_LSH|BPF_X] = BPF_S_ALU_LSH_X, + [BPF_ALU|BPF_RSH|BPF_K] = BPF_S_ALU_RSH_K, + [BPF_ALU|BPF_RSH|BPF_X] = BPF_S_ALU_RSH_X, + [BPF_ALU|BPF_NEG] = BPF_S_ALU_NEG, + [BPF_LD|BPF_W|BPF_ABS] = BPF_S_LD_W_ABS, + [BPF_LD|BPF_H|BPF_ABS] = BPF_S_LD_H_ABS, + [BPF_LD|BPF_B|BPF_ABS] = BPF_S_LD_B_ABS, + [BPF_LD|BPF_W|BPF_LEN] = BPF_S_LD_W_LEN, + [BPF_LD|BPF_W|BPF_IND] = BPF_S_LD_W_IND, + [BPF_LD|BPF_H|BPF_IND] = BPF_S_LD_H_IND, + [BPF_LD|BPF_B|BPF_IND] = BPF_S_LD_B_IND, + [BPF_LD|BPF_IMM] = BPF_S_LD_IMM, + [BPF_LDX|BPF_W|BPF_LEN] = BPF_S_LDX_W_LEN, + [BPF_LDX|BPF_B|BPF_MSH] = BPF_S_LDX_B_MSH, + [BPF_LDX|BPF_IMM] = BPF_S_LDX_IMM, + [BPF_MISC|BPF_TAX] = BPF_S_MISC_TAX, + [BPF_MISC|BPF_TXA] = BPF_S_MISC_TXA, + [BPF_RET|BPF_K] = BPF_S_RET_K, + [BPF_RET|BPF_A] = BPF_S_RET_A, + [BPF_ALU|BPF_DIV|BPF_K] = BPF_S_ALU_DIV_K, + [BPF_LD|BPF_MEM] = BPF_S_LD_MEM, + [BPF_LDX|BPF_MEM] = BPF_S_LDX_MEM, + [BPF_ST] = BPF_S_ST, + [BPF_STX] = BPF_S_STX, + [BPF_JMP|BPF_JA] = BPF_S_JMP_JA, + [BPF_JMP|BPF_JEQ|BPF_K] = BPF_S_JMP_JEQ_K, + [BPF_JMP|BPF_JEQ|BPF_X] = BPF_S_JMP_JEQ_X, + [BPF_JMP|BPF_JGE|BPF_K] = BPF_S_JMP_JGE_K, + [BPF_JMP|BPF_JGE|BPF_X] = BPF_S_JMP_JGE_X, + [BPF_JMP|BPF_JGT|BPF_K] = BPF_S_JMP_JGT_K, + [BPF_JMP|BPF_JGT|BPF_X] = BPF_S_JMP_JGT_X, + [BPF_JMP|BPF_JSET|BPF_K] = BPF_S_JMP_JSET_K, + [BPF_JMP|BPF_JSET|BPF_X] = BPF_S_JMP_JSET_X, + }; + int pc; + + if (flen == 0 || flen > BPF_MAXINSNS) + return -EINVAL; + + /* check the filter code now */ + for (pc = 0; pc < flen; pc++) { + struct sock_filter *ftest = &filter[pc]; + u16 code = ftest->code; + + if (code >= ARRAY_SIZE(codes)) + return -EINVAL; + code = codes[code]; + if (!code) + return -EINVAL; + /* Some instructions need special checks */ + switch (code) { + case BPF_S_ALU_DIV_K: + /* check for division by zero */ + if (ftest->k == 0) + return -EINVAL; + ftest->k = reciprocal_value(ftest->k); + break; + case BPF_S_LD_MEM: + case BPF_S_LDX_MEM: + case BPF_S_ST: + case BPF_S_STX: + /* check for invalid memory addresses */ + if (ftest->k >= BPF_MEMWORDS) + return -EINVAL; + break; + case BPF_S_JMP_JA: + /* + * Note, the large ftest->k might cause loops. + * Compare this with conditional jumps below, + * where offsets are limited. 
--ANK (981016) + */ + if (ftest->k >= (unsigned)(flen-pc-1)) + return -EINVAL; + break; + case BPF_S_JMP_JEQ_K: + case BPF_S_JMP_JEQ_X: + case BPF_S_JMP_JGE_K: + case BPF_S_JMP_JGE_X: + case BPF_S_JMP_JGT_K: + case BPF_S_JMP_JGT_X: + case BPF_S_JMP_JSET_X: + case BPF_S_JMP_JSET_K: + /* for conditionals both must be safe */ + if (pc + ftest->jt + 1 >= flen || + pc + ftest->jf + 1 >= flen) + return -EINVAL; + break; + case BPF_S_LD_W_ABS: + case BPF_S_LD_H_ABS: + case BPF_S_LD_B_ABS: +#define ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE: \ + code = BPF_S_ANC_##CODE; \ + break + switch (ftest->k) { + ANCILLARY(PROTOCOL); + ANCILLARY(PKTTYPE); + ANCILLARY(IFINDEX); + ANCILLARY(NLATTR); + ANCILLARY(NLATTR_NEST); + ANCILLARY(MARK); + ANCILLARY(QUEUE); + ANCILLARY(HATYPE); + ANCILLARY(RXHASH); + ANCILLARY(CPU); + } + } + ftest->code = code; + } + + /* last instruction must be a RET code */ + switch (filter[flen - 1].code) { + case BPF_S_RET_K: + case BPF_S_RET_A: + return check_load_and_stores(filter, flen); + } + return -EINVAL; +} +EXPORT_SYMBOL(sk_chk_filter); + +/** + * sk_filter_release_rcu - Release a socket filter by rcu_head + * @rcu: rcu_head that contains the sk_filter to free + */ +void sk_filter_release_rcu(struct rcu_head *rcu) +{ + struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); + + bpf_jit_free(fp); + kfree(fp); +} +EXPORT_SYMBOL(sk_filter_release_rcu); + +/** + * sk_attach_filter - attach a socket filter + * @fprog: the filter program + * @sk: the socket to use + * + * Attach the user's filter code. We first run some sanity checks on + * it to make sure it does not explode on us later. If an error + * occurs or there is insufficient memory for the filter a negative + * errno code is returned. On success the return is zero. + */ +int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) +{ + struct sk_filter *fp, *old_fp; + unsigned int fsize = sizeof(struct sock_filter) * fprog->len; + int err; + + /* Make sure new filter is there and in the right amounts. */ + if (fprog->filter == NULL) + return -EINVAL; + + fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); + if (!fp) + return -ENOMEM; + if (copy_from_user(fp->insns, fprog->filter, fsize)) { + sock_kfree_s(sk, fp, fsize+sizeof(*fp)); + return -EFAULT; + } + + atomic_set(&fp->refcnt, 1); + fp->len = fprog->len; + fp->bpf_func = sk_run_filter; + + err = sk_chk_filter(fp->insns, fp->len); + if (err) { + sk_filter_uncharge(sk, fp); + return err; + } + + bpf_jit_compile(fp); + + old_fp = rcu_dereference_protected(sk->sk_filter, + sock_owned_by_user(sk)); + rcu_assign_pointer(sk->sk_filter, fp); + + if (old_fp) + sk_filter_uncharge(sk, old_fp); + return 0; +} +EXPORT_SYMBOL_GPL(sk_attach_filter); + +int sk_detach_filter(struct sock *sk) +{ + int ret = -ENOENT; + struct sk_filter *filter; + + filter = rcu_dereference_protected(sk->sk_filter, + sock_owned_by_user(sk)); + if (filter) { + rcu_assign_pointer(sk->sk_filter, NULL); + sk_filter_uncharge(sk, filter); + ret = 0; + } + return ret; +} +EXPORT_SYMBOL_GPL(sk_detach_filter); diff --git a/net/core/flow.c b/net/core/flow.c new file mode 100644 index 00000000..a6bda2a5 --- /dev/null +++ b/net/core/flow.c @@ -0,0 +1,437 @@ +/* flow.c: Generic flow cache. + * + * Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru) + * Copyright (C) 2003 David S. 
Miller (davem@redhat.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct flow_cache_entry { + union { + struct hlist_node hlist; + struct list_head gc_list; + } u; + u16 family; + u8 dir; + u32 genid; + struct flowi key; + struct flow_cache_object *object; +}; + +struct flow_cache_percpu { + struct hlist_head *hash_table; + int hash_count; + u32 hash_rnd; + int hash_rnd_recalc; + struct tasklet_struct flush_tasklet; +}; + +struct flow_flush_info { + struct flow_cache *cache; + atomic_t cpuleft; + struct completion completion; +}; + +struct flow_cache { + u32 hash_shift; + struct flow_cache_percpu __percpu *percpu; + struct notifier_block hotcpu_notifier; + int low_watermark; + int high_watermark; + struct timer_list rnd_timer; +}; + +atomic_t flow_cache_genid = ATOMIC_INIT(0); +EXPORT_SYMBOL(flow_cache_genid); +static struct flow_cache flow_cache_global; +static struct kmem_cache *flow_cachep __read_mostly; + +static DEFINE_SPINLOCK(flow_cache_gc_lock); +static LIST_HEAD(flow_cache_gc_list); + +#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift) +#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) + +static void flow_cache_new_hashrnd(unsigned long arg) +{ + struct flow_cache *fc = (void *) arg; + int i; + + for_each_possible_cpu(i) + per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1; + + fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; + add_timer(&fc->rnd_timer); +} + +static int flow_entry_valid(struct flow_cache_entry *fle) +{ + if (atomic_read(&flow_cache_genid) != fle->genid) + return 0; + if (fle->object && !fle->object->ops->check(fle->object)) + return 0; + return 1; +} + +static void flow_entry_kill(struct flow_cache_entry *fle) +{ + if (fle->object) + fle->object->ops->delete(fle->object); + kmem_cache_free(flow_cachep, fle); +} + +static void flow_cache_gc_task(struct work_struct *work) +{ + struct list_head gc_list; + struct flow_cache_entry *fce, *n; + + INIT_LIST_HEAD(&gc_list); + spin_lock_bh(&flow_cache_gc_lock); + list_splice_tail_init(&flow_cache_gc_list, &gc_list); + spin_unlock_bh(&flow_cache_gc_lock); + + list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) + flow_entry_kill(fce); +} +static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task); + +static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp, + int deleted, struct list_head *gc_list) +{ + if (deleted) { + fcp->hash_count -= deleted; + spin_lock_bh(&flow_cache_gc_lock); + list_splice_tail(gc_list, &flow_cache_gc_list); + spin_unlock_bh(&flow_cache_gc_lock); + schedule_work(&flow_cache_gc_work); + } +} + +static void __flow_cache_shrink(struct flow_cache *fc, + struct flow_cache_percpu *fcp, + int shrink_to) +{ + struct flow_cache_entry *fle; + struct hlist_node *entry, *tmp; + LIST_HEAD(gc_list); + int i, deleted = 0; + + for (i = 0; i < flow_cache_hash_size(fc); i++) { + int saved = 0; + + hlist_for_each_entry_safe(fle, entry, tmp, + &fcp->hash_table[i], u.hlist) { + if (saved < shrink_to && + flow_entry_valid(fle)) { + saved++; + } else { + deleted++; + hlist_del(&fle->u.hlist); + list_add_tail(&fle->u.gc_list, &gc_list); + } + } + } + + flow_cache_queue_garbage(fcp, deleted, &gc_list); +} + +static void flow_cache_shrink(struct flow_cache *fc, + struct flow_cache_percpu *fcp) +{ + int shrink_to = fc->low_watermark / flow_cache_hash_size(fc); + + __flow_cache_shrink(fc, fcp, shrink_to); +} + +static void 
flow_new_hash_rnd(struct flow_cache *fc, + struct flow_cache_percpu *fcp) +{ + get_random_bytes(&fcp->hash_rnd, sizeof(u32)); + fcp->hash_rnd_recalc = 0; + __flow_cache_shrink(fc, fcp, 0); +} + +static u32 flow_hash_code(struct flow_cache *fc, + struct flow_cache_percpu *fcp, + const struct flowi *key, + size_t keysize) +{ + const u32 *k = (const u32 *) key; + const u32 length = keysize * sizeof(flow_compare_t) / sizeof(u32); + + return jhash2(k, length, fcp->hash_rnd) + & (flow_cache_hash_size(fc) - 1); +} + +/* I hear what you're saying, use memcmp. But memcmp cannot make + * important assumptions that we can here, such as alignment. + */ +static int flow_key_compare(const struct flowi *key1, const struct flowi *key2, + size_t keysize) +{ + const flow_compare_t *k1, *k1_lim, *k2; + + k1 = (const flow_compare_t *) key1; + k1_lim = k1 + keysize; + + k2 = (const flow_compare_t *) key2; + + do { + if (*k1++ != *k2++) + return 1; + } while (k1 < k1_lim); + + return 0; +} + +struct flow_cache_object * +flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir, + flow_resolve_t resolver, void *ctx) +{ + struct flow_cache *fc = &flow_cache_global; + struct flow_cache_percpu *fcp; + struct flow_cache_entry *fle, *tfle; + struct hlist_node *entry; + struct flow_cache_object *flo; + size_t keysize; + unsigned int hash; + + local_bh_disable(); + fcp = this_cpu_ptr(fc->percpu); + + fle = NULL; + flo = NULL; + + keysize = flow_key_size(family); + if (!keysize) + goto nocache; + + /* Packet really early in init? Making flow_cache_init a + * pre-smp initcall would solve this. --RR */ + if (!fcp->hash_table) + goto nocache; + + if (fcp->hash_rnd_recalc) + flow_new_hash_rnd(fc, fcp); + + hash = flow_hash_code(fc, fcp, key, keysize); + hlist_for_each_entry(tfle, entry, &fcp->hash_table[hash], u.hlist) { + if (tfle->family == family && + tfle->dir == dir && + flow_key_compare(key, &tfle->key, keysize) == 0) { + fle = tfle; + break; + } + } + + if (unlikely(!fle)) { + if (fcp->hash_count > fc->high_watermark) + flow_cache_shrink(fc, fcp); + + fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC); + if (fle) { + fle->family = family; + fle->dir = dir; + memcpy(&fle->key, key, keysize * sizeof(flow_compare_t)); + fle->object = NULL; + hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]); + fcp->hash_count++; + } + } else if (likely(fle->genid == atomic_read(&flow_cache_genid))) { + flo = fle->object; + if (!flo) + goto ret_object; + flo = flo->ops->get(flo); + if (flo) + goto ret_object; + } else if (fle->object) { + flo = fle->object; + flo->ops->delete(flo); + fle->object = NULL; + } + +nocache: + flo = NULL; + if (fle) { + flo = fle->object; + fle->object = NULL; + } + flo = resolver(net, key, family, dir, flo, ctx); + if (fle) { + fle->genid = atomic_read(&flow_cache_genid); + if (!IS_ERR(flo)) + fle->object = flo; + else + fle->genid--; + } else { + if (flo && !IS_ERR(flo)) + flo->ops->delete(flo); + } +ret_object: + local_bh_enable(); + return flo; +} +EXPORT_SYMBOL(flow_cache_lookup); + +static void flow_cache_flush_tasklet(unsigned long data) +{ + struct flow_flush_info *info = (void *)data; + struct flow_cache *fc = info->cache; + struct flow_cache_percpu *fcp; + struct flow_cache_entry *fle; + struct hlist_node *entry, *tmp; + LIST_HEAD(gc_list); + int i, deleted = 0; + + fcp = this_cpu_ptr(fc->percpu); + for (i = 0; i < flow_cache_hash_size(fc); i++) { + hlist_for_each_entry_safe(fle, entry, tmp, + &fcp->hash_table[i], u.hlist) { + if (flow_entry_valid(fle)) + continue; + + 
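+			/* Stale entry: its genid is outdated or its object
+			 * failed ->check(); unlink it and queue it for the
+			 * deferred garbage collector.
+			 */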
deleted++; + hlist_del(&fle->u.hlist); + list_add_tail(&fle->u.gc_list, &gc_list); + } + } + + flow_cache_queue_garbage(fcp, deleted, &gc_list); + + if (atomic_dec_and_test(&info->cpuleft)) + complete(&info->completion); +} + +static void flow_cache_flush_per_cpu(void *data) +{ + struct flow_flush_info *info = data; + int cpu; + struct tasklet_struct *tasklet; + + cpu = smp_processor_id(); + tasklet = &per_cpu_ptr(info->cache->percpu, cpu)->flush_tasklet; + tasklet->data = (unsigned long)info; + tasklet_schedule(tasklet); +} + +void flow_cache_flush(void) +{ + struct flow_flush_info info; + static DEFINE_MUTEX(flow_flush_sem); + + /* Don't want cpus going down or up during this. */ + get_online_cpus(); + mutex_lock(&flow_flush_sem); + info.cache = &flow_cache_global; + atomic_set(&info.cpuleft, num_online_cpus()); + init_completion(&info.completion); + + local_bh_disable(); + smp_call_function(flow_cache_flush_per_cpu, &info, 0); + flow_cache_flush_tasklet((unsigned long)&info); + local_bh_enable(); + + wait_for_completion(&info.completion); + mutex_unlock(&flow_flush_sem); + put_online_cpus(); +} + +static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu) +{ + struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); + size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc); + + if (!fcp->hash_table) { + fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu)); + if (!fcp->hash_table) { + pr_err("NET: failed to allocate flow cache sz %zu\n", sz); + return -ENOMEM; + } + fcp->hash_rnd_recalc = 1; + fcp->hash_count = 0; + tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0); + } + return 0; +} + +static int __cpuinit flow_cache_cpu(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier); + int res, cpu = (unsigned long) hcpu; + struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + res = flow_cache_cpu_prepare(fc, cpu); + if (res) + return notifier_from_errno(res); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + __flow_cache_shrink(fc, fcp, 0); + break; + } + return NOTIFY_OK; +} + +static int __init flow_cache_init(struct flow_cache *fc) +{ + int i; + + fc->hash_shift = 10; + fc->low_watermark = 2 * flow_cache_hash_size(fc); + fc->high_watermark = 4 * flow_cache_hash_size(fc); + + fc->percpu = alloc_percpu(struct flow_cache_percpu); + if (!fc->percpu) + return -ENOMEM; + + for_each_online_cpu(i) { + if (flow_cache_cpu_prepare(fc, i)) + return -ENOMEM; + } + fc->hotcpu_notifier = (struct notifier_block){ + .notifier_call = flow_cache_cpu, + }; + register_hotcpu_notifier(&fc->hotcpu_notifier); + + setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, + (unsigned long) fc); + fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; + add_timer(&fc->rnd_timer); + + return 0; +} + +static int __init flow_cache_init_global(void) +{ + flow_cachep = kmem_cache_create("flow_cache", + sizeof(struct flow_cache_entry), + 0, SLAB_PANIC, NULL); + + return flow_cache_init(&flow_cache_global); +} + +module_init(flow_cache_init_global); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c new file mode 100644 index 00000000..43b03dd7 --- /dev/null +++ b/net/core/gen_estimator.c @@ -0,0 +1,322 @@ +/* + * net/sched/gen_estimator.c Simple rate estimator. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * + * Changes: + * Jamal Hadi Salim - moved it to net/core and reshulfed + * names to make it usable in general net subsystem. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + This code is NOT intended to be used for statistics collection, + its purpose is to provide a base for statistical multiplexing + for controlled load service. + If you need only statistics, run a user level daemon which + periodically reads byte counters. + + Unfortunately, rate estimation is not a very easy task. + F.e. I did not find a simple way to estimate the current peak rate + and even failed to formulate the problem 8)8) + + So I preferred not to built an estimator into the scheduler, + but run this task separately. + Ideally, it should be kernel thread(s), but for now it runs + from timers, which puts apparent top bounds on the number of rated + flows, has minimal overhead on small, but is enough + to handle controlled load service, sets of aggregates. + + We measure rate over A=(1<stats_lock); + read_lock(&est_lock); + if (e->bstats == NULL) + goto skip; + + nbytes = e->bstats->bytes; + npackets = e->bstats->packets; + brate = (nbytes - e->last_bytes)<<(7 - idx); + e->last_bytes = nbytes; + e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log); + e->rate_est->bps = (e->avbps+0xF)>>5; + + rate = (npackets - e->last_packets)<<(12 - idx); + e->last_packets = npackets; + e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log); + e->rate_est->pps = (e->avpps+0x1FF)>>10; +skip: + read_unlock(&est_lock); + spin_unlock(e->stats_lock); + } + + if (!list_empty(&elist[idx].list)) + mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx)); + rcu_read_unlock(); +} + +static void gen_add_node(struct gen_estimator *est) +{ + struct rb_node **p = &est_root.rb_node, *parent = NULL; + + while (*p) { + struct gen_estimator *e; + + parent = *p; + e = rb_entry(parent, struct gen_estimator, node); + + if (est->bstats > e->bstats) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&est->node, parent, p); + rb_insert_color(&est->node, &est_root); +} + +static +struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats, + const struct gnet_stats_rate_est *rate_est) +{ + struct rb_node *p = est_root.rb_node; + + while (p) { + struct gen_estimator *e; + + e = rb_entry(p, struct gen_estimator, node); + + if (bstats > e->bstats) + p = p->rb_right; + else if (bstats < e->bstats || rate_est != e->rate_est) + p = p->rb_left; + else + return e; + } + return NULL; +} + +/** + * gen_new_estimator - create a new rate estimator + * @bstats: basic statistics + * @rate_est: rate estimator statistics + * @stats_lock: statistics lock + * @opt: rate estimator configuration TLV + * + * Creates a new rate estimator with &bstats as source and &rate_est + * as destination. A new timer with the interval specified in the + * configuration TLV is created. 
Upon each interval, the latest statistics + * will be read from &bstats and the estimated rate will be stored in + * &rate_est with the statistics lock grabed during this period. + * + * Returns 0 on success or a negative error code. + * + */ +int gen_new_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_rate_est *rate_est, + spinlock_t *stats_lock, + struct nlattr *opt) +{ + struct gen_estimator *est; + struct gnet_estimator *parm = nla_data(opt); + int idx; + + if (nla_len(opt) < sizeof(*parm)) + return -EINVAL; + + if (parm->interval < -2 || parm->interval > 3) + return -EINVAL; + + est = kzalloc(sizeof(*est), GFP_KERNEL); + if (est == NULL) + return -ENOBUFS; + + idx = parm->interval + 2; + est->bstats = bstats; + est->rate_est = rate_est; + est->stats_lock = stats_lock; + est->ewma_log = parm->ewma_log; + est->last_bytes = bstats->bytes; + est->avbps = rate_est->bps<<5; + est->last_packets = bstats->packets; + est->avpps = rate_est->pps<<10; + + spin_lock_bh(&est_tree_lock); + if (!elist[idx].timer.function) { + INIT_LIST_HEAD(&elist[idx].list); + setup_timer(&elist[idx].timer, est_timer, idx); + } + + if (list_empty(&elist[idx].list)) + mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx)); + + list_add_rcu(&est->list, &elist[idx].list); + gen_add_node(est); + spin_unlock_bh(&est_tree_lock); + + return 0; +} +EXPORT_SYMBOL(gen_new_estimator); + +/** + * gen_kill_estimator - remove a rate estimator + * @bstats: basic statistics + * @rate_est: rate estimator statistics + * + * Removes the rate estimator specified by &bstats and &rate_est. + * + * Note : Caller should respect an RCU grace period before freeing stats_lock + */ +void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_rate_est *rate_est) +{ + struct gen_estimator *e; + + spin_lock_bh(&est_tree_lock); + while ((e = gen_find_node(bstats, rate_est))) { + rb_erase(&e->node, &est_root); + + write_lock(&est_lock); + e->bstats = NULL; + write_unlock(&est_lock); + + list_del_rcu(&e->list); + kfree_rcu(e, e_rcu); + } + spin_unlock_bh(&est_tree_lock); +} +EXPORT_SYMBOL(gen_kill_estimator); + +/** + * gen_replace_estimator - replace rate estimator configuration + * @bstats: basic statistics + * @rate_est: rate estimator statistics + * @stats_lock: statistics lock + * @opt: rate estimator configuration TLV + * + * Replaces the configuration of a rate estimator by calling + * gen_kill_estimator() and gen_new_estimator(). + * + * Returns 0 on success or a negative error code. + */ +int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_rate_est *rate_est, + spinlock_t *stats_lock, struct nlattr *opt) +{ + gen_kill_estimator(bstats, rate_est); + return gen_new_estimator(bstats, rate_est, stats_lock, opt); +} +EXPORT_SYMBOL(gen_replace_estimator); + +/** + * gen_estimator_active - test if estimator is currently in use + * @bstats: basic statistics + * @rate_est: rate estimator statistics + * + * Returns true if estimator is active, and false if not. 
+ */ +bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats, + const struct gnet_stats_rate_est *rate_est) +{ + bool res; + + ASSERT_RTNL(); + + spin_lock_bh(&est_tree_lock); + res = gen_find_node(bstats, rate_est) != NULL; + spin_unlock_bh(&est_tree_lock); + + return res; +} +EXPORT_SYMBOL(gen_estimator_active); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c new file mode 100644 index 00000000..0452eb27 --- /dev/null +++ b/net/core/gen_stats.c @@ -0,0 +1,250 @@ +/* + * net/core/gen_stats.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf + * Jamal Hadi Salim + * Alexey Kuznetsov, + * + * See Documentation/networking/gen_stats.txt + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +static inline int +gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size) +{ + NLA_PUT(d->skb, type, size, buf); + return 0; + +nla_put_failure: + spin_unlock_bh(d->lock); + return -1; +} + +/** + * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode + * @skb: socket buffer to put statistics TLVs into + * @type: TLV type for top level statistic TLV + * @tc_stats_type: TLV type for backward compatibility struct tc_stats TLV + * @xstats_type: TLV type for backward compatibility xstats TLV + * @lock: statistics lock + * @d: dumping handle + * + * Initializes the dumping handle, grabs the statistic lock and appends + * an empty TLV header to the socket buffer for use a container for all + * other statistic TLVS. + * + * The dumping handle is marked to be in backward compatibility mode telling + * all gnet_stats_copy_XXX() functions to fill a local copy of struct tc_stats. + * + * Returns 0 on success or -1 if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, + int xstats_type, spinlock_t *lock, struct gnet_dump *d) + __acquires(lock) +{ + memset(d, 0, sizeof(*d)); + + spin_lock_bh(lock); + d->lock = lock; + if (type) + d->tail = (struct nlattr *)skb_tail_pointer(skb); + d->skb = skb; + d->compat_tc_stats = tc_stats_type; + d->compat_xstats = xstats_type; + + if (d->tail) + return gnet_stats_copy(d, type, NULL, 0); + + return 0; +} +EXPORT_SYMBOL(gnet_stats_start_copy_compat); + +/** + * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode + * @skb: socket buffer to put statistics TLVs into + * @type: TLV type for top level statistic TLV + * @lock: statistics lock + * @d: dumping handle + * + * Initializes the dumping handle, grabs the statistic lock and appends + * an empty TLV header to the socket buffer for use a container for all + * other statistic TLVS. + * + * Returns 0 on success or -1 if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, + struct gnet_dump *d) +{ + return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d); +} +EXPORT_SYMBOL(gnet_stats_start_copy); + +/** + * gnet_stats_copy_basic - copy basic statistics into statistic TLV + * @d: dumping handle + * @b: basic statistics + * + * Appends the basic statistics to the top level TLV created by + * gnet_stats_start_copy(). 
+ * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b) +{ + if (d->compat_tc_stats) { + d->tc_stats.bytes = b->bytes; + d->tc_stats.packets = b->packets; + } + + if (d->tail) { + struct gnet_stats_basic sb; + + memset(&sb, 0, sizeof(sb)); + sb.bytes = b->bytes; + sb.packets = b->packets; + return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb)); + } + return 0; +} +EXPORT_SYMBOL(gnet_stats_copy_basic); + +/** + * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV + * @d: dumping handle + * @b: basic statistics + * @r: rate estimator statistics + * + * Appends the rate estimator statistics to the top level TLV created by + * gnet_stats_start_copy(). + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_rate_est(struct gnet_dump *d, + const struct gnet_stats_basic_packed *b, + struct gnet_stats_rate_est *r) +{ + if (b && !gen_estimator_active(b, r)) + return 0; + + if (d->compat_tc_stats) { + d->tc_stats.bps = r->bps; + d->tc_stats.pps = r->pps; + } + + if (d->tail) + return gnet_stats_copy(d, TCA_STATS_RATE_EST, r, sizeof(*r)); + + return 0; +} +EXPORT_SYMBOL(gnet_stats_copy_rate_est); + +/** + * gnet_stats_copy_queue - copy queue statistics into statistics TLV + * @d: dumping handle + * @q: queue statistics + * + * Appends the queue statistics to the top level TLV created by + * gnet_stats_start_copy(). + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q) +{ + if (d->compat_tc_stats) { + d->tc_stats.drops = q->drops; + d->tc_stats.qlen = q->qlen; + d->tc_stats.backlog = q->backlog; + d->tc_stats.overlimits = q->overlimits; + } + + if (d->tail) + return gnet_stats_copy(d, TCA_STATS_QUEUE, q, sizeof(*q)); + + return 0; +} +EXPORT_SYMBOL(gnet_stats_copy_queue); + +/** + * gnet_stats_copy_app - copy application specific statistics into statistics TLV + * @d: dumping handle + * @st: application specific statistics data + * @len: length of data + * + * Appends the application sepecific statistics to the top level TLV created by + * gnet_stats_start_copy() and remembers the data for XSTATS if the dumping + * handle is in backward compatibility mode. + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_app(struct gnet_dump *d, void *st, int len) +{ + if (d->compat_xstats) { + d->xstats = st; + d->xstats_len = len; + } + + if (d->tail) + return gnet_stats_copy(d, TCA_STATS_APP, st, len); + + return 0; +} +EXPORT_SYMBOL(gnet_stats_copy_app); + +/** + * gnet_stats_finish_copy - finish dumping procedure + * @d: dumping handle + * + * Corrects the length of the top level TLV to include all TLVs added + * by gnet_stats_copy_XXX() calls. Adds the backward compatibility TLVs + * if gnet_stats_start_copy_compat() was used and releases the statistics + * lock. + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. 
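The copy helpers are intended to be driven in a fixed sequence between gnet_stats_start_copy*() and gnet_stats_finish_copy(); a sketch of a dump routine, reusing the hypothetical my_counters owner from the earlier sketch (TCA_STATS2/TCA_STATS/TCA_XSTATS come from <linux/rtnetlink.h> and are the attribute types the scheduler core uses):

static int my_dump_stats(struct sk_buff *skb, struct my_counters *c)
{
	struct gnet_dump d;

	/* grabs c->lock and opens the TCA_STATS2 container */
	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
					 TCA_XSTATS, &c->lock, &d) < 0)
		return -1;

	/* each helper returns -1 with c->lock already released on error */
	if (gnet_stats_copy_basic(&d, &c->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &c->bstats, &c->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &c->qstats) < 0)
		return -1;

	/* fixes up the container length and drops c->lock */
	return gnet_stats_finish_copy(&d);
}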
+ */ +int +gnet_stats_finish_copy(struct gnet_dump *d) +{ + if (d->tail) + d->tail->nla_len = skb_tail_pointer(d->skb) - (u8 *)d->tail; + + if (d->compat_tc_stats) + if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats, + sizeof(d->tc_stats)) < 0) + return -1; + + if (d->compat_xstats && d->xstats) { + if (gnet_stats_copy(d, d->compat_xstats, d->xstats, + d->xstats_len) < 0) + return -1; + } + + spin_unlock_bh(d->lock); + return 0; +} +EXPORT_SYMBOL(gnet_stats_finish_copy); diff --git a/net/core/iovec.c b/net/core/iovec.c new file mode 100644 index 00000000..c40f27e7 --- /dev/null +++ b/net/core/iovec.c @@ -0,0 +1,264 @@ +/* + * iovec manipulation routines. + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Andrew Lunn : Errors in iovec copying. + * Pedro Roque : Added memcpy_fromiovecend and + * csum_..._fromiovecend. + * Andi Kleen : fixed error handling for 2.1 + * Alexey Kuznetsov: 2.1 optimisations + * Andi Kleen : Fix csum*fromiovecend for IPv6. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Verify iovec. The caller must ensure that the iovec is big enough + * to hold the message iovec. + * + * Save time not doing access_ok. copy_*_user will make this work + * in any case. + */ + +int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode) +{ + int size, ct, err; + + if (m->msg_namelen) { + if (mode == VERIFY_READ) { + void __user *namep; + namep = (void __user __force *) m->msg_name; + err = move_addr_to_kernel(namep, m->msg_namelen, + address); + if (err < 0) + return err; + } + m->msg_name = address; + } else { + m->msg_name = NULL; + } + + size = m->msg_iovlen * sizeof(struct iovec); + if (copy_from_user(iov, (void __user __force *) m->msg_iov, size)) + return -EFAULT; + + m->msg_iov = iov; + err = 0; + + for (ct = 0; ct < m->msg_iovlen; ct++) { + size_t len = iov[ct].iov_len; + + if (len > INT_MAX - err) { + len = INT_MAX - err; + iov[ct].iov_len = len; + } + err += len; + } + + return err; +} + +/* + * Copy kernel to iovec. Returns -EFAULT on error. + * + * Note: this modifies the original iovec. + */ + +int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) +{ + while (len > 0) { + if (iov->iov_len) { + int copy = min_t(unsigned int, iov->iov_len, len); + if (copy_to_user(iov->iov_base, kdata, copy)) + return -EFAULT; + kdata += copy; + len -= copy; + iov->iov_len -= copy; + iov->iov_base += copy; + } + iov++; + } + + return 0; +} +EXPORT_SYMBOL(memcpy_toiovec); + +/* + * Copy kernel to iovec. Returns -EFAULT on error. + */ + +int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, + int offset, int len) +{ + int copy; + for (; len > 0; ++iov) { + /* Skip over the finished iovecs */ + if (unlikely(offset >= iov->iov_len)) { + offset -= iov->iov_len; + continue; + } + copy = min_t(unsigned int, iov->iov_len - offset, len); + if (copy_to_user(iov->iov_base + offset, kdata, copy)) + return -EFAULT; + offset = 0; + kdata += copy; + len -= copy; + } + + return 0; +} +EXPORT_SYMBOL(memcpy_toiovecend); + +/* + * Copy iovec to kernel. Returns -EFAULT on error. + * + * Note: this modifies the original iovec. 
+ */ + +int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) +{ + while (len > 0) { + if (iov->iov_len) { + int copy = min_t(unsigned int, len, iov->iov_len); + if (copy_from_user(kdata, iov->iov_base, copy)) + return -EFAULT; + len -= copy; + kdata += copy; + iov->iov_base += copy; + iov->iov_len -= copy; + } + iov++; + } + + return 0; +} +EXPORT_SYMBOL(memcpy_fromiovec); + +/* + * Copy iovec from kernel. Returns -EFAULT on error. + */ + +int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, + int offset, int len) +{ + /* Skip over the finished iovecs */ + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + iov++; + } + + while (len > 0) { + u8 __user *base = iov->iov_base + offset; + int copy = min_t(unsigned int, len, iov->iov_len - offset); + + offset = 0; + if (copy_from_user(kdata, base, copy)) + return -EFAULT; + len -= copy; + kdata += copy; + iov++; + } + + return 0; +} +EXPORT_SYMBOL(memcpy_fromiovecend); + +/* + * And now for the all-in-one: copy and checksum from a user iovec + * directly to a datagram + * Calls to csum_partial but the last must be in 32 bit chunks + * + * ip_build_xmit must ensure that when fragmenting only the last + * call to this function will be unaligned also. + */ +int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov, + int offset, unsigned int len, __wsum *csump) +{ + __wsum csum = *csump; + int partial_cnt = 0, err = 0; + + /* Skip over the finished iovecs */ + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + iov++; + } + + while (len > 0) { + u8 __user *base = iov->iov_base + offset; + int copy = min_t(unsigned int, len, iov->iov_len - offset); + + offset = 0; + + /* There is a remnant from previous iov. */ + if (partial_cnt) { + int par_len = 4 - partial_cnt; + + /* iov component is too short ... 
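A sketch of the usual sendmsg-style use of the non-destructive variant: once verify_iovec() has pulled the user iovec into kernel memory, payload can be copied out at an arbitrary offset without consuming it (kbuf, offset and len are the caller's):

#include <linux/socket.h>

static int my_copy_payload(struct msghdr *msg, unsigned char *kbuf,
			   int offset, int len)
{
	/* memcpy_fromiovec() advances iov_base/iov_len as it copies; the
	 * ...end() variant takes an explicit offset and leaves the iovec
	 * intact, so it may be called repeatedly. */
	return memcpy_fromiovecend(kbuf, msg->msg_iov, offset, len);
}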
*/ + if (par_len > copy) { + if (copy_from_user(kdata, base, copy)) + goto out_fault; + kdata += copy; + base += copy; + partial_cnt += copy; + len -= copy; + iov++; + if (len) + continue; + *csump = csum_partial(kdata - partial_cnt, + partial_cnt, csum); + goto out; + } + if (copy_from_user(kdata, base, par_len)) + goto out_fault; + csum = csum_partial(kdata - partial_cnt, 4, csum); + kdata += par_len; + base += par_len; + copy -= par_len; + len -= par_len; + partial_cnt = 0; + } + + if (len > copy) { + partial_cnt = copy % 4; + if (partial_cnt) { + copy -= partial_cnt; + if (copy_from_user(kdata + copy, base + copy, + partial_cnt)) + goto out_fault; + } + } + + if (copy) { + csum = csum_and_copy_from_user(base, kdata, copy, + csum, &err); + if (err) + goto out; + } + len -= copy + partial_cnt; + kdata += copy + partial_cnt; + iov++; + } + *csump = csum; +out: + return err; + +out_fault: + err = -EFAULT; + goto out; +} +EXPORT_SYMBOL(csum_partial_copy_fromiovecend); diff --git a/net/core/kmap_skb.h b/net/core/kmap_skb.h new file mode 100644 index 00000000..283c2b99 --- /dev/null +++ b/net/core/kmap_skb.h @@ -0,0 +1,19 @@ +#include + +static inline void *kmap_skb_frag(const skb_frag_t *frag) +{ +#ifdef CONFIG_HIGHMEM + BUG_ON(in_irq()); + + local_bh_disable(); +#endif + return kmap_atomic(frag->page, KM_SKB_DATA_SOFTIRQ); +} + +static inline void kunmap_skb_frag(void *vaddr) +{ + kunmap_atomic(vaddr, KM_SKB_DATA_SOFTIRQ); +#ifdef CONFIG_HIGHMEM + local_bh_enable(); +#endif +} diff --git a/net/core/link_watch.c b/net/core/link_watch.c new file mode 100644 index 00000000..357bd4ee --- /dev/null +++ b/net/core/link_watch.c @@ -0,0 +1,246 @@ +/* + * Linux network device link state notification + * + * Author: + * Stefan Rompf + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +enum lw_bits { + LW_URGENT = 0, +}; + +static unsigned long linkwatch_flags; +static unsigned long linkwatch_nextevent; + +static void linkwatch_event(struct work_struct *dummy); +static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event); + +static LIST_HEAD(lweventlist); +static DEFINE_SPINLOCK(lweventlist_lock); + +static unsigned char default_operstate(const struct net_device *dev) +{ + if (!netif_carrier_ok(dev)) + return (dev->ifindex != dev->iflink ? 
+ IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN); + + if (netif_dormant(dev)) + return IF_OPER_DORMANT; + + return IF_OPER_UP; +} + + +static void rfc2863_policy(struct net_device *dev) +{ + unsigned char operstate = default_operstate(dev); + + if (operstate == dev->operstate) + return; + + write_lock_bh(&dev_base_lock); + + switch(dev->link_mode) { + case IF_LINK_MODE_DORMANT: + if (operstate == IF_OPER_UP) + operstate = IF_OPER_DORMANT; + break; + + case IF_LINK_MODE_DEFAULT: + default: + break; + } + + dev->operstate = operstate; + + write_unlock_bh(&dev_base_lock); +} + + +static bool linkwatch_urgent_event(struct net_device *dev) +{ + return netif_running(dev) && netif_carrier_ok(dev) && + qdisc_tx_changing(dev); +} + + +static void linkwatch_add_event(struct net_device *dev) +{ + unsigned long flags; + + spin_lock_irqsave(&lweventlist_lock, flags); + if (list_empty(&dev->link_watch_list)) { + list_add_tail(&dev->link_watch_list, &lweventlist); + dev_hold(dev); + } + spin_unlock_irqrestore(&lweventlist_lock, flags); +} + + +static void linkwatch_schedule_work(int urgent) +{ + unsigned long delay = linkwatch_nextevent - jiffies; + + if (test_bit(LW_URGENT, &linkwatch_flags)) + return; + + /* Minimise down-time: drop delay for up event. */ + if (urgent) { + if (test_and_set_bit(LW_URGENT, &linkwatch_flags)) + return; + delay = 0; + } + + /* If we wrap around we'll delay it by at most HZ. */ + if (delay > HZ) + delay = 0; + + /* + * This is true if we've scheduled it immeditately or if we don't + * need an immediate execution and it's already pending. + */ + if (schedule_delayed_work(&linkwatch_work, delay) == !delay) + return; + + /* Don't bother if there is nothing urgent. */ + if (!test_bit(LW_URGENT, &linkwatch_flags)) + return; + + /* It's already running which is good enough. */ + if (!__cancel_delayed_work(&linkwatch_work)) + return; + + /* Otherwise we reschedule it again for immediate execution. */ + schedule_delayed_work(&linkwatch_work, 0); +} + + +static void linkwatch_do_dev(struct net_device *dev) +{ + /* + * Make sure the above read is complete since it can be + * rewritten as soon as we clear the bit below. + */ + smp_mb__before_clear_bit(); + + /* We are about to handle this device, + * so new events can be accepted + */ + clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); + + rfc2863_policy(dev); + if (dev->flags & IFF_UP) { + if (netif_carrier_ok(dev)) + dev_activate(dev); + else + dev_deactivate(dev); + + netdev_state_change(dev); + } + dev_put(dev); +} + +static void __linkwatch_run_queue(int urgent_only) +{ + struct net_device *dev; + LIST_HEAD(wrk); + + /* + * Limit the number of linkwatch events to one + * per second so that a runaway driver does not + * cause a storm of messages on the netlink + * socket. This limit does not apply to up events + * while the device qdisc is down. + */ + if (!urgent_only) + linkwatch_nextevent = jiffies + HZ; + /* Limit wrap-around effect on delay. 
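Drivers do not call into this file directly; a sketch of the usual entry points, where the carrier/dormant helpers end up firing linkwatch_fire_event() (below) so the work queued here can coalesce the change (netdev and link_up belong to the hypothetical driver):

	/* in a driver's link-status interrupt or poll handler */
	if (link_up)
		netif_carrier_on(netdev);	/* fires a linkwatch event */
	else
		netif_carrier_off(netdev);

	/* a port that is up but not yet authorized (e.g. 802.1X):
	 * default_operstate() above will then report IF_OPER_DORMANT */
	netif_dormant_on(netdev);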
*/ + else if (time_after(linkwatch_nextevent, jiffies + HZ)) + linkwatch_nextevent = jiffies; + + clear_bit(LW_URGENT, &linkwatch_flags); + + spin_lock_irq(&lweventlist_lock); + list_splice_init(&lweventlist, &wrk); + + while (!list_empty(&wrk)) { + + dev = list_first_entry(&wrk, struct net_device, link_watch_list); + list_del_init(&dev->link_watch_list); + + if (urgent_only && !linkwatch_urgent_event(dev)) { + list_add_tail(&dev->link_watch_list, &lweventlist); + continue; + } + spin_unlock_irq(&lweventlist_lock); + linkwatch_do_dev(dev); + spin_lock_irq(&lweventlist_lock); + } + + if (!list_empty(&lweventlist)) + linkwatch_schedule_work(0); + spin_unlock_irq(&lweventlist_lock); +} + +void linkwatch_forget_dev(struct net_device *dev) +{ + unsigned long flags; + int clean = 0; + + spin_lock_irqsave(&lweventlist_lock, flags); + if (!list_empty(&dev->link_watch_list)) { + list_del_init(&dev->link_watch_list); + clean = 1; + } + spin_unlock_irqrestore(&lweventlist_lock, flags); + if (clean) + linkwatch_do_dev(dev); +} + + +/* Must be called with the rtnl semaphore held */ +void linkwatch_run_queue(void) +{ + __linkwatch_run_queue(0); +} + + +static void linkwatch_event(struct work_struct *dummy) +{ + rtnl_lock(); + __linkwatch_run_queue(time_after(linkwatch_nextevent, jiffies)); + rtnl_unlock(); +} + + +void linkwatch_fire_event(struct net_device *dev) +{ + bool urgent = linkwatch_urgent_event(dev); + + if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { + linkwatch_add_event(dev); + } else if (!urgent) + return; + + linkwatch_schedule_work(urgent); +} +EXPORT_SYMBOL(linkwatch_fire_event); diff --git a/net/core/neighbour.c b/net/core/neighbour.c new file mode 100644 index 00000000..96bb0a33 --- /dev/null +++ b/net/core/neighbour.c @@ -0,0 +1,2934 @@ +/* + * Generic address resolution entity + * + * Authors: + * Pedro Roque + * Alexey Kuznetsov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Vitaly E. Lavrov releasing NULL neighbor in neigh_add. + * Harald Welte Add neighbour cache statistics like rtstat + */ + +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_SYSCTL +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NEIGH_DEBUG 1 + +#define NEIGH_PRINTK(x...) printk(x) +#define NEIGH_NOPRINTK(x...) do { ; } while(0) +#define NEIGH_PRINTK1 NEIGH_NOPRINTK +#define NEIGH_PRINTK2 NEIGH_NOPRINTK + +#if NEIGH_DEBUG >= 1 +#undef NEIGH_PRINTK1 +#define NEIGH_PRINTK1 NEIGH_PRINTK +#endif +#if NEIGH_DEBUG >= 2 +#undef NEIGH_PRINTK2 +#define NEIGH_PRINTK2 NEIGH_PRINTK +#endif + +#define PNEIGH_HASHMASK 0xF + +static void neigh_timer_handler(unsigned long arg); +static void __neigh_notify(struct neighbour *n, int type, int flags); +static void neigh_update_notify(struct neighbour *neigh); +static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev); + +static struct neigh_table *neigh_tables; +#ifdef CONFIG_PROC_FS +static const struct file_operations neigh_stat_seq_fops; +#endif + +/* + Neighbour hash table buckets are protected with rwlock tbl->lock. + + - All the scans/updates to hash buckets MUST be made under this lock. 
+ - NOTHING clever should be made under this lock: no callbacks + to protocol backends, no attempts to send something to network. + It will result in deadlocks, if backend/driver wants to use neighbour + cache. + - If the entry requires some non-trivial actions, increase + its reference count and release table lock. + + Neighbour entries are protected: + - with reference count. + - with rwlock neigh->lock + + Reference count prevents destruction. + + neigh->lock mainly serializes ll address data and its validity state. + However, the same lock is used to protect another entry fields: + - timer + - resolution queue + + Again, nothing clever shall be made under neigh->lock, + the most complicated procedure, which we allow is dev->hard_header. + It is supposed, that dev->hard_header is simplistic and does + not make callbacks to neighbour tables. + + The last lock is neigh_tbl_lock. It is pure SMP lock, protecting + list of neighbour tables. This list is used only in process context, + */ + +static DEFINE_RWLOCK(neigh_tbl_lock); + +static int neigh_blackhole(struct sk_buff *skb) +{ + kfree_skb(skb); + return -ENETDOWN; +} + +static void neigh_cleanup_and_release(struct neighbour *neigh) +{ + if (neigh->parms->neigh_cleanup) + neigh->parms->neigh_cleanup(neigh); + + __neigh_notify(neigh, RTM_DELNEIGH, 0); + neigh_release(neigh); +} + +/* + * It is random distribution in the interval (1/2)*base...(3/2)*base. + * It corresponds to default IPv6 settings and is not overridable, + * because it is really reasonable choice. + */ + +unsigned long neigh_rand_reach_time(unsigned long base) +{ + return base ? (net_random() % base) + (base >> 1) : 0; +} +EXPORT_SYMBOL(neigh_rand_reach_time); + + +static int neigh_forced_gc(struct neigh_table *tbl) +{ + int shrunk = 0; + int i; + struct neigh_hash_table *nht; + + NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); + + write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + for (i = 0; i <= nht->hash_mask; i++) { + struct neighbour *n; + struct neighbour __rcu **np; + + np = &nht->hash_buckets[i]; + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { + /* Neighbour record may be discarded if: + * - nobody refers to it. 
+ * - it is not permanent + */ + write_lock(&n->lock); + if (atomic_read(&n->refcnt) == 1 && + !(n->nud_state & NUD_PERMANENT)) { + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); + n->dead = 1; + shrunk = 1; + write_unlock(&n->lock); + neigh_cleanup_and_release(n); + continue; + } + write_unlock(&n->lock); + np = &n->next; + } + } + + tbl->last_flush = jiffies; + + write_unlock_bh(&tbl->lock); + + return shrunk; +} + +static void neigh_add_timer(struct neighbour *n, unsigned long when) +{ + neigh_hold(n); + if (unlikely(mod_timer(&n->timer, when))) { + printk("NEIGH: BUG, double timer add, state is %x\n", + n->nud_state); + dump_stack(); + } +} + +static int neigh_del_timer(struct neighbour *n) +{ + if ((n->nud_state & NUD_IN_TIMER) && + del_timer(&n->timer)) { + neigh_release(n); + return 1; + } + return 0; +} + +static void pneigh_queue_purge(struct sk_buff_head *list) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(list)) != NULL) { + dev_put(skb->dev); + kfree_skb(skb); + } +} + +static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) +{ + int i; + struct neigh_hash_table *nht; + + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + + for (i = 0; i <= nht->hash_mask; i++) { + struct neighbour *n; + struct neighbour __rcu **np = &nht->hash_buckets[i]; + + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { + if (dev && n->dev != dev) { + np = &n->next; + continue; + } + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); + write_lock(&n->lock); + neigh_del_timer(n); + n->dead = 1; + + if (atomic_read(&n->refcnt) != 1) { + /* The most unpleasant situation. + We must destroy neighbour entry, + but someone still uses it. + + The destroy will be delayed until + the last user releases us, but + we must kill timers etc. and move + it to safe state. 
+ */ + skb_queue_purge(&n->arp_queue); + n->output = neigh_blackhole; + if (n->nud_state & NUD_VALID) + n->nud_state = NUD_NOARP; + else + n->nud_state = NUD_NONE; + NEIGH_PRINTK2("neigh %p is stray.\n", n); + } + write_unlock(&n->lock); + neigh_cleanup_and_release(n); + } + } +} + +void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) +{ + write_lock_bh(&tbl->lock); + neigh_flush_dev(tbl, dev); + write_unlock_bh(&tbl->lock); +} +EXPORT_SYMBOL(neigh_changeaddr); + +int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) +{ + write_lock_bh(&tbl->lock); + neigh_flush_dev(tbl, dev); + pneigh_ifdown(tbl, dev); + write_unlock_bh(&tbl->lock); + + del_timer_sync(&tbl->proxy_timer); + pneigh_queue_purge(&tbl->proxy_queue); + return 0; +} +EXPORT_SYMBOL(neigh_ifdown); + +static struct neighbour *neigh_alloc(struct neigh_table *tbl) +{ + struct neighbour *n = NULL; + unsigned long now = jiffies; + int entries; + + entries = atomic_inc_return(&tbl->entries) - 1; + if (entries >= tbl->gc_thresh3 || + (entries >= tbl->gc_thresh2 && + time_after(now, tbl->last_flush + 5 * HZ))) { + if (!neigh_forced_gc(tbl) && + entries >= tbl->gc_thresh3) + goto out_entries; + } + + n = kmem_cache_zalloc(tbl->kmem_cachep, GFP_ATOMIC); + if (!n) + goto out_entries; + + skb_queue_head_init(&n->arp_queue); + rwlock_init(&n->lock); + seqlock_init(&n->ha_lock); + n->updated = n->used = now; + n->nud_state = NUD_NONE; + n->output = neigh_blackhole; + n->parms = neigh_parms_clone(&tbl->parms); + setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n); + + NEIGH_CACHE_STAT_INC(tbl, allocs); + n->tbl = tbl; + atomic_set(&n->refcnt, 1); + n->dead = 1; +out: + return n; + +out_entries: + atomic_dec(&tbl->entries); + goto out; +} + +static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries) +{ + size_t size = entries * sizeof(struct neighbour *); + struct neigh_hash_table *ret; + struct neighbour __rcu **buckets; + + ret = kmalloc(sizeof(*ret), GFP_ATOMIC); + if (!ret) + return NULL; + if (size <= PAGE_SIZE) + buckets = kzalloc(size, GFP_ATOMIC); + else + buckets = (struct neighbour __rcu **) + __get_free_pages(GFP_ATOMIC | __GFP_ZERO, + get_order(size)); + if (!buckets) { + kfree(ret); + return NULL; + } + ret->hash_buckets = buckets; + ret->hash_mask = entries - 1; + get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd)); + return ret; +} + +static void neigh_hash_free_rcu(struct rcu_head *head) +{ + struct neigh_hash_table *nht = container_of(head, + struct neigh_hash_table, + rcu); + size_t size = (nht->hash_mask + 1) * sizeof(struct neighbour *); + struct neighbour __rcu **buckets = nht->hash_buckets; + + if (size <= PAGE_SIZE) + kfree(buckets); + else + free_pages((unsigned long)buckets, get_order(size)); + kfree(nht); +} + +static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, + unsigned long new_entries) +{ + unsigned int i, hash; + struct neigh_hash_table *new_nht, *old_nht; + + NEIGH_CACHE_STAT_INC(tbl, hash_grows); + + BUG_ON(!is_power_of_2(new_entries)); + old_nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + new_nht = neigh_hash_alloc(new_entries); + if (!new_nht) + return old_nht; + + for (i = 0; i <= old_nht->hash_mask; i++) { + struct neighbour *n, *next; + + for (n = rcu_dereference_protected(old_nht->hash_buckets[i], + lockdep_is_held(&tbl->lock)); + n != NULL; + n = next) { + hash = tbl->hash(n->primary_key, n->dev, + new_nht->hash_rnd); + + hash &= new_nht->hash_mask; + next = rcu_dereference_protected(n->next, + 
lockdep_is_held(&tbl->lock)); + + rcu_assign_pointer(n->next, + rcu_dereference_protected( + new_nht->hash_buckets[hash], + lockdep_is_held(&tbl->lock))); + rcu_assign_pointer(new_nht->hash_buckets[hash], n); + } + } + + rcu_assign_pointer(tbl->nht, new_nht); + call_rcu(&old_nht->rcu, neigh_hash_free_rcu); + return new_nht; +} + +struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, + struct net_device *dev) +{ + struct neighbour *n; + int key_len = tbl->key_len; + u32 hash_val; + struct neigh_hash_table *nht; + + NEIGH_CACHE_STAT_INC(tbl, lookups); + + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask; + + for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); + n != NULL; + n = rcu_dereference_bh(n->next)) { + if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) { + if (!atomic_inc_not_zero(&n->refcnt)) + n = NULL; + NEIGH_CACHE_STAT_INC(tbl, hits); + break; + } + } + + rcu_read_unlock_bh(); + return n; +} +EXPORT_SYMBOL(neigh_lookup); + +struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, + const void *pkey) +{ + struct neighbour *n; + int key_len = tbl->key_len; + u32 hash_val; + struct neigh_hash_table *nht; + + NEIGH_CACHE_STAT_INC(tbl, lookups); + + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) & nht->hash_mask; + + for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); + n != NULL; + n = rcu_dereference_bh(n->next)) { + if (!memcmp(n->primary_key, pkey, key_len) && + net_eq(dev_net(n->dev), net)) { + if (!atomic_inc_not_zero(&n->refcnt)) + n = NULL; + NEIGH_CACHE_STAT_INC(tbl, hits); + break; + } + } + + rcu_read_unlock_bh(); + return n; +} +EXPORT_SYMBOL(neigh_lookup_nodev); + +struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev) +{ + u32 hash_val; + int key_len = tbl->key_len; + int error; + struct neighbour *n1, *rc, *n = neigh_alloc(tbl); + struct neigh_hash_table *nht; + + if (!n) { + rc = ERR_PTR(-ENOBUFS); + goto out; + } + + memcpy(n->primary_key, pkey, key_len); + n->dev = dev; + dev_hold(dev); + + /* Protocol specific setup. */ + if (tbl->constructor && (error = tbl->constructor(n)) < 0) { + rc = ERR_PTR(error); + goto out_neigh_release; + } + + /* Device specific setup. 
*/ + if (n->parms->neigh_setup && + (error = n->parms->neigh_setup(n)) < 0) { + rc = ERR_PTR(error); + goto out_neigh_release; + } + + n->confirmed = jiffies - (n->parms->base_reachable_time << 1); + + write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + + if (atomic_read(&tbl->entries) > (nht->hash_mask + 1)) + nht = neigh_hash_grow(tbl, (nht->hash_mask + 1) << 1); + + hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask; + + if (n->parms->dead) { + rc = ERR_PTR(-EINVAL); + goto out_tbl_unlock; + } + + for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val], + lockdep_is_held(&tbl->lock)); + n1 != NULL; + n1 = rcu_dereference_protected(n1->next, + lockdep_is_held(&tbl->lock))) { + if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { + neigh_hold(n1); + rc = n1; + goto out_tbl_unlock; + } + } + + n->dead = 0; + neigh_hold(n); + rcu_assign_pointer(n->next, + rcu_dereference_protected(nht->hash_buckets[hash_val], + lockdep_is_held(&tbl->lock))); + rcu_assign_pointer(nht->hash_buckets[hash_val], n); + write_unlock_bh(&tbl->lock); + NEIGH_PRINTK2("neigh %p is created.\n", n); + rc = n; +out: + return rc; +out_tbl_unlock: + write_unlock_bh(&tbl->lock); +out_neigh_release: + neigh_release(n); + goto out; +} +EXPORT_SYMBOL(neigh_create); + +static u32 pneigh_hash(const void *pkey, int key_len) +{ + u32 hash_val = *(u32 *)(pkey + key_len - 4); + hash_val ^= (hash_val >> 16); + hash_val ^= hash_val >> 8; + hash_val ^= hash_val >> 4; + hash_val &= PNEIGH_HASHMASK; + return hash_val; +} + +static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, + struct net *net, + const void *pkey, + int key_len, + struct net_device *dev) +{ + while (n) { + if (!memcmp(n->key, pkey, key_len) && + net_eq(pneigh_net(n), net) && + (n->dev == dev || !n->dev)) + return n; + n = n->next; + } + return NULL; +} + +struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, + struct net *net, const void *pkey, struct net_device *dev) +{ + int key_len = tbl->key_len; + u32 hash_val = pneigh_hash(pkey, key_len); + + return __pneigh_lookup_1(tbl->phash_buckets[hash_val], + net, pkey, key_len, dev); +} +EXPORT_SYMBOL_GPL(__pneigh_lookup); + +struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, + struct net *net, const void *pkey, + struct net_device *dev, int creat) +{ + struct pneigh_entry *n; + int key_len = tbl->key_len; + u32 hash_val = pneigh_hash(pkey, key_len); + + read_lock_bh(&tbl->lock); + n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], + net, pkey, key_len, dev); + read_unlock_bh(&tbl->lock); + + if (n || !creat) + goto out; + + ASSERT_RTNL(); + + n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL); + if (!n) + goto out; + + write_pnet(&n->net, hold_net(net)); + memcpy(n->key, pkey, key_len); + n->dev = dev; + if (dev) + dev_hold(dev); + + if (tbl->pconstructor && tbl->pconstructor(n)) { + if (dev) + dev_put(dev); + release_net(net); + kfree(n); + n = NULL; + goto out; + } + + write_lock_bh(&tbl->lock); + n->next = tbl->phash_buckets[hash_val]; + tbl->phash_buckets[hash_val] = n; + write_unlock_bh(&tbl->lock); +out: + return n; +} +EXPORT_SYMBOL(pneigh_lookup); + + +int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, + struct net_device *dev) +{ + struct pneigh_entry *n, **np; + int key_len = tbl->key_len; + u32 hash_val = pneigh_hash(pkey, key_len); + + write_lock_bh(&tbl->lock); + for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL; + np = &n->next) { + if (!memcmp(n->key, pkey, 
key_len) && n->dev == dev && + net_eq(pneigh_net(n), net)) { + *np = n->next; + write_unlock_bh(&tbl->lock); + if (tbl->pdestructor) + tbl->pdestructor(n); + if (n->dev) + dev_put(n->dev); + release_net(pneigh_net(n)); + kfree(n); + return 0; + } + } + write_unlock_bh(&tbl->lock); + return -ENOENT; +} + +static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev) +{ + struct pneigh_entry *n, **np; + u32 h; + + for (h = 0; h <= PNEIGH_HASHMASK; h++) { + np = &tbl->phash_buckets[h]; + while ((n = *np) != NULL) { + if (!dev || n->dev == dev) { + *np = n->next; + if (tbl->pdestructor) + tbl->pdestructor(n); + if (n->dev) + dev_put(n->dev); + release_net(pneigh_net(n)); + kfree(n); + continue; + } + np = &n->next; + } + } + return -ENOENT; +} + +static void neigh_parms_destroy(struct neigh_parms *parms); + +static inline void neigh_parms_put(struct neigh_parms *parms) +{ + if (atomic_dec_and_test(&parms->refcnt)) + neigh_parms_destroy(parms); +} + +static void neigh_destroy_rcu(struct rcu_head *head) +{ + struct neighbour *neigh = container_of(head, struct neighbour, rcu); + + kmem_cache_free(neigh->tbl->kmem_cachep, neigh); +} +/* + * neighbour must already be out of the table; + * + */ +void neigh_destroy(struct neighbour *neigh) +{ + struct hh_cache *hh; + + NEIGH_CACHE_STAT_INC(neigh->tbl, destroys); + + if (!neigh->dead) { + printk(KERN_WARNING + "Destroying alive neighbour %p\n", neigh); + dump_stack(); + return; + } + + if (neigh_del_timer(neigh)) + printk(KERN_WARNING "Impossible event.\n"); + + while ((hh = neigh->hh) != NULL) { + neigh->hh = hh->hh_next; + hh->hh_next = NULL; + + write_seqlock_bh(&hh->hh_lock); + hh->hh_output = neigh_blackhole; + write_sequnlock_bh(&hh->hh_lock); + hh_cache_put(hh); + } + + skb_queue_purge(&neigh->arp_queue); + + dev_put(neigh->dev); + neigh_parms_put(neigh->parms); + + NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); + + atomic_dec(&neigh->tbl->entries); + call_rcu(&neigh->rcu, neigh_destroy_rcu); +} +EXPORT_SYMBOL(neigh_destroy); + +/* Neighbour state is suspicious; + disable fast path. + + Called with write_locked neigh. + */ +static void neigh_suspect(struct neighbour *neigh) +{ + struct hh_cache *hh; + + NEIGH_PRINTK2("neigh %p is suspected.\n", neigh); + + neigh->output = neigh->ops->output; + + for (hh = neigh->hh; hh; hh = hh->hh_next) + hh->hh_output = neigh->ops->output; +} + +/* Neighbour state is OK; + enable fast path. + + Called with write_locked neigh. 
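A sketch of the lookup-or-create pattern a protocol runs against its table, using ARP's arp_tbl as the assumed table and the next-hop IPv4 address as key; this is essentially what the __neigh_lookup() helper in net/neighbour.h wraps:

#include <net/arp.h>

static struct neighbour *my_get_neigh(__be32 *nexthop, struct net_device *dev)
{
	struct neighbour *n;

	n = neigh_lookup(&arp_tbl, nexthop, dev);	/* takes a reference */
	if (!n) {
		n = neigh_create(&arp_tbl, nexthop, dev);
		if (IS_ERR(n))
			return NULL;		/* e.g. -ENOBUFS */
	}
	return n;	/* caller drops it later with neigh_release(n) */
}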
+ */ +static void neigh_connect(struct neighbour *neigh) +{ + struct hh_cache *hh; + + NEIGH_PRINTK2("neigh %p is connected.\n", neigh); + + neigh->output = neigh->ops->connected_output; + + for (hh = neigh->hh; hh; hh = hh->hh_next) + hh->hh_output = neigh->ops->hh_output; +} + +static void neigh_periodic_work(struct work_struct *work) +{ + struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); + struct neighbour *n; + struct neighbour __rcu **np; + unsigned int i; + struct neigh_hash_table *nht; + + NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); + + write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + + /* + * periodically recompute ReachableTime from random function + */ + + if (time_after(jiffies, tbl->last_rand + 300 * HZ)) { + struct neigh_parms *p; + tbl->last_rand = jiffies; + for (p = &tbl->parms; p; p = p->next) + p->reachable_time = + neigh_rand_reach_time(p->base_reachable_time); + } + + for (i = 0 ; i <= nht->hash_mask; i++) { + np = &nht->hash_buckets[i]; + + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { + unsigned int state; + + write_lock(&n->lock); + + state = n->nud_state; + if (state & (NUD_PERMANENT | NUD_IN_TIMER)) { + write_unlock(&n->lock); + goto next_elt; + } + + if (time_before(n->used, n->confirmed)) + n->used = n->confirmed; + + if (atomic_read(&n->refcnt) == 1 && + (state == NUD_FAILED || + time_after(jiffies, n->used + n->parms->gc_staletime))) { + *np = n->next; + n->dead = 1; + write_unlock(&n->lock); + neigh_cleanup_and_release(n); + continue; + } + write_unlock(&n->lock); + +next_elt: + np = &n->next; + } + /* + * It's fine to release lock here, even if hash table + * grows while we are preempted. + */ + write_unlock_bh(&tbl->lock); + cond_resched(); + write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + } + /* Cycle through all hash buckets every base_reachable_time/2 ticks. + * ARP entry timeouts range from 1/2 base_reachable_time to 3/2 + * base_reachable_time. + */ + schedule_delayed_work(&tbl->gc_work, + tbl->parms.base_reachable_time >> 1); + write_unlock_bh(&tbl->lock); +} + +static __inline__ int neigh_max_probes(struct neighbour *n) +{ + struct neigh_parms *p = n->parms; + return (n->nud_state & NUD_PROBE) ? + p->ucast_probes : + p->ucast_probes + p->app_probes + p->mcast_probes; +} + +static void neigh_invalidate(struct neighbour *neigh) + __releases(neigh->lock) + __acquires(neigh->lock) +{ + struct sk_buff *skb; + + NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed); + NEIGH_PRINTK2("neigh %p is failed.\n", neigh); + neigh->updated = jiffies; + + /* It is very thin place. report_unreachable is very complicated + routine. Particularly, it can hit the same neighbour entry! + + So that, we try to be accurate and avoid dead loop. --ANK + */ + while (neigh->nud_state == NUD_FAILED && + (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { + write_unlock(&neigh->lock); + neigh->ops->error_report(neigh, skb); + write_lock(&neigh->lock); + } + skb_queue_purge(&neigh->arp_queue); +} + +/* Called when a timer expires for a neighbour entry. 
*/ + +static void neigh_timer_handler(unsigned long arg) +{ + unsigned long now, next; + struct neighbour *neigh = (struct neighbour *)arg; + unsigned state; + int notify = 0; + + write_lock(&neigh->lock); + + state = neigh->nud_state; + now = jiffies; + next = now + HZ; + + if (!(state & NUD_IN_TIMER)) { +#ifndef CONFIG_SMP + printk(KERN_WARNING "neigh: timer & !nud_in_timer\n"); +#endif + goto out; + } + + if (state & NUD_REACHABLE) { + if (time_before_eq(now, + neigh->confirmed + neigh->parms->reachable_time)) { + NEIGH_PRINTK2("neigh %p is still alive.\n", neigh); + next = neigh->confirmed + neigh->parms->reachable_time; + } else if (time_before_eq(now, + neigh->used + neigh->parms->delay_probe_time)) { + NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); + neigh->nud_state = NUD_DELAY; + neigh->updated = jiffies; + neigh_suspect(neigh); + next = now + neigh->parms->delay_probe_time; + } else { + NEIGH_PRINTK2("neigh %p is suspected.\n", neigh); + neigh->nud_state = NUD_STALE; + neigh->updated = jiffies; + neigh_suspect(neigh); + notify = 1; + } + } else if (state & NUD_DELAY) { + if (time_before_eq(now, + neigh->confirmed + neigh->parms->delay_probe_time)) { + NEIGH_PRINTK2("neigh %p is now reachable.\n", neigh); + neigh->nud_state = NUD_REACHABLE; + neigh->updated = jiffies; + neigh_connect(neigh); + notify = 1; + next = neigh->confirmed + neigh->parms->reachable_time; + } else { + NEIGH_PRINTK2("neigh %p is probed.\n", neigh); + neigh->nud_state = NUD_PROBE; + neigh->updated = jiffies; + atomic_set(&neigh->probes, 0); + next = now + neigh->parms->retrans_time; + } + } else { + /* NUD_PROBE|NUD_INCOMPLETE */ + next = now + neigh->parms->retrans_time; + } + + if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && + atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { + neigh->nud_state = NUD_FAILED; + notify = 1; + neigh_invalidate(neigh); + } + + if (neigh->nud_state & NUD_IN_TIMER) { + if (time_before(next, jiffies + HZ/2)) + next = jiffies + HZ/2; + if (!mod_timer(&neigh->timer, next)) + neigh_hold(neigh); + } + if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) { + struct sk_buff *skb = skb_peek(&neigh->arp_queue); + /* keep skb alive even if arp_queue overflows */ + if (skb) + skb = skb_copy(skb, GFP_ATOMIC); + write_unlock(&neigh->lock); + neigh->ops->solicit(neigh, skb); + atomic_inc(&neigh->probes); + kfree_skb(skb); + } else { +out: + write_unlock(&neigh->lock); + } + + if (notify) + neigh_update_notify(neigh); + + neigh_release(neigh); +} + +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +{ + int rc; + unsigned long now; + + write_lock_bh(&neigh->lock); + + rc = 0; + if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)) + goto out_unlock_bh; + + now = jiffies; + + if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { + if (neigh->parms->mcast_probes + neigh->parms->app_probes) { + atomic_set(&neigh->probes, neigh->parms->ucast_probes); + neigh->nud_state = NUD_INCOMPLETE; + neigh->updated = jiffies; + neigh_add_timer(neigh, now + 1); + } else { + neigh->nud_state = NUD_FAILED; + neigh->updated = jiffies; + write_unlock_bh(&neigh->lock); + + kfree_skb(skb); + return 1; + } + } else if (neigh->nud_state & NUD_STALE) { + NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); + neigh->nud_state = NUD_DELAY; + neigh->updated = jiffies; + neigh_add_timer(neigh, + jiffies + neigh->parms->delay_probe_time); + } + + if (neigh->nud_state == NUD_INCOMPLETE) { + if (skb) { + if (skb_queue_len(&neigh->arp_queue) >= + neigh->parms->queue_len) { + struct 
sk_buff *buff; + buff = __skb_dequeue(&neigh->arp_queue); + kfree_skb(buff); + NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); + } + skb_dst_force(skb); + __skb_queue_tail(&neigh->arp_queue, skb); + } + rc = 1; + } +out_unlock_bh: + write_unlock_bh(&neigh->lock); + return rc; +} +EXPORT_SYMBOL(__neigh_event_send); + +static void neigh_update_hhs(const struct neighbour *neigh) +{ + struct hh_cache *hh; + void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *) + = NULL; + + if (neigh->dev->header_ops) + update = neigh->dev->header_ops->cache_update; + + if (update) { + for (hh = neigh->hh; hh; hh = hh->hh_next) { + write_seqlock_bh(&hh->hh_lock); + update(hh, neigh->dev, neigh->ha); + write_sequnlock_bh(&hh->hh_lock); + } + } +} + + + +/* Generic update routine. + -- lladdr is new lladdr or NULL, if it is not supplied. + -- new is new state. + -- flags + NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr, + if it is different. + NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected" + lladdr instead of overriding it + if it is different. + It also allows to retain current state + if lladdr is unchanged. + NEIGH_UPDATE_F_ADMIN means that the change is administrative. + + NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing + NTF_ROUTER flag. + NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as + a router. + + Caller MUST hold reference count on the entry. + */ + +int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, + u32 flags) +{ + u8 old; + int err; + int notify = 0; + struct net_device *dev; + int update_isrouter = 0; + + write_lock_bh(&neigh->lock); + + dev = neigh->dev; + old = neigh->nud_state; + err = -EPERM; + + if (!(flags & NEIGH_UPDATE_F_ADMIN) && + (old & (NUD_NOARP | NUD_PERMANENT))) + goto out; + + if (!(new & NUD_VALID)) { + neigh_del_timer(neigh); + if (old & NUD_CONNECTED) + neigh_suspect(neigh); + neigh->nud_state = new; + err = 0; + notify = old & NUD_VALID; + if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && + (new & NUD_FAILED)) { + neigh_invalidate(neigh); + notify = 1; + } + goto out; + } + + /* Compare new lladdr with cached one */ + if (!dev->addr_len) { + /* First case: device needs no address. */ + lladdr = neigh->ha; + } else if (lladdr) { + /* The second case: if something is already cached + and a new address is proposed: + - compare new & old + - if they are different, check override flag + */ + if ((old & NUD_VALID) && + !memcmp(lladdr, neigh->ha, dev->addr_len)) + lladdr = neigh->ha; + } else { + /* No address is supplied; if we know something, + use it, otherwise discard the request. + */ + err = -EINVAL; + if (!(old & NUD_VALID)) + goto out; + lladdr = neigh->ha; + } + + if (new & NUD_CONNECTED) + neigh->confirmed = jiffies; + neigh->updated = jiffies; + + /* If entry was valid and address is not changed, + do not change entry state, if new one is STALE. + */ + err = 0; + update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER; + if (old & NUD_VALID) { + if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) { + update_isrouter = 0; + if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) && + (old & NUD_CONNECTED)) { + lladdr = neigh->ha; + new = NUD_STALE; + } else + goto out; + } else { + if (lladdr == neigh->ha && new == NUD_STALE && + ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) || + (old & NUD_CONNECTED)) + ) + new = old; + } + } + + if (new != old) { + neigh_del_timer(neigh); + if (new & NUD_IN_TIMER) + neigh_add_timer(neigh, (jiffies + + ((new & NUD_REACHABLE) ? 
+ neigh->parms->reachable_time : + 0))); + neigh->nud_state = new; + } + + if (lladdr != neigh->ha) { + write_seqlock(&neigh->ha_lock); + memcpy(&neigh->ha, lladdr, dev->addr_len); + write_sequnlock(&neigh->ha_lock); + neigh_update_hhs(neigh); + if (!(new & NUD_CONNECTED)) + neigh->confirmed = jiffies - + (neigh->parms->base_reachable_time << 1); + notify = 1; + } + if (new == old) + goto out; + if (new & NUD_CONNECTED) + neigh_connect(neigh); + else + neigh_suspect(neigh); + if (!(old & NUD_VALID)) { + struct sk_buff *skb; + + /* Again: avoid dead loop if something went wrong */ + + while (neigh->nud_state & NUD_VALID && + (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { + struct dst_entry *dst = skb_dst(skb); + struct neighbour *n2, *n1 = neigh; + write_unlock_bh(&neigh->lock); + + rcu_read_lock(); + /* On shaper/eql skb->dst->neighbour != neigh :( */ + if (dst && (n2 = dst_get_neighbour(dst)) != NULL) + n1 = n2; + n1->output(skb); + rcu_read_unlock(); + + write_lock_bh(&neigh->lock); + } + skb_queue_purge(&neigh->arp_queue); + } +out: + if (update_isrouter) { + neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ? + (neigh->flags | NTF_ROUTER) : + (neigh->flags & ~NTF_ROUTER); + } + write_unlock_bh(&neigh->lock); + + if (notify) + neigh_update_notify(neigh); + + return err; +} +EXPORT_SYMBOL(neigh_update); + +struct neighbour *neigh_event_ns(struct neigh_table *tbl, + u8 *lladdr, void *saddr, + struct net_device *dev) +{ + struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev, + lladdr || !dev->addr_len); + if (neigh) + neigh_update(neigh, lladdr, NUD_STALE, + NEIGH_UPDATE_F_OVERRIDE); + return neigh; +} +EXPORT_SYMBOL(neigh_event_ns); + +static inline bool neigh_hh_lookup(struct neighbour *n, struct dst_entry *dst, + __be16 protocol) +{ + struct hh_cache *hh; + + smp_rmb(); /* paired with smp_wmb() in neigh_hh_init() */ + for (hh = n->hh; hh; hh = hh->hh_next) { + if (hh->hh_type == protocol) { + atomic_inc(&hh->hh_refcnt); + if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL)) + hh_cache_put(hh); + return true; + } + } + return false; +} + +/* called with read_lock_bh(&n->lock); */ +static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, + __be16 protocol) +{ + struct hh_cache *hh; + struct net_device *dev = dst->dev; + + if (likely(neigh_hh_lookup(n, dst, protocol))) + return; + + /* slow path */ + hh = kzalloc(sizeof(*hh), GFP_ATOMIC); + if (!hh) + return; + + seqlock_init(&hh->hh_lock); + hh->hh_type = protocol; + atomic_set(&hh->hh_refcnt, 2); + + if (dev->header_ops->cache(n, hh)) { + kfree(hh); + return; + } + + write_lock_bh(&n->lock); + + /* must check if another thread already did the insert */ + if (neigh_hh_lookup(n, dst, protocol)) { + kfree(hh); + goto end; + } + + if (n->nud_state & NUD_CONNECTED) + hh->hh_output = n->ops->hh_output; + else + hh->hh_output = n->ops->output; + + hh->hh_next = n->hh; + smp_wmb(); /* paired with smp_rmb() in neigh_hh_lookup() */ + n->hh = hh; + + if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL)) + hh_cache_put(hh); +end: + write_unlock_bh(&n->lock); +} + +/* This function can be used in contexts, where only old dev_queue_xmit + * worked, f.e. if you want to override normal output path (eql, shaper), + * but resolution is not made yet. 
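A sketch of the receive-side use of neigh_update() above: when a protocol learns a confirmed link-layer address (roughly what ARP does with a reply), it pushes it into the cache; n is assumed to hold a reference and sha to point at the sender's hardware address from the packet:

	/* install the advertised lladdr and mark the entry reachable; the
	 * override flags let a changed address replace the cached one */
	neigh_update(n, sha, NUD_REACHABLE,
		     NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_WEAK_OVERRIDE);
	neigh_release(n);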
+ */ + +int neigh_compat_output(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + + __skb_pull(skb, skb_network_offset(skb)); + + if (dev_hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, + skb->len) < 0 && + dev->header_ops->rebuild(skb)) + return 0; + + return dev_queue_xmit(skb); +} +EXPORT_SYMBOL(neigh_compat_output); + +/* Slow and careful. */ + +int neigh_resolve_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct neighbour *neigh = dst_get_neighbour(dst); + int rc = 0; + + if (!dst) + goto discard; + + __skb_pull(skb, skb_network_offset(skb)); + + if (!neigh_event_send(neigh, skb)) { + int err; + struct net_device *dev = neigh->dev; + unsigned int seq; + + if (dev->header_ops->cache && + !dst->hh && + !(dst->flags & DST_NOCACHE)) + neigh_hh_init(neigh, dst, dst->ops->protocol); + + do { + seq = read_seqbegin(&neigh->ha_lock); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + } while (read_seqretry(&neigh->ha_lock, seq)); + + if (err >= 0) + rc = neigh->ops->queue_xmit(skb); + else + goto out_kfree_skb; + } +out: + return rc; +discard: + NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n", + dst, neigh); +out_kfree_skb: + rc = -EINVAL; + kfree_skb(skb); + goto out; +} +EXPORT_SYMBOL(neigh_resolve_output); + +/* As fast as possible without hh cache */ + +int neigh_connected_output(struct sk_buff *skb) +{ + int err; + struct dst_entry *dst = skb_dst(skb); + struct neighbour *neigh = dst_get_neighbour(dst); + struct net_device *dev = neigh->dev; + unsigned int seq; + + __skb_pull(skb, skb_network_offset(skb)); + + do { + seq = read_seqbegin(&neigh->ha_lock); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + } while (read_seqretry(&neigh->ha_lock, seq)); + + if (err >= 0) + err = neigh->ops->queue_xmit(skb); + else { + err = -EINVAL; + kfree_skb(skb); + } + return err; +} +EXPORT_SYMBOL(neigh_connected_output); + +static void neigh_proxy_process(unsigned long arg) +{ + struct neigh_table *tbl = (struct neigh_table *)arg; + long sched_next = 0; + unsigned long now = jiffies; + struct sk_buff *skb, *n; + + spin_lock(&tbl->proxy_queue.lock); + + skb_queue_walk_safe(&tbl->proxy_queue, skb, n) { + long tdif = NEIGH_CB(skb)->sched_next - now; + + if (tdif <= 0) { + struct net_device *dev = skb->dev; + + __skb_unlink(skb, &tbl->proxy_queue); + if (tbl->proxy_redo && netif_running(dev)) { + rcu_read_lock(); + tbl->proxy_redo(skb); + rcu_read_unlock(); + } else { + kfree_skb(skb); + } + + dev_put(dev); + } else if (!sched_next || tdif < sched_next) + sched_next = tdif; + } + del_timer(&tbl->proxy_timer); + if (sched_next) + mod_timer(&tbl->proxy_timer, jiffies + sched_next); + spin_unlock(&tbl->proxy_queue.lock); +} + +void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, + struct sk_buff *skb) +{ + unsigned long now = jiffies; + unsigned long sched_next = now + (net_random() % p->proxy_delay); + + if (tbl->proxy_queue.qlen > p->proxy_qlen) { + kfree_skb(skb); + return; + } + + NEIGH_CB(skb)->sched_next = sched_next; + NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED; + + spin_lock(&tbl->proxy_queue.lock); + if (del_timer(&tbl->proxy_timer)) { + if (time_before(tbl->proxy_timer.expires, sched_next)) + sched_next = tbl->proxy_timer.expires; + } + skb_dst_drop(skb); + dev_hold(skb->dev); + __skb_queue_tail(&tbl->proxy_queue, skb); + mod_timer(&tbl->proxy_timer, sched_next); + spin_unlock(&tbl->proxy_queue.lock); +} +EXPORT_SYMBOL(pneigh_enqueue); + +static inline 
struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl, + struct net *net, int ifindex) +{ + struct neigh_parms *p; + + for (p = &tbl->parms; p; p = p->next) { + if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) || + (!p->dev && !ifindex)) + return p; + } + + return NULL; +} + +struct neigh_parms *neigh_parms_alloc(struct net_device *dev, + struct neigh_table *tbl) +{ + struct neigh_parms *p, *ref; + struct net *net = dev_net(dev); + const struct net_device_ops *ops = dev->netdev_ops; + + ref = lookup_neigh_parms(tbl, net, 0); + if (!ref) + return NULL; + + p = kmemdup(ref, sizeof(*p), GFP_KERNEL); + if (p) { + p->tbl = tbl; + atomic_set(&p->refcnt, 1); + p->reachable_time = + neigh_rand_reach_time(p->base_reachable_time); + + if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) { + kfree(p); + return NULL; + } + + dev_hold(dev); + p->dev = dev; + write_pnet(&p->net, hold_net(net)); + p->sysctl_table = NULL; + write_lock_bh(&tbl->lock); + p->next = tbl->parms.next; + tbl->parms.next = p; + write_unlock_bh(&tbl->lock); + } + return p; +} +EXPORT_SYMBOL(neigh_parms_alloc); + +static void neigh_rcu_free_parms(struct rcu_head *head) +{ + struct neigh_parms *parms = + container_of(head, struct neigh_parms, rcu_head); + + neigh_parms_put(parms); +} + +void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) +{ + struct neigh_parms **p; + + if (!parms || parms == &tbl->parms) + return; + write_lock_bh(&tbl->lock); + for (p = &tbl->parms.next; *p; p = &(*p)->next) { + if (*p == parms) { + *p = parms->next; + parms->dead = 1; + write_unlock_bh(&tbl->lock); + if (parms->dev) + dev_put(parms->dev); + call_rcu(&parms->rcu_head, neigh_rcu_free_parms); + return; + } + } + write_unlock_bh(&tbl->lock); + NEIGH_PRINTK1("neigh_parms_release: not found\n"); +} +EXPORT_SYMBOL(neigh_parms_release); + +static void neigh_parms_destroy(struct neigh_parms *parms) +{ + release_net(neigh_parms_net(parms)); + kfree(parms); +} + +static struct lock_class_key neigh_table_proxy_queue_class; + +void neigh_table_init_no_netlink(struct neigh_table *tbl) +{ + unsigned long now = jiffies; + unsigned long phsize; + + write_pnet(&tbl->parms.net, &init_net); + atomic_set(&tbl->parms.refcnt, 1); + tbl->parms.reachable_time = + neigh_rand_reach_time(tbl->parms.base_reachable_time); + + if (!tbl->kmem_cachep) + tbl->kmem_cachep = + kmem_cache_create(tbl->id, tbl->entry_size, 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + NULL); + tbl->stats = alloc_percpu(struct neigh_statistics); + if (!tbl->stats) + panic("cannot create neighbour cache statistics"); + +#ifdef CONFIG_PROC_FS + if (!proc_create_data(tbl->id, 0, init_net.proc_net_stat, + &neigh_stat_seq_fops, tbl)) + panic("cannot create neighbour proc dir entry"); +#endif + + RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(8)); + + phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *); + tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL); + + if (!tbl->nht || !tbl->phash_buckets) + panic("cannot allocate neighbour cache hashes"); + + rwlock_init(&tbl->lock); + INIT_DELAYED_WORK_DEFERRABLE(&tbl->gc_work, neigh_periodic_work); + schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time); + setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl); + skb_queue_head_init_class(&tbl->proxy_queue, + &neigh_table_proxy_queue_class); + + tbl->last_flush = now; + tbl->last_rand = now + tbl->parms.reachable_time * 20; +} +EXPORT_SYMBOL(neigh_table_init_no_netlink); + +void neigh_table_init(struct neigh_table *tbl) +{ 
+ struct neigh_table *tmp; + + neigh_table_init_no_netlink(tbl); + write_lock(&neigh_tbl_lock); + for (tmp = neigh_tables; tmp; tmp = tmp->next) { + if (tmp->family == tbl->family) + break; + } + tbl->next = neigh_tables; + neigh_tables = tbl; + write_unlock(&neigh_tbl_lock); + + if (unlikely(tmp)) { + printk(KERN_ERR "NEIGH: Registering multiple tables for " + "family %d\n", tbl->family); + dump_stack(); + } +} +EXPORT_SYMBOL(neigh_table_init); + +int neigh_table_clear(struct neigh_table *tbl) +{ + struct neigh_table **tp; + + /* It is not clean... Fix it to unload IPv6 module safely */ + cancel_delayed_work_sync(&tbl->gc_work); + del_timer_sync(&tbl->proxy_timer); + pneigh_queue_purge(&tbl->proxy_queue); + neigh_ifdown(tbl, NULL); + if (atomic_read(&tbl->entries)) + printk(KERN_CRIT "neighbour leakage\n"); + write_lock(&neigh_tbl_lock); + for (tp = &neigh_tables; *tp; tp = &(*tp)->next) { + if (*tp == tbl) { + *tp = tbl->next; + break; + } + } + write_unlock(&neigh_tbl_lock); + + call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu, + neigh_hash_free_rcu); + tbl->nht = NULL; + + kfree(tbl->phash_buckets); + tbl->phash_buckets = NULL; + + remove_proc_entry(tbl->id, init_net.proc_net_stat); + + free_percpu(tbl->stats); + tbl->stats = NULL; + + kmem_cache_destroy(tbl->kmem_cachep); + tbl->kmem_cachep = NULL; + + return 0; +} +EXPORT_SYMBOL(neigh_table_clear); + +static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct ndmsg *ndm; + struct nlattr *dst_attr; + struct neigh_table *tbl; + struct net_device *dev = NULL; + int err = -EINVAL; + + ASSERT_RTNL(); + if (nlmsg_len(nlh) < sizeof(*ndm)) + goto out; + + dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST); + if (dst_attr == NULL) + goto out; + + ndm = nlmsg_data(nlh); + if (ndm->ndm_ifindex) { + dev = __dev_get_by_index(net, ndm->ndm_ifindex); + if (dev == NULL) { + err = -ENODEV; + goto out; + } + } + + read_lock(&neigh_tbl_lock); + for (tbl = neigh_tables; tbl; tbl = tbl->next) { + struct neighbour *neigh; + + if (tbl->family != ndm->ndm_family) + continue; + read_unlock(&neigh_tbl_lock); + + if (nla_len(dst_attr) < tbl->key_len) + goto out; + + if (ndm->ndm_flags & NTF_PROXY) { + err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); + goto out; + } + + if (dev == NULL) + goto out; + + neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); + if (neigh == NULL) { + err = -ENOENT; + goto out; + } + + err = neigh_update(neigh, NULL, NUD_FAILED, + NEIGH_UPDATE_F_OVERRIDE | + NEIGH_UPDATE_F_ADMIN); + neigh_release(neigh); + goto out; + } + read_unlock(&neigh_tbl_lock); + err = -EAFNOSUPPORT; + +out: + return err; +} + +static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct ndmsg *ndm; + struct nlattr *tb[NDA_MAX+1]; + struct neigh_table *tbl; + struct net_device *dev = NULL; + int err; + + ASSERT_RTNL(); + err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); + if (err < 0) + goto out; + + err = -EINVAL; + if (tb[NDA_DST] == NULL) + goto out; + + ndm = nlmsg_data(nlh); + if (ndm->ndm_ifindex) { + dev = __dev_get_by_index(net, ndm->ndm_ifindex); + if (dev == NULL) { + err = -ENODEV; + goto out; + } + + if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) + goto out; + } + + read_lock(&neigh_tbl_lock); + for (tbl = neigh_tables; tbl; tbl = tbl->next) { + int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE; + struct neighbour *neigh; + void *dst, *lladdr; + + if (tbl->family != 
ndm->ndm_family) + continue; + read_unlock(&neigh_tbl_lock); + + if (nla_len(tb[NDA_DST]) < tbl->key_len) + goto out; + dst = nla_data(tb[NDA_DST]); + lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; + + if (ndm->ndm_flags & NTF_PROXY) { + struct pneigh_entry *pn; + + err = -ENOBUFS; + pn = pneigh_lookup(tbl, net, dst, dev, 1); + if (pn) { + pn->flags = ndm->ndm_flags; + err = 0; + } + goto out; + } + + if (dev == NULL) + goto out; + + neigh = neigh_lookup(tbl, dst, dev); + if (neigh == NULL) { + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + err = -ENOENT; + goto out; + } + + neigh = __neigh_lookup_errno(tbl, dst, dev); + if (IS_ERR(neigh)) { + err = PTR_ERR(neigh); + goto out; + } + } else { + if (nlh->nlmsg_flags & NLM_F_EXCL) { + err = -EEXIST; + neigh_release(neigh); + goto out; + } + + if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) + flags &= ~NEIGH_UPDATE_F_OVERRIDE; + } + + if (ndm->ndm_flags & NTF_USE) { + neigh_event_send(neigh, NULL); + err = 0; + } else + err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); + neigh_release(neigh); + goto out; + } + + read_unlock(&neigh_tbl_lock); + err = -EAFNOSUPPORT; +out: + return err; +} + +static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, NDTA_PARMS); + if (nest == NULL) + return -ENOBUFS; + + if (parms->dev) + NLA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex); + + NLA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt)); + NLA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len); + NLA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen); + NLA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes); + NLA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes); + NLA_PUT_U32(skb, NDTPA_MCAST_PROBES, parms->mcast_probes); + NLA_PUT_MSECS(skb, NDTPA_REACHABLE_TIME, parms->reachable_time); + NLA_PUT_MSECS(skb, NDTPA_BASE_REACHABLE_TIME, + parms->base_reachable_time); + NLA_PUT_MSECS(skb, NDTPA_GC_STALETIME, parms->gc_staletime); + NLA_PUT_MSECS(skb, NDTPA_DELAY_PROBE_TIME, parms->delay_probe_time); + NLA_PUT_MSECS(skb, NDTPA_RETRANS_TIME, parms->retrans_time); + NLA_PUT_MSECS(skb, NDTPA_ANYCAST_DELAY, parms->anycast_delay); + NLA_PUT_MSECS(skb, NDTPA_PROXY_DELAY, parms->proxy_delay); + NLA_PUT_MSECS(skb, NDTPA_LOCKTIME, parms->locktime); + + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, + u32 pid, u32 seq, int type, int flags) +{ + struct nlmsghdr *nlh; + struct ndtmsg *ndtmsg; + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags); + if (nlh == NULL) + return -EMSGSIZE; + + ndtmsg = nlmsg_data(nlh); + + read_lock_bh(&tbl->lock); + ndtmsg->ndtm_family = tbl->family; + ndtmsg->ndtm_pad1 = 0; + ndtmsg->ndtm_pad2 = 0; + + NLA_PUT_STRING(skb, NDTA_NAME, tbl->id); + NLA_PUT_MSECS(skb, NDTA_GC_INTERVAL, tbl->gc_interval); + NLA_PUT_U32(skb, NDTA_THRESH1, tbl->gc_thresh1); + NLA_PUT_U32(skb, NDTA_THRESH2, tbl->gc_thresh2); + NLA_PUT_U32(skb, NDTA_THRESH3, tbl->gc_thresh3); + + { + unsigned long now = jiffies; + unsigned int flush_delta = now - tbl->last_flush; + unsigned int rand_delta = now - tbl->last_rand; + struct neigh_hash_table *nht; + struct ndt_config ndc = { + .ndtc_key_len = tbl->key_len, + .ndtc_entry_size = tbl->entry_size, + .ndtc_entries = atomic_read(&tbl->entries), + .ndtc_last_flush = jiffies_to_msecs(flush_delta), + .ndtc_last_rand = jiffies_to_msecs(rand_delta), + .ndtc_proxy_qlen = 
tbl->proxy_queue.qlen, + }; + + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + ndc.ndtc_hash_rnd = nht->hash_rnd; + ndc.ndtc_hash_mask = nht->hash_mask; + rcu_read_unlock_bh(); + + NLA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc); + } + + { + int cpu; + struct ndt_stats ndst; + + memset(&ndst, 0, sizeof(ndst)); + + for_each_possible_cpu(cpu) { + struct neigh_statistics *st; + + st = per_cpu_ptr(tbl->stats, cpu); + ndst.ndts_allocs += st->allocs; + ndst.ndts_destroys += st->destroys; + ndst.ndts_hash_grows += st->hash_grows; + ndst.ndts_res_failed += st->res_failed; + ndst.ndts_lookups += st->lookups; + ndst.ndts_hits += st->hits; + ndst.ndts_rcv_probes_mcast += st->rcv_probes_mcast; + ndst.ndts_rcv_probes_ucast += st->rcv_probes_ucast; + ndst.ndts_periodic_gc_runs += st->periodic_gc_runs; + ndst.ndts_forced_gc_runs += st->forced_gc_runs; + } + + NLA_PUT(skb, NDTA_STATS, sizeof(ndst), &ndst); + } + + BUG_ON(tbl->parms.dev); + if (neightbl_fill_parms(skb, &tbl->parms) < 0) + goto nla_put_failure; + + read_unlock_bh(&tbl->lock); + return nlmsg_end(skb, nlh); + +nla_put_failure: + read_unlock_bh(&tbl->lock); + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int neightbl_fill_param_info(struct sk_buff *skb, + struct neigh_table *tbl, + struct neigh_parms *parms, + u32 pid, u32 seq, int type, + unsigned int flags) +{ + struct ndtmsg *ndtmsg; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags); + if (nlh == NULL) + return -EMSGSIZE; + + ndtmsg = nlmsg_data(nlh); + + read_lock_bh(&tbl->lock); + ndtmsg->ndtm_family = tbl->family; + ndtmsg->ndtm_pad1 = 0; + ndtmsg->ndtm_pad2 = 0; + + if (nla_put_string(skb, NDTA_NAME, tbl->id) < 0 || + neightbl_fill_parms(skb, parms) < 0) + goto errout; + + read_unlock_bh(&tbl->lock); + return nlmsg_end(skb, nlh); +errout: + read_unlock_bh(&tbl->lock); + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = { + [NDTA_NAME] = { .type = NLA_STRING }, + [NDTA_THRESH1] = { .type = NLA_U32 }, + [NDTA_THRESH2] = { .type = NLA_U32 }, + [NDTA_THRESH3] = { .type = NLA_U32 }, + [NDTA_GC_INTERVAL] = { .type = NLA_U64 }, + [NDTA_PARMS] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { + [NDTPA_IFINDEX] = { .type = NLA_U32 }, + [NDTPA_QUEUE_LEN] = { .type = NLA_U32 }, + [NDTPA_PROXY_QLEN] = { .type = NLA_U32 }, + [NDTPA_APP_PROBES] = { .type = NLA_U32 }, + [NDTPA_UCAST_PROBES] = { .type = NLA_U32 }, + [NDTPA_MCAST_PROBES] = { .type = NLA_U32 }, + [NDTPA_BASE_REACHABLE_TIME] = { .type = NLA_U64 }, + [NDTPA_GC_STALETIME] = { .type = NLA_U64 }, + [NDTPA_DELAY_PROBE_TIME] = { .type = NLA_U64 }, + [NDTPA_RETRANS_TIME] = { .type = NLA_U64 }, + [NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 }, + [NDTPA_PROXY_DELAY] = { .type = NLA_U64 }, + [NDTPA_LOCKTIME] = { .type = NLA_U64 }, +}; + +static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct neigh_table *tbl; + struct ndtmsg *ndtmsg; + struct nlattr *tb[NDTA_MAX+1]; + int err; + + err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX, + nl_neightbl_policy); + if (err < 0) + goto errout; + + if (tb[NDTA_NAME] == NULL) { + err = -EINVAL; + goto errout; + } + + ndtmsg = nlmsg_data(nlh); + read_lock(&neigh_tbl_lock); + for (tbl = neigh_tables; tbl; tbl = tbl->next) { + if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) + continue; + + if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) + break; + } + + if 
(tbl == NULL) { + err = -ENOENT; + goto errout_locked; + } + + /* + * We acquire tbl->lock to be nice to the periodic timers and + * make sure they always see a consistent set of values. + */ + write_lock_bh(&tbl->lock); + + if (tb[NDTA_PARMS]) { + struct nlattr *tbp[NDTPA_MAX+1]; + struct neigh_parms *p; + int i, ifindex = 0; + + err = nla_parse_nested(tbp, NDTPA_MAX, tb[NDTA_PARMS], + nl_ntbl_parm_policy); + if (err < 0) + goto errout_tbl_lock; + + if (tbp[NDTPA_IFINDEX]) + ifindex = nla_get_u32(tbp[NDTPA_IFINDEX]); + + p = lookup_neigh_parms(tbl, net, ifindex); + if (p == NULL) { + err = -ENOENT; + goto errout_tbl_lock; + } + + for (i = 1; i <= NDTPA_MAX; i++) { + if (tbp[i] == NULL) + continue; + + switch (i) { + case NDTPA_QUEUE_LEN: + p->queue_len = nla_get_u32(tbp[i]); + break; + case NDTPA_PROXY_QLEN: + p->proxy_qlen = nla_get_u32(tbp[i]); + break; + case NDTPA_APP_PROBES: + p->app_probes = nla_get_u32(tbp[i]); + break; + case NDTPA_UCAST_PROBES: + p->ucast_probes = nla_get_u32(tbp[i]); + break; + case NDTPA_MCAST_PROBES: + p->mcast_probes = nla_get_u32(tbp[i]); + break; + case NDTPA_BASE_REACHABLE_TIME: + p->base_reachable_time = nla_get_msecs(tbp[i]); + break; + case NDTPA_GC_STALETIME: + p->gc_staletime = nla_get_msecs(tbp[i]); + break; + case NDTPA_DELAY_PROBE_TIME: + p->delay_probe_time = nla_get_msecs(tbp[i]); + break; + case NDTPA_RETRANS_TIME: + p->retrans_time = nla_get_msecs(tbp[i]); + break; + case NDTPA_ANYCAST_DELAY: + p->anycast_delay = nla_get_msecs(tbp[i]); + break; + case NDTPA_PROXY_DELAY: + p->proxy_delay = nla_get_msecs(tbp[i]); + break; + case NDTPA_LOCKTIME: + p->locktime = nla_get_msecs(tbp[i]); + break; + } + } + } + + if (tb[NDTA_THRESH1]) + tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]); + + if (tb[NDTA_THRESH2]) + tbl->gc_thresh2 = nla_get_u32(tb[NDTA_THRESH2]); + + if (tb[NDTA_THRESH3]) + tbl->gc_thresh3 = nla_get_u32(tb[NDTA_THRESH3]); + + if (tb[NDTA_GC_INTERVAL]) + tbl->gc_interval = nla_get_msecs(tb[NDTA_GC_INTERVAL]); + + err = 0; + +errout_tbl_lock: + write_unlock_bh(&tbl->lock); +errout_locked: + read_unlock(&neigh_tbl_lock); +errout: + return err; +} + +static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + int family, tidx, nidx = 0; + int tbl_skip = cb->args[0]; + int neigh_skip = cb->args[1]; + struct neigh_table *tbl; + + family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; + + read_lock(&neigh_tbl_lock); + for (tbl = neigh_tables, tidx = 0; tbl; tbl = tbl->next, tidx++) { + struct neigh_parms *p; + + if (tidx < tbl_skip || (family && tbl->family != family)) + continue; + + if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL, + NLM_F_MULTI) <= 0) + break; + + for (nidx = 0, p = tbl->parms.next; p; p = p->next) { + if (!net_eq(neigh_parms_net(p), net)) + continue; + + if (nidx < neigh_skip) + goto next; + + if (neightbl_fill_param_info(skb, tbl, p, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGHTBL, + NLM_F_MULTI) <= 0) + goto out; + next: + nidx++; + } + + neigh_skip = 0; + } +out: + read_unlock(&neigh_tbl_lock); + cb->args[0] = tidx; + cb->args[1] = nidx; + + return skb->len; +} + +static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, + u32 pid, u32 seq, int type, unsigned int flags) +{ + unsigned long now = jiffies; + struct nda_cacheinfo ci; + struct nlmsghdr *nlh; + struct ndmsg *ndm; + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); + if (nlh == NULL) + return 
-EMSGSIZE; + + ndm = nlmsg_data(nlh); + ndm->ndm_family = neigh->ops->family; + ndm->ndm_pad1 = 0; + ndm->ndm_pad2 = 0; + ndm->ndm_flags = neigh->flags; + ndm->ndm_type = neigh->type; + ndm->ndm_ifindex = neigh->dev->ifindex; + + NLA_PUT(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key); + + read_lock_bh(&neigh->lock); + ndm->ndm_state = neigh->nud_state; + if (neigh->nud_state & NUD_VALID) { + char haddr[MAX_ADDR_LEN]; + + neigh_ha_snapshot(haddr, neigh, neigh->dev); + if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) { + read_unlock_bh(&neigh->lock); + goto nla_put_failure; + } + } + + ci.ndm_used = jiffies_to_clock_t(now - neigh->used); + ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed); + ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated); + ci.ndm_refcnt = atomic_read(&neigh->refcnt) - 1; + read_unlock_bh(&neigh->lock); + + NLA_PUT_U32(skb, NDA_PROBES, atomic_read(&neigh->probes)); + NLA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static void neigh_update_notify(struct neighbour *neigh) +{ + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); + __neigh_notify(neigh, RTM_NEWNEIGH, 0); +} + +static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct neighbour *n; + int rc, h, s_h = cb->args[1]; + int idx, s_idx = idx = cb->args[2]; + struct neigh_hash_table *nht; + + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + + for (h = 0; h <= nht->hash_mask; h++) { + if (h < s_h) + continue; + if (h > s_h) + s_idx = 0; + for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0; + n != NULL; + n = rcu_dereference_bh(n->next)) { + if (!net_eq(dev_net(n->dev), net)) + continue; + if (idx < s_idx) + goto next; + if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, + NLM_F_MULTI) <= 0) { + rc = -1; + goto out; + } +next: + idx++; + } + } + rc = skb->len; +out: + rcu_read_unlock_bh(); + cb->args[1] = h; + cb->args[2] = idx; + return rc; +} + +static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct neigh_table *tbl; + int t, family, s_t; + + read_lock(&neigh_tbl_lock); + family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; + s_t = cb->args[0]; + + for (tbl = neigh_tables, t = 0; tbl; tbl = tbl->next, t++) { + if (t < s_t || (family && tbl->family != family)) + continue; + if (t > s_t) + memset(&cb->args[1], 0, sizeof(cb->args) - + sizeof(cb->args[0])); + if (neigh_dump_table(tbl, skb, cb) < 0) + break; + } + read_unlock(&neigh_tbl_lock); + + cb->args[0] = t; + return skb->len; +} + +void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) +{ + int chain; + struct neigh_hash_table *nht; + + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + + read_lock(&tbl->lock); /* avoid resizes */ + for (chain = 0; chain <= nht->hash_mask; chain++) { + struct neighbour *n; + + for (n = rcu_dereference_bh(nht->hash_buckets[chain]); + n != NULL; + n = rcu_dereference_bh(n->next)) + cb(n, cookie); + } + read_unlock(&tbl->lock); + rcu_read_unlock_bh(); +} +EXPORT_SYMBOL(neigh_for_each); + +/* The tbl->lock must be held as a writer and BH disabled. 
*/ +void __neigh_for_each_release(struct neigh_table *tbl, + int (*cb)(struct neighbour *)) +{ + int chain; + struct neigh_hash_table *nht; + + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + for (chain = 0; chain <= nht->hash_mask; chain++) { + struct neighbour *n; + struct neighbour __rcu **np; + + np = &nht->hash_buckets[chain]; + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { + int release; + + write_lock(&n->lock); + release = cb(n); + if (release) { + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); + n->dead = 1; + } else + np = &n->next; + write_unlock(&n->lock); + if (release) + neigh_cleanup_and_release(n); + } + } +} +EXPORT_SYMBOL(__neigh_for_each_release); + +#ifdef CONFIG_PROC_FS + +static struct neighbour *neigh_get_first(struct seq_file *seq) +{ + struct neigh_seq_state *state = seq->private; + struct net *net = seq_file_net(seq); + struct neigh_hash_table *nht = state->nht; + struct neighbour *n = NULL; + int bucket = state->bucket; + + state->flags &= ~NEIGH_SEQ_IS_PNEIGH; + for (bucket = 0; bucket <= nht->hash_mask; bucket++) { + n = rcu_dereference_bh(nht->hash_buckets[bucket]); + + while (n) { + if (!net_eq(dev_net(n->dev), net)) + goto next; + if (state->neigh_sub_iter) { + loff_t fakep = 0; + void *v; + + v = state->neigh_sub_iter(state, n, &fakep); + if (!v) + goto next; + } + if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) + break; + if (n->nud_state & ~NUD_NOARP) + break; +next: + n = rcu_dereference_bh(n->next); + } + + if (n) + break; + } + state->bucket = bucket; + + return n; +} + +static struct neighbour *neigh_get_next(struct seq_file *seq, + struct neighbour *n, + loff_t *pos) +{ + struct neigh_seq_state *state = seq->private; + struct net *net = seq_file_net(seq); + struct neigh_hash_table *nht = state->nht; + + if (state->neigh_sub_iter) { + void *v = state->neigh_sub_iter(state, n, pos); + if (v) + return n; + } + n = rcu_dereference_bh(n->next); + + while (1) { + while (n) { + if (!net_eq(dev_net(n->dev), net)) + goto next; + if (state->neigh_sub_iter) { + void *v = state->neigh_sub_iter(state, n, pos); + if (v) + return n; + goto next; + } + if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) + break; + + if (n->nud_state & ~NUD_NOARP) + break; +next: + n = rcu_dereference_bh(n->next); + } + + if (n) + break; + + if (++state->bucket > nht->hash_mask) + break; + + n = rcu_dereference_bh(nht->hash_buckets[state->bucket]); + } + + if (n && pos) + --(*pos); + return n; +} + +static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos) +{ + struct neighbour *n = neigh_get_first(seq); + + if (n) { + --(*pos); + while (*pos) { + n = neigh_get_next(seq, n, pos); + if (!n) + break; + } + } + return *pos ? 
NULL : n; +} + +static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) +{ + struct neigh_seq_state *state = seq->private; + struct net *net = seq_file_net(seq); + struct neigh_table *tbl = state->tbl; + struct pneigh_entry *pn = NULL; + int bucket = state->bucket; + + state->flags |= NEIGH_SEQ_IS_PNEIGH; + for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { + pn = tbl->phash_buckets[bucket]; + while (pn && !net_eq(pneigh_net(pn), net)) + pn = pn->next; + if (pn) + break; + } + state->bucket = bucket; + + return pn; +} + +static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, + struct pneigh_entry *pn, + loff_t *pos) +{ + struct neigh_seq_state *state = seq->private; + struct net *net = seq_file_net(seq); + struct neigh_table *tbl = state->tbl; + + pn = pn->next; + while (!pn) { + if (++state->bucket > PNEIGH_HASHMASK) + break; + pn = tbl->phash_buckets[state->bucket]; + while (pn && !net_eq(pneigh_net(pn), net)) + pn = pn->next; + if (pn) + break; + } + + if (pn && pos) + --(*pos); + + return pn; +} + +static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos) +{ + struct pneigh_entry *pn = pneigh_get_first(seq); + + if (pn) { + --(*pos); + while (*pos) { + pn = pneigh_get_next(seq, pn, pos); + if (!pn) + break; + } + } + return *pos ? NULL : pn; +} + +static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) +{ + struct neigh_seq_state *state = seq->private; + void *rc; + loff_t idxpos = *pos; + + rc = neigh_get_idx(seq, &idxpos); + if (!rc && !(state->flags & NEIGH_SEQ_NEIGH_ONLY)) + rc = pneigh_get_idx(seq, &idxpos); + + return rc; +} + +void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) + __acquires(rcu_bh) +{ + struct neigh_seq_state *state = seq->private; + + state->tbl = tbl; + state->bucket = 0; + state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); + + rcu_read_lock_bh(); + state->nht = rcu_dereference_bh(tbl->nht); + + return *pos ? 
neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; +} +EXPORT_SYMBOL(neigh_seq_start); + +void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct neigh_seq_state *state; + void *rc; + + if (v == SEQ_START_TOKEN) { + rc = neigh_get_first(seq); + goto out; + } + + state = seq->private; + if (!(state->flags & NEIGH_SEQ_IS_PNEIGH)) { + rc = neigh_get_next(seq, v, NULL); + if (rc) + goto out; + if (!(state->flags & NEIGH_SEQ_NEIGH_ONLY)) + rc = pneigh_get_first(seq); + } else { + BUG_ON(state->flags & NEIGH_SEQ_NEIGH_ONLY); + rc = pneigh_get_next(seq, v, NULL); + } +out: + ++(*pos); + return rc; +} +EXPORT_SYMBOL(neigh_seq_next); + +void neigh_seq_stop(struct seq_file *seq, void *v) + __releases(rcu_bh) +{ + rcu_read_unlock_bh(); +} +EXPORT_SYMBOL(neigh_seq_stop); + +/* statistics via seq_file */ + +static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct neigh_table *tbl = seq->private; + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return per_cpu_ptr(tbl->stats, cpu); + } + return NULL; +} + +static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct neigh_table *tbl = seq->private; + int cpu; + + for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return per_cpu_ptr(tbl->stats, cpu); + } + return NULL; +} + +static void neigh_stat_seq_stop(struct seq_file *seq, void *v) +{ + +} + +static int neigh_stat_seq_show(struct seq_file *seq, void *v) +{ + struct neigh_table *tbl = seq->private; + struct neigh_statistics *st = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards\n"); + return 0; + } + + seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " + "%08lx %08lx %08lx %08lx %08lx\n", + atomic_read(&tbl->entries), + + st->allocs, + st->destroys, + st->hash_grows, + + st->lookups, + st->hits, + + st->res_failed, + + st->rcv_probes_mcast, + st->rcv_probes_ucast, + + st->periodic_gc_runs, + st->forced_gc_runs, + st->unres_discards + ); + + return 0; +} + +static const struct seq_operations neigh_stat_seq_ops = { + .start = neigh_stat_seq_start, + .next = neigh_stat_seq_next, + .stop = neigh_stat_seq_stop, + .show = neigh_stat_seq_show, +}; + +static int neigh_stat_seq_open(struct inode *inode, struct file *file) +{ + int ret = seq_open(file, &neigh_stat_seq_ops); + + if (!ret) { + struct seq_file *sf = file->private_data; + sf->private = PDE(inode)->data; + } + return ret; +}; + +static const struct file_operations neigh_stat_seq_fops = { + .owner = THIS_MODULE, + .open = neigh_stat_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif /* CONFIG_PROC_FS */ + +static inline size_t neigh_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ndmsg)) + + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ + + nla_total_size(sizeof(struct nda_cacheinfo)) + + nla_total_size(4); /* NDA_PROBES */ +} + +static void __neigh_notify(struct neighbour *n, int type, int flags) +{ + struct net *net = dev_net(n->dev); + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC); + if (skb == NULL) + goto errout; + + err = neigh_fill_info(skb, n, 0, 0, type, flags); + if (err < 0) { + /* -EMSGSIZE implies BUG in 
neigh_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); +} + +#ifdef CONFIG_ARPD +void neigh_app_ns(struct neighbour *n) +{ + __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST); +} +EXPORT_SYMBOL(neigh_app_ns); +#endif /* CONFIG_ARPD */ + +#ifdef CONFIG_SYSCTL + +#define NEIGH_VARS_MAX 19 + +static struct neigh_sysctl_table { + struct ctl_table_header *sysctl_header; + struct ctl_table neigh_vars[NEIGH_VARS_MAX]; + char *dev_name; +} neigh_sysctl_template __read_mostly = { + .neigh_vars = { + { + .procname = "mcast_solicit", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "ucast_solicit", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "app_solicit", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "retrans_time", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_userhz_jiffies, + }, + { + .procname = "base_reachable_time", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "delay_first_probe_time", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "gc_stale_time", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "unres_qlen", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "proxy_qlen", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "anycast_delay", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_userhz_jiffies, + }, + { + .procname = "proxy_delay", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_userhz_jiffies, + }, + { + .procname = "locktime", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_userhz_jiffies, + }, + { + .procname = "retrans_time_ms", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { + .procname = "base_reachable_time_ms", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { + .procname = "gc_interval", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "gc_thresh1", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "gc_thresh2", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "gc_thresh3", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {}, + }, +}; + +int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, + char *p_name, proc_handler *handler) +{ + struct neigh_sysctl_table *t; + const char *dev_name_source = NULL; + +#define NEIGH_CTL_PATH_ROOT 0 +#define NEIGH_CTL_PATH_PROTO 1 +#define NEIGH_CTL_PATH_NEIGH 2 +#define NEIGH_CTL_PATH_DEV 3 + + struct ctl_path neigh_path[] = { + { .procname = "net", }, + { .procname = "proto", }, + { .procname = "neigh", }, + { .procname = "default", }, + { }, + }; + + t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL); + if (!t) + goto err; + + t->neigh_vars[0].data = &p->mcast_probes; + t->neigh_vars[1].data = &p->ucast_probes; + t->neigh_vars[2].data = &p->app_probes; + 
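+	/*
+	 * The data pointers here are wired up purely by position: index N in
+	 * neigh_vars[] above must stay in step with assignment N below
+	 * (0 = mcast_solicit, 1 = ucast_solicit, 2 = app_solicit,
+	 *  3 = retrans_time, ..., 12/13 = the *_ms variants).  Entries 14-17
+	 * (gc_interval, gc_thresh[1-3]) are only kept for the per-protocol
+	 * "default" table; for a per-device table the array is terminated
+	 * early just below.
+	 */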
t->neigh_vars[3].data = &p->retrans_time; + t->neigh_vars[4].data = &p->base_reachable_time; + t->neigh_vars[5].data = &p->delay_probe_time; + t->neigh_vars[6].data = &p->gc_staletime; + t->neigh_vars[7].data = &p->queue_len; + t->neigh_vars[8].data = &p->proxy_qlen; + t->neigh_vars[9].data = &p->anycast_delay; + t->neigh_vars[10].data = &p->proxy_delay; + t->neigh_vars[11].data = &p->locktime; + t->neigh_vars[12].data = &p->retrans_time; + t->neigh_vars[13].data = &p->base_reachable_time; + + if (dev) { + dev_name_source = dev->name; + /* Terminate the table early */ + memset(&t->neigh_vars[14], 0, sizeof(t->neigh_vars[14])); + } else { + dev_name_source = neigh_path[NEIGH_CTL_PATH_DEV].procname; + t->neigh_vars[14].data = (int *)(p + 1); + t->neigh_vars[15].data = (int *)(p + 1) + 1; + t->neigh_vars[16].data = (int *)(p + 1) + 2; + t->neigh_vars[17].data = (int *)(p + 1) + 3; + } + + + if (handler) { + /* RetransTime */ + t->neigh_vars[3].proc_handler = handler; + t->neigh_vars[3].extra1 = dev; + /* ReachableTime */ + t->neigh_vars[4].proc_handler = handler; + t->neigh_vars[4].extra1 = dev; + /* RetransTime (in milliseconds)*/ + t->neigh_vars[12].proc_handler = handler; + t->neigh_vars[12].extra1 = dev; + /* ReachableTime (in milliseconds) */ + t->neigh_vars[13].proc_handler = handler; + t->neigh_vars[13].extra1 = dev; + } + + t->dev_name = kstrdup(dev_name_source, GFP_KERNEL); + if (!t->dev_name) + goto free; + + neigh_path[NEIGH_CTL_PATH_DEV].procname = t->dev_name; + neigh_path[NEIGH_CTL_PATH_PROTO].procname = p_name; + + t->sysctl_header = + register_net_sysctl_table(neigh_parms_net(p), neigh_path, t->neigh_vars); + if (!t->sysctl_header) + goto free_procname; + + p->sysctl_table = t; + return 0; + +free_procname: + kfree(t->dev_name); +free: + kfree(t); +err: + return -ENOBUFS; +} +EXPORT_SYMBOL(neigh_sysctl_register); + +void neigh_sysctl_unregister(struct neigh_parms *p) +{ + if (p->sysctl_table) { + struct neigh_sysctl_table *t = p->sysctl_table; + p->sysctl_table = NULL; + unregister_sysctl_table(t->sysctl_header); + kfree(t->dev_name); + kfree(t); + } +} +EXPORT_SYMBOL(neigh_sysctl_unregister); + +#endif /* CONFIG_SYSCTL */ + +static int __init neigh_init(void) +{ + rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL); + rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL); + rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info); + + rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info); + rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL); + + return 0; +} + +subsys_initcall(neigh_init); + diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c new file mode 100644 index 00000000..33d2a1fb --- /dev/null +++ b/net/core/net-sysfs.c @@ -0,0 +1,1339 @@ +/* + * net-sysfs.c - network device class and attributes + * + * Copyright (c) 2003 Stephen Hemminger + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "net-sysfs.h" + +#ifdef CONFIG_SYSFS +static const char fmt_hex[] = "%#x\n"; +static const char fmt_long_hex[] = "%#lx\n"; +static const char fmt_dec[] = "%d\n"; +static const char fmt_udec[] = "%u\n"; +static const char fmt_ulong[] = "%lu\n"; +static const char fmt_u64[] = "%llu\n"; + +static inline int dev_isalive(const struct net_device *dev) +{ + return dev->reg_state <= NETREG_REGISTERED; +} + +/* use same locking rules as GIF* ioctl's */ +static ssize_t netdev_show(const struct device *dev, + struct device_attribute *attr, char *buf, + ssize_t (*format)(const struct net_device *, char *)) +{ + struct net_device *net = to_net_dev(dev); + ssize_t ret = -EINVAL; + + read_lock(&dev_base_lock); + if (dev_isalive(net)) + ret = (*format)(net, buf); + read_unlock(&dev_base_lock); + + return ret; +} + +/* generate a show function for simple field */ +#define NETDEVICE_SHOW(field, format_string) \ +static ssize_t format_##field(const struct net_device *net, char *buf) \ +{ \ + return sprintf(buf, format_string, net->field); \ +} \ +static ssize_t show_##field(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + return netdev_show(dev, attr, buf, format_##field); \ +} + + +/* use same locking and permission rules as SIF* ioctl's */ +static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len, + int (*set)(struct net_device *, unsigned long)) +{ + struct net_device *net = to_net_dev(dev); + char *endp; + unsigned long new; + int ret = -EINVAL; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + new = simple_strtoul(buf, &endp, 0); + if (endp == buf) + goto err; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(net)) { + if ((ret = (*set)(net, new)) == 0) + ret = len; + } + rtnl_unlock(); + err: + return ret; +} + +NETDEVICE_SHOW(dev_id, fmt_hex); +NETDEVICE_SHOW(addr_assign_type, fmt_dec); +NETDEVICE_SHOW(addr_len, fmt_dec); +NETDEVICE_SHOW(iflink, fmt_dec); +NETDEVICE_SHOW(ifindex, fmt_dec); +NETDEVICE_SHOW(features, fmt_hex); +NETDEVICE_SHOW(type, fmt_dec); +NETDEVICE_SHOW(link_mode, fmt_dec); + +/* use same locking rules as GIFHWADDR ioctl's */ +static ssize_t show_address(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct net_device *net = to_net_dev(dev); + ssize_t ret = -EINVAL; + + read_lock(&dev_base_lock); + if (dev_isalive(net)) + ret = sysfs_format_mac(buf, net->dev_addr, net->addr_len); + read_unlock(&dev_base_lock); + return ret; +} + +static ssize_t show_broadcast(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *net = to_net_dev(dev); + if (dev_isalive(net)) + return sysfs_format_mac(buf, net->broadcast, net->addr_len); + return -EINVAL; +} + +static ssize_t show_carrier(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + if (netif_running(netdev)) { + return sprintf(buf, fmt_dec, !!netif_carrier_ok(netdev)); + } + return -EINVAL; +} + +static ssize_t show_speed(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + int ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (netif_running(netdev)) { + struct ethtool_cmd cmd; + if (!dev_ethtool_get_settings(netdev, &cmd)) + ret = sprintf(buf, fmt_udec, ethtool_cmd_speed(&cmd)); + } + 
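+	/*
+	 * Whether or not the driver reported its settings, RTNL is dropped
+	 * below; a stopped interface or a driver without get_settings leaves
+	 * ret at -EINVAL, so reading the "speed" attribute simply fails.
+	 */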
rtnl_unlock(); + return ret; +} + +static ssize_t show_duplex(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + int ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (netif_running(netdev)) { + struct ethtool_cmd cmd; + if (!dev_ethtool_get_settings(netdev, &cmd)) + ret = sprintf(buf, "%s\n", + cmd.duplex ? "full" : "half"); + } + rtnl_unlock(); + return ret; +} + +static ssize_t show_dormant(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + if (netif_running(netdev)) + return sprintf(buf, fmt_dec, !!netif_dormant(netdev)); + + return -EINVAL; +} + +static const char *const operstates[] = { + "unknown", + "notpresent", /* currently unused */ + "down", + "lowerlayerdown", + "testing", /* currently unused */ + "dormant", + "up" +}; + +static ssize_t show_operstate(struct device *dev, + struct device_attribute *attr, char *buf) +{ + const struct net_device *netdev = to_net_dev(dev); + unsigned char operstate; + + read_lock(&dev_base_lock); + operstate = netdev->operstate; + if (!netif_running(netdev)) + operstate = IF_OPER_DOWN; + read_unlock(&dev_base_lock); + + if (operstate >= ARRAY_SIZE(operstates)) + return -EINVAL; /* should not happen */ + + return sprintf(buf, "%s\n", operstates[operstate]); +} + +/* read-write attributes */ +NETDEVICE_SHOW(mtu, fmt_dec); + +static int change_mtu(struct net_device *net, unsigned long new_mtu) +{ + return dev_set_mtu(net, (int) new_mtu); +} + +static ssize_t store_mtu(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_mtu); +} + +NETDEVICE_SHOW(flags, fmt_hex); + +static int change_flags(struct net_device *net, unsigned long new_flags) +{ + return dev_change_flags(net, (unsigned) new_flags); +} + +static ssize_t store_flags(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_flags); +} + +NETDEVICE_SHOW(tx_queue_len, fmt_ulong); + +static int change_tx_queue_len(struct net_device *net, unsigned long new_len) +{ + net->tx_queue_len = new_len; + return 0; +} + +static ssize_t store_tx_queue_len(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_tx_queue_len); +} + +static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct net_device *netdev = to_net_dev(dev); + size_t count = len; + ssize_t ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + /* ignore trailing newline */ + if (len > 0 && buf[len - 1] == '\n') + --count; + + if (!rtnl_trylock()) + return restart_syscall(); + ret = dev_set_alias(netdev, buf, count); + rtnl_unlock(); + + return ret < 0 ? 
ret : len; +} + +static ssize_t show_ifalias(struct device *dev, + struct device_attribute *attr, char *buf) +{ + const struct net_device *netdev = to_net_dev(dev); + ssize_t ret = 0; + + if (!rtnl_trylock()) + return restart_syscall(); + if (netdev->ifalias) + ret = sprintf(buf, "%s\n", netdev->ifalias); + rtnl_unlock(); + return ret; +} + +NETDEVICE_SHOW(group, fmt_dec); + +static int change_group(struct net_device *net, unsigned long new_group) +{ + dev_set_group(net, (int) new_group); + return 0; +} + +static ssize_t store_group(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_group); +} + +static struct device_attribute net_class_attributes[] = { + __ATTR(addr_assign_type, S_IRUGO, show_addr_assign_type, NULL), + __ATTR(addr_len, S_IRUGO, show_addr_len, NULL), + __ATTR(dev_id, S_IRUGO, show_dev_id, NULL), + __ATTR(ifalias, S_IRUGO | S_IWUSR, show_ifalias, store_ifalias), + __ATTR(iflink, S_IRUGO, show_iflink, NULL), + __ATTR(ifindex, S_IRUGO, show_ifindex, NULL), + __ATTR(features, S_IRUGO, show_features, NULL), + __ATTR(type, S_IRUGO, show_type, NULL), + __ATTR(link_mode, S_IRUGO, show_link_mode, NULL), + __ATTR(address, S_IRUGO, show_address, NULL), + __ATTR(broadcast, S_IRUGO, show_broadcast, NULL), + __ATTR(carrier, S_IRUGO, show_carrier, NULL), + __ATTR(speed, S_IRUGO, show_speed, NULL), + __ATTR(duplex, S_IRUGO, show_duplex, NULL), + __ATTR(dormant, S_IRUGO, show_dormant, NULL), + __ATTR(operstate, S_IRUGO, show_operstate, NULL), + __ATTR(mtu, S_IRUGO | S_IWUSR, show_mtu, store_mtu), + __ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags), + __ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len, + store_tx_queue_len), + __ATTR(netdev_group, S_IRUGO | S_IWUSR, show_group, store_group), + {} +}; + +/* Show a given an attribute in the statistics group */ +static ssize_t netstat_show(const struct device *d, + struct device_attribute *attr, char *buf, + unsigned long offset) +{ + struct net_device *dev = to_net_dev(d); + ssize_t ret = -EINVAL; + + WARN_ON(offset > sizeof(struct rtnl_link_stats64) || + offset % sizeof(u64) != 0); + + read_lock(&dev_base_lock); + if (dev_isalive(dev)) { + struct rtnl_link_stats64 temp; + const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); + + ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *) stats) + offset)); + } + read_unlock(&dev_base_lock); + return ret; +} + +/* generate a read-only statistics attribute */ +#define NETSTAT_ENTRY(name) \ +static ssize_t show_##name(struct device *d, \ + struct device_attribute *attr, char *buf) \ +{ \ + return netstat_show(d, attr, buf, \ + offsetof(struct rtnl_link_stats64, name)); \ +} \ +static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + +NETSTAT_ENTRY(rx_packets); +NETSTAT_ENTRY(tx_packets); +NETSTAT_ENTRY(rx_bytes); +NETSTAT_ENTRY(tx_bytes); +NETSTAT_ENTRY(rx_errors); +NETSTAT_ENTRY(tx_errors); +NETSTAT_ENTRY(rx_dropped); +NETSTAT_ENTRY(tx_dropped); +NETSTAT_ENTRY(multicast); +NETSTAT_ENTRY(collisions); +NETSTAT_ENTRY(rx_length_errors); +NETSTAT_ENTRY(rx_over_errors); +NETSTAT_ENTRY(rx_crc_errors); +NETSTAT_ENTRY(rx_frame_errors); +NETSTAT_ENTRY(rx_fifo_errors); +NETSTAT_ENTRY(rx_missed_errors); +NETSTAT_ENTRY(tx_aborted_errors); +NETSTAT_ENTRY(tx_carrier_errors); +NETSTAT_ENTRY(tx_fifo_errors); +NETSTAT_ENTRY(tx_heartbeat_errors); +NETSTAT_ENTRY(tx_window_errors); +NETSTAT_ENTRY(rx_compressed); +NETSTAT_ENTRY(tx_compressed); + +static struct attribute *netstat_attrs[] = { + &dev_attr_rx_packets.attr, + 
&dev_attr_tx_packets.attr, + &dev_attr_rx_bytes.attr, + &dev_attr_tx_bytes.attr, + &dev_attr_rx_errors.attr, + &dev_attr_tx_errors.attr, + &dev_attr_rx_dropped.attr, + &dev_attr_tx_dropped.attr, + &dev_attr_multicast.attr, + &dev_attr_collisions.attr, + &dev_attr_rx_length_errors.attr, + &dev_attr_rx_over_errors.attr, + &dev_attr_rx_crc_errors.attr, + &dev_attr_rx_frame_errors.attr, + &dev_attr_rx_fifo_errors.attr, + &dev_attr_rx_missed_errors.attr, + &dev_attr_tx_aborted_errors.attr, + &dev_attr_tx_carrier_errors.attr, + &dev_attr_tx_fifo_errors.attr, + &dev_attr_tx_heartbeat_errors.attr, + &dev_attr_tx_window_errors.attr, + &dev_attr_rx_compressed.attr, + &dev_attr_tx_compressed.attr, + NULL +}; + + +static struct attribute_group netstat_group = { + .name = "statistics", + .attrs = netstat_attrs, +}; + +#ifdef CONFIG_WIRELESS_EXT_SYSFS +/* helper function that does all the locking etc for wireless stats */ +static ssize_t wireless_show(struct device *d, char *buf, + ssize_t (*format)(const struct iw_statistics *, + char *)) +{ + struct net_device *dev = to_net_dev(d); + const struct iw_statistics *iw; + ssize_t ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + if (dev_isalive(dev)) { + iw = get_wireless_stats(dev); + if (iw) + ret = (*format)(iw, buf); + } + rtnl_unlock(); + + return ret; +} + +/* show function template for wireless fields */ +#define WIRELESS_SHOW(name, field, format_string) \ +static ssize_t format_iw_##name(const struct iw_statistics *iw, char *buf) \ +{ \ + return sprintf(buf, format_string, iw->field); \ +} \ +static ssize_t show_iw_##name(struct device *d, \ + struct device_attribute *attr, char *buf) \ +{ \ + return wireless_show(d, buf, format_iw_##name); \ +} \ +static DEVICE_ATTR(name, S_IRUGO, show_iw_##name, NULL) + +WIRELESS_SHOW(status, status, fmt_hex); +WIRELESS_SHOW(link, qual.qual, fmt_dec); +WIRELESS_SHOW(level, qual.level, fmt_dec); +WIRELESS_SHOW(noise, qual.noise, fmt_dec); +WIRELESS_SHOW(nwid, discard.nwid, fmt_dec); +WIRELESS_SHOW(crypt, discard.code, fmt_dec); +WIRELESS_SHOW(fragment, discard.fragment, fmt_dec); +WIRELESS_SHOW(misc, discard.misc, fmt_dec); +WIRELESS_SHOW(retries, discard.retries, fmt_dec); +WIRELESS_SHOW(beacon, miss.beacon, fmt_dec); + +static struct attribute *wireless_attrs[] = { + &dev_attr_status.attr, + &dev_attr_link.attr, + &dev_attr_level.attr, + &dev_attr_noise.attr, + &dev_attr_nwid.attr, + &dev_attr_crypt.attr, + &dev_attr_fragment.attr, + &dev_attr_retries.attr, + &dev_attr_misc.attr, + &dev_attr_beacon.attr, + NULL +}; + +static struct attribute_group wireless_group = { + .name = "wireless", + .attrs = wireless_attrs, +}; +#endif +#endif /* CONFIG_SYSFS */ + +#ifdef CONFIG_RPS +/* + * RX queue sysfs structures and functions. 
+ */ +struct rx_queue_attribute { + struct attribute attr; + ssize_t (*show)(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, char *buf); + ssize_t (*store)(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, const char *buf, size_t len); +}; +#define to_rx_queue_attr(_attr) container_of(_attr, \ + struct rx_queue_attribute, attr) + +#define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj) + +static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); + struct netdev_rx_queue *queue = to_rx_queue(kobj); + + if (!attribute->show) + return -EIO; + + return attribute->show(queue, attribute, buf); +} + +static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); + struct netdev_rx_queue *queue = to_rx_queue(kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(queue, attribute, buf, count); +} + +static const struct sysfs_ops rx_queue_sysfs_ops = { + .show = rx_queue_attr_show, + .store = rx_queue_attr_store, +}; + +static ssize_t show_rps_map(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attribute, char *buf) +{ + struct rps_map *map; + cpumask_var_t mask; + size_t len = 0; + int i; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + rcu_read_lock(); + map = rcu_dereference(queue->rps_map); + if (map) + for (i = 0; i < map->len; i++) + cpumask_set_cpu(map->cpus[i], mask); + + len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask); + if (PAGE_SIZE - len < 3) { + rcu_read_unlock(); + free_cpumask_var(mask); + return -EINVAL; + } + rcu_read_unlock(); + + free_cpumask_var(mask); + len += sprintf(buf + len, "\n"); + return len; +} + +static ssize_t store_rps_map(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attribute, + const char *buf, size_t len) +{ + struct rps_map *old_map, *map; + cpumask_var_t mask; + int err, cpu, i; + static DEFINE_SPINLOCK(rps_map_lock); + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); + if (err) { + free_cpumask_var(mask); + return err; + } + + map = kzalloc(max_t(unsigned, + RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES), + GFP_KERNEL); + if (!map) { + free_cpumask_var(mask); + return -ENOMEM; + } + + i = 0; + for_each_cpu_and(cpu, mask, cpu_online_mask) + map->cpus[i++] = cpu; + + if (i) + map->len = i; + else { + kfree(map); + map = NULL; + } + + spin_lock(&rps_map_lock); + old_map = rcu_dereference_protected(queue->rps_map, + lockdep_is_held(&rps_map_lock)); + rcu_assign_pointer(queue->rps_map, map); + spin_unlock(&rps_map_lock); + + if (old_map) + kfree_rcu(old_map, rcu); + + free_cpumask_var(mask); + return len; +} + +static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, + char *buf) +{ + struct rps_dev_flow_table *flow_table; + unsigned int val = 0; + + rcu_read_lock(); + flow_table = rcu_dereference(queue->rps_flow_table); + if (flow_table) + val = flow_table->mask + 1; + rcu_read_unlock(); + + return sprintf(buf, "%u\n", val); +} + +static void rps_dev_flow_table_release_work(struct work_struct *work) +{ + struct rps_dev_flow_table *table = container_of(work, + struct rps_dev_flow_table, free_work); + + vfree(table); +} + +static void 
rps_dev_flow_table_release(struct rcu_head *rcu) +{ + struct rps_dev_flow_table *table = container_of(rcu, + struct rps_dev_flow_table, rcu); + + INIT_WORK(&table->free_work, rps_dev_flow_table_release_work); + schedule_work(&table->free_work); +} + +static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, + const char *buf, size_t len) +{ + unsigned int count; + char *endp; + struct rps_dev_flow_table *table, *old_table; + static DEFINE_SPINLOCK(rps_dev_flow_lock); + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + count = simple_strtoul(buf, &endp, 0); + if (endp == buf) + return -EINVAL; + + if (count) { + int i; + + if (count > 1<<30) { + /* Enforce a limit to prevent overflow */ + return -EINVAL; + } + count = roundup_pow_of_two(count); + table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count)); + if (!table) + return -ENOMEM; + + table->mask = count - 1; + for (i = 0; i < count; i++) + table->flows[i].cpu = RPS_NO_CPU; + } else + table = NULL; + + spin_lock(&rps_dev_flow_lock); + old_table = rcu_dereference_protected(queue->rps_flow_table, + lockdep_is_held(&rps_dev_flow_lock)); + rcu_assign_pointer(queue->rps_flow_table, table); + spin_unlock(&rps_dev_flow_lock); + + if (old_table) + call_rcu(&old_table->rcu, rps_dev_flow_table_release); + + return len; +} + +static struct rx_queue_attribute rps_cpus_attribute = + __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map); + + +static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute = + __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR, + show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt); + +static struct attribute *rx_queue_default_attrs[] = { + &rps_cpus_attribute.attr, + &rps_dev_flow_table_cnt_attribute.attr, + NULL +}; + +static void rx_queue_release(struct kobject *kobj) +{ + struct netdev_rx_queue *queue = to_rx_queue(kobj); + struct rps_map *map; + struct rps_dev_flow_table *flow_table; + + + map = rcu_dereference_raw(queue->rps_map); + if (map) { + RCU_INIT_POINTER(queue->rps_map, NULL); + kfree_rcu(map, rcu); + } + + flow_table = rcu_dereference_raw(queue->rps_flow_table); + if (flow_table) { + RCU_INIT_POINTER(queue->rps_flow_table, NULL); + call_rcu(&flow_table->rcu, rps_dev_flow_table_release); + } + + memset(kobj, 0, sizeof(*kobj)); + dev_put(queue->dev); +} + +static struct kobj_type rx_queue_ktype = { + .sysfs_ops = &rx_queue_sysfs_ops, + .release = rx_queue_release, + .default_attrs = rx_queue_default_attrs, +}; + +static int rx_queue_add_kobject(struct net_device *net, int index) +{ + struct netdev_rx_queue *queue = net->_rx + index; + struct kobject *kobj = &queue->kobj; + int error = 0; + + kobj->kset = net->queues_kset; + error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, + "rx-%u", index); + if (error) { + kobject_put(kobj); + return error; + } + + kobject_uevent(kobj, KOBJ_ADD); + dev_hold(queue->dev); + + return error; +} +#endif /* CONFIG_RPS */ + +int +net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) +{ +#ifdef CONFIG_RPS + int i; + int error = 0; + + for (i = old_num; i < new_num; i++) { + error = rx_queue_add_kobject(net, i); + if (error) { + new_num = old_num; + break; + } + } + + while (--i >= new_num) + kobject_put(&net->_rx[i].kobj); + + return error; +#else + return 0; +#endif +} + +#ifdef CONFIG_XPS +/* + * netdev_queue sysfs structures and functions. 
+ */ +struct netdev_queue_attribute { + struct attribute attr; + ssize_t (*show)(struct netdev_queue *queue, + struct netdev_queue_attribute *attr, char *buf); + ssize_t (*store)(struct netdev_queue *queue, + struct netdev_queue_attribute *attr, const char *buf, size_t len); +}; +#define to_netdev_queue_attr(_attr) container_of(_attr, \ + struct netdev_queue_attribute, attr) + +#define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj) + +static ssize_t netdev_queue_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr); + struct netdev_queue *queue = to_netdev_queue(kobj); + + if (!attribute->show) + return -EIO; + + return attribute->show(queue, attribute, buf); +} + +static ssize_t netdev_queue_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t count) +{ + struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr); + struct netdev_queue *queue = to_netdev_queue(kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(queue, attribute, buf, count); +} + +static const struct sysfs_ops netdev_queue_sysfs_ops = { + .show = netdev_queue_attr_show, + .store = netdev_queue_attr_store, +}; + +static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue) +{ + struct net_device *dev = queue->dev; + int i; + + for (i = 0; i < dev->num_tx_queues; i++) + if (queue == &dev->_tx[i]) + break; + + BUG_ON(i >= dev->num_tx_queues); + + return i; +} + + +static ssize_t show_xps_map(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, char *buf) +{ + struct net_device *dev = queue->dev; + struct xps_dev_maps *dev_maps; + cpumask_var_t mask; + unsigned long index; + size_t len = 0; + int i; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + index = get_netdev_queue_index(queue); + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_maps); + if (dev_maps) { + for_each_possible_cpu(i) { + struct xps_map *map = + rcu_dereference(dev_maps->cpu_map[i]); + if (map) { + int j; + for (j = 0; j < map->len; j++) { + if (map->queues[j] == index) { + cpumask_set_cpu(i, mask); + break; + } + } + } + } + } + rcu_read_unlock(); + + len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask); + if (PAGE_SIZE - len < 3) { + free_cpumask_var(mask); + return -EINVAL; + } + + free_cpumask_var(mask); + len += sprintf(buf + len, "\n"); + return len; +} + +static DEFINE_MUTEX(xps_map_mutex); +#define xmap_dereference(P) \ + rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) + +static ssize_t store_xps_map(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, + const char *buf, size_t len) +{ + struct net_device *dev = queue->dev; + cpumask_var_t mask; + int err, i, cpu, pos, map_len, alloc_len, need_set; + unsigned long index; + struct xps_map *map, *new_map; + struct xps_dev_maps *dev_maps, *new_dev_maps; + int nonempty = 0; + int numa_node = -2; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + index = get_netdev_queue_index(queue); + + err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); + if (err) { + free_cpumask_var(mask); + return err; + } + + new_dev_maps = kzalloc(max_t(unsigned, + XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES), GFP_KERNEL); + if (!new_dev_maps) { + free_cpumask_var(mask); + return -ENOMEM; + } + + mutex_lock(&xps_map_mutex); + + dev_maps = xmap_dereference(dev->xps_maps); + + 
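+	/*
+	 * Rebuild the per-CPU maps: for every possible CPU decide, from the
+	 * mask just parsed, whether this tx queue index should appear in
+	 * that CPU's xps_map, growing or shrinking the map as needed, and
+	 * remember a common NUMA node for the queue when all selected CPUs
+	 * share one.  From userspace this is driven by writes such as
+	 * (device and queue names purely illustrative):
+	 *
+	 *	echo f > /sys/class/net/eth0/queues/tx-0/xps_cpus
+	 */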
for_each_possible_cpu(cpu) { + map = dev_maps ? + xmap_dereference(dev_maps->cpu_map[cpu]) : NULL; + new_map = map; + if (map) { + for (pos = 0; pos < map->len; pos++) + if (map->queues[pos] == index) + break; + map_len = map->len; + alloc_len = map->alloc_len; + } else + pos = map_len = alloc_len = 0; + + need_set = cpumask_test_cpu(cpu, mask) && cpu_online(cpu); +#ifdef CONFIG_NUMA + if (need_set) { + if (numa_node == -2) + numa_node = cpu_to_node(cpu); + else if (numa_node != cpu_to_node(cpu)) + numa_node = -1; + } +#endif + if (need_set && pos >= map_len) { + /* Need to add queue to this CPU's map */ + if (map_len >= alloc_len) { + alloc_len = alloc_len ? + 2 * alloc_len : XPS_MIN_MAP_ALLOC; + new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), + GFP_KERNEL, + cpu_to_node(cpu)); + if (!new_map) + goto error; + new_map->alloc_len = alloc_len; + for (i = 0; i < map_len; i++) + new_map->queues[i] = map->queues[i]; + new_map->len = map_len; + } + new_map->queues[new_map->len++] = index; + } else if (!need_set && pos < map_len) { + /* Need to remove queue from this CPU's map */ + if (map_len > 1) + new_map->queues[pos] = + new_map->queues[--new_map->len]; + else + new_map = NULL; + } + RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], new_map); + } + + /* Cleanup old maps */ + for_each_possible_cpu(cpu) { + map = dev_maps ? + xmap_dereference(dev_maps->cpu_map[cpu]) : NULL; + if (map && xmap_dereference(new_dev_maps->cpu_map[cpu]) != map) + kfree_rcu(map, rcu); + if (new_dev_maps->cpu_map[cpu]) + nonempty = 1; + } + + if (nonempty) + rcu_assign_pointer(dev->xps_maps, new_dev_maps); + else { + kfree(new_dev_maps); + rcu_assign_pointer(dev->xps_maps, NULL); + } + + if (dev_maps) + kfree_rcu(dev_maps, rcu); + + netdev_queue_numa_node_write(queue, (numa_node >= 0) ? 
numa_node : + NUMA_NO_NODE); + + mutex_unlock(&xps_map_mutex); + + free_cpumask_var(mask); + return len; + +error: + mutex_unlock(&xps_map_mutex); + + if (new_dev_maps) + for_each_possible_cpu(i) + kfree(rcu_dereference_protected( + new_dev_maps->cpu_map[i], + 1)); + kfree(new_dev_maps); + free_cpumask_var(mask); + return -ENOMEM; +} + +static struct netdev_queue_attribute xps_cpus_attribute = + __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map); + +static struct attribute *netdev_queue_default_attrs[] = { + &xps_cpus_attribute.attr, + NULL +}; + +static void netdev_queue_release(struct kobject *kobj) +{ + struct netdev_queue *queue = to_netdev_queue(kobj); + struct net_device *dev = queue->dev; + struct xps_dev_maps *dev_maps; + struct xps_map *map; + unsigned long index; + int i, pos, nonempty = 0; + + index = get_netdev_queue_index(queue); + + mutex_lock(&xps_map_mutex); + dev_maps = xmap_dereference(dev->xps_maps); + + if (dev_maps) { + for_each_possible_cpu(i) { + map = xmap_dereference(dev_maps->cpu_map[i]); + if (!map) + continue; + + for (pos = 0; pos < map->len; pos++) + if (map->queues[pos] == index) + break; + + if (pos < map->len) { + if (map->len > 1) + map->queues[pos] = + map->queues[--map->len]; + else { + RCU_INIT_POINTER(dev_maps->cpu_map[i], + NULL); + kfree_rcu(map, rcu); + map = NULL; + } + } + if (map) + nonempty = 1; + } + + if (!nonempty) { + RCU_INIT_POINTER(dev->xps_maps, NULL); + kfree_rcu(dev_maps, rcu); + } + } + + mutex_unlock(&xps_map_mutex); + + memset(kobj, 0, sizeof(*kobj)); + dev_put(queue->dev); +} + +static struct kobj_type netdev_queue_ktype = { + .sysfs_ops = &netdev_queue_sysfs_ops, + .release = netdev_queue_release, + .default_attrs = netdev_queue_default_attrs, +}; + +static int netdev_queue_add_kobject(struct net_device *net, int index) +{ + struct netdev_queue *queue = net->_tx + index; + struct kobject *kobj = &queue->kobj; + int error = 0; + + kobj->kset = net->queues_kset; + error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, + "tx-%u", index); + if (error) { + kobject_put(kobj); + return error; + } + + kobject_uevent(kobj, KOBJ_ADD); + dev_hold(queue->dev); + + return error; +} +#endif /* CONFIG_XPS */ + +int +netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) +{ +#ifdef CONFIG_XPS + int i; + int error = 0; + + for (i = old_num; i < new_num; i++) { + error = netdev_queue_add_kobject(net, i); + if (error) { + new_num = old_num; + break; + } + } + + while (--i >= new_num) + kobject_put(&net->_tx[i].kobj); + + return error; +#else + return 0; +#endif +} + +static int register_queue_kobjects(struct net_device *net) +{ + int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0; + +#if defined(CONFIG_RPS) || defined(CONFIG_XPS) + net->queues_kset = kset_create_and_add("queues", + NULL, &net->dev.kobj); + if (!net->queues_kset) + return -ENOMEM; +#endif + +#ifdef CONFIG_RPS + real_rx = net->real_num_rx_queues; +#endif + real_tx = net->real_num_tx_queues; + + error = net_rx_queue_update_kobjects(net, 0, real_rx); + if (error) + goto error; + rxq = real_rx; + + error = netdev_queue_update_kobjects(net, 0, real_tx); + if (error) + goto error; + txq = real_tx; + + return 0; + +error: + netdev_queue_update_kobjects(net, txq, 0); + net_rx_queue_update_kobjects(net, rxq, 0); + return error; +} + +static void remove_queue_kobjects(struct net_device *net) +{ + int real_rx = 0, real_tx = 0; + +#ifdef CONFIG_RPS + real_rx = net->real_num_rx_queues; +#endif + real_tx = net->real_num_tx_queues; + + 
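+	/*
+	 * Tear down in the reverse order of register_queue_kobjects():
+	 * drop every rx-%u and tx-%u kobject, then unregister the
+	 * per-device "queues" kset itself.
+	 */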
net_rx_queue_update_kobjects(net, real_rx, 0); + netdev_queue_update_kobjects(net, real_tx, 0); +#if defined(CONFIG_RPS) || defined(CONFIG_XPS) + kset_unregister(net->queues_kset); +#endif +} + +static void *net_grab_current_ns(void) +{ + struct net *ns = current->nsproxy->net_ns; +#ifdef CONFIG_NET_NS + if (ns) + atomic_inc(&ns->passive); +#endif + return ns; +} + +static const void *net_initial_ns(void) +{ + return &init_net; +} + +static const void *net_netlink_ns(struct sock *sk) +{ + return sock_net(sk); +} + +struct kobj_ns_type_operations net_ns_type_operations = { + .type = KOBJ_NS_TYPE_NET, + .grab_current_ns = net_grab_current_ns, + .netlink_ns = net_netlink_ns, + .initial_ns = net_initial_ns, + .drop_ns = net_drop_ns, +}; +EXPORT_SYMBOL_GPL(net_ns_type_operations); + +#ifdef CONFIG_HOTPLUG +static int netdev_uevent(struct device *d, struct kobj_uevent_env *env) +{ + struct net_device *dev = to_net_dev(d); + int retval; + + /* pass interface to uevent. */ + retval = add_uevent_var(env, "INTERFACE=%s", dev->name); + if (retval) + goto exit; + + /* pass ifindex to uevent. + * ifindex is useful as it won't change (interface name may change) + * and is what RtNetlink uses natively. */ + retval = add_uevent_var(env, "IFINDEX=%d", dev->ifindex); + +exit: + return retval; +} +#endif + +/* + * netdev_release -- destroy and free a dead device. + * Called when last reference to device kobject is gone. + */ +static void netdev_release(struct device *d) +{ + struct net_device *dev = to_net_dev(d); + + BUG_ON(dev->reg_state != NETREG_RELEASED); + + kfree(dev->ifalias); + kfree((char *)dev - dev->padded); +} + +static const void *net_namespace(struct device *d) +{ + struct net_device *dev; + dev = container_of(d, struct net_device, dev); + return dev_net(dev); +} + +static struct class net_class = { + .name = "net", + .dev_release = netdev_release, +#ifdef CONFIG_SYSFS + .dev_attrs = net_class_attributes, +#endif /* CONFIG_SYSFS */ +#ifdef CONFIG_HOTPLUG + .dev_uevent = netdev_uevent, +#endif + .ns_type = &net_ns_type_operations, + .namespace = net_namespace, +}; + +/* Delete sysfs entries but hold kobject reference until after all + * netdev references are gone. + */ +void netdev_unregister_kobject(struct net_device * net) +{ + struct device *dev = &(net->dev); + + kobject_get(&dev->kobj); + + remove_queue_kobjects(net); + + device_del(dev); +} + +/* Create sysfs entries for network device. 
*/ +int netdev_register_kobject(struct net_device *net) +{ + struct device *dev = &(net->dev); + const struct attribute_group **groups = net->sysfs_groups; + int error = 0; + + device_initialize(dev); + dev->class = &net_class; + dev->platform_data = net; + dev->groups = groups; + + dev_set_name(dev, "%s", net->name); + +#ifdef CONFIG_SYSFS + /* Allow for a device specific group */ + if (*groups) + groups++; + + *groups++ = &netstat_group; +#ifdef CONFIG_WIRELESS_EXT_SYSFS + if (net->ieee80211_ptr) + *groups++ = &wireless_group; +#ifdef CONFIG_WIRELESS_EXT + else if (net->wireless_handlers) + *groups++ = &wireless_group; +#endif +#endif +#endif /* CONFIG_SYSFS */ + + error = device_add(dev); + if (error) + return error; + + error = register_queue_kobjects(net); + if (error) { + device_del(dev); + return error; + } + + return error; +} + +int netdev_class_create_file(struct class_attribute *class_attr) +{ + return class_create_file(&net_class, class_attr); +} +EXPORT_SYMBOL(netdev_class_create_file); + +void netdev_class_remove_file(struct class_attribute *class_attr) +{ + class_remove_file(&net_class, class_attr); +} +EXPORT_SYMBOL(netdev_class_remove_file); + +int netdev_kobject_init(void) +{ + kobj_ns_type_register(&net_ns_type_operations); + return class_register(&net_class); +} diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h new file mode 100644 index 00000000..bd7751ec --- /dev/null +++ b/net/core/net-sysfs.h @@ -0,0 +1,11 @@ +#ifndef __NET_SYSFS_H__ +#define __NET_SYSFS_H__ + +int netdev_kobject_init(void); +int netdev_register_kobject(struct net_device *); +void netdev_unregister_kobject(struct net_device *); +int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num); +int netdev_queue_update_kobjects(struct net_device *net, + int old_num, int new_num); + +#endif diff --git a/net/core/net-traces.c b/net/core/net-traces.c new file mode 100644 index 00000000..7f1bb2ab --- /dev/null +++ b/net/core/net-traces.c @@ -0,0 +1,34 @@ +/* + * consolidates trace point definitions + * + * Copyright (C) 2009 Neil Horman + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define CREATE_TRACE_POINTS +#include +#include +#include + +EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); + +EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c new file mode 100644 index 00000000..2772ed11 --- /dev/null +++ b/net/core/net_namespace.c @@ -0,0 +1,638 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Our network namespace constructor/destructor lists + */ + +static LIST_HEAD(pernet_list); +static struct list_head *first_device = &pernet_list; +static DEFINE_MUTEX(net_mutex); + +LIST_HEAD(net_namespace_list); +EXPORT_SYMBOL_GPL(net_namespace_list); + +struct net init_net; +EXPORT_SYMBOL(init_net); + +#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ + +static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; + +static struct net_generic *net_alloc_generic(void) +{ + struct net_generic *ng; + size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]); + + ng = kzalloc(generic_size, GFP_KERNEL); + if (ng) + ng->len = max_gen_ptrs; + + return ng; +} + +static int net_assign_generic(struct net *net, int id, void *data) +{ + struct net_generic *ng, *old_ng; + + 
BUG_ON(!mutex_is_locked(&net_mutex)); + BUG_ON(id == 0); + + old_ng = rcu_dereference_protected(net->gen, + lockdep_is_held(&net_mutex)); + ng = old_ng; + if (old_ng->len >= id) + goto assign; + + ng = net_alloc_generic(); + if (ng == NULL) + return -ENOMEM; + + /* + * Some synchronisation notes: + * + * The net_generic explores the net->gen array inside rcu + * read section. Besides once set the net->gen->ptr[x] + * pointer never changes (see rules in netns/generic.h). + * + * That said, we simply duplicate this array and schedule + * the old copy for kfree after a grace period. + */ + + memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*)); + + rcu_assign_pointer(net->gen, ng); + kfree_rcu(old_ng, rcu); +assign: + ng->ptr[id - 1] = data; + return 0; +} + +static int ops_init(const struct pernet_operations *ops, struct net *net) +{ + int err = -ENOMEM; + void *data = NULL; + + if (ops->id && ops->size) { + data = kzalloc(ops->size, GFP_KERNEL); + if (!data) + goto out; + + err = net_assign_generic(net, *ops->id, data); + if (err) + goto cleanup; + } + err = 0; + if (ops->init) + err = ops->init(net); + if (!err) + return 0; + +cleanup: + kfree(data); + +out: + return err; +} + +static void ops_free(const struct pernet_operations *ops, struct net *net) +{ + if (ops->id && ops->size) { + int id = *ops->id; + kfree(net_generic(net, id)); + } +} + +static void ops_exit_list(const struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + struct net *net; + if (ops->exit) { + list_for_each_entry(net, net_exit_list, exit_list) + ops->exit(net); + } + if (ops->exit_batch) + ops->exit_batch(net_exit_list); +} + +static void ops_free_list(const struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + struct net *net; + if (ops->size && ops->id) { + list_for_each_entry(net, net_exit_list, exit_list) + ops_free(ops, net); + } +} + +/* + * setup_net runs the initializers for the network namespace object. + */ +static __net_init int setup_net(struct net *net) +{ + /* Must be called with net_mutex held */ + const struct pernet_operations *ops, *saved_ops; + int error = 0; + LIST_HEAD(net_exit_list); + + atomic_set(&net->count, 1); + atomic_set(&net->passive, 1); + +#ifdef NETNS_REFCNT_DEBUG + atomic_set(&net->use_count, 0); +#endif + + list_for_each_entry(ops, &pernet_list, list) { + error = ops_init(ops, net); + if (error < 0) + goto out_undo; + } +out: + return error; + +out_undo: + /* Walk through the list backwards calling the exit functions + * for the pernet modules whose init functions did not fail. + */ + list_add(&net->exit_list, &net_exit_list); + saved_ops = ops; + list_for_each_entry_continue_reverse(ops, &pernet_list, list) + ops_exit_list(ops, &net_exit_list); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, &pernet_list, list) + ops_free_list(ops, &net_exit_list); + + rcu_barrier(); + goto out; +} + + +#ifdef CONFIG_NET_NS +static struct kmem_cache *net_cachep; +static struct workqueue_struct *netns_wq; + +static struct net *net_alloc(void) +{ + struct net *net = NULL; + struct net_generic *ng; + + ng = net_alloc_generic(); + if (!ng) + goto out; + + net = kmem_cache_zalloc(net_cachep, GFP_KERNEL); + if (!net) + goto out_free; + + rcu_assign_pointer(net->gen, ng); +out: + return net; + +out_free: + kfree(ng); + goto out; +} + +static void net_free(struct net *net) +{ +#ifdef NETNS_REFCNT_DEBUG + if (unlikely(atomic_read(&net->use_count) != 0)) { + printk(KERN_EMERG "network namespace not free! 
Usage: %d\n", + atomic_read(&net->use_count)); + return; + } +#endif + kfree(net->gen); + kmem_cache_free(net_cachep, net); +} + +void net_drop_ns(void *p) +{ + struct net *ns = p; + if (ns && atomic_dec_and_test(&ns->passive)) + net_free(ns); +} + +struct net *copy_net_ns(unsigned long flags, struct net *old_net) +{ + struct net *net; + int rv; + + if (!(flags & CLONE_NEWNET)) + return get_net(old_net); + + net = net_alloc(); + if (!net) + return ERR_PTR(-ENOMEM); + mutex_lock(&net_mutex); + rv = setup_net(net); + if (rv == 0) { + rtnl_lock(); + list_add_tail_rcu(&net->list, &net_namespace_list); + rtnl_unlock(); + } + mutex_unlock(&net_mutex); + if (rv < 0) { + net_drop_ns(net); + return ERR_PTR(rv); + } + return net; +} + +static DEFINE_SPINLOCK(cleanup_list_lock); +static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ + +static void cleanup_net(struct work_struct *work) +{ + const struct pernet_operations *ops; + struct net *net, *tmp; + LIST_HEAD(net_kill_list); + LIST_HEAD(net_exit_list); + + /* Atomically snapshot the list of namespaces to cleanup */ + spin_lock_irq(&cleanup_list_lock); + list_replace_init(&cleanup_list, &net_kill_list); + spin_unlock_irq(&cleanup_list_lock); + + mutex_lock(&net_mutex); + + /* Don't let anyone else find us. */ + rtnl_lock(); + list_for_each_entry(net, &net_kill_list, cleanup_list) { + list_del_rcu(&net->list); + list_add_tail(&net->exit_list, &net_exit_list); + } + rtnl_unlock(); + + /* + * Another CPU might be rcu-iterating the list, wait for it. + * This needs to be before calling the exit() notifiers, so + * the rcu_barrier() below isn't sufficient alone. + */ + synchronize_rcu(); + + /* Run all of the network namespace exit methods */ + list_for_each_entry_reverse(ops, &pernet_list, list) + ops_exit_list(ops, &net_exit_list); + + /* Free the net generic variables */ + list_for_each_entry_reverse(ops, &pernet_list, list) + ops_free_list(ops, &net_exit_list); + + mutex_unlock(&net_mutex); + + /* Ensure there are no outstanding rcu callbacks using this + * network namespace. 
+ */ + rcu_barrier(); + + /* Finally it is safe to free my network namespace structure */ + list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { + list_del_init(&net->exit_list); + net_drop_ns(net); + } +} +static DECLARE_WORK(net_cleanup_work, cleanup_net); + +void __put_net(struct net *net) +{ + /* Cleanup the network namespace in process context */ + unsigned long flags; + + spin_lock_irqsave(&cleanup_list_lock, flags); + list_add(&net->cleanup_list, &cleanup_list); + spin_unlock_irqrestore(&cleanup_list_lock, flags); + + queue_work(netns_wq, &net_cleanup_work); +} +EXPORT_SYMBOL_GPL(__put_net); + +struct net *get_net_ns_by_fd(int fd) +{ + struct proc_inode *ei; + struct file *file; + struct net *net; + + file = proc_ns_fget(fd); + if (IS_ERR(file)) + return ERR_CAST(file); + + ei = PROC_I(file->f_dentry->d_inode); + if (ei->ns_ops == &netns_operations) + net = get_net(ei->ns); + else + net = ERR_PTR(-EINVAL); + + fput(file); + return net; +} + +#else +struct net *copy_net_ns(unsigned long flags, struct net *old_net) +{ + if (flags & CLONE_NEWNET) + return ERR_PTR(-EINVAL); + return old_net; +} + +struct net *get_net_ns_by_fd(int fd) +{ + return ERR_PTR(-EINVAL); +} +#endif + +struct net *get_net_ns_by_pid(pid_t pid) +{ + struct task_struct *tsk; + struct net *net; + + /* Lookup the network namespace */ + net = ERR_PTR(-ESRCH); + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (tsk) { + struct nsproxy *nsproxy; + nsproxy = task_nsproxy(tsk); + if (nsproxy) + net = get_net(nsproxy->net_ns); + } + rcu_read_unlock(); + return net; +} +EXPORT_SYMBOL_GPL(get_net_ns_by_pid); + +static int __init net_ns_init(void) +{ + struct net_generic *ng; + +#ifdef CONFIG_NET_NS + net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), + SMP_CACHE_BYTES, + SLAB_PANIC, NULL); + + /* Create workqueue for cleanup */ + netns_wq = create_singlethread_workqueue("netns"); + if (!netns_wq) + panic("Could not create netns workq"); +#endif + + ng = net_alloc_generic(); + if (!ng) + panic("Could not allocate generic netns"); + + rcu_assign_pointer(init_net.gen, ng); + + mutex_lock(&net_mutex); + if (setup_net(&init_net)) + panic("Could not setup the initial network namespace"); + + rtnl_lock(); + list_add_tail_rcu(&init_net.list, &net_namespace_list); + rtnl_unlock(); + + mutex_unlock(&net_mutex); + + return 0; +} + +pure_initcall(net_ns_init); + +#ifdef CONFIG_NET_NS +static int __register_pernet_operations(struct list_head *list, + struct pernet_operations *ops) +{ + struct net *net; + int error; + LIST_HEAD(net_exit_list); + + list_add_tail(&ops->list, list); + if (ops->init || (ops->id && ops->size)) { + for_each_net(net) { + error = ops_init(ops, net); + if (error) + goto out_undo; + list_add_tail(&net->exit_list, &net_exit_list); + } + } + return 0; + +out_undo: + /* If I have an error cleanup all namespaces I initialized */ + list_del(&ops->list); + ops_exit_list(ops, &net_exit_list); + ops_free_list(ops, &net_exit_list); + return error; +} + +static void __unregister_pernet_operations(struct pernet_operations *ops) +{ + struct net *net; + LIST_HEAD(net_exit_list); + + list_del(&ops->list); + for_each_net(net) + list_add_tail(&net->exit_list, &net_exit_list); + ops_exit_list(ops, &net_exit_list); + ops_free_list(ops, &net_exit_list); +} + +#else + +static int __register_pernet_operations(struct list_head *list, + struct pernet_operations *ops) +{ + return ops_init(ops, &init_net); +} + +static void __unregister_pernet_operations(struct pernet_operations *ops) +{ + 
LIST_HEAD(net_exit_list); + list_add(&init_net.exit_list, &net_exit_list); + ops_exit_list(ops, &net_exit_list); + ops_free_list(ops, &net_exit_list); +} + +#endif /* CONFIG_NET_NS */ + +static DEFINE_IDA(net_generic_ids); + +static int register_pernet_operations(struct list_head *list, + struct pernet_operations *ops) +{ + int error; + + if (ops->id) { +again: + error = ida_get_new_above(&net_generic_ids, 1, ops->id); + if (error < 0) { + if (error == -EAGAIN) { + ida_pre_get(&net_generic_ids, GFP_KERNEL); + goto again; + } + return error; + } + max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id); + } + error = __register_pernet_operations(list, ops); + if (error) { + rcu_barrier(); + if (ops->id) + ida_remove(&net_generic_ids, *ops->id); + } + + return error; +} + +static void unregister_pernet_operations(struct pernet_operations *ops) +{ + + __unregister_pernet_operations(ops); + rcu_barrier(); + if (ops->id) + ida_remove(&net_generic_ids, *ops->id); +} + +/** + * register_pernet_subsys - register a network namespace subsystem + * @ops: pernet operations structure for the subsystem + * + * Register a subsystem which has init and exit functions + * that are called when network namespaces are created and + * destroyed respectively. + * + * When registered all network namespace init functions are + * called for every existing network namespace. Allowing kernel + * modules to have a race free view of the set of network namespaces. + * + * When a new network namespace is created all of the init + * methods are called in the order in which they were registered. + * + * When a network namespace is destroyed all of the exit methods + * are called in the reverse of the order with which they were + * registered. + */ +int register_pernet_subsys(struct pernet_operations *ops) +{ + int error; + mutex_lock(&net_mutex); + error = register_pernet_operations(first_device, ops); + mutex_unlock(&net_mutex); + return error; +} +EXPORT_SYMBOL_GPL(register_pernet_subsys); + +/** + * unregister_pernet_subsys - unregister a network namespace subsystem + * @ops: pernet operations structure to manipulate + * + * Remove the pernet operations structure from the list to be + * used when network namespaces are created or destroyed. In + * addition run the exit method for all existing network + * namespaces. + */ +void unregister_pernet_subsys(struct pernet_operations *ops) +{ + mutex_lock(&net_mutex); + unregister_pernet_operations(ops); + mutex_unlock(&net_mutex); +} +EXPORT_SYMBOL_GPL(unregister_pernet_subsys); + +/** + * register_pernet_device - register a network namespace device + * @ops: pernet operations structure for the subsystem + * + * Register a device which has init and exit functions + * that are called when network namespaces are created and + * destroyed respectively. + * + * When registered all network namespace init functions are + * called for every existing network namespace. Allowing kernel + * modules to have a race free view of the set of network namespaces. + * + * When a new network namespace is created all of the init + * methods are called in the order in which they were registered. + * + * When a network namespace is destroyed all of the exit methods + * are called in the reverse of the order with which they were + * registered. 
+ */ +int register_pernet_device(struct pernet_operations *ops) +{ + int error; + mutex_lock(&net_mutex); + error = register_pernet_operations(&pernet_list, ops); + if (!error && (first_device == &pernet_list)) + first_device = &ops->list; + mutex_unlock(&net_mutex); + return error; +} +EXPORT_SYMBOL_GPL(register_pernet_device); + +/** + * unregister_pernet_device - unregister a network namespace netdevice + * @ops: pernet operations structure to manipulate + * + * Remove the pernet operations structure from the list to be + * used when network namespaces are created or destroyed. In + * addition run the exit method for all existing network + * namespaces. + */ +void unregister_pernet_device(struct pernet_operations *ops) +{ + mutex_lock(&net_mutex); + if (&ops->list == first_device) + first_device = first_device->next; + unregister_pernet_operations(ops); + mutex_unlock(&net_mutex); +} +EXPORT_SYMBOL_GPL(unregister_pernet_device); + +#ifdef CONFIG_NET_NS +static void *netns_get(struct task_struct *task) +{ + struct net *net = NULL; + struct nsproxy *nsproxy; + + rcu_read_lock(); + nsproxy = task_nsproxy(task); + if (nsproxy) + net = get_net(nsproxy->net_ns); + rcu_read_unlock(); + + return net; +} + +static void netns_put(void *ns) +{ + put_net(ns); +} + +static int netns_install(struct nsproxy *nsproxy, void *ns) +{ + put_net(nsproxy->net_ns); + nsproxy->net_ns = get_net(ns); + return 0; +} + +const struct proc_ns_operations netns_operations = { + .name = "net", + .type = CLONE_NEWNET, + .get = netns_get, + .put = netns_put, + .install = netns_install, +}; +#endif diff --git a/net/core/netevent.c b/net/core/netevent.c new file mode 100644 index 00000000..865f0ceb --- /dev/null +++ b/net/core/netevent.c @@ -0,0 +1,69 @@ +/* + * Network event notifiers + * + * Authors: + * Tom Tucker + * Steve Wise + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + */ + +#include +#include +#include + +static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain); + +/** + * register_netevent_notifier - register a netevent notifier block + * @nb: notifier + * + * Register a notifier to be called when a netevent occurs. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. + */ +int register_netevent_notifier(struct notifier_block *nb) +{ + int err; + + err = atomic_notifier_chain_register(&netevent_notif_chain, nb); + return err; +} +EXPORT_SYMBOL_GPL(register_netevent_notifier); + +/** + * netevent_unregister_notifier - unregister a netevent notifier block + * @nb: notifier + * + * Unregister a notifier previously registered by + * register_neigh_notifier(). The notifier is unlinked into the + * kernel structures and may then be reused. A negative errno code + * is returned on a failure. + */ + +int unregister_netevent_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&netevent_notif_chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_netevent_notifier); + +/** + * call_netevent_notifiers - call all netevent notifier blocks + * @val: value passed unmodified to notifier function + * @v: pointer passed unmodified to notifier function + * + * Call all neighbour notifier blocks. Parameters and return value + * are as for notifier_call_chain(). 
+ */ + +int call_netevent_notifiers(unsigned long val, void *v) +{ + return atomic_notifier_call_chain(&netevent_notif_chain, val, v); +} +EXPORT_SYMBOL_GPL(call_netevent_notifiers); diff --git a/net/core/netpoll.c b/net/core/netpoll.c new file mode 100644 index 00000000..05db410f --- /dev/null +++ b/net/core/netpoll.c @@ -0,0 +1,954 @@ +/* + * Common framework for low-level network console, dump, and debugger code + * + * Sep 8 2003 Matt Mackall + * + * based on the netconsole code from: + * + * Copyright (C) 2001 Ingo Molnar + * Copyright (C) 2002 Red Hat, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * We maintain a small pool of fully-sized skbs, to make sure the + * message gets out even in extreme OOM situations. + */ + +#define MAX_UDP_CHUNK 1460 +#define MAX_SKBS 32 + +static struct sk_buff_head skb_pool; + +static atomic_t trapped; + +#define USEC_PER_POLL 50 +#define NETPOLL_RX_ENABLED 1 +#define NETPOLL_RX_DROP 2 + +#define MAX_SKB_SIZE \ + (MAX_UDP_CHUNK + sizeof(struct udphdr) + \ + sizeof(struct iphdr) + sizeof(struct ethhdr)) + +static void zap_completion_queue(void); +static void arp_reply(struct sk_buff *skb); + +static unsigned int carrier_timeout = 4; +module_param(carrier_timeout, uint, 0644); + +static void queue_process(struct work_struct *work) +{ + struct netpoll_info *npinfo = + container_of(work, struct netpoll_info, tx_work.work); + struct sk_buff *skb; + unsigned long flags; + + while ((skb = skb_dequeue(&npinfo->txq))) { + struct net_device *dev = skb->dev; + const struct net_device_ops *ops = dev->netdev_ops; + struct netdev_queue *txq; + + if (!netif_device_present(dev) || !netif_running(dev)) { + __kfree_skb(skb); + continue; + } + + txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + + local_irq_save(flags); + __netif_tx_lock(txq, smp_processor_id()); + if (netif_tx_queue_frozen_or_stopped(txq) || + ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) { + skb_queue_head(&npinfo->txq, skb); + __netif_tx_unlock(txq); + local_irq_restore(flags); + + schedule_delayed_work(&npinfo->tx_work, HZ/10); + return; + } + __netif_tx_unlock(txq); + local_irq_restore(flags); + } +} + +static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh, + unsigned short ulen, __be32 saddr, __be32 daddr) +{ + __wsum psum; + + if (uh->check == 0 || skb_csum_unnecessary(skb)) + return 0; + + psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); + + if (skb->ip_summed == CHECKSUM_COMPLETE && + !csum_fold(csum_add(psum, skb->csum))) + return 0; + + skb->csum = psum; + + return __skb_checksum_complete(skb); +} + +/* + * Check whether delayed processing was scheduled for our NIC. If so, + * we attempt to grab the poll lock and use ->poll() to pump the card. + * If this fails, either we've recursed in ->poll() or it's already + * running on another CPU. + * + * Note: we don't mask interrupts with this lock because we're using + * trylock here and interrupts are already disabled in the softirq + * case. Further, we test the poll_owner to avoid recursion on UP + * systems where the lock doesn't exist. + * + * In cases where there is bi-directional communications, reading only + * one message at a time can lead to packets being dropped by the + * network adapter, forcing superfluous retries and possibly timeouts. + * Thus, we set our budget to greater than 1. 
+ */ +static int poll_one_napi(struct netpoll_info *npinfo, + struct napi_struct *napi, int budget) +{ + int work; + + /* net_rx_action's ->poll() invocations and our's are + * synchronized by this test which is only made while + * holding the napi->poll_lock. + */ + if (!test_bit(NAPI_STATE_SCHED, &napi->state)) + return budget; + + npinfo->rx_flags |= NETPOLL_RX_DROP; + atomic_inc(&trapped); + set_bit(NAPI_STATE_NPSVC, &napi->state); + + work = napi->poll(napi, budget); + trace_napi_poll(napi); + + clear_bit(NAPI_STATE_NPSVC, &napi->state); + atomic_dec(&trapped); + npinfo->rx_flags &= ~NETPOLL_RX_DROP; + + return budget - work; +} + +static void poll_napi(struct net_device *dev) +{ + struct napi_struct *napi; + int budget = 16; + + list_for_each_entry(napi, &dev->napi_list, dev_list) { + if (napi->poll_owner != smp_processor_id() && + spin_trylock(&napi->poll_lock)) { + budget = poll_one_napi(dev->npinfo, napi, budget); + spin_unlock(&napi->poll_lock); + + if (!budget) + break; + } + } +} + +static void service_arp_queue(struct netpoll_info *npi) +{ + if (npi) { + struct sk_buff *skb; + + while ((skb = skb_dequeue(&npi->arp_tx))) + arp_reply(skb); + } +} + +void netpoll_poll_dev(struct net_device *dev) +{ + const struct net_device_ops *ops; + + if (!dev || !netif_running(dev)) + return; + + ops = dev->netdev_ops; + if (!ops->ndo_poll_controller) + return; + + /* Process pending work on NIC */ + ops->ndo_poll_controller(dev); + + poll_napi(dev); + + if (dev->flags & IFF_SLAVE) { + if (dev->npinfo) { + struct net_device *bond_dev = dev->master; + struct sk_buff *skb; + while ((skb = skb_dequeue(&dev->npinfo->arp_tx))) { + skb->dev = bond_dev; + skb_queue_tail(&bond_dev->npinfo->arp_tx, skb); + } + } + } + + service_arp_queue(dev->npinfo); + + zap_completion_queue(); +} +EXPORT_SYMBOL(netpoll_poll_dev); + +void netpoll_poll(struct netpoll *np) +{ + netpoll_poll_dev(np->dev); +} +EXPORT_SYMBOL(netpoll_poll); + +static void refill_skbs(void) +{ + struct sk_buff *skb; + unsigned long flags; + + spin_lock_irqsave(&skb_pool.lock, flags); + while (skb_pool.qlen < MAX_SKBS) { + skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); + if (!skb) + break; + + __skb_queue_tail(&skb_pool, skb); + } + spin_unlock_irqrestore(&skb_pool.lock, flags); +} + +static void zap_completion_queue(void) +{ + unsigned long flags; + struct softnet_data *sd = &get_cpu_var(softnet_data); + + if (sd->completion_queue) { + struct sk_buff *clist; + + local_irq_save(flags); + clist = sd->completion_queue; + sd->completion_queue = NULL; + local_irq_restore(flags); + + while (clist != NULL) { + struct sk_buff *skb = clist; + clist = clist->next; + if (skb->destructor) { + atomic_inc(&skb->users); + dev_kfree_skb_any(skb); /* put this one back */ + } else { + __kfree_skb(skb); + } + } + } + + put_cpu_var(softnet_data); +} + +static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve) +{ + int count = 0; + struct sk_buff *skb; + + zap_completion_queue(); + refill_skbs(); +repeat: + + skb = alloc_skb(len, GFP_ATOMIC); + if (!skb) + skb = skb_dequeue(&skb_pool); + + if (!skb) { + if (++count < 10) { + netpoll_poll(np); + goto repeat; + } + return NULL; + } + + atomic_set(&skb->users, 1); + skb_reserve(skb, reserve); + return skb; +} + +static int netpoll_owner_active(struct net_device *dev) +{ + struct napi_struct *napi; + + list_for_each_entry(napi, &dev->napi_list, dev_list) { + if (napi->poll_owner == smp_processor_id()) + return 1; + } + return 0; +} + +void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, 
+ struct net_device *dev) +{ + int status = NETDEV_TX_BUSY; + unsigned long tries; + const struct net_device_ops *ops = dev->netdev_ops; + /* It is up to the caller to keep npinfo alive. */ + struct netpoll_info *npinfo = np->dev->npinfo; + + if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { + __kfree_skb(skb); + return; + } + + /* don't get messages out of order, and no recursion */ + if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) { + struct netdev_queue *txq; + unsigned long flags; + + txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + + local_irq_save(flags); + /* try until next clock tick */ + for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; + tries > 0; --tries) { + if (__netif_tx_trylock(txq)) { + if (!netif_tx_queue_stopped(txq)) { + status = ops->ndo_start_xmit(skb, dev); + if (status == NETDEV_TX_OK) + txq_trans_update(txq); + } + __netif_tx_unlock(txq); + + if (status == NETDEV_TX_OK) + break; + + } + + /* tickle device maybe there is some cleanup */ + netpoll_poll(np); + + udelay(USEC_PER_POLL); + } + + WARN_ONCE(!irqs_disabled(), + "netpoll_send_skb(): %s enabled interrupts in poll (%pF)\n", + dev->name, ops->ndo_start_xmit); + + local_irq_restore(flags); + } + + if (status != NETDEV_TX_OK) { + skb_queue_tail(&npinfo->txq, skb); + schedule_delayed_work(&npinfo->tx_work,0); + } +} +EXPORT_SYMBOL(netpoll_send_skb_on_dev); + +void netpoll_send_udp(struct netpoll *np, const char *msg, int len) +{ + int total_len, eth_len, ip_len, udp_len; + struct sk_buff *skb; + struct udphdr *udph; + struct iphdr *iph; + struct ethhdr *eth; + + udp_len = len + sizeof(*udph); + ip_len = eth_len = udp_len + sizeof(*iph); + total_len = eth_len + ETH_HLEN + NET_IP_ALIGN; + + skb = find_skb(np, total_len, total_len - len); + if (!skb) + return; + + skb_copy_to_linear_data(skb, msg, len); + skb->len += len; + + skb_push(skb, sizeof(*udph)); + skb_reset_transport_header(skb); + udph = udp_hdr(skb); + udph->source = htons(np->local_port); + udph->dest = htons(np->remote_port); + udph->len = htons(udp_len); + udph->check = 0; + udph->check = csum_tcpudp_magic(np->local_ip, + np->remote_ip, + udp_len, IPPROTO_UDP, + csum_partial(udph, udp_len, 0)); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + + skb_push(skb, sizeof(*iph)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + + /* iph->version = 4; iph->ihl = 5; */ + put_unaligned(0x45, (unsigned char *)iph); + iph->tos = 0; + put_unaligned(htons(ip_len), &(iph->tot_len)); + iph->id = 0; + iph->frag_off = 0; + iph->ttl = 64; + iph->protocol = IPPROTO_UDP; + iph->check = 0; + put_unaligned(np->local_ip, &(iph->saddr)); + put_unaligned(np->remote_ip, &(iph->daddr)); + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + + eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + skb->protocol = eth->h_proto = htons(ETH_P_IP); + memcpy(eth->h_source, np->dev->dev_addr, ETH_ALEN); + memcpy(eth->h_dest, np->remote_mac, ETH_ALEN); + + skb->dev = np->dev; + + netpoll_send_skb(np, skb); +} +EXPORT_SYMBOL(netpoll_send_udp); + +static void arp_reply(struct sk_buff *skb) +{ + struct netpoll_info *npinfo = skb->dev->npinfo; + struct arphdr *arp; + unsigned char *arp_ptr; + int size, type = ARPOP_REPLY, ptype = ETH_P_ARP; + __be32 sip, tip; + unsigned char *sha; + struct sk_buff *send_skb; + struct netpoll *np, *tmp; + unsigned long flags; + int hits = 0; + + if (list_empty(&npinfo->rx_np)) + return; + + /* Before checking the packet, we do some early + inspection whether this 
is interesting at all */ + spin_lock_irqsave(&npinfo->rx_lock, flags); + list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { + if (np->dev == skb->dev) + hits++; + } + spin_unlock_irqrestore(&npinfo->rx_lock, flags); + + /* No netpoll struct is using this dev */ + if (!hits) + return; + + /* No arp on this interface */ + if (skb->dev->flags & IFF_NOARP) + return; + + if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) + return; + + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + arp = arp_hdr(skb); + + if ((arp->ar_hrd != htons(ARPHRD_ETHER) && + arp->ar_hrd != htons(ARPHRD_IEEE802)) || + arp->ar_pro != htons(ETH_P_IP) || + arp->ar_op != htons(ARPOP_REQUEST)) + return; + + arp_ptr = (unsigned char *)(arp+1); + /* save the location of the src hw addr */ + sha = arp_ptr; + arp_ptr += skb->dev->addr_len; + memcpy(&sip, arp_ptr, 4); + arp_ptr += 4; + /* If we actually cared about dst hw addr, + it would get copied here */ + arp_ptr += skb->dev->addr_len; + memcpy(&tip, arp_ptr, 4); + + /* Should we ignore arp? */ + if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) + return; + + size = arp_hdr_len(skb->dev); + + spin_lock_irqsave(&npinfo->rx_lock, flags); + list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { + if (tip != np->local_ip) + continue; + + send_skb = find_skb(np, size + LL_ALLOCATED_SPACE(np->dev), + LL_RESERVED_SPACE(np->dev)); + if (!send_skb) + continue; + + skb_reset_network_header(send_skb); + arp = (struct arphdr *) skb_put(send_skb, size); + send_skb->dev = skb->dev; + send_skb->protocol = htons(ETH_P_ARP); + + /* Fill the device header for the ARP frame */ + if (dev_hard_header(send_skb, skb->dev, ptype, + sha, np->dev->dev_addr, + send_skb->len) < 0) { + kfree_skb(send_skb); + continue; + } + + /* + * Fill out the arp protocol part. + * + * we only support ethernet device type, + * which (according to RFC 1390) should + * always equal 1 (Ethernet). 
+ */ + + arp->ar_hrd = htons(np->dev->type); + arp->ar_pro = htons(ETH_P_IP); + arp->ar_hln = np->dev->addr_len; + arp->ar_pln = 4; + arp->ar_op = htons(type); + + arp_ptr = (unsigned char *)(arp + 1); + memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len); + arp_ptr += np->dev->addr_len; + memcpy(arp_ptr, &tip, 4); + arp_ptr += 4; + memcpy(arp_ptr, sha, np->dev->addr_len); + arp_ptr += np->dev->addr_len; + memcpy(arp_ptr, &sip, 4); + + netpoll_send_skb(np, send_skb); + + /* If there are several rx_hooks for the same address, + we're fine by sending a single reply */ + break; + } + spin_unlock_irqrestore(&npinfo->rx_lock, flags); +} + +int __netpoll_rx(struct sk_buff *skb) +{ + int proto, len, ulen; + int hits = 0; + const struct iphdr *iph; + struct udphdr *uh; + struct netpoll_info *npinfo = skb->dev->npinfo; + struct netpoll *np, *tmp; + + if (list_empty(&npinfo->rx_np)) + goto out; + + if (skb->dev->type != ARPHRD_ETHER) + goto out; + + /* check if netpoll clients need ARP */ + if (skb->protocol == htons(ETH_P_ARP) && + atomic_read(&trapped)) { + skb_queue_tail(&npinfo->arp_tx, skb); + return 1; + } + + proto = ntohs(eth_hdr(skb)->h_proto); + if (proto != ETH_P_IP) + goto out; + if (skb->pkt_type == PACKET_OTHERHOST) + goto out; + if (skb_shared(skb)) + goto out; + + iph = (struct iphdr *)skb->data; + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto out; + if (iph->ihl < 5 || iph->version != 4) + goto out; + if (!pskb_may_pull(skb, iph->ihl*4)) + goto out; + if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) + goto out; + + len = ntohs(iph->tot_len); + if (skb->len < len || len < iph->ihl*4) + goto out; + + /* + * Our transport medium may have padded the buffer out. + * Now We trim to the true length of the frame. + */ + if (pskb_trim_rcsum(skb, len)) + goto out; + + if (iph->protocol != IPPROTO_UDP) + goto out; + + len -= iph->ihl*4; + uh = (struct udphdr *)(((char *)iph) + iph->ihl*4); + ulen = ntohs(uh->len); + + if (ulen != len) + goto out; + if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr)) + goto out; + + list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { + if (np->local_ip && np->local_ip != iph->daddr) + continue; + if (np->remote_ip && np->remote_ip != iph->saddr) + continue; + if (np->local_port && np->local_port != ntohs(uh->dest)) + continue; + + np->rx_hook(np, ntohs(uh->source), + (char *)(uh+1), + ulen - sizeof(struct udphdr)); + hits++; + } + + if (!hits) + goto out; + + kfree_skb(skb); + return 1; + +out: + if (atomic_read(&trapped)) { + kfree_skb(skb); + return 1; + } + + return 0; +} + +void netpoll_print_options(struct netpoll *np) +{ + printk(KERN_INFO "%s: local port %d\n", + np->name, np->local_port); + printk(KERN_INFO "%s: local IP %pI4\n", + np->name, &np->local_ip); + printk(KERN_INFO "%s: interface '%s'\n", + np->name, np->dev_name); + printk(KERN_INFO "%s: remote port %d\n", + np->name, np->remote_port); + printk(KERN_INFO "%s: remote IP %pI4\n", + np->name, &np->remote_ip); + printk(KERN_INFO "%s: remote ethernet address %pM\n", + np->name, np->remote_mac); +} +EXPORT_SYMBOL(netpoll_print_options); + +int netpoll_parse_options(struct netpoll *np, char *opt) +{ + char *cur=opt, *delim; + + if (*cur != '@') { + if ((delim = strchr(cur, '@')) == NULL) + goto parse_failed; + *delim = 0; + np->local_port = simple_strtol(cur, NULL, 10); + cur = delim; + } + cur++; + + if (*cur != '/') { + if ((delim = strchr(cur, '/')) == NULL) + goto parse_failed; + *delim = 0; + np->local_ip = in_aton(cur); + cur = delim; + } + cur++; + + if (*cur != ',') { + /* 
parse out dev name */ + if ((delim = strchr(cur, ',')) == NULL) + goto parse_failed; + *delim = 0; + strlcpy(np->dev_name, cur, sizeof(np->dev_name)); + cur = delim; + } + cur++; + + if (*cur != '@') { + /* dst port */ + if ((delim = strchr(cur, '@')) == NULL) + goto parse_failed; + *delim = 0; + if (*cur == ' ' || *cur == '\t') + printk(KERN_INFO "%s: warning: whitespace" + "is not allowed\n", np->name); + np->remote_port = simple_strtol(cur, NULL, 10); + cur = delim; + } + cur++; + + /* dst ip */ + if ((delim = strchr(cur, '/')) == NULL) + goto parse_failed; + *delim = 0; + np->remote_ip = in_aton(cur); + cur = delim + 1; + + if (*cur != 0) { + /* MAC address */ + if (!mac_pton(cur, np->remote_mac)) + goto parse_failed; + } + + netpoll_print_options(np); + + return 0; + + parse_failed: + printk(KERN_INFO "%s: couldn't parse config at '%s'!\n", + np->name, cur); + return -1; +} +EXPORT_SYMBOL(netpoll_parse_options); + +int __netpoll_setup(struct netpoll *np) +{ + struct net_device *ndev = np->dev; + struct netpoll_info *npinfo; + const struct net_device_ops *ops; + unsigned long flags; + int err; + + if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) || + !ndev->netdev_ops->ndo_poll_controller) { + printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n", + np->name, np->dev_name); + err = -ENOTSUPP; + goto out; + } + + if (!ndev->npinfo) { + npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); + if (!npinfo) { + err = -ENOMEM; + goto out; + } + + npinfo->rx_flags = 0; + INIT_LIST_HEAD(&npinfo->rx_np); + + spin_lock_init(&npinfo->rx_lock); + skb_queue_head_init(&npinfo->arp_tx); + skb_queue_head_init(&npinfo->txq); + INIT_DELAYED_WORK(&npinfo->tx_work, queue_process); + + atomic_set(&npinfo->refcnt, 1); + + ops = np->dev->netdev_ops; + if (ops->ndo_netpoll_setup) { + err = ops->ndo_netpoll_setup(ndev, npinfo); + if (err) + goto free_npinfo; + } + } else { + npinfo = ndev->npinfo; + atomic_inc(&npinfo->refcnt); + } + + npinfo->netpoll = np; + + if (np->rx_hook) { + spin_lock_irqsave(&npinfo->rx_lock, flags); + npinfo->rx_flags |= NETPOLL_RX_ENABLED; + list_add_tail(&np->rx, &npinfo->rx_np); + spin_unlock_irqrestore(&npinfo->rx_lock, flags); + } + + /* last thing to do is link it to the net device structure */ + rcu_assign_pointer(ndev->npinfo, npinfo); + + return 0; + +free_npinfo: + kfree(npinfo); +out: + return err; +} +EXPORT_SYMBOL_GPL(__netpoll_setup); + +int netpoll_setup(struct netpoll *np) +{ + struct net_device *ndev = NULL; + struct in_device *in_dev; + int err; + + if (np->dev_name) + ndev = dev_get_by_name(&init_net, np->dev_name); + if (!ndev) { + printk(KERN_ERR "%s: %s doesn't exist, aborting.\n", + np->name, np->dev_name); + return -ENODEV; + } + + if (ndev->master) { + printk(KERN_ERR "%s: %s is a slave device, aborting.\n", + np->name, np->dev_name); + err = -EBUSY; + goto put; + } + + if (!netif_running(ndev)) { + unsigned long atmost, atleast; + + printk(KERN_INFO "%s: device %s not up yet, forcing it\n", + np->name, np->dev_name); + + rtnl_lock(); + err = dev_open(ndev); + rtnl_unlock(); + + if (err) { + printk(KERN_ERR "%s: failed to open %s\n", + np->name, ndev->name); + goto put; + } + + atleast = jiffies + HZ/10; + atmost = jiffies + carrier_timeout * HZ; + while (!netif_carrier_ok(ndev)) { + if (time_after(jiffies, atmost)) { + printk(KERN_NOTICE + "%s: timeout waiting for carrier\n", + np->name); + break; + } + msleep(1); + } + + /* If carrier appears to come up instantly, we don't + * trust it and pause so that we don't pump all our + * queued console messages into the 
bitbucket. + */ + + if (time_before(jiffies, atleast)) { + printk(KERN_NOTICE "%s: carrier detect appears" + " untrustworthy, waiting 4 seconds\n", + np->name); + msleep(4000); + } + } + + if (!np->local_ip) { + rcu_read_lock(); + in_dev = __in_dev_get_rcu(ndev); + + if (!in_dev || !in_dev->ifa_list) { + rcu_read_unlock(); + printk(KERN_ERR "%s: no IP address for %s, aborting\n", + np->name, np->dev_name); + err = -EDESTADDRREQ; + goto put; + } + + np->local_ip = in_dev->ifa_list->ifa_local; + rcu_read_unlock(); + printk(KERN_INFO "%s: local IP %pI4\n", np->name, &np->local_ip); + } + + np->dev = ndev; + + /* fill up the skb queue */ + refill_skbs(); + + rtnl_lock(); + err = __netpoll_setup(np); + rtnl_unlock(); + + if (err) + goto put; + + return 0; + +put: + dev_put(ndev); + return err; +} +EXPORT_SYMBOL(netpoll_setup); + +static int __init netpoll_init(void) +{ + skb_queue_head_init(&skb_pool); + return 0; +} +core_initcall(netpoll_init); + +void __netpoll_cleanup(struct netpoll *np) +{ + struct netpoll_info *npinfo; + unsigned long flags; + + npinfo = np->dev->npinfo; + if (!npinfo) + return; + + if (!list_empty(&npinfo->rx_np)) { + spin_lock_irqsave(&npinfo->rx_lock, flags); + list_del(&np->rx); + if (list_empty(&npinfo->rx_np)) + npinfo->rx_flags &= ~NETPOLL_RX_ENABLED; + spin_unlock_irqrestore(&npinfo->rx_lock, flags); + } + + if (atomic_dec_and_test(&npinfo->refcnt)) { + const struct net_device_ops *ops; + + ops = np->dev->netdev_ops; + if (ops->ndo_netpoll_cleanup) + ops->ndo_netpoll_cleanup(np->dev); + + rcu_assign_pointer(np->dev->npinfo, NULL); + + /* avoid racing with NAPI reading npinfo */ + synchronize_rcu_bh(); + + skb_queue_purge(&npinfo->arp_tx); + skb_queue_purge(&npinfo->txq); + cancel_delayed_work_sync(&npinfo->tx_work); + + /* clean after last, unfinished work */ + __skb_queue_purge(&npinfo->txq); + kfree(npinfo); + } +} +EXPORT_SYMBOL_GPL(__netpoll_cleanup); + +void netpoll_cleanup(struct netpoll *np) +{ + if (!np->dev) + return; + + rtnl_lock(); + __netpoll_cleanup(np); + rtnl_unlock(); + + dev_put(np->dev); + np->dev = NULL; +} +EXPORT_SYMBOL(netpoll_cleanup); + +int netpoll_trap(void) +{ + return atomic_read(&trapped); +} +EXPORT_SYMBOL(netpoll_trap); + +void netpoll_set_trap(int trap) +{ + if (trap) + atomic_inc(&trapped); + else + atomic_dec(&trapped); +} +EXPORT_SYMBOL(netpoll_set_trap); diff --git a/net/core/pktgen.c b/net/core/pktgen.c new file mode 100644 index 00000000..c0e0f767 --- /dev/null +++ b/net/core/pktgen.c @@ -0,0 +1,3796 @@ +/* + * Authors: + * Copyright 2001, 2002 by Robert Olsson + * Uppsala University and + * Swedish University of Agricultural Sciences + * + * Alexey Kuznetsov + * Ben Greear + * Jens Låås + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * + * A tool for loading the network with preconfigurated packets. + * The tool is implemented as a linux module. Parameters are output + * device, delay (to hard_xmit), number of packets, and whether + * to use multiple SKBs or just the same one. + * pktgen uses the installed interface's output routine. + * + * Additional hacking by: + * + * Jens.Laas@data.slu.se + * Improved by ANK. 010120. + * Improved by ANK even more. 010212. + * MAC address typo fixed. 010417 --ro + * Integrated. 020301 --DaveM + * Added multiskb option 020301 --DaveM + * Scaling of results. 
020417--sigurdur@linpro.no + * Significant re-work of the module: + * * Convert to threaded model to more efficiently be able to transmit + * and receive on multiple interfaces at once. + * * Converted many counters to __u64 to allow longer runs. + * * Allow configuration of ranges, like min/max IP address, MACs, + * and UDP-ports, for both source and destination, and can + * set to use a random distribution or sequentially walk the range. + * * Can now change most values after starting. + * * Place 12-byte packet in UDP payload with magic number, + * sequence number, and timestamp. + * * Add receiver code that detects dropped pkts, re-ordered pkts, and + * latencies (with micro-second) precision. + * * Add IOCTL interface to easily get counters & configuration. + * --Ben Greear + * + * Renamed multiskb to clone_skb and cleaned up sending core for two distinct + * skb modes. A clone_skb=0 mode for Ben "ranges" work and a clone_skb != 0 + * as a "fastpath" with a configurable number of clones after alloc's. + * clone_skb=0 means all packets are allocated this also means ranges time + * stamps etc can be used. clone_skb=100 means 1 malloc is followed by 100 + * clones. + * + * Also moved to /proc/net/pktgen/ + * --ro + * + * Sept 10: Fixed threading/locking. Lots of bone-headed and more clever + * mistakes. Also merged in DaveM's patch in the -pre6 patch. + * --Ben Greear + * + * Integrated to 2.5.x 021029 --Lucio Maciel (luciomaciel@zipmail.com.br) + * + * + * 021124 Finished major redesign and rewrite for new functionality. + * See Documentation/networking/pktgen.txt for how to use this. + * + * The new operation: + * For each CPU one thread/process is created at start. This process checks + * for running devices in the if_list and sends packets until count is 0 it + * also the thread checks the thread->control which is used for inter-process + * communication. controlling process "posts" operations to the threads this + * way. The if_lock should be possible to remove when add/rem_device is merged + * into this too. + * + * By design there should only be *one* "controlling" process. In practice + * multiple write accesses gives unpredictable result. Understood by "write" + * to /proc gives result code thats should be read be the "writer". + * For practical use this should be no problem. + * + * Note when adding devices to a specific CPU there good idea to also assign + * /proc/irq/XX/smp_affinity so TX-interrupts gets bound to the same CPU. + * --ro + * + * Fix refcount off by one if first packet fails, potential null deref, + * memleak 030710- KJP + * + * First "ranges" functionality for ipv6 030726 --ro + * + * Included flow support. 030802 ANK. + * + * Fixed unaligned access on IA-64 Grant Grundler + * + * Remove if fix from added Harald Welte 040419 + * ia64 compilation fix from Aron Griffis 040604 + * + * New xmit() return, do_div and misc clean up by Stephen Hemminger + * 040923 + * + * Randy Dunlap fixed u64 printk compiler waring + * + * Remove FCS from BW calculation. Lennert Buytenhek + * New time handling. 
Lennert Buytenhek 041213 + * + * Corrections from Nikolai Malykh (nmalykh@bilim.com) + * Removed unused flags F_SET_SRCMAC & F_SET_SRCIP 041230 + * + * interruptible_sleep_on_timeout() replaced Nishanth Aravamudan + * 050103 + * + * MPLS support by Steven Whitehouse + * + * 802.1Q/Q-in-Q support by Francesco Fondelli (FF) + * + * Fixed src_mac command to set source mac of packet to value specified in + * command by Adit Ranadive + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_XFRM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include /* do_div */ + +#define VERSION "2.74" +#define IP_NAME_SZ 32 +#define MAX_MPLS_LABELS 16 /* This is the max label stack depth */ +#define MPLS_STACK_BOTTOM htonl(0x00000100) + +#define func_enter() pr_debug("entering %s\n", __func__); + +/* Device flag bits */ +#define F_IPSRC_RND (1<<0) /* IP-Src Random */ +#define F_IPDST_RND (1<<1) /* IP-Dst Random */ +#define F_UDPSRC_RND (1<<2) /* UDP-Src Random */ +#define F_UDPDST_RND (1<<3) /* UDP-Dst Random */ +#define F_MACSRC_RND (1<<4) /* MAC-Src Random */ +#define F_MACDST_RND (1<<5) /* MAC-Dst Random */ +#define F_TXSIZE_RND (1<<6) /* Transmit size is random */ +#define F_IPV6 (1<<7) /* Interface in IPV6 Mode */ +#define F_MPLS_RND (1<<8) /* Random MPLS labels */ +#define F_VID_RND (1<<9) /* Random VLAN ID */ +#define F_SVID_RND (1<<10) /* Random SVLAN ID */ +#define F_FLOW_SEQ (1<<11) /* Sequential flows */ +#define F_IPSEC_ON (1<<12) /* ipsec on for flows */ +#define F_QUEUE_MAP_RND (1<<13) /* queue map Random */ +#define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */ +#define F_NODE (1<<15) /* Node memory alloc*/ + +/* Thread control flag bits */ +#define T_STOP (1<<0) /* Stop run */ +#define T_RUN (1<<1) /* Start run */ +#define T_REMDEVALL (1<<2) /* Remove all devs */ +#define T_REMDEV (1<<3) /* Remove one dev */ + +/* If lock -- can be removed after some work */ +#define if_lock(t) spin_lock(&(t->if_lock)); +#define if_unlock(t) spin_unlock(&(t->if_lock)); + +/* Used to help with determining the pkts on receive */ +#define PKTGEN_MAGIC 0xbe9be955 +#define PG_PROC_DIR "pktgen" +#define PGCTRL "pgctrl" +static struct proc_dir_entry *pg_proc_dir; + +#define MAX_CFLOWS 65536 + +#define VLAN_TAG_SIZE(x) ((x)->vlan_id == 0xffff ? 0 : 4) +#define SVLAN_TAG_SIZE(x) ((x)->svlan_id == 0xffff ? 0 : 4) + +struct flow_state { + __be32 cur_daddr; + int count; +#ifdef CONFIG_XFRM + struct xfrm_state *x; +#endif + __u32 flags; +}; + +/* flow flag bits */ +#define F_INIT (1<<0) /* flow has been initialized */ + +struct pktgen_dev { + /* + * Try to keep frequent/infrequent used vars. separated. + */ + struct proc_dir_entry *entry; /* proc file */ + struct pktgen_thread *pg_thread;/* the owner */ + struct list_head list; /* chaining in the thread's run-queue */ + + int running; /* if false, the test will stop */ + + /* If min != max, then we will either do a linear iteration, or + * we will do a random selection from within the range. 
+ */ + __u32 flags; + int removal_mark; /* non-zero => the device is marked for + * removal by worker thread */ + + int min_pkt_size; /* = ETH_ZLEN; */ + int max_pkt_size; /* = ETH_ZLEN; */ + int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ + int nfrags; + struct page *page; + u64 delay; /* nano-seconds */ + + __u64 count; /* Default No packets to send */ + __u64 sofar; /* How many pkts we've sent so far */ + __u64 tx_bytes; /* How many bytes we've transmitted */ + __u64 errors; /* Errors when trying to transmit, */ + + /* runtime counters relating to clone_skb */ + + __u64 allocated_skbs; + __u32 clone_count; + int last_ok; /* Was last skb sent? + * Or a failed transmit of some sort? + * This will keep sequence numbers in order + */ + ktime_t next_tx; + ktime_t started_at; + ktime_t stopped_at; + u64 idle_acc; /* nano-seconds */ + + __u32 seq_num; + + int clone_skb; /* + * Use multiple SKBs during packet gen. + * If this number is greater than 1, then + * that many copies of the same packet will be + * sent before a new packet is allocated. + * If you want to send 1024 identical packets + * before creating a new packet, + * set clone_skb to 1024. + */ + + char dst_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char dst_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char src_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char src_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + + struct in6_addr in6_saddr; + struct in6_addr in6_daddr; + struct in6_addr cur_in6_daddr; + struct in6_addr cur_in6_saddr; + /* For ranges */ + struct in6_addr min_in6_daddr; + struct in6_addr max_in6_daddr; + struct in6_addr min_in6_saddr; + struct in6_addr max_in6_saddr; + + /* If we're doing ranges, random or incremental, then this + * defines the min/max for those ranges. + */ + __be32 saddr_min; /* inclusive, source IP address */ + __be32 saddr_max; /* exclusive, source IP address */ + __be32 daddr_min; /* inclusive, dest IP address */ + __be32 daddr_max; /* exclusive, dest IP address */ + + __u16 udp_src_min; /* inclusive, source UDP port */ + __u16 udp_src_max; /* exclusive, source UDP port */ + __u16 udp_dst_min; /* inclusive, dest UDP port */ + __u16 udp_dst_max; /* exclusive, dest UDP port */ + + /* DSCP + ECN */ + __u8 tos; /* six MSB of (former) IPv4 TOS + are for dscp codepoint */ + __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 + (see RFC 3260, sec. 4) */ + + /* MPLS */ + unsigned nr_labels; /* Depth of stack, 0 = no MPLS */ + __be32 labels[MAX_MPLS_LABELS]; + + /* VLAN/SVLAN (802.1Q/Q-in-Q) */ + __u8 vlan_p; + __u8 vlan_cfi; + __u16 vlan_id; /* 0xffff means no vlan tag */ + + __u8 svlan_p; + __u8 svlan_cfi; + __u16 svlan_id; /* 0xffff means no svlan tag */ + + __u32 src_mac_count; /* How many MACs to iterate through */ + __u32 dst_mac_count; /* How many MACs to iterate through */ + + unsigned char dst_mac[ETH_ALEN]; + unsigned char src_mac[ETH_ALEN]; + + __u32 cur_dst_mac_offset; + __u32 cur_src_mac_offset; + __be32 cur_saddr; + __be32 cur_daddr; + __u16 ip_id; + __u16 cur_udp_dst; + __u16 cur_udp_src; + __u16 cur_queue_map; + __u32 cur_pkt_size; + __u32 last_pkt_size; + + __u8 hh[14]; + /* = { + 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB, + + We fill in SRC address later + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00 + }; + */ + __u16 pad; /* pad out the hh struct to an even 16 bytes */ + + struct sk_buff *skb; /* skb we are to transmit next, used for when we + * are transmitting the same one multiple times + */ + struct net_device *odev; /* The out-going device. 
+ * Note that the device should have it's + * pg_info pointer pointing back to this + * device. + * Set when the user specifies the out-going + * device name (not when the inject is + * started as it used to do.) + */ + char odevname[32]; + struct flow_state *flows; + unsigned cflows; /* Concurrent flows (config) */ + unsigned lflow; /* Flow length (config) */ + unsigned nflows; /* accumulated flows (stats) */ + unsigned curfl; /* current sequenced flow (state)*/ + + u16 queue_map_min; + u16 queue_map_max; + __u32 skb_priority; /* skb priority field */ + int node; /* Memory node */ + +#ifdef CONFIG_XFRM + __u8 ipsmode; /* IPSEC mode (config) */ + __u8 ipsproto; /* IPSEC type (config) */ +#endif + char result[512]; +}; + +struct pktgen_hdr { + __be32 pgh_magic; + __be32 seq_num; + __be32 tv_sec; + __be32 tv_usec; +}; + +static bool pktgen_exiting __read_mostly; + +struct pktgen_thread { + spinlock_t if_lock; /* for list of devices */ + struct list_head if_list; /* All device here */ + struct list_head th_list; + struct task_struct *tsk; + char result[512]; + + /* Field for thread to receive "posted" events terminate, + stop ifs etc. */ + + u32 control; + int cpu; + + wait_queue_head_t queue; + struct completion start_done; +}; + +#define REMOVE 1 +#define FIND 0 + +static inline ktime_t ktime_now(void) +{ + struct timespec ts; + ktime_get_ts(&ts); + + return timespec_to_ktime(ts); +} + +/* This works even if 32 bit because of careful byte order choice */ +static inline int ktime_lt(const ktime_t cmp1, const ktime_t cmp2) +{ + return cmp1.tv64 < cmp2.tv64; +} + +static const char version[] = + "Packet Generator for packet performance testing. " + "Version: " VERSION "\n"; + +static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i); +static int pktgen_add_device(struct pktgen_thread *t, const char *ifname); +static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, + const char *ifname, bool exact); +static int pktgen_device_event(struct notifier_block *, unsigned long, void *); +static void pktgen_run_all_threads(void); +static void pktgen_reset_all_threads(void); +static void pktgen_stop_all_threads_ifs(void); + +static void pktgen_stop(struct pktgen_thread *t); +static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); + +static unsigned int scan_ip6(const char *s, char ip[16]); + +/* Module parameters, defaults. 
*/ +static int pg_count_d __read_mostly = 1000; +static int pg_delay_d __read_mostly; +static int pg_clone_skb_d __read_mostly; +static int debug __read_mostly; + +static DEFINE_MUTEX(pktgen_thread_lock); +static LIST_HEAD(pktgen_threads); + +static struct notifier_block pktgen_notifier_block = { + .notifier_call = pktgen_device_event, +}; + +/* + * /proc handling functions + * + */ + +static int pgctrl_show(struct seq_file *seq, void *v) +{ + seq_puts(seq, version); + return 0; +} + +static ssize_t pgctrl_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + int err = 0; + char data[128]; + + if (!capable(CAP_NET_ADMIN)) { + err = -EPERM; + goto out; + } + + if (count > sizeof(data)) + count = sizeof(data); + + if (copy_from_user(data, buf, count)) { + err = -EFAULT; + goto out; + } + data[count - 1] = 0; /* Make string */ + + if (!strcmp(data, "stop")) + pktgen_stop_all_threads_ifs(); + + else if (!strcmp(data, "start")) + pktgen_run_all_threads(); + + else if (!strcmp(data, "reset")) + pktgen_reset_all_threads(); + + else + pr_warning("Unknown command: %s\n", data); + + err = count; + +out: + return err; +} + +static int pgctrl_open(struct inode *inode, struct file *file) +{ + return single_open(file, pgctrl_show, PDE(inode)->data); +} + +static const struct file_operations pktgen_fops = { + .owner = THIS_MODULE, + .open = pgctrl_open, + .read = seq_read, + .llseek = seq_lseek, + .write = pgctrl_write, + .release = single_release, +}; + +static int pktgen_if_show(struct seq_file *seq, void *v) +{ + const struct pktgen_dev *pkt_dev = seq->private; + ktime_t stopped; + u64 idle; + + seq_printf(seq, + "Params: count %llu min_pkt_size: %u max_pkt_size: %u\n", + (unsigned long long)pkt_dev->count, pkt_dev->min_pkt_size, + pkt_dev->max_pkt_size); + + seq_printf(seq, + " frags: %d delay: %llu clone_skb: %d ifname: %s\n", + pkt_dev->nfrags, (unsigned long long) pkt_dev->delay, + pkt_dev->clone_skb, pkt_dev->odevname); + + seq_printf(seq, " flows: %u flowlen: %u\n", pkt_dev->cflows, + pkt_dev->lflow); + + seq_printf(seq, + " queue_map_min: %u queue_map_max: %u\n", + pkt_dev->queue_map_min, + pkt_dev->queue_map_max); + + if (pkt_dev->skb_priority) + seq_printf(seq, " skb_priority: %u\n", + pkt_dev->skb_priority); + + if (pkt_dev->flags & F_IPV6) { + seq_printf(seq, + " saddr: %pI6c min_saddr: %pI6c max_saddr: %pI6c\n" + " daddr: %pI6c min_daddr: %pI6c max_daddr: %pI6c\n", + &pkt_dev->in6_saddr, + &pkt_dev->min_in6_saddr, &pkt_dev->max_in6_saddr, + &pkt_dev->in6_daddr, + &pkt_dev->min_in6_daddr, &pkt_dev->max_in6_daddr); + } else { + seq_printf(seq, + " dst_min: %s dst_max: %s\n", + pkt_dev->dst_min, pkt_dev->dst_max); + seq_printf(seq, + " src_min: %s src_max: %s\n", + pkt_dev->src_min, pkt_dev->src_max); + } + + seq_puts(seq, " src_mac: "); + + seq_printf(seq, "%pM ", + is_zero_ether_addr(pkt_dev->src_mac) ? + pkt_dev->odev->dev_addr : pkt_dev->src_mac); + + seq_printf(seq, "dst_mac: "); + seq_printf(seq, "%pM\n", pkt_dev->dst_mac); + + seq_printf(seq, + " udp_src_min: %d udp_src_max: %d" + " udp_dst_min: %d udp_dst_max: %d\n", + pkt_dev->udp_src_min, pkt_dev->udp_src_max, + pkt_dev->udp_dst_min, pkt_dev->udp_dst_max); + + seq_printf(seq, + " src_mac_count: %d dst_mac_count: %d\n", + pkt_dev->src_mac_count, pkt_dev->dst_mac_count); + + if (pkt_dev->nr_labels) { + unsigned i; + seq_printf(seq, " mpls: "); + for (i = 0; i < pkt_dev->nr_labels; i++) + seq_printf(seq, "%08x%s", ntohl(pkt_dev->labels[i]), + i == pkt_dev->nr_labels-1 ? 
"\n" : ", "); + } + + if (pkt_dev->vlan_id != 0xffff) + seq_printf(seq, " vlan_id: %u vlan_p: %u vlan_cfi: %u\n", + pkt_dev->vlan_id, pkt_dev->vlan_p, + pkt_dev->vlan_cfi); + + if (pkt_dev->svlan_id != 0xffff) + seq_printf(seq, " svlan_id: %u vlan_p: %u vlan_cfi: %u\n", + pkt_dev->svlan_id, pkt_dev->svlan_p, + pkt_dev->svlan_cfi); + + if (pkt_dev->tos) + seq_printf(seq, " tos: 0x%02x\n", pkt_dev->tos); + + if (pkt_dev->traffic_class) + seq_printf(seq, " traffic_class: 0x%02x\n", pkt_dev->traffic_class); + + if (pkt_dev->node >= 0) + seq_printf(seq, " node: %d\n", pkt_dev->node); + + seq_printf(seq, " Flags: "); + + if (pkt_dev->flags & F_IPV6) + seq_printf(seq, "IPV6 "); + + if (pkt_dev->flags & F_IPSRC_RND) + seq_printf(seq, "IPSRC_RND "); + + if (pkt_dev->flags & F_IPDST_RND) + seq_printf(seq, "IPDST_RND "); + + if (pkt_dev->flags & F_TXSIZE_RND) + seq_printf(seq, "TXSIZE_RND "); + + if (pkt_dev->flags & F_UDPSRC_RND) + seq_printf(seq, "UDPSRC_RND "); + + if (pkt_dev->flags & F_UDPDST_RND) + seq_printf(seq, "UDPDST_RND "); + + if (pkt_dev->flags & F_MPLS_RND) + seq_printf(seq, "MPLS_RND "); + + if (pkt_dev->flags & F_QUEUE_MAP_RND) + seq_printf(seq, "QUEUE_MAP_RND "); + + if (pkt_dev->flags & F_QUEUE_MAP_CPU) + seq_printf(seq, "QUEUE_MAP_CPU "); + + if (pkt_dev->cflows) { + if (pkt_dev->flags & F_FLOW_SEQ) + seq_printf(seq, "FLOW_SEQ "); /*in sequence flows*/ + else + seq_printf(seq, "FLOW_RND "); + } + +#ifdef CONFIG_XFRM + if (pkt_dev->flags & F_IPSEC_ON) + seq_printf(seq, "IPSEC "); +#endif + + if (pkt_dev->flags & F_MACSRC_RND) + seq_printf(seq, "MACSRC_RND "); + + if (pkt_dev->flags & F_MACDST_RND) + seq_printf(seq, "MACDST_RND "); + + if (pkt_dev->flags & F_VID_RND) + seq_printf(seq, "VID_RND "); + + if (pkt_dev->flags & F_SVID_RND) + seq_printf(seq, "SVID_RND "); + + if (pkt_dev->flags & F_NODE) + seq_printf(seq, "NODE_ALLOC "); + + seq_puts(seq, "\n"); + + /* not really stopped, more like last-running-at */ + stopped = pkt_dev->running ? 
ktime_now() : pkt_dev->stopped_at; + idle = pkt_dev->idle_acc; + do_div(idle, NSEC_PER_USEC); + + seq_printf(seq, + "Current:\n pkts-sofar: %llu errors: %llu\n", + (unsigned long long)pkt_dev->sofar, + (unsigned long long)pkt_dev->errors); + + seq_printf(seq, + " started: %lluus stopped: %lluus idle: %lluus\n", + (unsigned long long) ktime_to_us(pkt_dev->started_at), + (unsigned long long) ktime_to_us(stopped), + (unsigned long long) idle); + + seq_printf(seq, + " seq_num: %d cur_dst_mac_offset: %d cur_src_mac_offset: %d\n", + pkt_dev->seq_num, pkt_dev->cur_dst_mac_offset, + pkt_dev->cur_src_mac_offset); + + if (pkt_dev->flags & F_IPV6) { + seq_printf(seq, " cur_saddr: %pI6c cur_daddr: %pI6c\n", + &pkt_dev->cur_in6_saddr, + &pkt_dev->cur_in6_daddr); + } else + seq_printf(seq, " cur_saddr: 0x%x cur_daddr: 0x%x\n", + pkt_dev->cur_saddr, pkt_dev->cur_daddr); + + seq_printf(seq, " cur_udp_dst: %d cur_udp_src: %d\n", + pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src); + + seq_printf(seq, " cur_queue_map: %u\n", pkt_dev->cur_queue_map); + + seq_printf(seq, " flows: %u\n", pkt_dev->nflows); + + if (pkt_dev->result[0]) + seq_printf(seq, "Result: %s\n", pkt_dev->result); + else + seq_printf(seq, "Result: Idle\n"); + + return 0; +} + + +static int hex32_arg(const char __user *user_buffer, unsigned long maxlen, + __u32 *num) +{ + int i = 0; + *num = 0; + + for (; i < maxlen; i++) { + int value; + char c; + *num <<= 4; + if (get_user(c, &user_buffer[i])) + return -EFAULT; + value = hex_to_bin(c); + if (value >= 0) + *num |= value; + else + break; + } + return i; +} + +static int count_trail_chars(const char __user * user_buffer, + unsigned int maxlen) +{ + int i; + + for (i = 0; i < maxlen; i++) { + char c; + if (get_user(c, &user_buffer[i])) + return -EFAULT; + switch (c) { + case '\"': + case '\n': + case '\r': + case '\t': + case ' ': + case '=': + break; + default: + goto done; + } + } +done: + return i; +} + +static unsigned long num_arg(const char __user * user_buffer, + unsigned long maxlen, unsigned long *num) +{ + int i; + *num = 0; + + for (i = 0; i < maxlen; i++) { + char c; + if (get_user(c, &user_buffer[i])) + return -EFAULT; + if ((c >= '0') && (c <= '9')) { + *num *= 10; + *num += c - '0'; + } else + break; + } + return i; +} + +static int strn_len(const char __user * user_buffer, unsigned int maxlen) +{ + int i; + + for (i = 0; i < maxlen; i++) { + char c; + if (get_user(c, &user_buffer[i])) + return -EFAULT; + switch (c) { + case '\"': + case '\n': + case '\r': + case '\t': + case ' ': + goto done_str; + break; + default: + break; + } + } +done_str: + return i; +} + +static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev) +{ + unsigned n = 0; + char c; + ssize_t i = 0; + int len; + + pkt_dev->nr_labels = 0; + do { + __u32 tmp; + len = hex32_arg(&buffer[i], 8, &tmp); + if (len <= 0) + return len; + pkt_dev->labels[n] = htonl(tmp); + if (pkt_dev->labels[n] & MPLS_STACK_BOTTOM) + pkt_dev->flags |= F_MPLS_RND; + i += len; + if (get_user(c, &buffer[i])) + return -EFAULT; + i++; + n++; + if (n >= MAX_MPLS_LABELS) + return -E2BIG; + } while (c == ','); + + pkt_dev->nr_labels = n; + return i; +} + +static ssize_t pktgen_if_write(struct file *file, + const char __user * user_buffer, size_t count, + loff_t * offset) +{ + struct seq_file *seq = file->private_data; + struct pktgen_dev *pkt_dev = seq->private; + int i, max, len; + char name[16], valstr[32]; + unsigned long value = 0; + char *pg_result = NULL; + int tmp = 0; + char buf[128]; + + pg_result = 
&(pkt_dev->result[0]); + + if (count < 1) { + pr_warning("wrong command format\n"); + return -EINVAL; + } + + max = count; + tmp = count_trail_chars(user_buffer, max); + if (tmp < 0) { + pr_warning("illegal format\n"); + return tmp; + } + i = tmp; + + /* Read variable name */ + + len = strn_len(&user_buffer[i], sizeof(name) - 1); + if (len < 0) + return len; + + memset(name, 0, sizeof(name)); + if (copy_from_user(name, &user_buffer[i], len)) + return -EFAULT; + i += len; + + max = count - i; + len = count_trail_chars(&user_buffer[i], max); + if (len < 0) + return len; + + i += len; + + if (debug) { + size_t copy = min_t(size_t, count, 1023); + char tb[copy + 1]; + if (copy_from_user(tb, user_buffer, copy)) + return -EFAULT; + tb[copy] = 0; + printk(KERN_DEBUG "pktgen: %s,%lu buffer -:%s:-\n", name, + (unsigned long)count, tb); + } + + if (!strcmp(name, "min_pkt_size")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value < 14 + 20 + 8) + value = 14 + 20 + 8; + if (value != pkt_dev->min_pkt_size) { + pkt_dev->min_pkt_size = value; + pkt_dev->cur_pkt_size = value; + } + sprintf(pg_result, "OK: min_pkt_size=%u", + pkt_dev->min_pkt_size); + return count; + } + + if (!strcmp(name, "max_pkt_size")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value < 14 + 20 + 8) + value = 14 + 20 + 8; + if (value != pkt_dev->max_pkt_size) { + pkt_dev->max_pkt_size = value; + pkt_dev->cur_pkt_size = value; + } + sprintf(pg_result, "OK: max_pkt_size=%u", + pkt_dev->max_pkt_size); + return count; + } + + /* Shortcut for min = max */ + + if (!strcmp(name, "pkt_size")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value < 14 + 20 + 8) + value = 14 + 20 + 8; + if (value != pkt_dev->min_pkt_size) { + pkt_dev->min_pkt_size = value; + pkt_dev->max_pkt_size = value; + pkt_dev->cur_pkt_size = value; + } + sprintf(pg_result, "OK: pkt_size=%u", pkt_dev->min_pkt_size); + return count; + } + + if (!strcmp(name, "debug")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + debug = value; + sprintf(pg_result, "OK: debug=%u", debug); + return count; + } + + if (!strcmp(name, "frags")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->nfrags = value; + sprintf(pg_result, "OK: frags=%u", pkt_dev->nfrags); + return count; + } + if (!strcmp(name, "delay")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value == 0x7FFFFFFF) + pkt_dev->delay = ULLONG_MAX; + else + pkt_dev->delay = (u64)value; + + sprintf(pg_result, "OK: delay=%llu", + (unsigned long long) pkt_dev->delay); + return count; + } + if (!strcmp(name, "rate")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (!value) + return len; + pkt_dev->delay = pkt_dev->min_pkt_size*8*NSEC_PER_USEC/value; + if (debug) + pr_info("Delay set at: %llu ns\n", pkt_dev->delay); + + sprintf(pg_result, "OK: rate=%lu", value); + return count; + } + if (!strcmp(name, "ratep")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (!value) + return len; + pkt_dev->delay = NSEC_PER_SEC/value; + if (debug) + pr_info("Delay set at: %llu ns\n", pkt_dev->delay); + + sprintf(pg_result, "OK: rate=%lu", value); + return count; + } + if (!strcmp(name, "udp_src_min")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) 
+ return len; + + i += len; + if (value != pkt_dev->udp_src_min) { + pkt_dev->udp_src_min = value; + pkt_dev->cur_udp_src = value; + } + sprintf(pg_result, "OK: udp_src_min=%u", pkt_dev->udp_src_min); + return count; + } + if (!strcmp(name, "udp_dst_min")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value != pkt_dev->udp_dst_min) { + pkt_dev->udp_dst_min = value; + pkt_dev->cur_udp_dst = value; + } + sprintf(pg_result, "OK: udp_dst_min=%u", pkt_dev->udp_dst_min); + return count; + } + if (!strcmp(name, "udp_src_max")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value != pkt_dev->udp_src_max) { + pkt_dev->udp_src_max = value; + pkt_dev->cur_udp_src = value; + } + sprintf(pg_result, "OK: udp_src_max=%u", pkt_dev->udp_src_max); + return count; + } + if (!strcmp(name, "udp_dst_max")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value != pkt_dev->udp_dst_max) { + pkt_dev->udp_dst_max = value; + pkt_dev->cur_udp_dst = value; + } + sprintf(pg_result, "OK: udp_dst_max=%u", pkt_dev->udp_dst_max); + return count; + } + if (!strcmp(name, "clone_skb")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + if ((value > 0) && + (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) + return -ENOTSUPP; + i += len; + pkt_dev->clone_skb = value; + + sprintf(pg_result, "OK: clone_skb=%d", pkt_dev->clone_skb); + return count; + } + if (!strcmp(name, "count")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->count = value; + sprintf(pg_result, "OK: count=%llu", + (unsigned long long)pkt_dev->count); + return count; + } + if (!strcmp(name, "src_mac_count")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (pkt_dev->src_mac_count != value) { + pkt_dev->src_mac_count = value; + pkt_dev->cur_src_mac_offset = 0; + } + sprintf(pg_result, "OK: src_mac_count=%d", + pkt_dev->src_mac_count); + return count; + } + if (!strcmp(name, "dst_mac_count")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (pkt_dev->dst_mac_count != value) { + pkt_dev->dst_mac_count = value; + pkt_dev->cur_dst_mac_offset = 0; + } + sprintf(pg_result, "OK: dst_mac_count=%d", + pkt_dev->dst_mac_count); + return count; + } + if (!strcmp(name, "node")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + + if (node_possible(value)) { + pkt_dev->node = value; + sprintf(pg_result, "OK: node=%d", pkt_dev->node); + if (pkt_dev->page) { + put_page(pkt_dev->page); + pkt_dev->page = NULL; + } + } + else + sprintf(pg_result, "ERROR: node not possible"); + return count; + } + if (!strcmp(name, "flag")) { + char f[32]; + memset(f, 0, 32); + len = strn_len(&user_buffer[i], sizeof(f) - 1); + if (len < 0) + return len; + + if (copy_from_user(f, &user_buffer[i], len)) + return -EFAULT; + i += len; + if (strcmp(f, "IPSRC_RND") == 0) + pkt_dev->flags |= F_IPSRC_RND; + + else if (strcmp(f, "!IPSRC_RND") == 0) + pkt_dev->flags &= ~F_IPSRC_RND; + + else if (strcmp(f, "TXSIZE_RND") == 0) + pkt_dev->flags |= F_TXSIZE_RND; + + else if (strcmp(f, "!TXSIZE_RND") == 0) + pkt_dev->flags &= ~F_TXSIZE_RND; + + else if (strcmp(f, "IPDST_RND") == 0) + pkt_dev->flags |= F_IPDST_RND; + + else if (strcmp(f, "!IPDST_RND") == 0) + pkt_dev->flags &= ~F_IPDST_RND; + + else if (strcmp(f, "UDPSRC_RND") == 0) + pkt_dev->flags 
|= F_UDPSRC_RND; + + else if (strcmp(f, "!UDPSRC_RND") == 0) + pkt_dev->flags &= ~F_UDPSRC_RND; + + else if (strcmp(f, "UDPDST_RND") == 0) + pkt_dev->flags |= F_UDPDST_RND; + + else if (strcmp(f, "!UDPDST_RND") == 0) + pkt_dev->flags &= ~F_UDPDST_RND; + + else if (strcmp(f, "MACSRC_RND") == 0) + pkt_dev->flags |= F_MACSRC_RND; + + else if (strcmp(f, "!MACSRC_RND") == 0) + pkt_dev->flags &= ~F_MACSRC_RND; + + else if (strcmp(f, "MACDST_RND") == 0) + pkt_dev->flags |= F_MACDST_RND; + + else if (strcmp(f, "!MACDST_RND") == 0) + pkt_dev->flags &= ~F_MACDST_RND; + + else if (strcmp(f, "MPLS_RND") == 0) + pkt_dev->flags |= F_MPLS_RND; + + else if (strcmp(f, "!MPLS_RND") == 0) + pkt_dev->flags &= ~F_MPLS_RND; + + else if (strcmp(f, "VID_RND") == 0) + pkt_dev->flags |= F_VID_RND; + + else if (strcmp(f, "!VID_RND") == 0) + pkt_dev->flags &= ~F_VID_RND; + + else if (strcmp(f, "SVID_RND") == 0) + pkt_dev->flags |= F_SVID_RND; + + else if (strcmp(f, "!SVID_RND") == 0) + pkt_dev->flags &= ~F_SVID_RND; + + else if (strcmp(f, "FLOW_SEQ") == 0) + pkt_dev->flags |= F_FLOW_SEQ; + + else if (strcmp(f, "QUEUE_MAP_RND") == 0) + pkt_dev->flags |= F_QUEUE_MAP_RND; + + else if (strcmp(f, "!QUEUE_MAP_RND") == 0) + pkt_dev->flags &= ~F_QUEUE_MAP_RND; + + else if (strcmp(f, "QUEUE_MAP_CPU") == 0) + pkt_dev->flags |= F_QUEUE_MAP_CPU; + + else if (strcmp(f, "!QUEUE_MAP_CPU") == 0) + pkt_dev->flags &= ~F_QUEUE_MAP_CPU; +#ifdef CONFIG_XFRM + else if (strcmp(f, "IPSEC") == 0) + pkt_dev->flags |= F_IPSEC_ON; +#endif + + else if (strcmp(f, "!IPV6") == 0) + pkt_dev->flags &= ~F_IPV6; + + else if (strcmp(f, "NODE_ALLOC") == 0) + pkt_dev->flags |= F_NODE; + + else if (strcmp(f, "!NODE_ALLOC") == 0) + pkt_dev->flags &= ~F_NODE; + + else { + sprintf(pg_result, + "Flag -:%s:- unknown\nAvailable flags, (prepend ! 
to un-set flag):\n%s", + f, + "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, " + "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, IPSEC, NODE_ALLOC\n"); + return count; + } + sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags); + return count; + } + if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) { + len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_min) - 1); + if (len < 0) + return len; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + if (strcmp(buf, pkt_dev->dst_min) != 0) { + memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min)); + strncpy(pkt_dev->dst_min, buf, len); + pkt_dev->daddr_min = in_aton(pkt_dev->dst_min); + pkt_dev->cur_daddr = pkt_dev->daddr_min; + } + if (debug) + printk(KERN_DEBUG "pktgen: dst_min set to: %s\n", + pkt_dev->dst_min); + i += len; + sprintf(pg_result, "OK: dst_min=%s", pkt_dev->dst_min); + return count; + } + if (!strcmp(name, "dst_max")) { + len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_max) - 1); + if (len < 0) + return len; + + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + + buf[len] = 0; + if (strcmp(buf, pkt_dev->dst_max) != 0) { + memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max)); + strncpy(pkt_dev->dst_max, buf, len); + pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); + pkt_dev->cur_daddr = pkt_dev->daddr_max; + } + if (debug) + printk(KERN_DEBUG "pktgen: dst_max set to: %s\n", + pkt_dev->dst_max); + i += len; + sprintf(pg_result, "OK: dst_max=%s", pkt_dev->dst_max); + return count; + } + if (!strcmp(name, "dst6")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) + return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + scan_ip6(buf, pkt_dev->in6_daddr.s6_addr); + snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_daddr); + + ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->in6_daddr); + + if (debug) + printk(KERN_DEBUG "pktgen: dst6 set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: dst6=%s", buf); + return count; + } + if (!strcmp(name, "dst6_min")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) + return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr); + snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_daddr); + + ipv6_addr_copy(&pkt_dev->cur_in6_daddr, + &pkt_dev->min_in6_daddr); + if (debug) + printk(KERN_DEBUG "pktgen: dst6_min set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: dst6_min=%s", buf); + return count; + } + if (!strcmp(name, "dst6_max")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) + return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + scan_ip6(buf, pkt_dev->max_in6_daddr.s6_addr); + snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->max_in6_daddr); + + if (debug) + printk(KERN_DEBUG "pktgen: dst6_max set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: dst6_max=%s", buf); + return count; + } + if (!strcmp(name, "src6")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) + return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + scan_ip6(buf, pkt_dev->in6_saddr.s6_addr); + snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_saddr); + + 
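+ /* Keep the freshly parsed address as the current source too;
+ * e.g. writing "src6 fec0::1" (address purely illustrative) to the
+ * proc file makes every following packet use that source, since the
+ * IPv6 path only randomizes destinations. */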
ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &pkt_dev->in6_saddr); + + if (debug) + printk(KERN_DEBUG "pktgen: src6 set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: src6=%s", buf); + return count; + } + if (!strcmp(name, "src_min")) { + len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_min) - 1); + if (len < 0) + return len; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + if (strcmp(buf, pkt_dev->src_min) != 0) { + memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min)); + strncpy(pkt_dev->src_min, buf, len); + pkt_dev->saddr_min = in_aton(pkt_dev->src_min); + pkt_dev->cur_saddr = pkt_dev->saddr_min; + } + if (debug) + printk(KERN_DEBUG "pktgen: src_min set to: %s\n", + pkt_dev->src_min); + i += len; + sprintf(pg_result, "OK: src_min=%s", pkt_dev->src_min); + return count; + } + if (!strcmp(name, "src_max")) { + len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_max) - 1); + if (len < 0) + return len; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + if (strcmp(buf, pkt_dev->src_max) != 0) { + memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max)); + strncpy(pkt_dev->src_max, buf, len); + pkt_dev->saddr_max = in_aton(pkt_dev->src_max); + pkt_dev->cur_saddr = pkt_dev->saddr_max; + } + if (debug) + printk(KERN_DEBUG "pktgen: src_max set to: %s\n", + pkt_dev->src_max); + i += len; + sprintf(pg_result, "OK: src_max=%s", pkt_dev->src_max); + return count; + } + if (!strcmp(name, "dst_mac")) { + len = strn_len(&user_buffer[i], sizeof(valstr) - 1); + if (len < 0) + return len; + + memset(valstr, 0, sizeof(valstr)); + if (copy_from_user(valstr, &user_buffer[i], len)) + return -EFAULT; + + if (!mac_pton(valstr, pkt_dev->dst_mac)) + return -EINVAL; + /* Set up Dest MAC */ + memcpy(&pkt_dev->hh[0], pkt_dev->dst_mac, ETH_ALEN); + + sprintf(pg_result, "OK: dstmac %pM", pkt_dev->dst_mac); + return count; + } + if (!strcmp(name, "src_mac")) { + len = strn_len(&user_buffer[i], sizeof(valstr) - 1); + if (len < 0) + return len; + + memset(valstr, 0, sizeof(valstr)); + if (copy_from_user(valstr, &user_buffer[i], len)) + return -EFAULT; + + if (!mac_pton(valstr, pkt_dev->src_mac)) + return -EINVAL; + /* Set up Src MAC */ + memcpy(&pkt_dev->hh[6], pkt_dev->src_mac, ETH_ALEN); + + sprintf(pg_result, "OK: srcmac %pM", pkt_dev->src_mac); + return count; + } + + if (!strcmp(name, "clear_counters")) { + pktgen_clear_counters(pkt_dev); + sprintf(pg_result, "OK: Clearing counters.\n"); + return count; + } + + if (!strcmp(name, "flows")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value > MAX_CFLOWS) + value = MAX_CFLOWS; + + pkt_dev->cflows = value; + sprintf(pg_result, "OK: flows=%u", pkt_dev->cflows); + return count; + } + + if (!strcmp(name, "flowlen")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->lflow = value; + sprintf(pg_result, "OK: flowlen=%u", pkt_dev->lflow); + return count; + } + + if (!strcmp(name, "queue_map_min")) { + len = num_arg(&user_buffer[i], 5, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->queue_map_min = value; + sprintf(pg_result, "OK: queue_map_min=%u", pkt_dev->queue_map_min); + return count; + } + + if (!strcmp(name, "queue_map_max")) { + len = num_arg(&user_buffer[i], 5, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->queue_map_max = value; + sprintf(pg_result, "OK: queue_map_max=%u", pkt_dev->queue_map_max); + return count; + } + + if (!strcmp(name, "mpls")) { + 
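+ /* Parse a comma-separated list of hex labels, e.g.
+ * "mpls 0001000a,0002000a" (label values illustrative), echo them
+ * back, and turn off any VLAN/SVLAN tagging below, as the two
+ * encapsulations are not combined here. */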
unsigned n, cnt; + + len = get_labels(&user_buffer[i], pkt_dev); + if (len < 0) + return len; + i += len; + cnt = sprintf(pg_result, "OK: mpls="); + for (n = 0; n < pkt_dev->nr_labels; n++) + cnt += sprintf(pg_result + cnt, + "%08x%s", ntohl(pkt_dev->labels[n]), + n == pkt_dev->nr_labels-1 ? "" : ","); + + if (pkt_dev->nr_labels && pkt_dev->vlan_id != 0xffff) { + pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */ + pkt_dev->svlan_id = 0xffff; + + if (debug) + printk(KERN_DEBUG "pktgen: VLAN/SVLAN auto turned off\n"); + } + return count; + } + + if (!strcmp(name, "vlan_id")) { + len = num_arg(&user_buffer[i], 4, &value); + if (len < 0) + return len; + + i += len; + if (value <= 4095) { + pkt_dev->vlan_id = value; /* turn on VLAN */ + + if (debug) + printk(KERN_DEBUG "pktgen: VLAN turned on\n"); + + if (debug && pkt_dev->nr_labels) + printk(KERN_DEBUG "pktgen: MPLS auto turned off\n"); + + pkt_dev->nr_labels = 0; /* turn off MPLS */ + sprintf(pg_result, "OK: vlan_id=%u", pkt_dev->vlan_id); + } else { + pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */ + pkt_dev->svlan_id = 0xffff; + + if (debug) + printk(KERN_DEBUG "pktgen: VLAN/SVLAN turned off\n"); + } + return count; + } + + if (!strcmp(name, "vlan_p")) { + len = num_arg(&user_buffer[i], 1, &value); + if (len < 0) + return len; + + i += len; + if ((value <= 7) && (pkt_dev->vlan_id != 0xffff)) { + pkt_dev->vlan_p = value; + sprintf(pg_result, "OK: vlan_p=%u", pkt_dev->vlan_p); + } else { + sprintf(pg_result, "ERROR: vlan_p must be 0-7"); + } + return count; + } + + if (!strcmp(name, "vlan_cfi")) { + len = num_arg(&user_buffer[i], 1, &value); + if (len < 0) + return len; + + i += len; + if ((value <= 1) && (pkt_dev->vlan_id != 0xffff)) { + pkt_dev->vlan_cfi = value; + sprintf(pg_result, "OK: vlan_cfi=%u", pkt_dev->vlan_cfi); + } else { + sprintf(pg_result, "ERROR: vlan_cfi must be 0-1"); + } + return count; + } + + if (!strcmp(name, "svlan_id")) { + len = num_arg(&user_buffer[i], 4, &value); + if (len < 0) + return len; + + i += len; + if ((value <= 4095) && ((pkt_dev->vlan_id != 0xffff))) { + pkt_dev->svlan_id = value; /* turn on SVLAN */ + + if (debug) + printk(KERN_DEBUG "pktgen: SVLAN turned on\n"); + + if (debug && pkt_dev->nr_labels) + printk(KERN_DEBUG "pktgen: MPLS auto turned off\n"); + + pkt_dev->nr_labels = 0; /* turn off MPLS */ + sprintf(pg_result, "OK: svlan_id=%u", pkt_dev->svlan_id); + } else { + pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */ + pkt_dev->svlan_id = 0xffff; + + if (debug) + printk(KERN_DEBUG "pktgen: VLAN/SVLAN turned off\n"); + } + return count; + } + + if (!strcmp(name, "svlan_p")) { + len = num_arg(&user_buffer[i], 1, &value); + if (len < 0) + return len; + + i += len; + if ((value <= 7) && (pkt_dev->svlan_id != 0xffff)) { + pkt_dev->svlan_p = value; + sprintf(pg_result, "OK: svlan_p=%u", pkt_dev->svlan_p); + } else { + sprintf(pg_result, "ERROR: svlan_p must be 0-7"); + } + return count; + } + + if (!strcmp(name, "svlan_cfi")) { + len = num_arg(&user_buffer[i], 1, &value); + if (len < 0) + return len; + + i += len; + if ((value <= 1) && (pkt_dev->svlan_id != 0xffff)) { + pkt_dev->svlan_cfi = value; + sprintf(pg_result, "OK: svlan_cfi=%u", pkt_dev->svlan_cfi); + } else { + sprintf(pg_result, "ERROR: svlan_cfi must be 0-1"); + } + return count; + } + + if (!strcmp(name, "tos")) { + __u32 tmp_value = 0; + len = hex32_arg(&user_buffer[i], 2, &tmp_value); + if (len < 0) + return len; + + i += len; + if (len == 2) { + pkt_dev->tos = tmp_value; + sprintf(pg_result, "OK: tos=0x%02x", pkt_dev->tos); + } 
else { + sprintf(pg_result, "ERROR: tos must be 00-ff"); + } + return count; + } + + if (!strcmp(name, "traffic_class")) { + __u32 tmp_value = 0; + len = hex32_arg(&user_buffer[i], 2, &tmp_value); + if (len < 0) + return len; + + i += len; + if (len == 2) { + pkt_dev->traffic_class = tmp_value; + sprintf(pg_result, "OK: traffic_class=0x%02x", pkt_dev->traffic_class); + } else { + sprintf(pg_result, "ERROR: traffic_class must be 00-ff"); + } + return count; + } + + if (!strcmp(name, "skb_priority")) { + len = num_arg(&user_buffer[i], 9, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->skb_priority = value; + sprintf(pg_result, "OK: skb_priority=%i", + pkt_dev->skb_priority); + return count; + } + + sprintf(pkt_dev->result, "No such parameter \"%s\"", name); + return -EINVAL; +} + +static int pktgen_if_open(struct inode *inode, struct file *file) +{ + return single_open(file, pktgen_if_show, PDE(inode)->data); +} + +static const struct file_operations pktgen_if_fops = { + .owner = THIS_MODULE, + .open = pktgen_if_open, + .read = seq_read, + .llseek = seq_lseek, + .write = pktgen_if_write, + .release = single_release, +}; + +static int pktgen_thread_show(struct seq_file *seq, void *v) +{ + struct pktgen_thread *t = seq->private; + const struct pktgen_dev *pkt_dev; + + BUG_ON(!t); + + seq_printf(seq, "Running: "); + + if_lock(t); + list_for_each_entry(pkt_dev, &t->if_list, list) + if (pkt_dev->running) + seq_printf(seq, "%s ", pkt_dev->odevname); + + seq_printf(seq, "\nStopped: "); + + list_for_each_entry(pkt_dev, &t->if_list, list) + if (!pkt_dev->running) + seq_printf(seq, "%s ", pkt_dev->odevname); + + if (t->result[0]) + seq_printf(seq, "\nResult: %s\n", t->result); + else + seq_printf(seq, "\nResult: NA\n"); + + if_unlock(t); + + return 0; +} + +static ssize_t pktgen_thread_write(struct file *file, + const char __user * user_buffer, + size_t count, loff_t * offset) +{ + struct seq_file *seq = file->private_data; + struct pktgen_thread *t = seq->private; + int i, max, len, ret; + char name[40]; + char *pg_result; + + if (count < 1) { + // sprintf(pg_result, "Wrong command format"); + return -EINVAL; + } + + max = count; + len = count_trail_chars(user_buffer, max); + if (len < 0) + return len; + + i = len; + + /* Read variable name */ + + len = strn_len(&user_buffer[i], sizeof(name) - 1); + if (len < 0) + return len; + + memset(name, 0, sizeof(name)); + if (copy_from_user(name, &user_buffer[i], len)) + return -EFAULT; + i += len; + + max = count - i; + len = count_trail_chars(&user_buffer[i], max); + if (len < 0) + return len; + + i += len; + + if (debug) + printk(KERN_DEBUG "pktgen: t=%s, count=%lu\n", + name, (unsigned long)count); + + if (!t) { + pr_err("ERROR: No thread\n"); + ret = -EINVAL; + goto out; + } + + pg_result = &(t->result[0]); + + if (!strcmp(name, "add_device")) { + char f[32]; + memset(f, 0, 32); + len = strn_len(&user_buffer[i], sizeof(f) - 1); + if (len < 0) { + ret = len; + goto out; + } + if (copy_from_user(f, &user_buffer[i], len)) + return -EFAULT; + i += len; + mutex_lock(&pktgen_thread_lock); + pktgen_add_device(t, f); + mutex_unlock(&pktgen_thread_lock); + ret = count; + sprintf(pg_result, "OK: add_device=%s", f); + goto out; + } + + if (!strcmp(name, "rem_device_all")) { + mutex_lock(&pktgen_thread_lock); + t->control |= T_REMDEVALL; + mutex_unlock(&pktgen_thread_lock); + schedule_timeout_interruptible(msecs_to_jiffies(125)); /* Propagate thread->control */ + ret = count; + sprintf(pg_result, "OK: rem_device_all"); + goto out; + } + + if 
(!strcmp(name, "max_before_softirq")) { + sprintf(pg_result, "OK: Note! max_before_softirq is obsoleted -- Do not use"); + ret = count; + goto out; + } + + ret = -EINVAL; +out: + return ret; +} + +static int pktgen_thread_open(struct inode *inode, struct file *file) +{ + return single_open(file, pktgen_thread_show, PDE(inode)->data); +} + +static const struct file_operations pktgen_thread_fops = { + .owner = THIS_MODULE, + .open = pktgen_thread_open, + .read = seq_read, + .llseek = seq_lseek, + .write = pktgen_thread_write, + .release = single_release, +}; + +/* Think find or remove for NN */ +static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove) +{ + struct pktgen_thread *t; + struct pktgen_dev *pkt_dev = NULL; + bool exact = (remove == FIND); + + list_for_each_entry(t, &pktgen_threads, th_list) { + pkt_dev = pktgen_find_dev(t, ifname, exact); + if (pkt_dev) { + if (remove) { + if_lock(t); + pkt_dev->removal_mark = 1; + t->control |= T_REMDEV; + if_unlock(t); + } + break; + } + } + return pkt_dev; +} + +/* + * mark a device for removal + */ +static void pktgen_mark_device(const char *ifname) +{ + struct pktgen_dev *pkt_dev = NULL; + const int max_tries = 10, msec_per_try = 125; + int i = 0; + + mutex_lock(&pktgen_thread_lock); + pr_debug("%s: marking %s for removal\n", __func__, ifname); + + while (1) { + + pkt_dev = __pktgen_NN_threads(ifname, REMOVE); + if (pkt_dev == NULL) + break; /* success */ + + mutex_unlock(&pktgen_thread_lock); + pr_debug("%s: waiting for %s to disappear....\n", + __func__, ifname); + schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try)); + mutex_lock(&pktgen_thread_lock); + + if (++i >= max_tries) { + pr_err("%s: timed out after waiting %d msec for device %s to be removed\n", + __func__, msec_per_try * i, ifname); + break; + } + + } + + mutex_unlock(&pktgen_thread_lock); +} + +static void pktgen_change_name(struct net_device *dev) +{ + struct pktgen_thread *t; + + list_for_each_entry(t, &pktgen_threads, th_list) { + struct pktgen_dev *pkt_dev; + + list_for_each_entry(pkt_dev, &t->if_list, list) { + if (pkt_dev->odev != dev) + continue; + + remove_proc_entry(pkt_dev->entry->name, pg_proc_dir); + + pkt_dev->entry = proc_create_data(dev->name, 0600, + pg_proc_dir, + &pktgen_if_fops, + pkt_dev); + if (!pkt_dev->entry) + pr_err("can't move proc entry for '%s'\n", + dev->name); + break; + } + } +} + +static int pktgen_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + if (!net_eq(dev_net(dev), &init_net) || pktgen_exiting) + return NOTIFY_DONE; + + /* It is OK that we do not hold the group lock right now, + * as we run under the RTNL lock. + */ + + switch (event) { + case NETDEV_CHANGENAME: + pktgen_change_name(dev); + break; + + case NETDEV_UNREGISTER: + pktgen_mark_device(dev->name); + break; + } + + return NOTIFY_DONE; +} + +static struct net_device *pktgen_dev_get_by_name(struct pktgen_dev *pkt_dev, + const char *ifname) +{ + char b[IFNAMSIZ+5]; + int i; + + for (i = 0; ifname[i] != '@'; i++) { + if (i == IFNAMSIZ) + break; + + b[i] = ifname[i]; + } + b[i] = 0; + + return dev_get_by_name(&init_net, b); +} + + +/* Associate pktgen_dev with a device. 
*/ + +static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, const char *ifname) +{ + struct net_device *odev; + int err; + + /* Clean old setups */ + if (pkt_dev->odev) { + dev_put(pkt_dev->odev); + pkt_dev->odev = NULL; + } + + odev = pktgen_dev_get_by_name(pkt_dev, ifname); + if (!odev) { + pr_err("no such netdevice: \"%s\"\n", ifname); + return -ENODEV; + } + + if (odev->type != ARPHRD_ETHER) { + pr_err("not an ethernet device: \"%s\"\n", ifname); + err = -EINVAL; + } else if (!netif_running(odev)) { + pr_err("device is down: \"%s\"\n", ifname); + err = -ENETDOWN; + } else { + pkt_dev->odev = odev; + return 0; + } + + dev_put(odev); + return err; +} + +/* Read pkt_dev from the interface and set up internal pktgen_dev + * structure to have the right information to create/send packets + */ +static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) +{ + int ntxq; + + if (!pkt_dev->odev) { + pr_err("ERROR: pkt_dev->odev == NULL in setup_inject\n"); + sprintf(pkt_dev->result, + "ERROR: pkt_dev->odev == NULL in setup_inject.\n"); + return; + } + + /* make sure that we don't pick a non-existing transmit queue */ + ntxq = pkt_dev->odev->real_num_tx_queues; + + if (ntxq <= pkt_dev->queue_map_min) { + pr_warning("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", + pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq, + pkt_dev->odevname); + pkt_dev->queue_map_min = ntxq - 1; + } + if (pkt_dev->queue_map_max >= ntxq) { + pr_warning("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", + pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq, + pkt_dev->odevname); + pkt_dev->queue_map_max = ntxq - 1; + } + + /* Default to the interface's mac if not explicitly set. */ + + if (is_zero_ether_addr(pkt_dev->src_mac)) + memcpy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr, ETH_ALEN); + + /* Set up Dest MAC */ + memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN); + + /* Set up pkt size */ + pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size; + + if (pkt_dev->flags & F_IPV6) { + /* + * Skip this automatic address setting until locks or functions + * gets exported + */ + +#ifdef NOTNOW + int i, set = 0, err = 1; + struct inet6_dev *idev; + + for (i = 0; i < IN6_ADDR_HSIZE; i++) + if (pkt_dev->cur_in6_saddr.s6_addr[i]) { + set = 1; + break; + } + + if (!set) { + + /* + * Use linklevel address if unconfigured. 
+ * + * use ipv6_get_lladdr if/when it's get exported + */ + + rcu_read_lock(); + idev = __in6_dev_get(pkt_dev->odev); + if (idev) { + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + for (ifp = idev->addr_list; ifp; + ifp = ifp->if_next) { + if (ifp->scope == IFA_LINK && + !(ifp->flags & IFA_F_TENTATIVE)) { + ipv6_addr_copy(&pkt_dev-> + cur_in6_saddr, + &ifp->addr); + err = 0; + break; + } + } + read_unlock_bh(&idev->lock); + } + rcu_read_unlock(); + if (err) + pr_err("ERROR: IPv6 link address not available\n"); + } +#endif + } else { + pkt_dev->saddr_min = 0; + pkt_dev->saddr_max = 0; + if (strlen(pkt_dev->src_min) == 0) { + + struct in_device *in_dev; + + rcu_read_lock(); + in_dev = __in_dev_get_rcu(pkt_dev->odev); + if (in_dev) { + if (in_dev->ifa_list) { + pkt_dev->saddr_min = + in_dev->ifa_list->ifa_address; + pkt_dev->saddr_max = pkt_dev->saddr_min; + } + } + rcu_read_unlock(); + } else { + pkt_dev->saddr_min = in_aton(pkt_dev->src_min); + pkt_dev->saddr_max = in_aton(pkt_dev->src_max); + } + + pkt_dev->daddr_min = in_aton(pkt_dev->dst_min); + pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); + } + /* Initialize current values. */ + pkt_dev->cur_dst_mac_offset = 0; + pkt_dev->cur_src_mac_offset = 0; + pkt_dev->cur_saddr = pkt_dev->saddr_min; + pkt_dev->cur_daddr = pkt_dev->daddr_min; + pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min; + pkt_dev->cur_udp_src = pkt_dev->udp_src_min; + pkt_dev->nflows = 0; +} + + +static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) +{ + ktime_t start_time, end_time; + s64 remaining; + struct hrtimer_sleeper t; + + hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_set_expires(&t.timer, spin_until); + + remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer)); + if (remaining <= 0) { + pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay); + return; + } + + start_time = ktime_now(); + if (remaining < 100000) + ndelay(remaining); /* really small just spin */ + else { + /* see do_nanosleep */ + hrtimer_init_sleeper(&t, current); + do { + set_current_state(TASK_INTERRUPTIBLE); + hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); + if (!hrtimer_active(&t.timer)) + t.task = NULL; + + if (likely(t.task)) + schedule(); + + hrtimer_cancel(&t.timer); + } while (t.task && pkt_dev->running && !signal_pending(current)); + __set_current_state(TASK_RUNNING); + } + end_time = ktime_now(); + + pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time)); + pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay); +} + +static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) +{ + pkt_dev->pkt_overhead = 0; + pkt_dev->pkt_overhead += pkt_dev->nr_labels*sizeof(u32); + pkt_dev->pkt_overhead += VLAN_TAG_SIZE(pkt_dev); + pkt_dev->pkt_overhead += SVLAN_TAG_SIZE(pkt_dev); +} + +static inline int f_seen(const struct pktgen_dev *pkt_dev, int flow) +{ + return !!(pkt_dev->flows[flow].flags & F_INIT); +} + +static inline int f_pick(struct pktgen_dev *pkt_dev) +{ + int flow = pkt_dev->curfl; + + if (pkt_dev->flags & F_FLOW_SEQ) { + if (pkt_dev->flows[flow].count >= pkt_dev->lflow) { + /* reset time */ + pkt_dev->flows[flow].count = 0; + pkt_dev->flows[flow].flags = 0; + pkt_dev->curfl += 1; + if (pkt_dev->curfl >= pkt_dev->cflows) + pkt_dev->curfl = 0; /*reset */ + } + } else { + flow = random32() % pkt_dev->cflows; + pkt_dev->curfl = flow; + + if (pkt_dev->flows[flow].count > pkt_dev->lflow) { + pkt_dev->flows[flow].count = 0; + pkt_dev->flows[flow].flags = 0; + } + } + + return pkt_dev->curfl; +} + + +#ifdef 
CONFIG_XFRM +/* If there was already an IPSEC SA, we keep it as is, else + * we go look for it ... +*/ +#define DUMMY_MARK 0 +static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) +{ + struct xfrm_state *x = pkt_dev->flows[flow].x; + if (!x) { + /*slow path: we dont already have xfrm_state*/ + x = xfrm_stateonly_find(&init_net, DUMMY_MARK, + (xfrm_address_t *)&pkt_dev->cur_daddr, + (xfrm_address_t *)&pkt_dev->cur_saddr, + AF_INET, + pkt_dev->ipsmode, + pkt_dev->ipsproto, 0); + if (x) { + pkt_dev->flows[flow].x = x; + set_pkt_overhead(pkt_dev); + pkt_dev->pkt_overhead += x->props.header_len; + } + + } +} +#endif +static void set_cur_queue_map(struct pktgen_dev *pkt_dev) +{ + + if (pkt_dev->flags & F_QUEUE_MAP_CPU) + pkt_dev->cur_queue_map = smp_processor_id(); + + else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) { + __u16 t; + if (pkt_dev->flags & F_QUEUE_MAP_RND) { + t = random32() % + (pkt_dev->queue_map_max - + pkt_dev->queue_map_min + 1) + + pkt_dev->queue_map_min; + } else { + t = pkt_dev->cur_queue_map + 1; + if (t > pkt_dev->queue_map_max) + t = pkt_dev->queue_map_min; + } + pkt_dev->cur_queue_map = t; + } + pkt_dev->cur_queue_map = pkt_dev->cur_queue_map % pkt_dev->odev->real_num_tx_queues; +} + +/* Increment/randomize headers according to flags and current values + * for IP src/dest, UDP src/dst port, MAC-Addr src/dst + */ +static void mod_cur_headers(struct pktgen_dev *pkt_dev) +{ + __u32 imn; + __u32 imx; + int flow = 0; + + if (pkt_dev->cflows) + flow = f_pick(pkt_dev); + + /* Deal with source MAC */ + if (pkt_dev->src_mac_count > 1) { + __u32 mc; + __u32 tmp; + + if (pkt_dev->flags & F_MACSRC_RND) + mc = random32() % pkt_dev->src_mac_count; + else { + mc = pkt_dev->cur_src_mac_offset++; + if (pkt_dev->cur_src_mac_offset >= + pkt_dev->src_mac_count) + pkt_dev->cur_src_mac_offset = 0; + } + + tmp = pkt_dev->src_mac[5] + (mc & 0xFF); + pkt_dev->hh[11] = tmp; + tmp = (pkt_dev->src_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[10] = tmp; + tmp = (pkt_dev->src_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[9] = tmp; + tmp = (pkt_dev->src_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[8] = tmp; + tmp = (pkt_dev->src_mac[1] + (tmp >> 8)); + pkt_dev->hh[7] = tmp; + } + + /* Deal with Destination MAC */ + if (pkt_dev->dst_mac_count > 1) { + __u32 mc; + __u32 tmp; + + if (pkt_dev->flags & F_MACDST_RND) + mc = random32() % pkt_dev->dst_mac_count; + + else { + mc = pkt_dev->cur_dst_mac_offset++; + if (pkt_dev->cur_dst_mac_offset >= + pkt_dev->dst_mac_count) { + pkt_dev->cur_dst_mac_offset = 0; + } + } + + tmp = pkt_dev->dst_mac[5] + (mc & 0xFF); + pkt_dev->hh[5] = tmp; + tmp = (pkt_dev->dst_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[4] = tmp; + tmp = (pkt_dev->dst_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[3] = tmp; + tmp = (pkt_dev->dst_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[2] = tmp; + tmp = (pkt_dev->dst_mac[1] + (tmp >> 8)); + pkt_dev->hh[1] = tmp; + } + + if (pkt_dev->flags & F_MPLS_RND) { + unsigned i; + for (i = 0; i < pkt_dev->nr_labels; i++) + if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM) + pkt_dev->labels[i] = MPLS_STACK_BOTTOM | + ((__force __be32)random32() & + htonl(0x000fffff)); + } + + if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) { + pkt_dev->vlan_id = random32() & (4096-1); + } + + if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) { + pkt_dev->svlan_id = random32() & (4096 - 1); + } + + if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) { + 
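+ /* Step the UDP source port: a random value in [min, max) when
+ * UDPSRC_RND is set, otherwise increment and wrap back to min. */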
if (pkt_dev->flags & F_UDPSRC_RND) + pkt_dev->cur_udp_src = random32() % + (pkt_dev->udp_src_max - pkt_dev->udp_src_min) + + pkt_dev->udp_src_min; + + else { + pkt_dev->cur_udp_src++; + if (pkt_dev->cur_udp_src >= pkt_dev->udp_src_max) + pkt_dev->cur_udp_src = pkt_dev->udp_src_min; + } + } + + if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) { + if (pkt_dev->flags & F_UDPDST_RND) { + pkt_dev->cur_udp_dst = random32() % + (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min) + + pkt_dev->udp_dst_min; + } else { + pkt_dev->cur_udp_dst++; + if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max) + pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min; + } + } + + if (!(pkt_dev->flags & F_IPV6)) { + + imn = ntohl(pkt_dev->saddr_min); + imx = ntohl(pkt_dev->saddr_max); + if (imn < imx) { + __u32 t; + if (pkt_dev->flags & F_IPSRC_RND) + t = random32() % (imx - imn) + imn; + else { + t = ntohl(pkt_dev->cur_saddr); + t++; + if (t > imx) + t = imn; + + } + pkt_dev->cur_saddr = htonl(t); + } + + if (pkt_dev->cflows && f_seen(pkt_dev, flow)) { + pkt_dev->cur_daddr = pkt_dev->flows[flow].cur_daddr; + } else { + imn = ntohl(pkt_dev->daddr_min); + imx = ntohl(pkt_dev->daddr_max); + if (imn < imx) { + __u32 t; + __be32 s; + if (pkt_dev->flags & F_IPDST_RND) { + + t = random32() % (imx - imn) + imn; + s = htonl(t); + + while (ipv4_is_loopback(s) || + ipv4_is_multicast(s) || + ipv4_is_lbcast(s) || + ipv4_is_zeronet(s) || + ipv4_is_local_multicast(s)) { + t = random32() % (imx - imn) + imn; + s = htonl(t); + } + pkt_dev->cur_daddr = s; + } else { + t = ntohl(pkt_dev->cur_daddr); + t++; + if (t > imx) { + t = imn; + } + pkt_dev->cur_daddr = htonl(t); + } + } + if (pkt_dev->cflows) { + pkt_dev->flows[flow].flags |= F_INIT; + pkt_dev->flows[flow].cur_daddr = + pkt_dev->cur_daddr; +#ifdef CONFIG_XFRM + if (pkt_dev->flags & F_IPSEC_ON) + get_ipsec_sa(pkt_dev, flow); +#endif + pkt_dev->nflows++; + } + } + } else { /* IPV6 * */ + + if (pkt_dev->min_in6_daddr.s6_addr32[0] == 0 && + pkt_dev->min_in6_daddr.s6_addr32[1] == 0 && + pkt_dev->min_in6_daddr.s6_addr32[2] == 0 && + pkt_dev->min_in6_daddr.s6_addr32[3] == 0) ; + else { + int i; + + /* Only random destinations yet */ + + for (i = 0; i < 4; i++) { + pkt_dev->cur_in6_daddr.s6_addr32[i] = + (((__force __be32)random32() | + pkt_dev->min_in6_daddr.s6_addr32[i]) & + pkt_dev->max_in6_daddr.s6_addr32[i]); + } + } + } + + if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) { + __u32 t; + if (pkt_dev->flags & F_TXSIZE_RND) { + t = random32() % + (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size) + + pkt_dev->min_pkt_size; + } else { + t = pkt_dev->cur_pkt_size + 1; + if (t > pkt_dev->max_pkt_size) + t = pkt_dev->min_pkt_size; + } + pkt_dev->cur_pkt_size = t; + } + + set_cur_queue_map(pkt_dev); + + pkt_dev->flows[flow].count++; +} + + +#ifdef CONFIG_XFRM +static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) +{ + struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; + int err = 0; + + if (!x) + return 0; + /* XXX: we dont support tunnel mode for now until + * we resolve the dst issue */ + if (x->props.mode != XFRM_MODE_TRANSPORT) + return 0; + + spin_lock(&x->lock); + + err = x->outer_mode->output(x, skb); + if (err) + goto error; + err = x->type->output(x, skb); + if (err) + goto error; + + x->curlft.bytes += skb->len; + x->curlft.packets++; +error: + spin_unlock(&x->lock); + return err; +} + +static void free_SAs(struct pktgen_dev *pkt_dev) +{ + if (pkt_dev->cflows) { + /* let go of the SAs if we have them */ + int i; + for (i = 0; i < pkt_dev->cflows; i++) { + struct 
xfrm_state *x = pkt_dev->flows[i].x; + if (x) { + xfrm_state_put(x); + pkt_dev->flows[i].x = NULL; + } + } + } +} + +static int process_ipsec(struct pktgen_dev *pkt_dev, + struct sk_buff *skb, __be16 protocol) +{ + if (pkt_dev->flags & F_IPSEC_ON) { + struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; + int nhead = 0; + if (x) { + int ret; + __u8 *eth; + nhead = x->props.header_len - skb_headroom(skb); + if (nhead > 0) { + ret = pskb_expand_head(skb, nhead, 0, GFP_ATOMIC); + if (ret < 0) { + pr_err("Error expanding ipsec packet %d\n", + ret); + goto err; + } + } + + /* ipsec is not expecting ll header */ + skb_pull(skb, ETH_HLEN); + ret = pktgen_output_ipsec(skb, pkt_dev); + if (ret) { + pr_err("Error creating ipsec packet %d\n", ret); + goto err; + } + /* restore ll */ + eth = (__u8 *) skb_push(skb, ETH_HLEN); + memcpy(eth, pkt_dev->hh, 12); + *(u16 *) ð[12] = protocol; + } + } + return 1; +err: + kfree_skb(skb); + return 0; +} +#endif + +static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev) +{ + unsigned i; + for (i = 0; i < pkt_dev->nr_labels; i++) + *mpls++ = pkt_dev->labels[i] & ~MPLS_STACK_BOTTOM; + + mpls--; + *mpls |= MPLS_STACK_BOTTOM; +} + +static inline __be16 build_tci(unsigned int id, unsigned int cfi, + unsigned int prio) +{ + return htons(id | (cfi << 12) | (prio << 13)); +} + +static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, + int datalen) +{ + struct timeval timestamp; + struct pktgen_hdr *pgh; + + pgh = (struct pktgen_hdr *)skb_put(skb, sizeof(*pgh)); + datalen -= sizeof(*pgh); + + if (pkt_dev->nfrags <= 0) { + memset(skb_put(skb, datalen), 0, datalen); + } else { + int frags = pkt_dev->nfrags; + int i, len; + int frag_len; + + + if (frags > MAX_SKB_FRAGS) + frags = MAX_SKB_FRAGS; + len = datalen - frags * PAGE_SIZE; + if (len > 0) { + memset(skb_put(skb, len), 0, len); + datalen = frags * PAGE_SIZE; + } + + i = 0; + frag_len = (datalen/frags) < PAGE_SIZE ? + (datalen/frags) : PAGE_SIZE; + while (datalen > 0) { + if (unlikely(!pkt_dev->page)) { + int node = numa_node_id(); + + if (pkt_dev->node >= 0 && (pkt_dev->flags & F_NODE)) + node = pkt_dev->node; + pkt_dev->page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!pkt_dev->page) + break; + } + skb_shinfo(skb)->frags[i].page = pkt_dev->page; + get_page(pkt_dev->page); + skb_shinfo(skb)->frags[i].page_offset = 0; + /*last fragment, fill rest of data*/ + if (i == (frags - 1)) + skb_shinfo(skb)->frags[i].size = + (datalen < PAGE_SIZE ? 
datalen : PAGE_SIZE); + else + skb_shinfo(skb)->frags[i].size = frag_len; + datalen -= skb_shinfo(skb)->frags[i].size; + skb->len += skb_shinfo(skb)->frags[i].size; + skb->data_len += skb_shinfo(skb)->frags[i].size; + i++; + skb_shinfo(skb)->nr_frags = i; + } + } + + /* Stamp the time, and sequence number, + * convert them to network byte order + */ + pgh->pgh_magic = htonl(PKTGEN_MAGIC); + pgh->seq_num = htonl(pkt_dev->seq_num); + + do_gettimeofday(×tamp); + pgh->tv_sec = htonl(timestamp.tv_sec); + pgh->tv_usec = htonl(timestamp.tv_usec); +} + +static struct sk_buff *fill_packet_ipv4(struct net_device *odev, + struct pktgen_dev *pkt_dev) +{ + struct sk_buff *skb = NULL; + __u8 *eth; + struct udphdr *udph; + int datalen, iplen; + struct iphdr *iph; + __be16 protocol = htons(ETH_P_IP); + __be32 *mpls; + __be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */ + __be16 *vlan_encapsulated_proto = NULL; /* packet type ID field (or len) for VLAN tag */ + __be16 *svlan_tci = NULL; /* Encapsulates priority and SVLAN ID */ + __be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */ + u16 queue_map; + + if (pkt_dev->nr_labels) + protocol = htons(ETH_P_MPLS_UC); + + if (pkt_dev->vlan_id != 0xffff) + protocol = htons(ETH_P_8021Q); + + /* Update any of the values, used when we're incrementing various + * fields. + */ + mod_cur_headers(pkt_dev); + queue_map = pkt_dev->cur_queue_map; + + datalen = (odev->hard_header_len + 16) & ~0xf; + + if (pkt_dev->flags & F_NODE) { + int node; + + if (pkt_dev->node >= 0) + node = pkt_dev->node; + else + node = numa_node_id(); + + skb = __alloc_skb(NET_SKB_PAD + pkt_dev->cur_pkt_size + 64 + + datalen + pkt_dev->pkt_overhead, GFP_NOWAIT, 0, node); + if (likely(skb)) { + skb_reserve(skb, NET_SKB_PAD); + skb->dev = odev; + } + } + else + skb = __netdev_alloc_skb(odev, + pkt_dev->cur_pkt_size + 64 + + datalen + pkt_dev->pkt_overhead, GFP_NOWAIT); + + if (!skb) { + sprintf(pkt_dev->result, "No memory"); + return NULL; + } + prefetchw(skb->data); + + skb_reserve(skb, datalen); + + /* Reserve for ethernet and IP header */ + eth = (__u8 *) skb_push(skb, 14); + mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32)); + if (pkt_dev->nr_labels) + mpls_push(mpls, pkt_dev); + + if (pkt_dev->vlan_id != 0xffff) { + if (pkt_dev->svlan_id != 0xffff) { + svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16)); + *svlan_tci = build_tci(pkt_dev->svlan_id, + pkt_dev->svlan_cfi, + pkt_dev->svlan_p); + svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); + *svlan_encapsulated_proto = htons(ETH_P_8021Q); + } + vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16)); + *vlan_tci = build_tci(pkt_dev->vlan_id, + pkt_dev->vlan_cfi, + pkt_dev->vlan_p); + vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); + *vlan_encapsulated_proto = htons(ETH_P_IP); + } + + skb->network_header = skb->tail; + skb->transport_header = skb->network_header + sizeof(struct iphdr); + skb_put(skb, sizeof(struct iphdr) + sizeof(struct udphdr)); + skb_set_queue_mapping(skb, queue_map); + skb->priority = pkt_dev->skb_priority; + + iph = ip_hdr(skb); + udph = udp_hdr(skb); + + memcpy(eth, pkt_dev->hh, 12); + *(__be16 *) & eth[12] = protocol; + + /* Eth + IPh + UDPh + mpls */ + datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 - + pkt_dev->pkt_overhead; + if (datalen < sizeof(struct pktgen_hdr)) + datalen = sizeof(struct pktgen_hdr); + + udph->source = htons(pkt_dev->cur_udp_src); + udph->dest = htons(pkt_dev->cur_udp_dst); + udph->len = htons(datalen + 8); /* 
DATA + udphdr */ + udph->check = 0; /* No checksum */ + + iph->ihl = 5; + iph->version = 4; + iph->ttl = 32; + iph->tos = pkt_dev->tos; + iph->protocol = IPPROTO_UDP; /* UDP */ + iph->saddr = pkt_dev->cur_saddr; + iph->daddr = pkt_dev->cur_daddr; + iph->id = htons(pkt_dev->ip_id); + pkt_dev->ip_id++; + iph->frag_off = 0; + iplen = 20 + 8 + datalen; + iph->tot_len = htons(iplen); + iph->check = 0; + iph->check = ip_fast_csum((void *)iph, iph->ihl); + skb->protocol = protocol; + skb->mac_header = (skb->network_header - ETH_HLEN - + pkt_dev->pkt_overhead); + skb->dev = odev; + skb->pkt_type = PACKET_HOST; + pktgen_finalize_skb(pkt_dev, skb, datalen); + +#ifdef CONFIG_XFRM + if (!process_ipsec(pkt_dev, skb, protocol)) + return NULL; +#endif + + return skb; +} + +/* + * scan_ip6, fmt_ip taken from dietlibc-0.21 + * Author Felix von Leitner + * + * Slightly modified for kernel. + * Should be candidate for net/ipv4/utils.c + * --ro + */ + +static unsigned int scan_ip6(const char *s, char ip[16]) +{ + unsigned int i; + unsigned int len = 0; + unsigned long u; + char suffix[16]; + unsigned int prefixlen = 0; + unsigned int suffixlen = 0; + __be32 tmp; + char *pos; + + for (i = 0; i < 16; i++) + ip[i] = 0; + + for (;;) { + if (*s == ':') { + len++; + if (s[1] == ':') { /* Found "::", skip to part 2 */ + s += 2; + len++; + break; + } + s++; + } + + u = simple_strtoul(s, &pos, 16); + i = pos - s; + if (!i) + return 0; + if (prefixlen == 12 && s[i] == '.') { + + /* the last 4 bytes may be written as IPv4 address */ + + tmp = in_aton(s); + memcpy((struct in_addr *)(ip + 12), &tmp, sizeof(tmp)); + return i + len; + } + ip[prefixlen++] = (u >> 8); + ip[prefixlen++] = (u & 255); + s += i; + len += i; + if (prefixlen == 16) + return len; + } + +/* part 2, after "::" */ + for (;;) { + if (*s == ':') { + if (suffixlen == 0) + break; + s++; + len++; + } else if (suffixlen != 0) + break; + + u = simple_strtol(s, &pos, 16); + i = pos - s; + if (!i) { + if (*s) + len--; + break; + } + if (suffixlen + prefixlen <= 12 && s[i] == '.') { + tmp = in_aton(s); + memcpy((struct in_addr *)(suffix + suffixlen), &tmp, + sizeof(tmp)); + suffixlen += 4; + len += strlen(s); + break; + } + suffix[suffixlen++] = (u >> 8); + suffix[suffixlen++] = (u & 255); + s += i; + len += i; + if (prefixlen + suffixlen == 16) + break; + } + for (i = 0; i < suffixlen; i++) + ip[16 - suffixlen + i] = suffix[i]; + return len; +} + +static struct sk_buff *fill_packet_ipv6(struct net_device *odev, + struct pktgen_dev *pkt_dev) +{ + struct sk_buff *skb = NULL; + __u8 *eth; + struct udphdr *udph; + int datalen; + struct ipv6hdr *iph; + __be16 protocol = htons(ETH_P_IPV6); + __be32 *mpls; + __be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */ + __be16 *vlan_encapsulated_proto = NULL; /* packet type ID field (or len) for VLAN tag */ + __be16 *svlan_tci = NULL; /* Encapsulates priority and SVLAN ID */ + __be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */ + u16 queue_map; + + if (pkt_dev->nr_labels) + protocol = htons(ETH_P_MPLS_UC); + + if (pkt_dev->vlan_id != 0xffff) + protocol = htons(ETH_P_8021Q); + + /* Update any of the values, used when we're incrementing various + * fields. 
+ */ + mod_cur_headers(pkt_dev); + queue_map = pkt_dev->cur_queue_map; + + skb = __netdev_alloc_skb(odev, + pkt_dev->cur_pkt_size + 64 + + 16 + pkt_dev->pkt_overhead, GFP_NOWAIT); + if (!skb) { + sprintf(pkt_dev->result, "No memory"); + return NULL; + } + prefetchw(skb->data); + + skb_reserve(skb, 16); + + /* Reserve for ethernet and IP header */ + eth = (__u8 *) skb_push(skb, 14); + mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32)); + if (pkt_dev->nr_labels) + mpls_push(mpls, pkt_dev); + + if (pkt_dev->vlan_id != 0xffff) { + if (pkt_dev->svlan_id != 0xffff) { + svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16)); + *svlan_tci = build_tci(pkt_dev->svlan_id, + pkt_dev->svlan_cfi, + pkt_dev->svlan_p); + svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); + *svlan_encapsulated_proto = htons(ETH_P_8021Q); + } + vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16)); + *vlan_tci = build_tci(pkt_dev->vlan_id, + pkt_dev->vlan_cfi, + pkt_dev->vlan_p); + vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); + *vlan_encapsulated_proto = htons(ETH_P_IPV6); + } + + skb->network_header = skb->tail; + skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); + skb_put(skb, sizeof(struct ipv6hdr) + sizeof(struct udphdr)); + skb_set_queue_mapping(skb, queue_map); + skb->priority = pkt_dev->skb_priority; + iph = ipv6_hdr(skb); + udph = udp_hdr(skb); + + memcpy(eth, pkt_dev->hh, 12); + *(__be16 *) &eth[12] = protocol; + + /* Eth + IPh + UDPh + mpls */ + datalen = pkt_dev->cur_pkt_size - 14 - + sizeof(struct ipv6hdr) - sizeof(struct udphdr) - + pkt_dev->pkt_overhead; + + if (datalen < sizeof(struct pktgen_hdr)) { + datalen = sizeof(struct pktgen_hdr); + if (net_ratelimit()) + pr_info("increased datalen to %d\n", datalen); + } + + udph->source = htons(pkt_dev->cur_udp_src); + udph->dest = htons(pkt_dev->cur_udp_dst); + udph->len = htons(datalen + sizeof(struct udphdr)); + udph->check = 0; /* No checksum */ + + *(__be32 *) iph = htonl(0x60000000); /* Version + flow */ + + if (pkt_dev->traffic_class) { + /* Version + traffic class + flow (0) */ + *(__be32 *)iph |= htonl(0x60000000 | (pkt_dev->traffic_class << 20)); + } + + iph->hop_limit = 32; + + iph->payload_len = htons(sizeof(struct udphdr) + datalen); + iph->nexthdr = IPPROTO_UDP; + + ipv6_addr_copy(&iph->daddr, &pkt_dev->cur_in6_daddr); + ipv6_addr_copy(&iph->saddr, &pkt_dev->cur_in6_saddr); + + skb->mac_header = (skb->network_header - ETH_HLEN - + pkt_dev->pkt_overhead); + skb->protocol = protocol; + skb->dev = odev; + skb->pkt_type = PACKET_HOST; + + pktgen_finalize_skb(pkt_dev, skb, datalen); + + return skb; +} + +static struct sk_buff *fill_packet(struct net_device *odev, + struct pktgen_dev *pkt_dev) +{ + if (pkt_dev->flags & F_IPV6) + return fill_packet_ipv6(odev, pkt_dev); + else + return fill_packet_ipv4(odev, pkt_dev); +} + +static void pktgen_clear_counters(struct pktgen_dev *pkt_dev) +{ + pkt_dev->seq_num = 1; + pkt_dev->idle_acc = 0; + pkt_dev->sofar = 0; + pkt_dev->tx_bytes = 0; + pkt_dev->errors = 0; +} + +/* Set up structure for sending pkts, clear counters */ + +static void pktgen_run(struct pktgen_thread *t) +{ + struct pktgen_dev *pkt_dev; + int started = 0; + + func_enter(); + + if_lock(t); + list_for_each_entry(pkt_dev, &t->if_list, list) { + + /* + * setup odev and create initial packet. + */ + pktgen_setup_inject(pkt_dev); + + if (pkt_dev->odev) { + pktgen_clear_counters(pkt_dev); + pkt_dev->running = 1; /* Cranke yeself! 
*/ + pkt_dev->skb = NULL; + pkt_dev->started_at = + pkt_dev->next_tx = ktime_now(); + + set_pkt_overhead(pkt_dev); + + strcpy(pkt_dev->result, "Starting"); + started++; + } else + strcpy(pkt_dev->result, "Error starting"); + } + if_unlock(t); + if (started) + t->control &= ~(T_STOP); +} + +static void pktgen_stop_all_threads_ifs(void) +{ + struct pktgen_thread *t; + + func_enter(); + + mutex_lock(&pktgen_thread_lock); + + list_for_each_entry(t, &pktgen_threads, th_list) + t->control |= T_STOP; + + mutex_unlock(&pktgen_thread_lock); +} + +static int thread_is_running(const struct pktgen_thread *t) +{ + const struct pktgen_dev *pkt_dev; + + list_for_each_entry(pkt_dev, &t->if_list, list) + if (pkt_dev->running) + return 1; + return 0; +} + +static int pktgen_wait_thread_run(struct pktgen_thread *t) +{ + if_lock(t); + + while (thread_is_running(t)) { + + if_unlock(t); + + msleep_interruptible(100); + + if (signal_pending(current)) + goto signal; + if_lock(t); + } + if_unlock(t); + return 1; +signal: + return 0; +} + +static int pktgen_wait_all_threads_run(void) +{ + struct pktgen_thread *t; + int sig = 1; + + mutex_lock(&pktgen_thread_lock); + + list_for_each_entry(t, &pktgen_threads, th_list) { + sig = pktgen_wait_thread_run(t); + if (sig == 0) + break; + } + + if (sig == 0) + list_for_each_entry(t, &pktgen_threads, th_list) + t->control |= (T_STOP); + + mutex_unlock(&pktgen_thread_lock); + return sig; +} + +static void pktgen_run_all_threads(void) +{ + struct pktgen_thread *t; + + func_enter(); + + mutex_lock(&pktgen_thread_lock); + + list_for_each_entry(t, &pktgen_threads, th_list) + t->control |= (T_RUN); + + mutex_unlock(&pktgen_thread_lock); + + /* Propagate thread->control */ + schedule_timeout_interruptible(msecs_to_jiffies(125)); + + pktgen_wait_all_threads_run(); +} + +static void pktgen_reset_all_threads(void) +{ + struct pktgen_thread *t; + + func_enter(); + + mutex_lock(&pktgen_thread_lock); + + list_for_each_entry(t, &pktgen_threads, th_list) + t->control |= (T_REMDEVALL); + + mutex_unlock(&pktgen_thread_lock); + + /* Propagate thread->control */ + schedule_timeout_interruptible(msecs_to_jiffies(125)); + + pktgen_wait_all_threads_run(); +} + +static void show_results(struct pktgen_dev *pkt_dev, int nr_frags) +{ + __u64 bps, mbps, pps; + char *p = pkt_dev->result; + ktime_t elapsed = ktime_sub(pkt_dev->stopped_at, + pkt_dev->started_at); + ktime_t idle = ns_to_ktime(pkt_dev->idle_acc); + + p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n", + (unsigned long long)ktime_to_us(elapsed), + (unsigned long long)ktime_to_us(ktime_sub(elapsed, idle)), + (unsigned long long)ktime_to_us(idle), + (unsigned long long)pkt_dev->sofar, + pkt_dev->cur_pkt_size, nr_frags); + + pps = div64_u64(pkt_dev->sofar * NSEC_PER_SEC, + ktime_to_ns(elapsed)); + + bps = pps * 8 * pkt_dev->cur_pkt_size; + + mbps = bps; + do_div(mbps, 1000000); + p += sprintf(p, " %llupps %lluMb/sec (%llubps) errors: %llu", + (unsigned long long)pps, + (unsigned long long)mbps, + (unsigned long long)bps, + (unsigned long long)pkt_dev->errors); +} + +/* Set stopped-at timer, remove from running list, do counters & statistics */ +static int pktgen_stop_device(struct pktgen_dev *pkt_dev) +{ + int nr_frags = pkt_dev->skb ? 
skb_shinfo(pkt_dev->skb)->nr_frags : -1; + + if (!pkt_dev->running) { + pr_warning("interface: %s is already stopped\n", + pkt_dev->odevname); + return -EINVAL; + } + + kfree_skb(pkt_dev->skb); + pkt_dev->skb = NULL; + pkt_dev->stopped_at = ktime_now(); + pkt_dev->running = 0; + + show_results(pkt_dev, nr_frags); + + return 0; +} + +static struct pktgen_dev *next_to_run(struct pktgen_thread *t) +{ + struct pktgen_dev *pkt_dev, *best = NULL; + + if_lock(t); + + list_for_each_entry(pkt_dev, &t->if_list, list) { + if (!pkt_dev->running) + continue; + if (best == NULL) + best = pkt_dev; + else if (ktime_lt(pkt_dev->next_tx, best->next_tx)) + best = pkt_dev; + } + if_unlock(t); + return best; +} + +static void pktgen_stop(struct pktgen_thread *t) +{ + struct pktgen_dev *pkt_dev; + + func_enter(); + + if_lock(t); + + list_for_each_entry(pkt_dev, &t->if_list, list) { + pktgen_stop_device(pkt_dev); + } + + if_unlock(t); +} + +/* + * one of our devices needs to be removed - find it + * and remove it + */ +static void pktgen_rem_one_if(struct pktgen_thread *t) +{ + struct list_head *q, *n; + struct pktgen_dev *cur; + + func_enter(); + + if_lock(t); + + list_for_each_safe(q, n, &t->if_list) { + cur = list_entry(q, struct pktgen_dev, list); + + if (!cur->removal_mark) + continue; + + kfree_skb(cur->skb); + cur->skb = NULL; + + pktgen_remove_device(t, cur); + + break; + } + + if_unlock(t); +} + +static void pktgen_rem_all_ifs(struct pktgen_thread *t) +{ + struct list_head *q, *n; + struct pktgen_dev *cur; + + func_enter(); + + /* Remove all devices, free mem */ + + if_lock(t); + + list_for_each_safe(q, n, &t->if_list) { + cur = list_entry(q, struct pktgen_dev, list); + + kfree_skb(cur->skb); + cur->skb = NULL; + + pktgen_remove_device(t, cur); + } + + if_unlock(t); +} + +static void pktgen_rem_thread(struct pktgen_thread *t) +{ + /* Remove from the thread list */ + + remove_proc_entry(t->tsk->comm, pg_proc_dir); + +} + +static void pktgen_resched(struct pktgen_dev *pkt_dev) +{ + ktime_t idle_start = ktime_now(); + schedule(); + pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), idle_start)); +} + +static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev) +{ + ktime_t idle_start = ktime_now(); + + while (atomic_read(&(pkt_dev->skb->users)) != 1) { + if (signal_pending(current)) + break; + + if (need_resched()) + pktgen_resched(pkt_dev); + else + cpu_relax(); + } + pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), idle_start)); +} + +static void pktgen_xmit(struct pktgen_dev *pkt_dev) +{ + struct net_device *odev = pkt_dev->odev; + netdev_tx_t (*xmit)(struct sk_buff *, struct net_device *) + = odev->netdev_ops->ndo_start_xmit; + struct netdev_queue *txq; + u16 queue_map; + int ret; + + /* If device is offline, then don't send */ + if (unlikely(!netif_running(odev) || !netif_carrier_ok(odev))) { + pktgen_stop_device(pkt_dev); + return; + } + + /* This is max DELAY, this has special meaning of + * "never transmit" + */ + if (unlikely(pkt_dev->delay == ULLONG_MAX)) { + pkt_dev->next_tx = ktime_add_ns(ktime_now(), ULONG_MAX); + return; + } + + /* If no skb or clone count exhausted then get new one */ + if (!pkt_dev->skb || (pkt_dev->last_ok && + ++pkt_dev->clone_count >= pkt_dev->clone_skb)) { + /* build a new pkt */ + kfree_skb(pkt_dev->skb); + + pkt_dev->skb = fill_packet(odev, pkt_dev); + if (pkt_dev->skb == NULL) { + pr_err("ERROR: couldn't allocate skb in fill_packet\n"); + schedule(); + pkt_dev->clone_count--; /* back out increment, OOM */ + return; + } + pkt_dev->last_pkt_size = 
pkt_dev->skb->len; + pkt_dev->allocated_skbs++; + pkt_dev->clone_count = 0; /* reset counter */ + } + + if (pkt_dev->delay && pkt_dev->last_ok) + spin(pkt_dev, pkt_dev->next_tx); + + queue_map = skb_get_queue_mapping(pkt_dev->skb); + txq = netdev_get_tx_queue(odev, queue_map); + + __netif_tx_lock_bh(txq); + + if (unlikely(netif_tx_queue_frozen_or_stopped(txq))) { + ret = NETDEV_TX_BUSY; + pkt_dev->last_ok = 0; + goto unlock; + } + atomic_inc(&(pkt_dev->skb->users)); + ret = (*xmit)(pkt_dev->skb, odev); + + switch (ret) { + case NETDEV_TX_OK: + txq_trans_update(txq); + pkt_dev->last_ok = 1; + pkt_dev->sofar++; + pkt_dev->seq_num++; + pkt_dev->tx_bytes += pkt_dev->last_pkt_size; + break; + case NET_XMIT_DROP: + case NET_XMIT_CN: + case NET_XMIT_POLICED: + /* skb has been consumed */ + pkt_dev->errors++; + break; + default: /* Drivers are not supposed to return other values! */ + if (net_ratelimit()) + pr_info("%s xmit error: %d\n", pkt_dev->odevname, ret); + pkt_dev->errors++; + /* fallthru */ + case NETDEV_TX_LOCKED: + case NETDEV_TX_BUSY: + /* Retry it next time */ + atomic_dec(&(pkt_dev->skb->users)); + pkt_dev->last_ok = 0; + } +unlock: + __netif_tx_unlock_bh(txq); + + /* If pkt_dev->count is zero, then run forever */ + if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) { + pktgen_wait_for_skb(pkt_dev); + + /* Done with this */ + pktgen_stop_device(pkt_dev); + } +} + +/* + * Main loop of the thread goes here + */ + +static int pktgen_thread_worker(void *arg) +{ + DEFINE_WAIT(wait); + struct pktgen_thread *t = arg; + struct pktgen_dev *pkt_dev = NULL; + int cpu = t->cpu; + + BUG_ON(smp_processor_id() != cpu); + + init_waitqueue_head(&t->queue); + complete(&t->start_done); + + pr_debug("starting pktgen/%d: pid=%d\n", cpu, task_pid_nr(current)); + + set_current_state(TASK_INTERRUPTIBLE); + + set_freezable(); + + while (!kthread_should_stop()) { + pkt_dev = next_to_run(t); + + if (unlikely(!pkt_dev && t->control == 0)) { + if (pktgen_exiting) + break; + wait_event_interruptible_timeout(t->queue, + t->control != 0, + HZ/10); + try_to_freeze(); + continue; + } + + __set_current_state(TASK_RUNNING); + + if (likely(pkt_dev)) { + pktgen_xmit(pkt_dev); + + if (need_resched()) + pktgen_resched(pkt_dev); + else + cpu_relax(); + } + + if (t->control & T_STOP) { + pktgen_stop(t); + t->control &= ~(T_STOP); + } + + if (t->control & T_RUN) { + pktgen_run(t); + t->control &= ~(T_RUN); + } + + if (t->control & T_REMDEVALL) { + pktgen_rem_all_ifs(t); + t->control &= ~(T_REMDEVALL); + } + + if (t->control & T_REMDEV) { + pktgen_rem_one_if(t); + t->control &= ~(T_REMDEV); + } + + try_to_freeze(); + + set_current_state(TASK_INTERRUPTIBLE); + } + + pr_debug("%s stopping all device\n", t->tsk->comm); + pktgen_stop(t); + + pr_debug("%s removing all device\n", t->tsk->comm); + pktgen_rem_all_ifs(t); + + pr_debug("%s removing thread\n", t->tsk->comm); + pktgen_rem_thread(t); + + /* Wait for kthread_stop */ + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + +static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, + const char *ifname, bool exact) +{ + struct pktgen_dev *p, *pkt_dev = NULL; + size_t len = strlen(ifname); + + if_lock(t); + list_for_each_entry(p, &t->if_list, list) + if (strncmp(p->odevname, ifname, len) == 0) { + if (p->odevname[len]) { + if (exact || p->odevname[len] != '@') + continue; + } + pkt_dev = p; + break; + } + + if_unlock(t); + pr_debug("find_dev(%s) returning %p\n", 
ifname, pkt_dev); + return pkt_dev; +} + +/* + * Adds a dev at front of if_list. + */ + +static int add_dev_to_thread(struct pktgen_thread *t, + struct pktgen_dev *pkt_dev) +{ + int rv = 0; + + if_lock(t); + + if (pkt_dev->pg_thread) { + pr_err("ERROR: already assigned to a thread\n"); + rv = -EBUSY; + goto out; + } + + list_add(&pkt_dev->list, &t->if_list); + pkt_dev->pg_thread = t; + pkt_dev->running = 0; + +out: + if_unlock(t); + return rv; +} + +/* Called under thread lock */ + +static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) +{ + struct pktgen_dev *pkt_dev; + int err; + int node = cpu_to_node(t->cpu); + + /* We don't allow a device to be on several threads */ + + pkt_dev = __pktgen_NN_threads(ifname, FIND); + if (pkt_dev) { + pr_err("ERROR: interface already used\n"); + return -EBUSY; + } + + pkt_dev = kzalloc_node(sizeof(struct pktgen_dev), GFP_KERNEL, node); + if (!pkt_dev) + return -ENOMEM; + + strcpy(pkt_dev->odevname, ifname); + pkt_dev->flows = vzalloc_node(MAX_CFLOWS * sizeof(struct flow_state), + node); + if (pkt_dev->flows == NULL) { + kfree(pkt_dev); + return -ENOMEM; + } + + pkt_dev->removal_mark = 0; + pkt_dev->min_pkt_size = ETH_ZLEN; + pkt_dev->max_pkt_size = ETH_ZLEN; + pkt_dev->nfrags = 0; + pkt_dev->delay = pg_delay_d; + pkt_dev->count = pg_count_d; + pkt_dev->sofar = 0; + pkt_dev->udp_src_min = 9; /* sink port */ + pkt_dev->udp_src_max = 9; + pkt_dev->udp_dst_min = 9; + pkt_dev->udp_dst_max = 9; + pkt_dev->vlan_p = 0; + pkt_dev->vlan_cfi = 0; + pkt_dev->vlan_id = 0xffff; + pkt_dev->svlan_p = 0; + pkt_dev->svlan_cfi = 0; + pkt_dev->svlan_id = 0xffff; + pkt_dev->node = -1; + + err = pktgen_setup_dev(pkt_dev, ifname); + if (err) + goto out1; + if (pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING) + pkt_dev->clone_skb = pg_clone_skb_d; + + pkt_dev->entry = proc_create_data(ifname, 0600, pg_proc_dir, + &pktgen_if_fops, pkt_dev); + if (!pkt_dev->entry) { + pr_err("cannot create %s/%s procfs entry\n", + PG_PROC_DIR, ifname); + err = -EINVAL; + goto out2; + } +#ifdef CONFIG_XFRM + pkt_dev->ipsmode = XFRM_MODE_TRANSPORT; + pkt_dev->ipsproto = IPPROTO_ESP; +#endif + + return add_dev_to_thread(t, pkt_dev); +out2: + dev_put(pkt_dev->odev); +out1: +#ifdef CONFIG_XFRM + free_SAs(pkt_dev); +#endif + vfree(pkt_dev->flows); + kfree(pkt_dev); + return err; +} + +static int __init pktgen_create_thread(int cpu) +{ + struct pktgen_thread *t; + struct proc_dir_entry *pe; + struct task_struct *p; + + t = kzalloc_node(sizeof(struct pktgen_thread), GFP_KERNEL, + cpu_to_node(cpu)); + if (!t) { + pr_err("ERROR: out of memory, can't create new thread\n"); + return -ENOMEM; + } + + spin_lock_init(&t->if_lock); + t->cpu = cpu; + + INIT_LIST_HEAD(&t->if_list); + + list_add_tail(&t->th_list, &pktgen_threads); + init_completion(&t->start_done); + + p = kthread_create_on_node(pktgen_thread_worker, + t, + cpu_to_node(cpu), + "kpktgend_%d", cpu); + if (IS_ERR(p)) { + pr_err("kernel_thread() failed for cpu %d\n", t->cpu); + list_del(&t->th_list); + kfree(t); + return PTR_ERR(p); + } + kthread_bind(p, cpu); + t->tsk = p; + + pe = proc_create_data(t->tsk->comm, 0600, pg_proc_dir, + &pktgen_thread_fops, t); + if (!pe) { + pr_err("cannot create %s/%s procfs entry\n", + PG_PROC_DIR, t->tsk->comm); + kthread_stop(p); + list_del(&t->th_list); + kfree(t); + return -EINVAL; + } + + wake_up_process(p); + wait_for_completion(&t->start_done); + + return 0; +} + +/* + * Removes a device from the thread if_list. 
+ */ +static void _rem_dev_from_if_list(struct pktgen_thread *t, + struct pktgen_dev *pkt_dev) +{ + struct list_head *q, *n; + struct pktgen_dev *p; + + list_for_each_safe(q, n, &t->if_list) { + p = list_entry(q, struct pktgen_dev, list); + if (p == pkt_dev) + list_del(&p->list); + } +} + +static int pktgen_remove_device(struct pktgen_thread *t, + struct pktgen_dev *pkt_dev) +{ + + pr_debug("remove_device pkt_dev=%p\n", pkt_dev); + + if (pkt_dev->running) { + pr_warning("WARNING: trying to remove a running interface, stopping it now\n"); + pktgen_stop_device(pkt_dev); + } + + /* Dis-associate from the interface */ + + if (pkt_dev->odev) { + dev_put(pkt_dev->odev); + pkt_dev->odev = NULL; + } + + /* And update the thread if_list */ + + _rem_dev_from_if_list(t, pkt_dev); + + if (pkt_dev->entry) + remove_proc_entry(pkt_dev->entry->name, pg_proc_dir); + +#ifdef CONFIG_XFRM + free_SAs(pkt_dev); +#endif + vfree(pkt_dev->flows); + if (pkt_dev->page) + put_page(pkt_dev->page); + kfree(pkt_dev); + return 0; +} + +static int __init pg_init(void) +{ + int cpu; + struct proc_dir_entry *pe; + int ret = 0; + + pr_info("%s", version); + + pg_proc_dir = proc_mkdir(PG_PROC_DIR, init_net.proc_net); + if (!pg_proc_dir) + return -ENODEV; + + pe = proc_create(PGCTRL, 0600, pg_proc_dir, &pktgen_fops); + if (pe == NULL) { + pr_err("ERROR: cannot create %s procfs entry\n", PGCTRL); + ret = -EINVAL; + goto remove_dir; + } + + register_netdevice_notifier(&pktgen_notifier_block); + + for_each_online_cpu(cpu) { + int err; + + err = pktgen_create_thread(cpu); + if (err) + pr_warning("WARNING: Cannot create thread for cpu %d (%d)\n", + cpu, err); + } + + if (list_empty(&pktgen_threads)) { + pr_err("ERROR: Initialization failed for all threads\n"); + ret = -ENODEV; + goto unregister; + } + + return 0; + + unregister: + unregister_netdevice_notifier(&pktgen_notifier_block); + remove_proc_entry(PGCTRL, pg_proc_dir); + remove_dir: + proc_net_remove(&init_net, PG_PROC_DIR); + return ret; +} + +static void __exit pg_cleanup(void) +{ + struct pktgen_thread *t; + struct list_head *q, *n; + LIST_HEAD(list); + + /* Stop all interfaces & threads */ + pktgen_exiting = true; + + mutex_lock(&pktgen_thread_lock); + list_splice_init(&pktgen_threads, &list); + mutex_unlock(&pktgen_thread_lock); + + list_for_each_safe(q, n, &list) { + t = list_entry(q, struct pktgen_thread, th_list); + list_del(&t->th_list); + kthread_stop(t->tsk); + kfree(t); + } + + /* Un-register us from receiving netdevice events */ + unregister_netdevice_notifier(&pktgen_notifier_block); + + /* Clean up proc file system */ + remove_proc_entry(PGCTRL, pg_proc_dir); + proc_net_remove(&init_net, PG_PROC_DIR); +} + +module_init(pg_init); +module_exit(pg_cleanup); + +MODULE_AUTHOR("Robert Olsson "); +MODULE_DESCRIPTION("Packet Generator tool"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(VERSION); +module_param(pg_count_d, int, 0); +MODULE_PARM_DESC(pg_count_d, "Default number of packets to inject"); +module_param(pg_delay_d, int, 0); +MODULE_PARM_DESC(pg_delay_d, "Default delay between packets (nanoseconds)"); +module_param(pg_clone_skb_d, int, 0); +MODULE_PARM_DESC(pg_clone_skb_d, "Default number of copies of the same packet"); +module_param(debug, int, 0); +MODULE_PARM_DESC(debug, "Enable debugging of pktgen module"); diff --git a/net/core/request_sock.c b/net/core/request_sock.c new file mode 100644 index 00000000..182236b2 --- /dev/null +++ b/net/core/request_sock.c @@ -0,0 +1,131 @@ +/* + * NET Generic infrastructure for Network protocols. 
+ * + * Authors: Arnaldo Carvalho de Melo + * + * From code originally in include/net/tcp.h + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include + +/* + * Maximum number of SYN_RECV sockets in queue per LISTEN socket. + * One SYN_RECV socket costs about 80bytes on a 32bit machine. + * It would be better to replace it with a global counter for all sockets + * but then some measure against one socket starving all other sockets + * would be needed. + * + * It was 128 by default. Experiments with real servers show, that + * it is absolutely not enough even at 100conn/sec. 256 cures most + * of problems. This value is adjusted to 128 for very small machines + * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb). + * Note : Dont forget somaxconn that may limit backlog too. + */ +int sysctl_max_syn_backlog = 256; +EXPORT_SYMBOL(sysctl_max_syn_backlog); + +int reqsk_queue_alloc(struct request_sock_queue *queue, + unsigned int nr_table_entries) +{ + size_t lopt_size = sizeof(struct listen_sock); + struct listen_sock *lopt; + + nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog); + nr_table_entries = max_t(u32, nr_table_entries, 8); + nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); + lopt_size += nr_table_entries * sizeof(struct request_sock *); + if (lopt_size > PAGE_SIZE) + lopt = vzalloc(lopt_size); + else + lopt = kzalloc(lopt_size, GFP_KERNEL); + if (lopt == NULL) + return -ENOMEM; + + for (lopt->max_qlen_log = 3; + (1 << lopt->max_qlen_log) < nr_table_entries; + lopt->max_qlen_log++); + + get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); + rwlock_init(&queue->syn_wait_lock); + queue->rskq_accept_head = NULL; + lopt->nr_table_entries = nr_table_entries; + + write_lock_bh(&queue->syn_wait_lock); + queue->listen_opt = lopt; + write_unlock_bh(&queue->syn_wait_lock); + + return 0; +} + +void __reqsk_queue_destroy(struct request_sock_queue *queue) +{ + struct listen_sock *lopt; + size_t lopt_size; + + /* + * this is an error recovery path only + * no locking needed and the lopt is not NULL + */ + + lopt = queue->listen_opt; + lopt_size = sizeof(struct listen_sock) + + lopt->nr_table_entries * sizeof(struct request_sock *); + + if (lopt_size > PAGE_SIZE) + vfree(lopt); + else + kfree(lopt); +} + +static inline struct listen_sock *reqsk_queue_yank_listen_sk( + struct request_sock_queue *queue) +{ + struct listen_sock *lopt; + + write_lock_bh(&queue->syn_wait_lock); + lopt = queue->listen_opt; + queue->listen_opt = NULL; + write_unlock_bh(&queue->syn_wait_lock); + + return lopt; +} + +void reqsk_queue_destroy(struct request_sock_queue *queue) +{ + /* make all the listen_opt local to us */ + struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); + size_t lopt_size = sizeof(struct listen_sock) + + lopt->nr_table_entries * sizeof(struct request_sock *); + + if (lopt->qlen != 0) { + unsigned int i; + + for (i = 0; i < lopt->nr_table_entries; i++) { + struct request_sock *req; + + while ((req = lopt->syn_table[i]) != NULL) { + lopt->syn_table[i] = req->dl_next; + lopt->qlen--; + reqsk_free(req); + } + } + } + + WARN_ON(lopt->qlen != 0); + if (lopt_size > PAGE_SIZE) + vfree(lopt); + else + kfree(lopt); +} + diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c new file 
mode 100644 index 00000000..abd936d8 --- /dev/null +++ b/net/core/rtnetlink.c @@ -0,0 +1,2030 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Routing netlink socket interface: protocol independent part. + * + * Authors: Alexey Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Vitaly E. Lavrov RTA_OK arithmetics was wrong. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct rtnl_link { + rtnl_doit_func doit; + rtnl_dumpit_func dumpit; +}; + +static DEFINE_MUTEX(rtnl_mutex); + +void rtnl_lock(void) +{ + mutex_lock(&rtnl_mutex); +} +EXPORT_SYMBOL(rtnl_lock); + +void __rtnl_unlock(void) +{ + mutex_unlock(&rtnl_mutex); +} + +void rtnl_unlock(void) +{ + /* This fellow will unlock it for us. */ + netdev_run_todo(); +} +EXPORT_SYMBOL(rtnl_unlock); + +int rtnl_trylock(void) +{ + return mutex_trylock(&rtnl_mutex); +} +EXPORT_SYMBOL(rtnl_trylock); + +int rtnl_is_locked(void) +{ + return mutex_is_locked(&rtnl_mutex); +} +EXPORT_SYMBOL(rtnl_is_locked); + +#ifdef CONFIG_PROVE_LOCKING +int lockdep_rtnl_is_held(void) +{ + return lockdep_is_held(&rtnl_mutex); +} +EXPORT_SYMBOL(lockdep_rtnl_is_held); +#endif /* #ifdef CONFIG_PROVE_LOCKING */ + +static struct rtnl_link *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; + +static inline int rtm_msgindex(int msgtype) +{ + int msgindex = msgtype - RTM_BASE; + + /* + * msgindex < 0 implies someone tried to register a netlink + * control code. msgindex >= RTM_NR_MSGTYPES may indicate that + * the message type has not been added to linux/rtnetlink.h + */ + BUG_ON(msgindex < 0 || msgindex >= RTM_NR_MSGTYPES); + + return msgindex; +} + +static rtnl_doit_func rtnl_get_doit(int protocol, int msgindex) +{ + struct rtnl_link *tab; + + if (protocol <= RTNL_FAMILY_MAX) + tab = rtnl_msg_handlers[protocol]; + else + tab = NULL; + + if (tab == NULL || tab[msgindex].doit == NULL) + tab = rtnl_msg_handlers[PF_UNSPEC]; + + return tab ? tab[msgindex].doit : NULL; +} + +static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex) +{ + struct rtnl_link *tab; + + if (protocol <= RTNL_FAMILY_MAX) + tab = rtnl_msg_handlers[protocol]; + else + tab = NULL; + + if (tab == NULL || tab[msgindex].dumpit == NULL) + tab = rtnl_msg_handlers[PF_UNSPEC]; + + return tab ? tab[msgindex].dumpit : NULL; +} + +/** + * __rtnl_register - Register a rtnetlink message type + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * @doit: Function pointer called for each request message + * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message + * + * Registers the specified function pointers (at least one of them has + * to be non-NULL) to be called whenever a request message for the + * specified protocol family and message type is received. 
+ * + * The special protocol family PF_UNSPEC may be used to define fallback + * function pointers for the case when no entry for the specific protocol + * family exists. + * + * Returns 0 on success or a negative error code. + */ +int __rtnl_register(int protocol, int msgtype, + rtnl_doit_func doit, rtnl_dumpit_func dumpit) +{ + struct rtnl_link *tab; + int msgindex; + + BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); + msgindex = rtm_msgindex(msgtype); + + tab = rtnl_msg_handlers[protocol]; + if (tab == NULL) { + tab = kcalloc(RTM_NR_MSGTYPES, sizeof(*tab), GFP_KERNEL); + if (tab == NULL) + return -ENOBUFS; + + rtnl_msg_handlers[protocol] = tab; + } + + if (doit) + tab[msgindex].doit = doit; + + if (dumpit) + tab[msgindex].dumpit = dumpit; + + return 0; +} +EXPORT_SYMBOL_GPL(__rtnl_register); + +/** + * rtnl_register - Register a rtnetlink message type + * + * Identical to __rtnl_register() but panics on failure. This is useful + * as failure of this function is very unlikely, it can only happen due + * to lack of memory when allocating the chain to store all message + * handlers for a protocol. Meant for use in init functions where lack + * of memory implies no sense in continuing. + */ +void rtnl_register(int protocol, int msgtype, + rtnl_doit_func doit, rtnl_dumpit_func dumpit) +{ + if (__rtnl_register(protocol, msgtype, doit, dumpit) < 0) + panic("Unable to register rtnetlink message handler, " + "protocol = %d, message type = %d\n", + protocol, msgtype); +} +EXPORT_SYMBOL_GPL(rtnl_register); + +/** + * rtnl_unregister - Unregister a rtnetlink message type + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * + * Returns 0 on success or a negative error code. + */ +int rtnl_unregister(int protocol, int msgtype) +{ + int msgindex; + + BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); + msgindex = rtm_msgindex(msgtype); + + if (rtnl_msg_handlers[protocol] == NULL) + return -ENOENT; + + rtnl_msg_handlers[protocol][msgindex].doit = NULL; + rtnl_msg_handlers[protocol][msgindex].dumpit = NULL; + + return 0; +} +EXPORT_SYMBOL_GPL(rtnl_unregister); + +/** + * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol + * @protocol : Protocol family or PF_UNSPEC + * + * Identical to calling rtnl_unregster() for all registered message types + * of a certain protocol family. + */ +void rtnl_unregister_all(int protocol) +{ + BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); + + kfree(rtnl_msg_handlers[protocol]); + rtnl_msg_handlers[protocol] = NULL; +} +EXPORT_SYMBOL_GPL(rtnl_unregister_all); + +static LIST_HEAD(link_ops); + +/** + * __rtnl_link_register - Register rtnl_link_ops with rtnetlink. + * @ops: struct rtnl_link_ops * to register + * + * The caller must hold the rtnl_mutex. This function should be used + * by drivers that create devices during module initialization. It + * must be called before registering the devices. + * + * Returns 0 on success or a negative error code. + */ +int __rtnl_link_register(struct rtnl_link_ops *ops) +{ + if (!ops->dellink) + ops->dellink = unregister_netdevice_queue; + + list_add_tail(&ops->list, &link_ops); + return 0; +} +EXPORT_SYMBOL_GPL(__rtnl_link_register); + +/** + * rtnl_link_register - Register rtnl_link_ops with rtnetlink. + * @ops: struct rtnl_link_ops * to register + * + * Returns 0 on success or a negative error code. 
+ */ +int rtnl_link_register(struct rtnl_link_ops *ops) +{ + int err; + + rtnl_lock(); + err = __rtnl_link_register(ops); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL_GPL(rtnl_link_register); + +static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) +{ + struct net_device *dev; + LIST_HEAD(list_kill); + + for_each_netdev(net, dev) { + if (dev->rtnl_link_ops == ops) + ops->dellink(dev, &list_kill); + } + unregister_netdevice_many(&list_kill); +} + +/** + * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. + * @ops: struct rtnl_link_ops * to unregister + * + * The caller must hold the rtnl_mutex. + */ +void __rtnl_link_unregister(struct rtnl_link_ops *ops) +{ + struct net *net; + + for_each_net(net) { + __rtnl_kill_links(net, ops); + } + list_del(&ops->list); +} +EXPORT_SYMBOL_GPL(__rtnl_link_unregister); + +/** + * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. + * @ops: struct rtnl_link_ops * to unregister + */ +void rtnl_link_unregister(struct rtnl_link_ops *ops) +{ + rtnl_lock(); + __rtnl_link_unregister(ops); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(rtnl_link_unregister); + +static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) +{ + const struct rtnl_link_ops *ops; + + list_for_each_entry(ops, &link_ops, list) { + if (!strcmp(ops->kind, kind)) + return ops; + } + return NULL; +} + +static size_t rtnl_link_get_size(const struct net_device *dev) +{ + const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + size_t size; + + if (!ops) + return 0; + + size = nla_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */ + nla_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */ + + if (ops->get_size) + /* IFLA_INFO_DATA + nested data */ + size += nla_total_size(sizeof(struct nlattr)) + + ops->get_size(dev); + + if (ops->get_xstats_size) + /* IFLA_INFO_XSTATS */ + size += nla_total_size(ops->get_xstats_size(dev)); + + return size; +} + +static LIST_HEAD(rtnl_af_ops); + +static const struct rtnl_af_ops *rtnl_af_lookup(const int family) +{ + const struct rtnl_af_ops *ops; + + list_for_each_entry(ops, &rtnl_af_ops, list) { + if (ops->family == family) + return ops; + } + + return NULL; +} + +/** + * __rtnl_af_register - Register rtnl_af_ops with rtnetlink. + * @ops: struct rtnl_af_ops * to register + * + * The caller must hold the rtnl_mutex. + * + * Returns 0 on success or a negative error code. + */ +int __rtnl_af_register(struct rtnl_af_ops *ops) +{ + list_add_tail(&ops->list, &rtnl_af_ops); + return 0; +} +EXPORT_SYMBOL_GPL(__rtnl_af_register); + +/** + * rtnl_af_register - Register rtnl_af_ops with rtnetlink. + * @ops: struct rtnl_af_ops * to register + * + * Returns 0 on success or a negative error code. + */ +int rtnl_af_register(struct rtnl_af_ops *ops) +{ + int err; + + rtnl_lock(); + err = __rtnl_af_register(ops); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL_GPL(rtnl_af_register); + +/** + * __rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. + * @ops: struct rtnl_af_ops * to unregister + * + * The caller must hold the rtnl_mutex. + */ +void __rtnl_af_unregister(struct rtnl_af_ops *ops) +{ + list_del(&ops->list); +} +EXPORT_SYMBOL_GPL(__rtnl_af_unregister); + +/** + * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. 
+ * @ops: struct rtnl_af_ops * to unregister + */ +void rtnl_af_unregister(struct rtnl_af_ops *ops) +{ + rtnl_lock(); + __rtnl_af_unregister(ops); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(rtnl_af_unregister); + +static size_t rtnl_link_get_af_size(const struct net_device *dev) +{ + struct rtnl_af_ops *af_ops; + size_t size; + + /* IFLA_AF_SPEC */ + size = nla_total_size(sizeof(struct nlattr)); + + list_for_each_entry(af_ops, &rtnl_af_ops, list) { + if (af_ops->get_link_af_size) { + /* AF_* + nested data */ + size += nla_total_size(sizeof(struct nlattr)) + + af_ops->get_link_af_size(dev); + } + } + + return size; +} + +static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev) +{ + const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + struct nlattr *linkinfo, *data; + int err = -EMSGSIZE; + + linkinfo = nla_nest_start(skb, IFLA_LINKINFO); + if (linkinfo == NULL) + goto out; + + if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0) + goto err_cancel_link; + if (ops->fill_xstats) { + err = ops->fill_xstats(skb, dev); + if (err < 0) + goto err_cancel_link; + } + if (ops->fill_info) { + data = nla_nest_start(skb, IFLA_INFO_DATA); + if (data == NULL) + goto err_cancel_link; + err = ops->fill_info(skb, dev); + if (err < 0) + goto err_cancel_data; + nla_nest_end(skb, data); + } + + nla_nest_end(skb, linkinfo); + return 0; + +err_cancel_data: + nla_nest_cancel(skb, data); +err_cancel_link: + nla_nest_cancel(skb, linkinfo); +out: + return err; +} + +static const int rtm_min[RTM_NR_FAMILIES] = +{ + [RTM_FAM(RTM_NEWLINK)] = NLMSG_LENGTH(sizeof(struct ifinfomsg)), + [RTM_FAM(RTM_NEWADDR)] = NLMSG_LENGTH(sizeof(struct ifaddrmsg)), + [RTM_FAM(RTM_NEWROUTE)] = NLMSG_LENGTH(sizeof(struct rtmsg)), + [RTM_FAM(RTM_NEWRULE)] = NLMSG_LENGTH(sizeof(struct fib_rule_hdr)), + [RTM_FAM(RTM_NEWQDISC)] = NLMSG_LENGTH(sizeof(struct tcmsg)), + [RTM_FAM(RTM_NEWTCLASS)] = NLMSG_LENGTH(sizeof(struct tcmsg)), + [RTM_FAM(RTM_NEWTFILTER)] = NLMSG_LENGTH(sizeof(struct tcmsg)), + [RTM_FAM(RTM_NEWACTION)] = NLMSG_LENGTH(sizeof(struct tcamsg)), + [RTM_FAM(RTM_GETMULTICAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)), + [RTM_FAM(RTM_GETANYCAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)), +}; + +static const int rta_max[RTM_NR_FAMILIES] = +{ + [RTM_FAM(RTM_NEWLINK)] = IFLA_MAX, + [RTM_FAM(RTM_NEWADDR)] = IFA_MAX, + [RTM_FAM(RTM_NEWROUTE)] = RTA_MAX, + [RTM_FAM(RTM_NEWRULE)] = FRA_MAX, + [RTM_FAM(RTM_NEWQDISC)] = TCA_MAX, + [RTM_FAM(RTM_NEWTCLASS)] = TCA_MAX, + [RTM_FAM(RTM_NEWTFILTER)] = TCA_MAX, + [RTM_FAM(RTM_NEWACTION)] = TCAA_MAX, +}; + +void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data) +{ + struct rtattr *rta; + int size = RTA_LENGTH(attrlen); + + rta = (struct rtattr *)skb_put(skb, RTA_ALIGN(size)); + rta->rta_type = attrtype; + rta->rta_len = size; + memcpy(RTA_DATA(rta), data, attrlen); + memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size); +} +EXPORT_SYMBOL(__rta_fill); + +int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, int echo) +{ + struct sock *rtnl = net->rtnl; + int err = 0; + + NETLINK_CB(skb).dst_group = group; + if (echo) + atomic_inc(&skb->users); + netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL); + if (echo) + err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); + return err; +} + +int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) +{ + struct sock *rtnl = net->rtnl; + + return nlmsg_unicast(rtnl, skb, pid); +} +EXPORT_SYMBOL(rtnl_unicast); + +void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, 
u32 group, + struct nlmsghdr *nlh, gfp_t flags) +{ + struct sock *rtnl = net->rtnl; + int report = 0; + + if (nlh) + report = nlmsg_report(nlh); + + nlmsg_notify(rtnl, skb, pid, group, report, flags); +} +EXPORT_SYMBOL(rtnl_notify); + +void rtnl_set_sk_err(struct net *net, u32 group, int error) +{ + struct sock *rtnl = net->rtnl; + + netlink_set_err(rtnl, 0, group, error); +} +EXPORT_SYMBOL(rtnl_set_sk_err); + +int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) +{ + struct nlattr *mx; + int i, valid = 0; + + mx = nla_nest_start(skb, RTA_METRICS); + if (mx == NULL) + return -ENOBUFS; + + for (i = 0; i < RTAX_MAX; i++) { + if (metrics[i]) { + valid++; + NLA_PUT_U32(skb, i+1, metrics[i]); + } + } + + if (!valid) { + nla_nest_cancel(skb, mx); + return 0; + } + + return nla_nest_end(skb, mx); + +nla_put_failure: + nla_nest_cancel(skb, mx); + return -EMSGSIZE; +} +EXPORT_SYMBOL(rtnetlink_put_metrics); + +int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, + u32 ts, u32 tsage, long expires, u32 error) +{ + struct rta_cacheinfo ci = { + .rta_lastuse = jiffies_to_clock_t(jiffies - dst->lastuse), + .rta_used = dst->__use, + .rta_clntref = atomic_read(&(dst->__refcnt)), + .rta_error = error, + .rta_id = id, + .rta_ts = ts, + .rta_tsage = tsage, + }; + + if (expires) + ci.rta_expires = jiffies_to_clock_t(expires); + + return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci); +} +EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); + +static void set_operstate(struct net_device *dev, unsigned char transition) +{ + unsigned char operstate = dev->operstate; + + switch (transition) { + case IF_OPER_UP: + if ((operstate == IF_OPER_DORMANT || + operstate == IF_OPER_UNKNOWN) && + !netif_dormant(dev)) + operstate = IF_OPER_UP; + break; + + case IF_OPER_DORMANT: + if (operstate == IF_OPER_UP || + operstate == IF_OPER_UNKNOWN) + operstate = IF_OPER_DORMANT; + break; + } + + if (dev->operstate != operstate) { + write_lock_bh(&dev_base_lock); + dev->operstate = operstate; + write_unlock_bh(&dev_base_lock); + netdev_state_change(dev); + } +} + +static unsigned int rtnl_dev_combine_flags(const struct net_device *dev, + const struct ifinfomsg *ifm) +{ + unsigned int flags = ifm->ifi_flags; + + /* bugwards compatibility: ifi_change == 0 is treated as ~0 */ + if (ifm->ifi_change) + flags = (flags & ifm->ifi_change) | + (dev->flags & ~ifm->ifi_change); + + return flags; +} + +static void copy_rtnl_link_stats(struct rtnl_link_stats *a, + const struct rtnl_link_stats64 *b) +{ + a->rx_packets = b->rx_packets; + a->tx_packets = b->tx_packets; + a->rx_bytes = b->rx_bytes; + a->tx_bytes = b->tx_bytes; + a->rx_errors = b->rx_errors; + a->tx_errors = b->tx_errors; + a->rx_dropped = b->rx_dropped; + a->tx_dropped = b->tx_dropped; + + a->multicast = b->multicast; + a->collisions = b->collisions; + + a->rx_length_errors = b->rx_length_errors; + a->rx_over_errors = b->rx_over_errors; + a->rx_crc_errors = b->rx_crc_errors; + a->rx_frame_errors = b->rx_frame_errors; + a->rx_fifo_errors = b->rx_fifo_errors; + a->rx_missed_errors = b->rx_missed_errors; + + a->tx_aborted_errors = b->tx_aborted_errors; + a->tx_carrier_errors = b->tx_carrier_errors; + a->tx_fifo_errors = b->tx_fifo_errors; + a->tx_heartbeat_errors = b->tx_heartbeat_errors; + a->tx_window_errors = b->tx_window_errors; + + a->rx_compressed = b->rx_compressed; + a->tx_compressed = b->tx_compressed; +} + +static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b) +{ + memcpy(v, b, sizeof(*b)); +} + +/* All VF info */ +static inline int 
rtnl_vfinfo_size(const struct net_device *dev) +{ + if (dev->dev.parent && dev_is_pci(dev->dev.parent)) { + + int num_vfs = dev_num_vf(dev->dev.parent); + size_t size = nla_total_size(sizeof(struct nlattr)); + size += nla_total_size(num_vfs * sizeof(struct nlattr)); + size += num_vfs * + (nla_total_size(sizeof(struct ifla_vf_mac)) + + nla_total_size(sizeof(struct ifla_vf_vlan)) + + nla_total_size(sizeof(struct ifla_vf_tx_rate))); + return size; + } else + return 0; +} + +static size_t rtnl_port_size(const struct net_device *dev) +{ + size_t port_size = nla_total_size(4) /* PORT_VF */ + + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */ + + nla_total_size(sizeof(struct ifla_port_vsi)) + /* PORT_VSI_TYPE */ + + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */ + + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */ + + nla_total_size(1) /* PROT_VDP_REQUEST */ + + nla_total_size(2); /* PORT_VDP_RESPONSE */ + size_t vf_ports_size = nla_total_size(sizeof(struct nlattr)); + size_t vf_port_size = nla_total_size(sizeof(struct nlattr)) + + port_size; + size_t port_self_size = nla_total_size(sizeof(struct nlattr)) + + port_size; + + if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent) + return 0; + if (dev_num_vf(dev->dev.parent)) + return port_self_size + vf_ports_size + + vf_port_size * dev_num_vf(dev->dev.parent); + else + return port_self_size; +} + +static noinline size_t if_nlmsg_size(const struct net_device *dev) +{ + return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */ + + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */ + + nla_total_size(sizeof(struct rtnl_link_ifmap)) + + nla_total_size(sizeof(struct rtnl_link_stats)) + + nla_total_size(sizeof(struct rtnl_link_stats64)) + + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */ + + nla_total_size(4) /* IFLA_TXQLEN */ + + nla_total_size(4) /* IFLA_WEIGHT */ + + nla_total_size(4) /* IFLA_MTU */ + + nla_total_size(4) /* IFLA_LINK */ + + nla_total_size(4) /* IFLA_MASTER */ + + nla_total_size(1) /* IFLA_OPERSTATE */ + + nla_total_size(1) /* IFLA_LINKMODE */ + + nla_total_size(4) /* IFLA_NUM_VF */ + + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */ + + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */ +} + +static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) +{ + struct nlattr *vf_ports; + struct nlattr *vf_port; + int vf; + int err; + + vf_ports = nla_nest_start(skb, IFLA_VF_PORTS); + if (!vf_ports) + return -EMSGSIZE; + + for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) { + vf_port = nla_nest_start(skb, IFLA_VF_PORT); + if (!vf_port) + goto nla_put_failure; + NLA_PUT_U32(skb, IFLA_PORT_VF, vf); + err = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb); + if (err == -EMSGSIZE) + goto nla_put_failure; + if (err) { + nla_nest_cancel(skb, vf_port); + continue; + } + nla_nest_end(skb, vf_port); + } + + nla_nest_end(skb, vf_ports); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, vf_ports); + return -EMSGSIZE; +} + +static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev) +{ + struct nlattr *port_self; + int err; + + port_self = nla_nest_start(skb, IFLA_PORT_SELF); + if (!port_self) + return -EMSGSIZE; + + err = dev->netdev_ops->ndo_get_vf_port(dev, PORT_SELF_VF, skb); + if (err) { + nla_nest_cancel(skb, port_self); + return (err == -EMSGSIZE) ? 
err : 0; + } + + nla_nest_end(skb, port_self); + + return 0; +} + +static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev) +{ + int err; + + if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent) + return 0; + + err = rtnl_port_self_fill(skb, dev); + if (err) + return err; + + if (dev_num_vf(dev->dev.parent)) { + err = rtnl_vf_ports_fill(skb, dev); + if (err) + return err; + } + + return 0; +} + +static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, + int type, u32 pid, u32 seq, u32 change, + unsigned int flags) +{ + struct ifinfomsg *ifm; + struct nlmsghdr *nlh; + struct rtnl_link_stats64 temp; + const struct rtnl_link_stats64 *stats; + struct nlattr *attr, *af_spec; + struct rtnl_af_ops *af_ops; + + ASSERT_RTNL(); + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); + if (nlh == NULL) + return -EMSGSIZE; + + ifm = nlmsg_data(nlh); + ifm->ifi_family = AF_UNSPEC; + ifm->__ifi_pad = 0; + ifm->ifi_type = dev->type; + ifm->ifi_index = dev->ifindex; + ifm->ifi_flags = dev_get_flags(dev); + ifm->ifi_change = change; + + NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name); + NLA_PUT_U32(skb, IFLA_TXQLEN, dev->tx_queue_len); + NLA_PUT_U8(skb, IFLA_OPERSTATE, + netif_running(dev) ? dev->operstate : IF_OPER_DOWN); + NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode); + NLA_PUT_U32(skb, IFLA_MTU, dev->mtu); + NLA_PUT_U32(skb, IFLA_GROUP, dev->group); + + if (dev->ifindex != dev->iflink) + NLA_PUT_U32(skb, IFLA_LINK, dev->iflink); + + if (dev->master) + NLA_PUT_U32(skb, IFLA_MASTER, dev->master->ifindex); + + if (dev->qdisc) + NLA_PUT_STRING(skb, IFLA_QDISC, dev->qdisc->ops->id); + + if (dev->ifalias) + NLA_PUT_STRING(skb, IFLA_IFALIAS, dev->ifalias); + + if (1) { + struct rtnl_link_ifmap map = { + .mem_start = dev->mem_start, + .mem_end = dev->mem_end, + .base_addr = dev->base_addr, + .irq = dev->irq, + .dma = dev->dma, + .port = dev->if_port, + }; + NLA_PUT(skb, IFLA_MAP, sizeof(map), &map); + } + + if (dev->addr_len) { + NLA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); + NLA_PUT(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast); + } + + attr = nla_reserve(skb, IFLA_STATS, + sizeof(struct rtnl_link_stats)); + if (attr == NULL) + goto nla_put_failure; + + stats = dev_get_stats(dev, &temp); + copy_rtnl_link_stats(nla_data(attr), stats); + + attr = nla_reserve(skb, IFLA_STATS64, + sizeof(struct rtnl_link_stats64)); + if (attr == NULL) + goto nla_put_failure; + copy_rtnl_link_stats64(nla_data(attr), stats); + + if (dev->dev.parent) + NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent)); + + if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) { + int i; + + struct nlattr *vfinfo, *vf; + int num_vfs = dev_num_vf(dev->dev.parent); + + vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); + if (!vfinfo) + goto nla_put_failure; + for (i = 0; i < num_vfs; i++) { + struct ifla_vf_info ivi; + struct ifla_vf_mac vf_mac; + struct ifla_vf_vlan vf_vlan; + struct ifla_vf_tx_rate vf_tx_rate; + if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi)) + break; + vf_mac.vf = vf_vlan.vf = vf_tx_rate.vf = ivi.vf; + memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); + vf_vlan.vlan = ivi.vlan; + vf_vlan.qos = ivi.qos; + vf_tx_rate.rate = ivi.tx_rate; + vf = nla_nest_start(skb, IFLA_VF_INFO); + if (!vf) { + nla_nest_cancel(skb, vfinfo); + goto nla_put_failure; + } + NLA_PUT(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac); + NLA_PUT(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan); + NLA_PUT(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate); + nla_nest_end(skb, vf); + } + 
nla_nest_end(skb, vfinfo); + } + + if (rtnl_port_fill(skb, dev)) + goto nla_put_failure; + + if (dev->rtnl_link_ops) { + if (rtnl_link_fill(skb, dev) < 0) + goto nla_put_failure; + } + + if (!(af_spec = nla_nest_start(skb, IFLA_AF_SPEC))) + goto nla_put_failure; + + list_for_each_entry(af_ops, &rtnl_af_ops, list) { + if (af_ops->fill_link_af) { + struct nlattr *af; + int err; + + if (!(af = nla_nest_start(skb, af_ops->family))) + goto nla_put_failure; + + err = af_ops->fill_link_af(skb, dev); + + /* + * Caller may return ENODATA to indicate that there + * was no data to be dumped. This is not an error, it + * means we should trim the attribute header and + * continue. + */ + if (err == -ENODATA) + nla_nest_cancel(skb, af); + else if (err < 0) + goto nla_put_failure; + + nla_nest_end(skb, af); + } + } + + nla_nest_end(skb, af_spec); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + int h, s_h; + int idx = 0, s_idx; + struct net_device *dev; + struct hlist_head *head; + struct hlist_node *node; + + s_h = cb->args[0]; + s_idx = cb->args[1]; + + rcu_read_lock(); + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + hlist_for_each_entry_rcu(dev, node, head, index_hlist) { + if (idx < s_idx) + goto cont; + if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, 0, + NLM_F_MULTI) <= 0) + goto out; +cont: + idx++; + } + } +out: + rcu_read_unlock(); + cb->args[1] = idx; + cb->args[0] = h; + + return skb->len; +} + +const struct nla_policy ifla_policy[IFLA_MAX+1] = { + [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 }, + [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) }, + [IFLA_MTU] = { .type = NLA_U32 }, + [IFLA_LINK] = { .type = NLA_U32 }, + [IFLA_MASTER] = { .type = NLA_U32 }, + [IFLA_TXQLEN] = { .type = NLA_U32 }, + [IFLA_WEIGHT] = { .type = NLA_U32 }, + [IFLA_OPERSTATE] = { .type = NLA_U8 }, + [IFLA_LINKMODE] = { .type = NLA_U8 }, + [IFLA_LINKINFO] = { .type = NLA_NESTED }, + [IFLA_NET_NS_PID] = { .type = NLA_U32 }, + [IFLA_NET_NS_FD] = { .type = NLA_U32 }, + [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 }, + [IFLA_VFINFO_LIST] = {. 
type = NLA_NESTED }, + [IFLA_VF_PORTS] = { .type = NLA_NESTED }, + [IFLA_PORT_SELF] = { .type = NLA_NESTED }, + [IFLA_AF_SPEC] = { .type = NLA_NESTED }, +}; +EXPORT_SYMBOL(ifla_policy); + +static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { + [IFLA_INFO_KIND] = { .type = NLA_STRING }, + [IFLA_INFO_DATA] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy ifla_vfinfo_policy[IFLA_VF_INFO_MAX+1] = { + [IFLA_VF_INFO] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { + [IFLA_VF_MAC] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_mac) }, + [IFLA_VF_VLAN] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_vlan) }, + [IFLA_VF_TX_RATE] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_tx_rate) }, +}; + +static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { + [IFLA_PORT_VF] = { .type = NLA_U32 }, + [IFLA_PORT_PROFILE] = { .type = NLA_STRING, + .len = PORT_PROFILE_MAX }, + [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_port_vsi)}, + [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY, + .len = PORT_UUID_MAX }, + [IFLA_PORT_HOST_UUID] = { .type = NLA_STRING, + .len = PORT_UUID_MAX }, + [IFLA_PORT_REQUEST] = { .type = NLA_U8, }, + [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, +}; + +struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) +{ + struct net *net; + /* Examine the link attributes and figure out which + * network namespace we are talking about. + */ + if (tb[IFLA_NET_NS_PID]) + net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); + else if (tb[IFLA_NET_NS_FD]) + net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD])); + else + net = get_net(src_net); + return net; +} +EXPORT_SYMBOL(rtnl_link_get_net); + +static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) +{ + if (dev) { + if (tb[IFLA_ADDRESS] && + nla_len(tb[IFLA_ADDRESS]) < dev->addr_len) + return -EINVAL; + + if (tb[IFLA_BROADCAST] && + nla_len(tb[IFLA_BROADCAST]) < dev->addr_len) + return -EINVAL; + } + + if (tb[IFLA_AF_SPEC]) { + struct nlattr *af; + int rem, err; + + nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { + const struct rtnl_af_ops *af_ops; + + if (!(af_ops = rtnl_af_lookup(nla_type(af)))) + return -EAFNOSUPPORT; + + if (!af_ops->set_link_af) + return -EOPNOTSUPP; + + if (af_ops->validate_link_af) { + err = af_ops->validate_link_af(dev, af); + if (err < 0) + return err; + } + } + } + + return 0; +} + +static int do_setvfinfo(struct net_device *dev, struct nlattr *attr) +{ + int rem, err = -EINVAL; + struct nlattr *vf; + const struct net_device_ops *ops = dev->netdev_ops; + + nla_for_each_nested(vf, attr, rem) { + switch (nla_type(vf)) { + case IFLA_VF_MAC: { + struct ifla_vf_mac *ivm; + ivm = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_mac) + err = ops->ndo_set_vf_mac(dev, ivm->vf, + ivm->mac); + break; + } + case IFLA_VF_VLAN: { + struct ifla_vf_vlan *ivv; + ivv = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_vlan) + err = ops->ndo_set_vf_vlan(dev, ivv->vf, + ivv->vlan, + ivv->qos); + break; + } + case IFLA_VF_TX_RATE: { + struct ifla_vf_tx_rate *ivt; + ivt = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_tx_rate) + err = ops->ndo_set_vf_tx_rate(dev, ivt->vf, + ivt->rate); + break; + } + default: + err = -EINVAL; + break; + } + if (err) + break; + } + return err; +} + +static int do_set_master(struct net_device *dev, int ifindex) +{ + struct net_device *master_dev; + const struct net_device_ops *ops; + int err; + + if 
(dev->master) { + if (dev->master->ifindex == ifindex) + return 0; + ops = dev->master->netdev_ops; + if (ops->ndo_del_slave) { + err = ops->ndo_del_slave(dev->master, dev); + if (err) + return err; + } else { + return -EOPNOTSUPP; + } + } + + if (ifindex) { + master_dev = __dev_get_by_index(dev_net(dev), ifindex); + if (!master_dev) + return -EINVAL; + ops = master_dev->netdev_ops; + if (ops->ndo_add_slave) { + err = ops->ndo_add_slave(master_dev, dev); + if (err) + return err; + } else { + return -EOPNOTSUPP; + } + } + return 0; +} + +static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, + struct nlattr **tb, char *ifname, int modified) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int send_addr_notify = 0; + int err; + + if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) { + struct net *net = rtnl_link_get_net(dev_net(dev), tb); + if (IS_ERR(net)) { + err = PTR_ERR(net); + goto errout; + } + err = dev_change_net_namespace(dev, net, ifname); + put_net(net); + if (err) + goto errout; + modified = 1; + } + + if (tb[IFLA_MAP]) { + struct rtnl_link_ifmap *u_map; + struct ifmap k_map; + + if (!ops->ndo_set_config) { + err = -EOPNOTSUPP; + goto errout; + } + + if (!netif_device_present(dev)) { + err = -ENODEV; + goto errout; + } + + u_map = nla_data(tb[IFLA_MAP]); + k_map.mem_start = (unsigned long) u_map->mem_start; + k_map.mem_end = (unsigned long) u_map->mem_end; + k_map.base_addr = (unsigned short) u_map->base_addr; + k_map.irq = (unsigned char) u_map->irq; + k_map.dma = (unsigned char) u_map->dma; + k_map.port = (unsigned char) u_map->port; + + err = ops->ndo_set_config(dev, &k_map); + if (err < 0) + goto errout; + + modified = 1; + } + + if (tb[IFLA_ADDRESS]) { + struct sockaddr *sa; + int len; + + if (!ops->ndo_set_mac_address) { + err = -EOPNOTSUPP; + goto errout; + } + + if (!netif_device_present(dev)) { + err = -ENODEV; + goto errout; + } + + len = sizeof(sa_family_t) + dev->addr_len; + sa = kmalloc(len, GFP_KERNEL); + if (!sa) { + err = -ENOMEM; + goto errout; + } + sa->sa_family = dev->type; + memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), + dev->addr_len); + err = ops->ndo_set_mac_address(dev, sa); + kfree(sa); + if (err) + goto errout; + send_addr_notify = 1; + modified = 1; + } + + if (tb[IFLA_MTU]) { + err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU])); + if (err < 0) + goto errout; + modified = 1; + } + + if (tb[IFLA_GROUP]) { + dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + modified = 1; + } + + /* + * Interface selected by interface index but interface + * name provided implies that a name change has been + * requested. 
+ */ + if (ifm->ifi_index > 0 && ifname[0]) { + err = dev_change_name(dev, ifname); + if (err < 0) + goto errout; + modified = 1; + } + + if (tb[IFLA_IFALIAS]) { + err = dev_set_alias(dev, nla_data(tb[IFLA_IFALIAS]), + nla_len(tb[IFLA_IFALIAS])); + if (err < 0) + goto errout; + modified = 1; + } + + if (tb[IFLA_BROADCAST]) { + nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len); + send_addr_notify = 1; + } + + if (ifm->ifi_flags || ifm->ifi_change) { + err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); + if (err < 0) + goto errout; + } + + if (tb[IFLA_MASTER]) { + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); + if (err) + goto errout; + modified = 1; + } + + if (tb[IFLA_TXQLEN]) + dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); + + if (tb[IFLA_OPERSTATE]) + set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); + + if (tb[IFLA_LINKMODE]) { + write_lock_bh(&dev_base_lock); + dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); + write_unlock_bh(&dev_base_lock); + } + + if (tb[IFLA_VFINFO_LIST]) { + struct nlattr *attr; + int rem; + nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) { + if (nla_type(attr) != IFLA_VF_INFO) { + err = -EINVAL; + goto errout; + } + err = do_setvfinfo(dev, attr); + if (err < 0) + goto errout; + modified = 1; + } + } + err = 0; + + if (tb[IFLA_VF_PORTS]) { + struct nlattr *port[IFLA_PORT_MAX+1]; + struct nlattr *attr; + int vf; + int rem; + + err = -EOPNOTSUPP; + if (!ops->ndo_set_vf_port) + goto errout; + + nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) { + if (nla_type(attr) != IFLA_VF_PORT) + continue; + err = nla_parse_nested(port, IFLA_PORT_MAX, + attr, ifla_port_policy); + if (err < 0) + goto errout; + if (!port[IFLA_PORT_VF]) { + err = -EOPNOTSUPP; + goto errout; + } + vf = nla_get_u32(port[IFLA_PORT_VF]); + err = ops->ndo_set_vf_port(dev, vf, port); + if (err < 0) + goto errout; + modified = 1; + } + } + err = 0; + + if (tb[IFLA_PORT_SELF]) { + struct nlattr *port[IFLA_PORT_MAX+1]; + + err = nla_parse_nested(port, IFLA_PORT_MAX, + tb[IFLA_PORT_SELF], ifla_port_policy); + if (err < 0) + goto errout; + + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_port) + err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port); + if (err < 0) + goto errout; + modified = 1; + } + + if (tb[IFLA_AF_SPEC]) { + struct nlattr *af; + int rem; + + nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { + const struct rtnl_af_ops *af_ops; + + if (!(af_ops = rtnl_af_lookup(nla_type(af)))) + BUG(); + + err = af_ops->set_link_af(dev, af); + if (err < 0) + goto errout; + + modified = 1; + } + } + err = 0; + +errout: + if (err < 0 && modified && net_ratelimit()) + printk(KERN_WARNING "A link change request failed with " + "some changes committed already. 
Interface %s may " + "have been left with an inconsistent configuration, " + "please check.\n", dev->name); + + if (send_addr_notify) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; +} + +static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct ifinfomsg *ifm; + struct net_device *dev; + int err; + struct nlattr *tb[IFLA_MAX+1]; + char ifname[IFNAMSIZ]; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + if (err < 0) + goto errout; + + if (tb[IFLA_IFNAME]) + nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + else + ifname[0] = '\0'; + + err = -EINVAL; + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) + dev = __dev_get_by_index(net, ifm->ifi_index); + else if (tb[IFLA_IFNAME]) + dev = __dev_get_by_name(net, ifname); + else + goto errout; + + if (dev == NULL) { + err = -ENODEV; + goto errout; + } + + err = validate_linkmsg(dev, tb); + if (err < 0) + goto errout; + + err = do_setlink(dev, ifm, tb, ifname, 0); +errout: + return err; +} + +static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + const struct rtnl_link_ops *ops; + struct net_device *dev; + struct ifinfomsg *ifm; + char ifname[IFNAMSIZ]; + struct nlattr *tb[IFLA_MAX+1]; + int err; + LIST_HEAD(list_kill); + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + if (err < 0) + return err; + + if (tb[IFLA_IFNAME]) + nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) + dev = __dev_get_by_index(net, ifm->ifi_index); + else if (tb[IFLA_IFNAME]) + dev = __dev_get_by_name(net, ifname); + else + return -EINVAL; + + if (!dev) + return -ENODEV; + + ops = dev->rtnl_link_ops; + if (!ops) + return -EOPNOTSUPP; + + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + list_del(&list_kill); + return 0; +} + +int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) +{ + unsigned int old_flags; + int err; + + old_flags = dev->flags; + if (ifm && (ifm->ifi_flags || ifm->ifi_change)) { + err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); + if (err < 0) + return err; + } + + dev->rtnl_link_state = RTNL_LINK_INITIALIZED; + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + + __dev_notify_flags(dev, old_flags); + return 0; +} +EXPORT_SYMBOL(rtnl_configure_link); + +struct net_device *rtnl_create_link(struct net *src_net, struct net *net, + char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[]) +{ + int err; + struct net_device *dev; + unsigned int num_queues = 1; + unsigned int real_num_queues = 1; + + if (ops->get_tx_queues) { + err = ops->get_tx_queues(src_net, tb, &num_queues, + &real_num_queues); + if (err) + goto err; + } + err = -ENOMEM; + dev = alloc_netdev_mq(ops->priv_size, ifname, ops->setup, num_queues); + if (!dev) + goto err; + + dev_net_set(dev, net); + dev->rtnl_link_ops = ops; + dev->rtnl_link_state = RTNL_LINK_INITIALIZING; + dev->real_num_tx_queues = real_num_queues; + + if (tb[IFLA_MTU]) + dev->mtu = nla_get_u32(tb[IFLA_MTU]); + if (tb[IFLA_ADDRESS]) + memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]), + nla_len(tb[IFLA_ADDRESS])); + if (tb[IFLA_BROADCAST]) + memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]), + nla_len(tb[IFLA_BROADCAST])); + if (tb[IFLA_TXQLEN]) + dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); + if (tb[IFLA_OPERSTATE]) + set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); + if (tb[IFLA_LINKMODE]) + dev->link_mode = 
nla_get_u8(tb[IFLA_LINKMODE]); + if (tb[IFLA_GROUP]) + dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + + return dev; + +err: + return ERR_PTR(err); +} +EXPORT_SYMBOL(rtnl_create_link); + +static int rtnl_group_changelink(struct net *net, int group, + struct ifinfomsg *ifm, + struct nlattr **tb) +{ + struct net_device *dev; + int err; + + for_each_netdev(net, dev) { + if (dev->group == group) { + err = do_setlink(dev, ifm, tb, NULL, 0); + if (err < 0) + return err; + } + } + + return 0; +} + +static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + const struct rtnl_link_ops *ops; + struct net_device *dev; + struct ifinfomsg *ifm; + char kind[MODULE_NAME_LEN]; + char ifname[IFNAMSIZ]; + struct nlattr *tb[IFLA_MAX+1]; + struct nlattr *linkinfo[IFLA_INFO_MAX+1]; + int err; + +#ifdef CONFIG_MODULES +replay: +#endif + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + if (err < 0) + return err; + + if (tb[IFLA_IFNAME]) + nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + else + ifname[0] = '\0'; + + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) + dev = __dev_get_by_index(net, ifm->ifi_index); + else { + if (ifname[0]) + dev = __dev_get_by_name(net, ifname); + else + dev = NULL; + } + + err = validate_linkmsg(dev, tb); + if (err < 0) + return err; + + if (tb[IFLA_LINKINFO]) { + err = nla_parse_nested(linkinfo, IFLA_INFO_MAX, + tb[IFLA_LINKINFO], ifla_info_policy); + if (err < 0) + return err; + } else + memset(linkinfo, 0, sizeof(linkinfo)); + + if (linkinfo[IFLA_INFO_KIND]) { + nla_strlcpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); + ops = rtnl_link_ops_get(kind); + } else { + kind[0] = '\0'; + ops = NULL; + } + + if (1) { + struct nlattr *attr[ops ? ops->maxtype + 1 : 0], **data = NULL; + struct net *dest_net; + + if (ops) { + if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { + err = nla_parse_nested(attr, ops->maxtype, + linkinfo[IFLA_INFO_DATA], + ops->policy); + if (err < 0) + return err; + data = attr; + } + if (ops->validate) { + err = ops->validate(tb, data); + if (err < 0) + return err; + } + } + + if (dev) { + int modified = 0; + + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; + + if (linkinfo[IFLA_INFO_DATA]) { + if (!ops || ops != dev->rtnl_link_ops || + !ops->changelink) + return -EOPNOTSUPP; + + err = ops->changelink(dev, tb, data); + if (err < 0) + return err; + modified = 1; + } + + return do_setlink(dev, ifm, tb, ifname, modified); + } + + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) + return rtnl_group_changelink(net, + nla_get_u32(tb[IFLA_GROUP]), + ifm, tb); + return -ENODEV; + } + + if (ifm->ifi_index) + return -EOPNOTSUPP; + if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO]) + return -EOPNOTSUPP; + + if (!ops) { +#ifdef CONFIG_MODULES + if (kind[0]) { + __rtnl_unlock(); + request_module("rtnl-link-%s", kind); + rtnl_lock(); + ops = rtnl_link_ops_get(kind); + if (ops) + goto replay; + } +#endif + return -EOPNOTSUPP; + } + + if (!ifname[0]) + snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); + + dest_net = rtnl_link_get_net(net, tb); + if (IS_ERR(dest_net)) + return PTR_ERR(dest_net); + + dev = rtnl_create_link(net, dest_net, ifname, ops, tb); + + if (IS_ERR(dev)) + err = PTR_ERR(dev); + else if (ops->newlink) + err = ops->newlink(net, dev, tb, data); + else + err = register_netdevice(dev); + + if (err < 0 && !IS_ERR(dev)) + free_netdev(dev); + if (err < 0) + goto out; + + 
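The setlink path above is easiest to see from the far end of the socket. Below is a minimal userspace sketch, not part of this patch, that sends RTM_SETLINK with a single IFLA_MTU attribute over NETLINK_ROUTE; the interface name "eth0" and the MTU value 1400 are placeholders. The kernel validates the attribute against ifla_policy and applies it in do_setlink().

/* Hedged userspace sketch: change an interface MTU via RTM_SETLINK.
 * "eth0" and the MTU value are placeholders; error handling is minimal. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <net/if.h>

int main(void)
{
	struct {
		struct nlmsghdr  nlh;
		struct ifinfomsg ifi;
		char             attrbuf[64];
	} req;
	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
	struct rtattr *rta;
	unsigned int mtu = 1400;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0)
		return 1;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(struct ifinfomsg));
	req.nlh.nlmsg_type  = RTM_SETLINK;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	req.ifi.ifi_family  = AF_UNSPEC;
	req.ifi.ifi_index   = if_nametoindex("eth0");	/* placeholder name */

	/* Append IFLA_MTU as the u32 attribute that ifla_policy expects. */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = IFLA_MTU;
	rta->rta_len  = RTA_LENGTH(sizeof(mtu));
	memcpy(RTA_DATA(rta), &mtu, sizeof(mtu));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_LENGTH(sizeof(mtu));

	if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
		   (struct sockaddr *)&kernel, sizeof(kernel)) < 0)
		perror("sendto");
	close(fd);
	return 0;
}

A real client would also read back the NLMSG_ERROR acknowledgement requested by NLM_F_ACK.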
err = rtnl_configure_link(dev, ifm); + if (err < 0) + unregister_netdevice(dev); +out: + put_net(dest_net); + return err; + } +} + +static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct ifinfomsg *ifm; + char ifname[IFNAMSIZ]; + struct nlattr *tb[IFLA_MAX+1]; + struct net_device *dev = NULL; + struct sk_buff *nskb; + int err; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + if (err < 0) + return err; + + if (tb[IFLA_IFNAME]) + nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) + dev = __dev_get_by_index(net, ifm->ifi_index); + else if (tb[IFLA_IFNAME]) + dev = __dev_get_by_name(net, ifname); + else + return -EINVAL; + + if (dev == NULL) + return -ENODEV; + + nskb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL); + if (nskb == NULL) + return -ENOBUFS; + + err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid, + nlh->nlmsg_seq, 0, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in if_nlmsg_size */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(nskb); + } else + err = rtnl_unicast(nskb, net, NETLINK_CB(skb).pid); + + return err; +} + +static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx; + int s_idx = cb->family; + + if (s_idx == 0) + s_idx = 1; + for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { + int type = cb->nlh->nlmsg_type-RTM_BASE; + if (idx < s_idx || idx == PF_PACKET) + continue; + if (rtnl_msg_handlers[idx] == NULL || + rtnl_msg_handlers[idx][type].dumpit == NULL) + continue; + if (idx > s_idx) + memset(&cb->args[0], 0, sizeof(cb->args)); + if (rtnl_msg_handlers[idx][type].dumpit(skb, cb)) + break; + } + cb->family = idx; + + return skb->len; +} + +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) +{ + struct net *net = dev_net(dev); + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL); + if (skb == NULL) + goto errout; + + err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in if_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_LINK, err); +} + +/* Protected by RTNL sempahore. */ +static struct rtattr **rta_buf; +static int rtattr_max; + +/* Process one rtnetlink message. 
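rtnl_getlink() and rtnl_dump_ifinfo() serve requests like the following hedged userspace sketch, which is not part of this patch: it asks for a dump of all links with NLM_F_DUMP and counts the RTM_NEWLINK replies. The 16 KB receive buffer and the minimal parsing are simplifications.

/* Hedged userspace sketch: request RTM_GETLINK + NLM_F_DUMP and count replies. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct rtgenmsg g;
	} req;
	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
	char buf[16384];
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	int links = 0, done = 0;

	if (fd < 0)
		return 1;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(struct rtgenmsg));
	req.nlh.nlmsg_type  = RTM_GETLINK;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	req.g.rtgen_family  = AF_UNSPEC;

	if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
		   (struct sockaddr *)&kernel, sizeof(kernel)) < 0)
		return 1;

	while (!done) {
		ssize_t n = recv(fd, buf, sizeof(buf), 0);
		struct nlmsghdr *h;

		if (n <= 0)
			break;
		for (h = (struct nlmsghdr *)buf; NLMSG_OK(h, n);
		     h = NLMSG_NEXT(h, n)) {
			if (h->nlmsg_type == NLMSG_DONE)
				done = 1;
			else if (h->nlmsg_type == RTM_NEWLINK)
				links++;
		}
	}
	printf("%d links\n", links);
	close(fd);
	return 0;
}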
*/ + +static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct net *net = sock_net(skb->sk); + rtnl_doit_func doit; + int sz_idx, kind; + int min_len; + int family; + int type; + int err; + + type = nlh->nlmsg_type; + if (type > RTM_MAX) + return -EOPNOTSUPP; + + type -= RTM_BASE; + + /* All the messages must have at least 1 byte length */ + if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg))) + return 0; + + family = ((struct rtgenmsg *)NLMSG_DATA(nlh))->rtgen_family; + sz_idx = type>>2; + kind = type&3; + + if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN)) + return -EPERM; + + if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { + struct sock *rtnl; + rtnl_dumpit_func dumpit; + + dumpit = rtnl_get_dumpit(family, type); + if (dumpit == NULL) + return -EOPNOTSUPP; + + __rtnl_unlock(); + rtnl = net->rtnl; + err = netlink_dump_start(rtnl, skb, nlh, dumpit, NULL); + rtnl_lock(); + return err; + } + + memset(rta_buf, 0, (rtattr_max * sizeof(struct rtattr *))); + + min_len = rtm_min[sz_idx]; + if (nlh->nlmsg_len < min_len) + return -EINVAL; + + if (nlh->nlmsg_len > min_len) { + int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); + struct rtattr *attr = (void *)nlh + NLMSG_ALIGN(min_len); + + while (RTA_OK(attr, attrlen)) { + unsigned flavor = attr->rta_type; + if (flavor) { + if (flavor > rta_max[sz_idx]) + return -EINVAL; + rta_buf[flavor-1] = attr; + } + attr = RTA_NEXT(attr, attrlen); + } + } + + doit = rtnl_get_doit(family, type); + if (doit == NULL) + return -EOPNOTSUPP; + + return doit(skb, nlh, (void *)&rta_buf[0]); +} + +static void rtnetlink_rcv(struct sk_buff *skb) +{ + rtnl_lock(); + netlink_rcv_skb(skb, &rtnetlink_rcv_msg); + rtnl_unlock(); +} + +static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + switch (event) { + case NETDEV_UP: + case NETDEV_DOWN: + case NETDEV_PRE_UP: + case NETDEV_POST_INIT: + case NETDEV_REGISTER: + case NETDEV_CHANGE: + case NETDEV_PRE_TYPE_CHANGE: + case NETDEV_GOING_DOWN: + case NETDEV_UNREGISTER: + case NETDEV_UNREGISTER_BATCH: + case NETDEV_RELEASE: + case NETDEV_JOIN: + break; + default: + rtmsg_ifinfo(RTM_NEWLINK, dev, 0); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block rtnetlink_dev_notifier = { + .notifier_call = rtnetlink_event, +}; + + +static int __net_init rtnetlink_net_init(struct net *net) +{ + struct sock *sk; + sk = netlink_kernel_create(net, NETLINK_ROUTE, RTNLGRP_MAX, + rtnetlink_rcv, &rtnl_mutex, THIS_MODULE); + if (!sk) + return -ENOMEM; + net->rtnl = sk; + return 0; +} + +static void __net_exit rtnetlink_net_exit(struct net *net) +{ + netlink_kernel_release(net->rtnl); + net->rtnl = NULL; +} + +static struct pernet_operations rtnetlink_net_ops = { + .init = rtnetlink_net_init, + .exit = rtnetlink_net_exit, +}; + +void __init rtnetlink_init(void) +{ + int i; + + rtattr_max = 0; + for (i = 0; i < ARRAY_SIZE(rta_max); i++) + if (rta_max[i] > rtattr_max) + rtattr_max = rta_max[i]; + rta_buf = kmalloc(rtattr_max * sizeof(struct rtattr *), GFP_KERNEL); + if (!rta_buf) + panic("rtnetlink_init: cannot allocate rta_buf\n"); + + if (register_pernet_subsys(&rtnetlink_net_ops)) + panic("rtnetlink_init: cannot initialize rtnetlink\n"); + + netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV); + register_netdevice_notifier(&rtnetlink_dev_notifier); + + rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, rtnl_dump_ifinfo); + rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL); + rtnl_register(PF_UNSPEC, 
RTM_NEWLINK, rtnl_newlink, NULL); + rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL); + + rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all); + rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all); +} + diff --git a/net/core/scm.c b/net/core/scm.c new file mode 100644 index 00000000..811b53fb --- /dev/null +++ b/net/core/scm.c @@ -0,0 +1,344 @@ +/* scm.c - Socket level control messages processing. + * + * Author: Alexey Kuznetsov, + * Alignment and value checking mods by Craig Metz + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + + +/* + * Only allow a user to send credentials, that they could set with + * setu(g)id. + */ + +static __inline__ int scm_check_creds(struct ucred *creds) +{ + const struct cred *cred = current_cred(); + + if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) && + ((creds->uid == cred->uid || creds->uid == cred->euid || + creds->uid == cred->suid) || capable(CAP_SETUID)) && + ((creds->gid == cred->gid || creds->gid == cred->egid || + creds->gid == cred->sgid) || capable(CAP_SETGID))) { + return 0; + } + return -EPERM; +} + +static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) +{ + int *fdp = (int*)CMSG_DATA(cmsg); + struct scm_fp_list *fpl = *fplp; + struct file **fpp; + int i, num; + + num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int); + + if (num <= 0) + return 0; + + if (num > SCM_MAX_FD) + return -EINVAL; + + if (!fpl) + { + fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); + if (!fpl) + return -ENOMEM; + *fplp = fpl; + fpl->count = 0; + fpl->max = SCM_MAX_FD; + } + fpp = &fpl->fp[fpl->count]; + + if (fpl->count + num > fpl->max) + return -EINVAL; + + /* + * Verify the descriptors and increment the usage count. + */ + + for (i=0; i< num; i++) + { + int fd = fdp[i]; + struct file *file; + + if (fd < 0 || !(file = fget_raw(fd))) + return -EBADF; + *fpp++ = file; + fpl->count++; + } + return num; +} + +void __scm_destroy(struct scm_cookie *scm) +{ + struct scm_fp_list *fpl = scm->fp; + int i; + + if (fpl) { + scm->fp = NULL; + if (current->scm_work_list) { + list_add_tail(&fpl->list, current->scm_work_list); + } else { + LIST_HEAD(work_list); + + current->scm_work_list = &work_list; + + list_add(&fpl->list, &work_list); + while (!list_empty(&work_list)) { + fpl = list_first_entry(&work_list, struct scm_fp_list, list); + + list_del(&fpl->list); + for (i=fpl->count-1; i>=0; i--) + fput(fpl->fp[i]); + kfree(fpl); + } + + current->scm_work_list = NULL; + } + } +} +EXPORT_SYMBOL(__scm_destroy); + +int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) +{ + struct cmsghdr *cmsg; + int err; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) + { + err = -EINVAL; + + /* Verify that cmsg_len is at least sizeof(struct cmsghdr) */ + /* The first check was omitted in <= 2.2.5. The reasoning was + that parser checks cmsg_len in any case, so that + additional check would be work duplication. 
+ But if cmsg_level is not SOL_SOCKET, we do not check + for too short ancillary data object at all! Oops. + OK, let's add it... + */ + if (!CMSG_OK(msg, cmsg)) + goto error; + + if (cmsg->cmsg_level != SOL_SOCKET) + continue; + + switch (cmsg->cmsg_type) + { + case SCM_RIGHTS: + if (!sock->ops || sock->ops->family != PF_UNIX) + goto error; + err=scm_fp_copy(cmsg, &p->fp); + if (err<0) + goto error; + break; + case SCM_CREDENTIALS: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred))) + goto error; + memcpy(&p->creds, CMSG_DATA(cmsg), sizeof(struct ucred)); + err = scm_check_creds(&p->creds); + if (err) + goto error; + + if (pid_vnr(p->pid) != p->creds.pid) { + struct pid *pid; + err = -ESRCH; + pid = find_get_pid(p->creds.pid); + if (!pid) + goto error; + put_pid(p->pid); + p->pid = pid; + } + + if ((p->cred->euid != p->creds.uid) || + (p->cred->egid != p->creds.gid)) { + struct cred *cred; + err = -ENOMEM; + cred = prepare_creds(); + if (!cred) + goto error; + + cred->uid = cred->euid = p->creds.uid; + cred->gid = cred->egid = p->creds.gid; + put_cred(p->cred); + p->cred = cred; + } + break; + default: + goto error; + } + } + + if (p->fp && !p->fp->count) + { + kfree(p->fp); + p->fp = NULL; + } + return 0; + +error: + scm_destroy(p); + return err; +} +EXPORT_SYMBOL(__scm_send); + +int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) +{ + struct cmsghdr __user *cm + = (__force struct cmsghdr __user *)msg->msg_control; + struct cmsghdr cmhdr; + int cmlen = CMSG_LEN(len); + int err; + + if (MSG_CMSG_COMPAT & msg->msg_flags) + return put_cmsg_compat(msg, level, type, len, data); + + if (cm==NULL || msg->msg_controllen < sizeof(*cm)) { + msg->msg_flags |= MSG_CTRUNC; + return 0; /* XXX: return error? check spec. */ + } + if (msg->msg_controllen < cmlen) { + msg->msg_flags |= MSG_CTRUNC; + cmlen = msg->msg_controllen; + } + cmhdr.cmsg_level = level; + cmhdr.cmsg_type = type; + cmhdr.cmsg_len = cmlen; + + err = -EFAULT; + if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) + goto out; + if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr))) + goto out; + cmlen = CMSG_SPACE(len); + if (msg->msg_controllen < cmlen) + cmlen = msg->msg_controllen; + msg->msg_control += cmlen; + msg->msg_controllen -= cmlen; + err = 0; +out: + return err; +} +EXPORT_SYMBOL(put_cmsg); + +void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) +{ + struct cmsghdr __user *cm + = (__force struct cmsghdr __user*)msg->msg_control; + + int fdmax = 0; + int fdnum = scm->fp->count; + struct file **fp = scm->fp->fp; + int __user *cmfptr; + int err = 0, i; + + if (MSG_CMSG_COMPAT & msg->msg_flags) { + scm_detach_fds_compat(msg, scm); + return; + } + + if (msg->msg_controllen > sizeof(struct cmsghdr)) + fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr)) + / sizeof(int)); + + if (fdnum < fdmax) + fdmax = fdnum; + + for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); imsg_flags + ? O_CLOEXEC : 0); + if (err < 0) + break; + new_fd = err; + err = put_user(new_fd, cmfptr); + if (err) { + put_unused_fd(new_fd); + break; + } + /* Bump the usage count and install the file. 
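The SCM_RIGHTS handling above (scm_fp_copy() on send, scm_detach_fds() on receive) is driven by ancillary data like the following hedged userspace sketch; send_fd() is a made-up helper that passes one descriptor over an already connected AF_UNIX socket.

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Hypothetical helper: pass fd_to_pass across a connected AF_UNIX socket. */
int send_fd(int sock, int fd_to_pass)
{
	char dummy = 'x';
	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
	union {
		struct cmsghdr align;
		char buf[CMSG_SPACE(sizeof(int))];
	} u;
	struct msghdr msg;
	struct cmsghdr *cmsg;

	memset(&msg, 0, sizeof(msg));
	memset(&u, 0, sizeof(u));
	msg.msg_iov        = &iov;
	msg.msg_iovlen     = 1;
	msg.msg_control    = u.buf;
	msg.msg_controllen = sizeof(u.buf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type  = SCM_RIGHTS;		/* handled by scm_fp_copy() */
	cmsg->cmsg_len   = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));

	return sendmsg(sock, &msg, 0) < 0 ? -1 : 0;
}

The peer's recvmsg() then sees a SOL_SOCKET/SCM_RIGHTS control message whose payload is the new descriptor number installed by scm_detach_fds().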
*/ + get_file(fp[i]); + fd_install(new_fd, fp[i]); + } + + if (i > 0) + { + int cmlen = CMSG_LEN(i*sizeof(int)); + err = put_user(SOL_SOCKET, &cm->cmsg_level); + if (!err) + err = put_user(SCM_RIGHTS, &cm->cmsg_type); + if (!err) + err = put_user(cmlen, &cm->cmsg_len); + if (!err) { + cmlen = CMSG_SPACE(i*sizeof(int)); + msg->msg_control += cmlen; + msg->msg_controllen -= cmlen; + } + } + if (i < fdnum || (fdnum && fdmax <= 0)) + msg->msg_flags |= MSG_CTRUNC; + + /* + * All of the files that fit in the message have had their + * usage counts incremented, so we just free the list. + */ + __scm_destroy(scm); +} +EXPORT_SYMBOL(scm_detach_fds); + +struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) +{ + struct scm_fp_list *new_fpl; + int i; + + if (!fpl) + return NULL; + + new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]), + GFP_KERNEL); + if (new_fpl) { + for (i = 0; i < fpl->count; i++) + get_file(fpl->fp[i]); + new_fpl->max = new_fpl->count; + } + return new_fpl; +} +EXPORT_SYMBOL(scm_fp_dup); diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c new file mode 100644 index 00000000..45329d7c --- /dev/null +++ b/net/core/secure_seq.c @@ -0,0 +1,184 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static u32 net_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned; + +static int __init net_secret_init(void) +{ + get_random_bytes(net_secret, sizeof(net_secret)); + return 0; +} +late_initcall(net_secret_init); + +static u32 seq_scale(u32 seq) +{ + /* + * As close as possible to RFC 793, which + * suggests using a 250 kHz clock. + * Further reading shows this assumes 2 Mb/s networks. + * For 10 Mb/s Ethernet, a 1 MHz clock is appropriate. + * For 10 Gb/s Ethernet, a 1 GHz clock should be ok, but + * we also need to limit the resolution so that the u32 seq + * overlaps less than one time per MSL (2 minutes). + * Choosing a clock of 64 ns period is OK. 
(period of 274 s) + */ + return seq + (ktime_to_ns(ktime_get_real()) >> 6); +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +__u32 secure_tcpv6_sequence_number(__be32 *saddr, __be32 *daddr, + __be16 sport, __be16 dport) +{ + u32 secret[MD5_MESSAGE_BYTES / 4]; + u32 hash[MD5_DIGEST_WORDS]; + u32 i; + + memcpy(hash, saddr, 16); + for (i = 0; i < 4; i++) + secret[i] = net_secret[i] + daddr[i]; + secret[4] = net_secret[4] + + (((__force u16)sport << 16) + (__force u16)dport); + for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) + secret[i] = net_secret[i]; + + md5_transform(hash, secret); + + return seq_scale(hash[0]); +} +EXPORT_SYMBOL(secure_tcpv6_sequence_number); + +u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, + __be16 dport) +{ + u32 secret[MD5_MESSAGE_BYTES / 4]; + u32 hash[MD5_DIGEST_WORDS]; + u32 i; + + memcpy(hash, saddr, 16); + for (i = 0; i < 4; i++) + secret[i] = net_secret[i] + (__force u32) daddr[i]; + secret[4] = net_secret[4] + (__force u32)dport; + for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) + secret[i] = net_secret[i]; + + md5_transform(hash, secret); + + return hash[0]; +} +#endif + +#ifdef CONFIG_INET +__u32 secure_ip_id(__be32 daddr) +{ + u32 hash[MD5_DIGEST_WORDS]; + + hash[0] = (__force __u32) daddr; + hash[1] = net_secret[13]; + hash[2] = net_secret[14]; + hash[3] = net_secret[15]; + + md5_transform(hash, net_secret); + + return hash[0]; +} + +__u32 secure_ipv6_id(const __be32 daddr[4]) +{ + __u32 hash[4]; + + memcpy(hash, daddr, 16); + md5_transform(hash, net_secret); + + return hash[0]; +} + +__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, + __be16 sport, __be16 dport) +{ + u32 hash[MD5_DIGEST_WORDS]; + + hash[0] = (__force u32)saddr; + hash[1] = (__force u32)daddr; + hash[2] = ((__force u16)sport << 16) + (__force u16)dport; + hash[3] = net_secret[15]; + + md5_transform(hash, net_secret); + + return seq_scale(hash[0]); +} + +u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) +{ + u32 hash[MD5_DIGEST_WORDS]; + + hash[0] = (__force u32)saddr; + hash[1] = (__force u32)daddr; + hash[2] = (__force u32)dport ^ net_secret[14]; + hash[3] = net_secret[15]; + + md5_transform(hash, net_secret); + + return hash[0]; +} +EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); +#endif + +#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) +u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, + __be16 sport, __be16 dport) +{ + u32 hash[MD5_DIGEST_WORDS]; + u64 seq; + + hash[0] = (__force u32)saddr; + hash[1] = (__force u32)daddr; + hash[2] = ((__force u16)sport << 16) + (__force u16)dport; + hash[3] = net_secret[15]; + + md5_transform(hash, net_secret); + + seq = hash[0] | (((u64)hash[1]) << 32); + seq += ktime_to_ns(ktime_get_real()); + seq &= (1ull << 48) - 1; + + return seq; +} +EXPORT_SYMBOL(secure_dccp_sequence_number); + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, + __be16 sport, __be16 dport) +{ + u32 secret[MD5_MESSAGE_BYTES / 4]; + u32 hash[MD5_DIGEST_WORDS]; + u64 seq; + u32 i; + + memcpy(hash, saddr, 16); + for (i = 0; i < 4; i++) + secret[i] = net_secret[i] + daddr[i]; + secret[4] = net_secret[4] + + (((__force u16)sport << 16) + (__force u16)dport); + for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) + secret[i] = net_secret[i]; + + md5_transform(hash, secret); + + seq = hash[0] | (((u64)hash[1]) << 32); + seq += ktime_to_ns(ktime_get_real()); + seq &= (1ull << 48) - 1; + + return seq; +} 
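To make the seq_scale() arithmetic above concrete, here is a hedged userspace sketch (not kernel code): it adds wall-clock nanoseconds shifted right by 6 bits, i.e. one increment per 64 ns, to a 32-bit hash, so the sequence space wraps roughly every 2^32 * 64 ns, about 274 s, as the comment says.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* Userspace illustration of the 64 ns clock used by seq_scale(). */
static uint32_t seq_scale_demo(uint32_t hash)
{
	struct timespec ts;
	uint64_t ns;

	clock_gettime(CLOCK_REALTIME, &ts);
	ns = (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
	return hash + (uint32_t)(ns >> 6);	/* one step per 64 ns */
}

int main(void)
{
	printf("scaled seq: %u\n", seq_scale_demo(0x12345678u));
	return 0;
}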
+EXPORT_SYMBOL(secure_dccpv6_sequence_number); +#endif +#endif diff --git a/net/core/skbuff.c b/net/core/skbuff.c new file mode 100644 index 00000000..4821df84 --- /dev/null +++ b/net/core/skbuff.c @@ -0,0 +1,3084 @@ +/* + * Routines having to do with the 'struct sk_buff' memory handlers. + * + * Authors: Alan Cox + * Florian La Roche + * + * Fixes: + * Alan Cox : Fixed the worst of the load + * balancer bugs. + * Dave Platt : Interrupt stacking fix. + * Richard Kooijman : Timestamp fixes. + * Alan Cox : Changed buffer format. + * Alan Cox : destructor hook for AF_UNIX etc. + * Linus Torvalds : Better skb_clone. + * Alan Cox : Added skb_copy. + * Alan Cox : Added all the changed routines Linus + * only put in the headers + * Ray VanTassle : Fixed --skb->lock in free + * Alan Cox : skb_copy copy arp field + * Andi Kleen : slabified it. + * Robert Olsson : Removed skb_head_pool + * + * NOTE: + * The __skb_ routines should be called with interrupts + * disabled, or you better be *real* sure that the operation is atomic + * with respect to whatever list is being frobbed (e.g. via lock_sock() + * or via disabling bottom half handlers, etc). + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * The functions in this file will not compile correctly with gcc 2.4.x + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_NET_CLS_ACT +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "kmap_skb.h" + +static struct kmem_cache *skbuff_head_cache __read_mostly; +static struct kmem_cache *skbuff_fclone_cache __read_mostly; + +static void sock_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + put_page(buf->page); +} + +static void sock_pipe_buf_get(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + get_page(buf->page); +} + +static int sock_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + + +/* Pipe buffer operations for a socket. */ +static const struct pipe_buf_operations sock_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = sock_pipe_buf_release, + .steal = sock_pipe_buf_steal, + .get = sock_pipe_buf_get, +}; + +/* + * Keep out-of-line to prevent kernel bloat. + * __builtin_return_address is not used because it is not always + * reliable. + */ + +/** + * skb_over_panic - private function + * @skb: buffer + * @sz: size + * @here: address + * + * Out of line support code for skb_put(). Not user callable. + */ +static void skb_over_panic(struct sk_buff *skb, int sz, void *here) +{ + printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p " + "data:%p tail:%#lx end:%#lx dev:%s\n", + here, skb->len, sz, skb->head, skb->data, + (unsigned long)skb->tail, (unsigned long)skb->end, + skb->dev ? skb->dev->name : ""); + BUG(); +} + +/** + * skb_under_panic - private function + * @skb: buffer + * @sz: size + * @here: address + * + * Out of line support code for skb_push(). Not user callable. 
+ */ + +static void skb_under_panic(struct sk_buff *skb, int sz, void *here) +{ + printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p " + "data:%p tail:%#lx end:%#lx dev:%s\n", + here, skb->len, sz, skb->head, skb->data, + (unsigned long)skb->tail, (unsigned long)skb->end, + skb->dev ? skb->dev->name : ""); + BUG(); +} + +/* Allocate a new skbuff. We do this ourselves so we can fill in a few + * 'private' fields and also do memory statistics to find all the + * [BEEP] leaks. + * + */ + +/** + * __alloc_skb - allocate a network buffer + * @size: size to allocate + * @gfp_mask: allocation mask + * @fclone: allocate from fclone cache instead of head cache + * and allocate a cloned (child) skb + * @node: numa node to allocate memory on + * + * Allocate a new &sk_buff. The returned buffer has no headroom and a + * tail room of size bytes. The object has a reference count of one. + * The return is the buffer. On a failure the return is %NULL. + * + * Buffers may only be allocated from interrupts using a @gfp_mask of + * %GFP_ATOMIC. + */ +struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, + int fclone, int node) +{ + struct kmem_cache *cache; + struct skb_shared_info *shinfo; + struct sk_buff *skb; + u8 *data; + + cache = fclone ? skbuff_fclone_cache : skbuff_head_cache; + + /* Get the HEAD */ + skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); + if (!skb) + goto out; + prefetchw(skb); + + size = SKB_DATA_ALIGN(size); + data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info), + gfp_mask, node); + if (!data) + goto nodata; + prefetchw(data + size); + + /* + * Only clear those fields we need to clear, not those that we will + * actually initialise below. Hence, don't put any more fields after + * the tail pointer in struct sk_buff! + */ + memset(skb, 0, offsetof(struct sk_buff, tail)); + skb->truesize = size + sizeof(struct sk_buff); + atomic_set(&skb->users, 1); + skb->head = data; + skb->data = data; + skb_reset_tail_pointer(skb); + skb->end = skb->tail + size; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->mac_header = ~0U; +#endif + + /* make sure we initialize shinfo sequentially */ + shinfo = skb_shinfo(skb); + memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); + atomic_set(&shinfo->dataref, 1); + kmemcheck_annotate_variable(shinfo->destructor_arg); + + if (fclone) { + struct sk_buff *child = skb + 1; + atomic_t *fclone_ref = (atomic_t *) (child + 1); + + kmemcheck_annotate_bitfield(child, flags1); + kmemcheck_annotate_bitfield(child, flags2); + skb->fclone = SKB_FCLONE_ORIG; + atomic_set(fclone_ref, 1); + + child->fclone = SKB_FCLONE_UNAVAILABLE; + } +out: + return skb; +nodata: + kmem_cache_free(cache, skb); + skb = NULL; + goto out; +} +EXPORT_SYMBOL(__alloc_skb); + +/** + * __netdev_alloc_skb - allocate an skbuff for rx on a specific device + * @dev: network device to receive on + * @length: length to allocate + * @gfp_mask: get_free_pages mask, passed to alloc_skb + * + * Allocate a new &sk_buff and assign it a usage count of one. The + * buffer has unspecified headroom built in. Users should allocate + * the headroom they think they need without accounting for the + * built in space. The built in space is used for optimisations. + * + * %NULL is returned if there is no free memory. 
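The usual consumer pattern for __alloc_skb() above, sketched as a trivial out-of-tree module (illustrative only, not part of this patch): allocate with room for headers, open the headroom with skb_reserve(), append payload with skb_put(), and drop the reference with kfree_skb(). The 64/128-byte sizes are arbitrary.

#include <linux/module.h>
#include <linux/string.h>
#include <linux/skbuff.h>

static int __init skb_demo_init(void)
{
	struct sk_buff *skb;

	skb = alloc_skb(64 + 128, GFP_KERNEL);	/* 64B headroom + 128B data */
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, 64);			/* reserve protocol headroom */
	memset(skb_put(skb, 128), 0, 128);	/* append 128 zero bytes */

	pr_info("skb_demo: headroom=%u tailroom=%d len=%u\n",
		skb_headroom(skb), skb_tailroom(skb), skb->len);

	kfree_skb(skb);				/* release our only reference */
	return 0;
}

static void __exit skb_demo_exit(void)
{
}

module_init(skb_demo_init);
module_exit(skb_demo_exit);
MODULE_LICENSE("GPL");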
+ */ +struct sk_buff *__netdev_alloc_skb(struct net_device *dev, + unsigned int length, gfp_t gfp_mask) +{ + struct sk_buff *skb; + + skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE); + if (likely(skb)) { + skb_reserve(skb, NET_SKB_PAD); + skb->dev = dev; + } + return skb; +} +EXPORT_SYMBOL(__netdev_alloc_skb); + +void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, + int size) +{ + skb_fill_page_desc(skb, i, page, off, size); + skb->len += size; + skb->data_len += size; + skb->truesize += size; +} +EXPORT_SYMBOL(skb_add_rx_frag); + +/** + * dev_alloc_skb - allocate an skbuff for receiving + * @length: length to allocate + * + * Allocate a new &sk_buff and assign it a usage count of one. The + * buffer has unspecified headroom built in. Users should allocate + * the headroom they think they need without accounting for the + * built in space. The built in space is used for optimisations. + * + * %NULL is returned if there is no free memory. Although this function + * allocates memory it can be called from an interrupt. + */ +struct sk_buff *dev_alloc_skb(unsigned int length) +{ + /* + * There is more code here than it seems: + * __dev_alloc_skb is an inline + */ + return __dev_alloc_skb(length, GFP_ATOMIC); +} +EXPORT_SYMBOL(dev_alloc_skb); + +static void skb_drop_list(struct sk_buff **listp) +{ + struct sk_buff *list = *listp; + + *listp = NULL; + + do { + struct sk_buff *this = list; + list = list->next; + kfree_skb(this); + } while (list); +} + +static inline void skb_drop_fraglist(struct sk_buff *skb) +{ + skb_drop_list(&skb_shinfo(skb)->frag_list); +} + +static void skb_clone_fraglist(struct sk_buff *skb) +{ + struct sk_buff *list; + + skb_walk_frags(skb, list) + skb_get(list); +} + +static void skb_release_data(struct sk_buff *skb) +{ + if (!skb->cloned || + !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, + &skb_shinfo(skb)->dataref)) { + if (skb_shinfo(skb)->nr_frags) { + int i; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + put_page(skb_shinfo(skb)->frags[i].page); + } + + if (skb_has_frag_list(skb)) + skb_drop_fraglist(skb); + + kfree(skb->head); + } +} + +/* + * Free an skbuff by memory without cleaning the state. + */ +static void kfree_skbmem(struct sk_buff *skb) +{ + struct sk_buff *other; + atomic_t *fclone_ref; + + switch (skb->fclone) { + case SKB_FCLONE_UNAVAILABLE: + kmem_cache_free(skbuff_head_cache, skb); + break; + + case SKB_FCLONE_ORIG: + fclone_ref = (atomic_t *) (skb + 2); + if (atomic_dec_and_test(fclone_ref)) + kmem_cache_free(skbuff_fclone_cache, skb); + break; + + case SKB_FCLONE_CLONE: + fclone_ref = (atomic_t *) (skb + 1); + other = skb - 1; + + /* The clone portion is available for + * fast-cloning again. + */ + skb->fclone = SKB_FCLONE_UNAVAILABLE; + + if (atomic_dec_and_test(fclone_ref)) + kmem_cache_free(skbuff_fclone_cache, other); + break; + } +} + +static void skb_release_head_state(struct sk_buff *skb) +{ + skb_dst_drop(skb); +#ifdef CONFIG_XFRM + secpath_put(skb->sp); +#endif + if (skb->destructor) { + WARN_ON(in_irq()); + skb->destructor(skb); + } +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + nf_conntrack_put(skb->nfct); +#endif +#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED + nf_conntrack_put_reasm(skb->nfct_reasm); +#endif +#ifdef CONFIG_BRIDGE_NETFILTER + nf_bridge_put(skb->nf_bridge); +#endif +/* XXX: IS this still necessary? 
- JHS */ +#ifdef CONFIG_NET_SCHED + skb->tc_index = 0; +#ifdef CONFIG_NET_CLS_ACT + skb->tc_verd = 0; +#endif +#endif +} + +/* Free everything but the sk_buff shell. */ +static void skb_release_all(struct sk_buff *skb) +{ + skb_release_head_state(skb); + skb_release_data(skb); +} + +/** + * __kfree_skb - private function + * @skb: buffer + * + * Free an sk_buff. Release anything attached to the buffer. + * Clean the state. This is an internal helper function. Users should + * always call kfree_skb + */ + +void __kfree_skb(struct sk_buff *skb) +{ + skb_release_all(skb); + kfree_skbmem(skb); +} +EXPORT_SYMBOL(__kfree_skb); + +/** + * kfree_skb - free an sk_buff + * @skb: buffer to free + * + * Drop a reference to the buffer and free it if the usage count has + * hit zero. + */ +void kfree_skb(struct sk_buff *skb) +{ + if (unlikely(!skb)) + return; + if (likely(atomic_read(&skb->users) == 1)) + smp_rmb(); + else if (likely(!atomic_dec_and_test(&skb->users))) + return; + trace_kfree_skb(skb, __builtin_return_address(0)); + __kfree_skb(skb); +} +EXPORT_SYMBOL(kfree_skb); + +/** + * consume_skb - free an skbuff + * @skb: buffer to free + * + * Drop a ref to the buffer and free it if the usage count has hit zero + * Functions identically to kfree_skb, but kfree_skb assumes that the frame + * is being dropped after a failure and notes that + */ +void consume_skb(struct sk_buff *skb) +{ + if (unlikely(!skb)) + return; + if (likely(atomic_read(&skb->users) == 1)) + smp_rmb(); + else if (likely(!atomic_dec_and_test(&skb->users))) + return; + trace_consume_skb(skb); + __kfree_skb(skb); +} +EXPORT_SYMBOL(consume_skb); + +/** + * skb_recycle_check - check if skb can be reused for receive + * @skb: buffer + * @skb_size: minimum receive buffer size + * + * Checks that the skb passed in is not shared or cloned, and + * that it is linear and its head portion at least as large as + * skb_size so that it can be recycled as a receive buffer. + * If these conditions are met, this function does any necessary + * reference count dropping and cleans up the skbuff as if it + * just came from __alloc_skb(). 
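The kfree_skb()/consume_skb() pair above differs only in how the drop tracing treats the packet; a hedged fragment with generic driver-style names, not taken from this patch, shows the usual choice.

#include <linux/skbuff.h>

/* Illustrative TX-completion helper: treat successful transmission as
 * consumption and failures as drops, so tracing stays meaningful. */
static void my_tx_complete(struct sk_buff *skb, bool sent_ok)
{
	if (sent_ok)
		consume_skb(skb);	/* normal end of life, no drop trace */
	else
		kfree_skb(skb);		/* shows up as a drop in tracing */
}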
+ */ +bool skb_recycle_check(struct sk_buff *skb, int skb_size) +{ + struct skb_shared_info *shinfo; + + if (irqs_disabled()) + return false; + + if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE) + return false; + + skb_size = SKB_DATA_ALIGN(skb_size + NET_SKB_PAD); + if (skb_end_pointer(skb) - skb->head < skb_size) + return false; + + if (skb_shared(skb) || skb_cloned(skb)) + return false; + + skb_release_head_state(skb); + + shinfo = skb_shinfo(skb); + memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); + atomic_set(&shinfo->dataref, 1); + + memset(skb, 0, offsetof(struct sk_buff, tail)); + skb->data = skb->head + NET_SKB_PAD; + skb_reset_tail_pointer(skb); + + return true; +} +EXPORT_SYMBOL(skb_recycle_check); + +static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) +{ + new->tstamp = old->tstamp; + new->dev = old->dev; + new->transport_header = old->transport_header; + new->network_header = old->network_header; + new->mac_header = old->mac_header; + skb_dst_copy(new, old); + new->rxhash = old->rxhash; +#ifdef CONFIG_XFRM + new->sp = secpath_get(old->sp); +#endif + memcpy(new->cb, old->cb, sizeof(old->cb)); + new->csum = old->csum; + new->local_df = old->local_df; + new->pkt_type = old->pkt_type; + new->ip_summed = old->ip_summed; + skb_copy_queue_mapping(new, old); + new->priority = old->priority; +#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) + new->ipvs_property = old->ipvs_property; +#endif + new->protocol = old->protocol; + new->mark = old->mark; + new->skb_iif = old->skb_iif; + __nf_copy(new, old); +#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ + defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) + new->nf_trace = old->nf_trace; +#endif +#ifdef CONFIG_NET_SCHED + new->tc_index = old->tc_index; +#ifdef CONFIG_NET_CLS_ACT + new->tc_verd = old->tc_verd; +#endif +#endif + new->vlan_tci = old->vlan_tci; + + skb_copy_secmark(new, old); +} + +/* + * You should not add any new code to this function. Add it to + * __copy_skb_header above instead. + */ +static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) +{ +#define C(x) n->x = skb->x + + n->next = n->prev = NULL; + n->sk = NULL; + __copy_skb_header(n, skb); + + C(len); + C(data_len); + C(mac_len); + n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; + n->cloned = 1; + n->nohdr = 0; + n->destructor = NULL; + C(tail); + C(end); + C(head); + C(data); + C(truesize); + atomic_set(&n->users, 1); + + atomic_inc(&(skb_shinfo(skb)->dataref)); + skb->cloned = 1; + + return n; +#undef C +} + +/** + * skb_morph - morph one skb into another + * @dst: the skb to receive the contents + * @src: the skb to supply the contents + * + * This is identical to skb_clone except that the target skb is + * supplied by the user. + * + * The target skb is returned upon exit. + */ +struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) +{ + skb_release_all(dst); + return __skb_clone(dst, src); +} +EXPORT_SYMBOL_GPL(skb_morph); + +/** + * skb_clone - duplicate an sk_buff + * @skb: buffer to clone + * @gfp_mask: allocation priority + * + * Duplicate an &sk_buff. The new one is not owned by a socket. Both + * copies share the same packet data but not structure. The new + * buffer has a reference count of 1. If the allocation fails the + * function returns %NULL otherwise the new buffer is returned. + * + * If this function is called from an interrupt gfp_mask() must be + * %GFP_ATOMIC. 
+ */ + +struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) +{ + struct sk_buff *n; + + n = skb + 1; + if (skb->fclone == SKB_FCLONE_ORIG && + n->fclone == SKB_FCLONE_UNAVAILABLE) { + atomic_t *fclone_ref = (atomic_t *) (n + 1); + n->fclone = SKB_FCLONE_CLONE; + atomic_inc(fclone_ref); + } else { + n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); + if (!n) + return NULL; + + kmemcheck_annotate_bitfield(n, flags1); + kmemcheck_annotate_bitfield(n, flags2); + n->fclone = SKB_FCLONE_UNAVAILABLE; + } + + return __skb_clone(n, skb); +} +EXPORT_SYMBOL(skb_clone); + +static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) +{ +#ifndef NET_SKBUFF_DATA_USES_OFFSET + /* + * Shift between the two data areas in bytes + */ + unsigned long offset = new->data - old->data; +#endif + + __copy_skb_header(new, old); + +#ifndef NET_SKBUFF_DATA_USES_OFFSET + /* {transport,network,mac}_header are relative to skb->head */ + new->transport_header += offset; + new->network_header += offset; + if (skb_mac_header_was_set(new)) + new->mac_header += offset; +#endif + skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; + skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; + skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; +} + +/** + * skb_copy - create private copy of an sk_buff + * @skb: buffer to copy + * @gfp_mask: allocation priority + * + * Make a copy of both an &sk_buff and its data. This is used when the + * caller wishes to modify the data and needs a private copy of the + * data to alter. Returns %NULL on failure or the pointer to the buffer + * on success. The returned buffer has a reference count of 1. + * + * As by-product this function converts non-linear &sk_buff to linear + * one, so that &sk_buff becomes completely private and caller is allowed + * to modify all the data of returned buffer. This means that this + * function is not recommended for use in circumstances when only + * header is going to be modified. Use pskb_copy() instead. + */ + +struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) +{ + int headerlen = skb_headroom(skb); + unsigned int size = (skb_end_pointer(skb) - skb->head) + skb->data_len; + struct sk_buff *n = alloc_skb(size, gfp_mask); + + if (!n) + return NULL; + + /* Set the data pointer */ + skb_reserve(n, headerlen); + /* Set the tail pointer and length */ + skb_put(n, skb->len); + + if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)) + BUG(); + + copy_skb_header(n, skb); + return n; +} +EXPORT_SYMBOL(skb_copy); + +/** + * pskb_copy - create copy of an sk_buff with private head. + * @skb: buffer to copy + * @gfp_mask: allocation priority + * + * Make a copy of both an &sk_buff and part of its data, located + * in header. Fragmented data remain shared. This is used when + * the caller wishes to modify only header of &sk_buff and needs + * private copy of the header to alter. Returns %NULL on failure + * or the pointer to the buffer on success. + * The returned buffer has a reference count of 1. 
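A hedged fragment contrasting skb_clone() and skb_copy() above; my_consume() is a made-up consumer that takes ownership of the duplicate and eventually frees it.

#include <linux/gfp.h>
#include <linux/skbuff.h>

extern void my_consume(struct sk_buff *skb);	/* hypothetical consumer */

/* Duplicate a packet: a clone shares the payload and is cheap, while
 * skb_copy() is only needed when the duplicate will be written to. */
static void duplicate_packet(struct sk_buff *skb, bool will_modify)
{
	struct sk_buff *dup;

	dup = will_modify ? skb_copy(skb, GFP_ATOMIC)	/* private data */
			  : skb_clone(skb, GFP_ATOMIC);	/* shared data */
	if (dup)
		my_consume(dup);
}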
+ */ + +struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) +{ + unsigned int size = skb_end_pointer(skb) - skb->head; + struct sk_buff *n = alloc_skb(size, gfp_mask); + + if (!n) + goto out; + + /* Set the data pointer */ + skb_reserve(n, skb_headroom(skb)); + /* Set the tail pointer and length */ + skb_put(n, skb_headlen(skb)); + /* Copy the bytes */ + skb_copy_from_linear_data(skb, n->data, n->len); + + n->truesize += skb->data_len; + n->data_len = skb->data_len; + n->len = skb->len; + + if (skb_shinfo(skb)->nr_frags) { + int i; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; + get_page(skb_shinfo(n)->frags[i].page); + } + skb_shinfo(n)->nr_frags = i; + } + + if (skb_has_frag_list(skb)) { + skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; + skb_clone_fraglist(n); + } + + copy_skb_header(n, skb); +out: + return n; +} +EXPORT_SYMBOL(pskb_copy); + +/** + * pskb_expand_head - reallocate header of &sk_buff + * @skb: buffer to reallocate + * @nhead: room to add at head + * @ntail: room to add at tail + * @gfp_mask: allocation priority + * + * Expands (or creates identical copy, if &nhead and &ntail are zero) + * header of skb. &sk_buff itself is not changed. &sk_buff MUST have + * reference count of 1. Returns zero in the case of success or error, + * if expansion failed. In the last case, &sk_buff is not changed. + * + * All the pointers pointing into skb header may change and must be + * reloaded after call to this function. + */ + +int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, + gfp_t gfp_mask) +{ + int i; + u8 *data; + int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail; + long off; + bool fastpath; + + BUG_ON(nhead < 0); + + if (skb_shared(skb)) + BUG(); + + size = SKB_DATA_ALIGN(size); + + /* Check if we can avoid taking references on fragments if we own + * the last reference on skb->head. (see skb_release_data()) + */ + if (!skb->cloned) + fastpath = true; + else { + int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1; + + fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta; + } + + if (fastpath && + size + sizeof(struct skb_shared_info) <= ksize(skb->head)) { + memmove(skb->head + size, skb_shinfo(skb), + offsetof(struct skb_shared_info, + frags[skb_shinfo(skb)->nr_frags])); + memmove(skb->head + nhead, skb->head, + skb_tail_pointer(skb) - skb->head); + off = nhead; + goto adjust_others; + } + + data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); + if (!data) + goto nodata; + + /* Copy only real data... and, alas, header. This should be + * optimized for the cases when header is void. 
+ */ + memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); + + memcpy((struct skb_shared_info *)(data + size), + skb_shinfo(skb), + offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); + + if (fastpath) { + kfree(skb->head); + } else { + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + get_page(skb_shinfo(skb)->frags[i].page); + + if (skb_has_frag_list(skb)) + skb_clone_fraglist(skb); + + skb_release_data(skb); + } + off = (data + nhead) - skb->head; + + skb->head = data; +adjust_others: + skb->data += off; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->end = size; + off = nhead; +#else + skb->end = skb->head + size; +#endif + /* {transport,network,mac}_header and tail are relative to skb->head */ + skb->tail += off; + skb->transport_header += off; + skb->network_header += off; + if (skb_mac_header_was_set(skb)) + skb->mac_header += off; + /* Only adjust this if it actually is csum_start rather than csum */ + if (skb->ip_summed == CHECKSUM_PARTIAL) + skb->csum_start += nhead; + skb->cloned = 0; + skb->hdr_len = 0; + skb->nohdr = 0; + atomic_set(&skb_shinfo(skb)->dataref, 1); + return 0; + +nodata: + return -ENOMEM; +} +EXPORT_SYMBOL(pskb_expand_head); + +/* Make private copy of skb with writable head and some headroom */ + +struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) +{ + struct sk_buff *skb2; + int delta = headroom - skb_headroom(skb); + + if (delta <= 0) + skb2 = pskb_copy(skb, GFP_ATOMIC); + else { + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, + GFP_ATOMIC)) { + kfree_skb(skb2); + skb2 = NULL; + } + } + return skb2; +} +EXPORT_SYMBOL(skb_realloc_headroom); + +/** + * skb_copy_expand - copy and expand sk_buff + * @skb: buffer to copy + * @newheadroom: new free bytes at head + * @newtailroom: new free bytes at tail + * @gfp_mask: allocation priority + * + * Make a copy of both an &sk_buff and its data and while doing so + * allocate additional space. + * + * This is used when the caller wishes to modify the data and needs a + * private copy of the data to alter as well as more space for new fields. + * Returns %NULL on failure or the pointer to the buffer + * on success. The returned buffer has a reference count of 1. + * + * You must pass %GFP_ATOMIC as the allocation priority if this function + * is called from an interrupt. + */ +struct sk_buff *skb_copy_expand(const struct sk_buff *skb, + int newheadroom, int newtailroom, + gfp_t gfp_mask) +{ + /* + * Allocate the copy buffer + */ + struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom, + gfp_mask); + int oldheadroom = skb_headroom(skb); + int head_copy_len, head_copy_off; + int off; + + if (!n) + return NULL; + + skb_reserve(n, newheadroom); + + /* Set the tail pointer and length */ + skb_put(n, skb->len); + + head_copy_len = oldheadroom; + head_copy_off = 0; + if (newheadroom <= head_copy_len) + head_copy_len = newheadroom; + else + head_copy_off = newheadroom - head_copy_len; + + /* Copy the linear header and data. 
*/ + if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, + skb->len + head_copy_len)) + BUG(); + + copy_skb_header(n, skb); + + off = newheadroom - oldheadroom; + if (n->ip_summed == CHECKSUM_PARTIAL) + n->csum_start += off; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + n->transport_header += off; + n->network_header += off; + if (skb_mac_header_was_set(skb)) + n->mac_header += off; +#endif + + return n; +} +EXPORT_SYMBOL(skb_copy_expand); + +/** + * skb_pad - zero pad the tail of an skb + * @skb: buffer to pad + * @pad: space to pad + * + * Ensure that a buffer is followed by a padding area that is zero + * filled. Used by network drivers which may DMA or transfer data + * beyond the buffer end onto the wire. + * + * May return error in out of memory cases. The skb is freed on error. + */ + +int skb_pad(struct sk_buff *skb, int pad) +{ + int err; + int ntail; + + /* If the skbuff is non linear tailroom is always zero.. */ + if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { + memset(skb->data+skb->len, 0, pad); + return 0; + } + + ntail = skb->data_len + pad - (skb->end - skb->tail); + if (likely(skb_cloned(skb) || ntail > 0)) { + err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); + if (unlikely(err)) + goto free_skb; + } + + /* FIXME: The use of this function with non-linear skb's really needs + * to be audited. + */ + err = skb_linearize(skb); + if (unlikely(err)) + goto free_skb; + + memset(skb->data + skb->len, 0, pad); + return 0; + +free_skb: + kfree_skb(skb); + return err; +} +EXPORT_SYMBOL(skb_pad); + +/** + * skb_put - add data to a buffer + * @skb: buffer to use + * @len: amount of data to add + * + * This function extends the used data area of the buffer. If this would + * exceed the total buffer size the kernel will panic. A pointer to the + * first byte of the extra data is returned. + */ +unsigned char *skb_put(struct sk_buff *skb, unsigned int len) +{ + unsigned char *tmp = skb_tail_pointer(skb); + SKB_LINEAR_ASSERT(skb); + skb->tail += len; + skb->len += len; + if (unlikely(skb->tail > skb->end)) + skb_over_panic(skb, len, __builtin_return_address(0)); + return tmp; +} +EXPORT_SYMBOL(skb_put); + +/** + * skb_push - add data to the start of a buffer + * @skb: buffer to use + * @len: amount of data to add + * + * This function extends the used data area of the buffer at the buffer + * start. If this would exceed the total buffer headroom the kernel will + * panic. A pointer to the first byte of the extra data is returned. + */ +unsigned char *skb_push(struct sk_buff *skb, unsigned int len) +{ + skb->data -= len; + skb->len += len; + if (unlikely(skb->datahead)) + skb_under_panic(skb, len, __builtin_return_address(0)); + return skb->data; +} +EXPORT_SYMBOL(skb_push); + +/** + * skb_pull - remove data from the start of a buffer + * @skb: buffer to use + * @len: amount of data to remove + * + * This function removes data from the start of a buffer, returning + * the memory to the headroom. A pointer to the next data in the buffer + * is returned. Once the data has been pulled future pushes will overwrite + * the old data. + */ +unsigned char *skb_pull(struct sk_buff *skb, unsigned int len) +{ + return skb_pull_inline(skb, len); +} +EXPORT_SYMBOL(skb_pull); + +/** + * skb_trim - remove end from a buffer + * @skb: buffer to alter + * @len: new length + * + * Cut the length of a buffer down by removing data from the tail. If + * the buffer is already under the length specified it is not modified. + * The skb must be linear. 
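skb_push() above is normally guarded by a headroom and ownership check; the hedged fragment below (made-up names my_encap_xmit() and MY_HLEN, not part of this patch) uses skb_cow_head(), which falls back to pskb_expand_head() when the skb is cloned or short on headroom, before prepending an 8-byte outer header.

#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>

#define MY_HLEN 8	/* hypothetical encapsulation header size */

static int my_encap_xmit(struct sk_buff *skb, struct net_device *dev)
{
	/* Get a private, writable head with at least MY_HLEN of headroom. */
	if (skb_cow_head(skb, MY_HLEN)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	memset(skb_push(skb, MY_HLEN), 0, MY_HLEN);	/* new outer header */
	skb->dev = dev;
	return dev_queue_xmit(skb);
}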
+ */ +void skb_trim(struct sk_buff *skb, unsigned int len) +{ + if (skb->len > len) + __skb_trim(skb, len); +} +EXPORT_SYMBOL(skb_trim); + +/* Trims skb to length len. It can change skb pointers. + */ + +int ___pskb_trim(struct sk_buff *skb, unsigned int len) +{ + struct sk_buff **fragp; + struct sk_buff *frag; + int offset = skb_headlen(skb); + int nfrags = skb_shinfo(skb)->nr_frags; + int i; + int err; + + if (skb_cloned(skb) && + unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) + return err; + + i = 0; + if (offset >= len) + goto drop_pages; + + for (; i < nfrags; i++) { + int end = offset + skb_shinfo(skb)->frags[i].size; + + if (end < len) { + offset = end; + continue; + } + + skb_shinfo(skb)->frags[i++].size = len - offset; + +drop_pages: + skb_shinfo(skb)->nr_frags = i; + + for (; i < nfrags; i++) + put_page(skb_shinfo(skb)->frags[i].page); + + if (skb_has_frag_list(skb)) + skb_drop_fraglist(skb); + goto done; + } + + for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); + fragp = &frag->next) { + int end = offset + frag->len; + + if (skb_shared(frag)) { + struct sk_buff *nfrag; + + nfrag = skb_clone(frag, GFP_ATOMIC); + if (unlikely(!nfrag)) + return -ENOMEM; + + nfrag->next = frag->next; + kfree_skb(frag); + frag = nfrag; + *fragp = frag; + } + + if (end < len) { + offset = end; + continue; + } + + if (end > len && + unlikely((err = pskb_trim(frag, len - offset)))) + return err; + + if (frag->next) + skb_drop_list(&frag->next); + break; + } + +done: + if (len > skb_headlen(skb)) { + skb->data_len -= skb->len - len; + skb->len = len; + } else { + skb->len = len; + skb->data_len = 0; + skb_set_tail_pointer(skb, len); + } + + return 0; +} +EXPORT_SYMBOL(___pskb_trim); + +/** + * __pskb_pull_tail - advance tail of skb header + * @skb: buffer to reallocate + * @delta: number of bytes to advance tail + * + * The function makes a sense only on a fragmented &sk_buff, + * it expands header moving its tail forward and copying necessary + * data from fragmented part. + * + * &sk_buff MUST have reference count of 1. + * + * Returns %NULL (and &sk_buff does not change) if pull failed + * or value of new tail of skb in the case of success. + * + * All the pointers pointing into skb header may change and must be + * reloaded after call to this function. + */ + +/* Moves tail of skb head forward, copying data from fragmented part, + * when it is necessary. + * 1. It may fail due to malloc failure. + * 2. It may change skb pointers. + * + * It is pretty complicated. Luckily, it is called only in exceptional cases. + */ +unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) +{ + /* If skb has not enough free space at tail, get new one + * plus 128 bytes for future expansions. If we have enough + * room at tail, reallocate without expansion only if skb is cloned. + */ + int i, k, eat = (skb->tail + delta) - skb->end; + + if (eat > 0 || skb_cloned(skb)) { + if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, + GFP_ATOMIC)) + return NULL; + } + + if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta)) + BUG(); + + /* Optimization: no fragments, no reasons to preestimate + * size of pulled pages. Superb. + */ + if (!skb_has_frag_list(skb)) + goto pull_pages; + + /* Estimate size of pulled pages. */ + eat = delta; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + if (skb_shinfo(skb)->frags[i].size >= eat) + goto pull_pages; + eat -= skb_shinfo(skb)->frags[i].size; + } + + /* If we need update frag list, we are in troubles. 
+ * Certainly, it possible to add an offset to skb data, + * but taking into account that pulling is expected to + * be very rare operation, it is worth to fight against + * further bloating skb head and crucify ourselves here instead. + * Pure masohism, indeed. 8)8) + */ + if (eat) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + struct sk_buff *clone = NULL; + struct sk_buff *insp = NULL; + + do { + BUG_ON(!list); + + if (list->len <= eat) { + /* Eaten as whole. */ + eat -= list->len; + list = list->next; + insp = list; + } else { + /* Eaten partially. */ + + if (skb_shared(list)) { + /* Sucks! We need to fork list. :-( */ + clone = skb_clone(list, GFP_ATOMIC); + if (!clone) + return NULL; + insp = list->next; + list = clone; + } else { + /* This may be pulled without + * problems. */ + insp = list; + } + if (!pskb_pull(list, eat)) { + kfree_skb(clone); + return NULL; + } + break; + } + } while (eat); + + /* Free pulled out fragments. */ + while ((list = skb_shinfo(skb)->frag_list) != insp) { + skb_shinfo(skb)->frag_list = list->next; + kfree_skb(list); + } + /* And insert new clone at head. */ + if (clone) { + clone->next = list; + skb_shinfo(skb)->frag_list = clone; + } + } + /* Success! Now we may commit changes to skb data. */ + +pull_pages: + eat = delta; + k = 0; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + if (skb_shinfo(skb)->frags[i].size <= eat) { + put_page(skb_shinfo(skb)->frags[i].page); + eat -= skb_shinfo(skb)->frags[i].size; + } else { + skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; + if (eat) { + skb_shinfo(skb)->frags[k].page_offset += eat; + skb_shinfo(skb)->frags[k].size -= eat; + eat = 0; + } + k++; + } + } + skb_shinfo(skb)->nr_frags = k; + + skb->tail += delta; + skb->data_len -= delta; + + return skb_tail_pointer(skb); +} +EXPORT_SYMBOL(__pskb_pull_tail); + +/* Copy some data bits from skb to kernel buffer. */ + +int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) +{ + int start = skb_headlen(skb); + struct sk_buff *frag_iter; + int i, copy; + + if (offset > (int)skb->len - len) + goto fault; + + /* Copy header. */ + if ((copy = start - offset) > 0) { + if (copy > len) + copy = len; + skb_copy_from_linear_data_offset(skb, offset, to, copy); + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + u8 *vaddr; + + if (copy > len) + copy = len; + + vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]); + memcpy(to, + vaddr + skb_shinfo(skb)->frags[i].page_offset+ + offset - start, copy); + kunmap_skb_frag(vaddr); + + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_bits(frag_iter, offset - start, to, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + } + start = end; + } + if (!len) + return 0; + +fault: + return -EFAULT; +} +EXPORT_SYMBOL(skb_copy_bits); + +/* + * Callback from splice_to_pipe(), if we need to release some pages + * at the end of the spd in case we error'ed out in filling the pipe. 
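+ *
+ * Editorial aside on skb_copy_bits() defined just above (a hedged sketch,
+ * not from the original source): it is the safe way to read from a possibly
+ * non-linear skb, for example to pull a transport header into a stack
+ * buffer before the splice machinery below takes over:
+ *
+ *	struct udphdr uh;
+ *
+ *	if (skb_copy_bits(skb, skb_transport_offset(skb), &uh, sizeof(uh)))
+ *		goto drop;	(offset or length beyond skb->len: malformed)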
+ */ +static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) +{ + put_page(spd->pages[i]); +} + +static inline struct page *linear_to_page(struct page *page, unsigned int *len, + unsigned int *offset, + struct sk_buff *skb, struct sock *sk) +{ + struct page *p = sk->sk_sndmsg_page; + unsigned int off; + + if (!p) { +new_page: + p = sk->sk_sndmsg_page = alloc_pages(sk->sk_allocation, 0); + if (!p) + return NULL; + + off = sk->sk_sndmsg_off = 0; + /* hold one ref to this page until it's full */ + } else { + unsigned int mlen; + + off = sk->sk_sndmsg_off; + mlen = PAGE_SIZE - off; + if (mlen < 64 && mlen < *len) { + put_page(p); + goto new_page; + } + + *len = min_t(unsigned int, *len, mlen); + } + + memcpy(page_address(p) + off, page_address(page) + *offset, *len); + sk->sk_sndmsg_off += *len; + *offset = off; + get_page(p); + + return p; +} + +/* + * Fill page/offset/length into spd, if it can hold more pages. + */ +static inline int spd_fill_page(struct splice_pipe_desc *spd, + struct pipe_inode_info *pipe, struct page *page, + unsigned int *len, unsigned int offset, + struct sk_buff *skb, int linear, + struct sock *sk) +{ + if (unlikely(spd->nr_pages == pipe->buffers)) + return 1; + + if (linear) { + page = linear_to_page(page, len, &offset, skb, sk); + if (!page) + return 1; + } else + get_page(page); + + spd->pages[spd->nr_pages] = page; + spd->partial[spd->nr_pages].len = *len; + spd->partial[spd->nr_pages].offset = offset; + spd->nr_pages++; + + return 0; +} + +static inline void __segment_seek(struct page **page, unsigned int *poff, + unsigned int *plen, unsigned int off) +{ + unsigned long n; + + *poff += off; + n = *poff / PAGE_SIZE; + if (n) + *page = nth_page(*page, n); + + *poff = *poff % PAGE_SIZE; + *plen -= off; +} + +static inline int __splice_segment(struct page *page, unsigned int poff, + unsigned int plen, unsigned int *off, + unsigned int *len, struct sk_buff *skb, + struct splice_pipe_desc *spd, int linear, + struct sock *sk, + struct pipe_inode_info *pipe) +{ + if (!*len) + return 1; + + /* skip this segment if already processed */ + if (*off >= plen) { + *off -= plen; + return 0; + } + + /* ignore any bits we already processed */ + if (*off) { + __segment_seek(&page, &poff, &plen, *off); + *off = 0; + } + + do { + unsigned int flen = min(*len, plen); + + /* the linear region may spread across several pages */ + flen = min_t(unsigned int, flen, PAGE_SIZE - poff); + + if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk)) + return 1; + + __segment_seek(&page, &poff, &plen, flen); + *len -= flen; + + } while (*len && plen); + + return 0; +} + +/* + * Map linear and fragment data from the skb to spd. It reports failure if the + * pipe is full or if we already spliced the requested length. + */ +static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, + unsigned int *offset, unsigned int *len, + struct splice_pipe_desc *spd, struct sock *sk) +{ + int seg; + + /* + * map the linear part + */ + if (__splice_segment(virt_to_page(skb->data), + (unsigned long) skb->data & (PAGE_SIZE - 1), + skb_headlen(skb), + offset, len, skb, spd, 1, sk, pipe)) + return 1; + + /* + * then map the fragments + */ + for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { + const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; + + if (__splice_segment(f->page, f->page_offset, f->size, + offset, len, skb, spd, 0, sk, pipe)) + return 1; + } + + return 0; +} + +/* + * Map data from the skb to a pipe. 
Should handle both the linear part, + * the fragments, and the frag list. It does NOT handle frag lists within + * the frag list, if such a thing exists. We'd probably need to recurse to + * handle that cleanly. + */ +int skb_splice_bits(struct sk_buff *skb, unsigned int offset, + struct pipe_inode_info *pipe, unsigned int tlen, + unsigned int flags) +{ + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &sock_pipe_buf_ops, + .spd_release = sock_spd_release, + }; + struct sk_buff *frag_iter; + struct sock *sk = skb->sk; + int ret = 0; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + /* + * __skb_splice_bits() only fails if the output has no room left, + * so no point in going over the frag_list for the error case. + */ + if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk)) + goto done; + else if (!tlen) + goto done; + + /* + * now see if we have a frag_list to map + */ + skb_walk_frags(skb, frag_iter) { + if (!tlen) + break; + if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk)) + break; + } + +done: + if (spd.nr_pages) { + /* + * Drop the socket lock, otherwise we have reverse + * locking dependencies between sk_lock and i_mutex + * here as compared to sendfile(). We enter here + * with the socket lock held, and splice_to_pipe() will + * grab the pipe inode lock. For sendfile() emulation, + * we call into ->sendpage() with the i_mutex lock held + * and networking will grab the socket lock. + */ + release_sock(sk); + ret = splice_to_pipe(pipe, &spd); + lock_sock(sk); + } + + splice_shrink_spd(pipe, &spd); + return ret; +} + +/** + * skb_store_bits - store bits from kernel buffer to skb + * @skb: destination buffer + * @offset: offset in destination + * @from: source buffer + * @len: number of bytes to copy + * + * Copy the specified number of bytes from the source buffer to the + * destination skb. This function handles all the messy bits of + * traversing fragment lists and such. + */ + +int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) +{ + int start = skb_headlen(skb); + struct sk_buff *frag_iter; + int i, copy; + + if (offset > (int)skb->len - len) + goto fault; + + if ((copy = start - offset) > 0) { + if (copy > len) + copy = len; + skb_copy_to_linear_data_offset(skb, offset, from, copy); + if ((len -= copy) == 0) + return 0; + offset += copy; + from += copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + int end; + + WARN_ON(start > offset + len); + + end = start + frag->size; + if ((copy = end - offset) > 0) { + u8 *vaddr; + + if (copy > len) + copy = len; + + vaddr = kmap_skb_frag(frag); + memcpy(vaddr + frag->page_offset + offset - start, + from, copy); + kunmap_skb_frag(vaddr); + + if ((len -= copy) == 0) + return 0; + offset += copy; + from += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_store_bits(frag_iter, offset - start, + from, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + from += copy; + } + start = end; + } + if (!len) + return 0; + +fault: + return -EFAULT; +} +EXPORT_SYMBOL(skb_store_bits); + +/* Checksum skb data. 
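+ *
+ * Editorial note (added commentary): skb_store_bits() above is the
+ * write-side mirror of skb_copy_bits().  A hedged sketch of the checksum
+ * helper that follows, folding the 32-bit partial sum into its final
+ * 16-bit form:
+ *
+ *	__wsum partial = skb_checksum(skb, offset, skb->len - offset, 0);
+ *	__sum16 folded = csum_fold(partial);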
*/ + +__wsum skb_checksum(const struct sk_buff *skb, int offset, + int len, __wsum csum) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + int pos = 0; + + /* Checksum header. */ + if (copy > 0) { + if (copy > len) + copy = len; + csum = csum_partial(skb->data + offset, copy, csum); + if ((len -= copy) == 0) + return csum; + offset += copy; + pos = copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + __wsum csum2; + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (copy > len) + copy = len; + vaddr = kmap_skb_frag(frag); + csum2 = csum_partial(vaddr + frag->page_offset + + offset - start, copy, 0); + kunmap_skb_frag(vaddr); + csum = csum_block_add(csum, csum2, pos); + if (!(len -= copy)) + return csum; + offset += copy; + pos += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + __wsum csum2; + if (copy > len) + copy = len; + csum2 = skb_checksum(frag_iter, offset - start, + copy, 0); + csum = csum_block_add(csum, csum2, pos); + if ((len -= copy) == 0) + return csum; + offset += copy; + pos += copy; + } + start = end; + } + BUG_ON(len); + + return csum; +} +EXPORT_SYMBOL(skb_checksum); + +/* Both of above in one bottle. */ + +__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, + u8 *to, int len, __wsum csum) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + int pos = 0; + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + csum = csum_partial_copy_nocheck(skb->data + offset, to, + copy, csum); + if ((len -= copy) == 0) + return csum; + offset += copy; + to += copy; + pos = copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + __wsum csum2; + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (copy > len) + copy = len; + vaddr = kmap_skb_frag(frag); + csum2 = csum_partial_copy_nocheck(vaddr + + frag->page_offset + + offset - start, to, + copy, 0); + kunmap_skb_frag(vaddr); + csum = csum_block_add(csum, csum2, pos); + if (!(len -= copy)) + return csum; + offset += copy; + to += copy; + pos += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + __wsum csum2; + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + csum2 = skb_copy_and_csum_bits(frag_iter, + offset - start, + to, copy, 0); + csum = csum_block_add(csum, csum2, pos); + if ((len -= copy) == 0) + return csum; + offset += copy; + to += copy; + pos += copy; + } + start = end; + } + BUG_ON(len); + return csum; +} +EXPORT_SYMBOL(skb_copy_and_csum_bits); + +void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) +{ + __wsum csum; + long csstart; + + if (skb->ip_summed == CHECKSUM_PARTIAL) + csstart = skb_checksum_start_offset(skb); + else + csstart = skb_headlen(skb); + + BUG_ON(csstart > skb_headlen(skb)); + + skb_copy_from_linear_data(skb, to, csstart); + + csum = 0; + if (csstart != skb->len) + csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, + skb->len - csstart, 0); + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + long csstuff = 
csstart + skb->csum_offset; + + *((__sum16 *)(to + csstuff)) = csum_fold(csum); + } +} +EXPORT_SYMBOL(skb_copy_and_csum_dev); + +/** + * skb_dequeue - remove from the head of the queue + * @list: list to dequeue from + * + * Remove the head of the list. The list lock is taken so the function + * may be used safely with other locking list functions. The head item is + * returned or %NULL if the list is empty. + */ + +struct sk_buff *skb_dequeue(struct sk_buff_head *list) +{ + unsigned long flags; + struct sk_buff *result; + + spin_lock_irqsave(&list->lock, flags); + result = __skb_dequeue(list); + spin_unlock_irqrestore(&list->lock, flags); + return result; +} +EXPORT_SYMBOL(skb_dequeue); + +/** + * skb_dequeue_tail - remove from the tail of the queue + * @list: list to dequeue from + * + * Remove the tail of the list. The list lock is taken so the function + * may be used safely with other locking list functions. The tail item is + * returned or %NULL if the list is empty. + */ +struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) +{ + unsigned long flags; + struct sk_buff *result; + + spin_lock_irqsave(&list->lock, flags); + result = __skb_dequeue_tail(list); + spin_unlock_irqrestore(&list->lock, flags); + return result; +} +EXPORT_SYMBOL(skb_dequeue_tail); + +/** + * skb_queue_purge - empty a list + * @list: list to empty + * + * Delete all buffers on an &sk_buff list. Each buffer is removed from + * the list and one reference dropped. This function takes the list + * lock and is atomic with respect to other list locking functions. + */ +void skb_queue_purge(struct sk_buff_head *list) +{ + struct sk_buff *skb; + while ((skb = skb_dequeue(list)) != NULL) + kfree_skb(skb); +} +EXPORT_SYMBOL(skb_queue_purge); + +/** + * skb_queue_head - queue a buffer at the list head + * @list: list to use + * @newsk: buffer to queue + * + * Queue a buffer at the start of the list. This function takes the + * list lock and can be used safely with other locking &sk_buff functions + * safely. + * + * A buffer cannot be placed on two lists at the same time. + */ +void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_queue_head(list, newsk); + spin_unlock_irqrestore(&list->lock, flags); +} +EXPORT_SYMBOL(skb_queue_head); + +/** + * skb_queue_tail - queue a buffer at the list tail + * @list: list to use + * @newsk: buffer to queue + * + * Queue a buffer at the tail of the list. This function takes the + * list lock and can be used safely with other locking &sk_buff functions + * safely. + * + * A buffer cannot be placed on two lists at the same time. + */ +void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_queue_tail(list, newsk); + spin_unlock_irqrestore(&list->lock, flags); +} +EXPORT_SYMBOL(skb_queue_tail); + +/** + * skb_unlink - remove a buffer from a list + * @skb: buffer to remove + * @list: list to use + * + * Remove a packet from a list. The list locks are taken and this + * function is atomic with respect to other list locked calls + * + * You must know what list the SKB is on. 
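+ *
+ * Editorial sketch of the queue API above (illustrative, not from the
+ * original source): a producer/consumer pair built on an &sk_buff_head.
+ *
+ *	struct sk_buff_head q;
+ *
+ *	skb_queue_head_init(&q);
+ *	skb_queue_tail(&q, skb);			(producer side)
+ *	while ((skb = skb_dequeue(&q)) != NULL)		(consumer side)
+ *		kfree_skb(skb);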
+ */ +void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_unlink(skb, list); + spin_unlock_irqrestore(&list->lock, flags); +} +EXPORT_SYMBOL(skb_unlink); + +/** + * skb_append - append a buffer + * @old: buffer to insert after + * @newsk: buffer to insert + * @list: list to use + * + * Place a packet after a given packet in a list. The list locks are taken + * and this function is atomic with respect to other list locked calls. + * A buffer cannot be placed on two lists at the same time. + */ +void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_queue_after(list, old, newsk); + spin_unlock_irqrestore(&list->lock, flags); +} +EXPORT_SYMBOL(skb_append); + +/** + * skb_insert - insert a buffer + * @old: buffer to insert before + * @newsk: buffer to insert + * @list: list to use + * + * Place a packet before a given packet in a list. The list locks are + * taken and this function is atomic with respect to other list locked + * calls. + * + * A buffer cannot be placed on two lists at the same time. + */ +void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_insert(newsk, old->prev, old, list); + spin_unlock_irqrestore(&list->lock, flags); +} +EXPORT_SYMBOL(skb_insert); + +static inline void skb_split_inside_header(struct sk_buff *skb, + struct sk_buff* skb1, + const u32 len, const int pos) +{ + int i; + + skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), + pos - len); + /* And move data appendix as is. */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; + + skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; + skb_shinfo(skb)->nr_frags = 0; + skb1->data_len = skb->data_len; + skb1->len += skb1->data_len; + skb->data_len = 0; + skb->len = len; + skb_set_tail_pointer(skb, len); +} + +static inline void skb_split_no_header(struct sk_buff *skb, + struct sk_buff* skb1, + const u32 len, int pos) +{ + int i, k = 0; + const int nfrags = skb_shinfo(skb)->nr_frags; + + skb_shinfo(skb)->nr_frags = 0; + skb1->len = skb1->data_len = skb->len - len; + skb->len = len; + skb->data_len = len - pos; + + for (i = 0; i < nfrags; i++) { + int size = skb_shinfo(skb)->frags[i].size; + + if (pos + size > len) { + skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; + + if (pos < len) { + /* Split frag. + * We have two variants in this case: + * 1. Move all the frag to the second + * part, if it is possible. F.e. + * this approach is mandatory for TUX, + * where splitting is expensive. + * 2. Split is accurately. We make this. + */ + get_page(skb_shinfo(skb)->frags[i].page); + skb_shinfo(skb1)->frags[0].page_offset += len - pos; + skb_shinfo(skb1)->frags[0].size -= len - pos; + skb_shinfo(skb)->frags[i].size = len - pos; + skb_shinfo(skb)->nr_frags++; + } + k++; + } else + skb_shinfo(skb)->nr_frags++; + pos += size; + } + skb_shinfo(skb1)->nr_frags = k; +} + +/** + * skb_split - Split fragmented skb to two parts at length len. + * @skb: the buffer to split + * @skb1: the buffer to receive the second part + * @len: new length for skb + */ +void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) +{ + int pos = skb_headlen(skb); + + if (len < pos) /* Split line is inside header. 
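+	 * Editorial sketch of a caller (hedged; split_len and the tail
+	 * allocation size are assumptions, sized for the inside-header case):
+	 *	struct sk_buff *tail = alloc_skb(skb_headlen(skb), GFP_ATOMIC);
+	 *	if (tail)
+	 *		skb_split(skb, tail, split_len);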
*/ + skb_split_inside_header(skb, skb1, len, pos); + else /* Second chunk has no header, nothing to copy. */ + skb_split_no_header(skb, skb1, len, pos); +} +EXPORT_SYMBOL(skb_split); + +/* Shifting from/to a cloned skb is a no-go. + * + * Caller cannot keep skb_shinfo related pointers past calling here! + */ +static int skb_prepare_for_shift(struct sk_buff *skb) +{ + return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +} + +/** + * skb_shift - Shifts paged data partially from skb to another + * @tgt: buffer into which tail data gets added + * @skb: buffer from which the paged data comes from + * @shiftlen: shift up to this many bytes + * + * Attempts to shift up to shiftlen worth of bytes, which may be less than + * the length of the skb, from tgt to skb. Returns number bytes shifted. + * It's up to caller to free skb if everything was shifted. + * + * If @tgt runs out of frags, the whole operation is aborted. + * + * Skb cannot include anything else but paged data while tgt is allowed + * to have non-paged data as well. + * + * TODO: full sized shift could be optimized but that would need + * specialized skb free'er to handle frags without up-to-date nr_frags. + */ +int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) +{ + int from, to, merge, todo; + struct skb_frag_struct *fragfrom, *fragto; + + BUG_ON(shiftlen > skb->len); + BUG_ON(skb_headlen(skb)); /* Would corrupt stream */ + + todo = shiftlen; + from = 0; + to = skb_shinfo(tgt)->nr_frags; + fragfrom = &skb_shinfo(skb)->frags[from]; + + /* Actual merge is delayed until the point when we know we can + * commit all, so that we don't have to undo partial changes + */ + if (!to || + !skb_can_coalesce(tgt, to, fragfrom->page, fragfrom->page_offset)) { + merge = -1; + } else { + merge = to - 1; + + todo -= fragfrom->size; + if (todo < 0) { + if (skb_prepare_for_shift(skb) || + skb_prepare_for_shift(tgt)) + return 0; + + /* All previous frag pointers might be stale! 
*/ + fragfrom = &skb_shinfo(skb)->frags[from]; + fragto = &skb_shinfo(tgt)->frags[merge]; + + fragto->size += shiftlen; + fragfrom->size -= shiftlen; + fragfrom->page_offset += shiftlen; + + goto onlymerged; + } + + from++; + } + + /* Skip full, not-fitting skb to avoid expensive operations */ + if ((shiftlen == skb->len) && + (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) + return 0; + + if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) + return 0; + + while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { + if (to == MAX_SKB_FRAGS) + return 0; + + fragfrom = &skb_shinfo(skb)->frags[from]; + fragto = &skb_shinfo(tgt)->frags[to]; + + if (todo >= fragfrom->size) { + *fragto = *fragfrom; + todo -= fragfrom->size; + from++; + to++; + + } else { + get_page(fragfrom->page); + fragto->page = fragfrom->page; + fragto->page_offset = fragfrom->page_offset; + fragto->size = todo; + + fragfrom->page_offset += todo; + fragfrom->size -= todo; + todo = 0; + + to++; + break; + } + } + + /* Ready to "commit" this state change to tgt */ + skb_shinfo(tgt)->nr_frags = to; + + if (merge >= 0) { + fragfrom = &skb_shinfo(skb)->frags[0]; + fragto = &skb_shinfo(tgt)->frags[merge]; + + fragto->size += fragfrom->size; + put_page(fragfrom->page); + } + + /* Reposition in the original skb */ + to = 0; + while (from < skb_shinfo(skb)->nr_frags) + skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; + skb_shinfo(skb)->nr_frags = to; + + BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); + +onlymerged: + /* Most likely the tgt won't ever need its checksum anymore, skb on + * the other hand might need it if it needs to be resent + */ + tgt->ip_summed = CHECKSUM_PARTIAL; + skb->ip_summed = CHECKSUM_PARTIAL; + + /* Yak, is it really working this way? Some helper please? */ + skb->len -= shiftlen; + skb->data_len -= shiftlen; + skb->truesize -= shiftlen; + tgt->len += shiftlen; + tgt->data_len += shiftlen; + tgt->truesize += shiftlen; + + return shiftlen; +} + +/** + * skb_prepare_seq_read - Prepare a sequential read of skb data + * @skb: the buffer to read + * @from: lower offset of data to be read + * @to: upper offset of data to be read + * @st: state variable + * + * Initializes the specified state variable. Must be called before + * invoking skb_seq_read() for the first time. + */ +void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, + unsigned int to, struct skb_seq_state *st) +{ + st->lower_offset = from; + st->upper_offset = to; + st->root_skb = st->cur_skb = skb; + st->frag_idx = st->stepped_offset = 0; + st->frag_data = NULL; +} +EXPORT_SYMBOL(skb_prepare_seq_read); + +/** + * skb_seq_read - Sequentially read skb data + * @consumed: number of bytes consumed by the caller so far + * @data: destination pointer for data to be returned + * @st: state variable + * + * Reads a block of skb data at &consumed relative to the + * lower offset specified to skb_prepare_seq_read(). Assigns + * the head of the data block to &data and returns the length + * of the block or 0 if the end of the skb data or the upper + * offset has been reached. + * + * The caller is not required to consume all of the data + * returned, i.e. &consumed is typically set to the number + * of bytes already consumed and the next call to + * skb_seq_read() will return the remaining part of the block. + * + * Note 1: The size of each block of data returned can be arbitrary, + * this limitation is the cost for zerocopy seqeuental + * reads of potentially non linear data. 
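+ *
+ * Editorial example (hedged sketch, not from the original source; scan()
+ * is a placeholder consumer):
+ *
+ *	struct skb_seq_state st;
+ *	const u8 *data;
+ *	unsigned int consumed = 0, len;
+ *
+ *	skb_prepare_seq_read(skb, 0, skb->len, &st);
+ *	while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
+ *		scan(data, len);
+ *		consumed += len;
+ *	}
+ *
+ * If the loop is abandoned before skb_seq_read() returns 0, the reader must
+ * call skb_abort_seq_read(&st) to release any still-mapped fragment.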
+ * + * Note 2: Fragment lists within fragments are not implemented + * at the moment, state->root_skb could be replaced with + * a stack for this purpose. + */ +unsigned int skb_seq_read(unsigned int consumed, const u8 **data, + struct skb_seq_state *st) +{ + unsigned int block_limit, abs_offset = consumed + st->lower_offset; + skb_frag_t *frag; + + if (unlikely(abs_offset >= st->upper_offset)) + return 0; + +next_skb: + block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; + + if (abs_offset < block_limit && !st->frag_data) { + *data = st->cur_skb->data + (abs_offset - st->stepped_offset); + return block_limit - abs_offset; + } + + if (st->frag_idx == 0 && !st->frag_data) + st->stepped_offset += skb_headlen(st->cur_skb); + + while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { + frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; + block_limit = frag->size + st->stepped_offset; + + if (abs_offset < block_limit) { + if (!st->frag_data) + st->frag_data = kmap_skb_frag(frag); + + *data = (u8 *) st->frag_data + frag->page_offset + + (abs_offset - st->stepped_offset); + + return block_limit - abs_offset; + } + + if (st->frag_data) { + kunmap_skb_frag(st->frag_data); + st->frag_data = NULL; + } + + st->frag_idx++; + st->stepped_offset += frag->size; + } + + if (st->frag_data) { + kunmap_skb_frag(st->frag_data); + st->frag_data = NULL; + } + + if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { + st->cur_skb = skb_shinfo(st->root_skb)->frag_list; + st->frag_idx = 0; + goto next_skb; + } else if (st->cur_skb->next) { + st->cur_skb = st->cur_skb->next; + st->frag_idx = 0; + goto next_skb; + } + + return 0; +} +EXPORT_SYMBOL(skb_seq_read); + +/** + * skb_abort_seq_read - Abort a sequential read of skb data + * @st: state variable + * + * Must be called if skb_seq_read() was not called until it + * returned 0. + */ +void skb_abort_seq_read(struct skb_seq_state *st) +{ + if (st->frag_data) + kunmap_skb_frag(st->frag_data); +} +EXPORT_SYMBOL(skb_abort_seq_read); + +#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) + +static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, + struct ts_config *conf, + struct ts_state *state) +{ + return skb_seq_read(offset, text, TS_SKB_CB(state)); +} + +static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) +{ + skb_abort_seq_read(TS_SKB_CB(state)); +} + +/** + * skb_find_text - Find a text pattern in skb data + * @skb: the buffer to look in + * @from: search offset + * @to: search limit + * @config: textsearch configuration + * @state: uninitialized textsearch state variable + * + * Finds a pattern in the skb data according to the specified + * textsearch configuration. Use textsearch_next() to retrieve + * subsequent occurrences of the pattern. Returns the offset + * to the first occurrence or UINT_MAX if no match was found. + */ +unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, + unsigned int to, struct ts_config *config, + struct ts_state *state) +{ + unsigned int ret; + + config->get_next_block = skb_ts_get_next_block; + config->finish = skb_ts_finish; + + skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state)); + + ret = textsearch_find(config, state); + return (ret <= to - from ? ret : UINT_MAX); +} +EXPORT_SYMBOL(skb_find_text); + +/** + * skb_append_datato_frags: - append the user data to a skb + * @sk: sock structure + * @skb: skb structure to be appened with user data. 
+ * @getfrag: call back function to be used for getting the user data + * @from: pointer to user message iov + * @length: length of the iov message + * + * Description: This procedure append the user data in the fragment part + * of the skb if any page alloc fails user this procedure returns -ENOMEM + */ +int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, + int (*getfrag)(void *from, char *to, int offset, + int len, int odd, struct sk_buff *skb), + void *from, int length) +{ + int frg_cnt = 0; + skb_frag_t *frag = NULL; + struct page *page = NULL; + int copy, left; + int offset = 0; + int ret; + + do { + /* Return error if we don't have space for new frag */ + frg_cnt = skb_shinfo(skb)->nr_frags; + if (frg_cnt >= MAX_SKB_FRAGS) + return -EFAULT; + + /* allocate a new page for next frag */ + page = alloc_pages(sk->sk_allocation, 0); + + /* If alloc_page fails just return failure and caller will + * free previous allocated pages by doing kfree_skb() + */ + if (page == NULL) + return -ENOMEM; + + /* initialize the next frag */ + skb_fill_page_desc(skb, frg_cnt, page, 0, 0); + skb->truesize += PAGE_SIZE; + atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc); + + /* get the new initialized frag */ + frg_cnt = skb_shinfo(skb)->nr_frags; + frag = &skb_shinfo(skb)->frags[frg_cnt - 1]; + + /* copy the user data to page */ + left = PAGE_SIZE - frag->page_offset; + copy = (length > left)? left : length; + + ret = getfrag(from, (page_address(frag->page) + + frag->page_offset + frag->size), + offset, copy, 0, skb); + if (ret < 0) + return -EFAULT; + + /* copy was successful so update the size parameters */ + frag->size += copy; + skb->len += copy; + skb->data_len += copy; + offset += copy; + length -= copy; + + } while (length > 0); + + return 0; +} +EXPORT_SYMBOL(skb_append_datato_frags); + +/** + * skb_pull_rcsum - pull skb and update receive checksum + * @skb: buffer to update + * @len: length of data pulled + * + * This function performs an skb_pull on the packet and updates + * the CHECKSUM_COMPLETE checksum. It should be used on + * receive path processing instead of skb_pull unless you know + * that the checksum difference is zero (e.g., a valid IP header) + * or you are setting ip_summed to CHECKSUM_NONE. + */ +unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) +{ + BUG_ON(len > skb->len); + skb->len -= len; + BUG_ON(skb->len < skb->data_len); + skb_postpull_rcsum(skb, skb->data, len); + return skb->data += len; +} +EXPORT_SYMBOL_GPL(skb_pull_rcsum); + +/** + * skb_segment - Perform protocol segmentation on skb. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * + * This function performs segmentation on the given skb. It returns + * a pointer to the first in a list of new skbs for the segments. + * In case of error it returns ERR_PTR(err). 
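+ *
+ * Editorial sketch of a typical consumer (hedged; my_xmit() is a
+ * placeholder and the feature mask is illustrative), loosely following how
+ * the core walks the returned list:
+ *
+ *	struct sk_buff *segs, *nskb;
+ *
+ *	segs = skb_segment(skb, features);
+ *	if (IS_ERR(segs))
+ *		return PTR_ERR(segs);
+ *	kfree_skb(skb);			(original is covered by the segments)
+ *	while ((nskb = segs) != NULL) {
+ *		segs = nskb->next;
+ *		nskb->next = NULL;
+ *		my_xmit(nskb);
+ *	}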
+ */ +struct sk_buff *skb_segment(struct sk_buff *skb, u32 features) +{ + struct sk_buff *segs = NULL; + struct sk_buff *tail = NULL; + struct sk_buff *fskb = skb_shinfo(skb)->frag_list; + unsigned int mss = skb_shinfo(skb)->gso_size; + unsigned int doffset = skb->data - skb_mac_header(skb); + unsigned int offset = doffset; + unsigned int headroom; + unsigned int len; + int sg = !!(features & NETIF_F_SG); + int nfrags = skb_shinfo(skb)->nr_frags; + int err = -ENOMEM; + int i = 0; + int pos; + + __skb_push(skb, doffset); + headroom = skb_headroom(skb); + pos = skb_headlen(skb); + + do { + struct sk_buff *nskb; + skb_frag_t *frag; + int hsize; + int size; + + len = skb->len - offset; + if (len > mss) + len = mss; + + hsize = skb_headlen(skb) - offset; + if (hsize < 0) + hsize = 0; + if (hsize > len || !sg) + hsize = len; + + if (!hsize && i >= nfrags) { + BUG_ON(fskb->len != len); + + pos += len; + nskb = skb_clone(fskb, GFP_ATOMIC); + fskb = fskb->next; + + if (unlikely(!nskb)) + goto err; + + hsize = skb_end_pointer(nskb) - nskb->head; + if (skb_cow_head(nskb, doffset + headroom)) { + kfree_skb(nskb); + goto err; + } + + nskb->truesize += skb_end_pointer(nskb) - nskb->head - + hsize; + skb_release_head_state(nskb); + __skb_push(nskb, doffset); + } else { + nskb = alloc_skb(hsize + doffset + headroom, + GFP_ATOMIC); + + if (unlikely(!nskb)) + goto err; + + skb_reserve(nskb, headroom); + __skb_put(nskb, doffset); + } + + if (segs) + tail->next = nskb; + else + segs = nskb; + tail = nskb; + + __copy_skb_header(nskb, skb); + nskb->mac_len = skb->mac_len; + + /* nskb and skb might have different headroom */ + if (nskb->ip_summed == CHECKSUM_PARTIAL) + nskb->csum_start += skb_headroom(nskb) - headroom; + + skb_reset_mac_header(nskb); + skb_set_network_header(nskb, skb->mac_len); + nskb->transport_header = (nskb->network_header + + skb_network_header_len(skb)); + skb_copy_from_linear_data(skb, nskb->data, doffset); + + if (fskb != skb_shinfo(skb)->frag_list) + continue; + + if (!sg) { + nskb->ip_summed = CHECKSUM_NONE; + nskb->csum = skb_copy_and_csum_bits(skb, offset, + skb_put(nskb, len), + len, 0); + continue; + } + + frag = skb_shinfo(nskb)->frags; + + skb_copy_from_linear_data_offset(skb, offset, + skb_put(nskb, hsize), hsize); + + while (pos < offset + len && i < nfrags) { + *frag = skb_shinfo(skb)->frags[i]; + get_page(frag->page); + size = frag->size; + + if (pos < offset) { + frag->page_offset += offset - pos; + frag->size -= offset - pos; + } + + skb_shinfo(nskb)->nr_frags++; + + if (pos + size <= offset + len) { + i++; + pos += size; + } else { + frag->size -= pos + size - (offset + len); + goto skip_fraglist; + } + + frag++; + } + + if (pos < offset + len) { + struct sk_buff *fskb2 = fskb; + + BUG_ON(pos + fskb->len != offset + len); + + pos += fskb->len; + fskb = fskb->next; + + if (fskb2->next) { + fskb2 = skb_clone(fskb2, GFP_ATOMIC); + if (!fskb2) + goto err; + } else + skb_get(fskb2); + + SKB_FRAG_ASSERT(nskb); + skb_shinfo(nskb)->frag_list = fskb2; + } + +skip_fraglist: + nskb->data_len = len - hsize; + nskb->len += nskb->data_len; + nskb->truesize += nskb->data_len; + } while ((offset += len) < skb->len); + + return segs; + +err: + while ((skb = segs)) { + segs = skb->next; + kfree_skb(skb); + } + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(skb_segment); + +int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) +{ + struct sk_buff *p = *head; + struct sk_buff *nskb; + struct skb_shared_info *skbinfo = skb_shinfo(skb); + struct skb_shared_info *pinfo = skb_shinfo(p); + 
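+	/* Editorial note (added commentary): three merge strategies follow.
+	 * If @p already carries a frag_list, the new skb is simply chained on.
+	 * If the new skb's data sits entirely in page frags, the frag
+	 * descriptors are moved over to @p and the skb head is marked free.
+	 * Otherwise a fresh header skb is allocated and @p becomes the first
+	 * entry on its frag_list.  Packets that would exceed 64KB, overflow
+	 * MAX_SKB_FRAGS or no longer match gso_size are rejected with -E2BIG
+	 * so that the flow is flushed instead.
+	 */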
unsigned int headroom; + unsigned int len = skb_gro_len(skb); + unsigned int offset = skb_gro_offset(skb); + unsigned int headlen = skb_headlen(skb); + + if (p->len + len >= 65536) + return -E2BIG; + + if (pinfo->frag_list) + goto merge; + else if (headlen <= offset) { + skb_frag_t *frag; + skb_frag_t *frag2; + int i = skbinfo->nr_frags; + int nr_frags = pinfo->nr_frags + i; + + offset -= headlen; + + if (nr_frags > MAX_SKB_FRAGS) + return -E2BIG; + + pinfo->nr_frags = nr_frags; + skbinfo->nr_frags = 0; + + frag = pinfo->frags + nr_frags; + frag2 = skbinfo->frags + i; + do { + *--frag = *--frag2; + } while (--i); + + frag->page_offset += offset; + frag->size -= offset; + + skb->truesize -= skb->data_len; + skb->len -= skb->data_len; + skb->data_len = 0; + + NAPI_GRO_CB(skb)->free = 1; + goto done; + } else if (skb_gro_len(p) != pinfo->gso_size) + return -E2BIG; + + headroom = skb_headroom(p); + nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC); + if (unlikely(!nskb)) + return -ENOMEM; + + __copy_skb_header(nskb, p); + nskb->mac_len = p->mac_len; + + skb_reserve(nskb, headroom); + __skb_put(nskb, skb_gro_offset(p)); + + skb_set_mac_header(nskb, skb_mac_header(p) - p->data); + skb_set_network_header(nskb, skb_network_offset(p)); + skb_set_transport_header(nskb, skb_transport_offset(p)); + + __skb_pull(p, skb_gro_offset(p)); + memcpy(skb_mac_header(nskb), skb_mac_header(p), + p->data - skb_mac_header(p)); + + *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p); + skb_shinfo(nskb)->frag_list = p; + skb_shinfo(nskb)->gso_size = pinfo->gso_size; + pinfo->gso_size = 0; + skb_header_release(p); + nskb->prev = p; + + nskb->data_len += p->len; + nskb->truesize += p->len; + nskb->len += p->len; + + *head = nskb; + nskb->next = p->next; + p->next = NULL; + + p = nskb; + +merge: + if (offset > headlen) { + unsigned int eat = offset - headlen; + + skbinfo->frags[0].page_offset += eat; + skbinfo->frags[0].size -= eat; + skb->data_len -= eat; + skb->len -= eat; + offset = headlen; + } + + __skb_pull(skb, offset); + + p->prev->next = skb; + p->prev = skb; + skb_header_release(skb); + +done: + NAPI_GRO_CB(p)->count++; + p->data_len += len; + p->truesize += len; + p->len += len; + + NAPI_GRO_CB(skb)->same_flow = 1; + return 0; +} +EXPORT_SYMBOL_GPL(skb_gro_receive); + +void __init skb_init(void) +{ + skbuff_head_cache = kmem_cache_create("skbuff_head_cache", + sizeof(struct sk_buff), + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + NULL); + skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", + (2*sizeof(struct sk_buff)) + + sizeof(atomic_t), + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + NULL); +} + +/** + * skb_to_sgvec - Fill a scatter-gather list from a socket buffer + * @skb: Socket buffer containing the buffers to be mapped + * @sg: The scatter-gather list to map into + * @offset: The offset into the buffer's contents to start mapping + * @len: Length of buffer space to be mapped + * + * Fill the specified scatter-gather list with mappings/pointers into a + * region of the buffer space attached to a socket buffer. 
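+ *
+ * Editorial sketch (hedged, loosely modelled on the IPsec-style pairing;
+ * error paths trimmed): skb_cow_data(), defined later in this file, returns
+ * the number of scatterlist entries needed before mapping.
+ *
+ *	struct sk_buff *trailer;
+ *	struct scatterlist *sg;
+ *	int nents;
+ *
+ *	nents = skb_cow_data(skb, 0, &trailer);
+ *	if (nents < 0)
+ *		return nents;
+ *	sg = kmalloc(nents * sizeof(*sg), GFP_ATOMIC);
+ *	if (!sg)
+ *		return -ENOMEM;
+ *	sg_init_table(sg, nents);
+ *	skb_to_sgvec(skb, sg, 0, skb->len);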
+ */ +static int +__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + int elt = 0; + + if (copy > 0) { + if (copy > len) + copy = len; + sg_set_buf(sg, skb->data + offset, copy); + elt++; + if ((len -= copy) == 0) + return elt; + offset += copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (copy > len) + copy = len; + sg_set_page(&sg[elt], frag->page, copy, + frag->page_offset+offset-start); + elt++; + if (!(len -= copy)) + return elt; + offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + elt += __skb_to_sgvec(frag_iter, sg+elt, offset - start, + copy); + if ((len -= copy) == 0) + return elt; + offset += copy; + } + start = end; + } + BUG_ON(len); + return elt; +} + +int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) +{ + int nsg = __skb_to_sgvec(skb, sg, offset, len); + + sg_mark_end(&sg[nsg - 1]); + + return nsg; +} +EXPORT_SYMBOL_GPL(skb_to_sgvec); + +/** + * skb_cow_data - Check that a socket buffer's data buffers are writable + * @skb: The socket buffer to check. + * @tailbits: Amount of trailing space to be added + * @trailer: Returned pointer to the skb where the @tailbits space begins + * + * Make sure that the data buffers attached to a socket buffer are + * writable. If they are not, private copies are made of the data buffers + * and the socket buffer is set to use these instead. + * + * If @tailbits is given, make sure that there is space to write @tailbits + * bytes of data beyond current end of socket buffer. @trailer will be + * set to point to the skb in which this space begins. + * + * The number of scatterlist elements required to completely map the + * COW'd and extended socket buffer will be returned. + */ +int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) +{ + int copyflag; + int elt; + struct sk_buff *skb1, **skb_p; + + /* If skb is cloned or its head is paged, reallocate + * head pulling out all the pages (pages are considered not writable + * at the moment even if they are anonymous). + */ + if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && + __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL) + return -ENOMEM; + + /* Easy case. Most of packets will go this way. */ + if (!skb_has_frag_list(skb)) { + /* A little of trouble, not enough of space for trailer. + * This should not happen, when stack is tuned to generate + * good frames. OK, on miss we reallocate and reserve even more + * space, 128 bytes is fair. */ + + if (skb_tailroom(skb) < tailbits && + pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) + return -ENOMEM; + + /* Voila! */ + *trailer = skb; + return 1; + } + + /* Misery. We are in troubles, going to mincer fragments... */ + + elt = 1; + skb_p = &skb_shinfo(skb)->frag_list; + copyflag = 0; + + while ((skb1 = *skb_p) != NULL) { + int ntail = 0; + + /* The fragment is partially pulled by someone, + * this can happen on input. Copy it and everything + * after it. */ + + if (skb_shared(skb1)) + copyflag = 1; + + /* If the skb is the last, worry about trailer. 
*/ + + if (skb1->next == NULL && tailbits) { + if (skb_shinfo(skb1)->nr_frags || + skb_has_frag_list(skb1) || + skb_tailroom(skb1) < tailbits) + ntail = tailbits + 128; + } + + if (copyflag || + skb_cloned(skb1) || + ntail || + skb_shinfo(skb1)->nr_frags || + skb_has_frag_list(skb1)) { + struct sk_buff *skb2; + + /* Fuck, we are miserable poor guys... */ + if (ntail == 0) + skb2 = skb_copy(skb1, GFP_ATOMIC); + else + skb2 = skb_copy_expand(skb1, + skb_headroom(skb1), + ntail, + GFP_ATOMIC); + if (unlikely(skb2 == NULL)) + return -ENOMEM; + + if (skb1->sk) + skb_set_owner_w(skb2, skb1->sk); + + /* Looking around. Are we still alive? + * OK, link new skb, drop old one */ + + skb2->next = skb1->next; + *skb_p = skb2; + kfree_skb(skb1); + skb1 = skb2; + } + elt++; + *trailer = skb1; + skb_p = &skb1->next; + } + + return elt; +} +EXPORT_SYMBOL_GPL(skb_cow_data); + +static void sock_rmem_free(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + atomic_sub(skb->truesize, &sk->sk_rmem_alloc); +} + +/* + * Note: We dont mem charge error packets (no sk_forward_alloc changes) + */ +int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) +{ + int len = skb->len; + + if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= + (unsigned)sk->sk_rcvbuf) + return -ENOMEM; + + skb_orphan(skb); + skb->sk = sk; + skb->destructor = sock_rmem_free; + atomic_add(skb->truesize, &sk->sk_rmem_alloc); + + /* before exiting rcu section, make sure dst is refcounted */ + skb_dst_force(skb); + + skb_queue_tail(&sk->sk_error_queue, skb); + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, len); + return 0; +} +EXPORT_SYMBOL(sock_queue_err_skb); + +void skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + struct sock *sk = orig_skb->sk; + struct sock_exterr_skb *serr; + struct sk_buff *skb; + int err; + + if (!sk) + return; + + skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!skb) + return; + + if (hwtstamps) { + *skb_hwtstamps(skb) = + *hwtstamps; + } else { + /* + * no hardware time stamps available, + * so keep the shared tx_flags and only + * store software time stamp + */ + skb->tstamp = ktime_get_real(); + } + + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + + err = sock_queue_err_skb(sk, skb); + + if (err) + kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(skb_tstamp_tx); + + +/** + * skb_partial_csum_set - set up and verify partial csum values for packet + * @skb: the skb to set + * @start: the number of bytes after skb->data to start checksumming. + * @off: the offset from start to place the checksum. + * + * For untrusted partially-checksummed packets, we need to make sure the values + * for skb->csum_start and skb->csum_offset are valid so we don't oops. + * + * This function checks and sets those values and skb->ip_summed: if this + * returns false you should drop the packet. 
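+ *
+ * Editorial sketch (hedged; csum_start and csum_off stand in for whatever
+ * metadata the device or guest supplied and are assumptions):
+ *
+ *	if (!skb_partial_csum_set(skb, csum_start, csum_off)) {
+ *		kfree_skb(skb);
+ *		return -EINVAL;
+ *	}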
+ */ +bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) +{ + if (unlikely(start > skb_headlen(skb)) || + unlikely((int)start + off > skb_headlen(skb) - 2)) { + if (net_ratelimit()) + printk(KERN_WARNING + "bad partial csum: csum=%u/%u len=%u\n", + start, off, skb_headlen(skb)); + return false; + } + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_headroom(skb) + start; + skb->csum_offset = off; + return true; +} +EXPORT_SYMBOL_GPL(skb_partial_csum_set); + +void __skb_warn_lro_forwarding(const struct sk_buff *skb) +{ + if (net_ratelimit()) + pr_warning("%s: received packets cannot be forwarded" + " while LRO is enabled\n", skb->dev->name); +} +EXPORT_SYMBOL(__skb_warn_lro_forwarding); diff --git a/net/core/sock.c b/net/core/sock.c new file mode 100644 index 00000000..b4bb59a9 --- /dev/null +++ b/net/core/sock.c @@ -0,0 +1,2579 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Generic socket support routines. Memory allocators, socket lock/release + * handler for protocols to use and generic option handler. + * + * + * Authors: Ross Biro + * Fred N. van Kempen, + * Florian La Roche, + * Alan Cox, + * + * Fixes: + * Alan Cox : Numerous verify_area() problems + * Alan Cox : Connecting on a connecting socket + * now returns an error for tcp. + * Alan Cox : sock->protocol is set correctly. + * and is not sometimes left as 0. + * Alan Cox : connect handles icmp errors on a + * connect properly. Unfortunately there + * is a restart syscall nasty there. I + * can't match BSD without hacking the C + * library. Ideas urgently sought! + * Alan Cox : Disallow bind() to addresses that are + * not ours - especially broadcast ones!! + * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) + * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, + * instead they leave that for the DESTROY timer. + * Alan Cox : Clean up error flag in accept + * Alan Cox : TCP ack handling is buggy, the DESTROY timer + * was buggy. Put a remove_sock() in the handler + * for memory when we hit 0. Also altered the timer + * code. The ACK stuff can wait and needs major + * TCP layer surgery. + * Alan Cox : Fixed TCP ack bug, removed remove sock + * and fixed timer/inet_bh race. + * Alan Cox : Added zapped flag for TCP + * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code + * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb + * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources + * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. + * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... + * Rick Sladkey : Relaxed UDP rules for matching packets. + * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support + * Pauline Middelink : identd support + * Alan Cox : Fixed connect() taking signals I think. + * Alan Cox : SO_LINGER supported + * Alan Cox : Error reporting fixes + * Anonymous : inet_create tidied up (sk->reuse setting) + * Alan Cox : inet sockets don't set sk->type! 
+ * Alan Cox : Split socket option code + * Alan Cox : Callbacks + * Alan Cox : Nagle flag for Charles & Johannes stuff + * Alex : Removed restriction on inet fioctl + * Alan Cox : Splitting INET from NET core + * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() + * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code + * Alan Cox : Split IP from generic code + * Alan Cox : New kfree_skbmem() + * Alan Cox : Make SO_DEBUG superuser only. + * Alan Cox : Allow anyone to clear SO_DEBUG + * (compatibility fix) + * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. + * Alan Cox : Allocator for a socket is settable. + * Alan Cox : SO_ERROR includes soft errors. + * Alan Cox : Allow NULL arguments on some SO_ opts + * Alan Cox : Generic socket allocation to make hooks + * easier (suggested by Craig Metz). + * Michael Pall : SO_ERROR returns positive errno again + * Steve Whitehouse: Added default destructor to free + * protocol private data. + * Steve Whitehouse: Added various other default routines + * common to several socket families. + * Chris Evans : Call suser() check last on F_SETOWN + * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. + * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() + * Andi Kleen : Fix write_space callback + * Chris Evans : Security fixes - signedness again + * Arnaldo C. Melo : cleanups, use skb_queue_purge + * + * To Fix: + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_INET +#include +#endif + +/* + * Each address family might have different locking rules, so we have + * one slock key per address family: + */ +static struct lock_class_key af_family_keys[AF_MAX]; +static struct lock_class_key af_family_slock_keys[AF_MAX]; + +/* + * Make lock validator output more readable. 
(we pre-construct these + * strings build-time, so that runtime initialization of socket + * locks is fast): + */ +static const char *const af_family_key_strings[AF_MAX+1] = { + "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" , + "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK", + "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" , + "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" , + "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" , + "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" , + "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" , + "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" , + "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" , + "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" , + "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , + "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , + "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" , + "sk_lock-AF_MAX" +}; +static const char *const af_family_slock_key_strings[AF_MAX+1] = { + "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , + "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK", + "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" , + "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" , + "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" , + "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" , + "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" , + "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" , + "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" , + "slock-27" , "slock-28" , "slock-AF_CAN" , + "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , + "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , + "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" , + "slock-AF_MAX" +}; +static const char *const af_family_clock_key_strings[AF_MAX+1] = { + "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , + "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK", + "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" , + "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" , + "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" , + "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" , + "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" , + "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" , + "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" , + "clock-27" , "clock-28" , "clock-AF_CAN" , + "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , + "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , + "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" , + "clock-AF_MAX" +}; + +/* + * sk_callback_lock locking rules are per-address-family, + * so split the lock classes by using a per-AF key: + */ +static struct lock_class_key af_callback_keys[AF_MAX]; + +/* Take into consideration the size of the struct sk_buff overhead in the + * determination of these values, since that is non-constant across + * platforms. This makes socket queueing behavior and performance + * not depend upon such differences. + */ +#define _SK_MEM_PACKETS 256 +#define _SK_MEM_OVERHEAD (sizeof(struct sk_buff) + 256) +#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) +#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) + +/* Run time adjustable parameters. 
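+ *
+ * Editorial note (illustrative arithmetic, not from the original source):
+ * with the macros above, on a build where sizeof(struct sk_buff) happens to
+ * be 232 bytes,
+ *
+ *	_SK_MEM_OVERHEAD = 232 + 256 = 488
+ *	SK_WMEM_MAX = SK_RMEM_MAX = 488 * 256 = 124928 bytes (about 122 KiB)
+ *
+ * The exact defaults therefore vary by architecture and config, which is
+ * why they are derived from sizeof() at build time rather than hard-coded.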
*/ +__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; +__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; +__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; +__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; + +/* Maximal space eaten by iovec or ancillary data plus some space */ +int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); +EXPORT_SYMBOL(sysctl_optmem_max); + +#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP) +int net_cls_subsys_id = -1; +EXPORT_SYMBOL_GPL(net_cls_subsys_id); +#endif + +static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) +{ + struct timeval tv; + + if (optlen < sizeof(tv)) + return -EINVAL; + if (copy_from_user(&tv, optval, sizeof(tv))) + return -EFAULT; + if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) + return -EDOM; + + if (tv.tv_sec < 0) { + static int warned __read_mostly; + + *timeo_p = 0; + if (warned < 10 && net_ratelimit()) { + warned++; + printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) " + "tries to set negative timeout\n", + current->comm, task_pid_nr(current)); + } + return 0; + } + *timeo_p = MAX_SCHEDULE_TIMEOUT; + if (tv.tv_sec == 0 && tv.tv_usec == 0) + return 0; + if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1)) + *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ); + return 0; +} + +static void sock_warn_obsolete_bsdism(const char *name) +{ + static int warned; + static char warncomm[TASK_COMM_LEN]; + if (strcmp(warncomm, current->comm) && warned < 5) { + strcpy(warncomm, current->comm); + printk(KERN_WARNING "process `%s' is using obsolete " + "%s SO_BSDCOMPAT\n", warncomm, name); + warned++; + } +} + +static void sock_disable_timestamp(struct sock *sk, int flag) +{ + if (sock_flag(sk, flag)) { + sock_reset_flag(sk, flag); + if (!sock_flag(sk, SOCK_TIMESTAMP) && + !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) { + net_disable_timestamp(); + } + } +} + + +int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + int err; + int skb_len; + unsigned long flags; + struct sk_buff_head *list = &sk->sk_receive_queue; + + /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces + number of warnings when compiling with -W --ANK + */ + if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= + (unsigned)sk->sk_rcvbuf) { + atomic_inc(&sk->sk_drops); + return -ENOMEM; + } + + err = sk_filter(sk, skb); + if (err) + return err; + + if (!sk_rmem_schedule(sk, skb->truesize)) { + atomic_inc(&sk->sk_drops); + return -ENOBUFS; + } + + skb->dev = NULL; + skb_set_owner_r(skb, sk); + + /* Cache the SKB length before we tack it onto the receive + * queue. Once it is added it no longer belongs to us and + * may be freed by other threads of control pulling packets + * from the queue. 
+ */ + skb_len = skb->len; + + /* we escape from rcu protected region, make sure we dont leak + * a norefcounted dst + */ + skb_dst_force(skb); + + spin_lock_irqsave(&list->lock, flags); + skb->dropcount = atomic_read(&sk->sk_drops); + __skb_queue_tail(list, skb); + spin_unlock_irqrestore(&list->lock, flags); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, skb_len); + return 0; +} +EXPORT_SYMBOL(sock_queue_rcv_skb); + +int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) +{ + int rc = NET_RX_SUCCESS; + + if (sk_filter(sk, skb)) + goto discard_and_relse; + + skb->dev = NULL; + + if (sk_rcvqueues_full(sk, skb)) { + atomic_inc(&sk->sk_drops); + goto discard_and_relse; + } + if (nested) + bh_lock_sock_nested(sk); + else + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { + /* + * trylock + unlock semantics: + */ + mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); + + rc = sk_backlog_rcv(sk, skb); + + mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); + } else if (sk_add_backlog(sk, skb)) { + bh_unlock_sock(sk); + atomic_inc(&sk->sk_drops); + goto discard_and_relse; + } + + bh_unlock_sock(sk); +out: + sock_put(sk); + return rc; +discard_and_relse: + kfree_skb(skb); + goto out; +} +EXPORT_SYMBOL(sk_receive_skb); + +void sk_reset_txq(struct sock *sk) +{ + sk_tx_queue_clear(sk); +} +EXPORT_SYMBOL(sk_reset_txq); + +struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) +{ + struct dst_entry *dst = __sk_dst_get(sk); + + if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + sk_tx_queue_clear(sk); + rcu_assign_pointer(sk->sk_dst_cache, NULL); + dst_release(dst); + return NULL; + } + + return dst; +} +EXPORT_SYMBOL(__sk_dst_check); + +struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) +{ + struct dst_entry *dst = sk_dst_get(sk); + + if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + sk_dst_reset(sk); + dst_release(dst); + return NULL; + } + + return dst; +} +EXPORT_SYMBOL(sk_dst_check); + +static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen) +{ + int ret = -ENOPROTOOPT; +#ifdef CONFIG_NETDEVICES + struct net *net = sock_net(sk); + char devname[IFNAMSIZ]; + int index; + + /* Sorry... */ + ret = -EPERM; + if (!capable(CAP_NET_RAW)) + goto out; + + ret = -EINVAL; + if (optlen < 0) + goto out; + + /* Bind this socket to a particular device like "eth0", + * as specified in the passed interface name. If the + * name is "" or the option length is zero the socket + * is not bound. + */ + if (optlen > IFNAMSIZ - 1) + optlen = IFNAMSIZ - 1; + memset(devname, 0, sizeof(devname)); + + ret = -EFAULT; + if (copy_from_user(devname, optval, optlen)) + goto out; + + index = 0; + if (devname[0] != '\0') { + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, devname); + if (dev) + index = dev->ifindex; + rcu_read_unlock(); + ret = -ENODEV; + if (!dev) + goto out; + } + + lock_sock(sk); + sk->sk_bound_dev_if = index; + sk_dst_reset(sk); + release_sock(sk); + + ret = 0; + +out: +#endif + + return ret; +} + +static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) +{ + if (valbool) + sock_set_flag(sk, bit); + else + sock_reset_flag(sk, bit); +} + +/* + * This is meant for all protocols to use and covers goings on + * at the socket level. Everything here is generic. 
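+ *
+ * For reference, a minimal user space sketch that ends up in the code
+ * below (plain POSIX socket API from <sys/socket.h> and <unistd.h>,
+ * error handling omitted; any socket type works, since SOL_SOCKET
+ * options are family independent):
+ *
+ *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
+ *	int one = 1, val = 0;
+ *	socklen_t len = sizeof(val);
+ *
+ *	setsockopt(fd, SOL_SOCKET, SO_BROADCAST, &one, sizeof(one));
+ *	getsockopt(fd, SOL_SOCKET, SO_BROADCAST, &val, &len);
+ *	close(fd);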
+ */ + +int sock_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + int val; + int valbool; + struct linger ling; + int ret = 0; + + /* + * Options without arguments + */ + + if (optname == SO_BINDTODEVICE) + return sock_bindtodevice(sk, optval, optlen); + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + valbool = val ? 1 : 0; + + lock_sock(sk); + + switch (optname) { + case SO_DEBUG: + if (val && !capable(CAP_NET_ADMIN)) + ret = -EACCES; + else + sock_valbool_flag(sk, SOCK_DBG, valbool); + break; + case SO_REUSEADDR: + sk->sk_reuse = valbool; + break; + case SO_TYPE: + case SO_PROTOCOL: + case SO_DOMAIN: + case SO_ERROR: + ret = -ENOPROTOOPT; + break; + case SO_DONTROUTE: + sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); + break; + case SO_BROADCAST: + sock_valbool_flag(sk, SOCK_BROADCAST, valbool); + break; + case SO_SNDBUF: + /* Don't error on this BSD doesn't and if you think + about it this is right. Otherwise apps have to + play 'guess the biggest size' games. RCVBUF/SNDBUF + are treated in BSD as hints */ + + if (val > sysctl_wmem_max) + val = sysctl_wmem_max; +set_sndbuf: + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + if ((val * 2) < SOCK_MIN_SNDBUF) + sk->sk_sndbuf = SOCK_MIN_SNDBUF; + else + sk->sk_sndbuf = val * 2; + + /* + * Wake up sending tasks if we + * upped the value. + */ + sk->sk_write_space(sk); + break; + + case SO_SNDBUFFORCE: + if (!capable(CAP_NET_ADMIN)) { + ret = -EPERM; + break; + } + goto set_sndbuf; + + case SO_RCVBUF: + /* Don't error on this BSD doesn't and if you think + about it this is right. Otherwise apps have to + play 'guess the biggest size' games. RCVBUF/SNDBUF + are treated in BSD as hints */ + + if (val > sysctl_rmem_max) + val = sysctl_rmem_max; +set_rcvbuf: + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + /* + * We double it on the way in to account for + * "struct sk_buff" etc. overhead. Applications + * assume that the SO_RCVBUF setting they make will + * allow that much actual data to be received on that + * socket. + * + * Applications are unaware that "struct sk_buff" and + * other overheads allocate from the receive buffer + * during socket buffer allocation. + * + * And after considering the possible alternatives, + * returning the value we actually used in getsockopt + * is the most desirable behavior. 
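+ *
+ * Visible effect, as a user space sketch (fd is assumed to be any open
+ * socket, and the requested size is assumed to be within the
+ * rmem_max sysctl):
+ *
+ *	int req = 65536, got = 0;
+ *	socklen_t len = sizeof(got);
+ *
+ *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
+ *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
+ *
+ * got then reads back as 131072, twice the 65536 that was requested.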
+ */ + if ((val * 2) < SOCK_MIN_RCVBUF) + sk->sk_rcvbuf = SOCK_MIN_RCVBUF; + else + sk->sk_rcvbuf = val * 2; + break; + + case SO_RCVBUFFORCE: + if (!capable(CAP_NET_ADMIN)) { + ret = -EPERM; + break; + } + goto set_rcvbuf; + + case SO_KEEPALIVE: +#ifdef CONFIG_INET + if (sk->sk_protocol == IPPROTO_TCP) + tcp_set_keepalive(sk, valbool); +#endif + sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); + break; + + case SO_OOBINLINE: + sock_valbool_flag(sk, SOCK_URGINLINE, valbool); + break; + + case SO_NO_CHECK: + sk->sk_no_check = valbool; + break; + + case SO_PRIORITY: + if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN)) + sk->sk_priority = val; + else + ret = -EPERM; + break; + + case SO_LINGER: + if (optlen < sizeof(ling)) { + ret = -EINVAL; /* 1003.1g */ + break; + } + if (copy_from_user(&ling, optval, sizeof(ling))) { + ret = -EFAULT; + break; + } + if (!ling.l_onoff) + sock_reset_flag(sk, SOCK_LINGER); + else { +#if (BITS_PER_LONG == 32) + if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) + sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; + else +#endif + sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; + sock_set_flag(sk, SOCK_LINGER); + } + break; + + case SO_BSDCOMPAT: + sock_warn_obsolete_bsdism("setsockopt"); + break; + + case SO_PASSCRED: + if (valbool) + set_bit(SOCK_PASSCRED, &sock->flags); + else + clear_bit(SOCK_PASSCRED, &sock->flags); + break; + + case SO_TIMESTAMP: + case SO_TIMESTAMPNS: + if (valbool) { + if (optname == SO_TIMESTAMP) + sock_reset_flag(sk, SOCK_RCVTSTAMPNS); + else + sock_set_flag(sk, SOCK_RCVTSTAMPNS); + sock_set_flag(sk, SOCK_RCVTSTAMP); + sock_enable_timestamp(sk, SOCK_TIMESTAMP); + } else { + sock_reset_flag(sk, SOCK_RCVTSTAMP); + sock_reset_flag(sk, SOCK_RCVTSTAMPNS); + } + break; + + case SO_TIMESTAMPING: + if (val & ~SOF_TIMESTAMPING_MASK) { + ret = -EINVAL; + break; + } + sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE, + val & SOF_TIMESTAMPING_TX_HARDWARE); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE, + val & SOF_TIMESTAMPING_TX_SOFTWARE); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE, + val & SOF_TIMESTAMPING_RX_HARDWARE); + if (val & SOF_TIMESTAMPING_RX_SOFTWARE) + sock_enable_timestamp(sk, + SOCK_TIMESTAMPING_RX_SOFTWARE); + else + sock_disable_timestamp(sk, + SOCK_TIMESTAMPING_RX_SOFTWARE); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE, + val & SOF_TIMESTAMPING_SOFTWARE); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE, + val & SOF_TIMESTAMPING_SYS_HARDWARE); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE, + val & SOF_TIMESTAMPING_RAW_HARDWARE); + break; + + case SO_RCVLOWAT: + if (val < 0) + val = INT_MAX; + sk->sk_rcvlowat = val ? 
: 1; + break; + + case SO_RCVTIMEO: + ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen); + break; + + case SO_SNDTIMEO: + ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen); + break; + + case SO_ATTACH_FILTER: + ret = -EINVAL; + if (optlen == sizeof(struct sock_fprog)) { + struct sock_fprog fprog; + + ret = -EFAULT; + if (copy_from_user(&fprog, optval, sizeof(fprog))) + break; + + ret = sk_attach_filter(&fprog, sk); + } + break; + + case SO_DETACH_FILTER: + ret = sk_detach_filter(sk); + break; + + case SO_PASSSEC: + if (valbool) + set_bit(SOCK_PASSSEC, &sock->flags); + else + clear_bit(SOCK_PASSSEC, &sock->flags); + break; + case SO_MARK: + if (!capable(CAP_NET_ADMIN)) + ret = -EPERM; + else + sk->sk_mark = val; + break; + + /* We implement the SO_SNDLOWAT etc to + not be settable (1003.1g 5.3) */ + case SO_RXQ_OVFL: + if (valbool) + sock_set_flag(sk, SOCK_RXQ_OVFL); + else + sock_reset_flag(sk, SOCK_RXQ_OVFL); + break; + default: + ret = -ENOPROTOOPT; + break; + } + release_sock(sk); + return ret; +} +EXPORT_SYMBOL(sock_setsockopt); + + +void cred_to_ucred(struct pid *pid, const struct cred *cred, + struct ucred *ucred) +{ + ucred->pid = pid_vnr(pid); + ucred->uid = ucred->gid = -1; + if (cred) { + struct user_namespace *current_ns = current_user_ns(); + + ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid); + ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid); + } +} +EXPORT_SYMBOL_GPL(cred_to_ucred); + +int sock_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + + union { + int val; + struct linger ling; + struct timeval tm; + } v; + + int lv = sizeof(int); + int len; + + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + + memset(&v, 0, sizeof(v)); + + switch (optname) { + case SO_DEBUG: + v.val = sock_flag(sk, SOCK_DBG); + break; + + case SO_DONTROUTE: + v.val = sock_flag(sk, SOCK_LOCALROUTE); + break; + + case SO_BROADCAST: + v.val = !!sock_flag(sk, SOCK_BROADCAST); + break; + + case SO_SNDBUF: + v.val = sk->sk_sndbuf; + break; + + case SO_RCVBUF: + v.val = sk->sk_rcvbuf; + break; + + case SO_REUSEADDR: + v.val = sk->sk_reuse; + break; + + case SO_KEEPALIVE: + v.val = !!sock_flag(sk, SOCK_KEEPOPEN); + break; + + case SO_TYPE: + v.val = sk->sk_type; + break; + + case SO_PROTOCOL: + v.val = sk->sk_protocol; + break; + + case SO_DOMAIN: + v.val = sk->sk_family; + break; + + case SO_ERROR: + v.val = -sock_error(sk); + if (v.val == 0) + v.val = xchg(&sk->sk_err_soft, 0); + break; + + case SO_OOBINLINE: + v.val = !!sock_flag(sk, SOCK_URGINLINE); + break; + + case SO_NO_CHECK: + v.val = sk->sk_no_check; + break; + + case SO_PRIORITY: + v.val = sk->sk_priority; + break; + + case SO_LINGER: + lv = sizeof(v.ling); + v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER); + v.ling.l_linger = sk->sk_lingertime / HZ; + break; + + case SO_BSDCOMPAT: + sock_warn_obsolete_bsdism("getsockopt"); + break; + + case SO_TIMESTAMP: + v.val = sock_flag(sk, SOCK_RCVTSTAMP) && + !sock_flag(sk, SOCK_RCVTSTAMPNS); + break; + + case SO_TIMESTAMPNS: + v.val = sock_flag(sk, SOCK_RCVTSTAMPNS); + break; + + case SO_TIMESTAMPING: + v.val = 0; + if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE)) + v.val |= SOF_TIMESTAMPING_TX_HARDWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE)) + v.val |= SOF_TIMESTAMPING_TX_SOFTWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE)) + v.val |= SOF_TIMESTAMPING_RX_HARDWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) + v.val |= 
SOF_TIMESTAMPING_RX_SOFTWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) + v.val |= SOF_TIMESTAMPING_SOFTWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE)) + v.val |= SOF_TIMESTAMPING_SYS_HARDWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE)) + v.val |= SOF_TIMESTAMPING_RAW_HARDWARE; + break; + + case SO_RCVTIMEO: + lv = sizeof(struct timeval); + if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) { + v.tm.tv_sec = 0; + v.tm.tv_usec = 0; + } else { + v.tm.tv_sec = sk->sk_rcvtimeo / HZ; + v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ; + } + break; + + case SO_SNDTIMEO: + lv = sizeof(struct timeval); + if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) { + v.tm.tv_sec = 0; + v.tm.tv_usec = 0; + } else { + v.tm.tv_sec = sk->sk_sndtimeo / HZ; + v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ; + } + break; + + case SO_RCVLOWAT: + v.val = sk->sk_rcvlowat; + break; + + case SO_SNDLOWAT: + v.val = 1; + break; + + case SO_PASSCRED: + v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0; + break; + + case SO_PEERCRED: + { + struct ucred peercred; + if (len > sizeof(peercred)) + len = sizeof(peercred); + cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); + if (copy_to_user(optval, &peercred, len)) + return -EFAULT; + goto lenout; + } + + case SO_PEERNAME: + { + char address[128]; + + if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2)) + return -ENOTCONN; + if (lv < len) + return -EINVAL; + if (copy_to_user(optval, address, len)) + return -EFAULT; + goto lenout; + } + + /* Dubious BSD thing... Probably nobody even uses it, but + * the UNIX standard wants it for whatever reason... -DaveM + */ + case SO_ACCEPTCONN: + v.val = sk->sk_state == TCP_LISTEN; + break; + + case SO_PASSSEC: + v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0; + break; + + case SO_PEERSEC: + return security_socket_getpeersec_stream(sock, optval, optlen, len); + + case SO_MARK: + v.val = sk->sk_mark; + break; + + case SO_RXQ_OVFL: + v.val = !!sock_flag(sk, SOCK_RXQ_OVFL); + break; + + default: + return -ENOPROTOOPT; + } + + if (len > lv) + len = lv; + if (copy_to_user(optval, &v, len)) + return -EFAULT; +lenout: + if (put_user(len, optlen)) + return -EFAULT; + return 0; +} + +/* + * Initialize an sk_lock. + * + * (We also register the sk_lock with the lock validator.) + */ +static inline void sock_lock_init(struct sock *sk) +{ + sock_lock_init_class_and_name(sk, + af_family_slock_key_strings[sk->sk_family], + af_family_slock_keys + sk->sk_family, + af_family_key_strings[sk->sk_family], + af_family_keys + sk->sk_family); +} + +/* + * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, + * even temporarly, because of RCU lookups. sk_node should also be left as is. + * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end + */ +static void sock_copy(struct sock *nsk, const struct sock *osk) +{ +#ifdef CONFIG_SECURITY_NETWORK + void *sptr = nsk->sk_security; +#endif + memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); + + memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, + osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); + +#ifdef CONFIG_SECURITY_NETWORK + nsk->sk_security = sptr; + security_sk_clone(osk, nsk); +#endif +} + +/* + * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes + * un-modified. Special care is taken when initializing object to zero. 
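+ *
+ * Concretely, sk_prot_clear_nulls() below zeroes the object in two
+ * chunks, [0, offsetof(sk_node.next)) and [offsetof(sk_node.pprev),
+ * size), so the RCU-visible ->next pointer itself is never written.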
+ */ +static inline void sk_prot_clear_nulls(struct sock *sk, int size) +{ + if (offsetof(struct sock, sk_node.next) != 0) + memset(sk, 0, offsetof(struct sock, sk_node.next)); + memset(&sk->sk_node.pprev, 0, + size - offsetof(struct sock, sk_node.pprev)); +} + +void sk_prot_clear_portaddr_nulls(struct sock *sk, int size) +{ + unsigned long nulls1, nulls2; + + nulls1 = offsetof(struct sock, __sk_common.skc_node.next); + nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next); + if (nulls1 > nulls2) + swap(nulls1, nulls2); + + if (nulls1 != 0) + memset((char *)sk, 0, nulls1); + memset((char *)sk + nulls1 + sizeof(void *), 0, + nulls2 - nulls1 - sizeof(void *)); + memset((char *)sk + nulls2 + sizeof(void *), 0, + size - nulls2 - sizeof(void *)); +} +EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls); + +static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, + int family) +{ + struct sock *sk; + struct kmem_cache *slab; + + slab = prot->slab; + if (slab != NULL) { + sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); + if (!sk) + return sk; + if (priority & __GFP_ZERO) { + if (prot->clear_sk) + prot->clear_sk(sk, prot->obj_size); + else + sk_prot_clear_nulls(sk, prot->obj_size); + } + } else + sk = kmalloc(prot->obj_size, priority); + + if (sk != NULL) { + kmemcheck_annotate_bitfield(sk, flags); + + if (security_sk_alloc(sk, family, priority)) + goto out_free; + + if (!try_module_get(prot->owner)) + goto out_free_sec; + sk_tx_queue_clear(sk); + } + + return sk; + +out_free_sec: + security_sk_free(sk); +out_free: + if (slab != NULL) + kmem_cache_free(slab, sk); + else + kfree(sk); + return NULL; +} + +static void sk_prot_free(struct proto *prot, struct sock *sk) +{ + struct kmem_cache *slab; + struct module *owner; + + owner = prot->owner; + slab = prot->slab; + + security_sk_free(sk); + if (slab != NULL) + kmem_cache_free(slab, sk); + else + kfree(sk); + module_put(owner); +} + +#ifdef CONFIG_CGROUPS +void sock_update_classid(struct sock *sk) +{ + u32 classid; + + rcu_read_lock(); /* doing current task, which cannot vanish. 
*/ + classid = task_cls_classid(current); + rcu_read_unlock(); + if (classid && classid != sk->sk_classid) + sk->sk_classid = classid; +} +EXPORT_SYMBOL(sock_update_classid); +#endif + +/** + * sk_alloc - All socket objects are allocated here + * @net: the applicable net namespace + * @family: protocol family + * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * @prot: struct proto associated with this new sock instance + */ +struct sock *sk_alloc(struct net *net, int family, gfp_t priority, + struct proto *prot) +{ + struct sock *sk; + + sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); + if (sk) { + sk->sk_family = family; + /* + * See comment in struct sock definition to understand + * why we need sk_prot_creator -acme + */ + sk->sk_prot = sk->sk_prot_creator = prot; + sock_lock_init(sk); + sock_net_set(sk, get_net(net)); + atomic_set(&sk->sk_wmem_alloc, 1); + + sock_update_classid(sk); + } + + return sk; +} +EXPORT_SYMBOL(sk_alloc); + +static void __sk_free(struct sock *sk) +{ + struct sk_filter *filter; + + if (sk->sk_destruct) + sk->sk_destruct(sk); + + filter = rcu_dereference_check(sk->sk_filter, + atomic_read(&sk->sk_wmem_alloc) == 0); + if (filter) { + sk_filter_uncharge(sk, filter); + rcu_assign_pointer(sk->sk_filter, NULL); + } + + sock_disable_timestamp(sk, SOCK_TIMESTAMP); + sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); + + if (atomic_read(&sk->sk_omem_alloc)) + printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n", + __func__, atomic_read(&sk->sk_omem_alloc)); + + if (sk->sk_peer_cred) + put_cred(sk->sk_peer_cred); + put_pid(sk->sk_peer_pid); + put_net(sock_net(sk)); + sk_prot_free(sk->sk_prot_creator, sk); +} + +void sk_free(struct sock *sk) +{ + /* + * We subtract one from sk_wmem_alloc and can know if + * some packets are still in some tx queue. + * If not null, sock_wfree() will call __sk_free(sk) later + */ + if (atomic_dec_and_test(&sk->sk_wmem_alloc)) + __sk_free(sk); +} +EXPORT_SYMBOL(sk_free); + +/* + * Last sock_put should drop reference to sk->sk_net. It has already + * been dropped in sk_change_net. Taking reference to stopping namespace + * is not an option. + * Take reference to a socket to remove it from hash _alive_ and after that + * destroy it in the context of init_net. 
+ */ +void sk_release_kernel(struct sock *sk) +{ + if (sk == NULL || sk->sk_socket == NULL) + return; + + sock_hold(sk); + sock_release(sk->sk_socket); + release_net(sock_net(sk)); + sock_net_set(sk, get_net(&init_net)); + sock_put(sk); +} +EXPORT_SYMBOL(sk_release_kernel); + +struct sock *sk_clone(const struct sock *sk, const gfp_t priority) +{ + struct sock *newsk; + + newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family); + if (newsk != NULL) { + struct sk_filter *filter; + + sock_copy(newsk, sk); + + /* SANITY */ + get_net(sock_net(newsk)); + sk_node_init(&newsk->sk_node); + sock_lock_init(newsk); + bh_lock_sock(newsk); + newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; + newsk->sk_backlog.len = 0; + + atomic_set(&newsk->sk_rmem_alloc, 0); + /* + * sk_wmem_alloc set to one (see sk_free() and sock_wfree()) + */ + atomic_set(&newsk->sk_wmem_alloc, 1); + atomic_set(&newsk->sk_omem_alloc, 0); + skb_queue_head_init(&newsk->sk_receive_queue); + skb_queue_head_init(&newsk->sk_write_queue); +#ifdef CONFIG_NET_DMA + skb_queue_head_init(&newsk->sk_async_wait_queue); +#endif + + spin_lock_init(&newsk->sk_dst_lock); + rwlock_init(&newsk->sk_callback_lock); + lockdep_set_class_and_name(&newsk->sk_callback_lock, + af_callback_keys + newsk->sk_family, + af_family_clock_key_strings[newsk->sk_family]); + + newsk->sk_dst_cache = NULL; + newsk->sk_wmem_queued = 0; + newsk->sk_forward_alloc = 0; + newsk->sk_send_head = NULL; + newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; + + sock_reset_flag(newsk, SOCK_DONE); + skb_queue_head_init(&newsk->sk_error_queue); + + filter = rcu_dereference_protected(newsk->sk_filter, 1); + if (filter != NULL) + sk_filter_charge(newsk, filter); + + if (unlikely(xfrm_sk_clone_policy(newsk))) { + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + newsk->sk_destruct = NULL; + bh_unlock_sock(newsk); + sk_free(newsk); + newsk = NULL; + goto out; + } + + newsk->sk_err = 0; + newsk->sk_priority = 0; + /* + * Before updating sk_refcnt, we must commit prior changes to memory + * (Documentation/RCU/rculist_nulls.txt for details) + */ + smp_wmb(); + atomic_set(&newsk->sk_refcnt, 2); + + /* + * Increment the counter in the same struct proto as the master + * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that + * is the same as sk->sk_prot->socks, as this field was copied + * with memcpy). + * + * This _changes_ the previous behaviour, where + * tcp_create_openreq_child always was incrementing the + * equivalent to tcp_prot->socks (inet_sock_nr), so this have + * to be taken into account in all callers. 
-acme + */ + sk_refcnt_debug_inc(newsk); + sk_set_socket(newsk, NULL); + newsk->sk_wq = NULL; + + if (newsk->sk_prot->sockets_allocated) + percpu_counter_inc(newsk->sk_prot->sockets_allocated); + + if (sock_flag(newsk, SOCK_TIMESTAMP) || + sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE)) + net_enable_timestamp(); + } +out: + return newsk; +} +EXPORT_SYMBOL_GPL(sk_clone); + +void sk_setup_caps(struct sock *sk, struct dst_entry *dst) +{ + __sk_dst_set(sk, dst); + sk->sk_route_caps = dst->dev->features; + if (sk->sk_route_caps & NETIF_F_GSO) + sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; + sk->sk_route_caps &= ~sk->sk_route_nocaps; + if (sk_can_gso(sk)) { + if (dst->header_len) { + sk->sk_route_caps &= ~NETIF_F_GSO_MASK; + } else { + sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; + sk->sk_gso_max_size = dst->dev->gso_max_size; + } + } +} +EXPORT_SYMBOL_GPL(sk_setup_caps); + +void __init sk_init(void) +{ + if (totalram_pages <= 4096) { + sysctl_wmem_max = 32767; + sysctl_rmem_max = 32767; + sysctl_wmem_default = 32767; + sysctl_rmem_default = 32767; + } else if (totalram_pages >= 131072) { + sysctl_wmem_max = 131071; + sysctl_rmem_max = 131071; + } +} + +/* + * Simple resource managers for sockets. + */ + + +/* + * Write buffer destructor automatically called from kfree_skb. + */ +void sock_wfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + unsigned int len = skb->truesize; + + if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { + /* + * Keep a reference on sk_wmem_alloc, this will be released + * after sk_write_space() call + */ + atomic_sub(len - 1, &sk->sk_wmem_alloc); + sk->sk_write_space(sk); + len = 1; + } + /* + * if sk_wmem_alloc reaches 0, we must finish what sk_free() + * could not do because of in-flight packets + */ + if (atomic_sub_and_test(len, &sk->sk_wmem_alloc)) + __sk_free(sk); +} +EXPORT_SYMBOL(sock_wfree); + +/* + * Read buffer destructor automatically called from kfree_skb. + */ +void sock_rfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + unsigned int len = skb->truesize; + + atomic_sub(len, &sk->sk_rmem_alloc); + sk_mem_uncharge(sk, len); +} +EXPORT_SYMBOL(sock_rfree); + + +int sock_i_uid(struct sock *sk) +{ + int uid; + + read_lock_bh(&sk->sk_callback_lock); + uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0; + read_unlock_bh(&sk->sk_callback_lock); + return uid; +} +EXPORT_SYMBOL(sock_i_uid); + +unsigned long sock_i_ino(struct sock *sk) +{ + unsigned long ino; + + read_lock_bh(&sk->sk_callback_lock); + ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; + read_unlock_bh(&sk->sk_callback_lock); + return ino; +} +EXPORT_SYMBOL(sock_i_ino); + +/* + * Allocate a skb from the socket's send buffer. + */ +struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, + gfp_t priority) +{ + if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { + struct sk_buff *skb = alloc_skb(size, priority); + if (skb) { + skb_set_owner_w(skb, sk); + return skb; + } + } + return NULL; +} +EXPORT_SYMBOL(sock_wmalloc); + +/* + * Allocate a skb from the socket's receive buffer. + */ +struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, + gfp_t priority) +{ + if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { + struct sk_buff *skb = alloc_skb(size, priority); + if (skb) { + skb_set_owner_r(skb, sk); + return skb; + } + } + return NULL; +} + +/* + * Allocate a memory block from the socket's option memory buffer. 
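+ *
+ * Typical in-kernel usage pattern (illustrative sketch only):
+ *
+ *	void *buf = sock_kmalloc(sk, size, GFP_KERNEL);
+ *	if (!buf)
+ *		return -ENOBUFS;
+ *	...
+ *	sock_kfree_s(sk, buf, size);
+ *
+ * The allocation is charged to sk->sk_omem_alloc and bounded by the
+ * optmem_max sysctl; the caller passes the same size to sock_kfree_s()
+ * so that the charge can be undone.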
+ */ +void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) +{ + if ((unsigned)size <= sysctl_optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { + void *mem; + /* First do the add, to avoid the race if kmalloc + * might sleep. + */ + atomic_add(size, &sk->sk_omem_alloc); + mem = kmalloc(size, priority); + if (mem) + return mem; + atomic_sub(size, &sk->sk_omem_alloc); + } + return NULL; +} +EXPORT_SYMBOL(sock_kmalloc); + +/* + * Free an option memory block. + */ +void sock_kfree_s(struct sock *sk, void *mem, int size) +{ + kfree(mem); + atomic_sub(size, &sk->sk_omem_alloc); +} +EXPORT_SYMBOL(sock_kfree_s); + +/* It is almost wait_for_tcp_memory minus release_sock/lock_sock. + I think, these locks should be removed for datagram sockets. + */ +static long sock_wait_for_wmem(struct sock *sk, long timeo) +{ + DEFINE_WAIT(wait); + + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + for (;;) { + if (!timeo) + break; + if (signal_pending(current)) + break; + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) + break; + if (sk->sk_shutdown & SEND_SHUTDOWN) + break; + if (sk->sk_err) + break; + timeo = schedule_timeout(timeo); + } + finish_wait(sk_sleep(sk), &wait); + return timeo; +} + + +/* + * Generic send/receive buffer handlers + */ + +struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, + unsigned long data_len, int noblock, + int *errcode) +{ + struct sk_buff *skb; + gfp_t gfp_mask; + long timeo; + int err; + int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; + + err = -EMSGSIZE; + if (npages > MAX_SKB_FRAGS) + goto failure; + + gfp_mask = sk->sk_allocation; + if (gfp_mask & __GFP_WAIT) + gfp_mask |= __GFP_REPEAT; + + timeo = sock_sndtimeo(sk, noblock); + while (1) { + err = sock_error(sk); + if (err != 0) + goto failure; + + err = -EPIPE; + if (sk->sk_shutdown & SEND_SHUTDOWN) + goto failure; + + if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { + skb = alloc_skb(header_len, gfp_mask); + if (skb) { + int i; + + /* No pages, we're done... */ + if (!data_len) + break; + + skb->truesize += data_len; + skb_shinfo(skb)->nr_frags = npages; + for (i = 0; i < npages; i++) { + struct page *page; + skb_frag_t *frag; + + page = alloc_pages(sk->sk_allocation, 0); + if (!page) { + err = -ENOBUFS; + skb_shinfo(skb)->nr_frags = i; + kfree_skb(skb); + goto failure; + } + + frag = &skb_shinfo(skb)->frags[i]; + frag->page = page; + frag->page_offset = 0; + frag->size = (data_len >= PAGE_SIZE ? + PAGE_SIZE : + data_len); + data_len -= PAGE_SIZE; + } + + /* Full success... 
*/ + break; + } + err = -ENOBUFS; + goto failure; + } + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; + if (!timeo) + goto failure; + if (signal_pending(current)) + goto interrupted; + timeo = sock_wait_for_wmem(sk, timeo); + } + + skb_set_owner_w(skb, sk); + return skb; + +interrupted: + err = sock_intr_errno(timeo); +failure: + *errcode = err; + return NULL; +} +EXPORT_SYMBOL(sock_alloc_send_pskb); + +struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, + int noblock, int *errcode) +{ + return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); +} +EXPORT_SYMBOL(sock_alloc_send_skb); + +static void __lock_sock(struct sock *sk) + __releases(&sk->sk_lock.slock) + __acquires(&sk->sk_lock.slock) +{ + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock_bh(&sk->sk_lock.slock); + schedule(); + spin_lock_bh(&sk->sk_lock.slock); + if (!sock_owned_by_user(sk)) + break; + } + finish_wait(&sk->sk_lock.wq, &wait); +} + +static void __release_sock(struct sock *sk) + __releases(&sk->sk_lock.slock) + __acquires(&sk->sk_lock.slock) +{ + struct sk_buff *skb = sk->sk_backlog.head; + + do { + sk->sk_backlog.head = sk->sk_backlog.tail = NULL; + bh_unlock_sock(sk); + + do { + struct sk_buff *next = skb->next; + + WARN_ON_ONCE(skb_dst_is_noref(skb)); + skb->next = NULL; + sk_backlog_rcv(sk, skb); + + /* + * We are in process context here with softirqs + * disabled, use cond_resched_softirq() to preempt. + * This is safe to do because we've taken the backlog + * queue private: + */ + cond_resched_softirq(); + + skb = next; + } while (skb != NULL); + + bh_lock_sock(sk); + } while ((skb = sk->sk_backlog.head) != NULL); + + /* + * Doing the zeroing here guarantee we can not loop forever + * while a wild producer attempts to flood us. + */ + sk->sk_backlog.len = 0; +} + +/** + * sk_wait_data - wait for data to arrive at sk_receive_queue + * @sk: sock to wait on + * @timeo: for how long + * + * Now socket state including sk->sk_err is changed only under lock, + * hence we may omit checks after joining wait queue. + * We check receive queue before schedule() only as optimization; + * it is very likely that release_sock() added new data. + */ +int sk_wait_data(struct sock *sk, long *timeo) +{ + int rc; + DEFINE_WAIT(wait); + + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue)); + clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + finish_wait(sk_sleep(sk), &wait); + return rc; +} +EXPORT_SYMBOL(sk_wait_data); + +/** + * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated + * @sk: socket + * @size: memory size to allocate + * @kind: allocation type + * + * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means + * rmem allocation. This function assumes that protocols which have + * memory_pressure use sk_wmem_queued as write buffer accounting. + */ +int __sk_mem_schedule(struct sock *sk, int size, int kind) +{ + struct proto *prot = sk->sk_prot; + int amt = sk_mem_pages(size); + long allocated; + + sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; + allocated = atomic_long_add_return(amt, prot->memory_allocated); + + /* Under limit. 
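+ *
+ * (prot->sysctl_mem[] is the usual three element min/pressure/max
+ * array, e.g. tcp_mem, counted in pages just like memory_allocated.)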
*/ + if (allocated <= prot->sysctl_mem[0]) { + if (prot->memory_pressure && *prot->memory_pressure) + *prot->memory_pressure = 0; + return 1; + } + + /* Under pressure. */ + if (allocated > prot->sysctl_mem[1]) + if (prot->enter_memory_pressure) + prot->enter_memory_pressure(sk); + + /* Over hard limit. */ + if (allocated > prot->sysctl_mem[2]) + goto suppress_allocation; + + /* guarantee minimum buffer size under pressure */ + if (kind == SK_MEM_RECV) { + if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0]) + return 1; + } else { /* SK_MEM_SEND */ + if (sk->sk_type == SOCK_STREAM) { + if (sk->sk_wmem_queued < prot->sysctl_wmem[0]) + return 1; + } else if (atomic_read(&sk->sk_wmem_alloc) < + prot->sysctl_wmem[0]) + return 1; + } + + if (prot->memory_pressure) { + int alloc; + + if (!*prot->memory_pressure) + return 1; + alloc = percpu_counter_read_positive(prot->sockets_allocated); + if (prot->sysctl_mem[2] > alloc * + sk_mem_pages(sk->sk_wmem_queued + + atomic_read(&sk->sk_rmem_alloc) + + sk->sk_forward_alloc)) + return 1; + } + +suppress_allocation: + + if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { + sk_stream_moderate_sndbuf(sk); + + /* Fail only if socket is _under_ its sndbuf. + * In this case we cannot block, so that we have to fail. + */ + if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) + return 1; + } + + /* Alas. Undo changes. */ + sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; + atomic_long_sub(amt, prot->memory_allocated); + return 0; +} +EXPORT_SYMBOL(__sk_mem_schedule); + +/** + * __sk_reclaim - reclaim memory_allocated + * @sk: socket + */ +void __sk_mem_reclaim(struct sock *sk) +{ + struct proto *prot = sk->sk_prot; + + atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, + prot->memory_allocated); + sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; + + if (prot->memory_pressure && *prot->memory_pressure && + (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0])) + *prot->memory_pressure = 0; +} +EXPORT_SYMBOL(__sk_mem_reclaim); + + +/* + * Set of default routines for initialising struct proto_ops when + * the protocol does not support a particular function. In certain + * cases where it makes no sense for a protocol to have a "do nothing" + * function, some default processing is provided. 
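+ *
+ * A protocol wires these stubs into its proto_ops for whatever it does
+ * not implement. Hypothetical example, purely for illustration (PF_FOO,
+ * foo_bind and foo_dgram_ops are made-up names):
+ *
+ *	static const struct proto_ops foo_dgram_ops = {
+ *		.family		= PF_FOO,
+ *		.owner		= THIS_MODULE,
+ *		.bind		= foo_bind,
+ *		.connect	= sock_no_connect,
+ *		.accept		= sock_no_accept,
+ *		.listen		= sock_no_listen,
+ *		.mmap		= sock_no_mmap,
+ *		.sendpage	= sock_no_sendpage,
+ *	};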
+ */ + +int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_bind); + +int sock_no_connect(struct socket *sock, struct sockaddr *saddr, + int len, int flags) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_connect); + +int sock_no_socketpair(struct socket *sock1, struct socket *sock2) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_socketpair); + +int sock_no_accept(struct socket *sock, struct socket *newsock, int flags) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_accept); + +int sock_no_getname(struct socket *sock, struct sockaddr *saddr, + int *len, int peer) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_getname); + +unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt) +{ + return 0; +} +EXPORT_SYMBOL(sock_no_poll); + +int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_ioctl); + +int sock_no_listen(struct socket *sock, int backlog) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_listen); + +int sock_no_shutdown(struct socket *sock, int how) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_shutdown); + +int sock_no_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_setsockopt); + +int sock_no_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_getsockopt); + +int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, + size_t len) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_sendmsg); + +int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, + size_t len, int flags) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_recvmsg); + +int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) +{ + /* Mirror missing mmap method error code */ + return -ENODEV; +} +EXPORT_SYMBOL(sock_no_mmap); + +ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) +{ + ssize_t res; + struct msghdr msg = {.msg_flags = flags}; + struct kvec iov; + char *kaddr = kmap(page); + iov.iov_base = kaddr + offset; + iov.iov_len = size; + res = kernel_sendmsg(sock, &msg, &iov, 1, size); + kunmap(page); + return res; +} +EXPORT_SYMBOL(sock_no_sendpage); + +/* + * Default Socket Callbacks + */ + +static void sock_def_wakeup(struct sock *sk) +{ + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_all(&wq->wait); + rcu_read_unlock(); +} + +static void sock_def_error_report(struct sock *sk) +{ + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_poll(&wq->wait, POLLERR); + sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); + rcu_read_unlock(); +} + +static void sock_def_readable(struct sock *sk, int len) +{ + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI | + POLLRDNORM | POLLRDBAND); + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + rcu_read_unlock(); +} + +static void sock_def_write_space(struct sock *sk) +{ + struct socket_wq *wq; + + rcu_read_lock(); + + /* Do not wake up a writer until he can make "significant" + * progress. 
--DaveM + */ + if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | + POLLWRNORM | POLLWRBAND); + + /* Should agree with poll, otherwise some programs break */ + if (sock_writeable(sk)) + sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + } + + rcu_read_unlock(); +} + +static void sock_def_destruct(struct sock *sk) +{ + kfree(sk->sk_protinfo); +} + +void sk_send_sigurg(struct sock *sk) +{ + if (sk->sk_socket && sk->sk_socket->file) + if (send_sigurg(&sk->sk_socket->file->f_owner)) + sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); +} +EXPORT_SYMBOL(sk_send_sigurg); + +void sk_reset_timer(struct sock *sk, struct timer_list* timer, + unsigned long expires) +{ + if (!mod_timer(timer, expires)) + sock_hold(sk); +} +EXPORT_SYMBOL(sk_reset_timer); + +void sk_stop_timer(struct sock *sk, struct timer_list* timer) +{ + if (timer_pending(timer) && del_timer(timer)) + __sock_put(sk); +} +EXPORT_SYMBOL(sk_stop_timer); + +void sock_init_data(struct socket *sock, struct sock *sk) +{ + skb_queue_head_init(&sk->sk_receive_queue); + skb_queue_head_init(&sk->sk_write_queue); + skb_queue_head_init(&sk->sk_error_queue); +#ifdef CONFIG_NET_DMA + skb_queue_head_init(&sk->sk_async_wait_queue); +#endif + + sk->sk_send_head = NULL; + + init_timer(&sk->sk_timer); + + sk->sk_allocation = GFP_KERNEL; + sk->sk_rcvbuf = sysctl_rmem_default; + sk->sk_sndbuf = sysctl_wmem_default; + sk->sk_state = TCP_CLOSE; + sk_set_socket(sk, sock); + + sock_set_flag(sk, SOCK_ZAPPED); + + if (sock) { + sk->sk_type = sock->type; + sk->sk_wq = sock->wq; + sock->sk = sk; + } else + sk->sk_wq = NULL; + + spin_lock_init(&sk->sk_dst_lock); + rwlock_init(&sk->sk_callback_lock); + lockdep_set_class_and_name(&sk->sk_callback_lock, + af_callback_keys + sk->sk_family, + af_family_clock_key_strings[sk->sk_family]); + + sk->sk_state_change = sock_def_wakeup; + sk->sk_data_ready = sock_def_readable; + sk->sk_write_space = sock_def_write_space; + sk->sk_error_report = sock_def_error_report; + sk->sk_destruct = sock_def_destruct; + + sk->sk_sndmsg_page = NULL; + sk->sk_sndmsg_off = 0; + + sk->sk_peer_pid = NULL; + sk->sk_peer_cred = NULL; + sk->sk_write_pending = 0; + sk->sk_rcvlowat = 1; + sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + + sk->sk_stamp = ktime_set(-1L, 0); + + /* + * Before updating sk_refcnt, we must commit prior changes to memory + * (Documentation/RCU/rculist_nulls.txt for details) + */ + smp_wmb(); + atomic_set(&sk->sk_refcnt, 1); + atomic_set(&sk->sk_drops, 0); +} +EXPORT_SYMBOL(sock_init_data); + +void lock_sock_nested(struct sock *sk, int subclass) +{ + might_sleep(); + spin_lock_bh(&sk->sk_lock.slock); + if (sk->sk_lock.owned) + __lock_sock(sk); + sk->sk_lock.owned = 1; + spin_unlock(&sk->sk_lock.slock); + /* + * The sk_lock has mutex_lock() semantics here: + */ + mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); + local_bh_enable(); +} +EXPORT_SYMBOL(lock_sock_nested); + +void release_sock(struct sock *sk) +{ + /* + * The sk_lock has mutex_unlock() semantics: + */ + mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); + + spin_lock_bh(&sk->sk_lock.slock); + if (sk->sk_backlog.tail) + __release_sock(sk); + sk->sk_lock.owned = 0; + if (waitqueue_active(&sk->sk_lock.wq)) + wake_up(&sk->sk_lock.wq); + spin_unlock_bh(&sk->sk_lock.slock); +} +EXPORT_SYMBOL(release_sock); + +/** + * lock_sock_fast - fast version of lock_sock + * @sk: socket + * + * This version should be 
used for very small section, where process wont block + * return false if fast path is taken + * sk_lock.slock locked, owned = 0, BH disabled + * return true if slow path is taken + * sk_lock.slock unlocked, owned = 1, BH enabled + */ +bool lock_sock_fast(struct sock *sk) +{ + might_sleep(); + spin_lock_bh(&sk->sk_lock.slock); + + if (!sk->sk_lock.owned) + /* + * Note : We must disable BH + */ + return false; + + __lock_sock(sk); + sk->sk_lock.owned = 1; + spin_unlock(&sk->sk_lock.slock); + /* + * The sk_lock has mutex_lock() semantics here: + */ + mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); + local_bh_enable(); + return true; +} +EXPORT_SYMBOL(lock_sock_fast); + +int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) +{ + struct timeval tv; + if (!sock_flag(sk, SOCK_TIMESTAMP)) + sock_enable_timestamp(sk, SOCK_TIMESTAMP); + tv = ktime_to_timeval(sk->sk_stamp); + if (tv.tv_sec == -1) + return -ENOENT; + if (tv.tv_sec == 0) { + sk->sk_stamp = ktime_get_real(); + tv = ktime_to_timeval(sk->sk_stamp); + } + return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0; +} +EXPORT_SYMBOL(sock_get_timestamp); + +int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) +{ + struct timespec ts; + if (!sock_flag(sk, SOCK_TIMESTAMP)) + sock_enable_timestamp(sk, SOCK_TIMESTAMP); + ts = ktime_to_timespec(sk->sk_stamp); + if (ts.tv_sec == -1) + return -ENOENT; + if (ts.tv_sec == 0) { + sk->sk_stamp = ktime_get_real(); + ts = ktime_to_timespec(sk->sk_stamp); + } + return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0; +} +EXPORT_SYMBOL(sock_get_timestampns); + +void sock_enable_timestamp(struct sock *sk, int flag) +{ + if (!sock_flag(sk, flag)) { + sock_set_flag(sk, flag); + /* + * we just set one of the two flags which require net + * time stamping, but time stamping might have been on + * already because of the other one + */ + if (!sock_flag(sk, + flag == SOCK_TIMESTAMP ? + SOCK_TIMESTAMPING_RX_SOFTWARE : + SOCK_TIMESTAMP)) + net_enable_timestamp(); + } +} + +/* + * Get a socket option on an socket. + * + * FIX: POSIX 1003.1g is very ambiguous here. It states that + * asynchronous errors should be reported by getsockopt. We assume + * this means if you specify SO_ERROR (otherwise whats the point of it). + */ +int sock_common_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + + return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); +} +EXPORT_SYMBOL(sock_common_getsockopt); + +#ifdef CONFIG_COMPAT +int compat_sock_common_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + + if (sk->sk_prot->compat_getsockopt != NULL) + return sk->sk_prot->compat_getsockopt(sk, level, optname, + optval, optlen); + return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); +} +EXPORT_SYMBOL(compat_sock_common_getsockopt); +#endif + +int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + int addr_len = 0; + int err; + + err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT, + flags & ~MSG_DONTWAIT, &addr_len); + if (err >= 0) + msg->msg_namelen = addr_len; + return err; +} +EXPORT_SYMBOL(sock_common_recvmsg); + +/* + * Set socket options on an inet socket. 
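+ *
+ * Note: despite the wording above this helper is generic, not inet
+ * specific. Protocols that need no special handling can point their
+ * proto_ops ->setsockopt at it, and it simply forwards to the
+ * struct proto hook.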
+ */ +int sock_common_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + + return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); +} +EXPORT_SYMBOL(sock_common_setsockopt); + +#ifdef CONFIG_COMPAT +int compat_sock_common_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + + if (sk->sk_prot->compat_setsockopt != NULL) + return sk->sk_prot->compat_setsockopt(sk, level, optname, + optval, optlen); + return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); +} +EXPORT_SYMBOL(compat_sock_common_setsockopt); +#endif + +void sk_common_release(struct sock *sk) +{ + if (sk->sk_prot->destroy) + sk->sk_prot->destroy(sk); + + /* + * Observation: when sock_common_release is called, processes have + * no access to socket. But net still has. + * Step one, detach it from networking: + * + * A. Remove from hash tables. + */ + + sk->sk_prot->unhash(sk); + + /* + * In this point socket cannot receive new packets, but it is possible + * that some packets are in flight because some CPU runs receiver and + * did hash table lookup before we unhashed socket. They will achieve + * receive queue and will be purged by socket destructor. + * + * Also we still have packets pending on receive queue and probably, + * our own packets waiting in device queues. sock_destroy will drain + * receive queue, but transmitted packets will delay socket destruction + * until the last reference will be released. + */ + + sock_orphan(sk); + + xfrm_sk_free_policy(sk); + + sk_refcnt_debug_release(sk); + sock_put(sk); +} +EXPORT_SYMBOL(sk_common_release); + +static DEFINE_RWLOCK(proto_list_lock); +static LIST_HEAD(proto_list); + +#ifdef CONFIG_PROC_FS +#define PROTO_INUSE_NR 64 /* should be enough for the first time */ +struct prot_inuse { + int val[PROTO_INUSE_NR]; +}; + +static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); + +#ifdef CONFIG_NET_NS +void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) +{ + __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val); +} +EXPORT_SYMBOL_GPL(sock_prot_inuse_add); + +int sock_prot_inuse_get(struct net *net, struct proto *prot) +{ + int cpu, idx = prot->inuse_idx; + int res = 0; + + for_each_possible_cpu(cpu) + res += per_cpu_ptr(net->core.inuse, cpu)->val[idx]; + + return res >= 0 ? res : 0; +} +EXPORT_SYMBOL_GPL(sock_prot_inuse_get); + +static int __net_init sock_inuse_init_net(struct net *net) +{ + net->core.inuse = alloc_percpu(struct prot_inuse); + return net->core.inuse ? 0 : -ENOMEM; +} + +static void __net_exit sock_inuse_exit_net(struct net *net) +{ + free_percpu(net->core.inuse); +} + +static struct pernet_operations net_inuse_ops = { + .init = sock_inuse_init_net, + .exit = sock_inuse_exit_net, +}; + +static __init int net_inuse_init(void) +{ + if (register_pernet_subsys(&net_inuse_ops)) + panic("Cannot initialize net inuse counters"); + + return 0; +} + +core_initcall(net_inuse_init); +#else +static DEFINE_PER_CPU(struct prot_inuse, prot_inuse); + +void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) +{ + __this_cpu_add(prot_inuse.val[prot->inuse_idx], val); +} +EXPORT_SYMBOL_GPL(sock_prot_inuse_add); + +int sock_prot_inuse_get(struct net *net, struct proto *prot) +{ + int cpu, idx = prot->inuse_idx; + int res = 0; + + for_each_possible_cpu(cpu) + res += per_cpu(prot_inuse, cpu).val[idx]; + + return res >= 0 ? 
res : 0; +} +EXPORT_SYMBOL_GPL(sock_prot_inuse_get); +#endif + +static void assign_proto_idx(struct proto *prot) +{ + prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); + + if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { + printk(KERN_ERR "PROTO_INUSE_NR exhausted\n"); + return; + } + + set_bit(prot->inuse_idx, proto_inuse_idx); +} + +static void release_proto_idx(struct proto *prot) +{ + if (prot->inuse_idx != PROTO_INUSE_NR - 1) + clear_bit(prot->inuse_idx, proto_inuse_idx); +} +#else +static inline void assign_proto_idx(struct proto *prot) +{ +} + +static inline void release_proto_idx(struct proto *prot) +{ +} +#endif + +int proto_register(struct proto *prot, int alloc_slab) +{ + if (alloc_slab) { + prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, + SLAB_HWCACHE_ALIGN | prot->slab_flags, + NULL); + + if (prot->slab == NULL) { + printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", + prot->name); + goto out; + } + + if (prot->rsk_prot != NULL) { + prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name); + if (prot->rsk_prot->slab_name == NULL) + goto out_free_sock_slab; + + prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name, + prot->rsk_prot->obj_size, 0, + SLAB_HWCACHE_ALIGN, NULL); + + if (prot->rsk_prot->slab == NULL) { + printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n", + prot->name); + goto out_free_request_sock_slab_name; + } + } + + if (prot->twsk_prot != NULL) { + prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); + + if (prot->twsk_prot->twsk_slab_name == NULL) + goto out_free_request_sock_slab; + + prot->twsk_prot->twsk_slab = + kmem_cache_create(prot->twsk_prot->twsk_slab_name, + prot->twsk_prot->twsk_obj_size, + 0, + SLAB_HWCACHE_ALIGN | + prot->slab_flags, + NULL); + if (prot->twsk_prot->twsk_slab == NULL) + goto out_free_timewait_sock_slab_name; + } + } + + write_lock(&proto_list_lock); + list_add(&prot->node, &proto_list); + assign_proto_idx(prot); + write_unlock(&proto_list_lock); + return 0; + +out_free_timewait_sock_slab_name: + kfree(prot->twsk_prot->twsk_slab_name); +out_free_request_sock_slab: + if (prot->rsk_prot && prot->rsk_prot->slab) { + kmem_cache_destroy(prot->rsk_prot->slab); + prot->rsk_prot->slab = NULL; + } +out_free_request_sock_slab_name: + if (prot->rsk_prot) + kfree(prot->rsk_prot->slab_name); +out_free_sock_slab: + kmem_cache_destroy(prot->slab); + prot->slab = NULL; +out: + return -ENOBUFS; +} +EXPORT_SYMBOL(proto_register); + +void proto_unregister(struct proto *prot) +{ + write_lock(&proto_list_lock); + release_proto_idx(prot); + list_del(&prot->node); + write_unlock(&proto_list_lock); + + if (prot->slab != NULL) { + kmem_cache_destroy(prot->slab); + prot->slab = NULL; + } + + if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) { + kmem_cache_destroy(prot->rsk_prot->slab); + kfree(prot->rsk_prot->slab_name); + prot->rsk_prot->slab = NULL; + } + + if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { + kmem_cache_destroy(prot->twsk_prot->twsk_slab); + kfree(prot->twsk_prot->twsk_slab_name); + prot->twsk_prot->twsk_slab = NULL; + } +} +EXPORT_SYMBOL(proto_unregister); + +#ifdef CONFIG_PROC_FS +static void *proto_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(proto_list_lock) +{ + read_lock(&proto_list_lock); + return seq_list_start_head(&proto_list, *pos); +} + +static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return seq_list_next(v, &proto_list, pos); +} + 
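+/*
+ * The seq_file ops being assembled here back /proc/net/protocols: every
+ * proto registered through proto_register() above shows up as one row
+ * there. Registration itself, as a usage sketch (hypothetical protocol;
+ * foo_proto, "FOO" and struct foo_sock are made-up names):
+ *
+ *	static struct proto foo_proto = {
+ *		.name		= "FOO",
+ *		.owner		= THIS_MODULE,
+ *		.obj_size	= sizeof(struct foo_sock),
+ *	};
+ *
+ *	err = proto_register(&foo_proto, 1);
+ *	...
+ *	proto_unregister(&foo_proto);
+ *
+ * Passing 1 as the second argument asks proto_register() to create a
+ * dedicated kmem cache of obj_size bytes for the protocol's sockets.
+ */
+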
+static void proto_seq_stop(struct seq_file *seq, void *v) + __releases(proto_list_lock) +{ + read_unlock(&proto_list_lock); +} + +static char proto_method_implemented(const void *method) +{ + return method == NULL ? 'n' : 'y'; +} + +static void proto_seq_printf(struct seq_file *seq, struct proto *proto) +{ + seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " + "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", + proto->name, + proto->obj_size, + sock_prot_inuse_get(seq_file_net(seq), proto), + proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L, + proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI", + proto->max_header, + proto->slab == NULL ? "no" : "yes", + module_name(proto->owner), + proto_method_implemented(proto->close), + proto_method_implemented(proto->connect), + proto_method_implemented(proto->disconnect), + proto_method_implemented(proto->accept), + proto_method_implemented(proto->ioctl), + proto_method_implemented(proto->init), + proto_method_implemented(proto->destroy), + proto_method_implemented(proto->shutdown), + proto_method_implemented(proto->setsockopt), + proto_method_implemented(proto->getsockopt), + proto_method_implemented(proto->sendmsg), + proto_method_implemented(proto->recvmsg), + proto_method_implemented(proto->sendpage), + proto_method_implemented(proto->bind), + proto_method_implemented(proto->backlog_rcv), + proto_method_implemented(proto->hash), + proto_method_implemented(proto->unhash), + proto_method_implemented(proto->get_port), + proto_method_implemented(proto->enter_memory_pressure)); +} + +static int proto_seq_show(struct seq_file *seq, void *v) +{ + if (v == &proto_list) + seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", + "protocol", + "size", + "sockets", + "memory", + "press", + "maxhdr", + "slab", + "module", + "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); + else + proto_seq_printf(seq, list_entry(v, struct proto, node)); + return 0; +} + +static const struct seq_operations proto_seq_ops = { + .start = proto_seq_start, + .next = proto_seq_next, + .stop = proto_seq_stop, + .show = proto_seq_show, +}; + +static int proto_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &proto_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations proto_seq_fops = { + .owner = THIS_MODULE, + .open = proto_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static __net_init int proto_init_net(struct net *net) +{ + if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops)) + return -ENOMEM; + + return 0; +} + +static __net_exit void proto_exit_net(struct net *net) +{ + proc_net_remove(net, "protocols"); +} + + +static __net_initdata struct pernet_operations proto_net_ops = { + .init = proto_init_net, + .exit = proto_exit_net, +}; + +static int __init proto_init(void) +{ + return register_pernet_subsys(&proto_net_ops); +} + +subsys_initcall(proto_init); + +#endif /* PROC_FS */ diff --git a/net/core/stream.c b/net/core/stream.c new file mode 100644 index 00000000..f5df85dc --- /dev/null +++ b/net/core/stream.c @@ -0,0 +1,208 @@ +/* + * SUCS NET3: + * + * Generic stream handling routines. These are generic for most + * protocols. Even IP. Tonight 8-). + * This is used because TCP, LLC (others too) layer all have mostly + * identical sendmsg() and recvmsg() code. + * So we (will) share it here. 
+ * + * Authors: Arnaldo Carvalho de Melo + * (from old tcp.c code) + * Alan Cox (Borrowed comments 8-)) + */ + +#include +#include +#include +#include +#include +#include + +/** + * sk_stream_write_space - stream socket write_space callback. + * @sk: socket + * + * FIXME: write proper description + */ +void sk_stream_write_space(struct sock *sk) +{ + struct socket *sock = sk->sk_socket; + struct socket_wq *wq; + + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) { + clear_bit(SOCK_NOSPACE, &sock->flags); + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_poll(&wq->wait, POLLOUT | + POLLWRNORM | POLLWRBAND); + if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) + sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(sk_stream_write_space); + +/** + * sk_stream_wait_connect - Wait for a socket to get into the connected state + * @sk: sock to wait on + * @timeo_p: for how long to wait + * + * Must be called with the socket locked. + */ +int sk_stream_wait_connect(struct sock *sk, long *timeo_p) +{ + struct task_struct *tsk = current; + DEFINE_WAIT(wait); + int done; + + do { + int err = sock_error(sk); + if (err) + return err; + if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) + return -EPIPE; + if (!*timeo_p) + return -EAGAIN; + if (signal_pending(tsk)) + return sock_intr_errno(*timeo_p); + + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + sk->sk_write_pending++; + done = sk_wait_event(sk, timeo_p, + !sk->sk_err && + !((1 << sk->sk_state) & + ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))); + finish_wait(sk_sleep(sk), &wait); + sk->sk_write_pending--; + } while (!done); + return 0; +} +EXPORT_SYMBOL(sk_stream_wait_connect); + +/** + * sk_stream_closing - Return 1 if we still have things to send in our buffers. 
+ * @sk: socket to verify + */ +static inline int sk_stream_closing(struct sock *sk) +{ + return (1 << sk->sk_state) & + (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK); +} + +void sk_stream_wait_close(struct sock *sk, long timeout) +{ + if (timeout) { + DEFINE_WAIT(wait); + + do { + prepare_to_wait(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk))) + break; + } while (!signal_pending(current) && timeout); + + finish_wait(sk_sleep(sk), &wait); + } +} +EXPORT_SYMBOL(sk_stream_wait_close); + +/** + * sk_stream_wait_memory - Wait for more memory for a socket + * @sk: socket to wait for memory + * @timeo_p: for how long + */ +int sk_stream_wait_memory(struct sock *sk, long *timeo_p) +{ + int err = 0; + long vm_wait = 0; + long current_timeo = *timeo_p; + DEFINE_WAIT(wait); + + if (sk_stream_memory_free(sk)) + current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2; + + while (1) { + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + goto do_error; + if (!*timeo_p) + goto do_nonblock; + if (signal_pending(current)) + goto do_interrupted; + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + if (sk_stream_memory_free(sk) && !vm_wait) + break; + + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + sk->sk_write_pending++; + sk_wait_event(sk, &current_timeo, sk->sk_err || + (sk->sk_shutdown & SEND_SHUTDOWN) || + (sk_stream_memory_free(sk) && + !vm_wait)); + sk->sk_write_pending--; + + if (vm_wait) { + vm_wait -= current_timeo; + current_timeo = *timeo_p; + if (current_timeo != MAX_SCHEDULE_TIMEOUT && + (current_timeo -= vm_wait) < 0) + current_timeo = 0; + vm_wait = 0; + } + *timeo_p = current_timeo; + } +out: + finish_wait(sk_sleep(sk), &wait); + return err; + +do_error: + err = -EPIPE; + goto out; +do_nonblock: + err = -EAGAIN; + goto out; +do_interrupted: + err = sock_intr_errno(*timeo_p); + goto out; +} +EXPORT_SYMBOL(sk_stream_wait_memory); + +int sk_stream_error(struct sock *sk, int flags, int err) +{ + if (err == -EPIPE) + err = sock_error(sk) ? : -EPIPE; + if (err == -EPIPE && !(flags & MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); + return err; +} +EXPORT_SYMBOL(sk_stream_error); + +void sk_stream_kill_queues(struct sock *sk) +{ + /* First the read buffer. */ + __skb_queue_purge(&sk->sk_receive_queue); + + /* Next, the error queue. */ + __skb_queue_purge(&sk->sk_error_queue); + + /* Next, the write queue. */ + WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); + + /* Account for returned memory. */ + sk_mem_reclaim(sk); + + WARN_ON(sk->sk_wmem_queued); + WARN_ON(sk->sk_forward_alloc); + + /* It is _impossible_ for the backlog to contain anything + * when we get here. All user references to this socket + * have gone away, only the net layer knows can touch it. + */ +} +EXPORT_SYMBOL(sk_stream_kill_queues); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c new file mode 100644 index 00000000..77a65f03 --- /dev/null +++ b/net/core/sysctl_net_core.c @@ -0,0 +1,259 @@ +/* -*- linux-c -*- + * sysctl_net_core.c: sysctl interface to net core subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/core directory entry (empty =) ).
[MS] + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef CONFIG_RPS +static int rps_sock_flow_sysctl(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + unsigned int orig_size, size; + int ret, i; + ctl_table tmp = { + .data = &size, + .maxlen = sizeof(size), + .mode = table->mode + }; + struct rps_sock_flow_table *orig_sock_table, *sock_table; + static DEFINE_MUTEX(sock_flow_mutex); + + mutex_lock(&sock_flow_mutex); + + orig_sock_table = rcu_dereference_protected(rps_sock_flow_table, + lockdep_is_held(&sock_flow_mutex)); + size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; + + ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); + + if (write) { + if (size) { + if (size > 1<<30) { + /* Enforce limit to prevent overflow */ + mutex_unlock(&sock_flow_mutex); + return -EINVAL; + } + size = roundup_pow_of_two(size); + if (size != orig_size) { + sock_table = + vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size)); + if (!sock_table) { + mutex_unlock(&sock_flow_mutex); + return -ENOMEM; + } + + sock_table->mask = size - 1; + } else + sock_table = orig_sock_table; + + for (i = 0; i < size; i++) + sock_table->ents[i] = RPS_NO_CPU; + } else + sock_table = NULL; + + if (sock_table != orig_sock_table) { + rcu_assign_pointer(rps_sock_flow_table, sock_table); + synchronize_rcu(); + vfree(orig_sock_table); + } + } + + mutex_unlock(&sock_flow_mutex); + + return ret; +} +#endif /* CONFIG_RPS */ + +static struct ctl_table net_core_table[] = { +#ifdef CONFIG_NET + { + .procname = "wmem_max", + .data = &sysctl_wmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "rmem_max", + .data = &sysctl_rmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "wmem_default", + .data = &sysctl_wmem_default, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "rmem_default", + .data = &sysctl_rmem_default, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "dev_weight", + .data = &weight_p, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "netdev_max_backlog", + .data = &netdev_max_backlog, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#ifdef CONFIG_BPF_JIT + { + .procname = "bpf_jit_enable", + .data = &bpf_jit_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif + { + .procname = "netdev_tstamp_prequeue", + .data = &netdev_tstamp_prequeue, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "message_cost", + .data = &net_ratelimit_state.interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "message_burst", + .data = &net_ratelimit_state.burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "optmem_max", + .data = &sysctl_optmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#ifdef CONFIG_RPS + { + .procname = "rps_sock_flow_entries", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = rps_sock_flow_sysctl + }, +#endif +#endif /* CONFIG_NET */ + { + .procname = "netdev_budget", + .data = &netdev_budget, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = 
"warnings", + .data = &net_msg_warn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { } +}; + +static struct ctl_table netns_core_table[] = { + { + .procname = "somaxconn", + .data = &init_net.core.sysctl_somaxconn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { } +}; + +__net_initdata struct ctl_path net_core_path[] = { + { .procname = "net", }, + { .procname = "core", }, + { }, +}; + +static __net_init int sysctl_core_net_init(struct net *net) +{ + struct ctl_table *tbl; + + net->core.sysctl_somaxconn = SOMAXCONN; + + tbl = netns_core_table; + if (!net_eq(net, &init_net)) { + tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); + if (tbl == NULL) + goto err_dup; + + tbl[0].data = &net->core.sysctl_somaxconn; + } + + net->core.sysctl_hdr = register_net_sysctl_table(net, + net_core_path, tbl); + if (net->core.sysctl_hdr == NULL) + goto err_reg; + + return 0; + +err_reg: + if (tbl != netns_core_table) + kfree(tbl); +err_dup: + return -ENOMEM; +} + +static __net_exit void sysctl_core_net_exit(struct net *net) +{ + struct ctl_table *tbl; + + tbl = net->core.sysctl_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->core.sysctl_hdr); + BUG_ON(tbl == netns_core_table); + kfree(tbl); +} + +static __net_initdata struct pernet_operations sysctl_core_ops = { + .init = sysctl_core_net_init, + .exit = sysctl_core_net_exit, +}; + +static __init int sysctl_core_init(void) +{ + static struct ctl_table empty[1]; + + register_sysctl_paths(net_core_path, empty); + register_net_sysctl_rotable(net_core_path, net_core_table); + return register_pernet_subsys(&sysctl_core_ops); +} + +fs_initcall(sysctl_core_init); diff --git a/net/core/timestamping.c b/net/core/timestamping.c new file mode 100644 index 00000000..97d036a6 --- /dev/null +++ b/net/core/timestamping.c @@ -0,0 +1,136 @@ +/* + * PTP 1588 clock support - support for timestamping in PHY devices + * + * Copyright (C) 2010 OMICRON electronics GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ +#include +#include +#include +#include + +static struct sock_filter ptp_filter[] = { + PTP_FILTER +}; + +static unsigned int classify(const struct sk_buff *skb) +{ + if (likely(skb->dev && + skb->dev->phydev && + skb->dev->phydev->drv)) + return sk_run_filter(skb, ptp_filter); + else + return PTP_CLASS_NONE; +} + +void skb_clone_tx_timestamp(struct sk_buff *skb) +{ + struct phy_device *phydev; + struct sk_buff *clone; + struct sock *sk = skb->sk; + unsigned int type; + + if (!sk) + return; + + type = classify(skb); + + switch (type) { + case PTP_CLASS_V1_IPV4: + case PTP_CLASS_V1_IPV6: + case PTP_CLASS_V2_IPV4: + case PTP_CLASS_V2_IPV6: + case PTP_CLASS_V2_L2: + case PTP_CLASS_V2_VLAN: + phydev = skb->dev->phydev; + if (likely(phydev->drv->txtstamp)) { + if (!atomic_inc_not_zero(&sk->sk_refcnt)) + return; + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) { + sock_put(sk); + return; + } + clone->sk = sk; + phydev->drv->txtstamp(phydev, clone, type); + } + break; + default: + break; + } +} + +void skb_complete_tx_timestamp(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + struct sock *sk = skb->sk; + struct sock_exterr_skb *serr; + int err; + + if (!hwtstamps) { + sock_put(sk); + kfree_skb(skb); + return; + } + + *skb_hwtstamps(skb) = *hwtstamps; + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + skb->sk = NULL; + err = sock_queue_err_skb(sk, skb); + sock_put(sk); + if (err) + kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); + +bool skb_defer_rx_timestamp(struct sk_buff *skb) +{ + struct phy_device *phydev; + unsigned int type; + + if (skb_headroom(skb) < ETH_HLEN) + return false; + __skb_push(skb, ETH_HLEN); + + type = classify(skb); + + __skb_pull(skb, ETH_HLEN); + + switch (type) { + case PTP_CLASS_V1_IPV4: + case PTP_CLASS_V1_IPV6: + case PTP_CLASS_V2_IPV4: + case PTP_CLASS_V2_IPV6: + case PTP_CLASS_V2_L2: + case PTP_CLASS_V2_VLAN: + phydev = skb->dev->phydev; + if (likely(phydev->drv->rxtstamp)) + return phydev->drv->rxtstamp(phydev, skb, type); + break; + default: + break; + } + + return false; +} + +void __init skb_timestamping_init(void) +{ + BUG_ON(sk_chk_filter(ptp_filter, ARRAY_SIZE(ptp_filter))); +} diff --git a/net/core/user_dma.c b/net/core/user_dma.c new file mode 100644 index 00000000..25d717eb --- /dev/null +++ b/net/core/user_dma.c @@ -0,0 +1,130 @@ +/* + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + * Portions based on net/core/datagram.c and copyrighted by their authors. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ + +/* + * This code allows the net stack to make use of a DMA engine for + * skb to iovec copies. 
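+ * + * dma_skb_copy_datagram_iovec() below walks the three places payload can live in an skb (the linear header, the paged fragments and the frag_list), issues dma_memcpy_to_iovec()/dma_memcpy_pg_to_iovec() for each piece and returns the cookie of the last copy so the caller can wait for the DMA engine to complete; sysctl_tcp_dma_copybreak is the size threshold callers such as TCP use to decide when the offload is worth it.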
+ */ + +#include +#include +#include +#include + +#define NET_DMA_DEFAULT_COPYBREAK 4096 + +int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK; +EXPORT_SYMBOL(sysctl_tcp_dma_copybreak); + +/** + * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec. + * @skb - buffer to copy + * @offset - offset in the buffer to start copying from + * @iovec - io vector to copy to + * @len - amount of data to copy from buffer to iovec + * @pinned_list - locked iovec buffer data + * + * Note: the iovec is modified during the copy. + */ +int dma_skb_copy_datagram_iovec(struct dma_chan *chan, + struct sk_buff *skb, int offset, struct iovec *to, + size_t len, struct dma_pinned_list *pinned_list) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + dma_cookie_t cookie = 0; + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + cookie = dma_memcpy_to_iovec(chan, to, pinned_list, + skb->data + offset, copy); + if (cookie < 0) + goto fault; + len -= copy; + if (len == 0) + goto end; + offset += copy; + } + + /* Copy paged appendix. Hmm... why does this look so complicated? */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + copy = end - offset; + if (copy > 0) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct page *page = frag->page; + + if (copy > len) + copy = len; + + cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page, + frag->page_offset + offset - start, copy); + if (cookie < 0) + goto fault; + len -= copy; + if (len == 0) + goto end; + offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + copy = end - offset; + if (copy > 0) { + if (copy > len) + copy = len; + cookie = dma_skb_copy_datagram_iovec(chan, frag_iter, + offset - start, + to, copy, + pinned_list); + if (cookie < 0) + goto fault; + len -= copy; + if (len == 0) + goto end; + offset += copy; + } + start = end; + } + +end: + if (!len) { + skb->dma_cookie = cookie; + return cookie; + } + +fault: + return -EFAULT; +} diff --git a/net/core/utils.c b/net/core/utils.c new file mode 100644 index 00000000..386e263f --- /dev/null +++ b/net/core/utils.c @@ -0,0 +1,323 @@ +/* + * Generic address resolution entity + * + * Authors: + * net_random Alan Cox + * net_ratelimit Andi Kleen + * in{4,6}_pton YOSHIFUJI Hideaki, Copyright (C)2006 USAGI/WIDE Project + * + * Created by Alexey Kuznetsov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +int net_msg_warn __read_mostly = 1; +EXPORT_SYMBOL(net_msg_warn); + +DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10); +/* + * All net warning printk()s should be guarded by this function. + */ +int net_ratelimit(void) +{ + return __ratelimit(&net_ratelimit_state); +} +EXPORT_SYMBOL(net_ratelimit); + +/* + * Convert an ASCII string to binary IP. + * This is outside of net/ipv4/ because various code that uses IP addresses + * is otherwise not dependent on the TCP/IP stack.
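+ * + * For illustration: in_aton("10.1.2.3") returns htonl(0x0a010203). Each octet is read until a '.', '\0' or '\n' is hit, so a trailing newline from user input is tolerated; note that no range check is applied to the individual octets.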
+ */ + +__be32 in_aton(const char *str) +{ + unsigned long l; + unsigned int val; + int i; + + l = 0; + for (i = 0; i < 4; i++) + { + l <<= 8; + if (*str != '\0') + { + val = 0; + while (*str != '\0' && *str != '.' && *str != '\n') + { + val *= 10; + val += *str - '0'; + str++; + } + l |= val; + if (*str != '\0') + str++; + } + } + return htonl(l); +} +EXPORT_SYMBOL(in_aton); + +#define IN6PTON_XDIGIT 0x00010000 +#define IN6PTON_DIGIT 0x00020000 +#define IN6PTON_COLON_MASK 0x00700000 +#define IN6PTON_COLON_1 0x00100000 /* single : requested */ +#define IN6PTON_COLON_2 0x00200000 /* second : requested */ +#define IN6PTON_COLON_1_2 0x00400000 /* :: requested */ +#define IN6PTON_DOT 0x00800000 /* . */ +#define IN6PTON_DELIM 0x10000000 +#define IN6PTON_NULL 0x20000000 /* first/tail */ +#define IN6PTON_UNKNOWN 0x40000000 + +static inline int xdigit2bin(char c, int delim) +{ + int val; + + if (c == delim || c == '\0') + return IN6PTON_DELIM; + if (c == ':') + return IN6PTON_COLON_MASK; + if (c == '.') + return IN6PTON_DOT; + + val = hex_to_bin(c); + if (val >= 0) + return val | IN6PTON_XDIGIT | (val < 10 ? IN6PTON_DIGIT : 0); + + if (delim == -1) + return IN6PTON_DELIM; + return IN6PTON_UNKNOWN; +} + +int in4_pton(const char *src, int srclen, + u8 *dst, + int delim, const char **end) +{ + const char *s; + u8 *d; + u8 dbuf[4]; + int ret = 0; + int i; + int w = 0; + + if (srclen < 0) + srclen = strlen(src); + s = src; + d = dbuf; + i = 0; + while(1) { + int c; + c = xdigit2bin(srclen > 0 ? *s : '\0', delim); + if (!(c & (IN6PTON_DIGIT | IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK))) { + goto out; + } + if (c & (IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK)) { + if (w == 0) + goto out; + *d++ = w & 0xff; + w = 0; + i++; + if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) { + if (i != 4) + goto out; + break; + } + goto cont; + } + w = (w * 10) + c; + if ((w & 0xffff) > 255) { + goto out; + } +cont: + if (i >= 4) + goto out; + s++; + srclen--; + } + ret = 1; + memcpy(dst, dbuf, sizeof(dbuf)); +out: + if (end) + *end = s; + return ret; +} +EXPORT_SYMBOL(in4_pton); + +int in6_pton(const char *src, int srclen, + u8 *dst, + int delim, const char **end) +{ + const char *s, *tok = NULL; + u8 *d, *dc = NULL; + u8 dbuf[16]; + int ret = 0; + int i; + int state = IN6PTON_COLON_1_2 | IN6PTON_XDIGIT | IN6PTON_NULL; + int w = 0; + + memset(dbuf, 0, sizeof(dbuf)); + + s = src; + d = dbuf; + if (srclen < 0) + srclen = strlen(src); + + while (1) { + int c; + + c = xdigit2bin(srclen > 0 ? *s : '\0', delim); + if (!(c & state)) + goto out; + if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) { + /* process one 16-bit word */ + if (!(state & IN6PTON_NULL)) { + *d++ = (w >> 8) & 0xff; + *d++ = w & 0xff; + } + w = 0; + if (c & IN6PTON_DELIM) { + /* We've processed last word */ + break; + } + /* + * COLON_1 => XDIGIT + * COLON_2 => XDIGIT|DELIM + * COLON_1_2 => COLON_2 + */ + switch (state & IN6PTON_COLON_MASK) { + case IN6PTON_COLON_2: + dc = d; + state = IN6PTON_XDIGIT | IN6PTON_DELIM; + if (dc - dbuf >= sizeof(dbuf)) + state |= IN6PTON_NULL; + break; + case IN6PTON_COLON_1|IN6PTON_COLON_1_2: + state = IN6PTON_XDIGIT | IN6PTON_COLON_2; + break; + case IN6PTON_COLON_1: + state = IN6PTON_XDIGIT; + break; + case IN6PTON_COLON_1_2: + state = IN6PTON_COLON_2; + break; + default: + state = 0; + } + tok = s + 1; + goto cont; + } + + if (c & IN6PTON_DOT) { + ret = in4_pton(tok ? 
tok : s, srclen + (int)(s - tok), d, delim, &s); + if (ret > 0) { + d += 4; + break; + } + goto out; + } + + w = (w << 4) | (0xff & c); + state = IN6PTON_COLON_1 | IN6PTON_DELIM; + if (!(w & 0xf000)) { + state |= IN6PTON_XDIGIT; + } + if (!dc && d + 2 < dbuf + sizeof(dbuf)) { + state |= IN6PTON_COLON_1_2; + state &= ~IN6PTON_DELIM; + } + if (d + 2 >= dbuf + sizeof(dbuf)) { + state &= ~(IN6PTON_COLON_1|IN6PTON_COLON_1_2); + } +cont: + if ((dc && d + 4 < dbuf + sizeof(dbuf)) || + d + 4 == dbuf + sizeof(dbuf)) { + state |= IN6PTON_DOT; + } + if (d >= dbuf + sizeof(dbuf)) { + state &= ~(IN6PTON_XDIGIT|IN6PTON_COLON_MASK); + } + s++; + srclen--; + } + + i = 15; d--; + + if (dc) { + while(d >= dc) + dst[i--] = *d--; + while(i >= dc - dbuf) + dst[i--] = 0; + while(i >= 0) + dst[i--] = *d--; + } else + memcpy(dst, dbuf, sizeof(dbuf)); + + ret = 1; +out: + if (end) + *end = s; + return ret; +} +EXPORT_SYMBOL(in6_pton); + +void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, + __be32 from, __be32 to, int pseudohdr) +{ + __be32 diff[] = { ~from, to }; + if (skb->ip_summed != CHECKSUM_PARTIAL) { + *sum = csum_fold(csum_partial(diff, sizeof(diff), + ~csum_unfold(*sum))); + if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) + skb->csum = ~csum_partial(diff, sizeof(diff), + ~skb->csum); + } else if (pseudohdr) + *sum = ~csum_fold(csum_partial(diff, sizeof(diff), + csum_unfold(*sum))); +} +EXPORT_SYMBOL(inet_proto_csum_replace4); + +int mac_pton(const char *s, u8 *mac) +{ + int i; + + /* XX:XX:XX:XX:XX:XX */ + if (strlen(s) < 3 * ETH_ALEN - 1) + return 0; + + /* Don't dirty result unless string is valid MAC. */ + for (i = 0; i < ETH_ALEN; i++) { + if (!strchr("0123456789abcdefABCDEF", s[i * 3])) + return 0; + if (!strchr("0123456789abcdefABCDEF", s[i * 3 + 1])) + return 0; + if (i != ETH_ALEN - 1 && s[i * 3 + 2] != ':') + return 0; + } + for (i = 0; i < ETH_ALEN; i++) { + mac[i] = (hex_to_bin(s[i * 3]) << 4) | hex_to_bin(s[i * 3 + 1]); + } + return 1; +} +EXPORT_SYMBOL(mac_pton); diff --git a/net/dcb/Kconfig b/net/dcb/Kconfig new file mode 100644 index 00000000..4066d59c --- /dev/null +++ b/net/dcb/Kconfig @@ -0,0 +1,22 @@ +config DCB + bool "Data Center Bridging support" + default n + ---help--- + This enables support for configuring Data Center Bridging (DCB) + features on DCB capable Ethernet adapters via rtnetlink. Say 'Y' + if you have a DCB capable Ethernet adapter which supports this + interface and you are connected to a DCB capable switch. + + DCB is a collection of Ethernet enhancements which allow DCB capable + NICs and switches to support network traffic with differing + requirements (highly reliable, no drops vs. best effort vs. low + latency) to co-exist on Ethernet. + + DCB features include: + Enhanced Transmission Selection (aka Priority Grouping) - provides a + framework for assigning bandwidth guarantees to traffic classes. + Priority-based Flow Control (PFC) - a MAC control pause frame which + works at the granularity of the 802.1p priority instead of the + link (802.3x). + + If unsure, say N. diff --git a/net/dcb/Makefile b/net/dcb/Makefile new file mode 100644 index 00000000..c1282c9e --- /dev/null +++ b/net/dcb/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_DCB) += dcbnl.o dcbevent.o diff --git a/net/dcb/dcbevent.c b/net/dcb/dcbevent.c new file mode 100644 index 00000000..665a8802 --- /dev/null +++ b/net/dcb/dcbevent.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2010, Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Author: John Fastabend + */ + +#include +#include + +static ATOMIC_NOTIFIER_HEAD(dcbevent_notif_chain); + +int register_dcbevent_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&dcbevent_notif_chain, nb); +} +EXPORT_SYMBOL(register_dcbevent_notifier); + +int unregister_dcbevent_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&dcbevent_notif_chain, nb); +} +EXPORT_SYMBOL(unregister_dcbevent_notifier); + +int call_dcbevent_notifiers(unsigned long val, void *v) +{ + return atomic_notifier_call_chain(&dcbevent_notif_chain, val, v); +} diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c new file mode 100644 index 00000000..3609eaca --- /dev/null +++ b/net/dcb/dcbnl.c @@ -0,0 +1,1835 @@ +/* + * Copyright (c) 2008-2011, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Author: Lucy Liu + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * Data Center Bridging (DCB) is a collection of Ethernet enhancements + * intended to allow network traffic with differing requirements + * (highly reliable, no drops vs. best effort vs. low latency) to operate + * and co-exist on Ethernet. Current DCB features are: + * + * Enhanced Transmission Selection (aka Priority Grouping [PG]) - provides a + * framework for assigning bandwidth guarantees to traffic classes. + * + * Priority-based Flow Control (PFC) - provides a flow control mechanism which + * can work independently for each 802.1p priority. + * + * Congestion Notification - provides a mechanism for end-to-end congestion + * control for protocols which do not have built-in congestion management. + * + * More information about the emerging standards for these Ethernet features + * can be found at: http://www.ieee802.org/1/pages/dcbridges.html + * + * This file implements an rtnetlink interface to allow configuration of DCB + * features for capable devices. 
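+ * + * Userspace drives these handlers with RTM_GETDCB/RTM_SETDCB rtnetlink messages that carry a struct dcbmsg followed by nested DCB_ATTR_* attributes. For example, reading the DCB state of a device is an RTM_GETDCB request with cmd DCB_CMD_GSTATE and a DCB_ATTR_IFNAME attribute naming the interface; it is dispatched through dcb_doit() to dcbnl_getstate(), which answers with dcbnl_reply().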
+ */ + +MODULE_AUTHOR("Lucy Liu, "); +MODULE_DESCRIPTION("Data Center Bridging netlink interface"); +MODULE_LICENSE("GPL"); + +/**************** DCB attribute policies *************************************/ + +/* DCB netlink attributes policy */ +static const struct nla_policy dcbnl_rtnl_policy[DCB_ATTR_MAX + 1] = { + [DCB_ATTR_IFNAME] = {.type = NLA_NUL_STRING, .len = IFNAMSIZ - 1}, + [DCB_ATTR_STATE] = {.type = NLA_U8}, + [DCB_ATTR_PFC_CFG] = {.type = NLA_NESTED}, + [DCB_ATTR_PG_CFG] = {.type = NLA_NESTED}, + [DCB_ATTR_SET_ALL] = {.type = NLA_U8}, + [DCB_ATTR_PERM_HWADDR] = {.type = NLA_FLAG}, + [DCB_ATTR_CAP] = {.type = NLA_NESTED}, + [DCB_ATTR_PFC_STATE] = {.type = NLA_U8}, + [DCB_ATTR_BCN] = {.type = NLA_NESTED}, + [DCB_ATTR_APP] = {.type = NLA_NESTED}, + [DCB_ATTR_IEEE] = {.type = NLA_NESTED}, + [DCB_ATTR_DCBX] = {.type = NLA_U8}, + [DCB_ATTR_FEATCFG] = {.type = NLA_NESTED}, +}; + +/* DCB priority flow control to User Priority nested attributes */ +static const struct nla_policy dcbnl_pfc_up_nest[DCB_PFC_UP_ATTR_MAX + 1] = { + [DCB_PFC_UP_ATTR_0] = {.type = NLA_U8}, + [DCB_PFC_UP_ATTR_1] = {.type = NLA_U8}, + [DCB_PFC_UP_ATTR_2] = {.type = NLA_U8}, + [DCB_PFC_UP_ATTR_3] = {.type = NLA_U8}, + [DCB_PFC_UP_ATTR_4] = {.type = NLA_U8}, + [DCB_PFC_UP_ATTR_5] = {.type = NLA_U8}, + [DCB_PFC_UP_ATTR_6] = {.type = NLA_U8}, + [DCB_PFC_UP_ATTR_7] = {.type = NLA_U8}, + [DCB_PFC_UP_ATTR_ALL] = {.type = NLA_FLAG}, +}; + +/* DCB priority grouping nested attributes */ +static const struct nla_policy dcbnl_pg_nest[DCB_PG_ATTR_MAX + 1] = { + [DCB_PG_ATTR_TC_0] = {.type = NLA_NESTED}, + [DCB_PG_ATTR_TC_1] = {.type = NLA_NESTED}, + [DCB_PG_ATTR_TC_2] = {.type = NLA_NESTED}, + [DCB_PG_ATTR_TC_3] = {.type = NLA_NESTED}, + [DCB_PG_ATTR_TC_4] = {.type = NLA_NESTED}, + [DCB_PG_ATTR_TC_5] = {.type = NLA_NESTED}, + [DCB_PG_ATTR_TC_6] = {.type = NLA_NESTED}, + [DCB_PG_ATTR_TC_7] = {.type = NLA_NESTED}, + [DCB_PG_ATTR_TC_ALL] = {.type = NLA_NESTED}, + [DCB_PG_ATTR_BW_ID_0] = {.type = NLA_U8}, + [DCB_PG_ATTR_BW_ID_1] = {.type = NLA_U8}, + [DCB_PG_ATTR_BW_ID_2] = {.type = NLA_U8}, + [DCB_PG_ATTR_BW_ID_3] = {.type = NLA_U8}, + [DCB_PG_ATTR_BW_ID_4] = {.type = NLA_U8}, + [DCB_PG_ATTR_BW_ID_5] = {.type = NLA_U8}, + [DCB_PG_ATTR_BW_ID_6] = {.type = NLA_U8}, + [DCB_PG_ATTR_BW_ID_7] = {.type = NLA_U8}, + [DCB_PG_ATTR_BW_ID_ALL] = {.type = NLA_FLAG}, +}; + +/* DCB traffic class nested attributes. */ +static const struct nla_policy dcbnl_tc_param_nest[DCB_TC_ATTR_PARAM_MAX + 1] = { + [DCB_TC_ATTR_PARAM_PGID] = {.type = NLA_U8}, + [DCB_TC_ATTR_PARAM_UP_MAPPING] = {.type = NLA_U8}, + [DCB_TC_ATTR_PARAM_STRICT_PRIO] = {.type = NLA_U8}, + [DCB_TC_ATTR_PARAM_BW_PCT] = {.type = NLA_U8}, + [DCB_TC_ATTR_PARAM_ALL] = {.type = NLA_FLAG}, +}; + +/* DCB capabilities nested attributes. */ +static const struct nla_policy dcbnl_cap_nest[DCB_CAP_ATTR_MAX + 1] = { + [DCB_CAP_ATTR_ALL] = {.type = NLA_FLAG}, + [DCB_CAP_ATTR_PG] = {.type = NLA_U8}, + [DCB_CAP_ATTR_PFC] = {.type = NLA_U8}, + [DCB_CAP_ATTR_UP2TC] = {.type = NLA_U8}, + [DCB_CAP_ATTR_PG_TCS] = {.type = NLA_U8}, + [DCB_CAP_ATTR_PFC_TCS] = {.type = NLA_U8}, + [DCB_CAP_ATTR_GSP] = {.type = NLA_U8}, + [DCB_CAP_ATTR_BCN] = {.type = NLA_U8}, + [DCB_CAP_ATTR_DCBX] = {.type = NLA_U8}, +}; + +/* DCB capabilities nested attributes. */ +static const struct nla_policy dcbnl_numtcs_nest[DCB_NUMTCS_ATTR_MAX + 1] = { + [DCB_NUMTCS_ATTR_ALL] = {.type = NLA_FLAG}, + [DCB_NUMTCS_ATTR_PG] = {.type = NLA_U8}, + [DCB_NUMTCS_ATTR_PFC] = {.type = NLA_U8}, +}; + +/* DCB BCN nested attributes. 
*/ +static const struct nla_policy dcbnl_bcn_nest[DCB_BCN_ATTR_MAX + 1] = { + [DCB_BCN_ATTR_RP_0] = {.type = NLA_U8}, + [DCB_BCN_ATTR_RP_1] = {.type = NLA_U8}, + [DCB_BCN_ATTR_RP_2] = {.type = NLA_U8}, + [DCB_BCN_ATTR_RP_3] = {.type = NLA_U8}, + [DCB_BCN_ATTR_RP_4] = {.type = NLA_U8}, + [DCB_BCN_ATTR_RP_5] = {.type = NLA_U8}, + [DCB_BCN_ATTR_RP_6] = {.type = NLA_U8}, + [DCB_BCN_ATTR_RP_7] = {.type = NLA_U8}, + [DCB_BCN_ATTR_RP_ALL] = {.type = NLA_FLAG}, + [DCB_BCN_ATTR_BCNA_0] = {.type = NLA_U32}, + [DCB_BCN_ATTR_BCNA_1] = {.type = NLA_U32}, + [DCB_BCN_ATTR_ALPHA] = {.type = NLA_U32}, + [DCB_BCN_ATTR_BETA] = {.type = NLA_U32}, + [DCB_BCN_ATTR_GD] = {.type = NLA_U32}, + [DCB_BCN_ATTR_GI] = {.type = NLA_U32}, + [DCB_BCN_ATTR_TMAX] = {.type = NLA_U32}, + [DCB_BCN_ATTR_TD] = {.type = NLA_U32}, + [DCB_BCN_ATTR_RMIN] = {.type = NLA_U32}, + [DCB_BCN_ATTR_W] = {.type = NLA_U32}, + [DCB_BCN_ATTR_RD] = {.type = NLA_U32}, + [DCB_BCN_ATTR_RU] = {.type = NLA_U32}, + [DCB_BCN_ATTR_WRTT] = {.type = NLA_U32}, + [DCB_BCN_ATTR_RI] = {.type = NLA_U32}, + [DCB_BCN_ATTR_C] = {.type = NLA_U32}, + [DCB_BCN_ATTR_ALL] = {.type = NLA_FLAG}, +}; + +/* DCB APP nested attributes. */ +static const struct nla_policy dcbnl_app_nest[DCB_APP_ATTR_MAX + 1] = { + [DCB_APP_ATTR_IDTYPE] = {.type = NLA_U8}, + [DCB_APP_ATTR_ID] = {.type = NLA_U16}, + [DCB_APP_ATTR_PRIORITY] = {.type = NLA_U8}, +}; + +/* IEEE 802.1Qaz nested attributes. */ +static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = { + [DCB_ATTR_IEEE_ETS] = {.len = sizeof(struct ieee_ets)}, + [DCB_ATTR_IEEE_PFC] = {.len = sizeof(struct ieee_pfc)}, + [DCB_ATTR_IEEE_APP_TABLE] = {.type = NLA_NESTED}, +}; + +static const struct nla_policy dcbnl_ieee_app[DCB_ATTR_IEEE_APP_MAX + 1] = { + [DCB_ATTR_IEEE_APP] = {.len = sizeof(struct dcb_app)}, +}; + +/* DCB number of traffic classes nested attributes. */ +static const struct nla_policy dcbnl_featcfg_nest[DCB_FEATCFG_ATTR_MAX + 1] = { + [DCB_FEATCFG_ATTR_ALL] = {.type = NLA_FLAG}, + [DCB_FEATCFG_ATTR_PG] = {.type = NLA_U8}, + [DCB_FEATCFG_ATTR_PFC] = {.type = NLA_U8}, + [DCB_FEATCFG_ATTR_APP] = {.type = NLA_U8}, +}; + +static LIST_HEAD(dcb_app_list); +static DEFINE_SPINLOCK(dcb_lock); + +/* standard netlink reply call */ +static int dcbnl_reply(u8 value, u8 event, u8 cmd, u8 attr, u32 pid, + u32 seq, u16 flags) +{ + struct sk_buff *dcbnl_skb; + struct dcbmsg *dcb; + struct nlmsghdr *nlh; + int ret = -EINVAL; + + dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!dcbnl_skb) + return ret; + + nlh = NLMSG_NEW(dcbnl_skb, pid, seq, event, sizeof(*dcb), flags); + + dcb = NLMSG_DATA(nlh); + dcb->dcb_family = AF_UNSPEC; + dcb->cmd = cmd; + dcb->dcb_pad = 0; + + ret = nla_put_u8(dcbnl_skb, attr, value); + if (ret) + goto err; + + /* end the message, assign the nlmsg_len. 
*/ + nlmsg_end(dcbnl_skb, nlh); + ret = rtnl_unicast(dcbnl_skb, &init_net, pid); + if (ret) + return -EINVAL; + + return 0; +nlmsg_failure: +err: + kfree_skb(dcbnl_skb); + return ret; +} + +static int dcbnl_getstate(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + int ret = -EINVAL; + + /* if (!tb[DCB_ATTR_STATE] || !netdev->dcbnl_ops->getstate) */ + if (!netdev->dcbnl_ops->getstate) + return ret; + + ret = dcbnl_reply(netdev->dcbnl_ops->getstate(netdev), RTM_GETDCB, + DCB_CMD_GSTATE, DCB_ATTR_STATE, pid, seq, flags); + + return ret; +} + +static int dcbnl_getpfccfg(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + struct sk_buff *dcbnl_skb; + struct nlmsghdr *nlh; + struct dcbmsg *dcb; + struct nlattr *data[DCB_PFC_UP_ATTR_MAX + 1], *nest; + u8 value; + int ret = -EINVAL; + int i; + int getall = 0; + + if (!tb[DCB_ATTR_PFC_CFG] || !netdev->dcbnl_ops->getpfccfg) + return ret; + + ret = nla_parse_nested(data, DCB_PFC_UP_ATTR_MAX, + tb[DCB_ATTR_PFC_CFG], + dcbnl_pfc_up_nest); + if (ret) + goto err_out; + + dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!dcbnl_skb) + goto err_out; + + nlh = NLMSG_NEW(dcbnl_skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags); + + dcb = NLMSG_DATA(nlh); + dcb->dcb_family = AF_UNSPEC; + dcb->cmd = DCB_CMD_PFC_GCFG; + + nest = nla_nest_start(dcbnl_skb, DCB_ATTR_PFC_CFG); + if (!nest) + goto err; + + if (data[DCB_PFC_UP_ATTR_ALL]) + getall = 1; + + for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) { + if (!getall && !data[i]) + continue; + + netdev->dcbnl_ops->getpfccfg(netdev, i - DCB_PFC_UP_ATTR_0, + &value); + ret = nla_put_u8(dcbnl_skb, i, value); + + if (ret) { + nla_nest_cancel(dcbnl_skb, nest); + goto err; + } + } + nla_nest_end(dcbnl_skb, nest); + + nlmsg_end(dcbnl_skb, nlh); + + ret = rtnl_unicast(dcbnl_skb, &init_net, pid); + if (ret) + goto err_out; + + return 0; +nlmsg_failure: +err: + kfree_skb(dcbnl_skb); +err_out: + return -EINVAL; +} + +static int dcbnl_getperm_hwaddr(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + struct sk_buff *dcbnl_skb; + struct nlmsghdr *nlh; + struct dcbmsg *dcb; + u8 perm_addr[MAX_ADDR_LEN]; + int ret = -EINVAL; + + if (!netdev->dcbnl_ops->getpermhwaddr) + return ret; + + dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!dcbnl_skb) + goto err_out; + + nlh = NLMSG_NEW(dcbnl_skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags); + + dcb = NLMSG_DATA(nlh); + dcb->dcb_family = AF_UNSPEC; + dcb->cmd = DCB_CMD_GPERM_HWADDR; + + netdev->dcbnl_ops->getpermhwaddr(netdev, perm_addr); + + ret = nla_put(dcbnl_skb, DCB_ATTR_PERM_HWADDR, sizeof(perm_addr), + perm_addr); + + nlmsg_end(dcbnl_skb, nlh); + + ret = rtnl_unicast(dcbnl_skb, &init_net, pid); + if (ret) + goto err_out; + + return 0; + +nlmsg_failure: + kfree_skb(dcbnl_skb); +err_out: + return -EINVAL; +} + +static int dcbnl_getcap(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + struct sk_buff *dcbnl_skb; + struct nlmsghdr *nlh; + struct dcbmsg *dcb; + struct nlattr *data[DCB_CAP_ATTR_MAX + 1], *nest; + u8 value; + int ret = -EINVAL; + int i; + int getall = 0; + + if (!tb[DCB_ATTR_CAP] || !netdev->dcbnl_ops->getcap) + return ret; + + ret = nla_parse_nested(data, DCB_CAP_ATTR_MAX, tb[DCB_ATTR_CAP], + dcbnl_cap_nest); + if (ret) + goto err_out; + + dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!dcbnl_skb) + goto err_out; + + nlh = NLMSG_NEW(dcbnl_skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags); + + dcb = 
NLMSG_DATA(nlh); + dcb->dcb_family = AF_UNSPEC; + dcb->cmd = DCB_CMD_GCAP; + + nest = nla_nest_start(dcbnl_skb, DCB_ATTR_CAP); + if (!nest) + goto err; + + if (data[DCB_CAP_ATTR_ALL]) + getall = 1; + + for (i = DCB_CAP_ATTR_ALL+1; i <= DCB_CAP_ATTR_MAX; i++) { + if (!getall && !data[i]) + continue; + + if (!netdev->dcbnl_ops->getcap(netdev, i, &value)) { + ret = nla_put_u8(dcbnl_skb, i, value); + + if (ret) { + nla_nest_cancel(dcbnl_skb, nest); + goto err; + } + } + } + nla_nest_end(dcbnl_skb, nest); + + nlmsg_end(dcbnl_skb, nlh); + + ret = rtnl_unicast(dcbnl_skb, &init_net, pid); + if (ret) + goto err_out; + + return 0; +nlmsg_failure: +err: + kfree_skb(dcbnl_skb); +err_out: + return -EINVAL; +} + +static int dcbnl_getnumtcs(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + struct sk_buff *dcbnl_skb; + struct nlmsghdr *nlh; + struct dcbmsg *dcb; + struct nlattr *data[DCB_NUMTCS_ATTR_MAX + 1], *nest; + u8 value; + int ret = -EINVAL; + int i; + int getall = 0; + + if (!tb[DCB_ATTR_NUMTCS] || !netdev->dcbnl_ops->getnumtcs) + return ret; + + ret = nla_parse_nested(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS], + dcbnl_numtcs_nest); + if (ret) { + ret = -EINVAL; + goto err_out; + } + + dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!dcbnl_skb) { + ret = -EINVAL; + goto err_out; + } + + nlh = NLMSG_NEW(dcbnl_skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags); + + dcb = NLMSG_DATA(nlh); + dcb->dcb_family = AF_UNSPEC; + dcb->cmd = DCB_CMD_GNUMTCS; + + nest = nla_nest_start(dcbnl_skb, DCB_ATTR_NUMTCS); + if (!nest) { + ret = -EINVAL; + goto err; + } + + if (data[DCB_NUMTCS_ATTR_ALL]) + getall = 1; + + for (i = DCB_NUMTCS_ATTR_ALL+1; i <= DCB_NUMTCS_ATTR_MAX; i++) { + if (!getall && !data[i]) + continue; + + ret = netdev->dcbnl_ops->getnumtcs(netdev, i, &value); + if (!ret) { + ret = nla_put_u8(dcbnl_skb, i, value); + + if (ret) { + nla_nest_cancel(dcbnl_skb, nest); + ret = -EINVAL; + goto err; + } + } else { + goto err; + } + } + nla_nest_end(dcbnl_skb, nest); + + nlmsg_end(dcbnl_skb, nlh); + + ret = rtnl_unicast(dcbnl_skb, &init_net, pid); + if (ret) { + ret = -EINVAL; + goto err_out; + } + + return 0; +nlmsg_failure: +err: + kfree_skb(dcbnl_skb); +err_out: + return ret; +} + +static int dcbnl_setnumtcs(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + struct nlattr *data[DCB_NUMTCS_ATTR_MAX + 1]; + int ret = -EINVAL; + u8 value; + int i; + + if (!tb[DCB_ATTR_NUMTCS] || !netdev->dcbnl_ops->setnumtcs) + return ret; + + ret = nla_parse_nested(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS], + dcbnl_numtcs_nest); + + if (ret) { + ret = -EINVAL; + goto err; + } + + for (i = DCB_NUMTCS_ATTR_ALL+1; i <= DCB_NUMTCS_ATTR_MAX; i++) { + if (data[i] == NULL) + continue; + + value = nla_get_u8(data[i]); + + ret = netdev->dcbnl_ops->setnumtcs(netdev, i, value); + + if (ret) + goto operr; + } + +operr: + ret = dcbnl_reply(!!ret, RTM_SETDCB, DCB_CMD_SNUMTCS, + DCB_ATTR_NUMTCS, pid, seq, flags); + +err: + return ret; +} + +static int dcbnl_getpfcstate(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + int ret = -EINVAL; + + if (!netdev->dcbnl_ops->getpfcstate) + return ret; + + ret = dcbnl_reply(netdev->dcbnl_ops->getpfcstate(netdev), RTM_GETDCB, + DCB_CMD_PFC_GSTATE, DCB_ATTR_PFC_STATE, + pid, seq, flags); + + return ret; +} + +static int dcbnl_setpfcstate(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + int ret = -EINVAL; + u8 value; + + if (!tb[DCB_ATTR_PFC_STATE] || 
!netdev->dcbnl_ops->setpfcstate) + return ret; + + value = nla_get_u8(tb[DCB_ATTR_PFC_STATE]); + + netdev->dcbnl_ops->setpfcstate(netdev, value); + + ret = dcbnl_reply(0, RTM_SETDCB, DCB_CMD_PFC_SSTATE, DCB_ATTR_PFC_STATE, + pid, seq, flags); + + return ret; +} + +static int dcbnl_getapp(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + struct sk_buff *dcbnl_skb; + struct nlmsghdr *nlh; + struct dcbmsg *dcb; + struct nlattr *app_nest; + struct nlattr *app_tb[DCB_APP_ATTR_MAX + 1]; + u16 id; + u8 up, idtype; + int ret = -EINVAL; + + if (!tb[DCB_ATTR_APP]) + goto out; + + ret = nla_parse_nested(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP], + dcbnl_app_nest); + if (ret) + goto out; + + ret = -EINVAL; + /* all must be non-null */ + if ((!app_tb[DCB_APP_ATTR_IDTYPE]) || + (!app_tb[DCB_APP_ATTR_ID])) + goto out; + + /* either by eth type or by socket number */ + idtype = nla_get_u8(app_tb[DCB_APP_ATTR_IDTYPE]); + if ((idtype != DCB_APP_IDTYPE_ETHTYPE) && + (idtype != DCB_APP_IDTYPE_PORTNUM)) + goto out; + + id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]); + + if (netdev->dcbnl_ops->getapp) { + up = netdev->dcbnl_ops->getapp(netdev, idtype, id); + } else { + struct dcb_app app = { + .selector = idtype, + .protocol = id, + }; + up = dcb_getapp(netdev, &app); + } + + /* send this back */ + dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!dcbnl_skb) + goto out; + + nlh = NLMSG_NEW(dcbnl_skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags); + dcb = NLMSG_DATA(nlh); + dcb->dcb_family = AF_UNSPEC; + dcb->cmd = DCB_CMD_GAPP; + + app_nest = nla_nest_start(dcbnl_skb, DCB_ATTR_APP); + if (!app_nest) + goto out_cancel; + + ret = nla_put_u8(dcbnl_skb, DCB_APP_ATTR_IDTYPE, idtype); + if (ret) + goto out_cancel; + + ret = nla_put_u16(dcbnl_skb, DCB_APP_ATTR_ID, id); + if (ret) + goto out_cancel; + + ret = nla_put_u8(dcbnl_skb, DCB_APP_ATTR_PRIORITY, up); + if (ret) + goto out_cancel; + + nla_nest_end(dcbnl_skb, app_nest); + nlmsg_end(dcbnl_skb, nlh); + + ret = rtnl_unicast(dcbnl_skb, &init_net, pid); + if (ret) + goto nlmsg_failure; + + goto out; + +out_cancel: + nla_nest_cancel(dcbnl_skb, app_nest); +nlmsg_failure: + kfree_skb(dcbnl_skb); +out: + return ret; +} + +static int dcbnl_setapp(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + int err, ret = -EINVAL; + u16 id; + u8 up, idtype; + struct nlattr *app_tb[DCB_APP_ATTR_MAX + 1]; + + if (!tb[DCB_ATTR_APP]) + goto out; + + ret = nla_parse_nested(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP], + dcbnl_app_nest); + if (ret) + goto out; + + ret = -EINVAL; + /* all must be non-null */ + if ((!app_tb[DCB_APP_ATTR_IDTYPE]) || + (!app_tb[DCB_APP_ATTR_ID]) || + (!app_tb[DCB_APP_ATTR_PRIORITY])) + goto out; + + /* either by eth type or by socket number */ + idtype = nla_get_u8(app_tb[DCB_APP_ATTR_IDTYPE]); + if ((idtype != DCB_APP_IDTYPE_ETHTYPE) && + (idtype != DCB_APP_IDTYPE_PORTNUM)) + goto out; + + id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]); + up = nla_get_u8(app_tb[DCB_APP_ATTR_PRIORITY]); + + if (netdev->dcbnl_ops->setapp) { + err = netdev->dcbnl_ops->setapp(netdev, idtype, id, up); + } else { + struct dcb_app app; + app.selector = idtype; + app.protocol = id; + app.priority = up; + err = dcb_setapp(netdev, &app); + } + + ret = dcbnl_reply(err, RTM_SETDCB, DCB_CMD_SAPP, DCB_ATTR_APP, + pid, seq, flags); +out: + return ret; +} + +static int __dcbnl_pg_getcfg(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags, int dir) +{ + struct sk_buff *dcbnl_skb; + struct nlmsghdr *nlh; + 
struct dcbmsg *dcb; + struct nlattr *pg_nest, *param_nest, *data; + struct nlattr *pg_tb[DCB_PG_ATTR_MAX + 1]; + struct nlattr *param_tb[DCB_TC_ATTR_PARAM_MAX + 1]; + u8 prio, pgid, tc_pct, up_map; + int ret = -EINVAL; + int getall = 0; + int i; + + if (!tb[DCB_ATTR_PG_CFG] || + !netdev->dcbnl_ops->getpgtccfgtx || + !netdev->dcbnl_ops->getpgtccfgrx || + !netdev->dcbnl_ops->getpgbwgcfgtx || + !netdev->dcbnl_ops->getpgbwgcfgrx) + return ret; + + ret = nla_parse_nested(pg_tb, DCB_PG_ATTR_MAX, + tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest); + + if (ret) + goto err_out; + + dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!dcbnl_skb) + goto err_out; + + nlh = NLMSG_NEW(dcbnl_skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags); + + dcb = NLMSG_DATA(nlh); + dcb->dcb_family = AF_UNSPEC; + dcb->cmd = (dir) ? DCB_CMD_PGRX_GCFG : DCB_CMD_PGTX_GCFG; + + pg_nest = nla_nest_start(dcbnl_skb, DCB_ATTR_PG_CFG); + if (!pg_nest) + goto err; + + if (pg_tb[DCB_PG_ATTR_TC_ALL]) + getall = 1; + + for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) { + if (!getall && !pg_tb[i]) + continue; + + if (pg_tb[DCB_PG_ATTR_TC_ALL]) + data = pg_tb[DCB_PG_ATTR_TC_ALL]; + else + data = pg_tb[i]; + ret = nla_parse_nested(param_tb, DCB_TC_ATTR_PARAM_MAX, + data, dcbnl_tc_param_nest); + if (ret) + goto err_pg; + + param_nest = nla_nest_start(dcbnl_skb, i); + if (!param_nest) + goto err_pg; + + pgid = DCB_ATTR_VALUE_UNDEFINED; + prio = DCB_ATTR_VALUE_UNDEFINED; + tc_pct = DCB_ATTR_VALUE_UNDEFINED; + up_map = DCB_ATTR_VALUE_UNDEFINED; + + if (dir) { + /* Rx */ + netdev->dcbnl_ops->getpgtccfgrx(netdev, + i - DCB_PG_ATTR_TC_0, &prio, + &pgid, &tc_pct, &up_map); + } else { + /* Tx */ + netdev->dcbnl_ops->getpgtccfgtx(netdev, + i - DCB_PG_ATTR_TC_0, &prio, + &pgid, &tc_pct, &up_map); + } + + if (param_tb[DCB_TC_ATTR_PARAM_PGID] || + param_tb[DCB_TC_ATTR_PARAM_ALL]) { + ret = nla_put_u8(dcbnl_skb, + DCB_TC_ATTR_PARAM_PGID, pgid); + if (ret) + goto err_param; + } + if (param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING] || + param_tb[DCB_TC_ATTR_PARAM_ALL]) { + ret = nla_put_u8(dcbnl_skb, + DCB_TC_ATTR_PARAM_UP_MAPPING, up_map); + if (ret) + goto err_param; + } + if (param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO] || + param_tb[DCB_TC_ATTR_PARAM_ALL]) { + ret = nla_put_u8(dcbnl_skb, + DCB_TC_ATTR_PARAM_STRICT_PRIO, prio); + if (ret) + goto err_param; + } + if (param_tb[DCB_TC_ATTR_PARAM_BW_PCT] || + param_tb[DCB_TC_ATTR_PARAM_ALL]) { + ret = nla_put_u8(dcbnl_skb, DCB_TC_ATTR_PARAM_BW_PCT, + tc_pct); + if (ret) + goto err_param; + } + nla_nest_end(dcbnl_skb, param_nest); + } + + if (pg_tb[DCB_PG_ATTR_BW_ID_ALL]) + getall = 1; + else + getall = 0; + + for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) { + if (!getall && !pg_tb[i]) + continue; + + tc_pct = DCB_ATTR_VALUE_UNDEFINED; + + if (dir) { + /* Rx */ + netdev->dcbnl_ops->getpgbwgcfgrx(netdev, + i - DCB_PG_ATTR_BW_ID_0, &tc_pct); + } else { + /* Tx */ + netdev->dcbnl_ops->getpgbwgcfgtx(netdev, + i - DCB_PG_ATTR_BW_ID_0, &tc_pct); + } + ret = nla_put_u8(dcbnl_skb, i, tc_pct); + + if (ret) + goto err_pg; + } + + nla_nest_end(dcbnl_skb, pg_nest); + + nlmsg_end(dcbnl_skb, nlh); + + ret = rtnl_unicast(dcbnl_skb, &init_net, pid); + if (ret) + goto err_out; + + return 0; + +err_param: + nla_nest_cancel(dcbnl_skb, param_nest); +err_pg: + nla_nest_cancel(dcbnl_skb, pg_nest); +nlmsg_failure: +err: + kfree_skb(dcbnl_skb); +err_out: + ret = -EINVAL; + return ret; +} + +static int dcbnl_pgtx_getcfg(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + return 
__dcbnl_pg_getcfg(netdev, tb, pid, seq, flags, 0); +} + +static int dcbnl_pgrx_getcfg(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + return __dcbnl_pg_getcfg(netdev, tb, pid, seq, flags, 1); +} + +static int dcbnl_setstate(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + int ret = -EINVAL; + u8 value; + + if (!tb[DCB_ATTR_STATE] || !netdev->dcbnl_ops->setstate) + return ret; + + value = nla_get_u8(tb[DCB_ATTR_STATE]); + + ret = dcbnl_reply(netdev->dcbnl_ops->setstate(netdev, value), + RTM_SETDCB, DCB_CMD_SSTATE, DCB_ATTR_STATE, + pid, seq, flags); + + return ret; +} + +static int dcbnl_setpfccfg(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + struct nlattr *data[DCB_PFC_UP_ATTR_MAX + 1]; + int i; + int ret = -EINVAL; + u8 value; + + if (!tb[DCB_ATTR_PFC_CFG] || !netdev->dcbnl_ops->setpfccfg) + return ret; + + ret = nla_parse_nested(data, DCB_PFC_UP_ATTR_MAX, + tb[DCB_ATTR_PFC_CFG], + dcbnl_pfc_up_nest); + if (ret) + goto err; + + for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) { + if (data[i] == NULL) + continue; + value = nla_get_u8(data[i]); + netdev->dcbnl_ops->setpfccfg(netdev, + data[i]->nla_type - DCB_PFC_UP_ATTR_0, value); + } + + ret = dcbnl_reply(0, RTM_SETDCB, DCB_CMD_PFC_SCFG, DCB_ATTR_PFC_CFG, + pid, seq, flags); +err: + return ret; +} + +static int dcbnl_setall(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + int ret = -EINVAL; + + if (!tb[DCB_ATTR_SET_ALL] || !netdev->dcbnl_ops->setall) + return ret; + + ret = dcbnl_reply(netdev->dcbnl_ops->setall(netdev), RTM_SETDCB, + DCB_CMD_SET_ALL, DCB_ATTR_SET_ALL, pid, seq, flags); + + return ret; +} + +static int __dcbnl_pg_setcfg(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags, int dir) +{ + struct nlattr *pg_tb[DCB_PG_ATTR_MAX + 1]; + struct nlattr *param_tb[DCB_TC_ATTR_PARAM_MAX + 1]; + int ret = -EINVAL; + int i; + u8 pgid; + u8 up_map; + u8 prio; + u8 tc_pct; + + if (!tb[DCB_ATTR_PG_CFG] || + !netdev->dcbnl_ops->setpgtccfgtx || + !netdev->dcbnl_ops->setpgtccfgrx || + !netdev->dcbnl_ops->setpgbwgcfgtx || + !netdev->dcbnl_ops->setpgbwgcfgrx) + return ret; + + ret = nla_parse_nested(pg_tb, DCB_PG_ATTR_MAX, + tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest); + if (ret) + goto err; + + for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) { + if (!pg_tb[i]) + continue; + + ret = nla_parse_nested(param_tb, DCB_TC_ATTR_PARAM_MAX, + pg_tb[i], dcbnl_tc_param_nest); + if (ret) + goto err; + + pgid = DCB_ATTR_VALUE_UNDEFINED; + prio = DCB_ATTR_VALUE_UNDEFINED; + tc_pct = DCB_ATTR_VALUE_UNDEFINED; + up_map = DCB_ATTR_VALUE_UNDEFINED; + + if (param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO]) + prio = + nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO]); + + if (param_tb[DCB_TC_ATTR_PARAM_PGID]) + pgid = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_PGID]); + + if (param_tb[DCB_TC_ATTR_PARAM_BW_PCT]) + tc_pct = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_BW_PCT]); + + if (param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING]) + up_map = + nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING]); + + /* dir: Tx = 0, Rx = 1 */ + if (dir) { + /* Rx */ + netdev->dcbnl_ops->setpgtccfgrx(netdev, + i - DCB_PG_ATTR_TC_0, + prio, pgid, tc_pct, up_map); + } else { + /* Tx */ + netdev->dcbnl_ops->setpgtccfgtx(netdev, + i - DCB_PG_ATTR_TC_0, + prio, pgid, tc_pct, up_map); + } + } + + for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) { + if (!pg_tb[i]) + continue; + + tc_pct = nla_get_u8(pg_tb[i]); + + /* dir: Tx = 0, Rx = 1 
*/ + if (dir) { + /* Rx */ + netdev->dcbnl_ops->setpgbwgcfgrx(netdev, + i - DCB_PG_ATTR_BW_ID_0, tc_pct); + } else { + /* Tx */ + netdev->dcbnl_ops->setpgbwgcfgtx(netdev, + i - DCB_PG_ATTR_BW_ID_0, tc_pct); + } + } + + ret = dcbnl_reply(0, RTM_SETDCB, + (dir ? DCB_CMD_PGRX_SCFG : DCB_CMD_PGTX_SCFG), + DCB_ATTR_PG_CFG, pid, seq, flags); + +err: + return ret; +} + +static int dcbnl_pgtx_setcfg(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + return __dcbnl_pg_setcfg(netdev, tb, pid, seq, flags, 0); +} + +static int dcbnl_pgrx_setcfg(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + return __dcbnl_pg_setcfg(netdev, tb, pid, seq, flags, 1); +} + +static int dcbnl_bcn_getcfg(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + struct sk_buff *dcbnl_skb; + struct nlmsghdr *nlh; + struct dcbmsg *dcb; + struct nlattr *bcn_nest; + struct nlattr *bcn_tb[DCB_BCN_ATTR_MAX + 1]; + u8 value_byte; + u32 value_integer; + int ret = -EINVAL; + bool getall = false; + int i; + + if (!tb[DCB_ATTR_BCN] || !netdev->dcbnl_ops->getbcnrp || + !netdev->dcbnl_ops->getbcncfg) + return ret; + + ret = nla_parse_nested(bcn_tb, DCB_BCN_ATTR_MAX, + tb[DCB_ATTR_BCN], dcbnl_bcn_nest); + + if (ret) + goto err_out; + + dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!dcbnl_skb) + goto err_out; + + nlh = NLMSG_NEW(dcbnl_skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags); + + dcb = NLMSG_DATA(nlh); + dcb->dcb_family = AF_UNSPEC; + dcb->cmd = DCB_CMD_BCN_GCFG; + + bcn_nest = nla_nest_start(dcbnl_skb, DCB_ATTR_BCN); + if (!bcn_nest) + goto err; + + if (bcn_tb[DCB_BCN_ATTR_ALL]) + getall = true; + + for (i = DCB_BCN_ATTR_RP_0; i <= DCB_BCN_ATTR_RP_7; i++) { + if (!getall && !bcn_tb[i]) + continue; + + netdev->dcbnl_ops->getbcnrp(netdev, i - DCB_BCN_ATTR_RP_0, + &value_byte); + ret = nla_put_u8(dcbnl_skb, i, value_byte); + if (ret) + goto err_bcn; + } + + for (i = DCB_BCN_ATTR_BCNA_0; i <= DCB_BCN_ATTR_RI; i++) { + if (!getall && !bcn_tb[i]) + continue; + + netdev->dcbnl_ops->getbcncfg(netdev, i, + &value_integer); + ret = nla_put_u32(dcbnl_skb, i, value_integer); + if (ret) + goto err_bcn; + } + + nla_nest_end(dcbnl_skb, bcn_nest); + + nlmsg_end(dcbnl_skb, nlh); + + ret = rtnl_unicast(dcbnl_skb, &init_net, pid); + if (ret) + goto err_out; + + return 0; + +err_bcn: + nla_nest_cancel(dcbnl_skb, bcn_nest); +nlmsg_failure: +err: + kfree_skb(dcbnl_skb); +err_out: + ret = -EINVAL; + return ret; +} + +static int dcbnl_bcn_setcfg(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + struct nlattr *data[DCB_BCN_ATTR_MAX + 1]; + int i; + int ret = -EINVAL; + u8 value_byte; + u32 value_int; + + if (!tb[DCB_ATTR_BCN] || !netdev->dcbnl_ops->setbcncfg || + !netdev->dcbnl_ops->setbcnrp) + return ret; + + ret = nla_parse_nested(data, DCB_BCN_ATTR_MAX, + tb[DCB_ATTR_BCN], + dcbnl_pfc_up_nest); + if (ret) + goto err; + + for (i = DCB_BCN_ATTR_RP_0; i <= DCB_BCN_ATTR_RP_7; i++) { + if (data[i] == NULL) + continue; + value_byte = nla_get_u8(data[i]); + netdev->dcbnl_ops->setbcnrp(netdev, + data[i]->nla_type - DCB_BCN_ATTR_RP_0, value_byte); + } + + for (i = DCB_BCN_ATTR_BCNA_0; i <= DCB_BCN_ATTR_RI; i++) { + if (data[i] == NULL) + continue; + value_int = nla_get_u32(data[i]); + netdev->dcbnl_ops->setbcncfg(netdev, + i, value_int); + } + + ret = dcbnl_reply(0, RTM_SETDCB, DCB_CMD_BCN_SCFG, DCB_ATTR_BCN, + pid, seq, flags); +err: + return ret; +} + +/* Handle IEEE 802.1Qaz SET commands. 
If any requested operation cannot + * be completed the entire msg is aborted and an error value is returned. + * No attempt is made to reconcile the case where only part of the + * cmd can be completed. + */ +static int dcbnl_ieee_set(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; + struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1]; + int err = -EOPNOTSUPP; + + if (!ops) + goto err; + + err = nla_parse_nested(ieee, DCB_ATTR_IEEE_MAX, + tb[DCB_ATTR_IEEE], dcbnl_ieee_policy); + if (err) + goto err; + + if (ieee[DCB_ATTR_IEEE_ETS] && ops->ieee_setets) { + struct ieee_ets *ets = nla_data(ieee[DCB_ATTR_IEEE_ETS]); + err = ops->ieee_setets(netdev, ets); + if (err) + goto err; + } + + if (ieee[DCB_ATTR_IEEE_PFC] && ops->ieee_setpfc) { + struct ieee_pfc *pfc = nla_data(ieee[DCB_ATTR_IEEE_PFC]); + err = ops->ieee_setpfc(netdev, pfc); + if (err) + goto err; + } + + if (ieee[DCB_ATTR_IEEE_APP_TABLE]) { + struct nlattr *attr; + int rem; + + nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) { + struct dcb_app *app_data; + if (nla_type(attr) != DCB_ATTR_IEEE_APP) + continue; + app_data = nla_data(attr); + if (ops->ieee_setapp) + err = ops->ieee_setapp(netdev, app_data); + else + err = dcb_setapp(netdev, app_data); + if (err) + goto err; + } + } + +err: + dcbnl_reply(err, RTM_SETDCB, DCB_CMD_IEEE_SET, DCB_ATTR_IEEE, + pid, seq, flags); + return err; +} + +static int dcbnl_build_peer_app(struct net_device *netdev, struct sk_buff* skb, + int app_nested_type, int app_info_type, + int app_entry_type) +{ + struct dcb_peer_app_info info; + struct dcb_app *table = NULL; + const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; + u16 app_count; + int err; + + + /** + * retrieve the peer app configuration from the driver. If the driver + * handlers fail, exit without doing anything + */ + err = ops->peer_getappinfo(netdev, &info, &app_count); + if (!err && app_count) { + table = kmalloc(sizeof(struct dcb_app) * app_count, GFP_KERNEL); + if (!table) + return -ENOMEM; + + err = ops->peer_getapptable(netdev, table); + } + + if (!err) { + u16 i; + struct nlattr *app; + + /** + * build the message, from here on the only possible failure + * is due to the skb size + */ + err = -EMSGSIZE; + + app = nla_nest_start(skb, app_nested_type); + if (!app) + goto nla_put_failure; + + if (app_info_type) + NLA_PUT(skb, app_info_type, sizeof(info), &info); + + for (i = 0; i < app_count; i++) + NLA_PUT(skb, app_entry_type, sizeof(struct dcb_app), + &table[i]); + + nla_nest_end(skb, app); + } + err = 0; + +nla_put_failure: + kfree(table); + return err; +} + +/* Handle IEEE 802.1Qaz GET commands.
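+ * The reply mirrors the SET path above: a DCB_ATTR_IEEE nest carrying the driver's struct ieee_ets and struct ieee_pfc, a DCB_ATTR_IEEE_APP_TABLE nest with one DCB_ATTR_IEEE_APP entry per struct dcb_app registered for this device, and, when the driver provides the hooks, the corresponding peer ETS/PFC/APP information.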
*/ +static int dcbnl_ieee_get(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + struct sk_buff *skb; + struct nlmsghdr *nlh; + struct dcbmsg *dcb; + struct nlattr *ieee, *app; + struct dcb_app_type *itr; + const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; + int err; + + if (!ops) + return -EOPNOTSUPP; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + nlh = NLMSG_NEW(skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags); + + dcb = NLMSG_DATA(nlh); + dcb->dcb_family = AF_UNSPEC; + dcb->cmd = DCB_CMD_IEEE_GET; + + NLA_PUT_STRING(skb, DCB_ATTR_IFNAME, netdev->name); + + ieee = nla_nest_start(skb, DCB_ATTR_IEEE); + if (!ieee) + goto nla_put_failure; + + if (ops->ieee_getets) { + struct ieee_ets ets; + err = ops->ieee_getets(netdev, &ets); + if (!err) + NLA_PUT(skb, DCB_ATTR_IEEE_ETS, sizeof(ets), &ets); + } + + if (ops->ieee_getpfc) { + struct ieee_pfc pfc; + err = ops->ieee_getpfc(netdev, &pfc); + if (!err) + NLA_PUT(skb, DCB_ATTR_IEEE_PFC, sizeof(pfc), &pfc); + } + + app = nla_nest_start(skb, DCB_ATTR_IEEE_APP_TABLE); + if (!app) + goto nla_put_failure; + + spin_lock(&dcb_lock); + list_for_each_entry(itr, &dcb_app_list, list) { + if (strncmp(itr->name, netdev->name, IFNAMSIZ) == 0) { + err = nla_put(skb, DCB_ATTR_IEEE_APP, sizeof(itr->app), + &itr->app); + if (err) { + spin_unlock(&dcb_lock); + goto nla_put_failure; + } + } + } + spin_unlock(&dcb_lock); + nla_nest_end(skb, app); + + /* get peer info if available */ + if (ops->ieee_peer_getets) { + struct ieee_ets ets; + err = ops->ieee_peer_getets(netdev, &ets); + if (!err) + NLA_PUT(skb, DCB_ATTR_IEEE_PEER_ETS, sizeof(ets), &ets); + } + + if (ops->ieee_peer_getpfc) { + struct ieee_pfc pfc; + err = ops->ieee_peer_getpfc(netdev, &pfc); + if (!err) + NLA_PUT(skb, DCB_ATTR_IEEE_PEER_PFC, sizeof(pfc), &pfc); + } + + if (ops->peer_getappinfo && ops->peer_getapptable) { + err = dcbnl_build_peer_app(netdev, skb, + DCB_ATTR_IEEE_PEER_APP, + DCB_ATTR_IEEE_APP_UNSPEC, + DCB_ATTR_IEEE_APP); + if (err) + goto nla_put_failure; + } + + nla_nest_end(skb, ieee); + nlmsg_end(skb, nlh); + + return rtnl_unicast(skb, &init_net, pid); +nla_put_failure: + nlmsg_cancel(skb, nlh); +nlmsg_failure: + kfree_skb(skb); + return -1; +} + +/* DCBX configuration */ +static int dcbnl_getdcbx(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + int ret; + + if (!netdev->dcbnl_ops->getdcbx) + return -EOPNOTSUPP; + + ret = dcbnl_reply(netdev->dcbnl_ops->getdcbx(netdev), RTM_GETDCB, + DCB_CMD_GDCBX, DCB_ATTR_DCBX, pid, seq, flags); + + return ret; +} + +static int dcbnl_setdcbx(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + int ret; + u8 value; + + if (!netdev->dcbnl_ops->setdcbx) + return -EOPNOTSUPP; + + if (!tb[DCB_ATTR_DCBX]) + return -EINVAL; + + value = nla_get_u8(tb[DCB_ATTR_DCBX]); + + ret = dcbnl_reply(netdev->dcbnl_ops->setdcbx(netdev, value), + RTM_SETDCB, DCB_CMD_SDCBX, DCB_ATTR_DCBX, + pid, seq, flags); + + return ret; +} + +static int dcbnl_getfeatcfg(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + struct sk_buff *dcbnl_skb; + struct nlmsghdr *nlh; + struct dcbmsg *dcb; + struct nlattr *data[DCB_FEATCFG_ATTR_MAX + 1], *nest; + u8 value; + int ret, i; + int getall = 0; + + if (!netdev->dcbnl_ops->getfeatcfg) + return -EOPNOTSUPP; + + if (!tb[DCB_ATTR_FEATCFG]) + return -EINVAL; + + ret = nla_parse_nested(data, DCB_FEATCFG_ATTR_MAX, tb[DCB_ATTR_FEATCFG], + dcbnl_featcfg_nest); + if (ret) + goto 
err_out; + + dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!dcbnl_skb) { + ret = -ENOBUFS; + goto err_out; + } + + nlh = NLMSG_NEW(dcbnl_skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags); + + dcb = NLMSG_DATA(nlh); + dcb->dcb_family = AF_UNSPEC; + dcb->cmd = DCB_CMD_GFEATCFG; + + nest = nla_nest_start(dcbnl_skb, DCB_ATTR_FEATCFG); + if (!nest) { + ret = -EMSGSIZE; + goto nla_put_failure; + } + + if (data[DCB_FEATCFG_ATTR_ALL]) + getall = 1; + + for (i = DCB_FEATCFG_ATTR_ALL+1; i <= DCB_FEATCFG_ATTR_MAX; i++) { + if (!getall && !data[i]) + continue; + + ret = netdev->dcbnl_ops->getfeatcfg(netdev, i, &value); + if (!ret) + ret = nla_put_u8(dcbnl_skb, i, value); + + if (ret) { + nla_nest_cancel(dcbnl_skb, nest); + goto nla_put_failure; + } + } + nla_nest_end(dcbnl_skb, nest); + + nlmsg_end(dcbnl_skb, nlh); + + return rtnl_unicast(dcbnl_skb, &init_net, pid); +nla_put_failure: + nlmsg_cancel(dcbnl_skb, nlh); +nlmsg_failure: + kfree_skb(dcbnl_skb); +err_out: + return ret; +} + +static int dcbnl_setfeatcfg(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + struct nlattr *data[DCB_FEATCFG_ATTR_MAX + 1]; + int ret, i; + u8 value; + + if (!netdev->dcbnl_ops->setfeatcfg) + return -ENOTSUPP; + + if (!tb[DCB_ATTR_FEATCFG]) + return -EINVAL; + + ret = nla_parse_nested(data, DCB_FEATCFG_ATTR_MAX, tb[DCB_ATTR_FEATCFG], + dcbnl_featcfg_nest); + + if (ret) + goto err; + + for (i = DCB_FEATCFG_ATTR_ALL+1; i <= DCB_FEATCFG_ATTR_MAX; i++) { + if (data[i] == NULL) + continue; + + value = nla_get_u8(data[i]); + + ret = netdev->dcbnl_ops->setfeatcfg(netdev, i, value); + + if (ret) + goto err; + } +err: + dcbnl_reply(ret, RTM_SETDCB, DCB_CMD_SFEATCFG, DCB_ATTR_FEATCFG, + pid, seq, flags); + + return ret; +} + +/* Handle CEE DCBX GET commands. */ +static int dcbnl_cee_get(struct net_device *netdev, struct nlattr **tb, + u32 pid, u32 seq, u16 flags) +{ + struct sk_buff *skb; + struct nlmsghdr *nlh; + struct dcbmsg *dcb; + struct nlattr *cee; + const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; + int err; + + if (!ops) + return -EOPNOTSUPP; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + nlh = NLMSG_NEW(skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags); + + dcb = NLMSG_DATA(nlh); + dcb->dcb_family = AF_UNSPEC; + dcb->cmd = DCB_CMD_CEE_GET; + + NLA_PUT_STRING(skb, DCB_ATTR_IFNAME, netdev->name); + + cee = nla_nest_start(skb, DCB_ATTR_CEE); + if (!cee) + goto nla_put_failure; + + /* get peer info if available */ + if (ops->cee_peer_getpg) { + struct cee_pg pg; + err = ops->cee_peer_getpg(netdev, &pg); + if (!err) + NLA_PUT(skb, DCB_ATTR_CEE_PEER_PG, sizeof(pg), &pg); + } + + if (ops->cee_peer_getpfc) { + struct cee_pfc pfc; + err = ops->cee_peer_getpfc(netdev, &pfc); + if (!err) + NLA_PUT(skb, DCB_ATTR_CEE_PEER_PFC, sizeof(pfc), &pfc); + } + + if (ops->peer_getappinfo && ops->peer_getapptable) { + err = dcbnl_build_peer_app(netdev, skb, + DCB_ATTR_CEE_PEER_APP_TABLE, + DCB_ATTR_CEE_PEER_APP_INFO, + DCB_ATTR_CEE_PEER_APP); + if (err) + goto nla_put_failure; + } + + nla_nest_end(skb, cee); + nlmsg_end(skb, nlh); + + return rtnl_unicast(skb, &init_net, pid); +nla_put_failure: + nlmsg_cancel(skb, nlh); +nlmsg_failure: + kfree_skb(skb); + return -1; +} + +static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct net_device *netdev; + struct dcbmsg *dcb = (struct dcbmsg *)NLMSG_DATA(nlh); + struct nlattr *tb[DCB_ATTR_MAX + 1]; + u32 pid = skb ? 
NETLINK_CB(skb).pid : 0; + int ret = -EINVAL; + + if (!net_eq(net, &init_net)) + return -EINVAL; + + ret = nlmsg_parse(nlh, sizeof(*dcb), tb, DCB_ATTR_MAX, + dcbnl_rtnl_policy); + if (ret < 0) + return ret; + + if (!tb[DCB_ATTR_IFNAME]) + return -EINVAL; + + netdev = dev_get_by_name(&init_net, nla_data(tb[DCB_ATTR_IFNAME])); + if (!netdev) + return -EINVAL; + + if (!netdev->dcbnl_ops) + goto errout; + + switch (dcb->cmd) { + case DCB_CMD_GSTATE: + ret = dcbnl_getstate(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_PFC_GCFG: + ret = dcbnl_getpfccfg(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_GPERM_HWADDR: + ret = dcbnl_getperm_hwaddr(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_PGTX_GCFG: + ret = dcbnl_pgtx_getcfg(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_PGRX_GCFG: + ret = dcbnl_pgrx_getcfg(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_BCN_GCFG: + ret = dcbnl_bcn_getcfg(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_SSTATE: + ret = dcbnl_setstate(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_PFC_SCFG: + ret = dcbnl_setpfccfg(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + + case DCB_CMD_SET_ALL: + ret = dcbnl_setall(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_PGTX_SCFG: + ret = dcbnl_pgtx_setcfg(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_PGRX_SCFG: + ret = dcbnl_pgrx_setcfg(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_GCAP: + ret = dcbnl_getcap(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_GNUMTCS: + ret = dcbnl_getnumtcs(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_SNUMTCS: + ret = dcbnl_setnumtcs(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_PFC_GSTATE: + ret = dcbnl_getpfcstate(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_PFC_SSTATE: + ret = dcbnl_setpfcstate(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_BCN_SCFG: + ret = dcbnl_bcn_setcfg(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_GAPP: + ret = dcbnl_getapp(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_SAPP: + ret = dcbnl_setapp(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_IEEE_SET: + ret = dcbnl_ieee_set(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_IEEE_GET: + ret = dcbnl_ieee_get(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_GDCBX: + ret = dcbnl_getdcbx(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_SDCBX: + ret = dcbnl_setdcbx(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_GFEATCFG: + ret = dcbnl_getfeatcfg(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_SFEATCFG: + ret = dcbnl_setfeatcfg(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + case DCB_CMD_CEE_GET: + ret = dcbnl_cee_get(netdev, tb, pid, nlh->nlmsg_seq, + nlh->nlmsg_flags); + goto out; + default: + goto errout; + } +errout: + ret = -EINVAL; +out: + dev_put(netdev); + return ret; +} + +/** + * dcb_getapp - retrieve the DCBX 
application user priority + * + * On success returns a non-zero 802.1p user priority bitmap + * otherwise returns 0 as the invalid user priority bitmap to + * indicate an error. + */ +u8 dcb_getapp(struct net_device *dev, struct dcb_app *app) +{ + struct dcb_app_type *itr; + u8 prio = 0; + + spin_lock(&dcb_lock); + list_for_each_entry(itr, &dcb_app_list, list) { + if (itr->app.selector == app->selector && + itr->app.protocol == app->protocol && + (strncmp(itr->name, dev->name, IFNAMSIZ) == 0)) { + prio = itr->app.priority; + break; + } + } + spin_unlock(&dcb_lock); + + return prio; +} +EXPORT_SYMBOL(dcb_getapp); + +/** + * ixgbe_dcbnl_setapp - add dcb application data to app list + * + * Priority 0 is the default priority this removes applications + * from the app list if the priority is set to zero. + */ +u8 dcb_setapp(struct net_device *dev, struct dcb_app *new) +{ + struct dcb_app_type *itr; + struct dcb_app_type event; + + memcpy(&event.name, dev->name, sizeof(event.name)); + memcpy(&event.app, new, sizeof(event.app)); + + spin_lock(&dcb_lock); + /* Search for existing match and replace */ + list_for_each_entry(itr, &dcb_app_list, list) { + if (itr->app.selector == new->selector && + itr->app.protocol == new->protocol && + (strncmp(itr->name, dev->name, IFNAMSIZ) == 0)) { + if (new->priority) + itr->app.priority = new->priority; + else { + list_del(&itr->list); + kfree(itr); + } + goto out; + } + } + /* App type does not exist add new application type */ + if (new->priority) { + struct dcb_app_type *entry; + entry = kmalloc(sizeof(struct dcb_app_type), GFP_ATOMIC); + if (!entry) { + spin_unlock(&dcb_lock); + return -ENOMEM; + } + + memcpy(&entry->app, new, sizeof(*new)); + strncpy(entry->name, dev->name, IFNAMSIZ); + list_add(&entry->list, &dcb_app_list); + } +out: + spin_unlock(&dcb_lock); + call_dcbevent_notifiers(DCB_APP_EVENT, &event); + return 0; +} +EXPORT_SYMBOL(dcb_setapp); + +static void dcb_flushapp(void) +{ + struct dcb_app_type *app; + struct dcb_app_type *tmp; + + spin_lock(&dcb_lock); + list_for_each_entry_safe(app, tmp, &dcb_app_list, list) { + list_del(&app->list); + kfree(app); + } + spin_unlock(&dcb_lock); +} + +static int __init dcbnl_init(void) +{ + INIT_LIST_HEAD(&dcb_app_list); + + rtnl_register(PF_UNSPEC, RTM_GETDCB, dcb_doit, NULL); + rtnl_register(PF_UNSPEC, RTM_SETDCB, dcb_doit, NULL); + + return 0; +} +module_init(dcbnl_init); + +static void __exit dcbnl_exit(void) +{ + rtnl_unregister(PF_UNSPEC, RTM_GETDCB); + rtnl_unregister(PF_UNSPEC, RTM_SETDCB); + dcb_flushapp(); +} +module_exit(dcbnl_exit); diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig new file mode 100644 index 00000000..b75968a0 --- /dev/null +++ b/net/dccp/Kconfig @@ -0,0 +1,62 @@ +menuconfig IP_DCCP + tristate "The DCCP Protocol (EXPERIMENTAL)" + depends on INET && EXPERIMENTAL + ---help--- + Datagram Congestion Control Protocol (RFC 4340) + + From http://www.ietf.org/rfc/rfc4340.txt: + + The Datagram Congestion Control Protocol (DCCP) is a transport + protocol that implements bidirectional, unicast connections of + congestion-controlled, unreliable datagrams. It should be suitable + for use by applications such as streaming media, Internet telephony, + and on-line games. + + To compile this protocol support as a module, choose M here: the + module will be called dccp. + + If in doubt, say N. 
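For a concrete sense of what the IP_DCCP option provides to applications, the sketch below shows a minimal, purely illustrative userspace client; it is not part of the patch. The fallback defines mirror the kernel ABI values in case older libc headers do not expose them, and the address, port and 32-bit service code are arbitrary examples.

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

#ifndef SOCK_DCCP
#define SOCK_DCCP		6	/* datagram-based, connection-oriented */
#endif
#ifndef IPPROTO_DCCP
#define IPPROTO_DCCP		33
#endif
#ifndef SOL_DCCP
#define SOL_DCCP		269
#endif
#ifndef DCCP_SOCKOPT_SERVICE
#define DCCP_SOCKOPT_SERVICE	2
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_DCCP, IPPROTO_DCCP);
	uint32_t service = htonl(42);		/* example service code */
	struct sockaddr_in sa = {
		.sin_family = AF_INET,
		.sin_port   = htons(5001),	/* example port */
	};

	if (fd < 0) {
		perror("socket (is CONFIG_IP_DCCP enabled?)");
		return 1;
	}
	inet_pton(AF_INET, "127.0.0.1", &sa.sin_addr);

	/* Each DCCP connection carries a service code (RFC 4340, 8.1.2). */
	if (setsockopt(fd, SOL_DCCP, DCCP_SOCKOPT_SERVICE,
		       &service, sizeof(service)) < 0)
		perror("setsockopt(DCCP_SOCKOPT_SERVICE)");

	if (connect(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
		perror("connect");

	close(fd);
	return 0;
}

A corresponding server would bind(), listen() and accept() on the same socket type, since DCCP is datagram-oriented but connection-based.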
+ +if IP_DCCP + +config INET_DCCP_DIAG + depends on INET_DIAG + def_tristate y if (IP_DCCP = y && INET_DIAG = y) + def_tristate m + +source "net/dccp/ccids/Kconfig" + +menu "DCCP Kernel Hacking" + depends on DEBUG_KERNEL=y + +config IP_DCCP_DEBUG + bool "DCCP debug messages" + ---help--- + Only use this if you're hacking DCCP. + + When compiling DCCP as a module, this debugging output can be toggled + by setting the parameter dccp_debug of the `dccp' module to 0 or 1. + + Just say N. + +config NET_DCCPPROBE + tristate "DCCP connection probing" + depends on PROC_FS && KPROBES + ---help--- + This module allows for capturing the changes to DCCP connection + state in response to incoming packets. It is used for debugging + DCCP congestion avoidance modules. If you don't understand + what was just said, you don't need it: say N. + + Documentation on how to use DCCP connection probing can be found + at: + + http://www.linuxfoundation.org/collaborate/workgroups/networking/dccpprobe + + To compile this code as a module, choose M here: the + module will be called dccp_probe. + + +endmenu + +endif # IP_DDCP diff --git a/net/dccp/Makefile b/net/dccp/Makefile new file mode 100644 index 00000000..5c8362b0 --- /dev/null +++ b/net/dccp/Makefile @@ -0,0 +1,28 @@ +obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o + +dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o \ + qpolicy.o +# +# CCID algorithms to be used by dccp.ko +# +# CCID-2 is default (RFC 4340, p. 77) and has Ack Vectors as dependency +dccp-y += ccids/ccid2.o ackvec.o +dccp-$(CONFIG_IP_DCCP_CCID3) += ccids/ccid3.o +dccp-$(CONFIG_IP_DCCP_TFRC_LIB) += ccids/lib/tfrc.o \ + ccids/lib/tfrc_equation.o \ + ccids/lib/packet_history.o \ + ccids/lib/loss_interval.o + +dccp_ipv4-y := ipv4.o + +# build dccp_ipv6 as module whenever either IPv6 or DCCP is a module +obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o +dccp_ipv6-y := ipv6.o + +obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o +obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o + +dccp-$(CONFIG_SYSCTL) += sysctl.o + +dccp_diag-y := diag.o +dccp_probe-y := probe.o diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c new file mode 100644 index 00000000..25b7a8d1 --- /dev/null +++ b/net/dccp/ackvec.c @@ -0,0 +1,408 @@ +/* + * net/dccp/ackvec.c + * + * An implementation of Ack Vectors for the DCCP protocol + * Copyright (c) 2007 University of Aberdeen, Scotland, UK + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; version 2 of the License; + */ +#include "dccp.h" +#include +#include + +static struct kmem_cache *dccp_ackvec_slab; +static struct kmem_cache *dccp_ackvec_record_slab; + +struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) +{ + struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority); + + if (av != NULL) { + av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1; + INIT_LIST_HEAD(&av->av_records); + } + return av; +} + +static void dccp_ackvec_purge_records(struct dccp_ackvec *av) +{ + struct dccp_ackvec_record *cur, *next; + + list_for_each_entry_safe(cur, next, &av->av_records, avr_node) + kmem_cache_free(dccp_ackvec_record_slab, cur); + INIT_LIST_HEAD(&av->av_records); +} + +void dccp_ackvec_free(struct dccp_ackvec *av) +{ + if (likely(av != NULL)) { + dccp_ackvec_purge_records(av); + kmem_cache_free(dccp_ackvec_slab, av); + } +} + +/** + * 
dccp_ackvec_update_records - Record information about sent Ack Vectors + * @av: Ack Vector records to update + * @seqno: Sequence number of the packet carrying the Ack Vector just sent + * @nonce_sum: The sum of all buffer nonces contained in the Ack Vector + */ +int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum) +{ + struct dccp_ackvec_record *avr; + + avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); + if (avr == NULL) + return -ENOBUFS; + + avr->avr_ack_seqno = seqno; + avr->avr_ack_ptr = av->av_buf_head; + avr->avr_ack_ackno = av->av_buf_ackno; + avr->avr_ack_nonce = nonce_sum; + avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head); + /* + * When the buffer overflows, we keep no more than one record. This is + * the simplest way of disambiguating sender-Acks dating from before the + * overflow from sender-Acks which refer to after the overflow; a simple + * solution is preferable here since we are handling an exception. + */ + if (av->av_overflow) + dccp_ackvec_purge_records(av); + /* + * Since GSS is incremented for each packet, the list is automatically + * arranged in descending order of @ack_seqno. + */ + list_add(&avr->avr_node, &av->av_records); + + dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n", + (unsigned long long)avr->avr_ack_seqno, + (unsigned long long)avr->avr_ack_ackno, + avr->avr_ack_runlen); + return 0; +} + +static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list, + const u64 ackno) +{ + struct dccp_ackvec_record *avr; + /* + * Exploit that records are inserted in descending order of sequence + * number, start with the oldest record first. If @ackno is `before' + * the earliest ack_ackno, the packet is too old to be considered. + */ + list_for_each_entry_reverse(avr, av_list, avr_node) { + if (avr->avr_ack_seqno == ackno) + return avr; + if (before48(ackno, avr->avr_ack_seqno)) + break; + } + return NULL; +} + +/* + * Buffer index and length computation using modulo-buffersize arithmetic. + * Note that, as pointers move from right to left, head is `before' tail. + */ +static inline u16 __ackvec_idx_add(const u16 a, const u16 b) +{ + return (a + b) % DCCPAV_MAX_ACKVEC_LEN; +} + +static inline u16 __ackvec_idx_sub(const u16 a, const u16 b) +{ + return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b); +} + +u16 dccp_ackvec_buflen(const struct dccp_ackvec *av) +{ + if (unlikely(av->av_overflow)) + return DCCPAV_MAX_ACKVEC_LEN; + return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head); +} + +/** + * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1 + * @av: non-empty buffer to update + * @distance: negative or zero distance of @seqno from buf_ackno downward + * @seqno: the (old) sequence number whose record is to be updated + * @state: state in which packet carrying @seqno was received + */ +static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance, + u64 seqno, enum dccp_ackvec_states state) +{ + u16 ptr = av->av_buf_head; + + BUG_ON(distance > 0); + if (unlikely(dccp_ackvec_is_empty(av))) + return; + + do { + u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr); + + if (distance + runlen >= 0) { + /* + * Only update the state if packet has not been received + * yet. This is OK as per the second table in RFC 4340, + * 11.4.1; i.e. 
here we are using the following table: + * RECEIVED + * 0 1 3 + * S +---+---+---+ + * T 0 | 0 | 0 | 0 | + * O +---+---+---+ + * R 1 | 1 | 1 | 1 | + * E +---+---+---+ + * D 3 | 0 | 1 | 3 | + * +---+---+---+ + * The "Not Received" state was set by reserve_seats(). + */ + if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED) + av->av_buf[ptr] = state; + else + dccp_pr_debug("Not changing %llu state to %u\n", + (unsigned long long)seqno, state); + break; + } + + distance += runlen + 1; + ptr = __ackvec_idx_add(ptr, 1); + + } while (ptr != av->av_buf_tail); +} + +/* Mark @num entries after buf_head as "Not yet received". */ +static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num) +{ + u16 start = __ackvec_idx_add(av->av_buf_head, 1), + len = DCCPAV_MAX_ACKVEC_LEN - start; + + /* check for buffer wrap-around */ + if (num > len) { + memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len); + start = 0; + num -= len; + } + if (num) + memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num); +} + +/** + * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer + * @av: container of buffer to update (can be empty or non-empty) + * @num_packets: number of packets to register (must be >= 1) + * @seqno: sequence number of the first packet in @num_packets + * @state: state in which packet carrying @seqno was received + */ +static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets, + u64 seqno, enum dccp_ackvec_states state) +{ + u32 num_cells = num_packets; + + if (num_packets > DCCPAV_BURST_THRESH) { + u32 lost_packets = num_packets - 1; + + DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets); + /* + * We received 1 packet and have a loss of size "num_packets-1" + * which we squeeze into num_cells-1 rather than reserving an + * entire byte for each lost packet. + * The reason is that the vector grows in O(burst_length); when + * it grows too large there will no room left for the payload. + * This is a trade-off: if a few packets out of the burst show + * up later, their state will not be changed; it is simply too + * costly to reshuffle/reallocate/copy the buffer each time. + * Should such problems persist, we will need to switch to a + * different underlying data structure. 
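+ * Worked example: for num_packets = 130, the loop below records the
+ * lost_packets = 129 as three cells with run lengths 63, 63 and 3
+ * (capped at DCCPAV_MAX_RUNLEN), so together with the cell written for
+ * the received packet the whole burst occupies four cells instead of 130.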
+ */ + for (num_packets = num_cells = 1; lost_packets; ++num_cells) { + u8 len = min(lost_packets, (u32)DCCPAV_MAX_RUNLEN); + + av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1); + av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len; + + lost_packets -= len; + } + } + + if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) { + DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n"); + av->av_overflow = true; + } + + av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets); + if (av->av_overflow) + av->av_buf_tail = av->av_buf_head; + + av->av_buf[av->av_buf_head] = state; + av->av_buf_ackno = seqno; + + if (num_packets > 1) + dccp_ackvec_reserve_seats(av, num_packets - 1); +} + +/** + * dccp_ackvec_input - Register incoming packet in the buffer + */ +void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb) +{ + u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq; + enum dccp_ackvec_states state = DCCPAV_RECEIVED; + + if (dccp_ackvec_is_empty(av)) { + dccp_ackvec_add_new(av, 1, seqno, state); + av->av_tail_ackno = seqno; + + } else { + s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno); + u8 *current_head = av->av_buf + av->av_buf_head; + + if (num_packets == 1 && + dccp_ackvec_state(current_head) == state && + dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) { + + *current_head += 1; + av->av_buf_ackno = seqno; + + } else if (num_packets > 0) { + dccp_ackvec_add_new(av, num_packets, seqno, state); + } else { + dccp_ackvec_update_old(av, num_packets, seqno, state); + } + } +} + +/** + * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection + * This routine is called when the peer acknowledges the receipt of Ack Vectors + * up to and including @ackno. While based on on section A.3 of RFC 4340, here + * are additional precautions to prevent corrupted buffer state. In particular, + * we use tail_ackno to identify outdated records; it always marks the earliest + * packet of group (2) in 11.4.2. + */ +void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno) +{ + struct dccp_ackvec_record *avr, *next; + u8 runlen_now, eff_runlen; + s64 delta; + + avr = dccp_ackvec_lookup(&av->av_records, ackno); + if (avr == NULL) + return; + /* + * Deal with outdated acknowledgments: this arises when e.g. there are + * several old records and the acks from the peer come in slowly. In + * that case we may still have records that pre-date tail_ackno. + */ + delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno); + if (delta < 0) + goto free_records; + /* + * Deal with overlapping Ack Vectors: don't subtract more than the + * number of packets between tail_ackno and ack_ackno. + */ + eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen; + + runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr); + /* + * The run length of Ack Vector cells does not decrease over time. If + * the run length is the same as at the time the Ack Vector was sent, we + * free the ack_ptr cell. That cell can however not be freed if the run + * length has increased: in this case we need to move the tail pointer + * backwards (towards higher indices), to its next-oldest neighbour. + */ + if (runlen_now > eff_runlen) { + + av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1; + av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1); + + /* This move may not have cleared the overflow flag. 
*/ + if (av->av_overflow) + av->av_overflow = (av->av_buf_head == av->av_buf_tail); + } else { + av->av_buf_tail = avr->avr_ack_ptr; + /* + * We have made sure that avr points to a valid cell within the + * buffer. This cell is either older than head, or equals head + * (empty buffer): in both cases we no longer have any overflow. + */ + av->av_overflow = 0; + } + + /* + * The peer has acknowledged up to and including ack_ackno. Hence the + * first packet in group (2) of 11.4.2 is the successor of ack_ackno. + */ + av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1); + +free_records: + list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { + list_del(&avr->avr_node); + kmem_cache_free(dccp_ackvec_record_slab, avr); + } +} + +/* + * Routines to keep track of Ack Vectors received in an skb + */ +int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce) +{ + struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC); + + if (new == NULL) + return -ENOBUFS; + new->vec = vec; + new->len = len; + new->nonce = nonce; + + list_add_tail(&new->node, head); + return 0; +} +EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add); + +void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks) +{ + struct dccp_ackvec_parsed *cur, *next; + + list_for_each_entry_safe(cur, next, parsed_chunks, node) + kfree(cur); + INIT_LIST_HEAD(parsed_chunks); +} +EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup); + +int __init dccp_ackvec_init(void) +{ + dccp_ackvec_slab = kmem_cache_create("dccp_ackvec", + sizeof(struct dccp_ackvec), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (dccp_ackvec_slab == NULL) + goto out_err; + + dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record", + sizeof(struct dccp_ackvec_record), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (dccp_ackvec_record_slab == NULL) + goto out_destroy_slab; + + return 0; + +out_destroy_slab: + kmem_cache_destroy(dccp_ackvec_slab); + dccp_ackvec_slab = NULL; +out_err: + DCCP_CRIT("Unable to create Ack Vector slab cache"); + return -ENOBUFS; +} + +void dccp_ackvec_exit(void) +{ + if (dccp_ackvec_slab != NULL) { + kmem_cache_destroy(dccp_ackvec_slab); + dccp_ackvec_slab = NULL; + } + if (dccp_ackvec_record_slab != NULL) { + kmem_cache_destroy(dccp_ackvec_record_slab); + dccp_ackvec_record_slab = NULL; + } +} diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h new file mode 100644 index 00000000..e2ab0627 --- /dev/null +++ b/net/dccp/ackvec.h @@ -0,0 +1,136 @@ +#ifndef _ACKVEC_H +#define _ACKVEC_H +/* + * net/dccp/ackvec.h + * + * An implementation of Ack Vectors for the DCCP protocol + * Copyright (c) 2007 University of Aberdeen, Scotland, UK + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +/* + * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN, + * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1 + * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives + * more headroom if Ack Ratio is higher or when the sender acknowledges slowly. + * The maximum value is bounded by the u16 types for indices and functions. 
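+ * Each cell in the buffer packs a 2-bit state into its top bits and a 6-bit
+ * run length into the rest (see dccp_ackvec_states below), so e.g. the cell
+ * value 0xC5 stands for six consecutive packets in state "not received".
+ * Assuming the usual DCCP_SINGLE_OPT_MAXLEN of 253 bytes, the defaults below
+ * give DCCPAV_MAX_ACKVEC_LEN = 506 cells.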
+ */ +#define DCCPAV_NUM_ACKVECS 2 +#define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS) + +/* Estimated minimum average Ack Vector length - used for updating MPS */ +#define DCCPAV_MIN_OPTLEN 16 + +/* Threshold for coping with large bursts of losses */ +#define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8) + +enum dccp_ackvec_states { + DCCPAV_RECEIVED = 0x00, + DCCPAV_ECN_MARKED = 0x40, + DCCPAV_RESERVED = 0x80, + DCCPAV_NOT_RECEIVED = 0xC0 +}; +#define DCCPAV_MAX_RUNLEN 0x3F + +static inline u8 dccp_ackvec_runlen(const u8 *cell) +{ + return *cell & DCCPAV_MAX_RUNLEN; +} + +static inline u8 dccp_ackvec_state(const u8 *cell) +{ + return *cell & ~DCCPAV_MAX_RUNLEN; +} + +/** struct dccp_ackvec - Ack Vector main data structure + * + * This implements a fixed-size circular buffer within an array and is largely + * based on Appendix A of RFC 4340. + * + * @av_buf: circular buffer storage area + * @av_buf_head: head index; begin of live portion in @av_buf + * @av_buf_tail: tail index; first index _after_ the live portion in @av_buf + * @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf + * @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf + * @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to + * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf + * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound + * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously) + */ +struct dccp_ackvec { + u8 av_buf[DCCPAV_MAX_ACKVEC_LEN]; + u16 av_buf_head; + u16 av_buf_tail; + u64 av_buf_ackno:48; + u64 av_tail_ackno:48; + bool av_buf_nonce[DCCPAV_NUM_ACKVECS]; + u8 av_overflow:1; + struct list_head av_records; +}; + +/** struct dccp_ackvec_record - Records information about sent Ack Vectors + * + * These list entries define the additional information which the HC-Receiver + * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A. + * + * @avr_node: the list node in @av_records + * @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on + * @avr_ack_ackno: the Ack number that this record/Ack Vector refers to + * @avr_ack_ptr: pointer into @av_buf where this record starts + * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending + * @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent + * + * The list as a whole is sorted in descending order by @avr_ack_seqno. 
+ */ +struct dccp_ackvec_record { + struct list_head avr_node; + u64 avr_ack_seqno:48; + u64 avr_ack_ackno:48; + u16 avr_ack_ptr; + u8 avr_ack_runlen; + u8 avr_ack_nonce:1; +}; + +extern int dccp_ackvec_init(void); +extern void dccp_ackvec_exit(void); + +extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority); +extern void dccp_ackvec_free(struct dccp_ackvec *av); + +extern void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb); +extern int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum); +extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno); +extern u16 dccp_ackvec_buflen(const struct dccp_ackvec *av); + +static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av) +{ + return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail; +} + +/** + * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb + * @vec: start of vector (offset into skb) + * @len: length of @vec + * @nonce: whether @vec had an ECN nonce of 0 or 1 + * @node: FIFO - arranged in descending order of ack_ackno + * This structure is used by CCIDs to access Ack Vectors in a received skb. + */ +struct dccp_ackvec_parsed { + u8 *vec, + len, + nonce:1; + struct list_head node; +}; + +extern int dccp_ackvec_parsed_add(struct list_head *head, + u8 *vec, u8 len, u8 nonce); +extern void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks); +#endif /* _ACKVEC_H */ diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c new file mode 100644 index 00000000..36479ca6 --- /dev/null +++ b/net/dccp/ccid.c @@ -0,0 +1,222 @@ +/* + * net/dccp/ccid.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * CCID infrastructure + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include + +#include "ccid.h" +#include "ccids/lib/tfrc.h" + +static struct ccid_operations *ccids[] = { + &ccid2_ops, +#ifdef CONFIG_IP_DCCP_CCID3 + &ccid3_ops, +#endif +}; + +static struct ccid_operations *ccid_by_number(const u8 id) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ccids); i++) + if (ccids[i]->ccid_id == id) + return ccids[i]; + return NULL; +} + +/* check that up to @array_len members in @ccid_array are supported */ +bool ccid_support_check(u8 const *ccid_array, u8 array_len) +{ + while (array_len > 0) + if (ccid_by_number(ccid_array[--array_len]) == NULL) + return false; + return true; +} + +/** + * ccid_get_builtin_ccids - Populate a list of built-in CCIDs + * @ccid_array: pointer to copy into + * @array_len: value to return length into + * This function allocates memory - caller must see that it is freed after use. + */ +int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len) +{ + *ccid_array = kmalloc(ARRAY_SIZE(ccids), gfp_any()); + if (*ccid_array == NULL) + return -ENOBUFS; + + for (*array_len = 0; *array_len < ARRAY_SIZE(ccids); *array_len += 1) + (*ccid_array)[*array_len] = ccids[*array_len]->ccid_id; + return 0; +} + +int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + u8 *ccid_array, array_len; + int err = 0; + + if (ccid_get_builtin_ccids(&ccid_array, &array_len)) + return -ENOBUFS; + + if (put_user(array_len, optlen)) + err = -EFAULT; + else if (len > 0 && copy_to_user(optval, ccid_array, + len > array_len ? 
array_len : len)) + err = -EFAULT; + + kfree(ccid_array); + return err; +} + +static struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_fmt, const char *fmt,...) +{ + struct kmem_cache *slab; + va_list args; + + va_start(args, fmt); + vsnprintf(slab_name_fmt, CCID_SLAB_NAME_LENGTH, fmt, args); + va_end(args); + + slab = kmem_cache_create(slab_name_fmt, sizeof(struct ccid) + obj_size, 0, + SLAB_HWCACHE_ALIGN, NULL); + return slab; +} + +static void ccid_kmem_cache_destroy(struct kmem_cache *slab) +{ + if (slab != NULL) + kmem_cache_destroy(slab); +} + +static int ccid_activate(struct ccid_operations *ccid_ops) +{ + int err = -ENOBUFS; + + ccid_ops->ccid_hc_rx_slab = + ccid_kmem_cache_create(ccid_ops->ccid_hc_rx_obj_size, + ccid_ops->ccid_hc_rx_slab_name, + "ccid%u_hc_rx_sock", + ccid_ops->ccid_id); + if (ccid_ops->ccid_hc_rx_slab == NULL) + goto out; + + ccid_ops->ccid_hc_tx_slab = + ccid_kmem_cache_create(ccid_ops->ccid_hc_tx_obj_size, + ccid_ops->ccid_hc_tx_slab_name, + "ccid%u_hc_tx_sock", + ccid_ops->ccid_id); + if (ccid_ops->ccid_hc_tx_slab == NULL) + goto out_free_rx_slab; + + pr_info("CCID: Activated CCID %d (%s)\n", + ccid_ops->ccid_id, ccid_ops->ccid_name); + err = 0; +out: + return err; +out_free_rx_slab: + ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab); + ccid_ops->ccid_hc_rx_slab = NULL; + goto out; +} + +static void ccid_deactivate(struct ccid_operations *ccid_ops) +{ + ccid_kmem_cache_destroy(ccid_ops->ccid_hc_tx_slab); + ccid_ops->ccid_hc_tx_slab = NULL; + ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab); + ccid_ops->ccid_hc_rx_slab = NULL; + + pr_info("CCID: Deactivated CCID %d (%s)\n", + ccid_ops->ccid_id, ccid_ops->ccid_name); +} + +struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx) +{ + struct ccid_operations *ccid_ops = ccid_by_number(id); + struct ccid *ccid = NULL; + + if (ccid_ops == NULL) + goto out; + + ccid = kmem_cache_alloc(rx ? ccid_ops->ccid_hc_rx_slab : + ccid_ops->ccid_hc_tx_slab, gfp_any()); + if (ccid == NULL) + goto out; + ccid->ccid_ops = ccid_ops; + if (rx) { + memset(ccid + 1, 0, ccid_ops->ccid_hc_rx_obj_size); + if (ccid->ccid_ops->ccid_hc_rx_init != NULL && + ccid->ccid_ops->ccid_hc_rx_init(ccid, sk) != 0) + goto out_free_ccid; + } else { + memset(ccid + 1, 0, ccid_ops->ccid_hc_tx_obj_size); + if (ccid->ccid_ops->ccid_hc_tx_init != NULL && + ccid->ccid_ops->ccid_hc_tx_init(ccid, sk) != 0) + goto out_free_ccid; + } +out: + return ccid; +out_free_ccid: + kmem_cache_free(rx ? 
ccid_ops->ccid_hc_rx_slab : + ccid_ops->ccid_hc_tx_slab, ccid); + ccid = NULL; + goto out; +} + +void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk) +{ + if (ccid != NULL) { + if (ccid->ccid_ops->ccid_hc_rx_exit != NULL) + ccid->ccid_ops->ccid_hc_rx_exit(sk); + kmem_cache_free(ccid->ccid_ops->ccid_hc_rx_slab, ccid); + } +} + +void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk) +{ + if (ccid != NULL) { + if (ccid->ccid_ops->ccid_hc_tx_exit != NULL) + ccid->ccid_ops->ccid_hc_tx_exit(sk); + kmem_cache_free(ccid->ccid_ops->ccid_hc_tx_slab, ccid); + } +} + +int __init ccid_initialize_builtins(void) +{ + int i, err = tfrc_lib_init(); + + if (err) + return err; + + for (i = 0; i < ARRAY_SIZE(ccids); i++) { + err = ccid_activate(ccids[i]); + if (err) + goto unwind_registrations; + } + return 0; + +unwind_registrations: + while(--i >= 0) + ccid_deactivate(ccids[i]); + tfrc_lib_exit(); + return err; +} + +void ccid_cleanup_builtins(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ccids); i++) + ccid_deactivate(ccids[i]); + tfrc_lib_exit(); +} diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h new file mode 100644 index 00000000..75c3582a --- /dev/null +++ b/net/dccp/ccid.h @@ -0,0 +1,265 @@ +#ifndef _CCID_H +#define _CCID_H +/* + * net/dccp/ccid.h + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * CCID infrastructure + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include + +/* maximum value for a CCID (RFC 4340, 19.5) */ +#define CCID_MAX 255 +#define CCID_SLAB_NAME_LENGTH 32 + +struct tcp_info; + +/** + * struct ccid_operations - Interface to Congestion-Control Infrastructure + * + * @ccid_id: numerical CCID ID (up to %CCID_MAX, cf. table 5 in RFC 4340, 10.) 
+ * @ccid_ccmps: the CCMPS including network/transport headers (0 when disabled) + * @ccid_name: alphabetical identifier string for @ccid_id + * @ccid_hc_{r,t}x_slab: memory pool for the receiver/sender half-connection + * @ccid_hc_{r,t}x_obj_size: size of the receiver/sender half-connection socket + * + * @ccid_hc_{r,t}x_init: CCID-specific initialisation routine (before startup) + * @ccid_hc_{r,t}x_exit: CCID-specific cleanup routine (before destruction) + * @ccid_hc_rx_packet_recv: implements the HC-receiver side + * @ccid_hc_{r,t}x_parse_options: parsing routine for CCID/HC-specific options + * @ccid_hc_{r,t}x_insert_options: insert routine for CCID/HC-specific options + * @ccid_hc_tx_packet_recv: implements feedback processing for the HC-sender + * @ccid_hc_tx_send_packet: implements the sending part of the HC-sender + * @ccid_hc_tx_packet_sent: does accounting for packets in flight by HC-sender + * @ccid_hc_{r,t}x_get_info: INET_DIAG information for HC-receiver/sender + * @ccid_hc_{r,t}x_getsockopt: socket options specific to HC-receiver/sender + */ +struct ccid_operations { + unsigned char ccid_id; + __u32 ccid_ccmps; + const char *ccid_name; + struct kmem_cache *ccid_hc_rx_slab, + *ccid_hc_tx_slab; + char ccid_hc_rx_slab_name[CCID_SLAB_NAME_LENGTH]; + char ccid_hc_tx_slab_name[CCID_SLAB_NAME_LENGTH]; + __u32 ccid_hc_rx_obj_size, + ccid_hc_tx_obj_size; + /* Interface Routines */ + int (*ccid_hc_rx_init)(struct ccid *ccid, struct sock *sk); + int (*ccid_hc_tx_init)(struct ccid *ccid, struct sock *sk); + void (*ccid_hc_rx_exit)(struct sock *sk); + void (*ccid_hc_tx_exit)(struct sock *sk); + void (*ccid_hc_rx_packet_recv)(struct sock *sk, + struct sk_buff *skb); + int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt, + u8 opt, u8 *val, u8 len); + int (*ccid_hc_rx_insert_options)(struct sock *sk, + struct sk_buff *skb); + void (*ccid_hc_tx_packet_recv)(struct sock *sk, + struct sk_buff *skb); + int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt, + u8 opt, u8 *val, u8 len); + int (*ccid_hc_tx_send_packet)(struct sock *sk, + struct sk_buff *skb); + void (*ccid_hc_tx_packet_sent)(struct sock *sk, + unsigned int len); + void (*ccid_hc_rx_get_info)(struct sock *sk, + struct tcp_info *info); + void (*ccid_hc_tx_get_info)(struct sock *sk, + struct tcp_info *info); + int (*ccid_hc_rx_getsockopt)(struct sock *sk, + const int optname, int len, + u32 __user *optval, + int __user *optlen); + int (*ccid_hc_tx_getsockopt)(struct sock *sk, + const int optname, int len, + u32 __user *optval, + int __user *optlen); +}; + +extern struct ccid_operations ccid2_ops; +#ifdef CONFIG_IP_DCCP_CCID3 +extern struct ccid_operations ccid3_ops; +#endif + +extern int ccid_initialize_builtins(void); +extern void ccid_cleanup_builtins(void); + +struct ccid { + struct ccid_operations *ccid_ops; + char ccid_priv[0]; +}; + +static inline void *ccid_priv(const struct ccid *ccid) +{ + return (void *)ccid->ccid_priv; +} + +extern bool ccid_support_check(u8 const *ccid_array, u8 array_len); +extern int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len); +extern int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, + char __user *, int __user *); + +extern struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx); + +static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp) +{ + struct ccid *ccid = dp->dccps_hc_rx_ccid; + + if (ccid == NULL || ccid->ccid_ops == NULL) + return -1; + return ccid->ccid_ops->ccid_id; +} + +static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp) +{ + struct ccid 
*ccid = dp->dccps_hc_tx_ccid; + + if (ccid == NULL || ccid->ccid_ops == NULL) + return -1; + return ccid->ccid_ops->ccid_id; +} + +extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); +extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); + +/* + * Congestion control of queued data packets via CCID decision. + * + * The TX CCID performs its congestion-control by indicating whether and when a + * queued packet may be sent, using the return code of ccid_hc_tx_send_packet(). + * The following modes are supported via the symbolic constants below: + * - timer-based pacing (CCID returns a delay value in milliseconds); + * - autonomous dequeueing (CCID internally schedules dccps_xmitlet). + */ + +enum ccid_dequeueing_decision { + CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */ + CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */ + CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */ + CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */ + CCID_PACKET_ERR = 0xF0000, /* error condition */ +}; + +static inline int ccid_packet_dequeue_eval(const int return_code) +{ + if (return_code < 0) + return CCID_PACKET_ERR; + if (return_code == 0) + return CCID_PACKET_SEND_AT_ONCE; + if (return_code <= CCID_PACKET_DELAY_MAX) + return CCID_PACKET_DELAY; + return return_code; +} + +static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, + struct sk_buff *skb) +{ + if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) + return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); + return CCID_PACKET_SEND_AT_ONCE; +} + +static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, + unsigned int len) +{ + if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL) + ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len); +} + +static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, + struct sk_buff *skb) +{ + if (ccid->ccid_ops->ccid_hc_rx_packet_recv != NULL) + ccid->ccid_ops->ccid_hc_rx_packet_recv(sk, skb); +} + +static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk, + struct sk_buff *skb) +{ + if (ccid->ccid_ops->ccid_hc_tx_packet_recv != NULL) + ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb); +} + +/** + * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver + * @pkt: type of packet that @opt appears on (RFC 4340, 5.1) + * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3) + * @val: value of @opt + * @len: length of @val in bytes + */ +static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, + u8 pkt, u8 opt, u8 *val, u8 len) +{ + if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL) + return 0; + return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len); +} + +/** + * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender + * Arguments are analogous to ccid_hc_tx_parse_options() + */ +static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, + u8 pkt, u8 opt, u8 *val, u8 len) +{ + if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL) + return 0; + return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len); +} + +static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, + struct sk_buff *skb) +{ + if (ccid->ccid_ops->ccid_hc_rx_insert_options != NULL) + return ccid->ccid_ops->ccid_hc_rx_insert_options(sk, skb); + return 0; +} + +static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk, + struct tcp_info *info) +{ + if 
(ccid->ccid_ops->ccid_hc_rx_get_info != NULL) + ccid->ccid_ops->ccid_hc_rx_get_info(sk, info); +} + +static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk, + struct tcp_info *info) +{ + if (ccid->ccid_ops->ccid_hc_tx_get_info != NULL) + ccid->ccid_ops->ccid_hc_tx_get_info(sk, info); +} + +static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk, + const int optname, int len, + u32 __user *optval, int __user *optlen) +{ + int rc = -ENOPROTOOPT; + if (ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL) + rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len, + optval, optlen); + return rc; +} + +static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk, + const int optname, int len, + u32 __user *optval, int __user *optlen) +{ + int rc = -ENOPROTOOPT; + if (ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL) + rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len, + optval, optlen); + return rc; +} +#endif /* _CCID_H */ diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig new file mode 100644 index 00000000..0581143c --- /dev/null +++ b/net/dccp/ccids/Kconfig @@ -0,0 +1,55 @@ +menu "DCCP CCIDs Configuration (EXPERIMENTAL)" + depends on EXPERIMENTAL + +config IP_DCCP_CCID2_DEBUG + bool "CCID-2 debugging messages" + ---help--- + Enable CCID-2 specific debugging messages. + + The debugging output can additionally be toggled by setting the + ccid2_debug parameter to 0 or 1. + + If in doubt, say N. + +config IP_DCCP_CCID3 + bool "CCID-3 (TCP-Friendly) (EXPERIMENTAL)" + def_bool y if (IP_DCCP = y || IP_DCCP = m) + ---help--- + CCID-3 denotes TCP-Friendly Rate Control (TFRC), an equation-based + rate-controlled congestion control mechanism. TFRC is designed to + be reasonably fair when competing for bandwidth with TCP-like flows, + where a flow is "reasonably fair" if its sending rate is generally + within a factor of two of the sending rate of a TCP flow under the + same conditions. However, TFRC has a much lower variation of + throughput over time compared with TCP, which makes CCID-3 more + suitable than CCID-2 for applications such streaming media where a + relatively smooth sending rate is of importance. + + CCID-3 is further described in RFC 4342, + http://www.ietf.org/rfc/rfc4342.txt + + The TFRC congestion control algorithms were initially described in + RFC 5348. + + This text was extracted from RFC 4340 (sec. 10.2), + http://www.ietf.org/rfc/rfc4340.txt + + If in doubt, say N. + +config IP_DCCP_CCID3_DEBUG + bool "CCID-3 debugging messages" + depends on IP_DCCP_CCID3 + ---help--- + Enable CCID-3 specific debugging messages. + + The debugging output can additionally be toggled by setting the + ccid3_debug parameter to 0 or 1. + + If in doubt, say N. + +config IP_DCCP_TFRC_LIB + def_bool y if IP_DCCP_CCID3 + +config IP_DCCP_TFRC_DEBUG + def_bool y if IP_DCCP_CCID3_DEBUG +endmenu diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c new file mode 100644 index 00000000..fadecd20 --- /dev/null +++ b/net/dccp/ccids/ccid2.c @@ -0,0 +1,671 @@ +/* + * Copyright (c) 2005, 2006 Andrea Bittau + * + * Changes to meet Linux coding standards, and DCCP infrastructure fixes. + * + * Copyright (c) 2006 Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * This implementation should follow RFC 4341 + */ +#include +#include "../feat.h" +#include "ccid2.h" + + +#ifdef CONFIG_IP_DCCP_CCID2_DEBUG +static int ccid2_debug; +#define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a) +#else +#define ccid2_pr_debug(format, a...) +#endif + +static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc) +{ + struct ccid2_seq *seqp; + int i; + + /* check if we have space to preserve the pointer to the buffer */ + if (hc->tx_seqbufc >= (sizeof(hc->tx_seqbuf) / + sizeof(struct ccid2_seq *))) + return -ENOMEM; + + /* allocate buffer and initialize linked list */ + seqp = kmalloc(CCID2_SEQBUF_LEN * sizeof(struct ccid2_seq), gfp_any()); + if (seqp == NULL) + return -ENOMEM; + + for (i = 0; i < (CCID2_SEQBUF_LEN - 1); i++) { + seqp[i].ccid2s_next = &seqp[i + 1]; + seqp[i + 1].ccid2s_prev = &seqp[i]; + } + seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = seqp; + seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; + + /* This is the first allocation. Initiate the head and tail. */ + if (hc->tx_seqbufc == 0) + hc->tx_seqh = hc->tx_seqt = seqp; + else { + /* link the existing list with the one we just created */ + hc->tx_seqh->ccid2s_next = seqp; + seqp->ccid2s_prev = hc->tx_seqh; + + hc->tx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; + seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hc->tx_seqt; + } + + /* store the original pointer to the buffer so we can free it */ + hc->tx_seqbuf[hc->tx_seqbufc] = seqp; + hc->tx_seqbufc++; + + return 0; +} + +static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) +{ + if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk))) + return CCID_PACKET_WILL_DEQUEUE_LATER; + return CCID_PACKET_SEND_AT_ONCE; +} + +static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) +{ + struct dccp_sock *dp = dccp_sk(sk); + u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->tx_cwnd, 2); + + /* + * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from + * RFC 4341, 6.1.2. We ignore the statement that Ack Ratio 2 is always + * acceptable since this causes starvation/deadlock whenever cwnd < 2. + * The same problem arises when Ack Ratio is 0 (ie. Ack Ratio disabled). 
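+ * For example, with tx_cwnd = 7 the code below allows at most
+ * max_ratio = DIV_ROUND_UP(7, 2) = 4, so a requested Ack Ratio of 6
+ * would be clamped to 4 (and further capped at DCCPF_ACK_RATIO_MAX).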
+ */ + if (val == 0 || val > max_ratio) { + DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio); + val = max_ratio; + } + if (val > DCCPF_ACK_RATIO_MAX) + val = DCCPF_ACK_RATIO_MAX; + + if (val == dp->dccps_l_ack_ratio) + return; + + ccid2_pr_debug("changing local ack ratio to %u\n", val); + dp->dccps_l_ack_ratio = val; +} + +static void ccid2_hc_tx_rto_expire(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + const bool sender_was_blocked = ccid2_cwnd_network_limited(hc); + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + HZ / 5); + goto out; + } + + ccid2_pr_debug("RTO_EXPIRE\n"); + + /* back-off timer */ + hc->tx_rto <<= 1; + if (hc->tx_rto > DCCP_RTO_MAX) + hc->tx_rto = DCCP_RTO_MAX; + + /* adjust pipe, cwnd etc */ + hc->tx_ssthresh = hc->tx_cwnd / 2; + if (hc->tx_ssthresh < 2) + hc->tx_ssthresh = 2; + hc->tx_cwnd = 1; + hc->tx_pipe = 0; + + /* clear state about stuff we sent */ + hc->tx_seqt = hc->tx_seqh; + hc->tx_packets_acked = 0; + + /* clear ack ratio state. */ + hc->tx_rpseq = 0; + hc->tx_rpdupack = -1; + ccid2_change_l_ack_ratio(sk, 1); + + /* if we were blocked before, we may now send cwnd=1 packet */ + if (sender_was_blocked) + tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); + /* restart backed-off timer */ + sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + struct ccid2_seq *next; + + hc->tx_pipe++; + + hc->tx_seqh->ccid2s_seq = dp->dccps_gss; + hc->tx_seqh->ccid2s_acked = 0; + hc->tx_seqh->ccid2s_sent = ccid2_time_stamp; + + next = hc->tx_seqh->ccid2s_next; + /* check if we need to alloc more space */ + if (next == hc->tx_seqt) { + if (ccid2_hc_tx_alloc_seq(hc)) { + DCCP_CRIT("packet history - out of memory!"); + /* FIXME: find a more graceful way to bail out */ + return; + } + next = hc->tx_seqh->ccid2s_next; + BUG_ON(next == hc->tx_seqt); + } + hc->tx_seqh = next; + + ccid2_pr_debug("cwnd=%d pipe=%d\n", hc->tx_cwnd, hc->tx_pipe); + + /* + * FIXME: The code below is broken and the variables have been removed + * from the socket struct. The `ackloss' variable was always set to 0, + * and with arsent there are several problems: + * (i) it doesn't just count the number of Acks, but all sent packets; + * (ii) it is expressed in # of packets, not # of windows, so the + * comparison below uses the wrong formula: Appendix A of RFC 4341 + * comes up with the number K = cwnd / (R^2 - R) of consecutive windows + * of data with no lost or marked Ack packets. If arsent were the # of + * consecutive Acks received without loss, then Ack Ratio needs to be + * decreased by 1 when + * arsent >= K * cwnd / R = cwnd^2 / (R^3 - R^2) + * where cwnd / R is the number of Acks received per window of data + * (cf. RFC 4341, App. A). The problems are that + * - arsent counts other packets as well; + * - the comparison uses a formula different from RFC 4341; + * - computing a cubic/quadratic equation each time is too complicated. + * Hence a different algorithm is needed. + */ +#if 0 + /* Ack Ratio. Need to maintain a concept of how many windows we sent */ + hc->tx_arsent++; + /* We had an ack loss in this window... 
*/ + if (hc->tx_ackloss) { + if (hc->tx_arsent >= hc->tx_cwnd) { + hc->tx_arsent = 0; + hc->tx_ackloss = 0; + } + } else { + /* No acks lost up to now... */ + /* decrease ack ratio if enough packets were sent */ + if (dp->dccps_l_ack_ratio > 1) { + /* XXX don't calculate denominator each time */ + int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio - + dp->dccps_l_ack_ratio; + + denom = hc->tx_cwnd * hc->tx_cwnd / denom; + + if (hc->tx_arsent >= denom) { + ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1); + hc->tx_arsent = 0; + } + } else { + /* we can't increase ack ratio further [1] */ + hc->tx_arsent = 0; /* or maybe set it to cwnd*/ + } + } +#endif + + sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); + +#ifdef CONFIG_IP_DCCP_CCID2_DEBUG + do { + struct ccid2_seq *seqp = hc->tx_seqt; + + while (seqp != hc->tx_seqh) { + ccid2_pr_debug("out seq=%llu acked=%d time=%u\n", + (unsigned long long)seqp->ccid2s_seq, + seqp->ccid2s_acked, seqp->ccid2s_sent); + seqp = seqp->ccid2s_next; + } + } while (0); + ccid2_pr_debug("=========\n"); +#endif +} + +/** + * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm + * This code is almost identical with TCP's tcp_rtt_estimator(), since + * - it has a higher sampling frequency (recommended by RFC 1323), + * - the RTO does not collapse into RTT due to RTTVAR going towards zero, + * - it is simple (cf. more complex proposals such as Eifel timer or research + * which suggests that the gain should be set according to window size), + * - in tests it was found to work well with CCID2 [gerrit]. + */ +static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) +{ + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + long m = mrtt ? : 1; + + if (hc->tx_srtt == 0) { + /* First measurement m */ + hc->tx_srtt = m << 3; + hc->tx_mdev = m << 1; + + hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk)); + hc->tx_rttvar = hc->tx_mdev_max; + + hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; + } else { + /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ + m -= (hc->tx_srtt >> 3); + hc->tx_srtt += m; + + /* Similarly, update scaled mdev with regard to |m| */ + if (m < 0) { + m = -m; + m -= (hc->tx_mdev >> 2); + /* + * This neutralises RTO increase when RTT < SRTT - mdev + * (see P. Sarolahti, A. Kuznetsov,"Congestion Control + * in Linux TCP", USENIX 2002, pp. 49-62). + */ + if (m > 0) + m >>= 3; + } else { + m -= (hc->tx_mdev >> 2); + } + hc->tx_mdev += m; + + if (hc->tx_mdev > hc->tx_mdev_max) { + hc->tx_mdev_max = hc->tx_mdev; + if (hc->tx_mdev_max > hc->tx_rttvar) + hc->tx_rttvar = hc->tx_mdev_max; + } + + /* + * Decay RTTVAR at most once per flight, exploiting that + * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2) + * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1) + * GAR is a useful bound for FlightSize = pipe. + * AWL is probably too low here, as it over-estimates pipe. + */ + if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) { + if (hc->tx_mdev_max < hc->tx_rttvar) + hc->tx_rttvar -= (hc->tx_rttvar - + hc->tx_mdev_max) >> 2; + hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; + hc->tx_mdev_max = tcp_rto_min(sk); + } + } + + /* + * Set RTO from SRTT and RTTVAR + * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms. + * This agrees with RFC 4341, 5: + * "Because DCCP does not retransmit data, DCCP does not require + * TCP's recommended minimum timeout of one second". 
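+ * Note that tx_srtt is kept scaled by 8 and tx_mdev by 4 (as in TCP), so
+ * after the very first sample m the assignment below yields an initial
+ * RTO of m + max(2 * m, tcp_rto_min(sk)).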
+ */ + hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar; + + if (hc->tx_rto > DCCP_RTO_MAX) + hc->tx_rto = DCCP_RTO_MAX; +} + +static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, + unsigned int *maxincr) +{ + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + + if (hc->tx_cwnd < hc->tx_ssthresh) { + if (*maxincr > 0 && ++hc->tx_packets_acked == 2) { + hc->tx_cwnd += 1; + *maxincr -= 1; + hc->tx_packets_acked = 0; + } + } else if (++hc->tx_packets_acked >= hc->tx_cwnd) { + hc->tx_cwnd += 1; + hc->tx_packets_acked = 0; + } + /* + * FIXME: RTT is sampled several times per acknowledgment (for each + * entry in the Ack Vector), instead of once per Ack (as in TCP SACK). + * This causes the RTT to be over-estimated, since the older entries + * in the Ack Vector have earlier sending times. + * The cleanest solution is to not use the ccid2s_sent field at all + * and instead use DCCP timestamps: requires changes in other places. + */ + ccid2_rtt_estimator(sk, ccid2_time_stamp - seqp->ccid2s_sent); +} + +static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) +{ + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + + if ((s32)(seqp->ccid2s_sent - hc->tx_last_cong) < 0) { + ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); + return; + } + + hc->tx_last_cong = ccid2_time_stamp; + + hc->tx_cwnd = hc->tx_cwnd / 2 ? : 1U; + hc->tx_ssthresh = max(hc->tx_cwnd, 2U); + + /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */ + if (dccp_sk(sk)->dccps_l_ack_ratio > hc->tx_cwnd) + ccid2_change_l_ack_ratio(sk, hc->tx_cwnd); +} + +static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type, + u8 option, u8 *optval, u8 optlen) +{ + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + + switch (option) { + case DCCPO_ACK_VECTOR_0: + case DCCPO_ACK_VECTOR_1: + return dccp_ackvec_parsed_add(&hc->tx_av_chunks, optval, optlen, + option - DCCPO_ACK_VECTOR_0); + } + return 0; +} + +static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + const bool sender_was_blocked = ccid2_cwnd_network_limited(hc); + struct dccp_ackvec_parsed *avp; + u64 ackno, seqno; + struct ccid2_seq *seqp; + int done = 0; + unsigned int maxincr = 0; + + /* check reverse path congestion */ + seqno = DCCP_SKB_CB(skb)->dccpd_seq; + + /* XXX this whole "algorithm" is broken. Need to fix it to keep track + * of the seqnos of the dupacks so that rpseq and rpdupack are correct + * -sorbo. 
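(Editorial aside, not part of the patch.) The window rules coded in ccid2_new_ack() and ccid2_congestion_event() above reduce to: +1 packet per two newly acked packets in slow start, +1 per full window of acked packets in congestion avoidance, and on a congestion event cwnd is halved (never below 1) with ssthresh = max(cwnd, 2). A compact model, without the per-Ack maxincr cap:

#include <stdio.h>

struct cc {
	unsigned int cwnd, ssthresh, packets_acked;
};

static void on_new_ack(struct cc *c)
{
	if (c->cwnd < c->ssthresh) {                /* slow start */
		if (++c->packets_acked == 2) {
			c->cwnd += 1;
			c->packets_acked = 0;
		}
	} else if (++c->packets_acked >= c->cwnd) { /* congestion avoidance */
		c->cwnd += 1;
		c->packets_acked = 0;
	}
}

static void on_congestion(struct cc *c)
{
	c->cwnd = c->cwnd / 2 ? c->cwnd / 2 : 1;   /* halve, never below 1 */
	c->ssthresh = c->cwnd > 2 ? c->cwnd : 2;   /* ssthresh = max(cwnd, 2) */
	c->packets_acked = 0;
}

int main(void)
{
	struct cc c = { 3, ~0U, 0 };               /* ssthresh "arbitrarily high" */

	for (int i = 0; i < 20; i++)
		on_new_ack(&c);
	printf("after 20 acks: cwnd=%u\n", c.cwnd);
	on_congestion(&c);
	printf("after loss:    cwnd=%u ssthresh=%u\n", c.cwnd, c.ssthresh);
	return 0;
}

Starting from cwnd = 3 with ssthresh unset, twenty new Acks grow cwnd to 13; one congestion event then drops it to 6.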
+ */ + /* need to bootstrap */ + if (hc->tx_rpdupack == -1) { + hc->tx_rpdupack = 0; + hc->tx_rpseq = seqno; + } else { + /* check if packet is consecutive */ + if (dccp_delta_seqno(hc->tx_rpseq, seqno) == 1) + hc->tx_rpseq = seqno; + /* it's a later packet */ + else if (after48(seqno, hc->tx_rpseq)) { + hc->tx_rpdupack++; + + /* check if we got enough dupacks */ + if (hc->tx_rpdupack >= NUMDUPACK) { + hc->tx_rpdupack = -1; /* XXX lame */ + hc->tx_rpseq = 0; + + ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio); + } + } + } + + /* check forward path congestion */ + if (dccp_packet_without_ack(skb)) + return; + + /* still didn't send out new data packets */ + if (hc->tx_seqh == hc->tx_seqt) + goto done; + + ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; + if (after48(ackno, hc->tx_high_ack)) + hc->tx_high_ack = ackno; + + seqp = hc->tx_seqt; + while (before48(seqp->ccid2s_seq, ackno)) { + seqp = seqp->ccid2s_next; + if (seqp == hc->tx_seqh) { + seqp = hc->tx_seqh->ccid2s_prev; + break; + } + } + + /* + * In slow-start, cwnd can increase up to a maximum of Ack Ratio/2 + * packets per acknowledgement. Rounding up avoids that cwnd is not + * advanced when Ack Ratio is 1 and gives a slight edge otherwise. + */ + if (hc->tx_cwnd < hc->tx_ssthresh) + maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); + + /* go through all ack vectors */ + list_for_each_entry(avp, &hc->tx_av_chunks, node) { + /* go through this ack vector */ + for (; avp->len--; avp->vec++) { + u64 ackno_end_rl = SUB48(ackno, + dccp_ackvec_runlen(avp->vec)); + + ccid2_pr_debug("ackvec %llu |%u,%u|\n", + (unsigned long long)ackno, + dccp_ackvec_state(avp->vec) >> 6, + dccp_ackvec_runlen(avp->vec)); + /* if the seqno we are analyzing is larger than the + * current ackno, then move towards the tail of our + * seqnos. 
+ */ + while (after48(seqp->ccid2s_seq, ackno)) { + if (seqp == hc->tx_seqt) { + done = 1; + break; + } + seqp = seqp->ccid2s_prev; + } + if (done) + break; + + /* check all seqnos in the range of the vector + * run length + */ + while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { + const u8 state = dccp_ackvec_state(avp->vec); + + /* new packet received or marked */ + if (state != DCCPAV_NOT_RECEIVED && + !seqp->ccid2s_acked) { + if (state == DCCPAV_ECN_MARKED) + ccid2_congestion_event(sk, + seqp); + else + ccid2_new_ack(sk, seqp, + &maxincr); + + seqp->ccid2s_acked = 1; + ccid2_pr_debug("Got ack for %llu\n", + (unsigned long long)seqp->ccid2s_seq); + hc->tx_pipe--; + } + if (seqp == hc->tx_seqt) { + done = 1; + break; + } + seqp = seqp->ccid2s_prev; + } + if (done) + break; + + ackno = SUB48(ackno_end_rl, 1); + } + if (done) + break; + } + + /* The state about what is acked should be correct now + * Check for NUMDUPACK + */ + seqp = hc->tx_seqt; + while (before48(seqp->ccid2s_seq, hc->tx_high_ack)) { + seqp = seqp->ccid2s_next; + if (seqp == hc->tx_seqh) { + seqp = hc->tx_seqh->ccid2s_prev; + break; + } + } + done = 0; + while (1) { + if (seqp->ccid2s_acked) { + done++; + if (done == NUMDUPACK) + break; + } + if (seqp == hc->tx_seqt) + break; + seqp = seqp->ccid2s_prev; + } + + /* If there are at least 3 acknowledgements, anything unacknowledged + * below the last sequence number is considered lost + */ + if (done == NUMDUPACK) { + struct ccid2_seq *last_acked = seqp; + + /* check for lost packets */ + while (1) { + if (!seqp->ccid2s_acked) { + ccid2_pr_debug("Packet lost: %llu\n", + (unsigned long long)seqp->ccid2s_seq); + /* XXX need to traverse from tail -> head in + * order to detect multiple congestion events in + * one ack vector. + */ + ccid2_congestion_event(sk, seqp); + hc->tx_pipe--; + } + if (seqp == hc->tx_seqt) + break; + seqp = seqp->ccid2s_prev; + } + + hc->tx_seqt = last_acked; + } + + /* trim acked packets in tail */ + while (hc->tx_seqt != hc->tx_seqh) { + if (!hc->tx_seqt->ccid2s_acked) + break; + + hc->tx_seqt = hc->tx_seqt->ccid2s_next; + } + + /* restart RTO timer if not all outstanding data has been acked */ + if (hc->tx_pipe == 0) + sk_stop_timer(sk, &hc->tx_rtotimer); + else + sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); +done: + /* check if incoming Acks allow pending packets to be sent */ + if (sender_was_blocked && !ccid2_cwnd_network_limited(hc)) + tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); + dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks); +} + +/* + * Convert RFC 3390 larger initial window into an equivalent number of packets. + * This is based on the numbers specified in RFC 5681, 3.1. + */ +static inline u32 rfc3390_bytes_to_packets(const u32 smss) +{ + return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3); +} + +static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) +{ + struct ccid2_hc_tx_sock *hc = ccid_priv(ccid); + struct dccp_sock *dp = dccp_sk(sk); + u32 max_ratio; + + /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ + hc->tx_ssthresh = ~0U; + + /* Use larger initial windows (RFC 4341, section 5). */ + hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); + + /* Make sure that Ack Ratio is enabled and within bounds. */ + max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2); + if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio) + dp->dccps_l_ack_ratio = max_ratio; + + /* XXX init ~ to window size... 
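(Editorial aside, not part of the patch.) rfc3390_bytes_to_packets() and the Ack Ratio bound in ccid2_hc_tx_init() above map the MSS onto an initial window of 2, 3 or 4 packets and cap Ack Ratio at ceil(cwnd / 2). A small sketch with illustrative MSS values:

#include <stdio.h>

/* RFC 5681, 3.1: initial window of 4, 3 or 2 full-sized segments,
 * expressed here as a number of packets for a given SMSS. */
static unsigned int iw_packets(unsigned int smss)
{
	if (smss <= 1095)
		return 4;
	if (smss > 2190)
		return 2;
	return 3;
}

/* Ack Ratio is kept within [1, ceil(cwnd / 2)] so that at least two
 * Acks arrive per window of data. */
static unsigned int max_ack_ratio(unsigned int cwnd)
{
	return (cwnd + 1) / 2;           /* DIV_ROUND_UP(cwnd, 2) */
}

int main(void)
{
	unsigned int mss[] = { 536, 1460, 4352 };

	for (unsigned int i = 0; i < 3; i++) {
		unsigned int cwnd = iw_packets(mss[i]);

		printf("SMSS=%4u -> IW=%u packets, max Ack Ratio=%u\n",
		       mss[i], cwnd, max_ack_ratio(cwnd));
	}
	return 0;
}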
*/ + if (ccid2_hc_tx_alloc_seq(hc)) + return -ENOMEM; + + hc->tx_rto = DCCP_TIMEOUT_INIT; + hc->tx_rpdupack = -1; + hc->tx_last_cong = ccid2_time_stamp; + setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, + (unsigned long)sk); + INIT_LIST_HEAD(&hc->tx_av_chunks); + return 0; +} + +static void ccid2_hc_tx_exit(struct sock *sk) +{ + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + int i; + + sk_stop_timer(sk, &hc->tx_rtotimer); + + for (i = 0; i < hc->tx_seqbufc; i++) + kfree(hc->tx_seqbuf[i]); + hc->tx_seqbufc = 0; +} + +static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) +{ + const struct dccp_sock *dp = dccp_sk(sk); + struct ccid2_hc_rx_sock *hc = ccid2_hc_rx_sk(sk); + + switch (DCCP_SKB_CB(skb)->dccpd_type) { + case DCCP_PKT_DATA: + case DCCP_PKT_DATAACK: + hc->rx_data++; + if (hc->rx_data >= dp->dccps_r_ack_ratio) { + dccp_send_ack(sk); + hc->rx_data = 0; + } + break; + } +} + +struct ccid_operations ccid2_ops = { + .ccid_id = DCCPC_CCID2, + .ccid_name = "TCP-like", + .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), + .ccid_hc_tx_init = ccid2_hc_tx_init, + .ccid_hc_tx_exit = ccid2_hc_tx_exit, + .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, + .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, + .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options, + .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, + .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), + .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, +}; + +#ifdef CONFIG_IP_DCCP_CCID2_DEBUG +module_param(ccid2_debug, bool, 0644); +MODULE_PARM_DESC(ccid2_debug, "Enable CCID-2 debug messages"); +#endif diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h new file mode 100644 index 00000000..e9985daf --- /dev/null +++ b/net/dccp/ccids/ccid2.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2005 Andrea Bittau + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#ifndef _DCCP_CCID2_H_ +#define _DCCP_CCID2_H_ + +#include +#include +#include "../ccid.h" +#include "../dccp.h" + +/* + * CCID-2 timestamping faces the same issues as TCP timestamping. + * Hence we reuse/share as much of the code as possible. + */ +#define ccid2_time_stamp tcp_time_stamp + +/* NUMDUPACK parameter from RFC 4341, p. 
6 */ +#define NUMDUPACK 3 + +struct ccid2_seq { + u64 ccid2s_seq; + u32 ccid2s_sent; + int ccid2s_acked; + struct ccid2_seq *ccid2s_prev; + struct ccid2_seq *ccid2s_next; +}; + +#define CCID2_SEQBUF_LEN 1024 +#define CCID2_SEQBUF_MAX 128 + +/** + * struct ccid2_hc_tx_sock - CCID2 TX half connection + * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 + * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465) + * @tx_srtt: smoothed RTT estimate, scaled by 2^3 + * @tx_mdev: smoothed RTT variation, scaled by 2^2 + * @tx_mdev_max: maximum of @mdev during one flight + * @tx_rttvar: moving average/maximum of @mdev_max + * @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988) + * @tx_rtt_seq: to decay RTTVAR at most once per flight + * @tx_rpseq: last consecutive seqno + * @tx_rpdupack: dupacks since rpseq + * @tx_av_chunks: list of Ack Vectors received on current skb + */ +struct ccid2_hc_tx_sock { + u32 tx_cwnd; + u32 tx_ssthresh; + u32 tx_pipe; + u32 tx_packets_acked; + struct ccid2_seq *tx_seqbuf[CCID2_SEQBUF_MAX]; + int tx_seqbufc; + struct ccid2_seq *tx_seqh; + struct ccid2_seq *tx_seqt; + + /* RTT measurement: variables/principles are the same as in TCP */ + u32 tx_srtt, + tx_mdev, + tx_mdev_max, + tx_rttvar, + tx_rto; + u64 tx_rtt_seq:48; + struct timer_list tx_rtotimer; + + u64 tx_rpseq; + int tx_rpdupack; + u32 tx_last_cong; + u64 tx_high_ack; + struct list_head tx_av_chunks; +}; + +static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc) +{ + return hc->tx_pipe >= hc->tx_cwnd; +} + +struct ccid2_hc_rx_sock { + int rx_data; +}; + +static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk) +{ + return ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid); +} + +static inline struct ccid2_hc_rx_sock *ccid2_hc_rx_sk(const struct sock *sk) +{ + return ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid); +} +#endif /* _DCCP_CCID2_H_ */ diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c new file mode 100644 index 00000000..3d604e13 --- /dev/null +++ b/net/dccp/ccids/ccid3.c @@ -0,0 +1,866 @@ +/* + * Copyright (c) 2007 The University of Aberdeen, Scotland, UK + * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. + * Copyright (c) 2005-7 Ian McDonald + * + * An implementation of the DCCP protocol + * + * This code has been developed by the University of Waikato WAND + * research group. For further information please see http://www.wand.net.nz/ + * + * This code also uses code from Lulea University, rereleased as GPL by its + * authors: + * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon + * + * Changes to meet Linux coding standards, to make it meet latest ccid3 draft + * and to make it work as a loadable module in the DCCP stack written by + * Arnaldo Carvalho de Melo . + * + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include "../dccp.h" +#include "ccid3.h" + +#include + +#ifdef CONFIG_IP_DCCP_CCID3_DEBUG +static int ccid3_debug; +#define ccid3_pr_debug(format, a...) DCCP_PR_DEBUG(ccid3_debug, format, ##a) +#else +#define ccid3_pr_debug(format, a...) +#endif + +/* + * Transmitter Half-Connection Routines + */ +#ifdef CONFIG_IP_DCCP_CCID3_DEBUG +static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) +{ + static const char *const ccid3_state_names[] = { + [TFRC_SSTATE_NO_SENT] = "NO_SENT", + [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", + [TFRC_SSTATE_FBACK] = "FBACK", + }; + + return ccid3_state_names[state]; +} +#endif + +static void ccid3_hc_tx_set_state(struct sock *sk, + enum ccid3_hc_tx_states state) +{ + struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); + enum ccid3_hc_tx_states oldstate = hc->tx_state; + + ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", + dccp_role(sk), sk, ccid3_tx_state_name(oldstate), + ccid3_tx_state_name(state)); + WARN_ON(state == oldstate); + hc->tx_state = state; +} + +/* + * Compute the initial sending rate X_init in the manner of RFC 3390: + * + * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT + * + * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis + * (rev-02) clarifies the use of RFC 3390 with regard to the above formula. + * For consistency with other parts of the code, X_init is scaled by 2^6. + */ +static inline u64 rfc3390_initial_rate(struct sock *sk) +{ + const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); + const __u32 w_init = clamp_t(__u32, 4380U, 2 * hc->tx_s, 4 * hc->tx_s); + + return scaled_div(w_init << 6, hc->tx_rtt); +} + +/** + * ccid3_update_send_interval - Calculate new t_ipi = s / X_inst + * This respects the granularity of X_inst (64 * bytes/second). + */ +static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc) +{ + hc->tx_t_ipi = scaled_div32(((u64)hc->tx_s) << 6, hc->tx_x); + + ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hc->tx_t_ipi, + hc->tx_s, (unsigned)(hc->tx_x >> 6)); +} + +static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now) +{ + u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count); + + return delta / hc->tx_rtt; +} + +/** + * ccid3_hc_tx_update_x - Update allowed sending rate X + * @stamp: most recent time if available - can be left NULL. + * This function tracks draft rfc3448bis, check there for latest details. + * + * Note: X and X_recv are both stored in units of 64 * bytes/second, to support + * fine-grained resolution of sending rates. This requires scaling by 2^6 + * throughout the code. Only X_calc is unscaled (in bytes/second). + * + */ +static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) +{ + struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); + __u64 min_rate = 2 * hc->tx_x_recv; + const __u64 old_x = hc->tx_x; + ktime_t now = stamp ? *stamp : ktime_get_real(); + + /* + * Handle IDLE periods: do not reduce below RFC3390 initial sending rate + * when idling [RFC 4342, 5.1]. Definition of idling is from rfc3448bis: + * a sender is idle if it has not sent anything over a 2-RTT-period. + * For consistency with X and X_recv, min_rate is also scaled by 2^6. 
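(Editorial aside, not part of the patch.) A user-space sketch of the two scaled quantities defined above: X_init = min(4s, max(2s, 4380)) / RTT, kept in units of 64 bytes/second, and t_ipi = s / X in microseconds. It assumes the tfrc scaled_div helpers multiply by 10^6 to turn per-microsecond values into per-second ones; the packet size and RTT in main() are examples only.

#include <stdio.h>
#include <stdint.h>

/* X is kept in units of 64 * bytes/second (scaled by 2^6). */
static uint64_t x_init_scaled(uint32_t s, uint32_t rtt_us)
{
	uint32_t w_init = 4380;          /* RFC 3390 byte window, clamped to [2s, 4s] */

	if (w_init < 2 * s)
		w_init = 2 * s;
	if (w_init > 4 * s)
		w_init = 4 * s;
	/* bytes/second scaled by 2^6; the * 10^6 converts from per-usec */
	return ((uint64_t)w_init << 6) * 1000000u / rtt_us;
}

/* t_ipi = s / X, in microseconds, with X scaled by 2^6 */
static uint32_t send_interval_us(uint32_t s, uint64_t x_scaled)
{
	return (uint32_t)((((uint64_t)s << 6) * 1000000u) / x_scaled);
}

int main(void)
{
	uint32_t s = 1460, rtt_us = 100000;        /* 1460 B packets, 100 ms RTT */
	uint64_t x = x_init_scaled(s, rtt_us);

	printf("X_init = %llu bytes/s, t_ipi = %u us\n",
	       (unsigned long long)(x >> 6), (unsigned)send_interval_us(s, x));
	return 0;
}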
+ */ + if (ccid3_hc_tx_idle_rtt(hc, now) >= 2) { + min_rate = rfc3390_initial_rate(sk); + min_rate = max(min_rate, 2 * hc->tx_x_recv); + } + + if (hc->tx_p > 0) { + + hc->tx_x = min(((__u64)hc->tx_x_calc) << 6, min_rate); + hc->tx_x = max(hc->tx_x, (((__u64)hc->tx_s) << 6) / TFRC_T_MBI); + + } else if (ktime_us_delta(now, hc->tx_t_ld) - (s64)hc->tx_rtt >= 0) { + + hc->tx_x = min(2 * hc->tx_x, min_rate); + hc->tx_x = max(hc->tx_x, + scaled_div(((__u64)hc->tx_s) << 6, hc->tx_rtt)); + hc->tx_t_ld = now; + } + + if (hc->tx_x != old_x) { + ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, " + "X_recv=%u\n", (unsigned)(old_x >> 6), + (unsigned)(hc->tx_x >> 6), hc->tx_x_calc, + (unsigned)(hc->tx_x_recv >> 6)); + + ccid3_update_send_interval(hc); + } +} + +/* + * Track the mean packet size `s' (cf. RFC 4342, 5.3 and RFC 3448, 4.1) + * @len: DCCP packet payload size in bytes + */ +static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hc, int len) +{ + const u16 old_s = hc->tx_s; + + hc->tx_s = tfrc_ewma(hc->tx_s, len, 9); + + if (hc->tx_s != old_s) + ccid3_update_send_interval(hc); +} + +/* + * Update Window Counter using the algorithm from [RFC 4342, 8.1]. + * As elsewhere, RTT > 0 is assumed by using dccp_sample_rtt(). + */ +static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hc, + ktime_t now) +{ + u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count), + quarter_rtts = (4 * delta) / hc->tx_rtt; + + if (quarter_rtts > 0) { + hc->tx_t_last_win_count = now; + hc->tx_last_win_count += min(quarter_rtts, 5U); + hc->tx_last_win_count &= 0xF; /* mod 16 */ + } +} + +static void ccid3_hc_tx_no_feedback_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); + unsigned long t_nfb = USEC_PER_SEC / 5; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. */ + /* XXX: set some sensible MIB */ + goto restart_timer; + } + + ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk, + ccid3_tx_state_name(hc->tx_state)); + + /* Ignore and do not restart after leaving the established state */ + if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) + goto out; + + /* Reset feedback state to "no feedback received" */ + if (hc->tx_state == TFRC_SSTATE_FBACK) + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); + + /* + * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 + * RTO is 0 if and only if no feedback has been received yet. + */ + if (hc->tx_t_rto == 0 || hc->tx_p == 0) { + + /* halve send rate directly */ + hc->tx_x = max(hc->tx_x / 2, + (((__u64)hc->tx_s) << 6) / TFRC_T_MBI); + ccid3_update_send_interval(hc); + } else { + /* + * Modify the cached value of X_recv + * + * If (X_calc > 2 * X_recv) + * X_recv = max(X_recv / 2, s / (2 * t_mbi)); + * Else + * X_recv = X_calc / 4; + * + * Note that X_recv is scaled by 2^6 while X_calc is not + */ + BUG_ON(hc->tx_p && !hc->tx_x_calc); + + if (hc->tx_x_calc > (hc->tx_x_recv >> 5)) + hc->tx_x_recv = + max(hc->tx_x_recv / 2, + (((__u64)hc->tx_s) << 6) / (2*TFRC_T_MBI)); + else { + hc->tx_x_recv = hc->tx_x_calc; + hc->tx_x_recv <<= 4; + } + ccid3_hc_tx_update_x(sk, NULL); + } + ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n", + (unsigned long long)hc->tx_x); + + /* + * Set new timeout for the nofeedback timer. + * See comments in packet_recv() regarding the value of t_RTO. 
+ */ + if (unlikely(hc->tx_t_rto == 0)) /* no feedback received yet */ + t_nfb = TFRC_INITIAL_TIMEOUT; + else + t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi); + +restart_timer: + sk_reset_timer(sk, &hc->tx_no_feedback_timer, + jiffies + usecs_to_jiffies(t_nfb)); +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +/** + * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets + * @skb: next packet candidate to send on @sk + * This function uses the convention of ccid_packet_dequeue_eval() and + * returns a millisecond-delay value between 0 and t_mbi = 64000 msec. + */ +static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); + ktime_t now = ktime_get_real(); + s64 delay; + + /* + * This function is called only for Data and DataAck packets. Sending + * zero-sized Data(Ack)s is theoretically possible, but for congestion + * control this case is pathological - ignore it. + */ + if (unlikely(skb->len == 0)) + return -EBADMSG; + + if (hc->tx_state == TFRC_SSTATE_NO_SENT) { + sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies + + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); + hc->tx_last_win_count = 0; + hc->tx_t_last_win_count = now; + + /* Set t_0 for initial packet */ + hc->tx_t_nom = now; + + hc->tx_s = skb->len; + + /* + * Use initial RTT sample when available: recommended by erratum + * to RFC 4342. This implements the initialisation procedure of + * draft rfc3448bis, section 4.2. Remember, X is scaled by 2^6. + */ + if (dp->dccps_syn_rtt) { + ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt); + hc->tx_rtt = dp->dccps_syn_rtt; + hc->tx_x = rfc3390_initial_rate(sk); + hc->tx_t_ld = now; + } else { + /* + * Sender does not have RTT sample: + * - set fallback RTT (RFC 4340, 3.4) since a RTT value + * is needed in several parts (e.g. window counter); + * - set sending rate X_pps = 1pps as per RFC 3448, 4.2. + */ + hc->tx_rtt = DCCP_FALLBACK_RTT; + hc->tx_x = hc->tx_s; + hc->tx_x <<= 6; + } + ccid3_update_send_interval(hc); + + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); + + } else { + delay = ktime_us_delta(hc->tx_t_nom, now); + ccid3_pr_debug("delay=%ld\n", (long)delay); + /* + * Scheduling of packet transmissions (RFC 5348, 8.3) + * + * if (t_now > t_nom - delta) + * // send the packet now + * else + * // send the packet in (t_nom - t_now) milliseconds. + */ + if (delay >= TFRC_T_DELTA) + return (u32)delay / USEC_PER_MSEC; + + ccid3_hc_tx_update_win_count(hc, now); + } + + /* prepare to send now (add options etc.) 
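(Editorial aside, not part of the patch.) The dequeue decision in ccid3_hc_tx_send_packet() above boils down to: if the nominal send time t_nom lies more than t_delta in the future, report the remaining delay in milliseconds; otherwise send immediately and advance t_nom by t_ipi. A minimal model, with t_delta fixed at an illustrative 1 ms:

#include <stdio.h>
#include <stdint.h>

#define T_DELTA_US 1000                         /* illustrative t_delta of 1 ms */

/* Returns 0 to send now (advancing *t_nom_us), otherwise the delay in ms. */
static long schedule_packet(int64_t *t_nom_us, int64_t now_us, uint32_t t_ipi_us)
{
	int64_t delay = *t_nom_us - now_us;

	if (delay >= T_DELTA_US)
		return (long)(delay / 1000);    /* too early: try again later */

	*t_nom_us += t_ipi_us;                  /* send now, set next slot    */
	return 0;
}

int main(void)
{
	int64_t t_nom = 10000, now = 9500;      /* only 0.5 ms early: send now */
	long d = schedule_packet(&t_nom, now, 5000);

	printf("delay=%ld ms, next t_nom=%lld us\n", d, (long long)t_nom);

	now = 2000;                             /* 13 ms early: wait           */
	printf("delay=%ld ms\n", schedule_packet(&t_nom, now, 5000));
	return 0;
}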
*/ + dp->dccps_hc_tx_insert_options = 1; + DCCP_SKB_CB(skb)->dccpd_ccval = hc->tx_last_win_count; + + /* set the nominal send time for the next following packet */ + hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi); + return CCID_PACKET_SEND_AT_ONCE; +} + +static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) +{ + struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); + + ccid3_hc_tx_update_s(hc, len); + + if (tfrc_tx_hist_add(&hc->tx_hist, dccp_sk(sk)->dccps_gss)) + DCCP_CRIT("packet history - out of memory!"); +} + +static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) +{ + struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); + struct tfrc_tx_hist_entry *acked; + ktime_t now; + unsigned long t_nfb; + u32 r_sample; + + /* we are only interested in ACKs */ + if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || + DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) + return; + /* + * Locate the acknowledged packet in the TX history. + * + * Returning "entry not found" here can for instance happen when + * - the host has not sent out anything (e.g. a passive server), + * - the Ack is outdated (packet with higher Ack number was received), + * - it is a bogus Ack (for a packet not sent on this connection). + */ + acked = tfrc_tx_hist_find_entry(hc->tx_hist, dccp_hdr_ack_seq(skb)); + if (acked == NULL) + return; + /* For the sake of RTT sampling, ignore/remove all older entries */ + tfrc_tx_hist_purge(&acked->next); + + /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */ + now = ktime_get_real(); + r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp)); + hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9); + + /* + * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 + */ + if (hc->tx_state == TFRC_SSTATE_NO_FBACK) { + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); + + if (hc->tx_t_rto == 0) { + /* + * Initial feedback packet: Larger Initial Windows (4.2) + */ + hc->tx_x = rfc3390_initial_rate(sk); + hc->tx_t_ld = now; + + ccid3_update_send_interval(hc); + + goto done_computing_x; + } else if (hc->tx_p == 0) { + /* + * First feedback after nofeedback timer expiry (4.3) + */ + goto done_computing_x; + } + } + + /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ + if (hc->tx_p > 0) + hc->tx_x_calc = tfrc_calc_x(hc->tx_s, hc->tx_rtt, hc->tx_p); + ccid3_hc_tx_update_x(sk, &now); + +done_computing_x: + ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " + "p=%u, X_calc=%u, X_recv=%u, X=%u\n", + dccp_role(sk), sk, hc->tx_rtt, r_sample, + hc->tx_s, hc->tx_p, hc->tx_x_calc, + (unsigned)(hc->tx_x_recv >> 6), + (unsigned)(hc->tx_x >> 6)); + + /* unschedule no feedback timer */ + sk_stop_timer(sk, &hc->tx_no_feedback_timer); + + /* + * As we have calculated new ipi, delta, t_nom it is possible + * that we now can send a packet, so wake up dccp_wait_for_ccid + */ + sk->sk_write_space(sk); + + /* + * Update timeout interval for the nofeedback timer. In order to control + * rate halving on networks with very low RTTs (<= 1 ms), use per-route + * tunable RTAX_RTO_MIN value as the lower bound. 
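(Editorial aside, not part of the patch.) The nofeedback timer armed just below uses t_RTO = max(4 * R, RTO_min) and t_nfb = max(t_RTO, 2 * t_ipi). A tiny worked example; the 200 ms lower bound is a placeholder, the real code derives it from the per-route RTAX_RTO_MIN value:

#include <stdio.h>
#include <stdint.h>

static uint32_t max_u32(uint32_t a, uint32_t b)
{
	return a > b ? a : b;
}

int main(void)
{
	uint32_t rtt_us   = 20000;    /* 20 ms smoothed RTT              */
	uint32_t t_ipi_us = 5000;     /* 5 ms inter-packet interval      */
	uint32_t rto_min  = 200000;   /* placeholder lower bound, 200 ms */

	uint32_t t_rto = max_u32(4 * rtt_us, rto_min);
	uint32_t t_nfb = max_u32(t_rto, 2 * t_ipi_us);

	printf("t_RTO=%u us, t_nfb=%u us\n", t_rto, t_nfb);
	return 0;
}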
+ */ + hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt, + USEC_PER_SEC/HZ * tcp_rto_min(sk)); + /* + * Schedule no feedback timer to expire in + * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) + */ + t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi); + + ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " + "expire in %lu jiffies (%luus)\n", + dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb); + + sk_reset_timer(sk, &hc->tx_no_feedback_timer, + jiffies + usecs_to_jiffies(t_nfb)); +} + +static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, + u8 option, u8 *optval, u8 optlen) +{ + struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); + __be32 opt_val; + + switch (option) { + case TFRC_OPT_RECEIVE_RATE: + case TFRC_OPT_LOSS_EVENT_RATE: + /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */ + if (packet_type == DCCP_PKT_DATA) + break; + if (unlikely(optlen != 4)) { + DCCP_WARN("%s(%p), invalid len %d for %u\n", + dccp_role(sk), sk, optlen, option); + return -EINVAL; + } + opt_val = ntohl(get_unaligned((__be32 *)optval)); + + if (option == TFRC_OPT_RECEIVE_RATE) { + /* Receive Rate is kept in units of 64 bytes/second */ + hc->tx_x_recv = opt_val; + hc->tx_x_recv <<= 6; + + ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", + dccp_role(sk), sk, opt_val); + } else { + /* Update the fixpoint Loss Event Rate fraction */ + hc->tx_p = tfrc_invert_loss_event_rate(opt_val); + + ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", + dccp_role(sk), sk, opt_val); + } + } + return 0; +} + +static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) +{ + struct ccid3_hc_tx_sock *hc = ccid_priv(ccid); + + hc->tx_state = TFRC_SSTATE_NO_SENT; + hc->tx_hist = NULL; + setup_timer(&hc->tx_no_feedback_timer, + ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); + return 0; +} + +static void ccid3_hc_tx_exit(struct sock *sk) +{ + struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); + + sk_stop_timer(sk, &hc->tx_no_feedback_timer); + tfrc_tx_hist_purge(&hc->tx_hist); +} + +static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) +{ + info->tcpi_rto = ccid3_hc_tx_sk(sk)->tx_t_rto; + info->tcpi_rtt = ccid3_hc_tx_sk(sk)->tx_rtt; +} + +static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, + u32 __user *optval, int __user *optlen) +{ + const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); + struct tfrc_tx_info tfrc; + const void *val; + + switch (optname) { + case DCCP_SOCKOPT_CCID_TX_INFO: + if (len < sizeof(tfrc)) + return -EINVAL; + tfrc.tfrctx_x = hc->tx_x; + tfrc.tfrctx_x_recv = hc->tx_x_recv; + tfrc.tfrctx_x_calc = hc->tx_x_calc; + tfrc.tfrctx_rtt = hc->tx_rtt; + tfrc.tfrctx_p = hc->tx_p; + tfrc.tfrctx_rto = hc->tx_t_rto; + tfrc.tfrctx_ipi = hc->tx_t_ipi; + len = sizeof(tfrc); + val = &tfrc; + break; + default: + return -ENOPROTOOPT; + } + + if (put_user(len, optlen) || copy_to_user(optval, val, len)) + return -EFAULT; + + return 0; +} + +/* + * Receiver Half-Connection Routines + */ + +/* CCID3 feedback types */ +enum ccid3_fback_type { + CCID3_FBACK_NONE = 0, + CCID3_FBACK_INITIAL, + CCID3_FBACK_PERIODIC, + CCID3_FBACK_PARAM_CHANGE +}; + +#ifdef CONFIG_IP_DCCP_CCID3_DEBUG +static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) +{ + static const char *const ccid3_rx_state_names[] = { + [TFRC_RSTATE_NO_DATA] = "NO_DATA", + [TFRC_RSTATE_DATA] = "DATA", + }; + + return ccid3_rx_state_names[state]; +} +#endif + +static void ccid3_hc_rx_set_state(struct sock *sk, + enum ccid3_hc_rx_states state) +{ + struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); + enum 
ccid3_hc_rx_states oldstate = hc->rx_state; + + ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", + dccp_role(sk), sk, ccid3_rx_state_name(oldstate), + ccid3_rx_state_name(state)); + WARN_ON(state == oldstate); + hc->rx_state = state; +} + +static void ccid3_hc_rx_send_feedback(struct sock *sk, + const struct sk_buff *skb, + enum ccid3_fback_type fbtype) +{ + struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); + struct dccp_sock *dp = dccp_sk(sk); + ktime_t now = ktime_get_real(); + s64 delta = 0; + + switch (fbtype) { + case CCID3_FBACK_INITIAL: + hc->rx_x_recv = 0; + hc->rx_pinv = ~0U; /* see RFC 4342, 8.5 */ + break; + case CCID3_FBACK_PARAM_CHANGE: + /* + * When parameters change (new loss or p > p_prev), we do not + * have a reliable estimate for R_m of [RFC 3448, 6.2] and so + * need to reuse the previous value of X_recv. However, when + * X_recv was 0 (due to early loss), this would kill X down to + * s/t_mbi (i.e. one packet in 64 seconds). + * To avoid such drastic reduction, we approximate X_recv as + * the number of bytes since last feedback. + * This is a safe fallback, since X is bounded above by X_calc. + */ + if (hc->rx_x_recv > 0) + break; + /* fall through */ + case CCID3_FBACK_PERIODIC: + delta = ktime_us_delta(now, hc->rx_tstamp_last_feedback); + if (delta <= 0) + DCCP_BUG("delta (%ld) <= 0", (long)delta); + else + hc->rx_x_recv = scaled_div32(hc->rx_bytes_recv, delta); + break; + default: + return; + } + + ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta, + hc->rx_x_recv, hc->rx_pinv); + + hc->rx_tstamp_last_feedback = now; + hc->rx_last_counter = dccp_hdr(skb)->dccph_ccval; + hc->rx_bytes_recv = 0; + + dp->dccps_hc_rx_insert_options = 1; + dccp_send_ack(sk); +} + +static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) +{ + const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); + __be32 x_recv, pinv; + + if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) + return 0; + + if (dccp_packet_without_ack(skb)) + return 0; + + x_recv = htonl(hc->rx_x_recv); + pinv = htonl(hc->rx_pinv); + + if (dccp_insert_option(skb, TFRC_OPT_LOSS_EVENT_RATE, + &pinv, sizeof(pinv)) || + dccp_insert_option(skb, TFRC_OPT_RECEIVE_RATE, + &x_recv, sizeof(x_recv))) + return -1; + + return 0; +} + +/** + * ccid3_first_li - Implements [RFC 5348, 6.3.1] + * + * Determine the length of the first loss interval via inverse lookup. + * Assume that X_recv can be computed by the throughput equation + * s + * X_recv = -------- + * R * fval + * Find some p such that f(p) = fval; return 1/p (scaled). + */ +static u32 ccid3_first_li(struct sock *sk) +{ + struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); + u32 x_recv, p, delta; + u64 fval; + + if (hc->rx_rtt == 0) { + DCCP_WARN("No RTT estimate available, using fallback RTT\n"); + hc->rx_rtt = DCCP_FALLBACK_RTT; + } + + delta = ktime_to_us(net_timedelta(hc->rx_tstamp_last_feedback)); + x_recv = scaled_div32(hc->rx_bytes_recv, delta); + if (x_recv == 0) { /* would also trigger divide-by-zero */ + DCCP_WARN("X_recv==0\n"); + if (hc->rx_x_recv == 0) { + DCCP_BUG("stored value of X_recv is zero"); + return ~0U; + } + x_recv = hc->rx_x_recv; + } + + fval = scaled_div(hc->rx_s, hc->rx_rtt); + fval = scaled_div32(fval, x_recv); + p = tfrc_calc_x_reverse_lookup(fval); + + ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " + "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); + + return p == 0 ? 
~0U : scaled_div(1, p); +} + +static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) +{ + struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); + enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE; + const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; + const bool is_data_packet = dccp_data_packet(skb); + + if (unlikely(hc->rx_state == TFRC_RSTATE_NO_DATA)) { + if (is_data_packet) { + const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; + do_feedback = CCID3_FBACK_INITIAL; + ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); + hc->rx_s = payload; + /* + * Not necessary to update rx_bytes_recv here, + * since X_recv = 0 for the first feedback packet (cf. + * RFC 3448, 6.3) -- gerrit + */ + } + goto update_records; + } + + if (tfrc_rx_hist_duplicate(&hc->rx_hist, skb)) + return; /* done receiving */ + + if (is_data_packet) { + const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; + /* + * Update moving-average of s and the sum of received payload bytes + */ + hc->rx_s = tfrc_ewma(hc->rx_s, payload, 9); + hc->rx_bytes_recv += payload; + } + + /* + * Perform loss detection and handle pending losses + */ + if (tfrc_rx_handle_loss(&hc->rx_hist, &hc->rx_li_hist, + skb, ndp, ccid3_first_li, sk)) { + do_feedback = CCID3_FBACK_PARAM_CHANGE; + goto done_receiving; + } + + if (tfrc_rx_hist_loss_pending(&hc->rx_hist)) + return; /* done receiving */ + + /* + * Handle data packets: RTT sampling and monitoring p + */ + if (unlikely(!is_data_packet)) + goto update_records; + + if (!tfrc_lh_is_initialised(&hc->rx_li_hist)) { + const u32 sample = tfrc_rx_hist_sample_rtt(&hc->rx_hist, skb); + /* + * Empty loss history: no loss so far, hence p stays 0. + * Sample RTT values, since an RTT estimate is required for the + * computation of p when the first loss occurs; RFC 3448, 6.3.1. + */ + if (sample != 0) + hc->rx_rtt = tfrc_ewma(hc->rx_rtt, sample, 9); + + } else if (tfrc_lh_update_i_mean(&hc->rx_li_hist, skb)) { + /* + * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean + * has decreased (resp. p has increased), send feedback now. 
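(Editorial aside, not part of the patch.) Both the packet size s and the RTT above are smoothed with tfrc_ewma(..., 9). The sketch below assumes the usual tfrc convention of weights out of ten, i.e. new_avg = (9 * avg + 1 * sample) / 10, seeded with the first sample; that definition is not visible in this hunk and is an assumption.

#include <stdio.h>
#include <stdint.h>

/* avg_new = (w * avg + (10 - w) * sample) / 10, seeded with the first sample
 * (assumed fixed-point convention of the tfrc_ewma() helper). */
static uint32_t ewma10(uint32_t avg, uint32_t sample, unsigned int w)
{
	return avg ? (w * avg + (10 - w) * sample) / 10 : sample;
}

int main(void)
{
	uint32_t rtt = 0;
	uint32_t samples[] = { 100, 120, 80, 100 };

	for (unsigned int i = 0; i < 4; i++)
		rtt = ewma10(rtt, samples[i], 9);
	printf("smoothed rtt = %u\n", rtt);
	return 0;
}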
+ */ + do_feedback = CCID3_FBACK_PARAM_CHANGE; + } + + /* + * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 + */ + if (SUB16(dccp_hdr(skb)->dccph_ccval, hc->rx_last_counter) > 3) + do_feedback = CCID3_FBACK_PERIODIC; + +update_records: + tfrc_rx_hist_add_packet(&hc->rx_hist, skb, ndp); + +done_receiving: + if (do_feedback) + ccid3_hc_rx_send_feedback(sk, skb, do_feedback); +} + +static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) +{ + struct ccid3_hc_rx_sock *hc = ccid_priv(ccid); + + hc->rx_state = TFRC_RSTATE_NO_DATA; + tfrc_lh_init(&hc->rx_li_hist); + return tfrc_rx_hist_alloc(&hc->rx_hist); +} + +static void ccid3_hc_rx_exit(struct sock *sk) +{ + struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); + + tfrc_rx_hist_purge(&hc->rx_hist); + tfrc_lh_cleanup(&hc->rx_li_hist); +} + +static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) +{ + info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state; + info->tcpi_options |= TCPI_OPT_TIMESTAMPS; + info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rx_rtt; +} + +static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, + u32 __user *optval, int __user *optlen) +{ + const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); + struct tfrc_rx_info rx_info; + const void *val; + + switch (optname) { + case DCCP_SOCKOPT_CCID_RX_INFO: + if (len < sizeof(rx_info)) + return -EINVAL; + rx_info.tfrcrx_x_recv = hc->rx_x_recv; + rx_info.tfrcrx_rtt = hc->rx_rtt; + rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hc->rx_pinv); + len = sizeof(rx_info); + val = &rx_info; + break; + default: + return -ENOPROTOOPT; + } + + if (put_user(len, optlen) || copy_to_user(optval, val, len)) + return -EFAULT; + + return 0; +} + +struct ccid_operations ccid3_ops = { + .ccid_id = DCCPC_CCID3, + .ccid_name = "TCP-Friendly Rate Control", + .ccid_hc_tx_obj_size = sizeof(struct ccid3_hc_tx_sock), + .ccid_hc_tx_init = ccid3_hc_tx_init, + .ccid_hc_tx_exit = ccid3_hc_tx_exit, + .ccid_hc_tx_send_packet = ccid3_hc_tx_send_packet, + .ccid_hc_tx_packet_sent = ccid3_hc_tx_packet_sent, + .ccid_hc_tx_packet_recv = ccid3_hc_tx_packet_recv, + .ccid_hc_tx_parse_options = ccid3_hc_tx_parse_options, + .ccid_hc_rx_obj_size = sizeof(struct ccid3_hc_rx_sock), + .ccid_hc_rx_init = ccid3_hc_rx_init, + .ccid_hc_rx_exit = ccid3_hc_rx_exit, + .ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options, + .ccid_hc_rx_packet_recv = ccid3_hc_rx_packet_recv, + .ccid_hc_rx_get_info = ccid3_hc_rx_get_info, + .ccid_hc_tx_get_info = ccid3_hc_tx_get_info, + .ccid_hc_rx_getsockopt = ccid3_hc_rx_getsockopt, + .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt, +}; + +#ifdef CONFIG_IP_DCCP_CCID3_DEBUG +module_param(ccid3_debug, bool, 0644); +MODULE_PARM_DESC(ccid3_debug, "Enable CCID-3 debug messages"); +#endif diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h new file mode 100644 index 00000000..1a9933c2 --- /dev/null +++ b/net/dccp/ccids/ccid3.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. + * Copyright (c) 2007 The University of Aberdeen, Scotland, UK + * + * An implementation of the DCCP protocol + * + * This code has been developed by the University of Waikato WAND + * research group. 
For further information please see http://www.wand.net.nz/ + * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz + * + * This code also uses code from Lulea University, rereleased as GPL by its + * authors: + * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon + * + * Changes to meet Linux coding standards, to make it meet latest ccid3 draft + * and to make it work as a loadable module in the DCCP stack written by + * Arnaldo Carvalho de Melo . + * + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#ifndef _DCCP_CCID3_H_ +#define _DCCP_CCID3_H_ + +#include +#include +#include +#include +#include "lib/tfrc.h" +#include "../ccid.h" + +/* Two seconds as per RFC 5348, 4.2 */ +#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) + +/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ +#define TFRC_T_MBI 64 + +/* + * The t_delta parameter (RFC 5348, 8.3): delays of less than %USEC_PER_MSEC are + * rounded down to 0, since sk_reset_timer() here uses millisecond granularity. + * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse + * resolution of HZ < 500 means that the error is below one timer tick (t_gran) + * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ). 
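(Editorial aside, not part of the patch.) The t_delta constant described above and defined just below is 1 ms whenever HZ >= 500, and half a timer tick otherwise. A short sketch of that selection:

#include <stdio.h>

#define USEC_PER_SEC  1000000
#define USEC_PER_MSEC 1000

/* t_delta in microseconds for a given timer frequency HZ */
static unsigned int t_delta_us(unsigned int hz)
{
	return hz >= 500 ? USEC_PER_MSEC : USEC_PER_SEC / (2 * hz);
}

int main(void)
{
	unsigned int hz_values[] = { 100, 250, 1000 };

	for (unsigned int i = 0; i < 3; i++)
		printf("HZ=%-4u -> t_delta = %u us\n",
		       hz_values[i], t_delta_us(hz_values[i]));
	return 0;
}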
+ */ +#if (HZ >= 500) +# define TFRC_T_DELTA USEC_PER_MSEC +#else +# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ)) +#endif + +enum ccid3_options { + TFRC_OPT_LOSS_EVENT_RATE = 192, + TFRC_OPT_LOSS_INTERVALS = 193, + TFRC_OPT_RECEIVE_RATE = 194, +}; + +/* TFRC sender states */ +enum ccid3_hc_tx_states { + TFRC_SSTATE_NO_SENT = 1, + TFRC_SSTATE_NO_FBACK, + TFRC_SSTATE_FBACK, +}; + +/** + * struct ccid3_hc_tx_sock - CCID3 sender half-connection socket + * @tx_x: Current sending rate in 64 * bytes per second + * @tx_x_recv: Receive rate in 64 * bytes per second + * @tx_x_calc: Calculated rate in bytes per second + * @tx_rtt: Estimate of current round trip time in usecs + * @tx_p: Current loss event rate (0-1) scaled by 1000000 + * @tx_s: Packet size in bytes + * @tx_t_rto: Nofeedback Timer setting in usecs + * @tx_t_ipi: Interpacket (send) interval (RFC 3448, 4.6) in usecs + * @tx_state: Sender state, one of %ccid3_hc_tx_states + * @tx_last_win_count: Last window counter sent + * @tx_t_last_win_count: Timestamp of earliest packet + * with last_win_count value sent + * @tx_no_feedback_timer: Handle to no feedback timer + * @tx_t_ld: Time last doubled during slow start + * @tx_t_nom: Nominal send time of next packet + * @tx_hist: Packet history + */ +struct ccid3_hc_tx_sock { + u64 tx_x; + u64 tx_x_recv; + u32 tx_x_calc; + u32 tx_rtt; + u32 tx_p; + u32 tx_t_rto; + u32 tx_t_ipi; + u16 tx_s; + enum ccid3_hc_tx_states tx_state:8; + u8 tx_last_win_count; + ktime_t tx_t_last_win_count; + struct timer_list tx_no_feedback_timer; + ktime_t tx_t_ld; + ktime_t tx_t_nom; + struct tfrc_tx_hist_entry *tx_hist; +}; + +static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) +{ + struct ccid3_hc_tx_sock *hctx = ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid); + BUG_ON(hctx == NULL); + return hctx; +} + +/* TFRC receiver states */ +enum ccid3_hc_rx_states { + TFRC_RSTATE_NO_DATA = 1, + TFRC_RSTATE_DATA, +}; + +/** + * struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket + * @rx_last_counter: Tracks window counter (RFC 4342, 8.1) + * @rx_state: Receiver state, one of %ccid3_hc_rx_states + * @rx_bytes_recv: Total sum of DCCP payload bytes + * @rx_x_recv: Receiver estimate of send rate (RFC 3448, sec. 4.3) + * @rx_rtt: Receiver estimate of RTT + * @rx_tstamp_last_feedback: Time at which last feedback was sent + * @rx_hist: Packet history (loss detection + RTT sampling) + * @rx_li_hist: Loss Interval database + * @rx_s: Received packet size in bytes + * @rx_pinv: Inverse of Loss Event Rate (RFC 4342, sec. 8.5) + */ +struct ccid3_hc_rx_sock { + u8 rx_last_counter:4; + enum ccid3_hc_rx_states rx_state:8; + u32 rx_bytes_recv; + u32 rx_x_recv; + u32 rx_rtt; + ktime_t rx_tstamp_last_feedback; + struct tfrc_rx_hist rx_hist; + struct tfrc_loss_hist rx_li_hist; + u16 rx_s; +#define rx_pinv rx_li_hist.i_mean +}; + +static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk) +{ + struct ccid3_hc_rx_sock *hcrx = ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid); + BUG_ON(hcrx == NULL); + return hcrx; +} + +#endif /* _DCCP_CCID3_H_ */ diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c new file mode 100644 index 00000000..497723c4 --- /dev/null +++ b/net/dccp/ccids/lib/loss_interval.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2007 The University of Aberdeen, Scotland, UK + * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. 
+ * Copyright (c) 2005-7 Ian McDonald + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ +#include +#include "tfrc.h" + +static struct kmem_cache *tfrc_lh_slab __read_mostly; +/* Loss Interval weights from [RFC 3448, 5.4], scaled by 10 */ +static const int tfrc_lh_weights[NINTERVAL] = { 10, 10, 10, 10, 8, 6, 4, 2 }; + +/* implements LIFO semantics on the array */ +static inline u8 LIH_INDEX(const u8 ctr) +{ + return LIH_SIZE - 1 - (ctr % LIH_SIZE); +} + +/* the `counter' index always points at the next entry to be populated */ +static inline struct tfrc_loss_interval *tfrc_lh_peek(struct tfrc_loss_hist *lh) +{ + return lh->counter ? lh->ring[LIH_INDEX(lh->counter - 1)] : NULL; +} + +/* given i with 0 <= i <= k, return I_i as per the rfc3448bis notation */ +static inline u32 tfrc_lh_get_interval(struct tfrc_loss_hist *lh, const u8 i) +{ + BUG_ON(i >= lh->counter); + return lh->ring[LIH_INDEX(lh->counter - i - 1)]->li_length; +} + +/* + * On-demand allocation and de-allocation of entries + */ +static struct tfrc_loss_interval *tfrc_lh_demand_next(struct tfrc_loss_hist *lh) +{ + if (lh->ring[LIH_INDEX(lh->counter)] == NULL) + lh->ring[LIH_INDEX(lh->counter)] = kmem_cache_alloc(tfrc_lh_slab, + GFP_ATOMIC); + return lh->ring[LIH_INDEX(lh->counter)]; +} + +void tfrc_lh_cleanup(struct tfrc_loss_hist *lh) +{ + if (!tfrc_lh_is_initialised(lh)) + return; + + for (lh->counter = 0; lh->counter < LIH_SIZE; lh->counter++) + if (lh->ring[LIH_INDEX(lh->counter)] != NULL) { + kmem_cache_free(tfrc_lh_slab, + lh->ring[LIH_INDEX(lh->counter)]); + lh->ring[LIH_INDEX(lh->counter)] = NULL; + } +} + +static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh) +{ + u32 i_i, i_tot0 = 0, i_tot1 = 0, w_tot = 0; + int i, k = tfrc_lh_length(lh) - 1; /* k is as in rfc3448bis, 5.4 */ + + if (k <= 0) + return; + + for (i = 0; i <= k; i++) { + i_i = tfrc_lh_get_interval(lh, i); + + if (i < k) { + i_tot0 += i_i * tfrc_lh_weights[i]; + w_tot += tfrc_lh_weights[i]; + } + if (i > 0) + i_tot1 += i_i * tfrc_lh_weights[i-1]; + } + + lh->i_mean = max(i_tot0, i_tot1) / w_tot; +} + +/** + * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 + * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev + */ +u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) +{ + struct tfrc_loss_interval *cur = tfrc_lh_peek(lh); + u32 old_i_mean = lh->i_mean; + s64 len; + + if (cur == NULL) /* not initialised */ + return 0; + + len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1; + + if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */ + return 0; + + if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4) + /* + * Implements RFC 4342, 10.2: + * If a packet S (skb) exists whose seqno comes `after' the one + * starting the current loss interval (cur) and if the modulo-16 + * distance from C(cur) to C(S) is greater than 4, consider all + * subsequent packets as belonging to a new loss interval. This + * test is necessary since CCVal may wrap between intervals. 
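(Editorial aside, not part of the patch.) tfrc_lh_calc_i_mean() above implements the weighted Average Loss Interval of RFC 3448, 5.4: two weighted sums, one over I_0..I_{k-1} and one over I_1..I_k, with I_mean the larger of the two divided by the total weight. A standalone restatement with example interval lengths:

#include <stdio.h>
#include <stdint.h>

#define NINTERVAL 8
/* Loss Interval weights from RFC 3448, 5.4, scaled by 10 */
static const int w[NINTERVAL] = { 10, 10, 10, 10, 8, 6, 4, 2 };

/* intervals[0] is the open interval I_0, intervals[k] the oldest one used */
static uint32_t i_mean(const uint32_t *intervals, int k)
{
	uint32_t tot0 = 0, tot1 = 0, w_tot = 0;

	if (k <= 0)
		return intervals[0];
	for (int i = 0; i <= k; i++) {
		if (i < k) {
			tot0 += intervals[i] * w[i];       /* sum over I_0 .. I_{k-1} */
			w_tot += w[i];
		}
		if (i > 0)
			tot1 += intervals[i] * w[i - 1];   /* sum over I_1 .. I_k     */
	}
	return (tot0 > tot1 ? tot0 : tot1) / w_tot;
}

int main(void)
{
	/* example: the open interval is still short, older intervals are longer */
	uint32_t iv[] = { 12, 90, 100, 110, 95, 105, 98, 102, 100 };

	printf("I_mean = %u packets\n", i_mean(iv, 8));
	return 0;
}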
+ */ + cur->li_is_closed = 1; + + if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */ + return 0; + + cur->li_length = len; + tfrc_lh_calc_i_mean(lh); + + return lh->i_mean < old_i_mean; +} + +/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ +static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, + struct tfrc_rx_hist_entry *new_loss) +{ + return dccp_delta_seqno(cur->li_seqno, new_loss->tfrchrx_seqno) > 0 && + (cur->li_is_closed || SUB16(new_loss->tfrchrx_ccval, cur->li_ccval) > 4); +} + +/** + * tfrc_lh_interval_add - Insert new record into the Loss Interval database + * @lh: Loss Interval database + * @rh: Receive history containing a fresh loss event + * @calc_first_li: Caller-dependent routine to compute length of first interval + * @sk: Used by @calc_first_li in caller-specific way (subtyping) + * Updates I_mean and returns 1 if a new interval has in fact been added to @lh. + */ +int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, + u32 (*calc_first_li)(struct sock *), struct sock *sk) +{ + struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new; + + if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh))) + return 0; + + new = tfrc_lh_demand_next(lh); + if (unlikely(new == NULL)) { + DCCP_CRIT("Cannot allocate/add loss record."); + return 0; + } + + new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno; + new->li_ccval = tfrc_rx_hist_loss_prev(rh)->tfrchrx_ccval; + new->li_is_closed = 0; + + if (++lh->counter == 1) + lh->i_mean = new->li_length = (*calc_first_li)(sk); + else { + cur->li_length = dccp_delta_seqno(cur->li_seqno, new->li_seqno); + new->li_length = dccp_delta_seqno(new->li_seqno, + tfrc_rx_hist_last_rcv(rh)->tfrchrx_seqno) + 1; + if (lh->counter > (2*LIH_SIZE)) + lh->counter -= LIH_SIZE; + + tfrc_lh_calc_i_mean(lh); + } + return 1; +} + +int __init tfrc_li_init(void) +{ + tfrc_lh_slab = kmem_cache_create("tfrc_li_hist", + sizeof(struct tfrc_loss_interval), 0, + SLAB_HWCACHE_ALIGN, NULL); + return tfrc_lh_slab == NULL ? -ENOBUFS : 0; +} + +void tfrc_li_exit(void) +{ + if (tfrc_lh_slab != NULL) { + kmem_cache_destroy(tfrc_lh_slab); + tfrc_lh_slab = NULL; + } +} diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h new file mode 100644 index 00000000..d1d2f538 --- /dev/null +++ b/net/dccp/ccids/lib/loss_interval.h @@ -0,0 +1,73 @@ +#ifndef _DCCP_LI_HIST_ +#define _DCCP_LI_HIST_ +/* + * Copyright (c) 2007 The University of Aberdeen, Scotland, UK + * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. + * Copyright (c) 2005-7 Ian McDonald + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ +#include +#include +#include + +/* + * Number of loss intervals (RFC 4342, 8.6.1). The history size is one more than + * NINTERVAL, since the `open' interval I_0 is always stored as the first entry. 
+ */ +#define NINTERVAL 8 +#define LIH_SIZE (NINTERVAL + 1) + +/** + * tfrc_loss_interval - Loss history record for TFRC-based protocols + * @li_seqno: Highest received seqno before the start of loss + * @li_ccval: The CCVal belonging to @li_seqno + * @li_is_closed: Whether @li_seqno is older than 1 RTT + * @li_length: Loss interval sequence length + */ +struct tfrc_loss_interval { + u64 li_seqno:48, + li_ccval:4, + li_is_closed:1; + u32 li_length; +}; + +/** + * tfrc_loss_hist - Loss record database + * @ring: Circular queue managed in LIFO manner + * @counter: Current count of entries (can be more than %LIH_SIZE) + * @i_mean: Current Average Loss Interval [RFC 3448, 5.4] + */ +struct tfrc_loss_hist { + struct tfrc_loss_interval *ring[LIH_SIZE]; + u8 counter; + u32 i_mean; +}; + +static inline void tfrc_lh_init(struct tfrc_loss_hist *lh) +{ + memset(lh, 0, sizeof(struct tfrc_loss_hist)); +} + +static inline u8 tfrc_lh_is_initialised(struct tfrc_loss_hist *lh) +{ + return lh->counter > 0; +} + +static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh) +{ + return min(lh->counter, (u8)LIH_SIZE); +} + +struct tfrc_rx_hist; + +extern int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, + u32 (*first_li)(struct sock *), struct sock *); +extern u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); +extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh); + +#endif /* _DCCP_LI_HIST_ */ diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c new file mode 100644 index 00000000..de8fe294 --- /dev/null +++ b/net/dccp/ccids/lib/packet_history.c @@ -0,0 +1,448 @@ +/* + * Copyright (c) 2007 The University of Aberdeen, Scotland, UK + * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. + * + * An implementation of the DCCP protocol + * + * This code has been developed by the University of Waikato WAND + * research group. For further information please see http://www.wand.net.nz/ + * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz + * + * This code also uses code from Lulea University, rereleased as GPL by its + * authors: + * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon + * + * Changes to meet Linux coding standards, to make it meet latest ccid3 draft + * and to make it work as a loadable module in the DCCP stack written by + * Arnaldo Carvalho de Melo . + * + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include "packet_history.h" +#include "../../dccp.h" + +/* + * Transmitter History Routines + */ +static struct kmem_cache *tfrc_tx_hist_slab; + +int __init tfrc_tx_packet_history_init(void) +{ + tfrc_tx_hist_slab = kmem_cache_create("tfrc_tx_hist", + sizeof(struct tfrc_tx_hist_entry), + 0, SLAB_HWCACHE_ALIGN, NULL); + return tfrc_tx_hist_slab == NULL ? 
-ENOBUFS : 0; +} + +void tfrc_tx_packet_history_exit(void) +{ + if (tfrc_tx_hist_slab != NULL) { + kmem_cache_destroy(tfrc_tx_hist_slab); + tfrc_tx_hist_slab = NULL; + } +} + +int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) +{ + struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); + + if (entry == NULL) + return -ENOBUFS; + entry->seqno = seqno; + entry->stamp = ktime_get_real(); + entry->next = *headp; + *headp = entry; + return 0; +} + +void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp) +{ + struct tfrc_tx_hist_entry *head = *headp; + + while (head != NULL) { + struct tfrc_tx_hist_entry *next = head->next; + + kmem_cache_free(tfrc_tx_hist_slab, head); + head = next; + } + + *headp = NULL; +} + +/* + * Receiver History Routines + */ +static struct kmem_cache *tfrc_rx_hist_slab; + +int __init tfrc_rx_packet_history_init(void) +{ + tfrc_rx_hist_slab = kmem_cache_create("tfrc_rxh_cache", + sizeof(struct tfrc_rx_hist_entry), + 0, SLAB_HWCACHE_ALIGN, NULL); + return tfrc_rx_hist_slab == NULL ? -ENOBUFS : 0; +} + +void tfrc_rx_packet_history_exit(void) +{ + if (tfrc_rx_hist_slab != NULL) { + kmem_cache_destroy(tfrc_rx_hist_slab); + tfrc_rx_hist_slab = NULL; + } +} + +static inline void tfrc_rx_hist_entry_from_skb(struct tfrc_rx_hist_entry *entry, + const struct sk_buff *skb, + const u64 ndp) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); + + entry->tfrchrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq; + entry->tfrchrx_ccval = dh->dccph_ccval; + entry->tfrchrx_type = dh->dccph_type; + entry->tfrchrx_ndp = ndp; + entry->tfrchrx_tstamp = ktime_get_real(); +} + +void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, + const struct sk_buff *skb, + const u64 ndp) +{ + struct tfrc_rx_hist_entry *entry = tfrc_rx_hist_last_rcv(h); + + tfrc_rx_hist_entry_from_skb(entry, skb, ndp); +} + +/* has the packet contained in skb been seen before? */ +int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb) +{ + const u64 seq = DCCP_SKB_CB(skb)->dccpd_seq; + int i; + + if (dccp_delta_seqno(tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, seq) <= 0) + return 1; + + for (i = 1; i <= h->loss_count; i++) + if (tfrc_rx_hist_entry(h, i)->tfrchrx_seqno == seq) + return 1; + + return 0; +} + +static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) +{ + const u8 idx_a = tfrc_rx_hist_index(h, a), + idx_b = tfrc_rx_hist_index(h, b); + struct tfrc_rx_hist_entry *tmp = h->ring[idx_a]; + + h->ring[idx_a] = h->ring[idx_b]; + h->ring[idx_b] = tmp; +} + +/* + * Private helper functions for loss detection. + * + * In the descriptions, `Si' refers to the sequence number of entry number i, + * whose NDP count is `Ni' (lower case is used for variables). + * Note: All __xxx_loss functions expect that a test against duplicates has been + * performed already: the seqno of the skb must not be less than the seqno + * of loss_prev; and it must not equal that of any valid history entry. 
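(Editorial aside, not part of the patch.) Every hole-filling decision in the __xxx_loss helpers that follow asks the same question: is the sequence gap between two received packets fully covered by the newer packet's NDP count? The sketch below restates that test on the 48-bit circular sequence space; the exact definitions of dccp_loss_free() and dccp_delta_seqno() are not shown in this hunk, so their semantics here (gap <= ndp + 1) are an assumption.

#include <stdio.h>
#include <stdint.h>

#define SEQ48_MASK ((UINT64_C(1) << 48) - 1)

/* signed distance from s1 to s2 on the 48-bit circular sequence space */
static int64_t delta_seqno48(uint64_t s1, uint64_t s2)
{
	uint64_t d = (s2 - s1) & SEQ48_MASK;

	if (d & (UINT64_C(1) << 47))                  /* interpret as signed 48-bit */
		return (int64_t)d - ((int64_t)1 << 48);
	return (int64_t)d;
}

/* "loss free": every seqno between s1 and s2 is covered by s2's NDP count */
static int loss_free48(uint64_t s1, uint64_t s2, uint64_t ndp)
{
	return delta_seqno48(s1, s2) <= (int64_t)ndp + 1;
}

int main(void)
{
	/* seqnos 100 -> 104 with 3 intervening non-data packets: no loss */
	printf("%d\n", loss_free48(100, 104, 3));
	/* seqnos 100 -> 104 with only 1 non-data packet: a hole remains  */
	printf("%d\n", loss_free48(100, 104, 1));
	return 0;
}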
+ */ +static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1) +{ + u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, + s1 = DCCP_SKB_CB(skb)->dccpd_seq; + + if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */ + h->loss_count = 1; + tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1); + } +} + +static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2) +{ + u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, + s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno, + s2 = DCCP_SKB_CB(skb)->dccpd_seq; + + if (likely(dccp_delta_seqno(s1, s2) > 0)) { /* S1 < S2 */ + h->loss_count = 2; + tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n2); + return; + } + + /* S0 < S2 < S1 */ + + if (dccp_loss_free(s0, s2, n2)) { + u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp; + + if (dccp_loss_free(s2, s1, n1)) { + /* hole is filled: S0, S2, and S1 are consecutive */ + h->loss_count = 0; + h->loss_start = tfrc_rx_hist_index(h, 1); + } else + /* gap between S2 and S1: just update loss_prev */ + tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2); + + } else { /* gap between S0 and S2 */ + /* + * Reorder history to insert S2 between S0 and S1 + */ + tfrc_rx_hist_swap(h, 0, 3); + h->loss_start = tfrc_rx_hist_index(h, 3); + tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n2); + h->loss_count = 2; + } +} + +/* return 1 if a new loss event has been identified */ +static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3) +{ + u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, + s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno, + s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno, + s3 = DCCP_SKB_CB(skb)->dccpd_seq; + + if (likely(dccp_delta_seqno(s2, s3) > 0)) { /* S2 < S3 */ + h->loss_count = 3; + tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 3), skb, n3); + return 1; + } + + /* S3 < S2 */ + + if (dccp_delta_seqno(s1, s3) > 0) { /* S1 < S3 < S2 */ + /* + * Reorder history to insert S3 between S1 and S2 + */ + tfrc_rx_hist_swap(h, 2, 3); + tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n3); + h->loss_count = 3; + return 1; + } + + /* S0 < S3 < S1 */ + + if (dccp_loss_free(s0, s3, n3)) { + u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp; + + if (dccp_loss_free(s3, s1, n1)) { + /* hole between S0 and S1 filled by S3 */ + u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp; + + if (dccp_loss_free(s1, s2, n2)) { + /* entire hole filled by S0, S3, S1, S2 */ + h->loss_start = tfrc_rx_hist_index(h, 2); + h->loss_count = 0; + } else { + /* gap remains between S1 and S2 */ + h->loss_start = tfrc_rx_hist_index(h, 1); + h->loss_count = 1; + } + + } else /* gap exists between S3 and S1, loss_count stays at 2 */ + tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n3); + + return 0; + } + + /* + * The remaining case: S0 < S3 < S1 < S2; gap between S0 and S3 + * Reorder history to insert S3 between S0 and S1. + */ + tfrc_rx_hist_swap(h, 0, 3); + h->loss_start = tfrc_rx_hist_index(h, 3); + tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n3); + h->loss_count = 3; + + return 1; +} + +/* recycle RX history records to continue loss detection if necessary */ +static void __three_after_loss(struct tfrc_rx_hist *h) +{ + /* + * At this stage we know already that there is a gap between S0 and S1 + * (since S0 was the highest sequence number received before detecting + * the loss). To recycle the loss record, it is thus only necessary to + * check for other possible gaps between S1/S2 and between S2/S3. 
+ */ + u64 s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno, + s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno, + s3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_seqno; + u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp, + n3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_ndp; + + if (dccp_loss_free(s1, s2, n2)) { + + if (dccp_loss_free(s2, s3, n3)) { + /* no gap between S2 and S3: entire hole is filled */ + h->loss_start = tfrc_rx_hist_index(h, 3); + h->loss_count = 0; + } else { + /* gap between S2 and S3 */ + h->loss_start = tfrc_rx_hist_index(h, 2); + h->loss_count = 1; + } + + } else { /* gap between S1 and S2 */ + h->loss_start = tfrc_rx_hist_index(h, 1); + h->loss_count = 2; + } +} + +/** + * tfrc_rx_handle_loss - Loss detection and further processing + * @h: The non-empty RX history object + * @lh: Loss Intervals database to update + * @skb: Currently received packet + * @ndp: The NDP count belonging to @skb + * @calc_first_li: Caller-dependent computation of first loss interval in @lh + * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) + * Chooses action according to pending loss, updates LI database when a new + * loss was detected, and does required post-processing. Returns 1 when caller + * should send feedback, 0 otherwise. + * Since it also takes care of reordering during loss detection and updates the + * records accordingly, the caller should not perform any more RX history + * operations when loss_count is greater than 0 after calling this function. + */ +int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, + struct tfrc_loss_hist *lh, + struct sk_buff *skb, const u64 ndp, + u32 (*calc_first_li)(struct sock *), struct sock *sk) +{ + int is_new_loss = 0; + + if (h->loss_count == 0) { + __do_track_loss(h, skb, ndp); + } else if (h->loss_count == 1) { + __one_after_loss(h, skb, ndp); + } else if (h->loss_count != 2) { + DCCP_BUG("invalid loss_count %d", h->loss_count); + } else if (__two_after_loss(h, skb, ndp)) { + /* + * Update Loss Interval database and recycle RX records + */ + is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk); + __three_after_loss(h); + } + return is_new_loss; +} + +int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) +{ + int i; + + for (i = 0; i <= TFRC_NDUPACK; i++) { + h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); + if (h->ring[i] == NULL) + goto out_free; + } + + h->loss_count = h->loss_start = 0; + return 0; + +out_free: + while (i-- != 0) { + kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); + h->ring[i] = NULL; + } + return -ENOBUFS; +} + +void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) +{ + int i; + + for (i = 0; i <= TFRC_NDUPACK; ++i) + if (h->ring[i] != NULL) { + kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); + h->ring[i] = NULL; + } +} + +/** + * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against + */ +static inline struct tfrc_rx_hist_entry * + tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h) +{ + return h->ring[0]; +} + +/** + * tfrc_rx_hist_rtt_prev_s: previously suitable (wrt rtt_last_s) RTT-sampling entry + */ +static inline struct tfrc_rx_hist_entry * + tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h) +{ + return h->ring[h->rtt_sample_prev]; +} + +/** + * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal + * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able + * to compute a sample with given data - calling function should check this. 
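
As a rough standalone illustration of the CCVal arithmetic used below (made-up numbers; the real function additionally restricts the counter delta to the range 1..4, falls back to a stored candidate entry, and clamps the result against DCCP_SANE_RTT_MAX): the window counter advances once per quarter RTT, so a time span covering delta_v counter increments is scaled up by 4/delta_v.

#include <stdio.h>

/* same mod-16 CCVal distance as the SUB16() macro in packet_history.h */
#define SUB16(a, b) (((a) + 16 - (b)) & 0xF)

int main(void)
{
        unsigned int ccval_ref = 10, ccval_new = 12; /* window counter values */
        long elapsed_us = 20000;                     /* time between the two  */
        unsigned int delta_v = SUB16(ccval_new, ccval_ref);        /* = 2     */

        /* two quarter-RTTs took 20 ms, hence the RTT is roughly 4/2 * 20 ms */
        long rtt = (long)(4 / delta_v) * elapsed_us;

        printf("RTT sample: %ld us\n", rtt);         /* prints 40000 */
        return 0;
}
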
+ */ +u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) +{ + u32 sample = 0, + delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); + + if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */ + if (h->rtt_sample_prev == 2) { /* previous candidate stored */ + sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); + if (sample) + sample = 4 / sample * + ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp); + else /* + * FIXME: This condition is in principle not + * possible but occurs when CCID is used for + * two-way data traffic. I have tried to trace + * it, but the cause does not seem to be here. + */ + DCCP_BUG("please report to dccp@vger.kernel.org" + " => prev = %u, last = %u", + tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); + } else if (delta_v < 1) { + h->rtt_sample_prev = 1; + goto keep_ref_for_next_time; + } + + } else if (delta_v == 4) /* optimal match */ + sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp)); + else { /* suboptimal match */ + h->rtt_sample_prev = 2; + goto keep_ref_for_next_time; + } + + if (unlikely(sample > DCCP_SANE_RTT_MAX)) { + DCCP_WARN("RTT sample %u too large, using max\n", sample); + sample = DCCP_SANE_RTT_MAX; + } + + h->rtt_sample_prev = 0; /* use current entry as next reference */ +keep_ref_for_next_time: + + return sample; +} diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h new file mode 100644 index 00000000..7ee4a9d9 --- /dev/null +++ b/net/dccp/ccids/lib/packet_history.h @@ -0,0 +1,158 @@ +/* + * Packet RX/TX history data structures and routines for TFRC-based protocols. + * + * Copyright (c) 2007 The University of Aberdeen, Scotland, UK + * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand. + * + * This code has been developed by the University of Waikato WAND + * research group. For further information please see http://www.wand.net.nz/ + * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz + * + * This code also uses code from Lulea University, rereleased as GPL by its + * authors: + * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon + * + * Changes to meet Linux coding standards, to make it meet latest ccid3 draft + * and to make it work as a loadable module in the DCCP stack written by + * Arnaldo Carvalho de Melo . + * + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#ifndef _DCCP_PKT_HIST_ +#define _DCCP_PKT_HIST_ + +#include +#include +#include "tfrc.h" + +/** + * tfrc_tx_hist_entry - Simple singly-linked TX history list + * @next: next oldest entry (LIFO order) + * @seqno: sequence number of this entry + * @stamp: send time of packet with sequence number @seqno + */ +struct tfrc_tx_hist_entry { + struct tfrc_tx_hist_entry *next; + u64 seqno; + ktime_t stamp; +}; + +static inline struct tfrc_tx_hist_entry * + tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) +{ + while (head != NULL && head->seqno != seqno) + head = head->next; + return head; +} + +extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); +extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); + +/* Subtraction a-b modulo-16, respects circular wrap-around */ +#define SUB16(a, b) (((a) + 16 - (b)) & 0xF) + +/* Number of packets to wait after a missing packet (RFC 4342, 6.1) */ +#define TFRC_NDUPACK 3 + +/** + * tfrc_rx_hist_entry - Store information about a single received packet + * @tfrchrx_seqno: DCCP packet sequence number + * @tfrchrx_ccval: window counter value of packet (RFC 4342, 8.1) + * @tfrchrx_ndp: the NDP count (if any) of the packet + * @tfrchrx_tstamp: actual receive time of packet + */ +struct tfrc_rx_hist_entry { + u64 tfrchrx_seqno:48, + tfrchrx_ccval:4, + tfrchrx_type:4; + u64 tfrchrx_ndp:48; + ktime_t tfrchrx_tstamp; +}; + +/** + * tfrc_rx_hist - RX history structure for TFRC-based protocols + * @ring: Packet history for RTT sampling and loss detection + * @loss_count: Number of entries in circular history + * @loss_start: Movable index (for loss detection) + * @rtt_sample_prev: Used during RTT sampling, points to candidate entry + */ +struct tfrc_rx_hist { + struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1]; + u8 loss_count:2, + loss_start:2; +#define rtt_sample_prev loss_start +}; + +/** + * tfrc_rx_hist_index - index to reach n-th entry after loss_start + */ +static inline u8 tfrc_rx_hist_index(const struct tfrc_rx_hist *h, const u8 n) +{ + return (h->loss_start + n) & TFRC_NDUPACK; +} + +/** + * tfrc_rx_hist_last_rcv - entry with highest-received-seqno so far + */ +static inline struct tfrc_rx_hist_entry * + tfrc_rx_hist_last_rcv(const struct tfrc_rx_hist *h) +{ + return h->ring[tfrc_rx_hist_index(h, h->loss_count)]; +} + +/** + * tfrc_rx_hist_entry - return the n-th history entry after loss_start + */ +static inline struct tfrc_rx_hist_entry * + tfrc_rx_hist_entry(const struct tfrc_rx_hist *h, const u8 n) +{ + return h->ring[tfrc_rx_hist_index(h, n)]; +} + +/** + * tfrc_rx_hist_loss_prev - entry with highest-received-seqno before loss was detected + */ +static inline struct tfrc_rx_hist_entry * + tfrc_rx_hist_loss_prev(const struct tfrc_rx_hist *h) +{ + return h->ring[h->loss_start]; +} + +/* indicate whether previously a packet was detected missing */ +static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h) +{ + return h->loss_count > 0; +} + +extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, + const struct sk_buff *skb, const u64 ndp); + +extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb); + +struct tfrc_loss_hist; +extern int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, + struct tfrc_loss_hist *lh, + struct sk_buff *skb, const u64 ndp, + u32 (*first_li)(struct sock *sk), + struct sock *sk); +extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, + const struct sk_buff *skb); +extern int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h); +extern void 
tfrc_rx_hist_purge(struct tfrc_rx_hist *h); + +#endif /* _DCCP_PKT_HIST_ */ diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c new file mode 100644 index 00000000..49020298 --- /dev/null +++ b/net/dccp/ccids/lib/tfrc.c @@ -0,0 +1,44 @@ +/* + * TFRC library initialisation + * + * Copyright (c) 2007 The University of Aberdeen, Scotland, UK + * Copyright (c) 2007 Arnaldo Carvalho de Melo + */ +#include "tfrc.h" + +#ifdef CONFIG_IP_DCCP_TFRC_DEBUG +int tfrc_debug; +module_param(tfrc_debug, bool, 0644); +MODULE_PARM_DESC(tfrc_debug, "Enable TFRC debug messages"); +#endif + +int __init tfrc_lib_init(void) +{ + int rc = tfrc_li_init(); + + if (rc) + goto out; + + rc = tfrc_tx_packet_history_init(); + if (rc) + goto out_free_loss_intervals; + + rc = tfrc_rx_packet_history_init(); + if (rc) + goto out_free_tx_history; + return 0; + +out_free_tx_history: + tfrc_tx_packet_history_exit(); +out_free_loss_intervals: + tfrc_li_exit(); +out: + return rc; +} + +void tfrc_lib_exit(void) +{ + tfrc_rx_packet_history_exit(); + tfrc_tx_packet_history_exit(); + tfrc_li_exit(); +} diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h new file mode 100644 index 00000000..f8ee3f54 --- /dev/null +++ b/net/dccp/ccids/lib/tfrc.h @@ -0,0 +1,77 @@ +#ifndef _TFRC_H_ +#define _TFRC_H_ +/* + * Copyright (c) 2007 The University of Aberdeen, Scotland, UK + * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand. + * Copyright (c) 2005-6 Ian McDonald + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ +#include +#include +#include "../../dccp.h" + +/* internal includes that this library exports: */ +#include "loss_interval.h" +#include "packet_history.h" + +#ifdef CONFIG_IP_DCCP_TFRC_DEBUG +extern int tfrc_debug; +#define tfrc_pr_debug(format, a...) DCCP_PR_DEBUG(tfrc_debug, format, ##a) +#else +#define tfrc_pr_debug(format, a...) +#endif + +/* integer-arithmetic divisions of type (a * 1000000)/b */ +static inline u64 scaled_div(u64 a, u64 b) +{ + BUG_ON(b == 0); + return div64_u64(a * 1000000, b); +} + +static inline u32 scaled_div32(u64 a, u64 b) +{ + u64 result = scaled_div(a, b); + + if (result > UINT_MAX) { + DCCP_CRIT("Overflow: %llu/%llu > UINT_MAX", + (unsigned long long)a, (unsigned long long)b); + return UINT_MAX; + } + return result; +} + +/** + * tfrc_ewma - Exponentially weighted moving average + * @weight: Weight to be used as damping factor, in units of 1/10 + */ +static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight) +{ + return avg ? 
(weight * avg + (10 - weight) * newval) / 10 : newval; +} + +extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); +extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); +extern u32 tfrc_invert_loss_event_rate(u32 loss_event_rate); + +extern int tfrc_tx_packet_history_init(void); +extern void tfrc_tx_packet_history_exit(void); +extern int tfrc_rx_packet_history_init(void); +extern void tfrc_rx_packet_history_exit(void); + +extern int tfrc_li_init(void); +extern void tfrc_li_exit(void); + +#ifdef CONFIG_IP_DCCP_TFRC_LIB +extern int tfrc_lib_init(void); +extern void tfrc_lib_exit(void); +#else +#define tfrc_lib_init() (0) +#define tfrc_lib_exit() +#endif +#endif /* _TFRC_H_ */ diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c new file mode 100644 index 00000000..a052a437 --- /dev/null +++ b/net/dccp/ccids/lib/tfrc_equation.c @@ -0,0 +1,703 @@ +/* + * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. + * Copyright (c) 2005 Ian McDonald + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include "../../dccp.h" +#include "tfrc.h" + +#define TFRC_CALC_X_ARRSIZE 500 +#define TFRC_CALC_X_SPLIT 50000 /* 0.05 * 1000000, details below */ +#define TFRC_SMALLEST_P (TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE) + +/* + TFRC TCP Reno Throughput Equation Lookup Table for f(p) + + The following two-column lookup table implements a part of the TCP throughput + equation from [RFC 3448, sec. 3.1]: + + s + X_calc = -------------------------------------------------------------- + R * sqrt(2*b*p/3) + (3 * t_RTO * sqrt(3*b*p/8) * (p + 32*p^3)) + + Where: + X is the transmit rate in bytes/second + s is the packet size in bytes + R is the round trip time in seconds + p is the loss event rate, between 0 and 1.0, of the number of loss + events as a fraction of the number of packets transmitted + t_RTO is the TCP retransmission timeout value in seconds + b is the number of packets acknowledged by a single TCP ACK + + We can assume that b = 1 and t_RTO is 4 * R. The equation now becomes: + + s + X_calc = ------------------------------------------------------- + R * sqrt(p*2/3) + (12 * R * sqrt(p*3/8) * (p + 32*p^3)) + + which we can break down into: + + s + X_calc = --------- + R * f(p) + + where f(p) is given for 0 < p <= 1 by: + + f(p) = sqrt(2*p/3) + 12 * sqrt(3*p/8) * (p + 32*p^3) + + Since this is kernel code, floating-point arithmetic is avoided in favour of + integer arithmetic. This means that nearly all fractional parameters are + scaled by 1000000: + * the parameters p and R + * the return result f(p) + The lookup table therefore actually tabulates the following function g(q): + + g(q) = 1000000 * f(q/1000000) + + Hence, when p <= 1, q must be less than or equal to 1000000. To achieve finer + granularity for the practically more relevant case of small values of p (up to + 5%), the second column is used; the first one ranges up to 100%. This split + corresponds to the value of q = TFRC_CALC_X_SPLIT. 
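
Before the table itself, a short worked example of this scaling (an editorial sketch with made-up values: s = 1460 bytes, R = 100 ms, p = 1%; the constant 89021 is the fine-grained column entry for p = 1%, i.e. row 100 of the table below, and the two-stage division mirrors scaled_div()/scaled_div32() as used by tfrc_calc_x() at the end of this file):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* everything carries the same 10^6 scaling as the lookup table */
        uint64_t s = 1460, R = 100000, f = 89021;   /* f = g(10000)      */

        /* X_calc = s / (R * f(p)), evaluated in two scaled stages       */
        uint64_t stage1 = s * 1000000 / R;          /* = 14600           */
        uint64_t x_calc = stage1 * 1000000 / f;     /* = 164006 bytes/s  */

        printf("X_calc = %llu bytes/s\n", (unsigned long long)x_calc);
        return 0;
}

So at 1% loss and 100 ms RTT the equation allows roughly 160 KB/s for 1460-byte packets.
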
At the same time this also + determines the smallest resolution possible with this lookup table: + + TFRC_SMALLEST_P = TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE + + The entire table is generated by: + for(i=0; i < TFRC_CALC_X_ARRSIZE; i++) { + lookup[i][0] = g((i+1) * 1000000/TFRC_CALC_X_ARRSIZE); + lookup[i][1] = g((i+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE); + } + + With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1, + lookup[0][0] = g(1000000/(M+1)) = 1000000 * f(0.2%) + lookup[M][0] = g(1000000) = 1000000 * f(100%) + lookup[0][1] = g(TFRC_SMALLEST_P) = 1000000 * f(0.01%) + lookup[M][1] = g(TFRC_CALC_X_SPLIT) = 1000000 * f(5%) + + In summary, the two columns represent f(p) for the following ranges: + * The first column is for 0.002 <= p <= 1.0 + * The second column is for 0.0001 <= p <= 0.05 + Where the columns overlap, the second (finer-grained) is given preference, + i.e. the first column is used only for p >= 0.05. + */ +static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = { + { 37172, 8172 }, + { 53499, 11567 }, + { 66664, 14180 }, + { 78298, 16388 }, + { 89021, 18339 }, + { 99147, 20108 }, + { 108858, 21738 }, + { 118273, 23260 }, + { 127474, 24693 }, + { 136520, 26052 }, + { 145456, 27348 }, + { 154316, 28589 }, + { 163130, 29783 }, + { 171919, 30935 }, + { 180704, 32049 }, + { 189502, 33130 }, + { 198328, 34180 }, + { 207194, 35202 }, + { 216114, 36198 }, + { 225097, 37172 }, + { 234153, 38123 }, + { 243294, 39055 }, + { 252527, 39968 }, + { 261861, 40864 }, + { 271305, 41743 }, + { 280866, 42607 }, + { 290553, 43457 }, + { 300372, 44293 }, + { 310333, 45117 }, + { 320441, 45929 }, + { 330705, 46729 }, + { 341131, 47518 }, + { 351728, 48297 }, + { 362501, 49066 }, + { 373460, 49826 }, + { 384609, 50577 }, + { 395958, 51320 }, + { 407513, 52054 }, + { 419281, 52780 }, + { 431270, 53499 }, + { 443487, 54211 }, + { 455940, 54916 }, + { 468635, 55614 }, + { 481581, 56306 }, + { 494785, 56991 }, + { 508254, 57671 }, + { 521996, 58345 }, + { 536019, 59014 }, + { 550331, 59677 }, + { 564939, 60335 }, + { 579851, 60988 }, + { 595075, 61636 }, + { 610619, 62279 }, + { 626491, 62918 }, + { 642700, 63553 }, + { 659253, 64183 }, + { 676158, 64809 }, + { 693424, 65431 }, + { 711060, 66050 }, + { 729073, 66664 }, + { 747472, 67275 }, + { 766266, 67882 }, + { 785464, 68486 }, + { 805073, 69087 }, + { 825103, 69684 }, + { 845562, 70278 }, + { 866460, 70868 }, + { 887805, 71456 }, + { 909606, 72041 }, + { 931873, 72623 }, + { 954614, 73202 }, + { 977839, 73778 }, + { 1001557, 74352 }, + { 1025777, 74923 }, + { 1050508, 75492 }, + { 1075761, 76058 }, + { 1101544, 76621 }, + { 1127867, 77183 }, + { 1154739, 77741 }, + { 1182172, 78298 }, + { 1210173, 78852 }, + { 1238753, 79405 }, + { 1267922, 79955 }, + { 1297689, 80503 }, + { 1328066, 81049 }, + { 1359060, 81593 }, + { 1390684, 82135 }, + { 1422947, 82675 }, + { 1455859, 83213 }, + { 1489430, 83750 }, + { 1523671, 84284 }, + { 1558593, 84817 }, + { 1594205, 85348 }, + { 1630518, 85878 }, + { 1667543, 86406 }, + { 1705290, 86932 }, + { 1743770, 87457 }, + { 1782994, 87980 }, + { 1822973, 88501 }, + { 1863717, 89021 }, + { 1905237, 89540 }, + { 1947545, 90057 }, + { 1990650, 90573 }, + { 2034566, 91087 }, + { 2079301, 91600 }, + { 2124869, 92111 }, + { 2171279, 92622 }, + { 2218543, 93131 }, + { 2266673, 93639 }, + { 2315680, 94145 }, + { 2365575, 94650 }, + { 2416371, 95154 }, + { 2468077, 95657 }, + { 2520707, 96159 }, + { 2574271, 96660 }, + { 2628782, 97159 }, + { 2684250, 97658 }, + { 2740689, 98155 }, + { 2798110, 
98651 }, + { 2856524, 99147 }, + { 2915944, 99641 }, + { 2976382, 100134 }, + { 3037850, 100626 }, + { 3100360, 101117 }, + { 3163924, 101608 }, + { 3228554, 102097 }, + { 3294263, 102586 }, + { 3361063, 103073 }, + { 3428966, 103560 }, + { 3497984, 104045 }, + { 3568131, 104530 }, + { 3639419, 105014 }, + { 3711860, 105498 }, + { 3785467, 105980 }, + { 3860253, 106462 }, + { 3936229, 106942 }, + { 4013410, 107422 }, + { 4091808, 107902 }, + { 4171435, 108380 }, + { 4252306, 108858 }, + { 4334431, 109335 }, + { 4417825, 109811 }, + { 4502501, 110287 }, + { 4588472, 110762 }, + { 4675750, 111236 }, + { 4764349, 111709 }, + { 4854283, 112182 }, + { 4945564, 112654 }, + { 5038206, 113126 }, + { 5132223, 113597 }, + { 5227627, 114067 }, + { 5324432, 114537 }, + { 5422652, 115006 }, + { 5522299, 115474 }, + { 5623389, 115942 }, + { 5725934, 116409 }, + { 5829948, 116876 }, + { 5935446, 117342 }, + { 6042439, 117808 }, + { 6150943, 118273 }, + { 6260972, 118738 }, + { 6372538, 119202 }, + { 6485657, 119665 }, + { 6600342, 120128 }, + { 6716607, 120591 }, + { 6834467, 121053 }, + { 6953935, 121514 }, + { 7075025, 121976 }, + { 7197752, 122436 }, + { 7322131, 122896 }, + { 7448175, 123356 }, + { 7575898, 123815 }, + { 7705316, 124274 }, + { 7836442, 124733 }, + { 7969291, 125191 }, + { 8103877, 125648 }, + { 8240216, 126105 }, + { 8378321, 126562 }, + { 8518208, 127018 }, + { 8659890, 127474 }, + { 8803384, 127930 }, + { 8948702, 128385 }, + { 9095861, 128840 }, + { 9244875, 129294 }, + { 9395760, 129748 }, + { 9548529, 130202 }, + { 9703198, 130655 }, + { 9859782, 131108 }, + { 10018296, 131561 }, + { 10178755, 132014 }, + { 10341174, 132466 }, + { 10505569, 132917 }, + { 10671954, 133369 }, + { 10840345, 133820 }, + { 11010757, 134271 }, + { 11183206, 134721 }, + { 11357706, 135171 }, + { 11534274, 135621 }, + { 11712924, 136071 }, + { 11893673, 136520 }, + { 12076536, 136969 }, + { 12261527, 137418 }, + { 12448664, 137867 }, + { 12637961, 138315 }, + { 12829435, 138763 }, + { 13023101, 139211 }, + { 13218974, 139658 }, + { 13417071, 140106 }, + { 13617407, 140553 }, + { 13819999, 140999 }, + { 14024862, 141446 }, + { 14232012, 141892 }, + { 14441465, 142339 }, + { 14653238, 142785 }, + { 14867346, 143230 }, + { 15083805, 143676 }, + { 15302632, 144121 }, + { 15523842, 144566 }, + { 15747453, 145011 }, + { 15973479, 145456 }, + { 16201939, 145900 }, + { 16432847, 146345 }, + { 16666221, 146789 }, + { 16902076, 147233 }, + { 17140429, 147677 }, + { 17381297, 148121 }, + { 17624696, 148564 }, + { 17870643, 149007 }, + { 18119154, 149451 }, + { 18370247, 149894 }, + { 18623936, 150336 }, + { 18880241, 150779 }, + { 19139176, 151222 }, + { 19400759, 151664 }, + { 19665007, 152107 }, + { 19931936, 152549 }, + { 20201564, 152991 }, + { 20473907, 153433 }, + { 20748982, 153875 }, + { 21026807, 154316 }, + { 21307399, 154758 }, + { 21590773, 155199 }, + { 21876949, 155641 }, + { 22165941, 156082 }, + { 22457769, 156523 }, + { 22752449, 156964 }, + { 23049999, 157405 }, + { 23350435, 157846 }, + { 23653774, 158287 }, + { 23960036, 158727 }, + { 24269236, 159168 }, + { 24581392, 159608 }, + { 24896521, 160049 }, + { 25214642, 160489 }, + { 25535772, 160929 }, + { 25859927, 161370 }, + { 26187127, 161810 }, + { 26517388, 162250 }, + { 26850728, 162690 }, + { 27187165, 163130 }, + { 27526716, 163569 }, + { 27869400, 164009 }, + { 28215234, 164449 }, + { 28564236, 164889 }, + { 28916423, 165328 }, + { 29271815, 165768 }, + { 29630428, 166208 }, + { 29992281, 166647 }, + { 30357392, 167087 }, + { 30725779, 
167526 }, + { 31097459, 167965 }, + { 31472452, 168405 }, + { 31850774, 168844 }, + { 32232445, 169283 }, + { 32617482, 169723 }, + { 33005904, 170162 }, + { 33397730, 170601 }, + { 33792976, 171041 }, + { 34191663, 171480 }, + { 34593807, 171919 }, + { 34999428, 172358 }, + { 35408544, 172797 }, + { 35821174, 173237 }, + { 36237335, 173676 }, + { 36657047, 174115 }, + { 37080329, 174554 }, + { 37507197, 174993 }, + { 37937673, 175433 }, + { 38371773, 175872 }, + { 38809517, 176311 }, + { 39250924, 176750 }, + { 39696012, 177190 }, + { 40144800, 177629 }, + { 40597308, 178068 }, + { 41053553, 178507 }, + { 41513554, 178947 }, + { 41977332, 179386 }, + { 42444904, 179825 }, + { 42916290, 180265 }, + { 43391509, 180704 }, + { 43870579, 181144 }, + { 44353520, 181583 }, + { 44840352, 182023 }, + { 45331092, 182462 }, + { 45825761, 182902 }, + { 46324378, 183342 }, + { 46826961, 183781 }, + { 47333531, 184221 }, + { 47844106, 184661 }, + { 48358706, 185101 }, + { 48877350, 185541 }, + { 49400058, 185981 }, + { 49926849, 186421 }, + { 50457743, 186861 }, + { 50992759, 187301 }, + { 51531916, 187741 }, + { 52075235, 188181 }, + { 52622735, 188622 }, + { 53174435, 189062 }, + { 53730355, 189502 }, + { 54290515, 189943 }, + { 54854935, 190383 }, + { 55423634, 190824 }, + { 55996633, 191265 }, + { 56573950, 191706 }, + { 57155606, 192146 }, + { 57741621, 192587 }, + { 58332014, 193028 }, + { 58926806, 193470 }, + { 59526017, 193911 }, + { 60129666, 194352 }, + { 60737774, 194793 }, + { 61350361, 195235 }, + { 61967446, 195677 }, + { 62589050, 196118 }, + { 63215194, 196560 }, + { 63845897, 197002 }, + { 64481179, 197444 }, + { 65121061, 197886 }, + { 65765563, 198328 }, + { 66414705, 198770 }, + { 67068508, 199213 }, + { 67726992, 199655 }, + { 68390177, 200098 }, + { 69058085, 200540 }, + { 69730735, 200983 }, + { 70408147, 201426 }, + { 71090343, 201869 }, + { 71777343, 202312 }, + { 72469168, 202755 }, + { 73165837, 203199 }, + { 73867373, 203642 }, + { 74573795, 204086 }, + { 75285124, 204529 }, + { 76001380, 204973 }, + { 76722586, 205417 }, + { 77448761, 205861 }, + { 78179926, 206306 }, + { 78916102, 206750 }, + { 79657310, 207194 }, + { 80403571, 207639 }, + { 81154906, 208084 }, + { 81911335, 208529 }, + { 82672880, 208974 }, + { 83439562, 209419 }, + { 84211402, 209864 }, + { 84988421, 210309 }, + { 85770640, 210755 }, + { 86558080, 211201 }, + { 87350762, 211647 }, + { 88148708, 212093 }, + { 88951938, 212539 }, + { 89760475, 212985 }, + { 90574339, 213432 }, + { 91393551, 213878 }, + { 92218133, 214325 }, + { 93048107, 214772 }, + { 93883493, 215219 }, + { 94724314, 215666 }, + { 95570590, 216114 }, + { 96422343, 216561 }, + { 97279594, 217009 }, + { 98142366, 217457 }, + { 99010679, 217905 }, + { 99884556, 218353 }, + { 100764018, 218801 }, + { 101649086, 219250 }, + { 102539782, 219698 }, + { 103436128, 220147 }, + { 104338146, 220596 }, + { 105245857, 221046 }, + { 106159284, 221495 }, + { 107078448, 221945 }, + { 108003370, 222394 }, + { 108934074, 222844 }, + { 109870580, 223294 }, + { 110812910, 223745 }, + { 111761087, 224195 }, + { 112715133, 224646 }, + { 113675069, 225097 }, + { 114640918, 225548 }, + { 115612702, 225999 }, + { 116590442, 226450 }, + { 117574162, 226902 }, + { 118563882, 227353 }, + { 119559626, 227805 }, + { 120561415, 228258 }, + { 121569272, 228710 }, + { 122583219, 229162 }, + { 123603278, 229615 }, + { 124629471, 230068 }, + { 125661822, 230521 }, + { 126700352, 230974 }, + { 127745083, 231428 }, + { 128796039, 231882 }, + { 129853241, 232336 }, + { 
130916713, 232790 }, + { 131986475, 233244 }, + { 133062553, 233699 }, + { 134144966, 234153 }, + { 135233739, 234608 }, + { 136328894, 235064 }, + { 137430453, 235519 }, + { 138538440, 235975 }, + { 139652876, 236430 }, + { 140773786, 236886 }, + { 141901190, 237343 }, + { 143035113, 237799 }, + { 144175576, 238256 }, + { 145322604, 238713 }, + { 146476218, 239170 }, + { 147636442, 239627 }, + { 148803298, 240085 }, + { 149976809, 240542 }, + { 151156999, 241000 }, + { 152343890, 241459 }, + { 153537506, 241917 }, + { 154737869, 242376 }, + { 155945002, 242835 }, + { 157158929, 243294 }, + { 158379673, 243753 }, + { 159607257, 244213 }, + { 160841704, 244673 }, + { 162083037, 245133 }, + { 163331279, 245593 }, + { 164586455, 246054 }, + { 165848586, 246514 }, + { 167117696, 246975 }, + { 168393810, 247437 }, + { 169676949, 247898 }, + { 170967138, 248360 }, + { 172264399, 248822 }, + { 173568757, 249284 }, + { 174880235, 249747 }, + { 176198856, 250209 }, + { 177524643, 250672 }, + { 178857621, 251136 }, + { 180197813, 251599 }, + { 181545242, 252063 }, + { 182899933, 252527 }, + { 184261908, 252991 }, + { 185631191, 253456 }, + { 187007807, 253920 }, + { 188391778, 254385 }, + { 189783129, 254851 }, + { 191181884, 255316 }, + { 192588065, 255782 }, + { 194001698, 256248 }, + { 195422805, 256714 }, + { 196851411, 257181 }, + { 198287540, 257648 }, + { 199731215, 258115 }, + { 201182461, 258582 }, + { 202641302, 259050 }, + { 204107760, 259518 }, + { 205581862, 259986 }, + { 207063630, 260454 }, + { 208553088, 260923 }, + { 210050262, 261392 }, + { 211555174, 261861 }, + { 213067849, 262331 }, + { 214588312, 262800 }, + { 216116586, 263270 }, + { 217652696, 263741 }, + { 219196666, 264211 }, + { 220748520, 264682 }, + { 222308282, 265153 }, + { 223875978, 265625 }, + { 225451630, 266097 }, + { 227035265, 266569 }, + { 228626905, 267041 }, + { 230226576, 267514 }, + { 231834302, 267986 }, + { 233450107, 268460 }, + { 235074016, 268933 }, + { 236706054, 269407 }, + { 238346244, 269881 }, + { 239994613, 270355 }, + { 241651183, 270830 }, + { 243315981, 271305 } +}; + +/* return largest index i such that fval <= lookup[i][small] */ +static inline u32 tfrc_binsearch(u32 fval, u8 small) +{ + u32 try, low = 0, high = TFRC_CALC_X_ARRSIZE - 1; + + while (low < high) { + try = (low + high) / 2; + if (fval <= tfrc_calc_x_lookup[try][small]) + high = try; + else + low = try + 1; + } + return high; +} + +/** + * tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448 + * @s: packet size in bytes + * @R: RTT scaled by 1000000 (i.e., microseconds) + * @p: loss ratio estimate scaled by 1000000 + * Returns X_calc in bytes per second (not scaled). + */ +u32 tfrc_calc_x(u16 s, u32 R, u32 p) +{ + u16 index; + u32 f; + u64 result; + + /* check against invalid parameters and divide-by-zero */ + BUG_ON(p > 1000000); /* p must not exceed 100% */ + BUG_ON(p == 0); /* f(0) = 0, divide by zero */ + if (R == 0) { /* possible divide by zero */ + DCCP_CRIT("WARNING: RTT is 0, returning maximum X_calc."); + return ~0U; + } + + if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ + if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ + DCCP_WARN("Value of p (%d) below resolution. 
" + "Substituting %d\n", p, TFRC_SMALLEST_P); + index = 0; + } else /* 0.0001 <= p <= 0.05 */ + index = p/TFRC_SMALLEST_P - 1; + + f = tfrc_calc_x_lookup[index][1]; + + } else { /* 0.05 < p <= 1.00 */ + index = p/(1000000/TFRC_CALC_X_ARRSIZE) - 1; + + f = tfrc_calc_x_lookup[index][0]; + } + + /* + * Compute X = s/(R*f(p)) in bytes per second. + * Since f(p) and R are both scaled by 1000000, we need to multiply by + * 1000000^2. To avoid overflow, the result is computed in two stages. + * This works under almost all reasonable operational conditions, for a + * wide range of parameters. Yet, should some strange combination of + * parameters result in overflow, the use of scaled_div32 will catch + * this and return UINT_MAX - which is a logically adequate consequence. + */ + result = scaled_div(s, R); + return scaled_div32(result, f); +} + +/** + * tfrc_calc_x_reverse_lookup - try to find p given f(p) + * @fvalue: function value to match, scaled by 1000000 + * Returns closest match for p, also scaled by 1000000 + */ +u32 tfrc_calc_x_reverse_lookup(u32 fvalue) +{ + int index; + + if (fvalue == 0) /* f(p) = 0 whenever p = 0 */ + return 0; + + /* Error cases. */ + if (fvalue < tfrc_calc_x_lookup[0][1]) { + DCCP_WARN("fvalue %u smaller than resolution\n", fvalue); + return TFRC_SMALLEST_P; + } + if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) { + DCCP_WARN("fvalue %u exceeds bounds!\n", fvalue); + return 1000000; + } + + if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) { + index = tfrc_binsearch(fvalue, 1); + return (index + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE; + } + + /* else ... it must be in the coarse-grained column */ + index = tfrc_binsearch(fvalue, 0); + return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; +} + +/** + * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100% + * When @loss_event_rate is large, there is a chance that p is truncated to 0. + * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0. + */ +u32 tfrc_invert_loss_event_rate(u32 loss_event_rate) +{ + if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */ + return 0; + if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */ + return 1000000; + return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P); +} diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h new file mode 100644 index 00000000..5fdb0722 --- /dev/null +++ b/net/dccp/dccp.h @@ -0,0 +1,506 @@ +#ifndef _DCCP_H +#define _DCCP_H +/* + * net/dccp/dccp.h + * + * An implementation of the DCCP protocol + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * Copyright (c) 2005-6 Ian McDonald + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include "ackvec.h" + +/* + * DCCP - specific warning and debugging macros. + */ +#define DCCP_WARN(fmt, a...) LIMIT_NETDEBUG(KERN_WARNING "%s: " fmt, \ + __func__, ##a) +#define DCCP_CRIT(fmt, a...) printk(KERN_CRIT fmt " at %s:%d/%s()\n", ##a, \ + __FILE__, __LINE__, __func__) +#define DCCP_BUG(a...) do { DCCP_CRIT("BUG: " a); dump_stack(); } while(0) +#define DCCP_BUG_ON(cond) do { if (unlikely((cond) != 0)) \ + DCCP_BUG("\"%s\" holds (exception!)", \ + __stringify(cond)); \ + } while (0) + +#define DCCP_PRINTK(enable, fmt, args...) do { if (enable) \ + printk(fmt, ##args); \ + } while(0) +#define DCCP_PR_DEBUG(enable, fmt, a...) 
DCCP_PRINTK(enable, KERN_DEBUG \ + "%s: " fmt, __func__, ##a) + +#ifdef CONFIG_IP_DCCP_DEBUG +extern int dccp_debug; +#define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a) +#define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) +#define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a) +#else +#define dccp_pr_debug(format, a...) +#define dccp_pr_debug_cat(format, a...) +#define dccp_debug(format, a...) +#endif + +extern struct inet_hashinfo dccp_hashinfo; + +extern struct percpu_counter dccp_orphan_count; + +extern void dccp_time_wait(struct sock *sk, int state, int timeo); + +/* + * Set safe upper bounds for header and option length. Since Data Offset is 8 + * bits (RFC 4340, sec. 5.1), the total header length can never be more than + * 4 * 255 = 1020 bytes. The largest possible header length is 28 bytes (X=1): + * - DCCP-Response with ACK Subheader and 4 bytes of Service code OR + * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields + * Hence a safe upper bound for the maximum option length is 1020-28 = 992 + */ +#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t)) +#define DCCP_MAX_PACKET_HDR 28 +#define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR) +#define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER) + +/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */ +#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t)) + +#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT + * state, about 60 seconds */ + +/* RFC 1122, 4.2.3.1 initial RTO value */ +#define DCCP_TIMEOUT_INIT ((unsigned)(3 * HZ)) + +/* + * The maximum back-off value for retransmissions. This is needed for + * - retransmitting client-Requests (sec. 8.1.1), + * - retransmitting Close/CloseReq when closing (sec. 8.3), + * - feature-negotiation retransmission (sec. 6.6.3), + * - Acks in client-PARTOPEN state (sec. 8.1.5). + */ +#define DCCP_RTO_MAX ((unsigned)(64 * HZ)) + +/* + * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4 + */ +#define DCCP_SANE_RTT_MIN 100 +#define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5) +#define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC) + +/* sysctl variables for DCCP */ +extern int sysctl_dccp_request_retries; +extern int sysctl_dccp_retries1; +extern int sysctl_dccp_retries2; +extern int sysctl_dccp_tx_qlen; +extern int sysctl_dccp_sync_ratelimit; + +/* + * 48-bit sequence number arithmetic (signed and unsigned) + */ +#define INT48_MIN 0x800000000000LL /* 2^47 */ +#define UINT48_MAX 0xFFFFFFFFFFFFLL /* 2^48 - 1 */ +#define COMPLEMENT48(x) (0x1000000000000LL - (x)) /* 2^48 - x */ +#define TO_SIGNED48(x) (((x) < INT48_MIN)? (x) : -COMPLEMENT48( (x))) +#define TO_UNSIGNED48(x) (((x) >= 0)? (x) : COMPLEMENT48(-(x))) +#define ADD48(a, b) (((a) + (b)) & UINT48_MAX) +#define SUB48(a, b) ADD48((a), COMPLEMENT48(b)) + +static inline void dccp_set_seqno(u64 *seqno, u64 value) +{ + *seqno = value & UINT48_MAX; +} + +static inline void dccp_inc_seqno(u64 *seqno) +{ + *seqno = ADD48(*seqno, 1); +} + +/* signed mod-2^48 distance: pos. if seqno1 < seqno2, neg. if seqno1 > seqno2 */ +static inline s64 dccp_delta_seqno(const u64 seqno1, const u64 seqno2) +{ + u64 delta = SUB48(seqno2, seqno1); + + return TO_SIGNED48(delta); +} + +/* is seq1 < seq2 ? */ +static inline int before48(const u64 seq1, const u64 seq2) +{ + return (s64)((seq2 << 16) - (seq1 << 16)) > 0; +} + +/* is seq1 > seq2 ? 
*/ +#define after48(seq1, seq2) before48(seq2, seq1) + +/* is seq2 <= seq1 <= seq3 ? */ +static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3) +{ + return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16); +} + +static inline u64 max48(const u64 seq1, const u64 seq2) +{ + return after48(seq1, seq2) ? seq1 : seq2; +} + +/** + * dccp_loss_count - Approximate the number of lost data packets in a burst loss + * @s1: last known sequence number before the loss ('hole') + * @s2: first sequence number seen after the 'hole' + * @ndp: NDP count on packet with sequence number @s2 + */ +static inline u64 dccp_loss_count(const u64 s1, const u64 s2, const u64 ndp) +{ + s64 delta = dccp_delta_seqno(s1, s2); + + WARN_ON(delta < 0); + delta -= ndp + 1; + + return delta > 0 ? delta : 0; +} + +/** + * dccp_loss_free - Evaluate condition for data loss from RFC 4340, 7.7.1 + */ +static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp) +{ + return dccp_loss_count(s1, s2, ndp) == 0; +} + +enum { + DCCP_MIB_NUM = 0, + DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */ + DCCP_MIB_ESTABRESETS, /* EstabResets */ + DCCP_MIB_CURRESTAB, /* CurrEstab */ + DCCP_MIB_OUTSEGS, /* OutSegs */ + DCCP_MIB_OUTRSTS, + DCCP_MIB_ABORTONTIMEOUT, + DCCP_MIB_TIMEOUTS, + DCCP_MIB_ABORTFAILED, + DCCP_MIB_PASSIVEOPENS, + DCCP_MIB_ATTEMPTFAILS, + DCCP_MIB_OUTDATAGRAMS, + DCCP_MIB_INERRS, + DCCP_MIB_OPTMANDATORYERROR, + DCCP_MIB_INVALIDOPT, + __DCCP_MIB_MAX +}; + +#define DCCP_MIB_MAX __DCCP_MIB_MAX +struct dccp_mib { + unsigned long mibs[DCCP_MIB_MAX]; +}; + +DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics); +#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field) +#define DCCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(dccp_statistics, field) +#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field) + +/* + * Checksumming routines + */ +static inline unsigned int dccp_csum_coverage(const struct sk_buff *skb) +{ + const struct dccp_hdr* dh = dccp_hdr(skb); + + if (dh->dccph_cscov == 0) + return skb->len; + return (dh->dccph_doff + dh->dccph_cscov - 1) * sizeof(u32); +} + +static inline void dccp_csum_outgoing(struct sk_buff *skb) +{ + unsigned int cov = dccp_csum_coverage(skb); + + if (cov >= skb->len) + dccp_hdr(skb)->dccph_cscov = 0; + + skb->csum = skb_checksum(skb, 0, (cov > skb->len)? 
skb->len : cov, 0); +} + +extern void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb); + +extern int dccp_retransmit_skb(struct sock *sk); + +extern void dccp_send_ack(struct sock *sk); +extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, + struct request_sock *rsk); + +extern void dccp_send_sync(struct sock *sk, const u64 seq, + const enum dccp_pkt_type pkt_type); + +/* + * TX Packet Dequeueing Interface + */ +extern void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb); +extern bool dccp_qpolicy_full(struct sock *sk); +extern void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb); +extern struct sk_buff *dccp_qpolicy_top(struct sock *sk); +extern struct sk_buff *dccp_qpolicy_pop(struct sock *sk); +extern bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param); + +/* + * TX Packet Output and TX Timers + */ +extern void dccp_write_xmit(struct sock *sk); +extern void dccp_write_space(struct sock *sk); +extern void dccp_flush_write_queue(struct sock *sk, long *time_budget); + +extern void dccp_init_xmit_timers(struct sock *sk); +static inline void dccp_clear_xmit_timers(struct sock *sk) +{ + inet_csk_clear_xmit_timers(sk); +} + +extern unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu); + +extern const char *dccp_packet_name(const int type); + +extern void dccp_set_state(struct sock *sk, const int state); +extern void dccp_done(struct sock *sk); + +extern int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp, + struct sk_buff const *skb); + +extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); + +extern struct sock *dccp_create_openreq_child(struct sock *sk, + const struct request_sock *req, + const struct sk_buff *skb); + +extern int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); + +extern struct sock *dccp_v4_request_recv_sock(struct sock *sk, + struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst); +extern struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct request_sock **prev); + +extern int dccp_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb); +extern int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, + struct dccp_hdr *dh, unsigned len); +extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct dccp_hdr *dh, const unsigned len); + +extern int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized); +extern void dccp_destroy_sock(struct sock *sk); + +extern void dccp_close(struct sock *sk, long timeout); +extern struct sk_buff *dccp_make_response(struct sock *sk, + struct dst_entry *dst, + struct request_sock *req); + +extern int dccp_connect(struct sock *sk); +extern int dccp_disconnect(struct sock *sk, int flags); +extern int dccp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen); +extern int dccp_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen); +#ifdef CONFIG_COMPAT +extern int compat_dccp_getsockopt(struct sock *sk, + int level, int optname, + char __user *optval, int __user *optlen); +extern int compat_dccp_setsockopt(struct sock *sk, + int level, int optname, + char __user *optval, unsigned int optlen); +#endif +extern int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg); +extern int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t size); +extern int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr 
*msg, size_t len, int nonblock, + int flags, int *addr_len); +extern void dccp_shutdown(struct sock *sk, int how); +extern int inet_dccp_listen(struct socket *sock, int backlog); +extern unsigned int dccp_poll(struct file *file, struct socket *sock, + poll_table *wait); +extern int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len); + +extern struct sk_buff *dccp_ctl_make_reset(struct sock *sk, + struct sk_buff *skb); +extern int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code); +extern void dccp_send_close(struct sock *sk, const int active); +extern int dccp_invalid_packet(struct sk_buff *skb); +extern u32 dccp_sample_rtt(struct sock *sk, long delta); + +static inline int dccp_bad_service_code(const struct sock *sk, + const __be32 service) +{ + const struct dccp_sock *dp = dccp_sk(sk); + + if (dp->dccps_service == service) + return 0; + return !dccp_list_has_service(dp->dccps_service_list, service); +} + +/** + * dccp_skb_cb - DCCP per-packet control information + * @dccpd_type: one of %dccp_pkt_type (or unknown) + * @dccpd_ccval: CCVal field (5.1), see e.g. RFC 4342, 8.1 + * @dccpd_reset_code: one of %dccp_reset_codes + * @dccpd_reset_data: Data1..3 fields (depend on @dccpd_reset_code) + * @dccpd_opt_len: total length of all options (5.8) in the packet + * @dccpd_seq: sequence number + * @dccpd_ack_seq: acknowledgment number subheader field value + * This is used for transmission as well as for reception. + */ +struct dccp_skb_cb { + union { + struct inet_skb_parm h4; +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + struct inet6_skb_parm h6; +#endif + } header; + __u8 dccpd_type:4; + __u8 dccpd_ccval:4; + __u8 dccpd_reset_code, + dccpd_reset_data[3]; + __u16 dccpd_opt_len; + __u64 dccpd_seq; + __u64 dccpd_ack_seq; +}; + +#define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0])) + +/* RFC 4340, sec. 7.7 */ +static inline int dccp_non_data_packet(const struct sk_buff *skb) +{ + const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; + + return type == DCCP_PKT_ACK || + type == DCCP_PKT_CLOSE || + type == DCCP_PKT_CLOSEREQ || + type == DCCP_PKT_RESET || + type == DCCP_PKT_SYNC || + type == DCCP_PKT_SYNCACK; +} + +/* RFC 4340, sec. 
7.7 */ +static inline int dccp_data_packet(const struct sk_buff *skb) +{ + const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; + + return type == DCCP_PKT_DATA || + type == DCCP_PKT_DATAACK || + type == DCCP_PKT_REQUEST || + type == DCCP_PKT_RESPONSE; +} + +static inline int dccp_packet_without_ack(const struct sk_buff *skb) +{ + const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; + + return type == DCCP_PKT_DATA || type == DCCP_PKT_REQUEST; +} + +#define DCCP_PKT_WITHOUT_ACK_SEQ (UINT48_MAX << 2) + +static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss) +{ + struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh + + sizeof(*dh)); + dh->dccph_seq2 = 0; + dh->dccph_seq = htons((gss >> 32) & 0xfffff); + dhx->dccph_seq_low = htonl(gss & 0xffffffff); +} + +static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, + const u64 gsr) +{ + dhack->dccph_reserved1 = 0; + dhack->dccph_ack_nr_high = htons(gsr >> 32); + dhack->dccph_ack_nr_low = htonl(gsr & 0xffffffff); +} + +static inline void dccp_update_gsr(struct sock *sk, u64 seq) +{ + struct dccp_sock *dp = dccp_sk(sk); + + if (after48(seq, dp->dccps_gsr)) + dp->dccps_gsr = seq; + /* Sequence validity window depends on remote Sequence Window (7.5.1) */ + dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); + /* + * Adjust SWL so that it is not below ISR. In contrast to RFC 4340, + * 7.5.1 we perform this check beyond the initial handshake: W/W' are + * always > 32, so for the first W/W' packets in the lifetime of a + * connection we always have to adjust SWL. + * A second reason why we are doing this is that the window depends on + * the feature-remote value of Sequence Window: nothing stops the peer + * from updating this value while we are busy adjusting SWL for the + * first W packets (we would have to count from scratch again then). + * Therefore it is safer to always make sure that the Sequence Window + * is not artificially extended by a peer who grows SWL downwards by + * continually updating the feature-remote Sequence-Window. + * If sequence numbers wrap it is bad luck. But that will take a while + * (48 bit), and this measure prevents Sequence-number attacks. 
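
A small worked example of the window placement computed in this function (a standalone sketch with made-up values that ignores the mod-2^48 wrap-around of ADD48()/SUB48()):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t gsr = 100000;  /* greatest seqno received so far        */
        uint64_t w = 100;       /* remote Sequence Window, default value */

        uint64_t swl = gsr + 1 - w / 4;      /* lowest valid seqno  = 99976  */
        uint64_t swh = gsr + (3 * w) / 4;    /* highest valid seqno = 100075 */

        /* the window trails GSR by about W/4 and reaches about 3W/4 ahead */
        printf("SWL=%llu SWH=%llu\n",
               (unsigned long long)swl, (unsigned long long)swh);
        return 0;
}
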
+ */ + if (before48(dp->dccps_swl, dp->dccps_isr)) + dp->dccps_swl = dp->dccps_isr; + dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); +} + +static inline void dccp_update_gss(struct sock *sk, u64 seq) +{ + struct dccp_sock *dp = dccp_sk(sk); + + dp->dccps_gss = seq; + /* Ack validity window depends on local Sequence Window value (7.5.1) */ + dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); + /* Adjust AWL so that it is not below ISS - see comment above for SWL */ + if (before48(dp->dccps_awl, dp->dccps_iss)) + dp->dccps_awl = dp->dccps_iss; + dp->dccps_awh = dp->dccps_gss; +} + +static inline int dccp_ackvec_pending(const struct sock *sk) +{ + return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL && + !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec); +} + +static inline int dccp_ack_pending(const struct sock *sk) +{ + return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk); +} + +extern int dccp_feat_finalise_settings(struct dccp_sock *dp); +extern int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq); +extern int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*, + struct sk_buff *skb); +extern int dccp_feat_activate_values(struct sock *sk, struct list_head *fn); +extern void dccp_feat_list_purge(struct list_head *fn_list); + +extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb); +extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*); +extern int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed); +extern u32 dccp_timestamp(void); +extern void dccp_timestamping_init(void); +extern int dccp_insert_option(struct sk_buff *skb, unsigned char option, + const void *value, unsigned char len); + +#ifdef CONFIG_SYSCTL +extern int dccp_sysctl_init(void); +extern void dccp_sysctl_exit(void); +#else +static inline int dccp_sysctl_init(void) +{ + return 0; +} + +static inline void dccp_sysctl_exit(void) +{ +} +#endif + +#endif /* _DCCP_H */ diff --git a/net/dccp/diag.c b/net/dccp/diag.c new file mode 100644 index 00000000..b21f261d --- /dev/null +++ b/net/dccp/diag.c @@ -0,0 +1,74 @@ +/* + * net/dccp/diag.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + + +#include +#include + +#include "ccid.h" +#include "dccp.h" + +static void dccp_get_info(struct sock *sk, struct tcp_info *info) +{ + struct dccp_sock *dp = dccp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + + memset(info, 0, sizeof(*info)); + + info->tcpi_state = sk->sk_state; + info->tcpi_retransmits = icsk->icsk_retransmits; + info->tcpi_probes = icsk->icsk_probes_out; + info->tcpi_backoff = icsk->icsk_backoff; + info->tcpi_pmtu = icsk->icsk_pmtu_cookie; + + if (dp->dccps_hc_rx_ackvec != NULL) + info->tcpi_options |= TCPI_OPT_SACK; + + if (dp->dccps_hc_rx_ccid != NULL) + ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); + + if (dp->dccps_hc_tx_ccid != NULL) + ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info); +} + +static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *_info) +{ + r->idiag_rqueue = r->idiag_wqueue = 0; + + if (_info != NULL) + dccp_get_info(sk, _info); +} + +static const struct inet_diag_handler dccp_diag_handler = { + .idiag_hashinfo = &dccp_hashinfo, + .idiag_get_info = dccp_diag_get_info, + .idiag_type = DCCPDIAG_GETSOCK, + .idiag_info_size = sizeof(struct tcp_info), +}; + +static int __init dccp_diag_init(void) +{ + return inet_diag_register(&dccp_diag_handler); +} + +static void __exit dccp_diag_fini(void) +{ + inet_diag_unregister(&dccp_diag_handler); +} + +module_init(dccp_diag_init); +module_exit(dccp_diag_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arnaldo Carvalho de Melo "); +MODULE_DESCRIPTION("DCCP inet_diag handler"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_INET_DIAG, DCCPDIAG_GETSOCK); diff --git a/net/dccp/feat.c b/net/dccp/feat.c new file mode 100644 index 00000000..568def95 --- /dev/null +++ b/net/dccp/feat.c @@ -0,0 +1,1359 @@ +/* + * net/dccp/feat.c + * + * Feature negotiation for the DCCP protocol (RFC 4340, section 6) + * + * Copyright (c) 2008 Gerrit Renker + * Rewrote from scratch, some bits from earlier code by + * Copyright (c) 2005 Andrea Bittau + * + * + * ASSUMPTIONS + * ----------- + * o Feature negotiation is coordinated with connection setup (as in TCP), wild + * changes of parameters of an established connection are not supported. + * o All currently known SP features have 1-byte quantities. If in the future + * extensions of RFCs 4340..42 define features with item lengths larger than + * one byte, a feature-specific extension of the code will be required. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include "ccid.h" +#include "feat.h" + +/* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */ +unsigned long sysctl_dccp_sequence_window __read_mostly = 100; +int sysctl_dccp_rx_ccid __read_mostly = 2, + sysctl_dccp_tx_ccid __read_mostly = 2; + +/* + * Feature activation handlers. + * + * These all use an u64 argument, to provide enough room for NN/SP features. At + * this stage the negotiated values have been checked to be within their range. 
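
To clarify the @rx flag that every handler below takes, here is a minimal standalone sketch of how it is derived from a feature's location (RX or TX) and from whether the feature entry is local or remote; the same expression is used by __dccp_feat_activate() further below. The enum and the helper name are stand-ins, not part of the patch:

#include <stdbool.h>
#include <stdio.h>

/* stand-ins for FEAT_AT_RX/FEAT_AT_TX from feat.h */
enum feat_loc { LOC_RX, LOC_TX };

/* true when the negotiated value applies to our RX half-connection:
 * either a local feature located at RX, or a remote feature located at
 * TX (what the peer transmits is what we receive).                     */
static bool applies_to_rx(bool is_local, enum feat_loc loc)
{
        return is_local == (loc == LOC_RX);
}

int main(void)
{
        printf("%d\n", applies_to_rx(true,  LOC_RX)); /* 1: local RX feature  */
        printf("%d\n", applies_to_rx(false, LOC_TX)); /* 1: remote TX feature */
        printf("%d\n", applies_to_rx(true,  LOC_TX)); /* 0: local TX feature  */
        return 0;
}
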
+ */ +static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid *new_ccid = ccid_new(ccid, sk, rx); + + if (new_ccid == NULL) + return -ENOMEM; + + if (rx) { + ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); + dp->dccps_hc_rx_ccid = new_ccid; + } else { + ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); + dp->dccps_hc_tx_ccid = new_ccid; + } + return 0; +} + +static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx) +{ + struct dccp_sock *dp = dccp_sk(sk); + + if (rx) { + dp->dccps_r_seq_win = seq_win; + /* propagate changes to update SWL/SWH */ + dccp_update_gsr(sk, dp->dccps_gsr); + } else { + dp->dccps_l_seq_win = seq_win; + /* propagate changes to update AWL */ + dccp_update_gss(sk, dp->dccps_gss); + } + return 0; +} + +static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx) +{ + if (rx) + dccp_sk(sk)->dccps_r_ack_ratio = ratio; + else + dccp_sk(sk)->dccps_l_ack_ratio = ratio; + return 0; +} + +static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx) +{ + struct dccp_sock *dp = dccp_sk(sk); + + if (rx) { + if (enable && dp->dccps_hc_rx_ackvec == NULL) { + dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any()); + if (dp->dccps_hc_rx_ackvec == NULL) + return -ENOMEM; + } else if (!enable) { + dccp_ackvec_free(dp->dccps_hc_rx_ackvec); + dp->dccps_hc_rx_ackvec = NULL; + } + } + return 0; +} + +static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx) +{ + if (!rx) + dccp_sk(sk)->dccps_send_ndp_count = (enable > 0); + return 0; +} + +/* + * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that + * `rx' holds when the sending peer informs about his partial coverage via a + * ChangeR() option. In the other case, we are the sender and the receiver + * announces its coverage via ChangeL() options. The policy here is to honour + * such communication by enabling the corresponding partial coverage - but only + * if it has not been set manually before; the warning here means that all + * packets will be dropped. + */ +static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx) +{ + struct dccp_sock *dp = dccp_sk(sk); + + if (rx) + dp->dccps_pcrlen = cscov; + else { + if (dp->dccps_pcslen == 0) + dp->dccps_pcslen = cscov; + else if (cscov > dp->dccps_pcslen) + DCCP_WARN("CsCov %u too small, peer requires >= %u\n", + dp->dccps_pcslen, (u8)cscov); + } + return 0; +} + +static const struct { + u8 feat_num; /* DCCPF_xxx */ + enum dccp_feat_type rxtx; /* RX or TX */ + enum dccp_feat_type reconciliation; /* SP or NN */ + u8 default_value; /* as in 6.4 */ + int (*activation_hdlr)(struct sock *sk, u64 val, bool rx); +/* + * Lookup table for location and type of features (from RFC 4340/4342) + * +--------------------------+----+-----+----+----+---------+-----------+ + * | Feature | Location | Reconc. 
| Initial | Section | + * | | RX | TX | SP | NN | Value | Reference | + * +--------------------------+----+-----+----+----+---------+-----------+ + * | DCCPF_CCID | | X | X | | 2 | 10 | + * | DCCPF_SHORT_SEQNOS | | X | X | | 0 | 7.6.1 | + * | DCCPF_SEQUENCE_WINDOW | | X | | X | 100 | 7.5.2 | + * | DCCPF_ECN_INCAPABLE | X | | X | | 0 | 12.1 | + * | DCCPF_ACK_RATIO | | X | | X | 2 | 11.3 | + * | DCCPF_SEND_ACK_VECTOR | X | | X | | 0 | 11.5 | + * | DCCPF_SEND_NDP_COUNT | | X | X | | 0 | 7.7.2 | + * | DCCPF_MIN_CSUM_COVER | X | | X | | 0 | 9.2.1 | + * | DCCPF_DATA_CHECKSUM | X | | X | | 0 | 9.3.1 | + * | DCCPF_SEND_LEV_RATE | X | | X | | 0 | 4342/8.4 | + * +--------------------------+----+-----+----+----+---------+-----------+ + */ +} dccp_feat_table[] = { + { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2, dccp_hdlr_ccid }, + { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0, NULL }, + { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win }, + { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0, NULL }, + { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2, dccp_hdlr_ack_ratio}, + { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_ackvec }, + { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0, dccp_hdlr_ndp }, + { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_min_cscov}, + { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0, NULL }, + { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0, NULL }, +}; +#define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table) + +/** + * dccp_feat_index - Hash function to map feature number into array position + * Returns consecutive array index or -1 if the feature is not understood. + */ +static int dccp_feat_index(u8 feat_num) +{ + /* The first 9 entries are occupied by the types from RFC 4340, 6.4 */ + if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM) + return feat_num - 1; + + /* + * Other features: add cases for new feature types here after adding + * them to the above table. + */ + switch (feat_num) { + case DCCPF_SEND_LEV_RATE: + return DCCP_FEAT_SUPPORTED_MAX - 1; + } + return -1; +} + +static u8 dccp_feat_type(u8 feat_num) +{ + int idx = dccp_feat_index(feat_num); + + if (idx < 0) + return FEAT_UNKNOWN; + return dccp_feat_table[idx].reconciliation; +} + +static int dccp_feat_default_value(u8 feat_num) +{ + int idx = dccp_feat_index(feat_num); + /* + * There are no default values for unknown features, so encountering a + * negative index here indicates a serious problem somewhere else. + */ + DCCP_BUG_ON(idx < 0); + + return idx < 0 ? 0 : dccp_feat_table[idx].default_value; +} + +/* + * Debugging and verbose-printing section + */ +static const char *dccp_feat_fname(const u8 feat) +{ + static const char *const feature_names[] = { + [DCCPF_RESERVED] = "Reserved", + [DCCPF_CCID] = "CCID", + [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos", + [DCCPF_SEQUENCE_WINDOW] = "Sequence Window", + [DCCPF_ECN_INCAPABLE] = "ECN Incapable", + [DCCPF_ACK_RATIO] = "Ack Ratio", + [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector", + [DCCPF_SEND_NDP_COUNT] = "Send NDP Count", + [DCCPF_MIN_CSUM_COVER] = "Min. 
Csum Coverage", + [DCCPF_DATA_CHECKSUM] = "Send Data Checksum", + }; + if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) + return feature_names[DCCPF_RESERVED]; + + if (feat == DCCPF_SEND_LEV_RATE) + return "Send Loss Event Rate"; + if (feat >= DCCPF_MIN_CCID_SPECIFIC) + return "CCID-specific"; + + return feature_names[feat]; +} + +static const char *const dccp_feat_sname[] = { + "DEFAULT", "INITIALISING", "CHANGING", "UNSTABLE", "STABLE", +}; + +#ifdef CONFIG_IP_DCCP_DEBUG +static const char *dccp_feat_oname(const u8 opt) +{ + switch (opt) { + case DCCPO_CHANGE_L: return "Change_L"; + case DCCPO_CONFIRM_L: return "Confirm_L"; + case DCCPO_CHANGE_R: return "Change_R"; + case DCCPO_CONFIRM_R: return "Confirm_R"; + } + return NULL; +} + +static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val) +{ + u8 i, type = dccp_feat_type(feat_num); + + if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL)) + dccp_pr_debug_cat("(NULL)"); + else if (type == FEAT_SP) + for (i = 0; i < val->sp.len; i++) + dccp_pr_debug_cat("%s%u", i ? " " : "", val->sp.vec[i]); + else if (type == FEAT_NN) + dccp_pr_debug_cat("%llu", (unsigned long long)val->nn); + else + dccp_pr_debug_cat("unknown type %u", type); +} + +static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len) +{ + u8 type = dccp_feat_type(feat_num); + dccp_feat_val fval = { .sp.vec = list, .sp.len = len }; + + if (type == FEAT_NN) + fval.nn = dccp_decode_value_var(list, len); + dccp_feat_printval(feat_num, &fval); +} + +static void dccp_feat_print_entry(struct dccp_feat_entry const *entry) +{ + dccp_debug(" * %s %s = ", entry->is_local ? "local" : "remote", + dccp_feat_fname(entry->feat_num)); + dccp_feat_printval(entry->feat_num, &entry->val); + dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state], + entry->needs_confirm ? "(Confirm pending)" : ""); +} + +#define dccp_feat_print_opt(opt, feat, val, len, mandatory) do { \ + dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\ + dccp_feat_printvals(feat, val, len); \ + dccp_pr_debug_cat(") %s\n", mandatory ? "!" : ""); } while (0) + +#define dccp_feat_print_fnlist(fn_list) { \ + const struct dccp_feat_entry *___entry; \ + \ + dccp_pr_debug("List Dump:\n"); \ + list_for_each_entry(___entry, fn_list, node) \ + dccp_feat_print_entry(___entry); \ +} +#else /* ! CONFIG_IP_DCCP_DEBUG */ +#define dccp_feat_print_opt(opt, feat, val, len, mandatory) +#define dccp_feat_print_fnlist(fn_list) +#endif + +static int __dccp_feat_activate(struct sock *sk, const int idx, + const bool is_local, dccp_feat_val const *fval) +{ + bool rx; + u64 val; + + if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX) + return -1; + if (dccp_feat_table[idx].activation_hdlr == NULL) + return 0; + + if (fval == NULL) { + val = dccp_feat_table[idx].default_value; + } else if (dccp_feat_table[idx].reconciliation == FEAT_SP) { + if (fval->sp.vec == NULL) { + /* + * This can happen when an empty Confirm is sent + * for an SP (i.e. known) feature. In this case + * we would be using the default anyway. + */ + DCCP_CRIT("Feature #%d undefined: using default", idx); + val = dccp_feat_table[idx].default_value; + } else { + val = fval->sp.vec[0]; + } + } else { + val = fval->nn; + } + + /* Location is RX if this is a local-RX or remote-TX feature */ + rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX)); + + dccp_debug(" -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX", + dccp_feat_fname(dccp_feat_table[idx].feat_num), + fval ? 
"" : "default ", (unsigned long long)val); + + return dccp_feat_table[idx].activation_hdlr(sk, val, rx); +} + +/* Test for "Req'd" feature (RFC 4340, 6.4) */ +static inline int dccp_feat_must_be_understood(u8 feat_num) +{ + return feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS || + feat_num == DCCPF_SEQUENCE_WINDOW; +} + +/* copy constructor, fval must not already contain allocated memory */ +static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len) +{ + fval->sp.len = len; + if (fval->sp.len > 0) { + fval->sp.vec = kmemdup(val, len, gfp_any()); + if (fval->sp.vec == NULL) { + fval->sp.len = 0; + return -ENOBUFS; + } + } + return 0; +} + +static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val) +{ + if (unlikely(val == NULL)) + return; + if (dccp_feat_type(feat_num) == FEAT_SP) + kfree(val->sp.vec); + memset(val, 0, sizeof(*val)); +} + +static struct dccp_feat_entry * + dccp_feat_clone_entry(struct dccp_feat_entry const *original) +{ + struct dccp_feat_entry *new; + u8 type = dccp_feat_type(original->feat_num); + + if (type == FEAT_UNKNOWN) + return NULL; + + new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any()); + if (new == NULL) + return NULL; + + if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val, + original->val.sp.vec, + original->val.sp.len)) { + kfree(new); + return NULL; + } + return new; +} + +static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry) +{ + if (entry != NULL) { + dccp_feat_val_destructor(entry->feat_num, &entry->val); + kfree(entry); + } +} + +/* + * List management functions + * + * Feature negotiation lists rely on and maintain the following invariants: + * - each feat_num in the list is known, i.e. we know its type and default value + * - each feat_num/is_local combination is unique (old entries are overwritten) + * - SP values are always freshly allocated + * - list is sorted in increasing order of feature number (faster lookup) + */ +static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list, + u8 feat_num, bool is_local) +{ + struct dccp_feat_entry *entry; + + list_for_each_entry(entry, fn_list, node) { + if (entry->feat_num == feat_num && entry->is_local == is_local) + return entry; + else if (entry->feat_num > feat_num) + break; + } + return NULL; +} + +/** + * dccp_feat_entry_new - Central list update routine (called by all others) + * @head: list to add to + * @feat: feature number + * @local: whether the local (1) or remote feature with number @feat is meant + * This is the only constructor and serves to ensure the above invariants. 
+ */ +static struct dccp_feat_entry * + dccp_feat_entry_new(struct list_head *head, u8 feat, bool local) +{ + struct dccp_feat_entry *entry; + + list_for_each_entry(entry, head, node) + if (entry->feat_num == feat && entry->is_local == local) { + dccp_feat_val_destructor(entry->feat_num, &entry->val); + return entry; + } else if (entry->feat_num > feat) { + head = &entry->node; + break; + } + + entry = kmalloc(sizeof(*entry), gfp_any()); + if (entry != NULL) { + entry->feat_num = feat; + entry->is_local = local; + list_add_tail(&entry->node, head); + } + return entry; +} + +/** + * dccp_feat_push_change - Add/overwrite a Change option in the list + * @fn_list: feature-negotiation list to update + * @feat: one of %dccp_feature_numbers + * @local: whether local (1) or remote (0) @feat_num is meant + * @needs_mandatory: whether to use Mandatory feature negotiation options + * @fval: pointer to NN/SP value to be inserted (will be copied) + */ +static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local, + u8 mandatory, dccp_feat_val *fval) +{ + struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); + + if (new == NULL) + return -ENOMEM; + + new->feat_num = feat; + new->is_local = local; + new->state = FEAT_INITIALISING; + new->needs_confirm = 0; + new->empty_confirm = 0; + new->val = *fval; + new->needs_mandatory = mandatory; + + return 0; +} + +/** + * dccp_feat_push_confirm - Add a Confirm entry to the FN list + * @fn_list: feature-negotiation list to add to + * @feat: one of %dccp_feature_numbers + * @local: whether local (1) or remote (0) @feat_num is being confirmed + * @fval: pointer to NN/SP value to be inserted or NULL + * Returns 0 on success, a Reset code for further processing otherwise. + */ +static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local, + dccp_feat_val *fval) +{ + struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); + + if (new == NULL) + return DCCP_RESET_CODE_TOO_BUSY; + + new->feat_num = feat; + new->is_local = local; + new->state = FEAT_STABLE; /* transition in 6.6.2 */ + new->needs_confirm = 1; + new->empty_confirm = (fval == NULL); + new->val.nn = 0; /* zeroes the whole structure */ + if (!new->empty_confirm) + new->val = *fval; + new->needs_mandatory = 0; + + return 0; +} + +static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local) +{ + return dccp_feat_push_confirm(fn_list, feat, local, NULL); +} + +static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry) +{ + list_del(&entry->node); + dccp_feat_entry_destructor(entry); +} + +void dccp_feat_list_purge(struct list_head *fn_list) +{ + struct dccp_feat_entry *entry, *next; + + list_for_each_entry_safe(entry, next, fn_list, node) + dccp_feat_entry_destructor(entry); + INIT_LIST_HEAD(fn_list); +} +EXPORT_SYMBOL_GPL(dccp_feat_list_purge); + +/* generate @to as full clone of @from - @to must not contain any nodes */ +int dccp_feat_clone_list(struct list_head const *from, struct list_head *to) +{ + struct dccp_feat_entry *entry, *new; + + INIT_LIST_HEAD(to); + list_for_each_entry(entry, from, node) { + new = dccp_feat_clone_entry(entry); + if (new == NULL) + goto cloning_failed; + list_add_tail(&new->node, to); + } + return 0; + +cloning_failed: + dccp_feat_list_purge(to); + return -ENOMEM; +} + +/** + * dccp_feat_valid_nn_length - Enforce length constraints on NN options + * Length is between 0 and %DCCP_OPTVAL_MAXLEN. 
Used for outgoing packets only, + * incoming options are accepted as long as their values are valid. + */ +static u8 dccp_feat_valid_nn_length(u8 feat_num) +{ + if (feat_num == DCCPF_ACK_RATIO) /* RFC 4340, 11.3 and 6.6.8 */ + return 2; + if (feat_num == DCCPF_SEQUENCE_WINDOW) /* RFC 4340, 7.5.2 and 6.5 */ + return 6; + return 0; +} + +static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val) +{ + switch (feat_num) { + case DCCPF_ACK_RATIO: + return val <= DCCPF_ACK_RATIO_MAX; + case DCCPF_SEQUENCE_WINDOW: + return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX; + } + return 0; /* feature unknown - so we can't tell */ +} + +/* check that SP values are within the ranges defined in RFC 4340 */ +static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val) +{ + switch (feat_num) { + case DCCPF_CCID: + return val == DCCPC_CCID2 || val == DCCPC_CCID3; + /* Type-check Boolean feature values: */ + case DCCPF_SHORT_SEQNOS: + case DCCPF_ECN_INCAPABLE: + case DCCPF_SEND_ACK_VECTOR: + case DCCPF_SEND_NDP_COUNT: + case DCCPF_DATA_CHECKSUM: + case DCCPF_SEND_LEV_RATE: + return val < 2; + case DCCPF_MIN_CSUM_COVER: + return val < 16; + } + return 0; /* feature unknown */ +} + +static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len) +{ + if (sp_list == NULL || sp_len < 1) + return 0; + while (sp_len--) + if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++)) + return 0; + return 1; +} + +/** + * dccp_feat_insert_opts - Generate FN options from current list state + * @skb: next sk_buff to be sent to the peer + * @dp: for client during handshake and general negotiation + * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND) + */ +int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq, + struct sk_buff *skb) +{ + struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; + struct dccp_feat_entry *pos, *next; + u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN]; + bool rpt; + + /* put entries into @skb in the order they appear in the list */ + list_for_each_entry_safe_reverse(pos, next, fn, node) { + opt = dccp_feat_genopt(pos); + type = dccp_feat_type(pos->feat_num); + rpt = false; + + if (pos->empty_confirm) { + len = 0; + ptr = NULL; + } else { + if (type == FEAT_SP) { + len = pos->val.sp.len; + ptr = pos->val.sp.vec; + rpt = pos->needs_confirm; + } else if (type == FEAT_NN) { + len = dccp_feat_valid_nn_length(pos->feat_num); + ptr = nn_in_nbo; + dccp_encode_value_var(pos->val.nn, ptr, len); + } else { + DCCP_BUG("unknown feature %u", pos->feat_num); + return -1; + } + } + dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0); + + if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt)) + return -1; + if (pos->needs_mandatory && dccp_insert_option_mandatory(skb)) + return -1; + /* + * Enter CHANGING after transmitting the Change option (6.6.2). + */ + if (pos->state == FEAT_INITIALISING) + pos->state = FEAT_CHANGING; + } + return 0; +} + +/** + * __feat_register_nn - Register new NN value on socket + * @fn: feature-negotiation list to register with + * @feat: an NN feature from %dccp_feature_numbers + * @mandatory: use Mandatory option if 1 + * @nn_val: value to register (restricted to 4 bytes) + * Note that NN features are local by definition (RFC 4340, 6.3.2). 
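+ * Returns 0 on success (values equal to the RFC 4340, 6.4 default are not queued at all), or a negative errno if @feat is not an NN feature, the value is out of range, or no memory is available.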
+ */ +static int __feat_register_nn(struct list_head *fn, u8 feat, + u8 mandatory, u64 nn_val) +{ + dccp_feat_val fval = { .nn = nn_val }; + + if (dccp_feat_type(feat) != FEAT_NN || + !dccp_feat_is_valid_nn_val(feat, nn_val)) + return -EINVAL; + + /* Don't bother with default values, they will be activated anyway. */ + if (nn_val - (u64)dccp_feat_default_value(feat) == 0) + return 0; + + return dccp_feat_push_change(fn, feat, 1, mandatory, &fval); +} + +/** + * __feat_register_sp - Register new SP value/list on socket + * @fn: feature-negotiation list to register with + * @feat: an SP feature from %dccp_feature_numbers + * @is_local: whether the local (1) or the remote (0) @feat is meant + * @mandatory: use Mandatory option if 1 + * @sp_val: SP value followed by optional preference list + * @sp_len: length of @sp_val in bytes + */ +static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local, + u8 mandatory, u8 const *sp_val, u8 sp_len) +{ + dccp_feat_val fval; + + if (dccp_feat_type(feat) != FEAT_SP || + !dccp_feat_sp_list_ok(feat, sp_val, sp_len)) + return -EINVAL; + + /* Avoid negotiating alien CCIDs by only advertising supported ones */ + if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len)) + return -EOPNOTSUPP; + + if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len)) + return -ENOMEM; + + return dccp_feat_push_change(fn, feat, is_local, mandatory, &fval); +} + +/** + * dccp_feat_register_sp - Register requests to change SP feature values + * @sk: client or listening socket + * @feat: one of %dccp_feature_numbers + * @is_local: whether the local (1) or remote (0) @feat is meant + * @list: array of preferred values, in descending order of preference + * @len: length of @list in bytes + */ +int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, + u8 const *list, u8 len) +{ /* any changes must be registered before establishing the connection */ + if (sk->sk_state != DCCP_CLOSED) + return -EISCONN; + if (dccp_feat_type(feat) != FEAT_SP) + return -EINVAL; + return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local, + 0, list, len); +} + + +/* + * Tracking features whose value depend on the choice of CCID + * + * This is designed with an extension in mind so that a list walk could be done + * before activating any features. However, the existing framework was found to + * work satisfactorily up until now, the automatic verification is left open. + * When adding new CCIDs, add a corresponding dependency table here. + */ +static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local) +{ + static const struct ccid_dependency ccid2_dependencies[2][2] = { + /* + * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX + * feature and Send Ack Vector is an RX feature, `is_local' + * needs to be reversed. 
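+ * Hence the receiver-side (remote) CCID2 table below enables the local Send Ack Vector feature, while the sender-side (local) table enables the remote one.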
+ */ + { /* Dependencies of the receiver-side (remote) CCID2 */ + { + .dependent_feat = DCCPF_SEND_ACK_VECTOR, + .is_local = true, + .is_mandatory = true, + .val = 1 + }, + { 0, 0, 0, 0 } + }, + { /* Dependencies of the sender-side (local) CCID2 */ + { + .dependent_feat = DCCPF_SEND_ACK_VECTOR, + .is_local = false, + .is_mandatory = true, + .val = 1 + }, + { 0, 0, 0, 0 } + } + }; + static const struct ccid_dependency ccid3_dependencies[2][5] = { + { /* + * Dependencies of the receiver-side CCID3 + */ + { /* locally disable Ack Vectors */ + .dependent_feat = DCCPF_SEND_ACK_VECTOR, + .is_local = true, + .is_mandatory = false, + .val = 0 + }, + { /* see below why Send Loss Event Rate is on */ + .dependent_feat = DCCPF_SEND_LEV_RATE, + .is_local = true, + .is_mandatory = true, + .val = 1 + }, + { /* NDP Count is needed as per RFC 4342, 6.1.1 */ + .dependent_feat = DCCPF_SEND_NDP_COUNT, + .is_local = false, + .is_mandatory = true, + .val = 1 + }, + { 0, 0, 0, 0 }, + }, + { /* + * CCID3 at the TX side: we request that the HC-receiver + * will not send Ack Vectors (they will be ignored, so + * Mandatory is not set); we enable Send Loss Event Rate + * (Mandatory since the implementation does not support + * the Loss Intervals option of RFC 4342, 8.6). + * The last two options are for peer's information only. + */ + { + .dependent_feat = DCCPF_SEND_ACK_VECTOR, + .is_local = false, + .is_mandatory = false, + .val = 0 + }, + { + .dependent_feat = DCCPF_SEND_LEV_RATE, + .is_local = false, + .is_mandatory = true, + .val = 1 + }, + { /* this CCID does not support Ack Ratio */ + .dependent_feat = DCCPF_ACK_RATIO, + .is_local = true, + .is_mandatory = false, + .val = 0 + }, + { /* tell receiver we are sending NDP counts */ + .dependent_feat = DCCPF_SEND_NDP_COUNT, + .is_local = true, + .is_mandatory = false, + .val = 1 + }, + { 0, 0, 0, 0 } + } + }; + switch (ccid) { + case DCCPC_CCID2: + return ccid2_dependencies[is_local]; + case DCCPC_CCID3: + return ccid3_dependencies[is_local]; + default: + return NULL; + } +} + +/** + * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID + * @fn: feature-negotiation list to update + * @id: CCID number to track + * @is_local: whether TX CCID (1) or RX CCID (0) is meant + * This function needs to be called after registering all other features. + */ +static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local) +{ + const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local); + int i, rc = (table == NULL); + + for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++) + if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP) + rc = __feat_register_sp(fn, table[i].dependent_feat, + table[i].is_local, + table[i].is_mandatory, + &table[i].val, 1); + else + rc = __feat_register_nn(fn, table[i].dependent_feat, + table[i].is_mandatory, + table[i].val); + return rc; +} + +/** + * dccp_feat_finalise_settings - Finalise settings before starting negotiation + * @dp: client or listening socket (settings will be inherited) + * This is called after all registrations (socket initialisation, sysctls, and + * sockopt calls), and before sending the first packet containing Change options + * (ie. client-Request or server-Response), to ensure internal consistency. 
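+ * Returns 0 on success, -1 if one of the CCID dependencies could not be registered.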
+ */ +int dccp_feat_finalise_settings(struct dccp_sock *dp) +{ + struct list_head *fn = &dp->dccps_featneg; + struct dccp_feat_entry *entry; + int i = 2, ccids[2] = { -1, -1 }; + + /* + * Propagating CCIDs: + * 1) not useful to propagate CCID settings if this host advertises more + * than one CCID: the choice of CCID may still change - if this is + * the client, or if this is the server and the client sends + * singleton CCID values. + * 2) since is that propagate_ccid changes the list, we defer changing + * the sorted list until after the traversal. + */ + list_for_each_entry(entry, fn, node) + if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1) + ccids[entry->is_local] = entry->val.sp.vec[0]; + while (i--) + if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i)) + return -1; + dccp_feat_print_fnlist(fn); + return 0; +} + +/** + * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features + * It is the server which resolves the dependencies once the CCID has been + * fully negotiated. If no CCID has been negotiated, it uses the default CCID. + */ +int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq) +{ + struct list_head *fn = &dreq->dreq_featneg; + struct dccp_feat_entry *entry; + u8 is_local, ccid; + + for (is_local = 0; is_local <= 1; is_local++) { + entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local); + + if (entry != NULL && !entry->empty_confirm) + ccid = entry->val.sp.vec[0]; + else + ccid = dccp_feat_default_value(DCCPF_CCID); + + if (dccp_feat_propagate_ccid(fn, ccid, is_local)) + return -1; + } + return 0; +} + +/* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */ +static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen) +{ + u8 c, s; + + for (s = 0; s < slen; s++) + for (c = 0; c < clen; c++) + if (servlist[s] == clilist[c]) + return servlist[s]; + return -1; +} + +/** + * dccp_feat_prefer - Move preferred entry to the start of array + * Reorder the @array_len elements in @array so that @preferred_value comes + * first. Returns >0 to indicate that @preferred_value does occur in @array. + */ +static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len) +{ + u8 i, does_occur = 0; + + if (array != NULL) { + for (i = 0; i < array_len; i++) + if (array[i] == preferred_value) { + array[i] = array[0]; + does_occur++; + } + if (does_occur) + array[0] = preferred_value; + } + return does_occur; +} + +/** + * dccp_feat_reconcile - Reconcile SP preference lists + * @fval: SP list to reconcile into + * @arr: received SP preference list + * @len: length of @arr in bytes + * @is_server: whether this side is the server (and @fv is the server's list) + * @reorder: whether to reorder the list in @fv after reconciling with @arr + * When successful, > 0 is returned and the reconciled list is in @fval. + * A value of 0 means that negotiation failed (no shared entry). + */ +static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len, + bool is_server, bool reorder) +{ + int rc; + + if (!fv->sp.vec || !arr) { + DCCP_CRIT("NULL feature value or array"); + return 0; + } + + if (is_server) + rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len); + else + rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len); + + if (!reorder) + return rc; + if (rc < 0) + return 0; + + /* + * Reorder list: used for activating features and in dccp_insert_fn_opt. 
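+ * The agreed value is moved to the front of fv->sp.vec, so that it is the one which gets activated and advertised.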
+ */ + return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len); +} + +/** + * dccp_feat_change_recv - Process incoming ChangeL/R options + * @fn: feature-negotiation list to update + * @is_mandatory: whether the Change was preceded by a Mandatory option + * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R + * @feat: one of %dccp_feature_numbers + * @val: NN value or SP value/preference list + * @len: length of @val in bytes + * @server: whether this node is the server (1) or the client (0) + */ +static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt, + u8 feat, u8 *val, u8 len, const bool server) +{ + u8 defval, type = dccp_feat_type(feat); + const bool local = (opt == DCCPO_CHANGE_R); + struct dccp_feat_entry *entry; + dccp_feat_val fval; + + if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */ + goto unknown_feature_or_value; + + dccp_feat_print_opt(opt, feat, val, len, is_mandatory); + + /* + * Negotiation of NN features: Change R is invalid, so there is no + * simultaneous negotiation; hence we do not look up in the list. + */ + if (type == FEAT_NN) { + if (local || len > sizeof(fval.nn)) + goto unknown_feature_or_value; + + /* 6.3.2: "The feature remote MUST accept any valid value..." */ + fval.nn = dccp_decode_value_var(val, len); + if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) + goto unknown_feature_or_value; + + return dccp_feat_push_confirm(fn, feat, local, &fval); + } + + /* + * Unidirectional/simultaneous negotiation of SP features (6.3.1) + */ + entry = dccp_feat_list_lookup(fn, feat, local); + if (entry == NULL) { + /* + * No particular preferences have been registered. We deal with + * this situation by assuming that all valid values are equally + * acceptable, and apply the following checks: + * - if the peer's list is a singleton, we accept a valid value; + * - if we are the server, we first try to see if the peer (the + * client) advertises the default value. If yes, we use it, + * otherwise we accept the preferred value; + * - else if we are the client, we use the first list element. + */ + if (dccp_feat_clone_sp_val(&fval, val, 1)) + return DCCP_RESET_CODE_TOO_BUSY; + + if (len > 1 && server) { + defval = dccp_feat_default_value(feat); + if (dccp_feat_preflist_match(&defval, 1, val, len) > -1) + fval.sp.vec[0] = defval; + } else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) { + kfree(fval.sp.vec); + goto unknown_feature_or_value; + } + + /* Treat unsupported CCIDs like invalid values */ + if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) { + kfree(fval.sp.vec); + goto not_valid_or_not_known; + } + + return dccp_feat_push_confirm(fn, feat, local, &fval); + + } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */ + return 0; + } + + if (dccp_feat_reconcile(&entry->val, val, len, server, true)) { + entry->empty_confirm = 0; + } else if (is_mandatory) { + return DCCP_RESET_CODE_MANDATORY_ERROR; + } else if (entry->state == FEAT_INITIALISING) { + /* + * Failed simultaneous negotiation (server only): try to `save' + * the connection by checking whether entry contains the default + * value for @feat. If yes, send an empty Confirm to signal that + * the received Change was not understood - which implies using + * the default value. + * If this also fails, we use Reset as the last resort. 
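+ * (The Reset code used as the last resort is `Option Error', see below.)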
+ */ + WARN_ON(!server); + defval = dccp_feat_default_value(feat); + if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true)) + return DCCP_RESET_CODE_OPTION_ERROR; + entry->empty_confirm = 1; + } + entry->needs_confirm = 1; + entry->needs_mandatory = 0; + entry->state = FEAT_STABLE; + return 0; + +unknown_feature_or_value: + if (!is_mandatory) + return dccp_push_empty_confirm(fn, feat, local); + +not_valid_or_not_known: + return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR + : DCCP_RESET_CODE_OPTION_ERROR; +} + +/** + * dccp_feat_confirm_recv - Process received Confirm options + * @fn: feature-negotiation list to update + * @is_mandatory: whether @opt was preceded by a Mandatory option + * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R + * @feat: one of %dccp_feature_numbers + * @val: NN value or SP value/preference list + * @len: length of @val in bytes + * @server: whether this node is server (1) or client (0) + */ +static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt, + u8 feat, u8 *val, u8 len, const bool server) +{ + u8 *plist, plen, type = dccp_feat_type(feat); + const bool local = (opt == DCCPO_CONFIRM_R); + struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local); + + dccp_feat_print_opt(opt, feat, val, len, is_mandatory); + + if (entry == NULL) { /* nothing queued: ignore or handle error */ + if (is_mandatory && type == FEAT_UNKNOWN) + return DCCP_RESET_CODE_MANDATORY_ERROR; + + if (!local && type == FEAT_NN) /* 6.3.2 */ + goto confirmation_failed; + return 0; + } + + if (entry->state != FEAT_CHANGING) /* 6.6.2 */ + return 0; + + if (len == 0) { + if (dccp_feat_must_be_understood(feat)) /* 6.6.7 */ + goto confirmation_failed; + /* + * Empty Confirm during connection setup: this means reverting + * to the `old' value, which in this case is the default. Since + * we handle default values automatically when no other values + * have been set, we revert to the old value by removing this + * entry from the list. + */ + dccp_feat_list_pop(entry); + return 0; + } + + if (type == FEAT_NN) { + if (len > sizeof(entry->val.nn)) + goto confirmation_failed; + + if (entry->val.nn == dccp_decode_value_var(val, len)) + goto confirmation_succeeded; + + DCCP_WARN("Bogus Confirm for non-existing value\n"); + goto confirmation_failed; + } + + /* + * Parsing SP Confirms: the first element of @val is the preferred + * SP value which the peer confirms, the remainder depends on @len. + * Note that only the confirmed value need to be a valid SP value. + */ + if (!dccp_feat_is_valid_sp_val(feat, *val)) + goto confirmation_failed; + + if (len == 1) { /* peer didn't supply a preference list */ + plist = val; + plen = len; + } else { /* preferred value + preference list */ + plist = val + 1; + plen = len - 1; + } + + /* Check whether the peer got the reconciliation right (6.6.8) */ + if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) { + DCCP_WARN("Confirm selected the wrong value %u\n", *val); + return DCCP_RESET_CODE_OPTION_ERROR; + } + entry->val.sp.vec[0] = *val; + +confirmation_succeeded: + entry->state = FEAT_STABLE; + return 0; + +confirmation_failed: + DCCP_WARN("Confirmation failed\n"); + return is_mandatory ? 
DCCP_RESET_CODE_MANDATORY_ERROR + : DCCP_RESET_CODE_OPTION_ERROR; +} + +/** + * dccp_feat_parse_options - Process Feature-Negotiation Options + * @sk: for general use and used by the client during connection setup + * @dreq: used by the server during connection setup + * @mandatory: whether @opt was preceded by a Mandatory option + * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R + * @feat: one of %dccp_feature_numbers + * @val: value contents of @opt + * @len: length of @val in bytes + * Returns 0 on success, a Reset code for ending the connection otherwise. + */ +int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, + u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; + bool server = false; + + switch (sk->sk_state) { + /* + * Negotiation during connection setup + */ + case DCCP_LISTEN: + server = true; /* fall through */ + case DCCP_REQUESTING: + switch (opt) { + case DCCPO_CHANGE_L: + case DCCPO_CHANGE_R: + return dccp_feat_change_recv(fn, mandatory, opt, feat, + val, len, server); + case DCCPO_CONFIRM_R: + case DCCPO_CONFIRM_L: + return dccp_feat_confirm_recv(fn, mandatory, opt, feat, + val, len, server); + } + } + return 0; /* ignore FN options in all other states */ +} + +/** + * dccp_feat_init - Seed feature negotiation with host-specific defaults + * This initialises global defaults, depending on the value of the sysctls. + * These can later be overridden by registering changes via setsockopt calls. + * The last link in the chain is finalise_settings, to make sure that between + * here and the start of actual feature negotiation no inconsistencies enter. + * + * All features not appearing below use either defaults or are otherwise + * later adjusted through dccp_feat_finalise_settings(). + */ +int dccp_feat_init(struct sock *sk) +{ + struct list_head *fn = &dccp_sk(sk)->dccps_featneg; + u8 on = 1, off = 0; + int rc; + struct { + u8 *val; + u8 len; + } tx, rx; + + /* Non-negotiable (NN) features */ + rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0, + sysctl_dccp_sequence_window); + if (rc) + return rc; + + /* Server-priority (SP) features */ + + /* Advertise that short seqnos are not supported (7.6.1) */ + rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1); + if (rc) + return rc; + + /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */ + rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1); + if (rc) + return rc; + + /* + * We advertise the available list of CCIDs and reorder according to + * preferences, to avoid failure resulting from negotiating different + * singleton values (which always leads to failure). + * These settings can still (later) be overridden via sockopts. 
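+ * sysctl_dccp_tx_ccid and sysctl_dccp_rx_ccid determine which CCID is placed at the front of the advertised preference lists.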
+ */ + if (ccid_get_builtin_ccids(&tx.val, &tx.len) || + ccid_get_builtin_ccids(&rx.val, &rx.len)) + return -ENOBUFS; + + if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) || + !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len)) + goto free_ccid_lists; + + rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len); + if (rc) + goto free_ccid_lists; + + rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len); + +free_ccid_lists: + kfree(tx.val); + kfree(rx.val); + return rc; +} + +int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct dccp_feat_entry *cur, *next; + int idx; + dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = { + [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL } + }; + + list_for_each_entry(cur, fn_list, node) { + /* + * An empty Confirm means that either an unknown feature type + * or an invalid value was present. In the first case there is + * nothing to activate, in the other the default value is used. + */ + if (cur->empty_confirm) + continue; + + idx = dccp_feat_index(cur->feat_num); + if (idx < 0) { + DCCP_BUG("Unknown feature %u", cur->feat_num); + goto activation_failed; + } + if (cur->state != FEAT_STABLE) { + DCCP_CRIT("Negotiation of %s %s failed in state %s", + cur->is_local ? "local" : "remote", + dccp_feat_fname(cur->feat_num), + dccp_feat_sname[cur->state]); + goto activation_failed; + } + fvals[idx][cur->is_local] = &cur->val; + } + + /* + * Activate in decreasing order of index, so that the CCIDs are always + * activated as the last feature. This avoids the case where a CCID + * relies on the initialisation of one or more features that it depends + * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features). + */ + for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;) + if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) || + __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) { + DCCP_CRIT("Could not activate %d", idx); + goto activation_failed; + } + + /* Clean up Change options which have been confirmed already */ + list_for_each_entry_safe(cur, next, fn_list, node) + if (!cur->needs_confirm) + dccp_feat_list_pop(cur); + + dccp_pr_debug("Activation OK\n"); + return 0; + +activation_failed: + /* + * We clean up everything that may have been allocated, since + * it is difficult to track at which stage negotiation failed. + * This is ok, since all allocation functions below are robust + * against NULL arguments. + */ + ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); + ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); + dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; + dccp_ackvec_free(dp->dccps_hc_rx_ackvec); + dp->dccps_hc_rx_ackvec = NULL; + return -1; +} diff --git a/net/dccp/feat.h b/net/dccp/feat.h new file mode 100644 index 00000000..e56a4e5e --- /dev/null +++ b/net/dccp/feat.h @@ -0,0 +1,136 @@ +#ifndef _DCCP_FEAT_H +#define _DCCP_FEAT_H +/* + * net/dccp/feat.h + * + * Feature negotiation for the DCCP protocol (RFC 4340, section 6) + * Copyright (c) 2008 Gerrit Renker + * Copyright (c) 2005 Andrea Bittau + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include +#include "dccp.h" + +/* + * Known limit values + */ +/* Ack Ratio takes 2-byte integer values (11.3) */ +#define DCCPF_ACK_RATIO_MAX 0xFFFF +/* Wmin=32 and Wmax=2^46-1 from 7.5.2 */ +#define DCCPF_SEQ_WMIN 32 +#define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull +/* Maximum number of SP values that fit in a single (Confirm) option */ +#define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2) + +enum dccp_feat_type { + FEAT_AT_RX = 1, /* located at RX side of half-connection */ + FEAT_AT_TX = 2, /* located at TX side of half-connection */ + FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */ + FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */ + FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */ +}; + +enum dccp_feat_state { + FEAT_DEFAULT = 0, /* using default values from 6.4 */ + FEAT_INITIALISING, /* feature is being initialised */ + FEAT_CHANGING, /* Change sent but not confirmed yet */ + FEAT_UNSTABLE, /* local modification in state CHANGING */ + FEAT_STABLE /* both ends (think they) agree */ +}; + +/** + * dccp_feat_val - Container for SP or NN feature values + * @nn: single NN value + * @sp.vec: single SP value plus optional preference list + * @sp.len: length of @sp.vec in bytes + */ +typedef union { + u64 nn; + struct { + u8 *vec; + u8 len; + } sp; +} dccp_feat_val; + +/** + * struct feat_entry - Data structure to perform feature negotiation + * @val: feature's current value (SP features may have preference list) + * @state: feature's current state + * @feat_num: one of %dccp_feature_numbers + * @needs_mandatory: whether Mandatory options should be sent + * @needs_confirm: whether to send a Confirm instead of a Change + * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm) + * @is_local: feature location (1) or feature-remote (0) + * @node: list pointers, entries arranged in FIFO order + */ +struct dccp_feat_entry { + dccp_feat_val val; + enum dccp_feat_state state:8; + u8 feat_num; + + bool needs_mandatory, + needs_confirm, + empty_confirm, + is_local; + + struct list_head node; +}; + +static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry) +{ + if (entry->needs_confirm) + return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R; + return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R; +} + +/** + * struct ccid_dependency - Track changes resulting from choosing a CCID + * @dependent_feat: one of %dccp_feature_numbers + * @is_local: local (1) or remote (0) @dependent_feat + * @is_mandatory: whether presence of @dependent_feat is mission-critical or not + * @val: corresponding default value for @dependent_feat (u8 is sufficient here) + */ +struct ccid_dependency { + u8 dependent_feat; + bool is_local:1, + is_mandatory:1; + u8 val; +}; + +/* + * Sysctls to seed defaults for feature negotiation + */ +extern unsigned long sysctl_dccp_sequence_window; +extern int sysctl_dccp_rx_ccid; +extern int sysctl_dccp_tx_ccid; + +extern int dccp_feat_init(struct sock *sk); +extern void dccp_feat_initialise_sysctls(void); +extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, + u8 const *list, u8 len); +extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, + u8 mand, u8 opt, u8 feat, u8 *val, u8 len); +extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); + +/* + * Encoding variable-length options and their maximum length. + * + * This affects NN options (SP options are all u8) and other variable-length + * options (see table 3 in RFC 4340). 
The limit is currently given the Sequence + * Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option, all other + * options consume less than 6 bytes (timestamps are 4 bytes). + * When updating this constant (e.g. due to new internet drafts / RFCs), make + * sure that you also update all code which refers to it. + */ +#define DCCP_OPTVAL_MAXLEN 6 + +extern void dccp_encode_value_var(const u64 value, u8 *to, const u8 len); +extern u64 dccp_decode_value_var(const u8 *bf, const u8 len); + +extern int dccp_insert_option_mandatory(struct sk_buff *skb); +extern int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, + u8 *val, u8 len, bool repeat_first); +#endif /* _DCCP_FEAT_H */ diff --git a/net/dccp/input.c b/net/dccp/input.c new file mode 100644 index 00000000..4222e7a6 --- /dev/null +++ b/net/dccp/input.c @@ -0,0 +1,732 @@ +/* + * net/dccp/input.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include + +#include "ackvec.h" +#include "ccid.h" +#include "dccp.h" + +/* rate-limit for syncs in reply to sequence-invalid packets; RFC 4340, 7.5.4 */ +int sysctl_dccp_sync_ratelimit __read_mostly = HZ / 8; + +static void dccp_enqueue_skb(struct sock *sk, struct sk_buff *skb) +{ + __skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4); + __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_set_owner_r(skb, sk); + sk->sk_data_ready(sk, 0); +} + +static void dccp_fin(struct sock *sk, struct sk_buff *skb) +{ + /* + * On receiving Close/CloseReq, both RD/WR shutdown are performed. + * RFC 4340, 8.3 says that we MAY send further Data/DataAcks after + * receiving the closing segment, but there is no guarantee that such + * data will be processed at all. + */ + sk->sk_shutdown = SHUTDOWN_MASK; + sock_set_flag(sk, SOCK_DONE); + dccp_enqueue_skb(sk, skb); +} + +static int dccp_rcv_close(struct sock *sk, struct sk_buff *skb) +{ + int queued = 0; + + switch (sk->sk_state) { + /* + * We ignore Close when received in one of the following states: + * - CLOSED (may be a late or duplicate packet) + * - PASSIVE_CLOSEREQ (the peer has sent a CloseReq earlier) + * - RESPOND (already handled by dccp_check_req) + */ + case DCCP_CLOSING: + /* + * Simultaneous-close: receiving a Close after sending one. This + * can happen if both client and server perform active-close and + * will result in an endless ping-pong of crossing and retrans- + * mitted Close packets, which only terminates when one of the + * nodes times out (min. 64 seconds). Quicker convergence can be + * achieved when one of the nodes acts as tie-breaker. + * This is ok as both ends are done with data transfer and each + * end is just waiting for the other to acknowledge termination. + */ + if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) + break; + /* fall through */ + case DCCP_REQUESTING: + case DCCP_ACTIVE_CLOSEREQ: + dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); + dccp_done(sk); + break; + case DCCP_OPEN: + case DCCP_PARTOPEN: + /* Give waiting application a chance to read pending data */ + queued = 1; + dccp_fin(sk, skb); + dccp_set_state(sk, DCCP_PASSIVE_CLOSE); + /* fall through */ + case DCCP_PASSIVE_CLOSE: + /* + * Retransmitted Close: we have already enqueued the first one. 
+ */ + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + } + return queued; +} + +static int dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb) +{ + int queued = 0; + + /* + * Step 7: Check for unexpected packet types + * If (S.is_server and P.type == CloseReq) + * Send Sync packet acknowledging P.seqno + * Drop packet and return + */ + if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) { + dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC); + return queued; + } + + /* Step 13: process relevant Client states < CLOSEREQ */ + switch (sk->sk_state) { + case DCCP_REQUESTING: + dccp_send_close(sk, 0); + dccp_set_state(sk, DCCP_CLOSING); + break; + case DCCP_OPEN: + case DCCP_PARTOPEN: + /* Give waiting application a chance to read pending data */ + queued = 1; + dccp_fin(sk, skb); + dccp_set_state(sk, DCCP_PASSIVE_CLOSEREQ); + /* fall through */ + case DCCP_PASSIVE_CLOSEREQ: + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + } + return queued; +} + +static u16 dccp_reset_code_convert(const u8 code) +{ + const u16 error_code[] = { + [DCCP_RESET_CODE_CLOSED] = 0, /* normal termination */ + [DCCP_RESET_CODE_UNSPECIFIED] = 0, /* nothing known */ + [DCCP_RESET_CODE_ABORTED] = ECONNRESET, + + [DCCP_RESET_CODE_NO_CONNECTION] = ECONNREFUSED, + [DCCP_RESET_CODE_CONNECTION_REFUSED] = ECONNREFUSED, + [DCCP_RESET_CODE_TOO_BUSY] = EUSERS, + [DCCP_RESET_CODE_AGGRESSION_PENALTY] = EDQUOT, + + [DCCP_RESET_CODE_PACKET_ERROR] = ENOMSG, + [DCCP_RESET_CODE_BAD_INIT_COOKIE] = EBADR, + [DCCP_RESET_CODE_BAD_SERVICE_CODE] = EBADRQC, + [DCCP_RESET_CODE_OPTION_ERROR] = EILSEQ, + [DCCP_RESET_CODE_MANDATORY_ERROR] = EOPNOTSUPP, + }; + + return code >= DCCP_MAX_RESET_CODES ? 0 : error_code[code]; +} + +static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb) +{ + u16 err = dccp_reset_code_convert(dccp_hdr_reset(skb)->dccph_reset_code); + + sk->sk_err = err; + + /* Queue the equivalent of TCP fin so that dccp_recvmsg exits the loop */ + dccp_fin(sk, skb); + + if (err && !sock_flag(sk, SOCK_DEAD)) + sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); + dccp_time_wait(sk, DCCP_TIME_WAIT, 0); +} + +static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec; + + if (av == NULL) + return; + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq); + dccp_ackvec_input(av, skb); +} + +static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) +{ + const struct dccp_sock *dp = dccp_sk(sk); + + /* Don't deliver to RX CCID when node has shut down read end. */ + if (!(sk->sk_shutdown & RCV_SHUTDOWN)) + ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); + /* + * Until the TX queue has been drained, we can not honour SHUT_WR, since + * we need received feedback as input to adjust congestion control. + */ + if (sk->sk_write_queue.qlen > 0 || !(sk->sk_shutdown & SEND_SHUTDOWN)) + ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); +} + +static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); + struct dccp_sock *dp = dccp_sk(sk); + u64 lswl, lawl, seqno = DCCP_SKB_CB(skb)->dccpd_seq, + ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; + + /* + * Step 5: Prepare sequence numbers for Sync + * If P.type == Sync or P.type == SyncAck, + * If S.AWL <= P.ackno <= S.AWH and P.seqno >= S.SWL, + * / * P is valid, so update sequence number variables + * accordingly. 
After this update, P will pass the tests + * in Step 6. A SyncAck is generated if necessary in + * Step 15 * / + * Update S.GSR, S.SWL, S.SWH + * Otherwise, + * Drop packet and return + */ + if (dh->dccph_type == DCCP_PKT_SYNC || + dh->dccph_type == DCCP_PKT_SYNCACK) { + if (between48(ackno, dp->dccps_awl, dp->dccps_awh) && + dccp_delta_seqno(dp->dccps_swl, seqno) >= 0) + dccp_update_gsr(sk, seqno); + else + return -1; + } + + /* + * Step 6: Check sequence numbers + * Let LSWL = S.SWL and LAWL = S.AWL + * If P.type == CloseReq or P.type == Close or P.type == Reset, + * LSWL := S.GSR + 1, LAWL := S.GAR + * If LSWL <= P.seqno <= S.SWH + * and (P.ackno does not exist or LAWL <= P.ackno <= S.AWH), + * Update S.GSR, S.SWL, S.SWH + * If P.type != Sync, + * Update S.GAR + */ + lswl = dp->dccps_swl; + lawl = dp->dccps_awl; + + if (dh->dccph_type == DCCP_PKT_CLOSEREQ || + dh->dccph_type == DCCP_PKT_CLOSE || + dh->dccph_type == DCCP_PKT_RESET) { + lswl = ADD48(dp->dccps_gsr, 1); + lawl = dp->dccps_gar; + } + + if (between48(seqno, lswl, dp->dccps_swh) && + (ackno == DCCP_PKT_WITHOUT_ACK_SEQ || + between48(ackno, lawl, dp->dccps_awh))) { + dccp_update_gsr(sk, seqno); + + if (dh->dccph_type != DCCP_PKT_SYNC && + ackno != DCCP_PKT_WITHOUT_ACK_SEQ && + after48(ackno, dp->dccps_gar)) + dp->dccps_gar = ackno; + } else { + unsigned long now = jiffies; + /* + * Step 6: Check sequence numbers + * Otherwise, + * If P.type == Reset, + * Send Sync packet acknowledging S.GSR + * Otherwise, + * Send Sync packet acknowledging P.seqno + * Drop packet and return + * + * These Syncs are rate-limited as per RFC 4340, 7.5.4: + * at most 1 / (dccp_sync_rate_limit * HZ) Syncs per second. + */ + if (time_before(now, (dp->dccps_rate_last + + sysctl_dccp_sync_ratelimit))) + return -1; + + DCCP_WARN("Step 6 failed for %s packet, " + "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and " + "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), " + "sending SYNC...\n", dccp_packet_name(dh->dccph_type), + (unsigned long long) lswl, (unsigned long long) seqno, + (unsigned long long) dp->dccps_swh, + (ackno == DCCP_PKT_WITHOUT_ACK_SEQ) ? 
"doesn't exist" + : "exists", + (unsigned long long) lawl, (unsigned long long) ackno, + (unsigned long long) dp->dccps_awh); + + dp->dccps_rate_last = now; + + if (dh->dccph_type == DCCP_PKT_RESET) + seqno = dp->dccps_gsr; + dccp_send_sync(sk, seqno, DCCP_PKT_SYNC); + return -1; + } + + return 0; +} + +static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct dccp_hdr *dh, const unsigned len) +{ + struct dccp_sock *dp = dccp_sk(sk); + + switch (dccp_hdr(skb)->dccph_type) { + case DCCP_PKT_DATAACK: + case DCCP_PKT_DATA: + /* + * FIXME: schedule DATA_DROPPED (RFC 4340, 11.7.2) if and when + * - sk_shutdown == RCV_SHUTDOWN, use Code 1, "Not Listening" + * - sk_receive_queue is full, use Code 2, "Receive Buffer" + */ + dccp_enqueue_skb(sk, skb); + return 0; + case DCCP_PKT_ACK: + goto discard; + case DCCP_PKT_RESET: + /* + * Step 9: Process Reset + * If P.type == Reset, + * Tear down connection + * S.state := TIMEWAIT + * Set TIMEWAIT timer + * Drop packet and return + */ + dccp_rcv_reset(sk, skb); + return 0; + case DCCP_PKT_CLOSEREQ: + if (dccp_rcv_closereq(sk, skb)) + return 0; + goto discard; + case DCCP_PKT_CLOSE: + if (dccp_rcv_close(sk, skb)) + return 0; + goto discard; + case DCCP_PKT_REQUEST: + /* Step 7 + * or (S.is_server and P.type == Response) + * or (S.is_client and P.type == Request) + * or (S.state >= OPEN and P.type == Request + * and P.seqno >= S.OSR) + * or (S.state >= OPEN and P.type == Response + * and P.seqno >= S.OSR) + * or (S.state == RESPOND and P.type == Data), + * Send Sync packet acknowledging P.seqno + * Drop packet and return + */ + if (dp->dccps_role != DCCP_ROLE_LISTEN) + goto send_sync; + goto check_seq; + case DCCP_PKT_RESPONSE: + if (dp->dccps_role != DCCP_ROLE_CLIENT) + goto send_sync; +check_seq: + if (dccp_delta_seqno(dp->dccps_osr, + DCCP_SKB_CB(skb)->dccpd_seq) >= 0) { +send_sync: + dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_PKT_SYNC); + } + break; + case DCCP_PKT_SYNC: + dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_PKT_SYNCACK); + /* + * From RFC 4340, sec. 5.7 + * + * As with DCCP-Ack packets, DCCP-Sync and DCCP-SyncAck packets + * MAY have non-zero-length application data areas, whose + * contents receivers MUST ignore. 
+ */ + goto discard; + } + + DCCP_INC_STATS_BH(DCCP_MIB_INERRS); +discard: + __kfree_skb(skb); + return 0; +} + +int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct dccp_hdr *dh, const unsigned len) +{ + if (dccp_check_seqno(sk, skb)) + goto discard; + + if (dccp_parse_options(sk, NULL, skb)) + return 1; + + dccp_handle_ackvec_processing(sk, skb); + dccp_deliver_input_to_ccids(sk, skb); + + return __dccp_rcv_established(sk, skb, dh, len); +discard: + __kfree_skb(skb); + return 0; +} + +EXPORT_SYMBOL_GPL(dccp_rcv_established); + +static int dccp_rcv_request_sent_state_process(struct sock *sk, + struct sk_buff *skb, + const struct dccp_hdr *dh, + const unsigned len) +{ + /* + * Step 4: Prepare sequence numbers in REQUEST + * If S.state == REQUEST, + * If (P.type == Response or P.type == Reset) + * and S.AWL <= P.ackno <= S.AWH, + * / * Set sequence number variables corresponding to the + * other endpoint, so P will pass the tests in Step 6 * / + * Set S.GSR, S.ISR, S.SWL, S.SWH + * / * Response processing continues in Step 10; Reset + * processing continues in Step 9 * / + */ + if (dh->dccph_type == DCCP_PKT_RESPONSE) { + const struct inet_connection_sock *icsk = inet_csk(sk); + struct dccp_sock *dp = dccp_sk(sk); + long tstamp = dccp_timestamp(); + + if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, + dp->dccps_awl, dp->dccps_awh)) { + dccp_pr_debug("invalid ackno: S.AWL=%llu, " + "P.ackno=%llu, S.AWH=%llu\n", + (unsigned long long)dp->dccps_awl, + (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq, + (unsigned long long)dp->dccps_awh); + goto out_invalid_packet; + } + + /* + * If option processing (Step 8) failed, return 1 here so that + * dccp_v4_do_rcv() sends a Reset. The Reset code depends on + * the option type and is set in dccp_parse_options(). + */ + if (dccp_parse_options(sk, NULL, skb)) + return 1; + + /* Obtain usec RTT sample from SYN exchange (used by TFRC). */ + if (likely(dp->dccps_options_received.dccpor_timestamp_echo)) + dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - + dp->dccps_options_received.dccpor_timestamp_echo)); + + /* Stop the REQUEST timer */ + inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); + WARN_ON(sk->sk_send_head == NULL); + kfree_skb(sk->sk_send_head); + sk->sk_send_head = NULL; + + /* + * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect + * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH + * is done as part of activating the feature values below, since + * these settings depend on the local/remote Sequence Window + * features, which were undefined or not confirmed until now. + */ + dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; + + dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); + + /* + * Step 10: Process REQUEST state (second part) + * If S.state == REQUEST, + * / * If we get here, P is a valid Response from the + * server (see Step 4), and we should move to + * PARTOPEN state. PARTOPEN means send an Ack, + * don't send Data packets, retransmit Acks + * periodically, and always include any Init Cookie + * from the Response * / + * S.state := PARTOPEN + * Set PARTOPEN timer + * Continue with S.state == PARTOPEN + * / * Step 12 will send the Ack completing the + * three-way handshake * / + */ + dccp_set_state(sk, DCCP_PARTOPEN); + + /* + * If feature negotiation was successful, activate features now; + * an activation failure means that this host could not activate + * one ore more features (e.g. insufficient memory), which would + * leave at least one feature in an undefined state. 
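+ * In that case the handshake is aborted below via `unable_to_proceed', which closes the socket and signals a Reset with code `Aborted'.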
+ */ + if (dccp_feat_activate_values(sk, &dp->dccps_featneg)) + goto unable_to_proceed; + + /* Make sure socket is routed, for correct metrics. */ + icsk->icsk_af_ops->rebuild_header(sk); + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); + } + + if (sk->sk_write_pending || icsk->icsk_ack.pingpong || + icsk->icsk_accept_queue.rskq_defer_accept) { + /* Save one ACK. Data will be ready after + * several ticks, if write_pending is set. + * + * It may be deleted, but with this feature tcpdumps + * look so _wonderfully_ clever, that I was not able + * to stand against the temptation 8) --ANK + */ + /* + * OK, in DCCP we can as well do a similar trick, its + * even in the draft, but there is no need for us to + * schedule an ack here, as dccp_sendmsg does this for + * us, also stated in the draft. -acme + */ + __kfree_skb(skb); + return 0; + } + dccp_send_ack(sk); + return -1; + } + +out_invalid_packet: + /* dccp_v4_do_rcv will send a reset */ + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; + return 1; + +unable_to_proceed: + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED; + /* + * We mark this socket as no longer usable, so that the loop in + * dccp_sendmsg() terminates and the application gets notified. + */ + dccp_set_state(sk, DCCP_CLOSED); + sk->sk_err = ECOMM; + return 1; +} + +static int dccp_rcv_respond_partopen_state_process(struct sock *sk, + struct sk_buff *skb, + const struct dccp_hdr *dh, + const unsigned len) +{ + struct dccp_sock *dp = dccp_sk(sk); + u32 sample = dp->dccps_options_received.dccpor_timestamp_echo; + int queued = 0; + + switch (dh->dccph_type) { + case DCCP_PKT_RESET: + inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); + break; + case DCCP_PKT_DATA: + if (sk->sk_state == DCCP_RESPOND) + break; + case DCCP_PKT_DATAACK: + case DCCP_PKT_ACK: + /* + * FIXME: we should be reseting the PARTOPEN (DELACK) timer + * here but only if we haven't used the DELACK timer for + * something else, like sending a delayed ack for a TIMESTAMP + * echo, etc, for now were not clearing it, sending an extra + * ACK when there is nothing else to do in DELACK is not a big + * deal after all. + */ + + /* Stop the PARTOPEN timer */ + if (sk->sk_state == DCCP_PARTOPEN) + inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); + + /* Obtain usec RTT sample from SYN exchange (used by TFRC). */ + if (likely(sample)) { + long delta = dccp_timestamp() - sample; + + dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * delta); + } + + dp->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq; + dccp_set_state(sk, DCCP_OPEN); + + if (dh->dccph_type == DCCP_PKT_DATAACK || + dh->dccph_type == DCCP_PKT_DATA) { + __dccp_rcv_established(sk, skb, dh, len); + queued = 1; /* packet was queued + (by __dccp_rcv_established) */ + } + break; + } + + return queued; +} + +int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, + struct dccp_hdr *dh, unsigned len) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); + const int old_state = sk->sk_state; + int queued = 0; + + /* + * Step 3: Process LISTEN state + * + * If S.state == LISTEN, + * If P.type == Request or P contains a valid Init Cookie option, + * (* Must scan the packet's options to check for Init + * Cookies. Only Init Cookies are processed here, + * however; other options are processed in Step 8. 
This + * scan need only be performed if the endpoint uses Init + * Cookies *) + * (* Generate a new socket and switch to that socket *) + * Set S := new socket for this port pair + * S.state = RESPOND + * Choose S.ISS (initial seqno) or set from Init Cookies + * Initialize S.GAR := S.ISS + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init + * Cookies Continue with S.state == RESPOND + * (* A Response packet will be generated in Step 11 *) + * Otherwise, + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + if (sk->sk_state == DCCP_LISTEN) { + if (dh->dccph_type == DCCP_PKT_REQUEST) { + if (inet_csk(sk)->icsk_af_ops->conn_request(sk, + skb) < 0) + return 1; + goto discard; + } + if (dh->dccph_type == DCCP_PKT_RESET) + goto discard; + + /* Caller (dccp_v4_do_rcv) will send Reset */ + dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; + return 1; + } else if (sk->sk_state == DCCP_CLOSED) { + dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; + return 1; + } + + if (sk->sk_state != DCCP_REQUESTING && sk->sk_state != DCCP_RESPOND) { + if (dccp_check_seqno(sk, skb)) + goto discard; + + /* + * Step 8: Process options and mark acknowledgeable + */ + if (dccp_parse_options(sk, NULL, skb)) + return 1; + + dccp_handle_ackvec_processing(sk, skb); + dccp_deliver_input_to_ccids(sk, skb); + } + + /* + * Step 9: Process Reset + * If P.type == Reset, + * Tear down connection + * S.state := TIMEWAIT + * Set TIMEWAIT timer + * Drop packet and return + */ + if (dh->dccph_type == DCCP_PKT_RESET) { + dccp_rcv_reset(sk, skb); + return 0; + /* + * Step 7: Check for unexpected packet types + * If (S.is_server and P.type == Response) + * or (S.is_client and P.type == Request) + * or (S.state == RESPOND and P.type == Data), + * Send Sync packet acknowledging P.seqno + * Drop packet and return + */ + } else if ((dp->dccps_role != DCCP_ROLE_CLIENT && + dh->dccph_type == DCCP_PKT_RESPONSE) || + (dp->dccps_role == DCCP_ROLE_CLIENT && + dh->dccph_type == DCCP_PKT_REQUEST) || + (sk->sk_state == DCCP_RESPOND && + dh->dccph_type == DCCP_PKT_DATA)) { + dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); + goto discard; + } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { + if (dccp_rcv_closereq(sk, skb)) + return 0; + goto discard; + } else if (dh->dccph_type == DCCP_PKT_CLOSE) { + if (dccp_rcv_close(sk, skb)) + return 0; + goto discard; + } + + switch (sk->sk_state) { + case DCCP_REQUESTING: + queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); + if (queued >= 0) + return queued; + + __kfree_skb(skb); + return 0; + + case DCCP_RESPOND: + case DCCP_PARTOPEN: + queued = dccp_rcv_respond_partopen_state_process(sk, skb, + dh, len); + break; + } + + if (dh->dccph_type == DCCP_PKT_ACK || + dh->dccph_type == DCCP_PKT_DATAACK) { + switch (old_state) { + case DCCP_PARTOPEN: + sk->sk_state_change(sk); + sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); + break; + } + } else if (unlikely(dh->dccph_type == DCCP_PKT_SYNC)) { + dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNCACK); + goto discard; + } + + if (!queued) { +discard: + __kfree_skb(skb); + } + return 0; +} + +EXPORT_SYMBOL_GPL(dccp_rcv_state_process); + +/** + * dccp_sample_rtt - Validate and finalise computation of RTT sample + * @delta: number of microseconds between packet and acknowledgment + * The routine is kept generic to work in different contexts. It should be + * called immediately when the ACK used for the RTT sample arrives. 
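+ * Illustrative example (hypothetical numbers): if the echoed timestamp
+ * differs from the local one by 500 units of 10 usec, the caller passes
+ * delta = 10 * 500 = 5000 usec; an Elapsed Time option of 120 (also in
+ * 10 usec units) reduces this to 5000 - 120 * 10 = 3800 usec, which is
+ * finally clamped into [DCCP_SANE_RTT_MIN, DCCP_SANE_RTT_MAX] below.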
+ */ +u32 dccp_sample_rtt(struct sock *sk, long delta) +{ + /* dccpor_elapsed_time is either zeroed out or set and > 0 */ + delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10; + + if (unlikely(delta <= 0)) { + DCCP_WARN("unusable RTT sample %ld, using min\n", delta); + return DCCP_SANE_RTT_MIN; + } + if (unlikely(delta > DCCP_SANE_RTT_MAX)) { + DCCP_WARN("RTT sample %ld too large, using max\n", delta); + return DCCP_SANE_RTT_MAX; + } + + return delta; +} diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c new file mode 100644 index 00000000..332639b5 --- /dev/null +++ b/net/dccp/ipv4.c @@ -0,0 +1,1080 @@ +/* + * net/dccp/ipv4.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ackvec.h" +#include "ccid.h" +#include "dccp.h" +#include "feat.h" + +/* + * The per-net dccp.v4_ctl_sk socket is used for responding to + * the Out-of-the-blue (OOTB) packets. A control sock will be created + * for this socket at the initialization time. + */ + +int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; + struct inet_sock *inet = inet_sk(sk); + struct dccp_sock *dp = dccp_sk(sk); + __be16 orig_sport, orig_dport; + __be32 daddr, nexthop; + struct flowi4 *fl4; + struct rtable *rt; + int err; + struct ip_options_rcu *inet_opt; + + dp->dccps_role = DCCP_ROLE_CLIENT; + + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + if (usin->sin_family != AF_INET) + return -EAFNOSUPPORT; + + nexthop = daddr = usin->sin_addr.s_addr; + + inet_opt = rcu_dereference_protected(inet->inet_opt, + sock_owned_by_user(sk)); + if (inet_opt != NULL && inet_opt->opt.srr) { + if (daddr == 0) + return -EINVAL; + nexthop = inet_opt->opt.faddr; + } + + orig_sport = inet->inet_sport; + orig_dport = usin->sin_port; + fl4 = &inet->cork.fl.u.ip4; + rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, + RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, + IPPROTO_DCCP, + orig_sport, orig_dport, sk, true); + if (IS_ERR(rt)) + return PTR_ERR(rt); + + if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { + ip_rt_put(rt); + return -ENETUNREACH; + } + + if (inet_opt == NULL || !inet_opt->opt.srr) + daddr = fl4->daddr; + + if (inet->inet_saddr == 0) + inet->inet_saddr = fl4->saddr; + inet->inet_rcv_saddr = inet->inet_saddr; + + inet->inet_dport = usin->sin_port; + inet->inet_daddr = daddr; + + inet_csk(sk)->icsk_ext_hdr_len = 0; + if (inet_opt) + inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; + /* + * Socket identity is still unknown (sport may be zero). + * However we set state to DCCP_REQUESTING and not releasing socket + * lock select source port, enter ourselves into the hash tables and + * complete initialization after this. + */ + dccp_set_state(sk, DCCP_REQUESTING); + err = inet_hash_connect(&dccp_death_row, sk); + if (err != 0) + goto failure; + + rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, + inet->inet_sport, inet->inet_dport, sk); + if (IS_ERR(rt)) { + rt = NULL; + goto failure; + } + /* OK, now commit destination to socket. 
*/ + sk_setup_caps(sk, &rt->dst); + + dp->dccps_iss = secure_dccp_sequence_number(inet->inet_saddr, + inet->inet_daddr, + inet->inet_sport, + inet->inet_dport); + inet->inet_id = dp->dccps_iss ^ jiffies; + + err = dccp_connect(sk); + rt = NULL; + if (err != 0) + goto failure; +out: + return err; +failure: + /* + * This unhashes the socket and releases the local port, if necessary. + */ + dccp_set_state(sk, DCCP_CLOSED); + ip_rt_put(rt); + sk->sk_route_caps = 0; + inet->inet_dport = 0; + goto out; +} + +EXPORT_SYMBOL_GPL(dccp_v4_connect); + +/* + * This routine does path mtu discovery as defined in RFC1191. + */ +static inline void dccp_do_pmtu_discovery(struct sock *sk, + const struct iphdr *iph, + u32 mtu) +{ + struct dst_entry *dst; + const struct inet_sock *inet = inet_sk(sk); + const struct dccp_sock *dp = dccp_sk(sk); + + /* We are not interested in DCCP_LISTEN and request_socks (RESPONSEs + * send out by Linux are always < 576bytes so they should go through + * unfragmented). + */ + if (sk->sk_state == DCCP_LISTEN) + return; + + /* We don't check in the destentry if pmtu discovery is forbidden + * on this route. We just assume that no packet_to_big packets + * are send back when pmtu discovery is not active. + * There is a small race when the user changes this flag in the + * route, but I think that's acceptable. + */ + if ((dst = __sk_dst_check(sk, 0)) == NULL) + return; + + dst->ops->update_pmtu(dst, mtu); + + /* Something is about to be wrong... Remember soft error + * for the case, if this connection will not able to recover. + */ + if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) + sk->sk_err_soft = EMSGSIZE; + + mtu = dst_mtu(dst); + + if (inet->pmtudisc != IP_PMTUDISC_DONT && + inet_csk(sk)->icsk_pmtu_cookie > mtu) { + dccp_sync_mss(sk, mtu); + + /* + * From RFC 4340, sec. 14.1: + * + * DCCP-Sync packets are the best choice for upward + * probing, since DCCP-Sync probes do not risk application + * data loss. + */ + dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); + } /* else let the usual retransmit timer handle it */ +} + +/* + * This routine is called by the ICMP module when it gets some sort of error + * condition. If err < 0 then the socket should be closed and the error + * returned to the user. If err > 0 it's just the icmp type << 8 | icmp code. + * After adjustment header points to the first 8 bytes of the tcp header. We + * need to find the appropriate port. + * + * The locking strategy used here is very "optimistic". When someone else + * accesses the socket the ICMP is just dropped and for some paths there is no + * check at all. A more general error queue to queue errors for later handling + * is probably better. 
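+ * Here "optimistic" means that when the socket is owned by user
+ * context, the handler merely bumps LINUX_MIB_LOCKDROPPEDICMPS and
+ * skips the steps that would race with the owner (e.g. the PMTU
+ * update), instead of queueing the ICMP error for later processing.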
+ */ +static void dccp_v4_err(struct sk_buff *skb, u32 info) +{ + const struct iphdr *iph = (struct iphdr *)skb->data; + const u8 offset = iph->ihl << 2; + const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset); + struct dccp_sock *dp; + struct inet_sock *inet; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; + struct sock *sk; + __u64 seq; + int err; + struct net *net = dev_net(skb->dev); + + if (skb->len < offset + sizeof(*dh) || + skb->len < offset + __dccp_basic_hdr_len(dh)) { + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + return; + } + + sk = inet_lookup(net, &dccp_hashinfo, + iph->daddr, dh->dccph_dport, + iph->saddr, dh->dccph_sport, inet_iif(skb)); + if (sk == NULL) { + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + return; + } + + if (sk->sk_state == DCCP_TIME_WAIT) { + inet_twsk_put(inet_twsk(sk)); + return; + } + + bh_lock_sock(sk); + /* If too many ICMPs get dropped on busy + * servers this needs to be solved differently. + */ + if (sock_owned_by_user(sk)) + NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); + + if (sk->sk_state == DCCP_CLOSED) + goto out; + + dp = dccp_sk(sk); + seq = dccp_hdr_seq(dh); + if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) && + !between48(seq, dp->dccps_awl, dp->dccps_awh)) { + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + switch (type) { + case ICMP_SOURCE_QUENCH: + /* Just silently ignore these. */ + goto out; + case ICMP_PARAMETERPROB: + err = EPROTO; + break; + case ICMP_DEST_UNREACH: + if (code > NR_ICMP_UNREACH) + goto out; + + if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ + if (!sock_owned_by_user(sk)) + dccp_do_pmtu_discovery(sk, iph, info); + goto out; + } + + err = icmp_err_convert[code].errno; + break; + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + default: + goto out; + } + + switch (sk->sk_state) { + struct request_sock *req , **prev; + case DCCP_LISTEN: + if (sock_owned_by_user(sk)) + goto out; + req = inet_csk_search_req(sk, &prev, dh->dccph_dport, + iph->daddr, iph->saddr); + if (!req) + goto out; + + /* + * ICMPs are not backlogged, hence we cannot get an established + * socket here. + */ + WARN_ON(req->sk); + + if (seq != dccp_rsk(req)->dreq_iss) { + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + /* + * Still in RESPOND, just remove it silently. + * There is no good way to pass the error to the newly + * created socket, and POSIX does not want network + * errors returned from accept(). + */ + inet_csk_reqsk_queue_drop(sk, req, prev); + goto out; + + case DCCP_REQUESTING: + case DCCP_RESPOND: + if (!sock_owned_by_user(sk)) { + DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + sk->sk_err = err; + + sk->sk_error_report(sk); + + dccp_done(sk); + } else + sk->sk_err_soft = err; + goto out; + } + + /* If we've already connected we will keep trying + * until we time out, or the user gives up. + * + * rfc1122 4.2.3.9 allows to consider as hard errors + * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, + * but it is obsoleted by pmtu discovery). + * + * Note, that in modern internet, where routing is unreliable + * and in each dark corner broken firewalls sit, sending random + * errors ordered by their masters even this two messages finally lose + * their original sense (even Linux sends invalid PORT_UNREACHs) + * + * Now we are in compliance with RFCs. 
+ * --ANK (980905) + */ + + inet = inet_sk(sk); + if (!sock_owned_by_user(sk) && inet->recverr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } else /* Only an error on timeout */ + sk->sk_err_soft = err; +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +static inline __sum16 dccp_v4_csum_finish(struct sk_buff *skb, + __be32 src, __be32 dst) +{ + return csum_tcpudp_magic(src, dst, skb->len, IPPROTO_DCCP, skb->csum); +} + +void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb) +{ + const struct inet_sock *inet = inet_sk(sk); + struct dccp_hdr *dh = dccp_hdr(skb); + + dccp_csum_outgoing(skb); + dh->dccph_checksum = dccp_v4_csum_finish(skb, + inet->inet_saddr, + inet->inet_daddr); +} + +EXPORT_SYMBOL_GPL(dccp_v4_send_check); + +static inline u64 dccp_v4_init_sequence(const struct sk_buff *skb) +{ + return secure_dccp_sequence_number(ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, + dccp_hdr(skb)->dccph_dport, + dccp_hdr(skb)->dccph_sport); +} + +/* + * The three way handshake has completed - we got a valid ACK or DATAACK - + * now create the new socket. + * + * This is the equivalent of TCP's tcp_v4_syn_recv_sock + */ +struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) +{ + struct inet_request_sock *ireq; + struct inet_sock *newinet; + struct sock *newsk; + + if (sk_acceptq_is_full(sk)) + goto exit_overflow; + + newsk = dccp_create_openreq_child(sk, req, skb); + if (newsk == NULL) + goto exit_nonewsk; + + newinet = inet_sk(newsk); + ireq = inet_rsk(req); + newinet->inet_daddr = ireq->rmt_addr; + newinet->inet_rcv_saddr = ireq->loc_addr; + newinet->inet_saddr = ireq->loc_addr; + newinet->inet_opt = ireq->opt; + ireq->opt = NULL; + newinet->mc_index = inet_iif(skb); + newinet->mc_ttl = ip_hdr(skb)->ttl; + newinet->inet_id = jiffies; + + if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL) + goto put_and_exit; + + sk_setup_caps(newsk, dst); + + dccp_sync_mss(newsk, dst_mtu(dst)); + + if (__inet_inherit_port(sk, newsk) < 0) + goto put_and_exit; + __inet_hash_nolisten(newsk, NULL); + + return newsk; + +exit_overflow: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); +exit_nonewsk: + dst_release(dst); +exit: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + return NULL; +put_and_exit: + sock_put(newsk); + goto exit; +} + +EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock); + +static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); + struct sock *nsk; + struct request_sock **prev; + /* Find possible connection requests. 
*/ + struct request_sock *req = inet_csk_search_req(sk, &prev, + dh->dccph_sport, + iph->saddr, iph->daddr); + if (req != NULL) + return dccp_check_req(sk, skb, req, prev); + + nsk = inet_lookup_established(sock_net(sk), &dccp_hashinfo, + iph->saddr, dh->dccph_sport, + iph->daddr, dh->dccph_dport, + inet_iif(skb)); + if (nsk != NULL) { + if (nsk->sk_state != DCCP_TIME_WAIT) { + bh_lock_sock(nsk); + return nsk; + } + inet_twsk_put(inet_twsk(nsk)); + return NULL; + } + + return sk; +} + +static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + struct rtable *rt; + struct flowi4 fl4 = { + .flowi4_oif = skb_rtable(skb)->rt_iif, + .daddr = ip_hdr(skb)->saddr, + .saddr = ip_hdr(skb)->daddr, + .flowi4_tos = RT_CONN_FLAGS(sk), + .flowi4_proto = sk->sk_protocol, + .fl4_sport = dccp_hdr(skb)->dccph_dport, + .fl4_dport = dccp_hdr(skb)->dccph_sport, + }; + + security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + rt = ip_route_output_flow(net, &fl4, sk); + if (IS_ERR(rt)) { + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); + return NULL; + } + + return &rt->dst; +} + +static int dccp_v4_send_response(struct sock *sk, struct request_sock *req, + struct request_values *rv_unused) +{ + int err = -1; + struct sk_buff *skb; + struct dst_entry *dst; + struct flowi4 fl4; + + dst = inet_csk_route_req(sk, &fl4, req); + if (dst == NULL) + goto out; + + skb = dccp_make_response(sk, dst, req); + if (skb != NULL) { + const struct inet_request_sock *ireq = inet_rsk(req); + struct dccp_hdr *dh = dccp_hdr(skb); + + dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->loc_addr, + ireq->rmt_addr); + err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, + ireq->rmt_addr, + ireq->opt); + err = net_xmit_eval(err); + } + +out: + dst_release(dst); + return err; +} + +static void dccp_v4_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb) +{ + int err; + const struct iphdr *rxiph; + struct sk_buff *skb; + struct dst_entry *dst; + struct net *net = dev_net(skb_dst(rxskb)->dev); + struct sock *ctl_sk = net->dccp.v4_ctl_sk; + + /* Never send a reset in response to a reset. 
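+ * (the RFC 4340 pseudocode quoted elsewhere in this file reads
+ * "Generate Reset(No Connection) unless P.type == Reset"; replying to
+ * a Reset with another Reset could ping-pong between the two endpoints
+ * indefinitely)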
*/ + if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET) + return; + + if (skb_rtable(rxskb)->rt_type != RTN_LOCAL) + return; + + dst = dccp_v4_route_skb(net, ctl_sk, rxskb); + if (dst == NULL) + return; + + skb = dccp_ctl_make_reset(ctl_sk, rxskb); + if (skb == NULL) + goto out; + + rxiph = ip_hdr(rxskb); + dccp_hdr(skb)->dccph_checksum = dccp_v4_csum_finish(skb, rxiph->saddr, + rxiph->daddr); + skb_dst_set(skb, dst_clone(dst)); + + bh_lock_sock(ctl_sk); + err = ip_build_and_send_pkt(skb, ctl_sk, + rxiph->daddr, rxiph->saddr, NULL); + bh_unlock_sock(ctl_sk); + + if (net_xmit_eval(err) == 0) { + DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); + DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS); + } +out: + dst_release(dst); +} + +static void dccp_v4_reqsk_destructor(struct request_sock *req) +{ + dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); + kfree(inet_rsk(req)->opt); +} + +static struct request_sock_ops dccp_request_sock_ops __read_mostly = { + .family = PF_INET, + .obj_size = sizeof(struct dccp_request_sock), + .rtx_syn_ack = dccp_v4_send_response, + .send_ack = dccp_reqsk_send_ack, + .destructor = dccp_v4_reqsk_destructor, + .send_reset = dccp_v4_ctl_send_reset, +}; + +int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct inet_request_sock *ireq; + struct request_sock *req; + struct dccp_request_sock *dreq; + const __be32 service = dccp_hdr_request(skb)->dccph_req_service; + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); + + /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */ + if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + return 0; /* discard, don't send a reset here */ + + if (dccp_bad_service_code(sk, service)) { + dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE; + goto drop; + } + /* + * TW buckets are converted to open requests without + * limitations, they conserve resources and peer is + * evidently real one. + */ + dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; + if (inet_csk_reqsk_queue_is_full(sk)) + goto drop; + + /* + * Accept backlog is full. If we have already queued enough + * of warm entries in syn queue, drop request. It is better than + * clogging syn queue with openreqs with exponentially increasing + * timeout. + */ + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) + goto drop; + + req = inet_reqsk_alloc(&dccp_request_sock_ops); + if (req == NULL) + goto drop; + + if (dccp_reqsk_init(req, dccp_sk(sk), skb)) + goto drop_and_free; + + dreq = dccp_rsk(req); + if (dccp_parse_options(sk, dreq, skb)) + goto drop_and_free; + + if (security_inet_conn_request(sk, skb, req)) + goto drop_and_free; + + ireq = inet_rsk(req); + ireq->loc_addr = ip_hdr(skb)->daddr; + ireq->rmt_addr = ip_hdr(skb)->saddr; + + /* + * Step 3: Process LISTEN state + * + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie + * + * In fact we defer setting S.GSR, S.SWL, S.SWH to + * dccp_create_openreq_child. 
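+ * That is, only S.ISR (the Request's seqno), our S.ISS and the service
+ * code are recorded on the request socket below; SWL/SWH and friends
+ * depend on feature values (e.g. Sequence Window) that are only final
+ * once the child socket is created.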
+ */ + dreq->dreq_isr = dcb->dccpd_seq; + dreq->dreq_iss = dccp_v4_init_sequence(skb); + dreq->dreq_service = service; + + if (dccp_v4_send_response(sk, req, NULL)) + goto drop_and_free; + + inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); + return 0; + +drop_and_free: + reqsk_free(req); +drop: + DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + return -1; +} + +EXPORT_SYMBOL_GPL(dccp_v4_conn_request); + +int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_hdr *dh = dccp_hdr(skb); + + if (sk->sk_state == DCCP_OPEN) { /* Fast path */ + if (dccp_rcv_established(sk, skb, dh, skb->len)) + goto reset; + return 0; + } + + /* + * Step 3: Process LISTEN state + * If P.type == Request or P contains a valid Init Cookie option, + * (* Must scan the packet's options to check for Init + * Cookies. Only Init Cookies are processed here, + * however; other options are processed in Step 8. This + * scan need only be performed if the endpoint uses Init + * Cookies *) + * (* Generate a new socket and switch to that socket *) + * Set S := new socket for this port pair + * S.state = RESPOND + * Choose S.ISS (initial seqno) or set from Init Cookies + * Initialize S.GAR := S.ISS + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies + * Continue with S.state == RESPOND + * (* A Response packet will be generated in Step 11 *) + * Otherwise, + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + * + * NOTE: the check for the packet types is done in + * dccp_rcv_state_process + */ + if (sk->sk_state == DCCP_LISTEN) { + struct sock *nsk = dccp_v4_hnd_req(sk, skb); + + if (nsk == NULL) + goto discard; + + if (nsk != sk) { + if (dccp_child_process(sk, nsk, skb)) + goto reset; + return 0; + } + } + + if (dccp_rcv_state_process(sk, skb, dh, skb->len)) + goto reset; + return 0; + +reset: + dccp_v4_ctl_send_reset(sk, skb); +discard: + kfree_skb(skb); + return 0; +} + +EXPORT_SYMBOL_GPL(dccp_v4_do_rcv); + +/** + * dccp_invalid_packet - check for malformed packets + * Implements RFC 4340, 8.5: Step 1: Check header basics + * Packets that fail these checks are ignored and do not receive Resets. 
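+ * As a worked example of the coverage check below: CsCov == 0 means
+ * the checksum covers the entire packet, while a value n in 1..15
+ * covers the header, options and the first (n - 1) * 4 bytes of
+ * application data, so a computed coverage larger than skb->len can
+ * never be valid and the packet is dropped.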
+ */ +int dccp_invalid_packet(struct sk_buff *skb) +{ + const struct dccp_hdr *dh; + unsigned int cscov; + + if (skb->pkt_type != PACKET_HOST) + return 1; + + /* If the packet is shorter than 12 bytes, drop packet and return */ + if (!pskb_may_pull(skb, sizeof(struct dccp_hdr))) { + DCCP_WARN("pskb_may_pull failed\n"); + return 1; + } + + dh = dccp_hdr(skb); + + /* If P.type is not understood, drop packet and return */ + if (dh->dccph_type >= DCCP_PKT_INVALID) { + DCCP_WARN("invalid packet type\n"); + return 1; + } + + /* + * If P.Data Offset is too small for packet type, drop packet and return + */ + if (dh->dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) { + DCCP_WARN("P.Data Offset(%u) too small\n", dh->dccph_doff); + return 1; + } + /* + * If P.Data Offset is too too large for packet, drop packet and return + */ + if (!pskb_may_pull(skb, dh->dccph_doff * sizeof(u32))) { + DCCP_WARN("P.Data Offset(%u) too large\n", dh->dccph_doff); + return 1; + } + + /* + * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet + * has short sequence numbers), drop packet and return + */ + if ((dh->dccph_type < DCCP_PKT_DATA || + dh->dccph_type > DCCP_PKT_DATAACK) && dh->dccph_x == 0) { + DCCP_WARN("P.type (%s) not Data || [Data]Ack, while P.X == 0\n", + dccp_packet_name(dh->dccph_type)); + return 1; + } + + /* + * If P.CsCov is too large for the packet size, drop packet and return. + * This must come _before_ checksumming (not as RFC 4340 suggests). + */ + cscov = dccp_csum_coverage(skb); + if (cscov > skb->len) { + DCCP_WARN("P.CsCov %u exceeds packet length %d\n", + dh->dccph_cscov, skb->len); + return 1; + } + + /* If header checksum is incorrect, drop packet and return. + * (This step is completed in the AF-dependent functions.) */ + skb->csum = skb_checksum(skb, 0, cscov, 0); + + return 0; +} + +EXPORT_SYMBOL_GPL(dccp_invalid_packet); + +/* this is called when real data arrives */ +static int dccp_v4_rcv(struct sk_buff *skb) +{ + const struct dccp_hdr *dh; + const struct iphdr *iph; + struct sock *sk; + int min_cov; + + /* Step 1: Check header basics */ + + if (dccp_invalid_packet(skb)) + goto discard_it; + + iph = ip_hdr(skb); + /* Step 1: If header checksum is incorrect, drop packet and return */ + if (dccp_v4_csum_finish(skb, iph->saddr, iph->daddr)) { + DCCP_WARN("dropped packet with invalid checksum\n"); + goto discard_it; + } + + dh = dccp_hdr(skb); + + DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(dh); + DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type; + + dccp_pr_debug("%8.8s src=%pI4@%-5d dst=%pI4@%-5d seq=%llu", + dccp_packet_name(dh->dccph_type), + &iph->saddr, ntohs(dh->dccph_sport), + &iph->daddr, ntohs(dh->dccph_dport), + (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq); + + if (dccp_packet_without_ack(skb)) { + DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ; + dccp_pr_debug_cat("\n"); + } else { + DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); + dccp_pr_debug_cat(", ack=%llu\n", (unsigned long long) + DCCP_SKB_CB(skb)->dccpd_ack_seq); + } + + /* Step 2: + * Look up flow ID in table and get corresponding socket */ + sk = __inet_lookup_skb(&dccp_hashinfo, skb, + dh->dccph_sport, dh->dccph_dport); + /* + * Step 2: + * If no socket ... + */ + if (sk == NULL) { + dccp_pr_debug("failed to look up flow ID in table and " + "get corresponding socket\n"); + goto no_dccp_socket; + } + + /* + * Step 2: + * ... 
or S.state == TIMEWAIT, + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + if (sk->sk_state == DCCP_TIME_WAIT) { + dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n"); + inet_twsk_put(inet_twsk(sk)); + goto no_dccp_socket; + } + + /* + * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage + * o if MinCsCov = 0, only packets with CsCov = 0 are accepted + * o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov + */ + min_cov = dccp_sk(sk)->dccps_pcrlen; + if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov)) { + dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n", + dh->dccph_cscov, min_cov); + /* FIXME: "Such packets SHOULD be reported using Data Dropped + * options (Section 11.7) with Drop Code 0, Protocol + * Constraints." */ + goto discard_and_relse; + } + + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) + goto discard_and_relse; + nf_reset(skb); + + return sk_receive_skb(sk, skb, 1); + +no_dccp_socket: + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard_it; + /* + * Step 2: + * If no socket ... + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + if (dh->dccph_type != DCCP_PKT_RESET) { + DCCP_SKB_CB(skb)->dccpd_reset_code = + DCCP_RESET_CODE_NO_CONNECTION; + dccp_v4_ctl_send_reset(sk, skb); + } + +discard_it: + kfree_skb(skb); + return 0; + +discard_and_relse: + sock_put(sk); + goto discard_it; +} + +static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = { + .queue_xmit = ip_queue_xmit, + .send_check = dccp_v4_send_check, + .rebuild_header = inet_sk_rebuild_header, + .conn_request = dccp_v4_conn_request, + .syn_recv_sock = dccp_v4_request_recv_sock, + .net_header_len = sizeof(struct iphdr), + .setsockopt = ip_setsockopt, + .getsockopt = ip_getsockopt, + .addr2sockaddr = inet_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in), + .bind_conflict = inet_csk_bind_conflict, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_ip_setsockopt, + .compat_getsockopt = compat_ip_getsockopt, +#endif +}; + +static int dccp_v4_init_sock(struct sock *sk) +{ + static __u8 dccp_v4_ctl_sock_initialized; + int err = dccp_init_sock(sk, dccp_v4_ctl_sock_initialized); + + if (err == 0) { + if (unlikely(!dccp_v4_ctl_sock_initialized)) + dccp_v4_ctl_sock_initialized = 1; + inet_csk(sk)->icsk_af_ops = &dccp_ipv4_af_ops; + } + + return err; +} + +static struct timewait_sock_ops dccp_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct inet_timewait_sock), +}; + +static struct proto dccp_v4_prot = { + .name = "DCCP", + .owner = THIS_MODULE, + .close = dccp_close, + .connect = dccp_v4_connect, + .disconnect = dccp_disconnect, + .ioctl = dccp_ioctl, + .init = dccp_v4_init_sock, + .setsockopt = dccp_setsockopt, + .getsockopt = dccp_getsockopt, + .sendmsg = dccp_sendmsg, + .recvmsg = dccp_recvmsg, + .backlog_rcv = dccp_v4_do_rcv, + .hash = inet_hash, + .unhash = inet_unhash, + .accept = inet_csk_accept, + .get_port = inet_csk_get_port, + .shutdown = dccp_shutdown, + .destroy = dccp_destroy_sock, + .orphan_count = &dccp_orphan_count, + .max_header = MAX_DCCP_HEADER, + .obj_size = sizeof(struct dccp_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, + .rsk_prot = &dccp_request_sock_ops, + .twsk_prot = &dccp_timewait_sock_ops, + .h.hashinfo = &dccp_hashinfo, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_dccp_setsockopt, + .compat_getsockopt = compat_dccp_getsockopt, +#endif +}; + +static const struct net_protocol dccp_v4_protocol = { + .handler = dccp_v4_rcv, 
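+	/*
+	 * The .no_policy flag below skips the generic XFRM policy check on
+	 * input (dccp_v4_rcv() calls xfrm4_policy_check() itself), and
+	 * .netns_ok marks the handler as usable from non-initial network
+	 * namespaces.
+	 */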
+ .err_handler = dccp_v4_err, + .no_policy = 1, + .netns_ok = 1, +}; + +static const struct proto_ops inet_dccp_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = inet_release, + .bind = inet_bind, + .connect = inet_stream_connect, + .socketpair = sock_no_socketpair, + .accept = inet_accept, + .getname = inet_getname, + /* FIXME: work on tcp_poll to rename it to inet_csk_poll */ + .poll = dccp_poll, + .ioctl = inet_ioctl, + /* FIXME: work on inet_listen to rename it to sock_common_listen */ + .listen = inet_dccp_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, +#endif +}; + +static struct inet_protosw dccp_v4_protosw = { + .type = SOCK_DCCP, + .protocol = IPPROTO_DCCP, + .prot = &dccp_v4_prot, + .ops = &inet_dccp_ops, + .no_check = 0, + .flags = INET_PROTOSW_ICSK, +}; + +static int __net_init dccp_v4_init_net(struct net *net) +{ + if (dccp_hashinfo.bhash == NULL) + return -ESOCKTNOSUPPORT; + + return inet_ctl_sock_create(&net->dccp.v4_ctl_sk, PF_INET, + SOCK_DCCP, IPPROTO_DCCP, net); +} + +static void __net_exit dccp_v4_exit_net(struct net *net) +{ + inet_ctl_sock_destroy(net->dccp.v4_ctl_sk); +} + +static struct pernet_operations dccp_v4_ops = { + .init = dccp_v4_init_net, + .exit = dccp_v4_exit_net, +}; + +static int __init dccp_v4_init(void) +{ + int err = proto_register(&dccp_v4_prot, 1); + + if (err != 0) + goto out; + + err = inet_add_protocol(&dccp_v4_protocol, IPPROTO_DCCP); + if (err != 0) + goto out_proto_unregister; + + inet_register_protosw(&dccp_v4_protosw); + + err = register_pernet_subsys(&dccp_v4_ops); + if (err) + goto out_destroy_ctl_sock; +out: + return err; +out_destroy_ctl_sock: + inet_unregister_protosw(&dccp_v4_protosw); + inet_del_protocol(&dccp_v4_protocol, IPPROTO_DCCP); +out_proto_unregister: + proto_unregister(&dccp_v4_prot); + goto out; +} + +static void __exit dccp_v4_exit(void) +{ + unregister_pernet_subsys(&dccp_v4_ops); + inet_unregister_protosw(&dccp_v4_protosw); + inet_del_protocol(&dccp_v4_protocol, IPPROTO_DCCP); + proto_unregister(&dccp_v4_prot); +} + +module_init(dccp_v4_init); +module_exit(dccp_v4_exit); + +/* + * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33) + * values directly, Also cover the case where the protocol is not specified, + * i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP + */ +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 33, 6); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 0, 6); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arnaldo Carvalho de Melo "); +MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol"); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c new file mode 100644 index 00000000..b74f7611 --- /dev/null +++ b/net/dccp/ipv6.c @@ -0,0 +1,1211 @@ +/* + * DCCP over IPv6 + * Linux INET6 implementation + * + * Based on net/dccp6/ipv6.c + * + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dccp.h" +#include "ipv6.h" +#include "feat.h" + +/* The per-net dccp.v6_ctl_sk is used for sending RSTs and ACKs */ + +static const struct inet_connection_sock_af_ops dccp_ipv6_mapped; +static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops; + +static void dccp_v6_hash(struct sock *sk) +{ + if (sk->sk_state != DCCP_CLOSED) { + if (inet_csk(sk)->icsk_af_ops == &dccp_ipv6_mapped) { + inet_hash(sk); + return; + } + local_bh_disable(); + __inet6_hash(sk, NULL); + local_bh_enable(); + } +} + +/* add pseudo-header to DCCP checksum stored in skb->csum */ +static inline __sum16 dccp_v6_csum_finish(struct sk_buff *skb, + const struct in6_addr *saddr, + const struct in6_addr *daddr) +{ + return csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_DCCP, skb->csum); +} + +static inline void dccp_v6_send_check(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct dccp_hdr *dh = dccp_hdr(skb); + + dccp_csum_outgoing(skb); + dh->dccph_checksum = dccp_v6_csum_finish(skb, &np->saddr, &np->daddr); +} + +static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb) +{ + return secure_dccpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32, + ipv6_hdr(skb)->saddr.s6_addr32, + dccp_hdr(skb)->dccph_dport, + dccp_hdr(skb)->dccph_sport ); + +} + +static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; + const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset); + struct dccp_sock *dp; + struct ipv6_pinfo *np; + struct sock *sk; + int err; + __u64 seq; + struct net *net = dev_net(skb->dev); + + if (skb->len < offset + sizeof(*dh) || + skb->len < offset + __dccp_basic_hdr_len(dh)) { + ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), + ICMP6_MIB_INERRORS); + return; + } + + sk = inet6_lookup(net, &dccp_hashinfo, + &hdr->daddr, dh->dccph_dport, + &hdr->saddr, dh->dccph_sport, inet6_iif(skb)); + + if (sk == NULL) { + ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), + ICMP6_MIB_INERRORS); + return; + } + + if (sk->sk_state == DCCP_TIME_WAIT) { + inet_twsk_put(inet_twsk(sk)); + return; + } + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); + + if (sk->sk_state == DCCP_CLOSED) + goto out; + + dp = dccp_sk(sk); + seq = dccp_hdr_seq(dh); + if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) && + !between48(seq, dp->dccps_awl, dp->dccps_awh)) { + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + np = inet6_sk(sk); + + if (type == ICMPV6_PKT_TOOBIG) { + struct dst_entry *dst = NULL; + + if (sock_owned_by_user(sk)) + goto out; + if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED)) + goto out; + + /* icmp should have updated the destination cache entry */ + dst = __sk_dst_check(sk, np->dst_cookie); + if (dst == NULL) { + struct inet_sock *inet = inet_sk(sk); + struct flowi6 fl6; + + /* BUGGG_FUTURE: Again, it is not clear how + to handle rthdr case. Ignore this complexity + for now. 
+ */ + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_DCCP; + ipv6_addr_copy(&fl6.daddr, &np->daddr); + ipv6_addr_copy(&fl6.saddr, &np->saddr); + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.fl6_dport = inet->inet_dport; + fl6.fl6_sport = inet->inet_sport; + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + + dst = ip6_dst_lookup_flow(sk, &fl6, NULL, false); + if (IS_ERR(dst)) { + sk->sk_err_soft = -PTR_ERR(dst); + goto out; + } + } else + dst_hold(dst); + + if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { + dccp_sync_mss(sk, dst_mtu(dst)); + } /* else let the usual retransmit timer handle it */ + dst_release(dst); + goto out; + } + + icmpv6_err_convert(type, code, &err); + + /* Might be for an request_sock */ + switch (sk->sk_state) { + struct request_sock *req, **prev; + case DCCP_LISTEN: + if (sock_owned_by_user(sk)) + goto out; + + req = inet6_csk_search_req(sk, &prev, dh->dccph_dport, + &hdr->daddr, &hdr->saddr, + inet6_iif(skb)); + if (req == NULL) + goto out; + + /* + * ICMPs are not backlogged, hence we cannot get an established + * socket here. + */ + WARN_ON(req->sk != NULL); + + if (seq != dccp_rsk(req)->dreq_iss) { + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + inet_csk_reqsk_queue_drop(sk, req, prev); + goto out; + + case DCCP_REQUESTING: + case DCCP_RESPOND: /* Cannot happen. + It can, it SYNs are crossed. --ANK */ + if (!sock_owned_by_user(sk)) { + DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + sk->sk_err = err; + /* + * Wake people up to see the error + * (see connect in sock.c) + */ + sk->sk_error_report(sk); + dccp_done(sk); + } else + sk->sk_err_soft = err; + goto out; + } + + if (!sock_owned_by_user(sk) && np->recverr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } else + sk->sk_err_soft = err; + +out: + bh_unlock_sock(sk); + sock_put(sk); +} + + +static int dccp_v6_send_response(struct sock *sk, struct request_sock *req, + struct request_values *rv_unused) +{ + struct inet6_request_sock *ireq6 = inet6_rsk(req); + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *skb; + struct ipv6_txoptions *opt = NULL; + struct in6_addr *final_p, final; + struct flowi6 fl6; + int err = -1; + struct dst_entry *dst; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_DCCP; + ipv6_addr_copy(&fl6.daddr, &ireq6->rmt_addr); + ipv6_addr_copy(&fl6.saddr, &ireq6->loc_addr); + fl6.flowlabel = 0; + fl6.flowi6_oif = ireq6->iif; + fl6.fl6_dport = inet_rsk(req)->rmt_port; + fl6.fl6_sport = inet_rsk(req)->loc_port; + security_req_classify_flow(req, flowi6_to_flowi(&fl6)); + + opt = np->opt; + + final_p = fl6_update_dst(&fl6, opt, &final); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; + goto done; + } + + skb = dccp_make_response(sk, dst, req); + if (skb != NULL) { + struct dccp_hdr *dh = dccp_hdr(skb); + + dh->dccph_checksum = dccp_v6_csum_finish(skb, + &ireq6->loc_addr, + &ireq6->rmt_addr); + ipv6_addr_copy(&fl6.daddr, &ireq6->rmt_addr); + err = ip6_xmit(sk, skb, &fl6, opt); + err = net_xmit_eval(err); + } + +done: + if (opt != NULL && opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + dst_release(dst); + return err; +} + +static void dccp_v6_reqsk_destructor(struct request_sock *req) +{ + dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); + if (inet6_rsk(req)->pktopts != NULL) + kfree_skb(inet6_rsk(req)->pktopts); +} + +static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb) +{ + const struct ipv6hdr *rxip6h; + struct sk_buff *skb; + 
struct flowi6 fl6; + struct net *net = dev_net(skb_dst(rxskb)->dev); + struct sock *ctl_sk = net->dccp.v6_ctl_sk; + struct dst_entry *dst; + + if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET) + return; + + if (!ipv6_unicast_destination(rxskb)) + return; + + skb = dccp_ctl_make_reset(ctl_sk, rxskb); + if (skb == NULL) + return; + + rxip6h = ipv6_hdr(rxskb); + dccp_hdr(skb)->dccph_checksum = dccp_v6_csum_finish(skb, &rxip6h->saddr, + &rxip6h->daddr); + + memset(&fl6, 0, sizeof(fl6)); + ipv6_addr_copy(&fl6.daddr, &rxip6h->saddr); + ipv6_addr_copy(&fl6.saddr, &rxip6h->daddr); + + fl6.flowi6_proto = IPPROTO_DCCP; + fl6.flowi6_oif = inet6_iif(rxskb); + fl6.fl6_dport = dccp_hdr(skb)->dccph_dport; + fl6.fl6_sport = dccp_hdr(skb)->dccph_sport; + security_skb_classify_flow(rxskb, flowi6_to_flowi(&fl6)); + + /* sk = NULL, but it is safe for now. RST socket required. */ + dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL, false); + if (!IS_ERR(dst)) { + skb_dst_set(skb, dst); + ip6_xmit(ctl_sk, skb, &fl6, NULL); + DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); + DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS); + return; + } + + kfree_skb(skb); +} + +static struct request_sock_ops dccp6_request_sock_ops = { + .family = AF_INET6, + .obj_size = sizeof(struct dccp6_request_sock), + .rtx_syn_ack = dccp_v6_send_response, + .send_ack = dccp_reqsk_send_ack, + .destructor = dccp_v6_reqsk_destructor, + .send_reset = dccp_v6_ctl_send_reset, +}; + +static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct sock *nsk; + struct request_sock **prev; + /* Find possible connection requests. */ + struct request_sock *req = inet6_csk_search_req(sk, &prev, + dh->dccph_sport, + &iph->saddr, + &iph->daddr, + inet6_iif(skb)); + if (req != NULL) + return dccp_check_req(sk, skb, req, prev); + + nsk = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo, + &iph->saddr, dh->dccph_sport, + &iph->daddr, ntohs(dh->dccph_dport), + inet6_iif(skb)); + if (nsk != NULL) { + if (nsk->sk_state != DCCP_TIME_WAIT) { + bh_lock_sock(nsk); + return nsk; + } + inet_twsk_put(inet_twsk(nsk)); + return NULL; + } + + return sk; +} + +static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct request_sock *req; + struct dccp_request_sock *dreq; + struct inet6_request_sock *ireq6; + struct ipv6_pinfo *np = inet6_sk(sk); + const __be32 service = dccp_hdr_request(skb)->dccph_req_service; + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); + + if (skb->protocol == htons(ETH_P_IP)) + return dccp_v4_conn_request(sk, skb); + + if (!ipv6_unicast_destination(skb)) + return 0; /* discard, don't send a reset here */ + + if (dccp_bad_service_code(sk, service)) { + dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE; + goto drop; + } + /* + * There are no SYN attacks on IPv6, yet... 
+ */ + dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; + if (inet_csk_reqsk_queue_is_full(sk)) + goto drop; + + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) + goto drop; + + req = inet6_reqsk_alloc(&dccp6_request_sock_ops); + if (req == NULL) + goto drop; + + if (dccp_reqsk_init(req, dccp_sk(sk), skb)) + goto drop_and_free; + + dreq = dccp_rsk(req); + if (dccp_parse_options(sk, dreq, skb)) + goto drop_and_free; + + if (security_inet_conn_request(sk, skb, req)) + goto drop_and_free; + + ireq6 = inet6_rsk(req); + ipv6_addr_copy(&ireq6->rmt_addr, &ipv6_hdr(skb)->saddr); + ipv6_addr_copy(&ireq6->loc_addr, &ipv6_hdr(skb)->daddr); + + if (ipv6_opt_accepted(sk, skb) || + np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || + np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { + atomic_inc(&skb->users); + ireq6->pktopts = skb; + } + ireq6->iif = sk->sk_bound_dev_if; + + /* So that link locals have meaning */ + if (!sk->sk_bound_dev_if && + ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL) + ireq6->iif = inet6_iif(skb); + + /* + * Step 3: Process LISTEN state + * + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie + * + * In fact we defer setting S.GSR, S.SWL, S.SWH to + * dccp_create_openreq_child. + */ + dreq->dreq_isr = dcb->dccpd_seq; + dreq->dreq_iss = dccp_v6_init_sequence(skb); + dreq->dreq_service = service; + + if (dccp_v6_send_response(sk, req, NULL)) + goto drop_and_free; + + inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); + return 0; + +drop_and_free: + reqsk_free(req); +drop: + DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + return -1; +} + +static struct sock *dccp_v6_request_recv_sock(struct sock *sk, + struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) +{ + struct inet6_request_sock *ireq6 = inet6_rsk(req); + struct ipv6_pinfo *newnp, *np = inet6_sk(sk); + struct inet_sock *newinet; + struct dccp6_sock *newdp6; + struct sock *newsk; + struct ipv6_txoptions *opt; + + if (skb->protocol == htons(ETH_P_IP)) { + /* + * v6 mapped + */ + newsk = dccp_v4_request_recv_sock(sk, skb, req, dst); + if (newsk == NULL) + return NULL; + + newdp6 = (struct dccp6_sock *)newsk; + newinet = inet_sk(newsk); + newinet->pinet6 = &newdp6->inet6; + newnp = inet6_sk(newsk); + + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + ipv6_addr_set_v4mapped(newinet->inet_daddr, &newnp->daddr); + + ipv6_addr_set_v4mapped(newinet->inet_saddr, &newnp->saddr); + + ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr); + + inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped; + newsk->sk_backlog_rcv = dccp_v4_do_rcv; + newnp->pktoptions = NULL; + newnp->opt = NULL; + newnp->mcast_oif = inet6_iif(skb); + newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; + + /* + * No need to charge this sock to the relevant IPv6 refcnt debug socks count + * here, dccp_create_openreq_child now does this for us, see the comment in + * that function for the gory details. -acme + */ + + /* It is tricky place. Until this moment IPv4 tcp + worked with IPv6 icsk.icsk_af_ops. + Sync it now. 
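+	 * (i.e. the child was built via the IPv4 path and its af_ops were
+	 * only just switched to the v6-mapped operations, so recompute the
+	 * MSS from the cached PMTU to keep both views consistent)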
+ */ + dccp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); + + return newsk; + } + + opt = np->opt; + + if (sk_acceptq_is_full(sk)) + goto out_overflow; + + if (dst == NULL) { + struct in6_addr *final_p, final; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_DCCP; + ipv6_addr_copy(&fl6.daddr, &ireq6->rmt_addr); + final_p = fl6_update_dst(&fl6, opt, &final); + ipv6_addr_copy(&fl6.saddr, &ireq6->loc_addr); + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.fl6_dport = inet_rsk(req)->rmt_port; + fl6.fl6_sport = inet_rsk(req)->loc_port; + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false); + if (IS_ERR(dst)) + goto out; + } + + newsk = dccp_create_openreq_child(sk, req, skb); + if (newsk == NULL) + goto out_nonewsk; + + /* + * No need to charge this sock to the relevant IPv6 refcnt debug socks + * count here, dccp_create_openreq_child now does this for us, see the + * comment in that function for the gory details. -acme + */ + + __ip6_dst_store(newsk, dst, NULL, NULL); + newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | + NETIF_F_TSO); + newdp6 = (struct dccp6_sock *)newsk; + newinet = inet_sk(newsk); + newinet->pinet6 = &newdp6->inet6; + newnp = inet6_sk(newsk); + + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + ipv6_addr_copy(&newnp->daddr, &ireq6->rmt_addr); + ipv6_addr_copy(&newnp->saddr, &ireq6->loc_addr); + ipv6_addr_copy(&newnp->rcv_saddr, &ireq6->loc_addr); + newsk->sk_bound_dev_if = ireq6->iif; + + /* Now IPv6 options... + + First: no IPv4 options. + */ + newinet->inet_opt = NULL; + + /* Clone RX bits */ + newnp->rxopt.all = np->rxopt.all; + + /* Clone pktoptions received with SYN */ + newnp->pktoptions = NULL; + if (ireq6->pktopts != NULL) { + newnp->pktoptions = skb_clone(ireq6->pktopts, GFP_ATOMIC); + kfree_skb(ireq6->pktopts); + ireq6->pktopts = NULL; + if (newnp->pktoptions) + skb_set_owner_r(newnp->pktoptions, newsk); + } + newnp->opt = NULL; + newnp->mcast_oif = inet6_iif(skb); + newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; + + /* + * Clone native IPv6 options from listening socket (if any) + * + * Yes, keeping reference count would be much more clever, but we make + * one more one thing there: reattach optmem to newsk. + */ + if (opt != NULL) { + newnp->opt = ipv6_dup_options(newsk, opt); + if (opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + } + + inet_csk(newsk)->icsk_ext_hdr_len = 0; + if (newnp->opt != NULL) + inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + + newnp->opt->opt_flen); + + dccp_sync_mss(newsk, dst_mtu(dst)); + + newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; + newinet->inet_rcv_saddr = LOOPBACK4_IPV6; + + if (__inet_inherit_port(sk, newsk) < 0) { + sock_put(newsk); + goto out; + } + __inet6_hash(newsk, NULL); + + return newsk; + +out_overflow: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); +out_nonewsk: + dst_release(dst); +out: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + if (opt != NULL && opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + return NULL; +} + +/* The socket must have it's spinlock held when we get + * here. + * + * We have a potential double-lock case here, so even when + * doing backlog processing we use the BH locking scheme. + * This is because we cannot sleep with the original spinlock + * held. 
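+ * In other words, this function runs both from softirq context (the
+ * normal receive path) and from process context via the socket
+ * backlog; in both cases the socket spinlock is held, so nothing in
+ * here may sleep.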
+ */ +static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *opt_skb = NULL; + + /* Imagine: socket is IPv6. IPv4 packet arrives, + goes to IPv4 receive handler and backlogged. + From backlog it always goes here. Kerboom... + Fortunately, dccp_rcv_established and rcv_established + handle them correctly, but it is not case with + dccp_v6_hnd_req and dccp_v6_ctl_send_reset(). --ANK + */ + + if (skb->protocol == htons(ETH_P_IP)) + return dccp_v4_do_rcv(sk, skb); + + if (sk_filter(sk, skb)) + goto discard; + + /* + * socket locking is here for SMP purposes as backlog rcv is currently + * called with bh processing disabled. + */ + + /* Do Stevens' IPV6_PKTOPTIONS. + + Yes, guys, it is the only place in our code, where we + may make it not affecting IPv4. + The rest of code is protocol independent, + and I do not like idea to uglify IPv4. + + Actually, all the idea behind IPV6_PKTOPTIONS + looks not very well thought. For now we latch + options, received in the last packet, enqueued + by tcp. Feel free to propose better solution. + --ANK (980728) + */ + if (np->rxopt.all) + /* + * FIXME: Add handling of IPV6_PKTOPTIONS skb. See the comments below + * (wrt ipv6_pktopions) and net/ipv6/tcp_ipv6.c for an example. + */ + opt_skb = skb_clone(skb, GFP_ATOMIC); + + if (sk->sk_state == DCCP_OPEN) { /* Fast path */ + if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len)) + goto reset; + if (opt_skb) { + /* XXX This is where we would goto ipv6_pktoptions. */ + __kfree_skb(opt_skb); + } + return 0; + } + + /* + * Step 3: Process LISTEN state + * If S.state == LISTEN, + * If P.type == Request or P contains a valid Init Cookie option, + * (* Must scan the packet's options to check for Init + * Cookies. Only Init Cookies are processed here, + * however; other options are processed in Step 8. This + * scan need only be performed if the endpoint uses Init + * Cookies *) + * (* Generate a new socket and switch to that socket *) + * Set S := new socket for this port pair + * S.state = RESPOND + * Choose S.ISS (initial seqno) or set from Init Cookies + * Initialize S.GAR := S.ISS + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies + * Continue with S.state == RESPOND + * (* A Response packet will be generated in Step 11 *) + * Otherwise, + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + * + * NOTE: the check for the packet types is done in + * dccp_rcv_state_process + */ + if (sk->sk_state == DCCP_LISTEN) { + struct sock *nsk = dccp_v6_hnd_req(sk, skb); + + if (nsk == NULL) + goto discard; + /* + * Queue it on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket.. + */ + if (nsk != sk) { + if (dccp_child_process(sk, nsk, skb)) + goto reset; + if (opt_skb != NULL) + __kfree_skb(opt_skb); + return 0; + } + } + + if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len)) + goto reset; + if (opt_skb) { + /* XXX This is where we would goto ipv6_pktoptions. */ + __kfree_skb(opt_skb); + } + return 0; + +reset: + dccp_v6_ctl_send_reset(sk, skb); +discard: + if (opt_skb != NULL) + __kfree_skb(opt_skb); + kfree_skb(skb); + return 0; +} + +static int dccp_v6_rcv(struct sk_buff *skb) +{ + const struct dccp_hdr *dh; + struct sock *sk; + int min_cov; + + /* Step 1: Check header basics */ + + if (dccp_invalid_packet(skb)) + goto discard_it; + + /* Step 1: If header checksum is incorrect, drop packet and return. 
*/ + if (dccp_v6_csum_finish(skb, &ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr)) { + DCCP_WARN("dropped packet with invalid checksum\n"); + goto discard_it; + } + + dh = dccp_hdr(skb); + + DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(dh); + DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type; + + if (dccp_packet_without_ack(skb)) + DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ; + else + DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); + + /* Step 2: + * Look up flow ID in table and get corresponding socket */ + sk = __inet6_lookup_skb(&dccp_hashinfo, skb, + dh->dccph_sport, dh->dccph_dport); + /* + * Step 2: + * If no socket ... + */ + if (sk == NULL) { + dccp_pr_debug("failed to look up flow ID in table and " + "get corresponding socket\n"); + goto no_dccp_socket; + } + + /* + * Step 2: + * ... or S.state == TIMEWAIT, + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + if (sk->sk_state == DCCP_TIME_WAIT) { + dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n"); + inet_twsk_put(inet_twsk(sk)); + goto no_dccp_socket; + } + + /* + * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage + * o if MinCsCov = 0, only packets with CsCov = 0 are accepted + * o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov + */ + min_cov = dccp_sk(sk)->dccps_pcrlen; + if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov)) { + dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n", + dh->dccph_cscov, min_cov); + /* FIXME: send Data Dropped option (see also dccp_v4_rcv) */ + goto discard_and_relse; + } + + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) + goto discard_and_relse; + + return sk_receive_skb(sk, skb, 1) ? -1 : 0; + +no_dccp_socket: + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard_it; + /* + * Step 2: + * If no socket ... + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + if (dh->dccph_type != DCCP_PKT_RESET) { + DCCP_SKB_CB(skb)->dccpd_reset_code = + DCCP_RESET_CODE_NO_CONNECTION; + dccp_v6_ctl_send_reset(sk, skb); + } + +discard_it: + kfree_skb(skb); + return 0; + +discard_and_relse: + sock_put(sk); + goto discard_it; +} + +static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr; + struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct dccp_sock *dp = dccp_sk(sk); + struct in6_addr *saddr = NULL, *final_p, final; + struct flowi6 fl6; + struct dst_entry *dst; + int addr_type; + int err; + + dp->dccps_role = DCCP_ROLE_CLIENT; + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + if (usin->sin6_family != AF_INET6) + return -EAFNOSUPPORT; + + memset(&fl6, 0, sizeof(fl6)); + + if (np->sndflow) { + fl6.flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; + IP6_ECN_flow_init(fl6.flowlabel); + if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) { + struct ip6_flowlabel *flowlabel; + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); + if (flowlabel == NULL) + return -EINVAL; + ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst); + fl6_sock_release(flowlabel); + } + } + /* + * connect() to INADDR_ANY means loopback (BSD'ism). 
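+	 * (an unspecified destination "::" is rewritten below to the IPv6
+	 * loopback address ::1 by setting the last address byte to 1)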
+ */ + if (ipv6_addr_any(&usin->sin6_addr)) + usin->sin6_addr.s6_addr[15] = 1; + + addr_type = ipv6_addr_type(&usin->sin6_addr); + + if (addr_type & IPV6_ADDR_MULTICAST) + return -ENETUNREACH; + + if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (addr_len >= sizeof(struct sockaddr_in6) && + usin->sin6_scope_id) { + /* If interface is set while binding, indices + * must coincide. + */ + if (sk->sk_bound_dev_if && + sk->sk_bound_dev_if != usin->sin6_scope_id) + return -EINVAL; + + sk->sk_bound_dev_if = usin->sin6_scope_id; + } + + /* Connect to link-local address requires an interface */ + if (!sk->sk_bound_dev_if) + return -EINVAL; + } + + ipv6_addr_copy(&np->daddr, &usin->sin6_addr); + np->flow_label = fl6.flowlabel; + + /* + * DCCP over IPv4 + */ + if (addr_type == IPV6_ADDR_MAPPED) { + u32 exthdrlen = icsk->icsk_ext_hdr_len; + struct sockaddr_in sin; + + SOCK_DEBUG(sk, "connect: ipv4 mapped\n"); + + if (__ipv6_only_sock(sk)) + return -ENETUNREACH; + + sin.sin_family = AF_INET; + sin.sin_port = usin->sin6_port; + sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; + + icsk->icsk_af_ops = &dccp_ipv6_mapped; + sk->sk_backlog_rcv = dccp_v4_do_rcv; + + err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); + if (err) { + icsk->icsk_ext_hdr_len = exthdrlen; + icsk->icsk_af_ops = &dccp_ipv6_af_ops; + sk->sk_backlog_rcv = dccp_v6_do_rcv; + goto failure; + } + ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr); + ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, &np->rcv_saddr); + + return err; + } + + if (!ipv6_addr_any(&np->rcv_saddr)) + saddr = &np->rcv_saddr; + + fl6.flowi6_proto = IPPROTO_DCCP; + ipv6_addr_copy(&fl6.daddr, &np->daddr); + ipv6_addr_copy(&fl6.saddr, saddr ? saddr : &np->saddr); + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.fl6_dport = usin->sin6_port; + fl6.fl6_sport = inet->inet_sport; + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + + final_p = fl6_update_dst(&fl6, np->opt, &final); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto failure; + } + + if (saddr == NULL) { + saddr = &fl6.saddr; + ipv6_addr_copy(&np->rcv_saddr, saddr); + } + + /* set the source address */ + ipv6_addr_copy(&np->saddr, saddr); + inet->inet_rcv_saddr = LOOPBACK4_IPV6; + + __ip6_dst_store(sk, dst, NULL, NULL); + + icsk->icsk_ext_hdr_len = 0; + if (np->opt != NULL) + icsk->icsk_ext_hdr_len = (np->opt->opt_flen + + np->opt->opt_nflen); + + inet->inet_dport = usin->sin6_port; + + dccp_set_state(sk, DCCP_REQUESTING); + err = inet6_hash_connect(&dccp_death_row, sk); + if (err) + goto late_failure; + + dp->dccps_iss = secure_dccpv6_sequence_number(np->saddr.s6_addr32, + np->daddr.s6_addr32, + inet->inet_sport, + inet->inet_dport); + err = dccp_connect(sk); + if (err) + goto late_failure; + + return 0; + +late_failure: + dccp_set_state(sk, DCCP_CLOSED); + __sk_dst_reset(sk); +failure: + inet->inet_dport = 0; + sk->sk_route_caps = 0; + return err; +} + +static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = { + .queue_xmit = inet6_csk_xmit, + .send_check = dccp_v6_send_check, + .rebuild_header = inet6_sk_rebuild_header, + .conn_request = dccp_v6_conn_request, + .syn_recv_sock = dccp_v6_request_recv_sock, + .net_header_len = sizeof(struct ipv6hdr), + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .addr2sockaddr = inet6_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in6), + .bind_conflict = inet6_csk_bind_conflict, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_ipv6_setsockopt, + 
.compat_getsockopt = compat_ipv6_getsockopt, +#endif +}; + +/* + * DCCP over IPv4 via INET6 API + */ +static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = { + .queue_xmit = ip_queue_xmit, + .send_check = dccp_v4_send_check, + .rebuild_header = inet_sk_rebuild_header, + .conn_request = dccp_v6_conn_request, + .syn_recv_sock = dccp_v6_request_recv_sock, + .net_header_len = sizeof(struct iphdr), + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .addr2sockaddr = inet6_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in6), +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_ipv6_setsockopt, + .compat_getsockopt = compat_ipv6_getsockopt, +#endif +}; + +/* NOTE: A lot of things set to zero explicitly by call to + * sk_alloc() so need not be done here. + */ +static int dccp_v6_init_sock(struct sock *sk) +{ + static __u8 dccp_v6_ctl_sock_initialized; + int err = dccp_init_sock(sk, dccp_v6_ctl_sock_initialized); + + if (err == 0) { + if (unlikely(!dccp_v6_ctl_sock_initialized)) + dccp_v6_ctl_sock_initialized = 1; + inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops; + } + + return err; +} + +static void dccp_v6_destroy_sock(struct sock *sk) +{ + dccp_destroy_sock(sk); + inet6_destroy_sock(sk); +} + +static struct timewait_sock_ops dccp6_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct dccp6_timewait_sock), +}; + +static struct proto dccp_v6_prot = { + .name = "DCCPv6", + .owner = THIS_MODULE, + .close = dccp_close, + .connect = dccp_v6_connect, + .disconnect = dccp_disconnect, + .ioctl = dccp_ioctl, + .init = dccp_v6_init_sock, + .setsockopt = dccp_setsockopt, + .getsockopt = dccp_getsockopt, + .sendmsg = dccp_sendmsg, + .recvmsg = dccp_recvmsg, + .backlog_rcv = dccp_v6_do_rcv, + .hash = dccp_v6_hash, + .unhash = inet_unhash, + .accept = inet_csk_accept, + .get_port = inet_csk_get_port, + .shutdown = dccp_shutdown, + .destroy = dccp_v6_destroy_sock, + .orphan_count = &dccp_orphan_count, + .max_header = MAX_DCCP_HEADER, + .obj_size = sizeof(struct dccp6_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, + .rsk_prot = &dccp6_request_sock_ops, + .twsk_prot = &dccp6_timewait_sock_ops, + .h.hashinfo = &dccp_hashinfo, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_dccp_setsockopt, + .compat_getsockopt = compat_dccp_getsockopt, +#endif +}; + +static const struct inet6_protocol dccp_v6_protocol = { + .handler = dccp_v6_rcv, + .err_handler = dccp_v6_err, + .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, +}; + +static const struct proto_ops inet6_dccp_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = inet6_bind, + .connect = inet_stream_connect, + .socketpair = sock_no_socketpair, + .accept = inet_accept, + .getname = inet6_getname, + .poll = dccp_poll, + .ioctl = inet6_ioctl, + .listen = inet_dccp_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, +#endif +}; + +static struct inet_protosw dccp_v6_protosw = { + .type = SOCK_DCCP, + .protocol = IPPROTO_DCCP, + .prot = &dccp_v6_prot, + .ops = &inet6_dccp_ops, + .flags = INET_PROTOSW_ICSK, +}; + +static int __net_init dccp_v6_init_net(struct net *net) +{ + if (dccp_hashinfo.bhash == NULL) + return -ESOCKTNOSUPPORT; + + return inet_ctl_sock_create(&net->dccp.v6_ctl_sk, 
PF_INET6, + SOCK_DCCP, IPPROTO_DCCP, net); +} + +static void __net_exit dccp_v6_exit_net(struct net *net) +{ + inet_ctl_sock_destroy(net->dccp.v6_ctl_sk); +} + +static struct pernet_operations dccp_v6_ops = { + .init = dccp_v6_init_net, + .exit = dccp_v6_exit_net, +}; + +static int __init dccp_v6_init(void) +{ + int err = proto_register(&dccp_v6_prot, 1); + + if (err != 0) + goto out; + + err = inet6_add_protocol(&dccp_v6_protocol, IPPROTO_DCCP); + if (err != 0) + goto out_unregister_proto; + + inet6_register_protosw(&dccp_v6_protosw); + + err = register_pernet_subsys(&dccp_v6_ops); + if (err != 0) + goto out_destroy_ctl_sock; +out: + return err; + +out_destroy_ctl_sock: + inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP); + inet6_unregister_protosw(&dccp_v6_protosw); +out_unregister_proto: + proto_unregister(&dccp_v6_prot); + goto out; +} + +static void __exit dccp_v6_exit(void) +{ + unregister_pernet_subsys(&dccp_v6_ops); + inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP); + inet6_unregister_protosw(&dccp_v6_protosw); + proto_unregister(&dccp_v6_prot); +} + +module_init(dccp_v6_init); +module_exit(dccp_v6_exit); + +/* + * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33) + * values directly, Also cover the case where the protocol is not specified, + * i.e. net-pf-PF_INET6-proto-0-type-SOCK_DCCP + */ +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 33, 6); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 0, 6); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arnaldo Carvalho de Melo "); +MODULE_DESCRIPTION("DCCPv6 - Datagram Congestion Controlled Protocol"); diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h new file mode 100644 index 00000000..6eef81fd --- /dev/null +++ b/net/dccp/ipv6.h @@ -0,0 +1,36 @@ +#ifndef _DCCP_IPV6_H +#define _DCCP_IPV6_H +/* + * net/dccp/ipv6.h + * + * An implementation of the DCCP protocol + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include + +struct dccp6_sock { + struct dccp_sock dccp; + /* + * ipv6_pinfo has to be the last member of dccp6_sock, + * see inet6_sk_generic. + */ + struct ipv6_pinfo inet6; +}; + +struct dccp6_request_sock { + struct dccp_request_sock dccp; + struct inet6_request_sock inet6; +}; + +struct dccp6_timewait_sock { + struct inet_timewait_sock inet; + struct inet6_timewait_sock tw6; +}; + +#endif /* _DCCP_IPV6_H */ diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c new file mode 100644 index 00000000..d7041a09 --- /dev/null +++ b/net/dccp/minisocks.c @@ -0,0 +1,278 @@ +/* + * net/dccp/minisocks.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "ackvec.h" +#include "ccid.h" +#include "dccp.h" +#include "feat.h" + +struct inet_timewait_death_row dccp_death_row = { + .sysctl_max_tw_buckets = NR_FILE * 2, + .period = DCCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, + .death_lock = __SPIN_LOCK_UNLOCKED(dccp_death_row.death_lock), + .hashinfo = &dccp_hashinfo, + .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, + (unsigned long)&dccp_death_row), + .twkill_work = __WORK_INITIALIZER(dccp_death_row.twkill_work, + inet_twdr_twkill_work), +/* Short-time timewait calendar */ + + .twcal_hand = -1, + .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, + (unsigned long)&dccp_death_row), +}; + +EXPORT_SYMBOL_GPL(dccp_death_row); + +void dccp_time_wait(struct sock *sk, int state, int timeo) +{ + struct inet_timewait_sock *tw = NULL; + + if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets) + tw = inet_twsk_alloc(sk, state); + + if (tw != NULL) { + const struct inet_connection_sock *icsk = inet_csk(sk); + const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (tw->tw_family == PF_INET6) { + const struct ipv6_pinfo *np = inet6_sk(sk); + struct inet6_timewait_sock *tw6; + + tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); + tw6 = inet6_twsk((struct sock *)tw); + ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr); + ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr); + tw->tw_ipv6only = np->ipv6only; + } +#endif + /* Linkage updates. */ + __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); + + /* Get the TIME_WAIT timeout firing. */ + if (timeo < rto) + timeo = rto; + + tw->tw_timeout = DCCP_TIMEWAIT_LEN; + if (state == DCCP_TIME_WAIT) + timeo = DCCP_TIMEWAIT_LEN; + + inet_twsk_schedule(tw, &dccp_death_row, timeo, + DCCP_TIMEWAIT_LEN); + inet_twsk_put(tw); + } else { + /* Sorry, if we're out of memory, just CLOSE this + * socket up. We've got bigger problems than + * non-graceful socket closings. + */ + DCCP_WARN("time wait bucket table overflow\n"); + } + + dccp_done(sk); +} + +struct sock *dccp_create_openreq_child(struct sock *sk, + const struct request_sock *req, + const struct sk_buff *skb) +{ + /* + * Step 3: Process LISTEN state + * + * (* Generate a new socket and switch to that socket *) + * Set S := new socket for this port pair + */ + struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); + + if (newsk != NULL) { + struct dccp_request_sock *dreq = dccp_rsk(req); + struct inet_connection_sock *newicsk = inet_csk(newsk); + struct dccp_sock *newdp = dccp_sk(newsk); + + newdp->dccps_role = DCCP_ROLE_SERVER; + newdp->dccps_hc_rx_ackvec = NULL; + newdp->dccps_service_list = NULL; + newdp->dccps_service = dreq->dreq_service; + newdp->dccps_timestamp_echo = dreq->dreq_timestamp_echo; + newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; + newicsk->icsk_rto = DCCP_TIMEOUT_INIT; + + INIT_LIST_HEAD(&newdp->dccps_featneg); + /* + * Step 3: Process LISTEN state + * + * Choose S.ISS (initial seqno) or set from Init Cookies + * Initialize S.GAR := S.ISS + * Set S.ISR, S.GSR from packet (or Init Cookies) + * + * Setting AWL/AWH and SWL/SWH happens as part of the feature + * activation below, as these windows all depend on the local + * and remote Sequence Window feature values (7.5.2). 
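+ * (SWL/SWH bound the acceptable sequence numbers, AWL/AWH the acceptable
+ * acknowledgement numbers; cf. RFC 4340, 7.5.1.)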
+ */ + newdp->dccps_gss = newdp->dccps_iss = dreq->dreq_iss; + newdp->dccps_gar = newdp->dccps_iss; + newdp->dccps_gsr = newdp->dccps_isr = dreq->dreq_isr; + + /* + * Activate features: initialise CCIDs, sequence windows etc. + */ + if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + newsk->sk_destruct = NULL; + sk_free(newsk); + return NULL; + } + dccp_init_xmit_timers(newsk); + + DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS); + } + return newsk; +} + +EXPORT_SYMBOL_GPL(dccp_create_openreq_child); + +/* + * Process an incoming packet for RESPOND sockets represented + * as an request_sock. + */ +struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct request_sock **prev) +{ + struct sock *child = NULL; + struct dccp_request_sock *dreq = dccp_rsk(req); + + /* Check for retransmitted REQUEST */ + if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) { + + if (after48(DCCP_SKB_CB(skb)->dccpd_seq, dreq->dreq_isr)) { + dccp_pr_debug("Retransmitted REQUEST\n"); + dreq->dreq_isr = DCCP_SKB_CB(skb)->dccpd_seq; + /* + * Send another RESPONSE packet + * To protect against Request floods, increment retrans + * counter (backoff, monitored by dccp_response_timer). + */ + req->retrans++; + req->rsk_ops->rtx_syn_ack(sk, req, NULL); + } + /* Network Duplicate, discard packet */ + return NULL; + } + + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; + + if (dccp_hdr(skb)->dccph_type != DCCP_PKT_ACK && + dccp_hdr(skb)->dccph_type != DCCP_PKT_DATAACK) + goto drop; + + /* Invalid ACK */ + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != dreq->dreq_iss) { + dccp_pr_debug("Invalid ACK number: ack_seq=%llu, " + "dreq_iss=%llu\n", + (unsigned long long) + DCCP_SKB_CB(skb)->dccpd_ack_seq, + (unsigned long long) dreq->dreq_iss); + goto drop; + } + + if (dccp_parse_options(sk, dreq, skb)) + goto drop; + + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); + if (child == NULL) + goto listen_overflow; + + inet_csk_reqsk_queue_unlink(sk, req, prev); + inet_csk_reqsk_queue_removed(sk, req); + inet_csk_reqsk_queue_add(sk, req, child); +out: + return child; +listen_overflow: + dccp_pr_debug("listen_overflow!\n"); + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; +drop: + if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET) + req->rsk_ops->send_reset(sk, skb); + + inet_csk_reqsk_queue_drop(sk, req, prev); + goto out; +} + +EXPORT_SYMBOL_GPL(dccp_check_req); + +/* + * Queue segment on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket. + */ +int dccp_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb) +{ + int ret = 0; + const int state = child->sk_state; + + if (!sock_owned_by_user(child)) { + ret = dccp_rcv_state_process(child, skb, dccp_hdr(skb), + skb->len); + + /* Wakeup parent, send SIGIO */ + if (state == DCCP_RESPOND && child->sk_state != state) + parent->sk_data_ready(parent, 0); + } else { + /* Alas, it is possible again, because we do lookup + * in main socket hash table and lock on listening + * socket does not protect us more. 
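+ * Queue the skb on the child's backlog instead; it is processed once the
+ * owner releases the socket lock.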
+ */ + __sk_add_backlog(child, skb); + } + + bh_unlock_sock(child); + sock_put(child); + return ret; +} + +EXPORT_SYMBOL_GPL(dccp_child_process); + +void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, + struct request_sock *rsk) +{ + DCCP_BUG("DCCP-ACK packets are never sent in LISTEN/RESPOND state"); +} + +EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack); + +int dccp_reqsk_init(struct request_sock *req, + struct dccp_sock const *dp, struct sk_buff const *skb) +{ + struct dccp_request_sock *dreq = dccp_rsk(req); + + inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; + inet_rsk(req)->loc_port = dccp_hdr(skb)->dccph_dport; + inet_rsk(req)->acked = 0; + dreq->dreq_timestamp_echo = 0; + + /* inherit feature negotiation options from listening socket */ + return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg); +} + +EXPORT_SYMBOL_GPL(dccp_reqsk_init); diff --git a/net/dccp/options.c b/net/dccp/options.c new file mode 100644 index 00000000..4b2ab657 --- /dev/null +++ b/net/dccp/options.c @@ -0,0 +1,640 @@ +/* + * net/dccp/options.c + * + * An implementation of the DCCP protocol + * Copyright (c) 2005 Aristeu Sergio Rozanski Filho + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * Copyright (c) 2005 Ian McDonald + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include + +#include "ackvec.h" +#include "ccid.h" +#include "dccp.h" +#include "feat.h" + +u64 dccp_decode_value_var(const u8 *bf, const u8 len) +{ + u64 value = 0; + + if (len >= DCCP_OPTVAL_MAXLEN) + value += ((u64)*bf++) << 40; + if (len > 4) + value += ((u64)*bf++) << 32; + if (len > 3) + value += ((u64)*bf++) << 24; + if (len > 2) + value += ((u64)*bf++) << 16; + if (len > 1) + value += ((u64)*bf++) << 8; + if (len > 0) + value += *bf; + + return value; +} + +/** + * dccp_parse_options - Parse DCCP options present in @skb + * @sk: client|server|listening dccp socket (when @dreq != NULL) + * @dreq: request socket to use during connection setup, or NULL + */ +int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, + struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); + const struct dccp_hdr *dh = dccp_hdr(skb); + const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; + unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); + unsigned char *opt_ptr = options; + const unsigned char *opt_end = (unsigned char *)dh + + (dh->dccph_doff * 4); + struct dccp_options_received *opt_recv = &dp->dccps_options_received; + unsigned char opt, len; + unsigned char *uninitialized_var(value); + u32 elapsed_time; + __be32 opt_val; + int rc; + int mandatory = 0; + + memset(opt_recv, 0, sizeof(*opt_recv)); + + opt = len = 0; + while (opt_ptr != opt_end) { + opt = *opt_ptr++; + len = 0; + value = NULL; + + /* Check if this isn't a single byte option */ + if (opt > DCCPO_MAX_RESERVED) { + if (opt_ptr == opt_end) + goto out_nonsensical_length; + + len = *opt_ptr++; + if (len < 2) + goto out_nonsensical_length; + /* + * Remove the type and len fields, leaving + * just the value size + */ + len -= 2; + value = opt_ptr; + opt_ptr += len; + + if (opt_ptr > opt_end) + goto out_nonsensical_length; + } + + /* + * CCID-specific options are ignored during connection setup, as + * negotiation may still be in progress (see RFC 4340, 10.3). 
+ * The same applies to Ack Vectors, as these depend on the CCID. + */ + if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC || + opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) + goto ignore_option; + + switch (opt) { + case DCCPO_PADDING: + break; + case DCCPO_MANDATORY: + if (mandatory) + goto out_invalid_option; + if (pkt_type != DCCP_PKT_DATA) + mandatory = 1; + break; + case DCCPO_NDP_COUNT: + if (len > 6) + goto out_invalid_option; + + opt_recv->dccpor_ndp = dccp_decode_value_var(value, len); + dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk), + (unsigned long long)opt_recv->dccpor_ndp); + break; + case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R: + if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */ + break; + if (len == 0) + goto out_invalid_option; + rc = dccp_feat_parse_options(sk, dreq, mandatory, opt, + *value, value + 1, len - 1); + if (rc) + goto out_featneg_failed; + break; + case DCCPO_TIMESTAMP: + if (len != 4) + goto out_invalid_option; + /* + * RFC 4340 13.1: "The precise time corresponding to + * Timestamp Value zero is not specified". We use + * zero to indicate absence of a meaningful timestamp. + */ + opt_val = get_unaligned((__be32 *)value); + if (unlikely(opt_val == 0)) { + DCCP_WARN("Timestamp with zero value\n"); + break; + } + + if (dreq != NULL) { + dreq->dreq_timestamp_echo = ntohl(opt_val); + dreq->dreq_timestamp_time = dccp_timestamp(); + } else { + opt_recv->dccpor_timestamp = + dp->dccps_timestamp_echo = ntohl(opt_val); + dp->dccps_timestamp_time = dccp_timestamp(); + } + dccp_pr_debug("%s rx opt: TIMESTAMP=%u, ackno=%llu\n", + dccp_role(sk), ntohl(opt_val), + (unsigned long long) + DCCP_SKB_CB(skb)->dccpd_ack_seq); + /* schedule an Ack in case this sender is quiescent */ + inet_csk_schedule_ack(sk); + break; + case DCCPO_TIMESTAMP_ECHO: + if (len != 4 && len != 6 && len != 8) + goto out_invalid_option; + + opt_val = get_unaligned((__be32 *)value); + opt_recv->dccpor_timestamp_echo = ntohl(opt_val); + + dccp_pr_debug("%s rx opt: TIMESTAMP_ECHO=%u, len=%d, " + "ackno=%llu", dccp_role(sk), + opt_recv->dccpor_timestamp_echo, + len + 2, + (unsigned long long) + DCCP_SKB_CB(skb)->dccpd_ack_seq); + + value += 4; + + if (len == 4) { /* no elapsed time included */ + dccp_pr_debug_cat("\n"); + break; + } + + if (len == 6) { /* 2-byte elapsed time */ + __be16 opt_val2 = get_unaligned((__be16 *)value); + elapsed_time = ntohs(opt_val2); + } else { /* 4-byte elapsed time */ + opt_val = get_unaligned((__be32 *)value); + elapsed_time = ntohl(opt_val); + } + + dccp_pr_debug_cat(", ELAPSED_TIME=%u\n", elapsed_time); + + /* Give precedence to the biggest ELAPSED_TIME */ + if (elapsed_time > opt_recv->dccpor_elapsed_time) + opt_recv->dccpor_elapsed_time = elapsed_time; + break; + case DCCPO_ELAPSED_TIME: + if (dccp_packet_without_ack(skb)) /* RFC 4340, 13.2 */ + break; + + if (len == 2) { + __be16 opt_val2 = get_unaligned((__be16 *)value); + elapsed_time = ntohs(opt_val2); + } else if (len == 4) { + opt_val = get_unaligned((__be32 *)value); + elapsed_time = ntohl(opt_val); + } else { + goto out_invalid_option; + } + + if (elapsed_time > opt_recv->dccpor_elapsed_time) + opt_recv->dccpor_elapsed_time = elapsed_time; + + dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", + dccp_role(sk), elapsed_time); + break; + case DCCPO_MIN_RX_CCID_SPECIFIC ... 
DCCPO_MAX_RX_CCID_SPECIFIC: + if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, + pkt_type, opt, value, len)) + goto out_invalid_option; + break; + case DCCPO_ACK_VECTOR_0: + case DCCPO_ACK_VECTOR_1: + if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ + break; + /* + * Ack vectors are processed by the TX CCID if it is + * interested. The RX CCID need not parse Ack Vectors, + * since it is only interested in clearing old state. + * Fall through. + */ + case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC: + if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, + pkt_type, opt, value, len)) + goto out_invalid_option; + break; + default: + DCCP_CRIT("DCCP(%p): option %d(len=%d) not " + "implemented, ignoring", sk, opt, len); + break; + } +ignore_option: + if (opt != DCCPO_MANDATORY) + mandatory = 0; + } + + /* mandatory was the last byte in option list -> reset connection */ + if (mandatory) + goto out_invalid_option; + +out_nonsensical_length: + /* RFC 4340, 5.8: ignore option and all remaining option space */ + return 0; + +out_invalid_option: + DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT); + rc = DCCP_RESET_CODE_OPTION_ERROR; +out_featneg_failed: + DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc); + DCCP_SKB_CB(skb)->dccpd_reset_code = rc; + DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt; + DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0; + DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0; + return -1; +} + +EXPORT_SYMBOL_GPL(dccp_parse_options); + +void dccp_encode_value_var(const u64 value, u8 *to, const u8 len) +{ + if (len >= DCCP_OPTVAL_MAXLEN) + *to++ = (value & 0xFF0000000000ull) >> 40; + if (len > 4) + *to++ = (value & 0xFF00000000ull) >> 32; + if (len > 3) + *to++ = (value & 0xFF000000) >> 24; + if (len > 2) + *to++ = (value & 0xFF0000) >> 16; + if (len > 1) + *to++ = (value & 0xFF00) >> 8; + if (len > 0) + *to++ = (value & 0xFF); +} + +static inline u8 dccp_ndp_len(const u64 ndp) +{ + if (likely(ndp <= 0xFF)) + return 1; + return likely(ndp <= USHRT_MAX) ? 2 : (ndp <= UINT_MAX ? 4 : 6); +} + +int dccp_insert_option(struct sk_buff *skb, const unsigned char option, + const void *value, const unsigned char len) +{ + unsigned char *to; + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN) + return -1; + + DCCP_SKB_CB(skb)->dccpd_opt_len += len + 2; + + to = skb_push(skb, len + 2); + *to++ = option; + *to++ = len + 2; + + memcpy(to, value, len); + return 0; +} + +EXPORT_SYMBOL_GPL(dccp_insert_option); + +static int dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); + u64 ndp = dp->dccps_ndp_count; + + if (dccp_non_data_packet(skb)) + ++dp->dccps_ndp_count; + else + dp->dccps_ndp_count = 0; + + if (ndp > 0) { + unsigned char *ptr; + const int ndp_len = dccp_ndp_len(ndp); + const int len = ndp_len + 2; + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) + return -1; + + DCCP_SKB_CB(skb)->dccpd_opt_len += len; + + ptr = skb_push(skb, len); + *ptr++ = DCCPO_NDP_COUNT; + *ptr++ = len; + dccp_encode_value_var(ndp, ptr, ndp_len); + } + + return 0; +} + +static inline int dccp_elapsed_time_len(const u32 elapsed_time) +{ + return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 
2 : 4; +} + +/* FIXME: This function is currently not used anywhere */ +int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed_time) +{ + const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time); + const int len = 2 + elapsed_time_len; + unsigned char *to; + + if (elapsed_time_len == 0) + return 0; + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) + return -1; + + DCCP_SKB_CB(skb)->dccpd_opt_len += len; + + to = skb_push(skb, len); + *to++ = DCCPO_ELAPSED_TIME; + *to++ = len; + + if (elapsed_time_len == 2) { + const __be16 var16 = htons((u16)elapsed_time); + memcpy(to, &var16, 2); + } else { + const __be32 var32 = htonl(elapsed_time); + memcpy(to, &var32, 4); + } + + return 0; +} + +EXPORT_SYMBOL_GPL(dccp_insert_option_elapsed_time); + +static int dccp_insert_option_timestamp(struct sk_buff *skb) +{ + __be32 now = htonl(dccp_timestamp()); + /* yes this will overflow but that is the point as we want a + * 10 usec 32 bit timer which mean it wraps every 11.9 hours */ + + return dccp_insert_option(skb, DCCPO_TIMESTAMP, &now, sizeof(now)); +} + +static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp, + struct dccp_request_sock *dreq, + struct sk_buff *skb) +{ + __be32 tstamp_echo; + unsigned char *to; + u32 elapsed_time, elapsed_time_len, len; + + if (dreq != NULL) { + elapsed_time = dccp_timestamp() - dreq->dreq_timestamp_time; + tstamp_echo = htonl(dreq->dreq_timestamp_echo); + dreq->dreq_timestamp_echo = 0; + } else { + elapsed_time = dccp_timestamp() - dp->dccps_timestamp_time; + tstamp_echo = htonl(dp->dccps_timestamp_echo); + dp->dccps_timestamp_echo = 0; + } + + elapsed_time_len = dccp_elapsed_time_len(elapsed_time); + len = 6 + elapsed_time_len; + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) + return -1; + + DCCP_SKB_CB(skb)->dccpd_opt_len += len; + + to = skb_push(skb, len); + *to++ = DCCPO_TIMESTAMP_ECHO; + *to++ = len; + + memcpy(to, &tstamp_echo, 4); + to += 4; + + if (elapsed_time_len == 2) { + const __be16 var16 = htons((u16)elapsed_time); + memcpy(to, &var16, 2); + } else if (elapsed_time_len == 4) { + const __be32 var32 = htonl(elapsed_time); + memcpy(to, &var32, 4); + } + + return 0; +} + +static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); + const u16 buflen = dccp_ackvec_buflen(av); + /* Figure out how many options do we need to represent the ackvec */ + const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN); + u16 len = buflen + 2 * nr_opts; + u8 i, nonce = 0; + const unsigned char *tail, *from; + unsigned char *to; + + if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { + DCCP_WARN("Lacking space for %u bytes on %s packet\n", len, + dccp_packet_name(dcb->dccpd_type)); + return -1; + } + /* + * Since Ack Vectors are variable-length, we can not always predict + * their size. To catch exception cases where the space is running out + * on the skb, a separate Sync is scheduled to carry the Ack Vector. 
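+ * A single option carries at most DCCP_SINGLE_OPT_MAXLEN bytes of vector
+ * data, which is why the buffer may be split over nr_opts options below.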
+ */ + if (len > DCCPAV_MIN_OPTLEN && + len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) { + DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), " + "MPS=%u ==> reduce payload size?\n", len, skb->len, + dcb->dccpd_opt_len, dp->dccps_mss_cache); + dp->dccps_sync_scheduled = 1; + return 0; + } + dcb->dccpd_opt_len += len; + + to = skb_push(skb, len); + len = buflen; + from = av->av_buf + av->av_buf_head; + tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN; + + for (i = 0; i < nr_opts; ++i) { + int copylen = len; + + if (len > DCCP_SINGLE_OPT_MAXLEN) + copylen = DCCP_SINGLE_OPT_MAXLEN; + + /* + * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via + * its type; ack_nonce is the sum of all individual buf_nonce's. + */ + nonce ^= av->av_buf_nonce[i]; + + *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i]; + *to++ = copylen + 2; + + /* Check if buf_head wraps */ + if (from + copylen > tail) { + const u16 tailsize = tail - from; + + memcpy(to, from, tailsize); + to += tailsize; + len -= tailsize; + copylen -= tailsize; + from = av->av_buf; + } + + memcpy(to, from, copylen); + from += copylen; + to += copylen; + len -= copylen; + } + /* + * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340. + */ + if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce)) + return -ENOBUFS; + return 0; +} + +/** + * dccp_insert_option_mandatory - Mandatory option (5.8.2) + * Note that since we are using skb_push, this function needs to be called + * _after_ inserting the option it is supposed to influence (stack order). + */ +int dccp_insert_option_mandatory(struct sk_buff *skb) +{ + if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN) + return -1; + + DCCP_SKB_CB(skb)->dccpd_opt_len++; + *skb_push(skb, 1) = DCCPO_MANDATORY; + return 0; +} + +/** + * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb + * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R + * @feat: one out of %dccp_feature_numbers + * @val: NN value or SP array (preferred element first) to copy + * @len: true length of @val in bytes (excluding first element repetition) + * @repeat_first: whether to copy the first element of @val twice + * The last argument is used to construct Confirm options, where the preferred + * value and the preference list appear separately (RFC 4340, 6.3.1). Preference + * lists are kept such that the preferred entry is always first, so we only need + * to copy twice, and avoid the overhead of cloning into a bigger array. 
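+ * E.g. confirming CCID 2 out of a preference list {2, 3} emits the value
+ * bytes 2, 2, 3 after the Feature number (@repeat_first set).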
+ */ +int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, + u8 *val, u8 len, bool repeat_first) +{ + u8 tot_len, *to; + + /* take the `Feature' field and possible repetition into account */ + if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) { + DCCP_WARN("length %u for feature %u too large\n", len, feat); + return -1; + } + + if (unlikely(val == NULL || len == 0)) + len = repeat_first = 0; + tot_len = 3 + repeat_first + len; + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) { + DCCP_WARN("packet too small for feature %d option!\n", feat); + return -1; + } + DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len; + + to = skb_push(skb, tot_len); + *to++ = type; + *to++ = tot_len; + *to++ = feat; + + if (repeat_first) + *to++ = *val; + if (len) + memcpy(to, val, len); + return 0; +} + +/* The length of all options needs to be a multiple of 4 (5.8) */ +static void dccp_insert_option_padding(struct sk_buff *skb) +{ + int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4; + + if (padding != 0) { + padding = 4 - padding; + memset(skb_push(skb, padding), 0, padding); + DCCP_SKB_CB(skb)->dccpd_opt_len += padding; + } +} + +int dccp_insert_options(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); + + DCCP_SKB_CB(skb)->dccpd_opt_len = 0; + + if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb)) + return -1; + + if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { + + /* Feature Negotiation */ + if (dccp_feat_insert_opts(dp, NULL, skb)) + return -1; + + if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) { + /* + * Obtain RTT sample from Request/Response exchange. + * This is currently used for TFRC initialisation. + */ + if (dccp_insert_option_timestamp(skb)) + return -1; + + } else if (dccp_ackvec_pending(sk) && + dccp_insert_option_ackvec(sk, skb)) { + return -1; + } + } + + if (dp->dccps_hc_rx_insert_options) { + if (ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb)) + return -1; + dp->dccps_hc_rx_insert_options = 0; + } + + if (dp->dccps_timestamp_echo != 0 && + dccp_insert_option_timestamp_echo(dp, NULL, skb)) + return -1; + + dccp_insert_option_padding(skb); + return 0; +} + +int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb) +{ + DCCP_SKB_CB(skb)->dccpd_opt_len = 0; + + if (dccp_feat_insert_opts(NULL, dreq, skb)) + return -1; + + /* Obtain RTT sample from Response/Ack exchange (used by TFRC). */ + if (dccp_insert_option_timestamp(skb)) + return -1; + + if (dreq->dreq_timestamp_echo != 0 && + dccp_insert_option_timestamp_echo(NULL, dreq, skb)) + return -1; + + dccp_insert_option_padding(skb); + return 0; +} diff --git a/net/dccp/output.c b/net/dccp/output.c new file mode 100644 index 00000000..fab108e5 --- /dev/null +++ b/net/dccp/output.c @@ -0,0 +1,697 @@ +/* + * net/dccp/output.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include + +#include +#include + +#include "ackvec.h" +#include "ccid.h" +#include "dccp.h" + +static inline void dccp_event_ack_sent(struct sock *sk) +{ + inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); +} + +static void dccp_skb_entail(struct sock *sk, struct sk_buff *skb) +{ + skb_set_owner_w(skb, sk); + WARN_ON(sk->sk_send_head); + sk->sk_send_head = skb; +} + +/* + * All SKB's seen here are completely headerless. It is our + * job to build the DCCP header, and pass the packet down to + * IP so it can do the same plus pass the packet off to the + * device. + */ +static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) +{ + if (likely(skb != NULL)) { + struct inet_sock *inet = inet_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + struct dccp_sock *dp = dccp_sk(sk); + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); + struct dccp_hdr *dh; + /* XXX For now we're using only 48 bits sequence numbers */ + const u32 dccp_header_size = sizeof(*dh) + + sizeof(struct dccp_hdr_ext) + + dccp_packet_hdr_len(dcb->dccpd_type); + int err, set_ack = 1; + u64 ackno = dp->dccps_gsr; + /* + * Increment GSS here already in case the option code needs it. + * Update GSS for real only if option processing below succeeds. + */ + dcb->dccpd_seq = ADD48(dp->dccps_gss, 1); + + switch (dcb->dccpd_type) { + case DCCP_PKT_DATA: + set_ack = 0; + /* fall through */ + case DCCP_PKT_DATAACK: + case DCCP_PKT_RESET: + break; + + case DCCP_PKT_REQUEST: + set_ack = 0; + /* Use ISS on the first (non-retransmitted) Request. */ + if (icsk->icsk_retransmits == 0) + dcb->dccpd_seq = dp->dccps_iss; + /* fall through */ + + case DCCP_PKT_SYNC: + case DCCP_PKT_SYNCACK: + ackno = dcb->dccpd_ack_seq; + /* fall through */ + default: + /* + * Set owner/destructor: some skbs are allocated via + * alloc_skb (e.g. when retransmission may happen). + * Only Data, DataAck, and Reset packets should come + * through here with skb->sk set. + */ + WARN_ON(skb->sk); + skb_set_owner_w(skb, sk); + break; + } + + if (dccp_insert_options(sk, skb)) { + kfree_skb(skb); + return -EPROTO; + } + + + /* Build DCCP header and checksum it. */ + dh = dccp_zeroed_hdr(skb, dccp_header_size); + dh->dccph_type = dcb->dccpd_type; + dh->dccph_sport = inet->inet_sport; + dh->dccph_dport = inet->inet_dport; + dh->dccph_doff = (dccp_header_size + dcb->dccpd_opt_len) / 4; + dh->dccph_ccval = dcb->dccpd_ccval; + dh->dccph_cscov = dp->dccps_pcslen; + /* XXX For now we're using only 48 bits sequence numbers */ + dh->dccph_x = 1; + + dccp_update_gss(sk, dcb->dccpd_seq); + dccp_hdr_set_seq(dh, dp->dccps_gss); + if (set_ack) + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), ackno); + + switch (dcb->dccpd_type) { + case DCCP_PKT_REQUEST: + dccp_hdr_request(skb)->dccph_req_service = + dp->dccps_service; + /* + * Limit Ack window to ISS <= P.ackno <= GSS, so that + * only Responses to Requests we sent are considered. 
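+ * (A valid Response acknowledges one of our Requests, whose sequence
+ * numbers all lie within [ISS, GSS].)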
+ */ + dp->dccps_awl = dp->dccps_iss; + break; + case DCCP_PKT_RESET: + dccp_hdr_reset(skb)->dccph_reset_code = + dcb->dccpd_reset_code; + break; + } + + icsk->icsk_af_ops->send_check(sk, skb); + + if (set_ack) + dccp_event_ack_sent(sk); + + DCCP_INC_STATS(DCCP_MIB_OUTSEGS); + + err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl); + return net_xmit_eval(err); + } + return -ENOBUFS; +} + +/** + * dccp_determine_ccmps - Find out about CCID-specific packet-size limits + * We only consider the HC-sender CCID for setting the CCMPS (RFC 4340, 14.), + * since the RX CCID is restricted to feedback packets (Acks), which are small + * in comparison with the data traffic. A value of 0 means "no current CCMPS". + */ +static u32 dccp_determine_ccmps(const struct dccp_sock *dp) +{ + const struct ccid *tx_ccid = dp->dccps_hc_tx_ccid; + + if (tx_ccid == NULL || tx_ccid->ccid_ops == NULL) + return 0; + return tx_ccid->ccid_ops->ccid_ccmps; +} + +unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct dccp_sock *dp = dccp_sk(sk); + u32 ccmps = dccp_determine_ccmps(dp); + u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; + + /* Account for header lengths and IPv4/v6 option overhead */ + cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len + + sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); + + /* + * Leave enough headroom for common DCCP header options. + * This only considers options which may appear on DCCP-Data packets, as + * per table 3 in RFC 4340, 5.8. When running out of space for other + * options (eg. Ack Vector which can take up to 255 bytes), it is better + * to schedule a separate Ack. Thus we leave headroom for the following: + * - 1 byte for Slow Receiver (11.6) + * - 6 bytes for Timestamp (13.1) + * - 10 bytes for Timestamp Echo (13.3) + * - 8 bytes for NDP count (7.7, when activated) + * - 6 bytes for Data Checksum (9.3) + * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled) + */ + cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 + + (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4); + + /* And store cached results */ + icsk->icsk_pmtu_cookie = pmtu; + dp->dccps_mss_cache = cur_mps; + + return cur_mps; +} + +EXPORT_SYMBOL_GPL(dccp_sync_mss); + +void dccp_write_space(struct sock *sk) +{ + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible(&wq->wait); + /* Should agree with poll, otherwise some programs break */ + if (sock_writeable(sk)) + sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + + rcu_read_unlock(); +} + +/** + * dccp_wait_for_ccid - Await CCID send permission + * @sk: socket to wait for + * @delay: timeout in jiffies + * This is used by CCIDs which need to delay the send time in process context. + */ +static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay) +{ + DEFINE_WAIT(wait); + long remaining; + + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + sk->sk_write_pending++; + release_sock(sk); + + remaining = schedule_timeout(delay); + + lock_sock(sk); + sk->sk_write_pending--; + finish_wait(sk_sleep(sk), &wait); + + if (signal_pending(current) || sk->sk_err) + return -1; + return remaining; +} + +/** + * dccp_xmit_packet - Send data packet under control of CCID + * Transmits next-queued payload and informs CCID to account for the packet. 
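+ * Called from dccp_write_xmit() and dccp_flush_write_queue() once the CCID
+ * grants send permission (CCID_PACKET_SEND_AT_ONCE).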
+ */ +static void dccp_xmit_packet(struct sock *sk) +{ + int err, len; + struct dccp_sock *dp = dccp_sk(sk); + struct sk_buff *skb = dccp_qpolicy_pop(sk); + + if (unlikely(skb == NULL)) + return; + len = skb->len; + + if (sk->sk_state == DCCP_PARTOPEN) { + const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; + /* + * See 8.1.5 - Handshake Completion. + * + * For robustness we resend Confirm options until the client has + * entered OPEN. During the initial feature negotiation, the MPS + * is smaller than usual, reduced by the Change/Confirm options. + */ + if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { + DCCP_WARN("Payload too large (%d) for featneg.\n", len); + dccp_send_ack(sk); + dccp_feat_list_purge(&dp->dccps_featneg); + } + + inet_csk_schedule_ack(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + inet_csk(sk)->icsk_rto, + DCCP_RTO_MAX); + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; + } else if (dccp_ack_pending(sk)) { + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; + } else { + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA; + } + + err = dccp_transmit_skb(sk, skb); + if (err) + dccp_pr_debug("transmit_skb() returned err=%d\n", err); + /* + * Register this one as sent even if an error occurred. To the remote + * end a local packet drop is indistinguishable from network loss, i.e. + * any local drop will eventually be reported via receiver feedback. + */ + ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); + + /* + * If the CCID needs to transfer additional header options out-of-band + * (e.g. Ack Vectors or feature-negotiation options), it activates this + * flag to schedule a Sync. The Sync will automatically incorporate all + * currently pending header options, thus clearing the backlog. + */ + if (dp->dccps_sync_scheduled) + dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); +} + +/** + * dccp_flush_write_queue - Drain queue at end of connection + * Since dccp_sendmsg queues packets without waiting for them to be sent, it may + * happen that the TX queue is not empty at the end of a connection. We give the + * HC-sender CCID a grace period of up to @time_budget jiffies. If this function + * returns with a non-empty write queue, it will be purged later. + */ +void dccp_flush_write_queue(struct sock *sk, long *time_budget) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct sk_buff *skb; + long delay, rc; + + while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) { + rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); + + switch (ccid_packet_dequeue_eval(rc)) { + case CCID_PACKET_WILL_DEQUEUE_LATER: + /* + * If the CCID determines when to send, the next sending + * time is unknown or the CCID may not even send again + * (e.g. remote host crashes or lost Ack packets). 
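+ * Give up here; any packets still queued are purged by the caller later on.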
+ */ + DCCP_WARN("CCID did not manage to send all packets\n"); + return; + case CCID_PACKET_DELAY: + delay = msecs_to_jiffies(rc); + if (delay > *time_budget) + return; + rc = dccp_wait_for_ccid(sk, delay); + if (rc < 0) + return; + *time_budget -= (delay - rc); + /* check again if we can send now */ + break; + case CCID_PACKET_SEND_AT_ONCE: + dccp_xmit_packet(sk); + break; + case CCID_PACKET_ERR: + skb_dequeue(&sk->sk_write_queue); + kfree_skb(skb); + dccp_pr_debug("packet discarded due to err=%ld\n", rc); + } + } +} + +void dccp_write_xmit(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct sk_buff *skb; + + while ((skb = dccp_qpolicy_top(sk))) { + int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); + + switch (ccid_packet_dequeue_eval(rc)) { + case CCID_PACKET_WILL_DEQUEUE_LATER: + return; + case CCID_PACKET_DELAY: + sk_reset_timer(sk, &dp->dccps_xmit_timer, + jiffies + msecs_to_jiffies(rc)); + return; + case CCID_PACKET_SEND_AT_ONCE: + dccp_xmit_packet(sk); + break; + case CCID_PACKET_ERR: + dccp_qpolicy_drop(sk, skb); + dccp_pr_debug("packet discarded due to err=%d\n", rc); + } + } +} + +/** + * dccp_retransmit_skb - Retransmit Request, Close, or CloseReq packets + * There are only four retransmittable packet types in DCCP: + * - Request in client-REQUEST state (sec. 8.1.1), + * - CloseReq in server-CLOSEREQ state (sec. 8.3), + * - Close in node-CLOSING state (sec. 8.3), + * - Acks in client-PARTOPEN state (sec. 8.1.5, handled by dccp_delack_timer()). + * This function expects sk->sk_send_head to contain the original skb. + */ +int dccp_retransmit_skb(struct sock *sk) +{ + WARN_ON(sk->sk_send_head == NULL); + + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk) != 0) + return -EHOSTUNREACH; /* Routing failure or similar. */ + + /* this count is used to distinguish original and retransmitted skb */ + inet_csk(sk)->icsk_retransmits++; + + return dccp_transmit_skb(sk, skb_clone(sk->sk_send_head, GFP_ATOMIC)); +} + +struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, + struct request_sock *req) +{ + struct dccp_hdr *dh; + struct dccp_request_sock *dreq; + const u32 dccp_header_size = sizeof(struct dccp_hdr) + + sizeof(struct dccp_hdr_ext) + + sizeof(struct dccp_hdr_response); + struct sk_buff *skb = sock_wmalloc(sk, sk->sk_prot->max_header, 1, + GFP_ATOMIC); + if (skb == NULL) + return NULL; + + /* Reserve space for headers. */ + skb_reserve(skb, sk->sk_prot->max_header); + + skb_dst_set(skb, dst_clone(dst)); + + dreq = dccp_rsk(req); + if (inet_rsk(req)->acked) /* increase ISS upon retransmission */ + dccp_inc_seqno(&dreq->dreq_iss); + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; + DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss; + + /* Resolve feature dependencies resulting from choice of CCID */ + if (dccp_feat_server_ccid_dependencies(dreq)) + goto response_failed; + + if (dccp_insert_options_rsk(dreq, skb)) + goto response_failed; + + /* Build and checksum header */ + dh = dccp_zeroed_hdr(skb, dccp_header_size); + + dh->dccph_sport = inet_rsk(req)->loc_port; + dh->dccph_dport = inet_rsk(req)->rmt_port; + dh->dccph_doff = (dccp_header_size + + DCCP_SKB_CB(skb)->dccpd_opt_len) / 4; + dh->dccph_type = DCCP_PKT_RESPONSE; + dh->dccph_x = 1; + dccp_hdr_set_seq(dh, dreq->dreq_iss); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dreq->dreq_isr); + dccp_hdr_response(skb)->dccph_resp_service = dreq->dreq_service; + + dccp_csum_outgoing(skb); + + /* We use `acked' to remember that a Response was already sent. 
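+ * A retransmitted Request then causes the ISS to be bumped via the
+ * dccp_inc_seqno() call at the top of this function.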
*/ + inet_rsk(req)->acked = 1; + DCCP_INC_STATS(DCCP_MIB_OUTSEGS); + return skb; +response_failed: + kfree_skb(skb); + return NULL; +} + +EXPORT_SYMBOL_GPL(dccp_make_response); + +/* answer offending packet in @rcv_skb with Reset from control socket @ctl */ +struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *rcv_skb) +{ + struct dccp_hdr *rxdh = dccp_hdr(rcv_skb), *dh; + struct dccp_skb_cb *dcb = DCCP_SKB_CB(rcv_skb); + const u32 dccp_hdr_reset_len = sizeof(struct dccp_hdr) + + sizeof(struct dccp_hdr_ext) + + sizeof(struct dccp_hdr_reset); + struct dccp_hdr_reset *dhr; + struct sk_buff *skb; + + skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC); + if (skb == NULL) + return NULL; + + skb_reserve(skb, sk->sk_prot->max_header); + + /* Swap the send and the receive. */ + dh = dccp_zeroed_hdr(skb, dccp_hdr_reset_len); + dh->dccph_type = DCCP_PKT_RESET; + dh->dccph_sport = rxdh->dccph_dport; + dh->dccph_dport = rxdh->dccph_sport; + dh->dccph_doff = dccp_hdr_reset_len / 4; + dh->dccph_x = 1; + + dhr = dccp_hdr_reset(skb); + dhr->dccph_reset_code = dcb->dccpd_reset_code; + + switch (dcb->dccpd_reset_code) { + case DCCP_RESET_CODE_PACKET_ERROR: + dhr->dccph_reset_data[0] = rxdh->dccph_type; + break; + case DCCP_RESET_CODE_OPTION_ERROR: /* fall through */ + case DCCP_RESET_CODE_MANDATORY_ERROR: + memcpy(dhr->dccph_reset_data, dcb->dccpd_reset_data, 3); + break; + } + /* + * From RFC 4340, 8.3.1: + * If P.ackno exists, set R.seqno := P.ackno + 1. + * Else set R.seqno := 0. + */ + if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_hdr_set_seq(dh, ADD48(dcb->dccpd_ack_seq, 1)); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dcb->dccpd_seq); + + dccp_csum_outgoing(skb); + return skb; +} + +EXPORT_SYMBOL_GPL(dccp_ctl_make_reset); + +/* send Reset on established socket, to close or abort the connection */ +int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code) +{ + struct sk_buff *skb; + /* + * FIXME: what if rebuild_header fails? + * Should we be doing a rebuild_header here? + */ + int err = inet_csk(sk)->icsk_af_ops->rebuild_header(sk); + + if (err != 0) + return err; + + skb = sock_wmalloc(sk, sk->sk_prot->max_header, 1, GFP_ATOMIC); + if (skb == NULL) + return -ENOBUFS; + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, sk->sk_prot->max_header); + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESET; + DCCP_SKB_CB(skb)->dccpd_reset_code = code; + + return dccp_transmit_skb(sk, skb); +} + +/* + * Do all connect socket setups that can be done AF independent. + */ +int dccp_connect(struct sock *sk) +{ + struct sk_buff *skb; + struct dccp_sock *dp = dccp_sk(sk); + struct dst_entry *dst = __sk_dst_get(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + sk->sk_err = 0; + sock_reset_flag(sk, SOCK_DONE); + + dccp_sync_mss(sk, dst_mtu(dst)); + + /* do not connect if feature negotiation setup fails */ + if (dccp_feat_finalise_settings(dccp_sk(sk))) + return -EPROTO; + + /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ + dp->dccps_gar = dp->dccps_iss; + + skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); + if (unlikely(skb == NULL)) + return -ENOBUFS; + + /* Reserve space for headers. */ + skb_reserve(skb, sk->sk_prot->max_header); + + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST; + + dccp_skb_entail(sk, skb); + dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)); + DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); + + /* Timer for repeating the REQUEST until an answer. 
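+ * The timeout starts at icsk_rto and is bounded above by DCCP_RTO_MAX.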
*/ + icsk->icsk_retransmits = 0; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + icsk->icsk_rto, DCCP_RTO_MAX); + return 0; +} + +EXPORT_SYMBOL_GPL(dccp_connect); + +void dccp_send_ack(struct sock *sk) +{ + /* If we have been reset, we may not send again. */ + if (sk->sk_state != DCCP_CLOSED) { + struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header, + GFP_ATOMIC); + + if (skb == NULL) { + inet_csk_schedule_ack(sk); + inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MAX, + DCCP_RTO_MAX); + return; + } + + /* Reserve space for headers */ + skb_reserve(skb, sk->sk_prot->max_header); + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_ACK; + dccp_transmit_skb(sk, skb); + } +} + +EXPORT_SYMBOL_GPL(dccp_send_ack); + +#if 0 +/* FIXME: Is this still necessary (11.3) - currently nowhere used by DCCP. */ +void dccp_send_delayed_ack(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + /* + * FIXME: tune this timer. elapsed time fixes the skew, so no problem + * with using 2s, and active senders also piggyback the ACK into a + * DATAACK packet, so this is really for quiescent senders. + */ + unsigned long timeout = jiffies + 2 * HZ; + + /* Use new timeout only if there wasn't a older one earlier. */ + if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { + /* If delack timer was blocked or is about to expire, + * send ACK now. + * + * FIXME: check the "about to expire" part + */ + if (icsk->icsk_ack.blocked) { + dccp_send_ack(sk); + return; + } + + if (!time_before(timeout, icsk->icsk_ack.timeout)) + timeout = icsk->icsk_ack.timeout; + } + icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; + icsk->icsk_ack.timeout = timeout; + sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); +} +#endif + +void dccp_send_sync(struct sock *sk, const u64 ackno, + const enum dccp_pkt_type pkt_type) +{ + /* + * We are not putting this on the write queue, so + * dccp_transmit_skb() will set the ownership to this + * sock. + */ + struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC); + + if (skb == NULL) { + /* FIXME: how to make sure the sync is sent? */ + DCCP_CRIT("could not send %s", dccp_packet_name(pkt_type)); + return; + } + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, sk->sk_prot->max_header); + DCCP_SKB_CB(skb)->dccpd_type = pkt_type; + DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno; + + /* + * Clear the flag in case the Sync was scheduled for out-of-band data, + * such as carrying a long Ack Vector. + */ + dccp_sk(sk)->dccps_sync_scheduled = 0; + + dccp_transmit_skb(sk, skb); +} + +EXPORT_SYMBOL_GPL(dccp_send_sync); + +/* + * Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This + * cannot be allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under + * any circumstances. + */ +void dccp_send_close(struct sock *sk, const int active) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct sk_buff *skb; + const gfp_t prio = active ? GFP_KERNEL : GFP_ATOMIC; + + skb = alloc_skb(sk->sk_prot->max_header, prio); + if (skb == NULL) + return; + + /* Reserve space for headers and prepare control bits. 
*/ + skb_reserve(skb, sk->sk_prot->max_header); + if (dp->dccps_role == DCCP_ROLE_SERVER && !dp->dccps_server_timewait) + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSEREQ; + else + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; + + if (active) { + dccp_skb_entail(sk, skb); + dccp_transmit_skb(sk, skb_clone(skb, prio)); + /* + * Retransmission timer for active-close: RFC 4340, 8.3 requires + * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ + * state can be left. The initial timeout is 2 RTTs. + * Since RTT measurement is done by the CCIDs, there is no easy + * way to get an RTT sample. The fallback RTT from RFC 4340, 3.4 + * is too low (200ms); we use a high value to avoid unnecessary + * retransmissions when the link RTT is > 0.2 seconds. + * FIXME: Let main module sample RTTs and use that instead. + */ + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); + } else + dccp_transmit_skb(sk, skb); +} diff --git a/net/dccp/probe.c b/net/dccp/probe.c new file mode 100644 index 00000000..33d0e629 --- /dev/null +++ b/net/dccp/probe.c @@ -0,0 +1,198 @@ +/* + * dccp_probe - Observe the DCCP flow with kprobes. + * + * The idea for this came from Werner Almesberger's umlsim + * Copyright (C) 2004, Stephen Hemminger + * + * Modified for DCCP from Stephen Hemminger's code + * Copyright (C) 2006, Ian McDonald + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dccp.h" +#include "ccid.h" +#include "ccids/ccid3.h" + +static int port; + +static int bufsize = 64 * 1024; + +static const char procname[] = "dccpprobe"; + +static struct { + struct kfifo fifo; + spinlock_t lock; + wait_queue_head_t wait; + struct timespec tstart; +} dccpw; + +static void printl(const char *fmt, ...) 
+{ + va_list args; + int len; + struct timespec now; + char tbuf[256]; + + va_start(args, fmt); + getnstimeofday(&now); + + now = timespec_sub(now, dccpw.tstart); + + len = sprintf(tbuf, "%lu.%06lu ", + (unsigned long) now.tv_sec, + (unsigned long) now.tv_nsec / NSEC_PER_USEC); + len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); + va_end(args); + + kfifo_in_locked(&dccpw.fifo, tbuf, len, &dccpw.lock); + wake_up(&dccpw.wait); +} + +static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t size) +{ + const struct inet_sock *inet = inet_sk(sk); + struct ccid3_hc_tx_sock *hc = NULL; + + if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3) + hc = ccid3_hc_tx_sk(sk); + + if (port == 0 || ntohs(inet->inet_dport) == port || + ntohs(inet->inet_sport) == port) { + if (hc) + printl("%pI4:%u %pI4:%u %d %d %d %d %u %llu %llu %d\n", + &inet->inet_saddr, ntohs(inet->inet_sport), + &inet->inet_daddr, ntohs(inet->inet_dport), size, + hc->tx_s, hc->tx_rtt, hc->tx_p, + hc->tx_x_calc, hc->tx_x_recv >> 6, + hc->tx_x >> 6, hc->tx_t_ipi); + else + printl("%pI4:%u %pI4:%u %d\n", + &inet->inet_saddr, ntohs(inet->inet_sport), + &inet->inet_daddr, ntohs(inet->inet_dport), + size); + } + + jprobe_return(); + return 0; +} + +static struct jprobe dccp_send_probe = { + .kp = { + .symbol_name = "dccp_sendmsg", + }, + .entry = jdccp_sendmsg, +}; + +static int dccpprobe_open(struct inode *inode, struct file *file) +{ + kfifo_reset(&dccpw.fifo); + getnstimeofday(&dccpw.tstart); + return 0; +} + +static ssize_t dccpprobe_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + int error = 0, cnt = 0; + unsigned char *tbuf; + + if (!buf) + return -EINVAL; + + if (len == 0) + return 0; + + tbuf = vmalloc(len); + if (!tbuf) + return -ENOMEM; + + error = wait_event_interruptible(dccpw.wait, + kfifo_len(&dccpw.fifo) != 0); + if (error) + goto out_free; + + cnt = kfifo_out_locked(&dccpw.fifo, tbuf, len, &dccpw.lock); + error = copy_to_user(buf, tbuf, cnt) ? -EFAULT : 0; + +out_free: + vfree(tbuf); + + return error ? 
error : cnt; +} + +static const struct file_operations dccpprobe_fops = { + .owner = THIS_MODULE, + .open = dccpprobe_open, + .read = dccpprobe_read, + .llseek = noop_llseek, +}; + +static __init int dccpprobe_init(void) +{ + int ret = -ENOMEM; + + init_waitqueue_head(&dccpw.wait); + spin_lock_init(&dccpw.lock); + if (kfifo_alloc(&dccpw.fifo, bufsize, GFP_KERNEL)) + return ret; + if (!proc_net_fops_create(&init_net, procname, S_IRUSR, &dccpprobe_fops)) + goto err0; + + try_then_request_module((ret = register_jprobe(&dccp_send_probe)) == 0, + "dccp"); + if (ret) + goto err1; + + pr_info("DCCP watch registered (port=%d)\n", port); + return 0; +err1: + proc_net_remove(&init_net, procname); +err0: + kfifo_free(&dccpw.fifo); + return ret; +} +module_init(dccpprobe_init); + +static __exit void dccpprobe_exit(void) +{ + kfifo_free(&dccpw.fifo); + proc_net_remove(&init_net, procname); + unregister_jprobe(&dccp_send_probe); + +} +module_exit(dccpprobe_exit); + +MODULE_PARM_DESC(port, "Port to match (0=all)"); +module_param(port, int, 0); + +MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)"); +module_param(bufsize, int, 0); + +MODULE_AUTHOR("Ian McDonald "); +MODULE_DESCRIPTION("DCCP snooper"); +MODULE_LICENSE("GPL"); diff --git a/net/dccp/proto.c b/net/dccp/proto.c new file mode 100644 index 00000000..152975d9 --- /dev/null +++ b/net/dccp/proto.c @@ -0,0 +1,1257 @@ +/* + * net/dccp/proto.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "ccid.h" +#include "dccp.h" +#include "feat.h" + +DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; + +EXPORT_SYMBOL_GPL(dccp_statistics); + +struct percpu_counter dccp_orphan_count; +EXPORT_SYMBOL_GPL(dccp_orphan_count); + +struct inet_hashinfo dccp_hashinfo; +EXPORT_SYMBOL_GPL(dccp_hashinfo); + +/* the maximum queue length for tx in packets. 
0 is no limit */ +int sysctl_dccp_tx_qlen __read_mostly = 5; + +#ifdef CONFIG_IP_DCCP_DEBUG +static const char *dccp_state_name(const int state) +{ + static const char *const dccp_state_names[] = { + [DCCP_OPEN] = "OPEN", + [DCCP_REQUESTING] = "REQUESTING", + [DCCP_PARTOPEN] = "PARTOPEN", + [DCCP_LISTEN] = "LISTEN", + [DCCP_RESPOND] = "RESPOND", + [DCCP_CLOSING] = "CLOSING", + [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ", + [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE", + [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ", + [DCCP_TIME_WAIT] = "TIME_WAIT", + [DCCP_CLOSED] = "CLOSED", + }; + + if (state >= DCCP_MAX_STATES) + return "INVALID STATE!"; + else + return dccp_state_names[state]; +} +#endif + +void dccp_set_state(struct sock *sk, const int state) +{ + const int oldstate = sk->sk_state; + + dccp_pr_debug("%s(%p) %s --> %s\n", dccp_role(sk), sk, + dccp_state_name(oldstate), dccp_state_name(state)); + WARN_ON(state == oldstate); + + switch (state) { + case DCCP_OPEN: + if (oldstate != DCCP_OPEN) + DCCP_INC_STATS(DCCP_MIB_CURRESTAB); + /* Client retransmits all Confirm options until entering OPEN */ + if (oldstate == DCCP_PARTOPEN) + dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg); + break; + + case DCCP_CLOSED: + if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ || + oldstate == DCCP_CLOSING) + DCCP_INC_STATS(DCCP_MIB_ESTABRESETS); + + sk->sk_prot->unhash(sk); + if (inet_csk(sk)->icsk_bind_hash != NULL && + !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) + inet_put_port(sk); + /* fall through */ + default: + if (oldstate == DCCP_OPEN) + DCCP_DEC_STATS(DCCP_MIB_CURRESTAB); + } + + /* Change state AFTER socket is unhashed to avoid closed + * socket sitting in hash tables. + */ + sk->sk_state = state; +} + +EXPORT_SYMBOL_GPL(dccp_set_state); + +static void dccp_finish_passive_close(struct sock *sk) +{ + switch (sk->sk_state) { + case DCCP_PASSIVE_CLOSE: + /* Node (client or server) has received Close packet. */ + dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); + dccp_set_state(sk, DCCP_CLOSED); + break; + case DCCP_PASSIVE_CLOSEREQ: + /* + * Client received CloseReq. We set the `active' flag so that + * dccp_send_close() retransmits the Close as per RFC 4340, 8.3. 
+ */ + dccp_send_close(sk, 1); + dccp_set_state(sk, DCCP_CLOSING); + } +} + +void dccp_done(struct sock *sk) +{ + dccp_set_state(sk, DCCP_CLOSED); + dccp_clear_xmit_timers(sk); + + sk->sk_shutdown = SHUTDOWN_MASK; + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + else + inet_csk_destroy_sock(sk); +} + +EXPORT_SYMBOL_GPL(dccp_done); + +const char *dccp_packet_name(const int type) +{ + static const char *const dccp_packet_names[] = { + [DCCP_PKT_REQUEST] = "REQUEST", + [DCCP_PKT_RESPONSE] = "RESPONSE", + [DCCP_PKT_DATA] = "DATA", + [DCCP_PKT_ACK] = "ACK", + [DCCP_PKT_DATAACK] = "DATAACK", + [DCCP_PKT_CLOSEREQ] = "CLOSEREQ", + [DCCP_PKT_CLOSE] = "CLOSE", + [DCCP_PKT_RESET] = "RESET", + [DCCP_PKT_SYNC] = "SYNC", + [DCCP_PKT_SYNCACK] = "SYNCACK", + }; + + if (type >= DCCP_NR_PKT_TYPES) + return "INVALID"; + else + return dccp_packet_names[type]; +} + +EXPORT_SYMBOL_GPL(dccp_packet_name); + +int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + icsk->icsk_rto = DCCP_TIMEOUT_INIT; + icsk->icsk_syn_retries = sysctl_dccp_request_retries; + sk->sk_state = DCCP_CLOSED; + sk->sk_write_space = dccp_write_space; + icsk->icsk_sync_mss = dccp_sync_mss; + dp->dccps_mss_cache = 536; + dp->dccps_rate_last = jiffies; + dp->dccps_role = DCCP_ROLE_UNDEFINED; + dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; + dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1; + dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; + + dccp_init_xmit_timers(sk); + + INIT_LIST_HEAD(&dp->dccps_featneg); + /* control socket doesn't need feat nego */ + if (likely(ctl_sock_initialized)) + return dccp_feat_init(sk); + return 0; +} + +EXPORT_SYMBOL_GPL(dccp_init_sock); + +void dccp_destroy_sock(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + + /* + * DCCP doesn't use sk_write_queue, just sk_send_head + * for retransmissions + */ + if (sk->sk_send_head != NULL) { + kfree_skb(sk->sk_send_head); + sk->sk_send_head = NULL; + } + + /* Clean up a referenced DCCP bind bucket. */ + if (inet_csk(sk)->icsk_bind_hash != NULL) + inet_put_port(sk); + + kfree(dp->dccps_service_list); + dp->dccps_service_list = NULL; + + if (dp->dccps_hc_rx_ackvec != NULL) { + dccp_ackvec_free(dp->dccps_hc_rx_ackvec); + dp->dccps_hc_rx_ackvec = NULL; + } + ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); + ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); + dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; + + /* clean up feature negotiation state */ + dccp_feat_list_purge(&dp->dccps_featneg); +} + +EXPORT_SYMBOL_GPL(dccp_destroy_sock); + +static inline int dccp_listen_start(struct sock *sk, int backlog) +{ + struct dccp_sock *dp = dccp_sk(sk); + + dp->dccps_role = DCCP_ROLE_LISTEN; + /* do not start to listen if feature negotiation setup fails */ + if (dccp_feat_finalise_settings(dp)) + return -EPROTO; + return inet_csk_listen_start(sk, backlog); +} + +static inline int dccp_need_reset(int state) +{ + return state != DCCP_CLOSED && state != DCCP_LISTEN && + state != DCCP_REQUESTING; +} + +int dccp_disconnect(struct sock *sk, int flags) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_sock *inet = inet_sk(sk); + int err = 0; + const int old_state = sk->sk_state; + + if (old_state != DCCP_CLOSED) + dccp_set_state(sk, DCCP_CLOSED); + + /* + * This corresponds to the ABORT function of RFC793, sec. 3.8 + * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted". 
+ */ + if (old_state == DCCP_LISTEN) { + inet_csk_listen_stop(sk); + } else if (dccp_need_reset(old_state)) { + dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); + sk->sk_err = ECONNRESET; + } else if (old_state == DCCP_REQUESTING) + sk->sk_err = ECONNRESET; + + dccp_clear_xmit_timers(sk); + + __skb_queue_purge(&sk->sk_receive_queue); + __skb_queue_purge(&sk->sk_write_queue); + if (sk->sk_send_head != NULL) { + __kfree_skb(sk->sk_send_head); + sk->sk_send_head = NULL; + } + + inet->inet_dport = 0; + + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); + + sk->sk_shutdown = 0; + sock_reset_flag(sk, SOCK_DONE); + + icsk->icsk_backoff = 0; + inet_csk_delack_init(sk); + __sk_dst_reset(sk); + + WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); + + sk->sk_error_report(sk); + return err; +} + +EXPORT_SYMBOL_GPL(dccp_disconnect); + +/* + * Wait for a DCCP event. + * + * Note that we don't need to lock the socket, as the upper poll layers + * take care of normal races (between the test and the event) and we don't + * go look at any of the socket buffers directly. + */ +unsigned int dccp_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + unsigned int mask; + struct sock *sk = sock->sk; + + sock_poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_state == DCCP_LISTEN) + return inet_csk_listen_poll(sk); + + /* Socket is not locked. We are protected from async events + by poll logic and correct handling of state changes + made by another threads is impossible in any case. + */ + + mask = 0; + if (sk->sk_err) + mask = POLLERR; + + if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED) + mask |= POLLHUP; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= POLLIN | POLLRDNORM | POLLRDHUP; + + /* Connected? */ + if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) { + if (atomic_read(&sk->sk_rmem_alloc) > 0) + mask |= POLLIN | POLLRDNORM; + + if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { + mask |= POLLOUT | POLLWRNORM; + } else { /* send SIGIO later */ + set_bit(SOCK_ASYNC_NOSPACE, + &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + + /* Race breaker. If space is freed after + * wspace test but before the flags are set, + * IO signal will be lost. + */ + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) + mask |= POLLOUT | POLLWRNORM; + } + } + } + return mask; +} + +EXPORT_SYMBOL_GPL(dccp_poll); + +int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + int rc = -ENOTCONN; + + lock_sock(sk); + + if (sk->sk_state == DCCP_LISTEN) + goto out; + + switch (cmd) { + case SIOCINQ: { + struct sk_buff *skb; + unsigned long amount = 0; + + skb = skb_peek(&sk->sk_receive_queue); + if (skb != NULL) { + /* + * We will only return the amount of this packet since + * that is all that will be read. 
+ */ + amount = skb->len; + } + rc = put_user(amount, (int __user *)arg); + } + break; + default: + rc = -ENOIOCTLCMD; + break; + } +out: + release_sock(sk); + return rc; +} + +EXPORT_SYMBOL_GPL(dccp_ioctl); + +static int dccp_setsockopt_service(struct sock *sk, const __be32 service, + char __user *optval, unsigned int optlen) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct dccp_service_list *sl = NULL; + + if (service == DCCP_SERVICE_INVALID_VALUE || + optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32)) + return -EINVAL; + + if (optlen > sizeof(service)) { + sl = kmalloc(optlen, GFP_KERNEL); + if (sl == NULL) + return -ENOMEM; + + sl->dccpsl_nr = optlen / sizeof(u32) - 1; + if (copy_from_user(sl->dccpsl_list, + optval + sizeof(service), + optlen - sizeof(service)) || + dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) { + kfree(sl); + return -EFAULT; + } + } + + lock_sock(sk); + dp->dccps_service = service; + + kfree(dp->dccps_service_list); + + dp->dccps_service_list = sl; + release_sock(sk); + return 0; +} + +static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) +{ + u8 *list, len; + int i, rc; + + if (cscov < 0 || cscov > 15) + return -EINVAL; + /* + * Populate a list of permissible values, in the range cscov...15. This + * is necessary since feature negotiation of single values only works if + * both sides incidentally choose the same value. Since the list starts + * lowest-value first, negotiation will pick the smallest shared value. + */ + if (cscov == 0) + return 0; + len = 16 - cscov; + + list = kmalloc(len, GFP_KERNEL); + if (list == NULL) + return -ENOBUFS; + + for (i = 0; i < len; i++) + list[i] = cscov++; + + rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len); + + if (rc == 0) { + if (rx) + dccp_sk(sk)->dccps_pcrlen = cscov; + else + dccp_sk(sk)->dccps_pcslen = cscov; + } + kfree(list); + return rc; +} + +static int dccp_setsockopt_ccid(struct sock *sk, int type, + char __user *optval, unsigned int optlen) +{ + u8 *val; + int rc = 0; + + if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) + return -EINVAL; + + val = memdup_user(optval, optlen); + if (IS_ERR(val)) + return PTR_ERR(val); + + lock_sock(sk); + if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) + rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); + + if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) + rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); + release_sock(sk); + + kfree(val); + return rc; +} + +static int do_dccp_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct dccp_sock *dp = dccp_sk(sk); + int val, err = 0; + + switch (optname) { + case DCCP_SOCKOPT_PACKET_SIZE: + DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); + return 0; + case DCCP_SOCKOPT_CHANGE_L: + case DCCP_SOCKOPT_CHANGE_R: + DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); + return 0; + case DCCP_SOCKOPT_CCID: + case DCCP_SOCKOPT_RX_CCID: + case DCCP_SOCKOPT_TX_CCID: + return dccp_setsockopt_ccid(sk, optname, optval, optlen); + } + + if (optlen < (int)sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + if (optname == DCCP_SOCKOPT_SERVICE) + return dccp_setsockopt_service(sk, val, optval, optlen); + + lock_sock(sk); + switch (optname) { + case DCCP_SOCKOPT_SERVER_TIMEWAIT: + if (dp->dccps_role != DCCP_ROLE_SERVER) + err = -EOPNOTSUPP; + else + dp->dccps_server_timewait = (val != 0); + break; + case DCCP_SOCKOPT_SEND_CSCOV: 
+ err = dccp_setsockopt_cscov(sk, val, false); + break; + case DCCP_SOCKOPT_RECV_CSCOV: + err = dccp_setsockopt_cscov(sk, val, true); + break; + case DCCP_SOCKOPT_QPOLICY_ID: + if (sk->sk_state != DCCP_CLOSED) + err = -EISCONN; + else if (val < 0 || val >= DCCPQ_POLICY_MAX) + err = -EINVAL; + else + dp->dccps_qpolicy = val; + break; + case DCCP_SOCKOPT_QPOLICY_TXQLEN: + if (val < 0) + err = -EINVAL; + else + dp->dccps_tx_qlen = val; + break; + default: + err = -ENOPROTOOPT; + break; + } + release_sock(sk); + + return err; +} + +int dccp_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + if (level != SOL_DCCP) + return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level, + optname, optval, + optlen); + return do_dccp_setsockopt(sk, level, optname, optval, optlen); +} + +EXPORT_SYMBOL_GPL(dccp_setsockopt); + +#ifdef CONFIG_COMPAT +int compat_dccp_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + if (level != SOL_DCCP) + return inet_csk_compat_setsockopt(sk, level, optname, + optval, optlen); + return do_dccp_setsockopt(sk, level, optname, optval, optlen); +} + +EXPORT_SYMBOL_GPL(compat_dccp_setsockopt); +#endif + +static int dccp_getsockopt_service(struct sock *sk, int len, + __be32 __user *optval, + int __user *optlen) +{ + const struct dccp_sock *dp = dccp_sk(sk); + const struct dccp_service_list *sl; + int err = -ENOENT, slen = 0, total_len = sizeof(u32); + + lock_sock(sk); + if ((sl = dp->dccps_service_list) != NULL) { + slen = sl->dccpsl_nr * sizeof(u32); + total_len += slen; + } + + err = -EINVAL; + if (total_len > len) + goto out; + + err = 0; + if (put_user(total_len, optlen) || + put_user(dp->dccps_service, optval) || + (sl != NULL && copy_to_user(optval + 1, sl->dccpsl_list, slen))) + err = -EFAULT; +out: + release_sock(sk); + return err; +} + +static int do_dccp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct dccp_sock *dp; + int val, len; + + if (get_user(len, optlen)) + return -EFAULT; + + if (len < (int)sizeof(int)) + return -EINVAL; + + dp = dccp_sk(sk); + + switch (optname) { + case DCCP_SOCKOPT_PACKET_SIZE: + DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); + return 0; + case DCCP_SOCKOPT_SERVICE: + return dccp_getsockopt_service(sk, len, + (__be32 __user *)optval, optlen); + case DCCP_SOCKOPT_GET_CUR_MPS: + val = dp->dccps_mss_cache; + break; + case DCCP_SOCKOPT_AVAILABLE_CCIDS: + return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); + case DCCP_SOCKOPT_TX_CCID: + val = ccid_get_current_tx_ccid(dp); + if (val < 0) + return -ENOPROTOOPT; + break; + case DCCP_SOCKOPT_RX_CCID: + val = ccid_get_current_rx_ccid(dp); + if (val < 0) + return -ENOPROTOOPT; + break; + case DCCP_SOCKOPT_SERVER_TIMEWAIT: + val = dp->dccps_server_timewait; + break; + case DCCP_SOCKOPT_SEND_CSCOV: + val = dp->dccps_pcslen; + break; + case DCCP_SOCKOPT_RECV_CSCOV: + val = dp->dccps_pcrlen; + break; + case DCCP_SOCKOPT_QPOLICY_ID: + val = dp->dccps_qpolicy; + break; + case DCCP_SOCKOPT_QPOLICY_TXQLEN: + val = dp->dccps_tx_qlen; + break; + case 128 ... 191: + return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, + len, (u32 __user *)optval, optlen); + case 192 ... 
255: + return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname, + len, (u32 __user *)optval, optlen); + default: + return -ENOPROTOOPT; + } + + len = sizeof(val); + if (put_user(len, optlen) || copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + +int dccp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + if (level != SOL_DCCP) + return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level, + optname, optval, + optlen); + return do_dccp_getsockopt(sk, level, optname, optval, optlen); +} + +EXPORT_SYMBOL_GPL(dccp_getsockopt); + +#ifdef CONFIG_COMPAT +int compat_dccp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + if (level != SOL_DCCP) + return inet_csk_compat_getsockopt(sk, level, optname, + optval, optlen); + return do_dccp_getsockopt(sk, level, optname, optval, optlen); +} + +EXPORT_SYMBOL_GPL(compat_dccp_getsockopt); +#endif + +static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) +{ + struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg); + + /* + * Assign an (opaque) qpolicy priority value to skb->priority. + * + * We are overloading this skb field for use with the qpolicy subystem. + * The skb->priority is normally used for the SO_PRIORITY option, which + * is initialised from sk_priority. Since the assignment of sk_priority + * to skb->priority happens later (on layer 3), we overload this field + * for use with queueing priorities as long as the skb is on layer 4. + * The default priority value (if nothing is set) is 0. + */ + skb->priority = 0; + + for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) { + + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_DCCP) + continue; + + if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX && + !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type)) + return -EINVAL; + + switch (cmsg->cmsg_type) { + case DCCP_SCM_PRIORITY: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32))) + return -EINVAL; + skb->priority = *(__u32 *)CMSG_DATA(cmsg); + break; + default: + return -EINVAL; + } + } + return 0; +} + +int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len) +{ + const struct dccp_sock *dp = dccp_sk(sk); + const int flags = msg->msg_flags; + const int noblock = flags & MSG_DONTWAIT; + struct sk_buff *skb; + int rc, size; + long timeo; + + if (len > dp->dccps_mss_cache) + return -EMSGSIZE; + + lock_sock(sk); + + if (dccp_qpolicy_full(sk)) { + rc = -EAGAIN; + goto out_release; + } + + timeo = sock_sndtimeo(sk, noblock); + + /* + * We have to use sk_stream_wait_connect here to set sk_write_pending, + * so that the trick in dccp_rcv_request_sent_state_process. + */ + /* Wait for a connection to finish. */ + if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) + if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0) + goto out_release; + + size = sk->sk_prot->max_header + len; + release_sock(sk); + skb = sock_alloc_send_skb(sk, size, noblock, &rc); + lock_sock(sk); + if (skb == NULL) + goto out_release; + + skb_reserve(skb, sk->sk_prot->max_header); + rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); + if (rc != 0) + goto out_discard; + + rc = dccp_msghdr_parse(msg, skb); + if (rc != 0) + goto out_discard; + + dccp_qpolicy_push(sk, skb); + /* + * The xmit_timer is set if the TX CCID is rate-based and will expire + * when congestion control permits to release further packets into the + * network. Window-based CCIDs do not use this timer. 
+ */ + if (!timer_pending(&dp->dccps_xmit_timer)) + dccp_write_xmit(sk); +out_release: + release_sock(sk); + return rc ? : len; +out_discard: + kfree_skb(skb); + goto out_release; +} + +EXPORT_SYMBOL_GPL(dccp_sendmsg); + +int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int nonblock, int flags, int *addr_len) +{ + const struct dccp_hdr *dh; + long timeo; + + lock_sock(sk); + + if (sk->sk_state == DCCP_LISTEN) { + len = -ENOTCONN; + goto out; + } + + timeo = sock_rcvtimeo(sk, nonblock); + + do { + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + + if (skb == NULL) + goto verify_sock_status; + + dh = dccp_hdr(skb); + + switch (dh->dccph_type) { + case DCCP_PKT_DATA: + case DCCP_PKT_DATAACK: + goto found_ok_skb; + + case DCCP_PKT_CLOSE: + case DCCP_PKT_CLOSEREQ: + if (!(flags & MSG_PEEK)) + dccp_finish_passive_close(sk); + /* fall through */ + case DCCP_PKT_RESET: + dccp_pr_debug("found fin (%s) ok!\n", + dccp_packet_name(dh->dccph_type)); + len = 0; + goto found_fin_ok; + default: + dccp_pr_debug("packet_type=%s\n", + dccp_packet_name(dh->dccph_type)); + sk_eat_skb(sk, skb, 0); + } +verify_sock_status: + if (sock_flag(sk, SOCK_DONE)) { + len = 0; + break; + } + + if (sk->sk_err) { + len = sock_error(sk); + break; + } + + if (sk->sk_shutdown & RCV_SHUTDOWN) { + len = 0; + break; + } + + if (sk->sk_state == DCCP_CLOSED) { + if (!sock_flag(sk, SOCK_DONE)) { + /* This occurs when user tries to read + * from never connected socket. + */ + len = -ENOTCONN; + break; + } + len = 0; + break; + } + + if (!timeo) { + len = -EAGAIN; + break; + } + + if (signal_pending(current)) { + len = sock_intr_errno(timeo); + break; + } + + sk_wait_data(sk, &timeo); + continue; + found_ok_skb: + if (len > skb->len) + len = skb->len; + else if (len < skb->len) + msg->msg_flags |= MSG_TRUNC; + + if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) { + /* Exception. Bailout! */ + len = -EFAULT; + break; + } + if (flags & MSG_TRUNC) + len = skb->len; + found_fin_ok: + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb, 0); + break; + } while (1); +out: + release_sock(sk); + return len; +} + +EXPORT_SYMBOL_GPL(dccp_recvmsg); + +int inet_dccp_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + unsigned char old_state; + int err; + + lock_sock(sk); + + err = -EINVAL; + if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP) + goto out; + + old_state = sk->sk_state; + if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) + goto out; + + /* Really, if the socket is already in listen state + * we can only allow the backlog to be adjusted. 
+ */ + if (old_state != DCCP_LISTEN) { + /* + * FIXME: here it probably should be sk->sk_prot->listen_start + * see tcp_listen_start + */ + err = dccp_listen_start(sk, backlog); + if (err) + goto out; + } + sk->sk_max_ack_backlog = backlog; + err = 0; + +out: + release_sock(sk); + return err; +} + +EXPORT_SYMBOL_GPL(inet_dccp_listen); + +static void dccp_terminate_connection(struct sock *sk) +{ + u8 next_state = DCCP_CLOSED; + + switch (sk->sk_state) { + case DCCP_PASSIVE_CLOSE: + case DCCP_PASSIVE_CLOSEREQ: + dccp_finish_passive_close(sk); + break; + case DCCP_PARTOPEN: + dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk); + inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); + /* fall through */ + case DCCP_OPEN: + dccp_send_close(sk, 1); + + if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER && + !dccp_sk(sk)->dccps_server_timewait) + next_state = DCCP_ACTIVE_CLOSEREQ; + else + next_state = DCCP_CLOSING; + /* fall through */ + default: + dccp_set_state(sk, next_state); + } +} + +void dccp_close(struct sock *sk, long timeout) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct sk_buff *skb; + u32 data_was_unread = 0; + int state; + + lock_sock(sk); + + sk->sk_shutdown = SHUTDOWN_MASK; + + if (sk->sk_state == DCCP_LISTEN) { + dccp_set_state(sk, DCCP_CLOSED); + + /* Special case. */ + inet_csk_listen_stop(sk); + + goto adjudge_to_death; + } + + sk_stop_timer(sk, &dp->dccps_xmit_timer); + + /* + * We need to flush the recv. buffs. We do this only on the + * descriptor close, not protocol-sourced closes, because the + *reader process may not have drained the data yet! + */ + while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { + data_was_unread += skb->len; + __kfree_skb(skb); + } + + if (data_was_unread) { + /* Unread data was tossed, send an appropriate Reset Code */ + DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread); + dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); + dccp_set_state(sk, DCCP_CLOSED); + } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { + /* Check zero linger _after_ checking for unread data. */ + sk->sk_prot->disconnect(sk, 0); + } else if (sk->sk_state != DCCP_CLOSED) { + /* + * Normal connection termination. May need to wait if there are + * still packets in the TX queue that are delayed by the CCID. + */ + dccp_flush_write_queue(sk, &timeout); + dccp_terminate_connection(sk); + } + + /* + * Flush write queue. This may be necessary in several cases: + * - we have been closed by the peer but still have application data; + * - abortive termination (unread data or zero linger time), + * - normal termination but queue could not be flushed within time limit + */ + __skb_queue_purge(&sk->sk_write_queue); + + sk_stream_wait_close(sk, timeout); + +adjudge_to_death: + state = sk->sk_state; + sock_hold(sk); + sock_orphan(sk); + + /* + * It is the last release_sock in its life. It will remove backlog. + */ + release_sock(sk); + /* + * Now socket is owned by kernel and we acquire BH lock + * to finish close. No need to check for user refs. + */ + local_bh_disable(); + bh_lock_sock(sk); + WARN_ON(sock_owned_by_user(sk)); + + percpu_counter_inc(sk->sk_prot->orphan_count); + + /* Have we already been destroyed by a softirq or backlog? */ + if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED) + goto out; + + if (sk->sk_state == DCCP_CLOSED) + inet_csk_destroy_sock(sk); + + /* Otherwise, socket is reprieved until protocol close. 
*/ + +out: + bh_unlock_sock(sk); + local_bh_enable(); + sock_put(sk); +} + +EXPORT_SYMBOL_GPL(dccp_close); + +void dccp_shutdown(struct sock *sk, int how) +{ + dccp_pr_debug("called shutdown(%x)\n", how); +} + +EXPORT_SYMBOL_GPL(dccp_shutdown); + +static inline int dccp_mib_init(void) +{ + return snmp_mib_init((void __percpu **)dccp_statistics, + sizeof(struct dccp_mib), + __alignof__(struct dccp_mib)); +} + +static inline void dccp_mib_exit(void) +{ + snmp_mib_free((void __percpu **)dccp_statistics); +} + +static int thash_entries; +module_param(thash_entries, int, 0444); +MODULE_PARM_DESC(thash_entries, "Number of ehash buckets"); + +#ifdef CONFIG_IP_DCCP_DEBUG +int dccp_debug; +module_param(dccp_debug, bool, 0644); +MODULE_PARM_DESC(dccp_debug, "Enable debug messages"); + +EXPORT_SYMBOL_GPL(dccp_debug); +#endif + +static int __init dccp_init(void) +{ + unsigned long goal; + int ehash_order, bhash_order, i; + int rc; + + BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > + FIELD_SIZEOF(struct sk_buff, cb)); + rc = percpu_counter_init(&dccp_orphan_count, 0); + if (rc) + goto out_fail; + rc = -ENOBUFS; + inet_hashinfo_init(&dccp_hashinfo); + dccp_hashinfo.bind_bucket_cachep = + kmem_cache_create("dccp_bind_bucket", + sizeof(struct inet_bind_bucket), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!dccp_hashinfo.bind_bucket_cachep) + goto out_free_percpu; + + /* + * Size and allocate the main established and bind bucket + * hash tables. + * + * The methodology is similar to that of the buffer cache. + */ + if (totalram_pages >= (128 * 1024)) + goal = totalram_pages >> (21 - PAGE_SHIFT); + else + goal = totalram_pages >> (23 - PAGE_SHIFT); + + if (thash_entries) + goal = (thash_entries * + sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT; + for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++) + ; + do { + unsigned long hash_size = (1UL << ehash_order) * PAGE_SIZE / + sizeof(struct inet_ehash_bucket); + + while (hash_size & (hash_size - 1)) + hash_size--; + dccp_hashinfo.ehash_mask = hash_size - 1; + dccp_hashinfo.ehash = (struct inet_ehash_bucket *) + __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, ehash_order); + } while (!dccp_hashinfo.ehash && --ehash_order > 0); + + if (!dccp_hashinfo.ehash) { + DCCP_CRIT("Failed to allocate DCCP established hash table"); + goto out_free_bind_bucket_cachep; + } + + for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) { + INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i); + INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].twchain, i); + } + + if (inet_ehash_locks_alloc(&dccp_hashinfo)) + goto out_free_dccp_ehash; + + bhash_order = ehash_order; + + do { + dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE / + sizeof(struct inet_bind_hashbucket); + if ((dccp_hashinfo.bhash_size > (64 * 1024)) && + bhash_order > 0) + continue; + dccp_hashinfo.bhash = (struct inet_bind_hashbucket *) + __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order); + } while (!dccp_hashinfo.bhash && --bhash_order >= 0); + + if (!dccp_hashinfo.bhash) { + DCCP_CRIT("Failed to allocate DCCP bind hash table"); + goto out_free_dccp_locks; + } + + for (i = 0; i < dccp_hashinfo.bhash_size; i++) { + spin_lock_init(&dccp_hashinfo.bhash[i].lock); + INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain); + } + + rc = dccp_mib_init(); + if (rc) + goto out_free_dccp_bhash; + + rc = dccp_ackvec_init(); + if (rc) + goto out_free_dccp_mib; + + rc = dccp_sysctl_init(); + if (rc) + goto out_ackvec_exit; + + rc = ccid_initialize_builtins(); + if (rc) + goto out_sysctl_exit; + + dccp_timestamping_init(); + + return 0; 
+ +out_sysctl_exit: + dccp_sysctl_exit(); +out_ackvec_exit: + dccp_ackvec_exit(); +out_free_dccp_mib: + dccp_mib_exit(); +out_free_dccp_bhash: + free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); +out_free_dccp_locks: + inet_ehash_locks_free(&dccp_hashinfo); +out_free_dccp_ehash: + free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order); +out_free_bind_bucket_cachep: + kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); +out_free_percpu: + percpu_counter_destroy(&dccp_orphan_count); +out_fail: + dccp_hashinfo.bhash = NULL; + dccp_hashinfo.ehash = NULL; + dccp_hashinfo.bind_bucket_cachep = NULL; + return rc; +} + +static void __exit dccp_fini(void) +{ + ccid_cleanup_builtins(); + dccp_mib_exit(); + free_pages((unsigned long)dccp_hashinfo.bhash, + get_order(dccp_hashinfo.bhash_size * + sizeof(struct inet_bind_hashbucket))); + free_pages((unsigned long)dccp_hashinfo.ehash, + get_order((dccp_hashinfo.ehash_mask + 1) * + sizeof(struct inet_ehash_bucket))); + inet_ehash_locks_free(&dccp_hashinfo); + kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); + dccp_ackvec_exit(); + dccp_sysctl_exit(); + percpu_counter_destroy(&dccp_orphan_count); +} + +module_init(dccp_init); +module_exit(dccp_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arnaldo Carvalho de Melo "); +MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol"); diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c new file mode 100644 index 00000000..63c30bfa --- /dev/null +++ b/net/dccp/qpolicy.c @@ -0,0 +1,137 @@ +/* + * net/dccp/qpolicy.c + * + * Policy-based packet dequeueing interface for DCCP. + * + * Copyright (c) 2008 Tomasz Grobelny + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License v2 + * as published by the Free Software Foundation. + */ +#include "dccp.h" + +/* + * Simple Dequeueing Policy: + * If tx_qlen is different from 0, enqueue up to tx_qlen elements. + */ +static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb) +{ + skb_queue_tail(&sk->sk_write_queue, skb); +} + +static bool qpolicy_simple_full(struct sock *sk) +{ + return dccp_sk(sk)->dccps_tx_qlen && + sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen; +} + +static struct sk_buff *qpolicy_simple_top(struct sock *sk) +{ + return skb_peek(&sk->sk_write_queue); +} + +/* + * Priority-based Dequeueing Policy: + * If tx_qlen is different from 0 and the queue has reached its upper bound + * of tx_qlen elements, replace older packets lowest-priority-first. 
+ */ +static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk) +{ + struct sk_buff *skb, *best = NULL; + + skb_queue_walk(&sk->sk_write_queue, skb) + if (best == NULL || skb->priority > best->priority) + best = skb; + return best; +} + +static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk) +{ + struct sk_buff *skb, *worst = NULL; + + skb_queue_walk(&sk->sk_write_queue, skb) + if (worst == NULL || skb->priority < worst->priority) + worst = skb; + return worst; +} + +static bool qpolicy_prio_full(struct sock *sk) +{ + if (qpolicy_simple_full(sk)) + dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk)); + return false; +} + +/** + * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface + * @push: add a new @skb to the write queue + * @full: indicates that no more packets will be admitted + * @top: peeks at whatever the queueing policy defines as its `top' + */ +static struct dccp_qpolicy_operations { + void (*push) (struct sock *sk, struct sk_buff *skb); + bool (*full) (struct sock *sk); + struct sk_buff* (*top) (struct sock *sk); + __be32 params; + +} qpol_table[DCCPQ_POLICY_MAX] = { + [DCCPQ_POLICY_SIMPLE] = { + .push = qpolicy_simple_push, + .full = qpolicy_simple_full, + .top = qpolicy_simple_top, + .params = 0, + }, + [DCCPQ_POLICY_PRIO] = { + .push = qpolicy_simple_push, + .full = qpolicy_prio_full, + .top = qpolicy_prio_best_skb, + .params = DCCP_SCM_PRIORITY, + }, +}; + +/* + * Externally visible interface + */ +void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb) +{ + qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb); +} + +bool dccp_qpolicy_full(struct sock *sk) +{ + return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk); +} + +void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb) +{ + if (skb != NULL) { + skb_unlink(skb, &sk->sk_write_queue); + kfree_skb(skb); + } +} + +struct sk_buff *dccp_qpolicy_top(struct sock *sk) +{ + return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk); +} + +struct sk_buff *dccp_qpolicy_pop(struct sock *sk) +{ + struct sk_buff *skb = dccp_qpolicy_top(sk); + + if (skb != NULL) { + /* Clear any skb fields that we used internally */ + skb->priority = 0; + skb_unlink(skb, &sk->sk_write_queue); + } + return skb; +} + +bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param) +{ + /* check if exactly one bit is set */ + if (!param || (param & (param - 1))) + return false; + return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param; +} diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c new file mode 100644 index 00000000..42348824 --- /dev/null +++ b/net/dccp/sysctl.c @@ -0,0 +1,124 @@ +/* + * net/dccp/sysctl.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License v2 + * as published by the Free Software Foundation. 
+ */ + +#include +#include +#include "dccp.h" +#include "feat.h" + +#ifndef CONFIG_SYSCTL +#error This file should not be compiled without CONFIG_SYSCTL defined +#endif + +/* Boundary values */ +static int zero = 0, + u8_max = 0xFF; +static unsigned long seqw_min = DCCPF_SEQ_WMIN, + seqw_max = 0xFFFFFFFF; /* maximum on 32 bit */ + +static struct ctl_table dccp_default_table[] = { + { + .procname = "seq_window", + .data = &sysctl_dccp_sequence_window, + .maxlen = sizeof(sysctl_dccp_sequence_window), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */ + .extra2 = &seqw_max, + }, + { + .procname = "rx_ccid", + .data = &sysctl_dccp_rx_ccid, + .maxlen = sizeof(sysctl_dccp_rx_ccid), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, /* RFC 4340, 10. */ + }, + { + .procname = "tx_ccid", + .data = &sysctl_dccp_tx_ccid, + .maxlen = sizeof(sysctl_dccp_tx_ccid), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, /* RFC 4340, 10. */ + }, + { + .procname = "request_retries", + .data = &sysctl_dccp_request_retries, + .maxlen = sizeof(sysctl_dccp_request_retries), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, + }, + { + .procname = "retries1", + .data = &sysctl_dccp_retries1, + .maxlen = sizeof(sysctl_dccp_retries1), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, + }, + { + .procname = "retries2", + .data = &sysctl_dccp_retries2, + .maxlen = sizeof(sysctl_dccp_retries2), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, + }, + { + .procname = "tx_qlen", + .data = &sysctl_dccp_tx_qlen, + .maxlen = sizeof(sysctl_dccp_tx_qlen), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "sync_ratelimit", + .data = &sysctl_dccp_sync_ratelimit, + .maxlen = sizeof(sysctl_dccp_sync_ratelimit), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + + { } +}; + +static struct ctl_path dccp_path[] = { + { .procname = "net", }, + { .procname = "dccp", }, + { .procname = "default", }, + { } +}; + +static struct ctl_table_header *dccp_table_header; + +int __init dccp_sysctl_init(void) +{ + dccp_table_header = register_sysctl_paths(dccp_path, + dccp_default_table); + + return dccp_table_header != NULL ? 0 : -ENOMEM; +} + +void dccp_sysctl_exit(void) +{ + if (dccp_table_header != NULL) { + unregister_sysctl_table(dccp_table_header); + dccp_table_header = NULL; + } +} diff --git a/net/dccp/timer.c b/net/dccp/timer.c new file mode 100644 index 00000000..7587870b --- /dev/null +++ b/net/dccp/timer.c @@ -0,0 +1,292 @@ +/* + * net/dccp/timer.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include + +#include "dccp.h" + +/* sysctl variables governing numbers of retransmission attempts */ +int sysctl_dccp_request_retries __read_mostly = TCP_SYN_RETRIES; +int sysctl_dccp_retries1 __read_mostly = TCP_RETR1; +int sysctl_dccp_retries2 __read_mostly = TCP_RETR2; + +static void dccp_write_err(struct sock *sk) +{ + sk->sk_err = sk->sk_err_soft ? 
: ETIMEDOUT; + sk->sk_error_report(sk); + + dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); + dccp_done(sk); + DCCP_INC_STATS_BH(DCCP_MIB_ABORTONTIMEOUT); +} + +/* A write timeout has occurred. Process the after effects. */ +static int dccp_write_timeout(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + int retry_until; + + if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) { + if (icsk->icsk_retransmits != 0) + dst_negative_advice(sk); + retry_until = icsk->icsk_syn_retries ? + : sysctl_dccp_request_retries; + } else { + if (icsk->icsk_retransmits >= sysctl_dccp_retries1) { + /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu + black hole detection. :-( + + It is place to make it. It is not made. I do not want + to make it. It is disguisting. It does not work in any + case. Let me to cite the same draft, which requires for + us to implement this: + + "The one security concern raised by this memo is that ICMP black holes + are often caused by over-zealous security administrators who block + all ICMP messages. It is vitally important that those who design and + deploy security systems understand the impact of strict filtering on + upper-layer protocols. The safest web site in the world is worthless + if most TCP implementations cannot transfer data from it. It would + be far nicer to have all of the black holes fixed rather than fixing + all of the TCP implementations." + + Golden words :-). + */ + + dst_negative_advice(sk); + } + + retry_until = sysctl_dccp_retries2; + /* + * FIXME: see tcp_write_timout and tcp_out_of_resources + */ + } + + if (icsk->icsk_retransmits >= retry_until) { + /* Has it gone just too far? */ + dccp_write_err(sk); + return 1; + } + return 0; +} + +/* + * The DCCP retransmit timer. + */ +static void dccp_retransmit_timer(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + /* + * More than than 4MSL (8 minutes) has passed, a RESET(aborted) was + * sent, no need to retransmit, this sock is dead. + */ + if (dccp_write_timeout(sk)) + return; + + /* + * We want to know the number of packets retransmitted, not the + * total number of retransmissions of clones of original packets. + */ + if (icsk->icsk_retransmits == 0) + DCCP_INC_STATS_BH(DCCP_MIB_TIMEOUTS); + + if (dccp_retransmit_skb(sk) != 0) { + /* + * Retransmission failed because of local congestion, + * do not backoff. 
+ */ + if (--icsk->icsk_retransmits == 0) + icsk->icsk_retransmits = 1; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + min(icsk->icsk_rto, + TCP_RESOURCE_PROBE_INTERVAL), + DCCP_RTO_MAX); + return; + } + + icsk->icsk_backoff++; + + icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, + DCCP_RTO_MAX); + if (icsk->icsk_retransmits > sysctl_dccp_retries1) + __sk_dst_reset(sk); +} + +static void dccp_write_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + struct inet_connection_sock *icsk = inet_csk(sk); + int event = 0; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later */ + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, + jiffies + (HZ / 20)); + goto out; + } + + if (sk->sk_state == DCCP_CLOSED || !icsk->icsk_pending) + goto out; + + if (time_after(icsk->icsk_timeout, jiffies)) { + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, + icsk->icsk_timeout); + goto out; + } + + event = icsk->icsk_pending; + icsk->icsk_pending = 0; + + switch (event) { + case ICSK_TIME_RETRANS: + dccp_retransmit_timer(sk); + break; + } +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +/* + * Timer for listening sockets + */ +static void dccp_response_timer(struct sock *sk) +{ + inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, DCCP_TIMEOUT_INIT, + DCCP_RTO_MAX); +} + +static void dccp_keepalive_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + + /* Only process if socket is not in use. */ + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. */ + inet_csk_reset_keepalive_timer(sk, HZ / 20); + goto out; + } + + if (sk->sk_state == DCCP_LISTEN) { + dccp_response_timer(sk); + goto out; + } +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +/* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */ +static void dccp_delack_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + struct inet_connection_sock *icsk = inet_csk(sk); + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. */ + icsk->icsk_ack.blocked = 1; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); + sk_reset_timer(sk, &icsk->icsk_delack_timer, + jiffies + TCP_DELACK_MIN); + goto out; + } + + if (sk->sk_state == DCCP_CLOSED || + !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) + goto out; + if (time_after(icsk->icsk_ack.timeout, jiffies)) { + sk_reset_timer(sk, &icsk->icsk_delack_timer, + icsk->icsk_ack.timeout); + goto out; + } + + icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; + + if (inet_csk_ack_scheduled(sk)) { + if (!icsk->icsk_ack.pingpong) { + /* Delayed ACK missed: inflate ATO. */ + icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, + icsk->icsk_rto); + } else { + /* Delayed ACK missed: leave pingpong mode and + * deflate ATO. + */ + icsk->icsk_ack.pingpong = 0; + icsk->icsk_ack.ato = TCP_ATO_MIN; + } + dccp_send_ack(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS); + } +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +/** + * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface + * See the comments above %ccid_dequeueing_decision for supported modes. 
+ */ +static void dccp_write_xmitlet(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1); + else + dccp_write_xmit(sk); + bh_unlock_sock(sk); +} + +static void dccp_write_xmit_timer(unsigned long data) +{ + dccp_write_xmitlet(data); + sock_put((struct sock *)data); +} + +void dccp_init_xmit_timers(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + + tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk); + setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, + (unsigned long)sk); + inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, + &dccp_keepalive_timer); +} + +static ktime_t dccp_timestamp_seed; +/** + * dccp_timestamp - 10s of microseconds time source + * Returns the number of 10s of microseconds since loading DCCP. This is native + * DCCP time difference format (RFC 4340, sec. 13). + * Please note: This will wrap around about circa every 11.9 hours. + */ +u32 dccp_timestamp(void) +{ + s64 delta = ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); + + do_div(delta, 10); + return delta; +} +EXPORT_SYMBOL_GPL(dccp_timestamp); + +void __init dccp_timestamping_init(void) +{ + dccp_timestamp_seed = ktime_get_real(); +} diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig new file mode 100644 index 00000000..7914fd61 --- /dev/null +++ b/net/decnet/Kconfig @@ -0,0 +1,43 @@ +# +# DECnet configuration +# +config DECNET + tristate "DECnet Support" + ---help--- + The DECnet networking protocol was used in many products made by + Digital (now Compaq). It provides reliable stream and sequenced + packet communications over which run a variety of services similar + to those which run over TCP/IP. + + To find some tools to use with the kernel layer support, please + look at Patrick Caulfield's web site: + . + + More detailed documentation is available in + . + + Be sure to say Y to "/proc file system support" and "Sysctl support" + below when using DECnet, since you will need sysctl support to aid + in configuration at run time. + + The DECnet code is also available as a module ( = code which can be + inserted in and removed from the running kernel whenever you want). + The module is called decnet. + +config DECNET_ROUTER + bool "DECnet: router support (EXPERIMENTAL)" + depends on DECNET && EXPERIMENTAL + select FIB_RULES + ---help--- + Add support for turning your DECnet Endnode into a level 1 or 2 + router. This is an experimental, but functional option. If you + do say Y here, then make sure that you also say Y to "Kernel/User + network link driver", "Routing messages" and "Network packet + filtering". The first two are required to allow configuration via + rtnetlink (you will need Alexey Kuznetsov's iproute2 package + from ). The "Network packet + filtering" option will be required for the forthcoming routing daemon + to work. + + See for more information. 
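Illustrative aside (editorial, not part of the patch): given the two DECnet options defined above, a kernel .config for this tree could carry a fragment like the one below, building DECnet as the "decnet" module with the experimental router support enabled. The option names are taken directly from the Kconfig entries above; the fragment itself is only an example, not generated output.

    # Example .config fragment (illustrative only)
    CONFIG_EXPERIMENTAL=y
    CONFIG_DECNET=m
    # DECNET_ROUTER is a bool and selects FIB_RULES
    CONFIG_DECNET_ROUTER=y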
+ diff --git a/net/decnet/Makefile b/net/decnet/Makefile new file mode 100644 index 00000000..e44003af --- /dev/null +++ b/net/decnet/Makefile @@ -0,0 +1,10 @@ + +obj-$(CONFIG_DECNET) += decnet.o + +decnet-y := af_decnet.o dn_nsp_in.o dn_nsp_out.o \ + dn_route.o dn_dev.o dn_neigh.o dn_timer.o +decnet-$(CONFIG_DECNET_ROUTER) += dn_fib.o dn_rules.o dn_table.o +decnet-y += sysctl_net_decnet.o + +obj-$(CONFIG_NETFILTER) += netfilter/ + diff --git a/net/decnet/README b/net/decnet/README new file mode 100644 index 00000000..60e7ec88 --- /dev/null +++ b/net/decnet/README @@ -0,0 +1,8 @@ + Linux DECnet Project + ====================== + +The documentation for this kernel subsystem is available in the +Documentation/networking subdirectory of this distribution and also +on line at http://www.chygwyn.com/DECnet/ + +Steve Whitehouse diff --git a/net/decnet/TODO b/net/decnet/TODO new file mode 100644 index 00000000..ebb5ac69 --- /dev/null +++ b/net/decnet/TODO @@ -0,0 +1,41 @@ +Steve's quick list of things that need finishing off: +[they are in no particular order and range from the trivial to the long winded] + + o Proper timeouts on each neighbour (in routing mode) rather than + just the 60 second On-Ethernet cache value. + + o Support for X.25 linklayer + + o Support for DDCMP link layer + + o The DDCMP device itself + + o PPP support (rfc1762) + + o Lots of testing with real applications + + o Verify errors etc. against POSIX 1003.1g (draft) + + o Using send/recvmsg() to get at connect/disconnect data (POSIX 1003.1g) + [maybe this should be done at socket level... the control data in the + send/recvmsg() calls should simply be a vector of set/getsockopt() + calls] + + o check MSG_CTRUNC is set where it should be. + + o Find all the commonality between DECnet and IPv4 routing code and extract + it into a small library of routines. [probably a project for 2.7.xx] + + o Add perfect socket hashing - an idea suggested by Paul Koning. Currently + we have a half-way house scheme which seems to work reasonably well, but + the full scheme is still worth implementing, its not not top of my list + right now. + + o Add session control message flow control + + o Add NSP message flow control + + o DECnet sendpages() function + + o AIO for DECnet + diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c new file mode 100644 index 00000000..ea3b6ee2 --- /dev/null +++ b/net/decnet/af_decnet.c @@ -0,0 +1,2423 @@ + +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * DECnet Socket Layer Interface + * + * Authors: Eduardo Marcelo Serrat + * Patrick Caulfield + * + * Changes: + * Steve Whitehouse: Copied from Eduardo Serrat and Patrick Caulfield's + * version of the code. Original copyright preserved + * below. + * Steve Whitehouse: Some bug fixes, cleaning up some code to make it + * compatible with my routing layer. + * Steve Whitehouse: Merging changes from Eduardo Serrat and Patrick + * Caulfield. + * Steve Whitehouse: Further bug fixes, checking module code still works + * with new routing layer. + * Steve Whitehouse: Additional set/get_sockopt() calls. + * Steve Whitehouse: Fixed TIOCINQ ioctl to be same as Eduardo's new + * code. + * Steve Whitehouse: recvmsg() changed to try and behave in a POSIX like + * way. Didn't manage it entirely, but its better. + * Steve Whitehouse: ditto for sendmsg(). 
+ * Steve Whitehouse: A selection of bug fixes to various things. + * Steve Whitehouse: Added TIOCOUTQ ioctl. + * Steve Whitehouse: Fixes to username2sockaddr & sockaddr2username. + * Steve Whitehouse: Fixes to connect() error returns. + * Patrick Caulfield: Fixes to delayed acceptance logic. + * David S. Miller: New socket locking + * Steve Whitehouse: Socket list hashing/locking + * Arnaldo C. Melo: use capable, not suser + * Steve Whitehouse: Removed unused code. Fix to use sk->allocation + * when required. + * Patrick Caulfield: /proc/net/decnet now has object name/number + * Steve Whitehouse: Fixed local port allocation, hashed sk list + * Matthew Wilcox: Fixes for dn_ioctl() + * Steve Whitehouse: New connect/accept logic to allow timeouts and + * prepare for sendpage etc. + */ + + +/****************************************************************************** + (c) 1995-1998 E.M. Serrat emserrat@geocities.com + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + +HISTORY: + +Version Kernel Date Author/Comments +------- ------ ---- --------------- +Version 0.0.1 2.0.30 01-dic-97 Eduardo Marcelo Serrat + (emserrat@geocities.com) + + First Development of DECnet Socket La- + yer for Linux. Only supports outgoing + connections. + +Version 0.0.2 2.1.105 20-jun-98 Patrick J. Caulfield + (patrick@pandh.demon.co.uk) + + Port to new kernel development version. + +Version 0.0.3 2.1.106 25-jun-98 Eduardo Marcelo Serrat + (emserrat@geocities.com) + _ + Added support for incoming connections + so we can start developing server apps + on Linux. + - + Module Support +Version 0.0.4 2.1.109 21-jul-98 Eduardo Marcelo Serrat + (emserrat@geocities.com) + _ + Added support for X11R6.4. Now we can + use DECnet transport for X on Linux!!! + - +Version 0.0.5 2.1.110 01-aug-98 Eduardo Marcelo Serrat + (emserrat@geocities.com) + Removed bugs on flow control + Removed bugs on incoming accessdata + order + - +Version 0.0.6 2.1.110 07-aug-98 Eduardo Marcelo Serrat + dn_recvmsg fixes + + Patrick J. 
Caulfield + dn_bind fixes +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct dn_sock { + struct sock sk; + struct dn_scp scp; +}; + +static void dn_keepalive(struct sock *sk); + +#define DN_SK_HASH_SHIFT 8 +#define DN_SK_HASH_SIZE (1 << DN_SK_HASH_SHIFT) +#define DN_SK_HASH_MASK (DN_SK_HASH_SIZE - 1) + + +static const struct proto_ops dn_proto_ops; +static DEFINE_RWLOCK(dn_hash_lock); +static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE]; +static struct hlist_head dn_wild_sk; +static atomic_long_t decnet_memory_allocated; + +static int __dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen, int flags); +static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags); + +static struct hlist_head *dn_find_list(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + + if (scp->addr.sdn_flags & SDF_WILD) + return hlist_empty(&dn_wild_sk) ? &dn_wild_sk : NULL; + + return &dn_sk_hash[le16_to_cpu(scp->addrloc) & DN_SK_HASH_MASK]; +} + +/* + * Valid ports are those greater than zero and not already in use. + */ +static int check_port(__le16 port) +{ + struct sock *sk; + struct hlist_node *node; + + if (port == 0) + return -1; + + sk_for_each(sk, node, &dn_sk_hash[le16_to_cpu(port) & DN_SK_HASH_MASK]) { + struct dn_scp *scp = DN_SK(sk); + if (scp->addrloc == port) + return -1; + } + return 0; +} + +static unsigned short port_alloc(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); +static unsigned short port = 0x2000; + unsigned short i_port = port; + + while(check_port(cpu_to_le16(++port)) != 0) { + if (port == i_port) + return 0; + } + + scp->addrloc = cpu_to_le16(port); + + return 1; +} + +/* + * Since this is only ever called from user + * level, we don't need a write_lock() version + * of this. + */ +static int dn_hash_sock(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + struct hlist_head *list; + int rv = -EUSERS; + + BUG_ON(sk_hashed(sk)); + + write_lock_bh(&dn_hash_lock); + + if (!scp->addrloc && !port_alloc(sk)) + goto out; + + rv = -EADDRINUSE; + if ((list = dn_find_list(sk)) == NULL) + goto out; + + sk_add_node(sk, list); + rv = 0; +out: + write_unlock_bh(&dn_hash_lock); + return rv; +} + +static void dn_unhash_sock(struct sock *sk) +{ + write_lock(&dn_hash_lock); + sk_del_node_init(sk); + write_unlock(&dn_hash_lock); +} + +static void dn_unhash_sock_bh(struct sock *sk) +{ + write_lock_bh(&dn_hash_lock); + sk_del_node_init(sk); + write_unlock_bh(&dn_hash_lock); +} + +static struct hlist_head *listen_hash(struct sockaddr_dn *addr) +{ + int i; + unsigned hash = addr->sdn_objnum; + + if (hash == 0) { + hash = addr->sdn_objnamel; + for(i = 0; i < le16_to_cpu(addr->sdn_objnamel); i++) { + hash ^= addr->sdn_objname[i]; + hash ^= (hash << 3); + } + } + + return &dn_sk_hash[hash & DN_SK_HASH_MASK]; +} + +/* + * Called to transform a socket from bound (i.e. with a local address) + * into a listening socket (doesn't need a local port number) and rehashes + * based upon the object name/number. 
+ */ +static void dn_rehash_sock(struct sock *sk) +{ + struct hlist_head *list; + struct dn_scp *scp = DN_SK(sk); + + if (scp->addr.sdn_flags & SDF_WILD) + return; + + write_lock_bh(&dn_hash_lock); + sk_del_node_init(sk); + DN_SK(sk)->addrloc = 0; + list = listen_hash(&DN_SK(sk)->addr); + sk_add_node(sk, list); + write_unlock_bh(&dn_hash_lock); +} + +int dn_sockaddr2username(struct sockaddr_dn *sdn, unsigned char *buf, unsigned char type) +{ + int len = 2; + + *buf++ = type; + + switch(type) { + case 0: + *buf++ = sdn->sdn_objnum; + break; + case 1: + *buf++ = 0; + *buf++ = le16_to_cpu(sdn->sdn_objnamel); + memcpy(buf, sdn->sdn_objname, le16_to_cpu(sdn->sdn_objnamel)); + len = 3 + le16_to_cpu(sdn->sdn_objnamel); + break; + case 2: + memset(buf, 0, 5); + buf += 5; + *buf++ = le16_to_cpu(sdn->sdn_objnamel); + memcpy(buf, sdn->sdn_objname, le16_to_cpu(sdn->sdn_objnamel)); + len = 7 + le16_to_cpu(sdn->sdn_objnamel); + break; + } + + return len; +} + +/* + * On reception of usernames, we handle types 1 and 0 for destination + * addresses only. Types 2 and 4 are used for source addresses, but the + * UIC, GIC are ignored and they are both treated the same way. Type 3 + * is never used as I've no idea what its purpose might be or what its + * format is. + */ +int dn_username2sockaddr(unsigned char *data, int len, struct sockaddr_dn *sdn, unsigned char *fmt) +{ + unsigned char type; + int size = len; + int namel = 12; + + sdn->sdn_objnum = 0; + sdn->sdn_objnamel = cpu_to_le16(0); + memset(sdn->sdn_objname, 0, DN_MAXOBJL); + + if (len < 2) + return -1; + + len -= 2; + *fmt = *data++; + type = *data++; + + switch(*fmt) { + case 0: + sdn->sdn_objnum = type; + return 2; + case 1: + namel = 16; + break; + case 2: + len -= 4; + data += 4; + break; + case 4: + len -= 8; + data += 8; + break; + default: + return -1; + } + + len -= 1; + + if (len < 0) + return -1; + + sdn->sdn_objnamel = cpu_to_le16(*data++); + len -= le16_to_cpu(sdn->sdn_objnamel); + + if ((len < 0) || (le16_to_cpu(sdn->sdn_objnamel) > namel)) + return -1; + + memcpy(sdn->sdn_objname, data, le16_to_cpu(sdn->sdn_objnamel)); + + return size - len; +} + +struct sock *dn_sklist_find_listener(struct sockaddr_dn *addr) +{ + struct hlist_head *list = listen_hash(addr); + struct hlist_node *node; + struct sock *sk; + + read_lock(&dn_hash_lock); + sk_for_each(sk, node, list) { + struct dn_scp *scp = DN_SK(sk); + if (sk->sk_state != TCP_LISTEN) + continue; + if (scp->addr.sdn_objnum) { + if (scp->addr.sdn_objnum != addr->sdn_objnum) + continue; + } else { + if (addr->sdn_objnum) + continue; + if (scp->addr.sdn_objnamel != addr->sdn_objnamel) + continue; + if (memcmp(scp->addr.sdn_objname, addr->sdn_objname, le16_to_cpu(addr->sdn_objnamel)) != 0) + continue; + } + sock_hold(sk); + read_unlock(&dn_hash_lock); + return sk; + } + + sk = sk_head(&dn_wild_sk); + if (sk) { + if (sk->sk_state == TCP_LISTEN) + sock_hold(sk); + else + sk = NULL; + } + + read_unlock(&dn_hash_lock); + return sk; +} + +struct sock *dn_find_by_skb(struct sk_buff *skb) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + struct sock *sk; + struct hlist_node *node; + struct dn_scp *scp; + + read_lock(&dn_hash_lock); + sk_for_each(sk, node, &dn_sk_hash[le16_to_cpu(cb->dst_port) & DN_SK_HASH_MASK]) { + scp = DN_SK(sk); + if (cb->src != dn_saddr2dn(&scp->peer)) + continue; + if (cb->dst_port != scp->addrloc) + continue; + if (scp->addrrem && (cb->src_port != scp->addrrem)) + continue; + sock_hold(sk); + goto found; + } + sk = NULL; +found: + read_unlock(&dn_hash_lock); + return sk; +} + + + 
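/*
 * Editorial sketch, not part of the original file: check_port(),
 * dn_hash_sock() and dn_find_by_skb() above all open-code the same bucket
 * calculation on the socket hash.  The hypothetical helper below exists only
 * to illustrate that mapping: a local port indexes one of the
 * DN_SK_HASH_SIZE (256) buckets by masking its low DN_SK_HASH_SHIFT (8) bits.
 */
static inline struct hlist_head *dn_port_bucket(__le16 port)
{
	/* le16_to_cpu() because DECnet port numbers are stored little-endian */
	return &dn_sk_hash[le16_to_cpu(port) & DN_SK_HASH_MASK];
}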
+static void dn_destruct(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + + skb_queue_purge(&scp->data_xmit_queue); + skb_queue_purge(&scp->other_xmit_queue); + skb_queue_purge(&scp->other_receive_queue); + + dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); +} + +static int dn_memory_pressure; + +static void dn_enter_memory_pressure(struct sock *sk) +{ + if (!dn_memory_pressure) { + dn_memory_pressure = 1; + } +} + +static struct proto dn_proto = { + .name = "NSP", + .owner = THIS_MODULE, + .enter_memory_pressure = dn_enter_memory_pressure, + .memory_pressure = &dn_memory_pressure, + .memory_allocated = &decnet_memory_allocated, + .sysctl_mem = sysctl_decnet_mem, + .sysctl_wmem = sysctl_decnet_wmem, + .sysctl_rmem = sysctl_decnet_rmem, + .max_header = DN_MAX_NSP_DATA_HEADER + 64, + .obj_size = sizeof(struct dn_sock), +}; + +static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gfp) +{ + struct dn_scp *scp; + struct sock *sk = sk_alloc(net, PF_DECnet, gfp, &dn_proto); + + if (!sk) + goto out; + + if (sock) + sock->ops = &dn_proto_ops; + sock_init_data(sock, sk); + + sk->sk_backlog_rcv = dn_nsp_backlog_rcv; + sk->sk_destruct = dn_destruct; + sk->sk_no_check = 1; + sk->sk_family = PF_DECnet; + sk->sk_protocol = 0; + sk->sk_allocation = gfp; + sk->sk_sndbuf = sysctl_decnet_wmem[1]; + sk->sk_rcvbuf = sysctl_decnet_rmem[1]; + + /* Initialization of DECnet Session Control Port */ + scp = DN_SK(sk); + scp->state = DN_O; /* Open */ + scp->numdat = 1; /* Next data seg to tx */ + scp->numoth = 1; /* Next oth data to tx */ + scp->ackxmt_dat = 0; /* Last data seg ack'ed */ + scp->ackxmt_oth = 0; /* Last oth data ack'ed */ + scp->ackrcv_dat = 0; /* Highest data ack recv*/ + scp->ackrcv_oth = 0; /* Last oth data ack rec*/ + scp->flowrem_sw = DN_SEND; + scp->flowloc_sw = DN_SEND; + scp->flowrem_dat = 0; + scp->flowrem_oth = 1; + scp->flowloc_dat = 0; + scp->flowloc_oth = 1; + scp->services_rem = 0; + scp->services_loc = 1 | NSP_FC_NONE; + scp->info_rem = 0; + scp->info_loc = 0x03; /* NSP version 4.1 */ + scp->segsize_rem = 230 - DN_MAX_NSP_DATA_HEADER; /* Default: Updated by remote segsize */ + scp->nonagle = 0; + scp->multi_ireq = 1; + scp->accept_mode = ACC_IMMED; + scp->addr.sdn_family = AF_DECnet; + scp->peer.sdn_family = AF_DECnet; + scp->accessdata.acc_accl = 5; + memcpy(scp->accessdata.acc_acc, "LINUX", 5); + + scp->max_window = NSP_MAX_WINDOW; + scp->snd_window = NSP_MIN_WINDOW; + scp->nsp_srtt = NSP_INITIAL_SRTT; + scp->nsp_rttvar = NSP_INITIAL_RTTVAR; + scp->nsp_rxtshift = 0; + + skb_queue_head_init(&scp->data_xmit_queue); + skb_queue_head_init(&scp->other_xmit_queue); + skb_queue_head_init(&scp->other_receive_queue); + + scp->persist = 0; + scp->persist_fxn = NULL; + scp->keepalive = 10 * HZ; + scp->keepalive_fxn = dn_keepalive; + + init_timer(&scp->delack_timer); + scp->delack_pending = 0; + scp->delack_fxn = dn_nsp_delayed_ack; + + dn_start_slow_timer(sk); +out: + return sk; +} + +/* + * Keepalive timer. + * FIXME: Should respond to SO_KEEPALIVE etc. + */ +static void dn_keepalive(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + + /* + * By checking the other_data transmit queue is empty + * we are double checking that we are not sending too + * many of these keepalive frames. + */ + if (skb_queue_empty(&scp->other_xmit_queue)) + dn_nsp_send_link(sk, DN_NOCHANGE, 0); +} + + +/* + * Timer for shutdown/destroyed sockets. + * When socket is dead & no packets have been sent for a + * certain amount of time, they are removed by this + * routine. 
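+ * (A socket counts as dead here once dn_release() has orphaned it, so
+ * sk->sk_socket is NULL, while its NSP state machine still has
+ * disconnect handshaking left to finish.)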
Also takes care of sending out DI & DC + * frames at correct times. + */ +int dn_destroy_timer(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + + scp->persist = dn_nsp_persist(sk); + + switch(scp->state) { + case DN_DI: + dn_nsp_send_disc(sk, NSP_DISCINIT, 0, GFP_ATOMIC); + if (scp->nsp_rxtshift >= decnet_di_count) + scp->state = DN_CN; + return 0; + + case DN_DR: + dn_nsp_send_disc(sk, NSP_DISCINIT, 0, GFP_ATOMIC); + if (scp->nsp_rxtshift >= decnet_dr_count) + scp->state = DN_DRC; + return 0; + + case DN_DN: + if (scp->nsp_rxtshift < decnet_dn_count) { + /* printk(KERN_DEBUG "dn_destroy_timer: DN\n"); */ + dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC, GFP_ATOMIC); + return 0; + } + } + + scp->persist = (HZ * decnet_time_wait); + + if (sk->sk_socket) + return 0; + + if ((jiffies - scp->stamp) >= (HZ * decnet_time_wait)) { + dn_unhash_sock(sk); + sock_put(sk); + return 1; + } + + return 0; +} + +static void dn_destroy_sock(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + + scp->nsp_rxtshift = 0; /* reset back off */ + + if (sk->sk_socket) { + if (sk->sk_socket->state != SS_UNCONNECTED) + sk->sk_socket->state = SS_DISCONNECTING; + } + + sk->sk_state = TCP_CLOSE; + + switch(scp->state) { + case DN_DN: + dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC, + sk->sk_allocation); + scp->persist_fxn = dn_destroy_timer; + scp->persist = dn_nsp_persist(sk); + break; + case DN_CR: + scp->state = DN_DR; + goto disc_reject; + case DN_RUN: + scp->state = DN_DI; + case DN_DI: + case DN_DR: +disc_reject: + dn_nsp_send_disc(sk, NSP_DISCINIT, 0, sk->sk_allocation); + case DN_NC: + case DN_NR: + case DN_RJ: + case DN_DIC: + case DN_CN: + case DN_DRC: + case DN_CI: + case DN_CD: + scp->persist_fxn = dn_destroy_timer; + scp->persist = dn_nsp_persist(sk); + break; + default: + printk(KERN_DEBUG "DECnet: dn_destroy_sock passed socket in invalid state\n"); + case DN_O: + dn_stop_slow_timer(sk); + + dn_unhash_sock_bh(sk); + sock_put(sk); + + break; + } +} + +char *dn_addr2asc(__u16 addr, char *buf) +{ + unsigned short node, area; + + node = addr & 0x03ff; + area = addr >> 10; + sprintf(buf, "%hd.%hd", area, node); + + return buf; +} + + + +static int dn_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + + if (!net_eq(net, &init_net)) + return -EAFNOSUPPORT; + + switch(sock->type) { + case SOCK_SEQPACKET: + if (protocol != DNPROTO_NSP) + return -EPROTONOSUPPORT; + break; + case SOCK_STREAM: + break; + default: + return -ESOCKTNOSUPPORT; + } + + + if ((sk = dn_alloc_sock(net, sock, GFP_KERNEL)) == NULL) + return -ENOBUFS; + + sk->sk_protocol = protocol; + + return 0; +} + + +static int +dn_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (sk) { + sock_orphan(sk); + sock_hold(sk); + lock_sock(sk); + dn_destroy_sock(sk); + release_sock(sk); + sock_put(sk); + } + + return 0; +} + +static int dn_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct dn_scp *scp = DN_SK(sk); + struct sockaddr_dn *saddr = (struct sockaddr_dn *)uaddr; + struct net_device *dev, *ldev; + int rv; + + if (addr_len != sizeof(struct sockaddr_dn)) + return -EINVAL; + + if (saddr->sdn_family != AF_DECnet) + return -EINVAL; + + if (le16_to_cpu(saddr->sdn_nodeaddrl) && (le16_to_cpu(saddr->sdn_nodeaddrl) != 2)) + return -EINVAL; + + if (le16_to_cpu(saddr->sdn_objnamel) > DN_MAXOBJL) + return -EINVAL; + + if (saddr->sdn_flags & ~SDF_WILD) + return -EINVAL; + + if (!capable(CAP_NET_BIND_SERVICE) && (saddr->sdn_objnum || + 
(saddr->sdn_flags & SDF_WILD))) + return -EACCES; + + if (!(saddr->sdn_flags & SDF_WILD)) { + if (le16_to_cpu(saddr->sdn_nodeaddrl)) { + rcu_read_lock(); + ldev = NULL; + for_each_netdev_rcu(&init_net, dev) { + if (!dev->dn_ptr) + continue; + if (dn_dev_islocal(dev, dn_saddr2dn(saddr))) { + ldev = dev; + break; + } + } + rcu_read_unlock(); + if (ldev == NULL) + return -EADDRNOTAVAIL; + } + } + + rv = -EINVAL; + lock_sock(sk); + if (sock_flag(sk, SOCK_ZAPPED)) { + memcpy(&scp->addr, saddr, addr_len); + sock_reset_flag(sk, SOCK_ZAPPED); + + rv = dn_hash_sock(sk); + if (rv) + sock_set_flag(sk, SOCK_ZAPPED); + } + release_sock(sk); + + return rv; +} + + +static int dn_auto_bind(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct dn_scp *scp = DN_SK(sk); + int rv; + + sock_reset_flag(sk, SOCK_ZAPPED); + + scp->addr.sdn_flags = 0; + scp->addr.sdn_objnum = 0; + + /* + * This stuff is to keep compatibility with Eduardo's + * patch. I hope I can dispense with it shortly... + */ + if ((scp->accessdata.acc_accl != 0) && + (scp->accessdata.acc_accl <= 12)) { + + scp->addr.sdn_objnamel = cpu_to_le16(scp->accessdata.acc_accl); + memcpy(scp->addr.sdn_objname, scp->accessdata.acc_acc, le16_to_cpu(scp->addr.sdn_objnamel)); + + scp->accessdata.acc_accl = 0; + memset(scp->accessdata.acc_acc, 0, 40); + } + /* End of compatibility stuff */ + + scp->addr.sdn_add.a_len = cpu_to_le16(2); + rv = dn_dev_bind_default((__le16 *)scp->addr.sdn_add.a_addr); + if (rv == 0) { + rv = dn_hash_sock(sk); + if (rv) + sock_set_flag(sk, SOCK_ZAPPED); + } + + return rv; +} + +static int dn_confirm_accept(struct sock *sk, long *timeo, gfp_t allocation) +{ + struct dn_scp *scp = DN_SK(sk); + DEFINE_WAIT(wait); + int err; + + if (scp->state != DN_CR) + return -EINVAL; + + scp->state = DN_CC; + scp->segsize_loc = dst_metric_advmss(__sk_dst_get(sk)); + dn_send_conn_conf(sk, allocation); + + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + for(;;) { + release_sock(sk); + if (scp->state == DN_CC) + *timeo = schedule_timeout(*timeo); + lock_sock(sk); + err = 0; + if (scp->state == DN_RUN) + break; + err = sock_error(sk); + if (err) + break; + err = sock_intr_errno(*timeo); + if (signal_pending(current)) + break; + err = -EAGAIN; + if (!*timeo) + break; + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + } + finish_wait(sk_sleep(sk), &wait); + if (err == 0) { + sk->sk_socket->state = SS_CONNECTED; + } else if (scp->state != DN_CC) { + sk->sk_socket->state = SS_UNCONNECTED; + } + return err; +} + +static int dn_wait_run(struct sock *sk, long *timeo) +{ + struct dn_scp *scp = DN_SK(sk); + DEFINE_WAIT(wait); + int err = 0; + + if (scp->state == DN_RUN) + goto out; + + if (!*timeo) + return -EALREADY; + + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + for(;;) { + release_sock(sk); + if (scp->state == DN_CI || scp->state == DN_CC) + *timeo = schedule_timeout(*timeo); + lock_sock(sk); + err = 0; + if (scp->state == DN_RUN) + break; + err = sock_error(sk); + if (err) + break; + err = sock_intr_errno(*timeo); + if (signal_pending(current)) + break; + err = -ETIMEDOUT; + if (!*timeo) + break; + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + } + finish_wait(sk_sleep(sk), &wait); +out: + if (err == 0) { + sk->sk_socket->state = SS_CONNECTED; + } else if (scp->state != DN_CI && scp->state != DN_CC) { + sk->sk_socket->state = SS_UNCONNECTED; + } + return err; +} + +static int __dn_connect(struct sock *sk, struct sockaddr_dn *addr, int addrlen, long *timeo, int flags) +{ + struct socket *sock = 
sk->sk_socket; + struct dn_scp *scp = DN_SK(sk); + int err = -EISCONN; + struct flowidn fld; + + if (sock->state == SS_CONNECTED) + goto out; + + if (sock->state == SS_CONNECTING) { + err = 0; + if (scp->state == DN_RUN) { + sock->state = SS_CONNECTED; + goto out; + } + err = -ECONNREFUSED; + if (scp->state != DN_CI && scp->state != DN_CC) { + sock->state = SS_UNCONNECTED; + goto out; + } + return dn_wait_run(sk, timeo); + } + + err = -EINVAL; + if (scp->state != DN_O) + goto out; + + if (addr == NULL || addrlen != sizeof(struct sockaddr_dn)) + goto out; + if (addr->sdn_family != AF_DECnet) + goto out; + if (addr->sdn_flags & SDF_WILD) + goto out; + + if (sock_flag(sk, SOCK_ZAPPED)) { + err = dn_auto_bind(sk->sk_socket); + if (err) + goto out; + } + + memcpy(&scp->peer, addr, sizeof(struct sockaddr_dn)); + + err = -EHOSTUNREACH; + memset(&fld, 0, sizeof(fld)); + fld.flowidn_oif = sk->sk_bound_dev_if; + fld.daddr = dn_saddr2dn(&scp->peer); + fld.saddr = dn_saddr2dn(&scp->addr); + dn_sk_ports_copy(&fld, scp); + fld.flowidn_proto = DNPROTO_NSP; + if (dn_route_output_sock(&sk->sk_dst_cache, &fld, sk, flags) < 0) + goto out; + sk->sk_route_caps = sk->sk_dst_cache->dev->features; + sock->state = SS_CONNECTING; + scp->state = DN_CI; + scp->segsize_loc = dst_metric_advmss(sk->sk_dst_cache); + + dn_nsp_send_conninit(sk, NSP_CI); + err = -EINPROGRESS; + if (*timeo) { + err = dn_wait_run(sk, timeo); + } +out: + return err; +} + +static int dn_connect(struct socket *sock, struct sockaddr *uaddr, int addrlen, int flags) +{ + struct sockaddr_dn *addr = (struct sockaddr_dn *)uaddr; + struct sock *sk = sock->sk; + int err; + long timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); + + lock_sock(sk); + err = __dn_connect(sk, addr, addrlen, &timeo, 0); + release_sock(sk); + + return err; +} + +static inline int dn_check_state(struct sock *sk, struct sockaddr_dn *addr, int addrlen, long *timeo, int flags) +{ + struct dn_scp *scp = DN_SK(sk); + + switch(scp->state) { + case DN_RUN: + return 0; + case DN_CR: + return dn_confirm_accept(sk, timeo, sk->sk_allocation); + case DN_CI: + case DN_CC: + return dn_wait_run(sk, timeo); + case DN_O: + return __dn_connect(sk, addr, addrlen, timeo, flags); + } + + return -EINVAL; +} + + +static void dn_access_copy(struct sk_buff *skb, struct accessdata_dn *acc) +{ + unsigned char *ptr = skb->data; + + acc->acc_userl = *ptr++; + memcpy(&acc->acc_user, ptr, acc->acc_userl); + ptr += acc->acc_userl; + + acc->acc_passl = *ptr++; + memcpy(&acc->acc_pass, ptr, acc->acc_passl); + ptr += acc->acc_passl; + + acc->acc_accl = *ptr++; + memcpy(&acc->acc_acc, ptr, acc->acc_accl); + + skb_pull(skb, acc->acc_accl + acc->acc_passl + acc->acc_userl + 3); + +} + +static void dn_user_copy(struct sk_buff *skb, struct optdata_dn *opt) +{ + unsigned char *ptr = skb->data; + u16 len = *ptr++; /* yes, it's 8bit on the wire */ + + BUG_ON(len > 16); /* we've checked the contents earlier */ + opt->opt_optl = cpu_to_le16(len); + opt->opt_status = 0; + memcpy(opt->opt_data, ptr, len); + skb_pull(skb, len + 1); +} + +static struct sk_buff *dn_wait_for_connect(struct sock *sk, long *timeo) +{ + DEFINE_WAIT(wait); + struct sk_buff *skb = NULL; + int err = 0; + + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + for(;;) { + release_sock(sk); + skb = skb_dequeue(&sk->sk_receive_queue); + if (skb == NULL) { + *timeo = schedule_timeout(*timeo); + skb = skb_dequeue(&sk->sk_receive_queue); + } + lock_sock(sk); + if (skb != NULL) + break; + err = -EINVAL; + if (sk->sk_state != TCP_LISTEN) + break; + err = 
sock_intr_errno(*timeo); + if (signal_pending(current)) + break; + err = -EAGAIN; + if (!*timeo) + break; + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + } + finish_wait(sk_sleep(sk), &wait); + + return skb == NULL ? ERR_PTR(err) : skb; +} + +static int dn_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct sock *sk = sock->sk, *newsk; + struct sk_buff *skb = NULL; + struct dn_skb_cb *cb; + unsigned char menuver; + int err = 0; + unsigned char type; + long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + struct dst_entry *dst; + + lock_sock(sk); + + if (sk->sk_state != TCP_LISTEN || DN_SK(sk)->state != DN_O) { + release_sock(sk); + return -EINVAL; + } + + skb = skb_dequeue(&sk->sk_receive_queue); + if (skb == NULL) { + skb = dn_wait_for_connect(sk, &timeo); + if (IS_ERR(skb)) { + release_sock(sk); + return PTR_ERR(skb); + } + } + + cb = DN_SKB_CB(skb); + sk->sk_ack_backlog--; + newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation); + if (newsk == NULL) { + release_sock(sk); + kfree_skb(skb); + return -ENOBUFS; + } + release_sock(sk); + + dst = skb_dst(skb); + sk_dst_set(newsk, dst); + skb_dst_set(skb, NULL); + + DN_SK(newsk)->state = DN_CR; + DN_SK(newsk)->addrrem = cb->src_port; + DN_SK(newsk)->services_rem = cb->services; + DN_SK(newsk)->info_rem = cb->info; + DN_SK(newsk)->segsize_rem = cb->segsize; + DN_SK(newsk)->accept_mode = DN_SK(sk)->accept_mode; + + if (DN_SK(newsk)->segsize_rem < 230) + DN_SK(newsk)->segsize_rem = 230; + + if ((DN_SK(newsk)->services_rem & NSP_FC_MASK) == NSP_FC_NONE) + DN_SK(newsk)->max_window = decnet_no_fc_max_cwnd; + + newsk->sk_state = TCP_LISTEN; + memcpy(&(DN_SK(newsk)->addr), &(DN_SK(sk)->addr), sizeof(struct sockaddr_dn)); + + /* + * If we are listening on a wild socket, we don't want + * the newly created socket on the wrong hash queue. + */ + DN_SK(newsk)->addr.sdn_flags &= ~SDF_WILD; + + skb_pull(skb, dn_username2sockaddr(skb->data, skb->len, &(DN_SK(newsk)->addr), &type)); + skb_pull(skb, dn_username2sockaddr(skb->data, skb->len, &(DN_SK(newsk)->peer), &type)); + *(__le16 *)(DN_SK(newsk)->peer.sdn_add.a_addr) = cb->src; + *(__le16 *)(DN_SK(newsk)->addr.sdn_add.a_addr) = cb->dst; + + menuver = *skb->data; + skb_pull(skb, 1); + + if (menuver & DN_MENUVER_ACC) + dn_access_copy(skb, &(DN_SK(newsk)->accessdata)); + + if (menuver & DN_MENUVER_USR) + dn_user_copy(skb, &(DN_SK(newsk)->conndata_in)); + + if (menuver & DN_MENUVER_PRX) + DN_SK(newsk)->peer.sdn_flags |= SDF_PROXY; + + if (menuver & DN_MENUVER_UIC) + DN_SK(newsk)->peer.sdn_flags |= SDF_UICPROXY; + + kfree_skb(skb); + + memcpy(&(DN_SK(newsk)->conndata_out), &(DN_SK(sk)->conndata_out), + sizeof(struct optdata_dn)); + memcpy(&(DN_SK(newsk)->discdata_out), &(DN_SK(sk)->discdata_out), + sizeof(struct optdata_dn)); + + lock_sock(newsk); + err = dn_hash_sock(newsk); + if (err == 0) { + sock_reset_flag(newsk, SOCK_ZAPPED); + dn_send_conn_ack(newsk); + + /* + * Here we use sk->sk_allocation since although the conn conf is + * for the newsk, the context is the old socket. 
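+ * With ACC_IMMED (the default set in dn_alloc_sock()) that confirm is
+ * sent right here inside accept(); with ACC_DEFER the application has to
+ * trigger it later through the DSO_CONACCEPT socket option, which also
+ * ends up calling dn_confirm_accept().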
+ */ + if (DN_SK(newsk)->accept_mode == ACC_IMMED) + err = dn_confirm_accept(newsk, &timeo, + sk->sk_allocation); + } + release_sock(newsk); + return err; +} + + +static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int *uaddr_len,int peer) +{ + struct sockaddr_dn *sa = (struct sockaddr_dn *)uaddr; + struct sock *sk = sock->sk; + struct dn_scp *scp = DN_SK(sk); + + *uaddr_len = sizeof(struct sockaddr_dn); + + lock_sock(sk); + + if (peer) { + if ((sock->state != SS_CONNECTED && + sock->state != SS_CONNECTING) && + scp->accept_mode == ACC_IMMED) { + release_sock(sk); + return -ENOTCONN; + } + + memcpy(sa, &scp->peer, sizeof(struct sockaddr_dn)); + } else { + memcpy(sa, &scp->addr, sizeof(struct sockaddr_dn)); + } + + release_sock(sk); + + return 0; +} + + +static unsigned int dn_poll(struct file *file, struct socket *sock, poll_table *wait) +{ + struct sock *sk = sock->sk; + struct dn_scp *scp = DN_SK(sk); + int mask = datagram_poll(file, sock, wait); + + if (!skb_queue_empty(&scp->other_receive_queue)) + mask |= POLLRDBAND; + + return mask; +} + +static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + struct dn_scp *scp = DN_SK(sk); + int err = -EOPNOTSUPP; + long amount = 0; + struct sk_buff *skb; + int val; + + switch(cmd) + { + case SIOCGIFADDR: + case SIOCSIFADDR: + return dn_dev_ioctl(cmd, (void __user *)arg); + + case SIOCATMARK: + lock_sock(sk); + val = !skb_queue_empty(&scp->other_receive_queue); + if (scp->state != DN_RUN) + val = -ENOTCONN; + release_sock(sk); + return val; + + case TIOCOUTQ: + amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); + if (amount < 0) + amount = 0; + err = put_user(amount, (int __user *)arg); + break; + + case TIOCINQ: + lock_sock(sk); + skb = skb_peek(&scp->other_receive_queue); + if (skb) { + amount = skb->len; + } else { + skb_queue_walk(&sk->sk_receive_queue, skb) + amount += skb->len; + } + release_sock(sk); + err = put_user(amount, (int __user *)arg); + break; + + default: + err = -ENOIOCTLCMD; + break; + } + + return err; +} + +static int dn_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + int err = -EINVAL; + + lock_sock(sk); + + if (sock_flag(sk, SOCK_ZAPPED)) + goto out; + + if ((DN_SK(sk)->state != DN_O) || (sk->sk_state == TCP_LISTEN)) + goto out; + + sk->sk_max_ack_backlog = backlog; + sk->sk_ack_backlog = 0; + sk->sk_state = TCP_LISTEN; + err = 0; + dn_rehash_sock(sk); + +out: + release_sock(sk); + + return err; +} + + +static int dn_shutdown(struct socket *sock, int how) +{ + struct sock *sk = sock->sk; + struct dn_scp *scp = DN_SK(sk); + int err = -ENOTCONN; + + lock_sock(sk); + + if (sock->state == SS_UNCONNECTED) + goto out; + + err = 0; + if (sock->state == SS_DISCONNECTING) + goto out; + + err = -EINVAL; + if (scp->state == DN_O) + goto out; + + if (how != SHUTDOWN_MASK) + goto out; + + sk->sk_shutdown = how; + dn_destroy_sock(sk); + err = 0; + +out: + release_sock(sk); + + return err; +} + +static int dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + int err; + + lock_sock(sk); + err = __dn_setsockopt(sock, level, optname, optval, optlen, 0); + release_sock(sk); + + return err; +} + +static int __dn_setsockopt(struct socket *sock, int level,int optname, char __user *optval, unsigned int optlen, int flags) +{ + struct sock *sk = sock->sk; + struct dn_scp *scp = DN_SK(sk); + long timeo; + union { + struct optdata_dn opt; + struct accessdata_dn acc; + int 
mode; + unsigned long win; + int val; + unsigned char services; + unsigned char info; + } u; + int err; + + if (optlen && !optval) + return -EINVAL; + + if (optlen > sizeof(u)) + return -EINVAL; + + if (copy_from_user(&u, optval, optlen)) + return -EFAULT; + + switch(optname) { + case DSO_CONDATA: + if (sock->state == SS_CONNECTED) + return -EISCONN; + if ((scp->state != DN_O) && (scp->state != DN_CR)) + return -EINVAL; + + if (optlen != sizeof(struct optdata_dn)) + return -EINVAL; + + if (le16_to_cpu(u.opt.opt_optl) > 16) + return -EINVAL; + + memcpy(&scp->conndata_out, &u.opt, optlen); + break; + + case DSO_DISDATA: + if (sock->state != SS_CONNECTED && scp->accept_mode == ACC_IMMED) + return -ENOTCONN; + + if (optlen != sizeof(struct optdata_dn)) + return -EINVAL; + + if (le16_to_cpu(u.opt.opt_optl) > 16) + return -EINVAL; + + memcpy(&scp->discdata_out, &u.opt, optlen); + break; + + case DSO_CONACCESS: + if (sock->state == SS_CONNECTED) + return -EISCONN; + if (scp->state != DN_O) + return -EINVAL; + + if (optlen != sizeof(struct accessdata_dn)) + return -EINVAL; + + if ((u.acc.acc_accl > DN_MAXACCL) || + (u.acc.acc_passl > DN_MAXACCL) || + (u.acc.acc_userl > DN_MAXACCL)) + return -EINVAL; + + memcpy(&scp->accessdata, &u.acc, optlen); + break; + + case DSO_ACCEPTMODE: + if (sock->state == SS_CONNECTED) + return -EISCONN; + if (scp->state != DN_O) + return -EINVAL; + + if (optlen != sizeof(int)) + return -EINVAL; + + if ((u.mode != ACC_IMMED) && (u.mode != ACC_DEFER)) + return -EINVAL; + + scp->accept_mode = (unsigned char)u.mode; + break; + + case DSO_CONACCEPT: + + if (scp->state != DN_CR) + return -EINVAL; + timeo = sock_rcvtimeo(sk, 0); + err = dn_confirm_accept(sk, &timeo, sk->sk_allocation); + return err; + + case DSO_CONREJECT: + + if (scp->state != DN_CR) + return -EINVAL; + + scp->state = DN_DR; + sk->sk_shutdown = SHUTDOWN_MASK; + dn_nsp_send_disc(sk, 0x38, 0, sk->sk_allocation); + break; + + default: +#ifdef CONFIG_NETFILTER + return nf_setsockopt(sk, PF_DECnet, optname, optval, optlen); +#endif + case DSO_LINKINFO: + case DSO_STREAM: + case DSO_SEQPACKET: + return -ENOPROTOOPT; + + case DSO_MAXWINDOW: + if (optlen != sizeof(unsigned long)) + return -EINVAL; + if (u.win > NSP_MAX_WINDOW) + u.win = NSP_MAX_WINDOW; + if (u.win == 0) + return -EINVAL; + scp->max_window = u.win; + if (scp->snd_window > u.win) + scp->snd_window = u.win; + break; + + case DSO_NODELAY: + if (optlen != sizeof(int)) + return -EINVAL; + if (scp->nonagle == 2) + return -EINVAL; + scp->nonagle = (u.val == 0) ? 0 : 1; + /* if (scp->nonagle == 1) { Push pending frames } */ + break; + + case DSO_CORK: + if (optlen != sizeof(int)) + return -EINVAL; + if (scp->nonagle == 1) + return -EINVAL; + scp->nonagle = (u.val == 0) ? 
0 : 2; + /* if (scp->nonagle == 0) { Push pending frames } */ + break; + + case DSO_SERVICES: + if (optlen != sizeof(unsigned char)) + return -EINVAL; + if ((u.services & ~NSP_FC_MASK) != 0x01) + return -EINVAL; + if ((u.services & NSP_FC_MASK) == NSP_FC_MASK) + return -EINVAL; + scp->services_loc = u.services; + break; + + case DSO_INFO: + if (optlen != sizeof(unsigned char)) + return -EINVAL; + if (u.info & 0xfc) + return -EINVAL; + scp->info_loc = u.info; + break; + } + + return 0; +} + +static int dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + int err; + + lock_sock(sk); + err = __dn_getsockopt(sock, level, optname, optval, optlen, 0); + release_sock(sk); + + return err; +} + +static int __dn_getsockopt(struct socket *sock, int level,int optname, char __user *optval,int __user *optlen, int flags) +{ + struct sock *sk = sock->sk; + struct dn_scp *scp = DN_SK(sk); + struct linkinfo_dn link; + unsigned int r_len; + void *r_data = NULL; + unsigned int val; + + if(get_user(r_len , optlen)) + return -EFAULT; + + switch(optname) { + case DSO_CONDATA: + if (r_len > sizeof(struct optdata_dn)) + r_len = sizeof(struct optdata_dn); + r_data = &scp->conndata_in; + break; + + case DSO_DISDATA: + if (r_len > sizeof(struct optdata_dn)) + r_len = sizeof(struct optdata_dn); + r_data = &scp->discdata_in; + break; + + case DSO_CONACCESS: + if (r_len > sizeof(struct accessdata_dn)) + r_len = sizeof(struct accessdata_dn); + r_data = &scp->accessdata; + break; + + case DSO_ACCEPTMODE: + if (r_len > sizeof(unsigned char)) + r_len = sizeof(unsigned char); + r_data = &scp->accept_mode; + break; + + case DSO_LINKINFO: + if (r_len > sizeof(struct linkinfo_dn)) + r_len = sizeof(struct linkinfo_dn); + + memset(&link, 0, sizeof(link)); + + switch(sock->state) { + case SS_CONNECTING: + link.idn_linkstate = LL_CONNECTING; + break; + case SS_DISCONNECTING: + link.idn_linkstate = LL_DISCONNECTING; + break; + case SS_CONNECTED: + link.idn_linkstate = LL_RUNNING; + break; + default: + link.idn_linkstate = LL_INACTIVE; + } + + link.idn_segsize = scp->segsize_rem; + r_data = &link; + break; + + default: +#ifdef CONFIG_NETFILTER + { + int ret, len; + + if(get_user(len, optlen)) + return -EFAULT; + + ret = nf_getsockopt(sk, PF_DECnet, optname, + optval, &len); + if (ret >= 0) + ret = put_user(len, optlen); + return ret; + } +#endif + case DSO_STREAM: + case DSO_SEQPACKET: + case DSO_CONACCEPT: + case DSO_CONREJECT: + return -ENOPROTOOPT; + + case DSO_MAXWINDOW: + if (r_len > sizeof(unsigned long)) + r_len = sizeof(unsigned long); + r_data = &scp->max_window; + break; + + case DSO_NODELAY: + if (r_len > sizeof(int)) + r_len = sizeof(int); + val = (scp->nonagle == 1); + r_data = &val; + break; + + case DSO_CORK: + if (r_len > sizeof(int)) + r_len = sizeof(int); + val = (scp->nonagle == 2); + r_data = &val; + break; + + case DSO_SERVICES: + if (r_len > sizeof(unsigned char)) + r_len = sizeof(unsigned char); + r_data = &scp->services_rem; + break; + + case DSO_INFO: + if (r_len > sizeof(unsigned char)) + r_len = sizeof(unsigned char); + r_data = &scp->info_rem; + break; + } + + if (r_data) { + if (copy_to_user(optval, r_data, r_len)) + return -EFAULT; + if (put_user(r_len, optlen)) + return -EFAULT; + } + + return 0; +} + + +static int dn_data_ready(struct sock *sk, struct sk_buff_head *q, int flags, int target) +{ + struct sk_buff *skb; + int len = 0; + + if (flags & MSG_OOB) + return !skb_queue_empty(q) ? 
1 : 0; + + skb_queue_walk(q, skb) { + struct dn_skb_cb *cb = DN_SKB_CB(skb); + len += skb->len; + + if (cb->nsp_flags & 0x40) { + /* SOCK_SEQPACKET reads to EOM */ + if (sk->sk_type == SOCK_SEQPACKET) + return 1; + /* so does SOCK_STREAM unless WAITALL is specified */ + if (!(flags & MSG_WAITALL)) + return 1; + } + + /* minimum data length for read exceeded */ + if (len >= target) + return 1; + } + + return 0; +} + + +static int dn_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct dn_scp *scp = DN_SK(sk); + struct sk_buff_head *queue = &sk->sk_receive_queue; + size_t target = size > 1 ? 1 : 0; + size_t copied = 0; + int rv = 0; + struct sk_buff *skb, *n; + struct dn_skb_cb *cb = NULL; + unsigned char eor = 0; + long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + lock_sock(sk); + + if (sock_flag(sk, SOCK_ZAPPED)) { + rv = -EADDRNOTAVAIL; + goto out; + } + + if (sk->sk_shutdown & RCV_SHUTDOWN) { + rv = 0; + goto out; + } + + rv = dn_check_state(sk, NULL, 0, &timeo, flags); + if (rv) + goto out; + + if (flags & ~(MSG_CMSG_COMPAT|MSG_PEEK|MSG_OOB|MSG_WAITALL|MSG_DONTWAIT|MSG_NOSIGNAL)) { + rv = -EOPNOTSUPP; + goto out; + } + + if (flags & MSG_OOB) + queue = &scp->other_receive_queue; + + if (flags & MSG_WAITALL) + target = size; + + + /* + * See if there is data ready to read, sleep if there isn't + */ + for(;;) { + DEFINE_WAIT(wait); + + if (sk->sk_err) + goto out; + + if (!skb_queue_empty(&scp->other_receive_queue)) { + if (!(flags & MSG_OOB)) { + msg->msg_flags |= MSG_OOB; + if (!scp->other_report) { + scp->other_report = 1; + goto out; + } + } + } + + if (scp->state != DN_RUN) + goto out; + + if (signal_pending(current)) { + rv = sock_intr_errno(timeo); + goto out; + } + + if (dn_data_ready(sk, queue, flags, target)) + break; + + if (flags & MSG_DONTWAIT) { + rv = -EWOULDBLOCK; + goto out; + } + + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_wait_event(sk, &timeo, dn_data_ready(sk, queue, flags, target)); + clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + finish_wait(sk_sleep(sk), &wait); + } + + skb_queue_walk_safe(queue, skb, n) { + unsigned int chunk = skb->len; + cb = DN_SKB_CB(skb); + + if ((chunk + copied) > size) + chunk = size - copied; + + if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) { + rv = -EFAULT; + break; + } + copied += chunk; + + if (!(flags & MSG_PEEK)) + skb_pull(skb, chunk); + + eor = cb->nsp_flags & 0x40; + + if (skb->len == 0) { + skb_unlink(skb, queue); + kfree_skb(skb); + /* + * N.B. Don't refer to skb or cb after this point + * in loop. + */ + if ((scp->flowloc_sw == DN_DONTSEND) && !dn_congested(sk)) { + scp->flowloc_sw = DN_SEND; + dn_nsp_send_link(sk, DN_SEND, 0); + } + } + + if (eor) { + if (sk->sk_type == SOCK_SEQPACKET) + break; + if (!(flags & MSG_WAITALL)) + break; + } + + if (flags & MSG_OOB) + break; + + if (copied >= target) + break; + } + + rv = copied; + + + if (eor && (sk->sk_type == SOCK_SEQPACKET)) + msg->msg_flags |= MSG_EOR; + +out: + if (rv == 0) + rv = (flags & MSG_PEEK) ? 
-sk->sk_err : sock_error(sk); + + if ((rv >= 0) && msg->msg_name) { + memcpy(msg->msg_name, &scp->peer, sizeof(struct sockaddr_dn)); + msg->msg_namelen = sizeof(struct sockaddr_dn); + } + + release_sock(sk); + + return rv; +} + + +static inline int dn_queue_too_long(struct dn_scp *scp, struct sk_buff_head *queue, int flags) +{ + unsigned char fctype = scp->services_rem & NSP_FC_MASK; + if (skb_queue_len(queue) >= scp->snd_window) + return 1; + if (fctype != NSP_FC_NONE) { + if (flags & MSG_OOB) { + if (scp->flowrem_oth == 0) + return 1; + } else { + if (scp->flowrem_dat == 0) + return 1; + } + } + return 0; +} + +/* + * The DECnet spec requires that the "routing layer" accepts packets which + * are at least 230 bytes in size. This excludes any headers which the NSP + * layer might add, so we always assume that we'll be using the maximal + * length header on data packets. The variation in length is due to the + * inclusion (or not) of the two 16 bit acknowledgement fields so it doesn't + * make much practical difference. + */ +unsigned dn_mss_from_pmtu(struct net_device *dev, int mtu) +{ + unsigned mss = 230 - DN_MAX_NSP_DATA_HEADER; + if (dev) { + struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr); + mtu -= LL_RESERVED_SPACE(dev); + if (dn_db->use_long) + mtu -= 21; + else + mtu -= 6; + mtu -= DN_MAX_NSP_DATA_HEADER; + } else { + /* + * 21 = long header, 16 = guess at MAC header length + */ + mtu -= (21 + DN_MAX_NSP_DATA_HEADER + 16); + } + if (mtu > mss) + mss = mtu; + return mss; +} + +static inline unsigned int dn_current_mss(struct sock *sk, int flags) +{ + struct dst_entry *dst = __sk_dst_get(sk); + struct dn_scp *scp = DN_SK(sk); + int mss_now = min_t(int, scp->segsize_loc, scp->segsize_rem); + + /* Other data messages are limited to 16 bytes per packet */ + if (flags & MSG_OOB) + return 16; + + /* This works out the maximum size of segment we can send out */ + if (dst) { + u32 mtu = dst_mtu(dst); + mss_now = min_t(int, dn_mss_from_pmtu(dst->dev, mtu), mss_now); + } + + return mss_now; +} + +/* + * N.B. We get the timeout wrong here, but then we always did get it + * wrong before and this is another step along the road to correcting + * it. It ought to get updated each time we pass through the routine, + * but in practise it probably doesn't matter too much for now. 
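+ * (Concretely: dn_alloc_send_pskb() below only passes on whether
+ * MSG_DONTWAIT was set, so each pass of the dn_sendmsg() loop may block
+ * for a freshly computed sndtimeo rather than the remaining timeo.)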
+ */ +static inline struct sk_buff *dn_alloc_send_pskb(struct sock *sk, + unsigned long datalen, int noblock, + int *errcode) +{ + struct sk_buff *skb = sock_alloc_send_skb(sk, datalen, + noblock, errcode); + if (skb) { + skb->protocol = htons(ETH_P_DNA_RT); + skb->pkt_type = PACKET_OUTGOING; + } + return skb; +} + +static int dn_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size) +{ + struct sock *sk = sock->sk; + struct dn_scp *scp = DN_SK(sk); + size_t mss; + struct sk_buff_head *queue = &scp->data_xmit_queue; + int flags = msg->msg_flags; + int err = 0; + size_t sent = 0; + int addr_len = msg->msg_namelen; + struct sockaddr_dn *addr = (struct sockaddr_dn *)msg->msg_name; + struct sk_buff *skb = NULL; + struct dn_skb_cb *cb; + size_t len; + unsigned char fctype; + long timeo; + + if (flags & ~(MSG_TRYHARD|MSG_OOB|MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL|MSG_MORE|MSG_CMSG_COMPAT)) + return -EOPNOTSUPP; + + if (addr_len && (addr_len != sizeof(struct sockaddr_dn))) + return -EINVAL; + + lock_sock(sk); + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + /* + * The only difference between stream sockets and sequenced packet + * sockets is that the stream sockets always behave as if MSG_EOR + * has been set. + */ + if (sock->type == SOCK_STREAM) { + if (flags & MSG_EOR) { + err = -EINVAL; + goto out; + } + flags |= MSG_EOR; + } + + + err = dn_check_state(sk, addr, addr_len, &timeo, flags); + if (err) + goto out_err; + + if (sk->sk_shutdown & SEND_SHUTDOWN) { + err = -EPIPE; + if (!(flags & MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); + goto out_err; + } + + if ((flags & MSG_TRYHARD) && sk->sk_dst_cache) + dst_negative_advice(sk); + + mss = scp->segsize_rem; + fctype = scp->services_rem & NSP_FC_MASK; + + mss = dn_current_mss(sk, flags); + + if (flags & MSG_OOB) { + queue = &scp->other_xmit_queue; + if (size > mss) { + err = -EMSGSIZE; + goto out; + } + } + + scp->persist_fxn = dn_nsp_xmit_timeout; + + while(sent < size) { + err = sock_error(sk); + if (err) + goto out; + + if (signal_pending(current)) { + err = sock_intr_errno(timeo); + goto out; + } + + /* + * Calculate size that we wish to send. + */ + len = size - sent; + + if (len > mss) + len = mss; + + /* + * Wait for queue size to go down below the window + * size. + */ + if (dn_queue_too_long(scp, queue, flags)) { + DEFINE_WAIT(wait); + + if (flags & MSG_DONTWAIT) { + err = -EWOULDBLOCK; + goto out; + } + + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_wait_event(sk, &timeo, + !dn_queue_too_long(scp, queue, flags)); + clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + finish_wait(sk_sleep(sk), &wait); + continue; + } + + /* + * Get a suitably sized skb. + * 64 is a bit of a hack really, but its larger than any + * link-layer headers and has served us well as a good + * guess as to their real length. 
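+ * The skb below is therefore sized len + 64 + DN_MAX_NSP_DATA_HEADER,
+ * and skb_reserve() then claims 64 + DN_MAX_NSP_DATA_HEADER of it as
+ * headroom, so skb_put(skb, len) fills the rest while leaving room for
+ * the NSP data header and link-layer header to be pushed on transmit.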
+ */ + skb = dn_alloc_send_pskb(sk, len + 64 + DN_MAX_NSP_DATA_HEADER, + flags & MSG_DONTWAIT, &err); + + if (err) + break; + + if (!skb) + continue; + + cb = DN_SKB_CB(skb); + + skb_reserve(skb, 64 + DN_MAX_NSP_DATA_HEADER); + + if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { + err = -EFAULT; + goto out; + } + + if (flags & MSG_OOB) { + cb->nsp_flags = 0x30; + if (fctype != NSP_FC_NONE) + scp->flowrem_oth--; + } else { + cb->nsp_flags = 0x00; + if (scp->seg_total == 0) + cb->nsp_flags |= 0x20; + + scp->seg_total += len; + + if (((sent + len) == size) && (flags & MSG_EOR)) { + cb->nsp_flags |= 0x40; + scp->seg_total = 0; + if (fctype == NSP_FC_SCMC) + scp->flowrem_dat--; + } + if (fctype == NSP_FC_SRC) + scp->flowrem_dat--; + } + + sent += len; + dn_nsp_queue_xmit(sk, skb, sk->sk_allocation, flags & MSG_OOB); + skb = NULL; + + scp->persist = dn_nsp_persist(sk); + + } +out: + + kfree_skb(skb); + + release_sock(sk); + + return sent ? sent : err; + +out_err: + err = sk_stream_error(sk, flags, err); + release_sock(sk); + return err; +} + +static int dn_device_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = (struct net_device *)ptr; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + switch(event) { + case NETDEV_UP: + dn_dev_up(dev); + break; + case NETDEV_DOWN: + dn_dev_down(dev); + break; + default: + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block dn_dev_notifier = { + .notifier_call = dn_device_event, +}; + +extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); + +static struct packet_type dn_dix_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_DNA_RT), + .func = dn_route_rcv, +}; + +#ifdef CONFIG_PROC_FS +struct dn_iter_state { + int bucket; +}; + +static struct sock *dn_socket_get_first(struct seq_file *seq) +{ + struct dn_iter_state *state = seq->private; + struct sock *n = NULL; + + for(state->bucket = 0; + state->bucket < DN_SK_HASH_SIZE; + ++state->bucket) { + n = sk_head(&dn_sk_hash[state->bucket]); + if (n) + break; + } + + return n; +} + +static struct sock *dn_socket_get_next(struct seq_file *seq, + struct sock *n) +{ + struct dn_iter_state *state = seq->private; + + n = sk_next(n); +try_again: + if (n) + goto out; + if (++state->bucket >= DN_SK_HASH_SIZE) + goto out; + n = sk_head(&dn_sk_hash[state->bucket]); + goto try_again; +out: + return n; +} + +static struct sock *socket_get_idx(struct seq_file *seq, loff_t *pos) +{ + struct sock *sk = dn_socket_get_first(seq); + + if (sk) { + while(*pos && (sk = dn_socket_get_next(seq, sk))) + --*pos; + } + return *pos ? NULL : sk; +} + +static void *dn_socket_get_idx(struct seq_file *seq, loff_t pos) +{ + void *rc; + read_lock_bh(&dn_hash_lock); + rc = socket_get_idx(seq, &pos); + if (!rc) { + read_unlock_bh(&dn_hash_lock); + } + return rc; +} + +static void *dn_socket_seq_start(struct seq_file *seq, loff_t *pos) +{ + return *pos ? 
dn_socket_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *dn_socket_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + void *rc; + + if (v == SEQ_START_TOKEN) { + rc = dn_socket_get_idx(seq, 0); + goto out; + } + + rc = dn_socket_get_next(seq, v); + if (rc) + goto out; + read_unlock_bh(&dn_hash_lock); +out: + ++*pos; + return rc; +} + +static void dn_socket_seq_stop(struct seq_file *seq, void *v) +{ + if (v && v != SEQ_START_TOKEN) + read_unlock_bh(&dn_hash_lock); +} + +#define IS_NOT_PRINTABLE(x) ((x) < 32 || (x) > 126) + +static void dn_printable_object(struct sockaddr_dn *dn, unsigned char *buf) +{ + int i; + + switch (le16_to_cpu(dn->sdn_objnamel)) { + case 0: + sprintf(buf, "%d", dn->sdn_objnum); + break; + default: + for (i = 0; i < le16_to_cpu(dn->sdn_objnamel); i++) { + buf[i] = dn->sdn_objname[i]; + if (IS_NOT_PRINTABLE(buf[i])) + buf[i] = '.'; + } + buf[i] = 0; + } +} + +static char *dn_state2asc(unsigned char state) +{ + switch(state) { + case DN_O: + return "OPEN"; + case DN_CR: + return " CR"; + case DN_DR: + return " DR"; + case DN_DRC: + return " DRC"; + case DN_CC: + return " CC"; + case DN_CI: + return " CI"; + case DN_NR: + return " NR"; + case DN_NC: + return " NC"; + case DN_CD: + return " CD"; + case DN_RJ: + return " RJ"; + case DN_RUN: + return " RUN"; + case DN_DI: + return " DI"; + case DN_DIC: + return " DIC"; + case DN_DN: + return " DN"; + case DN_CL: + return " CL"; + case DN_CN: + return " CN"; + } + + return "????"; +} + +static inline void dn_socket_format_entry(struct seq_file *seq, struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + char buf1[DN_ASCBUF_LEN]; + char buf2[DN_ASCBUF_LEN]; + char local_object[DN_MAXOBJL+3]; + char remote_object[DN_MAXOBJL+3]; + + dn_printable_object(&scp->addr, local_object); + dn_printable_object(&scp->peer, remote_object); + + seq_printf(seq, + "%6s/%04X %04d:%04d %04d:%04d %01d %-16s " + "%6s/%04X %04d:%04d %04d:%04d %01d %-16s %4s %s\n", + dn_addr2asc(le16_to_cpu(dn_saddr2dn(&scp->addr)), buf1), + scp->addrloc, + scp->numdat, + scp->numoth, + scp->ackxmt_dat, + scp->ackxmt_oth, + scp->flowloc_sw, + local_object, + dn_addr2asc(le16_to_cpu(dn_saddr2dn(&scp->peer)), buf2), + scp->addrrem, + scp->numdat_rcv, + scp->numoth_rcv, + scp->ackrcv_dat, + scp->ackrcv_oth, + scp->flowrem_sw, + remote_object, + dn_state2asc(scp->state), + ((scp->accept_mode == ACC_IMMED) ? 
"IMMED" : "DEFER")); +} + +static int dn_socket_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "Local Remote\n"); + } else { + dn_socket_format_entry(seq, v); + } + return 0; +} + +static const struct seq_operations dn_socket_seq_ops = { + .start = dn_socket_seq_start, + .next = dn_socket_seq_next, + .stop = dn_socket_seq_stop, + .show = dn_socket_seq_show, +}; + +static int dn_socket_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_private(file, &dn_socket_seq_ops, + sizeof(struct dn_iter_state)); +} + +static const struct file_operations dn_socket_seq_fops = { + .owner = THIS_MODULE, + .open = dn_socket_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif + +static const struct net_proto_family dn_family_ops = { + .family = AF_DECnet, + .create = dn_create, + .owner = THIS_MODULE, +}; + +static const struct proto_ops dn_proto_ops = { + .family = AF_DECnet, + .owner = THIS_MODULE, + .release = dn_release, + .bind = dn_bind, + .connect = dn_connect, + .socketpair = sock_no_socketpair, + .accept = dn_accept, + .getname = dn_getname, + .poll = dn_poll, + .ioctl = dn_ioctl, + .listen = dn_listen, + .shutdown = dn_shutdown, + .setsockopt = dn_setsockopt, + .getsockopt = dn_getsockopt, + .sendmsg = dn_sendmsg, + .recvmsg = dn_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +void dn_register_sysctl(void); +void dn_unregister_sysctl(void); + +MODULE_DESCRIPTION("The Linux DECnet Network Protocol"); +MODULE_AUTHOR("Linux DECnet Project Team"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_DECnet); + +static char banner[] __initdata = KERN_INFO "NET4: DECnet for Linux: V.2.5.68s (C) 1995-2003 Linux DECnet Project Team\n"; + +static int __init decnet_init(void) +{ + int rc; + + printk(banner); + + rc = proto_register(&dn_proto, 1); + if (rc != 0) + goto out; + + dn_neigh_init(); + dn_dev_init(); + dn_route_init(); + dn_fib_init(); + + sock_register(&dn_family_ops); + dev_add_pack(&dn_dix_packet_type); + register_netdevice_notifier(&dn_dev_notifier); + + proc_net_fops_create(&init_net, "decnet", S_IRUGO, &dn_socket_seq_fops); + dn_register_sysctl(); +out: + return rc; + +} +module_init(decnet_init); + +/* + * Prevent DECnet module unloading until its fixed properly. + * Requires an audit of the code to check for memory leaks and + * initialisation problems etc. + */ +#if 0 +static void __exit decnet_exit(void) +{ + sock_unregister(AF_DECnet); + rtnl_unregister_all(PF_DECnet); + dev_remove_pack(&dn_dix_packet_type); + + dn_unregister_sysctl(); + + unregister_netdevice_notifier(&dn_dev_notifier); + + dn_route_cleanup(); + dn_dev_cleanup(); + dn_neigh_cleanup(); + dn_fib_cleanup(); + + proc_net_remove(&init_net, "decnet"); + + proto_unregister(&dn_proto); + + rcu_barrier_bh(); /* Wait for completion of call_rcu_bh()'s */ +} +module_exit(decnet_exit); +#endif diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c new file mode 100644 index 00000000..cf26ac74 --- /dev/null +++ b/net/decnet/dn_dev.c @@ -0,0 +1,1445 @@ +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * DECnet Device Layer + * + * Authors: Steve Whitehouse + * Eduardo Marcelo Serrat + * + * Changes: + * Steve Whitehouse : Devices now see incoming frames so they + * can mark on who it came from. 
+ * Steve Whitehouse : Fixed bug in creating neighbours. Each neighbour + * can now have a device specific setup func. + * Steve Whitehouse : Added /proc/sys/net/decnet/conf// + * Steve Whitehouse : Fixed bug which sometimes killed timer + * Steve Whitehouse : Multiple ifaddr support + * Steve Whitehouse : SIOCGIFCONF is now a compile time option + * Steve Whitehouse : /proc/sys/net/decnet/conf//forwarding + * Steve Whitehouse : Removed timer1 - it's a user space issue now + * Patrick Caulfield : Fixed router hello message format + * Steve Whitehouse : Got rid of constant sizes for blksize for + * devices. All mtu based now. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DN_IFREQ_SIZE (sizeof(struct ifreq) - sizeof(struct sockaddr) + sizeof(struct sockaddr_dn)) + +static char dn_rt_all_end_mcast[ETH_ALEN] = {0xAB,0x00,0x00,0x04,0x00,0x00}; +static char dn_rt_all_rt_mcast[ETH_ALEN] = {0xAB,0x00,0x00,0x03,0x00,0x00}; +static char dn_hiord[ETH_ALEN] = {0xAA,0x00,0x04,0x00,0x00,0x00}; +static unsigned char dn_eco_version[3] = {0x02,0x00,0x00}; + +extern struct neigh_table dn_neigh_table; + +/* + * decnet_address is kept in network order. + */ +__le16 decnet_address = 0; + +static DEFINE_SPINLOCK(dndev_lock); +static struct net_device *decnet_default_device; +static BLOCKING_NOTIFIER_HEAD(dnaddr_chain); + +static struct dn_dev *dn_dev_create(struct net_device *dev, int *err); +static void dn_dev_delete(struct net_device *dev); +static void dn_ifaddr_notify(int event, struct dn_ifaddr *ifa); + +static int dn_eth_up(struct net_device *); +static void dn_eth_down(struct net_device *); +static void dn_send_brd_hello(struct net_device *dev, struct dn_ifaddr *ifa); +static void dn_send_ptp_hello(struct net_device *dev, struct dn_ifaddr *ifa); + +static struct dn_dev_parms dn_dev_list[] = { +{ + .type = ARPHRD_ETHER, /* Ethernet */ + .mode = DN_DEV_BCAST, + .state = DN_DEV_S_RU, + .t2 = 1, + .t3 = 10, + .name = "ethernet", + .up = dn_eth_up, + .down = dn_eth_down, + .timer3 = dn_send_brd_hello, +}, +{ + .type = ARPHRD_IPGRE, /* DECnet tunneled over GRE in IP */ + .mode = DN_DEV_BCAST, + .state = DN_DEV_S_RU, + .t2 = 1, + .t3 = 10, + .name = "ipgre", + .timer3 = dn_send_brd_hello, +}, +#if 0 +{ + .type = ARPHRD_X25, /* Bog standard X.25 */ + .mode = DN_DEV_UCAST, + .state = DN_DEV_S_DS, + .t2 = 1, + .t3 = 120, + .name = "x25", + .timer3 = dn_send_ptp_hello, +}, +#endif +#if 0 +{ + .type = ARPHRD_PPP, /* DECnet over PPP */ + .mode = DN_DEV_BCAST, + .state = DN_DEV_S_RU, + .t2 = 1, + .t3 = 10, + .name = "ppp", + .timer3 = dn_send_brd_hello, +}, +#endif +{ + .type = ARPHRD_DDCMP, /* DECnet over DDCMP */ + .mode = DN_DEV_UCAST, + .state = DN_DEV_S_DS, + .t2 = 1, + .t3 = 120, + .name = "ddcmp", + .timer3 = dn_send_ptp_hello, +}, +{ + .type = ARPHRD_LOOPBACK, /* Loopback interface - always last */ + .mode = DN_DEV_BCAST, + .state = DN_DEV_S_RU, + .t2 = 1, + .t3 = 10, + .name = "loopback", + .timer3 = dn_send_brd_hello, +} +}; + +#define DN_DEV_LIST_SIZE ARRAY_SIZE(dn_dev_list) + +#define DN_DEV_PARMS_OFFSET(x) offsetof(struct dn_dev_parms, x) + +#ifdef CONFIG_SYSCTL + +static int min_t2[] = { 1 }; +static int max_t2[] = { 60 }; /* No max specified, but this seems sensible */ +static int min_t3[] = { 1 }; +static int max_t3[] = { 8191 }; /* 
Must fit in 16 bits when multiplied by BCT3MULT or T3MULT */ + +static int min_priority[1]; +static int max_priority[] = { 127 }; /* From DECnet spec */ + +static int dn_forwarding_proc(ctl_table *, int, + void __user *, size_t *, loff_t *); +static struct dn_dev_sysctl_table { + struct ctl_table_header *sysctl_header; + ctl_table dn_dev_vars[5]; +} dn_dev_sysctl = { + NULL, + { + { + .procname = "forwarding", + .data = (void *)DN_DEV_PARMS_OFFSET(forwarding), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = dn_forwarding_proc, + }, + { + .procname = "priority", + .data = (void *)DN_DEV_PARMS_OFFSET(priority), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_priority, + .extra2 = &max_priority + }, + { + .procname = "t2", + .data = (void *)DN_DEV_PARMS_OFFSET(t2), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_t2, + .extra2 = &max_t2 + }, + { + .procname = "t3", + .data = (void *)DN_DEV_PARMS_OFFSET(t3), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_t3, + .extra2 = &max_t3 + }, + {0} + }, +}; + +static void dn_dev_sysctl_register(struct net_device *dev, struct dn_dev_parms *parms) +{ + struct dn_dev_sysctl_table *t; + int i; + +#define DN_CTL_PATH_DEV 3 + + struct ctl_path dn_ctl_path[] = { + { .procname = "net", }, + { .procname = "decnet", }, + { .procname = "conf", }, + { /* to be set */ }, + { }, + }; + + t = kmemdup(&dn_dev_sysctl, sizeof(*t), GFP_KERNEL); + if (t == NULL) + return; + + for(i = 0; i < ARRAY_SIZE(t->dn_dev_vars) - 1; i++) { + long offset = (long)t->dn_dev_vars[i].data; + t->dn_dev_vars[i].data = ((char *)parms) + offset; + } + + if (dev) { + dn_ctl_path[DN_CTL_PATH_DEV].procname = dev->name; + } else { + dn_ctl_path[DN_CTL_PATH_DEV].procname = parms->name; + } + + t->dn_dev_vars[0].extra1 = (void *)dev; + + t->sysctl_header = register_sysctl_paths(dn_ctl_path, t->dn_dev_vars); + if (t->sysctl_header == NULL) + kfree(t); + else + parms->sysctl = t; +} + +static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms) +{ + if (parms->sysctl) { + struct dn_dev_sysctl_table *t = parms->sysctl; + parms->sysctl = NULL; + unregister_sysctl_table(t->sysctl_header); + kfree(t); + } +} + +static int dn_forwarding_proc(ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ +#ifdef CONFIG_DECNET_ROUTER + struct net_device *dev = table->extra1; + struct dn_dev *dn_db; + int err; + int tmp, old; + + if (table->extra1 == NULL) + return -EINVAL; + + dn_db = rcu_dereference_raw(dev->dn_ptr); + old = dn_db->parms.forwarding; + + err = proc_dointvec(table, write, buffer, lenp, ppos); + + if ((err >= 0) && write) { + if (dn_db->parms.forwarding < 0) + dn_db->parms.forwarding = 0; + if (dn_db->parms.forwarding > 2) + dn_db->parms.forwarding = 2; + /* + * What an ugly hack this is... its works, just. 
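+ * (The down()/up() cycle below makes the device give up its old
+ * endnode/router role and re-register with the multicast membership and
+ * hello type that match the new forwarding setting.)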
It + * would be nice if sysctl/proc were just that little + * bit more flexible so I don't have to write a special + * routine, or suffer hacks like this - SJW + */ + tmp = dn_db->parms.forwarding; + dn_db->parms.forwarding = old; + if (dn_db->parms.down) + dn_db->parms.down(dev); + dn_db->parms.forwarding = tmp; + if (dn_db->parms.up) + dn_db->parms.up(dev); + } + + return err; +#else + return -EINVAL; +#endif +} + +#else /* CONFIG_SYSCTL */ +static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms) +{ +} +static void dn_dev_sysctl_register(struct net_device *dev, struct dn_dev_parms *parms) +{ +} + +#endif /* CONFIG_SYSCTL */ + +static inline __u16 mtu2blksize(struct net_device *dev) +{ + u32 blksize = dev->mtu; + if (blksize > 0xffff) + blksize = 0xffff; + + if (dev->type == ARPHRD_ETHER || + dev->type == ARPHRD_PPP || + dev->type == ARPHRD_IPGRE || + dev->type == ARPHRD_LOOPBACK) + blksize -= 2; + + return (__u16)blksize; +} + +static struct dn_ifaddr *dn_dev_alloc_ifa(void) +{ + struct dn_ifaddr *ifa; + + ifa = kzalloc(sizeof(*ifa), GFP_KERNEL); + + return ifa; +} + +static void dn_dev_free_ifa(struct dn_ifaddr *ifa) +{ + kfree_rcu(ifa, rcu); +} + +static void dn_dev_del_ifa(struct dn_dev *dn_db, struct dn_ifaddr __rcu **ifap, int destroy) +{ + struct dn_ifaddr *ifa1 = rtnl_dereference(*ifap); + unsigned char mac_addr[6]; + struct net_device *dev = dn_db->dev; + + ASSERT_RTNL(); + + *ifap = ifa1->ifa_next; + + if (dn_db->dev->type == ARPHRD_ETHER) { + if (ifa1->ifa_local != dn_eth2dn(dev->dev_addr)) { + dn_dn2eth(mac_addr, ifa1->ifa_local); + dev_mc_del(dev, mac_addr); + } + } + + dn_ifaddr_notify(RTM_DELADDR, ifa1); + blocking_notifier_call_chain(&dnaddr_chain, NETDEV_DOWN, ifa1); + if (destroy) { + dn_dev_free_ifa(ifa1); + + if (dn_db->ifa_list == NULL) + dn_dev_delete(dn_db->dev); + } +} + +static int dn_dev_insert_ifa(struct dn_dev *dn_db, struct dn_ifaddr *ifa) +{ + struct net_device *dev = dn_db->dev; + struct dn_ifaddr *ifa1; + unsigned char mac_addr[6]; + + ASSERT_RTNL(); + + /* Check for duplicates */ + for (ifa1 = rtnl_dereference(dn_db->ifa_list); + ifa1 != NULL; + ifa1 = rtnl_dereference(ifa1->ifa_next)) { + if (ifa1->ifa_local == ifa->ifa_local) + return -EEXIST; + } + + if (dev->type == ARPHRD_ETHER) { + if (ifa->ifa_local != dn_eth2dn(dev->dev_addr)) { + dn_dn2eth(mac_addr, ifa->ifa_local); + dev_mc_add(dev, mac_addr); + } + } + + ifa->ifa_next = dn_db->ifa_list; + rcu_assign_pointer(dn_db->ifa_list, ifa); + + dn_ifaddr_notify(RTM_NEWADDR, ifa); + blocking_notifier_call_chain(&dnaddr_chain, NETDEV_UP, ifa); + + return 0; +} + +static int dn_dev_set_ifa(struct net_device *dev, struct dn_ifaddr *ifa) +{ + struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr); + int rv; + + if (dn_db == NULL) { + int err; + dn_db = dn_dev_create(dev, &err); + if (dn_db == NULL) + return err; + } + + ifa->ifa_dev = dn_db; + + if (dev->flags & IFF_LOOPBACK) + ifa->ifa_scope = RT_SCOPE_HOST; + + rv = dn_dev_insert_ifa(dn_db, ifa); + if (rv) + dn_dev_free_ifa(ifa); + return rv; +} + + +int dn_dev_ioctl(unsigned int cmd, void __user *arg) +{ + char buffer[DN_IFREQ_SIZE]; + struct ifreq *ifr = (struct ifreq *)buffer; + struct sockaddr_dn *sdn = (struct sockaddr_dn *)&ifr->ifr_addr; + struct dn_dev *dn_db; + struct net_device *dev; + struct dn_ifaddr *ifa = NULL; + struct dn_ifaddr __rcu **ifap = NULL; + int ret = 0; + + if (copy_from_user(ifr, arg, DN_IFREQ_SIZE)) + return -EFAULT; + ifr->ifr_name[IFNAMSIZ-1] = 0; + + dev_load(&init_net, ifr->ifr_name); + + switch(cmd) { + case 
SIOCGIFADDR: + break; + case SIOCSIFADDR: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + if (sdn->sdn_family != AF_DECnet) + return -EINVAL; + break; + default: + return -EINVAL; + } + + rtnl_lock(); + + if ((dev = __dev_get_by_name(&init_net, ifr->ifr_name)) == NULL) { + ret = -ENODEV; + goto done; + } + + if ((dn_db = rtnl_dereference(dev->dn_ptr)) != NULL) { + for (ifap = &dn_db->ifa_list; + (ifa = rtnl_dereference(*ifap)) != NULL; + ifap = &ifa->ifa_next) + if (strcmp(ifr->ifr_name, ifa->ifa_label) == 0) + break; + } + + if (ifa == NULL && cmd != SIOCSIFADDR) { + ret = -EADDRNOTAVAIL; + goto done; + } + + switch(cmd) { + case SIOCGIFADDR: + *((__le16 *)sdn->sdn_nodeaddr) = ifa->ifa_local; + goto rarok; + + case SIOCSIFADDR: + if (!ifa) { + if ((ifa = dn_dev_alloc_ifa()) == NULL) { + ret = -ENOBUFS; + break; + } + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + } else { + if (ifa->ifa_local == dn_saddr2dn(sdn)) + break; + dn_dev_del_ifa(dn_db, ifap, 0); + } + + ifa->ifa_local = ifa->ifa_address = dn_saddr2dn(sdn); + + ret = dn_dev_set_ifa(dev, ifa); + } +done: + rtnl_unlock(); + + return ret; +rarok: + if (copy_to_user(arg, ifr, DN_IFREQ_SIZE)) + ret = -EFAULT; + goto done; +} + +struct net_device *dn_dev_get_default(void) +{ + struct net_device *dev; + + spin_lock(&dndev_lock); + dev = decnet_default_device; + if (dev) { + if (dev->dn_ptr) + dev_hold(dev); + else + dev = NULL; + } + spin_unlock(&dndev_lock); + + return dev; +} + +int dn_dev_set_default(struct net_device *dev, int force) +{ + struct net_device *old = NULL; + int rv = -EBUSY; + if (!dev->dn_ptr) + return -ENODEV; + + spin_lock(&dndev_lock); + if (force || decnet_default_device == NULL) { + old = decnet_default_device; + decnet_default_device = dev; + rv = 0; + } + spin_unlock(&dndev_lock); + + if (old) + dev_put(old); + return rv; +} + +static void dn_dev_check_default(struct net_device *dev) +{ + spin_lock(&dndev_lock); + if (dev == decnet_default_device) { + decnet_default_device = NULL; + } else { + dev = NULL; + } + spin_unlock(&dndev_lock); + + if (dev) + dev_put(dev); +} + +/* + * Called with RTNL + */ +static struct dn_dev *dn_dev_by_index(int ifindex) +{ + struct net_device *dev; + struct dn_dev *dn_dev = NULL; + + dev = __dev_get_by_index(&init_net, ifindex); + if (dev) + dn_dev = rtnl_dereference(dev->dn_ptr); + + return dn_dev; +} + +static const struct nla_policy dn_ifa_policy[IFA_MAX+1] = { + [IFA_ADDRESS] = { .type = NLA_U16 }, + [IFA_LOCAL] = { .type = NLA_U16 }, + [IFA_LABEL] = { .type = NLA_STRING, + .len = IFNAMSIZ - 1 }, +}; + +static int dn_nl_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[IFA_MAX+1]; + struct dn_dev *dn_db; + struct ifaddrmsg *ifm; + struct dn_ifaddr *ifa; + struct dn_ifaddr __rcu **ifap; + int err = -EINVAL; + + if (!net_eq(net, &init_net)) + goto errout; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, dn_ifa_policy); + if (err < 0) + goto errout; + + err = -ENODEV; + ifm = nlmsg_data(nlh); + if ((dn_db = dn_dev_by_index(ifm->ifa_index)) == NULL) + goto errout; + + err = -EADDRNOTAVAIL; + for (ifap = &dn_db->ifa_list; + (ifa = rtnl_dereference(*ifap)) != NULL; + ifap = &ifa->ifa_next) { + if (tb[IFA_LOCAL] && + nla_memcmp(tb[IFA_LOCAL], &ifa->ifa_local, 2)) + continue; + + if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], ifa->ifa_label)) + continue; + + dn_dev_del_ifa(dn_db, ifap, 1); + return 0; + } + +errout: + return err; +} + +static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, 
void *arg) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[IFA_MAX+1]; + struct net_device *dev; + struct dn_dev *dn_db; + struct ifaddrmsg *ifm; + struct dn_ifaddr *ifa; + int err; + + if (!net_eq(net, &init_net)) + return -EINVAL; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, dn_ifa_policy); + if (err < 0) + return err; + + if (tb[IFA_LOCAL] == NULL) + return -EINVAL; + + ifm = nlmsg_data(nlh); + if ((dev = __dev_get_by_index(&init_net, ifm->ifa_index)) == NULL) + return -ENODEV; + + if ((dn_db = rtnl_dereference(dev->dn_ptr)) == NULL) { + dn_db = dn_dev_create(dev, &err); + if (!dn_db) + return err; + } + + if ((ifa = dn_dev_alloc_ifa()) == NULL) + return -ENOBUFS; + + if (tb[IFA_ADDRESS] == NULL) + tb[IFA_ADDRESS] = tb[IFA_LOCAL]; + + ifa->ifa_local = nla_get_le16(tb[IFA_LOCAL]); + ifa->ifa_address = nla_get_le16(tb[IFA_ADDRESS]); + ifa->ifa_flags = ifm->ifa_flags; + ifa->ifa_scope = ifm->ifa_scope; + ifa->ifa_dev = dn_db; + + if (tb[IFA_LABEL]) + nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); + else + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + + err = dn_dev_insert_ifa(dn_db, ifa); + if (err) + dn_dev_free_ifa(ifa); + + return err; +} + +static inline size_t dn_ifaddr_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ + + nla_total_size(2) /* IFA_ADDRESS */ + + nla_total_size(2); /* IFA_LOCAL */ +} + +static int dn_nl_fill_ifaddr(struct sk_buff *skb, struct dn_ifaddr *ifa, + u32 pid, u32 seq, int event, unsigned int flags) +{ + struct ifaddrmsg *ifm; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, pid, seq, event, sizeof(*ifm), flags); + if (nlh == NULL) + return -EMSGSIZE; + + ifm = nlmsg_data(nlh); + ifm->ifa_family = AF_DECnet; + ifm->ifa_prefixlen = 16; + ifm->ifa_flags = ifa->ifa_flags | IFA_F_PERMANENT; + ifm->ifa_scope = ifa->ifa_scope; + ifm->ifa_index = ifa->ifa_dev->dev->ifindex; + + if (ifa->ifa_address) + NLA_PUT_LE16(skb, IFA_ADDRESS, ifa->ifa_address); + if (ifa->ifa_local) + NLA_PUT_LE16(skb, IFA_LOCAL, ifa->ifa_local); + if (ifa->ifa_label[0]) + NLA_PUT_STRING(skb, IFA_LABEL, ifa->ifa_label); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static void dn_ifaddr_notify(int event, struct dn_ifaddr *ifa) +{ + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = alloc_skb(dn_ifaddr_nlmsg_size(), GFP_KERNEL); + if (skb == NULL) + goto errout; + + err = dn_nl_fill_ifaddr(skb, ifa, 0, 0, event, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in dn_ifaddr_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, &init_net, 0, RTNLGRP_DECnet_IFADDR, NULL, GFP_KERNEL); + return; +errout: + if (err < 0) + rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_IFADDR, err); +} + +static int dn_nl_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + int idx, dn_idx = 0, skip_ndevs, skip_naddr; + struct net_device *dev; + struct dn_dev *dn_db; + struct dn_ifaddr *ifa; + + if (!net_eq(net, &init_net)) + return 0; + + skip_ndevs = cb->args[0]; + skip_naddr = cb->args[1]; + + idx = 0; + rcu_read_lock(); + for_each_netdev_rcu(&init_net, dev) { + if (idx < skip_ndevs) + goto cont; + else if (idx > skip_ndevs) { + /* Only skip over addresses for first dev dumped + * in this iteration (idx == skip_ndevs) */ + skip_naddr = 0; + } + + if ((dn_db = rcu_dereference(dev->dn_ptr)) == NULL) + goto cont; + + for (ifa = rcu_dereference(dn_db->ifa_list), dn_idx = 0; 
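+/*
+ * Dump-resume note: netlink dumps arrive in multiple passes, so the
+ * callback state in cb->args[] records how far the previous pass got.
+ * args[0] (skip_ndevs) is the device index the last pass stopped at
+ * (earlier devices are skipped outright) and args[1] (skip_naddr) is
+ * the number of that device's addresses already emitted; both are
+ * written back at the "done" label below.
+ */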
ifa; + ifa = rcu_dereference(ifa->ifa_next), dn_idx++) { + if (dn_idx < skip_naddr) + continue; + + if (dn_nl_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWADDR, + NLM_F_MULTI) < 0) + goto done; + } +cont: + idx++; + } +done: + rcu_read_unlock(); + cb->args[0] = idx; + cb->args[1] = dn_idx; + + return skb->len; +} + +static int dn_dev_get_first(struct net_device *dev, __le16 *addr) +{ + struct dn_dev *dn_db; + struct dn_ifaddr *ifa; + int rv = -ENODEV; + + rcu_read_lock(); + dn_db = rcu_dereference(dev->dn_ptr); + if (dn_db == NULL) + goto out; + + ifa = rcu_dereference(dn_db->ifa_list); + if (ifa != NULL) { + *addr = ifa->ifa_local; + rv = 0; + } +out: + rcu_read_unlock(); + return rv; +} + +/* + * Find a default address to bind to. + * + * This is one of those areas where the initial VMS concepts don't really + * map onto the Linux concepts, and since we introduced multiple addresses + * per interface we have to cope with slightly odd ways of finding out what + * "our address" really is. Mostly it's not a problem; for this we just guess + * a sensible default. Eventually the routing code will take care of all the + * nasties for us I hope. + */ +int dn_dev_bind_default(__le16 *addr) +{ + struct net_device *dev; + int rv; + dev = dn_dev_get_default(); +last_chance: + if (dev) { + rv = dn_dev_get_first(dev, addr); + dev_put(dev); + if (rv == 0 || dev == init_net.loopback_dev) + return rv; + } + dev = init_net.loopback_dev; + dev_hold(dev); + goto last_chance; +} + +static void dn_send_endnode_hello(struct net_device *dev, struct dn_ifaddr *ifa) +{ + struct endnode_hello_message *msg; + struct sk_buff *skb = NULL; + __le16 *pktlen; + struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr); + + if ((skb = dn_alloc_skb(NULL, sizeof(*msg), GFP_ATOMIC)) == NULL) + return; + + skb->dev = dev; + + msg = (struct endnode_hello_message *)skb_put(skb,sizeof(*msg)); + + msg->msgflg = 0x0D; + memcpy(msg->tiver, dn_eco_version, 3); + dn_dn2eth(msg->id, ifa->ifa_local); + msg->iinfo = DN_RT_INFO_ENDN; + msg->blksize = cpu_to_le16(mtu2blksize(dev)); + msg->area = 0x00; + memset(msg->seed, 0, 8); + memcpy(msg->neighbor, dn_hiord, ETH_ALEN); + + if (dn_db->router) { + struct dn_neigh *dn = (struct dn_neigh *)dn_db->router; + dn_dn2eth(msg->neighbor, dn->addr); + } + + msg->timer = cpu_to_le16((unsigned short)dn_db->parms.t3); + msg->mpd = 0x00; + msg->datalen = 0x02; + memset(msg->data, 0xAA, 2); + + pktlen = (__le16 *)skb_push(skb,2); + *pktlen = cpu_to_le16(skb->len - 2); + + skb_reset_network_header(skb); + + dn_rt_finish_output(skb, dn_rt_all_rt_mcast, msg->id); +} + + +#define DRDELAY (5 * HZ) + +static int dn_am_i_a_router(struct dn_neigh *dn, struct dn_dev *dn_db, struct dn_ifaddr *ifa) +{ + /* First check time since device went up */ + if ((jiffies - dn_db->uptime) < DRDELAY) + return 0; + + /* If there is no router, then yes... */ + if (!dn_db->router) + return 1; + + /* otherwise only if we have a higher priority or.. 
*/ + if (dn->priority < dn_db->parms.priority) + return 1; + + /* if we have equal priority and a higher node number */ + if (dn->priority != dn_db->parms.priority) + return 0; + + if (le16_to_cpu(dn->addr) < le16_to_cpu(ifa->ifa_local)) + return 1; + + return 0; +} + +static void dn_send_router_hello(struct net_device *dev, struct dn_ifaddr *ifa) +{ + int n; + struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr); + struct dn_neigh *dn = (struct dn_neigh *)dn_db->router; + struct sk_buff *skb; + size_t size; + unsigned char *ptr; + unsigned char *i1, *i2; + __le16 *pktlen; + char *src; + + if (mtu2blksize(dev) < (26 + 7)) + return; + + n = mtu2blksize(dev) - 26; + n /= 7; + + if (n > 32) + n = 32; + + size = 2 + 26 + 7 * n; + + if ((skb = dn_alloc_skb(NULL, size, GFP_ATOMIC)) == NULL) + return; + + skb->dev = dev; + ptr = skb_put(skb, size); + + *ptr++ = DN_RT_PKT_CNTL | DN_RT_PKT_ERTH; + *ptr++ = 2; /* ECO */ + *ptr++ = 0; + *ptr++ = 0; + dn_dn2eth(ptr, ifa->ifa_local); + src = ptr; + ptr += ETH_ALEN; + *ptr++ = dn_db->parms.forwarding == 1 ? + DN_RT_INFO_L1RT : DN_RT_INFO_L2RT; + *((__le16 *)ptr) = cpu_to_le16(mtu2blksize(dev)); + ptr += 2; + *ptr++ = dn_db->parms.priority; /* Priority */ + *ptr++ = 0; /* Area: Reserved */ + *((__le16 *)ptr) = cpu_to_le16((unsigned short)dn_db->parms.t3); + ptr += 2; + *ptr++ = 0; /* MPD: Reserved */ + i1 = ptr++; + memset(ptr, 0, 7); /* Name: Reserved */ + ptr += 7; + i2 = ptr++; + + n = dn_neigh_elist(dev, ptr, n); + + *i2 = 7 * n; + *i1 = 8 + *i2; + + skb_trim(skb, (27 + *i2)); + + pktlen = (__le16 *)skb_push(skb, 2); + *pktlen = cpu_to_le16(skb->len - 2); + + skb_reset_network_header(skb); + + if (dn_am_i_a_router(dn, dn_db, ifa)) { + struct sk_buff *skb2 = skb_copy(skb, GFP_ATOMIC); + if (skb2) { + dn_rt_finish_output(skb2, dn_rt_all_end_mcast, src); + } + } + + dn_rt_finish_output(skb, dn_rt_all_rt_mcast, src); +} + +static void dn_send_brd_hello(struct net_device *dev, struct dn_ifaddr *ifa) +{ + struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr); + + if (dn_db->parms.forwarding == 0) + dn_send_endnode_hello(dev, ifa); + else + dn_send_router_hello(dev, ifa); +} + +static void dn_send_ptp_hello(struct net_device *dev, struct dn_ifaddr *ifa) +{ + int tdlen = 16; + int size = dev->hard_header_len + 2 + 4 + tdlen; + struct sk_buff *skb = dn_alloc_skb(NULL, size, GFP_ATOMIC); + int i; + unsigned char *ptr; + char src[ETH_ALEN]; + + if (skb == NULL) + return ; + + skb->dev = dev; + skb_push(skb, dev->hard_header_len); + ptr = skb_put(skb, 2 + 4 + tdlen); + + *ptr++ = DN_RT_PKT_HELO; + *((__le16 *)ptr) = ifa->ifa_local; + ptr += 2; + *ptr++ = tdlen; + + for(i = 0; i < tdlen; i++) + *ptr++ = 0252; + + dn_dn2eth(src, ifa->ifa_local); + dn_rt_finish_output(skb, dn_rt_all_rt_mcast, src); +} + +static int dn_eth_up(struct net_device *dev) +{ + struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr); + + if (dn_db->parms.forwarding == 0) + dev_mc_add(dev, dn_rt_all_end_mcast); + else + dev_mc_add(dev, dn_rt_all_rt_mcast); + + dn_db->use_long = 1; + + return 0; +} + +static void dn_eth_down(struct net_device *dev) +{ + struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr); + + if (dn_db->parms.forwarding == 0) + dev_mc_del(dev, dn_rt_all_end_mcast); + else + dev_mc_del(dev, dn_rt_all_rt_mcast); +} + +static void dn_dev_set_timer(struct net_device *dev); + +static void dn_dev_timer_func(unsigned long arg) +{ + struct net_device *dev = (struct net_device *)arg; + struct dn_dev *dn_db; + struct dn_ifaddr *ifa; + + rcu_read_lock(); + dn_db = 
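+/*
+ * Timer note: dn_dev_set_timer() below re-arms this timer every
+ * parms.t2 seconds.  dn_db->t3 is counted down in t2-sized steps;
+ * once it falls to t2 or below, the per-address parms.timer3
+ * callback is invoked for every non-secondary address on the
+ * device and t3 is reset to parms.t3.
+ */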
rcu_dereference(dev->dn_ptr); + if (dn_db->t3 <= dn_db->parms.t2) { + if (dn_db->parms.timer3) { + for (ifa = rcu_dereference(dn_db->ifa_list); + ifa; + ifa = rcu_dereference(ifa->ifa_next)) { + if (!(ifa->ifa_flags & IFA_F_SECONDARY)) + dn_db->parms.timer3(dev, ifa); + } + } + dn_db->t3 = dn_db->parms.t3; + } else { + dn_db->t3 -= dn_db->parms.t2; + } + rcu_read_unlock(); + dn_dev_set_timer(dev); +} + +static void dn_dev_set_timer(struct net_device *dev) +{ + struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr); + + if (dn_db->parms.t2 > dn_db->parms.t3) + dn_db->parms.t2 = dn_db->parms.t3; + + dn_db->timer.data = (unsigned long)dev; + dn_db->timer.function = dn_dev_timer_func; + dn_db->timer.expires = jiffies + (dn_db->parms.t2 * HZ); + + add_timer(&dn_db->timer); +} + +static struct dn_dev *dn_dev_create(struct net_device *dev, int *err) +{ + int i; + struct dn_dev_parms *p = dn_dev_list; + struct dn_dev *dn_db; + + for(i = 0; i < DN_DEV_LIST_SIZE; i++, p++) { + if (p->type == dev->type) + break; + } + + *err = -ENODEV; + if (i == DN_DEV_LIST_SIZE) + return NULL; + + *err = -ENOBUFS; + if ((dn_db = kzalloc(sizeof(struct dn_dev), GFP_ATOMIC)) == NULL) + return NULL; + + memcpy(&dn_db->parms, p, sizeof(struct dn_dev_parms)); + + rcu_assign_pointer(dev->dn_ptr, dn_db); + dn_db->dev = dev; + init_timer(&dn_db->timer); + + dn_db->uptime = jiffies; + + dn_db->neigh_parms = neigh_parms_alloc(dev, &dn_neigh_table); + if (!dn_db->neigh_parms) { + rcu_assign_pointer(dev->dn_ptr, NULL); + kfree(dn_db); + return NULL; + } + + if (dn_db->parms.up) { + if (dn_db->parms.up(dev) < 0) { + neigh_parms_release(&dn_neigh_table, dn_db->neigh_parms); + dev->dn_ptr = NULL; + kfree(dn_db); + return NULL; + } + } + + dn_dev_sysctl_register(dev, &dn_db->parms); + + dn_dev_set_timer(dev); + + *err = 0; + return dn_db; +} + + +/* + * This processes a device up event. We only start up + * the loopback device & ethernet devices with correct + * MAC addresses automatically. Others must be started + * specifically. + * + * FIXME: How should we configure the loopback address ? If we could dispense + * with using decnet_address here and for autobind, it will be one less thing + * for users to worry about setting up. + */ + +void dn_dev_up(struct net_device *dev) +{ + struct dn_ifaddr *ifa; + __le16 addr = decnet_address; + int maybe_default = 0; + struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr); + + if ((dev->type != ARPHRD_ETHER) && (dev->type != ARPHRD_LOOPBACK)) + return; + + /* + * Need to ensure that loopback device has a dn_db attached to it + * to allow creation of neighbours against it, even though it might + * not have a local address of its own. Might as well do the same for + * all autoconfigured interfaces. + */ + if (dn_db == NULL) { + int err; + dn_db = dn_dev_create(dev, &err); + if (dn_db == NULL) + return; + } + + if (dev->type == ARPHRD_ETHER) { + if (memcmp(dev->dev_addr, dn_hiord, 4) != 0) + return; + addr = dn_eth2dn(dev->dev_addr); + maybe_default = 1; + } + + if (addr == 0) + return; + + if ((ifa = dn_dev_alloc_ifa()) == NULL) + return; + + ifa->ifa_local = ifa->ifa_address = addr; + ifa->ifa_flags = 0; + ifa->ifa_scope = RT_SCOPE_UNIVERSE; + strcpy(ifa->ifa_label, dev->name); + + dn_dev_set_ifa(dev, ifa); + + /* + * Automagically set the default device to the first automatically + * configured ethernet card in the system. 
+ */ + if (maybe_default) { + dev_hold(dev); + if (dn_dev_set_default(dev, 0)) + dev_put(dev); + } +} + +static void dn_dev_delete(struct net_device *dev) +{ + struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr); + + if (dn_db == NULL) + return; + + del_timer_sync(&dn_db->timer); + dn_dev_sysctl_unregister(&dn_db->parms); + dn_dev_check_default(dev); + neigh_ifdown(&dn_neigh_table, dev); + + if (dn_db->parms.down) + dn_db->parms.down(dev); + + dev->dn_ptr = NULL; + + neigh_parms_release(&dn_neigh_table, dn_db->neigh_parms); + neigh_ifdown(&dn_neigh_table, dev); + + if (dn_db->router) + neigh_release(dn_db->router); + if (dn_db->peer) + neigh_release(dn_db->peer); + + kfree(dn_db); +} + +void dn_dev_down(struct net_device *dev) +{ + struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr); + struct dn_ifaddr *ifa; + + if (dn_db == NULL) + return; + + while ((ifa = rtnl_dereference(dn_db->ifa_list)) != NULL) { + dn_dev_del_ifa(dn_db, &dn_db->ifa_list, 0); + dn_dev_free_ifa(ifa); + } + + dn_dev_delete(dev); +} + +void dn_dev_init_pkt(struct sk_buff *skb) +{ +} + +void dn_dev_veri_pkt(struct sk_buff *skb) +{ +} + +void dn_dev_hello(struct sk_buff *skb) +{ +} + +void dn_dev_devices_off(void) +{ + struct net_device *dev; + + rtnl_lock(); + for_each_netdev(&init_net, dev) + dn_dev_down(dev); + rtnl_unlock(); + +} + +void dn_dev_devices_on(void) +{ + struct net_device *dev; + + rtnl_lock(); + for_each_netdev(&init_net, dev) { + if (dev->flags & IFF_UP) + dn_dev_up(dev); + } + rtnl_unlock(); +} + +int register_dnaddr_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&dnaddr_chain, nb); +} + +int unregister_dnaddr_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&dnaddr_chain, nb); +} + +#ifdef CONFIG_PROC_FS +static inline int is_dn_dev(struct net_device *dev) +{ + return dev->dn_ptr != NULL; +} + +static void *dn_dev_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + int i; + struct net_device *dev; + + rcu_read_lock(); + + if (*pos == 0) + return SEQ_START_TOKEN; + + i = 1; + for_each_netdev_rcu(&init_net, dev) { + if (!is_dn_dev(dev)) + continue; + + if (i++ == *pos) + return dev; + } + + return NULL; +} + +static void *dn_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct net_device *dev; + + ++*pos; + + dev = (struct net_device *)v; + if (v == SEQ_START_TOKEN) + dev = net_device_entry(&init_net.dev_base_head); + + for_each_netdev_continue_rcu(&init_net, dev) { + if (!is_dn_dev(dev)) + continue; + + return dev; + } + + return NULL; +} + +static void dn_dev_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + +static char *dn_type2asc(char type) +{ + switch(type) { + case DN_DEV_BCAST: + return "B"; + case DN_DEV_UCAST: + return "U"; + case DN_DEV_MPOINT: + return "M"; + } + + return "?"; +} + +static int dn_dev_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "Name Flags T1 Timer1 T3 Timer3 BlkSize Pri State DevType Router Peer\n"); + else { + struct net_device *dev = v; + char peer_buf[DN_ASCBUF_LEN]; + char router_buf[DN_ASCBUF_LEN]; + struct dn_dev *dn_db = rcu_dereference(dev->dn_ptr); + + seq_printf(seq, "%-8s %1s %04u %04u %04lu %04lu" + " %04hu %03d %02x %-10s %-7s %-7s\n", + dev->name ? dev->name : "???", + dn_type2asc(dn_db->parms.mode), + 0, 0, + dn_db->t3, dn_db->parms.t3, + mtu2blksize(dev), + dn_db->parms.priority, + dn_db->parms.state, dn_db->parms.name, + dn_db->router ? 
dn_addr2asc(le16_to_cpu(*(__le16 *)dn_db->router->primary_key), router_buf) : "", + dn_db->peer ? dn_addr2asc(le16_to_cpu(*(__le16 *)dn_db->peer->primary_key), peer_buf) : ""); + } + return 0; +} + +static const struct seq_operations dn_dev_seq_ops = { + .start = dn_dev_seq_start, + .next = dn_dev_seq_next, + .stop = dn_dev_seq_stop, + .show = dn_dev_seq_show, +}; + +static int dn_dev_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &dn_dev_seq_ops); +} + +static const struct file_operations dn_dev_seq_fops = { + .owner = THIS_MODULE, + .open = dn_dev_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif /* CONFIG_PROC_FS */ + +static int addr[2]; +module_param_array(addr, int, NULL, 0444); +MODULE_PARM_DESC(addr, "The DECnet address of this machine: area,node"); + +void __init dn_dev_init(void) +{ + if (addr[0] > 63 || addr[0] < 0) { + printk(KERN_ERR "DECnet: Area must be between 0 and 63"); + return; + } + + if (addr[1] > 1023 || addr[1] < 0) { + printk(KERN_ERR "DECnet: Node must be between 0 and 1023"); + return; + } + + decnet_address = cpu_to_le16((addr[0] << 10) | addr[1]); + + dn_dev_devices_on(); + + rtnl_register(PF_DECnet, RTM_NEWADDR, dn_nl_newaddr, NULL); + rtnl_register(PF_DECnet, RTM_DELADDR, dn_nl_deladdr, NULL); + rtnl_register(PF_DECnet, RTM_GETADDR, NULL, dn_nl_dump_ifaddr); + + proc_net_fops_create(&init_net, "decnet_dev", S_IRUGO, &dn_dev_seq_fops); + +#ifdef CONFIG_SYSCTL + { + int i; + for(i = 0; i < DN_DEV_LIST_SIZE; i++) + dn_dev_sysctl_register(NULL, &dn_dev_list[i]); + } +#endif /* CONFIG_SYSCTL */ +} + +void __exit dn_dev_cleanup(void) +{ +#ifdef CONFIG_SYSCTL + { + int i; + for(i = 0; i < DN_DEV_LIST_SIZE; i++) + dn_dev_sysctl_unregister(&dn_dev_list[i]); + } +#endif /* CONFIG_SYSCTL */ + + proc_net_remove(&init_net, "decnet_dev"); + + dn_dev_devices_off(); +} diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c new file mode 100644 index 00000000..1c74ed36 --- /dev/null +++ b/net/decnet/dn_fib.c @@ -0,0 +1,770 @@ +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * DECnet Routing Forwarding Information Base (Glue/Info List) + * + * Author: Steve Whitehouse + * + * + * Changes: + * Alexey Kuznetsov : SMP locking changes + * Steve Whitehouse : Rewrote it... Well to be more correct, I + * copied most of it from the ipv4 fib code. + * Steve Whitehouse : Updated it in style and fixed a few bugs + * which were fixed in the ipv4 code since + * this code was copied from it. 
+ * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RT_MIN_TABLE 1 + +#define for_fib_info() { struct dn_fib_info *fi;\ + for(fi = dn_fib_info_list; fi; fi = fi->fib_next) +#define endfor_fib_info() } + +#define for_nexthops(fi) { int nhsel; const struct dn_fib_nh *nh;\ + for(nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) + +#define change_nexthops(fi) { int nhsel; struct dn_fib_nh *nh;\ + for(nhsel = 0, nh = (struct dn_fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++) + +#define endfor_nexthops(fi) } + +static DEFINE_SPINLOCK(dn_fib_multipath_lock); +static struct dn_fib_info *dn_fib_info_list; +static DEFINE_SPINLOCK(dn_fib_info_lock); + +static struct +{ + int error; + u8 scope; +} dn_fib_props[RTN_MAX+1] = { + [RTN_UNSPEC] = { .error = 0, .scope = RT_SCOPE_NOWHERE }, + [RTN_UNICAST] = { .error = 0, .scope = RT_SCOPE_UNIVERSE }, + [RTN_LOCAL] = { .error = 0, .scope = RT_SCOPE_HOST }, + [RTN_BROADCAST] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE }, + [RTN_ANYCAST] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE }, + [RTN_MULTICAST] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE }, + [RTN_BLACKHOLE] = { .error = -EINVAL, .scope = RT_SCOPE_UNIVERSE }, + [RTN_UNREACHABLE] = { .error = -EHOSTUNREACH, .scope = RT_SCOPE_UNIVERSE }, + [RTN_PROHIBIT] = { .error = -EACCES, .scope = RT_SCOPE_UNIVERSE }, + [RTN_THROW] = { .error = -EAGAIN, .scope = RT_SCOPE_UNIVERSE }, + [RTN_NAT] = { .error = 0, .scope = RT_SCOPE_NOWHERE }, + [RTN_XRESOLVE] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE }, +}; + +static int dn_fib_sync_down(__le16 local, struct net_device *dev, int force); +static int dn_fib_sync_up(struct net_device *dev); + +void dn_fib_free_info(struct dn_fib_info *fi) +{ + if (fi->fib_dead == 0) { + printk(KERN_DEBUG "DECnet: BUG! 
Attempt to free alive dn_fib_info\n"); + return; + } + + change_nexthops(fi) { + if (nh->nh_dev) + dev_put(nh->nh_dev); + nh->nh_dev = NULL; + } endfor_nexthops(fi); + kfree(fi); +} + +void dn_fib_release_info(struct dn_fib_info *fi) +{ + spin_lock(&dn_fib_info_lock); + if (fi && --fi->fib_treeref == 0) { + if (fi->fib_next) + fi->fib_next->fib_prev = fi->fib_prev; + if (fi->fib_prev) + fi->fib_prev->fib_next = fi->fib_next; + if (fi == dn_fib_info_list) + dn_fib_info_list = fi->fib_next; + fi->fib_dead = 1; + dn_fib_info_put(fi); + } + spin_unlock(&dn_fib_info_lock); +} + +static inline int dn_fib_nh_comp(const struct dn_fib_info *fi, const struct dn_fib_info *ofi) +{ + const struct dn_fib_nh *onh = ofi->fib_nh; + + for_nexthops(fi) { + if (nh->nh_oif != onh->nh_oif || + nh->nh_gw != onh->nh_gw || + nh->nh_scope != onh->nh_scope || + nh->nh_weight != onh->nh_weight || + ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD)) + return -1; + onh++; + } endfor_nexthops(fi); + return 0; +} + +static inline struct dn_fib_info *dn_fib_find_info(const struct dn_fib_info *nfi) +{ + for_fib_info() { + if (fi->fib_nhs != nfi->fib_nhs) + continue; + if (nfi->fib_protocol == fi->fib_protocol && + nfi->fib_prefsrc == fi->fib_prefsrc && + nfi->fib_priority == fi->fib_priority && + memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(fi->fib_metrics)) == 0 && + ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 && + (nfi->fib_nhs == 0 || dn_fib_nh_comp(fi, nfi) == 0)) + return fi; + } endfor_fib_info(); + return NULL; +} + +__le16 dn_fib_get_attr16(struct rtattr *attr, int attrlen, int type) +{ + while(RTA_OK(attr,attrlen)) { + if (attr->rta_type == type) + return *(__le16*)RTA_DATA(attr); + attr = RTA_NEXT(attr, attrlen); + } + + return 0; +} + +static int dn_fib_count_nhs(struct rtattr *rta) +{ + int nhs = 0; + struct rtnexthop *nhp = RTA_DATA(rta); + int nhlen = RTA_PAYLOAD(rta); + + while(nhlen >= (int)sizeof(struct rtnexthop)) { + if ((nhlen -= nhp->rtnh_len) < 0) + return 0; + nhs++; + nhp = RTNH_NEXT(nhp); + } + + return nhs; +} + +static int dn_fib_get_nhs(struct dn_fib_info *fi, const struct rtattr *rta, const struct rtmsg *r) +{ + struct rtnexthop *nhp = RTA_DATA(rta); + int nhlen = RTA_PAYLOAD(rta); + + change_nexthops(fi) { + int attrlen = nhlen - sizeof(struct rtnexthop); + if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0) + return -EINVAL; + + nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags; + nh->nh_oif = nhp->rtnh_ifindex; + nh->nh_weight = nhp->rtnh_hops + 1; + + if (attrlen) { + nh->nh_gw = dn_fib_get_attr16(RTNH_DATA(nhp), attrlen, RTA_GATEWAY); + } + nhp = RTNH_NEXT(nhp); + } endfor_nexthops(fi); + + return 0; +} + + +static int dn_fib_check_nh(const struct rtmsg *r, struct dn_fib_info *fi, struct dn_fib_nh *nh) +{ + int err; + + if (nh->nh_gw) { + struct flowidn fld; + struct dn_fib_res res; + + if (nh->nh_flags&RTNH_F_ONLINK) { + struct net_device *dev; + + if (r->rtm_scope >= RT_SCOPE_LINK) + return -EINVAL; + if (dnet_addr_type(nh->nh_gw) != RTN_UNICAST) + return -EINVAL; + if ((dev = __dev_get_by_index(&init_net, nh->nh_oif)) == NULL) + return -ENODEV; + if (!(dev->flags&IFF_UP)) + return -ENETDOWN; + nh->nh_dev = dev; + dev_hold(dev); + nh->nh_scope = RT_SCOPE_LINK; + return 0; + } + + memset(&fld, 0, sizeof(fld)); + fld.daddr = nh->nh_gw; + fld.flowidn_oif = nh->nh_oif; + fld.flowidn_scope = r->rtm_scope + 1; + + if (fld.flowidn_scope < RT_SCOPE_LINK) + fld.flowidn_scope = RT_SCOPE_LINK; + + if ((err = dn_fib_lookup(&fld, &res)) != 0) + return err; + + err = -EINVAL; + if (res.type != 
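+/*
+ * Nexthop validation: a gatewayed nexthop must itself resolve
+ * through the FIB at a scope one step tighter than the route being
+ * added (hence rtm_scope + 1, clamped to at least link scope), and
+ * the lookup has to yield a unicast or local route; its device and
+ * scope are then copied into the nexthop.  The non-gateway branch
+ * further down only requires an UP device with DECnet enabled
+ * (dev->dn_ptr set).
+ */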
RTN_UNICAST && res.type != RTN_LOCAL) + goto out; + nh->nh_scope = res.scope; + nh->nh_oif = DN_FIB_RES_OIF(res); + nh->nh_dev = DN_FIB_RES_DEV(res); + if (nh->nh_dev == NULL) + goto out; + dev_hold(nh->nh_dev); + err = -ENETDOWN; + if (!(nh->nh_dev->flags & IFF_UP)) + goto out; + err = 0; +out: + dn_fib_res_put(&res); + return err; + } else { + struct net_device *dev; + + if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK)) + return -EINVAL; + + dev = __dev_get_by_index(&init_net, nh->nh_oif); + if (dev == NULL || dev->dn_ptr == NULL) + return -ENODEV; + if (!(dev->flags&IFF_UP)) + return -ENETDOWN; + nh->nh_dev = dev; + dev_hold(nh->nh_dev); + nh->nh_scope = RT_SCOPE_HOST; + } + + return 0; +} + + +struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct dn_kern_rta *rta, const struct nlmsghdr *nlh, int *errp) +{ + int err; + struct dn_fib_info *fi = NULL; + struct dn_fib_info *ofi; + int nhs = 1; + + if (r->rtm_type > RTN_MAX) + goto err_inval; + + if (dn_fib_props[r->rtm_type].scope > r->rtm_scope) + goto err_inval; + + if (rta->rta_mp) { + nhs = dn_fib_count_nhs(rta->rta_mp); + if (nhs == 0) + goto err_inval; + } + + fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct dn_fib_nh), GFP_KERNEL); + err = -ENOBUFS; + if (fi == NULL) + goto failure; + + fi->fib_protocol = r->rtm_protocol; + fi->fib_nhs = nhs; + fi->fib_flags = r->rtm_flags; + if (rta->rta_priority) + fi->fib_priority = *rta->rta_priority; + if (rta->rta_mx) { + int attrlen = RTA_PAYLOAD(rta->rta_mx); + struct rtattr *attr = RTA_DATA(rta->rta_mx); + + while(RTA_OK(attr, attrlen)) { + unsigned flavour = attr->rta_type; + if (flavour) { + if (flavour > RTAX_MAX) + goto err_inval; + fi->fib_metrics[flavour-1] = *(unsigned*)RTA_DATA(attr); + } + attr = RTA_NEXT(attr, attrlen); + } + } + if (rta->rta_prefsrc) + memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 2); + + if (rta->rta_mp) { + if ((err = dn_fib_get_nhs(fi, rta->rta_mp, r)) != 0) + goto failure; + if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif) + goto err_inval; + if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 2)) + goto err_inval; + } else { + struct dn_fib_nh *nh = fi->fib_nh; + if (rta->rta_oif) + nh->nh_oif = *rta->rta_oif; + if (rta->rta_gw) + memcpy(&nh->nh_gw, rta->rta_gw, 2); + nh->nh_flags = r->rtm_flags; + nh->nh_weight = 1; + } + + if (r->rtm_type == RTN_NAT) { + if (rta->rta_gw == NULL || nhs != 1 || rta->rta_oif) + goto err_inval; + memcpy(&fi->fib_nh->nh_gw, rta->rta_gw, 2); + goto link_it; + } + + if (dn_fib_props[r->rtm_type].error) { + if (rta->rta_gw || rta->rta_oif || rta->rta_mp) + goto err_inval; + goto link_it; + } + + if (r->rtm_scope > RT_SCOPE_HOST) + goto err_inval; + + if (r->rtm_scope == RT_SCOPE_HOST) { + struct dn_fib_nh *nh = fi->fib_nh; + + /* Local address is added */ + if (nhs != 1 || nh->nh_gw) + goto err_inval; + nh->nh_scope = RT_SCOPE_NOWHERE; + nh->nh_dev = dev_get_by_index(&init_net, fi->fib_nh->nh_oif); + err = -ENODEV; + if (nh->nh_dev == NULL) + goto failure; + } else { + change_nexthops(fi) { + if ((err = dn_fib_check_nh(r, fi, nh)) != 0) + goto failure; + } endfor_nexthops(fi) + } + + if (fi->fib_prefsrc) { + if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL || + memcmp(&fi->fib_prefsrc, rta->rta_dst, 2)) + if (dnet_addr_type(fi->fib_prefsrc) != RTN_LOCAL) + goto err_inval; + } + +link_it: + if ((ofi = dn_fib_find_info(fi)) != NULL) { + fi->fib_dead = 1; + dn_fib_free_info(fi); + ofi->fib_treeref++; + return ofi; + } + + fi->fib_treeref++; + atomic_inc(&fi->fib_clntref); + spin_lock(&dn_fib_info_lock); + 
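+/*
+ * Link the new dn_fib_info at the head of the global
+ * dn_fib_info_list under dn_fib_info_lock.  The list is doubly
+ * linked so that dn_fib_release_info() can unlink an entry in O(1)
+ * once its tree reference count drops to zero.
+ */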
fi->fib_next = dn_fib_info_list; + fi->fib_prev = NULL; + if (dn_fib_info_list) + dn_fib_info_list->fib_prev = fi; + dn_fib_info_list = fi; + spin_unlock(&dn_fib_info_lock); + return fi; + +err_inval: + err = -EINVAL; + +failure: + *errp = err; + if (fi) { + fi->fib_dead = 1; + dn_fib_free_info(fi); + } + + return NULL; +} + +int dn_fib_semantic_match(int type, struct dn_fib_info *fi, const struct flowidn *fld, struct dn_fib_res *res) +{ + int err = dn_fib_props[type].error; + + if (err == 0) { + if (fi->fib_flags & RTNH_F_DEAD) + return 1; + + res->fi = fi; + + switch(type) { + case RTN_NAT: + DN_FIB_RES_RESET(*res); + atomic_inc(&fi->fib_clntref); + return 0; + case RTN_UNICAST: + case RTN_LOCAL: + for_nexthops(fi) { + if (nh->nh_flags & RTNH_F_DEAD) + continue; + if (!fld->flowidn_oif || + fld->flowidn_oif == nh->nh_oif) + break; + } + if (nhsel < fi->fib_nhs) { + res->nh_sel = nhsel; + atomic_inc(&fi->fib_clntref); + return 0; + } + endfor_nexthops(fi); + res->fi = NULL; + return 1; + default: + if (net_ratelimit()) + printk("DECnet: impossible routing event : dn_fib_semantic_match type=%d\n", type); + res->fi = NULL; + return -EINVAL; + } + } + return err; +} + +void dn_fib_select_multipath(const struct flowidn *fld, struct dn_fib_res *res) +{ + struct dn_fib_info *fi = res->fi; + int w; + + spin_lock_bh(&dn_fib_multipath_lock); + if (fi->fib_power <= 0) { + int power = 0; + change_nexthops(fi) { + if (!(nh->nh_flags&RTNH_F_DEAD)) { + power += nh->nh_weight; + nh->nh_power = nh->nh_weight; + } + } endfor_nexthops(fi); + fi->fib_power = power; + if (power < 0) { + spin_unlock_bh(&dn_fib_multipath_lock); + res->nh_sel = 0; + return; + } + } + + w = jiffies % fi->fib_power; + + change_nexthops(fi) { + if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) { + if ((w -= nh->nh_power) <= 0) { + nh->nh_power--; + fi->fib_power--; + res->nh_sel = nhsel; + spin_unlock_bh(&dn_fib_multipath_lock); + return; + } + } + } endfor_nexthops(fi); + res->nh_sel = 0; + spin_unlock_bh(&dn_fib_multipath_lock); +} + + +static int dn_fib_check_attr(struct rtmsg *r, struct rtattr **rta) +{ + int i; + + for(i = 1; i <= RTA_MAX; i++) { + struct rtattr *attr = rta[i-1]; + if (attr) { + if (RTA_PAYLOAD(attr) < 4 && RTA_PAYLOAD(attr) != 2) + return -EINVAL; + if (i != RTA_MULTIPATH && i != RTA_METRICS && + i != RTA_TABLE) + rta[i-1] = (struct rtattr *)RTA_DATA(attr); + } + } + + return 0; +} + +static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct dn_fib_table *tb; + struct rtattr **rta = arg; + struct rtmsg *r = NLMSG_DATA(nlh); + + if (!net_eq(net, &init_net)) + return -EINVAL; + + if (dn_fib_check_attr(r, rta)) + return -EINVAL; + + tb = dn_fib_get_table(rtm_get_table(rta, r->rtm_table), 0); + if (tb) + return tb->delete(tb, r, (struct dn_kern_rta *)rta, nlh, &NETLINK_CB(skb)); + + return -ESRCH; +} + +static int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct dn_fib_table *tb; + struct rtattr **rta = arg; + struct rtmsg *r = NLMSG_DATA(nlh); + + if (!net_eq(net, &init_net)) + return -EINVAL; + + if (dn_fib_check_attr(r, rta)) + return -EINVAL; + + tb = dn_fib_get_table(rtm_get_table(rta, r->rtm_table), 1); + if (tb) + return tb->insert(tb, r, (struct dn_kern_rta *)rta, nlh, &NETLINK_CB(skb)); + + return -ENOBUFS; +} + +static void fib_magic(int cmd, int type, __le16 dst, int dst_len, struct dn_ifaddr *ifa) +{ + struct dn_fib_table *tb; + struct { + struct 
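+/*
+ * fib_magic() builds a kernel-internal netlink-style request on the
+ * stack (an nlmsghdr followed by an rtmsg) and feeds it directly to
+ * the table's insert/delete methods, so routes for local addresses
+ * go through the same code path as RTM_NEWROUTE/RTM_DELROUTE
+ * requests from userspace.
+ */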
nlmsghdr nlh; + struct rtmsg rtm; + } req; + struct dn_kern_rta rta; + + memset(&req.rtm, 0, sizeof(req.rtm)); + memset(&rta, 0, sizeof(rta)); + + if (type == RTN_UNICAST) + tb = dn_fib_get_table(RT_MIN_TABLE, 1); + else + tb = dn_fib_get_table(RT_TABLE_LOCAL, 1); + + if (tb == NULL) + return; + + req.nlh.nlmsg_len = sizeof(req); + req.nlh.nlmsg_type = cmd; + req.nlh.nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE|NLM_F_APPEND; + req.nlh.nlmsg_pid = 0; + req.nlh.nlmsg_seq = 0; + + req.rtm.rtm_dst_len = dst_len; + req.rtm.rtm_table = tb->n; + req.rtm.rtm_protocol = RTPROT_KERNEL; + req.rtm.rtm_scope = (type != RTN_LOCAL ? RT_SCOPE_LINK : RT_SCOPE_HOST); + req.rtm.rtm_type = type; + + rta.rta_dst = &dst; + rta.rta_prefsrc = &ifa->ifa_local; + rta.rta_oif = &ifa->ifa_dev->dev->ifindex; + + if (cmd == RTM_NEWROUTE) + tb->insert(tb, &req.rtm, &rta, &req.nlh, NULL); + else + tb->delete(tb, &req.rtm, &rta, &req.nlh, NULL); +} + +static void dn_fib_add_ifaddr(struct dn_ifaddr *ifa) +{ + + fib_magic(RTM_NEWROUTE, RTN_LOCAL, ifa->ifa_local, 16, ifa); + +#if 0 + if (!(dev->flags&IFF_UP)) + return; + /* In the future, we will want to add default routes here */ + +#endif +} + +static void dn_fib_del_ifaddr(struct dn_ifaddr *ifa) +{ + int found_it = 0; + struct net_device *dev; + struct dn_dev *dn_db; + struct dn_ifaddr *ifa2; + + ASSERT_RTNL(); + + /* Scan device list */ + rcu_read_lock(); + for_each_netdev_rcu(&init_net, dev) { + dn_db = rcu_dereference(dev->dn_ptr); + if (dn_db == NULL) + continue; + for (ifa2 = rcu_dereference(dn_db->ifa_list); + ifa2 != NULL; + ifa2 = rcu_dereference(ifa2->ifa_next)) { + if (ifa2->ifa_local == ifa->ifa_local) { + found_it = 1; + break; + } + } + } + rcu_read_unlock(); + + if (found_it == 0) { + fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 16, ifa); + + if (dnet_addr_type(ifa->ifa_local) != RTN_LOCAL) { + if (dn_fib_sync_down(ifa->ifa_local, NULL, 0)) + dn_fib_flush(); + } + } +} + +static void dn_fib_disable_addr(struct net_device *dev, int force) +{ + if (dn_fib_sync_down(0, dev, force)) + dn_fib_flush(); + dn_rt_cache_flush(0); + neigh_ifdown(&dn_neigh_table, dev); +} + +static int dn_fib_dnaddr_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct dn_ifaddr *ifa = (struct dn_ifaddr *)ptr; + + switch(event) { + case NETDEV_UP: + dn_fib_add_ifaddr(ifa); + dn_fib_sync_up(ifa->ifa_dev->dev); + dn_rt_cache_flush(-1); + break; + case NETDEV_DOWN: + dn_fib_del_ifaddr(ifa); + if (ifa->ifa_dev && ifa->ifa_dev->ifa_list == NULL) { + dn_fib_disable_addr(ifa->ifa_dev->dev, 1); + } else { + dn_rt_cache_flush(-1); + } + break; + } + return NOTIFY_DONE; +} + +static int dn_fib_sync_down(__le16 local, struct net_device *dev, int force) +{ + int ret = 0; + int scope = RT_SCOPE_NOWHERE; + + if (force) + scope = -1; + + for_fib_info() { + /* + * This makes no sense for DECnet.... we will almost + * certainly have more than one local address the same + * over all our interfaces. It needs thinking about + * some more. 
+ */ + if (local && fi->fib_prefsrc == local) { + fi->fib_flags |= RTNH_F_DEAD; + ret++; + } else if (dev && fi->fib_nhs) { + int dead = 0; + + change_nexthops(fi) { + if (nh->nh_flags&RTNH_F_DEAD) + dead++; + else if (nh->nh_dev == dev && + nh->nh_scope != scope) { + spin_lock_bh(&dn_fib_multipath_lock); + nh->nh_flags |= RTNH_F_DEAD; + fi->fib_power -= nh->nh_power; + nh->nh_power = 0; + spin_unlock_bh(&dn_fib_multipath_lock); + dead++; + } + } endfor_nexthops(fi) + if (dead == fi->fib_nhs) { + fi->fib_flags |= RTNH_F_DEAD; + ret++; + } + } + } endfor_fib_info(); + return ret; +} + + +static int dn_fib_sync_up(struct net_device *dev) +{ + int ret = 0; + + if (!(dev->flags&IFF_UP)) + return 0; + + for_fib_info() { + int alive = 0; + + change_nexthops(fi) { + if (!(nh->nh_flags&RTNH_F_DEAD)) { + alive++; + continue; + } + if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP)) + continue; + if (nh->nh_dev != dev || dev->dn_ptr == NULL) + continue; + alive++; + spin_lock_bh(&dn_fib_multipath_lock); + nh->nh_power = 0; + nh->nh_flags &= ~RTNH_F_DEAD; + spin_unlock_bh(&dn_fib_multipath_lock); + } endfor_nexthops(fi); + + if (alive > 0) { + fi->fib_flags &= ~RTNH_F_DEAD; + ret++; + } + } endfor_fib_info(); + return ret; +} + +static struct notifier_block dn_fib_dnaddr_notifier = { + .notifier_call = dn_fib_dnaddr_event, +}; + +void __exit dn_fib_cleanup(void) +{ + dn_fib_table_cleanup(); + dn_fib_rules_cleanup(); + + unregister_dnaddr_notifier(&dn_fib_dnaddr_notifier); +} + + +void __init dn_fib_init(void) +{ + dn_fib_table_init(); + dn_fib_rules_init(); + + register_dnaddr_notifier(&dn_fib_dnaddr_notifier); + + rtnl_register(PF_DECnet, RTM_NEWROUTE, dn_fib_rtm_newroute, NULL); + rtnl_register(PF_DECnet, RTM_DELROUTE, dn_fib_rtm_delroute, NULL); +} + + diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c new file mode 100644 index 00000000..9810610d --- /dev/null +++ b/net/decnet/dn_neigh.c @@ -0,0 +1,612 @@ +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * DECnet Neighbour Functions (Adjacency Database and + * On-Ethernet Cache) + * + * Author: Steve Whitehouse + * + * + * Changes: + * Steve Whitehouse : Fixed router listing routine + * Steve Whitehouse : Added error_report functions + * Steve Whitehouse : Added default router detection + * Steve Whitehouse : Hop counts in outgoing messages + * Steve Whitehouse : Fixed src/dst in outgoing messages so + * forwarding now stands a good chance of + * working. + * Steve Whitehouse : Fixed neighbour states (for now anyway). + * Steve Whitehouse : Made error_report functions dummies. This + * is not the right place to return skbs. 
+ * Steve Whitehouse : Convert to seq_file + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int dn_neigh_construct(struct neighbour *); +static void dn_long_error_report(struct neighbour *, struct sk_buff *); +static void dn_short_error_report(struct neighbour *, struct sk_buff *); +static int dn_long_output(struct sk_buff *); +static int dn_short_output(struct sk_buff *); +static int dn_phase3_output(struct sk_buff *); + + +/* + * For talking to broadcast devices: Ethernet & PPP + */ +static const struct neigh_ops dn_long_ops = { + .family = AF_DECnet, + .error_report = dn_long_error_report, + .output = dn_long_output, + .connected_output = dn_long_output, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + +/* + * For talking to pointopoint and multidrop devices: DDCMP and X.25 + */ +static const struct neigh_ops dn_short_ops = { + .family = AF_DECnet, + .error_report = dn_short_error_report, + .output = dn_short_output, + .connected_output = dn_short_output, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + +/* + * For talking to DECnet phase III nodes + */ +static const struct neigh_ops dn_phase3_ops = { + .family = AF_DECnet, + .error_report = dn_short_error_report, /* Can use short version here */ + .output = dn_phase3_output, + .connected_output = dn_phase3_output, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit +}; + +static u32 dn_neigh_hash(const void *pkey, + const struct net_device *dev, + __u32 hash_rnd) +{ + return jhash_2words(*(__u16 *)pkey, 0, hash_rnd); +} + +struct neigh_table dn_neigh_table = { + .family = PF_DECnet, + .entry_size = sizeof(struct dn_neigh), + .key_len = sizeof(__le16), + .hash = dn_neigh_hash, + .constructor = dn_neigh_construct, + .id = "dn_neigh_cache", + .parms ={ + .tbl = &dn_neigh_table, + .base_reachable_time = 30 * HZ, + .retrans_time = 1 * HZ, + .gc_staletime = 60 * HZ, + .reachable_time = 30 * HZ, + .delay_probe_time = 5 * HZ, + .queue_len = 3, + .ucast_probes = 0, + .app_probes = 0, + .mcast_probes = 0, + .anycast_delay = 0, + .proxy_delay = 0, + .proxy_qlen = 0, + .locktime = 1 * HZ, + }, + .gc_interval = 30 * HZ, + .gc_thresh1 = 128, + .gc_thresh2 = 512, + .gc_thresh3 = 1024, +}; + +static int dn_neigh_construct(struct neighbour *neigh) +{ + struct net_device *dev = neigh->dev; + struct dn_neigh *dn = (struct dn_neigh *)neigh; + struct dn_dev *dn_db; + struct neigh_parms *parms; + + rcu_read_lock(); + dn_db = rcu_dereference(dev->dn_ptr); + if (dn_db == NULL) { + rcu_read_unlock(); + return -EINVAL; + } + + parms = dn_db->neigh_parms; + if (!parms) { + rcu_read_unlock(); + return -EINVAL; + } + + __neigh_parms_put(neigh->parms); + neigh->parms = neigh_parms_clone(parms); + + if (dn_db->use_long) + neigh->ops = &dn_long_ops; + else + neigh->ops = &dn_short_ops; + rcu_read_unlock(); + + if (dn->flags & DN_NDFLAG_P3) + neigh->ops = &dn_phase3_ops; + + neigh->nud_state = NUD_NOARP; + neigh->output = neigh->ops->connected_output; + + if ((dev->type == ARPHRD_IPGRE) || (dev->flags & IFF_POINTOPOINT)) + memcpy(neigh->ha, dev->broadcast, dev->addr_len); + else if ((dev->type == ARPHRD_ETHER) || (dev->type == ARPHRD_LOOPBACK)) + dn_dn2eth(neigh->ha, dn->addr); + else { + if (net_ratelimit()) + printk(KERN_DEBUG "Trying to create neigh for hw %d\n", dev->type); + return -EINVAL; + } + + /* + * Make an 
estimate of the remote block size by assuming that its + * two less then the device mtu, which it true for ethernet (and + * other things which support long format headers) since there is + * an extra length field (of 16 bits) which isn't part of the + * ethernet headers and which the DECnet specs won't admit is part + * of the DECnet routing headers either. + * + * If we over estimate here its no big deal, the NSP negotiations + * will prevent us from sending packets which are too large for the + * remote node to handle. In any case this figure is normally updated + * by a hello message in most cases. + */ + dn->blksize = dev->mtu - 2; + + return 0; +} + +static void dn_long_error_report(struct neighbour *neigh, struct sk_buff *skb) +{ + printk(KERN_DEBUG "dn_long_error_report: called\n"); + kfree_skb(skb); +} + + +static void dn_short_error_report(struct neighbour *neigh, struct sk_buff *skb) +{ + printk(KERN_DEBUG "dn_short_error_report: called\n"); + kfree_skb(skb); +} + +static int dn_neigh_output_packet(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct dn_route *rt = (struct dn_route *)dst; + struct neighbour *neigh = dst_get_neighbour(dst); + struct net_device *dev = neigh->dev; + char mac_addr[ETH_ALEN]; + + dn_dn2eth(mac_addr, rt->rt_local_src); + if (dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, + mac_addr, skb->len) >= 0) + return neigh->ops->queue_xmit(skb); + + if (net_ratelimit()) + printk(KERN_DEBUG "dn_neigh_output_packet: oops, can't send packet\n"); + + kfree_skb(skb); + return -EINVAL; +} + +static int dn_long_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct neighbour *neigh = dst_get_neighbour(dst); + struct net_device *dev = neigh->dev; + int headroom = dev->hard_header_len + sizeof(struct dn_long_packet) + 3; + unsigned char *data; + struct dn_long_packet *lp; + struct dn_skb_cb *cb = DN_SKB_CB(skb); + + + if (skb_headroom(skb) < headroom) { + struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom); + if (skb2 == NULL) { + if (net_ratelimit()) + printk(KERN_CRIT "dn_long_output: no memory\n"); + kfree_skb(skb); + return -ENOBUFS; + } + kfree_skb(skb); + skb = skb2; + if (net_ratelimit()) + printk(KERN_INFO "dn_long_output: Increasing headroom\n"); + } + + data = skb_push(skb, sizeof(struct dn_long_packet) + 3); + lp = (struct dn_long_packet *)(data+3); + + *((__le16 *)data) = cpu_to_le16(skb->len - 2); + *(data + 2) = 1 | DN_RT_F_PF; /* Padding */ + + lp->msgflg = DN_RT_PKT_LONG|(cb->rt_flags&(DN_RT_F_IE|DN_RT_F_RQR|DN_RT_F_RTS)); + lp->d_area = lp->d_subarea = 0; + dn_dn2eth(lp->d_id, cb->dst); + lp->s_area = lp->s_subarea = 0; + dn_dn2eth(lp->s_id, cb->src); + lp->nl2 = 0; + lp->visit_ct = cb->hops & 0x3f; + lp->s_class = 0; + lp->pt = 0; + + skb_reset_network_header(skb); + + return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, skb, NULL, + neigh->dev, dn_neigh_output_packet); +} + +static int dn_short_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct neighbour *neigh = dst_get_neighbour(dst); + struct net_device *dev = neigh->dev; + int headroom = dev->hard_header_len + sizeof(struct dn_short_packet) + 2; + struct dn_short_packet *sp; + unsigned char *data; + struct dn_skb_cb *cb = DN_SKB_CB(skb); + + + if (skb_headroom(skb) < headroom) { + struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom); + if (skb2 == NULL) { + if (net_ratelimit()) + printk(KERN_CRIT "dn_short_output: no memory\n"); + kfree_skb(skb); + return -ENOBUFS; + } + kfree_skb(skb); + skb = skb2; + 
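+/*
+ * Headroom handling: the output path needs room in front of the
+ * payload for dev->hard_header_len plus a 2 byte length word plus
+ * the dn_short_packet header.  When the skb arrives without that
+ * much headroom it is copied into a larger buffer and the original
+ * is freed before the headers are pushed below.
+ */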
if (net_ratelimit()) + printk(KERN_INFO "dn_short_output: Increasing headroom\n"); + } + + data = skb_push(skb, sizeof(struct dn_short_packet) + 2); + *((__le16 *)data) = cpu_to_le16(skb->len - 2); + sp = (struct dn_short_packet *)(data+2); + + sp->msgflg = DN_RT_PKT_SHORT|(cb->rt_flags&(DN_RT_F_RQR|DN_RT_F_RTS)); + sp->dstnode = cb->dst; + sp->srcnode = cb->src; + sp->forward = cb->hops & 0x3f; + + skb_reset_network_header(skb); + + return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, skb, NULL, + neigh->dev, dn_neigh_output_packet); +} + +/* + * Phase 3 output is the same is short output, execpt that + * it clears the area bits before transmission. + */ +static int dn_phase3_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct neighbour *neigh = dst_get_neighbour(dst); + struct net_device *dev = neigh->dev; + int headroom = dev->hard_header_len + sizeof(struct dn_short_packet) + 2; + struct dn_short_packet *sp; + unsigned char *data; + struct dn_skb_cb *cb = DN_SKB_CB(skb); + + if (skb_headroom(skb) < headroom) { + struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom); + if (skb2 == NULL) { + if (net_ratelimit()) + printk(KERN_CRIT "dn_phase3_output: no memory\n"); + kfree_skb(skb); + return -ENOBUFS; + } + kfree_skb(skb); + skb = skb2; + if (net_ratelimit()) + printk(KERN_INFO "dn_phase3_output: Increasing headroom\n"); + } + + data = skb_push(skb, sizeof(struct dn_short_packet) + 2); + *((__le16 *)data) = cpu_to_le16(skb->len - 2); + sp = (struct dn_short_packet *)(data + 2); + + sp->msgflg = DN_RT_PKT_SHORT|(cb->rt_flags&(DN_RT_F_RQR|DN_RT_F_RTS)); + sp->dstnode = cb->dst & cpu_to_le16(0x03ff); + sp->srcnode = cb->src & cpu_to_le16(0x03ff); + sp->forward = cb->hops & 0x3f; + + skb_reset_network_header(skb); + + return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, skb, NULL, + neigh->dev, dn_neigh_output_packet); +} + +/* + * Unfortunately, the neighbour code uses the device in its hash + * function, so we don't get any advantage from it. This function + * basically does a neigh_lookup(), but without comparing the device + * field. 
This is required for the On-Ethernet cache + */ + +/* + * Pointopoint link receives a hello message + */ +void dn_neigh_pointopoint_hello(struct sk_buff *skb) +{ + kfree_skb(skb); +} + +/* + * Ethernet router hello message received + */ +int dn_neigh_router_hello(struct sk_buff *skb) +{ + struct rtnode_hello_message *msg = (struct rtnode_hello_message *)skb->data; + + struct neighbour *neigh; + struct dn_neigh *dn; + struct dn_dev *dn_db; + __le16 src; + + src = dn_eth2dn(msg->id); + + neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1); + + dn = (struct dn_neigh *)neigh; + + if (neigh) { + write_lock(&neigh->lock); + + neigh->used = jiffies; + dn_db = rcu_dereference(neigh->dev->dn_ptr); + + if (!(neigh->nud_state & NUD_PERMANENT)) { + neigh->updated = jiffies; + + if (neigh->dev->type == ARPHRD_ETHER) + memcpy(neigh->ha, ð_hdr(skb)->h_source, ETH_ALEN); + + dn->blksize = le16_to_cpu(msg->blksize); + dn->priority = msg->priority; + + dn->flags &= ~DN_NDFLAG_P3; + + switch(msg->iinfo & DN_RT_INFO_TYPE) { + case DN_RT_INFO_L1RT: + dn->flags &=~DN_NDFLAG_R2; + dn->flags |= DN_NDFLAG_R1; + break; + case DN_RT_INFO_L2RT: + dn->flags |= DN_NDFLAG_R2; + } + } + + /* Only use routers in our area */ + if ((le16_to_cpu(src)>>10) == (le16_to_cpu((decnet_address))>>10)) { + if (!dn_db->router) { + dn_db->router = neigh_clone(neigh); + } else { + if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority) + neigh_release(xchg(&dn_db->router, neigh_clone(neigh))); + } + } + write_unlock(&neigh->lock); + neigh_release(neigh); + } + + kfree_skb(skb); + return 0; +} + +/* + * Endnode hello message received + */ +int dn_neigh_endnode_hello(struct sk_buff *skb) +{ + struct endnode_hello_message *msg = (struct endnode_hello_message *)skb->data; + struct neighbour *neigh; + struct dn_neigh *dn; + __le16 src; + + src = dn_eth2dn(msg->id); + + neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1); + + dn = (struct dn_neigh *)neigh; + + if (neigh) { + write_lock(&neigh->lock); + + neigh->used = jiffies; + + if (!(neigh->nud_state & NUD_PERMANENT)) { + neigh->updated = jiffies; + + if (neigh->dev->type == ARPHRD_ETHER) + memcpy(neigh->ha, ð_hdr(skb)->h_source, ETH_ALEN); + dn->flags &= ~(DN_NDFLAG_R1 | DN_NDFLAG_R2); + dn->blksize = le16_to_cpu(msg->blksize); + dn->priority = 0; + } + + write_unlock(&neigh->lock); + neigh_release(neigh); + } + + kfree_skb(skb); + return 0; +} + +static char *dn_find_slot(char *base, int max, int priority) +{ + int i; + unsigned char *min = NULL; + + base += 6; /* skip first id */ + + for(i = 0; i < max; i++) { + if (!min || (*base < *min)) + min = base; + base += 7; /* find next priority */ + } + + if (!min) + return NULL; + + return (*min < priority) ? (min - 6) : NULL; +} + +struct elist_cb_state { + struct net_device *dev; + unsigned char *ptr; + unsigned char *rs; + int t, n; +}; + +static void neigh_elist_cb(struct neighbour *neigh, void *_info) +{ + struct elist_cb_state *s = _info; + struct dn_neigh *dn; + + if (neigh->dev != s->dev) + return; + + dn = (struct dn_neigh *) neigh; + if (!(dn->flags & (DN_NDFLAG_R1|DN_NDFLAG_R2))) + return; + + if (s->t == s->n) + s->rs = dn_find_slot(s->ptr, s->n, dn->priority); + else + s->t++; + if (s->rs == NULL) + return; + + dn_dn2eth(s->rs, dn->addr); + s->rs += 6; + *(s->rs) = neigh->nud_state & NUD_CONNECTED ? 
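+/*
+ * Each router-list entry written here is 7 bytes: a 6 byte node id
+ * followed by one byte whose top bit (0x80) flags a NUD_CONNECTED
+ * neighbour and whose remaining bits carry the router priority.
+ * dn_find_slot() above walks those 7 byte entries to pick the
+ * lowest-priority one to replace once n entries have been filled.
+ */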
0x80 : 0x0; + *(s->rs) |= dn->priority; + s->rs++; +} + +int dn_neigh_elist(struct net_device *dev, unsigned char *ptr, int n) +{ + struct elist_cb_state state; + + state.dev = dev; + state.t = 0; + state.n = n; + state.ptr = ptr; + state.rs = ptr; + + neigh_for_each(&dn_neigh_table, neigh_elist_cb, &state); + + return state.t; +} + + +#ifdef CONFIG_PROC_FS + +static inline void dn_neigh_format_entry(struct seq_file *seq, + struct neighbour *n) +{ + struct dn_neigh *dn = (struct dn_neigh *) n; + char buf[DN_ASCBUF_LEN]; + + read_lock(&n->lock); + seq_printf(seq, "%-7s %s%s%s %02x %02d %07ld %-8s\n", + dn_addr2asc(le16_to_cpu(dn->addr), buf), + (dn->flags&DN_NDFLAG_R1) ? "1" : "-", + (dn->flags&DN_NDFLAG_R2) ? "2" : "-", + (dn->flags&DN_NDFLAG_P3) ? "3" : "-", + dn->n.nud_state, + atomic_read(&dn->n.refcnt), + dn->blksize, + (dn->n.dev) ? dn->n.dev->name : "?"); + read_unlock(&n->lock); +} + +static int dn_neigh_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "Addr Flags State Use Blksize Dev\n"); + } else { + dn_neigh_format_entry(seq, v); + } + + return 0; +} + +static void *dn_neigh_seq_start(struct seq_file *seq, loff_t *pos) +{ + return neigh_seq_start(seq, pos, &dn_neigh_table, + NEIGH_SEQ_NEIGH_ONLY); +} + +static const struct seq_operations dn_neigh_seq_ops = { + .start = dn_neigh_seq_start, + .next = neigh_seq_next, + .stop = neigh_seq_stop, + .show = dn_neigh_seq_show, +}; + +static int dn_neigh_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &dn_neigh_seq_ops, + sizeof(struct neigh_seq_state)); +} + +static const struct file_operations dn_neigh_seq_fops = { + .owner = THIS_MODULE, + .open = dn_neigh_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +#endif + +void __init dn_neigh_init(void) +{ + neigh_table_init(&dn_neigh_table); + proc_net_fops_create(&init_net, "decnet_neigh", S_IRUGO, &dn_neigh_seq_fops); +} + +void __exit dn_neigh_cleanup(void) +{ + proc_net_remove(&init_net, "decnet_neigh"); + neigh_table_clear(&dn_neigh_table); +} diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c new file mode 100644 index 00000000..b430549e --- /dev/null +++ b/net/decnet/dn_nsp_in.c @@ -0,0 +1,912 @@ +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * DECnet Network Services Protocol (Input) + * + * Author: Eduardo Marcelo Serrat + * + * Changes: + * + * Steve Whitehouse: Split into dn_nsp_in.c and dn_nsp_out.c from + * original dn_nsp.c. + * Steve Whitehouse: Updated to work with my new routing architecture. + * Steve Whitehouse: Add changes from Eduardo Serrat's patches. + * Steve Whitehouse: Put all ack handling code in a common routine. + * Steve Whitehouse: Put other common bits into dn_nsp_rx() + * Steve Whitehouse: More checks on skb->len to catch bogus packets + * Fixed various race conditions and possible nasties. + * Steve Whitehouse: Now handles returned conninit frames. + * David S. Miller: New socket locking + * Steve Whitehouse: Fixed lockup when socket filtering was enabled. + * Paul Koning: Fix to push CC sockets into RUN when acks are + * received. + * Steve Whitehouse: + * Patrick Caulfield: Checking conninits for correctness & sending of error + * responses. + * Steve Whitehouse: Added backlog congestion level return codes. 
+ * Patrick Caulfield: + * Steve Whitehouse: Added flow control support (outbound) + * Steve Whitehouse: Prepare for nonlinear skbs + */ + +/****************************************************************************** + (c) 1995-1998 E.M. Serrat emserrat@geocities.com + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern int decnet_log_martians; + +static void dn_log_martian(struct sk_buff *skb, const char *msg) +{ + if (decnet_log_martians && net_ratelimit()) { + char *devname = skb->dev ? skb->dev->name : "???"; + struct dn_skb_cb *cb = DN_SKB_CB(skb); + printk(KERN_INFO "DECnet: Martian packet (%s) dev=%s src=0x%04hx dst=0x%04hx srcport=0x%04hx dstport=0x%04hx\n", + msg, devname, le16_to_cpu(cb->src), le16_to_cpu(cb->dst), + le16_to_cpu(cb->src_port), le16_to_cpu(cb->dst_port)); + } +} + +/* + * For this function we've flipped the cross-subchannel bit + * if the message is an otherdata or linkservice message. Thus + * we can use it to work out what to update. + */ +static void dn_ack(struct sock *sk, struct sk_buff *skb, unsigned short ack) +{ + struct dn_scp *scp = DN_SK(sk); + unsigned short type = ((ack >> 12) & 0x0003); + int wakeup = 0; + + switch(type) { + case 0: /* ACK - Data */ + if (dn_after(ack, scp->ackrcv_dat)) { + scp->ackrcv_dat = ack & 0x0fff; + wakeup |= dn_nsp_check_xmit_queue(sk, skb, &scp->data_xmit_queue, ack); + } + break; + case 1: /* NAK - Data */ + break; + case 2: /* ACK - OtherData */ + if (dn_after(ack, scp->ackrcv_oth)) { + scp->ackrcv_oth = ack & 0x0fff; + wakeup |= dn_nsp_check_xmit_queue(sk, skb, &scp->other_xmit_queue, ack); + } + break; + case 3: /* NAK - OtherData */ + break; + } + + if (wakeup && !sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); +} + +/* + * This function is a universal ack processor. + */ +static int dn_process_ack(struct sock *sk, struct sk_buff *skb, int oth) +{ + __le16 *ptr = (__le16 *)skb->data; + int len = 0; + unsigned short ack; + + if (skb->len < 2) + return len; + + if ((ack = le16_to_cpu(*ptr)) & 0x8000) { + skb_pull(skb, 2); + ptr++; + len += 2; + if ((ack & 0x4000) == 0) { + if (oth) + ack ^= 0x2000; + dn_ack(sk, skb, ack); + } + } + + if (skb->len < 2) + return len; + + if ((ack = le16_to_cpu(*ptr)) & 0x8000) { + skb_pull(skb, 2); + len += 2; + if ((ack & 0x4000) == 0) { + if (oth) + ack ^= 0x2000; + dn_ack(sk, skb, ack); + } + } + + return len; +} + + +/** + * dn_check_idf - Check an image data field format is correct. 
+ * @pptr: Pointer to pointer to image data + * @len: Pointer to length of image data + * @max: The maximum allowed length of the data in the image data field + * @follow_on: Check that this many bytes exist beyond the end of the image data + * + * Returns: 0 if ok, -1 on error + */ +static inline int dn_check_idf(unsigned char **pptr, int *len, unsigned char max, unsigned char follow_on) +{ + unsigned char *ptr = *pptr; + unsigned char flen = *ptr++; + + (*len)--; + if (flen > max) + return -1; + if ((flen + follow_on) > *len) + return -1; + + *len -= flen; + *pptr = ptr + flen; + return 0; +} + +/* + * Table of reason codes to pass back to node which sent us a badly + * formed message, plus text messages for the log. A zero entry in + * the reason field means "don't reply" otherwise a disc init is sent with + * the specified reason code. + */ +static struct { + unsigned short reason; + const char *text; +} ci_err_table[] = { + { 0, "CI: Truncated message" }, + { NSP_REASON_ID, "CI: Destination username error" }, + { NSP_REASON_ID, "CI: Destination username type" }, + { NSP_REASON_US, "CI: Source username error" }, + { 0, "CI: Truncated at menuver" }, + { 0, "CI: Truncated before access or user data" }, + { NSP_REASON_IO, "CI: Access data format error" }, + { NSP_REASON_IO, "CI: User data format error" } +}; + +/* + * This function uses a slightly different lookup method + * to find its sockets, since it searches on object name/number + * rather than port numbers. Various tests are done to ensure that + * the incoming data is in the correct format before it is queued to + * a socket. + */ +static struct sock *dn_find_listener(struct sk_buff *skb, unsigned short *reason) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + struct nsp_conn_init_msg *msg = (struct nsp_conn_init_msg *)skb->data; + struct sockaddr_dn dstaddr; + struct sockaddr_dn srcaddr; + unsigned char type = 0; + int dstlen; + int srclen; + unsigned char *ptr; + int len; + int err = 0; + unsigned char menuver; + + memset(&dstaddr, 0, sizeof(struct sockaddr_dn)); + memset(&srcaddr, 0, sizeof(struct sockaddr_dn)); + + /* + * 1. Decode & remove message header + */ + cb->src_port = msg->srcaddr; + cb->dst_port = msg->dstaddr; + cb->services = msg->services; + cb->info = msg->info; + cb->segsize = le16_to_cpu(msg->segsize); + + if (!pskb_may_pull(skb, sizeof(*msg))) + goto err_out; + + skb_pull(skb, sizeof(*msg)); + + len = skb->len; + ptr = skb->data; + + /* + * 2. Check destination end username format + */ + dstlen = dn_username2sockaddr(ptr, len, &dstaddr, &type); + err++; + if (dstlen < 0) + goto err_out; + + err++; + if (type > 1) + goto err_out; + + len -= dstlen; + ptr += dstlen; + + /* + * 3. Check source end username format + */ + srclen = dn_username2sockaddr(ptr, len, &srcaddr, &type); + err++; + if (srclen < 0) + goto err_out; + + len -= srclen; + ptr += srclen; + err++; + if (len < 1) + goto err_out; + + menuver = *ptr; + ptr++; + len--; + + /* + * 4. Check that optional data actually exists if menuver says it does + */ + err++; + if ((menuver & (DN_MENUVER_ACC | DN_MENUVER_USR)) && (len < 1)) + goto err_out; + + /* + * 5. Check optional access data format + */ + err++; + if (menuver & DN_MENUVER_ACC) { + if (dn_check_idf(&ptr, &len, 39, 1)) + goto err_out; + if (dn_check_idf(&ptr, &len, 39, 1)) + goto err_out; + if (dn_check_idf(&ptr, &len, 39, (menuver & DN_MENUVER_USR) ? 1 : 0)) + goto err_out; + } + + /* + * 6. 
Check optional user data format + */ + err++; + if (menuver & DN_MENUVER_USR) { + if (dn_check_idf(&ptr, &len, 16, 0)) + goto err_out; + } + + /* + * 7. Look up socket based on destination end username + */ + return dn_sklist_find_listener(&dstaddr); +err_out: + dn_log_martian(skb, ci_err_table[err].text); + *reason = ci_err_table[err].reason; + return NULL; +} + + +static void dn_nsp_conn_init(struct sock *sk, struct sk_buff *skb) +{ + if (sk_acceptq_is_full(sk)) { + kfree_skb(skb); + return; + } + + sk->sk_ack_backlog++; + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_state_change(sk); +} + +static void dn_nsp_conn_conf(struct sock *sk, struct sk_buff *skb) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + struct dn_scp *scp = DN_SK(sk); + unsigned char *ptr; + + if (skb->len < 4) + goto out; + + ptr = skb->data; + cb->services = *ptr++; + cb->info = *ptr++; + cb->segsize = le16_to_cpu(*(__le16 *)ptr); + + if ((scp->state == DN_CI) || (scp->state == DN_CD)) { + scp->persist = 0; + scp->addrrem = cb->src_port; + sk->sk_state = TCP_ESTABLISHED; + scp->state = DN_RUN; + scp->services_rem = cb->services; + scp->info_rem = cb->info; + scp->segsize_rem = cb->segsize; + + if ((scp->services_rem & NSP_FC_MASK) == NSP_FC_NONE) + scp->max_window = decnet_no_fc_max_cwnd; + + if (skb->len > 0) { + u16 dlen = *skb->data; + if ((dlen <= 16) && (dlen <= skb->len)) { + scp->conndata_in.opt_optl = cpu_to_le16(dlen); + skb_copy_from_linear_data_offset(skb, 1, + scp->conndata_in.opt_data, dlen); + } + } + dn_nsp_send_link(sk, DN_NOCHANGE, 0); + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + } + +out: + kfree_skb(skb); +} + +static void dn_nsp_conn_ack(struct sock *sk, struct sk_buff *skb) +{ + struct dn_scp *scp = DN_SK(sk); + + if (scp->state == DN_CI) { + scp->state = DN_CD; + scp->persist = 0; + } + + kfree_skb(skb); +} + +static void dn_nsp_disc_init(struct sock *sk, struct sk_buff *skb) +{ + struct dn_scp *scp = DN_SK(sk); + struct dn_skb_cb *cb = DN_SKB_CB(skb); + unsigned short reason; + + if (skb->len < 2) + goto out; + + reason = le16_to_cpu(*(__le16 *)skb->data); + skb_pull(skb, 2); + + scp->discdata_in.opt_status = cpu_to_le16(reason); + scp->discdata_in.opt_optl = 0; + memset(scp->discdata_in.opt_data, 0, 16); + + if (skb->len > 0) { + u16 dlen = *skb->data; + if ((dlen <= 16) && (dlen <= skb->len)) { + scp->discdata_in.opt_optl = cpu_to_le16(dlen); + skb_copy_from_linear_data_offset(skb, 1, scp->discdata_in.opt_data, dlen); + } + } + + scp->addrrem = cb->src_port; + sk->sk_state = TCP_CLOSE; + + switch(scp->state) { + case DN_CI: + case DN_CD: + scp->state = DN_RJ; + sk->sk_err = ECONNREFUSED; + break; + case DN_RUN: + sk->sk_shutdown |= SHUTDOWN_MASK; + scp->state = DN_DN; + break; + case DN_DI: + scp->state = DN_DIC; + break; + } + + if (!sock_flag(sk, SOCK_DEAD)) { + if (sk->sk_socket->state != SS_UNCONNECTED) + sk->sk_socket->state = SS_DISCONNECTING; + sk->sk_state_change(sk); + } + + /* + * It appears that its possible for remote machines to send disc + * init messages with no port identifier if we are in the CI and + * possibly also the CD state. Obviously we shouldn't reply with + * a message if we don't know what the end point is. + */ + if (scp->addrrem) { + dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC, GFP_ATOMIC); + } + scp->persist_fxn = dn_destroy_timer; + scp->persist = dn_nsp_persist(sk); + +out: + kfree_skb(skb); +} + +/* + * disc_conf messages are also called no_resources or no_link + * messages depending upon the "reason" field. 
+ */ +static void dn_nsp_disc_conf(struct sock *sk, struct sk_buff *skb) +{ + struct dn_scp *scp = DN_SK(sk); + unsigned short reason; + + if (skb->len != 2) + goto out; + + reason = le16_to_cpu(*(__le16 *)skb->data); + + sk->sk_state = TCP_CLOSE; + + switch(scp->state) { + case DN_CI: + scp->state = DN_NR; + break; + case DN_DR: + if (reason == NSP_REASON_DC) + scp->state = DN_DRC; + if (reason == NSP_REASON_NL) + scp->state = DN_CN; + break; + case DN_DI: + scp->state = DN_DIC; + break; + case DN_RUN: + sk->sk_shutdown |= SHUTDOWN_MASK; + case DN_CC: + scp->state = DN_CN; + } + + if (!sock_flag(sk, SOCK_DEAD)) { + if (sk->sk_socket->state != SS_UNCONNECTED) + sk->sk_socket->state = SS_DISCONNECTING; + sk->sk_state_change(sk); + } + + scp->persist_fxn = dn_destroy_timer; + scp->persist = dn_nsp_persist(sk); + +out: + kfree_skb(skb); +} + +static void dn_nsp_linkservice(struct sock *sk, struct sk_buff *skb) +{ + struct dn_scp *scp = DN_SK(sk); + unsigned short segnum; + unsigned char lsflags; + signed char fcval; + int wake_up = 0; + char *ptr = skb->data; + unsigned char fctype = scp->services_rem & NSP_FC_MASK; + + if (skb->len != 4) + goto out; + + segnum = le16_to_cpu(*(__le16 *)ptr); + ptr += 2; + lsflags = *(unsigned char *)ptr++; + fcval = *ptr; + + /* + * Here we ignore erroneous packets which really + * should cause a connection abort. It is not critical + * for now though. + */ + if (lsflags & 0xf8) + goto out; + + if (seq_next(scp->numoth_rcv, segnum)) { + seq_add(&scp->numoth_rcv, 1); + switch(lsflags & 0x04) { /* FCVAL INT */ + case 0x00: /* Normal Request */ + switch(lsflags & 0x03) { /* FCVAL MOD */ + case 0x00: /* Request count */ + if (fcval < 0) { + unsigned char p_fcval = -fcval; + if ((scp->flowrem_dat > p_fcval) && + (fctype == NSP_FC_SCMC)) { + scp->flowrem_dat -= p_fcval; + } + } else if (fcval > 0) { + scp->flowrem_dat += fcval; + wake_up = 1; + } + break; + case 0x01: /* Stop outgoing data */ + scp->flowrem_sw = DN_DONTSEND; + break; + case 0x02: /* Ok to start again */ + scp->flowrem_sw = DN_SEND; + dn_nsp_output(sk); + wake_up = 1; + } + break; + case 0x04: /* Interrupt Request */ + if (fcval > 0) { + scp->flowrem_oth += fcval; + wake_up = 1; + } + break; + } + if (wake_up && !sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + } + + dn_nsp_send_oth_ack(sk); + +out: + kfree_skb(skb); +} + +/* + * Copy of sock_queue_rcv_skb (from sock.h) without + * bh_lock_sock() (it's already held when this is called) which + * also allows data and other data to be queued to a socket. + */ +static __inline__ int dn_queue_skb(struct sock *sk, struct sk_buff *skb, int sig, struct sk_buff_head *queue) +{ + int err; + int skb_len; + + /* Cast skb->rcvbuf to unsigned...
It's pointless, but reduces + number of warnings when compiling with -W --ANK + */ + if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= + (unsigned)sk->sk_rcvbuf) { + err = -ENOMEM; + goto out; + } + + err = sk_filter(sk, skb); + if (err) + goto out; + + skb_len = skb->len; + skb_set_owner_r(skb, sk); + skb_queue_tail(queue, skb); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, skb_len); +out: + return err; +} + +static void dn_nsp_otherdata(struct sock *sk, struct sk_buff *skb) +{ + struct dn_scp *scp = DN_SK(sk); + unsigned short segnum; + struct dn_skb_cb *cb = DN_SKB_CB(skb); + int queued = 0; + + if (skb->len < 2) + goto out; + + cb->segnum = segnum = le16_to_cpu(*(__le16 *)skb->data); + skb_pull(skb, 2); + + if (seq_next(scp->numoth_rcv, segnum)) { + + if (dn_queue_skb(sk, skb, SIGURG, &scp->other_receive_queue) == 0) { + seq_add(&scp->numoth_rcv, 1); + scp->other_report = 0; + queued = 1; + } + } + + dn_nsp_send_oth_ack(sk); +out: + if (!queued) + kfree_skb(skb); +} + +static void dn_nsp_data(struct sock *sk, struct sk_buff *skb) +{ + int queued = 0; + unsigned short segnum; + struct dn_skb_cb *cb = DN_SKB_CB(skb); + struct dn_scp *scp = DN_SK(sk); + + if (skb->len < 2) + goto out; + + cb->segnum = segnum = le16_to_cpu(*(__le16 *)skb->data); + skb_pull(skb, 2); + + if (seq_next(scp->numdat_rcv, segnum)) { + if (dn_queue_skb(sk, skb, SIGIO, &sk->sk_receive_queue) == 0) { + seq_add(&scp->numdat_rcv, 1); + queued = 1; + } + + if ((scp->flowloc_sw == DN_SEND) && dn_congested(sk)) { + scp->flowloc_sw = DN_DONTSEND; + dn_nsp_send_link(sk, DN_DONTSEND, 0); + } + } + + dn_nsp_send_data_ack(sk); +out: + if (!queued) + kfree_skb(skb); +} + +/* + * If one of our conninit messages is returned, this function + * deals with it. It puts the socket into the NO_COMMUNICATION + * state. 
+ */ +static void dn_returned_conn_init(struct sock *sk, struct sk_buff *skb) +{ + struct dn_scp *scp = DN_SK(sk); + + if (scp->state == DN_CI) { + scp->state = DN_NC; + sk->sk_state = TCP_CLOSE; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + } + + kfree_skb(skb); +} + +static int dn_nsp_no_socket(struct sk_buff *skb, unsigned short reason) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + int ret = NET_RX_DROP; + + /* Must not reply to returned packets */ + if (cb->rt_flags & DN_RT_F_RTS) + goto out; + + if ((reason != NSP_REASON_OK) && ((cb->nsp_flags & 0x0c) == 0x08)) { + switch(cb->nsp_flags & 0x70) { + case 0x10: + case 0x60: /* (Retransmitted) Connect Init */ + dn_nsp_return_disc(skb, NSP_DISCINIT, reason); + ret = NET_RX_SUCCESS; + break; + case 0x20: /* Connect Confirm */ + dn_nsp_return_disc(skb, NSP_DISCCONF, reason); + ret = NET_RX_SUCCESS; + break; + } + } + +out: + kfree_skb(skb); + return ret; +} + +static int dn_nsp_rx_packet(struct sk_buff *skb) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + struct sock *sk = NULL; + unsigned char *ptr = (unsigned char *)skb->data; + unsigned short reason = NSP_REASON_NL; + + if (!pskb_may_pull(skb, 2)) + goto free_out; + + skb_reset_transport_header(skb); + cb->nsp_flags = *ptr++; + + if (decnet_debug_level & 2) + printk(KERN_DEBUG "dn_nsp_rx: Message type 0x%02x\n", (int)cb->nsp_flags); + + if (cb->nsp_flags & 0x83) + goto free_out; + + /* + * Filter out conninits and useless packet types + */ + if ((cb->nsp_flags & 0x0c) == 0x08) { + switch(cb->nsp_flags & 0x70) { + case 0x00: /* NOP */ + case 0x70: /* Reserved */ + case 0x50: /* Reserved, Phase II node init */ + goto free_out; + case 0x10: + case 0x60: + if (unlikely(cb->rt_flags & DN_RT_F_RTS)) + goto free_out; + sk = dn_find_listener(skb, &reason); + goto got_it; + } + } + + if (!pskb_may_pull(skb, 3)) + goto free_out; + + /* + * Grab the destination address. + */ + cb->dst_port = *(__le16 *)ptr; + cb->src_port = 0; + ptr += 2; + + /* + * If not a connack, grab the source address too. + */ + if (pskb_may_pull(skb, 5)) { + cb->src_port = *(__le16 *)ptr; + ptr += 2; + skb_pull(skb, 5); + } + + /* + * Returned packets... + * Swap src & dst and look up in the normal way. + */ + if (unlikely(cb->rt_flags & DN_RT_F_RTS)) { + __le16 tmp = cb->dst_port; + cb->dst_port = cb->src_port; + cb->src_port = tmp; + tmp = cb->dst; + cb->dst = cb->src; + cb->src = tmp; + } + + /* + * Find the socket to which this skb is destined. + */ + sk = dn_find_by_skb(skb); +got_it: + if (sk != NULL) { + struct dn_scp *scp = DN_SK(sk); + + /* Reset backoff */ + scp->nsp_rxtshift = 0; + + /* + * We linearize everything except data segments here. + */ + if (cb->nsp_flags & ~0x60) { + if (unlikely(skb_linearize(skb))) + goto free_out; + } + + return sk_receive_skb(sk, skb, 0); + } + + return dn_nsp_no_socket(skb, reason); + +free_out: + kfree_skb(skb); + return NET_RX_DROP; +} + +int dn_nsp_rx(struct sk_buff *skb) +{ + return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_IN, skb, skb->dev, NULL, + dn_nsp_rx_packet); +} + +/* + * This is the main receive routine for sockets. It is called + * from the above when the socket is not busy, and also from + * sock_release() when there is a backlog queued up. 
+ */ +int dn_nsp_backlog_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct dn_scp *scp = DN_SK(sk); + struct dn_skb_cb *cb = DN_SKB_CB(skb); + + if (cb->rt_flags & DN_RT_F_RTS) { + if (cb->nsp_flags == 0x18 || cb->nsp_flags == 0x68) + dn_returned_conn_init(sk, skb); + else + kfree_skb(skb); + return NET_RX_SUCCESS; + } + + /* + * Control packet. + */ + if ((cb->nsp_flags & 0x0c) == 0x08) { + switch(cb->nsp_flags & 0x70) { + case 0x10: + case 0x60: + dn_nsp_conn_init(sk, skb); + break; + case 0x20: + dn_nsp_conn_conf(sk, skb); + break; + case 0x30: + dn_nsp_disc_init(sk, skb); + break; + case 0x40: + dn_nsp_disc_conf(sk, skb); + break; + } + + } else if (cb->nsp_flags == 0x24) { + /* + * Special for connacks, 'cos they don't have + * ack data or ack otherdata info. + */ + dn_nsp_conn_ack(sk, skb); + } else { + int other = 1; + + /* both data and ack frames can kick a CC socket into RUN */ + if ((scp->state == DN_CC) && !sock_flag(sk, SOCK_DEAD)) { + scp->state = DN_RUN; + sk->sk_state = TCP_ESTABLISHED; + sk->sk_state_change(sk); + } + + if ((cb->nsp_flags & 0x1c) == 0) + other = 0; + if (cb->nsp_flags == 0x04) + other = 0; + + /* + * Read out ack data here, this applies equally + * to data, other data, link serivce and both + * ack data and ack otherdata. + */ + dn_process_ack(sk, skb, other); + + /* + * If we've some sort of data here then call a + * suitable routine for dealing with it, otherwise + * the packet is an ack and can be discarded. + */ + if ((cb->nsp_flags & 0x0c) == 0) { + + if (scp->state != DN_RUN) + goto free_out; + + switch(cb->nsp_flags) { + case 0x10: /* LS */ + dn_nsp_linkservice(sk, skb); + break; + case 0x30: /* OD */ + dn_nsp_otherdata(sk, skb); + break; + default: + dn_nsp_data(sk, skb); + } + + } else { /* Ack, chuck it out here */ +free_out: + kfree_skb(skb); + } + } + + return NET_RX_SUCCESS; +} + diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c new file mode 100644 index 00000000..bd78836a --- /dev/null +++ b/net/decnet/dn_nsp_out.c @@ -0,0 +1,720 @@ + +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * DECnet Network Services Protocol (Output) + * + * Author: Eduardo Marcelo Serrat + * + * Changes: + * + * Steve Whitehouse: Split into dn_nsp_in.c and dn_nsp_out.c from + * original dn_nsp.c. + * Steve Whitehouse: Updated to work with my new routing architecture. + * Steve Whitehouse: Added changes from Eduardo Serrat's patches. + * Steve Whitehouse: Now conninits have the "return" bit set. + * Steve Whitehouse: Fixes to check alloc'd skbs are non NULL! + * Moved output state machine into one function + * Steve Whitehouse: New output state machine + * Paul Koning: Connect Confirm message fix. + * Eduardo Serrat: Fix to stop dn_nsp_do_disc() sending malformed packets. + * Steve Whitehouse: dn_nsp_output() and friends needed a spring clean + * Steve Whitehouse: Moved dn_nsp_send() in here from route.h + */ + +/****************************************************************************** + (c) 1995-1998 E.M. Serrat emserrat@geocities.com + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +static int nsp_backoff[NSP_MAXRXTSHIFT + 1] = { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 }; + +static void dn_nsp_send(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct dn_scp *scp = DN_SK(sk); + struct dst_entry *dst; + struct flowidn fld; + + skb_reset_transport_header(skb); + scp->stamp = jiffies; + + dst = sk_dst_check(sk, 0); + if (dst) { +try_again: + skb_dst_set(skb, dst); + dst_output(skb); + return; + } + + memset(&fld, 0, sizeof(fld)); + fld.flowidn_oif = sk->sk_bound_dev_if; + fld.saddr = dn_saddr2dn(&scp->addr); + fld.daddr = dn_saddr2dn(&scp->peer); + dn_sk_ports_copy(&fld, scp); + fld.flowidn_proto = DNPROTO_NSP; + if (dn_route_output_sock(&sk->sk_dst_cache, &fld, sk, 0) == 0) { + dst = sk_dst_get(sk); + sk->sk_route_caps = dst->dev->features; + goto try_again; + } + + sk->sk_err = EHOSTUNREACH; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); +} + + +/* + * If sk == NULL, then we assume that we are supposed to be making + * a routing layer skb. If sk != NULL, then we are supposed to be + * creating an skb for the NSP layer. + * + * The eventual aim is for each socket to have a cached header size + * for its outgoing packets, and to set hdr from this when sk != NULL. + */ +struct sk_buff *dn_alloc_skb(struct sock *sk, int size, gfp_t pri) +{ + struct sk_buff *skb; + int hdr = 64; + + if ((skb = alloc_skb(size + hdr, pri)) == NULL) + return NULL; + + skb->protocol = htons(ETH_P_DNA_RT); + skb->pkt_type = PACKET_OUTGOING; + + if (sk) + skb_set_owner_w(skb, sk); + + skb_reserve(skb, hdr); + + return skb; +} + +/* + * Calculate persist timer based upon the smoothed round + * trip time and the variance. Backoff according to the + * nsp_backoff[] array. + */ +unsigned long dn_nsp_persist(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + + unsigned long t = ((scp->nsp_srtt >> 2) + scp->nsp_rttvar) >> 1; + + t *= nsp_backoff[scp->nsp_rxtshift]; + + if (t < HZ) t = HZ; + if (t > (600*HZ)) t = (600*HZ); + + if (scp->nsp_rxtshift < NSP_MAXRXTSHIFT) + scp->nsp_rxtshift++; + + /* printk(KERN_DEBUG "rxtshift %lu, t=%lu\n", scp->nsp_rxtshift, t); */ + + return t; +} + +/* + * This is called each time we get an estimate for the rtt + * on the link. + */ +static void dn_nsp_rtt(struct sock *sk, long rtt) +{ + struct dn_scp *scp = DN_SK(sk); + long srtt = (long)scp->nsp_srtt; + long rttvar = (long)scp->nsp_rttvar; + long delta; + + /* + * If the jiffies clock flips over in the middle of timestamp + * gathering this value might turn out negative, so we make sure + * that is it always positive here. 
+ */ + if (rtt < 0) + rtt = -rtt; + /* + * Add new rtt to smoothed average + */ + delta = ((rtt << 3) - srtt); + srtt += (delta >> 3); + if (srtt >= 1) + scp->nsp_srtt = (unsigned long)srtt; + else + scp->nsp_srtt = 1; + + /* + * Add new rtt variance to smoothed variance + */ + delta >>= 1; + rttvar += ((((delta>0)?(delta):(-delta)) - rttvar) >> 2); + if (rttvar >= 1) + scp->nsp_rttvar = (unsigned long)rttvar; + else + scp->nsp_rttvar = 1; + + /* printk(KERN_DEBUG "srtt=%lu rttvar=%lu\n", scp->nsp_srtt, scp->nsp_rttvar); */ +} + +/** + * dn_nsp_clone_and_send - Send a data packet by cloning it + * @skb: The packet to clone and transmit + * @gfp: memory allocation flag + * + * Clone a queued data or other data packet and transmit it. + * + * Returns: The number of times the packet has been sent previously + */ +static inline unsigned dn_nsp_clone_and_send(struct sk_buff *skb, + gfp_t gfp) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + struct sk_buff *skb2; + int ret = 0; + + if ((skb2 = skb_clone(skb, gfp)) != NULL) { + ret = cb->xmit_count; + cb->xmit_count++; + cb->stamp = jiffies; + skb2->sk = skb->sk; + dn_nsp_send(skb2); + } + + return ret; +} + +/** + * dn_nsp_output - Try and send something from socket queues + * @sk: The socket whose queues are to be investigated + * + * Try and send the packet on the end of the data and other data queues. + * Other data gets priority over data, and if we retransmit a packet we + * reduce the window by dividing it in two. + * + */ +void dn_nsp_output(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + struct sk_buff *skb; + unsigned reduce_win = 0; + + /* + * First we check for otherdata/linkservice messages + */ + if ((skb = skb_peek(&scp->other_xmit_queue)) != NULL) + reduce_win = dn_nsp_clone_and_send(skb, GFP_ATOMIC); + + /* + * If we may not send any data, we don't. + * If we are still trying to get some other data down the + * channel, we don't try and send any data. + */ + if (reduce_win || (scp->flowrem_sw != DN_SEND)) + goto recalc_window; + + if ((skb = skb_peek(&scp->data_xmit_queue)) != NULL) + reduce_win = dn_nsp_clone_and_send(skb, GFP_ATOMIC); + + /* + * If we've sent any frame more than once, we cut the + * send window size in half. There is always a minimum + * window size of one available.
+ */ +recalc_window: + if (reduce_win) { + scp->snd_window >>= 1; + if (scp->snd_window < NSP_MIN_WINDOW) + scp->snd_window = NSP_MIN_WINDOW; + } +} + +int dn_nsp_xmit_timeout(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + + dn_nsp_output(sk); + + if (!skb_queue_empty(&scp->data_xmit_queue) || + !skb_queue_empty(&scp->other_xmit_queue)) + scp->persist = dn_nsp_persist(sk); + + return 0; +} + +static inline __le16 *dn_mk_common_header(struct dn_scp *scp, struct sk_buff *skb, unsigned char msgflag, int len) +{ + unsigned char *ptr = skb_push(skb, len); + + BUG_ON(len < 5); + + *ptr++ = msgflag; + *((__le16 *)ptr) = scp->addrrem; + ptr += 2; + *((__le16 *)ptr) = scp->addrloc; + ptr += 2; + return (__le16 __force *)ptr; +} + +static __le16 *dn_mk_ack_header(struct sock *sk, struct sk_buff *skb, unsigned char msgflag, int hlen, int other) +{ + struct dn_scp *scp = DN_SK(sk); + unsigned short acknum = scp->numdat_rcv & 0x0FFF; + unsigned short ackcrs = scp->numoth_rcv & 0x0FFF; + __le16 *ptr; + + BUG_ON(hlen < 9); + + scp->ackxmt_dat = acknum; + scp->ackxmt_oth = ackcrs; + acknum |= 0x8000; + ackcrs |= 0x8000; + + /* If this is an "other data/ack" message, swap acknum and ackcrs */ + if (other) { + unsigned short tmp = acknum; + acknum = ackcrs; + ackcrs = tmp; + } + + /* Set "cross subchannel" bit in ackcrs */ + ackcrs |= 0x2000; + + ptr = (__le16 *)dn_mk_common_header(scp, skb, msgflag, hlen); + + *ptr++ = cpu_to_le16(acknum); + *ptr++ = cpu_to_le16(ackcrs); + + return ptr; +} + +static __le16 *dn_nsp_mk_data_header(struct sock *sk, struct sk_buff *skb, int oth) +{ + struct dn_scp *scp = DN_SK(sk); + struct dn_skb_cb *cb = DN_SKB_CB(skb); + __le16 *ptr = dn_mk_ack_header(sk, skb, cb->nsp_flags, 11, oth); + + if (unlikely(oth)) { + cb->segnum = scp->numoth; + seq_add(&scp->numoth, 1); + } else { + cb->segnum = scp->numdat; + seq_add(&scp->numdat, 1); + } + *(ptr++) = cpu_to_le16(cb->segnum); + + return ptr; +} + +void dn_nsp_queue_xmit(struct sock *sk, struct sk_buff *skb, + gfp_t gfp, int oth) +{ + struct dn_scp *scp = DN_SK(sk); + struct dn_skb_cb *cb = DN_SKB_CB(skb); + unsigned long t = ((scp->nsp_srtt >> 2) + scp->nsp_rttvar) >> 1; + + cb->xmit_count = 0; + dn_nsp_mk_data_header(sk, skb, oth); + + /* + * Slow start: If we have been idle for more than + * one RTT, then reset window to min size. + */ + if ((jiffies - scp->stamp) > t) + scp->snd_window = NSP_MIN_WINDOW; + + if (oth) + skb_queue_tail(&scp->other_xmit_queue, skb); + else + skb_queue_tail(&scp->data_xmit_queue, skb); + + if (scp->flowrem_sw != DN_SEND) + return; + + dn_nsp_clone_and_send(skb, gfp); +} + + +int dn_nsp_check_xmit_queue(struct sock *sk, struct sk_buff *skb, struct sk_buff_head *q, unsigned short acknum) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + struct dn_scp *scp = DN_SK(sk); + struct sk_buff *skb2, *n, *ack = NULL; + int wakeup = 0; + int try_retrans = 0; + unsigned long reftime = cb->stamp; + unsigned long pkttime; + unsigned short xmit_count; + unsigned short segnum; + + skb_queue_walk_safe(q, skb2, n) { + struct dn_skb_cb *cb2 = DN_SKB_CB(skb2); + + if (dn_before_or_equal(cb2->segnum, acknum)) + ack = skb2; + + /* printk(KERN_DEBUG "ack: %s %04x %04x\n", ack ? 
"ACK" : "SKIP", (int)cb2->segnum, (int)acknum); */ + + if (ack == NULL) + continue; + + /* printk(KERN_DEBUG "check_xmit_queue: %04x, %d\n", acknum, cb2->xmit_count); */ + + /* Does _last_ packet acked have xmit_count > 1 */ + try_retrans = 0; + /* Remember to wake up the sending process */ + wakeup = 1; + /* Keep various statistics */ + pkttime = cb2->stamp; + xmit_count = cb2->xmit_count; + segnum = cb2->segnum; + /* Remove and drop ack'ed packet */ + skb_unlink(ack, q); + kfree_skb(ack); + ack = NULL; + + /* + * We don't expect to see acknowledgements for packets we + * haven't sent yet. + */ + WARN_ON(xmit_count == 0); + + /* + * If the packet has only been sent once, we can use it + * to calculate the RTT and also open the window a little + * further. + */ + if (xmit_count == 1) { + if (dn_equal(segnum, acknum)) + dn_nsp_rtt(sk, (long)(pkttime - reftime)); + + if (scp->snd_window < scp->max_window) + scp->snd_window++; + } + + /* + * Packet has been sent more than once. If this is the last + * packet to be acknowledged then we want to send the next + * packet in the send queue again (assumes the remote host does + * go-back-N error control). + */ + if (xmit_count > 1) + try_retrans = 1; + } + + if (try_retrans) + dn_nsp_output(sk); + + return wakeup; +} + +void dn_nsp_send_data_ack(struct sock *sk) +{ + struct sk_buff *skb = NULL; + + if ((skb = dn_alloc_skb(sk, 9, GFP_ATOMIC)) == NULL) + return; + + skb_reserve(skb, 9); + dn_mk_ack_header(sk, skb, 0x04, 9, 0); + dn_nsp_send(skb); +} + +void dn_nsp_send_oth_ack(struct sock *sk) +{ + struct sk_buff *skb = NULL; + + if ((skb = dn_alloc_skb(sk, 9, GFP_ATOMIC)) == NULL) + return; + + skb_reserve(skb, 9); + dn_mk_ack_header(sk, skb, 0x14, 9, 1); + dn_nsp_send(skb); +} + + +void dn_send_conn_ack (struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + struct sk_buff *skb = NULL; + struct nsp_conn_ack_msg *msg; + + if ((skb = dn_alloc_skb(sk, 3, sk->sk_allocation)) == NULL) + return; + + msg = (struct nsp_conn_ack_msg *)skb_put(skb, 3); + msg->msgflg = 0x24; + msg->dstaddr = scp->addrrem; + + dn_nsp_send(skb); +} + +void dn_nsp_delayed_ack(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + + if (scp->ackxmt_oth != scp->numoth_rcv) + dn_nsp_send_oth_ack(sk); + + if (scp->ackxmt_dat != scp->numdat_rcv) + dn_nsp_send_data_ack(sk); +} + +static int dn_nsp_retrans_conn_conf(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + + if (scp->state == DN_CC) + dn_send_conn_conf(sk, GFP_ATOMIC); + + return 0; +} + +void dn_send_conn_conf(struct sock *sk, gfp_t gfp) +{ + struct dn_scp *scp = DN_SK(sk); + struct sk_buff *skb = NULL; + struct nsp_conn_init_msg *msg; + __u8 len = (__u8)le16_to_cpu(scp->conndata_out.opt_optl); + + if ((skb = dn_alloc_skb(sk, 50 + len, gfp)) == NULL) + return; + + msg = (struct nsp_conn_init_msg *)skb_put(skb, sizeof(*msg)); + msg->msgflg = 0x28; + msg->dstaddr = scp->addrrem; + msg->srcaddr = scp->addrloc; + msg->services = scp->services_loc; + msg->info = scp->info_loc; + msg->segsize = cpu_to_le16(scp->segsize_loc); + + *skb_put(skb,1) = len; + + if (len > 0) + memcpy(skb_put(skb, len), scp->conndata_out.opt_data, len); + + + dn_nsp_send(skb); + + scp->persist = dn_nsp_persist(sk); + scp->persist_fxn = dn_nsp_retrans_conn_conf; +} + + +static __inline__ void dn_nsp_do_disc(struct sock *sk, unsigned char msgflg, + unsigned short reason, gfp_t gfp, + struct dst_entry *dst, + int ddl, unsigned char *dd, __le16 rem, __le16 loc) +{ + struct sk_buff *skb = NULL; + int size = 7 + ddl + ((msgflg == NSP_DISCINIT) ? 
1 : 0); + unsigned char *msg; + + if ((dst == NULL) || (rem == 0)) { + if (net_ratelimit()) + printk(KERN_DEBUG "DECnet: dn_nsp_do_disc: BUG! Please report this to SteveW@ACM.org rem=%u dst=%p\n", le16_to_cpu(rem), dst); + return; + } + + if ((skb = dn_alloc_skb(sk, size, gfp)) == NULL) + return; + + msg = skb_put(skb, size); + *msg++ = msgflg; + *(__le16 *)msg = rem; + msg += 2; + *(__le16 *)msg = loc; + msg += 2; + *(__le16 *)msg = cpu_to_le16(reason); + msg += 2; + if (msgflg == NSP_DISCINIT) + *msg++ = ddl; + + if (ddl) { + memcpy(msg, dd, ddl); + } + + /* + * This doesn't go via the dn_nsp_send() function since we need + * to be able to send disc packets out which have no socket + * associations. + */ + skb_dst_set(skb, dst_clone(dst)); + dst_output(skb); +} + + +void dn_nsp_send_disc(struct sock *sk, unsigned char msgflg, + unsigned short reason, gfp_t gfp) +{ + struct dn_scp *scp = DN_SK(sk); + int ddl = 0; + + if (msgflg == NSP_DISCINIT) + ddl = le16_to_cpu(scp->discdata_out.opt_optl); + + if (reason == 0) + reason = le16_to_cpu(scp->discdata_out.opt_status); + + dn_nsp_do_disc(sk, msgflg, reason, gfp, sk->sk_dst_cache, ddl, + scp->discdata_out.opt_data, scp->addrrem, scp->addrloc); +} + + +void dn_nsp_return_disc(struct sk_buff *skb, unsigned char msgflg, + unsigned short reason) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + int ddl = 0; + gfp_t gfp = GFP_ATOMIC; + + dn_nsp_do_disc(NULL, msgflg, reason, gfp, skb_dst(skb), ddl, + NULL, cb->src_port, cb->dst_port); +} + + +void dn_nsp_send_link(struct sock *sk, unsigned char lsflags, char fcval) +{ + struct dn_scp *scp = DN_SK(sk); + struct sk_buff *skb; + unsigned char *ptr; + gfp_t gfp = GFP_ATOMIC; + + if ((skb = dn_alloc_skb(sk, DN_MAX_NSP_DATA_HEADER + 2, gfp)) == NULL) + return; + + skb_reserve(skb, DN_MAX_NSP_DATA_HEADER); + ptr = skb_put(skb, 2); + DN_SKB_CB(skb)->nsp_flags = 0x10; + *ptr++ = lsflags; + *ptr = fcval; + + dn_nsp_queue_xmit(sk, skb, gfp, 1); + + scp->persist = dn_nsp_persist(sk); + scp->persist_fxn = dn_nsp_xmit_timeout; +} + +static int dn_nsp_retrans_conninit(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + + if (scp->state == DN_CI) + dn_nsp_send_conninit(sk, NSP_RCI); + + return 0; +} + +void dn_nsp_send_conninit(struct sock *sk, unsigned char msgflg) +{ + struct dn_scp *scp = DN_SK(sk); + struct nsp_conn_init_msg *msg; + unsigned char aux; + unsigned char menuver; + struct dn_skb_cb *cb; + unsigned char type = 1; + gfp_t allocation = (msgflg == NSP_CI) ? 
sk->sk_allocation : GFP_ATOMIC; + struct sk_buff *skb = dn_alloc_skb(sk, 200, allocation); + + if (!skb) + return; + + cb = DN_SKB_CB(skb); + msg = (struct nsp_conn_init_msg *)skb_put(skb,sizeof(*msg)); + + msg->msgflg = msgflg; + msg->dstaddr = 0x0000; /* Remote Node will assign it*/ + + msg->srcaddr = scp->addrloc; + msg->services = scp->services_loc; /* Requested flow control */ + msg->info = scp->info_loc; /* Version Number */ + msg->segsize = cpu_to_le16(scp->segsize_loc); /* Max segment size */ + + if (scp->peer.sdn_objnum) + type = 0; + + skb_put(skb, dn_sockaddr2username(&scp->peer, + skb_tail_pointer(skb), type)); + skb_put(skb, dn_sockaddr2username(&scp->addr, + skb_tail_pointer(skb), 2)); + + menuver = DN_MENUVER_ACC | DN_MENUVER_USR; + if (scp->peer.sdn_flags & SDF_PROXY) + menuver |= DN_MENUVER_PRX; + if (scp->peer.sdn_flags & SDF_UICPROXY) + menuver |= DN_MENUVER_UIC; + + *skb_put(skb, 1) = menuver; /* Menu Version */ + + aux = scp->accessdata.acc_userl; + *skb_put(skb, 1) = aux; + if (aux > 0) + memcpy(skb_put(skb, aux), scp->accessdata.acc_user, aux); + + aux = scp->accessdata.acc_passl; + *skb_put(skb, 1) = aux; + if (aux > 0) + memcpy(skb_put(skb, aux), scp->accessdata.acc_pass, aux); + + aux = scp->accessdata.acc_accl; + *skb_put(skb, 1) = aux; + if (aux > 0) + memcpy(skb_put(skb, aux), scp->accessdata.acc_acc, aux); + + aux = (__u8)le16_to_cpu(scp->conndata_out.opt_optl); + *skb_put(skb, 1) = aux; + if (aux > 0) + memcpy(skb_put(skb, aux), scp->conndata_out.opt_data, aux); + + scp->persist = dn_nsp_persist(sk); + scp->persist_fxn = dn_nsp_retrans_conninit; + + cb->rt_flags = DN_RT_F_RQR; + + dn_nsp_send(skb); +} + diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c new file mode 100644 index 00000000..b91b6036 --- /dev/null +++ b/net/decnet/dn_route.c @@ -0,0 +1,1861 @@ +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * DECnet Routing Functions (Endnode and Router) + * + * Authors: Steve Whitehouse + * Eduardo Marcelo Serrat + * + * Changes: + * Steve Whitehouse : Fixes to allow "intra-ethernet" and + * "return-to-sender" bits on outgoing + * packets. + * Steve Whitehouse : Timeouts for cached routes. + * Steve Whitehouse : Use dst cache for input routes too. + * Steve Whitehouse : Fixed error values in dn_send_skb. + * Steve Whitehouse : Rework routing functions to better fit + * DECnet routing design + * Alexey Kuznetsov : New SMP locking + * Steve Whitehouse : More SMP locking changes & dn_cache_dump() + * Steve Whitehouse : Prerouting NF hook, now really is prerouting. + * Fixed possible skb leak in rtnetlink funcs. + * Steve Whitehouse : Dave Miller's dynamic hash table sizing and + * Alexey Kuznetsov's finer grained locking + * from ipv4/route.c. + * Steve Whitehouse : Routing is now starting to look like a + * sensible set of code now, mainly due to + * my copying the IPv4 routing code. The + * hooks here are modified and will continue + * to evolve for a while. + * Steve Whitehouse : Real SMP at last :-) Also new netfilter + * stuff. Look out raw sockets your days + * are numbered! + * Steve Whitehouse : Added return-to-sender functions. Added + * backlog congestion level return codes. + * Steve Whitehouse : Fixed bug where routes were set up with + * no ref count on net devices. 
+ * Steve Whitehouse : RCU for the route cache + * Steve Whitehouse : Preparations for the flow cache + * Steve Whitehouse : Prepare for nonlinear skbs + */ + +/****************************************************************************** + (c) 1995-1998 E.M. Serrat emserrat@geocities.com + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct dn_rt_hash_bucket +{ + struct dn_route __rcu *chain; + spinlock_t lock; +}; + +extern struct neigh_table dn_neigh_table; + + +static unsigned char dn_hiord_addr[6] = {0xAA,0x00,0x04,0x00,0x00,0x00}; + +static const int dn_rt_min_delay = 2 * HZ; +static const int dn_rt_max_delay = 10 * HZ; +static const int dn_rt_mtu_expires = 10 * 60 * HZ; + +static unsigned long dn_rt_deadline; + +static int dn_dst_gc(struct dst_ops *ops); +static struct dst_entry *dn_dst_check(struct dst_entry *, __u32); +static unsigned int dn_dst_default_advmss(const struct dst_entry *dst); +static unsigned int dn_dst_default_mtu(const struct dst_entry *dst); +static void dn_dst_destroy(struct dst_entry *); +static struct dst_entry *dn_dst_negative_advice(struct dst_entry *); +static void dn_dst_link_failure(struct sk_buff *); +static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu); +static int dn_route_input(struct sk_buff *); +static void dn_run_flush(unsigned long dummy); + +static struct dn_rt_hash_bucket *dn_rt_hash_table; +static unsigned dn_rt_hash_mask; + +static struct timer_list dn_route_timer; +static DEFINE_TIMER(dn_rt_flush_timer, dn_run_flush, 0, 0); +int decnet_dst_gc_interval = 2; + +static struct dst_ops dn_dst_ops = { + .family = PF_DECnet, + .protocol = cpu_to_be16(ETH_P_DNA_RT), + .gc_thresh = 128, + .gc = dn_dst_gc, + .check = dn_dst_check, + .default_advmss = dn_dst_default_advmss, + .default_mtu = dn_dst_default_mtu, + .cow_metrics = dst_cow_metrics_generic, + .destroy = dn_dst_destroy, + .negative_advice = dn_dst_negative_advice, + .link_failure = dn_dst_link_failure, + .update_pmtu = dn_dst_update_pmtu, +}; + +static void dn_dst_destroy(struct dst_entry *dst) +{ + dst_destroy_metrics_generic(dst); +} + +static __inline__ unsigned dn_hash(__le16 src, __le16 dst) +{ + __u16 tmp = (__u16 __force)(src ^ dst); + tmp ^= (tmp >> 3); + tmp ^= (tmp >> 5); + tmp ^= (tmp >> 10); + return dn_rt_hash_mask & (unsigned)tmp; +} + +static inline void dnrt_free(struct dn_route *rt) +{ + call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); +} + +static inline void dnrt_drop(struct dn_route *rt) +{ + dst_release(&rt->dst); + call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); +} + +static void dn_dst_check_expire(unsigned long dummy) +{ + int i; + struct dn_route *rt; + struct dn_route __rcu **rtp; + unsigned long now = jiffies; + 
unsigned long expire = 120 * HZ; + + for (i = 0; i <= dn_rt_hash_mask; i++) { + rtp = &dn_rt_hash_table[i].chain; + + spin_lock(&dn_rt_hash_table[i].lock); + while ((rt = rcu_dereference_protected(*rtp, + lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) { + if (atomic_read(&rt->dst.__refcnt) || + (now - rt->dst.lastuse) < expire) { + rtp = &rt->dst.dn_next; + continue; + } + *rtp = rt->dst.dn_next; + rt->dst.dn_next = NULL; + dnrt_free(rt); + } + spin_unlock(&dn_rt_hash_table[i].lock); + + if ((jiffies - now) > 0) + break; + } + + mod_timer(&dn_route_timer, now + decnet_dst_gc_interval * HZ); +} + +static int dn_dst_gc(struct dst_ops *ops) +{ + struct dn_route *rt; + struct dn_route __rcu **rtp; + int i; + unsigned long now = jiffies; + unsigned long expire = 10 * HZ; + + for (i = 0; i <= dn_rt_hash_mask; i++) { + + spin_lock_bh(&dn_rt_hash_table[i].lock); + rtp = &dn_rt_hash_table[i].chain; + + while ((rt = rcu_dereference_protected(*rtp, + lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) { + if (atomic_read(&rt->dst.__refcnt) || + (now - rt->dst.lastuse) < expire) { + rtp = &rt->dst.dn_next; + continue; + } + *rtp = rt->dst.dn_next; + rt->dst.dn_next = NULL; + dnrt_drop(rt); + break; + } + spin_unlock_bh(&dn_rt_hash_table[i].lock); + } + + return 0; +} + +/* + * The decnet standards don't impose a particular minimum mtu, what they + * do insist on is that the routing layer accepts a datagram of at least + * 230 bytes long. Here we have to subtract the routing header length from + * 230 to get the minimum acceptable mtu. If there is no neighbour, then we + * assume the worst and use a long header size. + * + * We update both the mtu and the advertised mss (i.e. the segment size we + * advertise to the other end). + */ +static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu) +{ + struct neighbour *n = dst_get_neighbour(dst); + u32 min_mtu = 230; + struct dn_dev *dn; + + dn = n ? rcu_dereference_raw(n->dev->dn_ptr) : NULL; + + if (dn && dn->use_long == 0) + min_mtu -= 6; + else + min_mtu -= 21; + + if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= min_mtu) { + if (!(dst_metric_locked(dst, RTAX_MTU))) { + dst_metric_set(dst, RTAX_MTU, mtu); + dst_set_expires(dst, dn_rt_mtu_expires); + } + if (!(dst_metric_locked(dst, RTAX_ADVMSS))) { + u32 mss = mtu - DN_MAX_NSP_DATA_HEADER; + u32 existing_mss = dst_metric_raw(dst, RTAX_ADVMSS); + if (!existing_mss || existing_mss > mss) + dst_metric_set(dst, RTAX_ADVMSS, mss); + } + } +} + +/* + * When a route has been marked obsolete. (e.g. 
routing cache flush) + */ +static struct dst_entry *dn_dst_check(struct dst_entry *dst, __u32 cookie) +{ + return NULL; +} + +static struct dst_entry *dn_dst_negative_advice(struct dst_entry *dst) +{ + dst_release(dst); + return NULL; +} + +static void dn_dst_link_failure(struct sk_buff *skb) +{ +} + +static inline int compare_keys(struct flowidn *fl1, struct flowidn *fl2) +{ + return ((fl1->daddr ^ fl2->daddr) | + (fl1->saddr ^ fl2->saddr) | + (fl1->flowidn_mark ^ fl2->flowidn_mark) | + (fl1->flowidn_scope ^ fl2->flowidn_scope) | + (fl1->flowidn_oif ^ fl2->flowidn_oif) | + (fl1->flowidn_iif ^ fl2->flowidn_iif)) == 0; +} + +static int dn_insert_route(struct dn_route *rt, unsigned hash, struct dn_route **rp) +{ + struct dn_route *rth; + struct dn_route __rcu **rthp; + unsigned long now = jiffies; + + rthp = &dn_rt_hash_table[hash].chain; + + spin_lock_bh(&dn_rt_hash_table[hash].lock); + while ((rth = rcu_dereference_protected(*rthp, + lockdep_is_held(&dn_rt_hash_table[hash].lock))) != NULL) { + if (compare_keys(&rth->fld, &rt->fld)) { + /* Put it first */ + *rthp = rth->dst.dn_next; + rcu_assign_pointer(rth->dst.dn_next, + dn_rt_hash_table[hash].chain); + rcu_assign_pointer(dn_rt_hash_table[hash].chain, rth); + + dst_use(&rth->dst, now); + spin_unlock_bh(&dn_rt_hash_table[hash].lock); + + dnrt_drop(rt); + *rp = rth; + return 0; + } + rthp = &rth->dst.dn_next; + } + + rcu_assign_pointer(rt->dst.dn_next, dn_rt_hash_table[hash].chain); + rcu_assign_pointer(dn_rt_hash_table[hash].chain, rt); + + dst_use(&rt->dst, now); + spin_unlock_bh(&dn_rt_hash_table[hash].lock); + *rp = rt; + return 0; +} + +static void dn_run_flush(unsigned long dummy) +{ + int i; + struct dn_route *rt, *next; + + for (i = 0; i < dn_rt_hash_mask; i++) { + spin_lock_bh(&dn_rt_hash_table[i].lock); + + if ((rt = xchg((struct dn_route **)&dn_rt_hash_table[i].chain, NULL)) == NULL) + goto nothing_to_declare; + + for(; rt; rt = next) { + next = rcu_dereference_raw(rt->dst.dn_next); + RCU_INIT_POINTER(rt->dst.dn_next, NULL); + dst_free((struct dst_entry *)rt); + } + +nothing_to_declare: + spin_unlock_bh(&dn_rt_hash_table[i].lock); + } +} + +static DEFINE_SPINLOCK(dn_rt_flush_lock); + +void dn_rt_cache_flush(int delay) +{ + unsigned long now = jiffies; + int user_mode = !in_interrupt(); + + if (delay < 0) + delay = dn_rt_min_delay; + + spin_lock_bh(&dn_rt_flush_lock); + + if (del_timer(&dn_rt_flush_timer) && delay > 0 && dn_rt_deadline) { + long tmo = (long)(dn_rt_deadline - now); + + if (user_mode && tmo < dn_rt_max_delay - dn_rt_min_delay) + tmo = 0; + + if (delay > tmo) + delay = tmo; + } + + if (delay <= 0) { + spin_unlock_bh(&dn_rt_flush_lock); + dn_run_flush(0); + return; + } + + if (dn_rt_deadline == 0) + dn_rt_deadline = now + dn_rt_max_delay; + + dn_rt_flush_timer.expires = now + delay; + add_timer(&dn_rt_flush_timer); + spin_unlock_bh(&dn_rt_flush_lock); +} + +/** + * dn_return_short - Return a short packet to its sender + * @skb: The packet to return + * + */ +static int dn_return_short(struct sk_buff *skb) +{ + struct dn_skb_cb *cb; + unsigned char *ptr; + __le16 *src; + __le16 *dst; + + /* Add back headers */ + skb_push(skb, skb->data - skb_network_header(skb)); + + if ((skb = skb_unshare(skb, GFP_ATOMIC)) == NULL) + return NET_RX_DROP; + + cb = DN_SKB_CB(skb); + /* Skip packet length and point to flags */ + ptr = skb->data + 2; + *ptr++ = (cb->rt_flags & ~DN_RT_F_RQR) | DN_RT_F_RTS; + + dst = (__le16 *)ptr; + ptr += 2; + src = (__le16 *)ptr; + ptr += 2; + *ptr = 0; /* Zero hop count */ + + swap(*src, *dst); + + 
skb->pkt_type = PACKET_OUTGOING; + dn_rt_finish_output(skb, NULL, NULL); + return NET_RX_SUCCESS; +} + +/** + * dn_return_long - Return a long packet to its sender + * @skb: The long format packet to return + * + */ +static int dn_return_long(struct sk_buff *skb) +{ + struct dn_skb_cb *cb; + unsigned char *ptr; + unsigned char *src_addr, *dst_addr; + unsigned char tmp[ETH_ALEN]; + + /* Add back all headers */ + skb_push(skb, skb->data - skb_network_header(skb)); + + if ((skb = skb_unshare(skb, GFP_ATOMIC)) == NULL) + return NET_RX_DROP; + + cb = DN_SKB_CB(skb); + /* Ignore packet length and point to flags */ + ptr = skb->data + 2; + + /* Skip padding */ + if (*ptr & DN_RT_F_PF) { + char padlen = (*ptr & ~DN_RT_F_PF); + ptr += padlen; + } + + *ptr++ = (cb->rt_flags & ~DN_RT_F_RQR) | DN_RT_F_RTS; + ptr += 2; + dst_addr = ptr; + ptr += 8; + src_addr = ptr; + ptr += 6; + *ptr = 0; /* Zero hop count */ + + /* Swap source and destination */ + memcpy(tmp, src_addr, ETH_ALEN); + memcpy(src_addr, dst_addr, ETH_ALEN); + memcpy(dst_addr, tmp, ETH_ALEN); + + skb->pkt_type = PACKET_OUTGOING; + dn_rt_finish_output(skb, dst_addr, src_addr); + return NET_RX_SUCCESS; +} + +/** + * dn_route_rx_packet - Try and find a route for an incoming packet + * @skb: The packet to find a route for + * + * Returns: result of input function if route is found, error code otherwise + */ +static int dn_route_rx_packet(struct sk_buff *skb) +{ + struct dn_skb_cb *cb; + int err; + + if ((err = dn_route_input(skb)) == 0) + return dst_input(skb); + + cb = DN_SKB_CB(skb); + if (decnet_debug_level & 4) { + char *devname = skb->dev ? skb->dev->name : "???"; + + printk(KERN_DEBUG + "DECnet: dn_route_rx_packet: rt_flags=0x%02x dev=%s len=%d src=0x%04hx dst=0x%04hx err=%d type=%d\n", + (int)cb->rt_flags, devname, skb->len, + le16_to_cpu(cb->src), le16_to_cpu(cb->dst), + err, skb->pkt_type); + } + + if ((skb->pkt_type == PACKET_HOST) && (cb->rt_flags & DN_RT_F_RQR)) { + switch(cb->rt_flags & DN_RT_PKT_MSK) { + case DN_RT_PKT_SHORT: + return dn_return_short(skb); + case DN_RT_PKT_LONG: + return dn_return_long(skb); + } + } + + kfree_skb(skb); + return NET_RX_DROP; +} + +static int dn_route_rx_long(struct sk_buff *skb) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + unsigned char *ptr = skb->data; + + if (!pskb_may_pull(skb, 21)) /* 20 for long header, 1 for shortest nsp */ + goto drop_it; + + skb_pull(skb, 20); + skb_reset_transport_header(skb); + + /* Destination info */ + ptr += 2; + cb->dst = dn_eth2dn(ptr); + if (memcmp(ptr, dn_hiord_addr, 4) != 0) + goto drop_it; + ptr += 6; + + + /* Source info */ + ptr += 2; + cb->src = dn_eth2dn(ptr); + if (memcmp(ptr, dn_hiord_addr, 4) != 0) + goto drop_it; + ptr += 6; + /* Other junk */ + ptr++; + cb->hops = *ptr++; /* Visit Count */ + + return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, skb, skb->dev, NULL, + dn_route_rx_packet); + +drop_it: + kfree_skb(skb); + return NET_RX_DROP; +} + + + +static int dn_route_rx_short(struct sk_buff *skb) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + unsigned char *ptr = skb->data; + + if (!pskb_may_pull(skb, 6)) /* 5 for short header + 1 for shortest nsp */ + goto drop_it; + + skb_pull(skb, 5); + skb_reset_transport_header(skb); + + cb->dst = *(__le16 *)ptr; + ptr += 2; + cb->src = *(__le16 *)ptr; + ptr += 2; + cb->hops = *ptr & 0x3f; + + return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, skb, skb->dev, NULL, + dn_route_rx_packet); + +drop_it: + kfree_skb(skb); + return NET_RX_DROP; +} + +static int dn_route_discard(struct sk_buff *skb) +{ + /* + * I know we 
drop the packet here, but thats considered success in + * this case + */ + kfree_skb(skb); + return NET_RX_SUCCESS; +} + +static int dn_route_ptp_hello(struct sk_buff *skb) +{ + dn_dev_hello(skb); + dn_neigh_pointopoint_hello(skb); + return NET_RX_SUCCESS; +} + +int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +{ + struct dn_skb_cb *cb; + unsigned char flags = 0; + __u16 len = le16_to_cpu(*(__le16 *)skb->data); + struct dn_dev *dn = rcu_dereference(dev->dn_ptr); + unsigned char padlen = 0; + + if (!net_eq(dev_net(dev), &init_net)) + goto dump_it; + + if (dn == NULL) + goto dump_it; + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + goto out; + + if (!pskb_may_pull(skb, 3)) + goto dump_it; + + skb_pull(skb, 2); + + if (len > skb->len) + goto dump_it; + + skb_trim(skb, len); + + flags = *skb->data; + + cb = DN_SKB_CB(skb); + cb->stamp = jiffies; + cb->iif = dev->ifindex; + + /* + * If we have padding, remove it. + */ + if (flags & DN_RT_F_PF) { + padlen = flags & ~DN_RT_F_PF; + if (!pskb_may_pull(skb, padlen + 1)) + goto dump_it; + skb_pull(skb, padlen); + flags = *skb->data; + } + + skb_reset_network_header(skb); + + /* + * Weed out future version DECnet + */ + if (flags & DN_RT_F_VER) + goto dump_it; + + cb->rt_flags = flags; + + if (decnet_debug_level & 1) + printk(KERN_DEBUG + "dn_route_rcv: got 0x%02x from %s [%d %d %d]\n", + (int)flags, (dev) ? dev->name : "???", len, skb->len, + padlen); + + if (flags & DN_RT_PKT_CNTL) { + if (unlikely(skb_linearize(skb))) + goto dump_it; + + switch(flags & DN_RT_CNTL_MSK) { + case DN_RT_PKT_INIT: + dn_dev_init_pkt(skb); + break; + case DN_RT_PKT_VERI: + dn_dev_veri_pkt(skb); + break; + } + + if (dn->parms.state != DN_DEV_S_RU) + goto dump_it; + + switch(flags & DN_RT_CNTL_MSK) { + case DN_RT_PKT_HELO: + return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO, + skb, skb->dev, NULL, + dn_route_ptp_hello); + + case DN_RT_PKT_L1RT: + case DN_RT_PKT_L2RT: + return NF_HOOK(NFPROTO_DECNET, NF_DN_ROUTE, + skb, skb->dev, NULL, + dn_route_discard); + case DN_RT_PKT_ERTH: + return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO, + skb, skb->dev, NULL, + dn_neigh_router_hello); + + case DN_RT_PKT_EEDH: + return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO, + skb, skb->dev, NULL, + dn_neigh_endnode_hello); + } + } else { + if (dn->parms.state != DN_DEV_S_RU) + goto dump_it; + + skb_pull(skb, 1); /* Pull flags */ + + switch(flags & DN_RT_PKT_MSK) { + case DN_RT_PKT_LONG: + return dn_route_rx_long(skb); + case DN_RT_PKT_SHORT: + return dn_route_rx_short(skb); + } + } + +dump_it: + kfree_skb(skb); +out: + return NET_RX_DROP; +} + +static int dn_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct dn_route *rt = (struct dn_route *)dst; + struct net_device *dev = dst->dev; + struct dn_skb_cb *cb = DN_SKB_CB(skb); + struct neighbour *neigh; + + int err = -EINVAL; + + if ((neigh = dst_get_neighbour(dst)) == NULL) + goto error; + + skb->dev = dev; + + cb->src = rt->rt_saddr; + cb->dst = rt->rt_daddr; + + /* + * Always set the Intra-Ethernet bit on all outgoing packets + * originated on this node. Only valid flag from upper layers + * is return-to-sender-requested. Set hop count to 0 too. 
+ */ + cb->rt_flags &= ~DN_RT_F_RQR; + cb->rt_flags |= DN_RT_F_IE; + cb->hops = 0; + + return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_OUT, skb, NULL, dev, + neigh->output); + +error: + if (net_ratelimit()) + printk(KERN_DEBUG "dn_output: This should not happen\n"); + + kfree_skb(skb); + + return err; +} + +static int dn_forward(struct sk_buff *skb) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + struct dst_entry *dst = skb_dst(skb); + struct dn_dev *dn_db = rcu_dereference(dst->dev->dn_ptr); + struct dn_route *rt; + struct neighbour *neigh = dst_get_neighbour(dst); + int header_len; +#ifdef CONFIG_NETFILTER + struct net_device *dev = skb->dev; +#endif + + if (skb->pkt_type != PACKET_HOST) + goto drop; + + /* Ensure that we have enough space for headers */ + rt = (struct dn_route *)skb_dst(skb); + header_len = dn_db->use_long ? 21 : 6; + if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+header_len)) + goto drop; + + /* + * Hop count exceeded. + */ + if (++cb->hops > 30) + goto drop; + + skb->dev = rt->dst.dev; + + /* + * If packet goes out same interface it came in on, then set + * the Intra-Ethernet bit. This has no effect for short + * packets, so we don't need to test for them here. + */ + cb->rt_flags &= ~DN_RT_F_IE; + if (rt->rt_flags & RTCF_DOREDIRECT) + cb->rt_flags |= DN_RT_F_IE; + + return NF_HOOK(NFPROTO_DECNET, NF_DN_FORWARD, skb, dev, skb->dev, + neigh->output); + +drop: + kfree_skb(skb); + return NET_RX_DROP; +} + +/* + * Used to catch bugs. This should never normally get + * called. + */ +static int dn_rt_bug(struct sk_buff *skb) +{ + if (net_ratelimit()) { + struct dn_skb_cb *cb = DN_SKB_CB(skb); + + printk(KERN_DEBUG "dn_rt_bug: skb from:%04x to:%04x\n", + le16_to_cpu(cb->src), le16_to_cpu(cb->dst)); + } + + kfree_skb(skb); + + return NET_RX_DROP; +} + +static unsigned int dn_dst_default_advmss(const struct dst_entry *dst) +{ + return dn_mss_from_pmtu(dst->dev, dst_mtu(dst)); +} + +static unsigned int dn_dst_default_mtu(const struct dst_entry *dst) +{ + return dst->dev->mtu; +} + +static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res) +{ + struct dn_fib_info *fi = res->fi; + struct net_device *dev = rt->dst.dev; + unsigned int mss_metric; + struct neighbour *n; + + if (fi) { + if (DN_FIB_RES_GW(*res) && + DN_FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) + rt->rt_gateway = DN_FIB_RES_GW(*res); + dst_init_metrics(&rt->dst, fi->fib_metrics, true); + } + rt->rt_type = res->type; + + if (dev != NULL && dst_get_neighbour(&rt->dst) == NULL) { + n = __neigh_lookup_errno(&dn_neigh_table, &rt->rt_gateway, dev); + if (IS_ERR(n)) + return PTR_ERR(n); + dst_set_neighbour(&rt->dst, n); + } + + if (dst_metric(&rt->dst, RTAX_MTU) > rt->dst.dev->mtu) + dst_metric_set(&rt->dst, RTAX_MTU, rt->dst.dev->mtu); + mss_metric = dst_metric_raw(&rt->dst, RTAX_ADVMSS); + if (mss_metric) { + unsigned int mss = dn_mss_from_pmtu(dev, dst_mtu(&rt->dst)); + if (mss_metric > mss) + dst_metric_set(&rt->dst, RTAX_ADVMSS, mss); + } + return 0; +} + +static inline int dn_match_addr(__le16 addr1, __le16 addr2) +{ + __u16 tmp = le16_to_cpu(addr1) ^ le16_to_cpu(addr2); + int match = 16; + while(tmp) { + tmp >>= 1; + match--; + } + return match; +} + +static __le16 dnet_select_source(const struct net_device *dev, __le16 daddr, int scope) +{ + __le16 saddr = 0; + struct dn_dev *dn_db; + struct dn_ifaddr *ifa; + int best_match = 0; + int ret; + + rcu_read_lock(); + dn_db = rcu_dereference(dev->dn_ptr); + for (ifa = rcu_dereference(dn_db->ifa_list); + ifa != NULL; + ifa = rcu_dereference(ifa->ifa_next)) { + 
if (ifa->ifa_scope > scope) + continue; + if (!daddr) { + saddr = ifa->ifa_local; + break; + } + ret = dn_match_addr(daddr, ifa->ifa_local); + if (ret > best_match) + saddr = ifa->ifa_local; + if (best_match == 0) + saddr = ifa->ifa_local; + } + rcu_read_unlock(); + + return saddr; +} + +static inline __le16 __dn_fib_res_prefsrc(struct dn_fib_res *res) +{ + return dnet_select_source(DN_FIB_RES_DEV(*res), DN_FIB_RES_GW(*res), res->scope); +} + +static inline __le16 dn_fib_rules_map_destination(__le16 daddr, struct dn_fib_res *res) +{ + __le16 mask = dnet_make_mask(res->prefixlen); + return (daddr&~mask)|res->fi->fib_nh->nh_gw; +} + +static int dn_route_output_slow(struct dst_entry **pprt, const struct flowidn *oldflp, int try_hard) +{ + struct flowidn fld = { + .daddr = oldflp->daddr, + .saddr = oldflp->saddr, + .flowidn_scope = RT_SCOPE_UNIVERSE, + .flowidn_mark = oldflp->flowidn_mark, + .flowidn_iif = init_net.loopback_dev->ifindex, + .flowidn_oif = oldflp->flowidn_oif, + }; + struct dn_route *rt = NULL; + struct net_device *dev_out = NULL, *dev; + struct neighbour *neigh = NULL; + unsigned hash; + unsigned flags = 0; + struct dn_fib_res res = { .fi = NULL, .type = RTN_UNICAST }; + int err; + int free_res = 0; + __le16 gateway = 0; + + if (decnet_debug_level & 16) + printk(KERN_DEBUG + "dn_route_output_slow: dst=%04x src=%04x mark=%d" + " iif=%d oif=%d\n", le16_to_cpu(oldflp->daddr), + le16_to_cpu(oldflp->saddr), + oldflp->flowidn_mark, init_net.loopback_dev->ifindex, + oldflp->flowidn_oif); + + /* If we have an output interface, verify it's a DECnet device */ + if (oldflp->flowidn_oif) { + dev_out = dev_get_by_index(&init_net, oldflp->flowidn_oif); + err = -ENODEV; + if (dev_out && dev_out->dn_ptr == NULL) { + dev_put(dev_out); + dev_out = NULL; + } + if (dev_out == NULL) + goto out; + } + + /* If we have a source address, verify that it's a local address */ + if (oldflp->saddr) { + err = -EADDRNOTAVAIL; + + if (dev_out) { + if (dn_dev_islocal(dev_out, oldflp->saddr)) + goto source_ok; + dev_put(dev_out); + goto out; + } + rcu_read_lock(); + for_each_netdev_rcu(&init_net, dev) { + if (!dev->dn_ptr) + continue; + if (!dn_dev_islocal(dev, oldflp->saddr)) + continue; + if ((dev->flags & IFF_LOOPBACK) && + oldflp->daddr && + !dn_dev_islocal(dev, oldflp->daddr)) + continue; + + dev_out = dev; + break; + } + rcu_read_unlock(); + if (dev_out == NULL) + goto out; + dev_hold(dev_out); +source_ok: + ; + } + + /* No destination? Assume it's local */ + if (!fld.daddr) { + fld.daddr = fld.saddr; + + err = -EADDRNOTAVAIL; + if (dev_out) + dev_put(dev_out); + dev_out = init_net.loopback_dev; + dev_hold(dev_out); + if (!fld.daddr) { + fld.daddr = + fld.saddr = dnet_select_source(dev_out, 0, + RT_SCOPE_HOST); + if (!fld.daddr) + goto out; + } + fld.flowidn_oif = init_net.loopback_dev->ifindex; + res.type = RTN_LOCAL; + goto make_route; + } + + if (decnet_debug_level & 16) + printk(KERN_DEBUG + "dn_route_output_slow: initial checks complete." + " dst=%04x src=%04x oif=%d try_hard=%d\n", + le16_to_cpu(fld.daddr), le16_to_cpu(fld.saddr), + fld.flowidn_oif, try_hard); + + /* + * N.B. If the kernel is compiled without router support then + * dn_fib_lookup() will evaluate to non-zero so this if () block + * will always be executed.
+ */ + err = -ESRCH; + if (try_hard || (err = dn_fib_lookup(&fld, &res)) != 0) { + struct dn_dev *dn_db; + if (err != -ESRCH) + goto out; + /* + * Here the fallback is basically the standard algorithm for + * routing in endnodes which is described in the DECnet routing + * docs + * + * If we are not trying hard, look in neighbour cache. + * The result is tested to ensure that if a specific output + * device/source address was requested, then we honour that + * here + */ + if (!try_hard) { + neigh = neigh_lookup_nodev(&dn_neigh_table, &init_net, &fld.daddr); + if (neigh) { + if ((oldflp->flowidn_oif && + (neigh->dev->ifindex != oldflp->flowidn_oif)) || + (oldflp->saddr && + (!dn_dev_islocal(neigh->dev, + oldflp->saddr)))) { + neigh_release(neigh); + neigh = NULL; + } else { + if (dev_out) + dev_put(dev_out); + if (dn_dev_islocal(neigh->dev, fld.daddr)) { + dev_out = init_net.loopback_dev; + res.type = RTN_LOCAL; + } else { + dev_out = neigh->dev; + } + dev_hold(dev_out); + goto select_source; + } + } + } + + /* Not there? Perhaps its a local address */ + if (dev_out == NULL) + dev_out = dn_dev_get_default(); + err = -ENODEV; + if (dev_out == NULL) + goto out; + dn_db = rcu_dereference_raw(dev_out->dn_ptr); + /* Possible improvement - check all devices for local addr */ + if (dn_dev_islocal(dev_out, fld.daddr)) { + dev_put(dev_out); + dev_out = init_net.loopback_dev; + dev_hold(dev_out); + res.type = RTN_LOCAL; + goto select_source; + } + /* Not local either.... try sending it to the default router */ + neigh = neigh_clone(dn_db->router); + BUG_ON(neigh && neigh->dev != dev_out); + + /* Ok then, we assume its directly connected and move on */ +select_source: + if (neigh) + gateway = ((struct dn_neigh *)neigh)->addr; + if (gateway == 0) + gateway = fld.daddr; + if (fld.saddr == 0) { + fld.saddr = dnet_select_source(dev_out, gateway, + res.type == RTN_LOCAL ? + RT_SCOPE_HOST : + RT_SCOPE_LINK); + if (fld.saddr == 0 && res.type != RTN_LOCAL) + goto e_addr; + } + fld.flowidn_oif = dev_out->ifindex; + goto make_route; + } + free_res = 1; + + if (res.type == RTN_NAT) + goto e_inval; + + if (res.type == RTN_LOCAL) { + if (!fld.saddr) + fld.saddr = fld.daddr; + if (dev_out) + dev_put(dev_out); + dev_out = init_net.loopback_dev; + dev_hold(dev_out); + fld.flowidn_oif = dev_out->ifindex; + if (res.fi) + dn_fib_info_put(res.fi); + res.fi = NULL; + goto make_route; + } + + if (res.fi->fib_nhs > 1 && fld.flowidn_oif == 0) + dn_fib_select_multipath(&fld, &res); + + /* + * We could add some logic to deal with default routes here and + * get rid of some of the special casing above. + */ + + if (!fld.saddr) + fld.saddr = DN_FIB_RES_PREFSRC(res); + + if (dev_out) + dev_put(dev_out); + dev_out = DN_FIB_RES_DEV(res); + dev_hold(dev_out); + fld.flowidn_oif = dev_out->ifindex; + gateway = DN_FIB_RES_GW(res); + +make_route: + if (dev_out->flags & IFF_LOOPBACK) + flags |= RTCF_LOCAL; + + rt = dst_alloc(&dn_dst_ops, dev_out, 1, 0, DST_HOST); + if (rt == NULL) + goto e_nobufs; + + memset(&rt->fld, 0, sizeof(rt->fld)); + rt->fld.saddr = oldflp->saddr; + rt->fld.daddr = oldflp->daddr; + rt->fld.flowidn_oif = oldflp->flowidn_oif; + rt->fld.flowidn_iif = 0; + rt->fld.flowidn_mark = oldflp->flowidn_mark; + + rt->rt_saddr = fld.saddr; + rt->rt_daddr = fld.daddr; + rt->rt_gateway = gateway ? 
gateway : fld.daddr; + rt->rt_local_src = fld.saddr; + + rt->rt_dst_map = fld.daddr; + rt->rt_src_map = fld.saddr; + + dst_set_neighbour(&rt->dst, neigh); + neigh = NULL; + + rt->dst.lastuse = jiffies; + rt->dst.output = dn_output; + rt->dst.input = dn_rt_bug; + rt->rt_flags = flags; + if (flags & RTCF_LOCAL) + rt->dst.input = dn_nsp_rx; + + err = dn_rt_set_next_hop(rt, &res); + if (err) + goto e_neighbour; + + hash = dn_hash(rt->fld.saddr, rt->fld.daddr); + dn_insert_route(rt, hash, (struct dn_route **)pprt); + +done: + if (neigh) + neigh_release(neigh); + if (free_res) + dn_fib_res_put(&res); + if (dev_out) + dev_put(dev_out); +out: + return err; + +e_addr: + err = -EADDRNOTAVAIL; + goto done; +e_inval: + err = -EINVAL; + goto done; +e_nobufs: + err = -ENOBUFS; + goto done; +e_neighbour: + dst_free(&rt->dst); + goto e_nobufs; +} + + +/* + * N.B. The flags may be moved into the flowi at some future stage. + */ +static int __dn_route_output_key(struct dst_entry **pprt, const struct flowidn *flp, int flags) +{ + unsigned hash = dn_hash(flp->saddr, flp->daddr); + struct dn_route *rt = NULL; + + if (!(flags & MSG_TRYHARD)) { + rcu_read_lock_bh(); + for (rt = rcu_dereference_bh(dn_rt_hash_table[hash].chain); rt; + rt = rcu_dereference_bh(rt->dst.dn_next)) { + if ((flp->daddr == rt->fld.daddr) && + (flp->saddr == rt->fld.saddr) && + (flp->flowidn_mark == rt->fld.flowidn_mark) && + dn_is_output_route(rt) && + (rt->fld.flowidn_oif == flp->flowidn_oif)) { + dst_use(&rt->dst, jiffies); + rcu_read_unlock_bh(); + *pprt = &rt->dst; + return 0; + } + } + rcu_read_unlock_bh(); + } + + return dn_route_output_slow(pprt, flp, flags); +} + +static int dn_route_output_key(struct dst_entry **pprt, struct flowidn *flp, int flags) +{ + int err; + + err = __dn_route_output_key(pprt, flp, flags); + if (err == 0 && flp->flowidn_proto) { + *pprt = xfrm_lookup(&init_net, *pprt, + flowidn_to_flowi(flp), NULL, 0); + if (IS_ERR(*pprt)) { + err = PTR_ERR(*pprt); + *pprt = NULL; + } + } + return err; +} + +int dn_route_output_sock(struct dst_entry **pprt, struct flowidn *fl, struct sock *sk, int flags) +{ + int err; + + err = __dn_route_output_key(pprt, fl, flags & MSG_TRYHARD); + if (err == 0 && fl->flowidn_proto) { + if (!(flags & MSG_DONTWAIT)) + fl->flowidn_flags |= FLOWI_FLAG_CAN_SLEEP; + *pprt = xfrm_lookup(&init_net, *pprt, + flowidn_to_flowi(fl), sk, 0); + if (IS_ERR(*pprt)) { + err = PTR_ERR(*pprt); + *pprt = NULL; + } + } + return err; +} + +static int dn_route_input_slow(struct sk_buff *skb) +{ + struct dn_route *rt = NULL; + struct dn_skb_cb *cb = DN_SKB_CB(skb); + struct net_device *in_dev = skb->dev; + struct net_device *out_dev = NULL; + struct dn_dev *dn_db; + struct neighbour *neigh = NULL; + unsigned hash; + int flags = 0; + __le16 gateway = 0; + __le16 local_src = 0; + struct flowidn fld = { + .daddr = cb->dst, + .saddr = cb->src, + .flowidn_scope = RT_SCOPE_UNIVERSE, + .flowidn_mark = skb->mark, + .flowidn_iif = skb->dev->ifindex, + }; + struct dn_fib_res res = { .fi = NULL, .type = RTN_UNREACHABLE }; + int err = -EINVAL; + int free_res = 0; + + dev_hold(in_dev); + + if ((dn_db = rcu_dereference(in_dev->dn_ptr)) == NULL) + goto out; + + /* Zero source addresses are not allowed */ + if (fld.saddr == 0) + goto out; + + /* + * In this case we've just received a packet from a source + * outside ourselves pretending to come from us. We don't + * allow it any further to prevent routing loops, spoofing and + * other nasties. 
Loopback packets already have the dst attached + * so this only affects packets which have originated elsewhere. + */ + err = -ENOTUNIQ; + if (dn_dev_islocal(in_dev, cb->src)) + goto out; + + err = dn_fib_lookup(&fld, &res); + if (err) { + if (err != -ESRCH) + goto out; + /* + * Is the destination us ? + */ + if (!dn_dev_islocal(in_dev, cb->dst)) + goto e_inval; + + res.type = RTN_LOCAL; + } else { + __le16 src_map = fld.saddr; + free_res = 1; + + out_dev = DN_FIB_RES_DEV(res); + if (out_dev == NULL) { + if (net_ratelimit()) + printk(KERN_CRIT "Bug in dn_route_input_slow() " + "No output device\n"); + goto e_inval; + } + dev_hold(out_dev); + + if (res.r) + src_map = fld.saddr; /* no NAT support for now */ + + gateway = DN_FIB_RES_GW(res); + if (res.type == RTN_NAT) { + fld.daddr = dn_fib_rules_map_destination(fld.daddr, &res); + dn_fib_res_put(&res); + free_res = 0; + if (dn_fib_lookup(&fld, &res)) + goto e_inval; + free_res = 1; + if (res.type != RTN_UNICAST) + goto e_inval; + flags |= RTCF_DNAT; + gateway = fld.daddr; + } + fld.saddr = src_map; + } + + switch(res.type) { + case RTN_UNICAST: + /* + * Forwarding check here, we only check for forwarding + * being turned off, if you want to only forward intra + * area, its up to you to set the routing tables up + * correctly. + */ + if (dn_db->parms.forwarding == 0) + goto e_inval; + + if (res.fi->fib_nhs > 1 && fld.flowidn_oif == 0) + dn_fib_select_multipath(&fld, &res); + + /* + * Check for out_dev == in_dev. We use the RTCF_DOREDIRECT + * flag as a hint to set the intra-ethernet bit when + * forwarding. If we've got NAT in operation, we don't do + * this optimisation. + */ + if (out_dev == in_dev && !(flags & RTCF_NAT)) + flags |= RTCF_DOREDIRECT; + + local_src = DN_FIB_RES_PREFSRC(res); + + case RTN_BLACKHOLE: + case RTN_UNREACHABLE: + break; + case RTN_LOCAL: + flags |= RTCF_LOCAL; + fld.saddr = cb->dst; + fld.daddr = cb->src; + + /* Routing tables gave us a gateway */ + if (gateway) + goto make_route; + + /* Packet was intra-ethernet, so we know its on-link */ + if (cb->rt_flags & DN_RT_F_IE) { + gateway = cb->src; + flags |= RTCF_DIRECTSRC; + goto make_route; + } + + /* Use the default router if there is one */ + neigh = neigh_clone(dn_db->router); + if (neigh) { + gateway = ((struct dn_neigh *)neigh)->addr; + goto make_route; + } + + /* Close eyes and pray */ + gateway = cb->src; + flags |= RTCF_DIRECTSRC; + goto make_route; + default: + goto e_inval; + } + +make_route: + rt = dst_alloc(&dn_dst_ops, out_dev, 0, 0, DST_HOST); + if (rt == NULL) + goto e_nobufs; + + memset(&rt->fld, 0, sizeof(rt->fld)); + rt->rt_saddr = fld.saddr; + rt->rt_daddr = fld.daddr; + rt->rt_gateway = fld.daddr; + if (gateway) + rt->rt_gateway = gateway; + rt->rt_local_src = local_src ? 
local_src : rt->rt_saddr; + + rt->rt_dst_map = fld.daddr; + rt->rt_src_map = fld.saddr; + + rt->fld.saddr = cb->src; + rt->fld.daddr = cb->dst; + rt->fld.flowidn_oif = 0; + rt->fld.flowidn_iif = in_dev->ifindex; + rt->fld.flowidn_mark = fld.flowidn_mark; + + dst_set_neighbour(&rt->dst, neigh); + rt->dst.lastuse = jiffies; + rt->dst.output = dn_rt_bug; + switch(res.type) { + case RTN_UNICAST: + rt->dst.input = dn_forward; + break; + case RTN_LOCAL: + rt->dst.output = dn_output; + rt->dst.input = dn_nsp_rx; + rt->dst.dev = in_dev; + flags |= RTCF_LOCAL; + break; + default: + case RTN_UNREACHABLE: + case RTN_BLACKHOLE: + rt->dst.input = dst_discard; + } + rt->rt_flags = flags; + + err = dn_rt_set_next_hop(rt, &res); + if (err) + goto e_neighbour; + + hash = dn_hash(rt->fld.saddr, rt->fld.daddr); + dn_insert_route(rt, hash, &rt); + skb_dst_set(skb, &rt->dst); + +done: + if (neigh) + neigh_release(neigh); + if (free_res) + dn_fib_res_put(&res); + dev_put(in_dev); + if (out_dev) + dev_put(out_dev); +out: + return err; + +e_inval: + err = -EINVAL; + goto done; + +e_nobufs: + err = -ENOBUFS; + goto done; + +e_neighbour: + dst_free(&rt->dst); + goto done; +} + +static int dn_route_input(struct sk_buff *skb) +{ + struct dn_route *rt; + struct dn_skb_cb *cb = DN_SKB_CB(skb); + unsigned hash = dn_hash(cb->src, cb->dst); + + if (skb_dst(skb)) + return 0; + + rcu_read_lock(); + for(rt = rcu_dereference(dn_rt_hash_table[hash].chain); rt != NULL; + rt = rcu_dereference(rt->dst.dn_next)) { + if ((rt->fld.saddr == cb->src) && + (rt->fld.daddr == cb->dst) && + (rt->fld.flowidn_oif == 0) && + (rt->fld.flowidn_mark == skb->mark) && + (rt->fld.flowidn_iif == cb->iif)) { + dst_use(&rt->dst, jiffies); + rcu_read_unlock(); + skb_dst_set(skb, (struct dst_entry *)rt); + return 0; + } + } + rcu_read_unlock(); + + return dn_route_input_slow(skb); +} + +static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, + int event, int nowait, unsigned int flags) +{ + struct dn_route *rt = (struct dn_route *)skb_dst(skb); + struct rtmsg *r; + struct nlmsghdr *nlh; + unsigned char *b = skb_tail_pointer(skb); + long expires; + + nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags); + r = NLMSG_DATA(nlh); + r->rtm_family = AF_DECnet; + r->rtm_dst_len = 16; + r->rtm_src_len = 0; + r->rtm_tos = 0; + r->rtm_table = RT_TABLE_MAIN; + RTA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); + r->rtm_type = rt->rt_type; + r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; + r->rtm_scope = RT_SCOPE_UNIVERSE; + r->rtm_protocol = RTPROT_UNSPEC; + if (rt->rt_flags & RTCF_NOTIFY) + r->rtm_flags |= RTM_F_NOTIFY; + RTA_PUT(skb, RTA_DST, 2, &rt->rt_daddr); + if (rt->fld.saddr) { + r->rtm_src_len = 16; + RTA_PUT(skb, RTA_SRC, 2, &rt->fld.saddr); + } + if (rt->dst.dev) + RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->dst.dev->ifindex); + /* + * Note to self - change this if input routes reverse direction when + * they deal only with inputs and not with replies like they do + * currently. + */ + RTA_PUT(skb, RTA_PREFSRC, 2, &rt->rt_local_src); + if (rt->rt_daddr != rt->rt_gateway) + RTA_PUT(skb, RTA_GATEWAY, 2, &rt->rt_gateway); + if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) + goto rtattr_failure; + expires = rt->dst.expires ? 
rt->dst.expires - jiffies : 0; + if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0, expires, + rt->dst.error) < 0) + goto rtattr_failure; + if (dn_is_input_route(rt)) + RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fld.flowidn_iif); + + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + nlmsg_trim(skb, b); + return -1; +} + +/* + * This is called by both endnodes and routers now. + */ +static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(in_skb->sk); + struct rtattr **rta = arg; + struct rtmsg *rtm = NLMSG_DATA(nlh); + struct dn_route *rt = NULL; + struct dn_skb_cb *cb; + int err; + struct sk_buff *skb; + struct flowidn fld; + + if (!net_eq(net, &init_net)) + return -EINVAL; + + memset(&fld, 0, sizeof(fld)); + fld.flowidn_proto = DNPROTO_NSP; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + return -ENOBUFS; + skb_reset_mac_header(skb); + cb = DN_SKB_CB(skb); + + if (rta[RTA_SRC-1]) + memcpy(&fld.saddr, RTA_DATA(rta[RTA_SRC-1]), 2); + if (rta[RTA_DST-1]) + memcpy(&fld.daddr, RTA_DATA(rta[RTA_DST-1]), 2); + if (rta[RTA_IIF-1]) + memcpy(&fld.flowidn_iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int)); + + if (fld.flowidn_iif) { + struct net_device *dev; + if ((dev = dev_get_by_index(&init_net, fld.flowidn_iif)) == NULL) { + kfree_skb(skb); + return -ENODEV; + } + if (!dev->dn_ptr) { + dev_put(dev); + kfree_skb(skb); + return -ENODEV; + } + skb->protocol = htons(ETH_P_DNA_RT); + skb->dev = dev; + cb->src = fld.saddr; + cb->dst = fld.daddr; + local_bh_disable(); + err = dn_route_input(skb); + local_bh_enable(); + memset(cb, 0, sizeof(struct dn_skb_cb)); + rt = (struct dn_route *)skb_dst(skb); + if (!err && -rt->dst.error) + err = rt->dst.error; + } else { + int oif = 0; + if (rta[RTA_OIF - 1]) + memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int)); + fld.flowidn_oif = oif; + err = dn_route_output_key((struct dst_entry **)&rt, &fld, 0); + } + + if (skb->dev) + dev_put(skb->dev); + skb->dev = NULL; + if (err) + goto out_free; + skb_dst_set(skb, &rt->dst); + if (rtm->rtm_flags & RTM_F_NOTIFY) + rt->rt_flags |= RTCF_NOTIFY; + + err = dn_rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0); + + if (err == 0) + goto out_free; + if (err < 0) { + err = -EMSGSIZE; + goto out_free; + } + + return rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid); + +out_free: + kfree_skb(skb); + return err; +} + +/* + * For routers, this is called from dn_fib_dump, but for endnodes its + * called directly from the rtnetlink dispatch table. 
+ */ +int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct dn_route *rt; + int h, s_h; + int idx, s_idx; + + if (!net_eq(net, &init_net)) + return 0; + + if (NLMSG_PAYLOAD(cb->nlh, 0) < sizeof(struct rtmsg)) + return -EINVAL; + if (!(((struct rtmsg *)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED)) + return 0; + + s_h = cb->args[0]; + s_idx = idx = cb->args[1]; + for(h = 0; h <= dn_rt_hash_mask; h++) { + if (h < s_h) + continue; + if (h > s_h) + s_idx = 0; + rcu_read_lock_bh(); + for(rt = rcu_dereference_bh(dn_rt_hash_table[h].chain), idx = 0; + rt; + rt = rcu_dereference_bh(rt->dst.dn_next), idx++) { + if (idx < s_idx) + continue; + skb_dst_set(skb, dst_clone(&rt->dst)); + if (dn_rt_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWROUTE, + 1, NLM_F_MULTI) <= 0) { + skb_dst_drop(skb); + rcu_read_unlock_bh(); + goto done; + } + skb_dst_drop(skb); + } + rcu_read_unlock_bh(); + } + +done: + cb->args[0] = h; + cb->args[1] = idx; + return skb->len; +} + +#ifdef CONFIG_PROC_FS +struct dn_rt_cache_iter_state { + int bucket; +}; + +static struct dn_route *dn_rt_cache_get_first(struct seq_file *seq) +{ + struct dn_route *rt = NULL; + struct dn_rt_cache_iter_state *s = seq->private; + + for(s->bucket = dn_rt_hash_mask; s->bucket >= 0; --s->bucket) { + rcu_read_lock_bh(); + rt = rcu_dereference_bh(dn_rt_hash_table[s->bucket].chain); + if (rt) + break; + rcu_read_unlock_bh(); + } + return rt; +} + +static struct dn_route *dn_rt_cache_get_next(struct seq_file *seq, struct dn_route *rt) +{ + struct dn_rt_cache_iter_state *s = seq->private; + + rt = rcu_dereference_bh(rt->dst.dn_next); + while (!rt) { + rcu_read_unlock_bh(); + if (--s->bucket < 0) + break; + rcu_read_lock_bh(); + rt = rcu_dereference_bh(dn_rt_hash_table[s->bucket].chain); + } + return rt; +} + +static void *dn_rt_cache_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct dn_route *rt = dn_rt_cache_get_first(seq); + + if (rt) { + while(*pos && (rt = dn_rt_cache_get_next(seq, rt))) + --*pos; + } + return *pos ? NULL : rt; +} + +static void *dn_rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct dn_route *rt = dn_rt_cache_get_next(seq, v); + ++*pos; + return rt; +} + +static void dn_rt_cache_seq_stop(struct seq_file *seq, void *v) +{ + if (v) + rcu_read_unlock_bh(); +} + +static int dn_rt_cache_seq_show(struct seq_file *seq, void *v) +{ + struct dn_route *rt = v; + char buf1[DN_ASCBUF_LEN], buf2[DN_ASCBUF_LEN]; + + seq_printf(seq, "%-8s %-7s %-7s %04d %04d %04d\n", + rt->dst.dev ? 
rt->dst.dev->name : "*", + dn_addr2asc(le16_to_cpu(rt->rt_daddr), buf1), + dn_addr2asc(le16_to_cpu(rt->rt_saddr), buf2), + atomic_read(&rt->dst.__refcnt), + rt->dst.__use, + (int) dst_metric(&rt->dst, RTAX_RTT)); + return 0; +} + +static const struct seq_operations dn_rt_cache_seq_ops = { + .start = dn_rt_cache_seq_start, + .next = dn_rt_cache_seq_next, + .stop = dn_rt_cache_seq_stop, + .show = dn_rt_cache_seq_show, +}; + +static int dn_rt_cache_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_private(file, &dn_rt_cache_seq_ops, + sizeof(struct dn_rt_cache_iter_state)); +} + +static const struct file_operations dn_rt_cache_seq_fops = { + .owner = THIS_MODULE, + .open = dn_rt_cache_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif /* CONFIG_PROC_FS */ + +void __init dn_route_init(void) +{ + int i, goal, order; + + dn_dst_ops.kmem_cachep = + kmem_cache_create("dn_dst_cache", sizeof(struct dn_route), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + dst_entries_init(&dn_dst_ops); + setup_timer(&dn_route_timer, dn_dst_check_expire, 0); + dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ; + add_timer(&dn_route_timer); + + goal = totalram_pages >> (26 - PAGE_SHIFT); + + for(order = 0; (1UL << order) < goal; order++) + /* NOTHING */; + + /* + * Only want 1024 entries max, since the table is very, very unlikely + * to be larger than that. + */ + while(order && ((((1UL << order) * PAGE_SIZE) / + sizeof(struct dn_rt_hash_bucket)) >= 2048)) + order--; + + do { + dn_rt_hash_mask = (1UL << order) * PAGE_SIZE / + sizeof(struct dn_rt_hash_bucket); + while(dn_rt_hash_mask & (dn_rt_hash_mask - 1)) + dn_rt_hash_mask--; + dn_rt_hash_table = (struct dn_rt_hash_bucket *) + __get_free_pages(GFP_ATOMIC, order); + } while (dn_rt_hash_table == NULL && --order > 0); + + if (!dn_rt_hash_table) + panic("Failed to allocate DECnet route cache hash table\n"); + + printk(KERN_INFO + "DECnet: Routing cache hash table of %u buckets, %ldKbytes\n", + dn_rt_hash_mask, + (long)(dn_rt_hash_mask*sizeof(struct dn_rt_hash_bucket))/1024); + + dn_rt_hash_mask--; + for(i = 0; i <= dn_rt_hash_mask; i++) { + spin_lock_init(&dn_rt_hash_table[i].lock); + dn_rt_hash_table[i].chain = NULL; + } + + dn_dst_ops.gc_thresh = (dn_rt_hash_mask + 1); + + proc_net_fops_create(&init_net, "decnet_cache", S_IRUGO, &dn_rt_cache_seq_fops); + +#ifdef CONFIG_DECNET_ROUTER + rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute, dn_fib_dump); +#else + rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute, + dn_cache_dump); +#endif +} + +void __exit dn_route_cleanup(void) +{ + del_timer(&dn_route_timer); + dn_run_flush(0); + + proc_net_remove(&init_net, "decnet_cache"); + dst_entries_destroy(&dn_dst_ops); +} + diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c new file mode 100644 index 00000000..f0efb0cc --- /dev/null +++ b/net/decnet/dn_rules.c @@ -0,0 +1,254 @@ + +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. 
+ * + * DECnet Routing Forwarding Information Base (Rules) + * + * Author: Steve Whitehouse + * Mostly copied from Alexey Kuznetsov's ipv4/fib_rules.c + * + * + * Changes: + * Steve Whitehouse + * Updated for Thomas Graf's generic rules + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct fib_rules_ops *dn_fib_rules_ops; + +struct dn_fib_rule +{ + struct fib_rule common; + unsigned char dst_len; + unsigned char src_len; + __le16 src; + __le16 srcmask; + __le16 dst; + __le16 dstmask; + __le16 srcmap; + u8 flags; +}; + + +int dn_fib_lookup(struct flowidn *flp, struct dn_fib_res *res) +{ + struct fib_lookup_arg arg = { + .result = res, + }; + int err; + + err = fib_rules_lookup(dn_fib_rules_ops, + flowidn_to_flowi(flp), 0, &arg); + res->r = arg.rule; + + return err; +} + +static int dn_fib_rule_action(struct fib_rule *rule, struct flowi *flp, + int flags, struct fib_lookup_arg *arg) +{ + struct flowidn *fld = &flp->u.dn; + int err = -EAGAIN; + struct dn_fib_table *tbl; + + switch(rule->action) { + case FR_ACT_TO_TBL: + break; + + case FR_ACT_UNREACHABLE: + err = -ENETUNREACH; + goto errout; + + case FR_ACT_PROHIBIT: + err = -EACCES; + goto errout; + + case FR_ACT_BLACKHOLE: + default: + err = -EINVAL; + goto errout; + } + + tbl = dn_fib_get_table(rule->table, 0); + if (tbl == NULL) + goto errout; + + err = tbl->lookup(tbl, fld, (struct dn_fib_res *)arg->result); + if (err > 0) + err = -EAGAIN; +errout: + return err; +} + +static const struct nla_policy dn_fib_rule_policy[FRA_MAX+1] = { + FRA_GENERIC_POLICY, +}; + +static int dn_fib_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) +{ + struct dn_fib_rule *r = (struct dn_fib_rule *)rule; + struct flowidn *fld = &fl->u.dn; + __le16 daddr = fld->daddr; + __le16 saddr = fld->saddr; + + if (((saddr ^ r->src) & r->srcmask) || + ((daddr ^ r->dst) & r->dstmask)) + return 0; + + return 1; +} + +static int dn_fib_rule_configure(struct fib_rule *rule, struct sk_buff *skb, + struct fib_rule_hdr *frh, + struct nlattr **tb) +{ + int err = -EINVAL; + struct dn_fib_rule *r = (struct dn_fib_rule *)rule; + + if (frh->tos) + goto errout; + + if (rule->table == RT_TABLE_UNSPEC) { + if (rule->action == FR_ACT_TO_TBL) { + struct dn_fib_table *table; + + table = dn_fib_empty_table(); + if (table == NULL) { + err = -ENOBUFS; + goto errout; + } + + rule->table = table->n; + } + } + + if (frh->src_len) + r->src = nla_get_le16(tb[FRA_SRC]); + + if (frh->dst_len) + r->dst = nla_get_le16(tb[FRA_DST]); + + r->src_len = frh->src_len; + r->srcmask = dnet_make_mask(r->src_len); + r->dst_len = frh->dst_len; + r->dstmask = dnet_make_mask(r->dst_len); + err = 0; +errout: + return err; +} + +static int dn_fib_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, + struct nlattr **tb) +{ + struct dn_fib_rule *r = (struct dn_fib_rule *)rule; + + if (frh->src_len && (r->src_len != frh->src_len)) + return 0; + + if (frh->dst_len && (r->dst_len != frh->dst_len)) + return 0; + + if (frh->src_len && (r->src != nla_get_le16(tb[FRA_SRC]))) + return 0; + + if (frh->dst_len && (r->dst != nla_get_le16(tb[FRA_DST]))) + return 0; + + return 1; +} + +unsigned dnet_addr_type(__le16 addr) +{ + struct flowidn fld = { .daddr = addr }; + struct dn_fib_res res; + unsigned ret = RTN_UNICAST; + struct dn_fib_table *tb = dn_fib_get_table(RT_TABLE_LOCAL, 0); + + res.r = NULL; + + if (tb) { + if (!tb->lookup(tb, &fld, &res)) { + ret = 
res.type; + dn_fib_res_put(&res); + } + } + return ret; +} + +static int dn_fib_rule_fill(struct fib_rule *rule, struct sk_buff *skb, + struct fib_rule_hdr *frh) +{ + struct dn_fib_rule *r = (struct dn_fib_rule *)rule; + + frh->dst_len = r->dst_len; + frh->src_len = r->src_len; + frh->tos = 0; + + if (r->dst_len) + NLA_PUT_LE16(skb, FRA_DST, r->dst); + if (r->src_len) + NLA_PUT_LE16(skb, FRA_SRC, r->src); + + return 0; + +nla_put_failure: + return -ENOBUFS; +} + +static void dn_fib_rule_flush_cache(struct fib_rules_ops *ops) +{ + dn_rt_cache_flush(-1); +} + +static const struct fib_rules_ops __net_initdata dn_fib_rules_ops_template = { + .family = AF_DECnet, + .rule_size = sizeof(struct dn_fib_rule), + .addr_size = sizeof(u16), + .action = dn_fib_rule_action, + .match = dn_fib_rule_match, + .configure = dn_fib_rule_configure, + .compare = dn_fib_rule_compare, + .fill = dn_fib_rule_fill, + .default_pref = fib_default_rule_pref, + .flush_cache = dn_fib_rule_flush_cache, + .nlgroup = RTNLGRP_DECnet_RULE, + .policy = dn_fib_rule_policy, + .owner = THIS_MODULE, + .fro_net = &init_net, +}; + +void __init dn_fib_rules_init(void) +{ + dn_fib_rules_ops = + fib_rules_register(&dn_fib_rules_ops_template, &init_net); + BUG_ON(IS_ERR(dn_fib_rules_ops)); + BUG_ON(fib_default_rule_add(dn_fib_rules_ops, 0x7fff, + RT_TABLE_MAIN, 0)); +} + +void __exit dn_fib_rules_cleanup(void) +{ + fib_rules_unregister(dn_fib_rules_ops); + rcu_barrier(); +} + + diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c new file mode 100644 index 00000000..bd0a52dd --- /dev/null +++ b/net/decnet/dn_table.c @@ -0,0 +1,908 @@ +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. 
+ * + * DECnet Routing Forwarding Information Base (Routing Tables) + * + * Author: Steve Whitehouse + * Mostly copied from the IPv4 routing code + * + * + * Changes: + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* RTF_xxx */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct dn_zone +{ + struct dn_zone *dz_next; + struct dn_fib_node **dz_hash; + int dz_nent; + int dz_divisor; + u32 dz_hashmask; +#define DZ_HASHMASK(dz) ((dz)->dz_hashmask) + int dz_order; + __le16 dz_mask; +#define DZ_MASK(dz) ((dz)->dz_mask) +}; + +struct dn_hash +{ + struct dn_zone *dh_zones[17]; + struct dn_zone *dh_zone_list; +}; + +#define dz_key_0(key) ((key).datum = 0) + +#define for_nexthops(fi) { int nhsel; const struct dn_fib_nh *nh;\ + for(nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) + +#define endfor_nexthops(fi) } + +#define DN_MAX_DIVISOR 1024 +#define DN_S_ZOMBIE 1 +#define DN_S_ACCESSED 2 + +#define DN_FIB_SCAN(f, fp) \ +for( ; ((f) = *(fp)) != NULL; (fp) = &(f)->fn_next) + +#define DN_FIB_SCAN_KEY(f, fp, key) \ +for( ; ((f) = *(fp)) != NULL && dn_key_eq((f)->fn_key, (key)); (fp) = &(f)->fn_next) + +#define RT_TABLE_MIN 1 +#define DN_FIB_TABLE_HASHSZ 256 +static struct hlist_head dn_fib_table_hash[DN_FIB_TABLE_HASHSZ]; +static DEFINE_RWLOCK(dn_fib_tables_lock); + +static struct kmem_cache *dn_hash_kmem __read_mostly; +static int dn_fib_hash_zombies; + +static inline dn_fib_idx_t dn_hash(dn_fib_key_t key, struct dn_zone *dz) +{ + u16 h = le16_to_cpu(key.datum)>>(16 - dz->dz_order); + h ^= (h >> 10); + h ^= (h >> 6); + h &= DZ_HASHMASK(dz); + return *(dn_fib_idx_t *)&h; +} + +static inline dn_fib_key_t dz_key(__le16 dst, struct dn_zone *dz) +{ + dn_fib_key_t k; + k.datum = dst & DZ_MASK(dz); + return k; +} + +static inline struct dn_fib_node **dn_chain_p(dn_fib_key_t key, struct dn_zone *dz) +{ + return &dz->dz_hash[dn_hash(key, dz).datum]; +} + +static inline struct dn_fib_node *dz_chain(dn_fib_key_t key, struct dn_zone *dz) +{ + return dz->dz_hash[dn_hash(key, dz).datum]; +} + +static inline int dn_key_eq(dn_fib_key_t a, dn_fib_key_t b) +{ + return a.datum == b.datum; +} + +static inline int dn_key_leq(dn_fib_key_t a, dn_fib_key_t b) +{ + return a.datum <= b.datum; +} + +static inline void dn_rebuild_zone(struct dn_zone *dz, + struct dn_fib_node **old_ht, + int old_divisor) +{ + struct dn_fib_node *f, **fp, *next; + int i; + + for(i = 0; i < old_divisor; i++) { + for(f = old_ht[i]; f; f = next) { + next = f->fn_next; + for(fp = dn_chain_p(f->fn_key, dz); + *fp && dn_key_leq((*fp)->fn_key, f->fn_key); + fp = &(*fp)->fn_next) + /* NOTHING */; + f->fn_next = *fp; + *fp = f; + } + } +} + +static void dn_rehash_zone(struct dn_zone *dz) +{ + struct dn_fib_node **ht, **old_ht; + int old_divisor, new_divisor; + u32 new_hashmask; + + old_divisor = dz->dz_divisor; + + switch(old_divisor) { + case 16: + new_divisor = 256; + new_hashmask = 0xFF; + break; + default: + printk(KERN_DEBUG "DECnet: dn_rehash_zone: BUG! 
%d\n", old_divisor); + case 256: + new_divisor = 1024; + new_hashmask = 0x3FF; + break; + } + + ht = kcalloc(new_divisor, sizeof(struct dn_fib_node*), GFP_KERNEL); + if (ht == NULL) + return; + + write_lock_bh(&dn_fib_tables_lock); + old_ht = dz->dz_hash; + dz->dz_hash = ht; + dz->dz_hashmask = new_hashmask; + dz->dz_divisor = new_divisor; + dn_rebuild_zone(dz, old_ht, old_divisor); + write_unlock_bh(&dn_fib_tables_lock); + kfree(old_ht); +} + +static void dn_free_node(struct dn_fib_node *f) +{ + dn_fib_release_info(DN_FIB_INFO(f)); + kmem_cache_free(dn_hash_kmem, f); +} + + +static struct dn_zone *dn_new_zone(struct dn_hash *table, int z) +{ + int i; + struct dn_zone *dz = kzalloc(sizeof(struct dn_zone), GFP_KERNEL); + if (!dz) + return NULL; + + if (z) { + dz->dz_divisor = 16; + dz->dz_hashmask = 0x0F; + } else { + dz->dz_divisor = 1; + dz->dz_hashmask = 0; + } + + dz->dz_hash = kcalloc(dz->dz_divisor, sizeof(struct dn_fib_node *), GFP_KERNEL); + if (!dz->dz_hash) { + kfree(dz); + return NULL; + } + + dz->dz_order = z; + dz->dz_mask = dnet_make_mask(z); + + for(i = z + 1; i <= 16; i++) + if (table->dh_zones[i]) + break; + + write_lock_bh(&dn_fib_tables_lock); + if (i>16) { + dz->dz_next = table->dh_zone_list; + table->dh_zone_list = dz; + } else { + dz->dz_next = table->dh_zones[i]->dz_next; + table->dh_zones[i]->dz_next = dz; + } + table->dh_zones[z] = dz; + write_unlock_bh(&dn_fib_tables_lock); + return dz; +} + + +static int dn_fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct dn_kern_rta *rta, struct dn_fib_info *fi) +{ + struct rtnexthop *nhp; + int nhlen; + + if (rta->rta_priority && *rta->rta_priority != fi->fib_priority) + return 1; + + if (rta->rta_oif || rta->rta_gw) { + if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) && + (!rta->rta_gw || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 2) == 0)) + return 0; + return 1; + } + + if (rta->rta_mp == NULL) + return 0; + + nhp = RTA_DATA(rta->rta_mp); + nhlen = RTA_PAYLOAD(rta->rta_mp); + + for_nexthops(fi) { + int attrlen = nhlen - sizeof(struct rtnexthop); + __le16 gw; + + if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0) + return -EINVAL; + if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif) + return 1; + if (attrlen) { + gw = dn_fib_get_attr16(RTNH_DATA(nhp), attrlen, RTA_GATEWAY); + + if (gw && gw != nh->nh_gw) + return 1; + } + nhp = RTNH_NEXT(nhp); + } endfor_nexthops(fi); + + return 0; +} + +static inline size_t dn_fib_nlmsg_size(struct dn_fib_info *fi) +{ + size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg)) + + nla_total_size(4) /* RTA_TABLE */ + + nla_total_size(2) /* RTA_DST */ + + nla_total_size(4); /* RTA_PRIORITY */ + + /* space for nested metrics */ + payload += nla_total_size((RTAX_MAX * nla_total_size(4))); + + if (fi->fib_nhs) { + /* Also handles the special case fib_nhs == 1 */ + + /* each nexthop is packed in an attribute */ + size_t nhsize = nla_total_size(sizeof(struct rtnexthop)); + + /* may contain a gateway attribute */ + nhsize += nla_total_size(4); + + /* all nexthops are packed in a nested attribute */ + payload += nla_total_size(fi->fib_nhs * nhsize); + } + + return payload; +} + +static int dn_fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, + u32 tb_id, u8 type, u8 scope, void *dst, int dst_len, + struct dn_fib_info *fi, unsigned int flags) +{ + struct rtmsg *rtm; + struct nlmsghdr *nlh; + unsigned char *b = skb_tail_pointer(skb); + + nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*rtm), flags); + rtm = NLMSG_DATA(nlh); + rtm->rtm_family = AF_DECnet; + rtm->rtm_dst_len = 
dst_len; + rtm->rtm_src_len = 0; + rtm->rtm_tos = 0; + rtm->rtm_table = tb_id; + RTA_PUT_U32(skb, RTA_TABLE, tb_id); + rtm->rtm_flags = fi->fib_flags; + rtm->rtm_scope = scope; + rtm->rtm_type = type; + if (rtm->rtm_dst_len) + RTA_PUT(skb, RTA_DST, 2, dst); + rtm->rtm_protocol = fi->fib_protocol; + if (fi->fib_priority) + RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority); + if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0) + goto rtattr_failure; + if (fi->fib_nhs == 1) { + if (fi->fib_nh->nh_gw) + RTA_PUT(skb, RTA_GATEWAY, 2, &fi->fib_nh->nh_gw); + if (fi->fib_nh->nh_oif) + RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif); + } + if (fi->fib_nhs > 1) { + struct rtnexthop *nhp; + struct rtattr *mp_head; + if (skb_tailroom(skb) <= RTA_SPACE(0)) + goto rtattr_failure; + mp_head = (struct rtattr *)skb_put(skb, RTA_SPACE(0)); + + for_nexthops(fi) { + if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) + goto rtattr_failure; + nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); + nhp->rtnh_flags = nh->nh_flags & 0xFF; + nhp->rtnh_hops = nh->nh_weight - 1; + nhp->rtnh_ifindex = nh->nh_oif; + if (nh->nh_gw) + RTA_PUT(skb, RTA_GATEWAY, 2, &nh->nh_gw); + nhp->rtnh_len = skb_tail_pointer(skb) - (unsigned char *)nhp; + } endfor_nexthops(fi); + mp_head->rta_type = RTA_MULTIPATH; + mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head; + } + + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; + + +nlmsg_failure: +rtattr_failure: + nlmsg_trim(skb, b); + return -EMSGSIZE; +} + + +static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, u32 tb_id, + struct nlmsghdr *nlh, struct netlink_skb_parms *req) +{ + struct sk_buff *skb; + u32 pid = req ? req->pid : 0; + int err = -ENOBUFS; + + skb = nlmsg_new(dn_fib_nlmsg_size(DN_FIB_INFO(f)), GFP_KERNEL); + if (skb == NULL) + goto errout; + + err = dn_fib_dump_info(skb, pid, nlh->nlmsg_seq, event, tb_id, + f->fn_type, f->fn_scope, &f->fn_key, z, + DN_FIB_INFO(f), 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in dn_fib_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, &init_net, pid, RTNLGRP_DECnet_ROUTE, nlh, GFP_KERNEL); + return; +errout: + if (err < 0) + rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_ROUTE, err); +} + +static __inline__ int dn_hash_dump_bucket(struct sk_buff *skb, + struct netlink_callback *cb, + struct dn_fib_table *tb, + struct dn_zone *dz, + struct dn_fib_node *f) +{ + int i, s_i; + + s_i = cb->args[4]; + for(i = 0; f; i++, f = f->fn_next) { + if (i < s_i) + continue; + if (f->fn_state & DN_S_ZOMBIE) + continue; + if (dn_fib_dump_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWROUTE, + tb->n, + (f->fn_state & DN_S_ZOMBIE) ? 
0 : f->fn_type, + f->fn_scope, &f->fn_key, dz->dz_order, + f->fn_info, NLM_F_MULTI) < 0) { + cb->args[4] = i; + return -1; + } + } + cb->args[4] = i; + return skb->len; +} + +static __inline__ int dn_hash_dump_zone(struct sk_buff *skb, + struct netlink_callback *cb, + struct dn_fib_table *tb, + struct dn_zone *dz) +{ + int h, s_h; + + s_h = cb->args[3]; + for(h = 0; h < dz->dz_divisor; h++) { + if (h < s_h) + continue; + if (h > s_h) + memset(&cb->args[4], 0, sizeof(cb->args) - 4*sizeof(cb->args[0])); + if (dz->dz_hash == NULL || dz->dz_hash[h] == NULL) + continue; + if (dn_hash_dump_bucket(skb, cb, tb, dz, dz->dz_hash[h]) < 0) { + cb->args[3] = h; + return -1; + } + } + cb->args[3] = h; + return skb->len; +} + +static int dn_fib_table_dump(struct dn_fib_table *tb, struct sk_buff *skb, + struct netlink_callback *cb) +{ + int m, s_m; + struct dn_zone *dz; + struct dn_hash *table = (struct dn_hash *)tb->data; + + s_m = cb->args[2]; + read_lock(&dn_fib_tables_lock); + for(dz = table->dh_zone_list, m = 0; dz; dz = dz->dz_next, m++) { + if (m < s_m) + continue; + if (m > s_m) + memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(cb->args[0])); + + if (dn_hash_dump_zone(skb, cb, tb, dz) < 0) { + cb->args[2] = m; + read_unlock(&dn_fib_tables_lock); + return -1; + } + } + read_unlock(&dn_fib_tables_lock); + cb->args[2] = m; + + return skb->len; +} + +int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + unsigned int h, s_h; + unsigned int e = 0, s_e; + struct dn_fib_table *tb; + struct hlist_node *node; + int dumped = 0; + + if (!net_eq(net, &init_net)) + return 0; + + if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) && + ((struct rtmsg *)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED) + return dn_cache_dump(skb, cb); + + s_h = cb->args[0]; + s_e = cb->args[1]; + + for (h = s_h; h < DN_FIB_TABLE_HASHSZ; h++, s_h = 0) { + e = 0; + hlist_for_each_entry(tb, node, &dn_fib_table_hash[h], hlist) { + if (e < s_e) + goto next; + if (dumped) + memset(&cb->args[2], 0, sizeof(cb->args) - + 2 * sizeof(cb->args[0])); + if (tb->dump(tb, skb, cb) < 0) + goto out; + dumped = 1; +next: + e++; + } + } +out: + cb->args[1] = e; + cb->args[0] = h; + + return skb->len; +} + +static int dn_fib_table_insert(struct dn_fib_table *tb, struct rtmsg *r, struct dn_kern_rta *rta, struct nlmsghdr *n, struct netlink_skb_parms *req) +{ + struct dn_hash *table = (struct dn_hash *)tb->data; + struct dn_fib_node *new_f, *f, **fp, **del_fp; + struct dn_zone *dz; + struct dn_fib_info *fi; + int z = r->rtm_dst_len; + int type = r->rtm_type; + dn_fib_key_t key; + int err; + + if (z > 16) + return -EINVAL; + + dz = table->dh_zones[z]; + if (!dz && !(dz = dn_new_zone(table, z))) + return -ENOBUFS; + + dz_key_0(key); + if (rta->rta_dst) { + __le16 dst; + memcpy(&dst, rta->rta_dst, 2); + if (dst & ~DZ_MASK(dz)) + return -EINVAL; + key = dz_key(dst, dz); + } + + if ((fi = dn_fib_create_info(r, rta, n, &err)) == NULL) + return err; + + if (dz->dz_nent > (dz->dz_divisor << 2) && + dz->dz_divisor > DN_MAX_DIVISOR && + (z==16 || (1< dz->dz_divisor)) + dn_rehash_zone(dz); + + fp = dn_chain_p(key, dz); + + DN_FIB_SCAN(f, fp) { + if (dn_key_leq(key, f->fn_key)) + break; + } + + del_fp = NULL; + + if (f && (f->fn_state & DN_S_ZOMBIE) && + dn_key_eq(f->fn_key, key)) { + del_fp = fp; + fp = &f->fn_next; + f = *fp; + goto create; + } + + DN_FIB_SCAN_KEY(f, fp, key) { + if (fi->fib_priority <= DN_FIB_INFO(f)->fib_priority) + break; + } + + if (f && dn_key_eq(f->fn_key, key) && + fi->fib_priority 
== DN_FIB_INFO(f)->fib_priority) { + struct dn_fib_node **ins_fp; + + err = -EEXIST; + if (n->nlmsg_flags & NLM_F_EXCL) + goto out; + + if (n->nlmsg_flags & NLM_F_REPLACE) { + del_fp = fp; + fp = &f->fn_next; + f = *fp; + goto replace; + } + + ins_fp = fp; + err = -EEXIST; + + DN_FIB_SCAN_KEY(f, fp, key) { + if (fi->fib_priority != DN_FIB_INFO(f)->fib_priority) + break; + if (f->fn_type == type && + f->fn_scope == r->rtm_scope && + DN_FIB_INFO(f) == fi) + goto out; + } + + if (!(n->nlmsg_flags & NLM_F_APPEND)) { + fp = ins_fp; + f = *fp; + } + } + +create: + err = -ENOENT; + if (!(n->nlmsg_flags & NLM_F_CREATE)) + goto out; + +replace: + err = -ENOBUFS; + new_f = kmem_cache_zalloc(dn_hash_kmem, GFP_KERNEL); + if (new_f == NULL) + goto out; + + new_f->fn_key = key; + new_f->fn_type = type; + new_f->fn_scope = r->rtm_scope; + DN_FIB_INFO(new_f) = fi; + + new_f->fn_next = f; + write_lock_bh(&dn_fib_tables_lock); + *fp = new_f; + write_unlock_bh(&dn_fib_tables_lock); + dz->dz_nent++; + + if (del_fp) { + f = *del_fp; + write_lock_bh(&dn_fib_tables_lock); + *del_fp = f->fn_next; + write_unlock_bh(&dn_fib_tables_lock); + + if (!(f->fn_state & DN_S_ZOMBIE)) + dn_rtmsg_fib(RTM_DELROUTE, f, z, tb->n, n, req); + if (f->fn_state & DN_S_ACCESSED) + dn_rt_cache_flush(-1); + dn_free_node(f); + dz->dz_nent--; + } else { + dn_rt_cache_flush(-1); + } + + dn_rtmsg_fib(RTM_NEWROUTE, new_f, z, tb->n, n, req); + + return 0; +out: + dn_fib_release_info(fi); + return err; +} + + +static int dn_fib_table_delete(struct dn_fib_table *tb, struct rtmsg *r, struct dn_kern_rta *rta, struct nlmsghdr *n, struct netlink_skb_parms *req) +{ + struct dn_hash *table = (struct dn_hash*)tb->data; + struct dn_fib_node **fp, **del_fp, *f; + int z = r->rtm_dst_len; + struct dn_zone *dz; + dn_fib_key_t key; + int matched; + + + if (z > 16) + return -EINVAL; + + if ((dz = table->dh_zones[z]) == NULL) + return -ESRCH; + + dz_key_0(key); + if (rta->rta_dst) { + __le16 dst; + memcpy(&dst, rta->rta_dst, 2); + if (dst & ~DZ_MASK(dz)) + return -EINVAL; + key = dz_key(dst, dz); + } + + fp = dn_chain_p(key, dz); + + DN_FIB_SCAN(f, fp) { + if (dn_key_eq(f->fn_key, key)) + break; + if (dn_key_leq(key, f->fn_key)) + return -ESRCH; + } + + matched = 0; + del_fp = NULL; + DN_FIB_SCAN_KEY(f, fp, key) { + struct dn_fib_info *fi = DN_FIB_INFO(f); + + if (f->fn_state & DN_S_ZOMBIE) + return -ESRCH; + + matched++; + + if (del_fp == NULL && + (!r->rtm_type || f->fn_type == r->rtm_type) && + (r->rtm_scope == RT_SCOPE_NOWHERE || f->fn_scope == r->rtm_scope) && + (!r->rtm_protocol || + fi->fib_protocol == r->rtm_protocol) && + dn_fib_nh_match(r, n, rta, fi) == 0) + del_fp = fp; + } + + if (del_fp) { + f = *del_fp; + dn_rtmsg_fib(RTM_DELROUTE, f, z, tb->n, n, req); + + if (matched != 1) { + write_lock_bh(&dn_fib_tables_lock); + *del_fp = f->fn_next; + write_unlock_bh(&dn_fib_tables_lock); + + if (f->fn_state & DN_S_ACCESSED) + dn_rt_cache_flush(-1); + dn_free_node(f); + dz->dz_nent--; + } else { + f->fn_state |= DN_S_ZOMBIE; + if (f->fn_state & DN_S_ACCESSED) { + f->fn_state &= ~DN_S_ACCESSED; + dn_rt_cache_flush(-1); + } + if (++dn_fib_hash_zombies > 128) + dn_fib_flush(); + } + + return 0; + } + + return -ESRCH; +} + +static inline int dn_flush_list(struct dn_fib_node **fp, int z, struct dn_hash *table) +{ + int found = 0; + struct dn_fib_node *f; + + while((f = *fp) != NULL) { + struct dn_fib_info *fi = DN_FIB_INFO(f); + + if (fi && ((f->fn_state & DN_S_ZOMBIE) || (fi->fib_flags & RTNH_F_DEAD))) { + write_lock_bh(&dn_fib_tables_lock); + *fp = 
f->fn_next; + write_unlock_bh(&dn_fib_tables_lock); + + dn_free_node(f); + found++; + continue; + } + fp = &f->fn_next; + } + + return found; +} + +static int dn_fib_table_flush(struct dn_fib_table *tb) +{ + struct dn_hash *table = (struct dn_hash *)tb->data; + struct dn_zone *dz; + int found = 0; + + dn_fib_hash_zombies = 0; + for(dz = table->dh_zone_list; dz; dz = dz->dz_next) { + int i; + int tmp = 0; + for(i = dz->dz_divisor-1; i >= 0; i--) + tmp += dn_flush_list(&dz->dz_hash[i], dz->dz_order, table); + dz->dz_nent -= tmp; + found += tmp; + } + + return found; +} + +static int dn_fib_table_lookup(struct dn_fib_table *tb, const struct flowidn *flp, struct dn_fib_res *res) +{ + int err; + struct dn_zone *dz; + struct dn_hash *t = (struct dn_hash *)tb->data; + + read_lock(&dn_fib_tables_lock); + for(dz = t->dh_zone_list; dz; dz = dz->dz_next) { + struct dn_fib_node *f; + dn_fib_key_t k = dz_key(flp->daddr, dz); + + for(f = dz_chain(k, dz); f; f = f->fn_next) { + if (!dn_key_eq(k, f->fn_key)) { + if (dn_key_leq(k, f->fn_key)) + break; + else + continue; + } + + f->fn_state |= DN_S_ACCESSED; + + if (f->fn_state&DN_S_ZOMBIE) + continue; + + if (f->fn_scope < flp->flowidn_scope) + continue; + + err = dn_fib_semantic_match(f->fn_type, DN_FIB_INFO(f), flp, res); + + if (err == 0) { + res->type = f->fn_type; + res->scope = f->fn_scope; + res->prefixlen = dz->dz_order; + goto out; + } + if (err < 0) + goto out; + } + } + err = 1; +out: + read_unlock(&dn_fib_tables_lock); + return err; +} + + +struct dn_fib_table *dn_fib_get_table(u32 n, int create) +{ + struct dn_fib_table *t; + struct hlist_node *node; + unsigned int h; + + if (n < RT_TABLE_MIN) + return NULL; + + if (n > RT_TABLE_MAX) + return NULL; + + h = n & (DN_FIB_TABLE_HASHSZ - 1); + rcu_read_lock(); + hlist_for_each_entry_rcu(t, node, &dn_fib_table_hash[h], hlist) { + if (t->n == n) { + rcu_read_unlock(); + return t; + } + } + rcu_read_unlock(); + + if (!create) + return NULL; + + if (in_interrupt() && net_ratelimit()) { + printk(KERN_DEBUG "DECnet: BUG! 
Attempt to create routing table from interrupt\n"); + return NULL; + } + + t = kzalloc(sizeof(struct dn_fib_table) + sizeof(struct dn_hash), + GFP_KERNEL); + if (t == NULL) + return NULL; + + t->n = n; + t->insert = dn_fib_table_insert; + t->delete = dn_fib_table_delete; + t->lookup = dn_fib_table_lookup; + t->flush = dn_fib_table_flush; + t->dump = dn_fib_table_dump; + hlist_add_head_rcu(&t->hlist, &dn_fib_table_hash[h]); + + return t; +} + +struct dn_fib_table *dn_fib_empty_table(void) +{ + u32 id; + + for(id = RT_TABLE_MIN; id <= RT_TABLE_MAX; id++) + if (dn_fib_get_table(id, 0) == NULL) + return dn_fib_get_table(id, 1); + return NULL; +} + +void dn_fib_flush(void) +{ + int flushed = 0; + struct dn_fib_table *tb; + struct hlist_node *node; + unsigned int h; + + for (h = 0; h < DN_FIB_TABLE_HASHSZ; h++) { + hlist_for_each_entry(tb, node, &dn_fib_table_hash[h], hlist) + flushed += tb->flush(tb); + } + + if (flushed) + dn_rt_cache_flush(-1); +} + +void __init dn_fib_table_init(void) +{ + dn_hash_kmem = kmem_cache_create("dn_fib_info_cache", + sizeof(struct dn_fib_info), + 0, SLAB_HWCACHE_ALIGN, + NULL); +} + +void __exit dn_fib_table_cleanup(void) +{ + struct dn_fib_table *t; + struct hlist_node *node, *next; + unsigned int h; + + write_lock(&dn_fib_tables_lock); + for (h = 0; h < DN_FIB_TABLE_HASHSZ; h++) { + hlist_for_each_entry_safe(t, node, next, &dn_fib_table_hash[h], + hlist) { + hlist_del(&t->hlist); + kfree(t); + } + } + write_unlock(&dn_fib_tables_lock); +} diff --git a/net/decnet/dn_timer.c b/net/decnet/dn_timer.c new file mode 100644 index 00000000..09825711 --- /dev/null +++ b/net/decnet/dn_timer.c @@ -0,0 +1,109 @@ +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * DECnet Socket Timer Functions + * + * Author: Steve Whitehouse + * + * + * Changes: + * Steve Whitehouse : Made keepalive timer part of the same + * timer idea. + * Steve Whitehouse : Added checks for sk->sock_readers + * David S. Miller : New socket locking + * Steve Whitehouse : Timer grabs socket ref. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Slow timer is for everything else (n * 500mS) + */ + +#define SLOW_INTERVAL (HZ/2) + +static void dn_slow_timer(unsigned long arg); + +void dn_start_slow_timer(struct sock *sk) +{ + sk->sk_timer.expires = jiffies + SLOW_INTERVAL; + sk->sk_timer.function = dn_slow_timer; + sk->sk_timer.data = (unsigned long)sk; + + add_timer(&sk->sk_timer); +} + +void dn_stop_slow_timer(struct sock *sk) +{ + del_timer(&sk->sk_timer); +} + +static void dn_slow_timer(unsigned long arg) +{ + struct sock *sk = (struct sock *)arg; + struct dn_scp *scp = DN_SK(sk); + + sock_hold(sk); + bh_lock_sock(sk); + + if (sock_owned_by_user(sk)) { + sk->sk_timer.expires = jiffies + HZ / 10; + add_timer(&sk->sk_timer); + goto out; + } + + /* + * The persist timer is the standard slow timer used for retransmits + * in both connection establishment and disconnection as well as + * in the RUN state. The different states are catered for by changing + * the function pointer in the socket. Setting the timer to a value + * of zero turns it off. We allow the persist_fxn to turn the + * timer off in a permant way by returning non-zero, so that + * timer based routines may remove sockets. 
This is why we have a + * sock_hold()/sock_put() around the timer to prevent the socket + * going away in the middle. + */ + if (scp->persist && scp->persist_fxn) { + if (scp->persist <= SLOW_INTERVAL) { + scp->persist = 0; + + if (scp->persist_fxn(sk)) + goto out; + } else { + scp->persist -= SLOW_INTERVAL; + } + } + + /* + * Check for keepalive timeout. After the other timer 'cos if + * the previous timer caused a retransmit, we don't need to + * do this. scp->stamp is the last time that we sent a packet. + * The keepalive function sends a link service packet to the + * other end. If it remains unacknowledged, the standard + * socket timers will eventually shut the socket down. Each + * time we do this, scp->stamp will be updated, thus + * we won't try and send another until scp->keepalive has passed + * since the last successful transmission. + */ + if (scp->keepalive && scp->keepalive_fxn && (scp->state == DN_RUN)) { + if ((jiffies - scp->stamp) >= scp->keepalive) + scp->keepalive_fxn(sk); + } + + sk->sk_timer.expires = jiffies + SLOW_INTERVAL; + + add_timer(&sk->sk_timer); +out: + bh_unlock_sock(sk); + sock_put(sk); +} diff --git a/net/decnet/netfilter/Kconfig b/net/decnet/netfilter/Kconfig new file mode 100644 index 00000000..2f81de5e --- /dev/null +++ b/net/decnet/netfilter/Kconfig @@ -0,0 +1,16 @@ +# +# DECnet netfilter configuration +# + +menu "DECnet: Netfilter Configuration" + depends on DECNET && NETFILTER && EXPERIMENTAL + depends on NETFILTER_ADVANCED + +config DECNET_NF_GRABULATOR + tristate "Routing message grabulator (for userland routing daemon)" + help + Enable this module if you want to use the userland DECnet routing + daemon. You will also need to enable routing support for DECnet + unless you just want to monitor routing messages from other nodes. + +endmenu diff --git a/net/decnet/netfilter/Makefile b/net/decnet/netfilter/Makefile new file mode 100644 index 00000000..255c1ae9 --- /dev/null +++ b/net/decnet/netfilter/Makefile @@ -0,0 +1,6 @@ +# +# Makefile for DECnet netfilter modules +# + +obj-$(CONFIG_DECNET_NF_GRABULATOR) += dn_rtmsg.o + diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c new file mode 100644 index 00000000..64a7f39e --- /dev/null +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -0,0 +1,161 @@ +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * DECnet Routing Message Grabulator + * + * (C) 2000 ChyGwyn Limited - http://www.chygwyn.com/ + * This code may be copied under the GPL v.2 or at your option + * any later version. 
+ * + * Author: Steven Whitehouse + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static struct sock *dnrmg = NULL; + + +static struct sk_buff *dnrmg_build_message(struct sk_buff *rt_skb, int *errp) +{ + struct sk_buff *skb = NULL; + size_t size; + sk_buff_data_t old_tail; + struct nlmsghdr *nlh; + unsigned char *ptr; + struct nf_dn_rtmsg *rtm; + + size = NLMSG_SPACE(rt_skb->len); + size += NLMSG_ALIGN(sizeof(struct nf_dn_rtmsg)); + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + goto nlmsg_failure; + old_tail = skb->tail; + nlh = NLMSG_PUT(skb, 0, 0, 0, size - sizeof(*nlh)); + rtm = (struct nf_dn_rtmsg *)NLMSG_DATA(nlh); + rtm->nfdn_ifindex = rt_skb->dev->ifindex; + ptr = NFDN_RTMSG(rtm); + skb_copy_from_linear_data(rt_skb, ptr, rt_skb->len); + nlh->nlmsg_len = skb->tail - old_tail; + return skb; + +nlmsg_failure: + if (skb) + kfree_skb(skb); + *errp = -ENOMEM; + if (net_ratelimit()) + printk(KERN_ERR "dn_rtmsg: error creating netlink message\n"); + return NULL; +} + +static void dnrmg_send_peer(struct sk_buff *skb) +{ + struct sk_buff *skb2; + int status = 0; + int group = 0; + unsigned char flags = *skb->data; + + switch(flags & DN_RT_CNTL_MSK) { + case DN_RT_PKT_L1RT: + group = DNRNG_NLGRP_L1; + break; + case DN_RT_PKT_L2RT: + group = DNRNG_NLGRP_L2; + break; + default: + return; + } + + skb2 = dnrmg_build_message(skb, &status); + if (skb2 == NULL) + return; + NETLINK_CB(skb2).dst_group = group; + netlink_broadcast(dnrmg, skb2, 0, group, GFP_ATOMIC); +} + + +static unsigned int dnrmg_hook(unsigned int hook, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + dnrmg_send_peer(skb); + return NF_ACCEPT; +} + + +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) + +static inline void dnrmg_receive_user_skb(struct sk_buff *skb) +{ + struct nlmsghdr *nlh = nlmsg_hdr(skb); + + if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) + return; + + if (security_netlink_recv(skb, CAP_NET_ADMIN)) + RCV_SKB_FAIL(-EPERM); + + /* Eventually we might send routing messages too */ + + RCV_SKB_FAIL(-EINVAL); +} + +static struct nf_hook_ops dnrmg_ops __read_mostly = { + .hook = dnrmg_hook, + .pf = PF_DECnet, + .hooknum = NF_DN_ROUTE, + .priority = NF_DN_PRI_DNRTMSG, +}; + +static int __init dn_rtmsg_init(void) +{ + int rv = 0; + + dnrmg = netlink_kernel_create(&init_net, + NETLINK_DNRTMSG, DNRNG_NLGRP_MAX, + dnrmg_receive_user_skb, + NULL, THIS_MODULE); + if (dnrmg == NULL) { + printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket"); + return -ENOMEM; + } + + rv = nf_register_hook(&dnrmg_ops); + if (rv) { + netlink_kernel_release(dnrmg); + } + + return rv; +} + +static void __exit dn_rtmsg_fini(void) +{ + nf_unregister_hook(&dnrmg_ops); + netlink_kernel_release(dnrmg); +} + + +MODULE_DESCRIPTION("DECnet Routing Message Grabulator"); +MODULE_AUTHOR("Steven Whitehouse "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_DNRTMSG); + +module_init(dn_rtmsg_init); +module_exit(dn_rtmsg_fini); + diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c new file mode 100644 index 00000000..28f8b5e5 --- /dev/null +++ b/net/decnet/sysctl_net_decnet.c @@ -0,0 +1,377 @@ +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. 
+ * + * DECnet sysctl support functions + * + * Author: Steve Whitehouse + * + * + * Changes: + * Steve Whitehouse - C99 changes and default device handling + * Steve Whitehouse - Memory buffer settings, like the tcp ones + * + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + + +int decnet_debug_level; +int decnet_time_wait = 30; +int decnet_dn_count = 1; +int decnet_di_count = 3; +int decnet_dr_count = 3; +int decnet_log_martians = 1; +int decnet_no_fc_max_cwnd = NSP_MIN_WINDOW; + +/* Reasonable defaults, I hope, based on tcp's defaults */ +long sysctl_decnet_mem[3] = { 768 << 3, 1024 << 3, 1536 << 3 }; +int sysctl_decnet_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; +int sysctl_decnet_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; + +#ifdef CONFIG_SYSCTL +extern int decnet_dst_gc_interval; +static int min_decnet_time_wait[] = { 5 }; +static int max_decnet_time_wait[] = { 600 }; +static int min_state_count[] = { 1 }; +static int max_state_count[] = { NSP_MAXRXTSHIFT }; +static int min_decnet_dst_gc_interval[] = { 1 }; +static int max_decnet_dst_gc_interval[] = { 60 }; +static int min_decnet_no_fc_max_cwnd[] = { NSP_MIN_WINDOW }; +static int max_decnet_no_fc_max_cwnd[] = { NSP_MAX_WINDOW }; +static char node_name[7] = "???"; + +static struct ctl_table_header *dn_table_header = NULL; + +/* + * ctype.h :-) + */ +#define ISNUM(x) (((x) >= '0') && ((x) <= '9')) +#define ISLOWER(x) (((x) >= 'a') && ((x) <= 'z')) +#define ISUPPER(x) (((x) >= 'A') && ((x) <= 'Z')) +#define ISALPHA(x) (ISLOWER(x) || ISUPPER(x)) +#define INVALID_END_CHAR(x) (ISNUM(x) || ISALPHA(x)) + +static void strip_it(char *str) +{ + for(;;) { + switch(*str) { + case ' ': + case '\n': + case '\r': + case ':': + *str = 0; + case 0: + return; + } + str++; + } +} + +/* + * Simple routine to parse an ascii DECnet address + * into a network order address. + */ +static int parse_addr(__le16 *addr, char *str) +{ + __u16 area, node; + + while(*str && !ISNUM(*str)) str++; + + if (*str == 0) + return -1; + + area = (*str++ - '0'); + if (ISNUM(*str)) { + area *= 10; + area += (*str++ - '0'); + } + + if (*str++ != '.') + return -1; + + if (!ISNUM(*str)) + return -1; + + node = *str++ - '0'; + if (ISNUM(*str)) { + node *= 10; + node += (*str++ - '0'); + } + if (ISNUM(*str)) { + node *= 10; + node += (*str++ - '0'); + } + if (ISNUM(*str)) { + node *= 10; + node += (*str++ - '0'); + } + + if ((node > 1023) || (area > 63)) + return -1; + + if (INVALID_END_CHAR(*str)) + return -1; + + *addr = cpu_to_le16((area << 10) | node); + + return 0; +} + +static int dn_node_address_handler(ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + char addr[DN_ASCBUF_LEN]; + size_t len; + __le16 dnaddr; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + if (write) { + len = (*lenp < DN_ASCBUF_LEN) ? 
*lenp : (DN_ASCBUF_LEN-1); + + if (copy_from_user(addr, buffer, len)) + return -EFAULT; + + addr[len] = 0; + strip_it(addr); + + if (parse_addr(&dnaddr, addr)) + return -EINVAL; + + dn_dev_devices_off(); + + decnet_address = dnaddr; + + dn_dev_devices_on(); + + *ppos += len; + + return 0; + } + + dn_addr2asc(le16_to_cpu(decnet_address), addr); + len = strlen(addr); + addr[len++] = '\n'; + + if (len > *lenp) len = *lenp; + + if (copy_to_user(buffer, addr, len)) + return -EFAULT; + + *lenp = len; + *ppos += len; + + return 0; +} + +static int dn_def_dev_handler(ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + size_t len; + struct net_device *dev; + char devname[17]; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + if (write) { + if (*lenp > 16) + return -E2BIG; + + if (copy_from_user(devname, buffer, *lenp)) + return -EFAULT; + + devname[*lenp] = 0; + strip_it(devname); + + dev = dev_get_by_name(&init_net, devname); + if (dev == NULL) + return -ENODEV; + + if (dev->dn_ptr == NULL) { + dev_put(dev); + return -ENODEV; + } + + if (dn_dev_set_default(dev, 1)) { + dev_put(dev); + return -ENODEV; + } + *ppos += *lenp; + + return 0; + } + + dev = dn_dev_get_default(); + if (dev == NULL) { + *lenp = 0; + return 0; + } + + strcpy(devname, dev->name); + dev_put(dev); + len = strlen(devname); + devname[len++] = '\n'; + + if (len > *lenp) len = *lenp; + + if (copy_to_user(buffer, devname, len)) + return -EFAULT; + + *lenp = len; + *ppos += len; + + return 0; +} + +static ctl_table dn_table[] = { + { + .procname = "node_address", + .maxlen = 7, + .mode = 0644, + .proc_handler = dn_node_address_handler, + }, + { + .procname = "node_name", + .data = node_name, + .maxlen = 7, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "default_device", + .maxlen = 16, + .mode = 0644, + .proc_handler = dn_def_dev_handler, + }, + { + .procname = "time_wait", + .data = &decnet_time_wait, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_decnet_time_wait, + .extra2 = &max_decnet_time_wait + }, + { + .procname = "dn_count", + .data = &decnet_dn_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_state_count, + .extra2 = &max_state_count + }, + { + .procname = "di_count", + .data = &decnet_di_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_state_count, + .extra2 = &max_state_count + }, + { + .procname = "dr_count", + .data = &decnet_dr_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_state_count, + .extra2 = &max_state_count + }, + { + .procname = "dst_gc_interval", + .data = &decnet_dst_gc_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_decnet_dst_gc_interval, + .extra2 = &max_decnet_dst_gc_interval + }, + { + .procname = "no_fc_max_cwnd", + .data = &decnet_no_fc_max_cwnd, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_decnet_no_fc_max_cwnd, + .extra2 = &max_decnet_no_fc_max_cwnd + }, + { + .procname = "decnet_mem", + .data = &sysctl_decnet_mem, + .maxlen = sizeof(sysctl_decnet_mem), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax + }, + { + .procname = "decnet_rmem", + .data = &sysctl_decnet_rmem, + .maxlen = sizeof(sysctl_decnet_rmem), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = 
"decnet_wmem", + .data = &sysctl_decnet_wmem, + .maxlen = sizeof(sysctl_decnet_wmem), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "debug", + .data = &decnet_debug_level, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static struct ctl_path dn_path[] = { + { .procname = "net", }, + { .procname = "decnet", }, + { } +}; + +void dn_register_sysctl(void) +{ + dn_table_header = register_sysctl_paths(dn_path, dn_table); +} + +void dn_unregister_sysctl(void) +{ + unregister_sysctl_table(dn_table_header); +} + +#else /* CONFIG_SYSCTL */ +void dn_unregister_sysctl(void) +{ +} +void dn_register_sysctl(void) +{ +} + +#endif diff --git a/net/dns_resolver/Kconfig b/net/dns_resolver/Kconfig new file mode 100644 index 00000000..50d49f7e --- /dev/null +++ b/net/dns_resolver/Kconfig @@ -0,0 +1,27 @@ +# +# Configuration for DNS Resolver +# +config DNS_RESOLVER + tristate "DNS Resolver support" + depends on NET && KEYS + help + Saying Y here will include support for the DNS Resolver key type + which can be used to make upcalls to perform DNS lookups in + userspace. + + DNS Resolver is used to query DNS server for information. Examples + being resolving a UNC hostname element to an IP address for CIFS or + performing a DNS query for AFSDB records so that AFS can locate a + cell's volume location database servers. + + DNS Resolver is used by the CIFS and AFS modules, and would support + SMB2 later. DNS Resolver is supported by the userspace upcall + helper "/sbin/dns.resolver" via /etc/request-key.conf. + + See for further + information. + + To compile this as a module, choose M here: the module will be called + dnsresolver. + + If unsure, say N. diff --git a/net/dns_resolver/Makefile b/net/dns_resolver/Makefile new file mode 100644 index 00000000..d5c13c2e --- /dev/null +++ b/net/dns_resolver/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux DNS Resolver. +# + +obj-$(CONFIG_DNS_RESOLVER) += dns_resolver.o + +dns_resolver-y := dns_key.o dns_query.o diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c new file mode 100644 index 00000000..fa000d26 --- /dev/null +++ b/net/dns_resolver/dns_key.c @@ -0,0 +1,309 @@ +/* Key type used to cache DNS lookups made by the kernel + * + * See Documentation/networking/dns_resolver.txt + * + * Copyright (c) 2007 Igor Mammedov + * Author(s): Igor Mammedov (niallain@gmail.com) + * Steve French (sfrench@us.ibm.com) + * Wang Lei (wang840925@gmail.com) + * David Howells (dhowells@redhat.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +MODULE_DESCRIPTION("DNS Resolver"); +MODULE_AUTHOR("Wang Lei"); +MODULE_LICENSE("GPL"); + +unsigned dns_resolver_debug; +module_param_named(debug, dns_resolver_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(debug, "DNS Resolver debugging mask"); + +const struct cred *dns_resolver_cache; + +#define DNS_ERRORNO_OPTION "dnserror" + +/* + * Instantiate a user defined key for dns_resolver. + * + * The data must be a NUL-terminated string, with the NUL char accounted in + * datalen. + * + * If the data contains a '#' characters, then we take the clause after each + * one to be an option of the form 'key=value'. The actual data of interest is + * the string leading up to the first '#'. For instance: + * + * "ip1,ip2,...#foo=bar" + */ +static int +dns_resolver_instantiate(struct key *key, const void *_data, size_t datalen) +{ + struct user_key_payload *upayload; + unsigned long derrno; + int ret; + size_t result_len = 0; + const char *data = _data, *end, *opt; + + kenter("%%%d,%s,'%*.*s',%zu", + key->serial, key->description, + (int)datalen, (int)datalen, data, datalen); + + if (datalen <= 1 || !data || data[datalen - 1] != '\0') + return -EINVAL; + datalen--; + + /* deal with any options embedded in the data */ + end = data + datalen; + opt = memchr(data, '#', datalen); + if (!opt) { + /* no options: the entire data is the result */ + kdebug("no options"); + result_len = datalen; + } else { + const char *next_opt; + + result_len = opt - data; + opt++; + kdebug("options: '%s'", opt); + do { + const char *eq; + int opt_len, opt_nlen, opt_vlen, tmp; + + next_opt = memchr(opt, '#', end - opt) ?: end; + opt_len = next_opt - opt; + if (!opt_len) { + printk(KERN_WARNING + "Empty option to dns_resolver key %d\n", + key->serial); + return -EINVAL; + } + + eq = memchr(opt, '=', opt_len) ?: end; + opt_nlen = eq - opt; + eq++; + opt_vlen = next_opt - eq; /* will be -1 if no value */ + + tmp = opt_vlen >= 0 ? opt_vlen : 0; + kdebug("option '%*.*s' val '%*.*s'", + opt_nlen, opt_nlen, opt, tmp, tmp, eq); + + /* see if it's an error number representing a DNS error + * that's to be recorded as the result in this key */ + if (opt_nlen == sizeof(DNS_ERRORNO_OPTION) - 1 && + memcmp(opt, DNS_ERRORNO_OPTION, opt_nlen) == 0) { + kdebug("dns error number option"); + if (opt_vlen <= 0) + goto bad_option_value; + + ret = strict_strtoul(eq, 10, &derrno); + if (ret < 0) + goto bad_option_value; + + if (derrno < 1 || derrno > 511) + goto bad_option_value; + + kdebug("dns error no. 
= %lu", derrno); + key->type_data.x[0] = -derrno; + continue; + } + + bad_option_value: + printk(KERN_WARNING + "Option '%*.*s' to dns_resolver key %d:" + " bad/missing value\n", + opt_nlen, opt_nlen, opt, key->serial); + return -EINVAL; + } while (opt = next_opt + 1, opt < end); + } + + /* don't cache the result if we're caching an error saying there's no + * result */ + if (key->type_data.x[0]) { + kleave(" = 0 [h_error %ld]", key->type_data.x[0]); + return 0; + } + + kdebug("store result"); + ret = key_payload_reserve(key, result_len); + if (ret < 0) + return -EINVAL; + + upayload = kmalloc(sizeof(*upayload) + result_len + 1, GFP_KERNEL); + if (!upayload) { + kleave(" = -ENOMEM"); + return -ENOMEM; + } + + upayload->datalen = result_len; + memcpy(upayload->data, data, result_len); + upayload->data[result_len] = '\0'; + rcu_assign_pointer(key->payload.data, upayload); + + kleave(" = 0"); + return 0; +} + +/* + * The description is of the form "[:]" + * + * The domain name may be a simple name or an absolute domain name (which + * should end with a period). The domain name is case-independent. + */ +static int +dns_resolver_match(const struct key *key, const void *description) +{ + int slen, dlen, ret = 0; + const char *src = key->description, *dsp = description; + + kenter("%s,%s", src, dsp); + + if (!src || !dsp) + goto no_match; + + if (strcasecmp(src, dsp) == 0) + goto matched; + + slen = strlen(src); + dlen = strlen(dsp); + if (slen <= 0 || dlen <= 0) + goto no_match; + if (src[slen - 1] == '.') + slen--; + if (dsp[dlen - 1] == '.') + dlen--; + if (slen != dlen || strncasecmp(src, dsp, slen) != 0) + goto no_match; + +matched: + ret = 1; +no_match: + kleave(" = %d", ret); + return ret; +} + +/* + * Describe a DNS key + */ +static void dns_resolver_describe(const struct key *key, struct seq_file *m) +{ + int err = key->type_data.x[0]; + + seq_puts(m, key->description); + if (key_is_instantiated(key)) { + if (err) + seq_printf(m, ": %d", err); + else + seq_printf(m, ": %u", key->datalen); + } +} + +/* + * read the DNS data + * - the key's semaphore is read-locked + */ +static long dns_resolver_read(const struct key *key, + char __user *buffer, size_t buflen) +{ + if (key->type_data.x[0]) + return key->type_data.x[0]; + + return user_read(key, buffer, buflen); +} + +struct key_type key_type_dns_resolver = { + .name = "dns_resolver", + .instantiate = dns_resolver_instantiate, + .match = dns_resolver_match, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = dns_resolver_describe, + .read = dns_resolver_read, +}; + +static int __init init_dns_resolver(void) +{ + struct cred *cred; + struct key *keyring; + int ret; + + printk(KERN_NOTICE "Registering the %s key type\n", + key_type_dns_resolver.name); + + /* create an override credential set with a special thread keyring in + * which DNS requests are cached + * + * this is used to prevent malicious redirections from being installed + * with add_key(). 
+ */ + cred = prepare_kernel_cred(NULL); + if (!cred) + return -ENOMEM; + + keyring = key_alloc(&key_type_keyring, ".dns_resolver", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(keyring)) { + ret = PTR_ERR(keyring); + goto failed_put_cred; + } + + ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); + if (ret < 0) + goto failed_put_key; + + ret = register_key_type(&key_type_dns_resolver); + if (ret < 0) + goto failed_put_key; + + /* instruct request_key() to use this special keyring as a cache for + * the results it looks up */ + cred->thread_keyring = keyring; + cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; + dns_resolver_cache = cred; + + kdebug("DNS resolver keyring: %d\n", key_serial(keyring)); + return 0; + +failed_put_key: + key_put(keyring); +failed_put_cred: + put_cred(cred); + return ret; +} + +static void __exit exit_dns_resolver(void) +{ + key_revoke(dns_resolver_cache->thread_keyring); + unregister_key_type(&key_type_dns_resolver); + put_cred(dns_resolver_cache); + printk(KERN_NOTICE "Unregistered %s key type\n", + key_type_dns_resolver.name); +} + +module_init(init_dns_resolver) +module_exit(exit_dns_resolver) +MODULE_LICENSE("GPL"); diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c new file mode 100644 index 00000000..c32be292 --- /dev/null +++ b/net/dns_resolver/dns_query.c @@ -0,0 +1,165 @@ +/* Upcall routine, designed to work as a key type and working through + * /sbin/request-key to contact userspace when handling DNS queries. + * + * See Documentation/networking/dns_resolver.txt + * + * Copyright (c) 2007 Igor Mammedov + * Author(s): Igor Mammedov (niallain@gmail.com) + * Steve French (sfrench@us.ibm.com) + * Wang Lei (wang840925@gmail.com) + * David Howells (dhowells@redhat.com) + * + * The upcall wrapper used to make an arbitrary DNS query. + * + * This function requires the appropriate userspace tool dns.upcall to be + * installed and something like the following lines should be added to the + * /etc/request-key.conf file: + * + * create dns_resolver * * /sbin/dns.upcall %k + * + * For example to use this module to query AFSDB RR: + * + * create dns_resolver afsdb:* * /sbin/dns.afsdb %k + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/** + * dns_query - Query the DNS + * @type: Query type (or NULL for straight host->IP lookup) + * @name: Name to look up + * @namelen: Length of name + * @options: Request options (or NULL if no options) + * @_result: Where to place the returned data. + * @_expiry: Where to store the result expiry time (or NULL) + * + * The data will be returned in the pointer at *result, and the caller is + * responsible for freeing it. 
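+ *
+ * A minimal caller might look like this (illustrative only, error
+ * handling trimmed):
+ *
+ *	char *result = NULL;
+ *	int len = dns_query(NULL, "example.com", 11, NULL, &result, NULL);
+ *	if (len >= 0) {
+ *		pr_info("resolved to %s\n", result);
+ *		kfree(result);
+ *	}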
+ * + * The description should be of the form "[:]", and + * the options need to be appropriate for the query type requested. If no + * query_type is given, then the query is a straight hostname to IP address + * lookup. + * + * The DNS resolution lookup is performed by upcalling to userspace by way of + * requesting a key of type dns_resolver. + * + * Returns the size of the result on success, -ve error code otherwise. + */ +int dns_query(const char *type, const char *name, size_t namelen, + const char *options, char **_result, time_t *_expiry) +{ + struct key *rkey; + struct user_key_payload *upayload; + const struct cred *saved_cred; + size_t typelen, desclen; + char *desc, *cp; + int ret, len; + + kenter("%s,%*.*s,%zu,%s", + type, (int)namelen, (int)namelen, name, namelen, options); + + if (!name || namelen == 0 || !_result) + return -EINVAL; + + /* construct the query key description as "[:]" */ + typelen = 0; + desclen = 0; + if (type) { + typelen = strlen(type); + if (typelen < 1) + return -EINVAL; + desclen += typelen + 1; + } + + if (!namelen) + namelen = strlen(name); + if (namelen < 3) + return -EINVAL; + desclen += namelen + 1; + + desc = kmalloc(desclen, GFP_KERNEL); + if (!desc) + return -ENOMEM; + + cp = desc; + if (type) { + memcpy(cp, type, typelen); + cp += typelen; + *cp++ = ':'; + } + memcpy(cp, name, namelen); + cp += namelen; + *cp = '\0'; + + if (!options) + options = ""; + kdebug("call request_key(,%s,%s)", desc, options); + + /* make the upcall, using special credentials to prevent the use of + * add_key() to preinstall malicious redirections + */ + saved_cred = override_creds(dns_resolver_cache); + rkey = request_key(&key_type_dns_resolver, desc, options); + revert_creds(saved_cred); + kfree(desc); + if (IS_ERR(rkey)) { + ret = PTR_ERR(rkey); + goto out; + } + + down_read(&rkey->sem); + rkey->perm |= KEY_USR_VIEW; + + ret = key_validate(rkey); + if (ret < 0) + goto put; + + /* If the DNS server gave an error, return that to the caller */ + ret = rkey->type_data.x[0]; + if (ret) + goto put; + + upayload = rcu_dereference_protected(rkey->payload.data, + lockdep_is_held(&rkey->sem)); + len = upayload->datalen; + + ret = -ENOMEM; + *_result = kmalloc(len + 1, GFP_KERNEL); + if (!*_result) + goto put; + + memcpy(*_result, upayload->data, len + 1); + if (_expiry) + *_expiry = rkey->expiry; + + ret = len; +put: + up_read(&rkey->sem); + key_put(rkey); +out: + kleave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(dns_query); diff --git a/net/dns_resolver/internal.h b/net/dns_resolver/internal.h new file mode 100644 index 00000000..189ca9e9 --- /dev/null +++ b/net/dns_resolver/internal.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2010 Wang Lei + * Author(s): Wang Lei (wang840925@gmail.com). All Rights Reserved. + * + * Internal DNS Rsolver stuff + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include + +/* + * dns_key.c + */ +extern const struct cred *dns_resolver_cache; + +/* + * debug tracing + */ +extern unsigned dns_resolver_debug; + +#define kdebug(FMT, ...) \ +do { \ + if (unlikely(dns_resolver_debug)) \ + printk(KERN_DEBUG "[%-6.6s] "FMT"\n", \ + current->comm, ##__VA_ARGS__); \ +} while (0) + +#define kenter(FMT, ...) kdebug("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) kdebug("<== %s()"FMT"", __func__, ##__VA_ARGS__) diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig new file mode 100644 index 00000000..c53ded2a --- /dev/null +++ b/net/dsa/Kconfig @@ -0,0 +1,60 @@ +menuconfig NET_DSA + bool "Distributed Switch Architecture support" + default n + depends on EXPERIMENTAL && NETDEVICES && !S390 + select PHYLIB + ---help--- + This allows you to use hardware switch chips that use + the Distributed Switch Architecture. + + +if NET_DSA + +# tagging formats +config NET_DSA_TAG_DSA + bool + default n + +config NET_DSA_TAG_EDSA + bool + default n + +config NET_DSA_TAG_TRAILER + bool + default n + + +# switch drivers +config NET_DSA_MV88E6XXX + bool + default n + +config NET_DSA_MV88E6060 + bool "Marvell 88E6060 ethernet switch chip support" + select NET_DSA_TAG_TRAILER + ---help--- + This enables support for the Marvell 88E6060 ethernet switch + chip. + +config NET_DSA_MV88E6XXX_NEED_PPU + bool + default n + +config NET_DSA_MV88E6131 + bool "Marvell 88E6085/6095/6095F/6131 ethernet switch chip support" + select NET_DSA_MV88E6XXX + select NET_DSA_MV88E6XXX_NEED_PPU + select NET_DSA_TAG_DSA + ---help--- + This enables support for the Marvell 88E6085/6095/6095F/6131 + ethernet switch chips. + +config NET_DSA_MV88E6123_61_65 + bool "Marvell 88E6123/6161/6165 ethernet switch chip support" + select NET_DSA_MV88E6XXX + select NET_DSA_TAG_EDSA + ---help--- + This enables support for the Marvell 88E6123/6161/6165 + ethernet switch chips. + +endif diff --git a/net/dsa/Makefile b/net/dsa/Makefile new file mode 100644 index 00000000..2374faff --- /dev/null +++ b/net/dsa/Makefile @@ -0,0 +1,13 @@ +# tagging formats +obj-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o +obj-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o +obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o + +# switch drivers +obj-$(CONFIG_NET_DSA_MV88E6XXX) += mv88e6xxx.o +obj-$(CONFIG_NET_DSA_MV88E6060) += mv88e6060.o +obj-$(CONFIG_NET_DSA_MV88E6123_61_65) += mv88e6123_61_65.o +obj-$(CONFIG_NET_DSA_MV88E6131) += mv88e6131.o + +# the core +obj-$(CONFIG_NET_DSA) += dsa.o slave.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c new file mode 100644 index 00000000..3fb14b7c --- /dev/null +++ b/net/dsa/dsa.c @@ -0,0 +1,434 @@ +/* + * net/dsa/dsa.c - Hardware switch handling + * Copyright (c) 2008-2009 Marvell Semiconductor + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
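+ *
+ * The core binds to a "dsa" platform device whose platform_data
+ * describes the switch tree.  A rough board-file sketch (device and
+ * port names below are made up for illustration):
+ *
+ *	static struct dsa_chip_data board_switch_chip = {
+ *		.mii_bus	= &board_mdio_device.dev,
+ *		.sw_addr	= 0x10,
+ *		.port_names	= { "lan1", "lan2", "lan3", "lan4", "cpu" },
+ *	};
+ *
+ *	static struct dsa_platform_data board_switch_data = {
+ *		.netdev		= &board_eth_device.dev,
+ *		.nr_chips	= 1,
+ *		.chip		= &board_switch_chip,
+ *	};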
+ */ + +#include +#include +#include +#include +#include +#include "dsa_priv.h" + +char dsa_driver_version[] = "0.1"; + + +/* switch driver registration ***********************************************/ +static DEFINE_MUTEX(dsa_switch_drivers_mutex); +static LIST_HEAD(dsa_switch_drivers); + +void register_switch_driver(struct dsa_switch_driver *drv) +{ + mutex_lock(&dsa_switch_drivers_mutex); + list_add_tail(&drv->list, &dsa_switch_drivers); + mutex_unlock(&dsa_switch_drivers_mutex); +} + +void unregister_switch_driver(struct dsa_switch_driver *drv) +{ + mutex_lock(&dsa_switch_drivers_mutex); + list_del_init(&drv->list); + mutex_unlock(&dsa_switch_drivers_mutex); +} + +static struct dsa_switch_driver * +dsa_switch_probe(struct mii_bus *bus, int sw_addr, char **_name) +{ + struct dsa_switch_driver *ret; + struct list_head *list; + char *name; + + ret = NULL; + name = NULL; + + mutex_lock(&dsa_switch_drivers_mutex); + list_for_each(list, &dsa_switch_drivers) { + struct dsa_switch_driver *drv; + + drv = list_entry(list, struct dsa_switch_driver, list); + + name = drv->probe(bus, sw_addr); + if (name != NULL) { + ret = drv; + break; + } + } + mutex_unlock(&dsa_switch_drivers_mutex); + + *_name = name; + + return ret; +} + + +/* basic switch operations **************************************************/ +static struct dsa_switch * +dsa_switch_setup(struct dsa_switch_tree *dst, int index, + struct device *parent, struct mii_bus *bus) +{ + struct dsa_chip_data *pd = dst->pd->chip + index; + struct dsa_switch_driver *drv; + struct dsa_switch *ds; + int ret; + char *name; + int i; + + /* + * Probe for switch model. + */ + drv = dsa_switch_probe(bus, pd->sw_addr, &name); + if (drv == NULL) { + printk(KERN_ERR "%s[%d]: could not detect attached switch\n", + dst->master_netdev->name, index); + return ERR_PTR(-EINVAL); + } + printk(KERN_INFO "%s[%d]: detected a %s switch\n", + dst->master_netdev->name, index, name); + + + /* + * Allocate and initialise switch state. + */ + ds = kzalloc(sizeof(*ds) + drv->priv_size, GFP_KERNEL); + if (ds == NULL) + return ERR_PTR(-ENOMEM); + + ds->dst = dst; + ds->index = index; + ds->pd = dst->pd->chip + index; + ds->drv = drv; + ds->master_mii_bus = bus; + + + /* + * Validate supplied switch configuration. + */ + for (i = 0; i < DSA_MAX_PORTS; i++) { + char *name; + + name = pd->port_names[i]; + if (name == NULL) + continue; + + if (!strcmp(name, "cpu")) { + if (dst->cpu_switch != -1) { + printk(KERN_ERR "multiple cpu ports?!\n"); + ret = -EINVAL; + goto out; + } + dst->cpu_switch = index; + dst->cpu_port = i; + } else if (!strcmp(name, "dsa")) { + ds->dsa_port_mask |= 1 << i; + } else { + ds->phys_port_mask |= 1 << i; + } + } + + + /* + * If the CPU connects to this switch, set the switch tree + * tagging protocol to the preferred tagging format of this + * switch. + */ + if (ds->dst->cpu_switch == index) + ds->dst->tag_protocol = drv->tag_protocol; + + + /* + * Do basic register setup. + */ + ret = drv->setup(ds); + if (ret < 0) + goto out; + + ret = drv->set_addr(ds, dst->master_netdev->dev_addr); + if (ret < 0) + goto out; + + ds->slave_mii_bus = mdiobus_alloc(); + if (ds->slave_mii_bus == NULL) { + ret = -ENOMEM; + goto out; + } + dsa_slave_mii_bus_init(ds); + + ret = mdiobus_register(ds->slave_mii_bus); + if (ret < 0) + goto out_free; + + + /* + * Create network devices for physical switch ports. 
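+	 * Each remaining name in pd->port_names[] (e.g. "lan1"; the "cpu"
+	 * and "dsa" entries were consumed above) becomes an ordinary
+	 * Linux network interface backed by dsa_slave_create().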
+ */ + for (i = 0; i < DSA_MAX_PORTS; i++) { + struct net_device *slave_dev; + + if (!(ds->phys_port_mask & (1 << i))) + continue; + + slave_dev = dsa_slave_create(ds, parent, i, pd->port_names[i]); + if (slave_dev == NULL) { + printk(KERN_ERR "%s[%d]: can't create dsa " + "slave device for port %d(%s)\n", + dst->master_netdev->name, + index, i, pd->port_names[i]); + continue; + } + + ds->ports[i] = slave_dev; + } + + return ds; + +out_free: + mdiobus_free(ds->slave_mii_bus); +out: + kfree(ds); + return ERR_PTR(ret); +} + +static void dsa_switch_destroy(struct dsa_switch *ds) +{ +} + + +/* hooks for ethertype-less tagging formats *********************************/ +/* + * The original DSA tag format and some other tag formats have no + * ethertype, which means that we need to add a little hack to the + * networking receive path to make sure that received frames get + * the right ->protocol assigned to them when one of those tag + * formats is in use. + */ +bool dsa_uses_dsa_tags(void *dsa_ptr) +{ + struct dsa_switch_tree *dst = dsa_ptr; + + return !!(dst->tag_protocol == htons(ETH_P_DSA)); +} + +bool dsa_uses_trailer_tags(void *dsa_ptr) +{ + struct dsa_switch_tree *dst = dsa_ptr; + + return !!(dst->tag_protocol == htons(ETH_P_TRAILER)); +} + + +/* link polling *************************************************************/ +static void dsa_link_poll_work(struct work_struct *ugly) +{ + struct dsa_switch_tree *dst; + int i; + + dst = container_of(ugly, struct dsa_switch_tree, link_poll_work); + + for (i = 0; i < dst->pd->nr_chips; i++) { + struct dsa_switch *ds = dst->ds[i]; + + if (ds != NULL && ds->drv->poll_link != NULL) + ds->drv->poll_link(ds); + } + + mod_timer(&dst->link_poll_timer, round_jiffies(jiffies + HZ)); +} + +static void dsa_link_poll_timer(unsigned long _dst) +{ + struct dsa_switch_tree *dst = (void *)_dst; + + schedule_work(&dst->link_poll_work); +} + + +/* platform driver init and cleanup *****************************************/ +static int dev_is_class(struct device *dev, void *class) +{ + if (dev->class != NULL && !strcmp(dev->class->name, class)) + return 1; + + return 0; +} + +static struct device *dev_find_class(struct device *parent, char *class) +{ + if (dev_is_class(parent, class)) { + get_device(parent); + return parent; + } + + return device_find_child(parent, class, dev_is_class); +} + +static struct mii_bus *dev_to_mii_bus(struct device *dev) +{ + struct device *d; + + d = dev_find_class(dev, "mdio_bus"); + if (d != NULL) { + struct mii_bus *bus; + + bus = to_mii_bus(d); + put_device(d); + + return bus; + } + + return NULL; +} + +static struct net_device *dev_to_net_device(struct device *dev) +{ + struct device *d; + + d = dev_find_class(dev, "net"); + if (d != NULL) { + struct net_device *nd; + + nd = to_net_dev(d); + dev_hold(nd); + put_device(d); + + return nd; + } + + return NULL; +} + +static int dsa_probe(struct platform_device *pdev) +{ + static int dsa_version_printed; + struct dsa_platform_data *pd = pdev->dev.platform_data; + struct net_device *dev; + struct dsa_switch_tree *dst; + int i; + + if (!dsa_version_printed++) + printk(KERN_NOTICE "Distributed Switch Architecture " + "driver version %s\n", dsa_driver_version); + + if (pd == NULL || pd->netdev == NULL) + return -EINVAL; + + dev = dev_to_net_device(pd->netdev); + if (dev == NULL) + return -EINVAL; + + if (dev->dsa_ptr != NULL) { + dev_put(dev); + return -EEXIST; + } + + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (dst == NULL) { + dev_put(dev); + return -ENOMEM; + } + + 
platform_set_drvdata(pdev, dst); + + dst->pd = pd; + dst->master_netdev = dev; + dst->cpu_switch = -1; + dst->cpu_port = -1; + + for (i = 0; i < pd->nr_chips; i++) { + struct mii_bus *bus; + struct dsa_switch *ds; + + bus = dev_to_mii_bus(pd->chip[i].mii_bus); + if (bus == NULL) { + printk(KERN_ERR "%s[%d]: no mii bus found for " + "dsa switch\n", dev->name, i); + continue; + } + + ds = dsa_switch_setup(dst, i, &pdev->dev, bus); + if (IS_ERR(ds)) { + printk(KERN_ERR "%s[%d]: couldn't create dsa switch " + "instance (error %ld)\n", dev->name, i, + PTR_ERR(ds)); + continue; + } + + dst->ds[i] = ds; + if (ds->drv->poll_link != NULL) + dst->link_poll_needed = 1; + } + + /* + * If we use a tagging format that doesn't have an ethertype + * field, make sure that all packets from this point on get + * sent to the tag format's receive function. + */ + wmb(); + dev->dsa_ptr = (void *)dst; + + if (dst->link_poll_needed) { + INIT_WORK(&dst->link_poll_work, dsa_link_poll_work); + init_timer(&dst->link_poll_timer); + dst->link_poll_timer.data = (unsigned long)dst; + dst->link_poll_timer.function = dsa_link_poll_timer; + dst->link_poll_timer.expires = round_jiffies(jiffies + HZ); + add_timer(&dst->link_poll_timer); + } + + return 0; +} + +static int dsa_remove(struct platform_device *pdev) +{ + struct dsa_switch_tree *dst = platform_get_drvdata(pdev); + int i; + + if (dst->link_poll_needed) + del_timer_sync(&dst->link_poll_timer); + + flush_work_sync(&dst->link_poll_work); + + for (i = 0; i < dst->pd->nr_chips; i++) { + struct dsa_switch *ds = dst->ds[i]; + + if (ds != NULL) + dsa_switch_destroy(ds); + } + + return 0; +} + +static void dsa_shutdown(struct platform_device *pdev) +{ +} + +static struct platform_driver dsa_driver = { + .probe = dsa_probe, + .remove = dsa_remove, + .shutdown = dsa_shutdown, + .driver = { + .name = "dsa", + .owner = THIS_MODULE, + }, +}; + +static int __init dsa_init_module(void) +{ + return platform_driver_register(&dsa_driver); +} +module_init(dsa_init_module); + +static void __exit dsa_cleanup_module(void) +{ + platform_driver_unregister(&dsa_driver); +} +module_exit(dsa_cleanup_module); + +MODULE_AUTHOR("Lennert Buytenhek "); +MODULE_DESCRIPTION("Driver for Distributed Switch Architecture switch chips"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:dsa"); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h new file mode 100644 index 00000000..4b0ea054 --- /dev/null +++ b/net/dsa/dsa_priv.h @@ -0,0 +1,181 @@ +/* + * net/dsa/dsa_priv.h - Hardware switch handling + * Copyright (c) 2008-2009 Marvell Semiconductor + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef __DSA_PRIV_H +#define __DSA_PRIV_H + +#include +#include +#include +#include +#include + +struct dsa_switch { + /* + * Parent switch tree, and switch index. + */ + struct dsa_switch_tree *dst; + int index; + + /* + * Configuration data for this switch. + */ + struct dsa_chip_data *pd; + + /* + * The used switch driver. + */ + struct dsa_switch_driver *drv; + + /* + * Reference to mii bus to use. + */ + struct mii_bus *master_mii_bus; + + /* + * Slave mii_bus and devices for the individual ports. 
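+	 * (dsa_port_mask marks ports cabled to other switch chips,
+	 * phys_port_mask marks ports that get a slave net_device; both
+	 * are derived from the port names in the platform data.)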
+ */ + u32 dsa_port_mask; + u32 phys_port_mask; + struct mii_bus *slave_mii_bus; + struct net_device *ports[DSA_MAX_PORTS]; +}; + +struct dsa_switch_tree { + /* + * Configuration data for the platform device that owns + * this dsa switch tree instance. + */ + struct dsa_platform_data *pd; + + /* + * Reference to network device to use, and which tagging + * protocol to use. + */ + struct net_device *master_netdev; + __be16 tag_protocol; + + /* + * The switch and port to which the CPU is attached. + */ + s8 cpu_switch; + s8 cpu_port; + + /* + * Link state polling. + */ + int link_poll_needed; + struct work_struct link_poll_work; + struct timer_list link_poll_timer; + + /* + * Data for the individual switch chips. + */ + struct dsa_switch *ds[DSA_MAX_SWITCHES]; +}; + +static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p) +{ + return !!(ds->index == ds->dst->cpu_switch && p == ds->dst->cpu_port); +} + +static inline u8 dsa_upstream_port(struct dsa_switch *ds) +{ + struct dsa_switch_tree *dst = ds->dst; + + /* + * If this is the root switch (i.e. the switch that connects + * to the CPU), return the cpu port number on this switch. + * Else return the (DSA) port number that connects to the + * switch that is one hop closer to the cpu. + */ + if (dst->cpu_switch == ds->index) + return dst->cpu_port; + else + return ds->pd->rtable[dst->cpu_switch]; +} + +struct dsa_slave_priv { + /* + * The linux network interface corresponding to this + * switch port. + */ + struct net_device *dev; + + /* + * Which switch this port is a part of, and the port index + * for this port. + */ + struct dsa_switch *parent; + u8 port; + + /* + * The phylib phy_device pointer for the PHY connected + * to this port. + */ + struct phy_device *phy; +}; + +struct dsa_switch_driver { + struct list_head list; + + __be16 tag_protocol; + int priv_size; + + /* + * Probing and setup. + */ + char *(*probe)(struct mii_bus *bus, int sw_addr); + int (*setup)(struct dsa_switch *ds); + int (*set_addr)(struct dsa_switch *ds, u8 *addr); + + /* + * Access to the switch's PHY registers. + */ + int (*phy_read)(struct dsa_switch *ds, int port, int regnum); + int (*phy_write)(struct dsa_switch *ds, int port, + int regnum, u16 val); + + /* + * Link state polling and IRQ handling. + */ + void (*poll_link)(struct dsa_switch *ds); + + /* + * ethtool hardware statistics. 
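+	 * (get_sset_count() reports how many counters there are;
+	 * get_strings() and get_ethtool_stats() then fill in that many
+	 * names and values for the given port.)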
+ */ + void (*get_strings)(struct dsa_switch *ds, int port, uint8_t *data); + void (*get_ethtool_stats)(struct dsa_switch *ds, + int port, uint64_t *data); + int (*get_sset_count)(struct dsa_switch *ds); +}; + +/* dsa.c */ +extern char dsa_driver_version[]; +void register_switch_driver(struct dsa_switch_driver *type); +void unregister_switch_driver(struct dsa_switch_driver *type); + +/* slave.c */ +void dsa_slave_mii_bus_init(struct dsa_switch *ds); +struct net_device *dsa_slave_create(struct dsa_switch *ds, + struct device *parent, + int port, char *name); + +/* tag_dsa.c */ +netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev); + +/* tag_edsa.c */ +netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev); + +/* tag_trailer.c */ +netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev); + + +#endif diff --git a/net/dsa/mv88e6060.c b/net/dsa/mv88e6060.c new file mode 100644 index 00000000..8f4ff5a2 --- /dev/null +++ b/net/dsa/mv88e6060.c @@ -0,0 +1,288 @@ +/* + * net/dsa/mv88e6060.c - Driver for Marvell 88e6060 switch chips + * Copyright (c) 2008-2009 Marvell Semiconductor + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include "dsa_priv.h" + +#define REG_PORT(p) (8 + (p)) +#define REG_GLOBAL 0x0f + +static int reg_read(struct dsa_switch *ds, int addr, int reg) +{ + return mdiobus_read(ds->master_mii_bus, ds->pd->sw_addr + addr, reg); +} + +#define REG_READ(addr, reg) \ + ({ \ + int __ret; \ + \ + __ret = reg_read(ds, addr, reg); \ + if (__ret < 0) \ + return __ret; \ + __ret; \ + }) + + +static int reg_write(struct dsa_switch *ds, int addr, int reg, u16 val) +{ + return mdiobus_write(ds->master_mii_bus, ds->pd->sw_addr + addr, + reg, val); +} + +#define REG_WRITE(addr, reg, val) \ + ({ \ + int __ret; \ + \ + __ret = reg_write(ds, addr, reg, val); \ + if (__ret < 0) \ + return __ret; \ + }) + +static char *mv88e6060_probe(struct mii_bus *bus, int sw_addr) +{ + int ret; + + ret = mdiobus_read(bus, sw_addr + REG_PORT(0), 0x03); + if (ret >= 0) { + ret &= 0xfff0; + if (ret == 0x0600) + return "Marvell 88E6060"; + } + + return NULL; +} + +static int mv88e6060_switch_reset(struct dsa_switch *ds) +{ + int i; + int ret; + + /* + * Set all ports to the disabled state. + */ + for (i = 0; i < 6; i++) { + ret = REG_READ(REG_PORT(i), 0x04); + REG_WRITE(REG_PORT(i), 0x04, ret & 0xfffc); + } + + /* + * Wait for transmit queues to drain. + */ + msleep(2); + + /* + * Reset the switch. + */ + REG_WRITE(REG_GLOBAL, 0x0a, 0xa130); + + /* + * Wait up to one second for reset to complete. + */ + for (i = 0; i < 1000; i++) { + ret = REG_READ(REG_GLOBAL, 0x00); + if ((ret & 0x8000) == 0x0000) + break; + + msleep(1); + } + if (i == 1000) + return -ETIMEDOUT; + + return 0; +} + +static int mv88e6060_setup_global(struct dsa_switch *ds) +{ + /* + * Disable discarding of frames with excessive collisions, + * set the maximum frame size to 1536 bytes, and mask all + * interrupt sources. + */ + REG_WRITE(REG_GLOBAL, 0x04, 0x0800); + + /* + * Enable automatic address learning, set the address + * database size to 1024 entries, and set the default aging + * time to 5 minutes. 
+ */ + REG_WRITE(REG_GLOBAL, 0x0a, 0x2130); + + return 0; +} + +static int mv88e6060_setup_port(struct dsa_switch *ds, int p) +{ + int addr = REG_PORT(p); + + /* + * Do not force flow control, disable Ingress and Egress + * Header tagging, disable VLAN tunneling, and set the port + * state to Forwarding. Additionally, if this is the CPU + * port, enable Ingress and Egress Trailer tagging mode. + */ + REG_WRITE(addr, 0x04, dsa_is_cpu_port(ds, p) ? 0x4103 : 0x0003); + + /* + * Port based VLAN map: give each port its own address + * database, allow the CPU port to talk to each of the 'real' + * ports, and allow each of the 'real' ports to only talk to + * the CPU port. + */ + REG_WRITE(addr, 0x06, + ((p & 0xf) << 12) | + (dsa_is_cpu_port(ds, p) ? + ds->phys_port_mask : + (1 << ds->dst->cpu_port))); + + /* + * Port Association Vector: when learning source addresses + * of packets, add the address to the address database using + * a port bitmap that has only the bit for this port set and + * the other bits clear. + */ + REG_WRITE(addr, 0x0b, 1 << p); + + return 0; +} + +static int mv88e6060_setup(struct dsa_switch *ds) +{ + int i; + int ret; + + ret = mv88e6060_switch_reset(ds); + if (ret < 0) + return ret; + + /* @@@ initialise atu */ + + ret = mv88e6060_setup_global(ds); + if (ret < 0) + return ret; + + for (i = 0; i < 6; i++) { + ret = mv88e6060_setup_port(ds, i); + if (ret < 0) + return ret; + } + + return 0; +} + +static int mv88e6060_set_addr(struct dsa_switch *ds, u8 *addr) +{ + REG_WRITE(REG_GLOBAL, 0x01, (addr[0] << 8) | addr[1]); + REG_WRITE(REG_GLOBAL, 0x02, (addr[2] << 8) | addr[3]); + REG_WRITE(REG_GLOBAL, 0x03, (addr[4] << 8) | addr[5]); + + return 0; +} + +static int mv88e6060_port_to_phy_addr(int port) +{ + if (port >= 0 && port <= 5) + return port; + return -1; +} + +static int mv88e6060_phy_read(struct dsa_switch *ds, int port, int regnum) +{ + int addr; + + addr = mv88e6060_port_to_phy_addr(port); + if (addr == -1) + return 0xffff; + + return reg_read(ds, addr, regnum); +} + +static int +mv88e6060_phy_write(struct dsa_switch *ds, int port, int regnum, u16 val) +{ + int addr; + + addr = mv88e6060_port_to_phy_addr(port); + if (addr == -1) + return 0xffff; + + return reg_write(ds, addr, regnum, val); +} + +static void mv88e6060_poll_link(struct dsa_switch *ds) +{ + int i; + + for (i = 0; i < DSA_MAX_PORTS; i++) { + struct net_device *dev; + int uninitialized_var(port_status); + int link; + int speed; + int duplex; + int fc; + + dev = ds->ports[i]; + if (dev == NULL) + continue; + + link = 0; + if (dev->flags & IFF_UP) { + port_status = reg_read(ds, REG_PORT(i), 0x00); + if (port_status < 0) + continue; + + link = !!(port_status & 0x1000); + } + + if (!link) { + if (netif_carrier_ok(dev)) { + printk(KERN_INFO "%s: link down\n", dev->name); + netif_carrier_off(dev); + } + continue; + } + + speed = (port_status & 0x0100) ? 100 : 10; + duplex = (port_status & 0x0200) ? 1 : 0; + fc = ((port_status & 0xc000) == 0xc000) ? 1 : 0; + + if (!netif_carrier_ok(dev)) { + printk(KERN_INFO "%s: link up, %d Mb/s, %s duplex, " + "flow control %sabled\n", dev->name, + speed, duplex ? "full" : "half", + fc ? 
"en" : "dis"); + netif_carrier_on(dev); + } + } +} + +static struct dsa_switch_driver mv88e6060_switch_driver = { + .tag_protocol = htons(ETH_P_TRAILER), + .probe = mv88e6060_probe, + .setup = mv88e6060_setup, + .set_addr = mv88e6060_set_addr, + .phy_read = mv88e6060_phy_read, + .phy_write = mv88e6060_phy_write, + .poll_link = mv88e6060_poll_link, +}; + +static int __init mv88e6060_init(void) +{ + register_switch_driver(&mv88e6060_switch_driver); + return 0; +} +module_init(mv88e6060_init); + +static void __exit mv88e6060_cleanup(void) +{ + unregister_switch_driver(&mv88e6060_switch_driver); +} +module_exit(mv88e6060_cleanup); diff --git a/net/dsa/mv88e6123_61_65.c b/net/dsa/mv88e6123_61_65.c new file mode 100644 index 00000000..52faaa21 --- /dev/null +++ b/net/dsa/mv88e6123_61_65.c @@ -0,0 +1,447 @@ +/* + * net/dsa/mv88e6123_61_65.c - Marvell 88e6123/6161/6165 switch chip support + * Copyright (c) 2008-2009 Marvell Semiconductor + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include "dsa_priv.h" +#include "mv88e6xxx.h" + +static char *mv88e6123_61_65_probe(struct mii_bus *bus, int sw_addr) +{ + int ret; + + ret = __mv88e6xxx_reg_read(bus, sw_addr, REG_PORT(0), 0x03); + if (ret >= 0) { + ret &= 0xfff0; + if (ret == 0x1210) + return "Marvell 88E6123"; + if (ret == 0x1610) + return "Marvell 88E6161"; + if (ret == 0x1650) + return "Marvell 88E6165"; + } + + return NULL; +} + +static int mv88e6123_61_65_switch_reset(struct dsa_switch *ds) +{ + int i; + int ret; + + /* + * Set all ports to the disabled state. + */ + for (i = 0; i < 8; i++) { + ret = REG_READ(REG_PORT(i), 0x04); + REG_WRITE(REG_PORT(i), 0x04, ret & 0xfffc); + } + + /* + * Wait for transmit queues to drain. + */ + msleep(2); + + /* + * Reset the switch. + */ + REG_WRITE(REG_GLOBAL, 0x04, 0xc400); + + /* + * Wait up to one second for reset to complete. + */ + for (i = 0; i < 1000; i++) { + ret = REG_READ(REG_GLOBAL, 0x00); + if ((ret & 0xc800) == 0xc800) + break; + + msleep(1); + } + if (i == 1000) + return -ETIMEDOUT; + + return 0; +} + +static int mv88e6123_61_65_setup_global(struct dsa_switch *ds) +{ + int ret; + int i; + + /* + * Disable the PHY polling unit (since there won't be any + * external PHYs to poll), don't discard packets with + * excessive collisions, and mask all interrupt sources. + */ + REG_WRITE(REG_GLOBAL, 0x04, 0x0000); + + /* + * Set the default address aging time to 5 minutes, and + * enable address learn messages to be sent to all message + * ports. + */ + REG_WRITE(REG_GLOBAL, 0x0a, 0x0148); + + /* + * Configure the priority mapping registers. + */ + ret = mv88e6xxx_config_prio(ds); + if (ret < 0) + return ret; + + /* + * Configure the upstream port, and configure the upstream + * port as the port to which ingress and egress monitor frames + * are to be sent. + */ + REG_WRITE(REG_GLOBAL, 0x1a, (dsa_upstream_port(ds) * 0x1110)); + + /* + * Disable remote management for now, and set the switch's + * DSA device number. + */ + REG_WRITE(REG_GLOBAL, 0x1c, ds->index & 0x1f); + + /* + * Send all frames with destination addresses matching + * 01:80:c2:00:00:2x to the CPU port. + */ + REG_WRITE(REG_GLOBAL2, 0x02, 0xffff); + + /* + * Send all frames with destination addresses matching + * 01:80:c2:00:00:0x to the CPU port. 
+ */ + REG_WRITE(REG_GLOBAL2, 0x03, 0xffff); + + /* + * Disable the loopback filter, disable flow control + * messages, disable flood broadcast override, disable + * removing of provider tags, disable ATU age violation + * interrupts, disable tag flow control, force flow + * control priority to the highest, and send all special + * multicast frames to the CPU at the highest priority. + */ + REG_WRITE(REG_GLOBAL2, 0x05, 0x00ff); + + /* + * Program the DSA routing table. + */ + for (i = 0; i < 32; i++) { + int nexthop; + + nexthop = 0x1f; + if (i != ds->index && i < ds->dst->pd->nr_chips) + nexthop = ds->pd->rtable[i] & 0x1f; + + REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | nexthop); + } + + /* + * Clear all trunk masks. + */ + for (i = 0; i < 8; i++) + REG_WRITE(REG_GLOBAL2, 0x07, 0x8000 | (i << 12) | 0xff); + + /* + * Clear all trunk mappings. + */ + for (i = 0; i < 16; i++) + REG_WRITE(REG_GLOBAL2, 0x08, 0x8000 | (i << 11)); + + /* + * Disable ingress rate limiting by resetting all ingress + * rate limit registers to their initial state. + */ + for (i = 0; i < 6; i++) + REG_WRITE(REG_GLOBAL2, 0x09, 0x9000 | (i << 8)); + + /* + * Initialise cross-chip port VLAN table to reset defaults. + */ + REG_WRITE(REG_GLOBAL2, 0x0b, 0x9000); + + /* + * Clear the priority override table. + */ + for (i = 0; i < 16; i++) + REG_WRITE(REG_GLOBAL2, 0x0f, 0x8000 | (i << 8)); + + /* @@@ initialise AVB (22/23) watchdog (27) sdet (29) registers */ + + return 0; +} + +static int mv88e6123_61_65_setup_port(struct dsa_switch *ds, int p) +{ + int addr = REG_PORT(p); + u16 val; + + /* + * MAC Forcing register: don't force link, speed, duplex + * or flow control state to any particular values on physical + * ports, but force the CPU port and all DSA ports to 1000 Mb/s + * full duplex. + */ + if (dsa_is_cpu_port(ds, p) || ds->dsa_port_mask & (1 << p)) + REG_WRITE(addr, 0x01, 0x003e); + else + REG_WRITE(addr, 0x01, 0x0003); + + /* + * Do not limit the period of time that this port can be + * paused for by the remote end or the period of time that + * this port can pause the remote end. + */ + REG_WRITE(addr, 0x02, 0x0000); + + /* + * Port Control: disable Drop-on-Unlock, disable Drop-on-Lock, + * disable Header mode, enable IGMP/MLD snooping, disable VLAN + * tunneling, determine priority by looking at 802.1p and IP + * priority fields (IP prio has precedence), and set STP state + * to Forwarding. + * + * If this is the CPU link, use DSA or EDSA tagging depending + * on which tagging mode was configured. + * + * If this is a link to another switch, use DSA tagging mode. + * + * If this is the upstream port for this switch, enable + * forwarding of unknown unicasts and multicasts. + */ + val = 0x0433; + if (dsa_is_cpu_port(ds, p)) { + if (ds->dst->tag_protocol == htons(ETH_P_EDSA)) + val |= 0x3300; + else + val |= 0x0100; + } + if (ds->dsa_port_mask & (1 << p)) + val |= 0x0100; + if (p == dsa_upstream_port(ds)) + val |= 0x000c; + REG_WRITE(addr, 0x04, val); + + /* + * Port Control 1: disable trunking. Also, if this is the + * CPU port, enable learn messages to be sent to this port. + */ + REG_WRITE(addr, 0x05, dsa_is_cpu_port(ds, p) ? 0x8000 : 0x0000); + + /* + * Port based VLAN map: give each port its own address + * database, allow the CPU port to talk to each of the 'real' + * ports, and allow each of the 'real' ports to only talk to + * the upstream port. 
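+	 *
+	 * For example, with the CPU attached to port 5 (hypothetical
+	 * layout), user port 2 ends up with (2 << 12) | (1 << 5) = 0x2020:
+	 * address database 2, egress allowed only towards port 5.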
+ */ + val = (p & 0xf) << 12; + if (dsa_is_cpu_port(ds, p)) + val |= ds->phys_port_mask; + else + val |= 1 << dsa_upstream_port(ds); + REG_WRITE(addr, 0x06, val); + + /* + * Default VLAN ID and priority: don't set a default VLAN + * ID, and set the default packet priority to zero. + */ + REG_WRITE(addr, 0x07, 0x0000); + + /* + * Port Control 2: don't force a good FCS, set the maximum + * frame size to 10240 bytes, don't let the switch add or + * strip 802.1q tags, don't discard tagged or untagged frames + * on this port, do a destination address lookup on all + * received packets as usual, disable ARP mirroring and don't + * send a copy of all transmitted/received frames on this port + * to the CPU. + */ + REG_WRITE(addr, 0x08, 0x2080); + + /* + * Egress rate control: disable egress rate control. + */ + REG_WRITE(addr, 0x09, 0x0001); + + /* + * Egress rate control 2: disable egress rate control. + */ + REG_WRITE(addr, 0x0a, 0x0000); + + /* + * Port Association Vector: when learning source addresses + * of packets, add the address to the address database using + * a port bitmap that has only the bit for this port set and + * the other bits clear. + */ + REG_WRITE(addr, 0x0b, 1 << p); + + /* + * Port ATU control: disable limiting the number of address + * database entries that this port is allowed to use. + */ + REG_WRITE(addr, 0x0c, 0x0000); + + /* + * Priorit Override: disable DA, SA and VTU priority override. + */ + REG_WRITE(addr, 0x0d, 0x0000); + + /* + * Port Ethertype: use the Ethertype DSA Ethertype value. + */ + REG_WRITE(addr, 0x0f, ETH_P_EDSA); + + /* + * Tag Remap: use an identity 802.1p prio -> switch prio + * mapping. + */ + REG_WRITE(addr, 0x18, 0x3210); + + /* + * Tag Remap 2: use an identity 802.1p prio -> switch prio + * mapping. + */ + REG_WRITE(addr, 0x19, 0x7654); + + return 0; +} + +static int mv88e6123_61_65_setup(struct dsa_switch *ds) +{ + struct mv88e6xxx_priv_state *ps = (void *)(ds + 1); + int i; + int ret; + + mutex_init(&ps->smi_mutex); + mutex_init(&ps->stats_mutex); + + ret = mv88e6123_61_65_switch_reset(ds); + if (ret < 0) + return ret; + + /* @@@ initialise vtu and atu */ + + ret = mv88e6123_61_65_setup_global(ds); + if (ret < 0) + return ret; + + for (i = 0; i < 6; i++) { + ret = mv88e6123_61_65_setup_port(ds, i); + if (ret < 0) + return ret; + } + + return 0; +} + +static int mv88e6123_61_65_port_to_phy_addr(int port) +{ + if (port >= 0 && port <= 4) + return port; + return -1; +} + +static int +mv88e6123_61_65_phy_read(struct dsa_switch *ds, int port, int regnum) +{ + int addr = mv88e6123_61_65_port_to_phy_addr(port); + return mv88e6xxx_phy_read(ds, addr, regnum); +} + +static int +mv88e6123_61_65_phy_write(struct dsa_switch *ds, + int port, int regnum, u16 val) +{ + int addr = mv88e6123_61_65_port_to_phy_addr(port); + return mv88e6xxx_phy_write(ds, addr, regnum, val); +} + +static struct mv88e6xxx_hw_stat mv88e6123_61_65_hw_stats[] = { + { "in_good_octets", 8, 0x00, }, + { "in_bad_octets", 4, 0x02, }, + { "in_unicast", 4, 0x04, }, + { "in_broadcasts", 4, 0x06, }, + { "in_multicasts", 4, 0x07, }, + { "in_pause", 4, 0x16, }, + { "in_undersize", 4, 0x18, }, + { "in_fragments", 4, 0x19, }, + { "in_oversize", 4, 0x1a, }, + { "in_jabber", 4, 0x1b, }, + { "in_rx_error", 4, 0x1c, }, + { "in_fcs_error", 4, 0x1d, }, + { "out_octets", 8, 0x0e, }, + { "out_unicast", 4, 0x10, }, + { "out_broadcasts", 4, 0x13, }, + { "out_multicasts", 4, 0x12, }, + { "out_pause", 4, 0x15, }, + { "excessive", 4, 0x11, }, + { "collisions", 4, 0x1e, }, + { "deferred", 4, 0x05, }, + { 
"single", 4, 0x14, }, + { "multiple", 4, 0x17, }, + { "out_fcs_error", 4, 0x03, }, + { "late", 4, 0x1f, }, + { "hist_64bytes", 4, 0x08, }, + { "hist_65_127bytes", 4, 0x09, }, + { "hist_128_255bytes", 4, 0x0a, }, + { "hist_256_511bytes", 4, 0x0b, }, + { "hist_512_1023bytes", 4, 0x0c, }, + { "hist_1024_max_bytes", 4, 0x0d, }, +}; + +static void +mv88e6123_61_65_get_strings(struct dsa_switch *ds, int port, uint8_t *data) +{ + mv88e6xxx_get_strings(ds, ARRAY_SIZE(mv88e6123_61_65_hw_stats), + mv88e6123_61_65_hw_stats, port, data); +} + +static void +mv88e6123_61_65_get_ethtool_stats(struct dsa_switch *ds, + int port, uint64_t *data) +{ + mv88e6xxx_get_ethtool_stats(ds, ARRAY_SIZE(mv88e6123_61_65_hw_stats), + mv88e6123_61_65_hw_stats, port, data); +} + +static int mv88e6123_61_65_get_sset_count(struct dsa_switch *ds) +{ + return ARRAY_SIZE(mv88e6123_61_65_hw_stats); +} + +static struct dsa_switch_driver mv88e6123_61_65_switch_driver = { + .tag_protocol = cpu_to_be16(ETH_P_EDSA), + .priv_size = sizeof(struct mv88e6xxx_priv_state), + .probe = mv88e6123_61_65_probe, + .setup = mv88e6123_61_65_setup, + .set_addr = mv88e6xxx_set_addr_indirect, + .phy_read = mv88e6123_61_65_phy_read, + .phy_write = mv88e6123_61_65_phy_write, + .poll_link = mv88e6xxx_poll_link, + .get_strings = mv88e6123_61_65_get_strings, + .get_ethtool_stats = mv88e6123_61_65_get_ethtool_stats, + .get_sset_count = mv88e6123_61_65_get_sset_count, +}; + +static int __init mv88e6123_61_65_init(void) +{ + register_switch_driver(&mv88e6123_61_65_switch_driver); + return 0; +} +module_init(mv88e6123_61_65_init); + +static void __exit mv88e6123_61_65_cleanup(void) +{ + unregister_switch_driver(&mv88e6123_61_65_switch_driver); +} +module_exit(mv88e6123_61_65_cleanup); diff --git a/net/dsa/mv88e6131.c b/net/dsa/mv88e6131.c new file mode 100644 index 00000000..45f7411e --- /dev/null +++ b/net/dsa/mv88e6131.c @@ -0,0 +1,439 @@ +/* + * net/dsa/mv88e6131.c - Marvell 88e6095/6095f/6131 switch chip support + * Copyright (c) 2008-2009 Marvell Semiconductor + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include "dsa_priv.h" +#include "mv88e6xxx.h" + +/* + * Switch product IDs + */ +#define ID_6085 0x04a0 +#define ID_6095 0x0950 +#define ID_6131 0x1060 + +static char *mv88e6131_probe(struct mii_bus *bus, int sw_addr) +{ + int ret; + + ret = __mv88e6xxx_reg_read(bus, sw_addr, REG_PORT(0), 0x03); + if (ret >= 0) { + ret &= 0xfff0; + if (ret == ID_6085) + return "Marvell 88E6085"; + if (ret == ID_6095) + return "Marvell 88E6095/88E6095F"; + if (ret == ID_6131) + return "Marvell 88E6131"; + } + + return NULL; +} + +static int mv88e6131_switch_reset(struct dsa_switch *ds) +{ + int i; + int ret; + + /* + * Set all ports to the disabled state. + */ + for (i = 0; i < 11; i++) { + ret = REG_READ(REG_PORT(i), 0x04); + REG_WRITE(REG_PORT(i), 0x04, ret & 0xfffc); + } + + /* + * Wait for transmit queues to drain. + */ + msleep(2); + + /* + * Reset the switch. + */ + REG_WRITE(REG_GLOBAL, 0x04, 0xc400); + + /* + * Wait up to one second for reset to complete. 
+ */ + for (i = 0; i < 1000; i++) { + ret = REG_READ(REG_GLOBAL, 0x00); + if ((ret & 0xc800) == 0xc800) + break; + + msleep(1); + } + if (i == 1000) + return -ETIMEDOUT; + + return 0; +} + +static int mv88e6131_setup_global(struct dsa_switch *ds) +{ + int ret; + int i; + + /* + * Enable the PHY polling unit, don't discard packets with + * excessive collisions, use a weighted fair queueing scheme + * to arbitrate between packet queues, set the maximum frame + * size to 1632, and mask all interrupt sources. + */ + REG_WRITE(REG_GLOBAL, 0x04, 0x4400); + + /* + * Set the default address aging time to 5 minutes, and + * enable address learn messages to be sent to all message + * ports. + */ + REG_WRITE(REG_GLOBAL, 0x0a, 0x0148); + + /* + * Configure the priority mapping registers. + */ + ret = mv88e6xxx_config_prio(ds); + if (ret < 0) + return ret; + + /* + * Set the VLAN ethertype to 0x8100. + */ + REG_WRITE(REG_GLOBAL, 0x19, 0x8100); + + /* + * Disable ARP mirroring, and configure the upstream port as + * the port to which ingress and egress monitor frames are to + * be sent. + */ + REG_WRITE(REG_GLOBAL, 0x1a, (dsa_upstream_port(ds) * 0x1100) | 0x00f0); + + /* + * Disable cascade port functionality, and set the switch's + * DSA device number. + */ + REG_WRITE(REG_GLOBAL, 0x1c, 0xe000 | (ds->index & 0x1f)); + + /* + * Send all frames with destination addresses matching + * 01:80:c2:00:00:0x to the CPU port. + */ + REG_WRITE(REG_GLOBAL2, 0x03, 0xffff); + + /* + * Ignore removed tag data on doubly tagged packets, disable + * flow control messages, force flow control priority to the + * highest, and send all special multicast frames to the CPU + * port at the highest priority. + */ + REG_WRITE(REG_GLOBAL2, 0x05, 0x00ff); + + /* + * Program the DSA routing table. + */ + for (i = 0; i < 32; i++) { + int nexthop; + + nexthop = 0x1f; + if (i != ds->index && i < ds->dst->pd->nr_chips) + nexthop = ds->pd->rtable[i] & 0x1f; + + REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | nexthop); + } + + /* + * Clear all trunk masks. + */ + for (i = 0; i < 8; i++) + REG_WRITE(REG_GLOBAL2, 0x07, 0x8000 | (i << 12) | 0x7ff); + + /* + * Clear all trunk mappings. + */ + for (i = 0; i < 16; i++) + REG_WRITE(REG_GLOBAL2, 0x08, 0x8000 | (i << 11)); + + /* + * Force the priority of IGMP/MLD snoop frames and ARP frames + * to the highest setting. + */ + REG_WRITE(REG_GLOBAL2, 0x0f, 0x00ff); + + return 0; +} + +static int mv88e6131_setup_port(struct dsa_switch *ds, int p) +{ + struct mv88e6xxx_priv_state *ps = (void *)(ds + 1); + int addr = REG_PORT(p); + u16 val; + + /* + * MAC Forcing register: don't force link, speed, duplex + * or flow control state to any particular values on physical + * ports, but force the CPU port and all DSA ports to 1000 Mb/s + * (100 Mb/s on 6085) full duplex. + */ + if (dsa_is_cpu_port(ds, p) || ds->dsa_port_mask & (1 << p)) + if (ps->id == ID_6085) + REG_WRITE(addr, 0x01, 0x003d); /* 100 Mb/s */ + else + REG_WRITE(addr, 0x01, 0x003e); /* 1000 Mb/s */ + else + REG_WRITE(addr, 0x01, 0x0003); + + /* + * Port Control: disable Core Tag, disable Drop-on-Lock, + * transmit frames unmodified, disable Header mode, + * enable IGMP/MLD snoop, disable DoubleTag, disable VLAN + * tunneling, determine priority by looking at 802.1p and + * IP priority fields (IP prio has precedence), and set STP + * state to Forwarding. + * + * If this is the upstream port for this switch, enable + * forwarding of unknown unicasts, and enable DSA tagging + * mode. 
+ * + * If this is the link to another switch, use DSA tagging + * mode, but do not enable forwarding of unknown unicasts. + */ + val = 0x0433; + if (p == dsa_upstream_port(ds)) { + val |= 0x0104; + /* + * On 6085, unknown multicast forward is controlled + * here rather than in Port Control 2 register. + */ + if (ps->id == ID_6085) + val |= 0x0008; + } + if (ds->dsa_port_mask & (1 << p)) + val |= 0x0100; + REG_WRITE(addr, 0x04, val); + + /* + * Port Control 1: disable trunking. Also, if this is the + * CPU port, enable learn messages to be sent to this port. + */ + REG_WRITE(addr, 0x05, dsa_is_cpu_port(ds, p) ? 0x8000 : 0x0000); + + /* + * Port based VLAN map: give each port its own address + * database, allow the CPU port to talk to each of the 'real' + * ports, and allow each of the 'real' ports to only talk to + * the upstream port. + */ + val = (p & 0xf) << 12; + if (dsa_is_cpu_port(ds, p)) + val |= ds->phys_port_mask; + else + val |= 1 << dsa_upstream_port(ds); + REG_WRITE(addr, 0x06, val); + + /* + * Default VLAN ID and priority: don't set a default VLAN + * ID, and set the default packet priority to zero. + */ + REG_WRITE(addr, 0x07, 0x0000); + + /* + * Port Control 2: don't force a good FCS, don't use + * VLAN-based, source address-based or destination + * address-based priority overrides, don't let the switch + * add or strip 802.1q tags, don't discard tagged or + * untagged frames on this port, do a destination address + * lookup on received packets as usual, don't send a copy + * of all transmitted/received frames on this port to the + * CPU, and configure the upstream port number. + * + * If this is the upstream port for this switch, enable + * forwarding of unknown multicast addresses. + */ + if (ps->id == ID_6085) + /* + * on 6085, bits 3:0 are reserved, bit 6 control ARP + * mirroring, and multicast forward is handled in + * Port Control register. + */ + REG_WRITE(addr, 0x08, 0x0080); + else { + val = 0x0080 | dsa_upstream_port(ds); + if (p == dsa_upstream_port(ds)) + val |= 0x0040; + REG_WRITE(addr, 0x08, val); + } + + /* + * Rate Control: disable ingress rate limiting. + */ + REG_WRITE(addr, 0x09, 0x0000); + + /* + * Rate Control 2: disable egress rate limiting. + */ + REG_WRITE(addr, 0x0a, 0x0000); + + /* + * Port Association Vector: when learning source addresses + * of packets, add the address to the address database using + * a port bitmap that has only the bit for this port set and + * the other bits clear. + */ + REG_WRITE(addr, 0x0b, 1 << p); + + /* + * Tag Remap: use an identity 802.1p prio -> switch prio + * mapping. + */ + REG_WRITE(addr, 0x18, 0x3210); + + /* + * Tag Remap 2: use an identity 802.1p prio -> switch prio + * mapping. 
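/*
 * Editor's illustration (not part of the patch): the port-based VLAN map
 * written earlier in mv88e6131_setup_port().  The upper nibble selects a
 * per-port address database and the low bits are the bitmap of ports this
 * port may forward to: the CPU port can reach every user port, while a
 * user port may only reach the upstream port.  The port layout used below
 * is hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t port_vlan_map(int port, int is_cpu, int upstream,
			      uint16_t phys_port_mask)
{
	uint16_t val = (port & 0xf) << 12;	/* private address database */

	if (is_cpu)
		val |= phys_port_mask;		/* CPU talks to all user ports */
	else
		val |= 1 << upstream;		/* user port talks only upstream */
	return val;
}

int main(void)
{
	/* assume user ports 0-3 and CPU port 8 */
	printf("port 2: 0x%04x\n", port_vlan_map(2, 0, 8, 0x000f));	/* 0x2100 */
	printf("port 8: 0x%04x\n", port_vlan_map(8, 1, 8, 0x000f));	/* 0x800f */
	return 0;
}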
+ */ + REG_WRITE(addr, 0x19, 0x7654); + + return 0; +} + +static int mv88e6131_setup(struct dsa_switch *ds) +{ + struct mv88e6xxx_priv_state *ps = (void *)(ds + 1); + int i; + int ret; + + mutex_init(&ps->smi_mutex); + mv88e6xxx_ppu_state_init(ds); + mutex_init(&ps->stats_mutex); + + ps->id = REG_READ(REG_PORT(0), 0x03) & 0xfff0; + + ret = mv88e6131_switch_reset(ds); + if (ret < 0) + return ret; + + /* @@@ initialise vtu and atu */ + + ret = mv88e6131_setup_global(ds); + if (ret < 0) + return ret; + + for (i = 0; i < 11; i++) { + ret = mv88e6131_setup_port(ds, i); + if (ret < 0) + return ret; + } + + return 0; +} + +static int mv88e6131_port_to_phy_addr(int port) +{ + if (port >= 0 && port <= 11) + return port; + return -1; +} + +static int +mv88e6131_phy_read(struct dsa_switch *ds, int port, int regnum) +{ + int addr = mv88e6131_port_to_phy_addr(port); + return mv88e6xxx_phy_read_ppu(ds, addr, regnum); +} + +static int +mv88e6131_phy_write(struct dsa_switch *ds, + int port, int regnum, u16 val) +{ + int addr = mv88e6131_port_to_phy_addr(port); + return mv88e6xxx_phy_write_ppu(ds, addr, regnum, val); +} + +static struct mv88e6xxx_hw_stat mv88e6131_hw_stats[] = { + { "in_good_octets", 8, 0x00, }, + { "in_bad_octets", 4, 0x02, }, + { "in_unicast", 4, 0x04, }, + { "in_broadcasts", 4, 0x06, }, + { "in_multicasts", 4, 0x07, }, + { "in_pause", 4, 0x16, }, + { "in_undersize", 4, 0x18, }, + { "in_fragments", 4, 0x19, }, + { "in_oversize", 4, 0x1a, }, + { "in_jabber", 4, 0x1b, }, + { "in_rx_error", 4, 0x1c, }, + { "in_fcs_error", 4, 0x1d, }, + { "out_octets", 8, 0x0e, }, + { "out_unicast", 4, 0x10, }, + { "out_broadcasts", 4, 0x13, }, + { "out_multicasts", 4, 0x12, }, + { "out_pause", 4, 0x15, }, + { "excessive", 4, 0x11, }, + { "collisions", 4, 0x1e, }, + { "deferred", 4, 0x05, }, + { "single", 4, 0x14, }, + { "multiple", 4, 0x17, }, + { "out_fcs_error", 4, 0x03, }, + { "late", 4, 0x1f, }, + { "hist_64bytes", 4, 0x08, }, + { "hist_65_127bytes", 4, 0x09, }, + { "hist_128_255bytes", 4, 0x0a, }, + { "hist_256_511bytes", 4, 0x0b, }, + { "hist_512_1023bytes", 4, 0x0c, }, + { "hist_1024_max_bytes", 4, 0x0d, }, +}; + +static void +mv88e6131_get_strings(struct dsa_switch *ds, int port, uint8_t *data) +{ + mv88e6xxx_get_strings(ds, ARRAY_SIZE(mv88e6131_hw_stats), + mv88e6131_hw_stats, port, data); +} + +static void +mv88e6131_get_ethtool_stats(struct dsa_switch *ds, + int port, uint64_t *data) +{ + mv88e6xxx_get_ethtool_stats(ds, ARRAY_SIZE(mv88e6131_hw_stats), + mv88e6131_hw_stats, port, data); +} + +static int mv88e6131_get_sset_count(struct dsa_switch *ds) +{ + return ARRAY_SIZE(mv88e6131_hw_stats); +} + +static struct dsa_switch_driver mv88e6131_switch_driver = { + .tag_protocol = cpu_to_be16(ETH_P_DSA), + .priv_size = sizeof(struct mv88e6xxx_priv_state), + .probe = mv88e6131_probe, + .setup = mv88e6131_setup, + .set_addr = mv88e6xxx_set_addr_direct, + .phy_read = mv88e6131_phy_read, + .phy_write = mv88e6131_phy_write, + .poll_link = mv88e6xxx_poll_link, + .get_strings = mv88e6131_get_strings, + .get_ethtool_stats = mv88e6131_get_ethtool_stats, + .get_sset_count = mv88e6131_get_sset_count, +}; + +static int __init mv88e6131_init(void) +{ + register_switch_driver(&mv88e6131_switch_driver); + return 0; +} +module_init(mv88e6131_init); + +static void __exit mv88e6131_cleanup(void) +{ + unregister_switch_driver(&mv88e6131_switch_driver); +} +module_exit(mv88e6131_cleanup); diff --git a/net/dsa/mv88e6xxx.c b/net/dsa/mv88e6xxx.c new file mode 100644 index 00000000..efe661a9 --- /dev/null +++ 
b/net/dsa/mv88e6xxx.c @@ -0,0 +1,522 @@ +/* + * net/dsa/mv88e6xxx.c - Marvell 88e6xxx switch chip support + * Copyright (c) 2008 Marvell Semiconductor + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include "dsa_priv.h" +#include "mv88e6xxx.h" + +/* + * If the switch's ADDR[4:0] strap pins are strapped to zero, it will + * use all 32 SMI bus addresses on its SMI bus, and all switch registers + * will be directly accessible on some {device address,register address} + * pair. If the ADDR[4:0] pins are not strapped to zero, the switch + * will only respond to SMI transactions to that specific address, and + * an indirect addressing mechanism needs to be used to access its + * registers. + */ +static int mv88e6xxx_reg_wait_ready(struct mii_bus *bus, int sw_addr) +{ + int ret; + int i; + + for (i = 0; i < 16; i++) { + ret = mdiobus_read(bus, sw_addr, 0); + if (ret < 0) + return ret; + + if ((ret & 0x8000) == 0) + return 0; + } + + return -ETIMEDOUT; +} + +int __mv88e6xxx_reg_read(struct mii_bus *bus, int sw_addr, int addr, int reg) +{ + int ret; + + if (sw_addr == 0) + return mdiobus_read(bus, addr, reg); + + /* + * Wait for the bus to become free. + */ + ret = mv88e6xxx_reg_wait_ready(bus, sw_addr); + if (ret < 0) + return ret; + + /* + * Transmit the read command. + */ + ret = mdiobus_write(bus, sw_addr, 0, 0x9800 | (addr << 5) | reg); + if (ret < 0) + return ret; + + /* + * Wait for the read command to complete. + */ + ret = mv88e6xxx_reg_wait_ready(bus, sw_addr); + if (ret < 0) + return ret; + + /* + * Read the data. + */ + ret = mdiobus_read(bus, sw_addr, 1); + if (ret < 0) + return ret; + + return ret & 0xffff; +} + +int mv88e6xxx_reg_read(struct dsa_switch *ds, int addr, int reg) +{ + struct mv88e6xxx_priv_state *ps = (void *)(ds + 1); + int ret; + + mutex_lock(&ps->smi_mutex); + ret = __mv88e6xxx_reg_read(ds->master_mii_bus, + ds->pd->sw_addr, addr, reg); + mutex_unlock(&ps->smi_mutex); + + return ret; +} + +int __mv88e6xxx_reg_write(struct mii_bus *bus, int sw_addr, int addr, + int reg, u16 val) +{ + int ret; + + if (sw_addr == 0) + return mdiobus_write(bus, addr, reg, val); + + /* + * Wait for the bus to become free. + */ + ret = mv88e6xxx_reg_wait_ready(bus, sw_addr); + if (ret < 0) + return ret; + + /* + * Transmit the data to write. + */ + ret = mdiobus_write(bus, sw_addr, 1, val); + if (ret < 0) + return ret; + + /* + * Transmit the write command. + */ + ret = mdiobus_write(bus, sw_addr, 0, 0x9400 | (addr << 5) | reg); + if (ret < 0) + return ret; + + /* + * Wait for the write command to complete. + */ + ret = mv88e6xxx_reg_wait_ready(bus, sw_addr); + if (ret < 0) + return ret; + + return 0; +} + +int mv88e6xxx_reg_write(struct dsa_switch *ds, int addr, int reg, u16 val) +{ + struct mv88e6xxx_priv_state *ps = (void *)(ds + 1); + int ret; + + mutex_lock(&ps->smi_mutex); + ret = __mv88e6xxx_reg_write(ds->master_mii_bus, + ds->pd->sw_addr, addr, reg, val); + mutex_unlock(&ps->smi_mutex); + + return ret; +} + +int mv88e6xxx_config_prio(struct dsa_switch *ds) +{ + /* + * Configure the IP ToS mapping registers. 
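/*
 * Editor's illustration (not part of the patch): in multi-chip mode the
 * indirect access helpers above reach every switch register through just
 * two registers at the switch's own SMI address -- a command register (0)
 * and a data register (1).  The command word sets the busy bit (bit 15),
 * an opcode (0x1800 for read, 0x1400 for write, matching the 0x9800 and
 * 0x9400 constants above), the device address in bits 9:5 and the
 * register number in bits 4:0.
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t smi_cmd(uint16_t opcode, unsigned int dev, unsigned int reg)
{
	return 0x8000 | opcode | ((dev & 0x1f) << 5) | (reg & 0x1f);
}

int main(void)
{
	/* read port 0's status register (device 0x10, register 0x00) */
	printf("read:  0x%04x\n", smi_cmd(0x1800, 0x10, 0x00));	/* 0x9a00 */
	/* write the global control register (device 0x1b, register 0x04) */
	printf("write: 0x%04x\n", smi_cmd(0x1400, 0x1b, 0x04));	/* 0x9764 */
	return 0;
}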
+ */ + REG_WRITE(REG_GLOBAL, 0x10, 0x0000); + REG_WRITE(REG_GLOBAL, 0x11, 0x0000); + REG_WRITE(REG_GLOBAL, 0x12, 0x5555); + REG_WRITE(REG_GLOBAL, 0x13, 0x5555); + REG_WRITE(REG_GLOBAL, 0x14, 0xaaaa); + REG_WRITE(REG_GLOBAL, 0x15, 0xaaaa); + REG_WRITE(REG_GLOBAL, 0x16, 0xffff); + REG_WRITE(REG_GLOBAL, 0x17, 0xffff); + + /* + * Configure the IEEE 802.1p priority mapping register. + */ + REG_WRITE(REG_GLOBAL, 0x18, 0xfa41); + + return 0; +} + +int mv88e6xxx_set_addr_direct(struct dsa_switch *ds, u8 *addr) +{ + REG_WRITE(REG_GLOBAL, 0x01, (addr[0] << 8) | addr[1]); + REG_WRITE(REG_GLOBAL, 0x02, (addr[2] << 8) | addr[3]); + REG_WRITE(REG_GLOBAL, 0x03, (addr[4] << 8) | addr[5]); + + return 0; +} + +int mv88e6xxx_set_addr_indirect(struct dsa_switch *ds, u8 *addr) +{ + int i; + int ret; + + for (i = 0; i < 6; i++) { + int j; + + /* + * Write the MAC address byte. + */ + REG_WRITE(REG_GLOBAL2, 0x0d, 0x8000 | (i << 8) | addr[i]); + + /* + * Wait for the write to complete. + */ + for (j = 0; j < 16; j++) { + ret = REG_READ(REG_GLOBAL2, 0x0d); + if ((ret & 0x8000) == 0) + break; + } + if (j == 16) + return -ETIMEDOUT; + } + + return 0; +} + +int mv88e6xxx_phy_read(struct dsa_switch *ds, int addr, int regnum) +{ + if (addr >= 0) + return mv88e6xxx_reg_read(ds, addr, regnum); + return 0xffff; +} + +int mv88e6xxx_phy_write(struct dsa_switch *ds, int addr, int regnum, u16 val) +{ + if (addr >= 0) + return mv88e6xxx_reg_write(ds, addr, regnum, val); + return 0; +} + +#ifdef CONFIG_NET_DSA_MV88E6XXX_NEED_PPU +static int mv88e6xxx_ppu_disable(struct dsa_switch *ds) +{ + int ret; + int i; + + ret = REG_READ(REG_GLOBAL, 0x04); + REG_WRITE(REG_GLOBAL, 0x04, ret & ~0x4000); + + for (i = 0; i < 1000; i++) { + ret = REG_READ(REG_GLOBAL, 0x00); + msleep(1); + if ((ret & 0xc000) != 0xc000) + return 0; + } + + return -ETIMEDOUT; +} + +static int mv88e6xxx_ppu_enable(struct dsa_switch *ds) +{ + int ret; + int i; + + ret = REG_READ(REG_GLOBAL, 0x04); + REG_WRITE(REG_GLOBAL, 0x04, ret | 0x4000); + + for (i = 0; i < 1000; i++) { + ret = REG_READ(REG_GLOBAL, 0x00); + msleep(1); + if ((ret & 0xc000) == 0xc000) + return 0; + } + + return -ETIMEDOUT; +} + +static void mv88e6xxx_ppu_reenable_work(struct work_struct *ugly) +{ + struct mv88e6xxx_priv_state *ps; + + ps = container_of(ugly, struct mv88e6xxx_priv_state, ppu_work); + if (mutex_trylock(&ps->ppu_mutex)) { + struct dsa_switch *ds = ((struct dsa_switch *)ps) - 1; + + if (mv88e6xxx_ppu_enable(ds) == 0) + ps->ppu_disabled = 0; + mutex_unlock(&ps->ppu_mutex); + } +} + +static void mv88e6xxx_ppu_reenable_timer(unsigned long _ps) +{ + struct mv88e6xxx_priv_state *ps = (void *)_ps; + + schedule_work(&ps->ppu_work); +} + +static int mv88e6xxx_ppu_access_get(struct dsa_switch *ds) +{ + struct mv88e6xxx_priv_state *ps = (void *)(ds + 1); + int ret; + + mutex_lock(&ps->ppu_mutex); + + /* + * If the PHY polling unit is enabled, disable it so that + * we can access the PHY registers. If it was already + * disabled, cancel the timer that is going to re-enable + * it. + */ + if (!ps->ppu_disabled) { + ret = mv88e6xxx_ppu_disable(ds); + if (ret < 0) { + mutex_unlock(&ps->ppu_mutex); + return ret; + } + ps->ppu_disabled = 1; + } else { + del_timer(&ps->ppu_timer); + ret = 0; + } + + return ret; +} + +static void mv88e6xxx_ppu_access_put(struct dsa_switch *ds) +{ + struct mv88e6xxx_priv_state *ps = (void *)(ds + 1); + + /* + * Schedule a timer to re-enable the PHY polling unit. 
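/*
 * Editor's illustration (not part of the patch): the per-byte command
 * used by mv88e6xxx_set_addr_indirect() above.  Each write to Global2
 * register 0x0d sets the busy/update flag in bit 15, shifts the byte
 * index (0..5) left by eight and carries the MAC address byte itself in
 * the low eight bits; the loop then polls until the switch clears bit 15.
 * The MAC address below is only an example.
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t switch_mac_cmd(unsigned int byte_index, uint8_t mac_byte)
{
	return 0x8000 | (byte_index << 8) | mac_byte;
}

int main(void)
{
	const uint8_t mac[6] = { 0x00, 0x50, 0x43, 0x12, 0x34, 0x56 };
	int i;

	for (i = 0; i < 6; i++)
		printf("byte %d -> 0x%04x\n", i, switch_mac_cmd(i, mac[i]));
	return 0;
}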
+ */ + mod_timer(&ps->ppu_timer, jiffies + msecs_to_jiffies(10)); + mutex_unlock(&ps->ppu_mutex); +} + +void mv88e6xxx_ppu_state_init(struct dsa_switch *ds) +{ + struct mv88e6xxx_priv_state *ps = (void *)(ds + 1); + + mutex_init(&ps->ppu_mutex); + INIT_WORK(&ps->ppu_work, mv88e6xxx_ppu_reenable_work); + init_timer(&ps->ppu_timer); + ps->ppu_timer.data = (unsigned long)ps; + ps->ppu_timer.function = mv88e6xxx_ppu_reenable_timer; +} + +int mv88e6xxx_phy_read_ppu(struct dsa_switch *ds, int addr, int regnum) +{ + int ret; + + ret = mv88e6xxx_ppu_access_get(ds); + if (ret >= 0) { + ret = mv88e6xxx_reg_read(ds, addr, regnum); + mv88e6xxx_ppu_access_put(ds); + } + + return ret; +} + +int mv88e6xxx_phy_write_ppu(struct dsa_switch *ds, int addr, + int regnum, u16 val) +{ + int ret; + + ret = mv88e6xxx_ppu_access_get(ds); + if (ret >= 0) { + ret = mv88e6xxx_reg_write(ds, addr, regnum, val); + mv88e6xxx_ppu_access_put(ds); + } + + return ret; +} +#endif + +void mv88e6xxx_poll_link(struct dsa_switch *ds) +{ + int i; + + for (i = 0; i < DSA_MAX_PORTS; i++) { + struct net_device *dev; + int uninitialized_var(port_status); + int link; + int speed; + int duplex; + int fc; + + dev = ds->ports[i]; + if (dev == NULL) + continue; + + link = 0; + if (dev->flags & IFF_UP) { + port_status = mv88e6xxx_reg_read(ds, REG_PORT(i), 0x00); + if (port_status < 0) + continue; + + link = !!(port_status & 0x0800); + } + + if (!link) { + if (netif_carrier_ok(dev)) { + printk(KERN_INFO "%s: link down\n", dev->name); + netif_carrier_off(dev); + } + continue; + } + + switch (port_status & 0x0300) { + case 0x0000: + speed = 10; + break; + case 0x0100: + speed = 100; + break; + case 0x0200: + speed = 1000; + break; + default: + speed = -1; + break; + } + duplex = (port_status & 0x0400) ? 1 : 0; + fc = (port_status & 0x8000) ? 1 : 0; + + if (!netif_carrier_ok(dev)) { + printk(KERN_INFO "%s: link up, %d Mb/s, %s duplex, " + "flow control %sabled\n", dev->name, + speed, duplex ? "full" : "half", + fc ? "en" : "dis"); + netif_carrier_on(dev); + } + } +} + +static int mv88e6xxx_stats_wait(struct dsa_switch *ds) +{ + int ret; + int i; + + for (i = 0; i < 10; i++) { + ret = REG_READ(REG_GLOBAL, 0x1d); + if ((ret & 0x8000) == 0) + return 0; + } + + return -ETIMEDOUT; +} + +static int mv88e6xxx_stats_snapshot(struct dsa_switch *ds, int port) +{ + int ret; + + /* + * Snapshot the hardware statistics counters for this port. + */ + REG_WRITE(REG_GLOBAL, 0x1d, 0xdc00 | port); + + /* + * Wait for the snapshotting to complete. 
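/*
 * Editor's illustration (not part of the patch): the port status word
 * decoded by mv88e6xxx_poll_link() above.  Bit 11 reports link, bits 9:8
 * the speed, bit 10 duplex and bit 15 flow control -- the same masks the
 * function uses.  The status value below is a made-up example.
 */
#include <stdio.h>

struct link_state {
	int link, speed, full_duplex, flow_control;
};

static struct link_state decode_port_status(unsigned int status)
{
	static const int speeds[4] = { 10, 100, 1000, -1 };
	struct link_state st;

	st.link = !!(status & 0x0800);
	st.speed = speeds[(status & 0x0300) >> 8];
	st.full_duplex = !!(status & 0x0400);
	st.flow_control = !!(status & 0x8000);
	return st;
}

int main(void)
{
	struct link_state st = decode_port_status(0x8e00);

	printf("link %d, %d Mb/s, %s duplex, flow control %sabled\n",
	       st.link, st.speed, st.full_duplex ? "full" : "half",
	       st.flow_control ? "en" : "dis");
	return 0;
}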
+ */ + ret = mv88e6xxx_stats_wait(ds); + if (ret < 0) + return ret; + + return 0; +} + +static void mv88e6xxx_stats_read(struct dsa_switch *ds, int stat, u32 *val) +{ + u32 _val; + int ret; + + *val = 0; + + ret = mv88e6xxx_reg_write(ds, REG_GLOBAL, 0x1d, 0xcc00 | stat); + if (ret < 0) + return; + + ret = mv88e6xxx_stats_wait(ds); + if (ret < 0) + return; + + ret = mv88e6xxx_reg_read(ds, REG_GLOBAL, 0x1e); + if (ret < 0) + return; + + _val = ret << 16; + + ret = mv88e6xxx_reg_read(ds, REG_GLOBAL, 0x1f); + if (ret < 0) + return; + + *val = _val | ret; +} + +void mv88e6xxx_get_strings(struct dsa_switch *ds, + int nr_stats, struct mv88e6xxx_hw_stat *stats, + int port, uint8_t *data) +{ + int i; + + for (i = 0; i < nr_stats; i++) { + memcpy(data + i * ETH_GSTRING_LEN, + stats[i].string, ETH_GSTRING_LEN); + } +} + +void mv88e6xxx_get_ethtool_stats(struct dsa_switch *ds, + int nr_stats, struct mv88e6xxx_hw_stat *stats, + int port, uint64_t *data) +{ + struct mv88e6xxx_priv_state *ps = (void *)(ds + 1); + int ret; + int i; + + mutex_lock(&ps->stats_mutex); + + ret = mv88e6xxx_stats_snapshot(ds, port); + if (ret < 0) { + mutex_unlock(&ps->stats_mutex); + return; + } + + /* + * Read each of the counters. + */ + for (i = 0; i < nr_stats; i++) { + struct mv88e6xxx_hw_stat *s = stats + i; + u32 low; + u32 high; + + mv88e6xxx_stats_read(ds, s->reg, &low); + if (s->sizeof_stat == 8) + mv88e6xxx_stats_read(ds, s->reg + 1, &high); + else + high = 0; + + data[i] = (((u64)high) << 32) | low; + } + + mutex_unlock(&ps->stats_mutex); +} diff --git a/net/dsa/mv88e6xxx.h b/net/dsa/mv88e6xxx.h new file mode 100644 index 00000000..61156ca2 --- /dev/null +++ b/net/dsa/mv88e6xxx.h @@ -0,0 +1,95 @@ +/* + * net/dsa/mv88e6xxx.h - Marvell 88e6xxx switch chip support + * Copyright (c) 2008 Marvell Semiconductor + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef __MV88E6XXX_H +#define __MV88E6XXX_H + +#define REG_PORT(p) (0x10 + (p)) +#define REG_GLOBAL 0x1b +#define REG_GLOBAL2 0x1c + +struct mv88e6xxx_priv_state { + /* + * When using multi-chip addressing, this mutex protects + * access to the indirect access registers. (In single-chip + * mode, this mutex is effectively useless.) + */ + struct mutex smi_mutex; + +#ifdef CONFIG_NET_DSA_MV88E6XXX_NEED_PPU + /* + * Handles automatic disabling and re-enabling of the PHY + * polling unit. + */ + struct mutex ppu_mutex; + int ppu_disabled; + struct work_struct ppu_work; + struct timer_list ppu_timer; +#endif + + /* + * This mutex serialises access to the statistics unit. + * Hold this mutex over snapshot + dump sequences. 
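/*
 * Editor's illustration (not part of the patch): how one ethtool counter
 * is assembled in mv88e6xxx_get_ethtool_stats() above.  Most counters are
 * a single 32-bit register; the two octet counters are 64 bits wide and
 * occupy two consecutive counter registers, low word first.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t assemble_counter(uint32_t low, uint32_t high, int sizeof_stat)
{
	if (sizeof_stat != 8)
		high = 0;			/* plain 32-bit counter */
	return ((uint64_t)high << 32) | low;
}

int main(void)
{
	/* "in_good_octets" spans counter registers 0x00 (low) and 0x01 (high) */
	printf("%llu\n", (unsigned long long)assemble_counter(0x89abcdefu, 1, 8));
	/* "in_unicast" is a single 32-bit register */
	printf("%llu\n", (unsigned long long)assemble_counter(12345, 0, 4));
	return 0;
}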
+ */ + struct mutex stats_mutex; + + int id; /* switch product id */ +}; + +struct mv88e6xxx_hw_stat { + char string[ETH_GSTRING_LEN]; + int sizeof_stat; + int reg; +}; + +int __mv88e6xxx_reg_read(struct mii_bus *bus, int sw_addr, int addr, int reg); +int mv88e6xxx_reg_read(struct dsa_switch *ds, int addr, int reg); +int __mv88e6xxx_reg_write(struct mii_bus *bus, int sw_addr, int addr, + int reg, u16 val); +int mv88e6xxx_reg_write(struct dsa_switch *ds, int addr, int reg, u16 val); +int mv88e6xxx_config_prio(struct dsa_switch *ds); +int mv88e6xxx_set_addr_direct(struct dsa_switch *ds, u8 *addr); +int mv88e6xxx_set_addr_indirect(struct dsa_switch *ds, u8 *addr); +int mv88e6xxx_phy_read(struct dsa_switch *ds, int addr, int regnum); +int mv88e6xxx_phy_write(struct dsa_switch *ds, int addr, int regnum, u16 val); +void mv88e6xxx_ppu_state_init(struct dsa_switch *ds); +int mv88e6xxx_phy_read_ppu(struct dsa_switch *ds, int addr, int regnum); +int mv88e6xxx_phy_write_ppu(struct dsa_switch *ds, int addr, + int regnum, u16 val); +void mv88e6xxx_poll_link(struct dsa_switch *ds); +void mv88e6xxx_get_strings(struct dsa_switch *ds, + int nr_stats, struct mv88e6xxx_hw_stat *stats, + int port, uint8_t *data); +void mv88e6xxx_get_ethtool_stats(struct dsa_switch *ds, + int nr_stats, struct mv88e6xxx_hw_stat *stats, + int port, uint64_t *data); + +#define REG_READ(addr, reg) \ + ({ \ + int __ret; \ + \ + __ret = mv88e6xxx_reg_read(ds, addr, reg); \ + if (__ret < 0) \ + return __ret; \ + __ret; \ + }) + +#define REG_WRITE(addr, reg, val) \ + ({ \ + int __ret; \ + \ + __ret = mv88e6xxx_reg_write(ds, addr, reg, val); \ + if (__ret < 0) \ + return __ret; \ + }) + + + +#endif diff --git a/net/dsa/slave.c b/net/dsa/slave.c new file mode 100644 index 00000000..0a47b6c3 --- /dev/null +++ b/net/dsa/slave.c @@ -0,0 +1,407 @@ +/* + * net/dsa/slave.c - Slave device handling + * Copyright (c) 2008-2009 Marvell Semiconductor + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
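/*
 * Editor's illustration (not part of the patch): the REG_READ()/REG_WRITE()
 * macros defined above use a GCC statement expression that returns from
 * the *calling* function when the register access fails, which is why
 * they may only appear in functions returning int.  A standalone analogue
 * with a stubbed register read (names and values here are invented):
 */
#include <stdio.h>

static int stub_reg_read(int addr, int reg)
{
	return (addr == 0x1b && reg == 0x00) ? 0xc800 : -5;	/* pretend -EIO */
}

#define REG_READ_OR_RETURN(addr, reg)				\
	({							\
		int __ret = stub_reg_read(addr, reg);		\
		if (__ret < 0)					\
			return __ret;				\
		__ret;						\
	})

static int read_global_status(void)
{
	int status = REG_READ_OR_RETURN(0x1b, 0x00);	/* early-returns on error */

	return status;
}

int main(void)
{
	printf("0x%04x\n", read_global_status());
	return 0;
}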
+ */ + +#include +#include +#include +#include +#include "dsa_priv.h" + +/* slave mii_bus handling ***************************************************/ +static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) +{ + struct dsa_switch *ds = bus->priv; + + if (ds->phys_port_mask & (1 << addr)) + return ds->drv->phy_read(ds, addr, reg); + + return 0xffff; +} + +static int dsa_slave_phy_write(struct mii_bus *bus, int addr, int reg, u16 val) +{ + struct dsa_switch *ds = bus->priv; + + if (ds->phys_port_mask & (1 << addr)) + return ds->drv->phy_write(ds, addr, reg, val); + + return 0; +} + +void dsa_slave_mii_bus_init(struct dsa_switch *ds) +{ + ds->slave_mii_bus->priv = (void *)ds; + ds->slave_mii_bus->name = "dsa slave smi"; + ds->slave_mii_bus->read = dsa_slave_phy_read; + ds->slave_mii_bus->write = dsa_slave_phy_write; + snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "%s:%.2x", + ds->master_mii_bus->id, ds->pd->sw_addr); + ds->slave_mii_bus->parent = &ds->master_mii_bus->dev; +} + + +/* slave device handling ****************************************************/ +static int dsa_slave_init(struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + + dev->iflink = p->parent->dst->master_netdev->ifindex; + + return 0; +} + +static int dsa_slave_open(struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct net_device *master = p->parent->dst->master_netdev; + int err; + + if (!(master->flags & IFF_UP)) + return -ENETDOWN; + + if (compare_ether_addr(dev->dev_addr, master->dev_addr)) { + err = dev_uc_add(master, dev->dev_addr); + if (err < 0) + goto out; + } + + if (dev->flags & IFF_ALLMULTI) { + err = dev_set_allmulti(master, 1); + if (err < 0) + goto del_unicast; + } + if (dev->flags & IFF_PROMISC) { + err = dev_set_promiscuity(master, 1); + if (err < 0) + goto clear_allmulti; + } + + return 0; + +clear_allmulti: + if (dev->flags & IFF_ALLMULTI) + dev_set_allmulti(master, -1); +del_unicast: + if (compare_ether_addr(dev->dev_addr, master->dev_addr)) + dev_uc_del(master, dev->dev_addr); +out: + return err; +} + +static int dsa_slave_close(struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct net_device *master = p->parent->dst->master_netdev; + + dev_mc_unsync(master, dev); + dev_uc_unsync(master, dev); + if (dev->flags & IFF_ALLMULTI) + dev_set_allmulti(master, -1); + if (dev->flags & IFF_PROMISC) + dev_set_promiscuity(master, -1); + + if (compare_ether_addr(dev->dev_addr, master->dev_addr)) + dev_uc_del(master, dev->dev_addr); + + return 0; +} + +static void dsa_slave_change_rx_flags(struct net_device *dev, int change) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct net_device *master = p->parent->dst->master_netdev; + + if (change & IFF_ALLMULTI) + dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1); + if (change & IFF_PROMISC) + dev_set_promiscuity(master, dev->flags & IFF_PROMISC ? 
1 : -1); +} + +static void dsa_slave_set_rx_mode(struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct net_device *master = p->parent->dst->master_netdev; + + dev_mc_sync(master, dev); + dev_uc_sync(master, dev); +} + +static int dsa_slave_set_mac_address(struct net_device *dev, void *a) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct net_device *master = p->parent->dst->master_netdev; + struct sockaddr *addr = a; + int err; + + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + + if (!(dev->flags & IFF_UP)) + goto out; + + if (compare_ether_addr(addr->sa_data, master->dev_addr)) { + err = dev_uc_add(master, addr->sa_data); + if (err < 0) + return err; + } + + if (compare_ether_addr(dev->dev_addr, master->dev_addr)) + dev_uc_del(master, dev->dev_addr); + +out: + memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN); + + return 0; +} + +static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + + if (p->phy != NULL) + return phy_mii_ioctl(p->phy, ifr, cmd); + + return -EOPNOTSUPP; +} + + +/* ethtool operations *******************************************************/ +static int +dsa_slave_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + int err; + + err = -EOPNOTSUPP; + if (p->phy != NULL) { + err = phy_read_status(p->phy); + if (err == 0) + err = phy_ethtool_gset(p->phy, cmd); + } + + return err; +} + +static int +dsa_slave_set_settings(struct net_device *dev, struct ethtool_cmd *cmd) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + + if (p->phy != NULL) + return phy_ethtool_sset(p->phy, cmd); + + return -EOPNOTSUPP; +} + +static void dsa_slave_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *drvinfo) +{ + strncpy(drvinfo->driver, "dsa", 32); + strncpy(drvinfo->version, dsa_driver_version, 32); + strncpy(drvinfo->fw_version, "N/A", 32); + strncpy(drvinfo->bus_info, "platform", 32); +} + +static int dsa_slave_nway_reset(struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + + if (p->phy != NULL) + return genphy_restart_aneg(p->phy); + + return -EOPNOTSUPP; +} + +static u32 dsa_slave_get_link(struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + + if (p->phy != NULL) { + genphy_update_link(p->phy); + return p->phy->link; + } + + return -EOPNOTSUPP; +} + +static void dsa_slave_get_strings(struct net_device *dev, + uint32_t stringset, uint8_t *data) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + if (stringset == ETH_SS_STATS) { + int len = ETH_GSTRING_LEN; + + strncpy(data, "tx_packets", len); + strncpy(data + len, "tx_bytes", len); + strncpy(data + 2 * len, "rx_packets", len); + strncpy(data + 3 * len, "rx_bytes", len); + if (ds->drv->get_strings != NULL) + ds->drv->get_strings(ds, p->port, data + 4 * len); + } +} + +static void dsa_slave_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, + uint64_t *data) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + data[0] = p->dev->stats.tx_packets; + data[1] = p->dev->stats.tx_bytes; + data[2] = p->dev->stats.rx_packets; + data[3] = p->dev->stats.rx_bytes; + if (ds->drv->get_ethtool_stats != NULL) + ds->drv->get_ethtool_stats(ds, p->port, data + 4); +} + +static int dsa_slave_get_sset_count(struct net_device *dev, int sset) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = 
p->parent; + + if (sset == ETH_SS_STATS) { + int count; + + count = 4; + if (ds->drv->get_sset_count != NULL) + count += ds->drv->get_sset_count(ds); + + return count; + } + + return -EOPNOTSUPP; +} + +static const struct ethtool_ops dsa_slave_ethtool_ops = { + .get_settings = dsa_slave_get_settings, + .set_settings = dsa_slave_set_settings, + .get_drvinfo = dsa_slave_get_drvinfo, + .nway_reset = dsa_slave_nway_reset, + .get_link = dsa_slave_get_link, + .get_strings = dsa_slave_get_strings, + .get_ethtool_stats = dsa_slave_get_ethtool_stats, + .get_sset_count = dsa_slave_get_sset_count, +}; + +#ifdef CONFIG_NET_DSA_TAG_DSA +static const struct net_device_ops dsa_netdev_ops = { + .ndo_init = dsa_slave_init, + .ndo_open = dsa_slave_open, + .ndo_stop = dsa_slave_close, + .ndo_start_xmit = dsa_xmit, + .ndo_change_rx_flags = dsa_slave_change_rx_flags, + .ndo_set_rx_mode = dsa_slave_set_rx_mode, + .ndo_set_multicast_list = dsa_slave_set_rx_mode, + .ndo_set_mac_address = dsa_slave_set_mac_address, + .ndo_do_ioctl = dsa_slave_ioctl, +}; +#endif +#ifdef CONFIG_NET_DSA_TAG_EDSA +static const struct net_device_ops edsa_netdev_ops = { + .ndo_init = dsa_slave_init, + .ndo_open = dsa_slave_open, + .ndo_stop = dsa_slave_close, + .ndo_start_xmit = edsa_xmit, + .ndo_change_rx_flags = dsa_slave_change_rx_flags, + .ndo_set_rx_mode = dsa_slave_set_rx_mode, + .ndo_set_multicast_list = dsa_slave_set_rx_mode, + .ndo_set_mac_address = dsa_slave_set_mac_address, + .ndo_do_ioctl = dsa_slave_ioctl, +}; +#endif +#ifdef CONFIG_NET_DSA_TAG_TRAILER +static const struct net_device_ops trailer_netdev_ops = { + .ndo_init = dsa_slave_init, + .ndo_open = dsa_slave_open, + .ndo_stop = dsa_slave_close, + .ndo_start_xmit = trailer_xmit, + .ndo_change_rx_flags = dsa_slave_change_rx_flags, + .ndo_set_rx_mode = dsa_slave_set_rx_mode, + .ndo_set_multicast_list = dsa_slave_set_rx_mode, + .ndo_set_mac_address = dsa_slave_set_mac_address, + .ndo_do_ioctl = dsa_slave_ioctl, +}; +#endif + +/* slave device setup *******************************************************/ +struct net_device * +dsa_slave_create(struct dsa_switch *ds, struct device *parent, + int port, char *name) +{ + struct net_device *master = ds->dst->master_netdev; + struct net_device *slave_dev; + struct dsa_slave_priv *p; + int ret; + + slave_dev = alloc_netdev(sizeof(struct dsa_slave_priv), + name, ether_setup); + if (slave_dev == NULL) + return slave_dev; + + slave_dev->features = master->vlan_features; + SET_ETHTOOL_OPS(slave_dev, &dsa_slave_ethtool_ops); + memcpy(slave_dev->dev_addr, master->dev_addr, ETH_ALEN); + slave_dev->tx_queue_len = 0; + + switch (ds->dst->tag_protocol) { +#ifdef CONFIG_NET_DSA_TAG_DSA + case htons(ETH_P_DSA): + slave_dev->netdev_ops = &dsa_netdev_ops; + break; +#endif +#ifdef CONFIG_NET_DSA_TAG_EDSA + case htons(ETH_P_EDSA): + slave_dev->netdev_ops = &edsa_netdev_ops; + break; +#endif +#ifdef CONFIG_NET_DSA_TAG_TRAILER + case htons(ETH_P_TRAILER): + slave_dev->netdev_ops = &trailer_netdev_ops; + break; +#endif + default: + BUG(); + } + + SET_NETDEV_DEV(slave_dev, parent); + slave_dev->vlan_features = master->vlan_features; + + p = netdev_priv(slave_dev); + p->dev = slave_dev; + p->parent = ds; + p->port = port; + p->phy = ds->slave_mii_bus->phy_map[port]; + + ret = register_netdev(slave_dev); + if (ret) { + printk(KERN_ERR "%s: error %d registering interface %s\n", + master->name, ret, slave_dev->name); + free_netdev(slave_dev); + return NULL; + } + + netif_carrier_off(slave_dev); + + if (p->phy != NULL) { + phy_attach(slave_dev, 
dev_name(&p->phy->dev), + 0, PHY_INTERFACE_MODE_GMII); + + p->phy->autoneg = AUTONEG_ENABLE; + p->phy->speed = 0; + p->phy->duplex = 0; + p->phy->advertising = p->phy->supported | ADVERTISED_Autoneg; + phy_start_aneg(p->phy); + } + + return slave_dev; +} diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c new file mode 100644 index 00000000..98dfe80b --- /dev/null +++ b/net/dsa/tag_dsa.c @@ -0,0 +1,205 @@ +/* + * net/dsa/tag_dsa.c - (Non-ethertype) DSA tagging + * Copyright (c) 2008-2009 Marvell Semiconductor + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include "dsa_priv.h" + +#define DSA_HLEN 4 + +netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + u8 *dsa_header; + + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; + + /* + * Convert the outermost 802.1q tag to a DSA tag for tagged + * packets, or insert a DSA tag between the addresses and + * the ethertype field for untagged packets. + */ + if (skb->protocol == htons(ETH_P_8021Q)) { + if (skb_cow_head(skb, 0) < 0) + goto out_free; + + /* + * Construct tagged FROM_CPU DSA tag from 802.1q tag. + */ + dsa_header = skb->data + 2 * ETH_ALEN; + dsa_header[0] = 0x60 | p->parent->index; + dsa_header[1] = p->port << 3; + + /* + * Move CFI field from byte 2 to byte 1. + */ + if (dsa_header[2] & 0x10) { + dsa_header[1] |= 0x01; + dsa_header[2] &= ~0x10; + } + } else { + if (skb_cow_head(skb, DSA_HLEN) < 0) + goto out_free; + skb_push(skb, DSA_HLEN); + + memmove(skb->data, skb->data + DSA_HLEN, 2 * ETH_ALEN); + + /* + * Construct untagged FROM_CPU DSA tag. + */ + dsa_header = skb->data + 2 * ETH_ALEN; + dsa_header[0] = 0x40 | p->parent->index; + dsa_header[1] = p->port << 3; + dsa_header[2] = 0x00; + dsa_header[3] = 0x00; + } + + skb->protocol = htons(ETH_P_DSA); + + skb->dev = p->parent->dst->master_netdev; + dev_queue_xmit(skb); + + return NETDEV_TX_OK; + +out_free: + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static int dsa_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds; + u8 *dsa_header; + int source_device; + int source_port; + + if (unlikely(dst == NULL)) + goto out_drop; + + skb = skb_unshare(skb, GFP_ATOMIC); + if (skb == NULL) + goto out; + + if (unlikely(!pskb_may_pull(skb, DSA_HLEN))) + goto out_drop; + + /* + * The ethertype field is part of the DSA header. + */ + dsa_header = skb->data - 2; + + /* + * Check that frame type is either TO_CPU or FORWARD. + */ + if ((dsa_header[0] & 0xc0) != 0x00 && (dsa_header[0] & 0xc0) != 0xc0) + goto out_drop; + + /* + * Determine source device and port. + */ + source_device = dsa_header[0] & 0x1f; + source_port = (dsa_header[1] >> 3) & 0x1f; + + /* + * Check that the source device exists and that the source + * port is a registered DSA port. + */ + if (source_device >= dst->pd->nr_chips) + goto out_drop; + ds = dst->ds[source_device]; + if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) + goto out_drop; + + /* + * Convert the DSA header to an 802.1q header if the 'tagged' + * bit in the DSA header is set. If the 'tagged' bit is clear, + * delete the DSA header entirely. 
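/*
 * Editor's illustration (not part of the patch): the 4-byte DSA tag that
 * dsa_xmit() above inserts between the MAC addresses and the ethertype.
 * Byte 0 carries the frame mode in bits 7:6 (FROM_CPU here), a "tagged"
 * flag in bit 5 and the target switch index in bits 4:0; byte 1 carries
 * the target port in bits 7:3 and, for tagged frames, the relocated CFI
 * bit in bit 0; bytes 2 and 3 hold the 802.1q priority/VID fields or zero.
 */
#include <stdint.h>
#include <stdio.h>

static void build_from_cpu_tag(uint8_t tag[4], int sw_index, int port, int tagged)
{
	tag[0] = (tagged ? 0x60 : 0x40) | (sw_index & 0x1f);
	tag[1] = (port & 0x1f) << 3;
	tag[2] = 0x00;			/* PCP/VID[11:8] when tagged */
	tag[3] = 0x00;			/* VID[7:0] when tagged */
}

int main(void)
{
	uint8_t tag[4];

	build_from_cpu_tag(tag, 0, 3, 0);	/* untagged, switch 0, port 3 */
	printf("%02x %02x %02x %02x\n", tag[0], tag[1], tag[2], tag[3]);
	return 0;
}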
+ */ + if (dsa_header[0] & 0x20) { + u8 new_header[4]; + + /* + * Insert 802.1q ethertype and copy the VLAN-related + * fields, but clear the bit that will hold CFI (since + * DSA uses that bit location for another purpose). + */ + new_header[0] = (ETH_P_8021Q >> 8) & 0xff; + new_header[1] = ETH_P_8021Q & 0xff; + new_header[2] = dsa_header[2] & ~0x10; + new_header[3] = dsa_header[3]; + + /* + * Move CFI bit from its place in the DSA header to + * its 802.1q-designated place. + */ + if (dsa_header[1] & 0x01) + new_header[2] |= 0x10; + + /* + * Update packet checksum if skb is CHECKSUM_COMPLETE. + */ + if (skb->ip_summed == CHECKSUM_COMPLETE) { + __wsum c = skb->csum; + c = csum_add(c, csum_partial(new_header + 2, 2, 0)); + c = csum_sub(c, csum_partial(dsa_header + 2, 2, 0)); + skb->csum = c; + } + + memcpy(dsa_header, new_header, DSA_HLEN); + } else { + /* + * Remove DSA tag and update checksum. + */ + skb_pull_rcsum(skb, DSA_HLEN); + memmove(skb->data - ETH_HLEN, + skb->data - ETH_HLEN - DSA_HLEN, + 2 * ETH_ALEN); + } + + skb->dev = ds->ports[source_port]; + skb_push(skb, ETH_HLEN); + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, skb->dev); + + skb->dev->stats.rx_packets++; + skb->dev->stats.rx_bytes += skb->len; + + netif_receive_skb(skb); + + return 0; + +out_drop: + kfree_skb(skb); +out: + return 0; +} + +static struct packet_type dsa_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_DSA), + .func = dsa_rcv, +}; + +static int __init dsa_init_module(void) +{ + dev_add_pack(&dsa_packet_type); + return 0; +} +module_init(dsa_init_module); + +static void __exit dsa_cleanup_module(void) +{ + dev_remove_pack(&dsa_packet_type); +} +module_exit(dsa_cleanup_module); diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c new file mode 100644 index 00000000..6f383322 --- /dev/null +++ b/net/dsa/tag_edsa.c @@ -0,0 +1,224 @@ +/* + * net/dsa/tag_edsa.c - Ethertype DSA tagging + * Copyright (c) 2008-2009 Marvell Semiconductor + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include "dsa_priv.h" + +#define DSA_HLEN 4 +#define EDSA_HLEN 8 + +netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + u8 *edsa_header; + + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; + + /* + * Convert the outermost 802.1q tag to a DSA tag and prepend + * a DSA ethertype field is the packet is tagged, or insert + * a DSA ethertype plus DSA tag between the addresses and the + * current ethertype field if the packet is untagged. + */ + if (skb->protocol == htons(ETH_P_8021Q)) { + if (skb_cow_head(skb, DSA_HLEN) < 0) + goto out_free; + skb_push(skb, DSA_HLEN); + + memmove(skb->data, skb->data + DSA_HLEN, 2 * ETH_ALEN); + + /* + * Construct tagged FROM_CPU DSA tag from 802.1q tag. + */ + edsa_header = skb->data + 2 * ETH_ALEN; + edsa_header[0] = (ETH_P_EDSA >> 8) & 0xff; + edsa_header[1] = ETH_P_EDSA & 0xff; + edsa_header[2] = 0x00; + edsa_header[3] = 0x00; + edsa_header[4] = 0x60 | p->parent->index; + edsa_header[5] = p->port << 3; + + /* + * Move CFI field from byte 6 to byte 5. 
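/*
 * Editor's illustration (not part of the patch): the 8-byte EDSA header
 * built by edsa_xmit() above.  It is simply the DSA tag of tag_dsa.c
 * prefixed with the ETH_P_EDSA ethertype (0xDADA in the kernel's
 * if_ether.h) and two reserved zero bytes, so hardware that expects an
 * ethertype right after the MAC addresses still parses the frame.
 */
#include <stdint.h>
#include <stdio.h>

#define ETH_P_EDSA 0xDADA

static void build_edsa_header(uint8_t hdr[8], int sw_index, int port, int tagged)
{
	hdr[0] = ETH_P_EDSA >> 8;
	hdr[1] = ETH_P_EDSA & 0xff;
	hdr[2] = 0x00;				/* reserved */
	hdr[3] = 0x00;				/* reserved */
	hdr[4] = (tagged ? 0x60 : 0x40) | (sw_index & 0x1f);
	hdr[5] = (port & 0x1f) << 3;
	hdr[6] = 0x00;
	hdr[7] = 0x00;
}

int main(void)
{
	uint8_t hdr[8];
	int i;

	build_edsa_header(hdr, 1, 2, 0);	/* untagged, switch 1, port 2 */
	for (i = 0; i < 8; i++)
		printf("%02x ", hdr[i]);
	printf("\n");
	return 0;
}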
+ */ + if (edsa_header[6] & 0x10) { + edsa_header[5] |= 0x01; + edsa_header[6] &= ~0x10; + } + } else { + if (skb_cow_head(skb, EDSA_HLEN) < 0) + goto out_free; + skb_push(skb, EDSA_HLEN); + + memmove(skb->data, skb->data + EDSA_HLEN, 2 * ETH_ALEN); + + /* + * Construct untagged FROM_CPU DSA tag. + */ + edsa_header = skb->data + 2 * ETH_ALEN; + edsa_header[0] = (ETH_P_EDSA >> 8) & 0xff; + edsa_header[1] = ETH_P_EDSA & 0xff; + edsa_header[2] = 0x00; + edsa_header[3] = 0x00; + edsa_header[4] = 0x40 | p->parent->index; + edsa_header[5] = p->port << 3; + edsa_header[6] = 0x00; + edsa_header[7] = 0x00; + } + + skb->protocol = htons(ETH_P_EDSA); + + skb->dev = p->parent->dst->master_netdev; + dev_queue_xmit(skb); + + return NETDEV_TX_OK; + +out_free: + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static int edsa_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds; + u8 *edsa_header; + int source_device; + int source_port; + + if (unlikely(dst == NULL)) + goto out_drop; + + skb = skb_unshare(skb, GFP_ATOMIC); + if (skb == NULL) + goto out; + + if (unlikely(!pskb_may_pull(skb, EDSA_HLEN))) + goto out_drop; + + /* + * Skip the two null bytes after the ethertype. + */ + edsa_header = skb->data + 2; + + /* + * Check that frame type is either TO_CPU or FORWARD. + */ + if ((edsa_header[0] & 0xc0) != 0x00 && (edsa_header[0] & 0xc0) != 0xc0) + goto out_drop; + + /* + * Determine source device and port. + */ + source_device = edsa_header[0] & 0x1f; + source_port = (edsa_header[1] >> 3) & 0x1f; + + /* + * Check that the source device exists and that the source + * port is a registered DSA port. + */ + if (source_device >= dst->pd->nr_chips) + goto out_drop; + ds = dst->ds[source_device]; + if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) + goto out_drop; + + /* + * If the 'tagged' bit is set, convert the DSA tag to a 802.1q + * tag and delete the ethertype part. If the 'tagged' bit is + * clear, delete the ethertype and the DSA tag parts. + */ + if (edsa_header[0] & 0x20) { + u8 new_header[4]; + + /* + * Insert 802.1q ethertype and copy the VLAN-related + * fields, but clear the bit that will hold CFI (since + * DSA uses that bit location for another purpose). + */ + new_header[0] = (ETH_P_8021Q >> 8) & 0xff; + new_header[1] = ETH_P_8021Q & 0xff; + new_header[2] = edsa_header[2] & ~0x10; + new_header[3] = edsa_header[3]; + + /* + * Move CFI bit from its place in the DSA header to + * its 802.1q-designated place. + */ + if (edsa_header[1] & 0x01) + new_header[2] |= 0x10; + + skb_pull_rcsum(skb, DSA_HLEN); + + /* + * Update packet checksum if skb is CHECKSUM_COMPLETE. + */ + if (skb->ip_summed == CHECKSUM_COMPLETE) { + __wsum c = skb->csum; + c = csum_add(c, csum_partial(new_header + 2, 2, 0)); + c = csum_sub(c, csum_partial(edsa_header + 2, 2, 0)); + skb->csum = c; + } + + memcpy(edsa_header, new_header, DSA_HLEN); + + memmove(skb->data - ETH_HLEN, + skb->data - ETH_HLEN - DSA_HLEN, + 2 * ETH_ALEN); + } else { + /* + * Remove DSA tag and update checksum. 
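/*
 * Editor's illustration (not part of the patch): the receive-side
 * conversion performed above.  When the "tagged" bit of the (E)DSA tag
 * is set, the tag is rewritten in place into an 802.1q header: ethertype
 * 0x8100, the priority/VID bytes copied over, and the CFI bit moved back
 * from bit 0 of the second tag byte to its 802.1q position.
 */
#include <stdint.h>
#include <stdio.h>

static void dsa_tag_to_8021q(const uint8_t tag[4], uint8_t vlan_hdr[4])
{
	vlan_hdr[0] = 0x81;			/* ETH_P_8021Q >> 8 */
	vlan_hdr[1] = 0x00;			/* ETH_P_8021Q & 0xff */
	vlan_hdr[2] = tag[2] & ~0x10;		/* PCP + VID[11:8], CFI cleared */
	vlan_hdr[3] = tag[3];			/* VID[7:0] */
	if (tag[1] & 0x01)			/* CFI travelled in the DSA tag */
		vlan_hdr[2] |= 0x10;
}

int main(void)
{
	/* example TO_CPU tag: tagged, device 8, port 3, CFI set, PCP 3, VID 100 */
	const uint8_t tag[4] = { 0x28, 0x19, 0x60, 0x64 };
	uint8_t vlan[4];

	dsa_tag_to_8021q(tag, vlan);
	printf("%02x %02x %02x %02x\n", vlan[0], vlan[1], vlan[2], vlan[3]);
	return 0;
}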
+ */ + skb_pull_rcsum(skb, EDSA_HLEN); + memmove(skb->data - ETH_HLEN, + skb->data - ETH_HLEN - EDSA_HLEN, + 2 * ETH_ALEN); + } + + skb->dev = ds->ports[source_port]; + skb_push(skb, ETH_HLEN); + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, skb->dev); + + skb->dev->stats.rx_packets++; + skb->dev->stats.rx_bytes += skb->len; + + netif_receive_skb(skb); + + return 0; + +out_drop: + kfree_skb(skb); +out: + return 0; +} + +static struct packet_type edsa_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_EDSA), + .func = edsa_rcv, +}; + +static int __init edsa_init_module(void) +{ + dev_add_pack(&edsa_packet_type); + return 0; +} +module_init(edsa_init_module); + +static void __exit edsa_cleanup_module(void) +{ + dev_remove_pack(&edsa_packet_type); +} +module_exit(edsa_cleanup_module); diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c new file mode 100644 index 00000000..d6d7d0ad --- /dev/null +++ b/net/dsa/tag_trailer.c @@ -0,0 +1,133 @@ +/* + * net/dsa/tag_trailer.c - Trailer tag format handling + * Copyright (c) 2008-2009 Marvell Semiconductor + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include "dsa_priv.h" + +netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct sk_buff *nskb; + int padlen; + u8 *trailer; + + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; + + /* + * We have to make sure that the trailer ends up as the very + * last 4 bytes of the packet. This means that we have to pad + * the packet to the minimum ethernet frame size, if necessary, + * before adding the trailer. 
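/*
 * Editor's illustration (not part of the patch): the trailer format used
 * by trailer_xmit() above.  The frame is first padded to the 60-byte
 * minimum Ethernet payload, then four bytes are appended: a 0x80 marker,
 * a bitmap naming the destination port, a 0x10 flags byte and a zero
 * byte.  Lengths and byte values mirror the code above; the frame below
 * is a dummy buffer.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static size_t append_trailer(uint8_t *frame, size_t len, int port)
{
	if (len < 60) {				/* pad to minimum frame size */
		memset(frame + len, 0, 60 - len);
		len = 60;
	}
	frame[len + 0] = 0x80;
	frame[len + 1] = 1 << port;
	frame[len + 2] = 0x10;
	frame[len + 3] = 0x00;
	return len + 4;
}

int main(void)
{
	uint8_t frame[128] = { 0 };		/* pretend 42-byte frame */
	size_t newlen = append_trailer(frame, 42, 2);

	printf("new length %zu, trailer %02x %02x %02x %02x\n", newlen,
	       frame[newlen - 4], frame[newlen - 3],
	       frame[newlen - 2], frame[newlen - 1]);
	return 0;
}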
+ */ + padlen = 0; + if (skb->len < 60) + padlen = 60 - skb->len; + + nskb = alloc_skb(NET_IP_ALIGN + skb->len + padlen + 4, GFP_ATOMIC); + if (nskb == NULL) { + kfree_skb(skb); + return NETDEV_TX_OK; + } + skb_reserve(nskb, NET_IP_ALIGN); + + skb_reset_mac_header(nskb); + skb_set_network_header(nskb, skb_network_header(skb) - skb->head); + skb_set_transport_header(nskb, skb_transport_header(skb) - skb->head); + skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len)); + kfree_skb(skb); + + if (padlen) { + u8 *pad = skb_put(nskb, padlen); + memset(pad, 0, padlen); + } + + trailer = skb_put(nskb, 4); + trailer[0] = 0x80; + trailer[1] = 1 << p->port; + trailer[2] = 0x10; + trailer[3] = 0x00; + + nskb->protocol = htons(ETH_P_TRAILER); + + nskb->dev = p->parent->dst->master_netdev; + dev_queue_xmit(nskb); + + return NETDEV_TX_OK; +} + +static int trailer_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds; + u8 *trailer; + int source_port; + + if (unlikely(dst == NULL)) + goto out_drop; + ds = dst->ds[0]; + + skb = skb_unshare(skb, GFP_ATOMIC); + if (skb == NULL) + goto out; + + if (skb_linearize(skb)) + goto out_drop; + + trailer = skb_tail_pointer(skb) - 4; + if (trailer[0] != 0x80 || (trailer[1] & 0xf8) != 0x00 || + (trailer[3] & 0xef) != 0x00 || trailer[3] != 0x00) + goto out_drop; + + source_port = trailer[1] & 7; + if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) + goto out_drop; + + pskb_trim_rcsum(skb, skb->len - 4); + + skb->dev = ds->ports[source_port]; + skb_push(skb, ETH_HLEN); + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, skb->dev); + + skb->dev->stats.rx_packets++; + skb->dev->stats.rx_bytes += skb->len; + + netif_receive_skb(skb); + + return 0; + +out_drop: + kfree_skb(skb); +out: + return 0; +} + +static struct packet_type trailer_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_TRAILER), + .func = trailer_rcv, +}; + +static int __init trailer_init_module(void) +{ + dev_add_pack(&trailer_packet_type); + return 0; +} +module_init(trailer_init_module); + +static void __exit trailer_cleanup_module(void) +{ + dev_remove_pack(&trailer_packet_type); +} +module_exit(trailer_cleanup_module); diff --git a/net/econet/Kconfig b/net/econet/Kconfig new file mode 100644 index 00000000..39a2d297 --- /dev/null +++ b/net/econet/Kconfig @@ -0,0 +1,36 @@ +# +# Acorn Econet/AUN protocols +# + +config ECONET + tristate "Acorn Econet/AUN protocols (EXPERIMENTAL)" + depends on EXPERIMENTAL && INET + ---help--- + Econet is a fairly old and slow networking protocol mainly used by + Acorn computers to access file and print servers. It uses native + Econet network cards. AUN is an implementation of the higher level + parts of Econet that runs over ordinary Ethernet connections, on + top of the UDP packet protocol, which in turn runs on top of the + Internet protocol IP. + + If you say Y here, you can choose with the next two options whether + to send Econet/AUN traffic over a UDP Ethernet connection or over + a native Econet network card. + + To compile this driver as a module, choose M here: the module + will be called econet. + +config ECONET_AUNUDP + bool "AUN over UDP" + depends on ECONET + help + Say Y here if you want to send Econet/AUN traffic over a UDP + connection (UDP is a packet based protocol that runs on top of the + Internet protocol IP) using an ordinary Ethernet network card. 
+ +config ECONET_NATIVE + bool "Native Econet" + depends on ECONET + help + Say Y here if you have a native Econet network card installed in + your computer. diff --git a/net/econet/Makefile b/net/econet/Makefile new file mode 100644 index 00000000..05fae8be --- /dev/null +++ b/net/econet/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for Econet support code. +# + +obj-$(CONFIG_ECONET) += econet.o + +econet-y := af_econet.o diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c new file mode 100644 index 00000000..a1d9f378 --- /dev/null +++ b/net/econet/af_econet.c @@ -0,0 +1,1174 @@ +/* + * An implementation of the Acorn Econet and AUN protocols. + * Philip Blundell + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static const struct proto_ops econet_ops; +static struct hlist_head econet_sklist; +static DEFINE_SPINLOCK(econet_lock); +static DEFINE_MUTEX(econet_mutex); + +/* Since there are only 256 possible network numbers (or fewer, depends + how you count) it makes sense to use a simple lookup table. */ +static struct net_device *net2dev_map[256]; + +#define EC_PORT_IP 0xd2 + +#ifdef CONFIG_ECONET_AUNUDP +static DEFINE_SPINLOCK(aun_queue_lock); +static struct socket *udpsock; +#define AUN_PORT 0x8000 + + +struct aunhdr +{ + unsigned char code; /* AUN magic protocol byte */ + unsigned char port; + unsigned char cb; + unsigned char pad; + unsigned long handle; +}; + +static unsigned long aun_seq; + +/* Queue of packets waiting to be transmitted. */ +static struct sk_buff_head aun_queue; +static struct timer_list ab_cleanup_timer; + +#endif /* CONFIG_ECONET_AUNUDP */ + +/* Per-packet information */ +struct ec_cb +{ + struct sockaddr_ec sec; + unsigned long cookie; /* Supplied by user. */ +#ifdef CONFIG_ECONET_AUNUDP + int done; + unsigned long seq; /* Sequencing */ + unsigned long timeout; /* Timeout */ + unsigned long start; /* jiffies */ +#endif +#ifdef CONFIG_ECONET_NATIVE + void (*sent)(struct sk_buff *, int result); +#endif +}; + +static void econet_remove_socket(struct hlist_head *list, struct sock *sk) +{ + spin_lock_bh(&econet_lock); + sk_del_node_init(sk); + spin_unlock_bh(&econet_lock); +} + +static void econet_insert_socket(struct hlist_head *list, struct sock *sk) +{ + spin_lock_bh(&econet_lock); + sk_add_node(sk, list); + spin_unlock_bh(&econet_lock); +} + +/* + * Pull a packet from our receive queue and hand it to the user. + * If necessary we block. + */ + +static int econet_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len, int flags) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + size_t copied; + int err; + + msg->msg_namelen = sizeof(struct sockaddr_ec); + + mutex_lock(&econet_mutex); + + /* + * Call the generic datagram receiver. This handles all sorts + * of horrible races and re-entrancy so we can forget about it + * in the protocol layers. + * + * Now it will return ENETDOWN, if device have just gone down, + * but then it will block. 
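/*
 * Editor's illustration (not part of the patch): a fixed-width view of
 * struct aunhdr declared above (the kernel struct uses unsigned long for
 * the handle, which matches the 4-byte wire field on 32-bit machines).
 * AUN carries Econet frames in UDP datagrams on port 0x8000: a one-byte
 * operation code, the Econet port and control bytes, a pad byte and a
 * sequence handle, followed by the payload.  Field values below are
 * examples only.
 */
#include <stdint.h>
#include <stdio.h>

struct aun_wire_header {
	uint8_t  code;		/* 2 = data; see the ack/nack replies later */
	uint8_t  port;		/* Econet port number */
	uint8_t  cb;		/* control byte, top bit stripped */
	uint8_t  pad;
	uint32_t handle;	/* per-sender sequence number */
};

int main(void)
{
	struct aun_wire_header hdr = { .code = 2, .port = 0xd2, .cb = 0x22,
				       .pad = 0, .handle = 1 };

	printf("AUN data packet, port 0x%02x, handle %u, %zu header bytes\n",
	       (unsigned)hdr.port, (unsigned)hdr.handle, sizeof(hdr));
	return 0;
}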
+ */ + + skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err); + + /* + * An error occurred so return it. Because skb_recv_datagram() + * handles the blocking we don't see and worry about blocking + * retries. + */ + + if(skb==NULL) + goto out; + + /* + * You lose any data beyond the buffer you gave. If it worries a + * user program they can ask the device for its MTU anyway. + */ + + copied = skb->len; + if (copied > len) + { + copied=len; + msg->msg_flags|=MSG_TRUNC; + } + + /* We can't use skb_copy_datagram here */ + err = memcpy_toiovec(msg->msg_iov, skb->data, copied); + if (err) + goto out_free; + sk->sk_stamp = skb->tstamp; + + if (msg->msg_name) + memcpy(msg->msg_name, skb->cb, msg->msg_namelen); + + /* + * Free or return the buffer as appropriate. Again this + * hides all the races and re-entrancy issues from us. + */ + err = copied; + +out_free: + skb_free_datagram(sk, skb); +out: + mutex_unlock(&econet_mutex); + return err; +} + +/* + * Bind an Econet socket. + */ + +static int econet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_ec *sec = (struct sockaddr_ec *)uaddr; + struct sock *sk; + struct econet_sock *eo; + + /* + * Check legality + */ + + if (addr_len < sizeof(struct sockaddr_ec) || + sec->sec_family != AF_ECONET) + return -EINVAL; + + mutex_lock(&econet_mutex); + + sk = sock->sk; + eo = ec_sk(sk); + + eo->cb = sec->cb; + eo->port = sec->port; + eo->station = sec->addr.station; + eo->net = sec->addr.net; + + mutex_unlock(&econet_mutex); + + return 0; +} + +#if defined(CONFIG_ECONET_AUNUDP) || defined(CONFIG_ECONET_NATIVE) +/* + * Queue a transmit result for the user to be told about. + */ + +static void tx_result(struct sock *sk, unsigned long cookie, int result) +{ + struct sk_buff *skb = alloc_skb(0, GFP_ATOMIC); + struct ec_cb *eb; + struct sockaddr_ec *sec; + + if (skb == NULL) + { + printk(KERN_DEBUG "ec: memory squeeze, transmit result dropped.\n"); + return; + } + + eb = (struct ec_cb *)&skb->cb; + sec = (struct sockaddr_ec *)&eb->sec; + memset(sec, 0, sizeof(struct sockaddr_ec)); + sec->cookie = cookie; + sec->type = ECTYPE_TRANSMIT_STATUS | result; + sec->sec_family = AF_ECONET; + + if (sock_queue_rcv_skb(sk, skb) < 0) + kfree_skb(skb); +} +#endif + +#ifdef CONFIG_ECONET_NATIVE +/* + * Called by the Econet hardware driver when a packet transmit + * has completed. Tell the user. + */ + +static void ec_tx_done(struct sk_buff *skb, int result) +{ + struct ec_cb *eb = (struct ec_cb *)&skb->cb; + tx_result(skb->sk, eb->cookie, result); +} +#endif + +/* + * Send a packet. We have to work out which device it's going out on + * and hence whether to use real Econet or the UDP emulation. + */ + +static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sockaddr_ec *saddr=(struct sockaddr_ec *)msg->msg_name; + struct net_device *dev; + struct ec_addr addr; + int err; + unsigned char port, cb; +#if defined(CONFIG_ECONET_AUNUDP) || defined(CONFIG_ECONET_NATIVE) + struct sock *sk = sock->sk; + struct sk_buff *skb; + struct ec_cb *eb; +#endif +#ifdef CONFIG_ECONET_AUNUDP + struct msghdr udpmsg; + struct iovec iov[2]; + struct aunhdr ah; + struct sockaddr_in udpdest; + __kernel_size_t size; + mm_segment_t oldfs; + char *userbuf; +#endif + + /* + * Check the flags. + */ + + if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT)) + return -EINVAL; + + /* + * Get and verify the address. 
+ */ + + mutex_lock(&econet_mutex); + + if (saddr == NULL || msg->msg_namelen < sizeof(struct sockaddr_ec)) { + mutex_unlock(&econet_mutex); + return -EINVAL; + } + addr.station = saddr->addr.station; + addr.net = saddr->addr.net; + port = saddr->port; + cb = saddr->cb; + + /* Look for a device with the right network number. */ + dev = net2dev_map[addr.net]; + + /* If not directly reachable, use some default */ + if (dev == NULL) { + dev = net2dev_map[0]; + /* No interfaces at all? */ + if (dev == NULL) { + mutex_unlock(&econet_mutex); + return -ENETDOWN; + } + } + + if (dev->type == ARPHRD_ECONET) { + /* Real hardware Econet. We're not worthy etc. */ +#ifdef CONFIG_ECONET_NATIVE + unsigned short proto = 0; + int res; + + if (len + 15 > dev->mtu) { + mutex_unlock(&econet_mutex); + return -EMSGSIZE; + } + + dev_hold(dev); + + skb = sock_alloc_send_skb(sk, len+LL_ALLOCATED_SPACE(dev), + msg->msg_flags & MSG_DONTWAIT, &err); + if (skb==NULL) + goto out_unlock; + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb_reset_network_header(skb); + + eb = (struct ec_cb *)&skb->cb; + + eb->cookie = saddr->cookie; + eb->sec = *saddr; + eb->sent = ec_tx_done; + + err = -EINVAL; + res = dev_hard_header(skb, dev, ntohs(proto), &addr, NULL, len); + if (res < 0) + goto out_free; + if (res > 0) { + struct ec_framehdr *fh; + /* Poke in our control byte and + port number. Hack, hack. */ + fh = (struct ec_framehdr *)(skb->data); + fh->cb = cb; + fh->port = port; + if (sock->type != SOCK_DGRAM) { + skb_reset_tail_pointer(skb); + skb->len = 0; + } + } + + /* Copy the data. Returns -EFAULT on error */ + err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); + skb->protocol = proto; + skb->dev = dev; + skb->priority = sk->sk_priority; + if (err) + goto out_free; + + err = -ENETDOWN; + if (!(dev->flags & IFF_UP)) + goto out_free; + + /* + * Now send it + */ + + dev_queue_xmit(skb); + dev_put(dev); + mutex_unlock(&econet_mutex); + return len; + + out_free: + kfree_skb(skb); + out_unlock: + if (dev) + dev_put(dev); +#else + err = -EPROTOTYPE; +#endif + mutex_unlock(&econet_mutex); + + return err; + } + +#ifdef CONFIG_ECONET_AUNUDP + /* AUN virtual Econet. */ + + if (udpsock == NULL) { + mutex_unlock(&econet_mutex); + return -ENETDOWN; /* No socket - can't send */ + } + + if (len > 32768) { + err = -E2BIG; + goto error; + } + + /* Make up a UDP datagram and hand it off to some higher intellect. */ + + memset(&udpdest, 0, sizeof(udpdest)); + udpdest.sin_family = AF_INET; + udpdest.sin_port = htons(AUN_PORT); + + /* At the moment we use the stupid Acorn scheme of Econet address + y.x maps to IP a.b.c.x. This should be replaced with something + more flexible and more aware of subnet masks. */ + { + struct in_device *idev; + unsigned long network = 0; + + rcu_read_lock(); + idev = __in_dev_get_rcu(dev); + if (idev) { + if (idev->ifa_list) + network = ntohl(idev->ifa_list->ifa_address) & + 0xffffff00; /* !!! 
*/ + } + rcu_read_unlock(); + udpdest.sin_addr.s_addr = htonl(network | addr.station); + } + + memset(&ah, 0, sizeof(ah)); + ah.port = port; + ah.cb = cb & 0x7f; + ah.code = 2; /* magic */ + + /* tack our header on the front of the iovec */ + size = sizeof(struct aunhdr); + iov[0].iov_base = (void *)&ah; + iov[0].iov_len = size; + + userbuf = vmalloc(len); + if (userbuf == NULL) { + err = -ENOMEM; + goto error; + } + + iov[1].iov_base = userbuf; + iov[1].iov_len = len; + err = memcpy_fromiovec(userbuf, msg->msg_iov, len); + if (err) + goto error_free_buf; + + /* Get a skbuff (no data, just holds our cb information) */ + if ((skb = sock_alloc_send_skb(sk, 0, + msg->msg_flags & MSG_DONTWAIT, + &err)) == NULL) + goto error_free_buf; + + eb = (struct ec_cb *)&skb->cb; + + eb->cookie = saddr->cookie; + eb->timeout = (5*HZ); + eb->start = jiffies; + ah.handle = aun_seq; + eb->seq = (aun_seq++); + eb->sec = *saddr; + + skb_queue_tail(&aun_queue, skb); + + udpmsg.msg_name = (void *)&udpdest; + udpmsg.msg_namelen = sizeof(udpdest); + udpmsg.msg_iov = &iov[0]; + udpmsg.msg_iovlen = 2; + udpmsg.msg_control = NULL; + udpmsg.msg_controllen = 0; + udpmsg.msg_flags=0; + + oldfs = get_fs(); set_fs(KERNEL_DS); /* More privs :-) */ + err = sock_sendmsg(udpsock, &udpmsg, size); + set_fs(oldfs); + +error_free_buf: + vfree(userbuf); +error: +#else + err = -EPROTOTYPE; +#endif + mutex_unlock(&econet_mutex); + + return err; +} + +/* + * Look up the address of a socket. + */ + +static int econet_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sock *sk; + struct econet_sock *eo; + struct sockaddr_ec *sec = (struct sockaddr_ec *)uaddr; + + if (peer) + return -EOPNOTSUPP; + + memset(sec, 0, sizeof(*sec)); + mutex_lock(&econet_mutex); + + sk = sock->sk; + eo = ec_sk(sk); + + sec->sec_family = AF_ECONET; + sec->port = eo->port; + sec->addr.station = eo->station; + sec->addr.net = eo->net; + + mutex_unlock(&econet_mutex); + + *uaddr_len = sizeof(*sec); + return 0; +} + +static void econet_destroy_timer(unsigned long data) +{ + struct sock *sk=(struct sock *)data; + + if (!sk_has_allocations(sk)) { + sk_free(sk); + return; + } + + sk->sk_timer.expires = jiffies + 10 * HZ; + add_timer(&sk->sk_timer); + printk(KERN_DEBUG "econet socket destroy delayed\n"); +} + +/* + * Close an econet socket. + */ + +static int econet_release(struct socket *sock) +{ + struct sock *sk; + + mutex_lock(&econet_mutex); + + sk = sock->sk; + if (!sk) + goto out_unlock; + + econet_remove_socket(&econet_sklist, sk); + + /* + * Now the socket is dead. No more input will appear. + */ + + sk->sk_state_change(sk); /* It is useless. Just for sanity. */ + + sock_orphan(sk); + + /* Purge queues */ + + skb_queue_purge(&sk->sk_receive_queue); + + if (sk_has_allocations(sk)) { + sk->sk_timer.data = (unsigned long)sk; + sk->sk_timer.expires = jiffies + HZ; + sk->sk_timer.function = econet_destroy_timer; + add_timer(&sk->sk_timer); + + goto out_unlock; + } + + sk_free(sk); + +out_unlock: + mutex_unlock(&econet_mutex); + return 0; +} + +static struct proto econet_proto = { + .name = "ECONET", + .owner = THIS_MODULE, + .obj_size = sizeof(struct econet_sock), +}; + +/* + * Create an Econet socket + */ + +static int econet_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + struct econet_sock *eo; + int err; + + if (!net_eq(net, &init_net)) + return -EAFNOSUPPORT; + + /* Econet only provides datagram services. 
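/*
 * Editor's illustration (not part of the patch): the address mapping used
 * by the AUN path of econet_sendmsg() above.  Econet station y.x is sent
 * to IP address a.b.c.x, i.e. the interface's own address with its last
 * octet replaced by the station number (the in-code comment already notes
 * this scheme is crude and ignores real subnet masks).  The local address
 * below is an example.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t aun_dest_ip(uint32_t local_ip_hostorder, uint8_t station)
{
	return (local_ip_hostorder & 0xffffff00u) | station;
}

int main(void)
{
	uint32_t local = (192u << 24) | (168u << 16) | (1u << 8) | 10; /* 192.168.1.10 */
	uint32_t dest = aun_dest_ip(local, 42);			       /* station 42 */

	printf("%u.%u.%u.%u\n", (unsigned)(dest >> 24), (unsigned)((dest >> 16) & 0xff),
	       (unsigned)((dest >> 8) & 0xff), (unsigned)(dest & 0xff));
	return 0;
}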
*/ + if (sock->type != SOCK_DGRAM) + return -ESOCKTNOSUPPORT; + + sock->state = SS_UNCONNECTED; + + err = -ENOBUFS; + sk = sk_alloc(net, PF_ECONET, GFP_KERNEL, &econet_proto); + if (sk == NULL) + goto out; + + sk->sk_reuse = 1; + sock->ops = &econet_ops; + sock_init_data(sock, sk); + + eo = ec_sk(sk); + sock_reset_flag(sk, SOCK_ZAPPED); + sk->sk_family = PF_ECONET; + eo->num = protocol; + + econet_insert_socket(&econet_sklist, sk); + return 0; +out: + return err; +} + +/* + * Handle Econet specific ioctls + */ + +static int ec_dev_ioctl(struct socket *sock, unsigned int cmd, void __user *arg) +{ + struct ifreq ifr; + struct ec_device *edev; + struct net_device *dev; + struct sockaddr_ec *sec; + int err; + + /* + * Fetch the caller's info block into kernel space + */ + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return -EFAULT; + + if ((dev = dev_get_by_name(&init_net, ifr.ifr_name)) == NULL) + return -ENODEV; + + sec = (struct sockaddr_ec *)&ifr.ifr_addr; + + mutex_lock(&econet_mutex); + + err = 0; + switch (cmd) { + case SIOCSIFADDR: + if (!capable(CAP_NET_ADMIN)) { + err = -EPERM; + break; + } + + edev = dev->ec_ptr; + if (edev == NULL) { + /* Magic up a new one. */ + edev = kzalloc(sizeof(struct ec_device), GFP_KERNEL); + if (edev == NULL) { + err = -ENOMEM; + break; + } + dev->ec_ptr = edev; + } else + net2dev_map[edev->net] = NULL; + edev->station = sec->addr.station; + edev->net = sec->addr.net; + net2dev_map[sec->addr.net] = dev; + if (!net2dev_map[0]) + net2dev_map[0] = dev; + break; + + case SIOCGIFADDR: + edev = dev->ec_ptr; + if (edev == NULL) { + err = -ENODEV; + break; + } + memset(sec, 0, sizeof(struct sockaddr_ec)); + sec->addr.station = edev->station; + sec->addr.net = edev->net; + sec->sec_family = AF_ECONET; + dev_put(dev); + if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) + err = -EFAULT; + break; + + default: + err = -EINVAL; + break; + } + + mutex_unlock(&econet_mutex); + + dev_put(dev); + + return err; +} + +/* + * Handle generic ioctls + */ + +static int econet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + void __user *argp = (void __user *)arg; + + switch(cmd) { + case SIOCGSTAMP: + return sock_get_timestamp(sk, argp); + + case SIOCGSTAMPNS: + return sock_get_timestampns(sk, argp); + + case SIOCSIFADDR: + case SIOCGIFADDR: + return ec_dev_ioctl(sock, cmd, argp); + break; + + default: + return -ENOIOCTLCMD; + } + /*NOTREACHED*/ + return 0; +} + +static const struct net_proto_family econet_family_ops = { + .family = PF_ECONET, + .create = econet_create, + .owner = THIS_MODULE, +}; + +static const struct proto_ops econet_ops = { + .family = PF_ECONET, + .owner = THIS_MODULE, + .release = econet_release, + .bind = econet_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = econet_getname, + .poll = datagram_poll, + .ioctl = econet_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = econet_sendmsg, + .recvmsg = econet_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +#if defined(CONFIG_ECONET_AUNUDP) || defined(CONFIG_ECONET_NATIVE) +/* + * Find the listening socket, if any, for the given data. 
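+ *
+ * A socket bound with a zero port, station or network acts as a
+ * wildcard for that field; the first match found under econet_lock is
+ * returned with an extra reference held (sock_hold).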
+ */ + +static struct sock *ec_listening_socket(unsigned char port, unsigned char + station, unsigned char net) +{ + struct sock *sk; + struct hlist_node *node; + + spin_lock(&econet_lock); + sk_for_each(sk, node, &econet_sklist) { + struct econet_sock *opt = ec_sk(sk); + if ((opt->port == port || opt->port == 0) && + (opt->station == station || opt->station == 0) && + (opt->net == net || opt->net == 0)) { + sock_hold(sk); + goto found; + } + } + sk = NULL; +found: + spin_unlock(&econet_lock); + return sk; +} + +/* + * Queue a received packet for a socket. + */ + +static int ec_queue_packet(struct sock *sk, struct sk_buff *skb, + unsigned char stn, unsigned char net, + unsigned char cb, unsigned char port) +{ + struct ec_cb *eb = (struct ec_cb *)&skb->cb; + struct sockaddr_ec *sec = (struct sockaddr_ec *)&eb->sec; + + memset(sec, 0, sizeof(struct sockaddr_ec)); + sec->sec_family = AF_ECONET; + sec->type = ECTYPE_PACKET_RECEIVED; + sec->port = port; + sec->cb = cb; + sec->addr.net = net; + sec->addr.station = stn; + + return sock_queue_rcv_skb(sk, skb); +} +#endif + +#ifdef CONFIG_ECONET_AUNUDP +/* + * Send an AUN protocol response. + */ + +static void aun_send_response(__u32 addr, unsigned long seq, int code, int cb) +{ + struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_port = htons(AUN_PORT), + .sin_addr = {.s_addr = addr} + }; + struct aunhdr ah = {.code = code, .cb = cb, .handle = seq}; + struct kvec iov = {.iov_base = (void *)&ah, .iov_len = sizeof(ah)}; + struct msghdr udpmsg; + + udpmsg.msg_name = (void *)&sin; + udpmsg.msg_namelen = sizeof(sin); + udpmsg.msg_control = NULL; + udpmsg.msg_controllen = 0; + udpmsg.msg_flags=0; + + kernel_sendmsg(udpsock, &udpmsg, &iov, 1, sizeof(ah)); +} + + +/* + * Handle incoming AUN packets. Work out if anybody wants them, + * and send positive or negative acknowledgements as appropriate. + */ + +static void aun_incoming(struct sk_buff *skb, struct aunhdr *ah, size_t len) +{ + struct iphdr *ip = ip_hdr(skb); + unsigned char stn = ntohl(ip->saddr) & 0xff; + struct dst_entry *dst = skb_dst(skb); + struct ec_device *edev = NULL; + struct sock *sk = NULL; + struct sk_buff *newskb; + + if (dst) + edev = dst->dev->ec_ptr; + + if (! edev) + goto bad; + + if ((sk = ec_listening_socket(ah->port, stn, edev->net)) == NULL) + goto bad; /* Nobody wants it */ + + newskb = alloc_skb((len - sizeof(struct aunhdr) + 15) & ~15, + GFP_ATOMIC); + if (newskb == NULL) + { + printk(KERN_DEBUG "AUN: memory squeeze, dropping packet.\n"); + /* Send nack and hope sender tries again */ + goto bad; + } + + memcpy(skb_put(newskb, len - sizeof(struct aunhdr)), (void *)(ah+1), + len - sizeof(struct aunhdr)); + + if (ec_queue_packet(sk, newskb, stn, edev->net, ah->cb, ah->port)) + { + /* Socket is bankrupt. */ + kfree_skb(newskb); + goto bad; + } + + aun_send_response(ip->saddr, ah->handle, 3, 0); + sock_put(sk); + return; + +bad: + aun_send_response(ip->saddr, ah->handle, 4, 0); + if (sk) + sock_put(sk); +} + +/* + * Handle incoming AUN transmit acknowledgements. If the sequence + * number matches something in our backlog then kill it and tell + * the user. If the remote took too long to reply then we may have + * dropped the packet already. 
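+ *
+ * Entries on aun_queue are matched by the sequence number stored in
+ * the skb control block (eb->seq) when the frame was queued by
+ * econet_sendmsg; the whole walk runs under aun_queue_lock.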
+ */ + +static void aun_tx_ack(unsigned long seq, int result) +{ + struct sk_buff *skb; + unsigned long flags; + struct ec_cb *eb; + + spin_lock_irqsave(&aun_queue_lock, flags); + skb_queue_walk(&aun_queue, skb) { + eb = (struct ec_cb *)&skb->cb; + if (eb->seq == seq) + goto foundit; + } + spin_unlock_irqrestore(&aun_queue_lock, flags); + printk(KERN_DEBUG "AUN: unknown sequence %ld\n", seq); + return; + +foundit: + tx_result(skb->sk, eb->cookie, result); + skb_unlink(skb, &aun_queue); + spin_unlock_irqrestore(&aun_queue_lock, flags); + kfree_skb(skb); +} + +/* + * Deal with received AUN frames - sort out what type of thing it is + * and hand it to the right function. + */ + +static void aun_data_available(struct sock *sk, int slen) +{ + int err; + struct sk_buff *skb; + unsigned char *data; + struct aunhdr *ah; + size_t len; + + while ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL) { + if (err == -EAGAIN) { + printk(KERN_ERR "AUN: no data available?!"); + return; + } + printk(KERN_DEBUG "AUN: recvfrom() error %d\n", -err); + } + + data = skb_transport_header(skb) + sizeof(struct udphdr); + ah = (struct aunhdr *)data; + len = skb->len - sizeof(struct udphdr); + + switch (ah->code) + { + case 2: + aun_incoming(skb, ah, len); + break; + case 3: + aun_tx_ack(ah->handle, ECTYPE_TRANSMIT_OK); + break; + case 4: + aun_tx_ack(ah->handle, ECTYPE_TRANSMIT_NOT_LISTENING); + break; + default: + printk(KERN_DEBUG "unknown AUN packet (type %d)\n", data[0]); + } + + skb_free_datagram(sk, skb); +} + +/* + * Called by the timer to manage the AUN transmit queue. If a packet + * was sent to a dead or nonexistent host then we will never get an + * acknowledgement back. After a few seconds we need to spot this and + * drop the packet. + */ + +static void ab_cleanup(unsigned long h) +{ + struct sk_buff *skb, *n; + unsigned long flags; + + spin_lock_irqsave(&aun_queue_lock, flags); + skb_queue_walk_safe(&aun_queue, skb, n) { + struct ec_cb *eb = (struct ec_cb *)&skb->cb; + if ((jiffies - eb->start) > eb->timeout) { + tx_result(skb->sk, eb->cookie, + ECTYPE_TRANSMIT_NOT_PRESENT); + skb_unlink(skb, &aun_queue); + kfree_skb(skb); + } + } + spin_unlock_irqrestore(&aun_queue_lock, flags); + + mod_timer(&ab_cleanup_timer, jiffies + (HZ*2)); +} + +static int __init aun_udp_initialise(void) +{ + int error; + struct sockaddr_in sin; + + skb_queue_head_init(&aun_queue); + setup_timer(&ab_cleanup_timer, ab_cleanup, 0); + ab_cleanup_timer.expires = jiffies + (HZ*2); + add_timer(&ab_cleanup_timer); + + memset(&sin, 0, sizeof(sin)); + sin.sin_port = htons(AUN_PORT); + + /* We can count ourselves lucky Acorn machines are too dim to + speak IPv6. :-) */ + if ((error = sock_create_kern(PF_INET, SOCK_DGRAM, 0, &udpsock)) < 0) + { + printk("AUN: socket error %d\n", -error); + return error; + } + + udpsock->sk->sk_reuse = 1; + udpsock->sk->sk_allocation = GFP_ATOMIC; /* we're going to call it + from interrupts */ + + error = udpsock->ops->bind(udpsock, (struct sockaddr *)&sin, + sizeof(sin)); + if (error < 0) + { + printk("AUN: bind error %d\n", -error); + goto release; + } + + udpsock->sk->sk_data_ready = aun_data_available; + + return 0; + +release: + sock_release(udpsock); + udpsock = NULL; + return error; +} +#endif + +#ifdef CONFIG_ECONET_NATIVE + +/* + * Receive an Econet frame from a device. 
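+ *
+ * Frames addressed to port EC_PORT_IP carry encapsulated IP and are
+ * passed straight to netif_rx(); anything else is matched against a
+ * listening Econet socket and queued there.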
+ */ + +static int econet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +{ + struct ec_framehdr *hdr; + struct sock *sk = NULL; + struct ec_device *edev = dev->ec_ptr; + + if (!net_eq(dev_net(dev), &init_net)) + goto drop; + + if (skb->pkt_type == PACKET_OTHERHOST) + goto drop; + + if (!edev) + goto drop; + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + return NET_RX_DROP; + + if (!pskb_may_pull(skb, sizeof(struct ec_framehdr))) + goto drop; + + hdr = (struct ec_framehdr *) skb->data; + + /* First check for encapsulated IP */ + if (hdr->port == EC_PORT_IP) { + skb->protocol = htons(ETH_P_IP); + skb_pull(skb, sizeof(struct ec_framehdr)); + netif_rx(skb); + return NET_RX_SUCCESS; + } + + sk = ec_listening_socket(hdr->port, hdr->src_stn, hdr->src_net); + if (!sk) + goto drop; + + if (ec_queue_packet(sk, skb, edev->net, hdr->src_stn, hdr->cb, + hdr->port)) + goto drop; + sock_put(sk); + return NET_RX_SUCCESS; + +drop: + if (sk) + sock_put(sk); + kfree_skb(skb); + return NET_RX_DROP; +} + +static struct packet_type econet_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_ECONET), + .func = econet_rcv, +}; + +static void econet_hw_initialise(void) +{ + dev_add_pack(&econet_packet_type); +} + +#endif + +static int econet_notifier(struct notifier_block *this, unsigned long msg, void *data) +{ + struct net_device *dev = (struct net_device *)data; + struct ec_device *edev; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + switch (msg) { + case NETDEV_UNREGISTER: + /* A device has gone down - kill any data we hold for it. */ + edev = dev->ec_ptr; + if (edev) + { + if (net2dev_map[0] == dev) + net2dev_map[0] = NULL; + net2dev_map[edev->net] = NULL; + kfree(edev); + dev->ec_ptr = NULL; + } + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block econet_netdev_notifier = { + .notifier_call =econet_notifier, +}; + +static void __exit econet_proto_exit(void) +{ +#ifdef CONFIG_ECONET_AUNUDP + del_timer(&ab_cleanup_timer); + if (udpsock) + sock_release(udpsock); +#endif + unregister_netdevice_notifier(&econet_netdev_notifier); +#ifdef CONFIG_ECONET_NATIVE + dev_remove_pack(&econet_packet_type); +#endif + sock_unregister(econet_family_ops.family); + proto_unregister(&econet_proto); +} + +static int __init econet_proto_init(void) +{ + int err = proto_register(&econet_proto, 0); + + if (err != 0) + goto out; + sock_register(&econet_family_ops); +#ifdef CONFIG_ECONET_AUNUDP + aun_udp_initialise(); +#endif +#ifdef CONFIG_ECONET_NATIVE + econet_hw_initialise(); +#endif + register_netdevice_notifier(&econet_netdev_notifier); +out: + return err; +} + +module_init(econet_proto_init); +module_exit(econet_proto_exit); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_ECONET); diff --git a/net/ethernet/Makefile b/net/ethernet/Makefile new file mode 100644 index 00000000..7cef1d8a --- /dev/null +++ b/net/ethernet/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux Ethernet layer. +# + +obj-y += eth.o +obj-$(subst m,y,$(CONFIG_IPX)) += pe2.o +obj-$(subst m,y,$(CONFIG_ATALK)) += pe2.o diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c new file mode 100644 index 00000000..2780e9b2 --- /dev/null +++ b/net/ethernet/eth.c @@ -0,0 +1,395 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Ethernet-type device handling. 
+ * + * Version: @(#)eth.c 1.0.7 05/25/93 + * + * Authors: Ross Biro + * Fred N. van Kempen, + * Mark Evans, + * Florian La Roche, + * Alan Cox, + * + * Fixes: + * Mr Linux : Arp problems + * Alan Cox : Generic queue tidyup (very tiny here) + * Alan Cox : eth_header ntohs should be htons + * Alan Cox : eth_rebuild_header missing an htons and + * minor other things. + * Tegge : Arp bug fixes. + * Florian : Removed many unnecessary functions, code cleanup + * and changes for new arp and skbuff. + * Alan Cox : Redid header building to reflect new format. + * Alan Cox : ARP only when compiled with CONFIG_INET + * Greg Page : 802.2 and SNAP stuff. + * Alan Cox : MAC layer pointers/new format. + * Paul Gortmaker : eth_copy_and_sum shouldn't csum padding. + * Alan Cox : Protect against forwarding explosions with + * older network drivers and IFF_ALLMULTI. + * Christer Weinigel : Better rebuild header message. + * Andrew Morton : 26Feb01: kill ether_setup() - use netdev_boot_setup(). + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +__setup("ether=", netdev_boot_setup); + +/** + * eth_header - create the Ethernet header + * @skb: buffer to alter + * @dev: source device + * @type: Ethernet type field + * @daddr: destination address (NULL leave destination address) + * @saddr: source address (NULL use device source address) + * @len: packet length (<= skb->len) + * + * + * Set the protocol type. For a packet of type ETH_P_802_3/2 we put the length + * in here instead. + */ +int eth_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, + const void *daddr, const void *saddr, unsigned len) +{ + struct ethhdr *eth = (struct ethhdr *)skb_push(skb, ETH_HLEN); + + if (type != ETH_P_802_3 && type != ETH_P_802_2) + eth->h_proto = htons(type); + else + eth->h_proto = htons(len); + + /* + * Set the source hardware address. + */ + + if (!saddr) + saddr = dev->dev_addr; + memcpy(eth->h_source, saddr, ETH_ALEN); + + if (daddr) { + memcpy(eth->h_dest, daddr, ETH_ALEN); + return ETH_HLEN; + } + + /* + * Anyway, the loopback-device should never use this function... + */ + + if (dev->flags & (IFF_LOOPBACK | IFF_NOARP)) { + memset(eth->h_dest, 0, ETH_ALEN); + return ETH_HLEN; + } + + return -ETH_HLEN; +} +EXPORT_SYMBOL(eth_header); + +/** + * eth_rebuild_header- rebuild the Ethernet MAC header. + * @skb: socket buffer to update + * + * This is called after an ARP or IPV6 ndisc it's resolution on this + * sk_buff. We now let protocol (ARP) fill in the other fields. + * + * This routine CANNOT use cached dst->neigh! + * Really, it is used only when dst->neigh is wrong. 
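+ *
+ * In other words, it runs after address resolution (ARP, or IPv6
+ * neighbour discovery) has completed for this skb. Only ETH_P_IP is
+ * actually resolved here (via arp_find, under CONFIG_INET); for any
+ * other protocol the device source address is filled in and a debug
+ * message is printed.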
+ */ +int eth_rebuild_header(struct sk_buff *skb) +{ + struct ethhdr *eth = (struct ethhdr *)skb->data; + struct net_device *dev = skb->dev; + + switch (eth->h_proto) { +#ifdef CONFIG_INET + case htons(ETH_P_IP): + return arp_find(eth->h_dest, skb); +#endif + default: + printk(KERN_DEBUG + "%s: unable to resolve type %X addresses.\n", + dev->name, ntohs(eth->h_proto)); + + memcpy(eth->h_source, dev->dev_addr, ETH_ALEN); + break; + } + + return 0; +} +EXPORT_SYMBOL(eth_rebuild_header); + +/** + * eth_type_trans - determine the packet's protocol ID. + * @skb: received socket data + * @dev: receiving network device + * + * The rule here is that we + * assume 802.3 if the type field is short enough to be a length. + * This is normal practice and works for any 'now in use' protocol. + */ +__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) +{ + struct ethhdr *eth; + + skb->dev = dev; + skb_reset_mac_header(skb); + skb_pull_inline(skb, ETH_HLEN); + eth = eth_hdr(skb); + + if (unlikely(is_multicast_ether_addr(eth->h_dest))) { + if (!compare_ether_addr_64bits(eth->h_dest, dev->broadcast)) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + } + + /* + * This ALLMULTI check should be redundant by 1.4 + * so don't forget to remove it. + * + * Seems, you forgot to remove it. All silly devices + * seems to set IFF_PROMISC. + */ + + else if (1 /*dev->flags&IFF_PROMISC */ ) { + if (unlikely(compare_ether_addr_64bits(eth->h_dest, dev->dev_addr))) + skb->pkt_type = PACKET_OTHERHOST; + } + + /* + * Some variants of DSA tagging don't have an ethertype field + * at all, so we check here whether one of those tagging + * variants has been configured on the receiving interface, + * and if so, set skb->protocol without looking at the packet. + */ + if (netdev_uses_dsa_tags(dev)) + return htons(ETH_P_DSA); + if (netdev_uses_trailer_tags(dev)) + return htons(ETH_P_TRAILER); + + if (ntohs(eth->h_proto) >= 1536) + return eth->h_proto; + + /* + * This is a magic hack to spot IPX packets. Older Novell breaks + * the protocol design and runs IPX over 802.3 without an 802.2 LLC + * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This + * won't work for fault tolerant netware but does for the rest. + */ + if (skb->len >= 2 && *(unsigned short *)(skb->data) == 0xFFFF) + return htons(ETH_P_802_3); + + /* + * Real 802.2 LLC + */ + return htons(ETH_P_802_2); +} +EXPORT_SYMBOL(eth_type_trans); + +/** + * eth_header_parse - extract hardware address from packet + * @skb: packet to extract header from + * @haddr: destination buffer + */ +int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr) +{ + const struct ethhdr *eth = eth_hdr(skb); + memcpy(haddr, eth->h_source, ETH_ALEN); + return ETH_ALEN; +} +EXPORT_SYMBOL(eth_header_parse); + +/** + * eth_header_cache - fill cache entry from neighbour + * @neigh: source neighbour + * @hh: destination cache entry + * Create an Ethernet header template from the neighbour. 
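+ *
+ * Returns 0 on success. ETH_P_802_3 is refused (-1): for 802.3 frames
+ * eth_header() stores the per-packet length in h_proto, so a fixed
+ * cached template cannot be built.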
+ */ +int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh) +{ + __be16 type = hh->hh_type; + struct ethhdr *eth; + const struct net_device *dev = neigh->dev; + + eth = (struct ethhdr *) + (((u8 *) hh->hh_data) + (HH_DATA_OFF(sizeof(*eth)))); + + if (type == htons(ETH_P_802_3)) + return -1; + + eth->h_proto = type; + memcpy(eth->h_source, dev->dev_addr, ETH_ALEN); + memcpy(eth->h_dest, neigh->ha, ETH_ALEN); + hh->hh_len = ETH_HLEN; + return 0; +} +EXPORT_SYMBOL(eth_header_cache); + +/** + * eth_header_cache_update - update cache entry + * @hh: destination cache entry + * @dev: network device + * @haddr: new hardware address + * + * Called by Address Resolution module to notify changes in address. + */ +void eth_header_cache_update(struct hh_cache *hh, + const struct net_device *dev, + const unsigned char *haddr) +{ + memcpy(((u8 *) hh->hh_data) + HH_DATA_OFF(sizeof(struct ethhdr)), + haddr, ETH_ALEN); +} +EXPORT_SYMBOL(eth_header_cache_update); + +/** + * eth_mac_addr - set new Ethernet hardware address + * @dev: network device + * @p: socket address + * Change hardware address of device. + * + * This doesn't change hardware matching, so needs to be overridden + * for most real devices. + */ +int eth_mac_addr(struct net_device *dev, void *p) +{ + struct sockaddr *addr = p; + + if (netif_running(dev)) + return -EBUSY; + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN); + return 0; +} +EXPORT_SYMBOL(eth_mac_addr); + +/** + * eth_change_mtu - set new MTU size + * @dev: network device + * @new_mtu: new Maximum Transfer Unit + * + * Allow changing MTU size. Needs to be overridden for devices + * supporting jumbo frames. + */ +int eth_change_mtu(struct net_device *dev, int new_mtu) +{ + if (new_mtu < 68 || new_mtu > ETH_DATA_LEN) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} +EXPORT_SYMBOL(eth_change_mtu); + +int eth_validate_addr(struct net_device *dev) +{ + if (!is_valid_ether_addr(dev->dev_addr)) + return -EADDRNOTAVAIL; + + return 0; +} +EXPORT_SYMBOL(eth_validate_addr); + +const struct header_ops eth_header_ops ____cacheline_aligned = { + .create = eth_header, + .parse = eth_header_parse, + .rebuild = eth_rebuild_header, + .cache = eth_header_cache, + .cache_update = eth_header_cache_update, +}; + +/** + * ether_setup - setup Ethernet network device + * @dev: network device + * Fill in the fields of the device structure with Ethernet-generic values. + */ +void ether_setup(struct net_device *dev) +{ + dev->header_ops = ð_header_ops; + dev->type = ARPHRD_ETHER; + dev->hard_header_len = ETH_HLEN; + dev->mtu = ETH_DATA_LEN; + dev->addr_len = ETH_ALEN; + dev->tx_queue_len = 1000; /* Ethernet wants good queues */ + dev->flags = IFF_BROADCAST|IFF_MULTICAST; + dev->priv_flags = IFF_TX_SKB_SHARING; + + memset(dev->broadcast, 0xFF, ETH_ALEN); + +} +EXPORT_SYMBOL(ether_setup); + +/** + * alloc_etherdev_mqs - Allocates and sets up an Ethernet device + * @sizeof_priv: Size of additional driver-private structure to be allocated + * for this Ethernet device + * @txqs: The number of TX queues this device has. + * @rxqs: The number of RX queues this device has. + * + * Fill in the fields of the device structure with Ethernet-generic + * values. Basically does everything except registering the device. + * + * Constructs a new net device, complete with a private data area of + * size (sizeof_priv). A 32-byte (not bit) alignment is enforced for + * this private data area. 
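+ *
+ * Illustrative use only (struct my_priv is a made-up driver-private
+ * structure, not something defined here):
+ *
+ *	dev = alloc_etherdev_mqs(sizeof(struct my_priv), 1, 1);
+ *	if (!dev)
+ *		return -ENOMEM;
+ *	priv = netdev_priv(dev);
+ *
+ * netdev_priv() returns the private area allocated alongside the
+ * device structure.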
+ */ + +struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, + unsigned int rxqs) +{ + return alloc_netdev_mqs(sizeof_priv, "eth%d", ether_setup, txqs, rxqs); +} +EXPORT_SYMBOL(alloc_etherdev_mqs); + +static size_t _format_mac_addr(char *buf, int buflen, + const unsigned char *addr, int len) +{ + int i; + char *cp = buf; + + for (i = 0; i < len; i++) { + cp += scnprintf(cp, buflen - (cp - buf), "%02x", addr[i]); + if (i == len - 1) + break; + cp += scnprintf(cp, buflen - (cp - buf), ":"); + } + return cp - buf; +} + +ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) +{ + size_t l; + + l = _format_mac_addr(buf, PAGE_SIZE, addr, len); + l += scnprintf(buf + l, PAGE_SIZE - l, "\n"); + return (ssize_t)l; +} +EXPORT_SYMBOL(sysfs_format_mac); diff --git a/net/ethernet/pe2.c b/net/ethernet/pe2.c new file mode 100644 index 00000000..85d574ad --- /dev/null +++ b/net/ethernet/pe2.c @@ -0,0 +1,37 @@ +#include +#include +#include +#include +#include +#include + +#include + +static int pEII_request(struct datalink_proto *dl, + struct sk_buff *skb, unsigned char *dest_node) +{ + struct net_device *dev = skb->dev; + + skb->protocol = htons(ETH_P_IPX); + dev_hard_header(skb, dev, ETH_P_IPX, dest_node, NULL, skb->len); + return dev_queue_xmit(skb); +} + +struct datalink_proto *make_EII_client(void) +{ + struct datalink_proto *proto = kmalloc(sizeof(*proto), GFP_ATOMIC); + + if (proto) { + proto->header_length = 0; + proto->request = pEII_request; + } + + return proto; +} +EXPORT_SYMBOL(make_EII_client); + +void destroy_EII_client(struct datalink_proto *dl) +{ + kfree(dl); +} +EXPORT_SYMBOL(destroy_EII_client); diff --git a/net/ieee802154/Kconfig b/net/ieee802154/Kconfig new file mode 100644 index 00000000..1c1de97d --- /dev/null +++ b/net/ieee802154/Kconfig @@ -0,0 +1,12 @@ +config IEEE802154 + tristate "IEEE Std 802.15.4 Low-Rate Wireless Personal Area Networks support (EXPERIMENTAL)" + depends on EXPERIMENTAL + ---help--- + IEEE Std 802.15.4 defines a low data rate, low power and low + complexity short range wireless personal area networks. It was + designed to organise networks of sensors, switches, etc automation + devices. Maximum allowed data rate is 250 kb/s and typical personal + operating space around 10m. + + Say Y here to compile LR-WPAN support into the kernel or say M to + compile it as modules. diff --git a/net/ieee802154/Makefile b/net/ieee802154/Makefile new file mode 100644 index 00000000..5761185f --- /dev/null +++ b/net/ieee802154/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_IEEE802154) += ieee802154.o af_802154.o +ieee802154-y := netlink.o nl-mac.o nl-phy.o nl_policy.o wpan-class.o +af_802154-y := af_ieee802154.o raw.o dgram.o diff --git a/net/ieee802154/af802154.h b/net/ieee802154/af802154.h new file mode 100644 index 00000000..b1ec5253 --- /dev/null +++ b/net/ieee802154/af802154.h @@ -0,0 +1,36 @@ +/* + * Internal interfaces for ieee 802.15.4 address family. + * + * Copyright 2007, 2008, 2009 Siemens AG + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Written by: + * Sergey Lapin + * Dmitry Eremin-Solenikov + */ + +#ifndef AF802154_H +#define AF802154_H + +struct sk_buff; +struct net_devce; +extern struct proto ieee802154_raw_prot; +extern struct proto ieee802154_dgram_prot; +void ieee802154_raw_deliver(struct net_device *dev, struct sk_buff *skb); +int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb); +struct net_device *ieee802154_get_dev(struct net *net, + struct ieee802154_addr *addr); + +#endif diff --git a/net/ieee802154/af_ieee802154.c b/net/ieee802154/af_ieee802154.c new file mode 100644 index 00000000..6df6ecf4 --- /dev/null +++ b/net/ieee802154/af_ieee802154.c @@ -0,0 +1,373 @@ +/* + * IEEE802154.4 socket interface + * + * Copyright 2007, 2008 Siemens AG + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Written by: + * Sergey Lapin + * Maxim Gorbachyov + */ + +#include +#include +#include +#include +#include +#include /* For TIOCOUTQ/INQ */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "af802154.h" + +/* + * Utility function for families + */ +struct net_device *ieee802154_get_dev(struct net *net, + struct ieee802154_addr *addr) +{ + struct net_device *dev = NULL; + struct net_device *tmp; + u16 pan_id, short_addr; + + switch (addr->addr_type) { + case IEEE802154_ADDR_LONG: + rcu_read_lock(); + dev = dev_getbyhwaddr_rcu(net, ARPHRD_IEEE802154, addr->hwaddr); + if (dev) + dev_hold(dev); + rcu_read_unlock(); + break; + case IEEE802154_ADDR_SHORT: + if (addr->pan_id == 0xffff || + addr->short_addr == IEEE802154_ADDR_UNDEF || + addr->short_addr == 0xffff) + break; + + rtnl_lock(); + + for_each_netdev(net, tmp) { + if (tmp->type != ARPHRD_IEEE802154) + continue; + + pan_id = ieee802154_mlme_ops(tmp)->get_pan_id(tmp); + short_addr = + ieee802154_mlme_ops(tmp)->get_short_addr(tmp); + + if (pan_id == addr->pan_id && + short_addr == addr->short_addr) { + dev = tmp; + dev_hold(dev); + break; + } + } + + rtnl_unlock(); + break; + default: + pr_warning("Unsupported ieee802154 address type: %d\n", + addr->addr_type); + break; + } + + return dev; +} + +static int ieee802154_sock_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (sk) { + sock->sk = NULL; + sk->sk_prot->close(sk, 0); + } + return 0; +} +static int ieee802154_sock_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + + return sk->sk_prot->sendmsg(iocb, sk, msg, len); +} + +static int ieee802154_sock_bind(struct socket *sock, struct sockaddr *uaddr, + int addr_len) +{ + struct sock *sk = sock->sk; + + if (sk->sk_prot->bind) + return sk->sk_prot->bind(sk, uaddr, addr_len); + + return 
sock_no_bind(sock, uaddr, addr_len); +} + +static int ieee802154_sock_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + + if (addr_len < sizeof(uaddr->sa_family)) + return -EINVAL; + + if (uaddr->sa_family == AF_UNSPEC) + return sk->sk_prot->disconnect(sk, flags); + + return sk->sk_prot->connect(sk, uaddr, addr_len); +} + +static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg, + unsigned int cmd) +{ + struct ifreq ifr; + int ret = -ENOIOCTLCMD; + struct net_device *dev; + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return -EFAULT; + + ifr.ifr_name[IFNAMSIZ-1] = 0; + + dev_load(sock_net(sk), ifr.ifr_name); + dev = dev_get_by_name(sock_net(sk), ifr.ifr_name); + + if (!dev) + return -ENODEV; + + if (dev->type == ARPHRD_IEEE802154 && dev->netdev_ops->ndo_do_ioctl) + ret = dev->netdev_ops->ndo_do_ioctl(dev, &ifr, cmd); + + if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq))) + ret = -EFAULT; + dev_put(dev); + + return ret; +} + +static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + struct sock *sk = sock->sk; + + switch (cmd) { + case SIOCGSTAMP: + return sock_get_timestamp(sk, (struct timeval __user *)arg); + case SIOCGSTAMPNS: + return sock_get_timestampns(sk, (struct timespec __user *)arg); + case SIOCGIFADDR: + case SIOCSIFADDR: + return ieee802154_dev_ioctl(sk, (struct ifreq __user *)arg, + cmd); + default: + if (!sk->sk_prot->ioctl) + return -ENOIOCTLCMD; + return sk->sk_prot->ioctl(sk, cmd, arg); + } +} + +static const struct proto_ops ieee802154_raw_ops = { + .family = PF_IEEE802154, + .owner = THIS_MODULE, + .release = ieee802154_sock_release, + .bind = ieee802154_sock_bind, + .connect = ieee802154_sock_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = datagram_poll, + .ioctl = ieee802154_sock_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = ieee802154_sock_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, +#endif +}; + +static const struct proto_ops ieee802154_dgram_ops = { + .family = PF_IEEE802154, + .owner = THIS_MODULE, + .release = ieee802154_sock_release, + .bind = ieee802154_sock_bind, + .connect = ieee802154_sock_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = datagram_poll, + .ioctl = ieee802154_sock_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = ieee802154_sock_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, +#endif +}; + + +/* + * Create a socket. Initialise the socket, blank the addresses + * set the state. 
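+ *
+ * SOCK_RAW sockets are backed by ieee802154_raw_prot/_raw_ops and
+ * SOCK_DGRAM sockets by ieee802154_dgram_prot/_dgram_ops; any other
+ * type is refused with -ESOCKTNOSUPPORT.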
+ */ +static int ieee802154_create(struct net *net, struct socket *sock, + int protocol, int kern) +{ + struct sock *sk; + int rc; + struct proto *proto; + const struct proto_ops *ops; + + if (!net_eq(net, &init_net)) + return -EAFNOSUPPORT; + + switch (sock->type) { + case SOCK_RAW: + proto = &ieee802154_raw_prot; + ops = &ieee802154_raw_ops; + break; + case SOCK_DGRAM: + proto = &ieee802154_dgram_prot; + ops = &ieee802154_dgram_ops; + break; + default: + rc = -ESOCKTNOSUPPORT; + goto out; + } + + rc = -ENOMEM; + sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto); + if (!sk) + goto out; + rc = 0; + + sock->ops = ops; + + sock_init_data(sock, sk); + /* FIXME: sk->sk_destruct */ + sk->sk_family = PF_IEEE802154; + + /* Checksums on by default */ + sock_set_flag(sk, SOCK_ZAPPED); + + if (sk->sk_prot->hash) + sk->sk_prot->hash(sk); + + if (sk->sk_prot->init) { + rc = sk->sk_prot->init(sk); + if (rc) + sk_common_release(sk); + } +out: + return rc; +} + +static const struct net_proto_family ieee802154_family_ops = { + .family = PF_IEEE802154, + .create = ieee802154_create, + .owner = THIS_MODULE, +}; + +static int ieee802154_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + if (!netif_running(dev)) + return -ENODEV; + pr_debug("got frame, type %d, dev %p\n", dev->type, dev); +#ifdef DEBUG + print_hex_dump_bytes("ieee802154_rcv ", DUMP_PREFIX_NONE, skb->data, skb->len); +#endif + + if (!net_eq(dev_net(dev), &init_net)) + goto drop; + + ieee802154_raw_deliver(dev, skb); + + if (dev->type != ARPHRD_IEEE802154) + goto drop; + + if (skb->pkt_type != PACKET_OTHERHOST) + return ieee802154_dgram_deliver(dev, skb); + +drop: + kfree_skb(skb); + return NET_RX_DROP; +} + + +static struct packet_type ieee802154_packet_type = { + .type = __constant_htons(ETH_P_IEEE802154), + .func = ieee802154_rcv, +}; + +static int __init af_ieee802154_init(void) +{ + int rc = -EINVAL; + + rc = proto_register(&ieee802154_raw_prot, 1); + if (rc) + goto out; + + rc = proto_register(&ieee802154_dgram_prot, 1); + if (rc) + goto err_dgram; + + /* Tell SOCKET that we are alive */ + rc = sock_register(&ieee802154_family_ops); + if (rc) + goto err_sock; + dev_add_pack(&ieee802154_packet_type); + + rc = 0; + goto out; + +err_sock: + proto_unregister(&ieee802154_dgram_prot); +err_dgram: + proto_unregister(&ieee802154_raw_prot); +out: + return rc; +} +static void __exit af_ieee802154_remove(void) +{ + dev_remove_pack(&ieee802154_packet_type); + sock_unregister(PF_IEEE802154); + proto_unregister(&ieee802154_dgram_prot); + proto_unregister(&ieee802154_raw_prot); +} + +module_init(af_ieee802154_init); +module_exit(af_ieee802154_remove); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_IEEE802154); diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c new file mode 100644 index 00000000..1a3334c2 --- /dev/null +++ b/net/ieee802154/dgram.c @@ -0,0 +1,461 @@ +/* + * ZigBee socket interface + * + * Copyright 2007, 2008 Siemens AG + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Written by: + * Sergey Lapin + * Dmitry Eremin-Solenikov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "af802154.h" + +static HLIST_HEAD(dgram_head); +static DEFINE_RWLOCK(dgram_lock); + +struct dgram_sock { + struct sock sk; + + struct ieee802154_addr src_addr; + struct ieee802154_addr dst_addr; + + unsigned bound:1; + unsigned want_ack:1; +}; + +static inline struct dgram_sock *dgram_sk(const struct sock *sk) +{ + return container_of(sk, struct dgram_sock, sk); +} + +static void dgram_hash(struct sock *sk) +{ + write_lock_bh(&dgram_lock); + sk_add_node(sk, &dgram_head); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + write_unlock_bh(&dgram_lock); +} + +static void dgram_unhash(struct sock *sk) +{ + write_lock_bh(&dgram_lock); + if (sk_del_node_init(sk)) + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + write_unlock_bh(&dgram_lock); +} + +static int dgram_init(struct sock *sk) +{ + struct dgram_sock *ro = dgram_sk(sk); + + ro->dst_addr.addr_type = IEEE802154_ADDR_LONG; + ro->dst_addr.pan_id = 0xffff; + ro->want_ack = 1; + memset(&ro->dst_addr.hwaddr, 0xff, sizeof(ro->dst_addr.hwaddr)); + return 0; +} + +static void dgram_close(struct sock *sk, long timeout) +{ + sk_common_release(sk); +} + +static int dgram_bind(struct sock *sk, struct sockaddr *uaddr, int len) +{ + struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr; + struct dgram_sock *ro = dgram_sk(sk); + int err = -EINVAL; + struct net_device *dev; + + lock_sock(sk); + + ro->bound = 0; + + if (len < sizeof(*addr)) + goto out; + + if (addr->family != AF_IEEE802154) + goto out; + + dev = ieee802154_get_dev(sock_net(sk), &addr->addr); + if (!dev) { + err = -ENODEV; + goto out; + } + + if (dev->type != ARPHRD_IEEE802154) { + err = -ENODEV; + goto out_put; + } + + memcpy(&ro->src_addr, &addr->addr, sizeof(struct ieee802154_addr)); + + ro->bound = 1; + err = 0; +out_put: + dev_put(dev); +out: + release_sock(sk); + + return err; +} + +static int dgram_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + switch (cmd) { + case SIOCOUTQ: + { + int amount = sk_wmem_alloc_get(sk); + + return put_user(amount, (int __user *)arg); + } + + case SIOCINQ: + { + struct sk_buff *skb; + unsigned long amount; + + amount = 0; + spin_lock_bh(&sk->sk_receive_queue.lock); + skb = skb_peek(&sk->sk_receive_queue); + if (skb != NULL) { + /* + * We will only return the amount + * of this packet since that is all + * that will be read. 
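+			 *
+			 * The 3+8+8 subtracted below is a rough guess
+			 * at the MAC header size (frame control and
+			 * sequence number plus two extended addresses);
+			 * the FIXME notes it should really be parsed
+			 * from the frame instead.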
+ */ + /* FIXME: parse the header for more correct value */ + amount = skb->len - (3+8+8); + } + spin_unlock_bh(&sk->sk_receive_queue.lock); + return put_user(amount, (int __user *)arg); + } + + } + return -ENOIOCTLCMD; +} + +/* FIXME: autobind */ +static int dgram_connect(struct sock *sk, struct sockaddr *uaddr, + int len) +{ + struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr; + struct dgram_sock *ro = dgram_sk(sk); + int err = 0; + + if (len < sizeof(*addr)) + return -EINVAL; + + if (addr->family != AF_IEEE802154) + return -EINVAL; + + lock_sock(sk); + + if (!ro->bound) { + err = -ENETUNREACH; + goto out; + } + + memcpy(&ro->dst_addr, &addr->addr, sizeof(struct ieee802154_addr)); + +out: + release_sock(sk); + return err; +} + +static int dgram_disconnect(struct sock *sk, int flags) +{ + struct dgram_sock *ro = dgram_sk(sk); + + lock_sock(sk); + + ro->dst_addr.addr_type = IEEE802154_ADDR_LONG; + memset(&ro->dst_addr.hwaddr, 0xff, sizeof(ro->dst_addr.hwaddr)); + + release_sock(sk); + + return 0; +} + +static int dgram_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t size) +{ + struct net_device *dev; + unsigned mtu; + struct sk_buff *skb; + struct dgram_sock *ro = dgram_sk(sk); + int err; + + if (msg->msg_flags & MSG_OOB) { + pr_debug("msg->msg_flags = 0x%x\n", msg->msg_flags); + return -EOPNOTSUPP; + } + + if (!ro->bound) + dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154); + else + dev = ieee802154_get_dev(sock_net(sk), &ro->src_addr); + + if (!dev) { + pr_debug("no dev\n"); + err = -ENXIO; + goto out; + } + mtu = dev->mtu; + pr_debug("name = %s, mtu = %u\n", dev->name, mtu); + + skb = sock_alloc_send_skb(sk, LL_ALLOCATED_SPACE(dev) + size, + msg->msg_flags & MSG_DONTWAIT, + &err); + if (!skb) + goto out_dev; + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + + skb_reset_network_header(skb); + + mac_cb(skb)->flags = IEEE802154_FC_TYPE_DATA; + if (ro->want_ack) + mac_cb(skb)->flags |= MAC_CB_FLAG_ACKREQ; + + mac_cb(skb)->seq = ieee802154_mlme_ops(dev)->get_dsn(dev); + err = dev_hard_header(skb, dev, ETH_P_IEEE802154, &ro->dst_addr, + ro->bound ? &ro->src_addr : NULL, size); + if (err < 0) + goto out_skb; + + skb_reset_mac_header(skb); + + err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); + if (err < 0) + goto out_skb; + + if (size > mtu) { + pr_debug("size = %Zu, mtu = %u\n", size, mtu); + err = -EINVAL; + goto out_skb; + } + + skb->dev = dev; + skb->sk = sk; + skb->protocol = htons(ETH_P_IEEE802154); + + dev_put(dev); + + err = dev_queue_xmit(skb); + if (err > 0) + err = net_xmit_errno(err); + + return err ?: size; + +out_skb: + kfree_skb(skb); +out_dev: + dev_put(dev); +out: + return err; +} + +static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len, int noblock, int flags, + int *addr_len) +{ + size_t copied = 0; + int err = -EOPNOTSUPP; + struct sk_buff *skb; + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; + + copied = skb->len; + if (len < copied) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + + /* FIXME: skip headers if necessary ?! 
*/ + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto done; + + sock_recv_ts_and_drops(msg, sk, skb); + + if (flags & MSG_TRUNC) + copied = skb->len; +done: + skb_free_datagram(sk, skb); +out: + if (err) + return err; + return copied; +} + +static int dgram_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + if (sock_queue_rcv_skb(sk, skb) < 0) { + kfree_skb(skb); + return NET_RX_DROP; + } + + return NET_RX_SUCCESS; +} + +static inline int ieee802154_match_sock(u8 *hw_addr, u16 pan_id, + u16 short_addr, struct dgram_sock *ro) +{ + if (!ro->bound) + return 1; + + if (ro->src_addr.addr_type == IEEE802154_ADDR_LONG && + !memcmp(ro->src_addr.hwaddr, hw_addr, IEEE802154_ADDR_LEN)) + return 1; + + if (ro->src_addr.addr_type == IEEE802154_ADDR_SHORT && + pan_id == ro->src_addr.pan_id && + short_addr == ro->src_addr.short_addr) + return 1; + + return 0; +} + +int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb) +{ + struct sock *sk, *prev = NULL; + struct hlist_node *node; + int ret = NET_RX_SUCCESS; + u16 pan_id, short_addr; + + /* Data frame processing */ + BUG_ON(dev->type != ARPHRD_IEEE802154); + + pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); + short_addr = ieee802154_mlme_ops(dev)->get_short_addr(dev); + + read_lock(&dgram_lock); + sk_for_each(sk, node, &dgram_head) { + if (ieee802154_match_sock(dev->dev_addr, pan_id, short_addr, + dgram_sk(sk))) { + if (prev) { + struct sk_buff *clone; + clone = skb_clone(skb, GFP_ATOMIC); + if (clone) + dgram_rcv_skb(prev, clone); + } + + prev = sk; + } + } + + if (prev) + dgram_rcv_skb(prev, skb); + else { + kfree_skb(skb); + ret = NET_RX_DROP; + } + read_unlock(&dgram_lock); + + return ret; +} + +static int dgram_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct dgram_sock *ro = dgram_sk(sk); + + int val, len; + + if (level != SOL_IEEE802154) + return -EOPNOTSUPP; + + if (get_user(len, optlen)) + return -EFAULT; + + len = min_t(unsigned int, len, sizeof(int)); + + switch (optname) { + case WPAN_WANTACK: + val = ro->want_ack; + break; + default: + return -ENOPROTOOPT; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + return 0; +} + +static int dgram_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct dgram_sock *ro = dgram_sk(sk); + int val; + int err = 0; + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + lock_sock(sk); + + switch (optname) { + case WPAN_WANTACK: + ro->want_ack = !!val; + break; + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +struct proto ieee802154_dgram_prot = { + .name = "IEEE-802.15.4-MAC", + .owner = THIS_MODULE, + .obj_size = sizeof(struct dgram_sock), + .init = dgram_init, + .close = dgram_close, + .bind = dgram_bind, + .sendmsg = dgram_sendmsg, + .recvmsg = dgram_recvmsg, + .hash = dgram_hash, + .unhash = dgram_unhash, + .connect = dgram_connect, + .disconnect = dgram_disconnect, + .ioctl = dgram_ioctl, + .getsockopt = dgram_getsockopt, + .setsockopt = dgram_setsockopt, +}; + diff --git a/net/ieee802154/ieee802154.h b/net/ieee802154/ieee802154.h new file mode 100644 index 00000000..aadec428 --- /dev/null +++ b/net/ieee802154/ieee802154.h @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2007, 2008, 2009 Siemens AG + * + * This program is free software; you can redistribute it and/or modify + * it under the terms 
of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ +#ifndef IEEE_802154_LOCAL_H +#define IEEE_802154_LOCAL_H + +int __init ieee802154_nl_init(void); +void __exit ieee802154_nl_exit(void); + +#define IEEE802154_OP(_cmd, _func) \ + { \ + .cmd = _cmd, \ + .policy = ieee802154_policy, \ + .doit = _func, \ + .dumpit = NULL, \ + .flags = GENL_ADMIN_PERM, \ + } + +#define IEEE802154_DUMP(_cmd, _func, _dump) \ + { \ + .cmd = _cmd, \ + .policy = ieee802154_policy, \ + .doit = _func, \ + .dumpit = _dump, \ + } + +struct genl_info; + +struct sk_buff *ieee802154_nl_create(int flags, u8 req); +int ieee802154_nl_mcast(struct sk_buff *msg, unsigned int group); +struct sk_buff *ieee802154_nl_new_reply(struct genl_info *info, + int flags, u8 req); +int ieee802154_nl_reply(struct sk_buff *msg, struct genl_info *info); + +extern struct genl_family nl802154_family; +int nl802154_mac_register(void); +int nl802154_phy_register(void); + +#endif diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c new file mode 100644 index 00000000..c8097ae2 --- /dev/null +++ b/net/ieee802154/netlink.c @@ -0,0 +1,139 @@ +/* + * Netlink inteface for IEEE 802.15.4 stack + * + * Copyright 2007, 2008 Siemens AG + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Written by: + * Sergey Lapin + * Dmitry Eremin-Solenikov + * Maxim Osipov + */ + +#include +#include +#include +#include + +#include "ieee802154.h" + +static unsigned int ieee802154_seq_num; +static DEFINE_SPINLOCK(ieee802154_seq_lock); + +struct genl_family nl802154_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = IEEE802154_NL_NAME, + .version = 1, + .maxattr = IEEE802154_ATTR_MAX, +}; + +/* Requests to userspace */ +struct sk_buff *ieee802154_nl_create(int flags, u8 req) +{ + void *hdr; + struct sk_buff *msg = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); + unsigned long f; + + if (!msg) + return NULL; + + spin_lock_irqsave(&ieee802154_seq_lock, f); + hdr = genlmsg_put(msg, 0, ieee802154_seq_num++, + &nl802154_family, flags, req); + spin_unlock_irqrestore(&ieee802154_seq_lock, f); + if (!hdr) { + nlmsg_free(msg); + return NULL; + } + + return msg; +} + +int ieee802154_nl_mcast(struct sk_buff *msg, unsigned int group) +{ + /* XXX: nlh is right at the start of msg */ + void *hdr = genlmsg_data(NLMSG_DATA(msg->data)); + + if (genlmsg_end(msg, hdr) < 0) + goto out; + + return genlmsg_multicast(msg, 0, group, GFP_ATOMIC); +out: + nlmsg_free(msg); + return -ENOBUFS; +} + +struct sk_buff *ieee802154_nl_new_reply(struct genl_info *info, + int flags, u8 req) +{ + void *hdr; + struct sk_buff *msg = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); + + if (!msg) + return NULL; + + hdr = genlmsg_put_reply(msg, info, + &nl802154_family, flags, req); + if (!hdr) { + nlmsg_free(msg); + return NULL; + } + + return msg; +} + +int ieee802154_nl_reply(struct sk_buff *msg, struct genl_info *info) +{ + /* XXX: nlh is right at the start of msg */ + void *hdr = genlmsg_data(NLMSG_DATA(msg->data)); + + if (genlmsg_end(msg, hdr) < 0) + goto out; + + return genlmsg_reply(msg, info); +out: + nlmsg_free(msg); + return -ENOBUFS; +} + +int __init ieee802154_nl_init(void) +{ + int rc; + + rc = genl_register_family(&nl802154_family); + if (rc) + goto fail; + + rc = nl802154_mac_register(); + if (rc) + goto fail; + + rc = nl802154_phy_register(); + if (rc) + goto fail; + + return 0; + +fail: + genl_unregister_family(&nl802154_family); + return rc; +} + +void __exit ieee802154_nl_exit(void) +{ + genl_unregister_family(&nl802154_family); +} + diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c new file mode 100644 index 00000000..71ee1108 --- /dev/null +++ b/net/ieee802154/nl-mac.c @@ -0,0 +1,618 @@ +/* + * Netlink inteface for IEEE 802.15.4 stack + * + * Copyright 2007, 2008 Siemens AG + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Written by: + * Sergey Lapin + * Dmitry Eremin-Solenikov + * Maxim Osipov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ieee802154.h" + +static struct genl_multicast_group ieee802154_coord_mcgrp = { + .name = IEEE802154_MCAST_COORD_NAME, +}; + +static struct genl_multicast_group ieee802154_beacon_mcgrp = { + .name = IEEE802154_MCAST_BEACON_NAME, +}; + +int ieee802154_nl_assoc_indic(struct net_device *dev, + struct ieee802154_addr *addr, u8 cap) +{ + struct sk_buff *msg; + + pr_debug("%s\n", __func__); + + if (addr->addr_type != IEEE802154_ADDR_LONG) { + pr_err("%s: received non-long source address!\n", __func__); + return -EINVAL; + } + + msg = ieee802154_nl_create(0, IEEE802154_ASSOCIATE_INDIC); + if (!msg) + return -ENOBUFS; + + NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name); + NLA_PUT_U32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex); + NLA_PUT(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN, + dev->dev_addr); + + NLA_PUT(msg, IEEE802154_ATTR_SRC_HW_ADDR, IEEE802154_ADDR_LEN, + addr->hwaddr); + + NLA_PUT_U8(msg, IEEE802154_ATTR_CAPABILITY, cap); + + return ieee802154_nl_mcast(msg, ieee802154_coord_mcgrp.id); + +nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} +EXPORT_SYMBOL(ieee802154_nl_assoc_indic); + +int ieee802154_nl_assoc_confirm(struct net_device *dev, u16 short_addr, + u8 status) +{ + struct sk_buff *msg; + + pr_debug("%s\n", __func__); + + msg = ieee802154_nl_create(0, IEEE802154_ASSOCIATE_CONF); + if (!msg) + return -ENOBUFS; + + NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name); + NLA_PUT_U32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex); + NLA_PUT(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN, + dev->dev_addr); + + NLA_PUT_U16(msg, IEEE802154_ATTR_SHORT_ADDR, short_addr); + NLA_PUT_U8(msg, IEEE802154_ATTR_STATUS, status); + + return ieee802154_nl_mcast(msg, ieee802154_coord_mcgrp.id); + +nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} +EXPORT_SYMBOL(ieee802154_nl_assoc_confirm); + +int ieee802154_nl_disassoc_indic(struct net_device *dev, + struct ieee802154_addr *addr, u8 reason) +{ + struct sk_buff *msg; + + pr_debug("%s\n", __func__); + + msg = ieee802154_nl_create(0, IEEE802154_DISASSOCIATE_INDIC); + if (!msg) + return -ENOBUFS; + + NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name); + NLA_PUT_U32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex); + NLA_PUT(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN, + dev->dev_addr); + + if (addr->addr_type == IEEE802154_ADDR_LONG) + NLA_PUT(msg, IEEE802154_ATTR_SRC_HW_ADDR, IEEE802154_ADDR_LEN, + addr->hwaddr); + else + NLA_PUT_U16(msg, IEEE802154_ATTR_SRC_SHORT_ADDR, + addr->short_addr); + + NLA_PUT_U8(msg, IEEE802154_ATTR_REASON, reason); + + return ieee802154_nl_mcast(msg, ieee802154_coord_mcgrp.id); + +nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} +EXPORT_SYMBOL(ieee802154_nl_disassoc_indic); + +int ieee802154_nl_disassoc_confirm(struct net_device *dev, u8 status) +{ + struct sk_buff *msg; + + pr_debug("%s\n", __func__); + + msg = ieee802154_nl_create(0, IEEE802154_DISASSOCIATE_CONF); + if (!msg) + return -ENOBUFS; + + NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name); + NLA_PUT_U32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex); + NLA_PUT(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN, + dev->dev_addr); + + NLA_PUT_U8(msg, IEEE802154_ATTR_STATUS, status); + + return ieee802154_nl_mcast(msg, ieee802154_coord_mcgrp.id); + +nla_put_failure: + 
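	/* The NLA_PUT* macros above jump to this label when an
+	 * attribute does not fit in the message; drop the partially
+	 * built skb. */
+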
nlmsg_free(msg); + return -ENOBUFS; +} +EXPORT_SYMBOL(ieee802154_nl_disassoc_confirm); + +int ieee802154_nl_beacon_indic(struct net_device *dev, + u16 panid, u16 coord_addr) +{ + struct sk_buff *msg; + + pr_debug("%s\n", __func__); + + msg = ieee802154_nl_create(0, IEEE802154_BEACON_NOTIFY_INDIC); + if (!msg) + return -ENOBUFS; + + NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name); + NLA_PUT_U32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex); + NLA_PUT(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN, + dev->dev_addr); + NLA_PUT_U16(msg, IEEE802154_ATTR_COORD_SHORT_ADDR, coord_addr); + NLA_PUT_U16(msg, IEEE802154_ATTR_COORD_PAN_ID, panid); + + return ieee802154_nl_mcast(msg, ieee802154_coord_mcgrp.id); + +nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} +EXPORT_SYMBOL(ieee802154_nl_beacon_indic); + +int ieee802154_nl_scan_confirm(struct net_device *dev, + u8 status, u8 scan_type, u32 unscanned, u8 page, + u8 *edl/* , struct list_head *pan_desc_list */) +{ + struct sk_buff *msg; + + pr_debug("%s\n", __func__); + + msg = ieee802154_nl_create(0, IEEE802154_SCAN_CONF); + if (!msg) + return -ENOBUFS; + + NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name); + NLA_PUT_U32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex); + NLA_PUT(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN, + dev->dev_addr); + + NLA_PUT_U8(msg, IEEE802154_ATTR_STATUS, status); + NLA_PUT_U8(msg, IEEE802154_ATTR_SCAN_TYPE, scan_type); + NLA_PUT_U32(msg, IEEE802154_ATTR_CHANNELS, unscanned); + NLA_PUT_U8(msg, IEEE802154_ATTR_PAGE, page); + + if (edl) + NLA_PUT(msg, IEEE802154_ATTR_ED_LIST, 27, edl); + + return ieee802154_nl_mcast(msg, ieee802154_coord_mcgrp.id); + +nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} +EXPORT_SYMBOL(ieee802154_nl_scan_confirm); + +int ieee802154_nl_start_confirm(struct net_device *dev, u8 status) +{ + struct sk_buff *msg; + + pr_debug("%s\n", __func__); + + msg = ieee802154_nl_create(0, IEEE802154_START_CONF); + if (!msg) + return -ENOBUFS; + + NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name); + NLA_PUT_U32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex); + NLA_PUT(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN, + dev->dev_addr); + + NLA_PUT_U8(msg, IEEE802154_ATTR_STATUS, status); + + return ieee802154_nl_mcast(msg, ieee802154_coord_mcgrp.id); + +nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} +EXPORT_SYMBOL(ieee802154_nl_start_confirm); + +static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 pid, + u32 seq, int flags, struct net_device *dev) +{ + void *hdr; + struct wpan_phy *phy; + + pr_debug("%s\n", __func__); + + hdr = genlmsg_put(msg, 0, seq, &nl802154_family, flags, + IEEE802154_LIST_IFACE); + if (!hdr) + goto out; + + phy = ieee802154_mlme_ops(dev)->get_phy(dev); + BUG_ON(!phy); + + NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name); + NLA_PUT_STRING(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)); + NLA_PUT_U32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex); + + NLA_PUT(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN, + dev->dev_addr); + NLA_PUT_U16(msg, IEEE802154_ATTR_SHORT_ADDR, + ieee802154_mlme_ops(dev)->get_short_addr(dev)); + NLA_PUT_U16(msg, IEEE802154_ATTR_PAN_ID, + ieee802154_mlme_ops(dev)->get_pan_id(dev)); + wpan_phy_put(phy); + return genlmsg_end(msg, hdr); + +nla_put_failure: + wpan_phy_put(phy); + genlmsg_cancel(msg, hdr); +out: + return -EMSGSIZE; +} + +/* Requests from userspace */ +static struct net_device *ieee802154_nl_get_dev(struct genl_info *info) +{ + struct net_device *dev; + + if 
(info->attrs[IEEE802154_ATTR_DEV_NAME]) { + char name[IFNAMSIZ + 1]; + nla_strlcpy(name, info->attrs[IEEE802154_ATTR_DEV_NAME], + sizeof(name)); + dev = dev_get_by_name(&init_net, name); + } else if (info->attrs[IEEE802154_ATTR_DEV_INDEX]) + dev = dev_get_by_index(&init_net, + nla_get_u32(info->attrs[IEEE802154_ATTR_DEV_INDEX])); + else + return NULL; + + if (!dev) + return NULL; + + if (dev->type != ARPHRD_IEEE802154) { + dev_put(dev); + return NULL; + } + + return dev; +} + +static int ieee802154_associate_req(struct sk_buff *skb, + struct genl_info *info) +{ + struct net_device *dev; + struct ieee802154_addr addr; + u8 page; + int ret = -EINVAL; + + if (!info->attrs[IEEE802154_ATTR_CHANNEL] || + !info->attrs[IEEE802154_ATTR_COORD_PAN_ID] || + (!info->attrs[IEEE802154_ATTR_COORD_HW_ADDR] && + !info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR]) || + !info->attrs[IEEE802154_ATTR_CAPABILITY]) + return -EINVAL; + + dev = ieee802154_nl_get_dev(info); + if (!dev) + return -ENODEV; + + if (info->attrs[IEEE802154_ATTR_COORD_HW_ADDR]) { + addr.addr_type = IEEE802154_ADDR_LONG; + nla_memcpy(addr.hwaddr, + info->attrs[IEEE802154_ATTR_COORD_HW_ADDR], + IEEE802154_ADDR_LEN); + } else { + addr.addr_type = IEEE802154_ADDR_SHORT; + addr.short_addr = nla_get_u16( + info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR]); + } + addr.pan_id = nla_get_u16(info->attrs[IEEE802154_ATTR_COORD_PAN_ID]); + + if (info->attrs[IEEE802154_ATTR_PAGE]) + page = nla_get_u8(info->attrs[IEEE802154_ATTR_PAGE]); + else + page = 0; + + ret = ieee802154_mlme_ops(dev)->assoc_req(dev, &addr, + nla_get_u8(info->attrs[IEEE802154_ATTR_CHANNEL]), + page, + nla_get_u8(info->attrs[IEEE802154_ATTR_CAPABILITY])); + + dev_put(dev); + return ret; +} + +static int ieee802154_associate_resp(struct sk_buff *skb, + struct genl_info *info) +{ + struct net_device *dev; + struct ieee802154_addr addr; + int ret = -EINVAL; + + if (!info->attrs[IEEE802154_ATTR_STATUS] || + !info->attrs[IEEE802154_ATTR_DEST_HW_ADDR] || + !info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]) + return -EINVAL; + + dev = ieee802154_nl_get_dev(info); + if (!dev) + return -ENODEV; + + addr.addr_type = IEEE802154_ADDR_LONG; + nla_memcpy(addr.hwaddr, info->attrs[IEEE802154_ATTR_DEST_HW_ADDR], + IEEE802154_ADDR_LEN); + addr.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); + + + ret = ieee802154_mlme_ops(dev)->assoc_resp(dev, &addr, + nla_get_u16(info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]), + nla_get_u8(info->attrs[IEEE802154_ATTR_STATUS])); + + dev_put(dev); + return ret; +} + +static int ieee802154_disassociate_req(struct sk_buff *skb, + struct genl_info *info) +{ + struct net_device *dev; + struct ieee802154_addr addr; + int ret = -EINVAL; + + if ((!info->attrs[IEEE802154_ATTR_DEST_HW_ADDR] && + !info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]) || + !info->attrs[IEEE802154_ATTR_REASON]) + return -EINVAL; + + dev = ieee802154_nl_get_dev(info); + if (!dev) + return -ENODEV; + + if (info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]) { + addr.addr_type = IEEE802154_ADDR_LONG; + nla_memcpy(addr.hwaddr, + info->attrs[IEEE802154_ATTR_DEST_HW_ADDR], + IEEE802154_ADDR_LEN); + } else { + addr.addr_type = IEEE802154_ADDR_SHORT; + addr.short_addr = nla_get_u16( + info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]); + } + addr.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); + + ret = ieee802154_mlme_ops(dev)->disassoc_req(dev, &addr, + nla_get_u8(info->attrs[IEEE802154_ATTR_REASON])); + + dev_put(dev); + return ret; +} + +/* + * PANid, channel, beacon_order = 15, superframe_order = 15, + * PAN_coordinator, 
battery_life_extension = 0, + * coord_realignment = 0, security_enable = 0 +*/ +static int ieee802154_start_req(struct sk_buff *skb, struct genl_info *info) +{ + struct net_device *dev; + struct ieee802154_addr addr; + + u8 channel, bcn_ord, sf_ord; + u8 page; + int pan_coord, blx, coord_realign; + int ret; + + if (!info->attrs[IEEE802154_ATTR_COORD_PAN_ID] || + !info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR] || + !info->attrs[IEEE802154_ATTR_CHANNEL] || + !info->attrs[IEEE802154_ATTR_BCN_ORD] || + !info->attrs[IEEE802154_ATTR_SF_ORD] || + !info->attrs[IEEE802154_ATTR_PAN_COORD] || + !info->attrs[IEEE802154_ATTR_BAT_EXT] || + !info->attrs[IEEE802154_ATTR_COORD_REALIGN] + ) + return -EINVAL; + + dev = ieee802154_nl_get_dev(info); + if (!dev) + return -ENODEV; + + addr.addr_type = IEEE802154_ADDR_SHORT; + addr.short_addr = nla_get_u16( + info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR]); + addr.pan_id = nla_get_u16(info->attrs[IEEE802154_ATTR_COORD_PAN_ID]); + + channel = nla_get_u8(info->attrs[IEEE802154_ATTR_CHANNEL]); + bcn_ord = nla_get_u8(info->attrs[IEEE802154_ATTR_BCN_ORD]); + sf_ord = nla_get_u8(info->attrs[IEEE802154_ATTR_SF_ORD]); + pan_coord = nla_get_u8(info->attrs[IEEE802154_ATTR_PAN_COORD]); + blx = nla_get_u8(info->attrs[IEEE802154_ATTR_BAT_EXT]); + coord_realign = nla_get_u8(info->attrs[IEEE802154_ATTR_COORD_REALIGN]); + + if (info->attrs[IEEE802154_ATTR_PAGE]) + page = nla_get_u8(info->attrs[IEEE802154_ATTR_PAGE]); + else + page = 0; + + + if (addr.short_addr == IEEE802154_ADDR_BROADCAST) { + ieee802154_nl_start_confirm(dev, IEEE802154_NO_SHORT_ADDRESS); + dev_put(dev); + return -EINVAL; + } + + ret = ieee802154_mlme_ops(dev)->start_req(dev, &addr, channel, page, + bcn_ord, sf_ord, pan_coord, blx, coord_realign); + + dev_put(dev); + return ret; +} + +static int ieee802154_scan_req(struct sk_buff *skb, struct genl_info *info) +{ + struct net_device *dev; + int ret; + u8 type; + u32 channels; + u8 duration; + u8 page; + + if (!info->attrs[IEEE802154_ATTR_SCAN_TYPE] || + !info->attrs[IEEE802154_ATTR_CHANNELS] || + !info->attrs[IEEE802154_ATTR_DURATION]) + return -EINVAL; + + dev = ieee802154_nl_get_dev(info); + if (!dev) + return -ENODEV; + + type = nla_get_u8(info->attrs[IEEE802154_ATTR_SCAN_TYPE]); + channels = nla_get_u32(info->attrs[IEEE802154_ATTR_CHANNELS]); + duration = nla_get_u8(info->attrs[IEEE802154_ATTR_DURATION]); + + if (info->attrs[IEEE802154_ATTR_PAGE]) + page = nla_get_u8(info->attrs[IEEE802154_ATTR_PAGE]); + else + page = 0; + + + ret = ieee802154_mlme_ops(dev)->scan_req(dev, type, channels, page, + duration); + + dev_put(dev); + return ret; +} + +static int ieee802154_list_iface(struct sk_buff *skb, + struct genl_info *info) +{ + /* Request for interface name, index, type, IEEE address, + PAN Id, short address */ + struct sk_buff *msg; + struct net_device *dev = NULL; + int rc = -ENOBUFS; + + pr_debug("%s\n", __func__); + + dev = ieee802154_nl_get_dev(info); + if (!dev) + return -ENODEV; + + msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) + goto out_dev; + + rc = ieee802154_nl_fill_iface(msg, info->snd_pid, info->snd_seq, + 0, dev); + if (rc < 0) + goto out_free; + + dev_put(dev); + + return genlmsg_reply(msg, info); +out_free: + nlmsg_free(msg); +out_dev: + dev_put(dev); + return rc; + +} + +static int ieee802154_dump_iface(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct net_device *dev; + int idx; + int s_idx = cb->args[0]; + + pr_debug("%s\n", __func__); + + idx = 0; + for_each_netdev(net, dev) { 
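+ /* Netlink dump resume point: cb->args[0] records how many interfaces were already emitted by earlier dump passes, so those entries are skipped before more are written into this skb. */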
+ if (idx < s_idx || (dev->type != ARPHRD_IEEE802154)) + goto cont; + + if (ieee802154_nl_fill_iface(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, dev) < 0) + break; +cont: + idx++; + } + cb->args[0] = idx; + + return skb->len; +} + +static struct genl_ops ieee802154_coordinator_ops[] = { + IEEE802154_OP(IEEE802154_ASSOCIATE_REQ, ieee802154_associate_req), + IEEE802154_OP(IEEE802154_ASSOCIATE_RESP, ieee802154_associate_resp), + IEEE802154_OP(IEEE802154_DISASSOCIATE_REQ, ieee802154_disassociate_req), + IEEE802154_OP(IEEE802154_SCAN_REQ, ieee802154_scan_req), + IEEE802154_OP(IEEE802154_START_REQ, ieee802154_start_req), + IEEE802154_DUMP(IEEE802154_LIST_IFACE, ieee802154_list_iface, + ieee802154_dump_iface), +}; + +/* + * No need to unregister as family unregistration will do it. + */ +int nl802154_mac_register(void) +{ + int i; + int rc; + + rc = genl_register_mc_group(&nl802154_family, + &ieee802154_coord_mcgrp); + if (rc) + return rc; + + rc = genl_register_mc_group(&nl802154_family, + &ieee802154_beacon_mcgrp); + if (rc) + return rc; + + for (i = 0; i < ARRAY_SIZE(ieee802154_coordinator_ops); i++) { + rc = genl_register_ops(&nl802154_family, + &ieee802154_coordinator_ops[i]); + if (rc) + return rc; + } + + return 0; +} diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c new file mode 100644 index 00000000..02548b29 --- /dev/null +++ b/net/ieee802154/nl-phy.c @@ -0,0 +1,346 @@ +/* + * Netlink inteface for IEEE 802.15.4 stack + * + * Copyright 2007, 2008 Siemens AG + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Written by: + * Sergey Lapin + * Dmitry Eremin-Solenikov + * Maxim Osipov + */ + +#include +#include +#include +#include +#include +#include +#include +#include /* for rtnl_{un,}lock */ +#include + +#include "ieee802154.h" + +static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 pid, + u32 seq, int flags, struct wpan_phy *phy) +{ + void *hdr; + int i, pages = 0; + uint32_t *buf = kzalloc(32 * sizeof(uint32_t), GFP_KERNEL); + + pr_debug("%s\n", __func__); + + if (!buf) + return -EMSGSIZE; + + hdr = genlmsg_put(msg, 0, seq, &nl802154_family, flags, + IEEE802154_LIST_PHY); + if (!hdr) + goto out; + + mutex_lock(&phy->pib_lock); + NLA_PUT_STRING(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)); + + NLA_PUT_U8(msg, IEEE802154_ATTR_PAGE, phy->current_page); + NLA_PUT_U8(msg, IEEE802154_ATTR_CHANNEL, phy->current_channel); + for (i = 0; i < 32; i++) { + if (phy->channels_supported[i]) + buf[pages++] = phy->channels_supported[i] | (i << 27); + } + if (pages) + NLA_PUT(msg, IEEE802154_ATTR_CHANNEL_PAGE_LIST, + pages * sizeof(uint32_t), buf); + + mutex_unlock(&phy->pib_lock); + kfree(buf); + return genlmsg_end(msg, hdr); + +nla_put_failure: + mutex_unlock(&phy->pib_lock); + genlmsg_cancel(msg, hdr); +out: + kfree(buf); + return -EMSGSIZE; +} + +static int ieee802154_list_phy(struct sk_buff *skb, + struct genl_info *info) +{ + /* Request for interface name, index, type, IEEE address, + PAN Id, short address */ + struct sk_buff *msg; + struct wpan_phy *phy; + const char *name; + int rc = -ENOBUFS; + + pr_debug("%s\n", __func__); + + if (!info->attrs[IEEE802154_ATTR_PHY_NAME]) + return -EINVAL; + + name = nla_data(info->attrs[IEEE802154_ATTR_PHY_NAME]); + if (name[nla_len(info->attrs[IEEE802154_ATTR_PHY_NAME]) - 1] != '\0') + return -EINVAL; /* phy name should be null-terminated */ + + + phy = wpan_phy_find(name); + if (!phy) + return -ENODEV; + + msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) + goto out_dev; + + rc = ieee802154_nl_fill_phy(msg, info->snd_pid, info->snd_seq, + 0, phy); + if (rc < 0) + goto out_free; + + wpan_phy_put(phy); + + return genlmsg_reply(msg, info); +out_free: + nlmsg_free(msg); +out_dev: + wpan_phy_put(phy); + return rc; + +} + +struct dump_phy_data { + struct sk_buff *skb; + struct netlink_callback *cb; + int idx, s_idx; +}; + +static int ieee802154_dump_phy_iter(struct wpan_phy *phy, void *_data) +{ + int rc; + struct dump_phy_data *data = _data; + + pr_debug("%s\n", __func__); + + if (data->idx++ < data->s_idx) + return 0; + + rc = ieee802154_nl_fill_phy(data->skb, + NETLINK_CB(data->cb->skb).pid, + data->cb->nlh->nlmsg_seq, + NLM_F_MULTI, + phy); + + if (rc < 0) { + data->idx--; + return rc; + } + + return 0; +} + +static int ieee802154_dump_phy(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct dump_phy_data data = { + .cb = cb, + .skb = skb, + .s_idx = cb->args[0], + .idx = 0, + }; + + pr_debug("%s\n", __func__); + + wpan_phy_for_each(ieee802154_dump_phy_iter, &data); + + cb->args[0] = data.idx; + + return skb->len; +} + +static int ieee802154_add_iface(struct sk_buff *skb, + struct genl_info *info) +{ + struct sk_buff *msg; + struct wpan_phy *phy; + const char *name; + const char *devname; + int rc = -ENOBUFS; + struct net_device *dev; + + pr_debug("%s\n", __func__); + + if (!info->attrs[IEEE802154_ATTR_PHY_NAME]) + return -EINVAL; + + name = nla_data(info->attrs[IEEE802154_ATTR_PHY_NAME]); + if (name[nla_len(info->attrs[IEEE802154_ATTR_PHY_NAME]) - 1] != '\0') + return -EINVAL; /* phy name should be null-terminated */ + + if 
(info->attrs[IEEE802154_ATTR_DEV_NAME]) { + devname = nla_data(info->attrs[IEEE802154_ATTR_DEV_NAME]); + if (devname[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1] + != '\0') + return -EINVAL; /* phy name should be null-terminated */ + } else { + devname = "wpan%d"; + } + + if (strlen(devname) >= IFNAMSIZ) + return -ENAMETOOLONG; + + phy = wpan_phy_find(name); + if (!phy) + return -ENODEV; + + msg = ieee802154_nl_new_reply(info, 0, IEEE802154_ADD_IFACE); + if (!msg) + goto out_dev; + + if (!phy->add_iface) { + rc = -EINVAL; + goto nla_put_failure; + } + + dev = phy->add_iface(phy, devname); + if (IS_ERR(dev)) { + rc = PTR_ERR(dev); + goto nla_put_failure; + } + + NLA_PUT_STRING(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)); + NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name); + + dev_put(dev); + + wpan_phy_put(phy); + + return ieee802154_nl_reply(msg, info); + +nla_put_failure: + nlmsg_free(msg); +out_dev: + wpan_phy_put(phy); + return rc; +} + +static int ieee802154_del_iface(struct sk_buff *skb, + struct genl_info *info) +{ + struct sk_buff *msg; + struct wpan_phy *phy; + const char *name; + int rc; + struct net_device *dev; + + pr_debug("%s\n", __func__); + + if (!info->attrs[IEEE802154_ATTR_DEV_NAME]) + return -EINVAL; + + name = nla_data(info->attrs[IEEE802154_ATTR_DEV_NAME]); + if (name[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1] != '\0') + return -EINVAL; /* name should be null-terminated */ + + dev = dev_get_by_name(genl_info_net(info), name); + if (!dev) + return -ENODEV; + + phy = ieee802154_mlme_ops(dev)->get_phy(dev); + BUG_ON(!phy); + + rc = -EINVAL; + /* phy name is optional, but should be checked if it's given */ + if (info->attrs[IEEE802154_ATTR_PHY_NAME]) { + struct wpan_phy *phy2; + + const char *pname = + nla_data(info->attrs[IEEE802154_ATTR_PHY_NAME]); + if (pname[nla_len(info->attrs[IEEE802154_ATTR_PHY_NAME]) - 1] + != '\0') + /* name should be null-terminated */ + goto out_dev; + + phy2 = wpan_phy_find(pname); + if (!phy2) + goto out_dev; + + if (phy != phy2) { + wpan_phy_put(phy2); + goto out_dev; + } + } + + rc = -ENOBUFS; + + msg = ieee802154_nl_new_reply(info, 0, IEEE802154_DEL_IFACE); + if (!msg) + goto out_dev; + + if (!phy->del_iface) { + rc = -EINVAL; + goto nla_put_failure; + } + + rtnl_lock(); + phy->del_iface(phy, dev); + + /* We don't have device anymore */ + dev_put(dev); + dev = NULL; + + rtnl_unlock(); + + + NLA_PUT_STRING(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)); + NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, name); + + wpan_phy_put(phy); + + return ieee802154_nl_reply(msg, info); + +nla_put_failure: + nlmsg_free(msg); +out_dev: + wpan_phy_put(phy); + if (dev) + dev_put(dev); + + return rc; +} + +static struct genl_ops ieee802154_phy_ops[] = { + IEEE802154_DUMP(IEEE802154_LIST_PHY, ieee802154_list_phy, + ieee802154_dump_phy), + IEEE802154_OP(IEEE802154_ADD_IFACE, ieee802154_add_iface), + IEEE802154_OP(IEEE802154_DEL_IFACE, ieee802154_del_iface), +}; + +/* + * No need to unregister as family unregistration will do it. 
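+ * (genl_unregister_family() drops the ops and multicast groups that were registered against nl802154_family, so no explicit cleanup is exported here.)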
+ */ +int nl802154_phy_register(void) +{ + int i; + int rc; + + for (i = 0; i < ARRAY_SIZE(ieee802154_phy_ops); i++) { + rc = genl_register_ops(&nl802154_family, + &ieee802154_phy_ops[i]); + if (rc) + return rc; + } + + return 0; +} diff --git a/net/ieee802154/nl_policy.c b/net/ieee802154/nl_policy.c new file mode 100644 index 00000000..6adda4d4 --- /dev/null +++ b/net/ieee802154/nl_policy.c @@ -0,0 +1,56 @@ +/* + * nl802154.h + * + * Copyright (C) 2007, 2008 Siemens AG + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include +#include +#include + +#define NLA_HW_ADDR NLA_U64 + +const struct nla_policy ieee802154_policy[IEEE802154_ATTR_MAX + 1] = { + [IEEE802154_ATTR_DEV_NAME] = { .type = NLA_STRING, }, + [IEEE802154_ATTR_DEV_INDEX] = { .type = NLA_U32, }, + [IEEE802154_ATTR_PHY_NAME] = { .type = NLA_STRING, }, + + [IEEE802154_ATTR_STATUS] = { .type = NLA_U8, }, + [IEEE802154_ATTR_SHORT_ADDR] = { .type = NLA_U16, }, + [IEEE802154_ATTR_HW_ADDR] = { .type = NLA_HW_ADDR, }, + [IEEE802154_ATTR_PAN_ID] = { .type = NLA_U16, }, + [IEEE802154_ATTR_CHANNEL] = { .type = NLA_U8, }, + [IEEE802154_ATTR_PAGE] = { .type = NLA_U8, }, + [IEEE802154_ATTR_COORD_SHORT_ADDR] = { .type = NLA_U16, }, + [IEEE802154_ATTR_COORD_HW_ADDR] = { .type = NLA_HW_ADDR, }, + [IEEE802154_ATTR_COORD_PAN_ID] = { .type = NLA_U16, }, + [IEEE802154_ATTR_SRC_SHORT_ADDR] = { .type = NLA_U16, }, + [IEEE802154_ATTR_SRC_HW_ADDR] = { .type = NLA_HW_ADDR, }, + [IEEE802154_ATTR_SRC_PAN_ID] = { .type = NLA_U16, }, + [IEEE802154_ATTR_DEST_SHORT_ADDR] = { .type = NLA_U16, }, + [IEEE802154_ATTR_DEST_HW_ADDR] = { .type = NLA_HW_ADDR, }, + [IEEE802154_ATTR_DEST_PAN_ID] = { .type = NLA_U16, }, + + [IEEE802154_ATTR_CAPABILITY] = { .type = NLA_U8, }, + [IEEE802154_ATTR_REASON] = { .type = NLA_U8, }, + [IEEE802154_ATTR_SCAN_TYPE] = { .type = NLA_U8, }, + [IEEE802154_ATTR_CHANNELS] = { .type = NLA_U32, }, + [IEEE802154_ATTR_DURATION] = { .type = NLA_U8, }, + [IEEE802154_ATTR_ED_LIST] = { .len = 27 }, + [IEEE802154_ATTR_CHANNEL_PAGE_LIST] = { .len = 32 * 4, }, +}; + diff --git a/net/ieee802154/raw.c b/net/ieee802154/raw.c new file mode 100644 index 00000000..10970ca8 --- /dev/null +++ b/net/ieee802154/raw.c @@ -0,0 +1,267 @@ +/* + * Raw IEEE 802.15.4 sockets + * + * Copyright 2007, 2008 Siemens AG + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Written by: + * Sergey Lapin + * Dmitry Eremin-Solenikov + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "af802154.h" + +static HLIST_HEAD(raw_head); +static DEFINE_RWLOCK(raw_lock); + +static void raw_hash(struct sock *sk) +{ + write_lock_bh(&raw_lock); + sk_add_node(sk, &raw_head); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + write_unlock_bh(&raw_lock); +} + +static void raw_unhash(struct sock *sk) +{ + write_lock_bh(&raw_lock); + if (sk_del_node_init(sk)) + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + write_unlock_bh(&raw_lock); +} + +static void raw_close(struct sock *sk, long timeout) +{ + sk_common_release(sk); +} + +static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int len) +{ + struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr; + int err = 0; + struct net_device *dev = NULL; + + if (len < sizeof(*addr)) + return -EINVAL; + + if (addr->family != AF_IEEE802154) + return -EINVAL; + + lock_sock(sk); + + dev = ieee802154_get_dev(sock_net(sk), &addr->addr); + if (!dev) { + err = -ENODEV; + goto out; + } + + if (dev->type != ARPHRD_IEEE802154) { + err = -ENODEV; + goto out_put; + } + + sk->sk_bound_dev_if = dev->ifindex; + sk_dst_reset(sk); + +out_put: + dev_put(dev); +out: + release_sock(sk); + + return err; +} + +static int raw_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + return -ENOTSUPP; +} + +static int raw_disconnect(struct sock *sk, int flags) +{ + return 0; +} + +static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t size) +{ + struct net_device *dev; + unsigned mtu; + struct sk_buff *skb; + int err; + + if (msg->msg_flags & MSG_OOB) { + pr_debug("msg->msg_flags = 0x%x\n", msg->msg_flags); + return -EOPNOTSUPP; + } + + lock_sock(sk); + if (!sk->sk_bound_dev_if) + dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154); + else + dev = dev_get_by_index(sock_net(sk), sk->sk_bound_dev_if); + release_sock(sk); + + if (!dev) { + pr_debug("no dev\n"); + err = -ENXIO; + goto out; + } + + mtu = dev->mtu; + pr_debug("name = %s, mtu = %u\n", dev->name, mtu); + + if (size > mtu) { + pr_debug("size = %Zu, mtu = %u\n", size, mtu); + err = -EINVAL; + goto out_dev; + } + + skb = sock_alloc_send_skb(sk, LL_ALLOCATED_SPACE(dev) + size, + msg->msg_flags & MSG_DONTWAIT, &err); + if (!skb) + goto out_dev; + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + + err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); + if (err < 0) + goto out_skb; + + skb->dev = dev; + skb->sk = sk; + skb->protocol = htons(ETH_P_IEEE802154); + + dev_put(dev); + + err = dev_queue_xmit(skb); + if (err > 0) + err = net_xmit_errno(err); + + return err ?: size; + +out_skb: + kfree_skb(skb); +out_dev: + dev_put(dev); +out: + return err; +} + +static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int noblock, int flags, int *addr_len) +{ + size_t copied = 0; + int err = -EOPNOTSUPP; + struct sk_buff *skb; + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; + + copied = skb->len; + if (len < copied) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if 
(err) + goto done; + + sock_recv_ts_and_drops(msg, sk, skb); + + if (flags & MSG_TRUNC) + copied = skb->len; +done: + skb_free_datagram(sk, skb); +out: + if (err) + return err; + return copied; +} + +static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + if (sock_queue_rcv_skb(sk, skb) < 0) { + kfree_skb(skb); + return NET_RX_DROP; + } + + return NET_RX_SUCCESS; +} + + +void ieee802154_raw_deliver(struct net_device *dev, struct sk_buff *skb) +{ + struct sock *sk; + struct hlist_node *node; + + read_lock(&raw_lock); + sk_for_each(sk, node, &raw_head) { + bh_lock_sock(sk); + if (!sk->sk_bound_dev_if || + sk->sk_bound_dev_if == dev->ifindex) { + + struct sk_buff *clone; + + clone = skb_clone(skb, GFP_ATOMIC); + if (clone) + raw_rcv_skb(sk, clone); + } + bh_unlock_sock(sk); + } + read_unlock(&raw_lock); +} + +static int raw_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + return -EOPNOTSUPP; +} + +static int raw_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + return -EOPNOTSUPP; +} + +struct proto ieee802154_raw_prot = { + .name = "IEEE-802.15.4-RAW", + .owner = THIS_MODULE, + .obj_size = sizeof(struct sock), + .close = raw_close, + .bind = raw_bind, + .sendmsg = raw_sendmsg, + .recvmsg = raw_recvmsg, + .hash = raw_hash, + .unhash = raw_unhash, + .connect = raw_connect, + .disconnect = raw_disconnect, + .getsockopt = raw_getsockopt, + .setsockopt = raw_setsockopt, +}; + diff --git a/net/ieee802154/wpan-class.c b/net/ieee802154/wpan-class.c new file mode 100644 index 00000000..1627ef2e --- /dev/null +++ b/net/ieee802154/wpan-class.c @@ -0,0 +1,226 @@ +/* + * Copyright (C) 2007, 2008, 2009 Siemens AG + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include +#include +#include +#include + +#include + +#include "ieee802154.h" + +#define MASTER_SHOW_COMPLEX(name, format_string, args...) \ +static ssize_t name ## _show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct wpan_phy *phy = container_of(dev, struct wpan_phy, dev); \ + int ret; \ + \ + mutex_lock(&phy->pib_lock); \ + ret = snprintf(buf, PAGE_SIZE, format_string "\n", args); \ + mutex_unlock(&phy->pib_lock); \ + return ret; \ +} + +#define MASTER_SHOW(field, format_string) \ + MASTER_SHOW_COMPLEX(field, format_string, phy->field) + +MASTER_SHOW(current_channel, "%d"); +MASTER_SHOW(current_page, "%d"); +MASTER_SHOW_COMPLEX(transmit_power, "%d +- %d dB", + ((signed char) (phy->transmit_power << 2)) >> 2, + (phy->transmit_power >> 6) ? 
(phy->transmit_power >> 6) * 3 : 1 ); +MASTER_SHOW(cca_mode, "%d"); + +static ssize_t channels_supported_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct wpan_phy *phy = container_of(dev, struct wpan_phy, dev); + int ret; + int i, len = 0; + + mutex_lock(&phy->pib_lock); + for (i = 0; i < 32; i++) { + ret = snprintf(buf + len, PAGE_SIZE - len, + "%#09x\n", phy->channels_supported[i]); + if (ret < 0) + break; + len += ret; + } + mutex_unlock(&phy->pib_lock); + return len; +} + +static struct device_attribute pmib_attrs[] = { + __ATTR_RO(current_channel), + __ATTR_RO(current_page), + __ATTR_RO(channels_supported), + __ATTR_RO(transmit_power), + __ATTR_RO(cca_mode), + {}, +}; + +static void wpan_phy_release(struct device *d) +{ + struct wpan_phy *phy = container_of(d, struct wpan_phy, dev); + kfree(phy); +} + +static struct class wpan_phy_class = { + .name = "ieee802154", + .dev_release = wpan_phy_release, + .dev_attrs = pmib_attrs, +}; + +static DEFINE_MUTEX(wpan_phy_mutex); +static int wpan_phy_idx; + +static int wpan_phy_match(struct device *dev, void *data) +{ + return !strcmp(dev_name(dev), (const char *)data); +} + +struct wpan_phy *wpan_phy_find(const char *str) +{ + struct device *dev; + + if (WARN_ON(!str)) + return NULL; + + dev = class_find_device(&wpan_phy_class, NULL, + (void *)str, wpan_phy_match); + if (!dev) + return NULL; + + return container_of(dev, struct wpan_phy, dev); +} +EXPORT_SYMBOL(wpan_phy_find); + +struct wpan_phy_iter_data { + int (*fn)(struct wpan_phy *phy, void *data); + void *data; +}; + +static int wpan_phy_iter(struct device *dev, void *_data) +{ + struct wpan_phy_iter_data *wpid = _data; + struct wpan_phy *phy = container_of(dev, struct wpan_phy, dev); + return wpid->fn(phy, wpid->data); +} + +int wpan_phy_for_each(int (*fn)(struct wpan_phy *phy, void *data), + void *data) +{ + struct wpan_phy_iter_data wpid = { + .fn = fn, + .data = data, + }; + + return class_for_each_device(&wpan_phy_class, NULL, + &wpid, wpan_phy_iter); +} +EXPORT_SYMBOL(wpan_phy_for_each); + +static int wpan_phy_idx_valid(int idx) +{ + return idx >= 0; +} + +struct wpan_phy *wpan_phy_alloc(size_t priv_size) +{ + struct wpan_phy *phy = kzalloc(sizeof(*phy) + priv_size, + GFP_KERNEL); + + if (!phy) + goto out; + mutex_lock(&wpan_phy_mutex); + phy->idx = wpan_phy_idx++; + if (unlikely(!wpan_phy_idx_valid(phy->idx))) { + wpan_phy_idx--; + mutex_unlock(&wpan_phy_mutex); + kfree(phy); + goto out; + } + mutex_unlock(&wpan_phy_mutex); + + mutex_init(&phy->pib_lock); + + device_initialize(&phy->dev); + dev_set_name(&phy->dev, "wpan-phy%d", phy->idx); + + phy->dev.class = &wpan_phy_class; + + phy->current_channel = -1; /* not initialised */ + phy->current_page = 0; /* for compatibility */ + + return phy; + +out: + return NULL; +} +EXPORT_SYMBOL(wpan_phy_alloc); + +int wpan_phy_register(struct wpan_phy *phy) +{ + return device_add(&phy->dev); +} +EXPORT_SYMBOL(wpan_phy_register); + +void wpan_phy_unregister(struct wpan_phy *phy) +{ + device_del(&phy->dev); +} +EXPORT_SYMBOL(wpan_phy_unregister); + +void wpan_phy_free(struct wpan_phy *phy) +{ + put_device(&phy->dev); +} +EXPORT_SYMBOL(wpan_phy_free); + +static int __init wpan_phy_class_init(void) +{ + int rc; + rc = class_register(&wpan_phy_class); + if (rc) + goto err; + + rc = ieee802154_nl_init(); + if (rc) + goto err_nl; + + return 0; +err_nl: + class_unregister(&wpan_phy_class); +err: + return rc; +} +subsys_initcall(wpan_phy_class_init); + +static void __exit wpan_phy_class_exit(void) +{ + ieee802154_nl_exit(); 
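+ /* Mirror wpan_phy_class_init() in reverse: take down the netlink interface first, then unregister the sysfs class. */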
+ class_unregister(&wpan_phy_class); +} +module_exit(wpan_phy_class_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("IEEE 802.15.4 configuration interface"); +MODULE_AUTHOR("Dmitry Eremin-Solenikov"); + diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig new file mode 100644 index 00000000..cbb505ba --- /dev/null +++ b/net/ipv4/Kconfig @@ -0,0 +1,626 @@ +# +# IP configuration +# +config IP_MULTICAST + bool "IP: multicasting" + help + This is code for addressing several networked computers at once, + enlarging your kernel by about 2 KB. You need multicasting if you + intend to participate in the MBONE, a high bandwidth network on top + of the Internet which carries audio and video broadcasts. More + information about the MBONE is on the WWW at + . Information about the multicast + capabilities of the various network cards is contained in + . For most people, it's + safe to say N. + +config IP_ADVANCED_ROUTER + bool "IP: advanced router" + ---help--- + If you intend to run your Linux box mostly as a router, i.e. as a + computer that forwards and redistributes network packets, say Y; you + will then be presented with several options that allow more precise + control about the routing process. + + The answer to this question won't directly affect the kernel: + answering N will just cause the configurator to skip all the + questions about advanced routing. + + Note that your box can only act as a router if you enable IP + forwarding in your kernel; you can do that by saying Y to "/proc + file system support" and "Sysctl support" below and executing the + line + + echo "1" > /proc/sys/net/ipv4/ip_forward + + at boot time after the /proc file system has been mounted. + + If you turn on IP forwarding, you should consider the rp_filter, which + automatically rejects incoming packets if the routing table entry + for their source address doesn't match the network interface they're + arriving on. This has security advantages because it prevents the + so-called IP spoofing, however it can pose problems if you use + asymmetric routing (packets from you to a host take a different path + than packets from that host to you) or if you operate a non-routing + host which has several IP addresses on different interfaces. To turn + rp_filter on use: + + echo 1 > /proc/sys/net/ipv4/conf//rp_filter + or + echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter + + Note that some distributions enable it in startup scripts. + For details about rp_filter strict and loose mode read + . + + If unsure, say N here. + +config IP_FIB_TRIE_STATS + bool "FIB TRIE statistics" + depends on IP_ADVANCED_ROUTER + ---help--- + Keep track of statistics on structure of FIB TRIE table. + Useful for testing and measuring TRIE performance. + +config IP_MULTIPLE_TABLES + bool "IP: policy routing" + depends on IP_ADVANCED_ROUTER + select FIB_RULES + ---help--- + Normally, a router decides what to do with a received packet based + solely on the packet's final destination address. If you say Y here, + the Linux router will also be able to take the packet's source + address into account. Furthermore, the TOS (Type-Of-Service) field + of the packet can be used for routing decisions as well. + + If you are interested in this, please see the preliminary + documentation at + and . + You will need supporting software from + . + + If unsure, say N. 
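The IP_ADVANCED_ROUTER help above drives forwarding and rp_filter through plain writes to procfs. As a rough userspace illustration of those two knobs (not part of this patch; it assumes /proc is mounted and the caller is allowed to write the sysctl files):

#include <stdio.h>

/* Write a single value to a /proc/sys entry; returns 0 on success. */
static int write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* echo "1" > /proc/sys/net/ipv4/ip_forward */
	write_sysctl("/proc/sys/net/ipv4/ip_forward", "1\n");
	/* echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter */
	write_sysctl("/proc/sys/net/ipv4/conf/all/rp_filter", "1\n");
	return 0;
}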
+ +config IP_ROUTE_MULTIPATH + bool "IP: equal cost multipath" + depends on IP_ADVANCED_ROUTER + help + Normally, the routing tables specify a single action to be taken in + a deterministic manner for a given packet. If you say Y here + however, it becomes possible to attach several actions to a packet + pattern, in effect specifying several alternative paths to travel + for those packets. The router considers all these paths to be of + equal "cost" and chooses one of them in a non-deterministic fashion + if a matching packet arrives. + +config IP_ROUTE_VERBOSE + bool "IP: verbose route monitoring" + depends on IP_ADVANCED_ROUTER + help + If you say Y here, which is recommended, then the kernel will print + verbose messages regarding the routing, for example warnings about + received packets which look strange and could be evidence of an + attack or a misconfigured system somewhere. The information is + handled by the klogd daemon which is responsible for kernel messages + ("man klogd"). + +config IP_ROUTE_CLASSID + bool + +config IP_PNP + bool "IP: kernel level autoconfiguration" + help + This enables automatic configuration of IP addresses of devices and + of the routing table during kernel boot, based on either information + supplied on the kernel command line or by BOOTP or RARP protocols. + You need to say Y only for diskless machines requiring network + access to boot (in which case you want to say Y to "Root file system + on NFS" as well), because all other machines configure the network + in their startup scripts. + +config IP_PNP_DHCP + bool "IP: DHCP support" + depends on IP_PNP + ---help--- + If you want your Linux box to mount its whole root file system (the + one containing the directory /) from some other computer over the + net via NFS and you want the IP address of your computer to be + discovered automatically at boot time using the DHCP protocol (a + special protocol designed for doing this job), say Y here. In case + the boot ROM of your network card was designed for booting Linux and + does DHCP itself, providing all necessary information on the kernel + command line, you can say N here. + + If unsure, say Y. Note that if you want to use DHCP, a DHCP server + must be operating on your network. Read + for details. + +config IP_PNP_BOOTP + bool "IP: BOOTP support" + depends on IP_PNP + ---help--- + If you want your Linux box to mount its whole root file system (the + one containing the directory /) from some other computer over the + net via NFS and you want the IP address of your computer to be + discovered automatically at boot time using the BOOTP protocol (a + special protocol designed for doing this job), say Y here. In case + the boot ROM of your network card was designed for booting Linux and + does BOOTP itself, providing all necessary information on the kernel + command line, you can say N here. If unsure, say Y. Note that if you + want to use BOOTP, a BOOTP server must be operating on your network. + Read for details. + +config IP_PNP_RARP + bool "IP: RARP support" + depends on IP_PNP + help + If you want your Linux box to mount its whole root file system (the + one containing the directory /) from some other computer over the + net via NFS and you want the IP address of your computer to be + discovered automatically at boot time using the RARP protocol (an + older protocol which is being obsoleted by BOOTP and DHCP), say Y + here. Note that if you want to use RARP, a RARP server must be + operating on your network. Read + for details. + +# not yet ready.. 
+# bool ' IP: ARP support' CONFIG_IP_PNP_ARP +config NET_IPIP + tristate "IP: tunneling" + select INET_TUNNEL + ---help--- + Tunneling means encapsulating data of one protocol type within + another protocol and sending it over a channel that understands the + encapsulating protocol. This particular tunneling driver implements + encapsulation of IP within IP, which sounds kind of pointless, but + can be useful if you want to make your (or some other) machine + appear on a different network than it physically is, or to use + mobile-IP facilities (allowing laptops to seamlessly move between + networks without changing their IP addresses). + + Saying Y to this option will produce two modules ( = code which can + be inserted in and removed from the running kernel whenever you + want). Most people won't need this and can say N. + +config NET_IPGRE_DEMUX + tristate "IP: GRE demultiplexer" + help + This is helper module to demultiplex GRE packets on GRE version field criteria. + Required by ip_gre and pptp modules. + +config NET_IPGRE + tristate "IP: GRE tunnels over IP" + depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX + help + Tunneling means encapsulating data of one protocol type within + another protocol and sending it over a channel that understands the + encapsulating protocol. This particular tunneling driver implements + GRE (Generic Routing Encapsulation) and at this time allows + encapsulating of IPv4 or IPv6 over existing IPv4 infrastructure. + This driver is useful if the other endpoint is a Cisco router: Cisco + likes GRE much better than the other Linux tunneling driver ("IP + tunneling" above). In addition, GRE allows multicast redistribution + through the tunnel. + +config NET_IPGRE_BROADCAST + bool "IP: broadcast GRE over IP" + depends on IP_MULTICAST && NET_IPGRE + help + One application of GRE/IP is to construct a broadcast WAN (Wide Area + Network), which looks like a normal Ethernet LAN (Local Area + Network), but can be distributed all over the Internet. If you want + to do that, say Y here and to "IP multicast routing" below. + +config IP_MROUTE + bool "IP: multicast routing" + depends on IP_MULTICAST + help + This is used if you want your machine to act as a router for IP + packets that have several destination addresses. It is needed on the + MBONE, a high bandwidth network on top of the Internet which carries + audio and video broadcasts. In order to do that, you would most + likely run the program mrouted. Information about the multicast + capabilities of the various network cards is contained in + . If you haven't heard + about it, you don't need it. + +config IP_MROUTE_MULTIPLE_TABLES + bool "IP: multicast policy routing" + depends on IP_MROUTE && IP_ADVANCED_ROUTER + select FIB_RULES + help + Normally, a multicast router runs a userspace daemon and decides + what to do with a multicast packet based on the source and + destination addresses. If you say Y here, the multicast router + will also be able to take interfaces and packet marks into + account and run multiple instances of userspace daemons + simultaneously, each one handling a single table. + + If unsure, say N. + +config IP_PIMSM_V1 + bool "IP: PIM-SM version 1 support" + depends on IP_MROUTE + help + Kernel side support for Sparse Mode PIM (Protocol Independent + Multicast) version 1. This multicast routing protocol is used widely + because Cisco supports it. You need special software to use it + (pimd-v1). Please see for more + information about PIM. + + Say Y if you want to use PIM-SM v1. 
Note that you can say N here if + you just want to use Dense Mode PIM. + +config IP_PIMSM_V2 + bool "IP: PIM-SM version 2 support" + depends on IP_MROUTE + help + Kernel side support for Sparse Mode PIM version 2. In order to use + this, you need an experimental routing daemon supporting it (pimd or + gated-5). This routing protocol is not used widely, so say N unless + you want to play with it. + +config ARPD + bool "IP: ARP daemon support" + ---help--- + The kernel maintains an internal cache which maps IP addresses to + hardware addresses on the local network, so that Ethernet/Token Ring/ + etc. frames are sent to the proper address on the physical networking + layer. Normally, kernel uses the ARP protocol to resolve these + mappings. + + Saying Y here adds support to have an user space daemon to do this + resolution instead. This is useful for implementing an alternate + address resolution protocol (e.g. NHRP on mGRE tunnels) and also for + testing purposes. + + If unsure, say N. + +config SYN_COOKIES + bool "IP: TCP syncookie support" + ---help--- + Normal TCP/IP networking is open to an attack known as "SYN + flooding". This denial-of-service attack prevents legitimate remote + users from being able to connect to your computer during an ongoing + attack and requires very little work from the attacker, who can + operate from anywhere on the Internet. + + SYN cookies provide protection against this type of attack. If you + say Y here, the TCP/IP stack will use a cryptographic challenge + protocol known as "SYN cookies" to enable legitimate users to + continue to connect, even when your machine is under attack. There + is no need for the legitimate users to change their TCP/IP software; + SYN cookies work transparently to them. For technical information + about SYN cookies, check out . + + If you are SYN flooded, the source address reported by the kernel is + likely to have been forged by the attacker; it is only reported as + an aid in tracing the packets to their actual source and should not + be taken as absolute truth. + + SYN cookies may prevent correct error reporting on clients when the + server is really overloaded. If this happens frequently better turn + them off. + + If you say Y here, you can disable SYN cookies at run time by + saying Y to "/proc file system support" and + "Sysctl support" below and executing the command + + echo 0 > /proc/sys/net/ipv4/tcp_syncookies + + after the /proc file system has been mounted. + + If unsure, say N. + +config INET_AH + tristate "IP: AH transformation" + select XFRM + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_MD5 + select CRYPTO_SHA1 + ---help--- + Support for IPsec AH. + + If unsure, say Y. + +config INET_ESP + tristate "IP: ESP transformation" + select XFRM + select CRYPTO + select CRYPTO_AUTHENC + select CRYPTO_HMAC + select CRYPTO_MD5 + select CRYPTO_CBC + select CRYPTO_SHA1 + select CRYPTO_DES + ---help--- + Support for IPsec ESP. + + If unsure, say Y. + +config INET_IPCOMP + tristate "IP: IPComp transformation" + select INET_XFRM_TUNNEL + select XFRM_IPCOMP + ---help--- + Support for IP Payload Compression Protocol (IPComp) (RFC3173), + typically needed for IPsec. + + If unsure, say Y. + +config INET_XFRM_TUNNEL + tristate + select INET_TUNNEL + default n + +config INET_TUNNEL + tristate + default n + +config INET_XFRM_MODE_TRANSPORT + tristate "IP: IPsec transport mode" + default y + select XFRM + ---help--- + Support for IPsec transport mode. + + If unsure, say Y. 
+ +config INET_XFRM_MODE_TUNNEL + tristate "IP: IPsec tunnel mode" + default y + select XFRM + ---help--- + Support for IPsec tunnel mode. + + If unsure, say Y. + +config INET_XFRM_MODE_BEET + tristate "IP: IPsec BEET mode" + default y + select XFRM + ---help--- + Support for IPsec BEET mode. + + If unsure, say Y. + +config INET_LRO + tristate "Large Receive Offload (ipv4/tcp)" + default y + ---help--- + Support for Large Receive Offload (ipv4/tcp). + + If unsure, say Y. + +config INET_DIAG + tristate "INET: socket monitoring interface" + default y + ---help--- + Support for INET (TCP, DCCP, etc) socket monitoring interface used by + native Linux tools such as ss. ss is included in iproute2, currently + downloadable at: + + http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2 + + If unsure, say Y. + +config INET_TCP_DIAG + depends on INET_DIAG + def_tristate INET_DIAG + +menuconfig TCP_CONG_ADVANCED + bool "TCP: advanced congestion control" + ---help--- + Support for selection of various TCP congestion control + modules. + + Nearly all users can safely say no here, and a safe default + selection will be made (CUBIC with new Reno as a fallback). + + If unsure, say N. + +if TCP_CONG_ADVANCED + +config TCP_CONG_BIC + tristate "Binary Increase Congestion (BIC) control" + default m + ---help--- + BIC-TCP is a sender-side only change that ensures a linear RTT + fairness under large windows while offering both scalability and + bounded TCP-friendliness. The protocol combines two schemes + called additive increase and binary search increase. When the + congestion window is large, additive increase with a large + increment ensures linear RTT fairness as well as good + scalability. Under small congestion windows, binary search + increase provides TCP friendliness. + See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ + +config TCP_CONG_CUBIC + tristate "CUBIC TCP" + default y + ---help--- + This is version 2.0 of BIC-TCP which uses a cubic growth function + among other techniques. + See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf + +config TCP_CONG_WESTWOOD + tristate "TCP Westwood+" + default m + ---help--- + TCP Westwood+ is a sender-side only modification of the TCP Reno + protocol stack that optimizes the performance of TCP congestion + control. It is based on end-to-end bandwidth estimation to set + congestion window and slow start threshold after a congestion + episode. Using this estimation, TCP Westwood+ adaptively sets a + slow start threshold and a congestion window which takes into + account the bandwidth used at the time congestion is experienced. + TCP Westwood+ significantly increases fairness wrt TCP Reno in + wired networks and throughput over wireless links. + +config TCP_CONG_HTCP + tristate "H-TCP" + default m + ---help--- + H-TCP is a send-side only modifications of the TCP Reno + protocol stack that optimizes the performance of TCP + congestion control for high speed network links. It uses a + modeswitch to change the alpha and beta parameters of TCP Reno + based on network conditions and in a way so as to be fair with + other Reno and H-TCP flows. + +config TCP_CONG_HSTCP + tristate "High Speed TCP" + depends on EXPERIMENTAL + default n + ---help--- + Sally Floyd's High Speed TCP (RFC 3649) congestion control. + A modification to TCP's congestion control mechanism for use + with large congestion windows. A table indicates how much to + increase the congestion window by when an ACK is received. 
+ For more detail see http://www.icir.org/floyd/hstcp.html + +config TCP_CONG_HYBLA + tristate "TCP-Hybla congestion control algorithm" + depends on EXPERIMENTAL + default n + ---help--- + TCP-Hybla is a sender-side only change that eliminates penalization of + long-RTT, large-bandwidth connections, like when satellite legs are + involved, especially when sharing a common bottleneck with normal + terrestrial connections. + +config TCP_CONG_VEGAS + tristate "TCP Vegas" + depends on EXPERIMENTAL + default n + ---help--- + TCP Vegas is a sender-side only change to TCP that anticipates + the onset of congestion by estimating the bandwidth. TCP Vegas + adjusts the sending rate by modifying the congestion + window. TCP Vegas should provide less packet loss, but it is + not as aggressive as TCP Reno. + +config TCP_CONG_SCALABLE + tristate "Scalable TCP" + depends on EXPERIMENTAL + default n + ---help--- + Scalable TCP is a sender-side only change to TCP which uses a + MIMD congestion control algorithm which has some nice scaling + properties, though is known to have fairness issues. + See http://www.deneholme.net/tom/scalable/ + +config TCP_CONG_LP + tristate "TCP Low Priority" + depends on EXPERIMENTAL + default n + ---help--- + TCP Low Priority (TCP-LP), a distributed algorithm whose goal is + to utilize only the excess network bandwidth as compared to the + ``fair share`` of bandwidth as targeted by TCP. + See http://www-ece.rice.edu/networks/TCP-LP/ + +config TCP_CONG_VENO + tristate "TCP Veno" + depends on EXPERIMENTAL + default n + ---help--- + TCP Veno is a sender-side only enhancement of TCP to obtain better + throughput over wireless networks. TCP Veno makes use of state + distinguishing to circumvent the difficult judgment of the packet loss + type. TCP Veno cuts down less congestion window in response to random + loss packets. + See + +config TCP_CONG_YEAH + tristate "YeAH TCP" + depends on EXPERIMENTAL + select TCP_CONG_VEGAS + default n + ---help--- + YeAH-TCP is a sender-side high-speed enabled TCP congestion control + algorithm, which uses a mixed loss/delay approach to compute the + congestion window. It's design goals target high efficiency, + internal, RTT and Reno fairness, resilience to link loss while + keeping network elements load as low as possible. + + For further details look here: + http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf + +config TCP_CONG_ILLINOIS + tristate "TCP Illinois" + depends on EXPERIMENTAL + default n + ---help--- + TCP-Illinois is a sender-side modification of TCP Reno for + high speed long delay links. It uses round-trip-time to + adjust the alpha and beta parameters to achieve a higher average + throughput and maintain fairness. + + For further details see: + http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html + +choice + prompt "Default TCP congestion control" + default DEFAULT_CUBIC + help + Select the TCP congestion control that will be used by default + for all connections. 
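+# Whichever default is chosen here can still be overridden at run time, either system-wide through the net.ipv4.tcp_congestion_control sysctl or per socket with the TCP_CONGESTION socket option, provided the corresponding congestion control module is built and loaded.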
+ + config DEFAULT_BIC + bool "Bic" if TCP_CONG_BIC=y + + config DEFAULT_CUBIC + bool "Cubic" if TCP_CONG_CUBIC=y + + config DEFAULT_HTCP + bool "Htcp" if TCP_CONG_HTCP=y + + config DEFAULT_HYBLA + bool "Hybla" if TCP_CONG_HYBLA=y + + config DEFAULT_VEGAS + bool "Vegas" if TCP_CONG_VEGAS=y + + config DEFAULT_VENO + bool "Veno" if TCP_CONG_VENO=y + + config DEFAULT_WESTWOOD + bool "Westwood" if TCP_CONG_WESTWOOD=y + + config DEFAULT_RENO + bool "Reno" + +endchoice + +endif + +config TCP_CONG_CUBIC + tristate + depends on !TCP_CONG_ADVANCED + default y + +config DEFAULT_TCP_CONG + string + default "bic" if DEFAULT_BIC + default "cubic" if DEFAULT_CUBIC + default "htcp" if DEFAULT_HTCP + default "hybla" if DEFAULT_HYBLA + default "vegas" if DEFAULT_VEGAS + default "westwood" if DEFAULT_WESTWOOD + default "veno" if DEFAULT_VENO + default "reno" if DEFAULT_RENO + default "cubic" + +config TCP_MD5SIG + bool "TCP: MD5 Signature Option support (RFC2385) (EXPERIMENTAL)" + depends on EXPERIMENTAL + select CRYPTO + select CRYPTO_MD5 + ---help--- + RFC2385 specifies a method of giving MD5 protection to TCP sessions. + Its main (only?) use is to protect BGP sessions between core routers + on the Internet. + + If unsure, say N. diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile new file mode 100644 index 00000000..681084d7 --- /dev/null +++ b/net/ipv4/Makefile @@ -0,0 +1,54 @@ +# +# Makefile for the Linux TCP/IP (INET) layer. +# + +obj-y := route.o inetpeer.o protocol.o \ + ip_input.o ip_fragment.o ip_forward.o ip_options.o \ + ip_output.o ip_sockglue.o inet_hashtables.o \ + inet_timewait_sock.o inet_connection_sock.o \ + tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ + tcp_minisocks.o tcp_cong.o \ + datagram.o raw.o udp.o udplite.o \ + arp.o icmp.o devinet.o af_inet.o igmp.o \ + fib_frontend.o fib_semantics.o fib_trie.o \ + inet_fragment.o ping.o + +obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o +obj-$(CONFIG_SYSFS) += sysfs_net_ipv4.o +obj-$(CONFIG_PROC_FS) += proc.o +obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o +obj-$(CONFIG_IP_MROUTE) += ipmr.o +obj-$(CONFIG_NET_IPIP) += ipip.o +obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o +obj-$(CONFIG_NET_IPGRE) += ip_gre.o +obj-$(CONFIG_SYN_COOKIES) += syncookies.o +obj-$(CONFIG_INET_AH) += ah4.o +obj-$(CONFIG_INET_ESP) += esp4.o +obj-$(CONFIG_INET_IPCOMP) += ipcomp.o +obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o +obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o +obj-$(CONFIG_INET_LRO) += inet_lro.o +obj-$(CONFIG_INET_TUNNEL) += tunnel4.o +obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o +obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o +obj-$(CONFIG_IP_PNP) += ipconfig.o +obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/ +obj-$(CONFIG_INET_DIAG) += inet_diag.o +obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o +obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o +obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o +obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o +obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o +obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o +obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o +obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o +obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o +obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o +obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o +obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o +obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o +obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o +obj-$(CONFIG_NETLABEL) += cipso_ipv4.o + +obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ + xfrm4_output.o diff --git 
a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c new file mode 100644 index 00000000..4d60f12c --- /dev/null +++ b/net/ipv4/af_inet.c @@ -0,0 +1,1823 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * PF_INET protocol family socket handler. + * + * Authors: Ross Biro + * Fred N. van Kempen, + * Florian La Roche, + * Alan Cox, + * + * Changes (see also sock.c) + * + * piggy, + * Karl Knutson : Socket protocol table + * A.N.Kuznetsov : Socket death error in accept(). + * John Richardson : Fix non blocking error in connect() + * so sockets that fail to connect + * don't return -EINPROGRESS. + * Alan Cox : Asynchronous I/O support + * Alan Cox : Keep correct socket pointer on sock + * structures + * when accept() ed + * Alan Cox : Semantics of SO_LINGER aren't state + * moved to close when you look carefully. + * With this fixed and the accept bug fixed + * some RPC stuff seems happier. + * Niibe Yutaka : 4.4BSD style write async I/O + * Alan Cox, + * Tony Gale : Fixed reuse semantics. + * Alan Cox : bind() shouldn't abort existing but dead + * sockets. Stops FTP netin:.. I hope. + * Alan Cox : bind() works correctly for RAW sockets. + * Note that FreeBSD at least was broken + * in this respect so be careful with + * compatibility tests... + * Alan Cox : routing cache support + * Alan Cox : memzero the socket structure for + * compactness. + * Matt Day : nonblock connect error handler + * Alan Cox : Allow large numbers of pending sockets + * (eg for big web sites), but only if + * specifically application requested. + * Alan Cox : New buffering throughout IP. Used + * dumbly. + * Alan Cox : New buffering now used smartly. + * Alan Cox : BSD rather than common sense + * interpretation of listen. + * Germano Caronni : Assorted small races. + * Alan Cox : sendmsg/recvmsg basic support. + * Alan Cox : Only sendmsg/recvmsg now supported. + * Alan Cox : Locked down bind (see security list). + * Alan Cox : Loosened bind a little. + * Mike McLagan : ADD/DEL DLCI Ioctls + * Willy Konynenberg : Transparent proxying support. + * David S. Miller : New socket lookup architecture. + * Some other random speedups. + * Cyrus Durgin : Cleaned up file for kmod hacks. + * Andi Kleen : Fix inet_stream_connect TCP race. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_IP_MROUTE +#include +#endif + +#ifdef CONFIG_ANDROID_PARANOID_NETWORK +#include + +static inline int current_has_network(void) +{ + return in_egroup_p(AID_INET) || capable(CAP_NET_RAW); +} +#else +static inline int current_has_network(void) +{ + return 1; +} +#endif + +/* The inetsw table contains everything that inet_create needs to + * build a new socket. 
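+ * The array is indexed by socket type (SOCK_STREAM, SOCK_DGRAM, ...); inet_create() walks the list for the requested type to find a protocol match.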
+ */ +static struct list_head inetsw[SOCK_MAX]; +static DEFINE_SPINLOCK(inetsw_lock); + +struct ipv4_config ipv4_config; +EXPORT_SYMBOL(ipv4_config); + +/* New destruction routine */ + +void inet_sock_destruct(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + + __skb_queue_purge(&sk->sk_receive_queue); + __skb_queue_purge(&sk->sk_error_queue); + + sk_mem_reclaim(sk); + + if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) { + pr_err("Attempt to release TCP socket in state %d %p\n", + sk->sk_state, sk); + return; + } + if (!sock_flag(sk, SOCK_DEAD)) { + pr_err("Attempt to release alive inet socket %p\n", sk); + return; + } + + WARN_ON(atomic_read(&sk->sk_rmem_alloc)); + WARN_ON(atomic_read(&sk->sk_wmem_alloc)); + WARN_ON(sk->sk_wmem_queued); + WARN_ON(sk->sk_forward_alloc); + + kfree(rcu_dereference_protected(inet->inet_opt, 1)); + dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); + sk_refcnt_debug_dec(sk); +} +EXPORT_SYMBOL(inet_sock_destruct); + +/* + * The routines beyond this point handle the behaviour of an AF_INET + * socket object. Mostly it punts to the subprotocols of IP to do + * the work. + */ + +/* + * Automatically bind an unbound socket. + */ + +static int inet_autobind(struct sock *sk) +{ + struct inet_sock *inet; + /* We may need to bind the socket. */ + lock_sock(sk); + inet = inet_sk(sk); + if (!inet->inet_num) { + if (sk->sk_prot->get_port(sk, 0)) { + release_sock(sk); + return -EAGAIN; + } + inet->inet_sport = htons(inet->inet_num); + } + release_sock(sk); + return 0; +} + +/* + * Move a socket into listening state. + */ +int inet_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + unsigned char old_state; + int err; + + lock_sock(sk); + + err = -EINVAL; + if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) + goto out; + + old_state = sk->sk_state; + if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) + goto out; + + /* Really, if the socket is already in listen state + * we can only allow the backlog to be adjusted. + */ + if (old_state != TCP_LISTEN) { + err = inet_csk_listen_start(sk, backlog); + if (err) + goto out; + } + sk->sk_max_ack_backlog = backlog; + err = 0; + +out: + release_sock(sk); + return err; +} +EXPORT_SYMBOL(inet_listen); + +u32 inet_ehash_secret __read_mostly; +EXPORT_SYMBOL(inet_ehash_secret); + +/* + * inet_ehash_secret must be set exactly once + */ +void build_ehash_secret(void) +{ + u32 rnd; + + do { + get_random_bytes(&rnd, sizeof(rnd)); + } while (rnd == 0); + + cmpxchg(&inet_ehash_secret, 0, rnd); +} +EXPORT_SYMBOL(build_ehash_secret); + +static inline int inet_netns_ok(struct net *net, int protocol) +{ + int hash; + const struct net_protocol *ipprot; + + if (net_eq(net, &init_net)) + return 1; + + hash = protocol & (MAX_INET_PROTOS - 1); + ipprot = rcu_dereference(inet_protos[hash]); + + if (ipprot == NULL) + /* raw IP is OK */ + return 1; + return ipprot->netns_ok; +} + + +/* + * Create an inet socket. + */ + +static int inet_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + struct inet_protosw *answer; + struct inet_sock *inet; + struct proto *answer_prot; + unsigned char answer_flags; + char answer_no_check; + int try_loading_module = 0; + int err; + + if (!current_has_network()) + return -EACCES; + + if (unlikely(!inet_ehash_secret)) + if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) + build_ehash_secret(); + + sock->state = SS_UNCONNECTED; + + /* Look for the requested type/protocol pair. 
*/ +lookup_protocol: + err = -ESOCKTNOSUPPORT; + rcu_read_lock(); + list_for_each_entry_rcu(answer, &inetsw[sock->type], list) { + + err = 0; + /* Check the non-wild match. */ + if (protocol == answer->protocol) { + if (protocol != IPPROTO_IP) + break; + } else { + /* Check for the two wild cases. */ + if (IPPROTO_IP == protocol) { + protocol = answer->protocol; + break; + } + if (IPPROTO_IP == answer->protocol) + break; + } + err = -EPROTONOSUPPORT; + } + + if (unlikely(err)) { + if (try_loading_module < 2) { + rcu_read_unlock(); + /* + * Be more specific, e.g. net-pf-2-proto-132-type-1 + * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM) + */ + if (++try_loading_module == 1) + request_module("net-pf-%d-proto-%d-type-%d", + PF_INET, protocol, sock->type); + /* + * Fall back to generic, e.g. net-pf-2-proto-132 + * (net-pf-PF_INET-proto-IPPROTO_SCTP) + */ + else + request_module("net-pf-%d-proto-%d", + PF_INET, protocol); + goto lookup_protocol; + } else + goto out_rcu_unlock; + } + + err = -EPERM; + if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) + goto out_rcu_unlock; + + err = -EAFNOSUPPORT; + if (!inet_netns_ok(net, protocol)) + goto out_rcu_unlock; + + sock->ops = answer->ops; + answer_prot = answer->prot; + answer_no_check = answer->no_check; + answer_flags = answer->flags; + rcu_read_unlock(); + + WARN_ON(answer_prot->slab == NULL); + + err = -ENOBUFS; + sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot); + if (sk == NULL) + goto out; + + err = 0; + sk->sk_no_check = answer_no_check; + if (INET_PROTOSW_REUSE & answer_flags) + sk->sk_reuse = 1; + + inet = inet_sk(sk); + inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; + + inet->nodefrag = 0; + + if (SOCK_RAW == sock->type) { + inet->inet_num = protocol; + if (IPPROTO_RAW == protocol) + inet->hdrincl = 1; + } + + if (ipv4_config.no_pmtu_disc) + inet->pmtudisc = IP_PMTUDISC_DONT; + else + inet->pmtudisc = IP_PMTUDISC_WANT; + + inet->inet_id = 0; + + sock_init_data(sock, sk); + + sk->sk_destruct = inet_sock_destruct; + sk->sk_protocol = protocol; + sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; + + inet->uc_ttl = -1; + inet->mc_loop = 1; + inet->mc_ttl = 1; + inet->mc_all = 1; + inet->mc_index = 0; + inet->mc_list = NULL; + + sk_refcnt_debug_inc(sk); + + if (inet->inet_num) { + /* It assumes that any protocol which allows + * the user to assign a number at socket + * creation time automatically + * shares. + */ + inet->inet_sport = htons(inet->inet_num); + /* Add to protocol hash chains. */ + sk->sk_prot->hash(sk); + } + + if (sk->sk_prot->init) { + err = sk->sk_prot->init(sk); + if (err) + sk_common_release(sk); + } +out: + return err; +out_rcu_unlock: + rcu_read_unlock(); + goto out; +} + + +/* + * The peer socket should always be NULL (or else). When we call this + * function we are destroying the object and from then on nobody + * should refer to it. + */ +int inet_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (sk) { + long timeout; + + sock_rps_reset_flow(sk); + + /* Applications forget to leave groups before exiting */ + ip_mc_drop_socket(sk); + + /* If linger is set, we don't return until the close + * is complete. Otherwise we return immediately. The + * actually closing is done the same either way. + * + * If the close is due to the process exiting, we never + * linger.. 
+ */ + timeout = 0; + if (sock_flag(sk, SOCK_LINGER) && + !(current->flags & PF_EXITING)) + timeout = sk->sk_lingertime; + sock->sk = NULL; + sk->sk_prot->close(sk, timeout); + } + return 0; +} +EXPORT_SYMBOL(inet_release); + +/* It is off by default, see below. */ +int sysctl_ip_nonlocal_bind __read_mostly; +EXPORT_SYMBOL(sysctl_ip_nonlocal_bind); + +int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; + struct sock *sk = sock->sk; + struct inet_sock *inet = inet_sk(sk); + unsigned short snum; + int chk_addr_ret; + int err; + + /* If the socket has its own bind function then use it. (RAW) */ + if (sk->sk_prot->bind) { + err = sk->sk_prot->bind(sk, uaddr, addr_len); + goto out; + } + err = -EINVAL; + if (addr_len < sizeof(struct sockaddr_in)) + goto out; + + if (addr->sin_family != AF_INET) { + err = -EAFNOSUPPORT; + goto out; + } + + chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); + + /* Not specified by any standard per-se, however it breaks too + * many applications when removed. It is unfortunate since + * allowing applications to make a non-local bind solves + * several problems with systems using dynamic addressing. + * (ie. your servers still start up even if your ISDN link + * is temporarily down) + */ + err = -EADDRNOTAVAIL; + if (!sysctl_ip_nonlocal_bind && + !(inet->freebind || inet->transparent) && + addr->sin_addr.s_addr != htonl(INADDR_ANY) && + chk_addr_ret != RTN_LOCAL && + chk_addr_ret != RTN_MULTICAST && + chk_addr_ret != RTN_BROADCAST) + goto out; + + snum = ntohs(addr->sin_port); + err = -EACCES; + if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) + goto out; + + /* We keep a pair of addresses. rcv_saddr is the one + * used by hash lookups, and saddr is used for transmit. + * + * In the BSD API these are the same except where it + * would be illegal to use them (multicast/broadcast) in + * which case the sending device address is used. + */ + lock_sock(sk); + + /* Check these errors (active socket, double bind). */ + err = -EINVAL; + if (sk->sk_state != TCP_CLOSE || inet->inet_num) + goto out_release_sock; + + inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; + if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) + inet->inet_saddr = 0; /* Use device */ + + /* Make sure we are allowed to bind here. 
*/ + if (sk->sk_prot->get_port(sk, snum)) { + inet->inet_saddr = inet->inet_rcv_saddr = 0; + err = -EADDRINUSE; + goto out_release_sock; + } + + if (inet->inet_rcv_saddr) + sk->sk_userlocks |= SOCK_BINDADDR_LOCK; + if (snum) + sk->sk_userlocks |= SOCK_BINDPORT_LOCK; + inet->inet_sport = htons(inet->inet_num); + inet->inet_daddr = 0; + inet->inet_dport = 0; + sk_dst_reset(sk); + err = 0; +out_release_sock: + release_sock(sk); +out: + return err; +} +EXPORT_SYMBOL(inet_bind); + +int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + + if (addr_len < sizeof(uaddr->sa_family)) + return -EINVAL; + if (uaddr->sa_family == AF_UNSPEC) + return sk->sk_prot->disconnect(sk, flags); + + if (!inet_sk(sk)->inet_num && inet_autobind(sk)) + return -EAGAIN; + return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len); +} +EXPORT_SYMBOL(inet_dgram_connect); + +static long inet_wait_for_connect(struct sock *sk, long timeo) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + + /* Basic assumption: if someone sets sk->sk_err, he _must_ + * change state of the socket from TCP_SYN_*. + * Connect() does not allow to get error notifications + * without closing the socket. + */ + while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + release_sock(sk); + timeo = schedule_timeout(timeo); + lock_sock(sk); + if (signal_pending(current) || !timeo) + break; + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + } + finish_wait(sk_sleep(sk), &wait); + return timeo; +} + +/* + * Connect to a remote host. There is regrettably still a little + * TCP 'magic' in here. + */ +int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + int err; + long timeo; + + if (addr_len < sizeof(uaddr->sa_family)) + return -EINVAL; + + lock_sock(sk); + + if (uaddr->sa_family == AF_UNSPEC) { + err = sk->sk_prot->disconnect(sk, flags); + sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; + goto out; + } + + switch (sock->state) { + default: + err = -EINVAL; + goto out; + case SS_CONNECTED: + err = -EISCONN; + goto out; + case SS_CONNECTING: + err = -EALREADY; + /* Fall out of switch with err, set for this state */ + break; + case SS_UNCONNECTED: + err = -EISCONN; + if (sk->sk_state != TCP_CLOSE) + goto out; + + err = sk->sk_prot->connect(sk, uaddr, addr_len); + if (err < 0) + goto out; + + sock->state = SS_CONNECTING; + + /* Just entered SS_CONNECTING state; the only + * difference is that return value in non-blocking + * case is EINPROGRESS, rather than EALREADY. + */ + err = -EINPROGRESS; + break; + } + + timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + /* Error code is set above */ + if (!timeo || !inet_wait_for_connect(sk, timeo)) + goto out; + + err = sock_intr_errno(timeo); + if (signal_pending(current)) + goto out; + } + + /* Connection was closed by RST, timeout, ICMP error + * or another process disconnected us. + */ + if (sk->sk_state == TCP_CLOSE) + goto sock_error; + + /* sk->sk_err may be not zero now, if RECVERR was ordered by user + * and error was received after socket entered established state. + * Hence, it is handled normally after connect() return successfully. + */ + + sock->state = SS_CONNECTED; + err = 0; +out: + release_sock(sk); + return err; + +sock_error: + err = sock_error(sk) ? 
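[Editor's note] inet_stream_connect() above returns -EINPROGRESS once a non-blocking socket enters SS_CONNECTING, and inet_wait_for_connect() only sleeps when the send timeout permits it. From user space the usual counterpart is connect(), poll() for writability, then SO_ERROR. A minimal sketch (editor's illustration; the 192.0.2.1:80 destination is a placeholder):

/* Editor's illustration of the non-blocking connect() path served by
 * inet_stream_connect()/inet_wait_for_connect() above. */
#include <errno.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in sa = { .sin_family = AF_INET, .sin_port = htons(80) };
	int fd = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, 0);
	struct pollfd pfd;
	int err = 0;
	socklen_t len = sizeof(err);

	if (fd < 0)
		return 1;
	inet_pton(AF_INET, "192.0.2.1", &sa.sin_addr);    /* placeholder address */
	if (connect(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0 &&
	    errno != EINPROGRESS) {
		perror("connect");
		return 1;
	}
	pfd.fd = fd;
	pfd.events = POLLOUT;
	poll(&pfd, 1, 5000);                              /* wait for SYN-ACK or error */
	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len); /* analogous to sock_error(sk) */
	printf("connect result: %s\n", err ? strerror(err) : "connected");
	close(fd);
	return 0;
}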
: -ECONNABORTED; + sock->state = SS_UNCONNECTED; + if (sk->sk_prot->disconnect(sk, flags)) + sock->state = SS_DISCONNECTING; + goto out; +} +EXPORT_SYMBOL(inet_stream_connect); + +/* + * Accept a pending connection. The TCP layer now gives BSD semantics. + */ + +int inet_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct sock *sk1 = sock->sk; + int err = -EINVAL; + struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err); + + if (!sk2) + goto do_err; + + lock_sock(sk2); + + sock_rps_record_flow(sk2); + WARN_ON(!((1 << sk2->sk_state) & + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE))); + + sock_graft(sk2, newsock); + + newsock->state = SS_CONNECTED; + err = 0; + release_sock(sk2); +do_err: + return err; +} +EXPORT_SYMBOL(inet_accept); + + +/* + * This does both peername and sockname. + */ +int inet_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sock *sk = sock->sk; + struct inet_sock *inet = inet_sk(sk); + DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr); + + sin->sin_family = AF_INET; + if (peer) { + if (!inet->inet_dport || + (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && + peer == 1)) + return -ENOTCONN; + sin->sin_port = inet->inet_dport; + sin->sin_addr.s_addr = inet->inet_daddr; + } else { + __be32 addr = inet->inet_rcv_saddr; + if (!addr) + addr = inet->inet_saddr; + sin->sin_port = inet->inet_sport; + sin->sin_addr.s_addr = addr; + } + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + *uaddr_len = sizeof(*sin); + return 0; +} +EXPORT_SYMBOL(inet_getname); + +int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t size) +{ + struct sock *sk = sock->sk; + + sock_rps_record_flow(sk); + + /* We may need to bind the socket. */ + if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind && + inet_autobind(sk)) + return -EAGAIN; + + return sk->sk_prot->sendmsg(iocb, sk, msg, size); +} +EXPORT_SYMBOL(inet_sendmsg); + +ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, + size_t size, int flags) +{ + struct sock *sk = sock->sk; + + sock_rps_record_flow(sk); + + /* We may need to bind the socket. */ + if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind && + inet_autobind(sk)) + return -EAGAIN; + + if (sk->sk_prot->sendpage) + return sk->sk_prot->sendpage(sk, page, offset, size, flags); + return sock_no_sendpage(sock, page, offset, size, flags); +} +EXPORT_SYMBOL(inet_sendpage); + +int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t size, int flags) +{ + struct sock *sk = sock->sk; + int addr_len = 0; + int err; + + sock_rps_record_flow(sk); + + err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT, + flags & ~MSG_DONTWAIT, &addr_len); + if (err >= 0) + msg->msg_namelen = addr_len; + return err; +} +EXPORT_SYMBOL(inet_recvmsg); + +int inet_shutdown(struct socket *sock, int how) +{ + struct sock *sk = sock->sk; + int err = 0; + + /* This should really check to make sure + * the socket is a TCP socket. (WHY AC...) + */ + how++; /* maps 0->1 has the advantage of making bit 1 rcvs and + 1->2 bit 2 snds. 
+ 2->3 */ + if ((how & ~SHUTDOWN_MASK) || !how) /* MAXINT->0 */ + return -EINVAL; + + lock_sock(sk); + if (sock->state == SS_CONNECTING) { + if ((1 << sk->sk_state) & + (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) + sock->state = SS_DISCONNECTING; + else + sock->state = SS_CONNECTED; + } + + switch (sk->sk_state) { + case TCP_CLOSE: + err = -ENOTCONN; + /* Hack to wake up other listeners, who can poll for + POLLHUP, even on eg. unconnected UDP sockets -- RR */ + default: + sk->sk_shutdown |= how; + if (sk->sk_prot->shutdown) + sk->sk_prot->shutdown(sk, how); + break; + + /* Remaining two branches are temporary solution for missing + * close() in multithreaded environment. It is _not_ a good idea, + * but we have no choice until close() is repaired at VFS level. + */ + case TCP_LISTEN: + if (!(how & RCV_SHUTDOWN)) + break; + /* Fall through */ + case TCP_SYN_SENT: + err = sk->sk_prot->disconnect(sk, O_NONBLOCK); + sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; + break; + } + + /* Wake up anyone sleeping in poll. */ + sk->sk_state_change(sk); + release_sock(sk); + return err; +} +EXPORT_SYMBOL(inet_shutdown); + +/* + * ioctl() calls you can issue on an INET socket. Most of these are + * device configuration and stuff and very rarely used. Some ioctls + * pass on to the socket itself. + * + * NOTE: I like the idea of a module for the config stuff. ie ifconfig + * loads the devconfigure module does its configuring and unloads it. + * There's a good 20K of config code hanging around the kernel. + */ + +int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + int err = 0; + struct net *net = sock_net(sk); + + switch (cmd) { + case SIOCGSTAMP: + err = sock_get_timestamp(sk, (struct timeval __user *)arg); + break; + case SIOCGSTAMPNS: + err = sock_get_timestampns(sk, (struct timespec __user *)arg); + break; + case SIOCADDRT: + case SIOCDELRT: + case SIOCRTMSG: + err = ip_rt_ioctl(net, cmd, (void __user *)arg); + break; + case SIOCDARP: + case SIOCGARP: + case SIOCSARP: + err = arp_ioctl(net, cmd, (void __user *)arg); + break; + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCSIFPFLAGS: + case SIOCGIFPFLAGS: + case SIOCSIFFLAGS: + case SIOCKILLADDR: + err = devinet_ioctl(net, cmd, (void __user *)arg); + break; + default: + if (sk->sk_prot->ioctl) + err = sk->sk_prot->ioctl(sk, cmd, arg); + else + err = -ENOIOCTLCMD; + break; + } + return err; +} +EXPORT_SYMBOL(inet_ioctl); + +#ifdef CONFIG_COMPAT +int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + int err = -ENOIOCTLCMD; + + if (sk->sk_prot->compat_ioctl) + err = sk->sk_prot->compat_ioctl(sk, cmd, arg); + + return err; +} +#endif + +const struct proto_ops inet_stream_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = inet_release, + .bind = inet_bind, + .connect = inet_stream_connect, + .socketpair = sock_no_socketpair, + .accept = inet_accept, + .getname = inet_getname, + .poll = tcp_poll, + .ioctl = inet_ioctl, + .listen = inet_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = inet_recvmsg, + .mmap = sock_no_mmap, + .sendpage = inet_sendpage, + .splice_read = tcp_splice_read, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + 
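[Editor's note] The "how++" above turns the user-visible SHUT_RD (0), SHUT_WR (1) and SHUT_RDWR (2) into the internal receive/send shutdown bit mask, so SHUT_RDWR sets both bits. A quick check of the same mapping in plain C (editor's illustration; the two bit constants are written out here and assumed to mirror the kernel's internal flags):

/* Editor's illustration of the how+1 bit mapping used by inet_shutdown(). */
#include <stdio.h>
#include <sys/socket.h>

#define RCV_SHUTDOWN  1   /* assumed to match the kernel's internal flag values */
#define SEND_SHUTDOWN 2

int main(void)
{
	int how;

	for (how = SHUT_RD; how <= SHUT_RDWR; how++) {
		int mask = how + 1;   /* same trick as "how++" in inet_shutdown() */

		printf("shutdown(%d): rcv=%d snd=%d\n", how,
		       !!(mask & RCV_SHUTDOWN), !!(mask & SEND_SHUTDOWN));
	}
	return 0;
}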
.compat_getsockopt = compat_sock_common_getsockopt, + .compat_ioctl = inet_compat_ioctl, +#endif +}; +EXPORT_SYMBOL(inet_stream_ops); + +const struct proto_ops inet_dgram_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = inet_release, + .bind = inet_bind, + .connect = inet_dgram_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = inet_getname, + .poll = udp_poll, + .ioctl = inet_ioctl, + .listen = sock_no_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = inet_recvmsg, + .mmap = sock_no_mmap, + .sendpage = inet_sendpage, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, + .compat_ioctl = inet_compat_ioctl, +#endif +}; +EXPORT_SYMBOL(inet_dgram_ops); + +/* + * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without + * udp_poll + */ +static const struct proto_ops inet_sockraw_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = inet_release, + .bind = inet_bind, + .connect = inet_dgram_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = inet_getname, + .poll = datagram_poll, + .ioctl = inet_ioctl, + .listen = sock_no_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = inet_recvmsg, + .mmap = sock_no_mmap, + .sendpage = inet_sendpage, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, + .compat_ioctl = inet_compat_ioctl, +#endif +}; + +static const struct net_proto_family inet_family_ops = { + .family = PF_INET, + .create = inet_create, + .owner = THIS_MODULE, +}; + +/* Upon startup we insert all the elements in inetsw_array[] into + * the linked list inetsw. + */ +static struct inet_protosw inetsw_array[] = +{ + { + .type = SOCK_STREAM, + .protocol = IPPROTO_TCP, + .prot = &tcp_prot, + .ops = &inet_stream_ops, + .no_check = 0, + .flags = INET_PROTOSW_PERMANENT | + INET_PROTOSW_ICSK, + }, + + { + .type = SOCK_DGRAM, + .protocol = IPPROTO_UDP, + .prot = &udp_prot, + .ops = &inet_dgram_ops, + .no_check = UDP_CSUM_DEFAULT, + .flags = INET_PROTOSW_PERMANENT, + }, + + { + .type = SOCK_DGRAM, + .protocol = IPPROTO_ICMP, + .prot = &ping_prot, + .ops = &inet_dgram_ops, + .no_check = UDP_CSUM_DEFAULT, + .flags = INET_PROTOSW_REUSE, + }, + + { + .type = SOCK_RAW, + .protocol = IPPROTO_IP, /* wild card */ + .prot = &raw_prot, + .ops = &inet_sockraw_ops, + .no_check = UDP_CSUM_DEFAULT, + .flags = INET_PROTOSW_REUSE, + } +}; + +#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array) + +void inet_register_protosw(struct inet_protosw *p) +{ + struct list_head *lh; + struct inet_protosw *answer; + int protocol = p->protocol; + struct list_head *last_perm; + + spin_lock_bh(&inetsw_lock); + + if (p->type >= SOCK_MAX) + goto out_illegal; + + /* If we are trying to override a permanent protocol, bail. */ + answer = NULL; + last_perm = &inetsw[p->type]; + list_for_each(lh, &inetsw[p->type]) { + answer = list_entry(lh, struct inet_protosw, list); + + /* Check only the non-wild match. 
*/ + if (INET_PROTOSW_PERMANENT & answer->flags) { + if (protocol == answer->protocol) + break; + last_perm = lh; + } + + answer = NULL; + } + if (answer) + goto out_permanent; + + /* Add the new entry after the last permanent entry if any, so that + * the new entry does not override a permanent entry when matched with + * a wild-card protocol. But it is allowed to override any existing + * non-permanent entry. This means that when we remove this entry, the + * system automatically returns to the old behavior. + */ + list_add_rcu(&p->list, last_perm); +out: + spin_unlock_bh(&inetsw_lock); + + return; + +out_permanent: + printk(KERN_ERR "Attempt to override permanent protocol %d.\n", + protocol); + goto out; + +out_illegal: + printk(KERN_ERR + "Ignoring attempt to register invalid socket type %d.\n", + p->type); + goto out; +} +EXPORT_SYMBOL(inet_register_protosw); + +void inet_unregister_protosw(struct inet_protosw *p) +{ + if (INET_PROTOSW_PERMANENT & p->flags) { + printk(KERN_ERR + "Attempt to unregister permanent protocol %d.\n", + p->protocol); + } else { + spin_lock_bh(&inetsw_lock); + list_del_rcu(&p->list); + spin_unlock_bh(&inetsw_lock); + + synchronize_net(); + } +} +EXPORT_SYMBOL(inet_unregister_protosw); + +/* + * Shall we try to damage output packets if routing dev changes? + */ + +int sysctl_ip_dynaddr __read_mostly; + +static int inet_sk_reselect_saddr(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + __be32 old_saddr = inet->inet_saddr; + __be32 daddr = inet->inet_daddr; + struct flowi4 *fl4; + struct rtable *rt; + __be32 new_saddr; + struct ip_options_rcu *inet_opt; + + inet_opt = rcu_dereference_protected(inet->inet_opt, + sock_owned_by_user(sk)); + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; + + /* Query new route. */ + fl4 = &inet->cork.fl.u.ip4; + rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk), + sk->sk_bound_dev_if, sk->sk_protocol, + inet->inet_sport, inet->inet_dport, sk, false); + if (IS_ERR(rt)) + return PTR_ERR(rt); + + sk_setup_caps(sk, &rt->dst); + + new_saddr = fl4->saddr; + + if (new_saddr == old_saddr) + return 0; + + if (sysctl_ip_dynaddr > 1) { + printk(KERN_INFO "%s(): shifting inet->saddr from %pI4 to %pI4\n", + __func__, &old_saddr, &new_saddr); + } + + inet->inet_saddr = inet->inet_rcv_saddr = new_saddr; + + /* + * XXX The only one ugly spot where we need to + * XXX really change the sockets identity after + * XXX it has entered the hashes. -DaveM + * + * Besides that, it does not check for connection + * uniqueness. Wait for troubles. + */ + __sk_prot_rehash(sk); + return 0; +} + +int inet_sk_rebuild_header(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); + __be32 daddr; + struct ip_options_rcu *inet_opt; + struct flowi4 *fl4; + int err; + + /* Route is OK, nothing to do. */ + if (rt) + return 0; + + /* Reroute. */ + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + daddr = inet->inet_daddr; + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; + rcu_read_unlock(); + fl4 = &inet->cork.fl.u.ip4; + rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, + inet->inet_dport, inet->inet_sport, + sk->sk_protocol, RT_CONN_FLAGS(sk), + sk->sk_bound_dev_if); + if (!IS_ERR(rt)) { + err = 0; + sk_setup_caps(sk, &rt->dst); + } else { + err = PTR_ERR(rt); + + /* Routing failed... */ + sk->sk_route_caps = 0; + /* + * Other protocols have to map its equivalent state to TCP_SYN_SENT. 
+ * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme + */ + if (!sysctl_ip_dynaddr || + sk->sk_state != TCP_SYN_SENT || + (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || + (err = inet_sk_reselect_saddr(sk)) != 0) + sk->sk_err_soft = -err; + } + + return err; +} +EXPORT_SYMBOL(inet_sk_rebuild_header); + +static int inet_gso_send_check(struct sk_buff *skb) +{ + const struct iphdr *iph; + const struct net_protocol *ops; + int proto; + int ihl; + int err = -EINVAL; + + if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) + goto out; + + iph = ip_hdr(skb); + ihl = iph->ihl * 4; + if (ihl < sizeof(*iph)) + goto out; + + if (unlikely(!pskb_may_pull(skb, ihl))) + goto out; + + __skb_pull(skb, ihl); + skb_reset_transport_header(skb); + iph = ip_hdr(skb); + proto = iph->protocol & (MAX_INET_PROTOS - 1); + err = -EPROTONOSUPPORT; + + rcu_read_lock(); + ops = rcu_dereference(inet_protos[proto]); + if (likely(ops && ops->gso_send_check)) + err = ops->gso_send_check(skb); + rcu_read_unlock(); + +out: + return err; +} + +static struct sk_buff *inet_gso_segment(struct sk_buff *skb, u32 features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct iphdr *iph; + const struct net_protocol *ops; + int proto; + int ihl; + int id; + unsigned int offset = 0; + + if (!(features & NETIF_F_V4_CSUM)) + features &= ~NETIF_F_SG; + + if (unlikely(skb_shinfo(skb)->gso_type & + ~(SKB_GSO_TCPV4 | + SKB_GSO_UDP | + SKB_GSO_DODGY | + SKB_GSO_TCP_ECN | + 0))) + goto out; + + if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) + goto out; + + iph = ip_hdr(skb); + ihl = iph->ihl * 4; + if (ihl < sizeof(*iph)) + goto out; + + if (unlikely(!pskb_may_pull(skb, ihl))) + goto out; + + __skb_pull(skb, ihl); + skb_reset_transport_header(skb); + iph = ip_hdr(skb); + id = ntohs(iph->id); + proto = iph->protocol & (MAX_INET_PROTOS - 1); + segs = ERR_PTR(-EPROTONOSUPPORT); + + rcu_read_lock(); + ops = rcu_dereference(inet_protos[proto]); + if (likely(ops && ops->gso_segment)) + segs = ops->gso_segment(skb, features); + rcu_read_unlock(); + + if (!segs || IS_ERR(segs)) + goto out; + + skb = segs; + do { + iph = ip_hdr(skb); + if (proto == IPPROTO_UDP) { + iph->id = htons(id); + iph->frag_off = htons(offset >> 3); + if (skb->next != NULL) + iph->frag_off |= htons(IP_MF); + offset += (skb->len - skb->mac_len - iph->ihl * 4); + } else + iph->id = htons(id++); + iph->tot_len = htons(skb->len - skb->mac_len); + iph->check = 0; + iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); + } while ((skb = skb->next)); + +out: + return segs; +} + +static struct sk_buff **inet_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + const struct net_protocol *ops; + struct sk_buff **pp = NULL; + struct sk_buff *p; + const struct iphdr *iph; + unsigned int hlen; + unsigned int off; + unsigned int id; + int flush = 1; + int proto; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*iph); + iph = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + iph = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!iph)) + goto out; + } + + proto = iph->protocol & (MAX_INET_PROTOS - 1); + + rcu_read_lock(); + ops = rcu_dereference(inet_protos[proto]); + if (!ops || !ops->gro_receive) + goto out_unlock; + + if (*(u8 *)iph != 0x45) + goto out_unlock; + + if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) + goto out_unlock; + + id = ntohl(*(__be32 *)&iph->id); + flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF)); + id >>= 16; + + for (p = *head; p; p = p->next) { + struct iphdr *iph2; + + if 
(!NAPI_GRO_CB(p)->same_flow) + continue; + + iph2 = ip_hdr(p); + + if ((iph->protocol ^ iph2->protocol) | + (iph->tos ^ iph2->tos) | + ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | + ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + /* All fields must match except length and checksum. */ + NAPI_GRO_CB(p)->flush |= + (iph->ttl ^ iph2->ttl) | + ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); + + NAPI_GRO_CB(p)->flush |= flush; + } + + NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_pull(skb, sizeof(*iph)); + skb_set_transport_header(skb, skb_gro_offset(skb)); + + pp = ops->gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); + +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} + +static int inet_gro_complete(struct sk_buff *skb) +{ + const struct net_protocol *ops; + struct iphdr *iph = ip_hdr(skb); + int proto = iph->protocol & (MAX_INET_PROTOS - 1); + int err = -ENOSYS; + __be16 newlen = htons(skb->len - skb_network_offset(skb)); + + csum_replace2(&iph->check, iph->tot_len, newlen); + iph->tot_len = newlen; + + rcu_read_lock(); + ops = rcu_dereference(inet_protos[proto]); + if (WARN_ON(!ops || !ops->gro_complete)) + goto out_unlock; + + err = ops->gro_complete(skb); + +out_unlock: + rcu_read_unlock(); + + return err; +} + +int inet_ctl_sock_create(struct sock **sk, unsigned short family, + unsigned short type, unsigned char protocol, + struct net *net) +{ + struct socket *sock; + int rc = sock_create_kern(family, type, protocol, &sock); + + if (rc == 0) { + *sk = sock->sk; + (*sk)->sk_allocation = GFP_ATOMIC; + /* + * Unhash it so that IP input processing does not even see it, + * we do not wish this socket to see incoming packets. + */ + (*sk)->sk_prot->unhash(*sk); + + sk_change_net(*sk, net); + } + return rc; +} +EXPORT_SYMBOL_GPL(inet_ctl_sock_create); + +unsigned long snmp_fold_field(void __percpu *mib[], int offt) +{ + unsigned long res = 0; + int i; + + for_each_possible_cpu(i) { + res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt); + res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt); + } + return res; +} +EXPORT_SYMBOL_GPL(snmp_fold_field); + +#if BITS_PER_LONG==32 + +u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset) +{ + u64 res = 0; + int cpu; + + for_each_possible_cpu(cpu) { + void *bhptr, *userptr; + struct u64_stats_sync *syncp; + u64 v_bh, v_user; + unsigned int start; + + /* first mib used by softirq context, we must use _bh() accessors */ + bhptr = per_cpu_ptr(SNMP_STAT_BHPTR(mib), cpu); + syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); + do { + start = u64_stats_fetch_begin_bh(syncp); + v_bh = *(((u64 *) bhptr) + offt); + } while (u64_stats_fetch_retry_bh(syncp, start)); + + /* second mib used in USER context */ + userptr = per_cpu_ptr(SNMP_STAT_USRPTR(mib), cpu); + syncp = (struct u64_stats_sync *)(userptr + syncp_offset); + do { + start = u64_stats_fetch_begin(syncp); + v_user = *(((u64 *) userptr) + offt); + } while (u64_stats_fetch_retry(syncp, start)); + + res += v_bh + v_user; + } + return res; +} +EXPORT_SYMBOL_GPL(snmp_fold_field64); +#endif + +int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align) +{ + BUG_ON(ptr == NULL); + ptr[0] = __alloc_percpu(mibsize, align); + if (!ptr[0]) + goto err0; + ptr[1] = __alloc_percpu(mibsize, align); + if (!ptr[1]) + goto err1; + return 0; +err1: + free_percpu(ptr[0]); + ptr[0] = NULL; +err0: + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(snmp_mib_init); + +void snmp_mib_free(void 
__percpu *ptr[2]) +{ + BUG_ON(ptr == NULL); + free_percpu(ptr[0]); + free_percpu(ptr[1]); + ptr[0] = ptr[1] = NULL; +} +EXPORT_SYMBOL_GPL(snmp_mib_free); + +#ifdef CONFIG_IP_MULTICAST +static const struct net_protocol igmp_protocol = { + .handler = igmp_rcv, + .netns_ok = 1, +}; +#endif + +static const struct net_protocol tcp_protocol = { + .handler = tcp_v4_rcv, + .err_handler = tcp_v4_err, + .gso_send_check = tcp_v4_gso_send_check, + .gso_segment = tcp_tso_segment, + .gro_receive = tcp4_gro_receive, + .gro_complete = tcp4_gro_complete, + .no_policy = 1, + .netns_ok = 1, +}; + +static const struct net_protocol udp_protocol = { + .handler = udp_rcv, + .err_handler = udp_err, + .gso_send_check = udp4_ufo_send_check, + .gso_segment = udp4_ufo_fragment, + .no_policy = 1, + .netns_ok = 1, +}; + +static const struct net_protocol icmp_protocol = { + .handler = icmp_rcv, + .err_handler = ping_err, + .no_policy = 1, + .netns_ok = 1, +}; + +static __net_init int ipv4_mib_init_net(struct net *net) +{ + if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics, + sizeof(struct tcp_mib), + __alignof__(struct tcp_mib)) < 0) + goto err_tcp_mib; + if (snmp_mib_init((void __percpu **)net->mib.ip_statistics, + sizeof(struct ipstats_mib), + __alignof__(struct ipstats_mib)) < 0) + goto err_ip_mib; + if (snmp_mib_init((void __percpu **)net->mib.net_statistics, + sizeof(struct linux_mib), + __alignof__(struct linux_mib)) < 0) + goto err_net_mib; + if (snmp_mib_init((void __percpu **)net->mib.udp_statistics, + sizeof(struct udp_mib), + __alignof__(struct udp_mib)) < 0) + goto err_udp_mib; + if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics, + sizeof(struct udp_mib), + __alignof__(struct udp_mib)) < 0) + goto err_udplite_mib; + if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics, + sizeof(struct icmp_mib), + __alignof__(struct icmp_mib)) < 0) + goto err_icmp_mib; + if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics, + sizeof(struct icmpmsg_mib), + __alignof__(struct icmpmsg_mib)) < 0) + goto err_icmpmsg_mib; + + tcp_mib_init(net); + return 0; + +err_icmpmsg_mib: + snmp_mib_free((void __percpu **)net->mib.icmp_statistics); +err_icmp_mib: + snmp_mib_free((void __percpu **)net->mib.udplite_statistics); +err_udplite_mib: + snmp_mib_free((void __percpu **)net->mib.udp_statistics); +err_udp_mib: + snmp_mib_free((void __percpu **)net->mib.net_statistics); +err_net_mib: + snmp_mib_free((void __percpu **)net->mib.ip_statistics); +err_ip_mib: + snmp_mib_free((void __percpu **)net->mib.tcp_statistics); +err_tcp_mib: + return -ENOMEM; +} + +static __net_exit void ipv4_mib_exit_net(struct net *net) +{ + snmp_mib_free((void __percpu **)net->mib.icmpmsg_statistics); + snmp_mib_free((void __percpu **)net->mib.icmp_statistics); + snmp_mib_free((void __percpu **)net->mib.udplite_statistics); + snmp_mib_free((void __percpu **)net->mib.udp_statistics); + snmp_mib_free((void __percpu **)net->mib.net_statistics); + snmp_mib_free((void __percpu **)net->mib.ip_statistics); + snmp_mib_free((void __percpu **)net->mib.tcp_statistics); +} + +static __net_initdata struct pernet_operations ipv4_mib_ops = { + .init = ipv4_mib_init_net, + .exit = ipv4_mib_exit_net, +}; + +static int __init init_ipv4_mibs(void) +{ + return register_pernet_subsys(&ipv4_mib_ops); +} + +static int ipv4_proc_init(void); + +/* + * IP protocol layer initialiser + */ + +static struct packet_type ip_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_IP), + .func = ip_rcv, + .gso_send_check = inet_gso_send_check, + 
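[Editor's note] snmp_fold_field() above produces one MIB value by summing a single counter offset across every possible CPU (and across the softirq/user halves of the per-cpu pair). Outside the kernel the same idea is just per-CPU rows folded at read time; a minimal sketch (editor's illustration, using fixed-size arrays instead of the kernel's __percpu allocations):

/* Editor's illustration of folding per-CPU counters the way
 * snmp_fold_field() does, with plain arrays standing in for __percpu data. */
#include <stdio.h>

#define NR_CPUS   4
#define NR_FIELDS 2   /* e.g. two counters such as InSegs / OutSegs */

static unsigned long mib[NR_CPUS][NR_FIELDS];

static unsigned long fold_field(int offt)
{
	unsigned long res = 0;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		res += mib[cpu][offt];   /* each CPU only ever updates its own row */
	return res;
}

int main(void)
{
	mib[0][0] = 10; mib[1][0] = 7; mib[3][0] = 2;
	printf("field 0 total: %lu\n", fold_field(0));   /* prints 19 */
	return 0;
}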
.gso_segment = inet_gso_segment, + .gro_receive = inet_gro_receive, + .gro_complete = inet_gro_complete, +}; + +static int __init inet_init(void) +{ + struct sk_buff *dummy_skb; + struct inet_protosw *q; + struct list_head *r; + int rc = -EINVAL; + + BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)); + + sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); + if (!sysctl_local_reserved_ports) + goto out; + + rc = proto_register(&tcp_prot, 1); + if (rc) + goto out_free_reserved_ports; + + rc = proto_register(&udp_prot, 1); + if (rc) + goto out_unregister_tcp_proto; + + rc = proto_register(&raw_prot, 1); + if (rc) + goto out_unregister_udp_proto; + + rc = proto_register(&ping_prot, 1); + if (rc) + goto out_unregister_raw_proto; + + /* + * Tell SOCKET that we are alive... + */ + + (void)sock_register(&inet_family_ops); + +#ifdef CONFIG_SYSCTL + ip_static_sysctl_init(); +#endif + + /* + * Add all the base protocols. + */ + + if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) + printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n"); + if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) + printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n"); + if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) + printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n"); +#ifdef CONFIG_IP_MULTICAST + if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) + printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n"); +#endif + + /* Register the socket-side information for inet_create. */ + for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r) + INIT_LIST_HEAD(r); + + for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) + inet_register_protosw(q); + + /* + * Set the ARP module up + */ + + arp_init(); + + /* + * Set the IP module up + */ + + ip_init(); + + tcp_v4_init(); + + /* Setup TCP slab cache for open requests. 
*/ + tcp_init(); + + /* Setup UDP memory threshold */ + udp_init(); + + /* Add UDP-Lite (RFC 3828) */ + udplite4_register(); + + ping_init(); + + /* + * Set the ICMP layer up + */ + + if (icmp_init() < 0) + panic("Failed to create the ICMP control socket.\n"); + + /* + * Initialise the multicast router + */ +#if defined(CONFIG_IP_MROUTE) + if (ip_mr_init()) + printk(KERN_CRIT "inet_init: Cannot init ipv4 mroute\n"); +#endif + /* + * Initialise per-cpu ipv4 mibs + */ + + if (init_ipv4_mibs()) + printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); + + ipv4_proc_init(); + + ipfrag_init(); + + dev_add_pack(&ip_packet_type); + + rc = 0; +out: + return rc; +out_unregister_raw_proto: + proto_unregister(&raw_prot); +out_unregister_udp_proto: + proto_unregister(&udp_prot); +out_unregister_tcp_proto: + proto_unregister(&tcp_prot); +out_free_reserved_ports: + kfree(sysctl_local_reserved_ports); + goto out; +} + +fs_initcall(inet_init); + +/* ------------------------------------------------------------------------ */ + +#ifdef CONFIG_PROC_FS +static int __init ipv4_proc_init(void) +{ + int rc = 0; + + if (raw_proc_init()) + goto out_raw; + if (tcp4_proc_init()) + goto out_tcp; + if (udp4_proc_init()) + goto out_udp; + if (ping_proc_init()) + goto out_ping; + if (ip_misc_proc_init()) + goto out_misc; +out: + return rc; +out_misc: + ping_proc_exit(); +out_ping: + udp4_proc_exit(); +out_udp: + tcp4_proc_exit(); +out_tcp: + raw_proc_exit(); +out_raw: + rc = -ENOMEM; + goto out; +} + +#else /* CONFIG_PROC_FS */ +static int __init ipv4_proc_init(void) +{ + return 0; +} +#endif /* CONFIG_PROC_FS */ + +MODULE_ALIAS_NETPROTO(PF_INET); + diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c new file mode 100644 index 00000000..36d14406 --- /dev/null +++ b/net/ipv4/ah4.c @@ -0,0 +1,535 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ah_skb_cb { + struct xfrm_skb_cb xfrm; + void *tmp; +}; + +#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0])) + +static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags, + unsigned int size) +{ + unsigned int len; + + len = size + crypto_ahash_digestsize(ahash) + + (crypto_ahash_alignmask(ahash) & + ~(crypto_tfm_ctx_alignment() - 1)); + + len = ALIGN(len, crypto_tfm_ctx_alignment()); + + len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash); + len = ALIGN(len, __alignof__(struct scatterlist)); + + len += sizeof(struct scatterlist) * nfrags; + + return kmalloc(len, GFP_ATOMIC); +} + +static inline u8 *ah_tmp_auth(void *tmp, unsigned int offset) +{ + return tmp + offset; +} + +static inline u8 *ah_tmp_icv(struct crypto_ahash *ahash, void *tmp, + unsigned int offset) +{ + return PTR_ALIGN((u8 *)tmp + offset, crypto_ahash_alignmask(ahash) + 1); +} + +static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash, + u8 *icv) +{ + struct ahash_request *req; + + req = (void *)PTR_ALIGN(icv + crypto_ahash_digestsize(ahash), + crypto_tfm_ctx_alignment()); + + ahash_request_set_tfm(req, ahash); + + return req; +} + +static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash, + struct ahash_request *req) +{ + return (void *)ALIGN((unsigned long)(req + 1) + + crypto_ahash_reqsize(ahash), + __alignof__(struct scatterlist)); +} + +/* Clear mutable options and find final destination to substitute + * into IP header for icv calculation. Options are already checked + * for validity, so paranoia is not required. 
*/ + +static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr) +{ + unsigned char * optptr = (unsigned char*)(iph+1); + int l = iph->ihl*4 - sizeof(struct iphdr); + int optlen; + + while (l > 0) { + switch (*optptr) { + case IPOPT_END: + return 0; + case IPOPT_NOOP: + l--; + optptr++; + continue; + } + optlen = optptr[1]; + if (optlen<2 || optlen>l) + return -EINVAL; + switch (*optptr) { + case IPOPT_SEC: + case 0x85: /* Some "Extended Security" crap. */ + case IPOPT_CIPSO: + case IPOPT_RA: + case 0x80|21: /* RFC1770 */ + break; + case IPOPT_LSRR: + case IPOPT_SSRR: + if (optlen < 6) + return -EINVAL; + memcpy(daddr, optptr+optlen-4, 4); + /* Fall through */ + default: + memset(optptr, 0, optlen); + } + l -= optlen; + optptr += optlen; + } + return 0; +} + +static void ah_output_done(struct crypto_async_request *base, int err) +{ + u8 *icv; + struct iphdr *iph; + struct sk_buff *skb = base->data; + struct xfrm_state *x = skb_dst(skb)->xfrm; + struct ah_data *ahp = x->data; + struct iphdr *top_iph = ip_hdr(skb); + struct ip_auth_hdr *ah = ip_auth_hdr(skb); + int ihl = ip_hdrlen(skb); + + iph = AH_SKB_CB(skb)->tmp; + icv = ah_tmp_icv(ahp->ahash, iph, ihl); + memcpy(ah->auth_data, icv, ahp->icv_trunc_len); + + top_iph->tos = iph->tos; + top_iph->ttl = iph->ttl; + top_iph->frag_off = iph->frag_off; + if (top_iph->ihl != 5) { + top_iph->daddr = iph->daddr; + memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); + } + + kfree(AH_SKB_CB(skb)->tmp); + xfrm_output_resume(skb, err); +} + +static int ah_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + int nfrags; + int ihl; + u8 *icv; + struct sk_buff *trailer; + struct crypto_ahash *ahash; + struct ahash_request *req; + struct scatterlist *sg; + struct iphdr *iph, *top_iph; + struct ip_auth_hdr *ah; + struct ah_data *ahp; + + ahp = x->data; + ahash = ahp->ahash; + + if ((err = skb_cow_data(skb, 0, &trailer)) < 0) + goto out; + nfrags = err; + + skb_push(skb, -skb_network_offset(skb)); + ah = ip_auth_hdr(skb); + ihl = ip_hdrlen(skb); + + err = -ENOMEM; + iph = ah_alloc_tmp(ahash, nfrags, ihl); + if (!iph) + goto out; + + icv = ah_tmp_icv(ahash, iph, ihl); + req = ah_tmp_req(ahash, icv); + sg = ah_req_sg(ahash, req); + + memset(ah->auth_data, 0, ahp->icv_trunc_len); + + top_iph = ip_hdr(skb); + + iph->tos = top_iph->tos; + iph->ttl = top_iph->ttl; + iph->frag_off = top_iph->frag_off; + + if (top_iph->ihl != 5) { + iph->daddr = top_iph->daddr; + memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); + err = ip_clear_mutable_options(top_iph, &top_iph->daddr); + if (err) + goto out_free; + } + + ah->nexthdr = *skb_mac_header(skb); + *skb_mac_header(skb) = IPPROTO_AH; + + top_iph->tos = 0; + top_iph->tot_len = htons(skb->len); + top_iph->frag_off = 0; + top_iph->ttl = 0; + top_iph->check = 0; + + if (x->props.flags & XFRM_STATE_ALIGN4) + ah->hdrlen = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; + else + ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; + + ah->reserved = 0; + ah->spi = x->id.spi; + ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, 0, skb->len); + + ahash_request_set_crypt(req, sg, icv, skb->len); + ahash_request_set_callback(req, 0, ah_output_done, skb); + + AH_SKB_CB(skb)->tmp = iph; + + err = crypto_ahash_digest(req); + if (err) { + if (err == -EINPROGRESS) + goto out; + + if (err == -EBUSY) + err = NET_XMIT_DROP; + goto out_free; + } + + memcpy(ah->auth_data, icv, ahp->icv_trunc_len); + + 
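[Editor's note] The ah->hdrlen computed in ah_output() above is the RFC 4302 "payload length": the AH header size in 32-bit words minus 2. For HMAC-SHA1-96 the 12-byte fixed header plus the 12-byte truncated ICV aligns to 24 bytes, so hdrlen = 24/4 - 2 = 4. A small worked check (editor's illustration; the sizes are written as constants rather than taken from the kernel structs):

/* Editor's illustration of the ah->hdrlen computation in ah_output(). */
#include <stdio.h>

#define ALIGN8(x) (((x) + 7u) & ~7u)

int main(void)
{
	unsigned int fixed_hdr = 12;   /* nexthdr, hdrlen, reserved, spi, seq_no */
	unsigned int icv_trunc = 12;   /* HMAC-SHA1-96 truncated ICV */
	unsigned int total = ALIGN8(fixed_hdr + icv_trunc);
	unsigned int hdrlen = (total >> 2) - 2;

	printf("AH on-wire length %u bytes, hdrlen field %u\n", total, hdrlen);
	/* prints: AH on-wire length 24 bytes, hdrlen field 4 */
	return 0;
}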
top_iph->tos = iph->tos; + top_iph->ttl = iph->ttl; + top_iph->frag_off = iph->frag_off; + if (top_iph->ihl != 5) { + top_iph->daddr = iph->daddr; + memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); + } + +out_free: + kfree(iph); +out: + return err; +} + +static void ah_input_done(struct crypto_async_request *base, int err) +{ + u8 *auth_data; + u8 *icv; + struct iphdr *work_iph; + struct sk_buff *skb = base->data; + struct xfrm_state *x = xfrm_input_state(skb); + struct ah_data *ahp = x->data; + struct ip_auth_hdr *ah = ip_auth_hdr(skb); + int ihl = ip_hdrlen(skb); + int ah_hlen = (ah->hdrlen + 2) << 2; + + work_iph = AH_SKB_CB(skb)->tmp; + auth_data = ah_tmp_auth(work_iph, ihl); + icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len); + + err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0; + if (err) + goto out; + + err = ah->nexthdr; + + skb->network_header += ah_hlen; + memcpy(skb_network_header(skb), work_iph, ihl); + __skb_pull(skb, ah_hlen + ihl); + skb_set_transport_header(skb, -ihl); +out: + kfree(AH_SKB_CB(skb)->tmp); + xfrm_input_resume(skb, err); +} + +static int ah_input(struct xfrm_state *x, struct sk_buff *skb) +{ + int ah_hlen; + int ihl; + int nexthdr; + int nfrags; + u8 *auth_data; + u8 *icv; + struct sk_buff *trailer; + struct crypto_ahash *ahash; + struct ahash_request *req; + struct scatterlist *sg; + struct iphdr *iph, *work_iph; + struct ip_auth_hdr *ah; + struct ah_data *ahp; + int err = -ENOMEM; + + if (!pskb_may_pull(skb, sizeof(*ah))) + goto out; + + ah = (struct ip_auth_hdr *)skb->data; + ahp = x->data; + ahash = ahp->ahash; + + nexthdr = ah->nexthdr; + ah_hlen = (ah->hdrlen + 2) << 2; + + if (x->props.flags & XFRM_STATE_ALIGN4) { + if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) && + ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len)) + goto out; + } else { + if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) && + ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len)) + goto out; + } + + if (!pskb_may_pull(skb, ah_hlen)) + goto out; + + /* We are going to _remove_ AH header to keep sockets happy, + * so... Later this can change. */ + if (skb_cloned(skb) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto out; + + skb->ip_summed = CHECKSUM_NONE; + + + if ((err = skb_cow_data(skb, 0, &trailer)) < 0) + goto out; + nfrags = err; + + ah = (struct ip_auth_hdr *)skb->data; + iph = ip_hdr(skb); + ihl = ip_hdrlen(skb); + + work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len); + if (!work_iph) + goto out; + + auth_data = ah_tmp_auth(work_iph, ihl); + icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len); + req = ah_tmp_req(ahash, icv); + sg = ah_req_sg(ahash, req); + + memcpy(work_iph, iph, ihl); + memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); + memset(ah->auth_data, 0, ahp->icv_trunc_len); + + iph->ttl = 0; + iph->tos = 0; + iph->frag_off = 0; + iph->check = 0; + if (ihl > sizeof(*iph)) { + __be32 dummy; + err = ip_clear_mutable_options(iph, &dummy); + if (err) + goto out_free; + } + + skb_push(skb, ihl); + + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, 0, skb->len); + + ahash_request_set_crypt(req, sg, icv, skb->len); + ahash_request_set_callback(req, 0, ah_input_done, skb); + + AH_SKB_CB(skb)->tmp = work_iph; + + err = crypto_ahash_digest(req); + if (err) { + if (err == -EINPROGRESS) + goto out; + + goto out_free; + } + + err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? 
-EBADMSG: 0; + if (err) + goto out_free; + + skb->network_header += ah_hlen; + memcpy(skb_network_header(skb), work_iph, ihl); + __skb_pull(skb, ah_hlen + ihl); + skb_set_transport_header(skb, -ihl); + + err = nexthdr; + +out_free: + kfree (work_iph); +out: + return err; +} + +static void ah4_err(struct sk_buff *skb, u32 info) +{ + struct net *net = dev_net(skb->dev); + const struct iphdr *iph = (const struct iphdr *)skb->data; + struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); + struct xfrm_state *x; + + if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || + icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return; + + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + ah->spi, IPPROTO_AH, AF_INET); + if (!x) + return; + printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n", + ntohl(ah->spi), ntohl(iph->daddr)); + xfrm_state_put(x); +} + +static int ah_init_state(struct xfrm_state *x) +{ + struct ah_data *ahp = NULL; + struct xfrm_algo_desc *aalg_desc; + struct crypto_ahash *ahash; + + if (!x->aalg) + goto error; + + if (x->encap) + goto error; + + ahp = kzalloc(sizeof(*ahp), GFP_KERNEL); + if (!ahp) + return -ENOMEM; + + ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0); + if (IS_ERR(ahash)) + goto error; + + ahp->ahash = ahash; + if (crypto_ahash_setkey(ahash, x->aalg->alg_key, + (x->aalg->alg_key_len + 7) / 8)) + goto error; + + /* + * Lookup the algorithm description maintained by xfrm_algo, + * verify crypto transform properties, and store information + * we need for AH processing. This lookup cannot fail here + * after a successful crypto_alloc_ahash(). + */ + aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); + BUG_ON(!aalg_desc); + + if (aalg_desc->uinfo.auth.icv_fullbits/8 != + crypto_ahash_digestsize(ahash)) { + printk(KERN_INFO "AH: %s digestsize %u != %hu\n", + x->aalg->alg_name, crypto_ahash_digestsize(ahash), + aalg_desc->uinfo.auth.icv_fullbits/8); + goto error; + } + + ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; + ahp->icv_trunc_len = x->aalg->alg_trunc_len/8; + + BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); + + if (x->props.flags & XFRM_STATE_ALIGN4) + x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) + + ahp->icv_trunc_len); + else + x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + + ahp->icv_trunc_len); + if (x->props.mode == XFRM_MODE_TUNNEL) + x->props.header_len += sizeof(struct iphdr); + x->data = ahp; + + return 0; + +error: + if (ahp) { + crypto_free_ahash(ahp->ahash); + kfree(ahp); + } + return -EINVAL; +} + +static void ah_destroy(struct xfrm_state *x) +{ + struct ah_data *ahp = x->data; + + if (!ahp) + return; + + crypto_free_ahash(ahp->ahash); + kfree(ahp); +} + + +static const struct xfrm_type ah_type = +{ + .description = "AH4", + .owner = THIS_MODULE, + .proto = IPPROTO_AH, + .flags = XFRM_TYPE_REPLAY_PROT, + .init_state = ah_init_state, + .destructor = ah_destroy, + .input = ah_input, + .output = ah_output +}; + +static const struct net_protocol ah4_protocol = { + .handler = xfrm4_rcv, + .err_handler = ah4_err, + .no_policy = 1, + .netns_ok = 1, +}; + +static int __init ah4_init(void) +{ + if (xfrm_register_type(&ah_type, AF_INET) < 0) { + printk(KERN_INFO "ip ah init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) { + printk(KERN_INFO "ip ah init: can't add protocol\n"); + xfrm_unregister_type(&ah_type, AF_INET); + return -EAGAIN; + } + return 0; +} + +static void __exit ah4_fini(void) +{ + if 
(inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0) + printk(KERN_INFO "ip ah close: can't remove protocol\n"); + if (xfrm_unregister_type(&ah_type, AF_INET) < 0) + printk(KERN_INFO "ip ah close: can't remove xfrm type\n"); +} + +module_init(ah4_init); +module_exit(ah4_fini); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_AH); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c new file mode 100644 index 00000000..d8f852db --- /dev/null +++ b/net/ipv4/arp.c @@ -0,0 +1,1490 @@ +/* linux/net/ipv4/arp.c + * + * Copyright (C) 1994 by Florian La Roche + * + * This module implements the Address Resolution Protocol ARP (RFC 826), + * which is used to convert IP addresses (or in the future maybe other + * high-level addresses) into a low-level hardware address (like an Ethernet + * address). + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Alan Cox : Removed the Ethernet assumptions in + * Florian's code + * Alan Cox : Fixed some small errors in the ARP + * logic + * Alan Cox : Allow >4K in /proc + * Alan Cox : Make ARP add its own protocol entry + * Ross Martin : Rewrote arp_rcv() and arp_get_info() + * Stephen Henson : Add AX25 support to arp_get_info() + * Alan Cox : Drop data when a device is downed. + * Alan Cox : Use init_timer(). + * Alan Cox : Double lock fixes. + * Martin Seine : Move the arphdr structure + * to if_arp.h for compatibility. + * with BSD based programs. + * Andrew Tridgell : Added ARP netmask code and + * re-arranged proxy handling. + * Alan Cox : Changed to use notifiers. + * Niibe Yutaka : Reply for this device or proxies only. + * Alan Cox : Don't proxy across hardware types! + * Jonathan Naylor : Added support for NET/ROM. + * Mike Shaver : RFC1122 checks. + * Jonathan Naylor : Only lookup the hardware address for + * the correct hardware type. + * Germano Caronni : Assorted subtle races. + * Craig Schlenter : Don't modify permanent entry + * during arp_rcv. + * Russ Nelson : Tidied up a few bits. + * Alexey Kuznetsov: Major changes to caching and behaviour, + * eg intelligent arp probing and + * generation + * of host down events. + * Alan Cox : Missing unlock in device events. + * Eckes : ARP ioctl control errors. + * Alexey Kuznetsov: Arp free fix. + * Manuel Rodriguez: Gratuitous ARP. + * Jonathan Layes : Added arpd support through kerneld + * message queue (960314) + * Mike Shaver : /proc/sys/net/ipv4/arp_* support + * Mike McLagan : Routing by source + * Stuart Cheshire : Metricom and grat arp fixes + * *** FOR 2.1 clean this up *** + * Lawrence V. Stefani: (08/12/96) Added FDDI support. + * Alan Cox : Took the AP1000 nasty FDDI hack and + * folded into the mainstream FDDI code. + * Ack spit, Linus how did you allow that + * one in... + * Jes Sorensen : Make FDDI work again in 2.1.x and + * clean up the APFDDI & gen. FDDI bits. + * Alexey Kuznetsov: new arp state machine; + * now it is in net/core/neighbour.c. + * Krzysztof Halasa: Added Frame Relay ARP support. + * Arnaldo C. Melo : convert /proc/net/arp to seq_file + * Shmulik Hen: Split arp_send to arp_create and + * arp_xmit so intermediate drivers like + * bonding can change the skb before + * sending (e.g. insert 8021q tag). + * Harald Welte : convert to make use of jenkins hash + * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_SYSCTL +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) +#include +struct neigh_table *clip_tbl_hook; +EXPORT_SYMBOL(clip_tbl_hook); +#endif + +#include +#include + +#include + +/* + * Interface to generic neighbour cache. + */ +static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 rnd); +static int arp_constructor(struct neighbour *neigh); +static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); +static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); +static void parp_redo(struct sk_buff *skb); + +static const struct neigh_ops arp_generic_ops = { + .family = AF_INET, + .solicit = arp_solicit, + .error_report = arp_error_report, + .output = neigh_resolve_output, + .connected_output = neigh_connected_output, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + +static const struct neigh_ops arp_hh_ops = { + .family = AF_INET, + .solicit = arp_solicit, + .error_report = arp_error_report, + .output = neigh_resolve_output, + .connected_output = neigh_resolve_output, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + +static const struct neigh_ops arp_direct_ops = { + .family = AF_INET, + .output = dev_queue_xmit, + .connected_output = dev_queue_xmit, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + +static const struct neigh_ops arp_broken_ops = { + .family = AF_INET, + .solicit = arp_solicit, + .error_report = arp_error_report, + .output = neigh_compat_output, + .connected_output = neigh_compat_output, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + +struct neigh_table arp_tbl = { + .family = AF_INET, + .entry_size = sizeof(struct neighbour) + 4, + .key_len = 4, + .hash = arp_hash, + .constructor = arp_constructor, + .proxy_redo = parp_redo, + .id = "arp_cache", + .parms = { + .tbl = &arp_tbl, + .base_reachable_time = 30 * HZ, + .retrans_time = 1 * HZ, + .gc_staletime = 60 * HZ, + .reachable_time = 30 * HZ, + .delay_probe_time = 5 * HZ, + .queue_len = 3, + .ucast_probes = 3, + .mcast_probes = 3, + .anycast_delay = 1 * HZ, + .proxy_delay = (8 * HZ) / 10, + .proxy_qlen = 64, + .locktime = 1 * HZ, + }, + .gc_interval = 30 * HZ, + .gc_thresh1 = 128, + .gc_thresh2 = 512, + .gc_thresh3 = 1024, +}; +EXPORT_SYMBOL(arp_tbl); + +int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) +{ + switch (dev->type) { + case ARPHRD_ETHER: + case ARPHRD_FDDI: + case ARPHRD_IEEE802: + ip_eth_mc_map(addr, haddr); + return 0; + case ARPHRD_IEEE802_TR: + ip_tr_mc_map(addr, haddr); + return 0; + case ARPHRD_INFINIBAND: + ip_ib_mc_map(addr, dev->broadcast, haddr); + return 0; + case ARPHRD_IPGRE: + ip_ipgre_mc_map(addr, dev->broadcast, haddr); + return 0; + default: + if (dir) { + memcpy(haddr, dev->broadcast, dev->addr_len); + return 0; + } + } + return -EINVAL; +} + + +static u32 arp_hash(const void *pkey, + const struct net_device *dev, + __u32 hash_rnd) +{ + return jhash_2words(*(u32 *)pkey, dev->ifindex, hash_rnd); +} + +static int arp_constructor(struct neighbour *neigh) +{ + __be32 addr = *(__be32 *)neigh->primary_key; + struct net_device *dev = neigh->dev; 
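[Editor's note] arp_mc_map() above resolves multicast destinations without any ARP exchange by deriving the hardware address directly from the group address; for Ethernet, ip_eth_mc_map() places the low 23 bits of the group behind the fixed 01:00:5e prefix. A standalone sketch of that mapping (editor's illustration, not the kernel helper itself; ip_mc_to_ether is a made-up name):

/* Editor's illustration of the Ethernet multicast mapping performed by
 * ip_eth_mc_map() for arp_mc_map(): 01:00:5e + low 23 bits of the group. */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

static void ip_mc_to_ether(uint32_t group_be, unsigned char mac[6])
{
	uint32_t group = ntohl(group_be);

	mac[0] = 0x01;
	mac[1] = 0x00;
	mac[2] = 0x5e;
	mac[3] = (group >> 16) & 0x7f;   /* only the 23 low-order bits survive */
	mac[4] = (group >> 8) & 0xff;
	mac[5] = group & 0xff;
}

int main(void)
{
	unsigned char mac[6];
	struct in_addr grp;

	inet_pton(AF_INET, "224.0.0.251", &grp);   /* mDNS group, as an example */
	ip_mc_to_ether(grp.s_addr, mac);
	printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
	       mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
	/* prints 01:00:5e:00:00:fb */
	return 0;
}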
+ struct in_device *in_dev; + struct neigh_parms *parms; + + rcu_read_lock(); + in_dev = __in_dev_get_rcu(dev); + if (in_dev == NULL) { + rcu_read_unlock(); + return -EINVAL; + } + + neigh->type = inet_addr_type(dev_net(dev), addr); + + parms = in_dev->arp_parms; + __neigh_parms_put(neigh->parms); + neigh->parms = neigh_parms_clone(parms); + rcu_read_unlock(); + + if (!dev->header_ops) { + neigh->nud_state = NUD_NOARP; + neigh->ops = &arp_direct_ops; + neigh->output = neigh->ops->queue_xmit; + } else { + /* Good devices (checked by reading texts, but only Ethernet is + tested) + + ARPHRD_ETHER: (ethernet, apfddi) + ARPHRD_FDDI: (fddi) + ARPHRD_IEEE802: (tr) + ARPHRD_METRICOM: (strip) + ARPHRD_ARCNET: + etc. etc. etc. + + ARPHRD_IPDDP will also work, if author repairs it. + I did not it, because this driver does not work even + in old paradigm. + */ + +#if 1 + /* So... these "amateur" devices are hopeless. + The only thing, that I can say now: + It is very sad that we need to keep ugly obsolete + code to make them happy. + + They should be moved to more reasonable state, now + they use rebuild_header INSTEAD OF hard_start_xmit!!! + Besides that, they are sort of out of date + (a lot of redundant clones/copies, useless in 2.1), + I wonder why people believe that they work. + */ + switch (dev->type) { + default: + break; + case ARPHRD_ROSE: +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + case ARPHRD_AX25: +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) + case ARPHRD_NETROM: +#endif + neigh->ops = &arp_broken_ops; + neigh->output = neigh->ops->output; + return 0; +#else + break; +#endif + } +#endif + if (neigh->type == RTN_MULTICAST) { + neigh->nud_state = NUD_NOARP; + arp_mc_map(addr, neigh->ha, dev, 1); + } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) { + neigh->nud_state = NUD_NOARP; + memcpy(neigh->ha, dev->dev_addr, dev->addr_len); + } else if (neigh->type == RTN_BROADCAST || + (dev->flags & IFF_POINTOPOINT)) { + neigh->nud_state = NUD_NOARP; + memcpy(neigh->ha, dev->broadcast, dev->addr_len); + } + + if (dev->header_ops->cache) + neigh->ops = &arp_hh_ops; + else + neigh->ops = &arp_generic_ops; + + if (neigh->nud_state & NUD_VALID) + neigh->output = neigh->ops->connected_output; + else + neigh->output = neigh->ops->output; + } + return 0; +} + +static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb) +{ + dst_link_failure(skb); + kfree_skb(skb); +} + +static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) +{ + __be32 saddr = 0; + u8 *dst_ha = NULL; + struct net_device *dev = neigh->dev; + __be32 target = *(__be32 *)neigh->primary_key; + int probes = atomic_read(&neigh->probes); + struct in_device *in_dev; + + rcu_read_lock(); + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) { + rcu_read_unlock(); + return; + } + switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { + default: + case 0: /* By default announce any local IP */ + if (skb && inet_addr_type(dev_net(dev), + ip_hdr(skb)->saddr) == RTN_LOCAL) + saddr = ip_hdr(skb)->saddr; + break; + case 1: /* Restrict announcements of saddr in same subnet */ + if (!skb) + break; + saddr = ip_hdr(skb)->saddr; + if (inet_addr_type(dev_net(dev), saddr) == RTN_LOCAL) { + /* saddr should be known to target */ + if (inet_addr_onlink(in_dev, target, saddr)) + break; + } + saddr = 0; + break; + case 2: /* Avoid secondary IPs, get a primary/preferred one */ + break; + } + rcu_read_unlock(); + + if (!saddr) + saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); + + probes -= neigh->parms->ucast_probes; + if 
(probes < 0) { + if (!(neigh->nud_state & NUD_VALID)) + printk(KERN_DEBUG + "trying to ucast probe in NUD_INVALID\n"); + dst_ha = neigh->ha; + read_lock_bh(&neigh->lock); + } else { + probes -= neigh->parms->app_probes; + if (probes < 0) { +#ifdef CONFIG_ARPD + neigh_app_ns(neigh); +#endif + return; + } + } + + arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, + dst_ha, dev->dev_addr, NULL); + if (dst_ha) + read_unlock_bh(&neigh->lock); +} + +static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) +{ + int scope; + + switch (IN_DEV_ARP_IGNORE(in_dev)) { + case 0: /* Reply, the tip is already validated */ + return 0; + case 1: /* Reply only if tip is configured on the incoming interface */ + sip = 0; + scope = RT_SCOPE_HOST; + break; + case 2: /* + * Reply only if tip is configured on the incoming interface + * and is in same subnet as sip + */ + scope = RT_SCOPE_HOST; + break; + case 3: /* Do not reply for scope host addresses */ + sip = 0; + scope = RT_SCOPE_LINK; + break; + case 4: /* Reserved */ + case 5: + case 6: + case 7: + return 0; + case 8: /* Do not reply */ + return 1; + default: + return 0; + } + return !inet_confirm_addr(in_dev, sip, tip, scope); +} + +static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) +{ + struct rtable *rt; + int flag = 0; + /*unsigned long now; */ + struct net *net = dev_net(dev); + + rt = ip_route_output(net, sip, tip, 0, 0); + if (IS_ERR(rt)) + return 1; + if (rt->dst.dev != dev) { + NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER); + flag = 1; + } + ip_rt_put(rt); + return flag; +} + +/* OBSOLETE FUNCTIONS */ + +/* + * Find an arp mapping in the cache. If not found, post a request. + * + * It is very UGLY routine: it DOES NOT use skb->dst->neighbour, + * even if it exists. It is supposed that skb->dev was mangled + * by a virtual device (eql, shaper). Nobody but broken devices + * is allowed to use this function, it is scheduled to be removed. --ANK + */ + +static int arp_set_predefined(int addr_hint, unsigned char *haddr, + __be32 paddr, struct net_device *dev) +{ + switch (addr_hint) { + case RTN_LOCAL: + printk(KERN_DEBUG "ARP: arp called for own IP address\n"); + memcpy(haddr, dev->dev_addr, dev->addr_len); + return 1; + case RTN_MULTICAST: + arp_mc_map(paddr, haddr, dev, 1); + return 1; + case RTN_BROADCAST: + memcpy(haddr, dev->broadcast, dev->addr_len); + return 1; + } + return 0; +} + + +int arp_find(unsigned char *haddr, struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + __be32 paddr; + struct neighbour *n; + + if (!skb_dst(skb)) { + printk(KERN_DEBUG "arp_find is called with dst==NULL\n"); + kfree_skb(skb); + return 1; + } + + paddr = skb_rtable(skb)->rt_gateway; + + if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, + paddr, dev)) + return 0; + + n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); + + if (n) { + n->used = jiffies; + if (n->nud_state & NUD_VALID || neigh_event_send(n, skb) == 0) { + neigh_ha_snapshot(haddr, n, dev); + neigh_release(n); + return 0; + } + neigh_release(n); + } else + kfree_skb(skb); + return 1; +} +EXPORT_SYMBOL(arp_find); + +/* END OF OBSOLETE FUNCTIONS */ + +struct neighbour *__arp_bind_neighbour(struct dst_entry *dst, __be32 nexthop) +{ + struct net_device *dev = dst->dev; + + if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) + nexthop = 0; + return __neigh_lookup_errno( +#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) + dev->type == ARPHRD_ATM ? 
+ clip_tbl_hook : +#endif + &arp_tbl, &nexthop, dev); +} + +int arp_bind_neighbour(struct dst_entry *dst) +{ + struct net_device *dev = dst->dev; + struct neighbour *n = dst_get_neighbour(dst); + + if (dev == NULL) + return -EINVAL; + if (n == NULL) { + n = __arp_bind_neighbour(dst, ((struct rtable *)dst)->rt_gateway); + if (IS_ERR(n)) + return PTR_ERR(n); + dst_set_neighbour(dst, n); + } + return 0; +} + +/* + * Check if we can use proxy ARP for this path + */ +static inline int arp_fwd_proxy(struct in_device *in_dev, + struct net_device *dev, struct rtable *rt) +{ + struct in_device *out_dev; + int imi, omi = -1; + + if (rt->dst.dev == dev) + return 0; + + if (!IN_DEV_PROXY_ARP(in_dev)) + return 0; + imi = IN_DEV_MEDIUM_ID(in_dev); + if (imi == 0) + return 1; + if (imi == -1) + return 0; + + /* place to check for proxy_arp for routes */ + + out_dev = __in_dev_get_rcu(rt->dst.dev); + if (out_dev) + omi = IN_DEV_MEDIUM_ID(out_dev); + + return omi != imi && omi != -1; +} + +/* + * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev) + * + * RFC3069 supports proxy arp replies back to the same interface. This + * is done to support (ethernet) switch features, like RFC 3069, where + * the individual ports are not allowed to communicate with each + * other, BUT they are allowed to talk to the upstream router. As + * described in RFC 3069, it is possible to allow these hosts to + * communicate through the upstream router, by proxy_arp'ing. + * + * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation" + * + * This technology is known by different names: + * In RFC 3069 it is called VLAN Aggregation. + * Cisco and Allied Telesyn call it Private VLAN. + * Hewlett-Packard call it Source-Port filtering or port-isolation. + * Ericsson call it MAC-Forced Forwarding (RFC Draft). + * + */ +static inline int arp_fwd_pvlan(struct in_device *in_dev, + struct net_device *dev, struct rtable *rt, + __be32 sip, __be32 tip) +{ + /* Private VLAN is only concerned about the same ethernet segment */ + if (rt->dst.dev != dev) + return 0; + + /* Don't reply on self probes (often done by windowz boxes)*/ + if (sip == tip) + return 0; + + if (IN_DEV_PROXY_ARP_PVLAN(in_dev)) + return 1; + else + return 0; +} + +/* + * Interface to link layer: send routine and receive handler. + */ + +/* + * Create an arp packet. If (dest_hw == NULL), we create a broadcast + * message. + */ +struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, + struct net_device *dev, __be32 src_ip, + const unsigned char *dest_hw, + const unsigned char *src_hw, + const unsigned char *target_hw) +{ + struct sk_buff *skb; + struct arphdr *arp; + unsigned char *arp_ptr; + + /* + * Allocate a buffer + */ + + skb = alloc_skb(arp_hdr_len(dev) + LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); + if (skb == NULL) + return NULL; + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb_reset_network_header(skb); + arp = (struct arphdr *) skb_put(skb, arp_hdr_len(dev)); + skb->dev = dev; + skb->protocol = htons(ETH_P_ARP); + if (src_hw == NULL) + src_hw = dev->dev_addr; + if (dest_hw == NULL) + dest_hw = dev->broadcast; + + /* + * Fill the device header for the ARP frame + */ + if (dev_hard_header(skb, dev, ptype, dest_hw, src_hw, skb->len) < 0) + goto out; + + /* + * Fill out the arp protocol part. + * + * The arp hardware type should match the device type, except for FDDI, + * which (according to RFC 1390) should always equal 1 (Ethernet). + */ + /* + * Exceptions everywhere. 
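
[Editor's note] arp_create() above has allocated the skb and is about to fill in the protocol part; for the common Ethernet/IPv4 case the payload it builds is the familiar 28-byte ARP packet. The sketch below shows that layout in isolation. The packed struct is purely illustrative: the kernel's struct arphdr stops at ar_op and the addresses are appended through arp_ptr, as the code does.

/* Illustrative sketch of the 28-byte Ethernet/IPv4 ARP payload that
 * arp_create() assembles.  The packed struct is for illustration only.
 */
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

struct arp_eth_ipv4 {
	uint16_t ar_hrd;	/* hardware type: 1 = Ethernet */
	uint16_t ar_pro;	/* protocol type: 0x0800 = IPv4 */
	uint8_t  ar_hln;	/* hardware address length: 6 */
	uint8_t  ar_pln;	/* protocol address length: 4 */
	uint16_t ar_op;		/* 1 = request, 2 = reply */
	uint8_t  ar_sha[6];	/* sender hardware address */
	uint8_t  ar_sip[4];	/* sender IP address */
	uint8_t  ar_tha[6];	/* target hardware address */
	uint8_t  ar_tip[4];	/* target IP address */
} __attribute__((packed));

static void build_request(struct arp_eth_ipv4 *a, const uint8_t *src_hw,
			  uint32_t src_ip, uint32_t dest_ip)
{
	memset(a, 0, sizeof(*a));	/* unknown target HW stays all-zero */
	a->ar_hrd = htons(1);
	a->ar_pro = htons(0x0800);
	a->ar_hln = 6;
	a->ar_pln = 4;
	a->ar_op  = htons(1);		/* ARPOP_REQUEST */
	memcpy(a->ar_sha, src_hw, 6);
	memcpy(a->ar_sip, &src_ip, 4);	/* already in network byte order */
	memcpy(a->ar_tip, &dest_ip, 4);
}

int main(void)
{
	/* example addresses only */
	uint8_t mac[6] = { 0x52, 0x54, 0x00, 0x12, 0x34, 0x56 };
	struct arp_eth_ipv4 req;

	build_request(&req, mac, inet_addr("192.168.1.10"),
		      inet_addr("192.168.1.1"));
	return sizeof(req) == 28 ? 0 : 1;
}
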
AX.25 uses the AX.25 PID value not the + * DIX code for the protocol. Make these device structure fields. + */ + switch (dev->type) { + default: + arp->ar_hrd = htons(dev->type); + arp->ar_pro = htons(ETH_P_IP); + break; + +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + case ARPHRD_AX25: + arp->ar_hrd = htons(ARPHRD_AX25); + arp->ar_pro = htons(AX25_P_IP); + break; + +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) + case ARPHRD_NETROM: + arp->ar_hrd = htons(ARPHRD_NETROM); + arp->ar_pro = htons(AX25_P_IP); + break; +#endif +#endif + +#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE) + case ARPHRD_FDDI: + arp->ar_hrd = htons(ARPHRD_ETHER); + arp->ar_pro = htons(ETH_P_IP); + break; +#endif +#if defined(CONFIG_TR) || defined(CONFIG_TR_MODULE) + case ARPHRD_IEEE802_TR: + arp->ar_hrd = htons(ARPHRD_IEEE802); + arp->ar_pro = htons(ETH_P_IP); + break; +#endif + } + + arp->ar_hln = dev->addr_len; + arp->ar_pln = 4; + arp->ar_op = htons(type); + + arp_ptr = (unsigned char *)(arp + 1); + + memcpy(arp_ptr, src_hw, dev->addr_len); + arp_ptr += dev->addr_len; + memcpy(arp_ptr, &src_ip, 4); + arp_ptr += 4; + if (target_hw != NULL) + memcpy(arp_ptr, target_hw, dev->addr_len); + else + memset(arp_ptr, 0, dev->addr_len); + arp_ptr += dev->addr_len; + memcpy(arp_ptr, &dest_ip, 4); + + return skb; + +out: + kfree_skb(skb); + return NULL; +} +EXPORT_SYMBOL(arp_create); + +/* + * Send an arp packet. + */ +void arp_xmit(struct sk_buff *skb) +{ + /* Send it off, maybe filter it using firewalling first. */ + NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit); +} +EXPORT_SYMBOL(arp_xmit); + +/* + * Create and send an arp packet. + */ +void arp_send(int type, int ptype, __be32 dest_ip, + struct net_device *dev, __be32 src_ip, + const unsigned char *dest_hw, const unsigned char *src_hw, + const unsigned char *target_hw) +{ + struct sk_buff *skb; + + /* + * No arp on this interface. + */ + + if (dev->flags&IFF_NOARP) + return; + + skb = arp_create(type, ptype, dest_ip, dev, src_ip, + dest_hw, src_hw, target_hw); + if (skb == NULL) + return; + + arp_xmit(skb); +} +EXPORT_SYMBOL(arp_send); + +/* + * Process an arp request. + */ + +static int arp_process(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct in_device *in_dev = __in_dev_get_rcu(dev); + struct arphdr *arp; + unsigned char *arp_ptr; + struct rtable *rt; + unsigned char *sha; + __be32 sip, tip; + u16 dev_type = dev->type; + int addr_type; + struct neighbour *n; + struct net *net = dev_net(dev); + + /* arp_rcv below verifies the ARP header and verifies the device + * is ARP'able. + */ + + if (in_dev == NULL) + goto out; + + arp = arp_hdr(skb); + + switch (dev_type) { + default: + if (arp->ar_pro != htons(ETH_P_IP) || + htons(dev_type) != arp->ar_hrd) + goto out; + break; + case ARPHRD_ETHER: + case ARPHRD_IEEE802_TR: + case ARPHRD_FDDI: + case ARPHRD_IEEE802: + /* + * ETHERNET, Token Ring and Fibre Channel (which are IEEE 802 + * devices, according to RFC 2625) devices will accept ARP + * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2). 
+ * This is the case also of FDDI, where the RFC 1390 says that + * FDDI devices should accept ARP hardware of (1) Ethernet, + * however, to be more robust, we'll accept both 1 (Ethernet) + * or 6 (IEEE 802.2) + */ + if ((arp->ar_hrd != htons(ARPHRD_ETHER) && + arp->ar_hrd != htons(ARPHRD_IEEE802)) || + arp->ar_pro != htons(ETH_P_IP)) + goto out; + break; + case ARPHRD_AX25: + if (arp->ar_pro != htons(AX25_P_IP) || + arp->ar_hrd != htons(ARPHRD_AX25)) + goto out; + break; + case ARPHRD_NETROM: + if (arp->ar_pro != htons(AX25_P_IP) || + arp->ar_hrd != htons(ARPHRD_NETROM)) + goto out; + break; + } + + /* Understand only these message types */ + + if (arp->ar_op != htons(ARPOP_REPLY) && + arp->ar_op != htons(ARPOP_REQUEST)) + goto out; + +/* + * Extract fields + */ + arp_ptr = (unsigned char *)(arp + 1); + sha = arp_ptr; + arp_ptr += dev->addr_len; + memcpy(&sip, arp_ptr, 4); + arp_ptr += 4; + arp_ptr += dev->addr_len; + memcpy(&tip, arp_ptr, 4); +/* + * Check for bad requests for 127.x.x.x and requests for multicast + * addresses. If this is one such, delete it. + */ + if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) + goto out; + +/* + * Special case: We must set Frame Relay source Q.922 address + */ + if (dev_type == ARPHRD_DLCI) + sha = dev->broadcast; + +/* + * Process entry. The idea here is we want to send a reply if it is a + * request for us or if it is a request for someone else that we hold + * a proxy for. We want to add an entry to our cache if it is a reply + * to us or if it is a request for our address. + * (The assumption for this last is that if someone is requesting our + * address, they are probably intending to talk to us, so it saves time + * if we cache their address. Their address is also probably not in + * our cache, since ours is not in their cache.) + * + * Putting this another way, we only care about replies if they are to + * us, in which case we add them to the cache. For requests, we care + * about those for us and those for our proxies. We reply to both, + * and in the case of requests for us we add the requester to the arp + * cache. 
+ */ + + /* Special case: IPv4 duplicate address detection packet (RFC2131) */ + if (sip == 0) { + if (arp->ar_op == htons(ARPOP_REQUEST) && + inet_addr_type(net, tip) == RTN_LOCAL && + !arp_ignore(in_dev, sip, tip)) + arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, + dev->dev_addr, sha); + goto out; + } + + if (arp->ar_op == htons(ARPOP_REQUEST) && + ip_route_input_noref(skb, tip, sip, 0, dev) == 0) { + + rt = skb_rtable(skb); + addr_type = rt->rt_type; + + if (addr_type == RTN_LOCAL) { + int dont_send; + + dont_send = arp_ignore(in_dev, sip, tip); + if (!dont_send && IN_DEV_ARPFILTER(in_dev)) + dont_send = arp_filter(sip, tip, dev); + if (!dont_send) { + n = neigh_event_ns(&arp_tbl, sha, &sip, dev); + if (n) { + arp_send(ARPOP_REPLY, ETH_P_ARP, sip, + dev, tip, sha, dev->dev_addr, + sha); + neigh_release(n); + } + } + goto out; + } else if (IN_DEV_FORWARD(in_dev)) { + if (addr_type == RTN_UNICAST && + (arp_fwd_proxy(in_dev, dev, rt) || + arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || + (rt->dst.dev != dev && + pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) { + n = neigh_event_ns(&arp_tbl, sha, &sip, dev); + if (n) + neigh_release(n); + + if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || + skb->pkt_type == PACKET_HOST || + in_dev->arp_parms->proxy_delay == 0) { + arp_send(ARPOP_REPLY, ETH_P_ARP, sip, + dev, tip, sha, dev->dev_addr, + sha); + } else { + pneigh_enqueue(&arp_tbl, + in_dev->arp_parms, skb); + return 0; + } + goto out; + } + } + } + + /* Update our ARP tables */ + + n = __neigh_lookup(&arp_tbl, &sip, dev, 0); + + if (IPV4_DEVCONF_ALL(dev_net(dev), ARP_ACCEPT)) { + /* Unsolicited ARP is not accepted by default. + It is possible, that this option should be enabled for some + devices (strip is candidate) + */ + if (n == NULL && + (arp->ar_op == htons(ARPOP_REPLY) || + (arp->ar_op == htons(ARPOP_REQUEST) && tip == sip)) && + inet_addr_type(net, sip) == RTN_UNICAST) + n = __neigh_lookup(&arp_tbl, &sip, dev, 1); + } + + if (n) { + int state = NUD_REACHABLE; + int override; + + /* If several different ARP replies follows back-to-back, + use the FIRST one. It is possible, if several proxy + agents are active. Taking the first reply prevents + arp trashing and chooses the fastest router. + */ + override = time_after(jiffies, n->updated + n->parms->locktime); + + /* Broadcast replies and request packets + do not assert neighbour reachability. + */ + if (arp->ar_op != htons(ARPOP_REPLY) || + skb->pkt_type != PACKET_HOST) + state = NUD_STALE; + neigh_update(n, sha, state, + override ? NEIGH_UPDATE_F_OVERRIDE : 0); + neigh_release(n); + } + +out: + consume_skb(skb); + return 0; +} + +static void parp_redo(struct sk_buff *skb) +{ + arp_process(skb); +} + + +/* + * Receive an arp request from the device layer. + */ + +static int arp_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct arphdr *arp; + + /* ARP header, plus 2 device addresses, plus 2 IP addresses. 
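
[Editor's note] The neigh_update() call above only lets a fresh lladdr override an existing entry once parms->locktime has expired, and it checks that with time_after() so the comparison stays correct across a jiffies wrap. A tiny userspace sketch of that wrap-safe idiom:

/* Sketch of the wrap-safe comparison idiom behind time_after(), as used
 * for the locktime check in arp_process().
 */
#include <stdio.h>

/* true if a is after b, even if the counter has wrapped */
#define time_after(a, b)  ((long)((b) - (a)) < 0)

int main(void)
{
	unsigned long updated  = (unsigned long)-21;	/* 20 ticks before wrap */
	unsigned long deadline = updated + 10;		/* still before wrap */
	unsigned long now      = 5;			/* counter has wrapped */

	printf("naive now > deadline: %d\n", now > deadline);	   /* 0, wrong */
	printf("time_after:           %d\n", time_after(now, deadline)); /* 1, correct */
	return 0;
}
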
*/ + if (!pskb_may_pull(skb, arp_hdr_len(dev))) + goto freeskb; + + arp = arp_hdr(skb); + if (arp->ar_hln != dev->addr_len || + dev->flags & IFF_NOARP || + skb->pkt_type == PACKET_OTHERHOST || + skb->pkt_type == PACKET_LOOPBACK || + arp->ar_pln != 4) + goto freeskb; + + skb = skb_share_check(skb, GFP_ATOMIC); + if (skb == NULL) + goto out_of_mem; + + memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); + + return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); + +freeskb: + kfree_skb(skb); +out_of_mem: + return 0; +} + +/* + * User level interface (ioctl) + */ + +/* + * Set (create) an ARP cache entry. + */ + +static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on) +{ + if (dev == NULL) { + IPV4_DEVCONF_ALL(net, PROXY_ARP) = on; + return 0; + } + if (__in_dev_get_rtnl(dev)) { + IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on); + return 0; + } + return -ENXIO; +} + +static int arp_req_set_public(struct net *net, struct arpreq *r, + struct net_device *dev) +{ + __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; + __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; + + if (mask && mask != htonl(0xFFFFFFFF)) + return -EINVAL; + if (!dev && (r->arp_flags & ATF_COM)) { + dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family, + r->arp_ha.sa_data); + if (!dev) + return -ENODEV; + } + if (mask) { + if (pneigh_lookup(&arp_tbl, net, &ip, dev, 1) == NULL) + return -ENOBUFS; + return 0; + } + + return arp_req_set_proxy(net, dev, 1); +} + +static int arp_req_set(struct net *net, struct arpreq *r, + struct net_device *dev) +{ + __be32 ip; + struct neighbour *neigh; + int err; + + if (r->arp_flags & ATF_PUBL) + return arp_req_set_public(net, r, dev); + + ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; + if (r->arp_flags & ATF_PERM) + r->arp_flags |= ATF_COM; + if (dev == NULL) { + struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0); + + if (IS_ERR(rt)) + return PTR_ERR(rt); + dev = rt->dst.dev; + ip_rt_put(rt); + if (!dev) + return -EINVAL; + } + switch (dev->type) { +#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE) + case ARPHRD_FDDI: + /* + * According to RFC 1390, FDDI devices should accept ARP + * hardware types of 1 (Ethernet). However, to be more + * robust, we'll accept hardware types of either 1 (Ethernet) + * or 6 (IEEE 802.2). + */ + if (r->arp_ha.sa_family != ARPHRD_FDDI && + r->arp_ha.sa_family != ARPHRD_ETHER && + r->arp_ha.sa_family != ARPHRD_IEEE802) + return -EINVAL; + break; +#endif + default: + if (r->arp_ha.sa_family != dev->type) + return -EINVAL; + break; + } + + neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev); + err = PTR_ERR(neigh); + if (!IS_ERR(neigh)) { + unsigned state = NUD_STALE; + if (r->arp_flags & ATF_PERM) + state = NUD_PERMANENT; + err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? + r->arp_ha.sa_data : NULL, state, + NEIGH_UPDATE_F_OVERRIDE | + NEIGH_UPDATE_F_ADMIN); + neigh_release(neigh); + } + return err; +} + +static unsigned arp_state_to_flags(struct neighbour *neigh) +{ + if (neigh->nud_state&NUD_PERMANENT) + return ATF_PERM | ATF_COM; + else if (neigh->nud_state&NUD_VALID) + return ATF_COM; + else + return 0; +} + +/* + * Get an ARP cache entry. 
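
[Editor's note] The request helpers defined here are driven from userspace through the ioctl front end (arp_ioctl(), just below), which services SIOCSARP, SIOCDARP and SIOCGARP. A minimal userspace sketch of a SIOCGARP query; the IP address and interface name are placeholders, not values taken from this patch.

/* Minimal userspace sketch of the SIOCGARP ioctl serviced by
 * arp_ioctl()/arp_req_get().  Address and device name are examples.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <net/if_arp.h>

int main(void)
{
	struct arpreq req;
	struct sockaddr_in *sin = (struct sockaddr_in *)&req.arp_pa;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	memset(&req, 0, sizeof(req));
	sin->sin_family = AF_INET;		/* arp_ioctl() insists on AF_INET */
	inet_pton(AF_INET, "192.168.1.1", &sin->sin_addr);
	strncpy(req.arp_dev, "eth0", sizeof(req.arp_dev) - 1);

	if (ioctl(fd, SIOCGARP, &req) == 0) {
		unsigned char *ha = (unsigned char *)req.arp_ha.sa_data;

		printf("%02x:%02x:%02x:%02x:%02x:%02x flags=0x%x\n",
		       ha[0], ha[1], ha[2], ha[3], ha[4], ha[5],
		       req.arp_flags);
	} else {
		perror("SIOCGARP");
	}
	close(fd);
	return 0;
}
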
+ */ + +static int arp_req_get(struct arpreq *r, struct net_device *dev) +{ + __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; + struct neighbour *neigh; + int err = -ENXIO; + + neigh = neigh_lookup(&arp_tbl, &ip, dev); + if (neigh) { + read_lock_bh(&neigh->lock); + memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len); + r->arp_flags = arp_state_to_flags(neigh); + read_unlock_bh(&neigh->lock); + r->arp_ha.sa_family = dev->type; + strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev)); + neigh_release(neigh); + err = 0; + } + return err; +} + +int arp_invalidate(struct net_device *dev, __be32 ip) +{ + struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev); + int err = -ENXIO; + + if (neigh) { + if (neigh->nud_state & ~NUD_NOARP) + err = neigh_update(neigh, NULL, NUD_FAILED, + NEIGH_UPDATE_F_OVERRIDE| + NEIGH_UPDATE_F_ADMIN); + neigh_release(neigh); + } + + return err; +} +EXPORT_SYMBOL(arp_invalidate); + +static int arp_req_delete_public(struct net *net, struct arpreq *r, + struct net_device *dev) +{ + __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; + __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; + + if (mask == htonl(0xFFFFFFFF)) + return pneigh_delete(&arp_tbl, net, &ip, dev); + + if (mask) + return -EINVAL; + + return arp_req_set_proxy(net, dev, 0); +} + +static int arp_req_delete(struct net *net, struct arpreq *r, + struct net_device *dev) +{ + __be32 ip; + + if (r->arp_flags & ATF_PUBL) + return arp_req_delete_public(net, r, dev); + + ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; + if (dev == NULL) { + struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0); + if (IS_ERR(rt)) + return PTR_ERR(rt); + dev = rt->dst.dev; + ip_rt_put(rt); + if (!dev) + return -EINVAL; + } + return arp_invalidate(dev, ip); +} + +/* + * Handle an ARP layer I/O control request. + */ + +int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) +{ + int err; + struct arpreq r; + struct net_device *dev = NULL; + + switch (cmd) { + case SIOCDARP: + case SIOCSARP: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + case SIOCGARP: + err = copy_from_user(&r, arg, sizeof(struct arpreq)); + if (err) + return -EFAULT; + break; + default: + return -EINVAL; + } + + if (r.arp_pa.sa_family != AF_INET) + return -EPFNOSUPPORT; + + if (!(r.arp_flags & ATF_PUBL) && + (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB))) + return -EINVAL; + if (!(r.arp_flags & ATF_NETMASK)) + ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr = + htonl(0xFFFFFFFFUL); + rtnl_lock(); + if (r.arp_dev[0]) { + err = -ENODEV; + dev = __dev_get_by_name(net, r.arp_dev); + if (dev == NULL) + goto out; + + /* Mmmm... It is wrong... 
ARPHRD_NETROM==0 */ + if (!r.arp_ha.sa_family) + r.arp_ha.sa_family = dev->type; + err = -EINVAL; + if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type) + goto out; + } else if (cmd == SIOCGARP) { + err = -ENODEV; + goto out; + } + + switch (cmd) { + case SIOCDARP: + err = arp_req_delete(net, &r, dev); + break; + case SIOCSARP: + err = arp_req_set(net, &r, dev); + break; + case SIOCGARP: + err = arp_req_get(&r, dev); + break; + } +out: + rtnl_unlock(); + if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r))) + err = -EFAULT; + return err; +} + +static int arp_netdev_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + + switch (event) { + case NETDEV_CHANGEADDR: + neigh_changeaddr(&arp_tbl, dev); + rt_cache_flush(dev_net(dev), 0); + break; + default: + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block arp_netdev_notifier = { + .notifier_call = arp_netdev_event, +}; + +/* Note, that it is not on notifier chain. + It is necessary, that this routine was called after route cache will be + flushed. + */ +void arp_ifdown(struct net_device *dev) +{ + neigh_ifdown(&arp_tbl, dev); +} + + +/* + * Called once on startup. + */ + +static struct packet_type arp_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_ARP), + .func = arp_rcv, +}; + +static int arp_proc_init(void); + +void __init arp_init(void) +{ + neigh_table_init(&arp_tbl); + + dev_add_pack(&arp_packet_type); + arp_proc_init(); +#ifdef CONFIG_SYSCTL + neigh_sysctl_register(NULL, &arp_tbl.parms, "ipv4", NULL); +#endif + register_netdevice_notifier(&arp_netdev_notifier); +} + +#ifdef CONFIG_PROC_FS +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + +/* ------------------------------------------------------------------------ */ +/* + * ax25 -> ASCII conversion + */ +static char *ax2asc2(ax25_address *a, char *buf) +{ + char c, *s; + int n; + + for (n = 0, s = buf; n < 6; n++) { + c = (a->ax25_call[n] >> 1) & 0x7F; + + if (c != ' ') + *s++ = c; + } + + *s++ = '-'; + n = (a->ax25_call[6] >> 1) & 0x0F; + if (n > 9) { + *s++ = '1'; + n -= 10; + } + + *s++ = n + '0'; + *s++ = '\0'; + + if (*buf == '\0' || *buf == '-') + return "*"; + + return buf; +} +#endif /* CONFIG_AX25 */ + +#define HBUFFERLEN 30 + +static void arp_format_neigh_entry(struct seq_file *seq, + struct neighbour *n) +{ + char hbuffer[HBUFFERLEN]; + int k, j; + char tbuf[16]; + struct net_device *dev = n->dev; + int hatype = dev->type; + + read_lock(&n->lock); + /* Convert hardware address to XX:XX:XX:XX ... form. */ +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM) + ax2asc2((ax25_address *)n->ha, hbuffer); + else { +#endif + for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) { + hbuffer[k++] = hex_asc_hi(n->ha[j]); + hbuffer[k++] = hex_asc_lo(n->ha[j]); + hbuffer[k++] = ':'; + } + if (k != 0) + --k; + hbuffer[k] = 0; +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + } +#endif + sprintf(tbuf, "%pI4", n->primary_key); + seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", + tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name); + read_unlock(&n->lock); +} + +static void arp_format_pneigh_entry(struct seq_file *seq, + struct pneigh_entry *n) +{ + struct net_device *dev = n->dev; + int hatype = dev ? dev->type : 0; + char tbuf[16]; + + sprintf(tbuf, "%pI4", n->key); + seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", + tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00", + dev ? 
dev->name : "*"); +} + +static int arp_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "IP address HW type Flags " + "HW address Mask Device\n"); + } else { + struct neigh_seq_state *state = seq->private; + + if (state->flags & NEIGH_SEQ_IS_PNEIGH) + arp_format_pneigh_entry(seq, v); + else + arp_format_neigh_entry(seq, v); + } + + return 0; +} + +static void *arp_seq_start(struct seq_file *seq, loff_t *pos) +{ + /* Don't want to confuse "arp -a" w/ magic entries, + * so we tell the generic iterator to skip NUD_NOARP. + */ + return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP); +} + +/* ------------------------------------------------------------------------ */ + +static const struct seq_operations arp_seq_ops = { + .start = arp_seq_start, + .next = neigh_seq_next, + .stop = neigh_seq_stop, + .show = arp_seq_show, +}; + +static int arp_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &arp_seq_ops, + sizeof(struct neigh_seq_state)); +} + +static const struct file_operations arp_seq_fops = { + .owner = THIS_MODULE, + .open = arp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + + +static int __net_init arp_net_init(struct net *net) +{ + if (!proc_net_fops_create(net, "arp", S_IRUGO, &arp_seq_fops)) + return -ENOMEM; + return 0; +} + +static void __net_exit arp_net_exit(struct net *net) +{ + proc_net_remove(net, "arp"); +} + +static struct pernet_operations arp_net_ops = { + .init = arp_net_init, + .exit = arp_net_exit, +}; + +static int __init arp_proc_init(void) +{ + return register_pernet_subsys(&arp_net_ops); +} + +#else /* CONFIG_PROC_FS */ + +static int __init arp_proc_init(void) +{ + return 0; +} + +#endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c new file mode 100644 index 00000000..2b3c23c2 --- /dev/null +++ b/net/ipv4/cipso_ipv4.c @@ -0,0 +1,2368 @@ +/* + * CIPSO - Commercial IP Security Option + * + * This is an implementation of the CIPSO 2.2 protocol as specified in + * draft-ietf-cipso-ipsecurity-01.txt with additional tag types as found in + * FIPS-188. While CIPSO never became a full IETF RFC standard many vendors + * have chosen to adopt the protocol and over the years it has become a + * de-facto standard for labeled networking. + * + * The CIPSO draft specification can be found in the kernel's Documentation + * directory as well as the following URL: + * http://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt + * The FIPS-188 specification can be found at the following URL: + * http://www.itl.nist.gov/fipspubs/fip188.htm + * + * Author: Paul Moore + * + */ + +/* + * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* List of available DOI definitions */ +/* XXX - This currently assumes a minimal number of different DOIs in use, + * if in practice there are a lot of different DOIs this list should + * probably be turned into a hash table or something similar so we + * can do quick lookups. */ +static DEFINE_SPINLOCK(cipso_v4_doi_list_lock); +static LIST_HEAD(cipso_v4_doi_list); + +/* Label mapping cache */ +int cipso_v4_cache_enabled = 1; +int cipso_v4_cache_bucketsize = 10; +#define CIPSO_V4_CACHE_BUCKETBITS 7 +#define CIPSO_V4_CACHE_BUCKETS (1 << CIPSO_V4_CACHE_BUCKETBITS) +#define CIPSO_V4_CACHE_REORDERLIMIT 10 +struct cipso_v4_map_cache_bkt { + spinlock_t lock; + u32 size; + struct list_head list; +}; +struct cipso_v4_map_cache_entry { + u32 hash; + unsigned char *key; + size_t key_len; + + struct netlbl_lsm_cache *lsm_data; + + u32 activity; + struct list_head list; +}; +static struct cipso_v4_map_cache_bkt *cipso_v4_cache = NULL; + +/* Restricted bitmap (tag #1) flags */ +int cipso_v4_rbm_optfmt = 0; +int cipso_v4_rbm_strictvalid = 1; + +/* + * Protocol Constants + */ + +/* Maximum size of the CIPSO IP option, derived from the fact that the maximum + * IPv4 header size is 60 bytes and the base IPv4 header is 20 bytes long. */ +#define CIPSO_V4_OPT_LEN_MAX 40 + +/* Length of the base CIPSO option, this includes the option type (1 byte), the + * option length (1 byte), and the DOI (4 bytes). */ +#define CIPSO_V4_HDR_LEN 6 + +/* Base length of the restrictive category bitmap tag (tag #1). */ +#define CIPSO_V4_TAG_RBM_BLEN 4 + +/* Base length of the enumerated category tag (tag #2). */ +#define CIPSO_V4_TAG_ENUM_BLEN 4 + +/* Base length of the ranged categories bitmap tag (tag #5). */ +#define CIPSO_V4_TAG_RNG_BLEN 4 +/* The maximum number of category ranges permitted in the ranged category tag + * (tag #5). You may note that the IETF draft states that the maximum number + * of category ranges is 7, but if the low end of the last category range is + * zero then it is possible to fit 8 category ranges because the zero should + * be omitted. */ +#define CIPSO_V4_TAG_RNG_CAT_MAX 8 + +/* Base length of the local tag (non-standard tag). + * Tag definition (may change between kernel versions) + * + * 0 8 16 24 32 + * +----------+----------+----------+----------+ + * | 10000000 | 00000110 | 32-bit secid value | + * +----------+----------+----------+----------+ + * | in (host byte order)| + * +----------+----------+ + * + */ +#define CIPSO_V4_TAG_LOC_BLEN 6 + +/* + * Helper Functions + */ + +/** + * cipso_v4_bitmap_walk - Walk a bitmap looking for a bit + * @bitmap: the bitmap + * @bitmap_len: length in bits + * @offset: starting offset + * @state: if non-zero, look for a set (1) bit else look for a cleared (0) bit + * + * Description: + * Starting at @offset, walk the bitmap from left to right until either the + * desired bit is found or we reach the end. Return the bit offset, -1 if + * not found, or -2 if error. 
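
[Editor's note] Per the constants above, every CIPSO option begins with a 6-byte header (option type, total option length, 32-bit DOI) followed by one or more tags; cipso_v4_gentag_hdr() later in this file writes exactly that. A small sketch of the header under the assumption that the option type is 134 (IPOPT_CIPSO in the kernel headers); cipso_gen_hdr() is a hypothetical helper.

/* Illustrative sketch of the 6-byte CIPSO header written by
 * cipso_v4_gentag_hdr(): type, whole-option length, then the DOI in
 * network byte order.  cipso_gen_hdr() is a hypothetical helper.
 */
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

#define CIPSO_OPT_TYPE	134	/* IPOPT_CIPSO */
#define CIPSO_HDR_LEN	6

/* buf must have room for CIPSO_HDR_LEN + tag_len bytes */
static void cipso_gen_hdr(uint8_t *buf, uint32_t doi, uint8_t tag_len)
{
	uint32_t doi_be = htonl(doi);

	buf[0] = CIPSO_OPT_TYPE;
	buf[1] = CIPSO_HDR_LEN + tag_len;	/* length of the whole option */
	memcpy(&buf[2], &doi_be, sizeof(doi_be));
}

int main(void)
{
	uint8_t opt[40];			/* 40 = max IPv4 option space */

	cipso_gen_hdr(opt, 3, 0);		/* DOI 3, no tags yet; example */
	return opt[0] == CIPSO_OPT_TYPE ? 0 : 1;
}
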
+ */ +static int cipso_v4_bitmap_walk(const unsigned char *bitmap, + u32 bitmap_len, + u32 offset, + u8 state) +{ + u32 bit_spot; + u32 byte_offset; + unsigned char bitmask; + unsigned char byte; + + /* gcc always rounds to zero when doing integer division */ + byte_offset = offset / 8; + byte = bitmap[byte_offset]; + bit_spot = offset; + bitmask = 0x80 >> (offset % 8); + + while (bit_spot < bitmap_len) { + if ((state && (byte & bitmask) == bitmask) || + (state == 0 && (byte & bitmask) == 0)) + return bit_spot; + + bit_spot++; + bitmask >>= 1; + if (bitmask == 0) { + byte = bitmap[++byte_offset]; + bitmask = 0x80; + } + } + + return -1; +} + +/** + * cipso_v4_bitmap_setbit - Sets a single bit in a bitmap + * @bitmap: the bitmap + * @bit: the bit + * @state: if non-zero, set the bit (1) else clear the bit (0) + * + * Description: + * Set a single bit in the bitmask. Returns zero on success, negative values + * on error. + */ +static void cipso_v4_bitmap_setbit(unsigned char *bitmap, + u32 bit, + u8 state) +{ + u32 byte_spot; + u8 bitmask; + + /* gcc always rounds to zero when doing integer division */ + byte_spot = bit / 8; + bitmask = 0x80 >> (bit % 8); + if (state) + bitmap[byte_spot] |= bitmask; + else + bitmap[byte_spot] &= ~bitmask; +} + +/** + * cipso_v4_cache_entry_free - Frees a cache entry + * @entry: the entry to free + * + * Description: + * This function frees the memory associated with a cache entry including the + * LSM cache data if there are no longer any users, i.e. reference count == 0. + * + */ +static void cipso_v4_cache_entry_free(struct cipso_v4_map_cache_entry *entry) +{ + if (entry->lsm_data) + netlbl_secattr_cache_free(entry->lsm_data); + kfree(entry->key); + kfree(entry); +} + +/** + * cipso_v4_map_cache_hash - Hashing function for the CIPSO cache + * @key: the hash key + * @key_len: the length of the key in bytes + * + * Description: + * The CIPSO tag hashing function. Returns a 32-bit hash value. + * + */ +static u32 cipso_v4_map_cache_hash(const unsigned char *key, u32 key_len) +{ + return jhash(key, key_len, 0); +} + +/* + * Label Mapping Cache Functions + */ + +/** + * cipso_v4_cache_init - Initialize the CIPSO cache + * + * Description: + * Initializes the CIPSO label mapping cache, this function should be called + * before any of the other functions defined in this file. Returns zero on + * success, negative values on error. + * + */ +static int cipso_v4_cache_init(void) +{ + u32 iter; + + cipso_v4_cache = kcalloc(CIPSO_V4_CACHE_BUCKETS, + sizeof(struct cipso_v4_map_cache_bkt), + GFP_KERNEL); + if (cipso_v4_cache == NULL) + return -ENOMEM; + + for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) { + spin_lock_init(&cipso_v4_cache[iter].lock); + cipso_v4_cache[iter].size = 0; + INIT_LIST_HEAD(&cipso_v4_cache[iter].list); + } + + return 0; +} + +/** + * cipso_v4_cache_invalidate - Invalidates the current CIPSO cache + * + * Description: + * Invalidates and frees any entries in the CIPSO cache. Returns zero on + * success and negative values on failure. 
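
[Editor's note] The two bitmap helpers above treat bit 0 as the most significant bit of byte 0 (mask 0x80 >> (bit % 8)), matching the on-the-wire category bitmap. A userspace mirror of the same semantics, for experimentation:

/* Userspace mirror of the MSB-first bitmap helpers above: bit 0 is the
 * high-order bit of byte 0, as in the wire-format category bitmap.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

static void bitmap_setbit(uint8_t *bm, uint32_t bit)
{
	bm[bit / 8] |= 0x80 >> (bit % 8);
}

static int bitmap_walk(const uint8_t *bm, uint32_t len_bits, uint32_t off)
{
	for (uint32_t bit = off; bit < len_bits; bit++)
		if (bm[bit / 8] & (0x80 >> (bit % 8)))
			return (int)bit;
	return -1;				/* no set bit found */
}

int main(void)
{
	uint8_t bm[4];

	memset(bm, 0, sizeof(bm));
	bitmap_setbit(bm, 0);			/* byte 0, mask 0x80 */
	bitmap_setbit(bm, 9);			/* byte 1, mask 0x40 */

	for (int c = bitmap_walk(bm, 32, 0); c >= 0;
	     c = bitmap_walk(bm, 32, (uint32_t)c + 1))
		printf("category %d set\n", c);
	return 0;
}
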
+ * + */ +void cipso_v4_cache_invalidate(void) +{ + struct cipso_v4_map_cache_entry *entry, *tmp_entry; + u32 iter; + + for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) { + spin_lock_bh(&cipso_v4_cache[iter].lock); + list_for_each_entry_safe(entry, + tmp_entry, + &cipso_v4_cache[iter].list, list) { + list_del(&entry->list); + cipso_v4_cache_entry_free(entry); + } + cipso_v4_cache[iter].size = 0; + spin_unlock_bh(&cipso_v4_cache[iter].lock); + } +} + +/** + * cipso_v4_cache_check - Check the CIPSO cache for a label mapping + * @key: the buffer to check + * @key_len: buffer length in bytes + * @secattr: the security attribute struct to use + * + * Description: + * This function checks the cache to see if a label mapping already exists for + * the given key. If there is a match then the cache is adjusted and the + * @secattr struct is populated with the correct LSM security attributes. The + * cache is adjusted in the following manner if the entry is not already the + * first in the cache bucket: + * + * 1. The cache entry's activity counter is incremented + * 2. The previous (higher ranking) entry's activity counter is decremented + * 3. If the difference between the two activity counters is geater than + * CIPSO_V4_CACHE_REORDERLIMIT the two entries are swapped + * + * Returns zero on success, -ENOENT for a cache miss, and other negative values + * on error. + * + */ +static int cipso_v4_cache_check(const unsigned char *key, + u32 key_len, + struct netlbl_lsm_secattr *secattr) +{ + u32 bkt; + struct cipso_v4_map_cache_entry *entry; + struct cipso_v4_map_cache_entry *prev_entry = NULL; + u32 hash; + + if (!cipso_v4_cache_enabled) + return -ENOENT; + + hash = cipso_v4_map_cache_hash(key, key_len); + bkt = hash & (CIPSO_V4_CACHE_BUCKETS - 1); + spin_lock_bh(&cipso_v4_cache[bkt].lock); + list_for_each_entry(entry, &cipso_v4_cache[bkt].list, list) { + if (entry->hash == hash && + entry->key_len == key_len && + memcmp(entry->key, key, key_len) == 0) { + entry->activity += 1; + atomic_inc(&entry->lsm_data->refcount); + secattr->cache = entry->lsm_data; + secattr->flags |= NETLBL_SECATTR_CACHE; + secattr->type = NETLBL_NLTYPE_CIPSOV4; + if (prev_entry == NULL) { + spin_unlock_bh(&cipso_v4_cache[bkt].lock); + return 0; + } + + if (prev_entry->activity > 0) + prev_entry->activity -= 1; + if (entry->activity > prev_entry->activity && + entry->activity - prev_entry->activity > + CIPSO_V4_CACHE_REORDERLIMIT) { + __list_del(entry->list.prev, entry->list.next); + __list_add(&entry->list, + prev_entry->list.prev, + &prev_entry->list); + } + + spin_unlock_bh(&cipso_v4_cache[bkt].lock); + return 0; + } + prev_entry = entry; + } + spin_unlock_bh(&cipso_v4_cache[bkt].lock); + + return -ENOENT; +} + +/** + * cipso_v4_cache_add - Add an entry to the CIPSO cache + * @skb: the packet + * @secattr: the packet's security attributes + * + * Description: + * Add a new entry into the CIPSO label mapping cache. Add the new entry to + * head of the cache bucket's list, if the cache bucket is out of room remove + * the last entry in the list first. It is important to note that there is + * currently no checking for duplicate keys. Returns zero on success, + * negative values on failure. 
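
[Editor's note] cipso_v4_cache_check() above promotes a hit one slot toward the head of its bucket only when its activity count exceeds its predecessor's by more than the reorder limit, which keeps hot mappings near the front without constantly reshuffling the list. A simplified, array-based sketch of that promotion rule; cache_hit() is a hypothetical stand-in, not the kernel's list code.

/* Simplified sketch of the activity-based promotion rule used by
 * cipso_v4_cache_check().  cache_hit() is a hypothetical helper.
 */
#include <stdio.h>

#define REORDER_LIMIT 10

struct entry {
	int key;
	int activity;
};

static void cache_hit(struct entry *bucket, int idx)
{
	bucket[idx].activity++;
	if (idx == 0)
		return;				/* already at the head */
	if (bucket[idx - 1].activity > 0)
		bucket[idx - 1].activity--;
	if (bucket[idx].activity - bucket[idx - 1].activity > REORDER_LIMIT) {
		struct entry tmp = bucket[idx - 1];

		bucket[idx - 1] = bucket[idx];	/* swap with predecessor */
		bucket[idx] = tmp;
	}
}

int main(void)
{
	struct entry bucket[2] = { { .key = 1 }, { .key = 2 } };

	for (int i = 0; i < 20; i++)		/* keep hitting entry with key 2 */
		cache_hit(bucket, bucket[0].key == 2 ? 0 : 1);
	printf("head key after hits: %d\n", bucket[0].key);	/* prints 2 */
	return 0;
}
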
+ * + */ +int cipso_v4_cache_add(const struct sk_buff *skb, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -EPERM; + u32 bkt; + struct cipso_v4_map_cache_entry *entry = NULL; + struct cipso_v4_map_cache_entry *old_entry = NULL; + unsigned char *cipso_ptr; + u32 cipso_ptr_len; + + if (!cipso_v4_cache_enabled || cipso_v4_cache_bucketsize <= 0) + return 0; + + cipso_ptr = CIPSO_V4_OPTPTR(skb); + cipso_ptr_len = cipso_ptr[1]; + + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) + return -ENOMEM; + entry->key = kmemdup(cipso_ptr, cipso_ptr_len, GFP_ATOMIC); + if (entry->key == NULL) { + ret_val = -ENOMEM; + goto cache_add_failure; + } + entry->key_len = cipso_ptr_len; + entry->hash = cipso_v4_map_cache_hash(cipso_ptr, cipso_ptr_len); + atomic_inc(&secattr->cache->refcount); + entry->lsm_data = secattr->cache; + + bkt = entry->hash & (CIPSO_V4_CACHE_BUCKETS - 1); + spin_lock_bh(&cipso_v4_cache[bkt].lock); + if (cipso_v4_cache[bkt].size < cipso_v4_cache_bucketsize) { + list_add(&entry->list, &cipso_v4_cache[bkt].list); + cipso_v4_cache[bkt].size += 1; + } else { + old_entry = list_entry(cipso_v4_cache[bkt].list.prev, + struct cipso_v4_map_cache_entry, list); + list_del(&old_entry->list); + list_add(&entry->list, &cipso_v4_cache[bkt].list); + cipso_v4_cache_entry_free(old_entry); + } + spin_unlock_bh(&cipso_v4_cache[bkt].lock); + + return 0; + +cache_add_failure: + if (entry) + cipso_v4_cache_entry_free(entry); + return ret_val; +} + +/* + * DOI List Functions + */ + +/** + * cipso_v4_doi_search - Searches for a DOI definition + * @doi: the DOI to search for + * + * Description: + * Search the DOI definition list for a DOI definition with a DOI value that + * matches @doi. The caller is responsible for calling rcu_read_[un]lock(). + * Returns a pointer to the DOI definition on success and NULL on failure. + */ +static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi) +{ + struct cipso_v4_doi *iter; + + list_for_each_entry_rcu(iter, &cipso_v4_doi_list, list) + if (iter->doi == doi && atomic_read(&iter->refcount)) + return iter; + return NULL; +} + +/** + * cipso_v4_doi_add - Add a new DOI to the CIPSO protocol engine + * @doi_def: the DOI structure + * @audit_info: NetLabel audit information + * + * Description: + * The caller defines a new DOI for use by the CIPSO engine and calls this + * function to add it to the list of acceptable domains. The caller must + * ensure that the mapping table specified in @doi_def->map meets all of the + * requirements of the mapping type (see cipso_ipv4.h for details). Returns + * zero on success and non-zero on failure. 
+ * + */ +int cipso_v4_doi_add(struct cipso_v4_doi *doi_def, + struct netlbl_audit *audit_info) +{ + int ret_val = -EINVAL; + u32 iter; + u32 doi; + u32 doi_type; + struct audit_buffer *audit_buf; + + doi = doi_def->doi; + doi_type = doi_def->type; + + if (doi_def == NULL || doi_def->doi == CIPSO_V4_DOI_UNKNOWN) + goto doi_add_return; + for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT; iter++) { + switch (doi_def->tags[iter]) { + case CIPSO_V4_TAG_RBITMAP: + break; + case CIPSO_V4_TAG_RANGE: + case CIPSO_V4_TAG_ENUM: + if (doi_def->type != CIPSO_V4_MAP_PASS) + goto doi_add_return; + break; + case CIPSO_V4_TAG_LOCAL: + if (doi_def->type != CIPSO_V4_MAP_LOCAL) + goto doi_add_return; + break; + case CIPSO_V4_TAG_INVALID: + if (iter == 0) + goto doi_add_return; + break; + default: + goto doi_add_return; + } + } + + atomic_set(&doi_def->refcount, 1); + + spin_lock(&cipso_v4_doi_list_lock); + if (cipso_v4_doi_search(doi_def->doi) != NULL) { + spin_unlock(&cipso_v4_doi_list_lock); + ret_val = -EEXIST; + goto doi_add_return; + } + list_add_tail_rcu(&doi_def->list, &cipso_v4_doi_list); + spin_unlock(&cipso_v4_doi_list_lock); + ret_val = 0; + +doi_add_return: + audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_ADD, audit_info); + if (audit_buf != NULL) { + const char *type_str; + switch (doi_type) { + case CIPSO_V4_MAP_TRANS: + type_str = "trans"; + break; + case CIPSO_V4_MAP_PASS: + type_str = "pass"; + break; + case CIPSO_V4_MAP_LOCAL: + type_str = "local"; + break; + default: + type_str = "(unknown)"; + } + audit_log_format(audit_buf, + " cipso_doi=%u cipso_type=%s res=%u", + doi, type_str, ret_val == 0 ? 1 : 0); + audit_log_end(audit_buf); + } + + return ret_val; +} + +/** + * cipso_v4_doi_free - Frees a DOI definition + * @entry: the entry's RCU field + * + * Description: + * This function frees all of the memory associated with a DOI definition. + * + */ +void cipso_v4_doi_free(struct cipso_v4_doi *doi_def) +{ + if (doi_def == NULL) + return; + + switch (doi_def->type) { + case CIPSO_V4_MAP_TRANS: + kfree(doi_def->map.std->lvl.cipso); + kfree(doi_def->map.std->lvl.local); + kfree(doi_def->map.std->cat.cipso); + kfree(doi_def->map.std->cat.local); + break; + } + kfree(doi_def); +} + +/** + * cipso_v4_doi_free_rcu - Frees a DOI definition via the RCU pointer + * @entry: the entry's RCU field + * + * Description: + * This function is designed to be used as a callback to the call_rcu() + * function so that the memory allocated to the DOI definition can be released + * safely. + * + */ +static void cipso_v4_doi_free_rcu(struct rcu_head *entry) +{ + struct cipso_v4_doi *doi_def; + + doi_def = container_of(entry, struct cipso_v4_doi, rcu); + cipso_v4_doi_free(doi_def); +} + +/** + * cipso_v4_doi_remove - Remove an existing DOI from the CIPSO protocol engine + * @doi: the DOI value + * @audit_secid: the LSM secid to use in the audit message + * + * Description: + * Removes a DOI definition from the CIPSO engine. The NetLabel routines will + * be called to release their own LSM domain mappings as well as our own + * domain list. Returns zero on success and negative values on failure. 
+ * + */ +int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info) +{ + int ret_val; + struct cipso_v4_doi *doi_def; + struct audit_buffer *audit_buf; + + spin_lock(&cipso_v4_doi_list_lock); + doi_def = cipso_v4_doi_search(doi); + if (doi_def == NULL) { + spin_unlock(&cipso_v4_doi_list_lock); + ret_val = -ENOENT; + goto doi_remove_return; + } + if (!atomic_dec_and_test(&doi_def->refcount)) { + spin_unlock(&cipso_v4_doi_list_lock); + ret_val = -EBUSY; + goto doi_remove_return; + } + list_del_rcu(&doi_def->list); + spin_unlock(&cipso_v4_doi_list_lock); + + cipso_v4_cache_invalidate(); + call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu); + ret_val = 0; + +doi_remove_return: + audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_DEL, audit_info); + if (audit_buf != NULL) { + audit_log_format(audit_buf, + " cipso_doi=%u res=%u", + doi, ret_val == 0 ? 1 : 0); + audit_log_end(audit_buf); + } + + return ret_val; +} + +/** + * cipso_v4_doi_getdef - Returns a reference to a valid DOI definition + * @doi: the DOI value + * + * Description: + * Searches for a valid DOI definition and if one is found it is returned to + * the caller. Otherwise NULL is returned. The caller must ensure that + * rcu_read_lock() is held while accessing the returned definition and the DOI + * definition reference count is decremented when the caller is done. + * + */ +struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi) +{ + struct cipso_v4_doi *doi_def; + + rcu_read_lock(); + doi_def = cipso_v4_doi_search(doi); + if (doi_def == NULL) + goto doi_getdef_return; + if (!atomic_inc_not_zero(&doi_def->refcount)) + doi_def = NULL; + +doi_getdef_return: + rcu_read_unlock(); + return doi_def; +} + +/** + * cipso_v4_doi_putdef - Releases a reference for the given DOI definition + * @doi_def: the DOI definition + * + * Description: + * Releases a DOI definition reference obtained from cipso_v4_doi_getdef(). + * + */ +void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def) +{ + if (doi_def == NULL) + return; + + if (!atomic_dec_and_test(&doi_def->refcount)) + return; + spin_lock(&cipso_v4_doi_list_lock); + list_del_rcu(&doi_def->list); + spin_unlock(&cipso_v4_doi_list_lock); + + cipso_v4_cache_invalidate(); + call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu); +} + +/** + * cipso_v4_doi_walk - Iterate through the DOI definitions + * @skip_cnt: skip past this number of DOI definitions, updated + * @callback: callback for each DOI definition + * @cb_arg: argument for the callback function + * + * Description: + * Iterate over the DOI definition list, skipping the first @skip_cnt entries. + * For each entry call @callback, if @callback returns a negative value stop + * 'walking' through the list and return. Updates the value in @skip_cnt upon + * return. Returns zero on success, negative values on failure. 
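
[Editor's note] cipso_v4_doi_getdef() above takes a reference only while the count is still non-zero (atomic_inc_not_zero), so a DOI that is concurrently being removed cannot be resurrected. A C11 stdatomic sketch of that pattern; this is illustrative and not the kernel's atomic API.

/* C11 sketch of the "increment only if not already zero" pattern used
 * by cipso_v4_doi_getdef().  Illustrative, not the kernel atomic API.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool refcount_inc_not_zero(atomic_int *ref)
{
	int old = atomic_load(ref);

	while (old != 0) {
		/* try old -> old + 1; on failure, old is reloaded for us */
		if (atomic_compare_exchange_weak(ref, &old, old + 1))
			return true;
	}
	return false;			/* object is already being torn down */
}

int main(void)
{
	atomic_int live = 1, dying = 0;

	printf("live:  %d\n", refcount_inc_not_zero(&live));	/* 1 */
	printf("dying: %d\n", refcount_inc_not_zero(&dying));	/* 0 */
	return 0;
}
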
+ * + */ +int cipso_v4_doi_walk(u32 *skip_cnt, + int (*callback) (struct cipso_v4_doi *doi_def, void *arg), + void *cb_arg) +{ + int ret_val = -ENOENT; + u32 doi_cnt = 0; + struct cipso_v4_doi *iter_doi; + + rcu_read_lock(); + list_for_each_entry_rcu(iter_doi, &cipso_v4_doi_list, list) + if (atomic_read(&iter_doi->refcount) > 0) { + if (doi_cnt++ < *skip_cnt) + continue; + ret_val = callback(iter_doi, cb_arg); + if (ret_val < 0) { + doi_cnt--; + goto doi_walk_return; + } + } + +doi_walk_return: + rcu_read_unlock(); + *skip_cnt = doi_cnt; + return ret_val; +} + +/* + * Label Mapping Functions + */ + +/** + * cipso_v4_map_lvl_valid - Checks to see if the given level is understood + * @doi_def: the DOI definition + * @level: the level to check + * + * Description: + * Checks the given level against the given DOI definition and returns a + * negative value if the level does not have a valid mapping and a zero value + * if the level is defined by the DOI. + * + */ +static int cipso_v4_map_lvl_valid(const struct cipso_v4_doi *doi_def, u8 level) +{ + switch (doi_def->type) { + case CIPSO_V4_MAP_PASS: + return 0; + case CIPSO_V4_MAP_TRANS: + if (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL) + return 0; + break; + } + + return -EFAULT; +} + +/** + * cipso_v4_map_lvl_hton - Perform a level mapping from the host to the network + * @doi_def: the DOI definition + * @host_lvl: the host MLS level + * @net_lvl: the network/CIPSO MLS level + * + * Description: + * Perform a label mapping to translate a local MLS level to the correct + * CIPSO level using the given DOI definition. Returns zero on success, + * negative values otherwise. + * + */ +static int cipso_v4_map_lvl_hton(const struct cipso_v4_doi *doi_def, + u32 host_lvl, + u32 *net_lvl) +{ + switch (doi_def->type) { + case CIPSO_V4_MAP_PASS: + *net_lvl = host_lvl; + return 0; + case CIPSO_V4_MAP_TRANS: + if (host_lvl < doi_def->map.std->lvl.local_size && + doi_def->map.std->lvl.local[host_lvl] < CIPSO_V4_INV_LVL) { + *net_lvl = doi_def->map.std->lvl.local[host_lvl]; + return 0; + } + return -EPERM; + } + + return -EINVAL; +} + +/** + * cipso_v4_map_lvl_ntoh - Perform a level mapping from the network to the host + * @doi_def: the DOI definition + * @net_lvl: the network/CIPSO MLS level + * @host_lvl: the host MLS level + * + * Description: + * Perform a label mapping to translate a CIPSO level to the correct local MLS + * level using the given DOI definition. Returns zero on success, negative + * values otherwise. + * + */ +static int cipso_v4_map_lvl_ntoh(const struct cipso_v4_doi *doi_def, + u32 net_lvl, + u32 *host_lvl) +{ + struct cipso_v4_std_map_tbl *map_tbl; + + switch (doi_def->type) { + case CIPSO_V4_MAP_PASS: + *host_lvl = net_lvl; + return 0; + case CIPSO_V4_MAP_TRANS: + map_tbl = doi_def->map.std; + if (net_lvl < map_tbl->lvl.cipso_size && + map_tbl->lvl.cipso[net_lvl] < CIPSO_V4_INV_LVL) { + *host_lvl = doi_def->map.std->lvl.cipso[net_lvl]; + return 0; + } + return -EPERM; + } + + return -EINVAL; +} + +/** + * cipso_v4_map_cat_rbm_valid - Checks to see if the category bitmap is valid + * @doi_def: the DOI definition + * @bitmap: category bitmap + * @bitmap_len: bitmap length in bytes + * + * Description: + * Checks the given category bitmap against the given DOI definition and + * returns a negative value if any of the categories in the bitmap do not have + * a valid mapping and a zero value if all of the categories are valid. 
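
[Editor's note] For CIPSO_V4_MAP_TRANS DOIs, the level translation above is just an array lookup guarded by an "invalid" sentinel. A minimal sketch of the host-to-network direction under that assumption; the table contents and helper name are hypothetical.

/* Minimal sketch of the CIPSO_V4_MAP_TRANS level translation done by
 * cipso_v4_map_lvl_hton(): an array lookup plus an "invalid" sentinel.
 * Table values are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

#define LVL_INV  0xffffffffu		/* no mapping for this level */

static const uint32_t host_to_net_lvl[] = {
	/* host 0 */ 0,
	/* host 1 */ LVL_INV,		/* deliberately unmapped */
	/* host 2 */ 10,
};

static int map_lvl_hton(uint32_t host_lvl, uint32_t *net_lvl)
{
	if (host_lvl >= sizeof(host_to_net_lvl) / sizeof(host_to_net_lvl[0]))
		return -1;
	if (host_to_net_lvl[host_lvl] == LVL_INV)
		return -1;
	*net_lvl = host_to_net_lvl[host_lvl];
	return 0;
}

int main(void)
{
	uint32_t net;

	if (map_lvl_hton(2, &net) == 0)
		printf("host level 2 -> CIPSO level %u\n", net);	/* 10 */
	if (map_lvl_hton(1, &net) != 0)
		printf("host level 1 has no CIPSO mapping\n");
	return 0;
}
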
+ * + */ +static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def, + const unsigned char *bitmap, + u32 bitmap_len) +{ + int cat = -1; + u32 bitmap_len_bits = bitmap_len * 8; + u32 cipso_cat_size; + u32 *cipso_array; + + switch (doi_def->type) { + case CIPSO_V4_MAP_PASS: + return 0; + case CIPSO_V4_MAP_TRANS: + cipso_cat_size = doi_def->map.std->cat.cipso_size; + cipso_array = doi_def->map.std->cat.cipso; + for (;;) { + cat = cipso_v4_bitmap_walk(bitmap, + bitmap_len_bits, + cat + 1, + 1); + if (cat < 0) + break; + if (cat >= cipso_cat_size || + cipso_array[cat] >= CIPSO_V4_INV_CAT) + return -EFAULT; + } + + if (cat == -1) + return 0; + break; + } + + return -EFAULT; +} + +/** + * cipso_v4_map_cat_rbm_hton - Perform a category mapping from host to network + * @doi_def: the DOI definition + * @secattr: the security attributes + * @net_cat: the zero'd out category bitmap in network/CIPSO format + * @net_cat_len: the length of the CIPSO bitmap in bytes + * + * Description: + * Perform a label mapping to translate a local MLS category bitmap to the + * correct CIPSO bitmap using the given DOI definition. Returns the minimum + * size in bytes of the network bitmap on success, negative values otherwise. + * + */ +static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def, + const struct netlbl_lsm_secattr *secattr, + unsigned char *net_cat, + u32 net_cat_len) +{ + int host_spot = -1; + u32 net_spot = CIPSO_V4_INV_CAT; + u32 net_spot_max = 0; + u32 net_clen_bits = net_cat_len * 8; + u32 host_cat_size = 0; + u32 *host_cat_array = NULL; + + if (doi_def->type == CIPSO_V4_MAP_TRANS) { + host_cat_size = doi_def->map.std->cat.local_size; + host_cat_array = doi_def->map.std->cat.local; + } + + for (;;) { + host_spot = netlbl_secattr_catmap_walk(secattr->attr.mls.cat, + host_spot + 1); + if (host_spot < 0) + break; + + switch (doi_def->type) { + case CIPSO_V4_MAP_PASS: + net_spot = host_spot; + break; + case CIPSO_V4_MAP_TRANS: + if (host_spot >= host_cat_size) + return -EPERM; + net_spot = host_cat_array[host_spot]; + if (net_spot >= CIPSO_V4_INV_CAT) + return -EPERM; + break; + } + if (net_spot >= net_clen_bits) + return -ENOSPC; + cipso_v4_bitmap_setbit(net_cat, net_spot, 1); + + if (net_spot > net_spot_max) + net_spot_max = net_spot; + } + + if (++net_spot_max % 8) + return net_spot_max / 8 + 1; + return net_spot_max / 8; +} + +/** + * cipso_v4_map_cat_rbm_ntoh - Perform a category mapping from network to host + * @doi_def: the DOI definition + * @net_cat: the category bitmap in network/CIPSO format + * @net_cat_len: the length of the CIPSO bitmap in bytes + * @secattr: the security attributes + * + * Description: + * Perform a label mapping to translate a CIPSO bitmap to the correct local + * MLS category bitmap using the given DOI definition. Returns zero on + * success, negative values on failure. 
+ * + */ +static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def, + const unsigned char *net_cat, + u32 net_cat_len, + struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + int net_spot = -1; + u32 host_spot = CIPSO_V4_INV_CAT; + u32 net_clen_bits = net_cat_len * 8; + u32 net_cat_size = 0; + u32 *net_cat_array = NULL; + + if (doi_def->type == CIPSO_V4_MAP_TRANS) { + net_cat_size = doi_def->map.std->cat.cipso_size; + net_cat_array = doi_def->map.std->cat.cipso; + } + + for (;;) { + net_spot = cipso_v4_bitmap_walk(net_cat, + net_clen_bits, + net_spot + 1, + 1); + if (net_spot < 0) { + if (net_spot == -2) + return -EFAULT; + return 0; + } + + switch (doi_def->type) { + case CIPSO_V4_MAP_PASS: + host_spot = net_spot; + break; + case CIPSO_V4_MAP_TRANS: + if (net_spot >= net_cat_size) + return -EPERM; + host_spot = net_cat_array[net_spot]; + if (host_spot >= CIPSO_V4_INV_CAT) + return -EPERM; + break; + } + ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat, + host_spot, + GFP_ATOMIC); + if (ret_val != 0) + return ret_val; + } + + return -EINVAL; +} + +/** + * cipso_v4_map_cat_enum_valid - Checks to see if the categories are valid + * @doi_def: the DOI definition + * @enumcat: category list + * @enumcat_len: length of the category list in bytes + * + * Description: + * Checks the given categories against the given DOI definition and returns a + * negative value if any of the categories do not have a valid mapping and a + * zero value if all of the categories are valid. + * + */ +static int cipso_v4_map_cat_enum_valid(const struct cipso_v4_doi *doi_def, + const unsigned char *enumcat, + u32 enumcat_len) +{ + u16 cat; + int cat_prev = -1; + u32 iter; + + if (doi_def->type != CIPSO_V4_MAP_PASS || enumcat_len & 0x01) + return -EFAULT; + + for (iter = 0; iter < enumcat_len; iter += 2) { + cat = get_unaligned_be16(&enumcat[iter]); + if (cat <= cat_prev) + return -EFAULT; + cat_prev = cat; + } + + return 0; +} + +/** + * cipso_v4_map_cat_enum_hton - Perform a category mapping from host to network + * @doi_def: the DOI definition + * @secattr: the security attributes + * @net_cat: the zero'd out category list in network/CIPSO format + * @net_cat_len: the length of the CIPSO category list in bytes + * + * Description: + * Perform a label mapping to translate a local MLS category bitmap to the + * correct CIPSO category list using the given DOI definition. Returns the + * size in bytes of the network category bitmap on success, negative values + * otherwise. + * + */ +static int cipso_v4_map_cat_enum_hton(const struct cipso_v4_doi *doi_def, + const struct netlbl_lsm_secattr *secattr, + unsigned char *net_cat, + u32 net_cat_len) +{ + int cat = -1; + u32 cat_iter = 0; + + for (;;) { + cat = netlbl_secattr_catmap_walk(secattr->attr.mls.cat, + cat + 1); + if (cat < 0) + break; + if ((cat_iter + 2) > net_cat_len) + return -ENOSPC; + + *((__be16 *)&net_cat[cat_iter]) = htons(cat); + cat_iter += 2; + } + + return cat_iter; +} + +/** + * cipso_v4_map_cat_enum_ntoh - Perform a category mapping from network to host + * @doi_def: the DOI definition + * @net_cat: the category list in network/CIPSO format + * @net_cat_len: the length of the CIPSO bitmap in bytes + * @secattr: the security attributes + * + * Description: + * Perform a label mapping to translate a CIPSO category list to the correct + * local MLS category bitmap using the given DOI definition. Returns zero on + * success, negative values on failure. 
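
[Editor's note] cipso_v4_map_cat_enum_hton() above writes each category as a big-endian 16-bit value, and the matching _valid() check requires the list to be strictly increasing. A short userspace sketch of that encoding; enum_cat_encode() is a hypothetical helper.

/* Userspace sketch of the enumerated-category encoding (tag type #2):
 * big-endian 16-bit values in strictly increasing order.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

/* returns bytes written, or -1 if the buffer is too small */
static int enum_cat_encode(const uint16_t *cats, int nr_cats,
			   uint8_t *buf, int buf_len)
{
	int off = 0;

	for (int i = 0; i < nr_cats; i++) {
		uint16_t be = htons(cats[i]);

		if (off + 2 > buf_len)
			return -1;
		memcpy(&buf[off], &be, 2);
		off += 2;
	}
	return off;
}

int main(void)
{
	uint16_t cats[] = { 4, 7, 300 };	/* must be strictly increasing */
	uint8_t buf[16];
	int len = enum_cat_encode(cats, 3, buf, sizeof(buf));

	for (int i = 0; i < len; i++)
		printf("%02x ", buf[i]);	/* 00 04 00 07 01 2c */
	printf("\n");
	return 0;
}
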
+ * + */ +static int cipso_v4_map_cat_enum_ntoh(const struct cipso_v4_doi *doi_def, + const unsigned char *net_cat, + u32 net_cat_len, + struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + u32 iter; + + for (iter = 0; iter < net_cat_len; iter += 2) { + ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat, + get_unaligned_be16(&net_cat[iter]), + GFP_ATOMIC); + if (ret_val != 0) + return ret_val; + } + + return 0; +} + +/** + * cipso_v4_map_cat_rng_valid - Checks to see if the categories are valid + * @doi_def: the DOI definition + * @rngcat: category list + * @rngcat_len: length of the category list in bytes + * + * Description: + * Checks the given categories against the given DOI definition and returns a + * negative value if any of the categories do not have a valid mapping and a + * zero value if all of the categories are valid. + * + */ +static int cipso_v4_map_cat_rng_valid(const struct cipso_v4_doi *doi_def, + const unsigned char *rngcat, + u32 rngcat_len) +{ + u16 cat_high; + u16 cat_low; + u32 cat_prev = CIPSO_V4_MAX_REM_CATS + 1; + u32 iter; + + if (doi_def->type != CIPSO_V4_MAP_PASS || rngcat_len & 0x01) + return -EFAULT; + + for (iter = 0; iter < rngcat_len; iter += 4) { + cat_high = get_unaligned_be16(&rngcat[iter]); + if ((iter + 4) <= rngcat_len) + cat_low = get_unaligned_be16(&rngcat[iter + 2]); + else + cat_low = 0; + + if (cat_high > cat_prev) + return -EFAULT; + + cat_prev = cat_low; + } + + return 0; +} + +/** + * cipso_v4_map_cat_rng_hton - Perform a category mapping from host to network + * @doi_def: the DOI definition + * @secattr: the security attributes + * @net_cat: the zero'd out category list in network/CIPSO format + * @net_cat_len: the length of the CIPSO category list in bytes + * + * Description: + * Perform a label mapping to translate a local MLS category bitmap to the + * correct CIPSO category list using the given DOI definition. Returns the + * size in bytes of the network category bitmap on success, negative values + * otherwise. + * + */ +static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def, + const struct netlbl_lsm_secattr *secattr, + unsigned char *net_cat, + u32 net_cat_len) +{ + int iter = -1; + u16 array[CIPSO_V4_TAG_RNG_CAT_MAX * 2]; + u32 array_cnt = 0; + u32 cat_size = 0; + + /* make sure we don't overflow the 'array[]' variable */ + if (net_cat_len > + (CIPSO_V4_OPT_LEN_MAX - CIPSO_V4_HDR_LEN - CIPSO_V4_TAG_RNG_BLEN)) + return -ENOSPC; + + for (;;) { + iter = netlbl_secattr_catmap_walk(secattr->attr.mls.cat, + iter + 1); + if (iter < 0) + break; + cat_size += (iter == 0 ? 
0 : sizeof(u16)); + if (cat_size > net_cat_len) + return -ENOSPC; + array[array_cnt++] = iter; + + iter = netlbl_secattr_catmap_walk_rng(secattr->attr.mls.cat, + iter); + if (iter < 0) + return -EFAULT; + cat_size += sizeof(u16); + if (cat_size > net_cat_len) + return -ENOSPC; + array[array_cnt++] = iter; + } + + for (iter = 0; array_cnt > 0;) { + *((__be16 *)&net_cat[iter]) = htons(array[--array_cnt]); + iter += 2; + array_cnt--; + if (array[array_cnt] != 0) { + *((__be16 *)&net_cat[iter]) = htons(array[array_cnt]); + iter += 2; + } + } + + return cat_size; +} + +/** + * cipso_v4_map_cat_rng_ntoh - Perform a category mapping from network to host + * @doi_def: the DOI definition + * @net_cat: the category list in network/CIPSO format + * @net_cat_len: the length of the CIPSO bitmap in bytes + * @secattr: the security attributes + * + * Description: + * Perform a label mapping to translate a CIPSO category list to the correct + * local MLS category bitmap using the given DOI definition. Returns zero on + * success, negative values on failure. + * + */ +static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def, + const unsigned char *net_cat, + u32 net_cat_len, + struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + u32 net_iter; + u16 cat_low; + u16 cat_high; + + for (net_iter = 0; net_iter < net_cat_len; net_iter += 4) { + cat_high = get_unaligned_be16(&net_cat[net_iter]); + if ((net_iter + 4) <= net_cat_len) + cat_low = get_unaligned_be16(&net_cat[net_iter + 2]); + else + cat_low = 0; + + ret_val = netlbl_secattr_catmap_setrng(secattr->attr.mls.cat, + cat_low, + cat_high, + GFP_ATOMIC); + if (ret_val != 0) + return ret_val; + } + + return 0; +} + +/* + * Protocol Handling Functions + */ + +/** + * cipso_v4_gentag_hdr - Generate a CIPSO option header + * @doi_def: the DOI definition + * @len: the total tag length in bytes, not including this header + * @buf: the CIPSO option buffer + * + * Description: + * Write a CIPSO header into the beginning of @buffer. + * + */ +static void cipso_v4_gentag_hdr(const struct cipso_v4_doi *doi_def, + unsigned char *buf, + u32 len) +{ + buf[0] = IPOPT_CIPSO; + buf[1] = CIPSO_V4_HDR_LEN + len; + *(__be32 *)&buf[2] = htonl(doi_def->doi); +} + +/** + * cipso_v4_gentag_rbm - Generate a CIPSO restricted bitmap tag (type #1) + * @doi_def: the DOI definition + * @secattr: the security attributes + * @buffer: the option buffer + * @buffer_len: length of buffer in bytes + * + * Description: + * Generate a CIPSO option using the restricted bitmap tag, tag type #1. The + * actual buffer length may be larger than the indicated size due to + * translation between host and network category bitmaps. Returns the size of + * the tag on success, negative values on failure. + * + */ +static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def, + const struct netlbl_lsm_secattr *secattr, + unsigned char *buffer, + u32 buffer_len) +{ + int ret_val; + u32 tag_len; + u32 level; + + if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0) + return -EPERM; + + ret_val = cipso_v4_map_lvl_hton(doi_def, + secattr->attr.mls.lvl, + &level); + if (ret_val != 0) + return ret_val; + + if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { + ret_val = cipso_v4_map_cat_rbm_hton(doi_def, + secattr, + &buffer[4], + buffer_len - 4); + if (ret_val < 0) + return ret_val; + + /* This will send packets using the "optimized" format when + * possible as specified in section 3.4.2.6 of the + * CIPSO draft. 
*/ + if (cipso_v4_rbm_optfmt && ret_val > 0 && ret_val <= 10) + tag_len = 14; + else + tag_len = 4 + ret_val; + } else + tag_len = 4; + + buffer[0] = CIPSO_V4_TAG_RBITMAP; + buffer[1] = tag_len; + buffer[3] = level; + + return tag_len; +} + +/** + * cipso_v4_parsetag_rbm - Parse a CIPSO restricted bitmap tag + * @doi_def: the DOI definition + * @tag: the CIPSO tag + * @secattr: the security attributes + * + * Description: + * Parse a CIPSO restricted bitmap tag (tag type #1) and return the security + * attributes in @secattr. Return zero on success, negatives values on + * failure. + * + */ +static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def, + const unsigned char *tag, + struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + u8 tag_len = tag[1]; + u32 level; + + ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); + if (ret_val != 0) + return ret_val; + secattr->attr.mls.lvl = level; + secattr->flags |= NETLBL_SECATTR_MLS_LVL; + + if (tag_len > 4) { + secattr->attr.mls.cat = + netlbl_secattr_catmap_alloc(GFP_ATOMIC); + if (secattr->attr.mls.cat == NULL) + return -ENOMEM; + + ret_val = cipso_v4_map_cat_rbm_ntoh(doi_def, + &tag[4], + tag_len - 4, + secattr); + if (ret_val != 0) { + netlbl_secattr_catmap_free(secattr->attr.mls.cat); + return ret_val; + } + + secattr->flags |= NETLBL_SECATTR_MLS_CAT; + } + + return 0; +} + +/** + * cipso_v4_gentag_enum - Generate a CIPSO enumerated tag (type #2) + * @doi_def: the DOI definition + * @secattr: the security attributes + * @buffer: the option buffer + * @buffer_len: length of buffer in bytes + * + * Description: + * Generate a CIPSO option using the enumerated tag, tag type #2. Returns the + * size of the tag on success, negative values on failure. + * + */ +static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def, + const struct netlbl_lsm_secattr *secattr, + unsigned char *buffer, + u32 buffer_len) +{ + int ret_val; + u32 tag_len; + u32 level; + + if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) + return -EPERM; + + ret_val = cipso_v4_map_lvl_hton(doi_def, + secattr->attr.mls.lvl, + &level); + if (ret_val != 0) + return ret_val; + + if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { + ret_val = cipso_v4_map_cat_enum_hton(doi_def, + secattr, + &buffer[4], + buffer_len - 4); + if (ret_val < 0) + return ret_val; + + tag_len = 4 + ret_val; + } else + tag_len = 4; + + buffer[0] = CIPSO_V4_TAG_ENUM; + buffer[1] = tag_len; + buffer[3] = level; + + return tag_len; +} + +/** + * cipso_v4_parsetag_enum - Parse a CIPSO enumerated tag + * @doi_def: the DOI definition + * @tag: the CIPSO tag + * @secattr: the security attributes + * + * Description: + * Parse a CIPSO enumerated tag (tag type #2) and return the security + * attributes in @secattr. Return zero on success, negatives values on + * failure. 
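All three MAC tags generated and parsed in this file share the same fixed prefix: byte 0 is the tag type, byte 1 the total tag length, byte 2 is left as zero (cipso_v4_genopt() zeroes the buffer before a tag is written), byte 3 carries the mapped sensitivity level, and category data, if any, starts at byte 4. A sketch of that shared view; the struct and helper names are illustrative only:

struct cipso_mac_tag_view {
        unsigned char type;             /* tag[0]: tag type (1, 2 or 5) */
        unsigned char len;              /* tag[1]: total tag length */
        unsigned char level;            /* tag[3]: mapped sensitivity level */
        const unsigned char *cats;      /* &tag[4], (len - 4) bytes, or NULL */
};

static void cipso_mac_tag_view_fill(const unsigned char *tag,
                                    struct cipso_mac_tag_view *view)
{
        view->type = tag[0];
        view->len = tag[1];
        view->level = tag[3];
        view->cats = tag[1] > 4 ? &tag[4] : NULL;
}

The "tag_len > 4" test used when parsing is the same check: anything past the four prefix bytes is category data.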
+ * + */ +static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def, + const unsigned char *tag, + struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + u8 tag_len = tag[1]; + u32 level; + + ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); + if (ret_val != 0) + return ret_val; + secattr->attr.mls.lvl = level; + secattr->flags |= NETLBL_SECATTR_MLS_LVL; + + if (tag_len > 4) { + secattr->attr.mls.cat = + netlbl_secattr_catmap_alloc(GFP_ATOMIC); + if (secattr->attr.mls.cat == NULL) + return -ENOMEM; + + ret_val = cipso_v4_map_cat_enum_ntoh(doi_def, + &tag[4], + tag_len - 4, + secattr); + if (ret_val != 0) { + netlbl_secattr_catmap_free(secattr->attr.mls.cat); + return ret_val; + } + + secattr->flags |= NETLBL_SECATTR_MLS_CAT; + } + + return 0; +} + +/** + * cipso_v4_gentag_rng - Generate a CIPSO ranged tag (type #5) + * @doi_def: the DOI definition + * @secattr: the security attributes + * @buffer: the option buffer + * @buffer_len: length of buffer in bytes + * + * Description: + * Generate a CIPSO option using the ranged tag, tag type #5. Returns the + * size of the tag on success, negative values on failure. + * + */ +static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def, + const struct netlbl_lsm_secattr *secattr, + unsigned char *buffer, + u32 buffer_len) +{ + int ret_val; + u32 tag_len; + u32 level; + + if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) + return -EPERM; + + ret_val = cipso_v4_map_lvl_hton(doi_def, + secattr->attr.mls.lvl, + &level); + if (ret_val != 0) + return ret_val; + + if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { + ret_val = cipso_v4_map_cat_rng_hton(doi_def, + secattr, + &buffer[4], + buffer_len - 4); + if (ret_val < 0) + return ret_val; + + tag_len = 4 + ret_val; + } else + tag_len = 4; + + buffer[0] = CIPSO_V4_TAG_RANGE; + buffer[1] = tag_len; + buffer[3] = level; + + return tag_len; +} + +/** + * cipso_v4_parsetag_rng - Parse a CIPSO ranged tag + * @doi_def: the DOI definition + * @tag: the CIPSO tag + * @secattr: the security attributes + * + * Description: + * Parse a CIPSO ranged tag (tag type #5) and return the security attributes + * in @secattr. Return zero on success, negatives values on failure. + * + */ +static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, + const unsigned char *tag, + struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + u8 tag_len = tag[1]; + u32 level; + + ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); + if (ret_val != 0) + return ret_val; + secattr->attr.mls.lvl = level; + secattr->flags |= NETLBL_SECATTR_MLS_LVL; + + if (tag_len > 4) { + secattr->attr.mls.cat = + netlbl_secattr_catmap_alloc(GFP_ATOMIC); + if (secattr->attr.mls.cat == NULL) + return -ENOMEM; + + ret_val = cipso_v4_map_cat_rng_ntoh(doi_def, + &tag[4], + tag_len - 4, + secattr); + if (ret_val != 0) { + netlbl_secattr_catmap_free(secattr->attr.mls.cat); + return ret_val; + } + + secattr->flags |= NETLBL_SECATTR_MLS_CAT; + } + + return 0; +} + +/** + * cipso_v4_gentag_loc - Generate a CIPSO local tag (non-standard) + * @doi_def: the DOI definition + * @secattr: the security attributes + * @buffer: the option buffer + * @buffer_len: length of buffer in bytes + * + * Description: + * Generate a CIPSO option using the local tag. Returns the size of the tag + * on success, negative values on failure. 
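The local tag handled below is deliberately minimal: two header bytes followed by the raw LSM secid in host byte order, which is only safe because cipso_v4_validate() refuses this tag unless it arrives on the loopback device, so the value never crosses a byte-order boundary. A sketch of the layout written by cipso_v4_gentag_loc(); the helper name is illustrative and tag_type/tag_blen stand in for the CIPSO_V4_TAG_LOCAL and CIPSO_V4_TAG_LOC_BLEN constants:

static void sketch_gentag_loc(unsigned char *buf, u32 secid,
                              unsigned char tag_type, unsigned char tag_blen)
{
        buf[0] = tag_type;              /* CIPSO_V4_TAG_LOCAL */
        buf[1] = tag_blen;              /* two header bytes plus four secid bytes */
        *(u32 *)&buf[2] = secid;        /* host byte order, loopback only */
}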
+ * + */ +static int cipso_v4_gentag_loc(const struct cipso_v4_doi *doi_def, + const struct netlbl_lsm_secattr *secattr, + unsigned char *buffer, + u32 buffer_len) +{ + if (!(secattr->flags & NETLBL_SECATTR_SECID)) + return -EPERM; + + buffer[0] = CIPSO_V4_TAG_LOCAL; + buffer[1] = CIPSO_V4_TAG_LOC_BLEN; + *(u32 *)&buffer[2] = secattr->attr.secid; + + return CIPSO_V4_TAG_LOC_BLEN; +} + +/** + * cipso_v4_parsetag_loc - Parse a CIPSO local tag + * @doi_def: the DOI definition + * @tag: the CIPSO tag + * @secattr: the security attributes + * + * Description: + * Parse a CIPSO local tag and return the security attributes in @secattr. + * Return zero on success, negatives values on failure. + * + */ +static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def, + const unsigned char *tag, + struct netlbl_lsm_secattr *secattr) +{ + secattr->attr.secid = *(u32 *)&tag[2]; + secattr->flags |= NETLBL_SECATTR_SECID; + + return 0; +} + +/** + * cipso_v4_validate - Validate a CIPSO option + * @option: the start of the option, on error it is set to point to the error + * + * Description: + * This routine is called to validate a CIPSO option, it checks all of the + * fields to ensure that they are at least valid, see the draft snippet below + * for details. If the option is valid then a zero value is returned and + * the value of @option is unchanged. If the option is invalid then a + * non-zero value is returned and @option is adjusted to point to the + * offending portion of the option. From the IETF draft ... + * + * "If any field within the CIPSO options, such as the DOI identifier, is not + * recognized the IP datagram is discarded and an ICMP 'parameter problem' + * (type 12) is generated and returned. The ICMP code field is set to 'bad + * parameter' (code 0) and the pointer is set to the start of the CIPSO field + * that is unrecognized." + * + */ +int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option) +{ + unsigned char *opt = *option; + unsigned char *tag; + unsigned char opt_iter; + unsigned char err_offset = 0; + u8 opt_len; + u8 tag_len; + struct cipso_v4_doi *doi_def = NULL; + u32 tag_iter; + + /* caller already checks for length values that are too large */ + opt_len = opt[1]; + if (opt_len < 8) { + err_offset = 1; + goto validate_return; + } + + rcu_read_lock(); + doi_def = cipso_v4_doi_search(get_unaligned_be32(&opt[2])); + if (doi_def == NULL) { + err_offset = 2; + goto validate_return_locked; + } + + opt_iter = CIPSO_V4_HDR_LEN; + tag = opt + opt_iter; + while (opt_iter < opt_len) { + for (tag_iter = 0; doi_def->tags[tag_iter] != tag[0];) + if (doi_def->tags[tag_iter] == CIPSO_V4_TAG_INVALID || + ++tag_iter == CIPSO_V4_TAG_MAXCNT) { + err_offset = opt_iter; + goto validate_return_locked; + } + + tag_len = tag[1]; + if (tag_len > (opt_len - opt_iter)) { + err_offset = opt_iter + 1; + goto validate_return_locked; + } + + switch (tag[0]) { + case CIPSO_V4_TAG_RBITMAP: + if (tag_len < CIPSO_V4_TAG_RBM_BLEN) { + err_offset = opt_iter + 1; + goto validate_return_locked; + } + + /* We are already going to do all the verification + * necessary at the socket layer so from our point of + * view it is safe to turn these checks off (and less + * work), however, the CIPSO draft says we should do + * all the CIPSO validations here but it doesn't + * really specify _exactly_ what we need to validate + * ... so, just make it a sysctl tunable. 
*/ + if (cipso_v4_rbm_strictvalid) { + if (cipso_v4_map_lvl_valid(doi_def, + tag[3]) < 0) { + err_offset = opt_iter + 3; + goto validate_return_locked; + } + if (tag_len > CIPSO_V4_TAG_RBM_BLEN && + cipso_v4_map_cat_rbm_valid(doi_def, + &tag[4], + tag_len - 4) < 0) { + err_offset = opt_iter + 4; + goto validate_return_locked; + } + } + break; + case CIPSO_V4_TAG_ENUM: + if (tag_len < CIPSO_V4_TAG_ENUM_BLEN) { + err_offset = opt_iter + 1; + goto validate_return_locked; + } + + if (cipso_v4_map_lvl_valid(doi_def, + tag[3]) < 0) { + err_offset = opt_iter + 3; + goto validate_return_locked; + } + if (tag_len > CIPSO_V4_TAG_ENUM_BLEN && + cipso_v4_map_cat_enum_valid(doi_def, + &tag[4], + tag_len - 4) < 0) { + err_offset = opt_iter + 4; + goto validate_return_locked; + } + break; + case CIPSO_V4_TAG_RANGE: + if (tag_len < CIPSO_V4_TAG_RNG_BLEN) { + err_offset = opt_iter + 1; + goto validate_return_locked; + } + + if (cipso_v4_map_lvl_valid(doi_def, + tag[3]) < 0) { + err_offset = opt_iter + 3; + goto validate_return_locked; + } + if (tag_len > CIPSO_V4_TAG_RNG_BLEN && + cipso_v4_map_cat_rng_valid(doi_def, + &tag[4], + tag_len - 4) < 0) { + err_offset = opt_iter + 4; + goto validate_return_locked; + } + break; + case CIPSO_V4_TAG_LOCAL: + /* This is a non-standard tag that we only allow for + * local connections, so if the incoming interface is + * not the loopback device drop the packet. */ + if (!(skb->dev->flags & IFF_LOOPBACK)) { + err_offset = opt_iter; + goto validate_return_locked; + } + if (tag_len != CIPSO_V4_TAG_LOC_BLEN) { + err_offset = opt_iter + 1; + goto validate_return_locked; + } + break; + default: + err_offset = opt_iter; + goto validate_return_locked; + } + + tag += tag_len; + opt_iter += tag_len; + } + +validate_return_locked: + rcu_read_unlock(); +validate_return: + *option = opt + err_offset; + return err_offset; +} + +/** + * cipso_v4_error - Send the correct response for a bad packet + * @skb: the packet + * @error: the error code + * @gateway: CIPSO gateway flag + * + * Description: + * Based on the error code given in @error, send an ICMP error message back to + * the originating host. From the IETF draft ... + * + * "If the contents of the CIPSO [option] are valid but the security label is + * outside of the configured host or port label range, the datagram is + * discarded and an ICMP 'destination unreachable' (type 3) is generated and + * returned. The code field of the ICMP is set to 'communication with + * destination network administratively prohibited' (code 9) or to + * 'communication with destination host administratively prohibited' + * (code 10). The value of the code is dependent on whether the originator + * of the ICMP message is acting as a CIPSO host or a CIPSO gateway. The + * recipient of the ICMP message MUST be able to handle either value. The + * same procedure is performed if a CIPSO [option] can not be added to an + * IP packet because it is too large to fit in the IP options area." + * + * "If the error is triggered by receipt of an ICMP message, the message is + * discarded and no response is permitted (consistent with general ICMP + * processing rules)." 
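cipso_v4_validate() above returns the offset of the offending byte and rewinds @option to point at it, which is what lets a caller build the ICMP "parameter problem" pointer quoted earlier, while cipso_v4_error() below covers the administratively-prohibited case. A sketch of how a hypothetical caller in the option-parsing path might use the adjusted pointer; sketch_check_cipso(), iph and optptr are assumptions, not names from this file:

/* returns 0 if the option is valid, otherwise the offset of the offending
 * byte from the start of the IP header, suitable for an ICMP type 12,
 * code 0 pointer field */
static int sketch_check_cipso(struct sk_buff *skb, struct iphdr *iph,
                              unsigned char *optptr)
{
        unsigned char *err_ptr = optptr;

        if (cipso_v4_validate(skb, &err_ptr) == 0)
                return 0;

        return err_ptr - (unsigned char *)iph;
}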
+ * + */ +void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) +{ + if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES) + return; + + if (gateway) + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0); + else + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0); +} + +/** + * cipso_v4_genopt - Generate a CIPSO option + * @buf: the option buffer + * @buf_len: the size of opt_buf + * @doi_def: the CIPSO DOI to use + * @secattr: the security attributes + * + * Description: + * Generate a CIPSO option using the DOI definition and security attributes + * passed to the function. Returns the length of the option on success and + * negative values on failure. + * + */ +static int cipso_v4_genopt(unsigned char *buf, u32 buf_len, + const struct cipso_v4_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + u32 iter; + + if (buf_len <= CIPSO_V4_HDR_LEN) + return -ENOSPC; + + /* XXX - This code assumes only one tag per CIPSO option which isn't + * really a good assumption to make but since we only support the MAC + * tags right now it is a safe assumption. */ + iter = 0; + do { + memset(buf, 0, buf_len); + switch (doi_def->tags[iter]) { + case CIPSO_V4_TAG_RBITMAP: + ret_val = cipso_v4_gentag_rbm(doi_def, + secattr, + &buf[CIPSO_V4_HDR_LEN], + buf_len - CIPSO_V4_HDR_LEN); + break; + case CIPSO_V4_TAG_ENUM: + ret_val = cipso_v4_gentag_enum(doi_def, + secattr, + &buf[CIPSO_V4_HDR_LEN], + buf_len - CIPSO_V4_HDR_LEN); + break; + case CIPSO_V4_TAG_RANGE: + ret_val = cipso_v4_gentag_rng(doi_def, + secattr, + &buf[CIPSO_V4_HDR_LEN], + buf_len - CIPSO_V4_HDR_LEN); + break; + case CIPSO_V4_TAG_LOCAL: + ret_val = cipso_v4_gentag_loc(doi_def, + secattr, + &buf[CIPSO_V4_HDR_LEN], + buf_len - CIPSO_V4_HDR_LEN); + break; + default: + return -EPERM; + } + + iter++; + } while (ret_val < 0 && + iter < CIPSO_V4_TAG_MAXCNT && + doi_def->tags[iter] != CIPSO_V4_TAG_INVALID); + if (ret_val < 0) + return ret_val; + cipso_v4_gentag_hdr(doi_def, buf, ret_val); + return CIPSO_V4_HDR_LEN + ret_val; +} + +static void opt_kfree_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct ip_options_rcu, rcu)); +} + +/** + * cipso_v4_sock_setattr - Add a CIPSO option to a socket + * @sk: the socket + * @doi_def: the CIPSO DOI to use + * @secattr: the specific security attributes of the socket + * + * Description: + * Set the CIPSO option on the given socket using the DOI definition and + * security attributes passed to the function. This function requires + * exclusive access to @sk, which means it either needs to be in the + * process of being created or locked. Returns zero on success and negative + * values on failure. + * + */ +int cipso_v4_sock_setattr(struct sock *sk, + const struct cipso_v4_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -EPERM; + unsigned char *buf = NULL; + u32 buf_len; + u32 opt_len; + struct ip_options_rcu *old, *opt = NULL; + struct inet_sock *sk_inet; + struct inet_connection_sock *sk_conn; + + /* In the case of sock_create_lite(), the sock->sk field is not + * defined yet but it is not a problem as the only users of these + * "lite" PF_INET sockets are functions which do an accept() call + * afterwards so we will label the socket as part of the accept(). */ + if (sk == NULL) + return 0; + + /* We allocate the maximum CIPSO option size here so we are probably + * being a little wasteful, but it makes our life _much_ easier later + * on and after all we are only talking about 40 bytes. 
*/ + buf_len = CIPSO_V4_OPT_LEN_MAX; + buf = kmalloc(buf_len, GFP_ATOMIC); + if (buf == NULL) { + ret_val = -ENOMEM; + goto socket_setattr_failure; + } + + ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr); + if (ret_val < 0) + goto socket_setattr_failure; + buf_len = ret_val; + + /* We can't use ip_options_get() directly because it makes a call to + * ip_options_get_alloc() which allocates memory with GFP_KERNEL and + * we won't always have CAP_NET_RAW even though we _always_ want to + * set the IPOPT_CIPSO option. */ + opt_len = (buf_len + 3) & ~3; + opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC); + if (opt == NULL) { + ret_val = -ENOMEM; + goto socket_setattr_failure; + } + memcpy(opt->opt.__data, buf, buf_len); + opt->opt.optlen = opt_len; + opt->opt.cipso = sizeof(struct iphdr); + kfree(buf); + buf = NULL; + + sk_inet = inet_sk(sk); + + old = rcu_dereference_protected(sk_inet->inet_opt, sock_owned_by_user(sk)); + if (sk_inet->is_icsk) { + sk_conn = inet_csk(sk); + if (old) + sk_conn->icsk_ext_hdr_len -= old->opt.optlen; + sk_conn->icsk_ext_hdr_len += opt->opt.optlen; + sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie); + } + rcu_assign_pointer(sk_inet->inet_opt, opt); + if (old) + call_rcu(&old->rcu, opt_kfree_rcu); + + return 0; + +socket_setattr_failure: + kfree(buf); + kfree(opt); + return ret_val; +} + +/** + * cipso_v4_req_setattr - Add a CIPSO option to a connection request socket + * @req: the connection request socket + * @doi_def: the CIPSO DOI to use + * @secattr: the specific security attributes of the socket + * + * Description: + * Set the CIPSO option on the given socket using the DOI definition and + * security attributes passed to the function. Returns zero on success and + * negative values on failure. + * + */ +int cipso_v4_req_setattr(struct request_sock *req, + const struct cipso_v4_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -EPERM; + unsigned char *buf = NULL; + u32 buf_len; + u32 opt_len; + struct ip_options_rcu *opt = NULL; + struct inet_request_sock *req_inet; + + /* We allocate the maximum CIPSO option size here so we are probably + * being a little wasteful, but it makes our life _much_ easier later + * on and after all we are only talking about 40 bytes. */ + buf_len = CIPSO_V4_OPT_LEN_MAX; + buf = kmalloc(buf_len, GFP_ATOMIC); + if (buf == NULL) { + ret_val = -ENOMEM; + goto req_setattr_failure; + } + + ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr); + if (ret_val < 0) + goto req_setattr_failure; + buf_len = ret_val; + + /* We can't use ip_options_get() directly because it makes a call to + * ip_options_get_alloc() which allocates memory with GFP_KERNEL and + * we won't always have CAP_NET_RAW even though we _always_ want to + * set the IPOPT_CIPSO option. 
*/ + opt_len = (buf_len + 3) & ~3; + opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC); + if (opt == NULL) { + ret_val = -ENOMEM; + goto req_setattr_failure; + } + memcpy(opt->opt.__data, buf, buf_len); + opt->opt.optlen = opt_len; + opt->opt.cipso = sizeof(struct iphdr); + kfree(buf); + buf = NULL; + + req_inet = inet_rsk(req); + opt = xchg(&req_inet->opt, opt); + if (opt) + call_rcu(&opt->rcu, opt_kfree_rcu); + + return 0; + +req_setattr_failure: + kfree(buf); + kfree(opt); + return ret_val; +} + +/** + * cipso_v4_delopt - Delete the CIPSO option from a set of IP options + * @opt_ptr: IP option pointer + * + * Description: + * Deletes the CIPSO IP option from a set of IP options and makes the necessary + * adjustments to the IP option structure. Returns zero on success, negative + * values on failure. + * + */ +static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr) +{ + int hdr_delta = 0; + struct ip_options_rcu *opt = *opt_ptr; + + if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) { + u8 cipso_len; + u8 cipso_off; + unsigned char *cipso_ptr; + int iter; + int optlen_new; + + cipso_off = opt->opt.cipso - sizeof(struct iphdr); + cipso_ptr = &opt->opt.__data[cipso_off]; + cipso_len = cipso_ptr[1]; + + if (opt->opt.srr > opt->opt.cipso) + opt->opt.srr -= cipso_len; + if (opt->opt.rr > opt->opt.cipso) + opt->opt.rr -= cipso_len; + if (opt->opt.ts > opt->opt.cipso) + opt->opt.ts -= cipso_len; + if (opt->opt.router_alert > opt->opt.cipso) + opt->opt.router_alert -= cipso_len; + opt->opt.cipso = 0; + + memmove(cipso_ptr, cipso_ptr + cipso_len, + opt->opt.optlen - cipso_off - cipso_len); + + /* determining the new total option length is tricky because of + * the padding necessary, the only thing i can think to do at + * this point is walk the options one-by-one, skipping the + * padding at the end to determine the actual option size and + * from there we can determine the new total option length */ + iter = 0; + optlen_new = 0; + while (iter < opt->opt.optlen) + if (opt->opt.__data[iter] != IPOPT_NOP) { + iter += opt->opt.__data[iter + 1]; + optlen_new = iter; + } else + iter++; + hdr_delta = opt->opt.optlen; + opt->opt.optlen = (optlen_new + 3) & ~3; + hdr_delta -= opt->opt.optlen; + } else { + /* only the cipso option was present on the socket so we can + * remove the entire option struct */ + *opt_ptr = NULL; + hdr_delta = opt->opt.optlen; + call_rcu(&opt->rcu, opt_kfree_rcu); + } + + return hdr_delta; +} + +/** + * cipso_v4_sock_delattr - Delete the CIPSO option from a socket + * @sk: the socket + * + * Description: + * Removes the CIPSO option from a socket, if present. + * + */ +void cipso_v4_sock_delattr(struct sock *sk) +{ + int hdr_delta; + struct ip_options_rcu *opt; + struct inet_sock *sk_inet; + + sk_inet = inet_sk(sk); + opt = rcu_dereference_protected(sk_inet->inet_opt, 1); + if (opt == NULL || opt->opt.cipso == 0) + return; + + hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt); + if (sk_inet->is_icsk && hdr_delta > 0) { + struct inet_connection_sock *sk_conn = inet_csk(sk); + sk_conn->icsk_ext_hdr_len -= hdr_delta; + sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie); + } +} + +/** + * cipso_v4_req_delattr - Delete the CIPSO option from a request socket + * @reg: the request socket + * + * Description: + * Removes the CIPSO option from a request socket, if present. 
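Both setattr paths above round the generated option up to a multiple of four bytes, since IPv4 option space is counted in 32-bit header words, and they rely on kzalloc() having zeroed the surplus bytes so the padding reads as end-of-option-list. A small stand-alone sketch of that rounding; ip_opt_pad_len() is an illustrative name:

#include <stdio.h>

static unsigned int ip_opt_pad_len(unsigned int len)
{
        return (len + 3) & ~3u;         /* same rounding as the setattr paths */
}

int main(void)
{
        /* e.g. a 14 byte CIPSO option occupies 16 bytes of option space */
        printf("%u %u %u\n", ip_opt_pad_len(14), ip_opt_pad_len(16),
               ip_opt_pad_len(21));     /* prints: 16 16 24 */
        return 0;
}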
+ * + */ +void cipso_v4_req_delattr(struct request_sock *req) +{ + struct ip_options_rcu *opt; + struct inet_request_sock *req_inet; + + req_inet = inet_rsk(req); + opt = req_inet->opt; + if (opt == NULL || opt->opt.cipso == 0) + return; + + cipso_v4_delopt(&req_inet->opt); +} + +/** + * cipso_v4_getattr - Helper function for the cipso_v4_*_getattr functions + * @cipso: the CIPSO v4 option + * @secattr: the security attributes + * + * Description: + * Inspect @cipso and return the security attributes in @secattr. Returns zero + * on success and negative values on failure. + * + */ +static int cipso_v4_getattr(const unsigned char *cipso, + struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -ENOMSG; + u32 doi; + struct cipso_v4_doi *doi_def; + + if (cipso_v4_cache_check(cipso, cipso[1], secattr) == 0) + return 0; + + doi = get_unaligned_be32(&cipso[2]); + rcu_read_lock(); + doi_def = cipso_v4_doi_search(doi); + if (doi_def == NULL) + goto getattr_return; + /* XXX - This code assumes only one tag per CIPSO option which isn't + * really a good assumption to make but since we only support the MAC + * tags right now it is a safe assumption. */ + switch (cipso[6]) { + case CIPSO_V4_TAG_RBITMAP: + ret_val = cipso_v4_parsetag_rbm(doi_def, &cipso[6], secattr); + break; + case CIPSO_V4_TAG_ENUM: + ret_val = cipso_v4_parsetag_enum(doi_def, &cipso[6], secattr); + break; + case CIPSO_V4_TAG_RANGE: + ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr); + break; + case CIPSO_V4_TAG_LOCAL: + ret_val = cipso_v4_parsetag_loc(doi_def, &cipso[6], secattr); + break; + } + if (ret_val == 0) + secattr->type = NETLBL_NLTYPE_CIPSOV4; + +getattr_return: + rcu_read_unlock(); + return ret_val; +} + +/** + * cipso_v4_sock_getattr - Get the security attributes from a sock + * @sk: the sock + * @secattr: the security attributes + * + * Description: + * Query @sk to see if there is a CIPSO option attached to the sock and if + * there is return the CIPSO security attributes in @secattr. This function + * requires that @sk be locked, or privately held, but it does not do any + * locking itself. Returns zero on success and negative values on failure. + * + */ +int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) +{ + struct ip_options_rcu *opt; + int res = -ENOMSG; + + rcu_read_lock(); + opt = rcu_dereference(inet_sk(sk)->inet_opt); + if (opt && opt->opt.cipso) + res = cipso_v4_getattr(opt->opt.__data + + opt->opt.cipso - + sizeof(struct iphdr), + secattr); + rcu_read_unlock(); + return res; +} + +/** + * cipso_v4_skbuff_setattr - Set the CIPSO option on a packet + * @skb: the packet + * @secattr: the security attributes + * + * Description: + * Set the CIPSO option on the given packet based on the security attributes. + * Returns a pointer to the IP header on success and NULL on failure. 
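cipso_v4_getattr() above depends on the fixed option prefix written by cipso_v4_gentag_hdr(): one byte of option type, one byte of total option length, a 32-bit network-order DOI, and the first (and, in this code, only) tag starting at offset 6. A sketch of that prefix; the struct and helper names are illustrative only:

struct cipso_opt_view {
        unsigned char opt_type;         /* cipso[0]: IPOPT_CIPSO */
        unsigned char opt_len;          /* cipso[1]: header plus tag bytes */
        u32 doi;                        /* cipso[2..5]: DOI, big endian */
        unsigned char first_tag;        /* cipso[6]: tag type to dispatch on */
};

static void cipso_opt_view_fill(const unsigned char *cipso,
                                struct cipso_opt_view *view)
{
        view->opt_type = cipso[0];
        view->opt_len = cipso[1];
        view->doi = get_unaligned_be32(&cipso[2]);  /* as cipso_v4_getattr() does */
        view->first_tag = cipso[6];
}

The skbuff path below then sizes the header as 5 + opt_len / 4 32-bit words, so a 16 byte padded option gives ihl 9, a 36 byte IPv4 header.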
+ * + */ +int cipso_v4_skbuff_setattr(struct sk_buff *skb, + const struct cipso_v4_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + struct iphdr *iph; + struct ip_options *opt = &IPCB(skb)->opt; + unsigned char buf[CIPSO_V4_OPT_LEN_MAX]; + u32 buf_len = CIPSO_V4_OPT_LEN_MAX; + u32 opt_len; + int len_delta; + + ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr); + if (ret_val < 0) + return ret_val; + buf_len = ret_val; + opt_len = (buf_len + 3) & ~3; + + /* we overwrite any existing options to ensure that we have enough + * room for the CIPSO option, the reason is that we _need_ to guarantee + * that the security label is applied to the packet - we do the same + * thing when using the socket options and it hasn't caused a problem, + * if we need to we can always revisit this choice later */ + + len_delta = opt_len - opt->optlen; + /* if we don't ensure enough headroom we could panic on the skb_push() + * call below so make sure we have enough, we are also "mangling" the + * packet so we should probably do a copy-on-write call anyway */ + ret_val = skb_cow(skb, skb_headroom(skb) + len_delta); + if (ret_val < 0) + return ret_val; + + if (len_delta > 0) { + /* we assume that the header + opt->optlen have already been + * "pushed" in ip_options_build() or similar */ + iph = ip_hdr(skb); + skb_push(skb, len_delta); + memmove((char *)iph - len_delta, iph, iph->ihl << 2); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + } else if (len_delta < 0) { + iph = ip_hdr(skb); + memset(iph + 1, IPOPT_NOP, opt->optlen); + } else + iph = ip_hdr(skb); + + if (opt->optlen > 0) + memset(opt, 0, sizeof(*opt)); + opt->optlen = opt_len; + opt->cipso = sizeof(struct iphdr); + opt->is_changed = 1; + + /* we have to do the following because we are being called from a + * netfilter hook which means the packet already has had the header + * fields populated and the checksum calculated - yes this means we + * are doing more work than needed but we do it to keep the core + * stack clean and tidy */ + memcpy(iph + 1, buf, buf_len); + if (opt_len > buf_len) + memset((char *)(iph + 1) + buf_len, 0, opt_len - buf_len); + if (len_delta != 0) { + iph->ihl = 5 + (opt_len >> 2); + iph->tot_len = htons(skb->len); + } + ip_send_check(iph); + + return 0; +} + +/** + * cipso_v4_skbuff_delattr - Delete any CIPSO options from a packet + * @skb: the packet + * + * Description: + * Removes any and all CIPSO options from the given packet. Returns zero on + * success, negative values on failure. + * + */ +int cipso_v4_skbuff_delattr(struct sk_buff *skb) +{ + int ret_val; + struct iphdr *iph; + struct ip_options *opt = &IPCB(skb)->opt; + unsigned char *cipso_ptr; + + if (opt->cipso == 0) + return 0; + + /* since we are changing the packet we should make a copy */ + ret_val = skb_cow(skb, skb_headroom(skb)); + if (ret_val < 0) + return ret_val; + + /* the easiest thing to do is just replace the cipso option with noop + * options since we don't change the size of the packet, although we + * still need to recalculate the checksum */ + + iph = ip_hdr(skb); + cipso_ptr = (unsigned char *)iph + opt->cipso; + memset(cipso_ptr, IPOPT_NOOP, cipso_ptr[1]); + opt->cipso = 0; + opt->is_changed = 1; + + ip_send_check(iph); + + return 0; +} + +/** + * cipso_v4_skbuff_getattr - Get the security attributes from the CIPSO option + * @skb: the packet + * @secattr: the security attributes + * + * Description: + * Parse the given packet's CIPSO option and return the security attributes. 
+ * Returns zero on success and negative values on failure. + * + */ +int cipso_v4_skbuff_getattr(const struct sk_buff *skb, + struct netlbl_lsm_secattr *secattr) +{ + return cipso_v4_getattr(CIPSO_V4_OPTPTR(skb), secattr); +} + +/* + * Setup Functions + */ + +/** + * cipso_v4_init - Initialize the CIPSO module + * + * Description: + * Initialize the CIPSO module and prepare it for use. Returns zero on success + * and negative values on failure. + * + */ +static int __init cipso_v4_init(void) +{ + int ret_val; + + ret_val = cipso_v4_cache_init(); + if (ret_val != 0) + panic("Failed to initialize the CIPSO/IPv4 cache (%d)\n", + ret_val); + + return 0; +} + +subsys_initcall(cipso_v4_init); diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c new file mode 100644 index 00000000..424fafbc --- /dev/null +++ b/net/ipv4/datagram.c @@ -0,0 +1,87 @@ +/* + * common UDP/RAW code + * Linux INET implementation + * + * Authors: + * Hideaki YOSHIFUJI + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct inet_sock *inet = inet_sk(sk); + struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; + struct flowi4 *fl4; + struct rtable *rt; + __be32 saddr; + int oif; + int err; + + + if (addr_len < sizeof(*usin)) + return -EINVAL; + + if (usin->sin_family != AF_INET) + return -EAFNOSUPPORT; + + sk_dst_reset(sk); + + lock_sock(sk); + + oif = sk->sk_bound_dev_if; + saddr = inet->inet_saddr; + if (ipv4_is_multicast(usin->sin_addr.s_addr)) { + if (!oif) + oif = inet->mc_index; + if (!saddr) + saddr = inet->mc_addr; + } + fl4 = &inet->cork.fl.u.ip4; + rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr, + RT_CONN_FLAGS(sk), oif, + sk->sk_protocol, + inet->inet_sport, usin->sin_port, sk, true); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + if (err == -ENETUNREACH) + IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); + goto out; + } + + if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) { + ip_rt_put(rt); + err = -EACCES; + goto out; + } + if (!inet->inet_saddr) + inet->inet_saddr = fl4->saddr; /* Update source address */ + if (!inet->inet_rcv_saddr) { + inet->inet_rcv_saddr = fl4->saddr; + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); + } + inet->inet_daddr = fl4->daddr; + inet->inet_dport = usin->sin_port; + sk->sk_state = TCP_ESTABLISHED; + inet->inet_id = jiffies; + + sk_dst_set(sk, &rt->dst); + err = 0; +out: + release_sock(sk); + return err; +} +EXPORT_SYMBOL(ip4_datagram_connect); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c new file mode 100644 index 00000000..c48323ad --- /dev/null +++ b/net/ipv4/devinet.c @@ -0,0 +1,1851 @@ +/* + * NET3 IP device support routines. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Derived from the IP parts of dev.c 1.0.19 + * Authors: Ross Biro + * Fred N. van Kempen, + * Mark Evans, + * + * Additional Authors: + * Alan Cox, + * Alexey Kuznetsov, + * + * Changes: + * Alexey Kuznetsov: pa_* fields are replaced with ifaddr + * lists. 
+ * Cyrus Durgin: updated for kmod + * Matthias Andree: in devinet_ioctl, compare label and + * address (4.4BSD alias style support), + * fall back to comparing just the label + * if no match found. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_SYSCTL +#include +#endif +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "fib_lookup.h" + +static struct ipv4_devconf ipv4_devconf = { + .data = { + [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, + [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, + [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, + [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, + }, +}; + +static struct ipv4_devconf ipv4_devconf_dflt = { + .data = { + [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, + [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, + [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, + [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, + [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1, + }, +}; + +#define IPV4_DEVCONF_DFLT(net, attr) \ + IPV4_DEVCONF((*net->ipv4.devconf_dflt), attr) + +static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { + [IFA_LOCAL] = { .type = NLA_U32 }, + [IFA_ADDRESS] = { .type = NLA_U32 }, + [IFA_BROADCAST] = { .type = NLA_U32 }, + [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, +}; + +/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE + * value. So if you change this define, make appropriate changes to + * inet_addr_hash as well. + */ +#define IN4_ADDR_HSIZE 256 +static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE]; +static DEFINE_SPINLOCK(inet_addr_hash_lock); + +static inline unsigned int inet_addr_hash(struct net *net, __be32 addr) +{ + u32 val = (__force u32) addr ^ hash_ptr(net, 8); + + return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) & + (IN4_ADDR_HSIZE - 1)); +} + +static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) +{ + unsigned int hash = inet_addr_hash(net, ifa->ifa_local); + + spin_lock(&inet_addr_hash_lock); + hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]); + spin_unlock(&inet_addr_hash_lock); +} + +static void inet_hash_remove(struct in_ifaddr *ifa) +{ + spin_lock(&inet_addr_hash_lock); + hlist_del_init_rcu(&ifa->hash); + spin_unlock(&inet_addr_hash_lock); +} + +/** + * __ip_dev_find - find the first device with a given source address. + * @net: the net namespace + * @addr: the source address + * @devref: if true, take a reference on the found device + * + * If a caller uses devref=false, it should be protected by RCU, or RTNL + */ +struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) +{ + unsigned int hash = inet_addr_hash(net, addr); + struct net_device *result = NULL; + struct in_ifaddr *ifa; + struct hlist_node *node; + + rcu_read_lock(); + hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) { + struct net_device *dev = ifa->ifa_dev->dev; + + if (!net_eq(dev_net(dev), net)) + continue; + if (ifa->ifa_local == addr) { + result = dev; + break; + } + } + if (!result) { + struct flowi4 fl4 = { .daddr = addr }; + struct fib_result res = { 0 }; + struct fib_table *local; + + /* Fallback to FIB local table so that communication + * over loopback subnets work. 
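+	 * For example, any address in 127.0.0.0/8 is covered by a local-table
+	 * route on the loopback interface even though only 127.0.0.1 is
+	 * normally configured on it, so the hash lookup above misses it while
+	 * the local table lookup below still identifies it as local.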
+ */ + local = fib_get_table(net, RT_TABLE_LOCAL); + if (local && + !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) && + res.type == RTN_LOCAL) + result = FIB_RES_DEV(res); + } + if (result && devref) + dev_hold(result); + rcu_read_unlock(); + return result; +} +EXPORT_SYMBOL(__ip_dev_find); + +static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); + +static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); +static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, + int destroy); +#ifdef CONFIG_SYSCTL +static void devinet_sysctl_register(struct in_device *idev); +static void devinet_sysctl_unregister(struct in_device *idev); +#else +static inline void devinet_sysctl_register(struct in_device *idev) +{ +} +static inline void devinet_sysctl_unregister(struct in_device *idev) +{ +} +#endif + +/* Locks all the inet devices. */ + +static struct in_ifaddr *inet_alloc_ifa(void) +{ + return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL); +} + +static void inet_rcu_free_ifa(struct rcu_head *head) +{ + struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head); + if (ifa->ifa_dev) + in_dev_put(ifa->ifa_dev); + kfree(ifa); +} + +static inline void inet_free_ifa(struct in_ifaddr *ifa) +{ + call_rcu(&ifa->rcu_head, inet_rcu_free_ifa); +} + +void in_dev_finish_destroy(struct in_device *idev) +{ + struct net_device *dev = idev->dev; + + WARN_ON(idev->ifa_list); + WARN_ON(idev->mc_list); +#ifdef NET_REFCNT_DEBUG + printk(KERN_DEBUG "in_dev_finish_destroy: %p=%s\n", + idev, dev ? dev->name : "NIL"); +#endif + dev_put(dev); + if (!idev->dead) + pr_err("Freeing alive in_device %p\n", idev); + else + kfree(idev); +} +EXPORT_SYMBOL(in_dev_finish_destroy); + +static struct in_device *inetdev_init(struct net_device *dev) +{ + struct in_device *in_dev; + + ASSERT_RTNL(); + + in_dev = kzalloc(sizeof(*in_dev), GFP_KERNEL); + if (!in_dev) + goto out; + memcpy(&in_dev->cnf, dev_net(dev)->ipv4.devconf_dflt, + sizeof(in_dev->cnf)); + in_dev->cnf.sysctl = NULL; + in_dev->dev = dev; + in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl); + if (!in_dev->arp_parms) + goto out_kfree; + if (IPV4_DEVCONF(in_dev->cnf, FORWARDING)) + dev_disable_lro(dev); + /* Reference in_dev->dev */ + dev_hold(dev); + /* Account for reference dev->ip_ptr (below) */ + in_dev_hold(in_dev); + + devinet_sysctl_register(in_dev); + ip_mc_init_dev(in_dev); + if (dev->flags & IFF_UP) + ip_mc_up(in_dev); + + /* we can receive as soon as ip_ptr is set -- do this last */ + rcu_assign_pointer(dev->ip_ptr, in_dev); +out: + return in_dev; +out_kfree: + kfree(in_dev); + in_dev = NULL; + goto out; +} + +static void in_dev_rcu_put(struct rcu_head *head) +{ + struct in_device *idev = container_of(head, struct in_device, rcu_head); + in_dev_put(idev); +} + +static void inetdev_destroy(struct in_device *in_dev) +{ + struct in_ifaddr *ifa; + struct net_device *dev; + + ASSERT_RTNL(); + + dev = in_dev->dev; + + in_dev->dead = 1; + + ip_mc_destroy_dev(in_dev); + + while ((ifa = in_dev->ifa_list) != NULL) { + inet_del_ifa(in_dev, &in_dev->ifa_list, 0); + inet_free_ifa(ifa); + } + + rcu_assign_pointer(dev->ip_ptr, NULL); + + devinet_sysctl_unregister(in_dev); + neigh_parms_release(&arp_tbl, in_dev->arp_parms); + arp_ifdown(dev); + + call_rcu(&in_dev->rcu_head, in_dev_rcu_put); +} + +int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b) +{ + rcu_read_lock(); + for_primary_ifa(in_dev) { + if (inet_ifa_match(a, ifa)) { + if (!b || inet_ifa_match(b, ifa)) { + rcu_read_unlock(); + return 1; + } + } + } 
endfor_ifa(in_dev); + rcu_read_unlock(); + return 0; +} + +static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, + int destroy, struct nlmsghdr *nlh, u32 pid) +{ + struct in_ifaddr *promote = NULL; + struct in_ifaddr *ifa, *ifa1 = *ifap; + struct in_ifaddr *last_prim = in_dev->ifa_list; + struct in_ifaddr *prev_prom = NULL; + int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev); + + ASSERT_RTNL(); + + /* 1. Deleting primary ifaddr forces deletion all secondaries + * unless alias promotion is set + **/ + + if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) { + struct in_ifaddr **ifap1 = &ifa1->ifa_next; + + while ((ifa = *ifap1) != NULL) { + if (!(ifa->ifa_flags & IFA_F_SECONDARY) && + ifa1->ifa_scope <= ifa->ifa_scope) + last_prim = ifa; + + if (!(ifa->ifa_flags & IFA_F_SECONDARY) || + ifa1->ifa_mask != ifa->ifa_mask || + !inet_ifa_match(ifa1->ifa_address, ifa)) { + ifap1 = &ifa->ifa_next; + prev_prom = ifa; + continue; + } + + if (!do_promote) { + inet_hash_remove(ifa); + *ifap1 = ifa->ifa_next; + + rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid); + blocking_notifier_call_chain(&inetaddr_chain, + NETDEV_DOWN, ifa); + inet_free_ifa(ifa); + } else { + promote = ifa; + break; + } + } + } + + /* On promotion all secondaries from subnet are changing + * the primary IP, we must remove all their routes silently + * and later to add them back with new prefsrc. Do this + * while all addresses are on the device list. + */ + for (ifa = promote; ifa; ifa = ifa->ifa_next) { + if (ifa1->ifa_mask == ifa->ifa_mask && + inet_ifa_match(ifa1->ifa_address, ifa)) + fib_del_ifaddr(ifa, ifa1); + } + + /* 2. Unlink it */ + + *ifap = ifa1->ifa_next; + inet_hash_remove(ifa1); + + /* 3. Announce address deletion */ + + /* Send message first, then call notifier. + At first sight, FIB update triggered by notifier + will refer to already deleted ifaddr, that could confuse + netlink listeners. It is not true: look, gated sees + that route deleted and if it still thinks that ifaddr + is valid, it will try to restore deleted routes... Grr. + So that, this order is correct. 
+ */ + rtmsg_ifa(RTM_DELADDR, ifa1, nlh, pid); + blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); + + if (promote) { + struct in_ifaddr *next_sec = promote->ifa_next; + + if (prev_prom) { + prev_prom->ifa_next = promote->ifa_next; + promote->ifa_next = last_prim->ifa_next; + last_prim->ifa_next = promote; + } + + promote->ifa_flags &= ~IFA_F_SECONDARY; + rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid); + blocking_notifier_call_chain(&inetaddr_chain, + NETDEV_UP, promote); + for (ifa = next_sec; ifa; ifa = ifa->ifa_next) { + if (ifa1->ifa_mask != ifa->ifa_mask || + !inet_ifa_match(ifa1->ifa_address, ifa)) + continue; + fib_add_ifaddr(ifa); + } + + } + if (destroy) + inet_free_ifa(ifa1); +} + +static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, + int destroy) +{ + __inet_del_ifa(in_dev, ifap, destroy, NULL, 0); +} + +static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, + u32 pid) +{ + struct in_device *in_dev = ifa->ifa_dev; + struct in_ifaddr *ifa1, **ifap, **last_primary; + + ASSERT_RTNL(); + + if (!ifa->ifa_local) { + inet_free_ifa(ifa); + return 0; + } + + ifa->ifa_flags &= ~IFA_F_SECONDARY; + last_primary = &in_dev->ifa_list; + + for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL; + ifap = &ifa1->ifa_next) { + if (!(ifa1->ifa_flags & IFA_F_SECONDARY) && + ifa->ifa_scope <= ifa1->ifa_scope) + last_primary = &ifa1->ifa_next; + if (ifa1->ifa_mask == ifa->ifa_mask && + inet_ifa_match(ifa1->ifa_address, ifa)) { + if (ifa1->ifa_local == ifa->ifa_local) { + inet_free_ifa(ifa); + return -EEXIST; + } + if (ifa1->ifa_scope != ifa->ifa_scope) { + inet_free_ifa(ifa); + return -EINVAL; + } + ifa->ifa_flags |= IFA_F_SECONDARY; + } + } + + if (!(ifa->ifa_flags & IFA_F_SECONDARY)) { + net_srandom(ifa->ifa_local); + ifap = last_primary; + } + + ifa->ifa_next = *ifap; + *ifap = ifa; + + inet_hash_insert(dev_net(in_dev->dev), ifa); + + /* Send message first, then call notifier. + Notifier will trigger FIB update, so that + listeners of netlink will know about new ifaddr */ + rtmsg_ifa(RTM_NEWADDR, ifa, nlh, pid); + blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); + + return 0; +} + +static int inet_insert_ifa(struct in_ifaddr *ifa) +{ + return __inet_insert_ifa(ifa, NULL, 0); +} + +static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) +{ + struct in_device *in_dev = __in_dev_get_rtnl(dev); + + ASSERT_RTNL(); + + if (!in_dev) { + inet_free_ifa(ifa); + return -ENOBUFS; + } + ipv4_devconf_setall(in_dev); + if (ifa->ifa_dev != in_dev) { + WARN_ON(ifa->ifa_dev); + in_dev_hold(in_dev); + ifa->ifa_dev = in_dev; + } + if (ipv4_is_loopback(ifa->ifa_local)) + ifa->ifa_scope = RT_SCOPE_HOST; + return inet_insert_ifa(ifa); +} + +/* Caller must hold RCU or RTNL : + * We dont take a reference on found in_device + */ +struct in_device *inetdev_by_index(struct net *net, int ifindex) +{ + struct net_device *dev; + struct in_device *in_dev = NULL; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); + if (dev) + in_dev = rcu_dereference_rtnl(dev->ip_ptr); + rcu_read_unlock(); + return in_dev; +} +EXPORT_SYMBOL(inetdev_by_index); + +/* Called only from RTNL semaphored context. No locks. 
*/ + +struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, + __be32 mask) +{ + ASSERT_RTNL(); + + for_primary_ifa(in_dev) { + if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa)) + return ifa; + } endfor_ifa(in_dev); + return NULL; +} + +static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[IFA_MAX+1]; + struct in_device *in_dev; + struct ifaddrmsg *ifm; + struct in_ifaddr *ifa, **ifap; + int err = -EINVAL; + + ASSERT_RTNL(); + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy); + if (err < 0) + goto errout; + + ifm = nlmsg_data(nlh); + in_dev = inetdev_by_index(net, ifm->ifa_index); + if (in_dev == NULL) { + err = -ENODEV; + goto errout; + } + + for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + ifap = &ifa->ifa_next) { + if (tb[IFA_LOCAL] && + ifa->ifa_local != nla_get_be32(tb[IFA_LOCAL])) + continue; + + if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], ifa->ifa_label)) + continue; + + if (tb[IFA_ADDRESS] && + (ifm->ifa_prefixlen != ifa->ifa_prefixlen || + !inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), ifa))) + continue; + + __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).pid); + return 0; + } + + err = -EADDRNOTAVAIL; +errout: + return err; +} + +static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh) +{ + struct nlattr *tb[IFA_MAX+1]; + struct in_ifaddr *ifa; + struct ifaddrmsg *ifm; + struct net_device *dev; + struct in_device *in_dev; + int err; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy); + if (err < 0) + goto errout; + + ifm = nlmsg_data(nlh); + err = -EINVAL; + if (ifm->ifa_prefixlen > 32 || tb[IFA_LOCAL] == NULL) + goto errout; + + dev = __dev_get_by_index(net, ifm->ifa_index); + err = -ENODEV; + if (dev == NULL) + goto errout; + + in_dev = __in_dev_get_rtnl(dev); + err = -ENOBUFS; + if (in_dev == NULL) + goto errout; + + ifa = inet_alloc_ifa(); + if (ifa == NULL) + /* + * A potential indev allocation can be left alive, it stays + * assigned to its device and is destroy with it. + */ + goto errout; + + ipv4_devconf_setall(in_dev); + in_dev_hold(in_dev); + + if (tb[IFA_ADDRESS] == NULL) + tb[IFA_ADDRESS] = tb[IFA_LOCAL]; + + INIT_HLIST_NODE(&ifa->hash); + ifa->ifa_prefixlen = ifm->ifa_prefixlen; + ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); + ifa->ifa_flags = ifm->ifa_flags; + ifa->ifa_scope = ifm->ifa_scope; + ifa->ifa_dev = in_dev; + + ifa->ifa_local = nla_get_be32(tb[IFA_LOCAL]); + ifa->ifa_address = nla_get_be32(tb[IFA_ADDRESS]); + + if (tb[IFA_BROADCAST]) + ifa->ifa_broadcast = nla_get_be32(tb[IFA_BROADCAST]); + + if (tb[IFA_LABEL]) + nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); + else + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + + return ifa; + +errout: + return ERR_PTR(err); +} + +static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct in_ifaddr *ifa; + + ASSERT_RTNL(); + + ifa = rtm_to_ifaddr(net, nlh); + if (IS_ERR(ifa)) + return PTR_ERR(ifa); + + return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).pid); +} + +/* + * Determine a default network mask, based on the IP address. + */ + +static inline int inet_abc_len(__be32 addr) +{ + int rc = -1; /* Something else, probably a multicast. 
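+			 * Classful defaults follow: 10.0.0.1 is class A and
+			 * gets an 8 bit prefix, 172.16.0.1 is class B and gets
+			 * 16, 192.168.1.1 is class C and gets 24, while
+			 * 0.0.0.0 reports 0 and multicast or reserved
+			 * addresses leave rc at -1.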
*/ + + if (ipv4_is_zeronet(addr)) + rc = 0; + else { + __u32 haddr = ntohl(addr); + + if (IN_CLASSA(haddr)) + rc = 8; + else if (IN_CLASSB(haddr)) + rc = 16; + else if (IN_CLASSC(haddr)) + rc = 24; + } + + return rc; +} + + +int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) +{ + struct ifreq ifr; + struct sockaddr_in sin_orig; + struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr; + struct in_device *in_dev; + struct in_ifaddr **ifap = NULL; + struct in_ifaddr *ifa = NULL; + struct net_device *dev; + char *colon; + int ret = -EFAULT; + int tryaddrmatch = 0; + + /* + * Fetch the caller's info block into kernel space + */ + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + goto out; + ifr.ifr_name[IFNAMSIZ - 1] = 0; + + /* save original address for comparison */ + memcpy(&sin_orig, sin, sizeof(*sin)); + + colon = strchr(ifr.ifr_name, ':'); + if (colon) + *colon = 0; + + dev_load(net, ifr.ifr_name); + + switch (cmd) { + case SIOCGIFADDR: /* Get interface address */ + case SIOCGIFBRDADDR: /* Get the broadcast address */ + case SIOCGIFDSTADDR: /* Get the destination address */ + case SIOCGIFNETMASK: /* Get the netmask for the interface */ + /* Note that these ioctls will not sleep, + so that we do not impose a lock. + One day we will be forced to put shlock here (I mean SMP) + */ + tryaddrmatch = (sin_orig.sin_family == AF_INET); + memset(sin, 0, sizeof(*sin)); + sin->sin_family = AF_INET; + break; + + case SIOCSIFFLAGS: + ret = -EACCES; + if (!capable(CAP_NET_ADMIN)) + goto out; + break; + case SIOCSIFADDR: /* Set interface address (and family) */ + case SIOCSIFBRDADDR: /* Set the broadcast address */ + case SIOCSIFDSTADDR: /* Set the destination address */ + case SIOCSIFNETMASK: /* Set the netmask for the interface */ + case SIOCKILLADDR: /* Nuke all sockets on this address */ + ret = -EACCES; + if (!capable(CAP_NET_ADMIN)) + goto out; + ret = -EINVAL; + if (sin->sin_family != AF_INET) + goto out; + break; + default: + ret = -EINVAL; + goto out; + } + + rtnl_lock(); + + ret = -ENODEV; + dev = __dev_get_by_name(net, ifr.ifr_name); + if (!dev) + goto done; + + if (colon) + *colon = ':'; + + in_dev = __in_dev_get_rtnl(dev); + if (in_dev) { + if (tryaddrmatch) { + /* Matthias Andree */ + /* compare label and address (4.4BSD style) */ + /* note: we only do this for a limited set of ioctls + and only if the original address family was AF_INET. + This is checked above. 
*/ + for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + ifap = &ifa->ifa_next) { + if (!strcmp(ifr.ifr_name, ifa->ifa_label) && + sin_orig.sin_addr.s_addr == + ifa->ifa_local) { + break; /* found */ + } + } + } + /* we didn't get a match, maybe the application is + 4.3BSD-style and passed in junk so we fall back to + comparing just the label */ + if (!ifa) { + for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + ifap = &ifa->ifa_next) + if (!strcmp(ifr.ifr_name, ifa->ifa_label)) + break; + } + } + + ret = -EADDRNOTAVAIL; + if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS + && cmd != SIOCKILLADDR) + goto done; + + switch (cmd) { + case SIOCGIFADDR: /* Get interface address */ + sin->sin_addr.s_addr = ifa->ifa_local; + goto rarok; + + case SIOCGIFBRDADDR: /* Get the broadcast address */ + sin->sin_addr.s_addr = ifa->ifa_broadcast; + goto rarok; + + case SIOCGIFDSTADDR: /* Get the destination address */ + sin->sin_addr.s_addr = ifa->ifa_address; + goto rarok; + + case SIOCGIFNETMASK: /* Get the netmask for the interface */ + sin->sin_addr.s_addr = ifa->ifa_mask; + goto rarok; + + case SIOCSIFFLAGS: + if (colon) { + ret = -EADDRNOTAVAIL; + if (!ifa) + break; + ret = 0; + if (!(ifr.ifr_flags & IFF_UP)) + inet_del_ifa(in_dev, ifap, 1); + break; + } + ret = dev_change_flags(dev, ifr.ifr_flags); + break; + + case SIOCSIFADDR: /* Set interface address (and family) */ + ret = -EINVAL; + if (inet_abc_len(sin->sin_addr.s_addr) < 0) + break; + + if (!ifa) { + ret = -ENOBUFS; + ifa = inet_alloc_ifa(); + INIT_HLIST_NODE(&ifa->hash); + if (!ifa) + break; + if (colon) + memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ); + else + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + } else { + ret = 0; + if (ifa->ifa_local == sin->sin_addr.s_addr) + break; + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_broadcast = 0; + ifa->ifa_scope = 0; + } + + ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr; + + if (!(dev->flags & IFF_POINTOPOINT)) { + ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address); + ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen); + if ((dev->flags & IFF_BROADCAST) && + ifa->ifa_prefixlen < 31) + ifa->ifa_broadcast = ifa->ifa_address | + ~ifa->ifa_mask; + } else { + ifa->ifa_prefixlen = 32; + ifa->ifa_mask = inet_make_mask(32); + } + ret = inet_set_ifa(dev, ifa); + break; + + case SIOCSIFBRDADDR: /* Set the broadcast address */ + ret = 0; + if (ifa->ifa_broadcast != sin->sin_addr.s_addr) { + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_broadcast = sin->sin_addr.s_addr; + inet_insert_ifa(ifa); + } + break; + + case SIOCSIFDSTADDR: /* Set the destination address */ + ret = 0; + if (ifa->ifa_address == sin->sin_addr.s_addr) + break; + ret = -EINVAL; + if (inet_abc_len(sin->sin_addr.s_addr) < 0) + break; + ret = 0; + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_address = sin->sin_addr.s_addr; + inet_insert_ifa(ifa); + break; + + case SIOCSIFNETMASK: /* Set the netmask for the interface */ + + /* + * The mask we set must be legal. + */ + ret = -EINVAL; + if (bad_mask(sin->sin_addr.s_addr, 0)) + break; + ret = 0; + if (ifa->ifa_mask != sin->sin_addr.s_addr) { + __be32 old_mask = ifa->ifa_mask; + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_mask = sin->sin_addr.s_addr; + ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask); + + /* See if current broadcast address matches + * with current netmask, then recalculate + * the broadcast address. Otherwise it's a + * funny address, so don't touch it since + * the user seems to know what (s)he's doing... 
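+			 * For example, moving 192.168.1.5 from a /24 to a /16
+			 * with the default broadcast 192.168.1.255
+			 * (ifa_local | ~old_mask) rewrites the broadcast to
+			 * 192.168.255.255, while a hand-set broadcast such as
+			 * 192.168.1.42 is left alone.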
+ */ + if ((dev->flags & IFF_BROADCAST) && + (ifa->ifa_prefixlen < 31) && + (ifa->ifa_broadcast == + (ifa->ifa_local|~old_mask))) { + ifa->ifa_broadcast = (ifa->ifa_local | + ~sin->sin_addr.s_addr); + } + inet_insert_ifa(ifa); + } + break; + case SIOCKILLADDR: /* Nuke all connections on this address */ + ret = tcp_nuke_addr(net, (struct sockaddr *) sin); + break; + } +done: + rtnl_unlock(); +out: + return ret; +rarok: + rtnl_unlock(); + ret = copy_to_user(arg, &ifr, sizeof(struct ifreq)) ? -EFAULT : 0; + goto out; +} + +static int inet_gifconf(struct net_device *dev, char __user *buf, int len) +{ + struct in_device *in_dev = __in_dev_get_rtnl(dev); + struct in_ifaddr *ifa; + struct ifreq ifr; + int done = 0; + + if (!in_dev) + goto out; + + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + if (!buf) { + done += sizeof(ifr); + continue; + } + if (len < (int) sizeof(ifr)) + break; + memset(&ifr, 0, sizeof(struct ifreq)); + if (ifa->ifa_label) + strcpy(ifr.ifr_name, ifa->ifa_label); + else + strcpy(ifr.ifr_name, dev->name); + + (*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET; + (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr = + ifa->ifa_local; + + if (copy_to_user(buf, &ifr, sizeof(struct ifreq))) { + done = -EFAULT; + break; + } + buf += sizeof(struct ifreq); + len -= sizeof(struct ifreq); + done += sizeof(struct ifreq); + } +out: + return done; +} + +__be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) +{ + __be32 addr = 0; + struct in_device *in_dev; + struct net *net = dev_net(dev); + + rcu_read_lock(); + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + goto no_in_dev; + + for_primary_ifa(in_dev) { + if (ifa->ifa_scope > scope) + continue; + if (!dst || inet_ifa_match(dst, ifa)) { + addr = ifa->ifa_local; + break; + } + if (!addr) + addr = ifa->ifa_local; + } endfor_ifa(in_dev); + + if (addr) + goto out_unlock; +no_in_dev: + + /* Not loopback addresses on loopback should be preferred + in this case. It is importnat that lo is the first interface + in dev_base list. + */ + for_each_netdev_rcu(net, dev) { + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + continue; + + for_primary_ifa(in_dev) { + if (ifa->ifa_scope != RT_SCOPE_LINK && + ifa->ifa_scope <= scope) { + addr = ifa->ifa_local; + goto out_unlock; + } + } endfor_ifa(in_dev); + } +out_unlock: + rcu_read_unlock(); + return addr; +} +EXPORT_SYMBOL(inet_select_addr); + +static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, + __be32 local, int scope) +{ + int same = 0; + __be32 addr = 0; + + for_ifa(in_dev) { + if (!addr && + (local == ifa->ifa_local || !local) && + ifa->ifa_scope <= scope) { + addr = ifa->ifa_local; + if (same) + break; + } + if (!same) { + same = (!local || inet_ifa_match(local, ifa)) && + (!dst || inet_ifa_match(dst, ifa)); + if (same && addr) { + if (local || !dst) + break; + /* Is the selected addr into dst subnet? */ + if (inet_ifa_match(addr, ifa)) + break; + /* No, then can we use new local src? */ + if (ifa->ifa_scope <= scope) { + addr = ifa->ifa_local; + break; + } + /* search for large dst subnet for addr */ + same = 0; + } + } + } endfor_ifa(in_dev); + + return same ? 
addr : 0; +} + +/* + * Confirm that local IP address exists using wildcards: + * - in_dev: only on this interface, 0=any interface + * - dst: only in the same subnet as dst, 0=any dst + * - local: address, 0=autoselect the local address + * - scope: maximum allowed scope value for the local address + */ +__be32 inet_confirm_addr(struct in_device *in_dev, + __be32 dst, __be32 local, int scope) +{ + __be32 addr = 0; + struct net_device *dev; + struct net *net; + + if (scope != RT_SCOPE_LINK) + return confirm_addr_indev(in_dev, dst, local, scope); + + net = dev_net(in_dev->dev); + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + in_dev = __in_dev_get_rcu(dev); + if (in_dev) { + addr = confirm_addr_indev(in_dev, dst, local, scope); + if (addr) + break; + } + } + rcu_read_unlock(); + + return addr; +} + +/* + * Device notifier + */ + +int register_inetaddr_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&inetaddr_chain, nb); +} +EXPORT_SYMBOL(register_inetaddr_notifier); + +int unregister_inetaddr_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&inetaddr_chain, nb); +} +EXPORT_SYMBOL(unregister_inetaddr_notifier); + +/* Rename ifa_labels for a device name change. Make some effort to preserve + * existing alias numbering and to create unique labels if possible. +*/ +static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) +{ + struct in_ifaddr *ifa; + int named = 0; + + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + char old[IFNAMSIZ], *dot; + + memcpy(old, ifa->ifa_label, IFNAMSIZ); + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + if (named++ == 0) + goto skip; + dot = strchr(old, ':'); + if (dot == NULL) { + sprintf(old, ":%d", named); + dot = old; + } + if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) + strcat(ifa->ifa_label, dot); + else + strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot); +skip: + rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); + } +} + +static inline bool inetdev_valid_mtu(unsigned mtu) +{ + return mtu >= 68; +} + +static void inetdev_send_gratuitous_arp(struct net_device *dev, + struct in_device *in_dev) + +{ + struct in_ifaddr *ifa; + + for (ifa = in_dev->ifa_list; ifa; + ifa = ifa->ifa_next) { + arp_send(ARPOP_REQUEST, ETH_P_ARP, + ifa->ifa_local, dev, + ifa->ifa_local, NULL, + dev->dev_addr, NULL); + } +} + +/* Called only under RTNL semaphore */ + +static int inetdev_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + struct in_device *in_dev = __in_dev_get_rtnl(dev); + + ASSERT_RTNL(); + + if (!in_dev) { + if (event == NETDEV_REGISTER) { + in_dev = inetdev_init(dev); + if (!in_dev) + return notifier_from_errno(-ENOMEM); + if (dev->flags & IFF_LOOPBACK) { + IN_DEV_CONF_SET(in_dev, NOXFRM, 1); + IN_DEV_CONF_SET(in_dev, NOPOLICY, 1); + } + } else if (event == NETDEV_CHANGEMTU) { + /* Re-enabling IP */ + if (inetdev_valid_mtu(dev->mtu)) + in_dev = inetdev_init(dev); + } + goto out; + } + + switch (event) { + case NETDEV_REGISTER: + printk(KERN_DEBUG "inetdev_event: bug\n"); + rcu_assign_pointer(dev->ip_ptr, NULL); + break; + case NETDEV_UP: + if (!inetdev_valid_mtu(dev->mtu)) + break; + if (dev->flags & IFF_LOOPBACK) { + struct in_ifaddr *ifa = inet_alloc_ifa(); + + if (ifa) { + INIT_HLIST_NODE(&ifa->hash); + ifa->ifa_local = + ifa->ifa_address = htonl(INADDR_LOOPBACK); + ifa->ifa_prefixlen = 8; + ifa->ifa_mask = inet_make_mask(8); + in_dev_hold(in_dev); + ifa->ifa_dev = in_dev; + ifa->ifa_scope = RT_SCOPE_HOST; + 
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + inet_insert_ifa(ifa); + } + } + ip_mc_up(in_dev); + /* fall through */ + case NETDEV_CHANGEADDR: + if (!IN_DEV_ARP_NOTIFY(in_dev)) + break; + /* fall through */ + case NETDEV_NOTIFY_PEERS: + /* Send gratuitous ARP to notify of link change */ + inetdev_send_gratuitous_arp(dev, in_dev); + break; + case NETDEV_DOWN: + ip_mc_down(in_dev); + break; + case NETDEV_PRE_TYPE_CHANGE: + ip_mc_unmap(in_dev); + break; + case NETDEV_POST_TYPE_CHANGE: + ip_mc_remap(in_dev); + break; + case NETDEV_CHANGEMTU: + if (inetdev_valid_mtu(dev->mtu)) + break; + /* disable IP when MTU is not enough */ + case NETDEV_UNREGISTER: + inetdev_destroy(in_dev); + break; + case NETDEV_CHANGENAME: + /* Do not notify about label change, this event is + * not interesting to applications using netlink. + */ + inetdev_changename(dev, in_dev); + + devinet_sysctl_unregister(in_dev); + devinet_sysctl_register(in_dev); + break; + } +out: + return NOTIFY_DONE; +} + +static struct notifier_block ip_netdev_notifier = { + .notifier_call = inetdev_event, +}; + +static inline size_t inet_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + + nla_total_size(4) /* IFA_ADDRESS */ + + nla_total_size(4) /* IFA_LOCAL */ + + nla_total_size(4) /* IFA_BROADCAST */ + + nla_total_size(IFNAMSIZ); /* IFA_LABEL */ +} + +static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, + u32 pid, u32 seq, int event, unsigned int flags) +{ + struct ifaddrmsg *ifm; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, pid, seq, event, sizeof(*ifm), flags); + if (nlh == NULL) + return -EMSGSIZE; + + ifm = nlmsg_data(nlh); + ifm->ifa_family = AF_INET; + ifm->ifa_prefixlen = ifa->ifa_prefixlen; + ifm->ifa_flags = ifa->ifa_flags|IFA_F_PERMANENT; + ifm->ifa_scope = ifa->ifa_scope; + ifm->ifa_index = ifa->ifa_dev->dev->ifindex; + + if (ifa->ifa_address) + NLA_PUT_BE32(skb, IFA_ADDRESS, ifa->ifa_address); + + if (ifa->ifa_local) + NLA_PUT_BE32(skb, IFA_LOCAL, ifa->ifa_local); + + if (ifa->ifa_broadcast) + NLA_PUT_BE32(skb, IFA_BROADCAST, ifa->ifa_broadcast); + + if (ifa->ifa_label[0]) + NLA_PUT_STRING(skb, IFA_LABEL, ifa->ifa_label); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + int h, s_h; + int idx, s_idx; + int ip_idx, s_ip_idx; + struct net_device *dev; + struct in_device *in_dev; + struct in_ifaddr *ifa; + struct hlist_head *head; + struct hlist_node *node; + + s_h = cb->args[0]; + s_idx = idx = cb->args[1]; + s_ip_idx = ip_idx = cb->args[2]; + + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + rcu_read_lock(); + hlist_for_each_entry_rcu(dev, node, head, index_hlist) { + if (idx < s_idx) + goto cont; + if (h > s_h || idx > s_idx) + s_ip_idx = 0; + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + goto cont; + + for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; + ifa = ifa->ifa_next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + if (inet_fill_ifaddr(skb, ifa, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWADDR, NLM_F_MULTI) <= 0) { + rcu_read_unlock(); + goto done; + } + } +cont: + idx++; + } + rcu_read_unlock(); + } + +done: + cb->args[0] = h; + cb->args[1] = idx; + cb->args[2] = ip_idx; + + return skb->len; +} + +static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, + u32 pid) +{ + struct sk_buff *skb; + u32 seq = nlh ? 
nlh->nlmsg_seq : 0; + int err = -ENOBUFS; + struct net *net; + + net = dev_net(ifa->ifa_dev->dev); + skb = nlmsg_new(inet_nlmsg_size(), GFP_KERNEL); + if (skb == NULL) + goto errout; + + err = inet_fill_ifaddr(skb, ifa, pid, seq, event, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in inet_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); +} + +static size_t inet_get_link_af_size(const struct net_device *dev) +{ + struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); + + if (!in_dev) + return 0; + + return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */ +} + +static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev) +{ + struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); + struct nlattr *nla; + int i; + + if (!in_dev) + return -ENODATA; + + nla = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4); + if (nla == NULL) + return -EMSGSIZE; + + for (i = 0; i < IPV4_DEVCONF_MAX; i++) + ((u32 *) nla_data(nla))[i] = in_dev->cnf.data[i]; + + return 0; +} + +static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = { + [IFLA_INET_CONF] = { .type = NLA_NESTED }, +}; + +static int inet_validate_link_af(const struct net_device *dev, + const struct nlattr *nla) +{ + struct nlattr *a, *tb[IFLA_INET_MAX+1]; + int err, rem; + + if (dev && !__in_dev_get_rtnl(dev)) + return -EAFNOSUPPORT; + + err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy); + if (err < 0) + return err; + + if (tb[IFLA_INET_CONF]) { + nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) { + int cfgid = nla_type(a); + + if (nla_len(a) < 4) + return -EINVAL; + + if (cfgid <= 0 || cfgid > IPV4_DEVCONF_MAX) + return -EINVAL; + } + } + + return 0; +} + +static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla) +{ + struct in_device *in_dev = __in_dev_get_rtnl(dev); + struct nlattr *a, *tb[IFLA_INET_MAX+1]; + int rem; + + if (!in_dev) + return -EAFNOSUPPORT; + + if (nla_parse_nested(tb, IFLA_INET_MAX, nla, NULL) < 0) + BUG(); + + if (tb[IFLA_INET_CONF]) { + nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) + ipv4_devconf_set(in_dev, nla_type(a), nla_get_u32(a)); + } + + return 0; +} + +#ifdef CONFIG_SYSCTL + +static void devinet_copy_dflt_conf(struct net *net, int i) +{ + struct net_device *dev; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + struct in_device *in_dev; + + in_dev = __in_dev_get_rcu(dev); + if (in_dev && !test_bit(i, in_dev->cnf.state)) + in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i]; + } + rcu_read_unlock(); +} + +/* called with RTNL locked */ +static void inet_forward_change(struct net *net) +{ + struct net_device *dev; + int on = IPV4_DEVCONF_ALL(net, FORWARDING); + + IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; + IPV4_DEVCONF_DFLT(net, FORWARDING) = on; + + for_each_netdev(net, dev) { + struct in_device *in_dev; + if (on) + dev_disable_lro(dev); + rcu_read_lock(); + in_dev = __in_dev_get_rcu(dev); + if (in_dev) + IN_DEV_CONF_SET(in_dev, FORWARDING, on); + rcu_read_unlock(); + } +} + +static int devinet_conf_proc(ctl_table *ctl, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int old_value = *(int *)ctl->data; + int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + int new_value = *(int *)ctl->data; + + if (write) { + struct ipv4_devconf *cnf = ctl->extra1; + struct net *net = ctl->extra2; + int i = (int 
*)ctl->data - cnf->data; + + set_bit(i, cnf->state); + + if (cnf == net->ipv4.devconf_dflt) + devinet_copy_dflt_conf(net, i); + if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1) + if ((new_value == 0) && (old_value != 0)) + rt_cache_flush(net, 0); + } + + return ret; +} + +static int devinet_sysctl_forward(ctl_table *ctl, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + loff_t pos = *ppos; + int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + + if (write && *valp != val) { + struct net *net = ctl->extra2; + + if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) { + if (!rtnl_trylock()) { + /* Restore the original values before restarting */ + *valp = val; + *ppos = pos; + return restart_syscall(); + } + if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) { + inet_forward_change(net); + } else if (*valp) { + struct ipv4_devconf *cnf = ctl->extra1; + struct in_device *idev = + container_of(cnf, struct in_device, cnf); + dev_disable_lro(idev->dev); + } + rtnl_unlock(); + rt_cache_flush(net, 0); + } + } + + return ret; +} + +static int ipv4_doint_and_flush(ctl_table *ctl, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + struct net *net = ctl->extra2; + + if (write && *valp != val) + rt_cache_flush(net, 0); + + return ret; +} + +#define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) \ + { \ + .procname = name, \ + .data = ipv4_devconf.data + \ + IPV4_DEVCONF_ ## attr - 1, \ + .maxlen = sizeof(int), \ + .mode = mval, \ + .proc_handler = proc, \ + .extra1 = &ipv4_devconf, \ + } + +#define DEVINET_SYSCTL_RW_ENTRY(attr, name) \ + DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc) + +#define DEVINET_SYSCTL_RO_ENTRY(attr, name) \ + DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc) + +#define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc) \ + DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc) + +#define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \ + DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush) + +static struct devinet_sysctl_table { + struct ctl_table_header *sysctl_header; + struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX]; + char *dev_name; +} devinet_sysctl = { + .devinet_vars = { + DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", + devinet_sysctl_forward), + DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"), + + DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"), + DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"), + DEVINET_SYSCTL_RW_ENTRY(SHARED_MEDIA, "shared_media"), + DEVINET_SYSCTL_RW_ENTRY(RP_FILTER, "rp_filter"), + DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"), + DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE, + "accept_source_route"), + DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"), + DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"), + DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"), + DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"), + DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"), + DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"), + DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"), + DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"), + DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"), + DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"), + DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), + DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), + DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"), + + 
DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), + DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), + DEVINET_SYSCTL_FLUSHING_ENTRY(FORCE_IGMP_VERSION, + "force_igmp_version"), + DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, + "promote_secondaries"), + }, +}; + +static int __devinet_sysctl_register(struct net *net, char *dev_name, + struct ipv4_devconf *p) +{ + int i; + struct devinet_sysctl_table *t; + +#define DEVINET_CTL_PATH_DEV 3 + + struct ctl_path devinet_ctl_path[] = { + { .procname = "net", }, + { .procname = "ipv4", }, + { .procname = "conf", }, + { /* to be set */ }, + { }, + }; + + t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL); + if (!t) + goto out; + + for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) { + t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf; + t->devinet_vars[i].extra1 = p; + t->devinet_vars[i].extra2 = net; + } + + /* + * Make a copy of dev_name, because '.procname' is regarded as const + * by sysctl and we wouldn't want anyone to change it under our feet + * (see SIOCSIFNAME). + */ + t->dev_name = kstrdup(dev_name, GFP_KERNEL); + if (!t->dev_name) + goto free; + + devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name; + + t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path, + t->devinet_vars); + if (!t->sysctl_header) + goto free_procname; + + p->sysctl = t; + return 0; + +free_procname: + kfree(t->dev_name); +free: + kfree(t); +out: + return -ENOBUFS; +} + +static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf) +{ + struct devinet_sysctl_table *t = cnf->sysctl; + + if (t == NULL) + return; + + cnf->sysctl = NULL; + unregister_net_sysctl_table(t->sysctl_header); + kfree(t->dev_name); + kfree(t); +} + +static void devinet_sysctl_register(struct in_device *idev) +{ + neigh_sysctl_register(idev->dev, idev->arp_parms, "ipv4", NULL); + __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, + &idev->cnf); +} + +static void devinet_sysctl_unregister(struct in_device *idev) +{ + __devinet_sysctl_unregister(&idev->cnf); + neigh_sysctl_unregister(idev->arp_parms); +} + +static struct ctl_table ctl_forward_entry[] = { + { + .procname = "ip_forward", + .data = &ipv4_devconf.data[ + IPV4_DEVCONF_FORWARDING - 1], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = devinet_sysctl_forward, + .extra1 = &ipv4_devconf, + .extra2 = &init_net, + }, + { }, +}; + +static __net_initdata struct ctl_path net_ipv4_path[] = { + { .procname = "net", }, + { .procname = "ipv4", }, + { }, +}; +#endif + +static __net_init int devinet_init_net(struct net *net) +{ + int err; + struct ipv4_devconf *all, *dflt; +#ifdef CONFIG_SYSCTL + struct ctl_table *tbl = ctl_forward_entry; + struct ctl_table_header *forw_hdr; +#endif + + err = -ENOMEM; + all = &ipv4_devconf; + dflt = &ipv4_devconf_dflt; + + if (!net_eq(net, &init_net)) { + all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL); + if (all == NULL) + goto err_alloc_all; + + dflt = kmemdup(dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL); + if (dflt == NULL) + goto err_alloc_dflt; + +#ifdef CONFIG_SYSCTL + tbl = kmemdup(tbl, sizeof(ctl_forward_entry), GFP_KERNEL); + if (tbl == NULL) + goto err_alloc_ctl; + + tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1]; + tbl[0].extra1 = all; + tbl[0].extra2 = net; +#endif + } + +#ifdef CONFIG_SYSCTL + err = __devinet_sysctl_register(net, "all", all); + if (err < 0) + goto err_reg_all; + + err = __devinet_sysctl_register(net, "default", dflt); + if (err < 0) + goto err_reg_dflt; + + err = -ENOMEM; + forw_hdr = 
register_net_sysctl_table(net, net_ipv4_path, tbl); + if (forw_hdr == NULL) + goto err_reg_ctl; + net->ipv4.forw_hdr = forw_hdr; +#endif + + net->ipv4.devconf_all = all; + net->ipv4.devconf_dflt = dflt; + return 0; + +#ifdef CONFIG_SYSCTL +err_reg_ctl: + __devinet_sysctl_unregister(dflt); +err_reg_dflt: + __devinet_sysctl_unregister(all); +err_reg_all: + if (tbl != ctl_forward_entry) + kfree(tbl); +err_alloc_ctl: +#endif + if (dflt != &ipv4_devconf_dflt) + kfree(dflt); +err_alloc_dflt: + if (all != &ipv4_devconf) + kfree(all); +err_alloc_all: + return err; +} + +static __net_exit void devinet_exit_net(struct net *net) +{ +#ifdef CONFIG_SYSCTL + struct ctl_table *tbl; + + tbl = net->ipv4.forw_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->ipv4.forw_hdr); + __devinet_sysctl_unregister(net->ipv4.devconf_dflt); + __devinet_sysctl_unregister(net->ipv4.devconf_all); + kfree(tbl); +#endif + kfree(net->ipv4.devconf_dflt); + kfree(net->ipv4.devconf_all); +} + +static __net_initdata struct pernet_operations devinet_ops = { + .init = devinet_init_net, + .exit = devinet_exit_net, +}; + +static struct rtnl_af_ops inet_af_ops = { + .family = AF_INET, + .fill_link_af = inet_fill_link_af, + .get_link_af_size = inet_get_link_af_size, + .validate_link_af = inet_validate_link_af, + .set_link_af = inet_set_link_af, +}; + +void __init devinet_init(void) +{ + int i; + + for (i = 0; i < IN4_ADDR_HSIZE; i++) + INIT_HLIST_HEAD(&inet_addr_lst[i]); + + register_pernet_subsys(&devinet_ops); + + register_gifconf(PF_INET, inet_gifconf); + register_netdevice_notifier(&ip_netdev_notifier); + + rtnl_af_register(&inet_af_ops); + + rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL); + rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL); + rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr); +} + diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c new file mode 100644 index 00000000..530787bc --- /dev/null +++ b/net/ipv4/esp4.c @@ -0,0 +1,725 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct esp_skb_cb { + struct xfrm_skb_cb xfrm; + void *tmp; +}; + +#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0])) + +static u32 esp4_get_mtu(struct xfrm_state *x, int mtu); + +/* + * Allocate an AEAD request structure with extra space for SG and IV. + * + * For alignment considerations the IV is placed at the front, followed + * by the request and finally the SG list. + * + * TODO: Use spare space in skb for this where possible. + */ +static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen) +{ + unsigned int len; + + len = seqhilen; + + len += crypto_aead_ivsize(aead); + + if (len) { + len += crypto_aead_alignmask(aead) & + ~(crypto_tfm_ctx_alignment() - 1); + len = ALIGN(len, crypto_tfm_ctx_alignment()); + } + + len += sizeof(struct aead_givcrypt_request) + crypto_aead_reqsize(aead); + len = ALIGN(len, __alignof__(struct scatterlist)); + + len += sizeof(struct scatterlist) * nfrags; + + return kmalloc(len, GFP_ATOMIC); +} + +static inline __be32 *esp_tmp_seqhi(void *tmp) +{ + return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32)); +} +static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen) +{ + return crypto_aead_ivsize(aead) ? 
+ PTR_ALIGN((u8 *)tmp + seqhilen, + crypto_aead_alignmask(aead) + 1) : tmp + seqhilen; +} + +static inline struct aead_givcrypt_request *esp_tmp_givreq( + struct crypto_aead *aead, u8 *iv) +{ + struct aead_givcrypt_request *req; + + req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead), + crypto_tfm_ctx_alignment()); + aead_givcrypt_set_tfm(req, aead); + return req; +} + +static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv) +{ + struct aead_request *req; + + req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead), + crypto_tfm_ctx_alignment()); + aead_request_set_tfm(req, aead); + return req; +} + +static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, + struct aead_request *req) +{ + return (void *)ALIGN((unsigned long)(req + 1) + + crypto_aead_reqsize(aead), + __alignof__(struct scatterlist)); +} + +static inline struct scatterlist *esp_givreq_sg( + struct crypto_aead *aead, struct aead_givcrypt_request *req) +{ + return (void *)ALIGN((unsigned long)(req + 1) + + crypto_aead_reqsize(aead), + __alignof__(struct scatterlist)); +} + +static void esp_output_done(struct crypto_async_request *base, int err) +{ + struct sk_buff *skb = base->data; + + kfree(ESP_SKB_CB(skb)->tmp); + xfrm_output_resume(skb, err); +} + +static int esp_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + struct ip_esp_hdr *esph; + struct crypto_aead *aead; + struct aead_givcrypt_request *req; + struct scatterlist *sg; + struct scatterlist *asg; + struct esp_data *esp; + struct sk_buff *trailer; + void *tmp; + u8 *iv; + u8 *tail; + int blksize; + int clen; + int alen; + int plen; + int tfclen; + int nfrags; + int assoclen; + int sglists; + int seqhilen; + __be32 *seqhi; + + /* skb is pure payload to encrypt */ + + err = -ENOMEM; + + esp = x->data; + aead = esp->aead; + alen = crypto_aead_authsize(aead); + + tfclen = 0; + if (x->tfcpad) { + struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); + u32 padto; + + padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached)); + if (skb->len < padto) + tfclen = padto - skb->len; + } + blksize = ALIGN(crypto_aead_blocksize(aead), 4); + clen = ALIGN(skb->len + 2 + tfclen, blksize); + if (esp->padlen) + clen = ALIGN(clen, esp->padlen); + plen = clen - skb->len - tfclen; + + err = skb_cow_data(skb, tfclen + plen + alen, &trailer); + if (err < 0) + goto error; + nfrags = err; + + assoclen = sizeof(*esph); + sglists = 1; + seqhilen = 0; + + if (x->props.flags & XFRM_STATE_ESN) { + sglists += 2; + seqhilen += sizeof(__be32); + assoclen += seqhilen; + } + + tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); + if (!tmp) + goto error; + + seqhi = esp_tmp_seqhi(tmp); + iv = esp_tmp_iv(aead, tmp, seqhilen); + req = esp_tmp_givreq(aead, iv); + asg = esp_givreq_sg(aead, req); + sg = asg + sglists; + + /* Fill padding... 
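+ The ESP trailer is self-describing (RFC 4303): after any TFC padding,
+ the pad bytes take the values 1, 2, 3, ..., followed by one byte for
+ the pad length (plen - 2) and one byte for the next-header protocol.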
*/ + tail = skb_tail_pointer(trailer); + if (tfclen) { + memset(tail, 0, tfclen); + tail += tfclen; + } + do { + int i; + for (i = 0; i < plen - 2; i++) + tail[i] = i + 1; + } while (0); + tail[plen - 2] = plen - 2; + tail[plen - 1] = *skb_mac_header(skb); + pskb_put(skb, trailer, clen - skb->len + alen); + + skb_push(skb, -skb_network_offset(skb)); + esph = ip_esp_hdr(skb); + *skb_mac_header(skb) = IPPROTO_ESP; + + /* this is non-NULL only with UDP Encapsulation */ + if (x->encap) { + struct xfrm_encap_tmpl *encap = x->encap; + struct udphdr *uh; + __be32 *udpdata32; + __be16 sport, dport; + int encap_type; + + spin_lock_bh(&x->lock); + sport = encap->encap_sport; + dport = encap->encap_dport; + encap_type = encap->encap_type; + spin_unlock_bh(&x->lock); + + uh = (struct udphdr *)esph; + uh->source = sport; + uh->dest = dport; + uh->len = htons(skb->len - skb_transport_offset(skb)); + uh->check = 0; + + switch (encap_type) { + default: + case UDP_ENCAP_ESPINUDP: + esph = (struct ip_esp_hdr *)(uh + 1); + break; + case UDP_ENCAP_ESPINUDP_NON_IKE: + udpdata32 = (__be32 *)(uh + 1); + udpdata32[0] = udpdata32[1] = 0; + esph = (struct ip_esp_hdr *)(udpdata32 + 2); + break; + } + + *skb_mac_header(skb) = IPPROTO_UDP; + } + + esph->spi = x->id.spi; + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, + esph->enc_data + crypto_aead_ivsize(aead) - skb->data, + clen + alen); + + if ((x->props.flags & XFRM_STATE_ESN)) { + sg_init_table(asg, 3); + sg_set_buf(asg, &esph->spi, sizeof(__be32)); + *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + sg_set_buf(asg + 1, seqhi, seqhilen); + sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); + } else + sg_init_one(asg, esph, sizeof(*esph)); + + aead_givcrypt_set_callback(req, 0, esp_output_done, skb); + aead_givcrypt_set_crypt(req, sg, sg, clen, iv); + aead_givcrypt_set_assoc(req, asg, assoclen); + aead_givcrypt_set_giv(req, esph->enc_data, + XFRM_SKB_CB(skb)->seq.output.low); + + ESP_SKB_CB(skb)->tmp = tmp; + err = crypto_aead_givencrypt(req); + if (err == -EINPROGRESS) + goto error; + + if (err == -EBUSY) + err = NET_XMIT_DROP; + + kfree(tmp); + +error: + return err; +} + +static int esp_input_done2(struct sk_buff *skb, int err) +{ + const struct iphdr *iph; + struct xfrm_state *x = xfrm_input_state(skb); + struct esp_data *esp = x->data; + struct crypto_aead *aead = esp->aead; + int alen = crypto_aead_authsize(aead); + int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); + int elen = skb->len - hlen; + int ihl; + u8 nexthdr[2]; + int padlen; + + kfree(ESP_SKB_CB(skb)->tmp); + + if (unlikely(err)) + goto out; + + if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2)) + BUG(); + + err = -EINVAL; + padlen = nexthdr[0]; + if (padlen + 2 + alen >= elen) + goto out; + + /* ... check padding bits here. Silly. :-) */ + + iph = ip_hdr(skb); + ihl = iph->ihl * 4; + + if (x->encap) { + struct xfrm_encap_tmpl *encap = x->encap; + struct udphdr *uh = (void *)(skb_network_header(skb) + ihl); + + /* + * 1) if the NAT-T peer's IP or port changed then + * advertize the change to the keying daemon. + * This is an inbound SA, so just compare + * SRC ports. + */ + if (iph->saddr != x->props.saddr.a4 || + uh->source != encap->encap_sport) { + xfrm_address_t ipaddr; + + ipaddr.a4 = iph->saddr; + km_new_mapping(x, &ipaddr, uh->source); + + /* XXX: perhaps add an extra + * policy check here, to see + * if we should allow or + * reject a packet from a + * different source + * address/port. 
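+ * For now km_new_mapping() merely advertises the new
+ * mapping to the keying daemon and the packet is
+ * processed as usual.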
+ */ + } + + /* + * 2) ignore UDP/TCP checksums in case + * of NAT-T in Transport Mode, or + * perform other post-processing fixes + * as per draft-ietf-ipsec-udp-encaps-06, + * section 3.1.2 + */ + if (x->props.mode == XFRM_MODE_TRANSPORT) + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + + pskb_trim(skb, skb->len - alen - padlen - 2); + __skb_pull(skb, hlen); + skb_set_transport_header(skb, -ihl); + + err = nexthdr[1]; + + /* RFC4303: Drop dummy packets without any error */ + if (err == IPPROTO_NONE) + err = -EINVAL; + +out: + return err; +} + +static void esp_input_done(struct crypto_async_request *base, int err) +{ + struct sk_buff *skb = base->data; + + xfrm_input_resume(skb, esp_input_done2(skb, err)); +} + +/* + * Note: detecting truncated vs. non-truncated authentication data is very + * expensive, so we only support truncated data, which is the recommended + * and common case. + */ +static int esp_input(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ip_esp_hdr *esph; + struct esp_data *esp = x->data; + struct crypto_aead *aead = esp->aead; + struct aead_request *req; + struct sk_buff *trailer; + int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead); + int nfrags; + int assoclen; + int sglists; + int seqhilen; + __be32 *seqhi; + void *tmp; + u8 *iv; + struct scatterlist *sg; + struct scatterlist *asg; + int err = -EINVAL; + + if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) + goto out; + + if (elen <= 0) + goto out; + + if ((err = skb_cow_data(skb, 0, &trailer)) < 0) + goto out; + nfrags = err; + + assoclen = sizeof(*esph); + sglists = 1; + seqhilen = 0; + + if (x->props.flags & XFRM_STATE_ESN) { + sglists += 2; + seqhilen += sizeof(__be32); + assoclen += seqhilen; + } + + err = -ENOMEM; + tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); + if (!tmp) + goto out; + + ESP_SKB_CB(skb)->tmp = tmp; + seqhi = esp_tmp_seqhi(tmp); + iv = esp_tmp_iv(aead, tmp, seqhilen); + req = esp_tmp_req(aead, iv); + asg = esp_req_sg(aead, req); + sg = asg + sglists; + + skb->ip_summed = CHECKSUM_NONE; + + esph = (struct ip_esp_hdr *)skb->data; + + /* Get ivec. This can be wrong, check against another impls. 
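+ Here the IV is read straight from the start of the ESP payload
+ (esph->enc_data), i.e. the bytes following the SPI and sequence number.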
*/ + iv = esph->enc_data; + + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen); + + if ((x->props.flags & XFRM_STATE_ESN)) { + sg_init_table(asg, 3); + sg_set_buf(asg, &esph->spi, sizeof(__be32)); + *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; + sg_set_buf(asg + 1, seqhi, seqhilen); + sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); + } else + sg_init_one(asg, esph, sizeof(*esph)); + + aead_request_set_callback(req, 0, esp_input_done, skb); + aead_request_set_crypt(req, sg, sg, elen, iv); + aead_request_set_assoc(req, asg, assoclen); + + err = crypto_aead_decrypt(req); + if (err == -EINPROGRESS) + goto out; + + err = esp_input_done2(skb, err); + +out: + return err; +} + +static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) +{ + struct esp_data *esp = x->data; + u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4); + u32 align = max_t(u32, blksize, esp->padlen); + unsigned int net_adj; + + switch (x->props.mode) { + case XFRM_MODE_TRANSPORT: + case XFRM_MODE_BEET: + net_adj = sizeof(struct iphdr); + break; + case XFRM_MODE_TUNNEL: + net_adj = 0; + break; + default: + BUG(); + } + + return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) - + net_adj) & ~(align - 1)) + (net_adj - 2); +} + +static void esp4_err(struct sk_buff *skb, u32 info) +{ + struct net *net = dev_net(skb->dev); + const struct iphdr *iph = (const struct iphdr *)skb->data; + struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); + struct xfrm_state *x; + + if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || + icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return; + + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + esph->spi, IPPROTO_ESP, AF_INET); + if (!x) + return; + NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", + ntohl(esph->spi), ntohl(iph->daddr)); + xfrm_state_put(x); +} + +static void esp_destroy(struct xfrm_state *x) +{ + struct esp_data *esp = x->data; + + if (!esp) + return; + + crypto_free_aead(esp->aead); + kfree(esp); +} + +static int esp_init_aead(struct xfrm_state *x) +{ + struct esp_data *esp = x->data; + struct crypto_aead *aead; + int err; + + aead = crypto_alloc_aead(x->aead->alg_name, 0, 0); + err = PTR_ERR(aead); + if (IS_ERR(aead)) + goto error; + + esp->aead = aead; + + err = crypto_aead_setkey(aead, x->aead->alg_key, + (x->aead->alg_key_len + 7) / 8); + if (err) + goto error; + + err = crypto_aead_setauthsize(aead, x->aead->alg_icv_len / 8); + if (err) + goto error; + +error: + return err; +} + +static int esp_init_authenc(struct xfrm_state *x) +{ + struct esp_data *esp = x->data; + struct crypto_aead *aead; + struct crypto_authenc_key_param *param; + struct rtattr *rta; + char *key; + char *p; + char authenc_name[CRYPTO_MAX_ALG_NAME]; + unsigned int keylen; + int err; + + err = -EINVAL; + if (x->ealg == NULL) + goto error; + + err = -ENAMETOOLONG; + + if ((x->props.flags & XFRM_STATE_ESN)) { + if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, + "authencesn(%s,%s)", + x->aalg ? x->aalg->alg_name : "digest_null", + x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + goto error; + } else { + if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, + "authenc(%s,%s)", + x->aalg ? x->aalg->alg_name : "digest_null", + x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + goto error; + } + + aead = crypto_alloc_aead(authenc_name, 0, 0); + err = PTR_ERR(aead); + if (IS_ERR(aead)) + goto error; + + esp->aead = aead; + + keylen = (x->aalg ? 
(x->aalg->alg_key_len + 7) / 8 : 0) + + (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param)); + err = -ENOMEM; + key = kmalloc(keylen, GFP_KERNEL); + if (!key) + goto error; + + p = key; + rta = (void *)p; + rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM; + rta->rta_len = RTA_LENGTH(sizeof(*param)); + param = RTA_DATA(rta); + p += RTA_SPACE(sizeof(*param)); + + if (x->aalg) { + struct xfrm_algo_desc *aalg_desc; + + memcpy(p, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8); + p += (x->aalg->alg_key_len + 7) / 8; + + aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); + BUG_ON(!aalg_desc); + + err = -EINVAL; + if (aalg_desc->uinfo.auth.icv_fullbits/8 != + crypto_aead_authsize(aead)) { + NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n", + x->aalg->alg_name, + crypto_aead_authsize(aead), + aalg_desc->uinfo.auth.icv_fullbits/8); + goto free_key; + } + + err = crypto_aead_setauthsize( + aead, x->aalg->alg_trunc_len / 8); + if (err) + goto free_key; + } + + param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8); + memcpy(p, x->ealg->alg_key, (x->ealg->alg_key_len + 7) / 8); + + err = crypto_aead_setkey(aead, key, keylen); + +free_key: + kfree(key); + +error: + return err; +} + +static int esp_init_state(struct xfrm_state *x) +{ + struct esp_data *esp; + struct crypto_aead *aead; + u32 align; + int err; + + esp = kzalloc(sizeof(*esp), GFP_KERNEL); + if (esp == NULL) + return -ENOMEM; + + x->data = esp; + + if (x->aead) + err = esp_init_aead(x); + else + err = esp_init_authenc(x); + + if (err) + goto error; + + aead = esp->aead; + + esp->padlen = 0; + + x->props.header_len = sizeof(struct ip_esp_hdr) + + crypto_aead_ivsize(aead); + if (x->props.mode == XFRM_MODE_TUNNEL) + x->props.header_len += sizeof(struct iphdr); + else if (x->props.mode == XFRM_MODE_BEET && x->sel.family != AF_INET6) + x->props.header_len += IPV4_BEET_PHMAXLEN; + if (x->encap) { + struct xfrm_encap_tmpl *encap = x->encap; + + switch (encap->encap_type) { + default: + goto error; + case UDP_ENCAP_ESPINUDP: + x->props.header_len += sizeof(struct udphdr); + break; + case UDP_ENCAP_ESPINUDP_NON_IKE: + x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32); + break; + } + } + + align = ALIGN(crypto_aead_blocksize(aead), 4); + if (esp->padlen) + align = max_t(u32, align, esp->padlen); + x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead); + +error: + return err; +} + +static const struct xfrm_type esp_type = +{ + .description = "ESP4", + .owner = THIS_MODULE, + .proto = IPPROTO_ESP, + .flags = XFRM_TYPE_REPLAY_PROT, + .init_state = esp_init_state, + .destructor = esp_destroy, + .get_mtu = esp4_get_mtu, + .input = esp_input, + .output = esp_output +}; + +static const struct net_protocol esp4_protocol = { + .handler = xfrm4_rcv, + .err_handler = esp4_err, + .no_policy = 1, + .netns_ok = 1, +}; + +static int __init esp4_init(void) +{ + if (xfrm_register_type(&esp_type, AF_INET) < 0) { + printk(KERN_INFO "ip esp init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) { + printk(KERN_INFO "ip esp init: can't add protocol\n"); + xfrm_unregister_type(&esp_type, AF_INET); + return -EAGAIN; + } + return 0; +} + +static void __exit esp4_fini(void) +{ + if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0) + printk(KERN_INFO "ip esp close: can't remove protocol\n"); + if (xfrm_unregister_type(&esp_type, AF_INET) < 0) + printk(KERN_INFO "ip esp close: can't remove xfrm type\n"); +} + +module_init(esp4_init); +module_exit(esp4_fini); 
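+/*
+ * esp4_init() registers the xfrm type first and the IPPROTO_ESP protocol
+ * handler second, unwinding the type registration if the protocol hook
+ * cannot be added; esp4_fini() tears the two down in reverse order.
+ */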
+MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_ESP); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c new file mode 100644 index 00000000..22524716 --- /dev/null +++ b/net/ipv4/fib_frontend.c @@ -0,0 +1,1136 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IPv4 Forwarding Information Base: FIB frontend. + * + * Authors: Alexey Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef CONFIG_IP_MULTIPLE_TABLES + +static int __net_init fib4_rules_init(struct net *net) +{ + struct fib_table *local_table, *main_table; + + local_table = fib_trie_table(RT_TABLE_LOCAL); + if (local_table == NULL) + return -ENOMEM; + + main_table = fib_trie_table(RT_TABLE_MAIN); + if (main_table == NULL) + goto fail; + + hlist_add_head_rcu(&local_table->tb_hlist, + &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]); + hlist_add_head_rcu(&main_table->tb_hlist, + &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]); + return 0; + +fail: + kfree(local_table); + return -ENOMEM; +} +#else + +struct fib_table *fib_new_table(struct net *net, u32 id) +{ + struct fib_table *tb; + unsigned int h; + + if (id == 0) + id = RT_TABLE_MAIN; + tb = fib_get_table(net, id); + if (tb) + return tb; + + tb = fib_trie_table(id); + if (!tb) + return NULL; + h = id & (FIB_TABLE_HASHSZ - 1); + hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); + return tb; +} + +struct fib_table *fib_get_table(struct net *net, u32 id) +{ + struct fib_table *tb; + struct hlist_node *node; + struct hlist_head *head; + unsigned int h; + + if (id == 0) + id = RT_TABLE_MAIN; + h = id & (FIB_TABLE_HASHSZ - 1); + + rcu_read_lock(); + head = &net->ipv4.fib_table_hash[h]; + hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { + if (tb->tb_id == id) { + rcu_read_unlock(); + return tb; + } + } + rcu_read_unlock(); + return NULL; +} +#endif /* CONFIG_IP_MULTIPLE_TABLES */ + +static void fib_flush(struct net *net) +{ + int flushed = 0; + struct fib_table *tb; + struct hlist_node *node; + struct hlist_head *head; + unsigned int h; + + for (h = 0; h < FIB_TABLE_HASHSZ; h++) { + head = &net->ipv4.fib_table_hash[h]; + hlist_for_each_entry(tb, node, head, tb_hlist) + flushed += fib_table_flush(tb); + } + + if (flushed) + rt_cache_flush(net, -1); +} + +/* + * Find address type as if only "dev" was present in the system. If + * on_dev is NULL then all interfaces are taken into consideration. 
+ */ +static inline unsigned __inet_dev_addr_type(struct net *net, + const struct net_device *dev, + __be32 addr) +{ + struct flowi4 fl4 = { .daddr = addr }; + struct fib_result res; + unsigned ret = RTN_BROADCAST; + struct fib_table *local_table; + + if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) + return RTN_BROADCAST; + if (ipv4_is_multicast(addr)) + return RTN_MULTICAST; + +#ifdef CONFIG_IP_MULTIPLE_TABLES + res.r = NULL; +#endif + + local_table = fib_get_table(net, RT_TABLE_LOCAL); + if (local_table) { + ret = RTN_UNICAST; + rcu_read_lock(); + if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) { + if (!dev || dev == res.fi->fib_dev) + ret = res.type; + } + rcu_read_unlock(); + } + return ret; +} + +unsigned int inet_addr_type(struct net *net, __be32 addr) +{ + return __inet_dev_addr_type(net, NULL, addr); +} +EXPORT_SYMBOL(inet_addr_type); + +unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, + __be32 addr) +{ + return __inet_dev_addr_type(net, dev, addr); +} +EXPORT_SYMBOL(inet_dev_addr_type); + +/* Given (packet source, input interface) and optional (dst, oif, tos): + * - (main) check, that source is valid i.e. not broadcast or our local + * address. + * - figure out what "logical" interface this packet arrived + * and calculate "specific destination" address. + * - check, that packet arrived from expected physical interface. + * called with rcu_read_lock() + */ +int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, + int oif, struct net_device *dev, __be32 *spec_dst, + u32 *itag) +{ + struct in_device *in_dev; + struct flowi4 fl4; + struct fib_result res; + int no_addr, rpf, accept_local; + bool dev_match; + int ret; + struct net *net; + + fl4.flowi4_oif = 0; + fl4.flowi4_iif = oif; + fl4.daddr = src; + fl4.saddr = dst; + fl4.flowi4_tos = tos; + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + + no_addr = rpf = accept_local = 0; + in_dev = __in_dev_get_rcu(dev); + if (in_dev) { + no_addr = in_dev->ifa_list == NULL; + + /* Ignore rp_filter for packets protected by IPsec. */ + rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev); + + accept_local = IN_DEV_ACCEPT_LOCAL(in_dev); + fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? 
skb->mark : 0; + } + + if (in_dev == NULL) + goto e_inval; + + net = dev_net(dev); + if (fib_lookup(net, &fl4, &res)) + goto last_resort; + if (res.type != RTN_UNICAST) { + if (res.type != RTN_LOCAL || !accept_local) + goto e_inval; + } + *spec_dst = FIB_RES_PREFSRC(net, res); + fib_combine_itag(itag, &res); + dev_match = false; + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + for (ret = 0; ret < res.fi->fib_nhs; ret++) { + struct fib_nh *nh = &res.fi->fib_nh[ret]; + + if (nh->nh_dev == dev) { + dev_match = true; + break; + } + } +#else + if (FIB_RES_DEV(res) == dev) + dev_match = true; +#endif + if (dev_match) { + ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; + return ret; + } + if (no_addr) + goto last_resort; + if (rpf == 1) + goto e_rpf; + fl4.flowi4_oif = dev->ifindex; + + ret = 0; + if (fib_lookup(net, &fl4, &res) == 0) { + if (res.type == RTN_UNICAST) { + *spec_dst = FIB_RES_PREFSRC(net, res); + ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; + } + } + return ret; + +last_resort: + if (rpf) + goto e_rpf; + *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + *itag = 0; + return 0; + +e_inval: + return -EINVAL; +e_rpf: + return -EXDEV; +} + +static inline __be32 sk_extract_addr(struct sockaddr *addr) +{ + return ((struct sockaddr_in *) addr)->sin_addr.s_addr; +} + +static int put_rtax(struct nlattr *mx, int len, int type, u32 value) +{ + struct nlattr *nla; + + nla = (struct nlattr *) ((char *) mx + len); + nla->nla_type = type; + nla->nla_len = nla_attr_size(4); + *(u32 *) nla_data(nla) = value; + + return len + nla_total_size(4); +} + +static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, + struct fib_config *cfg) +{ + __be32 addr; + int plen; + + memset(cfg, 0, sizeof(*cfg)); + cfg->fc_nlinfo.nl_net = net; + + if (rt->rt_dst.sa_family != AF_INET) + return -EAFNOSUPPORT; + + /* + * Check mask for validity: + * a) it must be contiguous. + * b) destination must have all host bits clear. + * c) if application forgot to set correct family (AF_INET), + * reject request unless it is absolutely clear i.e. + * both family and mask are zero. 
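+ * A host route (RTF_HOST) skips these checks and is treated as a /32;
+ * otherwise bad_mask() enforces (a) and (b) and the prefix length is
+ * derived from the mask via inet_mask_len().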
+ */ + plen = 32; + addr = sk_extract_addr(&rt->rt_dst); + if (!(rt->rt_flags & RTF_HOST)) { + __be32 mask = sk_extract_addr(&rt->rt_genmask); + + if (rt->rt_genmask.sa_family != AF_INET) { + if (mask || rt->rt_genmask.sa_family) + return -EAFNOSUPPORT; + } + + if (bad_mask(mask, addr)) + return -EINVAL; + + plen = inet_mask_len(mask); + } + + cfg->fc_dst_len = plen; + cfg->fc_dst = addr; + + if (cmd != SIOCDELRT) { + cfg->fc_nlflags = NLM_F_CREATE; + cfg->fc_protocol = RTPROT_BOOT; + } + + if (rt->rt_metric) + cfg->fc_priority = rt->rt_metric - 1; + + if (rt->rt_flags & RTF_REJECT) { + cfg->fc_scope = RT_SCOPE_HOST; + cfg->fc_type = RTN_UNREACHABLE; + return 0; + } + + cfg->fc_scope = RT_SCOPE_NOWHERE; + cfg->fc_type = RTN_UNICAST; + + if (rt->rt_dev) { + char *colon; + struct net_device *dev; + char devname[IFNAMSIZ]; + + if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1)) + return -EFAULT; + + devname[IFNAMSIZ-1] = 0; + colon = strchr(devname, ':'); + if (colon) + *colon = 0; + dev = __dev_get_by_name(net, devname); + if (!dev) + return -ENODEV; + cfg->fc_oif = dev->ifindex; + if (colon) { + struct in_ifaddr *ifa; + struct in_device *in_dev = __in_dev_get_rtnl(dev); + if (!in_dev) + return -ENODEV; + *colon = ':'; + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) + if (strcmp(ifa->ifa_label, devname) == 0) + break; + if (ifa == NULL) + return -ENODEV; + cfg->fc_prefsrc = ifa->ifa_local; + } + } + + addr = sk_extract_addr(&rt->rt_gateway); + if (rt->rt_gateway.sa_family == AF_INET && addr) { + cfg->fc_gw = addr; + if (rt->rt_flags & RTF_GATEWAY && + inet_addr_type(net, addr) == RTN_UNICAST) + cfg->fc_scope = RT_SCOPE_UNIVERSE; + } + + if (cmd == SIOCDELRT) + return 0; + + if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw) + return -EINVAL; + + if (cfg->fc_scope == RT_SCOPE_NOWHERE) + cfg->fc_scope = RT_SCOPE_LINK; + + if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) { + struct nlattr *mx; + int len = 0; + + mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL); + if (mx == NULL) + return -ENOMEM; + + if (rt->rt_flags & RTF_MTU) + len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40); + + if (rt->rt_flags & RTF_WINDOW) + len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window); + + if (rt->rt_flags & RTF_IRTT) + len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3); + + cfg->fc_mx = mx; + cfg->fc_mx_len = len; + } + + return 0; +} + +/* + * Handle IP routing ioctl calls. 
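+ * Only SIOCADDRT and SIOCDELRT are handled, and both require CAP_NET_ADMIN.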
+ * These are used to manipulate the routing tables + */ +int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) +{ + struct fib_config cfg; + struct rtentry rt; + int err; + + switch (cmd) { + case SIOCADDRT: /* Add a route */ + case SIOCDELRT: /* Delete a route */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (copy_from_user(&rt, arg, sizeof(rt))) + return -EFAULT; + + rtnl_lock(); + err = rtentry_to_fib_config(net, cmd, &rt, &cfg); + if (err == 0) { + struct fib_table *tb; + + if (cmd == SIOCDELRT) { + tb = fib_get_table(net, cfg.fc_table); + if (tb) + err = fib_table_delete(tb, &cfg); + else + err = -ESRCH; + } else { + tb = fib_new_table(net, cfg.fc_table); + if (tb) + err = fib_table_insert(tb, &cfg); + else + err = -ENOBUFS; + } + + /* allocated by rtentry_to_fib_config() */ + kfree(cfg.fc_mx); + } + rtnl_unlock(); + return err; + } + return -EINVAL; +} + +const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { + [RTA_DST] = { .type = NLA_U32 }, + [RTA_SRC] = { .type = NLA_U32 }, + [RTA_IIF] = { .type = NLA_U32 }, + [RTA_OIF] = { .type = NLA_U32 }, + [RTA_GATEWAY] = { .type = NLA_U32 }, + [RTA_PRIORITY] = { .type = NLA_U32 }, + [RTA_PREFSRC] = { .type = NLA_U32 }, + [RTA_METRICS] = { .type = NLA_NESTED }, + [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, + [RTA_FLOW] = { .type = NLA_U32 }, +}; + +static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, + struct nlmsghdr *nlh, struct fib_config *cfg) +{ + struct nlattr *attr; + int err, remaining; + struct rtmsg *rtm; + + err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy); + if (err < 0) + goto errout; + + memset(cfg, 0, sizeof(*cfg)); + + rtm = nlmsg_data(nlh); + cfg->fc_dst_len = rtm->rtm_dst_len; + cfg->fc_tos = rtm->rtm_tos; + cfg->fc_table = rtm->rtm_table; + cfg->fc_protocol = rtm->rtm_protocol; + cfg->fc_scope = rtm->rtm_scope; + cfg->fc_type = rtm->rtm_type; + cfg->fc_flags = rtm->rtm_flags; + cfg->fc_nlflags = nlh->nlmsg_flags; + + cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid; + cfg->fc_nlinfo.nlh = nlh; + cfg->fc_nlinfo.nl_net = net; + + if (cfg->fc_type > RTN_MAX) { + err = -EINVAL; + goto errout; + } + + nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) { + switch (nla_type(attr)) { + case RTA_DST: + cfg->fc_dst = nla_get_be32(attr); + break; + case RTA_OIF: + cfg->fc_oif = nla_get_u32(attr); + break; + case RTA_GATEWAY: + cfg->fc_gw = nla_get_be32(attr); + break; + case RTA_PRIORITY: + cfg->fc_priority = nla_get_u32(attr); + break; + case RTA_PREFSRC: + cfg->fc_prefsrc = nla_get_be32(attr); + break; + case RTA_METRICS: + cfg->fc_mx = nla_data(attr); + cfg->fc_mx_len = nla_len(attr); + break; + case RTA_MULTIPATH: + cfg->fc_mp = nla_data(attr); + cfg->fc_mp_len = nla_len(attr); + break; + case RTA_FLOW: + cfg->fc_flow = nla_get_u32(attr); + break; + case RTA_TABLE: + cfg->fc_table = nla_get_u32(attr); + break; + } + } + + return 0; +errout: + return err; +} + +static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct fib_config cfg; + struct fib_table *tb; + int err; + + err = rtm_to_fib_config(net, skb, nlh, &cfg); + if (err < 0) + goto errout; + + tb = fib_get_table(net, cfg.fc_table); + if (tb == NULL) { + err = -ESRCH; + goto errout; + } + + err = fib_table_delete(tb, &cfg); +errout: + return err; +} + +static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct fib_config cfg; + struct fib_table *tb; + 
int err; + + err = rtm_to_fib_config(net, skb, nlh, &cfg); + if (err < 0) + goto errout; + + tb = fib_new_table(net, cfg.fc_table); + if (tb == NULL) { + err = -ENOBUFS; + goto errout; + } + + err = fib_table_insert(tb, &cfg); +errout: + return err; +} + +static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + unsigned int h, s_h; + unsigned int e = 0, s_e; + struct fib_table *tb; + struct hlist_node *node; + struct hlist_head *head; + int dumped = 0; + + if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) && + ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED) + return ip_rt_dump(skb, cb); + + s_h = cb->args[0]; + s_e = cb->args[1]; + + for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) { + e = 0; + head = &net->ipv4.fib_table_hash[h]; + hlist_for_each_entry(tb, node, head, tb_hlist) { + if (e < s_e) + goto next; + if (dumped) + memset(&cb->args[2], 0, sizeof(cb->args) - + 2 * sizeof(cb->args[0])); + if (fib_table_dump(tb, skb, cb) < 0) + goto out; + dumped = 1; +next: + e++; + } + } +out: + cb->args[1] = e; + cb->args[0] = h; + + return skb->len; +} + +/* Prepare and feed intra-kernel routing request. + * Really, it should be netlink message, but :-( netlink + * can be not configured, so that we feed it directly + * to fib engine. It is legal, because all events occur + * only when netlink is already locked. + */ +static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) +{ + struct net *net = dev_net(ifa->ifa_dev->dev); + struct fib_table *tb; + struct fib_config cfg = { + .fc_protocol = RTPROT_KERNEL, + .fc_type = type, + .fc_dst = dst, + .fc_dst_len = dst_len, + .fc_prefsrc = ifa->ifa_local, + .fc_oif = ifa->ifa_dev->dev->ifindex, + .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND, + .fc_nlinfo = { + .nl_net = net, + }, + }; + + if (type == RTN_UNICAST) + tb = fib_new_table(net, RT_TABLE_MAIN); + else + tb = fib_new_table(net, RT_TABLE_LOCAL); + + if (tb == NULL) + return; + + cfg.fc_table = tb->tb_id; + + if (type != RTN_LOCAL) + cfg.fc_scope = RT_SCOPE_LINK; + else + cfg.fc_scope = RT_SCOPE_HOST; + + if (cmd == RTM_NEWROUTE) + fib_table_insert(tb, &cfg); + else + fib_table_delete(tb, &cfg); +} + +void fib_add_ifaddr(struct in_ifaddr *ifa) +{ + struct in_device *in_dev = ifa->ifa_dev; + struct net_device *dev = in_dev->dev; + struct in_ifaddr *prim = ifa; + __be32 mask = ifa->ifa_mask; + __be32 addr = ifa->ifa_local; + __be32 prefix = ifa->ifa_address & mask; + + if (ifa->ifa_flags & IFA_F_SECONDARY) { + prim = inet_ifa_byprefix(in_dev, prefix, mask); + if (prim == NULL) { + printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n"); + return; + } + } + + fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim); + + if (!(dev->flags & IFF_UP)) + return; + + /* Add broadcast address, if it is explicitly assigned. */ + if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) + fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); + + if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) && + (prefix != addr || ifa->ifa_prefixlen < 32)) { + fib_magic(RTM_NEWROUTE, + dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, + prefix, ifa->ifa_prefixlen, prim); + + /* Add network specific broadcasts, when it takes a sense */ + if (ifa->ifa_prefixlen < 31) { + fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim); + fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask, + 32, prim); + } + } +} + +/* Delete primary or secondary address. 
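+ * Deleting a primary address also withdraws the covering prefix route;
+ * a secondary address only ever owns its host and broadcast routes.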
+ * Optionally, on secondary address promotion consider the addresses + * from subnet iprim as deleted, even if they are in device list. + * In this case the secondary ifa can be in device list. + */ +void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) +{ + struct in_device *in_dev = ifa->ifa_dev; + struct net_device *dev = in_dev->dev; + struct in_ifaddr *ifa1; + struct in_ifaddr *prim = ifa, *prim1 = NULL; + __be32 brd = ifa->ifa_address | ~ifa->ifa_mask; + __be32 any = ifa->ifa_address & ifa->ifa_mask; +#define LOCAL_OK 1 +#define BRD_OK 2 +#define BRD0_OK 4 +#define BRD1_OK 8 + unsigned ok = 0; + int subnet = 0; /* Primary network */ + int gone = 1; /* Address is missing */ + int same_prefsrc = 0; /* Another primary with same IP */ + + if (ifa->ifa_flags & IFA_F_SECONDARY) { + prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); + if (prim == NULL) { + printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n"); + return; + } + if (iprim && iprim != prim) { + printk(KERN_WARNING "fib_del_ifaddr: bug: iprim != prim\n"); + return; + } + } else if (!ipv4_is_zeronet(any) && + (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) { + fib_magic(RTM_DELROUTE, + dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, + any, ifa->ifa_prefixlen, prim); + subnet = 1; + } + + /* Deletion is more complicated than add. + * We should take care of not to delete too much :-) + * + * Scan address list to be sure that addresses are really gone. + */ + + for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { + if (ifa1 == ifa) { + /* promotion, keep the IP */ + gone = 0; + continue; + } + /* Ignore IFAs from our subnet */ + if (iprim && ifa1->ifa_mask == iprim->ifa_mask && + inet_ifa_match(ifa1->ifa_address, iprim)) + continue; + + /* Ignore ifa1 if it uses different primary IP (prefsrc) */ + if (ifa1->ifa_flags & IFA_F_SECONDARY) { + /* Another address from our subnet? */ + if (ifa1->ifa_mask == prim->ifa_mask && + inet_ifa_match(ifa1->ifa_address, prim)) + prim1 = prim; + else { + /* We reached the secondaries, so + * same_prefsrc should be determined. 
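+ * (same_prefsrc gets set while walking the primaries, when some
+ * other primary carries the same ifa_local, i.e. the prefsrc is
+ * still in use).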
+ */ + if (!same_prefsrc) + continue; + /* Search new prim1 if ifa1 is not + * using the current prim1 + */ + if (!prim1 || + ifa1->ifa_mask != prim1->ifa_mask || + !inet_ifa_match(ifa1->ifa_address, prim1)) + prim1 = inet_ifa_byprefix(in_dev, + ifa1->ifa_address, + ifa1->ifa_mask); + if (!prim1) + continue; + if (prim1->ifa_local != prim->ifa_local) + continue; + } + } else { + if (prim->ifa_local != ifa1->ifa_local) + continue; + prim1 = ifa1; + if (prim != prim1) + same_prefsrc = 1; + } + if (ifa->ifa_local == ifa1->ifa_local) + ok |= LOCAL_OK; + if (ifa->ifa_broadcast == ifa1->ifa_broadcast) + ok |= BRD_OK; + if (brd == ifa1->ifa_broadcast) + ok |= BRD1_OK; + if (any == ifa1->ifa_broadcast) + ok |= BRD0_OK; + /* primary has network specific broadcasts */ + if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) { + __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask; + __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask; + + if (!ipv4_is_zeronet(any1)) { + if (ifa->ifa_broadcast == brd1 || + ifa->ifa_broadcast == any1) + ok |= BRD_OK; + if (brd == brd1 || brd == any1) + ok |= BRD1_OK; + if (any == brd1 || any == any1) + ok |= BRD0_OK; + } + } + } + + if (!(ok & BRD_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); + if (subnet && ifa->ifa_prefixlen < 31) { + if (!(ok & BRD1_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); + if (!(ok & BRD0_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); + } + if (!(ok & LOCAL_OK)) { + fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); + + /* Check, that this local address finally disappeared. */ + if (gone && + inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { + /* And the last, but not the least thing. + * We must flush stray FIB entries. + * + * First of all, we scan fib_info list searching + * for stray nexthop entries, then ignite fib_flush. 
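+ * (fib_sync_down_addr() marks every fib_info whose preferred source
+ * is this local address as RTNH_F_DEAD; fib_flush() then drops the
+ * entries so marked.)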
+ */ + if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local)) + fib_flush(dev_net(dev)); + } + } +#undef LOCAL_OK +#undef BRD_OK +#undef BRD0_OK +#undef BRD1_OK +} + +static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) +{ + + struct fib_result res; + struct flowi4 fl4 = { + .flowi4_mark = frn->fl_mark, + .daddr = frn->fl_addr, + .flowi4_tos = frn->fl_tos, + .flowi4_scope = frn->fl_scope, + }; + +#ifdef CONFIG_IP_MULTIPLE_TABLES + res.r = NULL; +#endif + + frn->err = -ENOENT; + if (tb) { + local_bh_disable(); + + frn->tb_id = tb->tb_id; + rcu_read_lock(); + frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); + + if (!frn->err) { + frn->prefixlen = res.prefixlen; + frn->nh_sel = res.nh_sel; + frn->type = res.type; + frn->scope = res.scope; + } + rcu_read_unlock(); + local_bh_enable(); + } +} + +static void nl_fib_input(struct sk_buff *skb) +{ + struct net *net; + struct fib_result_nl *frn; + struct nlmsghdr *nlh; + struct fib_table *tb; + u32 pid; + + net = sock_net(skb->sk); + nlh = nlmsg_hdr(skb); + if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len || + nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn))) + return; + + skb = skb_clone(skb, GFP_KERNEL); + if (skb == NULL) + return; + nlh = nlmsg_hdr(skb); + + frn = (struct fib_result_nl *) NLMSG_DATA(nlh); + tb = fib_get_table(net, frn->tb_id_in); + + nl_fib_lookup(frn, tb); + + pid = NETLINK_CB(skb).pid; /* pid of sending process */ + NETLINK_CB(skb).pid = 0; /* from kernel */ + NETLINK_CB(skb).dst_group = 0; /* unicast */ + netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT); +} + +static int __net_init nl_fib_lookup_init(struct net *net) +{ + struct sock *sk; + sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0, + nl_fib_input, NULL, THIS_MODULE); + if (sk == NULL) + return -EAFNOSUPPORT; + net->ipv4.fibnl = sk; + return 0; +} + +static void nl_fib_lookup_exit(struct net *net) +{ + netlink_kernel_release(net->ipv4.fibnl); + net->ipv4.fibnl = NULL; +} + +static void fib_disable_ip(struct net_device *dev, int force, int delay) +{ + if (fib_sync_down_dev(dev, force)) + fib_flush(dev_net(dev)); + rt_cache_flush(dev_net(dev), delay); + arp_ifdown(dev); +} + +static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; + struct net_device *dev = ifa->ifa_dev->dev; + struct net *net = dev_net(dev); + + switch (event) { + case NETDEV_UP: + fib_add_ifaddr(ifa); +#ifdef CONFIG_IP_ROUTE_MULTIPATH + fib_sync_up(dev); +#endif + atomic_inc(&net->ipv4.dev_addr_genid); + rt_cache_flush(dev_net(dev), -1); + break; + case NETDEV_DOWN: + fib_del_ifaddr(ifa, NULL); + atomic_inc(&net->ipv4.dev_addr_genid); + if (ifa->ifa_dev->ifa_list == NULL) { + /* Last address was deleted from this interface. + * Disable IP. 
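+ * (fib_disable_ip() marks the nexthops using this device as dead,
+ * flushes the FIB and the routing cache, and calls arp_ifdown().)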
+ */ + fib_disable_ip(dev, 1, 0); + } else { + rt_cache_flush(dev_net(dev), -1); + } + break; + } + return NOTIFY_DONE; +} + +static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct in_device *in_dev = __in_dev_get_rtnl(dev); + struct net *net = dev_net(dev); + + if (event == NETDEV_UNREGISTER) { + fib_disable_ip(dev, 2, -1); + return NOTIFY_DONE; + } + + if (!in_dev) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_UP: + for_ifa(in_dev) { + fib_add_ifaddr(ifa); + } endfor_ifa(in_dev); +#ifdef CONFIG_IP_ROUTE_MULTIPATH + fib_sync_up(dev); +#endif + atomic_inc(&net->ipv4.dev_addr_genid); + rt_cache_flush(dev_net(dev), -1); + break; + case NETDEV_DOWN: + fib_disable_ip(dev, 0, 0); + break; + case NETDEV_CHANGEMTU: + case NETDEV_CHANGE: + rt_cache_flush(dev_net(dev), 0); + break; + case NETDEV_UNREGISTER_BATCH: + /* The batch unregister is only called on the first + * device in the list of devices being unregistered. + * Therefore we should not pass dev_net(dev) in here. + */ + rt_cache_flush_batch(NULL); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block fib_inetaddr_notifier = { + .notifier_call = fib_inetaddr_event, +}; + +static struct notifier_block fib_netdev_notifier = { + .notifier_call = fib_netdev_event, +}; + +static int __net_init ip_fib_net_init(struct net *net) +{ + int err; + size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ; + + /* Avoid false sharing : Use at least a full cache line */ + size = max_t(size_t, size, L1_CACHE_BYTES); + + net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL); + if (net->ipv4.fib_table_hash == NULL) + return -ENOMEM; + + err = fib4_rules_init(net); + if (err < 0) + goto fail; + return 0; + +fail: + kfree(net->ipv4.fib_table_hash); + return err; +} + +static void ip_fib_net_exit(struct net *net) +{ + unsigned int i; + +#ifdef CONFIG_IP_MULTIPLE_TABLES + fib4_rules_exit(net); +#endif + + rtnl_lock(); + for (i = 0; i < FIB_TABLE_HASHSZ; i++) { + struct fib_table *tb; + struct hlist_head *head; + struct hlist_node *node, *tmp; + + head = &net->ipv4.fib_table_hash[i]; + hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) { + hlist_del(node); + fib_table_flush(tb); + fib_free_table(tb); + } + } + rtnl_unlock(); + kfree(net->ipv4.fib_table_hash); +} + +static int __net_init fib_net_init(struct net *net) +{ + int error; + + error = ip_fib_net_init(net); + if (error < 0) + goto out; + error = nl_fib_lookup_init(net); + if (error < 0) + goto out_nlfl; + error = fib_proc_init(net); + if (error < 0) + goto out_proc; +out: + return error; + +out_proc: + nl_fib_lookup_exit(net); +out_nlfl: + ip_fib_net_exit(net); + goto out; +} + +static void __net_exit fib_net_exit(struct net *net) +{ + fib_proc_exit(net); + nl_fib_lookup_exit(net); + ip_fib_net_exit(net); +} + +static struct pernet_operations fib_net_ops = { + .init = fib_net_init, + .exit = fib_net_exit, +}; + +void __init ip_fib_init(void) +{ + rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL); + rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL); + rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib); + + register_pernet_subsys(&fib_net_ops); + register_netdevice_notifier(&fib_netdev_notifier); + register_inetaddr_notifier(&fib_inetaddr_notifier); + + fib_trie_init(); +} diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h new file mode 100644 index 00000000..af0f14ab --- /dev/null +++ b/net/ipv4/fib_lookup.h @@ -0,0 +1,57 @@ +#ifndef _FIB_LOOKUP_H +#define 
_FIB_LOOKUP_H + +#include +#include +#include + +struct fib_alias { + struct list_head fa_list; + struct fib_info *fa_info; + u8 fa_tos; + u8 fa_type; + u8 fa_state; + struct rcu_head rcu; +}; + +#define FA_S_ACCESSED 0x01 + +/* Dont write on fa_state unless needed, to keep it shared on all cpus */ +static inline void fib_alias_accessed(struct fib_alias *fa) +{ + if (!(fa->fa_state & FA_S_ACCESSED)) + fa->fa_state |= FA_S_ACCESSED; +} + +/* Exported by fib_semantics.c */ +extern void fib_release_info(struct fib_info *); +extern struct fib_info *fib_create_info(struct fib_config *cfg); +extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); +extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, + u32 tb_id, u8 type, __be32 dst, + int dst_len, u8 tos, struct fib_info *fi, + unsigned int); +extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, + int dst_len, u32 tb_id, struct nl_info *info, + unsigned int nlm_flags); +extern struct fib_alias *fib_find_alias(struct list_head *fah, + u8 tos, u32 prio); +extern int fib_detect_death(struct fib_info *fi, int order, + struct fib_info **last_resort, + int *last_idx, int dflt); + +static inline void fib_result_assign(struct fib_result *res, + struct fib_info *fi) +{ + /* we used to play games with refcounts, but we now use RCU */ + res->fi = fi; +} + +struct fib_prop { + int error; + u8 scope; +}; + +extern const struct fib_prop fib_props[RTN_MAX + 1]; + +#endif /* _FIB_LOOKUP_H */ diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c new file mode 100644 index 00000000..a53bb1b5 --- /dev/null +++ b/net/ipv4/fib_rules.c @@ -0,0 +1,307 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IPv4 Forwarding Information Base: policy rules. + * + * Authors: Alexey Kuznetsov, + * Thomas Graf + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Rani Assaf : local_rule cannot be deleted + * Marc Boucher : routing by fwmark + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct fib4_rule { + struct fib_rule common; + u8 dst_len; + u8 src_len; + u8 tos; + __be32 src; + __be32 srcmask; + __be32 dst; + __be32 dstmask; +#ifdef CONFIG_IP_ROUTE_CLASSID + u32 tclassid; +#endif +}; + +#ifdef CONFIG_IP_ROUTE_CLASSID +u32 fib_rules_tclass(const struct fib_result *res) +{ + return res->r ? 
((struct fib4_rule *) res->r)->tclassid : 0; +} +#endif + +int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) +{ + struct fib_lookup_arg arg = { + .result = res, + .flags = FIB_LOOKUP_NOREF, + }; + int err; + + err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); + res->r = arg.rule; + + return err; +} + +static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, + int flags, struct fib_lookup_arg *arg) +{ + int err = -EAGAIN; + struct fib_table *tbl; + + switch (rule->action) { + case FR_ACT_TO_TBL: + break; + + case FR_ACT_UNREACHABLE: + err = -ENETUNREACH; + goto errout; + + case FR_ACT_PROHIBIT: + err = -EACCES; + goto errout; + + case FR_ACT_BLACKHOLE: + default: + err = -EINVAL; + goto errout; + } + + tbl = fib_get_table(rule->fr_net, rule->table); + if (!tbl) + goto errout; + + err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags); + if (err > 0) + err = -EAGAIN; +errout: + return err; +} + + +static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) +{ + struct fib4_rule *r = (struct fib4_rule *) rule; + struct flowi4 *fl4 = &fl->u.ip4; + __be32 daddr = fl4->daddr; + __be32 saddr = fl4->saddr; + + if (((saddr ^ r->src) & r->srcmask) || + ((daddr ^ r->dst) & r->dstmask)) + return 0; + + if (r->tos && (r->tos != fl4->flowi4_tos)) + return 0; + + return 1; +} + +static struct fib_table *fib_empty_table(struct net *net) +{ + u32 id; + + for (id = 1; id <= RT_TABLE_MAX; id++) + if (fib_get_table(net, id) == NULL) + return fib_new_table(net, id); + return NULL; +} + +static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = { + FRA_GENERIC_POLICY, + [FRA_FLOW] = { .type = NLA_U32 }, +}; + +static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, + struct fib_rule_hdr *frh, + struct nlattr **tb) +{ + struct net *net = sock_net(skb->sk); + int err = -EINVAL; + struct fib4_rule *rule4 = (struct fib4_rule *) rule; + + if (frh->tos & ~IPTOS_TOS_MASK) + goto errout; + + if (rule->table == RT_TABLE_UNSPEC) { + if (rule->action == FR_ACT_TO_TBL) { + struct fib_table *table; + + table = fib_empty_table(net); + if (table == NULL) { + err = -ENOBUFS; + goto errout; + } + + rule->table = table->tb_id; + } + } + + if (frh->src_len) + rule4->src = nla_get_be32(tb[FRA_SRC]); + + if (frh->dst_len) + rule4->dst = nla_get_be32(tb[FRA_DST]); + +#ifdef CONFIG_IP_ROUTE_CLASSID + if (tb[FRA_FLOW]) + rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); +#endif + + rule4->src_len = frh->src_len; + rule4->srcmask = inet_make_mask(rule4->src_len); + rule4->dst_len = frh->dst_len; + rule4->dstmask = inet_make_mask(rule4->dst_len); + rule4->tos = frh->tos; + + err = 0; +errout: + return err; +} + +static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, + struct nlattr **tb) +{ + struct fib4_rule *rule4 = (struct fib4_rule *) rule; + + if (frh->src_len && (rule4->src_len != frh->src_len)) + return 0; + + if (frh->dst_len && (rule4->dst_len != frh->dst_len)) + return 0; + + if (frh->tos && (rule4->tos != frh->tos)) + return 0; + +#ifdef CONFIG_IP_ROUTE_CLASSID + if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW]))) + return 0; +#endif + + if (frh->src_len && (rule4->src != nla_get_be32(tb[FRA_SRC]))) + return 0; + + if (frh->dst_len && (rule4->dst != nla_get_be32(tb[FRA_DST]))) + return 0; + + return 1; +} + +static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb, + struct fib_rule_hdr *frh) +{ + struct fib4_rule *rule4 = (struct fib4_rule *) 
rule; + + frh->dst_len = rule4->dst_len; + frh->src_len = rule4->src_len; + frh->tos = rule4->tos; + + if (rule4->dst_len) + NLA_PUT_BE32(skb, FRA_DST, rule4->dst); + + if (rule4->src_len) + NLA_PUT_BE32(skb, FRA_SRC, rule4->src); + +#ifdef CONFIG_IP_ROUTE_CLASSID + if (rule4->tclassid) + NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid); +#endif + return 0; + +nla_put_failure: + return -ENOBUFS; +} + +static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule) +{ + return nla_total_size(4) /* dst */ + + nla_total_size(4) /* src */ + + nla_total_size(4); /* flow */ +} + +static void fib4_rule_flush_cache(struct fib_rules_ops *ops) +{ + rt_cache_flush(ops->fro_net, -1); +} + +static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = { + .family = AF_INET, + .rule_size = sizeof(struct fib4_rule), + .addr_size = sizeof(u32), + .action = fib4_rule_action, + .match = fib4_rule_match, + .configure = fib4_rule_configure, + .compare = fib4_rule_compare, + .fill = fib4_rule_fill, + .default_pref = fib_default_rule_pref, + .nlmsg_payload = fib4_rule_nlmsg_payload, + .flush_cache = fib4_rule_flush_cache, + .nlgroup = RTNLGRP_IPV4_RULE, + .policy = fib4_rule_policy, + .owner = THIS_MODULE, +}; + +static int fib_default_rules_init(struct fib_rules_ops *ops) +{ + int err; + + err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0); + if (err < 0) + return err; + err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0); + if (err < 0) + return err; + err = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT, 0); + if (err < 0) + return err; + return 0; +} + +int __net_init fib4_rules_init(struct net *net) +{ + int err; + struct fib_rules_ops *ops; + + ops = fib_rules_register(&fib4_rules_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + + err = fib_default_rules_init(ops); + if (err < 0) + goto fail; + net->ipv4.rules_ops = ops; + return 0; + +fail: + /* also cleans all rules already added */ + fib_rules_unregister(ops); + return err; +} + +void __net_exit fib4_rules_exit(struct net *net) +{ + fib_rules_unregister(net->ipv4.rules_ops); +} diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c new file mode 100644 index 00000000..7e454ba8 --- /dev/null +++ b/net/ipv4/fib_semantics.c @@ -0,0 +1,1248 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IPv4 Forwarding Information Base: semantics. + * + * Authors: Alexey Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fib_lookup.h" + +static DEFINE_SPINLOCK(fib_info_lock); +static struct hlist_head *fib_info_hash; +static struct hlist_head *fib_info_laddrhash; +static unsigned int fib_info_hash_size; +static unsigned int fib_info_cnt; + +#define DEVINDEX_HASHBITS 8 +#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS) +static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +static DEFINE_SPINLOCK(fib_multipath_lock); + +#define for_nexthops(fi) { \ + int nhsel; const struct fib_nh *nh; \ + for (nhsel = 0, nh = (fi)->fib_nh; \ + nhsel < (fi)->fib_nhs; \ + nh++, nhsel++) + +#define change_nexthops(fi) { \ + int nhsel; struct fib_nh *nexthop_nh; \ + for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ + nhsel < (fi)->fib_nhs; \ + nexthop_nh++, nhsel++) + +#else /* CONFIG_IP_ROUTE_MULTIPATH */ + +/* Hope, that gcc will optimize it to get rid of dummy loop */ + +#define for_nexthops(fi) { \ + int nhsel; const struct fib_nh *nh = (fi)->fib_nh; \ + for (nhsel = 0; nhsel < 1; nhsel++) + +#define change_nexthops(fi) { \ + int nhsel; \ + struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ + for (nhsel = 0; nhsel < 1; nhsel++) + +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ + +#define endfor_nexthops(fi) } + + +const struct fib_prop fib_props[RTN_MAX + 1] = { + [RTN_UNSPEC] = { + .error = 0, + .scope = RT_SCOPE_NOWHERE, + }, + [RTN_UNICAST] = { + .error = 0, + .scope = RT_SCOPE_UNIVERSE, + }, + [RTN_LOCAL] = { + .error = 0, + .scope = RT_SCOPE_HOST, + }, + [RTN_BROADCAST] = { + .error = 0, + .scope = RT_SCOPE_LINK, + }, + [RTN_ANYCAST] = { + .error = 0, + .scope = RT_SCOPE_LINK, + }, + [RTN_MULTICAST] = { + .error = 0, + .scope = RT_SCOPE_UNIVERSE, + }, + [RTN_BLACKHOLE] = { + .error = -EINVAL, + .scope = RT_SCOPE_UNIVERSE, + }, + [RTN_UNREACHABLE] = { + .error = -EHOSTUNREACH, + .scope = RT_SCOPE_UNIVERSE, + }, + [RTN_PROHIBIT] = { + .error = -EACCES, + .scope = RT_SCOPE_UNIVERSE, + }, + [RTN_THROW] = { + .error = -EAGAIN, + .scope = RT_SCOPE_UNIVERSE, + }, + [RTN_NAT] = { + .error = -EINVAL, + .scope = RT_SCOPE_NOWHERE, + }, + [RTN_XRESOLVE] = { + .error = -EINVAL, + .scope = RT_SCOPE_NOWHERE, + }, +}; + +/* Release a nexthop info record */ +static void free_fib_info_rcu(struct rcu_head *head) +{ + struct fib_info *fi = container_of(head, struct fib_info, rcu); + + change_nexthops(fi) { + if (nexthop_nh->nh_dev) + dev_put(nexthop_nh->nh_dev); + } endfor_nexthops(fi); + + release_net(fi->fib_net); + kfree(fi); +} + +void free_fib_info(struct fib_info *fi) +{ + if (fi->fib_dead == 0) { + pr_warning("Freeing alive fib_info %p\n", fi); + return; + } + fib_info_cnt--; + call_rcu(&fi->rcu, free_fib_info_rcu); +} + +void fib_release_info(struct fib_info *fi) +{ + spin_lock_bh(&fib_info_lock); + if (fi && --fi->fib_treeref == 0) { + hlist_del(&fi->fib_hash); + if (fi->fib_prefsrc) + hlist_del(&fi->fib_lhash); + change_nexthops(fi) { + if (!nexthop_nh->nh_dev) + continue; + hlist_del(&nexthop_nh->nh_hash); + } endfor_nexthops(fi) + fi->fib_dead = 1; + fib_info_put(fi); + } + spin_unlock_bh(&fib_info_lock); +} + +static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) +{ + const struct fib_nh *onh = ofi->fib_nh; + + for_nexthops(fi) 
{ + if (nh->nh_oif != onh->nh_oif || + nh->nh_gw != onh->nh_gw || + nh->nh_scope != onh->nh_scope || +#ifdef CONFIG_IP_ROUTE_MULTIPATH + nh->nh_weight != onh->nh_weight || +#endif +#ifdef CONFIG_IP_ROUTE_CLASSID + nh->nh_tclassid != onh->nh_tclassid || +#endif + ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD)) + return -1; + onh++; + } endfor_nexthops(fi); + return 0; +} + +static inline unsigned int fib_devindex_hashfn(unsigned int val) +{ + unsigned int mask = DEVINDEX_HASHSIZE - 1; + + return (val ^ + (val >> DEVINDEX_HASHBITS) ^ + (val >> (DEVINDEX_HASHBITS * 2))) & mask; +} + +static inline unsigned int fib_info_hashfn(const struct fib_info *fi) +{ + unsigned int mask = (fib_info_hash_size - 1); + unsigned int val = fi->fib_nhs; + + val ^= (fi->fib_protocol << 8) | fi->fib_scope; + val ^= (__force u32)fi->fib_prefsrc; + val ^= fi->fib_priority; + for_nexthops(fi) { + val ^= fib_devindex_hashfn(nh->nh_oif); + } endfor_nexthops(fi) + + return (val ^ (val >> 7) ^ (val >> 12)) & mask; +} + +static struct fib_info *fib_find_info(const struct fib_info *nfi) +{ + struct hlist_head *head; + struct hlist_node *node; + struct fib_info *fi; + unsigned int hash; + + hash = fib_info_hashfn(nfi); + head = &fib_info_hash[hash]; + + hlist_for_each_entry(fi, node, head, fib_hash) { + if (!net_eq(fi->fib_net, nfi->fib_net)) + continue; + if (fi->fib_nhs != nfi->fib_nhs) + continue; + if (nfi->fib_protocol == fi->fib_protocol && + nfi->fib_scope == fi->fib_scope && + nfi->fib_prefsrc == fi->fib_prefsrc && + nfi->fib_priority == fi->fib_priority && + memcmp(nfi->fib_metrics, fi->fib_metrics, + sizeof(u32) * RTAX_MAX) == 0 && + ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && + (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) + return fi; + } + + return NULL; +} + +/* Check, that the gateway is already configured. + * Used only by redirect accept routine. + */ +int ip_fib_check_default(__be32 gw, struct net_device *dev) +{ + struct hlist_head *head; + struct hlist_node *node; + struct fib_nh *nh; + unsigned int hash; + + spin_lock(&fib_info_lock); + + hash = fib_devindex_hashfn(dev->ifindex); + head = &fib_info_devhash[hash]; + hlist_for_each_entry(nh, node, head, nh_hash) { + if (nh->nh_dev == dev && + nh->nh_gw == gw && + !(nh->nh_flags & RTNH_F_DEAD)) { + spin_unlock(&fib_info_lock); + return 0; + } + } + + spin_unlock(&fib_info_lock); + + return -1; +} + +static inline size_t fib_nlmsg_size(struct fib_info *fi) +{ + size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg)) + + nla_total_size(4) /* RTA_TABLE */ + + nla_total_size(4) /* RTA_DST */ + + nla_total_size(4) /* RTA_PRIORITY */ + + nla_total_size(4); /* RTA_PREFSRC */ + + /* space for nested metrics */ + payload += nla_total_size((RTAX_MAX * nla_total_size(4))); + + if (fi->fib_nhs) { + /* Also handles the special case fib_nhs == 1 */ + + /* each nexthop is packed in an attribute */ + size_t nhsize = nla_total_size(sizeof(struct rtnexthop)); + + /* may contain flow and gateway attribute */ + nhsize += 2 * nla_total_size(4); + + /* all nexthops are packed in a nested attribute */ + payload += nla_total_size(fi->fib_nhs * nhsize); + } + + return payload; +} + +void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, + int dst_len, u32 tb_id, struct nl_info *info, + unsigned int nlm_flags) +{ + struct sk_buff *skb; + u32 seq = info->nlh ? 
info->nlh->nlmsg_seq : 0; + int err = -ENOBUFS; + + skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL); + if (skb == NULL) + goto errout; + + err = fib_dump_info(skb, info->pid, seq, event, tb_id, + fa->fa_type, key, dst_len, + fa->fa_tos, fa->fa_info, nlm_flags); + if (err < 0) { + /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE, + info->nlh, GFP_KERNEL); + return; +errout: + if (err < 0) + rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); +} + +/* Return the first fib alias matching TOS with + * priority less than or equal to PRIO. + */ +struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio) +{ + if (fah) { + struct fib_alias *fa; + list_for_each_entry(fa, fah, fa_list) { + if (fa->fa_tos > tos) + continue; + if (fa->fa_info->fib_priority >= prio || + fa->fa_tos < tos) + return fa; + } + } + return NULL; +} + +int fib_detect_death(struct fib_info *fi, int order, + struct fib_info **last_resort, int *last_idx, int dflt) +{ + struct neighbour *n; + int state = NUD_NONE; + + n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev); + if (n) { + state = n->nud_state; + neigh_release(n); + } + if (state == NUD_REACHABLE) + return 0; + if ((state & NUD_VALID) && order != dflt) + return 0; + if ((state & NUD_VALID) || + (*last_idx < 0 && order > dflt)) { + *last_resort = fi; + *last_idx = order; + } + return 1; +} + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining) +{ + int nhs = 0; + + while (rtnh_ok(rtnh, remaining)) { + nhs++; + rtnh = rtnh_next(rtnh, &remaining); + } + + /* leftover implies invalid nexthop configuration, discard it */ + return remaining > 0 ? 0 : nhs; +} + +static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, + int remaining, struct fib_config *cfg) +{ + change_nexthops(fi) { + int attrlen; + + if (!rtnh_ok(rtnh, remaining)) + return -EINVAL; + + nexthop_nh->nh_flags = + (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; + nexthop_nh->nh_oif = rtnh->rtnh_ifindex; + nexthop_nh->nh_weight = rtnh->rtnh_hops + 1; + + attrlen = rtnh_attrlen(rtnh); + if (attrlen > 0) { + struct nlattr *nla, *attrs = rtnh_attrs(rtnh); + + nla = nla_find(attrs, attrlen, RTA_GATEWAY); + nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0; +#ifdef CONFIG_IP_ROUTE_CLASSID + nla = nla_find(attrs, attrlen, RTA_FLOW); + nexthop_nh->nh_tclassid = nla ? 
nla_get_u32(nla) : 0; +#endif + } + + rtnh = rtnh_next(rtnh, &remaining); + } endfor_nexthops(fi); + + return 0; +} + +#endif + +int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) +{ +#ifdef CONFIG_IP_ROUTE_MULTIPATH + struct rtnexthop *rtnh; + int remaining; +#endif + + if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority) + return 1; + + if (cfg->fc_oif || cfg->fc_gw) { + if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) && + (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw)) + return 0; + return 1; + } + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (cfg->fc_mp == NULL) + return 0; + + rtnh = cfg->fc_mp; + remaining = cfg->fc_mp_len; + + for_nexthops(fi) { + int attrlen; + + if (!rtnh_ok(rtnh, remaining)) + return -EINVAL; + + if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif) + return 1; + + attrlen = rtnh_attrlen(rtnh); + if (attrlen < 0) { + struct nlattr *nla, *attrs = rtnh_attrs(rtnh); + + nla = nla_find(attrs, attrlen, RTA_GATEWAY); + if (nla && nla_get_be32(nla) != nh->nh_gw) + return 1; +#ifdef CONFIG_IP_ROUTE_CLASSID + nla = nla_find(attrs, attrlen, RTA_FLOW); + if (nla && nla_get_u32(nla) != nh->nh_tclassid) + return 1; +#endif + } + + rtnh = rtnh_next(rtnh, &remaining); + } endfor_nexthops(fi); +#endif + return 0; +} + + +/* + * Picture + * ------- + * + * Semantics of nexthop is very messy by historical reasons. + * We have to take into account, that: + * a) gateway can be actually local interface address, + * so that gatewayed route is direct. + * b) gateway must be on-link address, possibly + * described not by an ifaddr, but also by a direct route. + * c) If both gateway and interface are specified, they should not + * contradict. + * d) If we use tunnel routes, gateway could be not on-link. + * + * Attempt to reconcile all of these (alas, self-contradictory) conditions + * results in pretty ugly and hairy code with obscure logic. + * + * I chose to generalized it instead, so that the size + * of code does not increase practically, but it becomes + * much more general. + * Every prefix is assigned a "scope" value: "host" is local address, + * "link" is direct route, + * [ ... "site" ... "interior" ... ] + * and "universe" is true gateway route with global meaning. + * + * Every prefix refers to a set of "nexthop"s (gw, oif), + * where gw must have narrower scope. This recursion stops + * when gw has LOCAL scope or if "nexthop" is declared ONLINK, + * which means that gw is forced to be on link. + * + * Code is still hairy, but now it is apparently logically + * consistent and very flexible. F.e. as by-product it allows + * to co-exists in peace independent exterior and interior + * routing processes. + * + * Normally it looks as following. 
+ * + * {universe prefix} -> (gw, oif) [scope link] + * | + * |-> {link prefix} -> (gw, oif) [scope local] + * | + * |-> {local prefix} (terminal node) + */ +static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, + struct fib_nh *nh) +{ + int err; + struct net *net; + struct net_device *dev; + + net = cfg->fc_nlinfo.nl_net; + if (nh->nh_gw) { + struct fib_result res; + + if (nh->nh_flags & RTNH_F_ONLINK) { + + if (cfg->fc_scope >= RT_SCOPE_LINK) + return -EINVAL; + if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST) + return -EINVAL; + dev = __dev_get_by_index(net, nh->nh_oif); + if (!dev) + return -ENODEV; + if (!(dev->flags & IFF_UP)) + return -ENETDOWN; + nh->nh_dev = dev; + dev_hold(dev); + nh->nh_scope = RT_SCOPE_LINK; + return 0; + } + rcu_read_lock(); + { + struct flowi4 fl4 = { + .daddr = nh->nh_gw, + .flowi4_scope = cfg->fc_scope + 1, + .flowi4_oif = nh->nh_oif, + }; + + /* It is not necessary, but requires a bit of thinking */ + if (fl4.flowi4_scope < RT_SCOPE_LINK) + fl4.flowi4_scope = RT_SCOPE_LINK; + err = fib_lookup(net, &fl4, &res); + if (err) { + rcu_read_unlock(); + return err; + } + } + err = -EINVAL; + if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) + goto out; + nh->nh_scope = res.scope; + nh->nh_oif = FIB_RES_OIF(res); + nh->nh_dev = dev = FIB_RES_DEV(res); + if (!dev) + goto out; + dev_hold(dev); + err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN; + } else { + struct in_device *in_dev; + + if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) + return -EINVAL; + + rcu_read_lock(); + err = -ENODEV; + in_dev = inetdev_by_index(net, nh->nh_oif); + if (in_dev == NULL) + goto out; + err = -ENETDOWN; + if (!(in_dev->dev->flags & IFF_UP)) + goto out; + nh->nh_dev = in_dev->dev; + dev_hold(nh->nh_dev); + nh->nh_scope = RT_SCOPE_HOST; + err = 0; + } +out: + rcu_read_unlock(); + return err; +} + +static inline unsigned int fib_laddr_hashfn(__be32 val) +{ + unsigned int mask = (fib_info_hash_size - 1); + + return ((__force u32)val ^ + ((__force u32)val >> 7) ^ + ((__force u32)val >> 14)) & mask; +} + +static struct hlist_head *fib_info_hash_alloc(int bytes) +{ + if (bytes <= PAGE_SIZE) + return kzalloc(bytes, GFP_KERNEL); + else + return (struct hlist_head *) + __get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(bytes)); +} + +static void fib_info_hash_free(struct hlist_head *hash, int bytes) +{ + if (!hash) + return; + + if (bytes <= PAGE_SIZE) + kfree(hash); + else + free_pages((unsigned long) hash, get_order(bytes)); +} + +static void fib_info_hash_move(struct hlist_head *new_info_hash, + struct hlist_head *new_laddrhash, + unsigned int new_size) +{ + struct hlist_head *old_info_hash, *old_laddrhash; + unsigned int old_size = fib_info_hash_size; + unsigned int i, bytes; + + spin_lock_bh(&fib_info_lock); + old_info_hash = fib_info_hash; + old_laddrhash = fib_info_laddrhash; + fib_info_hash_size = new_size; + + for (i = 0; i < old_size; i++) { + struct hlist_head *head = &fib_info_hash[i]; + struct hlist_node *node, *n; + struct fib_info *fi; + + hlist_for_each_entry_safe(fi, node, n, head, fib_hash) { + struct hlist_head *dest; + unsigned int new_hash; + + hlist_del(&fi->fib_hash); + + new_hash = fib_info_hashfn(fi); + dest = &new_info_hash[new_hash]; + hlist_add_head(&fi->fib_hash, dest); + } + } + fib_info_hash = new_info_hash; + + for (i = 0; i < old_size; i++) { + struct hlist_head *lhead = &fib_info_laddrhash[i]; + struct hlist_node *node, *n; + struct fib_info *fi; + + hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) { + struct hlist_head *ldest; 
+ unsigned int new_hash; + + hlist_del(&fi->fib_lhash); + + new_hash = fib_laddr_hashfn(fi->fib_prefsrc); + ldest = &new_laddrhash[new_hash]; + hlist_add_head(&fi->fib_lhash, ldest); + } + } + fib_info_laddrhash = new_laddrhash; + + spin_unlock_bh(&fib_info_lock); + + bytes = old_size * sizeof(struct hlist_head *); + fib_info_hash_free(old_info_hash, bytes); + fib_info_hash_free(old_laddrhash, bytes); +} + +__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) +{ + nh->nh_saddr = inet_select_addr(nh->nh_dev, + nh->nh_gw, + nh->nh_parent->fib_scope); + nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid); + + return nh->nh_saddr; +} + +struct fib_info *fib_create_info(struct fib_config *cfg) +{ + int err; + struct fib_info *fi = NULL; + struct fib_info *ofi; + int nhs = 1; + struct net *net = cfg->fc_nlinfo.nl_net; + + if (cfg->fc_type > RTN_MAX) + goto err_inval; + + /* Fast check to catch the most weird cases */ + if (fib_props[cfg->fc_type].scope > cfg->fc_scope) + goto err_inval; + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (cfg->fc_mp) { + nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len); + if (nhs == 0) + goto err_inval; + } +#endif + + err = -ENOBUFS; + if (fib_info_cnt >= fib_info_hash_size) { + unsigned int new_size = fib_info_hash_size << 1; + struct hlist_head *new_info_hash; + struct hlist_head *new_laddrhash; + unsigned int bytes; + + if (!new_size) + new_size = 1; + bytes = new_size * sizeof(struct hlist_head *); + new_info_hash = fib_info_hash_alloc(bytes); + new_laddrhash = fib_info_hash_alloc(bytes); + if (!new_info_hash || !new_laddrhash) { + fib_info_hash_free(new_info_hash, bytes); + fib_info_hash_free(new_laddrhash, bytes); + } else + fib_info_hash_move(new_info_hash, new_laddrhash, new_size); + + if (!fib_info_hash_size) + goto failure; + } + + fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); + if (fi == NULL) + goto failure; + if (cfg->fc_mx) { + fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); + if (!fi->fib_metrics) + goto failure; + } else + fi->fib_metrics = (u32 *) dst_default_metrics; + fib_info_cnt++; + + fi->fib_net = hold_net(net); + fi->fib_protocol = cfg->fc_protocol; + fi->fib_scope = cfg->fc_scope; + fi->fib_flags = cfg->fc_flags; + fi->fib_priority = cfg->fc_priority; + fi->fib_prefsrc = cfg->fc_prefsrc; + + fi->fib_nhs = nhs; + change_nexthops(fi) { + nexthop_nh->nh_parent = fi; + } endfor_nexthops(fi) + + if (cfg->fc_mx) { + struct nlattr *nla; + int remaining; + + nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { + int type = nla_type(nla); + + if (type) { + if (type > RTAX_MAX) + goto err_inval; + fi->fib_metrics[type - 1] = nla_get_u32(nla); + } + } + } + + if (cfg->fc_mp) { +#ifdef CONFIG_IP_ROUTE_MULTIPATH + err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg); + if (err != 0) + goto failure; + if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif) + goto err_inval; + if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) + goto err_inval; +#ifdef CONFIG_IP_ROUTE_CLASSID + if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) + goto err_inval; +#endif +#else + goto err_inval; +#endif + } else { + struct fib_nh *nh = fi->fib_nh; + + nh->nh_oif = cfg->fc_oif; + nh->nh_gw = cfg->fc_gw; + nh->nh_flags = cfg->fc_flags; +#ifdef CONFIG_IP_ROUTE_CLASSID + nh->nh_tclassid = cfg->fc_flow; +#endif +#ifdef CONFIG_IP_ROUTE_MULTIPATH + nh->nh_weight = 1; +#endif + } + + if (fib_props[cfg->fc_type].error) { + if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) + goto err_inval; + goto link_it; + } 
else { + switch (cfg->fc_type) { + case RTN_UNICAST: + case RTN_LOCAL: + case RTN_BROADCAST: + case RTN_ANYCAST: + case RTN_MULTICAST: + break; + default: + goto err_inval; + } + } + + if (cfg->fc_scope > RT_SCOPE_HOST) + goto err_inval; + + if (cfg->fc_scope == RT_SCOPE_HOST) { + struct fib_nh *nh = fi->fib_nh; + + /* Local address is added. */ + if (nhs != 1 || nh->nh_gw) + goto err_inval; + nh->nh_scope = RT_SCOPE_NOWHERE; + nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif); + err = -ENODEV; + if (nh->nh_dev == NULL) + goto failure; + } else { + change_nexthops(fi) { + err = fib_check_nh(cfg, fi, nexthop_nh); + if (err != 0) + goto failure; + } endfor_nexthops(fi) + } + + if (fi->fib_prefsrc) { + if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || + fi->fib_prefsrc != cfg->fc_dst) + if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL) + goto err_inval; + } + + change_nexthops(fi) { + fib_info_update_nh_saddr(net, nexthop_nh); + } endfor_nexthops(fi) + +link_it: + ofi = fib_find_info(fi); + if (ofi) { + fi->fib_dead = 1; + free_fib_info(fi); + ofi->fib_treeref++; + return ofi; + } + + fi->fib_treeref++; + atomic_inc(&fi->fib_clntref); + spin_lock_bh(&fib_info_lock); + hlist_add_head(&fi->fib_hash, + &fib_info_hash[fib_info_hashfn(fi)]); + if (fi->fib_prefsrc) { + struct hlist_head *head; + + head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)]; + hlist_add_head(&fi->fib_lhash, head); + } + change_nexthops(fi) { + struct hlist_head *head; + unsigned int hash; + + if (!nexthop_nh->nh_dev) + continue; + hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex); + head = &fib_info_devhash[hash]; + hlist_add_head(&nexthop_nh->nh_hash, head); + } endfor_nexthops(fi) + spin_unlock_bh(&fib_info_lock); + return fi; + +err_inval: + err = -EINVAL; + +failure: + if (fi) { + fi->fib_dead = 1; + free_fib_info(fi); + } + + return ERR_PTR(err); +} + +int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, + u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, + struct fib_info *fi, unsigned int flags) +{ + struct nlmsghdr *nlh; + struct rtmsg *rtm; + + nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags); + if (nlh == NULL) + return -EMSGSIZE; + + rtm = nlmsg_data(nlh); + rtm->rtm_family = AF_INET; + rtm->rtm_dst_len = dst_len; + rtm->rtm_src_len = 0; + rtm->rtm_tos = tos; + if (tb_id < 256) + rtm->rtm_table = tb_id; + else + rtm->rtm_table = RT_TABLE_COMPAT; + NLA_PUT_U32(skb, RTA_TABLE, tb_id); + rtm->rtm_type = type; + rtm->rtm_flags = fi->fib_flags; + rtm->rtm_scope = fi->fib_scope; + rtm->rtm_protocol = fi->fib_protocol; + + if (rtm->rtm_dst_len) + NLA_PUT_BE32(skb, RTA_DST, dst); + + if (fi->fib_priority) + NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority); + + if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0) + goto nla_put_failure; + + if (fi->fib_prefsrc) + NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc); + + if (fi->fib_nhs == 1) { + if (fi->fib_nh->nh_gw) + NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw); + + if (fi->fib_nh->nh_oif) + NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif); +#ifdef CONFIG_IP_ROUTE_CLASSID + if (fi->fib_nh[0].nh_tclassid) + NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid); +#endif + } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (fi->fib_nhs > 1) { + struct rtnexthop *rtnh; + struct nlattr *mp; + + mp = nla_nest_start(skb, RTA_MULTIPATH); + if (mp == NULL) + goto nla_put_failure; + + for_nexthops(fi) { + rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); + if (rtnh == NULL) + goto nla_put_failure; + + rtnh->rtnh_flags = nh->nh_flags & 0xFF; + 
rtnh->rtnh_hops = nh->nh_weight - 1; + rtnh->rtnh_ifindex = nh->nh_oif; + + if (nh->nh_gw) + NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw); +#ifdef CONFIG_IP_ROUTE_CLASSID + if (nh->nh_tclassid) + NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid); +#endif + /* length of rtnetlink header + attributes */ + rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh; + } endfor_nexthops(fi); + + nla_nest_end(skb, mp); + } +#endif + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +/* + * Update FIB if: + * - local address disappeared -> we must delete all the entries + * referring to it. + * - device went down -> we must shutdown all nexthops going via it. + */ +int fib_sync_down_addr(struct net *net, __be32 local) +{ + int ret = 0; + unsigned int hash = fib_laddr_hashfn(local); + struct hlist_head *head = &fib_info_laddrhash[hash]; + struct hlist_node *node; + struct fib_info *fi; + + if (fib_info_laddrhash == NULL || local == 0) + return 0; + + hlist_for_each_entry(fi, node, head, fib_lhash) { + if (!net_eq(fi->fib_net, net)) + continue; + if (fi->fib_prefsrc == local) { + fi->fib_flags |= RTNH_F_DEAD; + ret++; + } + } + return ret; +} + +int fib_sync_down_dev(struct net_device *dev, int force) +{ + int ret = 0; + int scope = RT_SCOPE_NOWHERE; + struct fib_info *prev_fi = NULL; + unsigned int hash = fib_devindex_hashfn(dev->ifindex); + struct hlist_head *head = &fib_info_devhash[hash]; + struct hlist_node *node; + struct fib_nh *nh; + + if (force) + scope = -1; + + hlist_for_each_entry(nh, node, head, nh_hash) { + struct fib_info *fi = nh->nh_parent; + int dead; + + BUG_ON(!fi->fib_nhs); + if (nh->nh_dev != dev || fi == prev_fi) + continue; + prev_fi = fi; + dead = 0; + change_nexthops(fi) { + if (nexthop_nh->nh_flags & RTNH_F_DEAD) + dead++; + else if (nexthop_nh->nh_dev == dev && + nexthop_nh->nh_scope != scope) { + nexthop_nh->nh_flags |= RTNH_F_DEAD; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + spin_lock_bh(&fib_multipath_lock); + fi->fib_power -= nexthop_nh->nh_power; + nexthop_nh->nh_power = 0; + spin_unlock_bh(&fib_multipath_lock); +#endif + dead++; + } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (force > 1 && nexthop_nh->nh_dev == dev) { + dead = fi->fib_nhs; + break; + } +#endif + } endfor_nexthops(fi) + if (dead == fi->fib_nhs) { + fi->fib_flags |= RTNH_F_DEAD; + ret++; + } + } + + return ret; +} + +/* Must be invoked inside of an RCU protected region. 
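+ *
+ * Roughly: fib_select_default() re-walks the alias list of the matched
+ * default route and keeps the first candidate whose gateway still looks
+ * alive according to fib_detect_death(), falling back to the remembered
+ * last resort; tb->tb_default caches the chosen position.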
*/ +void fib_select_default(struct fib_result *res) +{ + struct fib_info *fi = NULL, *last_resort = NULL; + struct list_head *fa_head = res->fa_head; + struct fib_table *tb = res->table; + int order = -1, last_idx = -1; + struct fib_alias *fa; + + list_for_each_entry_rcu(fa, fa_head, fa_list) { + struct fib_info *next_fi = fa->fa_info; + + if (next_fi->fib_scope != res->scope || + fa->fa_type != RTN_UNICAST) + continue; + + if (next_fi->fib_priority > res->fi->fib_priority) + break; + if (!next_fi->fib_nh[0].nh_gw || + next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) + continue; + + fib_alias_accessed(fa); + + if (fi == NULL) { + if (next_fi != res->fi) + break; + } else if (!fib_detect_death(fi, order, &last_resort, + &last_idx, tb->tb_default)) { + fib_result_assign(res, fi); + tb->tb_default = order; + goto out; + } + fi = next_fi; + order++; + } + + if (order <= 0 || fi == NULL) { + tb->tb_default = -1; + goto out; + } + + if (!fib_detect_death(fi, order, &last_resort, &last_idx, + tb->tb_default)) { + fib_result_assign(res, fi); + tb->tb_default = order; + goto out; + } + + if (last_idx >= 0) + fib_result_assign(res, last_resort); + tb->tb_default = last_idx; +out: + return; +} + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +/* + * Dead device goes up. We wake up dead nexthops. + * It takes sense only on multipath routes. + */ +int fib_sync_up(struct net_device *dev) +{ + struct fib_info *prev_fi; + unsigned int hash; + struct hlist_head *head; + struct hlist_node *node; + struct fib_nh *nh; + int ret; + + if (!(dev->flags & IFF_UP)) + return 0; + + prev_fi = NULL; + hash = fib_devindex_hashfn(dev->ifindex); + head = &fib_info_devhash[hash]; + ret = 0; + + hlist_for_each_entry(nh, node, head, nh_hash) { + struct fib_info *fi = nh->nh_parent; + int alive; + + BUG_ON(!fi->fib_nhs); + if (nh->nh_dev != dev || fi == prev_fi) + continue; + + prev_fi = fi; + alive = 0; + change_nexthops(fi) { + if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { + alive++; + continue; + } + if (nexthop_nh->nh_dev == NULL || + !(nexthop_nh->nh_dev->flags & IFF_UP)) + continue; + if (nexthop_nh->nh_dev != dev || + !__in_dev_get_rtnl(dev)) + continue; + alive++; + spin_lock_bh(&fib_multipath_lock); + nexthop_nh->nh_power = 0; + nexthop_nh->nh_flags &= ~RTNH_F_DEAD; + spin_unlock_bh(&fib_multipath_lock); + } endfor_nexthops(fi) + + if (alive > 0) { + fi->fib_flags &= ~RTNH_F_DEAD; + ret++; + } + } + + return ret; +} + +/* + * The algorithm is suboptimal, but it provides really + * fair weighted route distribution. + */ +void fib_select_multipath(struct fib_result *res) +{ + struct fib_info *fi = res->fi; + int w; + + spin_lock_bh(&fib_multipath_lock); + if (fi->fib_power <= 0) { + int power = 0; + change_nexthops(fi) { + if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { + power += nexthop_nh->nh_weight; + nexthop_nh->nh_power = nexthop_nh->nh_weight; + } + } endfor_nexthops(fi); + fi->fib_power = power; + if (power <= 0) { + spin_unlock_bh(&fib_multipath_lock); + /* Race condition: route has just become dead. */ + res->nh_sel = 0; + return; + } + } + + + /* w should be random number [0..fi->fib_power-1], + * it is pretty bad approximation. + */ + + w = jiffies % fi->fib_power; + + change_nexthops(fi) { + if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) && + nexthop_nh->nh_power) { + w -= nexthop_nh->nh_power; + if (w <= 0) { + nexthop_nh->nh_power--; + fi->fib_power--; + res->nh_sel = nhsel; + spin_unlock_bh(&fib_multipath_lock); + return; + } + } + } endfor_nexthops(fi); + + /* Race condition: route has just become dead. 
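+ * Fall back to the first nexthop in that case.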
*/ + res->nh_sel = 0; + spin_unlock_bh(&fib_multipath_lock); +} +#endif diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c new file mode 100644 index 00000000..0d884eb2 --- /dev/null +++ b/net/ipv4/fib_trie.c @@ -0,0 +1,2636 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Robert Olsson Uppsala Universitet + * & Swedish University of Agricultural Sciences. + * + * Jens Laas Swedish University of + * Agricultural Sciences. + * + * Hans Liss Uppsala Universitet + * + * This work is based on the LPC-trie which is originally described in: + * + * An experimental study of compression methods for dynamic tries + * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. + * http://www.csc.kth.se/~snilsson/software/dyntrie2/ + * + * + * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson + * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 + * + * + * Code from fib_hash has been reused which includes the following header: + * + * + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IPv4 FIB: lookup engine and maintenance routines. + * + * + * Authors: Alexey Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Substantial contributions to this work comes from: + * + * David S. Miller, + * Stephen Hemminger + * Paul E. 
McKenney + * Patrick McHardy + */ + +#define VERSION "0.409" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "fib_lookup.h" + +#define MAX_STAT_DEPTH 32 + +#define KEYLENGTH (8*sizeof(t_key)) + +typedef unsigned int t_key; + +#define T_TNODE 0 +#define T_LEAF 1 +#define NODE_TYPE_MASK 0x1UL +#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK) + +#define IS_TNODE(n) (!(n->parent & T_LEAF)) +#define IS_LEAF(n) (n->parent & T_LEAF) + +struct rt_trie_node { + unsigned long parent; + t_key key; +}; + +struct leaf { + unsigned long parent; + t_key key; + struct hlist_head list; + struct rcu_head rcu; +}; + +struct leaf_info { + struct hlist_node hlist; + struct rcu_head rcu; + int plen; + struct list_head falh; +}; + +struct tnode { + unsigned long parent; + t_key key; + unsigned char pos; /* 2log(KEYLENGTH) bits needed */ + unsigned char bits; /* 2log(KEYLENGTH) bits needed */ + unsigned int full_children; /* KEYLENGTH bits needed */ + unsigned int empty_children; /* KEYLENGTH bits needed */ + union { + struct rcu_head rcu; + struct work_struct work; + struct tnode *tnode_free; + }; + struct rt_trie_node __rcu *child[0]; +}; + +#ifdef CONFIG_IP_FIB_TRIE_STATS +struct trie_use_stats { + unsigned int gets; + unsigned int backtrack; + unsigned int semantic_match_passed; + unsigned int semantic_match_miss; + unsigned int null_node_hit; + unsigned int resize_node_skipped; +}; +#endif + +struct trie_stat { + unsigned int totdepth; + unsigned int maxdepth; + unsigned int tnodes; + unsigned int leaves; + unsigned int nullpointers; + unsigned int prefixes; + unsigned int nodesizes[MAX_STAT_DEPTH]; +}; + +struct trie { + struct rt_trie_node __rcu *trie; +#ifdef CONFIG_IP_FIB_TRIE_STATS + struct trie_use_stats stats; +#endif +}; + +static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n); +static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, + int wasfull); +static struct rt_trie_node *resize(struct trie *t, struct tnode *tn); +static struct tnode *inflate(struct trie *t, struct tnode *tn); +static struct tnode *halve(struct trie *t, struct tnode *tn); +/* tnodes to free after resize(); protected by RTNL */ +static struct tnode *tnode_free_head; +static size_t tnode_free_size; + +/* + * synchronize_rcu after call_rcu for that many pages; it should be especially + * useful before resizing the root node with PREEMPT_NONE configs; the value was + * obtained experimentally, aiming to avoid visible slowdown. 
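+ * (See tnode_free_flush(): once tnode_free_size reaches
+ * PAGE_SIZE * sync_pages, the counter is reset and synchronize_rcu()
+ * is called.)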
+ */ +static const int sync_pages = 128; + +static struct kmem_cache *fn_alias_kmem __read_mostly; +static struct kmem_cache *trie_leaf_kmem __read_mostly; + +/* + * caller must hold RTNL + */ +static inline struct tnode *node_parent(const struct rt_trie_node *node) +{ + unsigned long parent; + + parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held()); + + return (struct tnode *)(parent & ~NODE_TYPE_MASK); +} + +/* + * caller must hold RCU read lock or RTNL + */ +static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node) +{ + unsigned long parent; + + parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() || + lockdep_rtnl_is_held()); + + return (struct tnode *)(parent & ~NODE_TYPE_MASK); +} + +/* Same as rcu_assign_pointer + * but that macro() assumes that value is a pointer. + */ +static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr) +{ + smp_wmb(); + node->parent = (unsigned long)ptr | NODE_TYPE(node); +} + +/* + * caller must hold RTNL + */ +static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i) +{ + BUG_ON(i >= 1U << tn->bits); + + return rtnl_dereference(tn->child[i]); +} + +/* + * caller must hold RCU read lock or RTNL + */ +static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i) +{ + BUG_ON(i >= 1U << tn->bits); + + return rcu_dereference_rtnl(tn->child[i]); +} + +static inline int tnode_child_length(const struct tnode *tn) +{ + return 1 << tn->bits; +} + +static inline t_key mask_pfx(t_key k, unsigned int l) +{ + return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l); +} + +static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits) +{ + if (offset < KEYLENGTH) + return ((t_key)(a << offset)) >> (KEYLENGTH - bits); + else + return 0; +} + +static inline int tkey_equals(t_key a, t_key b) +{ + return a == b; +} + +static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b) +{ + if (bits == 0 || offset >= KEYLENGTH) + return 1; + bits = bits > KEYLENGTH ? KEYLENGTH : bits; + return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0; +} + +static inline int tkey_mismatch(t_key a, int offset, t_key b) +{ + t_key diff = a ^ b; + int i = offset; + + if (!diff) + return 0; + while ((diff << i) >> (KEYLENGTH-1) == 0) + i++; + return i; +} + +/* + To understand this stuff, an understanding of keys and all their bits is + necessary. Every node in the trie has a key associated with it, but not + all of the bits in that key are significant. + + Consider a node 'n' and its parent 'tp'. + + If n is a leaf, every bit in its key is significant. Its presence is + necessitated by path compression, since during a tree traversal (when + searching for a leaf - unless we are doing an insertion) we will completely + ignore all skipped bits we encounter. Thus we need to verify, at the end of + a potentially successful search, that we have indeed been walking the + correct key path. + + Note that we can never "miss" the correct key in the tree if present by + following the wrong path. Path compression ensures that segments of the key + that are the same for all keys with a given prefix are skipped, but the + skipped part *is* identical for each node in the subtrie below the skipped + bit! trie_insert() in this implementation takes care of that - note the + call to tkey_sub_equals() in trie_insert(). + + if n is an internal node - a 'tnode' here, the various parts of its key + have many different meanings. 
+ + Example: + _________________________________________________________________ + | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C | + ----------------------------------------------------------------- + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + + _________________________________________________________________ + | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u | + ----------------------------------------------------------------- + 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + + tp->pos = 7 + tp->bits = 3 + n->pos = 15 + n->bits = 4 + + First, let's just ignore the bits that come before the parent tp, that is + the bits from 0 to (tp->pos-1). They are *known* but at this point we do + not use them for anything. + + The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the + index into the parent's child array. That is, they will be used to find + 'n' among tp's children. + + The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits + for the node n. + + All the bits we have seen so far are significant to the node n. The rest + of the bits are really not needed or indeed known in n->key. + + The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into + n's child array, and will of course be different for each child. + + + The rest of the bits, from (n->pos + n->bits) onward, are completely unknown + at this point. + +*/ + +static inline void check_tnode(const struct tnode *tn) +{ + WARN_ON(tn && tn->pos+tn->bits > 32); +} + +static const int halve_threshold = 25; +static const int inflate_threshold = 50; +static const int halve_threshold_root = 15; +static const int inflate_threshold_root = 30; + +static void __alias_free_mem(struct rcu_head *head) +{ + struct fib_alias *fa = container_of(head, struct fib_alias, rcu); + kmem_cache_free(fn_alias_kmem, fa); +} + +static inline void alias_free_mem_rcu(struct fib_alias *fa) +{ + call_rcu(&fa->rcu, __alias_free_mem); +} + +static void __leaf_free_rcu(struct rcu_head *head) +{ + struct leaf *l = container_of(head, struct leaf, rcu); + kmem_cache_free(trie_leaf_kmem, l); +} + +static inline void free_leaf(struct leaf *l) +{ + call_rcu_bh(&l->rcu, __leaf_free_rcu); +} + +static inline void free_leaf_info(struct leaf_info *leaf) +{ + kfree_rcu(leaf, rcu); +} + +static struct tnode *tnode_alloc(size_t size) +{ + if (size <= PAGE_SIZE) + return kzalloc(size, GFP_KERNEL); + else + return vzalloc(size); +} + +static void __tnode_vfree(struct work_struct *arg) +{ + struct tnode *tn = container_of(arg, struct tnode, work); + vfree(tn); +} + +static void __tnode_free_rcu(struct rcu_head *head) +{ + struct tnode *tn = container_of(head, struct tnode, rcu); + size_t size = sizeof(struct tnode) + + (sizeof(struct rt_trie_node *) << tn->bits); + + if (size <= PAGE_SIZE) + kfree(tn); + else { + INIT_WORK(&tn->work, __tnode_vfree); + schedule_work(&tn->work); + } +} + +static inline void tnode_free(struct tnode *tn) +{ + if (IS_LEAF(tn)) + free_leaf((struct leaf *) tn); + else + call_rcu(&tn->rcu, __tnode_free_rcu); +} + +static void tnode_free_safe(struct tnode *tn) +{ + BUG_ON(IS_LEAF(tn)); + tn->tnode_free = tnode_free_head; + tnode_free_head = tn; + tnode_free_size += sizeof(struct tnode) + + (sizeof(struct rt_trie_node *) << tn->bits); +} + +static void tnode_free_flush(void) +{ + struct tnode *tn; + + while ((tn = tnode_free_head)) { + tnode_free_head = tn->tnode_free; + tn->tnode_free = NULL; + tnode_free(tn); + } + + if (tnode_free_size >= PAGE_SIZE * sync_pages) { + 
tnode_free_size = 0; + synchronize_rcu(); + } +} + +static struct leaf *leaf_new(void) +{ + struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); + if (l) { + l->parent = T_LEAF; + INIT_HLIST_HEAD(&l->list); + } + return l; +} + +static struct leaf_info *leaf_info_new(int plen) +{ + struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); + if (li) { + li->plen = plen; + INIT_LIST_HEAD(&li->falh); + } + return li; +} + +static struct tnode *tnode_new(t_key key, int pos, int bits) +{ + size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits); + struct tnode *tn = tnode_alloc(sz); + + if (tn) { + tn->parent = T_TNODE; + tn->pos = pos; + tn->bits = bits; + tn->key = key; + tn->full_children = 0; + tn->empty_children = 1<pos == tn->pos + tn->bits; +} + +static inline void put_child(struct trie *t, struct tnode *tn, int i, + struct rt_trie_node *n) +{ + tnode_put_child_reorg(tn, i, n, -1); +} + + /* + * Add a child at position i overwriting the old value. + * Update the value of full_children and empty_children. + */ + +static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, + int wasfull) +{ + struct rt_trie_node *chi = rtnl_dereference(tn->child[i]); + int isfull; + + BUG_ON(i >= 1<bits); + + /* update emptyChildren */ + if (n == NULL && chi != NULL) + tn->empty_children++; + else if (n != NULL && chi == NULL) + tn->empty_children--; + + /* update fullChildren */ + if (wasfull == -1) + wasfull = tnode_full(tn, chi); + + isfull = tnode_full(tn, n); + if (wasfull && !isfull) + tn->full_children--; + else if (!wasfull && isfull) + tn->full_children++; + + if (n) + node_set_parent(n, tn); + + rcu_assign_pointer(tn->child[i], n); +} + +#define MAX_WORK 10 +static struct rt_trie_node *resize(struct trie *t, struct tnode *tn) +{ + int i; + struct tnode *old_tn; + int inflate_threshold_use; + int halve_threshold_use; + int max_work; + + if (!tn) + return NULL; + + pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", + tn, inflate_threshold, halve_threshold); + + /* No children */ + if (tn->empty_children == tnode_child_length(tn)) { + tnode_free_safe(tn); + return NULL; + } + /* One child */ + if (tn->empty_children == tnode_child_length(tn) - 1) + goto one_child; + /* + * Double as long as the resulting node has a number of + * nonempty nodes that are above the threshold. + */ + + /* + * From "Implementing a dynamic compressed trie" by Stefan Nilsson of + * the Helsinki University of Technology and Matti Tikkanen of Nokia + * Telecommunications, page 6: + * "A node is doubled if the ratio of non-empty children to all + * children in the *doubled* node is at least 'high'." + * + * 'high' in this instance is the variable 'inflate_threshold'. It + * is expressed as a percentage, so we multiply it with + * tnode_child_length() and instead of multiplying by 2 (since the + * child array will be doubled by inflate()) and multiplying + * the left-hand side by 100 (to handle the percentage thing) we + * multiply the left-hand side by 50. + * + * The left-hand side may look a bit weird: tnode_child_length(tn) + * - tn->empty_children is of course the number of non-null children + * in the current node. tn->full_children is the number of "full" + * children, that is non-null tnodes with a skip value of 0. + * All of those will be doubled in the resulting inflated tnode, so + * we just count them one extra time here. 
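[Editor's note] To make the bookkeeping concrete before the threshold algebra is restated below: empty_children counts NULL slots, and full_children counts child tnodes that skip no bits (their pos equals the parent's pos + bits), which is exactly what tnode_put_child_reorg() maintains. The fragment below models just those two counters in userspace; the struct layout and names are simplifications for illustration, not the kernel's structures.

/* Minimal model of the empty/full child accounting (illustrative only). */
#include <stdio.h>
#include <stdlib.h>

struct node {
        int is_leaf;
        int pos, bits;          /* meaningful only for internal nodes */
};

struct tnode {
        int pos, bits;
        int full_children;      /* children that skip no bits */
        int empty_children;     /* NULL slots */
        struct node **child;
};

/* A child is "full" when it is an internal node that starts right where the
 * parent's index bits end, i.e. no bits are skipped between the two. */
static int child_is_full(const struct tnode *tn, const struct node *n)
{
        return n && !n->is_leaf && n->pos == tn->pos + tn->bits;
}

static void put_child(struct tnode *tn, int i, struct node *n)
{
        struct node *old = tn->child[i];

        if (!n && old)
                tn->empty_children++;
        else if (n && !old)
                tn->empty_children--;

        if (child_is_full(tn, old))
                tn->full_children--;
        if (child_is_full(tn, n))
                tn->full_children++;

        tn->child[i] = n;
}

int main(void)
{
        struct tnode tn = { .pos = 8, .bits = 2,
                            .empty_children = 4 };      /* 1 << bits slots, all empty */
        struct node leaf = { .is_leaf = 1 };
        struct node sub  = { .is_leaf = 0, .pos = 10, .bits = 3 };

        tn.child = calloc(1 << tn.bits, sizeof(*tn.child));
        put_child(&tn, 0, &leaf);       /* non-empty, but not "full" */
        put_child(&tn, 1, &sub);        /* non-empty and full: 10 == 8 + 2 */

        printf("empty=%d full=%d\n", tn.empty_children, tn.full_children);
        free(tn.child);
        return 0;
}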
+ * + * A clearer way to write this would be: + * + * to_be_doubled = tn->full_children; + * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children - + * tn->full_children; + * + * new_child_length = tnode_child_length(tn) * 2; + * + * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / + * new_child_length; + * if (new_fill_factor >= inflate_threshold) + * + * ...and so on, tho it would mess up the while () loop. + * + * anyway, + * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >= + * inflate_threshold + * + * avoid a division: + * 100 * (not_to_be_doubled + 2*to_be_doubled) >= + * inflate_threshold * new_child_length + * + * expand not_to_be_doubled and to_be_doubled, and shorten: + * 100 * (tnode_child_length(tn) - tn->empty_children + + * tn->full_children) >= inflate_threshold * new_child_length + * + * expand new_child_length: + * 100 * (tnode_child_length(tn) - tn->empty_children + + * tn->full_children) >= + * inflate_threshold * tnode_child_length(tn) * 2 + * + * shorten again: + * 50 * (tn->full_children + tnode_child_length(tn) - + * tn->empty_children) >= inflate_threshold * + * tnode_child_length(tn) + * + */ + + check_tnode(tn); + + /* Keep root node larger */ + + if (!node_parent((struct rt_trie_node *)tn)) { + inflate_threshold_use = inflate_threshold_root; + halve_threshold_use = halve_threshold_root; + } else { + inflate_threshold_use = inflate_threshold; + halve_threshold_use = halve_threshold; + } + + max_work = MAX_WORK; + while ((tn->full_children > 0 && max_work-- && + 50 * (tn->full_children + tnode_child_length(tn) + - tn->empty_children) + >= inflate_threshold_use * tnode_child_length(tn))) { + + old_tn = tn; + tn = inflate(t, tn); + + if (IS_ERR(tn)) { + tn = old_tn; +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.resize_node_skipped++; +#endif + break; + } + } + + check_tnode(tn); + + /* Return if at least one inflate is run */ + if (max_work != MAX_WORK) + return (struct rt_trie_node *) tn; + + /* + * Halve as long as the number of empty children in this + * node is above threshold. + */ + + max_work = MAX_WORK; + while (tn->bits > 1 && max_work-- && + 100 * (tnode_child_length(tn) - tn->empty_children) < + halve_threshold_use * tnode_child_length(tn)) { + + old_tn = tn; + tn = halve(t, tn); + if (IS_ERR(tn)) { + tn = old_tn; +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.resize_node_skipped++; +#endif + break; + } + } + + + /* Only one child remains */ + if (tn->empty_children == tnode_child_length(tn) - 1) { +one_child: + for (i = 0; i < tnode_child_length(tn); i++) { + struct rt_trie_node *n; + + n = rtnl_dereference(tn->child[i]); + if (!n) + continue; + + /* compress one level */ + + node_set_parent(n, NULL); + tnode_free_safe(tn); + return n; + } + } + return (struct rt_trie_node *) tn; +} + + +static void tnode_clean_free(struct tnode *tn) +{ + int i; + struct tnode *tofree; + + for (i = 0; i < tnode_child_length(tn); i++) { + tofree = (struct tnode *)rtnl_dereference(tn->child[i]); + if (tofree) + tnode_free(tofree); + } + tnode_free(tn); +} + +static struct tnode *inflate(struct trie *t, struct tnode *tn) +{ + struct tnode *oldtnode = tn; + int olen = tnode_child_length(tn); + int i; + + pr_debug("In inflate\n"); + + tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); + + if (!tn) + return ERR_PTR(-ENOMEM); + + /* + * Preallocate and store tnodes before the actual work so we + * don't get into an inconsistent state if memory allocation + * fails. 
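[Editor's note] The doubling and halving decisions in resize() boil down to two integer predicates over the three counters, using the default thresholds defined earlier (50/25 for ordinary nodes, 30/15 for the root). The sketch below is purely a restatement of the arithmetic the comment above derives; the function names and sample counts are invented for the example.

/* The resize() predicates as pure functions (illustrative restatement). */
#include <stdio.h>

/* children = 1 << bits; empty = empty_children; full = full_children */
static int should_inflate(int children, int empty, int full, int threshold)
{
        /* 100 * (non-empty + to-be-doubled) / (2 * children) >= threshold,
         * rewritten without the division and with the factor 2 folded in. */
        return full > 0 &&
               50 * (full + children - empty) >= threshold * children;
}

static int should_halve(int children, int empty, int bits, int threshold)
{
        /* 100 * non-empty / children < threshold, and never below 1 bit. */
        return bits > 1 &&
               100 * (children - empty) < threshold * children;
}

int main(void)
{
        /* A 4-slot node with 3 non-empty children, 2 of them "full". */
        printf("inflate? %d\n", should_inflate(4, 1, 2, 50));   /* 250 >= 200 -> yes */

        /* A 16-slot node with only 3 non-empty children. */
        printf("halve?   %d\n", should_halve(16, 13, 4, 25));   /* 300 < 400 -> yes */
        return 0;
}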
In case of failure we return the oldnode and inflate + * of tnode is ignored. + */ + + for (i = 0; i < olen; i++) { + struct tnode *inode; + + inode = (struct tnode *) tnode_get_child(oldtnode, i); + if (inode && + IS_TNODE(inode) && + inode->pos == oldtnode->pos + oldtnode->bits && + inode->bits > 1) { + struct tnode *left, *right; + t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos; + + left = tnode_new(inode->key&(~m), inode->pos + 1, + inode->bits - 1); + if (!left) + goto nomem; + + right = tnode_new(inode->key|m, inode->pos + 1, + inode->bits - 1); + + if (!right) { + tnode_free(left); + goto nomem; + } + + put_child(t, tn, 2*i, (struct rt_trie_node *) left); + put_child(t, tn, 2*i+1, (struct rt_trie_node *) right); + } + } + + for (i = 0; i < olen; i++) { + struct tnode *inode; + struct rt_trie_node *node = tnode_get_child(oldtnode, i); + struct tnode *left, *right; + int size, j; + + /* An empty child */ + if (node == NULL) + continue; + + /* A leaf or an internal node with skipped bits */ + + if (IS_LEAF(node) || ((struct tnode *) node)->pos > + tn->pos + tn->bits - 1) { + if (tkey_extract_bits(node->key, + oldtnode->pos + oldtnode->bits, + 1) == 0) + put_child(t, tn, 2*i, node); + else + put_child(t, tn, 2*i+1, node); + continue; + } + + /* An internal node with two children */ + inode = (struct tnode *) node; + + if (inode->bits == 1) { + put_child(t, tn, 2*i, rtnl_dereference(inode->child[0])); + put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1])); + + tnode_free_safe(inode); + continue; + } + + /* An internal node with more than two children */ + + /* We will replace this node 'inode' with two new + * ones, 'left' and 'right', each with half of the + * original children. The two new nodes will have + * a position one bit further down the key and this + * means that the "significant" part of their keys + * (see the discussion near the top of this file) + * will differ by one bit, which will be "0" in + * left's key and "1" in right's key. Since we are + * moving the key position by one step, the bit that + * we are moving away from - the bit at position + * (inode->pos) - is the one that will differ between + * left and right. So... we synthesize that bit in the + * two new keys. + * The mask 'm' below will be a single "one" bit at + * the position (inode->pos) + */ + + /* Use the old key, but set the new significant + * bit to zero. + */ + + left = (struct tnode *) tnode_get_child(tn, 2*i); + put_child(t, tn, 2*i, NULL); + + BUG_ON(!left); + + right = (struct tnode *) tnode_get_child(tn, 2*i+1); + put_child(t, tn, 2*i+1, NULL); + + BUG_ON(!right); + + size = tnode_child_length(left); + for (j = 0; j < size; j++) { + put_child(t, left, j, rtnl_dereference(inode->child[j])); + put_child(t, right, j, rtnl_dereference(inode->child[j + size])); + } + put_child(t, tn, 2*i, resize(t, left)); + put_child(t, tn, 2*i+1, resize(t, right)); + + tnode_free_safe(inode); + } + tnode_free_safe(oldtnode); + return tn; +nomem: + tnode_clean_free(tn); + return ERR_PTR(-ENOMEM); +} + +static struct tnode *halve(struct trie *t, struct tnode *tn) +{ + struct tnode *oldtnode = tn; + struct rt_trie_node *left, *right; + int i; + int olen = tnode_child_length(tn); + + pr_debug("In halve\n"); + + tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); + + if (!tn) + return ERR_PTR(-ENOMEM); + + /* + * Preallocate and store tnodes before the actual work so we + * don't get into an inconsistent state if memory allocation + * fails. 
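[Editor's note] The index arithmetic in inflate() is easy to lose in the pointer juggling: a child at slot i of the old node lands at slot 2*i or 2*i+1 of the doubled node, chosen by the key bit that the one-bit-wider index now covers, while a child tnode that itself skipped no bits is instead split into a left and a right half. A toy model of just the slot mapping (assumed 32-bit keys; names and the sample key are illustrative):

/* Where does an old child land after inflate()? (illustrative model) */
#include <stdio.h>
#include <stdint.h>

#define KEYLENGTH 32

static uint32_t extract_bits(uint32_t a, unsigned int offset, unsigned int bits)
{
        return (uint32_t)(a << offset) >> (KEYLENGTH - bits);
}

/* The old node indexes key[pos .. pos+bits-1]; the doubled node additionally
 * consumes key[pos+bits], so the new slot is "old slot, then that bit". */
static unsigned int new_index(uint32_t key, unsigned int pos, unsigned int bits)
{
        unsigned int old_i = extract_bits(key, pos, bits);
        unsigned int extra = extract_bits(key, pos + bits, 1);

        return 2 * old_i + extra;
}

int main(void)
{
        uint32_t key = 0xC0A80100;      /* 192.168.1.0 */
        unsigned int pos = 8, bits = 2;

        printf("old slot %u -> new slot %u\n",
               extract_bits(key, pos, bits), new_index(key, pos, bits));
        return 0;
}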
In case of failure we return the oldnode and halve + * of tnode is ignored. + */ + + for (i = 0; i < olen; i += 2) { + left = tnode_get_child(oldtnode, i); + right = tnode_get_child(oldtnode, i+1); + + /* Two nonempty children */ + if (left && right) { + struct tnode *newn; + + newn = tnode_new(left->key, tn->pos + tn->bits, 1); + + if (!newn) + goto nomem; + + put_child(t, tn, i/2, (struct rt_trie_node *)newn); + } + + } + + for (i = 0; i < olen; i += 2) { + struct tnode *newBinNode; + + left = tnode_get_child(oldtnode, i); + right = tnode_get_child(oldtnode, i+1); + + /* At least one of the children is empty */ + if (left == NULL) { + if (right == NULL) /* Both are empty */ + continue; + put_child(t, tn, i/2, right); + continue; + } + + if (right == NULL) { + put_child(t, tn, i/2, left); + continue; + } + + /* Two nonempty children */ + newBinNode = (struct tnode *) tnode_get_child(tn, i/2); + put_child(t, tn, i/2, NULL); + put_child(t, newBinNode, 0, left); + put_child(t, newBinNode, 1, right); + put_child(t, tn, i/2, resize(t, newBinNode)); + } + tnode_free_safe(oldtnode); + return tn; +nomem: + tnode_clean_free(tn); + return ERR_PTR(-ENOMEM); +} + +/* readside must use rcu_read_lock currently dump routines + via get_fa_head and dump */ + +static struct leaf_info *find_leaf_info(struct leaf *l, int plen) +{ + struct hlist_head *head = &l->list; + struct hlist_node *node; + struct leaf_info *li; + + hlist_for_each_entry_rcu(li, node, head, hlist) + if (li->plen == plen) + return li; + + return NULL; +} + +static inline struct list_head *get_fa_head(struct leaf *l, int plen) +{ + struct leaf_info *li = find_leaf_info(l, plen); + + if (!li) + return NULL; + + return &li->falh; +} + +static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) +{ + struct leaf_info *li = NULL, *last = NULL; + struct hlist_node *node; + + if (hlist_empty(head)) { + hlist_add_head_rcu(&new->hlist, head); + } else { + hlist_for_each_entry(li, node, head, hlist) { + if (new->plen > li->plen) + break; + + last = li; + } + if (last) + hlist_add_after_rcu(&last->hlist, &new->hlist); + else + hlist_add_before_rcu(&new->hlist, &li->hlist); + } +} + +/* rcu_read_lock needs to be hold by caller from readside */ + +static struct leaf * +fib_find_node(struct trie *t, u32 key) +{ + int pos; + struct tnode *tn; + struct rt_trie_node *n; + + pos = 0; + n = rcu_dereference_rtnl(t->trie); + + while (n != NULL && NODE_TYPE(n) == T_TNODE) { + tn = (struct tnode *) n; + + check_tnode(tn); + + if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { + pos = tn->pos + tn->bits; + n = tnode_get_child_rcu(tn, + tkey_extract_bits(key, + tn->pos, + tn->bits)); + } else + break; + } + /* Case we have found a leaf. 
Compare prefixes */ + + if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) + return (struct leaf *)n; + + return NULL; +} + +static void trie_rebalance(struct trie *t, struct tnode *tn) +{ + int wasfull; + t_key cindex, key; + struct tnode *tp; + + key = tn->key; + + while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) { + cindex = tkey_extract_bits(key, tp->pos, tp->bits); + wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); + tn = (struct tnode *) resize(t, (struct tnode *)tn); + + tnode_put_child_reorg((struct tnode *)tp, cindex, + (struct rt_trie_node *)tn, wasfull); + + tp = node_parent((struct rt_trie_node *) tn); + if (!tp) + rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); + + tnode_free_flush(); + if (!tp) + break; + tn = tp; + } + + /* Handle last (top) tnode */ + if (IS_TNODE(tn)) + tn = (struct tnode *)resize(t, (struct tnode *)tn); + + rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); + tnode_free_flush(); +} + +/* only used from updater-side */ + +static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) +{ + int pos, newpos; + struct tnode *tp = NULL, *tn = NULL; + struct rt_trie_node *n; + struct leaf *l; + int missbit; + struct list_head *fa_head = NULL; + struct leaf_info *li; + t_key cindex; + + pos = 0; + n = rtnl_dereference(t->trie); + + /* If we point to NULL, stop. Either the tree is empty and we should + * just put a new leaf in if, or we have reached an empty child slot, + * and we should just put our new leaf in that. + * If we point to a T_TNODE, check if it matches our key. Note that + * a T_TNODE might be skipping any number of bits - its 'pos' need + * not be the parent's 'pos'+'bits'! + * + * If it does match the current key, get pos/bits from it, extract + * the index from our key, push the T_TNODE and walk the tree. + * + * If it doesn't, we have to replace it with a new T_TNODE. + * + * If we point to a T_LEAF, it might or might not have the same key + * as we do. If it does, just change the value, update the T_LEAF's + * value, and return it. + * If it doesn't, we need to replace it with a T_TNODE. + */ + + while (n != NULL && NODE_TYPE(n) == T_TNODE) { + tn = (struct tnode *) n; + + check_tnode(tn); + + if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { + tp = tn; + pos = tn->pos + tn->bits; + n = tnode_get_child(tn, + tkey_extract_bits(key, + tn->pos, + tn->bits)); + + BUG_ON(n && node_parent(n) != tn); + } else + break; + } + + /* + * n ----> NULL, LEAF or TNODE + * + * tp is n's (parent) ----> NULL or TNODE + */ + + BUG_ON(tp && IS_LEAF(tp)); + + /* Case 1: n is a leaf. Compare prefixes */ + + if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { + l = (struct leaf *) n; + li = leaf_info_new(plen); + + if (!li) + return NULL; + + fa_head = &li->falh; + insert_leaf_info(&l->list, li); + goto done; + } + l = leaf_new(); + + if (!l) + return NULL; + + l->key = key; + li = leaf_info_new(plen); + + if (!li) { + free_leaf(l); + return NULL; + } + + fa_head = &li->falh; + insert_leaf_info(&l->list, li); + + if (t->trie && n == NULL) { + /* Case 2: n is NULL, and will just insert a new leaf */ + + node_set_parent((struct rt_trie_node *)l, tp); + + cindex = tkey_extract_bits(key, tp->pos, tp->bits); + put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l); + } else { + /* Case 3: n is a LEAF or a TNODE and the key doesn't match. 
*/ + /* + * Add a new tnode here + * first tnode need some special handling + */ + + if (tp) + pos = tp->pos+tp->bits; + else + pos = 0; + + if (n) { + newpos = tkey_mismatch(key, pos, n->key); + tn = tnode_new(n->key, newpos, 1); + } else { + newpos = 0; + tn = tnode_new(key, newpos, 1); /* First tnode */ + } + + if (!tn) { + free_leaf_info(li); + free_leaf(l); + return NULL; + } + + node_set_parent((struct rt_trie_node *)tn, tp); + + missbit = tkey_extract_bits(key, newpos, 1); + put_child(t, tn, missbit, (struct rt_trie_node *)l); + put_child(t, tn, 1-missbit, n); + + if (tp) { + cindex = tkey_extract_bits(key, tp->pos, tp->bits); + put_child(t, (struct tnode *)tp, cindex, + (struct rt_trie_node *)tn); + } else { + rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); + tp = tn; + } + } + + if (tp && tp->pos + tp->bits > 32) + pr_warning("fib_trie" + " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", + tp, tp->pos, tp->bits, key, plen); + + /* Rebalance the trie */ + + trie_rebalance(t, tp); +done: + return fa_head; +} + +/* + * Caller must hold RTNL. + */ +int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) +{ + struct trie *t = (struct trie *) tb->tb_data; + struct fib_alias *fa, *new_fa; + struct list_head *fa_head = NULL; + struct fib_info *fi; + int plen = cfg->fc_dst_len; + u8 tos = cfg->fc_tos; + u32 key, mask; + int err; + struct leaf *l; + + if (plen > 32) + return -EINVAL; + + key = ntohl(cfg->fc_dst); + + pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); + + mask = ntohl(inet_make_mask(plen)); + + if (key & ~mask) + return -EINVAL; + + key = key & mask; + + fi = fib_create_info(cfg); + if (IS_ERR(fi)) { + err = PTR_ERR(fi); + goto err; + } + + l = fib_find_node(t, key); + fa = NULL; + + if (l) { + fa_head = get_fa_head(l, plen); + fa = fib_find_alias(fa_head, tos, fi->fib_priority); + } + + /* Now fa, if non-NULL, points to the first fib alias + * with the same keys [prefix,tos,priority], if such key already + * exists or to the node before which we will insert new one. + * + * If fa is NULL, we will need to allocate a new one and + * insert to the head of f. + * + * If f is NULL, no fib node matched the destination key + * and we need to allocate a new one of those as well. + */ + + if (fa && fa->fa_tos == tos && + fa->fa_info->fib_priority == fi->fib_priority) { + struct fib_alias *fa_first, *fa_match; + + err = -EEXIST; + if (cfg->fc_nlflags & NLM_F_EXCL) + goto out; + + /* We have 2 goals: + * 1. Find exact match for type, scope, fib_info to avoid + * duplicate routes + * 2. 
Find next 'fa' (or head), NLM_F_APPEND inserts before it + */ + fa_match = NULL; + fa_first = fa; + fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); + list_for_each_entry_continue(fa, fa_head, fa_list) { + if (fa->fa_tos != tos) + break; + if (fa->fa_info->fib_priority != fi->fib_priority) + break; + if (fa->fa_type == cfg->fc_type && + fa->fa_info == fi) { + fa_match = fa; + break; + } + } + + if (cfg->fc_nlflags & NLM_F_REPLACE) { + struct fib_info *fi_drop; + u8 state; + + fa = fa_first; + if (fa_match) { + if (fa == fa_match) + err = 0; + goto out; + } + err = -ENOBUFS; + new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); + if (new_fa == NULL) + goto out; + + fi_drop = fa->fa_info; + new_fa->fa_tos = fa->fa_tos; + new_fa->fa_info = fi; + new_fa->fa_type = cfg->fc_type; + state = fa->fa_state; + new_fa->fa_state = state & ~FA_S_ACCESSED; + + list_replace_rcu(&fa->fa_list, &new_fa->fa_list); + alias_free_mem_rcu(fa); + + fib_release_info(fi_drop); + if (state & FA_S_ACCESSED) + rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); + rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, + tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); + + goto succeeded; + } + /* Error if we find a perfect match which + * uses the same scope, type, and nexthop + * information. + */ + if (fa_match) + goto out; + + if (!(cfg->fc_nlflags & NLM_F_APPEND)) + fa = fa_first; + } + err = -ENOENT; + if (!(cfg->fc_nlflags & NLM_F_CREATE)) + goto out; + + err = -ENOBUFS; + new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); + if (new_fa == NULL) + goto out; + + new_fa->fa_info = fi; + new_fa->fa_tos = tos; + new_fa->fa_type = cfg->fc_type; + new_fa->fa_state = 0; + /* + * Insert new entry to the list. + */ + + if (!fa_head) { + fa_head = fib_insert_node(t, key, plen); + if (unlikely(!fa_head)) { + err = -ENOMEM; + goto out_free_new_fa; + } + } + + if (!plen) + tb->tb_num_default++; + + list_add_tail_rcu(&new_fa->fa_list, + (fa ? 
&fa->fa_list : fa_head)); + + rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); + rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, + &cfg->fc_nlinfo, 0); +succeeded: + return 0; + +out_free_new_fa: + kmem_cache_free(fn_alias_kmem, new_fa); +out: + fib_release_info(fi); +err: + return err; +} + +/* should be called with rcu_read_lock */ +static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, + t_key key, const struct flowi4 *flp, + struct fib_result *res, int fib_flags) +{ + struct leaf_info *li; + struct hlist_head *hhead = &l->list; + struct hlist_node *node; + + hlist_for_each_entry_rcu(li, node, hhead, hlist) { + struct fib_alias *fa; + int plen = li->plen; + __be32 mask = inet_make_mask(plen); + + if (l->key != (key & ntohl(mask))) + continue; + + list_for_each_entry_rcu(fa, &li->falh, fa_list) { + struct fib_info *fi = fa->fa_info; + int nhsel, err; + + if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) + continue; + if (fi->fib_dead) + continue; + if (fa->fa_info->fib_scope < flp->flowi4_scope) + continue; + fib_alias_accessed(fa); + err = fib_props[fa->fa_type].error; + if (err) { +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.semantic_match_passed++; +#endif + return err; + } + if (fi->fib_flags & RTNH_F_DEAD) + continue; + for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { + const struct fib_nh *nh = &fi->fib_nh[nhsel]; + + if (nh->nh_flags & RTNH_F_DEAD) + continue; + if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) + continue; + +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.semantic_match_passed++; +#endif + res->prefixlen = plen; + res->nh_sel = nhsel; + res->type = fa->fa_type; + res->scope = fa->fa_info->fib_scope; + res->fi = fi; + res->table = tb; + res->fa_head = &li->falh; + if (!(fib_flags & FIB_LOOKUP_NOREF)) + atomic_inc(&res->fi->fib_clntref); + return 0; + } + } + +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.semantic_match_miss++; +#endif + } + + return 1; +} + +int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, + struct fib_result *res, int fib_flags) +{ + struct trie *t = (struct trie *) tb->tb_data; + int ret; + struct rt_trie_node *n; + struct tnode *pn; + unsigned int pos, bits; + t_key key = ntohl(flp->daddr); + unsigned int chopped_off; + t_key cindex = 0; + unsigned int current_prefix_length = KEYLENGTH; + struct tnode *cn; + t_key pref_mismatch; + + rcu_read_lock(); + + n = rcu_dereference(t->trie); + if (!n) + goto failed; + +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.gets++; +#endif + + /* Just a leaf? */ + if (IS_LEAF(n)) { + ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags); + goto found; + } + + pn = (struct tnode *) n; + chopped_off = 0; + + while (pn) { + pos = pn->pos; + bits = pn->bits; + + if (!chopped_off) + cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length), + pos, bits); + + n = tnode_get_child_rcu(pn, cindex); + + if (n == NULL) { +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.null_node_hit++; +#endif + goto backtrace; + } + + if (IS_LEAF(n)) { + ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags); + if (ret > 0) + goto backtrace; + goto found; + } + + cn = (struct tnode *)n; + + /* + * It's a tnode, and we can do some extra checks here if we + * like, to avoid descending into a dead-end branch. + * This tnode is in the parent's child array at index + * key[p_pos..p_pos+p_bits] but potentially with some bits + * chopped off, so in reality the index may be just a + * subprefix, padded with zero at the end. 
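[Editor's note] check_leaf() is the "semantic" half of the lookup: once the descent lands on a leaf, each leaf_info (one per prefix length stored under that address) is tested by masking the destination down to that length and comparing it with the leaf key. A userspace sketch of that test, working in host byte order for readability (the kernel compares in network order via inet_make_mask()); the addresses and prefix lengths are made up for the example.

/* Semantic match of a destination against the prefixes stored in one leaf
 * (illustrative, host byte order). */
#include <stdio.h>
#include <stdint.h>

static uint32_t make_mask(int plen)
{
        return plen ? ~0u << (32 - plen) : 0;
}

/* Returns 1 if 'dst' falls inside leaf_key/plen. */
static int prefix_matches(uint32_t leaf_key, int plen, uint32_t dst)
{
        return leaf_key == (dst & make_mask(plen));
}

int main(void)
{
        uint32_t leaf_key = 0x0A000000; /* the leaf for 10.0.0.0 */
        int plens[] = { 24, 16, 8 };    /* leaf_info list, most specific first,
                                         * just as insert_leaf_info() orders it */
        uint32_t dst = 0x0A000507;      /* 10.0.5.7 */

        for (unsigned int i = 0; i < sizeof(plens) / sizeof(plens[0]); i++)
                if (prefix_matches(leaf_key, plens[i], dst)) {
                        printf("10.0.5.7 matches 10.0.0.0/%d\n", plens[i]);
                        break;          /* first hit is the longest prefix */
                }
        return 0;
}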
+ * We can also take a look at any skipped bits in this + * tnode - everything up to p_pos is supposed to be ok, + * and the non-chopped bits of the index (se previous + * paragraph) are also guaranteed ok, but the rest is + * considered unknown. + * + * The skipped bits are key[pos+bits..cn->pos]. + */ + + /* If current_prefix_length < pos+bits, we are already doing + * actual prefix matching, which means everything from + * pos+(bits-chopped_off) onward must be zero along some + * branch of this subtree - otherwise there is *no* valid + * prefix present. Here we can only check the skipped + * bits. Remember, since we have already indexed into the + * parent's child array, we know that the bits we chopped of + * *are* zero. + */ + + /* NOTA BENE: Checking only skipped bits + for the new node here */ + + if (current_prefix_length < pos+bits) { + if (tkey_extract_bits(cn->key, current_prefix_length, + cn->pos - current_prefix_length) + || !(cn->child[0])) + goto backtrace; + } + + /* + * If chopped_off=0, the index is fully validated and we + * only need to look at the skipped bits for this, the new, + * tnode. What we actually want to do is to find out if + * these skipped bits match our key perfectly, or if we will + * have to count on finding a matching prefix further down, + * because if we do, we would like to have some way of + * verifying the existence of such a prefix at this point. + */ + + /* The only thing we can do at this point is to verify that + * any such matching prefix can indeed be a prefix to our + * key, and if the bits in the node we are inspecting that + * do not match our key are not ZERO, this cannot be true. + * Thus, find out where there is a mismatch (before cn->pos) + * and verify that all the mismatching bits are zero in the + * new tnode's key. + */ + + /* + * Note: We aren't very concerned about the piece of + * the key that precede pn->pos+pn->bits, since these + * have already been checked. The bits after cn->pos + * aren't checked since these are by definition + * "unknown" at this point. Thus, what we want to see + * is if we are about to enter the "prefix matching" + * state, and in that case verify that the skipped + * bits that will prevail throughout this subtree are + * zero, as they have to be if we are to find a + * matching prefix. + */ + + pref_mismatch = mask_pfx(cn->key ^ key, cn->pos); + + /* + * In short: If skipped bits in this node do not match + * the search key, enter the "prefix matching" + * state.directly. + */ + if (pref_mismatch) { + int mp = KEYLENGTH - fls(pref_mismatch); + + if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0) + goto backtrace; + + if (current_prefix_length >= cn->pos) + current_prefix_length = mp; + } + + pn = (struct tnode *)n; /* Descend */ + chopped_off = 0; + continue; + +backtrace: + chopped_off++; + + /* As zero don't change the child key (cindex) */ + while ((chopped_off <= pn->bits) + && !(cindex & (1<<(chopped_off-1)))) + chopped_off++; + + /* Decrease current_... with bits chopped off */ + if (current_prefix_length > pn->pos + pn->bits - chopped_off) + current_prefix_length = pn->pos + pn->bits + - chopped_off; + + /* + * Either we do the actual chop off according or if we have + * chopped off all bits in this tnode walk up to our parent. 
+ */ + + if (chopped_off <= pn->bits) { + cindex &= ~(1 << (chopped_off-1)); + } else { + struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn); + if (!parent) + goto failed; + + /* Get Child's index */ + cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits); + pn = parent; + chopped_off = 0; + +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.backtrack++; +#endif + goto backtrace; + } + } +failed: + ret = 1; +found: + rcu_read_unlock(); + return ret; +} + +/* + * Remove the leaf and return parent. + */ +static void trie_leaf_remove(struct trie *t, struct leaf *l) +{ + struct tnode *tp = node_parent((struct rt_trie_node *) l); + + pr_debug("entering trie_leaf_remove(%p)\n", l); + + if (tp) { + t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); + put_child(t, (struct tnode *)tp, cindex, NULL); + trie_rebalance(t, tp); + } else + rcu_assign_pointer(t->trie, NULL); + + free_leaf(l); +} + +/* + * Caller must hold RTNL. + */ +int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) +{ + struct trie *t = (struct trie *) tb->tb_data; + u32 key, mask; + int plen = cfg->fc_dst_len; + u8 tos = cfg->fc_tos; + struct fib_alias *fa, *fa_to_delete; + struct list_head *fa_head; + struct leaf *l; + struct leaf_info *li; + + if (plen > 32) + return -EINVAL; + + key = ntohl(cfg->fc_dst); + mask = ntohl(inet_make_mask(plen)); + + if (key & ~mask) + return -EINVAL; + + key = key & mask; + l = fib_find_node(t, key); + + if (!l) + return -ESRCH; + + fa_head = get_fa_head(l, plen); + fa = fib_find_alias(fa_head, tos, 0); + + if (!fa) + return -ESRCH; + + pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); + + fa_to_delete = NULL; + fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); + list_for_each_entry_continue(fa, fa_head, fa_list) { + struct fib_info *fi = fa->fa_info; + + if (fa->fa_tos != tos) + break; + + if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && + (cfg->fc_scope == RT_SCOPE_NOWHERE || + fa->fa_info->fib_scope == cfg->fc_scope) && + (!cfg->fc_prefsrc || + fi->fib_prefsrc == cfg->fc_prefsrc) && + (!cfg->fc_protocol || + fi->fib_protocol == cfg->fc_protocol) && + fib_nh_match(cfg, fi) == 0) { + fa_to_delete = fa; + break; + } + } + + if (!fa_to_delete) + return -ESRCH; + + fa = fa_to_delete; + rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, + &cfg->fc_nlinfo, 0); + + l = fib_find_node(t, key); + li = find_leaf_info(l, plen); + + list_del_rcu(&fa->fa_list); + + if (!plen) + tb->tb_num_default--; + + if (list_empty(fa_head)) { + hlist_del_rcu(&li->hlist); + free_leaf_info(li); + } + + if (hlist_empty(&l->list)) + trie_leaf_remove(t, l); + + if (fa->fa_state & FA_S_ACCESSED) + rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); + + fib_release_info(fa->fa_info); + alias_free_mem_rcu(fa); + return 0; +} + +static int trie_flush_list(struct list_head *head) +{ + struct fib_alias *fa, *fa_node; + int found = 0; + + list_for_each_entry_safe(fa, fa_node, head, fa_list) { + struct fib_info *fi = fa->fa_info; + + if (fi && (fi->fib_flags & RTNH_F_DEAD)) { + list_del_rcu(&fa->fa_list); + fib_release_info(fa->fa_info); + alias_free_mem_rcu(fa); + found++; + } + } + return found; +} + +static int trie_flush_leaf(struct leaf *l) +{ + int found = 0; + struct hlist_head *lih = &l->list; + struct hlist_node *node, *tmp; + struct leaf_info *li = NULL; + + hlist_for_each_entry_safe(li, node, tmp, lih, hlist) { + found += trie_flush_list(&li->falh); + + if (list_empty(&li->falh)) { + hlist_del_rcu(&li->hlist); + free_leaf_info(li); + } + } + return found; +} 
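[Editor's note] Both fib_table_insert() and fib_table_delete() start from the same sanity check: the destination must not have host bits set beyond its prefix length, and the key actually stored in the trie is the masked destination. A compact userspace restatement of that check (host byte order, illustrative only; the helper name is invented):

/* The prefix sanity check shared by insert and delete (illustrative). */
#include <stdio.h>
#include <stdint.h>

static int prefix_key(uint32_t dst, int plen, uint32_t *key)
{
        uint32_t mask;

        if (plen < 0 || plen > 32)
                return -1;              /* -EINVAL in the kernel */

        mask = plen ? ~0u << (32 - plen) : 0;
        if (dst & ~mask)
                return -1;              /* host bits set: reject */

        *key = dst & mask;              /* the trie key is the masked address */
        return 0;
}

int main(void)
{
        uint32_t key;

        /* 10.0.0.0/8 is a valid prefix ... */
        printf("10.0.0.0/8 -> %d\n", prefix_key(0x0A000000, 8, &key));
        /* ... but 10.0.0.1/8 has host bits set and is rejected. */
        printf("10.0.0.1/8 -> %d\n", prefix_key(0x0A000001, 8, &key));
        return 0;
}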
+ +/* + * Scan for the next right leaf starting at node p->child[idx] + * Since we have back pointer, no recursion necessary. + */ +static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c) +{ + do { + t_key idx; + + if (c) + idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1; + else + idx = 0; + + while (idx < 1u << p->bits) { + c = tnode_get_child_rcu(p, idx++); + if (!c) + continue; + + if (IS_LEAF(c)) { + prefetch(rcu_dereference_rtnl(p->child[idx])); + return (struct leaf *) c; + } + + /* Rescan start scanning in new node */ + p = (struct tnode *) c; + idx = 0; + } + + /* Node empty, walk back up to parent */ + c = (struct rt_trie_node *) p; + } while ((p = node_parent_rcu(c)) != NULL); + + return NULL; /* Root of trie */ +} + +static struct leaf *trie_firstleaf(struct trie *t) +{ + struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie); + + if (!n) + return NULL; + + if (IS_LEAF(n)) /* trie is just a leaf */ + return (struct leaf *) n; + + return leaf_walk_rcu(n, NULL); +} + +static struct leaf *trie_nextleaf(struct leaf *l) +{ + struct rt_trie_node *c = (struct rt_trie_node *) l; + struct tnode *p = node_parent_rcu(c); + + if (!p) + return NULL; /* trie with just one leaf */ + + return leaf_walk_rcu(p, c); +} + +static struct leaf *trie_leafindex(struct trie *t, int index) +{ + struct leaf *l = trie_firstleaf(t); + + while (l && index-- > 0) + l = trie_nextleaf(l); + + return l; +} + + +/* + * Caller must hold RTNL. + */ +int fib_table_flush(struct fib_table *tb) +{ + struct trie *t = (struct trie *) tb->tb_data; + struct leaf *l, *ll = NULL; + int found = 0; + + for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) { + found += trie_flush_leaf(l); + + if (ll && hlist_empty(&ll->list)) + trie_leaf_remove(t, ll); + ll = l; + } + + if (ll && hlist_empty(&ll->list)) + trie_leaf_remove(t, ll); + + pr_debug("trie_flush found=%d\n", found); + return found; +} + +void fib_free_table(struct fib_table *tb) +{ + kfree(tb); +} + +static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, + struct fib_table *tb, + struct sk_buff *skb, struct netlink_callback *cb) +{ + int i, s_i; + struct fib_alias *fa; + __be32 xkey = htonl(key); + + s_i = cb->args[5]; + i = 0; + + /* rcu_read_lock is hold by caller */ + + list_for_each_entry_rcu(fa, fah, fa_list) { + if (i < s_i) { + i++; + continue; + } + + if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWROUTE, + tb->tb_id, + fa->fa_type, + xkey, + plen, + fa->fa_tos, + fa->fa_info, NLM_F_MULTI) < 0) { + cb->args[5] = i; + return -1; + } + i++; + } + cb->args[5] = i; + return skb->len; +} + +static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb, + struct sk_buff *skb, struct netlink_callback *cb) +{ + struct leaf_info *li; + struct hlist_node *node; + int i, s_i; + + s_i = cb->args[4]; + i = 0; + + /* rcu_read_lock is hold by caller */ + hlist_for_each_entry_rcu(li, node, &l->list, hlist) { + if (i < s_i) { + i++; + continue; + } + + if (i > s_i) + cb->args[5] = 0; + + if (list_empty(&li->falh)) + continue; + + if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) { + cb->args[4] = i; + return -1; + } + i++; + } + + cb->args[4] = i; + return skb->len; +} + +int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct leaf *l; + struct trie *t = (struct trie *) tb->tb_data; + t_key key = cb->args[2]; + int count = cb->args[3]; + + rcu_read_lock(); + /* Dump starting at last key. 
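[Editor's note] The dump path that continues below leans on the usual netlink convention of parking progress in cb->args[]: here args[2] holds the last leaf key, args[3] a running leaf count, and args[4]/args[5] the per-leaf and per-alias indices, so a dump that overflows one skb can resume where it stopped. A stripped-down model of that skip-and-record pattern (plain arrays instead of skbs; all names are illustrative):

/* Resumable dump in the style of cb->args[] bookkeeping (illustrative). */
#include <stdio.h>

struct cb {
        int args[2];    /* args[0]: next index to emit, like cb->args[4]/[5] */
};

/* Emit at most 'budget' items per call; returns how many were written.
 * A return of 0 means the dump is complete. */
static int dump(const int *items, int n, struct cb *cb, int budget)
{
        int start = cb->args[0];
        int i, done = 0;

        for (i = 0; i < n && done < budget; i++) {
                if (i < start)
                        continue;       /* already sent in a previous call */
                printf("item %d\n", items[i]);
                done++;
        }
        cb->args[0] = i;                /* resume point for the next call */
        return done;
}

int main(void)
{
        int items[] = { 10, 20, 30, 40, 50 };
        struct cb cb = { { 0, 0 } };

        /* Each call plays the role of one dump round with a small buffer;
         * the caller keeps invoking until nothing is left to send. */
        while (dump(items, 5, &cb, 2) > 0)
                printf("-- buffer full, resuming --\n");
        return 0;
}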
+ * Note: 0.0.0.0/0 (ie default) is first key. + */ + if (count == 0) + l = trie_firstleaf(t); + else { + /* Normally, continue from last key, but if that is missing + * fallback to using slow rescan + */ + l = fib_find_node(t, key); + if (!l) + l = trie_leafindex(t, count); + } + + while (l) { + cb->args[2] = l->key; + if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) { + cb->args[3] = count; + rcu_read_unlock(); + return -1; + } + + ++count; + l = trie_nextleaf(l); + memset(&cb->args[4], 0, + sizeof(cb->args) - 4*sizeof(cb->args[0])); + } + cb->args[3] = count; + rcu_read_unlock(); + + return skb->len; +} + +void __init fib_trie_init(void) +{ + fn_alias_kmem = kmem_cache_create("ip_fib_alias", + sizeof(struct fib_alias), + 0, SLAB_PANIC, NULL); + + trie_leaf_kmem = kmem_cache_create("ip_fib_trie", + max(sizeof(struct leaf), + sizeof(struct leaf_info)), + 0, SLAB_PANIC, NULL); +} + + +struct fib_table *fib_trie_table(u32 id) +{ + struct fib_table *tb; + struct trie *t; + + tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie), + GFP_KERNEL); + if (tb == NULL) + return NULL; + + tb->tb_id = id; + tb->tb_default = -1; + tb->tb_num_default = 0; + + t = (struct trie *) tb->tb_data; + memset(t, 0, sizeof(*t)); + + return tb; +} + +#ifdef CONFIG_PROC_FS +/* Depth first Trie walk iterator */ +struct fib_trie_iter { + struct seq_net_private p; + struct fib_table *tb; + struct tnode *tnode; + unsigned int index; + unsigned int depth; +}; + +static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter) +{ + struct tnode *tn = iter->tnode; + unsigned int cindex = iter->index; + struct tnode *p; + + /* A single entry routing table */ + if (!tn) + return NULL; + + pr_debug("get_next iter={node=%p index=%d depth=%d}\n", + iter->tnode, iter->index, iter->depth); +rescan: + while (cindex < (1<bits)) { + struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex); + + if (n) { + if (IS_LEAF(n)) { + iter->tnode = tn; + iter->index = cindex + 1; + } else { + /* push down one level */ + iter->tnode = (struct tnode *) n; + iter->index = 0; + ++iter->depth; + } + return n; + } + + ++cindex; + } + + /* Current node exhausted, pop back up */ + p = node_parent_rcu((struct rt_trie_node *)tn); + if (p) { + cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; + tn = p; + --iter->depth; + goto rescan; + } + + /* got root? 
*/ + return NULL; +} + +static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter, + struct trie *t) +{ + struct rt_trie_node *n; + + if (!t) + return NULL; + + n = rcu_dereference(t->trie); + if (!n) + return NULL; + + if (IS_TNODE(n)) { + iter->tnode = (struct tnode *) n; + iter->index = 0; + iter->depth = 1; + } else { + iter->tnode = NULL; + iter->index = 0; + iter->depth = 0; + } + + return n; +} + +static void trie_collect_stats(struct trie *t, struct trie_stat *s) +{ + struct rt_trie_node *n; + struct fib_trie_iter iter; + + memset(s, 0, sizeof(*s)); + + rcu_read_lock(); + for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) { + if (IS_LEAF(n)) { + struct leaf *l = (struct leaf *)n; + struct leaf_info *li; + struct hlist_node *tmp; + + s->leaves++; + s->totdepth += iter.depth; + if (iter.depth > s->maxdepth) + s->maxdepth = iter.depth; + + hlist_for_each_entry_rcu(li, tmp, &l->list, hlist) + ++s->prefixes; + } else { + const struct tnode *tn = (const struct tnode *) n; + int i; + + s->tnodes++; + if (tn->bits < MAX_STAT_DEPTH) + s->nodesizes[tn->bits]++; + + for (i = 0; i < (1<bits); i++) + if (!tn->child[i]) + s->nullpointers++; + } + } + rcu_read_unlock(); +} + +/* + * This outputs /proc/net/fib_triestats + */ +static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) +{ + unsigned int i, max, pointers, bytes, avdepth; + + if (stat->leaves) + avdepth = stat->totdepth*100 / stat->leaves; + else + avdepth = 0; + + seq_printf(seq, "\tAver depth: %u.%02d\n", + avdepth / 100, avdepth % 100); + seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); + + seq_printf(seq, "\tLeaves: %u\n", stat->leaves); + bytes = sizeof(struct leaf) * stat->leaves; + + seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes); + bytes += sizeof(struct leaf_info) * stat->prefixes; + + seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes); + bytes += sizeof(struct tnode) * stat->tnodes; + + max = MAX_STAT_DEPTH; + while (max > 0 && stat->nodesizes[max-1] == 0) + max--; + + pointers = 0; + for (i = 1; i <= max; i++) + if (stat->nodesizes[i] != 0) { + seq_printf(seq, " %u: %u", i, stat->nodesizes[i]); + pointers += (1<nodesizes[i]; + } + seq_putc(seq, '\n'); + seq_printf(seq, "\tPointers: %u\n", pointers); + + bytes += sizeof(struct rt_trie_node *) * pointers; + seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); + seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); +} + +#ifdef CONFIG_IP_FIB_TRIE_STATS +static void trie_show_usage(struct seq_file *seq, + const struct trie_use_stats *stats) +{ + seq_printf(seq, "\nCounters:\n---------\n"); + seq_printf(seq, "gets = %u\n", stats->gets); + seq_printf(seq, "backtracks = %u\n", stats->backtrack); + seq_printf(seq, "semantic match passed = %u\n", + stats->semantic_match_passed); + seq_printf(seq, "semantic match miss = %u\n", + stats->semantic_match_miss); + seq_printf(seq, "null node hit= %u\n", stats->null_node_hit); + seq_printf(seq, "skipped node resize = %u\n\n", + stats->resize_node_skipped); +} +#endif /* CONFIG_IP_FIB_TRIE_STATS */ + +static void fib_table_print(struct seq_file *seq, struct fib_table *tb) +{ + if (tb->tb_id == RT_TABLE_LOCAL) + seq_puts(seq, "Local:\n"); + else if (tb->tb_id == RT_TABLE_MAIN) + seq_puts(seq, "Main:\n"); + else + seq_printf(seq, "Id %d:\n", tb->tb_id); +} + + +static int fib_triestat_seq_show(struct seq_file *seq, void *v) +{ + struct net *net = (struct net *)seq->private; + unsigned int h; + + seq_printf(seq, + "Basic info: size of leaf:" + " %Zd bytes, size of 
tnode: %Zd bytes.\n", + sizeof(struct leaf), sizeof(struct tnode)); + + for (h = 0; h < FIB_TABLE_HASHSZ; h++) { + struct hlist_head *head = &net->ipv4.fib_table_hash[h]; + struct hlist_node *node; + struct fib_table *tb; + + hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { + struct trie *t = (struct trie *) tb->tb_data; + struct trie_stat stat; + + if (!t) + continue; + + fib_table_print(seq, tb); + + trie_collect_stats(t, &stat); + trie_show_stats(seq, &stat); +#ifdef CONFIG_IP_FIB_TRIE_STATS + trie_show_usage(seq, &t->stats); +#endif + } + } + + return 0; +} + +static int fib_triestat_seq_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, fib_triestat_seq_show); +} + +static const struct file_operations fib_triestat_fops = { + .owner = THIS_MODULE, + .open = fib_triestat_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; + +static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) +{ + struct fib_trie_iter *iter = seq->private; + struct net *net = seq_file_net(seq); + loff_t idx = 0; + unsigned int h; + + for (h = 0; h < FIB_TABLE_HASHSZ; h++) { + struct hlist_head *head = &net->ipv4.fib_table_hash[h]; + struct hlist_node *node; + struct fib_table *tb; + + hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { + struct rt_trie_node *n; + + for (n = fib_trie_get_first(iter, + (struct trie *) tb->tb_data); + n; n = fib_trie_get_next(iter)) + if (pos == idx++) { + iter->tb = tb; + return n; + } + } + } + + return NULL; +} + +static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + rcu_read_lock(); + return fib_trie_get_idx(seq, *pos); +} + +static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct fib_trie_iter *iter = seq->private; + struct net *net = seq_file_net(seq); + struct fib_table *tb = iter->tb; + struct hlist_node *tb_node; + unsigned int h; + struct rt_trie_node *n; + + ++*pos; + /* next node in same table */ + n = fib_trie_get_next(iter); + if (n) + return n; + + /* walk rest of this hash chain */ + h = tb->tb_id & (FIB_TABLE_HASHSZ - 1); + while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) { + tb = hlist_entry(tb_node, struct fib_table, tb_hlist); + n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); + if (n) + goto found; + } + + /* new hash chain */ + while (++h < FIB_TABLE_HASHSZ) { + struct hlist_head *head = &net->ipv4.fib_table_hash[h]; + hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) { + n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); + if (n) + goto found; + } + } + return NULL; + +found: + iter->tb = tb; + return n; +} + +static void fib_trie_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + +static void seq_indent(struct seq_file *seq, int n) +{ + while (n-- > 0) + seq_puts(seq, " "); +} + +static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s) +{ + switch (s) { + case RT_SCOPE_UNIVERSE: return "universe"; + case RT_SCOPE_SITE: return "site"; + case RT_SCOPE_LINK: return "link"; + case RT_SCOPE_HOST: return "host"; + case RT_SCOPE_NOWHERE: return "nowhere"; + default: + snprintf(buf, len, "scope=%d", s); + return buf; + } +} + +static const char *const rtn_type_names[__RTN_MAX] = { + [RTN_UNSPEC] = "UNSPEC", + [RTN_UNICAST] = "UNICAST", + [RTN_LOCAL] = "LOCAL", + [RTN_BROADCAST] = "BROADCAST", + [RTN_ANYCAST] = "ANYCAST", + [RTN_MULTICAST] = "MULTICAST", + [RTN_BLACKHOLE] = "BLACKHOLE", + [RTN_UNREACHABLE] = 
"UNREACHABLE", + [RTN_PROHIBIT] = "PROHIBIT", + [RTN_THROW] = "THROW", + [RTN_NAT] = "NAT", + [RTN_XRESOLVE] = "XRESOLVE", +}; + +static inline const char *rtn_type(char *buf, size_t len, unsigned int t) +{ + if (t < __RTN_MAX && rtn_type_names[t]) + return rtn_type_names[t]; + snprintf(buf, len, "type %u", t); + return buf; +} + +/* Pretty print the trie */ +static int fib_trie_seq_show(struct seq_file *seq, void *v) +{ + const struct fib_trie_iter *iter = seq->private; + struct rt_trie_node *n = v; + + if (!node_parent_rcu(n)) + fib_table_print(seq, iter->tb); + + if (IS_TNODE(n)) { + struct tnode *tn = (struct tnode *) n; + __be32 prf = htonl(mask_pfx(tn->key, tn->pos)); + + seq_indent(seq, iter->depth-1); + seq_printf(seq, " +-- %pI4/%d %d %d %d\n", + &prf, tn->pos, tn->bits, tn->full_children, + tn->empty_children); + + } else { + struct leaf *l = (struct leaf *) n; + struct leaf_info *li; + struct hlist_node *node; + __be32 val = htonl(l->key); + + seq_indent(seq, iter->depth); + seq_printf(seq, " |-- %pI4\n", &val); + + hlist_for_each_entry_rcu(li, node, &l->list, hlist) { + struct fib_alias *fa; + + list_for_each_entry_rcu(fa, &li->falh, fa_list) { + char buf1[32], buf2[32]; + + seq_indent(seq, iter->depth+1); + seq_printf(seq, " /%d %s %s", li->plen, + rtn_scope(buf1, sizeof(buf1), + fa->fa_info->fib_scope), + rtn_type(buf2, sizeof(buf2), + fa->fa_type)); + if (fa->fa_tos) + seq_printf(seq, " tos=%d", fa->fa_tos); + seq_putc(seq, '\n'); + } + } + } + + return 0; +} + +static const struct seq_operations fib_trie_seq_ops = { + .start = fib_trie_seq_start, + .next = fib_trie_seq_next, + .stop = fib_trie_seq_stop, + .show = fib_trie_seq_show, +}; + +static int fib_trie_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &fib_trie_seq_ops, + sizeof(struct fib_trie_iter)); +} + +static const struct file_operations fib_trie_fops = { + .owner = THIS_MODULE, + .open = fib_trie_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +struct fib_route_iter { + struct seq_net_private p; + struct trie *main_trie; + loff_t pos; + t_key key; +}; + +static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos) +{ + struct leaf *l = NULL; + struct trie *t = iter->main_trie; + + /* use cache location of last found key */ + if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key))) + pos -= iter->pos; + else { + iter->pos = 0; + l = trie_firstleaf(t); + } + + while (l && pos-- > 0) { + iter->pos++; + l = trie_nextleaf(l); + } + + if (l) + iter->key = pos; /* remember it */ + else + iter->pos = 0; /* forget it */ + + return l; +} + +static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + struct fib_route_iter *iter = seq->private; + struct fib_table *tb; + + rcu_read_lock(); + tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN); + if (!tb) + return NULL; + + iter->main_trie = (struct trie *) tb->tb_data; + if (*pos == 0) + return SEQ_START_TOKEN; + else + return fib_route_get_idx(iter, *pos - 1); +} + +static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct fib_route_iter *iter = seq->private; + struct leaf *l = v; + + ++*pos; + if (v == SEQ_START_TOKEN) { + iter->pos = 0; + l = trie_firstleaf(iter->main_trie); + } else { + iter->pos++; + l = trie_nextleaf(l); + } + + if (l) + iter->key = l->key; + else + iter->pos = 0; + return l; +} + +static void fib_route_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + 
rcu_read_unlock(); +} + +static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) +{ + unsigned int flags = 0; + + if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT) + flags = RTF_REJECT; + if (fi && fi->fib_nh->nh_gw) + flags |= RTF_GATEWAY; + if (mask == htonl(0xFFFFFFFF)) + flags |= RTF_HOST; + flags |= RTF_UP; + return flags; +} + +/* + * This outputs /proc/net/route. + * The format of the file is not supposed to be changed + * and needs to be same as fib_hash output to avoid breaking + * legacy utilities + */ +static int fib_route_seq_show(struct seq_file *seq, void *v) +{ + struct leaf *l = v; + struct leaf_info *li; + struct hlist_node *node; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " + "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU" + "\tWindow\tIRTT"); + return 0; + } + + hlist_for_each_entry_rcu(li, node, &l->list, hlist) { + struct fib_alias *fa; + __be32 mask, prefix; + + mask = inet_make_mask(li->plen); + prefix = htonl(l->key); + + list_for_each_entry_rcu(fa, &li->falh, fa_list) { + const struct fib_info *fi = fa->fa_info; + unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); + int len; + + if (fa->fa_type == RTN_BROADCAST + || fa->fa_type == RTN_MULTICAST) + continue; + + if (fi) + seq_printf(seq, + "%s\t%08X\t%08X\t%04X\t%d\t%u\t" + "%d\t%08X\t%d\t%u\t%u%n", + fi->fib_dev ? fi->fib_dev->name : "*", + prefix, + fi->fib_nh->nh_gw, flags, 0, 0, + fi->fib_priority, + mask, + (fi->fib_advmss ? + fi->fib_advmss + 40 : 0), + fi->fib_window, + fi->fib_rtt >> 3, &len); + else + seq_printf(seq, + "*\t%08X\t%08X\t%04X\t%d\t%u\t" + "%d\t%08X\t%d\t%u\t%u%n", + prefix, 0, flags, 0, 0, 0, + mask, 0, 0, 0, &len); + + seq_printf(seq, "%*s\n", 127 - len, ""); + } + } + + return 0; +} + +static const struct seq_operations fib_route_seq_ops = { + .start = fib_route_seq_start, + .next = fib_route_seq_next, + .stop = fib_route_seq_stop, + .show = fib_route_seq_show, +}; + +static int fib_route_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &fib_route_seq_ops, + sizeof(struct fib_route_iter)); +} + +static const struct file_operations fib_route_fops = { + .owner = THIS_MODULE, + .open = fib_route_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +int __net_init fib_proc_init(struct net *net) +{ + if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops)) + goto out1; + + if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO, + &fib_triestat_fops)) + goto out2; + + if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops)) + goto out3; + + return 0; + +out3: + proc_net_remove(net, "fib_triestat"); +out2: + proc_net_remove(net, "fib_trie"); +out1: + return -ENOMEM; +} + +void __net_exit fib_proc_exit(struct net *net) +{ + proc_net_remove(net, "fib_trie"); + proc_net_remove(net, "fib_triestat"); + proc_net_remove(net, "route"); +} + +#endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c new file mode 100644 index 00000000..3e3f75d9 --- /dev/null +++ b/net/ipv4/gre.c @@ -0,0 +1,143 @@ +/* + * GRE over IPv4 demultiplexer driver + * + * Authors: Dmitry Kozlov (xeb@mail.ru) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
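[Editor's note] Returning to the /proc/net/route code above: fib_flag_trans() folds the route type, netmask and nexthop into the legacy RTF_* flag word shown in the "Flags" column. A hedged sketch of that mapping, with the conventional flag and RTN_* values assumed from <linux/route.h> and <linux/rtnetlink.h>:

/* Legacy RTF_* flag derivation as done for /proc/net/route (sketch). */
#include <stdio.h>
#include <stdint.h>

#define RTF_UP          0x0001
#define RTF_GATEWAY     0x0002
#define RTF_HOST        0x0004
#define RTF_REJECT      0x0200

enum { RTN_UNICAST = 1, RTN_UNREACHABLE = 7, RTN_PROHIBIT = 8 };

static unsigned int flag_trans(int type, uint32_t mask, uint32_t gateway)
{
        unsigned int flags = 0;

        if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
                flags = RTF_REJECT;             /* negative routes */
        if (gateway)
                flags |= RTF_GATEWAY;           /* nexthop is a gateway */
        if (mask == 0xFFFFFFFFu)
                flags |= RTF_HOST;              /* /32, a host route */
        flags |= RTF_UP;                        /* everything in the table is up */
        return flags;
}

int main(void)
{
        /* Default route via a gateway -> "UG". */
        printf("%04X\n", flag_trans(RTN_UNICAST, 0x00000000, 0xC0A80101));
        /* Directly reachable /32 -> "UH". */
        printf("%04X\n", flag_trans(RTN_UNICAST, 0xFFFFFFFFu, 0));
        return 0;
}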
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly; +static DEFINE_SPINLOCK(gre_proto_lock); + +int gre_add_protocol(const struct gre_protocol *proto, u8 version) +{ + if (version >= GREPROTO_MAX) + goto err_out; + + spin_lock(&gre_proto_lock); + if (gre_proto[version]) + goto err_out_unlock; + + rcu_assign_pointer(gre_proto[version], proto); + spin_unlock(&gre_proto_lock); + return 0; + +err_out_unlock: + spin_unlock(&gre_proto_lock); +err_out: + return -1; +} +EXPORT_SYMBOL_GPL(gre_add_protocol); + +int gre_del_protocol(const struct gre_protocol *proto, u8 version) +{ + if (version >= GREPROTO_MAX) + goto err_out; + + spin_lock(&gre_proto_lock); + if (rcu_dereference_protected(gre_proto[version], + lockdep_is_held(&gre_proto_lock)) != proto) + goto err_out_unlock; + rcu_assign_pointer(gre_proto[version], NULL); + spin_unlock(&gre_proto_lock); + synchronize_rcu(); + return 0; + +err_out_unlock: + spin_unlock(&gre_proto_lock); +err_out: + return -1; +} +EXPORT_SYMBOL_GPL(gre_del_protocol); + +static int gre_rcv(struct sk_buff *skb) +{ + const struct gre_protocol *proto; + u8 ver; + int ret; + + if (!pskb_may_pull(skb, 12)) + goto drop; + + ver = skb->data[1]&0x7f; + if (ver >= GREPROTO_MAX) + goto drop; + + rcu_read_lock(); + proto = rcu_dereference(gre_proto[ver]); + if (!proto || !proto->handler) + goto drop_unlock; + ret = proto->handler(skb); + rcu_read_unlock(); + return ret; + +drop_unlock: + rcu_read_unlock(); +drop: + kfree_skb(skb); + return NET_RX_DROP; +} + +static void gre_err(struct sk_buff *skb, u32 info) +{ + const struct gre_protocol *proto; + const struct iphdr *iph = (const struct iphdr *)skb->data; + u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f; + + if (ver >= GREPROTO_MAX) + return; + + rcu_read_lock(); + proto = rcu_dereference(gre_proto[ver]); + if (proto && proto->err_handler) + proto->err_handler(skb, info); + rcu_read_unlock(); +} + +static const struct net_protocol net_gre_protocol = { + .handler = gre_rcv, + .err_handler = gre_err, + .netns_ok = 1, +}; + +static int __init gre_init(void) +{ + pr_info("GRE over IPv4 demultiplexor driver"); + + if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { + pr_err("gre: can't add protocol\n"); + return -EAGAIN; + } + + return 0; +} + +static void __exit gre_exit(void) +{ + inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); +} + +module_init(gre_init); +module_exit(gre_exit); + +MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver"); +MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); +MODULE_LICENSE("GPL"); + diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c new file mode 100644 index 00000000..23ef31ba --- /dev/null +++ b/net/ipv4/icmp.c @@ -0,0 +1,1207 @@ +/* + * NET3: Implementation of the ICMP protocol layer. + * + * Alan Cox, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Some of the function names and the icmp unreach table for this + * module were derived from [icmp.c 1.0.11 06/02/93] by + * Ross Biro, Fred N. van Kempen, Mark Evans, Alan Cox, Gerhard Koerting. + * Other than that this module is a complete rewrite. 
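[Editor's note] Before the ICMP change log resumes below, the GRE demultiplexer above deserves one concrete illustration: gre_rcv() reads the version from the second header byte and hands the packet to whichever handler registered for that version. A userspace miniature of that dispatch (fixed-size table, no RCU; the value of GREPROTO_MAX, the handler type and the sample packet are assumptions of the sketch, not the kernel API):

/* GRE version demultiplexing in miniature (illustrative). */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define GREPROTO_MAX 2          /* assumed: version 0 and version 1 (PPTP) */

typedef int (*gre_handler_t)(const uint8_t *pkt, size_t len);

static gre_handler_t handlers[GREPROTO_MAX];

static int gre_register(uint8_t version, gre_handler_t h)
{
        if (version >= GREPROTO_MAX || handlers[version])
                return -1;
        handlers[version] = h;
        return 0;
}

/* The version lives in the low bits of the second header byte, which is
 * what gre_rcv() extracts with "skb->data[1] & 0x7f". */
static int gre_demux(const uint8_t *pkt, size_t len)
{
        uint8_t ver;

        if (len < 4)                    /* flags/version + protocol type */
                return -1;
        ver = pkt[1] & 0x7f;
        if (ver >= GREPROTO_MAX || !handlers[ver])
                return -1;              /* the kernel would drop the skb here */
        return handlers[ver](pkt, len);
}

static int ver0_handler(const uint8_t *pkt, size_t len)
{
        (void)pkt;
        printf("GRE v0 payload, %zu bytes\n", len);
        return 0;
}

int main(void)
{
        /* Plain GRE header: no flags, version 0, protocol 0x0800 (IPv4). */
        uint8_t pkt[] = { 0x00, 0x00, 0x08, 0x00 };

        gre_register(0, ver0_handler);
        gre_demux(pkt, sizeof(pkt));
        return 0;
}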
+ * + * Fixes: + * Clemens Fruhwirth : introduce global icmp rate limiting + * with icmp type masking ability instead + * of broken per type icmp timeouts. + * Mike Shaver : RFC1122 checks. + * Alan Cox : Multicast ping reply as self. + * Alan Cox : Fix atomicity lockup in ip_build_xmit + * call. + * Alan Cox : Added 216,128 byte paths to the MTU + * code. + * Martin Mares : RFC1812 checks. + * Martin Mares : Can be configured to follow redirects + * if acting as a router _without_ a + * routing protocol (RFC 1812). + * Martin Mares : Echo requests may be configured to + * be ignored (RFC 1812). + * Martin Mares : Limitation of ICMP error message + * transmit rate (RFC 1812). + * Martin Mares : TOS and Precedence set correctly + * (RFC 1812). + * Martin Mares : Now copying as much data from the + * original packet as we can without + * exceeding 576 bytes (RFC 1812). + * Willy Konynenberg : Transparent proxying support. + * Keith Owens : RFC1191 correction for 4.2BSD based + * path MTU bug. + * Thomas Quinot : ICMP Dest Unreach codes up to 15 are + * valid (RFC 1812). + * Andi Kleen : Check all packet lengths properly + * and moved all kfree_skb() up to + * icmp_rcv. + * Andi Kleen : Move the rate limit bookkeeping + * into the dest entry and use a token + * bucket filter (thanks to ANK). Make + * the rates sysctl configurable. + * Yu Tianli : Fixed two ugly bugs in icmp_send + * - IP option length was accounted wrongly + * - ICMP header length was not accounted + * at all. + * Tristan Greaves : Added sysctl option to ignore bogus + * broadcast responses from broken routers. + * + * To Fix: + * + * - Should use skb_pull() instead of all the manual checking. + * This would also greatly simply some upper layer error handlers. --AK + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Build xmit assembly blocks + */ + +struct icmp_bxm { + struct sk_buff *skb; + int offset; + int data_len; + + struct { + struct icmphdr icmph; + __be32 times[3]; + } data; + int head_len; + struct ip_options_data replyopts; +}; + +/* An array of errno for error messages from dest unreach. */ +/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. 
*/ + +const struct icmp_err icmp_err_convert[] = { + { + .errno = ENETUNREACH, /* ICMP_NET_UNREACH */ + .fatal = 0, + }, + { + .errno = EHOSTUNREACH, /* ICMP_HOST_UNREACH */ + .fatal = 0, + }, + { + .errno = ENOPROTOOPT /* ICMP_PROT_UNREACH */, + .fatal = 1, + }, + { + .errno = ECONNREFUSED, /* ICMP_PORT_UNREACH */ + .fatal = 1, + }, + { + .errno = EMSGSIZE, /* ICMP_FRAG_NEEDED */ + .fatal = 0, + }, + { + .errno = EOPNOTSUPP, /* ICMP_SR_FAILED */ + .fatal = 0, + }, + { + .errno = ENETUNREACH, /* ICMP_NET_UNKNOWN */ + .fatal = 1, + }, + { + .errno = EHOSTDOWN, /* ICMP_HOST_UNKNOWN */ + .fatal = 1, + }, + { + .errno = ENONET, /* ICMP_HOST_ISOLATED */ + .fatal = 1, + }, + { + .errno = ENETUNREACH, /* ICMP_NET_ANO */ + .fatal = 1, + }, + { + .errno = EHOSTUNREACH, /* ICMP_HOST_ANO */ + .fatal = 1, + }, + { + .errno = ENETUNREACH, /* ICMP_NET_UNR_TOS */ + .fatal = 0, + }, + { + .errno = EHOSTUNREACH, /* ICMP_HOST_UNR_TOS */ + .fatal = 0, + }, + { + .errno = EHOSTUNREACH, /* ICMP_PKT_FILTERED */ + .fatal = 1, + }, + { + .errno = EHOSTUNREACH, /* ICMP_PREC_VIOLATION */ + .fatal = 1, + }, + { + .errno = EHOSTUNREACH, /* ICMP_PREC_CUTOFF */ + .fatal = 1, + }, +}; +EXPORT_SYMBOL(icmp_err_convert); + +/* + * ICMP control array. This specifies what to do with each ICMP. + */ + +struct icmp_control { + void (*handler)(struct sk_buff *skb); + short error; /* This ICMP is classed as an error message */ +}; + +static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; + +/* + * The ICMP socket(s). This is the most convenient way to flow control + * our ICMP output as well as maintain a clean interface throughout + * all layers. All Socketless IP sends will soon be gone. + * + * On SMP we have one ICMP socket per-cpu. + */ +static struct sock *icmp_sk(struct net *net) +{ + return net->ipv4.icmp_sk[smp_processor_id()]; +} + +static inline struct sock *icmp_xmit_lock(struct net *net) +{ + struct sock *sk; + + local_bh_disable(); + + sk = icmp_sk(net); + + if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { + /* This can happen if the output path signals a + * dst_link_failure() for an outgoing ICMP packet. + */ + local_bh_enable(); + return NULL; + } + return sk; +} + +static inline void icmp_xmit_unlock(struct sock *sk) +{ + spin_unlock_bh(&sk->sk_lock.slock); +} + +/* + * Send an ICMP frame. + */ + +static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, + struct flowi4 *fl4, int type, int code) +{ + struct dst_entry *dst = &rt->dst; + bool rc = true; + + if (type > NR_ICMP_TYPES) + goto out; + + /* Don't limit PMTU discovery. */ + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) + goto out; + + /* No rate limit on loopback */ + if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) + goto out; + + /* Limit if icmp type is enabled in ratemask. */ + if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { + if (!rt->peer) + rt_bind_peer(rt, fl4->daddr, 1); + rc = inet_peer_xrlim_allow(rt->peer, + net->ipv4.sysctl_icmp_ratelimit); + } +out: + return rc; +} + +/* + * Maintain the counters used in the SNMP statistics for outgoing ICMP + */ +void icmp_out_count(struct net *net, unsigned char type) +{ + ICMPMSGOUT_INC_STATS(net, type); + ICMP_INC_STATS(net, ICMP_MIB_OUTMSGS); +} + +/* + * Checksum each fragment, and on the first include the headers and final + * checksum. 
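[Editorial sketch, not part of the patch] The exported icmp_err_convert[] table above is what transport protocols consult when an ICMP_DEST_UNREACH notification reaches their err_handler. A minimal sketch of that lookup follows; the example_* helper name is illustrative, and the net/icmp.h / linux/icmp.h include paths are assumed from the usual kernel layout.

#include <linux/icmp.h>
#include <net/icmp.h>

/* Map an ICMP unreachable code to the errno a socket would report, and say
 * whether the table marks it as a hard (fatal) error. */
static int example_unreach_errno(u8 code, int *fatal)
{
	if (code > NR_ICMP_UNREACH)
		return 0;			/* outside the table, ignore */
	*fatal = icmp_err_convert[code].fatal;
	return icmp_err_convert[code].errno;	/* e.g. ICMP_PORT_UNREACH -> ECONNREFUSED */
}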
+ */ +static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd, + struct sk_buff *skb) +{ + struct icmp_bxm *icmp_param = (struct icmp_bxm *)from; + __wsum csum; + + csum = skb_copy_and_csum_bits(icmp_param->skb, + icmp_param->offset + offset, + to, len, 0); + + skb->csum = csum_block_add(skb->csum, csum, odd); + if (icmp_pointers[icmp_param->data.icmph.type].error) + nf_ct_attach(skb, icmp_param->skb); + return 0; +} + +static void icmp_push_reply(struct icmp_bxm *icmp_param, + struct flowi4 *fl4, + struct ipcm_cookie *ipc, struct rtable **rt) +{ + struct sock *sk; + struct sk_buff *skb; + + sk = icmp_sk(dev_net((*rt)->dst.dev)); + if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param, + icmp_param->data_len+icmp_param->head_len, + icmp_param->head_len, + ipc, rt, MSG_DONTWAIT) < 0) { + ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_OUTERRORS); + ip_flush_pending_frames(sk); + } else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { + struct icmphdr *icmph = icmp_hdr(skb); + __wsum csum = 0; + struct sk_buff *skb1; + + skb_queue_walk(&sk->sk_write_queue, skb1) { + csum = csum_add(csum, skb1->csum); + } + csum = csum_partial_copy_nocheck((void *)&icmp_param->data, + (char *)icmph, + icmp_param->head_len, csum); + icmph->checksum = csum_fold(csum); + skb->ip_summed = CHECKSUM_NONE; + ip_push_pending_frames(sk, fl4); + } +} + +/* + * Driving logic for building and sending ICMP messages. + */ + +static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) +{ + struct ipcm_cookie ipc; + struct rtable *rt = skb_rtable(skb); + struct net *net = dev_net(rt->dst.dev); + struct flowi4 fl4; + struct sock *sk; + struct inet_sock *inet; + __be32 daddr; + + if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) + return; + + sk = icmp_xmit_lock(net); + if (sk == NULL) + return; + inet = inet_sk(sk); + + icmp_param->data.icmph.checksum = 0; + + inet->tos = ip_hdr(skb)->tos; + daddr = ipc.addr = ip_hdr(skb)->saddr; + ipc.opt = NULL; + ipc.tx_flags = 0; + if (icmp_param->replyopts.opt.opt.optlen) { + ipc.opt = &icmp_param->replyopts.opt; + if (ipc.opt->opt.srr) + daddr = icmp_param->replyopts.opt.opt.faddr; + } + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = daddr; + fl4.saddr = rt->rt_spec_dst; + fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); + fl4.flowi4_proto = IPPROTO_ICMP; + security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + goto out_unlock; + if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type, + icmp_param->data.icmph.code)) + icmp_push_reply(icmp_param, &fl4, &ipc, &rt); + ip_rt_put(rt); +out_unlock: + icmp_xmit_unlock(sk); +} + +static struct rtable *icmp_route_lookup(struct net *net, + struct flowi4 *fl4, + struct sk_buff *skb_in, + const struct iphdr *iph, + __be32 saddr, u8 tos, + int type, int code, + struct icmp_bxm *param) +{ + struct rtable *rt, *rt2; + struct flowi4 fl4_dec; + int err; + + memset(fl4, 0, sizeof(*fl4)); + fl4->daddr = (param->replyopts.opt.opt.srr ? + param->replyopts.opt.opt.faddr : iph->saddr); + fl4->saddr = saddr; + fl4->flowi4_tos = RT_TOS(tos); + fl4->flowi4_proto = IPPROTO_ICMP; + fl4->fl4_icmp_type = type; + fl4->fl4_icmp_code = code; + security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); + rt = __ip_route_output_key(net, fl4); + if (IS_ERR(rt)) + return rt; + + /* No need to clone since we're just using its address. 
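[Editorial sketch, not part of the patch] icmp_glue_bits() and icmp_push_reply() build one ones-complement checksum across all queued fragments plus the ICMP header using the kernel's csum_* helpers. For readers unfamiliar with that arithmetic, here is a standalone, non-kernel illustration of the RFC 1071 sum being folded; the kernel code gets the same result incrementally.

/* Plain C Internet checksum over a buffer (big-endian 16-bit words). */
static unsigned short example_inet_checksum(const unsigned char *data, int len)
{
	unsigned long sum = 0;

	while (len > 1) {
		sum += (data[0] << 8) | data[1];	/* add 16-bit words */
		data += 2;
		len -= 2;
	}
	if (len)					/* odd trailing byte */
		sum += data[0] << 8;
	while (sum >> 16)				/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (unsigned short)~sum;
}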
*/ + rt2 = rt; + + rt = (struct rtable *) xfrm_lookup(net, &rt->dst, + flowi4_to_flowi(fl4), NULL, 0); + if (!IS_ERR(rt)) { + if (rt != rt2) + return rt; + } else if (PTR_ERR(rt) == -EPERM) { + rt = NULL; + } else + return rt; + + err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(&fl4_dec), AF_INET); + if (err) + goto relookup_failed; + + if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) { + rt2 = __ip_route_output_key(net, &fl4_dec); + if (IS_ERR(rt2)) + err = PTR_ERR(rt2); + } else { + struct flowi4 fl4_2 = {}; + unsigned long orefdst; + + fl4_2.daddr = fl4_dec.saddr; + rt2 = ip_route_output_key(net, &fl4_2); + if (IS_ERR(rt2)) { + err = PTR_ERR(rt2); + goto relookup_failed; + } + /* Ugh! */ + orefdst = skb_in->_skb_refdst; /* save old refdst */ + err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, + RT_TOS(tos), rt2->dst.dev); + + dst_release(&rt2->dst); + rt2 = skb_rtable(skb_in); + skb_in->_skb_refdst = orefdst; /* restore old refdst */ + } + + if (err) + goto relookup_failed; + + rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst, + flowi4_to_flowi(&fl4_dec), NULL, + XFRM_LOOKUP_ICMP); + if (!IS_ERR(rt2)) { + dst_release(&rt->dst); + memcpy(fl4, &fl4_dec, sizeof(*fl4)); + rt = rt2; + } else if (PTR_ERR(rt2) == -EPERM) { + if (rt) + dst_release(&rt->dst); + return rt2; + } else { + err = PTR_ERR(rt2); + goto relookup_failed; + } + return rt; + +relookup_failed: + if (rt) + return rt; + return ERR_PTR(err); +} + +/* + * Send an ICMP message in response to a situation + * + * RFC 1122: 3.2.2 MUST send at least the IP header and 8 bytes of header. + * MAY send more (we do). + * MUST NOT change this header information. + * MUST NOT reply to a multicast/broadcast IP address. + * MUST NOT reply to a multicast/broadcast MAC address. + * MUST reply to only the first fragment. + */ + +void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) +{ + struct iphdr *iph; + int room; + struct icmp_bxm icmp_param; + struct rtable *rt = skb_rtable(skb_in); + struct ipcm_cookie ipc; + struct flowi4 fl4; + __be32 saddr; + u8 tos; + struct net *net; + struct sock *sk; + + if (!rt) + goto out; + net = dev_net(rt->dst.dev); + + /* + * Find the original header. It is expected to be valid, of course. + * Check this, icmp_send is called from the most obscure devices + * sometimes. + */ + iph = ip_hdr(skb_in); + + if ((u8 *)iph < skb_in->head || + (skb_in->network_header + sizeof(*iph)) > skb_in->tail) + goto out; + + /* + * No replies to physical multicast/broadcast + */ + if (skb_in->pkt_type != PACKET_HOST) + goto out; + + /* + * Now check at the protocol level + */ + if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + goto out; + + /* + * Only reply to fragment 0. We byte re-order the constant + * mask for efficiency. + */ + if (iph->frag_off & htons(IP_OFFSET)) + goto out; + + /* + * If we send an ICMP error to an ICMP error a mess would result.. + */ + if (icmp_pointers[type].error) { + /* + * We are an error, check if we are replying to an + * ICMP error + */ + if (iph->protocol == IPPROTO_ICMP) { + u8 _inner_type, *itp; + + itp = skb_header_pointer(skb_in, + skb_network_header(skb_in) + + (iph->ihl << 2) + + offsetof(struct icmphdr, + type) - + skb_in->data, + sizeof(_inner_type), + &_inner_type); + if (itp == NULL) + goto out; + + /* + * Assume any unknown ICMP type is an error. This + * isn't specified by the RFC, but think about it.. 
+ */ + if (*itp > NR_ICMP_TYPES || + icmp_pointers[*itp].error) + goto out; + } + } + + sk = icmp_xmit_lock(net); + if (sk == NULL) + return; + + /* + * Construct source address and options. + */ + + saddr = iph->daddr; + if (!(rt->rt_flags & RTCF_LOCAL)) { + struct net_device *dev = NULL; + + rcu_read_lock(); + if (rt_is_input_route(rt) && + net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) + dev = dev_get_by_index_rcu(net, rt->rt_iif); + + if (dev) + saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); + else + saddr = 0; + rcu_read_unlock(); + } + + tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) | + IPTOS_PREC_INTERNETCONTROL) : + iph->tos; + + if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in)) + goto out_unlock; + + + /* + * Prepare data for ICMP header. + */ + + icmp_param.data.icmph.type = type; + icmp_param.data.icmph.code = code; + icmp_param.data.icmph.un.gateway = info; + icmp_param.data.icmph.checksum = 0; + icmp_param.skb = skb_in; + icmp_param.offset = skb_network_offset(skb_in); + inet_sk(sk)->tos = tos; + ipc.addr = iph->saddr; + ipc.opt = &icmp_param.replyopts.opt; + ipc.tx_flags = 0; + + rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, + type, code, &icmp_param); + if (IS_ERR(rt)) + goto out_unlock; + + if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code)) + goto ende; + + /* RFC says return as much as we can without exceeding 576 bytes. */ + + room = dst_mtu(&rt->dst); + if (room > 576) + room = 576; + room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen; + room -= sizeof(struct icmphdr); + + icmp_param.data_len = skb_in->len - icmp_param.offset; + if (icmp_param.data_len > room) + icmp_param.data_len = room; + icmp_param.head_len = sizeof(struct icmphdr); + + icmp_push_reply(&icmp_param, &fl4, &ipc, &rt); +ende: + ip_rt_put(rt); +out_unlock: + icmp_xmit_unlock(sk); +out:; +} +EXPORT_SYMBOL(icmp_send); + + +/* + * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH. + */ + +static void icmp_unreach(struct sk_buff *skb) +{ + const struct iphdr *iph; + struct icmphdr *icmph; + int hash, protocol; + const struct net_protocol *ipprot; + u32 info = 0; + struct net *net; + + net = dev_net(skb_dst(skb)->dev); + + /* + * Incomplete header ? + * Only checks for the IP header, there should be an + * additional check for longer headers in upper levels. + */ + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto out_err; + + icmph = icmp_hdr(skb); + iph = (const struct iphdr *)skb->data; + + if (iph->ihl < 5) /* Mangled header, drop. */ + goto out_err; + + if (icmph->type == ICMP_DEST_UNREACH) { + switch (icmph->code & 15) { + case ICMP_NET_UNREACH: + case ICMP_HOST_UNREACH: + case ICMP_PROT_UNREACH: + case ICMP_PORT_UNREACH: + break; + case ICMP_FRAG_NEEDED: + if (ipv4_config.no_pmtu_disc) { + LIMIT_NETDEBUG(KERN_INFO "ICMP: %pI4: fragmentation needed and DF set.\n", + &iph->daddr); + } else { + info = ip_rt_frag_needed(net, iph, + ntohs(icmph->un.frag.mtu), + skb->dev); + if (!info) + goto out; + } + break; + case ICMP_SR_FAILED: + LIMIT_NETDEBUG(KERN_INFO "ICMP: %pI4: Source Route Failed.\n", + &iph->daddr); + break; + default: + break; + } + if (icmph->code > NR_ICMP_UNREACH) + goto out; + } else if (icmph->type == ICMP_PARAMETERPROB) + info = ntohl(icmph->un.gateway) >> 24; + + /* + * Throw it at our lower layers + * + * RFC 1122: 3.2.2 MUST extract the protocol ID from the passed + * header. + * RFC 1122: 3.2.2.1 MUST pass ICMP unreach messages to the + * transport layer. 
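[Editorial sketch, not part of the patch] The RFC 1812 clamp in icmp_send() bounds how much of the offending datagram is quoted back: with no IP options and an MTU of at least 576, the quoted payload is 576 - 20 - 8 = 548 bytes. A worked mirror of that arithmetic is below; struct sizes are assumed from the usual linux/ip.h and linux/icmp.h, and the helper name is illustrative.

#include <linux/ip.h>
#include <linux/icmp.h>

/* Mirrors the "room" computation in icmp_send(); illustrative only. */
static int example_icmp_quote_room(int mtu, int optlen)
{
	int room = mtu < 576 ? mtu : 576;	/* never exceed 576 bytes total */

	room -= sizeof(struct iphdr) + optlen;	/* outgoing IP header + options */
	room -= sizeof(struct icmphdr);		/* 8-byte ICMP header */
	return room;				/* bytes of the original packet quoted */
}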
+ * RFC 1122: 3.2.2.2 MUST pass ICMP time expired messages to + * transport layer. + */ + + /* + * Check the other end isn't violating RFC 1122. Some routers send + * bogus responses to broadcast frames. If you see this message + * first check your netmask matches at both ends, if it does then + * get the other vendor to fix their kit. + */ + + if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses && + inet_addr_type(net, iph->daddr) == RTN_BROADCAST) { + if (net_ratelimit()) + printk(KERN_WARNING "%pI4 sent an invalid ICMP " + "type %u, code %u " + "error to a broadcast: %pI4 on %s\n", + &ip_hdr(skb)->saddr, + icmph->type, icmph->code, + &iph->daddr, + skb->dev->name); + goto out; + } + + /* Checkin full IP header plus 8 bytes of protocol to + * avoid additional coding at protocol handlers. + */ + if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) + goto out; + + iph = (const struct iphdr *)skb->data; + protocol = iph->protocol; + + /* + * Deliver ICMP message to raw sockets. Pretty useless feature? + */ + raw_icmp_error(skb, protocol, info); + + hash = protocol & (MAX_INET_PROTOS - 1); + rcu_read_lock(); + ipprot = rcu_dereference(inet_protos[hash]); + if (ipprot && ipprot->err_handler) + ipprot->err_handler(skb, info); + rcu_read_unlock(); + +out: + return; +out_err: + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + goto out; +} + + +/* + * Handle ICMP_REDIRECT. + */ + +static void icmp_redirect(struct sk_buff *skb) +{ + const struct iphdr *iph; + + if (skb->len < sizeof(struct iphdr)) + goto out_err; + + /* + * Get the copied header of the packet that caused the redirect + */ + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto out; + + iph = (const struct iphdr *)skb->data; + + switch (icmp_hdr(skb)->code & 7) { + case ICMP_REDIR_NET: + case ICMP_REDIR_NETTOS: + /* + * As per RFC recommendations now handle it as a host redirect. + */ + case ICMP_REDIR_HOST: + case ICMP_REDIR_HOSTTOS: + ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr, + icmp_hdr(skb)->un.gateway, + iph->saddr, skb->dev); + break; + } + + /* Ping wants to see redirects. + * Let's pretend they are errors of sorts... */ + if (iph->protocol == IPPROTO_ICMP && + iph->ihl >= 5 && + pskb_may_pull(skb, (iph->ihl<<2)+8)) { + ping_err(skb, icmp_hdr(skb)->un.gateway); + } + +out: + return; +out_err: + ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); + goto out; +} + +/* + * Handle ICMP_ECHO ("ping") requests. + * + * RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo + * requests. + * RFC 1122: 3.2.2.6 Data received in the ICMP_ECHO request MUST be + * included in the reply. + * RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring + * echo requests, MUST have default=NOT. + * See also WRT handling of options once they are done and working. + */ + +static void icmp_echo(struct sk_buff *skb) +{ + struct net *net; + + net = dev_net(skb_dst(skb)->dev); + if (!net->ipv4.sysctl_icmp_echo_ignore_all) { + struct icmp_bxm icmp_param; + + icmp_param.data.icmph = *icmp_hdr(skb); + icmp_param.data.icmph.type = ICMP_ECHOREPLY; + icmp_param.skb = skb; + icmp_param.offset = 0; + icmp_param.data_len = skb->len; + icmp_param.head_len = sizeof(struct icmphdr); + icmp_reply(&icmp_param, skb); + } +} + +/* + * Handle ICMP Timestamp requests. + * RFC 1122: 3.2.2.8 MAY implement ICMP timestamp requests. + * SHOULD be in the kernel for minimum random latency. + * MUST be accurate to a few minutes. + * MUST be updated at least at 15Hz. 
+ */ +static void icmp_timestamp(struct sk_buff *skb) +{ + struct timespec tv; + struct icmp_bxm icmp_param; + /* + * Too short. + */ + if (skb->len < 4) + goto out_err; + + /* + * Fill in the current time as ms since midnight UT: + */ + getnstimeofday(&tv); + icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + + tv.tv_nsec / NSEC_PER_MSEC); + icmp_param.data.times[2] = icmp_param.data.times[1]; + if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4)) + BUG(); + icmp_param.data.icmph = *icmp_hdr(skb); + icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY; + icmp_param.data.icmph.code = 0; + icmp_param.skb = skb; + icmp_param.offset = 0; + icmp_param.data_len = 0; + icmp_param.head_len = sizeof(struct icmphdr) + 12; + icmp_reply(&icmp_param, skb); +out: + return; +out_err: + ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS); + goto out; +} + + +/* + * Handle ICMP_ADDRESS_MASK requests. (RFC950) + * + * RFC1122 (3.2.2.9). A host MUST only send replies to + * ADDRESS_MASK requests if it's been configured as an address mask + * agent. Receiving a request doesn't constitute implicit permission to + * act as one. Of course, implementing this correctly requires (SHOULD) + * a way to turn the functionality on and off. Another one for sysctl(), + * I guess. -- MS + * + * RFC1812 (4.3.3.9). A router MUST implement it. + * A router SHOULD have switch turning it on/off. + * This switch MUST be ON by default. + * + * Gratuitous replies, zero-source replies are not implemented, + * that complies with RFC. DO NOT implement them!!! All the idea + * of broadcast addrmask replies as specified in RFC950 is broken. + * The problem is that it is not uncommon to have several prefixes + * on one physical interface. Moreover, addrmask agent can even be + * not aware of existing another prefixes. + * If source is zero, addrmask agent cannot choose correct prefix. + * Gratuitous mask announcements suffer from the same problem. + * RFC1812 explains it, but still allows to use ADDRMASK, + * that is pretty silly. --ANK + * + * All these rules are so bizarre, that I removed kernel addrmask + * support at all. It is wrong, it is obsolete, nobody uses it in + * any case. --ANK + * + * Furthermore you can do it with a usermode address agent program + * anyway... + */ + +static void icmp_address(struct sk_buff *skb) +{ +#if 0 + if (net_ratelimit()) + printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n"); +#endif +} + +/* + * RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain + * loudly if an inconsistency is found. + * called with rcu_read_lock() + */ + +static void icmp_address_reply(struct sk_buff *skb) +{ + struct rtable *rt = skb_rtable(skb); + struct net_device *dev = skb->dev; + struct in_device *in_dev; + struct in_ifaddr *ifa; + + if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC)) + return; + + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + return; + + if (in_dev->ifa_list && + IN_DEV_LOG_MARTIANS(in_dev) && + IN_DEV_FORWARD(in_dev)) { + __be32 _mask, *mp; + + mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask); + BUG_ON(mp == NULL); + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + if (*mp == ifa->ifa_mask && + inet_ifa_match(ip_hdr(skb)->saddr, ifa)) + break; + } + if (!ifa && net_ratelimit()) { + printk(KERN_INFO "Wrong address mask %pI4 from %s/%pI4\n", + mp, dev->name, &ip_hdr(skb)->saddr); + } + } +} + +static void icmp_discard(struct sk_buff *skb) +{ +} + +/* + * Deal with incoming ICMP packets. 
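[Editorial sketch, not part of the patch] The timestamp reply fields are defined as milliseconds since midnight UT. A small sketch of the conversion icmp_timestamp() performs before the values are byte-swapped with htonl(); the helper name is illustrative.

/* MSEC_PER_SEC == 1000 and NSEC_PER_MSEC == 1000000 in the kernel headers. */
static unsigned int example_ms_since_midnight(long sec, long nsec)
{
	return (sec % 86400) * 1000 + nsec / 1000000;
}
/* e.g. 12:34:56.500 UT -> sec % 86400 == 45296, giving 45296500 ms */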
+ */ +int icmp_rcv(struct sk_buff *skb) +{ + struct icmphdr *icmph; + struct rtable *rt = skb_rtable(skb); + struct net *net = dev_net(rt->dst.dev); + + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + struct sec_path *sp = skb_sec_path(skb); + int nh; + + if (!(sp && sp->xvec[sp->len - 1]->props.flags & + XFRM_STATE_ICMP)) + goto drop; + + if (!pskb_may_pull(skb, sizeof(*icmph) + sizeof(struct iphdr))) + goto drop; + + nh = skb_network_offset(skb); + skb_set_network_header(skb, sizeof(*icmph)); + + if (!xfrm4_policy_check_reverse(NULL, XFRM_POLICY_IN, skb)) + goto drop; + + skb_set_network_header(skb, nh); + } + + ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS); + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (!csum_fold(skb->csum)) + break; + /* fall through */ + case CHECKSUM_NONE: + skb->csum = 0; + if (__skb_checksum_complete(skb)) + goto error; + } + + if (!pskb_pull(skb, sizeof(*icmph))) + goto error; + + icmph = icmp_hdr(skb); + + ICMPMSGIN_INC_STATS_BH(net, icmph->type); + /* + * 18 is the highest 'known' ICMP type. Anything else is a mystery + * + * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently + * discarded. + */ + if (icmph->type > NR_ICMP_TYPES) + goto error; + + + /* + * Parse the ICMP message + */ + + if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { + /* + * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be + * silently ignored (we let user decide with a sysctl). + * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently + * discarded if to broadcast/multicast. + */ + if ((icmph->type == ICMP_ECHO || + icmph->type == ICMP_TIMESTAMP) && + net->ipv4.sysctl_icmp_echo_ignore_broadcasts) { + goto error; + } + if (icmph->type != ICMP_ECHO && + icmph->type != ICMP_TIMESTAMP && + icmph->type != ICMP_ADDRESS && + icmph->type != ICMP_ADDRESSREPLY) { + goto error; + } + } + + icmp_pointers[icmph->type].handler(skb); + +drop: + kfree_skb(skb); + return 0; +error: + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + goto drop; +} + +/* + * This table is the definition of how we handle ICMP. 
+ */ +static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { + [ICMP_ECHOREPLY] = { + .handler = ping_rcv, + }, + [1] = { + .handler = icmp_discard, + .error = 1, + }, + [2] = { + .handler = icmp_discard, + .error = 1, + }, + [ICMP_DEST_UNREACH] = { + .handler = icmp_unreach, + .error = 1, + }, + [ICMP_SOURCE_QUENCH] = { + .handler = icmp_unreach, + .error = 1, + }, + [ICMP_REDIRECT] = { + .handler = icmp_redirect, + .error = 1, + }, + [6] = { + .handler = icmp_discard, + .error = 1, + }, + [7] = { + .handler = icmp_discard, + .error = 1, + }, + [ICMP_ECHO] = { + .handler = icmp_echo, + }, + [9] = { + .handler = icmp_discard, + .error = 1, + }, + [10] = { + .handler = icmp_discard, + .error = 1, + }, + [ICMP_TIME_EXCEEDED] = { + .handler = icmp_unreach, + .error = 1, + }, + [ICMP_PARAMETERPROB] = { + .handler = icmp_unreach, + .error = 1, + }, + [ICMP_TIMESTAMP] = { + .handler = icmp_timestamp, + }, + [ICMP_TIMESTAMPREPLY] = { + .handler = icmp_discard, + }, + [ICMP_INFO_REQUEST] = { + .handler = icmp_discard, + }, + [ICMP_INFO_REPLY] = { + .handler = icmp_discard, + }, + [ICMP_ADDRESS] = { + .handler = icmp_address, + }, + [ICMP_ADDRESSREPLY] = { + .handler = icmp_address_reply, + }, +}; + +static void __net_exit icmp_sk_exit(struct net *net) +{ + int i; + + for_each_possible_cpu(i) + inet_ctl_sock_destroy(net->ipv4.icmp_sk[i]); + kfree(net->ipv4.icmp_sk); + net->ipv4.icmp_sk = NULL; +} + +static int __net_init icmp_sk_init(struct net *net) +{ + int i, err; + + net->ipv4.icmp_sk = + kzalloc(nr_cpu_ids * sizeof(struct sock *), GFP_KERNEL); + if (net->ipv4.icmp_sk == NULL) + return -ENOMEM; + + for_each_possible_cpu(i) { + struct sock *sk; + + err = inet_ctl_sock_create(&sk, PF_INET, + SOCK_RAW, IPPROTO_ICMP, net); + if (err < 0) + goto fail; + + net->ipv4.icmp_sk[i] = sk; + + /* Enough space for 2 64K ICMP packets, including + * sk_buff struct overhead. + */ + sk->sk_sndbuf = + (2 * ((64 * 1024) + sizeof(struct sk_buff))); + + /* + * Speedup sock_wfree() + */ + sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT; + } + + /* Control parameters for ECHO replies. */ + net->ipv4.sysctl_icmp_echo_ignore_all = 0; + net->ipv4.sysctl_icmp_echo_ignore_broadcasts = 1; + + /* Control parameter - ignore bogus broadcast responses? */ + net->ipv4.sysctl_icmp_ignore_bogus_error_responses = 1; + + /* + * Configurable global rate limit. + * + * ratelimit defines tokens/packet consumed for dst->rate_token + * bucket ratemask defines which icmp types are ratelimited by + * setting it's bit position. + * + * default: + * dest unreachable (3), source quench (4), + * time exceeded (11), parameter problem (12) + */ + + net->ipv4.sysctl_icmp_ratelimit = 1 * HZ; + net->ipv4.sysctl_icmp_ratemask = 0x1818; + net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0; + + return 0; + +fail: + for_each_possible_cpu(i) + inet_ctl_sock_destroy(net->ipv4.icmp_sk[i]); + kfree(net->ipv4.icmp_sk); + return err; +} + +static struct pernet_operations __net_initdata icmp_sk_ops = { + .init = icmp_sk_init, + .exit = icmp_sk_exit, +}; + +int __init icmp_init(void) +{ + return register_pernet_subsys(&icmp_sk_ops); +} diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c new file mode 100644 index 00000000..e0d42dbb --- /dev/null +++ b/net/ipv4/igmp.c @@ -0,0 +1,2661 @@ +/* + * Linux NET3: Internet Group Management Protocol [IGMP] + * + * This code implements the IGMP protocol as defined in RFC1112. There has + * been a further revision of this protocol since which is now supported. 
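[Editorial sketch, not part of the patch] The default sysctl_icmp_ratemask written by icmp_sk_init() selects which ICMP types are subject to the token-bucket limit, one bit per type number. The decomposition of 0x1818 is sketched below using the standard type constants from linux/icmp.h; the macro name is illustrative.

#include <linux/icmp.h>

/* Each set bit selects an ICMP type for rate limiting in icmpv4_xrlim_allow(). */
#define EXAMPLE_RATEMASK					\
	((1U << ICMP_DEST_UNREACH)  |	/* type  3 -> 0x0008 */	\
	 (1U << ICMP_SOURCE_QUENCH) |	/* type  4 -> 0x0010 */	\
	 (1U << ICMP_TIME_EXCEEDED) |	/* type 11 -> 0x0800 */	\
	 (1U << ICMP_PARAMETERPROB))	/* type 12 -> 0x1000 */
/* EXAMPLE_RATEMASK == 0x1818, the default stored in sysctl_icmp_ratemask. */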
+ * + * If you have trouble with this module be careful what gcc you have used, + * the older version didn't come out right using gcc 2.5.8, the newer one + * seems to fall out with gcc 2.6.2. + * + * Authors: + * Alan Cox + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * + * Alan Cox : Added lots of __inline__ to optimise + * the memory usage of all the tiny little + * functions. + * Alan Cox : Dumped the header building experiment. + * Alan Cox : Minor tweaks ready for multicast routing + * and extended IGMP protocol. + * Alan Cox : Removed a load of inline directives. Gcc 2.5.8 + * writes utterly bogus code otherwise (sigh) + * fixed IGMP loopback to behave in the manner + * desired by mrouted, fixed the fact it has been + * broken since 1.3.6 and cleaned up a few minor + * points. + * + * Chih-Jen Chang : Tried to revise IGMP to Version 2 + * Tsu-Sheng Tsao E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu + * The enhancements are mainly based on Steve Deering's + * ipmulti-3.5 source code. + * Chih-Jen Chang : Added the igmp_get_mrouter_info and + * Tsu-Sheng Tsao igmp_set_mrouter_info to keep track of + * the mrouted version on that device. + * Chih-Jen Chang : Added the max_resp_time parameter to + * Tsu-Sheng Tsao igmp_heard_query(). Using this parameter + * to identify the multicast router version + * and do what the IGMP version 2 specified. + * Chih-Jen Chang : Added a timer to revert to IGMP V2 router + * Tsu-Sheng Tsao if the specified time expired. + * Alan Cox : Stop IGMP from 0.0.0.0 being accepted. + * Alan Cox : Use GFP_ATOMIC in the right places. + * Christian Daudt : igmp timer wasn't set for local group + * memberships but was being deleted, + * which caused a "del_timer() called + * from %p with timer not initialized\n" + * message (960131). + * Christian Daudt : removed del_timer from + * igmp_timer_expire function (960205). + * Christian Daudt : igmp_heard_report now only calls + * igmp_timer_expire if tm->running is + * true (960216). + * Malcolm Beattie : ttl comparison wrong in igmp_rcv made + * igmp_heard_query never trigger. Expiry + * miscalculation fixed in igmp_heard_query + * and random() made to return unsigned to + * prevent negative expiry times. + * Alexey Kuznetsov: Wrong group leaving behaviour, backport + * fix from pending 2.1.x patches. + * Alan Cox: Forget to enable FDDI support earlier. + * Alexey Kuznetsov: Fixed leaving groups on device down. + * Alexey Kuznetsov: Accordance to igmp-v2-06 draft. 
+ * David L Stevens: IGMPv3 support, with help from + * Vinay Kulkarni + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_IP_MROUTE +#include +#endif +#ifdef CONFIG_PROC_FS +#include +#include +#endif + +#define IP_MAX_MEMBERSHIPS 20 +#define IP_MAX_MSF 10 + +#ifdef CONFIG_IP_MULTICAST +/* Parameter names and values are taken from igmp-v2-06 draft */ + +#define IGMP_V1_Router_Present_Timeout (400*HZ) +#define IGMP_V2_Router_Present_Timeout (400*HZ) +#define IGMP_Unsolicited_Report_Interval (10*HZ) +#define IGMP_Query_Response_Interval (10*HZ) +#define IGMP_Unsolicited_Report_Count 2 + + +#define IGMP_Initial_Report_Delay (1) + +/* IGMP_Initial_Report_Delay is not from IGMP specs! + * IGMP specs require to report membership immediately after + * joining a group, but we delay the first report by a + * small interval. It seems more natural and still does not + * contradict to specs provided this delay is small enough. + */ + +#define IGMP_V1_SEEN(in_dev) \ + (IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \ + IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \ + ((in_dev)->mr_v1_seen && \ + time_before(jiffies, (in_dev)->mr_v1_seen))) +#define IGMP_V2_SEEN(in_dev) \ + (IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \ + IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \ + ((in_dev)->mr_v2_seen && \ + time_before(jiffies, (in_dev)->mr_v2_seen))) + +static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im); +static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr); +static void igmpv3_clear_delrec(struct in_device *in_dev); +static int sf_setstate(struct ip_mc_list *pmc); +static void sf_markstate(struct ip_mc_list *pmc); +#endif +static void ip_mc_clear_src(struct ip_mc_list *pmc); +static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, + int sfcount, __be32 *psfsrc, int delta); + +static void ip_ma_put(struct ip_mc_list *im) +{ + if (atomic_dec_and_test(&im->refcnt)) { + in_dev_put(im->interface); + kfree_rcu(im, rcu); + } +} + +#define for_each_pmc_rcu(in_dev, pmc) \ + for (pmc = rcu_dereference(in_dev->mc_list); \ + pmc != NULL; \ + pmc = rcu_dereference(pmc->next_rcu)) + +#define for_each_pmc_rtnl(in_dev, pmc) \ + for (pmc = rtnl_dereference(in_dev->mc_list); \ + pmc != NULL; \ + pmc = rtnl_dereference(pmc->next_rcu)) + +#ifdef CONFIG_IP_MULTICAST + +/* + * Timer management + */ + +static void igmp_stop_timer(struct ip_mc_list *im) +{ + spin_lock_bh(&im->lock); + if (del_timer(&im->timer)) + atomic_dec(&im->refcnt); + im->tm_running = 0; + im->reporter = 0; + im->unsolicit_count = 0; + spin_unlock_bh(&im->lock); +} + +/* It must be called with locked im->lock */ +static void igmp_start_timer(struct ip_mc_list *im, int max_delay) +{ + int tv = net_random() % max_delay; + + im->tm_running = 1; + if (!mod_timer(&im->timer, jiffies+tv+2)) + atomic_inc(&im->refcnt); +} + +static void igmp_gq_start_timer(struct in_device *in_dev) +{ + int tv = net_random() % in_dev->mr_maxdelay; + + in_dev->mr_gq_running = 1; + if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2)) + in_dev_hold(in_dev); +} + +static void igmp_ifc_start_timer(struct in_device *in_dev, int delay) +{ + int tv = net_random() % delay; + + if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2)) 
+ in_dev_hold(in_dev); +} + +static void igmp_mod_timer(struct ip_mc_list *im, int max_delay) +{ + spin_lock_bh(&im->lock); + im->unsolicit_count = 0; + if (del_timer(&im->timer)) { + if ((long)(im->timer.expires-jiffies) < max_delay) { + add_timer(&im->timer); + im->tm_running = 1; + spin_unlock_bh(&im->lock); + return; + } + atomic_dec(&im->refcnt); + } + igmp_start_timer(im, max_delay); + spin_unlock_bh(&im->lock); +} + + +/* + * Send an IGMP report. + */ + +#define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4) + + +static int is_in(struct ip_mc_list *pmc, struct ip_sf_list *psf, int type, + int gdeleted, int sdeleted) +{ + switch (type) { + case IGMPV3_MODE_IS_INCLUDE: + case IGMPV3_MODE_IS_EXCLUDE: + if (gdeleted || sdeleted) + return 0; + if (!(pmc->gsquery && !psf->sf_gsresp)) { + if (pmc->sfmode == MCAST_INCLUDE) + return 1; + /* don't include if this source is excluded + * in all filters + */ + if (psf->sf_count[MCAST_INCLUDE]) + return type == IGMPV3_MODE_IS_INCLUDE; + return pmc->sfcount[MCAST_EXCLUDE] == + psf->sf_count[MCAST_EXCLUDE]; + } + return 0; + case IGMPV3_CHANGE_TO_INCLUDE: + if (gdeleted || sdeleted) + return 0; + return psf->sf_count[MCAST_INCLUDE] != 0; + case IGMPV3_CHANGE_TO_EXCLUDE: + if (gdeleted || sdeleted) + return 0; + if (pmc->sfcount[MCAST_EXCLUDE] == 0 || + psf->sf_count[MCAST_INCLUDE]) + return 0; + return pmc->sfcount[MCAST_EXCLUDE] == + psf->sf_count[MCAST_EXCLUDE]; + case IGMPV3_ALLOW_NEW_SOURCES: + if (gdeleted || !psf->sf_crcount) + return 0; + return (pmc->sfmode == MCAST_INCLUDE) ^ sdeleted; + case IGMPV3_BLOCK_OLD_SOURCES: + if (pmc->sfmode == MCAST_INCLUDE) + return gdeleted || (psf->sf_crcount && sdeleted); + return psf->sf_crcount && !gdeleted && !sdeleted; + } + return 0; +} + +static int +igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) +{ + struct ip_sf_list *psf; + int scount = 0; + + for (psf=pmc->sources; psf; psf=psf->sf_next) { + if (!is_in(pmc, psf, type, gdeleted, sdeleted)) + continue; + scount++; + } + return scount; +} + +#define igmp_skb_size(skb) (*(unsigned int *)((skb)->cb)) + +static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) +{ + struct sk_buff *skb; + struct rtable *rt; + struct iphdr *pip; + struct igmpv3_report *pig; + struct net *net = dev_net(dev); + struct flowi4 fl4; + + while (1) { + skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev), + GFP_ATOMIC | __GFP_NOWARN); + if (skb) + break; + size >>= 1; + if (size < 256) + return NULL; + } + igmp_skb_size(skb) = size; + + rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, + 0, 0, + IPPROTO_IGMP, 0, dev->ifindex); + if (IS_ERR(rt)) { + kfree_skb(skb); + return NULL; + } + + skb_dst_set(skb, &rt->dst); + skb->dev = dev; + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + + skb_reset_network_header(skb); + pip = ip_hdr(skb); + skb_put(skb, sizeof(struct iphdr) + 4); + + pip->version = 4; + pip->ihl = (sizeof(struct iphdr)+4)>>2; + pip->tos = 0xc0; + pip->frag_off = htons(IP_DF); + pip->ttl = 1; + pip->daddr = fl4.daddr; + pip->saddr = fl4.saddr; + pip->protocol = IPPROTO_IGMP; + pip->tot_len = 0; /* filled in later */ + ip_select_ident(pip, &rt->dst, NULL); + ((u8*)&pip[1])[0] = IPOPT_RA; + ((u8*)&pip[1])[1] = 4; + ((u8*)&pip[1])[2] = 0; + ((u8*)&pip[1])[3] = 0; + + skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4; + skb_put(skb, sizeof(*pig)); + pig = igmpv3_report_hdr(skb); + pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT; + pig->resv1 = 0; + pig->csum = 0; + pig->resv2 = 0; + pig->ngrec = 0; + 
return skb; +} + +static int igmpv3_sendpack(struct sk_buff *skb) +{ + struct igmphdr *pig = igmp_hdr(skb); + const int igmplen = skb->tail - skb->transport_header; + + pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen); + + return ip_local_out(skb); +} + +static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) +{ + return sizeof(struct igmpv3_grec) + 4*igmp_scount(pmc, type, gdel, sdel); +} + +static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, + int type, struct igmpv3_grec **ppgr) +{ + struct net_device *dev = pmc->interface->dev; + struct igmpv3_report *pih; + struct igmpv3_grec *pgr; + + if (!skb) + skb = igmpv3_newpack(dev, dev->mtu); + if (!skb) + return NULL; + pgr = (struct igmpv3_grec *)skb_put(skb, sizeof(struct igmpv3_grec)); + pgr->grec_type = type; + pgr->grec_auxwords = 0; + pgr->grec_nsrcs = 0; + pgr->grec_mca = pmc->multiaddr; + pih = igmpv3_report_hdr(skb); + pih->ngrec = htons(ntohs(pih->ngrec)+1); + *ppgr = pgr; + return skb; +} + +#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? igmp_skb_size(skb) - (skb)->len : \ + skb_tailroom(skb)) : 0) + +static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, + int type, int gdeleted, int sdeleted) +{ + struct net_device *dev = pmc->interface->dev; + struct igmpv3_report *pih; + struct igmpv3_grec *pgr = NULL; + struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list; + int scount, stotal, first, isquery, truncate; + + if (pmc->multiaddr == IGMP_ALL_HOSTS) + return skb; + + isquery = type == IGMPV3_MODE_IS_INCLUDE || + type == IGMPV3_MODE_IS_EXCLUDE; + truncate = type == IGMPV3_MODE_IS_EXCLUDE || + type == IGMPV3_CHANGE_TO_EXCLUDE; + + stotal = scount = 0; + + psf_list = sdeleted ? &pmc->tomb : &pmc->sources; + + if (!*psf_list) + goto empty_source; + + pih = skb ? 
igmpv3_report_hdr(skb) : NULL; + + /* EX and TO_EX get a fresh packet, if needed */ + if (truncate) { + if (pih && pih->ngrec && + AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { + if (skb) + igmpv3_sendpack(skb); + skb = igmpv3_newpack(dev, dev->mtu); + } + } + first = 1; + psf_prev = NULL; + for (psf=*psf_list; psf; psf=psf_next) { + __be32 *psrc; + + psf_next = psf->sf_next; + + if (!is_in(pmc, psf, type, gdeleted, sdeleted)) { + psf_prev = psf; + continue; + } + + /* clear marks on query responses */ + if (isquery) + psf->sf_gsresp = 0; + + if (AVAILABLE(skb) < sizeof(__be32) + + first*sizeof(struct igmpv3_grec)) { + if (truncate && !first) + break; /* truncate these */ + if (pgr) + pgr->grec_nsrcs = htons(scount); + if (skb) + igmpv3_sendpack(skb); + skb = igmpv3_newpack(dev, dev->mtu); + first = 1; + scount = 0; + } + if (first) { + skb = add_grhead(skb, pmc, type, &pgr); + first = 0; + } + if (!skb) + return NULL; + psrc = (__be32 *)skb_put(skb, sizeof(__be32)); + *psrc = psf->sf_inaddr; + scount++; stotal++; + if ((type == IGMPV3_ALLOW_NEW_SOURCES || + type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) { + psf->sf_crcount--; + if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { + if (psf_prev) + psf_prev->sf_next = psf->sf_next; + else + *psf_list = psf->sf_next; + kfree(psf); + continue; + } + } + psf_prev = psf; + } + +empty_source: + if (!stotal) { + if (type == IGMPV3_ALLOW_NEW_SOURCES || + type == IGMPV3_BLOCK_OLD_SOURCES) + return skb; + if (pmc->crcount || isquery) { + /* make sure we have room for group header */ + if (skb && AVAILABLE(skb)grec_nsrcs = htons(scount); + + if (isquery) + pmc->gsquery = 0; /* clear query state on report */ + return skb; +} + +static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) +{ + struct sk_buff *skb = NULL; + int type; + + if (!pmc) { + rcu_read_lock(); + for_each_pmc_rcu(in_dev, pmc) { + if (pmc->multiaddr == IGMP_ALL_HOSTS) + continue; + spin_lock_bh(&pmc->lock); + if (pmc->sfcount[MCAST_EXCLUDE]) + type = IGMPV3_MODE_IS_EXCLUDE; + else + type = IGMPV3_MODE_IS_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0); + spin_unlock_bh(&pmc->lock); + } + rcu_read_unlock(); + } else { + spin_lock_bh(&pmc->lock); + if (pmc->sfcount[MCAST_EXCLUDE]) + type = IGMPV3_MODE_IS_EXCLUDE; + else + type = IGMPV3_MODE_IS_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0); + spin_unlock_bh(&pmc->lock); + } + if (!skb) + return 0; + return igmpv3_sendpack(skb); +} + +/* + * remove zero-count source records from a source filter list + */ +static void igmpv3_clear_zeros(struct ip_sf_list **ppsf) +{ + struct ip_sf_list *psf_prev, *psf_next, *psf; + + psf_prev = NULL; + for (psf=*ppsf; psf; psf = psf_next) { + psf_next = psf->sf_next; + if (psf->sf_crcount == 0) { + if (psf_prev) + psf_prev->sf_next = psf->sf_next; + else + *ppsf = psf->sf_next; + kfree(psf); + } else + psf_prev = psf; + } +} + +static void igmpv3_send_cr(struct in_device *in_dev) +{ + struct ip_mc_list *pmc, *pmc_prev, *pmc_next; + struct sk_buff *skb = NULL; + int type, dtype; + + rcu_read_lock(); + spin_lock_bh(&in_dev->mc_tomb_lock); + + /* deleted MCA's */ + pmc_prev = NULL; + for (pmc=in_dev->mc_tomb; pmc; pmc=pmc_next) { + pmc_next = pmc->next; + if (pmc->sfmode == MCAST_INCLUDE) { + type = IGMPV3_BLOCK_OLD_SOURCES; + dtype = IGMPV3_BLOCK_OLD_SOURCES; + skb = add_grec(skb, pmc, type, 1, 0); + skb = add_grec(skb, pmc, dtype, 1, 1); + } + if (pmc->crcount) { + if (pmc->sfmode == MCAST_EXCLUDE) { + type = IGMPV3_CHANGE_TO_INCLUDE; + skb = add_grec(skb, 
pmc, type, 1, 0); + } + pmc->crcount--; + if (pmc->crcount == 0) { + igmpv3_clear_zeros(&pmc->tomb); + igmpv3_clear_zeros(&pmc->sources); + } + } + if (pmc->crcount == 0 && !pmc->tomb && !pmc->sources) { + if (pmc_prev) + pmc_prev->next = pmc_next; + else + in_dev->mc_tomb = pmc_next; + in_dev_put(pmc->interface); + kfree(pmc); + } else + pmc_prev = pmc; + } + spin_unlock_bh(&in_dev->mc_tomb_lock); + + /* change recs */ + for_each_pmc_rcu(in_dev, pmc) { + spin_lock_bh(&pmc->lock); + if (pmc->sfcount[MCAST_EXCLUDE]) { + type = IGMPV3_BLOCK_OLD_SOURCES; + dtype = IGMPV3_ALLOW_NEW_SOURCES; + } else { + type = IGMPV3_ALLOW_NEW_SOURCES; + dtype = IGMPV3_BLOCK_OLD_SOURCES; + } + skb = add_grec(skb, pmc, type, 0, 0); + skb = add_grec(skb, pmc, dtype, 0, 1); /* deleted sources */ + + /* filter mode changes */ + if (pmc->crcount) { + if (pmc->sfmode == MCAST_EXCLUDE) + type = IGMPV3_CHANGE_TO_EXCLUDE; + else + type = IGMPV3_CHANGE_TO_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0); + pmc->crcount--; + } + spin_unlock_bh(&pmc->lock); + } + rcu_read_unlock(); + + if (!skb) + return; + (void) igmpv3_sendpack(skb); +} + +static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, + int type) +{ + struct sk_buff *skb; + struct iphdr *iph; + struct igmphdr *ih; + struct rtable *rt; + struct net_device *dev = in_dev->dev; + struct net *net = dev_net(dev); + __be32 group = pmc ? pmc->multiaddr : 0; + struct flowi4 fl4; + __be32 dst; + + if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) + return igmpv3_send_report(in_dev, pmc); + else if (type == IGMP_HOST_LEAVE_MESSAGE) + dst = IGMP_ALL_ROUTER; + else + dst = group; + + rt = ip_route_output_ports(net, &fl4, NULL, dst, 0, + 0, 0, + IPPROTO_IGMP, 0, dev->ifindex); + if (IS_ERR(rt)) + return -1; + + skb = alloc_skb(IGMP_SIZE+LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); + if (skb == NULL) { + ip_rt_put(rt); + return -1; + } + + skb_dst_set(skb, &rt->dst); + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + + skb_reset_network_header(skb); + iph = ip_hdr(skb); + skb_put(skb, sizeof(struct iphdr) + 4); + + iph->version = 4; + iph->ihl = (sizeof(struct iphdr)+4)>>2; + iph->tos = 0xc0; + iph->frag_off = htons(IP_DF); + iph->ttl = 1; + iph->daddr = dst; + iph->saddr = fl4.saddr; + iph->protocol = IPPROTO_IGMP; + ip_select_ident(iph, &rt->dst, NULL); + ((u8*)&iph[1])[0] = IPOPT_RA; + ((u8*)&iph[1])[1] = 4; + ((u8*)&iph[1])[2] = 0; + ((u8*)&iph[1])[3] = 0; + + ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); + ih->type = type; + ih->code = 0; + ih->csum = 0; + ih->group = group; + ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr)); + + return ip_local_out(skb); +} + +static void igmp_gq_timer_expire(unsigned long data) +{ + struct in_device *in_dev = (struct in_device *)data; + + in_dev->mr_gq_running = 0; + igmpv3_send_report(in_dev, NULL); + __in_dev_put(in_dev); +} + +static void igmp_ifc_timer_expire(unsigned long data) +{ + struct in_device *in_dev = (struct in_device *)data; + + igmpv3_send_cr(in_dev); + if (in_dev->mr_ifc_count) { + in_dev->mr_ifc_count--; + igmp_ifc_start_timer(in_dev, IGMP_Unsolicited_Report_Interval); + } + __in_dev_put(in_dev); +} + +static void igmp_ifc_event(struct in_device *in_dev) +{ + if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) + return; + in_dev->mr_ifc_count = in_dev->mr_qrv ? 
in_dev->mr_qrv : + IGMP_Unsolicited_Report_Count; + igmp_ifc_start_timer(in_dev, 1); +} + + +static void igmp_timer_expire(unsigned long data) +{ + struct ip_mc_list *im=(struct ip_mc_list *)data; + struct in_device *in_dev = im->interface; + + spin_lock(&im->lock); + im->tm_running = 0; + + if (im->unsolicit_count) { + im->unsolicit_count--; + igmp_start_timer(im, IGMP_Unsolicited_Report_Interval); + } + im->reporter = 1; + spin_unlock(&im->lock); + + if (IGMP_V1_SEEN(in_dev)) + igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT); + else if (IGMP_V2_SEEN(in_dev)) + igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT); + else + igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT); + + ip_ma_put(im); +} + +/* mark EXCLUDE-mode sources */ +static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) +{ + struct ip_sf_list *psf; + int i, scount; + + scount = 0; + for (psf=pmc->sources; psf; psf=psf->sf_next) { + if (scount == nsrcs) + break; + for (i=0; isf_count[MCAST_INCLUDE] || + pmc->sfcount[MCAST_EXCLUDE] != + psf->sf_count[MCAST_EXCLUDE]) + continue; + if (srcs[i] == psf->sf_inaddr) { + scount++; + break; + } + } + } + pmc->gsquery = 0; + if (scount == nsrcs) /* all sources excluded */ + return 0; + return 1; +} + +static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) +{ + struct ip_sf_list *psf; + int i, scount; + + if (pmc->sfmode == MCAST_EXCLUDE) + return igmp_xmarksources(pmc, nsrcs, srcs); + + /* mark INCLUDE-mode sources */ + scount = 0; + for (psf=pmc->sources; psf; psf=psf->sf_next) { + if (scount == nsrcs) + break; + for (i=0; isf_inaddr) { + psf->sf_gsresp = 1; + scount++; + break; + } + } + if (!scount) { + pmc->gsquery = 0; + return 0; + } + pmc->gsquery = 1; + return 1; +} + +static void igmp_heard_report(struct in_device *in_dev, __be32 group) +{ + struct ip_mc_list *im; + + /* Timers are only set for non-local groups */ + + if (group == IGMP_ALL_HOSTS) + return; + + rcu_read_lock(); + for_each_pmc_rcu(in_dev, im) { + if (im->multiaddr == group) { + igmp_stop_timer(im); + break; + } + } + rcu_read_unlock(); +} + +static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, + int len) +{ + struct igmphdr *ih = igmp_hdr(skb); + struct igmpv3_query *ih3 = igmpv3_query_hdr(skb); + struct ip_mc_list *im; + __be32 group = ih->group; + int max_delay; + int mark = 0; + + + if (len == 8) { + if (ih->code == 0) { + /* Alas, old v1 router presents here. */ + + max_delay = IGMP_Query_Response_Interval; + in_dev->mr_v1_seen = jiffies + + IGMP_V1_Router_Present_Timeout; + group = 0; + } else { + /* v2 router present */ + max_delay = ih->code*(HZ/IGMP_TIMER_SCALE); + in_dev->mr_v2_seen = jiffies + + IGMP_V2_Router_Present_Timeout; + } + /* cancel the interface change timer */ + in_dev->mr_ifc_count = 0; + if (del_timer(&in_dev->mr_ifc_timer)) + __in_dev_put(in_dev); + /* clear deleted report items */ + igmpv3_clear_delrec(in_dev); + } else if (len < 12) { + return; /* ignore bogus packet; freed by caller */ + } else if (IGMP_V1_SEEN(in_dev)) { + /* This is a v3 query with v1 queriers present */ + max_delay = IGMP_Query_Response_Interval; + group = 0; + } else if (IGMP_V2_SEEN(in_dev)) { + /* this is a v3 query with v2 queriers present; + * Interpretation of the max_delay code is problematic here. + * A real v2 host would use ih_code directly, while v3 has a + * different encoding. We use the v3 encoding as more likely + * to be intended in a v3 query. 
+ */ + max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); + if (!max_delay) + max_delay = 1; /* can't mod w/ 0 */ + } else { /* v3 */ + if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) + return; + + ih3 = igmpv3_query_hdr(skb); + if (ih3->nsrcs) { + if (!pskb_may_pull(skb, sizeof(struct igmpv3_query) + + ntohs(ih3->nsrcs)*sizeof(__be32))) + return; + ih3 = igmpv3_query_hdr(skb); + } + + max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); + if (!max_delay) + max_delay = 1; /* can't mod w/ 0 */ + in_dev->mr_maxdelay = max_delay; + if (ih3->qrv) + in_dev->mr_qrv = ih3->qrv; + if (!group) { /* general query */ + if (ih3->nsrcs) + return; /* no sources allowed */ + igmp_gq_start_timer(in_dev); + return; + } + /* mark sources to include, if group & source-specific */ + mark = ih3->nsrcs != 0; + } + + /* + * - Start the timers in all of our membership records + * that the query applies to for the interface on + * which the query arrived excl. those that belong + * to a "local" group (224.0.0.X) + * - For timers already running check if they need to + * be reset. + * - Use the igmp->igmp_code field as the maximum + * delay possible + */ + rcu_read_lock(); + for_each_pmc_rcu(in_dev, im) { + int changed; + + if (group && group != im->multiaddr) + continue; + if (im->multiaddr == IGMP_ALL_HOSTS) + continue; + spin_lock_bh(&im->lock); + if (im->tm_running) + im->gsquery = im->gsquery && mark; + else + im->gsquery = mark; + changed = !im->gsquery || + igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs); + spin_unlock_bh(&im->lock); + if (changed) + igmp_mod_timer(im, max_delay); + } + rcu_read_unlock(); +} + +/* called in rcu_read_lock() section */ +int igmp_rcv(struct sk_buff *skb) +{ + /* This basically follows the spec line by line -- see RFC1112 */ + struct igmphdr *ih; + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + int len = skb->len; + + if (in_dev == NULL) + goto drop; + + if (!pskb_may_pull(skb, sizeof(struct igmphdr))) + goto drop; + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (!csum_fold(skb->csum)) + break; + /* fall through */ + case CHECKSUM_NONE: + skb->csum = 0; + if (__skb_checksum_complete(skb)) + goto drop; + } + + ih = igmp_hdr(skb); + switch (ih->type) { + case IGMP_HOST_MEMBERSHIP_QUERY: + igmp_heard_query(in_dev, skb, len); + break; + case IGMP_HOST_MEMBERSHIP_REPORT: + case IGMPV2_HOST_MEMBERSHIP_REPORT: + /* Is it our report looped back? */ + if (rt_is_output_route(skb_rtable(skb))) + break; + /* don't rely on MC router hearing unicast reports */ + if (skb->pkt_type == PACKET_MULTICAST || + skb->pkt_type == PACKET_BROADCAST) + igmp_heard_report(in_dev, ih->group); + break; + case IGMP_PIM: +#ifdef CONFIG_IP_PIMSM_V1 + return pim_rcv_v1(skb); +#endif + case IGMPV3_HOST_MEMBERSHIP_REPORT: + case IGMP_DVMRP: + case IGMP_TRACE: + case IGMP_HOST_LEAVE_MESSAGE: + case IGMP_MTRACE: + case IGMP_MTRACE_RESP: + break; + default: + break; + } + +drop: + kfree_skb(skb); + return 0; +} + +#endif + + +/* + * Add a filter to a device + */ + +static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr) +{ + char buf[MAX_ADDR_LEN]; + struct net_device *dev = in_dev->dev; + + /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG. + We will get multicast token leakage, when IFF_MULTICAST + is changed. This check should be done in dev->set_multicast_list + routine. 
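[Editorial sketch, not part of the patch] igmp_heard_query() converts the IGMPv3 Max Resp Code through IGMPV3_MRC() before scaling by HZ/IGMP_TIMER_SCALE. The decoding defined by RFC 3376 section 4.1.1 (units of 1/10 second) is sketched below; this is an illustrative helper, not the kernel macro itself.

/* Values below 128 are literal; larger values are a 4-bit mantissa with an
 * implicit leading 1 and a 3-bit exponent biased by 3. */
static unsigned int example_igmpv3_max_resp(unsigned char code)
{
	if (code < 128)
		return code;
	return ((code & 0x0f) | 0x10) << (((code >> 4) & 0x07) + 3);
}
/* e.g. code 0x8f -> 0x1f << 3 == 248 tenths of a second, i.e. 24.8 s */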
Something sort of: + if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; } + --ANK + */ + if (arp_mc_map(addr, buf, dev, 0) == 0) + dev_mc_add(dev, buf); +} + +/* + * Remove a filter from a device + */ + +static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr) +{ + char buf[MAX_ADDR_LEN]; + struct net_device *dev = in_dev->dev; + + if (arp_mc_map(addr, buf, dev, 0) == 0) + dev_mc_del(dev, buf); +} + +#ifdef CONFIG_IP_MULTICAST +/* + * deleted ip_mc_list manipulation + */ +static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) +{ + struct ip_mc_list *pmc; + + /* this is an "ip_mc_list" for convenience; only the fields below + * are actually used. In particular, the refcnt and users are not + * used for management of the delete list. Using the same structure + * for deleted items allows change reports to use common code with + * non-deleted or query-response MCA's. + */ + pmc = kzalloc(sizeof(*pmc), GFP_KERNEL); + if (!pmc) + return; + spin_lock_bh(&im->lock); + pmc->interface = im->interface; + in_dev_hold(in_dev); + pmc->multiaddr = im->multiaddr; + pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : + IGMP_Unsolicited_Report_Count; + pmc->sfmode = im->sfmode; + if (pmc->sfmode == MCAST_INCLUDE) { + struct ip_sf_list *psf; + + pmc->tomb = im->tomb; + pmc->sources = im->sources; + im->tomb = im->sources = NULL; + for (psf=pmc->sources; psf; psf=psf->sf_next) + psf->sf_crcount = pmc->crcount; + } + spin_unlock_bh(&im->lock); + + spin_lock_bh(&in_dev->mc_tomb_lock); + pmc->next = in_dev->mc_tomb; + in_dev->mc_tomb = pmc; + spin_unlock_bh(&in_dev->mc_tomb_lock); +} + +static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr) +{ + struct ip_mc_list *pmc, *pmc_prev; + struct ip_sf_list *psf, *psf_next; + + spin_lock_bh(&in_dev->mc_tomb_lock); + pmc_prev = NULL; + for (pmc=in_dev->mc_tomb; pmc; pmc=pmc->next) { + if (pmc->multiaddr == multiaddr) + break; + pmc_prev = pmc; + } + if (pmc) { + if (pmc_prev) + pmc_prev->next = pmc->next; + else + in_dev->mc_tomb = pmc->next; + } + spin_unlock_bh(&in_dev->mc_tomb_lock); + if (pmc) { + for (psf=pmc->tomb; psf; psf=psf_next) { + psf_next = psf->sf_next; + kfree(psf); + } + in_dev_put(pmc->interface); + kfree(pmc); + } +} + +static void igmpv3_clear_delrec(struct in_device *in_dev) +{ + struct ip_mc_list *pmc, *nextpmc; + + spin_lock_bh(&in_dev->mc_tomb_lock); + pmc = in_dev->mc_tomb; + in_dev->mc_tomb = NULL; + spin_unlock_bh(&in_dev->mc_tomb_lock); + + for (; pmc; pmc = nextpmc) { + nextpmc = pmc->next; + ip_mc_clear_src(pmc); + in_dev_put(pmc->interface); + kfree(pmc); + } + /* clear dead sources, too */ + rcu_read_lock(); + for_each_pmc_rcu(in_dev, pmc) { + struct ip_sf_list *psf, *psf_next; + + spin_lock_bh(&pmc->lock); + psf = pmc->tomb; + pmc->tomb = NULL; + spin_unlock_bh(&pmc->lock); + for (; psf; psf=psf_next) { + psf_next = psf->sf_next; + kfree(psf); + } + } + rcu_read_unlock(); +} +#endif + +static void igmp_group_dropped(struct ip_mc_list *im) +{ + struct in_device *in_dev = im->interface; +#ifdef CONFIG_IP_MULTICAST + int reporter; +#endif + + if (im->loaded) { + im->loaded = 0; + ip_mc_filter_del(in_dev, im->multiaddr); + } + +#ifdef CONFIG_IP_MULTICAST + if (im->multiaddr == IGMP_ALL_HOSTS) + return; + + reporter = im->reporter; + igmp_stop_timer(im); + + if (!in_dev->dead) { + if (IGMP_V1_SEEN(in_dev)) + return; + if (IGMP_V2_SEEN(in_dev)) { + if (reporter) + igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE); + return; + } + /* IGMPv3 */ + igmpv3_add_delrec(in_dev, im); + + 
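[Editorial sketch, not part of the patch] ip_mc_filter_add() above relies on arp_mc_map() to turn the group address into a link-layer filter entry. For Ethernet that is the RFC 1112 section 6.4 mapping; a standalone sketch is shown below (plain C, not the kernel helper, with an illustrative name).

/* 01:00:5e prefix followed by the low 23 bits of the group address. */
static void example_ip_mc_map(unsigned int group /* host byte order */,
			      unsigned char mac[6])
{
	mac[0] = 0x01;
	mac[1] = 0x00;
	mac[2] = 0x5e;
	mac[3] = (group >> 16) & 0x7f;	/* only the low 23 bits survive */
	mac[4] = (group >> 8) & 0xff;
	mac[5] = group & 0xff;
}
/* e.g. 224.1.2.3 -> 01:00:5e:01:02:03 */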
igmp_ifc_event(in_dev); + } +#endif +} + +static void igmp_group_added(struct ip_mc_list *im) +{ + struct in_device *in_dev = im->interface; + + if (im->loaded == 0) { + im->loaded = 1; + ip_mc_filter_add(in_dev, im->multiaddr); + } + +#ifdef CONFIG_IP_MULTICAST + if (im->multiaddr == IGMP_ALL_HOSTS) + return; + + if (in_dev->dead) + return; + if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { + spin_lock_bh(&im->lock); + igmp_start_timer(im, IGMP_Initial_Report_Delay); + spin_unlock_bh(&im->lock); + return; + } + /* else, v3 */ + + im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : + IGMP_Unsolicited_Report_Count; + igmp_ifc_event(in_dev); +#endif +} + + +/* + * Multicast list managers + */ + + +/* + * A socket has joined a multicast group on device dev. + */ + +void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) +{ + struct ip_mc_list *im; + + ASSERT_RTNL(); + + for_each_pmc_rtnl(in_dev, im) { + if (im->multiaddr == addr) { + im->users++; + ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0); + goto out; + } + } + + im = kzalloc(sizeof(*im), GFP_KERNEL); + if (!im) + goto out; + + im->users = 1; + im->interface = in_dev; + in_dev_hold(in_dev); + im->multiaddr = addr; + /* initial mode is (EX, empty) */ + im->sfmode = MCAST_EXCLUDE; + im->sfcount[MCAST_EXCLUDE] = 1; + atomic_set(&im->refcnt, 1); + spin_lock_init(&im->lock); +#ifdef CONFIG_IP_MULTICAST + setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im); + im->unsolicit_count = IGMP_Unsolicited_Report_Count; +#endif + + im->next_rcu = in_dev->mc_list; + in_dev->mc_count++; + rcu_assign_pointer(in_dev->mc_list, im); + +#ifdef CONFIG_IP_MULTICAST + igmpv3_del_delrec(in_dev, im->multiaddr); +#endif + igmp_group_added(im); + if (!in_dev->dead) + ip_rt_multicast_event(in_dev); +out: + return; +} +EXPORT_SYMBOL(ip_mc_inc_group); + +/* + * Resend IGMP JOIN report; used for bonding. 
+ * Called with rcu_read_lock() + */ +void ip_mc_rejoin_groups(struct in_device *in_dev) +{ +#ifdef CONFIG_IP_MULTICAST + struct ip_mc_list *im; + int type; + + for_each_pmc_rcu(in_dev, im) { + if (im->multiaddr == IGMP_ALL_HOSTS) + continue; + + /* a failover is happening and switches + * must be notified immediately + */ + if (IGMP_V1_SEEN(in_dev)) + type = IGMP_HOST_MEMBERSHIP_REPORT; + else if (IGMP_V2_SEEN(in_dev)) + type = IGMPV2_HOST_MEMBERSHIP_REPORT; + else + type = IGMPV3_HOST_MEMBERSHIP_REPORT; + igmp_send_report(in_dev, im, type); + } +#endif +} +EXPORT_SYMBOL(ip_mc_rejoin_groups); + +/* + * A socket has left a multicast group on device dev + */ + +void ip_mc_dec_group(struct in_device *in_dev, __be32 addr) +{ + struct ip_mc_list *i; + struct ip_mc_list __rcu **ip; + + ASSERT_RTNL(); + + for (ip = &in_dev->mc_list; + (i = rtnl_dereference(*ip)) != NULL; + ip = &i->next_rcu) { + if (i->multiaddr == addr) { + if (--i->users == 0) { + *ip = i->next_rcu; + in_dev->mc_count--; + igmp_group_dropped(i); + ip_mc_clear_src(i); + + if (!in_dev->dead) + ip_rt_multicast_event(in_dev); + + ip_ma_put(i); + return; + } + break; + } + } +} +EXPORT_SYMBOL(ip_mc_dec_group); + +/* Device changing type */ + +void ip_mc_unmap(struct in_device *in_dev) +{ + struct ip_mc_list *pmc; + + ASSERT_RTNL(); + + for_each_pmc_rtnl(in_dev, pmc) + igmp_group_dropped(pmc); +} + +void ip_mc_remap(struct in_device *in_dev) +{ + struct ip_mc_list *pmc; + + ASSERT_RTNL(); + + for_each_pmc_rtnl(in_dev, pmc) + igmp_group_added(pmc); +} + +/* Device going down */ + +void ip_mc_down(struct in_device *in_dev) +{ + struct ip_mc_list *pmc; + + ASSERT_RTNL(); + + for_each_pmc_rtnl(in_dev, pmc) + igmp_group_dropped(pmc); + +#ifdef CONFIG_IP_MULTICAST + in_dev->mr_ifc_count = 0; + if (del_timer(&in_dev->mr_ifc_timer)) + __in_dev_put(in_dev); + in_dev->mr_gq_running = 0; + if (del_timer(&in_dev->mr_gq_timer)) + __in_dev_put(in_dev); + igmpv3_clear_delrec(in_dev); +#endif + + ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS); +} + +void ip_mc_init_dev(struct in_device *in_dev) +{ + ASSERT_RTNL(); + + in_dev->mc_tomb = NULL; +#ifdef CONFIG_IP_MULTICAST + in_dev->mr_gq_running = 0; + setup_timer(&in_dev->mr_gq_timer, igmp_gq_timer_expire, + (unsigned long)in_dev); + in_dev->mr_ifc_count = 0; + in_dev->mc_count = 0; + setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, + (unsigned long)in_dev); + in_dev->mr_qrv = IGMP_Unsolicited_Report_Count; +#endif + + spin_lock_init(&in_dev->mc_tomb_lock); +} + +/* Device going up */ + +void ip_mc_up(struct in_device *in_dev) +{ + struct ip_mc_list *pmc; + + ASSERT_RTNL(); + + ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); + + for_each_pmc_rtnl(in_dev, pmc) + igmp_group_added(pmc); +} + +/* + * Device is about to be destroyed: clean up. 
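+ * ip_mc_down() has already dropped every group and stopped the + * interface timers, so all that is left here is freeing the remaining + * ip_mc_list entries and their source filter lists.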
+ */ + +void ip_mc_destroy_dev(struct in_device *in_dev) +{ + struct ip_mc_list *i; + + ASSERT_RTNL(); + + /* Deactivate timers */ + ip_mc_down(in_dev); + + while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) { + in_dev->mc_list = i->next_rcu; + in_dev->mc_count--; + + /* We've dropped the groups in ip_mc_down already */ + ip_mc_clear_src(i); + ip_ma_put(i); + } +} + +/* RTNL is locked */ +static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) +{ + struct net_device *dev = NULL; + struct in_device *idev = NULL; + + if (imr->imr_ifindex) { + idev = inetdev_by_index(net, imr->imr_ifindex); + return idev; + } + if (imr->imr_address.s_addr) { + dev = __ip_dev_find(net, imr->imr_address.s_addr, false); + if (!dev) + return NULL; + } + + if (!dev) { + struct rtable *rt = ip_route_output(net, + imr->imr_multiaddr.s_addr, + 0, 0, 0); + if (!IS_ERR(rt)) { + dev = rt->dst.dev; + ip_rt_put(rt); + } + } + if (dev) { + imr->imr_ifindex = dev->ifindex; + idev = __in_dev_get_rtnl(dev); + } + return idev; +} + +/* + * Join a socket to a group + */ +int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS; +int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF; + + +static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, + __be32 *psfsrc) +{ + struct ip_sf_list *psf, *psf_prev; + int rv = 0; + + psf_prev = NULL; + for (psf=pmc->sources; psf; psf=psf->sf_next) { + if (psf->sf_inaddr == *psfsrc) + break; + psf_prev = psf; + } + if (!psf || psf->sf_count[sfmode] == 0) { + /* source filter not found, or count wrong => bug */ + return -ESRCH; + } + psf->sf_count[sfmode]--; + if (psf->sf_count[sfmode] == 0) { + ip_rt_multicast_event(pmc->interface); + } + if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { +#ifdef CONFIG_IP_MULTICAST + struct in_device *in_dev = pmc->interface; +#endif + + /* no more filters for this source */ + if (psf_prev) + psf_prev->sf_next = psf->sf_next; + else + pmc->sources = psf->sf_next; +#ifdef CONFIG_IP_MULTICAST + if (psf->sf_oldin && + !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) { + psf->sf_crcount = in_dev->mr_qrv ? in_dev->mr_qrv : + IGMP_Unsolicited_Report_Count; + psf->sf_next = pmc->tomb; + pmc->tomb = psf; + rv = 1; + } else +#endif + kfree(psf); + } + return rv; +} + +#ifndef CONFIG_IP_MULTICAST +#define igmp_ifc_event(x) do { } while (0) +#endif + +static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, + int sfcount, __be32 *psfsrc, int delta) +{ + struct ip_mc_list *pmc; + int changerec = 0; + int i, err; + + if (!in_dev) + return -ENODEV; + rcu_read_lock(); + for_each_pmc_rcu(in_dev, pmc) { + if (*pmca == pmc->multiaddr) + break; + } + if (!pmc) { + /* MCA not found?? bug */ + rcu_read_unlock(); + return -ESRCH; + } + spin_lock_bh(&pmc->lock); + rcu_read_unlock(); +#ifdef CONFIG_IP_MULTICAST + sf_markstate(pmc); +#endif + if (!delta) { + err = -EINVAL; + if (!pmc->sfcount[sfmode]) + goto out_unlock; + pmc->sfcount[sfmode]--; + } + err = 0; + for (i=0; i<sfcount; i++) { + int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]); + + changerec |= rv > 0; + if (!err && rv < 0) + err = rv; + } + if (pmc->sfmode == MCAST_EXCLUDE && + pmc->sfcount[MCAST_EXCLUDE] == 0 && + pmc->sfcount[MCAST_INCLUDE]) { +#ifdef CONFIG_IP_MULTICAST + struct ip_sf_list *psf; +#endif + + /* filter mode change */ + pmc->sfmode = MCAST_INCLUDE; +#ifdef CONFIG_IP_MULTICAST + pmc->crcount = in_dev->mr_qrv ?
in_dev->mr_qrv : + IGMP_Unsolicited_Report_Count; + in_dev->mr_ifc_count = pmc->crcount; + for (psf=pmc->sources; psf; psf = psf->sf_next) + psf->sf_crcount = 0; + igmp_ifc_event(pmc->interface); + } else if (sf_setstate(pmc) || changerec) { + igmp_ifc_event(pmc->interface); +#endif + } +out_unlock: + spin_unlock_bh(&pmc->lock); + return err; +} + +/* + * Add multicast single-source filter to the interface list + */ +static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode, + __be32 *psfsrc, int delta) +{ + struct ip_sf_list *psf, *psf_prev; + + psf_prev = NULL; + for (psf=pmc->sources; psf; psf=psf->sf_next) { + if (psf->sf_inaddr == *psfsrc) + break; + psf_prev = psf; + } + if (!psf) { + psf = kzalloc(sizeof(*psf), GFP_ATOMIC); + if (!psf) + return -ENOBUFS; + psf->sf_inaddr = *psfsrc; + if (psf_prev) { + psf_prev->sf_next = psf; + } else + pmc->sources = psf; + } + psf->sf_count[sfmode]++; + if (psf->sf_count[sfmode] == 1) { + ip_rt_multicast_event(pmc->interface); + } + return 0; +} + +#ifdef CONFIG_IP_MULTICAST +static void sf_markstate(struct ip_mc_list *pmc) +{ + struct ip_sf_list *psf; + int mca_xcount = pmc->sfcount[MCAST_EXCLUDE]; + + for (psf=pmc->sources; psf; psf=psf->sf_next) + if (pmc->sfcount[MCAST_EXCLUDE]) { + psf->sf_oldin = mca_xcount == + psf->sf_count[MCAST_EXCLUDE] && + !psf->sf_count[MCAST_INCLUDE]; + } else + psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0; +} + +static int sf_setstate(struct ip_mc_list *pmc) +{ + struct ip_sf_list *psf, *dpsf; + int mca_xcount = pmc->sfcount[MCAST_EXCLUDE]; + int qrv = pmc->interface->mr_qrv; + int new_in, rv; + + rv = 0; + for (psf=pmc->sources; psf; psf=psf->sf_next) { + if (pmc->sfcount[MCAST_EXCLUDE]) { + new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && + !psf->sf_count[MCAST_INCLUDE]; + } else + new_in = psf->sf_count[MCAST_INCLUDE] != 0; + if (new_in) { + if (!psf->sf_oldin) { + struct ip_sf_list *prev = NULL; + + for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next) { + if (dpsf->sf_inaddr == psf->sf_inaddr) + break; + prev = dpsf; + } + if (dpsf) { + if (prev) + prev->sf_next = dpsf->sf_next; + else + pmc->tomb = dpsf->sf_next; + kfree(dpsf); + } + psf->sf_crcount = qrv; + rv++; + } + } else if (psf->sf_oldin) { + + psf->sf_crcount = 0; + /* + * add or update "delete" records if an active filter + * is now inactive + */ + for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next) + if (dpsf->sf_inaddr == psf->sf_inaddr) + break; + if (!dpsf) { + dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC); + if (!dpsf) + continue; + *dpsf = *psf; + /* pmc->lock held by callers */ + dpsf->sf_next = pmc->tomb; + pmc->tomb = dpsf; + } + dpsf->sf_crcount = qrv; + rv++; + } + } + return rv; +} +#endif + +/* + * Add multicast source filter list to the interface list + */ +static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, + int sfcount, __be32 *psfsrc, int delta) +{ + struct ip_mc_list *pmc; + int isexclude; + int i, err; + + if (!in_dev) + return -ENODEV; + rcu_read_lock(); + for_each_pmc_rcu(in_dev, pmc) { + if (*pmca == pmc->multiaddr) + break; + } + if (!pmc) { + /* MCA not found?? 
bug */ + rcu_read_unlock(); + return -ESRCH; + } + spin_lock_bh(&pmc->lock); + rcu_read_unlock(); + +#ifdef CONFIG_IP_MULTICAST + sf_markstate(pmc); +#endif + isexclude = pmc->sfmode == MCAST_EXCLUDE; + if (!delta) + pmc->sfcount[sfmode]++; + err = 0; + for (i=0; i<sfcount; i++) { + err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i], delta); + if (err) + break; + } + if (err) { + int j; + + pmc->sfcount[sfmode]--; + for (j=0; j<i; j++) + (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]); + } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) { +#ifdef CONFIG_IP_MULTICAST + struct ip_sf_list *psf; + in_dev = pmc->interface; +#endif + + /* filter mode change */ + if (pmc->sfcount[MCAST_EXCLUDE]) + pmc->sfmode = MCAST_EXCLUDE; + else if (pmc->sfcount[MCAST_INCLUDE]) + pmc->sfmode = MCAST_INCLUDE; +#ifdef CONFIG_IP_MULTICAST + /* else no filters; keep old mode for reports */ + + pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : + IGMP_Unsolicited_Report_Count; + in_dev->mr_ifc_count = pmc->crcount; + for (psf=pmc->sources; psf; psf = psf->sf_next) + psf->sf_crcount = 0; + igmp_ifc_event(in_dev); + } else if (sf_setstate(pmc)) { + igmp_ifc_event(in_dev); +#endif + } + spin_unlock_bh(&pmc->lock); + return err; +} + +static void ip_mc_clear_src(struct ip_mc_list *pmc) +{ + struct ip_sf_list *psf, *nextpsf; + + for (psf=pmc->tomb; psf; psf=nextpsf) { + nextpsf = psf->sf_next; + kfree(psf); + } + pmc->tomb = NULL; + for (psf=pmc->sources; psf; psf=nextpsf) { + nextpsf = psf->sf_next; + kfree(psf); + } + pmc->sources = NULL; + pmc->sfmode = MCAST_EXCLUDE; + pmc->sfcount[MCAST_INCLUDE] = 0; + pmc->sfcount[MCAST_EXCLUDE] = 1; +} + + +/* + * Join a multicast group + */ +int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) +{ + int err; + __be32 addr = imr->imr_multiaddr.s_addr; + struct ip_mc_socklist *iml = NULL, *i; + struct in_device *in_dev; + struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); + int ifindex; + int count = 0; + + if (!ipv4_is_multicast(addr)) + return -EINVAL; + + rtnl_lock(); + + in_dev = ip_mc_find_dev(net, imr); + + if (!in_dev) { + iml = NULL; + err = -ENODEV; + goto done; + } + + err = -EADDRINUSE; + ifindex = imr->imr_ifindex; + for_each_pmc_rtnl(inet, i) { + if (i->multi.imr_multiaddr.s_addr == addr && + i->multi.imr_ifindex == ifindex) + goto done; + count++; + } + err = -ENOBUFS; + if (count >= sysctl_igmp_max_memberships) + goto done; + iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); + if (iml == NULL) + goto done; + + memcpy(&iml->multi, imr, sizeof(*imr)); + iml->next_rcu = inet->mc_list; + iml->sflist = NULL; + iml->sfmode = MCAST_EXCLUDE; + rcu_assign_pointer(inet->mc_list, iml); + ip_mc_inc_group(in_dev, addr); + err = 0; +done: + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(ip_mc_join_group); + +static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, + struct in_device *in_dev) +{ + struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist); + int err; + + if (psf == NULL) { + /* any-source empty exclude case */ + return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, + iml->sfmode, 0, NULL, 0); + } + err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, + iml->sfmode, psf->sl_count, psf->sl_addr, 0); + rcu_assign_pointer(iml->sflist, NULL); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc); + kfree_rcu(psf, rcu); + return err; +} + +/* + * Ask a socket to leave a group.
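+ * Typically reached via setsockopt(IP_DROP_MEMBERSHIP) or + * MCAST_LEAVE_GROUP: the per-socket source filter is torn down first + * (ip_mc_leave_src) and the device-level group reference is dropped + * afterwards.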
+ */ + +int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) +{ + struct inet_sock *inet = inet_sk(sk); + struct ip_mc_socklist *iml; + struct ip_mc_socklist __rcu **imlp; + struct in_device *in_dev; + struct net *net = sock_net(sk); + __be32 group = imr->imr_multiaddr.s_addr; + u32 ifindex; + int ret = -EADDRNOTAVAIL; + + rtnl_lock(); + in_dev = ip_mc_find_dev(net, imr); + ifindex = imr->imr_ifindex; + for (imlp = &inet->mc_list; + (iml = rtnl_dereference(*imlp)) != NULL; + imlp = &iml->next_rcu) { + if (iml->multi.imr_multiaddr.s_addr != group) + continue; + if (ifindex) { + if (iml->multi.imr_ifindex != ifindex) + continue; + } else if (imr->imr_address.s_addr && imr->imr_address.s_addr != + iml->multi.imr_address.s_addr) + continue; + + (void) ip_mc_leave_src(sk, iml, in_dev); + + *imlp = iml->next_rcu; + + if (in_dev) + ip_mc_dec_group(in_dev, group); + rtnl_unlock(); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); + kfree_rcu(iml, rcu); + return 0; + } + if (!in_dev) + ret = -ENODEV; + rtnl_unlock(); + return ret; +} + +int ip_mc_source(int add, int omode, struct sock *sk, struct + ip_mreq_source *mreqs, int ifindex) +{ + int err; + struct ip_mreqn imr; + __be32 addr = mreqs->imr_multiaddr; + struct ip_mc_socklist *pmc; + struct in_device *in_dev = NULL; + struct inet_sock *inet = inet_sk(sk); + struct ip_sf_socklist *psl; + struct net *net = sock_net(sk); + int leavegroup = 0; + int i, j, rv; + + if (!ipv4_is_multicast(addr)) + return -EINVAL; + + rtnl_lock(); + + imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr; + imr.imr_address.s_addr = mreqs->imr_interface; + imr.imr_ifindex = ifindex; + in_dev = ip_mc_find_dev(net, &imr); + + if (!in_dev) { + err = -ENODEV; + goto done; + } + err = -EADDRNOTAVAIL; + + for_each_pmc_rtnl(inet, pmc) { + if ((pmc->multi.imr_multiaddr.s_addr == + imr.imr_multiaddr.s_addr) && + (pmc->multi.imr_ifindex == imr.imr_ifindex)) + break; + } + if (!pmc) { /* must have a prior join */ + err = -EINVAL; + goto done; + } + /* if a source filter was set, must be the same mode as before */ + if (pmc->sflist) { + if (pmc->sfmode != omode) { + err = -EINVAL; + goto done; + } + } else if (pmc->sfmode != omode) { + /* allow mode switches for empty-set filters */ + ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0); + ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, pmc->sfmode, 0, + NULL, 0); + pmc->sfmode = omode; + } + + psl = rtnl_dereference(pmc->sflist); + if (!add) { + if (!psl) + goto done; /* err = -EADDRNOTAVAIL */ + rv = !0; + for (i=0; isl_count; i++) { + rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, + sizeof(__be32)); + if (rv == 0) + break; + } + if (rv) /* source not found */ + goto done; /* err = -EADDRNOTAVAIL */ + + /* special case - (INCLUDE, empty) == LEAVE_GROUP */ + if (psl->sl_count == 1 && omode == MCAST_INCLUDE) { + leavegroup = 1; + goto done; + } + + /* update the interface filter */ + ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1, + &mreqs->imr_sourceaddr, 1); + + for (j=i+1; jsl_count; j++) + psl->sl_addr[j-1] = psl->sl_addr[j]; + psl->sl_count--; + err = 0; + goto done; + } + /* else, add a new source to the filter */ + + if (psl && psl->sl_count >= sysctl_igmp_max_msf) { + err = -ENOBUFS; + goto done; + } + if (!psl || psl->sl_count == psl->sl_max) { + struct ip_sf_socklist *newpsl; + int count = IP_SFBLOCK; + + if (psl) + count += psl->sl_max; + newpsl = sock_kmalloc(sk, IP_SFLSIZE(count), GFP_KERNEL); + if (!newpsl) { + err = -ENOBUFS; + goto done; + 
} + newpsl->sl_max = count; + newpsl->sl_count = count - IP_SFBLOCK; + if (psl) { + for (i=0; isl_count; i++) + newpsl->sl_addr[i] = psl->sl_addr[i]; + /* decrease mem now to avoid the memleak warning */ + atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + kfree_rcu(psl, rcu); + } + rcu_assign_pointer(pmc->sflist, newpsl); + psl = newpsl; + } + rv = 1; /* > 0 for insert logic below if sl_count is 0 */ + for (i=0; isl_count; i++) { + rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, + sizeof(__be32)); + if (rv == 0) + break; + } + if (rv == 0) /* address already there is an error */ + goto done; + for (j=psl->sl_count-1; j>=i; j--) + psl->sl_addr[j+1] = psl->sl_addr[j]; + psl->sl_addr[i] = mreqs->imr_sourceaddr; + psl->sl_count++; + err = 0; + /* update the interface list */ + ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1, + &mreqs->imr_sourceaddr, 1); +done: + rtnl_unlock(); + if (leavegroup) + return ip_mc_leave_group(sk, &imr); + return err; +} + +int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) +{ + int err = 0; + struct ip_mreqn imr; + __be32 addr = msf->imsf_multiaddr; + struct ip_mc_socklist *pmc; + struct in_device *in_dev; + struct inet_sock *inet = inet_sk(sk); + struct ip_sf_socklist *newpsl, *psl; + struct net *net = sock_net(sk); + int leavegroup = 0; + + if (!ipv4_is_multicast(addr)) + return -EINVAL; + if (msf->imsf_fmode != MCAST_INCLUDE && + msf->imsf_fmode != MCAST_EXCLUDE) + return -EINVAL; + + rtnl_lock(); + + imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; + imr.imr_address.s_addr = msf->imsf_interface; + imr.imr_ifindex = ifindex; + in_dev = ip_mc_find_dev(net, &imr); + + if (!in_dev) { + err = -ENODEV; + goto done; + } + + /* special case - (INCLUDE, empty) == LEAVE_GROUP */ + if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) { + leavegroup = 1; + goto done; + } + + for_each_pmc_rtnl(inet, pmc) { + if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && + pmc->multi.imr_ifindex == imr.imr_ifindex) + break; + } + if (!pmc) { /* must have a prior join */ + err = -EINVAL; + goto done; + } + if (msf->imsf_numsrc) { + newpsl = sock_kmalloc(sk, IP_SFLSIZE(msf->imsf_numsrc), + GFP_KERNEL); + if (!newpsl) { + err = -ENOBUFS; + goto done; + } + newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc; + memcpy(newpsl->sl_addr, msf->imsf_slist, + msf->imsf_numsrc * sizeof(msf->imsf_slist[0])); + err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr, + msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0); + if (err) { + sock_kfree_s(sk, newpsl, IP_SFLSIZE(newpsl->sl_max)); + goto done; + } + } else { + newpsl = NULL; + (void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr, + msf->imsf_fmode, 0, NULL, 0); + } + psl = rtnl_dereference(pmc->sflist); + if (psl) { + (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, + psl->sl_count, psl->sl_addr, 0); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + kfree_rcu(psl, rcu); + } else + (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, + 0, NULL, 0); + rcu_assign_pointer(pmc->sflist, newpsl); + pmc->sfmode = msf->imsf_fmode; + err = 0; +done: + rtnl_unlock(); + if (leavegroup) + err = ip_mc_leave_group(sk, &imr); + return err; +} + +int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, + struct ip_msfilter __user *optval, int __user *optlen) +{ + int err, len, count, copycount; + struct ip_mreqn imr; + __be32 addr = msf->imsf_multiaddr; + struct ip_mc_socklist *pmc; + struct in_device 
*in_dev; + struct inet_sock *inet = inet_sk(sk); + struct ip_sf_socklist *psl; + struct net *net = sock_net(sk); + + if (!ipv4_is_multicast(addr)) + return -EINVAL; + + rtnl_lock(); + + imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; + imr.imr_address.s_addr = msf->imsf_interface; + imr.imr_ifindex = 0; + in_dev = ip_mc_find_dev(net, &imr); + + if (!in_dev) { + err = -ENODEV; + goto done; + } + err = -EADDRNOTAVAIL; + + for_each_pmc_rtnl(inet, pmc) { + if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && + pmc->multi.imr_ifindex == imr.imr_ifindex) + break; + } + if (!pmc) /* must have a prior join */ + goto done; + msf->imsf_fmode = pmc->sfmode; + psl = rtnl_dereference(pmc->sflist); + rtnl_unlock(); + if (!psl) { + len = 0; + count = 0; + } else { + count = psl->sl_count; + } + copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc; + len = copycount * sizeof(psl->sl_addr[0]); + msf->imsf_numsrc = count; + if (put_user(IP_MSFILTER_SIZE(copycount), optlen) || + copy_to_user(optval, msf, IP_MSFILTER_SIZE(0))) { + return -EFAULT; + } + if (len && + copy_to_user(&optval->imsf_slist[0], psl->sl_addr, len)) + return -EFAULT; + return 0; +done: + rtnl_unlock(); + return err; +} + +int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, + struct group_filter __user *optval, int __user *optlen) +{ + int err, i, count, copycount; + struct sockaddr_in *psin; + __be32 addr; + struct ip_mc_socklist *pmc; + struct inet_sock *inet = inet_sk(sk); + struct ip_sf_socklist *psl; + + psin = (struct sockaddr_in *)&gsf->gf_group; + if (psin->sin_family != AF_INET) + return -EINVAL; + addr = psin->sin_addr.s_addr; + if (!ipv4_is_multicast(addr)) + return -EINVAL; + + rtnl_lock(); + + err = -EADDRNOTAVAIL; + + for_each_pmc_rtnl(inet, pmc) { + if (pmc->multi.imr_multiaddr.s_addr == addr && + pmc->multi.imr_ifindex == gsf->gf_interface) + break; + } + if (!pmc) /* must have a prior join */ + goto done; + gsf->gf_fmode = pmc->sfmode; + psl = rtnl_dereference(pmc->sflist); + rtnl_unlock(); + count = psl ? psl->sl_count : 0; + copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; + gsf->gf_numsrc = count; + if (put_user(GROUP_FILTER_SIZE(copycount), optlen) || + copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { + return -EFAULT; + } + for (i=0; i<copycount; i++) { + struct sockaddr_storage ss; + + psin = (struct sockaddr_in *)&ss; + memset(&ss, 0, sizeof(ss)); + psin->sin_family = AF_INET; + psin->sin_addr.s_addr = psl->sl_addr[i]; + if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss))) + return -EFAULT; + } + return 0; +done: + rtnl_unlock(); + return err; +} + +/* + * check if a multicast source filter allows delivery for a given <src,dst,intf> + */ +int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif) +{ + struct inet_sock *inet = inet_sk(sk); + struct ip_mc_socklist *pmc; + struct ip_sf_socklist *psl; + int i; + int ret; + + ret = 1; + if (!ipv4_is_multicast(loc_addr)) + goto out; + + rcu_read_lock(); + for_each_pmc_rcu(inet, pmc) { + if (pmc->multi.imr_multiaddr.s_addr == loc_addr && + pmc->multi.imr_ifindex == dif) + break; + } + ret = inet->mc_all; + if (!pmc) + goto unlock; + psl = rcu_dereference(pmc->sflist); + ret = (pmc->sfmode == MCAST_EXCLUDE); + if (!psl) + goto unlock; + + for (i=0; i<psl->sl_count; i++) { + if (psl->sl_addr[i] == rmt_addr) + break; + } + ret = 0; + if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) + goto unlock; + if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) + goto unlock; + ret = 1; +unlock: + rcu_read_unlock(); +out: + return ret; +} + +/* + * A socket is closing.
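+ * Any memberships the socket still holds are released here on the + * close() path, just as if userspace had issued IP_DROP_MEMBERSHIP + * for each of them.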
+ */ + +void ip_mc_drop_socket(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct ip_mc_socklist *iml; + struct net *net = sock_net(sk); + + if (inet->mc_list == NULL) + return; + + rtnl_lock(); + while ((iml = rtnl_dereference(inet->mc_list)) != NULL) { + struct in_device *in_dev; + + inet->mc_list = iml->next_rcu; + in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); + (void) ip_mc_leave_src(sk, iml, in_dev); + if (in_dev != NULL) + ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); + kfree_rcu(iml, rcu); + } + rtnl_unlock(); +} + +/* called with rcu_read_lock() */ +int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto) +{ + struct ip_mc_list *im; + struct ip_sf_list *psf; + int rv = 0; + + for_each_pmc_rcu(in_dev, im) { + if (im->multiaddr == mc_addr) + break; + } + if (im && proto == IPPROTO_IGMP) { + rv = 1; + } else if (im) { + if (src_addr) { + for (psf=im->sources; psf; psf=psf->sf_next) { + if (psf->sf_inaddr == src_addr) + break; + } + if (psf) + rv = psf->sf_count[MCAST_INCLUDE] || + psf->sf_count[MCAST_EXCLUDE] != + im->sfcount[MCAST_EXCLUDE]; + else + rv = im->sfcount[MCAST_EXCLUDE] != 0; + } else + rv = 1; /* unspecified source; tentatively allow */ + } + return rv; +} + +#if defined(CONFIG_PROC_FS) +struct igmp_mc_iter_state { + struct seq_net_private p; + struct net_device *dev; + struct in_device *in_dev; +}; + +#define igmp_mc_seq_private(seq) ((struct igmp_mc_iter_state *)(seq)->private) + +static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq) +{ + struct net *net = seq_file_net(seq); + struct ip_mc_list *im = NULL; + struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); + + state->in_dev = NULL; + for_each_netdev_rcu(net, state->dev) { + struct in_device *in_dev; + + in_dev = __in_dev_get_rcu(state->dev); + if (!in_dev) + continue; + im = rcu_dereference(in_dev->mc_list); + if (im) { + state->in_dev = in_dev; + break; + } + } + return im; +} + +static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im) +{ + struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); + + im = rcu_dereference(im->next_rcu); + while (!im) { + state->dev = next_net_device_rcu(state->dev); + if (!state->dev) { + state->in_dev = NULL; + break; + } + state->in_dev = __in_dev_get_rcu(state->dev); + if (!state->in_dev) + continue; + im = rcu_dereference(state->in_dev->mc_list); + } + return im; +} + +static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos) +{ + struct ip_mc_list *im = igmp_mc_get_first(seq); + if (im) + while (pos && (im = igmp_mc_get_next(seq, im)) != NULL) + --pos; + return pos ? NULL : im; +} + +static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(rcu) +{ + rcu_read_lock(); + return *pos ? 
igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip_mc_list *im; + if (v == SEQ_START_TOKEN) + im = igmp_mc_get_first(seq); + else + im = igmp_mc_get_next(seq, v); + ++*pos; + return im; +} + +static void igmp_mc_seq_stop(struct seq_file *seq, void *v) + __releases(rcu) +{ + struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); + + state->in_dev = NULL; + state->dev = NULL; + rcu_read_unlock(); +} + +static int igmp_mc_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n"); + else { + struct ip_mc_list *im = (struct ip_mc_list *)v; + struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); + char *querier; +#ifdef CONFIG_IP_MULTICAST + querier = IGMP_V1_SEEN(state->in_dev) ? "V1" : + IGMP_V2_SEEN(state->in_dev) ? "V2" : + "V3"; +#else + querier = "NONE"; +#endif + + if (rcu_dereference(state->in_dev->mc_list) == im) { + seq_printf(seq, "%d\t%-10s: %5d %7s\n", + state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); + } + + seq_printf(seq, + "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n", + im->multiaddr, im->users, + im->tm_running, im->tm_running ? + jiffies_to_clock_t(im->timer.expires-jiffies) : 0, + im->reporter); + } + return 0; +} + +static const struct seq_operations igmp_mc_seq_ops = { + .start = igmp_mc_seq_start, + .next = igmp_mc_seq_next, + .stop = igmp_mc_seq_stop, + .show = igmp_mc_seq_show, +}; + +static int igmp_mc_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &igmp_mc_seq_ops, + sizeof(struct igmp_mc_iter_state)); +} + +static const struct file_operations igmp_mc_seq_fops = { + .owner = THIS_MODULE, + .open = igmp_mc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +struct igmp_mcf_iter_state { + struct seq_net_private p; + struct net_device *dev; + struct in_device *idev; + struct ip_mc_list *im; +}; + +#define igmp_mcf_seq_private(seq) ((struct igmp_mcf_iter_state *)(seq)->private) + +static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) +{ + struct net *net = seq_file_net(seq); + struct ip_sf_list *psf = NULL; + struct ip_mc_list *im = NULL; + struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); + + state->idev = NULL; + state->im = NULL; + for_each_netdev_rcu(net, state->dev) { + struct in_device *idev; + idev = __in_dev_get_rcu(state->dev); + if (unlikely(idev == NULL)) + continue; + im = rcu_dereference(idev->mc_list); + if (likely(im != NULL)) { + spin_lock_bh(&im->lock); + psf = im->sources; + if (likely(psf != NULL)) { + state->im = im; + state->idev = idev; + break; + } + spin_unlock_bh(&im->lock); + } + } + return psf; +} + +static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_list *psf) +{ + struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); + + psf = psf->sf_next; + while (!psf) { + spin_unlock_bh(&state->im->lock); + state->im = state->im->next; + while (!state->im) { + state->dev = next_net_device_rcu(state->dev); + if (!state->dev) { + state->idev = NULL; + goto out; + } + state->idev = __in_dev_get_rcu(state->dev); + if (!state->idev) + continue; + state->im = rcu_dereference(state->idev->mc_list); + } + if (!state->im) + break; + spin_lock_bh(&state->im->lock); + psf = state->im->sources; + } +out: + return psf; +} + +static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos) +{ + struct 
ip_sf_list *psf = igmp_mcf_get_first(seq); + if (psf) + while (pos && (psf = igmp_mcf_get_next(seq, psf)) != NULL) + --pos; + return pos ? NULL : psf; +} + +static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(rcu) +{ + rcu_read_lock(); + return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip_sf_list *psf; + if (v == SEQ_START_TOKEN) + psf = igmp_mcf_get_first(seq); + else + psf = igmp_mcf_get_next(seq, v); + ++*pos; + return psf; +} + +static void igmp_mcf_seq_stop(struct seq_file *seq, void *v) + __releases(rcu) +{ + struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); + if (likely(state->im != NULL)) { + spin_unlock_bh(&state->im->lock); + state->im = NULL; + } + state->idev = NULL; + state->dev = NULL; + rcu_read_unlock(); +} + +static int igmp_mcf_seq_show(struct seq_file *seq, void *v) +{ + struct ip_sf_list *psf = (struct ip_sf_list *)v; + struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, + "%3s %6s " + "%10s %10s %6s %6s\n", "Idx", + "Device", "MCA", + "SRC", "INC", "EXC"); + } else { + seq_printf(seq, + "%3d %6.6s 0x%08x " + "0x%08x %6lu %6lu\n", + state->dev->ifindex, state->dev->name, + ntohl(state->im->multiaddr), + ntohl(psf->sf_inaddr), + psf->sf_count[MCAST_INCLUDE], + psf->sf_count[MCAST_EXCLUDE]); + } + return 0; +} + +static const struct seq_operations igmp_mcf_seq_ops = { + .start = igmp_mcf_seq_start, + .next = igmp_mcf_seq_next, + .stop = igmp_mcf_seq_stop, + .show = igmp_mcf_seq_show, +}; + +static int igmp_mcf_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &igmp_mcf_seq_ops, + sizeof(struct igmp_mcf_iter_state)); +} + +static const struct file_operations igmp_mcf_seq_fops = { + .owner = THIS_MODULE, + .open = igmp_mcf_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int __net_init igmp_net_init(struct net *net) +{ + struct proc_dir_entry *pde; + + pde = proc_net_fops_create(net, "igmp", S_IRUGO, &igmp_mc_seq_fops); + if (!pde) + goto out_igmp; + pde = proc_net_fops_create(net, "mcfilter", S_IRUGO, &igmp_mcf_seq_fops); + if (!pde) + goto out_mcfilter; + return 0; + +out_mcfilter: + proc_net_remove(net, "igmp"); +out_igmp: + return -ENOMEM; +} + +static void __net_exit igmp_net_exit(struct net *net) +{ + proc_net_remove(net, "mcfilter"); + proc_net_remove(net, "igmp"); +} + +static struct pernet_operations igmp_net_ops = { + .init = igmp_net_init, + .exit = igmp_net_exit, +}; + +int __init igmp_mc_proc_init(void) +{ + return register_pernet_subsys(&igmp_net_ops); +} +#endif diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c new file mode 100644 index 00000000..c14d88ad --- /dev/null +++ b/net/ipv4/inet_connection_sock.c @@ -0,0 +1,775 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Support for INET connection oriented protocols. + * + * Authors: See the TCP sources + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or(at your option) any later version. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifdef INET_CSK_DEBUG +const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; +EXPORT_SYMBOL(inet_csk_timer_bug_msg); +#endif + +/* + * This struct holds the first and last local port number. + */ +struct local_ports sysctl_local_ports __read_mostly = { + .lock = __SEQLOCK_UNLOCKED(sysctl_local_ports.lock), + .range = { 32768, 61000 }, +}; + +unsigned long *sysctl_local_reserved_ports; +EXPORT_SYMBOL(sysctl_local_reserved_ports); + +void inet_get_local_port_range(int *low, int *high) +{ + unsigned seq; + do { + seq = read_seqbegin(&sysctl_local_ports.lock); + + *low = sysctl_local_ports.range[0]; + *high = sysctl_local_ports.range[1]; + } while (read_seqretry(&sysctl_local_ports.lock, seq)); +} +EXPORT_SYMBOL(inet_get_local_port_range); + +int inet_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb) +{ + struct sock *sk2; + struct hlist_node *node; + int reuse = sk->sk_reuse; + + /* + * Unlike other sk lookup places we do not check + * for sk_net here, since _all_ the socks listed + * in tb->owners list belong to the same net - the + * one this bucket belongs to. + */ + + sk_for_each_bound(sk2, node, &tb->owners) { + if (sk != sk2 && + !inet_v6_ipv6only(sk2) && + (!sk->sk_bound_dev_if || + !sk2->sk_bound_dev_if || + sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { + if (!reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) { + const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); + if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) || + sk2_rcv_saddr == sk_rcv_saddr(sk)) + break; + } + } + } + return node != NULL; +} +EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); + +/* Obtain a reference to a local port for the given sock, + * if snum is zero it means select any available local port. + */ +int inet_csk_get_port(struct sock *sk, unsigned short snum) +{ + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + struct inet_bind_hashbucket *head; + struct hlist_node *node; + struct inet_bind_bucket *tb; + int ret, attempts = 5; + struct net *net = sock_net(sk); + int smallest_size = -1, smallest_rover; + + local_bh_disable(); + if (!snum) { + int remaining, rover, low, high; + +again: + inet_get_local_port_range(&low, &high); + remaining = (high - low) + 1; + smallest_rover = rover = net_random() % remaining + low; + + smallest_size = -1; + do { + if (inet_is_reserved_local_port(rover)) + goto next_nolock; + head = &hashinfo->bhash[inet_bhashfn(net, rover, + hashinfo->bhash_size)]; + spin_lock(&head->lock); + inet_bind_bucket_for_each(tb, node, &head->chain) + if (net_eq(ib_net(tb), net) && tb->port == rover) { + if (tb->fastreuse > 0 && + sk->sk_reuse && + sk->sk_state != TCP_LISTEN && + (tb->num_owners < smallest_size || smallest_size == -1)) { + smallest_size = tb->num_owners; + smallest_rover = rover; + if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) { + spin_unlock(&head->lock); + snum = smallest_rover; + goto have_snum; + } + } + goto next; + } + break; + next: + spin_unlock(&head->lock); + next_nolock: + if (++rover > high) + rover = low; + } while (--remaining > 0); + + /* Exhausted local port range during search? It is not + * possible for us to be holding one of the bind hash + * locks if this test triggers, because if 'remaining' + * drops to zero, we broke out of the do/while loop at + * the top level, not from the 'break;' statement. 
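+ * If the whole range turns out to be occupied, we fall back to the + * bucket with the fewest owners recorded above (smallest_rover); that + * fallback is only reachable for SO_REUSEADDR sockets that are not + * listening.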
+ */ + ret = 1; + if (remaining <= 0) { + if (smallest_size != -1) { + snum = smallest_rover; + goto have_snum; + } + goto fail; + } + /* OK, here is the one we will use. HEAD is + * non-NULL and we hold it's mutex. + */ + snum = rover; + } else { +have_snum: + head = &hashinfo->bhash[inet_bhashfn(net, snum, + hashinfo->bhash_size)]; + spin_lock(&head->lock); + inet_bind_bucket_for_each(tb, node, &head->chain) + if (net_eq(ib_net(tb), net) && tb->port == snum) + goto tb_found; + } + tb = NULL; + goto tb_not_found; +tb_found: + if (!hlist_empty(&tb->owners)) { + if (tb->fastreuse > 0 && + sk->sk_reuse && sk->sk_state != TCP_LISTEN && + smallest_size == -1) { + goto success; + } else { + ret = 1; + if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) { + if (sk->sk_reuse && sk->sk_state != TCP_LISTEN && + smallest_size != -1 && --attempts >= 0) { + spin_unlock(&head->lock); + goto again; + } + goto fail_unlock; + } + } + } +tb_not_found: + ret = 1; + if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, + net, head, snum)) == NULL) + goto fail_unlock; + if (hlist_empty(&tb->owners)) { + if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) + tb->fastreuse = 1; + else + tb->fastreuse = 0; + } else if (tb->fastreuse && + (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) + tb->fastreuse = 0; +success: + if (!inet_csk(sk)->icsk_bind_hash) + inet_bind_hash(sk, tb, snum); + WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); + ret = 0; + +fail_unlock: + spin_unlock(&head->lock); +fail: + local_bh_enable(); + return ret; +} +EXPORT_SYMBOL_GPL(inet_csk_get_port); + +/* + * Wait for an incoming connection, avoid race conditions. This must be called + * with the socket locked. + */ +static int inet_csk_wait_for_connect(struct sock *sk, long timeo) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + DEFINE_WAIT(wait); + int err; + + /* + * True wake-one mechanism for incoming connections: only + * one process gets woken up, not the 'whole herd'. + * Since we do not 'race & poll' for established sockets + * anymore, the common case will execute the loop only once. + * + * Subtle issue: "add_wait_queue_exclusive()" will be added + * after any current non-exclusive waiters, and we know that + * it will always _stay_ after any new non-exclusive waiters + * because all non-exclusive waiters are added at the + * beginning of the wait-queue. As such, it's ok to "drop" + * our exclusiveness temporarily when we get woken up without + * having to remove and re-insert us on the wait queue. + */ + for (;;) { + prepare_to_wait_exclusive(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + release_sock(sk); + if (reqsk_queue_empty(&icsk->icsk_accept_queue)) + timeo = schedule_timeout(timeo); + lock_sock(sk); + err = 0; + if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) + break; + err = -EINVAL; + if (sk->sk_state != TCP_LISTEN) + break; + err = sock_intr_errno(timeo); + if (signal_pending(current)) + break; + err = -EAGAIN; + if (!timeo) + break; + } + finish_wait(sk_sleep(sk), &wait); + return err; +} + +/* + * This will accept the next outstanding connection. + */ +struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct sock *newsk; + int error; + + lock_sock(sk); + + /* We need to make sure that this socket is listening, + * and that it has something pending. 
+ */ + error = -EINVAL; + if (sk->sk_state != TCP_LISTEN) + goto out_err; + + /* Find already established connection */ + if (reqsk_queue_empty(&icsk->icsk_accept_queue)) { + long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + + /* If this is a non blocking socket don't sleep */ + error = -EAGAIN; + if (!timeo) + goto out_err; + + error = inet_csk_wait_for_connect(sk, timeo); + if (error) + goto out_err; + } + + newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); + WARN_ON(newsk->sk_state == TCP_SYN_RECV); +out: + release_sock(sk); + return newsk; +out_err: + newsk = NULL; + *err = error; + goto out; +} +EXPORT_SYMBOL(inet_csk_accept); + +/* + * Using different timers for retransmit, delayed acks and probes + * We may wish use just one timer maintaining a list of expire jiffies + * to optimize. + */ +void inet_csk_init_xmit_timers(struct sock *sk, + void (*retransmit_handler)(unsigned long), + void (*delack_handler)(unsigned long), + void (*keepalive_handler)(unsigned long)) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler, + (unsigned long)sk); + setup_timer(&icsk->icsk_delack_timer, delack_handler, + (unsigned long)sk); + setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk); + icsk->icsk_pending = icsk->icsk_ack.pending = 0; +} +EXPORT_SYMBOL(inet_csk_init_xmit_timers); + +void inet_csk_clear_xmit_timers(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0; + + sk_stop_timer(sk, &icsk->icsk_retransmit_timer); + sk_stop_timer(sk, &icsk->icsk_delack_timer); + sk_stop_timer(sk, &sk->sk_timer); +} +EXPORT_SYMBOL(inet_csk_clear_xmit_timers); + +void inet_csk_delete_keepalive_timer(struct sock *sk) +{ + sk_stop_timer(sk, &sk->sk_timer); +} +EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); + +void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) +{ + sk_reset_timer(sk, &sk->sk_timer, jiffies + len); +} +EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); + +struct dst_entry *inet_csk_route_req(struct sock *sk, + struct flowi4 *fl4, + const struct request_sock *req) +{ + struct rtable *rt; + const struct inet_request_sock *ireq = inet_rsk(req); + struct ip_options_rcu *opt = inet_rsk(req)->opt; + struct net *net = sock_net(sk); + + flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + sk->sk_protocol, inet_sk_flowi_flags(sk), + (opt && opt->opt.srr) ? 
opt->opt.faddr : ireq->rmt_addr, + ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); + security_req_classify_flow(req, flowi4_to_flowi(fl4)); + rt = ip_route_output_flow(net, fl4, sk); + if (IS_ERR(rt)) + goto no_route; + if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) + goto route_err; + return &rt->dst; + +route_err: + ip_rt_put(rt); +no_route: + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); + return NULL; +} +EXPORT_SYMBOL_GPL(inet_csk_route_req); + +struct dst_entry *inet_csk_route_child_sock(struct sock *sk, + struct sock *newsk, + const struct request_sock *req) +{ + const struct inet_request_sock *ireq = inet_rsk(req); + struct inet_sock *newinet = inet_sk(newsk); + struct ip_options_rcu *opt = ireq->opt; + struct net *net = sock_net(sk); + struct flowi4 *fl4; + struct rtable *rt; + + fl4 = &newinet->cork.fl.u.ip4; + flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + sk->sk_protocol, inet_sk_flowi_flags(sk), + (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, + ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); + security_req_classify_flow(req, flowi4_to_flowi(fl4)); + rt = ip_route_output_flow(net, fl4, sk); + if (IS_ERR(rt)) + goto no_route; + if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) + goto route_err; + return &rt->dst; + +route_err: + ip_rt_put(rt); +no_route: + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); + return NULL; +} +EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); + +static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, + const u32 rnd, const u32 synq_hsize) +{ + return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1); +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#define AF_INET_FAMILY(fam) ((fam) == AF_INET) +#else +#define AF_INET_FAMILY(fam) 1 +#endif + +struct request_sock *inet_csk_search_req(const struct sock *sk, + struct request_sock ***prevp, + const __be16 rport, const __be32 raddr, + const __be32 laddr) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + struct request_sock *req, **prev; + + for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd, + lopt->nr_table_entries)]; + (req = *prev) != NULL; + prev = &req->dl_next) { + const struct inet_request_sock *ireq = inet_rsk(req); + + if (ireq->rmt_port == rport && + ireq->rmt_addr == raddr && + ireq->loc_addr == laddr && + AF_INET_FAMILY(req->rsk_ops->family)) { + WARN_ON(req->sk); + *prevp = prev; + break; + } + } + + return req; +} +EXPORT_SYMBOL_GPL(inet_csk_search_req); + +void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, + unsigned long timeout) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, + lopt->hash_rnd, lopt->nr_table_entries); + + reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); + inet_csk_reqsk_queue_added(sk, timeout); +} +EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); + +/* Only thing we need from tcp.h */ +extern int sysctl_tcp_synack_retries; + + +/* Decide when to expire the request and when to resend SYN-ACK */ +static inline void syn_ack_recalc(struct request_sock *req, const int thresh, + const int max_retries, + const u8 rskq_defer_accept, + int *expire, int *resend) +{ + if (!rskq_defer_accept) { + *expire = 
req->retrans >= thresh; + *resend = 1; + return; + } + *expire = req->retrans >= thresh && + (!inet_rsk(req)->acked || req->retrans >= max_retries); + /* + * Do not resend while waiting for data after ACK, + * start to resend on end of deferring period to give + * last chance for data or ACK to create established socket. + */ + *resend = !inet_rsk(req)->acked || + req->retrans >= rskq_defer_accept - 1; +} + +void inet_csk_reqsk_queue_prune(struct sock *parent, + const unsigned long interval, + const unsigned long timeout, + const unsigned long max_rto) +{ + struct inet_connection_sock *icsk = inet_csk(parent); + struct request_sock_queue *queue = &icsk->icsk_accept_queue; + struct listen_sock *lopt = queue->listen_opt; + int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; + int thresh = max_retries; + unsigned long now = jiffies; + struct request_sock **reqp, *req; + int i, budget; + + if (lopt == NULL || lopt->qlen == 0) + return; + + /* Normally all the openreqs are young and become mature + * (i.e. converted to established socket) for first timeout. + * If synack was not acknowledged for 3 seconds, it means + * one of the following things: synack was lost, ack was lost, + * rtt is high or nobody planned to ack (i.e. synflood). + * When server is a bit loaded, queue is populated with old + * open requests, reducing effective size of queue. + * When server is well loaded, queue size reduces to zero + * after several minutes of work. It is not synflood, + * it is normal operation. The solution is pruning + * too old entries overriding normal timeout, when + * situation becomes dangerous. + * + * Essentially, we reserve half of room for young + * embrions; and abort old ones without pity, if old + * ones are about to clog our table. 
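+ * + * A rough worked example, assuming the usual default of 5 SYN-ACK + * retries: with a 512-entry syn table, qlen = 256 and qlen_young = 32, + * the check below fires (256 >> 8 != 0) and the loop lowers thresh + * step by step because qlen is never smaller than the doubling 'young' + * estimate (64, 128, 256); thresh ends up at 2, so old entries are + * expired after only two retransmits.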
+ */ + if (lopt->qlen>>(lopt->max_qlen_log-1)) { + int young = (lopt->qlen_young<<1); + + while (thresh > 2) { + if (lopt->qlen < young) + break; + thresh--; + young <<= 1; + } + } + + if (queue->rskq_defer_accept) + max_retries = queue->rskq_defer_accept; + + budget = 2 * (lopt->nr_table_entries / (timeout / interval)); + i = lopt->clock_hand; + + do { + reqp=&lopt->syn_table[i]; + while ((req = *reqp) != NULL) { + if (time_after_eq(now, req->expires)) { + int expire = 0, resend = 0; + + syn_ack_recalc(req, thresh, max_retries, + queue->rskq_defer_accept, + &expire, &resend); + if (req->rsk_ops->syn_ack_timeout) + req->rsk_ops->syn_ack_timeout(parent, req); + if (!expire && + (!resend || + !req->rsk_ops->rtx_syn_ack(parent, req, NULL) || + inet_rsk(req)->acked)) { + unsigned long timeo; + + if (req->retrans++ == 0) + lopt->qlen_young--; + timeo = min((timeout << req->retrans), max_rto); + req->expires = now + timeo; + reqp = &req->dl_next; + continue; + } + + /* Drop this request */ + inet_csk_reqsk_queue_unlink(parent, req, reqp); + reqsk_queue_removed(queue, req); + reqsk_free(req); + continue; + } + reqp = &req->dl_next; + } + + i = (i + 1) & (lopt->nr_table_entries - 1); + + } while (--budget > 0); + + lopt->clock_hand = i; + + if (lopt->qlen) + inet_csk_reset_keepalive_timer(parent, interval); +} +EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); + +struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, + const gfp_t priority) +{ + struct sock *newsk = sk_clone(sk, priority); + + if (newsk != NULL) { + struct inet_connection_sock *newicsk = inet_csk(newsk); + + newsk->sk_state = TCP_SYN_RECV; + newicsk->icsk_bind_hash = NULL; + + inet_sk(newsk)->inet_dport = inet_rsk(req)->rmt_port; + inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->loc_port); + inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port; + newsk->sk_write_space = sk_stream_write_space; + + newicsk->icsk_retransmits = 0; + newicsk->icsk_backoff = 0; + newicsk->icsk_probes_out = 0; + + /* Deinitialize accept_queue to trap illegal accesses. */ + memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); + + security_inet_csk_clone(newsk, req); + } + return newsk; +} +EXPORT_SYMBOL_GPL(inet_csk_clone); + +/* + * At this point, there should be no process reference to this + * socket, and thus no user references at all. Therefore we + * can assume the socket waitqueue is inactive and nobody will + * try to jump onto it. + */ +void inet_csk_destroy_sock(struct sock *sk) +{ + WARN_ON(sk->sk_state != TCP_CLOSE); + WARN_ON(!sock_flag(sk, SOCK_DEAD)); + + /* It cannot be in hash table! */ + WARN_ON(!sk_unhashed(sk)); + + /* If it has not 0 inet_sk(sk)->inet_num, it must be bound */ + WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash); + + sk->sk_prot->destroy(sk); + + sk_stream_kill_queues(sk); + + xfrm_sk_free_policy(sk); + + sk_refcnt_debug_release(sk); + + percpu_counter_dec(sk->sk_prot->orphan_count); + sock_put(sk); +} +EXPORT_SYMBOL(inet_csk_destroy_sock); + +int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) +{ + struct inet_sock *inet = inet_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); + + if (rc != 0) + return rc; + + sk->sk_max_ack_backlog = 0; + sk->sk_ack_backlog = 0; + inet_csk_delack_init(sk); + + /* There is race window here: we announce ourselves listening, + * but this transition is still not validated by get_port(). 
+ * It is OK, because this socket enters to hash table only + * after validation is complete. + */ + sk->sk_state = TCP_LISTEN; + if (!sk->sk_prot->get_port(sk, inet->inet_num)) { + inet->inet_sport = htons(inet->inet_num); + + sk_dst_reset(sk); + sk->sk_prot->hash(sk); + + return 0; + } + + sk->sk_state = TCP_CLOSE; + __reqsk_queue_destroy(&icsk->icsk_accept_queue); + return -EADDRINUSE; +} +EXPORT_SYMBOL_GPL(inet_csk_listen_start); + +/* + * This routine closes sockets which have been at least partially + * opened, but not yet accepted. + */ +void inet_csk_listen_stop(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct request_sock *acc_req; + struct request_sock *req; + + inet_csk_delete_keepalive_timer(sk); + + /* make all the listen_opt local to us */ + acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue); + + /* Following specs, it would be better either to send FIN + * (and enter FIN-WAIT-1, it is normal close) + * or to send active reset (abort). + * Certainly, it is pretty dangerous while synflood, but it is + * bad justification for our negligence 8) + * To be honest, we are not able to make either + * of the variants now. --ANK + */ + reqsk_queue_destroy(&icsk->icsk_accept_queue); + + while ((req = acc_req) != NULL) { + struct sock *child = req->sk; + + acc_req = req->dl_next; + + local_bh_disable(); + bh_lock_sock(child); + WARN_ON(sock_owned_by_user(child)); + sock_hold(child); + + sk->sk_prot->disconnect(child, O_NONBLOCK); + + sock_orphan(child); + + percpu_counter_inc(sk->sk_prot->orphan_count); + + inet_csk_destroy_sock(child); + + bh_unlock_sock(child); + local_bh_enable(); + sock_put(child); + + sk_acceptq_removed(sk); + __reqsk_free(req); + } + WARN_ON(sk->sk_ack_backlog); +} +EXPORT_SYMBOL_GPL(inet_csk_listen_stop); + +void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + const struct inet_sock *inet = inet_sk(sk); + + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = inet->inet_daddr; + sin->sin_port = inet->inet_dport; +} +EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); + +#ifdef CONFIG_COMPAT +int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_af_ops->compat_getsockopt != NULL) + return icsk->icsk_af_ops->compat_getsockopt(sk, level, optname, + optval, optlen); + return icsk->icsk_af_ops->getsockopt(sk, level, optname, + optval, optlen); +} +EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt); + +int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_af_ops->compat_setsockopt != NULL) + return icsk->icsk_af_ops->compat_setsockopt(sk, level, optname, + optval, optlen); + return icsk->icsk_af_ops->setsockopt(sk, level, optname, + optval, optlen); +} +EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt); +#endif diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c new file mode 100644 index 00000000..3267d389 --- /dev/null +++ b/net/ipv4/inet_diag.c @@ -0,0 +1,951 @@ +/* + * inet_diag.c Module for monitoring INET transport protocols sockets. 
+ * + * Authors: Alexey Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +static const struct inet_diag_handler **inet_diag_table; + +struct inet_diag_entry { + __be32 *saddr; + __be32 *daddr; + u16 sport; + u16 dport; + u16 family; + u16 userlocks; +}; + +static struct sock *idiagnl; + +#define INET_DIAG_PUT(skb, attrtype, attrlen) \ + RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) + +static DEFINE_MUTEX(inet_diag_table_mutex); + +static const struct inet_diag_handler *inet_diag_lock_handler(int type) +{ + if (!inet_diag_table[type]) + request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, + NETLINK_INET_DIAG, type); + + mutex_lock(&inet_diag_table_mutex); + if (!inet_diag_table[type]) + return ERR_PTR(-ENOENT); + + return inet_diag_table[type]; +} + +static inline void inet_diag_unlock_handler( + const struct inet_diag_handler *handler) +{ + mutex_unlock(&inet_diag_table_mutex); +} + +static int inet_csk_diag_fill(struct sock *sk, + struct sk_buff *skb, + int ext, u32 pid, u32 seq, u16 nlmsg_flags, + const struct nlmsghdr *unlh) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_diag_msg *r; + struct nlmsghdr *nlh; + void *info = NULL; + struct inet_diag_meminfo *minfo = NULL; + unsigned char *b = skb_tail_pointer(skb); + const struct inet_diag_handler *handler; + + handler = inet_diag_table[unlh->nlmsg_type]; + BUG_ON(handler == NULL); + + nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); + nlh->nlmsg_flags = nlmsg_flags; + + r = NLMSG_DATA(nlh); + BUG_ON(sk->sk_state == TCP_TIME_WAIT); + + if (ext & (1 << (INET_DIAG_MEMINFO - 1))) + minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, sizeof(*minfo)); + + if (ext & (1 << (INET_DIAG_INFO - 1))) + info = INET_DIAG_PUT(skb, INET_DIAG_INFO, + handler->idiag_info_size); + + if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) { + const size_t len = strlen(icsk->icsk_ca_ops->name); + + strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1), + icsk->icsk_ca_ops->name); + } + + r->idiag_family = sk->sk_family; + r->idiag_state = sk->sk_state; + r->idiag_timer = 0; + r->idiag_retrans = 0; + + r->id.idiag_if = sk->sk_bound_dev_if; + r->id.idiag_cookie[0] = (u32)(unsigned long)sk; + r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); + + r->id.idiag_sport = inet->inet_sport; + r->id.idiag_dport = inet->inet_dport; + r->id.idiag_src[0] = inet->inet_rcv_saddr; + r->id.idiag_dst[0] = inet->inet_daddr; + +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (r->idiag_family == AF_INET6) { + const struct ipv6_pinfo *np = inet6_sk(sk); + + ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, + &np->rcv_saddr); + ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, + &np->daddr); + } +#endif + +#define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ) + + if (icsk->icsk_pending == ICSK_TIME_RETRANS) { + r->idiag_timer = 1; + r->idiag_retrans = icsk->icsk_retransmits; + r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); + } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { + r->idiag_timer = 4; + 
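/* Timer codes used here: 1 retransmit, 4 zero-window probe, 2 keepalive, + * 3 TIME_WAIT (set in inet_twsk_diag_fill), 0 none; idiag_expires carries + * the remaining time in milliseconds. */ +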
r->idiag_retrans = icsk->icsk_probes_out; + r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); + } else if (timer_pending(&sk->sk_timer)) { + r->idiag_timer = 2; + r->idiag_retrans = icsk->icsk_probes_out; + r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires); + } else { + r->idiag_timer = 0; + r->idiag_expires = 0; + } +#undef EXPIRES_IN_MS + + r->idiag_uid = sock_i_uid(sk); + r->idiag_inode = sock_i_ino(sk); + + if (minfo) { + minfo->idiag_rmem = sk_rmem_alloc_get(sk); + minfo->idiag_wmem = sk->sk_wmem_queued; + minfo->idiag_fmem = sk->sk_forward_alloc; + minfo->idiag_tmem = sk_wmem_alloc_get(sk); + } + + handler->idiag_get_info(sk, r, info); + + if (sk->sk_state < TCP_TIME_WAIT && + icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) + icsk->icsk_ca_ops->get_info(sk, ext, skb); + + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; + +rtattr_failure: +nlmsg_failure: + nlmsg_trim(skb, b); + return -EMSGSIZE; +} + +static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, + struct sk_buff *skb, int ext, u32 pid, + u32 seq, u16 nlmsg_flags, + const struct nlmsghdr *unlh) +{ + long tmo; + struct inet_diag_msg *r; + const unsigned char *previous_tail = skb_tail_pointer(skb); + struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq, + unlh->nlmsg_type, sizeof(*r)); + + r = NLMSG_DATA(nlh); + BUG_ON(tw->tw_state != TCP_TIME_WAIT); + + nlh->nlmsg_flags = nlmsg_flags; + + tmo = tw->tw_ttd - jiffies; + if (tmo < 0) + tmo = 0; + + r->idiag_family = tw->tw_family; + r->idiag_retrans = 0; + r->id.idiag_if = tw->tw_bound_dev_if; + r->id.idiag_cookie[0] = (u32)(unsigned long)tw; + r->id.idiag_cookie[1] = (u32)(((unsigned long)tw >> 31) >> 1); + r->id.idiag_sport = tw->tw_sport; + r->id.idiag_dport = tw->tw_dport; + r->id.idiag_src[0] = tw->tw_rcv_saddr; + r->id.idiag_dst[0] = tw->tw_daddr; + r->idiag_state = tw->tw_substate; + r->idiag_timer = 3; + r->idiag_expires = DIV_ROUND_UP(tmo * 1000, HZ); + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = 0; + r->idiag_inode = 0; +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (tw->tw_family == AF_INET6) { + const struct inet6_timewait_sock *tw6 = + inet6_twsk((struct sock *)tw); + + ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, + &tw6->tw_v6_rcv_saddr); + ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, + &tw6->tw_v6_daddr); + } +#endif + nlh->nlmsg_len = skb_tail_pointer(skb) - previous_tail; + return skb->len; +nlmsg_failure: + nlmsg_trim(skb, previous_tail); + return -EMSGSIZE; +} + +static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, + int ext, u32 pid, u32 seq, u16 nlmsg_flags, + const struct nlmsghdr *unlh) +{ + if (sk->sk_state == TCP_TIME_WAIT) + return inet_twsk_diag_fill((struct inet_timewait_sock *)sk, + skb, ext, pid, seq, nlmsg_flags, + unlh); + return inet_csk_diag_fill(sk, skb, ext, pid, seq, nlmsg_flags, unlh); +} + +static int inet_diag_get_exact(struct sk_buff *in_skb, + const struct nlmsghdr *nlh) +{ + int err; + struct sock *sk; + struct inet_diag_req *req = NLMSG_DATA(nlh); + struct sk_buff *rep; + struct inet_hashinfo *hashinfo; + const struct inet_diag_handler *handler; + + handler = inet_diag_lock_handler(nlh->nlmsg_type); + if (IS_ERR(handler)) { + err = PTR_ERR(handler); + goto unlock; + } + + hashinfo = handler->idiag_hashinfo; + err = -EINVAL; + + if (req->idiag_family == AF_INET) { + sk = inet_lookup(&init_net, hashinfo, req->id.idiag_dst[0], + req->id.idiag_dport, req->id.idiag_src[0], + req->id.idiag_sport, req->id.idiag_if); + } +#if defined(CONFIG_IPV6) || defined 
(CONFIG_IPV6_MODULE) + else if (req->idiag_family == AF_INET6) { + sk = inet6_lookup(&init_net, hashinfo, + (struct in6_addr *)req->id.idiag_dst, + req->id.idiag_dport, + (struct in6_addr *)req->id.idiag_src, + req->id.idiag_sport, + req->id.idiag_if); + } +#endif + else { + goto unlock; + } + + err = -ENOENT; + if (sk == NULL) + goto unlock; + + err = -ESTALE; + if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE || + req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) && + ((u32)(unsigned long)sk != req->id.idiag_cookie[0] || + (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1])) + goto out; + + err = -ENOMEM; + rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) + + sizeof(struct inet_diag_meminfo) + + handler->idiag_info_size + 64)), + GFP_KERNEL); + if (!rep) + goto out; + + err = sk_diag_fill(sk, rep, req->idiag_ext, + NETLINK_CB(in_skb).pid, + nlh->nlmsg_seq, 0, nlh); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + kfree_skb(rep); + goto out; + } + err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid, + MSG_DONTWAIT); + if (err > 0) + err = 0; + +out: + if (sk) { + if (sk->sk_state == TCP_TIME_WAIT) + inet_twsk_put((struct inet_timewait_sock *)sk); + else + sock_put(sk); + } +unlock: + inet_diag_unlock_handler(handler); + return err; +} + +static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits) +{ + int words = bits >> 5; + + bits &= 0x1f; + + if (words) { + if (memcmp(a1, a2, words << 2)) + return 0; + } + if (bits) { + __be32 w1, w2; + __be32 mask; + + w1 = a1[words]; + w2 = a2[words]; + + mask = htonl((0xffffffff) << (32 - bits)); + + if ((w1 ^ w2) & mask) + return 0; + } + + return 1; +} + + +static int inet_diag_bc_run(const void *bc, int len, + const struct inet_diag_entry *entry) +{ + while (len > 0) { + int yes = 1; + const struct inet_diag_bc_op *op = bc; + + switch (op->code) { + case INET_DIAG_BC_NOP: + break; + case INET_DIAG_BC_JMP: + yes = 0; + break; + case INET_DIAG_BC_S_GE: + yes = entry->sport >= op[1].no; + break; + case INET_DIAG_BC_S_LE: + yes = entry->sport <= op[1].no; + break; + case INET_DIAG_BC_D_GE: + yes = entry->dport >= op[1].no; + break; + case INET_DIAG_BC_D_LE: + yes = entry->dport <= op[1].no; + break; + case INET_DIAG_BC_AUTO: + yes = !(entry->userlocks & SOCK_BINDPORT_LOCK); + break; + case INET_DIAG_BC_S_COND: + case INET_DIAG_BC_D_COND: { + struct inet_diag_hostcond *cond; + __be32 *addr; + + cond = (struct inet_diag_hostcond *)(op + 1); + if (cond->port != -1 && + cond->port != (op->code == INET_DIAG_BC_S_COND ? 
+ entry->sport : entry->dport)) { + yes = 0; + break; + } + + if (cond->prefix_len == 0) + break; + + if (op->code == INET_DIAG_BC_S_COND) + addr = entry->saddr; + else + addr = entry->daddr; + + if (bitstring_match(addr, cond->addr, + cond->prefix_len)) + break; + if (entry->family == AF_INET6 && + cond->family == AF_INET) { + if (addr[0] == 0 && addr[1] == 0 && + addr[2] == htonl(0xffff) && + bitstring_match(addr + 3, cond->addr, + cond->prefix_len)) + break; + } + yes = 0; + break; + } + } + + if (yes) { + len -= op->yes; + bc += op->yes; + } else { + len -= op->no; + bc += op->no; + } + } + return len == 0; +} + +static int valid_cc(const void *bc, int len, int cc) +{ + while (len >= 0) { + const struct inet_diag_bc_op *op = bc; + + if (cc > len) + return 0; + if (cc == len) + return 1; + if (op->yes < 4 || op->yes & 3) + return 0; + len -= op->yes; + bc += op->yes; + } + return 0; +} + +static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) +{ + const void *bc = bytecode; + int len = bytecode_len; + + while (len > 0) { + const struct inet_diag_bc_op *op = bc; + +//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); + switch (op->code) { + case INET_DIAG_BC_AUTO: + case INET_DIAG_BC_S_COND: + case INET_DIAG_BC_D_COND: + case INET_DIAG_BC_S_GE: + case INET_DIAG_BC_S_LE: + case INET_DIAG_BC_D_GE: + case INET_DIAG_BC_D_LE: + case INET_DIAG_BC_JMP: + if (op->no < 4 || op->no > len + 4 || op->no & 3) + return -EINVAL; + if (op->no < len && + !valid_cc(bytecode, bytecode_len, len - op->no)) + return -EINVAL; + break; + case INET_DIAG_BC_NOP: + break; + default: + return -EINVAL; + } + if (op->yes < 4 || op->yes > len + 4 || op->yes & 3) + return -EINVAL; + bc += op->yes; + len -= op->yes; + } + return len == 0 ? 
0 : -EINVAL; +} + +static int inet_csk_diag_dump(struct sock *sk, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct inet_diag_req *r = NLMSG_DATA(cb->nlh); + + if (nlmsg_attrlen(cb->nlh, sizeof(*r))) { + struct inet_diag_entry entry; + const struct nlattr *bc = nlmsg_find_attr(cb->nlh, + sizeof(*r), + INET_DIAG_REQ_BYTECODE); + struct inet_sock *inet = inet_sk(sk); + + entry.family = sk->sk_family; +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (entry.family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + entry.saddr = np->rcv_saddr.s6_addr32; + entry.daddr = np->daddr.s6_addr32; + } else +#endif + { + entry.saddr = &inet->inet_rcv_saddr; + entry.daddr = &inet->inet_daddr; + } + entry.sport = inet->inet_num; + entry.dport = ntohs(inet->inet_dport); + entry.userlocks = sk->sk_userlocks; + + if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry)) + return 0; + } + + return inet_csk_diag_fill(sk, skb, r->idiag_ext, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); +} + +static int inet_twsk_diag_dump(struct inet_timewait_sock *tw, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct inet_diag_req *r = NLMSG_DATA(cb->nlh); + + if (nlmsg_attrlen(cb->nlh, sizeof(*r))) { + struct inet_diag_entry entry; + const struct nlattr *bc = nlmsg_find_attr(cb->nlh, + sizeof(*r), + INET_DIAG_REQ_BYTECODE); + + entry.family = tw->tw_family; +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (tw->tw_family == AF_INET6) { + struct inet6_timewait_sock *tw6 = + inet6_twsk((struct sock *)tw); + entry.saddr = tw6->tw_v6_rcv_saddr.s6_addr32; + entry.daddr = tw6->tw_v6_daddr.s6_addr32; + } else +#endif + { + entry.saddr = &tw->tw_rcv_saddr; + entry.daddr = &tw->tw_daddr; + } + entry.sport = tw->tw_num; + entry.dport = ntohs(tw->tw_dport); + entry.userlocks = 0; + + if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry)) + return 0; + } + + return inet_twsk_diag_fill(tw, skb, r->idiag_ext, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); +} + +static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, + struct request_sock *req, u32 pid, u32 seq, + const struct nlmsghdr *unlh) +{ + const struct inet_request_sock *ireq = inet_rsk(req); + struct inet_sock *inet = inet_sk(sk); + unsigned char *b = skb_tail_pointer(skb); + struct inet_diag_msg *r; + struct nlmsghdr *nlh; + long tmo; + + nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); + nlh->nlmsg_flags = NLM_F_MULTI; + r = NLMSG_DATA(nlh); + + r->idiag_family = sk->sk_family; + r->idiag_state = TCP_SYN_RECV; + r->idiag_timer = 1; + r->idiag_retrans = req->retrans; + + r->id.idiag_if = sk->sk_bound_dev_if; + r->id.idiag_cookie[0] = (u32)(unsigned long)req; + r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1); + + tmo = req->expires - jiffies; + if (tmo < 0) + tmo = 0; + + r->id.idiag_sport = inet->inet_sport; + r->id.idiag_dport = ireq->rmt_port; + r->id.idiag_src[0] = ireq->loc_addr; + r->id.idiag_dst[0] = ireq->rmt_addr; + r->idiag_expires = jiffies_to_msecs(tmo); + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = sock_i_uid(sk); + r->idiag_inode = 0; +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (r->idiag_family == AF_INET6) { + ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, + &inet6_rsk(req)->loc_addr); + ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, + &inet6_rsk(req)->rmt_addr); + } +#endif + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + + return skb->len; + +nlmsg_failure: + 
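+	/* NLMSG_PUT() jumps here when the skb runs out of room: trim the
+	 * partially built record and return an error so the dump code can
+	 * record its position in cb->args and resume on the next read.
+	 */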
nlmsg_trim(skb, b); + return -1; +} + +static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, + struct netlink_callback *cb) +{ + struct inet_diag_entry entry; + struct inet_diag_req *r = NLMSG_DATA(cb->nlh); + struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt; + const struct nlattr *bc = NULL; + struct inet_sock *inet = inet_sk(sk); + int j, s_j; + int reqnum, s_reqnum; + int err = 0; + + s_j = cb->args[3]; + s_reqnum = cb->args[4]; + + if (s_j > 0) + s_j--; + + entry.family = sk->sk_family; + + read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + + lopt = icsk->icsk_accept_queue.listen_opt; + if (!lopt || !lopt->qlen) + goto out; + + if (nlmsg_attrlen(cb->nlh, sizeof(*r))) { + bc = nlmsg_find_attr(cb->nlh, sizeof(*r), + INET_DIAG_REQ_BYTECODE); + entry.sport = inet->inet_num; + entry.userlocks = sk->sk_userlocks; + } + + for (j = s_j; j < lopt->nr_table_entries; j++) { + struct request_sock *req, *head = lopt->syn_table[j]; + + reqnum = 0; + for (req = head; req; reqnum++, req = req->dl_next) { + struct inet_request_sock *ireq = inet_rsk(req); + + if (reqnum < s_reqnum) + continue; + if (r->id.idiag_dport != ireq->rmt_port && + r->id.idiag_dport) + continue; + + if (bc) { + entry.saddr = +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + (entry.family == AF_INET6) ? + inet6_rsk(req)->loc_addr.s6_addr32 : +#endif + &ireq->loc_addr; + entry.daddr = +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + (entry.family == AF_INET6) ? + inet6_rsk(req)->rmt_addr.s6_addr32 : +#endif + &ireq->rmt_addr; + entry.dport = ntohs(ireq->rmt_port); + + if (!inet_diag_bc_run(nla_data(bc), + nla_len(bc), &entry)) + continue; + } + + err = inet_diag_fill_req(skb, sk, req, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, cb->nlh); + if (err < 0) { + cb->args[3] = j + 1; + cb->args[4] = reqnum; + goto out; + } + } + + s_reqnum = 0; + } + +out: + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + + return err; +} + +static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int i, num; + int s_i, s_num; + struct inet_diag_req *r = NLMSG_DATA(cb->nlh); + const struct inet_diag_handler *handler; + struct inet_hashinfo *hashinfo; + + handler = inet_diag_lock_handler(cb->nlh->nlmsg_type); + if (IS_ERR(handler)) + goto unlock; + + hashinfo = handler->idiag_hashinfo; + + s_i = cb->args[1]; + s_num = num = cb->args[2]; + + if (cb->args[0] == 0) { + if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV))) + goto skip_listen_ht; + + for (i = s_i; i < INET_LHTABLE_SIZE; i++) { + struct sock *sk; + struct hlist_nulls_node *node; + struct inet_listen_hashbucket *ilb; + + num = 0; + ilb = &hashinfo->listening_hash[i]; + spin_lock_bh(&ilb->lock); + sk_nulls_for_each(sk, node, &ilb->head) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) { + num++; + continue; + } + + if (r->id.idiag_sport != inet->inet_sport && + r->id.idiag_sport) + goto next_listen; + + if (!(r->idiag_states & TCPF_LISTEN) || + r->id.idiag_dport || + cb->args[3] > 0) + goto syn_recv; + + if (inet_csk_diag_dump(sk, skb, cb) < 0) { + spin_unlock_bh(&ilb->lock); + goto done; + } + +syn_recv: + if (!(r->idiag_states & TCPF_SYN_RECV)) + goto next_listen; + + if (inet_diag_dump_reqs(skb, sk, cb) < 0) { + spin_unlock_bh(&ilb->lock); + goto done; + } + +next_listen: + cb->args[3] = 0; + cb->args[4] = 0; + ++num; + } + spin_unlock_bh(&ilb->lock); + + s_num = 0; + cb->args[3] = 0; + cb->args[4] = 0; + } +skip_listen_ht: + cb->args[0] = 1; + s_i = num = s_num = 0; + } 
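+	/* Dump position survives across netlink reads in cb->args:
+	 * args[0] selects the phase (0 = listening hash, 1 = established
+	 * and TIME_WAIT chains), args[1]/args[2] are the bucket and the
+	 * offset within it, and args[3]/args[4] track the SYN-queue slot
+	 * and request number for inet_diag_dump_reqs().
+	 */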
+ + if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) + goto unlock; + + for (i = s_i; i <= hashinfo->ehash_mask; i++) { + struct inet_ehash_bucket *head = &hashinfo->ehash[i]; + spinlock_t *lock = inet_ehash_lockp(hashinfo, i); + struct sock *sk; + struct hlist_nulls_node *node; + + num = 0; + + if (hlist_nulls_empty(&head->chain) && + hlist_nulls_empty(&head->twchain)) + continue; + + if (i > s_i) + s_num = 0; + + spin_lock_bh(lock); + sk_nulls_for_each(sk, node, &head->chain) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) + goto next_normal; + if (!(r->idiag_states & (1 << sk->sk_state))) + goto next_normal; + if (r->id.idiag_sport != inet->inet_sport && + r->id.idiag_sport) + goto next_normal; + if (r->id.idiag_dport != inet->inet_dport && + r->id.idiag_dport) + goto next_normal; + if (inet_csk_diag_dump(sk, skb, cb) < 0) { + spin_unlock_bh(lock); + goto done; + } +next_normal: + ++num; + } + + if (r->idiag_states & TCPF_TIME_WAIT) { + struct inet_timewait_sock *tw; + + inet_twsk_for_each(tw, node, + &head->twchain) { + + if (num < s_num) + goto next_dying; + if (r->id.idiag_sport != tw->tw_sport && + r->id.idiag_sport) + goto next_dying; + if (r->id.idiag_dport != tw->tw_dport && + r->id.idiag_dport) + goto next_dying; + if (inet_twsk_diag_dump(tw, skb, cb) < 0) { + spin_unlock_bh(lock); + goto done; + } +next_dying: + ++num; + } + } + spin_unlock_bh(lock); + } + +done: + cb->args[1] = i; + cb->args[2] = num; +unlock: + inet_diag_unlock_handler(handler); + return skb->len; +} + +static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + int hdrlen = sizeof(struct inet_diag_req); + + if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX || + nlmsg_len(nlh) < hdrlen) + return -EINVAL; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (nlmsg_attrlen(nlh, hdrlen)) { + struct nlattr *attr; + + attr = nlmsg_find_attr(nlh, hdrlen, + INET_DIAG_REQ_BYTECODE); + if (attr == NULL || + nla_len(attr) < sizeof(struct inet_diag_bc_op) || + inet_diag_bc_audit(nla_data(attr), nla_len(attr))) + return -EINVAL; + } + + return netlink_dump_start(idiagnl, skb, nlh, + inet_diag_dump, NULL); + } + + return inet_diag_get_exact(skb, nlh); +} + +static DEFINE_MUTEX(inet_diag_mutex); + +static void inet_diag_rcv(struct sk_buff *skb) +{ + mutex_lock(&inet_diag_mutex); + netlink_rcv_skb(skb, &inet_diag_rcv_msg); + mutex_unlock(&inet_diag_mutex); +} + +int inet_diag_register(const struct inet_diag_handler *h) +{ + const __u16 type = h->idiag_type; + int err = -EINVAL; + + if (type >= INET_DIAG_GETSOCK_MAX) + goto out; + + mutex_lock(&inet_diag_table_mutex); + err = -EEXIST; + if (inet_diag_table[type] == NULL) { + inet_diag_table[type] = h; + err = 0; + } + mutex_unlock(&inet_diag_table_mutex); +out: + return err; +} +EXPORT_SYMBOL_GPL(inet_diag_register); + +void inet_diag_unregister(const struct inet_diag_handler *h) +{ + const __u16 type = h->idiag_type; + + if (type >= INET_DIAG_GETSOCK_MAX) + return; + + mutex_lock(&inet_diag_table_mutex); + inet_diag_table[type] = NULL; + mutex_unlock(&inet_diag_table_mutex); +} +EXPORT_SYMBOL_GPL(inet_diag_unregister); + +static int __init inet_diag_init(void) +{ + const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX * + sizeof(struct inet_diag_handler *)); + int err = -ENOMEM; + + inet_diag_table = kzalloc(inet_diag_table_size, GFP_KERNEL); + if (!inet_diag_table) + goto out; + + idiagnl = netlink_kernel_create(&init_net, NETLINK_INET_DIAG, 0, + inet_diag_rcv, NULL, THIS_MODULE); + if (idiagnl == NULL) + goto out_free_table; + err = 0; +out: 
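+	/* idiagnl is the NETLINK_INET_DIAG kernel socket; inet_diag_table
+	 * stays empty until a transport module (e.g. tcp_diag) calls
+	 * inet_diag_register() to claim its nlmsg_type slot.
+	 */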
+ return err; +out_free_table: + kfree(inet_diag_table); + goto out; +} + +static void __exit inet_diag_exit(void) +{ + netlink_kernel_release(idiagnl); + kfree(inet_diag_table); +} + +module_init(inet_diag_init); +module_exit(inet_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_INET_DIAG); diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c new file mode 100644 index 00000000..5ff2a51b --- /dev/null +++ b/net/ipv4/inet_fragment.c @@ -0,0 +1,286 @@ +/* + * inet fragments management + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Pavel Emelyanov + * Started as consolidation of ipv4/ip_fragment.c, + * ipv6/reassembly. and ipv6 nf conntrack reassembly + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static void inet_frag_secret_rebuild(unsigned long dummy) +{ + struct inet_frags *f = (struct inet_frags *)dummy; + unsigned long now = jiffies; + int i; + + write_lock(&f->lock); + get_random_bytes(&f->rnd, sizeof(u32)); + for (i = 0; i < INETFRAGS_HASHSZ; i++) { + struct inet_frag_queue *q; + struct hlist_node *p, *n; + + hlist_for_each_entry_safe(q, p, n, &f->hash[i], list) { + unsigned int hval = f->hashfn(q); + + if (hval != i) { + hlist_del(&q->list); + + /* Relink to new hash chain. */ + hlist_add_head(&q->list, &f->hash[hval]); + } + } + } + write_unlock(&f->lock); + + mod_timer(&f->secret_timer, now + f->secret_interval); +} + +void inet_frags_init(struct inet_frags *f) +{ + int i; + + for (i = 0; i < INETFRAGS_HASHSZ; i++) + INIT_HLIST_HEAD(&f->hash[i]); + + rwlock_init(&f->lock); + + f->rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ + (jiffies ^ (jiffies >> 6))); + + setup_timer(&f->secret_timer, inet_frag_secret_rebuild, + (unsigned long)f); + f->secret_timer.expires = jiffies + f->secret_interval; + add_timer(&f->secret_timer); +} +EXPORT_SYMBOL(inet_frags_init); + +void inet_frags_init_net(struct netns_frags *nf) +{ + nf->nqueues = 0; + atomic_set(&nf->mem, 0); + INIT_LIST_HEAD(&nf->lru_list); +} +EXPORT_SYMBOL(inet_frags_init_net); + +void inet_frags_fini(struct inet_frags *f) +{ + del_timer(&f->secret_timer); +} +EXPORT_SYMBOL(inet_frags_fini); + +void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) +{ + nf->low_thresh = 0; + + local_bh_disable(); + inet_frag_evictor(nf, f); + local_bh_enable(); +} +EXPORT_SYMBOL(inet_frags_exit_net); + +static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) +{ + write_lock(&f->lock); + hlist_del(&fq->list); + list_del(&fq->lru_list); + fq->net->nqueues--; + write_unlock(&f->lock); +} + +void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) +{ + if (del_timer(&fq->timer)) + atomic_dec(&fq->refcnt); + + if (!(fq->last_in & INET_FRAG_COMPLETE)) { + fq_unlink(fq, f); + atomic_dec(&fq->refcnt); + fq->last_in |= INET_FRAG_COMPLETE; + } +} +EXPORT_SYMBOL(inet_frag_kill); + +static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f, + struct sk_buff *skb, int *work) +{ + if (work) + *work -= skb->truesize; + + atomic_sub(skb->truesize, &nf->mem); + if (f->skb_free) + f->skb_free(skb); + kfree_skb(skb); +} + +void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, + int *work) +{ + struct sk_buff *fp; + struct netns_frags 
*nf; + + WARN_ON(!(q->last_in & INET_FRAG_COMPLETE)); + WARN_ON(del_timer(&q->timer) != 0); + + /* Release all fragment data. */ + fp = q->fragments; + nf = q->net; + while (fp) { + struct sk_buff *xp = fp->next; + + frag_kfree_skb(nf, f, fp, work); + fp = xp; + } + + if (work) + *work -= f->qsize; + atomic_sub(f->qsize, &nf->mem); + + if (f->destructor) + f->destructor(q); + kfree(q); + +} +EXPORT_SYMBOL(inet_frag_destroy); + +int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f) +{ + struct inet_frag_queue *q; + int work, evicted = 0; + + work = atomic_read(&nf->mem) - nf->low_thresh; + while (work > 0) { + read_lock(&f->lock); + if (list_empty(&nf->lru_list)) { + read_unlock(&f->lock); + break; + } + + q = list_first_entry(&nf->lru_list, + struct inet_frag_queue, lru_list); + atomic_inc(&q->refcnt); + read_unlock(&f->lock); + + spin_lock(&q->lock); + if (!(q->last_in & INET_FRAG_COMPLETE)) + inet_frag_kill(q, f); + spin_unlock(&q->lock); + + if (atomic_dec_and_test(&q->refcnt)) + inet_frag_destroy(q, f, &work); + evicted++; + } + + return evicted; +} +EXPORT_SYMBOL(inet_frag_evictor); + +static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, + struct inet_frag_queue *qp_in, struct inet_frags *f, + void *arg) +{ + struct inet_frag_queue *qp; +#ifdef CONFIG_SMP + struct hlist_node *n; +#endif + unsigned int hash; + + write_lock(&f->lock); + /* + * While we stayed w/o the lock other CPU could update + * the rnd seed, so we need to re-calculate the hash + * chain. Fortunatelly the qp_in can be used to get one. + */ + hash = f->hashfn(qp_in); +#ifdef CONFIG_SMP + /* With SMP race we have to recheck hash table, because + * such entry could be created on other cpu, while we + * promoted read lock to write lock. + */ + hlist_for_each_entry(qp, n, &f->hash[hash], list) { + if (qp->net == nf && f->match(qp, arg)) { + atomic_inc(&qp->refcnt); + write_unlock(&f->lock); + qp_in->last_in |= INET_FRAG_COMPLETE; + inet_frag_put(qp_in, f); + return qp; + } + } +#endif + qp = qp_in; + if (!mod_timer(&qp->timer, jiffies + nf->timeout)) + atomic_inc(&qp->refcnt); + + atomic_inc(&qp->refcnt); + hlist_add_head(&qp->list, &f->hash[hash]); + list_add_tail(&qp->lru_list, &nf->lru_list); + nf->nqueues++; + write_unlock(&f->lock); + return qp; +} + +static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, + struct inet_frags *f, void *arg) +{ + struct inet_frag_queue *q; + + q = kzalloc(f->qsize, GFP_ATOMIC); + if (q == NULL) + return NULL; + + f->constructor(q, arg); + atomic_add(f->qsize, &nf->mem); + setup_timer(&q->timer, f->frag_expire, (unsigned long)q); + spin_lock_init(&q->lock); + atomic_set(&q->refcnt, 1); + q->net = nf; + + return q; +} + +static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, + struct inet_frags *f, void *arg) +{ + struct inet_frag_queue *q; + + q = inet_frag_alloc(nf, f, arg); + if (q == NULL) + return NULL; + + return inet_frag_intern(nf, q, f, arg); +} + +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, + struct inet_frags *f, void *key, unsigned int hash) + __releases(&f->lock) +{ + struct inet_frag_queue *q; + struct hlist_node *n; + + hlist_for_each_entry(q, n, &f->hash[hash], list) { + if (q->net == nf && f->match(q, key)) { + atomic_inc(&q->refcnt); + read_unlock(&f->lock); + return q; + } + } + read_unlock(&f->lock); + + return inet_frag_create(nf, f, key); +} +EXPORT_SYMBOL(inet_frag_find); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c new file mode 100644 index 
00000000..984ec656 --- /dev/null +++ b/net/ipv4/inet_hashtables.c @@ -0,0 +1,584 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Generic INET transport hashtables + * + * Authors: Lotsa people, from code originally in tcp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * Allocate and initialize a new local port bind bucket. + * The bindhash mutex for snum's hash chain must be held here. + */ +struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, + struct net *net, + struct inet_bind_hashbucket *head, + const unsigned short snum) +{ + struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); + + if (tb != NULL) { + write_pnet(&tb->ib_net, hold_net(net)); + tb->port = snum; + tb->fastreuse = 0; + tb->num_owners = 0; + INIT_HLIST_HEAD(&tb->owners); + hlist_add_head(&tb->node, &head->chain); + } + return tb; +} + +/* + * Caller must hold hashbucket lock for this tb with local BH disabled + */ +void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb) +{ + if (hlist_empty(&tb->owners)) { + __hlist_del(&tb->node); + release_net(ib_net(tb)); + kmem_cache_free(cachep, tb); + } +} + +void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, + const unsigned short snum) +{ + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + + atomic_inc(&hashinfo->bsockets); + + inet_sk(sk)->inet_num = snum; + sk_add_bind_node(sk, &tb->owners); + tb->num_owners++; + inet_csk(sk)->icsk_bind_hash = tb; +} + +/* + * Get rid of any references to a local port held by the given sock. + */ +static void __inet_put_port(struct sock *sk) +{ + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num, + hashinfo->bhash_size); + struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; + struct inet_bind_bucket *tb; + + atomic_dec(&hashinfo->bsockets); + + spin_lock(&head->lock); + tb = inet_csk(sk)->icsk_bind_hash; + __sk_del_bind_node(sk); + tb->num_owners--; + inet_csk(sk)->icsk_bind_hash = NULL; + inet_sk(sk)->inet_num = 0; + inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + spin_unlock(&head->lock); +} + +void inet_put_port(struct sock *sk) +{ + local_bh_disable(); + __inet_put_port(sk); + local_bh_enable(); +} +EXPORT_SYMBOL(inet_put_port); + +int __inet_inherit_port(struct sock *sk, struct sock *child) +{ + struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; + unsigned short port = inet_sk(child)->inet_num; + const int bhash = inet_bhashfn(sock_net(sk), port, + table->bhash_size); + struct inet_bind_hashbucket *head = &table->bhash[bhash]; + struct inet_bind_bucket *tb; + + spin_lock(&head->lock); + tb = inet_csk(sk)->icsk_bind_hash; + if (tb->port != port) { + /* NOTE: using tproxy and redirecting skbs to a proxy + * on a different listener port breaks the assumption + * that the listener socket's icsk_bind_hash is the same + * as that of the child socket. We have to look up or + * create a new bind bucket for the child here. 
*/ + struct hlist_node *node; + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (net_eq(ib_net(tb), sock_net(sk)) && + tb->port == port) + break; + } + if (!node) { + tb = inet_bind_bucket_create(table->bind_bucket_cachep, + sock_net(sk), head, port); + if (!tb) { + spin_unlock(&head->lock); + return -ENOMEM; + } + } + } + inet_bind_hash(child, tb, port); + spin_unlock(&head->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(__inet_inherit_port); + +static inline int compute_score(struct sock *sk, struct net *net, + const unsigned short hnum, const __be32 daddr, + const int dif) +{ + int score = -1; + struct inet_sock *inet = inet_sk(sk); + + if (net_eq(sock_net(sk), net) && inet->inet_num == hnum && + !ipv6_only_sock(sk)) { + __be32 rcv_saddr = inet->inet_rcv_saddr; + score = sk->sk_family == PF_INET ? 1 : 0; + if (rcv_saddr) { + if (rcv_saddr != daddr) + return -1; + score += 2; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score += 2; + } + } + return score; +} + +/* + * Don't inline this cruft. Here are some nice properties to exploit here. The + * BSD API does not allow a listening sock to specify the remote port nor the + * remote address for the connection. So always assume those are both + * wildcarded during the search since they can never be otherwise. + */ + + +struct sock *__inet_lookup_listener(struct net *net, + struct inet_hashinfo *hashinfo, + const __be32 daddr, const unsigned short hnum, + const int dif) +{ + struct sock *sk, *result; + struct hlist_nulls_node *node; + unsigned int hash = inet_lhashfn(net, hnum); + struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; + int score, hiscore; + + rcu_read_lock(); +begin: + result = NULL; + hiscore = -1; + sk_nulls_for_each_rcu(sk, node, &ilb->head) { + score = compute_score(sk, net, hnum, daddr, dif); + if (score > hiscore) { + result = sk; + hiscore = score; + } + } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) + goto begin; + if (result) { + if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) + result = NULL; + else if (unlikely(compute_score(result, net, hnum, daddr, + dif) < hiscore)) { + sock_put(result); + goto begin; + } + } + rcu_read_unlock(); + return result; +} +EXPORT_SYMBOL_GPL(__inet_lookup_listener); + +struct sock * __inet_lookup_established(struct net *net, + struct inet_hashinfo *hashinfo, + const __be32 saddr, const __be16 sport, + const __be32 daddr, const u16 hnum, + const int dif) +{ + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __portpair ports = INET_COMBINED_PORTS(sport, hnum); + struct sock *sk; + const struct hlist_nulls_node *node; + /* Optimize here for direct hit, only listening connections can + * have wildcards anyways. 
+ */ + unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); + unsigned int slot = hash & hashinfo->ehash_mask; + struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; + + rcu_read_lock(); +begin: + sk_nulls_for_each_rcu(sk, node, &head->chain) { + if (INET_MATCH(sk, net, hash, acookie, + saddr, daddr, ports, dif)) { + if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) + goto begintw; + if (unlikely(!INET_MATCH(sk, net, hash, acookie, + saddr, daddr, ports, dif))) { + sock_put(sk); + goto begin; + } + goto out; + } + } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != slot) + goto begin; + +begintw: + /* Must check for a TIME_WAIT'er before going to listener hash. */ + sk_nulls_for_each_rcu(sk, node, &head->twchain) { + if (INET_TW_MATCH(sk, net, hash, acookie, + saddr, daddr, ports, dif)) { + if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) { + sk = NULL; + goto out; + } + if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie, + saddr, daddr, ports, dif))) { + sock_put(sk); + goto begintw; + } + goto out; + } + } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != slot) + goto begintw; + sk = NULL; +out: + rcu_read_unlock(); + return sk; +} +EXPORT_SYMBOL_GPL(__inet_lookup_established); + +/* called with local bh disabled */ +static int __inet_check_established(struct inet_timewait_death_row *death_row, + struct sock *sk, __u16 lport, + struct inet_timewait_sock **twp) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + struct inet_sock *inet = inet_sk(sk); + __be32 daddr = inet->inet_rcv_saddr; + __be32 saddr = inet->inet_daddr; + int dif = sk->sk_bound_dev_if; + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); + struct net *net = sock_net(sk); + unsigned int hash = inet_ehashfn(net, daddr, lport, + saddr, inet->inet_dport); + struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); + spinlock_t *lock = inet_ehash_lockp(hinfo, hash); + struct sock *sk2; + const struct hlist_nulls_node *node; + struct inet_timewait_sock *tw; + int twrefcnt = 0; + + spin_lock(lock); + + /* Check TIME-WAIT sockets first. */ + sk_nulls_for_each(sk2, node, &head->twchain) { + tw = inet_twsk(sk2); + + if (INET_TW_MATCH(sk2, net, hash, acookie, + saddr, daddr, ports, dif)) { + if (twsk_unique(sk, sk2, twp)) + goto unique; + else + goto not_unique; + } + } + tw = NULL; + + /* And established part... */ + sk_nulls_for_each(sk2, node, &head->chain) { + if (INET_MATCH(sk2, net, hash, acookie, + saddr, daddr, ports, dif)) + goto not_unique; + } + +unique: + /* Must record num and sport now. Otherwise we will see + * in hash table socket with a funny identity. */ + inet->inet_num = lport; + inet->inet_sport = htons(lport); + sk->sk_hash = hash; + WARN_ON(!sk_unhashed(sk)); + __sk_nulls_add_node_rcu(sk, &head->chain); + if (tw) { + twrefcnt = inet_twsk_unhash(tw); + NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); + } + spin_unlock(lock); + if (twrefcnt) + inet_twsk_put(tw); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + + if (twp) { + *twp = tw; + } else if (tw) { + /* Silly. Should hash-dance instead... 
*/ + inet_twsk_deschedule(tw, death_row); + + inet_twsk_put(tw); + } + return 0; + +not_unique: + spin_unlock(lock); + return -EADDRNOTAVAIL; +} + +static inline u32 inet_sk_port_offset(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, + inet->inet_daddr, + inet->inet_dport); +} + +int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw) +{ + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + struct hlist_nulls_head *list; + spinlock_t *lock; + struct inet_ehash_bucket *head; + int twrefcnt = 0; + + WARN_ON(!sk_unhashed(sk)); + + sk->sk_hash = inet_sk_ehashfn(sk); + head = inet_ehash_bucket(hashinfo, sk->sk_hash); + list = &head->chain; + lock = inet_ehash_lockp(hashinfo, sk->sk_hash); + + spin_lock(lock); + __sk_nulls_add_node_rcu(sk, list); + if (tw) { + WARN_ON(sk->sk_hash != tw->tw_hash); + twrefcnt = inet_twsk_unhash(tw); + } + spin_unlock(lock); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + return twrefcnt; +} +EXPORT_SYMBOL_GPL(__inet_hash_nolisten); + +static void __inet_hash(struct sock *sk) +{ + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + struct inet_listen_hashbucket *ilb; + + if (sk->sk_state != TCP_LISTEN) { + __inet_hash_nolisten(sk, NULL); + return; + } + + WARN_ON(!sk_unhashed(sk)); + ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + + spin_lock(&ilb->lock); + __sk_nulls_add_node_rcu(sk, &ilb->head); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + spin_unlock(&ilb->lock); +} + +void inet_hash(struct sock *sk) +{ + if (sk->sk_state != TCP_CLOSE) { + local_bh_disable(); + __inet_hash(sk); + local_bh_enable(); + } +} +EXPORT_SYMBOL_GPL(inet_hash); + +void inet_unhash(struct sock *sk) +{ + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + spinlock_t *lock; + int done; + + if (sk_unhashed(sk)) + return; + + if (sk->sk_state == TCP_LISTEN) + lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock; + else + lock = inet_ehash_lockp(hashinfo, sk->sk_hash); + + spin_lock_bh(lock); + done =__sk_nulls_del_node_init_rcu(sk); + if (done) + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + spin_unlock_bh(lock); +} +EXPORT_SYMBOL_GPL(inet_unhash); + +int __inet_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk, u32 port_offset, + int (*check_established)(struct inet_timewait_death_row *, + struct sock *, __u16, struct inet_timewait_sock **), + int (*hash)(struct sock *sk, struct inet_timewait_sock *twp)) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + const unsigned short snum = inet_sk(sk)->inet_num; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; + int ret; + struct net *net = sock_net(sk); + int twrefcnt = 1; + + if (!snum) { + int i, remaining, low, high, port; + static u32 hint; + u32 offset = hint + port_offset; + struct hlist_node *node; + struct inet_timewait_sock *tw = NULL; + + inet_get_local_port_range(&low, &high); + remaining = (high - low) + 1; + + local_bh_disable(); + for (i = 1; i <= remaining; i++) { + port = low + (i + offset) % remaining; + if (inet_is_reserved_local_port(port)) + continue; + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, + * because the established check is already + * unique enough. 
+ */ + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (net_eq(ib_net(tb), net) && + tb->port == port) { + if (tb->fastreuse >= 0) + goto next_port; + WARN_ON(hlist_empty(&tb->owners)); + if (!check_established(death_row, sk, + port, &tw)) + goto ok; + goto next_port; + } + } + + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, + net, head, port); + if (!tb) { + spin_unlock(&head->lock); + break; + } + tb->fastreuse = -1; + goto ok; + + next_port: + spin_unlock(&head->lock); + } + local_bh_enable(); + + return -EADDRNOTAVAIL; + +ok: + hint += i; + + /* Head lock still held and bh's disabled */ + inet_bind_hash(sk, tb, port); + if (sk_unhashed(sk)) { + inet_sk(sk)->inet_sport = htons(port); + twrefcnt += hash(sk, tw); + } + if (tw) + twrefcnt += inet_twsk_bind_unhash(tw, hinfo); + spin_unlock(&head->lock); + + if (tw) { + inet_twsk_deschedule(tw, death_row); + while (twrefcnt) { + twrefcnt--; + inet_twsk_put(tw); + } + } + + ret = 0; + goto out; + } + + head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { + hash(sk, NULL); + spin_unlock_bh(&head->lock); + return 0; + } else { + spin_unlock(&head->lock); + /* No definite answer... Walk to established hash table */ + ret = check_established(death_row, sk, snum, NULL); +out: + local_bh_enable(); + return ret; + } +} + +/* + * Bind a port for a connect operation and hash it. + */ +int inet_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk) +{ + return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk), + __inet_check_established, __inet_hash_nolisten); +} +EXPORT_SYMBOL_GPL(inet_hash_connect); + +void inet_hashinfo_init(struct inet_hashinfo *h) +{ + int i; + + atomic_set(&h->bsockets, 0); + for (i = 0; i < INET_LHTABLE_SIZE; i++) { + spin_lock_init(&h->listening_hash[i].lock); + INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head, + i + LISTENING_NULLS_BASE); + } +} +EXPORT_SYMBOL_GPL(inet_hashinfo_init); diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c new file mode 100644 index 00000000..85a0f75d --- /dev/null +++ b/net/ipv4/inet_lro.c @@ -0,0 +1,600 @@ +/* + * linux/net/ipv4/inet_lro.c + * + * Large Receive Offload (ipv4 / tcp) + * + * (C) Copyright IBM Corp. 2007 + * + * Authors: + * Jan-Bernd Themann + * Christoph Raisch + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + + +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jan-Bernd Themann "); +MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)"); + +#define TCP_HDR_LEN(tcph) (tcph->doff << 2) +#define IP_HDR_LEN(iph) (iph->ihl << 2) +#define TCP_PAYLOAD_LENGTH(iph, tcph) \ + (ntohs(iph->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph)) + +#define IPH_LEN_WO_OPTIONS 5 +#define TCPH_LEN_WO_OPTIONS 5 +#define TCPH_LEN_W_TIMESTAMP 8 + +#define LRO_MAX_PG_HLEN 64 + +#define LRO_INC_STATS(lro_mgr, attr) { lro_mgr->stats.attr++; } + +/* + * Basic tcp checks whether packet is suitable for LRO + */ + +static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph, + int len, const struct net_lro_desc *lro_desc) +{ + /* check ip header: don't aggregate padded frames */ + if (ntohs(iph->tot_len) != len) + return -1; + + if (TCP_PAYLOAD_LENGTH(iph, tcph) == 0) + return -1; + + if (iph->ihl != IPH_LEN_WO_OPTIONS) + return -1; + + if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack || + tcph->rst || tcph->syn || tcph->fin) + return -1; + + if (INET_ECN_is_ce(ipv4_get_dsfield(iph))) + return -1; + + if (tcph->doff != TCPH_LEN_WO_OPTIONS && + tcph->doff != TCPH_LEN_W_TIMESTAMP) + return -1; + + /* check tcp options (only timestamp allowed) */ + if (tcph->doff == TCPH_LEN_W_TIMESTAMP) { + __be32 *topt = (__be32 *)(tcph + 1); + + if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) + | TCPOLEN_TIMESTAMP)) + return -1; + + /* timestamp should be in right order */ + topt++; + if (lro_desc && after(ntohl(lro_desc->tcp_rcv_tsval), + ntohl(*topt))) + return -1; + + /* timestamp reply should not be zero */ + topt++; + if (*topt == 0) + return -1; + } + + return 0; +} + +static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc) +{ + struct iphdr *iph = lro_desc->iph; + struct tcphdr *tcph = lro_desc->tcph; + __be32 *p; + __wsum tcp_hdr_csum; + + tcph->ack_seq = lro_desc->tcp_ack; + tcph->window = lro_desc->tcp_window; + + if (lro_desc->tcp_saw_tstamp) { + p = (__be32 *)(tcph + 1); + *(p+2) = lro_desc->tcp_rcv_tsecr; + } + + iph->tot_len = htons(lro_desc->ip_tot_len); + + iph->check = 0; + iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl); + + tcph->check = 0; + tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0); + lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum); + tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + lro_desc->ip_tot_len - + IP_HDR_LEN(iph), IPPROTO_TCP, + lro_desc->data_csum); +} + +static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len) +{ + __wsum tcp_csum; + __wsum tcp_hdr_csum; + __wsum tcp_ps_hdr_csum; + + tcp_csum = ~csum_unfold(tcph->check); + tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), tcp_csum); + + tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, + len + TCP_HDR_LEN(tcph), + IPPROTO_TCP, 0); + + return csum_sub(csum_sub(tcp_csum, tcp_hdr_csum), + tcp_ps_hdr_csum); +} + +static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb, + struct iphdr *iph, struct tcphdr *tcph, + u16 vlan_tag, struct vlan_group *vgrp) +{ + int nr_frags; + __be32 *ptr; + u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); + + nr_frags = skb_shinfo(skb)->nr_frags; + lro_desc->parent = skb; + lro_desc->next_frag = &(skb_shinfo(skb)->frags[nr_frags]); + lro_desc->iph = iph; + lro_desc->tcph = tcph; + lro_desc->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len; + lro_desc->tcp_ack = tcph->ack_seq; + lro_desc->tcp_window = tcph->window; + + 
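+	/* The descriptor keeps the first segment's TCP/IP headers;
+	 * lro_add_common() later only advances tcp_next_seq/ip_tot_len
+	 * and refreshes ack/window.  doff == 8 below
+	 * (TCPH_LEN_W_TIMESTAMP) means the timestamp option is the only
+	 * one present, which lro_tcp_ip_check() has already verified.
+	 */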
lro_desc->pkt_aggr_cnt = 1; + lro_desc->ip_tot_len = ntohs(iph->tot_len); + + if (tcph->doff == 8) { + ptr = (__be32 *)(tcph+1); + lro_desc->tcp_saw_tstamp = 1; + lro_desc->tcp_rcv_tsval = *(ptr+1); + lro_desc->tcp_rcv_tsecr = *(ptr+2); + } + + lro_desc->mss = tcp_data_len; + lro_desc->vgrp = vgrp; + lro_desc->vlan_tag = vlan_tag; + lro_desc->active = 1; + + lro_desc->data_csum = lro_tcp_data_csum(iph, tcph, + tcp_data_len); +} + +static inline void lro_clear_desc(struct net_lro_desc *lro_desc) +{ + memset(lro_desc, 0, sizeof(struct net_lro_desc)); +} + +static void lro_add_common(struct net_lro_desc *lro_desc, struct iphdr *iph, + struct tcphdr *tcph, int tcp_data_len) +{ + struct sk_buff *parent = lro_desc->parent; + __be32 *topt; + + lro_desc->pkt_aggr_cnt++; + lro_desc->ip_tot_len += tcp_data_len; + lro_desc->tcp_next_seq += tcp_data_len; + lro_desc->tcp_window = tcph->window; + lro_desc->tcp_ack = tcph->ack_seq; + + /* don't update tcp_rcv_tsval, would not work with PAWS */ + if (lro_desc->tcp_saw_tstamp) { + topt = (__be32 *) (tcph + 1); + lro_desc->tcp_rcv_tsecr = *(topt + 2); + } + + lro_desc->data_csum = csum_block_add(lro_desc->data_csum, + lro_tcp_data_csum(iph, tcph, + tcp_data_len), + parent->len); + + parent->len += tcp_data_len; + parent->data_len += tcp_data_len; + if (tcp_data_len > lro_desc->mss) + lro_desc->mss = tcp_data_len; +} + +static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb, + struct iphdr *iph, struct tcphdr *tcph) +{ + struct sk_buff *parent = lro_desc->parent; + int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); + + lro_add_common(lro_desc, iph, tcph, tcp_data_len); + + skb_pull(skb, (skb->len - tcp_data_len)); + parent->truesize += skb->truesize; + + if (lro_desc->last_skb) + lro_desc->last_skb->next = skb; + else + skb_shinfo(parent)->frag_list = skb; + + lro_desc->last_skb = skb; +} + +static void lro_add_frags(struct net_lro_desc *lro_desc, + int len, int hlen, int truesize, + struct skb_frag_struct *skb_frags, + struct iphdr *iph, struct tcphdr *tcph) +{ + struct sk_buff *skb = lro_desc->parent; + int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); + + lro_add_common(lro_desc, iph, tcph, tcp_data_len); + + skb->truesize += truesize; + + skb_frags[0].page_offset += hlen; + skb_frags[0].size -= hlen; + + while (tcp_data_len > 0) { + *(lro_desc->next_frag) = *skb_frags; + tcp_data_len -= skb_frags->size; + lro_desc->next_frag++; + skb_frags++; + skb_shinfo(skb)->nr_frags++; + } +} + +static int lro_check_tcp_conn(struct net_lro_desc *lro_desc, + struct iphdr *iph, + struct tcphdr *tcph) +{ + if ((lro_desc->iph->saddr != iph->saddr) || + (lro_desc->iph->daddr != iph->daddr) || + (lro_desc->tcph->source != tcph->source) || + (lro_desc->tcph->dest != tcph->dest)) + return -1; + return 0; +} + +static struct net_lro_desc *lro_get_desc(struct net_lro_mgr *lro_mgr, + struct net_lro_desc *lro_arr, + struct iphdr *iph, + struct tcphdr *tcph) +{ + struct net_lro_desc *lro_desc = NULL; + struct net_lro_desc *tmp; + int max_desc = lro_mgr->max_desc; + int i; + + for (i = 0; i < max_desc; i++) { + tmp = &lro_arr[i]; + if (tmp->active) + if (!lro_check_tcp_conn(tmp, iph, tcph)) { + lro_desc = tmp; + goto out; + } + } + + for (i = 0; i < max_desc; i++) { + if (!lro_arr[i].active) { + lro_desc = &lro_arr[i]; + goto out; + } + } + + LRO_INC_STATS(lro_mgr, no_desc); +out: + return lro_desc; +} + +static void lro_flush(struct net_lro_mgr *lro_mgr, + struct net_lro_desc *lro_desc) +{ + if (lro_desc->pkt_aggr_cnt > 1) + 
lro_update_tcp_ip_header(lro_desc); + + skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss; + + if (lro_desc->vgrp) { + if (lro_mgr->features & LRO_F_NAPI) + vlan_hwaccel_receive_skb(lro_desc->parent, + lro_desc->vgrp, + lro_desc->vlan_tag); + else + vlan_hwaccel_rx(lro_desc->parent, + lro_desc->vgrp, + lro_desc->vlan_tag); + + } else { + if (lro_mgr->features & LRO_F_NAPI) + netif_receive_skb(lro_desc->parent); + else + netif_rx(lro_desc->parent); + } + + LRO_INC_STATS(lro_mgr, flushed); + lro_clear_desc(lro_desc); +} + +static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, + struct vlan_group *vgrp, u16 vlan_tag, void *priv) +{ + struct net_lro_desc *lro_desc; + struct iphdr *iph; + struct tcphdr *tcph; + u64 flags; + int vlan_hdr_len = 0; + + if (!lro_mgr->get_skb_header || + lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph, + &flags, priv)) + goto out; + + if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) + goto out; + + lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); + if (!lro_desc) + goto out; + + if ((skb->protocol == htons(ETH_P_8021Q)) && + !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) + vlan_hdr_len = VLAN_HLEN; + + if (!lro_desc->active) { /* start new lro session */ + if (lro_tcp_ip_check(iph, tcph, skb->len - vlan_hdr_len, NULL)) + goto out; + + skb->ip_summed = lro_mgr->ip_summed_aggr; + lro_init_desc(lro_desc, skb, iph, tcph, vlan_tag, vgrp); + LRO_INC_STATS(lro_mgr, aggregated); + return 0; + } + + if (lro_desc->tcp_next_seq != ntohl(tcph->seq)) + goto out2; + + if (lro_tcp_ip_check(iph, tcph, skb->len, lro_desc)) + goto out2; + + lro_add_packet(lro_desc, skb, iph, tcph); + LRO_INC_STATS(lro_mgr, aggregated); + + if ((lro_desc->pkt_aggr_cnt >= lro_mgr->max_aggr) || + lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu)) + lro_flush(lro_mgr, lro_desc); + + return 0; + +out2: /* send aggregated SKBs to stack */ + lro_flush(lro_mgr, lro_desc); + +out: + return 1; +} + + +static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr, + struct skb_frag_struct *frags, + int len, int true_size, + void *mac_hdr, + int hlen, __wsum sum, + u32 ip_summed) +{ + struct sk_buff *skb; + struct skb_frag_struct *skb_frags; + int data_len = len; + int hdr_len = min(len, hlen); + + skb = netdev_alloc_skb(lro_mgr->dev, hlen + lro_mgr->frag_align_pad); + if (!skb) + return NULL; + + skb_reserve(skb, lro_mgr->frag_align_pad); + skb->len = len; + skb->data_len = len - hdr_len; + skb->truesize += true_size; + skb->tail += hdr_len; + + memcpy(skb->data, mac_hdr, hdr_len); + + skb_frags = skb_shinfo(skb)->frags; + while (data_len > 0) { + *skb_frags = *frags; + data_len -= frags->size; + skb_frags++; + frags++; + skb_shinfo(skb)->nr_frags++; + } + + skb_shinfo(skb)->frags[0].page_offset += hdr_len; + skb_shinfo(skb)->frags[0].size -= hdr_len; + + skb->ip_summed = ip_summed; + skb->csum = sum; + skb->protocol = eth_type_trans(skb, lro_mgr->dev); + return skb; +} + +static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr, + struct skb_frag_struct *frags, + int len, int true_size, + struct vlan_group *vgrp, + u16 vlan_tag, void *priv, __wsum sum) +{ + struct net_lro_desc *lro_desc; + struct iphdr *iph; + struct tcphdr *tcph; + struct sk_buff *skb; + u64 flags; + void *mac_hdr; + int mac_hdr_len; + int hdr_len = LRO_MAX_PG_HLEN; + int vlan_hdr_len = 0; + + if (!lro_mgr->get_frag_header || + lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph, + (void *)&tcph, &flags, priv)) { + mac_hdr = page_address(frags->page) + 
frags->page_offset; + goto out1; + } + + if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) + goto out1; + + hdr_len = (int)((void *)(tcph) + TCP_HDR_LEN(tcph) - mac_hdr); + mac_hdr_len = (int)((void *)(iph) - mac_hdr); + + lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); + if (!lro_desc) + goto out1; + + if (!lro_desc->active) { /* start new lro session */ + if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, NULL)) + goto out1; + + skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr, + hdr_len, 0, lro_mgr->ip_summed_aggr); + if (!skb) + goto out; + + if ((skb->protocol == htons(ETH_P_8021Q)) && + !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) + vlan_hdr_len = VLAN_HLEN; + + iph = (void *)(skb->data + vlan_hdr_len); + tcph = (void *)((u8 *)skb->data + vlan_hdr_len + + IP_HDR_LEN(iph)); + + lro_init_desc(lro_desc, skb, iph, tcph, 0, NULL); + LRO_INC_STATS(lro_mgr, aggregated); + return NULL; + } + + if (lro_desc->tcp_next_seq != ntohl(tcph->seq)) + goto out2; + + if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, lro_desc)) + goto out2; + + lro_add_frags(lro_desc, len, hdr_len, true_size, frags, iph, tcph); + LRO_INC_STATS(lro_mgr, aggregated); + + if ((skb_shinfo(lro_desc->parent)->nr_frags >= lro_mgr->max_aggr) || + lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu)) + lro_flush(lro_mgr, lro_desc); + + return NULL; + +out2: /* send aggregated packets to the stack */ + lro_flush(lro_mgr, lro_desc); + +out1: /* Original packet has to be posted to the stack */ + skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr, + hdr_len, sum, lro_mgr->ip_summed); +out: + return skb; +} + +void lro_receive_skb(struct net_lro_mgr *lro_mgr, + struct sk_buff *skb, + void *priv) +{ + if (__lro_proc_skb(lro_mgr, skb, NULL, 0, priv)) { + if (lro_mgr->features & LRO_F_NAPI) + netif_receive_skb(skb); + else + netif_rx(skb); + } +} +EXPORT_SYMBOL(lro_receive_skb); + +void lro_vlan_hwaccel_receive_skb(struct net_lro_mgr *lro_mgr, + struct sk_buff *skb, + struct vlan_group *vgrp, + u16 vlan_tag, + void *priv) +{ + if (__lro_proc_skb(lro_mgr, skb, vgrp, vlan_tag, priv)) { + if (lro_mgr->features & LRO_F_NAPI) + vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag); + else + vlan_hwaccel_rx(skb, vgrp, vlan_tag); + } +} +EXPORT_SYMBOL(lro_vlan_hwaccel_receive_skb); + +void lro_receive_frags(struct net_lro_mgr *lro_mgr, + struct skb_frag_struct *frags, + int len, int true_size, void *priv, __wsum sum) +{ + struct sk_buff *skb; + + skb = __lro_proc_segment(lro_mgr, frags, len, true_size, NULL, 0, + priv, sum); + if (!skb) + return; + + if (lro_mgr->features & LRO_F_NAPI) + netif_receive_skb(skb); + else + netif_rx(skb); +} +EXPORT_SYMBOL(lro_receive_frags); + +void lro_vlan_hwaccel_receive_frags(struct net_lro_mgr *lro_mgr, + struct skb_frag_struct *frags, + int len, int true_size, + struct vlan_group *vgrp, + u16 vlan_tag, void *priv, __wsum sum) +{ + struct sk_buff *skb; + + skb = __lro_proc_segment(lro_mgr, frags, len, true_size, vgrp, + vlan_tag, priv, sum); + if (!skb) + return; + + if (lro_mgr->features & LRO_F_NAPI) + vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag); + else + vlan_hwaccel_rx(skb, vgrp, vlan_tag); +} +EXPORT_SYMBOL(lro_vlan_hwaccel_receive_frags); + +void lro_flush_all(struct net_lro_mgr *lro_mgr) +{ + int i; + struct net_lro_desc *lro_desc = lro_mgr->lro_arr; + + for (i = 0; i < lro_mgr->max_desc; i++) { + if (lro_desc[i].active) + lro_flush(lro_mgr, &lro_desc[i]); + } +} +EXPORT_SYMBOL(lro_flush_all); + +void lro_flush_pkt(struct net_lro_mgr *lro_mgr, + struct iphdr *iph, struct 
tcphdr *tcph) +{ + struct net_lro_desc *lro_desc; + + lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); + if (lro_desc->active) + lro_flush(lro_mgr, lro_desc); +} +EXPORT_SYMBOL(lro_flush_pkt); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c new file mode 100644 index 00000000..3c8dfa16 --- /dev/null +++ b/net/ipv4/inet_timewait_sock.c @@ -0,0 +1,523 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Generic TIME_WAIT sockets functions + * + * From code orinally in TCP + */ + +#include +#include +#include +#include +#include +#include + + +/** + * inet_twsk_unhash - unhash a timewait socket from established hash + * @tw: timewait socket + * + * unhash a timewait socket from established hash, if hashed. + * ehash lock must be held by caller. + * Returns 1 if caller should call inet_twsk_put() after lock release. + */ +int inet_twsk_unhash(struct inet_timewait_sock *tw) +{ + if (hlist_nulls_unhashed(&tw->tw_node)) + return 0; + + hlist_nulls_del_rcu(&tw->tw_node); + sk_nulls_node_init(&tw->tw_node); + /* + * We cannot call inet_twsk_put() ourself under lock, + * caller must call it for us. + */ + return 1; +} + +/** + * inet_twsk_bind_unhash - unhash a timewait socket from bind hash + * @tw: timewait socket + * @hashinfo: hashinfo pointer + * + * unhash a timewait socket from bind hash, if hashed. + * bind hash lock must be held by caller. + * Returns 1 if caller should call inet_twsk_put() after lock release. + */ +int inet_twsk_bind_unhash(struct inet_timewait_sock *tw, + struct inet_hashinfo *hashinfo) +{ + struct inet_bind_bucket *tb = tw->tw_tb; + + if (!tb) + return 0; + + __hlist_del(&tw->tw_bind_node); + tw->tw_tb = NULL; + inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + /* + * We cannot call inet_twsk_put() ourself under lock, + * caller must call it for us. + */ + return 1; +} + +/* Must be called with locally disabled BHs. */ +static void __inet_twsk_kill(struct inet_timewait_sock *tw, + struct inet_hashinfo *hashinfo) +{ + struct inet_bind_hashbucket *bhead; + int refcnt; + /* Unlink from established hashes. */ + spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); + + spin_lock(lock); + refcnt = inet_twsk_unhash(tw); + spin_unlock(lock); + + /* Disassociate with bind bucket. */ + bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num, + hashinfo->bhash_size)]; + + spin_lock(&bhead->lock); + refcnt += inet_twsk_bind_unhash(tw, hashinfo); + spin_unlock(&bhead->lock); + +#ifdef SOCK_REFCNT_DEBUG + if (atomic_read(&tw->tw_refcnt) != 1) { + printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n", + tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); + } +#endif + while (refcnt) { + inet_twsk_put(tw); + refcnt--; + } +} + +static noinline void inet_twsk_free(struct inet_timewait_sock *tw) +{ + struct module *owner = tw->tw_prot->owner; + twsk_destructor((struct sock *)tw); +#ifdef SOCK_REFCNT_DEBUG + pr_debug("%s timewait_sock %p released\n", tw->tw_prot->name, tw); +#endif + release_net(twsk_net(tw)); + kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); + module_put(owner); +} + +void inet_twsk_put(struct inet_timewait_sock *tw) +{ + if (atomic_dec_and_test(&tw->tw_refcnt)) + inet_twsk_free(tw); +} +EXPORT_SYMBOL_GPL(inet_twsk_put); + +/* + * Enter the time wait state. This is called with locally disabled BH. 
+ * Essentially we whip up a timewait bucket, copy the relevant info into it + * from the SK, and mess with hash chains and list linkage. + */ +void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, + struct inet_hashinfo *hashinfo) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash); + spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); + struct inet_bind_hashbucket *bhead; + /* Step 1: Put TW into bind hash. Original socket stays there too. + Note, that any socket with inet->num != 0 MUST be bound in + binding cache, even if it is closed. + */ + bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, + hashinfo->bhash_size)]; + spin_lock(&bhead->lock); + tw->tw_tb = icsk->icsk_bind_hash; + WARN_ON(!icsk->icsk_bind_hash); + inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); + spin_unlock(&bhead->lock); + + spin_lock(lock); + + /* + * Step 2: Hash TW into TIMEWAIT chain. + * Should be done before removing sk from established chain + * because readers are lockless and search established first. + */ + inet_twsk_add_node_rcu(tw, &ehead->twchain); + + /* Step 3: Remove SK from established hash. */ + if (__sk_nulls_del_node_init_rcu(sk)) + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + + /* + * Notes : + * - We initially set tw_refcnt to 0 in inet_twsk_alloc() + * - We add one reference for the bhash link + * - We add one reference for the ehash link + * - We want this refcnt update done before allowing other + * threads to find this tw in ehash chain. + */ + atomic_add(1 + 1 + 1, &tw->tw_refcnt); + + spin_unlock(lock); +} +EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); + +struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) +{ + struct inet_timewait_sock *tw = + kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, + GFP_ATOMIC); + if (tw != NULL) { + const struct inet_sock *inet = inet_sk(sk); + + kmemcheck_annotate_bitfield(tw, flags); + + /* Give us an identity. */ + tw->tw_daddr = inet->inet_daddr; + tw->tw_rcv_saddr = inet->inet_rcv_saddr; + tw->tw_bound_dev_if = sk->sk_bound_dev_if; + tw->tw_num = inet->inet_num; + tw->tw_state = TCP_TIME_WAIT; + tw->tw_substate = state; + tw->tw_sport = inet->inet_sport; + tw->tw_dport = inet->inet_dport; + tw->tw_family = sk->sk_family; + tw->tw_reuse = sk->sk_reuse; + tw->tw_hash = sk->sk_hash; + tw->tw_ipv6only = 0; + tw->tw_transparent = inet->transparent; + tw->tw_prot = sk->sk_prot_creator; + twsk_net_set(tw, hold_net(sock_net(sk))); + /* + * Because we use RCU lookups, we should not set tw_refcnt + * to a non null value before everything is setup for this + * timewait socket. + */ + atomic_set(&tw->tw_refcnt, 0); + inet_twsk_dead_node_init(tw); + __module_get(tw->tw_prot->owner); + } + + return tw; +} +EXPORT_SYMBOL_GPL(inet_twsk_alloc); + +/* Returns non-zero if quota exceeded. */ +static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr, + const int slot) +{ + struct inet_timewait_sock *tw; + struct hlist_node *node; + unsigned int killed; + int ret; + + /* NOTE: compare this to previous version where lock + * was released after detaching chain. It was racy, + * because tw buckets are scheduled in not serialized context + * in 2.3 (with netfilter), and with softnet it is common, because + * soft irqs are not sequenced. 
+ */ + killed = 0; + ret = 0; +rescan: + inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) { + __inet_twsk_del_dead_node(tw); + spin_unlock(&twdr->death_lock); + __inet_twsk_kill(tw, twdr->hashinfo); +#ifdef CONFIG_NET_NS + NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED); +#endif + inet_twsk_put(tw); + killed++; + spin_lock(&twdr->death_lock); + if (killed > INET_TWDR_TWKILL_QUOTA) { + ret = 1; + break; + } + + /* While we dropped twdr->death_lock, another cpu may have + * killed off the next TW bucket in the list, therefore + * do a fresh re-read of the hlist head node with the + * lock reacquired. We still use the hlist traversal + * macro in order to get the prefetches. + */ + goto rescan; + } + + twdr->tw_count -= killed; +#ifndef CONFIG_NET_NS + NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITED, killed); +#endif + return ret; +} + +void inet_twdr_hangman(unsigned long data) +{ + struct inet_timewait_death_row *twdr; + int unsigned need_timer; + + twdr = (struct inet_timewait_death_row *)data; + spin_lock(&twdr->death_lock); + + if (twdr->tw_count == 0) + goto out; + + need_timer = 0; + if (inet_twdr_do_twkill_work(twdr, twdr->slot)) { + twdr->thread_slots |= (1 << twdr->slot); + schedule_work(&twdr->twkill_work); + need_timer = 1; + } else { + /* We purged the entire slot, anything left? */ + if (twdr->tw_count) + need_timer = 1; + twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); + } + if (need_timer) + mod_timer(&twdr->tw_timer, jiffies + twdr->period); +out: + spin_unlock(&twdr->death_lock); +} +EXPORT_SYMBOL_GPL(inet_twdr_hangman); + +void inet_twdr_twkill_work(struct work_struct *work) +{ + struct inet_timewait_death_row *twdr = + container_of(work, struct inet_timewait_death_row, twkill_work); + int i; + + BUILD_BUG_ON((INET_TWDR_TWKILL_SLOTS - 1) > + (sizeof(twdr->thread_slots) * 8)); + + while (twdr->thread_slots) { + spin_lock_bh(&twdr->death_lock); + for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) { + if (!(twdr->thread_slots & (1 << i))) + continue; + + while (inet_twdr_do_twkill_work(twdr, i) != 0) { + if (need_resched()) { + spin_unlock_bh(&twdr->death_lock); + schedule(); + spin_lock_bh(&twdr->death_lock); + } + } + + twdr->thread_slots &= ~(1 << i); + } + spin_unlock_bh(&twdr->death_lock); + } +} +EXPORT_SYMBOL_GPL(inet_twdr_twkill_work); + +/* These are always called from BH context. See callers in + * tcp_input.c to verify this. + */ + +/* This is for handling early-kills of TIME_WAIT sockets. */ +void inet_twsk_deschedule(struct inet_timewait_sock *tw, + struct inet_timewait_death_row *twdr) +{ + spin_lock(&twdr->death_lock); + if (inet_twsk_del_dead_node(tw)) { + inet_twsk_put(tw); + if (--twdr->tw_count == 0) + del_timer(&twdr->tw_timer); + } + spin_unlock(&twdr->death_lock); + __inet_twsk_kill(tw, twdr->hashinfo); +} +EXPORT_SYMBOL(inet_twsk_deschedule); + +void inet_twsk_schedule(struct inet_timewait_sock *tw, + struct inet_timewait_death_row *twdr, + const int timeo, const int timewait_len) +{ + struct hlist_head *list; + int slot; + + /* timeout := RTO * 3.5 + * + * 3.5 = 1+2+0.5 to wait for two retransmits. + * + * RATIONALE: if FIN arrived and we entered TIME-WAIT state, + * our ACK acking that FIN can be lost. If N subsequent retransmitted + * FINs (or previous seqments) are lost (probability of such event + * is p^(N+1), where p is probability to lose single packet and + * time to detect the loss is about RTO*(2^N - 1) with exponential + * backoff). 
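/*
 * Illustrative sketch, not part of this patch. inet_twsk_schedule() below
 * maps a timeout onto one of two timer wheels: short timeouts are rounded up
 * to whole "recycle ticks" and placed on the fine-grained wheel, longer ones
 * are divided by the slow-timer period. The constants here are example
 * values chosen for the demo, not the kernel's.
 */
#include <stdio.h>

#define RECYCLE_TICK	4	/* fine wheel tick = 2^4 time units (example) */
#define RECYCLE_SLOTS	32	/* slots on the fine-grained wheel (example) */
#define TWKILL_SLOTS	8	/* slots on the slow wheel (example) */

static void pick_slot(unsigned long timeo, unsigned long period,
		      unsigned long timewait_len)
{
	/* round the timeout up to a whole number of recycle ticks */
	unsigned long slot = (timeo + (1UL << RECYCLE_TICK) - 1) >> RECYCLE_TICK;

	if (slot >= RECYCLE_SLOTS) {
		/* too far in the future for the fine wheel: use slow wheel */
		if (timeo >= timewait_len)
			slot = TWKILL_SLOTS - 1;
		else {
			slot = (timeo + period - 1) / period; /* DIV_ROUND_UP */
			if (slot >= TWKILL_SLOTS)
				slot = TWKILL_SLOTS - 1;
		}
		printf("timeo=%lu -> slow wheel, slot %lu\n", timeo, slot);
	} else {
		printf("timeo=%lu -> recycle wheel, slot %lu\n", timeo, slot);
	}
}

int main(void)
{
	pick_slot(10, 1000, 60000);	/* short, RTO-scaled timeout */
	pick_slot(6000, 1000, 60000);	/* longer timeout */
	pick_slot(90000, 1000, 60000);	/* at least timewait_len */
	return 0;
}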
Normal timewait length is calculated so, that we + * waited at least for one retransmitted FIN (maximal RTO is 120sec). + * [ BTW Linux. following BSD, violates this requirement waiting + * only for 60sec, we should wait at least for 240 secs. + * Well, 240 consumes too much of resources 8) + * ] + * This interval is not reduced to catch old duplicate and + * responces to our wandering segments living for two MSLs. + * However, if we use PAWS to detect + * old duplicates, we can reduce the interval to bounds required + * by RTO, rather than MSL. So, if peer understands PAWS, we + * kill tw bucket after 3.5*RTO (it is important that this number + * is greater than TS tick!) and detect old duplicates with help + * of PAWS. + */ + slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK; + + spin_lock(&twdr->death_lock); + + /* Unlink it, if it was scheduled */ + if (inet_twsk_del_dead_node(tw)) + twdr->tw_count--; + else + atomic_inc(&tw->tw_refcnt); + + if (slot >= INET_TWDR_RECYCLE_SLOTS) { + /* Schedule to slow timer */ + if (timeo >= timewait_len) { + slot = INET_TWDR_TWKILL_SLOTS - 1; + } else { + slot = DIV_ROUND_UP(timeo, twdr->period); + if (slot >= INET_TWDR_TWKILL_SLOTS) + slot = INET_TWDR_TWKILL_SLOTS - 1; + } + tw->tw_ttd = jiffies + timeo; + slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1); + list = &twdr->cells[slot]; + } else { + tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK); + + if (twdr->twcal_hand < 0) { + twdr->twcal_hand = 0; + twdr->twcal_jiffie = jiffies; + twdr->twcal_timer.expires = twdr->twcal_jiffie + + (slot << INET_TWDR_RECYCLE_TICK); + add_timer(&twdr->twcal_timer); + } else { + if (time_after(twdr->twcal_timer.expires, + jiffies + (slot << INET_TWDR_RECYCLE_TICK))) + mod_timer(&twdr->twcal_timer, + jiffies + (slot << INET_TWDR_RECYCLE_TICK)); + slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1); + } + list = &twdr->twcal_row[slot]; + } + + hlist_add_head(&tw->tw_death_node, list); + + if (twdr->tw_count++ == 0) + mod_timer(&twdr->tw_timer, jiffies + twdr->period); + spin_unlock(&twdr->death_lock); +} +EXPORT_SYMBOL_GPL(inet_twsk_schedule); + +void inet_twdr_twcal_tick(unsigned long data) +{ + struct inet_timewait_death_row *twdr; + int n, slot; + unsigned long j; + unsigned long now = jiffies; + int killed = 0; + int adv = 0; + + twdr = (struct inet_timewait_death_row *)data; + + spin_lock(&twdr->death_lock); + if (twdr->twcal_hand < 0) + goto out; + + slot = twdr->twcal_hand; + j = twdr->twcal_jiffie; + + for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) { + if (time_before_eq(j, now)) { + struct hlist_node *node, *safe; + struct inet_timewait_sock *tw; + + inet_twsk_for_each_inmate_safe(tw, node, safe, + &twdr->twcal_row[slot]) { + __inet_twsk_del_dead_node(tw); + __inet_twsk_kill(tw, twdr->hashinfo); +#ifdef CONFIG_NET_NS + NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); +#endif + inet_twsk_put(tw); + killed++; + } + } else { + if (!adv) { + adv = 1; + twdr->twcal_jiffie = j; + twdr->twcal_hand = slot; + } + + if (!hlist_empty(&twdr->twcal_row[slot])) { + mod_timer(&twdr->twcal_timer, j); + goto out; + } + } + j += 1 << INET_TWDR_RECYCLE_TICK; + slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1); + } + twdr->twcal_hand = -1; + +out: + if ((twdr->tw_count -= killed) == 0) + del_timer(&twdr->tw_timer); +#ifndef CONFIG_NET_NS + NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITKILLED, killed); +#endif + spin_unlock(&twdr->death_lock); +} +EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick); + +void inet_twsk_purge(struct 
inet_hashinfo *hashinfo, + struct inet_timewait_death_row *twdr, int family) +{ + struct inet_timewait_sock *tw; + struct sock *sk; + struct hlist_nulls_node *node; + unsigned int slot; + + for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { + struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; +restart_rcu: + rcu_read_lock(); +restart: + sk_nulls_for_each_rcu(sk, node, &head->twchain) { + tw = inet_twsk(sk); + if ((tw->tw_family != family) || + atomic_read(&twsk_net(tw)->count)) + continue; + + if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt))) + continue; + + if (unlikely((tw->tw_family != family) || + atomic_read(&twsk_net(tw)->count))) { + inet_twsk_put(tw); + goto restart; + } + + rcu_read_unlock(); + local_bh_disable(); + inet_twsk_deschedule(tw, twdr); + local_bh_enable(); + inet_twsk_put(tw); + goto restart_rcu; + } + /* If the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != slot) + goto restart; + rcu_read_unlock(); + } +} +EXPORT_SYMBOL_GPL(inet_twsk_purge); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c new file mode 100644 index 00000000..68776454 --- /dev/null +++ b/net/ipv4/inetpeer.c @@ -0,0 +1,637 @@ +/* + * INETPEER - A storage for permanent information about peers + * + * This source is covered by the GNU GPL, the same as all kernel sources. + * + * Authors: Andrey V. Savochkin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Theory of operations. + * We keep one entry for each peer IP address. The nodes contains long-living + * information about the peer which doesn't depend on routes. + * At this moment this information consists only of ID field for the next + * outgoing IP packet. This field is incremented with each packet as encoded + * in inet_getid() function (include/net/inetpeer.h). + * At the moment of writing this notes identifier of IP packets is generated + * to be unpredictable using this code only for packets subjected + * (actually or potentially) to defragmentation. I.e. DF packets less than + * PMTU in size uses a constant ID and do not use this code (see + * ip_select_ident() in include/net/ip.h). + * + * Route cache entries hold references to our nodes. + * New cache entries get references via lookup by destination IP address in + * the avl tree. The reference is grabbed only when it's needed i.e. only + * when we try to output IP packet which needs an unpredictable ID (see + * __ip_select_ident() in net/ipv4/route.c). + * Nodes are removed only when reference counter goes to 0. + * When it's happened the node may be removed when a sufficient amount of + * time has been passed since its last use. The less-recently-used entry can + * also be removed if the pool is overloaded i.e. if the total amount of + * entries is greater-or-equal than the threshold. + * + * Node pool is organised as an AVL tree. + * Such an implementation has been chosen not just for fun. It's a way to + * prevent easy and efficient DoS attacks by creating hash collisions. A huge + * amount of long living nodes in a single hash slot would significantly delay + * lookups performed with disabled BHs. + * + * Serialisation issues. + * 1. Nodes may appear in the tree only with the pool lock held. + * 2. Nodes may disappear from the tree only with the pool lock held + * AND reference count being 0. + * 3. 
Nodes appears and disappears from unused node list only under + * "inet_peer_unused_lock". + * 4. Global variable peer_total is modified under the pool lock. + * 5. struct inet_peer fields modification: + * avl_left, avl_right, avl_parent, avl_height: pool lock + * unused: unused node list lock + * refcnt: atomically against modifications on other CPU; + * usually under some other lock to prevent node disappearing + * dtime: unused node list lock + * daddr: unchangeable + * ip_id_count: atomic value (no lock needed) + */ + +static struct kmem_cache *peer_cachep __read_mostly; + +#define node_height(x) x->avl_height + +#define peer_avl_empty ((struct inet_peer *)&peer_fake_node) +#define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node) +static const struct inet_peer peer_fake_node = { + .avl_left = peer_avl_empty_rcu, + .avl_right = peer_avl_empty_rcu, + .avl_height = 0 +}; + +struct inet_peer_base { + struct inet_peer __rcu *root; + seqlock_t lock; + int total; +}; + +static struct inet_peer_base v4_peers = { + .root = peer_avl_empty_rcu, + .lock = __SEQLOCK_UNLOCKED(v4_peers.lock), + .total = 0, +}; + +static struct inet_peer_base v6_peers = { + .root = peer_avl_empty_rcu, + .lock = __SEQLOCK_UNLOCKED(v6_peers.lock), + .total = 0, +}; + +#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ + +/* Exported for sysctl_net_ipv4. */ +int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more + * aggressively at this stage */ +int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ +int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ +int inet_peer_gc_mintime __read_mostly = 10 * HZ; +int inet_peer_gc_maxtime __read_mostly = 120 * HZ; + +static struct { + struct list_head list; + spinlock_t lock; +} unused_peers = { + .list = LIST_HEAD_INIT(unused_peers.list), + .lock = __SPIN_LOCK_UNLOCKED(unused_peers.lock), +}; + +static void peer_check_expire(unsigned long dummy); +static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0); + + +/* Called from ip_output.c:ip_init */ +void __init inet_initpeers(void) +{ + struct sysinfo si; + + /* Use the straight interface to information about memory. */ + si_meminfo(&si); + /* The values below were suggested by Alexey Kuznetsov + * . I don't have any opinion about the values + * myself. --SAW + */ + if (si.totalram <= (32768*1024)/PAGE_SIZE) + inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */ + if (si.totalram <= (16384*1024)/PAGE_SIZE) + inet_peer_threshold >>= 1; /* about 512KB */ + if (si.totalram <= (8192*1024)/PAGE_SIZE) + inet_peer_threshold >>= 2; /* about 128KB */ + + peer_cachep = kmem_cache_create("inet_peer_cache", + sizeof(struct inet_peer), + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, + NULL); + + /* All the timers, started at system startup tend + to synchronize. Perturb it a bit. + */ + peer_periodic_timer.expires = jiffies + + net_random() % inet_peer_gc_maxtime + + inet_peer_gc_maxtime; + add_timer(&peer_periodic_timer); +} + +/* Called with or without local BH being disabled. */ +static void unlink_from_unused(struct inet_peer *p) +{ + spin_lock_bh(&unused_peers.lock); + list_del_init(&p->unused); + spin_unlock_bh(&unused_peers.lock); +} + +static int addr_compare(const struct inetpeer_addr *a, + const struct inetpeer_addr *b) +{ + int i, n = (a->family == AF_INET ? 
1 : 4); + + for (i = 0; i < n; i++) { + if (a->addr.a6[i] == b->addr.a6[i]) + continue; + if (a->addr.a6[i] < b->addr.a6[i]) + return -1; + return 1; + } + + return 0; +} + +#define rcu_deref_locked(X, BASE) \ + rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock)) + +/* + * Called with local BH disabled and the pool lock held. + */ +#define lookup(_daddr, _stack, _base) \ +({ \ + struct inet_peer *u; \ + struct inet_peer __rcu **v; \ + \ + stackptr = _stack; \ + *stackptr++ = &_base->root; \ + for (u = rcu_deref_locked(_base->root, _base); \ + u != peer_avl_empty; ) { \ + int cmp = addr_compare(_daddr, &u->daddr); \ + if (cmp == 0) \ + break; \ + if (cmp == -1) \ + v = &u->avl_left; \ + else \ + v = &u->avl_right; \ + *stackptr++ = v; \ + u = rcu_deref_locked(*v, _base); \ + } \ + u; \ +}) + +static bool atomic_add_unless_return(atomic_t *ptr, int a, int u, int *newv) +{ + int cur, old = atomic_read(ptr); + + while (old != u) { + *newv = old + a; + cur = atomic_cmpxchg(ptr, old, *newv); + if (cur == old) + return true; + old = cur; + } + return false; +} + +/* + * Called with rcu_read_lock() + * Because we hold no lock against a writer, its quite possible we fall + * in an endless loop. + * But every pointer we follow is guaranteed to be valid thanks to RCU. + * We exit from this function if number of links exceeds PEER_MAXDEPTH + */ +static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, + struct inet_peer_base *base, + int *newrefcnt) +{ + struct inet_peer *u = rcu_dereference(base->root); + int count = 0; + + while (u != peer_avl_empty) { + int cmp = addr_compare(daddr, &u->daddr); + if (cmp == 0) { + /* Before taking a reference, check if this entry was + * deleted, unlink_from_pool() sets refcnt=-1 to make + * distinction between an unused entry (refcnt=0) and + * a freed one. + */ + if (!atomic_add_unless_return(&u->refcnt, 1, -1, newrefcnt)) + u = NULL; + return u; + } + if (cmp == -1) + u = rcu_dereference(u->avl_left); + else + u = rcu_dereference(u->avl_right); + if (unlikely(++count == PEER_MAXDEPTH)) + break; + } + return NULL; +} + +/* Called with local BH disabled and the pool lock held. */ +#define lookup_rightempty(start, base) \ +({ \ + struct inet_peer *u; \ + struct inet_peer __rcu **v; \ + *stackptr++ = &start->avl_left; \ + v = &start->avl_left; \ + for (u = rcu_deref_locked(*v, base); \ + u->avl_right != peer_avl_empty_rcu; ) { \ + v = &u->avl_right; \ + *stackptr++ = v; \ + u = rcu_deref_locked(*v, base); \ + } \ + u; \ +}) + +/* Called with local BH disabled and the pool lock held. + * Variable names are the proof of operation correctness. + * Look into mm/map_avl.c for more detail description of the ideas. 
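/*
 * Illustrative sketch, not part of this patch. peer_avl_rebalance() below
 * walks back up the lookup stack and, for each node, distinguishes four
 * imbalance cases purely from subtree heights. This standalone helper only
 * names those cases; it performs none of the pointer surgery.
 */
#include <stdio.h>

static const char *avl_case(int lh, int rh, int llh, int lrh, int rlh, int rrh)
{
	if (lh > rh + 1)		/* left side two levels deeper */
		return lrh <= llh ? "single right rotation (LL)"
				  : "double rotation (LR)";
	if (rh > lh + 1)		/* right side two levels deeper */
		return rlh <= rrh ? "single left rotation (RR)"
				  : "double rotation (RL)";
	return "balanced: just refresh the height";
}

int main(void)
{
	/* heights: node's left/right subtrees, then their four grandchildren */
	printf("%s\n", avl_case(3, 1, 2, 1, 0, 0));	/* LL-heavy */
	printf("%s\n", avl_case(3, 1, 1, 2, 0, 0));	/* LR-heavy */
	printf("%s\n", avl_case(1, 3, 0, 0, 2, 1));	/* RL-heavy */
	printf("%s\n", avl_case(2, 2, 1, 1, 1, 1));	/* balanced */
	return 0;
}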
+ */ +static void peer_avl_rebalance(struct inet_peer __rcu **stack[], + struct inet_peer __rcu ***stackend, + struct inet_peer_base *base) +{ + struct inet_peer __rcu **nodep; + struct inet_peer *node, *l, *r; + int lh, rh; + + while (stackend > stack) { + nodep = *--stackend; + node = rcu_deref_locked(*nodep, base); + l = rcu_deref_locked(node->avl_left, base); + r = rcu_deref_locked(node->avl_right, base); + lh = node_height(l); + rh = node_height(r); + if (lh > rh + 1) { /* l: RH+2 */ + struct inet_peer *ll, *lr, *lrl, *lrr; + int lrh; + ll = rcu_deref_locked(l->avl_left, base); + lr = rcu_deref_locked(l->avl_right, base); + lrh = node_height(lr); + if (lrh <= node_height(ll)) { /* ll: RH+1 */ + RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */ + RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ + node->avl_height = lrh + 1; /* RH+1 or RH+2 */ + RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH+1 */ + RCU_INIT_POINTER(l->avl_right, node); /* node: RH+1 or RH+2 */ + l->avl_height = node->avl_height + 1; + RCU_INIT_POINTER(*nodep, l); + } else { /* ll: RH, lr: RH+1 */ + lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */ + lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */ + RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */ + RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ + node->avl_height = rh + 1; /* node: RH+1 */ + RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH */ + RCU_INIT_POINTER(l->avl_right, lrl); /* lrl: RH or RH-1 */ + l->avl_height = rh + 1; /* l: RH+1 */ + RCU_INIT_POINTER(lr->avl_left, l); /* l: RH+1 */ + RCU_INIT_POINTER(lr->avl_right, node); /* node: RH+1 */ + lr->avl_height = rh + 2; + RCU_INIT_POINTER(*nodep, lr); + } + } else if (rh > lh + 1) { /* r: LH+2 */ + struct inet_peer *rr, *rl, *rlr, *rll; + int rlh; + rr = rcu_deref_locked(r->avl_right, base); + rl = rcu_deref_locked(r->avl_left, base); + rlh = node_height(rl); + if (rlh <= node_height(rr)) { /* rr: LH+1 */ + RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */ + RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ + node->avl_height = rlh + 1; /* LH+1 or LH+2 */ + RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH+1 */ + RCU_INIT_POINTER(r->avl_left, node); /* node: LH+1 or LH+2 */ + r->avl_height = node->avl_height + 1; + RCU_INIT_POINTER(*nodep, r); + } else { /* rr: RH, rl: RH+1 */ + rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */ + rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */ + RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */ + RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ + node->avl_height = lh + 1; /* node: LH+1 */ + RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH */ + RCU_INIT_POINTER(r->avl_left, rlr); /* rlr: LH or LH-1 */ + r->avl_height = lh + 1; /* r: LH+1 */ + RCU_INIT_POINTER(rl->avl_right, r); /* r: LH+1 */ + RCU_INIT_POINTER(rl->avl_left, node); /* node: LH+1 */ + rl->avl_height = lh + 2; + RCU_INIT_POINTER(*nodep, rl); + } + } else { + node->avl_height = (lh > rh ? lh : rh) + 1; + } + } +} + +/* Called with local BH disabled and the pool lock held. 
*/ +#define link_to_pool(n, base) \ +do { \ + n->avl_height = 1; \ + n->avl_left = peer_avl_empty_rcu; \ + n->avl_right = peer_avl_empty_rcu; \ + /* lockless readers can catch us now */ \ + rcu_assign_pointer(**--stackptr, n); \ + peer_avl_rebalance(stack, stackptr, base); \ +} while (0) + +static void inetpeer_free_rcu(struct rcu_head *head) +{ + kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu)); +} + +/* May be called with local BH enabled. */ +static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base, + struct inet_peer __rcu **stack[PEER_MAXDEPTH]) +{ + int do_free; + + do_free = 0; + + write_seqlock_bh(&base->lock); + /* Check the reference counter. It was artificially incremented by 1 + * in cleanup() function to prevent sudden disappearing. If we can + * atomically (because of lockless readers) take this last reference, + * it's safe to remove the node and free it later. + * We use refcnt=-1 to alert lockless readers this entry is deleted. + */ + if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) { + struct inet_peer __rcu ***stackptr, ***delp; + if (lookup(&p->daddr, stack, base) != p) + BUG(); + delp = stackptr - 1; /* *delp[0] == p */ + if (p->avl_left == peer_avl_empty_rcu) { + *delp[0] = p->avl_right; + --stackptr; + } else { + /* look for a node to insert instead of p */ + struct inet_peer *t; + t = lookup_rightempty(p, base); + BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t); + **--stackptr = t->avl_left; + /* t is removed, t->daddr > x->daddr for any + * x in p->avl_left subtree. + * Put t in the old place of p. */ + RCU_INIT_POINTER(*delp[0], t); + t->avl_left = p->avl_left; + t->avl_right = p->avl_right; + t->avl_height = p->avl_height; + BUG_ON(delp[1] != &p->avl_left); + delp[1] = &t->avl_left; /* was &p->avl_left */ + } + peer_avl_rebalance(stack, stackptr, base); + base->total--; + do_free = 1; + } + write_sequnlock_bh(&base->lock); + + if (do_free) + call_rcu(&p->rcu, inetpeer_free_rcu); + else + /* The node is used again. Decrease the reference counter + * back. The loop "cleanup -> unlink_from_unused + * -> unlink_from_pool -> putpeer -> link_to_unused + * -> cleanup (for the same node)" + * doesn't really exist because the entry will have a + * recent deletion time and will not be cleaned again soon. + */ + inet_putpeer(p); +} + +static struct inet_peer_base *family_to_base(int family) +{ + return (family == AF_INET ? &v4_peers : &v6_peers); +} + +static struct inet_peer_base *peer_to_base(struct inet_peer *p) +{ + return family_to_base(p->daddr.family); +} + +/* May be called with local BH enabled. */ +static int cleanup_once(unsigned long ttl, struct inet_peer __rcu **stack[PEER_MAXDEPTH]) +{ + struct inet_peer *p = NULL; + + /* Remove the first entry from the list of unused nodes. */ + spin_lock_bh(&unused_peers.lock); + if (!list_empty(&unused_peers.list)) { + __u32 delta; + + p = list_first_entry(&unused_peers.list, struct inet_peer, unused); + delta = (__u32)jiffies - p->dtime; + + if (delta < ttl) { + /* Do not prune fresh entries. */ + spin_unlock_bh(&unused_peers.lock); + return -1; + } + + list_del_init(&p->unused); + + /* Grab an extra reference to prevent node disappearing + * before unlink_from_pool() call. */ + atomic_inc(&p->refcnt); + } + spin_unlock_bh(&unused_peers.lock); + + if (p == NULL) + /* It means that the total number of USED entries has + * grown over inet_peer_threshold. It shouldn't really + * happen because of entry limits in route cache. 
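/*
 * Illustrative sketch, not part of this patch. cleanup_once() above removes
 * at most one entry from the head of the "unused" list, and only when that
 * entry has been idle for longer than the supplied ttl. The age test is
 * modelled here with an ordinary queue of timestamps; unsigned subtraction
 * keeps it correct across counter wraparound.
 */
#include <stdio.h>

struct unused_q {
	unsigned int dtime[8];	/* enqueue timestamps, oldest first */
	int head, count;
};

/* Returns 1 if an entry was pruned, 0 if the oldest one is still too fresh. */
static int prune_once(struct unused_q *q, unsigned int now, unsigned int ttl)
{
	unsigned int delta;

	if (!q->count)
		return 0;
	delta = now - q->dtime[q->head];	/* unsigned: wrap-safe */
	if (delta < ttl)
		return 0;			/* do not prune fresh entries */
	q->head = (q->head + 1) % 8;
	q->count--;
	return 1;
}

int main(void)
{
	struct unused_q q = { .dtime = { 100, 150 }, .head = 0, .count = 2 };

	printf("pruned=%d\n", prune_once(&q, 200, 120));	/* age 100: too fresh */
	printf("pruned=%d\n", prune_once(&q, 260, 120));	/* age 160: stale now */
	return 0;
}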
*/ + return -1; + + unlink_from_pool(p, peer_to_base(p), stack); + return 0; +} + +/* Called with or without local BH being disabled. */ +struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) +{ + struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; + struct inet_peer_base *base = family_to_base(daddr->family); + struct inet_peer *p; + unsigned int sequence; + int invalidated, newrefcnt = 0; + + /* Look up for the address quickly, lockless. + * Because of a concurrent writer, we might not find an existing entry. + */ + rcu_read_lock(); + sequence = read_seqbegin(&base->lock); + p = lookup_rcu(daddr, base, &newrefcnt); + invalidated = read_seqretry(&base->lock, sequence); + rcu_read_unlock(); + + if (p) { +found: /* The existing node has been found. + * Remove the entry from unused list if it was there. + */ + if (newrefcnt == 1) + unlink_from_unused(p); + return p; + } + + /* If no writer did a change during our lookup, we can return early. */ + if (!create && !invalidated) + return NULL; + + /* retry an exact lookup, taking the lock before. + * At least, nodes should be hot in our cache. + */ + write_seqlock_bh(&base->lock); + p = lookup(daddr, stack, base); + if (p != peer_avl_empty) { + newrefcnt = atomic_inc_return(&p->refcnt); + write_sequnlock_bh(&base->lock); + goto found; + } + p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL; + if (p) { + p->daddr = *daddr; + atomic_set(&p->refcnt, 1); + atomic_set(&p->rid, 0); + atomic_set(&p->ip_id_count, secure_ip_id(daddr->addr.a4)); + p->tcp_ts_stamp = 0; + p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; + p->rate_tokens = 0; + p->rate_last = 0; + p->pmtu_expires = 0; + p->pmtu_orig = 0; + memset(&p->redirect_learned, 0, sizeof(p->redirect_learned)); + INIT_LIST_HEAD(&p->unused); + + + /* Link the node. */ + link_to_pool(p, base); + base->total++; + } + write_sequnlock_bh(&base->lock); + + if (base->total >= inet_peer_threshold) + /* Remove one less-recently-used entry. */ + cleanup_once(0, stack); + + return p; +} + +static int compute_total(void) +{ + return v4_peers.total + v6_peers.total; +} +EXPORT_SYMBOL_GPL(inet_getpeer); + +/* Called with local BH disabled. */ +static void peer_check_expire(unsigned long dummy) +{ + unsigned long now = jiffies; + int ttl, total; + struct inet_peer __rcu **stack[PEER_MAXDEPTH]; + + total = compute_total(); + if (total >= inet_peer_threshold) + ttl = inet_peer_minttl; + else + ttl = inet_peer_maxttl + - (inet_peer_maxttl - inet_peer_minttl) / HZ * + total / inet_peer_threshold * HZ; + while (!cleanup_once(ttl, stack)) { + if (jiffies != now) + break; + } + + /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime + * interval depending on the total number of entries (more entries, + * less interval). */ + total = compute_total(); + if (total >= inet_peer_threshold) + peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime; + else + peer_periodic_timer.expires = jiffies + + inet_peer_gc_maxtime + - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * + total / inet_peer_threshold * HZ; + add_timer(&peer_periodic_timer); +} + +void inet_putpeer(struct inet_peer *p) +{ + local_bh_disable(); + + if (atomic_dec_and_lock(&p->refcnt, &unused_peers.lock)) { + list_add_tail(&p->unused, &unused_peers.list); + p->dtime = (__u32)jiffies; + spin_unlock(&unused_peers.lock); + } + + local_bh_enable(); +} +EXPORT_SYMBOL_GPL(inet_putpeer); + +/* + * Check transmit rate limitation for given message. + * The rate information is held in the inet_peer entries now. 
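/*
 * Illustrative sketch, not part of this patch. The rate limiter defined a
 * little further below, inet_peer_xrlim_allow(), is a plain token-bucket
 * filter: credit accumulates with elapsed time, is capped at a burst of six
 * timeouts, and one timeout's worth of credit is spent per allowed message.
 * The same arithmetic, as ordinary userspace C:
 */
#include <stdbool.h>
#include <stdio.h>

#define BURST_FACTOR 6

struct ratelimit {
	unsigned long tokens;	/* accumulated credit, in time units */
	unsigned long last;	/* timestamp of the previous call */
};

static bool xrlim_allow(struct ratelimit *rl, unsigned long now,
			unsigned long timeout)
{
	unsigned long token = rl->tokens + (now - rl->last);
	bool ok = false;

	rl->last = now;
	if (token > BURST_FACTOR * timeout)
		token = BURST_FACTOR * timeout;	/* cap the burst */
	if (token >= timeout) {
		token -= timeout;		/* pay for this message */
		ok = true;
	}
	rl->tokens = token;
	return ok;
}

int main(void)
{
	struct ratelimit rl = { .tokens = 0, .last = 0 };
	unsigned long t;

	/* With timeout=1000, roughly one message per 1000 time units passes. */
	for (t = 0; t <= 3000; t += 500)
		printf("t=%-5lu allow=%d\n", t, xrlim_allow(&rl, t, 1000));
	return 0;
}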
+ * This function is generic and could be used for other purposes + * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. + * + * Note that the same inet_peer fields are modified by functions in + * route.c too, but these work for packet destinations while xrlim_allow + * works for icmp destinations. This means the rate limiting information + * for one "ip object" is shared - and these ICMPs are twice limited: + * by source and by destination. + * + * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate + * SHOULD allow setting of rate limits + * + * Shared between ICMPv4 and ICMPv6. + */ +#define XRLIM_BURST_FACTOR 6 +bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) +{ + unsigned long now, token; + bool rc = false; + + if (!peer) + return true; + + token = peer->rate_tokens; + now = jiffies; + token += now - peer->rate_last; + peer->rate_last = now; + if (token > XRLIM_BURST_FACTOR * timeout) + token = XRLIM_BURST_FACTOR * timeout; + if (token >= timeout) { + token -= timeout; + rc = true; + } + peer->rate_tokens = token; + return rc; +} +EXPORT_SYMBOL(inet_peer_xrlim_allow); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c new file mode 100644 index 00000000..29a07b6c --- /dev/null +++ b/net/ipv4/ip_forward.c @@ -0,0 +1,132 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The IP forwarding functionality. + * + * Authors: see ip.c + * + * Fixes: + * Many : Split from ip.c , see ip_input.c for + * history. + * Dave Gregorich : NULL ip_rt_put fix for multicast + * routing. + * Jos Vos : Add call_out_firewall before sending, + * use output device for accounting. + * Jos Vos : Call forward firewall after routing + * (always use output device). + * Mike McLagan : Routing by source + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int ip_forward_finish(struct sk_buff *skb) +{ + struct ip_options * opt = &(IPCB(skb)->opt); + + IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); + + if (unlikely(opt->optlen)) + ip_forward_options(skb); + + return dst_output(skb); +} + +int ip_forward(struct sk_buff *skb) +{ + struct iphdr *iph; /* Our header */ + struct rtable *rt; /* Route we use */ + struct ip_options * opt = &(IPCB(skb)->opt); + + if (skb_warn_if_lro(skb)) + goto drop; + + if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb)) + goto drop; + + if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb)) + return NET_RX_SUCCESS; + + if (skb->pkt_type != PACKET_HOST) + goto drop; + + skb_forward_csum(skb); + + /* + * According to the RFC, we must first decrease the TTL field. If + * that reaches zero, we must reply an ICMP control message telling + * that the packet's lifetime expired. 
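/*
 * Illustrative sketch, not part of this patch. As the comment above says,
 * forwarding first checks the TTL and then decrements it; since the TTL is
 * covered by the IP header checksum, the checksum has to be refreshed too
 * (the kernel does this incrementally in ip_decrease_ttl()). This standalone
 * version simply recomputes the 16-bit ones'-complement sum over a made-up
 * 20-byte header to show the effect of the decrement.
 */
#include <stdint.h>
#include <stdio.h>

/* Standard Internet checksum over 'len' bytes (len assumed even here). */
static uint16_t ip_checksum(const uint8_t *data, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i < len; i += 2)
		sum += (uint32_t)(data[i] << 8 | data[i + 1]);
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);	/* fold the carries */
	return (uint16_t)~sum;
}

int main(void)
{
	/* Minimal IPv4 header with TTL=64 and the checksum field zeroed. */
	uint8_t hdr[20] = { 0x45, 0x00, 0x00, 0x54, 0x00, 0x00, 0x40, 0x00,
			    64,   0x06, 0x00, 0x00, 10, 0, 0, 1, 10, 0, 0, 2 };
	uint16_t csum = ip_checksum(hdr, sizeof(hdr));

	hdr[10] = csum >> 8;
	hdr[11] = csum & 0xff;
	printf("checksum with ttl=64: 0x%04x\n", csum);

	hdr[8]--;				/* decrement TTL ...          */
	hdr[10] = hdr[11] = 0;
	csum = ip_checksum(hdr, sizeof(hdr));	/* ... and refresh the checksum */
	printf("checksum with ttl=63: 0x%04x\n", csum);
	return 0;
}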
+ */ + if (ip_hdr(skb)->ttl <= 1) + goto too_many_hops; + + if (!xfrm4_route_forward(skb)) + goto drop; + + rt = skb_rtable(skb); + + if (opt->is_strictroute && opt->nexthop != rt->rt_gateway) + goto sr_failed; + + if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && + (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { + IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(dst_mtu(&rt->dst))); + goto drop; + } + + /* We are about to mangle packet. Copy it! */ + if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len)) + goto drop; + iph = ip_hdr(skb); + + /* Decrease ttl after skb cow done */ + ip_decrease_ttl(iph); + + /* + * We now generate an ICMP HOST REDIRECT giving the route + * we calculated. + */ + if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb)) + ip_rt_send_redirect(skb); + + skb->priority = rt_tos2priority(iph->tos); + + return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, + rt->dst.dev, ip_forward_finish); + +sr_failed: + /* + * Strict routing permits no gatewaying + */ + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0); + goto drop; + +too_many_hops: + /* Tell the sender its packet died... */ + IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS); + icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); +drop: + kfree_skb(skb); + return NET_RX_DROP; +} diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c new file mode 100644 index 00000000..0ad6035f --- /dev/null +++ b/net/ipv4/ip_fragment.c @@ -0,0 +1,835 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The IP fragmentation functionality. + * + * Authors: Fred N. van Kempen + * Alan Cox + * + * Fixes: + * Alan Cox : Split from ip.c , see ip_input.c for history. + * David S. Miller : Begin massive cleanup... + * Andi Kleen : Add sysctls. + * xxxx : Overlapfrag bug. + * Ultima : ip_expire() kernel panic. + * Bill Hawes : Frag accounting and evictor fixes. + * John McDonald : 0 length frag bug. + * Alexey Kuznetsov: SMP races, threading, cleanup. + * Patrick McHardy : LRU queue of frag heads for evictor. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 + * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c + * as well. Or notify me, at least. --ANK + */ + +static int sysctl_ipfrag_max_dist __read_mostly = 64; + +struct ipfrag_skb_cb +{ + struct inet_skb_parm h; + int offset; +}; + +#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) + +/* Describe an entry in the "incomplete datagrams" queue. */ +struct ipq { + struct inet_frag_queue q; + + u32 user; + __be32 saddr; + __be32 daddr; + __be16 id; + u8 protocol; + u8 ecn; /* RFC3168 support */ + int iif; + unsigned int rid; + struct inet_peer *peer; +}; + +/* RFC 3168 support : + * We want to check ECN values of all fragments, do detect invalid combinations. + * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value. 
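/*
 * Illustrative sketch, not part of this patch. It reproduces the RFC 3168
 * bookkeeping described above: each fragment's ECN codepoint is turned into
 * a single bit, the bits of all fragments are OR-ed together, and the
 * combined value decides between dropping the reassembled frame, keeping the
 * original codepoint, or marking it CE (the kernel's helper and decision
 * table follow below).
 */
#include <stdio.h>

#define ECN_MASK	0x03	/* low two bits of the TOS byte */
#define FRAG_NOT_ECT	0x01	/* saw a Not-ECT fragment */
#define FRAG_ECT_1	0x02	/* saw an ECT(1) fragment */
#define FRAG_ECT_0	0x04	/* saw an ECT(0) fragment */
#define FRAG_CE		0x08	/* saw a CE-marked fragment */

static unsigned int frag_ecn_bit(unsigned int tos)
{
	return 1u << (tos & ECN_MASK);	/* same trick as ip4_frag_ecn() */
}

int main(void)
{
	/* Fragments of one datagram: ECT(0), ECT(0), CE */
	unsigned int tos[] = { 0x02, 0x02, 0x03 };
	unsigned int seen = 0;
	unsigned int i;

	for (i = 0; i < 3; i++)
		seen |= frag_ecn_bit(tos[i]);

	if ((seen & FRAG_NOT_ECT) && (seen & (FRAG_ECT_0 | FRAG_ECT_1 | FRAG_CE)))
		puts("Not-ECT mixed with ECT/CE: drop the reassembled frame");
	else if (seen & FRAG_CE)
		puts("at least one CE fragment: mark the result CE");
	else
		puts("keep the original ECN codepoint");
	return 0;
}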
+ */ +#define IPFRAG_ECN_NOT_ECT 0x01 /* one frag had ECN_NOT_ECT */ +#define IPFRAG_ECN_ECT_1 0x02 /* one frag had ECN_ECT_1 */ +#define IPFRAG_ECN_ECT_0 0x04 /* one frag had ECN_ECT_0 */ +#define IPFRAG_ECN_CE 0x08 /* one frag had ECN_CE */ + +static inline u8 ip4_frag_ecn(u8 tos) +{ + return 1 << (tos & INET_ECN_MASK); +} + +/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements + * Value : 0xff if frame should be dropped. + * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field + */ +static const u8 ip4_frag_ecn_table[16] = { + /* at least one fragment had CE, and others ECT_0 or ECT_1 */ + [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE, + [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE, + [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE, + + /* invalid combinations : drop frame */ + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, +}; + +static struct inet_frags ip4_frags; + +int ip_frag_nqueues(struct net *net) +{ + return net->ipv4.frags.nqueues; +} + +int ip_frag_mem(struct net *net) +{ + return atomic_read(&net->ipv4.frags.mem); +} + +static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, + struct net_device *dev); + +struct ip4_create_arg { + struct iphdr *iph; + u32 user; +}; + +static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) +{ + return jhash_3words((__force u32)id << 16 | prot, + (__force u32)saddr, (__force u32)daddr, + ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1); +} + +static unsigned int ip4_hashfn(struct inet_frag_queue *q) +{ + struct ipq *ipq; + + ipq = container_of(q, struct ipq, q); + return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); +} + +static int ip4_frag_match(struct inet_frag_queue *q, void *a) +{ + struct ipq *qp; + struct ip4_create_arg *arg = a; + + qp = container_of(q, struct ipq, q); + return qp->id == arg->iph->id && + qp->saddr == arg->iph->saddr && + qp->daddr == arg->iph->daddr && + qp->protocol == arg->iph->protocol && + qp->user == arg->user; +} + +/* Memory Tracking Functions. */ +static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb) +{ + atomic_sub(skb->truesize, &nf->mem); + kfree_skb(skb); +} + +static void ip4_frag_init(struct inet_frag_queue *q, void *a) +{ + struct ipq *qp = container_of(q, struct ipq, q); + struct ip4_create_arg *arg = a; + + qp->protocol = arg->iph->protocol; + qp->id = arg->iph->id; + qp->ecn = ip4_frag_ecn(arg->iph->tos); + qp->saddr = arg->iph->saddr; + qp->daddr = arg->iph->daddr; + qp->user = arg->user; + qp->peer = sysctl_ipfrag_max_dist ? + inet_getpeer_v4(arg->iph->saddr, 1) : NULL; +} + +static __inline__ void ip4_frag_free(struct inet_frag_queue *q) +{ + struct ipq *qp; + + qp = container_of(q, struct ipq, q); + if (qp->peer) + inet_putpeer(qp->peer); +} + + +/* Destruction primitives. */ + +static __inline__ void ipq_put(struct ipq *ipq) +{ + inet_frag_put(&ipq->q, &ip4_frags); +} + +/* Kill ipq entry. It is not destroyed immediately, + * because caller (and someone more) holds reference count. + */ +static void ipq_kill(struct ipq *ipq) +{ + inet_frag_kill(&ipq->q, &ip4_frags); +} + +/* Memory limiting on fragments. 
Evictor trashes the oldest + * fragment queue until we are back under the threshold. + */ +static void ip_evictor(struct net *net) +{ + int evicted; + + evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags); + if (evicted) + IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted); +} + +/* + * Oops, a fragment queue timed out. Kill it and send an ICMP reply. + */ +static void ip_expire(unsigned long arg) +{ + struct ipq *qp; + struct net *net; + + qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); + net = container_of(qp->q.net, struct net, ipv4.frags); + + spin_lock(&qp->q.lock); + + if (qp->q.last_in & INET_FRAG_COMPLETE) + goto out; + + ipq_kill(qp); + + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); + + if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { + struct sk_buff *head = qp->q.fragments; + const struct iphdr *iph; + int err; + + rcu_read_lock(); + head->dev = dev_get_by_index_rcu(net, qp->iif); + if (!head->dev) + goto out_rcu_unlock; + + /* skb dst is stale, drop it, and perform route lookup again */ + skb_dst_drop(head); + iph = ip_hdr(head); + err = ip_route_input_noref(head, iph->daddr, iph->saddr, + iph->tos, head->dev); + if (err) + goto out_rcu_unlock; + + /* + * Only an end host needs to send an ICMP + * "Fragment Reassembly Timeout" message, per RFC792. + */ + if (qp->user == IP_DEFRAG_CONNTRACK_IN && + skb_rtable(head)->rt_type != RTN_LOCAL) + goto out_rcu_unlock; + + + /* Send an ICMP "Fragment Reassembly Timeout" message. */ + icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); +out_rcu_unlock: + rcu_read_unlock(); + } +out: + spin_unlock(&qp->q.lock); + ipq_put(qp); +} + +/* Find the correct entry in the "incomplete datagrams" queue for + * this IP datagram, and create new one, if nothing is found. + */ +static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) +{ + struct inet_frag_queue *q; + struct ip4_create_arg arg; + unsigned int hash; + + arg.iph = iph; + arg.user = user; + + read_lock(&ip4_frags.lock); + hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); + + q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); + if (q == NULL) + goto out_nomem; + + return container_of(q, struct ipq, q); + +out_nomem: + LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n"); + return NULL; +} + +/* Is the fragment too far ahead to be part of ipq? */ +static inline int ip_frag_too_far(struct ipq *qp) +{ + struct inet_peer *peer = qp->peer; + unsigned int max = sysctl_ipfrag_max_dist; + unsigned int start, end; + + int rc; + + if (!peer || !max) + return 0; + + start = qp->rid; + end = atomic_inc_return(&peer->rid); + qp->rid = end; + + rc = qp->q.fragments && (end - start) > max; + + if (rc) { + struct net *net; + + net = container_of(qp->q.net, struct net, ipv4.frags); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); + } + + return rc; +} + +static int ip_frag_reinit(struct ipq *qp) +{ + struct sk_buff *fp; + + if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { + atomic_inc(&qp->q.refcnt); + return -ETIMEDOUT; + } + + fp = qp->q.fragments; + do { + struct sk_buff *xp = fp->next; + frag_kfree_skb(qp->q.net, fp); + fp = xp; + } while (fp); + + qp->q.last_in = 0; + qp->q.len = 0; + qp->q.meat = 0; + qp->q.fragments = NULL; + qp->q.fragments_tail = NULL; + qp->iif = 0; + qp->ecn = 0; + + return 0; +} + +/* Add new segment to existing queue. 
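/*
 * Illustrative sketch, not part of this patch. ip_frag_queue() below starts
 * by splitting the 16-bit frag_off field into its flag bits and the fragment
 * offset, which the wire format stores in units of 8 bytes. The same
 * decoding on a few example values (already in host byte order):
 */
#include <stdio.h>

#define IP_OFFSET 0x1FFF	/* low 13 bits: offset in 8-byte units */
#define IP_MF     0x2000	/* "more fragments" flag */
#define IP_DF     0x4000	/* "don't fragment" flag */

static void decode(unsigned int frag_off)
{
	unsigned int flags  = frag_off & ~IP_OFFSET;
	unsigned int offset = (frag_off & IP_OFFSET) << 3;	/* to bytes */

	printf("frag_off=0x%04x: offset=%u bytes, MF=%d, DF=%d\n",
	       frag_off, offset, !!(flags & IP_MF), !!(flags & IP_DF));
}

int main(void)
{
	decode(0x2000);		/* first fragment, more to come */
	decode(0x20b9);		/* middle fragment at byte 1480 */
	decode(0x00b9);		/* last fragment at byte 1480 */
	return 0;
}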
*/ +static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) +{ + struct sk_buff *prev, *next; + struct net_device *dev; + int flags, offset; + int ihl, end; + int err = -ENOENT; + u8 ecn; + + if (qp->q.last_in & INET_FRAG_COMPLETE) + goto err; + + if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && + unlikely(ip_frag_too_far(qp)) && + unlikely(err = ip_frag_reinit(qp))) { + ipq_kill(qp); + goto err; + } + + ecn = ip4_frag_ecn(ip_hdr(skb)->tos); + offset = ntohs(ip_hdr(skb)->frag_off); + flags = offset & ~IP_OFFSET; + offset &= IP_OFFSET; + offset <<= 3; /* offset is in 8-byte chunks */ + ihl = ip_hdrlen(skb); + + /* Determine the position of this fragment. */ + end = offset + skb->len - ihl; + err = -EINVAL; + + /* Is this the final fragment? */ + if ((flags & IP_MF) == 0) { + /* If we already have some bits beyond end + * or have different end, the segment is corrrupted. + */ + if (end < qp->q.len || + ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len)) + goto err; + qp->q.last_in |= INET_FRAG_LAST_IN; + qp->q.len = end; + } else { + if (end&7) { + end &= ~7; + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->ip_summed = CHECKSUM_NONE; + } + if (end > qp->q.len) { + /* Some bits beyond end -> corruption. */ + if (qp->q.last_in & INET_FRAG_LAST_IN) + goto err; + qp->q.len = end; + } + } + if (end == offset) + goto err; + + err = -ENOMEM; + if (pskb_pull(skb, ihl) == NULL) + goto err; + + err = pskb_trim_rcsum(skb, end - offset); + if (err) + goto err; + + /* Find out which fragments are in front and at the back of us + * in the chain of fragments so far. We must know where to put + * this fragment, right? + */ + prev = qp->q.fragments_tail; + if (!prev || FRAG_CB(prev)->offset < offset) { + next = NULL; + goto found; + } + prev = NULL; + for (next = qp->q.fragments; next != NULL; next = next->next) { + if (FRAG_CB(next)->offset >= offset) + break; /* bingo! */ + prev = next; + } + +found: + /* We found where to put this one. Check for overlap with + * preceding fragment, and, if needed, align things so that + * any overlaps are eliminated. + */ + if (prev) { + int i = (FRAG_CB(prev)->offset + prev->len) - offset; + + if (i > 0) { + offset += i; + err = -EINVAL; + if (end <= offset) + goto err; + err = -ENOMEM; + if (!pskb_pull(skb, i)) + goto err; + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->ip_summed = CHECKSUM_NONE; + } + } + + err = -ENOMEM; + + while (next && FRAG_CB(next)->offset < end) { + int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */ + + if (i < next->len) { + /* Eat head of the next overlapped fragment + * and leave the loop. The next ones cannot overlap. + */ + if (!pskb_pull(next, i)) + goto err; + FRAG_CB(next)->offset += i; + qp->q.meat -= i; + if (next->ip_summed != CHECKSUM_UNNECESSARY) + next->ip_summed = CHECKSUM_NONE; + break; + } else { + struct sk_buff *free_it = next; + + /* Old fragment is completely overridden with + * new one drop it. + */ + next = next->next; + + if (prev) + prev->next = next; + else + qp->q.fragments = next; + + qp->q.meat -= free_it->len; + frag_kfree_skb(qp->q.net, free_it); + } + } + + FRAG_CB(skb)->offset = offset; + + /* Insert this fragment in the chain of fragments. 
*/ + skb->next = next; + if (!next) + qp->q.fragments_tail = skb; + if (prev) + prev->next = skb; + else + qp->q.fragments = skb; + + dev = skb->dev; + if (dev) { + qp->iif = dev->ifindex; + skb->dev = NULL; + } + qp->q.stamp = skb->tstamp; + qp->q.meat += skb->len; + qp->ecn |= ecn; + atomic_add(skb->truesize, &qp->q.net->mem); + if (offset == 0) + qp->q.last_in |= INET_FRAG_FIRST_IN; + + if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && + qp->q.meat == qp->q.len) + return ip_frag_reasm(qp, prev, dev); + + write_lock(&ip4_frags.lock); + list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list); + write_unlock(&ip4_frags.lock); + return -EINPROGRESS; + +err: + kfree_skb(skb); + return err; +} + + +/* Build a new IP datagram from all its fragments. */ + +static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, + struct net_device *dev) +{ + struct net *net = container_of(qp->q.net, struct net, ipv4.frags); + struct iphdr *iph; + struct sk_buff *fp, *head = qp->q.fragments; + int len; + int ihlen; + int err; + u8 ecn; + + ipq_kill(qp); + + ecn = ip4_frag_ecn_table[qp->ecn]; + if (unlikely(ecn == 0xff)) { + err = -EINVAL; + goto out_fail; + } + /* Make the one we just received the head. */ + if (prev) { + head = prev->next; + fp = skb_clone(head, GFP_ATOMIC); + if (!fp) + goto out_nomem; + + fp->next = head->next; + if (!fp->next) + qp->q.fragments_tail = fp; + prev->next = fp; + + skb_morph(head, qp->q.fragments); + head->next = qp->q.fragments->next; + + kfree_skb(qp->q.fragments); + qp->q.fragments = head; + } + + WARN_ON(head == NULL); + WARN_ON(FRAG_CB(head)->offset != 0); + + /* Allocate a new buffer for the datagram. */ + ihlen = ip_hdrlen(head); + len = ihlen + qp->q.len; + + err = -E2BIG; + if (len > 65535) + goto out_oversize; + + /* Head of list must not be cloned. */ + if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) + goto out_nomem; + + /* If the first fragment is fragmented itself, we split + * it to two chunks: the first with data and paged part + * and the second, holding only fragments. 
*/ + if (skb_has_frag_list(head)) { + struct sk_buff *clone; + int i, plen = 0; + + if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL) + goto out_nomem; + clone->next = head->next; + head->next = clone; + skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; + skb_frag_list_init(head); + for (i=0; inr_frags; i++) + plen += skb_shinfo(head)->frags[i].size; + clone->len = clone->data_len = head->data_len - plen; + head->data_len -= clone->len; + head->len -= clone->len; + clone->csum = 0; + clone->ip_summed = head->ip_summed; + atomic_add(clone->truesize, &qp->q.net->mem); + } + + skb_shinfo(head)->frag_list = head->next; + skb_push(head, head->data - skb_network_header(head)); + + for (fp=head->next; fp; fp = fp->next) { + head->data_len += fp->len; + head->len += fp->len; + if (head->ip_summed != fp->ip_summed) + head->ip_summed = CHECKSUM_NONE; + else if (head->ip_summed == CHECKSUM_COMPLETE) + head->csum = csum_add(head->csum, fp->csum); + head->truesize += fp->truesize; + } + atomic_sub(head->truesize, &qp->q.net->mem); + + head->next = NULL; + head->dev = dev; + head->tstamp = qp->q.stamp; + + iph = ip_hdr(head); + iph->frag_off = 0; + iph->tot_len = htons(len); + iph->tos |= ecn; + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); + qp->q.fragments = NULL; + qp->q.fragments_tail = NULL; + return 0; + +out_nomem: + LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing " + "queue %p\n", qp); + err = -ENOMEM; + goto out_fail; +out_oversize: + if (net_ratelimit()) + printk(KERN_INFO "Oversized IP packet from %pI4.\n", + &qp->saddr); +out_fail: + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); + return err; +} + +/* Process an incoming IP datagram fragment. */ +int ip_defrag(struct sk_buff *skb, u32 user) +{ + struct ipq *qp; + struct net *net; + + net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); + + /* Start by cleaning up the memory. 
*/ + if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh) + ip_evictor(net); + + /* Lookup (or create) queue header */ + if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) { + int ret; + + spin_lock(&qp->q.lock); + + ret = ip_frag_queue(qp, skb); + + spin_unlock(&qp->q.lock); + ipq_put(qp); + return ret; + } + + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); + kfree_skb(skb); + return -ENOMEM; +} +EXPORT_SYMBOL(ip_defrag); + +#ifdef CONFIG_SYSCTL +static int zero; + +static struct ctl_table ip4_frags_ns_ctl_table[] = { + { + .procname = "ipfrag_high_thresh", + .data = &init_net.ipv4.frags.high_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ipfrag_low_thresh", + .data = &init_net.ipv4.frags.low_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ipfrag_time", + .data = &init_net.ipv4.frags.timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; + +static struct ctl_table ip4_frags_ctl_table[] = { + { + .procname = "ipfrag_secret_interval", + .data = &ip4_frags.secret_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ipfrag_max_dist", + .data = &sysctl_ipfrag_max_dist, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero + }, + { } +}; + +static int __net_init ip4_frags_ns_ctl_register(struct net *net) +{ + struct ctl_table *table; + struct ctl_table_header *hdr; + + table = ip4_frags_ns_ctl_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL); + if (table == NULL) + goto err_alloc; + + table[0].data = &net->ipv4.frags.high_thresh; + table[1].data = &net->ipv4.frags.low_thresh; + table[2].data = &net->ipv4.frags.timeout; + } + + hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table); + if (hdr == NULL) + goto err_reg; + + net->ipv4.frags_hdr = hdr; + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net) +{ + struct ctl_table *table; + + table = net->ipv4.frags_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->ipv4.frags_hdr); + kfree(table); +} + +static void ip4_frags_ctl_register(void) +{ + register_net_sysctl_rotable(net_ipv4_ctl_path, ip4_frags_ctl_table); +} +#else +static inline int ip4_frags_ns_ctl_register(struct net *net) +{ + return 0; +} + +static inline void ip4_frags_ns_ctl_unregister(struct net *net) +{ +} + +static inline void ip4_frags_ctl_register(void) +{ +} +#endif + +static int __net_init ipv4_frags_init_net(struct net *net) +{ + /* + * Fragment cache limits. We will commit 256K at one time. Should we + * cross that limit we will prune down to 192K. This should cope with + * even the most extreme cases without allowing an attacker to + * measurably harm machine performance. + */ + net->ipv4.frags.high_thresh = 256 * 1024; + net->ipv4.frags.low_thresh = 192 * 1024; + /* + * Important NOTE! Fragment queue must be destroyed before MSL expires. + * RFC791 is wrong proposing to prolongate timer each fragment arrival + * by TTL. 
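/*
 * Illustrative sketch, not part of this patch. The comment above describes
 * the two-threshold policy: fragments are accepted until cached memory
 * crosses high_thresh, and the evictor then frees the oldest queues until
 * usage falls back below low_thresh. This is only a toy model of that
 * hysteresis with a fixed, made-up per-queue cost.
 */
#include <stdio.h>

#define HIGH_THRESH (256 * 1024)
#define LOW_THRESH  (192 * 1024)

int main(void)
{
	int mem = 0, i;
	int queue_size = 40 * 1024;	/* pretend every frag queue costs 40K */

	for (i = 0; i < 10; i++) {
		mem += queue_size;	/* a new fragment queue arrives */
		if (mem > HIGH_THRESH) {
			int evicted = 0;

			while (mem > LOW_THRESH) {	/* evict oldest queues */
				mem -= queue_size;
				evicted++;
			}
			printf("step %d: evicted %d queues, mem now %dK\n",
			       i, evicted, mem / 1024);
		}
	}
	return 0;
}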
+ */ + net->ipv4.frags.timeout = IP_FRAG_TIME; + + inet_frags_init_net(&net->ipv4.frags); + + return ip4_frags_ns_ctl_register(net); +} + +static void __net_exit ipv4_frags_exit_net(struct net *net) +{ + ip4_frags_ns_ctl_unregister(net); + inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); +} + +static struct pernet_operations ip4_frags_ops = { + .init = ipv4_frags_init_net, + .exit = ipv4_frags_exit_net, +}; + +void __init ipfrag_init(void) +{ + ip4_frags_ctl_register(); + register_pernet_subsys(&ip4_frags_ops); + ip4_frags.hashfn = ip4_hashfn; + ip4_frags.constructor = ip4_frag_init; + ip4_frags.destructor = ip4_frag_free; + ip4_frags.skb_free = NULL; + ip4_frags.qsize = sizeof(struct ipq); + ip4_frags.match = ip4_frag_match; + ip4_frags.frag_expire = ip_expire; + ip4_frags.secret_interval = 10 * 60 * HZ; + inet_frags_init(&ip4_frags); +} diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c new file mode 100644 index 00000000..d7bb94c4 --- /dev/null +++ b/net/ipv4/ip_gre.c @@ -0,0 +1,1754 @@ +/* + * Linux NET3: GRE over IP protocol decoder. + * + * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#include +#include +#include +#endif + +/* + Problems & solutions + -------------------- + + 1. The most important issue is detecting local dead loops. + They would cause complete host lockup in transmit, which + would be "resolved" by stack overflow or, if queueing is enabled, + with infinite looping in net_bh. + + We cannot track such dead loops during route installation, + it is infeasible task. The most general solutions would be + to keep skb->encapsulation counter (sort of local ttl), + and silently drop packet when it expires. It is a good + solution, but it supposes maintaing new variable in ALL + skb, even if no tunneling is used. + + Current solution: xmit_recursion breaks dead loops. This is a percpu + counter, since when we enter the first ndo_xmit(), cpu migration is + forbidden. We force an exit if this counter reaches RECURSION_LIMIT + + 2. Networking dead loops would not kill routers, but would really + kill network. IP hop limit plays role of "t->recursion" in this case, + if we copy it from packet being encapsulated to upper header. + It is very good solution, but it introduces two problems: + + - Routing protocols, using packets with ttl=1 (OSPF, RIP2), + do not work over tunnels. + - traceroute does not work. I planned to relay ICMP from tunnel, + so that this problem would be solved and traceroute output + would even more informative. This idea appeared to be wrong: + only Linux complies to rfc1812 now (yes, guys, Linux is the only + true router now :-)), all routers (at least, in neighbourhood of mine) + return only 8 bytes of payload. It is the end. + + Hence, if we want that OSPF worked or traceroute said something reasonable, + we should search for another solution. 
+
+ One of them is to parse packet trying to detect inner encapsulation
+ made by our node. It is difficult or even impossible, especially
+ taking into account fragmentation. To be short, it is not a solution at all.
+
+ Current solution: The solution was UNEXPECTEDLY SIMPLE.
+ We force DF flag on tunnels with preconfigured hop limit,
+ that is ALL. :-) Well, it does not remove the problem completely,
+ but exponential growth of network traffic is changed to linear
+ (branches that exceed pmtu are pruned) and tunnel mtu
+ quickly degrades to a value <68, where looping stops.
+ Yes, it is not good if there exists a router in the loop,
+ which does not force DF, even when encapsulating packets have DF set.
+ But it is not our problem! Nobody could accuse us, we made
+ all that we could make. Even if it is your gated who injected
+ fatal route to network, even if it were you who configured
+ fatal static route: you are innocent. :-)
+
+
+
+ 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
+ practically identical code. It would be good to glue them
+ together, but it is not very evident how to make them modular.
+ sit is an integral part of IPv6, ipip and gre are naturally modular.
+ We could extract common parts (hash table, ioctl etc)
+ to a separate module (ip_tunnel.c).
+
+ Alexey Kuznetsov.
+ */
+
+static struct rtnl_link_ops ipgre_link_ops __read_mostly;
+static int ipgre_tunnel_init(struct net_device *dev);
+static void ipgre_tunnel_setup(struct net_device *dev);
+static int ipgre_tunnel_bind_dev(struct net_device *dev);
+
+/* Fallback tunnel: no source, no destination, no key, no options */
+
+#define HASH_SIZE 16
+
+static int ipgre_net_id __read_mostly;
+struct ipgre_net {
+ struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
+
+ struct net_device *fb_tunnel_dev;
+};
+
+/* Tunnel hash table */
+
+/*
+ 4 hash tables:
+
+ 3: (remote,local)
+ 2: (remote,*)
+ 1: (*,local)
+ 0: (*,*)
+
+ We require an exact key match, i.e. if a key is present in the packet
+ it will match only a tunnel with the same key; if it is not present,
+ it will match only a keyless tunnel.
+
+ All keyless packets, if not matched against configured keyless tunnels,
+ will match the fallback tunnel.
+ */
+
+#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
+
+#define tunnels_r_l tunnels[3]
+#define tunnels_r tunnels[2]
+#define tunnels_l tunnels[1]
+#define tunnels_wc tunnels[0]
+/*
+ * Locking : hash tables are protected by RCU and RTNL
+ */
+
+#define for_each_ip_tunnel_rcu(start) \
+ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
+
+/* often modified stats are per cpu, others are shared (netdev->stats) */
+struct pcpu_tstats {
+ unsigned long rx_packets;
+ unsigned long rx_bytes;
+ unsigned long tx_packets;
+ unsigned long tx_bytes;
+};
+
+static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
+{
+ struct pcpu_tstats sum = { 0 };
+ int i;
+
+ for_each_possible_cpu(i) {
+ const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
+
+ sum.rx_packets += tstats->rx_packets;
+ sum.rx_bytes += tstats->rx_bytes;
+ sum.tx_packets += tstats->tx_packets;
+ sum.tx_bytes += tstats->tx_bytes;
+ }
+ dev->stats.rx_packets = sum.rx_packets;
+ dev->stats.rx_bytes = sum.rx_bytes;
+ dev->stats.tx_packets = sum.tx_packets;
+ dev->stats.tx_bytes = sum.tx_bytes;
+ return &dev->stats;
+}
+
+/* Given src, dst and key, find the appropriate input tunnel.
*/ + +static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, + __be32 remote, __be32 local, + __be32 key, __be16 gre_proto) +{ + struct net *net = dev_net(dev); + int link = dev->ifindex; + unsigned int h0 = HASH(remote); + unsigned int h1 = HASH(key); + struct ip_tunnel *t, *cand = NULL; + struct ipgre_net *ign = net_generic(net, ipgre_net_id); + int dev_type = (gre_proto == htons(ETH_P_TEB)) ? + ARPHRD_ETHER : ARPHRD_IPGRE; + int score, cand_score = 4; + + for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) { + if (local != t->parms.iph.saddr || + remote != t->parms.iph.daddr || + key != t->parms.i_key || + !(t->dev->flags & IFF_UP)) + continue; + + if (t->dev->type != ARPHRD_IPGRE && + t->dev->type != dev_type) + continue; + + score = 0; + if (t->parms.link != link) + score |= 1; + if (t->dev->type != dev_type) + score |= 2; + if (score == 0) + return t; + + if (score < cand_score) { + cand = t; + cand_score = score; + } + } + + for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) { + if (remote != t->parms.iph.daddr || + key != t->parms.i_key || + !(t->dev->flags & IFF_UP)) + continue; + + if (t->dev->type != ARPHRD_IPGRE && + t->dev->type != dev_type) + continue; + + score = 0; + if (t->parms.link != link) + score |= 1; + if (t->dev->type != dev_type) + score |= 2; + if (score == 0) + return t; + + if (score < cand_score) { + cand = t; + cand_score = score; + } + } + + for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) { + if ((local != t->parms.iph.saddr && + (local != t->parms.iph.daddr || + !ipv4_is_multicast(local))) || + key != t->parms.i_key || + !(t->dev->flags & IFF_UP)) + continue; + + if (t->dev->type != ARPHRD_IPGRE && + t->dev->type != dev_type) + continue; + + score = 0; + if (t->parms.link != link) + score |= 1; + if (t->dev->type != dev_type) + score |= 2; + if (score == 0) + return t; + + if (score < cand_score) { + cand = t; + cand_score = score; + } + } + + for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) { + if (t->parms.i_key != key || + !(t->dev->flags & IFF_UP)) + continue; + + if (t->dev->type != ARPHRD_IPGRE && + t->dev->type != dev_type) + continue; + + score = 0; + if (t->parms.link != link) + score |= 1; + if (t->dev->type != dev_type) + score |= 2; + if (score == 0) + return t; + + if (score < cand_score) { + cand = t; + cand_score = score; + } + } + + if (cand != NULL) + return cand; + + dev = ign->fb_tunnel_dev; + if (dev->flags & IFF_UP) + return netdev_priv(dev); + + return NULL; +} + +static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign, + struct ip_tunnel_parm *parms) +{ + __be32 remote = parms->iph.daddr; + __be32 local = parms->iph.saddr; + __be32 key = parms->i_key; + unsigned int h = HASH(key); + int prio = 0; + + if (local) + prio |= 1; + if (remote && !ipv4_is_multicast(remote)) { + prio |= 2; + h ^= HASH(remote); + } + + return &ign->tunnels[prio][h]; +} + +static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign, + struct ip_tunnel *t) +{ + return __ipgre_bucket(ign, &t->parms); +} + +static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t) +{ + struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t); + + rcu_assign_pointer(t->next, rtnl_dereference(*tp)); + rcu_assign_pointer(*tp, t); +} + +static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) +{ + struct ip_tunnel __rcu **tp; + struct ip_tunnel *iter; + + for (tp = ipgre_bucket(ign, t); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); + break; + } + } +} + 
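The lookup and bucket code above hinges on a 4-bit fold of the key/address plus a 2-bit "priority" that records which of local/remote are configured. A minimal standalone sketch of that arithmetic, not part of the patch: the demo_ names and sample values are illustrative only, and the real code operates on __be32 values and also excludes multicast remotes.

	#include <stdio.h>
	#include <stdint.h>

	/* Mirrors the HASH() macro above: fold a 32-bit value down to 4 bits. */
	static unsigned int demo_hash(uint32_t v)
	{
		return (v ^ (v >> 4)) & 0xF;
	}

	int main(void)
	{
		uint32_t local  = 0x0a000001;	/* 10.0.0.1, made up for illustration */
		uint32_t remote = 0x0a000002;	/* 10.0.0.2, made up for illustration */
		uint32_t key    = 42;

		unsigned int h = demo_hash(key);
		int prio = 0;

		if (local)
			prio |= 1;		/* a (*,local) or (remote,local) table */
		if (remote) {			/* __ipgre_bucket() also checks for multicast here */
			prio |= 2;
			h ^= demo_hash(remote);
		}

		/* prio == 3, so this tunnel would live in tunnels_r_l, bucket h. */
		printf("tunnels[%d][%u]\n", prio, h);
		return 0;
	}

On receive, ipgre_tunnel_lookup() walks the tables from most to least specific and, among matches, prefers a tunnel whose link and device type also agree with the packet, which is what the score/cand_score bookkeeping above implements.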
+static struct ip_tunnel *ipgre_tunnel_find(struct net *net, + struct ip_tunnel_parm *parms, + int type) +{ + __be32 remote = parms->iph.daddr; + __be32 local = parms->iph.saddr; + __be32 key = parms->i_key; + int link = parms->link; + struct ip_tunnel *t; + struct ip_tunnel __rcu **tp; + struct ipgre_net *ign = net_generic(net, ipgre_net_id); + + for (tp = __ipgre_bucket(ign, parms); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) + if (local == t->parms.iph.saddr && + remote == t->parms.iph.daddr && + key == t->parms.i_key && + link == t->parms.link && + type == t->dev->type) + break; + + return t; +} + +static struct ip_tunnel *ipgre_tunnel_locate(struct net *net, + struct ip_tunnel_parm *parms, int create) +{ + struct ip_tunnel *t, *nt; + struct net_device *dev; + char name[IFNAMSIZ]; + struct ipgre_net *ign = net_generic(net, ipgre_net_id); + + t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE); + if (t || !create) + return t; + + if (parms->name[0]) + strlcpy(name, parms->name, IFNAMSIZ); + else + strcpy(name, "gre%d"); + + dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup); + if (!dev) + return NULL; + + dev_net_set(dev, net); + + nt = netdev_priv(dev); + nt->parms = *parms; + dev->rtnl_link_ops = &ipgre_link_ops; + + dev->mtu = ipgre_tunnel_bind_dev(dev); + + if (register_netdevice(dev) < 0) + goto failed_free; + + dev_hold(dev); + ipgre_tunnel_link(ign, nt); + return nt; + +failed_free: + free_netdev(dev); + return NULL; +} + +static void ipgre_tunnel_uninit(struct net_device *dev) +{ + struct net *net = dev_net(dev); + struct ipgre_net *ign = net_generic(net, ipgre_net_id); + + ipgre_tunnel_unlink(ign, netdev_priv(dev)); + dev_put(dev); +} + + +static void ipgre_err(struct sk_buff *skb, u32 info) +{ + +/* All the routers (except for Linux) return only + 8 bytes of packet payload. It means, that precise relaying of + ICMP in the real Internet is absolutely infeasible. + + Moreover, Cisco "wise men" put GRE key to the third word + in GRE header. It makes impossible maintaining even soft state for keyed + GRE tunnels with enabled checksum. Tell them "thank you". + + Well, I wonder, rfc1812 was written by Cisco employee, + what the hell these idiots break standrads established + by themself??? + */ + + const struct iphdr *iph = (const struct iphdr *)skb->data; + __be16 *p = (__be16*)(skb->data+(iph->ihl<<2)); + int grehlen = (iph->ihl<<2) + 4; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; + struct ip_tunnel *t; + __be16 flags; + + flags = p[0]; + if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { + if (flags&(GRE_VERSION|GRE_ROUTING)) + return; + if (flags&GRE_KEY) { + grehlen += 4; + if (flags&GRE_CSUM) + grehlen += 4; + } + } + + /* If only 8 bytes returned, keyed message will be dropped here */ + if (skb_headlen(skb) < grehlen) + return; + + switch (type) { + default: + case ICMP_PARAMETERPROB: + return; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* Soft state for pmtu is maintained by IP core. */ + return; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe they are just ether pollution. --ANK + */ + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + break; + } + + rcu_read_lock(); + t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr, + flags & GRE_KEY ? 
+ *(((__be32 *)p) + (grehlen / 4) - 1) : 0, + p[1]); + if (t == NULL || t->parms.iph.daddr == 0 || + ipv4_is_multicast(t->parms.iph.daddr)) + goto out; + + if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) + goto out; + + if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) + t->err_count++; + else + t->err_count = 1; + t->err_time = jiffies; +out: + rcu_read_unlock(); +} + +static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb) +{ + if (INET_ECN_is_ce(iph->tos)) { + if (skb->protocol == htons(ETH_P_IP)) { + IP_ECN_set_ce(ip_hdr(skb)); + } else if (skb->protocol == htons(ETH_P_IPV6)) { + IP6_ECN_set_ce(ipv6_hdr(skb)); + } + } +} + +static inline u8 +ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb) +{ + u8 inner = 0; + if (skb->protocol == htons(ETH_P_IP)) + inner = old_iph->tos; + else if (skb->protocol == htons(ETH_P_IPV6)) + inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph); + return INET_ECN_encapsulate(tos, inner); +} + +static int ipgre_rcv(struct sk_buff *skb) +{ + const struct iphdr *iph; + u8 *h; + __be16 flags; + __sum16 csum = 0; + __be32 key = 0; + u32 seqno = 0; + struct ip_tunnel *tunnel; + int offset = 4; + __be16 gre_proto; + + if (!pskb_may_pull(skb, 16)) + goto drop_nolock; + + iph = ip_hdr(skb); + h = skb->data; + flags = *(__be16*)h; + + if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) { + /* - Version must be 0. + - We do not support routing headers. + */ + if (flags&(GRE_VERSION|GRE_ROUTING)) + goto drop_nolock; + + if (flags&GRE_CSUM) { + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + csum = csum_fold(skb->csum); + if (!csum) + break; + /* fall through */ + case CHECKSUM_NONE: + skb->csum = 0; + csum = __skb_checksum_complete(skb); + skb->ip_summed = CHECKSUM_COMPLETE; + } + offset += 4; + } + if (flags&GRE_KEY) { + key = *(__be32*)(h + offset); + offset += 4; + } + if (flags&GRE_SEQ) { + seqno = ntohl(*(__be32*)(h + offset)); + offset += 4; + } + } + + gre_proto = *(__be16 *)(h + 2); + + rcu_read_lock(); + if ((tunnel = ipgre_tunnel_lookup(skb->dev, + iph->saddr, iph->daddr, key, + gre_proto))) { + struct pcpu_tstats *tstats; + + secpath_reset(skb); + + skb->protocol = gre_proto; + /* WCCP version 1 and 2 protocol decoding. + * - Change protocol to IP + * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header + */ + if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) { + skb->protocol = htons(ETH_P_IP); + if ((*(h + offset) & 0xF0) != 0x40) + offset += 4; + } + + skb->mac_header = skb->network_header; + __pskb_pull(skb, offset); + skb_postpull_rcsum(skb, skb_transport_header(skb), offset); + skb->pkt_type = PACKET_HOST; +#ifdef CONFIG_NET_IPGRE_BROADCAST + if (ipv4_is_multicast(iph->daddr)) { + /* Looped back packet, drop it! */ + if (rt_is_output_route(skb_rtable(skb))) + goto drop; + tunnel->dev->stats.multicast++; + skb->pkt_type = PACKET_BROADCAST; + } +#endif + + if (((flags&GRE_CSUM) && csum) || + (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { + tunnel->dev->stats.rx_crc_errors++; + tunnel->dev->stats.rx_errors++; + goto drop; + } + if (tunnel->parms.i_flags&GRE_SEQ) { + if (!(flags&GRE_SEQ) || + (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) { + tunnel->dev->stats.rx_fifo_errors++; + tunnel->dev->stats.rx_errors++; + goto drop; + } + tunnel->i_seqno = seqno + 1; + } + + /* Warning: All skb pointers will be invalidated! 
*/ + if (tunnel->dev->type == ARPHRD_ETHER) { + if (!pskb_may_pull(skb, ETH_HLEN)) { + tunnel->dev->stats.rx_length_errors++; + tunnel->dev->stats.rx_errors++; + goto drop; + } + + iph = ip_hdr(skb); + skb->protocol = eth_type_trans(skb, tunnel->dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + } + + tstats = this_cpu_ptr(tunnel->dev->tstats); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + + __skb_tunnel_rx(skb, tunnel->dev); + + skb_reset_network_header(skb); + ipgre_ecn_decapsulate(iph, skb); + + netif_rx(skb); + + rcu_read_unlock(); + return 0; + } + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + +drop: + rcu_read_unlock(); +drop_nolock: + kfree_skb(skb); + return 0; +} + +static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct pcpu_tstats *tstats; + const struct iphdr *old_iph = ip_hdr(skb); + const struct iphdr *tiph; + struct flowi4 fl4; + u8 tos; + __be16 df; + struct rtable *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + struct iphdr *iph; /* Our new IP header */ + unsigned int max_headroom; /* The extra header space needed */ + int gre_hlen; + __be32 dst; + int mtu; + + if (dev->type == ARPHRD_ETHER) + IPCB(skb)->flags = 0; + + if (dev->header_ops && dev->type == ARPHRD_IPGRE) { + gre_hlen = 0; + tiph = (const struct iphdr *)skb->data; + } else { + gre_hlen = tunnel->hlen; + tiph = &tunnel->parms.iph; + } + + if ((dst = tiph->daddr) == 0) { + /* NBMA tunnel */ + + if (skb_dst(skb) == NULL) { + dev->stats.tx_fifo_errors++; + goto tx_error; + } + + if (skb->protocol == htons(ETH_P_IP)) { + rt = skb_rtable(skb); + if ((dst = rt->rt_gateway) == 0) + goto tx_error_icmp; + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (skb->protocol == htons(ETH_P_IPV6)) { + struct neighbour *neigh = dst_get_neighbour(skb_dst(skb)); + const struct in6_addr *addr6; + int addr_type; + + if (neigh == NULL) + goto tx_error; + + addr6 = (const struct in6_addr *)&neigh->primary_key; + addr_type = ipv6_addr_type(addr6); + + if (addr_type == IPV6_ADDR_ANY) { + addr6 = &ipv6_hdr(skb)->daddr; + addr_type = ipv6_addr_type(addr6); + } + + if ((addr_type & IPV6_ADDR_COMPATv4) == 0) + goto tx_error_icmp; + + dst = addr6->s6_addr32[3]; + } +#endif + else + goto tx_error; + } + + tos = tiph->tos; + if (tos == 1) { + tos = 0; + if (skb->protocol == htons(ETH_P_IP)) + tos = old_iph->tos; + else if (skb->protocol == htons(ETH_P_IPV6)) + tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph); + } + + rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr, + tunnel->parms.o_key, RT_TOS(tos), + tunnel->parms.link); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error; + } + tdev = rt->dst.dev; + + if (tdev == dev) { + ip_rt_put(rt); + dev->stats.collisions++; + goto tx_error; + } + + df = tiph->frag_off; + if (df) + mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen; + else + mtu = skb_dst(skb) ? 
dst_mtu(skb_dst(skb)) : dev->mtu; + + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + + if (skb->protocol == htons(ETH_P_IP)) { + df |= (old_iph->frag_off&htons(IP_DF)); + + if ((old_iph->frag_off&htons(IP_DF)) && + mtu < ntohs(old_iph->tot_len)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + goto tx_error; + } + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (skb->protocol == htons(ETH_P_IPV6)) { + struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); + + if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) { + if ((tunnel->parms.iph.daddr && + !ipv4_is_multicast(tunnel->parms.iph.daddr)) || + rt6->rt6i_dst.plen == 128) { + rt6->rt6i_flags |= RTF_MODIFIED; + dst_metric_set(skb_dst(skb), RTAX_MTU, mtu); + } + } + + if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + ip_rt_put(rt); + goto tx_error; + } + } +#endif + + if (tunnel->err_count > 0) { + if (time_before(jiffies, + tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { + tunnel->err_count--; + + dst_link_failure(skb); + } else + tunnel->err_count = 0; + } + + max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len; + + if (skb_headroom(skb) < max_headroom || skb_shared(skb)|| + (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { + struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); + if (max_headroom > dev->needed_headroom) + dev->needed_headroom = max_headroom; + if (!new_skb) { + ip_rt_put(rt); + dev->stats.tx_dropped++; + dev_kfree_skb(skb); + return NETDEV_TX_OK; + } + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); + dev_kfree_skb(skb); + skb = new_skb; + old_iph = ip_hdr(skb); + } + + skb_reset_transport_header(skb); + skb_push(skb, gre_hlen); + skb_reset_network_header(skb); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | + IPSKB_REROUTED); + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + + /* + * Push down and install the IPIP header. + */ + + iph = ip_hdr(skb); + iph->version = 4; + iph->ihl = sizeof(struct iphdr) >> 2; + iph->frag_off = df; + iph->protocol = IPPROTO_GRE; + iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb); + iph->daddr = fl4.daddr; + iph->saddr = fl4.saddr; + + if ((iph->ttl = tiph->ttl) == 0) { + if (skb->protocol == htons(ETH_P_IP)) + iph->ttl = old_iph->ttl; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (skb->protocol == htons(ETH_P_IPV6)) + iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit; +#endif + else + iph->ttl = ip4_dst_hoplimit(&rt->dst); + } + + ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags; + ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ? 
+ htons(ETH_P_TEB) : skb->protocol; + + if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { + __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4); + + if (tunnel->parms.o_flags&GRE_SEQ) { + ++tunnel->o_seqno; + *ptr = htonl(tunnel->o_seqno); + ptr--; + } + if (tunnel->parms.o_flags&GRE_KEY) { + *ptr = tunnel->parms.o_key; + ptr--; + } + if (tunnel->parms.o_flags&GRE_CSUM) { + *ptr = 0; + *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr)); + } + } + + nf_reset(skb); + tstats = this_cpu_ptr(dev->tstats); + __IPTUNNEL_XMIT(tstats, &dev->stats); + return NETDEV_TX_OK; + +tx_error_icmp: + dst_link_failure(skb); + +tx_error: + dev->stats.tx_errors++; + dev_kfree_skb(skb); + return NETDEV_TX_OK; +} + +static int ipgre_tunnel_bind_dev(struct net_device *dev) +{ + struct net_device *tdev = NULL; + struct ip_tunnel *tunnel; + const struct iphdr *iph; + int hlen = LL_MAX_HEADER; + int mtu = ETH_DATA_LEN; + int addend = sizeof(struct iphdr) + 4; + + tunnel = netdev_priv(dev); + iph = &tunnel->parms.iph; + + /* Guess output device to choose reasonable mtu and needed_headroom */ + + if (iph->daddr) { + struct flowi4 fl4; + struct rtable *rt; + + rt = ip_route_output_gre(dev_net(dev), &fl4, + iph->daddr, iph->saddr, + tunnel->parms.o_key, + RT_TOS(iph->tos), + tunnel->parms.link); + if (!IS_ERR(rt)) { + tdev = rt->dst.dev; + ip_rt_put(rt); + } + + if (dev->type != ARPHRD_ETHER) + dev->flags |= IFF_POINTOPOINT; + } + + if (!tdev && tunnel->parms.link) + tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); + + if (tdev) { + hlen = tdev->hard_header_len + tdev->needed_headroom; + mtu = tdev->mtu; + } + dev->iflink = tunnel->parms.link; + + /* Precalculate GRE options length */ + if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) { + if (tunnel->parms.o_flags&GRE_CSUM) + addend += 4; + if (tunnel->parms.o_flags&GRE_KEY) + addend += 4; + if (tunnel->parms.o_flags&GRE_SEQ) + addend += 4; + } + dev->needed_headroom = addend + hlen; + mtu -= dev->hard_header_len + addend; + + if (mtu < 68) + mtu = 68; + + tunnel->hlen = addend; + + return mtu; +} + +static int +ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + struct ip_tunnel_parm p; + struct ip_tunnel *t; + struct net *net = dev_net(dev); + struct ipgre_net *ign = net_generic(net, ipgre_net_id); + + switch (cmd) { + case SIOCGETTUNNEL: + t = NULL; + if (dev == ign->fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + err = -EFAULT; + break; + } + t = ipgre_tunnel_locate(net, &p, 0); + } + if (t == NULL) + t = netdev_priv(dev); + memcpy(&p, &t->parms, sizeof(p)); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + break; + + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + + err = -EINVAL; + if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || + p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) || + ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) + goto done; + if (p.iph.ttl) + p.iph.frag_off |= htons(IP_DF); + + if (!(p.i_flags&GRE_KEY)) + p.i_key = 0; + if (!(p.o_flags&GRE_KEY)) + p.o_key = 0; + + t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); + + if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { + if (t != NULL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + } else { + unsigned int nflags = 0; + + t = netdev_priv(dev); + + if 
(ipv4_is_multicast(p.iph.daddr)) + nflags = IFF_BROADCAST; + else if (p.iph.daddr) + nflags = IFF_POINTOPOINT; + + if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) { + err = -EINVAL; + break; + } + ipgre_tunnel_unlink(ign, t); + synchronize_net(); + t->parms.iph.saddr = p.iph.saddr; + t->parms.iph.daddr = p.iph.daddr; + t->parms.i_key = p.i_key; + t->parms.o_key = p.o_key; + memcpy(dev->dev_addr, &p.iph.saddr, 4); + memcpy(dev->broadcast, &p.iph.daddr, 4); + ipgre_tunnel_link(ign, t); + netdev_state_change(dev); + } + } + + if (t) { + err = 0; + if (cmd == SIOCCHGTUNNEL) { + t->parms.iph.ttl = p.iph.ttl; + t->parms.iph.tos = p.iph.tos; + t->parms.iph.frag_off = p.iph.frag_off; + if (t->parms.link != p.link) { + t->parms.link = p.link; + dev->mtu = ipgre_tunnel_bind_dev(dev); + netdev_state_change(dev); + } + } + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) + err = -EFAULT; + } else + err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); + break; + + case SIOCDELTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + if (dev == ign->fb_tunnel_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + err = -ENOENT; + if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL) + goto done; + err = -EPERM; + if (t == netdev_priv(ign->fb_tunnel_dev)) + goto done; + dev = t->dev; + } + unregister_netdevice(dev); + err = 0; + break; + + default: + err = -EINVAL; + } + +done: + return err; +} + +static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + if (new_mtu < 68 || + new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +/* Nice toy. Unfortunately, useless in real life :-) + It allows to construct virtual multiprotocol broadcast "LAN" + over the Internet, provided multicast routing is tuned. + + + I have no idea was this bicycle invented before me, + so that I had to set ARPHRD_IPGRE to a random value. + I have an impression, that Cisco could make something similar, + but this feature is apparently missing in IOS<=11.2(8). + + I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks + with broadcast 224.66.66.66. If you have access to mbone, play with me :-) + + ping -t 255 224.66.66.66 + + If nobody answers, mbone does not work. + + ip tunnel add Universe mode gre remote 224.66.66.66 local ttl 255 + ip addr add 10.66.66./24 dev Universe + ifconfig Universe up + ifconfig Universe add fe80::/10 + ifconfig Universe add fec0:6666:6666::/96 + ftp 10.66.66.66 + ... + ftp fec0:6666:6666::193.233.7.65 + ... + + */ + +static int ipgre_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, + const void *daddr, const void *saddr, unsigned int len) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); + __be16 *p = (__be16*)(iph+1); + + memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); + p[0] = t->parms.o_flags; + p[1] = htons(type); + + /* + * Set the source hardware address. 
+ */ + + if (saddr) + memcpy(&iph->saddr, saddr, 4); + if (daddr) + memcpy(&iph->daddr, daddr, 4); + if (iph->daddr) + return t->hlen; + + return -t->hlen; +} + +static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr) +{ + const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb); + memcpy(haddr, &iph->saddr, 4); + return 4; +} + +static const struct header_ops ipgre_header_ops = { + .create = ipgre_header, + .parse = ipgre_header_parse, +}; + +#ifdef CONFIG_NET_IPGRE_BROADCAST +static int ipgre_open(struct net_device *dev) +{ + struct ip_tunnel *t = netdev_priv(dev); + + if (ipv4_is_multicast(t->parms.iph.daddr)) { + struct flowi4 fl4; + struct rtable *rt; + + rt = ip_route_output_gre(dev_net(dev), &fl4, + t->parms.iph.daddr, + t->parms.iph.saddr, + t->parms.o_key, + RT_TOS(t->parms.iph.tos), + t->parms.link); + if (IS_ERR(rt)) + return -EADDRNOTAVAIL; + dev = rt->dst.dev; + ip_rt_put(rt); + if (__in_dev_get_rtnl(dev) == NULL) + return -EADDRNOTAVAIL; + t->mlink = dev->ifindex; + ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr); + } + return 0; +} + +static int ipgre_close(struct net_device *dev) +{ + struct ip_tunnel *t = netdev_priv(dev); + + if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { + struct in_device *in_dev; + in_dev = inetdev_by_index(dev_net(dev), t->mlink); + if (in_dev) + ip_mc_dec_group(in_dev, t->parms.iph.daddr); + } + return 0; +} + +#endif + +static const struct net_device_ops ipgre_netdev_ops = { + .ndo_init = ipgre_tunnel_init, + .ndo_uninit = ipgre_tunnel_uninit, +#ifdef CONFIG_NET_IPGRE_BROADCAST + .ndo_open = ipgre_open, + .ndo_stop = ipgre_close, +#endif + .ndo_start_xmit = ipgre_tunnel_xmit, + .ndo_do_ioctl = ipgre_tunnel_ioctl, + .ndo_change_mtu = ipgre_tunnel_change_mtu, + .ndo_get_stats = ipgre_get_stats, +}; + +static void ipgre_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + +static void ipgre_tunnel_setup(struct net_device *dev) +{ + dev->netdev_ops = &ipgre_netdev_ops; + dev->destructor = ipgre_dev_free; + + dev->type = ARPHRD_IPGRE; + dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; + dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4; + dev->flags = IFF_NOARP; + dev->iflink = 0; + dev->addr_len = 4; + dev->features |= NETIF_F_NETNS_LOCAL; + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; +} + +static int ipgre_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel; + struct iphdr *iph; + + tunnel = netdev_priv(dev); + iph = &tunnel->parms.iph; + + tunnel->dev = dev; + strcpy(tunnel->parms.name, dev->name); + + memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); + memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); + + if (iph->daddr) { +#ifdef CONFIG_NET_IPGRE_BROADCAST + if (ipv4_is_multicast(iph->daddr)) { + if (!iph->saddr) + return -EINVAL; + dev->flags = IFF_BROADCAST; + dev->header_ops = &ipgre_header_ops; + } +#endif + } else + dev->header_ops = &ipgre_header_ops; + + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + + return 0; +} + +static void ipgre_fb_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct iphdr *iph = &tunnel->parms.iph; + + tunnel->dev = dev; + strcpy(tunnel->parms.name, dev->name); + + iph->version = 4; + iph->protocol = IPPROTO_GRE; + iph->ihl = 5; + tunnel->hlen = sizeof(struct iphdr) + 4; + + dev_hold(dev); +} + + +static const struct gre_protocol ipgre_protocol = { + .handler = ipgre_rcv, + .err_handler = ipgre_err, +}; + 
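Several of the routines above (ipgre_tunnel_setup(), ipgre_fb_tunnel_init(), and earlier ipgre_tunnel_bind_dev()) share the same encapsulation-overhead arithmetic: 20 bytes of outer IPv4 header, 4 mandatory GRE bytes, and one extra 32-bit word for each of the checksum, key and sequence options. A hedged standalone sketch of that calculation; the DEMO_* values mirror the on-wire GRE flag bit positions but are my stand-ins, not the kernel's definitions.

	#include <stdio.h>

	#define DEMO_GRE_CSUM 0x8000	/* illustrative stand-ins for GRE_CSUM/KEY/SEQ */
	#define DEMO_GRE_KEY  0x2000
	#define DEMO_GRE_SEQ  0x1000

	/* Same arithmetic as the "addend" in ipgre_tunnel_bind_dev():
	 * outer IPv4 header + mandatory GRE header + 4 bytes per enabled option. */
	static int demo_gre_overhead(unsigned int o_flags)
	{
		int len = 20 + 4;

		if (o_flags & DEMO_GRE_CSUM)
			len += 4;
		if (o_flags & DEMO_GRE_KEY)
			len += 4;
		if (o_flags & DEMO_GRE_SEQ)
			len += 4;
		return len;
	}

	int main(void)
	{
		/* The fallback gre0 device carries no options: 24 bytes, hence the
		 * ETH_DATA_LEN - 20 - 4 MTU chosen in ipgre_tunnel_setup(). */
		printf("plain:          %d bytes\n", demo_gre_overhead(0));
		printf("key + checksum: %d bytes\n",
		       demo_gre_overhead(DEMO_GRE_KEY | DEMO_GRE_CSUM));
		return 0;
	}

The same addend, plus the underlying device's headroom, is what ipgre_tunnel_bind_dev() feeds into dev->needed_headroom.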
+static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) +{ + int prio; + + for (prio = 0; prio < 4; prio++) { + int h; + for (h = 0; h < HASH_SIZE; h++) { + struct ip_tunnel *t; + + t = rtnl_dereference(ign->tunnels[prio][h]); + + while (t != NULL) { + unregister_netdevice_queue(t->dev, head); + t = rtnl_dereference(t->next); + } + } + } +} + +static int __net_init ipgre_init_net(struct net *net) +{ + struct ipgre_net *ign = net_generic(net, ipgre_net_id); + int err; + + ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0", + ipgre_tunnel_setup); + if (!ign->fb_tunnel_dev) { + err = -ENOMEM; + goto err_alloc_dev; + } + dev_net_set(ign->fb_tunnel_dev, net); + + ipgre_fb_tunnel_init(ign->fb_tunnel_dev); + ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops; + + if ((err = register_netdev(ign->fb_tunnel_dev))) + goto err_reg_dev; + + rcu_assign_pointer(ign->tunnels_wc[0], + netdev_priv(ign->fb_tunnel_dev)); + return 0; + +err_reg_dev: + ipgre_dev_free(ign->fb_tunnel_dev); +err_alloc_dev: + return err; +} + +static void __net_exit ipgre_exit_net(struct net *net) +{ + struct ipgre_net *ign; + LIST_HEAD(list); + + ign = net_generic(net, ipgre_net_id); + rtnl_lock(); + ipgre_destroy_tunnels(ign, &list); + unregister_netdevice_many(&list); + rtnl_unlock(); +} + +static struct pernet_operations ipgre_net_ops = { + .init = ipgre_init_net, + .exit = ipgre_exit_net, + .id = &ipgre_net_id, + .size = sizeof(struct ipgre_net), +}; + +static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + __be16 flags; + + if (!data) + return 0; + + flags = 0; + if (data[IFLA_GRE_IFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); + if (data[IFLA_GRE_OFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); + if (flags & (GRE_VERSION|GRE_ROUTING)) + return -EINVAL; + + return 0; +} + +static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + __be32 daddr; + + if (tb[IFLA_ADDRESS]) { + if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) + return -EINVAL; + if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) + return -EADDRNOTAVAIL; + } + + if (!data) + goto out; + + if (data[IFLA_GRE_REMOTE]) { + memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4); + if (!daddr) + return -EINVAL; + } + +out: + return ipgre_tunnel_validate(tb, data); +} + +static void ipgre_netlink_parms(struct nlattr *data[], + struct ip_tunnel_parm *parms) +{ + memset(parms, 0, sizeof(*parms)); + + parms->iph.protocol = IPPROTO_GRE; + + if (!data) + return; + + if (data[IFLA_GRE_LINK]) + parms->link = nla_get_u32(data[IFLA_GRE_LINK]); + + if (data[IFLA_GRE_IFLAGS]) + parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]); + + if (data[IFLA_GRE_OFLAGS]) + parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]); + + if (data[IFLA_GRE_IKEY]) + parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); + + if (data[IFLA_GRE_OKEY]) + parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]); + + if (data[IFLA_GRE_LOCAL]) + parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]); + + if (data[IFLA_GRE_REMOTE]) + parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]); + + if (data[IFLA_GRE_TTL]) + parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]); + + if (data[IFLA_GRE_TOS]) + parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]); + + if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) + parms->iph.frag_off = htons(IP_DF); +} + +static int ipgre_tap_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel; + + tunnel = netdev_priv(dev); + + tunnel->dev = dev; + strcpy(tunnel->parms.name, 
dev->name); + + ipgre_tunnel_bind_dev(dev); + + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + + return 0; +} + +static const struct net_device_ops ipgre_tap_netdev_ops = { + .ndo_init = ipgre_tap_init, + .ndo_uninit = ipgre_tunnel_uninit, + .ndo_start_xmit = ipgre_tunnel_xmit, + .ndo_set_mac_address = eth_mac_addr, + .ndo_validate_addr = eth_validate_addr, + .ndo_change_mtu = ipgre_tunnel_change_mtu, + .ndo_get_stats = ipgre_get_stats, +}; + +static void ipgre_tap_setup(struct net_device *dev) +{ + + ether_setup(dev); + + dev->netdev_ops = &ipgre_tap_netdev_ops; + dev->destructor = ipgre_dev_free; + + dev->iflink = 0; + dev->features |= NETIF_F_NETNS_LOCAL; +} + +static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[]) +{ + struct ip_tunnel *nt; + struct net *net = dev_net(dev); + struct ipgre_net *ign = net_generic(net, ipgre_net_id); + int mtu; + int err; + + nt = netdev_priv(dev); + ipgre_netlink_parms(data, &nt->parms); + + if (ipgre_tunnel_find(net, &nt->parms, dev->type)) + return -EEXIST; + + if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) + random_ether_addr(dev->dev_addr); + + mtu = ipgre_tunnel_bind_dev(dev); + if (!tb[IFLA_MTU]) + dev->mtu = mtu; + + /* Can use a lockless transmit, unless we generate output sequences */ + if (!(nt->parms.o_flags & GRE_SEQ)) + dev->features |= NETIF_F_LLTX; + + err = register_netdevice(dev); + if (err) + goto out; + + dev_hold(dev); + ipgre_tunnel_link(ign, nt); + +out: + return err; +} + +static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[]) +{ + struct ip_tunnel *t, *nt; + struct net *net = dev_net(dev); + struct ipgre_net *ign = net_generic(net, ipgre_net_id); + struct ip_tunnel_parm p; + int mtu; + + if (dev == ign->fb_tunnel_dev) + return -EINVAL; + + nt = netdev_priv(dev); + ipgre_netlink_parms(data, &p); + + t = ipgre_tunnel_locate(net, &p, 0); + + if (t) { + if (t->dev != dev) + return -EEXIST; + } else { + t = nt; + + if (dev->type != ARPHRD_ETHER) { + unsigned int nflags = 0; + + if (ipv4_is_multicast(p.iph.daddr)) + nflags = IFF_BROADCAST; + else if (p.iph.daddr) + nflags = IFF_POINTOPOINT; + + if ((dev->flags ^ nflags) & + (IFF_POINTOPOINT | IFF_BROADCAST)) + return -EINVAL; + } + + ipgre_tunnel_unlink(ign, t); + t->parms.iph.saddr = p.iph.saddr; + t->parms.iph.daddr = p.iph.daddr; + t->parms.i_key = p.i_key; + if (dev->type != ARPHRD_ETHER) { + memcpy(dev->dev_addr, &p.iph.saddr, 4); + memcpy(dev->broadcast, &p.iph.daddr, 4); + } + ipgre_tunnel_link(ign, t); + netdev_state_change(dev); + } + + t->parms.o_key = p.o_key; + t->parms.iph.ttl = p.iph.ttl; + t->parms.iph.tos = p.iph.tos; + t->parms.iph.frag_off = p.iph.frag_off; + + if (t->parms.link != p.link) { + t->parms.link = p.link; + mtu = ipgre_tunnel_bind_dev(dev); + if (!tb[IFLA_MTU]) + dev->mtu = mtu; + netdev_state_change(dev); + } + + return 0; +} + +static size_t ipgre_get_size(const struct net_device *dev) +{ + return + /* IFLA_GRE_LINK */ + nla_total_size(4) + + /* IFLA_GRE_IFLAGS */ + nla_total_size(2) + + /* IFLA_GRE_OFLAGS */ + nla_total_size(2) + + /* IFLA_GRE_IKEY */ + nla_total_size(4) + + /* IFLA_GRE_OKEY */ + nla_total_size(4) + + /* IFLA_GRE_LOCAL */ + nla_total_size(4) + + /* IFLA_GRE_REMOTE */ + nla_total_size(4) + + /* IFLA_GRE_TTL */ + nla_total_size(1) + + /* IFLA_GRE_TOS */ + nla_total_size(1) + + /* IFLA_GRE_PMTUDISC */ + nla_total_size(1) + + 0; +} + +static int ipgre_fill_info(struct sk_buff *skb, const struct 
net_device *dev) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm *p = &t->parms; + + NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link); + NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags); + NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags); + NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key); + NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key); + NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr); + NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr); + NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl); + NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos); + NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF))); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { + [IFLA_GRE_LINK] = { .type = NLA_U32 }, + [IFLA_GRE_IFLAGS] = { .type = NLA_U16 }, + [IFLA_GRE_OFLAGS] = { .type = NLA_U16 }, + [IFLA_GRE_IKEY] = { .type = NLA_U32 }, + [IFLA_GRE_OKEY] = { .type = NLA_U32 }, + [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, + [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, + [IFLA_GRE_TTL] = { .type = NLA_U8 }, + [IFLA_GRE_TOS] = { .type = NLA_U8 }, + [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 }, +}; + +static struct rtnl_link_ops ipgre_link_ops __read_mostly = { + .kind = "gre", + .maxtype = IFLA_GRE_MAX, + .policy = ipgre_policy, + .priv_size = sizeof(struct ip_tunnel), + .setup = ipgre_tunnel_setup, + .validate = ipgre_tunnel_validate, + .newlink = ipgre_newlink, + .changelink = ipgre_changelink, + .get_size = ipgre_get_size, + .fill_info = ipgre_fill_info, +}; + +static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { + .kind = "gretap", + .maxtype = IFLA_GRE_MAX, + .policy = ipgre_policy, + .priv_size = sizeof(struct ip_tunnel), + .setup = ipgre_tap_setup, + .validate = ipgre_tap_validate, + .newlink = ipgre_newlink, + .changelink = ipgre_changelink, + .get_size = ipgre_get_size, + .fill_info = ipgre_fill_info, +}; + +/* + * And now the modules code and kernel interface. + */ + +static int __init ipgre_init(void) +{ + int err; + + printk(KERN_INFO "GRE over IPv4 tunneling driver\n"); + + err = register_pernet_device(&ipgre_net_ops); + if (err < 0) + return err; + + err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); + if (err < 0) { + printk(KERN_INFO "ipgre init: can't add protocol\n"); + goto add_proto_failed; + } + + err = rtnl_link_register(&ipgre_link_ops); + if (err < 0) + goto rtnl_link_failed; + + err = rtnl_link_register(&ipgre_tap_ops); + if (err < 0) + goto tap_ops_failed; + +out: + return err; + +tap_ops_failed: + rtnl_link_unregister(&ipgre_link_ops); +rtnl_link_failed: + gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); +add_proto_failed: + unregister_pernet_device(&ipgre_net_ops); + goto out; +} + +static void __exit ipgre_fini(void) +{ + rtnl_link_unregister(&ipgre_tap_ops); + rtnl_link_unregister(&ipgre_link_ops); + if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) + printk(KERN_INFO "ipgre close: can't remove protocol\n"); + unregister_pernet_device(&ipgre_net_ops); +} + +module_init(ipgre_init); +module_exit(ipgre_fini); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("gre"); +MODULE_ALIAS_RTNL_LINK("gretap"); +MODULE_ALIAS_NETDEV("gre0"); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c new file mode 100644 index 00000000..c8f48efc --- /dev/null +++ b/net/ipv4/ip_input.c @@ -0,0 +1,452 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. 
INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The Internet Protocol (IP) module. + * + * Authors: Ross Biro + * Fred N. van Kempen, + * Donald Becker, + * Alan Cox, + * Richard Underwood + * Stefan Becker, + * Jorge Cwik, + * Arnt Gulbrandsen, + * + * + * Fixes: + * Alan Cox : Commented a couple of minor bits of surplus code + * Alan Cox : Undefining IP_FORWARD doesn't include the code + * (just stops a compiler warning). + * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes + * are junked rather than corrupting things. + * Alan Cox : Frames to bad broadcast subnets are dumped + * We used to process them non broadcast and + * boy could that cause havoc. + * Alan Cox : ip_forward sets the free flag on the + * new frame it queues. Still crap because + * it copies the frame but at least it + * doesn't eat memory too. + * Alan Cox : Generic queue code and memory fixes. + * Fred Van Kempen : IP fragment support (borrowed from NET2E) + * Gerhard Koerting: Forward fragmented frames correctly. + * Gerhard Koerting: Fixes to my fix of the above 8-). + * Gerhard Koerting: IP interface addressing fix. + * Linus Torvalds : More robustness checks + * Alan Cox : Even more checks: Still not as robust as it ought to be + * Alan Cox : Save IP header pointer for later + * Alan Cox : ip option setting + * Alan Cox : Use ip_tos/ip_ttl settings + * Alan Cox : Fragmentation bogosity removed + * (Thanks to Mark.Bush@prg.ox.ac.uk) + * Dmitry Gorodchanin : Send of a raw packet crash fix. + * Alan Cox : Silly ip bug when an overlength + * fragment turns up. Now frees the + * queue. + * Linus Torvalds/ : Memory leakage on fragmentation + * Alan Cox : handling. + * Gerhard Koerting: Forwarding uses IP priority hints + * Teemu Rantanen : Fragment problems. + * Alan Cox : General cleanup, comments and reformat + * Alan Cox : SNMP statistics + * Alan Cox : BSD address rule semantics. Also see + * UDP as there is a nasty checksum issue + * if you do things the wrong way. + * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file + * Alan Cox : IP options adjust sk->priority. + * Pedro Roque : Fix mtu/length error in ip_forward. + * Alan Cox : Avoid ip_chk_addr when possible. + * Richard Underwood : IP multicasting. + * Alan Cox : Cleaned up multicast handlers. + * Alan Cox : RAW sockets demultiplex in the BSD style. + * Gunther Mayer : Fix the SNMP reporting typo + * Alan Cox : Always in group 224.0.0.1 + * Pauline Middelink : Fast ip_checksum update when forwarding + * Masquerading support. + * Alan Cox : Multicast loopback error for 224.0.0.1 + * Alan Cox : IP_MULTICAST_LOOP option. + * Alan Cox : Use notifiers. + * Bjorn Ekwall : Removed ip_csum (from slhc.c too) + * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!) + * Stefan Becker : Send out ICMP HOST REDIRECT + * Arnt Gulbrandsen : ip_build_xmit + * Alan Cox : Per socket routing cache + * Alan Cox : Fixed routing cache, added header cache. + * Alan Cox : Loopback didn't work right in original ip_build_xmit - fixed it. + * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net. + * Alan Cox : Incoming IP option handling. + * Alan Cox : Set saddr on raw output frames as per BSD. + * Alan Cox : Stopped broadcast source route explosions. + * Alan Cox : Can disable source routing + * Takeshi Sone : Masquerading didn't work. + * Dave Bonn,Alan Cox : Faster IP forwarding whenever possible. + * Alan Cox : Memory leaks, tramples, misc debugging. 
+ * Alan Cox : Fixed multicast (by popular demand 8)) + * Alan Cox : Fixed forwarding (by even more popular demand 8)) + * Alan Cox : Fixed SNMP statistics [I think] + * Gerhard Koerting : IP fragmentation forwarding fix + * Alan Cox : Device lock against page fault. + * Alan Cox : IP_HDRINCL facility. + * Werner Almesberger : Zero fragment bug + * Alan Cox : RAW IP frame length bug + * Alan Cox : Outgoing firewall on build_xmit + * A.N.Kuznetsov : IP_OPTIONS support throughout the kernel + * Alan Cox : Multicast routing hooks + * Jos Vos : Do accounting *before* call_in_firewall + * Willy Konynenberg : Transparent proxying support + * + * + * + * To Fix: + * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient + * and could be made very efficient with the addition of some virtual memory hacks to permit + * the allocation of a buffer that can then be 'grown' by twiddling page tables. + * Output fragmentation wants updating along with the buffer management to use a single + * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet + * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause + * fragmentation anyway. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Process Router Attention IP option (RFC 2113) + */ +int ip_call_ra_chain(struct sk_buff *skb) +{ + struct ip_ra_chain *ra; + u8 protocol = ip_hdr(skb)->protocol; + struct sock *last = NULL; + struct net_device *dev = skb->dev; + + for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) { + struct sock *sk = ra->sk; + + /* If socket is bound to an interface, only report + * the packet if it came from that interface. + */ + if (sk && inet_sk(sk)->inet_num == protocol && + (!sk->sk_bound_dev_if || + sk->sk_bound_dev_if == dev->ifindex) && + net_eq(sock_net(sk), dev_net(dev))) { + if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { + if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) + return 1; + } + if (last) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + raw_rcv(last, skb2); + } + last = sk; + } + } + + if (last) { + raw_rcv(last, skb); + return 1; + } + return 0; +} + +static int ip_local_deliver_finish(struct sk_buff *skb) +{ + struct net *net = dev_net(skb->dev); + + __skb_pull(skb, ip_hdrlen(skb)); + + /* Point into the IP datagram, just past the header. 
*/ + skb_reset_transport_header(skb); + + rcu_read_lock(); + { + int protocol = ip_hdr(skb)->protocol; + int hash, raw; + const struct net_protocol *ipprot; + + resubmit: + raw = raw_local_deliver(skb, protocol); + + hash = protocol & (MAX_INET_PROTOS - 1); + ipprot = rcu_dereference(inet_protos[hash]); + if (ipprot != NULL) { + int ret; + + if (!net_eq(net, &init_net) && !ipprot->netns_ok) { + if (net_ratelimit()) + printk("%s: proto %d isn't netns-ready\n", + __func__, protocol); + kfree_skb(skb); + goto out; + } + + if (!ipprot->no_policy) { + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + goto out; + } + nf_reset(skb); + } + ret = ipprot->handler(skb); + if (ret < 0) { + protocol = -ret; + goto resubmit; + } + IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); + } else { + if (!raw) { + if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS); + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_PROT_UNREACH, 0); + } + } else + IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); + kfree_skb(skb); + } + } + out: + rcu_read_unlock(); + + return 0; +} + +/* + * Deliver IP Packets to the higher protocol layers. + */ +int ip_local_deliver(struct sk_buff *skb) +{ + /* + * Reassemble IP fragments. + */ + + if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { + if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER)) + return 0; + } + + return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL, + ip_local_deliver_finish); +} + +static inline int ip_rcv_options(struct sk_buff *skb) +{ + struct ip_options *opt; + const struct iphdr *iph; + struct net_device *dev = skb->dev; + + /* It looks as overkill, because not all + IP options require packet mangling. + But it is the easiest for now, especially taking + into account that combination of IP options + and running sniffer is extremely rare condition. + --ANK (980813) + */ + if (skb_cow(skb, skb_headroom(skb))) { + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); + goto drop; + } + + iph = ip_hdr(skb); + opt = &(IPCB(skb)->opt); + opt->optlen = iph->ihl*4 - sizeof(struct iphdr); + + if (ip_options_compile(dev_net(dev), opt, skb)) { + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); + goto drop; + } + + if (unlikely(opt->srr)) { + struct in_device *in_dev = __in_dev_get_rcu(dev); + + if (in_dev) { + if (!IN_DEV_SOURCE_ROUTE(in_dev)) { + if (IN_DEV_LOG_MARTIANS(in_dev) && + net_ratelimit()) + printk(KERN_INFO "source route option %pI4 -> %pI4\n", + &iph->saddr, &iph->daddr); + goto drop; + } + } + + if (ip_options_rcv_srr(skb)) + goto drop; + } + + return 0; +drop: + return -1; +} + +static int ip_rcv_finish(struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + struct rtable *rt; + + /* + * Initialise the virtual path cache for the packet. It describes + * how the packet travels inside Linux networking. 
+ */ + if (skb_dst(skb) == NULL) { + int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev); + if (unlikely(err)) { + if (err == -EHOSTUNREACH) + IP_INC_STATS_BH(dev_net(skb->dev), + IPSTATS_MIB_INADDRERRORS); + else if (err == -ENETUNREACH) + IP_INC_STATS_BH(dev_net(skb->dev), + IPSTATS_MIB_INNOROUTES); + else if (err == -EXDEV) + NET_INC_STATS_BH(dev_net(skb->dev), + LINUX_MIB_IPRPFILTER); + goto drop; + } + } + +#ifdef CONFIG_IP_ROUTE_CLASSID + if (unlikely(skb_dst(skb)->tclassid)) { + struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); + u32 idx = skb_dst(skb)->tclassid; + st[idx&0xFF].o_packets++; + st[idx&0xFF].o_bytes += skb->len; + st[(idx>>16)&0xFF].i_packets++; + st[(idx>>16)&0xFF].i_bytes += skb->len; + } +#endif + + if (iph->ihl > 5 && ip_rcv_options(skb)) + goto drop; + + rt = skb_rtable(skb); + if (rt->rt_type == RTN_MULTICAST) { + IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST, + skb->len); + } else if (rt->rt_type == RTN_BROADCAST) + IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST, + skb->len); + + return dst_input(skb); + +drop: + kfree_skb(skb); + return NET_RX_DROP; +} + +/* + * Main IP Receive routine. + */ +int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +{ + const struct iphdr *iph; + u32 len; + + /* When the interface is in promisc. mode, drop all the crap + * that it receives, do not try to analyse it. + */ + if (skb->pkt_type == PACKET_OTHERHOST) + goto drop; + + + IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len); + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) { + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); + goto out; + } + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto inhdr_error; + + iph = ip_hdr(skb); + + /* + * RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum. + * + * Is the datagram acceptable? + * + * 1. Length at least the size of an ip header + * 2. Version of 4 + * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums] + * 4. Doesn't have a bogus length + */ + + if (iph->ihl < 5 || iph->version != 4) + goto inhdr_error; + + if (!pskb_may_pull(skb, iph->ihl*4)) + goto inhdr_error; + + iph = ip_hdr(skb); + + if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) + goto inhdr_error; + + len = ntohs(iph->tot_len); + if (skb->len < len) { + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS); + goto drop; + } else if (len < (iph->ihl*4)) + goto inhdr_error; + + /* Our transport medium may have padded the buffer out. Now we know it + * is IP we can trim to the true length of the frame. + * Note this now means skb->len holds ntohs(iph->tot_len). + */ + if (pskb_trim_rcsum(skb, len)) { + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); + goto drop; + } + + /* Remove any debris in the socket control block */ + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + + /* Must drop socket now because of tproxy. */ + skb_orphan(skb); + + return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL, + ip_rcv_finish); + +inhdr_error: + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); +drop: + kfree_skb(skb); +out: + return NET_RX_DROP; +} diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c new file mode 100644 index 00000000..42dd1a90 --- /dev/null +++ b/net/ipv4/ip_options.c @@ -0,0 +1,649 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. 
INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The options processing module for ip.c + * + * Authors: A.N.Kuznetsov + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Write options to IP header, record destination address to + * source route option, address of outgoing interface + * (we should already know it, so that this function is allowed be + * called only after routing decision) and timestamp, + * if we originate this datagram. + * + * daddr is real destination address, next hop is recorded in IP header. + * saddr is address of outgoing interface. + */ + +void ip_options_build(struct sk_buff *skb, struct ip_options *opt, + __be32 daddr, struct rtable *rt, int is_frag) +{ + unsigned char *iph = skb_network_header(skb); + + memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options)); + memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen); + opt = &(IPCB(skb)->opt); + + if (opt->srr) + memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4); + + if (!is_frag) { + if (opt->rr_needaddr) + ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, skb, rt); + if (opt->ts_needaddr) + ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt); + if (opt->ts_needtime) { + struct timespec tv; + __be32 midtime; + getnstimeofday(&tv); + midtime = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC); + memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4); + } + return; + } + if (opt->rr) { + memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]); + opt->rr = 0; + opt->rr_needaddr = 0; + } + if (opt->ts) { + memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]); + opt->ts = 0; + opt->ts_needaddr = opt->ts_needtime = 0; + } +} + +/* + * Provided (sopt, skb) points to received options, + * build in dopt compiled option set appropriate for answering. + * i.e. invert SRR option, copy anothers, + * and grab room in RR/TS options. + * + * NOTE: dopt cannot point to skb. 
+ */ + +int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) +{ + const struct ip_options *sopt; + unsigned char *sptr, *dptr; + int soffset, doffset; + int optlen; + __be32 daddr; + + memset(dopt, 0, sizeof(struct ip_options)); + + sopt = &(IPCB(skb)->opt); + + if (sopt->optlen == 0) + return 0; + + sptr = skb_network_header(skb); + dptr = dopt->__data; + + daddr = skb_rtable(skb)->rt_spec_dst; + + if (sopt->rr) { + optlen = sptr[sopt->rr+1]; + soffset = sptr[sopt->rr+2]; + dopt->rr = dopt->optlen + sizeof(struct iphdr); + memcpy(dptr, sptr+sopt->rr, optlen); + if (sopt->rr_needaddr && soffset <= optlen) { + if (soffset + 3 > optlen) + return -EINVAL; + dptr[2] = soffset + 4; + dopt->rr_needaddr = 1; + } + dptr += optlen; + dopt->optlen += optlen; + } + if (sopt->ts) { + optlen = sptr[sopt->ts+1]; + soffset = sptr[sopt->ts+2]; + dopt->ts = dopt->optlen + sizeof(struct iphdr); + memcpy(dptr, sptr+sopt->ts, optlen); + if (soffset <= optlen) { + if (sopt->ts_needaddr) { + if (soffset + 3 > optlen) + return -EINVAL; + dopt->ts_needaddr = 1; + soffset += 4; + } + if (sopt->ts_needtime) { + if (soffset + 3 > optlen) + return -EINVAL; + if ((dptr[3]&0xF) != IPOPT_TS_PRESPEC) { + dopt->ts_needtime = 1; + soffset += 4; + } else { + dopt->ts_needtime = 0; + + if (soffset + 7 <= optlen) { + __be32 addr; + + memcpy(&addr, dptr+soffset-1, 4); + if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_UNICAST) { + dopt->ts_needtime = 1; + soffset += 8; + } + } + } + } + dptr[2] = soffset; + } + dptr += optlen; + dopt->optlen += optlen; + } + if (sopt->srr) { + unsigned char *start = sptr+sopt->srr; + __be32 faddr; + + optlen = start[1]; + soffset = start[2]; + doffset = 0; + if (soffset > optlen) + soffset = optlen + 1; + soffset -= 4; + if (soffset > 3) { + memcpy(&faddr, &start[soffset-1], 4); + for (soffset-=4, doffset=4; soffset > 3; soffset-=4, doffset+=4) + memcpy(&dptr[doffset-1], &start[soffset-1], 4); + /* + * RFC1812 requires to fix illegal source routes. + */ + if (memcmp(&ip_hdr(skb)->saddr, + &start[soffset + 3], 4) == 0) + doffset -= 4; + } + if (doffset > 3) { + memcpy(&start[doffset-1], &daddr, 4); + dopt->faddr = faddr; + dptr[0] = start[0]; + dptr[1] = doffset+3; + dptr[2] = 4; + dptr += doffset+3; + dopt->srr = dopt->optlen + sizeof(struct iphdr); + dopt->optlen += doffset+3; + dopt->is_strictroute = sopt->is_strictroute; + } + } + if (sopt->cipso) { + optlen = sptr[sopt->cipso+1]; + dopt->cipso = dopt->optlen+sizeof(struct iphdr); + memcpy(dptr, sptr+sopt->cipso, optlen); + dptr += optlen; + dopt->optlen += optlen; + } + while (dopt->optlen & 3) { + *dptr++ = IPOPT_END; + dopt->optlen++; + } + return 0; +} + +/* + * Options "fragmenting", just fill options not + * allowed in fragments with NOOPs. + * Simple and stupid 8), but the most efficient way. + */ + +void ip_options_fragment(struct sk_buff * skb) +{ + unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr); + struct ip_options * opt = &(IPCB(skb)->opt); + int l = opt->optlen; + int optlen; + + while (l > 0) { + switch (*optptr) { + case IPOPT_END: + return; + case IPOPT_NOOP: + l--; + optptr++; + continue; + } + optlen = optptr[1]; + if (optlen<2 || optlen>l) + return; + if (!IPOPT_COPIED(*optptr)) + memset(optptr, IPOPT_NOOP, optlen); + l -= optlen; + optptr += optlen; + } + opt->ts = 0; + opt->rr = 0; + opt->rr_needaddr = 0; + opt->ts_needaddr = 0; + opt->ts_needtime = 0; +} + +/* + * Verify options and fill pointers in struct options. + * Caller should clear *opt, and set opt->data. 
+ * If opt == NULL, then skb->data should point to IP header. + */ + +int ip_options_compile(struct net *net, + struct ip_options * opt, struct sk_buff * skb) +{ + int l; + unsigned char * iph; + unsigned char * optptr; + int optlen; + unsigned char * pp_ptr = NULL; + struct rtable *rt = NULL; + + if (skb != NULL) { + rt = skb_rtable(skb); + optptr = (unsigned char *)&(ip_hdr(skb)[1]); + } else + optptr = opt->__data; + iph = optptr - sizeof(struct iphdr); + + for (l = opt->optlen; l > 0; ) { + switch (*optptr) { + case IPOPT_END: + for (optptr++, l--; l>0; optptr++, l--) { + if (*optptr != IPOPT_END) { + *optptr = IPOPT_END; + opt->is_changed = 1; + } + } + goto eol; + case IPOPT_NOOP: + l--; + optptr++; + continue; + } + optlen = optptr[1]; + if (optlen<2 || optlen>l) { + pp_ptr = optptr; + goto error; + } + switch (*optptr) { + case IPOPT_SSRR: + case IPOPT_LSRR: + if (optlen < 3) { + pp_ptr = optptr + 1; + goto error; + } + if (optptr[2] < 4) { + pp_ptr = optptr + 2; + goto error; + } + /* NB: cf RFC-1812 5.2.4.1 */ + if (opt->srr) { + pp_ptr = optptr; + goto error; + } + if (!skb) { + if (optptr[2] != 4 || optlen < 7 || ((optlen-3) & 3)) { + pp_ptr = optptr + 1; + goto error; + } + memcpy(&opt->faddr, &optptr[3], 4); + if (optlen > 7) + memmove(&optptr[3], &optptr[7], optlen-7); + } + opt->is_strictroute = (optptr[0] == IPOPT_SSRR); + opt->srr = optptr - iph; + break; + case IPOPT_RR: + if (opt->rr) { + pp_ptr = optptr; + goto error; + } + if (optlen < 3) { + pp_ptr = optptr + 1; + goto error; + } + if (optptr[2] < 4) { + pp_ptr = optptr + 2; + goto error; + } + if (optptr[2] <= optlen) { + if (optptr[2]+3 > optlen) { + pp_ptr = optptr + 2; + goto error; + } + if (rt) { + memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); + opt->is_changed = 1; + } + optptr[2] += 4; + opt->rr_needaddr = 1; + } + opt->rr = optptr - iph; + break; + case IPOPT_TIMESTAMP: + if (opt->ts) { + pp_ptr = optptr; + goto error; + } + if (optlen < 4) { + pp_ptr = optptr + 1; + goto error; + } + if (optptr[2] < 5) { + pp_ptr = optptr + 2; + goto error; + } + if (optptr[2] <= optlen) { + unsigned char *timeptr = NULL; + if (optptr[2]+3 > optptr[1]) { + pp_ptr = optptr + 2; + goto error; + } + switch (optptr[3]&0xF) { + case IPOPT_TS_TSONLY: + opt->ts = optptr - iph; + if (skb) + timeptr = &optptr[optptr[2]-1]; + opt->ts_needtime = 1; + optptr[2] += 4; + break; + case IPOPT_TS_TSANDADDR: + if (optptr[2]+7 > optptr[1]) { + pp_ptr = optptr + 2; + goto error; + } + opt->ts = optptr - iph; + if (rt) { + memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); + timeptr = &optptr[optptr[2]+3]; + } + opt->ts_needaddr = 1; + opt->ts_needtime = 1; + optptr[2] += 8; + break; + case IPOPT_TS_PRESPEC: + if (optptr[2]+7 > optptr[1]) { + pp_ptr = optptr + 2; + goto error; + } + opt->ts = optptr - iph; + { + __be32 addr; + memcpy(&addr, &optptr[optptr[2]-1], 4); + if (inet_addr_type(net, addr) == RTN_UNICAST) + break; + if (skb) + timeptr = &optptr[optptr[2]+3]; + } + opt->ts_needtime = 1; + optptr[2] += 8; + break; + default: + if (!skb && !capable(CAP_NET_RAW)) { + pp_ptr = optptr + 3; + goto error; + } + break; + } + if (timeptr) { + struct timespec tv; + u32 midtime; + getnstimeofday(&tv); + midtime = (tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC; + put_unaligned_be32(midtime, timeptr); + opt->is_changed = 1; + } + } else { + unsigned overflow = optptr[3]>>4; + if (overflow == 15) { + pp_ptr = optptr + 3; + goto error; + } + opt->ts = optptr - iph; + if (skb) { + optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4); + 
opt->is_changed = 1; + } + } + break; + case IPOPT_RA: + if (optlen < 4) { + pp_ptr = optptr + 1; + goto error; + } + if (optptr[2] == 0 && optptr[3] == 0) + opt->router_alert = optptr - iph; + break; + case IPOPT_CIPSO: + if ((!skb && !capable(CAP_NET_RAW)) || opt->cipso) { + pp_ptr = optptr; + goto error; + } + opt->cipso = optptr - iph; + if (cipso_v4_validate(skb, &optptr)) { + pp_ptr = optptr; + goto error; + } + break; + case IPOPT_SEC: + case IPOPT_SID: + default: + if (!skb && !capable(CAP_NET_RAW)) { + pp_ptr = optptr; + goto error; + } + break; + } + l -= optlen; + optptr += optlen; + } + +eol: + if (!pp_ptr) + return 0; + +error: + if (skb) { + icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24)); + } + return -EINVAL; +} +EXPORT_SYMBOL(ip_options_compile); + +/* + * Undo all the changes done by ip_options_compile(). + */ + +void ip_options_undo(struct ip_options * opt) +{ + if (opt->srr) { + unsigned char * optptr = opt->__data+opt->srr-sizeof(struct iphdr); + memmove(optptr+7, optptr+3, optptr[1]-7); + memcpy(optptr+3, &opt->faddr, 4); + } + if (opt->rr_needaddr) { + unsigned char * optptr = opt->__data+opt->rr-sizeof(struct iphdr); + optptr[2] -= 4; + memset(&optptr[optptr[2]-1], 0, 4); + } + if (opt->ts) { + unsigned char * optptr = opt->__data+opt->ts-sizeof(struct iphdr); + if (opt->ts_needtime) { + optptr[2] -= 4; + memset(&optptr[optptr[2]-1], 0, 4); + if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC) + optptr[2] -= 4; + } + if (opt->ts_needaddr) { + optptr[2] -= 4; + memset(&optptr[optptr[2]-1], 0, 4); + } + } +} + +static struct ip_options_rcu *ip_options_get_alloc(const int optlen) +{ + return kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3), + GFP_KERNEL); +} + +static int ip_options_get_finish(struct net *net, struct ip_options_rcu **optp, + struct ip_options_rcu *opt, int optlen) +{ + while (optlen & 3) + opt->opt.__data[optlen++] = IPOPT_END; + opt->opt.optlen = optlen; + if (optlen && ip_options_compile(net, &opt->opt, NULL)) { + kfree(opt); + return -EINVAL; + } + kfree(*optp); + *optp = opt; + return 0; +} + +int ip_options_get_from_user(struct net *net, struct ip_options_rcu **optp, + unsigned char __user *data, int optlen) +{ + struct ip_options_rcu *opt = ip_options_get_alloc(optlen); + + if (!opt) + return -ENOMEM; + if (optlen && copy_from_user(opt->opt.__data, data, optlen)) { + kfree(opt); + return -EFAULT; + } + return ip_options_get_finish(net, optp, opt, optlen); +} + +int ip_options_get(struct net *net, struct ip_options_rcu **optp, + unsigned char *data, int optlen) +{ + struct ip_options_rcu *opt = ip_options_get_alloc(optlen); + + if (!opt) + return -ENOMEM; + if (optlen) + memcpy(opt->opt.__data, data, optlen); + return ip_options_get_finish(net, optp, opt, optlen); +} + +void ip_forward_options(struct sk_buff *skb) +{ + struct ip_options * opt = &(IPCB(skb)->opt); + unsigned char * optptr; + struct rtable *rt = skb_rtable(skb); + unsigned char *raw = skb_network_header(skb); + + if (opt->rr_needaddr) { + optptr = (unsigned char *)raw + opt->rr; + ip_rt_get_source(&optptr[optptr[2]-5], skb, rt); + opt->is_changed = 1; + } + if (opt->srr_is_hit) { + int srrptr, srrspace; + + optptr = raw + opt->srr; + + for ( srrptr=optptr[2], srrspace = optptr[1]; + srrptr <= srrspace; + srrptr += 4 + ) { + if (srrptr + 3 > srrspace) + break; + if (memcmp(&opt->nexthop, &optptr[srrptr-1], 4) == 0) + break; + } + if (srrptr + 3 <= srrspace) { + opt->is_changed = 1; + ip_hdr(skb)->daddr = opt->nexthop; + ip_rt_get_source(&optptr[srrptr-1], skb, rt); + 
optptr[2] = srrptr+4; + } else if (net_ratelimit()) + printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); + if (opt->ts_needaddr) { + optptr = raw + opt->ts; + ip_rt_get_source(&optptr[optptr[2]-9], skb, rt); + opt->is_changed = 1; + } + } + if (opt->is_changed) { + opt->is_changed = 0; + ip_send_check(ip_hdr(skb)); + } +} + +int ip_options_rcv_srr(struct sk_buff *skb) +{ + struct ip_options *opt = &(IPCB(skb)->opt); + int srrspace, srrptr; + __be32 nexthop; + struct iphdr *iph = ip_hdr(skb); + unsigned char *optptr = skb_network_header(skb) + opt->srr; + struct rtable *rt = skb_rtable(skb); + struct rtable *rt2; + unsigned long orefdst; + int err; + + if (!rt) + return 0; + + if (skb->pkt_type != PACKET_HOST) + return -EINVAL; + if (rt->rt_type == RTN_UNICAST) { + if (!opt->is_strictroute) + return 0; + icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24)); + return -EINVAL; + } + if (rt->rt_type != RTN_LOCAL) + return -EINVAL; + + for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) { + if (srrptr + 3 > srrspace) { + icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24)); + return -EINVAL; + } + memcpy(&nexthop, &optptr[srrptr-1], 4); + + orefdst = skb->_skb_refdst; + skb_dst_set(skb, NULL); + err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev); + rt2 = skb_rtable(skb); + if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { + skb_dst_drop(skb); + skb->_skb_refdst = orefdst; + return -EINVAL; + } + refdst_drop(orefdst); + if (rt2->rt_type != RTN_LOCAL) + break; + /* Superfast 8) loopback forward */ + iph->daddr = nexthop; + opt->is_changed = 1; + } + if (srrptr <= srrspace) { + opt->srr_is_hit = 1; + opt->nexthop = nexthop; + opt->is_changed = 1; + } + return 0; +} +EXPORT_SYMBOL(ip_options_rcv_srr); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c new file mode 100644 index 00000000..51a3eec2 --- /dev/null +++ b/net/ipv4/ip_output.c @@ -0,0 +1,1543 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The Internet Protocol (IP) output module. + * + * Authors: Ross Biro + * Fred N. van Kempen, + * Donald Becker, + * Alan Cox, + * Richard Underwood + * Stefan Becker, + * Jorge Cwik, + * Arnt Gulbrandsen, + * Hirokazu Takahashi, + * + * See ip_input.c for original log + * + * Fixes: + * Alan Cox : Missing nonblock feature in ip_build_xmit. + * Mike Kilburn : htons() missing in ip_build_xmit. + * Bradford Johnson: Fix faulty handling of some frames when + * no route is found. + * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit + * (in case if packet not accepted by + * output firewall rules) + * Mike McLagan : Routing by source + * Alexey Kuznetsov: use new route cache + * Andi Kleen: Fix broken PMTU recovery and remove + * some redundant tests. + * Vitaly E. Lavrov : Transparent proxy revived after year coma. + * Andi Kleen : Replace ip_reply with ip_send_reply. + * Andi Kleen : Split fast and slow ip_build_xmit path + * for decreased register pressure on x86 + * and more readibility. + * Marc Boucher : When call_out_firewall returns FW_QUEUE, + * silently drop skb instead of failing with -EPERM. + * Detlev Wengorz : Copy protocol for fragments. + * Hirokazu Takahashi: HW checksumming for outgoing UDP + * datagrams. + * Hirokazu Takahashi: sendfile() on UDP works now. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; +EXPORT_SYMBOL(sysctl_ip_default_ttl); + +/* Generate a checksum for an outgoing IP datagram. */ +__inline__ void ip_send_check(struct iphdr *iph) +{ + iph->check = 0; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); +} +EXPORT_SYMBOL(ip_send_check); + +int __ip_local_out(struct sk_buff *skb) +{ + struct iphdr *iph = ip_hdr(skb); + + iph->tot_len = htons(skb->len); + ip_send_check(iph); + return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL, + skb_dst(skb)->dev, dst_output); +} + +int ip_local_out(struct sk_buff *skb) +{ + int err; + + err = __ip_local_out(skb); + if (likely(err == 1)) + err = dst_output(skb); + + return err; +} +EXPORT_SYMBOL_GPL(ip_local_out); + +/* dev_loopback_xmit for use with netfilter. */ +static int ip_dev_loopback_xmit(struct sk_buff *newskb) +{ + skb_reset_mac_header(newskb); + __skb_pull(newskb, skb_network_offset(newskb)); + newskb->pkt_type = PACKET_LOOPBACK; + newskb->ip_summed = CHECKSUM_UNNECESSARY; + WARN_ON(!skb_dst(newskb)); + netif_rx_ni(newskb); + return 0; +} + +static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) +{ + int ttl = inet->uc_ttl; + + if (ttl < 0) + ttl = ip4_dst_hoplimit(dst); + return ttl; +} + +/* + * Add an ip header to a skbuff and send it out. + * + */ +int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, + __be32 saddr, __be32 daddr, struct ip_options_rcu *opt) +{ + struct inet_sock *inet = inet_sk(sk); + struct rtable *rt = skb_rtable(skb); + struct iphdr *iph; + + /* Build the IP header. */ + skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + iph->version = 4; + iph->ihl = 5; + iph->tos = inet->tos; + if (ip_dont_fragment(sk, &rt->dst)) + iph->frag_off = htons(IP_DF); + else + iph->frag_off = 0; + iph->ttl = ip_select_ttl(inet, &rt->dst); + iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); + iph->saddr = saddr; + iph->protocol = sk->sk_protocol; + ip_select_ident(iph, &rt->dst, sk); + + if (opt && opt->opt.optlen) { + iph->ihl += opt->opt.optlen>>2; + ip_options_build(skb, &opt->opt, daddr, rt, 0); + } + + skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; + + /* Send it out. */ + return ip_local_out(skb); +} +EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); + +static inline int ip_finish_output2(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct rtable *rt = (struct rtable *)dst; + struct net_device *dev = dst->dev; + unsigned int hh_len = LL_RESERVED_SPACE(dev); + struct neighbour *neigh; + int res; + + if (rt->rt_type == RTN_MULTICAST) { + IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); + } else if (rt->rt_type == RTN_BROADCAST) + IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len); + + /* Be paranoid, rather than too clever. 
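+ *
+ *       If the skb was built without enough headroom for this device's
+ *       link-layer header (hh_len), reallocate it below with
+ *       skb_realloc_headroom() before handing it to the neighbour
+ *       output path, rather than risk writing in front of the
+ *       allocation.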
*/ + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { + struct sk_buff *skb2; + + skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); + if (skb2 == NULL) { + kfree_skb(skb); + return -ENOMEM; + } + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); + kfree_skb(skb); + skb = skb2; + } + + rcu_read_lock(); + if (dst->hh) { + int res = neigh_hh_output(dst->hh, skb); + + rcu_read_unlock(); + return res; + } else { + neigh = dst_get_neighbour(dst); + if (neigh) { + res = neigh->output(skb); + + rcu_read_unlock(); + return res; + } + rcu_read_unlock(); + } + + if (net_ratelimit()) + printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n"); + kfree_skb(skb); + return -EINVAL; +} + +static inline int ip_skb_dst_mtu(struct sk_buff *skb) +{ + struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL; + + return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ? + skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); +} + +static int ip_finish_output(struct sk_buff *skb) +{ +#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) + /* Policy lookup after SNAT yielded a new policy */ + if (skb_dst(skb)->xfrm != NULL) { + IPCB(skb)->flags |= IPSKB_REROUTED; + return dst_output(skb); + } +#endif + if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb)) + return ip_fragment(skb, ip_finish_output2); + else + return ip_finish_output2(skb); +} + +int ip_mc_output(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct rtable *rt = skb_rtable(skb); + struct net_device *dev = rt->dst.dev; + + /* + * If the indicated interface is up and running, send the packet. + */ + IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); + + skb->dev = dev; + skb->protocol = htons(ETH_P_IP); + + /* + * Multicasts are looped back for other local users + */ + + if (rt->rt_flags&RTCF_MULTICAST) { + if (sk_mc_loop(sk) +#ifdef CONFIG_IP_MROUTE + /* Small optimization: do not loopback not local frames, + which returned after forwarding; they will be dropped + by ip_mr_input in any case. + Note, that local frames are looped back to be delivered + to local recipients. + + This check is duplicated in ip_mr_input at the moment. 
+ */ + && + ((rt->rt_flags & RTCF_LOCAL) || + !(IPCB(skb)->flags & IPSKB_FORWARDED)) +#endif + ) { + struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); + if (newskb) + NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, + newskb, NULL, newskb->dev, + ip_dev_loopback_xmit); + } + + /* Multicasts with ttl 0 must not go beyond the host */ + + if (ip_hdr(skb)->ttl == 0) { + kfree_skb(skb); + return 0; + } + } + + if (rt->rt_flags&RTCF_BROADCAST) { + struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); + if (newskb) + NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, + NULL, newskb->dev, ip_dev_loopback_xmit); + } + + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, + skb->dev, ip_finish_output, + !(IPCB(skb)->flags & IPSKB_REROUTED)); +} + +int ip_output(struct sk_buff *skb) +{ + struct net_device *dev = skb_dst(skb)->dev; + + IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); + + skb->dev = dev; + skb->protocol = htons(ETH_P_IP); + + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev, + ip_finish_output, + !(IPCB(skb)->flags & IPSKB_REROUTED)); +} + +int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) +{ + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(sk); + struct ip_options_rcu *inet_opt; + struct flowi4 *fl4; + struct rtable *rt; + struct iphdr *iph; + int res; + + /* Skip all of this if the packet is already routed, + * f.e. by something like SCTP. + */ + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + fl4 = &fl->u.ip4; + rt = skb_rtable(skb); + if (rt != NULL) + goto packet_routed; + + /* Make sure we can route this packet. */ + rt = (struct rtable *)__sk_dst_check(sk, 0); + if (rt == NULL) { + __be32 daddr; + + /* Use correct destination address if we have options. */ + daddr = inet->inet_daddr; + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; + + /* If this fails, retransmit mechanism of transport layer will + * keep trying until route appears or the connection times + * itself out. + */ + rt = ip_route_output_ports(sock_net(sk), fl4, sk, + daddr, inet->inet_saddr, + inet->inet_dport, + inet->inet_sport, + sk->sk_protocol, + RT_CONN_FLAGS(sk), + sk->sk_bound_dev_if); + if (IS_ERR(rt)) + goto no_route; + sk_setup_caps(sk, &rt->dst); + } + skb_dst_set_noref(skb, &rt->dst); + +packet_routed: + if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) + goto no_route; + + /* OK, we know where to send it, allocate and build IP header. */ + skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); + if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df) + iph->frag_off = htons(IP_DF); + else + iph->frag_off = 0; + iph->ttl = ip_select_ttl(inet, &rt->dst); + iph->protocol = sk->sk_protocol; + iph->saddr = fl4->saddr; + iph->daddr = fl4->daddr; + /* Transport layer set skb->h.foo itself. 
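+ *
+ *       (Note on the combined store above: htons((4 << 12) | (5 << 8) |
+ *       (inet->tos & 0xff)) fills the first two header bytes in one go,
+ *       i.e. version 4, ihl 5 (a 20-byte header, grown below if there
+ *       are options) and the TOS byte.)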
*/ + + if (inet_opt && inet_opt->opt.optlen) { + iph->ihl += inet_opt->opt.optlen >> 2; + ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); + } + + ip_select_ident_more(iph, &rt->dst, sk, + (skb_shinfo(skb)->gso_segs ?: 1) - 1); + + skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; + + res = ip_local_out(skb); + rcu_read_unlock(); + return res; + +no_route: + rcu_read_unlock(); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); + kfree_skb(skb); + return -EHOSTUNREACH; +} +EXPORT_SYMBOL(ip_queue_xmit); + + +static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) +{ + to->pkt_type = from->pkt_type; + to->priority = from->priority; + to->protocol = from->protocol; + skb_dst_drop(to); + skb_dst_copy(to, from); + to->dev = from->dev; + to->mark = from->mark; + + /* Copy the flags to each fragment. */ + IPCB(to)->flags = IPCB(from)->flags; + +#ifdef CONFIG_NET_SCHED + to->tc_index = from->tc_index; +#endif + nf_copy(to, from); +#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ + defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) + to->nf_trace = from->nf_trace; +#endif +#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) + to->ipvs_property = from->ipvs_property; +#endif + skb_copy_secmark(to, from); +} + +/* + * This IP datagram is too large to be sent in one piece. Break it up into + * smaller pieces (each of size equal to IP header plus + * a block of the data of the original IP data part) that will yet fit in a + * single device frame, and queue such a frame for sending. + */ + +int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) +{ + struct iphdr *iph; + int ptr; + struct net_device *dev; + struct sk_buff *skb2; + unsigned int mtu, hlen, left, len, ll_rs; + int offset; + __be16 not_last_frag; + struct rtable *rt = skb_rtable(skb); + int err = 0; + + dev = rt->dst.dev; + + /* + * Point into the IP datagram header. + */ + + iph = ip_hdr(skb); + + if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(ip_skb_dst_mtu(skb))); + kfree_skb(skb); + return -EMSGSIZE; + } + + /* + * Setup starting values. + */ + + hlen = iph->ihl * 4; + mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */ +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge) + mtu -= nf_bridge_mtu_reduction(skb); +#endif + IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; + + /* When frag_list is given, use it. First, check its validity: + * some transformers could create wrong frag_list or break existing + * one, it is not prohibited. In this case fall back to copying. + * + * LATER: this step can be merged to real generation of fragments, + * we can switch to copy when see the first bad fragment. + */ + if (skb_has_frag_list(skb)) { + struct sk_buff *frag, *frag2; + int first_len = skb_pagelen(skb); + + if (first_len - hlen > mtu || + ((first_len - hlen) & 7) || + (iph->frag_off & htons(IP_MF|IP_OFFSET)) || + skb_cloned(skb)) + goto slow_path; + + skb_walk_frags(skb, frag) { + /* Correct geometry. */ + if (frag->len > mtu || + ((frag->len & 7) && frag->next) || + skb_headroom(frag) < hlen) + goto slow_path_clean; + + /* Partially cloned skb? */ + if (skb_shared(frag)) + goto slow_path_clean; + + BUG_ON(frag->sk); + if (skb->sk) { + frag->sk = skb->sk; + frag->destructor = sock_wfree; + } + skb->truesize -= frag->truesize; + } + + /* Everything is OK. Generate! 
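+ *
+ *       From here each member of the frag_list becomes a real IP
+ *       fragment in place: the IP header is copied in front of it,
+ *       frag_off/tot_len are set, the header checksum is recomputed and
+ *       the fragment is handed to output() one by one; on error the
+ *       remaining fragments are freed.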
*/ + + err = 0; + offset = 0; + frag = skb_shinfo(skb)->frag_list; + skb_frag_list_init(skb); + skb->data_len = first_len - skb_headlen(skb); + skb->len = first_len; + iph->tot_len = htons(first_len); + iph->frag_off = htons(IP_MF); + ip_send_check(iph); + + for (;;) { + /* Prepare header of the next frame, + * before previous one went down. */ + if (frag) { + frag->ip_summed = CHECKSUM_NONE; + skb_reset_transport_header(frag); + __skb_push(frag, hlen); + skb_reset_network_header(frag); + memcpy(skb_network_header(frag), iph, hlen); + iph = ip_hdr(frag); + iph->tot_len = htons(frag->len); + ip_copy_metadata(frag, skb); + if (offset == 0) + ip_options_fragment(frag); + offset += skb->len - hlen; + iph->frag_off = htons(offset>>3); + if (frag->next != NULL) + iph->frag_off |= htons(IP_MF); + /* Ready, complete checksum */ + ip_send_check(iph); + } + + err = output(skb); + + if (!err) + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); + if (err || !frag) + break; + + skb = frag; + frag = skb->next; + skb->next = NULL; + } + + if (err == 0) { + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); + return 0; + } + + while (frag) { + skb = frag->next; + kfree_skb(frag); + frag = skb; + } + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + return err; + +slow_path_clean: + skb_walk_frags(skb, frag2) { + if (frag2 == frag) + break; + frag2->sk = NULL; + frag2->destructor = NULL; + skb->truesize += frag2->truesize; + } + } + +slow_path: + left = skb->len - hlen; /* Space per frame */ + ptr = hlen; /* Where to start from */ + + /* for bridged IP traffic encapsulated inside f.e. a vlan header, + * we need to make room for the encapsulating header + */ + ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb)); + + /* + * Fragment the datagram. + */ + + offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; + not_last_frag = iph->frag_off & htons(IP_MF); + + /* + * Keep copying data until we run out. + */ + + while (left > 0) { + len = left; + /* IF: it doesn't fit, use 'mtu' - the data space left */ + if (len > mtu) + len = mtu; + /* IF: we are not sending up to and including the packet end + then align the next start on an eight byte boundary */ + if (len < left) { + len &= ~7; + } + /* + * Allocate buffer. + */ + + if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { + NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n"); + err = -ENOMEM; + goto fail; + } + + /* + * Set up data on packet + */ + + ip_copy_metadata(skb2, skb); + skb_reserve(skb2, ll_rs); + skb_put(skb2, len + hlen); + skb_reset_network_header(skb2); + skb2->transport_header = skb2->network_header + hlen; + + /* + * Charge the memory for the fragment to any owner + * it might possess + */ + + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); + + /* + * Copy the packet header into the new buffer. + */ + + skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen); + + /* + * Copy a block of the IP datagram. + */ + if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len)) + BUG(); + left -= len; + + /* + * Fill in the new header fields. + */ + iph = ip_hdr(skb2); + iph->frag_off = htons((offset >> 3)); + + /* ANK: dirty, but effective trick. Upgrade options only if + * the segment to be fragmented was THE FIRST (otherwise, + * options are already fixed) and make it ONCE + * on the initial skb, so that all the following fragments + * will inherit fixed options. 
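+ *
+ *       (ip_options_fragment() overwrites the options that must not be
+ *       copied into later fragments, record route and timestamp among
+ *       them, with IPOPT_NOOP, so every subsequent fragment simply
+ *       inherits an already-sanitised header.)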
+ */ + if (offset == 0) + ip_options_fragment(skb); + + /* + * Added AC : If we are fragmenting a fragment that's not the + * last fragment then keep MF on each bit + */ + if (left > 0 || not_last_frag) + iph->frag_off |= htons(IP_MF); + ptr += len; + offset += len; + + /* + * Put this fragment into the sending queue. + */ + iph->tot_len = htons(len + hlen); + + ip_send_check(iph); + + err = output(skb2); + if (err) + goto fail; + + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); + } + kfree_skb(skb); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); + return err; + +fail: + kfree_skb(skb); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + return err; +} +EXPORT_SYMBOL(ip_fragment); + +int +ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) +{ + struct iovec *iov = from; + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + if (memcpy_fromiovecend(to, iov, offset, len) < 0) + return -EFAULT; + } else { + __wsum csum = 0; + if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0) + return -EFAULT; + skb->csum = csum_block_add(skb->csum, csum, odd); + } + return 0; +} +EXPORT_SYMBOL(ip_generic_getfrag); + +static inline __wsum +csum_page(struct page *page, int offset, int copy) +{ + char *kaddr; + __wsum csum; + kaddr = kmap(page); + csum = csum_partial(kaddr + offset, copy, 0); + kunmap(page); + return csum; +} + +static inline int ip_ufo_append_data(struct sock *sk, + struct sk_buff_head *queue, + int getfrag(void *from, char *to, int offset, int len, + int odd, struct sk_buff *skb), + void *from, int length, int hh_len, int fragheaderlen, + int transhdrlen, int maxfraglen, unsigned int flags) +{ + struct sk_buff *skb; + int err; + + /* There is support for UDP fragmentation offload by network + * device, so create one single skb packet containing complete + * udp datagram + */ + if ((skb = skb_peek_tail(queue)) == NULL) { + skb = sock_alloc_send_skb(sk, + hh_len + fragheaderlen + transhdrlen + 20, + (flags & MSG_DONTWAIT), &err); + + if (skb == NULL) + return err; + + /* reserve space for Hardware header */ + skb_reserve(skb, hh_len); + + /* create space for UDP/IP header */ + skb_put(skb, fragheaderlen + transhdrlen); + + /* initialize network header pointer */ + skb_reset_network_header(skb); + + /* initialize protocol header pointer */ + skb->transport_header = skb->network_header + fragheaderlen; + + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum = 0; + + /* specify the length of each IP datagram fragment */ + skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + __skb_queue_tail(queue, skb); + } + + return skb_append_datato_frags(sk, skb, getfrag, from, + (length - transhdrlen)); +} + +static int __ip_append_data(struct sock *sk, + struct flowi4 *fl4, + struct sk_buff_head *queue, + struct inet_cork *cork, + int getfrag(void *from, char *to, int offset, + int len, int odd, struct sk_buff *skb), + void *from, int length, int transhdrlen, + unsigned int flags) +{ + struct inet_sock *inet = inet_sk(sk); + struct sk_buff *skb; + + struct ip_options *opt = cork->opt; + int hh_len; + int exthdrlen; + int mtu; + int copy; + int err; + int offset = 0; + unsigned int maxfraglen, fragheaderlen; + int csummode = CHECKSUM_NONE; + struct rtable *rt = (struct rtable *)cork->dst; + + skb = skb_peek_tail(queue); + + exthdrlen = !skb ? rt->dst.header_len : 0; + mtu = cork->fragsize; + + hh_len = LL_RESERVED_SPACE(rt->dst.dev); + + fragheaderlen = sizeof(struct iphdr) + (opt ? 
opt->optlen : 0); + maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; + + if (cork->length + length > 0xFFFF - fragheaderlen) { + ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, + mtu-exthdrlen); + return -EMSGSIZE; + } + + /* + * transhdrlen > 0 means that this is the first fragment and we wish + * it won't be fragmented in the future. + */ + if (transhdrlen && + length + fragheaderlen <= mtu && + rt->dst.dev->features & NETIF_F_V4_CSUM && + !exthdrlen) + csummode = CHECKSUM_PARTIAL; + + cork->length += length; + if (((length > mtu) || (skb && skb_is_gso(skb))) && + (sk->sk_protocol == IPPROTO_UDP) && + (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) { + err = ip_ufo_append_data(sk, queue, getfrag, from, length, + hh_len, fragheaderlen, transhdrlen, + maxfraglen, flags); + if (err) + goto error; + return 0; + } + + /* So, what's going on in the loop below? + * + * We use calculated fragment length to generate chained skb, + * each of segments is IP fragment ready for sending to network after + * adding appropriate IP header. + */ + + if (!skb) + goto alloc_new_skb; + + while (length > 0) { + /* Check if the remaining data fits into current packet. */ + copy = mtu - skb->len; + if (copy < length) + copy = maxfraglen - skb->len; + if (copy <= 0) { + char *data; + unsigned int datalen; + unsigned int fraglen; + unsigned int fraggap; + unsigned int alloclen; + struct sk_buff *skb_prev; +alloc_new_skb: + skb_prev = skb; + if (skb_prev) + fraggap = skb_prev->len - maxfraglen; + else + fraggap = 0; + + /* + * If remaining data exceeds the mtu, + * we know we need more fragment(s). + */ + datalen = length + fraggap; + if (datalen > mtu - fragheaderlen) + datalen = maxfraglen - fragheaderlen; + fraglen = datalen + fragheaderlen; + + if ((flags & MSG_MORE) && + !(rt->dst.dev->features&NETIF_F_SG)) + alloclen = mtu; + else + alloclen = fraglen; + + alloclen += exthdrlen; + + /* The last fragment gets additional space at tail. + * Note, with MSG_MORE we overallocate on fragments, + * because we have no idea what fragment will be + * the last. + */ + if (datalen == length + fraggap) + alloclen += rt->dst.trailer_len; + + if (transhdrlen) { + skb = sock_alloc_send_skb(sk, + alloclen + hh_len + 15, + (flags & MSG_DONTWAIT), &err); + } else { + skb = NULL; + if (atomic_read(&sk->sk_wmem_alloc) <= + 2 * sk->sk_sndbuf) + skb = sock_wmalloc(sk, + alloclen + hh_len + 15, 1, + sk->sk_allocation); + if (unlikely(skb == NULL)) + err = -ENOBUFS; + else + /* only the initial fragment is + time stamped */ + cork->tx_flags = 0; + } + if (skb == NULL) + goto error; + + /* + * Fill in the control structures + */ + skb->ip_summed = csummode; + skb->csum = 0; + skb_reserve(skb, hh_len); + skb_shinfo(skb)->tx_flags = cork->tx_flags; + + /* + * Find where to start putting bytes. 
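+ *
+ *       Buffer layout at this point, roughly:
+ *       [hh_len reserve][exthdrlen][IP header room][transport header,
+ *       first skb only][payload]; 'data' below is advanced past the
+ *       header room before user data is copied in.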
+ */ + data = skb_put(skb, fraglen + exthdrlen); + skb_set_network_header(skb, exthdrlen); + skb->transport_header = (skb->network_header + + fragheaderlen); + data += fragheaderlen + exthdrlen; + + if (fraggap) { + skb->csum = skb_copy_and_csum_bits( + skb_prev, maxfraglen, + data + transhdrlen, fraggap, 0); + skb_prev->csum = csum_sub(skb_prev->csum, + skb->csum); + data += fraggap; + pskb_trim_unique(skb_prev, maxfraglen); + } + + copy = datalen - transhdrlen - fraggap; + if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { + err = -EFAULT; + kfree_skb(skb); + goto error; + } + + offset += copy; + length -= datalen - fraggap; + transhdrlen = 0; + exthdrlen = 0; + csummode = CHECKSUM_NONE; + + /* + * Put the packet on the pending queue. + */ + __skb_queue_tail(queue, skb); + continue; + } + + if (copy > length) + copy = length; + + if (!(rt->dst.dev->features&NETIF_F_SG)) { + unsigned int off; + + off = skb->len; + if (getfrag(from, skb_put(skb, copy), + offset, copy, off, skb) < 0) { + __skb_trim(skb, off); + err = -EFAULT; + goto error; + } + } else { + int i = skb_shinfo(skb)->nr_frags; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; + struct page *page = cork->page; + int off = cork->off; + unsigned int left; + + if (page && (left = PAGE_SIZE - off) > 0) { + if (copy >= left) + copy = left; + if (page != frag->page) { + if (i == MAX_SKB_FRAGS) { + err = -EMSGSIZE; + goto error; + } + get_page(page); + skb_fill_page_desc(skb, i, page, off, 0); + frag = &skb_shinfo(skb)->frags[i]; + } + } else if (i < MAX_SKB_FRAGS) { + if (copy > PAGE_SIZE) + copy = PAGE_SIZE; + page = alloc_pages(sk->sk_allocation, 0); + if (page == NULL) { + err = -ENOMEM; + goto error; + } + cork->page = page; + cork->off = 0; + + skb_fill_page_desc(skb, i, page, 0, 0); + frag = &skb_shinfo(skb)->frags[i]; + } else { + err = -EMSGSIZE; + goto error; + } + if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) { + err = -EFAULT; + goto error; + } + cork->off += copy; + frag->size += copy; + skb->len += copy; + skb->data_len += copy; + skb->truesize += copy; + atomic_add(copy, &sk->sk_wmem_alloc); + } + offset += copy; + length -= copy; + } + + return 0; + +error: + cork->length -= length; + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); + return err; +} + +static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, + struct ipcm_cookie *ipc, struct rtable **rtp) +{ + struct inet_sock *inet = inet_sk(sk); + struct ip_options_rcu *opt; + struct rtable *rt; + + /* + * setup for corking. + */ + opt = ipc->opt; + if (opt) { + if (cork->opt == NULL) { + cork->opt = kmalloc(sizeof(struct ip_options) + 40, + sk->sk_allocation); + if (unlikely(cork->opt == NULL)) + return -ENOBUFS; + } + memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen); + cork->flags |= IPCORK_OPT; + cork->addr = ipc->addr; + } + rt = *rtp; + if (unlikely(!rt)) + return -EFAULT; + /* + * We steal reference to this route, caller should not release it + */ + *rtp = NULL; + cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ? + rt->dst.dev->mtu : dst_mtu(&rt->dst); + cork->dst = &rt->dst; + cork->length = 0; + cork->tx_flags = ipc->tx_flags; + cork->page = NULL; + cork->off = 0; + + return 0; +} + +/* + * ip_append_data() and ip_append_page() can make one large IP datagram + * from many pieces of data. Each pieces will be holded on the socket + * until ip_push_pending_frames() is called. Each piece can be a page + * or non-page data. 
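+ *
+ *       Typical (UDP-style) call sequence, as a rough sketch only:
+ *
+ *               lock_sock(sk);
+ *               err = ip_append_data(sk, fl4, getfrag, data, len, 0,
+ *                                    &ipc, &rt, msg->msg_flags);
+ *               if (err)
+ *                       ip_flush_pending_frames(sk);
+ *               else if (!(msg->msg_flags & MSG_MORE))
+ *                       err = ip_push_pending_frames(sk, fl4);
+ *               release_sock(sk);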
+ * + * Not only UDP, other transport protocols - e.g. raw sockets - can use + * this interface potentially. + * + * LATER: length must be adjusted by pad at tail, when it is required. + */ +int ip_append_data(struct sock *sk, struct flowi4 *fl4, + int getfrag(void *from, char *to, int offset, int len, + int odd, struct sk_buff *skb), + void *from, int length, int transhdrlen, + struct ipcm_cookie *ipc, struct rtable **rtp, + unsigned int flags) +{ + struct inet_sock *inet = inet_sk(sk); + int err; + + if (flags&MSG_PROBE) + return 0; + + if (skb_queue_empty(&sk->sk_write_queue)) { + err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp); + if (err) + return err; + } else { + transhdrlen = 0; + } + + return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag, + from, length, transhdrlen, flags); +} + +ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, + int offset, size_t size, int flags) +{ + struct inet_sock *inet = inet_sk(sk); + struct sk_buff *skb; + struct rtable *rt; + struct ip_options *opt = NULL; + struct inet_cork *cork; + int hh_len; + int mtu; + int len; + int err; + unsigned int maxfraglen, fragheaderlen, fraggap; + + if (inet->hdrincl) + return -EPERM; + + if (flags&MSG_PROBE) + return 0; + + if (skb_queue_empty(&sk->sk_write_queue)) + return -EINVAL; + + cork = &inet->cork.base; + rt = (struct rtable *)cork->dst; + if (cork->flags & IPCORK_OPT) + opt = cork->opt; + + if (!(rt->dst.dev->features&NETIF_F_SG)) + return -EOPNOTSUPP; + + hh_len = LL_RESERVED_SPACE(rt->dst.dev); + mtu = cork->fragsize; + + fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); + maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; + + if (cork->length + size > 0xFFFF - fragheaderlen) { + ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu); + return -EMSGSIZE; + } + + if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) + return -EINVAL; + + cork->length += size; + if ((size + skb->len > mtu) && + (sk->sk_protocol == IPPROTO_UDP) && + (rt->dst.dev->features & NETIF_F_UFO)) { + skb_shinfo(skb)->gso_size = mtu - fragheaderlen; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + } + + + while (size > 0) { + int i; + + if (skb_is_gso(skb)) + len = size; + else { + + /* Check if the remaining data fits into current packet. */ + len = mtu - skb->len; + if (len < size) + len = maxfraglen - skb->len; + } + if (len <= 0) { + struct sk_buff *skb_prev; + int alloclen; + + skb_prev = skb; + fraggap = skb_prev->len - maxfraglen; + + alloclen = fragheaderlen + hh_len + fraggap + 15; + skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation); + if (unlikely(!skb)) { + err = -ENOBUFS; + goto error; + } + + /* + * Fill in the control structures + */ + skb->ip_summed = CHECKSUM_NONE; + skb->csum = 0; + skb_reserve(skb, hh_len); + + /* + * Find where to start putting bytes. + */ + skb_put(skb, fragheaderlen + fraggap); + skb_reset_network_header(skb); + skb->transport_header = (skb->network_header + + fragheaderlen); + if (fraggap) { + skb->csum = skb_copy_and_csum_bits(skb_prev, + maxfraglen, + skb_transport_header(skb), + fraggap, 0); + skb_prev->csum = csum_sub(skb_prev->csum, + skb->csum); + pskb_trim_unique(skb_prev, maxfraglen); + } + + /* + * Put the packet on the pending queue. 
+ */ + __skb_queue_tail(&sk->sk_write_queue, skb); + continue; + } + + i = skb_shinfo(skb)->nr_frags; + if (len > size) + len = size; + if (skb_can_coalesce(skb, i, page, offset)) { + skb_shinfo(skb)->frags[i-1].size += len; + } else if (i < MAX_SKB_FRAGS) { + get_page(page); + skb_fill_page_desc(skb, i, page, offset, len); + } else { + err = -EMSGSIZE; + goto error; + } + + if (skb->ip_summed == CHECKSUM_NONE) { + __wsum csum; + csum = csum_page(page, offset, len); + skb->csum = csum_block_add(skb->csum, csum, skb->len); + } + + skb->len += len; + skb->data_len += len; + skb->truesize += len; + atomic_add(len, &sk->sk_wmem_alloc); + offset += len; + size -= len; + } + return 0; + +error: + cork->length -= size; + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); + return err; +} + +static void ip_cork_release(struct inet_cork *cork) +{ + cork->flags &= ~IPCORK_OPT; + kfree(cork->opt); + cork->opt = NULL; + dst_release(cork->dst); + cork->dst = NULL; +} + +/* + * Combined all pending IP fragments on the socket as one IP datagram + * and push them out. + */ +struct sk_buff *__ip_make_skb(struct sock *sk, + struct flowi4 *fl4, + struct sk_buff_head *queue, + struct inet_cork *cork) +{ + struct sk_buff *skb, *tmp_skb; + struct sk_buff **tail_skb; + struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); + struct ip_options *opt = NULL; + struct rtable *rt = (struct rtable *)cork->dst; + struct iphdr *iph; + __be16 df = 0; + __u8 ttl; + + if ((skb = __skb_dequeue(queue)) == NULL) + goto out; + tail_skb = &(skb_shinfo(skb)->frag_list); + + /* move skb->data to ip header from ext header */ + if (skb->data < skb_network_header(skb)) + __skb_pull(skb, skb_network_offset(skb)); + while ((tmp_skb = __skb_dequeue(queue)) != NULL) { + __skb_pull(tmp_skb, skb_network_header_len(skb)); + *tail_skb = tmp_skb; + tail_skb = &(tmp_skb->next); + skb->len += tmp_skb->len; + skb->data_len += tmp_skb->len; + skb->truesize += tmp_skb->truesize; + tmp_skb->destructor = NULL; + tmp_skb->sk = NULL; + } + + /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow + * to fragment the frame generated here. No matter, what transforms + * how transforms change size of the packet, it will come out. + */ + if (inet->pmtudisc < IP_PMTUDISC_DO) + skb->local_df = 1; + + /* DF bit is set when we want to see DF on outgoing frames. + * If local_df is set too, we still allow to fragment this frame + * locally. 
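+ *
+ *       (So DF goes out when the socket asked for path MTU discovery,
+ *       or when the frame already fits the route MTU and
+ *       ip_dont_fragment() agrees; local_df, set above for
+ *       pmtudisc < IP_PMTUDISC_DO, only decides whether we ourselves
+ *       may still fragment, not what the wire bit says.)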
*/ + if (inet->pmtudisc >= IP_PMTUDISC_DO || + (skb->len <= dst_mtu(&rt->dst) && + ip_dont_fragment(sk, &rt->dst))) + df = htons(IP_DF); + + if (cork->flags & IPCORK_OPT) + opt = cork->opt; + + if (rt->rt_type == RTN_MULTICAST) + ttl = inet->mc_ttl; + else + ttl = ip_select_ttl(inet, &rt->dst); + + iph = (struct iphdr *)skb->data; + iph->version = 4; + iph->ihl = 5; + iph->tos = inet->tos; + iph->frag_off = df; + ip_select_ident(iph, &rt->dst, sk); + iph->ttl = ttl; + iph->protocol = sk->sk_protocol; + iph->saddr = fl4->saddr; + iph->daddr = fl4->daddr; + + if (opt) { + iph->ihl += opt->optlen>>2; + ip_options_build(skb, opt, cork->addr, rt, 0); + } + + skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; + /* + * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec + * on dst refcount + */ + cork->dst = NULL; + skb_dst_set(skb, &rt->dst); + + if (iph->protocol == IPPROTO_ICMP) + icmp_out_count(net, ((struct icmphdr *) + skb_transport_header(skb))->type); + + ip_cork_release(cork); +out: + return skb; +} + +int ip_send_skb(struct sk_buff *skb) +{ + struct net *net = sock_net(skb->sk); + int err; + + err = ip_local_out(skb); + if (err) { + if (err > 0) + err = net_xmit_errno(err); + if (err) + IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); + } + + return err; +} + +int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4) +{ + struct sk_buff *skb; + + skb = ip_finish_skb(sk, fl4); + if (!skb) + return 0; + + /* Netfilter gets whole the not fragmented skb. */ + return ip_send_skb(skb); +} + +/* + * Throw away all pending data on the socket. + */ +static void __ip_flush_pending_frames(struct sock *sk, + struct sk_buff_head *queue, + struct inet_cork *cork) +{ + struct sk_buff *skb; + + while ((skb = __skb_dequeue_tail(queue)) != NULL) + kfree_skb(skb); + + ip_cork_release(cork); +} + +void ip_flush_pending_frames(struct sock *sk) +{ + __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base); +} + +struct sk_buff *ip_make_skb(struct sock *sk, + struct flowi4 *fl4, + int getfrag(void *from, char *to, int offset, + int len, int odd, struct sk_buff *skb), + void *from, int length, int transhdrlen, + struct ipcm_cookie *ipc, struct rtable **rtp, + unsigned int flags) +{ + struct inet_cork cork; + struct sk_buff_head queue; + int err; + + if (flags & MSG_PROBE) + return NULL; + + __skb_queue_head_init(&queue); + + cork.flags = 0; + cork.addr = 0; + cork.opt = NULL; + err = ip_setup_cork(sk, &cork, ipc, rtp); + if (err) + return ERR_PTR(err); + + err = __ip_append_data(sk, fl4, &queue, &cork, getfrag, + from, length, transhdrlen, flags); + if (err) { + __ip_flush_pending_frames(sk, &queue, &cork); + return ERR_PTR(err); + } + + return __ip_make_skb(sk, fl4, &queue, &cork); +} + +/* + * Fetch data from kernel space and fill in checksum if needed. + */ +static int ip_reply_glue_bits(void *dptr, char *to, int offset, + int len, int odd, struct sk_buff *skb) +{ + __wsum csum; + + csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0); + skb->csum = csum_block_add(skb->csum, csum, odd); + return 0; +} + +/* + * Generic function to send a packet as reply to another packet. + * Used to send TCP resets so far. ICMP should use this function too. + * + * Should run single threaded per socket because it uses the sock + * structure to pass arguments. 
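+ *
+ *       (Concretely, tos, priority, protocol and bound_dev_if of the
+ *       caller's control socket are overwritten under bh_lock_sock()
+ *       below, before ip_append_data() runs, which is why two users of
+ *       the same sk must not execute this path concurrently.)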
+ */ +void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, + struct ip_reply_arg *arg, unsigned int len) +{ + struct inet_sock *inet = inet_sk(sk); + struct ip_options_data replyopts; + struct ipcm_cookie ipc; + struct flowi4 fl4; + struct rtable *rt = skb_rtable(skb); + + if (ip_options_echo(&replyopts.opt.opt, skb)) + return; + + ipc.addr = daddr; + ipc.opt = NULL; + ipc.tx_flags = 0; + + if (replyopts.opt.opt.optlen) { + ipc.opt = &replyopts.opt; + + if (replyopts.opt.opt.srr) + daddr = replyopts.opt.opt.faddr; + } + + flowi4_init_output(&fl4, arg->bound_dev_if, 0, + RT_TOS(ip_hdr(skb)->tos), + RT_SCOPE_UNIVERSE, sk->sk_protocol, + ip_reply_arg_flowi_flags(arg), + daddr, rt->rt_spec_dst, + tcp_hdr(skb)->source, tcp_hdr(skb)->dest); + security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + rt = ip_route_output_key(sock_net(sk), &fl4); + if (IS_ERR(rt)) + return; + + /* And let IP do all the hard work. + + This chunk is not reenterable, hence spinlock. + Note that it uses the fact, that this function is called + with locally disabled BH and that sk cannot be already spinlocked. + */ + bh_lock_sock(sk); + inet->tos = ip_hdr(skb)->tos; + sk->sk_priority = skb->priority; + sk->sk_protocol = ip_hdr(skb)->protocol; + sk->sk_bound_dev_if = arg->bound_dev_if; + ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, + &ipc, &rt, MSG_DONTWAIT); + if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { + if (arg->csumoffset >= 0) + *((__sum16 *)skb_transport_header(skb) + + arg->csumoffset) = csum_fold(csum_add(skb->csum, + arg->csum)); + skb->ip_summed = CHECKSUM_NONE; + ip_push_pending_frames(sk, &fl4); + } + + bh_unlock_sock(sk); + + ip_rt_put(rt); +} + +void __init ip_init(void) +{ + ip_rt_init(); + inet_initpeers(); + +#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS) + igmp_mc_proc_init(); +#endif +} diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c new file mode 100644 index 00000000..ab0c9efd --- /dev/null +++ b/net/ipv4/ip_sockglue.c @@ -0,0 +1,1352 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The IP to API glue. + * + * Authors: see ip.c + * + * Fixes: + * Many : Split from ip.c , see ip.c for history. + * Martin Mares : TOS setting fixed. + * Alan Cox : Fixed a couple of oopses in Martin's + * TOS tweaks. + * Mike McLagan : Routing by source + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#include +#endif + +#include +#include + +#define IP_CMSG_PKTINFO 1 +#define IP_CMSG_TTL 2 +#define IP_CMSG_TOS 4 +#define IP_CMSG_RECVOPTS 8 +#define IP_CMSG_RETOPTS 16 +#define IP_CMSG_PASSSEC 32 +#define IP_CMSG_ORIGDSTADDR 64 + +/* + * SOL_IP control messages. 
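+ *
+ *       inet->cmsg_flags is a bitmask of the IP_CMSG_* values above;
+ *       ip_cmsg_recv() below shifts through it bit by bit, handling the
+ *       supposedly most frequent flags first.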
+ */ + +static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) +{ + struct in_pktinfo info; + struct rtable *rt = skb_rtable(skb); + + info.ipi_addr.s_addr = ip_hdr(skb)->daddr; + if (rt) { + info.ipi_ifindex = rt->rt_iif; + info.ipi_spec_dst.s_addr = rt->rt_spec_dst; + } else { + info.ipi_ifindex = 0; + info.ipi_spec_dst.s_addr = 0; + } + + put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); +} + +static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb) +{ + int ttl = ip_hdr(skb)->ttl; + put_cmsg(msg, SOL_IP, IP_TTL, sizeof(int), &ttl); +} + +static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb) +{ + put_cmsg(msg, SOL_IP, IP_TOS, 1, &ip_hdr(skb)->tos); +} + +static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb) +{ + if (IPCB(skb)->opt.optlen == 0) + return; + + put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen, + ip_hdr(skb) + 1); +} + + +static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) +{ + unsigned char optbuf[sizeof(struct ip_options) + 40]; + struct ip_options * opt = (struct ip_options *)optbuf; + + if (IPCB(skb)->opt.optlen == 0) + return; + + if (ip_options_echo(opt, skb)) { + msg->msg_flags |= MSG_CTRUNC; + return; + } + ip_options_undo(opt); + + put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data); +} + +static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) +{ + char *secdata; + u32 seclen, secid; + int err; + + err = security_socket_getpeersec_dgram(NULL, skb, &secid); + if (err) + return; + + err = security_secid_to_secctx(secid, &secdata, &seclen); + if (err) + return; + + put_cmsg(msg, SOL_IP, SCM_SECURITY, seclen, secdata); + security_release_secctx(secdata, seclen); +} + +static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) +{ + struct sockaddr_in sin; + const struct iphdr *iph = ip_hdr(skb); + __be16 *ports = (__be16 *)skb_transport_header(skb); + + if (skb_transport_offset(skb) + 4 > skb->len) + return; + + /* All current transport protocols have the port numbers in the + * first four bytes of the transport header and this function is + * written with this assumption in mind. 
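+ *
+ *       (That is, ports[0] is the source port and ports[1], used below
+ *       for sin_port, is the destination port, as holds for UDP and TCP
+ *       at least.)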
+ */ + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = iph->daddr; + sin.sin_port = ports[1]; + memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); + + put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin); +} + +void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) +{ + struct inet_sock *inet = inet_sk(skb->sk); + unsigned flags = inet->cmsg_flags; + + /* Ordered by supposed usage frequency */ + if (flags & 1) + ip_cmsg_recv_pktinfo(msg, skb); + if ((flags >>= 1) == 0) + return; + + if (flags & 1) + ip_cmsg_recv_ttl(msg, skb); + if ((flags >>= 1) == 0) + return; + + if (flags & 1) + ip_cmsg_recv_tos(msg, skb); + if ((flags >>= 1) == 0) + return; + + if (flags & 1) + ip_cmsg_recv_opts(msg, skb); + if ((flags >>= 1) == 0) + return; + + if (flags & 1) + ip_cmsg_recv_retopts(msg, skb); + if ((flags >>= 1) == 0) + return; + + if (flags & 1) + ip_cmsg_recv_security(msg, skb); + + if ((flags >>= 1) == 0) + return; + if (flags & 1) + ip_cmsg_recv_dstaddr(msg, skb); + +} +EXPORT_SYMBOL(ip_cmsg_recv); + +int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) +{ + int err; + struct cmsghdr *cmsg; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + if (cmsg->cmsg_level != SOL_IP) + continue; + switch (cmsg->cmsg_type) { + case IP_RETOPTS: + err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); + err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg), + err < 40 ? err : 40); + if (err) + return err; + break; + case IP_PKTINFO: + { + struct in_pktinfo *info; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo))) + return -EINVAL; + info = (struct in_pktinfo *)CMSG_DATA(cmsg); + ipc->oif = info->ipi_ifindex; + ipc->addr = info->ipi_spec_dst.s_addr; + break; + } + default: + return -EINVAL; + } + } + return 0; +} + + +/* Special input handler for packets caught by router alert option. + They are selected only by protocol field, and then processed likely + local ones; but only if someone wants them! Otherwise, router + not running rsvpd will kill RSVP. + + It is user level problem, what it will make with them. + I have no idea, how it will masquearde or NAT them (it is joke, joke :-)), + but receiver should be enough clever f.e. to forward mtrace requests, + sent to multicast group to reach destination designated router. + */ +struct ip_ra_chain __rcu *ip_ra_chain; +static DEFINE_SPINLOCK(ip_ra_lock); + + +static void ip_ra_destroy_rcu(struct rcu_head *head) +{ + struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu); + + sock_put(ra->saved_sk); + kfree(ra); +} + +int ip_ra_control(struct sock *sk, unsigned char on, + void (*destructor)(struct sock *)) +{ + struct ip_ra_chain *ra, *new_ra; + struct ip_ra_chain __rcu **rap; + + if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) + return -EINVAL; + + new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; + + spin_lock_bh(&ip_ra_lock); + for (rap = &ip_ra_chain; + (ra = rcu_dereference_protected(*rap, + lockdep_is_held(&ip_ra_lock))) != NULL; + rap = &ra->next) { + if (ra->sk == sk) { + if (on) { + spin_unlock_bh(&ip_ra_lock); + kfree(new_ra); + return -EADDRINUSE; + } + /* dont let ip_call_ra_chain() use sk again */ + ra->sk = NULL; + rcu_assign_pointer(*rap, ra->next); + spin_unlock_bh(&ip_ra_lock); + + if (ra->destructor) + ra->destructor(sk); + /* + * Delay sock_put(sk) and kfree(ra) after one rcu grace + * period. This guarantee ip_call_ra_chain() dont need + * to mess with socket refcounts. 
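+ *
+ *       (Readers are expected to walk ip_ra_chain under rcu_read_lock()
+ *       only, so the entry and its socket reference have to stay valid
+ *       until the grace period ends; ip_ra_destroy_rcu() then drops
+ *       both.)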
+ */ + ra->saved_sk = sk; + call_rcu(&ra->rcu, ip_ra_destroy_rcu); + return 0; + } + } + if (new_ra == NULL) { + spin_unlock_bh(&ip_ra_lock); + return -ENOBUFS; + } + new_ra->sk = sk; + new_ra->destructor = destructor; + + new_ra->next = ra; + rcu_assign_pointer(*rap, new_ra); + sock_hold(sk); + spin_unlock_bh(&ip_ra_lock); + + return 0; +} + +void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload) +{ + struct sock_exterr_skb *serr; + + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) + return; + + serr = SKB_EXT_ERR(skb); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_ICMP; + serr->ee.ee_type = icmp_hdr(skb)->type; + serr->ee.ee_code = icmp_hdr(skb)->code; + serr->ee.ee_pad = 0; + serr->ee.ee_info = info; + serr->ee.ee_data = 0; + serr->addr_offset = (u8 *)&(((struct iphdr *)(icmp_hdr(skb) + 1))->daddr) - + skb_network_header(skb); + serr->port = port; + + if (skb_pull(skb, payload - skb->data) != NULL) { + skb_reset_transport_header(skb); + if (sock_queue_err_skb(sk, skb) == 0) + return; + } + kfree_skb(skb); +} + +void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 info) +{ + struct inet_sock *inet = inet_sk(sk); + struct sock_exterr_skb *serr; + struct iphdr *iph; + struct sk_buff *skb; + + if (!inet->recverr) + return; + + skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC); + if (!skb) + return; + + skb_put(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + iph->daddr = daddr; + + serr = SKB_EXT_ERR(skb); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; + serr->ee.ee_type = 0; + serr->ee.ee_code = 0; + serr->ee.ee_pad = 0; + serr->ee.ee_info = info; + serr->ee.ee_data = 0; + serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb); + serr->port = port; + + __skb_pull(skb, skb_tail_pointer(skb) - skb->data); + skb_reset_transport_header(skb); + + if (sock_queue_err_skb(sk, skb)) + kfree_skb(skb); +} + +/* + * Handle MSG_ERRQUEUE + */ +int ip_recv_error(struct sock *sk, struct msghdr *msg, int len) +{ + struct sock_exterr_skb *serr; + struct sk_buff *skb, *skb2; + struct sockaddr_in *sin; + struct { + struct sock_extended_err ee; + struct sockaddr_in offender; + } errhdr; + int err; + int copied; + + err = -EAGAIN; + skb = skb_dequeue(&sk->sk_error_queue); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free_skb; + + sock_recv_timestamp(msg, sk, skb); + + serr = SKB_EXT_ERR(skb); + + sin = (struct sockaddr_in *)msg->msg_name; + if (sin) { + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + + serr->addr_offset); + sin->sin_port = serr->port; + memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); + } + + memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); + sin = &errhdr.offender; + sin->sin_family = AF_UNSPEC; + if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) { + struct inet_sock *inet = inet_sk(sk); + + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = ip_hdr(skb)->saddr; + sin->sin_port = 0; + memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); + if (inet->cmsg_flags) + ip_cmsg_recv(msg, skb); + } + + put_cmsg(msg, SOL_IP, IP_RECVERR, sizeof(errhdr), &errhdr); + + /* Now we could try to dump offended packet options */ + + msg->msg_flags |= MSG_ERRQUEUE; + err = copied; + + /* Reset and regenerate socket error */ + 
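+ /* sk_err mirrors the head of the error queue: clear it and, if
+  * another error is already queued, re-arm it and notify the
+  * socket owner via sk->sk_error_report().
+  */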
spin_lock_bh(&sk->sk_error_queue.lock); + sk->sk_err = 0; + skb2 = skb_peek(&sk->sk_error_queue); + if (skb2 != NULL) { + sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; + spin_unlock_bh(&sk->sk_error_queue.lock); + sk->sk_error_report(sk); + } else + spin_unlock_bh(&sk->sk_error_queue.lock); + +out_free_skb: + kfree_skb(skb); +out: + return err; +} + + +static void opt_kfree_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct ip_options_rcu, rcu)); +} + +/* + * Socket option code for IP. This is the end of the line after any + * TCP,UDP etc options on an IP socket. + */ + +static int do_ip_setsockopt(struct sock *sk, int level, + int optname, char __user *optval, unsigned int optlen) +{ + struct inet_sock *inet = inet_sk(sk); + int val = 0, err; + + if (((1<= sizeof(int)) { + if (get_user(val, (int __user *) optval)) + return -EFAULT; + } else if (optlen >= sizeof(char)) { + unsigned char ucval; + + if (get_user(ucval, (unsigned char __user *) optval)) + return -EFAULT; + val = (int) ucval; + } + } + + /* If optlen==0, it is equivalent to val == 0 */ + + if (ip_mroute_opt(optname)) + return ip_mroute_setsockopt(sk, optname, optval, optlen); + + err = 0; + lock_sock(sk); + + switch (optname) { + case IP_OPTIONS: + { + struct ip_options_rcu *old, *opt = NULL; + + if (optlen > 40) + goto e_inval; + err = ip_options_get_from_user(sock_net(sk), &opt, + optval, optlen); + if (err) + break; + old = rcu_dereference_protected(inet->inet_opt, + sock_owned_by_user(sk)); + if (inet->is_icsk) { + struct inet_connection_sock *icsk = inet_csk(sk); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (sk->sk_family == PF_INET || + (!((1 << sk->sk_state) & + (TCPF_LISTEN | TCPF_CLOSE)) && + inet->inet_daddr != LOOPBACK4_IPV6)) { +#endif + if (old) + icsk->icsk_ext_hdr_len -= old->opt.optlen; + if (opt) + icsk->icsk_ext_hdr_len += opt->opt.optlen; + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + } +#endif + } + rcu_assign_pointer(inet->inet_opt, opt); + if (old) + call_rcu(&old->rcu, opt_kfree_rcu); + break; + } + case IP_PKTINFO: + if (val) + inet->cmsg_flags |= IP_CMSG_PKTINFO; + else + inet->cmsg_flags &= ~IP_CMSG_PKTINFO; + break; + case IP_RECVTTL: + if (val) + inet->cmsg_flags |= IP_CMSG_TTL; + else + inet->cmsg_flags &= ~IP_CMSG_TTL; + break; + case IP_RECVTOS: + if (val) + inet->cmsg_flags |= IP_CMSG_TOS; + else + inet->cmsg_flags &= ~IP_CMSG_TOS; + break; + case IP_RECVOPTS: + if (val) + inet->cmsg_flags |= IP_CMSG_RECVOPTS; + else + inet->cmsg_flags &= ~IP_CMSG_RECVOPTS; + break; + case IP_RETOPTS: + if (val) + inet->cmsg_flags |= IP_CMSG_RETOPTS; + else + inet->cmsg_flags &= ~IP_CMSG_RETOPTS; + break; + case IP_PASSSEC: + if (val) + inet->cmsg_flags |= IP_CMSG_PASSSEC; + else + inet->cmsg_flags &= ~IP_CMSG_PASSSEC; + break; + case IP_RECVORIGDSTADDR: + if (val) + inet->cmsg_flags |= IP_CMSG_ORIGDSTADDR; + else + inet->cmsg_flags &= ~IP_CMSG_ORIGDSTADDR; + break; + case IP_TOS: /* This sets both TOS and Precedence */ + if (sk->sk_type == SOCK_STREAM) { + val &= ~3; + val |= inet->tos & 3; + } + if (inet->tos != val) { + inet->tos = val; + sk->sk_priority = rt_tos2priority(val); + sk_dst_reset(sk); + } + break; + case IP_TTL: + if (optlen < 1) + goto e_inval; + if (val != -1 && (val < 0 || val > 255)) + goto e_inval; + inet->uc_ttl = val; + break; + case IP_HDRINCL: + if (sk->sk_type != SOCK_RAW) { + err = -ENOPROTOOPT; + break; + } + inet->hdrincl = val ? 
1 : 0; + break; + case IP_NODEFRAG: + if (sk->sk_type != SOCK_RAW) { + err = -ENOPROTOOPT; + break; + } + inet->nodefrag = val ? 1 : 0; + break; + case IP_MTU_DISCOVER: + if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE) + goto e_inval; + inet->pmtudisc = val; + break; + case IP_RECVERR: + inet->recverr = !!val; + if (!val) + skb_queue_purge(&sk->sk_error_queue); + break; + case IP_MULTICAST_TTL: + if (sk->sk_type == SOCK_STREAM) + goto e_inval; + if (optlen < 1) + goto e_inval; + if (val == -1) + val = 1; + if (val < 0 || val > 255) + goto e_inval; + inet->mc_ttl = val; + break; + case IP_MULTICAST_LOOP: + if (optlen < 1) + goto e_inval; + inet->mc_loop = !!val; + break; + case IP_MULTICAST_IF: + { + struct ip_mreqn mreq; + struct net_device *dev = NULL; + + if (sk->sk_type == SOCK_STREAM) + goto e_inval; + /* + * Check the arguments are allowable + */ + + if (optlen < sizeof(struct in_addr)) + goto e_inval; + + err = -EFAULT; + if (optlen >= sizeof(struct ip_mreqn)) { + if (copy_from_user(&mreq, optval, sizeof(mreq))) + break; + } else { + memset(&mreq, 0, sizeof(mreq)); + if (optlen >= sizeof(struct in_addr) && + copy_from_user(&mreq.imr_address, optval, + sizeof(struct in_addr))) + break; + } + + if (!mreq.imr_ifindex) { + if (mreq.imr_address.s_addr == htonl(INADDR_ANY)) { + inet->mc_index = 0; + inet->mc_addr = 0; + err = 0; + break; + } + dev = ip_dev_find(sock_net(sk), mreq.imr_address.s_addr); + if (dev) + mreq.imr_ifindex = dev->ifindex; + } else + dev = dev_get_by_index(sock_net(sk), mreq.imr_ifindex); + + + err = -EADDRNOTAVAIL; + if (!dev) + break; + dev_put(dev); + + err = -EINVAL; + if (sk->sk_bound_dev_if && + mreq.imr_ifindex != sk->sk_bound_dev_if) + break; + + inet->mc_index = mreq.imr_ifindex; + inet->mc_addr = mreq.imr_address.s_addr; + err = 0; + break; + } + + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + { + struct ip_mreqn mreq; + + err = -EPROTO; + if (inet_sk(sk)->is_icsk) + break; + + if (optlen < sizeof(struct ip_mreq)) + goto e_inval; + err = -EFAULT; + if (optlen >= sizeof(struct ip_mreqn)) { + if (copy_from_user(&mreq, optval, sizeof(mreq))) + break; + } else { + memset(&mreq, 0, sizeof(mreq)); + if (copy_from_user(&mreq, optval, sizeof(struct ip_mreq))) + break; + } + + if (optname == IP_ADD_MEMBERSHIP) + err = ip_mc_join_group(sk, &mreq); + else + err = ip_mc_leave_group(sk, &mreq); + break; + } + case IP_MSFILTER: + { + struct ip_msfilter *msf; + + if (optlen < IP_MSFILTER_SIZE(0)) + goto e_inval; + if (optlen > sysctl_optmem_max) { + err = -ENOBUFS; + break; + } + msf = kmalloc(optlen, GFP_KERNEL); + if (!msf) { + err = -ENOBUFS; + break; + } + err = -EFAULT; + if (copy_from_user(msf, optval, optlen)) { + kfree(msf); + break; + } + /* numsrc >= (1G-4) overflow in 32 bits */ + if (msf->imsf_numsrc >= 0x3ffffffcU || + msf->imsf_numsrc > sysctl_igmp_max_msf) { + kfree(msf); + err = -ENOBUFS; + break; + } + if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) { + kfree(msf); + err = -EINVAL; + break; + } + err = ip_mc_msfilter(sk, msf, 0); + kfree(msf); + break; + } + case IP_BLOCK_SOURCE: + case IP_UNBLOCK_SOURCE: + case IP_ADD_SOURCE_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + { + struct ip_mreq_source mreqs; + int omode, add; + + if (optlen != sizeof(struct ip_mreq_source)) + goto e_inval; + if (copy_from_user(&mreqs, optval, sizeof(mreqs))) { + err = -EFAULT; + break; + } + if (optname == IP_BLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 1; + } else if (optname == IP_UNBLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 0; + } else if (optname 
== IP_ADD_SOURCE_MEMBERSHIP) { + struct ip_mreqn mreq; + + mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr; + mreq.imr_address.s_addr = mreqs.imr_interface; + mreq.imr_ifindex = 0; + err = ip_mc_join_group(sk, &mreq); + if (err && err != -EADDRINUSE) + break; + omode = MCAST_INCLUDE; + add = 1; + } else /* IP_DROP_SOURCE_MEMBERSHIP */ { + omode = MCAST_INCLUDE; + add = 0; + } + err = ip_mc_source(add, omode, sk, &mreqs, 0); + break; + } + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + { + struct group_req greq; + struct sockaddr_in *psin; + struct ip_mreqn mreq; + + if (optlen < sizeof(struct group_req)) + goto e_inval; + err = -EFAULT; + if (copy_from_user(&greq, optval, sizeof(greq))) + break; + psin = (struct sockaddr_in *)&greq.gr_group; + if (psin->sin_family != AF_INET) + goto e_inval; + memset(&mreq, 0, sizeof(mreq)); + mreq.imr_multiaddr = psin->sin_addr; + mreq.imr_ifindex = greq.gr_interface; + + if (optname == MCAST_JOIN_GROUP) + err = ip_mc_join_group(sk, &mreq); + else + err = ip_mc_leave_group(sk, &mreq); + break; + } + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + { + struct group_source_req greqs; + struct ip_mreq_source mreqs; + struct sockaddr_in *psin; + int omode, add; + + if (optlen != sizeof(struct group_source_req)) + goto e_inval; + if (copy_from_user(&greqs, optval, sizeof(greqs))) { + err = -EFAULT; + break; + } + if (greqs.gsr_group.ss_family != AF_INET || + greqs.gsr_source.ss_family != AF_INET) { + err = -EADDRNOTAVAIL; + break; + } + psin = (struct sockaddr_in *)&greqs.gsr_group; + mreqs.imr_multiaddr = psin->sin_addr.s_addr; + psin = (struct sockaddr_in *)&greqs.gsr_source; + mreqs.imr_sourceaddr = psin->sin_addr.s_addr; + mreqs.imr_interface = 0; /* use index for mc_source */ + + if (optname == MCAST_BLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 1; + } else if (optname == MCAST_UNBLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 0; + } else if (optname == MCAST_JOIN_SOURCE_GROUP) { + struct ip_mreqn mreq; + + psin = (struct sockaddr_in *)&greqs.gsr_group; + mreq.imr_multiaddr = psin->sin_addr; + mreq.imr_address.s_addr = 0; + mreq.imr_ifindex = greqs.gsr_interface; + err = ip_mc_join_group(sk, &mreq); + if (err && err != -EADDRINUSE) + break; + greqs.gsr_interface = mreq.imr_ifindex; + omode = MCAST_INCLUDE; + add = 1; + } else /* MCAST_LEAVE_SOURCE_GROUP */ { + omode = MCAST_INCLUDE; + add = 0; + } + err = ip_mc_source(add, omode, sk, &mreqs, + greqs.gsr_interface); + break; + } + case MCAST_MSFILTER: + { + struct sockaddr_in *psin; + struct ip_msfilter *msf = NULL; + struct group_filter *gsf = NULL; + int msize, i, ifindex; + + if (optlen < GROUP_FILTER_SIZE(0)) + goto e_inval; + if (optlen > sysctl_optmem_max) { + err = -ENOBUFS; + break; + } + gsf = kmalloc(optlen, GFP_KERNEL); + if (!gsf) { + err = -ENOBUFS; + break; + } + err = -EFAULT; + if (copy_from_user(gsf, optval, optlen)) + goto mc_msf_out; + + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + if (gsf->gf_numsrc >= 0x1ffffff || + gsf->gf_numsrc > sysctl_igmp_max_msf) { + err = -ENOBUFS; + goto mc_msf_out; + } + if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) { + err = -EINVAL; + goto mc_msf_out; + } + msize = IP_MSFILTER_SIZE(gsf->gf_numsrc); + msf = kmalloc(msize, GFP_KERNEL); + if (!msf) { + err = -ENOBUFS; + goto mc_msf_out; + } + ifindex = gsf->gf_interface; + psin = (struct sockaddr_in *)&gsf->gf_group; + if (psin->sin_family != AF_INET) { + err = -EADDRNOTAVAIL; + goto mc_msf_out; + } + msf->imsf_multiaddr = 
psin->sin_addr.s_addr; + msf->imsf_interface = 0; + msf->imsf_fmode = gsf->gf_fmode; + msf->imsf_numsrc = gsf->gf_numsrc; + err = -EADDRNOTAVAIL; + for (i = 0; i < gsf->gf_numsrc; ++i) { + psin = (struct sockaddr_in *)&gsf->gf_slist[i]; + + if (psin->sin_family != AF_INET) + goto mc_msf_out; + msf->imsf_slist[i] = psin->sin_addr.s_addr; + } + kfree(gsf); + gsf = NULL; + + err = ip_mc_msfilter(sk, msf, ifindex); +mc_msf_out: + kfree(msf); + kfree(gsf); + break; + } + case IP_MULTICAST_ALL: + if (optlen < 1) + goto e_inval; + if (val != 0 && val != 1) + goto e_inval; + inet->mc_all = val; + break; + case IP_ROUTER_ALERT: + err = ip_ra_control(sk, val ? 1 : 0, NULL); + break; + + case IP_FREEBIND: + if (optlen < 1) + goto e_inval; + inet->freebind = !!val; + break; + + case IP_IPSEC_POLICY: + case IP_XFRM_POLICY: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + err = xfrm_user_policy(sk, optname, optval, optlen); + break; + + case IP_TRANSPARENT: + if (!capable(CAP_NET_ADMIN)) { + err = -EPERM; + break; + } + if (optlen < 1) + goto e_inval; + inet->transparent = !!val; + break; + + case IP_MINTTL: + if (optlen < 1) + goto e_inval; + if (val < 0 || val > 255) + goto e_inval; + inet->min_ttl = val; + break; + + default: + err = -ENOPROTOOPT; + break; + } + release_sock(sk); + return err; + +e_inval: + release_sock(sk); + return -EINVAL; +} + +/** + * ip_queue_rcv_skb - Queue an skb into sock receive queue + * @sk: socket + * @skb: buffer + * + * Queues an skb into socket receive queue. If IP_CMSG_PKTINFO option + * is not set, we drop skb dst entry now, while dst cache line is hot. + */ +int ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + if (!(inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO)) + skb_dst_drop(skb); + return sock_queue_rcv_skb(sk, skb); +} +EXPORT_SYMBOL(ip_queue_rcv_skb); + +int ip_setsockopt(struct sock *sk, int level, + int optname, char __user *optval, unsigned int optlen) +{ + int err; + + if (level != SOL_IP) + return -ENOPROTOOPT; + + err = do_ip_setsockopt(sk, level, optname, optval, optlen); +#ifdef CONFIG_NETFILTER + /* we need to exclude all possible ENOPROTOOPTs except default case */ + if (err == -ENOPROTOOPT && optname != IP_HDRINCL && + optname != IP_IPSEC_POLICY && + optname != IP_XFRM_POLICY && + !ip_mroute_opt(optname)) { + lock_sock(sk); + err = nf_setsockopt(sk, PF_INET, optname, optval, optlen); + release_sock(sk); + } +#endif + return err; +} +EXPORT_SYMBOL(ip_setsockopt); + +#ifdef CONFIG_COMPAT +int compat_ip_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + int err; + + if (level != SOL_IP) + return -ENOPROTOOPT; + + if (optname >= MCAST_JOIN_GROUP && optname <= MCAST_MSFILTER) + return compat_mc_setsockopt(sk, level, optname, optval, optlen, + ip_setsockopt); + + err = do_ip_setsockopt(sk, level, optname, optval, optlen); +#ifdef CONFIG_NETFILTER + /* we need to exclude all possible ENOPROTOOPTs except default case */ + if (err == -ENOPROTOOPT && optname != IP_HDRINCL && + optname != IP_IPSEC_POLICY && + optname != IP_XFRM_POLICY && + !ip_mroute_opt(optname)) { + lock_sock(sk); + err = compat_nf_setsockopt(sk, PF_INET, optname, + optval, optlen); + release_sock(sk); + } +#endif + return err; +} +EXPORT_SYMBOL(compat_ip_setsockopt); +#endif + +/* + * Get the options. Note for future reference. The GET of IP options gets + * the _received_ ones. The set sets the _sent_ ones. 
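For orientation, the do_ip_setsockopt() switch above is driven by ordinary setsockopt() calls. A sketch that exercises the IP_ADD_MEMBERSHIP and IP_MULTICAST_TTL cases, assuming a UDP socket; the group address, interface index and helper name join_group() are made-up for illustration:

	#define _GNU_SOURCE
	#include <string.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <arpa/inet.h>

	/* Join a multicast group on a given interface and lower the
	 * multicast TTL; both requests land in do_ip_setsockopt() above
	 * (IP_ADD_MEMBERSHIP -> ip_mc_join_group(), IP_MULTICAST_TTL). */
	static int join_group(int fd, int ifindex)
	{
		struct ip_mreqn mreq;
		int ttl = 8;

		memset(&mreq, 0, sizeof(mreq));
		mreq.imr_multiaddr.s_addr = inet_addr("239.1.1.1");
		mreq.imr_ifindex = ifindex;

		if (setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP,
			       &mreq, sizeof(mreq)) < 0)
			return -1;
		return setsockopt(fd, IPPROTO_IP, IP_MULTICAST_TTL,
				  &ttl, sizeof(ttl));
	}

Passing the full struct ip_mreqn (rather than the older struct ip_mreq) takes the optlen >= sizeof(struct ip_mreqn) branch above, so the interface can be named by index instead of by address.
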
+ */ + +static int do_ip_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct inet_sock *inet = inet_sk(sk); + int val; + int len; + + if (level != SOL_IP) + return -EOPNOTSUPP; + + if (ip_mroute_opt(optname)) + return ip_mroute_getsockopt(sk, optname, optval, optlen); + + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + + lock_sock(sk); + + switch (optname) { + case IP_OPTIONS: + { + unsigned char optbuf[sizeof(struct ip_options)+40]; + struct ip_options *opt = (struct ip_options *)optbuf; + struct ip_options_rcu *inet_opt; + + inet_opt = rcu_dereference_protected(inet->inet_opt, + sock_owned_by_user(sk)); + opt->optlen = 0; + if (inet_opt) + memcpy(optbuf, &inet_opt->opt, + sizeof(struct ip_options) + + inet_opt->opt.optlen); + release_sock(sk); + + if (opt->optlen == 0) + return put_user(0, optlen); + + ip_options_undo(opt); + + len = min_t(unsigned int, len, opt->optlen); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, opt->__data, len)) + return -EFAULT; + return 0; + } + case IP_PKTINFO: + val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0; + break; + case IP_RECVTTL: + val = (inet->cmsg_flags & IP_CMSG_TTL) != 0; + break; + case IP_RECVTOS: + val = (inet->cmsg_flags & IP_CMSG_TOS) != 0; + break; + case IP_RECVOPTS: + val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0; + break; + case IP_RETOPTS: + val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0; + break; + case IP_PASSSEC: + val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0; + break; + case IP_RECVORIGDSTADDR: + val = (inet->cmsg_flags & IP_CMSG_ORIGDSTADDR) != 0; + break; + case IP_TOS: + val = inet->tos; + break; + case IP_TTL: + val = (inet->uc_ttl == -1 ? + sysctl_ip_default_ttl : + inet->uc_ttl); + break; + case IP_HDRINCL: + val = inet->hdrincl; + break; + case IP_NODEFRAG: + val = inet->nodefrag; + break; + case IP_MTU_DISCOVER: + val = inet->pmtudisc; + break; + case IP_MTU: + { + struct dst_entry *dst; + val = 0; + dst = sk_dst_get(sk); + if (dst) { + val = dst_mtu(dst); + dst_release(dst); + } + if (!val) { + release_sock(sk); + return -ENOTCONN; + } + break; + } + case IP_RECVERR: + val = inet->recverr; + break; + case IP_MULTICAST_TTL: + val = inet->mc_ttl; + break; + case IP_MULTICAST_LOOP: + val = inet->mc_loop; + break; + case IP_MULTICAST_IF: + { + struct in_addr addr; + len = min_t(unsigned int, len, sizeof(struct in_addr)); + addr.s_addr = inet->mc_addr; + release_sock(sk); + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &addr, len)) + return -EFAULT; + return 0; + } + case IP_MSFILTER: + { + struct ip_msfilter msf; + int err; + + if (len < IP_MSFILTER_SIZE(0)) { + release_sock(sk); + return -EINVAL; + } + if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) { + release_sock(sk); + return -EFAULT; + } + err = ip_mc_msfget(sk, &msf, + (struct ip_msfilter __user *)optval, optlen); + release_sock(sk); + return err; + } + case MCAST_MSFILTER: + { + struct group_filter gsf; + int err; + + if (len < GROUP_FILTER_SIZE(0)) { + release_sock(sk); + return -EINVAL; + } + if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) { + release_sock(sk); + return -EFAULT; + } + err = ip_mc_gsfget(sk, &gsf, + (struct group_filter __user *)optval, + optlen); + release_sock(sk); + return err; + } + case IP_MULTICAST_ALL: + val = inet->mc_all; + break; + case IP_PKTOPTIONS: + { + struct msghdr msg; + + release_sock(sk); + + if (sk->sk_type != SOCK_STREAM) + return -ENOPROTOOPT; + + msg.msg_control = 
optval; + msg.msg_controllen = len; + msg.msg_flags = 0; + + if (inet->cmsg_flags & IP_CMSG_PKTINFO) { + struct in_pktinfo info; + + info.ipi_addr.s_addr = inet->inet_rcv_saddr; + info.ipi_spec_dst.s_addr = inet->inet_rcv_saddr; + info.ipi_ifindex = inet->mc_index; + put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); + } + if (inet->cmsg_flags & IP_CMSG_TTL) { + int hlim = inet->mc_ttl; + put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim); + } + len -= msg.msg_controllen; + return put_user(len, optlen); + } + case IP_FREEBIND: + val = inet->freebind; + break; + case IP_TRANSPARENT: + val = inet->transparent; + break; + case IP_MINTTL: + val = inet->min_ttl; + break; + default: + release_sock(sk); + return -ENOPROTOOPT; + } + release_sock(sk); + + if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) { + unsigned char ucval = (unsigned char)val; + len = 1; + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &ucval, 1)) + return -EFAULT; + } else { + len = min_t(unsigned int, sizeof(int), len); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + } + return 0; +} + +int ip_getsockopt(struct sock *sk, int level, + int optname, char __user *optval, int __user *optlen) +{ + int err; + + err = do_ip_getsockopt(sk, level, optname, optval, optlen); +#ifdef CONFIG_NETFILTER + /* we need to exclude all possible ENOPROTOOPTs except default case */ + if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && + !ip_mroute_opt(optname)) { + int len; + + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + err = nf_getsockopt(sk, PF_INET, optname, optval, + &len); + release_sock(sk); + if (err >= 0) + err = put_user(len, optlen); + return err; + } +#endif + return err; +} +EXPORT_SYMBOL(ip_getsockopt); + +#ifdef CONFIG_COMPAT +int compat_ip_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + int err; + + if (optname == MCAST_MSFILTER) + return compat_mc_getsockopt(sk, level, optname, optval, optlen, + ip_getsockopt); + + err = do_ip_getsockopt(sk, level, optname, optval, optlen); + +#ifdef CONFIG_NETFILTER + /* we need to exclude all possible ENOPROTOOPTs except default case */ + if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && + !ip_mroute_opt(optname)) { + int len; + + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + err = compat_nf_getsockopt(sk, PF_INET, optname, optval, &len); + release_sock(sk); + if (err >= 0) + err = put_user(len, optlen); + return err; + } +#endif + return err; +} +EXPORT_SYMBOL(compat_ip_getsockopt); +#endif diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c new file mode 100644 index 00000000..c857f6f4 --- /dev/null +++ b/net/ipv4/ipcomp.c @@ -0,0 +1,185 @@ +/* + * IP Payload Compression Protocol (IPComp) - RFC3173. + * + * Copyright (c) 2003 James Morris + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * Todo: + * - Tunable compression parameters. + * - Compression stats. + * - Adaptive compression. 
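Before the ipcomp.c body below, a note on the getsockopt path just shown: most options come back as an int, but do_ip_getsockopt() will hand back a single byte when the caller's buffer is smaller, and IP_MTU fails with -ENOTCONN until a route is cached for the socket. A sketch of the IP_MTU query on a connected socket; query_mtu() is an illustrative name:

	#include <stdio.h>
	#include <sys/socket.h>
	#include <netinet/in.h>

	/* Ask for the path MTU of a connected socket -- served by the
	 * IP_MTU case above, which reads dst_mtu() of the cached route. */
	static int query_mtu(int fd)
	{
		int mtu = 0;
		socklen_t len = sizeof(mtu);

		if (getsockopt(fd, IPPROTO_IP, IP_MTU, &mtu, &len) < 0)
			return -1;	/* e.g. ENOTCONN: no cached route yet */
		printf("path mtu: %d\n", mtu);
		return mtu;
	}
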
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void ipcomp4_err(struct sk_buff *skb, u32 info) +{ + struct net *net = dev_net(skb->dev); + __be32 spi; + const struct iphdr *iph = (const struct iphdr *)skb->data; + struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); + struct xfrm_state *x; + + if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || + icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return; + + spi = htonl(ntohs(ipch->cpi)); + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + spi, IPPROTO_COMP, AF_INET); + if (!x) + return; + NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI4\n", + spi, &iph->daddr); + xfrm_state_put(x); +} + +/* We always hold one tunnel user reference to indicate a tunnel */ +static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) +{ + struct net *net = xs_net(x); + struct xfrm_state *t; + + t = xfrm_state_alloc(net); + if (t == NULL) + goto out; + + t->id.proto = IPPROTO_IPIP; + t->id.spi = x->props.saddr.a4; + t->id.daddr.a4 = x->id.daddr.a4; + memcpy(&t->sel, &x->sel, sizeof(t->sel)); + t->props.family = AF_INET; + t->props.mode = x->props.mode; + t->props.saddr.a4 = x->props.saddr.a4; + t->props.flags = x->props.flags; + memcpy(&t->mark, &x->mark, sizeof(t->mark)); + + if (xfrm_init_state(t)) + goto error; + + atomic_set(&t->tunnel_users, 1); +out: + return t; + +error: + t->km.state = XFRM_STATE_DEAD; + xfrm_state_put(t); + t = NULL; + goto out; +} + +/* + * Must be protected by xfrm_cfg_mutex. State and tunnel user references are + * always incremented on success. + */ +static int ipcomp_tunnel_attach(struct xfrm_state *x) +{ + struct net *net = xs_net(x); + int err = 0; + struct xfrm_state *t; + u32 mark = x->mark.v & x->mark.m; + + t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr.a4, + x->props.saddr.a4, IPPROTO_IPIP, AF_INET); + if (!t) { + t = ipcomp_tunnel_create(x); + if (!t) { + err = -EINVAL; + goto out; + } + xfrm_state_insert(t); + xfrm_state_hold(t); + } + x->tunnel = t; + atomic_inc(&t->tunnel_users); +out: + return err; +} + +static int ipcomp4_init_state(struct xfrm_state *x) +{ + int err = -EINVAL; + + x->props.header_len = 0; + switch (x->props.mode) { + case XFRM_MODE_TRANSPORT: + break; + case XFRM_MODE_TUNNEL: + x->props.header_len += sizeof(struct iphdr); + break; + default: + goto out; + } + + err = ipcomp_init_state(x); + if (err) + goto out; + + if (x->props.mode == XFRM_MODE_TUNNEL) { + err = ipcomp_tunnel_attach(x); + if (err) + goto out; + } + + err = 0; +out: + return err; +} + +static const struct xfrm_type ipcomp_type = { + .description = "IPCOMP4", + .owner = THIS_MODULE, + .proto = IPPROTO_COMP, + .init_state = ipcomp4_init_state, + .destructor = ipcomp_destroy, + .input = ipcomp_input, + .output = ipcomp_output +}; + +static const struct net_protocol ipcomp4_protocol = { + .handler = xfrm4_rcv, + .err_handler = ipcomp4_err, + .no_policy = 1, +}; + +static int __init ipcomp4_init(void) +{ + if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) { + printk(KERN_INFO "ipcomp init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) { + printk(KERN_INFO "ipcomp init: can't add protocol\n"); + xfrm_unregister_type(&ipcomp_type, AF_INET); + return -EAGAIN; + } + return 0; +} + +static void __exit ipcomp4_fini(void) +{ + if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) + printk(KERN_INFO "ip ipcomp close: can't remove protocol\n"); + 
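A small aside on the CPI handling in ipcomp4_err() above: RFC 3173 identifies a compression SA by a 16-bit CPI, which the error handler widens into the 32-bit SPI slot expected by xfrm_state_lookup(). The conversion is just the following; cpi_to_spi() is an illustrative standalone helper, not a function in this file:

	#include <stdint.h>
	#include <arpa/inet.h>

	/* Widen a 16-bit IPComp CPI (network order) into the 32-bit SPI
	 * used for the xfrm state lookup, mirroring the expression
	 * spi = htonl(ntohs(ipch->cpi)) above. */
	static uint32_t cpi_to_spi(uint16_t cpi_be)
	{
		return htonl((uint32_t)ntohs(cpi_be));
	}

	/* e.g. cpi_to_spi(htons(0x1234)) == htonl(0x00001234) */
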
if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0) + printk(KERN_INFO "ip ipcomp close: can't remove xfrm type\n"); +} + +module_init(ipcomp4_init); +module_exit(ipcomp4_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp/IPv4) - RFC3173"); +MODULE_AUTHOR("James Morris "); + +MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_COMP); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c new file mode 100644 index 00000000..7fbcabaf --- /dev/null +++ b/net/ipv4/ipconfig.c @@ -0,0 +1,1636 @@ +/* + * Automatic Configuration of IP -- use DHCP, BOOTP, RARP, or + * user-supplied information to configure own IP address and routes. + * + * Copyright (C) 1996-1998 Martin Mares + * + * Derived from network configuration code in fs/nfs/nfsroot.c, + * originally Copyright (C) 1995, 1996 Gero Kuhlmann and me. + * + * BOOTP rewritten to construct and analyse packets itself instead + * of misusing the IP layer. num_bugs_causing_wrong_arp_replies--; + * -- MJ, December 1998 + * + * Fixed ip_auto_config_setup calling at startup in the new "Linker Magic" + * initialization scheme. + * - Arnaldo Carvalho de Melo , 08/11/1999 + * + * DHCP support added. To users this looks like a whole separate + * protocol, but we know it's just a bag on the side of BOOTP. + * -- Chip Salzenberg , May 2000 + * + * Ported DHCP support from 2.2.16 to 2.4.0-test4 + * -- Eric Biederman , 30 Aug 2000 + * + * Merged changes from 2.2.19 into 2.4.3 + * -- Eric Biederman , 22 April Aug 2001 + * + * Multiple Nameservers in /proc/net/pnp + * -- Josef Siemes , Aug 2002 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* Define this to allow debugging output */ +#undef IPCONFIG_DEBUG + +#ifdef IPCONFIG_DEBUG +#define DBG(x) printk x +#else +#define DBG(x) do { } while(0) +#endif + +#if defined(CONFIG_IP_PNP_DHCP) +#define IPCONFIG_DHCP +#endif +#if defined(CONFIG_IP_PNP_BOOTP) || defined(CONFIG_IP_PNP_DHCP) +#define IPCONFIG_BOOTP +#endif +#if defined(CONFIG_IP_PNP_RARP) +#define IPCONFIG_RARP +#endif +#if defined(IPCONFIG_BOOTP) || defined(IPCONFIG_RARP) +#define IPCONFIG_DYNAMIC +#endif + +/* Define the friendly delay before and after opening net devices */ +#define CONF_POST_OPEN 10 /* After opening: 10 msecs */ +#define CONF_CARRIER_TIMEOUT 120000 /* Wait for carrier timeout */ + +/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */ +#define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */ +#define CONF_SEND_RETRIES 6 /* Send six requests per open */ +#define CONF_INTER_TIMEOUT (HZ/2) /* Inter-device timeout: 1/2 second */ +#define CONF_BASE_TIMEOUT (HZ*2) /* Initial timeout: 2 seconds */ +#define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */ +#define CONF_TIMEOUT_MULT *7/4 /* Rate of timeout growth */ +#define CONF_TIMEOUT_MAX (HZ*30) /* Maximum allowed timeout */ +#define CONF_NAMESERVERS_MAX 3 /* Maximum number of nameservers + - '3' from resolv.h */ + +#define NONE cpu_to_be32(INADDR_NONE) +#define ANY cpu_to_be32(INADDR_ANY) + +/* + * Public IP configuration + */ + +/* This is used by platforms which might be able to set the ipconfig + * variables using firmware environment vars. If this is set, it will + * ignore such firmware variables. 
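One reading note on the timeout macros above: CONF_TIMEOUT_MULT expands to `*7/4`, so the statement `timeout = timeout CONF_TIMEOUT_MULT;` further down in ic_dynamic() is deliberate macro pasting rather than a typo. The resulting retransmit schedule (ignoring the random component that ic_dynamic() adds) can be reproduced with this standalone sketch, assuming HZ is 100:

	#include <stdio.h>

	#define HZ			100	/* assumed for the illustration */
	#define CONF_BASE_TIMEOUT	(HZ*2)
	#define CONF_TIMEOUT_MULT	*7/4
	#define CONF_TIMEOUT_MAX	(HZ*30)

	/* Print the DHCP/BOOTP retransmit intervals produced by the growth
	 * rule timeout = timeout*7/4, capped at CONF_TIMEOUT_MAX. */
	int main(void)
	{
		unsigned long timeout = CONF_BASE_TIMEOUT;
		int i;

		for (i = 0; i < 6; i++) {	/* CONF_SEND_RETRIES attempts */
			printf("attempt %d: %lu jiffies\n", i + 1, timeout);
			timeout = timeout CONF_TIMEOUT_MULT;
			if (timeout > CONF_TIMEOUT_MAX)
				timeout = CONF_TIMEOUT_MAX;
		}
		return 0;
	}
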
+ */ +int ic_set_manually __initdata = 0; /* IPconfig parameters set manually */ + +static int ic_enable __initdata = 0; /* IP config enabled? */ + +/* Protocol choice */ +int ic_proto_enabled __initdata = 0 +#ifdef IPCONFIG_BOOTP + | IC_BOOTP +#endif +#ifdef CONFIG_IP_PNP_DHCP + | IC_USE_DHCP +#endif +#ifdef IPCONFIG_RARP + | IC_RARP +#endif + ; + +static int ic_host_name_set __initdata = 0; /* Host name set by us? */ + +__be32 ic_myaddr = NONE; /* My IP address */ +static __be32 ic_netmask = NONE; /* Netmask for local subnet */ +__be32 ic_gateway = NONE; /* Gateway IP address */ + +__be32 ic_servaddr = NONE; /* Boot server IP address */ + +__be32 root_server_addr = NONE; /* Address of NFS server */ +u8 root_server_path[256] = { 0, }; /* Path to mount as root */ + +u32 ic_dev_xid; /* Device under configuration */ + +/* vendor class identifier */ +static char vendor_class_identifier[253] __initdata; + +/* Persistent data: */ + +static int ic_proto_used; /* Protocol used, if any */ +static __be32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */ +static u8 ic_domain[64]; /* DNS (not NIS) domain name */ + +/* + * Private state. + */ + +/* Name of user-selected boot device */ +static char user_dev_name[IFNAMSIZ] __initdata = { 0, }; + +/* Protocols supported by available interfaces */ +static int ic_proto_have_if __initdata = 0; + +/* MTU for boot device */ +static int ic_dev_mtu __initdata = 0; + +#ifdef IPCONFIG_DYNAMIC +static DEFINE_SPINLOCK(ic_recv_lock); +static volatile int ic_got_reply __initdata = 0; /* Proto(s) that replied */ +#endif +#ifdef IPCONFIG_DHCP +static int ic_dhcp_msgtype __initdata = 0; /* DHCP msg type received */ +#endif + + +/* + * Network devices + */ + +struct ic_device { + struct ic_device *next; + struct net_device *dev; + unsigned short flags; + short able; + __be32 xid; +}; + +static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */ +static struct net_device *ic_dev __initdata = NULL; /* Selected device */ + +static bool __init ic_is_init_dev(struct net_device *dev) +{ + if (dev->flags & IFF_LOOPBACK) + return false; + return user_dev_name[0] ? 
!strcmp(dev->name, user_dev_name) : + (!(dev->flags & IFF_LOOPBACK) && + (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) && + strncmp(dev->name, "dummy", 5)); +} + +static int __init ic_open_devs(void) +{ + struct ic_device *d, **last; + struct net_device *dev; + unsigned short oflags; + unsigned long start; + + last = &ic_first_dev; + rtnl_lock(); + + /* bring loopback device up first */ + for_each_netdev(&init_net, dev) { + if (!(dev->flags & IFF_LOOPBACK)) + continue; + if (dev_change_flags(dev, dev->flags | IFF_UP) < 0) + printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name); + } + + for_each_netdev(&init_net, dev) { + if (ic_is_init_dev(dev)) { + int able = 0; + if (dev->mtu >= 364) + able |= IC_BOOTP; + else + printk(KERN_WARNING "DHCP/BOOTP: Ignoring device %s, MTU %d too small", dev->name, dev->mtu); + if (!(dev->flags & IFF_NOARP)) + able |= IC_RARP; + able &= ic_proto_enabled; + if (ic_proto_enabled && !able) + continue; + oflags = dev->flags; + if (dev_change_flags(dev, oflags | IFF_UP) < 0) { + printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name); + continue; + } + if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) { + rtnl_unlock(); + return -ENOMEM; + } + d->dev = dev; + *last = d; + last = &d->next; + d->flags = oflags; + d->able = able; + if (able & IC_BOOTP) + get_random_bytes(&d->xid, sizeof(__be32)); + else + d->xid = 0; + ic_proto_have_if |= able; + DBG(("IP-Config: %s UP (able=%d, xid=%08x)\n", + dev->name, able, d->xid)); + } + } + + /* no point in waiting if we could not bring up at least one device */ + if (!ic_first_dev) + goto have_carrier; + + /* wait for a carrier on at least one device */ + start = jiffies; + while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) { + for_each_netdev(&init_net, dev) + if (ic_is_init_dev(dev) && netif_carrier_ok(dev)) + goto have_carrier; + + msleep(1); + } +have_carrier: + rtnl_unlock(); + + *last = NULL; + + if (!ic_first_dev) { + if (user_dev_name[0]) + printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name); + else + printk(KERN_ERR "IP-Config: No network devices available.\n"); + return -ENODEV; + } + return 0; +} + +static void __init ic_close_devs(void) +{ + struct ic_device *d, *next; + struct net_device *dev; + + rtnl_lock(); + next = ic_first_dev; + while ((d = next)) { + next = d->next; + dev = d->dev; + if (dev != ic_dev) { + DBG(("IP-Config: Downing %s\n", dev->name)); + dev_change_flags(dev, d->flags); + } + kfree(d); + } + rtnl_unlock(); +} + +/* + * Interface to various network functions. + */ + +static inline void +set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port) +{ + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = addr; + sin->sin_port = port; +} + +static int __init ic_devinet_ioctl(unsigned int cmd, struct ifreq *arg) +{ + int res; + + mm_segment_t oldfs = get_fs(); + set_fs(get_ds()); + res = devinet_ioctl(&init_net, cmd, (struct ifreq __user *) arg); + set_fs(oldfs); + return res; +} + +static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg) +{ + int res; + + mm_segment_t oldfs = get_fs(); + set_fs(get_ds()); + res = dev_ioctl(&init_net, cmd, (struct ifreq __user *) arg); + set_fs(oldfs); + return res; +} + +static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg) +{ + int res; + + mm_segment_t oldfs = get_fs(); + set_fs(get_ds()); + res = ip_rt_ioctl(&init_net, cmd, (void __user *) arg); + set_fs(oldfs); + return res; +} + +/* + * Set up interface addresses and routes. 
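The ic_devinet_ioctl()/ic_dev_ioctl()/ic_route_ioctl() wrappers above use the set_fs(get_ds()) trick so early boot code can reuse the normal ioctl entry points with kernel buffers; the interface setup that follows issues the same requests userspace would. For comparison, a userspace sketch of SIOCSIFADDR; the interface name "eth0" and the helper name set_addr() are placeholders:

	#include <string.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <arpa/inet.h>
	#include <net/if.h>

	/* Assign an IPv4 address to eth0 through the same ioctl the
	 * in-kernel wrapper ic_devinet_ioctl() drives; fd is any
	 * AF_INET socket. */
	static int set_addr(int fd, const char *addr)
	{
		struct ifreq ir;
		struct sockaddr_in *sin = (struct sockaddr_in *)&ir.ifr_addr;

		memset(&ir, 0, sizeof(ir));
		strncpy(ir.ifr_name, "eth0", IFNAMSIZ - 1);
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = inet_addr(addr);

		return ioctl(fd, SIOCSIFADDR, &ir);
	}
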
+ */ + +static int __init ic_setup_if(void) +{ + struct ifreq ir; + struct sockaddr_in *sin = (void *) &ir.ifr_ifru.ifru_addr; + int err; + + memset(&ir, 0, sizeof(ir)); + strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name); + set_sockaddr(sin, ic_myaddr, 0); + if ((err = ic_devinet_ioctl(SIOCSIFADDR, &ir)) < 0) { + printk(KERN_ERR "IP-Config: Unable to set interface address (%d).\n", err); + return -1; + } + set_sockaddr(sin, ic_netmask, 0); + if ((err = ic_devinet_ioctl(SIOCSIFNETMASK, &ir)) < 0) { + printk(KERN_ERR "IP-Config: Unable to set interface netmask (%d).\n", err); + return -1; + } + set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0); + if ((err = ic_devinet_ioctl(SIOCSIFBRDADDR, &ir)) < 0) { + printk(KERN_ERR "IP-Config: Unable to set interface broadcast address (%d).\n", err); + return -1; + } + /* Handle the case where we need non-standard MTU on the boot link (a network + * using jumbo frames, for instance). If we can't set the mtu, don't error + * out, we'll try to muddle along. + */ + if (ic_dev_mtu != 0) { + strcpy(ir.ifr_name, ic_dev->name); + ir.ifr_mtu = ic_dev_mtu; + if ((err = ic_dev_ioctl(SIOCSIFMTU, &ir)) < 0) + printk(KERN_ERR "IP-Config: Unable to set interface mtu to %d (%d).\n", + ic_dev_mtu, err); + } + return 0; +} + +static int __init ic_setup_routes(void) +{ + /* No need to setup device routes, only the default route... */ + + if (ic_gateway != NONE) { + struct rtentry rm; + int err; + + memset(&rm, 0, sizeof(rm)); + if ((ic_gateway ^ ic_myaddr) & ic_netmask) { + printk(KERN_ERR "IP-Config: Gateway not on directly connected network.\n"); + return -1; + } + set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0); + set_sockaddr((struct sockaddr_in *) &rm.rt_genmask, 0, 0); + set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0); + rm.rt_flags = RTF_UP | RTF_GATEWAY; + if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) { + printk(KERN_ERR "IP-Config: Cannot add default route (%d).\n", err); + return -1; + } + } + + return 0; +} + +/* + * Fill in default values for all missing parameters. + */ + +static int __init ic_defaults(void) +{ + /* + * At this point we have no userspace running so need not + * claim locks on system_utsname + */ + + if (!ic_host_name_set) + sprintf(init_utsname()->nodename, "%pI4", &ic_myaddr); + + if (root_server_addr == NONE) + root_server_addr = ic_servaddr; + + if (ic_netmask == NONE) { + if (IN_CLASSA(ntohl(ic_myaddr))) + ic_netmask = htonl(IN_CLASSA_NET); + else if (IN_CLASSB(ntohl(ic_myaddr))) + ic_netmask = htonl(IN_CLASSB_NET); + else if (IN_CLASSC(ntohl(ic_myaddr))) + ic_netmask = htonl(IN_CLASSC_NET); + else { + printk(KERN_ERR "IP-Config: Unable to guess netmask for address %pI4\n", + &ic_myaddr); + return -1; + } + printk("IP-Config: Guessing netmask %pI4\n", &ic_netmask); + } + + return 0; +} + +/* + * RARP support. + */ + +#ifdef IPCONFIG_RARP + +static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); + +static struct packet_type rarp_packet_type __initdata = { + .type = cpu_to_be16(ETH_P_RARP), + .func = ic_rarp_recv, +}; + +static inline void __init ic_rarp_init(void) +{ + dev_add_pack(&rarp_packet_type); +} + +static inline void __init ic_rarp_cleanup(void) +{ + dev_remove_pack(&rarp_packet_type); +} + +/* + * Process received RARP packet. 
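ic_defaults() above falls back to a classful netmask when none was supplied by DHCP/BOOTP/RARP or the command line. The rule it applies is small enough to restate as a standalone userspace sketch using the same IN_CLASS* macros; guess_netmask() is an illustrative name:

	#include <stdint.h>
	#include <netinet/in.h>
	#include <arpa/inet.h>

	/* Classful fallback netmask, mirroring the IN_CLASSA/B/C checks in
	 * ic_defaults(); returns 0 for class D/E addresses, where the
	 * kernel code gives up instead. */
	static uint32_t guess_netmask(uint32_t addr_be)
	{
		uint32_t host = ntohl(addr_be);

		if (IN_CLASSA(host))
			return htonl(IN_CLASSA_NET);	/* 255.0.0.0 */
		if (IN_CLASSB(host))
			return htonl(IN_CLASSB_NET);	/* 255.255.0.0 */
		if (IN_CLASSC(host))
			return htonl(IN_CLASSC_NET);	/* 255.255.255.0 */
		return 0;
	}
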
+ */ +static int __init +ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +{ + struct arphdr *rarp; + unsigned char *rarp_ptr; + __be32 sip, tip; + unsigned char *sha, *tha; /* s for "source", t for "target" */ + struct ic_device *d; + + if (!net_eq(dev_net(dev), &init_net)) + goto drop; + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + return NET_RX_DROP; + + if (!pskb_may_pull(skb, sizeof(struct arphdr))) + goto drop; + + /* Basic sanity checks can be done without the lock. */ + rarp = (struct arphdr *)skb_transport_header(skb); + + /* If this test doesn't pass, it's not IP, or we should + * ignore it anyway. + */ + if (rarp->ar_hln != dev->addr_len || dev->type != ntohs(rarp->ar_hrd)) + goto drop; + + /* If it's not a RARP reply, delete it. */ + if (rarp->ar_op != htons(ARPOP_RREPLY)) + goto drop; + + /* If it's not Ethernet, delete it. */ + if (rarp->ar_pro != htons(ETH_P_IP)) + goto drop; + + if (!pskb_may_pull(skb, arp_hdr_len(dev))) + goto drop; + + /* OK, it is all there and looks valid, process... */ + rarp = (struct arphdr *)skb_transport_header(skb); + rarp_ptr = (unsigned char *) (rarp + 1); + + /* One reply at a time, please. */ + spin_lock(&ic_recv_lock); + + /* If we already have a reply, just drop the packet */ + if (ic_got_reply) + goto drop_unlock; + + /* Find the ic_device that the packet arrived on */ + d = ic_first_dev; + while (d && d->dev != dev) + d = d->next; + if (!d) + goto drop_unlock; /* should never happen */ + + /* Extract variable-width fields */ + sha = rarp_ptr; + rarp_ptr += dev->addr_len; + memcpy(&sip, rarp_ptr, 4); + rarp_ptr += 4; + tha = rarp_ptr; + rarp_ptr += dev->addr_len; + memcpy(&tip, rarp_ptr, 4); + + /* Discard packets which are not meant for us. */ + if (memcmp(tha, dev->dev_addr, dev->addr_len)) + goto drop_unlock; + + /* Discard packets which are not from specified server. */ + if (ic_servaddr != NONE && ic_servaddr != sip) + goto drop_unlock; + + /* We have a winner! */ + ic_dev = dev; + if (ic_myaddr == NONE) + ic_myaddr = tip; + ic_servaddr = sip; + ic_got_reply = IC_RARP; + +drop_unlock: + /* Show's over. Nothing to see here. */ + spin_unlock(&ic_recv_lock); + +drop: + /* Throw the packet out. */ + kfree_skb(skb); + return 0; +} + + +/* + * Send RARP request packet over a single interface. + */ +static void __init ic_rarp_send_if(struct ic_device *d) +{ + struct net_device *dev = d->dev; + arp_send(ARPOP_RREQUEST, ETH_P_RARP, 0, dev, 0, NULL, + dev->dev_addr, dev->dev_addr); +} +#endif + +/* + * DHCP/BOOTP support. + */ + +#ifdef IPCONFIG_BOOTP + +struct bootp_pkt { /* BOOTP packet format */ + struct iphdr iph; /* IP header */ + struct udphdr udph; /* UDP header */ + u8 op; /* 1=request, 2=reply */ + u8 htype; /* HW address type */ + u8 hlen; /* HW address length */ + u8 hops; /* Used only by gateways */ + __be32 xid; /* Transaction ID */ + __be16 secs; /* Seconds since we started */ + __be16 flags; /* Just what it says */ + __be32 client_ip; /* Client's IP address if known */ + __be32 your_ip; /* Assigned IP address */ + __be32 server_ip; /* (Next, e.g. 
NFS) Server's IP address */ + __be32 relay_ip; /* IP address of BOOTP relay */ + u8 hw_addr[16]; /* Client's HW address */ + u8 serv_name[64]; /* Server host name */ + u8 boot_file[128]; /* Name of boot file */ + u8 exten[312]; /* DHCP options / BOOTP vendor extensions */ +}; + +/* packet ops */ +#define BOOTP_REQUEST 1 +#define BOOTP_REPLY 2 + +/* DHCP message types */ +#define DHCPDISCOVER 1 +#define DHCPOFFER 2 +#define DHCPREQUEST 3 +#define DHCPDECLINE 4 +#define DHCPACK 5 +#define DHCPNAK 6 +#define DHCPRELEASE 7 +#define DHCPINFORM 8 + +static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); + +static struct packet_type bootp_packet_type __initdata = { + .type = cpu_to_be16(ETH_P_IP), + .func = ic_bootp_recv, +}; + + +/* + * Initialize DHCP/BOOTP extension fields in the request. + */ + +static const u8 ic_bootp_cookie[4] = { 99, 130, 83, 99 }; + +#ifdef IPCONFIG_DHCP + +static void __init +ic_dhcp_init_options(u8 *options) +{ + u8 mt = ((ic_servaddr == NONE) + ? DHCPDISCOVER : DHCPREQUEST); + u8 *e = options; + int len; + +#ifdef IPCONFIG_DEBUG + printk("DHCP: Sending message type %d\n", mt); +#endif + + memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */ + e += 4; + + *e++ = 53; /* DHCP message type */ + *e++ = 1; + *e++ = mt; + + if (mt == DHCPREQUEST) { + *e++ = 54; /* Server ID (IP address) */ + *e++ = 4; + memcpy(e, &ic_servaddr, 4); + e += 4; + + *e++ = 50; /* Requested IP address */ + *e++ = 4; + memcpy(e, &ic_myaddr, 4); + e += 4; + } + + /* always? */ + { + static const u8 ic_req_params[] = { + 1, /* Subnet mask */ + 3, /* Default gateway */ + 6, /* DNS server */ + 12, /* Host name */ + 15, /* Domain name */ + 17, /* Boot path */ + 26, /* MTU */ + 40, /* NIS domain name */ + }; + + *e++ = 55; /* Parameter request list */ + *e++ = sizeof(ic_req_params); + memcpy(e, ic_req_params, sizeof(ic_req_params)); + e += sizeof(ic_req_params); + + if (ic_host_name_set) { + *e++ = 12; /* host-name */ + len = strlen(utsname()->nodename); + *e++ = len; + memcpy(e, utsname()->nodename, len); + e += len; + } + if (*vendor_class_identifier) { + printk(KERN_INFO "DHCP: sending class identifier \"%s\"\n", + vendor_class_identifier); + *e++ = 60; /* Class-identifier */ + len = strlen(vendor_class_identifier); + *e++ = len; + memcpy(e, vendor_class_identifier, len); + e += len; + } + } + + *e++ = 255; /* End of the list */ +} + +#endif /* IPCONFIG_DHCP */ + +static void __init ic_bootp_init_ext(u8 *e) +{ + memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */ + e += 4; + *e++ = 1; /* Subnet mask request */ + *e++ = 4; + e += 4; + *e++ = 3; /* Default gateway request */ + *e++ = 4; + e += 4; + *e++ = 5; /* Name server request */ + *e++ = 8; + e += 8; + *e++ = 12; /* Host name request */ + *e++ = 32; + e += 32; + *e++ = 40; /* NIS Domain name request */ + *e++ = 32; + e += 32; + *e++ = 17; /* Boot path */ + *e++ = 40; + e += 40; + + *e++ = 57; /* set extension buffer size for reply */ + *e++ = 2; + *e++ = 1; /* 128+236+8+20+14, see dhcpd sources */ + *e++ = 150; + + *e++ = 255; /* End of the list */ +} + + +/* + * Initialize the DHCP/BOOTP mechanism. + */ +static inline void __init ic_bootp_init(void) +{ + int i; + + for (i = 0; i < CONF_NAMESERVERS_MAX; i++) + ic_nameservers[i] = NONE; + + dev_add_pack(&bootp_packet_type); +} + + +/* + * DHCP/BOOTP cleanup. + */ +static inline void __init ic_bootp_cleanup(void) +{ + dev_remove_pack(&bootp_packet_type); +} + + +/* + * Send DHCP/BOOTP request to single interface. 
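The request buffers built by ic_dhcp_init_options() and ic_bootp_init_ext() above are RFC 2132 code/length/value triples placed after the 99.130.83.99 magic cookie. The inline `*e++ = code; *e++ = len; memcpy(...)` pattern reduces to a helper like this (sketch only; dhcp_put_option() and build_request() are invented names):

	#include <stdint.h>
	#include <string.h>

	/* Append one DHCP option (code, length, payload) and return the
	 * next write position. */
	static uint8_t *dhcp_put_option(uint8_t *e, uint8_t code,
					const void *data, uint8_t len)
	{
		*e++ = code;
		*e++ = len;
		memcpy(e, data, len);
		return e + len;
	}

	/* e.g. request a specific address (option 50) and terminate the
	 * list, as the DHCPREQUEST branch above does inline. */
	static void build_request(uint8_t *e, uint32_t requested_ip_be)
	{
		e = dhcp_put_option(e, 50, &requested_ip_be, 4);
		*e = 255;			/* end-of-options */
	}
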
+ */ +static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_diff) +{ + struct net_device *dev = d->dev; + struct sk_buff *skb; + struct bootp_pkt *b; + struct iphdr *h; + + /* Allocate packet */ + skb = alloc_skb(sizeof(struct bootp_pkt) + LL_ALLOCATED_SPACE(dev) + 15, + GFP_KERNEL); + if (!skb) + return; + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt)); + memset(b, 0, sizeof(struct bootp_pkt)); + + /* Construct IP header */ + skb_reset_network_header(skb); + h = ip_hdr(skb); + h->version = 4; + h->ihl = 5; + h->tot_len = htons(sizeof(struct bootp_pkt)); + h->frag_off = htons(IP_DF); + h->ttl = 64; + h->protocol = IPPROTO_UDP; + h->daddr = htonl(INADDR_BROADCAST); + h->check = ip_fast_csum((unsigned char *) h, h->ihl); + + /* Construct UDP header */ + b->udph.source = htons(68); + b->udph.dest = htons(67); + b->udph.len = htons(sizeof(struct bootp_pkt) - sizeof(struct iphdr)); + /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */ + + /* Construct DHCP/BOOTP header */ + b->op = BOOTP_REQUEST; + if (dev->type < 256) /* check for false types */ + b->htype = dev->type; + else if (dev->type == ARPHRD_IEEE802_TR) /* fix for token ring */ + b->htype = ARPHRD_IEEE802; + else if (dev->type == ARPHRD_FDDI) + b->htype = ARPHRD_ETHER; + else { + printk("Unknown ARP type 0x%04x for device %s\n", dev->type, dev->name); + b->htype = dev->type; /* can cause undefined behavior */ + } + + /* server_ip and your_ip address are both already zero per RFC2131 */ + b->hlen = dev->addr_len; + memcpy(b->hw_addr, dev->dev_addr, dev->addr_len); + b->secs = htons(jiffies_diff / HZ); + b->xid = d->xid; + + /* add DHCP options or BOOTP extensions */ +#ifdef IPCONFIG_DHCP + if (ic_proto_enabled & IC_USE_DHCP) + ic_dhcp_init_options(b->exten); + else +#endif + ic_bootp_init_ext(b->exten); + + /* Chain packet down the line... */ + skb->dev = dev; + skb->protocol = htons(ETH_P_IP); + if (dev_hard_header(skb, dev, ntohs(skb->protocol), + dev->broadcast, dev->dev_addr, skb->len) < 0 || + dev_queue_xmit(skb) < 0) + printk("E"); +} + + +/* + * Copy BOOTP-supplied string if not already set. + */ +static int __init ic_bootp_string(char *dest, char *src, int len, int max) +{ + if (!len) + return 0; + if (len > max-1) + len = max-1; + memcpy(dest, src, len); + dest[len] = '\0'; + return 1; +} + + +/* + * Process BOOTP extensions. + */ +static void __init ic_do_bootp_ext(u8 *ext) +{ + u8 servers; + int i; + u16 mtu; + +#ifdef IPCONFIG_DEBUG + u8 *c; + + printk("DHCP/BOOTP: Got extension %d:",*ext); + for (c=ext+2; c CONF_NAMESERVERS_MAX) + servers = CONF_NAMESERVERS_MAX; + for (i = 0; i < servers; i++) { + if (ic_nameservers[i] == NONE) + memcpy(&ic_nameservers[i], ext+1+4*i, 4); + } + break; + case 12: /* Host name */ + ic_bootp_string(utsname()->nodename, ext+1, *ext, __NEW_UTS_LEN); + ic_host_name_set = 1; + break; + case 15: /* Domain name (DNS) */ + ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain)); + break; + case 17: /* Root path */ + if (!root_server_path[0]) + ic_bootp_string(root_server_path, ext+1, *ext, sizeof(root_server_path)); + break; + case 26: /* Interface MTU */ + memcpy(&mtu, ext+1, sizeof(mtu)); + ic_dev_mtu = ntohs(mtu); + break; + case 40: /* NIS Domain name (_not_ DNS) */ + ic_bootp_string(utsname()->domainname, ext+1, *ext, __NEW_UTS_LEN); + break; + } +} + + +/* + * Receive BOOTP reply. 
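ic_do_bootp_ext() above interprets one option at a time; the surrounding walk over the option area (done in ic_bootp_recv(), below) steps code/length/value triples, treating code 0 as one-byte padding and 255 as the terminator. A defensive standalone version of that walk; walk_options() is an invented name:

	#include <stdint.h>

	/* Walk a DHCP/BOOTP option area and hand each complete option
	 * (pointer to its code byte) to the callback, in the spirit of the
	 * loop that feeds ic_do_bootp_ext(). */
	static void walk_options(uint8_t *ext, uint8_t *end,
				 void (*handle)(uint8_t *opt))
	{
		while (ext < end && *ext != 0xff) {
			uint8_t *opt = ext++;

			if (*opt == 0)		/* padding byte */
				continue;
			if (ext >= end)		/* truncated: no length byte */
				break;
			ext += ext[0] + 1;	/* length byte + payload */
			if (ext <= end)		/* payload fits in buffer */
				handle(opt);
		}
	}
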
+ */ +static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +{ + struct bootp_pkt *b; + struct iphdr *h; + struct ic_device *d; + int len, ext_len; + + if (!net_eq(dev_net(dev), &init_net)) + goto drop; + + /* Perform verifications before taking the lock. */ + if (skb->pkt_type == PACKET_OTHERHOST) + goto drop; + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + return NET_RX_DROP; + + if (!pskb_may_pull(skb, + sizeof(struct iphdr) + + sizeof(struct udphdr))) + goto drop; + + b = (struct bootp_pkt *)skb_network_header(skb); + h = &b->iph; + + if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP) + goto drop; + + /* Fragments are not supported */ + if (h->frag_off & htons(IP_OFFSET | IP_MF)) { + if (net_ratelimit()) + printk(KERN_ERR "DHCP/BOOTP: Ignoring fragmented " + "reply.\n"); + goto drop; + } + + if (skb->len < ntohs(h->tot_len)) + goto drop; + + if (ip_fast_csum((char *) h, h->ihl)) + goto drop; + + if (b->udph.source != htons(67) || b->udph.dest != htons(68)) + goto drop; + + if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr)) + goto drop; + + len = ntohs(b->udph.len) - sizeof(struct udphdr); + ext_len = len - (sizeof(*b) - + sizeof(struct iphdr) - + sizeof(struct udphdr) - + sizeof(b->exten)); + if (ext_len < 0) + goto drop; + + /* Ok the front looks good, make sure we can get at the rest. */ + if (!pskb_may_pull(skb, skb->len)) + goto drop; + + b = (struct bootp_pkt *)skb_network_header(skb); + h = &b->iph; + + /* One reply at a time, please. */ + spin_lock(&ic_recv_lock); + + /* If we already have a reply, just drop the packet */ + if (ic_got_reply) + goto drop_unlock; + + /* Find the ic_device that the packet arrived on */ + d = ic_first_dev; + while (d && d->dev != dev) + d = d->next; + if (!d) + goto drop_unlock; /* should never happen */ + + /* Is it a reply to our BOOTP request? */ + if (b->op != BOOTP_REPLY || + b->xid != d->xid) { + if (net_ratelimit()) + printk(KERN_ERR "DHCP/BOOTP: Reply not for us, " + "op[%x] xid[%x]\n", + b->op, b->xid); + goto drop_unlock; + } + + /* Is it a reply for the device we are configuring? */ + if (b->xid != ic_dev_xid) { + if (net_ratelimit()) + printk(KERN_ERR "DHCP/BOOTP: Ignoring delayed packet\n"); + goto drop_unlock; + } + + /* Parse extensions */ + if (ext_len >= 4 && + !memcmp(b->exten, ic_bootp_cookie, 4)) { /* Check magic cookie */ + u8 *end = (u8 *) b + ntohs(b->iph.tot_len); + u8 *ext; + +#ifdef IPCONFIG_DHCP + if (ic_proto_enabled & IC_USE_DHCP) { + __be32 server_id = NONE; + int mt = 0; + + ext = &b->exten[4]; + while (ext < end && *ext != 0xff) { + u8 *opt = ext++; + if (*opt == 0) /* Padding */ + continue; + ext += *ext + 1; + if (ext >= end) + break; + switch (*opt) { + case 53: /* Message type */ + if (opt[1]) + mt = opt[2]; + break; + case 54: /* Server ID (IP address) */ + if (opt[1] >= 4) + memcpy(&server_id, opt + 2, 4); + break; + } + } + +#ifdef IPCONFIG_DEBUG + printk("DHCP: Got message type %d\n", mt); +#endif + + switch (mt) { + case DHCPOFFER: + /* While in the process of accepting one offer, + * ignore all others. + */ + if (ic_myaddr != NONE) + goto drop_unlock; + + /* Let's accept that offer. */ + ic_myaddr = b->your_ip; + ic_servaddr = server_id; +#ifdef IPCONFIG_DEBUG + printk("DHCP: Offered address %pI4 by server %pI4\n", + &ic_myaddr, &ic_servaddr); +#endif + /* The DHCP indicated server address takes + * precedence over the bootp header one if + * they are different. 
+ */ + if ((server_id != NONE) && + (b->server_ip != server_id)) + b->server_ip = ic_servaddr; + break; + + case DHCPACK: + if (memcmp(dev->dev_addr, b->hw_addr, dev->addr_len) != 0) + goto drop_unlock; + + /* Yeah! */ + break; + + default: + /* Urque. Forget it*/ + ic_myaddr = NONE; + ic_servaddr = NONE; + goto drop_unlock; + } + + ic_dhcp_msgtype = mt; + + } +#endif /* IPCONFIG_DHCP */ + + ext = &b->exten[4]; + while (ext < end && *ext != 0xff) { + u8 *opt = ext++; + if (*opt == 0) /* Padding */ + continue; + ext += *ext + 1; + if (ext < end) + ic_do_bootp_ext(opt); + } + } + + /* We have a winner! */ + ic_dev = dev; + ic_myaddr = b->your_ip; + ic_servaddr = b->server_ip; + if (ic_gateway == NONE && b->relay_ip) + ic_gateway = b->relay_ip; + if (ic_nameservers[0] == NONE) + ic_nameservers[0] = ic_servaddr; + ic_got_reply = IC_BOOTP; + +drop_unlock: + /* Show's over. Nothing to see here. */ + spin_unlock(&ic_recv_lock); + +drop: + /* Throw the packet out. */ + kfree_skb(skb); + + return 0; +} + + +#endif + + +/* + * Dynamic IP configuration -- DHCP, BOOTP, RARP. + */ + +#ifdef IPCONFIG_DYNAMIC + +static int __init ic_dynamic(void) +{ + int retries; + struct ic_device *d; + unsigned long start_jiffies, timeout, jiff; + int do_bootp = ic_proto_have_if & IC_BOOTP; + int do_rarp = ic_proto_have_if & IC_RARP; + + /* + * If none of DHCP/BOOTP/RARP was selected, return with an error. + * This routine gets only called when some pieces of information + * are missing, and without DHCP/BOOTP/RARP we are unable to get it. + */ + if (!ic_proto_enabled) { + printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n"); + return -1; + } + +#ifdef IPCONFIG_BOOTP + if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP) + printk(KERN_ERR "DHCP/BOOTP: No suitable device found.\n"); +#endif +#ifdef IPCONFIG_RARP + if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP) + printk(KERN_ERR "RARP: No suitable device found.\n"); +#endif + + if (!ic_proto_have_if) + /* Error message already printed */ + return -1; + + /* + * Setup protocols + */ +#ifdef IPCONFIG_BOOTP + if (do_bootp) + ic_bootp_init(); +#endif +#ifdef IPCONFIG_RARP + if (do_rarp) + ic_rarp_init(); +#endif + + /* + * Send requests and wait, until we get an answer. This loop + * seems to be a terrible waste of CPU time, but actually there is + * only one process running at all, so we don't need to use any + * scheduler functions. + * [Actually we could now, but the nothing else running note still + * applies.. - AC] + */ + printk(KERN_NOTICE "Sending %s%s%s requests .", + do_bootp + ? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "", + (do_bootp && do_rarp) ? " and " : "", + do_rarp ? "RARP" : ""); + + start_jiffies = jiffies; + d = ic_first_dev; + retries = CONF_SEND_RETRIES; + get_random_bytes(&timeout, sizeof(timeout)); + timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM); + for (;;) { + /* Track the device we are configuring */ + ic_dev_xid = d->xid; + +#ifdef IPCONFIG_BOOTP + if (do_bootp && (d->able & IC_BOOTP)) + ic_bootp_send_if(d, jiffies - start_jiffies); +#endif +#ifdef IPCONFIG_RARP + if (do_rarp && (d->able & IC_RARP)) + ic_rarp_send_if(d); +#endif + + jiff = jiffies + (d->next ? CONF_INTER_TIMEOUT : timeout); + while (time_before(jiffies, jiff) && !ic_got_reply) + schedule_timeout_uninterruptible(1); +#ifdef IPCONFIG_DHCP + /* DHCP isn't done until we get a DHCPACK. 
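Taken together, ic_dhcp_init_options() and the message-type handling above implement the standard four-step exchange: send DHCPDISCOVER while no server is known, record the DHCPOFFER, resend as DHCPREQUEST, and treat only DHCPACK as completion (anything else resets the tentative address). A compressed sketch of that progression, reusing the message-type values #defined earlier in this file; next_step() and the enum are illustrative only:

	/* Same values as the table of DHCP message types above. */
	#define DHCPOFFER	2
	#define DHCPACK		5
	#define DHCPNAK		6

	enum dhcp_step { SEND_DISCOVER, SEND_REQUEST, BOUND };

	/* Client-side progression mirrored by (ic_servaddr, ic_myaddr,
	 * ic_dhcp_msgtype) in the code above. */
	static enum dhcp_step next_step(enum dhcp_step cur, int msgtype)
	{
		switch (cur) {
		case SEND_DISCOVER:
			/* a DHCPOFFER supplies the address and server id */
			return msgtype == DHCPOFFER ? SEND_REQUEST : SEND_DISCOVER;
		case SEND_REQUEST:
			if (msgtype == DHCPACK)
				return BOUND;
			/* DHCPNAK or anything unexpected drops the tentative
			 * address and restarts discovery */
			return SEND_DISCOVER;
		default:
			return BOUND;
		}
	}
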
*/ + if ((ic_got_reply & IC_BOOTP) && + (ic_proto_enabled & IC_USE_DHCP) && + ic_dhcp_msgtype != DHCPACK) { + ic_got_reply = 0; + printk(KERN_CONT ","); + continue; + } +#endif /* IPCONFIG_DHCP */ + + if (ic_got_reply) { + printk(KERN_CONT " OK\n"); + break; + } + + if ((d = d->next)) + continue; + + if (! --retries) { + printk(KERN_CONT " timed out!\n"); + break; + } + + d = ic_first_dev; + + timeout = timeout CONF_TIMEOUT_MULT; + if (timeout > CONF_TIMEOUT_MAX) + timeout = CONF_TIMEOUT_MAX; + + printk(KERN_CONT "."); + } + +#ifdef IPCONFIG_BOOTP + if (do_bootp) + ic_bootp_cleanup(); +#endif +#ifdef IPCONFIG_RARP + if (do_rarp) + ic_rarp_cleanup(); +#endif + + if (!ic_got_reply) { + ic_myaddr = NONE; + return -1; + } + + printk("IP-Config: Got %s answer from %pI4, ", + ((ic_got_reply & IC_RARP) ? "RARP" + : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"), + &ic_servaddr); + printk(KERN_CONT "my address is %pI4\n", &ic_myaddr); + + return 0; +} + +#endif /* IPCONFIG_DYNAMIC */ + +#ifdef CONFIG_PROC_FS + +static int pnp_seq_show(struct seq_file *seq, void *v) +{ + int i; + + if (ic_proto_used & IC_PROTO) + seq_printf(seq, "#PROTO: %s\n", + (ic_proto_used & IC_RARP) ? "RARP" + : (ic_proto_used & IC_USE_DHCP) ? "DHCP" : "BOOTP"); + else + seq_puts(seq, "#MANUAL\n"); + + if (ic_domain[0]) + seq_printf(seq, + "domain %s\n", ic_domain); + for (i = 0; i < CONF_NAMESERVERS_MAX; i++) { + if (ic_nameservers[i] != NONE) + seq_printf(seq, "nameserver %pI4\n", + &ic_nameservers[i]); + } + if (ic_servaddr != NONE) + seq_printf(seq, "bootserver %pI4\n", + &ic_servaddr); + return 0; +} + +static int pnp_seq_open(struct inode *indoe, struct file *file) +{ + return single_open(file, pnp_seq_show, NULL); +} + +static const struct file_operations pnp_seq_fops = { + .owner = THIS_MODULE, + .open = pnp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_PROC_FS */ + +/* + * Extract IP address from the parameter string if needed. Note that we + * need to have root_server_addr set _before_ IPConfig gets called as it + * can override it. + */ +__be32 __init root_nfs_parse_addr(char *name) +{ + __be32 addr; + int octets = 0; + char *cp, *cq; + + cp = cq = name; + while (octets < 4) { + while (*cp >= '0' && *cp <= '9') + cp++; + if (cp == cq || cp - cq > 3) + break; + if (*cp == '.' || octets == 3) + octets++; + if (octets < 4) + cp++; + cq = cp; + } + if (octets == 4 && (*cp == ':' || *cp == '\0')) { + if (*cp == ':') + *cp++ = '\0'; + addr = in_aton(name); + memmove(name, cp, strlen(cp) + 1); + } else + addr = NONE; + + return addr; +} + +#define DEVICE_WAIT_MAX 12 /* 12 seconds */ + +static int __init wait_for_devices(void) +{ + int i; + + for (i = 0; i < DEVICE_WAIT_MAX; i++) { + struct net_device *dev; + int found = 0; + + rtnl_lock(); + for_each_netdev(&init_net, dev) { + if (ic_is_init_dev(dev)) { + found = 1; + break; + } + } + rtnl_unlock(); + if (found) + return 0; + ssleep(1); + } + return -ENODEV; +} + +/* + * IP Autoconfig dispatcher. 
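root_nfs_parse_addr() above strips a leading dotted-quad (plus an optional ':') from the string in place and returns it, which is how ip_auto_config() later recovers root_server_addr from root_server_path. Its observable behaviour is easiest to show with a userspace re-creation; parse_leading_addr() is an invented name and inet_aton() stands in for the kernel's in_aton():

	#include <string.h>
	#include <arpa/inet.h>

	/* If `name` begins with an IPv4 address, return it (network order)
	 * and shift the remainder after the ':' to the front of the buffer;
	 * otherwise return INADDR_NONE and leave the buffer untouched. */
	static in_addr_t parse_leading_addr(char *name)
	{
		char buf[16];
		struct in_addr a;
		char *colon = strchr(name, ':');
		size_t len = colon ? (size_t)(colon - name) : strlen(name);

		if (len == 0 || len >= sizeof(buf))
			return INADDR_NONE;
		memcpy(buf, name, len);
		buf[len] = '\0';
		if (inet_aton(buf, &a) == 0)
			return INADDR_NONE;
		if (colon)
			memmove(name, colon + 1, strlen(colon + 1) + 1);
		else
			name[0] = '\0';
		return a.s_addr;
	}

	/* parse_leading_addr("10.0.0.1:/export/root") yields 10.0.0.1 and
	 * leaves "/export/root" in the buffer. */
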
+ */ + +static int __init ip_auto_config(void) +{ + __be32 addr; +#ifdef IPCONFIG_DYNAMIC + int retries = CONF_OPEN_RETRIES; +#endif + int err; + +#ifdef CONFIG_PROC_FS + proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops); +#endif /* CONFIG_PROC_FS */ + + if (!ic_enable) + return 0; + + DBG(("IP-Config: Entered.\n")); +#ifdef IPCONFIG_DYNAMIC + try_try_again: +#endif + /* Wait for devices to appear */ + err = wait_for_devices(); + if (err) + return err; + + /* Setup all network devices */ + err = ic_open_devs(); + if (err) + return err; + + /* Give drivers a chance to settle */ + msleep(CONF_POST_OPEN); + + /* + * If the config information is insufficient (e.g., our IP address or + * IP address of the boot server is missing or we have multiple network + * interfaces and no default was set), use BOOTP or RARP to get the + * missing values. + */ + if (ic_myaddr == NONE || +#ifdef CONFIG_ROOT_NFS + (root_server_addr == NONE && + ic_servaddr == NONE && + ROOT_DEV == Root_NFS) || +#endif + ic_first_dev->next) { +#ifdef IPCONFIG_DYNAMIC + if (ic_dynamic() < 0) { + ic_close_devs(); + + /* + * I don't know why, but sometimes the + * eepro100 driver (at least) gets upset and + * doesn't work the first time it's opened. + * But then if you close it and reopen it, it + * works just fine. So we need to try that at + * least once before giving up. + * + * Also, if the root will be NFS-mounted, we + * have nowhere to go if DHCP fails. So we + * just have to keep trying forever. + * + * -- Chip + */ +#ifdef CONFIG_ROOT_NFS + if (ROOT_DEV == Root_NFS) { + printk(KERN_ERR + "IP-Config: Retrying forever (NFS root)...\n"); + goto try_try_again; + } +#endif + + if (--retries) { + printk(KERN_ERR + "IP-Config: Reopening network devices...\n"); + goto try_try_again; + } + + /* Oh, well. At least we tried. */ + printk(KERN_ERR "IP-Config: Auto-configuration of network failed.\n"); + return -1; + } +#else /* !DYNAMIC */ + printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n"); + ic_close_devs(); + return -1; +#endif /* IPCONFIG_DYNAMIC */ + } else { + /* Device selected manually or only one device -> use it */ + ic_dev = ic_first_dev->dev; + } + + addr = root_nfs_parse_addr(root_server_path); + if (root_server_addr == NONE) + root_server_addr = addr; + + /* + * Use defaults wherever applicable. + */ + if (ic_defaults() < 0) + return -1; + + /* + * Close all network devices except the device we've + * autoconfigured and set up routes. + */ + ic_close_devs(); + if (ic_setup_if() < 0 || ic_setup_routes() < 0) + return -1; + + /* + * Record which protocol was actually used. + */ +#ifdef IPCONFIG_DYNAMIC + ic_proto_used = ic_got_reply | (ic_proto_enabled & IC_USE_DHCP); +#endif + +#ifndef IPCONFIG_SILENT + /* + * Clue in the operator. 
+ */ + printk("IP-Config: Complete:\n"); + printk(" device=%s", ic_dev->name); + printk(KERN_CONT ", addr=%pI4", &ic_myaddr); + printk(KERN_CONT ", mask=%pI4", &ic_netmask); + printk(KERN_CONT ", gw=%pI4", &ic_gateway); + printk(KERN_CONT ",\n host=%s, domain=%s, nis-domain=%s", + utsname()->nodename, ic_domain, utsname()->domainname); + printk(KERN_CONT ",\n bootserver=%pI4", &ic_servaddr); + printk(KERN_CONT ", rootserver=%pI4", &root_server_addr); + printk(KERN_CONT ", rootpath=%s", root_server_path); + if (ic_dev_mtu) + printk(KERN_CONT ", mtu=%d", ic_dev_mtu); + printk(KERN_CONT "\n"); +#endif /* !SILENT */ + + return 0; +} + +late_initcall(ip_auto_config); + + +/* + * Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel + * command line parameter. See Documentation/filesystems/nfs/nfsroot.txt. + */ +static int __init ic_proto_name(char *name) +{ + if (!strcmp(name, "on") || !strcmp(name, "any")) { + return 1; + } + if (!strcmp(name, "off") || !strcmp(name, "none")) { + return 0; + } +#ifdef CONFIG_IP_PNP_DHCP + else if (!strcmp(name, "dhcp")) { + ic_proto_enabled &= ~IC_RARP; + return 1; + } +#endif +#ifdef CONFIG_IP_PNP_BOOTP + else if (!strcmp(name, "bootp")) { + ic_proto_enabled &= ~(IC_RARP | IC_USE_DHCP); + return 1; + } +#endif +#ifdef CONFIG_IP_PNP_RARP + else if (!strcmp(name, "rarp")) { + ic_proto_enabled &= ~(IC_BOOTP | IC_USE_DHCP); + return 1; + } +#endif +#ifdef IPCONFIG_DYNAMIC + else if (!strcmp(name, "both")) { + ic_proto_enabled &= ~IC_USE_DHCP; /* backward compat :-( */ + return 1; + } +#endif + return 0; +} + +static int __init ip_auto_config_setup(char *addrs) +{ + char *cp, *ip, *dp; + int num = 0; + + ic_set_manually = 1; + ic_enable = 1; + + /* + * If any dhcp, bootp etc options are set, leave autoconfig on + * and skip the below static IP processing. + */ + if (ic_proto_name(addrs)) + return 1; + + /* If no static IP is given, turn off autoconfig and bail. */ + if (*addrs == 0 || + strcmp(addrs, "off") == 0 || + strcmp(addrs, "none") == 0) { + ic_enable = 0; + return 1; + } + + /* Parse string for static IP assignment. 
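 * The positional fields consumed by the switch below are:
 *
 *   ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>
 *
 * For example (illustrative values only):
 *
 *   ip=192.168.1.10:192.168.1.1:192.168.1.1:255.255.255.0:box:eth0:off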
*/ + ip = addrs; + while (ip && *ip) { + if ((cp = strchr(ip, ':'))) + *cp++ = '\0'; + if (strlen(ip) > 0) { + DBG(("IP-Config: Parameter #%d: `%s'\n", num, ip)); + switch (num) { + case 0: + if ((ic_myaddr = in_aton(ip)) == ANY) + ic_myaddr = NONE; + break; + case 1: + if ((ic_servaddr = in_aton(ip)) == ANY) + ic_servaddr = NONE; + break; + case 2: + if ((ic_gateway = in_aton(ip)) == ANY) + ic_gateway = NONE; + break; + case 3: + if ((ic_netmask = in_aton(ip)) == ANY) + ic_netmask = NONE; + break; + case 4: + if ((dp = strchr(ip, '.'))) { + *dp++ = '\0'; + strlcpy(utsname()->domainname, dp, + sizeof(utsname()->domainname)); + } + strlcpy(utsname()->nodename, ip, + sizeof(utsname()->nodename)); + ic_host_name_set = 1; + break; + case 5: + strlcpy(user_dev_name, ip, sizeof(user_dev_name)); + break; + case 6: + if (ic_proto_name(ip) == 0 && + ic_myaddr == NONE) { + ic_enable = 0; + } + break; + } + } + ip = cp; + num++; + } + + return 1; +} + +static int __init nfsaddrs_config_setup(char *addrs) +{ + return ip_auto_config_setup(addrs); +} + +static int __init vendor_class_identifier_setup(char *addrs) +{ + if (strlcpy(vendor_class_identifier, addrs, + sizeof(vendor_class_identifier)) + >= sizeof(vendor_class_identifier)) + printk(KERN_WARNING "DHCP: vendorclass too long, truncated to \"%s\"", + vendor_class_identifier); + return 1; +} + +__setup("ip=", ip_auto_config_setup); +__setup("nfsaddrs=", nfsaddrs_config_setup); +__setup("dhcpclass=", vendor_class_identifier_setup); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c new file mode 100644 index 00000000..6f06f7f3 --- /dev/null +++ b/net/ipv4/ipip.c @@ -0,0 +1,912 @@ +/* + * Linux NET3: IP/IP protocol decoder. + * + * Authors: + * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 + * + * Fixes: + * Alan Cox : Merged and made usable non modular (its so tiny its silly as + * a module taking up 2 pages). + * Alan Cox : Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph) + * to keep ip_forward happy. + * Alan Cox : More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8). + * Kai Schulte : Fixed #defines for IP_FIREWALL->FIREWALL + * David Woodhouse : Perform some basic ICMP handling. + * IPIP Routing without decapsulation. + * Carlos Picoto : GRE over IP support + * Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c. + * I do not want to merge them together. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +/* tunnel.c: an IP tunnel driver + + The purpose of this driver is to provide an IP tunnel through + which you can tunnel network traffic transparently across subnets. + + This was written by looking at Nick Holloway's dummy driver + Thanks for the great code! + + -Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 + + Minor tweaks: + Cleaned up the code a little and added some pre-1.3.0 tweaks. + dev->hard_header/hard_header_len changed to use no headers. + Comments/bracketing tweaked. + Made the tunnels use dev->name not tunnel: when error reporting. + Added tx_dropped stat + + -Alan Cox (alan@lxorguk.ukuu.org.uk) 21 March 95 + + Reworked: + Changed to tunnel to destination gateway in addition to the + tunnel's pointopoint address + Almost completely rewritten + Note: There is currently no firewall or ICMP handling done. 
+ + -Sam Lantinga (slouken@cs.ucdavis.edu) 02/13/96 + +*/ + +/* Things I wish I had known when writing the tunnel driver: + + When the tunnel_xmit() function is called, the skb contains the + packet to be sent (plus a great deal of extra info), and dev + contains the tunnel device that _we_ are. + + When we are passed a packet, we are expected to fill in the + source address with our source IP address. + + What is the proper way to allocate, copy and free a buffer? + After you allocate it, it is a "0 length" chunk of memory + starting at zero. If you want to add headers to the buffer + later, you'll have to call "skb_reserve(skb, amount)" with + the amount of memory you want reserved. Then, you call + "skb_put(skb, amount)" with the amount of space you want in + the buffer. skb_put() returns a pointer to the top (#0) of + that buffer. skb->len is set to the amount of space you have + "allocated" with skb_put(). You can then write up to skb->len + bytes to that buffer. If you need more, you can call skb_put() + again with the additional amount of space you need. You can + find out how much more space you can allocate by calling + "skb_tailroom(skb)". + Now, to add header space, call "skb_push(skb, header_len)". + This creates space at the beginning of the buffer and returns + a pointer to this new space. If later you need to strip a + header from a buffer, call "skb_pull(skb, header_len)". + skb_headroom() will return how much space is left at the top + of the buffer (before the main data). Remember, this headroom + space must be reserved before the skb_put() function is called. + */ + +/* + This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c + + For comments look at net/ipv4/ip_gre.c --ANK + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#define HASH_SIZE 16 +#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) + +static int ipip_net_id __read_mostly; +struct ipip_net { + struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_wc[1]; + struct ip_tunnel __rcu **tunnels[4]; + + struct net_device *fb_tunnel_dev; +}; + +static int ipip_tunnel_init(struct net_device *dev); +static void ipip_tunnel_setup(struct net_device *dev); +static void ipip_dev_free(struct net_device *dev); + +/* + * Locking : hash tables are protected by RCU and RTNL + */ + +#define for_each_ip_tunnel_rcu(start) \ + for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) + +/* often modified stats are per cpu, other are shared (netdev->stats) */ +struct pcpu_tstats { + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long tx_packets; + unsigned long tx_bytes; +}; + +static struct net_device_stats *ipip_get_stats(struct net_device *dev) +{ + struct pcpu_tstats sum = { 0 }; + int i; + + for_each_possible_cpu(i) { + const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); + + sum.rx_packets += tstats->rx_packets; + sum.rx_bytes += tstats->rx_bytes; + sum.tx_packets += tstats->tx_packets; + sum.tx_bytes += tstats->tx_bytes; + } + dev->stats.rx_packets = sum.rx_packets; + dev->stats.rx_bytes = sum.rx_bytes; + dev->stats.tx_packets = sum.tx_packets; + dev->stats.tx_bytes = sum.tx_bytes; + return &dev->stats; +} + +static struct ip_tunnel * 
ipip_tunnel_lookup(struct net *net, + __be32 remote, __be32 local) +{ + unsigned int h0 = HASH(remote); + unsigned int h1 = HASH(local); + struct ip_tunnel *t; + struct ipip_net *ipn = net_generic(net, ipip_net_id); + + for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1]) + if (local == t->parms.iph.saddr && + remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + return t; + + for_each_ip_tunnel_rcu(ipn->tunnels_r[h0]) + if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + return t; + + for_each_ip_tunnel_rcu(ipn->tunnels_l[h1]) + if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) + return t; + + t = rcu_dereference(ipn->tunnels_wc[0]); + if (t && (t->dev->flags&IFF_UP)) + return t; + return NULL; +} + +static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn, + struct ip_tunnel_parm *parms) +{ + __be32 remote = parms->iph.daddr; + __be32 local = parms->iph.saddr; + unsigned int h = 0; + int prio = 0; + + if (remote) { + prio |= 2; + h ^= HASH(remote); + } + if (local) { + prio |= 1; + h ^= HASH(local); + } + return &ipn->tunnels[prio][h]; +} + +static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn, + struct ip_tunnel *t) +{ + return __ipip_bucket(ipn, &t->parms); +} + +static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) +{ + struct ip_tunnel __rcu **tp; + struct ip_tunnel *iter; + + for (tp = ipip_bucket(ipn, t); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); + break; + } + } +} + +static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) +{ + struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t); + + rcu_assign_pointer(t->next, rtnl_dereference(*tp)); + rcu_assign_pointer(*tp, t); +} + +static struct ip_tunnel * ipip_tunnel_locate(struct net *net, + struct ip_tunnel_parm *parms, int create) +{ + __be32 remote = parms->iph.daddr; + __be32 local = parms->iph.saddr; + struct ip_tunnel *t, *nt; + struct ip_tunnel __rcu **tp; + struct net_device *dev; + char name[IFNAMSIZ]; + struct ipip_net *ipn = net_generic(net, ipip_net_id); + + for (tp = __ipip_bucket(ipn, parms); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) { + if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) + return t; + } + if (!create) + return NULL; + + if (parms->name[0]) + strlcpy(name, parms->name, IFNAMSIZ); + else + strcpy(name, "tunl%d"); + + dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); + if (dev == NULL) + return NULL; + + dev_net_set(dev, net); + + nt = netdev_priv(dev); + nt->parms = *parms; + + if (ipip_tunnel_init(dev) < 0) + goto failed_free; + + if (register_netdevice(dev) < 0) + goto failed_free; + + strcpy(nt->parms.name, dev->name); + + dev_hold(dev); + ipip_tunnel_link(ipn, nt); + return nt; + +failed_free: + ipip_dev_free(dev); + return NULL; +} + +/* called with RTNL */ +static void ipip_tunnel_uninit(struct net_device *dev) +{ + struct net *net = dev_net(dev); + struct ipip_net *ipn = net_generic(net, ipip_net_id); + + if (dev == ipn->fb_tunnel_dev) + rcu_assign_pointer(ipn->tunnels_wc[0], NULL); + else + ipip_tunnel_unlink(ipn, netdev_priv(dev)); + dev_put(dev); +} + +static int ipip_err(struct sk_buff *skb, u32 info) +{ + +/* All the routers (except for Linux) return only + 8 bytes of packet payload. It means, that precise relaying of + ICMP in the real Internet is absolutely infeasible. 
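 * (Those 8 quoted payload bytes cover only the start of the *inner* IP
 * header -- length, id, fragment offset -- not its addresses, so the
 * best we can do below is match the error to a tunnel by the outer
 * saddr/daddr found at skb->data.)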
+ */ + const struct iphdr *iph = (const struct iphdr *)skb->data; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; + struct ip_tunnel *t; + int err; + + switch (type) { + default: + case ICMP_PARAMETERPROB: + return 0; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return 0; + case ICMP_FRAG_NEEDED: + /* Soft state for pmtu is maintained by IP core. */ + return 0; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe they are just ether pollution. --ANK + */ + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return 0; + break; + } + + err = -ENOENT; + + rcu_read_lock(); + t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); + if (t == NULL || t->parms.iph.daddr == 0) + goto out; + + err = 0; + if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) + goto out; + + if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) + t->err_count++; + else + t->err_count = 1; + t->err_time = jiffies; +out: + rcu_read_unlock(); + return err; +} + +static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph, + struct sk_buff *skb) +{ + struct iphdr *inner_iph = ip_hdr(skb); + + if (INET_ECN_is_ce(outer_iph->tos)) + IP_ECN_set_ce(inner_iph); +} + +static int ipip_rcv(struct sk_buff *skb) +{ + struct ip_tunnel *tunnel; + const struct iphdr *iph = ip_hdr(skb); + + rcu_read_lock(); + tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); + if (tunnel != NULL) { + struct pcpu_tstats *tstats; + + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + rcu_read_unlock(); + kfree_skb(skb); + return 0; + } + + secpath_reset(skb); + + skb->mac_header = skb->network_header; + skb_reset_network_header(skb); + skb->protocol = htons(ETH_P_IP); + skb->pkt_type = PACKET_HOST; + + tstats = this_cpu_ptr(tunnel->dev->tstats); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + + __skb_tunnel_rx(skb, tunnel->dev); + + ipip_ecn_decapsulate(iph, skb); + + netif_rx(skb); + + rcu_read_unlock(); + return 0; + } + rcu_read_unlock(); + + return -1; +} + +/* + * This function assumes it is being called from dev_queue_xmit() + * and that skb is filled properly by that function. 
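 * A sketch of what it does: choose the outer destination (tiph->daddr,
 * or the skb's route gateway for an NBMA tunnel), route it with
 * ip_route_output_ports(), check PMTU against dst_mtu() minus the outer
 * header, make sure there is LL_RESERVED_SPACE(tdev) plus
 * sizeof(struct iphdr) of headroom (reallocating the skb if not), and
 * finally push and fill the outer IPv4 header with IPPROTO_IPIP.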
+ */ + +static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct pcpu_tstats *tstats; + const struct iphdr *tiph = &tunnel->parms.iph; + u8 tos = tunnel->parms.iph.tos; + __be16 df = tiph->frag_off; + struct rtable *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + const struct iphdr *old_iph = ip_hdr(skb); + struct iphdr *iph; /* Our new IP header */ + unsigned int max_headroom; /* The extra header space needed */ + __be32 dst = tiph->daddr; + struct flowi4 fl4; + int mtu; + + if (skb->protocol != htons(ETH_P_IP)) + goto tx_error; + + if (tos & 1) + tos = old_iph->tos; + + if (!dst) { + /* NBMA tunnel */ + if ((rt = skb_rtable(skb)) == NULL) { + dev->stats.tx_fifo_errors++; + goto tx_error; + } + if ((dst = rt->rt_gateway) == 0) + goto tx_error_icmp; + } + + rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, + dst, tiph->saddr, + 0, 0, + IPPROTO_IPIP, RT_TOS(tos), + tunnel->parms.link); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; + } + tdev = rt->dst.dev; + + if (tdev == dev) { + ip_rt_put(rt); + dev->stats.collisions++; + goto tx_error; + } + + df |= old_iph->frag_off & htons(IP_DF); + + if (df) { + mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); + + if (mtu < 68) { + dev->stats.collisions++; + ip_rt_put(rt); + goto tx_error; + } + + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + + if ((old_iph->frag_off & htons(IP_DF)) && + mtu < ntohs(old_iph->tot_len)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + ip_rt_put(rt); + goto tx_error; + } + } + + if (tunnel->err_count > 0) { + if (time_before(jiffies, + tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { + tunnel->err_count--; + dst_link_failure(skb); + } else + tunnel->err_count = 0; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr)); + + if (skb_headroom(skb) < max_headroom || skb_shared(skb) || + (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { + struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + ip_rt_put(rt); + dev->stats.tx_dropped++; + dev_kfree_skb(skb); + return NETDEV_TX_OK; + } + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); + dev_kfree_skb(skb); + skb = new_skb; + old_iph = ip_hdr(skb); + } + + skb->transport_header = skb->network_header; + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | + IPSKB_REROUTED); + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + + /* + * Push down and install the IPIP header. 
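 * Outer-header field inheritance, as implemented just below: the TOS is
 * ECN-encapsulated from the inner header, DF reflects the tunnel
 * parameters plus the inner DF bit, and the TTL comes from the tunnel
 * parameters, falling back to the inner TTL when parms.iph.ttl is 0.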
+ */ + + iph = ip_hdr(skb); + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; + iph->frag_off = df; + iph->protocol = IPPROTO_IPIP; + iph->tos = INET_ECN_encapsulate(tos, old_iph->tos); + iph->daddr = fl4.daddr; + iph->saddr = fl4.saddr; + + if ((iph->ttl = tiph->ttl) == 0) + iph->ttl = old_iph->ttl; + + nf_reset(skb); + tstats = this_cpu_ptr(dev->tstats); + __IPTUNNEL_XMIT(tstats, &dev->stats); + return NETDEV_TX_OK; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + dev->stats.tx_errors++; + dev_kfree_skb(skb); + return NETDEV_TX_OK; +} + +static void ipip_tunnel_bind_dev(struct net_device *dev) +{ + struct net_device *tdev = NULL; + struct ip_tunnel *tunnel; + const struct iphdr *iph; + + tunnel = netdev_priv(dev); + iph = &tunnel->parms.iph; + + if (iph->daddr) { + struct rtable *rt; + struct flowi4 fl4; + + rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, + iph->daddr, iph->saddr, + 0, 0, + IPPROTO_IPIP, + RT_TOS(iph->tos), + tunnel->parms.link); + if (!IS_ERR(rt)) { + tdev = rt->dst.dev; + ip_rt_put(rt); + } + dev->flags |= IFF_POINTOPOINT; + } + + if (!tdev && tunnel->parms.link) + tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); + + if (tdev) { + dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); + dev->mtu = tdev->mtu - sizeof(struct iphdr); + } + dev->iflink = tunnel->parms.link; +} + +static int +ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + struct ip_tunnel_parm p; + struct ip_tunnel *t; + struct net *net = dev_net(dev); + struct ipip_net *ipn = net_generic(net, ipip_net_id); + + switch (cmd) { + case SIOCGETTUNNEL: + t = NULL; + if (dev == ipn->fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + err = -EFAULT; + break; + } + t = ipip_tunnel_locate(net, &p, 0); + } + if (t == NULL) + t = netdev_priv(dev); + memcpy(&p, &t->parms, sizeof(p)); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + break; + + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + + err = -EINVAL; + if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || + p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) + goto done; + if (p.iph.ttl) + p.iph.frag_off |= htons(IP_DF); + + t = ipip_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); + + if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { + if (t != NULL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + } else { + if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) || + (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) { + err = -EINVAL; + break; + } + t = netdev_priv(dev); + ipip_tunnel_unlink(ipn, t); + synchronize_net(); + t->parms.iph.saddr = p.iph.saddr; + t->parms.iph.daddr = p.iph.daddr; + memcpy(dev->dev_addr, &p.iph.saddr, 4); + memcpy(dev->broadcast, &p.iph.daddr, 4); + ipip_tunnel_link(ipn, t); + netdev_state_change(dev); + } + } + + if (t) { + err = 0; + if (cmd == SIOCCHGTUNNEL) { + t->parms.iph.ttl = p.iph.ttl; + t->parms.iph.tos = p.iph.tos; + t->parms.iph.frag_off = p.iph.frag_off; + if (t->parms.link != p.link) { + t->parms.link = p.link; + ipip_tunnel_bind_dev(dev); + netdev_state_change(dev); + } + } + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) + err = -EFAULT; + } else + err = (cmd == SIOCADDTUNNEL ? 
-ENOBUFS : -ENOENT); + break; + + case SIOCDELTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + if (dev == ipn->fb_tunnel_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + err = -ENOENT; + if ((t = ipip_tunnel_locate(net, &p, 0)) == NULL) + goto done; + err = -EPERM; + if (t->dev == ipn->fb_tunnel_dev) + goto done; + dev = t->dev; + } + unregister_netdevice(dev); + err = 0; + break; + + default: + err = -EINVAL; + } + +done: + return err; +} + +static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu) +{ + if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr)) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +static const struct net_device_ops ipip_netdev_ops = { + .ndo_uninit = ipip_tunnel_uninit, + .ndo_start_xmit = ipip_tunnel_xmit, + .ndo_do_ioctl = ipip_tunnel_ioctl, + .ndo_change_mtu = ipip_tunnel_change_mtu, + .ndo_get_stats = ipip_get_stats, +}; + +static void ipip_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + +static void ipip_tunnel_setup(struct net_device *dev) +{ + dev->netdev_ops = &ipip_netdev_ops; + dev->destructor = ipip_dev_free; + + dev->type = ARPHRD_TUNNEL; + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); + dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr); + dev->flags = IFF_NOARP; + dev->iflink = 0; + dev->addr_len = 4; + dev->features |= NETIF_F_NETNS_LOCAL; + dev->features |= NETIF_F_LLTX; + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; +} + +static int ipip_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + + tunnel->dev = dev; + + memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); + memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); + + ipip_tunnel_bind_dev(dev); + + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + + return 0; +} + +static int __net_init ipip_fb_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct iphdr *iph = &tunnel->parms.iph; + struct ipip_net *ipn = net_generic(dev_net(dev), ipip_net_id); + + tunnel->dev = dev; + strcpy(tunnel->parms.name, dev->name); + + iph->version = 4; + iph->protocol = IPPROTO_IPIP; + iph->ihl = 5; + + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + + dev_hold(dev); + rcu_assign_pointer(ipn->tunnels_wc[0], tunnel); + return 0; +} + +static struct xfrm_tunnel ipip_handler __read_mostly = { + .handler = ipip_rcv, + .err_handler = ipip_err, + .priority = 1, +}; + +static const char banner[] __initconst = + KERN_INFO "IPv4 over IPv4 tunneling driver\n"; + +static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head) +{ + int prio; + + for (prio = 1; prio < 4; prio++) { + int h; + for (h = 0; h < HASH_SIZE; h++) { + struct ip_tunnel *t; + + t = rtnl_dereference(ipn->tunnels[prio][h]); + while (t != NULL) { + unregister_netdevice_queue(t->dev, head); + t = rtnl_dereference(t->next); + } + } + } +} + +static int __net_init ipip_init_net(struct net *net) +{ + struct ipip_net *ipn = net_generic(net, ipip_net_id); + struct ip_tunnel *t; + int err; + + ipn->tunnels[0] = ipn->tunnels_wc; + ipn->tunnels[1] = ipn->tunnels_l; + ipn->tunnels[2] = ipn->tunnels_r; + ipn->tunnels[3] = ipn->tunnels_r_l; + + ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), + "tunl0", + ipip_tunnel_setup); + if (!ipn->fb_tunnel_dev) { + err = -ENOMEM; + goto err_alloc_dev; + } + dev_net_set(ipn->fb_tunnel_dev, net); + + err = 
ipip_fb_tunnel_init(ipn->fb_tunnel_dev); + if (err) + goto err_reg_dev; + + if ((err = register_netdev(ipn->fb_tunnel_dev))) + goto err_reg_dev; + + t = netdev_priv(ipn->fb_tunnel_dev); + + strcpy(t->parms.name, ipn->fb_tunnel_dev->name); + return 0; + +err_reg_dev: + ipip_dev_free(ipn->fb_tunnel_dev); +err_alloc_dev: + /* nothing */ + return err; +} + +static void __net_exit ipip_exit_net(struct net *net) +{ + struct ipip_net *ipn = net_generic(net, ipip_net_id); + LIST_HEAD(list); + + rtnl_lock(); + ipip_destroy_tunnels(ipn, &list); + unregister_netdevice_queue(ipn->fb_tunnel_dev, &list); + unregister_netdevice_many(&list); + rtnl_unlock(); +} + +static struct pernet_operations ipip_net_ops = { + .init = ipip_init_net, + .exit = ipip_exit_net, + .id = &ipip_net_id, + .size = sizeof(struct ipip_net), +}; + +static int __init ipip_init(void) +{ + int err; + + printk(banner); + + err = register_pernet_device(&ipip_net_ops); + if (err < 0) + return err; + err = xfrm4_tunnel_register(&ipip_handler, AF_INET); + if (err < 0) { + unregister_pernet_device(&ipip_net_ops); + printk(KERN_INFO "ipip init: can't register tunnel\n"); + } + return err; +} + +static void __exit ipip_fini(void) +{ + if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) + printk(KERN_INFO "ipip close: can't deregister tunnel\n"); + + unregister_pernet_device(&ipip_net_ops); +} + +module_init(ipip_init); +module_exit(ipip_fini); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETDEV("tunl0"); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c new file mode 100644 index 00000000..f81af8dd --- /dev/null +++ b/net/ipv4/ipmr.c @@ -0,0 +1,2559 @@ +/* + * IP multicast routing support for mrouted 3.6/3.8 + * + * (c) 1995 Alan Cox, + * Linux Consultancy and Custom Driver Development + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Michael Chastain : Incorrect size of copying. + * Alan Cox : Added the cache manager code + * Alan Cox : Fixed the clone/copy bug and device race. + * Mike McLagan : Routing by source + * Malcolm Beattie : Buffer handling fixes. + * Alexey Kuznetsov : Double buffer free and other fixes. + * SVR Anand : Fixed several multicast bugs and problems. + * Alexey Kuznetsov : Status, optimisations and more. + * Brad Parker : Better behaviour on mrouted upcall + * overflow. + * Carlos Picoto : PIMv1 Support + * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header + * Relax this requirement to work with older peers. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) +#define CONFIG_IP_PIMSM 1 +#endif + +struct mr_table { + struct list_head list; +#ifdef CONFIG_NET_NS + struct net *net; +#endif + u32 id; + struct sock __rcu *mroute_sk; + struct timer_list ipmr_expire_timer; + struct list_head mfc_unres_queue; + struct list_head mfc_cache_array[MFC_LINES]; + struct vif_device vif_table[MAXVIFS]; + int maxvif; + atomic_t cache_resolve_queue_len; + int mroute_do_assert; + int mroute_do_pim; +#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) + int mroute_reg_vif_num; +#endif +}; + +struct ipmr_rule { + struct fib_rule common; +}; + +struct ipmr_result { + struct mr_table *mrt; +}; + +/* Big lock, protecting vif table, mrt cache and mroute socket state. + * Note that the changes are semaphored via rtnl_lock. + */ + +static DEFINE_RWLOCK(mrt_lock); + +/* + * Multicast router control variables + */ + +#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL) + +/* Special spinlock for queue of unresolved entries */ +static DEFINE_SPINLOCK(mfc_unres_lock); + +/* We return to original Alan's scheme. Hash table of resolved + * entries is changed only in process context and protected + * with weak lock mrt_lock. Queue of unresolved entries is protected + * with strong spinlock mfc_unres_lock. + * + * In this case data path is free of exclusive locks at all. 
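 * Concretely, in the code below: lookups such as ipmr_cache_find() run
 * under rcu_read_lock() alone, updates such as vif_add() and
 * ipmr_mfc_add() take write_lock_bh(&mrt_lock), and the unresolved
 * queue is only ever touched under spin_lock_bh(&mfc_unres_lock).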
+ */ + +static struct kmem_cache *mrt_cachep __read_mostly; + +static struct mr_table *ipmr_new_table(struct net *net, u32 id); +static int ip_mr_forward(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, struct mfc_cache *cache, + int local); +static int ipmr_cache_report(struct mr_table *mrt, + struct sk_buff *pkt, vifi_t vifi, int assert); +static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, + struct mfc_cache *c, struct rtmsg *rtm); +static void ipmr_expire_process(unsigned long arg); + +#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES +#define ipmr_for_each_table(mrt, net) \ + list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list) + +static struct mr_table *ipmr_get_table(struct net *net, u32 id) +{ + struct mr_table *mrt; + + ipmr_for_each_table(mrt, net) { + if (mrt->id == id) + return mrt; + } + return NULL; +} + +static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, + struct mr_table **mrt) +{ + struct ipmr_result res; + struct fib_lookup_arg arg = { .result = &res, }; + int err; + + err = fib_rules_lookup(net->ipv4.mr_rules_ops, + flowi4_to_flowi(flp4), 0, &arg); + if (err < 0) + return err; + *mrt = res.mrt; + return 0; +} + +static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp, + int flags, struct fib_lookup_arg *arg) +{ + struct ipmr_result *res = arg->result; + struct mr_table *mrt; + + switch (rule->action) { + case FR_ACT_TO_TBL: + break; + case FR_ACT_UNREACHABLE: + return -ENETUNREACH; + case FR_ACT_PROHIBIT: + return -EACCES; + case FR_ACT_BLACKHOLE: + default: + return -EINVAL; + } + + mrt = ipmr_get_table(rule->fr_net, rule->table); + if (mrt == NULL) + return -EAGAIN; + res->mrt = mrt; + return 0; +} + +static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) +{ + return 1; +} + +static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = { + FRA_GENERIC_POLICY, +}; + +static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb, + struct fib_rule_hdr *frh, struct nlattr **tb) +{ + return 0; +} + +static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, + struct nlattr **tb) +{ + return 1; +} + +static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, + struct fib_rule_hdr *frh) +{ + frh->dst_len = 0; + frh->src_len = 0; + frh->tos = 0; + return 0; +} + +static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = { + .family = RTNL_FAMILY_IPMR, + .rule_size = sizeof(struct ipmr_rule), + .addr_size = sizeof(u32), + .action = ipmr_rule_action, + .match = ipmr_rule_match, + .configure = ipmr_rule_configure, + .compare = ipmr_rule_compare, + .default_pref = fib_default_rule_pref, + .fill = ipmr_rule_fill, + .nlgroup = RTNLGRP_IPV4_RULE, + .policy = ipmr_rule_policy, + .owner = THIS_MODULE, +}; + +static int __net_init ipmr_rules_init(struct net *net) +{ + struct fib_rules_ops *ops; + struct mr_table *mrt; + int err; + + ops = fib_rules_register(&ipmr_rules_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + + INIT_LIST_HEAD(&net->ipv4.mr_tables); + + mrt = ipmr_new_table(net, RT_TABLE_DEFAULT); + if (mrt == NULL) { + err = -ENOMEM; + goto err1; + } + + err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0); + if (err < 0) + goto err2; + + net->ipv4.mr_rules_ops = ops; + return 0; + +err2: + kfree(mrt); +err1: + fib_rules_unregister(ops); + return err; +} + +static void __net_exit ipmr_rules_exit(struct net *net) +{ + struct mr_table *mrt, *next; + + list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { + 
list_del(&mrt->list); + kfree(mrt); + } + fib_rules_unregister(net->ipv4.mr_rules_ops); +} +#else +#define ipmr_for_each_table(mrt, net) \ + for (mrt = net->ipv4.mrt; mrt; mrt = NULL) + +static struct mr_table *ipmr_get_table(struct net *net, u32 id) +{ + return net->ipv4.mrt; +} + +static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, + struct mr_table **mrt) +{ + *mrt = net->ipv4.mrt; + return 0; +} + +static int __net_init ipmr_rules_init(struct net *net) +{ + net->ipv4.mrt = ipmr_new_table(net, RT_TABLE_DEFAULT); + return net->ipv4.mrt ? 0 : -ENOMEM; +} + +static void __net_exit ipmr_rules_exit(struct net *net) +{ + kfree(net->ipv4.mrt); +} +#endif + +static struct mr_table *ipmr_new_table(struct net *net, u32 id) +{ + struct mr_table *mrt; + unsigned int i; + + mrt = ipmr_get_table(net, id); + if (mrt != NULL) + return mrt; + + mrt = kzalloc(sizeof(*mrt), GFP_KERNEL); + if (mrt == NULL) + return NULL; + write_pnet(&mrt->net, net); + mrt->id = id; + + /* Forwarding cache */ + for (i = 0; i < MFC_LINES; i++) + INIT_LIST_HEAD(&mrt->mfc_cache_array[i]); + + INIT_LIST_HEAD(&mrt->mfc_unres_queue); + + setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process, + (unsigned long)mrt); + +#ifdef CONFIG_IP_PIMSM + mrt->mroute_reg_vif_num = -1; +#endif +#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES + list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables); +#endif + return mrt; +} + +/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ + +static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) +{ + struct net *net = dev_net(dev); + + dev_close(dev); + + dev = __dev_get_by_name(net, "tunl0"); + if (dev) { + const struct net_device_ops *ops = dev->netdev_ops; + struct ifreq ifr; + struct ip_tunnel_parm p; + + memset(&p, 0, sizeof(p)); + p.iph.daddr = v->vifc_rmt_addr.s_addr; + p.iph.saddr = v->vifc_lcl_addr.s_addr; + p.iph.version = 4; + p.iph.ihl = 5; + p.iph.protocol = IPPROTO_IPIP; + sprintf(p.name, "dvmrp%d", v->vifc_vifi); + ifr.ifr_ifru.ifru_data = (__force void __user *)&p; + + if (ops->ndo_do_ioctl) { + mm_segment_t oldfs = get_fs(); + + set_fs(KERNEL_DS); + ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL); + set_fs(oldfs); + } + } +} + +static +struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) +{ + struct net_device *dev; + + dev = __dev_get_by_name(net, "tunl0"); + + if (dev) { + const struct net_device_ops *ops = dev->netdev_ops; + int err; + struct ifreq ifr; + struct ip_tunnel_parm p; + struct in_device *in_dev; + + memset(&p, 0, sizeof(p)); + p.iph.daddr = v->vifc_rmt_addr.s_addr; + p.iph.saddr = v->vifc_lcl_addr.s_addr; + p.iph.version = 4; + p.iph.ihl = 5; + p.iph.protocol = IPPROTO_IPIP; + sprintf(p.name, "dvmrp%d", v->vifc_vifi); + ifr.ifr_ifru.ifru_data = (__force void __user *)&p; + + if (ops->ndo_do_ioctl) { + mm_segment_t oldfs = get_fs(); + + set_fs(KERNEL_DS); + err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); + set_fs(oldfs); + } else { + err = -EOPNOTSUPP; + } + dev = NULL; + + if (err == 0 && + (dev = __dev_get_by_name(net, p.name)) != NULL) { + dev->flags |= IFF_MULTICAST; + + in_dev = __in_dev_get_rtnl(dev); + if (in_dev == NULL) + goto failure; + + ipv4_devconf_setall(in_dev); + IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0; + + if (dev_open(dev)) + goto failure; + dev_hold(dev); + } + } + return dev; + +failure: + /* allow the register to be completed before unregistering. 
*/ + rtnl_unlock(); + rtnl_lock(); + + unregister_netdevice(dev); + return NULL; +} + +#ifdef CONFIG_IP_PIMSM + +static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct net *net = dev_net(dev); + struct mr_table *mrt; + struct flowi4 fl4 = { + .flowi4_oif = dev->ifindex, + .flowi4_iif = skb->skb_iif, + .flowi4_mark = skb->mark, + }; + int err; + + err = ipmr_fib_lookup(net, &fl4, &mrt); + if (err < 0) { + kfree_skb(skb); + return err; + } + + read_lock(&mrt_lock); + dev->stats.tx_bytes += skb->len; + dev->stats.tx_packets++; + ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT); + read_unlock(&mrt_lock); + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static const struct net_device_ops reg_vif_netdev_ops = { + .ndo_start_xmit = reg_vif_xmit, +}; + +static void reg_vif_setup(struct net_device *dev) +{ + dev->type = ARPHRD_PIMREG; + dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8; + dev->flags = IFF_NOARP; + dev->netdev_ops = ®_vif_netdev_ops, + dev->destructor = free_netdev; + dev->features |= NETIF_F_NETNS_LOCAL; +} + +static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) +{ + struct net_device *dev; + struct in_device *in_dev; + char name[IFNAMSIZ]; + + if (mrt->id == RT_TABLE_DEFAULT) + sprintf(name, "pimreg"); + else + sprintf(name, "pimreg%u", mrt->id); + + dev = alloc_netdev(0, name, reg_vif_setup); + + if (dev == NULL) + return NULL; + + dev_net_set(dev, net); + + if (register_netdevice(dev)) { + free_netdev(dev); + return NULL; + } + dev->iflink = 0; + + rcu_read_lock(); + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) { + rcu_read_unlock(); + goto failure; + } + + ipv4_devconf_setall(in_dev); + IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0; + rcu_read_unlock(); + + if (dev_open(dev)) + goto failure; + + dev_hold(dev); + + return dev; + +failure: + /* allow the register to be completed before unregistering. */ + rtnl_unlock(); + rtnl_lock(); + + unregister_netdevice(dev); + return NULL; +} +#endif + +/* + * Delete a VIF entry + * @notify: Set to 1, if the caller is a notifier_call + */ + +static int vif_delete(struct mr_table *mrt, int vifi, int notify, + struct list_head *head) +{ + struct vif_device *v; + struct net_device *dev; + struct in_device *in_dev; + + if (vifi < 0 || vifi >= mrt->maxvif) + return -EADDRNOTAVAIL; + + v = &mrt->vif_table[vifi]; + + write_lock_bh(&mrt_lock); + dev = v->dev; + v->dev = NULL; + + if (!dev) { + write_unlock_bh(&mrt_lock); + return -EADDRNOTAVAIL; + } + +#ifdef CONFIG_IP_PIMSM + if (vifi == mrt->mroute_reg_vif_num) + mrt->mroute_reg_vif_num = -1; +#endif + + if (vifi + 1 == mrt->maxvif) { + int tmp; + + for (tmp = vifi - 1; tmp >= 0; tmp--) { + if (VIF_EXISTS(mrt, tmp)) + break; + } + mrt->maxvif = tmp+1; + } + + write_unlock_bh(&mrt_lock); + + dev_set_allmulti(dev, -1); + + in_dev = __in_dev_get_rtnl(dev); + if (in_dev) { + IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; + ip_rt_multicast_event(in_dev); + } + + if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify) + unregister_netdevice_queue(dev, head); + + dev_put(dev); + return 0; +} + +static void ipmr_cache_free_rcu(struct rcu_head *head) +{ + struct mfc_cache *c = container_of(head, struct mfc_cache, rcu); + + kmem_cache_free(mrt_cachep, c); +} + +static inline void ipmr_cache_free(struct mfc_cache *c) +{ + call_rcu(&c->rcu, ipmr_cache_free_rcu); +} + +/* Destroy an unresolved cache entry, killing queued skbs + * and reporting error to netlink readers. 
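 * (Two kinds of skbs can sit on an unresolved entry: ordinary multicast
 * packets waiting for a route, and pseudo-packets queued by the netlink
 * route-query path, recognisable by an IP version nibble of 0 and
 * carrying an nlmsghdr; the latter are answered here with -ETIMEDOUT
 * rather than simply freed.)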
+ */ + +static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) +{ + struct net *net = read_pnet(&mrt->net); + struct sk_buff *skb; + struct nlmsgerr *e; + + atomic_dec(&mrt->cache_resolve_queue_len); + + while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) { + if (ip_hdr(skb)->version == 0) { + struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); + nlh->nlmsg_type = NLMSG_ERROR; + nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + skb_trim(skb, nlh->nlmsg_len); + e = NLMSG_DATA(nlh); + e->error = -ETIMEDOUT; + memset(&e->msg, 0, sizeof(e->msg)); + + rtnl_unicast(skb, net, NETLINK_CB(skb).pid); + } else { + kfree_skb(skb); + } + } + + ipmr_cache_free(c); +} + + +/* Timer process for the unresolved queue. */ + +static void ipmr_expire_process(unsigned long arg) +{ + struct mr_table *mrt = (struct mr_table *)arg; + unsigned long now; + unsigned long expires; + struct mfc_cache *c, *next; + + if (!spin_trylock(&mfc_unres_lock)) { + mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10); + return; + } + + if (list_empty(&mrt->mfc_unres_queue)) + goto out; + + now = jiffies; + expires = 10*HZ; + + list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { + if (time_after(c->mfc_un.unres.expires, now)) { + unsigned long interval = c->mfc_un.unres.expires - now; + if (interval < expires) + expires = interval; + continue; + } + + list_del(&c->list); + ipmr_destroy_unres(mrt, c); + } + + if (!list_empty(&mrt->mfc_unres_queue)) + mod_timer(&mrt->ipmr_expire_timer, jiffies + expires); + +out: + spin_unlock(&mfc_unres_lock); +} + +/* Fill oifs list. It is called under write locked mrt_lock. */ + +static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache, + unsigned char *ttls) +{ + int vifi; + + cache->mfc_un.res.minvif = MAXVIFS; + cache->mfc_un.res.maxvif = 0; + memset(cache->mfc_un.res.ttls, 255, MAXVIFS); + + for (vifi = 0; vifi < mrt->maxvif; vifi++) { + if (VIF_EXISTS(mrt, vifi) && + ttls[vifi] && ttls[vifi] < 255) { + cache->mfc_un.res.ttls[vifi] = ttls[vifi]; + if (cache->mfc_un.res.minvif > vifi) + cache->mfc_un.res.minvif = vifi; + if (cache->mfc_un.res.maxvif <= vifi) + cache->mfc_un.res.maxvif = vifi + 1; + } + } +} + +static int vif_add(struct net *net, struct mr_table *mrt, + struct vifctl *vifc, int mrtsock) +{ + int vifi = vifc->vifc_vifi; + struct vif_device *v = &mrt->vif_table[vifi]; + struct net_device *dev; + struct in_device *in_dev; + int err; + + /* Is vif busy ? 
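 * (The switch below then distinguishes three kinds of VIF: VIFF_REGISTER
 * creates a pimreg pseudo-device for PIM register traffic, VIFF_TUNNEL
 * creates a dvmrp%d IPIP tunnel, and the remaining cases bind an
 * existing interface looked up by local address or, with
 * VIFF_USE_IFINDEX, by interface index.)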
*/ + if (VIF_EXISTS(mrt, vifi)) + return -EADDRINUSE; + + switch (vifc->vifc_flags) { +#ifdef CONFIG_IP_PIMSM + case VIFF_REGISTER: + /* + * Special Purpose VIF in PIM + * All the packets will be sent to the daemon + */ + if (mrt->mroute_reg_vif_num >= 0) + return -EADDRINUSE; + dev = ipmr_reg_vif(net, mrt); + if (!dev) + return -ENOBUFS; + err = dev_set_allmulti(dev, 1); + if (err) { + unregister_netdevice(dev); + dev_put(dev); + return err; + } + break; +#endif + case VIFF_TUNNEL: + dev = ipmr_new_tunnel(net, vifc); + if (!dev) + return -ENOBUFS; + err = dev_set_allmulti(dev, 1); + if (err) { + ipmr_del_tunnel(dev, vifc); + dev_put(dev); + return err; + } + break; + + case VIFF_USE_IFINDEX: + case 0: + if (vifc->vifc_flags == VIFF_USE_IFINDEX) { + dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex); + if (dev && __in_dev_get_rtnl(dev) == NULL) { + dev_put(dev); + return -EADDRNOTAVAIL; + } + } else { + dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr); + } + if (!dev) + return -EADDRNOTAVAIL; + err = dev_set_allmulti(dev, 1); + if (err) { + dev_put(dev); + return err; + } + break; + default: + return -EINVAL; + } + + in_dev = __in_dev_get_rtnl(dev); + if (!in_dev) { + dev_put(dev); + return -EADDRNOTAVAIL; + } + IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; + ip_rt_multicast_event(in_dev); + + /* Fill in the VIF structures */ + + v->rate_limit = vifc->vifc_rate_limit; + v->local = vifc->vifc_lcl_addr.s_addr; + v->remote = vifc->vifc_rmt_addr.s_addr; + v->flags = vifc->vifc_flags; + if (!mrtsock) + v->flags |= VIFF_STATIC; + v->threshold = vifc->vifc_threshold; + v->bytes_in = 0; + v->bytes_out = 0; + v->pkt_in = 0; + v->pkt_out = 0; + v->link = dev->ifindex; + if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER)) + v->link = dev->iflink; + + /* And finish update writing critical data */ + write_lock_bh(&mrt_lock); + v->dev = dev; +#ifdef CONFIG_IP_PIMSM + if (v->flags & VIFF_REGISTER) + mrt->mroute_reg_vif_num = vifi; +#endif + if (vifi+1 > mrt->maxvif) + mrt->maxvif = vifi+1; + write_unlock_bh(&mrt_lock); + return 0; +} + +/* called with rcu_read_lock() */ +static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, + __be32 origin, + __be32 mcastgrp) +{ + int line = MFC_HASH(mcastgrp, origin); + struct mfc_cache *c; + + list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) { + if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp) + return c; + } + return NULL; +} + +/* + * Allocate a multicast cache entry + */ +static struct mfc_cache *ipmr_cache_alloc(void) +{ + struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); + + if (c) + c->mfc_un.res.minvif = MAXVIFS; + return c; +} + +static struct mfc_cache *ipmr_cache_alloc_unres(void) +{ + struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); + + if (c) { + skb_queue_head_init(&c->mfc_un.unres.unresolved); + c->mfc_un.unres.expires = jiffies + 10*HZ; + } + return c; +} + +/* + * A cache entry has gone into a resolved state from queued + */ + +static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, + struct mfc_cache *uc, struct mfc_cache *c) +{ + struct sk_buff *skb; + struct nlmsgerr *e; + + /* Play the pending entries through our router */ + + while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { + if (ip_hdr(skb)->version == 0) { + struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); + + if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) { + nlh->nlmsg_len = skb_tail_pointer(skb) - + (u8 *)nlh; + } else { + nlh->nlmsg_type = NLMSG_ERROR; + 
nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + skb_trim(skb, nlh->nlmsg_len); + e = NLMSG_DATA(nlh); + e->error = -EMSGSIZE; + memset(&e->msg, 0, sizeof(e->msg)); + } + + rtnl_unicast(skb, net, NETLINK_CB(skb).pid); + } else { + ip_mr_forward(net, mrt, skb, c, 0); + } + } +} + +/* + * Bounce a cache query up to mrouted. We could use netlink for this but mrouted + * expects the following bizarre scheme. + * + * Called under mrt_lock. + */ + +static int ipmr_cache_report(struct mr_table *mrt, + struct sk_buff *pkt, vifi_t vifi, int assert) +{ + struct sk_buff *skb; + const int ihl = ip_hdrlen(pkt); + struct igmphdr *igmp; + struct igmpmsg *msg; + struct sock *mroute_sk; + int ret; + +#ifdef CONFIG_IP_PIMSM + if (assert == IGMPMSG_WHOLEPKT) + skb = skb_realloc_headroom(pkt, sizeof(struct iphdr)); + else +#endif + skb = alloc_skb(128, GFP_ATOMIC); + + if (!skb) + return -ENOBUFS; + +#ifdef CONFIG_IP_PIMSM + if (assert == IGMPMSG_WHOLEPKT) { + /* Ugly, but we have no choice with this interface. + * Duplicate old header, fix ihl, length etc. + * And all this only to mangle msg->im_msgtype and + * to set msg->im_mbz to "mbz" :-) + */ + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + msg = (struct igmpmsg *)skb_network_header(skb); + memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); + msg->im_msgtype = IGMPMSG_WHOLEPKT; + msg->im_mbz = 0; + msg->im_vif = mrt->mroute_reg_vif_num; + ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; + ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + + sizeof(struct iphdr)); + } else +#endif + { + + /* Copy the IP header */ + + skb->network_header = skb->tail; + skb_put(skb, ihl); + skb_copy_to_linear_data(skb, pkt->data, ihl); + ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */ + msg = (struct igmpmsg *)skb_network_header(skb); + msg->im_vif = vifi; + skb_dst_set(skb, dst_clone(skb_dst(pkt))); + + /* Add our header */ + + igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); + igmp->type = + msg->im_msgtype = assert; + igmp->code = 0; + ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ + skb->transport_header = skb->network_header; + } + + rcu_read_lock(); + mroute_sk = rcu_dereference(mrt->mroute_sk); + if (mroute_sk == NULL) { + rcu_read_unlock(); + kfree_skb(skb); + return -EINVAL; + } + + /* Deliver to mrouted */ + + ret = sock_queue_rcv_skb(mroute_sk, skb); + rcu_read_unlock(); + if (ret < 0) { + if (net_ratelimit()) + printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); + kfree_skb(skb); + } + + return ret; +} + +/* + * Queue a packet for resolution. It gets locked cache entry! + */ + +static int +ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) +{ + bool found = false; + int err; + struct mfc_cache *c; + const struct iphdr *iph = ip_hdr(skb); + + spin_lock_bh(&mfc_unres_lock); + list_for_each_entry(c, &mrt->mfc_unres_queue, list) { + if (c->mfc_mcastgrp == iph->daddr && + c->mfc_origin == iph->saddr) { + found = true; + break; + } + } + + if (!found) { + /* Create a new entry if allowable */ + + if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || + (c = ipmr_cache_alloc_unres()) == NULL) { + spin_unlock_bh(&mfc_unres_lock); + + kfree_skb(skb); + return -ENOBUFS; + } + + /* Fill in the new cache entry */ + + c->mfc_parent = -1; + c->mfc_origin = iph->saddr; + c->mfc_mcastgrp = iph->daddr; + + /* Reflect first query at mrouted. 
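 * (IGMPMSG_NOCACHE asks the user-space daemon to resolve this (S,G);
 * the daemon is expected to answer with MRT_ADD_MFC, at which point the
 * packets queued on this entry are replayed via ipmr_cache_resolve().
 * Until then only a handful of packets are kept and the rest dropped.)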
*/ + + err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE); + if (err < 0) { + /* If the report failed throw the cache entry + out - Brad Parker + */ + spin_unlock_bh(&mfc_unres_lock); + + ipmr_cache_free(c); + kfree_skb(skb); + return err; + } + + atomic_inc(&mrt->cache_resolve_queue_len); + list_add(&c->list, &mrt->mfc_unres_queue); + + if (atomic_read(&mrt->cache_resolve_queue_len) == 1) + mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires); + } + + /* See if we can append the packet */ + + if (c->mfc_un.unres.unresolved.qlen > 3) { + kfree_skb(skb); + err = -ENOBUFS; + } else { + skb_queue_tail(&c->mfc_un.unres.unresolved, skb); + err = 0; + } + + spin_unlock_bh(&mfc_unres_lock); + return err; +} + +/* + * MFC cache manipulation by user space mroute daemon + */ + +static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc) +{ + int line; + struct mfc_cache *c, *next; + + line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); + + list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) { + if (c->mfc_origin == mfc->mfcc_origin.s_addr && + c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { + list_del_rcu(&c->list); + + ipmr_cache_free(c); + return 0; + } + } + return -ENOENT; +} + +static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, + struct mfcctl *mfc, int mrtsock) +{ + bool found = false; + int line; + struct mfc_cache *uc, *c; + + if (mfc->mfcc_parent >= MAXVIFS) + return -ENFILE; + + line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); + + list_for_each_entry(c, &mrt->mfc_cache_array[line], list) { + if (c->mfc_origin == mfc->mfcc_origin.s_addr && + c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { + found = true; + break; + } + } + + if (found) { + write_lock_bh(&mrt_lock); + c->mfc_parent = mfc->mfcc_parent; + ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls); + if (!mrtsock) + c->mfc_flags |= MFC_STATIC; + write_unlock_bh(&mrt_lock); + return 0; + } + + if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr)) + return -EINVAL; + + c = ipmr_cache_alloc(); + if (c == NULL) + return -ENOMEM; + + c->mfc_origin = mfc->mfcc_origin.s_addr; + c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr; + c->mfc_parent = mfc->mfcc_parent; + ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls); + if (!mrtsock) + c->mfc_flags |= MFC_STATIC; + + list_add_rcu(&c->list, &mrt->mfc_cache_array[line]); + + /* + * Check to see if we resolved a queued list. If so we + * need to send on the frames and tidy up. 
+ */ + found = false; + spin_lock_bh(&mfc_unres_lock); + list_for_each_entry(uc, &mrt->mfc_unres_queue, list) { + if (uc->mfc_origin == c->mfc_origin && + uc->mfc_mcastgrp == c->mfc_mcastgrp) { + list_del(&uc->list); + atomic_dec(&mrt->cache_resolve_queue_len); + found = true; + break; + } + } + if (list_empty(&mrt->mfc_unres_queue)) + del_timer(&mrt->ipmr_expire_timer); + spin_unlock_bh(&mfc_unres_lock); + + if (found) { + ipmr_cache_resolve(net, mrt, uc, c); + ipmr_cache_free(uc); + } + return 0; +} + +/* + * Close the multicast socket, and clear the vif tables etc + */ + +static void mroute_clean_tables(struct mr_table *mrt) +{ + int i; + LIST_HEAD(list); + struct mfc_cache *c, *next; + + /* Shut down all active vif entries */ + + for (i = 0; i < mrt->maxvif; i++) { + if (!(mrt->vif_table[i].flags & VIFF_STATIC)) + vif_delete(mrt, i, 0, &list); + } + unregister_netdevice_many(&list); + + /* Wipe the cache */ + + for (i = 0; i < MFC_LINES; i++) { + list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) { + if (c->mfc_flags & MFC_STATIC) + continue; + list_del_rcu(&c->list); + ipmr_cache_free(c); + } + } + + if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { + spin_lock_bh(&mfc_unres_lock); + list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { + list_del(&c->list); + ipmr_destroy_unres(mrt, c); + } + spin_unlock_bh(&mfc_unres_lock); + } +} + +/* called from ip_ra_control(), before an RCU grace period, + * we dont need to call synchronize_rcu() here + */ +static void mrtsock_destruct(struct sock *sk) +{ + struct net *net = sock_net(sk); + struct mr_table *mrt; + + rtnl_lock(); + ipmr_for_each_table(mrt, net) { + if (sk == rtnl_dereference(mrt->mroute_sk)) { + IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; + rcu_assign_pointer(mrt->mroute_sk, NULL); + mroute_clean_tables(mrt); + } + } + rtnl_unlock(); +} + +/* + * Socket options and virtual interface manipulation. The whole + * virtual interface system is a complete heap, but unfortunately + * that's how BSD mrouted happens to think. Maybe one day with a proper + * MOSPF/PIM router set up we can clean this up. + */ + +int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen) +{ + int ret; + struct vifctl vif; + struct mfcctl mfc; + struct net *net = sock_net(sk); + struct mr_table *mrt; + + mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? 
: RT_TABLE_DEFAULT); + if (mrt == NULL) + return -ENOENT; + + if (optname != MRT_INIT) { + if (sk != rcu_dereference_raw(mrt->mroute_sk) && + !capable(CAP_NET_ADMIN)) + return -EACCES; + } + + switch (optname) { + case MRT_INIT: + if (sk->sk_type != SOCK_RAW || + inet_sk(sk)->inet_num != IPPROTO_IGMP) + return -EOPNOTSUPP; + if (optlen != sizeof(int)) + return -ENOPROTOOPT; + + rtnl_lock(); + if (rtnl_dereference(mrt->mroute_sk)) { + rtnl_unlock(); + return -EADDRINUSE; + } + + ret = ip_ra_control(sk, 1, mrtsock_destruct); + if (ret == 0) { + rcu_assign_pointer(mrt->mroute_sk, sk); + IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; + } + rtnl_unlock(); + return ret; + case MRT_DONE: + if (sk != rcu_dereference_raw(mrt->mroute_sk)) + return -EACCES; + return ip_ra_control(sk, 0, NULL); + case MRT_ADD_VIF: + case MRT_DEL_VIF: + if (optlen != sizeof(vif)) + return -EINVAL; + if (copy_from_user(&vif, optval, sizeof(vif))) + return -EFAULT; + if (vif.vifc_vifi >= MAXVIFS) + return -ENFILE; + rtnl_lock(); + if (optname == MRT_ADD_VIF) { + ret = vif_add(net, mrt, &vif, + sk == rtnl_dereference(mrt->mroute_sk)); + } else { + ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL); + } + rtnl_unlock(); + return ret; + + /* + * Manipulate the forwarding caches. These live + * in a sort of kernel/user symbiosis. + */ + case MRT_ADD_MFC: + case MRT_DEL_MFC: + if (optlen != sizeof(mfc)) + return -EINVAL; + if (copy_from_user(&mfc, optval, sizeof(mfc))) + return -EFAULT; + rtnl_lock(); + if (optname == MRT_DEL_MFC) + ret = ipmr_mfc_delete(mrt, &mfc); + else + ret = ipmr_mfc_add(net, mrt, &mfc, + sk == rtnl_dereference(mrt->mroute_sk)); + rtnl_unlock(); + return ret; + /* + * Control PIM assert. + */ + case MRT_ASSERT: + { + int v; + if (get_user(v, (int __user *)optval)) + return -EFAULT; + mrt->mroute_do_assert = (v) ? 1 : 0; + return 0; + } +#ifdef CONFIG_IP_PIMSM + case MRT_PIM: + { + int v; + + if (get_user(v, (int __user *)optval)) + return -EFAULT; + v = (v) ? 1 : 0; + + rtnl_lock(); + ret = 0; + if (v != mrt->mroute_do_pim) { + mrt->mroute_do_pim = v; + mrt->mroute_do_assert = v; + } + rtnl_unlock(); + return ret; + } +#endif +#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES + case MRT_TABLE: + { + u32 v; + + if (optlen != sizeof(u32)) + return -EINVAL; + if (get_user(v, (u32 __user *)optval)) + return -EFAULT; + + rtnl_lock(); + ret = 0; + if (sk == rtnl_dereference(mrt->mroute_sk)) { + ret = -EBUSY; + } else { + if (!ipmr_new_table(net, v)) + ret = -ENOMEM; + raw_sk(sk)->ipmr_table = v; + } + rtnl_unlock(); + return ret; + } +#endif + /* + * Spurious command, or MRT_VERSION which you cannot + * set. + */ + default: + return -ENOPROTOOPT; + } +} + +/* + * Getsock opt support for the multicast routing system. + */ + +int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen) +{ + int olr; + int val; + struct net *net = sock_net(sk); + struct mr_table *mrt; + + mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? 
: RT_TABLE_DEFAULT); + if (mrt == NULL) + return -ENOENT; + + if (optname != MRT_VERSION && +#ifdef CONFIG_IP_PIMSM + optname != MRT_PIM && +#endif + optname != MRT_ASSERT) + return -ENOPROTOOPT; + + if (get_user(olr, optlen)) + return -EFAULT; + + olr = min_t(unsigned int, olr, sizeof(int)); + if (olr < 0) + return -EINVAL; + + if (put_user(olr, optlen)) + return -EFAULT; + if (optname == MRT_VERSION) + val = 0x0305; +#ifdef CONFIG_IP_PIMSM + else if (optname == MRT_PIM) + val = mrt->mroute_do_pim; +#endif + else + val = mrt->mroute_do_assert; + if (copy_to_user(optval, &val, olr)) + return -EFAULT; + return 0; +} + +/* + * The IP multicast ioctl support routines. + */ + +int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) +{ + struct sioc_sg_req sr; + struct sioc_vif_req vr; + struct vif_device *vif; + struct mfc_cache *c; + struct net *net = sock_net(sk); + struct mr_table *mrt; + + mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); + if (mrt == NULL) + return -ENOENT; + + switch (cmd) { + case SIOCGETVIFCNT: + if (copy_from_user(&vr, arg, sizeof(vr))) + return -EFAULT; + if (vr.vifi >= mrt->maxvif) + return -EINVAL; + read_lock(&mrt_lock); + vif = &mrt->vif_table[vr.vifi]; + if (VIF_EXISTS(mrt, vr.vifi)) { + vr.icount = vif->pkt_in; + vr.ocount = vif->pkt_out; + vr.ibytes = vif->bytes_in; + vr.obytes = vif->bytes_out; + read_unlock(&mrt_lock); + + if (copy_to_user(arg, &vr, sizeof(vr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + case SIOCGETSGCNT: + if (copy_from_user(&sr, arg, sizeof(sr))) + return -EFAULT; + + rcu_read_lock(); + c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); + if (c) { + sr.pktcnt = c->mfc_un.res.pkt; + sr.bytecnt = c->mfc_un.res.bytes; + sr.wrong_if = c->mfc_un.res.wrong_if; + rcu_read_unlock(); + + if (copy_to_user(arg, &sr, sizeof(sr))) + return -EFAULT; + return 0; + } + rcu_read_unlock(); + return -EADDRNOTAVAIL; + default: + return -ENOIOCTLCMD; + } +} + +#ifdef CONFIG_COMPAT +struct compat_sioc_sg_req { + struct in_addr src; + struct in_addr grp; + compat_ulong_t pktcnt; + compat_ulong_t bytecnt; + compat_ulong_t wrong_if; +}; + +struct compat_sioc_vif_req { + vifi_t vifi; /* Which iface */ + compat_ulong_t icount; + compat_ulong_t ocount; + compat_ulong_t ibytes; + compat_ulong_t obytes; +}; + +int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) +{ + struct compat_sioc_sg_req sr; + struct compat_sioc_vif_req vr; + struct vif_device *vif; + struct mfc_cache *c; + struct net *net = sock_net(sk); + struct mr_table *mrt; + + mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? 
: RT_TABLE_DEFAULT); + if (mrt == NULL) + return -ENOENT; + + switch (cmd) { + case SIOCGETVIFCNT: + if (copy_from_user(&vr, arg, sizeof(vr))) + return -EFAULT; + if (vr.vifi >= mrt->maxvif) + return -EINVAL; + read_lock(&mrt_lock); + vif = &mrt->vif_table[vr.vifi]; + if (VIF_EXISTS(mrt, vr.vifi)) { + vr.icount = vif->pkt_in; + vr.ocount = vif->pkt_out; + vr.ibytes = vif->bytes_in; + vr.obytes = vif->bytes_out; + read_unlock(&mrt_lock); + + if (copy_to_user(arg, &vr, sizeof(vr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + case SIOCGETSGCNT: + if (copy_from_user(&sr, arg, sizeof(sr))) + return -EFAULT; + + rcu_read_lock(); + c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); + if (c) { + sr.pktcnt = c->mfc_un.res.pkt; + sr.bytecnt = c->mfc_un.res.bytes; + sr.wrong_if = c->mfc_un.res.wrong_if; + rcu_read_unlock(); + + if (copy_to_user(arg, &sr, sizeof(sr))) + return -EFAULT; + return 0; + } + rcu_read_unlock(); + return -EADDRNOTAVAIL; + default: + return -ENOIOCTLCMD; + } +} +#endif + + +static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct net *net = dev_net(dev); + struct mr_table *mrt; + struct vif_device *v; + int ct; + LIST_HEAD(list); + + if (event != NETDEV_UNREGISTER) + return NOTIFY_DONE; + + ipmr_for_each_table(mrt, net) { + v = &mrt->vif_table[0]; + for (ct = 0; ct < mrt->maxvif; ct++, v++) { + if (v->dev == dev) + vif_delete(mrt, ct, 1, &list); + } + } + unregister_netdevice_many(&list); + return NOTIFY_DONE; +} + + +static struct notifier_block ip_mr_notifier = { + .notifier_call = ipmr_device_event, +}; + +/* + * Encapsulate a packet by attaching a valid IPIP header to it. + * This avoids tunnel drivers and other mess and gives us the speed so + * important for multicast video. 
+ */ + +static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) +{ + struct iphdr *iph; + const struct iphdr *old_iph = ip_hdr(skb); + + skb_push(skb, sizeof(struct iphdr)); + skb->transport_header = skb->network_header; + skb_reset_network_header(skb); + iph = ip_hdr(skb); + + iph->version = 4; + iph->tos = old_iph->tos; + iph->ttl = old_iph->ttl; + iph->frag_off = 0; + iph->daddr = daddr; + iph->saddr = saddr; + iph->protocol = IPPROTO_IPIP; + iph->ihl = 5; + iph->tot_len = htons(skb->len); + ip_select_ident(iph, skb_dst(skb), NULL); + ip_send_check(iph); + + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + nf_reset(skb); +} + +static inline int ipmr_forward_finish(struct sk_buff *skb) +{ + struct ip_options *opt = &(IPCB(skb)->opt); + + IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); + + if (unlikely(opt->optlen)) + ip_forward_options(skb); + + return dst_output(skb); +} + +/* + * Processing handlers for ipmr_forward + */ + +static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, struct mfc_cache *c, int vifi) +{ + const struct iphdr *iph = ip_hdr(skb); + struct vif_device *vif = &mrt->vif_table[vifi]; + struct net_device *dev; + struct rtable *rt; + struct flowi4 fl4; + int encap = 0; + + if (vif->dev == NULL) + goto out_free; + +#ifdef CONFIG_IP_PIMSM + if (vif->flags & VIFF_REGISTER) { + vif->pkt_out++; + vif->bytes_out += skb->len; + vif->dev->stats.tx_bytes += skb->len; + vif->dev->stats.tx_packets++; + ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT); + goto out_free; + } +#endif + + if (vif->flags & VIFF_TUNNEL) { + rt = ip_route_output_ports(net, &fl4, NULL, + vif->remote, vif->local, + 0, 0, + IPPROTO_IPIP, + RT_TOS(iph->tos), vif->link); + if (IS_ERR(rt)) + goto out_free; + encap = sizeof(struct iphdr); + } else { + rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0, + 0, 0, + IPPROTO_IPIP, + RT_TOS(iph->tos), vif->link); + if (IS_ERR(rt)) + goto out_free; + } + + dev = rt->dst.dev; + + if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { + /* Do not fragment multicasts. Alas, IPv4 does not + * allow to send ICMP, so that packets will disappear + * to blackhole. + */ + + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + ip_rt_put(rt); + goto out_free; + } + + encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len; + + if (skb_cow(skb, encap)) { + ip_rt_put(rt); + goto out_free; + } + + vif->pkt_out++; + vif->bytes_out += skb->len; + + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + ip_decrease_ttl(ip_hdr(skb)); + + /* FIXME: forward and output firewalls used to be called here. + * What do we do with netfilter? -- RR + */ + if (vif->flags & VIFF_TUNNEL) { + ip_encap(skb, vif->local, vif->remote); + /* FIXME: extra output firewall step used to be here. --RR */ + vif->dev->stats.tx_packets++; + vif->dev->stats.tx_bytes += skb->len; + } + + IPCB(skb)->flags |= IPSKB_FORWARDED; + + /* + * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally + * not only before forwarding, but after forwarding on all output + * interfaces. It is clear, if mrouter runs a multicasting + * program, it should receive packets not depending to what interface + * program is joined. + * If we will not make it, the program will have to join on all + * interfaces. On the other hand, multihoming host (or router, but + * not mrouter) cannot join to more than one interface - it will + * result in receiving multiple packets. 
+ */ + NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, dev, + ipmr_forward_finish); + return; + +out_free: + kfree_skb(skb); +} + +static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev) +{ + int ct; + + for (ct = mrt->maxvif-1; ct >= 0; ct--) { + if (mrt->vif_table[ct].dev == dev) + break; + } + return ct; +} + +/* "local" means that we should preserve one skb (for local delivery) */ + +static int ip_mr_forward(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, struct mfc_cache *cache, + int local) +{ + int psend = -1; + int vif, ct; + + vif = cache->mfc_parent; + cache->mfc_un.res.pkt++; + cache->mfc_un.res.bytes += skb->len; + + /* + * Wrong interface: drop packet and (maybe) send PIM assert. + */ + if (mrt->vif_table[vif].dev != skb->dev) { + int true_vifi; + + if (rt_is_output_route(skb_rtable(skb))) { + /* It is our own packet, looped back. + * Very complicated situation... + * + * The best workaround until routing daemons will be + * fixed is not to redistribute packet, if it was + * send through wrong interface. It means, that + * multicast applications WILL NOT work for + * (S,G), which have default multicast route pointing + * to wrong oif. In any case, it is not a good + * idea to use multicasting applications on router. + */ + goto dont_forward; + } + + cache->mfc_un.res.wrong_if++; + true_vifi = ipmr_find_vif(mrt, skb->dev); + + if (true_vifi >= 0 && mrt->mroute_do_assert && + /* pimsm uses asserts, when switching from RPT to SPT, + * so that we cannot check that packet arrived on an oif. + * It is bad, but otherwise we would need to move pretty + * large chunk of pimd to kernel. Ough... --ANK + */ + (mrt->mroute_do_pim || + cache->mfc_un.res.ttls[true_vifi] < 255) && + time_after(jiffies, + cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { + cache->mfc_un.res.last_assert = jiffies; + ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF); + } + goto dont_forward; + } + + mrt->vif_table[vif].pkt_in++; + mrt->vif_table[vif].bytes_in += skb->len; + + /* + * Forward the frame + */ + for (ct = cache->mfc_un.res.maxvif - 1; + ct >= cache->mfc_un.res.minvif; ct--) { + if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { + if (psend != -1) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + + if (skb2) + ipmr_queue_xmit(net, mrt, skb2, cache, + psend); + } + psend = ct; + } + } + if (psend != -1) { + if (local) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + + if (skb2) + ipmr_queue_xmit(net, mrt, skb2, cache, psend); + } else { + ipmr_queue_xmit(net, mrt, skb, cache, psend); + return 0; + } + } + +dont_forward: + if (!local) + kfree_skb(skb); + return 0; +} + +static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) +{ + struct rtable *rt = skb_rtable(skb); + struct iphdr *iph = ip_hdr(skb); + struct flowi4 fl4 = { + .daddr = iph->daddr, + .saddr = iph->saddr, + .flowi4_tos = RT_TOS(iph->tos), + .flowi4_oif = rt->rt_oif, + .flowi4_iif = rt->rt_iif, + .flowi4_mark = rt->rt_mark, + }; + struct mr_table *mrt; + int err; + + err = ipmr_fib_lookup(net, &fl4, &mrt); + if (err) + return ERR_PTR(err); + return mrt; +} + +/* + * Multicast packets for forwarding arrive here + * Called with rcu_read_lock(); + */ + +int ip_mr_input(struct sk_buff *skb) +{ + struct mfc_cache *cache; + struct net *net = dev_net(skb->dev); + int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; + struct mr_table *mrt; + + /* Packet is looped back after forward, it should not be + * forwarded second time, but still can be delivered locally. 
+ */ + if (IPCB(skb)->flags & IPSKB_FORWARDED) + goto dont_forward; + + mrt = ipmr_rt_fib_lookup(net, skb); + if (IS_ERR(mrt)) { + kfree_skb(skb); + return PTR_ERR(mrt); + } + if (!local) { + if (IPCB(skb)->opt.router_alert) { + if (ip_call_ra_chain(skb)) + return 0; + } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) { + /* IGMPv1 (and broken IGMPv2 implementations sort of + * Cisco IOS <= 11.2(8)) do not put router alert + * option to IGMP packets destined to routable + * groups. It is very bad, because it means + * that we can forward NO IGMP messages. + */ + struct sock *mroute_sk; + + mroute_sk = rcu_dereference(mrt->mroute_sk); + if (mroute_sk) { + nf_reset(skb); + raw_rcv(mroute_sk, skb); + return 0; + } + } + } + + /* already under rcu_read_lock() */ + cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); + + /* + * No usable cache entry + */ + if (cache == NULL) { + int vif; + + if (local) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + ip_local_deliver(skb); + if (skb2 == NULL) + return -ENOBUFS; + skb = skb2; + } + + read_lock(&mrt_lock); + vif = ipmr_find_vif(mrt, skb->dev); + if (vif >= 0) { + int err2 = ipmr_cache_unresolved(mrt, vif, skb); + read_unlock(&mrt_lock); + + return err2; + } + read_unlock(&mrt_lock); + kfree_skb(skb); + return -ENODEV; + } + + read_lock(&mrt_lock); + ip_mr_forward(net, mrt, skb, cache, local); + read_unlock(&mrt_lock); + + if (local) + return ip_local_deliver(skb); + + return 0; + +dont_forward: + if (local) + return ip_local_deliver(skb); + kfree_skb(skb); + return 0; +} + +#ifdef CONFIG_IP_PIMSM +/* called with rcu_read_lock() */ +static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, + unsigned int pimlen) +{ + struct net_device *reg_dev = NULL; + struct iphdr *encap; + + encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); + /* + * Check that: + * a. packet is really sent to a multicast group + * b. packet is not a NULL-REGISTER + * c. 
packet is not truncated + */ + if (!ipv4_is_multicast(encap->daddr) || + encap->tot_len == 0 || + ntohs(encap->tot_len) + pimlen > skb->len) + return 1; + + read_lock(&mrt_lock); + if (mrt->mroute_reg_vif_num >= 0) + reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev; + read_unlock(&mrt_lock); + + if (reg_dev == NULL) + return 1; + + skb->mac_header = skb->network_header; + skb_pull(skb, (u8 *)encap - skb->data); + skb_reset_network_header(skb); + skb->protocol = htons(ETH_P_IP); + skb->ip_summed = CHECKSUM_NONE; + skb->pkt_type = PACKET_HOST; + + skb_tunnel_rx(skb, reg_dev); + + netif_rx(skb); + + return NET_RX_SUCCESS; +} +#endif + +#ifdef CONFIG_IP_PIMSM_V1 +/* + * Handle IGMP messages of PIMv1 + */ + +int pim_rcv_v1(struct sk_buff *skb) +{ + struct igmphdr *pim; + struct net *net = dev_net(skb->dev); + struct mr_table *mrt; + + if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr))) + goto drop; + + pim = igmp_hdr(skb); + + mrt = ipmr_rt_fib_lookup(net, skb); + if (IS_ERR(mrt)) + goto drop; + if (!mrt->mroute_do_pim || + pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) + goto drop; + + if (__pim_rcv(mrt, skb, sizeof(*pim))) { +drop: + kfree_skb(skb); + } + return 0; +} +#endif + +#ifdef CONFIG_IP_PIMSM_V2 +static int pim_rcv(struct sk_buff *skb) +{ + struct pimreghdr *pim; + struct net *net = dev_net(skb->dev); + struct mr_table *mrt; + + if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr))) + goto drop; + + pim = (struct pimreghdr *)skb_transport_header(skb); + if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) || + (pim->flags & PIM_NULL_REGISTER) || + (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && + csum_fold(skb_checksum(skb, 0, skb->len, 0)))) + goto drop; + + mrt = ipmr_rt_fib_lookup(net, skb); + if (IS_ERR(mrt)) + goto drop; + if (__pim_rcv(mrt, skb, sizeof(*pim))) { +drop: + kfree_skb(skb); + } + return 0; +} +#endif + +static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, + struct mfc_cache *c, struct rtmsg *rtm) +{ + int ct; + struct rtnexthop *nhp; + u8 *b = skb_tail_pointer(skb); + struct rtattr *mp_head; + + /* If cache is unresolved, don't try to parse IIF and OIF */ + if (c->mfc_parent >= MAXVIFS) + return -ENOENT; + + if (VIF_EXISTS(mrt, c->mfc_parent)) + RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex); + + mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0)); + + for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { + if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { + if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) + goto rtattr_failure; + nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); + nhp->rtnh_flags = 0; + nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; + nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex; + nhp->rtnh_len = sizeof(*nhp); + } + } + mp_head->rta_type = RTA_MULTIPATH; + mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head; + rtm->rtm_type = RTN_MULTICAST; + return 1; + +rtattr_failure: + nlmsg_trim(skb, b); + return -EMSGSIZE; +} + +int ipmr_get_route(struct net *net, struct sk_buff *skb, + __be32 saddr, __be32 daddr, + struct rtmsg *rtm, int nowait) +{ + struct mfc_cache *cache; + struct mr_table *mrt; + int err; + + mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); + if (mrt == NULL) + return -ENOENT; + + rcu_read_lock(); + cache = ipmr_cache_find(mrt, saddr, daddr); + + if (cache == NULL) { + struct sk_buff *skb2; + struct iphdr *iph; + struct net_device *dev; + int vif = -1; + + if (nowait) { + rcu_read_unlock(); + 
return -EAGAIN; + } + + dev = skb->dev; + read_lock(&mrt_lock); + if (dev) + vif = ipmr_find_vif(mrt, dev); + if (vif < 0) { + read_unlock(&mrt_lock); + rcu_read_unlock(); + return -ENODEV; + } + skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) { + read_unlock(&mrt_lock); + rcu_read_unlock(); + return -ENOMEM; + } + + skb_push(skb2, sizeof(struct iphdr)); + skb_reset_network_header(skb2); + iph = ip_hdr(skb2); + iph->ihl = sizeof(struct iphdr) >> 2; + iph->saddr = saddr; + iph->daddr = daddr; + iph->version = 0; + err = ipmr_cache_unresolved(mrt, vif, skb2); + read_unlock(&mrt_lock); + rcu_read_unlock(); + return err; + } + + read_lock(&mrt_lock); + if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY)) + cache->mfc_flags |= MFC_NOTIFY; + err = __ipmr_fill_mroute(mrt, skb, cache, rtm); + read_unlock(&mrt_lock); + rcu_read_unlock(); + return err; +} + +static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, + u32 pid, u32 seq, struct mfc_cache *c) +{ + struct nlmsghdr *nlh; + struct rtmsg *rtm; + + nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI); + if (nlh == NULL) + return -EMSGSIZE; + + rtm = nlmsg_data(nlh); + rtm->rtm_family = RTNL_FAMILY_IPMR; + rtm->rtm_dst_len = 32; + rtm->rtm_src_len = 32; + rtm->rtm_tos = 0; + rtm->rtm_table = mrt->id; + NLA_PUT_U32(skb, RTA_TABLE, mrt->id); + rtm->rtm_type = RTN_MULTICAST; + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + rtm->rtm_protocol = RTPROT_UNSPEC; + rtm->rtm_flags = 0; + + NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin); + NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp); + + if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct mr_table *mrt; + struct mfc_cache *mfc; + unsigned int t = 0, s_t; + unsigned int h = 0, s_h; + unsigned int e = 0, s_e; + + s_t = cb->args[0]; + s_h = cb->args[1]; + s_e = cb->args[2]; + + rcu_read_lock(); + ipmr_for_each_table(mrt, net) { + if (t < s_t) + goto next_table; + if (t > s_t) + s_h = 0; + for (h = s_h; h < MFC_LINES; h++) { + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) { + if (e < s_e) + goto next_entry; + if (ipmr_fill_mroute(mrt, skb, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + mfc) < 0) + goto done; +next_entry: + e++; + } + e = s_e = 0; + } + s_h = 0; +next_table: + t++; + } +done: + rcu_read_unlock(); + + cb->args[2] = e; + cb->args[1] = h; + cb->args[0] = t; + + return skb->len; +} + +#ifdef CONFIG_PROC_FS +/* + * The /proc interfaces to multicast routing : + * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif + */ +struct ipmr_vif_iter { + struct seq_net_private p; + struct mr_table *mrt; + int ct; +}; + +static struct vif_device *ipmr_vif_seq_idx(struct net *net, + struct ipmr_vif_iter *iter, + loff_t pos) +{ + struct mr_table *mrt = iter->mrt; + + for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) { + if (!VIF_EXISTS(mrt, iter->ct)) + continue; + if (pos-- == 0) + return &mrt->vif_table[iter->ct]; + } + return NULL; +} + +static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(mrt_lock) +{ + struct ipmr_vif_iter *iter = seq->private; + struct net *net = seq_file_net(seq); + struct mr_table *mrt; + + mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); + if (mrt == NULL) + return ERR_PTR(-ENOENT); + + iter->mrt = mrt; + + read_lock(&mrt_lock); + return *pos ? 
ipmr_vif_seq_idx(net, seq->private, *pos - 1) + : SEQ_START_TOKEN; +} + +static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ipmr_vif_iter *iter = seq->private; + struct net *net = seq_file_net(seq); + struct mr_table *mrt = iter->mrt; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ipmr_vif_seq_idx(net, iter, 0); + + while (++iter->ct < mrt->maxvif) { + if (!VIF_EXISTS(mrt, iter->ct)) + continue; + return &mrt->vif_table[iter->ct]; + } + return NULL; +} + +static void ipmr_vif_seq_stop(struct seq_file *seq, void *v) + __releases(mrt_lock) +{ + read_unlock(&mrt_lock); +} + +static int ipmr_vif_seq_show(struct seq_file *seq, void *v) +{ + struct ipmr_vif_iter *iter = seq->private; + struct mr_table *mrt = iter->mrt; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, + "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n"); + } else { + const struct vif_device *vif = v; + const char *name = vif->dev ? vif->dev->name : "none"; + + seq_printf(seq, + "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", + vif - mrt->vif_table, + name, vif->bytes_in, vif->pkt_in, + vif->bytes_out, vif->pkt_out, + vif->flags, vif->local, vif->remote); + } + return 0; +} + +static const struct seq_operations ipmr_vif_seq_ops = { + .start = ipmr_vif_seq_start, + .next = ipmr_vif_seq_next, + .stop = ipmr_vif_seq_stop, + .show = ipmr_vif_seq_show, +}; + +static int ipmr_vif_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ipmr_vif_seq_ops, + sizeof(struct ipmr_vif_iter)); +} + +static const struct file_operations ipmr_vif_fops = { + .owner = THIS_MODULE, + .open = ipmr_vif_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +struct ipmr_mfc_iter { + struct seq_net_private p; + struct mr_table *mrt; + struct list_head *cache; + int ct; +}; + + +static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net, + struct ipmr_mfc_iter *it, loff_t pos) +{ + struct mr_table *mrt = it->mrt; + struct mfc_cache *mfc; + + rcu_read_lock(); + for (it->ct = 0; it->ct < MFC_LINES; it->ct++) { + it->cache = &mrt->mfc_cache_array[it->ct]; + list_for_each_entry_rcu(mfc, it->cache, list) + if (pos-- == 0) + return mfc; + } + rcu_read_unlock(); + + spin_lock_bh(&mfc_unres_lock); + it->cache = &mrt->mfc_unres_queue; + list_for_each_entry(mfc, it->cache, list) + if (pos-- == 0) + return mfc; + spin_unlock_bh(&mfc_unres_lock); + + it->cache = NULL; + return NULL; +} + + +static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct ipmr_mfc_iter *it = seq->private; + struct net *net = seq_file_net(seq); + struct mr_table *mrt; + + mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); + if (mrt == NULL) + return ERR_PTR(-ENOENT); + + it->mrt = mrt; + it->cache = NULL; + it->ct = 0; + return *pos ? 
ipmr_mfc_seq_idx(net, seq->private, *pos - 1) + : SEQ_START_TOKEN; +} + +static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct mfc_cache *mfc = v; + struct ipmr_mfc_iter *it = seq->private; + struct net *net = seq_file_net(seq); + struct mr_table *mrt = it->mrt; + + ++*pos; + + if (v == SEQ_START_TOKEN) + return ipmr_mfc_seq_idx(net, seq->private, 0); + + if (mfc->list.next != it->cache) + return list_entry(mfc->list.next, struct mfc_cache, list); + + if (it->cache == &mrt->mfc_unres_queue) + goto end_of_list; + + BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]); + + while (++it->ct < MFC_LINES) { + it->cache = &mrt->mfc_cache_array[it->ct]; + if (list_empty(it->cache)) + continue; + return list_first_entry(it->cache, struct mfc_cache, list); + } + + /* exhausted cache_array, show unresolved */ + rcu_read_unlock(); + it->cache = &mrt->mfc_unres_queue; + it->ct = 0; + + spin_lock_bh(&mfc_unres_lock); + if (!list_empty(it->cache)) + return list_first_entry(it->cache, struct mfc_cache, list); + +end_of_list: + spin_unlock_bh(&mfc_unres_lock); + it->cache = NULL; + + return NULL; +} + +static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) +{ + struct ipmr_mfc_iter *it = seq->private; + struct mr_table *mrt = it->mrt; + + if (it->cache == &mrt->mfc_unres_queue) + spin_unlock_bh(&mfc_unres_lock); + else if (it->cache == &mrt->mfc_cache_array[it->ct]) + rcu_read_unlock(); +} + +static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) +{ + int n; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, + "Group Origin Iif Pkts Bytes Wrong Oifs\n"); + } else { + const struct mfc_cache *mfc = v; + const struct ipmr_mfc_iter *it = seq->private; + const struct mr_table *mrt = it->mrt; + + seq_printf(seq, "%08X %08X %-3hd", + (__force u32) mfc->mfc_mcastgrp, + (__force u32) mfc->mfc_origin, + mfc->mfc_parent); + + if (it->cache != &mrt->mfc_unres_queue) { + seq_printf(seq, " %8lu %8lu %8lu", + mfc->mfc_un.res.pkt, + mfc->mfc_un.res.bytes, + mfc->mfc_un.res.wrong_if); + for (n = mfc->mfc_un.res.minvif; + n < mfc->mfc_un.res.maxvif; n++) { + if (VIF_EXISTS(mrt, n) && + mfc->mfc_un.res.ttls[n] < 255) + seq_printf(seq, + " %2d:%-3d", + n, mfc->mfc_un.res.ttls[n]); + } + } else { + /* unresolved mfc_caches don't contain + * pkt, bytes and wrong_if values + */ + seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul); + } + seq_putc(seq, '\n'); + } + return 0; +} + +static const struct seq_operations ipmr_mfc_seq_ops = { + .start = ipmr_mfc_seq_start, + .next = ipmr_mfc_seq_next, + .stop = ipmr_mfc_seq_stop, + .show = ipmr_mfc_seq_show, +}; + +static int ipmr_mfc_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ipmr_mfc_seq_ops, + sizeof(struct ipmr_mfc_iter)); +} + +static const struct file_operations ipmr_mfc_fops = { + .owner = THIS_MODULE, + .open = ipmr_mfc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; +#endif + +#ifdef CONFIG_IP_PIMSM_V2 +static const struct net_protocol pim_protocol = { + .handler = pim_rcv, + .netns_ok = 1, +}; +#endif + + +/* + * Setup for IP multicast routing + */ +static int __net_init ipmr_net_init(struct net *net) +{ + int err; + + err = ipmr_rules_init(net); + if (err < 0) + goto fail; + +#ifdef CONFIG_PROC_FS + err = -ENOMEM; + if (!proc_net_fops_create(net, "ip_mr_vif", 0, &ipmr_vif_fops)) + goto proc_vif_fail; + if (!proc_net_fops_create(net, "ip_mr_cache", 0, &ipmr_mfc_fops)) + goto proc_cache_fail; +#endif + return 0; + +#ifdef CONFIG_PROC_FS +proc_cache_fail: + 
proc_net_remove(net, "ip_mr_vif"); +proc_vif_fail: + ipmr_rules_exit(net); +#endif +fail: + return err; +} + +static void __net_exit ipmr_net_exit(struct net *net) +{ +#ifdef CONFIG_PROC_FS + proc_net_remove(net, "ip_mr_cache"); + proc_net_remove(net, "ip_mr_vif"); +#endif + ipmr_rules_exit(net); +} + +static struct pernet_operations ipmr_net_ops = { + .init = ipmr_net_init, + .exit = ipmr_net_exit, +}; + +int __init ip_mr_init(void) +{ + int err; + + mrt_cachep = kmem_cache_create("ip_mrt_cache", + sizeof(struct mfc_cache), + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, + NULL); + if (!mrt_cachep) + return -ENOMEM; + + err = register_pernet_subsys(&ipmr_net_ops); + if (err) + goto reg_pernet_fail; + + err = register_netdevice_notifier(&ip_mr_notifier); + if (err) + goto reg_notif_fail; +#ifdef CONFIG_IP_PIMSM_V2 + if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) { + printk(KERN_ERR "ip_mr_init: can't add PIM protocol\n"); + err = -EAGAIN; + goto add_proto_fail; + } +#endif + rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, NULL, ipmr_rtm_dumproute); + return 0; + +#ifdef CONFIG_IP_PIMSM_V2 +add_proto_fail: + unregister_netdevice_notifier(&ip_mr_notifier); +#endif +reg_notif_fail: + unregister_pernet_subsys(&ipmr_net_ops); +reg_pernet_fail: + kmem_cache_destroy(mrt_cachep); + return err; +} diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c new file mode 100644 index 00000000..929b27bd --- /dev/null +++ b/net/ipv4/netfilter.c @@ -0,0 +1,247 @@ +/* IPv4 specific functions of netfilter core */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ +int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) +{ + struct net *net = dev_net(skb_dst(skb)->dev); + const struct iphdr *iph = ip_hdr(skb); + struct rtable *rt; + struct flowi4 fl4 = {}; + __be32 saddr = iph->saddr; + __u8 flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; + unsigned int hh_len; + + if (addr_type == RTN_UNSPEC) + addr_type = inet_addr_type(net, saddr); + if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST) + flags |= FLOWI_FLAG_ANYSRC; + else + saddr = 0; + + /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause + * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook. + */ + fl4.daddr = iph->daddr; + fl4.saddr = saddr; + fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; + fl4.flowi4_mark = skb->mark; + fl4.flowi4_flags = flags; + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + return -1; + + /* Drop old route. */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + + if (skb_dst(skb)->error) + return -1; + +#ifdef CONFIG_XFRM + if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && + xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) { + struct dst_entry *dst = skb_dst(skb); + skb_dst_set(skb, NULL); + dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0); + if (IS_ERR(dst)) + return -1; + skb_dst_set(skb, dst); + } +#endif + + /* Change in oif may mean change in hh_len. 
*/ + hh_len = skb_dst(skb)->dev->hard_header_len; + if (skb_headroom(skb) < hh_len && + pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) + return -1; + + return 0; +} +EXPORT_SYMBOL(ip_route_me_harder); + +#ifdef CONFIG_XFRM +int ip_xfrm_me_harder(struct sk_buff *skb) +{ + struct flowi fl; + unsigned int hh_len; + struct dst_entry *dst; + + if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) + return 0; + if (xfrm_decode_session(skb, &fl, AF_INET) < 0) + return -1; + + dst = skb_dst(skb); + if (dst->xfrm) + dst = ((struct xfrm_dst *)dst)->route; + dst_hold(dst); + + dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0); + if (IS_ERR(dst)) + return -1; + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + /* Change in oif may mean change in hh_len. */ + hh_len = skb_dst(skb)->dev->hard_header_len; + if (skb_headroom(skb) < hh_len && + pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) + return -1; + return 0; +} +EXPORT_SYMBOL(ip_xfrm_me_harder); +#endif + +void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *); +EXPORT_SYMBOL(ip_nat_decode_session); + +/* + * Extra routing may needed on local out, as the QUEUE target never + * returns control to the table. + */ + +struct ip_rt_info { + __be32 daddr; + __be32 saddr; + u_int8_t tos; + u_int32_t mark; +}; + +static void nf_ip_saveroute(const struct sk_buff *skb, + struct nf_queue_entry *entry) +{ + struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); + + if (entry->hook == NF_INET_LOCAL_OUT) { + const struct iphdr *iph = ip_hdr(skb); + + rt_info->tos = iph->tos; + rt_info->daddr = iph->daddr; + rt_info->saddr = iph->saddr; + rt_info->mark = skb->mark; + } +} + +static int nf_ip_reroute(struct sk_buff *skb, + const struct nf_queue_entry *entry) +{ + const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); + + if (entry->hook == NF_INET_LOCAL_OUT) { + const struct iphdr *iph = ip_hdr(skb); + + if (!(iph->tos == rt_info->tos && + skb->mark == rt_info->mark && + iph->daddr == rt_info->daddr && + iph->saddr == rt_info->saddr)) + return ip_route_me_harder(skb, RTN_UNSPEC); + } + return 0; +} + +__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, u_int8_t protocol) +{ + const struct iphdr *iph = ip_hdr(skb); + __sum16 csum = 0; + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) + break; + if ((protocol == 0 && !csum_fold(skb->csum)) || + !csum_tcpudp_magic(iph->saddr, iph->daddr, + skb->len - dataoff, protocol, + skb->csum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; + } + /* fall through */ + case CHECKSUM_NONE: + if (protocol == 0) + skb->csum = 0; + else + skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, + skb->len - dataoff, + protocol, 0); + csum = __skb_checksum_complete(skb); + } + return csum; +} +EXPORT_SYMBOL(nf_ip_checksum); + +static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, unsigned int len, + u_int8_t protocol) +{ + const struct iphdr *iph = ip_hdr(skb); + __sum16 csum = 0; + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (len == skb->len - dataoff) + return nf_ip_checksum(skb, hook, dataoff, protocol); + /* fall through */ + case CHECKSUM_NONE: + skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol, + skb->len - dataoff, 0); + skb->ip_summed = CHECKSUM_NONE; + return __skb_checksum_complete_head(skb, dataoff + len); + } + return csum; +} + +static int nf_ip_route(struct net *net, struct 
dst_entry **dst, + struct flowi *fl, bool strict __always_unused) +{ + struct rtable *rt = ip_route_output_key(net, &fl->u.ip4); + if (IS_ERR(rt)) + return PTR_ERR(rt); + *dst = &rt->dst; + return 0; +} + +static const struct nf_afinfo nf_ip_afinfo = { + .family = AF_INET, + .checksum = nf_ip_checksum, + .checksum_partial = nf_ip_checksum_partial, + .route = nf_ip_route, + .saveroute = nf_ip_saveroute, + .reroute = nf_ip_reroute, + .route_key_size = sizeof(struct ip_rt_info), +}; + +static int ipv4_netfilter_init(void) +{ + return nf_register_afinfo(&nf_ip_afinfo); +} + +static void ipv4_netfilter_fini(void) +{ + nf_unregister_afinfo(&nf_ip_afinfo); +} + +module_init(ipv4_netfilter_init); +module_exit(ipv4_netfilter_fini); + +#ifdef CONFIG_SYSCTL +struct ctl_path nf_net_ipv4_netfilter_sysctl_path[] = { + { .procname = "net", }, + { .procname = "ipv4", }, + { .procname = "netfilter", }, + { } +}; +EXPORT_SYMBOL_GPL(nf_net_ipv4_netfilter_sysctl_path); +#endif /* CONFIG_SYSCTL */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig new file mode 100644 index 00000000..73b4e91a --- /dev/null +++ b/net/ipv4/netfilter/Kconfig @@ -0,0 +1,395 @@ +# +# IP netfilter configuration +# + +menu "IP: Netfilter Configuration" + depends on INET && NETFILTER + +config NF_DEFRAG_IPV4 + tristate + default n + +config NF_CONNTRACK_IPV4 + tristate "IPv4 connection tracking support (required for NAT)" + depends on NF_CONNTRACK + default m if NETFILTER_ADVANCED=n + select NF_DEFRAG_IPV4 + ---help--- + Connection tracking keeps a record of what packets have passed + through your machine, in order to figure out how they are related + into connections. + + This is IPv4 support on Layer 3 independent connection tracking. + Layer 3 independent connection tracking is experimental scheme + which generalize ip_conntrack to support other layer 3 protocols. + + To compile it as a module, choose M here. If unsure, say N. + +config NF_CONNTRACK_PROC_COMPAT + bool "proc/sysctl compatibility with old connection tracking" + depends on NF_CONNTRACK_IPV4 + default y + help + This option enables /proc and sysctl compatibility with the old + layer 3 dependent connection tracking. This is needed to keep + old programs that have not been adapted to the new names working. + + If unsure, say Y. + +config IP_NF_QUEUE + tristate "IP Userspace queueing via NETLINK (OBSOLETE)" + depends on NETFILTER_ADVANCED + help + Netfilter has the ability to queue packets to user space: the + netlink device can be used to access them using this driver. + + This option enables the old IPv4-only "ip_queue" implementation + which has been obsoleted by the new "nfnetlink_queue" code (see + CONFIG_NETFILTER_NETLINK_QUEUE). + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_IPTABLES + tristate "IP tables support (required for filtering/masq/NAT)" + default m if NETFILTER_ADVANCED=n + select NETFILTER_XTABLES + help + iptables is a general, extensible packet identification framework. + The packet filtering and full NAT (masquerading, port forwarding, + etc) subsystems now use this: say `Y' or `M' here if you want to use + either of those. + + To compile it as a module, choose M here. If unsure, say N. + +if IP_NF_IPTABLES + +# The matches. +config IP_NF_MATCH_AH + tristate '"ah" match support' + depends on NETFILTER_ADVANCED + help + This match extension allows you to match a range of SPIs + inside AH header of IPSec packets. + + To compile it as a module, choose M here. If unsure, say N. 
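
The Kconfig entries above present iptables as an extensible packet identification framework, with each match compiled as its own module. A minimal sketch of such a kernel-side match extension follows; the module name "lowttl" and its hard-coded TTL threshold are hypothetical, but the struct xt_match / xt_register_match() registration it uses is the interface the in-tree matches of this kernel generation are written against.

/* Hypothetical "lowttl" match: matches IPv4 packets whose TTL is low. */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter/x_tables.h>

static bool lowttl_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
	/* Return true when the rule should match this packet. */
	return ip_hdr(skb)->ttl < 16;
}

static struct xt_match lowttl_mt_reg __read_mostly = {
	.name      = "lowttl",		/* name used in the iptables rule */
	.revision  = 0,
	.family    = NFPROTO_IPV4,
	.match     = lowttl_mt,
	.matchsize = 0,			/* no per-rule match data */
	.me        = THIS_MODULE,
};

static int __init lowttl_mt_init(void)
{
	return xt_register_match(&lowttl_mt_reg);
}

static void __exit lowttl_mt_exit(void)
{
	xt_unregister_match(&lowttl_mt_reg);
}

module_init(lowttl_mt_init);
module_exit(lowttl_mt_exit);
MODULE_LICENSE("GPL");
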
+ +config IP_NF_MATCH_ECN + tristate '"ecn" match support' + depends on NETFILTER_ADVANCED + help + This option adds a `ECN' match, which allows you to match against + the IPv4 and TCP header ECN fields. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_TTL + tristate '"ttl" match support' + depends on NETFILTER_ADVANCED + select NETFILTER_XT_MATCH_HL + ---help--- + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_MATCH_HL. + +# `filter', generic and specific targets +config IP_NF_FILTER + tristate "Packet filtering" + default m if NETFILTER_ADVANCED=n + help + Packet filtering defines a table `filter', which has a series of + rules for simple packet filtering at local input, forwarding and + local output. See the man page for iptables(8). + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_REJECT + tristate "REJECT target support" + depends on IP_NF_FILTER + default m if NETFILTER_ADVANCED=n + help + The REJECT target allows a filtering rule to specify that an ICMP + error should be issued in response to an incoming packet, rather + than silently being dropped. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_REJECT_SKERR + bool "Force socket error when rejecting with icmp*" + depends on IP_NF_TARGET_REJECT + default n + help + This option enables turning a "--reject-with icmp*" into a matching + socket error also. + The REJECT target normally allows sending an ICMP message. But it + leaves the local socket unaware of any ingress rejects. + + If unsure, say N. + +config IP_NF_TARGET_LOG + tristate "LOG target support" + default m if NETFILTER_ADVANCED=n + help + This option adds a `LOG' target, which allows you to create rules in + any iptables table which records the packet header to the syslog. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_ULOG + tristate "ULOG target support" + default m if NETFILTER_ADVANCED=n + ---help--- + + This option enables the old IPv4-only "ipt_ULOG" implementation + which has been obsoleted by the new "nfnetlink_log" code (see + CONFIG_NETFILTER_NETLINK_LOG). + + This option adds a `ULOG' target, which allows you to create rules in + any iptables table. The packet is passed to a userspace logging + daemon using netlink multicast sockets; unlike the LOG target + which can only be viewed through syslog. + + The appropriate userspace logging daemon (ulogd) may be obtained from + + + To compile it as a module, choose M here. If unsure, say N. + +# NAT + specific targets: nf_conntrack +config NF_NAT + tristate "Full NAT" + depends on NF_CONNTRACK_IPV4 + default m if NETFILTER_ADVANCED=n + help + The Full NAT option allows masquerading, port forwarding and other + forms of full Network Address Port Translation. It is controlled by + the `nat' table in iptables: see the man page for iptables(8). + + To compile it as a module, choose M here. If unsure, say N. + +config NF_NAT_NEEDED + bool + depends on NF_NAT + default y + +config IP_NF_TARGET_MASQUERADE + tristate "MASQUERADE target support" + depends on NF_NAT + default m if NETFILTER_ADVANCED=n + help + Masquerading is a special case of NAT: all outgoing connections are + changed to seem to come from a particular interface's address, and + if the interface goes down, those connections are lost. This is + only useful for dialup accounts with dynamic IP address (ie. 
your IP + address will be different on next dialup). + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_NETMAP + tristate "NETMAP target support" + depends on NF_NAT + depends on NETFILTER_ADVANCED + help + NETMAP is an implementation of static 1:1 NAT mapping of network + addresses. It maps the network address part, while keeping the host + address part intact. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_REDIRECT + tristate "REDIRECT target support" + depends on NF_NAT + depends on NETFILTER_ADVANCED + help + REDIRECT is a special case of NAT: all incoming connections are + mapped onto the incoming interface's address, causing the packets to + come to the local machine instead of passing through. This is + useful for transparent proxies. + + To compile it as a module, choose M here. If unsure, say N. + +config NF_NAT_SNMP_BASIC + tristate "Basic SNMP-ALG support" + depends on NF_CONNTRACK_SNMP && NF_NAT + depends on NETFILTER_ADVANCED + default NF_NAT && NF_CONNTRACK_SNMP + ---help--- + + This module implements an Application Layer Gateway (ALG) for + SNMP payloads. In conjunction with NAT, it allows a network + management system to access multiple private networks with + conflicting addresses. It works by modifying IP addresses + inside SNMP payloads to match IP-layer NAT mapping. + + This is the "basic" form of SNMP-ALG, as described in RFC 2962 + + To compile it as a module, choose M here. If unsure, say N. + +# If they want FTP, set to $CONFIG_IP_NF_NAT (m or y), +# or $CONFIG_IP_NF_FTP (m or y), whichever is weaker. +# From kconfig-language.txt: +# +# '&&' (6) +# +# (6) Returns the result of min(/expr/, /expr/). +config NF_NAT_PROTO_DCCP + tristate + depends on NF_NAT && NF_CT_PROTO_DCCP + default NF_NAT && NF_CT_PROTO_DCCP + +config NF_NAT_PROTO_GRE + tristate + depends on NF_NAT && NF_CT_PROTO_GRE + +config NF_NAT_PROTO_UDPLITE + tristate + depends on NF_NAT && NF_CT_PROTO_UDPLITE + default NF_NAT && NF_CT_PROTO_UDPLITE + +config NF_NAT_PROTO_SCTP + tristate + default NF_NAT && NF_CT_PROTO_SCTP + depends on NF_NAT && NF_CT_PROTO_SCTP + select LIBCRC32C + +config NF_NAT_FTP + tristate + depends on NF_CONNTRACK && NF_NAT + default NF_NAT && NF_CONNTRACK_FTP + +config NF_NAT_IRC + tristate + depends on NF_CONNTRACK && NF_NAT + default NF_NAT && NF_CONNTRACK_IRC + +config NF_NAT_TFTP + tristate + depends on NF_CONNTRACK && NF_NAT + default NF_NAT && NF_CONNTRACK_TFTP + +config NF_NAT_AMANDA + tristate + depends on NF_CONNTRACK && NF_NAT + default NF_NAT && NF_CONNTRACK_AMANDA + +config NF_NAT_PPTP + tristate + depends on NF_CONNTRACK && NF_NAT + default NF_NAT && NF_CONNTRACK_PPTP + select NF_NAT_PROTO_GRE + +config NF_NAT_H323 + tristate + depends on NF_CONNTRACK && NF_NAT + default NF_NAT && NF_CONNTRACK_H323 + +config NF_NAT_SIP + tristate + depends on NF_CONNTRACK && NF_NAT + default NF_NAT && NF_CONNTRACK_SIP + +# mangle + specific targets +config IP_NF_MANGLE + tristate "Packet mangling" + default m if NETFILTER_ADVANCED=n + help + This option adds a `mangle' table to iptables: see the man page for + iptables(8). This table is used for various packet alterations + which can effect how the packet is routed. + + To compile it as a module, choose M here. If unsure, say N. 
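
The mangle entries above cover targets that alter packets in ways that can later affect how they are routed. Complementing the match sketch earlier, here is an equally minimal target module; the "STAMPMARK" name and the fixed mark value are hypothetical, but the struct xt_target / xt_register_target() interface and the XT_CONTINUE verdict are what the in-tree mangle-style targets use, and a firewall mark set this way is exactly the kind of alteration that policy routing can act on afterwards.

/* Hypothetical "STAMPMARK" target: stamp a fixed fwmark, keep traversing. */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netfilter.h>
#include <linux/netfilter/x_tables.h>

static unsigned int stampmark_tg(struct sk_buff *skb,
				 const struct xt_action_param *par)
{
	skb->mark = 0x1;	/* mark consulted later by policy routing */
	return XT_CONTINUE;	/* keep evaluating the remaining rules */
}

static struct xt_target stampmark_tg_reg __read_mostly = {
	.name       = "STAMPMARK",
	.revision   = 0,
	.family     = NFPROTO_IPV4,
	.table      = "mangle",		/* restrict use to the mangle table */
	.target     = stampmark_tg,
	.targetsize = 0,		/* no per-rule target data */
	.me         = THIS_MODULE,
};

static int __init stampmark_tg_init(void)
{
	return xt_register_target(&stampmark_tg_reg);
}

static void __exit stampmark_tg_exit(void)
{
	xt_unregister_target(&stampmark_tg_reg);
}

module_init(stampmark_tg_init);
module_exit(stampmark_tg_exit);
MODULE_LICENSE("GPL");
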
+ +config IP_NF_TARGET_CLUSTERIP + tristate "CLUSTERIP target support (EXPERIMENTAL)" + depends on IP_NF_MANGLE && EXPERIMENTAL + depends on NF_CONNTRACK_IPV4 + depends on NETFILTER_ADVANCED + select NF_CONNTRACK_MARK + help + The CLUSTERIP target allows you to build load-balancing clusters of + network servers without having a dedicated load-balancing + router/server/switch. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_ECN + tristate "ECN target support" + depends on IP_NF_MANGLE + depends on NETFILTER_ADVANCED + ---help--- + This option adds a `ECN' target, which can be used in the iptables mangle + table. + + You can use this target to remove the ECN bits from the IPv4 header of + an IP packet. This is particularly useful, if you need to work around + existing ECN blackholes on the internet, but don't want to disable + ECN support in general. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_TTL + tristate '"TTL" target support' + depends on NETFILTER_ADVANCED && IP_NF_MANGLE + select NETFILTER_XT_TARGET_HL + ---help--- + This is a backwards-compatible option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_TARGET_HL. + +# raw + specific targets +config IP_NF_RAW + tristate 'raw table support (required for NOTRACK/TRACE)' + depends on NETFILTER_ADVANCED + help + This option adds a `raw' table to iptables. This table is the very + first in the netfilter framework and hooks in at the PREROUTING + and OUTPUT chains. + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + +# security table for MAC policy +config IP_NF_SECURITY + tristate "Security table" + depends on SECURITY + depends on NETFILTER_ADVANCED + help + This option adds a `security' table to iptables, for use + with Mandatory Access Control (MAC) policy. + + If unsure, say N. + +endif # IP_NF_IPTABLES + +# ARP tables +config IP_NF_ARPTABLES + tristate "ARP tables support" + select NETFILTER_XTABLES + depends on NETFILTER_ADVANCED + help + arptables is a general, extensible packet identification framework. + The ARP packet filtering and mangling (manipulation)subsystems + use this: say Y or M here if you want to use either of those. + + To compile it as a module, choose M here. If unsure, say N. + +if IP_NF_ARPTABLES + +config IP_NF_ARPFILTER + tristate "ARP packet filtering" + help + ARP packet filtering defines a table `filter', which has a series of + rules for simple ARP packet filtering at local input and + local output. On a bridge, you can also specify filtering rules + for forwarded ARP packets. See the man page for arptables(8). + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_ARP_MANGLE + tristate "ARP payload mangling" + help + Allows altering the ARP packet payload: source and destination + hardware and network addresses. + +endif # IP_NF_ARPTABLES + +endmenu + diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile new file mode 100644 index 00000000..dca2082e --- /dev/null +++ b/net/ipv4/netfilter/Makefile @@ -0,0 +1,72 @@ +# +# Makefile for the netfilter modules on top of IPv4. 
+# + +# objects for l3 independent conntrack +nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o +ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y) +ifeq ($(CONFIG_PROC_FS),y) +nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o +endif +endif + +nf_nat-y := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o +iptable_nat-y := nf_nat_rule.o nf_nat_standalone.o + +# connection tracking +obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o + +obj-$(CONFIG_NF_NAT) += nf_nat.o + +# defrag +obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o + +# NAT helpers (nf_conntrack) +obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o +obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o +obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o +obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o +obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o +obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o +obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o +obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o + +# NAT protocols (nf_nat) +obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o +obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o +obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o +obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o + +# generic IP tables +obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o + +# the three instances of ip_tables +obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o +obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o +obj-$(CONFIG_NF_NAT) += iptable_nat.o +obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o +obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o + +# matches +obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o +obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o + +# targets +obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o +obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o +obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o +obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o +obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o +obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o +obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o +obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o + +# generic ARP tables +obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o +obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o + +# just filtering instance of ARP tables for now +obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o + +obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o + diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c new file mode 100644 index 00000000..fd7a3f68 --- /dev/null +++ b/net/ipv4/netfilter/arp_tables.c @@ -0,0 +1,1914 @@ +/* + * Packet matching code for ARP packets. + * + * Based heavily, if not almost entirely, upon ip_tables.c framework. + * + * Some ARP specific bits are: + * + * Copyright (C) 2002 David S. Miller (davem@redhat.com) + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "../../netfilter/xt_repldata.h" + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("David S. Miller "); +MODULE_DESCRIPTION("arptables core"); + +/*#define DEBUG_ARP_TABLES*/ +/*#define DEBUG_ARP_TABLES_USER*/ + +#ifdef DEBUG_ARP_TABLES +#define dprintf(format, args...) printk(format , ## args) +#else +#define dprintf(format, args...) +#endif + +#ifdef DEBUG_ARP_TABLES_USER +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) 
+#endif + +#ifdef CONFIG_NETFILTER_DEBUG +#define ARP_NF_ASSERT(x) WARN_ON(!(x)) +#else +#define ARP_NF_ASSERT(x) +#endif + +void *arpt_alloc_initial_table(const struct xt_table *info) +{ + return xt_alloc_initial_table(arpt, ARPT); +} +EXPORT_SYMBOL_GPL(arpt_alloc_initial_table); + +static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, + const char *hdr_addr, int len) +{ + int i, ret; + + if (len > ARPT_DEV_ADDR_LEN_MAX) + len = ARPT_DEV_ADDR_LEN_MAX; + + ret = 0; + for (i = 0; i < len; i++) + ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i]; + + return ret != 0; +} + +/* + * Unfortunately, _b and _mask are not aligned to an int (or long int) + * Some arches dont care, unrolling the loop is a win on them. + * For other arches, we only have a 16bit alignement. + */ +static unsigned long ifname_compare(const char *_a, const char *_b, const char *_mask) +{ +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + unsigned long ret = ifname_compare_aligned(_a, _b, _mask); +#else + unsigned long ret = 0; + const u16 *a = (const u16 *)_a; + const u16 *b = (const u16 *)_b; + const u16 *mask = (const u16 *)_mask; + int i; + + for (i = 0; i < IFNAMSIZ/sizeof(u16); i++) + ret |= (a[i] ^ b[i]) & mask[i]; +#endif + return ret; +} + +/* Returns whether packet matches rule or not. */ +static inline int arp_packet_match(const struct arphdr *arphdr, + struct net_device *dev, + const char *indev, + const char *outdev, + const struct arpt_arp *arpinfo) +{ + const char *arpptr = (char *)(arphdr + 1); + const char *src_devaddr, *tgt_devaddr; + __be32 src_ipaddr, tgt_ipaddr; + long ret; + +#define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg))) + + if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop, + ARPT_INV_ARPOP)) { + dprintf("ARP operation field mismatch.\n"); + dprintf("ar_op: %04x info->arpop: %04x info->arpop_mask: %04x\n", + arphdr->ar_op, arpinfo->arpop, arpinfo->arpop_mask); + return 0; + } + + if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd, + ARPT_INV_ARPHRD)) { + dprintf("ARP hardware address format mismatch.\n"); + dprintf("ar_hrd: %04x info->arhrd: %04x info->arhrd_mask: %04x\n", + arphdr->ar_hrd, arpinfo->arhrd, arpinfo->arhrd_mask); + return 0; + } + + if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro, + ARPT_INV_ARPPRO)) { + dprintf("ARP protocol address format mismatch.\n"); + dprintf("ar_pro: %04x info->arpro: %04x info->arpro_mask: %04x\n", + arphdr->ar_pro, arpinfo->arpro, arpinfo->arpro_mask); + return 0; + } + + if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln, + ARPT_INV_ARPHLN)) { + dprintf("ARP hardware address length mismatch.\n"); + dprintf("ar_hln: %02x info->arhln: %02x info->arhln_mask: %02x\n", + arphdr->ar_hln, arpinfo->arhln, arpinfo->arhln_mask); + return 0; + } + + src_devaddr = arpptr; + arpptr += dev->addr_len; + memcpy(&src_ipaddr, arpptr, sizeof(u32)); + arpptr += sizeof(u32); + tgt_devaddr = arpptr; + arpptr += dev->addr_len; + memcpy(&tgt_ipaddr, arpptr, sizeof(u32)); + + if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len), + ARPT_INV_SRCDEVADDR) || + FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len), + ARPT_INV_TGTDEVADDR)) { + dprintf("Source or target device address mismatch.\n"); + + return 0; + } + + if (FWINV((src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr, + ARPT_INV_SRCIP) || + FWINV(((tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr), + ARPT_INV_TGTIP)) { + dprintf("Source or target IP 
address mismatch.\n"); + + dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n", + &src_ipaddr, + &arpinfo->smsk.s_addr, + &arpinfo->src.s_addr, + arpinfo->invflags & ARPT_INV_SRCIP ? " (INV)" : ""); + dprintf("TGT: %pI4 Mask: %pI4 Target: %pI4.%s\n", + &tgt_ipaddr, + &arpinfo->tmsk.s_addr, + &arpinfo->tgt.s_addr, + arpinfo->invflags & ARPT_INV_TGTIP ? " (INV)" : ""); + return 0; + } + + /* Look for ifname matches. */ + ret = ifname_compare(indev, arpinfo->iniface, arpinfo->iniface_mask); + + if (FWINV(ret != 0, ARPT_INV_VIA_IN)) { + dprintf("VIA in mismatch (%s vs %s).%s\n", + indev, arpinfo->iniface, + arpinfo->invflags&ARPT_INV_VIA_IN ?" (INV)":""); + return 0; + } + + ret = ifname_compare(outdev, arpinfo->outiface, arpinfo->outiface_mask); + + if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) { + dprintf("VIA out mismatch (%s vs %s).%s\n", + outdev, arpinfo->outiface, + arpinfo->invflags&ARPT_INV_VIA_OUT ?" (INV)":""); + return 0; + } + + return 1; +#undef FWINV +} + +static inline int arp_checkentry(const struct arpt_arp *arp) +{ + if (arp->flags & ~ARPT_F_MASK) { + duprintf("Unknown flag bits set: %08X\n", + arp->flags & ~ARPT_F_MASK); + return 0; + } + if (arp->invflags & ~ARPT_INV_MASK) { + duprintf("Unknown invflag bits set: %08X\n", + arp->invflags & ~ARPT_INV_MASK); + return 0; + } + + return 1; +} + +static unsigned int +arpt_error(struct sk_buff *skb, const struct xt_action_param *par) +{ + if (net_ratelimit()) + pr_err("arp_tables: error: '%s'\n", + (const char *)par->targinfo); + + return NF_DROP; +} + +static inline const struct xt_entry_target * +arpt_get_target_c(const struct arpt_entry *e) +{ + return arpt_get_target((struct arpt_entry *)e); +} + +static inline struct arpt_entry * +get_entry(const void *base, unsigned int offset) +{ + return (struct arpt_entry *)(base + offset); +} + +static inline __pure +struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry) +{ + return (void *)entry + entry->next_offset; +} + +unsigned int arpt_do_table(struct sk_buff *skb, + unsigned int hook, + const struct net_device *in, + const struct net_device *out, + struct xt_table *table) +{ + static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); + unsigned int verdict = NF_DROP; + const struct arphdr *arp; + struct arpt_entry *e, *back; + const char *indev, *outdev; + void *table_base; + const struct xt_table_info *private; + struct xt_action_param acpar; + unsigned int addend; + + if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) + return NF_DROP; + + indev = in ? in->name : nulldevname; + outdev = out ? out->name : nulldevname; + + local_bh_disable(); + addend = xt_write_recseq_begin(); + private = table->private; + table_base = private->entries[smp_processor_id()]; + + e = get_entry(table_base, private->hook_entry[hook]); + back = get_entry(table_base, private->underflow[hook]); + + acpar.in = in; + acpar.out = out; + acpar.hooknum = hook; + acpar.family = NFPROTO_ARP; + acpar.hotdrop = false; + + arp = arp_hdr(skb); + do { + const struct xt_entry_target *t; + + if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { + e = arpt_next_entry(e); + continue; + } + + ADD_COUNTER(e->counters, arp_hdr_len(skb->dev), 1); + + t = arpt_get_target_c(e); + + /* Standard target? */ + if (!t->u.kernel.target->target) { + int v; + + v = ((struct xt_standard_target *)t)->verdict; + if (v < 0) { + /* Pop from stack? 
*/
+				if (v != XT_RETURN) {
+					verdict = (unsigned)(-v) - 1;
+					break;
+				}
+				e = back;
+				back = get_entry(table_base, back->comefrom);
+				continue;
+			}
+			if (table_base + v
+			    != arpt_next_entry(e)) {
+				/* Save old back ptr in next entry */
+				struct arpt_entry *next = arpt_next_entry(e);
+				next->comefrom = (void *)back - table_base;
+
+				/* set back pointer to next entry */
+				back = next;
+			}
+
+			e = get_entry(table_base, v);
+			continue;
+		}
+
+		/* Targets which reenter must return
+		 * abs. verdicts
+		 */
+		acpar.target = t->u.kernel.target;
+		acpar.targinfo = t->data;
+		verdict = t->u.kernel.target->target(skb, &acpar);
+
+		/* Target might have changed stuff. */
+		arp = arp_hdr(skb);
+
+		if (verdict == XT_CONTINUE)
+			e = arpt_next_entry(e);
+		else
+			/* Verdict */
+			break;
+	} while (!acpar.hotdrop);
+	xt_write_recseq_end(addend);
+	local_bh_enable();
+
+	if (acpar.hotdrop)
+		return NF_DROP;
+	else
+		return verdict;
+}
+
+/* All zeroes == unconditional rule. */
+static inline bool unconditional(const struct arpt_arp *arp)
+{
+	static const struct arpt_arp uncond;
+
+	return memcmp(arp, &uncond, sizeof(uncond)) == 0;
+}
+
+/* Figures out from what hook each rule can be called: returns 0 if
+ * there are loops. Puts hook bitmask in comefrom.
+ */
+static int mark_source_chains(const struct xt_table_info *newinfo,
+			      unsigned int valid_hooks, void *entry0)
+{
+	unsigned int hook;
+
+	/* No recursion; use packet counter to save back ptrs (reset
+	 * to 0 as we leave), and comefrom to save source hook bitmask.
+	 */
+	for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) {
+		unsigned int pos = newinfo->hook_entry[hook];
+		struct arpt_entry *e
+			= (struct arpt_entry *)(entry0 + pos);
+
+		if (!(valid_hooks & (1 << hook)))
+			continue;
+
+		/* Set initial back pointer. */
+		e->counters.pcnt = pos;
+
+		for (;;) {
+			const struct xt_standard_target *t
+				= (void *)arpt_get_target_c(e);
+			int visited = e->comefrom & (1 << hook);
+
+			if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) {
+				pr_notice("arptables: loop hook %u pos %u %08X.\n",
+					  hook, pos, e->comefrom);
+				return 0;
+			}
+			e->comefrom
+				|= ((1 << hook) | (1 << NF_ARP_NUMHOOKS));
+
+			/* Unconditional return/END. */
+			if ((e->target_offset == sizeof(struct arpt_entry) &&
+			     (strcmp(t->target.u.user.name,
+				     XT_STANDARD_TARGET) == 0) &&
+			     t->verdict < 0 && unconditional(&e->arp)) ||
+			    visited) {
+				unsigned int oldpos, size;
+
+				if ((strcmp(t->target.u.user.name,
+					    XT_STANDARD_TARGET) == 0) &&
+				    t->verdict < -NF_MAX_VERDICT - 1) {
+					duprintf("mark_source_chains: bad "
+						 "negative verdict (%i)\n",
+						 t->verdict);
+					return 0;
+				}
+
+				/* Return: backtrack through the last
+				 * big jump.
+				 */
+				do {
+					e->comefrom ^= (1 << NF_ARP_NUMHOOKS);
+					oldpos = pos;
+					pos = e->counters.pcnt;
+					e->counters.pcnt = 0;
+
+					/* We're at the start. */
+					if (pos == oldpos)
+						goto next;
+
+					e = (struct arpt_entry *)
+						(entry0 + pos);
+				} while (oldpos == pos + e->next_offset);
+
+				/* Move along one */
+				size = e->next_offset;
+				e = (struct arpt_entry *)
+					(entry0 + pos + size);
+				e->counters.pcnt = pos;
+				pos += size;
+			} else {
+				int newpos = t->verdict;
+
+				if (strcmp(t->target.u.user.name,
+					   XT_STANDARD_TARGET) == 0 &&
+				    newpos >= 0) {
+					if (newpos > newinfo->size -
+						sizeof(struct arpt_entry)) {
+						duprintf("mark_source_chains: "
+							 "bad verdict (%i)\n",
+							 newpos);
+						return 0;
+					}
+
+					/* This is a jump; chase it. */
+					duprintf("Jump rule %u -> %u\n",
+						 pos, newpos);
+				} else {
+					/* ...
this is a fallthru */ + newpos = pos + e->next_offset; + } + e = (struct arpt_entry *) + (entry0 + newpos); + e->counters.pcnt = pos; + pos = newpos; + } + } + next: + duprintf("Finished chain %u\n", hook); + } + return 1; +} + +static inline int check_entry(const struct arpt_entry *e, const char *name) +{ + const struct xt_entry_target *t; + + if (!arp_checkentry(&e->arp)) { + duprintf("arp_tables: arp check failed %p %s.\n", e, name); + return -EINVAL; + } + + if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) + return -EINVAL; + + t = arpt_get_target_c(e); + if (e->target_offset + t->u.target_size > e->next_offset) + return -EINVAL; + + return 0; +} + +static inline int check_target(struct arpt_entry *e, const char *name) +{ + struct xt_entry_target *t = arpt_get_target(e); + int ret; + struct xt_tgchk_param par = { + .table = name, + .entryinfo = e, + .target = t->u.kernel.target, + .targinfo = t->data, + .hook_mask = e->comefrom, + .family = NFPROTO_ARP, + }; + + ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); + if (ret < 0) { + duprintf("arp_tables: check failed for `%s'.\n", + t->u.kernel.target->name); + return ret; + } + return 0; +} + +static inline int +find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) +{ + struct xt_entry_target *t; + struct xt_target *target; + int ret; + + ret = check_entry(e, name); + if (ret) + return ret; + + t = arpt_get_target(e); + target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, + t->u.user.revision); + if (IS_ERR(target)) { + duprintf("find_check_entry: `%s' not found\n", t->u.user.name); + ret = PTR_ERR(target); + goto out; + } + t->u.kernel.target = target; + + ret = check_target(e, name); + if (ret) + goto err; + return 0; +err: + module_put(t->u.kernel.target->me); +out: + return ret; +} + +static bool check_underflow(const struct arpt_entry *e) +{ + const struct xt_entry_target *t; + unsigned int verdict; + + if (!unconditional(&e->arp)) + return false; + t = arpt_get_target_c(e); + if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) + return false; + verdict = ((struct xt_standard_target *)t)->verdict; + verdict = -verdict - 1; + return verdict == NF_DROP || verdict == NF_ACCEPT; +} + +static inline int check_entry_size_and_hooks(struct arpt_entry *e, + struct xt_table_info *newinfo, + const unsigned char *base, + const unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, + unsigned int valid_hooks) +{ + unsigned int h; + + if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 || + (unsigned char *)e + sizeof(struct arpt_entry) >= limit) { + duprintf("Bad offset %p\n", e); + return -EINVAL; + } + + if (e->next_offset + < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) { + duprintf("checking: element %p size %u\n", + e, e->next_offset); + return -EINVAL; + } + + /* Check hooks & underflows */ + for (h = 0; h < NF_ARP_NUMHOOKS; h++) { + if (!(valid_hooks & (1 << h))) + continue; + if ((unsigned char *)e - base == hook_entries[h]) + newinfo->hook_entry[h] = hook_entries[h]; + if ((unsigned char *)e - base == underflows[h]) { + if (!check_underflow(e)) { + pr_err("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); + return -EINVAL; + } + newinfo->underflow[h] = underflows[h]; + } + } + + /* Clear counters and comefrom */ + e->counters = ((struct xt_counters) { 0, 0 }); + e->comefrom = 0; + return 0; +} + +static inline void cleanup_entry(struct arpt_entry *e) +{ + struct 
xt_tgdtor_param par; + struct xt_entry_target *t; + + t = arpt_get_target(e); + par.target = t->u.kernel.target; + par.targinfo = t->data; + par.family = NFPROTO_ARP; + if (par.target->destroy != NULL) + par.target->destroy(&par); + module_put(par.target->me); +} + +/* Checks and translates the user-supplied table segment (held in + * newinfo). + */ +static int translate_table(struct xt_table_info *newinfo, void *entry0, + const struct arpt_replace *repl) +{ + struct arpt_entry *iter; + unsigned int i; + int ret = 0; + + newinfo->size = repl->size; + newinfo->number = repl->num_entries; + + /* Init all hooks to impossible value. */ + for (i = 0; i < NF_ARP_NUMHOOKS; i++) { + newinfo->hook_entry[i] = 0xFFFFFFFF; + newinfo->underflow[i] = 0xFFFFFFFF; + } + + duprintf("translate_table: size %u\n", newinfo->size); + i = 0; + + /* Walk through entries, checking offsets. */ + xt_entry_foreach(iter, entry0, newinfo->size) { + ret = check_entry_size_and_hooks(iter, newinfo, entry0, + entry0 + repl->size, + repl->hook_entry, + repl->underflow, + repl->valid_hooks); + if (ret != 0) + break; + ++i; + if (strcmp(arpt_get_target(iter)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; + } + duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); + if (ret != 0) + return ret; + + if (i != repl->num_entries) { + duprintf("translate_table: %u not %u entries\n", + i, repl->num_entries); + return -EINVAL; + } + + /* Check hooks all assigned */ + for (i = 0; i < NF_ARP_NUMHOOKS; i++) { + /* Only hooks which are valid */ + if (!(repl->valid_hooks & (1 << i))) + continue; + if (newinfo->hook_entry[i] == 0xFFFFFFFF) { + duprintf("Invalid hook entry %u %u\n", + i, repl->hook_entry[i]); + return -EINVAL; + } + if (newinfo->underflow[i] == 0xFFFFFFFF) { + duprintf("Invalid underflow %u %u\n", + i, repl->underflow[i]); + return -EINVAL; + } + } + + if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) { + duprintf("Looping hook\n"); + return -ELOOP; + } + + /* Finally, each sanity check must pass */ + i = 0; + xt_entry_foreach(iter, entry0, newinfo->size) { + ret = find_check_entry(iter, repl->name, repl->size); + if (ret != 0) + break; + ++i; + } + + if (ret != 0) { + xt_entry_foreach(iter, entry0, newinfo->size) { + if (i-- == 0) + break; + cleanup_entry(iter); + } + return ret; + } + + /* And one copy for every other CPU */ + for_each_possible_cpu(i) { + if (newinfo->entries[i] && newinfo->entries[i] != entry0) + memcpy(newinfo->entries[i], entry0, newinfo->size); + } + + return ret; +} + +static void get_counters(const struct xt_table_info *t, + struct xt_counters counters[]) +{ + struct arpt_entry *iter; + unsigned int cpu; + unsigned int i; + + for_each_possible_cpu(cpu) { + seqcount_t *s = &per_cpu(xt_recseq, cpu); + + i = 0; + xt_entry_foreach(iter, t->entries[cpu], t->size) { + u64 bcnt, pcnt; + unsigned int start; + + do { + start = read_seqcount_begin(s); + bcnt = iter->counters.bcnt; + pcnt = iter->counters.pcnt; + } while (read_seqcount_retry(s, start)); + + ADD_COUNTER(counters[i], bcnt, pcnt); + ++i; + } + } +} + +static struct xt_counters *alloc_counters(const struct xt_table *table) +{ + unsigned int countersize; + struct xt_counters *counters; + const struct xt_table_info *private = table->private; + + /* We need atomic snapshot of counters: rest doesn't change + * (other than comefrom, which userspace doesn't care + * about). 
+ */ + countersize = sizeof(struct xt_counters) * private->number; + counters = vzalloc(countersize); + + if (counters == NULL) + return ERR_PTR(-ENOMEM); + + get_counters(private, counters); + + return counters; +} + +static int copy_entries_to_user(unsigned int total_size, + const struct xt_table *table, + void __user *userptr) +{ + unsigned int off, num; + const struct arpt_entry *e; + struct xt_counters *counters; + struct xt_table_info *private = table->private; + int ret = 0; + void *loc_cpu_entry; + + counters = alloc_counters(table); + if (IS_ERR(counters)) + return PTR_ERR(counters); + + loc_cpu_entry = private->entries[raw_smp_processor_id()]; + /* ... then copy entire thing ... */ + if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { + ret = -EFAULT; + goto free_counters; + } + + /* FIXME: use iterator macros --RR */ + /* ... then go back and fix counters and names */ + for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ + const struct xt_entry_target *t; + + e = (struct arpt_entry *)(loc_cpu_entry + off); + if (copy_to_user(userptr + off + + offsetof(struct arpt_entry, counters), + &counters[num], + sizeof(counters[num])) != 0) { + ret = -EFAULT; + goto free_counters; + } + + t = arpt_get_target_c(e); + if (copy_to_user(userptr + off + e->target_offset + + offsetof(struct xt_entry_target, + u.user.name), + t->u.kernel.target->name, + strlen(t->u.kernel.target->name)+1) != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + free_counters: + vfree(counters); + return ret; +} + +#ifdef CONFIG_COMPAT +static void compat_standard_from_user(void *dst, const void *src) +{ + int v = *(compat_int_t *)src; + + if (v > 0) + v += xt_compat_calc_jump(NFPROTO_ARP, v); + memcpy(dst, &v, sizeof(v)); +} + +static int compat_standard_to_user(void __user *dst, const void *src) +{ + compat_int_t cv = *(int *)src; + + if (cv > 0) + cv -= xt_compat_calc_jump(NFPROTO_ARP, cv); + return copy_to_user(dst, &cv, sizeof(cv)) ? 
-EFAULT : 0; +} + +static int compat_calc_entry(const struct arpt_entry *e, + const struct xt_table_info *info, + const void *base, struct xt_table_info *newinfo) +{ + const struct xt_entry_target *t; + unsigned int entry_offset; + int off, i, ret; + + off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); + entry_offset = (void *)e - base; + + t = arpt_get_target_c(e); + off += xt_compat_target_offset(t->u.kernel.target); + newinfo->size -= off; + ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off); + if (ret) + return ret; + + for (i = 0; i < NF_ARP_NUMHOOKS; i++) { + if (info->hook_entry[i] && + (e < (struct arpt_entry *)(base + info->hook_entry[i]))) + newinfo->hook_entry[i] -= off; + if (info->underflow[i] && + (e < (struct arpt_entry *)(base + info->underflow[i]))) + newinfo->underflow[i] -= off; + } + return 0; +} + +static int compat_table_info(const struct xt_table_info *info, + struct xt_table_info *newinfo) +{ + struct arpt_entry *iter; + void *loc_cpu_entry; + int ret; + + if (!newinfo || !info) + return -EINVAL; + + /* we dont care about newinfo->entries[] */ + memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); + newinfo->initial_entries = 0; + loc_cpu_entry = info->entries[raw_smp_processor_id()]; + xt_compat_init_offsets(NFPROTO_ARP, info->number); + xt_entry_foreach(iter, loc_cpu_entry, info->size) { + ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); + if (ret != 0) + return ret; + } + return 0; +} +#endif + +static int get_info(struct net *net, void __user *user, + const int *len, int compat) +{ + char name[XT_TABLE_MAXNAMELEN]; + struct xt_table *t; + int ret; + + if (*len != sizeof(struct arpt_getinfo)) { + duprintf("length %u != %Zu\n", *len, + sizeof(struct arpt_getinfo)); + return -EINVAL; + } + + if (copy_from_user(name, user, sizeof(name)) != 0) + return -EFAULT; + + name[XT_TABLE_MAXNAMELEN-1] = '\0'; +#ifdef CONFIG_COMPAT + if (compat) + xt_compat_lock(NFPROTO_ARP); +#endif + t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name), + "arptable_%s", name); + if (t && !IS_ERR(t)) { + struct arpt_getinfo info; + const struct xt_table_info *private = t->private; +#ifdef CONFIG_COMPAT + struct xt_table_info tmp; + + if (compat) { + ret = compat_table_info(private, &tmp); + xt_compat_flush_offsets(NFPROTO_ARP); + private = &tmp; + } +#endif + memset(&info, 0, sizeof(info)); + info.valid_hooks = t->valid_hooks; + memcpy(info.hook_entry, private->hook_entry, + sizeof(info.hook_entry)); + memcpy(info.underflow, private->underflow, + sizeof(info.underflow)); + info.num_entries = private->number; + info.size = private->size; + strcpy(info.name, name); + + if (copy_to_user(user, &info, *len) != 0) + ret = -EFAULT; + else + ret = 0; + xt_table_unlock(t); + module_put(t->me); + } else + ret = t ? 
PTR_ERR(t) : -ENOENT; +#ifdef CONFIG_COMPAT + if (compat) + xt_compat_unlock(NFPROTO_ARP); +#endif + return ret; +} + +static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, + const int *len) +{ + int ret; + struct arpt_get_entries get; + struct xt_table *t; + + if (*len < sizeof(get)) { + duprintf("get_entries: %u < %Zu\n", *len, sizeof(get)); + return -EINVAL; + } + if (copy_from_user(&get, uptr, sizeof(get)) != 0) + return -EFAULT; + if (*len != sizeof(struct arpt_get_entries) + get.size) { + duprintf("get_entries: %u != %Zu\n", *len, + sizeof(struct arpt_get_entries) + get.size); + return -EINVAL; + } + + t = xt_find_table_lock(net, NFPROTO_ARP, get.name); + if (t && !IS_ERR(t)) { + const struct xt_table_info *private = t->private; + + duprintf("t->private->number = %u\n", + private->number); + if (get.size == private->size) + ret = copy_entries_to_user(private->size, + t, uptr->entrytable); + else { + duprintf("get_entries: I've got %u not %u!\n", + private->size, get.size); + ret = -EAGAIN; + } + module_put(t->me); + xt_table_unlock(t); + } else + ret = t ? PTR_ERR(t) : -ENOENT; + + return ret; +} + +static int __do_replace(struct net *net, const char *name, + unsigned int valid_hooks, + struct xt_table_info *newinfo, + unsigned int num_counters, + void __user *counters_ptr) +{ + int ret; + struct xt_table *t; + struct xt_table_info *oldinfo; + struct xt_counters *counters; + void *loc_cpu_old_entry; + struct arpt_entry *iter; + + ret = 0; + counters = vzalloc(num_counters * sizeof(struct xt_counters)); + if (!counters) { + ret = -ENOMEM; + goto out; + } + + t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name), + "arptable_%s", name); + if (!t || IS_ERR(t)) { + ret = t ? PTR_ERR(t) : -ENOENT; + goto free_newinfo_counters_untrans; + } + + /* You lied! 
*/ + if (valid_hooks != t->valid_hooks) { + duprintf("Valid hook crap: %08X vs %08X\n", + valid_hooks, t->valid_hooks); + ret = -EINVAL; + goto put_module; + } + + oldinfo = xt_replace_table(t, num_counters, newinfo, &ret); + if (!oldinfo) + goto put_module; + + /* Update module usage count based on number of rules */ + duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", + oldinfo->number, oldinfo->initial_entries, newinfo->number); + if ((oldinfo->number > oldinfo->initial_entries) || + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + if ((oldinfo->number > oldinfo->initial_entries) && + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + + /* Get the old counters, and synchronize with replace */ + get_counters(oldinfo, counters); + + /* Decrease module usage counts and free resource */ + loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) + cleanup_entry(iter); + + xt_free_table_info(oldinfo); + if (copy_to_user(counters_ptr, counters, + sizeof(struct xt_counters) * num_counters) != 0) + ret = -EFAULT; + vfree(counters); + xt_table_unlock(t); + return ret; + + put_module: + module_put(t->me); + xt_table_unlock(t); + free_newinfo_counters_untrans: + vfree(counters); + out: + return ret; +} + +static int do_replace(struct net *net, const void __user *user, + unsigned int len) +{ + int ret; + struct arpt_replace tmp; + struct xt_table_info *newinfo; + void *loc_cpu_entry; + struct arpt_entry *iter; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + /* overflow check */ + if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) + return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; + + newinfo = xt_alloc_table_info(tmp.size); + if (!newinfo) + return -ENOMEM; + + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), + tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + ret = translate_table(newinfo, loc_cpu_entry, &tmp); + if (ret != 0) + goto free_newinfo; + + duprintf("arp_tables: Translated table\n"); + + ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, + tmp.num_counters, tmp.counters); + if (ret) + goto free_newinfo_untrans; + return 0; + + free_newinfo_untrans: + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter); + free_newinfo: + xt_free_table_info(newinfo); + return ret; +} + +static int do_add_counters(struct net *net, const void __user *user, + unsigned int len, int compat) +{ + unsigned int i, curcpu; + struct xt_counters_info tmp; + struct xt_counters *paddc; + unsigned int num_counters; + const char *name; + int size; + void *ptmp; + struct xt_table *t; + const struct xt_table_info *private; + int ret = 0; + void *loc_cpu_entry; + struct arpt_entry *iter; + unsigned int addend; +#ifdef CONFIG_COMPAT + struct compat_xt_counters_info compat_tmp; + + if (compat) { + ptmp = &compat_tmp; + size = sizeof(struct compat_xt_counters_info); + } else +#endif + { + ptmp = &tmp; + size = sizeof(struct xt_counters_info); + } + + if (copy_from_user(ptmp, user, size) != 0) + return -EFAULT; + +#ifdef CONFIG_COMPAT + if (compat) { + num_counters = compat_tmp.num_counters; + name = compat_tmp.name; + } else +#endif + { + num_counters = tmp.num_counters; + name = tmp.name; + } + + if (len != size + num_counters * sizeof(struct xt_counters)) + return -EINVAL; + + paddc = vmalloc(len - size); + if 
(!paddc) + return -ENOMEM; + + if (copy_from_user(paddc, user + size, len - size) != 0) { + ret = -EFAULT; + goto free; + } + + t = xt_find_table_lock(net, NFPROTO_ARP, name); + if (!t || IS_ERR(t)) { + ret = t ? PTR_ERR(t) : -ENOENT; + goto free; + } + + local_bh_disable(); + private = t->private; + if (private->number != num_counters) { + ret = -EINVAL; + goto unlock_up_free; + } + + i = 0; + /* Choose the copy that is on our node */ + curcpu = smp_processor_id(); + loc_cpu_entry = private->entries[curcpu]; + addend = xt_write_recseq_begin(); + xt_entry_foreach(iter, loc_cpu_entry, private->size) { + ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + ++i; + } + xt_write_recseq_end(addend); + unlock_up_free: + local_bh_enable(); + xt_table_unlock(t); + module_put(t->me); + free: + vfree(paddc); + + return ret; +} + +#ifdef CONFIG_COMPAT +static inline void compat_release_entry(struct compat_arpt_entry *e) +{ + struct xt_entry_target *t; + + t = compat_arpt_get_target(e); + module_put(t->u.kernel.target->me); +} + +static inline int +check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, + struct xt_table_info *newinfo, + unsigned int *size, + const unsigned char *base, + const unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, + const char *name) +{ + struct xt_entry_target *t; + struct xt_target *target; + unsigned int entry_offset; + int ret, off, h; + + duprintf("check_compat_entry_size_and_hooks %p\n", e); + if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 || + (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) { + duprintf("Bad offset %p, limit = %p\n", e, limit); + return -EINVAL; + } + + if (e->next_offset < sizeof(struct compat_arpt_entry) + + sizeof(struct compat_xt_entry_target)) { + duprintf("checking: element %p size %u\n", + e, e->next_offset); + return -EINVAL; + } + + /* For purposes of check_entry casting the compat entry is fine */ + ret = check_entry((struct arpt_entry *)e, name); + if (ret) + return ret; + + off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); + entry_offset = (void *)e - (void *)base; + + t = compat_arpt_get_target(e); + target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, + t->u.user.revision); + if (IS_ERR(target)) { + duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", + t->u.user.name); + ret = PTR_ERR(target); + goto out; + } + t->u.kernel.target = target; + + off += xt_compat_target_offset(target); + *size += off; + ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off); + if (ret) + goto release_target; + + /* Check hooks & underflows */ + for (h = 0; h < NF_ARP_NUMHOOKS; h++) { + if ((unsigned char *)e - base == hook_entries[h]) + newinfo->hook_entry[h] = hook_entries[h]; + if ((unsigned char *)e - base == underflows[h]) + newinfo->underflow[h] = underflows[h]; + } + + /* Clear counters and comefrom */ + memset(&e->counters, 0, sizeof(e->counters)); + e->comefrom = 0; + return 0; + +release_target: + module_put(t->u.kernel.target->me); +out: + return ret; +} + +static int +compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr, + unsigned int *size, const char *name, + struct xt_table_info *newinfo, unsigned char *base) +{ + struct xt_entry_target *t; + struct xt_target *target; + struct arpt_entry *de; + unsigned int origsize; + int ret, h; + + ret = 0; + origsize = *size; + de = (struct arpt_entry *)*dstptr; + memcpy(de, e, sizeof(struct arpt_entry)); + memcpy(&de->counters, &e->counters, 
sizeof(e->counters)); + + *dstptr += sizeof(struct arpt_entry); + *size += sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); + + de->target_offset = e->target_offset - (origsize - *size); + t = compat_arpt_get_target(e); + target = t->u.kernel.target; + xt_compat_target_from_user(t, dstptr, size); + + de->next_offset = e->next_offset - (origsize - *size); + for (h = 0; h < NF_ARP_NUMHOOKS; h++) { + if ((unsigned char *)de - base < newinfo->hook_entry[h]) + newinfo->hook_entry[h] -= origsize - *size; + if ((unsigned char *)de - base < newinfo->underflow[h]) + newinfo->underflow[h] -= origsize - *size; + } + return ret; +} + +static int translate_compat_table(const char *name, + unsigned int valid_hooks, + struct xt_table_info **pinfo, + void **pentry0, + unsigned int total_size, + unsigned int number, + unsigned int *hook_entries, + unsigned int *underflows) +{ + unsigned int i, j; + struct xt_table_info *newinfo, *info; + void *pos, *entry0, *entry1; + struct compat_arpt_entry *iter0; + struct arpt_entry *iter1; + unsigned int size; + int ret = 0; + + info = *pinfo; + entry0 = *pentry0; + size = total_size; + info->number = number; + + /* Init all hooks to impossible value. */ + for (i = 0; i < NF_ARP_NUMHOOKS; i++) { + info->hook_entry[i] = 0xFFFFFFFF; + info->underflow[i] = 0xFFFFFFFF; + } + + duprintf("translate_compat_table: size %u\n", info->size); + j = 0; + xt_compat_lock(NFPROTO_ARP); + xt_compat_init_offsets(NFPROTO_ARP, number); + /* Walk through entries, checking offsets. */ + xt_entry_foreach(iter0, entry0, total_size) { + ret = check_compat_entry_size_and_hooks(iter0, info, &size, + entry0, + entry0 + total_size, + hook_entries, + underflows, + name); + if (ret != 0) + goto out_unlock; + ++j; + } + + ret = -EINVAL; + if (j != number) { + duprintf("translate_compat_table: %u not %u entries\n", + j, number); + goto out_unlock; + } + + /* Check hooks all assigned */ + for (i = 0; i < NF_ARP_NUMHOOKS; i++) { + /* Only hooks which are valid */ + if (!(valid_hooks & (1 << i))) + continue; + if (info->hook_entry[i] == 0xFFFFFFFF) { + duprintf("Invalid hook entry %u %u\n", + i, hook_entries[i]); + goto out_unlock; + } + if (info->underflow[i] == 0xFFFFFFFF) { + duprintf("Invalid underflow %u %u\n", + i, underflows[i]); + goto out_unlock; + } + } + + ret = -ENOMEM; + newinfo = xt_alloc_table_info(size); + if (!newinfo) + goto out_unlock; + + newinfo->number = number; + for (i = 0; i < NF_ARP_NUMHOOKS; i++) { + newinfo->hook_entry[i] = info->hook_entry[i]; + newinfo->underflow[i] = info->underflow[i]; + } + entry1 = newinfo->entries[raw_smp_processor_id()]; + pos = entry1; + size = total_size; + xt_entry_foreach(iter0, entry0, total_size) { + ret = compat_copy_entry_from_user(iter0, &pos, &size, + name, newinfo, entry1); + if (ret != 0) + break; + } + xt_compat_flush_offsets(NFPROTO_ARP); + xt_compat_unlock(NFPROTO_ARP); + if (ret) + goto free_newinfo; + + ret = -ELOOP; + if (!mark_source_chains(newinfo, valid_hooks, entry1)) + goto free_newinfo; + + i = 0; + xt_entry_foreach(iter1, entry1, newinfo->size) { + ret = check_target(iter1, name); + if (ret != 0) + break; + ++i; + if (strcmp(arpt_get_target(iter1)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; + } + if (ret) { + /* + * The first i matches need cleanup_entry (calls ->destroy) + * because they had called ->check already. The other j-i + * entries need only release. 
+ */ + int skip = i; + j -= i; + xt_entry_foreach(iter0, entry0, newinfo->size) { + if (skip-- > 0) + continue; + if (j-- == 0) + break; + compat_release_entry(iter0); + } + xt_entry_foreach(iter1, entry1, newinfo->size) { + if (i-- == 0) + break; + cleanup_entry(iter1); + } + xt_free_table_info(newinfo); + return ret; + } + + /* And one copy for every other CPU */ + for_each_possible_cpu(i) + if (newinfo->entries[i] && newinfo->entries[i] != entry1) + memcpy(newinfo->entries[i], entry1, newinfo->size); + + *pinfo = newinfo; + *pentry0 = entry1; + xt_free_table_info(info); + return 0; + +free_newinfo: + xt_free_table_info(newinfo); +out: + xt_entry_foreach(iter0, entry0, total_size) { + if (j-- == 0) + break; + compat_release_entry(iter0); + } + return ret; +out_unlock: + xt_compat_flush_offsets(NFPROTO_ARP); + xt_compat_unlock(NFPROTO_ARP); + goto out; +} + +struct compat_arpt_replace { + char name[XT_TABLE_MAXNAMELEN]; + u32 valid_hooks; + u32 num_entries; + u32 size; + u32 hook_entry[NF_ARP_NUMHOOKS]; + u32 underflow[NF_ARP_NUMHOOKS]; + u32 num_counters; + compat_uptr_t counters; + struct compat_arpt_entry entries[0]; +}; + +static int compat_do_replace(struct net *net, void __user *user, + unsigned int len) +{ + int ret; + struct compat_arpt_replace tmp; + struct xt_table_info *newinfo; + void *loc_cpu_entry; + struct arpt_entry *iter; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + /* overflow check */ + if (tmp.size >= INT_MAX / num_possible_cpus()) + return -ENOMEM; + if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) + return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; + + newinfo = xt_alloc_table_info(tmp.size); + if (!newinfo) + return -ENOMEM; + + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + ret = translate_compat_table(tmp.name, tmp.valid_hooks, + &newinfo, &loc_cpu_entry, tmp.size, + tmp.num_entries, tmp.hook_entry, + tmp.underflow); + if (ret != 0) + goto free_newinfo; + + duprintf("compat_do_replace: Translated table\n"); + + ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, + tmp.num_counters, compat_ptr(tmp.counters)); + if (ret) + goto free_newinfo_untrans; + return 0; + + free_newinfo_untrans: + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter); + free_newinfo: + xt_free_table_info(newinfo); + return ret; +} + +static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, + unsigned int len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case ARPT_SO_SET_REPLACE: + ret = compat_do_replace(sock_net(sk), user, len); + break; + + case ARPT_SO_SET_ADD_COUNTERS: + ret = do_add_counters(sock_net(sk), user, len, 1); + break; + + default: + duprintf("do_arpt_set_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr, + compat_uint_t *size, + struct xt_counters *counters, + unsigned int i) +{ + struct xt_entry_target *t; + struct compat_arpt_entry __user *ce; + u_int16_t target_offset, next_offset; + compat_uint_t origsize; + int ret; + + origsize = *size; + ce = (struct compat_arpt_entry __user *)*dstptr; + if (copy_to_user(ce, e, sizeof(struct arpt_entry)) != 0 || + copy_to_user(&ce->counters, &counters[i], + sizeof(counters[i])) != 0) + return -EFAULT; + + *dstptr += 
sizeof(struct compat_arpt_entry); + *size -= sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); + + target_offset = e->target_offset - (origsize - *size); + + t = arpt_get_target(e); + ret = xt_compat_target_to_user(t, dstptr, size); + if (ret) + return ret; + next_offset = e->next_offset - (origsize - *size); + if (put_user(target_offset, &ce->target_offset) != 0 || + put_user(next_offset, &ce->next_offset) != 0) + return -EFAULT; + return 0; +} + +static int compat_copy_entries_to_user(unsigned int total_size, + struct xt_table *table, + void __user *userptr) +{ + struct xt_counters *counters; + const struct xt_table_info *private = table->private; + void __user *pos; + unsigned int size; + int ret = 0; + void *loc_cpu_entry; + unsigned int i = 0; + struct arpt_entry *iter; + + counters = alloc_counters(table); + if (IS_ERR(counters)) + return PTR_ERR(counters); + + /* choose the copy on our node/cpu */ + loc_cpu_entry = private->entries[raw_smp_processor_id()]; + pos = userptr; + size = total_size; + xt_entry_foreach(iter, loc_cpu_entry, total_size) { + ret = compat_copy_entry_to_user(iter, &pos, + &size, counters, i++); + if (ret != 0) + break; + } + vfree(counters); + return ret; +} + +struct compat_arpt_get_entries { + char name[XT_TABLE_MAXNAMELEN]; + compat_uint_t size; + struct compat_arpt_entry entrytable[0]; +}; + +static int compat_get_entries(struct net *net, + struct compat_arpt_get_entries __user *uptr, + int *len) +{ + int ret; + struct compat_arpt_get_entries get; + struct xt_table *t; + + if (*len < sizeof(get)) { + duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get)); + return -EINVAL; + } + if (copy_from_user(&get, uptr, sizeof(get)) != 0) + return -EFAULT; + if (*len != sizeof(struct compat_arpt_get_entries) + get.size) { + duprintf("compat_get_entries: %u != %zu\n", + *len, sizeof(get) + get.size); + return -EINVAL; + } + + xt_compat_lock(NFPROTO_ARP); + t = xt_find_table_lock(net, NFPROTO_ARP, get.name); + if (t && !IS_ERR(t)) { + const struct xt_table_info *private = t->private; + struct xt_table_info info; + + duprintf("t->private->number = %u\n", private->number); + ret = compat_table_info(private, &info); + if (!ret && get.size == info.size) { + ret = compat_copy_entries_to_user(private->size, + t, uptr->entrytable); + } else if (!ret) { + duprintf("compat_get_entries: I've got %u not %u!\n", + private->size, get.size); + ret = -EAGAIN; + } + xt_compat_flush_offsets(NFPROTO_ARP); + module_put(t->me); + xt_table_unlock(t); + } else + ret = t ? 
PTR_ERR(t) : -ENOENT; + + xt_compat_unlock(NFPROTO_ARP); + return ret; +} + +static int do_arpt_get_ctl(struct sock *, int, void __user *, int *); + +static int compat_do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, + int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case ARPT_SO_GET_INFO: + ret = get_info(sock_net(sk), user, len, 1); + break; + case ARPT_SO_GET_ENTRIES: + ret = compat_get_entries(sock_net(sk), user, len); + break; + default: + ret = do_arpt_get_ctl(sk, cmd, user, len); + } + return ret; +} +#endif + +static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case ARPT_SO_SET_REPLACE: + ret = do_replace(sock_net(sk), user, len); + break; + + case ARPT_SO_SET_ADD_COUNTERS: + ret = do_add_counters(sock_net(sk), user, len, 0); + break; + + default: + duprintf("do_arpt_set_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case ARPT_SO_GET_INFO: + ret = get_info(sock_net(sk), user, len, 0); + break; + + case ARPT_SO_GET_ENTRIES: + ret = get_entries(sock_net(sk), user, len); + break; + + case ARPT_SO_GET_REVISION_TARGET: { + struct xt_get_revision rev; + + if (*len != sizeof(rev)) { + ret = -EINVAL; + break; + } + if (copy_from_user(&rev, user, sizeof(rev)) != 0) { + ret = -EFAULT; + break; + } + rev.name[sizeof(rev.name)-1] = 0; + + try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name, + rev.revision, 1, &ret), + "arpt_%s", rev.name); + break; + } + + default: + duprintf("do_arpt_get_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +struct xt_table *arpt_register_table(struct net *net, + const struct xt_table *table, + const struct arpt_replace *repl) +{ + int ret; + struct xt_table_info *newinfo; + struct xt_table_info bootstrap = {0}; + void *loc_cpu_entry; + struct xt_table *new_table; + + newinfo = xt_alloc_table_info(repl->size); + if (!newinfo) { + ret = -ENOMEM; + goto out; + } + + /* choose the copy on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + memcpy(loc_cpu_entry, repl->entries, repl->size); + + ret = translate_table(newinfo, loc_cpu_entry, repl); + duprintf("arpt_register_table: translate table gives %d\n", ret); + if (ret != 0) + goto out_free; + + new_table = xt_register_table(net, table, &bootstrap, newinfo); + if (IS_ERR(new_table)) { + ret = PTR_ERR(new_table); + goto out_free; + } + return new_table; + +out_free: + xt_free_table_info(newinfo); +out: + return ERR_PTR(ret); +} + +void arpt_unregister_table(struct xt_table *table) +{ + struct xt_table_info *private; + void *loc_cpu_entry; + struct module *table_owner = table->me; + struct arpt_entry *iter; + + private = xt_unregister_table(table); + + /* Decrease module usage counts and free resources */ + loc_cpu_entry = private->entries[raw_smp_processor_id()]; + xt_entry_foreach(iter, loc_cpu_entry, private->size) + cleanup_entry(iter); + if (private->number > private->initial_entries) + module_put(table_owner); + xt_free_table_info(private); +} + +/* The built-in targets: standard (NULL) and error. 
*/ +static struct xt_target arpt_builtin_tg[] __read_mostly = { + { + .name = XT_STANDARD_TARGET, + .targetsize = sizeof(int), + .family = NFPROTO_ARP, +#ifdef CONFIG_COMPAT + .compatsize = sizeof(compat_int_t), + .compat_from_user = compat_standard_from_user, + .compat_to_user = compat_standard_to_user, +#endif + }, + { + .name = XT_ERROR_TARGET, + .target = arpt_error, + .targetsize = XT_FUNCTION_MAXNAMELEN, + .family = NFPROTO_ARP, + }, +}; + +static struct nf_sockopt_ops arpt_sockopts = { + .pf = PF_INET, + .set_optmin = ARPT_BASE_CTL, + .set_optmax = ARPT_SO_SET_MAX+1, + .set = do_arpt_set_ctl, +#ifdef CONFIG_COMPAT + .compat_set = compat_do_arpt_set_ctl, +#endif + .get_optmin = ARPT_BASE_CTL, + .get_optmax = ARPT_SO_GET_MAX+1, + .get = do_arpt_get_ctl, +#ifdef CONFIG_COMPAT + .compat_get = compat_do_arpt_get_ctl, +#endif + .owner = THIS_MODULE, +}; + +static int __net_init arp_tables_net_init(struct net *net) +{ + return xt_proto_init(net, NFPROTO_ARP); +} + +static void __net_exit arp_tables_net_exit(struct net *net) +{ + xt_proto_fini(net, NFPROTO_ARP); +} + +static struct pernet_operations arp_tables_net_ops = { + .init = arp_tables_net_init, + .exit = arp_tables_net_exit, +}; + +static int __init arp_tables_init(void) +{ + int ret; + + ret = register_pernet_subsys(&arp_tables_net_ops); + if (ret < 0) + goto err1; + + /* No one else will be downing sem now, so we won't sleep */ + ret = xt_register_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg)); + if (ret < 0) + goto err2; + + /* Register setsockopt */ + ret = nf_register_sockopt(&arpt_sockopts); + if (ret < 0) + goto err4; + + printk(KERN_INFO "arp_tables: (C) 2002 David S. Miller\n"); + return 0; + +err4: + xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg)); +err2: + unregister_pernet_subsys(&arp_tables_net_ops); +err1: + return ret; +} + +static void __exit arp_tables_fini(void) +{ + nf_unregister_sockopt(&arpt_sockopts); + xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg)); + unregister_pernet_subsys(&arp_tables_net_ops); +} + +EXPORT_SYMBOL(arpt_register_table); +EXPORT_SYMBOL(arpt_unregister_table); +EXPORT_SYMBOL(arpt_do_table); + +module_init(arp_tables_init); +module_exit(arp_tables_fini); diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c new file mode 100644 index 00000000..a5e52a9f --- /dev/null +++ b/net/ipv4/netfilter/arpt_mangle.c @@ -0,0 +1,91 @@ +/* module that allows mangling of the arp payload */ +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Bart De Schuymer "); +MODULE_DESCRIPTION("arptables arp payload mangle target"); + +static unsigned int +target(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct arpt_mangle *mangle = par->targinfo; + const struct arphdr *arp; + unsigned char *arpptr; + int pln, hln; + + if (!skb_make_writable(skb, skb->len)) + return NF_DROP; + + arp = arp_hdr(skb); + arpptr = skb_network_header(skb) + sizeof(*arp); + pln = arp->ar_pln; + hln = arp->ar_hln; + /* We assume that pln and hln were checked in the match */ + if (mangle->flags & ARPT_MANGLE_SDEV) { + if (ARPT_DEV_ADDR_LEN_MAX < hln || + (arpptr + hln > skb_tail_pointer(skb))) + return NF_DROP; + memcpy(arpptr, mangle->src_devaddr, hln); + } + arpptr += hln; + if (mangle->flags & ARPT_MANGLE_SIP) { + if (ARPT_MANGLE_ADDR_LEN_MAX < pln || + (arpptr + pln > skb_tail_pointer(skb))) + return NF_DROP; + memcpy(arpptr, &mangle->u_s.src_ip, pln); + } + arpptr += pln; + if (mangle->flags & ARPT_MANGLE_TDEV) { + 
if (ARPT_DEV_ADDR_LEN_MAX < hln || + (arpptr + hln > skb_tail_pointer(skb))) + return NF_DROP; + memcpy(arpptr, mangle->tgt_devaddr, hln); + } + arpptr += hln; + if (mangle->flags & ARPT_MANGLE_TIP) { + if (ARPT_MANGLE_ADDR_LEN_MAX < pln || + (arpptr + pln > skb_tail_pointer(skb))) + return NF_DROP; + memcpy(arpptr, &mangle->u_t.tgt_ip, pln); + } + return mangle->target; +} + +static int checkentry(const struct xt_tgchk_param *par) +{ + const struct arpt_mangle *mangle = par->targinfo; + + if (mangle->flags & ~ARPT_MANGLE_MASK || + !(mangle->flags & ARPT_MANGLE_MASK)) + return -EINVAL; + + if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT && + mangle->target != XT_CONTINUE) + return -EINVAL; + return 0; +} + +static struct xt_target arpt_mangle_reg __read_mostly = { + .name = "mangle", + .family = NFPROTO_ARP, + .target = target, + .targetsize = sizeof(struct arpt_mangle), + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init arpt_mangle_init(void) +{ + return xt_register_target(&arpt_mangle_reg); +} + +static void __exit arpt_mangle_fini(void) +{ + xt_unregister_target(&arpt_mangle_reg); +} + +module_init(arpt_mangle_init); +module_exit(arpt_mangle_fini); diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c new file mode 100644 index 00000000..79ca5e70 --- /dev/null +++ b/net/ipv4/netfilter/arptable_filter.c @@ -0,0 +1,93 @@ +/* + * Filtering ARP tables module. + * + * Copyright (C) 2002 David S. Miller (davem@redhat.com) + * + */ + +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("David S. Miller "); +MODULE_DESCRIPTION("arptables filter table"); + +#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \ + (1 << NF_ARP_FORWARD)) + +static const struct xt_table packet_filter = { + .name = "filter", + .valid_hooks = FILTER_VALID_HOOKS, + .me = THIS_MODULE, + .af = NFPROTO_ARP, + .priority = NF_IP_PRI_FILTER, +}; + +/* The work comes in here from netfilter.c */ +static unsigned int +arptable_filter_hook(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + const struct net *net = dev_net((in != NULL) ? 
in : out); + + return arpt_do_table(skb, hook, in, out, net->ipv4.arptable_filter); +} + +static struct nf_hook_ops *arpfilter_ops __read_mostly; + +static int __net_init arptable_filter_net_init(struct net *net) +{ + struct arpt_replace *repl; + + repl = arpt_alloc_initial_table(&packet_filter); + if (repl == NULL) + return -ENOMEM; + net->ipv4.arptable_filter = + arpt_register_table(net, &packet_filter, repl); + kfree(repl); + if (IS_ERR(net->ipv4.arptable_filter)) + return PTR_ERR(net->ipv4.arptable_filter); + return 0; +} + +static void __net_exit arptable_filter_net_exit(struct net *net) +{ + arpt_unregister_table(net->ipv4.arptable_filter); +} + +static struct pernet_operations arptable_filter_net_ops = { + .init = arptable_filter_net_init, + .exit = arptable_filter_net_exit, +}; + +static int __init arptable_filter_init(void) +{ + int ret; + + ret = register_pernet_subsys(&arptable_filter_net_ops); + if (ret < 0) + return ret; + + arpfilter_ops = xt_hook_link(&packet_filter, arptable_filter_hook); + if (IS_ERR(arpfilter_ops)) { + ret = PTR_ERR(arpfilter_ops); + goto cleanup_table; + } + return ret; + +cleanup_table: + unregister_pernet_subsys(&arptable_filter_net_ops); + return ret; +} + +static void __exit arptable_filter_fini(void) +{ + xt_hook_unlink(&packet_filter, arpfilter_ops); + unregister_pernet_subsys(&arptable_filter_net_ops); +} + +module_init(arptable_filter_init); +module_exit(arptable_filter_fini); diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c new file mode 100644 index 00000000..5c9b9d96 --- /dev/null +++ b/net/ipv4/netfilter/ip_queue.c @@ -0,0 +1,639 @@ +/* + * This is a module which is used for queueing IPv4 packets and + * communicating with userspace via netlink. + * + * (C) 2000-2002 James Morris + * (C) 2003-2005 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IPQ_QMAX_DEFAULT 1024 +#define IPQ_PROC_FS_NAME "ip_queue" +#define NET_IPQ_QMAX 2088 +#define NET_IPQ_QMAX_NAME "ip_queue_maxlen" + +typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long); + +static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE; +static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT; +static DEFINE_SPINLOCK(queue_lock); +static int peer_pid __read_mostly; +static unsigned int copy_range __read_mostly; +static unsigned int queue_total; +static unsigned int queue_dropped = 0; +static unsigned int queue_user_dropped = 0; +static struct sock *ipqnl __read_mostly; +static LIST_HEAD(queue_list); +static DEFINE_MUTEX(ipqnl_mutex); + +static inline void +__ipq_enqueue_entry(struct nf_queue_entry *entry) +{ + list_add_tail(&entry->list, &queue_list); + queue_total++; +} + +static inline int +__ipq_set_mode(unsigned char mode, unsigned int range) +{ + int status = 0; + + switch(mode) { + case IPQ_COPY_NONE: + case IPQ_COPY_META: + copy_mode = mode; + copy_range = 0; + break; + + case IPQ_COPY_PACKET: + if (range > 0xFFFF) + range = 0xFFFF; + copy_range = range; + copy_mode = mode; + break; + + default: + status = -EINVAL; + + } + return status; +} + +static void __ipq_flush(ipq_cmpfn cmpfn, unsigned long data); + +static inline void +__ipq_reset(void) +{ + peer_pid = 0; + net_disable_timestamp(); + __ipq_set_mode(IPQ_COPY_NONE, 0); + __ipq_flush(NULL, 0); +} + +static struct nf_queue_entry * +ipq_find_dequeue_entry(unsigned long id) +{ + struct nf_queue_entry *entry = NULL, *i; + + spin_lock_bh(&queue_lock); + + list_for_each_entry(i, &queue_list, list) { + if ((unsigned long)i == id) { + entry = i; + break; + } + } + + if (entry) { + list_del(&entry->list); + queue_total--; + } + + spin_unlock_bh(&queue_lock); + return entry; +} + +static void +__ipq_flush(ipq_cmpfn cmpfn, unsigned long data) +{ + struct nf_queue_entry *entry, *next; + + list_for_each_entry_safe(entry, next, &queue_list, list) { + if (!cmpfn || cmpfn(entry, data)) { + list_del(&entry->list); + queue_total--; + nf_reinject(entry, NF_DROP); + } + } +} + +static void +ipq_flush(ipq_cmpfn cmpfn, unsigned long data) +{ + spin_lock_bh(&queue_lock); + __ipq_flush(cmpfn, data); + spin_unlock_bh(&queue_lock); +} + +static struct sk_buff * +ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) +{ + sk_buff_data_t old_tail; + size_t size = 0; + size_t data_len = 0; + struct sk_buff *skb; + struct ipq_packet_msg *pmsg; + struct nlmsghdr *nlh; + struct timeval tv; + + switch (ACCESS_ONCE(copy_mode)) { + case IPQ_COPY_META: + case IPQ_COPY_NONE: + size = NLMSG_SPACE(sizeof(*pmsg)); + break; + + case IPQ_COPY_PACKET: + if (entry->skb->ip_summed == CHECKSUM_PARTIAL && + (*errp = skb_checksum_help(entry->skb))) + return NULL; + + data_len = ACCESS_ONCE(copy_range); + if (data_len == 0 || data_len > entry->skb->len) + data_len = entry->skb->len; + + size = NLMSG_SPACE(sizeof(*pmsg) + data_len); + break; + + default: + *errp = -EINVAL; + return NULL; + } + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + goto nlmsg_failure; + + old_tail = skb->tail; + nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh)); + pmsg = NLMSG_DATA(nlh); + memset(pmsg, 0, sizeof(*pmsg)); + + pmsg->packet_id = (unsigned long )entry; + pmsg->data_len = data_len; + tv 
= ktime_to_timeval(entry->skb->tstamp); + pmsg->timestamp_sec = tv.tv_sec; + pmsg->timestamp_usec = tv.tv_usec; + pmsg->mark = entry->skb->mark; + pmsg->hook = entry->hook; + pmsg->hw_protocol = entry->skb->protocol; + + if (entry->indev) + strcpy(pmsg->indev_name, entry->indev->name); + else + pmsg->indev_name[0] = '\0'; + + if (entry->outdev) + strcpy(pmsg->outdev_name, entry->outdev->name); + else + pmsg->outdev_name[0] = '\0'; + + if (entry->indev && entry->skb->dev && + entry->skb->mac_header != entry->skb->network_header) { + pmsg->hw_type = entry->skb->dev->type; + pmsg->hw_addrlen = dev_parse_header(entry->skb, + pmsg->hw_addr); + } + + if (data_len) + if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len)) + BUG(); + + nlh->nlmsg_len = skb->tail - old_tail; + return skb; + +nlmsg_failure: + *errp = -EINVAL; + printk(KERN_ERR "ip_queue: error creating packet message\n"); + return NULL; +} + +static int +ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) +{ + int status = -EINVAL; + struct sk_buff *nskb; + + if (copy_mode == IPQ_COPY_NONE) + return -EAGAIN; + + nskb = ipq_build_packet_message(entry, &status); + if (nskb == NULL) + return status; + + spin_lock_bh(&queue_lock); + + if (!peer_pid) + goto err_out_free_nskb; + + if (queue_total >= queue_maxlen) { + queue_dropped++; + status = -ENOSPC; + if (net_ratelimit()) + printk (KERN_WARNING "ip_queue: full at %d entries, " + "dropping packets(s). Dropped: %d\n", queue_total, + queue_dropped); + goto err_out_free_nskb; + } + + /* netlink_unicast will either free the nskb or attach it to a socket */ + status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT); + if (status < 0) { + queue_user_dropped++; + goto err_out_unlock; + } + + __ipq_enqueue_entry(entry); + + spin_unlock_bh(&queue_lock); + return status; + +err_out_free_nskb: + kfree_skb(nskb); + +err_out_unlock: + spin_unlock_bh(&queue_lock); + return status; +} + +static int +ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct nf_queue_entry *e) +{ + int diff; + struct iphdr *user_iph = (struct iphdr *)v->payload; + struct sk_buff *nskb; + + if (v->data_len < sizeof(*user_iph)) + return 0; + diff = v->data_len - e->skb->len; + if (diff < 0) { + if (pskb_trim(e->skb, v->data_len)) + return -ENOMEM; + } else if (diff > 0) { + if (v->data_len > 0xFFFF) + return -EINVAL; + if (diff > skb_tailroom(e->skb)) { + nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), + diff, GFP_ATOMIC); + if (!nskb) { + printk(KERN_WARNING "ip_queue: error " + "in mangle, dropping packet\n"); + return -ENOMEM; + } + kfree_skb(e->skb); + e->skb = nskb; + } + skb_put(e->skb, diff); + } + if (!skb_make_writable(e->skb, v->data_len)) + return -ENOMEM; + skb_copy_to_linear_data(e->skb, v->payload, v->data_len); + e->skb->ip_summed = CHECKSUM_NONE; + + return 0; +} + +static int +ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len) +{ + struct nf_queue_entry *entry; + + if (vmsg->value > NF_MAX_VERDICT) + return -EINVAL; + + entry = ipq_find_dequeue_entry(vmsg->id); + if (entry == NULL) + return -ENOENT; + else { + int verdict = vmsg->value; + + if (vmsg->data_len && vmsg->data_len == len) + if (ipq_mangle_ipv4(vmsg, entry) < 0) + verdict = NF_DROP; + + nf_reinject(entry, verdict); + return 0; + } +} + +static int +ipq_set_mode(unsigned char mode, unsigned int range) +{ + int status; + + spin_lock_bh(&queue_lock); + status = __ipq_set_mode(mode, range); + spin_unlock_bh(&queue_lock); + return status; +} + +static int +ipq_receive_peer(struct ipq_peer_msg *pmsg, + unsigned char 
type, unsigned int len) +{ + int status = 0; + + if (len < sizeof(*pmsg)) + return -EINVAL; + + switch (type) { + case IPQM_MODE: + status = ipq_set_mode(pmsg->msg.mode.value, + pmsg->msg.mode.range); + break; + + case IPQM_VERDICT: + if (pmsg->msg.verdict.value > NF_MAX_VERDICT) + status = -EINVAL; + else + status = ipq_set_verdict(&pmsg->msg.verdict, + len - sizeof(*pmsg)); + break; + default: + status = -EINVAL; + } + return status; +} + +static int +dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) +{ + if (entry->indev) + if (entry->indev->ifindex == ifindex) + return 1; + if (entry->outdev) + if (entry->outdev->ifindex == ifindex) + return 1; +#ifdef CONFIG_BRIDGE_NETFILTER + if (entry->skb->nf_bridge) { + if (entry->skb->nf_bridge->physindev && + entry->skb->nf_bridge->physindev->ifindex == ifindex) + return 1; + if (entry->skb->nf_bridge->physoutdev && + entry->skb->nf_bridge->physoutdev->ifindex == ifindex) + return 1; + } +#endif + return 0; +} + +static void +ipq_dev_drop(int ifindex) +{ + ipq_flush(dev_cmp, ifindex); +} + +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) + +static inline void +__ipq_rcv_skb(struct sk_buff *skb) +{ + int status, type, pid, flags; + unsigned int nlmsglen, skblen; + struct nlmsghdr *nlh; + + skblen = skb->len; + if (skblen < sizeof(*nlh)) + return; + + nlh = nlmsg_hdr(skb); + nlmsglen = nlh->nlmsg_len; + if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen) + return; + + pid = nlh->nlmsg_pid; + flags = nlh->nlmsg_flags; + + if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI) + RCV_SKB_FAIL(-EINVAL); + + if (flags & MSG_TRUNC) + RCV_SKB_FAIL(-ECOMM); + + type = nlh->nlmsg_type; + if (type < NLMSG_NOOP || type >= IPQM_MAX) + RCV_SKB_FAIL(-EINVAL); + + if (type <= IPQM_BASE) + return; + + if (security_netlink_recv(skb, CAP_NET_ADMIN)) + RCV_SKB_FAIL(-EPERM); + + spin_lock_bh(&queue_lock); + + if (peer_pid) { + if (peer_pid != pid) { + spin_unlock_bh(&queue_lock); + RCV_SKB_FAIL(-EBUSY); + } + } else { + net_enable_timestamp(); + peer_pid = pid; + } + + spin_unlock_bh(&queue_lock); + + status = ipq_receive_peer(NLMSG_DATA(nlh), type, + nlmsglen - NLMSG_LENGTH(0)); + if (status < 0) + RCV_SKB_FAIL(status); + + if (flags & NLM_F_ACK) + netlink_ack(skb, nlh, 0); +} + +static void +ipq_rcv_skb(struct sk_buff *skb) +{ + mutex_lock(&ipqnl_mutex); + __ipq_rcv_skb(skb); + mutex_unlock(&ipqnl_mutex); +} + +static int +ipq_rcv_dev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + /* Drop any packets associated with the downed device */ + if (event == NETDEV_DOWN) + ipq_dev_drop(dev->ifindex); + return NOTIFY_DONE; +} + +static struct notifier_block ipq_dev_notifier = { + .notifier_call = ipq_rcv_dev_event, +}; + +static int +ipq_rcv_nl_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netlink_notify *n = ptr; + + if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) { + spin_lock_bh(&queue_lock); + if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid)) + __ipq_reset(); + spin_unlock_bh(&queue_lock); + } + return NOTIFY_DONE; +} + +static struct notifier_block ipq_nl_notifier = { + .notifier_call = ipq_rcv_nl_event, +}; + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *ipq_sysctl_header; + +static ctl_table ipq_table[] = { + { + .procname = NET_IPQ_QMAX_NAME, + .data = &queue_maxlen, + .maxlen = sizeof(queue_maxlen), + .mode = 0644, + 
.proc_handler = proc_dointvec + }, + { } +}; +#endif + +#ifdef CONFIG_PROC_FS +static int ip_queue_show(struct seq_file *m, void *v) +{ + spin_lock_bh(&queue_lock); + + seq_printf(m, + "Peer PID : %d\n" + "Copy mode : %hu\n" + "Copy range : %u\n" + "Queue length : %u\n" + "Queue max. length : %u\n" + "Queue dropped : %u\n" + "Netlink dropped : %u\n", + peer_pid, + copy_mode, + copy_range, + queue_total, + queue_maxlen, + queue_dropped, + queue_user_dropped); + + spin_unlock_bh(&queue_lock); + return 0; +} + +static int ip_queue_open(struct inode *inode, struct file *file) +{ + return single_open(file, ip_queue_show, NULL); +} + +static const struct file_operations ip_queue_proc_fops = { + .open = ip_queue_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .owner = THIS_MODULE, +}; +#endif + +static const struct nf_queue_handler nfqh = { + .name = "ip_queue", + .outfn = &ipq_enqueue_packet, +}; + +static int __init ip_queue_init(void) +{ + int status = -ENOMEM; + struct proc_dir_entry *proc __maybe_unused; + + netlink_register_notifier(&ipq_nl_notifier); + ipqnl = netlink_kernel_create(&init_net, NETLINK_FIREWALL, 0, + ipq_rcv_skb, NULL, THIS_MODULE); + if (ipqnl == NULL) { + printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); + goto cleanup_netlink_notifier; + } + +#ifdef CONFIG_PROC_FS + proc = proc_create(IPQ_PROC_FS_NAME, 0, init_net.proc_net, + &ip_queue_proc_fops); + if (!proc) { + printk(KERN_ERR "ip_queue: failed to create proc entry\n"); + goto cleanup_ipqnl; + } +#endif + register_netdevice_notifier(&ipq_dev_notifier); +#ifdef CONFIG_SYSCTL + ipq_sysctl_header = register_sysctl_paths(net_ipv4_ctl_path, ipq_table); +#endif + status = nf_register_queue_handler(NFPROTO_IPV4, &nfqh); + if (status < 0) { + printk(KERN_ERR "ip_queue: failed to register queue handler\n"); + goto cleanup_sysctl; + } + return status; + +cleanup_sysctl: +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(ipq_sysctl_header); +#endif + unregister_netdevice_notifier(&ipq_dev_notifier); + proc_net_remove(&init_net, IPQ_PROC_FS_NAME); +cleanup_ipqnl: __maybe_unused + netlink_kernel_release(ipqnl); + mutex_lock(&ipqnl_mutex); + mutex_unlock(&ipqnl_mutex); + +cleanup_netlink_notifier: + netlink_unregister_notifier(&ipq_nl_notifier); + return status; +} + +static void __exit ip_queue_fini(void) +{ + nf_unregister_queue_handlers(&nfqh); + + ipq_flush(NULL, 0); + +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(ipq_sysctl_header); +#endif + unregister_netdevice_notifier(&ipq_dev_notifier); + proc_net_remove(&init_net, IPQ_PROC_FS_NAME); + + netlink_kernel_release(ipqnl); + mutex_lock(&ipqnl_mutex); + mutex_unlock(&ipqnl_mutex); + + netlink_unregister_notifier(&ipq_nl_notifier); +} + +MODULE_DESCRIPTION("IPv4 packet queue handler"); +MODULE_AUTHOR("James Morris "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_FIREWALL); + +module_init(ip_queue_init); +module_exit(ip_queue_fini); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c new file mode 100644 index 00000000..24e556e8 --- /dev/null +++ b/net/ipv4/netfilter/ip_tables.c @@ -0,0 +1,2271 @@ +/* + * Packet matching code. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2005 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "../../netfilter/xt_repldata.h" + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("IPv4 packet filter"); + +/*#define DEBUG_IP_FIREWALL*/ +/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */ +/*#define DEBUG_IP_FIREWALL_USER*/ + +#ifdef DEBUG_IP_FIREWALL +#define dprintf(format, args...) pr_info(format , ## args) +#else +#define dprintf(format, args...) +#endif + +#ifdef DEBUG_IP_FIREWALL_USER +#define duprintf(format, args...) pr_info(format , ## args) +#else +#define duprintf(format, args...) +#endif + +#ifdef CONFIG_NETFILTER_DEBUG +#define IP_NF_ASSERT(x) WARN_ON(!(x)) +#else +#define IP_NF_ASSERT(x) +#endif + +#if 0 +/* All the better to debug you with... */ +#define static +#define inline +#endif + +void *ipt_alloc_initial_table(const struct xt_table *info) +{ + return xt_alloc_initial_table(ipt, IPT); +} +EXPORT_SYMBOL_GPL(ipt_alloc_initial_table); + +/* Returns whether matches rule or not. */ +/* Performance critical - called for every packet */ +static inline bool +ip_packet_match(const struct iphdr *ip, + const char *indev, + const char *outdev, + const struct ipt_ip *ipinfo, + int isfrag) +{ + unsigned long ret; + +#define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg))) + + if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr, + IPT_INV_SRCIP) || + FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr, + IPT_INV_DSTIP)) { + dprintf("Source or dest mismatch.\n"); + + dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n", + &ip->saddr, &ipinfo->smsk.s_addr, &ipinfo->src.s_addr, + ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : ""); + dprintf("DST: %pI4 Mask: %pI4 Target: %pI4.%s\n", + &ip->daddr, &ipinfo->dmsk.s_addr, &ipinfo->dst.s_addr, + ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : ""); + return false; + } + + ret = ifname_compare_aligned(indev, ipinfo->iniface, ipinfo->iniface_mask); + + if (FWINV(ret != 0, IPT_INV_VIA_IN)) { + dprintf("VIA in mismatch (%s vs %s).%s\n", + indev, ipinfo->iniface, + ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":""); + return false; + } + + ret = ifname_compare_aligned(outdev, ipinfo->outiface, ipinfo->outiface_mask); + + if (FWINV(ret != 0, IPT_INV_VIA_OUT)) { + dprintf("VIA out mismatch (%s vs %s).%s\n", + outdev, ipinfo->outiface, + ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":""); + return false; + } + + /* Check specific protocol */ + if (ipinfo->proto && + FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) { + dprintf("Packet protocol %hi does not match %hi.%s\n", + ip->protocol, ipinfo->proto, + ipinfo->invflags&IPT_INV_PROTO ? " (INV)":""); + return false; + } + + /* If we have a fragment rule but the packet is not a fragment + * then we return zero */ + if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) { + dprintf("Fragment rule but not fragment.%s\n", + ipinfo->invflags & IPT_INV_FRAG ? 
" (INV)" : ""); + return false; + } + + return true; +} + +static bool +ip_checkentry(const struct ipt_ip *ip) +{ + if (ip->flags & ~IPT_F_MASK) { + duprintf("Unknown flag bits set: %08X\n", + ip->flags & ~IPT_F_MASK); + return false; + } + if (ip->invflags & ~IPT_INV_MASK) { + duprintf("Unknown invflag bits set: %08X\n", + ip->invflags & ~IPT_INV_MASK); + return false; + } + return true; +} + +static unsigned int +ipt_error(struct sk_buff *skb, const struct xt_action_param *par) +{ + if (net_ratelimit()) + pr_info("error: `%s'\n", (const char *)par->targinfo); + + return NF_DROP; +} + +/* Performance critical */ +static inline struct ipt_entry * +get_entry(const void *base, unsigned int offset) +{ + return (struct ipt_entry *)(base + offset); +} + +/* All zeroes == unconditional rule. */ +/* Mildly perf critical (only if packet tracing is on) */ +static inline bool unconditional(const struct ipt_ip *ip) +{ + static const struct ipt_ip uncond; + + return memcmp(ip, &uncond, sizeof(uncond)) == 0; +#undef FWINV +} + +/* for const-correctness */ +static inline const struct xt_entry_target * +ipt_get_target_c(const struct ipt_entry *e) +{ + return ipt_get_target((struct ipt_entry *)e); +} + +#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ + defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) +static const char *const hooknames[] = { + [NF_INET_PRE_ROUTING] = "PREROUTING", + [NF_INET_LOCAL_IN] = "INPUT", + [NF_INET_FORWARD] = "FORWARD", + [NF_INET_LOCAL_OUT] = "OUTPUT", + [NF_INET_POST_ROUTING] = "POSTROUTING", +}; + +enum nf_ip_trace_comments { + NF_IP_TRACE_COMMENT_RULE, + NF_IP_TRACE_COMMENT_RETURN, + NF_IP_TRACE_COMMENT_POLICY, +}; + +static const char *const comments[] = { + [NF_IP_TRACE_COMMENT_RULE] = "rule", + [NF_IP_TRACE_COMMENT_RETURN] = "return", + [NF_IP_TRACE_COMMENT_POLICY] = "policy", +}; + +static struct nf_loginfo trace_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 4, + .logflags = NF_LOG_MASK, + }, + }, +}; + +/* Mildly perf critical (only if packet tracing is on) */ +static inline int +get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e, + const char *hookname, const char **chainname, + const char **comment, unsigned int *rulenum) +{ + const struct xt_standard_target *t = (void *)ipt_get_target_c(s); + + if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) { + /* Head of user chain: ERROR target with chainname */ + *chainname = t->target.data; + (*rulenum) = 0; + } else if (s == e) { + (*rulenum)++; + + if (s->target_offset == sizeof(struct ipt_entry) && + strcmp(t->target.u.kernel.target->name, + XT_STANDARD_TARGET) == 0 && + t->verdict < 0 && + unconditional(&s->ip)) { + /* Tail of chains: STANDARD target (return/policy) */ + *comment = *chainname == hookname + ? 
comments[NF_IP_TRACE_COMMENT_POLICY] + : comments[NF_IP_TRACE_COMMENT_RETURN]; + } + return 1; + } else + (*rulenum)++; + + return 0; +} + +static void trace_packet(const struct sk_buff *skb, + unsigned int hook, + const struct net_device *in, + const struct net_device *out, + const char *tablename, + const struct xt_table_info *private, + const struct ipt_entry *e) +{ + const void *table_base; + const struct ipt_entry *root; + const char *hookname, *chainname, *comment; + const struct ipt_entry *iter; + unsigned int rulenum = 0; + + table_base = private->entries[smp_processor_id()]; + root = get_entry(table_base, private->hook_entry[hook]); + + hookname = chainname = hooknames[hook]; + comment = comments[NF_IP_TRACE_COMMENT_RULE]; + + xt_entry_foreach(iter, root, private->size - private->hook_entry[hook]) + if (get_chainname_rulenum(iter, e, hookname, + &chainname, &comment, &rulenum) != 0) + break; + + nf_log_packet(AF_INET, hook, skb, in, out, &trace_loginfo, + "TRACE: %s:%s:%s:%u ", + tablename, chainname, comment, rulenum); +} +#endif + +static inline __pure +struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry) +{ + return (void *)entry + entry->next_offset; +} + +/* Returns one of the generic firewall policies, like NF_ACCEPT. */ +unsigned int +ipt_do_table(struct sk_buff *skb, + unsigned int hook, + const struct net_device *in, + const struct net_device *out, + struct xt_table *table) +{ + static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); + const struct iphdr *ip; + /* Initializing verdict to NF_DROP keeps gcc happy. */ + unsigned int verdict = NF_DROP; + const char *indev, *outdev; + const void *table_base; + struct ipt_entry *e, **jumpstack; + unsigned int *stackptr, origptr, cpu; + const struct xt_table_info *private; + struct xt_action_param acpar; + unsigned int addend; + + /* Initialization */ + ip = ip_hdr(skb); + indev = in ? in->name : nulldevname; + outdev = out ? out->name : nulldevname; + /* We handle fragments by dealing with the first fragment as + * if it was a normal packet. All other fragments are treated + * normally, except that they will NEVER match rules that ask + * things we don't know, ie. tcp syn flag or ports). If the + * rule is also a fragment-specific rule, non-fragments won't + * match it. 
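+ * In practice a rule that asks for layer-4 details (e.g. TCP ports)
+ * can never match a non-first fragment, while a rule carrying
+ * IPT_F_FRAG matches only non-first fragments. Userspace view, with
+ * the standard iptables(8) options:
+ *   iptables -A INPUT -f -j DROP               (second and later fragments)
+ *   iptables -A INPUT -p tcp --dport 80 -j ACCEPT  (never a later fragment)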
*/ + acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET; + acpar.thoff = ip_hdrlen(skb); + acpar.hotdrop = false; + acpar.in = in; + acpar.out = out; + acpar.family = NFPROTO_IPV4; + acpar.hooknum = hook; + + IP_NF_ASSERT(table->valid_hooks & (1 << hook)); + local_bh_disable(); + addend = xt_write_recseq_begin(); + private = table->private; + cpu = smp_processor_id(); + table_base = private->entries[cpu]; + jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; + stackptr = per_cpu_ptr(private->stackptr, cpu); + origptr = *stackptr; + + e = get_entry(table_base, private->hook_entry[hook]); + + pr_debug("Entering %s(hook %u); sp at %u (UF %p)\n", + table->name, hook, origptr, + get_entry(table_base, private->underflow[hook])); + + do { + const struct xt_entry_target *t; + const struct xt_entry_match *ematch; + + IP_NF_ASSERT(e); + if (!ip_packet_match(ip, indev, outdev, + &e->ip, acpar.fragoff)) { + no_match: + e = ipt_next_entry(e); + continue; + } + + xt_ematch_foreach(ematch, e) { + acpar.match = ematch->u.kernel.match; + acpar.matchinfo = ematch->data; + if (!acpar.match->match(skb, &acpar)) + goto no_match; + } + + ADD_COUNTER(e->counters, skb->len, 1); + + t = ipt_get_target(e); + IP_NF_ASSERT(t->u.kernel.target); + +#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ + defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) + /* The packet is traced: log it */ + if (unlikely(skb->nf_trace)) + trace_packet(skb, hook, in, out, + table->name, private, e); +#endif + /* Standard target? */ + if (!t->u.kernel.target->target) { + int v; + + v = ((struct xt_standard_target *)t)->verdict; + if (v < 0) { + /* Pop from stack? */ + if (v != XT_RETURN) { + verdict = (unsigned)(-v) - 1; + break; + } + if (*stackptr <= origptr) { + e = get_entry(table_base, + private->underflow[hook]); + pr_debug("Underflow (this is normal) " + "to %p\n", e); + } else { + e = jumpstack[--*stackptr]; + pr_debug("Pulled %p out from pos %u\n", + e, *stackptr); + e = ipt_next_entry(e); + } + continue; + } + if (table_base + v != ipt_next_entry(e) && + !(e->ip.flags & IPT_F_GOTO)) { + if (*stackptr >= private->stacksize) { + verdict = NF_DROP; + break; + } + jumpstack[(*stackptr)++] = e; + pr_debug("Pushed %p into pos %u\n", + e, *stackptr - 1); + } + + e = get_entry(table_base, v); + continue; + } + + acpar.target = t->u.kernel.target; + acpar.targinfo = t->data; + + verdict = t->u.kernel.target->target(skb, &acpar); + /* Target might have changed stuff. */ + ip = ip_hdr(skb); + if (verdict == XT_CONTINUE) + e = ipt_next_entry(e); + else + /* Verdict */ + break; + } while (!acpar.hotdrop); + pr_debug("Exiting %s; resetting sp from %u to %u\n", + __func__, *stackptr, origptr); + *stackptr = origptr; + xt_write_recseq_end(addend); + local_bh_enable(); + +#ifdef DEBUG_ALLOW_ALL + return NF_ACCEPT; +#else + if (acpar.hotdrop) + return NF_DROP; + else return verdict; +#endif +} + +/* Figures out from what hook each rule can be called: returns 0 if + there are loops. Puts hook bitmask in comefrom. */ +static int +mark_source_chains(const struct xt_table_info *newinfo, + unsigned int valid_hooks, void *entry0) +{ + unsigned int hook; + + /* No recursion; use packet counter to save back ptrs (reset + to 0 as we leave), and comefrom to save source hook bitmask */ + for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) { + unsigned int pos = newinfo->hook_entry[hook]; + struct ipt_entry *e = (struct ipt_entry *)(entry0 + pos); + + if (!(valid_hooks & (1 << hook))) + continue; + + /* Set initial back pointer. 
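+ * The per-entry pcnt counter is unused during validation, so the walk
+ * borrows it as that back pointer: every jump records where it came
+ * from, and an unconditional return/policy unwinds through the saved
+ * positions; the field is put back to 0 as the walk leaves (see the
+ * comment at the top of this function).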
*/ + e->counters.pcnt = pos; + + for (;;) { + const struct xt_standard_target *t + = (void *)ipt_get_target_c(e); + int visited = e->comefrom & (1 << hook); + + if (e->comefrom & (1 << NF_INET_NUMHOOKS)) { + pr_err("iptables: loop hook %u pos %u %08X.\n", + hook, pos, e->comefrom); + return 0; + } + e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); + + /* Unconditional return/END. */ + if ((e->target_offset == sizeof(struct ipt_entry) && + (strcmp(t->target.u.user.name, + XT_STANDARD_TARGET) == 0) && + t->verdict < 0 && unconditional(&e->ip)) || + visited) { + unsigned int oldpos, size; + + if ((strcmp(t->target.u.user.name, + XT_STANDARD_TARGET) == 0) && + t->verdict < -NF_MAX_VERDICT - 1) { + duprintf("mark_source_chains: bad " + "negative verdict (%i)\n", + t->verdict); + return 0; + } + + /* Return: backtrack through the last + big jump. */ + do { + e->comefrom ^= (1<comefrom + & (1 << NF_INET_NUMHOOKS)) { + duprintf("Back unset " + "on hook %u " + "rule %u\n", + hook, pos); + } +#endif + oldpos = pos; + pos = e->counters.pcnt; + e->counters.pcnt = 0; + + /* We're at the start. */ + if (pos == oldpos) + goto next; + + e = (struct ipt_entry *) + (entry0 + pos); + } while (oldpos == pos + e->next_offset); + + /* Move along one */ + size = e->next_offset; + e = (struct ipt_entry *) + (entry0 + pos + size); + e->counters.pcnt = pos; + pos += size; + } else { + int newpos = t->verdict; + + if (strcmp(t->target.u.user.name, + XT_STANDARD_TARGET) == 0 && + newpos >= 0) { + if (newpos > newinfo->size - + sizeof(struct ipt_entry)) { + duprintf("mark_source_chains: " + "bad verdict (%i)\n", + newpos); + return 0; + } + /* This a jump; chase it. */ + duprintf("Jump rule %u -> %u\n", + pos, newpos); + } else { + /* ... this is a fallthru */ + newpos = pos + e->next_offset; + } + e = (struct ipt_entry *) + (entry0 + newpos); + e->counters.pcnt = pos; + pos = newpos; + } + } + next: + duprintf("Finished chain %u\n", hook); + } + return 1; +} + +static void cleanup_match(struct xt_entry_match *m, struct net *net) +{ + struct xt_mtdtor_param par; + + par.net = net; + par.match = m->u.kernel.match; + par.matchinfo = m->data; + par.family = NFPROTO_IPV4; + if (par.match->destroy != NULL) + par.match->destroy(&par); + module_put(par.match->me); +} + +static int +check_entry(const struct ipt_entry *e, const char *name) +{ + const struct xt_entry_target *t; + + if (!ip_checkentry(&e->ip)) { + duprintf("ip check failed %p %s.\n", e, name); + return -EINVAL; + } + + if (e->target_offset + sizeof(struct xt_entry_target) > + e->next_offset) + return -EINVAL; + + t = ipt_get_target_c(e); + if (e->target_offset + t->u.target_size > e->next_offset) + return -EINVAL; + + return 0; +} + +static int +check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) +{ + const struct ipt_ip *ip = par->entryinfo; + int ret; + + par->match = m->u.kernel.match; + par->matchinfo = m->data; + + ret = xt_check_match(par, m->u.match_size - sizeof(*m), + ip->proto, ip->invflags & IPT_INV_PROTO); + if (ret < 0) { + duprintf("check failed for `%s'.\n", par->match->name); + return ret; + } + return 0; +} + +static int +find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) +{ + struct xt_match *match; + int ret; + + match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name, + m->u.user.revision); + if (IS_ERR(match)) { + duprintf("find_check_match: `%s' not found\n", m->u.user.name); + return PTR_ERR(match); + } + m->u.kernel.match = match; + + ret = check_match(m, par); + if (ret) + goto err; + + return 
0; +err: + module_put(m->u.kernel.match->me); + return ret; +} + +static int check_target(struct ipt_entry *e, struct net *net, const char *name) +{ + struct xt_entry_target *t = ipt_get_target(e); + struct xt_tgchk_param par = { + .net = net, + .table = name, + .entryinfo = e, + .target = t->u.kernel.target, + .targinfo = t->data, + .hook_mask = e->comefrom, + .family = NFPROTO_IPV4, + }; + int ret; + + ret = xt_check_target(&par, t->u.target_size - sizeof(*t), + e->ip.proto, e->ip.invflags & IPT_INV_PROTO); + if (ret < 0) { + duprintf("check failed for `%s'.\n", + t->u.kernel.target->name); + return ret; + } + return 0; +} + +static int +find_check_entry(struct ipt_entry *e, struct net *net, const char *name, + unsigned int size) +{ + struct xt_entry_target *t; + struct xt_target *target; + int ret; + unsigned int j; + struct xt_mtchk_param mtpar; + struct xt_entry_match *ematch; + + ret = check_entry(e, name); + if (ret) + return ret; + + j = 0; + mtpar.net = net; + mtpar.table = name; + mtpar.entryinfo = &e->ip; + mtpar.hook_mask = e->comefrom; + mtpar.family = NFPROTO_IPV4; + xt_ematch_foreach(ematch, e) { + ret = find_check_match(ematch, &mtpar); + if (ret != 0) + goto cleanup_matches; + ++j; + } + + t = ipt_get_target(e); + target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name, + t->u.user.revision); + if (IS_ERR(target)) { + duprintf("find_check_entry: `%s' not found\n", t->u.user.name); + ret = PTR_ERR(target); + goto cleanup_matches; + } + t->u.kernel.target = target; + + ret = check_target(e, net, name); + if (ret) + goto err; + return 0; + err: + module_put(t->u.kernel.target->me); + cleanup_matches: + xt_ematch_foreach(ematch, e) { + if (j-- == 0) + break; + cleanup_match(ematch, net); + } + return ret; +} + +static bool check_underflow(const struct ipt_entry *e) +{ + const struct xt_entry_target *t; + unsigned int verdict; + + if (!unconditional(&e->ip)) + return false; + t = ipt_get_target_c(e); + if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) + return false; + verdict = ((struct xt_standard_target *)t)->verdict; + verdict = -verdict - 1; + return verdict == NF_DROP || verdict == NF_ACCEPT; +} + +static int +check_entry_size_and_hooks(struct ipt_entry *e, + struct xt_table_info *newinfo, + const unsigned char *base, + const unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, + unsigned int valid_hooks) +{ + unsigned int h; + + if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 || + (unsigned char *)e + sizeof(struct ipt_entry) >= limit) { + duprintf("Bad offset %p\n", e); + return -EINVAL; + } + + if (e->next_offset + < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) { + duprintf("checking: element %p size %u\n", + e, e->next_offset); + return -EINVAL; + } + + /* Check hooks & underflows */ + for (h = 0; h < NF_INET_NUMHOOKS; h++) { + if (!(valid_hooks & (1 << h))) + continue; + if ((unsigned char *)e - base == hook_entries[h]) + newinfo->hook_entry[h] = hook_entries[h]; + if ((unsigned char *)e - base == underflows[h]) { + if (!check_underflow(e)) { + pr_err("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); + return -EINVAL; + } + newinfo->underflow[h] = underflows[h]; + } + } + + /* Clear counters and comefrom */ + e->counters = ((struct xt_counters) { 0, 0 }); + e->comefrom = 0; + return 0; +} + +static void +cleanup_entry(struct ipt_entry *e, struct net *net) +{ + struct xt_tgdtor_param par; + struct xt_entry_target *t; + struct xt_entry_match *ematch; + + 
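+ /* Undo find_check_entry(): run each extension's ->destroy hook where
+  * present and drop the module references taken while the rule was
+  * being checked in. */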
/* Cleanup all matches */ + xt_ematch_foreach(ematch, e) + cleanup_match(ematch, net); + t = ipt_get_target(e); + + par.net = net; + par.target = t->u.kernel.target; + par.targinfo = t->data; + par.family = NFPROTO_IPV4; + if (par.target->destroy != NULL) + par.target->destroy(&par); + module_put(par.target->me); +} + +/* Checks and translates the user-supplied table segment (held in + newinfo) */ +static int +translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, + const struct ipt_replace *repl) +{ + struct ipt_entry *iter; + unsigned int i; + int ret = 0; + + newinfo->size = repl->size; + newinfo->number = repl->num_entries; + + /* Init all hooks to impossible value. */ + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + newinfo->hook_entry[i] = 0xFFFFFFFF; + newinfo->underflow[i] = 0xFFFFFFFF; + } + + duprintf("translate_table: size %u\n", newinfo->size); + i = 0; + /* Walk through entries, checking offsets. */ + xt_entry_foreach(iter, entry0, newinfo->size) { + ret = check_entry_size_and_hooks(iter, newinfo, entry0, + entry0 + repl->size, + repl->hook_entry, + repl->underflow, + repl->valid_hooks); + if (ret != 0) + return ret; + ++i; + if (strcmp(ipt_get_target(iter)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; + } + + if (i != repl->num_entries) { + duprintf("translate_table: %u not %u entries\n", + i, repl->num_entries); + return -EINVAL; + } + + /* Check hooks all assigned */ + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + /* Only hooks which are valid */ + if (!(repl->valid_hooks & (1 << i))) + continue; + if (newinfo->hook_entry[i] == 0xFFFFFFFF) { + duprintf("Invalid hook entry %u %u\n", + i, repl->hook_entry[i]); + return -EINVAL; + } + if (newinfo->underflow[i] == 0xFFFFFFFF) { + duprintf("Invalid underflow %u %u\n", + i, repl->underflow[i]); + return -EINVAL; + } + } + + if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) + return -ELOOP; + + /* Finally, each sanity check must pass */ + i = 0; + xt_entry_foreach(iter, entry0, newinfo->size) { + ret = find_check_entry(iter, net, repl->name, repl->size); + if (ret != 0) + break; + ++i; + } + + if (ret != 0) { + xt_entry_foreach(iter, entry0, newinfo->size) { + if (i-- == 0) + break; + cleanup_entry(iter, net); + } + return ret; + } + + /* And one copy for every other CPU */ + for_each_possible_cpu(i) { + if (newinfo->entries[i] && newinfo->entries[i] != entry0) + memcpy(newinfo->entries[i], entry0, newinfo->size); + } + + return ret; +} + +static void +get_counters(const struct xt_table_info *t, + struct xt_counters counters[]) +{ + struct ipt_entry *iter; + unsigned int cpu; + unsigned int i; + + for_each_possible_cpu(cpu) { + seqcount_t *s = &per_cpu(xt_recseq, cpu); + + i = 0; + xt_entry_foreach(iter, t->entries[cpu], t->size) { + u64 bcnt, pcnt; + unsigned int start; + + do { + start = read_seqcount_begin(s); + bcnt = iter->counters.bcnt; + pcnt = iter->counters.pcnt; + } while (read_seqcount_retry(s, start)); + + ADD_COUNTER(counters[i], bcnt, pcnt); + ++i; /* macro does multi eval of i */ + } + } +} + +static struct xt_counters *alloc_counters(const struct xt_table *table) +{ + unsigned int countersize; + struct xt_counters *counters; + const struct xt_table_info *private = table->private; + + /* We need atomic snapshot of counters: rest doesn't change + (other than comefrom, which userspace doesn't care + about). 
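+ get_counters() takes that snapshot against the per-cpu xt_recseq
+ sequence counter, retrying a CPU's entries whenever a packet
+ updated them while they were being read.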
*/ + countersize = sizeof(struct xt_counters) * private->number; + counters = vzalloc(countersize); + + if (counters == NULL) + return ERR_PTR(-ENOMEM); + + get_counters(private, counters); + + return counters; +} + +static int +copy_entries_to_user(unsigned int total_size, + const struct xt_table *table, + void __user *userptr) +{ + unsigned int off, num; + const struct ipt_entry *e; + struct xt_counters *counters; + const struct xt_table_info *private = table->private; + int ret = 0; + const void *loc_cpu_entry; + + counters = alloc_counters(table); + if (IS_ERR(counters)) + return PTR_ERR(counters); + + /* choose the copy that is on our node/cpu, ... + * This choice is lazy (because current thread is + * allowed to migrate to another cpu) + */ + loc_cpu_entry = private->entries[raw_smp_processor_id()]; + if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { + ret = -EFAULT; + goto free_counters; + } + + /* FIXME: use iterator macros --RR */ + /* ... then go back and fix counters and names */ + for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ + unsigned int i; + const struct xt_entry_match *m; + const struct xt_entry_target *t; + + e = (struct ipt_entry *)(loc_cpu_entry + off); + if (copy_to_user(userptr + off + + offsetof(struct ipt_entry, counters), + &counters[num], + sizeof(counters[num])) != 0) { + ret = -EFAULT; + goto free_counters; + } + + for (i = sizeof(struct ipt_entry); + i < e->target_offset; + i += m->u.match_size) { + m = (void *)e + i; + + if (copy_to_user(userptr + off + i + + offsetof(struct xt_entry_match, + u.user.name), + m->u.kernel.match->name, + strlen(m->u.kernel.match->name)+1) + != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + t = ipt_get_target_c(e); + if (copy_to_user(userptr + off + e->target_offset + + offsetof(struct xt_entry_target, + u.user.name), + t->u.kernel.target->name, + strlen(t->u.kernel.target->name)+1) != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + free_counters: + vfree(counters); + return ret; +} + +#ifdef CONFIG_COMPAT +static void compat_standard_from_user(void *dst, const void *src) +{ + int v = *(compat_int_t *)src; + + if (v > 0) + v += xt_compat_calc_jump(AF_INET, v); + memcpy(dst, &v, sizeof(v)); +} + +static int compat_standard_to_user(void __user *dst, const void *src) +{ + compat_int_t cv = *(int *)src; + + if (cv > 0) + cv -= xt_compat_calc_jump(AF_INET, cv); + return copy_to_user(dst, &cv, sizeof(cv)) ? 
-EFAULT : 0; +} + +static int compat_calc_entry(const struct ipt_entry *e, + const struct xt_table_info *info, + const void *base, struct xt_table_info *newinfo) +{ + const struct xt_entry_match *ematch; + const struct xt_entry_target *t; + unsigned int entry_offset; + int off, i, ret; + + off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); + entry_offset = (void *)e - base; + xt_ematch_foreach(ematch, e) + off += xt_compat_match_offset(ematch->u.kernel.match); + t = ipt_get_target_c(e); + off += xt_compat_target_offset(t->u.kernel.target); + newinfo->size -= off; + ret = xt_compat_add_offset(AF_INET, entry_offset, off); + if (ret) + return ret; + + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + if (info->hook_entry[i] && + (e < (struct ipt_entry *)(base + info->hook_entry[i]))) + newinfo->hook_entry[i] -= off; + if (info->underflow[i] && + (e < (struct ipt_entry *)(base + info->underflow[i]))) + newinfo->underflow[i] -= off; + } + return 0; +} + +static int compat_table_info(const struct xt_table_info *info, + struct xt_table_info *newinfo) +{ + struct ipt_entry *iter; + void *loc_cpu_entry; + int ret; + + if (!newinfo || !info) + return -EINVAL; + + /* we dont care about newinfo->entries[] */ + memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); + newinfo->initial_entries = 0; + loc_cpu_entry = info->entries[raw_smp_processor_id()]; + xt_compat_init_offsets(AF_INET, info->number); + xt_entry_foreach(iter, loc_cpu_entry, info->size) { + ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); + if (ret != 0) + return ret; + } + return 0; +} +#endif + +static int get_info(struct net *net, void __user *user, + const int *len, int compat) +{ + char name[XT_TABLE_MAXNAMELEN]; + struct xt_table *t; + int ret; + + if (*len != sizeof(struct ipt_getinfo)) { + duprintf("length %u != %zu\n", *len, + sizeof(struct ipt_getinfo)); + return -EINVAL; + } + + if (copy_from_user(name, user, sizeof(name)) != 0) + return -EFAULT; + + name[XT_TABLE_MAXNAMELEN-1] = '\0'; +#ifdef CONFIG_COMPAT + if (compat) + xt_compat_lock(AF_INET); +#endif + t = try_then_request_module(xt_find_table_lock(net, AF_INET, name), + "iptable_%s", name); + if (t && !IS_ERR(t)) { + struct ipt_getinfo info; + const struct xt_table_info *private = t->private; +#ifdef CONFIG_COMPAT + struct xt_table_info tmp; + + if (compat) { + ret = compat_table_info(private, &tmp); + xt_compat_flush_offsets(AF_INET); + private = &tmp; + } +#endif + memset(&info, 0, sizeof(info)); + info.valid_hooks = t->valid_hooks; + memcpy(info.hook_entry, private->hook_entry, + sizeof(info.hook_entry)); + memcpy(info.underflow, private->underflow, + sizeof(info.underflow)); + info.num_entries = private->number; + info.size = private->size; + strcpy(info.name, name); + + if (copy_to_user(user, &info, *len) != 0) + ret = -EFAULT; + else + ret = 0; + + xt_table_unlock(t); + module_put(t->me); + } else + ret = t ? 
PTR_ERR(t) : -ENOENT; +#ifdef CONFIG_COMPAT + if (compat) + xt_compat_unlock(AF_INET); +#endif + return ret; +} + +static int +get_entries(struct net *net, struct ipt_get_entries __user *uptr, + const int *len) +{ + int ret; + struct ipt_get_entries get; + struct xt_table *t; + + if (*len < sizeof(get)) { + duprintf("get_entries: %u < %zu\n", *len, sizeof(get)); + return -EINVAL; + } + if (copy_from_user(&get, uptr, sizeof(get)) != 0) + return -EFAULT; + if (*len != sizeof(struct ipt_get_entries) + get.size) { + duprintf("get_entries: %u != %zu\n", + *len, sizeof(get) + get.size); + return -EINVAL; + } + + t = xt_find_table_lock(net, AF_INET, get.name); + if (t && !IS_ERR(t)) { + const struct xt_table_info *private = t->private; + duprintf("t->private->number = %u\n", private->number); + if (get.size == private->size) + ret = copy_entries_to_user(private->size, + t, uptr->entrytable); + else { + duprintf("get_entries: I've got %u not %u!\n", + private->size, get.size); + ret = -EAGAIN; + } + module_put(t->me); + xt_table_unlock(t); + } else + ret = t ? PTR_ERR(t) : -ENOENT; + + return ret; +} + +static int +__do_replace(struct net *net, const char *name, unsigned int valid_hooks, + struct xt_table_info *newinfo, unsigned int num_counters, + void __user *counters_ptr) +{ + int ret; + struct xt_table *t; + struct xt_table_info *oldinfo; + struct xt_counters *counters; + void *loc_cpu_old_entry; + struct ipt_entry *iter; + + ret = 0; + counters = vzalloc(num_counters * sizeof(struct xt_counters)); + if (!counters) { + ret = -ENOMEM; + goto out; + } + + t = try_then_request_module(xt_find_table_lock(net, AF_INET, name), + "iptable_%s", name); + if (!t || IS_ERR(t)) { + ret = t ? PTR_ERR(t) : -ENOENT; + goto free_newinfo_counters_untrans; + } + + /* You lied! 
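+ The hook mask supplied by userspace has to match the hooks this
+ table was registered with; anything else is a malformed replace.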
*/ + if (valid_hooks != t->valid_hooks) { + duprintf("Valid hook crap: %08X vs %08X\n", + valid_hooks, t->valid_hooks); + ret = -EINVAL; + goto put_module; + } + + oldinfo = xt_replace_table(t, num_counters, newinfo, &ret); + if (!oldinfo) + goto put_module; + + /* Update module usage count based on number of rules */ + duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", + oldinfo->number, oldinfo->initial_entries, newinfo->number); + if ((oldinfo->number > oldinfo->initial_entries) || + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + if ((oldinfo->number > oldinfo->initial_entries) && + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + + /* Get the old counters, and synchronize with replace */ + get_counters(oldinfo, counters); + + /* Decrease module usage counts and free resource */ + loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) + cleanup_entry(iter, net); + + xt_free_table_info(oldinfo); + if (copy_to_user(counters_ptr, counters, + sizeof(struct xt_counters) * num_counters) != 0) + ret = -EFAULT; + vfree(counters); + xt_table_unlock(t); + return ret; + + put_module: + module_put(t->me); + xt_table_unlock(t); + free_newinfo_counters_untrans: + vfree(counters); + out: + return ret; +} + +static int +do_replace(struct net *net, const void __user *user, unsigned int len) +{ + int ret; + struct ipt_replace tmp; + struct xt_table_info *newinfo; + void *loc_cpu_entry; + struct ipt_entry *iter; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + /* overflow check */ + if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) + return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; + + newinfo = xt_alloc_table_info(tmp.size); + if (!newinfo) + return -ENOMEM; + + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), + tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + ret = translate_table(net, newinfo, loc_cpu_entry, &tmp); + if (ret != 0) + goto free_newinfo; + + duprintf("Translated table\n"); + + ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, + tmp.num_counters, tmp.counters); + if (ret) + goto free_newinfo_untrans; + return 0; + + free_newinfo_untrans: + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter, net); + free_newinfo: + xt_free_table_info(newinfo); + return ret; +} + +static int +do_add_counters(struct net *net, const void __user *user, + unsigned int len, int compat) +{ + unsigned int i, curcpu; + struct xt_counters_info tmp; + struct xt_counters *paddc; + unsigned int num_counters; + const char *name; + int size; + void *ptmp; + struct xt_table *t; + const struct xt_table_info *private; + int ret = 0; + void *loc_cpu_entry; + struct ipt_entry *iter; + unsigned int addend; +#ifdef CONFIG_COMPAT + struct compat_xt_counters_info compat_tmp; + + if (compat) { + ptmp = &compat_tmp; + size = sizeof(struct compat_xt_counters_info); + } else +#endif + { + ptmp = &tmp; + size = sizeof(struct xt_counters_info); + } + + if (copy_from_user(ptmp, user, size) != 0) + return -EFAULT; + +#ifdef CONFIG_COMPAT + if (compat) { + num_counters = compat_tmp.num_counters; + name = compat_tmp.name; + } else +#endif + { + num_counters = tmp.num_counters; + name = tmp.name; + } + + if (len != size + num_counters * sizeof(struct xt_counters)) + return -EINVAL; + + paddc = vmalloc(len - size); + if 
(!paddc) + return -ENOMEM; + + if (copy_from_user(paddc, user + size, len - size) != 0) { + ret = -EFAULT; + goto free; + } + + t = xt_find_table_lock(net, AF_INET, name); + if (!t || IS_ERR(t)) { + ret = t ? PTR_ERR(t) : -ENOENT; + goto free; + } + + local_bh_disable(); + private = t->private; + if (private->number != num_counters) { + ret = -EINVAL; + goto unlock_up_free; + } + + i = 0; + /* Choose the copy that is on our node */ + curcpu = smp_processor_id(); + loc_cpu_entry = private->entries[curcpu]; + addend = xt_write_recseq_begin(); + xt_entry_foreach(iter, loc_cpu_entry, private->size) { + ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + ++i; + } + xt_write_recseq_end(addend); + unlock_up_free: + local_bh_enable(); + xt_table_unlock(t); + module_put(t->me); + free: + vfree(paddc); + + return ret; +} + +#ifdef CONFIG_COMPAT +struct compat_ipt_replace { + char name[XT_TABLE_MAXNAMELEN]; + u32 valid_hooks; + u32 num_entries; + u32 size; + u32 hook_entry[NF_INET_NUMHOOKS]; + u32 underflow[NF_INET_NUMHOOKS]; + u32 num_counters; + compat_uptr_t counters; /* struct xt_counters * */ + struct compat_ipt_entry entries[0]; +}; + +static int +compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr, + unsigned int *size, struct xt_counters *counters, + unsigned int i) +{ + struct xt_entry_target *t; + struct compat_ipt_entry __user *ce; + u_int16_t target_offset, next_offset; + compat_uint_t origsize; + const struct xt_entry_match *ematch; + int ret = 0; + + origsize = *size; + ce = (struct compat_ipt_entry __user *)*dstptr; + if (copy_to_user(ce, e, sizeof(struct ipt_entry)) != 0 || + copy_to_user(&ce->counters, &counters[i], + sizeof(counters[i])) != 0) + return -EFAULT; + + *dstptr += sizeof(struct compat_ipt_entry); + *size -= sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); + + xt_ematch_foreach(ematch, e) { + ret = xt_compat_match_to_user(ematch, dstptr, size); + if (ret != 0) + return ret; + } + target_offset = e->target_offset - (origsize - *size); + t = ipt_get_target(e); + ret = xt_compat_target_to_user(t, dstptr, size); + if (ret) + return ret; + next_offset = e->next_offset - (origsize - *size); + if (put_user(target_offset, &ce->target_offset) != 0 || + put_user(next_offset, &ce->next_offset) != 0) + return -EFAULT; + return 0; +} + +static int +compat_find_calc_match(struct xt_entry_match *m, + const char *name, + const struct ipt_ip *ip, + unsigned int hookmask, + int *size) +{ + struct xt_match *match; + + match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name, + m->u.user.revision); + if (IS_ERR(match)) { + duprintf("compat_check_calc_match: `%s' not found\n", + m->u.user.name); + return PTR_ERR(match); + } + m->u.kernel.match = match; + *size += xt_compat_match_offset(match); + return 0; +} + +static void compat_release_entry(struct compat_ipt_entry *e) +{ + struct xt_entry_target *t; + struct xt_entry_match *ematch; + + /* Cleanup all matches */ + xt_ematch_foreach(ematch, e) + module_put(ematch->u.kernel.match->me); + t = compat_ipt_get_target(e); + module_put(t->u.kernel.target->me); +} + +static int +check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, + struct xt_table_info *newinfo, + unsigned int *size, + const unsigned char *base, + const unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, + const char *name) +{ + struct xt_entry_match *ematch; + struct xt_entry_target *t; + struct xt_target *target; + unsigned int entry_offset; + unsigned int j; + int ret, off, h; + + 
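+ /* Validate one 32-bit (compat) entry: bounds-check it, look up its
+  * matches and target, and record the native-vs-compat size delta via
+  * xt_compat_add_offset() so verdict offsets can be translated later. */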
duprintf("check_compat_entry_size_and_hooks %p\n", e); + if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 || + (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) { + duprintf("Bad offset %p, limit = %p\n", e, limit); + return -EINVAL; + } + + if (e->next_offset < sizeof(struct compat_ipt_entry) + + sizeof(struct compat_xt_entry_target)) { + duprintf("checking: element %p size %u\n", + e, e->next_offset); + return -EINVAL; + } + + /* For purposes of check_entry casting the compat entry is fine */ + ret = check_entry((struct ipt_entry *)e, name); + if (ret) + return ret; + + off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); + entry_offset = (void *)e - (void *)base; + j = 0; + xt_ematch_foreach(ematch, e) { + ret = compat_find_calc_match(ematch, name, + &e->ip, e->comefrom, &off); + if (ret != 0) + goto release_matches; + ++j; + } + + t = compat_ipt_get_target(e); + target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name, + t->u.user.revision); + if (IS_ERR(target)) { + duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", + t->u.user.name); + ret = PTR_ERR(target); + goto release_matches; + } + t->u.kernel.target = target; + + off += xt_compat_target_offset(target); + *size += off; + ret = xt_compat_add_offset(AF_INET, entry_offset, off); + if (ret) + goto out; + + /* Check hooks & underflows */ + for (h = 0; h < NF_INET_NUMHOOKS; h++) { + if ((unsigned char *)e - base == hook_entries[h]) + newinfo->hook_entry[h] = hook_entries[h]; + if ((unsigned char *)e - base == underflows[h]) + newinfo->underflow[h] = underflows[h]; + } + + /* Clear counters and comefrom */ + memset(&e->counters, 0, sizeof(e->counters)); + e->comefrom = 0; + return 0; + +out: + module_put(t->u.kernel.target->me); +release_matches: + xt_ematch_foreach(ematch, e) { + if (j-- == 0) + break; + module_put(ematch->u.kernel.match->me); + } + return ret; +} + +static int +compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr, + unsigned int *size, const char *name, + struct xt_table_info *newinfo, unsigned char *base) +{ + struct xt_entry_target *t; + struct xt_target *target; + struct ipt_entry *de; + unsigned int origsize; + int ret, h; + struct xt_entry_match *ematch; + + ret = 0; + origsize = *size; + de = (struct ipt_entry *)*dstptr; + memcpy(de, e, sizeof(struct ipt_entry)); + memcpy(&de->counters, &e->counters, sizeof(e->counters)); + + *dstptr += sizeof(struct ipt_entry); + *size += sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); + + xt_ematch_foreach(ematch, e) { + ret = xt_compat_match_from_user(ematch, dstptr, size); + if (ret != 0) + return ret; + } + de->target_offset = e->target_offset - (origsize - *size); + t = compat_ipt_get_target(e); + target = t->u.kernel.target; + xt_compat_target_from_user(t, dstptr, size); + + de->next_offset = e->next_offset - (origsize - *size); + for (h = 0; h < NF_INET_NUMHOOKS; h++) { + if ((unsigned char *)de - base < newinfo->hook_entry[h]) + newinfo->hook_entry[h] -= origsize - *size; + if ((unsigned char *)de - base < newinfo->underflow[h]) + newinfo->underflow[h] -= origsize - *size; + } + return ret; +} + +static int +compat_check_entry(struct ipt_entry *e, struct net *net, const char *name) +{ + struct xt_entry_match *ematch; + struct xt_mtchk_param mtpar; + unsigned int j; + int ret = 0; + + j = 0; + mtpar.net = net; + mtpar.table = name; + mtpar.entryinfo = &e->ip; + mtpar.hook_mask = e->comefrom; + mtpar.family = NFPROTO_IPV4; + xt_ematch_foreach(ematch, e) { + ret = check_match(ematch, 
&mtpar); + if (ret != 0) + goto cleanup_matches; + ++j; + } + + ret = check_target(e, net, name); + if (ret) + goto cleanup_matches; + return 0; + + cleanup_matches: + xt_ematch_foreach(ematch, e) { + if (j-- == 0) + break; + cleanup_match(ematch, net); + } + return ret; +} + +static int +translate_compat_table(struct net *net, + const char *name, + unsigned int valid_hooks, + struct xt_table_info **pinfo, + void **pentry0, + unsigned int total_size, + unsigned int number, + unsigned int *hook_entries, + unsigned int *underflows) +{ + unsigned int i, j; + struct xt_table_info *newinfo, *info; + void *pos, *entry0, *entry1; + struct compat_ipt_entry *iter0; + struct ipt_entry *iter1; + unsigned int size; + int ret; + + info = *pinfo; + entry0 = *pentry0; + size = total_size; + info->number = number; + + /* Init all hooks to impossible value. */ + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + info->hook_entry[i] = 0xFFFFFFFF; + info->underflow[i] = 0xFFFFFFFF; + } + + duprintf("translate_compat_table: size %u\n", info->size); + j = 0; + xt_compat_lock(AF_INET); + xt_compat_init_offsets(AF_INET, number); + /* Walk through entries, checking offsets. */ + xt_entry_foreach(iter0, entry0, total_size) { + ret = check_compat_entry_size_and_hooks(iter0, info, &size, + entry0, + entry0 + total_size, + hook_entries, + underflows, + name); + if (ret != 0) + goto out_unlock; + ++j; + } + + ret = -EINVAL; + if (j != number) { + duprintf("translate_compat_table: %u not %u entries\n", + j, number); + goto out_unlock; + } + + /* Check hooks all assigned */ + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + /* Only hooks which are valid */ + if (!(valid_hooks & (1 << i))) + continue; + if (info->hook_entry[i] == 0xFFFFFFFF) { + duprintf("Invalid hook entry %u %u\n", + i, hook_entries[i]); + goto out_unlock; + } + if (info->underflow[i] == 0xFFFFFFFF) { + duprintf("Invalid underflow %u %u\n", + i, underflows[i]); + goto out_unlock; + } + } + + ret = -ENOMEM; + newinfo = xt_alloc_table_info(size); + if (!newinfo) + goto out_unlock; + + newinfo->number = number; + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + newinfo->hook_entry[i] = info->hook_entry[i]; + newinfo->underflow[i] = info->underflow[i]; + } + entry1 = newinfo->entries[raw_smp_processor_id()]; + pos = entry1; + size = total_size; + xt_entry_foreach(iter0, entry0, total_size) { + ret = compat_copy_entry_from_user(iter0, &pos, &size, + name, newinfo, entry1); + if (ret != 0) + break; + } + xt_compat_flush_offsets(AF_INET); + xt_compat_unlock(AF_INET); + if (ret) + goto free_newinfo; + + ret = -ELOOP; + if (!mark_source_chains(newinfo, valid_hooks, entry1)) + goto free_newinfo; + + i = 0; + xt_entry_foreach(iter1, entry1, newinfo->size) { + ret = compat_check_entry(iter1, net, name); + if (ret != 0) + break; + ++i; + if (strcmp(ipt_get_target(iter1)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; + } + if (ret) { + /* + * The first i matches need cleanup_entry (calls ->destroy) + * because they had called ->check already. The other j-i + * entries need only release. 
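+ * ("release" only drops the module references taken while sizing the
+ * compat entries, see compat_release_entry(); cleanup_entry()
+ * additionally runs the extensions' ->destroy hooks.)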
+ */ + int skip = i; + j -= i; + xt_entry_foreach(iter0, entry0, newinfo->size) { + if (skip-- > 0) + continue; + if (j-- == 0) + break; + compat_release_entry(iter0); + } + xt_entry_foreach(iter1, entry1, newinfo->size) { + if (i-- == 0) + break; + cleanup_entry(iter1, net); + } + xt_free_table_info(newinfo); + return ret; + } + + /* And one copy for every other CPU */ + for_each_possible_cpu(i) + if (newinfo->entries[i] && newinfo->entries[i] != entry1) + memcpy(newinfo->entries[i], entry1, newinfo->size); + + *pinfo = newinfo; + *pentry0 = entry1; + xt_free_table_info(info); + return 0; + +free_newinfo: + xt_free_table_info(newinfo); +out: + xt_entry_foreach(iter0, entry0, total_size) { + if (j-- == 0) + break; + compat_release_entry(iter0); + } + return ret; +out_unlock: + xt_compat_flush_offsets(AF_INET); + xt_compat_unlock(AF_INET); + goto out; +} + +static int +compat_do_replace(struct net *net, void __user *user, unsigned int len) +{ + int ret; + struct compat_ipt_replace tmp; + struct xt_table_info *newinfo; + void *loc_cpu_entry; + struct ipt_entry *iter; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + /* overflow check */ + if (tmp.size >= INT_MAX / num_possible_cpus()) + return -ENOMEM; + if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) + return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; + + newinfo = xt_alloc_table_info(tmp.size); + if (!newinfo) + return -ENOMEM; + + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), + tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + ret = translate_compat_table(net, tmp.name, tmp.valid_hooks, + &newinfo, &loc_cpu_entry, tmp.size, + tmp.num_entries, tmp.hook_entry, + tmp.underflow); + if (ret != 0) + goto free_newinfo; + + duprintf("compat_do_replace: Translated table\n"); + + ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, + tmp.num_counters, compat_ptr(tmp.counters)); + if (ret) + goto free_newinfo_untrans; + return 0; + + free_newinfo_untrans: + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter, net); + free_newinfo: + xt_free_table_info(newinfo); + return ret; +} + +static int +compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, + unsigned int len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IPT_SO_SET_REPLACE: + ret = compat_do_replace(sock_net(sk), user, len); + break; + + case IPT_SO_SET_ADD_COUNTERS: + ret = do_add_counters(sock_net(sk), user, len, 1); + break; + + default: + duprintf("do_ipt_set_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +struct compat_ipt_get_entries { + char name[XT_TABLE_MAXNAMELEN]; + compat_uint_t size; + struct compat_ipt_entry entrytable[0]; +}; + +static int +compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, + void __user *userptr) +{ + struct xt_counters *counters; + const struct xt_table_info *private = table->private; + void __user *pos; + unsigned int size; + int ret = 0; + const void *loc_cpu_entry; + unsigned int i = 0; + struct ipt_entry *iter; + + counters = alloc_counters(table); + if (IS_ERR(counters)) + return PTR_ERR(counters); + + /* choose the copy that is on our node/cpu, ... 
+ * This choice is lazy (because current thread is + * allowed to migrate to another cpu) + */ + loc_cpu_entry = private->entries[raw_smp_processor_id()]; + pos = userptr; + size = total_size; + xt_entry_foreach(iter, loc_cpu_entry, total_size) { + ret = compat_copy_entry_to_user(iter, &pos, + &size, counters, i++); + if (ret != 0) + break; + } + + vfree(counters); + return ret; +} + +static int +compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, + int *len) +{ + int ret; + struct compat_ipt_get_entries get; + struct xt_table *t; + + if (*len < sizeof(get)) { + duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get)); + return -EINVAL; + } + + if (copy_from_user(&get, uptr, sizeof(get)) != 0) + return -EFAULT; + + if (*len != sizeof(struct compat_ipt_get_entries) + get.size) { + duprintf("compat_get_entries: %u != %zu\n", + *len, sizeof(get) + get.size); + return -EINVAL; + } + + xt_compat_lock(AF_INET); + t = xt_find_table_lock(net, AF_INET, get.name); + if (t && !IS_ERR(t)) { + const struct xt_table_info *private = t->private; + struct xt_table_info info; + duprintf("t->private->number = %u\n", private->number); + ret = compat_table_info(private, &info); + if (!ret && get.size == info.size) { + ret = compat_copy_entries_to_user(private->size, + t, uptr->entrytable); + } else if (!ret) { + duprintf("compat_get_entries: I've got %u not %u!\n", + private->size, get.size); + ret = -EAGAIN; + } + xt_compat_flush_offsets(AF_INET); + module_put(t->me); + xt_table_unlock(t); + } else + ret = t ? PTR_ERR(t) : -ENOENT; + + xt_compat_unlock(AF_INET); + return ret; +} + +static int do_ipt_get_ctl(struct sock *, int, void __user *, int *); + +static int +compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IPT_SO_GET_INFO: + ret = get_info(sock_net(sk), user, len, 1); + break; + case IPT_SO_GET_ENTRIES: + ret = compat_get_entries(sock_net(sk), user, len); + break; + default: + ret = do_ipt_get_ctl(sk, cmd, user, len); + } + return ret; +} +#endif + +static int +do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IPT_SO_SET_REPLACE: + ret = do_replace(sock_net(sk), user, len); + break; + + case IPT_SO_SET_ADD_COUNTERS: + ret = do_add_counters(sock_net(sk), user, len, 0); + break; + + default: + duprintf("do_ipt_set_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +static int +do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IPT_SO_GET_INFO: + ret = get_info(sock_net(sk), user, len, 0); + break; + + case IPT_SO_GET_ENTRIES: + ret = get_entries(sock_net(sk), user, len); + break; + + case IPT_SO_GET_REVISION_MATCH: + case IPT_SO_GET_REVISION_TARGET: { + struct xt_get_revision rev; + int target; + + if (*len != sizeof(rev)) { + ret = -EINVAL; + break; + } + if (copy_from_user(&rev, user, sizeof(rev)) != 0) { + ret = -EFAULT; + break; + } + rev.name[sizeof(rev.name)-1] = 0; + + if (cmd == IPT_SO_GET_REVISION_TARGET) + target = 1; + else + target = 0; + + try_then_request_module(xt_find_revision(AF_INET, rev.name, + rev.revision, + target, &ret), + "ipt_%s", rev.name); + break; + } + + default: + duprintf("do_ipt_get_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +struct xt_table 
*ipt_register_table(struct net *net, + const struct xt_table *table, + const struct ipt_replace *repl) +{ + int ret; + struct xt_table_info *newinfo; + struct xt_table_info bootstrap = {0}; + void *loc_cpu_entry; + struct xt_table *new_table; + + newinfo = xt_alloc_table_info(repl->size); + if (!newinfo) { + ret = -ENOMEM; + goto out; + } + + /* choose the copy on our node/cpu, but dont care about preemption */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + memcpy(loc_cpu_entry, repl->entries, repl->size); + + ret = translate_table(net, newinfo, loc_cpu_entry, repl); + if (ret != 0) + goto out_free; + + new_table = xt_register_table(net, table, &bootstrap, newinfo); + if (IS_ERR(new_table)) { + ret = PTR_ERR(new_table); + goto out_free; + } + + return new_table; + +out_free: + xt_free_table_info(newinfo); +out: + return ERR_PTR(ret); +} + +void ipt_unregister_table(struct net *net, struct xt_table *table) +{ + struct xt_table_info *private; + void *loc_cpu_entry; + struct module *table_owner = table->me; + struct ipt_entry *iter; + + private = xt_unregister_table(table); + + /* Decrease module usage counts and free resources */ + loc_cpu_entry = private->entries[raw_smp_processor_id()]; + xt_entry_foreach(iter, loc_cpu_entry, private->size) + cleanup_entry(iter, net); + if (private->number > private->initial_entries) + module_put(table_owner); + xt_free_table_info(private); +} + +/* Returns 1 if the type and code is matched by the range, 0 otherwise */ +static inline bool +icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, + u_int8_t type, u_int8_t code, + bool invert) +{ + return ((test_type == 0xFF) || + (type == test_type && code >= min_code && code <= max_code)) + ^ invert; +} + +static bool +icmp_match(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct icmphdr *ic; + struct icmphdr _icmph; + const struct ipt_icmp *icmpinfo = par->matchinfo; + + /* Must not be a fragment. */ + if (par->fragoff != 0) + return false; + + ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph); + if (ic == NULL) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. + */ + duprintf("Dropping evil ICMP tinygram.\n"); + par->hotdrop = true; + return false; + } + + return icmp_type_code_match(icmpinfo->type, + icmpinfo->code[0], + icmpinfo->code[1], + ic->type, ic->code, + !!(icmpinfo->invflags&IPT_ICMP_INV)); +} + +static int icmp_checkentry(const struct xt_mtchk_param *par) +{ + const struct ipt_icmp *icmpinfo = par->matchinfo; + + /* Must specify no unknown invflags */ + return (icmpinfo->invflags & ~IPT_ICMP_INV) ? 
-EINVAL : 0; +} + +static struct xt_target ipt_builtin_tg[] __read_mostly = { + { + .name = XT_STANDARD_TARGET, + .targetsize = sizeof(int), + .family = NFPROTO_IPV4, +#ifdef CONFIG_COMPAT + .compatsize = sizeof(compat_int_t), + .compat_from_user = compat_standard_from_user, + .compat_to_user = compat_standard_to_user, +#endif + }, + { + .name = XT_ERROR_TARGET, + .target = ipt_error, + .targetsize = XT_FUNCTION_MAXNAMELEN, + .family = NFPROTO_IPV4, + }, +}; + +static struct nf_sockopt_ops ipt_sockopts = { + .pf = PF_INET, + .set_optmin = IPT_BASE_CTL, + .set_optmax = IPT_SO_SET_MAX+1, + .set = do_ipt_set_ctl, +#ifdef CONFIG_COMPAT + .compat_set = compat_do_ipt_set_ctl, +#endif + .get_optmin = IPT_BASE_CTL, + .get_optmax = IPT_SO_GET_MAX+1, + .get = do_ipt_get_ctl, +#ifdef CONFIG_COMPAT + .compat_get = compat_do_ipt_get_ctl, +#endif + .owner = THIS_MODULE, +}; + +static struct xt_match ipt_builtin_mt[] __read_mostly = { + { + .name = "icmp", + .match = icmp_match, + .matchsize = sizeof(struct ipt_icmp), + .checkentry = icmp_checkentry, + .proto = IPPROTO_ICMP, + .family = NFPROTO_IPV4, + }, +}; + +static int __net_init ip_tables_net_init(struct net *net) +{ + return xt_proto_init(net, NFPROTO_IPV4); +} + +static void __net_exit ip_tables_net_exit(struct net *net) +{ + xt_proto_fini(net, NFPROTO_IPV4); +} + +static struct pernet_operations ip_tables_net_ops = { + .init = ip_tables_net_init, + .exit = ip_tables_net_exit, +}; + +static int __init ip_tables_init(void) +{ + int ret; + + ret = register_pernet_subsys(&ip_tables_net_ops); + if (ret < 0) + goto err1; + + /* No one else will be downing sem now, so we won't sleep */ + ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg)); + if (ret < 0) + goto err2; + ret = xt_register_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt)); + if (ret < 0) + goto err4; + + /* Register setsockopt */ + ret = nf_register_sockopt(&ipt_sockopts); + if (ret < 0) + goto err5; + + pr_info("(C) 2000-2006 Netfilter Core Team\n"); + return 0; + +err5: + xt_unregister_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt)); +err4: + xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg)); +err2: + unregister_pernet_subsys(&ip_tables_net_ops); +err1: + return ret; +} + +static void __exit ip_tables_fini(void) +{ + nf_unregister_sockopt(&ipt_sockopts); + + xt_unregister_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt)); + xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg)); + unregister_pernet_subsys(&ip_tables_net_ops); +} + +EXPORT_SYMBOL(ipt_register_table); +EXPORT_SYMBOL(ipt_unregister_table); +EXPORT_SYMBOL(ipt_do_table); +module_init(ip_tables_init); +module_exit(ip_tables_fini); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c new file mode 100644 index 00000000..5c9e97c7 --- /dev/null +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -0,0 +1,746 @@ +/* Cluster IP hashmark target + * (C) 2003-2004 by Harald Welte + * based on ideas of Fabio Olive Leite + * + * Development of this code funded by SuSE Linux AG, http://www.suse.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
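+ *
+ * CLUSTERIP lets several hosts answer for one shared IP address: each
+ * node receives every packet (via a shared multicast MAC), hashes it
+ * according to the configured hash mode, and handles it only if the
+ * hash selects one of its local node numbers. A typical rule, assuming
+ * the matching userspace iptables extension, looks roughly like:
+ *   iptables -A INPUT -d 10.0.0.100 -j CLUSTERIP --new \
+ *            --hashmode sourceip --clustermac 01:00:5e:00:00:20 \
+ *            --total-nodes 2 --local-node 1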
+ * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CLUSTERIP_VERSION "0.8" + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("Xtables: CLUSTERIP target"); + +struct clusterip_config { + struct list_head list; /* list of all configs */ + atomic_t refcount; /* reference count */ + atomic_t entries; /* number of entries/rules + * referencing us */ + + __be32 clusterip; /* the IP address */ + u_int8_t clustermac[ETH_ALEN]; /* the MAC address */ + struct net_device *dev; /* device */ + u_int16_t num_total_nodes; /* total number of nodes */ + unsigned long local_nodes; /* node number array */ + +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *pde; /* proc dir entry */ +#endif + enum clusterip_hashmode hash_mode; /* which hashing mode */ + u_int32_t hash_initval; /* hash initialization */ + struct rcu_head rcu; +}; + +static LIST_HEAD(clusterip_configs); + +/* clusterip_lock protects the clusterip_configs list */ +static DEFINE_SPINLOCK(clusterip_lock); + +#ifdef CONFIG_PROC_FS +static const struct file_operations clusterip_proc_fops; +static struct proc_dir_entry *clusterip_procdir; +#endif + +static inline void +clusterip_config_get(struct clusterip_config *c) +{ + atomic_inc(&c->refcount); +} + + +static void clusterip_config_rcu_free(struct rcu_head *head) +{ + kfree(container_of(head, struct clusterip_config, rcu)); +} + +static inline void +clusterip_config_put(struct clusterip_config *c) +{ + if (atomic_dec_and_test(&c->refcount)) + call_rcu_bh(&c->rcu, clusterip_config_rcu_free); +} + +/* decrease the count of entries using/referencing this config. If last + * entry(rule) is removed, remove the config from lists, but don't free it + * yet, since proc-files could still be holding references */ +static inline void +clusterip_config_entry_put(struct clusterip_config *c) +{ + local_bh_disable(); + if (atomic_dec_and_lock(&c->entries, &clusterip_lock)) { + list_del_rcu(&c->list); + spin_unlock(&clusterip_lock); + local_bh_enable(); + + dev_mc_del(c->dev, c->clustermac); + dev_put(c->dev); + + /* In case anyone still accesses the file, the open/close + * functions are also incrementing the refcount on their own, + * so it's safe to remove the entry even if it's in use. 
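+ * The config structure itself is not freed here; that happens later
+ * through clusterip_config_put() -> call_rcu_bh() once the last
+ * reference is dropped, so readers under rcu_read_lock_bh() stay safe.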
*/ +#ifdef CONFIG_PROC_FS + remove_proc_entry(c->pde->name, c->pde->parent); +#endif + return; + } + local_bh_enable(); +} + +static struct clusterip_config * +__clusterip_config_find(__be32 clusterip) +{ + struct clusterip_config *c; + + list_for_each_entry_rcu(c, &clusterip_configs, list) { + if (c->clusterip == clusterip) + return c; + } + + return NULL; +} + +static inline struct clusterip_config * +clusterip_config_find_get(__be32 clusterip, int entry) +{ + struct clusterip_config *c; + + rcu_read_lock_bh(); + c = __clusterip_config_find(clusterip); + if (c) { + if (unlikely(!atomic_inc_not_zero(&c->refcount))) + c = NULL; + else if (entry) + atomic_inc(&c->entries); + } + rcu_read_unlock_bh(); + + return c; +} + +static void +clusterip_config_init_nodelist(struct clusterip_config *c, + const struct ipt_clusterip_tgt_info *i) +{ + int n; + + for (n = 0; n < i->num_local_nodes; n++) + set_bit(i->local_nodes[n] - 1, &c->local_nodes); +} + +static struct clusterip_config * +clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, + struct net_device *dev) +{ + struct clusterip_config *c; + + c = kzalloc(sizeof(*c), GFP_ATOMIC); + if (!c) + return NULL; + + c->dev = dev; + c->clusterip = ip; + memcpy(&c->clustermac, &i->clustermac, ETH_ALEN); + c->num_total_nodes = i->num_total_nodes; + clusterip_config_init_nodelist(c, i); + c->hash_mode = i->hash_mode; + c->hash_initval = i->hash_initval; + atomic_set(&c->refcount, 1); + atomic_set(&c->entries, 1); + +#ifdef CONFIG_PROC_FS + { + char buffer[16]; + + /* create proc dir entry */ + sprintf(buffer, "%pI4", &ip); + c->pde = proc_create_data(buffer, S_IWUSR|S_IRUSR, + clusterip_procdir, + &clusterip_proc_fops, c); + if (!c->pde) { + kfree(c); + return NULL; + } + } +#endif + + spin_lock_bh(&clusterip_lock); + list_add_rcu(&c->list, &clusterip_configs); + spin_unlock_bh(&clusterip_lock); + + return c; +} + +#ifdef CONFIG_PROC_FS +static int +clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum) +{ + + if (nodenum == 0 || + nodenum > c->num_total_nodes) + return 1; + + /* check if we already have this number in our bitfield */ + if (test_and_set_bit(nodenum - 1, &c->local_nodes)) + return 1; + + return 0; +} + +static bool +clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum) +{ + if (nodenum == 0 || + nodenum > c->num_total_nodes) + return true; + + if (test_and_clear_bit(nodenum - 1, &c->local_nodes)) + return false; + + return true; +} +#endif + +static inline u_int32_t +clusterip_hashfn(const struct sk_buff *skb, + const struct clusterip_config *config) +{ + const struct iphdr *iph = ip_hdr(skb); + unsigned long hashval; + u_int16_t sport = 0, dport = 0; + int poff; + + poff = proto_ports_offset(iph->protocol); + if (poff >= 0) { + const u_int16_t *ports; + u16 _ports[2]; + + ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports); + if (ports) { + sport = ports[0]; + dport = ports[1]; + } + } else { + if (net_ratelimit()) + pr_info("unknown protocol %u\n", iph->protocol); + } + + switch (config->hash_mode) { + case CLUSTERIP_HASHMODE_SIP: + hashval = jhash_1word(ntohl(iph->saddr), + config->hash_initval); + break; + case CLUSTERIP_HASHMODE_SIP_SPT: + hashval = jhash_2words(ntohl(iph->saddr), sport, + config->hash_initval); + break; + case CLUSTERIP_HASHMODE_SIP_SPT_DPT: + hashval = jhash_3words(ntohl(iph->saddr), sport, dport, + config->hash_initval); + break; + default: + /* to make gcc happy */ + hashval = 0; + /* This cannot happen, unless the check function wasn't called + * at rule load 
time */ + pr_info("unknown mode %u\n", config->hash_mode); + BUG(); + break; + } + + /* node numbers are 1..n, not 0..n */ + return (((u64)hashval * config->num_total_nodes) >> 32) + 1; +} + +static inline int +clusterip_responsible(const struct clusterip_config *config, u_int32_t hash) +{ + return test_bit(hash - 1, &config->local_nodes); +} + +/*********************************************************************** + * IPTABLES TARGET + ***********************************************************************/ + +static unsigned int +clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + u_int32_t hash; + + /* don't need to clusterip_config_get() here, since refcount + * is only decremented by destroy() - and ip_tables guarantees + * that the ->target() function isn't called after ->destroy() */ + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return NF_DROP; + + /* special case: ICMP error handling. conntrack distinguishes between + * error messages (RELATED) and information requests (see below) */ + if (ip_hdr(skb)->protocol == IPPROTO_ICMP && + (ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY)) + return XT_CONTINUE; + + /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, + * TIMESTAMP, INFO_REQUEST or ADDRESS type icmp packets from here + * on, which all have an ID field [relevant for hashing]. */ + + hash = clusterip_hashfn(skb, cipinfo->config); + + switch (ctinfo) { + case IP_CT_NEW: + ct->mark = hash; + break; + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + /* FIXME: we don't handle expectations at the + * moment. they can arrive on a different node than + * the master connection (e.g. FTP passive mode) */ + case IP_CT_ESTABLISHED: + case IP_CT_ESTABLISHED_REPLY: + break; + default: + break; + } + +#ifdef DEBUG + nf_ct_dump_tuple_ip(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); +#endif + pr_debug("hash=%u ct_hash=%u ", hash, ct->mark); + if (!clusterip_responsible(cipinfo->config, hash)) { + pr_debug("not responsible\n"); + return NF_DROP; + } + pr_debug("responsible\n"); + + /* despite being received via linklayer multicast, this is + * actually a unicast IP packet. 
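/*
 * Illustrative userspace sketch (not part of this patch): the node selection
 * used by clusterip_hashfn() above maps a 32-bit hash onto 1..num_total_nodes
 * with a multiply-and-shift instead of a modulo, and clusterip_responsible()
 * then tests the node's bit in the local_nodes mask. The hypothetical demo
 * below reproduces just that arithmetic (jhash itself is not reimplemented).
 */
#include <stdint.h>
#include <stdio.h>

/* Scale a 32-bit hash into a 1-based node number, as the code above does. */
static unsigned int hash_to_node(uint32_t hash, uint16_t num_nodes)
{
        return (unsigned int)(((uint64_t)hash * num_nodes) >> 32) + 1;
}

/* Node numbers are 1-based, the bitmask is 0-based, hence the "- 1". */
static int responsible(unsigned long local_nodes, unsigned int node)
{
        return (local_nodes >> (node - 1)) & 1UL;
}

int main(void)
{
        unsigned long local_nodes = (1UL << 0) | (1UL << 2);    /* nodes 1, 3 */
        uint32_t hashes[] = { 0x00000000u, 0x7fffffffu, 0xffffffffu };
        unsigned int i;

        for (i = 0; i < 3; i++) {
                unsigned int node = hash_to_node(hashes[i], 4);

                printf("hash 0x%08x -> node %u (%s)\n", hashes[i], node,
                       responsible(local_nodes, node) ? "ours" : "not ours");
        }
        return 0;
}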
TCP doesn't like PACKET_MULTICAST */ + skb->pkt_type = PACKET_HOST; + + return XT_CONTINUE; +} + +static int clusterip_tg_check(const struct xt_tgchk_param *par) +{ + struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; + const struct ipt_entry *e = par->entryinfo; + struct clusterip_config *config; + int ret; + + if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP && + cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT && + cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) { + pr_info("unknown mode %u\n", cipinfo->hash_mode); + return -EINVAL; + + } + if (e->ip.dmsk.s_addr != htonl(0xffffffff) || + e->ip.dst.s_addr == 0) { + pr_info("Please specify destination IP\n"); + return -EINVAL; + } + + /* FIXME: further sanity checks */ + + config = clusterip_config_find_get(e->ip.dst.s_addr, 1); + if (!config) { + if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) { + pr_info("no config found for %pI4, need 'new'\n", + &e->ip.dst.s_addr); + return -EINVAL; + } else { + struct net_device *dev; + + if (e->ip.iniface[0] == '\0') { + pr_info("Please specify an interface name\n"); + return -EINVAL; + } + + dev = dev_get_by_name(&init_net, e->ip.iniface); + if (!dev) { + pr_info("no such interface %s\n", + e->ip.iniface); + return -ENOENT; + } + + config = clusterip_config_init(cipinfo, + e->ip.dst.s_addr, dev); + if (!config) { + pr_info("cannot allocate config\n"); + dev_put(dev); + return -ENOMEM; + } + dev_mc_add(config->dev, config->clustermac); + } + } + cipinfo->config = config; + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + return ret; +} + +/* drop reference count of cluster config when rule is deleted */ +static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) +{ + const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; + + /* if no more entries are referencing the config, remove it + * from the list and destroy the proc entry */ + clusterip_config_entry_put(cipinfo->config); + + clusterip_config_put(cipinfo->config); + + nf_ct_l3proto_module_put(par->family); +} + +#ifdef CONFIG_COMPAT +struct compat_ipt_clusterip_tgt_info +{ + u_int32_t flags; + u_int8_t clustermac[6]; + u_int16_t num_total_nodes; + u_int16_t num_local_nodes; + u_int16_t local_nodes[CLUSTERIP_MAX_NODES]; + u_int32_t hash_mode; + u_int32_t hash_initval; + compat_uptr_t config; +}; +#endif /* CONFIG_COMPAT */ + +static struct xt_target clusterip_tg_reg __read_mostly = { + .name = "CLUSTERIP", + .family = NFPROTO_IPV4, + .target = clusterip_tg, + .checkentry = clusterip_tg_check, + .destroy = clusterip_tg_destroy, + .targetsize = sizeof(struct ipt_clusterip_tgt_info), +#ifdef CONFIG_COMPAT + .compatsize = sizeof(struct compat_ipt_clusterip_tgt_info), +#endif /* CONFIG_COMPAT */ + .me = THIS_MODULE +}; + + +/*********************************************************************** + * ARP MANGLING CODE + ***********************************************************************/ + +/* hardcoded for 48bit ethernet and 32bit ipv4 addresses */ +struct arp_payload { + u_int8_t src_hw[ETH_ALEN]; + __be32 src_ip; + u_int8_t dst_hw[ETH_ALEN]; + __be32 dst_ip; +} __packed; + +#ifdef DEBUG +static void arp_print(struct arp_payload *payload) +{ +#define HBUFFERLEN 30 + char hbuffer[HBUFFERLEN]; + int j,k; + + for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) { + hbuffer[k++] = hex_asc_hi(payload->src_hw[j]); + hbuffer[k++] = hex_asc_lo(payload->src_hw[j]); + hbuffer[k++]=':'; + } + hbuffer[--k]='\0'; + + pr_debug("src %pI4@%s, dst 
%pI4\n", + &payload->src_ip, hbuffer, &payload->dst_ip); +} +#endif + +static unsigned int +arp_mangle(unsigned int hook, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct arphdr *arp = arp_hdr(skb); + struct arp_payload *payload; + struct clusterip_config *c; + + /* we don't care about non-ethernet and non-ipv4 ARP */ + if (arp->ar_hrd != htons(ARPHRD_ETHER) || + arp->ar_pro != htons(ETH_P_IP) || + arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) + return NF_ACCEPT; + + /* we only want to mangle arp requests and replies */ + if (arp->ar_op != htons(ARPOP_REPLY) && + arp->ar_op != htons(ARPOP_REQUEST)) + return NF_ACCEPT; + + payload = (void *)(arp+1); + + /* if there is no clusterip configuration for the arp reply's + * source ip, we don't want to mangle it */ + c = clusterip_config_find_get(payload->src_ip, 0); + if (!c) + return NF_ACCEPT; + + /* normally the linux kernel always replies to arp queries of + * addresses on different interfacs. However, in the CLUSTERIP case + * this wouldn't work, since we didn't subscribe the mcast group on + * other interfaces */ + if (c->dev != out) { + pr_debug("not mangling arp reply on different " + "interface: cip'%s'-skb'%s'\n", + c->dev->name, out->name); + clusterip_config_put(c); + return NF_ACCEPT; + } + + /* mangle reply hardware address */ + memcpy(payload->src_hw, c->clustermac, arp->ar_hln); + +#ifdef DEBUG + pr_debug("mangled arp reply: "); + arp_print(payload); +#endif + + clusterip_config_put(c); + + return NF_ACCEPT; +} + +static struct nf_hook_ops cip_arp_ops __read_mostly = { + .hook = arp_mangle, + .pf = NFPROTO_ARP, + .hooknum = NF_ARP_OUT, + .priority = -1 +}; + +/*********************************************************************** + * PROC DIR HANDLING + ***********************************************************************/ + +#ifdef CONFIG_PROC_FS + +struct clusterip_seq_position { + unsigned int pos; /* position */ + unsigned int weight; /* number of bits set == size */ + unsigned int bit; /* current bit */ + unsigned long val; /* current value */ +}; + +static void *clusterip_seq_start(struct seq_file *s, loff_t *pos) +{ + struct clusterip_config *c = s->private; + unsigned int weight; + u_int32_t local_nodes; + struct clusterip_seq_position *idx; + + /* FIXME: possible race */ + local_nodes = c->local_nodes; + weight = hweight32(local_nodes); + if (*pos >= weight) + return NULL; + + idx = kmalloc(sizeof(struct clusterip_seq_position), GFP_KERNEL); + if (!idx) + return ERR_PTR(-ENOMEM); + + idx->pos = *pos; + idx->weight = weight; + idx->bit = ffs(local_nodes); + idx->val = local_nodes; + clear_bit(idx->bit - 1, &idx->val); + + return idx; +} + +static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct clusterip_seq_position *idx = v; + + *pos = ++idx->pos; + if (*pos >= idx->weight) { + kfree(v); + return NULL; + } + idx->bit = ffs(idx->val); + clear_bit(idx->bit - 1, &idx->val); + return idx; +} + +static void clusterip_seq_stop(struct seq_file *s, void *v) +{ + if (!IS_ERR(v)) + kfree(v); +} + +static int clusterip_seq_show(struct seq_file *s, void *v) +{ + struct clusterip_seq_position *idx = v; + + if (idx->pos != 0) + seq_putc(s, ','); + + seq_printf(s, "%u", idx->bit); + + if (idx->pos == idx->weight - 1) + seq_putc(s, '\n'); + + return 0; +} + +static const struct seq_operations clusterip_seq_ops = { + .start = clusterip_seq_start, + .next = clusterip_seq_next, + .stop = clusterip_seq_stop, + .show = 
clusterip_seq_show, +}; + +static int clusterip_proc_open(struct inode *inode, struct file *file) +{ + int ret = seq_open(file, &clusterip_seq_ops); + + if (!ret) { + struct seq_file *sf = file->private_data; + struct clusterip_config *c = PDE(inode)->data; + + sf->private = c; + + clusterip_config_get(c); + } + + return ret; +} + +static int clusterip_proc_release(struct inode *inode, struct file *file) +{ + struct clusterip_config *c = PDE(inode)->data; + int ret; + + ret = seq_release(inode, file); + + if (!ret) + clusterip_config_put(c); + + return ret; +} + +static ssize_t clusterip_proc_write(struct file *file, const char __user *input, + size_t size, loff_t *ofs) +{ + struct clusterip_config *c = PDE(file->f_path.dentry->d_inode)->data; +#define PROC_WRITELEN 10 + char buffer[PROC_WRITELEN+1]; + unsigned long nodenum; + + if (size > PROC_WRITELEN) + return -EIO; + if (copy_from_user(buffer, input, size)) + return -EFAULT; + buffer[size] = 0; + + if (*buffer == '+') { + nodenum = simple_strtoul(buffer+1, NULL, 10); + if (clusterip_add_node(c, nodenum)) + return -ENOMEM; + } else if (*buffer == '-') { + nodenum = simple_strtoul(buffer+1, NULL,10); + if (clusterip_del_node(c, nodenum)) + return -ENOENT; + } else + return -EIO; + + return size; +} + +static const struct file_operations clusterip_proc_fops = { + .owner = THIS_MODULE, + .open = clusterip_proc_open, + .read = seq_read, + .write = clusterip_proc_write, + .llseek = seq_lseek, + .release = clusterip_proc_release, +}; + +#endif /* CONFIG_PROC_FS */ + +static int __init clusterip_tg_init(void) +{ + int ret; + + ret = xt_register_target(&clusterip_tg_reg); + if (ret < 0) + return ret; + + ret = nf_register_hook(&cip_arp_ops); + if (ret < 0) + goto cleanup_target; + +#ifdef CONFIG_PROC_FS + clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net); + if (!clusterip_procdir) { + pr_err("Unable to proc dir entry\n"); + ret = -ENOMEM; + goto cleanup_hook; + } +#endif /* CONFIG_PROC_FS */ + + pr_info("ClusterIP Version %s loaded successfully\n", + CLUSTERIP_VERSION); + return 0; + +#ifdef CONFIG_PROC_FS +cleanup_hook: + nf_unregister_hook(&cip_arp_ops); +#endif /* CONFIG_PROC_FS */ +cleanup_target: + xt_unregister_target(&clusterip_tg_reg); + return ret; +} + +static void __exit clusterip_tg_exit(void) +{ + pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION); +#ifdef CONFIG_PROC_FS + remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent); +#endif + nf_unregister_hook(&cip_arp_ops); + xt_unregister_target(&clusterip_tg_reg); + + /* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */ + rcu_barrier_bh(); +} + +module_init(clusterip_tg_init); +module_exit(clusterip_tg_exit); diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c new file mode 100644 index 00000000..d76d6c9e --- /dev/null +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -0,0 +1,516 @@ +/* + * This is a module which is used for logging packets. + */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
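/*
 * Illustrative userspace sketch (not part of this patch): the CLUSTERIP
 * /proc write handler above accepts "+n" to claim and "-n" to release a node
 * number, bounds-checked against num_total_nodes and backed by
 * test_and_set_bit / test_and_clear_bit. The hypothetical demo_node_ctl()
 * below mirrors that parsing and bit handling on a plain unsigned long.
 */
#include <stdio.h>
#include <stdlib.h>

#define DEMO_TOTAL_NODES 4

/* Returns 0 on success, -1 on malformed, duplicate or out-of-range input. */
static int demo_node_ctl(unsigned long *local_nodes, const char *buf)
{
        unsigned long node = strtoul(buf + 1, NULL, 10);

        if (node == 0 || node > DEMO_TOTAL_NODES)
                return -1;

        if (buf[0] == '+') {
                if (*local_nodes & (1UL << (node - 1)))
                        return -1;              /* already claimed */
                *local_nodes |= 1UL << (node - 1);
        } else if (buf[0] == '-') {
                if (!(*local_nodes & (1UL << (node - 1))))
                        return -1;              /* was not claimed */
                *local_nodes &= ~(1UL << (node - 1));
        } else {
                return -1;
        }
        return 0;
}

int main(void)
{
        unsigned long local_nodes = 0;

        printf("+2 -> %d\n", demo_node_ctl(&local_nodes, "+2"));
        printf("+2 -> %d\n", demo_node_ctl(&local_nodes, "+2"));  /* duplicate */
        printf("-2 -> %d\n", demo_node_ctl(&local_nodes, "-2"));
        printf("-5 -> %d\n", demo_node_ctl(&local_nodes, "-5"));  /* out of range */
        printf("mask 0x%lx\n", local_nodes);
        return 0;
}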
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog"); + +/* One level of recursion won't kill us */ +static void dump_packet(struct sbuff *m, + const struct nf_loginfo *info, + const struct sk_buff *skb, + unsigned int iphoff) +{ + struct iphdr _iph; + const struct iphdr *ih; + unsigned int logflags; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_MASK; + + ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); + if (ih == NULL) { + sb_add(m, "TRUNCATED"); + return; + } + + /* Important fields: + * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ + /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ + sb_add(m, "SRC=%pI4 DST=%pI4 ", + &ih->saddr, &ih->daddr); + + /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ + sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", + ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, + ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); + + /* Max length: 6 "CE DF MF " */ + if (ntohs(ih->frag_off) & IP_CE) + sb_add(m, "CE "); + if (ntohs(ih->frag_off) & IP_DF) + sb_add(m, "DF "); + if (ntohs(ih->frag_off) & IP_MF) + sb_add(m, "MF "); + + /* Max length: 11 "FRAG:65535 " */ + if (ntohs(ih->frag_off) & IP_OFFSET) + sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); + + if ((logflags & IPT_LOG_IPOPT) && + ih->ihl * 4 > sizeof(struct iphdr)) { + const unsigned char *op; + unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; + unsigned int i, optsize; + + optsize = ih->ihl * 4 - sizeof(struct iphdr); + op = skb_header_pointer(skb, iphoff+sizeof(_iph), + optsize, _opt); + if (op == NULL) { + sb_add(m, "TRUNCATED"); + return; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + sb_add(m, "OPT ("); + for (i = 0; i < optsize; i++) + sb_add(m, "%02X", op[i]); + sb_add(m, ") "); + } + + switch (ih->protocol) { + case IPPROTO_TCP: { + struct tcphdr _tcph; + const struct tcphdr *th; + + /* Max length: 10 "PROTO=TCP " */ + sb_add(m, "PROTO=TCP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + th = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_tcph), &_tcph); + if (th == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + sb_add(m, "SPT=%u DPT=%u ", + ntohs(th->source), ntohs(th->dest)); + /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ + if (logflags & IPT_LOG_TCPSEQ) + sb_add(m, "SEQ=%u ACK=%u ", + ntohl(th->seq), ntohl(th->ack_seq)); + /* Max length: 13 "WINDOW=65535 " */ + sb_add(m, "WINDOW=%u ", ntohs(th->window)); + /* Max length: 9 "RES=0x3F " */ + sb_add(m, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); + /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ + if (th->cwr) + sb_add(m, "CWR "); + if (th->ece) + sb_add(m, "ECE "); + if (th->urg) + sb_add(m, "URG "); + if (th->ack) + sb_add(m, "ACK "); + if (th->psh) + sb_add(m, "PSH "); + if (th->rst) + sb_add(m, "RST "); + if (th->syn) + sb_add(m, "SYN "); + if (th->fin) + sb_add(m, "FIN "); + /* Max length: 11 "URGP=65535 " */ + sb_add(m, "URGP=%u ", ntohs(th->urg_ptr)); + + if ((logflags & IPT_LOG_TCPOPT) && + th->doff * 4 > sizeof(struct tcphdr)) { + 
unsigned char _opt[4 * 15 - sizeof(struct tcphdr)]; + const unsigned char *op; + unsigned int i, optsize; + + optsize = th->doff * 4 - sizeof(struct tcphdr); + op = skb_header_pointer(skb, + iphoff+ih->ihl*4+sizeof(_tcph), + optsize, _opt); + if (op == NULL) { + sb_add(m, "TRUNCATED"); + return; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + sb_add(m, "OPT ("); + for (i = 0; i < optsize; i++) + sb_add(m, "%02X", op[i]); + sb_add(m, ") "); + } + break; + } + case IPPROTO_UDP: + case IPPROTO_UDPLITE: { + struct udphdr _udph; + const struct udphdr *uh; + + if (ih->protocol == IPPROTO_UDP) + /* Max length: 10 "PROTO=UDP " */ + sb_add(m, "PROTO=UDP " ); + else /* Max length: 14 "PROTO=UDPLITE " */ + sb_add(m, "PROTO=UDPLITE "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + uh = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_udph), &_udph); + if (uh == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + sb_add(m, "SPT=%u DPT=%u LEN=%u ", + ntohs(uh->source), ntohs(uh->dest), + ntohs(uh->len)); + break; + } + case IPPROTO_ICMP: { + struct icmphdr _icmph; + const struct icmphdr *ich; + static const size_t required_len[NR_ICMP_TYPES+1] + = { [ICMP_ECHOREPLY] = 4, + [ICMP_DEST_UNREACH] + = 8 + sizeof(struct iphdr), + [ICMP_SOURCE_QUENCH] + = 8 + sizeof(struct iphdr), + [ICMP_REDIRECT] + = 8 + sizeof(struct iphdr), + [ICMP_ECHO] = 4, + [ICMP_TIME_EXCEEDED] + = 8 + sizeof(struct iphdr), + [ICMP_PARAMETERPROB] + = 8 + sizeof(struct iphdr), + [ICMP_TIMESTAMP] = 20, + [ICMP_TIMESTAMPREPLY] = 20, + [ICMP_ADDRESS] = 12, + [ICMP_ADDRESSREPLY] = 12 }; + + /* Max length: 11 "PROTO=ICMP " */ + sb_add(m, "PROTO=ICMP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_icmph), &_icmph); + if (ich == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ + sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + if (ich->type <= NR_ICMP_TYPES && + required_len[ich->type] && + skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { + sb_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + switch (ich->type) { + case ICMP_ECHOREPLY: + case ICMP_ECHO: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + sb_add(m, "ID=%u SEQ=%u ", + ntohs(ich->un.echo.id), + ntohs(ich->un.echo.sequence)); + break; + + case ICMP_PARAMETERPROB: + /* Max length: 14 "PARAMETER=255 " */ + sb_add(m, "PARAMETER=%u ", + ntohl(ich->un.gateway) >> 24); + break; + case ICMP_REDIRECT: + /* Max length: 24 "GATEWAY=255.255.255.255 " */ + sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway); + /* Fall through */ + case ICMP_DEST_UNREACH: + case ICMP_SOURCE_QUENCH: + case ICMP_TIME_EXCEEDED: + /* Max length: 3+maxlen */ + if (!iphoff) { /* Only recurse once. 
*/ + sb_add(m, "["); + dump_packet(m, info, skb, + iphoff + ih->ihl*4+sizeof(_icmph)); + sb_add(m, "] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ich->type == ICMP_DEST_UNREACH && + ich->code == ICMP_FRAG_NEEDED) + sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu)); + } + break; + } + /* Max Length */ + case IPPROTO_AH: { + struct ip_auth_hdr _ahdr; + const struct ip_auth_hdr *ah; + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 9 "PROTO=AH " */ + sb_add(m, "PROTO=AH "); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ah = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_ahdr), &_ahdr); + if (ah == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + sb_add(m, "SPI=0x%x ", ntohl(ah->spi)); + break; + } + case IPPROTO_ESP: { + struct ip_esp_hdr _esph; + const struct ip_esp_hdr *eh; + + /* Max length: 10 "PROTO=ESP " */ + sb_add(m, "PROTO=ESP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + eh = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_esph), &_esph); + if (eh == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + sb_add(m, "SPI=0x%x ", ntohl(eh->spi)); + break; + } + /* Max length: 10 "PROTO 255 " */ + default: + sb_add(m, "PROTO=%u ", ih->protocol); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { + read_lock_bh(&skb->sk->sk_callback_lock); + if (skb->sk->sk_socket && skb->sk->sk_socket->file) + sb_add(m, "UID=%u GID=%u ", + skb->sk->sk_socket->file->f_cred->fsuid, + skb->sk->sk_socket->file->f_cred->fsgid); + read_unlock_bh(&skb->sk->sk_callback_lock); + } + + /* Max length: 16 "MARK=0xFFFFFFFF " */ + if (!iphoff && skb->mark) + sb_add(m, "MARK=0x%x ", skb->mark); + + /* Proto Max log string length */ + /* IP: 40+46+6+11+127 = 230 */ + /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */ + /* UDP: 10+max(25,20) = 35 */ + /* UDPLITE: 14+max(25,20) = 39 */ + /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ + /* ESP: 10+max(25)+15 = 50 */ + /* AH: 9+max(25)+15 = 49 */ + /* unknown: 10 */ + + /* (ICMP allows recursion one level deep) */ + /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */ + /* maxlen = 230+ 91 + 230 + 252 = 803 */ +} + +static void dump_mac_header(struct sbuff *m, + const struct nf_loginfo *info, + const struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + unsigned int logflags = 0; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + + if (!(logflags & IPT_LOG_MACDECODE)) + goto fallback; + + switch (dev->type) { + case ARPHRD_ETHER: + sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + ntohs(eth_hdr(skb)->h_proto)); + return; + default: + break; + } + +fallback: + sb_add(m, "MAC="); + if (dev->hard_header_len && + skb->mac_header != skb->network_header) { + const unsigned char *p = skb_mac_header(skb); + unsigned int i; + + sb_add(m, "%02x", *p++); + for (i = 1; i < dev->hard_header_len; i++, p++) + sb_add(m, ":%02x", *p); + } + sb_add(m, " "); +} + +static struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 5, + .logflags = NF_LOG_MASK, + }, + }, +}; + +static void +ipt_log_packet(u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const 
struct nf_loginfo *loginfo, + const char *prefix) +{ + struct sbuff *m = sb_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, + prefix, + in ? in->name : "", + out ? out->name : ""); +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge) { + const struct net_device *physindev; + const struct net_device *physoutdev; + + physindev = skb->nf_bridge->physindev; + if (physindev && in != physindev) + sb_add(m, "PHYSIN=%s ", physindev->name); + physoutdev = skb->nf_bridge->physoutdev; + if (physoutdev && out != physoutdev) + sb_add(m, "PHYSOUT=%s ", physoutdev->name); + } +#endif + + if (in != NULL) + dump_mac_header(m, loginfo, skb); + + dump_packet(m, loginfo, skb, 0); + + sb_close(m); +} + +static unsigned int +log_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct ipt_log_info *loginfo = par->targinfo; + struct nf_loginfo li; + + li.type = NF_LOG_TYPE_LOG; + li.u.log.level = loginfo->level; + li.u.log.logflags = loginfo->logflags; + + ipt_log_packet(NFPROTO_IPV4, par->hooknum, skb, par->in, par->out, &li, + loginfo->prefix); + return XT_CONTINUE; +} + +static int log_tg_check(const struct xt_tgchk_param *par) +{ + const struct ipt_log_info *loginfo = par->targinfo; + + if (loginfo->level >= 8) { + pr_debug("level %u >= 8\n", loginfo->level); + return -EINVAL; + } + if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') { + pr_debug("prefix is not null-terminated\n"); + return -EINVAL; + } + return 0; +} + +static struct xt_target log_tg_reg __read_mostly = { + .name = "LOG", + .family = NFPROTO_IPV4, + .target = log_tg, + .targetsize = sizeof(struct ipt_log_info), + .checkentry = log_tg_check, + .me = THIS_MODULE, +}; + +static struct nf_logger ipt_log_logger __read_mostly = { + .name = "ipt_LOG", + .logfn = &ipt_log_packet, + .me = THIS_MODULE, +}; + +static int __init log_tg_init(void) +{ + int ret; + + ret = xt_register_target(&log_tg_reg); + if (ret < 0) + return ret; + nf_log_register(NFPROTO_IPV4, &ipt_log_logger); + return 0; +} + +static void __exit log_tg_exit(void) +{ + nf_log_unregister(&ipt_log_logger); + xt_unregister_target(&log_tg_reg); +} + +module_init(log_tg_init); +module_exit(log_tg_exit); diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c new file mode 100644 index 00000000..9931152a --- /dev/null +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -0,0 +1,173 @@ +/* Masquerade. Simple mapping which alters range to a local IP address + (depending on route). */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("Xtables: automatic-address SNAT"); + +/* FIXME: Multiple targets. 
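/*
 * Illustrative userspace sketch (not part of this patch): dump_packet() in
 * ipt_LOG above appends "KEY=value " fragments to a bounded sbuff, and the
 * per-protocol "Max length" comments document the worst-case budget. The
 * hypothetical demo_sb_add() below mimics that accumulate-with-truncation
 * behaviour using vsnprintf(); sbuff itself is a kernel-internal type.
 */
#include <stdarg.h>
#include <stdio.h>

struct demo_sbuff {
        char buf[256];
        size_t count;
};

static void demo_sb_add(struct demo_sbuff *m, const char *fmt, ...)
{
        size_t left = sizeof(m->buf) - m->count;
        va_list args;
        int len;

        if (left == 0)
                return;

        va_start(args, fmt);
        len = vsnprintf(m->buf + m->count, left, fmt, args);
        va_end(args);

        if (len < 0)
                return;
        /* on truncation keep the buffer full but NUL-terminated */
        m->count += (size_t)len < left ? (size_t)len : left - 1;
}

int main(void)
{
        struct demo_sbuff m = { { 0 }, 0 };

        demo_sb_add(&m, "SRC=%s DST=%s ", "192.0.2.1", "192.0.2.2");
        demo_sb_add(&m, "LEN=%u TTL=%u ", 84u, 64u);
        printf("%s\n", m.buf);          /* SRC=... DST=... LEN=84 TTL=64 */
        return 0;
}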
--RR */ +static int masquerade_tg_check(const struct xt_tgchk_param *par) +{ + const struct nf_nat_multi_range_compat *mr = par->targinfo; + + if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { + pr_debug("bad MAP_IPS.\n"); + return -EINVAL; + } + if (mr->rangesize != 1) { + pr_debug("bad rangesize %u\n", mr->rangesize); + return -EINVAL; + } + return 0; +} + +static unsigned int +masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + struct nf_conn *ct; + struct nf_conn_nat *nat; + enum ip_conntrack_info ctinfo; + struct nf_nat_range newrange; + const struct nf_nat_multi_range_compat *mr; + const struct rtable *rt; + __be32 newsrc; + + NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); + + ct = nf_ct_get(skb, &ctinfo); + nat = nfct_nat(ct); + + NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY)); + + /* Source address is 0.0.0.0 - locally generated packet that is + * probably not supposed to be masqueraded. + */ + if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) + return NF_ACCEPT; + + mr = par->targinfo; + rt = skb_rtable(skb); + newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE); + if (!newsrc) { + pr_info("%s ate my IP address\n", par->out->name); + return NF_DROP; + } + + nat->masq_index = par->out->ifindex; + + /* Transfer from original range. */ + newrange = ((struct nf_nat_range) + { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, + newsrc, newsrc, + mr->range[0].min, mr->range[0].max }); + + /* Hand modified range to generic setup. */ + return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_SRC); +} + +static int +device_cmp(struct nf_conn *i, void *ifindex) +{ + const struct nf_conn_nat *nat = nfct_nat(i); + + if (!nat) + return 0; + + return nat->masq_index == (int)(long)ifindex; +} + +static int masq_device_event(struct notifier_block *this, + unsigned long event, + void *ptr) +{ + const struct net_device *dev = ptr; + struct net *net = dev_net(dev); + + if (event == NETDEV_DOWN) { + /* Device was downed. Search entire table for + conntracks which were associated with that device, + and forget them. 
*/ + NF_CT_ASSERT(dev->ifindex != 0); + + nf_ct_iterate_cleanup(net, device_cmp, + (void *)(long)dev->ifindex); + } + + return NOTIFY_DONE; +} + +static int masq_inet_event(struct notifier_block *this, + unsigned long event, + void *ptr) +{ + struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; + return masq_device_event(this, event, dev); +} + +static struct notifier_block masq_dev_notifier = { + .notifier_call = masq_device_event, +}; + +static struct notifier_block masq_inet_notifier = { + .notifier_call = masq_inet_event, +}; + +static struct xt_target masquerade_tg_reg __read_mostly = { + .name = "MASQUERADE", + .family = NFPROTO_IPV4, + .target = masquerade_tg, + .targetsize = sizeof(struct nf_nat_multi_range_compat), + .table = "nat", + .hooks = 1 << NF_INET_POST_ROUTING, + .checkentry = masquerade_tg_check, + .me = THIS_MODULE, +}; + +static int __init masquerade_tg_init(void) +{ + int ret; + + ret = xt_register_target(&masquerade_tg_reg); + + if (ret == 0) { + /* Register for device down reports */ + register_netdevice_notifier(&masq_dev_notifier); + /* Register IP address change reports */ + register_inetaddr_notifier(&masq_inet_notifier); + } + + return ret; +} + +static void __exit masquerade_tg_exit(void) +{ + xt_unregister_target(&masquerade_tg_reg); + unregister_netdevice_notifier(&masq_dev_notifier); + unregister_inetaddr_notifier(&masq_inet_notifier); +} + +module_init(masquerade_tg_init); +module_exit(masquerade_tg_exit); diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c new file mode 100644 index 00000000..6cdb298f --- /dev/null +++ b/net/ipv4/netfilter/ipt_NETMAP.c @@ -0,0 +1,98 @@ +/* NETMAP - static NAT mapping of IP network addresses (1:1). + * The mapping can be applied to source (POSTROUTING), + * destination (PREROUTING), or both (with separate rules). + */ + +/* (C) 2000-2001 Svenning Soerensen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Svenning Soerensen "); +MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets"); + +static int netmap_tg_check(const struct xt_tgchk_param *par) +{ + const struct nf_nat_multi_range_compat *mr = par->targinfo; + + if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) { + pr_debug("bad MAP_IPS.\n"); + return -EINVAL; + } + if (mr->rangesize != 1) { + pr_debug("bad rangesize %u.\n", mr->rangesize); + return -EINVAL; + } + return 0; +} + +static unsigned int +netmap_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + __be32 new_ip, netmask; + const struct nf_nat_multi_range_compat *mr = par->targinfo; + struct nf_nat_range newrange; + + NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_POST_ROUTING || + par->hooknum == NF_INET_LOCAL_OUT || + par->hooknum == NF_INET_LOCAL_IN); + ct = nf_ct_get(skb, &ctinfo); + + netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); + + if (par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_LOCAL_OUT) + new_ip = ip_hdr(skb)->daddr & ~netmask; + else + new_ip = ip_hdr(skb)->saddr & ~netmask; + new_ip |= mr->range[0].min_ip & netmask; + + newrange = ((struct nf_nat_range) + { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, + new_ip, new_ip, + mr->range[0].min, mr->range[0].max }); + + /* Hand modified range to generic setup. */ + return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum)); +} + +static struct xt_target netmap_tg_reg __read_mostly = { + .name = "NETMAP", + .family = NFPROTO_IPV4, + .target = netmap_tg, + .targetsize = sizeof(struct nf_nat_multi_range_compat), + .table = "nat", + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_LOCAL_IN), + .checkentry = netmap_tg_check, + .me = THIS_MODULE +}; + +static int __init netmap_tg_init(void) +{ + return xt_register_target(&netmap_tg_reg); +} + +static void __exit netmap_tg_exit(void) +{ + xt_unregister_target(&netmap_tg_reg); +} + +module_init(netmap_tg_init); +module_exit(netmap_tg_exit); diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c new file mode 100644 index 00000000..18a06565 --- /dev/null +++ b/net/ipv4/netfilter/ipt_REDIRECT.c @@ -0,0 +1,110 @@ +/* Redirect. Simple mapping which alters dst to a local IP address. */ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
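/*
 * Illustrative userspace sketch (not part of this patch): netmap_tg() above
 * derives the netmask from the configured min/max range, keeps the host part
 * of the original address and substitutes the network part. The hypothetical
 * demo below does the same arithmetic on host-order uint32_t values (the
 * kernel operates on __be32).
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t netmap_rewrite(uint32_t addr, uint32_t min_ip, uint32_t max_ip)
{
        uint32_t netmask = ~(min_ip ^ max_ip);

        return (addr & ~netmask) | (min_ip & netmask);
}

int main(void)
{
        uint32_t addr   = 0x0A00002Au;  /* 10.0.0.42 */
        uint32_t min_ip = 0xC0A80100u;  /* 192.168.1.0 */
        uint32_t max_ip = 0xC0A801FFu;  /* 192.168.1.255 */
        uint32_t mapped = netmap_rewrite(addr, min_ip, max_ip);

        printf("10.0.0.42 -> %u.%u.%u.%u\n",            /* 192.168.1.42 */
               mapped >> 24, (mapped >> 16) & 0xff,
               (mapped >> 8) & 0xff, mapped & 0xff);
        return 0;
}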
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("Xtables: Connection redirection to localhost"); + +/* FIXME: Take multiple ranges --RR */ +static int redirect_tg_check(const struct xt_tgchk_param *par) +{ + const struct nf_nat_multi_range_compat *mr = par->targinfo; + + if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { + pr_debug("bad MAP_IPS.\n"); + return -EINVAL; + } + if (mr->rangesize != 1) { + pr_debug("bad rangesize %u.\n", mr->rangesize); + return -EINVAL; + } + return 0; +} + +static unsigned int +redirect_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + __be32 newdst; + const struct nf_nat_multi_range_compat *mr = par->targinfo; + struct nf_nat_range newrange; + + NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_LOCAL_OUT); + + ct = nf_ct_get(skb, &ctinfo); + NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + + /* Local packets: make them go to loopback */ + if (par->hooknum == NF_INET_LOCAL_OUT) + newdst = htonl(0x7F000001); + else { + struct in_device *indev; + struct in_ifaddr *ifa; + + newdst = 0; + + rcu_read_lock(); + indev = __in_dev_get_rcu(skb->dev); + if (indev && (ifa = indev->ifa_list)) + newdst = ifa->ifa_local; + rcu_read_unlock(); + + if (!newdst) + return NF_DROP; + } + + /* Transfer from original range. */ + newrange = ((struct nf_nat_range) + { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, + newdst, newdst, + mr->range[0].min, mr->range[0].max }); + + /* Hand modified range to generic setup. */ + return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_DST); +} + +static struct xt_target redirect_tg_reg __read_mostly = { + .name = "REDIRECT", + .family = NFPROTO_IPV4, + .target = redirect_tg, + .targetsize = sizeof(struct nf_nat_multi_range_compat), + .table = "nat", + .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), + .checkentry = redirect_tg_check, + .me = THIS_MODULE, +}; + +static int __init redirect_tg_init(void) +{ + return xt_register_target(&redirect_tg_reg); +} + +static void __exit redirect_tg_exit(void) +{ + xt_unregister_target(&redirect_tg_reg); +} + +module_init(redirect_tg_init); +module_exit(redirect_tg_exit); diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c new file mode 100644 index 00000000..9dd754c7 --- /dev/null +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -0,0 +1,220 @@ +/* + * This is a module which is used for rejecting packets. + */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_BRIDGE_NETFILTER +#include +#endif + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv4"); + +/* Send RST reply */ +static void send_reset(struct sk_buff *oldskb, int hook) +{ + struct sk_buff *nskb; + const struct iphdr *oiph; + struct iphdr *niph; + const struct tcphdr *oth; + struct tcphdr _otcph, *tcph; + + /* IP header checks: fragment. */ + if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) + return; + + oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb), + sizeof(_otcph), &_otcph); + if (oth == NULL) + return; + + /* No RST for RST. */ + if (oth->rst) + return; + + if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + return; + + /* Check checksum */ + if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) + return; + oiph = ip_hdr(oldskb); + + nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + + LL_MAX_HEADER, GFP_ATOMIC); + if (!nskb) + return; + + skb_reserve(nskb, LL_MAX_HEADER); + + skb_reset_network_header(nskb); + niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr)); + niph->version = 4; + niph->ihl = sizeof(struct iphdr) / 4; + niph->tos = 0; + niph->id = 0; + niph->frag_off = htons(IP_DF); + niph->protocol = IPPROTO_TCP; + niph->check = 0; + niph->saddr = oiph->daddr; + niph->daddr = oiph->saddr; + + tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); + memset(tcph, 0, sizeof(*tcph)); + tcph->source = oth->dest; + tcph->dest = oth->source; + tcph->doff = sizeof(struct tcphdr) / 4; + + if (oth->ack) + tcph->seq = oth->ack_seq; + else { + tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + + oldskb->len - ip_hdrlen(oldskb) - + (oth->doff << 2)); + tcph->ack = 1; + } + + tcph->rst = 1; + tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr, + niph->daddr, 0); + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (unsigned char *)tcph - nskb->head; + nskb->csum_offset = offsetof(struct tcphdr, check); + + /* ip_route_me_harder expects skb->dst to be set */ + skb_dst_set_noref(nskb, skb_dst(oldskb)); + + nskb->protocol = htons(ETH_P_IP); + if (ip_route_me_harder(nskb, RTN_UNSPEC)) + goto free_nskb; + + niph->ttl = ip4_dst_hoplimit(skb_dst(nskb)); + + /* "Never happens" */ + if (nskb->len > dst_mtu(skb_dst(nskb))) + goto free_nskb; + + nf_ct_attach(nskb, oldskb); + + ip_local_out(nskb); + return; + + free_nskb: + kfree_skb(nskb); +} + +static inline void send_unreach(struct sk_buff *skb_in, int code) +{ + icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); +#ifdef CONFIG_IP_NF_TARGET_REJECT_SKERR + if (skb_in->sk) { + skb_in->sk->sk_err = icmp_err_convert[code].errno; + skb_in->sk->sk_error_report(skb_in->sk); + pr_debug("ipt_REJECT: sk_err=%d for skb=%p sk=%p\n", + skb_in->sk->sk_err, skb_in, skb_in->sk); + } +#endif +} + +static unsigned int +reject_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct ipt_reject_info *reject = par->targinfo; + + switch (reject->with) { + case IPT_ICMP_NET_UNREACHABLE: + send_unreach(skb, ICMP_NET_UNREACH); + break; + case IPT_ICMP_HOST_UNREACHABLE: + send_unreach(skb, ICMP_HOST_UNREACH); + break; + case IPT_ICMP_PROT_UNREACHABLE: + send_unreach(skb, ICMP_PROT_UNREACH); + break; + case IPT_ICMP_PORT_UNREACHABLE: + send_unreach(skb, ICMP_PORT_UNREACH); + break; + case 
IPT_ICMP_NET_PROHIBITED: + send_unreach(skb, ICMP_NET_ANO); + break; + case IPT_ICMP_HOST_PROHIBITED: + send_unreach(skb, ICMP_HOST_ANO); + break; + case IPT_ICMP_ADMIN_PROHIBITED: + send_unreach(skb, ICMP_PKT_FILTERED); + break; + case IPT_TCP_RESET: + send_reset(skb, par->hooknum); + case IPT_ICMP_ECHOREPLY: + /* Doesn't happen. */ + break; + } + + return NF_DROP; +} + +static int reject_tg_check(const struct xt_tgchk_param *par) +{ + const struct ipt_reject_info *rejinfo = par->targinfo; + const struct ipt_entry *e = par->entryinfo; + + if (rejinfo->with == IPT_ICMP_ECHOREPLY) { + pr_info("ECHOREPLY no longer supported.\n"); + return -EINVAL; + } else if (rejinfo->with == IPT_TCP_RESET) { + /* Must specify that it's a TCP packet */ + if (e->ip.proto != IPPROTO_TCP || + (e->ip.invflags & XT_INV_PROTO)) { + pr_info("TCP_RESET invalid for non-tcp\n"); + return -EINVAL; + } + } + return 0; +} + +static struct xt_target reject_tg_reg __read_mostly = { + .name = "REJECT", + .family = NFPROTO_IPV4, + .target = reject_tg, + .targetsize = sizeof(struct ipt_reject_info), + .table = "filter", + .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT), + .checkentry = reject_tg_check, + .me = THIS_MODULE, +}; + +static int __init reject_tg_init(void) +{ + return xt_register_target(&reject_tg_reg); +} + +static void __exit reject_tg_exit(void) +{ + xt_unregister_target(&reject_tg_reg); +} + +module_init(reject_tg_init); +module_exit(reject_tg_exit); diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c new file mode 100644 index 00000000..446e0f46 --- /dev/null +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -0,0 +1,442 @@ +/* + * netfilter module for userspace packet logging daemons + * + * (C) 2000-2004 by Harald Welte + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This module accepts two parameters: + * + * nlbufsiz: + * The parameter specifies how big the buffer for each netlink multicast + * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will + * get accumulated in the kernel until they are sent to userspace. It is + * NOT possible to allocate more than 128kB, and it is strongly discouraged, + * because atomically allocating 128kB inside the network rx softirq is not + * reliable. Please also keep in mind that this buffer size is allocated for + * each nlgroup you are using, so the total kernel memory usage increases + * by that factor. + * + * Actually you should use nlbufsiz a bit smaller than PAGE_SIZE, since + * nlbufsiz is used with alloc_skb, which adds another + * sizeof(struct skb_shared_info). Use NLMSG_GOODSIZE instead. + * + * flushtimeout: + * Specify, after how many hundredths of a second the queue should be + * flushed even if it is not full yet. 
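/*
 * Illustrative userspace sketch (not part of this patch): send_reset() in
 * ipt_REJECT above picks the RST's sequence numbers from the offending
 * segment -- it reuses that segment's ACK number as SEQ when an ACK was
 * present, otherwise it acknowledges everything the segment occupied (SYN
 * and FIN each count as one sequence number). The hypothetical rst_for()
 * below shows that selection in host byte order.
 */
#include <stdint.h>
#include <stdio.h>

struct rst_numbers {
        uint32_t seq;
        uint32_t ack_seq;
        int ack_flag;
};

static struct rst_numbers rst_for(uint32_t seq, uint32_t ack_seq, int had_ack,
                                  int syn, int fin, uint32_t data_len)
{
        struct rst_numbers r = { 0, 0, 0 };

        if (had_ack) {
                r.seq = ack_seq;
        } else {
                r.ack_seq = seq + (uint32_t)syn + (uint32_t)fin + data_len;
                r.ack_flag = 1;
        }
        return r;
}

int main(void)
{
        /* reset in answer to a bare SYN with seq 1000 and no payload */
        struct rst_numbers r = rst_for(1000, 0, 0, 1, 0, 0);

        printf("RST: seq=%u ack_seq=%u ack=%d\n", r.seq, r.ack_seq, r.ack_flag);
        return 0;
}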
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("Xtables: packet logging to netlink using ULOG"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG); + +#define ULOG_NL_EVENT 111 /* Harald's favorite number */ +#define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */ + +static unsigned int nlbufsiz = NLMSG_GOODSIZE; +module_param(nlbufsiz, uint, 0400); +MODULE_PARM_DESC(nlbufsiz, "netlink buffer size"); + +static unsigned int flushtimeout = 10; +module_param(flushtimeout, uint, 0600); +MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)"); + +static int nflog = 1; +module_param(nflog, bool, 0400); +MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); + +/* global data structures */ + +typedef struct { + unsigned int qlen; /* number of nlmsgs' in the skb */ + struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */ + struct sk_buff *skb; /* the pre-allocated skb */ + struct timer_list timer; /* the timer function */ +} ulog_buff_t; + +static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */ + +static struct sock *nflognl; /* our socket */ +static DEFINE_SPINLOCK(ulog_lock); /* spinlock */ + +/* send one ulog_buff_t to userspace */ +static void ulog_send(unsigned int nlgroupnum) +{ + ulog_buff_t *ub = &ulog_buffers[nlgroupnum]; + + if (timer_pending(&ub->timer)) { + pr_debug("ulog_send: timer was pending, deleting\n"); + del_timer(&ub->timer); + } + + if (!ub->skb) { + pr_debug("ulog_send: nothing to send\n"); + return; + } + + /* last nlmsg needs NLMSG_DONE */ + if (ub->qlen > 1) + ub->lastnlh->nlmsg_type = NLMSG_DONE; + + NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1; + pr_debug("throwing %d packets to netlink group %u\n", + ub->qlen, nlgroupnum + 1); + netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC); + + ub->qlen = 0; + ub->skb = NULL; + ub->lastnlh = NULL; +} + + +/* timer function to flush queue in flushtimeout time */ +static void ulog_timer(unsigned long data) +{ + pr_debug("timer function called, calling ulog_send\n"); + + /* lock to protect against somebody modifying our structure + * from ipt_ulog_target at the same time */ + spin_lock_bh(&ulog_lock); + ulog_send(data); + spin_unlock_bh(&ulog_lock); +} + +static struct sk_buff *ulog_alloc_skb(unsigned int size) +{ + struct sk_buff *skb; + unsigned int n; + + /* alloc skb which should be big enough for a whole + * multipart message. WARNING: has to be <= 131000 + * due to slab allocator restrictions */ + + n = max(size, nlbufsiz); + skb = alloc_skb(n, GFP_ATOMIC); + if (!skb) { + pr_debug("cannot alloc whole buffer %ub!\n", n); + + if (n > size) { + /* try to allocate only as much as we need for + * current packet */ + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + pr_debug("cannot even allocate %ub\n", size); + } + } + + return skb; +} + +static void ipt_ulog_packet(unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct ipt_ulog_info *loginfo, + const char *prefix) +{ + ulog_buff_t *ub; + ulog_packet_msg_t *pm; + size_t size, copy_len; + struct nlmsghdr *nlh; + struct timeval tv; + + /* ffs == find first bit set, necessary because userspace + * is already shifting groupnumber, but we need unshifted. 
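/*
 * Illustrative userspace sketch (not part of this patch): userspace already
 * hands the ULOG target its netlink group as a bitmask (1 << group), so the
 * code above recovers the 0-based group index with ffs() - 1. A quick
 * standalone check of that conversion:
 */
#include <stdio.h>
#include <strings.h>    /* ffs() */

int main(void)
{
        unsigned int masks[] = { 0x01, 0x02, 0x20, 0x40000000 };
        unsigned int i;

        for (i = 0; i < sizeof(masks) / sizeof(masks[0]); i++)
                printf("nl_group mask 0x%08x -> group index %d\n",
                       masks[i], ffs((int)masks[i]) - 1);   /* 0, 1, 5, 30 */
        return 0;
}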
+ * ffs() returns [1..32], we need [0..31] */ + unsigned int groupnum = ffs(loginfo->nl_group) - 1; + + /* calculate the size of the skb needed */ + if (loginfo->copy_range == 0 || loginfo->copy_range > skb->len) + copy_len = skb->len; + else + copy_len = loginfo->copy_range; + + size = NLMSG_SPACE(sizeof(*pm) + copy_len); + + ub = &ulog_buffers[groupnum]; + + spin_lock_bh(&ulog_lock); + + if (!ub->skb) { + if (!(ub->skb = ulog_alloc_skb(size))) + goto alloc_failure; + } else if (ub->qlen >= loginfo->qthreshold || + size > skb_tailroom(ub->skb)) { + /* either the queue len is too high or we don't have + * enough room in nlskb left. send it to userspace. */ + + ulog_send(groupnum); + + if (!(ub->skb = ulog_alloc_skb(size))) + goto alloc_failure; + } + + pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold); + + /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */ + nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, + sizeof(*pm)+copy_len); + ub->qlen++; + + pm = NLMSG_DATA(nlh); + + /* We might not have a timestamp, get one */ + if (skb->tstamp.tv64 == 0) + __net_timestamp((struct sk_buff *)skb); + + /* copy hook, prefix, timestamp, payload, etc. */ + pm->data_len = copy_len; + tv = ktime_to_timeval(skb->tstamp); + put_unaligned(tv.tv_sec, &pm->timestamp_sec); + put_unaligned(tv.tv_usec, &pm->timestamp_usec); + put_unaligned(skb->mark, &pm->mark); + pm->hook = hooknum; + if (prefix != NULL) + strncpy(pm->prefix, prefix, sizeof(pm->prefix)); + else if (loginfo->prefix[0] != '\0') + strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix)); + else + *(pm->prefix) = '\0'; + + if (in && in->hard_header_len > 0 && + skb->mac_header != skb->network_header && + in->hard_header_len <= ULOG_MAC_LEN) { + memcpy(pm->mac, skb_mac_header(skb), in->hard_header_len); + pm->mac_len = in->hard_header_len; + } else + pm->mac_len = 0; + + if (in) + strncpy(pm->indev_name, in->name, sizeof(pm->indev_name)); + else + pm->indev_name[0] = '\0'; + + if (out) + strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name)); + else + pm->outdev_name[0] = '\0'; + + /* copy_len <= skb->len, so can't fail. 
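/*
 * Illustrative userspace sketch (not part of this patch): ipt_ulog_packet()
 * above batches log messages in one skb per netlink group and flushes either
 * when qthreshold messages are queued or when the next message no longer
 * fits (plus a flush timer after flushtimeout). The hypothetical demo below
 * keeps just that policy, with byte counts standing in for skb tailroom.
 */
#include <stdio.h>

struct demo_batch {
        unsigned int qlen;          /* queued messages */
        unsigned int used;          /* bytes used in the buffer */
        unsigned int capacity;      /* stands in for skb tailroom */
        unsigned int qthreshold;    /* flush threshold */
};

static void demo_flush(struct demo_batch *b, const char *why)
{
        printf("flush (%s): %u messages, %u bytes\n", why, b->qlen, b->used);
        b->qlen = 0;
        b->used = 0;
}

static void demo_queue(struct demo_batch *b, unsigned int size)
{
        /* no room left for this message: send what we have first */
        if (b->qlen > 0 && b->used + size > b->capacity)
                demo_flush(b, "no room");

        b->qlen++;
        b->used += size;

        if (b->qlen >= b->qthreshold)
                demo_flush(b, "threshold");
}

int main(void)
{
        static const unsigned int sizes[] = { 600, 600, 600, 600, 1500, 600, 600 };
        struct demo_batch b = { 0, 0, 2048, 3 };
        unsigned int i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
                demo_queue(&b, sizes[i]);
        if (b.qlen)
                demo_flush(&b, "timer");    /* what the flush timer would do */
        return 0;
}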
*/ + if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0) + BUG(); + + /* check if we are building multi-part messages */ + if (ub->qlen > 1) + ub->lastnlh->nlmsg_flags |= NLM_F_MULTI; + + ub->lastnlh = nlh; + + /* if timer isn't already running, start it */ + if (!timer_pending(&ub->timer)) { + ub->timer.expires = jiffies + flushtimeout * HZ / 100; + add_timer(&ub->timer); + } + + /* if threshold is reached, send message to userspace */ + if (ub->qlen >= loginfo->qthreshold) { + if (loginfo->qthreshold > 1) + nlh->nlmsg_type = NLMSG_DONE; + ulog_send(groupnum); + } + + spin_unlock_bh(&ulog_lock); + + return; + +nlmsg_failure: + pr_debug("error during NLMSG_PUT\n"); +alloc_failure: + pr_debug("Error building netlink message\n"); + spin_unlock_bh(&ulog_lock); +} + +static unsigned int +ulog_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + ipt_ulog_packet(par->hooknum, skb, par->in, par->out, + par->targinfo, NULL); + return XT_CONTINUE; +} + +static void ipt_logfn(u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *li, + const char *prefix) +{ + struct ipt_ulog_info loginfo; + + if (!li || li->type != NF_LOG_TYPE_ULOG) { + loginfo.nl_group = ULOG_DEFAULT_NLGROUP; + loginfo.copy_range = 0; + loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD; + loginfo.prefix[0] = '\0'; + } else { + loginfo.nl_group = li->u.ulog.group; + loginfo.copy_range = li->u.ulog.copy_len; + loginfo.qthreshold = li->u.ulog.qthreshold; + strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); + } + + ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); +} + +static int ulog_tg_check(const struct xt_tgchk_param *par) +{ + const struct ipt_ulog_info *loginfo = par->targinfo; + + if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') { + pr_debug("prefix not null-terminated\n"); + return -EINVAL; + } + if (loginfo->qthreshold > ULOG_MAX_QLEN) { + pr_debug("queue threshold %Zu > MAX_QLEN\n", + loginfo->qthreshold); + return -EINVAL; + } + return 0; +} + +#ifdef CONFIG_COMPAT +struct compat_ipt_ulog_info { + compat_uint_t nl_group; + compat_size_t copy_range; + compat_size_t qthreshold; + char prefix[ULOG_PREFIX_LEN]; +}; + +static void ulog_tg_compat_from_user(void *dst, const void *src) +{ + const struct compat_ipt_ulog_info *cl = src; + struct ipt_ulog_info l = { + .nl_group = cl->nl_group, + .copy_range = cl->copy_range, + .qthreshold = cl->qthreshold, + }; + + memcpy(l.prefix, cl->prefix, sizeof(l.prefix)); + memcpy(dst, &l, sizeof(l)); +} + +static int ulog_tg_compat_to_user(void __user *dst, const void *src) +{ + const struct ipt_ulog_info *l = src; + struct compat_ipt_ulog_info cl = { + .nl_group = l->nl_group, + .copy_range = l->copy_range, + .qthreshold = l->qthreshold, + }; + + memcpy(cl.prefix, l->prefix, sizeof(cl.prefix)); + return copy_to_user(dst, &cl, sizeof(cl)) ? 
-EFAULT : 0; +} +#endif /* CONFIG_COMPAT */ + +static struct xt_target ulog_tg_reg __read_mostly = { + .name = "ULOG", + .family = NFPROTO_IPV4, + .target = ulog_tg, + .targetsize = sizeof(struct ipt_ulog_info), + .checkentry = ulog_tg_check, +#ifdef CONFIG_COMPAT + .compatsize = sizeof(struct compat_ipt_ulog_info), + .compat_from_user = ulog_tg_compat_from_user, + .compat_to_user = ulog_tg_compat_to_user, +#endif + .me = THIS_MODULE, +}; + +static struct nf_logger ipt_ulog_logger __read_mostly = { + .name = "ipt_ULOG", + .logfn = ipt_logfn, + .me = THIS_MODULE, +}; + +static int __init ulog_tg_init(void) +{ + int ret, i; + + pr_debug("init module\n"); + + if (nlbufsiz > 128*1024) { + pr_warning("Netlink buffer has to be <= 128kB\n"); + return -EINVAL; + } + + /* initialize ulog_buffers */ + for (i = 0; i < ULOG_MAXNLGROUPS; i++) + setup_timer(&ulog_buffers[i].timer, ulog_timer, i); + + nflognl = netlink_kernel_create(&init_net, + NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, + NULL, THIS_MODULE); + if (!nflognl) + return -ENOMEM; + + ret = xt_register_target(&ulog_tg_reg); + if (ret < 0) { + netlink_kernel_release(nflognl); + return ret; + } + if (nflog) + nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger); + + return 0; +} + +static void __exit ulog_tg_exit(void) +{ + ulog_buff_t *ub; + int i; + + pr_debug("cleanup_module\n"); + + if (nflog) + nf_log_unregister(&ipt_ulog_logger); + xt_unregister_target(&ulog_tg_reg); + netlink_kernel_release(nflognl); + + /* remove pending timers and free allocated skb's */ + for (i = 0; i < ULOG_MAXNLGROUPS; i++) { + ub = &ulog_buffers[i]; + if (timer_pending(&ub->timer)) { + pr_debug("timer was pending, deleting\n"); + del_timer(&ub->timer); + } + + if (ub->skb) { + kfree_skb(ub->skb); + ub->skb = NULL; + } + } +} + +module_init(ulog_tg_init); +module_exit(ulog_tg_exit); diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c new file mode 100644 index 00000000..14a2aa8b --- /dev/null +++ b/net/ipv4/netfilter/ipt_ah.c @@ -0,0 +1,91 @@ +/* Kernel module to match AH parameters. */ +/* (C) 1999-2000 Yon Uriarte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yon Uriarte "); +MODULE_DESCRIPTION("Xtables: IPv4 IPsec-AH SPI match"); + +/* Returns 1 if the spi is matched by the range, 0 otherwise */ +static inline bool +spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) +{ + bool r; + pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n", + invert ? '!' : ' ', min, spi, max); + r=(spi >= min && spi <= max) ^ invert; + pr_debug(" result %s\n", r ? "PASS" : "FAILED"); + return r; +} + +static bool ah_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct ip_auth_hdr _ahdr; + const struct ip_auth_hdr *ah; + const struct ipt_ah *ahinfo = par->matchinfo; + + /* Must not be a fragment. */ + if (par->fragoff != 0) + return false; + + ah = skb_header_pointer(skb, par->thoff, sizeof(_ahdr), &_ahdr); + if (ah == NULL) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. 
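/*
 * Illustrative userspace sketch (not part of this patch): spi_match() above
 * is the usual iptables "result XOR invert-flag" pattern applied to an SPI
 * range. The hypothetical demo below reproduces it for a few sample SPIs.
 */
#include <stdint.h>
#include <stdio.h>

static int spi_match_demo(uint32_t min, uint32_t max, uint32_t spi, int invert)
{
        return ((spi >= min && spi <= max) ? 1 : 0) ^ (invert ? 1 : 0);
}

int main(void)
{
        printf("%d\n", spi_match_demo(0x100, 0x200, 0x180, 0)); /* 1: in range */
        printf("%d\n", spi_match_demo(0x100, 0x200, 0x180, 1)); /* 0: inverted */
        printf("%d\n", spi_match_demo(0x100, 0x200, 0x300, 1)); /* 1: outside, inverted */
        return 0;
}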
+ */ + pr_debug("Dropping evil AH tinygram.\n"); + par->hotdrop = true; + return 0; + } + + return spi_match(ahinfo->spis[0], ahinfo->spis[1], + ntohl(ah->spi), + !!(ahinfo->invflags & IPT_AH_INV_SPI)); +} + +static int ah_mt_check(const struct xt_mtchk_param *par) +{ + const struct ipt_ah *ahinfo = par->matchinfo; + + /* Must specify no unknown invflags */ + if (ahinfo->invflags & ~IPT_AH_INV_MASK) { + pr_debug("unknown flags %X\n", ahinfo->invflags); + return -EINVAL; + } + return 0; +} + +static struct xt_match ah_mt_reg __read_mostly = { + .name = "ah", + .family = NFPROTO_IPV4, + .match = ah_mt, + .matchsize = sizeof(struct ipt_ah), + .proto = IPPROTO_AH, + .checkentry = ah_mt_check, + .me = THIS_MODULE, +}; + +static int __init ah_mt_init(void) +{ + return xt_register_match(&ah_mt_reg); +} + +static void __exit ah_mt_exit(void) +{ + xt_unregister_match(&ah_mt_reg); +} + +module_init(ah_mt_init); +module_exit(ah_mt_exit); diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c new file mode 100644 index 00000000..2b57e52c --- /dev/null +++ b/net/ipv4/netfilter/ipt_ecn.c @@ -0,0 +1,127 @@ +/* IP tables module for matching the value of the IPv4 and TCP ECN bits + * + * (C) 2002 by Harald Welte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("Xtables: Explicit Congestion Notification (ECN) flag match for IPv4"); +MODULE_LICENSE("GPL"); + +static inline bool match_ip(const struct sk_buff *skb, + const struct ipt_ecn_info *einfo) +{ + return ((ip_hdr(skb)->tos & IPT_ECN_IP_MASK) == einfo->ip_ect) ^ + !!(einfo->invert & IPT_ECN_OP_MATCH_IP); +} + +static inline bool match_tcp(const struct sk_buff *skb, + const struct ipt_ecn_info *einfo, + bool *hotdrop) +{ + struct tcphdr _tcph; + const struct tcphdr *th; + + /* In practice, TCP match does this, so can't fail. But let's + * be good citizens. 
+ */ + th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); + if (th == NULL) { + *hotdrop = false; + return false; + } + + if (einfo->operation & IPT_ECN_OP_MATCH_ECE) { + if (einfo->invert & IPT_ECN_OP_MATCH_ECE) { + if (th->ece == 1) + return false; + } else { + if (th->ece == 0) + return false; + } + } + + if (einfo->operation & IPT_ECN_OP_MATCH_CWR) { + if (einfo->invert & IPT_ECN_OP_MATCH_CWR) { + if (th->cwr == 1) + return false; + } else { + if (th->cwr == 0) + return false; + } + } + + return true; +} + +static bool ecn_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ipt_ecn_info *info = par->matchinfo; + + if (info->operation & IPT_ECN_OP_MATCH_IP) + if (!match_ip(skb, info)) + return false; + + if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) { + if (!match_tcp(skb, info, &par->hotdrop)) + return false; + } + + return true; +} + +static int ecn_mt_check(const struct xt_mtchk_param *par) +{ + const struct ipt_ecn_info *info = par->matchinfo; + const struct ipt_ip *ip = par->entryinfo; + + if (info->operation & IPT_ECN_OP_MATCH_MASK) + return -EINVAL; + + if (info->invert & IPT_ECN_OP_MATCH_MASK) + return -EINVAL; + + if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) && + (ip->proto != IPPROTO_TCP || ip->invflags & IPT_INV_PROTO)) { + pr_info("cannot match TCP bits in rule for non-tcp packets\n"); + return -EINVAL; + } + + return 0; +} + +static struct xt_match ecn_mt_reg __read_mostly = { + .name = "ecn", + .family = NFPROTO_IPV4, + .match = ecn_mt, + .matchsize = sizeof(struct ipt_ecn_info), + .checkentry = ecn_mt_check, + .me = THIS_MODULE, +}; + +static int __init ecn_mt_init(void) +{ + return xt_register_match(&ecn_mt_reg); +} + +static void __exit ecn_mt_exit(void) +{ + xt_unregister_match(&ecn_mt_reg); +} + +module_init(ecn_mt_init); +module_exit(ecn_mt_exit); diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c new file mode 100644 index 00000000..c37641e8 --- /dev/null +++ b/net/ipv4/netfilter/iptable_filter.c @@ -0,0 +1,121 @@ +/* + * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("iptables filter table"); + +#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \ + (1 << NF_INET_FORWARD) | \ + (1 << NF_INET_LOCAL_OUT)) + +static const struct xt_table packet_filter = { + .name = "filter", + .valid_hooks = FILTER_VALID_HOOKS, + .me = THIS_MODULE, + .af = NFPROTO_IPV4, + .priority = NF_IP_PRI_FILTER, +}; + +static unsigned int +iptable_filter_hook(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + const struct net *net; + + if (hook == NF_INET_LOCAL_OUT && + (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr))) + /* root is playing with raw sockets. */ + return NF_ACCEPT; + + net = dev_net((in != NULL) ? 
in : out); + return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_filter); +} + +static struct nf_hook_ops *filter_ops __read_mostly; + +/* Default to forward because I got too much mail already. */ +static int forward = NF_ACCEPT; +module_param(forward, bool, 0000); + +static int __net_init iptable_filter_net_init(struct net *net) +{ + struct ipt_replace *repl; + + repl = ipt_alloc_initial_table(&packet_filter); + if (repl == NULL) + return -ENOMEM; + /* Entry 1 is the FORWARD hook */ + ((struct ipt_standard *)repl->entries)[1].target.verdict = + -forward - 1; + + net->ipv4.iptable_filter = + ipt_register_table(net, &packet_filter, repl); + kfree(repl); + if (IS_ERR(net->ipv4.iptable_filter)) + return PTR_ERR(net->ipv4.iptable_filter); + return 0; +} + +static void __net_exit iptable_filter_net_exit(struct net *net) +{ + ipt_unregister_table(net, net->ipv4.iptable_filter); +} + +static struct pernet_operations iptable_filter_net_ops = { + .init = iptable_filter_net_init, + .exit = iptable_filter_net_exit, +}; + +static int __init iptable_filter_init(void) +{ + int ret; + + if (forward < 0 || forward > NF_MAX_VERDICT) { + pr_err("iptables forward must be 0 or 1\n"); + return -EINVAL; + } + + ret = register_pernet_subsys(&iptable_filter_net_ops); + if (ret < 0) + return ret; + + /* Register hooks */ + filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook); + if (IS_ERR(filter_ops)) { + ret = PTR_ERR(filter_ops); + goto cleanup_table; + } + + return ret; + + cleanup_table: + unregister_pernet_subsys(&iptable_filter_net_ops); + return ret; +} + +static void __exit iptable_filter_fini(void) +{ + xt_hook_unlink(&packet_filter, filter_ops); + unregister_pernet_subsys(&iptable_filter_net_ops); +} + +module_init(iptable_filter_init); +module_exit(iptable_filter_fini); diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c new file mode 100644 index 00000000..aef5d1fb --- /dev/null +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -0,0 +1,151 @@ +/* + * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("iptables mangle table"); + +#define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \ + (1 << NF_INET_LOCAL_IN) | \ + (1 << NF_INET_FORWARD) | \ + (1 << NF_INET_LOCAL_OUT) | \ + (1 << NF_INET_POST_ROUTING)) + +static const struct xt_table packet_mangler = { + .name = "mangle", + .valid_hooks = MANGLE_VALID_HOOKS, + .me = THIS_MODULE, + .af = NFPROTO_IPV4, + .priority = NF_IP_PRI_MANGLE, +}; + +static unsigned int +ipt_mangle_out(struct sk_buff *skb, const struct net_device *out) +{ + unsigned int ret; + const struct iphdr *iph; + u_int8_t tos; + __be32 saddr, daddr; + u_int32_t mark; + + /* root is playing with raw sockets. 
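The FORWARD entry in iptable_filter_net_init() above is seeded with "-forward - 1": built-in iptables verdicts are stored inside rule entries as negative numbers offset by one, so NF_DROP (0) is kept as -1 and NF_ACCEPT (1) as -2. A tiny round-trip sketch of that encoding (standalone C, illustrative helper names):

#include <stdio.h>

#define NF_DROP   0
#define NF_ACCEPT 1

/* A built-in verdict v is stored in a standard target as -v - 1. */
static int encode_verdict(int v)  { return -v - 1; }
static int decode_verdict(int ev) { return -ev - 1; }

int main(void)
{
        int stored = encode_verdict(NF_ACCEPT);
        printf("stored=%d decoded=%d\n", stored, decode_verdict(stored)); /* -2, 1 */
        return 0;
}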
*/ + if (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr)) + return NF_ACCEPT; + + /* Save things which could affect route */ + mark = skb->mark; + iph = ip_hdr(skb); + saddr = iph->saddr; + daddr = iph->daddr; + tos = iph->tos; + + ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, + dev_net(out)->ipv4.iptable_mangle); + /* Reroute for ANY change. */ + if (ret != NF_DROP && ret != NF_STOLEN) { + iph = ip_hdr(skb); + + if (iph->saddr != saddr || + iph->daddr != daddr || + skb->mark != mark || + iph->tos != tos) + if (ip_route_me_harder(skb, RTN_UNSPEC)) + ret = NF_DROP; + } + + return ret; +} + +/* The work comes in here from netfilter.c. */ +static unsigned int +iptable_mangle_hook(unsigned int hook, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + if (hook == NF_INET_LOCAL_OUT) + return ipt_mangle_out(skb, out); + if (hook == NF_INET_POST_ROUTING) + return ipt_do_table(skb, hook, in, out, + dev_net(out)->ipv4.iptable_mangle); + /* PREROUTING/INPUT/FORWARD: */ + return ipt_do_table(skb, hook, in, out, + dev_net(in)->ipv4.iptable_mangle); +} + +static struct nf_hook_ops *mangle_ops __read_mostly; + +static int __net_init iptable_mangle_net_init(struct net *net) +{ + struct ipt_replace *repl; + + repl = ipt_alloc_initial_table(&packet_mangler); + if (repl == NULL) + return -ENOMEM; + net->ipv4.iptable_mangle = + ipt_register_table(net, &packet_mangler, repl); + kfree(repl); + if (IS_ERR(net->ipv4.iptable_mangle)) + return PTR_ERR(net->ipv4.iptable_mangle); + return 0; +} + +static void __net_exit iptable_mangle_net_exit(struct net *net) +{ + ipt_unregister_table(net, net->ipv4.iptable_mangle); +} + +static struct pernet_operations iptable_mangle_net_ops = { + .init = iptable_mangle_net_init, + .exit = iptable_mangle_net_exit, +}; + +static int __init iptable_mangle_init(void) +{ + int ret; + + ret = register_pernet_subsys(&iptable_mangle_net_ops); + if (ret < 0) + return ret; + + /* Register hooks */ + mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook); + if (IS_ERR(mangle_ops)) { + ret = PTR_ERR(mangle_ops); + goto cleanup_table; + } + + return ret; + + cleanup_table: + unregister_pernet_subsys(&iptable_mangle_net_ops); + return ret; +} + +static void __exit iptable_mangle_fini(void) +{ + xt_hook_unlink(&packet_mangler, mangle_ops); + unregister_pernet_subsys(&iptable_mangle_net_ops); +} + +module_init(iptable_mangle_init); +module_exit(iptable_mangle_fini); diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c new file mode 100644 index 00000000..07fb710c --- /dev/null +++ b/net/ipv4/netfilter/iptable_raw.c @@ -0,0 +1,96 @@ +/* + * 'raw' table, which is the very first hooked in at PRE_ROUTING and LOCAL_OUT . + * + * Copyright (C) 2003 Jozsef Kadlecsik + */ +#include +#include +#include +#include + +#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) + +static const struct xt_table packet_raw = { + .name = "raw", + .valid_hooks = RAW_VALID_HOOKS, + .me = THIS_MODULE, + .af = NFPROTO_IPV4, + .priority = NF_IP_PRI_RAW, +}; + +/* The work comes in here from netfilter.c. */ +static unsigned int +iptable_raw_hook(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + const struct net *net; + + if (hook == NF_INET_LOCAL_OUT && + (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr))) + /* root is playing with raw sockets. 
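ipt_mangle_out() above snapshots the fields that can influence routing (mark, source, destination, TOS), runs the mangle table, and calls ip_route_me_harder() only when one of them changed. The comparison pattern on its own, with a stand-in struct (all names below are illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct route_key {                  /* the fields the hook snapshots */
        uint32_t mark, saddr, daddr;
        uint8_t  tos;
};

static bool needs_reroute(const struct route_key *before,
                          const struct route_key *after)
{
        return before->mark  != after->mark  ||
               before->saddr != after->saddr ||
               before->daddr != after->daddr ||
               before->tos   != after->tos;
}

int main(void)
{
        struct route_key a = { .mark = 0, .saddr = 1, .daddr = 2, .tos = 0 };
        struct route_key b = a;
        b.tos = 0x10;                            /* the table rewrote the TOS */
        printf("%d\n", needs_reroute(&a, &b));   /* 1 -> re-route */
        return 0;
}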
*/ + return NF_ACCEPT; + + net = dev_net((in != NULL) ? in : out); + return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_raw); +} + +static struct nf_hook_ops *rawtable_ops __read_mostly; + +static int __net_init iptable_raw_net_init(struct net *net) +{ + struct ipt_replace *repl; + + repl = ipt_alloc_initial_table(&packet_raw); + if (repl == NULL) + return -ENOMEM; + net->ipv4.iptable_raw = + ipt_register_table(net, &packet_raw, repl); + kfree(repl); + if (IS_ERR(net->ipv4.iptable_raw)) + return PTR_ERR(net->ipv4.iptable_raw); + return 0; +} + +static void __net_exit iptable_raw_net_exit(struct net *net) +{ + ipt_unregister_table(net, net->ipv4.iptable_raw); +} + +static struct pernet_operations iptable_raw_net_ops = { + .init = iptable_raw_net_init, + .exit = iptable_raw_net_exit, +}; + +static int __init iptable_raw_init(void) +{ + int ret; + + ret = register_pernet_subsys(&iptable_raw_net_ops); + if (ret < 0) + return ret; + + /* Register hooks */ + rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook); + if (IS_ERR(rawtable_ops)) { + ret = PTR_ERR(rawtable_ops); + goto cleanup_table; + } + + return ret; + + cleanup_table: + unregister_pernet_subsys(&iptable_raw_net_ops); + return ret; +} + +static void __exit iptable_raw_fini(void) +{ + xt_hook_unlink(&packet_raw, rawtable_ops); + unregister_pernet_subsys(&iptable_raw_net_ops); +} + +module_init(iptable_raw_init); +module_exit(iptable_raw_fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c new file mode 100644 index 00000000..be45bdc4 --- /dev/null +++ b/net/ipv4/netfilter/iptable_security.c @@ -0,0 +1,113 @@ +/* + * "security" table + * + * This is for use by Mandatory Access Control (MAC) security models, + * which need to be able to manage security policy in separate context + * to DAC. + * + * Based on iptable_mangle.c + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2004 Netfilter Core Team netfilter.org> + * Copyright (C) 2008 Red Hat, Inc., James Morris redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris redhat.com>"); +MODULE_DESCRIPTION("iptables security table, for MAC rules"); + +#define SECURITY_VALID_HOOKS (1 << NF_INET_LOCAL_IN) | \ + (1 << NF_INET_FORWARD) | \ + (1 << NF_INET_LOCAL_OUT) + +static const struct xt_table security_table = { + .name = "security", + .valid_hooks = SECURITY_VALID_HOOKS, + .me = THIS_MODULE, + .af = NFPROTO_IPV4, + .priority = NF_IP_PRI_SECURITY, +}; + +static unsigned int +iptable_security_hook(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + const struct net *net; + + if (hook == NF_INET_LOCAL_OUT && + (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr))) + /* Somebody is playing with raw sockets. */ + return NF_ACCEPT; + + net = dev_net((in != NULL) ? 
in : out); + return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_security); +} + +static struct nf_hook_ops *sectbl_ops __read_mostly; + +static int __net_init iptable_security_net_init(struct net *net) +{ + struct ipt_replace *repl; + + repl = ipt_alloc_initial_table(&security_table); + if (repl == NULL) + return -ENOMEM; + net->ipv4.iptable_security = + ipt_register_table(net, &security_table, repl); + kfree(repl); + if (IS_ERR(net->ipv4.iptable_security)) + return PTR_ERR(net->ipv4.iptable_security); + + return 0; +} + +static void __net_exit iptable_security_net_exit(struct net *net) +{ + ipt_unregister_table(net, net->ipv4.iptable_security); +} + +static struct pernet_operations iptable_security_net_ops = { + .init = iptable_security_net_init, + .exit = iptable_security_net_exit, +}; + +static int __init iptable_security_init(void) +{ + int ret; + + ret = register_pernet_subsys(&iptable_security_net_ops); + if (ret < 0) + return ret; + + sectbl_ops = xt_hook_link(&security_table, iptable_security_hook); + if (IS_ERR(sectbl_ops)) { + ret = PTR_ERR(sectbl_ops); + goto cleanup_table; + } + + return ret; + +cleanup_table: + unregister_pernet_subsys(&iptable_security_net_ops); + return ret; +} + +static void __exit iptable_security_fini(void) +{ + xt_hook_unlink(&security_table, sectbl_ops); + unregister_pernet_subsys(&iptable_security_net_ops); +} + +module_init(iptable_security_init); +module_exit(iptable_security_fini); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c new file mode 100644 index 00000000..de9da211 --- /dev/null +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -0,0 +1,459 @@ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo); +EXPORT_SYMBOL_GPL(nf_nat_seq_adjust_hook); + +static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, + struct nf_conntrack_tuple *tuple) +{ + const __be32 *ap; + __be32 _addrs[2]; + ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr), + sizeof(u_int32_t) * 2, _addrs); + if (ap == NULL) + return false; + + tuple->src.u3.ip = ap[0]; + tuple->dst.u3.ip = ap[1]; + + return true; +} + +static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u3.ip = orig->dst.u3.ip; + tuple->dst.u3.ip = orig->src.u3.ip; + + return true; +} + +static int ipv4_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "src=%pI4 dst=%pI4 ", + &tuple->src.u3.ip, &tuple->dst.u3.ip); +} + +static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, + unsigned int *dataoff, u_int8_t *protonum) +{ + const struct iphdr *iph; + struct iphdr _iph; + + iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); + if (iph == NULL) + return -NF_DROP; + + /* Conntrack defragments packets, we might still see fragments + * inside ICMP packets though. 
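ipv4_get_l4proto() above computes where the transport header starts from the IHL field: the header length is given in 32-bit words, so the payload begins ihl * 4 bytes after the start of the IP header. The arithmetic in isolation (illustrative function name):

#include <stdint.h>
#include <stdio.h>

/* nhoff is the offset of the IP header inside the buffer; ihl counts
 * 32-bit words, so shift left by 2 to get bytes. */
static unsigned int l4_offset(unsigned int nhoff, uint8_t ihl)
{
        return nhoff + (ihl << 2);
}

int main(void)
{
        printf("%u\n", l4_offset(14, 5));   /* Ethernet + minimal IP header = 34 */
        printf("%u\n", l4_offset(14, 6));   /* one 4-byte IP option -> 38 */
        return 0;
}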
*/ + if (iph->frag_off & htons(IP_OFFSET)) + return -NF_DROP; + + *dataoff = nhoff + (iph->ihl << 2); + *protonum = iph->protocol; + + return NF_ACCEPT; +} + +static unsigned int ipv4_confirm(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + const struct nf_conn_help *help; + const struct nf_conntrack_helper *helper; + unsigned int ret; + + /* This is where we call the helper: as the packet goes out. */ + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + goto out; + + help = nfct_help(ct); + if (!help) + goto out; + + /* rcu_read_lock()ed by nf_hook_slow */ + helper = rcu_dereference(help->helper); + if (!helper) + goto out; + + ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), + ct, ctinfo); + if (ret != NF_ACCEPT) { + nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL, + "nf_ct_%s: dropping packet", helper->name); + return ret; + } + + /* adjust seqs for loopback traffic only in outgoing direction */ + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && + !nf_is_loopback_packet(skb)) { + typeof(nf_nat_seq_adjust_hook) seq_adjust; + + seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); + if (!seq_adjust || !seq_adjust(skb, ct, ctinfo)) { + NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); + return NF_DROP; + } + } +out: + /* We've seen it coming out the other side: confirm it */ + return nf_conntrack_confirm(skb); +} + +static unsigned int ipv4_conntrack_in(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return nf_conntrack_in(dev_net(in), PF_INET, hooknum, skb); +} + +static unsigned int ipv4_conntrack_local(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* root is playing with raw sockets. */ + if (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr)) + return NF_ACCEPT; + return nf_conntrack_in(dev_net(out), PF_INET, hooknum, skb); +} + +/* Connection tracking may drop packets, but never alters them, so + make it the first hook. 
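Several LOCAL_OUT hooks in this patch (filter, mangle, raw, security, and ipv4_conntrack_local() above) start with the same sanity check: if the buffer cannot even hold a 20-byte IPv4 header, or its IHL claims fewer than 5 words, the packet came from a raw socket and is accepted untouched. The check by itself, with an illustrative constant in place of sizeof(struct iphdr):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define MIN_IPV4_HDR 20u    /* IPv4 header without options */

static bool holds_ipv4_header(const uint8_t *buf, size_t len)
{
        if (len < MIN_IPV4_HDR)
                return false;                            /* too short for any IP header */
        return ((buf[0] & 0x0fu) * 4u) >= MIN_IPV4_HDR;  /* ihl must be >= 5 words */
}

int main(void)
{
        uint8_t pkt[40] = { 0x45 };              /* version 4, ihl 5 */
        printf("%d\n", holds_ipv4_header(pkt, sizeof(pkt))); /* 1 */
        printf("%d\n", holds_ipv4_header(pkt, 10));          /* 0: raw-socket case */
        return 0;
}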
*/ +static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { + { + .hook = ipv4_conntrack_in, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK, + }, + { + .hook = ipv4_conntrack_local, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP_PRI_CONNTRACK, + }, + { + .hook = ipv4_confirm, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM, + }, + { + .hook = ipv4_confirm, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM, + }, +}; + +#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) +static int log_invalid_proto_min = 0; +static int log_invalid_proto_max = 255; + +static ctl_table ip_ct_sysctl_table[] = { + { + .procname = "ip_conntrack_max", + .data = &nf_conntrack_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "ip_conntrack_count", + .data = &init_net.ct.count, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "ip_conntrack_buckets", + .data = &init_net.ct.htable_size, + .maxlen = sizeof(unsigned int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "ip_conntrack_checksum", + .data = &init_net.ct.sysctl_checksum, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "ip_conntrack_log_invalid", + .data = &init_net.ct.sysctl_log_invalid, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &log_invalid_proto_min, + .extra2 = &log_invalid_proto_max, + }, + { } +}; +#endif /* CONFIG_SYSCTL && CONFIG_NF_CONNTRACK_PROC_COMPAT */ + +/* Fast function for those who don't want to parse /proc (and I don't + blame them). */ +/* Reversing the socket's dst/src point of view gives us the reply + mapping. */ +static int +getorigdst(struct sock *sk, int optval, void __user *user, int *len) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + + memset(&tuple, 0, sizeof(tuple)); + tuple.src.u3.ip = inet->inet_rcv_saddr; + tuple.src.u.tcp.port = inet->inet_sport; + tuple.dst.u3.ip = inet->inet_daddr; + tuple.dst.u.tcp.port = inet->inet_dport; + tuple.src.l3num = PF_INET; + tuple.dst.protonum = sk->sk_protocol; + + /* We only do TCP and SCTP at the moment: is there a better way? 
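The ipv4_conntrack_ops[] array above relies on hooks at the same hook point running in ascending priority order: defragmentation comes first (most negative), then conntrack, then the mangle and filter tables, and conntrack confirmation runs last with the largest priority. A sketch of that ordering rule; the numeric values are meant to mirror the NF_IP_PRI_* constants but should be treated as illustrative:

#include <stdio.h>
#include <stdlib.h>

struct hook { const char *name; int priority; };

static int by_priority(const void *a, const void *b)
{
        int pa = ((const struct hook *)a)->priority;
        int pb = ((const struct hook *)b)->priority;
        return (pa > pb) - (pa < pb);    /* avoids overflow of pa - pb */
}

int main(void)
{
        struct hook h[] = {
                { "filter",                      0 },
                { "conntrack",                -200 },
                { "conntrack_defrag",         -400 },
                { "mangle",                   -150 },
                { "conntrack_confirm", 2147483647 },
        };
        int n = (int)(sizeof(h) / sizeof(h[0]));

        qsort(h, (size_t)n, sizeof(h[0]), by_priority);
        for (int i = 0; i < n; i++)
                printf("%s\n", h[i].name);   /* defrag, conntrack, mangle, filter, confirm */
        return 0;
}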
*/ + if (sk->sk_protocol != IPPROTO_TCP && sk->sk_protocol != IPPROTO_SCTP) { + pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n"); + return -ENOPROTOOPT; + } + + if ((unsigned int) *len < sizeof(struct sockaddr_in)) { + pr_debug("SO_ORIGINAL_DST: len %d not %Zu\n", + *len, sizeof(struct sockaddr_in)); + return -EINVAL; + } + + h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple); + if (h) { + struct sockaddr_in sin; + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + sin.sin_family = AF_INET; + sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u.tcp.port; + sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u3.ip; + memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); + + pr_debug("SO_ORIGINAL_DST: %pI4 %u\n", + &sin.sin_addr.s_addr, ntohs(sin.sin_port)); + nf_ct_put(ct); + if (copy_to_user(user, &sin, sizeof(sin)) != 0) + return -EFAULT; + else + return 0; + } + pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n", + &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port), + &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port)); + return -ENOENT; +} + +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include +#include + +static int ipv4_tuple_to_nlattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) +{ + NLA_PUT_BE32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip); + NLA_PUT_BE32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip); + return 0; + +nla_put_failure: + return -1; +} + +static const struct nla_policy ipv4_nla_policy[CTA_IP_MAX+1] = { + [CTA_IP_V4_SRC] = { .type = NLA_U32 }, + [CTA_IP_V4_DST] = { .type = NLA_U32 }, +}; + +static int ipv4_nlattr_to_tuple(struct nlattr *tb[], + struct nf_conntrack_tuple *t) +{ + if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST]) + return -EINVAL; + + t->src.u3.ip = nla_get_be32(tb[CTA_IP_V4_SRC]); + t->dst.u3.ip = nla_get_be32(tb[CTA_IP_V4_DST]); + + return 0; +} + +static int ipv4_nlattr_tuple_size(void) +{ + return nla_policy_len(ipv4_nla_policy, CTA_IP_MAX + 1); +} +#endif + +static struct nf_sockopt_ops so_getorigdst = { + .pf = PF_INET, + .get_optmin = SO_ORIGINAL_DST, + .get_optmax = SO_ORIGINAL_DST+1, + .get = &getorigdst, + .owner = THIS_MODULE, +}; + +struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { + .l3proto = PF_INET, + .name = "ipv4", + .pkt_to_tuple = ipv4_pkt_to_tuple, + .invert_tuple = ipv4_invert_tuple, + .print_tuple = ipv4_print_tuple, + .get_l4proto = ipv4_get_l4proto, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nlattr = ipv4_tuple_to_nlattr, + .nlattr_tuple_size = ipv4_nlattr_tuple_size, + .nlattr_to_tuple = ipv4_nlattr_to_tuple, + .nla_policy = ipv4_nla_policy, +#endif +#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) + .ctl_table_path = nf_net_ipv4_netfilter_sysctl_path, + .ctl_table = ip_ct_sysctl_table, +#endif + .me = THIS_MODULE, +}; + +module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, + &nf_conntrack_htable_size, 0600); + +MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); +MODULE_ALIAS("ip_conntrack"); +MODULE_LICENSE("GPL"); + +static int __init nf_conntrack_l3proto_ipv4_init(void) +{ + int ret = 0; + + need_conntrack(); + nf_defrag_ipv4_enable(); + + ret = nf_register_sockopt(&so_getorigdst); + if (ret < 0) { + printk(KERN_ERR "Unable to register netfilter socket option\n"); + return ret; + } + + ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4); + if (ret < 0) { + pr_err("nf_conntrack_ipv4: can't register tcp.\n"); + goto cleanup_sockopt; + } + 
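getorigdst() above is what answers the SO_ORIGINAL_DST socket option: a transparent proxy that accepted a redirected TCP connection can ask conntrack for the pre-NAT destination. A minimal userspace sketch of that call (error handling and listener setup are omitted; fd is assumed to be an already-accepted IPv4 TCP socket):

#include <arpa/inet.h>
#include <linux/netfilter_ipv4.h>   /* SO_ORIGINAL_DST */
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

static int print_original_dst(int fd)
{
        struct sockaddr_in orig;
        socklen_t len = sizeof(orig);
        char buf[INET_ADDRSTRLEN];

        /* Served by getorigdst(): the original (pre-NAT) destination. */
        if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &orig, &len) < 0) {
                perror("SO_ORIGINAL_DST");
                return -1;
        }
        printf("original destination: %s:%u\n",
               inet_ntop(AF_INET, &orig.sin_addr, buf, sizeof(buf)),
               ntohs(orig.sin_port));
        return 0;
}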
+ ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4); + if (ret < 0) { + pr_err("nf_conntrack_ipv4: can't register udp.\n"); + goto cleanup_tcp; + } + + ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp); + if (ret < 0) { + pr_err("nf_conntrack_ipv4: can't register icmp.\n"); + goto cleanup_udp; + } + + ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4); + if (ret < 0) { + pr_err("nf_conntrack_ipv4: can't register ipv4\n"); + goto cleanup_icmp; + } + + ret = nf_register_hooks(ipv4_conntrack_ops, + ARRAY_SIZE(ipv4_conntrack_ops)); + if (ret < 0) { + pr_err("nf_conntrack_ipv4: can't register hooks.\n"); + goto cleanup_ipv4; + } +#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) + ret = nf_conntrack_ipv4_compat_init(); + if (ret < 0) + goto cleanup_hooks; +#endif + return ret; +#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) + cleanup_hooks: + nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); +#endif + cleanup_ipv4: + nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); + cleanup_icmp: + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp); + cleanup_udp: + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4); + cleanup_tcp: + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4); + cleanup_sockopt: + nf_unregister_sockopt(&so_getorigdst); + return ret; +} + +static void __exit nf_conntrack_l3proto_ipv4_fini(void) +{ + synchronize_net(); +#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) + nf_conntrack_ipv4_compat_fini(); +#endif + nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); + nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp); + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4); + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4); + nf_unregister_sockopt(&so_getorigdst); +} + +module_init(nf_conntrack_l3proto_ipv4_init); +module_exit(nf_conntrack_l3proto_ipv4_fini); + +void need_ipv4_conntrack(void) +{ + return; +} +EXPORT_SYMBOL_GPL(need_ipv4_conntrack); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c new file mode 100644 index 00000000..5585980f --- /dev/null +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -0,0 +1,462 @@ +/* ip_conntrack proc compat - based on ip_conntrack_standalone.c + * + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
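nf_conntrack_l3proto_ipv4_init() above follows the usual kernel registration pattern: register each piece in order, and on failure jump to a label that unregisters everything already set up, in exactly the reverse order. The shape of that pattern reduced to a standalone sketch with dummy steps (all names below are illustrative):

#include <stdio.h>

static int step(const char *name, int fail)
{
        printf("register %s\n", name);
        return fail ? -1 : 0;
}

static void undo(const char *name)
{
        printf("unregister %s\n", name);
}

static int init_all(void)
{
        int ret;

        if ((ret = step("sockopt", 0)))
                return ret;
        if ((ret = step("tcp", 0)))
                goto cleanup_sockopt;
        if ((ret = step("udp", 0)))
                goto cleanup_tcp;
        if ((ret = step("icmp", 1)))        /* simulate a failure here */
                goto cleanup_udp;
        return 0;

cleanup_udp:
        undo("udp");
cleanup_tcp:
        undo("tcp");
cleanup_sockopt:
        undo("sockopt");
        return ret;
}

int main(void)
{
        return init_all() ? 1 : 0;
}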
+ */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +struct ct_iter_state { + struct seq_net_private p; + unsigned int bucket; +}; + +static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) +{ + struct net *net = seq_file_net(seq); + struct ct_iter_state *st = seq->private; + struct hlist_nulls_node *n; + + for (st->bucket = 0; + st->bucket < net->ct.htable_size; + st->bucket++) { + n = rcu_dereference( + hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); + if (!is_a_nulls(n)) + return n; + } + return NULL; +} + +static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, + struct hlist_nulls_node *head) +{ + struct net *net = seq_file_net(seq); + struct ct_iter_state *st = seq->private; + + head = rcu_dereference(hlist_nulls_next_rcu(head)); + while (is_a_nulls(head)) { + if (likely(get_nulls_value(head) == st->bucket)) { + if (++st->bucket >= net->ct.htable_size) + return NULL; + } + head = rcu_dereference( + hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); + } + return head; +} + +static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos) +{ + struct hlist_nulls_node *head = ct_get_first(seq); + + if (head) + while (pos && (head = ct_get_next(seq, head))) + pos--; + return pos ? NULL : head; +} + +static void *ct_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + rcu_read_lock(); + return ct_get_idx(seq, *pos); +} + +static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + return ct_get_next(s, v); +} + +static void ct_seq_stop(struct seq_file *s, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + +#ifdef CONFIG_NF_CONNTRACK_SECMARK +static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) +{ + int ret; + u32 len; + char *secctx; + + ret = security_secid_to_secctx(ct->secmark, &secctx, &len); + if (ret) + return 0; + + ret = seq_printf(s, "secctx=%s ", secctx); + + security_release_secctx(secctx, len); + return ret; +} +#else +static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) +{ + return 0; +} +#endif + +static int ct_seq_show(struct seq_file *s, void *v) +{ + struct nf_conntrack_tuple_hash *hash = v; + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash); + const struct nf_conntrack_l3proto *l3proto; + const struct nf_conntrack_l4proto *l4proto; + int ret = 0; + + NF_CT_ASSERT(ct); + if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) + return 0; + + + /* we only want to print DIR_ORIGINAL */ + if (NF_CT_DIRECTION(hash)) + goto release; + if (nf_ct_l3num(ct) != AF_INET) + goto release; + + l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); + NF_CT_ASSERT(l3proto); + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + NF_CT_ASSERT(l4proto); + + ret = -ENOSPC; + if (seq_printf(s, "%-8s %u %ld ", + l4proto->name, nf_ct_protonum(ct), + timer_pending(&ct->timeout) + ? 
(long)(ct->timeout.expires - jiffies)/HZ : 0) != 0) + goto release; + + if (l4proto->print_conntrack && l4proto->print_conntrack(s, ct)) + goto release; + + if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + l3proto, l4proto)) + goto release; + + if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL)) + goto release; + + if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) + if (seq_printf(s, "[UNREPLIED] ")) + goto release; + + if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + l3proto, l4proto)) + goto release; + + if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) + goto release; + + if (test_bit(IPS_ASSURED_BIT, &ct->status)) + if (seq_printf(s, "[ASSURED] ")) + goto release; + +#ifdef CONFIG_NF_CONNTRACK_MARK + if (seq_printf(s, "mark=%u ", ct->mark)) + goto release; +#endif + + if (ct_show_secctx(s, ct)) + goto release; + + if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) + goto release; + ret = 0; +release: + nf_ct_put(ct); + return ret; +} + +static const struct seq_operations ct_seq_ops = { + .start = ct_seq_start, + .next = ct_seq_next, + .stop = ct_seq_stop, + .show = ct_seq_show +}; + +static int ct_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ct_seq_ops, + sizeof(struct ct_iter_state)); +} + +static const struct file_operations ct_file_ops = { + .owner = THIS_MODULE, + .open = ct_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +/* expects */ +struct ct_expect_iter_state { + struct seq_net_private p; + unsigned int bucket; +}; + +static struct hlist_node *ct_expect_get_first(struct seq_file *seq) +{ + struct net *net = seq_file_net(seq); + struct ct_expect_iter_state *st = seq->private; + struct hlist_node *n; + + for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { + n = rcu_dereference( + hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + if (n) + return n; + } + return NULL; +} + +static struct hlist_node *ct_expect_get_next(struct seq_file *seq, + struct hlist_node *head) +{ + struct net *net = seq_file_net(seq); + struct ct_expect_iter_state *st = seq->private; + + head = rcu_dereference(hlist_next_rcu(head)); + while (head == NULL) { + if (++st->bucket >= nf_ct_expect_hsize) + return NULL; + head = rcu_dereference( + hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + } + return head; +} + +static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos) +{ + struct hlist_node *head = ct_expect_get_first(seq); + + if (head) + while (pos && (head = ct_expect_get_next(seq, head))) + pos--; + return pos ? NULL : head; +} + +static void *exp_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + rcu_read_lock(); + return ct_expect_get_idx(seq, *pos); +} + +static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + (*pos)++; + return ct_expect_get_next(seq, v); +} + +static void exp_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + +static int exp_seq_show(struct seq_file *s, void *v) +{ + struct nf_conntrack_expect *exp; + const struct hlist_node *n = v; + + exp = hlist_entry(n, struct nf_conntrack_expect, hnode); + + if (exp->tuple.src.l3num != AF_INET) + return 0; + + if (exp->timeout.function) + seq_printf(s, "%ld ", timer_pending(&exp->timeout) + ? 
(long)(exp->timeout.expires - jiffies)/HZ : 0); + else + seq_printf(s, "- "); + + seq_printf(s, "proto=%u ", exp->tuple.dst.protonum); + + print_tuple(s, &exp->tuple, + __nf_ct_l3proto_find(exp->tuple.src.l3num), + __nf_ct_l4proto_find(exp->tuple.src.l3num, + exp->tuple.dst.protonum)); + return seq_putc(s, '\n'); +} + +static const struct seq_operations exp_seq_ops = { + .start = exp_seq_start, + .next = exp_seq_next, + .stop = exp_seq_stop, + .show = exp_seq_show +}; + +static int exp_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &exp_seq_ops, + sizeof(struct ct_expect_iter_state)); +} + +static const struct file_operations ip_exp_file_ops = { + .owner = THIS_MODULE, + .open = exp_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct net *net = seq_file_net(seq); + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return per_cpu_ptr(net->ct.stat, cpu); + } + + return NULL; +} + +static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct net *net = seq_file_net(seq); + int cpu; + + for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return per_cpu_ptr(net->ct.stat, cpu); + } + + return NULL; +} + +static void ct_cpu_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int ct_cpu_seq_show(struct seq_file *seq, void *v) +{ + struct net *net = seq_file_net(seq); + unsigned int nr_conntracks = atomic_read(&net->ct.count); + const struct ip_conntrack_stat *st = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); + return 0; + } + + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " + "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", + nr_conntracks, + st->searched, + st->found, + st->new, + st->invalid, + st->ignore, + st->delete, + st->delete_list, + st->insert, + st->insert_failed, + st->drop, + st->early_drop, + st->error, + + st->expect_new, + st->expect_create, + st->expect_delete, + st->search_restart + ); + return 0; +} + +static const struct seq_operations ct_cpu_seq_ops = { + .start = ct_cpu_seq_start, + .next = ct_cpu_seq_next, + .stop = ct_cpu_seq_stop, + .show = ct_cpu_seq_show, +}; + +static int ct_cpu_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ct_cpu_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations ct_cpu_seq_fops = { + .owner = THIS_MODULE, + .open = ct_cpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int __net_init ip_conntrack_net_init(struct net *net) +{ + struct proc_dir_entry *proc, *proc_exp, *proc_stat; + + proc = proc_net_fops_create(net, "ip_conntrack", 0440, &ct_file_ops); + if (!proc) + goto err1; + + proc_exp = proc_net_fops_create(net, "ip_conntrack_expect", 0440, + &ip_exp_file_ops); + if (!proc_exp) + goto err2; + + proc_stat = proc_create("ip_conntrack", S_IRUGO, + net->proc_net_stat, &ct_cpu_seq_fops); + if (!proc_stat) + goto err3; + return 0; + +err3: + proc_net_remove(net, "ip_conntrack_expect"); +err2: + proc_net_remove(net, "ip_conntrack"); +err1: + return -ENOMEM; +} + +static void __net_exit ip_conntrack_net_exit(struct net 
*net) +{ + remove_proc_entry("ip_conntrack", net->proc_net_stat); + proc_net_remove(net, "ip_conntrack_expect"); + proc_net_remove(net, "ip_conntrack"); +} + +static struct pernet_operations ip_conntrack_net_ops = { + .init = ip_conntrack_net_init, + .exit = ip_conntrack_net_exit, +}; + +int __init nf_conntrack_ipv4_compat_init(void) +{ + return register_pernet_subsys(&ip_conntrack_net_ops); +} + +void __exit nf_conntrack_ipv4_compat_fini(void) +{ + unregister_pernet_subsys(&ip_conntrack_net_ops); +} diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c new file mode 100644 index 00000000..ab5b27a2 --- /dev/null +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -0,0 +1,318 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ; + +static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + const struct icmphdr *hp; + struct icmphdr _hdr; + + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return false; + + tuple->dst.u.icmp.type = hp->type; + tuple->src.u.icmp.id = hp->un.echo.id; + tuple->dst.u.icmp.code = hp->code; + + return true; +} + +/* Add 1; spaces filled with 0. */ +static const u_int8_t invmap[] = { + [ICMP_ECHO] = ICMP_ECHOREPLY + 1, + [ICMP_ECHOREPLY] = ICMP_ECHO + 1, + [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, + [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, + [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, + [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, + [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, + [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1 +}; + +static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + if (orig->dst.u.icmp.type >= sizeof(invmap) || + !invmap[orig->dst.u.icmp.type]) + return false; + + tuple->src.u.icmp.id = orig->src.u.icmp.id; + tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1; + tuple->dst.u.icmp.code = orig->dst.u.icmp.code; + return true; +} + +/* Print out the per-protocol part of the tuple. */ +static int icmp_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "type=%u code=%u id=%u ", + tuple->dst.u.icmp.type, + tuple->dst.u.icmp.code, + ntohs(tuple->src.u.icmp.id)); +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int icmp_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + u_int8_t pf, + unsigned int hooknum) +{ + /* Do not immediately delete the connection after the first + successful reply to avoid excessive conntrackd traffic + and also to handle correctly ICMP echo reply duplicates. */ + nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. 
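The invmap[] table in nf_conntrack_proto_icmp.c above stores "reply type + 1" so that a zero entry can mean "no inverse known"; lookups check for zero and then subtract one. A standalone sketch of that encoding, using the standard ICMP type numbers (echo = 8, echo reply = 0, destination unreachable = 3):

#include <stdint.h>
#include <stdio.h>

#define ICMP_ECHOREPLY 0
#define ICMP_ECHO      8

/* Entries hold "inverse type + 1"; 0 means no request/reply pairing. */
static const uint8_t invmap[19] = {
        [ICMP_ECHO]      = ICMP_ECHOREPLY + 1,
        [ICMP_ECHOREPLY] = ICMP_ECHO + 1,
};

static int invert_type(uint8_t type)
{
        if (type >= sizeof(invmap) || !invmap[type])
                return -1;                  /* e.g. errors like dest-unreachable */
        return invmap[type] - 1;
}

int main(void)
{
        printf("%d\n", invert_type(ICMP_ECHO));  /* 0: echo reply */
        printf("%d\n", invert_type(3));          /* -1: no inverse */
        return 0;
}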
*/ +static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff) +{ + static const u_int8_t valid_new[] = { + [ICMP_ECHO] = 1, + [ICMP_TIMESTAMP] = 1, + [ICMP_INFO_REQUEST] = 1, + [ICMP_ADDRESS] = 1 + }; + + if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || + !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) { + /* Can't create a new ICMP `conn' with this. */ + pr_debug("icmp: can't create new conn with type %u\n", + ct->tuplehash[0].tuple.dst.u.icmp.type); + nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple); + return false; + } + return true; +} + +/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ +static int +icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, + enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct nf_conntrack_tuple innertuple, origtuple; + const struct nf_conntrack_l4proto *innerproto; + const struct nf_conntrack_tuple_hash *h; + u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; + + NF_CT_ASSERT(skb->nfct == NULL); + + /* Are they talking about one of our connections? */ + if (!nf_ct_get_tuplepr(skb, + skb_network_offset(skb) + ip_hdrlen(skb) + + sizeof(struct icmphdr), + PF_INET, &origtuple)) { + pr_debug("icmp_error_message: failed to get tuple\n"); + return -NF_ACCEPT; + } + + /* rcu_read_lock()ed by nf_hook_slow */ + innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum); + + /* Ordinarily, we'd expect the inverted tupleproto, but it's + been preserved inside the ICMP. */ + if (!nf_ct_invert_tuple(&innertuple, &origtuple, + &nf_conntrack_l3proto_ipv4, innerproto)) { + pr_debug("icmp_error_message: no match\n"); + return -NF_ACCEPT; + } + + *ctinfo = IP_CT_RELATED; + + h = nf_conntrack_find_get(net, zone, &innertuple); + if (!h) { + pr_debug("icmp_error_message: no match\n"); + return -NF_ACCEPT; + } + + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + *ctinfo += IP_CT_IS_REPLY; + + /* Update skb to refer to this connection */ + skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; + skb->nfctinfo = *ctinfo; + return NF_ACCEPT; +} + +/* Small and modified version of icmp_rcv */ +static int +icmp_error(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, unsigned int dataoff, + enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) +{ + const struct icmphdr *icmph; + struct icmphdr _ih; + + /* Not enough header? */ + icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); + if (icmph == NULL) { + if (LOG_INVALID(net, IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, + "nf_ct_icmp: short packet "); + return -NF_ACCEPT; + } + + /* See ip_conntrack_proto_tcp.c */ + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && + nf_ip_checksum(skb, hooknum, dataoff, 0)) { + if (LOG_INVALID(net, IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, + "nf_ct_icmp: bad HW ICMP checksum "); + return -NF_ACCEPT; + } + + /* + * 18 is the highest 'known' ICMP type. Anything else is a mystery + * + * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently + * discarded. + */ + if (icmph->type > NR_ICMP_TYPES) { + if (LOG_INVALID(net, IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, + "nf_ct_icmp: invalid ICMP type "); + return -NF_ACCEPT; + } + + /* Need to track icmp error message? 
*/ + if (icmph->type != ICMP_DEST_UNREACH && + icmph->type != ICMP_SOURCE_QUENCH && + icmph->type != ICMP_TIME_EXCEEDED && + icmph->type != ICMP_PARAMETERPROB && + icmph->type != ICMP_REDIRECT) + return NF_ACCEPT; + + return icmp_error_message(net, tmpl, skb, ctinfo, hooknum); +} + +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include +#include + +static int icmp_tuple_to_nlattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *t) +{ + NLA_PUT_BE16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id); + NLA_PUT_U8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type); + NLA_PUT_U8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code); + + return 0; + +nla_put_failure: + return -1; +} + +static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = { + [CTA_PROTO_ICMP_TYPE] = { .type = NLA_U8 }, + [CTA_PROTO_ICMP_CODE] = { .type = NLA_U8 }, + [CTA_PROTO_ICMP_ID] = { .type = NLA_U16 }, +}; + +static int icmp_nlattr_to_tuple(struct nlattr *tb[], + struct nf_conntrack_tuple *tuple) +{ + if (!tb[CTA_PROTO_ICMP_TYPE] || + !tb[CTA_PROTO_ICMP_CODE] || + !tb[CTA_PROTO_ICMP_ID]) + return -EINVAL; + + tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); + tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); + tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); + + if (tuple->dst.u.icmp.type >= sizeof(invmap) || + !invmap[tuple->dst.u.icmp.type]) + return -EINVAL; + + return 0; +} + +static int icmp_nlattr_tuple_size(void) +{ + return nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1); +} +#endif + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *icmp_sysctl_header; +static struct ctl_table icmp_sysctl_table[] = { + { + .procname = "nf_conntrack_icmp_timeout", + .data = &nf_ct_icmp_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT +static struct ctl_table icmp_compat_sysctl_table[] = { + { + .procname = "ip_conntrack_icmp_timeout", + .data = &nf_ct_icmp_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ +#endif /* CONFIG_SYSCTL */ + +struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = +{ + .l3proto = PF_INET, + .l4proto = IPPROTO_ICMP, + .name = "icmp", + .pkt_to_tuple = icmp_pkt_to_tuple, + .invert_tuple = icmp_invert_tuple, + .print_tuple = icmp_print_tuple, + .packet = icmp_packet, + .new = icmp_new, + .error = icmp_error, + .destroy = NULL, + .me = NULL, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nlattr = icmp_tuple_to_nlattr, + .nlattr_tuple_size = icmp_nlattr_tuple_size, + .nlattr_to_tuple = icmp_nlattr_to_tuple, + .nla_policy = icmp_nla_policy, +#endif +#ifdef CONFIG_SYSCTL + .ctl_table_header = &icmp_sysctl_header, + .ctl_table = icmp_sysctl_table, +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + .ctl_compat_table = icmp_compat_sysctl_table, +#endif +#endif +}; diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c new file mode 100644 index 00000000..f3a9b42b --- /dev/null +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -0,0 +1,128 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#include +#endif +#include + +/* Returns new sk_buff, or NULL */ +static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) +{ + int err; + + skb_orphan(skb); + + local_bh_disable(); + err = ip_defrag(skb, user); + local_bh_enable(); + + if (!err) + ip_send_check(ip_hdr(skb)); + + return err; +} + +static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, + struct sk_buff *skb) +{ + u16 zone = NF_CT_DEFAULT_ZONE; + +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + if (skb->nfct) + zone = nf_ct_zone((struct nf_conn *)skb->nfct); +#endif + +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge && + skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) + return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone; +#endif + if (hooknum == NF_INET_PRE_ROUTING) + return IP_DEFRAG_CONNTRACK_IN + zone; + else + return IP_DEFRAG_CONNTRACK_OUT + zone; +} + +static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(skb->sk); + + if (sk && (sk->sk_family == PF_INET) && + inet->nodefrag) + return NF_ACCEPT; + +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE) + /* Previously seen (loopback)? Ignore. Do this before + fragment check. */ + if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) + return NF_ACCEPT; +#endif +#endif + /* Gather fragments. */ + if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { + enum ip_defrag_users user = nf_ct_defrag_user(hooknum, skb); + if (nf_ct_ipv4_gather_frags(skb, user)) + return NF_STOLEN; + } + return NF_ACCEPT; +} + +static struct nf_hook_ops ipv4_defrag_ops[] = { + { + .hook = ipv4_conntrack_defrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_DEFRAG, + }, + { + .hook = ipv4_conntrack_defrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP_PRI_CONNTRACK_DEFRAG, + }, +}; + +static int __init nf_defrag_init(void) +{ + return nf_register_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); +} + +static void __exit nf_defrag_fini(void) +{ + nf_unregister_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); +} + +void nf_defrag_ipv4_enable(void) +{ +} +EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable); + +module_init(nf_defrag_init); +module_exit(nf_defrag_fini); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c new file mode 100644 index 00000000..703f366f --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_amanda.c @@ -0,0 +1,85 @@ +/* Amanda extension for TCP NAT alteration. + * (C) 2002 by Brian J. Murrell + * based on a copy of HW's ip_nat_irc.c as well as other modules + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Brian J. 
Murrell "); +MODULE_DESCRIPTION("Amanda NAT helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ip_nat_amanda"); + +static unsigned int help(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp) +{ + char buffer[sizeof("65535")]; + u_int16_t port; + unsigned int ret; + + /* Connection comes from client. */ + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->dir = IP_CT_DIR_ORIGINAL; + + /* When you see the packet, we need to NAT it the same as the + * this one (ie. same IP: it will be TCP and master is UDP). */ + exp->expectfn = nf_nat_follow_master; + + /* Try to get same port: if not, try to change it. */ + for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { + int res; + + exp->tuple.dst.u.tcp.port = htons(port); + res = nf_ct_expect_related(exp); + if (res == 0) + break; + else if (res != -EBUSY) { + port = 0; + break; + } + } + + if (port == 0) + return NF_DROP; + + sprintf(buffer, "%u", port); + ret = nf_nat_mangle_udp_packet(skb, exp->master, ctinfo, + matchoff, matchlen, + buffer, strlen(buffer)); + if (ret != NF_ACCEPT) + nf_ct_unexpect_related(exp); + return ret; +} + +static void __exit nf_nat_amanda_fini(void) +{ + rcu_assign_pointer(nf_nat_amanda_hook, NULL); + synchronize_rcu(); +} + +static int __init nf_nat_amanda_init(void) +{ + BUG_ON(nf_nat_amanda_hook != NULL); + rcu_assign_pointer(nf_nat_amanda_hook, help); + return 0; +} + +module_init(nf_nat_amanda_init); +module_exit(nf_nat_amanda_fini); diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c new file mode 100644 index 00000000..3346de5d --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -0,0 +1,779 @@ +/* NAT for netfilter; shared with compatibility layer. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For tcp_prot in getorigdst */ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(nf_nat_lock); + +static struct nf_conntrack_l3proto *l3proto __read_mostly; + +#define MAX_IP_NAT_PROTO 256 +static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO] + __read_mostly; + +static inline const struct nf_nat_protocol * +__nf_nat_proto_find(u_int8_t protonum) +{ + return rcu_dereference(nf_nat_protos[protonum]); +} + +/* We keep an extra hash for each conntrack, for fast searching. */ +static inline unsigned int +hash_by_src(const struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple) +{ + unsigned int hash; + + /* Original src, to ensure we map it consistently if poss. */ + hash = jhash_3words((__force u32)tuple->src.u3.ip, + (__force u32)tuple->src.u.all ^ zone, + tuple->dst.protonum, 0); + return ((u64)hash * net->ipv4.nat_htable_size) >> 32; +} + +/* Is this tuple already taken? (not by us) */ +int +nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack) +{ + /* Conntrack tracking doesn't keep track of outgoing tuples; only + incoming ones. NAT means they don't have a fixed mapping, + so we invert the tuple and look for the incoming reply. 
+ + We could keep a separate hash if this proves too slow. */ + struct nf_conntrack_tuple reply; + + nf_ct_invert_tuplepr(&reply, tuple); + return nf_conntrack_tuple_taken(&reply, ignored_conntrack); +} +EXPORT_SYMBOL(nf_nat_used_tuple); + +/* If we source map this tuple so reply looks like reply_tuple, will + * that meet the constraints of range. */ +static int +in_range(const struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range) +{ + const struct nf_nat_protocol *proto; + int ret = 0; + + /* If we are supposed to map IPs, then we must be in the + range specified, otherwise let this drag us onto a new src IP. */ + if (range->flags & IP_NAT_RANGE_MAP_IPS) { + if (ntohl(tuple->src.u3.ip) < ntohl(range->min_ip) || + ntohl(tuple->src.u3.ip) > ntohl(range->max_ip)) + return 0; + } + + rcu_read_lock(); + proto = __nf_nat_proto_find(tuple->dst.protonum); + if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) || + proto->in_range(tuple, IP_NAT_MANIP_SRC, + &range->min, &range->max)) + ret = 1; + rcu_read_unlock(); + + return ret; +} + +static inline int +same_src(const struct nf_conn *ct, + const struct nf_conntrack_tuple *tuple) +{ + const struct nf_conntrack_tuple *t; + + t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + return (t->dst.protonum == tuple->dst.protonum && + t->src.u3.ip == tuple->src.u3.ip && + t->src.u.all == tuple->src.u.all); +} + +/* Only called for SRC manip */ +static int +find_appropriate_src(struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple, + struct nf_conntrack_tuple *result, + const struct nf_nat_range *range) +{ + unsigned int h = hash_by_src(net, zone, tuple); + const struct nf_conn_nat *nat; + const struct nf_conn *ct; + const struct hlist_node *n; + + rcu_read_lock(); + hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) { + ct = nat->ct; + if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) { + /* Copy source part from reply tuple. */ + nf_ct_invert_tuplepr(result, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + result->dst = tuple->dst; + + if (in_range(result, range)) { + rcu_read_unlock(); + return 1; + } + } + } + rcu_read_unlock(); + return 0; +} + +/* For [FUTURE] fragmentation handling, we want the least-used + src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus + if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports + 1-65535, we don't do pro-rata allocation based on ports; we choose + the ip with the lowest src-ip/dst-ip/proto usage. +*/ +static void +find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + const struct nf_conn *ct, + enum nf_nat_manip_type maniptype) +{ + __be32 *var_ipp; + /* Host order */ + u_int32_t minip, maxip, j; + + /* No IP mapping? Do nothing. */ + if (!(range->flags & IP_NAT_RANGE_MAP_IPS)) + return; + + if (maniptype == IP_NAT_MANIP_SRC) + var_ipp = &tuple->src.u3.ip; + else + var_ipp = &tuple->dst.u3.ip; + + /* Fast path: only one choice. */ + if (range->min_ip == range->max_ip) { + *var_ipp = range->min_ip; + return; + } + + /* Hashing source and destination IPs gives a fairly even + * spread in practice (if there are a small number of IPs + * involved, there usually aren't that many connections + * anyway). The consistency means that servers see the same + * client coming from the same IP (some Internet Banking sites + * like this), even across reboots. */ + minip = ntohl(range->min_ip); + maxip = ntohl(range->max_ip); + j = jhash_2words((__force u32)tuple->src.u3.ip, + range->flags & IP_NAT_RANGE_PERSISTENT ? 
+ 0 : (__force u32)tuple->dst.u3.ip ^ zone, 0); + j = ((u64)j * (maxip - minip + 1)) >> 32; + *var_ipp = htonl(minip + j); +} + +/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING, + * we change the source to map into the range. For NF_INET_PRE_ROUTING + * and NF_INET_LOCAL_OUT, we change the destination to map into the + * range. It might not be possible to get a unique tuple, but we try. + * At worst (or if we race), we will end up with a final duplicate in + * __ip_conntrack_confirm and drop the packet. */ +static void +get_unique_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig_tuple, + const struct nf_nat_range *range, + struct nf_conn *ct, + enum nf_nat_manip_type maniptype) +{ + struct net *net = nf_ct_net(ct); + const struct nf_nat_protocol *proto; + u16 zone = nf_ct_zone(ct); + + /* 1) If this srcip/proto/src-proto-part is currently mapped, + and that same mapping gives a unique tuple within the given + range, use that. + + This is only required for source (ie. NAT/masq) mappings. + So far, we don't do local source mappings, so multiple + manips not an issue. */ + if (maniptype == IP_NAT_MANIP_SRC && + !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { + /* try the original tuple first */ + if (in_range(orig_tuple, range)) { + if (!nf_nat_used_tuple(orig_tuple, ct)) { + *tuple = *orig_tuple; + return; + } + } else if (find_appropriate_src(net, zone, orig_tuple, tuple, + range)) { + pr_debug("get_unique_tuple: Found current src map\n"); + if (!nf_nat_used_tuple(tuple, ct)) + return; + } + } + + /* 2) Select the least-used IP/proto combination in the given + range. */ + *tuple = *orig_tuple; + find_best_ips_proto(zone, tuple, range, ct, maniptype); + + /* 3) The per-protocol part of the manip is made to map into + the range to make a unique tuple. */ + + rcu_read_lock(); + proto = __nf_nat_proto_find(orig_tuple->dst.protonum); + + /* Only bother mapping if it's not already in range and unique */ + if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { + if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) { + if (proto->in_range(tuple, maniptype, &range->min, + &range->max) && + (range->min.all == range->max.all || + !nf_nat_used_tuple(tuple, ct))) + goto out; + } else if (!nf_nat_used_tuple(tuple, ct)) { + goto out; + } + } + + /* Last change: get protocol to try to obtain unique tuple. */ + proto->unique_tuple(tuple, range, maniptype, ct); +out: + rcu_read_unlock(); +} + +unsigned int +nf_nat_setup_info(struct nf_conn *ct, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype) +{ + struct net *net = nf_ct_net(ct); + struct nf_conntrack_tuple curr_tuple, new_tuple; + struct nf_conn_nat *nat; + + /* nat helper or nfctnetlink also setup binding */ + nat = nfct_nat(ct); + if (!nat) { + nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); + if (nat == NULL) { + pr_debug("failed to add NAT extension\n"); + return NF_ACCEPT; + } + } + + NF_CT_ASSERT(maniptype == IP_NAT_MANIP_SRC || + maniptype == IP_NAT_MANIP_DST); + BUG_ON(nf_nat_initialized(ct, maniptype)); + + /* What we've got will look like inverse of reply. 
Normally + this is what is in the conntrack, except for prior + manipulations (future optimization: if num_manips == 0, + orig_tp = + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */ + nf_ct_invert_tuplepr(&curr_tuple, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype); + + if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) { + struct nf_conntrack_tuple reply; + + /* Alter conntrack table so will recognize replies. */ + nf_ct_invert_tuplepr(&reply, &new_tuple); + nf_conntrack_alter_reply(ct, &reply); + + /* Non-atomic: we own this at the moment. */ + if (maniptype == IP_NAT_MANIP_SRC) + ct->status |= IPS_SRC_NAT; + else + ct->status |= IPS_DST_NAT; + } + + if (maniptype == IP_NAT_MANIP_SRC) { + unsigned int srchash; + + srchash = hash_by_src(net, nf_ct_zone(ct), + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + spin_lock_bh(&nf_nat_lock); + /* nf_conntrack_alter_reply might re-allocate exntension aera */ + nat = nfct_nat(ct); + nat->ct = ct; + hlist_add_head_rcu(&nat->bysource, + &net->ipv4.nat_bysource[srchash]); + spin_unlock_bh(&nf_nat_lock); + } + + /* It's done. */ + if (maniptype == IP_NAT_MANIP_DST) + ct->status |= IPS_DST_NAT_DONE; + else + ct->status |= IPS_SRC_NAT_DONE; + + return NF_ACCEPT; +} +EXPORT_SYMBOL(nf_nat_setup_info); + +/* Returns true if succeeded. */ +static bool +manip_pkt(u_int16_t proto, + struct sk_buff *skb, + unsigned int iphdroff, + const struct nf_conntrack_tuple *target, + enum nf_nat_manip_type maniptype) +{ + struct iphdr *iph; + const struct nf_nat_protocol *p; + + if (!skb_make_writable(skb, iphdroff + sizeof(*iph))) + return false; + + iph = (void *)skb->data + iphdroff; + + /* Manipulate protcol part. */ + + /* rcu_read_lock()ed by nf_hook_slow */ + p = __nf_nat_proto_find(proto); + if (!p->manip_pkt(skb, iphdroff, target, maniptype)) + return false; + + iph = (void *)skb->data + iphdroff; + + if (maniptype == IP_NAT_MANIP_SRC) { + csum_replace4(&iph->check, iph->saddr, target->src.u3.ip); + iph->saddr = target->src.u3.ip; + } else { + csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip); + iph->daddr = target->dst.u3.ip; + } + return true; +} + +/* Do packet manipulations according to nf_nat_setup_info. */ +unsigned int nf_nat_packet(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int hooknum, + struct sk_buff *skb) +{ + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + unsigned long statusbit; + enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum); + + if (mtype == IP_NAT_MANIP_SRC) + statusbit = IPS_SRC_NAT; + else + statusbit = IPS_DST_NAT; + + /* Invert if this is reply dir. */ + if (dir == IP_CT_DIR_REPLY) + statusbit ^= IPS_NAT_MASK; + + /* Non-atomic: these bits don't change. */ + if (ct->status & statusbit) { + struct nf_conntrack_tuple target; + + /* We are aiming to look like inverse of other direction. 
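+
+      Worked example (with invented addresses): for an SNAT binding
+      10.0.0.1 -> 203.0.113.9, an ORIGINAL-direction packet at a
+      source-manip hook matches IPS_SRC_NAT, takes the inverse of the
+      REPLY tuple as target and gets its source rewritten to 203.0.113.9;
+      the corresponding REPLY packet, at a destination-manip hook, has the
+      status bit XORed back to IPS_SRC_NAT so the same binding matches and
+      its destination is rewritten back to 10.0.0.1.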
*/ + nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); + + if (!manip_pkt(target.dst.protonum, skb, 0, &target, mtype)) + return NF_DROP; + } + return NF_ACCEPT; +} +EXPORT_SYMBOL_GPL(nf_nat_packet); + +/* Dir is direction ICMP is coming from (opposite to packet it contains) */ +int nf_nat_icmp_reply_translation(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int hooknum, + struct sk_buff *skb) +{ + struct { + struct icmphdr icmp; + struct iphdr ip; + } *inside; + const struct nf_conntrack_l4proto *l4proto; + struct nf_conntrack_tuple inner, target; + int hdrlen = ip_hdrlen(skb); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + unsigned long statusbit; + enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); + + if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) + return 0; + + inside = (void *)skb->data + hdrlen; + + /* We're actually going to mangle it beyond trivial checksum + adjustment, so make sure the current checksum is correct. */ + if (nf_ip_checksum(skb, hooknum, hdrlen, 0)) + return 0; + + /* Must be RELATED */ + NF_CT_ASSERT(skb->nfctinfo == IP_CT_RELATED || + skb->nfctinfo == IP_CT_RELATED_REPLY); + + /* Redirects on non-null nats must be dropped, else they'll + start talking to each other without our translation, and be + confused... --RR */ + if (inside->icmp.type == ICMP_REDIRECT) { + /* If NAT isn't finished, assume it and drop. */ + if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) + return 0; + + if (ct->status & IPS_NAT_MASK) + return 0; + } + + if (manip == IP_NAT_MANIP_SRC) + statusbit = IPS_SRC_NAT; + else + statusbit = IPS_DST_NAT; + + /* Invert if this is reply dir. */ + if (dir == IP_CT_DIR_REPLY) + statusbit ^= IPS_NAT_MASK; + + if (!(ct->status & statusbit)) + return 1; + + pr_debug("icmp_reply_translation: translating error %p manip %u " + "dir %s\n", skb, manip, + dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY"); + + /* rcu_read_lock()ed by nf_hook_slow */ + l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol); + + if (!nf_ct_get_tuple(skb, hdrlen + sizeof(struct icmphdr), + (hdrlen + + sizeof(struct icmphdr) + inside->ip.ihl * 4), + (u_int16_t)AF_INET, inside->ip.protocol, + &inner, l3proto, l4proto)) + return 0; + + /* Change inner back to look like incoming packet. We do the + opposite manip on this hook to normal, because it might not + pass all hooks (locally-generated ICMP). Consider incoming + packet: PREROUTING (DST manip), routing produces ICMP, goes + through POSTROUTING (which must correct the DST manip). */ + if (!manip_pkt(inside->ip.protocol, skb, hdrlen + sizeof(inside->icmp), + &ct->tuplehash[!dir].tuple, !manip)) + return 0; + + if (skb->ip_summed != CHECKSUM_PARTIAL) { + /* Reloading "inside" here since manip_pkt inner. */ + inside = (void *)skb->data + hdrlen; + inside->icmp.checksum = 0; + inside->icmp.checksum = + csum_fold(skb_checksum(skb, hdrlen, + skb->len - hdrlen, 0)); + } + + /* Change outer to look the reply to an incoming packet + * (proto 0 means don't invert per-proto part). */ + nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); + if (!manip_pkt(0, skb, 0, &target, manip)) + return 0; + + return 1; +} +EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); + +/* Protocol registration. 
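+
+   A minimal usage sketch (the "foo" names below are invented for
+   illustration and are not added by this patch): a per-protocol backend
+   registers itself from its module init and unregisters on exit, e.g.
+
+	static int __init nf_nat_proto_foo_init(void)
+	{
+		return nf_nat_protocol_register(&nf_nat_protocol_foo);
+	}
+
+	static void __exit nf_nat_proto_foo_fini(void)
+	{
+		nf_nat_protocol_unregister(&nf_nat_protocol_foo);
+	}
+
+   where nf_nat_protocol_foo supplies .protonum, .me and the .manip_pkt,
+   .in_range and .unique_tuple callbacks used elsewhere in this file.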
*/ +int nf_nat_protocol_register(const struct nf_nat_protocol *proto) +{ + int ret = 0; + + spin_lock_bh(&nf_nat_lock); + if (rcu_dereference_protected( + nf_nat_protos[proto->protonum], + lockdep_is_held(&nf_nat_lock) + ) != &nf_nat_unknown_protocol) { + ret = -EBUSY; + goto out; + } + rcu_assign_pointer(nf_nat_protos[proto->protonum], proto); + out: + spin_unlock_bh(&nf_nat_lock); + return ret; +} +EXPORT_SYMBOL(nf_nat_protocol_register); + +/* No one stores the protocol anywhere; simply delete it. */ +void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto) +{ + spin_lock_bh(&nf_nat_lock); + rcu_assign_pointer(nf_nat_protos[proto->protonum], + &nf_nat_unknown_protocol); + spin_unlock_bh(&nf_nat_lock); + synchronize_rcu(); +} +EXPORT_SYMBOL(nf_nat_protocol_unregister); + +/* No one using conntrack by the time this called. */ +static void nf_nat_cleanup_conntrack(struct nf_conn *ct) +{ + struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT); + + if (nat == NULL || nat->ct == NULL) + return; + + NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE); + + spin_lock_bh(&nf_nat_lock); + hlist_del_rcu(&nat->bysource); + spin_unlock_bh(&nf_nat_lock); +} + +static void nf_nat_move_storage(void *new, void *old) +{ + struct nf_conn_nat *new_nat = new; + struct nf_conn_nat *old_nat = old; + struct nf_conn *ct = old_nat->ct; + + if (!ct || !(ct->status & IPS_SRC_NAT_DONE)) + return; + + spin_lock_bh(&nf_nat_lock); + hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); + spin_unlock_bh(&nf_nat_lock); +} + +static struct nf_ct_ext_type nat_extend __read_mostly = { + .len = sizeof(struct nf_conn_nat), + .align = __alignof__(struct nf_conn_nat), + .destroy = nf_nat_cleanup_conntrack, + .move = nf_nat_move_storage, + .id = NF_CT_EXT_NAT, + .flags = NF_CT_EXT_F_PREALLOC, +}; + +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include +#include + +static const struct nf_nat_protocol * +nf_nat_proto_find_get(u_int8_t protonum) +{ + const struct nf_nat_protocol *p; + + rcu_read_lock(); + p = __nf_nat_proto_find(protonum); + if (!try_module_get(p->me)) + p = &nf_nat_unknown_protocol; + rcu_read_unlock(); + + return p; +} + +static void +nf_nat_proto_put(const struct nf_nat_protocol *p) +{ + module_put(p->me); +} + +static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { + [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, + [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, +}; + +static int nfnetlink_parse_nat_proto(struct nlattr *attr, + const struct nf_conn *ct, + struct nf_nat_range *range) +{ + struct nlattr *tb[CTA_PROTONAT_MAX+1]; + const struct nf_nat_protocol *npt; + int err; + + err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy); + if (err < 0) + return err; + + npt = nf_nat_proto_find_get(nf_ct_protonum(ct)); + if (npt->nlattr_to_range) + err = npt->nlattr_to_range(tb, range); + nf_nat_proto_put(npt); + return err; +} + +static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { + [CTA_NAT_MINIP] = { .type = NLA_U32 }, + [CTA_NAT_MAXIP] = { .type = NLA_U32 }, +}; + +static int +nfnetlink_parse_nat(const struct nlattr *nat, + const struct nf_conn *ct, struct nf_nat_range *range) +{ + struct nlattr *tb[CTA_NAT_MAX+1]; + int err; + + memset(range, 0, sizeof(*range)); + + err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy); + if (err < 0) + return err; + + if (tb[CTA_NAT_MINIP]) + range->min_ip = nla_get_be32(tb[CTA_NAT_MINIP]); + + if (!tb[CTA_NAT_MAXIP]) + range->max_ip = range->min_ip; + else + 
range->max_ip = nla_get_be32(tb[CTA_NAT_MAXIP]); + + if (range->min_ip) + range->flags |= IP_NAT_RANGE_MAP_IPS; + + if (!tb[CTA_NAT_PROTO]) + return 0; + + err = nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); + if (err < 0) + return err; + + return 0; +} + +static int +nfnetlink_parse_nat_setup(struct nf_conn *ct, + enum nf_nat_manip_type manip, + const struct nlattr *attr) +{ + struct nf_nat_range range; + + if (nfnetlink_parse_nat(attr, ct, &range) < 0) + return -EINVAL; + if (nf_nat_initialized(ct, manip)) + return -EEXIST; + + return nf_nat_setup_info(ct, &range, manip); +} +#else +static int +nfnetlink_parse_nat_setup(struct nf_conn *ct, + enum nf_nat_manip_type manip, + const struct nlattr *attr) +{ + return -EOPNOTSUPP; +} +#endif + +static int __net_init nf_nat_net_init(struct net *net) +{ + /* Leave them the same for the moment. */ + net->ipv4.nat_htable_size = net->ct.htable_size; + net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0); + if (!net->ipv4.nat_bysource) + return -ENOMEM; + return 0; +} + +/* Clear NAT section of all conntracks, in case we're loaded again. */ +static int clean_nat(struct nf_conn *i, void *data) +{ + struct nf_conn_nat *nat = nfct_nat(i); + + if (!nat) + return 0; + memset(nat, 0, sizeof(*nat)); + i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST); + return 0; +} + +static void __net_exit nf_nat_net_exit(struct net *net) +{ + nf_ct_iterate_cleanup(net, &clean_nat, NULL); + synchronize_rcu(); + nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size); +} + +static struct pernet_operations nf_nat_net_ops = { + .init = nf_nat_net_init, + .exit = nf_nat_net_exit, +}; + +static int __init nf_nat_init(void) +{ + size_t i; + int ret; + + need_ipv4_conntrack(); + + ret = nf_ct_extend_register(&nat_extend); + if (ret < 0) { + printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); + return ret; + } + + ret = register_pernet_subsys(&nf_nat_net_ops); + if (ret < 0) + goto cleanup_extend; + + /* Sew in builtin protocols. 
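+
+   Note: every slot is first pointed at nf_nat_unknown_protocol and only
+   the TCP, UDP and ICMP entries are then overridden, so a protocol
+   without a dedicated backend (GRE before nf_nat_proto_gre is loaded, for
+   instance -- an illustrative case, not something this patch changes)
+   still falls back to the generic handler rather than a NULL pointer.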
*/ + spin_lock_bh(&nf_nat_lock); + for (i = 0; i < MAX_IP_NAT_PROTO; i++) + rcu_assign_pointer(nf_nat_protos[i], &nf_nat_unknown_protocol); + rcu_assign_pointer(nf_nat_protos[IPPROTO_TCP], &nf_nat_protocol_tcp); + rcu_assign_pointer(nf_nat_protos[IPPROTO_UDP], &nf_nat_protocol_udp); + rcu_assign_pointer(nf_nat_protos[IPPROTO_ICMP], &nf_nat_protocol_icmp); + spin_unlock_bh(&nf_nat_lock); + + /* Initialize fake conntrack so that NAT will skip it */ + nf_ct_untracked_status_or(IPS_NAT_DONE_MASK); + + l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET); + + BUG_ON(nf_nat_seq_adjust_hook != NULL); + rcu_assign_pointer(nf_nat_seq_adjust_hook, nf_nat_seq_adjust); + BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); + rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, + nfnetlink_parse_nat_setup); + BUG_ON(nf_ct_nat_offset != NULL); + rcu_assign_pointer(nf_ct_nat_offset, nf_nat_get_offset); + return 0; + + cleanup_extend: + nf_ct_extend_unregister(&nat_extend); + return ret; +} + +static void __exit nf_nat_cleanup(void) +{ + unregister_pernet_subsys(&nf_nat_net_ops); + nf_ct_l3proto_put(l3proto); + nf_ct_extend_unregister(&nat_extend); + rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL); + rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, NULL); + rcu_assign_pointer(nf_ct_nat_offset, NULL); + synchronize_net(); +} + +MODULE_LICENSE("GPL"); +MODULE_ALIAS("nf-nat-ipv4"); + +module_init(nf_nat_init); +module_exit(nf_nat_cleanup); diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c new file mode 100644 index 00000000..dc73abb3 --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_ftp.c @@ -0,0 +1,137 @@ +/* FTP extension for TCP NAT alteration. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rusty Russell "); +MODULE_DESCRIPTION("ftp NAT helper"); +MODULE_ALIAS("ip_nat_ftp"); + +/* FIXME: Time out? --RR */ + +static int nf_nat_ftp_fmt_cmd(enum nf_ct_ftp_type type, + char *buffer, size_t buflen, + __be32 addr, u16 port) +{ + switch (type) { + case NF_CT_FTP_PORT: + case NF_CT_FTP_PASV: + return snprintf(buffer, buflen, "%u,%u,%u,%u,%u,%u", + ((unsigned char *)&addr)[0], + ((unsigned char *)&addr)[1], + ((unsigned char *)&addr)[2], + ((unsigned char *)&addr)[3], + port >> 8, + port & 0xFF); + case NF_CT_FTP_EPRT: + return snprintf(buffer, buflen, "|1|%pI4|%u|", &addr, port); + case NF_CT_FTP_EPSV: + return snprintf(buffer, buflen, "|||%u|", port); + } + + return 0; +} + +/* So, this packet has hit the connection tracking matching code. + Mangle it, and change the expectation to match the new version. 
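+
+   Worked example (made-up addresses): if the client sent
+   "PORT 10,0,0,1,4,2" (10.0.0.1, port 4*256+2 = 1026) and the connection
+   is NATed to 203.0.113.5 with port 1026 still free, the rewritten
+   argument becomes "203,0,113,5,4,2"; with EPRT the same data is emitted
+   as "|1|203.0.113.5|1026|" and with EPSV as "|||1026|", matching
+   nf_nat_ftp_fmt_cmd() above.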
*/ +static unsigned int nf_nat_ftp(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + enum nf_ct_ftp_type type, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp) +{ + __be32 newip; + u_int16_t port; + int dir = CTINFO2DIR(ctinfo); + struct nf_conn *ct = exp->master; + char buffer[sizeof("|1|255.255.255.255|65535|")]; + unsigned int buflen; + + pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen); + + /* Connection will come from wherever this packet goes, hence !dir */ + newip = ct->tuplehash[!dir].tuple.dst.u3.ip; + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->dir = !dir; + + /* When you see the packet, we need to NAT it the same as the + * this one. */ + exp->expectfn = nf_nat_follow_master; + + /* Try to get same port: if not, try to change it. */ + for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { + int ret; + + exp->tuple.dst.u.tcp.port = htons(port); + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + port = 0; + break; + } + } + + if (port == 0) + return NF_DROP; + + buflen = nf_nat_ftp_fmt_cmd(type, buffer, sizeof(buffer), newip, port); + if (!buflen) + goto out; + + pr_debug("calling nf_nat_mangle_tcp_packet\n"); + + if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff, + matchlen, buffer, buflen)) + goto out; + + return NF_ACCEPT; + +out: + nf_ct_unexpect_related(exp); + return NF_DROP; +} + +static void __exit nf_nat_ftp_fini(void) +{ + rcu_assign_pointer(nf_nat_ftp_hook, NULL); + synchronize_rcu(); +} + +static int __init nf_nat_ftp_init(void) +{ + BUG_ON(nf_nat_ftp_hook != NULL); + rcu_assign_pointer(nf_nat_ftp_hook, nf_nat_ftp); + return 0; +} + +/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ +static int warn_set(const char *val, struct kernel_param *kp) +{ + printk(KERN_INFO KBUILD_MODNAME + ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); + return 0; +} +module_param_call(ports, warn_set, NULL, NULL, 0); + +module_init(nf_nat_ftp_init); +module_exit(nf_nat_ftp_fini); diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c new file mode 100644 index 00000000..790f3160 --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -0,0 +1,618 @@ +/* + * H.323 extension for NAT alteration. + * + * Copyright (c) 2006 Jing Min Zhao + * + * This source code is licensed under General Public License version 2. 
+ * + * Based on the 'brute force' H.323 NAT module by + * Jozsef Kadlecsik + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/****************************************************************************/ +static int set_addr(struct sk_buff *skb, + unsigned char **data, int dataoff, + unsigned int addroff, __be32 ip, __be16 port) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct { + __be32 ip; + __be16 port; + } __attribute__ ((__packed__)) buf; + const struct tcphdr *th; + struct tcphdr _tcph; + + buf.ip = ip; + buf.port = port; + addroff += dataoff; + + if (ip_hdr(skb)->protocol == IPPROTO_TCP) { + if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, + addroff, sizeof(buf), + (char *) &buf, sizeof(buf))) { + if (net_ratelimit()) + pr_notice("nf_nat_h323: nf_nat_mangle_tcp_packet" + " error\n"); + return -1; + } + + /* Relocate data pointer */ + th = skb_header_pointer(skb, ip_hdrlen(skb), + sizeof(_tcph), &_tcph); + if (th == NULL) + return -1; + *data = skb->data + ip_hdrlen(skb) + th->doff * 4 + dataoff; + } else { + if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, + addroff, sizeof(buf), + (char *) &buf, sizeof(buf))) { + if (net_ratelimit()) + pr_notice("nf_nat_h323: nf_nat_mangle_udp_packet" + " error\n"); + return -1; + } + /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy + * or pull everything in a linear buffer, so we can safely + * use the skb pointers now */ + *data = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr); + } + + return 0; +} + +/****************************************************************************/ +static int set_h225_addr(struct sk_buff *skb, + unsigned char **data, int dataoff, + TransportAddress *taddr, + union nf_inet_addr *addr, __be16 port) +{ + return set_addr(skb, data, dataoff, taddr->ipAddress.ip, + addr->ip, port); +} + +/****************************************************************************/ +static int set_h245_addr(struct sk_buff *skb, + unsigned char **data, int dataoff, + H245_TransportAddress *taddr, + union nf_inet_addr *addr, __be16 port) +{ + return set_addr(skb, data, dataoff, + taddr->unicastAddress.iPAddress.network, + addr->ip, port); +} + +/****************************************************************************/ +static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, + TransportAddress *taddr, int count) +{ + const struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + int dir = CTINFO2DIR(ctinfo); + int i; + __be16 port; + union nf_inet_addr addr; + + for (i = 0; i < count; i++) { + if (get_h225_addr(ct, *data, &taddr[i], &addr, &port)) { + if (addr.ip == ct->tuplehash[dir].tuple.src.u3.ip && + port == info->sig_port[dir]) { + /* GW->GK */ + + /* Fix for Gnomemeeting */ + if (i > 0 && + get_h225_addr(ct, *data, &taddr[0], + &addr, &port) && + (ntohl(addr.ip) & 0xff000000) == 0x7f000000) + i = 0; + + pr_debug("nf_nat_ras: set signal address %pI4:%hu->%pI4:%hu\n", + &addr.ip, port, + &ct->tuplehash[!dir].tuple.dst.u3.ip, + info->sig_port[!dir]); + return set_h225_addr(skb, data, 0, &taddr[i], + &ct->tuplehash[!dir]. 
+ tuple.dst.u3, + info->sig_port[!dir]); + } else if (addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip && + port == info->sig_port[dir]) { + /* GK->GW */ + pr_debug("nf_nat_ras: set signal address %pI4:%hu->%pI4:%hu\n", + &addr.ip, port, + &ct->tuplehash[!dir].tuple.src.u3.ip, + info->sig_port[!dir]); + return set_h225_addr(skb, data, 0, &taddr[i], + &ct->tuplehash[!dir]. + tuple.src.u3, + info->sig_port[!dir]); + } + } + } + + return 0; +} + +/****************************************************************************/ +static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, + TransportAddress *taddr, int count) +{ + int dir = CTINFO2DIR(ctinfo); + int i; + __be16 port; + union nf_inet_addr addr; + + for (i = 0; i < count; i++) { + if (get_h225_addr(ct, *data, &taddr[i], &addr, &port) && + addr.ip == ct->tuplehash[dir].tuple.src.u3.ip && + port == ct->tuplehash[dir].tuple.src.u.udp.port) { + pr_debug("nf_nat_ras: set rasAddress %pI4:%hu->%pI4:%hu\n", + &addr.ip, ntohs(port), + &ct->tuplehash[!dir].tuple.dst.u3.ip, + ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port)); + return set_h225_addr(skb, data, 0, &taddr[i], + &ct->tuplehash[!dir].tuple.dst.u3, + ct->tuplehash[!dir].tuple. + dst.u.udp.port); + } + } + + return 0; +} + +/****************************************************************************/ +static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + H245_TransportAddress *taddr, + __be16 port, __be16 rtp_port, + struct nf_conntrack_expect *rtp_exp, + struct nf_conntrack_expect *rtcp_exp) +{ + struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + int dir = CTINFO2DIR(ctinfo); + int i; + u_int16_t nated_port; + + /* Set expectations for NAT */ + rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port; + rtp_exp->expectfn = nf_nat_follow_master; + rtp_exp->dir = !dir; + rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port; + rtcp_exp->expectfn = nf_nat_follow_master; + rtcp_exp->dir = !dir; + + /* Lookup existing expects */ + for (i = 0; i < H323_RTP_CHANNEL_MAX; i++) { + if (info->rtp_port[i][dir] == rtp_port) { + /* Expected */ + + /* Use allocated ports first. This will refresh + * the expects */ + rtp_exp->tuple.dst.u.udp.port = info->rtp_port[i][dir]; + rtcp_exp->tuple.dst.u.udp.port = + htons(ntohs(info->rtp_port[i][dir]) + 1); + break; + } else if (info->rtp_port[i][dir] == 0) { + /* Not expected */ + break; + } + } + + /* Run out of expectations */ + if (i >= H323_RTP_CHANNEL_MAX) { + if (net_ratelimit()) + pr_notice("nf_nat_h323: out of expectations\n"); + return 0; + } + + /* Try to get a pair of ports. */ + for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port); + nated_port != 0; nated_port += 2) { + int ret; + + rtp_exp->tuple.dst.u.udp.port = htons(nated_port); + ret = nf_ct_expect_related(rtp_exp); + if (ret == 0) { + rtcp_exp->tuple.dst.u.udp.port = + htons(nated_port + 1); + ret = nf_ct_expect_related(rtcp_exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + nf_ct_unexpect_related(rtp_exp); + nated_port = 0; + break; + } + } else if (ret != -EBUSY) { + nated_port = 0; + break; + } + } + + if (nated_port == 0) { /* No port available */ + if (net_ratelimit()) + pr_notice("nf_nat_h323: out of RTP ports\n"); + return 0; + } + + /* Modify signal */ + if (set_h245_addr(skb, data, dataoff, taddr, + &ct->tuplehash[!dir].tuple.dst.u3, + htons((port & htons(1)) ? 
nated_port + 1 : + nated_port)) == 0) { + /* Save ports */ + info->rtp_port[i][dir] = rtp_port; + info->rtp_port[i][!dir] = htons(nated_port); + } else { + nf_ct_unexpect_related(rtp_exp); + nf_ct_unexpect_related(rtcp_exp); + return -1; + } + + /* Success */ + pr_debug("nf_nat_h323: expect RTP %pI4:%hu->%pI4:%hu\n", + &rtp_exp->tuple.src.u3.ip, + ntohs(rtp_exp->tuple.src.u.udp.port), + &rtp_exp->tuple.dst.u3.ip, + ntohs(rtp_exp->tuple.dst.u.udp.port)); + pr_debug("nf_nat_h323: expect RTCP %pI4:%hu->%pI4:%hu\n", + &rtcp_exp->tuple.src.u3.ip, + ntohs(rtcp_exp->tuple.src.u.udp.port), + &rtcp_exp->tuple.dst.u3.ip, + ntohs(rtcp_exp->tuple.dst.u.udp.port)); + + return 0; +} + +/****************************************************************************/ +static int nat_t120(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + H245_TransportAddress *taddr, __be16 port, + struct nf_conntrack_expect *exp) +{ + int dir = CTINFO2DIR(ctinfo); + u_int16_t nated_port = ntohs(port); + + /* Set expectations for NAT */ + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->expectfn = nf_nat_follow_master; + exp->dir = !dir; + + /* Try to get same port: if not, try to change it. */ + for (; nated_port != 0; nated_port++) { + int ret; + + exp->tuple.dst.u.tcp.port = htons(nated_port); + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + nated_port = 0; + break; + } + } + + if (nated_port == 0) { /* No port available */ + if (net_ratelimit()) + pr_notice("nf_nat_h323: out of TCP ports\n"); + return 0; + } + + /* Modify signal */ + if (set_h245_addr(skb, data, dataoff, taddr, + &ct->tuplehash[!dir].tuple.dst.u3, + htons(nated_port)) < 0) { + nf_ct_unexpect_related(exp); + return -1; + } + + pr_debug("nf_nat_h323: expect T.120 %pI4:%hu->%pI4:%hu\n", + &exp->tuple.src.u3.ip, + ntohs(exp->tuple.src.u.tcp.port), + &exp->tuple.dst.u3.ip, + ntohs(exp->tuple.dst.u.tcp.port)); + + return 0; +} + +/****************************************************************************/ +static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress *taddr, __be16 port, + struct nf_conntrack_expect *exp) +{ + struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + int dir = CTINFO2DIR(ctinfo); + u_int16_t nated_port = ntohs(port); + + /* Set expectations for NAT */ + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->expectfn = nf_nat_follow_master; + exp->dir = !dir; + + /* Check existing expects */ + if (info->sig_port[dir] == port) + nated_port = ntohs(info->sig_port[!dir]); + + /* Try to get same port: if not, try to change it. 
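+
+      Note on the loop below: nf_ct_expect_related() returning -EBUSY
+      means that tuple is already expected, so the next port is tried;
+      any other error gives up immediately.  Since nated_port is a 16-bit
+      value, the search also ends once it wraps around to 0, i.e. after
+      the rest of the port space has been exhausted.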
*/ + for (; nated_port != 0; nated_port++) { + int ret; + + exp->tuple.dst.u.tcp.port = htons(nated_port); + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + nated_port = 0; + break; + } + } + + if (nated_port == 0) { /* No port available */ + if (net_ratelimit()) + pr_notice("nf_nat_q931: out of TCP ports\n"); + return 0; + } + + /* Modify signal */ + if (set_h225_addr(skb, data, dataoff, taddr, + &ct->tuplehash[!dir].tuple.dst.u3, + htons(nated_port)) == 0) { + /* Save ports */ + info->sig_port[dir] = port; + info->sig_port[!dir] = htons(nated_port); + } else { + nf_ct_unexpect_related(exp); + return -1; + } + + pr_debug("nf_nat_q931: expect H.245 %pI4:%hu->%pI4:%hu\n", + &exp->tuple.src.u3.ip, + ntohs(exp->tuple.src.u.tcp.port), + &exp->tuple.dst.u3.ip, + ntohs(exp->tuple.dst.u.tcp.port)); + + return 0; +} + +/**************************************************************************** + * This conntrack expect function replaces nf_conntrack_q931_expect() + * which was set by nf_conntrack_h323.c. + ****************************************************************************/ +static void ip_nat_q931_expect(struct nf_conn *new, + struct nf_conntrack_expect *this) +{ + struct nf_nat_range range; + + if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */ + nf_nat_follow_master(new, this); + return; + } + + /* This must be a fresh one. */ + BUG_ON(new->status & IPS_NAT_DONE_MASK); + + /* Change src to where master sends to */ + range.flags = IP_NAT_RANGE_MAP_IPS; + range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip; + nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC); + + /* For DST manip, map port here to where it's expected. */ + range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); + range.min = range.max = this->saved_proto; + range.min_ip = range.max_ip = + new->master->tuplehash[!this->dir].tuple.src.u3.ip; + nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST); +} + +/****************************************************************************/ +static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, TransportAddress *taddr, int idx, + __be16 port, struct nf_conntrack_expect *exp) +{ + struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + int dir = CTINFO2DIR(ctinfo); + u_int16_t nated_port = ntohs(port); + union nf_inet_addr addr; + + /* Set expectations for NAT */ + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->expectfn = ip_nat_q931_expect; + exp->dir = !dir; + + /* Check existing expects */ + if (info->sig_port[dir] == port) + nated_port = ntohs(info->sig_port[!dir]); + + /* Try to get same port: if not, try to change it. 
*/ + for (; nated_port != 0; nated_port++) { + int ret; + + exp->tuple.dst.u.tcp.port = htons(nated_port); + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + nated_port = 0; + break; + } + } + + if (nated_port == 0) { /* No port available */ + if (net_ratelimit()) + pr_notice("nf_nat_ras: out of TCP ports\n"); + return 0; + } + + /* Modify signal */ + if (set_h225_addr(skb, data, 0, &taddr[idx], + &ct->tuplehash[!dir].tuple.dst.u3, + htons(nated_port)) == 0) { + /* Save ports */ + info->sig_port[dir] = port; + info->sig_port[!dir] = htons(nated_port); + + /* Fix for Gnomemeeting */ + if (idx > 0 && + get_h225_addr(ct, *data, &taddr[0], &addr, &port) && + (ntohl(addr.ip) & 0xff000000) == 0x7f000000) { + set_h225_addr(skb, data, 0, &taddr[0], + &ct->tuplehash[!dir].tuple.dst.u3, + info->sig_port[!dir]); + } + } else { + nf_ct_unexpect_related(exp); + return -1; + } + + /* Success */ + pr_debug("nf_nat_ras: expect Q.931 %pI4:%hu->%pI4:%hu\n", + &exp->tuple.src.u3.ip, + ntohs(exp->tuple.src.u.tcp.port), + &exp->tuple.dst.u3.ip, + ntohs(exp->tuple.dst.u.tcp.port)); + + return 0; +} + +/****************************************************************************/ +static void ip_nat_callforwarding_expect(struct nf_conn *new, + struct nf_conntrack_expect *this) +{ + struct nf_nat_range range; + + /* This must be a fresh one. */ + BUG_ON(new->status & IPS_NAT_DONE_MASK); + + /* Change src to where master sends to */ + range.flags = IP_NAT_RANGE_MAP_IPS; + range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip; + nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC); + + /* For DST manip, map port here to where it's expected. */ + range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); + range.min = range.max = this->saved_proto; + range.min_ip = range.max_ip = this->saved_ip; + nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST); +} + +/****************************************************************************/ +static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress *taddr, __be16 port, + struct nf_conntrack_expect *exp) +{ + int dir = CTINFO2DIR(ctinfo); + u_int16_t nated_port; + + /* Set expectations for NAT */ + exp->saved_ip = exp->tuple.dst.u3.ip; + exp->tuple.dst.u3.ip = ct->tuplehash[!dir].tuple.dst.u3.ip; + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->expectfn = ip_nat_callforwarding_expect; + exp->dir = !dir; + + /* Try to get same port: if not, try to change it. 
*/ + for (nated_port = ntohs(port); nated_port != 0; nated_port++) { + int ret; + + exp->tuple.dst.u.tcp.port = htons(nated_port); + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + nated_port = 0; + break; + } + } + + if (nated_port == 0) { /* No port available */ + if (net_ratelimit()) + pr_notice("nf_nat_q931: out of TCP ports\n"); + return 0; + } + + /* Modify signal */ + if (!set_h225_addr(skb, data, dataoff, taddr, + &ct->tuplehash[!dir].tuple.dst.u3, + htons(nated_port)) == 0) { + nf_ct_unexpect_related(exp); + return -1; + } + + /* Success */ + pr_debug("nf_nat_q931: expect Call Forwarding %pI4:%hu->%pI4:%hu\n", + &exp->tuple.src.u3.ip, + ntohs(exp->tuple.src.u.tcp.port), + &exp->tuple.dst.u3.ip, + ntohs(exp->tuple.dst.u.tcp.port)); + + return 0; +} + +/****************************************************************************/ +static int __init init(void) +{ + BUG_ON(set_h245_addr_hook != NULL); + BUG_ON(set_h225_addr_hook != NULL); + BUG_ON(set_sig_addr_hook != NULL); + BUG_ON(set_ras_addr_hook != NULL); + BUG_ON(nat_rtp_rtcp_hook != NULL); + BUG_ON(nat_t120_hook != NULL); + BUG_ON(nat_h245_hook != NULL); + BUG_ON(nat_callforwarding_hook != NULL); + BUG_ON(nat_q931_hook != NULL); + + rcu_assign_pointer(set_h245_addr_hook, set_h245_addr); + rcu_assign_pointer(set_h225_addr_hook, set_h225_addr); + rcu_assign_pointer(set_sig_addr_hook, set_sig_addr); + rcu_assign_pointer(set_ras_addr_hook, set_ras_addr); + rcu_assign_pointer(nat_rtp_rtcp_hook, nat_rtp_rtcp); + rcu_assign_pointer(nat_t120_hook, nat_t120); + rcu_assign_pointer(nat_h245_hook, nat_h245); + rcu_assign_pointer(nat_callforwarding_hook, nat_callforwarding); + rcu_assign_pointer(nat_q931_hook, nat_q931); + return 0; +} + +/****************************************************************************/ +static void __exit fini(void) +{ + rcu_assign_pointer(set_h245_addr_hook, NULL); + rcu_assign_pointer(set_h225_addr_hook, NULL); + rcu_assign_pointer(set_sig_addr_hook, NULL); + rcu_assign_pointer(set_ras_addr_hook, NULL); + rcu_assign_pointer(nat_rtp_rtcp_hook, NULL); + rcu_assign_pointer(nat_t120_hook, NULL); + rcu_assign_pointer(nat_h245_hook, NULL); + rcu_assign_pointer(nat_callforwarding_hook, NULL); + rcu_assign_pointer(nat_q931_hook, NULL); + synchronize_rcu(); +} + +/****************************************************************************/ +module_init(init); +module_exit(fini); + +MODULE_AUTHOR("Jing Min Zhao "); +MODULE_DESCRIPTION("H.323 NAT helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ip_nat_h323"); diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c new file mode 100644 index 00000000..ebc5f889 --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_helper.c @@ -0,0 +1,451 @@ +/* ip_nat_helper.c - generic support functions for NAT helpers + * + * (C) 2000-2002 Harald Welte + * (C) 2003-2006 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DUMP_OFFSET(x) \ + pr_debug("offset_before=%d, offset_after=%d, correction_pos=%u\n", \ + x->offset_before, x->offset_after, x->correction_pos); + +static DEFINE_SPINLOCK(nf_nat_seqofs_lock); + +/* Setup TCP sequence correction given this change at this sequence */ +static inline void +adjust_tcp_sequence(u32 seq, + int sizediff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct nf_conn_nat *nat = nfct_nat(ct); + struct nf_nat_seq *this_way = &nat->seq[dir]; + + pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n", + seq, sizediff); + + pr_debug("adjust_tcp_sequence: Seq_offset before: "); + DUMP_OFFSET(this_way); + + spin_lock_bh(&nf_nat_seqofs_lock); + + /* SYN adjust. If it's uninitialized, or this is after last + * correction, record it: we don't handle more than one + * adjustment in the window, but do deal with common case of a + * retransmit */ + if (this_way->offset_before == this_way->offset_after || + before(this_way->correction_pos, seq)) { + this_way->correction_pos = seq; + this_way->offset_before = this_way->offset_after; + this_way->offset_after += sizediff; + } + spin_unlock_bh(&nf_nat_seqofs_lock); + + pr_debug("adjust_tcp_sequence: Seq_offset after: "); + DUMP_OFFSET(this_way); +} + +/* Get the offset value, for conntrack */ +s16 nf_nat_get_offset(const struct nf_conn *ct, + enum ip_conntrack_dir dir, + u32 seq) +{ + struct nf_conn_nat *nat = nfct_nat(ct); + struct nf_nat_seq *this_way; + s16 offset; + + if (!nat) + return 0; + + this_way = &nat->seq[dir]; + spin_lock_bh(&nf_nat_seqofs_lock); + offset = after(seq, this_way->correction_pos) + ? this_way->offset_after : this_way->offset_before; + spin_unlock_bh(&nf_nat_seqofs_lock); + + return offset; +} +EXPORT_SYMBOL_GPL(nf_nat_get_offset); + +/* Frobs data inside this packet, which is linear. */ +static void mangle_contents(struct sk_buff *skb, + unsigned int dataoff, + unsigned int match_offset, + unsigned int match_len, + const char *rep_buffer, + unsigned int rep_len) +{ + unsigned char *data; + + BUG_ON(skb_is_nonlinear(skb)); + data = skb_network_header(skb) + dataoff; + + /* move post-replacement */ + memmove(data + match_offset + rep_len, + data + match_offset + match_len, + skb->tail - (skb->network_header + dataoff + + match_offset + match_len)); + + /* insert data from buffer */ + memcpy(data + match_offset, rep_buffer, rep_len); + + /* update skb info */ + if (rep_len > match_len) { + pr_debug("nf_nat_mangle_packet: Extending packet by " + "%u from %u bytes\n", rep_len - match_len, skb->len); + skb_put(skb, rep_len - match_len); + } else { + pr_debug("nf_nat_mangle_packet: Shrinking packet from " + "%u from %u bytes\n", match_len - rep_len, skb->len); + __skb_trim(skb, skb->len + rep_len - match_len); + } + + /* fix IP hdr checksum information */ + ip_hdr(skb)->tot_len = htons(skb->len); + ip_send_check(ip_hdr(skb)); +} + +/* Unusual, but possible case. 
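+
+   This path is taken when a replacement string is longer than the text it
+   replaces and the skb has no tailroom left -- for example (invented
+   values) an FTP PORT argument growing from "10,0,0,1,4,1" to
+   "203,0,113,177,250,250".  Growth beyond 65535 bytes, the maximum IPv4
+   packet size, is refused; otherwise the head is reallocated with
+   pskb_expand_head().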
*/ +static int enlarge_skb(struct sk_buff *skb, unsigned int extra) +{ + if (skb->len + extra > 65535) + return 0; + + if (pskb_expand_head(skb, 0, extra - skb_tailroom(skb), GFP_ATOMIC)) + return 0; + + return 1; +} + +void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + __be32 seq, s16 off) +{ + if (!off) + return; + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); + adjust_tcp_sequence(ntohl(seq), off, ct, ctinfo); + nf_conntrack_event_cache(IPCT_NATSEQADJ, ct); +} +EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); + +static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data, + int datalen, __sum16 *check, int oldlen) +{ + struct rtable *rt = skb_rtable(skb); + + if (skb->ip_summed != CHECKSUM_PARTIAL) { + if (!(rt->rt_flags & RTCF_LOCAL) && + (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_headroom(skb) + + skb_network_offset(skb) + + iph->ihl * 4; + skb->csum_offset = (void *)check - data; + *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, + datalen, iph->protocol, 0); + } else { + *check = 0; + *check = csum_tcpudp_magic(iph->saddr, iph->daddr, + datalen, iph->protocol, + csum_partial(data, datalen, + 0)); + if (iph->protocol == IPPROTO_UDP && !*check) + *check = CSUM_MANGLED_0; + } + } else + inet_proto_csum_replace2(check, skb, + htons(oldlen), htons(datalen), 1); +} + +/* Generic function for mangling variable-length address changes inside + * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX + * command in FTP). + * + * Takes care about all the nasty sequence number changes, checksumming, + * skb enlargement, ... + * + * */ +int __nf_nat_mangle_tcp_packet(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int match_offset, + unsigned int match_len, + const char *rep_buffer, + unsigned int rep_len, bool adjust) +{ + struct iphdr *iph; + struct tcphdr *tcph; + int oldlen, datalen; + + if (!skb_make_writable(skb, skb->len)) + return 0; + + if (rep_len > match_len && + rep_len - match_len > skb_tailroom(skb) && + !enlarge_skb(skb, rep_len - match_len)) + return 0; + + SKB_LINEAR_ASSERT(skb); + + iph = ip_hdr(skb); + tcph = (void *)iph + iph->ihl*4; + + oldlen = skb->len - iph->ihl*4; + mangle_contents(skb, iph->ihl*4 + tcph->doff*4, + match_offset, match_len, rep_buffer, rep_len); + + datalen = skb->len - iph->ihl*4; + nf_nat_csum(skb, iph, tcph, datalen, &tcph->check, oldlen); + + if (adjust && rep_len != match_len) + nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq, + (int)rep_len - (int)match_len); + + return 1; +} +EXPORT_SYMBOL(__nf_nat_mangle_tcp_packet); + +/* Generic function for mangling variable-length address changes inside + * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX + * command in the Amanda protocol) + * + * Takes care about all the nasty sequence number changes, checksumming, + * skb enlargement, ... + * + * XXX - This function could be merged with nf_nat_mangle_tcp_packet which + * should be fairly easy to do. 
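+ *
+ * For a concrete caller see the Amanda helper earlier in this patch,
+ * which rewrites a decimal port string inside the UDP payload (the call
+ * below is quoted from nf_nat_amanda.c above, not added here):
+ *
+ *	nf_nat_mangle_udp_packet(skb, exp->master, ctinfo,
+ *				 matchoff, matchlen,
+ *				 buffer, strlen(buffer));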
+ */ +int +nf_nat_mangle_udp_packet(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int match_offset, + unsigned int match_len, + const char *rep_buffer, + unsigned int rep_len) +{ + struct iphdr *iph; + struct udphdr *udph; + int datalen, oldlen; + + /* UDP helpers might accidentally mangle the wrong packet */ + iph = ip_hdr(skb); + if (skb->len < iph->ihl*4 + sizeof(*udph) + + match_offset + match_len) + return 0; + + if (!skb_make_writable(skb, skb->len)) + return 0; + + if (rep_len > match_len && + rep_len - match_len > skb_tailroom(skb) && + !enlarge_skb(skb, rep_len - match_len)) + return 0; + + iph = ip_hdr(skb); + udph = (void *)iph + iph->ihl*4; + + oldlen = skb->len - iph->ihl*4; + mangle_contents(skb, iph->ihl*4 + sizeof(*udph), + match_offset, match_len, rep_buffer, rep_len); + + /* update the length of the UDP packet */ + datalen = skb->len - iph->ihl*4; + udph->len = htons(datalen); + + /* fix udp checksum if udp checksum was previously calculated */ + if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL) + return 1; + + nf_nat_csum(skb, iph, udph, datalen, &udph->check, oldlen); + + return 1; +} +EXPORT_SYMBOL(nf_nat_mangle_udp_packet); + +/* Adjust one found SACK option including checksum correction */ +static void +sack_adjust(struct sk_buff *skb, + struct tcphdr *tcph, + unsigned int sackoff, + unsigned int sackend, + struct nf_nat_seq *natseq) +{ + while (sackoff < sackend) { + struct tcp_sack_block_wire *sack; + __be32 new_start_seq, new_end_seq; + + sack = (void *)skb->data + sackoff; + if (after(ntohl(sack->start_seq) - natseq->offset_before, + natseq->correction_pos)) + new_start_seq = htonl(ntohl(sack->start_seq) + - natseq->offset_after); + else + new_start_seq = htonl(ntohl(sack->start_seq) + - natseq->offset_before); + + if (after(ntohl(sack->end_seq) - natseq->offset_before, + natseq->correction_pos)) + new_end_seq = htonl(ntohl(sack->end_seq) + - natseq->offset_after); + else + new_end_seq = htonl(ntohl(sack->end_seq) + - natseq->offset_before); + + pr_debug("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n", + ntohl(sack->start_seq), new_start_seq, + ntohl(sack->end_seq), new_end_seq); + + inet_proto_csum_replace4(&tcph->check, skb, + sack->start_seq, new_start_seq, 0); + inet_proto_csum_replace4(&tcph->check, skb, + sack->end_seq, new_end_seq, 0); + sack->start_seq = new_start_seq; + sack->end_seq = new_end_seq; + sackoff += sizeof(*sack); + } +} + +/* TCP SACK sequence number adjustment */ +static inline unsigned int +nf_nat_sack_adjust(struct sk_buff *skb, + struct tcphdr *tcph, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + unsigned int dir, optoff, optend; + struct nf_conn_nat *nat = nfct_nat(ct); + + optoff = ip_hdrlen(skb) + sizeof(struct tcphdr); + optend = ip_hdrlen(skb) + tcph->doff * 4; + + if (!skb_make_writable(skb, optend)) + return 0; + + dir = CTINFO2DIR(ctinfo); + + while (optoff < optend) { + /* Usually: option, length. */ + unsigned char *op = skb->data + optoff; + + switch (op[0]) { + case TCPOPT_EOL: + return 1; + case TCPOPT_NOP: + optoff++; + continue; + default: + /* no partial options */ + if (optoff + 1 == optend || + optoff + op[1] > optend || + op[1] < 2) + return 0; + if (op[0] == TCPOPT_SACK && + op[1] >= 2+TCPOLEN_SACK_PERBLOCK && + ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0) + sack_adjust(skb, tcph, optoff+2, + optoff+op[1], &nat->seq[!dir]); + optoff += op[1]; + } + } + return 1; +} + +/* TCP sequence number adjustment. 
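+
+   Worked example (numbers invented): if a helper grew an
+   ORIGINAL-direction segment by 3 bytes at sequence 1000, that
+   direction's nf_nat_seq record ends up with correction_pos = 1000,
+   offset_before = 0 and offset_after = 3; later ORIGINAL segments past
+   1000 have their seq raised by 3, while REPLY segments acknowledging
+   beyond that point have their ack_seq lowered by 3, so each endpoint
+   keeps seeing a stream consistent with what it actually sent.
+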
Returns 1 on success, 0 on failure */ +int +nf_nat_seq_adjust(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + struct tcphdr *tcph; + int dir; + __be32 newseq, newack; + s16 seqoff, ackoff; + struct nf_conn_nat *nat = nfct_nat(ct); + struct nf_nat_seq *this_way, *other_way; + + dir = CTINFO2DIR(ctinfo); + + this_way = &nat->seq[dir]; + other_way = &nat->seq[!dir]; + + if (!skb_make_writable(skb, ip_hdrlen(skb) + sizeof(*tcph))) + return 0; + + tcph = (void *)skb->data + ip_hdrlen(skb); + if (after(ntohl(tcph->seq), this_way->correction_pos)) + seqoff = this_way->offset_after; + else + seqoff = this_way->offset_before; + + if (after(ntohl(tcph->ack_seq) - other_way->offset_before, + other_way->correction_pos)) + ackoff = other_way->offset_after; + else + ackoff = other_way->offset_before; + + newseq = htonl(ntohl(tcph->seq) + seqoff); + newack = htonl(ntohl(tcph->ack_seq) - ackoff); + + inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0); + inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0); + + pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n", + ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), + ntohl(newack)); + + tcph->seq = newseq; + tcph->ack_seq = newack; + + return nf_nat_sack_adjust(skb, tcph, ct, ctinfo); +} + +/* Setup NAT on this expected conntrack so it follows master. */ +/* If we fail to get a free NAT slot, we'll get dropped on confirm */ +void nf_nat_follow_master(struct nf_conn *ct, + struct nf_conntrack_expect *exp) +{ + struct nf_nat_range range; + + /* This must be a fresh one. */ + BUG_ON(ct->status & IPS_NAT_DONE_MASK); + + /* Change src to where master sends to */ + range.flags = IP_NAT_RANGE_MAP_IPS; + range.min_ip = range.max_ip + = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip; + nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC); + + /* For DST manip, map port here to where it's expected. */ + range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); + range.min = range.max = exp->saved_proto; + range.min_ip = range.max_ip + = ct->master->tuplehash[!exp->dir].tuple.src.u3.ip; + nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST); +} +EXPORT_SYMBOL(nf_nat_follow_master); diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c new file mode 100644 index 00000000..535e1a80 --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_irc.c @@ -0,0 +1,99 @@ +/* IRC extension for TCP NAT alteration. + * + * (C) 2000-2001 by Harald Welte + * (C) 2004 Rusty Russell IBM Corporation + * based on a copy of RR's ip_nat_ftp.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("IRC (DCC) NAT helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ip_nat_irc"); + +static unsigned int help(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp) +{ + char buffer[sizeof("4294967296 65635")]; + u_int32_t ip; + u_int16_t port; + unsigned int ret; + + /* Reply comes from server. 
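+
+      Illustration (invented address): the rewritten DCC text carries the
+      address as a single decimal 32-bit number, so 192.168.1.5 port 1024
+      is emitted as "3232235781 1024" (192*2^24 + 168*2^16 + 1*2^8 + 5).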
*/ + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->dir = IP_CT_DIR_REPLY; + exp->expectfn = nf_nat_follow_master; + + /* Try to get same port: if not, try to change it. */ + for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { + int ret; + + exp->tuple.dst.u.tcp.port = htons(port); + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + port = 0; + break; + } + } + + if (port == 0) + return NF_DROP; + + ip = ntohl(exp->master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip); + sprintf(buffer, "%u %u", ip, port); + pr_debug("nf_nat_irc: inserting '%s' == %pI4, port %u\n", + buffer, &ip, port); + + ret = nf_nat_mangle_tcp_packet(skb, exp->master, ctinfo, + matchoff, matchlen, buffer, + strlen(buffer)); + if (ret != NF_ACCEPT) + nf_ct_unexpect_related(exp); + return ret; +} + +static void __exit nf_nat_irc_fini(void) +{ + rcu_assign_pointer(nf_nat_irc_hook, NULL); + synchronize_rcu(); +} + +static int __init nf_nat_irc_init(void) +{ + BUG_ON(nf_nat_irc_hook != NULL); + rcu_assign_pointer(nf_nat_irc_hook, help); + return 0; +} + +/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ +static int warn_set(const char *val, struct kernel_param *kp) +{ + printk(KERN_INFO KBUILD_MODNAME + ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); + return 0; +} +module_param_call(ports, warn_set, NULL, NULL, 0); + +module_init(nf_nat_irc_init); +module_exit(nf_nat_irc_fini); diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c new file mode 100644 index 00000000..4c060038 --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -0,0 +1,308 @@ +/* + * nf_nat_pptp.c + * + * NAT support for PPTP (Point to Point Tunneling Protocol). + * PPTP is a a protocol for creating virtual private networks. + * It is a specification defined by Microsoft and some vendors + * working with Microsoft. PPTP is built on top of a modified + * version of the Internet Generic Routing Encapsulation Protocol. + * GRE is defined in RFC 1701 and RFC 1702. Documentation of + * PPTP can be found in RFC 2637 + * + * (C) 2000-2005 by Harald Welte + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + * + * TODO: - NAT to a unique tuple, not to TCP source port + * (needs netfilter tuple reservation) + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#define NF_NAT_PPTP_VERSION "3.0" + +#define REQ_CID(req, off) (*(__be16 *)((char *)(req) + (off))) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("Netfilter NAT helper module for PPTP"); +MODULE_ALIAS("ip_nat_pptp"); + +static void pptp_nat_expected(struct nf_conn *ct, + struct nf_conntrack_expect *exp) +{ + struct net *net = nf_ct_net(ct); + const struct nf_conn *master = ct->master; + struct nf_conntrack_expect *other_exp; + struct nf_conntrack_tuple t; + const struct nf_ct_pptp_master *ct_pptp_info; + const struct nf_nat_pptp *nat_pptp_info; + struct nf_nat_range range; + + ct_pptp_info = &nfct_help(master)->help.ct_pptp_info; + nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info; + + /* And here goes the grand finale of corrosion... 
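+
+      Summary of the block below: it rebuilds the tuple under which the
+      opposite-direction GRE expectation was registered (see
+      pptp_exp_gre() further down), drops that expectation if it is still
+      pending, and then sets up SRC and DST NAT for the newly expected
+      GRE connection, using exp->saved_proto where a specific call-ID
+      mapping is required.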
*/ + if (exp->dir == IP_CT_DIR_ORIGINAL) { + pr_debug("we are PNS->PAC\n"); + /* therefore, build tuple for PAC->PNS */ + t.src.l3num = AF_INET; + t.src.u3.ip = master->tuplehash[!exp->dir].tuple.src.u3.ip; + t.src.u.gre.key = ct_pptp_info->pac_call_id; + t.dst.u3.ip = master->tuplehash[!exp->dir].tuple.dst.u3.ip; + t.dst.u.gre.key = ct_pptp_info->pns_call_id; + t.dst.protonum = IPPROTO_GRE; + } else { + pr_debug("we are PAC->PNS\n"); + /* build tuple for PNS->PAC */ + t.src.l3num = AF_INET; + t.src.u3.ip = master->tuplehash[!exp->dir].tuple.src.u3.ip; + t.src.u.gre.key = nat_pptp_info->pns_call_id; + t.dst.u3.ip = master->tuplehash[!exp->dir].tuple.dst.u3.ip; + t.dst.u.gre.key = nat_pptp_info->pac_call_id; + t.dst.protonum = IPPROTO_GRE; + } + + pr_debug("trying to unexpect other dir: "); + nf_ct_dump_tuple_ip(&t); + other_exp = nf_ct_expect_find_get(net, nf_ct_zone(ct), &t); + if (other_exp) { + nf_ct_unexpect_related(other_exp); + nf_ct_expect_put(other_exp); + pr_debug("success\n"); + } else { + pr_debug("not found!\n"); + } + + /* This must be a fresh one. */ + BUG_ON(ct->status & IPS_NAT_DONE_MASK); + + /* Change src to where master sends to */ + range.flags = IP_NAT_RANGE_MAP_IPS; + range.min_ip = range.max_ip + = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip; + if (exp->dir == IP_CT_DIR_ORIGINAL) { + range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; + range.min = range.max = exp->saved_proto; + } + nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC); + + /* For DST manip, map port here to where it's expected. */ + range.flags = IP_NAT_RANGE_MAP_IPS; + range.min_ip = range.max_ip + = ct->master->tuplehash[!exp->dir].tuple.src.u3.ip; + if (exp->dir == IP_CT_DIR_REPLY) { + range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; + range.min = range.max = exp->saved_proto; + } + nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST); +} + +/* outbound packets == from PNS to PAC */ +static int +pptp_outbound_pkt(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + struct PptpControlHeader *ctlh, + union pptp_ctrl_union *pptpReq) + +{ + struct nf_ct_pptp_master *ct_pptp_info; + struct nf_nat_pptp *nat_pptp_info; + u_int16_t msg; + __be16 new_callid; + unsigned int cid_off; + + ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info; + nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; + + new_callid = ct_pptp_info->pns_call_id; + + switch (msg = ntohs(ctlh->messageType)) { + case PPTP_OUT_CALL_REQUEST: + cid_off = offsetof(union pptp_ctrl_union, ocreq.callID); + /* FIXME: ideally we would want to reserve a call ID + * here. current netfilter NAT core is not able to do + * this :( For now we use TCP source port. This breaks + * multiple calls within one control session */ + + /* save original call ID in nat_info */ + nat_pptp_info->pns_call_id = ct_pptp_info->pns_call_id; + + /* don't use tcph->source since we are at a DSTmanip + * hook (e.g. PREROUTING) and pkt is not mangled yet */ + new_callid = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port; + + /* save new call ID in ct info */ + ct_pptp_info->pns_call_id = new_callid; + break; + case PPTP_IN_CALL_REPLY: + cid_off = offsetof(union pptp_ctrl_union, icack.callID); + break; + case PPTP_CALL_CLEAR_REQUEST: + cid_off = offsetof(union pptp_ctrl_union, clrreq.callID); + break; + default: + pr_debug("unknown outbound packet 0x%04x:%s\n", msg, + msg <= PPTP_MSG_MAX ? 
pptp_msg_name[msg] : + pptp_msg_name[0]); + /* fall through */ + case PPTP_SET_LINK_INFO: + /* only need to NAT in case PAC is behind NAT box */ + case PPTP_START_SESSION_REQUEST: + case PPTP_START_SESSION_REPLY: + case PPTP_STOP_SESSION_REQUEST: + case PPTP_STOP_SESSION_REPLY: + case PPTP_ECHO_REQUEST: + case PPTP_ECHO_REPLY: + /* no need to alter packet */ + return NF_ACCEPT; + } + + /* only OUT_CALL_REQUEST, IN_CALL_REPLY, CALL_CLEAR_REQUEST pass + * down to here */ + pr_debug("altering call id from 0x%04x to 0x%04x\n", + ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid)); + + /* mangle packet */ + if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, + cid_off + sizeof(struct pptp_pkt_hdr) + + sizeof(struct PptpControlHeader), + sizeof(new_callid), (char *)&new_callid, + sizeof(new_callid)) == 0) + return NF_DROP; + return NF_ACCEPT; +} + +static void +pptp_exp_gre(struct nf_conntrack_expect *expect_orig, + struct nf_conntrack_expect *expect_reply) +{ + const struct nf_conn *ct = expect_orig->master; + struct nf_ct_pptp_master *ct_pptp_info; + struct nf_nat_pptp *nat_pptp_info; + + ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info; + nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; + + /* save original PAC call ID in nat_info */ + nat_pptp_info->pac_call_id = ct_pptp_info->pac_call_id; + + /* alter expectation for PNS->PAC direction */ + expect_orig->saved_proto.gre.key = ct_pptp_info->pns_call_id; + expect_orig->tuple.src.u.gre.key = nat_pptp_info->pns_call_id; + expect_orig->tuple.dst.u.gre.key = ct_pptp_info->pac_call_id; + expect_orig->dir = IP_CT_DIR_ORIGINAL; + + /* alter expectation for PAC->PNS direction */ + expect_reply->saved_proto.gre.key = nat_pptp_info->pns_call_id; + expect_reply->tuple.src.u.gre.key = nat_pptp_info->pac_call_id; + expect_reply->tuple.dst.u.gre.key = ct_pptp_info->pns_call_id; + expect_reply->dir = IP_CT_DIR_REPLY; +} + +/* inbound packets == from PAC to PNS */ +static int +pptp_inbound_pkt(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + struct PptpControlHeader *ctlh, + union pptp_ctrl_union *pptpReq) +{ + const struct nf_nat_pptp *nat_pptp_info; + u_int16_t msg; + __be16 new_pcid; + unsigned int pcid_off; + + nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; + new_pcid = nat_pptp_info->pns_call_id; + + switch (msg = ntohs(ctlh->messageType)) { + case PPTP_OUT_CALL_REPLY: + pcid_off = offsetof(union pptp_ctrl_union, ocack.peersCallID); + break; + case PPTP_IN_CALL_CONNECT: + pcid_off = offsetof(union pptp_ctrl_union, iccon.peersCallID); + break; + case PPTP_IN_CALL_REQUEST: + /* only need to nat in case PAC is behind NAT box */ + return NF_ACCEPT; + case PPTP_WAN_ERROR_NOTIFY: + pcid_off = offsetof(union pptp_ctrl_union, wanerr.peersCallID); + break; + case PPTP_CALL_DISCONNECT_NOTIFY: + pcid_off = offsetof(union pptp_ctrl_union, disc.callID); + break; + case PPTP_SET_LINK_INFO: + pcid_off = offsetof(union pptp_ctrl_union, setlink.peersCallID); + break; + default: + pr_debug("unknown inbound packet %s\n", + msg <= PPTP_MSG_MAX ? 
pptp_msg_name[msg] : + pptp_msg_name[0]); + /* fall through */ + case PPTP_START_SESSION_REQUEST: + case PPTP_START_SESSION_REPLY: + case PPTP_STOP_SESSION_REQUEST: + case PPTP_STOP_SESSION_REPLY: + case PPTP_ECHO_REQUEST: + case PPTP_ECHO_REPLY: + /* no need to alter packet */ + return NF_ACCEPT; + } + + /* only OUT_CALL_REPLY, IN_CALL_CONNECT, IN_CALL_REQUEST, + * WAN_ERROR_NOTIFY, CALL_DISCONNECT_NOTIFY pass down here */ + + /* mangle packet */ + pr_debug("altering peer call id from 0x%04x to 0x%04x\n", + ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid)); + + if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, + pcid_off + sizeof(struct pptp_pkt_hdr) + + sizeof(struct PptpControlHeader), + sizeof(new_pcid), (char *)&new_pcid, + sizeof(new_pcid)) == 0) + return NF_DROP; + return NF_ACCEPT; +} + +static int __init nf_nat_helper_pptp_init(void) +{ + nf_nat_need_gre(); + + BUG_ON(nf_nat_pptp_hook_outbound != NULL); + rcu_assign_pointer(nf_nat_pptp_hook_outbound, pptp_outbound_pkt); + + BUG_ON(nf_nat_pptp_hook_inbound != NULL); + rcu_assign_pointer(nf_nat_pptp_hook_inbound, pptp_inbound_pkt); + + BUG_ON(nf_nat_pptp_hook_exp_gre != NULL); + rcu_assign_pointer(nf_nat_pptp_hook_exp_gre, pptp_exp_gre); + + BUG_ON(nf_nat_pptp_hook_expectfn != NULL); + rcu_assign_pointer(nf_nat_pptp_hook_expectfn, pptp_nat_expected); + return 0; +} + +static void __exit nf_nat_helper_pptp_fini(void) +{ + rcu_assign_pointer(nf_nat_pptp_hook_expectfn, NULL); + rcu_assign_pointer(nf_nat_pptp_hook_exp_gre, NULL); + rcu_assign_pointer(nf_nat_pptp_hook_inbound, NULL); + rcu_assign_pointer(nf_nat_pptp_hook_outbound, NULL); + synchronize_rcu(); +} + +module_init(nf_nat_helper_pptp_init); +module_exit(nf_nat_helper_pptp_fini); diff --git a/net/ipv4/netfilter/nf_nat_proto_common.c b/net/ipv4/netfilter/nf_nat_proto_common.c new file mode 100644 index 00000000..f52d41ea --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_proto_common.c @@ -0,0 +1,125 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team + * (C) 2008 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +bool nf_nat_proto_in_range(const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype, + const union nf_conntrack_man_proto *min, + const union nf_conntrack_man_proto *max) +{ + __be16 port; + + if (maniptype == IP_NAT_MANIP_SRC) + port = tuple->src.u.all; + else + port = tuple->dst.u.all; + + return ntohs(port) >= ntohs(min->all) && + ntohs(port) <= ntohs(max->all); +} +EXPORT_SYMBOL_GPL(nf_nat_proto_in_range); + +void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct, + u_int16_t *rover) +{ + unsigned int range_size, min, i; + __be16 *portptr; + u_int16_t off; + + if (maniptype == IP_NAT_MANIP_SRC) + portptr = &tuple->src.u.all; + else + portptr = &tuple->dst.u.all; + + /* If no range specified... 
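+	 * pick a default pool based on the original port: ports below 512 stay
+	 * within 1-511, ports in 512-1023 are mapped into 600-1023, and
+	 * everything else uses 1024-65535; destination ports are never changed
+	 * here.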
*/ + if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { + /* If it's dst rewrite, can't change port */ + if (maniptype == IP_NAT_MANIP_DST) + return; + + if (ntohs(*portptr) < 1024) { + /* Loose convention: >> 512 is credential passing */ + if (ntohs(*portptr) < 512) { + min = 1; + range_size = 511 - min + 1; + } else { + min = 600; + range_size = 1023 - min + 1; + } + } else { + min = 1024; + range_size = 65535 - 1024 + 1; + } + } else { + min = ntohs(range->min.all); + range_size = ntohs(range->max.all) - min + 1; + } + + if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) + off = secure_ipv4_port_ephemeral(tuple->src.u3.ip, tuple->dst.u3.ip, + maniptype == IP_NAT_MANIP_SRC + ? tuple->dst.u.all + : tuple->src.u.all); + else + off = *rover; + + for (i = 0; ; ++off) { + *portptr = htons(min + off % range_size); + if (++i != range_size && nf_nat_used_tuple(tuple, ct)) + continue; + if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) + *rover = off; + return; + } + return; +} +EXPORT_SYMBOL_GPL(nf_nat_proto_unique_tuple); + +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +int nf_nat_proto_range_to_nlattr(struct sk_buff *skb, + const struct nf_nat_range *range) +{ + NLA_PUT_BE16(skb, CTA_PROTONAT_PORT_MIN, range->min.all); + NLA_PUT_BE16(skb, CTA_PROTONAT_PORT_MAX, range->max.all); + return 0; + +nla_put_failure: + return -1; +} +EXPORT_SYMBOL_GPL(nf_nat_proto_nlattr_to_range); + +int nf_nat_proto_nlattr_to_range(struct nlattr *tb[], + struct nf_nat_range *range) +{ + if (tb[CTA_PROTONAT_PORT_MIN]) { + range->min.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); + range->max.all = range->min.tcp.port; + range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED; + } + if (tb[CTA_PROTONAT_PORT_MAX]) { + range->max.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); + range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED; + } + return 0; +} +EXPORT_SYMBOL_GPL(nf_nat_proto_range_to_nlattr); +#endif diff --git a/net/ipv4/netfilter/nf_nat_proto_dccp.c b/net/ipv4/netfilter/nf_nat_proto_dccp.c new file mode 100644 index 00000000..570faf26 --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_proto_dccp.c @@ -0,0 +1,108 @@ +/* + * DCCP NAT protocol helper + * + * Copyright (c) 2005, 2006. 2008 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static u_int16_t dccp_port_rover; + +static void +dccp_unique_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, + &dccp_port_rover); +} + +static bool +dccp_manip_pkt(struct sk_buff *skb, + unsigned int iphdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + const struct iphdr *iph = (const void *)(skb->data + iphdroff); + struct dccp_hdr *hdr; + unsigned int hdroff = iphdroff + iph->ihl * 4; + __be32 oldip, newip; + __be16 *portptr, oldport, newport; + int hdrsize = 8; /* DCCP connection tracking guarantees this much */ + + if (skb->len >= hdroff + sizeof(struct dccp_hdr)) + hdrsize = sizeof(struct dccp_hdr); + + if (!skb_make_writable(skb, hdroff + hdrsize)) + return false; + + iph = (struct iphdr *)(skb->data + iphdroff); + hdr = (struct dccp_hdr *)(skb->data + hdroff); + + if (maniptype == IP_NAT_MANIP_SRC) { + oldip = iph->saddr; + newip = tuple->src.u3.ip; + newport = tuple->src.u.dccp.port; + portptr = &hdr->dccph_sport; + } else { + oldip = iph->daddr; + newip = tuple->dst.u3.ip; + newport = tuple->dst.u.dccp.port; + portptr = &hdr->dccph_dport; + } + + oldport = *portptr; + *portptr = newport; + + if (hdrsize < sizeof(*hdr)) + return true; + + inet_proto_csum_replace4(&hdr->dccph_checksum, skb, oldip, newip, 1); + inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport, + 0); + return true; +} + +static const struct nf_nat_protocol nf_nat_protocol_dccp = { + .protonum = IPPROTO_DCCP, + .me = THIS_MODULE, + .manip_pkt = dccp_manip_pkt, + .in_range = nf_nat_proto_in_range, + .unique_tuple = dccp_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .range_to_nlattr = nf_nat_proto_range_to_nlattr, + .nlattr_to_range = nf_nat_proto_nlattr_to_range, +#endif +}; + +static int __init nf_nat_proto_dccp_init(void) +{ + return nf_nat_protocol_register(&nf_nat_protocol_dccp); +} + +static void __exit nf_nat_proto_dccp_fini(void) +{ + nf_nat_protocol_unregister(&nf_nat_protocol_dccp); +} + +module_init(nf_nat_proto_dccp_init); +module_exit(nf_nat_proto_dccp_fini); + +MODULE_AUTHOR("Patrick McHardy "); +MODULE_DESCRIPTION("DCCP NAT protocol helper"); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c new file mode 100644 index 00000000..bc8d83a3 --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -0,0 +1,149 @@ +/* + * nf_nat_proto_gre.c + * + * NAT protocol helper module for GRE. + * + * GRE is a generic encapsulation protocol, which is generally not very + * suited for NAT, as it has no protocol-specific part as port numbers. + * + * It has an optional key field, which may help us distinguishing two + * connections between the same two hosts. + * + * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784 + * + * PPTP is built on top of a modified version of GRE, and has a mandatory + * field called "CallID", which serves us for the same purpose as the key + * field in plain GRE. 
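+ * For NAT purposes the call ID is therefore treated much like a port number
+ * when a unique tuple has to be picked (see gre_unique_tuple() below).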
+ * + * Documentation about PPTP can be found in RFC 2637 + * + * (C) 2000-2005 by Harald Welte + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + * + */ + +#include +#include +#include + +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE"); + +/* generate unique tuple ... */ +static void +gre_unique_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + static u_int16_t key; + __be16 *keyptr; + unsigned int min, i, range_size; + + /* If there is no master conntrack we are not PPTP, + do not change tuples */ + if (!ct->master) + return; + + if (maniptype == IP_NAT_MANIP_SRC) + keyptr = &tuple->src.u.gre.key; + else + keyptr = &tuple->dst.u.gre.key; + + if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { + pr_debug("%p: NATing GRE PPTP\n", ct); + min = 1; + range_size = 0xffff; + } else { + min = ntohs(range->min.gre.key); + range_size = ntohs(range->max.gre.key) - min + 1; + } + + pr_debug("min = %u, range_size = %u\n", min, range_size); + + for (i = 0; ; ++key) { + *keyptr = htons(min + key % range_size); + if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) + return; + } + + pr_debug("%p: no NAT mapping\n", ct); + return; +} + +/* manipulate a GRE packet according to maniptype */ +static bool +gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + const struct gre_hdr *greh; + struct gre_hdr_pptp *pgreh; + const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); + unsigned int hdroff = iphdroff + iph->ihl * 4; + + /* pgreh includes two optional 32bit fields which are not required + * to be there. That's where the magic '8' comes from */ + if (!skb_make_writable(skb, hdroff + sizeof(*pgreh) - 8)) + return false; + + greh = (void *)skb->data + hdroff; + pgreh = (struct gre_hdr_pptp *)greh; + + /* we only have destination manip of a packet, since 'source key' + * is not present in the packet itself */ + if (maniptype != IP_NAT_MANIP_DST) + return true; + switch (greh->version) { + case GRE_VERSION_1701: + /* We do not currently NAT any GREv0 packets. 
+ * Try to behave like "nf_nat_proto_unknown" */ + break; + case GRE_VERSION_PPTP: + pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key)); + pgreh->call_id = tuple->dst.u.gre.key; + break; + default: + pr_debug("can't nat unknown GRE version\n"); + return false; + } + return true; +} + +static const struct nf_nat_protocol gre = { + .protonum = IPPROTO_GRE, + .me = THIS_MODULE, + .manip_pkt = gre_manip_pkt, + .in_range = nf_nat_proto_in_range, + .unique_tuple = gre_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .range_to_nlattr = nf_nat_proto_range_to_nlattr, + .nlattr_to_range = nf_nat_proto_nlattr_to_range, +#endif +}; + +static int __init nf_nat_proto_gre_init(void) +{ + return nf_nat_protocol_register(&gre); +} + +static void __exit nf_nat_proto_gre_fini(void) +{ + nf_nat_protocol_unregister(&gre); +} + +module_init(nf_nat_proto_gre_init); +module_exit(nf_nat_proto_gre_fini); + +void nf_nat_need_gre(void) +{ + return; +} +EXPORT_SYMBOL_GPL(nf_nat_need_gre); diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c new file mode 100644 index 00000000..5744c3ec --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -0,0 +1,84 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static bool +icmp_in_range(const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype, + const union nf_conntrack_man_proto *min, + const union nf_conntrack_man_proto *max) +{ + return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && + ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); +} + +static void +icmp_unique_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + static u_int16_t id; + unsigned int range_size; + unsigned int i; + + range_size = ntohs(range->max.icmp.id) - ntohs(range->min.icmp.id) + 1; + /* If no range specified... 
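+	 * fall back to probing the full 16-bit ICMP id space.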
*/ + if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) + range_size = 0xFFFF; + + for (i = 0; ; ++id) { + tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) + + (id % range_size)); + if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) + return; + } + return; +} + +static bool +icmp_manip_pkt(struct sk_buff *skb, + unsigned int iphdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); + struct icmphdr *hdr; + unsigned int hdroff = iphdroff + iph->ihl*4; + + if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + return false; + + hdr = (struct icmphdr *)(skb->data + hdroff); + inet_proto_csum_replace2(&hdr->checksum, skb, + hdr->un.echo.id, tuple->src.u.icmp.id, 0); + hdr->un.echo.id = tuple->src.u.icmp.id; + return true; +} + +const struct nf_nat_protocol nf_nat_protocol_icmp = { + .protonum = IPPROTO_ICMP, + .me = THIS_MODULE, + .manip_pkt = icmp_manip_pkt, + .in_range = icmp_in_range, + .unique_tuple = icmp_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .range_to_nlattr = nf_nat_proto_range_to_nlattr, + .nlattr_to_range = nf_nat_proto_nlattr_to_range, +#endif +}; diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c new file mode 100644 index 00000000..756331d4 --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_proto_sctp.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2008 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include + +#include + +static u_int16_t nf_sctp_port_rover; + +static void +sctp_unique_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, + &nf_sctp_port_rover); +} + +static bool +sctp_manip_pkt(struct sk_buff *skb, + unsigned int iphdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); + struct sk_buff *frag; + sctp_sctphdr_t *hdr; + unsigned int hdroff = iphdroff + iph->ihl*4; + __be32 oldip, newip; + __be32 crc32; + + if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + return false; + + iph = (struct iphdr *)(skb->data + iphdroff); + hdr = (struct sctphdr *)(skb->data + hdroff); + + if (maniptype == IP_NAT_MANIP_SRC) { + /* Get rid of src ip and src pt */ + oldip = iph->saddr; + newip = tuple->src.u3.ip; + hdr->source = tuple->src.u.sctp.port; + } else { + /* Get rid of dst ip and dst pt */ + oldip = iph->daddr; + newip = tuple->dst.u3.ip; + hdr->dest = tuple->dst.u.sctp.port; + } + + crc32 = sctp_start_cksum((u8 *)hdr, skb_headlen(skb) - hdroff); + skb_walk_frags(skb, frag) + crc32 = sctp_update_cksum((u8 *)frag->data, skb_headlen(frag), + crc32); + crc32 = sctp_end_cksum(crc32); + hdr->checksum = crc32; + + return true; +} + +static const struct nf_nat_protocol nf_nat_protocol_sctp = { + .protonum = IPPROTO_SCTP, + .me = THIS_MODULE, + .manip_pkt = sctp_manip_pkt, + .in_range = nf_nat_proto_in_range, + .unique_tuple = sctp_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .range_to_nlattr = nf_nat_proto_range_to_nlattr, + .nlattr_to_range = nf_nat_proto_nlattr_to_range, +#endif +}; + +static 
int __init nf_nat_proto_sctp_init(void) +{ + return nf_nat_protocol_register(&nf_nat_protocol_sctp); +} + +static void __exit nf_nat_proto_sctp_exit(void) +{ + nf_nat_protocol_unregister(&nf_nat_protocol_sctp); +} + +module_init(nf_nat_proto_sctp_init); +module_exit(nf_nat_proto_sctp_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SCTP NAT protocol helper"); +MODULE_AUTHOR("Patrick McHardy "); diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c new file mode 100644 index 00000000..aa460a59 --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_proto_tcp.c @@ -0,0 +1,92 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static u_int16_t tcp_port_rover; + +static void +tcp_unique_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &tcp_port_rover); +} + +static bool +tcp_manip_pkt(struct sk_buff *skb, + unsigned int iphdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); + struct tcphdr *hdr; + unsigned int hdroff = iphdroff + iph->ihl*4; + __be32 oldip, newip; + __be16 *portptr, newport, oldport; + int hdrsize = 8; /* TCP connection tracking guarantees this much */ + + /* this could be a inner header returned in icmp packet; in such + cases we cannot update the checksum field since it is outside of + the 8 bytes of transport layer headers we are guaranteed */ + if (skb->len >= hdroff + sizeof(struct tcphdr)) + hdrsize = sizeof(struct tcphdr); + + if (!skb_make_writable(skb, hdroff + hdrsize)) + return false; + + iph = (struct iphdr *)(skb->data + iphdroff); + hdr = (struct tcphdr *)(skb->data + hdroff); + + if (maniptype == IP_NAT_MANIP_SRC) { + /* Get rid of src ip and src pt */ + oldip = iph->saddr; + newip = tuple->src.u3.ip; + newport = tuple->src.u.tcp.port; + portptr = &hdr->source; + } else { + /* Get rid of dst ip and dst pt */ + oldip = iph->daddr; + newip = tuple->dst.u3.ip; + newport = tuple->dst.u.tcp.port; + portptr = &hdr->dest; + } + + oldport = *portptr; + *portptr = newport; + + if (hdrsize < sizeof(*hdr)) + return true; + + inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); + inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0); + return true; +} + +const struct nf_nat_protocol nf_nat_protocol_tcp = { + .protonum = IPPROTO_TCP, + .me = THIS_MODULE, + .manip_pkt = tcp_manip_pkt, + .in_range = nf_nat_proto_in_range, + .unique_tuple = tcp_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .range_to_nlattr = nf_nat_proto_range_to_nlattr, + .nlattr_to_range = nf_nat_proto_nlattr_to_range, +#endif +}; diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c new file mode 100644 index 00000000..dfe65c7e --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_proto_udp.c @@ -0,0 +1,83 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public 
License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static u_int16_t udp_port_rover; + +static void +udp_unique_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &udp_port_rover); +} + +static bool +udp_manip_pkt(struct sk_buff *skb, + unsigned int iphdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); + struct udphdr *hdr; + unsigned int hdroff = iphdroff + iph->ihl*4; + __be32 oldip, newip; + __be16 *portptr, newport; + + if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + return false; + + iph = (struct iphdr *)(skb->data + iphdroff); + hdr = (struct udphdr *)(skb->data + hdroff); + + if (maniptype == IP_NAT_MANIP_SRC) { + /* Get rid of src ip and src pt */ + oldip = iph->saddr; + newip = tuple->src.u3.ip; + newport = tuple->src.u.udp.port; + portptr = &hdr->source; + } else { + /* Get rid of dst ip and dst pt */ + oldip = iph->daddr; + newip = tuple->dst.u3.ip; + newport = tuple->dst.u.udp.port; + portptr = &hdr->dest; + } + if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); + inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, + 0); + if (!hdr->check) + hdr->check = CSUM_MANGLED_0; + } + *portptr = newport; + return true; +} + +const struct nf_nat_protocol nf_nat_protocol_udp = { + .protonum = IPPROTO_UDP, + .me = THIS_MODULE, + .manip_pkt = udp_manip_pkt, + .in_range = nf_nat_proto_in_range, + .unique_tuple = udp_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .range_to_nlattr = nf_nat_proto_range_to_nlattr, + .nlattr_to_range = nf_nat_proto_nlattr_to_range, +#endif +}; diff --git a/net/ipv4/netfilter/nf_nat_proto_udplite.c b/net/ipv4/netfilter/nf_nat_proto_udplite.c new file mode 100644 index 00000000..3cc8c8af --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_proto_udplite.c @@ -0,0 +1,99 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team + * (C) 2008 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include + +static u_int16_t udplite_port_rover; + +static void +udplite_unique_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, + &udplite_port_rover); +} + +static bool +udplite_manip_pkt(struct sk_buff *skb, + unsigned int iphdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); + struct udphdr *hdr; + unsigned int hdroff = iphdroff + iph->ihl*4; + __be32 oldip, newip; + __be16 *portptr, newport; + + if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + return false; + + iph = (struct iphdr *)(skb->data + iphdroff); + hdr = (struct udphdr *)(skb->data + hdroff); + + if (maniptype == IP_NAT_MANIP_SRC) { + /* Get rid of src ip and src pt */ + oldip = iph->saddr; + newip = tuple->src.u3.ip; + newport = tuple->src.u.udp.port; + portptr = &hdr->source; + } else { + /* Get rid of dst ip and dst pt */ + oldip = iph->daddr; + newip = tuple->dst.u3.ip; + newport = tuple->dst.u.udp.port; + portptr = &hdr->dest; + } + + inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); + inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, 0); + if (!hdr->check) + hdr->check = CSUM_MANGLED_0; + + *portptr = newport; + return true; +} + +static const struct nf_nat_protocol nf_nat_protocol_udplite = { + .protonum = IPPROTO_UDPLITE, + .me = THIS_MODULE, + .manip_pkt = udplite_manip_pkt, + .in_range = nf_nat_proto_in_range, + .unique_tuple = udplite_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .range_to_nlattr = nf_nat_proto_range_to_nlattr, + .nlattr_to_range = nf_nat_proto_nlattr_to_range, +#endif +}; + +static int __init nf_nat_proto_udplite_init(void) +{ + return nf_nat_protocol_register(&nf_nat_protocol_udplite); +} + +static void __exit nf_nat_proto_udplite_fini(void) +{ + nf_nat_protocol_unregister(&nf_nat_protocol_udplite); +} + +module_init(nf_nat_proto_udplite_init); +module_exit(nf_nat_proto_udplite_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("UDP-Lite NAT protocol helper"); +MODULE_AUTHOR("Patrick McHardy "); diff --git a/net/ipv4/netfilter/nf_nat_proto_unknown.c b/net/ipv4/netfilter/nf_nat_proto_unknown.c new file mode 100644 index 00000000..a50f2bc1 --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_proto_unknown.c @@ -0,0 +1,53 @@ +/* The "unknown" protocol. This is what is used for protocols we + * don't understand. It's returned by ip_ct_find_proto(). + */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include + +#include +#include +#include +#include + +static bool unknown_in_range(const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type manip_type, + const union nf_conntrack_man_proto *min, + const union nf_conntrack_man_proto *max) +{ + return true; +} + +static void unknown_unique_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + /* Sorry: we can't help you; if it's not unique, we can't frob + anything. 
*/ + return; +} + +static bool +unknown_manip_pkt(struct sk_buff *skb, + unsigned int iphdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + return true; +} + +const struct nf_nat_protocol nf_nat_unknown_protocol = { + /* .me isn't set: getting a ref to this cannot fail. */ + .manip_pkt = unknown_manip_pkt, + .in_range = unknown_in_range, + .unique_tuple = unknown_unique_tuple, +}; diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c new file mode 100644 index 00000000..733c9abc --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -0,0 +1,214 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Everything about the rules for NAT. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define NAT_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \ + (1 << NF_INET_POST_ROUTING) | \ + (1 << NF_INET_LOCAL_OUT) | \ + (1 << NF_INET_LOCAL_IN)) + +static const struct xt_table nat_table = { + .name = "nat", + .valid_hooks = NAT_VALID_HOOKS, + .me = THIS_MODULE, + .af = NFPROTO_IPV4, +}; + +/* Source NAT */ +static unsigned int +ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + const struct nf_nat_multi_range_compat *mr = par->targinfo; + + NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING || + par->hooknum == NF_INET_LOCAL_IN); + + ct = nf_ct_get(skb, &ctinfo); + + /* Connection must be valid and new. */ + NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY)); + NF_CT_ASSERT(par->out != NULL); + + return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC); +} + +static unsigned int +ipt_dnat_target(struct sk_buff *skb, const struct xt_action_param *par) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + const struct nf_nat_multi_range_compat *mr = par->targinfo; + + NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_LOCAL_OUT); + + ct = nf_ct_get(skb, &ctinfo); + + /* Connection must be valid and new. */ + NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + + return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST); +} + +static int ipt_snat_checkentry(const struct xt_tgchk_param *par) +{ + const struct nf_nat_multi_range_compat *mr = par->targinfo; + + /* Must be a valid range */ + if (mr->rangesize != 1) { + pr_info("SNAT: multiple ranges no longer supported\n"); + return -EINVAL; + } + return 0; +} + +static int ipt_dnat_checkentry(const struct xt_tgchk_param *par) +{ + const struct nf_nat_multi_range_compat *mr = par->targinfo; + + /* Must be a valid range */ + if (mr->rangesize != 1) { + pr_info("DNAT: multiple ranges no longer supported\n"); + return -EINVAL; + } + return 0; +} + +static unsigned int +alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) +{ + /* Force range to this IP; let proto decide mapping for + per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). + */ + struct nf_nat_range range; + + range.flags = 0; + pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, + HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC ? 
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip : + &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip); + + return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); +} + +int nf_nat_rule_find(struct sk_buff *skb, + unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct) +{ + struct net *net = nf_ct_net(ct); + int ret; + + ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table); + + if (ret == NF_ACCEPT) { + if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum))) + /* NUL mapping */ + ret = alloc_null_binding(ct, hooknum); + } + return ret; +} + +static struct xt_target ipt_snat_reg __read_mostly = { + .name = "SNAT", + .target = ipt_snat_target, + .targetsize = sizeof(struct nf_nat_multi_range_compat), + .table = "nat", + .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), + .checkentry = ipt_snat_checkentry, + .family = AF_INET, +}; + +static struct xt_target ipt_dnat_reg __read_mostly = { + .name = "DNAT", + .target = ipt_dnat_target, + .targetsize = sizeof(struct nf_nat_multi_range_compat), + .table = "nat", + .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), + .checkentry = ipt_dnat_checkentry, + .family = AF_INET, +}; + +static int __net_init nf_nat_rule_net_init(struct net *net) +{ + struct ipt_replace *repl; + + repl = ipt_alloc_initial_table(&nat_table); + if (repl == NULL) + return -ENOMEM; + net->ipv4.nat_table = ipt_register_table(net, &nat_table, repl); + kfree(repl); + if (IS_ERR(net->ipv4.nat_table)) + return PTR_ERR(net->ipv4.nat_table); + return 0; +} + +static void __net_exit nf_nat_rule_net_exit(struct net *net) +{ + ipt_unregister_table(net, net->ipv4.nat_table); +} + +static struct pernet_operations nf_nat_rule_net_ops = { + .init = nf_nat_rule_net_init, + .exit = nf_nat_rule_net_exit, +}; + +int __init nf_nat_rule_init(void) +{ + int ret; + + ret = register_pernet_subsys(&nf_nat_rule_net_ops); + if (ret != 0) + goto out; + ret = xt_register_target(&ipt_snat_reg); + if (ret != 0) + goto unregister_table; + + ret = xt_register_target(&ipt_dnat_reg); + if (ret != 0) + goto unregister_snat; + + return ret; + + unregister_snat: + xt_unregister_target(&ipt_snat_reg); + unregister_table: + unregister_pernet_subsys(&nf_nat_rule_net_ops); + out: + return ret; +} + +void nf_nat_rule_cleanup(void) +{ + xt_unregister_target(&ipt_dnat_reg); + xt_unregister_target(&ipt_snat_reg); + unregister_pernet_subsys(&nf_nat_rule_net_ops); +} diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c new file mode 100644 index 00000000..e40cf781 --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -0,0 +1,561 @@ +/* SIP extension for NAT alteration. + * + * (C) 2005 by Christian Hentschel + * based on RR's ip_nat_ftp.c and other modules. + * (C) 2007 United Security Providers + * (C) 2007, 2008 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Christian Hentschel "); +MODULE_DESCRIPTION("SIP NAT helper"); +MODULE_ALIAS("ip_nat_sip"); + + +static unsigned int mangle_packet(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int matchoff, unsigned int matchlen, + const char *buffer, unsigned int buflen) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct tcphdr *th; + unsigned int baseoff; + + if (nf_ct_protonum(ct) == IPPROTO_TCP) { + th = (struct tcphdr *)(skb->data + ip_hdrlen(skb)); + baseoff = ip_hdrlen(skb) + th->doff * 4; + matchoff += dataoff - baseoff; + + if (!__nf_nat_mangle_tcp_packet(skb, ct, ctinfo, + matchoff, matchlen, + buffer, buflen, false)) + return 0; + } else { + baseoff = ip_hdrlen(skb) + sizeof(struct udphdr); + matchoff += dataoff - baseoff; + + if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, + matchoff, matchlen, + buffer, buflen)) + return 0; + } + + /* Reload data pointer and adjust datalen value */ + *dptr = skb->data + dataoff; + *datalen += buflen - matchlen; + return 1; +} + +static int map_addr(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int matchoff, unsigned int matchlen, + union nf_inet_addr *addr, __be16 port) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; + unsigned int buflen; + __be32 newaddr; + __be16 newport; + + if (ct->tuplehash[dir].tuple.src.u3.ip == addr->ip && + ct->tuplehash[dir].tuple.src.u.udp.port == port) { + newaddr = ct->tuplehash[!dir].tuple.dst.u3.ip; + newport = ct->tuplehash[!dir].tuple.dst.u.udp.port; + } else if (ct->tuplehash[dir].tuple.dst.u3.ip == addr->ip && + ct->tuplehash[dir].tuple.dst.u.udp.port == port) { + newaddr = ct->tuplehash[!dir].tuple.src.u3.ip; + newport = ct->tuplehash[!dir].tuple.src.u.udp.port; + } else + return 1; + + if (newaddr == addr->ip && newport == port) + return 1; + + buflen = sprintf(buffer, "%pI4:%u", &newaddr, ntohs(newport)); + + return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen, + buffer, buflen); +} + +static int map_sip_addr(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + enum sip_header_types type) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + unsigned int matchlen, matchoff; + union nf_inet_addr addr; + __be16 port; + + if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL, + &matchoff, &matchlen, &addr, &port) <= 0) + return 1; + return map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, + &addr, port); +} + +static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + unsigned int coff, matchoff, matchlen; + enum sip_header_types hdr; + union nf_inet_addr addr; + __be16 port; + int request, in_header; + + /* Basic rules: requests and responses. 
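+	 * A response starts with "SIP/2.0"; anything else is treated as a
+	 * request and gets the address in its request-URI remapped first.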
*/ + if (strnicmp(*dptr, "SIP/2.0", strlen("SIP/2.0")) != 0) { + if (ct_sip_parse_request(ct, *dptr, *datalen, + &matchoff, &matchlen, + &addr, &port) > 0 && + !map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, + &addr, port)) + return NF_DROP; + request = 1; + } else + request = 0; + + if (nf_ct_protonum(ct) == IPPROTO_TCP) + hdr = SIP_HDR_VIA_TCP; + else + hdr = SIP_HDR_VIA_UDP; + + /* Translate topmost Via header and parameters */ + if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, + hdr, NULL, &matchoff, &matchlen, + &addr, &port) > 0) { + unsigned int matchend, poff, plen, buflen, n; + char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; + + /* We're only interested in headers related to this + * connection */ + if (request) { + if (addr.ip != ct->tuplehash[dir].tuple.src.u3.ip || + port != ct->tuplehash[dir].tuple.src.u.udp.port) + goto next; + } else { + if (addr.ip != ct->tuplehash[dir].tuple.dst.u3.ip || + port != ct->tuplehash[dir].tuple.dst.u.udp.port) + goto next; + } + + if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, + &addr, port)) + return NF_DROP; + + matchend = matchoff + matchlen; + + /* The maddr= parameter (RFC 3261) specifies where to send + * the reply. */ + if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, + "maddr=", &poff, &plen, + &addr) > 0 && + addr.ip == ct->tuplehash[dir].tuple.src.u3.ip && + addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) { + buflen = sprintf(buffer, "%pI4", + &ct->tuplehash[!dir].tuple.dst.u3.ip); + if (!mangle_packet(skb, dataoff, dptr, datalen, + poff, plen, buffer, buflen)) + return NF_DROP; + } + + /* The received= parameter (RFC 3261) contains the address + * from which the server received the request. */ + if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, + "received=", &poff, &plen, + &addr) > 0 && + addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip && + addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { + buflen = sprintf(buffer, "%pI4", + &ct->tuplehash[!dir].tuple.src.u3.ip); + if (!mangle_packet(skb, dataoff, dptr, datalen, + poff, plen, buffer, buflen)) + return NF_DROP; + } + + /* The rport= parameter (RFC 3581) contains the port number + * from which the server received the request.
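+		 * Like received= above, it is rewritten to the matching port
+		 * from the opposite direction's tuple when it refers to a port
+		 * belonging to this connection.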
*/ + if (ct_sip_parse_numerical_param(ct, *dptr, matchend, *datalen, + "rport=", &poff, &plen, + &n) > 0 && + htons(n) == ct->tuplehash[dir].tuple.dst.u.udp.port && + htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) { + __be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port; + buflen = sprintf(buffer, "%u", ntohs(p)); + if (!mangle_packet(skb, dataoff, dptr, datalen, + poff, plen, buffer, buflen)) + return NF_DROP; + } + } + +next: + /* Translate Contact headers */ + coff = 0; + in_header = 0; + while (ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen, + SIP_HDR_CONTACT, &in_header, + &matchoff, &matchlen, + &addr, &port) > 0) { + if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, + &addr, port)) + return NF_DROP; + } + + if (!map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_FROM) || + !map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_TO)) + return NF_DROP; + + return NF_ACCEPT; +} + +static void ip_nat_sip_seq_adjust(struct sk_buff *skb, s16 off) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + const struct tcphdr *th; + + if (nf_ct_protonum(ct) != IPPROTO_TCP || off == 0) + return; + + th = (struct tcphdr *)(skb->data + ip_hdrlen(skb)); + nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off); +} + +/* Handles expected signalling connections and media streams */ +static void ip_nat_sip_expected(struct nf_conn *ct, + struct nf_conntrack_expect *exp) +{ + struct nf_nat_range range; + + /* This must be a fresh one. */ + BUG_ON(ct->status & IPS_NAT_DONE_MASK); + + /* For DST manip, map port here to where it's expected. */ + range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); + range.min = range.max = exp->saved_proto; + range.min_ip = range.max_ip = exp->saved_ip; + nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST); + + /* Change src to where master sends to, but only if the connection + * actually came from the same source. */ + if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == + ct->master->tuplehash[exp->dir].tuple.src.u3.ip) { + range.flags = IP_NAT_RANGE_MAP_IPS; + range.min_ip = range.max_ip + = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip; + nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC); + } +} + +static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + struct nf_conntrack_expect *exp, + unsigned int matchoff, + unsigned int matchlen) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + __be32 newip; + u_int16_t port; + char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; + unsigned buflen; + + /* Connection will come from reply */ + if (ct->tuplehash[dir].tuple.src.u3.ip == ct->tuplehash[!dir].tuple.dst.u3.ip) + newip = exp->tuple.dst.u3.ip; + else + newip = ct->tuplehash[!dir].tuple.dst.u3.ip; + + /* If the signalling port matches the connection's source port in the + * original direction, try to use the destination port in the opposite + * direction. 
*/ + if (exp->tuple.dst.u.udp.port == + ct->tuplehash[dir].tuple.src.u.udp.port) + port = ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port); + else + port = ntohs(exp->tuple.dst.u.udp.port); + + exp->saved_ip = exp->tuple.dst.u3.ip; + exp->tuple.dst.u3.ip = newip; + exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port; + exp->dir = !dir; + exp->expectfn = ip_nat_sip_expected; + + for (; port != 0; port++) { + int ret; + + exp->tuple.dst.u.udp.port = htons(port); + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + port = 0; + break; + } + } + + if (port == 0) + return NF_DROP; + + if (exp->tuple.dst.u3.ip != exp->saved_ip || + exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) { + buflen = sprintf(buffer, "%pI4:%u", &newip, port); + if (!mangle_packet(skb, dataoff, dptr, datalen, + matchoff, matchlen, buffer, buflen)) + goto err; + } + return NF_ACCEPT; + +err: + nf_ct_unexpect_related(exp); + return NF_DROP; +} + +static int mangle_content_len(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + unsigned int matchoff, matchlen; + char buffer[sizeof("65536")]; + int buflen, c_len; + + /* Get actual SDP length */ + if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen, + SDP_HDR_VERSION, SDP_HDR_UNSPEC, + &matchoff, &matchlen) <= 0) + return 0; + c_len = *datalen - matchoff + strlen("v="); + + /* Now, update SDP length */ + if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CONTENT_LENGTH, + &matchoff, &matchlen) <= 0) + return 0; + + buflen = sprintf(buffer, "%u", c_len); + return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen, + buffer, buflen); +} + +static int mangle_sdp_packet(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int sdpoff, + enum sdp_header_types type, + enum sdp_header_types term, + char *buffer, int buflen) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + unsigned int matchlen, matchoff; + + if (ct_sip_get_sdp_header(ct, *dptr, sdpoff, *datalen, type, term, + &matchoff, &matchlen) <= 0) + return -ENOENT; + return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen, + buffer, buflen) ? 
0 : -EINVAL; +} + +static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int sdpoff, + enum sdp_header_types type, + enum sdp_header_types term, + const union nf_inet_addr *addr) +{ + char buffer[sizeof("nnn.nnn.nnn.nnn")]; + unsigned int buflen; + + buflen = sprintf(buffer, "%pI4", &addr->ip); + if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, type, term, + buffer, buflen)) + return 0; + + return mangle_content_len(skb, dataoff, dptr, datalen); +} + +static unsigned int ip_nat_sdp_port(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int matchoff, + unsigned int matchlen, + u_int16_t port) +{ + char buffer[sizeof("nnnnn")]; + unsigned int buflen; + + buflen = sprintf(buffer, "%u", port); + if (!mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen, + buffer, buflen)) + return 0; + + return mangle_content_len(skb, dataoff, dptr, datalen); +} + +static unsigned int ip_nat_sdp_session(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int sdpoff, + const union nf_inet_addr *addr) +{ + char buffer[sizeof("nnn.nnn.nnn.nnn")]; + unsigned int buflen; + + /* Mangle session description owner and contact addresses */ + buflen = sprintf(buffer, "%pI4", &addr->ip); + if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, + SDP_HDR_OWNER_IP4, SDP_HDR_MEDIA, + buffer, buflen)) + return 0; + + switch (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, + SDP_HDR_CONNECTION_IP4, SDP_HDR_MEDIA, + buffer, buflen)) { + case 0: + /* + * RFC 2327: + * + * Session description + * + * c=* (connection information - not required if included in all media) + */ + case -ENOENT: + break; + default: + return 0; + } + + return mangle_content_len(skb, dataoff, dptr, datalen); +} + +/* So, this packet has hit the connection tracking matching code. + Mangle it, and change the expectation to match the new version. */ +static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + struct nf_conntrack_expect *rtp_exp, + struct nf_conntrack_expect *rtcp_exp, + unsigned int mediaoff, + unsigned int medialen, + union nf_inet_addr *rtp_addr) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + u_int16_t port; + + /* Connection will come from reply */ + if (ct->tuplehash[dir].tuple.src.u3.ip == + ct->tuplehash[!dir].tuple.dst.u3.ip) + rtp_addr->ip = rtp_exp->tuple.dst.u3.ip; + else + rtp_addr->ip = ct->tuplehash[!dir].tuple.dst.u3.ip; + + rtp_exp->saved_ip = rtp_exp->tuple.dst.u3.ip; + rtp_exp->tuple.dst.u3.ip = rtp_addr->ip; + rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port; + rtp_exp->dir = !dir; + rtp_exp->expectfn = ip_nat_sip_expected; + + rtcp_exp->saved_ip = rtcp_exp->tuple.dst.u3.ip; + rtcp_exp->tuple.dst.u3.ip = rtp_addr->ip; + rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port; + rtcp_exp->dir = !dir; + rtcp_exp->expectfn = ip_nat_sip_expected; + + /* Try to get same pair of ports: if not, try to change them. 
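+	 * RTP and RTCP are kept on adjacent ports (RTCP = RTP port + 1), so
+	 * the search below advances two ports at a time and backs the RTP
+	 * expectation out again whenever the matching RTCP expectation cannot
+	 * be registered.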
*/ + for (port = ntohs(rtp_exp->tuple.dst.u.udp.port); + port != 0; port += 2) { + int ret; + + rtp_exp->tuple.dst.u.udp.port = htons(port); + ret = nf_ct_expect_related(rtp_exp); + if (ret == -EBUSY) + continue; + else if (ret < 0) { + port = 0; + break; + } + rtcp_exp->tuple.dst.u.udp.port = htons(port + 1); + ret = nf_ct_expect_related(rtcp_exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + nf_ct_unexpect_related(rtp_exp); + port = 0; + break; + } + } + + if (port == 0) + goto err1; + + /* Update media port. */ + if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port && + !ip_nat_sdp_port(skb, dataoff, dptr, datalen, + mediaoff, medialen, port)) + goto err2; + + return NF_ACCEPT; + +err2: + nf_ct_unexpect_related(rtp_exp); + nf_ct_unexpect_related(rtcp_exp); +err1: + return NF_DROP; +} + +static void __exit nf_nat_sip_fini(void) +{ + rcu_assign_pointer(nf_nat_sip_hook, NULL); + rcu_assign_pointer(nf_nat_sip_seq_adjust_hook, NULL); + rcu_assign_pointer(nf_nat_sip_expect_hook, NULL); + rcu_assign_pointer(nf_nat_sdp_addr_hook, NULL); + rcu_assign_pointer(nf_nat_sdp_port_hook, NULL); + rcu_assign_pointer(nf_nat_sdp_session_hook, NULL); + rcu_assign_pointer(nf_nat_sdp_media_hook, NULL); + synchronize_rcu(); +} + +static int __init nf_nat_sip_init(void) +{ + BUG_ON(nf_nat_sip_hook != NULL); + BUG_ON(nf_nat_sip_seq_adjust_hook != NULL); + BUG_ON(nf_nat_sip_expect_hook != NULL); + BUG_ON(nf_nat_sdp_addr_hook != NULL); + BUG_ON(nf_nat_sdp_port_hook != NULL); + BUG_ON(nf_nat_sdp_session_hook != NULL); + BUG_ON(nf_nat_sdp_media_hook != NULL); + rcu_assign_pointer(nf_nat_sip_hook, ip_nat_sip); + rcu_assign_pointer(nf_nat_sip_seq_adjust_hook, ip_nat_sip_seq_adjust); + rcu_assign_pointer(nf_nat_sip_expect_hook, ip_nat_sip_expect); + rcu_assign_pointer(nf_nat_sdp_addr_hook, ip_nat_sdp_addr); + rcu_assign_pointer(nf_nat_sdp_port_hook, ip_nat_sdp_port); + rcu_assign_pointer(nf_nat_sdp_session_hook, ip_nat_sdp_session); + rcu_assign_pointer(nf_nat_sdp_media_hook, ip_nat_sdp_media); + return 0; +} + +module_init(nf_nat_sip_init); +module_exit(nf_nat_sip_fini); diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c new file mode 100644 index 00000000..8812a020 --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -0,0 +1,1334 @@ +/* + * nf_nat_snmp_basic.c + * + * Basic SNMP Application Layer Gateway + * + * This IP NAT module is intended for use with SNMP network + * discovery and monitoring applications where target networks use + * conflicting private address realms. + * + * Static NAT is used to remap the networks from the view of the network + * management system at the IP layer, and this module remaps some application + * layer addresses to match. + * + * The simplest form of ALG is performed, where only tagged IP addresses + * are modified. The module does not need to be MIB aware and only scans + * messages at the ASN.1/BER level. + * + * Currently, only SNMPv1 and SNMPv2 are supported. + * + * More information on ALG and associated issues can be found in + * RFC 2962 + * + * The ASB.1/BER parsing code is derived from the gxsnmp package by Gregory + * McLean & Jochen Friedrich, stripped down for use in the kernel. + * + * Copyright (c) 2000 RP Internet (www.rpi.net.au). + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: James Morris + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris "); +MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway"); +MODULE_ALIAS("ip_nat_snmp_basic"); + +#define SNMP_PORT 161 +#define SNMP_TRAP_PORT 162 +#define NOCT1(n) (*(u8 *)(n)) + +static int debug; +static DEFINE_SPINLOCK(snmp_lock); + +/* + * Application layer address mapping mimics the NAT mapping, but + * only for the first octet in this case (a more flexible system + * can be implemented if needed). + */ +struct oct1_map +{ + u_int8_t from; + u_int8_t to; +}; + + +/***************************************************************************** + * + * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse) + * + *****************************************************************************/ + +/* Class */ +#define ASN1_UNI 0 /* Universal */ +#define ASN1_APL 1 /* Application */ +#define ASN1_CTX 2 /* Context */ +#define ASN1_PRV 3 /* Private */ + +/* Tag */ +#define ASN1_EOC 0 /* End Of Contents */ +#define ASN1_BOL 1 /* Boolean */ +#define ASN1_INT 2 /* Integer */ +#define ASN1_BTS 3 /* Bit String */ +#define ASN1_OTS 4 /* Octet String */ +#define ASN1_NUL 5 /* Null */ +#define ASN1_OJI 6 /* Object Identifier */ +#define ASN1_OJD 7 /* Object Description */ +#define ASN1_EXT 8 /* External */ +#define ASN1_SEQ 16 /* Sequence */ +#define ASN1_SET 17 /* Set */ +#define ASN1_NUMSTR 18 /* Numerical String */ +#define ASN1_PRNSTR 19 /* Printable String */ +#define ASN1_TEXSTR 20 /* Teletext String */ +#define ASN1_VIDSTR 21 /* Video String */ +#define ASN1_IA5STR 22 /* IA5 String */ +#define ASN1_UNITIM 23 /* Universal Time */ +#define ASN1_GENTIM 24 /* General Time */ +#define ASN1_GRASTR 25 /* Graphical String */ +#define ASN1_VISSTR 26 /* Visible String */ +#define ASN1_GENSTR 27 /* General String */ + +/* Primitive / Constructed methods*/ +#define ASN1_PRI 0 /* Primitive */ +#define ASN1_CON 1 /* Constructed */ + +/* + * Error codes. + */ +#define ASN1_ERR_NOERROR 0 +#define ASN1_ERR_DEC_EMPTY 2 +#define ASN1_ERR_DEC_EOC_MISMATCH 3 +#define ASN1_ERR_DEC_LENGTH_MISMATCH 4 +#define ASN1_ERR_DEC_BADVALUE 5 + +/* + * ASN.1 context. 
+ */ +struct asn1_ctx +{ + int error; /* Error condition */ + unsigned char *pointer; /* Octet just to be decoded */ + unsigned char *begin; /* First octet */ + unsigned char *end; /* Octet after last octet */ +}; + +/* + * Octet string (not null terminated) + */ +struct asn1_octstr +{ + unsigned char *data; + unsigned int len; +}; + +static void asn1_open(struct asn1_ctx *ctx, + unsigned char *buf, + unsigned int len) +{ + ctx->begin = buf; + ctx->end = buf + len; + ctx->pointer = buf; + ctx->error = ASN1_ERR_NOERROR; +} + +static unsigned char asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch) +{ + if (ctx->pointer >= ctx->end) { + ctx->error = ASN1_ERR_DEC_EMPTY; + return 0; + } + *ch = *(ctx->pointer)++; + return 1; +} + +static unsigned char asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag) +{ + unsigned char ch; + + *tag = 0; + + do + { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + *tag <<= 7; + *tag |= ch & 0x7F; + } while ((ch & 0x80) == 0x80); + return 1; +} + +static unsigned char asn1_id_decode(struct asn1_ctx *ctx, + unsigned int *cls, + unsigned int *con, + unsigned int *tag) +{ + unsigned char ch; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *cls = (ch & 0xC0) >> 6; + *con = (ch & 0x20) >> 5; + *tag = (ch & 0x1F); + + if (*tag == 0x1F) { + if (!asn1_tag_decode(ctx, tag)) + return 0; + } + return 1; +} + +static unsigned char asn1_length_decode(struct asn1_ctx *ctx, + unsigned int *def, + unsigned int *len) +{ + unsigned char ch, cnt; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + if (ch == 0x80) + *def = 0; + else { + *def = 1; + + if (ch < 0x80) + *len = ch; + else { + cnt = ch & 0x7F; + *len = 0; + + while (cnt > 0) { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + *len <<= 8; + *len |= ch; + cnt--; + } + } + } + + /* don't trust len bigger than ctx buffer */ + if (*len > ctx->end - ctx->pointer) + return 0; + + return 1; +} + +static unsigned char asn1_header_decode(struct asn1_ctx *ctx, + unsigned char **eoc, + unsigned int *cls, + unsigned int *con, + unsigned int *tag) +{ + unsigned int def, len; + + if (!asn1_id_decode(ctx, cls, con, tag)) + return 0; + + def = len = 0; + if (!asn1_length_decode(ctx, &def, &len)) + return 0; + + /* primitive shall be definite, indefinite shall be constructed */ + if (*con == ASN1_PRI && !def) + return 0; + + if (def) + *eoc = ctx->pointer + len; + else + *eoc = NULL; + return 1; +} + +static unsigned char asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc) +{ + unsigned char ch; + + if (eoc == NULL) { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + if (ch != 0x00) { + ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + if (ch != 0x00) { + ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; + return 0; + } + return 1; + } else { + if (ctx->pointer != eoc) { + ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH; + return 0; + } + return 1; + } +} + +static unsigned char asn1_null_decode(struct asn1_ctx *ctx, unsigned char *eoc) +{ + ctx->pointer = eoc; + return 1; +} + +static unsigned char asn1_long_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + long *integer) +{ + unsigned char ch; + unsigned int len; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer = (signed char) ch; + len = 1; + + while (ctx->pointer < eoc) { + if (++len > sizeof (long)) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer <<= 8; + *integer |= ch; + } + return 1; +} + +static unsigned char 
asn1_uint_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned int *integer) +{ + unsigned char ch; + unsigned int len; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer = ch; + if (ch == 0) len = 0; + else len = 1; + + while (ctx->pointer < eoc) { + if (++len > sizeof (unsigned int)) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer <<= 8; + *integer |= ch; + } + return 1; +} + +static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned long *integer) +{ + unsigned char ch; + unsigned int len; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer = ch; + if (ch == 0) len = 0; + else len = 1; + + while (ctx->pointer < eoc) { + if (++len > sizeof (unsigned long)) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer <<= 8; + *integer |= ch; + } + return 1; +} + +static unsigned char asn1_octets_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned char **octets, + unsigned int *len) +{ + unsigned char *ptr; + + *len = 0; + + *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); + if (*octets == NULL) { + if (net_ratelimit()) + pr_notice("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + + ptr = *octets; + while (ctx->pointer < eoc) { + if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) { + kfree(*octets); + *octets = NULL; + return 0; + } + (*len)++; + } + return 1; +} + +static unsigned char asn1_subid_decode(struct asn1_ctx *ctx, + unsigned long *subid) +{ + unsigned char ch; + + *subid = 0; + + do { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *subid <<= 7; + *subid |= ch & 0x7F; + } while ((ch & 0x80) == 0x80); + return 1; +} + +static unsigned char asn1_oid_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned long **oid, + unsigned int *len) +{ + unsigned long subid; + unsigned long *optr; + size_t size; + + size = eoc - ctx->pointer + 1; + + /* first subid actually encodes first two subids */ + if (size < 2 || size > ULONG_MAX/sizeof(unsigned long)) + return 0; + + *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); + if (*oid == NULL) { + if (net_ratelimit()) + pr_notice("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + + optr = *oid; + + if (!asn1_subid_decode(ctx, &subid)) { + kfree(*oid); + *oid = NULL; + return 0; + } + + if (subid < 40) { + optr [0] = 0; + optr [1] = subid; + } else if (subid < 80) { + optr [0] = 1; + optr [1] = subid - 40; + } else { + optr [0] = 2; + optr [1] = subid - 80; + } + + *len = 2; + optr += 2; + + while (ctx->pointer < eoc) { + if (++(*len) > size) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + kfree(*oid); + *oid = NULL; + return 0; + } + + if (!asn1_subid_decode(ctx, optr++)) { + kfree(*oid); + *oid = NULL; + return 0; + } + } + return 1; +} + +/***************************************************************************** + * + * SNMP decoding routines (gxsnmp author Dirk Wisse) + * + *****************************************************************************/ + +/* SNMP Versions */ +#define SNMP_V1 0 +#define SNMP_V2C 1 +#define SNMP_V2 2 +#define SNMP_V3 3 + +/* Default Sizes */ +#define SNMP_SIZE_COMM 256 +#define SNMP_SIZE_OBJECTID 128 +#define SNMP_SIZE_BUFCHR 256 +#define SNMP_SIZE_BUFINT 128 +#define SNMP_SIZE_SMALLOBJECTID 16 + +/* Requests */ +#define SNMP_PDU_GET 0 +#define SNMP_PDU_NEXT 1 +#define SNMP_PDU_RESPONSE 2 +#define SNMP_PDU_SET 3 +#define SNMP_PDU_TRAP1 4 +#define SNMP_PDU_BULK 5 +#define 
SNMP_PDU_INFORM 6 +#define SNMP_PDU_TRAP2 7 + +/* Errors */ +#define SNMP_NOERROR 0 +#define SNMP_TOOBIG 1 +#define SNMP_NOSUCHNAME 2 +#define SNMP_BADVALUE 3 +#define SNMP_READONLY 4 +#define SNMP_GENERROR 5 +#define SNMP_NOACCESS 6 +#define SNMP_WRONGTYPE 7 +#define SNMP_WRONGLENGTH 8 +#define SNMP_WRONGENCODING 9 +#define SNMP_WRONGVALUE 10 +#define SNMP_NOCREATION 11 +#define SNMP_INCONSISTENTVALUE 12 +#define SNMP_RESOURCEUNAVAILABLE 13 +#define SNMP_COMMITFAILED 14 +#define SNMP_UNDOFAILED 15 +#define SNMP_AUTHORIZATIONERROR 16 +#define SNMP_NOTWRITABLE 17 +#define SNMP_INCONSISTENTNAME 18 + +/* General SNMP V1 Traps */ +#define SNMP_TRAP_COLDSTART 0 +#define SNMP_TRAP_WARMSTART 1 +#define SNMP_TRAP_LINKDOWN 2 +#define SNMP_TRAP_LINKUP 3 +#define SNMP_TRAP_AUTFAILURE 4 +#define SNMP_TRAP_EQPNEIGHBORLOSS 5 +#define SNMP_TRAP_ENTSPECIFIC 6 + +/* SNMPv1 Types */ +#define SNMP_NULL 0 +#define SNMP_INTEGER 1 /* l */ +#define SNMP_OCTETSTR 2 /* c */ +#define SNMP_DISPLAYSTR 2 /* c */ +#define SNMP_OBJECTID 3 /* ul */ +#define SNMP_IPADDR 4 /* uc */ +#define SNMP_COUNTER 5 /* ul */ +#define SNMP_GAUGE 6 /* ul */ +#define SNMP_TIMETICKS 7 /* ul */ +#define SNMP_OPAQUE 8 /* c */ + +/* Additional SNMPv2 Types */ +#define SNMP_UINTEGER 5 /* ul */ +#define SNMP_BITSTR 9 /* uc */ +#define SNMP_NSAP 10 /* uc */ +#define SNMP_COUNTER64 11 /* ul */ +#define SNMP_NOSUCHOBJECT 12 +#define SNMP_NOSUCHINSTANCE 13 +#define SNMP_ENDOFMIBVIEW 14 + +union snmp_syntax +{ + unsigned char uc[0]; /* 8 bit unsigned */ + char c[0]; /* 8 bit signed */ + unsigned long ul[0]; /* 32 bit unsigned */ + long l[0]; /* 32 bit signed */ +}; + +struct snmp_object +{ + unsigned long *id; + unsigned int id_len; + unsigned short type; + unsigned int syntax_len; + union snmp_syntax syntax; +}; + +struct snmp_request +{ + unsigned long id; + unsigned int error_status; + unsigned int error_index; +}; + +struct snmp_v1_trap +{ + unsigned long *id; + unsigned int id_len; + unsigned long ip_address; /* pointer */ + unsigned int general; + unsigned int specific; + unsigned long time; +}; + +/* SNMP types */ +#define SNMP_IPA 0 +#define SNMP_CNT 1 +#define SNMP_GGE 2 +#define SNMP_TIT 3 +#define SNMP_OPQ 4 +#define SNMP_C64 6 + +/* SNMP errors */ +#define SERR_NSO 0 +#define SERR_NSI 1 +#define SERR_EOM 2 + +static inline void mangle_address(unsigned char *begin, + unsigned char *addr, + const struct oct1_map *map, + __sum16 *check); +struct snmp_cnv +{ + unsigned int class; + unsigned int tag; + int syntax; +}; + +static const struct snmp_cnv snmp_conv[] = { + {ASN1_UNI, ASN1_NUL, SNMP_NULL}, + {ASN1_UNI, ASN1_INT, SNMP_INTEGER}, + {ASN1_UNI, ASN1_OTS, SNMP_OCTETSTR}, + {ASN1_UNI, ASN1_OTS, SNMP_DISPLAYSTR}, + {ASN1_UNI, ASN1_OJI, SNMP_OBJECTID}, + {ASN1_APL, SNMP_IPA, SNMP_IPADDR}, + {ASN1_APL, SNMP_CNT, SNMP_COUNTER}, /* Counter32 */ + {ASN1_APL, SNMP_GGE, SNMP_GAUGE}, /* Gauge32 == Unsigned32 */ + {ASN1_APL, SNMP_TIT, SNMP_TIMETICKS}, + {ASN1_APL, SNMP_OPQ, SNMP_OPAQUE}, + + /* SNMPv2 data types and errors */ + {ASN1_UNI, ASN1_BTS, SNMP_BITSTR}, + {ASN1_APL, SNMP_C64, SNMP_COUNTER64}, + {ASN1_CTX, SERR_NSO, SNMP_NOSUCHOBJECT}, + {ASN1_CTX, SERR_NSI, SNMP_NOSUCHINSTANCE}, + {ASN1_CTX, SERR_EOM, SNMP_ENDOFMIBVIEW}, + {0, 0, -1} +}; + +static unsigned char snmp_tag_cls2syntax(unsigned int tag, + unsigned int cls, + unsigned short *syntax) +{ + const struct snmp_cnv *cnv; + + cnv = snmp_conv; + + while (cnv->syntax != -1) { + if (cnv->tag == tag && cnv->class == cls) { + *syntax = cnv->syntax; + return 1; + } + cnv++; + } + return 0; 
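	/*
	 * A worked example of the BER header decoding the routines above
	 * perform, as a standalone sketch (guarded out, never built):
	 * ber_hdr and decode_ber_header are invented names, and the
	 * indefinite-length and high-tag-number cases that
	 * asn1_length_decode()/asn1_tag_decode() also cover are skipped.
	 */
#if 0
#include <stdio.h>

struct ber_hdr { unsigned int cls, con, tag, len; };

static int decode_ber_header(const unsigned char *p, struct ber_hdr *h)
{
	int used = 1;

	h->cls = (p[0] & 0xC0) >> 6;	/* ASN1_UNI/APL/CTX/PRV */
	h->con = (p[0] & 0x20) >> 5;	/* primitive or constructed */
	h->tag = (p[0] & 0x1F);		/* low-tag-number form only */

	if (p[used] < 0x80) {		/* short form length */
		h->len = p[used++];
	} else {			/* long form: count octets follow */
		int n = p[used++] & 0x7F;

		h->len = 0;
		while (n--)
			h->len = (h->len << 8) | p[used++];
	}
	return used;
}

int main(void)
{
	/* 0x30 = universal/constructed/16: SEQUENCE, length 0x010f = 271 */
	const unsigned char seq[] = { 0x30, 0x82, 0x01, 0x0f };
	/* 0x40 = application/primitive/0: per snmp_conv, an IpAddress */
	const unsigned char ipa[] = { 0x40, 0x04 };
	struct ber_hdr h;

	decode_ber_header(seq, &h);
	printf("cls=%u con=%u tag=%u len=%u\n", h.cls, h.con, h.tag, h.len);
	decode_ber_header(ipa, &h);
	printf("cls=%u con=%u tag=%u len=%u\n", h.cls, h.con, h.tag, h.len);
	return 0;
}
#endif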
+} + +static unsigned char snmp_object_decode(struct asn1_ctx *ctx, + struct snmp_object **obj) +{ + unsigned int cls, con, tag, len, idlen; + unsigned short type; + unsigned char *eoc, *end, *p; + unsigned long *lp, *id; + unsigned long ul; + long l; + + *obj = NULL; + id = NULL; + + if (!asn1_header_decode(ctx, &eoc, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) + return 0; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI) + return 0; + + if (!asn1_oid_decode(ctx, end, &id, &idlen)) + return 0; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) { + kfree(id); + return 0; + } + + if (con != ASN1_PRI) { + kfree(id); + return 0; + } + + type = 0; + if (!snmp_tag_cls2syntax(tag, cls, &type)) { + kfree(id); + return 0; + } + + l = 0; + switch (type) { + case SNMP_INTEGER: + len = sizeof(long); + if (!asn1_long_decode(ctx, end, &l)) { + kfree(id); + return 0; + } + *obj = kmalloc(sizeof(struct snmp_object) + len, + GFP_ATOMIC); + if (*obj == NULL) { + kfree(id); + if (net_ratelimit()) + pr_notice("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + (*obj)->syntax.l[0] = l; + break; + case SNMP_OCTETSTR: + case SNMP_OPAQUE: + if (!asn1_octets_decode(ctx, end, &p, &len)) { + kfree(id); + return 0; + } + *obj = kmalloc(sizeof(struct snmp_object) + len, + GFP_ATOMIC); + if (*obj == NULL) { + kfree(p); + kfree(id); + if (net_ratelimit()) + pr_notice("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + memcpy((*obj)->syntax.c, p, len); + kfree(p); + break; + case SNMP_NULL: + case SNMP_NOSUCHOBJECT: + case SNMP_NOSUCHINSTANCE: + case SNMP_ENDOFMIBVIEW: + len = 0; + *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); + if (*obj == NULL) { + kfree(id); + if (net_ratelimit()) + pr_notice("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + if (!asn1_null_decode(ctx, end)) { + kfree(id); + kfree(*obj); + *obj = NULL; + return 0; + } + break; + case SNMP_OBJECTID: + if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) { + kfree(id); + return 0; + } + len *= sizeof(unsigned long); + *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); + if (*obj == NULL) { + kfree(lp); + kfree(id); + if (net_ratelimit()) + pr_notice("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + memcpy((*obj)->syntax.ul, lp, len); + kfree(lp); + break; + case SNMP_IPADDR: + if (!asn1_octets_decode(ctx, end, &p, &len)) { + kfree(id); + return 0; + } + if (len != 4) { + kfree(p); + kfree(id); + return 0; + } + *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); + if (*obj == NULL) { + kfree(p); + kfree(id); + if (net_ratelimit()) + pr_notice("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + memcpy((*obj)->syntax.uc, p, len); + kfree(p); + break; + case SNMP_COUNTER: + case SNMP_GAUGE: + case SNMP_TIMETICKS: + len = sizeof(unsigned long); + if (!asn1_ulong_decode(ctx, end, &ul)) { + kfree(id); + return 0; + } + *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); + if (*obj == NULL) { + kfree(id); + if (net_ratelimit()) + pr_notice("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + (*obj)->syntax.ul[0] = ul; + break; + default: + kfree(id); + return 0; + } + + (*obj)->syntax_len = len; + (*obj)->type = type; + (*obj)->id = id; + (*obj)->id_len = idlen; + + if (!asn1_eoc_decode(ctx, eoc)) { + kfree(id); + kfree(*obj); + *obj = NULL; + return 0; + } + return 1; +} + +static unsigned char snmp_request_decode(struct asn1_ctx *ctx, + struct snmp_request *request) +{ 
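	/*
	 * For reference, the three fields pulled out below are plain BER
	 * INTEGERs: request-id, error-status, error-index.  A minimal
	 * hand-encoded sketch with invented values (guarded out, never
	 * built):
	 */
#if 0
	static const unsigned char req_hdr[] = {
		0x02, 0x01, 0x2a,	/* INTEGER, length 1: request-id = 42  */
		0x02, 0x01, 0x00,	/* INTEGER, length 1: error-status = 0 */
		0x02, 0x01, 0x00,	/* INTEGER, length 1: error-index = 0  */
	};
#endif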
+ unsigned int cls, con, tag; + unsigned char *end; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + return 0; + + if (!asn1_ulong_decode(ctx, end, &request->id)) + return 0; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + return 0; + + if (!asn1_uint_decode(ctx, end, &request->error_status)) + return 0; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + return 0; + + if (!asn1_uint_decode(ctx, end, &request->error_index)) + return 0; + + return 1; +} + +/* + * Fast checksum update for possibly oddly-aligned UDP byte, from the + * code example in the draft. + */ +static void fast_csum(__sum16 *csum, + const unsigned char *optr, + const unsigned char *nptr, + int offset) +{ + unsigned char s[4]; + + if (offset & 1) { + s[0] = ~0; + s[1] = ~*optr; + s[2] = 0; + s[3] = *nptr; + } else { + s[0] = ~*optr; + s[1] = ~0; + s[2] = *nptr; + s[3] = 0; + } + + *csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum))); +} + +/* + * Mangle IP address. + * - begin points to the start of the snmp messgae + * - addr points to the start of the address + */ +static inline void mangle_address(unsigned char *begin, + unsigned char *addr, + const struct oct1_map *map, + __sum16 *check) +{ + if (map->from == NOCT1(addr)) { + u_int32_t old; + + if (debug) + memcpy(&old, addr, sizeof(old)); + + *addr = map->to; + + /* Update UDP checksum if being used */ + if (*check) { + fast_csum(check, + &map->from, &map->to, addr - begin); + + } + + if (debug) + printk(KERN_DEBUG "bsalg: mapped %pI4 to %pI4\n", + &old, addr); + } +} + +static unsigned char snmp_trap_decode(struct asn1_ctx *ctx, + struct snmp_v1_trap *trap, + const struct oct1_map *map, + __sum16 *check) +{ + unsigned int cls, con, tag, len; + unsigned char *end; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI) + return 0; + + if (!asn1_oid_decode(ctx, end, &trap->id, &trap->id_len)) + return 0; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + goto err_id_free; + + if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_IPA) || + (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_OTS))) + goto err_id_free; + + if (!asn1_octets_decode(ctx, end, (unsigned char **)&trap->ip_address, &len)) + goto err_id_free; + + /* IPv4 only */ + if (len != 4) + goto err_addr_free; + + mangle_address(ctx->begin, ctx->pointer - 4, map, check); + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + goto err_addr_free; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + goto err_addr_free; + + if (!asn1_uint_decode(ctx, end, &trap->general)) + goto err_addr_free; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + goto err_addr_free; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + goto err_addr_free; + + if (!asn1_uint_decode(ctx, end, &trap->specific)) + goto err_addr_free; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + goto err_addr_free; + + if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_TIT) || + (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_INT))) + goto err_addr_free; + + if (!asn1_ulong_decode(ctx, end, &trap->time)) + goto err_addr_free; + + return 1; + +err_addr_free: + kfree((unsigned long *)trap->ip_address); + +err_id_free: + kfree(trap->id); + + return 
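/*
 * A standalone sketch (guarded out, never built) of the arithmetic that
 * fast_csum() and mangle_address() above depend on: rewrite one
 * even-offset octet and patch the 16-bit one's-complement checksum
 * incrementally instead of recomputing it.  csum16(), the sample buffer
 * and the replacement octet 192 are invented for illustration, and only
 * the even-offset case of fast_csum() is mirrored.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

/* Fold a buffer into a 16-bit one's-complement checksum. */
static uint16_t csum16(const unsigned char *buf, int len)
{
	uint32_t sum = 0;
	int i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (buf[i] << 8) | buf[i + 1];
	if (len & 1)
		sum += buf[len - 1] << 8;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	unsigned char pkt[8] = { 10, 1, 2, 3, 99, 88, 77, 66 };
	uint16_t before = csum16(pkt, sizeof(pkt));
	unsigned char from = pkt[0], to = 192;
	uint16_t recomputed, patched;
	uint32_t sum;

	pkt[0] = to;				/* the first-octet remap */
	recomputed = csum16(pkt, sizeof(pkt));	/* the slow way */

	/* The fast way: HC' = ~(~HC + ~m + m'), per RFC 1624. */
	sum  = (uint16_t)~before;
	sum += (uint16_t)~((from << 8) | pkt[1]);
	sum += (to << 8) | pkt[1];
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	patched = (uint16_t)~sum;

	/* Both come out as 0x8d60 for this buffer. */
	printf("recomputed %04x, patched %04x\n",
	       (unsigned int)recomputed, (unsigned int)patched);
	return 0;
}
#endif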
0; +} + +/***************************************************************************** + * + * Misc. routines + * + *****************************************************************************/ + +static void hex_dump(const unsigned char *buf, size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) { + if (i && !(i % 16)) + printk("\n"); + printk("%02x ", *(buf + i)); + } + printk("\n"); +} + +/* + * Parse and mangle SNMP message according to mapping. + * (And this is the fucking 'basic' method). + */ +static int snmp_parse_mangle(unsigned char *msg, + u_int16_t len, + const struct oct1_map *map, + __sum16 *check) +{ + unsigned char *eoc, *end; + unsigned int cls, con, tag, vers, pdutype; + struct asn1_ctx ctx; + struct asn1_octstr comm; + struct snmp_object *obj; + + if (debug > 1) + hex_dump(msg, len); + + asn1_open(&ctx, msg, len); + + /* + * Start of SNMP message. + */ + if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag)) + return 0; + if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) + return 0; + + /* + * Version 1 or 2 handled. + */ + if (!asn1_header_decode(&ctx, &end, &cls, &con, &tag)) + return 0; + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + return 0; + if (!asn1_uint_decode (&ctx, end, &vers)) + return 0; + if (debug > 1) + printk(KERN_DEBUG "bsalg: snmp version: %u\n", vers + 1); + if (vers > 1) + return 1; + + /* + * Community. + */ + if (!asn1_header_decode (&ctx, &end, &cls, &con, &tag)) + return 0; + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OTS) + return 0; + if (!asn1_octets_decode(&ctx, end, &comm.data, &comm.len)) + return 0; + if (debug > 1) { + unsigned int i; + + printk(KERN_DEBUG "bsalg: community: "); + for (i = 0; i < comm.len; i++) + printk("%c", comm.data[i]); + printk("\n"); + } + kfree(comm.data); + + /* + * PDU type + */ + if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &pdutype)) + return 0; + if (cls != ASN1_CTX || con != ASN1_CON) + return 0; + if (debug > 1) { + static const unsigned char *const pdus[] = { + [SNMP_PDU_GET] = "get", + [SNMP_PDU_NEXT] = "get-next", + [SNMP_PDU_RESPONSE] = "response", + [SNMP_PDU_SET] = "set", + [SNMP_PDU_TRAP1] = "trapv1", + [SNMP_PDU_BULK] = "bulk", + [SNMP_PDU_INFORM] = "inform", + [SNMP_PDU_TRAP2] = "trapv2" + }; + + if (pdutype > SNMP_PDU_TRAP2) + printk(KERN_DEBUG "bsalg: bad pdu type %u\n", pdutype); + else + printk(KERN_DEBUG "bsalg: pdu: %s\n", pdus[pdutype]); + } + if (pdutype != SNMP_PDU_RESPONSE && + pdutype != SNMP_PDU_TRAP1 && pdutype != SNMP_PDU_TRAP2) + return 1; + + /* + * Request header or v1 trap + */ + if (pdutype == SNMP_PDU_TRAP1) { + struct snmp_v1_trap trap; + unsigned char ret = snmp_trap_decode(&ctx, &trap, map, check); + + if (ret) { + kfree(trap.id); + kfree((unsigned long *)trap.ip_address); + } else + return ret; + + } else { + struct snmp_request req; + + if (!snmp_request_decode(&ctx, &req)) + return 0; + + if (debug > 1) + printk(KERN_DEBUG "bsalg: request: id=0x%lx error_status=%u " + "error_index=%u\n", req.id, req.error_status, + req.error_index); + } + + /* + * Loop through objects, look for IP addresses to mangle. 
+ */ + if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) + return 0; + + while (!asn1_eoc_decode(&ctx, eoc)) { + unsigned int i; + + if (!snmp_object_decode(&ctx, &obj)) { + if (obj) { + kfree(obj->id); + kfree(obj); + } + return 0; + } + + if (debug > 1) { + printk(KERN_DEBUG "bsalg: object: "); + for (i = 0; i < obj->id_len; i++) { + if (i > 0) + printk("."); + printk("%lu", obj->id[i]); + } + printk(": type=%u\n", obj->type); + + } + + if (obj->type == SNMP_IPADDR) + mangle_address(ctx.begin, ctx.pointer - 4 , map, check); + + kfree(obj->id); + kfree(obj); + } + + if (!asn1_eoc_decode(&ctx, eoc)) + return 0; + + return 1; +} + +/***************************************************************************** + * + * NAT routines. + * + *****************************************************************************/ + +/* + * SNMP translation routine. + */ +static int snmp_translate(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + struct sk_buff *skb) +{ + struct iphdr *iph = ip_hdr(skb); + struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl); + u_int16_t udplen = ntohs(udph->len); + u_int16_t paylen = udplen - sizeof(struct udphdr); + int dir = CTINFO2DIR(ctinfo); + struct oct1_map map; + + /* + * Determine mappping for application layer addresses based + * on NAT manipulations for the packet. + */ + if (dir == IP_CT_DIR_ORIGINAL) { + /* SNAT traps */ + map.from = NOCT1(&ct->tuplehash[dir].tuple.src.u3.ip); + map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip); + } else { + /* DNAT replies */ + map.from = NOCT1(&ct->tuplehash[dir].tuple.src.u3.ip); + map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip); + } + + if (map.from == map.to) + return NF_ACCEPT; + + if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr), + paylen, &map, &udph->check)) { + if (net_ratelimit()) + printk(KERN_WARNING "bsalg: parser failed\n"); + return NF_DROP; + } + return NF_ACCEPT; +} + +/* We don't actually set up expectations, just adjust internal IP + * addresses if this is being NATted */ +static int help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + int dir = CTINFO2DIR(ctinfo); + unsigned int ret; + const struct iphdr *iph = ip_hdr(skb); + const struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl); + + /* SNMP replies and originating SNMP traps get mangled */ + if (udph->source == htons(SNMP_PORT) && dir != IP_CT_DIR_REPLY) + return NF_ACCEPT; + if (udph->dest == htons(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL) + return NF_ACCEPT; + + /* No NAT? */ + if (!(ct->status & IPS_NAT_MASK)) + return NF_ACCEPT; + + /* + * Make sure the packet length is ok. So far, we were only guaranteed + * to have a valid length IP header plus 8 bytes, which means we have + * enough room for a UDP header. Just verify the UDP length field so we + * can mess around with the payload. 
+ */ + if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) { + if (net_ratelimit()) + printk(KERN_WARNING "SNMP: dropping malformed packet src=%pI4 dst=%pI4\n", + &iph->saddr, &iph->daddr); + return NF_DROP; + } + + if (!skb_make_writable(skb, skb->len)) + return NF_DROP; + + spin_lock_bh(&snmp_lock); + ret = snmp_translate(ct, ctinfo, skb); + spin_unlock_bh(&snmp_lock); + return ret; +} + +static const struct nf_conntrack_expect_policy snmp_exp_policy = { + .max_expected = 0, + .timeout = 180, +}; + +static struct nf_conntrack_helper snmp_helper __read_mostly = { + .me = THIS_MODULE, + .help = help, + .expect_policy = &snmp_exp_policy, + .name = "snmp", + .tuple.src.l3num = AF_INET, + .tuple.src.u.udp.port = cpu_to_be16(SNMP_PORT), + .tuple.dst.protonum = IPPROTO_UDP, +}; + +static struct nf_conntrack_helper snmp_trap_helper __read_mostly = { + .me = THIS_MODULE, + .help = help, + .expect_policy = &snmp_exp_policy, + .name = "snmp_trap", + .tuple.src.l3num = AF_INET, + .tuple.src.u.udp.port = cpu_to_be16(SNMP_TRAP_PORT), + .tuple.dst.protonum = IPPROTO_UDP, +}; + +/***************************************************************************** + * + * Module stuff. + * + *****************************************************************************/ + +static int __init nf_nat_snmp_basic_init(void) +{ + int ret = 0; + + BUG_ON(nf_nat_snmp_hook != NULL); + rcu_assign_pointer(nf_nat_snmp_hook, help); + + ret = nf_conntrack_helper_register(&snmp_trap_helper); + if (ret < 0) { + nf_conntrack_helper_unregister(&snmp_helper); + return ret; + } + return ret; +} + +static void __exit nf_nat_snmp_basic_fini(void) +{ + rcu_assign_pointer(nf_nat_snmp_hook, NULL); + nf_conntrack_helper_unregister(&snmp_trap_helper); +} + +module_init(nf_nat_snmp_basic_init); +module_exit(nf_nat_snmp_basic_fini); + +module_param(debug, int, 0600); diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c new file mode 100644 index 00000000..483b76d0 --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_standalone.c @@ -0,0 +1,326 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_XFRM +static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) +{ + struct flowi4 *fl4 = &fl->u.ip4; + const struct nf_conn *ct; + const struct nf_conntrack_tuple *t; + enum ip_conntrack_info ctinfo; + enum ip_conntrack_dir dir; + unsigned long statusbit; + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return; + dir = CTINFO2DIR(ctinfo); + t = &ct->tuplehash[dir].tuple; + + if (dir == IP_CT_DIR_ORIGINAL) + statusbit = IPS_DST_NAT; + else + statusbit = IPS_SRC_NAT; + + if (ct->status & statusbit) { + fl4->daddr = t->dst.u3.ip; + if (t->dst.protonum == IPPROTO_TCP || + t->dst.protonum == IPPROTO_UDP || + t->dst.protonum == IPPROTO_UDPLITE || + t->dst.protonum == IPPROTO_DCCP || + t->dst.protonum == IPPROTO_SCTP) + fl4->fl4_dport = t->dst.u.tcp.port; + } + + statusbit ^= IPS_NAT_MASK; + + if (ct->status & statusbit) { + fl4->saddr = t->src.u3.ip; + if (t->dst.protonum == IPPROTO_TCP || + t->dst.protonum == IPPROTO_UDP || + t->dst.protonum == IPPROTO_UDPLITE || + t->dst.protonum == IPPROTO_DCCP || + t->dst.protonum == IPPROTO_SCTP) + fl4->fl4_sport = t->src.u.tcp.port; + } +} +#endif + +static unsigned int +nf_nat_fn(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct nf_conn_nat *nat; + /* maniptype == SRC for postrouting. */ + enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum); + + /* We never see fragments: conntrack defrags on pre-routing + and local-out, and nf_nat_out protects post-routing. */ + NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET))); + + ct = nf_ct_get(skb, &ctinfo); + /* Can't track? It's not due to stress, or conntrack would + have dropped it. Hence it's the user's responsibilty to + packet filter it out, or implement conntrack/NAT for that + protocol. 8) --RR */ + if (!ct) + return NF_ACCEPT; + + /* Don't try to NAT if this packet is not conntracked */ + if (nf_ct_is_untracked(ct)) + return NF_ACCEPT; + + nat = nfct_nat(ct); + if (!nat) { + /* NAT module was loaded late. */ + if (nf_ct_is_confirmed(ct)) + return NF_ACCEPT; + nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); + if (nat == NULL) { + pr_debug("failed to add NAT extension\n"); + return NF_ACCEPT; + } + } + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { + if (!nf_nat_icmp_reply_translation(ct, ctinfo, + hooknum, skb)) + return NF_DROP; + else + return NF_ACCEPT; + } + /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ + case IP_CT_NEW: + + /* Seen it before? This can happen for loopback, retrans, + or local packets.. */ + if (!nf_nat_initialized(ct, maniptype)) { + unsigned int ret; + + ret = nf_nat_rule_find(skb, hooknum, in, out, ct); + if (ret != NF_ACCEPT) + return ret; + } else + pr_debug("Already setup manip %s for ct %p\n", + maniptype == IP_NAT_MANIP_SRC ? 
"SRC" : "DST", + ct); + break; + + default: + /* ESTABLISHED */ + NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || + ctinfo == IP_CT_ESTABLISHED_REPLY); + } + + return nf_nat_packet(ct, ctinfo, hooknum, skb); +} + +static unsigned int +nf_nat_in(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int ret; + __be32 daddr = ip_hdr(skb)->daddr; + + ret = nf_nat_fn(hooknum, skb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN && + daddr != ip_hdr(skb)->daddr) + skb_dst_drop(skb); + + return ret; +} + +static unsigned int +nf_nat_out(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ +#ifdef CONFIG_XFRM + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; +#endif + unsigned int ret; + + /* root is playing with raw sockets. */ + if (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr)) + return NF_ACCEPT; + + ret = nf_nat_fn(hooknum, skb, in, out, okfn); +#ifdef CONFIG_XFRM + if (ret != NF_DROP && ret != NF_STOLEN && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if ((ct->tuplehash[dir].tuple.src.u3.ip != + ct->tuplehash[!dir].tuple.dst.u3.ip) || + (ct->tuplehash[dir].tuple.src.u.all != + ct->tuplehash[!dir].tuple.dst.u.all) + ) + return ip_xfrm_me_harder(skb) == 0 ? ret : NF_DROP; + } +#endif + return ret; +} + +static unsigned int +nf_nat_local_fn(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned int ret; + + /* root is playing with raw sockets. */ + if (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr)) + return NF_ACCEPT; + + ret = nf_nat_fn(hooknum, skb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (ct->tuplehash[dir].tuple.dst.u3.ip != + ct->tuplehash[!dir].tuple.src.u3.ip) { + if (ip_route_me_harder(skb, RTN_UNSPEC)) + ret = NF_DROP; + } +#ifdef CONFIG_XFRM + else if (ct->tuplehash[dir].tuple.dst.u.all != + ct->tuplehash[!dir].tuple.src.u.all) + if (ip_xfrm_me_harder(skb)) + ret = NF_DROP; +#endif + } + return ret; +} + +/* We must be after connection tracking and before packet filtering. 
*/ + +static struct nf_hook_ops nf_nat_ops[] __read_mostly = { + /* Before packet filtering, change destination */ + { + .hook = nf_nat_in, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP_PRI_NAT_DST, + }, + /* After packet filtering, change source */ + { + .hook = nf_nat_out, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_NAT_SRC, + }, + /* Before packet filtering, change destination */ + { + .hook = nf_nat_local_fn, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP_PRI_NAT_DST, + }, + /* After packet filtering, change source */ + { + .hook = nf_nat_fn, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_NAT_SRC, + }, +}; + +static int __init nf_nat_standalone_init(void) +{ + int ret = 0; + + need_ipv4_conntrack(); + +#ifdef CONFIG_XFRM + BUG_ON(ip_nat_decode_session != NULL); + rcu_assign_pointer(ip_nat_decode_session, nat_decode_session); +#endif + ret = nf_nat_rule_init(); + if (ret < 0) { + pr_err("nf_nat_init: can't setup rules.\n"); + goto cleanup_decode_session; + } + ret = nf_register_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); + if (ret < 0) { + pr_err("nf_nat_init: can't register hooks.\n"); + goto cleanup_rule_init; + } + return ret; + + cleanup_rule_init: + nf_nat_rule_cleanup(); + cleanup_decode_session: +#ifdef CONFIG_XFRM + rcu_assign_pointer(ip_nat_decode_session, NULL); + synchronize_net(); +#endif + return ret; +} + +static void __exit nf_nat_standalone_fini(void) +{ + nf_unregister_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); + nf_nat_rule_cleanup(); +#ifdef CONFIG_XFRM + rcu_assign_pointer(ip_nat_decode_session, NULL); + synchronize_net(); +#endif + /* Conntrack caches are unregistered in nf_conntrack_cleanup */ +} + +module_init(nf_nat_standalone_init); +module_exit(nf_nat_standalone_fini); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ip_nat"); diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c new file mode 100644 index 00000000..7274a43c --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_tftp.c @@ -0,0 +1,51 @@ +/* (C) 2001-2002 Magnus Boden + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Magnus Boden "); +MODULE_DESCRIPTION("TFTP NAT helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ip_nat_tftp"); + +static unsigned int help(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + struct nf_conntrack_expect *exp) +{ + const struct nf_conn *ct = exp->master; + + exp->saved_proto.udp.port + = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; + exp->dir = IP_CT_DIR_REPLY; + exp->expectfn = nf_nat_follow_master; + if (nf_ct_expect_related(exp) != 0) + return NF_DROP; + return NF_ACCEPT; +} + +static void __exit nf_nat_tftp_fini(void) +{ + rcu_assign_pointer(nf_nat_tftp_hook, NULL); + synchronize_rcu(); +} + +static int __init nf_nat_tftp_init(void) +{ + BUG_ON(nf_nat_tftp_hook != NULL); + rcu_assign_pointer(nf_nat_tftp_hook, help); + return 0; +} + +module_init(nf_nat_tftp_init); +module_exit(nf_nat_tftp_fini); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c new file mode 100644 index 00000000..39b403f8 --- /dev/null +++ b/net/ipv4/ping.c @@ -0,0 +1,931 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * "Ping" sockets + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Based on ipv4/udp.c code. + * + * Authors: Vasiliy Kulikov / Openwall (for Linux 2.6), + * Pavel Kankovsky (for Linux 2.4.32) + * + * Pavel gave all rights to bugs to Vasiliy, + * none of the bugs are Pavel's now. 
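 *
 * These are unprivileged ICMP_ECHO datagram sockets: the echo ident
 * plays the role a port number plays for UDP (see ping_v4_get_port()
 * and ping_v4_lookup() below), and access is restricted to the group
 * range in the ping_group_range sysctl checked by ping_init_sock().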
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +static struct ping_table ping_table; + +static u16 ping_port_rover; + +static inline int ping_hashfn(struct net *net, unsigned num, unsigned mask) +{ + int res = (num + net_hash_mix(net)) & mask; + pr_debug("hash(%d) = %d\n", num, res); + return res; +} + +static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table, + struct net *net, unsigned num) +{ + return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)]; +} + +static int ping_v4_get_port(struct sock *sk, unsigned short ident) +{ + struct hlist_nulls_node *node; + struct hlist_nulls_head *hlist; + struct inet_sock *isk, *isk2; + struct sock *sk2 = NULL; + + isk = inet_sk(sk); + write_lock_bh(&ping_table.lock); + if (ident == 0) { + u32 i; + u16 result = ping_port_rover + 1; + + for (i = 0; i < (1L << 16); i++, result++) { + if (!result) + result++; /* avoid zero */ + hlist = ping_hashslot(&ping_table, sock_net(sk), + result); + ping_portaddr_for_each_entry(sk2, node, hlist) { + isk2 = inet_sk(sk2); + + if (isk2->inet_num == result) + goto next_port; + } + + /* found */ + ping_port_rover = ident = result; + break; +next_port: + ; + } + if (i >= (1L << 16)) + goto fail; + } else { + hlist = ping_hashslot(&ping_table, sock_net(sk), ident); + ping_portaddr_for_each_entry(sk2, node, hlist) { + isk2 = inet_sk(sk2); + + if ((isk2->inet_num == ident) && + (sk2 != sk) && + (!sk2->sk_reuse || !sk->sk_reuse)) + goto fail; + } + } + + pr_debug("found port/ident = %d\n", ident); + isk->inet_num = ident; + if (sk_unhashed(sk)) { + pr_debug("was not hashed\n"); + sock_hold(sk); + hlist_nulls_add_head(&sk->sk_nulls_node, hlist); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + } + write_unlock_bh(&ping_table.lock); + return 0; + +fail: + write_unlock_bh(&ping_table.lock); + return 1; +} + +static void ping_v4_hash(struct sock *sk) +{ + pr_debug("ping_v4_hash(sk->port=%u)\n", inet_sk(sk)->inet_num); + BUG(); /* "Please do not press this button again." 
*/ +} + +static void ping_v4_unhash(struct sock *sk) +{ + struct inet_sock *isk = inet_sk(sk); + pr_debug("ping_v4_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num); + if (sk_hashed(sk)) { + write_lock_bh(&ping_table.lock); + hlist_nulls_del(&sk->sk_nulls_node); + sock_put(sk); + isk->inet_num = isk->inet_sport = 0; + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + write_unlock_bh(&ping_table.lock); + } +} + +static struct sock *ping_v4_lookup(struct net *net, u32 saddr, u32 daddr, + u16 ident, int dif) +{ + struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident); + struct sock *sk = NULL; + struct inet_sock *isk; + struct hlist_nulls_node *hnode; + + pr_debug("try to find: num = %d, daddr = %ld, dif = %d\n", + (int)ident, (unsigned long)daddr, dif); + read_lock_bh(&ping_table.lock); + + ping_portaddr_for_each_entry(sk, hnode, hslot) { + isk = inet_sk(sk); + + pr_debug("found: %p: num = %d, daddr = %ld, dif = %d\n", sk, + (int)isk->inet_num, (unsigned long)isk->inet_rcv_saddr, + sk->sk_bound_dev_if); + + pr_debug("iterate\n"); + if (isk->inet_num != ident) + continue; + if (isk->inet_rcv_saddr && isk->inet_rcv_saddr != daddr) + continue; + if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) + continue; + + sock_hold(sk); + goto exit; + } + + sk = NULL; +exit: + read_unlock_bh(&ping_table.lock); + + return sk; +} + +static void inet_get_ping_group_range_net(struct net *net, gid_t *low, + gid_t *high) +{ + gid_t *data = net->ipv4.sysctl_ping_group_range; + unsigned seq; + do { + seq = read_seqbegin(&sysctl_local_ports.lock); + + *low = data[0]; + *high = data[1]; + } while (read_seqretry(&sysctl_local_ports.lock, seq)); +} + + +static int ping_init_sock(struct sock *sk) +{ + struct net *net = sock_net(sk); + gid_t group = current_egid(); + gid_t range[2]; + struct group_info *group_info = get_current_groups(); + int i, j, count = group_info->ngroups; + + inet_get_ping_group_range_net(net, range, range+1); + if (range[0] <= group && group <= range[1]) + return 0; + + for (i = 0; i < group_info->nblocks; i++) { + int cp_count = min_t(int, NGROUPS_PER_BLOCK, count); + + for (j = 0; j < cp_count; j++) { + group = group_info->blocks[i][j]; + if (range[0] <= group && group <= range[1]) + return 0; + } + + count -= cp_count; + } + + return -EACCES; +} + +static void ping_close(struct sock *sk, long timeout) +{ + pr_debug("ping_close(sk=%p,sk->num=%u)\n", + inet_sk(sk), inet_sk(sk)->inet_num); + pr_debug("isk->refcnt = %d\n", sk->sk_refcnt.counter); + + sk_common_release(sk); +} + +/* + * We need our own bind because there are no privileged id's == local ports. + * Moreover, we don't allow binding to multi- and broadcast addresses. 
+ */ + +static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; + struct inet_sock *isk = inet_sk(sk); + unsigned short snum; + int chk_addr_ret; + int err; + + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + pr_debug("ping_v4_bind(sk=%p,sa_addr=%08x,sa_port=%d)\n", + sk, addr->sin_addr.s_addr, ntohs(addr->sin_port)); + + chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); + if (addr->sin_addr.s_addr == INADDR_ANY) + chk_addr_ret = RTN_LOCAL; + + if ((sysctl_ip_nonlocal_bind == 0 && + isk->freebind == 0 && isk->transparent == 0 && + chk_addr_ret != RTN_LOCAL) || + chk_addr_ret == RTN_MULTICAST || + chk_addr_ret == RTN_BROADCAST) + return -EADDRNOTAVAIL; + + lock_sock(sk); + + err = -EINVAL; + if (isk->inet_num != 0) + goto out; + + err = -EADDRINUSE; + isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr; + snum = ntohs(addr->sin_port); + if (ping_v4_get_port(sk, snum) != 0) { + isk->inet_saddr = isk->inet_rcv_saddr = 0; + goto out; + } + + pr_debug("after bind(): num = %d, daddr = %ld, dif = %d\n", + (int)isk->inet_num, + (unsigned long) isk->inet_rcv_saddr, + (int)sk->sk_bound_dev_if); + + err = 0; + if (isk->inet_rcv_saddr) + sk->sk_userlocks |= SOCK_BINDADDR_LOCK; + if (snum) + sk->sk_userlocks |= SOCK_BINDPORT_LOCK; + isk->inet_sport = htons(isk->inet_num); + isk->inet_daddr = 0; + isk->inet_dport = 0; + sk_dst_reset(sk); +out: + release_sock(sk); + pr_debug("ping_v4_bind -> %d\n", err); + return err; +} + +/* + * Is this a supported type of ICMP message? + */ + +static inline int ping_supported(int type, int code) +{ + if (type == ICMP_ECHO && code == 0) + return 1; + return 0; +} + +/* + * This routine is called by the ICMP module when it gets some + * sort of error condition. + */ + +static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); + +void ping_err(struct sk_buff *skb, u32 info) +{ + struct iphdr *iph = (struct iphdr *)skb->data; + struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2)); + struct inet_sock *inet_sock; + int type = icmph->type; + int code = icmph->code; + struct net *net = dev_net(skb->dev); + struct sock *sk; + int harderr; + int err; + + /* We assume the packet has already been checked by icmp_unreach */ + + if (!ping_supported(icmph->type, icmph->code)) + return; + + pr_debug("ping_err(type=%04x,code=%04x,id=%04x,seq=%04x)\n", type, + code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); + + sk = ping_v4_lookup(net, iph->daddr, iph->saddr, + ntohs(icmph->un.echo.id), skb->dev->ifindex); + if (sk == NULL) { + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + pr_debug("no socket, dropping\n"); + return; /* No socket for error */ + } + pr_debug("err on socket %p\n", sk); + + err = 0; + harderr = 0; + inet_sock = inet_sk(sk); + + switch (type) { + default: + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + case ICMP_SOURCE_QUENCH: + /* This is not a real error but ping wants to see it. + * Report it with some fake errno. 
*/ + err = EREMOTEIO; + break; + case ICMP_PARAMETERPROB: + err = EPROTO; + harderr = 1; + break; + case ICMP_DEST_UNREACH: + if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ + if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) { + err = EMSGSIZE; + harderr = 1; + break; + } + goto out; + } + err = EHOSTUNREACH; + if (code <= NR_ICMP_UNREACH) { + harderr = icmp_err_convert[code].fatal; + err = icmp_err_convert[code].errno; + } + break; + case ICMP_REDIRECT: + /* See ICMP_SOURCE_QUENCH */ + err = EREMOTEIO; + break; + } + + /* + * RFC1122: OK. Passes ICMP errors back to application, as per + * 4.1.3.3. + */ + if (!inet_sock->recverr) { + if (!harderr || sk->sk_state != TCP_ESTABLISHED) + goto out; + } else { + ip_icmp_error(sk, skb, err, 0 /* no remote port */, + info, (u8 *)icmph); + } + sk->sk_err = err; + sk->sk_error_report(sk); +out: + sock_put(sk); +} + +/* + * Copy and checksum an ICMP Echo packet from user space into a buffer. + */ + +struct pingfakehdr { + struct icmphdr icmph; + struct iovec *iov; + u32 wcheck; +}; + +static int ping_getfrag(void *from, char * to, + int offset, int fraglen, int odd, struct sk_buff *skb) +{ + struct pingfakehdr *pfh = (struct pingfakehdr *)from; + + if (offset == 0) { + if (fraglen < sizeof(struct icmphdr)) + BUG(); + if (csum_partial_copy_fromiovecend(to + sizeof(struct icmphdr), + pfh->iov, 0, fraglen - sizeof(struct icmphdr), + &pfh->wcheck)) + return -EFAULT; + + return 0; + } + if (offset < sizeof(struct icmphdr)) + BUG(); + if (csum_partial_copy_fromiovecend + (to, pfh->iov, offset - sizeof(struct icmphdr), + fraglen, &pfh->wcheck)) + return -EFAULT; + return 0; +} + +static int ping_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, + struct flowi4 *fl4) +{ + struct sk_buff *skb = skb_peek(&sk->sk_write_queue); + + pfh->wcheck = csum_partial((char *)&pfh->icmph, + sizeof(struct icmphdr), pfh->wcheck); + pfh->icmph.checksum = csum_fold(pfh->wcheck); + memcpy(icmp_hdr(skb), &pfh->icmph, sizeof(struct icmphdr)); + skb->ip_summed = CHECKSUM_NONE; + return ip_push_pending_frames(sk, fl4); +} + +static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len) +{ + struct net *net = sock_net(sk); + struct flowi4 fl4; + struct inet_sock *inet = inet_sk(sk); + struct ipcm_cookie ipc; + struct icmphdr user_icmph; + struct pingfakehdr pfh; + struct rtable *rt = NULL; + struct ip_options_data opt_copy; + int free = 0; + u32 saddr, daddr, faddr; + u8 tos; + int err; + + pr_debug("ping_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); + + + if (len > 0xFFFF) + return -EMSGSIZE; + + /* + * Check the flags. + */ + + /* Mirror BSD error message compatibility */ + if (msg->msg_flags & MSG_OOB) + return -EOPNOTSUPP; + + /* + * Fetch the ICMP header provided by the userland. + * iovec is modified! + */ + + if (memcpy_fromiovec((u8 *)&user_icmph, msg->msg_iov, + sizeof(struct icmphdr))) + return -EFAULT; + if (!ping_supported(user_icmph.type, user_icmph.code)) + return -EINVAL; + + /* + * Get and verify the address. 
+ */ + + if (msg->msg_name) { + struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; + if (msg->msg_namelen < sizeof(*usin)) + return -EINVAL; + if (usin->sin_family != AF_INET) + return -EINVAL; + daddr = usin->sin_addr.s_addr; + /* no remote port */ + } else { + if (sk->sk_state != TCP_ESTABLISHED) + return -EDESTADDRREQ; + daddr = inet->inet_daddr; + /* no remote port */ + } + + ipc.addr = inet->inet_saddr; + ipc.opt = NULL; + ipc.oif = sk->sk_bound_dev_if; + ipc.tx_flags = 0; + err = sock_tx_timestamp(sk, &ipc.tx_flags); + if (err) + return err; + + if (msg->msg_controllen) { + err = ip_cmsg_send(sock_net(sk), msg, &ipc); + if (err) + return err; + if (ipc.opt) + free = 1; + } + if (!ipc.opt) { + struct ip_options_rcu *inet_opt; + + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + if (inet_opt) { + memcpy(&opt_copy, inet_opt, + sizeof(*inet_opt) + inet_opt->opt.optlen); + ipc.opt = &opt_copy.opt; + } + rcu_read_unlock(); + } + + saddr = ipc.addr; + ipc.addr = faddr = daddr; + + if (ipc.opt && ipc.opt->opt.srr) { + if (!daddr) + return -EINVAL; + faddr = ipc.opt->opt.faddr; + } + tos = RT_TOS(inet->tos); + if (sock_flag(sk, SOCK_LOCALROUTE) || + (msg->msg_flags & MSG_DONTROUTE) || + (ipc.opt && ipc.opt->opt.is_strictroute)) { + tos |= RTO_ONLINK; + } + + if (ipv4_is_multicast(daddr)) { + if (!ipc.oif) + ipc.oif = inet->mc_index; + if (!saddr) + saddr = inet->mc_addr; + } + + flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, + RT_SCOPE_UNIVERSE, sk->sk_protocol, + inet_sk_flowi_flags(sk), faddr, saddr, 0, 0); + + security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); + rt = ip_route_output_flow(net, &fl4, sk); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; + if (err == -ENETUNREACH) + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); + goto out; + } + + err = -EACCES; + if ((rt->rt_flags & RTCF_BROADCAST) && + !sock_flag(sk, SOCK_BROADCAST)) + goto out; + + if (msg->msg_flags & MSG_CONFIRM) + goto do_confirm; +back_from_confirm: + + if (!ipc.addr) + ipc.addr = fl4.daddr; + + lock_sock(sk); + + pfh.icmph.type = user_icmph.type; /* already checked */ + pfh.icmph.code = user_icmph.code; /* ditto */ + pfh.icmph.checksum = 0; + pfh.icmph.un.echo.id = inet->inet_sport; + pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence; + pfh.iov = msg->msg_iov; + pfh.wcheck = 0; + + err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len, + 0, &ipc, &rt, msg->msg_flags); + if (err) + ip_flush_pending_frames(sk); + else + err = ping_push_pending_frames(sk, &pfh, &fl4); + release_sock(sk); + +out: + ip_rt_put(rt); + if (free) + kfree(ipc.opt); + if (!err) { + icmp_out_count(sock_net(sk), user_icmph.type); + return len; + } + return err; + +do_confirm: + dst_confirm(&rt->dst); + if (!(msg->msg_flags & MSG_PROBE) || len) + goto back_from_confirm; + err = 0; + goto out; +} + +static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int noblock, int flags, int *addr_len) +{ + struct inet_sock *isk = inet_sk(sk); + struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + struct sk_buff *skb; + int copied, err; + + pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num); + + if (flags & MSG_OOB) + goto out; + + if (addr_len) + *addr_len = sizeof(*sin); + + if (flags & MSG_ERRQUEUE) + return ip_recv_error(sk, msg, len); + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + + /* Don't bother checking the 
checksum */ + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto done; + + sock_recv_timestamp(msg, sk, skb); + + /* Copy the address. */ + if (sin) { + sin->sin_family = AF_INET; + sin->sin_port = 0 /* skb->h.uh->source */; + sin->sin_addr.s_addr = ip_hdr(skb)->saddr; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + } + if (isk->cmsg_flags) + ip_cmsg_recv(msg, skb); + err = copied; + +done: + skb_free_datagram(sk, skb); +out: + pr_debug("ping_recvmsg -> %d\n", err); + return err; +} + +static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n", + inet_sk(sk), inet_sk(sk)->inet_num, skb); + if (sock_queue_rcv_skb(sk, skb) < 0) { + ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_INERRORS); + kfree_skb(skb); + pr_debug("ping_queue_rcv_skb -> failed\n"); + return -1; + } + return 0; +} + + +/* + * All we need to do is get the socket. + */ + +void ping_rcv(struct sk_buff *skb) +{ + struct sock *sk; + struct net *net = dev_net(skb->dev); + struct iphdr *iph = ip_hdr(skb); + struct icmphdr *icmph = icmp_hdr(skb); + u32 saddr = iph->saddr; + u32 daddr = iph->daddr; + + /* We assume the packet has already been checked by icmp_rcv */ + + pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n", + skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); + + /* Push ICMP header back */ + skb_push(skb, skb->data - (u8 *)icmph); + + sk = ping_v4_lookup(net, saddr, daddr, ntohs(icmph->un.echo.id), + skb->dev->ifindex); + if (sk != NULL) { + pr_debug("rcv on socket %p\n", sk); + ping_queue_rcv_skb(sk, skb_get(skb)); + sock_put(sk); + return; + } + pr_debug("no socket, dropping\n"); + + /* We're called from icmp_rcv(). kfree_skb() is done there. */ +} + +struct proto ping_prot = { + .name = "PING", + .owner = THIS_MODULE, + .init = ping_init_sock, + .close = ping_close, + .connect = ip4_datagram_connect, + .disconnect = udp_disconnect, + .setsockopt = ip_setsockopt, + .getsockopt = ip_getsockopt, + .sendmsg = ping_sendmsg, + .recvmsg = ping_recvmsg, + .bind = ping_bind, + .backlog_rcv = ping_queue_rcv_skb, + .hash = ping_v4_hash, + .unhash = ping_v4_unhash, + .get_port = ping_v4_get_port, + .obj_size = sizeof(struct inet_sock), +}; +EXPORT_SYMBOL(ping_prot); + +#ifdef CONFIG_PROC_FS + +static struct sock *ping_get_first(struct seq_file *seq, int start) +{ + struct sock *sk; + struct ping_iter_state *state = seq->private; + struct net *net = seq_file_net(seq); + + for (state->bucket = start; state->bucket < PING_HTABLE_SIZE; + ++state->bucket) { + struct hlist_nulls_node *node; + struct hlist_nulls_head *hslot; + + hslot = &ping_table.hash[state->bucket]; + + if (hlist_nulls_empty(hslot)) + continue; + + sk_nulls_for_each(sk, node, hslot) { + if (net_eq(sock_net(sk), net)) + goto found; + } + } + sk = NULL; +found: + return sk; +} + +static struct sock *ping_get_next(struct seq_file *seq, struct sock *sk) +{ + struct ping_iter_state *state = seq->private; + struct net *net = seq_file_net(seq); + + do { + sk = sk_nulls_next(sk); + } while (sk && (!net_eq(sock_net(sk), net))); + + if (!sk) + return ping_get_first(seq, state->bucket + 1); + return sk; +} + +static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos) +{ + struct sock *sk = ping_get_first(seq, 0); + + if (sk) + while (pos && (sk = ping_get_next(seq, sk)) != NULL) + --pos; + return pos ? 
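/*
 * A standalone sketch (guarded out, never built) of the demultiplexing
 * decision ping_v4_lookup() makes above: match on the echo ident first,
 * then honour an optional bound address and bound device.  The
 * mini_ping_sk/mini_lookup names and the sample table are invented for
 * illustration.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

struct mini_ping_sk {
	uint16_t ident;		/* like isk->inet_num */
	uint32_t rcv_saddr;	/* 0 means "not bound", as with INADDR_ANY */
	int bound_dev;		/* 0 means "any device" */
};

static const struct mini_ping_sk *mini_lookup(const struct mini_ping_sk *tbl,
					      int n, uint16_t ident,
					      uint32_t daddr, int dif)
{
	int i;

	for (i = 0; i < n; i++) {
		if (tbl[i].ident != ident)
			continue;
		if (tbl[i].rcv_saddr && tbl[i].rcv_saddr != daddr)
			continue;
		if (tbl[i].bound_dev && tbl[i].bound_dev != dif)
			continue;
		return &tbl[i];		/* first acceptable match wins */
	}
	return NULL;
}

int main(void)
{
	static const struct mini_ping_sk socks[] = {
		{ .ident = 0x1234, .rcv_saddr = 0,          .bound_dev = 0 },
		{ .ident = 0x1234, .rcv_saddr = 0x7f000001, .bound_dev = 0 },
	};

	/* An echo reply with id 0x1234 hits the wildcard entry, just as
	 * the first acceptable socket wins in the hash-chain walk above. */
	printf("matched entry %ld\n",
	       (long)(mini_lookup(socks, 2, 0x1234, 0x7f000001, 3) - socks));
	return 0;
}
#endif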
NULL : sk; +} + +static void *ping_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct ping_iter_state *state = seq->private; + state->bucket = 0; + + read_lock_bh(&ping_table.lock); + + return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN; +} + +static void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock *sk; + + if (v == SEQ_START_TOKEN) + sk = ping_get_idx(seq, 0); + else + sk = ping_get_next(seq, v); + + ++*pos; + return sk; +} + +static void ping_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock_bh(&ping_table.lock); +} + +static void ping_format_sock(struct sock *sp, struct seq_file *f, + int bucket, int *len) +{ + struct inet_sock *inet = inet_sk(sp); + __be32 dest = inet->inet_daddr; + __be32 src = inet->inet_rcv_saddr; + __u16 destp = ntohs(inet->inet_dport); + __u16 srcp = ntohs(inet->inet_sport); + + seq_printf(f, "%5d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n", + bucket, src, srcp, dest, destp, sp->sk_state, + sk_wmem_alloc_get(sp), + sk_rmem_alloc_get(sp), + 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp, + atomic_read(&sp->sk_drops), len); +} + +static int ping_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, "%-127s\n", + " sl local_address rem_address st tx_queue " + "rx_queue tr tm->when retrnsmt uid timeout " + "inode ref pointer drops"); + else { + struct ping_iter_state *state = seq->private; + int len; + + ping_format_sock(v, seq, state->bucket, &len); + seq_printf(seq, "%*s\n", 127 - len, ""); + } + return 0; +} + +static const struct seq_operations ping_seq_ops = { + .show = ping_seq_show, + .start = ping_seq_start, + .next = ping_seq_next, + .stop = ping_seq_stop, +}; + +static int ping_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ping_seq_ops, + sizeof(struct ping_iter_state)); +} + +static const struct file_operations ping_seq_fops = { + .open = ping_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int ping_proc_register(struct net *net) +{ + struct proc_dir_entry *p; + int rc = 0; + + p = proc_net_fops_create(net, "icmp", S_IRUGO, &ping_seq_fops); + if (!p) + rc = -ENOMEM; + return rc; +} + +static void ping_proc_unregister(struct net *net) +{ + proc_net_remove(net, "icmp"); +} + + +static int __net_init ping_proc_init_net(struct net *net) +{ + return ping_proc_register(net); +} + +static void __net_exit ping_proc_exit_net(struct net *net) +{ + ping_proc_unregister(net); +} + +static struct pernet_operations ping_net_ops = { + .init = ping_proc_init_net, + .exit = ping_proc_exit_net, +}; + +int __init ping_proc_init(void) +{ + return register_pernet_subsys(&ping_net_ops); +} + +void ping_proc_exit(void) +{ + unregister_pernet_subsys(&ping_net_ops); +} + +#endif + +void __init ping_init(void) +{ + int i; + + for (i = 0; i < PING_HTABLE_SIZE; i++) + INIT_HLIST_NULLS_HEAD(&ping_table.hash[i], i); + rwlock_init(&ping_table.lock); +} diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c new file mode 100644 index 00000000..b14ec7d0 --- /dev/null +++ b/net/ipv4/proc.c @@ -0,0 +1,495 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * This file implements the various access functions for the + * PROC file system. It is mainly used for debugging and + * statistics. 
+ * + * Authors: Fred N. van Kempen, + * Gerald J. Heim, + * Fred Baumgarten, + * Erik Schoenfelder, + * + * Fixes: + * Alan Cox : UDP sockets show the rxqueue/txqueue + * using hint flag for the netinfo. + * Pauline Middelink : identd support + * Alan Cox : Make /proc safer. + * Erik Schoenfelder : /proc/net/snmp + * Alan Cox : Handle dead sockets properly. + * Gerhard Koerting : Show both timers + * Alan Cox : Allow inode to be NULL (kernel socket) + * Andi Kleen : Add support for open_requests and + * split functions for more readibility. + * Andi Kleen : Add support for /proc/net/netstat + * Arnaldo C. Melo : Convert to seq_file + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Report socket allocation statistics [mea@utu.fi] + */ +static int sockstat_seq_show(struct seq_file *seq, void *v) +{ + struct net *net = seq->private; + int orphans, sockets; + + local_bh_disable(); + orphans = percpu_counter_sum_positive(&tcp_orphan_count); + sockets = percpu_counter_sum_positive(&tcp_sockets_allocated); + local_bh_enable(); + + socket_seq_show(seq); + seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", + sock_prot_inuse_get(net, &tcp_prot), orphans, + tcp_death_row.tw_count, sockets, + atomic_long_read(&tcp_memory_allocated)); + seq_printf(seq, "UDP: inuse %d mem %ld\n", + sock_prot_inuse_get(net, &udp_prot), + atomic_long_read(&udp_memory_allocated)); + seq_printf(seq, "UDPLITE: inuse %d\n", + sock_prot_inuse_get(net, &udplite_prot)); + seq_printf(seq, "RAW: inuse %d\n", + sock_prot_inuse_get(net, &raw_prot)); + seq_printf(seq, "FRAG: inuse %d memory %d\n", + ip_frag_nqueues(net), ip_frag_mem(net)); + return 0; +} + +static int sockstat_seq_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, sockstat_seq_show); +} + +static const struct file_operations sockstat_seq_fops = { + .owner = THIS_MODULE, + .open = sockstat_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; + +/* snmp items */ +static const struct snmp_mib snmp4_ipstats_list[] = { + SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INPKTS), + SNMP_MIB_ITEM("InHdrErrors", IPSTATS_MIB_INHDRERRORS), + SNMP_MIB_ITEM("InAddrErrors", IPSTATS_MIB_INADDRERRORS), + SNMP_MIB_ITEM("ForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS), + SNMP_MIB_ITEM("InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS), + SNMP_MIB_ITEM("InDiscards", IPSTATS_MIB_INDISCARDS), + SNMP_MIB_ITEM("InDelivers", IPSTATS_MIB_INDELIVERS), + SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTPKTS), + SNMP_MIB_ITEM("OutDiscards", IPSTATS_MIB_OUTDISCARDS), + SNMP_MIB_ITEM("OutNoRoutes", IPSTATS_MIB_OUTNOROUTES), + SNMP_MIB_ITEM("ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT), + SNMP_MIB_ITEM("ReasmReqds", IPSTATS_MIB_REASMREQDS), + SNMP_MIB_ITEM("ReasmOKs", IPSTATS_MIB_REASMOKS), + SNMP_MIB_ITEM("ReasmFails", IPSTATS_MIB_REASMFAILS), + SNMP_MIB_ITEM("FragOKs", IPSTATS_MIB_FRAGOKS), + SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS), + SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES), + SNMP_MIB_SENTINEL +}; + +/* Following RFC4293 items are displayed in /proc/net/netstat */ +static const struct snmp_mib snmp4_ipextstats_list[] = { + SNMP_MIB_ITEM("InNoRoutes", 
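/* input datagrams discarded because no route could be found (RFC 4293 InNoRoutes) */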
IPSTATS_MIB_INNOROUTES), + SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS), + SNMP_MIB_ITEM("InMcastPkts", IPSTATS_MIB_INMCASTPKTS), + SNMP_MIB_ITEM("OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS), + SNMP_MIB_ITEM("InBcastPkts", IPSTATS_MIB_INBCASTPKTS), + SNMP_MIB_ITEM("OutBcastPkts", IPSTATS_MIB_OUTBCASTPKTS), + SNMP_MIB_ITEM("InOctets", IPSTATS_MIB_INOCTETS), + SNMP_MIB_ITEM("OutOctets", IPSTATS_MIB_OUTOCTETS), + SNMP_MIB_ITEM("InMcastOctets", IPSTATS_MIB_INMCASTOCTETS), + SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), + SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), + SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), + SNMP_MIB_SENTINEL +}; + +static const struct { + const char *name; + int index; +} icmpmibmap[] = { + { "DestUnreachs", ICMP_DEST_UNREACH }, + { "TimeExcds", ICMP_TIME_EXCEEDED }, + { "ParmProbs", ICMP_PARAMETERPROB }, + { "SrcQuenchs", ICMP_SOURCE_QUENCH }, + { "Redirects", ICMP_REDIRECT }, + { "Echos", ICMP_ECHO }, + { "EchoReps", ICMP_ECHOREPLY }, + { "Timestamps", ICMP_TIMESTAMP }, + { "TimestampReps", ICMP_TIMESTAMPREPLY }, + { "AddrMasks", ICMP_ADDRESS }, + { "AddrMaskReps", ICMP_ADDRESSREPLY }, + { NULL, 0 } +}; + + +static const struct snmp_mib snmp4_tcp_list[] = { + SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM), + SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN), + SNMP_MIB_ITEM("RtoMax", TCP_MIB_RTOMAX), + SNMP_MIB_ITEM("MaxConn", TCP_MIB_MAXCONN), + SNMP_MIB_ITEM("ActiveOpens", TCP_MIB_ACTIVEOPENS), + SNMP_MIB_ITEM("PassiveOpens", TCP_MIB_PASSIVEOPENS), + SNMP_MIB_ITEM("AttemptFails", TCP_MIB_ATTEMPTFAILS), + SNMP_MIB_ITEM("EstabResets", TCP_MIB_ESTABRESETS), + SNMP_MIB_ITEM("CurrEstab", TCP_MIB_CURRESTAB), + SNMP_MIB_ITEM("InSegs", TCP_MIB_INSEGS), + SNMP_MIB_ITEM("OutSegs", TCP_MIB_OUTSEGS), + SNMP_MIB_ITEM("RetransSegs", TCP_MIB_RETRANSSEGS), + SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS), + SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS), + SNMP_MIB_SENTINEL +}; + +static const struct snmp_mib snmp4_udp_list[] = { + SNMP_MIB_ITEM("InDatagrams", UDP_MIB_INDATAGRAMS), + SNMP_MIB_ITEM("NoPorts", UDP_MIB_NOPORTS), + SNMP_MIB_ITEM("InErrors", UDP_MIB_INERRORS), + SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS), + SNMP_MIB_ITEM("RcvbufErrors", UDP_MIB_RCVBUFERRORS), + SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS), + SNMP_MIB_SENTINEL +}; + +static const struct snmp_mib snmp4_net_list[] = { + SNMP_MIB_ITEM("SyncookiesSent", LINUX_MIB_SYNCOOKIESSENT), + SNMP_MIB_ITEM("SyncookiesRecv", LINUX_MIB_SYNCOOKIESRECV), + SNMP_MIB_ITEM("SyncookiesFailed", LINUX_MIB_SYNCOOKIESFAILED), + SNMP_MIB_ITEM("EmbryonicRsts", LINUX_MIB_EMBRYONICRSTS), + SNMP_MIB_ITEM("PruneCalled", LINUX_MIB_PRUNECALLED), + SNMP_MIB_ITEM("RcvPruned", LINUX_MIB_RCVPRUNED), + SNMP_MIB_ITEM("OfoPruned", LINUX_MIB_OFOPRUNED), + SNMP_MIB_ITEM("OutOfWindowIcmps", LINUX_MIB_OUTOFWINDOWICMPS), + SNMP_MIB_ITEM("LockDroppedIcmps", LINUX_MIB_LOCKDROPPEDICMPS), + SNMP_MIB_ITEM("ArpFilter", LINUX_MIB_ARPFILTER), + SNMP_MIB_ITEM("TW", LINUX_MIB_TIMEWAITED), + SNMP_MIB_ITEM("TWRecycled", LINUX_MIB_TIMEWAITRECYCLED), + SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED), + SNMP_MIB_ITEM("PAWSPassive", LINUX_MIB_PAWSPASSIVEREJECTED), + SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED), + SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED), + SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS), + SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED), + SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST), + SNMP_MIB_ITEM("ListenOverflows", 
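/* times the accept queue of a listening socket overflowed */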
LINUX_MIB_LISTENOVERFLOWS), + SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS), + SNMP_MIB_ITEM("TCPPrequeued", LINUX_MIB_TCPPREQUEUED), + SNMP_MIB_ITEM("TCPDirectCopyFromBacklog", LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG), + SNMP_MIB_ITEM("TCPDirectCopyFromPrequeue", LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE), + SNMP_MIB_ITEM("TCPPrequeueDropped", LINUX_MIB_TCPPREQUEUEDROPPED), + SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS), + SNMP_MIB_ITEM("TCPHPHitsToUser", LINUX_MIB_TCPHPHITSTOUSER), + SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS), + SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS), + SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY), + SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY), + SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING), + SNMP_MIB_ITEM("TCPFACKReorder", LINUX_MIB_TCPFACKREORDER), + SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER), + SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER), + SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER), + SNMP_MIB_ITEM("TCPFullUndo", LINUX_MIB_TCPFULLUNDO), + SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO), + SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO), + SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO), + SNMP_MIB_ITEM("TCPLoss", LINUX_MIB_TCPLOSS), + SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT), + SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES), + SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES), + SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES), + SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS), + SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS), + SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS), + SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS), + SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), + SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), + SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED), + SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED), + SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT), + SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), + SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), + SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV), + SNMP_MIB_ITEM("TCPAbortOnSyn", LINUX_MIB_TCPABORTONSYN), + SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA), + SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE), + SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY), + SNMP_MIB_ITEM("TCPAbortOnTimeout", LINUX_MIB_TCPABORTONTIMEOUT), + SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER), + SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED), + SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES), + SNMP_MIB_ITEM("TCPSACKDiscard", LINUX_MIB_TCPSACKDISCARD), + SNMP_MIB_ITEM("TCPDSACKIgnoredOld", LINUX_MIB_TCPDSACKIGNOREDOLD), + SNMP_MIB_ITEM("TCPDSACKIgnoredNoUndo", LINUX_MIB_TCPDSACKIGNOREDNOUNDO), + SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS), + SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND), + SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED), + SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED), + SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED), + SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK), + SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP), + SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP), + 
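/* requests dropped after TCP_DEFER_ACCEPT ran out of retries without receiving data */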
SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), + SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), + SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW), + SNMP_MIB_SENTINEL +}; + +static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals, + unsigned short *type, int count) +{ + int j; + + if (count) { + seq_printf(seq, "\nIcmpMsg:"); + for (j = 0; j < count; ++j) + seq_printf(seq, " %sType%u", + type[j] & 0x100 ? "Out" : "In", + type[j] & 0xff); + seq_printf(seq, "\nIcmpMsg:"); + for (j = 0; j < count; ++j) + seq_printf(seq, " %lu", vals[j]); + } +} + +static void icmpmsg_put(struct seq_file *seq) +{ +#define PERLINE 16 + + int i, count; + unsigned short type[PERLINE]; + unsigned long vals[PERLINE], val; + struct net *net = seq->private; + + count = 0; + for (i = 0; i < ICMPMSG_MIB_MAX; i++) { + val = snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, i); + if (val) { + type[count] = i; + vals[count++] = val; + } + if (count == PERLINE) { + icmpmsg_put_line(seq, vals, type, count); + count = 0; + } + } + icmpmsg_put_line(seq, vals, type, count); + +#undef PERLINE +} + +static void icmp_put(struct seq_file *seq) +{ + int i; + struct net *net = seq->private; + + seq_puts(seq, "\nIcmp: InMsgs InErrors"); + for (i=0; icmpmibmap[i].name != NULL; i++) + seq_printf(seq, " In%s", icmpmibmap[i].name); + seq_printf(seq, " OutMsgs OutErrors"); + for (i=0; icmpmibmap[i].name != NULL; i++) + seq_printf(seq, " Out%s", icmpmibmap[i].name); + seq_printf(seq, "\nIcmp: %lu %lu", + snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS), + snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS)); + for (i=0; icmpmibmap[i].name != NULL; i++) + seq_printf(seq, " %lu", + snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, + icmpmibmap[i].index)); + seq_printf(seq, " %lu %lu", + snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), + snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); + for (i=0; icmpmibmap[i].name != NULL; i++) + seq_printf(seq, " %lu", + snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, + icmpmibmap[i].index | 0x100)); +} + +/* + * Called from the PROCfs module. This outputs /proc/net/snmp. + */ +static int snmp_seq_show(struct seq_file *seq, void *v) +{ + int i; + struct net *net = seq->private; + + seq_puts(seq, "Ip: Forwarding DefaultTTL"); + + for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) + seq_printf(seq, " %s", snmp4_ipstats_list[i].name); + + seq_printf(seq, "\nIp: %d %d", + IPV4_DEVCONF_ALL(net, FORWARDING) ? 
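/* ipForwarding is encoded per RFC 2011: forwarding(1), notForwarding(2) */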
1 : 2, + sysctl_ip_default_ttl); + + BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); + for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) + seq_printf(seq, " %llu", + snmp_fold_field64((void __percpu **)net->mib.ip_statistics, + snmp4_ipstats_list[i].entry, + offsetof(struct ipstats_mib, syncp))); + + icmp_put(seq); /* RFC 2011 compatibility */ + icmpmsg_put(seq); + + seq_puts(seq, "\nTcp:"); + for (i = 0; snmp4_tcp_list[i].name != NULL; i++) + seq_printf(seq, " %s", snmp4_tcp_list[i].name); + + seq_puts(seq, "\nTcp:"); + for (i = 0; snmp4_tcp_list[i].name != NULL; i++) { + /* MaxConn field is signed, RFC 2012 */ + if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) + seq_printf(seq, " %ld", + snmp_fold_field((void __percpu **)net->mib.tcp_statistics, + snmp4_tcp_list[i].entry)); + else + seq_printf(seq, " %lu", + snmp_fold_field((void __percpu **)net->mib.tcp_statistics, + snmp4_tcp_list[i].entry)); + } + + seq_puts(seq, "\nUdp:"); + for (i = 0; snmp4_udp_list[i].name != NULL; i++) + seq_printf(seq, " %s", snmp4_udp_list[i].name); + + seq_puts(seq, "\nUdp:"); + for (i = 0; snmp4_udp_list[i].name != NULL; i++) + seq_printf(seq, " %lu", + snmp_fold_field((void __percpu **)net->mib.udp_statistics, + snmp4_udp_list[i].entry)); + + /* the UDP and UDP-Lite MIBs are the same */ + seq_puts(seq, "\nUdpLite:"); + for (i = 0; snmp4_udp_list[i].name != NULL; i++) + seq_printf(seq, " %s", snmp4_udp_list[i].name); + + seq_puts(seq, "\nUdpLite:"); + for (i = 0; snmp4_udp_list[i].name != NULL; i++) + seq_printf(seq, " %lu", + snmp_fold_field((void __percpu **)net->mib.udplite_statistics, + snmp4_udp_list[i].entry)); + + seq_putc(seq, '\n'); + return 0; +} + +static int snmp_seq_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, snmp_seq_show); +} + +static const struct file_operations snmp_seq_fops = { + .owner = THIS_MODULE, + .open = snmp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; + + + +/* + * Output /proc/net/netstat + */ +static int netstat_seq_show(struct seq_file *seq, void *v) +{ + int i; + struct net *net = seq->private; + + seq_puts(seq, "TcpExt:"); + for (i = 0; snmp4_net_list[i].name != NULL; i++) + seq_printf(seq, " %s", snmp4_net_list[i].name); + + seq_puts(seq, "\nTcpExt:"); + for (i = 0; snmp4_net_list[i].name != NULL; i++) + seq_printf(seq, " %lu", + snmp_fold_field((void __percpu **)net->mib.net_statistics, + snmp4_net_list[i].entry)); + + seq_puts(seq, "\nIpExt:"); + for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) + seq_printf(seq, " %s", snmp4_ipextstats_list[i].name); + + seq_puts(seq, "\nIpExt:"); + for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) + seq_printf(seq, " %llu", + snmp_fold_field64((void __percpu **)net->mib.ip_statistics, + snmp4_ipextstats_list[i].entry, + offsetof(struct ipstats_mib, syncp))); + + seq_putc(seq, '\n'); + return 0; +} + +static int netstat_seq_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, netstat_seq_show); +} + +static const struct file_operations netstat_seq_fops = { + .owner = THIS_MODULE, + .open = netstat_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; + +static __net_init int ip_proc_init_net(struct net *net) +{ + if (!proc_net_fops_create(net, "sockstat", S_IRUGO, &sockstat_seq_fops)) + goto out_sockstat; + if (!proc_net_fops_create(net, "netstat", S_IRUGO, &netstat_seq_fops)) + goto out_netstat; + if (!proc_net_fops_create(net, "snmp", S_IRUGO, &snmp_seq_fops)) + 
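/* on failure, unwind whatever was already registered */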
goto out_snmp; + + return 0; + +out_snmp: + proc_net_remove(net, "netstat"); +out_netstat: + proc_net_remove(net, "sockstat"); +out_sockstat: + return -ENOMEM; +} + +static __net_exit void ip_proc_exit_net(struct net *net) +{ + proc_net_remove(net, "snmp"); + proc_net_remove(net, "netstat"); + proc_net_remove(net, "sockstat"); +} + +static __net_initdata struct pernet_operations ip_proc_ops = { + .init = ip_proc_init_net, + .exit = ip_proc_exit_net, +}; + +int __init ip_misc_proc_init(void) +{ + return register_pernet_subsys(&ip_proc_ops); +} + diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c new file mode 100644 index 00000000..9ae5c01c --- /dev/null +++ b/net/ipv4/protocol.c @@ -0,0 +1,61 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * INET protocol dispatch tables. + * + * Authors: Ross Biro + * Fred N. van Kempen, + * + * Fixes: + * Alan Cox : Ahah! udp icmp errors don't work because + * udp_err is never called! + * Alan Cox : Added new fields for init and ready for + * proper fragmentation (_NO_ 4K limits!) + * Richard Colella : Hang on hash collision + * Vince Laviano : Modified inet_del_protocol() to correctly + * maintain copy bit. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include + +const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; + +/* + * Add a protocol handler to the hash tables + */ + +int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) +{ + int hash = protocol & (MAX_INET_PROTOS - 1); + + return !cmpxchg((const struct net_protocol **)&inet_protos[hash], + NULL, prot) ? 0 : -1; +} +EXPORT_SYMBOL(inet_add_protocol); + +/* + * Remove a protocol from the hash tables. + */ + +int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) +{ + int ret, hash = protocol & (MAX_INET_PROTOS - 1); + + ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash], + prot, NULL) == prot) ? 0 : -1; + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(inet_del_protocol); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c new file mode 100644 index 00000000..c9893d43 --- /dev/null +++ b/net/ipv4/raw.c @@ -0,0 +1,1064 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * RAW - implementation of IP "raw" sockets. + * + * Authors: Ross Biro + * Fred N. van Kempen, + * + * Fixes: + * Alan Cox : verify_area() fixed up + * Alan Cox : ICMP error handling + * Alan Cox : EMSGSIZE if you send too big a packet + * Alan Cox : Now uses generic datagrams and shared + * skbuff library. No more peek crashes, + * no more backlogs + * Alan Cox : Checks sk->broadcast. + * Alan Cox : Uses skb_free_datagram/skb_copy_datagram + * Alan Cox : Raw passes ip options too + * Alan Cox : Setsocketopt added + * Alan Cox : Fixed error return for broadcasts + * Alan Cox : Removed wake_up calls + * Alan Cox : Use ttl/tos + * Alan Cox : Cleaned up old debugging + * Alan Cox : Use new kernel side addresses + * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets. 
+ * Alan Cox : BSD style RAW socket demultiplexing. + * Alan Cox : Beginnings of mrouted support. + * Alan Cox : Added IP_HDRINCL option. + * Alan Cox : Skip broadcast check if BSDism set. + * David S. Miller : New socket lookup architecture. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct raw_hashinfo raw_v4_hashinfo = { + .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), +}; + +void raw_hash_sk(struct sock *sk) +{ + struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; + struct hlist_head *head; + + head = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)]; + + write_lock_bh(&h->lock); + sk_add_node(sk, head); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + write_unlock_bh(&h->lock); +} +EXPORT_SYMBOL_GPL(raw_hash_sk); + +void raw_unhash_sk(struct sock *sk) +{ + struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; + + write_lock_bh(&h->lock); + if (sk_del_node_init(sk)) + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + write_unlock_bh(&h->lock); +} +EXPORT_SYMBOL_GPL(raw_unhash_sk); + +static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, + unsigned short num, __be32 raddr, __be32 laddr, int dif) +{ + struct hlist_node *node; + + sk_for_each_from(sk, node) { + struct inet_sock *inet = inet_sk(sk); + + if (net_eq(sock_net(sk), net) && inet->inet_num == num && + !(inet->inet_daddr && inet->inet_daddr != raddr) && + !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && + !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + goto found; /* gotcha */ + } + sk = NULL; +found: + return sk; +} + +/* + * 0 - deliver + * 1 - block + */ +static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) +{ + int type; + + if (!pskb_may_pull(skb, sizeof(struct icmphdr))) + return 1; + + type = icmp_hdr(skb)->type; + if (type < 32) { + __u32 data = raw_sk(sk)->filter.data; + + return ((1 << type) & data) != 0; + } + + /* Do not block unknown ICMP types */ + return 0; +} + +/* IP input processing comes here for RAW socket delivery. + * Caller owns SKB, so we must make clones. + * + * RFC 1122: SHOULD pass TOS value up to the transport layer. + * -> It does. And not only TOS, but all IP header. + */ +static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) +{ + struct sock *sk; + struct hlist_head *head; + int delivered = 0; + struct net *net; + + read_lock(&raw_v4_hashinfo.lock); + head = &raw_v4_hashinfo.ht[hash]; + if (hlist_empty(head)) + goto out; + + net = dev_net(skb->dev); + sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol, + iph->saddr, iph->daddr, + skb->dev->ifindex); + + while (sk) { + delivered = 1; + if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { + struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); + + /* Not releasing hash table! 
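raw_rcv() is invoked with the read lock still held, so the socket cannot go away underneath us.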
*/ + if (clone) + raw_rcv(sk, clone); + } + sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol, + iph->saddr, iph->daddr, + skb->dev->ifindex); + } +out: + read_unlock(&raw_v4_hashinfo.lock); + return delivered; +} + +int raw_local_deliver(struct sk_buff *skb, int protocol) +{ + int hash; + struct sock *raw_sk; + + hash = protocol & (RAW_HTABLE_SIZE - 1); + raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); + + /* If there maybe a raw socket we must check - if not we + * don't care less + */ + if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash)) + raw_sk = NULL; + + return raw_sk != NULL; + +} + +static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) +{ + struct inet_sock *inet = inet_sk(sk); + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; + int err = 0; + int harderr = 0; + + /* Report error on raw socket, if: + 1. User requested ip_recverr. + 2. Socket is connected (otherwise the error indication + is useless without ip_recverr and error is hard. + */ + if (!inet->recverr && sk->sk_state != TCP_ESTABLISHED) + return; + + switch (type) { + default: + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + case ICMP_SOURCE_QUENCH: + return; + case ICMP_PARAMETERPROB: + err = EPROTO; + harderr = 1; + break; + case ICMP_DEST_UNREACH: + err = EHOSTUNREACH; + if (code > NR_ICMP_UNREACH) + break; + err = icmp_err_convert[code].errno; + harderr = icmp_err_convert[code].fatal; + if (code == ICMP_FRAG_NEEDED) { + harderr = inet->pmtudisc != IP_PMTUDISC_DONT; + err = EMSGSIZE; + } + } + + if (inet->recverr) { + const struct iphdr *iph = (const struct iphdr *)skb->data; + u8 *payload = skb->data + (iph->ihl << 2); + + if (inet->hdrincl) + payload = skb->data; + ip_icmp_error(sk, skb, err, 0, info, payload); + } + + if (inet->recverr || harderr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } +} + +void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) +{ + int hash; + struct sock *raw_sk; + const struct iphdr *iph; + struct net *net; + + hash = protocol & (RAW_HTABLE_SIZE - 1); + + read_lock(&raw_v4_hashinfo.lock); + raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); + if (raw_sk != NULL) { + iph = (const struct iphdr *)skb->data; + net = dev_net(skb->dev); + + while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol, + iph->daddr, iph->saddr, + skb->dev->ifindex)) != NULL) { + raw_err(raw_sk, skb, info); + raw_sk = sk_next(raw_sk); + iph = (const struct iphdr *)skb->data; + } + } + read_unlock(&raw_v4_hashinfo.lock); +} + +static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) +{ + /* Charge it to the socket. 
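If ip_queue_rcv_skb() cannot accept it (typically because the receive buffer is full), the packet is dropped here.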
*/ + + if (ip_queue_rcv_skb(sk, skb) < 0) { + kfree_skb(skb); + return NET_RX_DROP; + } + + return NET_RX_SUCCESS; +} + +int raw_rcv(struct sock *sk, struct sk_buff *skb) +{ + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { + atomic_inc(&sk->sk_drops); + kfree_skb(skb); + return NET_RX_DROP; + } + nf_reset(skb); + + skb_push(skb, skb->data - skb_network_header(skb)); + + raw_rcv_skb(sk, skb); + return 0; +} + +static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, + void *from, size_t length, + struct rtable **rtp, + unsigned int flags) +{ + struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); + struct iphdr *iph; + struct sk_buff *skb; + unsigned int iphlen; + int err; + struct rtable *rt = *rtp; + + if (length > rt->dst.dev->mtu) { + ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, + rt->dst.dev->mtu); + return -EMSGSIZE; + } + if (flags&MSG_PROBE) + goto out; + + skb = sock_alloc_send_skb(sk, + length + LL_ALLOCATED_SPACE(rt->dst.dev) + 15, + flags & MSG_DONTWAIT, &err); + if (skb == NULL) + goto error; + skb_reserve(skb, LL_RESERVED_SPACE(rt->dst.dev)); + + skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; + skb_dst_set(skb, &rt->dst); + *rtp = NULL; + + skb_reset_network_header(skb); + iph = ip_hdr(skb); + skb_put(skb, length); + + skb->ip_summed = CHECKSUM_NONE; + + skb->transport_header = skb->network_header; + err = -EFAULT; + if (memcpy_fromiovecend((void *)iph, from, 0, length)) + goto error_free; + + iphlen = iph->ihl * 4; + + /* + * We don't want to modify the ip header, but we do need to + * be sure that it won't cause problems later along the network + * stack. Specifically we want to make sure that iph->ihl is a + * sane value. If ihl points beyond the length of the buffer passed + * in, reject the frame as invalid + */ + err = -EINVAL; + if (iphlen > length) + goto error_free; + + if (iphlen >= sizeof(*iph)) { + if (!iph->saddr) + iph->saddr = fl4->saddr; + iph->check = 0; + iph->tot_len = htons(length); + if (!iph->id) + ip_select_ident(iph, &rt->dst, NULL); + + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + } + if (iph->protocol == IPPROTO_ICMP) + icmp_out_count(net, ((struct icmphdr *) + skb_transport_header(skb))->type); + + err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL, + rt->dst.dev, dst_output); + if (err > 0) + err = net_xmit_errno(err); + if (err) + goto error; +out: + return 0; + +error_free: + kfree_skb(skb); +error: + IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); + if (err == -ENOBUFS && !inet->recverr) + err = 0; + return err; +} + +static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg) +{ + struct iovec *iov; + u8 __user *type = NULL; + u8 __user *code = NULL; + int probed = 0; + unsigned int i; + + if (!msg->msg_iov) + return 0; + + for (i = 0; i < msg->msg_iovlen; i++) { + iov = &msg->msg_iov[i]; + if (!iov) + continue; + + switch (fl4->flowi4_proto) { + case IPPROTO_ICMP: + /* check if one-byte field is readable or not. */ + if (iov->iov_base && iov->iov_len < 1) + break; + + if (!type) { + type = iov->iov_base; + /* check if code field is readable or not. 
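i.e. whether this iovec also covers the second byte.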
*/ + if (iov->iov_len > 1) + code = type + 1; + } else if (!code) + code = iov->iov_base; + + if (type && code) { + if (get_user(fl4->fl4_icmp_type, type) || + get_user(fl4->fl4_icmp_code, code)) + return -EFAULT; + probed = 1; + } + break; + default: + probed = 1; + break; + } + if (probed) + break; + } + return 0; +} + +static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipcm_cookie ipc; + struct rtable *rt = NULL; + struct flowi4 fl4; + int free = 0; + __be32 daddr; + __be32 saddr; + u8 tos; + int err; + struct ip_options_data opt_copy; + + err = -EMSGSIZE; + if (len > 0xFFFF) + goto out; + + /* + * Check the flags. + */ + + err = -EOPNOTSUPP; + if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message */ + goto out; /* compatibility */ + + /* + * Get and verify the address. + */ + + if (msg->msg_namelen) { + struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; + err = -EINVAL; + if (msg->msg_namelen < sizeof(*usin)) + goto out; + if (usin->sin_family != AF_INET) { + static int complained; + if (!complained++) + printk(KERN_INFO "%s forgot to set AF_INET in " + "raw sendmsg. Fix it!\n", + current->comm); + err = -EAFNOSUPPORT; + if (usin->sin_family) + goto out; + } + daddr = usin->sin_addr.s_addr; + /* ANK: I did not forget to get protocol from port field. + * I just do not know, who uses this weirdness. + * IP_HDRINCL is much more convenient. + */ + } else { + err = -EDESTADDRREQ; + if (sk->sk_state != TCP_ESTABLISHED) + goto out; + daddr = inet->inet_daddr; + } + + ipc.addr = inet->inet_saddr; + ipc.opt = NULL; + ipc.tx_flags = 0; + ipc.oif = sk->sk_bound_dev_if; + + if (msg->msg_controllen) { + err = ip_cmsg_send(sock_net(sk), msg, &ipc); + if (err) + goto out; + if (ipc.opt) + free = 1; + } + + saddr = ipc.addr; + ipc.addr = daddr; + + if (!ipc.opt) { + struct ip_options_rcu *inet_opt; + + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + if (inet_opt) { + memcpy(&opt_copy, inet_opt, + sizeof(*inet_opt) + inet_opt->opt.optlen); + ipc.opt = &opt_copy.opt; + } + rcu_read_unlock(); + } + + if (ipc.opt) { + err = -EINVAL; + /* Linux does not mangle headers on raw sockets, + * so that IP options + IP_HDRINCL is non-sense. + */ + if (inet->hdrincl) + goto done; + if (ipc.opt->opt.srr) { + if (!daddr) + goto done; + daddr = ipc.opt->opt.faddr; + } + } + tos = RT_CONN_FLAGS(sk); + if (msg->msg_flags & MSG_DONTROUTE) + tos |= RTO_ONLINK; + + if (ipv4_is_multicast(daddr)) { + if (!ipc.oif) + ipc.oif = inet->mc_index; + if (!saddr) + saddr = inet->mc_addr; + } + + flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, + RT_SCOPE_UNIVERSE, + inet->hdrincl ? 
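/* with IP_HDRINCL the user supplies the protocol field in the header itself */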
IPPROTO_RAW : sk->sk_protocol, + FLOWI_FLAG_CAN_SLEEP, daddr, saddr, 0, 0); + + if (!inet->hdrincl) { + err = raw_probe_proto_opt(&fl4, msg); + if (err) + goto done; + } + + security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); + rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; + goto done; + } + + err = -EACCES; + if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) + goto done; + + if (msg->msg_flags & MSG_CONFIRM) + goto do_confirm; +back_from_confirm: + + if (inet->hdrincl) + err = raw_send_hdrinc(sk, &fl4, msg->msg_iov, len, + &rt, msg->msg_flags); + + else { + if (!ipc.addr) + ipc.addr = fl4.daddr; + lock_sock(sk); + err = ip_append_data(sk, &fl4, ip_generic_getfrag, + msg->msg_iov, len, 0, + &ipc, &rt, msg->msg_flags); + if (err) + ip_flush_pending_frames(sk); + else if (!(msg->msg_flags & MSG_MORE)) { + err = ip_push_pending_frames(sk, &fl4); + if (err == -ENOBUFS && !inet->recverr) + err = 0; + } + release_sock(sk); + } +done: + if (free) + kfree(ipc.opt); + ip_rt_put(rt); + +out: + if (err < 0) + return err; + return len; + +do_confirm: + dst_confirm(&rt->dst); + if (!(msg->msg_flags & MSG_PROBE) || len) + goto back_from_confirm; + err = 0; + goto done; +} + +static void raw_close(struct sock *sk, long timeout) +{ + /* + * Raw sockets may have direct kernel references. Kill them. + */ + ip_ra_control(sk, 0, NULL); + + sk_common_release(sk); +} + +static void raw_destroy(struct sock *sk) +{ + lock_sock(sk); + ip_flush_pending_frames(sk); + release_sock(sk); +} + +/* This gets rid of all the nasties in af_inet. -DaveM */ +static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct inet_sock *inet = inet_sk(sk); + struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; + int ret = -EINVAL; + int chk_addr_ret; + + if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) + goto out; + chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); + ret = -EADDRNOTAVAIL; + if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL && + chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) + goto out; + inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; + if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) + inet->inet_saddr = 0; /* Use device */ + sk_dst_reset(sk); + ret = 0; +out: return ret; +} + +/* + * This should be easy, if there is something there + * we return it, otherwise we block. + */ + +static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int noblock, int flags, int *addr_len) +{ + struct inet_sock *inet = inet_sk(sk); + size_t copied = 0; + int err = -EOPNOTSUPP; + struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + struct sk_buff *skb; + + if (flags & MSG_OOB) + goto out; + + if (addr_len) + *addr_len = sizeof(*sin); + + if (flags & MSG_ERRQUEUE) { + err = ip_recv_error(sk, msg, len); + goto out; + } + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; + + copied = skb->len; + if (len < copied) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto done; + + sock_recv_ts_and_drops(msg, sk, skb); + + /* Copy the address. 
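Raw sockets carry no port numbers, so sin_port is left at zero.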
*/ + if (sin) { + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = ip_hdr(skb)->saddr; + sin->sin_port = 0; + memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); + } + if (inet->cmsg_flags) + ip_cmsg_recv(msg, skb); + if (flags & MSG_TRUNC) + copied = skb->len; +done: + skb_free_datagram(sk, skb); +out: + if (err) + return err; + return copied; +} + +static int raw_init(struct sock *sk) +{ + struct raw_sock *rp = raw_sk(sk); + + if (inet_sk(sk)->inet_num == IPPROTO_ICMP) + memset(&rp->filter, 0, sizeof(rp->filter)); + return 0; +} + +static int raw_seticmpfilter(struct sock *sk, char __user *optval, int optlen) +{ + if (optlen > sizeof(struct icmp_filter)) + optlen = sizeof(struct icmp_filter); + if (copy_from_user(&raw_sk(sk)->filter, optval, optlen)) + return -EFAULT; + return 0; +} + +static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *optlen) +{ + int len, ret = -EFAULT; + + if (get_user(len, optlen)) + goto out; + ret = -EINVAL; + if (len < 0) + goto out; + if (len > sizeof(struct icmp_filter)) + len = sizeof(struct icmp_filter); + ret = -EFAULT; + if (put_user(len, optlen) || + copy_to_user(optval, &raw_sk(sk)->filter, len)) + goto out; + ret = 0; +out: return ret; +} + +static int do_raw_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + if (optname == ICMP_FILTER) { + if (inet_sk(sk)->inet_num != IPPROTO_ICMP) + return -EOPNOTSUPP; + else + return raw_seticmpfilter(sk, optval, optlen); + } + return -ENOPROTOOPT; +} + +static int raw_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + if (level != SOL_RAW) + return ip_setsockopt(sk, level, optname, optval, optlen); + return do_raw_setsockopt(sk, level, optname, optval, optlen); +} + +#ifdef CONFIG_COMPAT +static int compat_raw_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + if (level != SOL_RAW) + return compat_ip_setsockopt(sk, level, optname, optval, optlen); + return do_raw_setsockopt(sk, level, optname, optval, optlen); +} +#endif + +static int do_raw_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + if (optname == ICMP_FILTER) { + if (inet_sk(sk)->inet_num != IPPROTO_ICMP) + return -EOPNOTSUPP; + else + return raw_geticmpfilter(sk, optval, optlen); + } + return -ENOPROTOOPT; +} + +static int raw_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + if (level != SOL_RAW) + return ip_getsockopt(sk, level, optname, optval, optlen); + return do_raw_getsockopt(sk, level, optname, optval, optlen); +} + +#ifdef CONFIG_COMPAT +static int compat_raw_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + if (level != SOL_RAW) + return compat_ip_getsockopt(sk, level, optname, optval, optlen); + return do_raw_getsockopt(sk, level, optname, optval, optlen); +} +#endif + +static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + switch (cmd) { + case SIOCOUTQ: { + int amount = sk_wmem_alloc_get(sk); + + return put_user(amount, (int __user *)arg); + } + case SIOCINQ: { + struct sk_buff *skb; + int amount = 0; + + spin_lock_bh(&sk->sk_receive_queue.lock); + skb = skb_peek(&sk->sk_receive_queue); + if (skb != NULL) + amount = skb->len; + spin_unlock_bh(&sk->sk_receive_queue.lock); + return put_user(amount, (int __user *)arg); + } + + default: +#ifdef CONFIG_IP_MROUTE + return ipmr_ioctl(sk, cmd, (void __user *)arg); 
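/* unknown ioctls fall through to the multicast router when CONFIG_IP_MROUTE is set */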
+#else + return -ENOIOCTLCMD; +#endif + } +} + +#ifdef CONFIG_COMPAT +static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case SIOCOUTQ: + case SIOCINQ: + return -ENOIOCTLCMD; + default: +#ifdef CONFIG_IP_MROUTE + return ipmr_compat_ioctl(sk, cmd, compat_ptr(arg)); +#else + return -ENOIOCTLCMD; +#endif + } +} +#endif + +struct proto raw_prot = { + .name = "RAW", + .owner = THIS_MODULE, + .close = raw_close, + .destroy = raw_destroy, + .connect = ip4_datagram_connect, + .disconnect = udp_disconnect, + .ioctl = raw_ioctl, + .init = raw_init, + .setsockopt = raw_setsockopt, + .getsockopt = raw_getsockopt, + .sendmsg = raw_sendmsg, + .recvmsg = raw_recvmsg, + .bind = raw_bind, + .backlog_rcv = raw_rcv_skb, + .hash = raw_hash_sk, + .unhash = raw_unhash_sk, + .obj_size = sizeof(struct raw_sock), + .h.raw_hash = &raw_v4_hashinfo, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_raw_setsockopt, + .compat_getsockopt = compat_raw_getsockopt, + .compat_ioctl = compat_raw_ioctl, +#endif +}; + +#ifdef CONFIG_PROC_FS +static struct sock *raw_get_first(struct seq_file *seq) +{ + struct sock *sk; + struct raw_iter_state *state = raw_seq_private(seq); + + for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE; + ++state->bucket) { + struct hlist_node *node; + + sk_for_each(sk, node, &state->h->ht[state->bucket]) + if (sock_net(sk) == seq_file_net(seq)) + goto found; + } + sk = NULL; +found: + return sk; +} + +static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) +{ + struct raw_iter_state *state = raw_seq_private(seq); + + do { + sk = sk_next(sk); +try_again: + ; + } while (sk && sock_net(sk) != seq_file_net(seq)); + + if (!sk && ++state->bucket < RAW_HTABLE_SIZE) { + sk = sk_head(&state->h->ht[state->bucket]); + goto try_again; + } + return sk; +} + +static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) +{ + struct sock *sk = raw_get_first(seq); + + if (sk) + while (pos && (sk = raw_get_next(seq, sk)) != NULL) + --pos; + return pos ? NULL : sk; +} + +void *raw_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct raw_iter_state *state = raw_seq_private(seq); + + read_lock(&state->h->lock); + return *pos ? 
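/* a non-zero *pos resumes at that entry; zero yields the header token */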
raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} +EXPORT_SYMBOL_GPL(raw_seq_start); + +void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock *sk; + + if (v == SEQ_START_TOKEN) + sk = raw_get_first(seq); + else + sk = raw_get_next(seq, v); + ++*pos; + return sk; +} +EXPORT_SYMBOL_GPL(raw_seq_next); + +void raw_seq_stop(struct seq_file *seq, void *v) +{ + struct raw_iter_state *state = raw_seq_private(seq); + + read_unlock(&state->h->lock); +} +EXPORT_SYMBOL_GPL(raw_seq_stop); + +static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) +{ + struct inet_sock *inet = inet_sk(sp); + __be32 dest = inet->inet_daddr, + src = inet->inet_rcv_saddr; + __u16 destp = 0, + srcp = inet->inet_num; + + seq_printf(seq, "%4d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n", + i, src, srcp, dest, destp, sp->sk_state, + sk_wmem_alloc_get(sp), + sk_rmem_alloc_get(sp), + 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); +} + +static int raw_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, " sl local_address rem_address st tx_queue " + "rx_queue tr tm->when retrnsmt uid timeout " + "inode ref pointer drops\n"); + else + raw_sock_seq_show(seq, v, raw_seq_private(seq)->bucket); + return 0; +} + +static const struct seq_operations raw_seq_ops = { + .start = raw_seq_start, + .next = raw_seq_next, + .stop = raw_seq_stop, + .show = raw_seq_show, +}; + +int raw_seq_open(struct inode *ino, struct file *file, + struct raw_hashinfo *h, const struct seq_operations *ops) +{ + int err; + struct raw_iter_state *i; + + err = seq_open_net(ino, file, ops, sizeof(struct raw_iter_state)); + if (err < 0) + return err; + + i = raw_seq_private((struct seq_file *)file->private_data); + i->h = h; + return 0; +} +EXPORT_SYMBOL_GPL(raw_seq_open); + +static int raw_v4_seq_open(struct inode *inode, struct file *file) +{ + return raw_seq_open(inode, file, &raw_v4_hashinfo, &raw_seq_ops); +} + +static const struct file_operations raw_seq_fops = { + .owner = THIS_MODULE, + .open = raw_v4_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static __net_init int raw_init_net(struct net *net) +{ + if (!proc_net_fops_create(net, "raw", S_IRUGO, &raw_seq_fops)) + return -ENOMEM; + + return 0; +} + +static __net_exit void raw_exit_net(struct net *net) +{ + proc_net_remove(net, "raw"); +} + +static __net_initdata struct pernet_operations raw_net_ops = { + .init = raw_init_net, + .exit = raw_exit_net, +}; + +int __init raw_proc_init(void) +{ + return register_pernet_subsys(&raw_net_ops); +} + +void __init raw_proc_exit(void) +{ + unregister_pernet_subsys(&raw_net_ops); +} +#endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c new file mode 100644 index 00000000..6b95f74a --- /dev/null +++ b/net/ipv4/route.c @@ -0,0 +1,3471 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * ROUTE - implementation of the IP router. + * + * Authors: Ross Biro + * Fred N. van Kempen, + * Alan Cox, + * Linus Torvalds, + * Alexey Kuznetsov, + * + * Fixes: + * Alan Cox : Verify area fixes. 
+ * Alan Cox : cli() protects routing changes + * Rui Oliveira : ICMP routing table updates + * (rco@di.uminho.pt) Routing table insertion and update + * Linus Torvalds : Rewrote bits to be sensible + * Alan Cox : Added BSD route gw semantics + * Alan Cox : Super /proc >4K + * Alan Cox : MTU in route table + * Alan Cox : MSS actually. Also added the window + * clamper. + * Sam Lantinga : Fixed route matching in rt_del() + * Alan Cox : Routing cache support. + * Alan Cox : Removed compatibility cruft. + * Alan Cox : RTF_REJECT support. + * Alan Cox : TCP irtt support. + * Jonathan Naylor : Added Metric support. + * Miquel van Smoorenburg : BSD API fixes. + * Miquel van Smoorenburg : Metrics. + * Alan Cox : Use __u32 properly + * Alan Cox : Aligned routing errors more closely with BSD + * our system is still very different. + * Alan Cox : Faster /proc handling + * Alexey Kuznetsov : Massive rework to support tree based routing, + * routing caches and better behaviour. + * + * Olaf Erb : irtt wasn't being copied right. + * Bjorn Ekwall : Kerneld route support. + * Alan Cox : Multicast fixed (I hope) + * Pavel Krauz : Limited broadcast fixed + * Mike McLagan : Routing by source + * Alexey Kuznetsov : End of old history. Split to fib.c and + * route.c and rewritten from scratch. + * Andi Kleen : Load-limit warning messages. + * Vitaly E. Lavrov : Transparent proxy revived after year coma. + * Vitaly E. Lavrov : Race condition in ip_route_input_slow. + * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow. + * Vladimir V. Ivanov : IP rule info (flowid) is really useful. + * Marc Boucher : routing by fwmark + * Robert Olsson : Added rt_cache statistics + * Arnaldo C. Melo : Convert proc stuff to seq_file + * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes. + * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect + * Ilia Sotnikov : Removed TOS from hash calculations + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_SYSCTL +#include +#endif +#include + +#define RT_FL_TOS(oldflp4) \ + ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) + +#define IP_MAX_MTU 0xFFF0 + +#define RT_GC_TIMEOUT (300*HZ) + +static int ip_rt_max_size; +static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; +static int ip_rt_gc_interval __read_mostly = 60 * HZ; +static int ip_rt_gc_min_interval __read_mostly = HZ / 2; +static int ip_rt_redirect_number __read_mostly = 9; +static int ip_rt_redirect_load __read_mostly = HZ / 50; +static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1)); +static int ip_rt_error_cost __read_mostly = HZ; +static int ip_rt_error_burst __read_mostly = 5 * HZ; +static int ip_rt_gc_elasticity __read_mostly = 8; +static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; +static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; +static int ip_rt_min_advmss __read_mostly = 256; +static int rt_chain_length_max __read_mostly = 20; + +static struct delayed_work expires_work; +static unsigned long expires_ljiffies; + +/* + * Interface to generic destination cache. + */ + +static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); +static unsigned int ipv4_default_advmss(const struct dst_entry *dst); +static unsigned int ipv4_default_mtu(const struct dst_entry *dst); +static void ipv4_dst_destroy(struct dst_entry *dst); +static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); +static void ipv4_link_failure(struct sk_buff *skb); +static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); +static int rt_garbage_collect(struct dst_ops *ops); + +static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, + int how) +{ +} + +static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) +{ + struct rtable *rt = (struct rtable *) dst; + struct inet_peer *peer; + u32 *p = NULL; + + if (!rt->peer) + rt_bind_peer(rt, rt->rt_dst, 1); + + peer = rt->peer; + if (peer) { + u32 *old_p = __DST_METRICS_PTR(old); + unsigned long prev, new; + + p = peer->metrics; + if (inet_metrics_new(peer)) + memcpy(p, old_p, sizeof(u32) * RTAX_MAX); + + new = (unsigned long) p; + prev = cmpxchg(&dst->_metrics, old, new); + + if (prev != old) { + p = __DST_METRICS_PTR(prev); + if (prev & DST_METRICS_READ_ONLY) + p = NULL; + } else { + if (rt->fi) { + fib_info_put(rt->fi); + rt->fi = NULL; + } + } + } + return p; +} + +static struct dst_ops ipv4_dst_ops = { + .family = AF_INET, + .protocol = cpu_to_be16(ETH_P_IP), + .gc = rt_garbage_collect, + .check = ipv4_dst_check, + .default_advmss = ipv4_default_advmss, + .default_mtu = ipv4_default_mtu, + .cow_metrics = ipv4_cow_metrics, + .destroy = ipv4_dst_destroy, + .ifdown = ipv4_dst_ifdown, + .negative_advice = ipv4_negative_advice, + .link_failure = ipv4_link_failure, + .update_pmtu = ip_rt_update_pmtu, + .local_out = __ip_local_out, +}; + +#define ECN_OR_COST(class) TC_PRIO_##class + +const __u8 ip_tos2prio[16] = { + TC_PRIO_BESTEFFORT, + ECN_OR_COST(BESTEFFORT), + TC_PRIO_BESTEFFORT, + ECN_OR_COST(BESTEFFORT), + TC_PRIO_BULK, + ECN_OR_COST(BULK), + 
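/* odd indices cover TOS values with the low bit set, the old "minimise cost" bit later reused by ECN, hence ECN_OR_COST() */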
TC_PRIO_BULK, + ECN_OR_COST(BULK), + TC_PRIO_INTERACTIVE, + ECN_OR_COST(INTERACTIVE), + TC_PRIO_INTERACTIVE, + ECN_OR_COST(INTERACTIVE), + TC_PRIO_INTERACTIVE_BULK, + ECN_OR_COST(INTERACTIVE_BULK), + TC_PRIO_INTERACTIVE_BULK, + ECN_OR_COST(INTERACTIVE_BULK) +}; + + +/* + * Route cache. + */ + +/* The locking scheme is rather straight forward: + * + * 1) Read-Copy Update protects the buckets of the central route hash. + * 2) Only writers remove entries, and they hold the lock + * as they look at rtable reference counts. + * 3) Only readers acquire references to rtable entries, + * they do so with atomic increments and with the + * lock held. + */ + +struct rt_hash_bucket { + struct rtable __rcu *chain; +}; + +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ + defined(CONFIG_PROVE_LOCKING) +/* + * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks + * The size of this table is a power of two and depends on the number of CPUS. + * (on lockdep we have a quite big spinlock_t, so keep the size down there) + */ +#ifdef CONFIG_LOCKDEP +# define RT_HASH_LOCK_SZ 256 +#else +# if NR_CPUS >= 32 +# define RT_HASH_LOCK_SZ 4096 +# elif NR_CPUS >= 16 +# define RT_HASH_LOCK_SZ 2048 +# elif NR_CPUS >= 8 +# define RT_HASH_LOCK_SZ 1024 +# elif NR_CPUS >= 4 +# define RT_HASH_LOCK_SZ 512 +# else +# define RT_HASH_LOCK_SZ 256 +# endif +#endif + +static spinlock_t *rt_hash_locks; +# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)] + +static __init void rt_hash_lock_init(void) +{ + int i; + + rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, + GFP_KERNEL); + if (!rt_hash_locks) + panic("IP: failed to allocate rt_hash_locks\n"); + + for (i = 0; i < RT_HASH_LOCK_SZ; i++) + spin_lock_init(&rt_hash_locks[i]); +} +#else +# define rt_hash_lock_addr(slot) NULL + +static inline void rt_hash_lock_init(void) +{ +} +#endif + +static struct rt_hash_bucket *rt_hash_table __read_mostly; +static unsigned rt_hash_mask __read_mostly; +static unsigned int rt_hash_log __read_mostly; + +static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); +#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) + +static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, + int genid) +{ + return jhash_3words((__force u32)daddr, (__force u32)saddr, + idx, genid) + & rt_hash_mask; +} + +static inline int rt_genid(struct net *net) +{ + return atomic_read(&net->ipv4.rt_genid); +} + +#ifdef CONFIG_PROC_FS +struct rt_cache_iter_state { + struct seq_net_private p; + int bucket; + int genid; +}; + +static struct rtable *rt_cache_get_first(struct seq_file *seq) +{ + struct rt_cache_iter_state *st = seq->private; + struct rtable *r = NULL; + + for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { + if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain)) + continue; + rcu_read_lock_bh(); + r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); + while (r) { + if (dev_net(r->dst.dev) == seq_file_net(seq) && + r->rt_genid == st->genid) + return r; + r = rcu_dereference_bh(r->dst.rt_next); + } + rcu_read_unlock_bh(); + } + return r; +} + +static struct rtable *__rt_cache_get_next(struct seq_file *seq, + struct rtable *r) +{ + struct rt_cache_iter_state *st = seq->private; + + r = rcu_dereference_bh(r->dst.rt_next); + while (!r) { + rcu_read_unlock_bh(); + do { + if (--st->bucket < 0) + return NULL; + } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain)); + rcu_read_lock_bh(); + r = 
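/* restart from the head of the next non-empty bucket */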
rcu_dereference_bh(rt_hash_table[st->bucket].chain); + } + return r; +} + +static struct rtable *rt_cache_get_next(struct seq_file *seq, + struct rtable *r) +{ + struct rt_cache_iter_state *st = seq->private; + while ((r = __rt_cache_get_next(seq, r)) != NULL) { + if (dev_net(r->dst.dev) != seq_file_net(seq)) + continue; + if (r->rt_genid == st->genid) + break; + } + return r; +} + +static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos) +{ + struct rtable *r = rt_cache_get_first(seq); + + if (r) + while (pos && (r = rt_cache_get_next(seq, r))) + --pos; + return pos ? NULL : r; +} + +static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct rt_cache_iter_state *st = seq->private; + if (*pos) + return rt_cache_get_idx(seq, *pos - 1); + st->genid = rt_genid(seq_file_net(seq)); + return SEQ_START_TOKEN; +} + +static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct rtable *r; + + if (v == SEQ_START_TOKEN) + r = rt_cache_get_first(seq); + else + r = rt_cache_get_next(seq, v); + ++*pos; + return r; +} + +static void rt_cache_seq_stop(struct seq_file *seq, void *v) +{ + if (v && v != SEQ_START_TOKEN) + rcu_read_unlock_bh(); +} + +static int rt_cache_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, "%-127s\n", + "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" + "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" + "HHUptod\tSpecDst"); + else { + struct rtable *r = v; + struct neighbour *n; + int len, HHUptod; + + rcu_read_lock(); + n = dst_get_neighbour(&r->dst); + HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0; + rcu_read_unlock(); + + seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" + "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", + r->dst.dev ? r->dst.dev->name : "*", + (__force u32)r->rt_dst, + (__force u32)r->rt_gateway, + r->rt_flags, atomic_read(&r->dst.__refcnt), + r->dst.__use, 0, (__force u32)r->rt_src, + dst_metric_advmss(&r->dst) + 40, + dst_metric(&r->dst, RTAX_WINDOW), + (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + + dst_metric(&r->dst, RTAX_RTTVAR)), + r->rt_key_tos, + r->dst.hh ? 
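/* -1 is printed when no hardware header is cached for this entry */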
atomic_read(&r->dst.hh->hh_refcnt) : -1, + HHUptod, + r->rt_spec_dst, &len); + + seq_printf(seq, "%*s\n", 127 - len, ""); + } + return 0; +} + +static const struct seq_operations rt_cache_seq_ops = { + .start = rt_cache_seq_start, + .next = rt_cache_seq_next, + .stop = rt_cache_seq_stop, + .show = rt_cache_seq_show, +}; + +static int rt_cache_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &rt_cache_seq_ops, + sizeof(struct rt_cache_iter_state)); +} + +static const struct file_operations rt_cache_seq_fops = { + .owner = THIS_MODULE, + .open = rt_cache_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + + +static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos) +{ + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return &per_cpu(rt_cache_stat, cpu); + } + return NULL; +} + +static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + int cpu; + + for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return &per_cpu(rt_cache_stat, cpu); + } + return NULL; + +} + +static void rt_cpu_seq_stop(struct seq_file *seq, void *v) +{ + +} + +static int rt_cpu_seq_show(struct seq_file *seq, void *v) +{ + struct rt_cache_stat *st = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n"); + return 0; + } + + seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " + " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", + dst_entries_get_slow(&ipv4_dst_ops), + st->in_hit, + st->in_slow_tot, + st->in_slow_mc, + st->in_no_route, + st->in_brd, + st->in_martian_dst, + st->in_martian_src, + + st->out_hit, + st->out_slow_tot, + st->out_slow_mc, + + st->gc_total, + st->gc_ignored, + st->gc_goal_miss, + st->gc_dst_overflow, + st->in_hlist_search, + st->out_hlist_search + ); + return 0; +} + +static const struct seq_operations rt_cpu_seq_ops = { + .start = rt_cpu_seq_start, + .next = rt_cpu_seq_next, + .stop = rt_cpu_seq_stop, + .show = rt_cpu_seq_show, +}; + + +static int rt_cpu_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rt_cpu_seq_ops); +} + +static const struct file_operations rt_cpu_seq_fops = { + .owner = THIS_MODULE, + .open = rt_cpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#ifdef CONFIG_IP_ROUTE_CLASSID +static int rt_acct_proc_show(struct seq_file *m, void *v) +{ + struct ip_rt_acct *dst, *src; + unsigned int i, j; + + dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL); + if (!dst) + return -ENOMEM; + + for_each_possible_cpu(i) { + src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i); + for (j = 0; j < 256; j++) { + dst[j].o_bytes += src[j].o_bytes; + dst[j].o_packets += src[j].o_packets; + dst[j].i_bytes += src[j].i_bytes; + dst[j].i_packets += src[j].i_packets; + } + } + + seq_write(m, dst, 256 * sizeof(struct ip_rt_acct)); + kfree(dst); + return 0; +} + +static int rt_acct_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, rt_acct_proc_show, NULL); +} + +static const struct file_operations rt_acct_proc_fops = { + .owner = THIS_MODULE, + .open = rt_acct_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, 
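/* /proc/net/rt_acct dumps the per-classid (route realm) byte and packet counters summed over all CPUs */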
+}; +#endif + +static int __net_init ip_rt_do_proc_init(struct net *net) +{ + struct proc_dir_entry *pde; + + pde = proc_net_fops_create(net, "rt_cache", S_IRUGO, + &rt_cache_seq_fops); + if (!pde) + goto err1; + + pde = proc_create("rt_cache", S_IRUGO, + net->proc_net_stat, &rt_cpu_seq_fops); + if (!pde) + goto err2; + +#ifdef CONFIG_IP_ROUTE_CLASSID + pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); + if (!pde) + goto err3; +#endif + return 0; + +#ifdef CONFIG_IP_ROUTE_CLASSID +err3: + remove_proc_entry("rt_cache", net->proc_net_stat); +#endif +err2: + remove_proc_entry("rt_cache", net->proc_net); +err1: + return -ENOMEM; +} + +static void __net_exit ip_rt_do_proc_exit(struct net *net) +{ + remove_proc_entry("rt_cache", net->proc_net_stat); + remove_proc_entry("rt_cache", net->proc_net); +#ifdef CONFIG_IP_ROUTE_CLASSID + remove_proc_entry("rt_acct", net->proc_net); +#endif +} + +static struct pernet_operations ip_rt_proc_ops __net_initdata = { + .init = ip_rt_do_proc_init, + .exit = ip_rt_do_proc_exit, +}; + +static int __init ip_rt_proc_init(void) +{ + return register_pernet_subsys(&ip_rt_proc_ops); +} + +#else +static inline int ip_rt_proc_init(void) +{ + return 0; +} +#endif /* CONFIG_PROC_FS */ + +static inline void rt_free(struct rtable *rt) +{ + call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); +} + +static inline void rt_drop(struct rtable *rt) +{ + ip_rt_put(rt); + call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); +} + +static inline int rt_fast_clean(struct rtable *rth) +{ + /* Kill broadcast/multicast entries very aggresively, if they + collide in hash table with more useful entries */ + return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && + rt_is_input_route(rth) && rth->dst.rt_next; +} + +static inline int rt_valuable(struct rtable *rth) +{ + return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || + (rth->peer && rth->peer->pmtu_expires); +} + +static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) +{ + unsigned long age; + int ret = 0; + + if (atomic_read(&rth->dst.__refcnt)) + goto out; + + age = jiffies - rth->dst.lastuse; + if ((age <= tmo1 && !rt_fast_clean(rth)) || + (age <= tmo2 && rt_valuable(rth))) + goto out; + ret = 1; +out: return ret; +} + +/* Bits of score are: + * 31: very valuable + * 30: not quite useless + * 29..0: usage counter + */ +static inline u32 rt_score(struct rtable *rt) +{ + u32 score = jiffies - rt->dst.lastuse; + + score = ~score & ~(3<<30); + + if (rt_valuable(rt)) + score |= (1<<31); + + if (rt_is_output_route(rt) || + !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) + score |= (1<<30); + + return score; +} + +static inline bool rt_caching(const struct net *net) +{ + return net->ipv4.current_rt_cache_rebuild_count <= + net->ipv4.sysctl_rt_cache_rebuild_count; +} + +static inline bool compare_hash_inputs(const struct rtable *rt1, + const struct rtable *rt2) +{ + return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | + ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | + (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0); +} + +static inline int compare_keys(struct rtable *rt1, struct rtable *rt2) +{ + return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | + ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | + (rt1->rt_mark ^ rt2->rt_mark) | + (rt1->rt_key_tos ^ rt2->rt_key_tos) | + (rt1->rt_route_iif ^ rt2->rt_route_iif) | + (rt1->rt_oif ^ rt2->rt_oif)) == 0; +} + +static inline int compare_netns(struct rtable *rt1, struct 
rtable *rt2) +{ + return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev)); +} + +static inline int rt_is_expired(struct rtable *rth) +{ + return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); +} + +/* + * Perform a full scan of hash table and free all entries. + * Can be called by a softirq or a process. + * In the later case, we want to be reschedule if necessary + */ +static void rt_do_flush(struct net *net, int process_context) +{ + unsigned int i; + struct rtable *rth, *next; + + for (i = 0; i <= rt_hash_mask; i++) { + struct rtable __rcu **pprev; + struct rtable *list; + + if (process_context && need_resched()) + cond_resched(); + rth = rcu_dereference_raw(rt_hash_table[i].chain); + if (!rth) + continue; + + spin_lock_bh(rt_hash_lock_addr(i)); + + list = NULL; + pprev = &rt_hash_table[i].chain; + rth = rcu_dereference_protected(*pprev, + lockdep_is_held(rt_hash_lock_addr(i))); + + while (rth) { + next = rcu_dereference_protected(rth->dst.rt_next, + lockdep_is_held(rt_hash_lock_addr(i))); + + if (!net || + net_eq(dev_net(rth->dst.dev), net)) { + rcu_assign_pointer(*pprev, next); + rcu_assign_pointer(rth->dst.rt_next, list); + list = rth; + } else { + pprev = &rth->dst.rt_next; + } + rth = next; + } + + spin_unlock_bh(rt_hash_lock_addr(i)); + + for (; list; list = next) { + next = rcu_dereference_protected(list->dst.rt_next, 1); + rt_free(list); + } + } +} + +/* + * While freeing expired entries, we compute average chain length + * and standard deviation, using fixed-point arithmetic. + * This to have an estimation of rt_chain_length_max + * rt_chain_length_max = max(elasticity, AVG + 4*SD) + * We use 3 bits for frational part, and 29 (or 61) for magnitude. + */ + +#define FRACT_BITS 3 +#define ONE (1UL << FRACT_BITS) + +/* + * Given a hash chain and an item in this hash chain, + * find if a previous entry has the same hash_inputs + * (but differs on tos, mark or oif) + * Returns 0 if an alias is found. + * Returns ONE if rth has no alias before itself. 
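The fixed-point scheme described just above keeps chain lengths in units of ONE = 1 << FRACT_BITS, i.e. eighths, so the average and standard deviation can be accumulated with plain integer arithmetic and converted back only at the end. A standalone sketch of that arithmetic with made-up sample lengths; sqrt() stands in for the kernel's int_sqrt(), and the elasticity value is an assumption:

/* Illustration of the Q.3 fixed-point chain-length statistics. */
#include <stdio.h>
#include <math.h>       /* sqrt(), standing in for the kernel's int_sqrt() */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

int main(void)
{
        /* Pretend per-bucket chain lengths (unique hash inputs only),
         * already scaled by ONE the way has_noalias() scales them. */
        unsigned long lengths[] = { 2 * ONE, 5 * ONE, 3 * ONE, 9 * ONE };
        unsigned long samples = sizeof(lengths) / sizeof(lengths[0]);
        unsigned long sum = 0, sum2 = 0, avg, sd, max_len, i;
        unsigned long elasticity = 8;   /* assumed ip_rt_gc_elasticity */

        for (i = 0; i < samples; i++) {
                sum  += lengths[i];
                sum2 += lengths[i] * lengths[i];
        }

        avg = sum / samples;
        sd  = (unsigned long)sqrt((double)(sum2 / samples - avg * avg));
        max_len = (avg + 4 * sd) >> FRACT_BITS; /* back to whole entries */
        if (max_len < elasticity)
                max_len = elasticity;           /* max(elasticity, AVG + 4*SD) */

        printf("avg=%.3f sd=%.3f -> chain_length_max=%lu\n",
               (double)avg / ONE, (double)sd / ONE, max_len);
        return 0;
}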
+ */ +static int has_noalias(const struct rtable *head, const struct rtable *rth) +{ + const struct rtable *aux = head; + + while (aux != rth) { + if (compare_hash_inputs(aux, rth)) + return 0; + aux = rcu_dereference_protected(aux->dst.rt_next, 1); + } + return ONE; +} + +static void rt_check_expire(void) +{ + static unsigned int rover; + unsigned int i = rover, goal; + struct rtable *rth; + struct rtable __rcu **rthp; + unsigned long samples = 0; + unsigned long sum = 0, sum2 = 0; + unsigned long delta; + u64 mult; + + delta = jiffies - expires_ljiffies; + expires_ljiffies = jiffies; + mult = ((u64)delta) << rt_hash_log; + if (ip_rt_gc_timeout > 1) + do_div(mult, ip_rt_gc_timeout); + goal = (unsigned int)mult; + if (goal > rt_hash_mask) + goal = rt_hash_mask + 1; + for (; goal > 0; goal--) { + unsigned long tmo = ip_rt_gc_timeout; + unsigned long length; + + i = (i + 1) & rt_hash_mask; + rthp = &rt_hash_table[i].chain; + + if (need_resched()) + cond_resched(); + + samples++; + + if (rcu_dereference_raw(*rthp) == NULL) + continue; + length = 0; + spin_lock_bh(rt_hash_lock_addr(i)); + while ((rth = rcu_dereference_protected(*rthp, + lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { + prefetch(rth->dst.rt_next); + if (rt_is_expired(rth)) { + *rthp = rth->dst.rt_next; + rt_free(rth); + continue; + } + if (rth->dst.expires) { + /* Entry is expired even if it is in use */ + if (time_before_eq(jiffies, rth->dst.expires)) { +nofree: + tmo >>= 1; + rthp = &rth->dst.rt_next; + /* + * We only count entries on + * a chain with equal hash inputs once + * so that entries for different QOS + * levels, and other non-hash input + * attributes don't unfairly skew + * the length computation + */ + length += has_noalias(rt_hash_table[i].chain, rth); + continue; + } + } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) + goto nofree; + + /* Cleanup aged off entries. */ + *rthp = rth->dst.rt_next; + rt_free(rth); + } + spin_unlock_bh(rt_hash_lock_addr(i)); + sum += length; + sum2 += length*length; + } + if (samples) { + unsigned long avg = sum / samples; + unsigned long sd = int_sqrt(sum2 / samples - avg*avg); + rt_chain_length_max = max_t(unsigned long, + ip_rt_gc_elasticity, + (avg + 4*sd) >> FRACT_BITS); + } + rover = i; +} + +/* + * rt_worker_func() is run in process context. + * we call rt_check_expire() to scan part of the hash table + */ +static void rt_worker_func(struct work_struct *work) +{ + rt_check_expire(); + schedule_delayed_work(&expires_work, ip_rt_gc_interval); +} + +/* + * Perturbation of rt_genid by a small quantity [1..256] + * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() + * many times (2^24) without giving recent rt_genid. + * Jenkins hash is strong enough that litle changes of rt_genid are OK. 
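rt_check_expire() above sizes each pass so that, on average, the whole hash table is walked once per ip_rt_gc_timeout: the number of buckets visited is the elapsed time scaled by table_size / gc_timeout and capped at one full table. A standalone sketch of that proportional goal; the table size, timeout and delta values are made up for illustration:

/* Proportional scan goal, as in rt_check_expire() (illustrative values). */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        unsigned int rt_hash_log = 10;                  /* 1024 buckets */
        unsigned int rt_hash_mask = (1U << rt_hash_log) - 1;
        unsigned long ip_rt_gc_timeout = 300 * 100;     /* 300 s at an assumed HZ=100 */
        unsigned long deltas[] = { 100, 1500, 60000 };  /* jiffies since the last run */

        for (unsigned int i = 0; i < 3; i++) {
                uint64_t mult = (uint64_t)deltas[i] << rt_hash_log;
                unsigned int goal = (unsigned int)(mult / ip_rt_gc_timeout);

                if (goal > rt_hash_mask)
                        goal = rt_hash_mask + 1;        /* never more than one full pass */
                printf("delta=%lu jiffies -> scan %u of %u buckets\n",
                       deltas[i], goal, rt_hash_mask + 1);
        }
        return 0;
}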
+ */ +static void rt_cache_invalidate(struct net *net) +{ + unsigned char shuffle; + + get_random_bytes(&shuffle, sizeof(shuffle)); + atomic_add(shuffle + 1U, &net->ipv4.rt_genid); +} + +/* + * delay < 0 : invalidate cache (fast : entries will be deleted later) + * delay >= 0 : invalidate & flush cache (can be long) + */ +void rt_cache_flush(struct net *net, int delay) +{ + rt_cache_invalidate(net); + if (delay >= 0) + rt_do_flush(net, !in_softirq()); +} + +/* Flush previous cache invalidated entries from the cache */ +void rt_cache_flush_batch(struct net *net) +{ + rt_do_flush(net, !in_softirq()); +} + +static void rt_emergency_hash_rebuild(struct net *net) +{ + if (net_ratelimit()) + printk(KERN_WARNING "Route hash chain too long!\n"); + rt_cache_invalidate(net); +} + +/* + Short description of GC goals. + + We want to build algorithm, which will keep routing cache + at some equilibrium point, when number of aged off entries + is kept approximately equal to newly generated ones. + + Current expiration strength is variable "expire". + We try to adjust it dynamically, so that if networking + is idle expires is large enough to keep enough of warm entries, + and when load increases it reduces to limit cache size. + */ + +static int rt_garbage_collect(struct dst_ops *ops) +{ + static unsigned long expire = RT_GC_TIMEOUT; + static unsigned long last_gc; + static int rover; + static int equilibrium; + struct rtable *rth; + struct rtable __rcu **rthp; + unsigned long now = jiffies; + int goal; + int entries = dst_entries_get_fast(&ipv4_dst_ops); + + /* + * Garbage collection is pretty expensive, + * do not make it too frequently. + */ + + RT_CACHE_STAT_INC(gc_total); + + if (now - last_gc < ip_rt_gc_min_interval && + entries < ip_rt_max_size) { + RT_CACHE_STAT_INC(gc_ignored); + goto out; + } + + entries = dst_entries_get_slow(&ipv4_dst_ops); + /* Calculate number of entries, which we want to expire now. */ + goal = entries - (ip_rt_gc_elasticity << rt_hash_log); + if (goal <= 0) { + if (equilibrium < ipv4_dst_ops.gc_thresh) + equilibrium = ipv4_dst_ops.gc_thresh; + goal = entries - equilibrium; + if (goal > 0) { + equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1); + goal = entries - equilibrium; + } + } else { + /* We are in dangerous area. Try to reduce cache really + * aggressively. + */ + goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1); + equilibrium = entries - goal; + } + + if (now - last_gc >= ip_rt_gc_min_interval) + last_gc = now; + + if (goal <= 0) { + equilibrium += goal; + goto work_done; + } + + do { + int i, k; + + for (i = rt_hash_mask, k = rover; i >= 0; i--) { + unsigned long tmo = expire; + + k = (k + 1) & rt_hash_mask; + rthp = &rt_hash_table[k].chain; + spin_lock_bh(rt_hash_lock_addr(k)); + while ((rth = rcu_dereference_protected(*rthp, + lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) { + if (!rt_is_expired(rth) && + !rt_may_expire(rth, tmo, expire)) { + tmo >>= 1; + rthp = &rth->dst.rt_next; + continue; + } + *rthp = rth->dst.rt_next; + rt_free(rth); + goal--; + } + spin_unlock_bh(rt_hash_lock_addr(k)); + if (goal <= 0) + break; + } + rover = k; + + if (goal <= 0) + goto work_done; + + /* Goal is not achieved. We stop process if: + + - if expire reduced to zero. Otherwise, expire is halfed. + - if table is not full. + - if we are called from interrupt. + - jiffies check is just fallback/debug loop breaker. + We will not spin here for long time in any case. 
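The goal/equilibrium arithmetic at the top of rt_garbage_collect() above aims the cache at a steady size: below elasticity * table_size entries it expires only what exceeds the equilibrium point, above that it halves the excess but never targets less than one table's worth. A standalone sketch of the same computation, with assumed values for the sysctls and gc_thresh; a non-positive goal simply means nothing needs to be expired:

/* Sketch of the goal/equilibrium computation in rt_garbage_collect(). */
#include <stdio.h>

static int min_i(int a, int b) { return a < b ? a : b; }
static int max_i(int a, int b) { return a > b ? a : b; }

int main(void)
{
        int rt_hash_log = 10;
        int rt_hash_mask = (1 << rt_hash_log) - 1;
        int ip_rt_gc_elasticity = 8;            /* assumed */
        int gc_thresh = 4096;                   /* stand-in for ipv4_dst_ops.gc_thresh */
        int entry_counts[] = { 3000, 6000, 20000 };
        int equilibrium = 0;                    /* static in the kernel function */

        for (int i = 0; i < 3; i++) {
                int entries = entry_counts[i];
                /* "Dangerous area" begins past elasticity * table_size entries. */
                int goal = entries - (ip_rt_gc_elasticity << rt_hash_log);

                if (goal <= 0) {
                        if (equilibrium < gc_thresh)
                                equilibrium = gc_thresh;
                        goal = entries - equilibrium;
                        if (goal > 0) {
                                equilibrium += min_i(goal >> 1, rt_hash_mask + 1);
                                goal = entries - equilibrium;
                        }
                } else {
                        /* Over the soft limit: shrink aggressively. */
                        goal = max_i(goal >> 1, rt_hash_mask + 1);
                        equilibrium = entries - goal;
                }
                printf("entries=%5d -> expire goal=%5d, equilibrium=%5d\n",
                       entries, goal, equilibrium);
        }
        return 0;
}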
+ */ + + RT_CACHE_STAT_INC(gc_goal_miss); + + if (expire == 0) + break; + + expire >>= 1; + + if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) + goto out; + } while (!in_softirq() && time_before_eq(jiffies, now)); + + if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) + goto out; + if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size) + goto out; + if (net_ratelimit()) + printk(KERN_WARNING "dst cache overflow\n"); + RT_CACHE_STAT_INC(gc_dst_overflow); + return 1; + +work_done: + expire += ip_rt_gc_min_interval; + if (expire > ip_rt_gc_timeout || + dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh || + dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh) + expire = ip_rt_gc_timeout; +out: return 0; +} + +/* + * Returns number of entries in a hash chain that have different hash_inputs + */ +static int slow_chain_length(const struct rtable *head) +{ + int length = 0; + const struct rtable *rth = head; + + while (rth) { + length += has_noalias(head, rth); + rth = rcu_dereference_protected(rth->dst.rt_next, 1); + } + return length >> FRACT_BITS; +} + +static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt, + struct sk_buff *skb, int ifindex) +{ + struct rtable *rth, *cand; + struct rtable __rcu **rthp, **candp; + unsigned long now; + u32 min_score; + int chain_length; + int attempts = !in_softirq(); + +restart: + chain_length = 0; + min_score = ~(u32)0; + cand = NULL; + candp = NULL; + now = jiffies; + + if (!rt_caching(dev_net(rt->dst.dev))) { + /* + * If we're not caching, just tell the caller we + * were successful and don't touch the route. The + * caller hold the sole reference to the cache entry, and + * it will be released when the caller is done with it. + * If we drop it here, the callers have no way to resolve routes + * when we're not caching. Instead, just point *rp at rt, so + * the caller gets a single use out of the route + * Note that we do rt_free on this new route entry, so that + * once its refcount hits zero, we are still able to reap it + * (Thanks Alexey) + * Note: To avoid expensive rcu stuff for this uncached dst, + * we set DST_NOCACHE so that dst_release() can free dst without + * waiting a grace period. + */ + + rt->dst.flags |= DST_NOCACHE; + if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { + int err = arp_bind_neighbour(&rt->dst); + if (err) { + if (net_ratelimit()) + printk(KERN_WARNING + "Neighbour table failure & not caching routes.\n"); + ip_rt_put(rt); + return ERR_PTR(err); + } + } + + goto skip_hashing; + } + + rthp = &rt_hash_table[hash].chain; + + spin_lock_bh(rt_hash_lock_addr(hash)); + while ((rth = rcu_dereference_protected(*rthp, + lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { + if (rt_is_expired(rth)) { + *rthp = rth->dst.rt_next; + rt_free(rth); + continue; + } + if (compare_keys(rth, rt) && compare_netns(rth, rt)) { + /* Put it first */ + *rthp = rth->dst.rt_next; + /* + * Since lookup is lockfree, the deletion + * must be visible to another weakly ordered CPU before + * the insertion at the start of the hash chain. + */ + rcu_assign_pointer(rth->dst.rt_next, + rt_hash_table[hash].chain); + /* + * Since lookup is lockfree, the update writes + * must be ordered for consistency on SMP. 
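The ordering comments above are the heart of the lock-free chain update: an entry's own fields, including its rt_next link, must be globally visible before the bucket head is made to point at it, which is what rcu_assign_pointer() guarantees. A minimal userspace analogue of that publish/lookup ordering, using C11 release/acquire atomics as stand-ins for rcu_assign_pointer()/rcu_dereference(); this is an illustration, not the kernel API:

/* Publish a node so that readers never see an uninitialized entry. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
        int key;
        struct node *next;      /* written before publication */
};

static _Atomic(struct node *) head;

static void publish(int key)
{
        struct node *n = malloc(sizeof(*n));

        if (!n)
                return;
        n->key = key;
        /* Link to the current chain first ... */
        n->next = atomic_load_explicit(&head, memory_order_relaxed);
        /* ... then make the node visible; the release store orders the
         * writes to n->key and n->next before the write to head, which
         * is the property rcu_assign_pointer() provides in the kernel. */
        atomic_store_explicit(&head, n, memory_order_release);
}

static struct node *lookup(int key)
{
        /* Acquire pairs with the release above (rcu_dereference() relies
         * on data dependencies instead, which is cheaper still). */
        struct node *p = atomic_load_explicit(&head, memory_order_acquire);

        for (; p; p = p->next)
                if (p->key == key)
                        return p;
        return NULL;
}

int main(void)
{
        publish(1);
        publish(2);
        printf("found %d\n", lookup(1) ? lookup(1)->key : -1);
        return 0;
}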
+ */ + rcu_assign_pointer(rt_hash_table[hash].chain, rth); + + dst_use(&rth->dst, now); + spin_unlock_bh(rt_hash_lock_addr(hash)); + + rt_drop(rt); + if (skb) + skb_dst_set(skb, &rth->dst); + return rth; + } + + if (!atomic_read(&rth->dst.__refcnt)) { + u32 score = rt_score(rth); + + if (score <= min_score) { + cand = rth; + candp = rthp; + min_score = score; + } + } + + chain_length++; + + rthp = &rth->dst.rt_next; + } + + if (cand) { + /* ip_rt_gc_elasticity used to be average length of chain + * length, when exceeded gc becomes really aggressive. + * + * The second limit is less certain. At the moment it allows + * only 2 entries per bucket. We will see. + */ + if (chain_length > ip_rt_gc_elasticity) { + *candp = cand->dst.rt_next; + rt_free(cand); + } + } else { + if (chain_length > rt_chain_length_max && + slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) { + struct net *net = dev_net(rt->dst.dev); + int num = ++net->ipv4.current_rt_cache_rebuild_count; + if (!rt_caching(net)) { + printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n", + rt->dst.dev->name, num); + } + rt_emergency_hash_rebuild(net); + spin_unlock_bh(rt_hash_lock_addr(hash)); + + hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, + ifindex, rt_genid(net)); + goto restart; + } + } + + /* Try to bind route to arp only if it is output + route or unicast forwarding path. + */ + if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { + int err = arp_bind_neighbour(&rt->dst); + if (err) { + spin_unlock_bh(rt_hash_lock_addr(hash)); + + if (err != -ENOBUFS) { + rt_drop(rt); + return ERR_PTR(err); + } + + /* Neighbour tables are full and nothing + can be released. Try to shrink route cache, + it is most likely it holds some neighbour records. + */ + if (attempts-- > 0) { + int saved_elasticity = ip_rt_gc_elasticity; + int saved_int = ip_rt_gc_min_interval; + ip_rt_gc_elasticity = 1; + ip_rt_gc_min_interval = 0; + rt_garbage_collect(&ipv4_dst_ops); + ip_rt_gc_min_interval = saved_int; + ip_rt_gc_elasticity = saved_elasticity; + goto restart; + } + + if (net_ratelimit()) + printk(KERN_WARNING "ipv4: Neighbour table overflow.\n"); + rt_drop(rt); + return ERR_PTR(-ENOBUFS); + } + } + + rt->dst.rt_next = rt_hash_table[hash].chain; + + /* + * Since lookup is lockfree, we must make sure + * previous writes to rt are committed to memory + * before making rt visible to other CPUS. + */ + rcu_assign_pointer(rt_hash_table[hash].chain, rt); + + spin_unlock_bh(rt_hash_lock_addr(hash)); + +skip_hashing: + if (skb) + skb_dst_set(skb, &rt->dst); + return rt; +} + +static atomic_t __rt_peer_genid = ATOMIC_INIT(0); + +static u32 rt_peer_genid(void) +{ + return atomic_read(&__rt_peer_genid); +} + +void rt_bind_peer(struct rtable *rt, __be32 daddr, int create) +{ + struct inet_peer *peer; + + peer = inet_getpeer_v4(daddr, create); + + if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) + inet_putpeer(peer); + else + rt->rt_peer_genid = rt_peer_genid(); +} + +/* + * Peer allocation may fail only in serious out-of-memory conditions. However + * we still can generate some output. + * Random ID selection looks a bit dangerous because we have no chances to + * select ID being unique in a reasonable period of time. + * But broken packet identifier may be better than no packet at all. 
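rt_bind_peer(), just above, attaches the inet_peer with a lock-free install-once idiom: look the object up optimistically, cmpxchg() it into the NULL slot, and drop the extra reference if another CPU won the race. A userspace sketch of the same pattern with C11 compare-and-swap; the reference counting is simplified to a plain free here:

/* Install-once binding with compare-and-swap, as in rt_bind_peer(). */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct peer {
        unsigned int daddr;
};

struct route {
        _Atomic(struct peer *) peer;
};

static void bind_peer(struct route *rt, unsigned int daddr)
{
        struct peer *p = malloc(sizeof(*p));
        struct peer *expected = NULL;

        if (!p)
                return;
        p->daddr = daddr;

        /* Succeeds only if rt->peer is still NULL; otherwise another
         * context already bound a peer and we discard our copy, which is
         * what the cmpxchg() + inet_putpeer() pair does in the kernel. */
        if (!atomic_compare_exchange_strong(&rt->peer, &expected, p))
                free(p);
}

int main(void)
{
        struct route rt;

        atomic_init(&rt.peer, NULL);
        bind_peer(&rt, 0x0a000001);     /* first call installs the peer */
        bind_peer(&rt, 0x0a000002);     /* second call loses and frees its copy */
        printf("bound daddr=0x%x\n", atomic_load(&rt.peer)->daddr);
        return 0;
}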
+ */ +static void ip_select_fb_ident(struct iphdr *iph) +{ + static DEFINE_SPINLOCK(ip_fb_id_lock); + static u32 ip_fallback_id; + u32 salt; + + spin_lock_bh(&ip_fb_id_lock); + salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr); + iph->id = htons(salt & 0xFFFF); + ip_fallback_id = salt; + spin_unlock_bh(&ip_fb_id_lock); +} + +void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) +{ + struct rtable *rt = (struct rtable *) dst; + + if (rt) { + if (rt->peer == NULL) + rt_bind_peer(rt, rt->rt_dst, 1); + + /* If peer is attached to destination, it is never detached, + so that we need not to grab a lock to dereference it. + */ + if (rt->peer) { + iph->id = htons(inet_getid(rt->peer, more)); + return; + } + } else + printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", + __builtin_return_address(0)); + + ip_select_fb_ident(iph); +} +EXPORT_SYMBOL(__ip_select_ident); + +static void rt_del(unsigned hash, struct rtable *rt) +{ + struct rtable __rcu **rthp; + struct rtable *aux; + + rthp = &rt_hash_table[hash].chain; + spin_lock_bh(rt_hash_lock_addr(hash)); + ip_rt_put(rt); + while ((aux = rcu_dereference_protected(*rthp, + lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { + if (aux == rt || rt_is_expired(aux)) { + *rthp = aux->dst.rt_next; + rt_free(aux); + continue; + } + rthp = &aux->dst.rt_next; + } + spin_unlock_bh(rt_hash_lock_addr(hash)); +} + +static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) +{ + struct rtable *rt = (struct rtable *) dst; + __be32 orig_gw = rt->rt_gateway; + struct neighbour *n, *old_n; + + dst_confirm(&rt->dst); + + rt->rt_gateway = peer->redirect_learned.a4; + n = __arp_bind_neighbour(&rt->dst, rt->rt_gateway); + if (IS_ERR(n)) + return PTR_ERR(n); + old_n = xchg(&rt->dst._neighbour, n); + if (old_n) + neigh_release(old_n); + if (!n || !(n->nud_state & NUD_VALID)) { + if (n) + neigh_event_send(n, NULL); + rt->rt_gateway = orig_gw; + return -EAGAIN; + } else { + rt->rt_flags |= RTCF_REDIRECTED; + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); + } + return 0; +} + +/* called in rcu_read_lock() section */ +void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, + __be32 saddr, struct net_device *dev) +{ + int s, i; + struct in_device *in_dev = __in_dev_get_rcu(dev); + __be32 skeys[2] = { saddr, 0 }; + int ikeys[2] = { dev->ifindex, 0 }; + struct inet_peer *peer; + struct net *net; + + if (!in_dev) + return; + + net = dev_net(dev); + if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) || + ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) || + ipv4_is_zeronet(new_gw)) + goto reject_redirect; + + if (!IN_DEV_SHARED_MEDIA(in_dev)) { + if (!inet_addr_onlink(in_dev, new_gw, old_gw)) + goto reject_redirect; + if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) + goto reject_redirect; + } else { + if (inet_addr_type(net, new_gw) != RTN_UNICAST) + goto reject_redirect; + } + + for (s = 0; s < 2; s++) { + for (i = 0; i < 2; i++) { + unsigned int hash; + struct rtable __rcu **rthp; + struct rtable *rt; + + hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net)); + + rthp = &rt_hash_table[hash].chain; + + while ((rt = rcu_dereference(*rthp)) != NULL) { + rthp = &rt->dst.rt_next; + + if (rt->rt_key_dst != daddr || + rt->rt_key_src != skeys[s] || + rt->rt_oif != ikeys[i] || + rt_is_input_route(rt) || + rt_is_expired(rt) || + !net_eq(dev_net(rt->dst.dev), net) || + rt->dst.error || + rt->dst.dev != dev || + rt->rt_gateway != old_gw) + continue; + + if (!rt->peer) + rt_bind_peer(rt, rt->rt_dst, 1); + + 
peer = rt->peer; + if (peer) { + if (peer->redirect_learned.a4 != new_gw) { + peer->redirect_learned.a4 = new_gw; + atomic_inc(&__rt_peer_genid); + } + check_peer_redir(&rt->dst, peer); + } + } + } + } + return; + +reject_redirect: +#ifdef CONFIG_IP_ROUTE_VERBOSE + if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) + printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n" + " Advised path = %pI4 -> %pI4\n", + &old_gw, dev->name, &new_gw, + &saddr, &daddr); +#endif + ; +} + +static bool peer_pmtu_expired(struct inet_peer *peer) +{ + unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); + + return orig && + time_after_eq(jiffies, orig) && + cmpxchg(&peer->pmtu_expires, orig, 0) == orig; +} + +static bool peer_pmtu_cleaned(struct inet_peer *peer) +{ + unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); + + return orig && + cmpxchg(&peer->pmtu_expires, orig, 0) == orig; +} + +static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) +{ + struct rtable *rt = (struct rtable *)dst; + struct dst_entry *ret = dst; + + if (rt) { + if (dst->obsolete > 0) { + ip_rt_put(rt); + ret = NULL; + } else if (rt->rt_flags & RTCF_REDIRECTED) { + unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, + rt->rt_oif, + rt_genid(dev_net(dst->dev))); + rt_del(hash, rt); + ret = NULL; + } else if (rt->peer && peer_pmtu_expired(rt->peer)) { + dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig); + } + } + return ret; +} + +/* + * Algorithm: + * 1. The first ip_rt_redirect_number redirects are sent + * with exponential backoff, then we stop sending them at all, + * assuming that the host ignores our redirects. + * 2. If we did not see packets requiring redirects + * during ip_rt_redirect_silence, we assume that the host + * forgot redirected route and start to send redirects again. + * + * This algorithm is much cheaper and more intelligent than dumb load limiting + * in icmp.c. + * + * NOTE. Do not forget to inhibit load limiting for redirects (redundant) + * and "frag. need" (breaks PMTU discovery) in icmp.c. + */ + +void ip_rt_send_redirect(struct sk_buff *skb) +{ + struct rtable *rt = skb_rtable(skb); + struct in_device *in_dev; + struct inet_peer *peer; + int log_martians; + + rcu_read_lock(); + in_dev = __in_dev_get_rcu(rt->dst.dev); + if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) { + rcu_read_unlock(); + return; + } + log_martians = IN_DEV_LOG_MARTIANS(in_dev); + rcu_read_unlock(); + + if (!rt->peer) + rt_bind_peer(rt, rt->rt_dst, 1); + peer = rt->peer; + if (!peer) { + icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); + return; + } + + /* No redirected packets during ip_rt_redirect_silence; + * reset the algorithm. + */ + if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) + peer->rate_tokens = 0; + + /* Too many ignored redirects; do not send anything + * set dst.rate_last to the last seen redirected packet. + */ + if (peer->rate_tokens >= ip_rt_redirect_number) { + peer->rate_last = jiffies; + return; + } + + /* Check for load limit; set rate_last to the latest sent + * redirect. 
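The token handling above, continued right below, implements the exponential backoff from the "Algorithm" comment: the k-th consecutive redirect to a peer goes out only after ip_rt_redirect_load << k jiffies, and after ip_rt_redirect_number redirects we stop entirely until ip_rt_redirect_silence has passed. A small sketch of that schedule; HZ and the two sysctl values used here are assumptions, not necessarily the defaults:

/* Exponential backoff schedule for ICMP redirects (illustrative values). */
#include <stdio.h>

int main(void)
{
        unsigned int hz = 100;                          /* assumed HZ */
        unsigned int ip_rt_redirect_load = hz / 50;     /* assumed: 2 jiffies */
        unsigned int ip_rt_redirect_number = 9;         /* assumed */

        printf("redirect #  minimum gap since previous (jiffies)\n");
        for (unsigned int tokens = 0; tokens < ip_rt_redirect_number; tokens++) {
                /* tokens == 0 means the first redirect goes out immediately. */
                unsigned int gap = tokens ? ip_rt_redirect_load << tokens : 0;

                printf("%10u  %u\n", tokens + 1, gap);
        }
        printf("after %u redirects the host is assumed to ignore us;\n"
               "sending resumes only after ip_rt_redirect_silence.\n",
               ip_rt_redirect_number);
        return 0;
}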
+ */ + if (peer->rate_tokens == 0 || + time_after(jiffies, + (peer->rate_last + + (ip_rt_redirect_load << peer->rate_tokens)))) { + icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); + peer->rate_last = jiffies; + ++peer->rate_tokens; +#ifdef CONFIG_IP_ROUTE_VERBOSE + if (log_martians && + peer->rate_tokens == ip_rt_redirect_number && + net_ratelimit()) + printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", + &ip_hdr(skb)->saddr, rt->rt_iif, + &rt->rt_dst, &rt->rt_gateway); +#endif + } +} + +static int ip_error(struct sk_buff *skb) +{ + struct rtable *rt = skb_rtable(skb); + struct inet_peer *peer; + unsigned long now; + bool send; + int code; + + switch (rt->dst.error) { + case EINVAL: + default: + goto out; + case EHOSTUNREACH: + code = ICMP_HOST_UNREACH; + break; + case ENETUNREACH: + code = ICMP_NET_UNREACH; + IP_INC_STATS_BH(dev_net(rt->dst.dev), + IPSTATS_MIB_INNOROUTES); + break; + case EACCES: + code = ICMP_PKT_FILTERED; + break; + } + + if (!rt->peer) + rt_bind_peer(rt, rt->rt_dst, 1); + peer = rt->peer; + + send = true; + if (peer) { + now = jiffies; + peer->rate_tokens += now - peer->rate_last; + if (peer->rate_tokens > ip_rt_error_burst) + peer->rate_tokens = ip_rt_error_burst; + peer->rate_last = now; + if (peer->rate_tokens >= ip_rt_error_cost) + peer->rate_tokens -= ip_rt_error_cost; + else + send = false; + } + if (send) + icmp_send(skb, ICMP_DEST_UNREACH, code, 0); + +out: kfree_skb(skb); + return 0; +} + +/* + * The last two values are not from the RFC but + * are needed for AMPRnet AX.25 paths. + */ + +static const unsigned short mtu_plateau[] = +{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; + +static inline unsigned short guess_mtu(unsigned short old_mtu) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++) + if (old_mtu > mtu_plateau[i]) + return mtu_plateau[i]; + return 68; +} + +unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph, + unsigned short new_mtu, + struct net_device *dev) +{ + unsigned short old_mtu = ntohs(iph->tot_len); + unsigned short est_mtu = 0; + struct inet_peer *peer; + + peer = inet_getpeer_v4(iph->daddr, 1); + if (peer) { + unsigned short mtu = new_mtu; + + if (new_mtu < 68 || new_mtu >= old_mtu) { + /* BSD 4.2 derived systems incorrectly adjust + * tot_len by the IP header length, and report + * a zero MTU in the ICMP message. + */ + if (mtu == 0 && + old_mtu >= 68 + (iph->ihl << 2)) + old_mtu -= iph->ihl << 2; + mtu = guess_mtu(old_mtu); + } + + if (mtu < ip_rt_min_pmtu) + mtu = ip_rt_min_pmtu; + if (!peer->pmtu_expires || mtu < peer->pmtu_learned) { + unsigned long pmtu_expires; + + pmtu_expires = jiffies + ip_rt_mtu_expires; + if (!pmtu_expires) + pmtu_expires = 1UL; + + est_mtu = mtu; + peer->pmtu_learned = mtu; + peer->pmtu_expires = pmtu_expires; + } + + inet_putpeer(peer); + + atomic_inc(&__rt_peer_genid); + } + return est_mtu ? 
: new_mtu; +} + +static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer) +{ + unsigned long expires = ACCESS_ONCE(peer->pmtu_expires); + + if (!expires) + return; + if (time_before(jiffies, expires)) { + u32 orig_dst_mtu = dst_mtu(dst); + if (peer->pmtu_learned < orig_dst_mtu) { + if (!peer->pmtu_orig) + peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU); + dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned); + } + } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires) + dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig); +} + +static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) +{ + struct rtable *rt = (struct rtable *) dst; + struct inet_peer *peer; + + dst_confirm(dst); + + if (!rt->peer) + rt_bind_peer(rt, rt->rt_dst, 1); + peer = rt->peer; + if (peer) { + unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires); + + if (mtu < ip_rt_min_pmtu) + mtu = ip_rt_min_pmtu; + if (!pmtu_expires || mtu < peer->pmtu_learned) { + + pmtu_expires = jiffies + ip_rt_mtu_expires; + if (!pmtu_expires) + pmtu_expires = 1UL; + + peer->pmtu_learned = mtu; + peer->pmtu_expires = pmtu_expires; + + atomic_inc(&__rt_peer_genid); + rt->rt_peer_genid = rt_peer_genid(); + } + check_peer_pmtu(dst, peer); + } +} + +static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) +{ + struct rtable *rt = (struct rtable *) dst; + + if (rt_is_expired(rt)) + return NULL; + if (rt->rt_peer_genid != rt_peer_genid()) { + struct inet_peer *peer; + + if (!rt->peer) + rt_bind_peer(rt, rt->rt_dst, 0); + + peer = rt->peer; + if (peer) { + check_peer_pmtu(dst, peer); + + if (peer->redirect_learned.a4 && + peer->redirect_learned.a4 != rt->rt_gateway) { + if (check_peer_redir(dst, peer)) + return NULL; + } + } + + rt->rt_peer_genid = rt_peer_genid(); + } + return dst; +} + +static void ipv4_dst_destroy(struct dst_entry *dst) +{ + struct rtable *rt = (struct rtable *) dst; + struct inet_peer *peer = rt->peer; + + if (rt->fi) { + fib_info_put(rt->fi); + rt->fi = NULL; + } + if (peer) { + rt->peer = NULL; + inet_putpeer(peer); + } +} + + +static void ipv4_link_failure(struct sk_buff *skb) +{ + struct rtable *rt; + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); + + rt = skb_rtable(skb); + if (rt && rt->peer && peer_pmtu_cleaned(rt->peer)) + dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig); +} + +static int ip_rt_bug(struct sk_buff *skb) +{ + printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n", + &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, + skb->dev ? skb->dev->name : "?"); + kfree_skb(skb); + WARN_ON(1); + return 0; +} + +/* + We do not cache source address of outgoing interface, + because it is used only by IP RR, TS and SRR options, + so that it out of fast path. + + BTW remember: "addr" is allowed to be not aligned + in IP options! 
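The PMTU learning path above (guess_mtu(), ip_rt_frag_needed() and check_peer_pmtu()) distrusts implausible ICMP-reported MTUs, falls back to the next plateau below the dropped packet's size, and never learns anything smaller than ip_rt_min_pmtu. A standalone sketch of that selection; the ip_rt_min_pmtu value is assumed for illustration:

/* Plateau-based PMTU guessing, as in guess_mtu()/ip_rt_frag_needed(). */
#include <stdio.h>

static const unsigned short mtu_plateau[] =
        { 32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static unsigned short guess_mtu(unsigned short old_mtu)
{
        for (unsigned int i = 0; i < sizeof(mtu_plateau) / sizeof(mtu_plateau[0]); i++)
                if (old_mtu > mtu_plateau[i])
                        return mtu_plateau[i];
        return 68;      /* absolute IPv4 minimum */
}

int main(void)
{
        unsigned int ip_rt_min_pmtu = 552;      /* value assumed for illustration */
        unsigned int old_mtu = 1500;            /* tot_len of the dropped packet */
        unsigned int new_mtu = 0;               /* a broken router reported 0 */
        unsigned int mtu = new_mtu;

        /* A zero or implausible MTU in the ICMP message means we fall back
         * to the next plateau below the packet that was dropped. */
        if (new_mtu < 68 || new_mtu >= old_mtu)
                mtu = guess_mtu((unsigned short)old_mtu);
        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        printf("old=%u reported=%u -> learned PMTU %u\n", old_mtu, new_mtu, mtu);
        return 0;
}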
+ */ + +void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) +{ + __be32 src; + + if (rt_is_output_route(rt)) + src = ip_hdr(skb)->saddr; + else { + struct fib_result res; + struct flowi4 fl4; + struct iphdr *iph; + + iph = ip_hdr(skb); + + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = iph->daddr; + fl4.saddr = iph->saddr; + fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_oif = rt->dst.dev->ifindex; + fl4.flowi4_iif = skb->dev->ifindex; + fl4.flowi4_mark = skb->mark; + + rcu_read_lock(); + if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) + src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); + else + src = inet_select_addr(rt->dst.dev, rt->rt_gateway, + RT_SCOPE_UNIVERSE); + rcu_read_unlock(); + } + memcpy(addr, &src, 4); +} + +#ifdef CONFIG_IP_ROUTE_CLASSID +static void set_class_tag(struct rtable *rt, u32 tag) +{ + if (!(rt->dst.tclassid & 0xFFFF)) + rt->dst.tclassid |= tag & 0xFFFF; + if (!(rt->dst.tclassid & 0xFFFF0000)) + rt->dst.tclassid |= tag & 0xFFFF0000; +} +#endif + +static unsigned int ipv4_default_advmss(const struct dst_entry *dst) +{ + unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS); + + if (advmss == 0) { + advmss = max_t(unsigned int, dst->dev->mtu - 40, + ip_rt_min_advmss); + if (advmss > 65535 - 40) + advmss = 65535 - 40; + } + return advmss; +} + +static unsigned int ipv4_default_mtu(const struct dst_entry *dst) +{ + unsigned int mtu = dst->dev->mtu; + + if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { + const struct rtable *rt = (const struct rtable *) dst; + + if (rt->rt_gateway != rt->rt_dst && mtu > 576) + mtu = 576; + } + + if (mtu > IP_MAX_MTU) + mtu = IP_MAX_MTU; + + return mtu; +} + +static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, + struct fib_info *fi) +{ + struct inet_peer *peer; + int create = 0; + + /* If a peer entry exists for this destination, we must hook + * it up in order to get at cached metrics. 
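ipv4_default_advmss() and ipv4_default_mtu() above derive their defaults from simple arithmetic: the advertised MSS is the device MTU minus 40 bytes of minimal IPv4 plus TCP headers, raised to ip_rt_min_advmss and capped at 65535 - 40, and a locked-MTU route that goes through a gateway is additionally clamped to 576. A standalone sketch of both computations; the ip_rt_min_advmss value and the jumbo MTU are assumptions:

/* Default advmss and MTU clamping, mirroring ipv4_default_advmss()
 * and ipv4_default_mtu() (standalone arithmetic only). */
#include <stdio.h>

int main(void)
{
        unsigned int ip_rt_min_advmss = 256;    /* assumed sysctl value */
        unsigned int dev_mtu = 9000;            /* e.g. a jumbo-frame device */
        unsigned int advmss;

        /* 40 bytes = minimal IPv4 header (20) + minimal TCP header (20). */
        advmss = dev_mtu - 40;
        if (advmss < ip_rt_min_advmss)
                advmss = ip_rt_min_advmss;
        if (advmss > 65535 - 40)
                advmss = 65535 - 40;
        printf("dev mtu %u -> default advmss %u\n", dev_mtu, advmss);

        /* For a locked-MTU route through a gateway, the default route MTU
         * is clamped down to 576. */
        unsigned int route_mtu = dev_mtu;
        int mtu_locked = 1, gatewayed = 1;

        if (mtu_locked && gatewayed && route_mtu > 576)
                route_mtu = 576;
        printf("locked+gatewayed route mtu -> %u\n", route_mtu);
        return 0;
}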
+ */ + if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS)) + create = 1; + + rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create); + if (peer) { + rt->rt_peer_genid = rt_peer_genid(); + if (inet_metrics_new(peer)) + memcpy(peer->metrics, fi->fib_metrics, + sizeof(u32) * RTAX_MAX); + dst_init_metrics(&rt->dst, peer->metrics, false); + + check_peer_pmtu(&rt->dst, peer); + if (peer->redirect_learned.a4 && + peer->redirect_learned.a4 != rt->rt_gateway) { + rt->rt_gateway = peer->redirect_learned.a4; + rt->rt_flags |= RTCF_REDIRECTED; + } + } else { + if (fi->fib_metrics != (u32 *) dst_default_metrics) { + rt->fi = fi; + atomic_inc(&fi->fib_clntref); + } + dst_init_metrics(&rt->dst, fi->fib_metrics, true); + } +} + +static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, + const struct fib_result *res, + struct fib_info *fi, u16 type, u32 itag) +{ + struct dst_entry *dst = &rt->dst; + + if (fi) { + if (FIB_RES_GW(*res) && + FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) + rt->rt_gateway = FIB_RES_GW(*res); + rt_init_metrics(rt, fl4, fi); +#ifdef CONFIG_IP_ROUTE_CLASSID + dst->tclassid = FIB_RES_NH(*res).nh_tclassid; +#endif + } + + if (dst_mtu(dst) > IP_MAX_MTU) + dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU); + if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40) + dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40); + +#ifdef CONFIG_IP_ROUTE_CLASSID +#ifdef CONFIG_IP_MULTIPLE_TABLES + set_class_tag(rt, fib_rules_tclass(res)); +#endif + set_class_tag(rt, itag); +#endif +} + +static struct rtable *rt_dst_alloc(struct net_device *dev, + bool nopolicy, bool noxfrm) +{ + return dst_alloc(&ipv4_dst_ops, dev, 1, -1, + DST_HOST | + (nopolicy ? DST_NOPOLICY : 0) | + (noxfrm ? DST_NOXFRM : 0)); +} + +/* called in rcu_read_lock() section */ +static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, int our) +{ + unsigned int hash; + struct rtable *rth; + __be32 spec_dst; + struct in_device *in_dev = __in_dev_get_rcu(dev); + u32 itag = 0; + int err; + + /* Primary sanity checks. 
*/ + + if (in_dev == NULL) + return -EINVAL; + + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || + ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP)) + goto e_inval; + + if (ipv4_is_zeronet(saddr)) { + if (!ipv4_is_local_multicast(daddr)) + goto e_inval; + spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); + } else { + err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, + &itag); + if (err < 0) + goto e_err; + } + rth = rt_dst_alloc(init_net.loopback_dev, + IN_DEV_CONF_GET(in_dev, NOPOLICY), false); + if (!rth) + goto e_nobufs; + +#ifdef CONFIG_IP_ROUTE_CLASSID + rth->dst.tclassid = itag; +#endif + rth->dst.output = ip_rt_bug; + + rth->rt_key_dst = daddr; + rth->rt_key_src = saddr; + rth->rt_genid = rt_genid(dev_net(dev)); + rth->rt_flags = RTCF_MULTICAST; + rth->rt_type = RTN_MULTICAST; + rth->rt_key_tos = tos; + rth->rt_dst = daddr; + rth->rt_src = saddr; + rth->rt_route_iif = dev->ifindex; + rth->rt_iif = dev->ifindex; + rth->rt_oif = 0; + rth->rt_mark = skb->mark; + rth->rt_gateway = daddr; + rth->rt_spec_dst= spec_dst; + rth->rt_peer_genid = 0; + rth->peer = NULL; + rth->fi = NULL; + if (our) { + rth->dst.input= ip_local_deliver; + rth->rt_flags |= RTCF_LOCAL; + } + +#ifdef CONFIG_IP_MROUTE + if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) + rth->dst.input = ip_mr_input; +#endif + RT_CACHE_STAT_INC(in_slow_mc); + + hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); + rth = rt_intern_hash(hash, rth, skb, dev->ifindex); + return IS_ERR(rth) ? PTR_ERR(rth) : 0; + +e_nobufs: + return -ENOBUFS; +e_inval: + return -EINVAL; +e_err: + return err; +} + + +static void ip_handle_martian_source(struct net_device *dev, + struct in_device *in_dev, + struct sk_buff *skb, + __be32 daddr, + __be32 saddr) +{ + RT_CACHE_STAT_INC(in_martian_src); +#ifdef CONFIG_IP_ROUTE_VERBOSE + if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) { + /* + * RFC1812 recommendation, if source is martian, + * the only hint is MAC header. + */ + printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n", + &daddr, &saddr, dev->name); + if (dev->hard_header_len && skb_mac_header_was_set(skb)) { + int i; + const unsigned char *p = skb_mac_header(skb); + printk(KERN_WARNING "ll header: "); + for (i = 0; i < dev->hard_header_len; i++, p++) { + printk("%02x", *p); + if (i < (dev->hard_header_len - 1)) + printk(":"); + } + printk("\n"); + } + } +#endif +} + +/* called in rcu_read_lock() section */ +static int __mkroute_input(struct sk_buff *skb, + const struct fib_result *res, + struct in_device *in_dev, + __be32 daddr, __be32 saddr, u32 tos, + struct rtable **result) +{ + struct rtable *rth; + int err; + struct in_device *out_dev; + unsigned int flags = 0; + __be32 spec_dst; + u32 itag; + + /* get a working reference to the output device */ + out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); + if (out_dev == NULL) { + if (net_ratelimit()) + printk(KERN_CRIT "Bug in ip_route_input" \ + "_slow(). Please, report\n"); + return -EINVAL; + } + + + err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), + in_dev->dev, &spec_dst, &itag); + if (err < 0) { + ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, + saddr); + + goto cleanup; + } + + if (err) + flags |= RTCF_DIRECTSRC; + + if (out_dev == in_dev && err && + (IN_DEV_SHARED_MEDIA(out_dev) || + inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) + flags |= RTCF_DOREDIRECT; + + if (skb->protocol != htons(ETH_P_IP)) { + /* Not IP (i.e. ARP). 
Do not create route, if it is + * invalid for proxy arp. DNAT routes are always valid. + * + * Proxy arp feature have been extended to allow, ARP + * replies back to the same interface, to support + * Private VLAN switch technologies. See arp.c. + */ + if (out_dev == in_dev && + IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) { + err = -EINVAL; + goto cleanup; + } + } + + rth = rt_dst_alloc(out_dev->dev, + IN_DEV_CONF_GET(in_dev, NOPOLICY), + IN_DEV_CONF_GET(out_dev, NOXFRM)); + if (!rth) { + err = -ENOBUFS; + goto cleanup; + } + + rth->rt_key_dst = daddr; + rth->rt_key_src = saddr; + rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); + rth->rt_flags = flags; + rth->rt_type = res->type; + rth->rt_key_tos = tos; + rth->rt_dst = daddr; + rth->rt_src = saddr; + rth->rt_route_iif = in_dev->dev->ifindex; + rth->rt_iif = in_dev->dev->ifindex; + rth->rt_oif = 0; + rth->rt_mark = skb->mark; + rth->rt_gateway = daddr; + rth->rt_spec_dst= spec_dst; + rth->rt_peer_genid = 0; + rth->peer = NULL; + rth->fi = NULL; + + rth->dst.input = ip_forward; + rth->dst.output = ip_output; + + rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag); + + *result = rth; + err = 0; + cleanup: + return err; +} + +static int ip_mkroute_input(struct sk_buff *skb, + struct fib_result *res, + const struct flowi4 *fl4, + struct in_device *in_dev, + __be32 daddr, __be32 saddr, u32 tos) +{ + struct rtable* rth = NULL; + int err; + unsigned hash; + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (res->fi && res->fi->fib_nhs > 1) + fib_select_multipath(res); +#endif + + /* create a routing cache entry */ + err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth); + if (err) + return err; + + /* put it into the cache */ + hash = rt_hash(daddr, saddr, fl4->flowi4_iif, + rt_genid(dev_net(rth->dst.dev))); + rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif); + if (IS_ERR(rth)) + return PTR_ERR(rth); + return 0; +} + +/* + * NOTE. We drop all the packets that has local source + * addresses, because every properly looped back packet + * must have correct destination already attached by output routine. + * + * Such approach solves two big problems: + * 1. Not simplex devices are handled properly. + * 2. IP spoofing attempts are filtered with 100% of guarantee. + * called with rcu_read_lock() + */ + +static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev) +{ + struct fib_result res; + struct in_device *in_dev = __in_dev_get_rcu(dev); + struct flowi4 fl4; + unsigned flags = 0; + u32 itag = 0; + struct rtable * rth; + unsigned hash; + __be32 spec_dst; + int err = -EINVAL; + struct net * net = dev_net(dev); + + /* IP on this device is disabled. */ + + if (!in_dev) + goto out; + + /* Check for the most weird martians, which can be not detected + by fib_lookup. + */ + + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || + ipv4_is_loopback(saddr)) + goto martian_source; + + if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) + goto brd_input; + + /* Accept zero addresses only to limited broadcast; + * I even do not know to fix it or not. Waiting for complains :-) + */ + if (ipv4_is_zeronet(saddr)) + goto martian_source; + + if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr)) + goto martian_destination; + + /* + * Now we are ready to route packet. 
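The sanity checks that follow the comment above reject "martian" sources before any FIB lookup: multicast, limited-broadcast and loopback source addresses are never acceptable, and a zeronet source is tolerated only on the limited-broadcast path. A standalone sketch of those address classes, mirroring ipv4_is_multicast()/ipv4_is_lbcast()/ipv4_is_loopback()/ipv4_is_zeronet(); addresses are kept in host byte order for brevity:

/* Martian-source classification, as in ip_route_input_slow(). */
#include <stdio.h>
#include <stdint.h>

static int is_multicast(uint32_t a) { return (a & 0xf0000000u) == 0xe0000000u; }
static int is_lbcast(uint32_t a)    { return a == 0xffffffffu; }
static int is_loopback(uint32_t a)  { return (a & 0xff000000u) == 0x7f000000u; }
static int is_zeronet(uint32_t a)   { return (a & 0xff000000u) == 0x00000000u; }

static const char *classify_source(uint32_t saddr)
{
        if (is_multicast(saddr) || is_lbcast(saddr) || is_loopback(saddr))
                return "martian source";
        if (is_zeronet(saddr))
                return "zeronet source (only valid for limited broadcast)";
        return "acceptable source";
}

int main(void)
{
        uint32_t samples[] = {
                0xc0a80101u,    /* 192.168.1.1 */
                0xe0000001u,    /* 224.0.0.1   */
                0x7f000001u,    /* 127.0.0.1   */
                0x00000000u,    /* 0.0.0.0     */
        };

        for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("0x%08x: %s\n", (unsigned int)samples[i],
                       classify_source(samples[i]));
        return 0;
}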
+ */ + fl4.flowi4_oif = 0; + fl4.flowi4_iif = dev->ifindex; + fl4.flowi4_mark = skb->mark; + fl4.flowi4_tos = tos; + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.daddr = daddr; + fl4.saddr = saddr; + err = fib_lookup(net, &fl4, &res); + if (err != 0) { + if (!IN_DEV_FORWARD(in_dev)) + goto e_hostunreach; + goto no_route; + } + + RT_CACHE_STAT_INC(in_slow_tot); + + if (res.type == RTN_BROADCAST) + goto brd_input; + + if (res.type == RTN_LOCAL) { + err = fib_validate_source(skb, saddr, daddr, tos, + net->loopback_dev->ifindex, + dev, &spec_dst, &itag); + if (err < 0) + goto martian_source_keep_err; + if (err) + flags |= RTCF_DIRECTSRC; + spec_dst = daddr; + goto local_input; + } + + if (!IN_DEV_FORWARD(in_dev)) + goto e_hostunreach; + if (res.type != RTN_UNICAST) + goto martian_destination; + + err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos); +out: return err; + +brd_input: + if (skb->protocol != htons(ETH_P_IP)) + goto e_inval; + + if (ipv4_is_zeronet(saddr)) + spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); + else { + err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, + &itag); + if (err < 0) + goto martian_source_keep_err; + if (err) + flags |= RTCF_DIRECTSRC; + } + flags |= RTCF_BROADCAST; + res.type = RTN_BROADCAST; + RT_CACHE_STAT_INC(in_brd); + +local_input: + rth = rt_dst_alloc(net->loopback_dev, + IN_DEV_CONF_GET(in_dev, NOPOLICY), false); + if (!rth) + goto e_nobufs; + + rth->dst.input= ip_local_deliver; + rth->dst.output= ip_rt_bug; +#ifdef CONFIG_IP_ROUTE_CLASSID + rth->dst.tclassid = itag; +#endif + + rth->rt_key_dst = daddr; + rth->rt_key_src = saddr; + rth->rt_genid = rt_genid(net); + rth->rt_flags = flags|RTCF_LOCAL; + rth->rt_type = res.type; + rth->rt_key_tos = tos; + rth->rt_dst = daddr; + rth->rt_src = saddr; +#ifdef CONFIG_IP_ROUTE_CLASSID + rth->dst.tclassid = itag; +#endif + rth->rt_route_iif = dev->ifindex; + rth->rt_iif = dev->ifindex; + rth->rt_oif = 0; + rth->rt_mark = skb->mark; + rth->rt_gateway = daddr; + rth->rt_spec_dst= spec_dst; + rth->rt_peer_genid = 0; + rth->peer = NULL; + rth->fi = NULL; + if (res.type == RTN_UNREACHABLE) { + rth->dst.input= ip_error; + rth->dst.error= -err; + rth->rt_flags &= ~RTCF_LOCAL; + } + hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net)); + rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif); + err = 0; + if (IS_ERR(rth)) + err = PTR_ERR(rth); + goto out; + +no_route: + RT_CACHE_STAT_INC(in_no_route); + spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + res.type = RTN_UNREACHABLE; + if (err == -ESRCH) + err = -ENETUNREACH; + goto local_input; + + /* + * Do not cache martian addresses: they should be logged (RFC1812) + */ +martian_destination: + RT_CACHE_STAT_INC(in_martian_dst); +#ifdef CONFIG_IP_ROUTE_VERBOSE + if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) + printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n", + &daddr, &saddr, dev->name); +#endif + +e_hostunreach: + err = -EHOSTUNREACH; + goto out; + +e_inval: + err = -EINVAL; + goto out; + +e_nobufs: + err = -ENOBUFS; + goto out; + +martian_source: + err = -EINVAL; +martian_source_keep_err: + ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); + goto out; +} + +int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, bool noref) +{ + struct rtable * rth; + unsigned hash; + int iif = dev->ifindex; + struct net *net; + int res; + + net = dev_net(dev); + + rcu_read_lock(); + + if (!rt_caching(net)) + goto skip_cache; + + tos &= IPTOS_RT_MASK; + hash 
= rt_hash(daddr, saddr, iif, rt_genid(net)); + + for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; + rth = rcu_dereference(rth->dst.rt_next)) { + if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) | + ((__force u32)rth->rt_key_src ^ (__force u32)saddr) | + (rth->rt_route_iif ^ iif) | + (rth->rt_key_tos ^ tos)) == 0 && + rth->rt_mark == skb->mark && + net_eq(dev_net(rth->dst.dev), net) && + !rt_is_expired(rth)) { + if (noref) { + dst_use_noref(&rth->dst, jiffies); + skb_dst_set_noref(skb, &rth->dst); + } else { + dst_use(&rth->dst, jiffies); + skb_dst_set(skb, &rth->dst); + } + RT_CACHE_STAT_INC(in_hit); + rcu_read_unlock(); + return 0; + } + RT_CACHE_STAT_INC(in_hlist_search); + } + +skip_cache: + /* Multicast recognition logic is moved from route cache to here. + The problem was that too many Ethernet cards have broken/missing + hardware multicast filters :-( As result the host on multicasting + network acquires a lot of useless route cache entries, sort of + SDR messages from all the world. Now we try to get rid of them. + Really, provided software IP multicast filter is organized + reasonably (at least, hashed), it does not result in a slowdown + comparing with route cache reject entries. + Note, that multicast routers are not affected, because + route cache entry is created eventually. + */ + if (ipv4_is_multicast(daddr)) { + struct in_device *in_dev = __in_dev_get_rcu(dev); + + if (in_dev) { + int our = ip_check_mc_rcu(in_dev, daddr, saddr, + ip_hdr(skb)->protocol); + if (our +#ifdef CONFIG_IP_MROUTE + || + (!ipv4_is_local_multicast(daddr) && + IN_DEV_MFORWARD(in_dev)) +#endif + ) { + int res = ip_route_input_mc(skb, daddr, saddr, + tos, dev, our); + rcu_read_unlock(); + return res; + } + } + rcu_read_unlock(); + return -EINVAL; + } + res = ip_route_input_slow(skb, daddr, saddr, tos, dev); + rcu_read_unlock(); + return res; +} +EXPORT_SYMBOL(ip_route_input_common); + +/* called with rcu_read_lock() */ +static struct rtable *__mkroute_output(const struct fib_result *res, + const struct flowi4 *fl4, + __be32 orig_daddr, __be32 orig_saddr, + int orig_oif, struct net_device *dev_out, + unsigned int flags) +{ + struct fib_info *fi = res->fi; + u32 tos = RT_FL_TOS(fl4); + struct in_device *in_dev; + u16 type = res->type; + struct rtable *rth; + + if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) + return ERR_PTR(-EINVAL); + + if (ipv4_is_lbcast(fl4->daddr)) + type = RTN_BROADCAST; + else if (ipv4_is_multicast(fl4->daddr)) + type = RTN_MULTICAST; + else if (ipv4_is_zeronet(fl4->daddr)) + return ERR_PTR(-EINVAL); + + if (dev_out->flags & IFF_LOOPBACK) + flags |= RTCF_LOCAL; + + in_dev = __in_dev_get_rcu(dev_out); + if (!in_dev) + return ERR_PTR(-EINVAL); + + if (type == RTN_BROADCAST) { + flags |= RTCF_BROADCAST | RTCF_LOCAL; + fi = NULL; + } else if (type == RTN_MULTICAST) { + flags |= RTCF_MULTICAST | RTCF_LOCAL; + if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, + fl4->flowi4_proto)) + flags &= ~RTCF_LOCAL; + /* If multicast route do not exist use + * default one, but do not gateway in this case. + * Yes, it is hack. 
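The cache lookup in ip_route_input_common() above compares all of the flow keys with a single test: XOR each cached field with the lookup key, OR the results together, and check the OR against zero, so a hit costs one comparison instead of one branch per field. A standalone sketch of that comparison on a simplified key:

/* Branch-free key comparison, as used in the rt_cache lookup above. */
#include <stdio.h>
#include <stdint.h>

struct cache_key {
        uint32_t dst;
        uint32_t src;
        int      iif;
        uint8_t  tos;
};

static int keys_match(const struct cache_key *a, const struct cache_key *b)
{
        /* Any differing field makes the OR non-zero. */
        return ((a->dst ^ b->dst) |
                (a->src ^ b->src) |
                (uint32_t)(a->iif ^ b->iif) |
                (uint32_t)(a->tos ^ b->tos)) == 0;
}

int main(void)
{
        struct cache_key in_pkt = { 0x0a000001, 0x0a000002, 3, 0x10 };
        struct cache_key cached = { 0x0a000001, 0x0a000002, 3, 0x10 };
        struct cache_key other  = { 0x0a000001, 0x0a000002, 4, 0x10 };

        printf("cached entry matches: %d\n", keys_match(&in_pkt, &cached));
        printf("other iif matches:    %d\n", keys_match(&in_pkt, &other));
        return 0;
}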
+ */ + if (fi && res->prefixlen < 4) + fi = NULL; + } + + rth = rt_dst_alloc(dev_out, + IN_DEV_CONF_GET(in_dev, NOPOLICY), + IN_DEV_CONF_GET(in_dev, NOXFRM)); + if (!rth) + return ERR_PTR(-ENOBUFS); + + rth->dst.output = ip_output; + + rth->rt_key_dst = orig_daddr; + rth->rt_key_src = orig_saddr; + rth->rt_genid = rt_genid(dev_net(dev_out)); + rth->rt_flags = flags; + rth->rt_type = type; + rth->rt_key_tos = tos; + rth->rt_dst = fl4->daddr; + rth->rt_src = fl4->saddr; + rth->rt_route_iif = 0; + rth->rt_iif = orig_oif ? : dev_out->ifindex; + rth->rt_oif = orig_oif; + rth->rt_mark = fl4->flowi4_mark; + rth->rt_gateway = fl4->daddr; + rth->rt_spec_dst= fl4->saddr; + rth->rt_peer_genid = 0; + rth->peer = NULL; + rth->fi = NULL; + + RT_CACHE_STAT_INC(out_slow_tot); + + if (flags & RTCF_LOCAL) { + rth->dst.input = ip_local_deliver; + rth->rt_spec_dst = fl4->daddr; + } + if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { + rth->rt_spec_dst = fl4->saddr; + if (flags & RTCF_LOCAL && + !(dev_out->flags & IFF_LOOPBACK)) { + rth->dst.output = ip_mc_output; + RT_CACHE_STAT_INC(out_slow_mc); + } +#ifdef CONFIG_IP_MROUTE + if (type == RTN_MULTICAST) { + if (IN_DEV_MFORWARD(in_dev) && + !ipv4_is_local_multicast(fl4->daddr)) { + rth->dst.input = ip_mr_input; + rth->dst.output = ip_mc_output; + } + } +#endif + } + + rt_set_nexthop(rth, fl4, res, fi, type, 0); + + return rth; +} + +/* + * Major route resolver routine. + * called with rcu_read_lock(); + */ + +static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) +{ + struct net_device *dev_out = NULL; + u32 tos = RT_FL_TOS(fl4); + unsigned int flags = 0; + struct fib_result res; + struct rtable *rth; + __be32 orig_daddr; + __be32 orig_saddr; + int orig_oif; + + res.fi = NULL; +#ifdef CONFIG_IP_MULTIPLE_TABLES + res.r = NULL; +#endif + + orig_daddr = fl4->daddr; + orig_saddr = fl4->saddr; + orig_oif = fl4->flowi4_oif; + + fl4->flowi4_iif = net->loopback_dev->ifindex; + fl4->flowi4_tos = tos & IPTOS_RT_MASK; + fl4->flowi4_scope = ((tos & RTO_ONLINK) ? + RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); + + rcu_read_lock(); + if (fl4->saddr) { + rth = ERR_PTR(-EINVAL); + if (ipv4_is_multicast(fl4->saddr) || + ipv4_is_lbcast(fl4->saddr) || + ipv4_is_zeronet(fl4->saddr)) + goto out; + + /* I removed check for oif == dev_out->oif here. + It was wrong for two reasons: + 1. ip_dev_find(net, saddr) can return wrong iface, if saddr + is assigned to multiple interfaces. + 2. Moreover, we are allowed to send packets with saddr + of another iface. --ANK + */ + + if (fl4->flowi4_oif == 0 && + (ipv4_is_multicast(fl4->daddr) || + ipv4_is_lbcast(fl4->daddr))) { + /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ + dev_out = __ip_dev_find(net, fl4->saddr, false); + if (dev_out == NULL) + goto out; + + /* Special hack: user can direct multicasts + and limited broadcast via necessary interface + without fiddling with IP_MULTICAST_IF or IP_PKTINFO. + This hack is not just for fun, it allows + vic,vat and friends to work. + They bind socket to loopback, set ttl to zero + and expect that it will work. + From the viewpoint of routing cache they are broken, + because we are not allowed to build multicast path + with loopback source addr (look, routing cache + cannot know, that ttl is zero, so that packet + will not leave this host and route is valid). + Luckily, this hack is good workaround. 
+ */ + + fl4->flowi4_oif = dev_out->ifindex; + goto make_route; + } + + if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { + /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ + if (!__ip_dev_find(net, fl4->saddr, false)) + goto out; + } + } + + + if (fl4->flowi4_oif) { + dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); + rth = ERR_PTR(-ENODEV); + if (dev_out == NULL) + goto out; + + /* RACE: Check return value of inet_select_addr instead. */ + if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { + rth = ERR_PTR(-ENETUNREACH); + goto out; + } + if (ipv4_is_local_multicast(fl4->daddr) || + ipv4_is_lbcast(fl4->daddr)) { + if (!fl4->saddr) + fl4->saddr = inet_select_addr(dev_out, 0, + RT_SCOPE_LINK); + goto make_route; + } + if (fl4->saddr) { + if (ipv4_is_multicast(fl4->daddr)) + fl4->saddr = inet_select_addr(dev_out, 0, + fl4->flowi4_scope); + else if (!fl4->daddr) + fl4->saddr = inet_select_addr(dev_out, 0, + RT_SCOPE_HOST); + } + } + + if (!fl4->daddr) { + fl4->daddr = fl4->saddr; + if (!fl4->daddr) + fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); + dev_out = net->loopback_dev; + fl4->flowi4_oif = net->loopback_dev->ifindex; + res.type = RTN_LOCAL; + flags |= RTCF_LOCAL; + goto make_route; + } + + if (fib_lookup(net, fl4, &res)) { + res.fi = NULL; + if (fl4->flowi4_oif) { + /* Apparently, routing tables are wrong. Assume, + that the destination is on link. + + WHY? DW. + Because we are allowed to send to iface + even if it has NO routes and NO assigned + addresses. When oif is specified, routing + tables are looked up with only one purpose: + to catch if destination is gatewayed, rather than + direct. Moreover, if MSG_DONTROUTE is set, + we send packet, ignoring both routing tables + and ifaddr state. --ANK + + + We could make it even if oif is unknown, + likely IPv6, but we do not. 
+ */ + + if (fl4->saddr == 0) + fl4->saddr = inet_select_addr(dev_out, 0, + RT_SCOPE_LINK); + res.type = RTN_UNICAST; + goto make_route; + } + rth = ERR_PTR(-ENETUNREACH); + goto out; + } + + if (res.type == RTN_LOCAL) { + if (!fl4->saddr) { + if (res.fi->fib_prefsrc) + fl4->saddr = res.fi->fib_prefsrc; + else + fl4->saddr = fl4->daddr; + } + dev_out = net->loopback_dev; + fl4->flowi4_oif = dev_out->ifindex; + res.fi = NULL; + flags |= RTCF_LOCAL; + goto make_route; + } + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) + fib_select_multipath(&res); + else +#endif + if (!res.prefixlen && + res.table->tb_num_default > 1 && + res.type == RTN_UNICAST && !fl4->flowi4_oif) + fib_select_default(&res); + + if (!fl4->saddr) + fl4->saddr = FIB_RES_PREFSRC(net, res); + + dev_out = FIB_RES_DEV(res); + fl4->flowi4_oif = dev_out->ifindex; + + +make_route: + rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif, + dev_out, flags); + if (!IS_ERR(rth)) { + unsigned int hash; + + hash = rt_hash(orig_daddr, orig_saddr, orig_oif, + rt_genid(dev_net(dev_out))); + rth = rt_intern_hash(hash, rth, NULL, orig_oif); + } + +out: + rcu_read_unlock(); + return rth; +} + +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4) +{ + struct rtable *rth; + unsigned int hash; + + if (!rt_caching(net)) + goto slow_output; + + hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net)); + + rcu_read_lock_bh(); + for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; + rth = rcu_dereference_bh(rth->dst.rt_next)) { + if (rth->rt_key_dst == flp4->daddr && + rth->rt_key_src == flp4->saddr && + rt_is_output_route(rth) && + rth->rt_oif == flp4->flowi4_oif && + rth->rt_mark == flp4->flowi4_mark && + !((rth->rt_key_tos ^ flp4->flowi4_tos) & + (IPTOS_RT_MASK | RTO_ONLINK)) && + net_eq(dev_net(rth->dst.dev), net) && + !rt_is_expired(rth)) { + dst_use(&rth->dst, jiffies); + RT_CACHE_STAT_INC(out_hit); + rcu_read_unlock_bh(); + if (!flp4->saddr) + flp4->saddr = rth->rt_src; + if (!flp4->daddr) + flp4->daddr = rth->rt_dst; + return rth; + } + RT_CACHE_STAT_INC(out_hlist_search); + } + rcu_read_unlock_bh(); + +slow_output: + return ip_route_output_slow(net, flp4); +} +EXPORT_SYMBOL_GPL(__ip_route_output_key); + +static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) +{ + return NULL; +} + +static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst) +{ + return 0; +} + +static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) +{ +} + +static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, + unsigned long old) +{ + return NULL; +} + +static struct dst_ops ipv4_dst_blackhole_ops = { + .family = AF_INET, + .protocol = cpu_to_be16(ETH_P_IP), + .destroy = ipv4_dst_destroy, + .check = ipv4_blackhole_dst_check, + .default_mtu = ipv4_blackhole_default_mtu, + .default_advmss = ipv4_default_advmss, + .update_pmtu = ipv4_rt_blackhole_update_pmtu, + .cow_metrics = ipv4_rt_blackhole_cow_metrics, +}; + +struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) +{ + struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0); + struct rtable *ort = (struct rtable *) dst_orig; + + if (rt) { + struct dst_entry *new = &rt->dst; + + new->__use = 1; + new->input = dst_discard; + new->output = dst_discard; + dst_copy_metrics(new, &ort->dst); + + new->dev = ort->dst.dev; + if (new->dev) + dev_hold(new->dev); + + rt->rt_key_dst = ort->rt_key_dst; + rt->rt_key_src = 
ort->rt_key_src; + rt->rt_key_tos = ort->rt_key_tos; + rt->rt_route_iif = ort->rt_route_iif; + rt->rt_iif = ort->rt_iif; + rt->rt_oif = ort->rt_oif; + rt->rt_mark = ort->rt_mark; + + rt->rt_genid = rt_genid(net); + rt->rt_flags = ort->rt_flags; + rt->rt_type = ort->rt_type; + rt->rt_dst = ort->rt_dst; + rt->rt_src = ort->rt_src; + rt->rt_gateway = ort->rt_gateway; + rt->rt_spec_dst = ort->rt_spec_dst; + rt->peer = ort->peer; + if (rt->peer) + atomic_inc(&rt->peer->refcnt); + rt->fi = ort->fi; + if (rt->fi) + atomic_inc(&rt->fi->fib_clntref); + + dst_free(new); + } + + dst_release(dst_orig); + + return rt ? &rt->dst : ERR_PTR(-ENOMEM); +} + +struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, + struct sock *sk) +{ + struct rtable *rt = __ip_route_output_key(net, flp4); + + if (IS_ERR(rt)) + return rt; + + if (flp4->flowi4_proto) + rt = (struct rtable *) xfrm_lookup(net, &rt->dst, + flowi4_to_flowi(flp4), + sk, 0); + + return rt; +} +EXPORT_SYMBOL_GPL(ip_route_output_flow); + +static int rt_fill_info(struct net *net, + struct sk_buff *skb, u32 pid, u32 seq, int event, + int nowait, unsigned int flags) +{ + struct rtable *rt = skb_rtable(skb); + struct rtmsg *r; + struct nlmsghdr *nlh; + long expires = 0; + const struct inet_peer *peer = rt->peer; + u32 id = 0, ts = 0, tsage = 0, error; + + nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); + if (nlh == NULL) + return -EMSGSIZE; + + r = nlmsg_data(nlh); + r->rtm_family = AF_INET; + r->rtm_dst_len = 32; + r->rtm_src_len = 0; + r->rtm_tos = rt->rt_key_tos; + r->rtm_table = RT_TABLE_MAIN; + NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); + r->rtm_type = rt->rt_type; + r->rtm_scope = RT_SCOPE_UNIVERSE; + r->rtm_protocol = RTPROT_UNSPEC; + r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; + if (rt->rt_flags & RTCF_NOTIFY) + r->rtm_flags |= RTM_F_NOTIFY; + + NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst); + + if (rt->rt_key_src) { + r->rtm_src_len = 32; + NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src); + } + if (rt->dst.dev) + NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); +#ifdef CONFIG_IP_ROUTE_CLASSID + if (rt->dst.tclassid) + NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); +#endif + if (rt_is_input_route(rt)) + NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); + else if (rt->rt_src != rt->rt_key_src) + NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); + + if (rt->rt_dst != rt->rt_gateway) + NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); + + if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) + goto nla_put_failure; + + if (rt->rt_mark) + NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark); + + error = rt->dst.error; + if (peer) { + inet_peer_refcheck(rt->peer); + id = atomic_read(&peer->ip_id_count) & 0xffff; + if (peer->tcp_ts_stamp) { + ts = peer->tcp_ts; + tsage = get_seconds() - peer->tcp_ts_stamp; + } + expires = ACCESS_ONCE(peer->pmtu_expires); + if (expires) + expires -= jiffies; + } + + if (rt_is_input_route(rt)) { +#ifdef CONFIG_IP_MROUTE + __be32 dst = rt->rt_dst; + + if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && + IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { + int err = ipmr_get_route(net, skb, + rt->rt_src, rt->rt_dst, + r, nowait); + if (err <= 0) { + if (!nowait) { + if (err == 0) + return 0; + goto nla_put_failure; + } else { + if (err == -EMSGSIZE) + goto nla_put_failure; + error = err; + } + } + } else +#endif + NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif); + } + + if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, + expires, error) < 0) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + 
+nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) +{ + struct net *net = sock_net(in_skb->sk); + struct rtmsg *rtm; + struct nlattr *tb[RTA_MAX+1]; + struct rtable *rt = NULL; + __be32 dst = 0; + __be32 src = 0; + u32 iif; + int err; + int mark; + struct sk_buff *skb; + + err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); + if (err < 0) + goto errout; + + rtm = nlmsg_data(nlh); + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) { + err = -ENOBUFS; + goto errout; + } + + /* Reserve room for dummy headers, this skb can pass + through good chunk of routing engine. + */ + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + + /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */ + ip_hdr(skb)->protocol = IPPROTO_ICMP; + skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); + + src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0; + dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0; + iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; + mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; + + if (iif) { + struct net_device *dev; + + dev = __dev_get_by_index(net, iif); + if (dev == NULL) { + err = -ENODEV; + goto errout_free; + } + + skb->protocol = htons(ETH_P_IP); + skb->dev = dev; + skb->mark = mark; + local_bh_disable(); + err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); + local_bh_enable(); + + rt = skb_rtable(skb); + if (err == 0 && rt->dst.error) + err = -rt->dst.error; + } else { + struct flowi4 fl4 = { + .daddr = dst, + .saddr = src, + .flowi4_tos = rtm->rtm_tos, + .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, + .flowi4_mark = mark, + }; + rt = ip_route_output_key(net, &fl4); + + err = 0; + if (IS_ERR(rt)) + err = PTR_ERR(rt); + } + + if (err) + goto errout_free; + + skb_dst_set(skb, &rt->dst); + if (rtm->rtm_flags & RTM_F_NOTIFY) + rt->rt_flags |= RTCF_NOTIFY; + + err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, + RTM_NEWROUTE, 0, 0); + if (err <= 0) + goto errout_free; + + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); +errout: + return err; + +errout_free: + kfree_skb(skb); + goto errout; +} + +int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct rtable *rt; + int h, s_h; + int idx, s_idx; + struct net *net; + + net = sock_net(skb->sk); + + s_h = cb->args[0]; + if (s_h < 0) + s_h = 0; + s_idx = idx = cb->args[1]; + for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) { + if (!rt_hash_table[h].chain) + continue; + rcu_read_lock_bh(); + for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt; + rt = rcu_dereference_bh(rt->dst.rt_next), idx++) { + if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx) + continue; + if (rt_is_expired(rt)) + continue; + skb_dst_set_noref(skb, &rt->dst); + if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWROUTE, + 1, NLM_F_MULTI) <= 0) { + skb_dst_drop(skb); + rcu_read_unlock_bh(); + goto done; + } + skb_dst_drop(skb); + } + rcu_read_unlock_bh(); + } + +done: + cb->args[0] = h; + cb->args[1] = idx; + return skb->len; +} + +void ip_rt_multicast_event(struct in_device *in_dev) +{ + rt_cache_flush(dev_net(in_dev->dev), 0); +} + +#ifdef CONFIG_SYSCTL +static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + if (write) { + int flush_delay; + ctl_table ctl; + struct net *net; + + memcpy(&ctl, __ctl, sizeof(ctl)); + ctl.data = 
&flush_delay; + proc_dointvec(&ctl, write, buffer, lenp, ppos); + + net = (struct net *)__ctl->extra1; + rt_cache_flush(net, flush_delay); + return 0; + } + + return -EINVAL; +} + +static ctl_table ipv4_route_table[] = { + { + .procname = "gc_thresh", + .data = &ipv4_dst_ops.gc_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "max_size", + .data = &ip_rt_max_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + /* Deprecated. Use gc_min_interval_ms */ + + .procname = "gc_min_interval", + .data = &ip_rt_gc_min_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "gc_min_interval_ms", + .data = &ip_rt_gc_min_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { + .procname = "gc_timeout", + .data = &ip_rt_gc_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "gc_interval", + .data = &ip_rt_gc_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "gc_interval", + .data = &ip_rt_gc_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "redirect_load", + .data = &ip_rt_redirect_load, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "redirect_number", + .data = &ip_rt_redirect_number, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "redirect_silence", + .data = &ip_rt_redirect_silence, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "error_cost", + .data = &ip_rt_error_cost, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "error_burst", + .data = &ip_rt_error_burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "gc_elasticity", + .data = &ip_rt_gc_elasticity, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "mtu_expires", + .data = &ip_rt_mtu_expires, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "min_pmtu", + .data = &ip_rt_min_pmtu, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "min_adv_mss", + .data = &ip_rt_min_advmss, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static struct ctl_table empty[1]; + +static struct ctl_table ipv4_skeleton[] = +{ + { .procname = "route", + .mode = 0555, .child = ipv4_route_table}, + { .procname = "neigh", + .mode = 0555, .child = empty}, + { } +}; + +static __net_initdata struct ctl_path ipv4_path[] = { + { .procname = "net", }, + { .procname = "ipv4", }, + { }, +}; + +static struct ctl_table ipv4_route_flush_table[] = { + { + .procname = "flush", + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = ipv4_sysctl_rtcache_flush, + }, + { }, +}; + +static __net_initdata struct ctl_path ipv4_route_path[] = { + { .procname = "net", }, + { .procname = "ipv4", }, + { .procname = "route", }, + { }, +}; + +static __net_init int sysctl_route_net_init(struct net *net) +{ + struct ctl_table *tbl; + + tbl = ipv4_route_flush_table; + if (!net_eq(net, &init_net)) { + tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); + if (tbl == 
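
Each entry in ipv4_route_table above is exposed as /proc/sys/net/ipv4/route/<procname> once the table is hooked under the "net"/"ipv4"/"route" ctl paths registered in this file. A small userspace sketch, not part of this patch, reads one of those knobs; it assumes a Linux host where this table is registered:

        #include <stdio.h>

        int main(void)
        {
                /* backed by the "gc_thresh" entry above (mode 0644, proc_dointvec) */
                FILE *f = fopen("/proc/sys/net/ipv4/route/gc_thresh", "r");
                char buf[64];

                if (!f)
                        return 1;
                if (fgets(buf, sizeof(buf), f))
                        printf("gc_thresh = %s", buf);  /* proc_dointvec yields a decimal integer */
                fclose(f);
                return 0;
        }
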
NULL) + goto err_dup; + } + tbl[0].extra1 = net; + + net->ipv4.route_hdr = + register_net_sysctl_table(net, ipv4_route_path, tbl); + if (net->ipv4.route_hdr == NULL) + goto err_reg; + return 0; + +err_reg: + if (tbl != ipv4_route_flush_table) + kfree(tbl); +err_dup: + return -ENOMEM; +} + +static __net_exit void sysctl_route_net_exit(struct net *net) +{ + struct ctl_table *tbl; + + tbl = net->ipv4.route_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->ipv4.route_hdr); + BUG_ON(tbl == ipv4_route_flush_table); + kfree(tbl); +} + +static __net_initdata struct pernet_operations sysctl_route_ops = { + .init = sysctl_route_net_init, + .exit = sysctl_route_net_exit, +}; +#endif + +static __net_init int rt_genid_init(struct net *net) +{ + get_random_bytes(&net->ipv4.rt_genid, + sizeof(net->ipv4.rt_genid)); + get_random_bytes(&net->ipv4.dev_addr_genid, + sizeof(net->ipv4.dev_addr_genid)); + return 0; +} + +static __net_initdata struct pernet_operations rt_genid_ops = { + .init = rt_genid_init, +}; + + +#ifdef CONFIG_IP_ROUTE_CLASSID +struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; +#endif /* CONFIG_IP_ROUTE_CLASSID */ + +static __initdata unsigned long rhash_entries; +static int __init set_rhash_entries(char *str) +{ + if (!str) + return 0; + rhash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("rhash_entries=", set_rhash_entries); + +int __init ip_rt_init(void) +{ + int rc = 0; + +#ifdef CONFIG_IP_ROUTE_CLASSID + ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); + if (!ip_rt_acct) + panic("IP: failed to allocate ip_rt_acct\n"); +#endif + + ipv4_dst_ops.kmem_cachep = + kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + + ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; + + if (dst_entries_init(&ipv4_dst_ops) < 0) + panic("IP: failed to allocate ipv4_dst_ops counter\n"); + + if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) + panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); + + rt_hash_table = (struct rt_hash_bucket *) + alloc_large_system_hash("IP route cache", + sizeof(struct rt_hash_bucket), + rhash_entries, + (totalram_pages >= 128 * 1024) ? + 15 : 17, + 0, + &rt_hash_log, + &rt_hash_mask, + rhash_entries ? 0 : 512 * 1024); + memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); + rt_hash_lock_init(); + + ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); + ip_rt_max_size = (rt_hash_mask + 1) * 16; + + devinet_init(); + ip_fib_init(); + + INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func); + expires_ljiffies = jiffies; + schedule_delayed_work(&expires_work, + net_random() % ip_rt_gc_interval + ip_rt_gc_interval); + + if (ip_rt_proc_init()) + printk(KERN_ERR "Unable to create route proc files\n"); +#ifdef CONFIG_XFRM + xfrm_init(); + xfrm4_init(ip_rt_max_size); +#endif + rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL); + +#ifdef CONFIG_SYSCTL + register_pernet_subsys(&sysctl_route_ops); +#endif + register_pernet_subsys(&rt_genid_ops); + return rc; +} + +#ifdef CONFIG_SYSCTL +/* + * We really need to sanitize the damn ipv4 init order, then all + * this nonsense will go away. 
+ */ +void __init ip_static_sysctl_init(void) +{ + register_sysctl_paths(ipv4_path, ipv4_skeleton); +} +#endif diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c new file mode 100644 index 00000000..895f2157 --- /dev/null +++ b/net/ipv4/syncookies.c @@ -0,0 +1,377 @@ +/* + * Syncookies implementation for the Linux kernel + * + * Copyright (C) 1997 Andi Kleen + * Based on ideas by D.J.Bernstein and Eric Schenk. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* Timestamps: lowest bits store TCP options */ +#define TSBITS 6 +#define TSMASK (((__u32)1 << TSBITS) - 1) + +extern int sysctl_tcp_syncookies; + +__u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS]; +EXPORT_SYMBOL(syncookie_secret); + +static __init int init_syncookies(void) +{ + get_random_bytes(syncookie_secret, sizeof(syncookie_secret)); + return 0; +} +__initcall(init_syncookies); + +#define COOKIEBITS 24 /* Upper bits store count */ +#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) + +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], + ipv4_cookie_scratch); + +static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, + u32 count, int c) +{ + __u32 *tmp = __get_cpu_var(ipv4_cookie_scratch); + + memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c])); + tmp[0] = (__force u32)saddr; + tmp[1] = (__force u32)daddr; + tmp[2] = ((__force u32)sport << 16) + (__force u32)dport; + tmp[3] = count; + sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5); + + return tmp[17]; +} + + +/* + * when syncookies are in effect and tcp timestamps are enabled we encode + * tcp options in the lower bits of the timestamp value that will be + * sent in the syn-ack. + * Since subsequent timestamps use the normal tcp_time_stamp value, we + * must make sure that the resulting initial timestamp is <= tcp_time_stamp. + */ +__u32 cookie_init_timestamp(struct request_sock *req) +{ + struct inet_request_sock *ireq; + u32 ts, ts_now = tcp_time_stamp; + u32 options = 0; + + ireq = inet_rsk(req); + + options = ireq->wscale_ok ? ireq->snd_wscale : 0xf; + options |= ireq->sack_ok << 4; + options |= ireq->ecn_ok << 5; + + ts = ts_now & ~TSMASK; + ts |= options; + if (ts > ts_now) { + ts >>= TSBITS; + ts--; + ts <<= TSBITS; + ts |= options; + } + return ts; +} + + +static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport, + __be16 dport, __u32 sseq, __u32 count, + __u32 data) +{ + /* + * Compute the secure sequence number. + * The output should be: + * HASH(sec1,saddr,sport,daddr,dport,sec1) + sseq + (count * 2^24) + * + (HASH(sec2,saddr,sport,daddr,dport,count,sec2) % 2^24). + * Where sseq is their sequence number and count increases every + * minute by 1. + * As an extra hack, we add a small "data" value that encodes the + * MSS into the second hash value. + */ + + return (cookie_hash(saddr, daddr, sport, dport, 0, 0) + + sseq + (count << COOKIEBITS) + + ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) + & COOKIEMASK)); +} + +/* + * This retrieves the small "data" value from the syncookie. + * If the syncookie is bad, the data returned will be out of + * range. This must be checked by the caller. 
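
secure_tcp_syn_cookie() above layers the cookie as HASH1 + sseq + (count << 24) + ((HASH2 + data) & 0xFFFFFF), and check_tcp_syn_cookie() peels those layers off again. The following standalone sketch of that arithmetic is not part of this patch: the two SHA-based hashes are replaced by stand-in constants (h1, h2) so the encode/decode round trip can be compiled and followed in isolation.

        #include <stdint.h>
        #include <stdio.h>

        #define COOKIEBITS 24
        #define COOKIEMASK (((uint32_t)1 << COOKIEBITS) - 1)

        static uint32_t encode(uint32_t h1, uint32_t h2, uint32_t sseq,
                               uint32_t count, uint32_t data)
        {
                /* top 8 bits: count (mod 256); low 24 bits: second hash + data */
                return h1 + sseq + (count << COOKIEBITS) + ((h2 + data) & COOKIEMASK);
        }

        static uint32_t decode(uint32_t cookie, uint32_t h1, uint32_t h2,
                               uint32_t sseq, uint32_t count, uint32_t maxdiff)
        {
                uint32_t diff;

                cookie -= h1 + sseq;                    /* strip the outer layer */
                diff = (count - (cookie >> COOKIEBITS)) & ((uint32_t)-1 >> COOKIEBITS);
                if (diff >= maxdiff)
                        return (uint32_t)-1;            /* cookie too old */
                /* the kernel recomputes HASH2 with (count - diff); our stub h2 is constant */
                return (cookie - h2) & COOKIEMASK;      /* recover data (the mss table index) */
        }

        int main(void)
        {
                uint32_t c = encode(0x1234, 0x5678, 1000, 42, 5);

                printf("data back = %u\n", (unsigned)decode(c, 0x1234, 0x5678, 1000, 42, 4));
                return 0;
        }
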
+ * + * The count value used to generate the cookie must be within + * "maxdiff" if the current (passed-in) "count". The return value + * is (__u32)-1 if this test fails. + */ +static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr, + __be16 sport, __be16 dport, __u32 sseq, + __u32 count, __u32 maxdiff) +{ + __u32 diff; + + /* Strip away the layers from the cookie */ + cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; + + /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */ + diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS); + if (diff >= maxdiff) + return (__u32)-1; + + return (cookie - + cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) + & COOKIEMASK; /* Leaving the data behind */ +} + +/* + * MSS Values are taken from the 2009 paper + * 'Measuring TCP Maximum Segment Size' by S. Alcock and R. Nelson: + * - values 1440 to 1460 accounted for 80% of observed mss values + * - values outside the 536-1460 range are rare (<0.2%). + * + * Table must be sorted. + */ +static __u16 const msstab[] = { + 64, + 512, + 536, + 1024, + 1440, + 1460, + 4312, + 8960, +}; + +/* + * Generate a syncookie. mssp points to the mss, which is returned + * rounded down to the value encoded in the cookie. + */ +__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) +{ + const struct iphdr *iph = ip_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); + int mssind; + const __u16 mss = *mssp; + + tcp_synq_overflow(sk); + + for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) + if (mss >= msstab[mssind]) + break; + *mssp = msstab[mssind]; + + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); + + return secure_tcp_syn_cookie(iph->saddr, iph->daddr, + th->source, th->dest, ntohl(th->seq), + jiffies / (HZ * 60), mssind); +} + +/* + * This (misnamed) value is the age of syncookie which is permitted. + * Its ideal value should be dependent on TCP_TIMEOUT_INIT and + * sysctl_tcp_retries1. It's a rather complicated formula (exponential + * backoff) to compute at runtime so it's currently hardcoded here. + */ +#define COUNTER_TRIES 4 +/* + * Check if a ack sequence number is a valid syncookie. + * Return the decoded mss if it is, or 0 if not. + */ +static inline int cookie_check(struct sk_buff *skb, __u32 cookie) +{ + const struct iphdr *iph = ip_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); + __u32 seq = ntohl(th->seq) - 1; + __u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, + th->source, th->dest, seq, + jiffies / (HZ * 60), + COUNTER_TRIES); + + return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; +} + +static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct sock *child; + + child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); + if (child) + inet_csk_reqsk_queue_add(sk, req, child); + else + reqsk_free(req); + + return child; +} + + +/* + * when syncookies are in effect and tcp timestamps are enabled we stored + * additional tcp options in the timestamp. + * This extracts these options from the timestamp echo. + * + * The lowest 4 bits store snd_wscale. + * next 2 bits indicate SACK and ECN support. + * + * return false if we decode an option that should not be. 
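
cookie_init_timestamp() above and cookie_check_timestamp() below hide the negotiated options in the low TSBITS bits of the timestamp: the send window scale in bits 0-3 (0xf meaning "no scaling"), SACK in bit 4 and ECN in bit 5. A minimal sketch of that packing, not part of this patch; pack_ts() is an invented name and the constants mirror the ones defined in this file:

        #include <stdint.h>
        #include <stdio.h>

        #define TSBITS 6
        #define TSMASK (((uint32_t)1 << TSBITS) - 1)

        static uint32_t pack_ts(uint32_t ts_now, unsigned wscale_ok, unsigned wscale,
                                unsigned sack_ok, unsigned ecn_ok)
        {
                uint32_t options = (wscale_ok ? (wscale & 0xf) : 0xf)
                                   | (sack_ok << 4) | (ecn_ok << 5);
                uint32_t ts = (ts_now & ~TSMASK) | options;

                if (ts > ts_now)
                        /* same effect as the >>/--/<< dance in the kernel: step back one
                         * TSBITS quantum so the cookie timestamp never runs ahead of
                         * tcp_time_stamp; the low bits (the options) are untouched. */
                        ts -= TSMASK + 1;
                return ts;
        }

        int main(void)
        {
                uint32_t ts = pack_ts(0x12345678, 1, 7, 1, 0);

                printf("wscale=%u sack=%u ecn=%u\n",
                       (unsigned)(ts & 0xf), (unsigned)((ts >> 4) & 1),
                       (unsigned)((ts >> 5) & 1));
                return 0;
        }
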
+ */ +bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok) +{ + /* echoed timestamp, lowest bits contain options */ + u32 options = tcp_opt->rcv_tsecr & TSMASK; + + if (!tcp_opt->saw_tstamp) { + tcp_clear_options(tcp_opt); + return true; + } + + if (!sysctl_tcp_timestamps) + return false; + + tcp_opt->sack_ok = (options >> 4) & 0x1; + *ecn_ok = (options >> 5) & 1; + if (*ecn_ok && !sysctl_tcp_ecn) + return false; + + if (tcp_opt->sack_ok && !sysctl_tcp_sack) + return false; + + if ((options & 0xf) == 0xf) + return true; /* no window scaling */ + + tcp_opt->wscale_ok = 1; + tcp_opt->snd_wscale = options & 0xf; + return sysctl_tcp_window_scaling != 0; +} +EXPORT_SYMBOL(cookie_check_timestamp); + +struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, + struct ip_options *opt) +{ + struct tcp_options_received tcp_opt; + u8 *hash_location; + struct inet_request_sock *ireq; + struct tcp_request_sock *treq; + struct tcp_sock *tp = tcp_sk(sk); + const struct tcphdr *th = tcp_hdr(skb); + __u32 cookie = ntohl(th->ack_seq) - 1; + struct sock *ret = sk; + struct request_sock *req; + int mss; + struct rtable *rt; + __u8 rcv_wscale; + bool ecn_ok = false; + struct flowi4 fl4; + + if (!sysctl_tcp_syncookies || !th->ack || th->rst) + goto out; + + if (tcp_synq_no_recent_overflow(sk) || + (mss = cookie_check(skb, cookie)) == 0) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); + goto out; + } + + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV); + + /* check for timestamp cookie support */ + memset(&tcp_opt, 0, sizeof(tcp_opt)); + tcp_parse_options(skb, &tcp_opt, &hash_location, 0); + + if (!cookie_check_timestamp(&tcp_opt, &ecn_ok)) + goto out; + + ret = NULL; + req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */ + if (!req) + goto out; + + ireq = inet_rsk(req); + treq = tcp_rsk(req); + treq->rcv_isn = ntohl(th->seq) - 1; + treq->snt_isn = cookie; + req->mss = mss; + ireq->loc_port = th->dest; + ireq->rmt_port = th->source; + ireq->loc_addr = ip_hdr(skb)->daddr; + ireq->rmt_addr = ip_hdr(skb)->saddr; + ireq->ecn_ok = ecn_ok; + ireq->snd_wscale = tcp_opt.snd_wscale; + ireq->sack_ok = tcp_opt.sack_ok; + ireq->wscale_ok = tcp_opt.wscale_ok; + ireq->tstamp_ok = tcp_opt.saw_tstamp; + req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; + + /* We throwed the options of the initial SYN away, so we hope + * the ACK carries the same options again (see RFC1122 4.2.3.8) + */ + if (opt && opt->optlen) { + int opt_size = sizeof(struct ip_options_rcu) + opt->optlen; + + ireq->opt = kmalloc(opt_size, GFP_ATOMIC); + if (ireq->opt != NULL && ip_options_echo(&ireq->opt->opt, skb)) { + kfree(ireq->opt); + ireq->opt = NULL; + } + } + + if (security_inet_conn_request(sk, skb, req)) { + reqsk_free(req); + goto out; + } + + req->expires = 0UL; + req->retrans = 0; + + /* + * We need to lookup the route here to get at the correct + * window size. We should better make sure that the window size + * hasn't changed since we received the original syn, but I see + * no easy way to do this. + */ + flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk), + RT_SCOPE_UNIVERSE, IPPROTO_TCP, + inet_sk_flowi_flags(sk), + (opt && opt->srr) ? opt->faddr : ireq->rmt_addr, + ireq->loc_addr, th->source, th->dest); + security_req_classify_flow(req, flowi4_to_flowi(&fl4)); + rt = ip_route_output_key(sock_net(sk), &fl4); + if (IS_ERR(rt)) { + reqsk_free(req); + goto out; + } + + /* Try to redo what tcp_v4_send_synack did. */ + req->window_clamp = tp->window_clamp ? 
:dst_metric(&rt->dst, RTAX_WINDOW); + + tcp_select_initial_window(tcp_full_space(sk), req->mss, + &req->rcv_wnd, &req->window_clamp, + ireq->wscale_ok, &rcv_wscale, + dst_metric(&rt->dst, RTAX_INITRWND)); + + ireq->rcv_wscale = rcv_wscale; + + ret = get_cookie_sock(sk, skb, req, &rt->dst); + /* ip_queue_xmit() depends on our flow being setup + * Normal sockets get it right from inet_csk_route_child_sock() + */ + if (ret) + inet_sk(ret)->cork.fl.u.ip4 = fl4; +out: return ret; +} diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c new file mode 100644 index 00000000..57d0752e --- /dev/null +++ b/net/ipv4/sysctl_net_ipv4.c @@ -0,0 +1,840 @@ +/* + * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS] + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int zero; +static int tcp_retr1_max = 255; +static int ip_local_port_range_min[] = { 1, 1 }; +static int ip_local_port_range_max[] = { 65535, 65535 }; +static int tcp_adv_win_scale_min = -31; +static int tcp_adv_win_scale_max = 31; +static int ip_ttl_min = 1; +static int ip_ttl_max = 255; +static int ip_ping_group_range_min[] = { 0, 0 }; +static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; + +/* Update system visible IP port range */ +static void set_local_port_range(int range[2]) +{ + write_seqlock(&sysctl_local_ports.lock); + sysctl_local_ports.range[0] = range[0]; + sysctl_local_ports.range[1] = range[1]; + write_sequnlock(&sysctl_local_ports.lock); +} + +/* Validate changes from /proc interface. */ +static int ipv4_local_port_range(ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret; + int range[2]; + ctl_table tmp = { + .data = &range, + .maxlen = sizeof(range), + .mode = table->mode, + .extra1 = &ip_local_port_range_min, + .extra2 = &ip_local_port_range_max, + }; + + inet_get_local_port_range(range, range + 1); + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + + if (write && ret == 0) { + if (range[1] < range[0]) + ret = -EINVAL; + else + set_local_port_range(range); + } + + return ret; +} + + +void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high) +{ + gid_t *data = table->data; + unsigned seq; + do { + seq = read_seqbegin(&sysctl_local_ports.lock); + + *low = data[0]; + *high = data[1]; + } while (read_seqretry(&sysctl_local_ports.lock, seq)); +} + +/* Update system visible IP port range */ +static void set_ping_group_range(struct ctl_table *table, int range[2]) +{ + gid_t *data = table->data; + write_seqlock(&sysctl_local_ports.lock); + data[0] = range[0]; + data[1] = range[1]; + write_sequnlock(&sysctl_local_ports.lock); +} + +/* Validate changes from /proc interface. 
*/ +static int ipv4_ping_group_range(ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret; + gid_t range[2]; + ctl_table tmp = { + .data = &range, + .maxlen = sizeof(range), + .mode = table->mode, + .extra1 = &ip_ping_group_range_min, + .extra2 = &ip_ping_group_range_max, + }; + + inet_get_ping_group_range_table(table, range, range + 1); + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + + if (write && ret == 0) + set_ping_group_range(table, range); + + return ret; +} + +static int proc_tcp_congestion_control(ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char val[TCP_CA_NAME_MAX]; + ctl_table tbl = { + .data = val, + .maxlen = TCP_CA_NAME_MAX, + }; + int ret; + + tcp_get_default_congestion_control(val); + + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) + ret = tcp_set_default_congestion_control(val); + return ret; +} + +static int proc_tcp_available_congestion_control(ctl_table *ctl, + int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, }; + int ret; + + tbl.data = kmalloc(tbl.maxlen, GFP_USER); + if (!tbl.data) + return -ENOMEM; + tcp_get_available_congestion_control(tbl.data, TCP_CA_BUF_MAX); + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + kfree(tbl.data); + return ret; +} + +static int proc_allowed_congestion_control(ctl_table *ctl, + int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX }; + int ret; + + tbl.data = kmalloc(tbl.maxlen, GFP_USER); + if (!tbl.data) + return -ENOMEM; + + tcp_get_allowed_congestion_control(tbl.data, tbl.maxlen); + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) + ret = tcp_set_allowed_congestion_control(tbl.data); + kfree(tbl.data); + return ret; +} + +static struct ctl_table ipv4_table[] = { + { + .procname = "tcp_timestamps", + .data = &sysctl_tcp_timestamps, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_window_scaling", + .data = &sysctl_tcp_window_scaling, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_sack", + .data = &sysctl_tcp_sack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_retrans_collapse", + .data = &sysctl_tcp_retrans_collapse, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ip_default_ttl", + .data = &sysctl_ip_default_ttl, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &ip_ttl_min, + .extra2 = &ip_ttl_max, + }, + { + .procname = "ip_no_pmtu_disc", + .data = &ipv4_config.no_pmtu_disc, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ip_nonlocal_bind", + .data = &sysctl_ip_nonlocal_bind, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_syn_retries", + .data = &sysctl_tcp_syn_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_synack_retries", + .data = &sysctl_tcp_synack_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_max_orphans", + .data = &sysctl_tcp_max_orphans, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_max_tw_buckets", + .data = 
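
Entries such as ip_default_ttl above use proc_dointvec_minmax with .extra1/.extra2 as inclusive bounds (here 1..255), so an out-of-range write to the proc file is refused with EINVAL rather than clamped. A small userspace probe of that behaviour, not part of this patch, assuming a Linux host and enough privilege to write the file:

        #include <errno.h>
        #include <stdio.h>
        #include <string.h>

        int main(void)
        {
                FILE *f = fopen("/proc/sys/net/ipv4/ip_default_ttl", "w");

                if (!f)
                        return 1;
                /* 0 is below .extra1 (ip_ttl_min == 1), so the flush/write should fail */
                if (fprintf(f, "0\n") < 0 || fflush(f) != 0)
                        printf("rejected as expected: %s\n", strerror(errno));
                fclose(f);
                return 0;
        }
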
&tcp_death_row.sysctl_max_tw_buckets, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ip_dynaddr", + .data = &sysctl_ip_dynaddr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_keepalive_time", + .data = &sysctl_tcp_keepalive_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "tcp_keepalive_probes", + .data = &sysctl_tcp_keepalive_probes, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_keepalive_intvl", + .data = &sysctl_tcp_keepalive_intvl, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "tcp_retries1", + .data = &sysctl_tcp_retries1, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra2 = &tcp_retr1_max + }, + { + .procname = "tcp_retries2", + .data = &sysctl_tcp_retries2, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_fin_timeout", + .data = &sysctl_tcp_fin_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, +#ifdef CONFIG_SYN_COOKIES + { + .procname = "tcp_syncookies", + .data = &sysctl_tcp_syncookies, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif + { + .procname = "tcp_tw_recycle", + .data = &tcp_death_row.sysctl_tw_recycle, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_abort_on_overflow", + .data = &sysctl_tcp_abort_on_overflow, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_stdurg", + .data = &sysctl_tcp_stdurg, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_rfc1337", + .data = &sysctl_tcp_rfc1337, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_max_syn_backlog", + .data = &sysctl_max_syn_backlog, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ip_local_port_range", + .data = &sysctl_local_ports.range, + .maxlen = sizeof(sysctl_local_ports.range), + .mode = 0644, + .proc_handler = ipv4_local_port_range, + }, + { + .procname = "ip_local_reserved_ports", + .data = NULL, /* initialized in sysctl_ipv4_init */ + .maxlen = 65536, + .mode = 0644, + .proc_handler = proc_do_large_bitmap, + }, + { + .procname = "igmp_max_memberships", + .data = &sysctl_igmp_max_memberships, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "igmp_max_msf", + .data = &sysctl_igmp_max_msf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "inet_peer_threshold", + .data = &inet_peer_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "inet_peer_minttl", + .data = &inet_peer_minttl, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "inet_peer_maxttl", + .data = &inet_peer_maxttl, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "inet_peer_gc_mintime", + .data = &inet_peer_gc_mintime, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "inet_peer_gc_maxtime", + .data = &inet_peer_gc_maxtime, + .maxlen = 
sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "tcp_orphan_retries", + .data = &sysctl_tcp_orphan_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_fack", + .data = &sysctl_tcp_fack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_reordering", + .data = &sysctl_tcp_reordering, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_ecn", + .data = &sysctl_tcp_ecn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_dsack", + .data = &sysctl_tcp_dsack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_mem", + .data = &sysctl_tcp_mem, + .maxlen = sizeof(sysctl_tcp_mem), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax + }, + { + .procname = "tcp_wmem", + .data = &sysctl_tcp_wmem, + .maxlen = sizeof(sysctl_tcp_wmem), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_rmem", + .data = &sysctl_tcp_rmem, + .maxlen = sizeof(sysctl_tcp_rmem), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_app_win", + .data = &sysctl_tcp_app_win, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_adv_win_scale", + .data = &sysctl_tcp_adv_win_scale, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &tcp_adv_win_scale_min, + .extra2 = &tcp_adv_win_scale_max, + }, + { + .procname = "tcp_tw_reuse", + .data = &sysctl_tcp_tw_reuse, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_frto", + .data = &sysctl_tcp_frto, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_frto_response", + .data = &sysctl_tcp_frto_response, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_low_latency", + .data = &sysctl_tcp_low_latency, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_no_metrics_save", + .data = &sysctl_tcp_nometrics_save, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_moderate_rcvbuf", + .data = &sysctl_tcp_moderate_rcvbuf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_tso_win_divisor", + .data = &sysctl_tcp_tso_win_divisor, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_congestion_control", + .mode = 0644, + .maxlen = TCP_CA_NAME_MAX, + .proc_handler = proc_tcp_congestion_control, + }, + { + .procname = "tcp_abc", + .data = &sysctl_tcp_abc, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_mtu_probing", + .data = &sysctl_tcp_mtu_probing, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_base_mss", + .data = &sysctl_tcp_base_mss, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_workaround_signed_windows", + .data = &sysctl_tcp_workaround_signed_windows, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#ifdef CONFIG_NET_DMA + { + .procname = "tcp_dma_copybreak", + .data = &sysctl_tcp_dma_copybreak, + .maxlen = sizeof(int), + .mode 
= 0644, + .proc_handler = proc_dointvec + }, +#endif + { + .procname = "tcp_slow_start_after_idle", + .data = &sysctl_tcp_slow_start_after_idle, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#ifdef CONFIG_NETLABEL + { + .procname = "cipso_cache_enable", + .data = &cipso_v4_cache_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "cipso_cache_bucket_size", + .data = &cipso_v4_cache_bucketsize, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "cipso_rbm_optfmt", + .data = &cipso_v4_rbm_optfmt, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "cipso_rbm_strictvalid", + .data = &cipso_v4_rbm_strictvalid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_NETLABEL */ + { + .procname = "tcp_available_congestion_control", + .maxlen = TCP_CA_BUF_MAX, + .mode = 0444, + .proc_handler = proc_tcp_available_congestion_control, + }, + { + .procname = "tcp_allowed_congestion_control", + .maxlen = TCP_CA_BUF_MAX, + .mode = 0644, + .proc_handler = proc_allowed_congestion_control, + }, + { + .procname = "tcp_max_ssthresh", + .data = &sysctl_tcp_max_ssthresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_cookie_size", + .data = &sysctl_tcp_cookie_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_thin_linear_timeouts", + .data = &sysctl_tcp_thin_linear_timeouts, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_thin_dupack", + .data = &sysctl_tcp_thin_dupack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "udp_mem", + .data = &sysctl_udp_mem, + .maxlen = sizeof(sysctl_udp_mem), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "udp_rmem_min", + .data = &sysctl_udp_rmem_min, + .maxlen = sizeof(sysctl_udp_rmem_min), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero + }, + { + .procname = "udp_wmem_min", + .data = &sysctl_udp_wmem_min, + .maxlen = sizeof(sysctl_udp_wmem_min), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero + }, + { } +}; + +static struct ctl_table ipv4_net_table[] = { + { + .procname = "icmp_echo_ignore_all", + .data = &init_net.ipv4.sysctl_icmp_echo_ignore_all, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "icmp_echo_ignore_broadcasts", + .data = &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "icmp_ignore_bogus_error_responses", + .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "icmp_errors_use_inbound_ifaddr", + .data = &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "icmp_ratelimit", + .data = &init_net.ipv4.sysctl_icmp_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { + .procname = "icmp_ratemask", + .data = &init_net.ipv4.sysctl_icmp_ratemask, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "rt_cache_rebuild_count", + 
.data = &init_net.ipv4.sysctl_rt_cache_rebuild_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ping_group_range", + .data = &init_net.ipv4.sysctl_ping_group_range, + .maxlen = sizeof(init_net.ipv4.sysctl_ping_group_range), + .mode = 0644, + .proc_handler = ipv4_ping_group_range, + }, + { } +}; + +struct ctl_path net_ipv4_ctl_path[] = { + { .procname = "net", }, + { .procname = "ipv4", }, + { }, +}; +EXPORT_SYMBOL_GPL(net_ipv4_ctl_path); + +static __net_init int ipv4_sysctl_init_net(struct net *net) +{ + struct ctl_table *table; + + table = ipv4_net_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL); + if (table == NULL) + goto err_alloc; + + table[0].data = + &net->ipv4.sysctl_icmp_echo_ignore_all; + table[1].data = + &net->ipv4.sysctl_icmp_echo_ignore_broadcasts; + table[2].data = + &net->ipv4.sysctl_icmp_ignore_bogus_error_responses; + table[3].data = + &net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr; + table[4].data = + &net->ipv4.sysctl_icmp_ratelimit; + table[5].data = + &net->ipv4.sysctl_icmp_ratemask; + table[6].data = + &net->ipv4.sysctl_rt_cache_rebuild_count; + table[7].data = + &net->ipv4.sysctl_ping_group_range; + + } + + /* + * Sane defaults - nobody may create ping sockets. + * Boot scripts should set this to distro-specific group. + */ + net->ipv4.sysctl_ping_group_range[0] = 1; + net->ipv4.sysctl_ping_group_range[1] = 0; + + net->ipv4.sysctl_rt_cache_rebuild_count = 4; + + net->ipv4.ipv4_hdr = register_net_sysctl_table(net, + net_ipv4_ctl_path, table); + if (net->ipv4.ipv4_hdr == NULL) + goto err_reg; + + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static __net_exit void ipv4_sysctl_exit_net(struct net *net) +{ + struct ctl_table *table; + + table = net->ipv4.ipv4_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->ipv4.ipv4_hdr); + kfree(table); +} + +static __net_initdata struct pernet_operations ipv4_sysctl_ops = { + .init = ipv4_sysctl_init_net, + .exit = ipv4_sysctl_exit_net, +}; + +static __init int sysctl_ipv4_init(void) +{ + struct ctl_table_header *hdr; + struct ctl_table *i; + + for (i = ipv4_table; i->procname; i++) { + if (strcmp(i->procname, "ip_local_reserved_ports") == 0) { + i->data = sysctl_local_reserved_ports; + break; + } + } + if (!i->procname) + return -EINVAL; + + hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table); + if (hdr == NULL) + return -ENOMEM; + + if (register_pernet_subsys(&ipv4_sysctl_ops)) { + unregister_sysctl_table(hdr); + return -ENOMEM; + } + + return 0; +} + +__initcall(sysctl_ipv4_init); diff --git a/net/ipv4/sysfs_net_ipv4.c b/net/ipv4/sysfs_net_ipv4.c new file mode 100644 index 00000000..0cbbf100 --- /dev/null +++ b/net/ipv4/sysfs_net_ipv4.c @@ -0,0 +1,88 @@ +/* + * net/ipv4/sysfs_net_ipv4.c + * + * sysfs-based networking knobs (so we can, unlike with sysctl, control perms) + * + * Copyright (C) 2008 Google, Inc. + * + * Robert Love + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include +#include +#include +#include +#include + +#define CREATE_IPV4_FILE(_name, _var) \ +static ssize_t _name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ +{ \ + return sprintf(buf, "%d\n", _var); \ +} \ +static ssize_t _name##_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t count) \ +{ \ + int val, ret; \ + ret = sscanf(buf, "%d", &val); \ + if (ret != 1) \ + return -EINVAL; \ + if (val < 0) \ + return -EINVAL; \ + _var = val; \ + return count; \ +} \ +static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +CREATE_IPV4_FILE(tcp_wmem_min, sysctl_tcp_wmem[0]); +CREATE_IPV4_FILE(tcp_wmem_def, sysctl_tcp_wmem[1]); +CREATE_IPV4_FILE(tcp_wmem_max, sysctl_tcp_wmem[2]); + +CREATE_IPV4_FILE(tcp_rmem_min, sysctl_tcp_rmem[0]); +CREATE_IPV4_FILE(tcp_rmem_def, sysctl_tcp_rmem[1]); +CREATE_IPV4_FILE(tcp_rmem_max, sysctl_tcp_rmem[2]); + +static struct attribute *ipv4_attrs[] = { + &tcp_wmem_min_attr.attr, + &tcp_wmem_def_attr.attr, + &tcp_wmem_max_attr.attr, + &tcp_rmem_min_attr.attr, + &tcp_rmem_def_attr.attr, + &tcp_rmem_max_attr.attr, + NULL +}; + +static struct attribute_group ipv4_attr_group = { + .attrs = ipv4_attrs, +}; + +static __init int sysfs_ipv4_init(void) +{ + struct kobject *ipv4_kobject; + int ret; + + ipv4_kobject = kobject_create_and_add("ipv4", kernel_kobj); + if (!ipv4_kobject) + return -ENOMEM; + + ret = sysfs_create_group(ipv4_kobject, &ipv4_attr_group); + if (ret) { + kobject_put(ipv4_kobject); + return ret; + } + + return 0; +} + +subsys_initcall(sysfs_ipv4_init); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c new file mode 100644 index 00000000..30eda52c --- /dev/null +++ b/net/ipv4/tcp.c @@ -0,0 +1,3431 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Authors: Ross Biro + * Fred N. van Kempen, + * Mark Evans, + * Corey Minyard + * Florian La Roche, + * Charles Hedrick, + * Linus Torvalds, + * Alan Cox, + * Matthew Dillon, + * Arnt Gulbrandsen, + * Jorge Cwik, + * + * Fixes: + * Alan Cox : Numerous verify_area() calls + * Alan Cox : Set the ACK bit on a reset + * Alan Cox : Stopped it crashing if it closed while + * sk->inuse=1 and was trying to connect + * (tcp_err()). + * Alan Cox : All icmp error handling was broken + * pointers passed where wrong and the + * socket was looked up backwards. Nobody + * tested any icmp error code obviously. + * Alan Cox : tcp_err() now handled properly. It + * wakes people on errors. poll + * behaves and the icmp error race + * has gone by moving it into sock.c + * Alan Cox : tcp_send_reset() fixed to work for + * everything not just packets for + * unknown sockets. + * Alan Cox : tcp option processing. + * Alan Cox : Reset tweaked (still not 100%) [Had + * syn rule wrong] + * Herp Rosmanith : More reset fixes + * Alan Cox : No longer acks invalid rst frames. + * Acking any kind of RST is right out. + * Alan Cox : Sets an ignore me flag on an rst + * receive otherwise odd bits of prattle + * escape still + * Alan Cox : Fixed another acking RST frame bug. + * Should stop LAN workplace lockups. 
+ * Alan Cox : Some tidyups using the new skb list + * facilities + * Alan Cox : sk->keepopen now seems to work + * Alan Cox : Pulls options out correctly on accepts + * Alan Cox : Fixed assorted sk->rqueue->next errors + * Alan Cox : PSH doesn't end a TCP read. Switched a + * bit to skb ops. + * Alan Cox : Tidied tcp_data to avoid a potential + * nasty. + * Alan Cox : Added some better commenting, as the + * tcp is hard to follow + * Alan Cox : Removed incorrect check for 20 * psh + * Michael O'Reilly : ack < copied bug fix. + * Johannes Stille : Misc tcp fixes (not all in yet). + * Alan Cox : FIN with no memory -> CRASH + * Alan Cox : Added socket option proto entries. + * Also added awareness of them to accept. + * Alan Cox : Added TCP options (SOL_TCP) + * Alan Cox : Switched wakeup calls to callbacks, + * so the kernel can layer network + * sockets. + * Alan Cox : Use ip_tos/ip_ttl settings. + * Alan Cox : Handle FIN (more) properly (we hope). + * Alan Cox : RST frames sent on unsynchronised + * state ack error. + * Alan Cox : Put in missing check for SYN bit. + * Alan Cox : Added tcp_select_window() aka NET2E + * window non shrink trick. + * Alan Cox : Added a couple of small NET2E timer + * fixes + * Charles Hedrick : TCP fixes + * Toomas Tamm : TCP window fixes + * Alan Cox : Small URG fix to rlogin ^C ack fight + * Charles Hedrick : Rewrote most of it to actually work + * Linus : Rewrote tcp_read() and URG handling + * completely + * Gerhard Koerting: Fixed some missing timer handling + * Matthew Dillon : Reworked TCP machine states as per RFC + * Gerhard Koerting: PC/TCP workarounds + * Adam Caldwell : Assorted timer/timing errors + * Matthew Dillon : Fixed another RST bug + * Alan Cox : Move to kernel side addressing changes. + * Alan Cox : Beginning work on TCP fastpathing + * (not yet usable) + * Arnt Gulbrandsen: Turbocharged tcp_check() routine. + * Alan Cox : TCP fast path debugging + * Alan Cox : Window clamping + * Michael Riepe : Bug in tcp_check() + * Matt Dillon : More TCP improvements and RST bug fixes + * Matt Dillon : Yet more small nasties remove from the + * TCP code (Be very nice to this man if + * tcp finally works 100%) 8) + * Alan Cox : BSD accept semantics. + * Alan Cox : Reset on closedown bug. + * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). + * Michael Pall : Handle poll() after URG properly in + * all cases. + * Michael Pall : Undo the last fix in tcp_read_urg() + * (multi URG PUSH broke rlogin). + * Michael Pall : Fix the multi URG PUSH problem in + * tcp_readable(), poll() after URG + * works now. + * Michael Pall : recv(...,MSG_OOB) never blocks in the + * BSD api. + * Alan Cox : Changed the semantics of sk->socket to + * fix a race and a signal problem with + * accept() and async I/O. + * Alan Cox : Relaxed the rules on tcp_sendto(). + * Yury Shevchuk : Really fixed accept() blocking problem. + * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for + * clients/servers which listen in on + * fixed ports. + * Alan Cox : Cleaned the above up and shrank it to + * a sensible code size. + * Alan Cox : Self connect lockup fix. + * Alan Cox : No connect to multicast. + * Ross Biro : Close unaccepted children on master + * socket close. + * Alan Cox : Reset tracing code. + * Alan Cox : Spurious resets on shutdown. + * Alan Cox : Giant 15 minute/60 second timer error + * Alan Cox : Small whoops in polling before an + * accept. + * Alan Cox : Kept the state trace facility since + * it's handy for debugging. 
+ * Alan Cox : More reset handler fixes. + * Alan Cox : Started rewriting the code based on + * the RFC's for other useful protocol + * references see: Comer, KA9Q NOS, and + * for a reference on the difference + * between specifications and how BSD + * works see the 4.4lite source. + * A.N.Kuznetsov : Don't time wait on completion of tidy + * close. + * Linus Torvalds : Fin/Shutdown & copied_seq changes. + * Linus Torvalds : Fixed BSD port reuse to work first syn + * Alan Cox : Reimplemented timers as per the RFC + * and using multiple timers for sanity. + * Alan Cox : Small bug fixes, and a lot of new + * comments. + * Alan Cox : Fixed dual reader crash by locking + * the buffers (much like datagram.c) + * Alan Cox : Fixed stuck sockets in probe. A probe + * now gets fed up of retrying without + * (even a no space) answer. + * Alan Cox : Extracted closing code better + * Alan Cox : Fixed the closing state machine to + * resemble the RFC. + * Alan Cox : More 'per spec' fixes. + * Jorge Cwik : Even faster checksumming. + * Alan Cox : tcp_data() doesn't ack illegal PSH + * only frames. At least one pc tcp stack + * generates them. + * Alan Cox : Cache last socket. + * Alan Cox : Per route irtt. + * Matt Day : poll()->select() match BSD precisely on error + * Alan Cox : New buffers + * Marc Tamsky : Various sk->prot->retransmits and + * sk->retransmits misupdating fixed. + * Fixed tcp_write_timeout: stuck close, + * and TCP syn retries gets used now. + * Mark Yarvis : In tcp_read_wakeup(), don't send an + * ack if state is TCP_CLOSED. + * Alan Cox : Look up device on a retransmit - routes may + * change. Doesn't yet cope with MSS shrink right + * but it's a start! + * Marc Tamsky : Closing in closing fixes. + * Mike Shaver : RFC1122 verifications. + * Alan Cox : rcv_saddr errors. + * Alan Cox : Block double connect(). + * Alan Cox : Small hooks for enSKIP. + * Alexey Kuznetsov: Path MTU discovery. + * Alan Cox : Support soft errors. + * Alan Cox : Fix MTU discovery pathological case + * when the remote claims no mtu! + * Marc Tamsky : TCP_CLOSE fix. + * Colin (G3TNE) : Send a reset on syn ack replies in + * window but wrong (fixes NT lpd problems) + * Pedro Roque : Better TCP window handling, delayed ack. + * Joerg Reuter : No modification of locked buffers in + * tcp_do_retransmit() + * Eric Schenk : Changed receiver side silly window + * avoidance algorithm to BSD style + * algorithm. This doubles throughput + * against machines running Solaris, + * and seems to result in general + * improvement. + * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD + * Willy Konynenberg : Transparent proxying support. + * Mike McLagan : Routing by source + * Keith Owens : Do proper merging with partial SKB's in + * tcp_do_sendmsg to avoid burstiness. + * Eric Schenk : Fix fast close down bug with + * shutdown() followed by close(). + * Andi Kleen : Make poll agree with SIGIO + * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and + * lingertime == 0 (RFC 793 ABORT Call) + * Hirokazu Takahashi : Use copy_from_user() instead of + * csum_and_copy_from_user() if possible. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or(at your option) any later version. 
+ * + * Description of States: + * + * TCP_SYN_SENT sent a connection request, waiting for ack + * + * TCP_SYN_RECV received a connection request, sent ack, + * waiting for final ack in three-way handshake. + * + * TCP_ESTABLISHED connection established + * + * TCP_FIN_WAIT1 our side has shutdown, waiting to complete + * transmission of remaining buffered data + * + * TCP_FIN_WAIT2 all buffered data sent, waiting for remote + * to shutdown + * + * TCP_CLOSING both sides have shutdown but we still have + * data we have to finish sending + * + * TCP_TIME_WAIT timeout to catch resent junk before entering + * closed, can only be entered from FIN_WAIT2 + * or CLOSING. Required because the other end + * may not have gotten our last ACK causing it + * to retransmit the data packet (which we ignore) + * + * TCP_CLOSE_WAIT remote side has shutdown and is waiting for + * us to finish writing our data and to shutdown + * (we have to close() to move on to LAST_ACK) + * + * TCP_LAST_ACK out side has shutdown after remote has + * shutdown. There may still be data in our + * buffer that we have to finish sending + * + * TCP_CLOSE socket is finished + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; + +struct percpu_counter tcp_orphan_count; +EXPORT_SYMBOL_GPL(tcp_orphan_count); + +long sysctl_tcp_mem[3] __read_mostly; +int sysctl_tcp_wmem[3] __read_mostly; +int sysctl_tcp_rmem[3] __read_mostly; + +EXPORT_SYMBOL(sysctl_tcp_mem); +EXPORT_SYMBOL(sysctl_tcp_rmem); +EXPORT_SYMBOL(sysctl_tcp_wmem); + +atomic_long_t tcp_memory_allocated; /* Current allocated memory. */ +EXPORT_SYMBOL(tcp_memory_allocated); + +/* + * Current number of TCP sockets. + */ +struct percpu_counter tcp_sockets_allocated; +EXPORT_SYMBOL(tcp_sockets_allocated); + +/* + * TCP splice context + */ +struct tcp_splice_state { + struct pipe_inode_info *pipe; + size_t len; + unsigned int flags; +}; + +/* + * Pressure flag: try to collapse. + * Technical note: it is used by multiple contexts non atomically. + * All the __sk_mem_schedule() is of this nature: accounting + * is strict, actions are advisory and have some latency. + */ +int tcp_memory_pressure __read_mostly; +EXPORT_SYMBOL(tcp_memory_pressure); + +void tcp_enter_memory_pressure(struct sock *sk) +{ + if (!tcp_memory_pressure) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES); + tcp_memory_pressure = 1; + } +} +EXPORT_SYMBOL(tcp_enter_memory_pressure); + +/* Convert seconds to retransmits based on initial and max timeout */ +static u8 secs_to_retrans(int seconds, int timeout, int rto_max) +{ + u8 res = 0; + + if (seconds > 0) { + int period = timeout; + + res = 1; + while (seconds > period && res < 255) { + res++; + timeout <<= 1; + if (timeout > rto_max) + timeout = rto_max; + period += timeout; + } + } + return res; +} + +/* Convert retransmits to seconds based on initial and max timeout */ +static int retrans_to_secs(u8 retrans, int timeout, int rto_max) +{ + int period = 0; + + if (retrans > 0) { + period = timeout; + while (--retrans) { + timeout <<= 1; + if (timeout > rto_max) + timeout = rto_max; + period += timeout; + } + } + return period; +} + +/* + * Wait for a TCP event. 
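
secs_to_retrans() and retrans_to_secs() above convert between a retransmission count and the wall-clock time those retransmissions cover under exponential backoff capped at rto_max. A standalone sketch of that arithmetic with a worked example, not part of this patch; the 1s/120s figures are illustrative values, not the kernel's RTO constants:

        #include <stdio.h>

        static int retrans_to_secs(int retrans, int timeout, int rto_max)
        {
                int period = 0;

                if (retrans > 0) {
                        period = timeout;
                        while (--retrans) {
                                timeout <<= 1;                  /* exponential backoff... */
                                if (timeout > rto_max)
                                        timeout = rto_max;      /* ...capped at rto_max */
                                period += timeout;
                        }
                }
                return period;
        }

        int main(void)
        {
                /* 5 retransmits with a 1s initial timeout: 1+2+4+8+16 = 31 seconds */
                printf("%d\n", retrans_to_secs(5, 1, 120));
                return 0;
        }
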
+ * + * Note that we don't need to lock the socket, as the upper poll layers + * take care of normal races (between the test and the event) and we don't + * go look at any of the socket buffers directly. + */ +unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) +{ + unsigned int mask; + struct sock *sk = sock->sk; + struct tcp_sock *tp = tcp_sk(sk); + + sock_poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_state == TCP_LISTEN) + return inet_csk_listen_poll(sk); + + /* Socket is not locked. We are protected from async events + * by poll logic and correct handling of state changes + * made by other threads is impossible in any case. + */ + + mask = 0; + + /* + * POLLHUP is certainly not done right. But poll() doesn't + * have a notion of HUP in just one direction, and for a + * socket the read side is more interesting. + * + * Some poll() documentation says that POLLHUP is incompatible + * with the POLLOUT/POLLWR flags, so somebody should check this + * all. But careful, it tends to be safer to return too many + * bits than too few, and you can easily break real applications + * if you don't tell them that something has hung up! + * + * Check-me. + * + * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and + * our fs/select.c). It means that after we received EOF, + * poll always returns immediately, making impossible poll() on write() + * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP + * if and only if shutdown has been made in both directions. + * Actually, it is interesting to look how Solaris and DUX + * solve this dilemma. I would prefer, if POLLHUP were maskable, + * then we could set it on SND_SHUTDOWN. BTW examples given + * in Stevens' books assume exactly this behaviour, it explains + * why POLLHUP is incompatible with POLLOUT. --ANK + * + * NOTE. Check for TCP_CLOSE is added. The goal is to prevent + * blocking on fresh not-connected or disconnected socket. --ANK + */ + if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) + mask |= POLLHUP; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= POLLIN | POLLRDNORM | POLLRDHUP; + + /* Connected? */ + if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { + int target = sock_rcvlowat(sk, 0, INT_MAX); + + if (tp->urg_seq == tp->copied_seq && + !sock_flag(sk, SOCK_URGINLINE) && + tp->urg_data) + target++; + + /* Potential race condition. If read of tp below will + * escape above sk->sk_state, we can be illegally awaken + * in SYN_* states. */ + if (tp->rcv_nxt - tp->copied_seq >= target) + mask |= POLLIN | POLLRDNORM; + + if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { + mask |= POLLOUT | POLLWRNORM; + } else { /* send SIGIO later */ + set_bit(SOCK_ASYNC_NOSPACE, + &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + + /* Race breaker. If space is freed after + * wspace test but before the flags are set, + * IO signal will be lost. 
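+				 * Setting the NOSPACE bits first and only then
+				 * re-testing wspace closes that window: space
+				 * freed in between is caught by the re-test
+				 * below, and space freed later finds the bits
+				 * already set and raises the async signal.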
+ */ + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) + mask |= POLLOUT | POLLWRNORM; + } + } else + mask |= POLLOUT | POLLWRNORM; + + if (tp->urg_data & TCP_URG_VALID) + mask |= POLLPRI; + } + /* This barrier is coupled with smp_wmb() in tcp_reset() */ + smp_rmb(); + if (sk->sk_err) + mask |= POLLERR; + + return mask; +} +EXPORT_SYMBOL(tcp_poll); + +int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + struct tcp_sock *tp = tcp_sk(sk); + int answ; + + switch (cmd) { + case SIOCINQ: + if (sk->sk_state == TCP_LISTEN) + return -EINVAL; + + lock_sock(sk); + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) + answ = 0; + else if (sock_flag(sk, SOCK_URGINLINE) || + !tp->urg_data || + before(tp->urg_seq, tp->copied_seq) || + !before(tp->urg_seq, tp->rcv_nxt)) { + struct sk_buff *skb; + + answ = tp->rcv_nxt - tp->copied_seq; + + /* Subtract 1, if FIN is in queue. */ + skb = skb_peek_tail(&sk->sk_receive_queue); + if (answ && skb) + answ -= tcp_hdr(skb)->fin; + } else + answ = tp->urg_seq - tp->copied_seq; + release_sock(sk); + break; + case SIOCATMARK: + answ = tp->urg_data && tp->urg_seq == tp->copied_seq; + break; + case SIOCOUTQ: + if (sk->sk_state == TCP_LISTEN) + return -EINVAL; + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) + answ = 0; + else + answ = tp->write_seq - tp->snd_una; + break; + case SIOCOUTQNSD: + if (sk->sk_state == TCP_LISTEN) + return -EINVAL; + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) + answ = 0; + else + answ = tp->write_seq - tp->snd_nxt; + break; + default: + return -ENOIOCTLCMD; + } + + return put_user(answ, (int __user *)arg); +} +EXPORT_SYMBOL(tcp_ioctl); + +static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) +{ + TCP_SKB_CB(skb)->flags |= TCPHDR_PSH; + tp->pushed_seq = tp->write_seq; +} + +static inline int forced_push(struct tcp_sock *tp) +{ + return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); +} + +static inline void skb_entail(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + skb->csum = 0; + tcb->seq = tcb->end_seq = tp->write_seq; + tcb->flags = TCPHDR_ACK; + tcb->sacked = 0; + skb_header_release(skb); + tcp_add_write_queue_tail(sk, skb); + sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(sk, skb->truesize); + if (tp->nonagle & TCP_NAGLE_PUSH) + tp->nonagle &= ~TCP_NAGLE_PUSH; +} + +static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) +{ + if (flags & MSG_OOB) + tp->snd_up = tp->write_seq; +} + +static inline void tcp_push(struct sock *sk, int flags, int mss_now, + int nonagle) +{ + if (tcp_send_head(sk)) { + struct tcp_sock *tp = tcp_sk(sk); + + if (!(flags & MSG_MORE) || forced_push(tp)) + tcp_mark_push(tp, tcp_write_queue_tail(sk)); + + tcp_mark_urg(tp, flags); + __tcp_push_pending_frames(sk, mss_now, + (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); + } +} + +static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct tcp_splice_state *tss = rd_desc->arg.data; + int ret; + + ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len), + tss->flags); + if (ret > 0) + rd_desc->count -= ret; + return ret; +} + +static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss) +{ + /* Store TCP splice context information in read_descriptor_t. 
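+	 * .arg.data carries the splice state into tcp_splice_data_recv()
+	 * and .count is the remaining byte budget, decremented by the
+	 * actor and checked by tcp_read_sock().
+	 *
+	 * This sits under splice(2) from a TCP socket into a pipe; an
+	 * illustrative user-space call (sock and pipefd[] are assumed to
+	 * exist) looks like:
+	 *
+	 *	splice(sock, NULL, pipefd[1], NULL, 65536,
+	 *	       SPLICE_F_MOVE | SPLICE_F_MORE);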
*/ + read_descriptor_t rd_desc = { + .arg.data = tss, + .count = tss->len, + }; + + return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv); +} + +/** + * tcp_splice_read - splice data from TCP socket to a pipe + * @sock: socket to splice from + * @ppos: position (not valid) + * @pipe: pipe to splice to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * Will read pages from given socket and fill them into a pipe. + * + **/ +ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct sock *sk = sock->sk; + struct tcp_splice_state tss = { + .pipe = pipe, + .len = len, + .flags = flags, + }; + long timeo; + ssize_t spliced; + int ret; + + sock_rps_record_flow(sk); + /* + * We can't seek on a socket input + */ + if (unlikely(*ppos)) + return -ESPIPE; + + ret = spliced = 0; + + lock_sock(sk); + + timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK); + while (tss.len) { + ret = __tcp_splice_read(sk, &tss); + if (ret < 0) + break; + else if (!ret) { + if (spliced) + break; + if (sock_flag(sk, SOCK_DONE)) + break; + if (sk->sk_err) { + ret = sock_error(sk); + break; + } + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + if (sk->sk_state == TCP_CLOSE) { + /* + * This occurs when user tries to read + * from never connected socket. + */ + if (!sock_flag(sk, SOCK_DONE)) + ret = -ENOTCONN; + break; + } + if (!timeo) { + ret = -EAGAIN; + break; + } + sk_wait_data(sk, &timeo); + if (signal_pending(current)) { + ret = sock_intr_errno(timeo); + break; + } + continue; + } + tss.len -= ret; + spliced += ret; + + if (!timeo) + break; + release_sock(sk); + lock_sock(sk); + + if (sk->sk_err || sk->sk_state == TCP_CLOSE || + (sk->sk_shutdown & RCV_SHUTDOWN) || + signal_pending(current)) + break; + } + + release_sock(sk); + + if (spliced) + return spliced; + + return ret; +} +EXPORT_SYMBOL(tcp_splice_read); + +struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) +{ + struct sk_buff *skb; + + /* The TCP header must be at least 32-bit aligned. */ + size = ALIGN(size, 4); + + skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); + if (skb) { + if (sk_wmem_schedule(sk, skb->truesize)) { + /* + * Make sure that we have exactly size bytes + * available to the caller, no more, no less. 
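+		 * alloc_skb_fclone() gave us at least size + max_header
+		 * bytes of tailroom; reserving (tailroom - size) leaves
+		 * exactly size bytes for payload and turns the remainder
+		 * into headroom for the TCP/IP headers added at transmit.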
+ */ + skb_reserve(skb, skb_tailroom(skb) - size); + return skb; + } + __kfree_skb(skb); + } else { + sk->sk_prot->enter_memory_pressure(sk); + sk_stream_moderate_sndbuf(sk); + } + return NULL; +} + +static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, + int large_allowed) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 xmit_size_goal, old_size_goal; + + xmit_size_goal = mss_now; + + if (large_allowed && sk_can_gso(sk)) { + xmit_size_goal = ((sk->sk_gso_max_size - 1) - + inet_csk(sk)->icsk_af_ops->net_header_len - + inet_csk(sk)->icsk_ext_hdr_len - + tp->tcp_header_len); + + xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); + + /* We try hard to avoid divides here */ + old_size_goal = tp->xmit_size_goal_segs * mss_now; + + if (likely(old_size_goal <= xmit_size_goal && + old_size_goal + mss_now > xmit_size_goal)) { + xmit_size_goal = old_size_goal; + } else { + tp->xmit_size_goal_segs = xmit_size_goal / mss_now; + xmit_size_goal = tp->xmit_size_goal_segs * mss_now; + } + } + + return max(xmit_size_goal, mss_now); +} + +static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) +{ + int mss_now; + + mss_now = tcp_current_mss(sk); + *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); + + return mss_now; +} + +static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, + size_t psize, int flags) +{ + struct tcp_sock *tp = tcp_sk(sk); + int mss_now, size_goal; + int err; + ssize_t copied; + long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + + /* Wait for a connection to finish. */ + if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) + if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) + goto out_err; + + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + mss_now = tcp_send_mss(sk, &size_goal, flags); + copied = 0; + + err = -EPIPE; + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + goto out_err; + + while (psize > 0) { + struct sk_buff *skb = tcp_write_queue_tail(sk); + struct page *page = pages[poffset / PAGE_SIZE]; + int copy, i, can_coalesce; + int offset = poffset % PAGE_SIZE; + int size = min_t(size_t, psize, PAGE_SIZE - offset); + + if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { +new_segment: + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); + if (!skb) + goto wait_for_memory; + + skb_entail(sk, skb); + copy = size_goal; + } + + if (copy > size) + copy = size; + + i = skb_shinfo(skb)->nr_frags; + can_coalesce = skb_can_coalesce(skb, i, page, offset); + if (!can_coalesce && i >= MAX_SKB_FRAGS) { + tcp_mark_push(tp, skb); + goto new_segment; + } + if (!sk_wmem_schedule(sk, copy)) + goto wait_for_memory; + + if (can_coalesce) { + skb_shinfo(skb)->frags[i - 1].size += copy; + } else { + get_page(page); + skb_fill_page_desc(skb, i, page, offset, copy); + } + + skb->len += copy; + skb->data_len += copy; + skb->truesize += copy; + sk->sk_wmem_queued += copy; + sk_mem_charge(sk, copy); + skb->ip_summed = CHECKSUM_PARTIAL; + tp->write_seq += copy; + TCP_SKB_CB(skb)->end_seq += copy; + skb_shinfo(skb)->gso_segs = 0; + + if (!copied) + TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH; + + copied += copy; + poffset += copy; + if (!(psize -= copy)) + goto out; + + if (skb->len < size_goal || (flags & MSG_OOB)) + continue; + + if (forced_push(tp)) { + tcp_mark_push(tp, skb); + __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); + } else if (skb == tcp_send_head(sk)) + tcp_push_one(sk, mss_now); + continue; + +wait_for_sndbuf: + 
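+	/* Send-buffer/memory slow path: mark NOSPACE, push out whatever is
+	 * already queued (without MSG_MORE, so it really leaves), sleep in
+	 * sk_stream_wait_memory(), then recompute the MSS and size goal
+	 * because they may have changed while we slept.
+	 */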
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +wait_for_memory: + tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + + if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + goto do_error; + + mss_now = tcp_send_mss(sk, &size_goal, flags); + } + +out: + if (copied && !(flags & MSG_SENDPAGE_NOTLAST)) + tcp_push(sk, flags, mss_now, tp->nonagle); + return copied; + +do_error: + if (copied) + goto out; +out_err: + return sk_stream_error(sk, flags, err); +} + +int tcp_sendpage(struct sock *sk, struct page *page, int offset, + size_t size, int flags) +{ + ssize_t res; + + if (!(sk->sk_route_caps & NETIF_F_SG) || + !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) + return sock_no_sendpage(sk->sk_socket, page, offset, size, + flags); + + lock_sock(sk); + res = do_tcp_sendpages(sk, &page, offset, size, flags); + release_sock(sk); + return res; +} +EXPORT_SYMBOL(tcp_sendpage); + +#define TCP_PAGE(sk) (sk->sk_sndmsg_page) +#define TCP_OFF(sk) (sk->sk_sndmsg_off) + +static inline int select_size(struct sock *sk, int sg) +{ + struct tcp_sock *tp = tcp_sk(sk); + int tmp = tp->mss_cache; + + if (sg) { + if (sk_can_gso(sk)) + tmp = 0; + else { + int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); + + if (tmp >= pgbreak && + tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) + tmp = pgbreak; + } + } + + return tmp; +} + +int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t size) +{ + struct iovec *iov; + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + int iovlen, flags; + int mss_now, size_goal; + int sg, err, copied; + long timeo; + + lock_sock(sk); + + flags = msg->msg_flags; + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + + /* Wait for a connection to finish. */ + if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) + if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) + goto out_err; + + /* This should be in poll */ + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + mss_now = tcp_send_mss(sk, &size_goal, flags); + + /* Ok commence sending. */ + iovlen = msg->msg_iovlen; + iov = msg->msg_iov; + copied = 0; + + err = -EPIPE; + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + goto out_err; + + sg = sk->sk_route_caps & NETIF_F_SG; + + while (--iovlen >= 0) { + size_t seglen = iov->iov_len; + unsigned char __user *from = iov->iov_base; + + iov++; + + while (seglen > 0) { + int copy = 0; + int max = size_goal; + + skb = tcp_write_queue_tail(sk); + if (tcp_send_head(sk)) { + if (skb->ip_summed == CHECKSUM_NONE) + max = mss_now; + copy = max - skb->len; + } + + if (copy <= 0) { +new_segment: + /* Allocate new segment. If the interface is SG, + * allocate skb fitting to single page. + */ + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + + skb = sk_stream_alloc_skb(sk, + select_size(sk, sg), + sk->sk_allocation); + if (!skb) + goto wait_for_memory; + + /* + * Check whether we can use HW checksum. + */ + if (sk->sk_route_caps & NETIF_F_ALL_CSUM) + skb->ip_summed = CHECKSUM_PARTIAL; + + skb_entail(sk, skb); + copy = size_goal; + max = size_goal; + } + + /* Try to append data to the end of skb. */ + if (copy > seglen) + copy = seglen; + + /* Where to copy to? */ + if (skb_tailroom(skb) > 0) { + /* We have some space in skb head. Superb! 
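+				 * Copy into the linear head first; once the
+				 * head is full, the else branch below appends
+				 * to page fragments, coalescing with the last
+				 * fragment whenever the cached page has room.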
*/ + if (copy > skb_tailroom(skb)) + copy = skb_tailroom(skb); + err = skb_add_data_nocache(sk, skb, from, copy); + if (err) + goto do_fault; + } else { + int merge = 0; + int i = skb_shinfo(skb)->nr_frags; + struct page *page = TCP_PAGE(sk); + int off = TCP_OFF(sk); + + if (skb_can_coalesce(skb, i, page, off) && + off != PAGE_SIZE) { + /* We can extend the last page + * fragment. */ + merge = 1; + } else if (i == MAX_SKB_FRAGS || !sg) { + /* Need to add new fragment and cannot + * do this because interface is non-SG, + * or because all the page slots are + * busy. */ + tcp_mark_push(tp, skb); + goto new_segment; + } else if (page) { + if (off == PAGE_SIZE) { + put_page(page); + TCP_PAGE(sk) = page = NULL; + off = 0; + } + } else + off = 0; + + if (copy > PAGE_SIZE - off) + copy = PAGE_SIZE - off; + + if (!sk_wmem_schedule(sk, copy)) + goto wait_for_memory; + + if (!page) { + /* Allocate new cache page. */ + if (!(page = sk_stream_alloc_page(sk))) + goto wait_for_memory; + } + + /* Time to copy data. We are close to + * the end! */ + err = skb_copy_to_page_nocache(sk, from, skb, + page, off, copy); + if (err) { + /* If this page was new, give it to the + * socket so it does not get leaked. + */ + if (!TCP_PAGE(sk)) { + TCP_PAGE(sk) = page; + TCP_OFF(sk) = 0; + } + goto do_error; + } + + /* Update the skb. */ + if (merge) { + skb_shinfo(skb)->frags[i - 1].size += + copy; + } else { + skb_fill_page_desc(skb, i, page, off, copy); + if (TCP_PAGE(sk)) { + get_page(page); + } else if (off + copy < PAGE_SIZE) { + get_page(page); + TCP_PAGE(sk) = page; + } + } + + TCP_OFF(sk) = off + copy; + } + + if (!copied) + TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH; + + tp->write_seq += copy; + TCP_SKB_CB(skb)->end_seq += copy; + skb_shinfo(skb)->gso_segs = 0; + + from += copy; + copied += copy; + if ((seglen -= copy) == 0 && iovlen == 0) + goto out; + + if (skb->len < max || (flags & MSG_OOB)) + continue; + + if (forced_push(tp)) { + tcp_mark_push(tp, skb); + __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); + } else if (skb == tcp_send_head(sk)) + tcp_push_one(sk, mss_now); + continue; + +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +wait_for_memory: + if (copied) + tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + + if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + goto do_error; + + mss_now = tcp_send_mss(sk, &size_goal, flags); + } + } + +out: + if (copied) + tcp_push(sk, flags, mss_now, tp->nonagle); + release_sock(sk); + + if (copied > 0) + uid_stat_tcp_snd(current_uid(), copied); + return copied; + +do_fault: + if (!skb->len) { + tcp_unlink_write_queue(skb, sk); + /* It is the one place in all of TCP, except connection + * reset, where we can be unlinking the send_head. + */ + tcp_check_send_head(sk, skb); + sk_wmem_free_skb(sk, skb); + } + +do_error: + if (copied) + goto out; +out_err: + err = sk_stream_error(sk, flags, err); + release_sock(sk); + return err; +} +EXPORT_SYMBOL(tcp_sendmsg); + +/* + * Handle reading urgent data. BSD has very simple semantics for + * this, no blocking and very strange errors 8) + */ + +static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* No URG data to read. */ + if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data || + tp->urg_data == TCP_URG_READ) + return -EINVAL; /* Yes this is right ! 
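+						 * With SO_OOBINLINE the urgent
+						 * byte travels in the normal
+						 * data stream, and once it has
+						 * been consumed (TCP_URG_READ)
+						 * there is no out-of-band byte
+						 * left, so recv(..., MSG_OOB)
+						 * fails with EINVAL.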
*/ + + if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE)) + return -ENOTCONN; + + if (tp->urg_data & TCP_URG_VALID) { + int err = 0; + char c = tp->urg_data; + + if (!(flags & MSG_PEEK)) + tp->urg_data = TCP_URG_READ; + + /* Read urgent data. */ + msg->msg_flags |= MSG_OOB; + + if (len > 0) { + if (!(flags & MSG_TRUNC)) + err = memcpy_toiovec(msg->msg_iov, &c, 1); + len = 1; + } else + msg->msg_flags |= MSG_TRUNC; + + return err ? -EFAULT : len; + } + + if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) + return 0; + + /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and + * the available implementations agree in this case: + * this call should never block, independent of the + * blocking state of the socket. + * Mike + */ + return -EAGAIN; +} + +/* Clean up the receive buffer for full frames taken by the user, + * then send an ACK if necessary. COPIED is the number of bytes + * tcp_recvmsg has given to the user so far, it speeds up the + * calculation of whether or not we must ACK for the sake of + * a window update. + */ +void tcp_cleanup_rbuf(struct sock *sk, int copied) +{ + struct tcp_sock *tp = tcp_sk(sk); + int time_to_ack = 0; + +#if TCP_DEBUG + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + + WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), + "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", + tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); +#endif + + if (inet_csk_ack_scheduled(sk)) { + const struct inet_connection_sock *icsk = inet_csk(sk); + /* Delayed ACKs frequently hit locked sockets during bulk + * receive. */ + if (icsk->icsk_ack.blocked || + /* Once-per-two-segments ACK was not sent by tcp_input.c */ + tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || + /* + * If this read emptied read buffer, we send ACK, if + * connection is not bidirectional, user drained + * receive buffer and there was a small segment + * in queue. + */ + (copied > 0 && + ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || + ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && + !icsk->icsk_ack.pingpong)) && + !atomic_read(&sk->sk_rmem_alloc))) + time_to_ack = 1; + } + + /* We send an ACK if we can now advertise a non-zero window + * which has been raised "significantly". + * + * Even if window raised up to infinity, do not send window open ACK + * in states, where we will not receive more. It is useless. + */ + if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) { + __u32 rcv_window_now = tcp_receive_window(tp); + + /* Optimize, __tcp_select_window() is not cheap. */ + if (2*rcv_window_now <= tp->window_clamp) { + __u32 new_window = __tcp_select_window(sk); + + /* Send ACK now, if this read freed lots of space + * in our buffer. Certainly, new_window is new window. + * We can advertise it now, if it is not less than current one. + * "Lots" means "at least twice" here. + */ + if (new_window && new_window >= 2 * rcv_window_now) + time_to_ack = 1; + } + } + if (time_to_ack) + tcp_send_ack(sk); +} + +static void tcp_prequeue_process(struct sock *sk) +{ + struct sk_buff *skb; + struct tcp_sock *tp = tcp_sk(sk); + + NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED); + + /* RX process wants to run with disabled BHs, though it is not + * necessary */ + local_bh_disable(); + while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) + sk_backlog_rcv(sk, skb); + local_bh_enable(); + + /* Clear memory counter. 
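+	 * ucopy.memory accounts the skb memory parked on the prequeue;
+	 * the loop above has just fed every prequeued skb to
+	 * sk_backlog_rcv(), so nothing is left to charge.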
*/ + tp->ucopy.memory = 0; +} + +#ifdef CONFIG_NET_DMA +static void tcp_service_net_dma(struct sock *sk, bool wait) +{ + dma_cookie_t done, used; + dma_cookie_t last_issued; + struct tcp_sock *tp = tcp_sk(sk); + + if (!tp->ucopy.dma_chan) + return; + + last_issued = tp->ucopy.dma_cookie; + dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); + + do { + if (dma_async_memcpy_complete(tp->ucopy.dma_chan, + last_issued, &done, + &used) == DMA_SUCCESS) { + /* Safe to free early-copied skbs now */ + __skb_queue_purge(&sk->sk_async_wait_queue); + break; + } else { + struct sk_buff *skb; + while ((skb = skb_peek(&sk->sk_async_wait_queue)) && + (dma_async_is_complete(skb->dma_cookie, done, + used) == DMA_SUCCESS)) { + __skb_dequeue(&sk->sk_async_wait_queue); + kfree_skb(skb); + } + } + } while (wait); +} +#endif + +static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) +{ + struct sk_buff *skb; + u32 offset; + + skb_queue_walk(&sk->sk_receive_queue, skb) { + offset = seq - TCP_SKB_CB(skb)->seq; + if (tcp_hdr(skb)->syn) + offset--; + if (offset < skb->len || tcp_hdr(skb)->fin) { + *off = offset; + return skb; + } + } + return NULL; +} + +/* + * This routine provides an alternative to tcp_recvmsg() for routines + * that would like to handle copying from skbuffs directly in 'sendfile' + * fashion. + * Note: + * - It is assumed that the socket was locked by the caller. + * - The routine does not block. + * - At present, there is no support for reading OOB data + * or for 'peeking' the socket using this routine + * (although both would be easy to implement). + */ +int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + struct sk_buff *skb; + struct tcp_sock *tp = tcp_sk(sk); + u32 seq = tp->copied_seq; + u32 offset; + int copied = 0; + + if (sk->sk_state == TCP_LISTEN) + return -ENOTCONN; + while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { + if (offset < skb->len) { + int used; + size_t len; + + len = skb->len - offset; + /* Stop reading if we hit a patch of urgent data */ + if (tp->urg_data) { + u32 urg_offset = tp->urg_seq - seq; + if (urg_offset < len) + len = urg_offset; + if (!len) + break; + } + used = recv_actor(desc, skb, offset, len); + if (used < 0) { + if (!copied) + copied = used; + break; + } else if (used <= len) { + seq += used; + copied += used; + offset += used; + } + /* + * If recv_actor drops the lock (e.g. TCP splice + * receive) the skb pointer might be invalid when + * getting here: tcp_collapse might have deleted it + * while aggregating skbs from the socket queue. + */ + skb = tcp_recv_skb(sk, seq-1, &offset); + if (!skb || (offset+1 != skb->len)) + break; + } + if (tcp_hdr(skb)->fin) { + sk_eat_skb(sk, skb, 0); + ++seq; + break; + } + sk_eat_skb(sk, skb, 0); + if (!desc->count) + break; + tp->copied_seq = seq; + } + tp->copied_seq = seq; + + tcp_rcv_space_adjust(sk); + + /* Clean up data we have read: This will do ACK frames. */ + if (copied > 0) { + tcp_cleanup_rbuf(sk, copied); + uid_stat_tcp_rcv(current_uid(), copied); + } + + return copied; +} +EXPORT_SYMBOL(tcp_read_sock); + +/* + * This routine copies from a sock struct into the user buffer. + * + * Technical note: in 2.3 we work on _locked_ socket, so that + * tricks with *seq access order and skb->users are not required. + * Probably, code can be easily improved even more. 
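+ *
+ * Seen from user space this is the back end of recv(2)/recvmsg(2):
+ * MSG_PEEK reads without advancing copied_seq, MSG_WAITALL (or, without
+ * it, SO_RCVLOWAT) sets the "target" the loop below tries to satisfy,
+ * MSG_OOB diverts to tcp_recv_urg(), and the timeout is derived from
+ * O_NONBLOCK / MSG_DONTWAIT / SO_RCVTIMEO via sock_rcvtimeo().
+ *
+ * Illustrative only (fd is assumed to be a connected TCP socket):
+ *
+ *	char buf[4096];
+ *	ssize_t n = recv(fd, buf, sizeof(buf), 0);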
+ */ + +int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int nonblock, int flags, int *addr_len) +{ + struct tcp_sock *tp = tcp_sk(sk); + int copied = 0; + u32 peek_seq; + u32 *seq; + unsigned long used; + int err; + int target; /* Read at least this many bytes */ + long timeo; + struct task_struct *user_recv = NULL; + int copied_early = 0; + struct sk_buff *skb; + u32 urg_hole = 0; + + lock_sock(sk); + + err = -ENOTCONN; + if (sk->sk_state == TCP_LISTEN) + goto out; + + timeo = sock_rcvtimeo(sk, nonblock); + + /* Urgent data needs to be handled specially. */ + if (flags & MSG_OOB) + goto recv_urg; + + seq = &tp->copied_seq; + if (flags & MSG_PEEK) { + peek_seq = tp->copied_seq; + seq = &peek_seq; + } + + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + +#ifdef CONFIG_NET_DMA + tp->ucopy.dma_chan = NULL; + preempt_disable(); + skb = skb_peek_tail(&sk->sk_receive_queue); + { + int available = 0; + + if (skb) + available = TCP_SKB_CB(skb)->seq + skb->len - (*seq); + if ((available < target) && + (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && + !sysctl_tcp_low_latency && + dma_find_channel(DMA_MEMCPY)) { + preempt_enable_no_resched(); + tp->ucopy.pinned_list = + dma_pin_iovec_pages(msg->msg_iov, len); + } else { + preempt_enable_no_resched(); + } + } +#endif + + do { + u32 offset; + + /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ + if (tp->urg_data && tp->urg_seq == *seq) { + if (copied) + break; + if (signal_pending(current)) { + copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; + break; + } + } + + /* Next get a buffer. */ + + skb_queue_walk(&sk->sk_receive_queue, skb) { + /* Now that we have two receive queues this + * shouldn't happen. + */ + if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), + "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n", + *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, + flags)) + break; + + offset = *seq - TCP_SKB_CB(skb)->seq; + if (tcp_hdr(skb)->syn) + offset--; + if (offset < skb->len) + goto found_ok_skb; + if (tcp_hdr(skb)->fin) + goto found_fin_ok; + WARN(!(flags & MSG_PEEK), + "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n", + *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags); + } + + /* Well, if we have backlog, try to process it now yet. */ + + if (copied >= target && !sk->sk_backlog.tail) + break; + + if (copied) { + if (sk->sk_err || + sk->sk_state == TCP_CLOSE || + (sk->sk_shutdown & RCV_SHUTDOWN) || + !timeo || + signal_pending(current)) + break; + } else { + if (sock_flag(sk, SOCK_DONE)) + break; + + if (sk->sk_err) { + copied = sock_error(sk); + break; + } + + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + + if (sk->sk_state == TCP_CLOSE) { + if (!sock_flag(sk, SOCK_DONE)) { + /* This occurs when user tries to read + * from never connected socket. + */ + copied = -ENOTCONN; + break; + } + break; + } + + if (!timeo) { + copied = -EAGAIN; + break; + } + + if (signal_pending(current)) { + copied = sock_intr_errno(timeo); + break; + } + } + + tcp_cleanup_rbuf(sk, copied); + + if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { + /* Install new reader */ + if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) { + user_recv = current; + tp->ucopy.task = user_recv; + tp->ucopy.iov = msg->msg_iov; + } + + tp->ucopy.len = len; + + WARN_ON(tp->copied_seq != tp->rcv_nxt && + !(flags & (MSG_PEEK | MSG_TRUNC))); + + /* Ugly... If prequeue is not empty, we have to + * process it before releasing socket, otherwise + * order will be broken at second iteration. 
+ * More elegant solution is required!!! + * + * Look: we have the following (pseudo)queues: + * + * 1. packets in flight + * 2. backlog + * 3. prequeue + * 4. receive_queue + * + * Each queue can be processed only if the next ones + * are empty. At this point we have empty receive_queue. + * But prequeue _can_ be not empty after 2nd iteration, + * when we jumped to start of loop because backlog + * processing added something to receive_queue. + * We cannot release_sock(), because backlog contains + * packets arrived _after_ prequeued ones. + * + * Shortly, algorithm is clear --- to process all + * the queues in order. We could make it more directly, + * requeueing packets from backlog to prequeue, if + * is not empty. It is more elegant, but eats cycles, + * unfortunately. + */ + if (!skb_queue_empty(&tp->ucopy.prequeue)) + goto do_prequeue; + + /* __ Set realtime policy in scheduler __ */ + } + +#ifdef CONFIG_NET_DMA + if (tp->ucopy.dma_chan) + dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); +#endif + if (copied >= target) { + /* Do not sleep, just process backlog. */ + release_sock(sk); + lock_sock(sk); + } else + sk_wait_data(sk, &timeo); + +#ifdef CONFIG_NET_DMA + tcp_service_net_dma(sk, false); /* Don't block */ + tp->ucopy.wakeup = 0; +#endif + + if (user_recv) { + int chunk; + + /* __ Restore normal policy in scheduler __ */ + + if ((chunk = len - tp->ucopy.len) != 0) { + NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); + len -= chunk; + copied += chunk; + } + + if (tp->rcv_nxt == tp->copied_seq && + !skb_queue_empty(&tp->ucopy.prequeue)) { +do_prequeue: + tcp_prequeue_process(sk); + + if ((chunk = len - tp->ucopy.len) != 0) { + NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); + len -= chunk; + copied += chunk; + } + } + } + if ((flags & MSG_PEEK) && + (peek_seq - copied - urg_hole != tp->copied_seq)) { + if (net_ratelimit()) + printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n", + current->comm, task_pid_nr(current)); + peek_seq = tp->copied_seq; + } + continue; + + found_ok_skb: + /* Ok so how much can we use? */ + used = skb->len - offset; + if (len < used) + used = len; + + /* Do we have urgent data here? */ + if (tp->urg_data) { + u32 urg_offset = tp->urg_seq - *seq; + if (urg_offset < used) { + if (!urg_offset) { + if (!sock_flag(sk, SOCK_URGINLINE)) { + ++*seq; + urg_hole++; + offset++; + used--; + if (!used) + goto skip_copy; + } + } else + used = urg_offset; + } + } + + if (!(flags & MSG_TRUNC)) { +#ifdef CONFIG_NET_DMA + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) + tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); + + if (tp->ucopy.dma_chan) { + tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( + tp->ucopy.dma_chan, skb, offset, + msg->msg_iov, used, + tp->ucopy.pinned_list); + + if (tp->ucopy.dma_cookie < 0) { + + printk(KERN_ALERT "dma_cookie < 0\n"); + + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } + + dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); + + if ((offset + used) == skb->len) + copied_early = 1; + + } else +#endif + { + err = skb_copy_datagram_iovec(skb, offset, + msg->msg_iov, used); + if (err) { + /* Exception. Bailout! 
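+					 * A negative cookie means the async
+					 * copy could not be issued; abort the
+					 * read with -EFAULT unless some data
+					 * has already been copied.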
*/ + if (!copied) + copied = -EFAULT; + break; + } + } + } + + *seq += used; + copied += used; + len -= used; + + tcp_rcv_space_adjust(sk); + +skip_copy: + if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { + tp->urg_data = 0; + tcp_fast_path_check(sk); + } + if (used + offset < skb->len) + continue; + + if (tcp_hdr(skb)->fin) + goto found_fin_ok; + if (!(flags & MSG_PEEK)) { + sk_eat_skb(sk, skb, copied_early); + copied_early = 0; + } + continue; + + found_fin_ok: + /* Process the FIN. */ + ++*seq; + if (!(flags & MSG_PEEK)) { + sk_eat_skb(sk, skb, copied_early); + copied_early = 0; + } + break; + } while (len > 0); + + if (user_recv) { + if (!skb_queue_empty(&tp->ucopy.prequeue)) { + int chunk; + + tp->ucopy.len = copied > 0 ? len : 0; + + tcp_prequeue_process(sk); + + if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { + NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); + len -= chunk; + copied += chunk; + } + } + + tp->ucopy.task = NULL; + tp->ucopy.len = 0; + } + +#ifdef CONFIG_NET_DMA + tcp_service_net_dma(sk, true); /* Wait for queue to drain */ + tp->ucopy.dma_chan = NULL; + + if (tp->ucopy.pinned_list) { + dma_unpin_iovec_pages(tp->ucopy.pinned_list); + tp->ucopy.pinned_list = NULL; + } +#endif + + /* According to UNIX98, msg_name/msg_namelen are ignored + * on connected socket. I was just happy when found this 8) --ANK + */ + + /* Clean up data we have read: This will do ACK frames. */ + tcp_cleanup_rbuf(sk, copied); + + release_sock(sk); + + if (copied > 0) + uid_stat_tcp_rcv(current_uid(), copied); + return copied; + +out: + release_sock(sk); + return err; + +recv_urg: + err = tcp_recv_urg(sk, msg, len, flags); + if (err > 0) + uid_stat_tcp_rcv(current_uid(), err); + goto out; +} +EXPORT_SYMBOL(tcp_recvmsg); + +void tcp_set_state(struct sock *sk, int state) +{ + int oldstate = sk->sk_state; + + switch (state) { + case TCP_ESTABLISHED: + if (oldstate != TCP_ESTABLISHED) + TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); + break; + + case TCP_CLOSE: + if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED) + TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS); + + sk->sk_prot->unhash(sk); + if (inet_csk(sk)->icsk_bind_hash && + !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) + inet_put_port(sk); + /* fall through */ + default: + if (oldstate == TCP_ESTABLISHED) + TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); + } + + /* Change state AFTER socket is unhashed to avoid closed + * socket sitting in hash tables. + */ + sk->sk_state = state; + +#ifdef STATE_TRACE + SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]); +#endif +} +EXPORT_SYMBOL_GPL(tcp_set_state); + +/* + * State processing on a close. This implements the state shift for + * sending our FIN frame. Note that we only send a FIN for some + * states. A shutdown() may have already sent the FIN, or we may be + * closed. 
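+ *
+ * The new_state[] table below encodes that shift, e.g.
+ *
+ *	TCP_ESTABLISHED -> TCP_FIN_WAIT1  (TCP_ACTION_FIN: send a FIN)
+ *	TCP_CLOSE_WAIT  -> TCP_LAST_ACK   (TCP_ACTION_FIN: send a FIN)
+ *	TCP_SYN_SENT    -> TCP_CLOSE      (nothing to FIN yet)
+ *
+ * and is reached from user space through close(2) and
+ * shutdown(fd, SHUT_WR).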
+ */ + +static const unsigned char new_state[16] = { + /* current state: new state: action: */ + /* (Invalid) */ TCP_CLOSE, + /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, + /* TCP_SYN_SENT */ TCP_CLOSE, + /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, + /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1, + /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2, + /* TCP_TIME_WAIT */ TCP_CLOSE, + /* TCP_CLOSE */ TCP_CLOSE, + /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN, + /* TCP_LAST_ACK */ TCP_LAST_ACK, + /* TCP_LISTEN */ TCP_CLOSE, + /* TCP_CLOSING */ TCP_CLOSING, +}; + +static int tcp_close_state(struct sock *sk) +{ + int next = (int)new_state[sk->sk_state]; + int ns = next & TCP_STATE_MASK; + + tcp_set_state(sk, ns); + + return next & TCP_ACTION_FIN; +} + +/* + * Shutdown the sending side of a connection. Much like close except + * that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD). + */ + +void tcp_shutdown(struct sock *sk, int how) +{ + /* We need to grab some memory, and put together a FIN, + * and then put it into the queue to be sent. + * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92. + */ + if (!(how & SEND_SHUTDOWN)) + return; + + /* If we've already sent a FIN, or it's a closed state, skip this. */ + if ((1 << sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_SYN_SENT | + TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { + /* Clear out any half completed packets. FIN if needed. */ + if (tcp_close_state(sk)) + tcp_send_fin(sk); + } +} +EXPORT_SYMBOL(tcp_shutdown); + +void tcp_close(struct sock *sk, long timeout) +{ + struct sk_buff *skb; + int data_was_unread = 0; + int state; + + lock_sock(sk); + sk->sk_shutdown = SHUTDOWN_MASK; + + if (sk->sk_state == TCP_LISTEN) { + tcp_set_state(sk, TCP_CLOSE); + + /* Special case. */ + inet_csk_listen_stop(sk); + + goto adjudge_to_death; + } + + /* We need to flush the recv. buffs. We do this only on the + * descriptor close, not protocol-sourced closes, because the + * reader process may not have drained the data yet! + */ + while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { + u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - + tcp_hdr(skb)->fin; + data_was_unread += len; + __kfree_skb(skb); + } + + sk_mem_reclaim(sk); + + /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */ + if (sk->sk_state == TCP_CLOSE) + goto adjudge_to_death; + + /* As outlined in RFC 2525, section 2.17, we send a RST here because + * data was lost. To witness the awful effects of the old behavior of + * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk + * GET in an FTP client, suspend the process, wait for the client to + * advertise a zero window, then kill -9 the FTP client, wheee... + * Note: timeout is always zero in such a case. + */ + if (data_was_unread) { + /* Unread data was tossed, zap the connection. */ + NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk, sk->sk_allocation); + } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { + /* Check zero linger _after_ checking for unread data. */ + sk->sk_prot->disconnect(sk, 0); + NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA); + } else if (tcp_close_state(sk)) { + /* We FIN if the application ate all the data before + * zapping the connection. + */ + + /* RED-PEN. Formally speaking, we have broken TCP state + * machine. 
State transitions: + * + * TCP_ESTABLISHED -> TCP_FIN_WAIT1 + * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible) + * TCP_CLOSE_WAIT -> TCP_LAST_ACK + * + * are legal only when FIN has been sent (i.e. in window), + * rather than queued out of window. Purists blame. + * + * F.e. "RFC state" is ESTABLISHED, + * if Linux state is FIN-WAIT-1, but FIN is still not sent. + * + * The visible declinations are that sometimes + * we enter time-wait state, when it is not required really + * (harmless), do not send active resets, when they are + * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when + * they look as CLOSING or LAST_ACK for Linux) + * Probably, I missed some more holelets. + * --ANK + */ + tcp_send_fin(sk); + } + + sk_stream_wait_close(sk, timeout); + +adjudge_to_death: + state = sk->sk_state; + sock_hold(sk); + sock_orphan(sk); + + /* It is the last release_sock in its life. It will remove backlog. */ + release_sock(sk); + + + /* Now socket is owned by kernel and we acquire BH lock + to finish close. No need to check for user refs. + */ + local_bh_disable(); + bh_lock_sock(sk); + WARN_ON(sock_owned_by_user(sk)); + + percpu_counter_inc(sk->sk_prot->orphan_count); + + /* Have we already been destroyed by a softirq or backlog? */ + if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) + goto out; + + /* This is a (useful) BSD violating of the RFC. There is a + * problem with TCP as specified in that the other end could + * keep a socket open forever with no application left this end. + * We use a 3 minute timeout (about the same as BSD) then kill + * our end. If they send after that then tough - BUT: long enough + * that we won't make the old 4*rto = almost no time - whoops + * reset mistake. + * + * Nope, it was not mistake. It is really desired behaviour + * f.e. on http servers, when such sockets are useless, but + * consume significant resources. Let's do it with special + * linger2 option. --ANK + */ + + if (sk->sk_state == TCP_FIN_WAIT2) { + struct tcp_sock *tp = tcp_sk(sk); + if (tp->linger2 < 0) { + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk, GFP_ATOMIC); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPABORTONLINGER); + } else { + const int tmo = tcp_fin_time(sk); + + if (tmo > TCP_TIMEWAIT_LEN) { + inet_csk_reset_keepalive_timer(sk, + tmo - TCP_TIMEWAIT_LEN); + } else { + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto out; + } + } + } + if (sk->sk_state != TCP_CLOSE) { + sk_mem_reclaim(sk); + if (tcp_too_many_orphans(sk, 0)) { + if (net_ratelimit()) + printk(KERN_INFO "TCP: too many of orphaned " + "sockets\n"); + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk, GFP_ATOMIC); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPABORTONMEMORY); + } + } + + if (sk->sk_state == TCP_CLOSE) + inet_csk_destroy_sock(sk); + /* Otherwise, socket is reprieved until protocol close. 
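+	 * i.e. the orphan lives on until the FIN_WAIT2/TIME_WAIT handling
+	 * or the retransmit machinery finally moves it to TCP_CLOSE, and it
+	 * is then destroyed from timer/softirq context instead.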
*/ + +out: + bh_unlock_sock(sk); + local_bh_enable(); + sock_put(sk); +} +EXPORT_SYMBOL(tcp_close); + +/* These states need RST on ABORT according to RFC793 */ + +static inline int tcp_need_reset(int state) +{ + return (1 << state) & + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | + TCPF_FIN_WAIT2 | TCPF_SYN_RECV); +} + +int tcp_disconnect(struct sock *sk, int flags) +{ + struct inet_sock *inet = inet_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + int err = 0; + int old_state = sk->sk_state; + + if (old_state != TCP_CLOSE) + tcp_set_state(sk, TCP_CLOSE); + + /* ABORT function of RFC793 */ + if (old_state == TCP_LISTEN) { + inet_csk_listen_stop(sk); + } else if (tcp_need_reset(old_state) || + (tp->snd_nxt != tp->write_seq && + (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { + /* The last check adjusts for discrepancy of Linux wrt. RFC + * states + */ + tcp_send_active_reset(sk, gfp_any()); + sk->sk_err = ECONNRESET; + } else if (old_state == TCP_SYN_SENT) + sk->sk_err = ECONNRESET; + + tcp_clear_xmit_timers(sk); + __skb_queue_purge(&sk->sk_receive_queue); + tcp_write_queue_purge(sk); + __skb_queue_purge(&tp->out_of_order_queue); +#ifdef CONFIG_NET_DMA + __skb_queue_purge(&sk->sk_async_wait_queue); +#endif + + inet->inet_dport = 0; + + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); + + sk->sk_shutdown = 0; + sock_reset_flag(sk, SOCK_DONE); + tp->srtt = 0; + if ((tp->write_seq += tp->max_window + 2) == 0) + tp->write_seq = 1; + icsk->icsk_backoff = 0; + tp->snd_cwnd = 2; + icsk->icsk_probes_out = 0; + tp->packets_out = 0; + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + tp->snd_cwnd_cnt = 0; + tp->bytes_acked = 0; + tp->window_clamp = 0; + tcp_set_ca_state(sk, TCP_CA_Open); + tcp_clear_retrans(tp); + inet_csk_delack_init(sk); + tcp_init_send_head(sk); + memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); + __sk_dst_reset(sk); + + WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); + + sk->sk_error_report(sk); + return err; +} +EXPORT_SYMBOL(tcp_disconnect); + +/* + * Socket option code for TCP. 
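+ *
+ * These are the SOL_TCP options behind setsockopt(2)/getsockopt(2).
+ * A minimal, illustrative user-space sketch (fd is assumed to be a TCP
+ * socket, the values are examples only):
+ *
+ *	int one = 1;
+ *	unsigned int tmo_ms = 30000;
+ *	struct tcp_info ti;
+ *	socklen_t len = sizeof(ti);
+ *
+ *	setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
+ *	setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &tmo_ms, sizeof(tmo_ms));
+ *	getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len);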
+ */ +static int do_tcp_setsockopt(struct sock *sk, int level, + int optname, char __user *optval, unsigned int optlen) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + int val; + int err = 0; + + /* These are data/string values, all the others are ints */ + switch (optname) { + case TCP_CONGESTION: { + char name[TCP_CA_NAME_MAX]; + + if (optlen < 1) + return -EINVAL; + + val = strncpy_from_user(name, optval, + min_t(long, TCP_CA_NAME_MAX-1, optlen)); + if (val < 0) + return -EFAULT; + name[val] = 0; + + lock_sock(sk); + err = tcp_set_congestion_control(sk, name); + release_sock(sk); + return err; + } + case TCP_COOKIE_TRANSACTIONS: { + struct tcp_cookie_transactions ctd; + struct tcp_cookie_values *cvp = NULL; + + if (sizeof(ctd) > optlen) + return -EINVAL; + if (copy_from_user(&ctd, optval, sizeof(ctd))) + return -EFAULT; + + if (ctd.tcpct_used > sizeof(ctd.tcpct_value) || + ctd.tcpct_s_data_desired > TCP_MSS_DESIRED) + return -EINVAL; + + if (ctd.tcpct_cookie_desired == 0) { + /* default to global value */ + } else if ((0x1 & ctd.tcpct_cookie_desired) || + ctd.tcpct_cookie_desired > TCP_COOKIE_MAX || + ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) { + return -EINVAL; + } + + if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) { + /* Supercedes all other values */ + lock_sock(sk); + if (tp->cookie_values != NULL) { + kref_put(&tp->cookie_values->kref, + tcp_cookie_values_release); + tp->cookie_values = NULL; + } + tp->rx_opt.cookie_in_always = 0; /* false */ + tp->rx_opt.cookie_out_never = 1; /* true */ + release_sock(sk); + return err; + } + + /* Allocate ancillary memory before locking. + */ + if (ctd.tcpct_used > 0 || + (tp->cookie_values == NULL && + (sysctl_tcp_cookie_size > 0 || + ctd.tcpct_cookie_desired > 0 || + ctd.tcpct_s_data_desired > 0))) { + cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used, + GFP_KERNEL); + if (cvp == NULL) + return -ENOMEM; + + kref_init(&cvp->kref); + } + lock_sock(sk); + tp->rx_opt.cookie_in_always = + (TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags); + tp->rx_opt.cookie_out_never = 0; /* false */ + + if (tp->cookie_values != NULL) { + if (cvp != NULL) { + /* Changed values are recorded by a changed + * pointer, ensuring the cookie will differ, + * without separately hashing each value later. + */ + kref_put(&tp->cookie_values->kref, + tcp_cookie_values_release); + } else { + cvp = tp->cookie_values; + } + } + + if (cvp != NULL) { + cvp->cookie_desired = ctd.tcpct_cookie_desired; + + if (ctd.tcpct_used > 0) { + memcpy(cvp->s_data_payload, ctd.tcpct_value, + ctd.tcpct_used); + cvp->s_data_desired = ctd.tcpct_used; + cvp->s_data_constant = 1; /* true */ + } else { + /* No constant payload data. */ + cvp->s_data_desired = ctd.tcpct_s_data_desired; + cvp->s_data_constant = 0; /* false */ + } + + tp->cookie_values = cvp; + } + release_sock(sk); + return err; + } + default: + /* fallthru */ + break; + } + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + lock_sock(sk); + + switch (optname) { + case TCP_MAXSEG: + /* Values greater than interface MTU won't take effect. 
However + * at the point when this call is done we typically don't yet + * know which interface is going to be used */ + if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) { + err = -EINVAL; + break; + } + tp->rx_opt.user_mss = val; + break; + + case TCP_NODELAY: + if (val) { + /* TCP_NODELAY is weaker than TCP_CORK, so that + * this option on corked socket is remembered, but + * it is not activated until cork is cleared. + * + * However, when TCP_NODELAY is set we make + * an explicit push, which overrides even TCP_CORK + * for currently queued segments. + */ + tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; + tcp_push_pending_frames(sk); + } else { + tp->nonagle &= ~TCP_NAGLE_OFF; + } + break; + + case TCP_THIN_LINEAR_TIMEOUTS: + if (val < 0 || val > 1) + err = -EINVAL; + else + tp->thin_lto = val; + break; + + case TCP_THIN_DUPACK: + if (val < 0 || val > 1) + err = -EINVAL; + else + tp->thin_dupack = val; + break; + + case TCP_CORK: + /* When set indicates to always queue non-full frames. + * Later the user clears this option and we transmit + * any pending partial frames in the queue. This is + * meant to be used alongside sendfile() to get properly + * filled frames when the user (for example) must write + * out headers with a write() call first and then use + * sendfile to send out the data parts. + * + * TCP_CORK can be set together with TCP_NODELAY and it is + * stronger than TCP_NODELAY. + */ + if (val) { + tp->nonagle |= TCP_NAGLE_CORK; + } else { + tp->nonagle &= ~TCP_NAGLE_CORK; + if (tp->nonagle&TCP_NAGLE_OFF) + tp->nonagle |= TCP_NAGLE_PUSH; + tcp_push_pending_frames(sk); + } + break; + + case TCP_KEEPIDLE: + if (val < 1 || val > MAX_TCP_KEEPIDLE) + err = -EINVAL; + else { + tp->keepalive_time = val * HZ; + if (sock_flag(sk, SOCK_KEEPOPEN) && + !((1 << sk->sk_state) & + (TCPF_CLOSE | TCPF_LISTEN))) { + u32 elapsed = keepalive_time_elapsed(tp); + if (tp->keepalive_time > elapsed) + elapsed = tp->keepalive_time - elapsed; + else + elapsed = 0; + inet_csk_reset_keepalive_timer(sk, elapsed); + } + } + break; + case TCP_KEEPINTVL: + if (val < 1 || val > MAX_TCP_KEEPINTVL) + err = -EINVAL; + else + tp->keepalive_intvl = val * HZ; + break; + case TCP_KEEPCNT: + if (val < 1 || val > MAX_TCP_KEEPCNT) + err = -EINVAL; + else + tp->keepalive_probes = val; + break; + case TCP_SYNCNT: + if (val < 1 || val > MAX_TCP_SYNCNT) + err = -EINVAL; + else + icsk->icsk_syn_retries = val; + break; + + case TCP_LINGER2: + if (val < 0) + tp->linger2 = -1; + else if (val > sysctl_tcp_fin_timeout / HZ) + tp->linger2 = 0; + else + tp->linger2 = val * HZ; + break; + + case TCP_DEFER_ACCEPT: + /* Translate value in seconds to number of retransmits */ + icsk->icsk_accept_queue.rskq_defer_accept = + secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, + TCP_RTO_MAX / HZ); + break; + + case TCP_WINDOW_CLAMP: + if (!val) { + if (sk->sk_state != TCP_CLOSE) { + err = -EINVAL; + break; + } + tp->window_clamp = 0; + } else + tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ? 
+ SOCK_MIN_RCVBUF / 2 : val; + break; + + case TCP_QUICKACK: + if (!val) { + icsk->icsk_ack.pingpong = 1; + } else { + icsk->icsk_ack.pingpong = 0; + if ((1 << sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && + inet_csk_ack_scheduled(sk)) { + icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; + tcp_cleanup_rbuf(sk, 1); + if (!(val & 1)) + icsk->icsk_ack.pingpong = 1; + } + } + break; + +#ifdef CONFIG_TCP_MD5SIG + case TCP_MD5SIG: + /* Read the IP->Key mappings from userspace */ + err = tp->af_specific->md5_parse(sk, optval, optlen); + break; +#endif + case TCP_USER_TIMEOUT: + /* Cap the max timeout in ms TCP will retry/retrans + * before giving up and aborting (ETIMEDOUT) a connection. + */ + icsk->icsk_user_timeout = msecs_to_jiffies(val); + break; + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, + unsigned int optlen) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + if (level != SOL_TCP) + return icsk->icsk_af_ops->setsockopt(sk, level, optname, + optval, optlen); + return do_tcp_setsockopt(sk, level, optname, optval, optlen); +} +EXPORT_SYMBOL(tcp_setsockopt); + +#ifdef CONFIG_COMPAT +int compat_tcp_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + if (level != SOL_TCP) + return inet_csk_compat_setsockopt(sk, level, optname, + optval, optlen); + return do_tcp_setsockopt(sk, level, optname, optval, optlen); +} +EXPORT_SYMBOL(compat_tcp_setsockopt); +#endif + +/* Return information about state of tcp endpoint in API format. */ +void tcp_get_info(struct sock *sk, struct tcp_info *info) +{ + struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + u32 now = tcp_time_stamp; + + memset(info, 0, sizeof(*info)); + + info->tcpi_state = sk->sk_state; + info->tcpi_ca_state = icsk->icsk_ca_state; + info->tcpi_retransmits = icsk->icsk_retransmits; + info->tcpi_probes = icsk->icsk_probes_out; + info->tcpi_backoff = icsk->icsk_backoff; + + if (tp->rx_opt.tstamp_ok) + info->tcpi_options |= TCPI_OPT_TIMESTAMPS; + if (tcp_is_sack(tp)) + info->tcpi_options |= TCPI_OPT_SACK; + if (tp->rx_opt.wscale_ok) { + info->tcpi_options |= TCPI_OPT_WSCALE; + info->tcpi_snd_wscale = tp->rx_opt.snd_wscale; + info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; + } + + if (tp->ecn_flags&TCP_ECN_OK) + info->tcpi_options |= TCPI_OPT_ECN; + + info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); + info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); + info->tcpi_snd_mss = tp->mss_cache; + info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; + + if (sk->sk_state == TCP_LISTEN) { + info->tcpi_unacked = sk->sk_ack_backlog; + info->tcpi_sacked = sk->sk_max_ack_backlog; + } else { + info->tcpi_unacked = tp->packets_out; + info->tcpi_sacked = tp->sacked_out; + } + info->tcpi_lost = tp->lost_out; + info->tcpi_retrans = tp->retrans_out; + info->tcpi_fackets = tp->fackets_out; + + info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); + info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); + info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); + + info->tcpi_pmtu = icsk->icsk_pmtu_cookie; + info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; + info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; + info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; + info->tcpi_snd_ssthresh = tp->snd_ssthresh; + info->tcpi_snd_cwnd = tp->snd_cwnd; + info->tcpi_advmss = tp->advmss; + info->tcpi_reordering = 
tp->reordering; + + info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3; + info->tcpi_rcv_space = tp->rcvq_space.space; + + info->tcpi_total_retrans = tp->total_retrans; +} +EXPORT_SYMBOL_GPL(tcp_get_info); + +static int do_tcp_getsockopt(struct sock *sk, int level, + int optname, char __user *optval, int __user *optlen) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + int val, len; + + if (get_user(len, optlen)) + return -EFAULT; + + len = min_t(unsigned int, len, sizeof(int)); + + if (len < 0) + return -EINVAL; + + switch (optname) { + case TCP_MAXSEG: + val = tp->mss_cache; + if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) + val = tp->rx_opt.user_mss; + break; + case TCP_NODELAY: + val = !!(tp->nonagle&TCP_NAGLE_OFF); + break; + case TCP_CORK: + val = !!(tp->nonagle&TCP_NAGLE_CORK); + break; + case TCP_KEEPIDLE: + val = keepalive_time_when(tp) / HZ; + break; + case TCP_KEEPINTVL: + val = keepalive_intvl_when(tp) / HZ; + break; + case TCP_KEEPCNT: + val = keepalive_probes(tp); + break; + case TCP_SYNCNT: + val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; + break; + case TCP_LINGER2: + val = tp->linger2; + if (val >= 0) + val = (val ? : sysctl_tcp_fin_timeout) / HZ; + break; + case TCP_DEFER_ACCEPT: + val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept, + TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ); + break; + case TCP_WINDOW_CLAMP: + val = tp->window_clamp; + break; + case TCP_INFO: { + struct tcp_info info; + + if (get_user(len, optlen)) + return -EFAULT; + + tcp_get_info(sk, &info); + + len = min_t(unsigned int, len, sizeof(info)); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &info, len)) + return -EFAULT; + return 0; + } + case TCP_QUICKACK: + val = !icsk->icsk_ack.pingpong; + break; + + case TCP_CONGESTION: + if (get_user(len, optlen)) + return -EFAULT; + len = min_t(unsigned int, len, TCP_CA_NAME_MAX); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) + return -EFAULT; + return 0; + + case TCP_COOKIE_TRANSACTIONS: { + struct tcp_cookie_transactions ctd; + struct tcp_cookie_values *cvp = tp->cookie_values; + + if (get_user(len, optlen)) + return -EFAULT; + if (len < sizeof(ctd)) + return -EINVAL; + + memset(&ctd, 0, sizeof(ctd)); + ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ? + TCP_COOKIE_IN_ALWAYS : 0) + | (tp->rx_opt.cookie_out_never ? + TCP_COOKIE_OUT_NEVER : 0); + + if (cvp != NULL) { + ctd.tcpct_flags |= (cvp->s_data_in ? + TCP_S_DATA_IN : 0) + | (cvp->s_data_out ? 
+ TCP_S_DATA_OUT : 0); + + ctd.tcpct_cookie_desired = cvp->cookie_desired; + ctd.tcpct_s_data_desired = cvp->s_data_desired; + + memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0], + cvp->cookie_pair_size); + ctd.tcpct_used = cvp->cookie_pair_size; + } + + if (put_user(sizeof(ctd), optlen)) + return -EFAULT; + if (copy_to_user(optval, &ctd, sizeof(ctd))) + return -EFAULT; + return 0; + } + case TCP_THIN_LINEAR_TIMEOUTS: + val = tp->thin_lto; + break; + case TCP_THIN_DUPACK: + val = tp->thin_dupack; + break; + + case TCP_USER_TIMEOUT: + val = jiffies_to_msecs(icsk->icsk_user_timeout); + break; + default: + return -ENOPROTOOPT; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + return 0; +} + +int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, + int __user *optlen) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + if (level != SOL_TCP) + return icsk->icsk_af_ops->getsockopt(sk, level, optname, + optval, optlen); + return do_tcp_getsockopt(sk, level, optname, optval, optlen); +} +EXPORT_SYMBOL(tcp_getsockopt); + +#ifdef CONFIG_COMPAT +int compat_tcp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + if (level != SOL_TCP) + return inet_csk_compat_getsockopt(sk, level, optname, + optval, optlen); + return do_tcp_getsockopt(sk, level, optname, optval, optlen); +} +EXPORT_SYMBOL(compat_tcp_getsockopt); +#endif + +struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct tcphdr *th; + unsigned thlen; + unsigned int seq; + __be32 delta; + unsigned int oldlen; + unsigned int mss; + + if (!pskb_may_pull(skb, sizeof(*th))) + goto out; + + th = tcp_hdr(skb); + thlen = th->doff * 4; + if (thlen < sizeof(*th)) + goto out; + + if (!pskb_may_pull(skb, thlen)) + goto out; + + oldlen = (u16)~skb->len; + __skb_pull(skb, thlen); + + mss = skb_shinfo(skb)->gso_size; + if (unlikely(skb->len <= mss)) + goto out; + + if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { + /* Packet is from an untrusted source, reset gso_segs. 
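+		 * (SKB_GSO_DODGY, e.g. forwarded from a guest): keep it
+		 * unsegmented since the device features allow it, but
+		 * recompute gso_segs as DIV_ROUND_UP(len, mss) so later
+		 * accounting cannot be fooled by a bogus value.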
*/ + int type = skb_shinfo(skb)->gso_type; + + if (unlikely(type & + ~(SKB_GSO_TCPV4 | + SKB_GSO_DODGY | + SKB_GSO_TCP_ECN | + SKB_GSO_TCPV6 | + 0) || + !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) + goto out; + + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); + + segs = NULL; + goto out; + } + + segs = skb_segment(skb, features); + if (IS_ERR(segs)) + goto out; + + delta = htonl(oldlen + (thlen + mss)); + + skb = segs; + th = tcp_hdr(skb); + seq = ntohl(th->seq); + + do { + th->fin = th->psh = 0; + + th->check = ~csum_fold((__force __wsum)((__force u32)th->check + + (__force u32)delta)); + if (skb->ip_summed != CHECKSUM_PARTIAL) + th->check = + csum_fold(csum_partial(skb_transport_header(skb), + thlen, skb->csum)); + + seq += mss; + skb = skb->next; + th = tcp_hdr(skb); + + th->seq = htonl(seq); + th->cwr = 0; + } while (skb->next); + + delta = htonl(oldlen + (skb->tail - skb->transport_header) + + skb->data_len); + th->check = ~csum_fold((__force __wsum)((__force u32)th->check + + (__force u32)delta)); + if (skb->ip_summed != CHECKSUM_PARTIAL) + th->check = csum_fold(csum_partial(skb_transport_header(skb), + thlen, skb->csum)); + +out: + return segs; +} +EXPORT_SYMBOL(tcp_tso_segment); + +struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) +{ + struct sk_buff **pp = NULL; + struct sk_buff *p; + struct tcphdr *th; + struct tcphdr *th2; + unsigned int len; + unsigned int thlen; + __be32 flags; + unsigned int mss = 1; + unsigned int hlen; + unsigned int off; + int flush = 1; + int i; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*th); + th = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + th = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!th)) + goto out; + } + + thlen = th->doff * 4; + if (thlen < sizeof(*th)) + goto out; + + hlen = off + thlen; + if (skb_gro_header_hard(skb, hlen)) { + th = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!th)) + goto out; + } + + skb_gro_pull(skb, thlen); + + len = skb_gro_len(skb); + flags = tcp_flag_word(th); + + for (; (p = *head); head = &p->next) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + th2 = tcp_hdr(p); + + if (*(u32 *)&th->source ^ *(u32 *)&th2->source) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + goto found; + } + + goto out_check_final; + +found: + flush = NAPI_GRO_CB(p)->flush; + flush |= (__force int)(flags & TCP_FLAG_CWR); + flush |= (__force int)((flags ^ tcp_flag_word(th2)) & + ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); + flush |= (__force int)(th->ack_seq ^ th2->ack_seq); + for (i = sizeof(*th); i < thlen; i += 4) + flush |= *(u32 *)((u8 *)th + i) ^ + *(u32 *)((u8 *)th2 + i); + + mss = skb_shinfo(p)->gso_size; + + flush |= (len - 1) >= mss; + flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); + + if (flush || skb_gro_receive(head, skb)) { + mss = 1; + goto out_check_final; + } + + p = *head; + th2 = tcp_hdr(p); + tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); + +out_check_final: + flush = len < mss; + flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH | + TCP_FLAG_RST | TCP_FLAG_SYN | + TCP_FLAG_FIN)); + + if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) + pp = head; + +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} +EXPORT_SYMBOL(tcp_gro_receive); + +int tcp_gro_complete(struct sk_buff *skb) +{ + struct tcphdr *th = tcp_hdr(skb); + + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + + 
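tcp_tso_segment() and tcp_gro_complete() patch th->check by adding a precomputed length delta to the old value and refolding it, instead of rehashing the payload. The standalone sketch below shows the ones'-complement arithmetic that makes this valid; the function names are illustrative and are not the kernel's csum helpers.

/* Ones'-complement (Internet) checksum and fold, illustrating why a length
 * change can be absorbed by "add the delta and refold". */
#include <stdint.h>
#include <stdio.h>

static uint16_t fold32(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* RFC 1071 style checksum over a byte buffer. */
static uint16_t csum16(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;

	while (len > 1) {
		sum += (uint32_t)buf[0] << 8 | buf[1];
		buf += 2;
		len -= 2;
	}
	if (len)
		sum += (uint32_t)buf[0] << 8;
	return (uint16_t)~fold32(sum);
}

int main(void)
{
	uint8_t pkt[10] = { 1, 2, 3, 4, 5, 6, 7, 8, 0, 0 };
	uint16_t c = csum16(pkt, 8);

	/* Store the checksum in the last word: a correct checksum makes the
	 * complemented total come out zero. */
	pkt[8] = c >> 8;
	pkt[9] = c & 0xff;
	printf("checksum=0x%04x, total verifies=%s\n",
	       c, csum16(pkt, 10) == 0 ? "yes" : "no");
	return 0;
}

Because the checksum is just a folded ones'-complement sum, changing the covered length only requires adding the difference into the old sum and folding again, which is what the per-segment loop above does.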
skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + + if (th->cwr) + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + + return 0; +} +EXPORT_SYMBOL(tcp_gro_complete); + +#ifdef CONFIG_TCP_MD5SIG +static unsigned long tcp_md5sig_users; +static struct tcp_md5sig_pool * __percpu *tcp_md5sig_pool; +static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); + +static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool * __percpu *pool) +{ + int cpu; + for_each_possible_cpu(cpu) { + struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu); + if (p) { + if (p->md5_desc.tfm) + crypto_free_hash(p->md5_desc.tfm); + kfree(p); + } + } + free_percpu(pool); +} + +void tcp_free_md5sig_pool(void) +{ + struct tcp_md5sig_pool * __percpu *pool = NULL; + + spin_lock_bh(&tcp_md5sig_pool_lock); + if (--tcp_md5sig_users == 0) { + pool = tcp_md5sig_pool; + tcp_md5sig_pool = NULL; + } + spin_unlock_bh(&tcp_md5sig_pool_lock); + if (pool) + __tcp_free_md5sig_pool(pool); +} +EXPORT_SYMBOL(tcp_free_md5sig_pool); + +static struct tcp_md5sig_pool * __percpu * +__tcp_alloc_md5sig_pool(struct sock *sk) +{ + int cpu; + struct tcp_md5sig_pool * __percpu *pool; + + pool = alloc_percpu(struct tcp_md5sig_pool *); + if (!pool) + return NULL; + + for_each_possible_cpu(cpu) { + struct tcp_md5sig_pool *p; + struct crypto_hash *hash; + + p = kzalloc(sizeof(*p), sk->sk_allocation); + if (!p) + goto out_free; + *per_cpu_ptr(pool, cpu) = p; + + hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); + if (!hash || IS_ERR(hash)) + goto out_free; + + p->md5_desc.tfm = hash; + } + return pool; +out_free: + __tcp_free_md5sig_pool(pool); + return NULL; +} + +struct tcp_md5sig_pool * __percpu *tcp_alloc_md5sig_pool(struct sock *sk) +{ + struct tcp_md5sig_pool * __percpu *pool; + int alloc = 0; + +retry: + spin_lock_bh(&tcp_md5sig_pool_lock); + pool = tcp_md5sig_pool; + if (tcp_md5sig_users++ == 0) { + alloc = 1; + spin_unlock_bh(&tcp_md5sig_pool_lock); + } else if (!pool) { + tcp_md5sig_users--; + spin_unlock_bh(&tcp_md5sig_pool_lock); + cpu_relax(); + goto retry; + } else + spin_unlock_bh(&tcp_md5sig_pool_lock); + + if (alloc) { + /* we cannot hold spinlock here because this may sleep. */ + struct tcp_md5sig_pool * __percpu *p; + + p = __tcp_alloc_md5sig_pool(sk); + spin_lock_bh(&tcp_md5sig_pool_lock); + if (!p) { + tcp_md5sig_users--; + spin_unlock_bh(&tcp_md5sig_pool_lock); + return NULL; + } + pool = tcp_md5sig_pool; + if (pool) { + /* oops, it has already been assigned. */ + spin_unlock_bh(&tcp_md5sig_pool_lock); + __tcp_free_md5sig_pool(p); + } else { + tcp_md5sig_pool = pool = p; + spin_unlock_bh(&tcp_md5sig_pool_lock); + } + } + return pool; +} +EXPORT_SYMBOL(tcp_alloc_md5sig_pool); + + +/** + * tcp_get_md5sig_pool - get md5sig_pool for this user + * + * We use percpu structure, so if we succeed, we exit with preemption + * and BH disabled, to make sure another thread or softirq handling + * wont try to get same context. 
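tcp_alloc_md5sig_pool() cannot allocate while holding the spinlock (allocation may sleep), so it takes a reference, drops the lock, allocates, and then either publishes the result or frees it if another caller won the race. Below is a userspace sketch of the same pattern with a pthread mutex; the names and types are purely illustrative.

/* "Reserve, allocate outside the lock, publish or back off" pattern. */
#include <pthread.h>
#include <stdlib.h>

struct pool { int dummy; };

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static struct pool *the_pool;
static unsigned long pool_users;

static struct pool *pool_get(void)
{
	struct pool *p, *fresh = NULL;

	pthread_mutex_lock(&pool_lock);
	pool_users++;
	p = the_pool;
	pthread_mutex_unlock(&pool_lock);
	if (p)
		return p;		/* fast path: pool already exists */

	/* Slow path: allocate with no lock held (the kernel's equivalent may
	 * sleep inside kzalloc()/crypto_alloc_hash()). */
	fresh = calloc(1, sizeof(*fresh));

	pthread_mutex_lock(&pool_lock);
	if (the_pool) {
		p = the_pool;		/* lost the race: use the winner's pool */
	} else if (fresh) {
		the_pool = p = fresh;	/* won the race: publish ours */
		fresh = NULL;
	} else {
		pool_users--;		/* allocation failed: drop the reference */
	}
	pthread_mutex_unlock(&pool_lock);

	free(fresh);			/* frees the loser's allocation, if any */
	return p;
}

static void pool_put(void)
{
	struct pool *dead = NULL;

	pthread_mutex_lock(&pool_lock);
	if (--pool_users == 0) {
		dead = the_pool;	/* last user tears the pool down */
		the_pool = NULL;
	}
	pthread_mutex_unlock(&pool_lock);
	free(dead);
}

int main(void)
{
	if (pool_get())
		pool_put();
	return 0;
}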
+ */ +struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) +{ + struct tcp_md5sig_pool * __percpu *p; + + local_bh_disable(); + + spin_lock(&tcp_md5sig_pool_lock); + p = tcp_md5sig_pool; + if (p) + tcp_md5sig_users++; + spin_unlock(&tcp_md5sig_pool_lock); + + if (p) + return *this_cpu_ptr(p); + + local_bh_enable(); + return NULL; +} +EXPORT_SYMBOL(tcp_get_md5sig_pool); + +void tcp_put_md5sig_pool(void) +{ + local_bh_enable(); + tcp_free_md5sig_pool(); +} +EXPORT_SYMBOL(tcp_put_md5sig_pool); + +int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, + struct tcphdr *th) +{ + struct scatterlist sg; + int err; + + __sum16 old_checksum = th->check; + th->check = 0; + /* options aren't included in the hash */ + sg_init_one(&sg, th, sizeof(struct tcphdr)); + err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(struct tcphdr)); + th->check = old_checksum; + return err; +} +EXPORT_SYMBOL(tcp_md5_hash_header); + +int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, + struct sk_buff *skb, unsigned header_len) +{ + struct scatterlist sg; + const struct tcphdr *tp = tcp_hdr(skb); + struct hash_desc *desc = &hp->md5_desc; + unsigned i; + const unsigned head_data_len = skb_headlen(skb) > header_len ? + skb_headlen(skb) - header_len : 0; + const struct skb_shared_info *shi = skb_shinfo(skb); + struct sk_buff *frag_iter; + + sg_init_table(&sg, 1); + + sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len); + if (crypto_hash_update(desc, &sg, head_data_len)) + return 1; + + for (i = 0; i < shi->nr_frags; ++i) { + const struct skb_frag_struct *f = &shi->frags[i]; + sg_set_page(&sg, f->page, f->size, f->page_offset); + if (crypto_hash_update(desc, &sg, f->size)) + return 1; + } + + skb_walk_frags(skb, frag_iter) + if (tcp_md5_hash_skb_data(hp, frag_iter, 0)) + return 1; + + return 0; +} +EXPORT_SYMBOL(tcp_md5_hash_skb_data); + +int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key) +{ + struct scatterlist sg; + + sg_init_one(&sg, key->key, key->keylen); + return crypto_hash_update(&hp->md5_desc, &sg, key->keylen); +} +EXPORT_SYMBOL(tcp_md5_hash_key); + +#endif + +/** + * Each Responder maintains up to two secret values concurrently for + * efficient secret rollover. Each secret value has 4 states: + * + * Generating. (tcp_secret_generating != tcp_secret_primary) + * Generates new Responder-Cookies, but not yet used for primary + * verification. This is a short-term state, typically lasting only + * one round trip time (RTT). + * + * Primary. (tcp_secret_generating == tcp_secret_primary) + * Used both for generation and primary verification. + * + * Retiring. (tcp_secret_retiring != tcp_secret_secondary) + * Used for verification, until the first failure that can be + * verified by the newer Generating secret. At that time, this + * cookie's state is changed to Secondary, and the Generating + * cookie's state is changed to Primary. This is a short-term state, + * typically lasting only one round trip time (RTT). + * + * Secondary. (tcp_secret_retiring == tcp_secret_secondary) + * Used for secondary verification, after primary verification + * failures. This state lasts no more than twice the Maximum Segment + * Lifetime (2MSL). Then, the secret is discarded. + */ +struct tcp_cookie_secret { + /* The secret is divided into two parts. The digest part is the + * equivalent of previously hashing a secret and saving the state, + * and serves as an initialization vector (IV). The message part + * serves as the trailing secret. 
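The hashing helpers above implement the kernel side of TCP MD5 signatures (RFC 2385). From userspace a per-peer key is installed with the TCP_MD5SIG socket option before connect() or listen(); the sketch below assumes the uapi layout of struct tcp_md5sig from <linux/tcp.h> (tcpm_addr, tcpm_keylen, tcpm_key).

/* Install an RFC 2385 MD5 key for one IPv4 peer on a TCP socket. */
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>		/* TCP_MD5SIG, struct tcp_md5sig */

int set_md5_key(int fd, const struct sockaddr_in *peer,
		const void *key, int keylen)
{
	struct tcp_md5sig sig;

	if (keylen < 0 || keylen > TCP_MD5SIG_MAXKEYLEN)
		return -1;

	memset(&sig, 0, sizeof(sig));
	memcpy(&sig.tcpm_addr, peer, sizeof(*peer));	/* peer the key applies to */
	sig.tcpm_keylen = keylen;
	memcpy(sig.tcpm_key, key, keylen);

	/* Set before connect()/listen(); both ends must share the key. */
	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &sig, sizeof(sig));
}

Segments from that peer are then signed and verified with the per-cpu pools managed above; a mismatched or missing key makes the segment fail validation.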
+ */ + u32 secrets[COOKIE_WORKSPACE_WORDS]; + unsigned long expires; +}; + +#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL) +#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2) +#define TCP_SECRET_LIFE (HZ * 600) + +static struct tcp_cookie_secret tcp_secret_one; +static struct tcp_cookie_secret tcp_secret_two; + +/* Essentially a circular list, without dynamic allocation. */ +static struct tcp_cookie_secret *tcp_secret_generating; +static struct tcp_cookie_secret *tcp_secret_primary; +static struct tcp_cookie_secret *tcp_secret_retiring; +static struct tcp_cookie_secret *tcp_secret_secondary; + +static DEFINE_SPINLOCK(tcp_secret_locker); + +/* Select a pseudo-random word in the cookie workspace. + */ +static inline u32 tcp_cookie_work(const u32 *ws, const int n) +{ + return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])]; +} + +/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed. + * Called in softirq context. + * Returns: 0 for success. + */ +int tcp_cookie_generator(u32 *bakery) +{ + unsigned long jiffy = jiffies; + + if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) { + spin_lock_bh(&tcp_secret_locker); + if (!time_after_eq(jiffy, tcp_secret_generating->expires)) { + /* refreshed by another */ + memcpy(bakery, + &tcp_secret_generating->secrets[0], + COOKIE_WORKSPACE_WORDS); + } else { + /* still needs refreshing */ + get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS); + + /* The first time, paranoia assumes that the + * randomization function isn't as strong. But, + * this secret initialization is delayed until + * the last possible moment (packet arrival). + * Although that time is observable, it is + * unpredictably variable. Mash in the most + * volatile clock bits available, and expire the + * secret extra quickly. + */ + if (unlikely(tcp_secret_primary->expires == + tcp_secret_secondary->expires)) { + struct timespec tv; + + getnstimeofday(&tv); + bakery[COOKIE_DIGEST_WORDS+0] ^= + (u32)tv.tv_nsec; + + tcp_secret_secondary->expires = jiffy + + TCP_SECRET_1MSL + + (0x0f & tcp_cookie_work(bakery, 0)); + } else { + tcp_secret_secondary->expires = jiffy + + TCP_SECRET_LIFE + + (0xff & tcp_cookie_work(bakery, 1)); + tcp_secret_primary->expires = jiffy + + TCP_SECRET_2MSL + + (0x1f & tcp_cookie_work(bakery, 2)); + } + memcpy(&tcp_secret_secondary->secrets[0], + bakery, COOKIE_WORKSPACE_WORDS); + + rcu_assign_pointer(tcp_secret_generating, + tcp_secret_secondary); + rcu_assign_pointer(tcp_secret_retiring, + tcp_secret_primary); + /* + * Neither call_rcu() nor synchronize_rcu() needed. + * Retiring data is not freed. It is replaced after + * further (locked) pointer updates, and a quiet time + * (minimum 1MSL, maximum LIFE - 2MSL). 
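The state machine described above keeps two secrets and rotates between them so that cookies issued just before a rollover still verify. Below is a much-simplified two-slot rotation in plain C to make the idea concrete; it is not the kernel's exact Generating/Primary/Retiring/Secondary sequence, and random() is only a stand-in for a real random source.

/* Two-slot secret rotation: generate with the current secret, keep the
 * previous one around as a verification fallback until it ages out. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define SECRET_WORDS 8
#define SECRET_LIFE  600			/* seconds a secret generates cookies */

struct secret {
	uint32_t words[SECRET_WORDS];
	time_t expires;
};

static struct secret slots[2];			/* both start out expired */
static int current_slot;			/* index of the generating secret */

static void refresh(struct secret *s, time_t now)
{
	/* random() stands in for get_random_bytes(); seed it elsewhere. */
	for (int i = 0; i < SECRET_WORDS; i++)
		s->words[i] = (uint32_t)random();
	s->expires = now + SECRET_LIFE;
}

static const struct secret *generating_secret(void)
{
	time_t now = time(NULL);

	if (now >= slots[current_slot].expires) {
		/* Rotate: the old generating secret becomes the fallback and
		 * the other slot is refreshed and takes over generation. */
		current_slot ^= 1;
		refresh(&slots[current_slot], now);
	}
	return &slots[current_slot];
}

static const struct secret *fallback_secret(void)
{
	return &slots[current_slot ^ 1];	/* tried only after the current
						 * secret fails verification */
}

int main(void)
{
	printf("generating[0]=%08x fallback[0]=%08x\n",
	       (unsigned)generating_secret()->words[0],
	       (unsigned)fallback_secret()->words[0]);
	return 0;
}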
+ */ + } + spin_unlock_bh(&tcp_secret_locker); + } else { + rcu_read_lock_bh(); + memcpy(bakery, + &rcu_dereference(tcp_secret_generating)->secrets[0], + COOKIE_WORKSPACE_WORDS); + rcu_read_unlock_bh(); + } + return 0; +} +EXPORT_SYMBOL(tcp_cookie_generator); + +void tcp_done(struct sock *sk) +{ + if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); + + tcp_set_state(sk, TCP_CLOSE); + tcp_clear_xmit_timers(sk); + + sk->sk_shutdown = SHUTDOWN_MASK; + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + else + inet_csk_destroy_sock(sk); +} +EXPORT_SYMBOL_GPL(tcp_done); + +extern struct tcp_congestion_ops tcp_reno; + +static __initdata unsigned long thash_entries; +static int __init set_thash_entries(char *str) +{ + if (!str) + return 0; + thash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("thash_entries=", set_thash_entries); + +void __init tcp_init(void) +{ + struct sk_buff *skb = NULL; + unsigned long limit; + int i, max_rshare, max_wshare, cnt; + unsigned long jiffy = jiffies; + + BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); + + percpu_counter_init(&tcp_sockets_allocated, 0); + percpu_counter_init(&tcp_orphan_count, 0); + tcp_hashinfo.bind_bucket_cachep = + kmem_cache_create("tcp_bind_bucket", + sizeof(struct inet_bind_bucket), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + + /* Size and allocate the main established and bind bucket + * hash tables. + * + * The methodology is similar to that of the buffer cache. + */ + tcp_hashinfo.ehash = + alloc_large_system_hash("TCP established", + sizeof(struct inet_ehash_bucket), + thash_entries, + (totalram_pages >= 128 * 1024) ? + 13 : 15, + 0, + NULL, + &tcp_hashinfo.ehash_mask, + thash_entries ? 0 : 512 * 1024); + for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) { + INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); + INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i); + } + if (inet_ehash_locks_alloc(&tcp_hashinfo)) + panic("TCP: failed to alloc ehash_locks"); + tcp_hashinfo.bhash = + alloc_large_system_hash("TCP bind", + sizeof(struct inet_bind_hashbucket), + tcp_hashinfo.ehash_mask + 1, + (totalram_pages >= 128 * 1024) ? 
+ 13 : 15, + 0, + &tcp_hashinfo.bhash_size, + NULL, + 64 * 1024); + tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; + for (i = 0; i < tcp_hashinfo.bhash_size; i++) { + spin_lock_init(&tcp_hashinfo.bhash[i].lock); + INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); + } + + + cnt = tcp_hashinfo.ehash_mask + 1; + + tcp_death_row.sysctl_max_tw_buckets = cnt / 2; + sysctl_tcp_max_orphans = cnt / 2; + sysctl_max_syn_backlog = max(128, cnt / 256); + + limit = nr_free_buffer_pages() / 8; + limit = max(limit, 128UL); + sysctl_tcp_mem[0] = limit / 4 * 3; + sysctl_tcp_mem[1] = limit; + sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; + + /* Set per-socket limits to no more than 1/128 the pressure threshold */ + limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7); + max_wshare = min(4UL*1024*1024, limit); + max_rshare = min(6UL*1024*1024, limit); + + sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; + sysctl_tcp_wmem[1] = 16*1024; + sysctl_tcp_wmem[2] = max(64*1024, max_wshare); + + sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; + sysctl_tcp_rmem[1] = 87380; + sysctl_tcp_rmem[2] = max(87380, max_rshare); + + printk(KERN_INFO "TCP: Hash tables configured " + "(established %u bind %u)\n", + tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); + + tcp_register_congestion_control(&tcp_reno); + + memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets)); + memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets)); + tcp_secret_one.expires = jiffy; /* past due */ + tcp_secret_two.expires = jiffy; /* past due */ + tcp_secret_generating = &tcp_secret_one; + tcp_secret_primary = &tcp_secret_one; + tcp_secret_retiring = &tcp_secret_two; + tcp_secret_secondary = &tcp_secret_two; +} + +static int tcp_is_local(struct net *net, __be32 addr) { + struct rtable *rt; + struct flowi4 fl4 = { .daddr = addr }; + rt = ip_route_output_key(net, &fl4); + if (IS_ERR_OR_NULL(rt)) + return 0; + return rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK); +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static int tcp_is_local6(struct net *net, struct in6_addr *addr) { + struct rt6_info *rt6 = rt6_lookup(net, addr, addr, 0, 0); + return rt6 && rt6->rt6i_dev && (rt6->rt6i_dev->flags & IFF_LOOPBACK); +} +#endif + +/* + * tcp_nuke_addr - destroy all sockets on the given local address + * if local address is the unspecified address (0.0.0.0 or ::), destroy all + * sockets with local addresses that are not configured. 
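tcp_init() sizes the global memory thresholds from the free buffer pages and then caps per-socket send/receive buffers at 1/128 of the pressure threshold, with 4 MB and 6 MB ceilings. The sketch below repeats that arithmetic for a made-up machine so the numbers are visible; the page count is illustrative only.

/* Reproduce the buffer-sizing arithmetic from tcp_init() above. */
#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;
	unsigned long free_buffer_pages = 1UL << 20;	/* pretend ~4 GB of pages */
	unsigned long limit, max_wshare, max_rshare, wmem_max, rmem_max;
	unsigned long tcp_mem[3];

	/* Global thresholds, in pages. */
	limit = free_buffer_pages / 8;
	if (limit < 128)
		limit = 128;
	tcp_mem[0] = limit / 4 * 3;	/* below this: no pressure */
	tcp_mem[1] = limit;		/* pressure threshold */
	tcp_mem[2] = tcp_mem[0] * 2;	/* hard limit */

	/* Per-socket caps: 1/128 of the pressure threshold, in bytes. */
	limit = tcp_mem[1] * (page_size / 128);
	max_wshare = limit < 4UL * 1024 * 1024 ? limit : 4UL * 1024 * 1024;
	max_rshare = limit < 6UL * 1024 * 1024 ? limit : 6UL * 1024 * 1024;
	wmem_max = max_wshare > 64UL * 1024 ? max_wshare : 64UL * 1024;
	rmem_max = max_rshare > 87380UL ? max_rshare : 87380UL;

	printf("tcp_mem = %lu %lu %lu (pages)\n",
	       tcp_mem[0], tcp_mem[1], tcp_mem[2]);
	printf("tcp_wmem max = %lu bytes, tcp_rmem max = %lu bytes\n",
	       wmem_max, rmem_max);
	return 0;
}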
+ */ +int tcp_nuke_addr(struct net *net, struct sockaddr *addr) +{ + int family = addr->sa_family; + unsigned int bucket; + + struct in_addr *in; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct in6_addr *in6 = NULL; +#endif + if (family == AF_INET) { + in = &((struct sockaddr_in *)addr)->sin_addr; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + } else if (family == AF_INET6) { + in6 = &((struct sockaddr_in6 *)addr)->sin6_addr; +#endif + } else { + return -EAFNOSUPPORT; + } + + for (bucket = 0; bucket < tcp_hashinfo.ehash_mask; bucket++) { + struct hlist_nulls_node *node; + struct sock *sk; + spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, bucket); + +restart: + spin_lock_bh(lock); + sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[bucket].chain) { + struct inet_sock *inet = inet_sk(sk); + + if (sysctl_ip_dynaddr && sk->sk_state == TCP_SYN_SENT) + continue; + if (sock_flag(sk, SOCK_DEAD)) + continue; + + if (family == AF_INET) { + __be32 s4 = inet->inet_rcv_saddr; + if (s4 == LOOPBACK4_IPV6) + continue; + + if (in->s_addr != s4 && + !(in->s_addr == INADDR_ANY && + !tcp_is_local(net, s4))) + continue; + } + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (family == AF_INET6) { + struct in6_addr *s6; + if (!inet->pinet6) + continue; + + s6 = &inet->pinet6->rcv_saddr; + if (ipv6_addr_type(s6) == IPV6_ADDR_MAPPED) + continue; + + if (!ipv6_addr_equal(in6, s6) && + !(ipv6_addr_equal(in6, &in6addr_any) && + !tcp_is_local6(net, s6))) + continue; + } +#endif + + sock_hold(sk); + spin_unlock_bh(lock); + + local_bh_disable(); + bh_lock_sock(sk); + sk->sk_err = ETIMEDOUT; + sk->sk_error_report(sk); + + tcp_done(sk); + bh_unlock_sock(sk); + local_bh_enable(); + sock_put(sk); + + goto restart; + } + spin_unlock_bh(lock); + } + + return 0; +} diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c new file mode 100644 index 00000000..6187eb4d --- /dev/null +++ b/net/ipv4/tcp_bic.c @@ -0,0 +1,239 @@ +/* + * Binary Increase Congestion control for TCP + * Home page: + * http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC + * This is from the implementation of BICTCP in + * Lison-Xu, Kahaled Harfoush, and Injong Rhee. + * "Binary Increase Congestion Control for Fast, Long Distance + * Networks" in InfoComm 2004 + * Available from: + * http://netsrv.csc.ncsu.edu/export/bitcp.pdf + * + * Unless BIC is enabled and congestion window is large + * this behaves the same as the original Reno. 
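tcp_nuke_addr() resets every established socket whose bound source address matches the request, or, when the request is the wildcard address, every socket whose source address is no longer locally routed. The IPv4 match rule is restated below as a standalone predicate; is_local() is only a stand-in for the route lookup done by tcp_is_local().

/* The IPv4 match rule from tcp_nuke_addr(), as a standalone predicate. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>
#include <netinet/in.h>

/* Stand-in for the kernel's route lookup: treat only 127/8 as local. */
static bool is_local(uint32_t saddr_be)
{
	return (ntohl(saddr_be) >> 24) == 127;
}

static bool should_nuke_v4(uint32_t requested_be, uint32_t bound_saddr_be)
{
	if (requested_be == bound_saddr_be)
		return true;			/* exact match on the bound address */
	return requested_be == INADDR_ANY &&	/* wildcard: only sockets whose */
	       !is_local(bound_saddr_be);	/* source address is not local  */
}

int main(void)
{
	uint32_t lo  = htonl(0x7f000001);	/* 127.0.0.1 */
	uint32_t lan = htonl(0xc0a80102);	/* 192.168.1.2 */

	printf("wildcard vs 192.168.1.2 -> %d\n", should_nuke_v4(INADDR_ANY, lan));
	printf("wildcard vs 127.0.0.1   -> %d\n", should_nuke_v4(INADDR_ANY, lo));
	return 0;
}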
+ */ + +#include +#include +#include + + +#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation + * max_cwnd = snd_cwnd * beta + */ +#define BICTCP_B 4 /* + * In binary search, + * go to point (max+min)/N + */ + +static int fast_convergence = 1; +static int max_increment = 16; +static int low_window = 14; +static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ +static int initial_ssthresh; +static int smooth_part = 20; + +module_param(fast_convergence, int, 0644); +MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); +module_param(max_increment, int, 0644); +MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search"); +module_param(low_window, int, 0644); +MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)"); +module_param(beta, int, 0644); +MODULE_PARM_DESC(beta, "beta for multiplicative increase"); +module_param(initial_ssthresh, int, 0644); +MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); +module_param(smooth_part, int, 0644); +MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax"); + + +/* BIC TCP Parameters */ +struct bictcp { + u32 cnt; /* increase cwnd by 1 after ACKs */ + u32 last_max_cwnd; /* last maximum snd_cwnd */ + u32 loss_cwnd; /* congestion window at last loss */ + u32 last_cwnd; /* the last snd_cwnd */ + u32 last_time; /* time when updated last_cwnd */ + u32 epoch_start; /* beginning of an epoch */ +#define ACK_RATIO_SHIFT 4 + u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ +}; + +static inline void bictcp_reset(struct bictcp *ca) +{ + ca->cnt = 0; + ca->last_max_cwnd = 0; + ca->loss_cwnd = 0; + ca->last_cwnd = 0; + ca->last_time = 0; + ca->epoch_start = 0; + ca->delayed_ack = 2 << ACK_RATIO_SHIFT; +} + +static void bictcp_init(struct sock *sk) +{ + bictcp_reset(inet_csk_ca(sk)); + if (initial_ssthresh) + tcp_sk(sk)->snd_ssthresh = initial_ssthresh; +} + +/* + * Compute congestion window to use. 
+ */ +static inline void bictcp_update(struct bictcp *ca, u32 cwnd) +{ + if (ca->last_cwnd == cwnd && + (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) + return; + + ca->last_cwnd = cwnd; + ca->last_time = tcp_time_stamp; + + if (ca->epoch_start == 0) /* record the beginning of an epoch */ + ca->epoch_start = tcp_time_stamp; + + /* start off normal */ + if (cwnd <= low_window) { + ca->cnt = cwnd; + return; + } + + /* binary increase */ + if (cwnd < ca->last_max_cwnd) { + __u32 dist = (ca->last_max_cwnd - cwnd) + / BICTCP_B; + + if (dist > max_increment) + /* linear increase */ + ca->cnt = cwnd / max_increment; + else if (dist <= 1U) + /* binary search increase */ + ca->cnt = (cwnd * smooth_part) / BICTCP_B; + else + /* binary search increase */ + ca->cnt = cwnd / dist; + } else { + /* slow start AMD linear increase */ + if (cwnd < ca->last_max_cwnd + BICTCP_B) + /* slow start */ + ca->cnt = (cwnd * smooth_part) / BICTCP_B; + else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1)) + /* slow start */ + ca->cnt = (cwnd * (BICTCP_B-1)) + / (cwnd - ca->last_max_cwnd); + else + /* linear increase */ + ca->cnt = cwnd / max_increment; + } + + /* if in slow start or link utilization is very low */ + if (ca->loss_cwnd == 0) { + if (ca->cnt > 20) /* increase cwnd 5% per RTT */ + ca->cnt = 20; + } + + ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; + if (ca->cnt == 0) /* cannot be zero */ + ca->cnt = 1; +} + +static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { + bictcp_update(ca, tp->snd_cwnd); + tcp_cong_avoid_ai(tp, ca->cnt); + } + +} + +/* + * behave like Reno until low_window is reached, + * then increase congestion window slowly + */ +static u32 bictcp_recalc_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + + ca->epoch_start = 0; /* end of epoch */ + + /* Wmax and fast convergence */ + if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) + ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) + / (2 * BICTCP_BETA_SCALE); + else + ca->last_max_cwnd = tp->snd_cwnd; + + ca->loss_cwnd = tp->snd_cwnd; + + + if (tp->snd_cwnd <= low_window) + return max(tp->snd_cwnd >> 1U, 2U); + else + return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); +} + +static u32 bictcp_undo_cwnd(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct bictcp *ca = inet_csk_ca(sk); + return max(tp->snd_cwnd, ca->last_max_cwnd); +} + +static void bictcp_state(struct sock *sk, u8 new_state) +{ + if (new_state == TCP_CA_Loss) + bictcp_reset(inet_csk_ca(sk)); +} + +/* Track delayed acknowledgment ratio using sliding window + * ratio = (15*ratio + sample) / 16 + */ +static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_state == TCP_CA_Open) { + struct bictcp *ca = inet_csk_ca(sk); + cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; + ca->delayed_ack += cnt; + } +} + + +static struct tcp_congestion_ops bictcp __read_mostly = { + .init = bictcp_init, + .ssthresh = bictcp_recalc_ssthresh, + .cong_avoid = bictcp_cong_avoid, + .set_state = bictcp_state, + .undo_cwnd = bictcp_undo_cwnd, + .pkts_acked = bictcp_acked, + .owner = THIS_MODULE, + .name = "bic", +}; + +static int __init bictcp_register(void) +{ + 
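bictcp_update() boils down to choosing ca->cnt, the number of ACKs that must arrive before snd_cwnd may grow by one. The sketch below restates that choice as a standalone function using the default module parameters shown above; the delayed-ACK correction and the 5%-per-RTT cap are left out for brevity.

/* How many ACKs per +1 of cwnd, mirroring the branches of bictcp_update(). */
#include <stdio.h>

#define BICTCP_B	4
#define LOW_WINDOW	14
#define MAX_INCREMENT	16
#define SMOOTH_PART	20

static unsigned bic_acks_per_increase(unsigned cwnd, unsigned last_max_cwnd)
{
	unsigned cnt;

	if (cwnd <= LOW_WINDOW)				/* behave like Reno */
		return cwnd;

	if (cwnd < last_max_cwnd) {			/* below the old maximum */
		unsigned dist = (last_max_cwnd - cwnd) / BICTCP_B;

		if (dist > MAX_INCREMENT)
			cnt = cwnd / MAX_INCREMENT;	/* linear increase */
		else if (dist <= 1)
			cnt = (cwnd * SMOOTH_PART) / BICTCP_B; /* smoothed search */
		else
			cnt = cwnd / dist;		/* binary search increase */
	} else {					/* above: careful probing */
		if (cwnd < last_max_cwnd + BICTCP_B)
			cnt = (cwnd * SMOOTH_PART) / BICTCP_B;
		else if (cwnd < last_max_cwnd + MAX_INCREMENT * (BICTCP_B - 1))
			cnt = (cwnd * (BICTCP_B - 1)) / (cwnd - last_max_cwnd);
		else
			cnt = cwnd / MAX_INCREMENT;	/* linear increase */
	}
	return cnt ? cnt : 1;
}

int main(void)
{
	for (unsigned cwnd = 20; cwnd <= 200; cwnd += 30)
		printf("cwnd=%3u last_max=100 -> +1 every %u ACKs\n",
		       cwnd, bic_acks_per_increase(cwnd, 100));
	return 0;
}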
BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&bictcp); +} + +static void __exit bictcp_unregister(void) +{ + tcp_unregister_congestion_control(&bictcp); +} + +module_init(bictcp_register); +module_exit(bictcp_unregister); + +MODULE_AUTHOR("Stephen Hemminger"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("BIC TCP"); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c new file mode 100644 index 00000000..850c737e --- /dev/null +++ b/net/ipv4/tcp_cong.c @@ -0,0 +1,424 @@ +/* + * Plugable TCP congestion control support and newReno + * congestion control. + * Based on ideas from I/O scheduler suport and Web100. + * + * Copyright (C) 2005 Stephen Hemminger + */ + +#include +#include +#include +#include +#include +#include + +int sysctl_tcp_max_ssthresh = 0; + +static DEFINE_SPINLOCK(tcp_cong_list_lock); +static LIST_HEAD(tcp_cong_list); + +/* Simple linear search, don't expect many entries! */ +static struct tcp_congestion_ops *tcp_ca_find(const char *name) +{ + struct tcp_congestion_ops *e; + + list_for_each_entry_rcu(e, &tcp_cong_list, list) { + if (strcmp(e->name, name) == 0) + return e; + } + + return NULL; +} + +/* + * Attach new congestion control algorithm to the list + * of available options. + */ +int tcp_register_congestion_control(struct tcp_congestion_ops *ca) +{ + int ret = 0; + + /* all algorithms must implement ssthresh and cong_avoid ops */ + if (!ca->ssthresh || !ca->cong_avoid) { + printk(KERN_ERR "TCP %s does not implement required ops\n", + ca->name); + return -EINVAL; + } + + spin_lock(&tcp_cong_list_lock); + if (tcp_ca_find(ca->name)) { + printk(KERN_NOTICE "TCP %s already registered\n", ca->name); + ret = -EEXIST; + } else { + list_add_tail_rcu(&ca->list, &tcp_cong_list); + printk(KERN_INFO "TCP %s registered\n", ca->name); + } + spin_unlock(&tcp_cong_list_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(tcp_register_congestion_control); + +/* + * Remove congestion control algorithm, called from + * the module's remove function. Module ref counts are used + * to ensure that this can't be done till all sockets using + * that method are closed. + */ +void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) +{ + spin_lock(&tcp_cong_list_lock); + list_del_rcu(&ca->list); + spin_unlock(&tcp_cong_list_lock); +} +EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); + +/* Assign choice of congestion control. */ +void tcp_init_congestion_control(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_congestion_ops *ca; + + /* if no choice made yet assign the current value set as default */ + if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) { + rcu_read_lock(); + list_for_each_entry_rcu(ca, &tcp_cong_list, list) { + if (try_module_get(ca->owner)) { + icsk->icsk_ca_ops = ca; + break; + } + + /* fallback to next available */ + } + rcu_read_unlock(); + } + + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); +} + +/* Manage refcounts on socket close. 
*/ +void tcp_cleanup_congestion_control(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->release) + icsk->icsk_ca_ops->release(sk); + module_put(icsk->icsk_ca_ops->owner); +} + +/* Used by sysctl to change default congestion control */ +int tcp_set_default_congestion_control(const char *name) +{ + struct tcp_congestion_ops *ca; + int ret = -ENOENT; + + spin_lock(&tcp_cong_list_lock); + ca = tcp_ca_find(name); +#ifdef CONFIG_MODULES + if (!ca && capable(CAP_NET_ADMIN)) { + spin_unlock(&tcp_cong_list_lock); + + request_module("tcp_%s", name); + spin_lock(&tcp_cong_list_lock); + ca = tcp_ca_find(name); + } +#endif + + if (ca) { + ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */ + list_move(&ca->list, &tcp_cong_list); + ret = 0; + } + spin_unlock(&tcp_cong_list_lock); + + return ret; +} + +/* Set default value from kernel configuration at bootup */ +static int __init tcp_congestion_default(void) +{ + return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG); +} +late_initcall(tcp_congestion_default); + + +/* Build string with list of available congestion control values */ +void tcp_get_available_congestion_control(char *buf, size_t maxlen) +{ + struct tcp_congestion_ops *ca; + size_t offs = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(ca, &tcp_cong_list, list) { + offs += snprintf(buf + offs, maxlen - offs, + "%s%s", + offs == 0 ? "" : " ", ca->name); + + } + rcu_read_unlock(); +} + +/* Get current default congestion control */ +void tcp_get_default_congestion_control(char *name) +{ + struct tcp_congestion_ops *ca; + /* We will always have reno... */ + BUG_ON(list_empty(&tcp_cong_list)); + + rcu_read_lock(); + ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); + strncpy(name, ca->name, TCP_CA_NAME_MAX); + rcu_read_unlock(); +} + +/* Built list of non-restricted congestion control values */ +void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) +{ + struct tcp_congestion_ops *ca; + size_t offs = 0; + + *buf = '\0'; + rcu_read_lock(); + list_for_each_entry_rcu(ca, &tcp_cong_list, list) { + if (!(ca->flags & TCP_CONG_NON_RESTRICTED)) + continue; + offs += snprintf(buf + offs, maxlen - offs, + "%s%s", + offs == 0 ? 
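tcp_set_default_congestion_control() and the tcp_get_*_congestion_control() helpers back the sysctls under /proc/sys/net/ipv4/. The sketch below reads the available list and switches the system default; writing the file needs root (CAP_NET_ADMIN), and the standard sysctl file names are assumed.

/* Inspect and change the default congestion control via procfs. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[256] = "";
	FILE *f;

	f = fopen("/proc/sys/net/ipv4/tcp_available_congestion_control", "r");
	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("available: %s", buf);
		fclose(f);
	}

	/* Switch the system default to "reno" if it is present in the list. */
	if (strstr(buf, "reno")) {
		f = fopen("/proc/sys/net/ipv4/tcp_congestion_control", "w");
		if (f) {
			fputs("reno\n", f);
			fclose(f);
		}
	}
	return 0;
}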
"" : " ", ca->name); + + } + rcu_read_unlock(); +} + +/* Change list of non-restricted congestion control */ +int tcp_set_allowed_congestion_control(char *val) +{ + struct tcp_congestion_ops *ca; + char *saved_clone, *clone, *name; + int ret = 0; + + saved_clone = clone = kstrdup(val, GFP_USER); + if (!clone) + return -ENOMEM; + + spin_lock(&tcp_cong_list_lock); + /* pass 1 check for bad entries */ + while ((name = strsep(&clone, " ")) && *name) { + ca = tcp_ca_find(name); + if (!ca) { + ret = -ENOENT; + goto out; + } + } + + /* pass 2 clear old values */ + list_for_each_entry_rcu(ca, &tcp_cong_list, list) + ca->flags &= ~TCP_CONG_NON_RESTRICTED; + + /* pass 3 mark as allowed */ + while ((name = strsep(&val, " ")) && *name) { + ca = tcp_ca_find(name); + WARN_ON(!ca); + if (ca) + ca->flags |= TCP_CONG_NON_RESTRICTED; + } +out: + spin_unlock(&tcp_cong_list_lock); + kfree(saved_clone); + + return ret; +} + + +/* Change congestion control for socket */ +int tcp_set_congestion_control(struct sock *sk, const char *name) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_congestion_ops *ca; + int err = 0; + + rcu_read_lock(); + ca = tcp_ca_find(name); + + /* no change asking for existing value */ + if (ca == icsk->icsk_ca_ops) + goto out; + +#ifdef CONFIG_MODULES + /* not found attempt to autoload module */ + if (!ca && capable(CAP_NET_ADMIN)) { + rcu_read_unlock(); + request_module("tcp_%s", name); + rcu_read_lock(); + ca = tcp_ca_find(name); + } +#endif + if (!ca) + err = -ENOENT; + + else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || capable(CAP_NET_ADMIN))) + err = -EPERM; + + else if (!try_module_get(ca->owner)) + err = -EBUSY; + + else { + tcp_cleanup_congestion_control(sk); + icsk->icsk_ca_ops = ca; + + if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + } + out: + rcu_read_unlock(); + return err; +} + +/* RFC2861 Check whether we are limited by application or congestion window + * This is the inverse of cwnd check in tcp_tso_should_defer + */ +int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) +{ + const struct tcp_sock *tp = tcp_sk(sk); + u32 left; + + if (in_flight >= tp->snd_cwnd) + return 1; + + left = tp->snd_cwnd - in_flight; + if (sk_can_gso(sk) && + left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && + left * tp->mss_cache < sk->sk_gso_max_size) + return 1; + return left <= tcp_max_burst(tp); +} +EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); + +/* + * Slow start is used when congestion window is less than slow start + * threshold. 
This version implements the basic RFC2581 version + * and optionally supports: + * RFC3742 Limited Slow Start - growth limited to max_ssthresh + * RFC3465 Appropriate Byte Counting - growth limited by bytes acknowledged + */ +void tcp_slow_start(struct tcp_sock *tp) +{ + int cnt; /* increase in packets */ + + /* RFC3465: ABC Slow start + * Increase only after a full MSS of bytes is acked + * + * TCP sender SHOULD increase cwnd by the number of + * previously unacknowledged bytes ACKed by each incoming + * acknowledgment, provided the increase is not more than L + */ + if (sysctl_tcp_abc && tp->bytes_acked < tp->mss_cache) + return; + + if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh) + cnt = sysctl_tcp_max_ssthresh >> 1; /* limited slow start */ + else + cnt = tp->snd_cwnd; /* exponential increase */ + + /* RFC3465: ABC + * We MAY increase by 2 if discovered delayed ack + */ + if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache) + cnt <<= 1; + tp->bytes_acked = 0; + + tp->snd_cwnd_cnt += cnt; + while (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + tp->snd_cwnd_cnt -= tp->snd_cwnd; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } +} +EXPORT_SYMBOL_GPL(tcp_slow_start); + +/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */ +void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w) +{ + if (tp->snd_cwnd_cnt >= w) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else { + tp->snd_cwnd_cnt++; + } +} +EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); + +/* + * TCP Reno congestion control + * This is special case used for fallback as well. + */ +/* This is Jacobson's slow start and congestion avoidance. + * SIGCOMM '88, p. 328. + */ +void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + /* In "safe" area, increase. */ + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + + /* In dangerous area, increase slowly. */ + else if (sysctl_tcp_abc) { + /* RFC3465: Appropriate Byte Count + * increase once for each full cwnd acked + */ + if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) { + tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } + } else { + tcp_cong_avoid_ai(tp, tp->snd_cwnd); + } +} +EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); + +/* Slow start threshold is half the congestion window (min 2) */ +u32 tcp_reno_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + return max(tp->snd_cwnd >> 1U, 2U); +} +EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); + +/* Lower bound on congestion window with halving. 
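tcp_slow_start() and tcp_cong_avoid_ai() produce Reno's exponential-then-linear growth through the snd_cwnd/snd_cwnd_cnt counters. The toy simulation below shows that growth per ACK, with ABC, limited slow start and the cwnd clamp left out so only the shape remains.

/* Toy per-ACK simulation of Reno slow start plus additive increase. */
#include <stdio.h>

int main(void)
{
	unsigned cwnd = 10, cwnd_cnt = 0, ssthresh = 64;

	for (unsigned ack = 0; ack < 2000; ack++) {
		if (cwnd <= ssthresh) {
			cwnd++;			/* slow start: +1 per ACK, doubling per RTT */
		} else if (++cwnd_cnt >= cwnd) {
			cwnd_cnt = 0;		/* congestion avoidance: */
			cwnd++;			/* +1 per cwnd ACKs, i.e. +1 per RTT */
		}
		if (ack % 250 == 0)
			printf("after %4u ACKs: cwnd=%u\n", ack, cwnd);
	}
	return 0;
}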
*/ +u32 tcp_reno_min_cwnd(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + return tp->snd_ssthresh/2; +} +EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); + +struct tcp_congestion_ops tcp_reno = { + .flags = TCP_CONG_NON_RESTRICTED, + .name = "reno", + .owner = THIS_MODULE, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, +}; + +/* Initial congestion control used (until SYN) + * really reno under another name so we can tell difference + * during tcp_set_default_congestion_control + */ +struct tcp_congestion_ops tcp_init_congestion_ops = { + .name = "", + .owner = THIS_MODULE, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, +}; +EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c new file mode 100644 index 00000000..f376b05c --- /dev/null +++ b/net/ipv4/tcp_cubic.c @@ -0,0 +1,492 @@ +/* + * TCP CUBIC: Binary Increase Congestion control for TCP v2.3 + * Home page: + * http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC + * This is from the implementation of CUBIC TCP in + * Sangtae Ha, Injong Rhee and Lisong Xu, + * "CUBIC: A New TCP-Friendly High-Speed TCP Variant" + * in ACM SIGOPS Operating System Review, July 2008. + * Available from: + * http://netsrv.csc.ncsu.edu/export/cubic_a_new_tcp_2008.pdf + * + * CUBIC integrates a new slow start algorithm, called HyStart. + * The details of HyStart are presented in + * Sangtae Ha and Injong Rhee, + * "Taming the Elephants: New TCP Slow Start", NCSU TechReport 2008. + * Available from: + * http://netsrv.csc.ncsu.edu/export/hystart_techreport_2008.pdf + * + * All testing results are available from: + * http://netsrv.csc.ncsu.edu/wiki/index.php/TCP_Testing + * + * Unless CUBIC is enabled and congestion window is large + * this behaves the same as the original Reno. 
+ */ + +#include +#include +#include +#include + +#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation + * max_cwnd = snd_cwnd * beta + */ +#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */ + +/* Two methods of hybrid slow start */ +#define HYSTART_ACK_TRAIN 0x1 +#define HYSTART_DELAY 0x2 + +/* Number of delay samples for detecting the increase of delay */ +#define HYSTART_MIN_SAMPLES 8 +#define HYSTART_DELAY_MIN (4U<<3) +#define HYSTART_DELAY_MAX (16U<<3) +#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) + +static int fast_convergence __read_mostly = 1; +static int beta __read_mostly = 717; /* = 717/1024 (BICTCP_BETA_SCALE) */ +static int initial_ssthresh __read_mostly; +static int bic_scale __read_mostly = 41; +static int tcp_friendliness __read_mostly = 1; + +static int hystart __read_mostly = 1; +static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY; +static int hystart_low_window __read_mostly = 16; +static int hystart_ack_delta __read_mostly = 2; + +static u32 cube_rtt_scale __read_mostly; +static u32 beta_scale __read_mostly; +static u64 cube_factor __read_mostly; + +/* Note parameters that are used for precomputing scale factors are read-only */ +module_param(fast_convergence, int, 0644); +MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); +module_param(beta, int, 0644); +MODULE_PARM_DESC(beta, "beta for multiplicative increase"); +module_param(initial_ssthresh, int, 0644); +MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); +module_param(bic_scale, int, 0444); +MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)"); +module_param(tcp_friendliness, int, 0644); +MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness"); +module_param(hystart, int, 0644); +MODULE_PARM_DESC(hystart, "turn on/off hybrid slow start algorithm"); +module_param(hystart_detect, int, 0644); +MODULE_PARM_DESC(hystart_detect, "hyrbrid slow start detection mechanisms" + " 1: packet-train 2: delay 3: both packet-train and delay"); +module_param(hystart_low_window, int, 0644); +MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); +module_param(hystart_ack_delta, int, 0644); +MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)"); + +/* BIC TCP Parameters */ +struct bictcp { + u32 cnt; /* increase cwnd by 1 after ACKs */ + u32 last_max_cwnd; /* last maximum snd_cwnd */ + u32 loss_cwnd; /* congestion window at last loss */ + u32 last_cwnd; /* the last snd_cwnd */ + u32 last_time; /* time when updated last_cwnd */ + u32 bic_origin_point;/* origin point of bic function */ + u32 bic_K; /* time to origin point from the beginning of the current epoch */ + u32 delay_min; /* min delay (msec << 3) */ + u32 epoch_start; /* beginning of an epoch */ + u32 ack_cnt; /* number of acks */ + u32 tcp_cwnd; /* estimated tcp cwnd */ +#define ACK_RATIO_SHIFT 4 +#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT) + u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ + u8 sample_cnt; /* number of samples to decide curr_rtt */ + u8 found; /* the exit point is found? 
*/ + u32 round_start; /* beginning of each round */ + u32 end_seq; /* end_seq of the round */ + u32 last_ack; /* last time when the ACK spacing is close */ + u32 curr_rtt; /* the minimum rtt of current round */ +}; + +static inline void bictcp_reset(struct bictcp *ca) +{ + ca->cnt = 0; + ca->last_max_cwnd = 0; + ca->loss_cwnd = 0; + ca->last_cwnd = 0; + ca->last_time = 0; + ca->bic_origin_point = 0; + ca->bic_K = 0; + ca->delay_min = 0; + ca->epoch_start = 0; + ca->delayed_ack = 2 << ACK_RATIO_SHIFT; + ca->ack_cnt = 0; + ca->tcp_cwnd = 0; + ca->found = 0; +} + +static inline u32 bictcp_clock(void) +{ +#if HZ < 1000 + return ktime_to_ms(ktime_get_real()); +#else + return jiffies_to_msecs(jiffies); +#endif +} + +static inline void bictcp_hystart_reset(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + + ca->round_start = ca->last_ack = bictcp_clock(); + ca->end_seq = tp->snd_nxt; + ca->curr_rtt = 0; + ca->sample_cnt = 0; +} + +static void bictcp_init(struct sock *sk) +{ + bictcp_reset(inet_csk_ca(sk)); + + if (hystart) + bictcp_hystart_reset(sk); + + if (!hystart && initial_ssthresh) + tcp_sk(sk)->snd_ssthresh = initial_ssthresh; +} + +/* calculate the cubic root of x using a table lookup followed by one + * Newton-Raphson iteration. + * Avg err ~= 0.195% + */ +static u32 cubic_root(u64 a) +{ + u32 x, b, shift; + /* + * cbrt(x) MSB values for x MSB values in [0..63]. + * Precomputed then refined by hand - Willy Tarreau + * + * For x in [0..63], + * v = cbrt(x << 18) - 1 + * cbrt(x) = (v[x] + 10) >> 6 + */ + static const u8 v[] = { + /* 0x00 */ 0, 54, 54, 54, 118, 118, 118, 118, + /* 0x08 */ 123, 129, 134, 138, 143, 147, 151, 156, + /* 0x10 */ 157, 161, 164, 168, 170, 173, 176, 179, + /* 0x18 */ 181, 185, 187, 190, 192, 194, 197, 199, + /* 0x20 */ 200, 202, 204, 206, 209, 211, 213, 215, + /* 0x28 */ 217, 219, 221, 222, 224, 225, 227, 229, + /* 0x30 */ 231, 232, 234, 236, 237, 239, 240, 242, + /* 0x38 */ 244, 245, 246, 248, 250, 251, 252, 254, + }; + + b = fls64(a); + if (b < 7) { + /* a in [0..63] */ + return ((u32)v[(u32)a] + 35) >> 6; + } + + b = ((b * 84) >> 8) - 1; + shift = (a >> (b * 3)); + + x = ((u32)(((u32)v[shift] + 10) << b)) >> 6; + + /* + * Newton-Raphson iteration + * 2 + * x = ( 2 * x + a / x ) / 3 + * k+1 k k + */ + x = (2 * x + (u32)div64_u64(a, (u64)x * (u64)(x - 1))); + x = ((x * 341) >> 10); + return x; +} + +/* + * Compute congestion window to use. 
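cubic_root() trades accuracy for speed with a 64-entry lookup table plus a single Newton-Raphson step. For comparison or testing, here is a plain integer cube root that favours clarity over speed; it is a reference sketch, not a replacement for the table-based version.

/* Exact integer cube root by binary search on the result. */
#include <stdint.h>
#include <stdio.h>

static uint32_t cbrt_u64(uint64_t a)
{
	uint64_t x = 0;

	/* The cube root of a 64-bit value fits in 22 bits; comparing against
	 * a / cand^2 instead of cubing avoids 64-bit overflow. */
	for (int bit = 21; bit >= 0; bit--) {
		uint64_t cand = x | (1ULL << bit);

		if (cand <= a / (cand * cand))
			x = cand;
	}
	return (uint32_t)x;
}

int main(void)
{
	uint64_t samples[] = { 0, 1, 63, 64, 1000000, 1ULL << 40 };

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("cbrt(%llu) = %u\n",
		       (unsigned long long)samples[i], cbrt_u64(samples[i]));
	return 0;
}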
+ */ +static inline void bictcp_update(struct bictcp *ca, u32 cwnd) +{ + u64 offs; + u32 delta, t, bic_target, max_cnt; + + ca->ack_cnt++; /* count the number of ACKs */ + + if (ca->last_cwnd == cwnd && + (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) + return; + + ca->last_cwnd = cwnd; + ca->last_time = tcp_time_stamp; + + if (ca->epoch_start == 0) { + ca->epoch_start = tcp_time_stamp; /* record the beginning of an epoch */ + ca->ack_cnt = 1; /* start counting */ + ca->tcp_cwnd = cwnd; /* syn with cubic */ + + if (ca->last_max_cwnd <= cwnd) { + ca->bic_K = 0; + ca->bic_origin_point = cwnd; + } else { + /* Compute new K based on + * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ) + */ + ca->bic_K = cubic_root(cube_factor + * (ca->last_max_cwnd - cwnd)); + ca->bic_origin_point = ca->last_max_cwnd; + } + } + + /* cubic function - calc*/ + /* calculate c * time^3 / rtt, + * while considering overflow in calculation of time^3 + * (so time^3 is done by using 64 bit) + * and without the support of division of 64bit numbers + * (so all divisions are done by using 32 bit) + * also NOTE the unit of those veriables + * time = (t - K) / 2^bictcp_HZ + * c = bic_scale >> 10 + * rtt = (srtt >> 3) / HZ + * !!! The following code does not have overflow problems, + * if the cwnd < 1 million packets !!! + */ + + /* change the unit from HZ to bictcp_HZ */ + t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3) + - ca->epoch_start) << BICTCP_HZ) / HZ; + + if (t < ca->bic_K) /* t - K */ + offs = ca->bic_K - t; + else + offs = t - ca->bic_K; + + /* c/rtt * (t-K)^3 */ + delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ); + if (t < ca->bic_K) /* below origin*/ + bic_target = ca->bic_origin_point - delta; + else /* above origin*/ + bic_target = ca->bic_origin_point + delta; + + /* cubic function - calc bictcp_cnt*/ + if (bic_target > cwnd) { + ca->cnt = cwnd / (bic_target - cwnd); + } else { + ca->cnt = 100 * cwnd; /* very small increment*/ + } + + /* + * The initial growth of cubic function may be too conservative + * when the available bandwidth is still unknown. 
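The fixed-point code above evaluates the CUBIC curve W(t) = C*(t - K)^3 + Wmax, where K is the time needed to climb back to the pre-loss window. The same curve in floating point follows, with C = 0.4 (the value implied by the default bic_scale of 41) and made-up example numbers, to make the concave-then-convex shape explicit. Build with -lm.

/* The CUBIC window curve in plain floating point. */
#include <math.h>
#include <stdio.h>

int main(void)
{
	const double C = 0.4;			/* bic_scale * 10 / 1024 ~= 0.4 */
	const double beta = 717.0 / 1024.0;	/* multiplicative decrease factor */
	double w_max = 100.0;			/* cwnd at the last loss (example) */
	double w_start = w_max * beta;		/* cwnd right after backing off */
	double K = cbrt((w_max - w_start) / C);	/* seconds until W(t) reaches w_max */

	printf("K = %.2f s\n", K);
	for (double t = 0.0; t <= 2.0 * K; t += K / 4.0) {
		double d = t - K;
		double w = C * d * d * d + w_max; /* concave below K, convex above */

		printf("t = %5.2f s   W(t) = %7.2f packets\n", t, w);
	}
	return 0;
}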
+ */ + if (ca->loss_cwnd == 0 && ca->cnt > 20) + ca->cnt = 20; /* increase cwnd 5% per RTT */ + + /* TCP Friendly */ + if (tcp_friendliness) { + u32 scale = beta_scale; + delta = (cwnd * scale) >> 3; + while (ca->ack_cnt > delta) { /* update tcp cwnd */ + ca->ack_cnt -= delta; + ca->tcp_cwnd++; + } + + if (ca->tcp_cwnd > cwnd){ /* if bic is slower than tcp */ + delta = ca->tcp_cwnd - cwnd; + max_cnt = cwnd / delta; + if (ca->cnt > max_cnt) + ca->cnt = max_cnt; + } + } + + ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; + if (ca->cnt == 0) /* cannot be zero */ + ca->cnt = 1; +} + +static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + if (hystart && after(ack, ca->end_seq)) + bictcp_hystart_reset(sk); + tcp_slow_start(tp); + } else { + bictcp_update(ca, tp->snd_cwnd); + tcp_cong_avoid_ai(tp, ca->cnt); + } + +} + +static u32 bictcp_recalc_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + + ca->epoch_start = 0; /* end of epoch */ + + /* Wmax and fast convergence */ + if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) + ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) + / (2 * BICTCP_BETA_SCALE); + else + ca->last_max_cwnd = tp->snd_cwnd; + + ca->loss_cwnd = tp->snd_cwnd; + + return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); +} + +static u32 bictcp_undo_cwnd(struct sock *sk) +{ + struct bictcp *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd); +} + +static void bictcp_state(struct sock *sk, u8 new_state) +{ + if (new_state == TCP_CA_Loss) { + bictcp_reset(inet_csk_ca(sk)); + bictcp_hystart_reset(sk); + } +} + +static void hystart_update(struct sock *sk, u32 delay) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + + if (!(ca->found & hystart_detect)) { + u32 now = bictcp_clock(); + + /* first detection parameter - ack-train detection */ + if ((s32)(now - ca->last_ack) <= hystart_ack_delta) { + ca->last_ack = now; + if ((s32)(now - ca->round_start) > ca->delay_min >> 4) + ca->found |= HYSTART_ACK_TRAIN; + } + + /* obtain the minimum delay of more than sampling packets */ + if (ca->sample_cnt < HYSTART_MIN_SAMPLES) { + if (ca->curr_rtt == 0 || ca->curr_rtt > delay) + ca->curr_rtt = delay; + + ca->sample_cnt++; + } else { + if (ca->curr_rtt > ca->delay_min + + HYSTART_DELAY_THRESH(ca->delay_min>>4)) + ca->found |= HYSTART_DELAY; + } + /* + * Either one of two conditions are met, + * we exit from slow start immediately. 
+ */ + if (ca->found & hystart_detect) + tp->snd_ssthresh = tp->snd_cwnd; + } +} + +/* Track delayed acknowledgment ratio using sliding window + * ratio = (15*ratio + sample) / 16 + */ +static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + u32 delay; + + if (icsk->icsk_ca_state == TCP_CA_Open) { + u32 ratio = ca->delayed_ack; + + ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT; + ratio += cnt; + + ca->delayed_ack = min(ratio, ACK_RATIO_LIMIT); + } + + /* Some calls are for duplicates without timetamps */ + if (rtt_us < 0) + return; + + /* Discard delay samples right after fast recovery */ + if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ) + return; + + delay = (rtt_us << 3) / USEC_PER_MSEC; + if (delay == 0) + delay = 1; + + /* first time call or link delay decreases */ + if (ca->delay_min == 0 || ca->delay_min > delay) + ca->delay_min = delay; + + /* hystart triggers when cwnd is larger than some threshold */ + if (hystart && tp->snd_cwnd <= tp->snd_ssthresh && + tp->snd_cwnd >= hystart_low_window) + hystart_update(sk, delay); +} + +static struct tcp_congestion_ops cubictcp __read_mostly = { + .init = bictcp_init, + .ssthresh = bictcp_recalc_ssthresh, + .cong_avoid = bictcp_cong_avoid, + .set_state = bictcp_state, + .undo_cwnd = bictcp_undo_cwnd, + .pkts_acked = bictcp_acked, + .owner = THIS_MODULE, + .name = "cubic", +}; + +static int __init cubictcp_register(void) +{ + BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE); + + /* Precompute a bunch of the scaling factors that are used per-packet + * based on SRTT of 100ms + */ + + beta_scale = 8*(BICTCP_BETA_SCALE+beta)/ 3 / (BICTCP_BETA_SCALE - beta); + + cube_rtt_scale = (bic_scale * 10); /* 1024*c/rtt */ + + /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3 + * so K = cubic_root( (wmax-cwnd)*rtt/c ) + * the unit of K is bictcp_HZ=2^10, not HZ + * + * c = bic_scale >> 10 + * rtt = 100ms + * + * the following code has been designed and tested for + * cwnd < 1 million packets + * RTT < 100 seconds + * HZ < 1,000,00 (corresponding to 10 nano-second) + */ + + /* 1/c * 2^2*bictcp_HZ * srtt */ + cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */ + + /* divide by bic_scale and by constant Srtt (100ms) */ + do_div(cube_factor, bic_scale * 10); + + /* hystart needs ms clock resolution */ + if (hystart && HZ < 1000) + cubictcp.flags |= TCP_CONG_RTT_STAMP; + + return tcp_register_congestion_control(&cubictcp); +} + +static void __exit cubictcp_unregister(void) +{ + tcp_unregister_congestion_control(&cubictcp); +} + +module_init(cubictcp_register); +module_exit(cubictcp_unregister); + +MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("CUBIC TCP"); +MODULE_VERSION("2.3"); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c new file mode 100644 index 00000000..939edb3b --- /dev/null +++ b/net/ipv4/tcp_diag.c @@ -0,0 +1,57 @@ +/* + * tcp_diag.c Module for monitoring TCP transport protocols sockets. + * + * Authors: Alexey Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + + +#include +#include + +#include + +#include + +static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *_info) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct tcp_info *info = _info; + + if (sk->sk_state == TCP_LISTEN) { + r->idiag_rqueue = sk->sk_ack_backlog; + r->idiag_wqueue = sk->sk_max_ack_backlog; + } else { + r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); + r->idiag_wqueue = tp->write_seq - tp->snd_una; + } + if (info != NULL) + tcp_get_info(sk, info); +} + +static const struct inet_diag_handler tcp_diag_handler = { + .idiag_hashinfo = &tcp_hashinfo, + .idiag_get_info = tcp_diag_get_info, + .idiag_type = TCPDIAG_GETSOCK, + .idiag_info_size = sizeof(struct tcp_info), +}; + +static int __init tcp_diag_init(void) +{ + return inet_diag_register(&tcp_diag_handler); +} + +static void __exit tcp_diag_exit(void) +{ + inet_diag_unregister(&tcp_diag_handler); +} + +module_init(tcp_diag_init); +module_exit(tcp_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_INET_DIAG, TCPDIAG_GETSOCK); diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c new file mode 100644 index 00000000..30f27f6b --- /dev/null +++ b/net/ipv4/tcp_highspeed.c @@ -0,0 +1,187 @@ +/* + * Sally Floyd's High Speed TCP (RFC 3649) congestion control + * + * See http://www.icir.org/floyd/hstcp.html + * + * John Heffner + */ + +#include +#include + + +/* From AIMD tables from RFC 3649 appendix B, + * with fixed-point MD scaled <<8. + */ +static const struct hstcp_aimd_val { + unsigned int cwnd; + unsigned int md; +} hstcp_aimd_vals[] = { + { 38, 128, /* 0.50 */ }, + { 118, 112, /* 0.44 */ }, + { 221, 104, /* 0.41 */ }, + { 347, 98, /* 0.38 */ }, + { 495, 93, /* 0.37 */ }, + { 663, 89, /* 0.35 */ }, + { 851, 86, /* 0.34 */ }, + { 1058, 83, /* 0.33 */ }, + { 1284, 81, /* 0.32 */ }, + { 1529, 78, /* 0.31 */ }, + { 1793, 76, /* 0.30 */ }, + { 2076, 74, /* 0.29 */ }, + { 2378, 72, /* 0.28 */ }, + { 2699, 71, /* 0.28 */ }, + { 3039, 69, /* 0.27 */ }, + { 3399, 68, /* 0.27 */ }, + { 3778, 66, /* 0.26 */ }, + { 4177, 65, /* 0.26 */ }, + { 4596, 64, /* 0.25 */ }, + { 5036, 62, /* 0.25 */ }, + { 5497, 61, /* 0.24 */ }, + { 5979, 60, /* 0.24 */ }, + { 6483, 59, /* 0.23 */ }, + { 7009, 58, /* 0.23 */ }, + { 7558, 57, /* 0.22 */ }, + { 8130, 56, /* 0.22 */ }, + { 8726, 55, /* 0.22 */ }, + { 9346, 54, /* 0.21 */ }, + { 9991, 53, /* 0.21 */ }, + { 10661, 52, /* 0.21 */ }, + { 11358, 52, /* 0.20 */ }, + { 12082, 51, /* 0.20 */ }, + { 12834, 50, /* 0.20 */ }, + { 13614, 49, /* 0.19 */ }, + { 14424, 48, /* 0.19 */ }, + { 15265, 48, /* 0.19 */ }, + { 16137, 47, /* 0.19 */ }, + { 17042, 46, /* 0.18 */ }, + { 17981, 45, /* 0.18 */ }, + { 18955, 45, /* 0.18 */ }, + { 19965, 44, /* 0.17 */ }, + { 21013, 43, /* 0.17 */ }, + { 22101, 43, /* 0.17 */ }, + { 23230, 42, /* 0.17 */ }, + { 24402, 41, /* 0.16 */ }, + { 25618, 41, /* 0.16 */ }, + { 26881, 40, /* 0.16 */ }, + { 28193, 39, /* 0.16 */ }, + { 29557, 39, /* 0.15 */ }, + { 30975, 38, /* 0.15 */ }, + { 32450, 38, /* 0.15 */ }, + { 33986, 37, /* 0.15 */ }, + { 35586, 36, /* 0.14 */ }, + { 37253, 36, /* 0.14 */ }, + { 38992, 35, /* 0.14 */ }, + { 40808, 35, /* 0.14 */ }, + { 42707, 34, /* 0.13 */ }, + { 44694, 33, /* 0.13 */ }, + { 46776, 33, /* 0.13 */ }, + { 48961, 32, /* 0.13 */ }, + { 51258, 32, /* 0.13 */ }, + { 53677, 31, /* 0.12 */ }, + { 56230, 30, /* 0.12 */ }, + { 58932, 30, /* 0.12 */ }, + { 61799, 29, /* 0.12 */ }, + { 64851, 28, /* 0.11 */ }, + { 68113, 28, /* 0.11 */ }, + { 71617, 27, /* 
0.11 */ }, + { 75401, 26, /* 0.10 */ }, + { 79517, 26, /* 0.10 */ }, + { 84035, 25, /* 0.10 */ }, + { 89053, 24, /* 0.10 */ }, +}; + +#define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals) + +struct hstcp { + u32 ai; +}; + +static void hstcp_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct hstcp *ca = inet_csk_ca(sk); + + ca->ai = 0; + + /* Ensure the MD arithmetic works. This is somewhat pedantic, + * since I don't think we will see a cwnd this large. :) */ + tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); +} + +static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 in_flight) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct hstcp *ca = inet_csk_ca(sk); + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { + /* Update AIMD parameters. + * + * We want to guarantee that: + * hstcp_aimd_vals[ca->ai-1].cwnd < + * snd_cwnd <= + * hstcp_aimd_vals[ca->ai].cwnd + */ + if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { + while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && + ca->ai < HSTCP_AIMD_MAX - 1) + ca->ai++; + } else if (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd) { + while (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd) + ca->ai--; + } + + /* Do additive increase */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) { + /* cwnd = cwnd + a(w) / cwnd */ + tp->snd_cwnd_cnt += ca->ai + 1; + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + tp->snd_cwnd_cnt -= tp->snd_cwnd; + tp->snd_cwnd++; + } + } + } +} + +static u32 hstcp_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct hstcp *ca = inet_csk_ca(sk); + + /* Do multiplicative decrease */ + return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); +} + + +static struct tcp_congestion_ops tcp_highspeed __read_mostly = { + .init = hstcp_init, + .ssthresh = hstcp_ssthresh, + .cong_avoid = hstcp_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + + .owner = THIS_MODULE, + .name = "highspeed" +}; + +static int __init hstcp_register(void) +{ + BUILD_BUG_ON(sizeof(struct hstcp) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_highspeed); +} + +static void __exit hstcp_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_highspeed); +} + +module_init(hstcp_register); +module_exit(hstcp_unregister); + +MODULE_AUTHOR("John Heffner"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("High Speed TCP"); diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c new file mode 100644 index 00000000..c1a81753 --- /dev/null +++ b/net/ipv4/tcp_htcp.c @@ -0,0 +1,315 @@ +/* + * H-TCP congestion control. The algorithm is detailed in: + * R.N.Shorten, D.J.Leith: + * "H-TCP: TCP for high-speed and long-distance networks" + * Proc. PFLDnet, Argonne, 2004. 
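hstcp_cong_avoid() walks hstcp_aimd_vals[] to find the row matching the current cwnd, then uses ai + 1 as the additive-increase step and md/256 as the multiplicative-decrease factor. The standalone lookup below runs over a shortened copy of that table (only the first rows from above are reproduced), purely for illustration.

/* Look up HSTCP's a(w) and b(w) for a few sample congestion windows. */
#include <stdio.h>

struct aimd { unsigned cwnd, md; };

static const struct aimd vals[] = {
	{  38, 128 }, { 118, 112 }, { 221, 104 }, {  347, 98 },
	{ 495,  93 }, { 663,  89 }, { 851,  86 }, { 1058, 83 },
};
#define NVALS (sizeof(vals) / sizeof(vals[0]))

static unsigned aimd_index(unsigned cwnd)
{
	unsigned ai = 0;

	while (ai < NVALS - 1 && cwnd > vals[ai].cwnd)
		ai++;
	return ai;
}

int main(void)
{
	unsigned samples[] = { 30, 100, 400, 900 };

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		unsigned ai = aimd_index(samples[i]);

		printf("cwnd=%4u -> a(w)=%u, drop to %.2f of cwnd on loss\n",
		       samples[i], ai + 1, 1.0 - vals[ai].md / 256.0);
	}
	return 0;
}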
+ * http://www.hamilton.ie/net/htcp3.pdf + */ + +#include +#include +#include + +#define ALPHA_BASE (1<<7) /* 1.0 with shift << 7 */ +#define BETA_MIN (1<<6) /* 0.5 with shift << 7 */ +#define BETA_MAX 102 /* 0.8 with shift << 7 */ + +static int use_rtt_scaling __read_mostly = 1; +module_param(use_rtt_scaling, int, 0644); +MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling"); + +static int use_bandwidth_switch __read_mostly = 1; +module_param(use_bandwidth_switch, int, 0644); +MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher"); + +struct htcp { + u32 alpha; /* Fixed point arith, << 7 */ + u8 beta; /* Fixed point arith, << 7 */ + u8 modeswitch; /* Delay modeswitch + until we had at least one congestion event */ + u16 pkts_acked; + u32 packetcount; + u32 minRTT; + u32 maxRTT; + u32 last_cong; /* Time since last congestion event end */ + u32 undo_last_cong; + + u32 undo_maxRTT; + u32 undo_old_maxB; + + /* Bandwidth estimation */ + u32 minB; + u32 maxB; + u32 old_maxB; + u32 Bi; + u32 lasttime; +}; + +static inline u32 htcp_cong_time(const struct htcp *ca) +{ + return jiffies - ca->last_cong; +} + +static inline u32 htcp_ccount(const struct htcp *ca) +{ + return htcp_cong_time(ca) / ca->minRTT; +} + +static inline void htcp_reset(struct htcp *ca) +{ + ca->undo_last_cong = ca->last_cong; + ca->undo_maxRTT = ca->maxRTT; + ca->undo_old_maxB = ca->old_maxB; + + ca->last_cong = jiffies; +} + +static u32 htcp_cwnd_undo(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct htcp *ca = inet_csk_ca(sk); + + if (ca->undo_last_cong) { + ca->last_cong = ca->undo_last_cong; + ca->maxRTT = ca->undo_maxRTT; + ca->old_maxB = ca->undo_old_maxB; + ca->undo_last_cong = 0; + } + + return max(tp->snd_cwnd, (tp->snd_ssthresh << 7) / ca->beta); +} + +static inline void measure_rtt(struct sock *sk, u32 srtt) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct htcp *ca = inet_csk_ca(sk); + + /* keep track of minimum RTT seen so far, minRTT is zero at first */ + if (ca->minRTT > srtt || !ca->minRTT) + ca->minRTT = srtt; + + /* max RTT */ + if (icsk->icsk_ca_state == TCP_CA_Open) { + if (ca->maxRTT < ca->minRTT) + ca->maxRTT = ca->minRTT; + if (ca->maxRTT < srtt && + srtt <= ca->maxRTT + msecs_to_jiffies(20)) + ca->maxRTT = srtt; + } +} + +static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_sock *tp = tcp_sk(sk); + struct htcp *ca = inet_csk_ca(sk); + u32 now = tcp_time_stamp; + + if (icsk->icsk_ca_state == TCP_CA_Open) + ca->pkts_acked = pkts_acked; + + if (rtt > 0) + measure_rtt(sk, usecs_to_jiffies(rtt)); + + if (!use_bandwidth_switch) + return; + + /* achieved throughput calculations */ + if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_Disorder))) { + ca->packetcount = 0; + ca->lasttime = now; + return; + } + + ca->packetcount += pkts_acked; + + if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? 
: 1) && + now - ca->lasttime >= ca->minRTT && + ca->minRTT > 0) { + __u32 cur_Bi = ca->packetcount * HZ / (now - ca->lasttime); + + if (htcp_ccount(ca) <= 3) { + /* just after backoff */ + ca->minB = ca->maxB = ca->Bi = cur_Bi; + } else { + ca->Bi = (3 * ca->Bi + cur_Bi) / 4; + if (ca->Bi > ca->maxB) + ca->maxB = ca->Bi; + if (ca->minB > ca->maxB) + ca->minB = ca->maxB; + } + ca->packetcount = 0; + ca->lasttime = now; + } +} + +static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT) +{ + if (use_bandwidth_switch) { + u32 maxB = ca->maxB; + u32 old_maxB = ca->old_maxB; + ca->old_maxB = ca->maxB; + + if (!between(5 * maxB, 4 * old_maxB, 6 * old_maxB)) { + ca->beta = BETA_MIN; + ca->modeswitch = 0; + return; + } + } + + if (ca->modeswitch && minRTT > msecs_to_jiffies(10) && maxRTT) { + ca->beta = (minRTT << 7) / maxRTT; + if (ca->beta < BETA_MIN) + ca->beta = BETA_MIN; + else if (ca->beta > BETA_MAX) + ca->beta = BETA_MAX; + } else { + ca->beta = BETA_MIN; + ca->modeswitch = 1; + } +} + +static inline void htcp_alpha_update(struct htcp *ca) +{ + u32 minRTT = ca->minRTT; + u32 factor = 1; + u32 diff = htcp_cong_time(ca); + + if (diff > HZ) { + diff -= HZ; + factor = 1 + (10 * diff + ((diff / 2) * (diff / 2) / HZ)) / HZ; + } + + if (use_rtt_scaling && minRTT) { + u32 scale = (HZ << 3) / (10 * minRTT); + + /* clamping ratio to interval [0.5,10]<<3 */ + scale = min(max(scale, 1U << 2), 10U << 3); + factor = (factor << 3) / scale; + if (!factor) + factor = 1; + } + + ca->alpha = 2 * factor * ((1 << 7) - ca->beta); + if (!ca->alpha) + ca->alpha = ALPHA_BASE; +} + +/* + * After we have the rtt data to calculate beta, we'd still prefer to wait one + * rtt before we adjust our beta to ensure we are working from a consistent + * data. + * + * This function should be called when we hit a congestion event since only at + * that point do we really have a real sense of maxRTT (the queues en route + * were getting just too full now). + */ +static void htcp_param_update(struct sock *sk) +{ + struct htcp *ca = inet_csk_ca(sk); + u32 minRTT = ca->minRTT; + u32 maxRTT = ca->maxRTT; + + htcp_beta_update(ca, minRTT, maxRTT); + htcp_alpha_update(ca); + + /* add slowly fading memory for maxRTT to accommodate routing changes */ + if (minRTT > 0 && maxRTT > minRTT) + ca->maxRTT = minRTT + ((maxRTT - minRTT) * 95) / 100; +} + +static u32 htcp_recalc_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct htcp *ca = inet_csk_ca(sk); + + htcp_param_update(sk); + return max((tp->snd_cwnd * ca->beta) >> 7, 2U); +} + +static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct htcp *ca = inet_csk_ca(sk); + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { + /* In dangerous area, increase slowly. 
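+ * (alpha is kept in <<7 fixed point, hence the >>7 when snd_cwnd_cnt * alpha is compared against snd_cwnd below.)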
+ * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd + */ + if ((tp->snd_cwnd_cnt * ca->alpha)>>7 >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + htcp_alpha_update(ca); + } else + tp->snd_cwnd_cnt += ca->pkts_acked; + + ca->pkts_acked = 1; + } +} + +static void htcp_init(struct sock *sk) +{ + struct htcp *ca = inet_csk_ca(sk); + + memset(ca, 0, sizeof(struct htcp)); + ca->alpha = ALPHA_BASE; + ca->beta = BETA_MIN; + ca->pkts_acked = 1; + ca->last_cong = jiffies; +} + +static void htcp_state(struct sock *sk, u8 new_state) +{ + switch (new_state) { + case TCP_CA_Open: + { + struct htcp *ca = inet_csk_ca(sk); + if (ca->undo_last_cong) { + ca->last_cong = jiffies; + ca->undo_last_cong = 0; + } + } + break; + case TCP_CA_CWR: + case TCP_CA_Recovery: + case TCP_CA_Loss: + htcp_reset(inet_csk_ca(sk)); + break; + } +} + +static struct tcp_congestion_ops htcp __read_mostly = { + .init = htcp_init, + .ssthresh = htcp_recalc_ssthresh, + .cong_avoid = htcp_cong_avoid, + .set_state = htcp_state, + .undo_cwnd = htcp_cwnd_undo, + .pkts_acked = measure_achieved_throughput, + .owner = THIS_MODULE, + .name = "htcp", +}; + +static int __init htcp_register(void) +{ + BUILD_BUG_ON(sizeof(struct htcp) > ICSK_CA_PRIV_SIZE); + BUILD_BUG_ON(BETA_MIN >= BETA_MAX); + return tcp_register_congestion_control(&htcp); +} + +static void __exit htcp_unregister(void) +{ + tcp_unregister_congestion_control(&htcp); +} + +module_init(htcp_register); +module_exit(htcp_unregister); + +MODULE_AUTHOR("Baruch Even"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("H-TCP"); diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c new file mode 100644 index 00000000..fe3ecf48 --- /dev/null +++ b/net/ipv4/tcp_hybla.c @@ -0,0 +1,192 @@ +/* + * TCP HYBLA + * + * TCP-HYBLA Congestion control algorithm, based on: + * C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement + * for Heterogeneous Networks", + * International Journal on satellite Communications, + * September 2004 + * Daniele Lacamera + * root at danielinux.net + */ + +#include +#include + +/* Tcp Hybla structure. 
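+ * Per-connection state. rho is the ratio between this connection's RTT and the reference RTT0 (25 ms by default); it scales cwnd growth so that long-RTT paths (e.g. satellite links) keep pace with a reference terrestrial connection.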
*/ +struct hybla { + u8 hybla_en; + u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */ + u32 rho; /* Rho parameter, integer part */ + u32 rho2; /* Rho * Rho, integer part */ + u32 rho_3ls; /* Rho parameter, <<3 */ + u32 rho2_7ls; /* Rho^2, <<7 */ + u32 minrtt; /* Minimum smoothed round trip time value seen */ +}; + +/* Hybla reference round trip time (default= 1/40 sec = 25 ms), + expressed in jiffies */ +static int rtt0 = 25; +module_param(rtt0, int, 0644); +MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)"); + + +/* This is called to refresh values for hybla parameters */ +static inline void hybla_recalc_param (struct sock *sk) +{ + struct hybla *ca = inet_csk_ca(sk); + + ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8); + ca->rho = ca->rho_3ls >> 3; + ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; + ca->rho2 = ca->rho2_7ls >>7; +} + +static void hybla_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct hybla *ca = inet_csk_ca(sk); + + ca->rho = 0; + ca->rho2 = 0; + ca->rho_3ls = 0; + ca->rho2_7ls = 0; + ca->snd_cwnd_cents = 0; + ca->hybla_en = 1; + tp->snd_cwnd = 2; + tp->snd_cwnd_clamp = 65535; + + /* 1st Rho measurement based on initial srtt */ + hybla_recalc_param(sk); + + /* set minimum rtt as this is the 1st ever seen */ + ca->minrtt = tp->srtt; + tp->snd_cwnd = ca->rho; +} + +static void hybla_state(struct sock *sk, u8 ca_state) +{ + struct hybla *ca = inet_csk_ca(sk); + ca->hybla_en = (ca_state == TCP_CA_Open); +} + +static inline u32 hybla_fraction(u32 odds) +{ + static const u32 fractions[] = { + 128, 139, 152, 165, 181, 197, 215, 234, + }; + + return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128; +} + +/* TCP Hybla main routine. + * This is the algorithm behavior: + * o Recalc Hybla parameters if min_rtt has changed + * o Give cwnd a new value based on the model proposed + * o remember increments <1 + */ +static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct hybla *ca = inet_csk_ca(sk); + u32 increment, odd, rho_fractions; + int is_slowstart = 0; + + /* Recalculate rho only if this srtt is the lowest */ + if (tp->srtt < ca->minrtt){ + hybla_recalc_param(sk); + ca->minrtt = tp->srtt; + } + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (!ca->hybla_en) { + tcp_reno_cong_avoid(sk, ack, in_flight); + return; + } + + if (ca->rho == 0) + hybla_recalc_param(sk); + + rho_fractions = ca->rho_3ls - (ca->rho << 3); + + if (tp->snd_cwnd < tp->snd_ssthresh) { + /* + * slow start + * INC = 2^RHO - 1 + * This is done by splitting the rho parameter + * into 2 parts: an integer part and a fraction part. + * Inrement<<7 is estimated by doing: + * [2^(int+fract)]<<7 + * that is equal to: + * (2^int) * [(2^fract) <<7] + * 2^int is straightly computed as 1<rho, 16U)) * + hybla_fraction(rho_fractions)) - 128; + } else { + /* + * congestion avoidance + * INC = RHO^2 / W + * as long as increment is estimated as (rho<<7)/window + * it already is <<7 and we can easily count its fractions. + */ + increment = ca->rho2_7ls / tp->snd_cwnd; + if (increment < 128) + tp->snd_cwnd_cnt++; + } + + odd = increment % 128; + tp->snd_cwnd += increment >> 7; + ca->snd_cwnd_cents += odd; + + /* check when fractions goes >=128 and increase cwnd by 1. 
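+ * snd_cwnd_cents holds the <<7 fractional part of the increment, so e.g. four quarter-segment increments (32/128 each) add up to one extra segment.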
*/ + while (ca->snd_cwnd_cents >= 128) { + tp->snd_cwnd++; + ca->snd_cwnd_cents -= 128; + tp->snd_cwnd_cnt = 0; + } + /* check when cwnd has not been incremented for a while */ + if (increment == 0 && odd == 0 && tp->snd_cwnd_cnt >= tp->snd_cwnd) { + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } + /* clamp down slowstart cwnd to ssthresh value. */ + if (is_slowstart) + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + + tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); +} + +static struct tcp_congestion_ops tcp_hybla __read_mostly = { + .init = hybla_init, + .ssthresh = tcp_reno_ssthresh, + .min_cwnd = tcp_reno_min_cwnd, + .cong_avoid = hybla_cong_avoid, + .set_state = hybla_state, + + .owner = THIS_MODULE, + .name = "hybla" +}; + +static int __init hybla_register(void) +{ + BUILD_BUG_ON(sizeof(struct hybla) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_hybla); +} + +static void __exit hybla_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_hybla); +} + +module_init(hybla_register); +module_exit(hybla_unregister); + +MODULE_AUTHOR("Daniele Lacamera"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Hybla"); diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c new file mode 100644 index 00000000..813b43a7 --- /dev/null +++ b/net/ipv4/tcp_illinois.c @@ -0,0 +1,356 @@ +/* + * TCP Illinois congestion control. + * Home page: + * http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html + * + * The algorithm is described in: + * "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm + * for High-Speed Networks" + * http://www.ifp.illinois.edu/~srikant/Papers/liubassri06perf.pdf + * + * Implemented from description in paper and ns-2 simulation. + * Copyright (C) 2007 Stephen Hemminger + */ + +#include +#include +#include +#include +#include + +#define ALPHA_SHIFT 7 +#define ALPHA_SCALE (1u<end_seq = tp->snd_nxt; + ca->cnt_rtt = 0; + ca->sum_rtt = 0; + + /* TODO: age max_rtt? */ +} + +static void tcp_illinois_init(struct sock *sk) +{ + struct illinois *ca = inet_csk_ca(sk); + + ca->alpha = ALPHA_MAX; + ca->beta = BETA_BASE; + ca->base_rtt = 0x7fffffff; + ca->max_rtt = 0; + + ca->acked = 0; + ca->rtt_low = 0; + ca->rtt_above = 0; + + rtt_reset(sk); +} + +/* Measure RTT for each ack. */ +static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, s32 rtt) +{ + struct illinois *ca = inet_csk_ca(sk); + + ca->acked = pkts_acked; + + /* dup ack, no rtt sample */ + if (rtt < 0) + return; + + /* ignore bogus values, this prevents wraparound in alpha math */ + if (rtt > RTT_MAX) + rtt = RTT_MAX; + + /* keep track of minimum RTT seen so far */ + if (ca->base_rtt > rtt) + ca->base_rtt = rtt; + + /* and max */ + if (ca->max_rtt < rtt) + ca->max_rtt = rtt; + + ++ca->cnt_rtt; + ca->sum_rtt += rtt; +} + +/* Maximum queuing delay */ +static inline u32 max_delay(const struct illinois *ca) +{ + return ca->max_rtt - ca->base_rtt; +} + +/* Average queuing delay */ +static inline u32 avg_delay(const struct illinois *ca) +{ + u64 t = ca->sum_rtt; + + do_div(t, ca->cnt_rtt); + return t - ca->base_rtt; +} + +/* + * Compute value of alpha used for additive increase. + * If small window then use 1.0, equivalent to Reno. + * + * For larger windows, adjust based on average delay. + * A. If average delay is at minimum (we are uncongested), + * then use large alpha (10.0) to increase faster. + * B. If average delay is at maximum (getting congested) + * then use small alpha (0.3) + * + * The result is a convex window growth curve. 
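+ * E.g. with dm = 100 ms the low threshold d1 is 1 ms: an average delay at or below 1 ms yields alpha = 10.0, while an average delay approaching 100 ms pushes alpha down towards 0.3 (ALPHA_MIN).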
+ */ +static u32 alpha(struct illinois *ca, u32 da, u32 dm) +{ + u32 d1 = dm / 100; /* Low threshold */ + + if (da <= d1) { + /* If never got out of low delay zone, then use max */ + if (!ca->rtt_above) + return ALPHA_MAX; + + /* Wait for 5 good RTT's before allowing alpha to go alpha max. + * This prevents one good RTT from causing sudden window increase. + */ + if (++ca->rtt_low < theta) + return ca->alpha; + + ca->rtt_low = 0; + ca->rtt_above = 0; + return ALPHA_MAX; + } + + ca->rtt_above = 1; + + /* + * Based on: + * + * (dm - d1) amin amax + * k1 = ------------------- + * amax - amin + * + * (dm - d1) amin + * k2 = ---------------- - d1 + * amax - amin + * + * k1 + * alpha = ---------- + * k2 + da + */ + + dm -= d1; + da -= d1; + return (dm * ALPHA_MAX) / + (dm + (da * (ALPHA_MAX - ALPHA_MIN)) / ALPHA_MIN); +} + +/* + * Beta used for multiplicative decrease. + * For small window sizes returns same value as Reno (0.5) + * + * If delay is small (10% of max) then beta = 1/8 + * If delay is up to 80% of max then beta = 1/2 + * In between is a linear function + */ +static u32 beta(u32 da, u32 dm) +{ + u32 d2, d3; + + d2 = dm / 10; + if (da <= d2) + return BETA_MIN; + + d3 = (8 * dm) / 10; + if (da >= d3 || d3 <= d2) + return BETA_MAX; + + /* + * Based on: + * + * bmin d3 - bmax d2 + * k3 = ------------------- + * d3 - d2 + * + * bmax - bmin + * k4 = ------------- + * d3 - d2 + * + * b = k3 + k4 da + */ + return (BETA_MIN * d3 - BETA_MAX * d2 + (BETA_MAX - BETA_MIN) * da) + / (d3 - d2); +} + +/* Update alpha and beta values once per RTT */ +static void update_params(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct illinois *ca = inet_csk_ca(sk); + + if (tp->snd_cwnd < win_thresh) { + ca->alpha = ALPHA_BASE; + ca->beta = BETA_BASE; + } else if (ca->cnt_rtt > 0) { + u32 dm = max_delay(ca); + u32 da = avg_delay(ca); + + ca->alpha = alpha(ca, da, dm); + ca->beta = beta(da, dm); + } + + rtt_reset(sk); +} + +/* + * In case of loss, reset to default values + */ +static void tcp_illinois_state(struct sock *sk, u8 new_state) +{ + struct illinois *ca = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { + ca->alpha = ALPHA_BASE; + ca->beta = BETA_BASE; + ca->rtt_low = 0; + ca->rtt_above = 0; + rtt_reset(sk); + } +} + +/* + * Increase window in response to successful acknowledgment. + */ +static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct illinois *ca = inet_csk_ca(sk); + + if (after(ack, ca->end_seq)) + update_params(sk); + + /* RFC2861 only increase cwnd if fully utilized */ + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + /* In slow start */ + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + + else { + u32 delta; + + /* snd_cwnd_cnt is # of packets since last cwnd increment */ + tp->snd_cwnd_cnt += ca->acked; + ca->acked = 1; + + /* This is close approximation of: + * tp->snd_cwnd += alpha/tp->snd_cwnd + */ + delta = (tp->snd_cwnd_cnt * ca->alpha) >> ALPHA_SHIFT; + if (delta >= tp->snd_cwnd) { + tp->snd_cwnd = min(tp->snd_cwnd + delta / tp->snd_cwnd, + (u32) tp->snd_cwnd_clamp); + tp->snd_cwnd_cnt = 0; + } + } +} + +static u32 tcp_illinois_ssthresh(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct illinois *ca = inet_csk_ca(sk); + + /* Multiplicative decrease */ + return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U); +} + + +/* Extract info for Tcp socket info provided via netlink. 
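+ * The tcpvegas_info block (INET_DIAG_VEGASINFO) is reused here to report the RTT sample count, the base (minimum) RTT and the mean RTT.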
*/ +static void tcp_illinois_info(struct sock *sk, u32 ext, + struct sk_buff *skb) +{ + const struct illinois *ca = inet_csk_ca(sk); + + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct tcpvegas_info info = { + .tcpv_enabled = 1, + .tcpv_rttcnt = ca->cnt_rtt, + .tcpv_minrtt = ca->base_rtt, + }; + u64 t = ca->sum_rtt; + + do_div(t, ca->cnt_rtt); + info.tcpv_rtt = t; + + nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); + } +} + +static struct tcp_congestion_ops tcp_illinois __read_mostly = { + .flags = TCP_CONG_RTT_STAMP, + .init = tcp_illinois_init, + .ssthresh = tcp_illinois_ssthresh, + .min_cwnd = tcp_reno_min_cwnd, + .cong_avoid = tcp_illinois_cong_avoid, + .set_state = tcp_illinois_state, + .get_info = tcp_illinois_info, + .pkts_acked = tcp_illinois_acked, + + .owner = THIS_MODULE, + .name = "illinois", +}; + +static int __init tcp_illinois_register(void) +{ + BUILD_BUG_ON(sizeof(struct illinois) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_illinois); +} + +static void __exit tcp_illinois_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_illinois); +} + +module_init(tcp_illinois_register); +module_exit(tcp_illinois_unregister); + +MODULE_AUTHOR("Stephen Hemminger, Shao Liu"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Illinois"); +MODULE_VERSION("1.0"); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c new file mode 100644 index 00000000..7410a8c2 --- /dev/null +++ b/net/ipv4/tcp_input.c @@ -0,0 +1,5963 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Authors: Ross Biro + * Fred N. van Kempen, + * Mark Evans, + * Corey Minyard + * Florian La Roche, + * Charles Hedrick, + * Linus Torvalds, + * Alan Cox, + * Matthew Dillon, + * Arnt Gulbrandsen, + * Jorge Cwik, + */ + +/* + * Changes: + * Pedro Roque : Fast Retransmit/Recovery. + * Two receive queues. + * Retransmit queue handled by TCP. + * Better retransmit timer handling. + * New congestion avoidance. + * Header prediction. + * Variable renaming. + * + * Eric : Fast Retransmit. + * Randy Scott : MSS option defines. + * Eric Schenk : Fixes to slow start algorithm. + * Eric Schenk : Yet another double ACK bug. + * Eric Schenk : Delayed ACK bug fixes. + * Eric Schenk : Floyd style fast retrans war avoidance. + * David S. Miller : Don't allow zero congestion window. + * Eric Schenk : Fix retransmitter so that it sends + * next packet on ack of previous packet. + * Andi Kleen : Moved open_request checking here + * and process RSTs for open_requests. + * Andi Kleen : Better prune_queue, and other fixes. + * Andrey Savochkin: Fix RTT measurements in the presence of + * timestamps. + * Andrey Savochkin: Check sequence numbers correctly when + * removing SACKs due to in sequence incoming + * data segments. + * Andi Kleen: Make sure we never ack data there is not + * enough room for. Also make this condition + * a fatal error if it might still happen. 
+ * Andi Kleen: Add tcp_measure_rcv_mss to make + * connections with MSS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int sysctl_tcp_timestamps __read_mostly = 1; +int sysctl_tcp_window_scaling __read_mostly = 1; +int sysctl_tcp_sack __read_mostly = 1; +int sysctl_tcp_fack __read_mostly = 1; +int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; +EXPORT_SYMBOL(sysctl_tcp_reordering); +int sysctl_tcp_ecn __read_mostly = 2; +EXPORT_SYMBOL(sysctl_tcp_ecn); +int sysctl_tcp_dsack __read_mostly = 1; +int sysctl_tcp_app_win __read_mostly = 31; +int sysctl_tcp_adv_win_scale __read_mostly = 1; +EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); + +int sysctl_tcp_stdurg __read_mostly; +int sysctl_tcp_rfc1337 __read_mostly; +int sysctl_tcp_max_orphans __read_mostly = NR_FILE; +int sysctl_tcp_frto __read_mostly = 2; +int sysctl_tcp_frto_response __read_mostly; +int sysctl_tcp_nometrics_save __read_mostly; + +int sysctl_tcp_thin_dupack __read_mostly; + +int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; +int sysctl_tcp_abc __read_mostly; + +#define FLAG_DATA 0x01 /* Incoming frame contained data. */ +#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ +#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ +#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ +#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ +#define FLAG_DATA_SACKED 0x20 /* New SACK. */ +#define FLAG_ECE 0x40 /* ECE in this ACK */ +#define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */ +#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ +#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ +#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ +#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ +#define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */ +#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ + +#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) +#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) +#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) +#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) +#define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED) + +#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) +#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) + +/* Adapt the MSS value used to make delayed ack decision to the + * real world. + */ +static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + const unsigned int lss = icsk->icsk_ack.last_seg_size; + unsigned int len; + + icsk->icsk_ack.last_seg_size = 0; + + /* skb->len may jitter because of SACKs, even if peer + * sends good full-sized frames. + */ + len = skb_shinfo(skb)->gso_size ? : skb->len; + if (len >= icsk->icsk_ack.rcv_mss) { + icsk->icsk_ack.rcv_mss = len; + } else { + /* Otherwise, we make more careful check taking into account, + * that SACKs block is variable. + * + * "len" is invariant segment length, including TCP header. + */ + len += skb->data - skb_transport_header(skb); + if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) || + /* If PSH is not set, packet should be + * full sized, provided peer TCP is not badly broken. + * This observation (if it is correct 8)) allows + * to handle super-low mtu links fairly. 
+ */ + (len >= TCP_MIN_MSS + sizeof(struct tcphdr) && + !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) { + /* Subtract also invariant (if peer is RFC compliant), + * tcp header plus fixed timestamp option length. + * Resulting "len" is MSS free of SACK jitter. + */ + len -= tcp_sk(sk)->tcp_header_len; + icsk->icsk_ack.last_seg_size = len; + if (len == lss) { + icsk->icsk_ack.rcv_mss = len; + return; + } + } + if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) + icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2; + icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; + } +} + +static void tcp_incr_quickack(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); + + if (quickacks == 0) + quickacks = 2; + if (quickacks > icsk->icsk_ack.quick) + icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); +} + +static void tcp_enter_quickack_mode(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + tcp_incr_quickack(sk); + icsk->icsk_ack.pingpong = 0; + icsk->icsk_ack.ato = TCP_ATO_MIN; +} + +/* Send ACKs quickly, if "quick" count is not exhausted + * and the session is not interactive. + */ + +static inline int tcp_in_quickack_mode(const struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; +} + +static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp) +{ + if (tp->ecn_flags & TCP_ECN_OK) + tp->ecn_flags |= TCP_ECN_QUEUE_CWR; +} + +static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, struct sk_buff *skb) +{ + if (tcp_hdr(skb)->cwr) + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; +} + +static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp) +{ + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; +} + +static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb) +{ + if (tp->ecn_flags & TCP_ECN_OK) { + if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags)) + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + /* Funny extension: if ECT is not set on a segment, + * it is surely retransmit. It is not in ECN RFC, + * but Linux follows this rule. */ + else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags))) + tcp_enter_quickack_mode((struct sock *)tp); + } +} + +static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, struct tcphdr *th) +{ + if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) + tp->ecn_flags &= ~TCP_ECN_OK; +} + +static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, struct tcphdr *th) +{ + if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) + tp->ecn_flags &= ~TCP_ECN_OK; +} + +static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th) +{ + if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) + return 1; + return 0; +} + +/* Buffer size and advertised window tuning. + * + * 1. Tuning sk->sk_sndbuf, when connection enters established state. + */ + +static void tcp_fixup_sndbuf(struct sock *sk) +{ + int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 + + sizeof(struct sk_buff); + + if (sk->sk_sndbuf < 3 * sndmem) { + sk->sk_sndbuf = 3 * sndmem; + if (sk->sk_sndbuf > sysctl_tcp_wmem[2]) + sk->sk_sndbuf = sysctl_tcp_wmem[2]; + } +} + +/* 2. Tuning advertised window (window_clamp, rcv_ssthresh) + * + * All tcp_full_space() is split to two parts: "network" buffer, allocated + * forward and advertised in receiver window (tp->rcv_wnd) and + * "application buffer", required to isolate scheduling/application + * latencies from network. + * window_clamp is maximal advertised window. 
It can be less than + * tcp_full_space(), in this case tcp_full_space() - window_clamp + * is reserved for "application" buffer. The less window_clamp is + * the smoother our behaviour from viewpoint of network, but the lower + * throughput and the higher sensitivity of the connection to losses. 8) + * + * rcv_ssthresh is more strict window_clamp used at "slow start" + * phase to predict further behaviour of this connection. + * It is used for two goals: + * - to enforce header prediction at sender, even when application + * requires some significant "application buffer". It is check #1. + * - to prevent pruning of receive queue because of misprediction + * of receiver window. Check #2. + * + * The scheme does not work when sender sends good segments opening + * window and then starts to feed us spaghetti. But it should work + * in common situations. Otherwise, we have to rely on queue collapsing. + */ + +/* Slow part of check#2. */ +static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + /* Optimize this! */ + int truesize = tcp_win_from_space(skb->truesize) >> 1; + int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1; + + while (tp->rcv_ssthresh <= window) { + if (truesize <= skb->len) + return 2 * inet_csk(sk)->icsk_ack.rcv_mss; + + truesize >>= 1; + window >>= 1; + } + return 0; +} + +static void tcp_grow_window(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Check #1 */ + if (tp->rcv_ssthresh < tp->window_clamp && + (int)tp->rcv_ssthresh < tcp_space(sk) && + !tcp_memory_pressure) { + int incr; + + /* Check #2. Increase window, if skb with such overhead + * will fit to rcvbuf in future. + */ + if (tcp_win_from_space(skb->truesize) <= skb->len) + incr = 2 * tp->advmss; + else + incr = __tcp_grow_window(sk, skb); + + if (incr) { + incr = max_t(int, incr, 2 * skb->len); + tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, + tp->window_clamp); + inet_csk(sk)->icsk_ack.quick |= 1; + } + } +} + +/* 3. Tuning rcvbuf, when connection enters established state. */ + +static void tcp_fixup_rcvbuf(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); + + /* Try to select rcvbuf so that 4 mss-sized segments + * will fit to window and corresponding skbs will fit to our rcvbuf. + * (was 3; 4 is minimum to allow fast retransmit to work.) + */ + while (tcp_win_from_space(rcvmem) < tp->advmss) + rcvmem += 128; + if (sk->sk_rcvbuf < 4 * rcvmem) + sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]); +} + +/* 4. Try to fixup all. It is made immediately after connection enters + * established state. + */ +static void tcp_init_buffer_space(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int maxwin; + + if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) + tcp_fixup_rcvbuf(sk); + if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) + tcp_fixup_sndbuf(sk); + + tp->rcvq_space.space = tp->rcv_wnd; + + maxwin = tcp_full_space(sk); + + if (tp->window_clamp >= maxwin) { + tp->window_clamp = maxwin; + + if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss) + tp->window_clamp = max(maxwin - + (maxwin >> sysctl_tcp_app_win), + 4 * tp->advmss); + } + + /* Force reservation of one segment. 
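+ * If window_clamp would otherwise swallow the whole buffer, leave one advmss of application slack, but never clamp below 2 * advmss.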
*/ + if (sysctl_tcp_app_win && + tp->window_clamp > 2 * tp->advmss && + tp->window_clamp + tp->advmss > maxwin) + tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss); + + tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +/* 5. Recalculate window clamp after socket hit its memory bounds. */ +static void tcp_clamp_window(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + icsk->icsk_ack.quick = 0; + + if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && + !tcp_memory_pressure && + atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), + sysctl_tcp_rmem[2]); + } + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) + tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); +} + +/* Initialize RCV_MSS value. + * RCV_MSS is an our guess about MSS used by the peer. + * We haven't any direct information about the MSS. + * It's better to underestimate the RCV_MSS rather than overestimate. + * Overestimations make us ACKing less frequently than needed. + * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss(). + */ +void tcp_initialize_rcv_mss(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); + + hint = min(hint, tp->rcv_wnd / 2); + hint = min(hint, TCP_MSS_DEFAULT); + hint = max(hint, TCP_MIN_MSS); + + inet_csk(sk)->icsk_ack.rcv_mss = hint; +} +EXPORT_SYMBOL(tcp_initialize_rcv_mss); + +/* Receiver "autotuning" code. + * + * The algorithm for RTT estimation w/o timestamps is based on + * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. + * + * + * More detail on this code can be found at + * , + * though this reference is out of date. A new paper + * is pending. + */ +static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) +{ + u32 new_sample = tp->rcv_rtt_est.rtt; + long m = sample; + + if (m == 0) + m = 1; + + if (new_sample != 0) { + /* If we sample in larger samples in the non-timestamp + * case, we could grossly overestimate the RTT especially + * with chatty applications or bulk transfer apps which + * are stalled on filesystem I/O. + * + * Also, since we are only going for a minimum in the + * non-timestamp case, we do not smooth things out + * else with timestamps disabled convergence takes too + * long. + */ + if (!win_dep) { + m -= (new_sample >> 3); + new_sample += m; + } else { + m <<= 3; + if (m < new_sample) + new_sample = m; + } + } else { + /* No previous measure. */ + new_sample = m << 3; + } + + if (tp->rcv_rtt_est.rtt != new_sample) + tp->rcv_rtt_est.rtt = new_sample; +} + +static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) +{ + if (tp->rcv_rtt_est.time == 0) + goto new_measure; + if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) + return; + tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1); + +new_measure: + tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; + tp->rcv_rtt_est.time = tcp_time_stamp; +} + +static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, + const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + if (tp->rx_opt.rcv_tsecr && + (TCP_SKB_CB(skb)->end_seq - + TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) + tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0); +} + +/* + * This function should be called every time data is copied to user space. 
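+ * Together with tcp_rcv_rtt_measure() it implements the DRS-style receiver autotuning described above.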
+ * It calculates the appropriate TCP receive buffer space. + */ +void tcp_rcv_space_adjust(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int time; + int space; + + if (tp->rcvq_space.time == 0) + goto new_measure; + + time = tcp_time_stamp - tp->rcvq_space.time; + if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) + return; + + space = 2 * (tp->copied_seq - tp->rcvq_space.seq); + + space = max(tp->rcvq_space.space, space); + + if (tp->rcvq_space.space != space) { + int rcvmem; + + tp->rcvq_space.space = space; + + if (sysctl_tcp_moderate_rcvbuf && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { + int new_clamp = space; + + /* Receive space grows, normalize in order to + * take into account packet headers and sk_buff + * structure overhead. + */ + space /= tp->advmss; + if (!space) + space = 1; + rcvmem = (tp->advmss + MAX_TCP_HEADER + + 16 + sizeof(struct sk_buff)); + while (tcp_win_from_space(rcvmem) < tp->advmss) + rcvmem += 128; + space *= rcvmem; + space = min(space, sysctl_tcp_rmem[2]); + if (space > sk->sk_rcvbuf) { + sk->sk_rcvbuf = space; + + /* Make the window clamp follow along. */ + tp->window_clamp = new_clamp; + } + } + } + +new_measure: + tp->rcvq_space.seq = tp->copied_seq; + tp->rcvq_space.time = tcp_time_stamp; +} + +/* There is something which you must keep in mind when you analyze the + * behavior of the tp->ato delayed ack timeout interval. When a + * connection starts up, we want to ack as quickly as possible. The + * problem is that "good" TCP's do slow start at the beginning of data + * transmission. The means that until we send the first few ACK's the + * sender will sit on his end and only queue most of his data, because + * he can only send snd_cwnd unacked packets at any given time. For + * each ACK we send, he increments snd_cwnd and transmits more of his + * queue. -DaveM + */ +static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + u32 now; + + inet_csk_schedule_ack(sk); + + tcp_measure_rcv_mss(sk, skb); + + tcp_rcv_rtt_measure(tp); + + now = tcp_time_stamp; + + if (!icsk->icsk_ack.ato) { + /* The _first_ data packet received, initialize + * delayed ACK engine. + */ + tcp_incr_quickack(sk); + icsk->icsk_ack.ato = TCP_ATO_MIN; + } else { + int m = now - icsk->icsk_ack.lrcvtime; + + if (m <= TCP_ATO_MIN / 2) { + /* The fastest case is the first. */ + icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2; + } else if (m < icsk->icsk_ack.ato) { + icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m; + if (icsk->icsk_ack.ato > icsk->icsk_rto) + icsk->icsk_ack.ato = icsk->icsk_rto; + } else if (m > icsk->icsk_rto) { + /* Too long gap. Apparently sender failed to + * restart window, so that we send ACKs quickly. + */ + tcp_incr_quickack(sk); + sk_mem_reclaim(sk); + } + } + icsk->icsk_ack.lrcvtime = now; + + TCP_ECN_check_ce(tp, skb); + + if (skb->len >= 128) + tcp_grow_window(sk, skb); +} + +/* Called to compute a smoothed rtt estimate. The data fed to this + * routine either comes from timestamps, or from segments that were + * known _not_ to have been retransmitted [see Karn/Partridge + * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 + * piece by Van Jacobson. + * NOTE: the next three routines used to be one big routine. + * To save cycles in the RFC 1323 implementation it was better to break + * it up into three procedures. 
-- erics + */ +static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) +{ + struct tcp_sock *tp = tcp_sk(sk); + long m = mrtt; /* RTT */ + + /* The following amusing code comes from Jacobson's + * article in SIGCOMM '88. Note that rtt and mdev + * are scaled versions of rtt and mean deviation. + * This is designed to be as fast as possible + * m stands for "measurement". + * + * On a 1990 paper the rto value is changed to: + * RTO = rtt + 4 * mdev + * + * Funny. This algorithm seems to be very broken. + * These formulae increase RTO, when it should be decreased, increase + * too slowly, when it should be increased quickly, decrease too quickly + * etc. I guess in BSD RTO takes ONE value, so that it is absolutely + * does not matter how to _calculate_ it. Seems, it was trap + * that VJ failed to avoid. 8) + */ + if (m == 0) + m = 1; + if (tp->srtt != 0) { + m -= (tp->srtt >> 3); /* m is now error in rtt est */ + tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */ + if (m < 0) { + m = -m; /* m is now abs(error) */ + m -= (tp->mdev >> 2); /* similar update on mdev */ + /* This is similar to one of Eifel findings. + * Eifel blocks mdev updates when rtt decreases. + * This solution is a bit different: we use finer gain + * for mdev in this case (alpha*beta). + * Like Eifel it also prevents growth of rto, + * but also it limits too fast rto decreases, + * happening in pure Eifel. + */ + if (m > 0) + m >>= 3; + } else { + m -= (tp->mdev >> 2); /* similar update on mdev */ + } + tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ + if (tp->mdev > tp->mdev_max) { + tp->mdev_max = tp->mdev; + if (tp->mdev_max > tp->rttvar) + tp->rttvar = tp->mdev_max; + } + if (after(tp->snd_una, tp->rtt_seq)) { + if (tp->mdev_max < tp->rttvar) + tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2; + tp->rtt_seq = tp->snd_nxt; + tp->mdev_max = tcp_rto_min(sk); + } + } else { + /* no previous measure. */ + tp->srtt = m << 3; /* take the measured time to be rtt */ + tp->mdev = m << 1; /* make sure rto = 3*rtt */ + tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); + tp->rtt_seq = tp->snd_nxt; + } +} + +/* Calculate rto without backoff. This is the second half of Van Jacobson's + * routine referred to above. + */ +static inline void tcp_set_rto(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + /* Old crap is replaced with new one. 8) + * + * More seriously: + * 1. If rtt variance happened to be less 50msec, it is hallucination. + * It cannot be less due to utterly erratic ACK generation made + * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ + * to do with delayed acks, because at cwnd>2 true delack timeout + * is invisible. Actually, Linux-2.4 also generates erratic + * ACKs in some circumstances. + */ + inet_csk(sk)->icsk_rto = __tcp_set_rto(tp); + + /* 2. Fixups made earlier cannot be right. + * If we do not estimate RTO correctly without them, + * all the algo is pure shit and should be replaced + * with correct one. It is exactly, which we pretend to do. + */ + + /* NOTE: clamping at TCP_RTO_MIN is not required, current algo + * guarantees that rto is higher. + */ + tcp_bound_rto(sk); +} + +/* Save metrics learned by this TCP session. + This function is called only, when TCP finishes successfully + i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE. 
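+ * The learned rtt, rttvar, ssthresh, cwnd and reordering estimates are saved into the destination cache entry for use by later connections to the same host.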
+ */ +void tcp_update_metrics(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = __sk_dst_get(sk); + + if (sysctl_tcp_nometrics_save) + return; + + dst_confirm(dst); + + if (dst && (dst->flags & DST_HOST)) { + const struct inet_connection_sock *icsk = inet_csk(sk); + int m; + unsigned long rtt; + + if (icsk->icsk_backoff || !tp->srtt) { + /* This session failed to estimate rtt. Why? + * Probably, no packets returned in time. + * Reset our results. + */ + if (!(dst_metric_locked(dst, RTAX_RTT))) + dst_metric_set(dst, RTAX_RTT, 0); + return; + } + + rtt = dst_metric_rtt(dst, RTAX_RTT); + m = rtt - tp->srtt; + + /* If newly calculated rtt larger than stored one, + * store new one. Otherwise, use EWMA. Remember, + * rtt overestimation is always better than underestimation. + */ + if (!(dst_metric_locked(dst, RTAX_RTT))) { + if (m <= 0) + set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt); + else + set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3)); + } + + if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { + unsigned long var; + if (m < 0) + m = -m; + + /* Scale deviation to rttvar fixed point */ + m >>= 1; + if (m < tp->mdev) + m = tp->mdev; + + var = dst_metric_rtt(dst, RTAX_RTTVAR); + if (m >= var) + var = m; + else + var -= (var - m) >> 2; + + set_dst_metric_rtt(dst, RTAX_RTTVAR, var); + } + + if (tcp_in_initial_slowstart(tp)) { + /* Slow start still did not finish. */ + if (dst_metric(dst, RTAX_SSTHRESH) && + !dst_metric_locked(dst, RTAX_SSTHRESH) && + (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) + dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1); + if (!dst_metric_locked(dst, RTAX_CWND) && + tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) + dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd); + } else if (tp->snd_cwnd > tp->snd_ssthresh && + icsk->icsk_ca_state == TCP_CA_Open) { + /* Cong. avoidance phase, cwnd is reliable. */ + if (!dst_metric_locked(dst, RTAX_SSTHRESH)) + dst_metric_set(dst, RTAX_SSTHRESH, + max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); + if (!dst_metric_locked(dst, RTAX_CWND)) + dst_metric_set(dst, RTAX_CWND, + (dst_metric(dst, RTAX_CWND) + + tp->snd_cwnd) >> 1); + } else { + /* Else slow start did not finish, cwnd is non-sense, + ssthresh may be also invalid. + */ + if (!dst_metric_locked(dst, RTAX_CWND)) + dst_metric_set(dst, RTAX_CWND, + (dst_metric(dst, RTAX_CWND) + + tp->snd_ssthresh) >> 1); + if (dst_metric(dst, RTAX_SSTHRESH) && + !dst_metric_locked(dst, RTAX_SSTHRESH) && + tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) + dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh); + } + + if (!dst_metric_locked(dst, RTAX_REORDERING)) { + if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && + tp->reordering != sysctl_tcp_reordering) + dst_metric_set(dst, RTAX_REORDERING, tp->reordering); + } + } +} + +__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) +{ + __u32 cwnd = (dst ? 
dst_metric(dst, RTAX_INITCWND) : 0); + + if (!cwnd) + cwnd = TCP_INIT_CWND; + return min_t(__u32, cwnd, tp->snd_cwnd_clamp); +} + +/* Set slow start threshold and cwnd not falling to slow start */ +void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) +{ + struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + + tp->prior_ssthresh = 0; + tp->bytes_acked = 0; + if (icsk->icsk_ca_state < TCP_CA_CWR) { + tp->undo_marker = 0; + if (set_ssthresh) + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + tp->snd_cwnd = min(tp->snd_cwnd, + tcp_packets_in_flight(tp) + 1U); + tp->snd_cwnd_cnt = 0; + tp->high_seq = tp->snd_nxt; + tp->snd_cwnd_stamp = tcp_time_stamp; + TCP_ECN_queue_cwr(tp); + + tcp_set_ca_state(sk, TCP_CA_CWR); + } +} + +/* + * Packet counting of FACK is based on in-order assumptions, therefore TCP + * disables it when reordering is detected + */ +static void tcp_disable_fack(struct tcp_sock *tp) +{ + /* RFC3517 uses different metric in lost marker => reset on change */ + if (tcp_is_fack(tp)) + tp->lost_skb_hint = NULL; + tp->rx_opt.sack_ok &= ~2; +} + +/* Take a notice that peer is sending D-SACKs */ +static void tcp_dsack_seen(struct tcp_sock *tp) +{ + tp->rx_opt.sack_ok |= 4; +} + +/* Initialize metrics on socket. */ + +static void tcp_init_metrics(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = __sk_dst_get(sk); + + if (dst == NULL) + goto reset; + + dst_confirm(dst); + + if (dst_metric_locked(dst, RTAX_CWND)) + tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND); + if (dst_metric(dst, RTAX_SSTHRESH)) { + tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); + if (tp->snd_ssthresh > tp->snd_cwnd_clamp) + tp->snd_ssthresh = tp->snd_cwnd_clamp; + } + if (dst_metric(dst, RTAX_REORDERING) && + tp->reordering != dst_metric(dst, RTAX_REORDERING)) { + tcp_disable_fack(tp); + tp->reordering = dst_metric(dst, RTAX_REORDERING); + } + + if (dst_metric(dst, RTAX_RTT) == 0) + goto reset; + + if (!tp->srtt && dst_metric_rtt(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3)) + goto reset; + + /* Initial rtt is determined from SYN,SYN-ACK. + * The segment is small and rtt may appear much + * less than real one. Use per-dst memory + * to make it more realistic. + * + * A bit of theory. RTT is time passed after "normal" sized packet + * is sent until it is ACKed. In normal circumstances sending small + * packets force peer to delay ACKs and calculation is correct too. + * The algorithm is adaptive and, provided we follow specs, it + * NEVER underestimate RTT. BUT! If peer tries to make some clever + * tricks sort of "quick acks" for time long enough to decrease RTT + * to low value, and then abruptly stops to do it and starts to delay + * ACKs, wait for troubles. + */ + if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) { + tp->srtt = dst_metric_rtt(dst, RTAX_RTT); + tp->rtt_seq = tp->snd_nxt; + } + if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) { + tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR); + tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); + } + tcp_set_rto(sk); + if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) { +reset: + /* Play conservative. If timestamps are not + * supported, TCP will fail to recalculate correct + * rtt, if initial rto is too small. FORGET ALL AND RESET! 
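+ * (srtt is cleared and mdev/rto fall back to TCP_TIMEOUT_INIT below.)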
+ */ + if (!tp->rx_opt.saw_tstamp && tp->srtt) { + tp->srtt = 0; + tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; + } + } + tp->snd_cwnd = tcp_init_cwnd(tp, dst); + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static void tcp_update_reordering(struct sock *sk, const int metric, + const int ts) +{ + struct tcp_sock *tp = tcp_sk(sk); + if (metric > tp->reordering) { + int mib_idx; + + tp->reordering = min(TCP_MAX_REORDERING, metric); + + /* This exciting event is worth to be remembered. 8) */ + if (ts) + mib_idx = LINUX_MIB_TCPTSREORDER; + else if (tcp_is_reno(tp)) + mib_idx = LINUX_MIB_TCPRENOREORDER; + else if (tcp_is_fack(tp)) + mib_idx = LINUX_MIB_TCPFACKREORDER; + else + mib_idx = LINUX_MIB_TCPSACKREORDER; + + NET_INC_STATS_BH(sock_net(sk), mib_idx); +#if FASTRETRANS_DEBUG > 1 + printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", + tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, + tp->reordering, + tp->fackets_out, + tp->sacked_out, + tp->undo_marker ? tp->undo_retrans : 0); +#endif + tcp_disable_fack(tp); + } +} + +/* This must be called before lost_out is incremented */ +static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) +{ + if ((tp->retransmit_skb_hint == NULL) || + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) + tp->retransmit_skb_hint = skb; + + if (!tp->lost_out || + after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high)) + tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; +} + +static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) +{ + if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { + tcp_verify_retransmit_hint(tp, skb); + + tp->lost_out += tcp_skb_pcount(skb); + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + } +} + +static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, + struct sk_buff *skb) +{ + tcp_verify_retransmit_hint(tp, skb); + + if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { + tp->lost_out += tcp_skb_pcount(skb); + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + } +} + +/* This procedure tags the retransmission queue when SACKs arrive. + * + * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). + * Packets in queue with these bits set are counted in variables + * sacked_out, retrans_out and lost_out, correspondingly. + * + * Valid combinations are: + * Tag InFlight Description + * 0 1 - orig segment is in flight. + * S 0 - nothing flies, orig reached receiver. + * L 0 - nothing flies, orig lost by net. + * R 2 - both orig and retransmit are in flight. + * L|R 1 - orig is lost, retransmit is in flight. + * S|R 1 - orig reached receiver, retrans is still in flight. + * (L|S|R is logically valid, it could occur when L|R is sacked, + * but it is equivalent to plain S and code short-curcuits it to S. + * L|S is logically invalid, it would mean -1 packet in flight 8)) + * + * These 6 states form finite state machine, controlled by the following events: + * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue()) + * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue()) + * 3. Loss detection event of one of three flavors: + * A. Scoreboard estimator decided the packet is lost. + * A'. Reno "three dupacks" marks head of queue lost. + * A''. Its FACK modfication, head until snd.fack is lost. + * B. SACK arrives sacking data transmitted after never retransmitted + * hole was sent out. + * C. SACK arrives sacking SND.NXT at the moment, when the + * segment was retransmitted. + * 4. 
D-SACK added new rule: D-SACK changes any tag to S. + * + * It is pleasant to note, that state diagram turns out to be commutative, + * so that we are allowed not to be bothered by order of our actions, + * when multiple events arrive simultaneously. (see the function below). + * + * Reordering detection. + * -------------------- + * Reordering metric is maximal distance, which a packet can be displaced + * in packet stream. With SACKs we can estimate it: + * + * 1. SACK fills old hole and the corresponding segment was not + * ever retransmitted -> reordering. Alas, we cannot use it + * when segment was retransmitted. + * 2. The last flaw is solved with D-SACK. D-SACK arrives + * for retransmitted and already SACKed segment -> reordering.. + * Both of these heuristics are not used in Loss state, when we cannot + * account for retransmits accurately. + * + * SACK block validation. + * ---------------------- + * + * SACK block range validation checks that the received SACK block fits to + * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT. + * Note that SND.UNA is not included to the range though being valid because + * it means that the receiver is rather inconsistent with itself reporting + * SACK reneging when it should advance SND.UNA. Such SACK block this is + * perfectly valid, however, in light of RFC2018 which explicitly states + * that "SACK block MUST reflect the newest segment. Even if the newest + * segment is going to be discarded ...", not that it looks very clever + * in case of head skb. Due to potentional receiver driven attacks, we + * choose to avoid immediate execution of a walk in write queue due to + * reneging and defer head skb's loss recovery to standard loss recovery + * procedure that will eventually trigger (nothing forbids us doing this). + * + * Implements also blockage to start_seq wrap-around. Problem lies in the + * fact that though start_seq (s) is before end_seq (i.e., not reversed), + * there's no guarantee that it will be before snd_nxt (n). The problem + * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt + * wrap (s_w): + * + * <- outs wnd -> <- wrapzone -> + * u e n u_w e_w s n_w + * | | | | | | | + * |<------------+------+----- TCP seqno space --------------+---------->| + * ...-- <2^31 ->| |<--------... + * ...---- >2^31 ------>| |<--------... + * + * Current code wouldn't be vulnerable but it's better still to discard such + * crazy SACK blocks. Doing this check for start_seq alone closes somewhat + * similar case (end_seq after snd_nxt wrap) as earlier reversed check in + * snd_nxt wrap -> snd_una region will then become "well defined", i.e., + * equal to the ideal case (infinite seqno space without wrap caused issues). + * + * With D-SACK the lower bound is extended to cover sequence space below + * SND.UNA down to undo_marker, which is the last point of interest. Yet + * again, D-SACK block must not to go across snd_una (for the same reason as + * for the normal SACK blocks, explained above). But there all simplicity + * ends, TCP might receive valid D-SACKs below that. As long as they reside + * fully below undo_marker they do not affect behavior in anyway and can + * therefore be safely ignored. In rare cases (which are more or less + * theoretical ones), the D-SACK will nicely cross that boundary due to skb + * fragmentation and packet reordering past skb's retransmission. 
To consider + * them correctly, the acceptable range must be extended even more though + * the exact amount is rather hard to quantify. However, tp->max_window can + * be used as an exaggerated estimate. + */ +static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, + u32 start_seq, u32 end_seq) +{ + /* Too far in future, or reversed (interpretation is ambiguous) */ + if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq)) + return 0; + + /* Nasty start_seq wrap-around check (see comments above) */ + if (!before(start_seq, tp->snd_nxt)) + return 0; + + /* In outstanding window? ...This is valid exit for D-SACKs too. + * start_seq == snd_una is non-sensical (see comments above) + */ + if (after(start_seq, tp->snd_una)) + return 1; + + if (!is_dsack || !tp->undo_marker) + return 0; + + /* ...Then it's D-SACK, and must reside below snd_una completely */ + if (after(end_seq, tp->snd_una)) + return 0; + + if (!before(start_seq, tp->undo_marker)) + return 1; + + /* Too old */ + if (!after(end_seq, tp->undo_marker)) + return 0; + + /* Undo_marker boundary crossing (overestimates a lot). Known already: + * start_seq < undo_marker and end_seq >= undo_marker. + */ + return !before(start_seq, end_seq - tp->max_window); +} + +/* Check for lost retransmit. This superb idea is borrowed from "ratehalving". + * Event "C". Later note: FACK people cheated me again 8), we have to account + * for reordering! Ugly, but should help. + * + * Search retransmitted skbs from write_queue that were sent when snd_nxt was + * less than what is now known to be received by the other end (derived from + * highest SACK block). Also calculate the lowest snd_nxt among the remaining + * retransmitted skbs to avoid some costly processing per ACKs. + */ +static void tcp_mark_lost_retrans(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + int cnt = 0; + u32 new_low_seq = tp->snd_nxt; + u32 received_upto = tcp_highest_sack_seq(tp); + + if (!tcp_is_fack(tp) || !tp->retrans_out || + !after(received_upto, tp->lost_retrans_low) || + icsk->icsk_ca_state != TCP_CA_Recovery) + return; + + tcp_for_write_queue(skb, sk) { + u32 ack_seq = TCP_SKB_CB(skb)->ack_seq; + + if (skb == tcp_send_head(sk)) + break; + if (cnt == tp->retrans_out) + break; + if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) + continue; + + if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) + continue; + + /* TODO: We would like to get rid of tcp_is_fack(tp) only + * constraint here (see above) but figuring out that at + * least tp->reordering SACK blocks reside between ack_seq + * and received_upto is not easy task to do cheaply with + * the available datastructures. + * + * Whether FACK should check here for tp->reordering segs + * in-between one could argue for either way (it would be + * rather simple to implement as we could count fack_count + * during the walk and do tp->fackets_out - fack_count). 
+ */ + if (after(received_upto, ack_seq)) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + + tcp_skb_mark_lost_uncond_verify(tp, skb); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT); + } else { + if (before(ack_seq, new_low_seq)) + new_low_seq = ack_seq; + cnt += tcp_skb_pcount(skb); + } + } + + if (tp->retrans_out) + tp->lost_retrans_low = new_low_seq; +} + +static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb, + struct tcp_sack_block_wire *sp, int num_sacks, + u32 prior_snd_una) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); + u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); + int dup_sack = 0; + + if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { + dup_sack = 1; + tcp_dsack_seen(tp); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV); + } else if (num_sacks > 1) { + u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq); + u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq); + + if (!after(end_seq_0, end_seq_1) && + !before(start_seq_0, start_seq_1)) { + dup_sack = 1; + tcp_dsack_seen(tp); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPDSACKOFORECV); + } + } + + /* D-SACK for already forgotten data... Do dumb counting. */ + if (dup_sack && tp->undo_marker && tp->undo_retrans && + !after(end_seq_0, prior_snd_una) && + after(end_seq_0, tp->undo_marker)) + tp->undo_retrans--; + + return dup_sack; +} + +struct tcp_sacktag_state { + int reord; + int fack_count; + int flag; +}; + +/* Check if skb is fully within the SACK block. In presence of GSO skbs, + * the incoming SACK may not exactly match but we can find smaller MSS + * aligned portion of it that matches. Therefore we might need to fragment + * which may fail and creates some hassle (caller must handle error case + * returns). + * + * FIXME: this could be merged to shift decision code + */ +static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, + u32 start_seq, u32 end_seq) +{ + int in_sack, err; + unsigned int pkt_len; + unsigned int mss; + + in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && + !before(end_seq, TCP_SKB_CB(skb)->end_seq); + + if (tcp_skb_pcount(skb) > 1 && !in_sack && + after(TCP_SKB_CB(skb)->end_seq, start_seq)) { + mss = tcp_skb_mss(skb); + in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); + + if (!in_sack) { + pkt_len = start_seq - TCP_SKB_CB(skb)->seq; + if (pkt_len < mss) + pkt_len = mss; + } else { + pkt_len = end_seq - TCP_SKB_CB(skb)->seq; + if (pkt_len < mss) + return -EINVAL; + } + + /* Round if necessary so that SACKs cover only full MSSes + * and/or the remaining small portion (if present) + */ + if (pkt_len > mss) { + unsigned int new_len = (pkt_len / mss) * mss; + if (!in_sack && new_len < pkt_len) { + new_len += mss; + if (new_len > skb->len) + return 0; + } + pkt_len = new_len; + } + err = tcp_fragment(sk, skb, pkt_len, mss); + if (err < 0) + return err; + } + + return in_sack; +} + +/* Mark the given newly-SACKed range as such, adjusting counters and hints. */ +static u8 tcp_sacktag_one(struct sock *sk, + struct tcp_sacktag_state *state, u8 sacked, + u32 start_seq, u32 end_seq, + int dup_sack, int pcount) +{ + struct tcp_sock *tp = tcp_sk(sk); + int fack_count = state->fack_count; + + /* Account D-SACK for retransmitted packet. 
*/ + if (dup_sack && (sacked & TCPCB_RETRANS)) { + if (tp->undo_marker && tp->undo_retrans && + after(end_seq, tp->undo_marker)) + tp->undo_retrans--; + if (sacked & TCPCB_SACKED_ACKED) + state->reord = min(fack_count, state->reord); + } + + /* Nothing to do; acked frame is about to be dropped (was ACKed). */ + if (!after(end_seq, tp->snd_una)) + return sacked; + + if (!(sacked & TCPCB_SACKED_ACKED)) { + if (sacked & TCPCB_SACKED_RETRANS) { + /* If the segment is not tagged as lost, + * we do not clear RETRANS, believing + * that retransmission is still in flight. + */ + if (sacked & TCPCB_LOST) { + sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); + tp->lost_out -= pcount; + tp->retrans_out -= pcount; + } + } else { + if (!(sacked & TCPCB_RETRANS)) { + /* New sack for not retransmitted frame, + * which was in hole. It is reordering. + */ + if (before(start_seq, + tcp_highest_sack_seq(tp))) + state->reord = min(fack_count, + state->reord); + + /* SACK enhanced F-RTO (RFC4138; Appendix B) */ + if (!after(end_seq, tp->frto_highmark)) + state->flag |= FLAG_ONLY_ORIG_SACKED; + } + + if (sacked & TCPCB_LOST) { + sacked &= ~TCPCB_LOST; + tp->lost_out -= pcount; + } + } + + sacked |= TCPCB_SACKED_ACKED; + state->flag |= FLAG_DATA_SACKED; + tp->sacked_out += pcount; + + fack_count += pcount; + + /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ + if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && + before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) + tp->lost_cnt_hint += pcount; + + if (fack_count > tp->fackets_out) + tp->fackets_out = fack_count; + } + + /* D-SACK. We can detect redundant retransmission in S|R and plain R + * frames and clear it. undo_retrans is decreased above, L|R frames + * are accounted above as well. + */ + if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) { + sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= pcount; + } + + return sacked; +} + +/* Shift newly-SACKed bytes from this skb to the immediately previous + * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. + */ +static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, + struct tcp_sacktag_state *state, + unsigned int pcount, int shifted, int mss, + int dup_sack) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *prev = tcp_write_queue_prev(sk, skb); + u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */ + u32 end_seq = start_seq + shifted; /* end of newly-SACKed */ + + BUG_ON(!pcount); + + /* Adjust counters and hints for the newly sacked sequence + * range but discard the return value since prev is already + * marked. We must tag the range first because the seq + * advancement below implicitly advances + * tcp_highest_sack_seq() when skb is highest_sack. + */ + tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, + start_seq, end_seq, dup_sack, pcount); + + if (skb == tp->lost_skb_hint) + tp->lost_cnt_hint += pcount; + + TCP_SKB_CB(prev)->end_seq += shifted; + TCP_SKB_CB(skb)->seq += shifted; + + skb_shinfo(prev)->gso_segs += pcount; + BUG_ON(skb_shinfo(skb)->gso_segs < pcount); + skb_shinfo(skb)->gso_segs -= pcount; + + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep + * setting gso_size to something. + */ + if (!skb_shinfo(prev)->gso_size) { + skb_shinfo(prev)->gso_size = mss; + skb_shinfo(prev)->gso_type = sk->sk_gso_type; + } + + /* CHECKME: To clear or not to clear? 
Mimics normal skb currently */ + if (skb_shinfo(skb)->gso_segs <= 1) { + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; + } + + /* Difference in this won't matter, both ACKed by the same cumul. ACK */ + TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); + + if (skb->len > 0) { + BUG_ON(!tcp_skb_pcount(skb)); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED); + return 0; + } + + /* Whole SKB was eaten :-) */ + + if (skb == tp->retransmit_skb_hint) + tp->retransmit_skb_hint = prev; + if (skb == tp->scoreboard_skb_hint) + tp->scoreboard_skb_hint = prev; + if (skb == tp->lost_skb_hint) { + tp->lost_skb_hint = prev; + tp->lost_cnt_hint -= tcp_skb_pcount(prev); + } + + TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags; + if (skb == tcp_highest_sack(sk)) + tcp_advance_highest_sack(sk, skb); + + tcp_unlink_write_queue(skb, sk); + sk_wmem_free_skb(sk, skb); + + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED); + + return 1; +} + +/* I wish gso_size would have a bit more sane initialization than + * something-or-zero which complicates things + */ +static int tcp_skb_seglen(struct sk_buff *skb) +{ + return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb); +} + +/* Shifting pages past head area doesn't work */ +static int skb_can_shift(struct sk_buff *skb) +{ + return !skb_headlen(skb) && skb_is_nonlinear(skb); +} + +/* Try collapsing SACK blocks spanning across multiple skbs to a single + * skb. + */ +static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, + struct tcp_sacktag_state *state, + u32 start_seq, u32 end_seq, + int dup_sack) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *prev; + int mss; + int pcount = 0; + int len; + int in_sack; + + if (!sk_can_gso(sk)) + goto fallback; + + /* Normally R but no L won't result in plain S */ + if (!dup_sack && + (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS) + goto fallback; + if (!skb_can_shift(skb)) + goto fallback; + /* This frame is about to be dropped (was ACKed). */ + if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) + goto fallback; + + /* Can only happen with delayed DSACK + discard craziness */ + if (unlikely(skb == tcp_write_queue_head(sk))) + goto fallback; + prev = tcp_write_queue_prev(sk, skb); + + if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) + goto fallback; + + in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && + !before(end_seq, TCP_SKB_CB(skb)->end_seq); + + if (in_sack) { + len = skb->len; + pcount = tcp_skb_pcount(skb); + mss = tcp_skb_seglen(skb); + + /* TODO: Fix DSACKs to not fragment already SACKed and we can + * drop this restriction as unnecessary + */ + if (mss != tcp_skb_seglen(prev)) + goto fallback; + } else { + if (!after(TCP_SKB_CB(skb)->end_seq, start_seq)) + goto noop; + /* CHECKME: This is non-MSS split case only?, this will + * cause skipped skbs due to advancing loop btw, original + * has that feature too + */ + if (tcp_skb_pcount(skb) <= 1) + goto noop; + + in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); + if (!in_sack) { + /* TODO: head merge to next could be attempted here + * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)), + * though it might not be worth of the additional hassle + * + * ...we can probably just fallback to what was done + * previously. 
We could try merging non-SACKed ones + * as well but it probably isn't going to buy off + * because later SACKs might again split them, and + * it would make skb timestamp tracking considerably + * harder problem. + */ + goto fallback; + } + + len = end_seq - TCP_SKB_CB(skb)->seq; + BUG_ON(len < 0); + BUG_ON(len > skb->len); + + /* MSS boundaries should be honoured or else pcount will + * severely break even though it makes things bit trickier. + * Optimize common case to avoid most of the divides + */ + mss = tcp_skb_mss(skb); + + /* TODO: Fix DSACKs to not fragment already SACKed and we can + * drop this restriction as unnecessary + */ + if (mss != tcp_skb_seglen(prev)) + goto fallback; + + if (len == mss) { + pcount = 1; + } else if (len < mss) { + goto noop; + } else { + pcount = len / mss; + len = pcount * mss; + } + } + + /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */ + if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una)) + goto fallback; + + if (!skb_shift(prev, skb, len)) + goto fallback; + if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) + goto out; + + /* Hole filled allows collapsing with the next as well, this is very + * useful when hole on every nth skb pattern happens + */ + if (prev == tcp_write_queue_tail(sk)) + goto out; + skb = tcp_write_queue_next(sk, prev); + + if (!skb_can_shift(skb) || + (skb == tcp_send_head(sk)) || + ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) || + (mss != tcp_skb_seglen(skb))) + goto out; + + len = skb->len; + if (skb_shift(prev, skb, len)) { + pcount += tcp_skb_pcount(skb); + tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0); + } + +out: + state->fack_count += pcount; + return prev; + +noop: + return skb; + +fallback: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK); + return NULL; +} + +static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, + struct tcp_sack_block *next_dup, + struct tcp_sacktag_state *state, + u32 start_seq, u32 end_seq, + int dup_sack_in) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *tmp; + + tcp_for_write_queue_from(skb, sk) { + int in_sack = 0; + int dup_sack = dup_sack_in; + + if (skb == tcp_send_head(sk)) + break; + + /* queue is in-order => we can short-circuit the walk early */ + if (!before(TCP_SKB_CB(skb)->seq, end_seq)) + break; + + if ((next_dup != NULL) && + before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) { + in_sack = tcp_match_skb_to_sack(sk, skb, + next_dup->start_seq, + next_dup->end_seq); + if (in_sack > 0) + dup_sack = 1; + } + + /* skb reference here is a bit tricky to get right, since + * shifting can eat and free both this skb and the next, + * so not even _safe variant of the loop is enough. 
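+ *
+ * The pattern below therefore always resumes from whatever skb
+ * the shifting code returns, roughly:
+ *
+ *	tmp = tcp_shift_skb_data(...);
+ *	if (tmp != NULL && tmp != skb) {
+ *		skb = tmp;	(the old skb may already be freed)
+ *		continue;
+ *	}
+ *
+ * rather than dereferencing a ->next pointer cached beforehand.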
+ */ + if (in_sack <= 0) { + tmp = tcp_shift_skb_data(sk, skb, state, + start_seq, end_seq, dup_sack); + if (tmp != NULL) { + if (tmp != skb) { + skb = tmp; + continue; + } + + in_sack = 0; + } else { + in_sack = tcp_match_skb_to_sack(sk, skb, + start_seq, + end_seq); + } + } + + if (unlikely(in_sack < 0)) + break; + + if (in_sack) { + TCP_SKB_CB(skb)->sacked = + tcp_sacktag_one(sk, + state, + TCP_SKB_CB(skb)->sacked, + TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + dup_sack, + tcp_skb_pcount(skb)); + + if (!before(TCP_SKB_CB(skb)->seq, + tcp_highest_sack_seq(tp))) + tcp_advance_highest_sack(sk, skb); + } + + state->fack_count += tcp_skb_pcount(skb); + } + return skb; +} + +/* Avoid all extra work that is being done by sacktag while walking in + * a normal way + */ +static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, + struct tcp_sacktag_state *state, + u32 skip_to_seq) +{ + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; + + if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq)) + break; + + state->fack_count += tcp_skb_pcount(skb); + } + return skb; +} + +static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, + struct sock *sk, + struct tcp_sack_block *next_dup, + struct tcp_sacktag_state *state, + u32 skip_to_seq) +{ + if (next_dup == NULL) + return skb; + + if (before(next_dup->start_seq, skip_to_seq)) { + skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq); + skb = tcp_sacktag_walk(skb, sk, NULL, state, + next_dup->start_seq, next_dup->end_seq, + 1); + } + + return skb; +} + +static int tcp_sack_cache_ok(struct tcp_sock *tp, struct tcp_sack_block *cache) +{ + return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); +} + +static int +tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, + u32 prior_snd_una) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + unsigned char *ptr = (skb_transport_header(ack_skb) + + TCP_SKB_CB(ack_skb)->sacked); + struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); + struct tcp_sack_block sp[TCP_NUM_SACKS]; + struct tcp_sack_block *cache; + struct tcp_sacktag_state state; + struct sk_buff *skb; + int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); + int used_sacks; + int found_dup_sack = 0; + int i, j; + int first_sack_index; + + state.flag = 0; + state.reord = tp->packets_out; + + if (!tp->sacked_out) { + if (WARN_ON(tp->fackets_out)) + tp->fackets_out = 0; + tcp_highest_sack_reset(sk); + } + + found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, + num_sacks, prior_snd_una); + if (found_dup_sack) + state.flag |= FLAG_DSACKING_ACK; + + /* Eliminate too old ACKs, but take into + * account more or less fresh ones, they can + * contain valid SACK info. 
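+ *
+ * In other words the cut-off below is one full max_window behind
+ * the old left edge: with prior_snd_una == 100000 and
+ * tp->max_window == 65535, an ACK whose ack_seq is below 34465 is
+ * ignored as ancient, while an ACK for 90000 is still walked for
+ * the SACK blocks it carries.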
+ */ + if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window)) + return 0; + + if (!tp->packets_out) + goto out; + + used_sacks = 0; + first_sack_index = 0; + for (i = 0; i < num_sacks; i++) { + int dup_sack = !i && found_dup_sack; + + sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq); + sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq); + + if (!tcp_is_sackblock_valid(tp, dup_sack, + sp[used_sacks].start_seq, + sp[used_sacks].end_seq)) { + int mib_idx; + + if (dup_sack) { + if (!tp->undo_marker) + mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO; + else + mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD; + } else { + /* Don't count olds caused by ACK reordering */ + if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) && + !after(sp[used_sacks].end_seq, tp->snd_una)) + continue; + mib_idx = LINUX_MIB_TCPSACKDISCARD; + } + + NET_INC_STATS_BH(sock_net(sk), mib_idx); + if (i == 0) + first_sack_index = -1; + continue; + } + + /* Ignore very old stuff early */ + if (!after(sp[used_sacks].end_seq, prior_snd_una)) + continue; + + used_sacks++; + } + + /* order SACK blocks to allow in order walk of the retrans queue */ + for (i = used_sacks - 1; i > 0; i--) { + for (j = 0; j < i; j++) { + if (after(sp[j].start_seq, sp[j + 1].start_seq)) { + swap(sp[j], sp[j + 1]); + + /* Track where the first SACK block goes to */ + if (j == first_sack_index) + first_sack_index = j + 1; + } + } + } + + skb = tcp_write_queue_head(sk); + state.fack_count = 0; + i = 0; + + if (!tp->sacked_out) { + /* It's already past, so skip checking against it */ + cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); + } else { + cache = tp->recv_sack_cache; + /* Skip empty blocks in at head of the cache */ + while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq && + !cache->end_seq) + cache++; + } + + while (i < used_sacks) { + u32 start_seq = sp[i].start_seq; + u32 end_seq = sp[i].end_seq; + int dup_sack = (found_dup_sack && (i == first_sack_index)); + struct tcp_sack_block *next_dup = NULL; + + if (found_dup_sack && ((i + 1) == first_sack_index)) + next_dup = &sp[i + 1]; + + /* Event "B" in the comment above. */ + if (after(end_seq, tp->high_seq)) + state.flag |= FLAG_DATA_LOST; + + /* Skip too early cached blocks */ + while (tcp_sack_cache_ok(tp, cache) && + !before(start_seq, cache->end_seq)) + cache++; + + /* Can skip some work by looking recv_sack_cache? */ + if (tcp_sack_cache_ok(tp, cache) && !dup_sack && + after(end_seq, cache->start_seq)) { + + /* Head todo? */ + if (before(start_seq, cache->start_seq)) { + skb = tcp_sacktag_skip(skb, sk, &state, + start_seq); + skb = tcp_sacktag_walk(skb, sk, next_dup, + &state, + start_seq, + cache->start_seq, + dup_sack); + } + + /* Rest of the block already fully processed? */ + if (!after(end_seq, cache->end_seq)) + goto advance_sp; + + skb = tcp_maybe_skipping_dsack(skb, sk, next_dup, + &state, + cache->end_seq); + + /* ...tail remains todo... */ + if (tcp_highest_sack_seq(tp) == cache->end_seq) { + /* ...but better entrypoint exists! 
*/ + skb = tcp_highest_sack(sk); + if (skb == NULL) + break; + state.fack_count = tp->fackets_out; + cache++; + goto walk; + } + + skb = tcp_sacktag_skip(skb, sk, &state, cache->end_seq); + /* Check overlap against next cached too (past this one already) */ + cache++; + continue; + } + + if (!before(start_seq, tcp_highest_sack_seq(tp))) { + skb = tcp_highest_sack(sk); + if (skb == NULL) + break; + state.fack_count = tp->fackets_out; + } + skb = tcp_sacktag_skip(skb, sk, &state, start_seq); + +walk: + skb = tcp_sacktag_walk(skb, sk, next_dup, &state, + start_seq, end_seq, dup_sack); + +advance_sp: + /* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct + * due to in-order walk + */ + if (after(end_seq, tp->frto_highmark)) + state.flag &= ~FLAG_ONLY_ORIG_SACKED; + + i++; + } + + /* Clear the head of the cache sack blocks so we can skip it next time */ + for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) { + tp->recv_sack_cache[i].start_seq = 0; + tp->recv_sack_cache[i].end_seq = 0; + } + for (j = 0; j < used_sacks; j++) + tp->recv_sack_cache[i++] = sp[j]; + + tcp_mark_lost_retrans(sk); + + tcp_verify_left_out(tp); + + if ((state.reord < tp->fackets_out) && + ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) && + (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) + tcp_update_reordering(sk, tp->fackets_out - state.reord, 0); + +out: + +#if FASTRETRANS_DEBUG > 0 + WARN_ON((int)tp->sacked_out < 0); + WARN_ON((int)tp->lost_out < 0); + WARN_ON((int)tp->retrans_out < 0); + WARN_ON((int)tcp_packets_in_flight(tp) < 0); +#endif + return state.flag; +} + +/* Limits sacked_out so that sum with lost_out isn't ever larger than + * packets_out. Returns zero if sacked_out adjustement wasn't necessary. + */ +static int tcp_limit_reno_sacked(struct tcp_sock *tp) +{ + u32 holes; + + holes = max(tp->lost_out, 1U); + holes = min(holes, tp->packets_out); + + if ((tp->sacked_out + holes) > tp->packets_out) { + tp->sacked_out = tp->packets_out - holes; + return 1; + } + return 0; +} + +/* If we receive more dupacks than we expected counting segments + * in assumption of absent reordering, interpret this as reordering. + * The only another reason could be bug in receiver TCP. + */ +static void tcp_check_reno_reordering(struct sock *sk, const int addend) +{ + struct tcp_sock *tp = tcp_sk(sk); + if (tcp_limit_reno_sacked(tp)) + tcp_update_reordering(sk, tp->packets_out + addend, 0); +} + +/* Emulate SACKs for SACKless connection: account for a new dupack. */ + +static void tcp_add_reno_sack(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + tp->sacked_out++; + tcp_check_reno_reordering(sk, 0); + tcp_verify_left_out(tp); +} + +/* Account for ACK, ACKing some data in Reno Recovery phase. */ + +static void tcp_remove_reno_sacks(struct sock *sk, int acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (acked > 0) { + /* One ACK acked hole. The rest eat duplicate ACKs. 
*/ + if (acked - 1 >= tp->sacked_out) + tp->sacked_out = 0; + else + tp->sacked_out -= acked - 1; + } + tcp_check_reno_reordering(sk, acked); + tcp_verify_left_out(tp); +} + +static inline void tcp_reset_reno_sack(struct tcp_sock *tp) +{ + tp->sacked_out = 0; +} + +static int tcp_is_sackfrto(const struct tcp_sock *tp) +{ + return (sysctl_tcp_frto == 0x2) && !tcp_is_reno(tp); +} + +/* F-RTO can only be used if TCP has never retransmitted anything other than + * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) + */ +int tcp_use_frto(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *skb; + + if (!sysctl_tcp_frto) + return 0; + + /* MTU probe and F-RTO won't really play nicely along currently */ + if (icsk->icsk_mtup.probe_size) + return 0; + + if (tcp_is_sackfrto(tp)) + return 1; + + /* Avoid expensive walking of rexmit queue if possible */ + if (tp->retrans_out > 1) + return 0; + + skb = tcp_write_queue_head(sk); + if (tcp_skb_is_last(sk, skb)) + return 1; + skb = tcp_write_queue_next(sk, skb); /* Skips head */ + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; + if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) + return 0; + /* Short-circuit when first non-SACKed skb has been checked */ + if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) + break; + } + return 1; +} + +/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO + * recovery a bit and use heuristics in tcp_process_frto() to detect if + * the RTO was spurious. Only clear SACKED_RETRANS of the head here to + * keep retrans_out counting accurate (with SACK F-RTO, other than head + * may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS + * bits are handled if the Loss state is really to be entered (in + * tcp_enter_frto_loss). + * + * Do like tcp_enter_loss() would; when RTO expires the second time it + * does: + * "Reduce ssthresh if it has not yet been made inside this window." + */ +void tcp_enter_frto(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) || + tp->snd_una == tp->high_seq || + ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) && + !icsk->icsk_retransmits)) { + tp->prior_ssthresh = tcp_current_ssthresh(sk); + /* Our state is too optimistic in ssthresh() call because cwnd + * is not reduced until tcp_enter_frto_loss() when previous F-RTO + * recovery has not yet completed. Pattern would be this: RTO, + * Cumulative ACK, RTO (2xRTO for the same segment does not end + * up here twice). + * RFC4138 should be more specific on what to do, even though + * RTO is quite unlikely to occur after the first Cumulative ACK + * due to back-off and complexity of triggering events ... + */ + if (tp->frto_counter) { + u32 stored_cwnd; + stored_cwnd = tp->snd_cwnd; + tp->snd_cwnd = 2; + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + tp->snd_cwnd = stored_cwnd; + } else { + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + } + /* ... in theory, cong.control module could do "any tricks" in + * ssthresh(), which means that ca_state, lost bits and lost_out + * counter would have to be faked before the call occurs. 
We + * consider that too expensive, unlikely and hacky, so modules + * using these in ssthresh() must deal these incompatibility + * issues if they receives CA_EVENT_FRTO and frto_counter != 0 + */ + tcp_ca_event(sk, CA_EVENT_FRTO); + } + + tp->undo_marker = tp->snd_una; + tp->undo_retrans = 0; + + skb = tcp_write_queue_head(sk); + if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) + tp->undo_marker = 0; + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + } + tcp_verify_left_out(tp); + + /* Too bad if TCP was application limited */ + tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1); + + /* Earlier loss recovery underway (see RFC4138; Appendix B). + * The last condition is necessary at least in tp->frto_counter case. + */ + if (tcp_is_sackfrto(tp) && (tp->frto_counter || + ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) && + after(tp->high_seq, tp->snd_una)) { + tp->frto_highmark = tp->high_seq; + } else { + tp->frto_highmark = tp->snd_nxt; + } + tcp_set_ca_state(sk, TCP_CA_Disorder); + tp->high_seq = tp->snd_nxt; + tp->frto_counter = 1; +} + +/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO, + * which indicates that we should follow the traditional RTO recovery, + * i.e. mark everything lost and do go-back-N retransmission. + */ +static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + tp->lost_out = 0; + tp->retrans_out = 0; + if (tcp_is_reno(tp)) + tcp_reset_reno_sack(tp); + + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; + + TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + /* + * Count the retransmission made on RTO correctly (only when + * waiting for the first ACK and did not get it)... + */ + if ((tp->frto_counter == 1) && !(flag & FLAG_DATA_ACKED)) { + /* For some reason this R-bit might get cleared? */ + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) + tp->retrans_out += tcp_skb_pcount(skb); + /* ...enter this if branch just for the first segment */ + flag |= FLAG_DATA_ACKED; + } else { + if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) + tp->undo_marker = 0; + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + } + + /* Marking forward transmissions that were made after RTO lost + * can cause unnecessary retransmissions in some scenarios, + * SACK blocks will mitigate that in some but not in all cases. + * We used to not mark them but it was causing break-ups with + * receivers that do only in-order receival. + * + * TODO: we could detect presence of such receiver and select + * different behavior per flow. 
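+ *
+ * For illustration: with ten segments outstanding and only the
+ * seventh one SACKed, every other segment gets the LOST tag in
+ * the marking below, lost_out grows by their pcounts and
+ * retransmit_high ends up at the end_seq of the last marked
+ * segment, which is what feeds the go-back-N retransmission
+ * mentioned above.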
+ */ + if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out += tcp_skb_pcount(skb); + tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; + } + } + tcp_verify_left_out(tp); + + tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments; + tp->snd_cwnd_cnt = 0; + tp->snd_cwnd_stamp = tcp_time_stamp; + tp->frto_counter = 0; + tp->bytes_acked = 0; + + tp->reordering = min_t(unsigned int, tp->reordering, + sysctl_tcp_reordering); + tcp_set_ca_state(sk, TCP_CA_Loss); + tp->high_seq = tp->snd_nxt; + TCP_ECN_queue_cwr(tp); + + tcp_clear_all_retrans_hints(tp); +} + +static void tcp_clear_retrans_partial(struct tcp_sock *tp) +{ + tp->retrans_out = 0; + tp->lost_out = 0; + + tp->undo_marker = 0; + tp->undo_retrans = 0; +} + +void tcp_clear_retrans(struct tcp_sock *tp) +{ + tcp_clear_retrans_partial(tp); + + tp->fackets_out = 0; + tp->sacked_out = 0; +} + +/* Enter Loss state. If "how" is not zero, forget all SACK information + * and reset tags completely, otherwise preserve SACKs. If receiver + * dropped its ofo queue, we will know this due to reneging detection. + */ +void tcp_enter_loss(struct sock *sk, int how) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + /* Reduce ssthresh if it has not yet been made inside this window. */ + if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || + (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { + tp->prior_ssthresh = tcp_current_ssthresh(sk); + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + tcp_ca_event(sk, CA_EVENT_LOSS); + } + tp->snd_cwnd = 1; + tp->snd_cwnd_cnt = 0; + tp->snd_cwnd_stamp = tcp_time_stamp; + + tp->bytes_acked = 0; + tcp_clear_retrans_partial(tp); + + if (tcp_is_reno(tp)) + tcp_reset_reno_sack(tp); + + if (!how) { + /* Push undo marker, if it was plain RTO and nothing + * was retransmitted. */ + tp->undo_marker = tp->snd_una; + } else { + tp->sacked_out = 0; + tp->fackets_out = 0; + } + tcp_clear_all_retrans_hints(tp); + + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; + + if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) + tp->undo_marker = 0; + TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out += tcp_skb_pcount(skb); + tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; + } + } + tcp_verify_left_out(tp); + + tp->reordering = min_t(unsigned int, tp->reordering, + sysctl_tcp_reordering); + tcp_set_ca_state(sk, TCP_CA_Loss); + tp->high_seq = tp->snd_nxt; + TCP_ECN_queue_cwr(tp); + /* Abort F-RTO algorithm if one is in progress */ + tp->frto_counter = 0; +} + +/* If ACK arrived pointing to a remembered SACK, it means that our + * remembered SACKs do not reflect real state of receiver i.e. + * receiver _host_ is heavily congested (or buggy). + * + * Do processing similar to RTO timeout. 
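+ *
+ * Concretely: tcp_clean_rtx_queue() raises FLAG_SACK_RENEGING
+ * when the skb left at the head of the retransmission queue still
+ * carries the SACKED_ACKED tag although the cumulative ACK has
+ * not moved past it. We then forget the SACK scoreboard
+ * (tcp_enter_loss() with how != 0), retransmit the head segment
+ * and restart the retransmission timer, much as a real RTO would.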
+ */ +static int tcp_check_sack_reneging(struct sock *sk, int flag) +{ + if (flag & FLAG_SACK_RENEGING) { + struct inet_connection_sock *icsk = inet_csk(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); + + tcp_enter_loss(sk, 1); + icsk->icsk_retransmits++; + tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + icsk->icsk_rto, TCP_RTO_MAX); + return 1; + } + return 0; +} + +static inline int tcp_fackets_out(struct tcp_sock *tp) +{ + return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out; +} + +/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs + * counter when SACK is enabled (without SACK, sacked_out is used for + * that purpose). + * + * Instead, with FACK TCP uses fackets_out that includes both SACKed + * segments up to the highest received SACK block so far and holes in + * between them. + * + * With reordering, holes may still be in flight, so RFC3517 recovery + * uses pure sacked_out (total number of SACKed segments) even though + * it violates the RFC that uses duplicate ACKs, often these are equal + * but when e.g. out-of-window ACKs or packet duplication occurs, + * they differ. Since neither occurs due to loss, TCP should really + * ignore them. + */ +static inline int tcp_dupack_heuristics(struct tcp_sock *tp) +{ + return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; +} + +static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) +{ + return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto; +} + +static inline int tcp_head_timedout(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return tp->packets_out && + tcp_skb_timedout(sk, tcp_write_queue_head(sk)); +} + +/* Linux NewReno/SACK/FACK/ECN state machine. + * -------------------------------------- + * + * "Open" Normal state, no dubious events, fast path. + * "Disorder" In all the respects it is "Open", + * but requires a bit more attention. It is entered when + * we see some SACKs or dupacks. It is split of "Open" + * mainly to move some processing from fast path to slow one. + * "CWR" CWND was reduced due to some Congestion Notification event. + * It can be ECN, ICMP source quench, local device congestion. + * "Recovery" CWND was reduced, we are fast-retransmitting. + * "Loss" CWND was reduced due to RTO timeout or SACK reneging. + * + * tcp_fastretrans_alert() is entered: + * - each incoming ACK, if state is not "Open" + * - when arrived ACK is unusual, namely: + * * SACK + * * Duplicate ACK. + * * ECN ECE. + * + * Counting packets in flight is pretty simple. + * + * in_flight = packets_out - left_out + retrans_out + * + * packets_out is SND.NXT-SND.UNA counted in packets. + * + * retrans_out is number of retransmitted segments. + * + * left_out is number of segments left network, but not ACKed yet. + * + * left_out = sacked_out + lost_out + * + * sacked_out: Packets, which arrived to receiver out of order + * and hence not ACKed. With SACKs this number is simply + * amount of SACKed data. Even without SACKs + * it is easy to give pretty reliable estimate of this number, + * counting duplicate ACKs. + * + * lost_out: Packets lost by network. TCP has no explicit + * "loss notification" feedback from network (for now). + * It means that this number can be only _guessed_. + * Actually, it is the heuristics to predict lossage that + * distinguishes different algorithms. + * + * F.e. 
after RTO, when all the queue is considered as lost, + * lost_out = packets_out and in_flight = retrans_out. + * + * Essentially, we have now two algorithms counting + * lost packets. + * + * FACK: It is the simplest heuristics. As soon as we decided + * that something is lost, we decide that _all_ not SACKed + * packets until the most forward SACK are lost. I.e. + * lost_out = fackets_out - sacked_out and left_out = fackets_out. + * It is absolutely correct estimate, if network does not reorder + * packets. And it loses any connection to reality when reordering + * takes place. We use FACK by default until reordering + * is suspected on the path to this destination. + * + * NewReno: when Recovery is entered, we assume that one segment + * is lost (classic Reno). While we are in Recovery and + * a partial ACK arrives, we assume that one more packet + * is lost (NewReno). This heuristics are the same in NewReno + * and SACK. + * + * Imagine, that's all! Forget about all this shamanism about CWND inflation + * deflation etc. CWND is real congestion window, never inflated, changes + * only according to classic VJ rules. + * + * Really tricky (and requiring careful tuning) part of algorithm + * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue(). + * The first determines the moment _when_ we should reduce CWND and, + * hence, slow down forward transmission. In fact, it determines the moment + * when we decide that hole is caused by loss, rather than by a reorder. + * + * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill + * holes, caused by lost packets. + * + * And the most logically complicated part of algorithm is undo + * heuristics. We detect false retransmits due to both too early + * fast retransmit (reordering) and underestimated RTO, analyzing + * timestamps and D-SACKs. When we detect that some segments were + * retransmitted by mistake and CWND reduction was wrong, we undo + * window reduction and abort recovery phase. This logic is hidden + * inside several functions named tcp_try_undo_. + */ + +/* This function decides, when we should leave Disordered state + * and enter Recovery phase, reducing congestion window. + * + * Main question: may we further continue forward transmission + * with the same cwnd? + */ +static int tcp_time_to_recover(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + __u32 packets_out; + + /* Do not perform any recovery during F-RTO algorithm */ + if (tp->frto_counter) + return 0; + + /* Trick#1: The loss is proven. */ + if (tp->lost_out) + return 1; + + /* Not-A-Trick#2 : Classic rule... */ + if (tcp_dupack_heuristics(tp) > tp->reordering) + return 1; + + /* Trick#3 : when we use RFC2988 timer restart, fast + * retransmit can be triggered by timeout of queue head. + */ + if (tcp_is_fack(tp) && tcp_head_timedout(sk)) + return 1; + + /* Trick#4: It is still not OK... But will it be useful to delay + * recovery more? + */ + packets_out = tp->packets_out; + if (packets_out <= tp->reordering && + tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) && + !tcp_may_send_now(sk)) { + /* We have nothing to send. This connection is limited + * either by receiver window or by application. + */ + return 1; + } + + /* If a thin stream is detected, retransmit after first + * received dupack. Employ only if SACK is supported in order + * to avoid possible corner-case series of spurious retransmissions + * Use only if there are no unsent data. 
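+ *
+ * E.g. an interactive flow with just a couple of segments in
+ * flight and nothing left in the send queue: a single
+ * SACK-carrying dupack already satisfies the test below, instead
+ * of waiting for tp->reordering dupacks or for the RTO as the
+ * rules above would otherwise demand.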
+ */ + if ((tp->thin_dupack || sysctl_tcp_thin_dupack) && + tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 && + tcp_is_sack(tp) && !tcp_send_head(sk)) + return 1; + + return 0; +} + +/* New heuristics: it is possible only after we switched to restart timer + * each time when something is ACKed. Hence, we can detect timed out packets + * during fast retransmit without falling to slow start. + * + * Usefulness of this as is very questionable, since we should know which of + * the segments is the next to timeout which is relatively expensive to find + * in general case unless we add some data structure just for that. The + * current approach certainly won't find the right one too often and when it + * finally does find _something_ it usually marks large part of the window + * right away (because a retransmission with a larger timestamp blocks the + * loop from advancing). -ij + */ +static void tcp_timeout_skbs(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + if (!tcp_is_fack(tp) || !tcp_head_timedout(sk)) + return; + + skb = tp->scoreboard_skb_hint; + if (tp->scoreboard_skb_hint == NULL) + skb = tcp_write_queue_head(sk); + + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; + if (!tcp_skb_timedout(sk, skb)) + break; + + tcp_skb_mark_lost(tp, skb); + } + + tp->scoreboard_skb_hint = skb; + + tcp_verify_left_out(tp); +} + +/* Mark head of queue up as lost. With RFC3517 SACK, the packets is + * is against sacked "cnt", otherwise it's against facked "cnt" + */ +static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + int cnt, oldcnt; + int err; + unsigned int mss; + + WARN_ON(packets > tp->packets_out); + if (tp->lost_skb_hint) { + skb = tp->lost_skb_hint; + cnt = tp->lost_cnt_hint; + /* Head already handled? */ + if (mark_head && skb != tcp_write_queue_head(sk)) + return; + } else { + skb = tcp_write_queue_head(sk); + cnt = 0; + } + + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; + /* TODO: do this better */ + /* this is not the most efficient way to do this... */ + tp->lost_skb_hint = skb; + tp->lost_cnt_hint = cnt; + + if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq)) + break; + + oldcnt = cnt; + if (tcp_is_fack(tp) || tcp_is_reno(tp) || + (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) + cnt += tcp_skb_pcount(skb); + + if (cnt > packets) { + if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || + (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || + (oldcnt >= packets)) + break; + + mss = skb_shinfo(skb)->gso_size; + err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss); + if (err < 0) + break; + cnt = packets; + } + + tcp_skb_mark_lost(tp, skb); + + if (mark_head) + break; + } + tcp_verify_left_out(tp); +} + +/* Account newly detected lost packet(s) */ + +static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tcp_is_reno(tp)) { + tcp_mark_head_lost(sk, 1, 1); + } else if (tcp_is_fack(tp)) { + int lost = tp->fackets_out - tp->reordering; + if (lost <= 0) + lost = 1; + tcp_mark_head_lost(sk, lost, 0); + } else { + int sacked_upto = tp->sacked_out - tp->reordering; + if (sacked_upto >= 0) + tcp_mark_head_lost(sk, sacked_upto, 0); + else if (fast_rexmit) + tcp_mark_head_lost(sk, 1, 1); + } + + tcp_timeout_skbs(sk); +} + +/* CWND moderation, preventing bursts due to too big ACKs + * in dubious situations. 
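+ *
+ * For instance, if a stretch ACK suddenly leaves only three
+ * segments in flight while snd_cwnd is 40, the clamp below pulls
+ * cwnd down to in_flight plus the small tcp_max_burst() allowance
+ * so that the next transmission opportunity cannot blast dozens
+ * of segments into the network at once.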
+ */ +static inline void tcp_moderate_cwnd(struct tcp_sock *tp) +{ + tp->snd_cwnd = min(tp->snd_cwnd, + tcp_packets_in_flight(tp) + tcp_max_burst(tp)); + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +/* Lower bound on congestion window is slow start threshold + * unless congestion avoidance choice decides to overide it. + */ +static inline u32 tcp_cwnd_min(const struct sock *sk) +{ + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + + return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh; +} + +/* Decrease cwnd each second ack. */ +static void tcp_cwnd_down(struct sock *sk, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + int decr = tp->snd_cwnd_cnt + 1; + + if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) || + (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) { + tp->snd_cwnd_cnt = decr & 1; + decr >>= 1; + + if (decr && tp->snd_cwnd > tcp_cwnd_min(sk)) + tp->snd_cwnd -= decr; + + tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1); + tp->snd_cwnd_stamp = tcp_time_stamp; + } +} + +/* Nothing was retransmitted or returned timestamp is less + * than timestamp of the first retransmission. + */ +static inline int tcp_packet_delayed(struct tcp_sock *tp) +{ + return !tp->retrans_stamp || + (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && + before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp)); +} + +/* Undo procedures. */ + +#if FASTRETRANS_DEBUG > 1 +static void DBGUNDO(struct sock *sk, const char *msg) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_sock *inet = inet_sk(sk); + + if (sk->sk_family == AF_INET) { + printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", + msg, + &inet->inet_daddr, ntohs(inet->inet_dport), + tp->snd_cwnd, tcp_left_out(tp), + tp->snd_ssthresh, tp->prior_ssthresh, + tp->packets_out); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", + msg, + &np->daddr, ntohs(inet->inet_dport), + tp->snd_cwnd, tcp_left_out(tp), + tp->snd_ssthresh, tp->prior_ssthresh, + tp->packets_out); + } +#endif +} +#else +#define DBGUNDO(x...) do { } while (0) +#endif + +static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->prior_ssthresh) { + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->undo_cwnd) + tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk); + else + tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1); + + if (undo_ssthresh && tp->prior_ssthresh > tp->snd_ssthresh) { + tp->snd_ssthresh = tp->prior_ssthresh; + TCP_ECN_withdraw_cwr(tp); + } + } else { + tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); + } + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static inline int tcp_may_undo(struct tcp_sock *tp) +{ + return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); +} + +/* People celebrate: "We love our President!" */ +static int tcp_try_undo_recovery(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tcp_may_undo(tp)) { + int mib_idx; + + /* Happy end! We did not retransmit anything + * or our original transmission succeeded. + */ + DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? 
"loss" : "retrans"); + tcp_undo_cwr(sk, true); + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) + mib_idx = LINUX_MIB_TCPLOSSUNDO; + else + mib_idx = LINUX_MIB_TCPFULLUNDO; + + NET_INC_STATS_BH(sock_net(sk), mib_idx); + tp->undo_marker = 0; + } + if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { + /* Hold old state until something *above* high_seq + * is ACKed. For Reno it is MUST to prevent false + * fast retransmits (RFC2582). SACK TCP is safe. */ + tcp_moderate_cwnd(tp); + return 1; + } + tcp_set_ca_state(sk, TCP_CA_Open); + return 0; +} + +/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ +static void tcp_try_undo_dsack(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->undo_marker && !tp->undo_retrans) { + DBGUNDO(sk, "D-SACK"); + tcp_undo_cwr(sk, true); + tp->undo_marker = 0; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); + } +} + +/* We can clear retrans_stamp when there are no retransmissions in the + * window. It would seem that it is trivially available for us in + * tp->retrans_out, however, that kind of assumptions doesn't consider + * what will happen if errors occur when sending retransmission for the + * second time. ...It could the that such segment has only + * TCPCB_EVER_RETRANS set at the present time. It seems that checking + * the head skb is enough except for some reneging corner cases that + * are not worth the effort. + * + * Main reason for all this complexity is the fact that connection dying + * time now depends on the validity of the retrans_stamp, in particular, + * that successive retransmissions of a segment must not advance + * retrans_stamp under any conditions. + */ +static int tcp_any_retrans_done(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + if (tp->retrans_out) + return 1; + + skb = tcp_write_queue_head(sk); + if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) + return 1; + + return 0; +} + +/* Undo during fast recovery after partial ACK. */ + +static int tcp_try_undo_partial(struct sock *sk, int acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + /* Partial ACK arrived. Force Hoe's retransmit. */ + int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering); + + if (tcp_may_undo(tp)) { + /* Plain luck! Hole if filled with delayed + * packet, rather than with a retransmit. + */ + if (!tcp_any_retrans_done(sk)) + tp->retrans_stamp = 0; + + tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); + + DBGUNDO(sk, "Hoe"); + tcp_undo_cwr(sk, false); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); + + /* So... Do not make Hoe's retransmit yet. + * If the first packet was delayed, the rest + * ones are most probably delayed as well. + */ + failed = 0; + } + return failed; +} + +/* Undo during loss recovery after partial ACK. 
*/ +static int tcp_try_undo_loss(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tcp_may_undo(tp)) { + struct sk_buff *skb; + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; + TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + } + + tcp_clear_all_retrans_hints(tp); + + DBGUNDO(sk, "partial loss"); + tp->lost_out = 0; + tcp_undo_cwr(sk, true); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); + inet_csk(sk)->icsk_retransmits = 0; + tp->undo_marker = 0; + if (tcp_is_sack(tp)) + tcp_set_ca_state(sk, TCP_CA_Open); + return 1; + } + return 0; +} + +static inline void tcp_complete_cwr(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + /* Do not moderate cwnd if it's already undone in cwr or recovery */ + if (tp->undo_marker && tp->snd_cwnd > tp->snd_ssthresh) { + tp->snd_cwnd = tp->snd_ssthresh; + tp->snd_cwnd_stamp = tcp_time_stamp; + } + tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); +} + +static void tcp_try_keep_open(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int state = TCP_CA_Open; + + if (tcp_left_out(tp) || tcp_any_retrans_done(sk) || tp->undo_marker) + state = TCP_CA_Disorder; + + if (inet_csk(sk)->icsk_ca_state != state) { + tcp_set_ca_state(sk, state); + tp->high_seq = tp->snd_nxt; + } +} + +static void tcp_try_to_open(struct sock *sk, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_verify_left_out(tp); + + if (!tp->frto_counter && !tcp_any_retrans_done(sk)) + tp->retrans_stamp = 0; + + if (flag & FLAG_ECE) + tcp_enter_cwr(sk, 1); + + if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { + tcp_try_keep_open(sk); + tcp_moderate_cwnd(tp); + } else { + tcp_cwnd_down(sk, flag); + } +} + +static void tcp_mtup_probe_failed(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1; + icsk->icsk_mtup.probe_size = 0; +} + +static void tcp_mtup_probe_success(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + /* FIXME: breaks with very large cwnd */ + tp->prior_ssthresh = tcp_current_ssthresh(sk); + tp->snd_cwnd = tp->snd_cwnd * + tcp_mss_to_mtu(sk, tp->mss_cache) / + icsk->icsk_mtup.probe_size; + tp->snd_cwnd_cnt = 0; + tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_ssthresh = tcp_current_ssthresh(sk); + + icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; + icsk->icsk_mtup.probe_size = 0; + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); +} + +/* Do a simple retransmit without using the backoff mechanisms in + * tcp_timer. This is used for path mtu discovery. + * The socket is already locked here. + */ +void tcp_simple_retransmit(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + unsigned int mss = tcp_current_mss(sk); + u32 prior_lost = tp->lost_out; + + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; + if (tcp_skb_seglen(skb) > mss && + !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + } + tcp_skb_mark_lost_uncond_verify(tp, skb); + } + } + + tcp_clear_retrans_hints_partial(tp); + + if (prior_lost == tp->lost_out) + return; + + if (tcp_is_reno(tp)) + tcp_limit_reno_sacked(tp); + + tcp_verify_left_out(tp); + + /* Don't muck with the congestion window here. 
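+ * (unlike tcp_enter_loss(), snd_cwnd itself is left untouched).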
+ * Reason is that we do not increase amount of _data_ + * in network, but units changed and effective + * cwnd/ssthresh really reduced now. + */ + if (icsk->icsk_ca_state != TCP_CA_Loss) { + tp->high_seq = tp->snd_nxt; + tp->snd_ssthresh = tcp_current_ssthresh(sk); + tp->prior_ssthresh = 0; + tp->undo_marker = 0; + tcp_set_ca_state(sk, TCP_CA_Loss); + } + tcp_xmit_retransmit_queue(sk); +} +EXPORT_SYMBOL(tcp_simple_retransmit); + +/* Process an event, which can update packets-in-flight not trivially. + * Main goal of this function is to calculate new estimate for left_out, + * taking into account both packets sitting in receiver's buffer and + * packets lost by network. + * + * Besides that it does CWND reduction, when packet loss is detected + * and changes state of machine. + * + * It does _not_ decide what to send, it is made in function + * tcp_xmit_retransmit_queue(). + */ +static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + int is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); + int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && + (tcp_fackets_out(tp) > tp->reordering)); + int fast_rexmit = 0, mib_idx; + + if (WARN_ON(!tp->packets_out && tp->sacked_out)) + tp->sacked_out = 0; + if (WARN_ON(!tp->sacked_out && tp->fackets_out)) + tp->fackets_out = 0; + + /* Now state machine starts. + * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ + if (flag & FLAG_ECE) + tp->prior_ssthresh = 0; + + /* B. In all the states check for reneging SACKs. */ + if (tcp_check_sack_reneging(sk, flag)) + return; + + /* C. Process data loss notification, provided it is valid. */ + if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) && + before(tp->snd_una, tp->high_seq) && + icsk->icsk_ca_state != TCP_CA_Open && + tp->fackets_out > tp->reordering) { + tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS); + } + + /* D. Check consistency of the current state. */ + tcp_verify_left_out(tp); + + /* E. Check state exit conditions. State can be terminated + * when high_seq is ACKed. */ + if (icsk->icsk_ca_state == TCP_CA_Open) { + WARN_ON(tp->retrans_out != 0); + tp->retrans_stamp = 0; + } else if (!before(tp->snd_una, tp->high_seq)) { + switch (icsk->icsk_ca_state) { + case TCP_CA_Loss: + icsk->icsk_retransmits = 0; + if (tcp_try_undo_recovery(sk)) + return; + break; + + case TCP_CA_CWR: + /* CWR is to be held something *above* high_seq + * is ACKed for CWR bit to reach receiver. */ + if (tp->snd_una != tp->high_seq) { + tcp_complete_cwr(sk); + tcp_set_ca_state(sk, TCP_CA_Open); + } + break; + + case TCP_CA_Disorder: + tcp_try_undo_dsack(sk); + if (!tp->undo_marker || + /* For SACK case do not Open to allow to undo + * catching for all duplicate ACKs. */ + tcp_is_reno(tp) || tp->snd_una != tp->high_seq) { + tp->undo_marker = 0; + tcp_set_ca_state(sk, TCP_CA_Open); + } + break; + + case TCP_CA_Recovery: + if (tcp_is_reno(tp)) + tcp_reset_reno_sack(tp); + if (tcp_try_undo_recovery(sk)) + return; + tcp_complete_cwr(sk); + break; + } + } + + /* F. Process state. 
*/ + switch (icsk->icsk_ca_state) { + case TCP_CA_Recovery: + if (!(flag & FLAG_SND_UNA_ADVANCED)) { + if (tcp_is_reno(tp) && is_dupack) + tcp_add_reno_sack(sk); + } else + do_lost = tcp_try_undo_partial(sk, pkts_acked); + break; + case TCP_CA_Loss: + if (flag & FLAG_DATA_ACKED) + icsk->icsk_retransmits = 0; + if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED) + tcp_reset_reno_sack(tp); + if (!tcp_try_undo_loss(sk)) { + tcp_moderate_cwnd(tp); + tcp_xmit_retransmit_queue(sk); + return; + } + if (icsk->icsk_ca_state != TCP_CA_Open) + return; + /* Loss is undone; fall through to processing in Open state. */ + default: + if (tcp_is_reno(tp)) { + if (flag & FLAG_SND_UNA_ADVANCED) + tcp_reset_reno_sack(tp); + if (is_dupack) + tcp_add_reno_sack(sk); + } + + if (icsk->icsk_ca_state == TCP_CA_Disorder) + tcp_try_undo_dsack(sk); + + if (!tcp_time_to_recover(sk)) { + tcp_try_to_open(sk, flag); + return; + } + + /* MTU probe failure: don't reduce cwnd */ + if (icsk->icsk_ca_state < TCP_CA_CWR && + icsk->icsk_mtup.probe_size && + tp->snd_una == tp->mtu_probe.probe_seq_start) { + tcp_mtup_probe_failed(sk); + /* Restores the reduction we did in tcp_mtup_probe() */ + tp->snd_cwnd++; + tcp_simple_retransmit(sk); + return; + } + + /* Otherwise enter Recovery state */ + + if (tcp_is_reno(tp)) + mib_idx = LINUX_MIB_TCPRENORECOVERY; + else + mib_idx = LINUX_MIB_TCPSACKRECOVERY; + + NET_INC_STATS_BH(sock_net(sk), mib_idx); + + tp->high_seq = tp->snd_nxt; + tp->prior_ssthresh = 0; + tp->undo_marker = tp->snd_una; + tp->undo_retrans = tp->retrans_out; + + if (icsk->icsk_ca_state < TCP_CA_CWR) { + if (!(flag & FLAG_ECE)) + tp->prior_ssthresh = tcp_current_ssthresh(sk); + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + TCP_ECN_queue_cwr(tp); + } + + tp->bytes_acked = 0; + tp->snd_cwnd_cnt = 0; + tcp_set_ca_state(sk, TCP_CA_Recovery); + fast_rexmit = 1; + } + + if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) + tcp_update_scoreboard(sk, fast_rexmit); + tcp_cwnd_down(sk, flag); + tcp_xmit_retransmit_queue(sk); +} + +static void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt) +{ + tcp_rtt_estimator(sk, seq_rtt); + tcp_set_rto(sk); + inet_csk(sk)->icsk_backoff = 0; +} + +/* Read draft-ietf-tcplw-high-performance before mucking + * with this code. (Supersedes RFC1323) + */ +static void tcp_ack_saw_tstamp(struct sock *sk, int flag) +{ + /* RTTM Rule: A TSecr value received in a segment is used to + * update the averaged RTT measurement only if the segment + * acknowledges some new data, i.e., only if it advances the + * left edge of the send window. + * + * See draft-ietf-tcplw-high-performance-00, section 3.3. + * 1998/04/10 Andrey V. Savochkin + * + * Changed: reset backoff as soon as we see the first valid sample. + * If we do not, we get strongly overestimated rto. With timestamps + * samples are accepted even from very old segments: f.e., when rtt=1 + * increases to 8, we retransmit 5 times and after 8 seconds delayed + * answer arrives rto becomes 120 seconds! If at least one of segments + * in window is lost... Voila. --ANK (010210) + */ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr); +} + +static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) +{ + /* We don't have a timestamp. Can only use + * packets that are not retransmitted to determine + * rtt estimates. Also, we must not reset the + * backoff for rto until we get a non-retransmitted + * packet. 
This allows us to deal with a situation + * where the network delay has increased suddenly. + * I.e. Karn's algorithm. (SIGCOMM '87, p5.) + */ + + if (flag & FLAG_RETRANS_DATA_ACKED) + return; + + tcp_valid_rtt_meas(sk, seq_rtt); +} + +static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, + const s32 seq_rtt) +{ + const struct tcp_sock *tp = tcp_sk(sk); + /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) + tcp_ack_saw_tstamp(sk, flag); + else if (seq_rtt >= 0) + tcp_ack_no_tstamp(sk, seq_rtt, flag); +} + +static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight); + tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; +} + +/* Restart timer after forward progress on connection. + * RFC2988 recommends to restart timer to now+rto. + */ +static void tcp_rearm_rto(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!tp->packets_out) { + inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); + } else { + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + } +} + +/* If we get here, the whole TSO packet has not been acked. */ +static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 packets_acked; + + BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)); + + packets_acked = tcp_skb_pcount(skb); + if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) + return 0; + packets_acked -= tcp_skb_pcount(skb); + + if (packets_acked) { + BUG_ON(tcp_skb_pcount(skb) == 0); + BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)); + } + + return packets_acked; +} + +/* Remove acknowledged frames from the retransmission queue. If our packet + * is before the ack sequence we can discard it as it's confirmed to have + * arrived at the other end. 
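A minimal userspace sketch of the wrap-safe sequence arithmetic this cleanup depends on: a queued segment is fully covered by a cumulative ACK only when its end_seq does not lie after snd_una. seq_before()/seq_after() are illustrative stand-ins for the kernel's before()/after() helpers, not the kernel code itself.

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the kernel's before()/after(): modulo-2^32 comparison. */
static int seq_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

static int seq_after(uint32_t a, uint32_t b)
{
	return seq_before(b, a);
}

/* A segment is disposable once its end does not lie after snd_una. */
static int fully_acked(uint32_t end_seq, uint32_t snd_una)
{
	return !seq_after(end_seq, snd_una);
}

int main(void)
{
	/* Covered even though snd_una has wrapped past 2^32. */
	printf("%d\n", fully_acked(0xffffff00u, 0x00000010u));	/* 1 */
	/* Ends beyond snd_una: only partially acknowledged. */
	printf("%d\n", fully_acked(0x00000020u, 0x00000010u));	/* 0 */
	return 0;
}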
+ */ +static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, + u32 prior_snd_una) +{ + struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *skb; + u32 now = tcp_time_stamp; + int fully_acked = 1; + int flag = 0; + u32 pkts_acked = 0; + u32 reord = tp->packets_out; + u32 prior_sacked = tp->sacked_out; + s32 seq_rtt = -1; + s32 ca_seq_rtt = -1; + ktime_t last_ackt = net_invalid_timestamp(); + + while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + u32 acked_pcount; + u8 sacked = scb->sacked; + + /* Determine how many packets and what bytes were acked, tso and else */ + if (after(scb->end_seq, tp->snd_una)) { + if (tcp_skb_pcount(skb) == 1 || + !after(tp->snd_una, scb->seq)) + break; + + acked_pcount = tcp_tso_acked(sk, skb); + if (!acked_pcount) + break; + + fully_acked = 0; + } else { + acked_pcount = tcp_skb_pcount(skb); + } + + if (sacked & TCPCB_RETRANS) { + if (sacked & TCPCB_SACKED_RETRANS) + tp->retrans_out -= acked_pcount; + flag |= FLAG_RETRANS_DATA_ACKED; + ca_seq_rtt = -1; + seq_rtt = -1; + if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1)) + flag |= FLAG_NONHEAD_RETRANS_ACKED; + } else { + ca_seq_rtt = now - scb->when; + last_ackt = skb->tstamp; + if (seq_rtt < 0) { + seq_rtt = ca_seq_rtt; + } + if (!(sacked & TCPCB_SACKED_ACKED)) + reord = min(pkts_acked, reord); + } + + if (sacked & TCPCB_SACKED_ACKED) + tp->sacked_out -= acked_pcount; + if (sacked & TCPCB_LOST) + tp->lost_out -= acked_pcount; + + tp->packets_out -= acked_pcount; + pkts_acked += acked_pcount; + + /* Initial outgoing SYN's get put onto the write_queue + * just like anything else we transmit. It is not + * true data, and if we misinform our callers that + * this ACK acks real data, we will erroneously exit + * connection startup slow start one packet too + * quickly. This is severely frowned upon behavior. + */ + if (!(scb->flags & TCPHDR_SYN)) { + flag |= FLAG_DATA_ACKED; + } else { + flag |= FLAG_SYN_ACKED; + tp->retrans_stamp = 0; + } + + if (!fully_acked) + break; + + tcp_unlink_write_queue(skb, sk); + sk_wmem_free_skb(sk, skb); + tp->scoreboard_skb_hint = NULL; + if (skb == tp->retransmit_skb_hint) + tp->retransmit_skb_hint = NULL; + if (skb == tp->lost_skb_hint) + tp->lost_skb_hint = NULL; + } + + if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) + tp->snd_up = tp->snd_una; + + if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) + flag |= FLAG_SACK_RENEGING; + + if (flag & FLAG_ACKED) { + const struct tcp_congestion_ops *ca_ops + = inet_csk(sk)->icsk_ca_ops; + + if (unlikely(icsk->icsk_mtup.probe_size && + !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { + tcp_mtup_probe_success(sk); + } + + tcp_ack_update_rtt(sk, flag, seq_rtt); + tcp_rearm_rto(sk); + + if (tcp_is_reno(tp)) { + tcp_remove_reno_sacks(sk, pkts_acked); + } else { + int delta; + + /* Non-retransmitted hole got filled? That's reordering */ + if (reord < prior_fackets) + tcp_update_reordering(sk, tp->fackets_out - reord, 0); + + delta = tcp_is_fack(tp) ? pkts_acked : + prior_sacked - tp->sacked_out; + tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); + } + + tp->fackets_out -= min(pkts_acked, tp->fackets_out); + + if (ca_ops->pkts_acked) { + s32 rtt_us = -1; + + /* Is the ACK triggering packet unambiguous? */ + if (!(flag & FLAG_RETRANS_DATA_ACKED)) { + /* High resolution needed and available? 
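The rtt_us value handed to ca_ops->pkts_acked() just below is taken either from a high-resolution per-packet timestamp or, failing that, from a coarse tick-based sample. A rough userspace sketch of that choice; clock_gettime() and an assumed tick rate stand in for the kernel's ktime/jiffies machinery.

#include <stdint.h>
#include <time.h>

#define SKETCH_HZ 1000	/* assumed tick rate for the coarse fallback */

static int64_t usec_since(const struct timespec *sent)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	return (now.tv_sec - sent->tv_sec) * 1000000LL +
	       (now.tv_nsec - sent->tv_nsec) / 1000;
}

/* Prefer the precise sample, fall back to ticks; -1 means "no sample". */
static int64_t ack_rtt_us(const struct timespec *sent_stamp, long ticks_rtt)
{
	if (sent_stamp)
		return usec_since(sent_stamp);
	if (ticks_rtt >= 0)
		return ticks_rtt * (1000000LL / SKETCH_HZ);
	return -1;
}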
*/ + if (ca_ops->flags & TCP_CONG_RTT_STAMP && + !ktime_equal(last_ackt, + net_invalid_timestamp())) + rtt_us = ktime_us_delta(ktime_get_real(), + last_ackt); + else if (ca_seq_rtt >= 0) + rtt_us = jiffies_to_usecs(ca_seq_rtt); + } + + ca_ops->pkts_acked(sk, pkts_acked, rtt_us); + } + } + +#if FASTRETRANS_DEBUG > 0 + WARN_ON((int)tp->sacked_out < 0); + WARN_ON((int)tp->lost_out < 0); + WARN_ON((int)tp->retrans_out < 0); + if (!tp->packets_out && tcp_is_sack(tp)) { + icsk = inet_csk(sk); + if (tp->lost_out) { + printk(KERN_DEBUG "Leak l=%u %d\n", + tp->lost_out, icsk->icsk_ca_state); + tp->lost_out = 0; + } + if (tp->sacked_out) { + printk(KERN_DEBUG "Leak s=%u %d\n", + tp->sacked_out, icsk->icsk_ca_state); + tp->sacked_out = 0; + } + if (tp->retrans_out) { + printk(KERN_DEBUG "Leak r=%u %d\n", + tp->retrans_out, icsk->icsk_ca_state); + tp->retrans_out = 0; + } + } +#endif + return flag; +} + +static void tcp_ack_probe(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + /* Was it a usable window open? */ + + if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) { + icsk->icsk_backoff = 0; + inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); + /* Socket must be waked up by subsequent tcp_data_snd_check(). + * This function is not for random using! + */ + } else { + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), + TCP_RTO_MAX); + } +} + +static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) +{ + return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || + inet_csk(sk)->icsk_ca_state != TCP_CA_Open; +} + +static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) +{ + const struct tcp_sock *tp = tcp_sk(sk); + return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && + !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR)); +} + +/* Check that window update is acceptable. + * The function assumes that snd_una<=ack<=snd_next. + */ +static inline int tcp_may_update_window(const struct tcp_sock *tp, + const u32 ack, const u32 ack_seq, + const u32 nwin) +{ + return after(ack, tp->snd_una) || + after(ack_seq, tp->snd_wl1) || + (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd); +} + +/* Update our send window. + * + * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 + * and in FreeBSD. NetBSD's one is even worse.) is wrong. + */ +static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack, + u32 ack_seq) +{ + struct tcp_sock *tp = tcp_sk(sk); + int flag = 0; + u32 nwin = ntohs(tcp_hdr(skb)->window); + + if (likely(!tcp_hdr(skb)->syn)) + nwin <<= tp->rx_opt.snd_wscale; + + if (tcp_may_update_window(tp, ack, ack_seq, nwin)) { + flag |= FLAG_WIN_UPDATE; + tcp_update_wl(tp, ack_seq); + + if (tp->snd_wnd != nwin) { + tp->snd_wnd = nwin; + + /* Note, it is the only place, where + * fast path is recovered for sending TCP. + */ + tp->pred_flags = 0; + tcp_fast_path_check(sk); + + if (nwin > tp->max_window) { + tp->max_window = nwin; + tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie); + } + } + } + + tp->snd_una = ack; + + return flag; +} + +/* A very conservative spurious RTO response algorithm: reduce cwnd and + * continue in congestion avoidance. 
+ */ +static void tcp_conservative_spur_to_response(struct tcp_sock *tp) +{ + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + tp->snd_cwnd_cnt = 0; + tp->bytes_acked = 0; + TCP_ECN_queue_cwr(tp); + tcp_moderate_cwnd(tp); +} + +/* A conservative spurious RTO response algorithm: reduce cwnd using + * rate halving and continue in congestion avoidance. + */ +static void tcp_ratehalving_spur_to_response(struct sock *sk) +{ + tcp_enter_cwr(sk, 0); +} + +static void tcp_undo_spur_to_response(struct sock *sk, int flag) +{ + if (flag & FLAG_ECE) + tcp_ratehalving_spur_to_response(sk); + else + tcp_undo_cwr(sk, true); +} + +/* F-RTO spurious RTO detection algorithm (RFC4138) + * + * F-RTO affects during two new ACKs following RTO (well, almost, see inline + * comments). State (ACK number) is kept in frto_counter. When ACK advances + * window (but not to or beyond highest sequence sent before RTO): + * On First ACK, send two new segments out. + * On Second ACK, RTO was likely spurious. Do spurious response (response + * algorithm is not part of the F-RTO detection algorithm + * given in RFC4138 but can be selected separately). + * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss + * and TCP falls back to conventional RTO recovery. F-RTO allows overriding + * of Nagle, this is done using frto_counter states 2 and 3, when a new data + * segment of any size sent during F-RTO, state 2 is upgraded to 3. + * + * Rationale: if the RTO was spurious, new ACKs should arrive from the + * original window even after we transmit two new data segments. + * + * SACK version: + * on first step, wait until first cumulative ACK arrives, then move to + * the second step. In second step, the next ACK decides. + * + * F-RTO is implemented (mainly) in four functions: + * - tcp_use_frto() is used to determine if TCP is can use F-RTO + * - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is + * called when tcp_use_frto() showed green light + * - tcp_process_frto() handles incoming ACKs during F-RTO algorithm + * - tcp_enter_frto_loss() is called if there is not enough evidence + * to prove that the RTO is indeed spurious. It transfers the control + * from F-RTO to the conventional RTO recovery + */ +static int tcp_process_frto(struct sock *sk, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_verify_left_out(tp); + + /* Duplicate the behavior from Loss state (fastretrans_alert) */ + if (flag & FLAG_DATA_ACKED) + inet_csk(sk)->icsk_retransmits = 0; + + if ((flag & FLAG_NONHEAD_RETRANS_ACKED) || + ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED))) + tp->undo_marker = 0; + + if (!before(tp->snd_una, tp->frto_highmark)) { + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); + return 1; + } + + if (!tcp_is_sackfrto(tp)) { + /* RFC4138 shortcoming in step 2; should also have case c): + * ACK isn't duplicate nor advances window, e.g., opposite dir + * data, winupdate + */ + if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP)) + return 1; + + if (!(flag & FLAG_DATA_ACKED)) { + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), + flag); + return 1; + } + } else { + if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { + /* Prevent sending of new data. 
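The clamp just below caps cwnd at the number of segments currently in the network. A small model of that quantity, assuming the accounting used throughout this file (segments sent minus those reported SACKed or marked lost, plus outstanding retransmissions); the struct is a simplified stand-in, not the real tcp_sock.

#include <stdint.h>

struct flight_counts {
	uint32_t packets_out;	/* sent, not yet cumulatively ACKed */
	uint32_t sacked_out;	/* selectively acknowledged by the peer */
	uint32_t lost_out;	/* marked lost on the scoreboard */
	uint32_t retrans_out;	/* retransmissions still outstanding */
};

static uint32_t packets_in_flight(const struct flight_counts *c)
{
	uint32_t left_out = c->sacked_out + c->lost_out;

	return c->packets_out - left_out + c->retrans_out;
}

/* Clamping cwnd to this value stalls new data without shrinking state. */
static uint32_t clamp_cwnd_to_flight(uint32_t snd_cwnd,
				     const struct flight_counts *c)
{
	uint32_t in_flight = packets_in_flight(c);

	return snd_cwnd < in_flight ? snd_cwnd : in_flight;
}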
*/ + tp->snd_cwnd = min(tp->snd_cwnd, + tcp_packets_in_flight(tp)); + return 1; + } + + if ((tp->frto_counter >= 2) && + (!(flag & FLAG_FORWARD_PROGRESS) || + ((flag & FLAG_DATA_SACKED) && + !(flag & FLAG_ONLY_ORIG_SACKED)))) { + /* RFC4138 shortcoming (see comment above) */ + if (!(flag & FLAG_FORWARD_PROGRESS) && + (flag & FLAG_NOT_DUP)) + return 1; + + tcp_enter_frto_loss(sk, 3, flag); + return 1; + } + } + + if (tp->frto_counter == 1) { + /* tcp_may_send_now needs to see updated state */ + tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; + tp->frto_counter = 2; + + if (!tcp_may_send_now(sk)) + tcp_enter_frto_loss(sk, 2, flag); + + return 1; + } else { + switch (sysctl_tcp_frto_response) { + case 2: + tcp_undo_spur_to_response(sk, flag); + break; + case 1: + tcp_conservative_spur_to_response(tp); + break; + default: + tcp_ratehalving_spur_to_response(sk); + break; + } + tp->frto_counter = 0; + tp->undo_marker = 0; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); + } + return 0; +} + +/* This routine deals with incoming acks, but not outgoing ones. */ +static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + u32 prior_snd_una = tp->snd_una; + u32 ack_seq = TCP_SKB_CB(skb)->seq; + u32 ack = TCP_SKB_CB(skb)->ack_seq; + u32 prior_in_flight; + u32 prior_fackets; + int prior_packets; + int frto_cwnd = 0; + + /* If the ack is older than previous acks + * then we can probably ignore it. + */ + if (before(ack, prior_snd_una)) + goto old_ack; + + /* If the ack includes data we haven't sent yet, discard + * this segment (RFC793 Section 3.9). + */ + if (after(ack, tp->snd_nxt)) + goto invalid_ack; + + if (after(ack, prior_snd_una)) + flag |= FLAG_SND_UNA_ADVANCED; + + if (sysctl_tcp_abc) { + if (icsk->icsk_ca_state < TCP_CA_CWR) + tp->bytes_acked += ack - prior_snd_una; + else if (icsk->icsk_ca_state == TCP_CA_Loss) + /* we assume just one segment left network */ + tp->bytes_acked += min(ack - prior_snd_una, + tp->mss_cache); + } + + prior_fackets = tp->fackets_out; + prior_in_flight = tcp_packets_in_flight(tp); + + if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { + /* Window is constant, pure forward advance. + * No more checks are required. + * Note, we use the fact that SND.UNA>=SND.WL2. + */ + tcp_update_wl(tp, ack_seq); + tp->snd_una = ack; + flag |= FLAG_WIN_UPDATE; + + tcp_ca_event(sk, CA_EVENT_FAST_ACK); + + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); + } else { + if (ack_seq != TCP_SKB_CB(skb)->end_seq) + flag |= FLAG_DATA; + else + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS); + + flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); + + if (TCP_SKB_CB(skb)->sacked) + flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); + + if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) + flag |= FLAG_ECE; + + tcp_ca_event(sk, CA_EVENT_SLOW_ACK); + } + + /* We passed data and got it acked, remove any soft error + * log. Something worked... + */ + sk->sk_err_soft = 0; + icsk->icsk_probes_out = 0; + tp->rcv_tstamp = tcp_time_stamp; + prior_packets = tp->packets_out; + if (!prior_packets) + goto no_queue; + + /* See if we can take anything off of the retransmit queue. 
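Before this point, tcp_ack() has already classified the ACK against the send window (old, invalid, or acceptable). Those early checks, condensed into one illustrative helper; the enum and names exist only in this sketch.

#include <stdint.h>

enum ack_class { ACK_OLD, ACK_INVALID, ACK_ACCEPTABLE };

static int seq_lt(uint32_t a, uint32_t b)	/* a < b, modulo 2^32 */
{
	return (int32_t)(a - b) < 0;
}

static enum ack_class classify_ack(uint32_t ack, uint32_t snd_una,
				   uint32_t snd_nxt)
{
	if (seq_lt(ack, snd_una))	/* below what is already ACKed */
		return ACK_OLD;
	if (seq_lt(snd_nxt, ack))	/* covers data never sent */
		return ACK_INVALID;
	return ACK_ACCEPTABLE;
}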
*/ + flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); + + if (tp->frto_counter) + frto_cwnd = tcp_process_frto(sk, flag); + /* Guarantee sacktag reordering detection against wrap-arounds */ + if (before(tp->frto_highmark, tp->snd_una)) + tp->frto_highmark = 0; + + if (tcp_ack_is_dubious(sk, flag)) { + /* Advance CWND, if state allows this. */ + if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && + tcp_may_raise_cwnd(sk, flag)) + tcp_cong_avoid(sk, ack, prior_in_flight); + tcp_fastretrans_alert(sk, prior_packets - tp->packets_out, + flag); + } else { + if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) + tcp_cong_avoid(sk, ack, prior_in_flight); + } + + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) + dst_confirm(__sk_dst_get(sk)); + + return 1; + +no_queue: + /* If this ack opens up a zero window, clear backoff. It was + * being used to time the probes, and is probably far higher than + * it needs to be for normal retransmission. + */ + if (tcp_send_head(sk)) + tcp_ack_probe(sk); + return 1; + +invalid_ack: + SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt); + return -1; + +old_ack: + if (TCP_SKB_CB(skb)->sacked) { + tcp_sacktag_write_queue(sk, skb, prior_snd_una); + if (icsk->icsk_ca_state == TCP_CA_Open) + tcp_try_keep_open(sk); + } + + SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); + return 0; +} + +/* Look for tcp options. Normally only called on SYN and SYNACK packets. + * But, this can also be called on packets in the established flow when + * the fast version below fails. + */ +void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, + u8 **hvpp, int estab) +{ + unsigned char *ptr; + struct tcphdr *th = tcp_hdr(skb); + int length = (th->doff * 4) - sizeof(struct tcphdr); + + ptr = (unsigned char *)(th + 1); + opt_rx->saw_tstamp = 0; + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + return; /* don't parse partial options */ + switch (opcode) { + case TCPOPT_MSS: + if (opsize == TCPOLEN_MSS && th->syn && !estab) { + u16 in_mss = get_unaligned_be16(ptr); + if (in_mss) { + if (opt_rx->user_mss && + opt_rx->user_mss < in_mss) + in_mss = opt_rx->user_mss; + opt_rx->mss_clamp = in_mss; + } + } + break; + case TCPOPT_WINDOW: + if (opsize == TCPOLEN_WINDOW && th->syn && + !estab && sysctl_tcp_window_scaling) { + __u8 snd_wscale = *(__u8 *)ptr; + opt_rx->wscale_ok = 1; + if (snd_wscale > 14) { + if (net_ratelimit()) + printk(KERN_INFO "tcp_parse_options: Illegal window " + "scaling value %d >14 received.\n", + snd_wscale); + snd_wscale = 14; + } + opt_rx->snd_wscale = snd_wscale; + } + break; + case TCPOPT_TIMESTAMP: + if ((opsize == TCPOLEN_TIMESTAMP) && + ((estab && opt_rx->tstamp_ok) || + (!estab && sysctl_tcp_timestamps))) { + opt_rx->saw_tstamp = 1; + opt_rx->rcv_tsval = get_unaligned_be32(ptr); + opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4); + } + break; + case TCPOPT_SACK_PERM: + if (opsize == TCPOLEN_SACK_PERM && th->syn && + !estab && sysctl_tcp_sack) { + opt_rx->sack_ok = 1; + tcp_sack_reset(opt_rx); + } + break; + + case TCPOPT_SACK: + if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && + !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) && + opt_rx->sack_ok) { + TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th; + } + break; 
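A compact userspace sketch of the kind/length walk used in this parser, limited to MSS and window scale so the control flow stays visible: EOL stops parsing, NOP consumes one byte, and every other option carries a length that must be at least 2 and must fit in the remaining header. The option kinds are the standard IANA values; the struct and helper are sketch-only.

#include <stddef.h>
#include <stdint.h>

struct parsed_opts {
	uint16_t mss;		/* 0 when the option was absent */
	uint8_t  wscale;	/* valid only when wscale_ok is set */
	uint8_t  wscale_ok;
};

static void parse_tcp_options(const uint8_t *opt, size_t len,
			      struct parsed_opts *out)
{
	out->mss = 0;
	out->wscale = 0;
	out->wscale_ok = 0;

	while (len > 0) {
		uint8_t kind = *opt++;
		uint8_t opsize;

		if (kind == 0)			/* EOL: done */
			return;
		if (kind == 1) {		/* NOP: single byte */
			len--;
			continue;
		}
		if (len < 2)			/* no room for a length byte */
			return;
		opsize = *opt++;
		if (opsize < 2 || opsize > len)	/* silly or truncated option */
			return;
		if (kind == 2 && opsize == 4) {		/* MSS */
			out->mss = (uint16_t)((opt[0] << 8) | opt[1]);
		} else if (kind == 3 && opsize == 3) {	/* window scale */
			out->wscale_ok = 1;
			out->wscale = opt[0] > 14 ? 14 : opt[0];
		}
		opt += opsize - 2;
		len -= opsize;
	}
}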
+#ifdef CONFIG_TCP_MD5SIG + case TCPOPT_MD5SIG: + /* + * The MD5 Hash has already been + * checked (see tcp_v{4,6}_do_rcv()). + */ + break; +#endif + case TCPOPT_COOKIE: + /* This option is variable length. + */ + switch (opsize) { + case TCPOLEN_COOKIE_BASE: + /* not yet implemented */ + break; + case TCPOLEN_COOKIE_PAIR: + /* not yet implemented */ + break; + case TCPOLEN_COOKIE_MIN+0: + case TCPOLEN_COOKIE_MIN+2: + case TCPOLEN_COOKIE_MIN+4: + case TCPOLEN_COOKIE_MIN+6: + case TCPOLEN_COOKIE_MAX: + /* 16-bit multiple */ + opt_rx->cookie_plus = opsize; + *hvpp = ptr; + break; + default: + /* ignore option */ + break; + } + break; + } + + ptr += opsize-2; + length -= opsize; + } + } +} +EXPORT_SYMBOL(tcp_parse_options); + +static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th) +{ + __be32 *ptr = (__be32 *)(th + 1); + + if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { + tp->rx_opt.saw_tstamp = 1; + ++ptr; + tp->rx_opt.rcv_tsval = ntohl(*ptr); + ++ptr; + tp->rx_opt.rcv_tsecr = ntohl(*ptr); + return 1; + } + return 0; +} + +/* Fast parse options. This hopes to only see timestamps. + * If it is wrong it falls back on tcp_parse_options(). + */ +static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, + struct tcp_sock *tp, u8 **hvpp) +{ + /* In the spirit of fast parsing, compare doff directly to constant + * values. Because equality is used, short doff can be ignored here. + */ + if (th->doff == (sizeof(*th) / 4)) { + tp->rx_opt.saw_tstamp = 0; + return 0; + } else if (tp->rx_opt.tstamp_ok && + th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { + if (tcp_parse_aligned_timestamp(tp, th)) + return 1; + } + tcp_parse_options(skb, &tp->rx_opt, hvpp, 1); + return 1; +} + +#ifdef CONFIG_TCP_MD5SIG +/* + * Parse MD5 Signature option + */ +u8 *tcp_parse_md5sig_option(struct tcphdr *th) +{ + int length = (th->doff << 2) - sizeof (*th); + u8 *ptr = (u8*)(th + 1); + + /* If the TCP option is too short, we can short cut */ + if (length < TCPOLEN_MD5SIG) + return NULL; + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch(opcode) { + case TCPOPT_EOL: + return NULL; + case TCPOPT_NOP: + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2 || opsize > length) + return NULL; + if (opcode == TCPOPT_MD5SIG) + return opsize == TCPOLEN_MD5SIG ? ptr : NULL; + } + ptr += opsize - 2; + length -= opsize; + } + return NULL; +} +EXPORT_SYMBOL(tcp_parse_md5sig_option); +#endif + +static inline void tcp_store_ts_recent(struct tcp_sock *tp) +{ + tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; + tp->rx_opt.ts_recent_stamp = get_seconds(); +} + +static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) +{ + if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) { + /* PAWS bug workaround wrt. ACK frames, the PAWS discard + * extra check below makes sure this can only happen + * for pure ACK frames. -DaveM + * + * Not only, also it occurs for expired timestamps. + */ + + if (tcp_paws_check(&tp->rx_opt, 0)) + tcp_store_ts_recent(tp); + } +} + +/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM + * + * It is not fatal. If this ACK does _not_ change critical state (seqs, window) + * it can pass through stack. So, the following predicate verifies that + * this segment is not used for anything but congestion avoidance or + * fast retransmit. 
Moreover, we even are able to eliminate most of such + * second order effects, if we apply some small "replay" window (~RTO) + * to timestamp space. + * + * All these measures still do not guarantee that we reject wrapped ACKs + * on networks with high bandwidth, when sequence space is recycled fastly, + * but it guarantees that such events will be very rare and do not affect + * connection seriously. This doesn't look nice, but alas, PAWS is really + * buggy extension. + * + * [ Later note. Even worse! It is buggy for segments _with_ data. RFC + * states that events when retransmit arrives after original data are rare. + * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is + * the biggest problem on large power networks even with minor reordering. + * OK, let's give it small replay window. If peer clock is even 1hz, it is safe + * up to bandwidth of 18Gigabit/sec. 8) ] + */ + +static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcphdr *th = tcp_hdr(skb); + u32 seq = TCP_SKB_CB(skb)->seq; + u32 ack = TCP_SKB_CB(skb)->ack_seq; + + return (/* 1. Pure ACK with correct sequence number. */ + (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) && + + /* 2. ... and duplicate ACK. */ + ack == tp->snd_una && + + /* 3. ... and does not update window. */ + !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) && + + /* 4. ... and sits in replay window. */ + (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); +} + +static inline int tcp_paws_discard(const struct sock *sk, + const struct sk_buff *skb) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) && + !tcp_disordered_ack(sk, skb); +} + +/* Check segment sequence number for validity. + * + * Segment controls are considered valid, if the segment + * fits to the window after truncation to the window. Acceptability + * of data (and SYN, FIN, of course) is checked separately. + * See tcp_data_queue(), for example. + * + * Also, controls (RST is main one) are accepted using RCV.WUP instead + * of RCV.NXT. Peer still did not advance his SND.UNA when we + * delayed ACK, so that hisSND.UNA<=ourRCV.WUP. + * (borrowed from freebsd) + */ + +static inline int tcp_sequence(struct tcp_sock *tp, u32 seq, u32 end_seq) +{ + return !before(end_seq, tp->rcv_wup) && + !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); +} + +/* When we get a reset we do this. */ +static void tcp_reset(struct sock *sk) +{ + /* We want the right error as BSD sees it (and indeed as we do). */ + switch (sk->sk_state) { + case TCP_SYN_SENT: + sk->sk_err = ECONNREFUSED; + break; + case TCP_CLOSE_WAIT: + sk->sk_err = EPIPE; + break; + case TCP_CLOSE: + return; + default: + sk->sk_err = ECONNRESET; + } + /* This barrier is coupled with smp_rmb() in tcp_poll() */ + smp_wmb(); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + + tcp_done(sk); +} + +/* + * Process the FIN bit. This now behaves as it is supposed to work + * and the FIN takes effect when it is validly part of sequence + * space. Not before when we get holes. 
+ * + * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT + * (and thence onto LAST-ACK and finally, CLOSE, we never enter + * TIME-WAIT) + * + * If we are in FINWAIT-1, a received FIN indicates simultaneous + * close and we go into CLOSING (and later onto TIME-WAIT) + * + * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. + */ +static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) +{ + struct tcp_sock *tp = tcp_sk(sk); + + inet_csk_schedule_ack(sk); + + sk->sk_shutdown |= RCV_SHUTDOWN; + sock_set_flag(sk, SOCK_DONE); + + switch (sk->sk_state) { + case TCP_SYN_RECV: + case TCP_ESTABLISHED: + /* Move to CLOSE_WAIT */ + tcp_set_state(sk, TCP_CLOSE_WAIT); + inet_csk(sk)->icsk_ack.pingpong = 1; + break; + + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + /* Received a retransmission of the FIN, do + * nothing. + */ + break; + case TCP_LAST_ACK: + /* RFC793: Remain in the LAST-ACK state. */ + break; + + case TCP_FIN_WAIT1: + /* This case occurs when a simultaneous close + * happens, we must ack the received FIN and + * enter the CLOSING state. + */ + tcp_send_ack(sk); + tcp_set_state(sk, TCP_CLOSING); + break; + case TCP_FIN_WAIT2: + /* Received a FIN -- send ACK and enter TIME_WAIT. */ + tcp_send_ack(sk); + tcp_time_wait(sk, TCP_TIME_WAIT, 0); + break; + default: + /* Only TCP_LISTEN and TCP_CLOSE are left, in these + * cases we should never reach this piece of code. + */ + printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n", + __func__, sk->sk_state); + break; + } + + /* It _is_ possible, that we have something out-of-order _after_ FIN. + * Probably, we should reset in this case. For now drop them. + */ + __skb_queue_purge(&tp->out_of_order_queue); + if (tcp_is_sack(tp)) + tcp_sack_reset(&tp->rx_opt); + sk_mem_reclaim(sk); + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + + /* Do not send POLL_HUP for half duplex close. 
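The switch above reads naturally as a pure state map. A sketch over a reduced state enum, covering only the transitions that a received FIN drives; the enum and names are local to the sketch.

enum fin_state {
	FS_SYN_RECV, FS_ESTABLISHED, FS_FIN_WAIT1, FS_FIN_WAIT2,
	FS_CLOSE_WAIT, FS_CLOSING, FS_LAST_ACK, FS_TIME_WAIT,
};

static enum fin_state state_after_fin(enum fin_state s)
{
	switch (s) {
	case FS_SYN_RECV:
	case FS_ESTABLISHED:
		return FS_CLOSE_WAIT;	/* passive close begins */
	case FS_FIN_WAIT1:
		return FS_CLOSING;	/* simultaneous close */
	case FS_FIN_WAIT2:
		return FS_TIME_WAIT;	/* our FIN was already ACKed */
	default:
		return s;		/* retransmitted FIN etc.: no change */
	}
}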
*/ + if (sk->sk_shutdown == SHUTDOWN_MASK || + sk->sk_state == TCP_CLOSE) + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + else + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + } +} + +static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, + u32 end_seq) +{ + if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { + if (before(seq, sp->start_seq)) + sp->start_seq = seq; + if (after(end_seq, sp->end_seq)) + sp->end_seq = end_seq; + return 1; + } + return 0; +} + +static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tcp_is_sack(tp) && sysctl_tcp_dsack) { + int mib_idx; + + if (before(seq, tp->rcv_nxt)) + mib_idx = LINUX_MIB_TCPDSACKOLDSENT; + else + mib_idx = LINUX_MIB_TCPDSACKOFOSENT; + + NET_INC_STATS_BH(sock_net(sk), mib_idx); + + tp->rx_opt.dsack = 1; + tp->duplicate_sack[0].start_seq = seq; + tp->duplicate_sack[0].end_seq = end_seq; + } +} + +static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!tp->rx_opt.dsack) + tcp_dsack_set(sk, seq, end_seq); + else + tcp_sack_extend(tp->duplicate_sack, seq, end_seq); +} + +static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); + tcp_enter_quickack_mode(sk); + + if (tcp_is_sack(tp) && sysctl_tcp_dsack) { + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) + end_seq = tp->rcv_nxt; + tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq); + } + } + + tcp_send_ack(sk); +} + +/* These routines update the SACK block as out-of-order packets arrive or + * in-order packets close up the sequence space. + */ +static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) +{ + int this_sack; + struct tcp_sack_block *sp = &tp->selective_acks[0]; + struct tcp_sack_block *swalk = sp + 1; + + /* See if the recent change to the first SACK eats into + * or hits the sequence space of other SACK blocks, if so coalesce. + */ + for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) { + if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) { + int i; + + /* Zap SWALK, by moving every further SACK up by one slot. + * Decrease num_sacks. + */ + tp->rx_opt.num_sacks--; + for (i = this_sack; i < tp->rx_opt.num_sacks; i++) + sp[i] = sp[i + 1]; + continue; + } + this_sack++, swalk++; + } +} + +static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sack_block *sp = &tp->selective_acks[0]; + int cur_sacks = tp->rx_opt.num_sacks; + int this_sack; + + if (!cur_sacks) + goto new_sack; + + for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) { + if (tcp_sack_extend(sp, seq, end_seq)) { + /* Rotate this_sack to the first one. */ + for (; this_sack > 0; this_sack--, sp--) + swap(*sp, *(sp - 1)); + if (cur_sacks > 1) + tcp_sack_maybe_coalesce(tp); + return; + } + } + + /* Could not find an adjacent existing SACK, build a new one, + * put it at the front, and shift everyone else down. We + * always know there is at least one SACK present already here. + * + * If the sack array is full, forget about the last one. 
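tcp_sack_extend() above folds a new range into an existing block whenever the two touch or overlap. The same rule as a self-contained model with wrap-safe comparisons; the type and helper names exist only in this sketch.

#include <stdint.h>

struct sack_range {
	uint32_t start_seq;
	uint32_t end_seq;
};

static int seq_after(uint32_t a, uint32_t b)	/* a > b, modulo 2^32 */
{
	return (int32_t)(b - a) < 0;
}

/* Returns 1 and widens *sp when [seq, end_seq) touches the existing block. */
static int sack_range_extend(struct sack_range *sp, uint32_t seq,
			     uint32_t end_seq)
{
	if (!seq_after(seq, sp->end_seq) && !seq_after(sp->start_seq, end_seq)) {
		if (seq_after(sp->start_seq, seq))	/* grow to the left */
			sp->start_seq = seq;
		if (seq_after(end_seq, sp->end_seq))	/* grow to the right */
			sp->end_seq = end_seq;
		return 1;
	}
	return 0;
}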
+ */ + if (this_sack >= TCP_NUM_SACKS) { + this_sack--; + tp->rx_opt.num_sacks--; + sp--; + } + for (; this_sack > 0; this_sack--, sp--) + *sp = *(sp - 1); + +new_sack: + /* Build the new head SACK, and we're done. */ + sp->start_seq = seq; + sp->end_seq = end_seq; + tp->rx_opt.num_sacks++; +} + +/* RCV.NXT advances, some SACKs should be eaten. */ + +static void tcp_sack_remove(struct tcp_sock *tp) +{ + struct tcp_sack_block *sp = &tp->selective_acks[0]; + int num_sacks = tp->rx_opt.num_sacks; + int this_sack; + + /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ + if (skb_queue_empty(&tp->out_of_order_queue)) { + tp->rx_opt.num_sacks = 0; + return; + } + + for (this_sack = 0; this_sack < num_sacks;) { + /* Check if the start of the sack is covered by RCV.NXT. */ + if (!before(tp->rcv_nxt, sp->start_seq)) { + int i; + + /* RCV.NXT must cover all the block! */ + WARN_ON(before(tp->rcv_nxt, sp->end_seq)); + + /* Zap this SACK, by moving forward any other SACKS. */ + for (i=this_sack+1; i < num_sacks; i++) + tp->selective_acks[i-1] = tp->selective_acks[i]; + num_sacks--; + continue; + } + this_sack++; + sp++; + } + tp->rx_opt.num_sacks = num_sacks; +} + +/* This one checks to see if we can put data from the + * out_of_order queue into the receive_queue. + */ +static void tcp_ofo_queue(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + __u32 dsack_high = tp->rcv_nxt; + struct sk_buff *skb; + + while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { + if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) + break; + + if (before(TCP_SKB_CB(skb)->seq, dsack_high)) { + __u32 dsack = dsack_high; + if (before(TCP_SKB_CB(skb)->end_seq, dsack_high)) + dsack_high = TCP_SKB_CB(skb)->end_seq; + tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); + } + + if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + SOCK_DEBUG(sk, "ofo packet was already received\n"); + __skb_unlink(skb, &tp->out_of_order_queue); + __kfree_skb(skb); + continue; + } + SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", + tp->rcv_nxt, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq); + + __skb_unlink(skb, &tp->out_of_order_queue); + __skb_queue_tail(&sk->sk_receive_queue, skb); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if (tcp_hdr(skb)->fin) + tcp_fin(skb, sk, tcp_hdr(skb)); + } +} + +static int tcp_prune_ofo_queue(struct sock *sk); +static int tcp_prune_queue(struct sock *sk); + +static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) +{ + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || + !sk_rmem_schedule(sk, size)) { + + if (tcp_prune_queue(sk) < 0) + return -1; + + if (!sk_rmem_schedule(sk, size)) { + if (!tcp_prune_ofo_queue(sk)) + return -1; + + if (!sk_rmem_schedule(sk, size)) + return -1; + } + } + return 0; +} + +static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) +{ + struct tcphdr *th = tcp_hdr(skb); + struct tcp_sock *tp = tcp_sk(sk); + int eaten = -1; + + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) + goto drop; + + skb_dst_drop(skb); + __skb_pull(skb, th->doff * 4); + + TCP_ECN_accept_cwr(tp, skb); + + tp->rx_opt.dsack = 0; + + /* Queue data for delivery to the user. + * Packets in sequence go to the receive queue. + * Out of sequence packets to the out_of_order_queue. + */ + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + if (tcp_receive_window(tp) == 0) + goto out_of_window; + + /* Ok. In sequence. In window. 
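The remainder of this function walks the remaining receive-side cases one at a time. Condensed into a single illustrative classifier; the zero-length and zero-window-probe special cases are left out and the names are sketch-only.

#include <stdint.h>

enum seg_class {
	SEG_IN_ORDER,		/* seq == rcv_nxt: straight to the receive queue */
	SEG_RETRANSMIT,		/* entirely below rcv_nxt: D-SACK and ACK it */
	SEG_OUT_OF_WINDOW,	/* starts at or beyond the right window edge */
	SEG_PARTIAL,		/* overlaps rcv_nxt: head is duplicate data */
	SEG_OUT_OF_ORDER,	/* beyond rcv_nxt but inside the window */
};

static int seq_before(uint32_t a, uint32_t b)	/* a < b, modulo 2^32 */
{
	return (int32_t)(a - b) < 0;
}

static enum seg_class classify_segment(uint32_t seq, uint32_t end_seq,
				       uint32_t rcv_nxt, uint32_t rcv_wnd)
{
	if (seq == rcv_nxt)
		return SEG_IN_ORDER;
	if (!seq_before(rcv_nxt, end_seq))		/* end_seq <= rcv_nxt */
		return SEG_RETRANSMIT;
	if (!seq_before(seq, rcv_nxt + rcv_wnd))	/* past the right edge */
		return SEG_OUT_OF_WINDOW;
	if (seq_before(seq, rcv_nxt))			/* seq < rcv_nxt < end */
		return SEG_PARTIAL;
	return SEG_OUT_OF_ORDER;
}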
*/ + if (tp->ucopy.task == current && + tp->copied_seq == tp->rcv_nxt && tp->ucopy.len && + sock_owned_by_user(sk) && !tp->urg_data) { + int chunk = min_t(unsigned int, skb->len, + tp->ucopy.len); + + __set_current_state(TASK_RUNNING); + + local_bh_enable(); + if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) { + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; + eaten = (chunk == skb->len); + tcp_rcv_space_adjust(sk); + } + local_bh_disable(); + } + + if (eaten <= 0) { +queue_and_out: + if (eaten < 0 && + tcp_try_rmem_schedule(sk, skb->truesize)) + goto drop; + + skb_set_owner_r(skb, sk); + __skb_queue_tail(&sk->sk_receive_queue, skb); + } + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if (skb->len) + tcp_event_data_recv(sk, skb); + if (th->fin) + tcp_fin(skb, sk, th); + + if (!skb_queue_empty(&tp->out_of_order_queue)) { + tcp_ofo_queue(sk); + + /* RFC2581. 4.2. SHOULD send immediate ACK, when + * gap in queue is filled. + */ + if (skb_queue_empty(&tp->out_of_order_queue)) + inet_csk(sk)->icsk_ack.pingpong = 0; + } + + if (tp->rx_opt.num_sacks) + tcp_sack_remove(tp); + + tcp_fast_path_check(sk); + + if (eaten > 0) + __kfree_skb(skb); + else if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, 0); + return; + } + + if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + /* A retransmit, 2nd most common case. Force an immediate ack. */ + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); + tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + +out_of_window: + tcp_enter_quickack_mode(sk); + inet_csk_schedule_ack(sk); +drop: + __kfree_skb(skb); + return; + } + + /* Out of window. F.e. zero window probe. */ + if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) + goto out_of_window; + + tcp_enter_quickack_mode(sk); + + if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + /* Partial packet, seq < rcv_next < end_seq */ + SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", + tp->rcv_nxt, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq); + + tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt); + + /* If window is closed, drop tail of packet. But after + * remembering D-SACK for its head made in previous line. + */ + if (!tcp_receive_window(tp)) + goto out_of_window; + goto queue_and_out; + } + + TCP_ECN_check_ce(tp, skb); + + if (tcp_try_rmem_schedule(sk, skb->truesize)) + goto drop; + + /* Disable header prediction. */ + tp->pred_flags = 0; + inet_csk_schedule_ack(sk); + + SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", + tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + + skb_set_owner_r(skb, sk); + + if (!skb_peek(&tp->out_of_order_queue)) { + /* Initial out of order segment, build 1 SACK. */ + if (tcp_is_sack(tp)) { + tp->rx_opt.num_sacks = 1; + tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; + tp->selective_acks[0].end_seq = + TCP_SKB_CB(skb)->end_seq; + } + __skb_queue_head(&tp->out_of_order_queue, skb); + } else { + struct sk_buff *skb1 = skb_peek_tail(&tp->out_of_order_queue); + u32 seq = TCP_SKB_CB(skb)->seq; + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + if (seq == TCP_SKB_CB(skb1)->end_seq) { + __skb_queue_after(&tp->out_of_order_queue, skb1, skb); + + if (!tp->rx_opt.num_sacks || + tp->selective_acks[0].end_seq != seq) + goto add_sack; + + /* Common case: data arrive in order after hole. */ + tp->selective_acks[0].end_seq = end_seq; + return; + } + + /* Find place to insert this segment. 
*/ + while (1) { + if (!after(TCP_SKB_CB(skb1)->seq, seq)) + break; + if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) { + skb1 = NULL; + break; + } + skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1); + } + + /* Do skb overlap to previous one? */ + if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + /* All the bits are present. Drop. */ + __kfree_skb(skb); + tcp_dsack_set(sk, seq, end_seq); + goto add_sack; + } + if (after(seq, TCP_SKB_CB(skb1)->seq)) { + /* Partial overlap. */ + tcp_dsack_set(sk, seq, + TCP_SKB_CB(skb1)->end_seq); + } else { + if (skb_queue_is_first(&tp->out_of_order_queue, + skb1)) + skb1 = NULL; + else + skb1 = skb_queue_prev( + &tp->out_of_order_queue, + skb1); + } + } + if (!skb1) + __skb_queue_head(&tp->out_of_order_queue, skb); + else + __skb_queue_after(&tp->out_of_order_queue, skb1, skb); + + /* And clean segments covered by new one as whole. */ + while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) { + skb1 = skb_queue_next(&tp->out_of_order_queue, skb); + + if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) + break; + if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, + end_seq); + break; + } + __skb_unlink(skb1, &tp->out_of_order_queue); + tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, + TCP_SKB_CB(skb1)->end_seq); + __kfree_skb(skb1); + } + +add_sack: + if (tcp_is_sack(tp)) + tcp_sack_new_ofo_skb(sk, seq, end_seq); + } +} + +static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, + struct sk_buff_head *list) +{ + struct sk_buff *next = NULL; + + if (!skb_queue_is_last(list, skb)) + next = skb_queue_next(list, skb); + + __skb_unlink(skb, list); + __kfree_skb(skb); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); + + return next; +} + +/* Collapse contiguous sequence of skbs head..tail with + * sequence numbers start..end. + * + * If tail is NULL, this means until the end of the list. + * + * Segments with FIN/SYN are not collapsed (only because this + * simplifies code) + */ +static void +tcp_collapse(struct sock *sk, struct sk_buff_head *list, + struct sk_buff *head, struct sk_buff *tail, + u32 start, u32 end) +{ + struct sk_buff *skb, *n; + bool end_of_skbs; + + /* First, check that queue is collapsible and find + * the point where collapsing can be useful. */ + skb = head; +restart: + end_of_skbs = true; + skb_queue_walk_from_safe(list, skb, n) { + if (skb == tail) + break; + /* No new bits? It is possible on ofo queue. */ + if (!before(start, TCP_SKB_CB(skb)->end_seq)) { + skb = tcp_collapse_one(sk, skb, list); + if (!skb) + break; + goto restart; + } + + /* The first skb to collapse is: + * - not SYN/FIN and + * - bloated or contains data before "start" or + * overlaps to the next one. + */ + if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin && + (tcp_win_from_space(skb->truesize) > skb->len || + before(TCP_SKB_CB(skb)->seq, start))) { + end_of_skbs = false; + break; + } + + if (!skb_queue_is_last(list, skb)) { + struct sk_buff *next = skb_queue_next(list, skb); + if (next != tail && + TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) { + end_of_skbs = false; + break; + } + } + + /* Decided to skip this, advance start seq. */ + start = TCP_SKB_CB(skb)->end_seq; + } + if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) + return; + + while (before(start, end)) { + struct sk_buff *nskb; + unsigned int header = skb_headroom(skb); + int copy = SKB_MAX_ORDER(header, 0); + + /* Too big header? This can happen with IPv6. 
*/ + if (copy < 0) + return; + if (end - start < copy) + copy = end - start; + nskb = alloc_skb(copy + header, GFP_ATOMIC); + if (!nskb) + return; + + skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); + skb_set_network_header(nskb, (skb_network_header(skb) - + skb->head)); + skb_set_transport_header(nskb, (skb_transport_header(skb) - + skb->head)); + skb_reserve(nskb, header); + memcpy(nskb->head, skb->head, header); + memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); + TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; + __skb_queue_before(list, skb, nskb); + skb_set_owner_r(nskb, sk); + + /* Copy data, releasing collapsed skbs. */ + while (copy > 0) { + int offset = start - TCP_SKB_CB(skb)->seq; + int size = TCP_SKB_CB(skb)->end_seq - start; + + BUG_ON(offset < 0); + if (size > 0) { + size = min(copy, size); + if (skb_copy_bits(skb, offset, skb_put(nskb, size), size)) + BUG(); + TCP_SKB_CB(nskb)->end_seq += size; + copy -= size; + start += size; + } + if (!before(start, TCP_SKB_CB(skb)->end_seq)) { + skb = tcp_collapse_one(sk, skb, list); + if (!skb || + skb == tail || + tcp_hdr(skb)->syn || + tcp_hdr(skb)->fin) + return; + } + } + } +} + +/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs + * and tcp_collapse() them until all the queue is collapsed. + */ +static void tcp_collapse_ofo_queue(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb = skb_peek(&tp->out_of_order_queue); + struct sk_buff *head; + u32 start, end; + + if (skb == NULL) + return; + + start = TCP_SKB_CB(skb)->seq; + end = TCP_SKB_CB(skb)->end_seq; + head = skb; + + for (;;) { + struct sk_buff *next = NULL; + + if (!skb_queue_is_last(&tp->out_of_order_queue, skb)) + next = skb_queue_next(&tp->out_of_order_queue, skb); + skb = next; + + /* Segment is terminated when we see gap or when + * we are at the end of all the queue. */ + if (!skb || + after(TCP_SKB_CB(skb)->seq, end) || + before(TCP_SKB_CB(skb)->end_seq, start)) { + tcp_collapse(sk, &tp->out_of_order_queue, + head, skb, start, end); + head = skb; + if (!skb) + break; + /* Start new segment */ + start = TCP_SKB_CB(skb)->seq; + end = TCP_SKB_CB(skb)->end_seq; + } else { + if (before(TCP_SKB_CB(skb)->seq, start)) + start = TCP_SKB_CB(skb)->seq; + if (after(TCP_SKB_CB(skb)->end_seq, end)) + end = TCP_SKB_CB(skb)->end_seq; + } + } +} + +/* + * Purge the out-of-order queue. + * Return true if queue was pruned. + */ +static int tcp_prune_ofo_queue(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int res = 0; + + if (!skb_queue_empty(&tp->out_of_order_queue)) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); + __skb_queue_purge(&tp->out_of_order_queue); + + /* Reset SACK state. A conforming SACK implementation will + * do the same at a timeout based retransmit. When a connection + * is in a sad state like this, we care only about integrity + * of the connection not performance. + */ + if (tp->rx_opt.sack_ok) + tcp_sack_reset(&tp->rx_opt); + sk_mem_reclaim(sk); + res = 1; + } + return res; +} + +/* Reduce allocated memory if we can, trying to get + * the socket within its memory limits again. + * + * Return less than zero if we should start dropping frames + * until the socket owning process reads some of the data + * to stabilize the situation. 
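tcp_prune_queue() below escalates from cheap measures to destructive ones, and only reports failure when nothing brings memory use back under the limit. A toy model of that ladder; the struct and the callbacks are stand-ins, not kernel interfaces.

#include <stdint.h>

struct rmem_model {
	uint64_t rmem_alloc;	/* bytes currently charged to the socket */
	uint64_t rcvbuf;	/* configured receive-buffer limit */
};

static int over_limit(const struct rmem_model *m)
{
	return m->rmem_alloc > m->rcvbuf;
}

static void give_back(struct rmem_model *m, uint64_t freed)
{
	m->rmem_alloc -= freed < m->rmem_alloc ? freed : m->rmem_alloc;
}

/* Negative return: nothing helped, the caller should drop incoming data. */
static int prune_receive_memory(struct rmem_model *m,
				uint64_t (*collapse_queues)(void),
				uint64_t (*purge_ofo_queue)(void))
{
	if (!over_limit(m))
		return 0;
	give_back(m, collapse_queues());	/* cheap: merge small buffers */
	if (!over_limit(m))
		return 0;
	give_back(m, purge_ofo_queue());	/* harsher: drop ofo data */
	if (!over_limit(m))
		return 0;
	return -1;
}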
+ */ +static int tcp_prune_queue(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); + + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PRUNECALLED); + + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + tcp_clamp_window(sk); + else if (tcp_memory_pressure) + tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); + + tcp_collapse_ofo_queue(sk); + if (!skb_queue_empty(&sk->sk_receive_queue)) + tcp_collapse(sk, &sk->sk_receive_queue, + skb_peek(&sk->sk_receive_queue), + NULL, + tp->copied_seq, tp->rcv_nxt); + sk_mem_reclaim(sk); + + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) + return 0; + + /* Collapsing did not help, destructive actions follow. + * This must not ever occur. */ + + tcp_prune_ofo_queue(sk); + + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) + return 0; + + /* If we are really being abused, tell the caller to silently + * drop receive data on the floor. It will get retransmitted + * and hopefully then we'll have sufficient space. + */ + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_RCVPRUNED); + + /* Massive buffer overcommit. */ + tp->pred_flags = 0; + return -1; +} + +/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. + * As additional protections, we do not touch cwnd in retransmission phases, + * and if application hit its sndbuf limit recently. + */ +void tcp_cwnd_application_limited(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && + sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { + /* Limited by application or receiver window. */ + u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); + u32 win_used = max(tp->snd_cwnd_used, init_win); + if (win_used < tp->snd_cwnd) { + tp->snd_ssthresh = tcp_current_ssthresh(sk); + tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; + } + tp->snd_cwnd_used = 0; + } + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static int tcp_should_expand_sndbuf(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* If the user specified a specific send buffer setting, do + * not modify it. + */ + if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) + return 0; + + /* If we are under global TCP memory pressure, do not expand. */ + if (tcp_memory_pressure) + return 0; + + /* If we are under soft global TCP memory pressure, do not expand. */ + if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) + return 0; + + /* If we filled the congestion window, do not expand. */ + if (tp->packets_out >= tp->snd_cwnd) + return 0; + + return 1; +} + +/* When incoming ACK allowed to free some skb from write_queue, + * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket + * on the exit from tcp input handler. + * + * PROBLEM: sndbuf expansion does not work well with largesend. 
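tcp_new_space() below sizes the send buffer from the per-packet memory cost and the current demand. The same arithmetic as a plain function; SKETCH_PKT_OVERHEAD is a placeholder for MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), and the MSS argument stands for the larger of mss_clamp and mss_cache.

#include <stdint.h>

#define SKETCH_PKT_OVERHEAD 512	/* placeholder for header + skb overhead */

static uint32_t sndbuf_target(uint32_t mss, uint32_t snd_cwnd,
			      uint32_t reordering, uint32_t cur_sndbuf,
			      uint32_t wmem_max)
{
	uint32_t per_pkt = mss + SKETCH_PKT_OVERHEAD;
	uint32_t demanded = snd_cwnd > reordering + 1 ? snd_cwnd
						      : reordering + 1;
	uint64_t target = (uint64_t)per_pkt * 2 * demanded;

	if (target <= cur_sndbuf)
		return cur_sndbuf;	/* the buffer is never shrunk here */
	return target < wmem_max ? (uint32_t)target : wmem_max;
}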
+ */ +static void tcp_new_space(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tcp_should_expand_sndbuf(sk)) { + int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); + int demanded = max_t(unsigned int, tp->snd_cwnd, + tp->reordering + 1); + sndmem *= 2 * demanded; + if (sndmem > sk->sk_sndbuf) + sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); + tp->snd_cwnd_stamp = tcp_time_stamp; + } + + sk->sk_write_space(sk); +} + +static void tcp_check_space(struct sock *sk) +{ + if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { + sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); + if (sk->sk_socket && + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + tcp_new_space(sk); + } +} + +static inline void tcp_data_snd_check(struct sock *sk) +{ + tcp_push_pending_frames(sk); + tcp_check_space(sk); +} + +/* + * Check if sending an ack is needed. + */ +static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). Or... + */ + __tcp_select_window(sk) >= tp->rcv_wnd) || + /* We ACK each frame or... */ + tcp_in_quickack_mode(sk) || + /* We have out of order data. */ + (ofo_possible && skb_peek(&tp->out_of_order_queue))) { + /* Then ack it now */ + tcp_send_ack(sk); + } else { + /* Else, send delayed ack. */ + tcp_send_delayed_ack(sk); + } +} + +static inline void tcp_ack_snd_check(struct sock *sk) +{ + if (!inet_csk_ack_scheduled(sk)) { + /* We sent a data segment already. */ + return; + } + __tcp_ack_snd_check(sk, 1); +} + +/* + * This routine is only called when we have urgent data + * signaled. Its the 'slow' part of tcp_urg. It could be + * moved inline now as tcp_urg is only called from one + * place. We handle URGent data wrong. We have to - as + * BSD still doesn't use the correction from RFC961. + * For 1003.1g we should support a new option TCP_STDURG to permit + * either form (or just set the sysctl tcp_stdurg). + */ + +static void tcp_check_urg(struct sock *sk, struct tcphdr *th) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 ptr = ntohs(th->urg_ptr); + + if (ptr && !sysctl_tcp_stdurg) + ptr--; + ptr += ntohl(th->seq); + + /* Ignore urgent data that we've already seen and read. */ + if (after(tp->copied_seq, ptr)) + return; + + /* Do not replay urg ptr. + * + * NOTE: interesting situation not covered by specs. + * Misbehaving sender may send urg ptr, pointing to segment, + * which we already have in ofo queue. We are not able to fetch + * such data and will stay in TCP_URG_NOTYET until will be eaten + * by recvmsg(). Seems, we are not obliged to handle such wicked + * situations. But it is worth to think about possibility of some + * DoSes using some hypothetical application level deadlock. + */ + if (before(ptr, tp->rcv_nxt)) + return; + + /* Do we already have a newer (or duplicate) urgent pointer? */ + if (tp->urg_data && !after(ptr, tp->urg_seq)) + return; + + /* Tell the world about our new urgent pointer. */ + sk_send_sigurg(sk); + + /* We may be adding urgent data when the last byte read was + * urgent. To do this requires some care. We cannot just ignore + * tp->copied_seq since we would read the last urgent byte again + * as data, nor can we alter copied_seq until this data arrives + * or we break the semantics of SIOCATMARK (and thus sockatmark()) + * + * NOTE. Double Dutch. 
Rendering to plain English: author of comment + * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB); + * and expect that both A and B disappear from stream. This is _wrong_. + * Though this happens in BSD with high probability, this is occasional. + * Any application relying on this is buggy. Note also, that fix "works" + * only in this artificial test. Insert some normal data between A and B and we will + * decline of BSD again. Verdict: it is better to remove to trap + * buggy users. + */ + if (tp->urg_seq == tp->copied_seq && tp->urg_data && + !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) { + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + tp->copied_seq++; + if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) { + __skb_unlink(skb, &sk->sk_receive_queue); + __kfree_skb(skb); + } + } + + tp->urg_data = TCP_URG_NOTYET; + tp->urg_seq = ptr; + + /* Disable header prediction. */ + tp->pred_flags = 0; +} + +/* This is the 'fast' part of urgent handling. */ +static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Check if we get a new urgent pointer - normally not. */ + if (th->urg) + tcp_check_urg(sk, th); + + /* Do we wait for any urgent data? - normally not... */ + if (tp->urg_data == TCP_URG_NOTYET) { + u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) - + th->syn; + + /* Is the urgent pointer pointing into this packet? */ + if (ptr < skb->len) { + u8 tmp; + if (skb_copy_bits(skb, ptr, &tmp, 1)) + BUG(); + tp->urg_data = TCP_URG_VALID | tmp; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, 0); + } + } +} + +static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) +{ + struct tcp_sock *tp = tcp_sk(sk); + int chunk = skb->len - hlen; + int err; + + local_bh_enable(); + if (skb_csum_unnecessary(skb)) + err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk); + else + err = skb_copy_and_csum_datagram_iovec(skb, hlen, + tp->ucopy.iov); + + if (!err) { + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; + tcp_rcv_space_adjust(sk); + } + + local_bh_disable(); + return err; +} + +static __sum16 __tcp_checksum_complete_user(struct sock *sk, + struct sk_buff *skb) +{ + __sum16 result; + + if (sock_owned_by_user(sk)) { + local_bh_enable(); + result = __tcp_checksum_complete(skb); + local_bh_disable(); + } else { + result = __tcp_checksum_complete(skb); + } + return result; +} + +static inline int tcp_checksum_complete_user(struct sock *sk, + struct sk_buff *skb) +{ + return !skb_csum_unnecessary(skb) && + __tcp_checksum_complete_user(sk, skb); +} + +#ifdef CONFIG_NET_DMA +static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, + int hlen) +{ + struct tcp_sock *tp = tcp_sk(sk); + int chunk = skb->len - hlen; + int dma_cookie; + int copied_early = 0; + + if (tp->ucopy.wakeup) + return 0; + + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) + tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); + + if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { + + dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, + skb, hlen, + tp->ucopy.iov, chunk, + tp->ucopy.pinned_list); + + if (dma_cookie < 0) + goto out; + + tp->ucopy.dma_cookie = dma_cookie; + copied_early = 1; + + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; + tcp_rcv_space_adjust(sk); + + if ((tp->ucopy.len == 0) || + (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) || + (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { + tp->ucopy.wakeup = 1; + 
sk->sk_data_ready(sk, 0); + } + } else if (chunk > 0) { + tp->ucopy.wakeup = 1; + sk->sk_data_ready(sk, 0); + } +out: + return copied_early; +} +#endif /* CONFIG_NET_DMA */ + +/* Does PAWS and seqno based validation of an incoming segment, flags will + * play significant role here. + */ +static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, int syn_inerr) +{ + u8 *hash_location; + struct tcp_sock *tp = tcp_sk(sk); + + /* RFC1323: H1. Apply PAWS check first. */ + if (tcp_fast_parse_options(skb, th, tp, &hash_location) && + tp->rx_opt.saw_tstamp && + tcp_paws_discard(sk, skb)) { + if (!th->rst) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); + tcp_send_dupack(sk, skb); + goto discard; + } + /* Reset is accepted even if it did not pass PAWS. */ + } + + /* Step 1: check sequence number */ + if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { + /* RFC793, page 37: "In all states except SYN-SENT, all reset + * (RST) segments are validated by checking their SEQ-fields." + * And page 69: "If an incoming segment is not acceptable, + * an acknowledgment should be sent in reply (unless the RST + * bit is set, if so drop the segment and return)". + */ + if (!th->rst) + tcp_send_dupack(sk, skb); + goto discard; + } + + /* Step 2: check RST bit */ + if (th->rst) { + tcp_reset(sk); + goto discard; + } + + /* ts_recent update must be made after we are sure that the packet + * is in window. + */ + tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); + + /* step 3: check security and precedence [ignored] */ + + /* step 4: Check for a SYN in window. */ + if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + if (syn_inerr) + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); + tcp_reset(sk); + return -1; + } + + return 1; + +discard: + __kfree_skb(skb); + return 0; +} + +/* + * TCP receive function for the ESTABLISHED state. + * + * It is split into a fast path and a slow path. The fast path is + * disabled when: + * - A zero window was announced from us - zero window probing + * is only handled properly in the slow path. + * - Out of order segments arrived. + * - Urgent data is expected. + * - There is no buffer space left + * - Unexpected TCP flags/window values/header lengths are received + * (detected by checking the TCP header against pred_flags) + * - Data is sent in both directions. Fast path only supports pure senders + * or pure receivers (this means either the sequence number or the ack + * value must stay constant) + * - Unexpected TCP option. + * + * When these conditions are not satisfied it drops into a standard + * receive procedure patterned after RFC793 to handle all cases. + * The first three cases are guaranteed by proper pred_flags setting, + * the rest is checked inline. Fast processing is turned on in + * tcp_data_queue when everything is OK. + */ +int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, unsigned len) +{ + struct tcp_sock *tp = tcp_sk(sk); + int res; + + /* + * Header prediction. + * The code loosely follows the one in the famous + * "30 instruction TCP receive" Van Jacobson mail. + * + * Van's trick is to deposit buffers into socket queue + * on a device interrupt, to call tcp_recv function + * on the receive process context and checksum and copy + * the buffer to user space. smart... + * + * Our current scheme is not silly either but we take the + * extra cost of the net_bh soft interrupt processing... 
+ * We do checksum and copy also but from device to kernel. + */ + + tp->rx_opt.saw_tstamp = 0; + + /* pred_flags is 0xS?10 << 16 + snd_wnd + * if header_prediction is to be made + * 'S' will always be tp->tcp_header_len >> 2 + * '?' will be 0 for the fast path, otherwise pred_flags is 0 to + * turn it off (when there are holes in the receive + * space for instance) + * PSH flag is ignored. + */ + + if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags && + TCP_SKB_CB(skb)->seq == tp->rcv_nxt && + !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { + int tcp_header_len = tp->tcp_header_len; + + /* Timestamp header prediction: tcp_header_len + * is automatically equal to th->doff*4 due to pred_flags + * match. + */ + + /* Check timestamp */ + if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { + /* No? Slow path! */ + if (!tcp_parse_aligned_timestamp(tp, th)) + goto slow_path; + + /* If PAWS failed, check it more carefully in slow path */ + if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0) + goto slow_path; + + /* DO NOT update ts_recent here, if checksum fails + * and timestamp was corrupted part, it will result + * in a hung connection since we will drop all + * future packets due to the PAWS test. + */ + } + + if (len <= tcp_header_len) { + /* Bulk data transfer: sender */ + if (len == tcp_header_len) { + /* Predicted packet is in window by definition. + * seq == rcv_nxt and rcv_wup <= rcv_nxt. + * Hence, check seq<=rcv_wup reduces to: + */ + if (tcp_header_len == + (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && + tp->rcv_nxt == tp->rcv_wup) + tcp_store_ts_recent(tp); + + /* We know that such packets are checksummed + * on entry. + */ + tcp_ack(sk, skb, 0); + __kfree_skb(skb); + tcp_data_snd_check(sk); + return 0; + } else { /* Header too small */ + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); + goto discard; + } + } else { + int eaten = 0; + int copied_early = 0; + + if (tp->copied_seq == tp->rcv_nxt && + len - tcp_header_len <= tp->ucopy.len) { +#ifdef CONFIG_NET_DMA + if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { + copied_early = 1; + eaten = 1; + } +#endif + if (tp->ucopy.task == current && + sock_owned_by_user(sk) && !copied_early) { + __set_current_state(TASK_RUNNING); + + if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) + eaten = 1; + } + if (eaten) { + /* Predicted packet is in window by definition. + * seq == rcv_nxt and rcv_wup <= rcv_nxt. + * Hence, check seq<=rcv_wup reduces to: + */ + if (tcp_header_len == + (sizeof(struct tcphdr) + + TCPOLEN_TSTAMP_ALIGNED) && + tp->rcv_nxt == tp->rcv_wup) + tcp_store_ts_recent(tp); + + tcp_rcv_rtt_measure_ts(sk, skb); + + __skb_pull(skb, tcp_header_len); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER); + } + if (copied_early) + tcp_cleanup_rbuf(sk, skb->len); + } + if (!eaten) { + if (tcp_checksum_complete_user(sk, skb)) + goto csum_error; + + /* Predicted packet is in window by definition. + * seq == rcv_nxt and rcv_wup <= rcv_nxt. 
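/*
 * Sketch, not part of this patch: the pred_flags layout documented in the
 * comment above ("0xS?10 << 16 + snd_wnd").  The data-offset nibble, the ACK
 * flag and the expected send window are packed into one network-order word
 * so the fast path can compare it against the raw doff/flags/window word of
 * an incoming header.  TCP_HDR_WORD_ACK is an illustrative constant, not a
 * kernel symbol.
 */
#include <stdint.h>
#include <arpa/inet.h>

#define TCP_HDR_WORD_ACK  0x00100000u	/* ACK bit inside the doff/flags/window word */

static uint32_t build_pred_flags(uint32_t tcp_header_len, uint16_t snd_wnd)
{
	/* 'S' is the data offset: tcp_header_len >> 2, placed in the top nibble. */
	return htonl((tcp_header_len << 26) | TCP_HDR_WORD_ACK | snd_wnd);
}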
+ * Hence, check seq<=rcv_wup reduces to: + */ + if (tcp_header_len == + (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && + tp->rcv_nxt == tp->rcv_wup) + tcp_store_ts_recent(tp); + + tcp_rcv_rtt_measure_ts(sk, skb); + + if ((int)skb->truesize > sk->sk_forward_alloc) + goto step5; + + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); + + /* Bulk data transfer: receiver */ + __skb_pull(skb, tcp_header_len); + __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_set_owner_r(skb, sk); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + } + + tcp_event_data_recv(sk, skb); + + if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { + /* Well, only one small jumplet in fast path... */ + tcp_ack(sk, skb, FLAG_DATA); + tcp_data_snd_check(sk); + if (!inet_csk_ack_scheduled(sk)) + goto no_ack; + } + + if (!copied_early || tp->rcv_nxt != tp->rcv_wup) + __tcp_ack_snd_check(sk, 0); +no_ack: +#ifdef CONFIG_NET_DMA + if (copied_early) + __skb_queue_tail(&sk->sk_async_wait_queue, skb); + else +#endif + if (eaten) + __kfree_skb(skb); + else + sk->sk_data_ready(sk, 0); + return 0; + } + } + +slow_path: + if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb)) + goto csum_error; + + /* + * Standard slow path. + */ + + res = tcp_validate_incoming(sk, skb, th, 1); + if (res <= 0) + return -res; + +step5: + if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0) + goto discard; + + tcp_rcv_rtt_measure_ts(sk, skb); + + /* Process urgent data. */ + tcp_urg(sk, skb, th); + + /* step 7: process the segment text */ + tcp_data_queue(sk, skb); + + tcp_data_snd_check(sk); + tcp_ack_snd_check(sk); + return 0; + +csum_error: + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); + +discard: + __kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL(tcp_rcv_established); + +static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, unsigned len) +{ + u8 *hash_location; + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_cookie_values *cvp = tp->cookie_values; + int saved_clamp = tp->rx_opt.mss_clamp; + + tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0); + + if (th->ack) { + /* rfc793: + * "If the state is SYN-SENT then + * first check the ACK bit + * If the ACK bit is set + * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send + * a reset (unless the RST bit is set, if so drop + * the segment and return)" + * + * We do not send data with SYN, so that RFC-correct + * test reduces to: + */ + if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) + goto reset_and_undo; + + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && + !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, + tcp_time_stamp)) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED); + goto reset_and_undo; + } + + /* Now ACK is acceptable. + * + * "If the RST bit is set + * If the ACK was acceptable then signal the user "error: + * connection reset", drop the segment, enter CLOSED state, + * delete TCB, and return." + */ + + if (th->rst) { + tcp_reset(sk); + goto discard; + } + + /* rfc793: + * "fifth, if neither of the SYN or RST bits is set then + * drop the segment and return." + * + * See note below! + * --ANK(990513) + */ + if (!th->syn) + goto discard_and_undo; + + /* rfc793: + * "If the SYN bit is on ... + * are acceptable then ... + * (our SYN has been ACKed), change the connection + * state to ESTABLISHED..." + */ + + TCP_ECN_rcv_synack(tp, th); + + tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tcp_ack(sk, skb, FLAG_SLOWPATH); + + /* Ok.. it's good. 
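/*
 * Sketch, not part of this patch: the RFC 793 SYN-SENT ACK test quoted in
 * tcp_rcv_synsent_state_process() above, written wrap-safe.  Since no data
 * is sent together with the SYN, SND.NXT == ISS + 1 and the test collapses
 * to the single equality check used in the code (ack_seq == snd_nxt).
 */
#include <stdbool.h>
#include <stdint.h>

static bool synsent_ack_acceptable(uint32_t seg_ack, uint32_t iss, uint32_t snd_nxt)
{
	/* "SEG.ACK =< ISS, or SEG.ACK > SND.NXT" means unacceptable, so: */
	return (int32_t)(seg_ack - iss) > 0 &&		/* SEG.ACK > ISS      */
	       (int32_t)(seg_ack - snd_nxt) <= 0;	/* SEG.ACK <= SND.NXT */
}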
Set up sequence numbers and + * move to established. + */ + tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; + + /* RFC1323: The window in SYN & SYN/ACK segments is + * never scaled. + */ + tp->snd_wnd = ntohs(th->window); + tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); + + if (!tp->rx_opt.wscale_ok) { + tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; + tp->window_clamp = min(tp->window_clamp, 65535U); + } + + if (tp->rx_opt.saw_tstamp) { + tp->rx_opt.tstamp_ok = 1; + tp->tcp_header_len = + sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; + tcp_store_ts_recent(tp); + } else { + tp->tcp_header_len = sizeof(struct tcphdr); + } + + if (tcp_is_sack(tp) && sysctl_tcp_fack) + tcp_enable_fack(tp); + + tcp_mtup_init(sk); + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); + tcp_initialize_rcv_mss(sk); + + /* Remember, tcp_poll() does not lock socket! + * Change state from SYN-SENT only after copied_seq + * is initialized. */ + tp->copied_seq = tp->rcv_nxt; + + if (cvp != NULL && + cvp->cookie_pair_size > 0 && + tp->rx_opt.cookie_plus > 0) { + int cookie_size = tp->rx_opt.cookie_plus + - TCPOLEN_COOKIE_BASE; + int cookie_pair_size = cookie_size + + cvp->cookie_desired; + + /* A cookie extension option was sent and returned. + * Note that each incoming SYNACK replaces the + * Responder cookie. The initial exchange is most + * fragile, as protection against spoofing relies + * entirely upon the sequence and timestamp (above). + * This replacement strategy allows the correct pair to + * pass through, while any others will be filtered via + * Responder verification later. + */ + if (sizeof(cvp->cookie_pair) >= cookie_pair_size) { + memcpy(&cvp->cookie_pair[cvp->cookie_desired], + hash_location, cookie_size); + cvp->cookie_pair_size = cookie_pair_size; + } + } + + smp_mb(); + tcp_set_state(sk, TCP_ESTABLISHED); + + security_inet_conn_established(sk, skb); + + /* Make sure socket is routed, for correct metrics. */ + icsk->icsk_af_ops->rebuild_header(sk); + + tcp_init_metrics(sk); + + tcp_init_congestion_control(sk); + + /* Prevent spurious tcp_cwnd_restart() on first data + * packet. + */ + tp->lsndtime = tcp_time_stamp; + + tcp_init_buffer_space(sk); + + if (sock_flag(sk, SOCK_KEEPOPEN)) + inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); + + if (!tp->rx_opt.snd_wscale) + __tcp_fast_path_on(tp, tp->snd_wnd); + else + tp->pred_flags = 0; + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); + } + + if (sk->sk_write_pending || + icsk->icsk_accept_queue.rskq_defer_accept || + icsk->icsk_ack.pingpong) { + /* Save one ACK. Data will be ready after + * several ticks, if write_pending is set. + * + * It may be deleted, but with this feature tcpdumps + * look so _wonderfully_ clever, that I was not able + * to stand against the temptation 8) --ANK + */ + inet_csk_schedule_ack(sk); + icsk->icsk_ack.lrcvtime = tcp_time_stamp; + icsk->icsk_ack.ato = TCP_ATO_MIN; + tcp_incr_quickack(sk); + tcp_enter_quickack_mode(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MAX, TCP_RTO_MAX); + +discard: + __kfree_skb(skb); + return 0; + } else { + tcp_send_ack(sk); + } + return -1; + } + + /* No ACK in the segment */ + + if (th->rst) { + /* rfc793: + * "If the RST bit is set + * + * Otherwise (no ACK) drop the segment and return." + */ + + goto discard_and_undo; + } + + /* PAWS check. 
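/*
 * Sketch, not part of this patch: the RFC 1323 rule applied above.  The
 * window field of a SYN or SYN/ACK is taken verbatim; the negotiated shift
 * count is applied only to segments exchanged after the handshake.
 * "raw_window_field" is assumed to be already converted to host order.
 */
#include <stdbool.h>
#include <stdint.h>

static uint32_t advertised_window(uint16_t raw_window_field, bool segment_is_syn,
				  bool wscale_ok, uint8_t snd_wscale)
{
	if (segment_is_syn || !wscale_ok)
		return raw_window_field;			/* never scaled   */
	return (uint32_t)raw_window_field << snd_wscale;	/* post-handshake */
}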
*/ + if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && + tcp_paws_reject(&tp->rx_opt, 0)) + goto discard_and_undo; + + if (th->syn) { + /* We see SYN without ACK. It is attempt of + * simultaneous connect with crossed SYNs. + * Particularly, it can be connect to self. + */ + tcp_set_state(sk, TCP_SYN_RECV); + + if (tp->rx_opt.saw_tstamp) { + tp->rx_opt.tstamp_ok = 1; + tcp_store_ts_recent(tp); + tp->tcp_header_len = + sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + } else { + tp->tcp_header_len = sizeof(struct tcphdr); + } + + tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; + + /* RFC1323: The window in SYN & SYN/ACK segments is + * never scaled. + */ + tp->snd_wnd = ntohs(th->window); + tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tp->max_window = tp->snd_wnd; + + TCP_ECN_rcv_syn(tp, th); + + tcp_mtup_init(sk); + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); + tcp_initialize_rcv_mss(sk); + + tcp_send_synack(sk); +#if 0 + /* Note, we could accept data and URG from this segment. + * There are no obstacles to make this. + * + * However, if we ignore data in ACKless segments sometimes, + * we have no reasons to accept it sometimes. + * Also, seems the code doing it in step6 of tcp_rcv_state_process + * is not flawless. So, discard packet for sanity. + * Uncomment this return to process the data. + */ + return -1; +#else + goto discard; +#endif + } + /* "fifth, if neither of the SYN or RST bits is set then + * drop the segment and return." + */ + +discard_and_undo: + tcp_clear_options(&tp->rx_opt); + tp->rx_opt.mss_clamp = saved_clamp; + goto discard; + +reset_and_undo: + tcp_clear_options(&tp->rx_opt); + tp->rx_opt.mss_clamp = saved_clamp; + return 1; +} + +/* + * This function implements the receiving procedure of RFC 793 for + * all states except ESTABLISHED and TIME_WAIT. + * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be + * address independent. + */ + +int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, unsigned len) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + int queued = 0; + int res; + + tp->rx_opt.saw_tstamp = 0; + + switch (sk->sk_state) { + case TCP_CLOSE: + goto discard; + + case TCP_LISTEN: + if (th->ack) + return 1; + + if (th->rst) + goto discard; + + if (th->syn) { + if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) + return 1; + + /* Now we have several options: In theory there is + * nothing else in the frame. KA9Q has an option to + * send data with the syn, BSD accepts data with the + * syn up to the [to be] advertised window and + * Solaris 2.1 gives you a protocol error. For now + * we just ignore it, that fits the spec precisely + * and avoids incompatibilities. It would be nice in + * future to drop through and process the data. + * + * Now that TTCP is starting to be used we ought to + * queue this data. + * But, this leaves one open to an easy denial of + * service attack, and SYN cookies can't defend + * against this problem. So, we drop the data + * in the interest of security over speed unless + * it's still in use. + */ + kfree_skb(skb); + return 0; + } + goto discard; + + case TCP_SYN_SENT: + queued = tcp_rcv_synsent_state_process(sk, skb, th, len); + if (queued >= 0) + return queued; + + /* Do step6 onward by hand. 
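/*
 * User-space sketch, not part of this patch: the "connect to self" case
 * mentioned above.  Binding a socket and connecting it to its own address
 * produces crossed SYNs, driving the simultaneous-open path
 * (SYN-SENT -> SYN-RECV -> ESTABLISHED) without any listener.  The port
 * number is an arbitrary assumed-free value; error handling is trimmed.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int self_connect_demo(void)
{
	struct sockaddr_in addr;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	addr.sin_port = htons(40000);

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		close(fd);
		return -1;
	}
	return fd;	/* now connected to itself via a simultaneous open */
}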
*/ + tcp_urg(sk, skb, th); + __kfree_skb(skb); + tcp_data_snd_check(sk); + return 0; + } + + res = tcp_validate_incoming(sk, skb, th, 0); + if (res <= 0) + return -res; + + /* step 5: check the ACK field */ + if (th->ack) { + int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0; + + switch (sk->sk_state) { + case TCP_SYN_RECV: + if (acceptable) { + tp->copied_seq = tp->rcv_nxt; + smp_mb(); + tcp_set_state(sk, TCP_ESTABLISHED); + sk->sk_state_change(sk); + + /* Note, that this wakeup is only for marginal + * crossed SYN case. Passively open sockets + * are not waked up, because sk->sk_sleep == + * NULL and sk->sk_socket == NULL. + */ + if (sk->sk_socket) + sk_wake_async(sk, + SOCK_WAKE_IO, POLL_OUT); + + tp->snd_una = TCP_SKB_CB(skb)->ack_seq; + tp->snd_wnd = ntohs(th->window) << + tp->rx_opt.snd_wscale; + tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); + + /* tcp_ack considers this ACK as duplicate + * and does not calculate rtt. + * Force it here. + */ + tcp_ack_update_rtt(sk, 0, 0); + + if (tp->rx_opt.tstamp_ok) + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; + + /* Make sure socket is routed, for + * correct metrics. + */ + icsk->icsk_af_ops->rebuild_header(sk); + + tcp_init_metrics(sk); + + tcp_init_congestion_control(sk); + + /* Prevent spurious tcp_cwnd_restart() on + * first data packet. + */ + tp->lsndtime = tcp_time_stamp; + + tcp_mtup_init(sk); + tcp_initialize_rcv_mss(sk); + tcp_init_buffer_space(sk); + tcp_fast_path_on(tp); + } else { + return 1; + } + break; + + case TCP_FIN_WAIT1: + if (tp->snd_una == tp->write_seq) { + tcp_set_state(sk, TCP_FIN_WAIT2); + sk->sk_shutdown |= SEND_SHUTDOWN; + dst_confirm(__sk_dst_get(sk)); + + if (!sock_flag(sk, SOCK_DEAD)) + /* Wake up lingering close() */ + sk->sk_state_change(sk); + else { + int tmo; + + if (tp->linger2 < 0 || + (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { + tcp_done(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); + return 1; + } + + tmo = tcp_fin_time(sk); + if (tmo > TCP_TIMEWAIT_LEN) { + inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + } else if (th->fin || sock_owned_by_user(sk)) { + /* Bad case. We could lose such FIN otherwise. + * It is not a big problem, but it looks confusing + * and not so rare event. We still can lose it now, + * if it spins in bh_lock_sock(), but it is really + * marginal case. + */ + inet_csk_reset_keepalive_timer(sk, tmo); + } else { + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto discard; + } + } + } + break; + + case TCP_CLOSING: + if (tp->snd_una == tp->write_seq) { + tcp_time_wait(sk, TCP_TIME_WAIT, 0); + goto discard; + } + break; + + case TCP_LAST_ACK: + if (tp->snd_una == tp->write_seq) { + tcp_update_metrics(sk); + tcp_done(sk); + goto discard; + } + break; + } + } else + goto discard; + + /* step 6: check the URG bit */ + tcp_urg(sk, skb, th); + + /* step 7: process the segment text */ + switch (sk->sk_state) { + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + case TCP_LAST_ACK: + if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) + break; + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + /* RFC 793 says to queue data in these states, + * RFC 1122 says we MUST send a reset. + * BSD 4.4 also does reset. 
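/*
 * Sketch, not part of this patch: the ACK-driven transitions handled in
 * "step 5" above, reduced to the case where the peer has acknowledged
 * everything we sent (snd_una == write_seq).  The enum is local to this
 * sketch and only mirrors the states the switch statement deals with.
 */
enum sketch_tcp_state {
	S_SYN_RECV, S_ESTABLISHED, S_FIN_WAIT1, S_FIN_WAIT2,
	S_CLOSING, S_TIME_WAIT, S_LAST_ACK, S_CLOSE,
};

static enum sketch_tcp_state on_ack_of_all_sent(enum sketch_tcp_state state)
{
	switch (state) {
	case S_SYN_RECV:	return S_ESTABLISHED;	/* our SYN/ACK was ACKed     */
	case S_FIN_WAIT1:	return S_FIN_WAIT2;	/* our FIN was ACKed         */
	case S_CLOSING:		return S_TIME_WAIT;	/* simultaneous close done   */
	case S_LAST_ACK:	return S_CLOSE;		/* passive close finished    */
	default:		return state;		/* no transition on this ACK */
	}
}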
+ */ + if (sk->sk_shutdown & RCV_SHUTDOWN) { + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); + tcp_reset(sk); + return 1; + } + } + /* Fall through */ + case TCP_ESTABLISHED: + tcp_data_queue(sk, skb); + queued = 1; + break; + } + + /* tcp_data could move socket to TIME-WAIT */ + if (sk->sk_state != TCP_CLOSE) { + tcp_data_snd_check(sk); + tcp_ack_snd_check(sk); + } + + if (!queued) { +discard: + __kfree_skb(skb); + } + return 0; +} +EXPORT_SYMBOL(tcp_rcv_state_process); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c new file mode 100644 index 00000000..53a5af66 --- /dev/null +++ b/net/ipv4/tcp_ipv4.c @@ -0,0 +1,2657 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * IPv4 specific functions + * + * + * code split from: + * linux/ipv4/tcp.c + * linux/ipv4/tcp_input.c + * linux/ipv4/tcp_output.c + * + * See tcp.c for author information + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * David S. Miller : New socket lookup architecture. + * This code is dedicated to John Dyson. + * David S. Miller : Change semantics of established hash, + * half is devoted to TIME_WAIT sockets + * and the rest go in the other half. + * Andi Kleen : Add support for syncookies and fixed + * some bugs: ip options weren't passed to + * the TCP layer, missed a check for an + * ACK bit. + * Andi Kleen : Implemented fast path mtu discovery. + * Fixed many serious bugs in the + * request_sock handling and moved + * most of it into the af independent code. + * Added tail drop and some other bugfixes. + * Added new listen semantics. + * Mike McLagan : Routing by source + * Juan Jose Ciarlante: ip_dynaddr bits + * Andi Kleen: various fixes. + * Vitaly E. Lavrov : Transparent proxy revived after year + * coma. + * Andi Kleen : Fix new listen. + * Andi Kleen : Fix accept error reporting. + * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which + * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind + * a single port at the same time. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +int sysctl_tcp_tw_reuse __read_mostly; +int sysctl_tcp_low_latency __read_mostly; +EXPORT_SYMBOL(sysctl_tcp_low_latency); + + +#ifdef CONFIG_TCP_MD5SIG +static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, + __be32 addr); +static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, + __be32 daddr, __be32 saddr, struct tcphdr *th); +#else +static inline +struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr) +{ + return NULL; +} +#endif + +struct inet_hashinfo tcp_hashinfo; +EXPORT_SYMBOL(tcp_hashinfo); + +static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb) +{ + return secure_tcp_sequence_number(ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source); +} + +int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) +{ + const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); + struct tcp_sock *tp = tcp_sk(sk); + + /* With PAWS, it is safe from the viewpoint + of data integrity. Even without PAWS it is safe provided sequence + spaces do not overlap i.e. at data rates <= 80Mbit/sec. + + Actually, the idea is close to VJ's one, only timestamp cache is + held not per host, but per port pair and TW bucket is used as state + holder. + + If TW bucket has been already destroyed we fall back to VJ's scheme + and use initial timestamp retrieved from peer table. + */ + if (tcptw->tw_ts_recent_stamp && + (twp == NULL || (sysctl_tcp_tw_reuse && + get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { + tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; + if (tp->write_seq == 0) + tp->write_seq = 1; + tp->rx_opt.ts_recent = tcptw->tw_ts_recent; + tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; + sock_hold(sktw); + return 1; + } + + return 0; +} +EXPORT_SYMBOL_GPL(tcp_twsk_unique); + +/* This will initiate an outgoing connection. 
*/ +int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; + struct inet_sock *inet = inet_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + __be16 orig_sport, orig_dport; + __be32 daddr, nexthop; + struct flowi4 *fl4; + struct rtable *rt; + int err; + struct ip_options_rcu *inet_opt; + + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + if (usin->sin_family != AF_INET) + return -EAFNOSUPPORT; + + nexthop = daddr = usin->sin_addr.s_addr; + inet_opt = rcu_dereference_protected(inet->inet_opt, + sock_owned_by_user(sk)); + if (inet_opt && inet_opt->opt.srr) { + if (!daddr) + return -EINVAL; + nexthop = inet_opt->opt.faddr; + } + + orig_sport = inet->inet_sport; + orig_dport = usin->sin_port; + fl4 = &inet->cork.fl.u.ip4; + rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, + RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, + IPPROTO_TCP, + orig_sport, orig_dport, sk, true); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + if (err == -ENETUNREACH) + IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); + return err; + } + + if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { + ip_rt_put(rt); + return -ENETUNREACH; + } + + if (!inet_opt || !inet_opt->opt.srr) + daddr = fl4->daddr; + + if (!inet->inet_saddr) + inet->inet_saddr = fl4->saddr; + inet->inet_rcv_saddr = inet->inet_saddr; + + if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { + /* Reset inherited state */ + tp->rx_opt.ts_recent = 0; + tp->rx_opt.ts_recent_stamp = 0; + tp->write_seq = 0; + } + + if (tcp_death_row.sysctl_tw_recycle && + !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) { + struct inet_peer *peer = rt_get_peer(rt, fl4->daddr); + /* + * VJ's idea. We save last timestamp seen from + * the destination in peer table, when entering state + * TIME-WAIT * and initialize rx_opt.ts_recent from it, + * when trying new connection. + */ + if (peer) { + inet_peer_refcheck(peer); + if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { + tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; + tp->rx_opt.ts_recent = peer->tcp_ts; + } + } + } + + inet->inet_dport = usin->sin_port; + inet->inet_daddr = daddr; + + inet_csk(sk)->icsk_ext_hdr_len = 0; + if (inet_opt) + inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; + + tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; + + /* Socket identity is still unknown (sport may be zero). + * However we set state to SYN-SENT and not releasing socket + * lock select source port, enter ourselves into the hash tables and + * complete initialization after this. + */ + tcp_set_state(sk, TCP_SYN_SENT); + err = inet_hash_connect(&tcp_death_row, sk); + if (err) + goto failure; + + rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, + inet->inet_sport, inet->inet_dport, sk); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; + goto failure; + } + /* OK, now commit destination to socket. */ + sk->sk_gso_type = SKB_GSO_TCPV4; + sk_setup_caps(sk, &rt->dst); + + if (!tp->write_seq) + tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, + inet->inet_daddr, + inet->inet_sport, + usin->sin_port); + + inet->inet_id = tp->write_seq ^ jiffies; + + err = tcp_connect(sk); + rt = NULL; + if (err) + goto failure; + + return 0; + +failure: + /* + * This unhashes the socket and releases the local port, + * if necessary. 
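/*
 * User-space sketch, not part of this patch: the non-blocking connect() that
 * drives tcp_v4_connect() above, making the SYN-SENT phase visible to the
 * application.  Address, port and timeout are illustrative; error handling
 * (including the poll() timeout case) is trimmed.
 */
#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <poll.h>
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int nonblocking_connect(const char *ip, uint16_t port)
{
	struct sockaddr_in dst;
	struct pollfd pfd;
	int err = 0;
	socklen_t len = sizeof(err);
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	fcntl(fd, F_SETFL, O_NONBLOCK);

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(port);
	inet_pton(AF_INET, ip, &dst.sin_addr);

	/* The SYN goes out here; the socket sits in SYN-SENT until the SYN/ACK. */
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0 &&
	    errno != EINPROGRESS) {
		close(fd);
		return -1;
	}

	pfd.fd = fd;
	pfd.events = POLLOUT;
	poll(&pfd, 1, 5000);			/* wait for the handshake */

	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
	if (err) {				/* e.g. ECONNREFUSED from an RST */
		close(fd);
		return -1;
	}
	return fd;				/* ESTABLISHED */
}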
+ */ + tcp_set_state(sk, TCP_CLOSE); + ip_rt_put(rt); + sk->sk_route_caps = 0; + inet->inet_dport = 0; + return err; +} +EXPORT_SYMBOL(tcp_v4_connect); + +/* + * This routine does path mtu discovery as defined in RFC1191. + */ +static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) +{ + struct dst_entry *dst; + struct inet_sock *inet = inet_sk(sk); + + /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs + * send out by Linux are always <576bytes so they should go through + * unfragmented). + */ + if (sk->sk_state == TCP_LISTEN) + return; + + /* We don't check in the destentry if pmtu discovery is forbidden + * on this route. We just assume that no packet_to_big packets + * are send back when pmtu discovery is not active. + * There is a small race when the user changes this flag in the + * route, but I think that's acceptable. + */ + if ((dst = __sk_dst_check(sk, 0)) == NULL) + return; + + dst->ops->update_pmtu(dst, mtu); + + /* Something is about to be wrong... Remember soft error + * for the case, if this connection will not able to recover. + */ + if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) + sk->sk_err_soft = EMSGSIZE; + + mtu = dst_mtu(dst); + + if (inet->pmtudisc != IP_PMTUDISC_DONT && + inet_csk(sk)->icsk_pmtu_cookie > mtu) { + tcp_sync_mss(sk, mtu); + + /* Resend the TCP packet because it's + * clear that the old packet has been + * dropped. This is the new "fast" path mtu + * discovery. + */ + tcp_simple_retransmit(sk); + } /* else let the usual retransmit timer handle it */ +} + +/* + * This routine is called by the ICMP module when it gets some + * sort of error condition. If err < 0 then the socket should + * be closed and the error returned to the user. If err > 0 + * it's just the icmp type << 8 | icmp code. After adjustment + * header points to the first 8 bytes of the tcp header. We need + * to find the appropriate port. + * + * The locking strategy used here is very "optimistic". When + * someone else accesses the socket the ICMP is just dropped + * and for some paths there is no check at all. + * A more general error queue to queue errors for later handling + * is probably better. + * + */ + +void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) +{ + const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; + struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); + struct inet_connection_sock *icsk; + struct tcp_sock *tp; + struct inet_sock *inet; + const int type = icmp_hdr(icmp_skb)->type; + const int code = icmp_hdr(icmp_skb)->code; + struct sock *sk; + struct sk_buff *skb; + __u32 seq; + __u32 remaining; + int err; + struct net *net = dev_net(icmp_skb->dev); + + if (icmp_skb->len < (iph->ihl << 2) + 8) { + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + return; + } + + sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest, + iph->saddr, th->source, inet_iif(icmp_skb)); + if (!sk) { + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + return; + } + if (sk->sk_state == TCP_TIME_WAIT) { + inet_twsk_put(inet_twsk(sk)); + return; + } + + bh_lock_sock(sk); + /* If too many ICMPs get dropped on busy + * servers this needs to be solved differently. 
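/*
 * User-space sketch, not part of this patch: observing the path MTU that
 * do_pmtu_discovery() above maintains.  IP_MTU_DISCOVER/IP_PMTUDISC_DO and
 * IP_MTU are Linux-specific socket options (header availability may vary);
 * the socket is assumed to be connected, and error handling is trimmed.
 */
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

static void show_path_mtu(int connected_fd)
{
	int pmtudisc = IP_PMTUDISC_DO;	/* set DF; let ICMP "frag needed" shrink the PMTU */
	int mtu = 0;
	socklen_t len = sizeof(mtu);

	setsockopt(connected_fd, IPPROTO_IP, IP_MTU_DISCOVER,
		   &pmtudisc, sizeof(pmtudisc));

	if (getsockopt(connected_fd, IPPROTO_IP, IP_MTU, &mtu, &len) == 0)
		printf("current path MTU: %d\n", mtu);
}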
+ */ + if (sock_owned_by_user(sk)) + NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); + + if (sk->sk_state == TCP_CLOSE) + goto out; + + if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + goto out; + } + + icsk = inet_csk(sk); + tp = tcp_sk(sk); + seq = ntohl(th->seq); + if (sk->sk_state != TCP_LISTEN && + !between(seq, tp->snd_una, tp->snd_nxt)) { + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + switch (type) { + case ICMP_SOURCE_QUENCH: + /* Just silently ignore these. */ + goto out; + case ICMP_PARAMETERPROB: + err = EPROTO; + break; + case ICMP_DEST_UNREACH: + if (code > NR_ICMP_UNREACH) + goto out; + + if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ + if (!sock_owned_by_user(sk)) + do_pmtu_discovery(sk, iph, info); + goto out; + } + + err = icmp_err_convert[code].errno; + /* check if icmp_skb allows revert of backoff + * (see draft-zimmermann-tcp-lcd) */ + if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) + break; + if (seq != tp->snd_una || !icsk->icsk_retransmits || + !icsk->icsk_backoff) + break; + + if (sock_owned_by_user(sk)) + break; + + icsk->icsk_backoff--; + inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) << + icsk->icsk_backoff; + tcp_bound_rto(sk); + + skb = tcp_write_queue_head(sk); + BUG_ON(!skb); + + remaining = icsk->icsk_rto - min(icsk->icsk_rto, + tcp_time_stamp - TCP_SKB_CB(skb)->when); + + if (remaining) { + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + remaining, TCP_RTO_MAX); + } else { + /* RTO revert clocked out retransmission. + * Will retransmit now */ + tcp_retransmit_timer(sk); + } + + break; + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + default: + goto out; + } + + switch (sk->sk_state) { + struct request_sock *req, **prev; + case TCP_LISTEN: + if (sock_owned_by_user(sk)) + goto out; + + req = inet_csk_search_req(sk, &prev, th->dest, + iph->daddr, iph->saddr); + if (!req) + goto out; + + /* ICMPs are not backlogged, hence we cannot get + an established socket here. + */ + WARN_ON(req->sk); + + if (seq != tcp_rsk(req)->snt_isn) { + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + /* + * Still in SYN_RECV, just remove it silently. + * There is no good way to pass the error to the newly + * created socket, and POSIX does not want network + * errors returned from accept(). + */ + inet_csk_reqsk_queue_drop(sk, req, prev); + goto out; + + case TCP_SYN_SENT: + case TCP_SYN_RECV: /* Cannot happen. + It can f.e. if SYNs crossed. + */ + if (!sock_owned_by_user(sk)) { + sk->sk_err = err; + + sk->sk_error_report(sk); + + tcp_done(sk); + } else { + sk->sk_err_soft = err; + } + goto out; + } + + /* If we've already connected we will keep trying + * until we time out, or the user gives up. + * + * rfc1122 4.2.3.9 allows to consider as hard errors + * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, + * but it is obsoleted by pmtu discovery). + * + * Note, that in modern internet, where routing is unreliable + * and in each dark corner broken firewalls sit, sending random + * errors ordered by their masters even this two messages finally lose + * their original sense (even Linux sends invalid PORT_UNREACHs) + * + * Now we are in compliance with RFCs. 
+ * --ANK (980905) + */ + + inet = inet_sk(sk); + if (!sock_owned_by_user(sk) && inet->recverr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } else { /* Only an error on timeout */ + sk->sk_err_soft = err; + } + +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +static void __tcp_v4_send_check(struct sk_buff *skb, + __be32 saddr, __be32 daddr) +{ + struct tcphdr *th = tcp_hdr(skb); + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); + } else { + th->check = tcp_v4_check(skb->len, saddr, daddr, + csum_partial(th, + th->doff << 2, + skb->csum)); + } +} + +/* This routine computes an IPv4 TCP checksum. */ +void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) +{ + struct inet_sock *inet = inet_sk(sk); + + __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); +} +EXPORT_SYMBOL(tcp_v4_send_check); + +int tcp_v4_gso_send_check(struct sk_buff *skb) +{ + const struct iphdr *iph; + struct tcphdr *th; + + if (!pskb_may_pull(skb, sizeof(*th))) + return -EINVAL; + + iph = ip_hdr(skb); + th = tcp_hdr(skb); + + th->check = 0; + skb->ip_summed = CHECKSUM_PARTIAL; + __tcp_v4_send_check(skb, iph->saddr, iph->daddr); + return 0; +} + +/* + * This routine will send an RST to the other tcp. + * + * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) + * for reset. + * Answer: if a packet caused RST, it is not for a socket + * existing in our system, if it is matched to a socket, + * it is just duplicate segment or bug in other side's TCP. + * So that we build reply only basing on parameters + * arrived with segment. + * Exception: precedence violation. We do not implement it in any case. + */ + +static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) +{ + struct tcphdr *th = tcp_hdr(skb); + struct { + struct tcphdr th; +#ifdef CONFIG_TCP_MD5SIG + __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)]; +#endif + } rep; + struct ip_reply_arg arg; +#ifdef CONFIG_TCP_MD5SIG + struct tcp_md5sig_key *key; +#endif + struct net *net; + + /* Never send a reset in response to a reset. */ + if (th->rst) + return; + + if (skb_rtable(skb)->rt_type != RTN_LOCAL) + return; + + /* Swap the send and the receive. */ + memset(&rep, 0, sizeof(rep)); + rep.th.dest = th->source; + rep.th.source = th->dest; + rep.th.doff = sizeof(struct tcphdr) / 4; + rep.th.rst = 1; + + if (th->ack) { + rep.th.seq = th->ack_seq; + } else { + rep.th.ack = 1; + rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + + skb->len - (th->doff << 2)); + } + + memset(&arg, 0, sizeof(arg)); + arg.iov[0].iov_base = (unsigned char *)&rep; + arg.iov[0].iov_len = sizeof(rep.th); + +#ifdef CONFIG_TCP_MD5SIG + key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->saddr) : NULL; + if (key) { + rep.opt[0] = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_MD5SIG << 8) | + TCPOLEN_MD5SIG); + /* Update length and the length the header thinks exists */ + arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; + rep.th.doff = arg.iov[0].iov_len / 4; + + tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], + key, ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, &rep.th); + } +#endif + arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, /* XXX */ + arg.iov[0].iov_len, IPPROTO_TCP, 0); + arg.csumoffset = offsetof(struct tcphdr, check) / 2; + arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; + /* When socket is gone, all binding information is lost. 
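/*
 * Standalone sketch, not part of this patch: the checksum filled in by
 * __tcp_v4_send_check() above, i.e. the 16-bit one's-complement sum over the
 * IPv4 pseudo-header (source, destination, zero-padded protocol, TCP length)
 * followed by the TCP header and payload.  Addresses are passed as raw bytes
 * in wire order; the check field of "segment" must be zero on input, and the
 * result is stored into the header with htons().
 */
#include <netinet/in.h>
#include <stddef.h>
#include <stdint.h>

static uint32_t sum_be16_words(const uint8_t *p, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)(p[i] << 8) | p[i + 1];
	if (len & 1)			/* odd length: pad with a zero byte */
		sum += (uint32_t)p[len - 1] << 8;
	return sum;
}

static uint16_t tcp_ipv4_checksum(const uint8_t saddr[4], const uint8_t daddr[4],
				  const uint8_t *segment, size_t seg_len)
{
	uint32_t sum = sum_be16_words(saddr, 4) + sum_be16_words(daddr, 4);

	sum += IPPROTO_TCP;		/* zero byte + protocol number       */
	sum += (uint32_t)seg_len;	/* TCP length: header plus payload   */
	sum += sum_be16_words(segment, seg_len);

	while (sum >> 16)		/* fold the carries back in ...      */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;		/* ... and take the one's complement */
}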
+ * routing might fail in this case. using iif for oif to + * make sure we can deliver it + */ + arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb); + + net = dev_net(skb_dst(skb)->dev); + ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, + &arg, arg.iov[0].iov_len); + + TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); + TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); +} + +/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states + outside socket context is ugly, certainly. What can I do? + */ + +static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, + u32 win, u32 ts, int oif, + struct tcp_md5sig_key *key, + int reply_flags) +{ + struct tcphdr *th = tcp_hdr(skb); + struct { + struct tcphdr th; + __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) +#ifdef CONFIG_TCP_MD5SIG + + (TCPOLEN_MD5SIG_ALIGNED >> 2) +#endif + ]; + } rep; + struct ip_reply_arg arg; + struct net *net = dev_net(skb_dst(skb)->dev); + + memset(&rep.th, 0, sizeof(struct tcphdr)); + memset(&arg, 0, sizeof(arg)); + + arg.iov[0].iov_base = (unsigned char *)&rep; + arg.iov[0].iov_len = sizeof(rep.th); + if (ts) { + rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + rep.opt[1] = htonl(tcp_time_stamp); + rep.opt[2] = htonl(ts); + arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; + } + + /* Swap the send and the receive. */ + rep.th.dest = th->source; + rep.th.source = th->dest; + rep.th.doff = arg.iov[0].iov_len / 4; + rep.th.seq = htonl(seq); + rep.th.ack_seq = htonl(ack); + rep.th.ack = 1; + rep.th.window = htons(win); + +#ifdef CONFIG_TCP_MD5SIG + if (key) { + int offset = (ts) ? 3 : 0; + + rep.opt[offset++] = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_MD5SIG << 8) | + TCPOLEN_MD5SIG); + arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED; + rep.th.doff = arg.iov[0].iov_len/4; + + tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset], + key, ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, &rep.th); + } +#endif + arg.flags = reply_flags; + arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, /* XXX */ + arg.iov[0].iov_len, IPPROTO_TCP, 0); + arg.csumoffset = offsetof(struct tcphdr, check) / 2; + if (oif) + arg.bound_dev_if = oif; + + ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, + &arg, arg.iov[0].iov_len); + + TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); +} + +static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) +{ + struct inet_timewait_sock *tw = inet_twsk(sk); + struct tcp_timewait_sock *tcptw = tcp_twsk(sk); + + tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, + tcptw->tw_ts_recent, + tw->tw_bound_dev_if, + tcp_twsk_md5_key(tcptw), + tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0 + ); + + inet_twsk_put(tw); +} + +static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, + struct request_sock *req) +{ + tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, + tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, + req->ts_recent, + 0, + tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr), + inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0); +} + +/* + * Send a SYN-ACK after having received a SYN. + * This still operates on a request_sock only, not on a big + * socket. + */ +static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, + struct request_sock *req, + struct request_values *rvp) +{ + const struct inet_request_sock *ireq = inet_rsk(req); + struct flowi4 fl4; + int err = -1; + struct sk_buff * skb; + + /* First, grab a route. 
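/*
 * Sketch, not part of this patch: the aligned timestamp option block built
 * by tcp_v4_send_ack() above.  Two NOPs pad the 10-byte option to 12 bytes
 * so the timestamp words stay 32-bit aligned.  The OPT_* constants below are
 * local to this sketch and mirror TCPOPT_NOP/TCPOPT_TIMESTAMP/
 * TCPOLEN_TIMESTAMP (kind 1, kind 8, length 10 from RFC 1323).
 */
#include <arpa/inet.h>
#include <stdint.h>

#define OPT_NOP			1
#define OPT_TIMESTAMP		8
#define OPTLEN_TIMESTAMP	10

static void build_timestamp_option(uint32_t out[3], uint32_t tsval, uint32_t tsecr)
{
	out[0] = htonl((OPT_NOP << 24) | (OPT_NOP << 16) |
		       (OPT_TIMESTAMP << 8) | OPTLEN_TIMESTAMP);
	out[1] = htonl(tsval);	/* our timestamp clock                      */
	out[2] = htonl(tsecr);	/* echo of the peer's most recent timestamp */
}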
*/ + if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) + return -1; + + skb = tcp_make_synack(sk, dst, req, rvp); + + if (skb) { + __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); + + err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, + ireq->rmt_addr, + ireq->opt); + err = net_xmit_eval(err); + } + + dst_release(dst); + return err; +} + +static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req, + struct request_values *rvp) +{ + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); + return tcp_v4_send_synack(sk, NULL, req, rvp); +} + +/* + * IPv4 request_sock destructor. + */ +static void tcp_v4_reqsk_destructor(struct request_sock *req) +{ + kfree(inet_rsk(req)->opt); +} + +static void syn_flood_warning(const struct sk_buff *skb) +{ + const char *msg; + +#ifdef CONFIG_SYN_COOKIES + if (sysctl_tcp_syncookies) + msg = "Sending cookies"; + else +#endif + msg = "Dropping request"; + + pr_info("TCP: Possible SYN flooding on port %d. %s.\n", + ntohs(tcp_hdr(skb)->dest), msg); +} + +/* + * Save and compile IPv4 options into the request_sock if needed. + */ +static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk, + struct sk_buff *skb) +{ + const struct ip_options *opt = &(IPCB(skb)->opt); + struct ip_options_rcu *dopt = NULL; + + if (opt && opt->optlen) { + int opt_size = sizeof(*dopt) + opt->optlen; + + dopt = kmalloc(opt_size, GFP_ATOMIC); + if (dopt) { + if (ip_options_echo(&dopt->opt, skb)) { + kfree(dopt); + dopt = NULL; + } + } + } + return dopt; +} + +#ifdef CONFIG_TCP_MD5SIG +/* + * RFC2385 MD5 checksumming requires a mapping of + * IP address->MD5 Key. + * We need to maintain these in the sk structure. + */ + +/* Find the Key structure for an address. */ +static struct tcp_md5sig_key * + tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr) +{ + struct tcp_sock *tp = tcp_sk(sk); + int i; + + if (!tp->md5sig_info || !tp->md5sig_info->entries4) + return NULL; + for (i = 0; i < tp->md5sig_info->entries4; i++) { + if (tp->md5sig_info->keys4[i].addr == addr) + return &tp->md5sig_info->keys4[i].base; + } + return NULL; +} + +struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk, + struct sock *addr_sk) +{ + return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr); +} +EXPORT_SYMBOL(tcp_v4_md5_lookup); + +static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk, + struct request_sock *req) +{ + return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr); +} + +/* This can be called on a newly created socket, from other files */ +int tcp_v4_md5_do_add(struct sock *sk, __be32 addr, + u8 *newkey, u8 newkeylen) +{ + /* Add Key to the list */ + struct tcp_md5sig_key *key; + struct tcp_sock *tp = tcp_sk(sk); + struct tcp4_md5sig_key *keys; + + key = tcp_v4_md5_do_lookup(sk, addr); + if (key) { + /* Pre-existing entry - just update that one. 
*/ + kfree(key->key); + key->key = newkey; + key->keylen = newkeylen; + } else { + struct tcp_md5sig_info *md5sig; + + if (!tp->md5sig_info) { + tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info), + GFP_ATOMIC); + if (!tp->md5sig_info) { + kfree(newkey); + return -ENOMEM; + } + sk_nocaps_add(sk, NETIF_F_GSO_MASK); + } + + md5sig = tp->md5sig_info; + if (md5sig->entries4 == 0 && + tcp_alloc_md5sig_pool(sk) == NULL) { + kfree(newkey); + return -ENOMEM; + } + + if (md5sig->alloced4 == md5sig->entries4) { + keys = kmalloc((sizeof(*keys) * + (md5sig->entries4 + 1)), GFP_ATOMIC); + if (!keys) { + kfree(newkey); + if (md5sig->entries4 == 0) + tcp_free_md5sig_pool(); + return -ENOMEM; + } + + if (md5sig->entries4) + memcpy(keys, md5sig->keys4, + sizeof(*keys) * md5sig->entries4); + + /* Free old key list, and reference new one */ + kfree(md5sig->keys4); + md5sig->keys4 = keys; + md5sig->alloced4++; + } + md5sig->entries4++; + md5sig->keys4[md5sig->entries4 - 1].addr = addr; + md5sig->keys4[md5sig->entries4 - 1].base.key = newkey; + md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen; + } + return 0; +} +EXPORT_SYMBOL(tcp_v4_md5_do_add); + +static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk, + u8 *newkey, u8 newkeylen) +{ + return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr, + newkey, newkeylen); +} + +int tcp_v4_md5_do_del(struct sock *sk, __be32 addr) +{ + struct tcp_sock *tp = tcp_sk(sk); + int i; + + for (i = 0; i < tp->md5sig_info->entries4; i++) { + if (tp->md5sig_info->keys4[i].addr == addr) { + /* Free the key */ + kfree(tp->md5sig_info->keys4[i].base.key); + tp->md5sig_info->entries4--; + + if (tp->md5sig_info->entries4 == 0) { + kfree(tp->md5sig_info->keys4); + tp->md5sig_info->keys4 = NULL; + tp->md5sig_info->alloced4 = 0; + tcp_free_md5sig_pool(); + } else if (tp->md5sig_info->entries4 != i) { + /* Need to do some manipulation */ + memmove(&tp->md5sig_info->keys4[i], + &tp->md5sig_info->keys4[i+1], + (tp->md5sig_info->entries4 - i) * + sizeof(struct tcp4_md5sig_key)); + } + return 0; + } + } + return -ENOENT; +} +EXPORT_SYMBOL(tcp_v4_md5_do_del); + +static void tcp_v4_clear_md5_list(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Free each key, then the set of key keys, + * the crypto element, and then decrement our + * hold on the last resort crypto. 
+ */ + if (tp->md5sig_info->entries4) { + int i; + for (i = 0; i < tp->md5sig_info->entries4; i++) + kfree(tp->md5sig_info->keys4[i].base.key); + tp->md5sig_info->entries4 = 0; + tcp_free_md5sig_pool(); + } + if (tp->md5sig_info->keys4) { + kfree(tp->md5sig_info->keys4); + tp->md5sig_info->keys4 = NULL; + tp->md5sig_info->alloced4 = 0; + } +} + +static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, + int optlen) +{ + struct tcp_md5sig cmd; + struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; + u8 *newkey; + + if (optlen < sizeof(cmd)) + return -EINVAL; + + if (copy_from_user(&cmd, optval, sizeof(cmd))) + return -EFAULT; + + if (sin->sin_family != AF_INET) + return -EINVAL; + + if (!cmd.tcpm_key || !cmd.tcpm_keylen) { + if (!tcp_sk(sk)->md5sig_info) + return -ENOENT; + return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr); + } + + if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) + return -EINVAL; + + if (!tcp_sk(sk)->md5sig_info) { + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_md5sig_info *p; + + p = kzalloc(sizeof(*p), sk->sk_allocation); + if (!p) + return -EINVAL; + + tp->md5sig_info = p; + sk_nocaps_add(sk, NETIF_F_GSO_MASK); + } + + newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation); + if (!newkey) + return -ENOMEM; + return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr, + newkey, cmd.tcpm_keylen); +} + +static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, + __be32 daddr, __be32 saddr, int nbytes) +{ + struct tcp4_pseudohdr *bp; + struct scatterlist sg; + + bp = &hp->md5_blk.ip4; + + /* + * 1. the TCP pseudo-header (in the order: source IP address, + * destination IP address, zero-padded protocol number, and + * segment length) + */ + bp->saddr = saddr; + bp->daddr = daddr; + bp->pad = 0; + bp->protocol = IPPROTO_TCP; + bp->len = cpu_to_be16(nbytes); + + sg_init_one(&sg, bp, sizeof(*bp)); + return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp)); +} + +static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, + __be32 daddr, __be32 saddr, struct tcphdr *th) +{ + struct tcp_md5sig_pool *hp; + struct hash_desc *desc; + + hp = tcp_get_md5sig_pool(); + if (!hp) + goto clear_hash_noput; + desc = &hp->md5_desc; + + if (crypto_hash_init(desc)) + goto clear_hash; + if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2)) + goto clear_hash; + if (tcp_md5_hash_header(hp, th)) + goto clear_hash; + if (tcp_md5_hash_key(hp, key)) + goto clear_hash; + if (crypto_hash_final(desc, md5_hash)) + goto clear_hash; + + tcp_put_md5sig_pool(); + return 0; + +clear_hash: + tcp_put_md5sig_pool(); +clear_hash_noput: + memset(md5_hash, 0, 16); + return 1; +} + +int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, + struct sock *sk, struct request_sock *req, + struct sk_buff *skb) +{ + struct tcp_md5sig_pool *hp; + struct hash_desc *desc; + struct tcphdr *th = tcp_hdr(skb); + __be32 saddr, daddr; + + if (sk) { + saddr = inet_sk(sk)->inet_saddr; + daddr = inet_sk(sk)->inet_daddr; + } else if (req) { + saddr = inet_rsk(req)->loc_addr; + daddr = inet_rsk(req)->rmt_addr; + } else { + const struct iphdr *iph = ip_hdr(skb); + saddr = iph->saddr; + daddr = iph->daddr; + } + + hp = tcp_get_md5sig_pool(); + if (!hp) + goto clear_hash_noput; + desc = &hp->md5_desc; + + if (crypto_hash_init(desc)) + goto clear_hash; + + if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len)) + goto clear_hash; + if (tcp_md5_hash_header(hp, th)) + goto clear_hash; + if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) + goto clear_hash; + if 
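/*
 * User-space sketch, not part of this patch: installing an RFC 2385 key with
 * the TCP_MD5SIG socket option, which tcp_v4_parse_md5_keys() above services.
 * struct tcp_md5sig and TCP_MD5SIG_MAXKEYLEN come from <linux/tcp.h>; the
 * peer address and key are illustrative, and error handling is trimmed.
 */
#include <arpa/inet.h>
#include <linux/tcp.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static int install_md5_key(int fd, const char *peer_ip, const char *key)
{
	struct tcp_md5sig md5;
	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;

	memset(&md5, 0, sizeof(md5));
	sin->sin_family = AF_INET;
	inet_pton(AF_INET, peer_ip, &sin->sin_addr);

	md5.tcpm_keylen = (__u16)strlen(key);	/* must be <= TCP_MD5SIG_MAXKEYLEN */
	memcpy(md5.tcpm_key, key, md5.tcpm_keylen);

	/* A zero key length deletes the key for this peer instead. */
	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}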
(tcp_md5_hash_key(hp, key)) + goto clear_hash; + if (crypto_hash_final(desc, md5_hash)) + goto clear_hash; + + tcp_put_md5sig_pool(); + return 0; + +clear_hash: + tcp_put_md5sig_pool(); +clear_hash_noput: + memset(md5_hash, 0, 16); + return 1; +} +EXPORT_SYMBOL(tcp_v4_md5_hash_skb); + +static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb) +{ + /* + * This gets called for each TCP segment that arrives + * so we want to be efficient. + * We have 3 drop cases: + * o No MD5 hash and one expected. + * o MD5 hash and we're not expecting one. + * o MD5 hash and its wrong. + */ + __u8 *hash_location = NULL; + struct tcp_md5sig_key *hash_expected; + const struct iphdr *iph = ip_hdr(skb); + struct tcphdr *th = tcp_hdr(skb); + int genhash; + unsigned char newhash[16]; + + hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr); + hash_location = tcp_parse_md5sig_option(th); + + /* We've parsed the options - do we have a hash? */ + if (!hash_expected && !hash_location) + return 0; + + if (hash_expected && !hash_location) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); + return 1; + } + + if (!hash_expected && hash_location) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); + return 1; + } + + /* Okay, so this is hash_expected and hash_location - + * so we need to calculate the checksum. + */ + genhash = tcp_v4_md5_hash_skb(newhash, + hash_expected, + NULL, NULL, skb); + + if (genhash || memcmp(hash_location, newhash, 16) != 0) { + if (net_ratelimit()) { + printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", + &iph->saddr, ntohs(th->source), + &iph->daddr, ntohs(th->dest), + genhash ? " tcp_v4_calc_md5_hash failed" : ""); + } + return 1; + } + return 0; +} + +#endif + +struct request_sock_ops tcp_request_sock_ops __read_mostly = { + .family = PF_INET, + .obj_size = sizeof(struct tcp_request_sock), + .rtx_syn_ack = tcp_v4_rtx_synack, + .send_ack = tcp_v4_reqsk_send_ack, + .destructor = tcp_v4_reqsk_destructor, + .send_reset = tcp_v4_send_reset, + .syn_ack_timeout = tcp_syn_ack_timeout, +}; + +#ifdef CONFIG_TCP_MD5SIG +static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { + .md5_lookup = tcp_v4_reqsk_md5_lookup, + .calc_md5_hash = tcp_v4_md5_hash_skb, +}; +#endif + +int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_extend_values tmp_ext; + struct tcp_options_received tmp_opt; + u8 *hash_location; + struct request_sock *req; + struct inet_request_sock *ireq; + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = NULL; + __be32 saddr = ip_hdr(skb)->saddr; + __be32 daddr = ip_hdr(skb)->daddr; + __u32 isn = TCP_SKB_CB(skb)->when; +#ifdef CONFIG_SYN_COOKIES + int want_cookie = 0; +#else +#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */ +#endif + + /* Never answer to SYNs send to broadcast or multicast */ + if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + goto drop; + + /* TW buckets are converted to open requests without + * limitations, they conserve resources and peer is + * evidently real one. + */ + if (inet_csk_reqsk_queue_is_full(sk) && !isn) { + if (net_ratelimit()) + syn_flood_warning(skb); +#ifdef CONFIG_SYN_COOKIES + if (sysctl_tcp_syncookies) { + want_cookie = 1; + } else +#endif + goto drop; + } + + /* Accept backlog is full. If we have already queued enough + * of warm entries in syn queue, drop request. It is better than + * clogging syn queue with openreqs with exponentially increasing + * timeout. 
+ */ + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) + goto drop; + + req = inet_reqsk_alloc(&tcp_request_sock_ops); + if (!req) + goto drop; + +#ifdef CONFIG_TCP_MD5SIG + tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops; +#endif + + tcp_clear_options(&tmp_opt); + tmp_opt.mss_clamp = TCP_MSS_DEFAULT; + tmp_opt.user_mss = tp->rx_opt.user_mss; + tcp_parse_options(skb, &tmp_opt, &hash_location, 0); + + if (tmp_opt.cookie_plus > 0 && + tmp_opt.saw_tstamp && + !tp->rx_opt.cookie_out_never && + (sysctl_tcp_cookie_size > 0 || + (tp->cookie_values != NULL && + tp->cookie_values->cookie_desired > 0))) { + u8 *c; + u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS]; + int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE; + + if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0) + goto drop_and_release; + + /* Secret recipe starts with IP addresses */ + *mess++ ^= (__force u32)daddr; + *mess++ ^= (__force u32)saddr; + + /* plus variable length Initiator Cookie */ + c = (u8 *)mess; + while (l-- > 0) + *c++ ^= *hash_location++; + +#ifdef CONFIG_SYN_COOKIES + want_cookie = 0; /* not our kind of cookie */ +#endif + tmp_ext.cookie_out_never = 0; /* false */ + tmp_ext.cookie_plus = tmp_opt.cookie_plus; + } else if (!tp->rx_opt.cookie_in_always) { + /* redundant indications, but ensure initialization. */ + tmp_ext.cookie_out_never = 1; /* true */ + tmp_ext.cookie_plus = 0; + } else { + goto drop_and_release; + } + tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always; + + if (want_cookie && !tmp_opt.saw_tstamp) + tcp_clear_options(&tmp_opt); + + tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; + tcp_openreq_init(req, &tmp_opt, skb); + + ireq = inet_rsk(req); + ireq->loc_addr = daddr; + ireq->rmt_addr = saddr; + ireq->no_srccheck = inet_sk(sk)->transparent; + ireq->opt = tcp_v4_save_options(sk, skb); + + if (security_inet_conn_request(sk, skb, req)) + goto drop_and_free; + + if (!want_cookie || tmp_opt.tstamp_ok) + TCP_ECN_create_request(req, tcp_hdr(skb)); + + if (want_cookie) { + isn = cookie_v4_init_sequence(sk, skb, &req->mss); + req->cookie_ts = tmp_opt.tstamp_ok; + } else if (!isn) { + struct inet_peer *peer = NULL; + struct flowi4 fl4; + + /* VJ's idea. We save last timestamp seen + * from the destination in peer table, when entering + * state TIME-WAIT, and check against it before + * accepting new connection request. + * + * If "isn" is not zero, this request hit alive + * timewait bucket, so that all the necessary checks + * are made in the function processing timewait state. + */ + if (tmp_opt.saw_tstamp && + tcp_death_row.sysctl_tw_recycle && + (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && + fl4.daddr == saddr && + (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) { + inet_peer_refcheck(peer); + if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && + (s32)(peer->tcp_ts - req->ts_recent) > + TCP_PAWS_WINDOW) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); + goto drop_and_release; + } + } + /* Kill the following clause, if you dislike this way. */ + else if (!sysctl_tcp_syncookies && + (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < + (sysctl_max_syn_backlog >> 2)) && + (!peer || !peer->tcp_ts_stamp) && + (!dst || !dst_metric(dst, RTAX_RTT))) { + /* Without syncookies last quarter of + * backlog is filled with destinations, + * proven to be alive. + * It means that we continue to communicate + * to destinations, already remembered + * to the moment of synflood. 
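/*
 * User-space sketch, not part of this patch: the listen backlog that the
 * sk_acceptq_is_full() check above enforces.  Completed connections queue on
 * the listener until accept(); the backlog argument is further capped by the
 * net.core.somaxconn sysctl.  Port and backlog values are illustrative, and
 * error handling is trimmed.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int make_listener(uint16_t port, int backlog)
{
	struct sockaddr_in addr;
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(port);

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(fd, backlog) < 0) {		/* backlog bounds the accept queue */
		close(fd);
		return -1;
	}
	return fd;
}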
+ */ + LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n", + &saddr, ntohs(tcp_hdr(skb)->source)); + goto drop_and_release; + } + + isn = tcp_v4_init_sequence(skb); + } + tcp_rsk(req)->snt_isn = isn; + + if (tcp_v4_send_synack(sk, dst, req, + (struct request_values *)&tmp_ext) || + want_cookie) + goto drop_and_free; + + inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + return 0; + +drop_and_release: + dst_release(dst); +drop_and_free: + reqsk_free(req); +drop: + return 0; +} +EXPORT_SYMBOL(tcp_v4_conn_request); + + +/* + * The three way handshake has completed - we got a valid synack - + * now create the new socket. + */ +struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) +{ + struct inet_request_sock *ireq; + struct inet_sock *newinet; + struct tcp_sock *newtp; + struct sock *newsk; +#ifdef CONFIG_TCP_MD5SIG + struct tcp_md5sig_key *key; +#endif + struct ip_options_rcu *inet_opt; + + if (sk_acceptq_is_full(sk)) + goto exit_overflow; + + newsk = tcp_create_openreq_child(sk, req, skb); + if (!newsk) + goto exit_nonewsk; + + newsk->sk_gso_type = SKB_GSO_TCPV4; + + newtp = tcp_sk(newsk); + newinet = inet_sk(newsk); + ireq = inet_rsk(req); + newinet->inet_daddr = ireq->rmt_addr; + newinet->inet_rcv_saddr = ireq->loc_addr; + newinet->inet_saddr = ireq->loc_addr; + inet_opt = ireq->opt; + rcu_assign_pointer(newinet->inet_opt, inet_opt); + ireq->opt = NULL; + newinet->mc_index = inet_iif(skb); + newinet->mc_ttl = ip_hdr(skb)->ttl; + inet_csk(newsk)->icsk_ext_hdr_len = 0; + if (inet_opt) + inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; + newinet->inet_id = newtp->write_seq ^ jiffies; + + if (!dst) { + dst = inet_csk_route_child_sock(sk, newsk, req); + if (!dst) + goto put_and_exit; + } else { + /* syncookie case : see end of cookie_v4_check() */ + } + sk_setup_caps(newsk, dst); + + tcp_mtup_init(newsk); + tcp_sync_mss(newsk, dst_mtu(dst)); + newtp->advmss = dst_metric_advmss(dst); + if (tcp_sk(sk)->rx_opt.user_mss && + tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) + newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; + + tcp_initialize_rcv_mss(newsk); + +#ifdef CONFIG_TCP_MD5SIG + /* Copy over the MD5 key from the original socket */ + key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr); + if (key != NULL) { + /* + * We're using one, so create a matching key + * on the newsk structure. If we fail to get + * memory, then we end up not copying the key + * across. Shucks. + */ + char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC); + if (newkey != NULL) + tcp_v4_md5_do_add(newsk, newinet->inet_daddr, + newkey, key->keylen); + sk_nocaps_add(newsk, NETIF_F_GSO_MASK); + } +#endif + + if (__inet_inherit_port(sk, newsk) < 0) + goto put_and_exit; + __inet_hash_nolisten(newsk, NULL); + + return newsk; + +exit_overflow: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); +exit_nonewsk: + dst_release(dst); +exit: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + return NULL; +put_and_exit: + sock_put(newsk); + goto exit; +} +EXPORT_SYMBOL(tcp_v4_syn_recv_sock); + +static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) +{ + struct tcphdr *th = tcp_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); + struct sock *nsk; + struct request_sock **prev; + /* Find possible connection requests. 
*/ + struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, + iph->saddr, iph->daddr); + if (req) + return tcp_check_req(sk, skb, req, prev); + + nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, + th->source, iph->daddr, th->dest, inet_iif(skb)); + + if (nsk) { + if (nsk->sk_state != TCP_TIME_WAIT) { + bh_lock_sock(nsk); + return nsk; + } + inet_twsk_put(inet_twsk(nsk)); + return NULL; + } + +#ifdef CONFIG_SYN_COOKIES + if (!th->syn) + sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); +#endif + return sk; +} + +static __sum16 tcp_v4_checksum_init(struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + + if (skb->ip_summed == CHECKSUM_COMPLETE) { + if (!tcp_v4_check(skb->len, iph->saddr, + iph->daddr, skb->csum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + return 0; + } + } + + skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, + skb->len, IPPROTO_TCP, 0); + + if (skb->len <= 76) { + return __skb_checksum_complete(skb); + } + return 0; +} + + +/* The socket must have it's spinlock held when we get + * here. + * + * We have a potential double-lock case here, so even when + * doing backlog processing we use the BH locking scheme. + * This is because we cannot sleep with the original spinlock + * held. + */ +int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct sock *rsk; +#ifdef CONFIG_TCP_MD5SIG + /* + * We really want to reject the packet as early as possible + * if: + * o We're expecting an MD5'd packet and this is no MD5 tcp option + * o There is an MD5 option and we're not expecting one + */ + if (tcp_v4_inbound_md5_hash(sk, skb)) + goto discard; +#endif + + if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ + sock_rps_save_rxhash(sk, skb->rxhash); + if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { + rsk = sk; + goto reset; + } + return 0; + } + + if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) + goto csum_err; + + if (sk->sk_state == TCP_LISTEN) { + struct sock *nsk = tcp_v4_hnd_req(sk, skb); + if (!nsk) + goto discard; + + if (nsk != sk) { + sock_rps_save_rxhash(nsk, skb->rxhash); + if (tcp_child_process(sk, nsk, skb)) { + rsk = nsk; + goto reset; + } + return 0; + } + } else + sock_rps_save_rxhash(sk, skb->rxhash); + + if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { + rsk = sk; + goto reset; + } + return 0; + +reset: + tcp_v4_send_reset(rsk, skb); +discard: + kfree_skb(skb); + /* Be careful here. If this function gets more complicated and + * gcc suffers from register pressure on the x86, sk (in %ebx) + * might be destroyed here. This current version compiles correctly, + * but you have been warned. + */ + return 0; + +csum_err: + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); + goto discard; +} +EXPORT_SYMBOL(tcp_v4_do_rcv); + +/* + * From tcp_input.c + */ + +int tcp_v4_rcv(struct sk_buff *skb) +{ + const struct iphdr *iph; + struct tcphdr *th; + struct sock *sk; + int ret; + struct net *net = dev_net(skb->dev); + + if (skb->pkt_type != PACKET_HOST) + goto discard_it; + + /* Count it even if it's bad */ + TCP_INC_STATS_BH(net, TCP_MIB_INSEGS); + + if (!pskb_may_pull(skb, sizeof(struct tcphdr))) + goto discard_it; + + th = tcp_hdr(skb); + + if (th->doff < sizeof(struct tcphdr) / 4) + goto bad_packet; + if (!pskb_may_pull(skb, th->doff * 4)) + goto discard_it; + + /* An explanation is required here, I think. + * Packet length and doff are validated by header prediction, + * provided case of th->doff==0 is eliminated. + * So, we defer the checks. 
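+ * (For example, a pure SYN with no payload gets end_seq == seq + 1
+ * from the computation below, since SYN and FIN each consume one
+ * unit of sequence space.)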
*/ + if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb)) + goto bad_packet; + + th = tcp_hdr(skb); + iph = ip_hdr(skb); + TCP_SKB_CB(skb)->seq = ntohl(th->seq); + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + + skb->len - th->doff * 4); + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + TCP_SKB_CB(skb)->when = 0; + TCP_SKB_CB(skb)->flags = iph->tos; + TCP_SKB_CB(skb)->sacked = 0; + + sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); + if (!sk) + goto no_tcp_socket; + +process: + if (sk->sk_state == TCP_TIME_WAIT) + goto do_time_wait; + + if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + goto discard_and_relse; + } + + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) + goto discard_and_relse; + nf_reset(skb); + + if (sk_filter(sk, skb)) + goto discard_and_relse; + + skb->dev = NULL; + + bh_lock_sock_nested(sk); + ret = 0; + if (!sock_owned_by_user(sk)) { +#ifdef CONFIG_NET_DMA + struct tcp_sock *tp = tcp_sk(sk); + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) + tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); + if (tp->ucopy.dma_chan) + ret = tcp_v4_do_rcv(sk, skb); + else +#endif + { + if (!tcp_prequeue(sk, skb)) + ret = tcp_v4_do_rcv(sk, skb); + } + } else if (unlikely(sk_add_backlog(sk, skb))) { + bh_unlock_sock(sk); + NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); + goto discard_and_relse; + } + bh_unlock_sock(sk); + + sock_put(sk); + + return ret; + +no_tcp_socket: + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard_it; + + if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { +bad_packet: + TCP_INC_STATS_BH(net, TCP_MIB_INERRS); + } else { + tcp_v4_send_reset(NULL, skb); + } + +discard_it: + /* Discard frame. 
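+ * Common exit for malformed segments, failed socket lookups and the
+ * TIME-WAIT drop cases: the skb is freed and 0 is returned, so the
+ * caller never retries delivery.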
*/ + kfree_skb(skb); + return 0; + +discard_and_relse: + sock_put(sk); + goto discard_it; + +do_time_wait: + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + inet_twsk_put(inet_twsk(sk)); + goto discard_it; + } + + if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { + TCP_INC_STATS_BH(net, TCP_MIB_INERRS); + inet_twsk_put(inet_twsk(sk)); + goto discard_it; + } + switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { + case TCP_TW_SYN: { + struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), + &tcp_hashinfo, + iph->daddr, th->dest, + inet_iif(skb)); + if (sk2) { + inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); + inet_twsk_put(inet_twsk(sk)); + sk = sk2; + goto process; + } + /* Fall through to ACK */ + } + case TCP_TW_ACK: + tcp_v4_timewait_ack(sk, skb); + break; + case TCP_TW_RST: + goto no_tcp_socket; + case TCP_TW_SUCCESS:; + } + goto discard_it; +} + +struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it) +{ + struct rtable *rt = (struct rtable *) __sk_dst_get(sk); + struct inet_sock *inet = inet_sk(sk); + struct inet_peer *peer; + + if (!rt || + inet->cork.fl.u.ip4.daddr != inet->inet_daddr) { + peer = inet_getpeer_v4(inet->inet_daddr, 1); + *release_it = true; + } else { + if (!rt->peer) + rt_bind_peer(rt, inet->inet_daddr, 1); + peer = rt->peer; + *release_it = false; + } + + return peer; +} +EXPORT_SYMBOL(tcp_v4_get_peer); + +void *tcp_v4_tw_get_peer(struct sock *sk) +{ + struct inet_timewait_sock *tw = inet_twsk(sk); + + return inet_getpeer_v4(tw->tw_daddr, 1); +} +EXPORT_SYMBOL(tcp_v4_tw_get_peer); + +static struct timewait_sock_ops tcp_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct tcp_timewait_sock), + .twsk_unique = tcp_twsk_unique, + .twsk_destructor= tcp_twsk_destructor, + .twsk_getpeer = tcp_v4_tw_get_peer, +}; + +const struct inet_connection_sock_af_ops ipv4_specific = { + .queue_xmit = ip_queue_xmit, + .send_check = tcp_v4_send_check, + .rebuild_header = inet_sk_rebuild_header, + .conn_request = tcp_v4_conn_request, + .syn_recv_sock = tcp_v4_syn_recv_sock, + .get_peer = tcp_v4_get_peer, + .net_header_len = sizeof(struct iphdr), + .setsockopt = ip_setsockopt, + .getsockopt = ip_getsockopt, + .addr2sockaddr = inet_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in), + .bind_conflict = inet_csk_bind_conflict, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_ip_setsockopt, + .compat_getsockopt = compat_ip_getsockopt, +#endif +}; +EXPORT_SYMBOL(ipv4_specific); + +#ifdef CONFIG_TCP_MD5SIG +static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { + .md5_lookup = tcp_v4_md5_lookup, + .calc_md5_hash = tcp_v4_md5_hash_skb, + .md5_add = tcp_v4_md5_add_func, + .md5_parse = tcp_v4_parse_md5_keys, +}; +#endif + +/* NOTE: A lot of things set to zero explicitly by call to + * sk_alloc() so need not be done here. + */ +static int tcp_v4_init_sock(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + + skb_queue_head_init(&tp->out_of_order_queue); + tcp_init_xmit_timers(sk); + tcp_prequeue_init(tp); + + icsk->icsk_rto = TCP_TIMEOUT_INIT; + tp->mdev = TCP_TIMEOUT_INIT; + + /* So many TCP implementations out there (incorrectly) count the + * initial SYN frame in their delayed-ACK and congestion control + * algorithms that we must have the following bandaid to talk + * efficiently to them. -DaveM + */ + tp->snd_cwnd = 2; + + /* See draft-stevens-tcpca-spec-01 for discussion of the + * initialization of these values. 
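+ * (With snd_ssthresh left effectively unbounded below, the socket
+ * stays in slow start until the first loss; a snd_cwnd_clamp of ~0
+ * likewise imposes no ceiling on the congestion window.)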
+ */ + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + tp->snd_cwnd_clamp = ~0; + tp->mss_cache = TCP_MSS_DEFAULT; + + tp->reordering = sysctl_tcp_reordering; + icsk->icsk_ca_ops = &tcp_init_congestion_ops; + + sk->sk_state = TCP_CLOSE; + + sk->sk_write_space = sk_stream_write_space; + sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + + icsk->icsk_af_ops = &ipv4_specific; + icsk->icsk_sync_mss = tcp_sync_mss; +#ifdef CONFIG_TCP_MD5SIG + tp->af_specific = &tcp_sock_ipv4_specific; +#endif + + /* TCP Cookie Transactions */ + if (sysctl_tcp_cookie_size > 0) { + /* Default, cookies without s_data_payload. */ + tp->cookie_values = + kzalloc(sizeof(*tp->cookie_values), + sk->sk_allocation); + if (tp->cookie_values != NULL) + kref_init(&tp->cookie_values->kref); + } + /* Presumed zeroed, in order of appearance: + * cookie_in_always, cookie_out_never, + * s_data_constant, s_data_in, s_data_out + */ + sk->sk_sndbuf = sysctl_tcp_wmem[1]; + sk->sk_rcvbuf = sysctl_tcp_rmem[1]; + + local_bh_disable(); + percpu_counter_inc(&tcp_sockets_allocated); + local_bh_enable(); + + return 0; +} + +void tcp_v4_destroy_sock(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_clear_xmit_timers(sk); + + tcp_cleanup_congestion_control(sk); + + /* Cleanup up the write buffer. */ + tcp_write_queue_purge(sk); + + /* Cleans up our, hopefully empty, out_of_order_queue. */ + __skb_queue_purge(&tp->out_of_order_queue); + +#ifdef CONFIG_TCP_MD5SIG + /* Clean up the MD5 key list, if any */ + if (tp->md5sig_info) { + tcp_v4_clear_md5_list(sk); + kfree(tp->md5sig_info); + tp->md5sig_info = NULL; + } +#endif + +#ifdef CONFIG_NET_DMA + /* Cleans up our sk_async_wait_queue */ + __skb_queue_purge(&sk->sk_async_wait_queue); +#endif + + /* Clean prequeue, it must be empty really */ + __skb_queue_purge(&tp->ucopy.prequeue); + + /* Clean up a referenced TCP bind bucket. */ + if (inet_csk(sk)->icsk_bind_hash) + inet_put_port(sk); + + /* + * If sendmsg cached page exists, toss it. + */ + if (sk->sk_sndmsg_page) { + __free_page(sk->sk_sndmsg_page); + sk->sk_sndmsg_page = NULL; + } + + /* TCP Cookie Transactions */ + if (tp->cookie_values != NULL) { + kref_put(&tp->cookie_values->kref, + tcp_cookie_values_release); + tp->cookie_values = NULL; + } + + percpu_counter_dec(&tcp_sockets_allocated); +} +EXPORT_SYMBOL(tcp_v4_destroy_sock); + +#ifdef CONFIG_PROC_FS +/* Proc filesystem TCP sock list dumping. */ + +static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head) +{ + return hlist_nulls_empty(head) ? NULL : + list_entry(head->first, struct inet_timewait_sock, tw_node); +} + +static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) +{ + return !is_a_nulls(tw->tw_node.next) ? + hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; +} + +/* + * Get next listener socket follow cur. If cur is NULL, get first socket + * starting from bucket given in st->bucket; when st->bucket is zero the + * very first socket in the hash table is returned. 
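+ * st->num counts entries emitted so far and st->offset the position
+ * inside the current bucket, so tcp_seek_last_pos() can resume an
+ * interrupted dump without rescanning every bucket from the start.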
+ */ +static void *listening_get_next(struct seq_file *seq, void *cur) +{ + struct inet_connection_sock *icsk; + struct hlist_nulls_node *node; + struct sock *sk = cur; + struct inet_listen_hashbucket *ilb; + struct tcp_iter_state *st = seq->private; + struct net *net = seq_file_net(seq); + + if (!sk) { + ilb = &tcp_hashinfo.listening_hash[st->bucket]; + spin_lock_bh(&ilb->lock); + sk = sk_nulls_head(&ilb->head); + st->offset = 0; + goto get_sk; + } + ilb = &tcp_hashinfo.listening_hash[st->bucket]; + ++st->num; + ++st->offset; + + if (st->state == TCP_SEQ_STATE_OPENREQ) { + struct request_sock *req = cur; + + icsk = inet_csk(st->syn_wait_sk); + req = req->dl_next; + while (1) { + while (req) { + if (req->rsk_ops->family == st->family) { + cur = req; + goto out; + } + req = req->dl_next; + } + if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries) + break; +get_req: + req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; + } + sk = sk_nulls_next(st->syn_wait_sk); + st->state = TCP_SEQ_STATE_LISTENING; + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + } else { + icsk = inet_csk(sk); + read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + if (reqsk_queue_len(&icsk->icsk_accept_queue)) + goto start_req; + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + sk = sk_nulls_next(sk); + } +get_sk: + sk_nulls_for_each_from(sk, node) { + if (!net_eq(sock_net(sk), net)) + continue; + if (sk->sk_family == st->family) { + cur = sk; + goto out; + } + icsk = inet_csk(sk); + read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + if (reqsk_queue_len(&icsk->icsk_accept_queue)) { +start_req: + st->uid = sock_i_uid(sk); + st->syn_wait_sk = sk; + st->state = TCP_SEQ_STATE_OPENREQ; + st->sbucket = 0; + goto get_req; + } + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + } + spin_unlock_bh(&ilb->lock); + st->offset = 0; + if (++st->bucket < INET_LHTABLE_SIZE) { + ilb = &tcp_hashinfo.listening_hash[st->bucket]; + spin_lock_bh(&ilb->lock); + sk = sk_nulls_head(&ilb->head); + goto get_sk; + } + cur = NULL; +out: + return cur; +} + +static void *listening_get_idx(struct seq_file *seq, loff_t *pos) +{ + struct tcp_iter_state *st = seq->private; + void *rc; + + st->bucket = 0; + st->offset = 0; + rc = listening_get_next(seq, NULL); + + while (rc && *pos) { + rc = listening_get_next(seq, rc); + --*pos; + } + return rc; +} + +static inline int empty_bucket(struct tcp_iter_state *st) +{ + return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) && + hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain); +} + +/* + * Get first established socket starting from bucket given in st->bucket. + * If st->bucket is zero, the very first socket in the hash is returned. 
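+ * Each ehash bucket is walked twice under the same bucket lock:
+ * first the established chain, then the twchain holding TIME-WAIT
+ * sockets (st->state flips to TCP_SEQ_STATE_TIME_WAIT in between).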
+ */ +static void *established_get_first(struct seq_file *seq) +{ + struct tcp_iter_state *st = seq->private; + struct net *net = seq_file_net(seq); + void *rc = NULL; + + st->offset = 0; + for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { + struct sock *sk; + struct hlist_nulls_node *node; + struct inet_timewait_sock *tw; + spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); + + /* Lockless fast path for the common case of empty buckets */ + if (empty_bucket(st)) + continue; + + spin_lock_bh(lock); + sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { + if (sk->sk_family != st->family || + !net_eq(sock_net(sk), net)) { + continue; + } + rc = sk; + goto out; + } + st->state = TCP_SEQ_STATE_TIME_WAIT; + inet_twsk_for_each(tw, node, + &tcp_hashinfo.ehash[st->bucket].twchain) { + if (tw->tw_family != st->family || + !net_eq(twsk_net(tw), net)) { + continue; + } + rc = tw; + goto out; + } + spin_unlock_bh(lock); + st->state = TCP_SEQ_STATE_ESTABLISHED; + } +out: + return rc; +} + +static void *established_get_next(struct seq_file *seq, void *cur) +{ + struct sock *sk = cur; + struct inet_timewait_sock *tw; + struct hlist_nulls_node *node; + struct tcp_iter_state *st = seq->private; + struct net *net = seq_file_net(seq); + + ++st->num; + ++st->offset; + + if (st->state == TCP_SEQ_STATE_TIME_WAIT) { + tw = cur; + tw = tw_next(tw); +get_tw: + while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) { + tw = tw_next(tw); + } + if (tw) { + cur = tw; + goto out; + } + spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); + st->state = TCP_SEQ_STATE_ESTABLISHED; + + /* Look for next non empty bucket */ + st->offset = 0; + while (++st->bucket <= tcp_hashinfo.ehash_mask && + empty_bucket(st)) + ; + if (st->bucket > tcp_hashinfo.ehash_mask) + return NULL; + + spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); + sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain); + } else + sk = sk_nulls_next(sk); + + sk_nulls_for_each_from(sk, node) { + if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) + goto found; + } + + st->state = TCP_SEQ_STATE_TIME_WAIT; + tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain); + goto get_tw; +found: + cur = sk; +out: + return cur; +} + +static void *established_get_idx(struct seq_file *seq, loff_t pos) +{ + struct tcp_iter_state *st = seq->private; + void *rc; + + st->bucket = 0; + rc = established_get_first(seq); + + while (rc && pos) { + rc = established_get_next(seq, rc); + --pos; + } + return rc; +} + +static void *tcp_get_idx(struct seq_file *seq, loff_t pos) +{ + void *rc; + struct tcp_iter_state *st = seq->private; + + st->state = TCP_SEQ_STATE_LISTENING; + rc = listening_get_idx(seq, &pos); + + if (!rc) { + st->state = TCP_SEQ_STATE_ESTABLISHED; + rc = established_get_idx(seq, pos); + } + + return rc; +} + +static void *tcp_seek_last_pos(struct seq_file *seq) +{ + struct tcp_iter_state *st = seq->private; + int offset = st->offset; + int orig_num = st->num; + void *rc = NULL; + + switch (st->state) { + case TCP_SEQ_STATE_OPENREQ: + case TCP_SEQ_STATE_LISTENING: + if (st->bucket >= INET_LHTABLE_SIZE) + break; + st->state = TCP_SEQ_STATE_LISTENING; + rc = listening_get_next(seq, NULL); + while (offset-- && rc) + rc = listening_get_next(seq, rc); + if (rc) + break; + st->bucket = 0; + /* Fallthrough */ + case TCP_SEQ_STATE_ESTABLISHED: + case TCP_SEQ_STATE_TIME_WAIT: + st->state = TCP_SEQ_STATE_ESTABLISHED; + if (st->bucket > tcp_hashinfo.ehash_mask) + break; + rc = 
established_get_first(seq); + while (offset-- && rc) + rc = established_get_next(seq, rc); + } + + st->num = orig_num; + + return rc; +} + +static void *tcp_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct tcp_iter_state *st = seq->private; + void *rc; + + if (*pos && *pos == st->last_pos) { + rc = tcp_seek_last_pos(seq); + if (rc) + goto out; + } + + st->state = TCP_SEQ_STATE_LISTENING; + st->num = 0; + st->bucket = 0; + st->offset = 0; + rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; + +out: + st->last_pos = *pos; + return rc; +} + +static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct tcp_iter_state *st = seq->private; + void *rc = NULL; + + if (v == SEQ_START_TOKEN) { + rc = tcp_get_idx(seq, 0); + goto out; + } + + switch (st->state) { + case TCP_SEQ_STATE_OPENREQ: + case TCP_SEQ_STATE_LISTENING: + rc = listening_get_next(seq, v); + if (!rc) { + st->state = TCP_SEQ_STATE_ESTABLISHED; + st->bucket = 0; + st->offset = 0; + rc = established_get_first(seq); + } + break; + case TCP_SEQ_STATE_ESTABLISHED: + case TCP_SEQ_STATE_TIME_WAIT: + rc = established_get_next(seq, v); + break; + } +out: + ++*pos; + st->last_pos = *pos; + return rc; +} + +static void tcp_seq_stop(struct seq_file *seq, void *v) +{ + struct tcp_iter_state *st = seq->private; + + switch (st->state) { + case TCP_SEQ_STATE_OPENREQ: + if (v) { + struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk); + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + } + case TCP_SEQ_STATE_LISTENING: + if (v != SEQ_START_TOKEN) + spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock); + break; + case TCP_SEQ_STATE_TIME_WAIT: + case TCP_SEQ_STATE_ESTABLISHED: + if (v) + spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); + break; + } +} + +static int tcp_seq_open(struct inode *inode, struct file *file) +{ + struct tcp_seq_afinfo *afinfo = PDE(inode)->data; + struct tcp_iter_state *s; + int err; + + err = seq_open_net(inode, file, &afinfo->seq_ops, + sizeof(struct tcp_iter_state)); + if (err < 0) + return err; + + s = ((struct seq_file *)file->private_data)->private; + s->family = afinfo->family; + s->last_pos = 0; + return 0; +} + +int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo) +{ + int rc = 0; + struct proc_dir_entry *p; + + afinfo->seq_fops.open = tcp_seq_open; + afinfo->seq_fops.read = seq_read; + afinfo->seq_fops.llseek = seq_lseek; + afinfo->seq_fops.release = seq_release_net; + + afinfo->seq_ops.start = tcp_seq_start; + afinfo->seq_ops.next = tcp_seq_next; + afinfo->seq_ops.stop = tcp_seq_stop; + + p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, + &afinfo->seq_fops, afinfo); + if (!p) + rc = -ENOMEM; + return rc; +} +EXPORT_SYMBOL(tcp_proc_register); + +void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) +{ + proc_net_remove(net, afinfo->name); +} +EXPORT_SYMBOL(tcp_proc_unregister); + +static void get_openreq4(struct sock *sk, struct request_sock *req, + struct seq_file *f, int i, int uid, int *len) +{ + const struct inet_request_sock *ireq = inet_rsk(req); + int ttd = req->expires - jiffies; + + seq_printf(f, "%4d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n", + i, + ireq->loc_addr, + ntohs(inet_sk(sk)->inet_sport), + ireq->rmt_addr, + ntohs(ireq->rmt_port), + TCP_SYN_RECV, + 0, 0, /* could print option size, but that is af dependent. 
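+ * The hard-coded 1 below means only the SYN-ACK retransmit timer is
+ * ever reported as active for an open request, with ttd jiffies of
+ * lifetime remaining (converted to clock ticks for userspace).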
*/ + 1, /* timers active (only the expire timer) */ + jiffies_to_clock_t(ttd), + req->retrans, + uid, + 0, /* non standard timer */ + 0, /* open_requests have no inode */ + atomic_read(&sk->sk_refcnt), + req, + len); +} + +static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) +{ + int timer_active; + unsigned long timer_expires; + struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_sock *inet = inet_sk(sk); + __be32 dest = inet->inet_daddr; + __be32 src = inet->inet_rcv_saddr; + __u16 destp = ntohs(inet->inet_dport); + __u16 srcp = ntohs(inet->inet_sport); + int rx_queue; + + if (icsk->icsk_pending == ICSK_TIME_RETRANS) { + timer_active = 1; + timer_expires = icsk->icsk_timeout; + } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { + timer_active = 4; + timer_expires = icsk->icsk_timeout; + } else if (timer_pending(&sk->sk_timer)) { + timer_active = 2; + timer_expires = sk->sk_timer.expires; + } else { + timer_active = 0; + timer_expires = jiffies; + } + + if (sk->sk_state == TCP_LISTEN) + rx_queue = sk->sk_ack_backlog; + else + /* + * because we dont lock socket, we might find a transient negative value + */ + rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); + + seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " + "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n", + i, src, srcp, dest, destp, sk->sk_state, + tp->write_seq - tp->snd_una, + rx_queue, + timer_active, + jiffies_to_clock_t(timer_expires - jiffies), + icsk->icsk_retransmits, + sock_i_uid(sk), + icsk->icsk_probes_out, + sock_i_ino(sk), + atomic_read(&sk->sk_refcnt), sk, + jiffies_to_clock_t(icsk->icsk_rto), + jiffies_to_clock_t(icsk->icsk_ack.ato), + (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, + tp->snd_cwnd, + tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh, + len); +} + +static void get_timewait4_sock(struct inet_timewait_sock *tw, + struct seq_file *f, int i, int *len) +{ + __be32 dest, src; + __u16 destp, srcp; + int ttd = tw->tw_ttd - jiffies; + + if (ttd < 0) + ttd = 0; + + dest = tw->tw_daddr; + src = tw->tw_rcv_saddr; + destp = ntohs(tw->tw_dport); + srcp = ntohs(tw->tw_sport); + + seq_printf(f, "%4d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n", + i, src, srcp, dest, destp, tw->tw_substate, 0, 0, + 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, + atomic_read(&tw->tw_refcnt), tw, len); +} + +#define TMPSZ 150 + +static int tcp4_seq_show(struct seq_file *seq, void *v) +{ + struct tcp_iter_state *st; + int len; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "%-*s\n", TMPSZ - 1, + " sl local_address rem_address st tx_queue " + "rx_queue tr tm->when retrnsmt uid timeout " + "inode"); + goto out; + } + st = seq->private; + + switch (st->state) { + case TCP_SEQ_STATE_LISTENING: + case TCP_SEQ_STATE_ESTABLISHED: + get_tcp4_sock(v, seq, st->num, &len); + break; + case TCP_SEQ_STATE_OPENREQ: + get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len); + break; + case TCP_SEQ_STATE_TIME_WAIT: + get_timewait4_sock(v, seq, st->num, &len); + break; + } + seq_printf(seq, "%*s\n", TMPSZ - 1 - len, ""); +out: + return 0; +} + +static struct tcp_seq_afinfo tcp4_seq_afinfo = { + .name = "tcp", + .family = AF_INET, + .seq_fops = { + .owner = THIS_MODULE, + }, + .seq_ops = { + .show = tcp4_seq_show, + }, +}; + +static int __net_init tcp4_proc_init_net(struct net *net) +{ + return tcp_proc_register(net, &tcp4_seq_afinfo); +} + +static void __net_exit tcp4_proc_exit_net(struct net *net) +{ + tcp_proc_unregister(net, &tcp4_seq_afinfo); +} + +static struct pernet_operations tcp4_net_ops = { + .init = tcp4_proc_init_net, + .exit = tcp4_proc_exit_net, +}; + +int __init tcp4_proc_init(void) +{ + return register_pernet_subsys(&tcp4_net_ops); +} + +void tcp4_proc_exit(void) +{ + unregister_pernet_subsys(&tcp4_net_ops); +} +#endif /* CONFIG_PROC_FS */ + +struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) +{ + const struct iphdr *iph = skb_gro_network_header(skb); + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr, + skb->csum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; + } + + /* fall through */ + case CHECKSUM_NONE: + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + + return tcp_gro_receive(head, skb); +} + +int tcp4_gro_complete(struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + struct tcphdr *th = tcp_hdr(skb); + + th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb), + iph->saddr, iph->daddr, 0); + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; + + return tcp_gro_complete(skb); +} + +struct proto tcp_prot = { + .name = "TCP", + .owner = THIS_MODULE, + .close = tcp_close, + .connect = tcp_v4_connect, + .disconnect = tcp_disconnect, + .accept = inet_csk_accept, + .ioctl = tcp_ioctl, + .init = tcp_v4_init_sock, + .destroy = tcp_v4_destroy_sock, + .shutdown = tcp_shutdown, + .setsockopt = tcp_setsockopt, + .getsockopt = tcp_getsockopt, + .recvmsg = tcp_recvmsg, + .sendmsg = tcp_sendmsg, + .sendpage = tcp_sendpage, + .backlog_rcv = tcp_v4_do_rcv, + .hash = inet_hash, + .unhash = inet_unhash, + .get_port = inet_csk_get_port, + .enter_memory_pressure = tcp_enter_memory_pressure, + .sockets_allocated = &tcp_sockets_allocated, + .orphan_count = &tcp_orphan_count, + .memory_allocated = 
&tcp_memory_allocated, + .memory_pressure = &tcp_memory_pressure, + .sysctl_mem = sysctl_tcp_mem, + .sysctl_wmem = sysctl_tcp_wmem, + .sysctl_rmem = sysctl_tcp_rmem, + .max_header = MAX_TCP_HEADER, + .obj_size = sizeof(struct tcp_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, + .twsk_prot = &tcp_timewait_sock_ops, + .rsk_prot = &tcp_request_sock_ops, + .h.hashinfo = &tcp_hashinfo, + .no_autobind = true, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_tcp_setsockopt, + .compat_getsockopt = compat_tcp_getsockopt, +#endif +}; +EXPORT_SYMBOL(tcp_prot); + + +static int __net_init tcp_sk_init(struct net *net) +{ + return inet_ctl_sock_create(&net->ipv4.tcp_sock, + PF_INET, SOCK_RAW, IPPROTO_TCP, net); +} + +static void __net_exit tcp_sk_exit(struct net *net) +{ + inet_ctl_sock_destroy(net->ipv4.tcp_sock); +} + +static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) +{ + inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET); +} + +static struct pernet_operations __net_initdata tcp_sk_ops = { + .init = tcp_sk_init, + .exit = tcp_sk_exit, + .exit_batch = tcp_sk_exit_batch, +}; + +void __init tcp_v4_init(void) +{ + inet_hashinfo_init(&tcp_hashinfo); + if (register_pernet_subsys(&tcp_sk_ops)) + panic("Failed to create the TCP control socket.\n"); +} diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c new file mode 100644 index 00000000..72f7218b --- /dev/null +++ b/net/ipv4/tcp_lp.c @@ -0,0 +1,344 @@ +/* + * TCP Low Priority (TCP-LP) + * + * TCP Low Priority is a distributed algorithm whose goal is to utilize only + * the excess network bandwidth as compared to the ``fair share`` of + * bandwidth as targeted by TCP. + * + * As of 2.6.13, Linux supports pluggable congestion control algorithms. + * Due to the limitation of the API, we take the following changes from + * the original TCP-LP implementation: + * o We use newReno in most core CA handling. Only add some checking + * within cong_avoid. + * o Error correcting in remote HZ, therefore remote HZ will be keeped + * on checking and updating. + * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, since + * OWD have a similar meaning as RTT. Also correct the buggy formular. + * o Handle reaction for Early Congestion Indication (ECI) within + * pkts_acked, as mentioned within pseudo code. + * o OWD is handled in relative format, where local time stamp will in + * tcp_time_stamp format. + * + * Original Author: + * Aleksandar Kuzmanovic + * Available from: + * http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf + * Original implementation for 2.4.19: + * http://www-ece.rice.edu/networks/TCP-LP/ + * + * 2.6.x module Authors: + * Wong Hoi Sing, Edison + * Hung Hing Lun, Mike + * SourceForge project page: + * http://tcp-lp-mod.sourceforge.net/ + */ + +#include +#include + +/* resolution of owd */ +#define LP_RESOL 1000 + +/** + * enum tcp_lp_state + * @LP_VALID_RHZ: is remote HZ valid? + * @LP_VALID_OWD: is OWD valid? + * @LP_WITHIN_THR: are we within threshold? + * @LP_WITHIN_INF: are we within inference? + * + * TCP-LP's state flags. + * We create this set of state flag mainly for debugging. 
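+ * All four flags live in one u32 bitmask (lp->flag) and are
+ * refreshed as ACKs arrive, while the remote-HZ, OWD and inference
+ * estimators are being updated.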
+ */ +enum tcp_lp_state { + LP_VALID_RHZ = (1 << 0), + LP_VALID_OWD = (1 << 1), + LP_WITHIN_THR = (1 << 3), + LP_WITHIN_INF = (1 << 4), +}; + +/** + * struct lp + * @flag: TCP-LP state flag + * @sowd: smoothed OWD << 3 + * @owd_min: min OWD + * @owd_max: max OWD + * @owd_max_rsv: resrved max owd + * @remote_hz: estimated remote HZ + * @remote_ref_time: remote reference time + * @local_ref_time: local reference time + * @last_drop: time for last active drop + * @inference: current inference + * + * TCP-LP's private struct. + * We get the idea from original TCP-LP implementation where only left those we + * found are really useful. + */ +struct lp { + u32 flag; + u32 sowd; + u32 owd_min; + u32 owd_max; + u32 owd_max_rsv; + u32 remote_hz; + u32 remote_ref_time; + u32 local_ref_time; + u32 last_drop; + u32 inference; +}; + +/** + * tcp_lp_init + * + * Init all required variables. + * Clone the handling from Vegas module implementation. + */ +static void tcp_lp_init(struct sock *sk) +{ + struct lp *lp = inet_csk_ca(sk); + + lp->flag = 0; + lp->sowd = 0; + lp->owd_min = 0xffffffff; + lp->owd_max = 0; + lp->owd_max_rsv = 0; + lp->remote_hz = 0; + lp->remote_ref_time = 0; + lp->local_ref_time = 0; + lp->last_drop = 0; + lp->inference = 0; +} + +/** + * tcp_lp_cong_avoid + * + * Implementation of cong_avoid. + * Will only call newReno CA when away from inference. + * From TCP-LP's paper, this will be handled in additive increasement. + */ +static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +{ + struct lp *lp = inet_csk_ca(sk); + + if (!(lp->flag & LP_WITHIN_INF)) + tcp_reno_cong_avoid(sk, ack, in_flight); +} + +/** + * tcp_lp_remote_hz_estimator + * + * Estimate remote HZ. + * We keep on updating the estimated value, where original TCP-LP + * implementation only guest it for once and use forever. + */ +static u32 tcp_lp_remote_hz_estimator(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct lp *lp = inet_csk_ca(sk); + s64 rhz = lp->remote_hz << 6; /* remote HZ << 6 */ + s64 m = 0; + + /* not yet record reference time + * go away!! record it before come back!! */ + if (lp->remote_ref_time == 0 || lp->local_ref_time == 0) + goto out; + + /* we can't calc remote HZ with no different!! */ + if (tp->rx_opt.rcv_tsval == lp->remote_ref_time || + tp->rx_opt.rcv_tsecr == lp->local_ref_time) + goto out; + + m = HZ * (tp->rx_opt.rcv_tsval - + lp->remote_ref_time) / (tp->rx_opt.rcv_tsecr - + lp->local_ref_time); + if (m < 0) + m = -m; + + if (rhz > 0) { + m -= rhz >> 6; /* m is now error in remote HZ est */ + rhz += m; /* 63/64 old + 1/64 new */ + } else + rhz = m << 6; + + out: + /* record time for successful remote HZ calc */ + if ((rhz >> 6) > 0) + lp->flag |= LP_VALID_RHZ; + else + lp->flag &= ~LP_VALID_RHZ; + + /* record reference time stamp */ + lp->remote_ref_time = tp->rx_opt.rcv_tsval; + lp->local_ref_time = tp->rx_opt.rcv_tsecr; + + return rhz >> 6; +} + +/** + * tcp_lp_owd_calculator + * + * Calculate one way delay (in relative format). + * Original implement OWD as minus of remote time difference to local time + * difference directly. As this time difference just simply equal to RTT, when + * the network status is stable, remote RTT will equal to local RTT, and result + * OWD into zero. + * It seems to be a bug and so we fixed it. 
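+ * Illustrative example (numbers are made up): with LP_RESOL == 1000
+ * both timestamps are scaled to roughly milliseconds, so a remote
+ * tsval of 250 at remote_hz == 250 and a local tsecr of 1000 at
+ * HZ == 1000 both map to 1000, yielding an OWD of 0.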
+ */ +static u32 tcp_lp_owd_calculator(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct lp *lp = inet_csk_ca(sk); + s64 owd = 0; + + lp->remote_hz = tcp_lp_remote_hz_estimator(sk); + + if (lp->flag & LP_VALID_RHZ) { + owd = + tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) - + tp->rx_opt.rcv_tsecr * (LP_RESOL / HZ); + if (owd < 0) + owd = -owd; + } + + if (owd > 0) + lp->flag |= LP_VALID_OWD; + else + lp->flag &= ~LP_VALID_OWD; + + return owd; +} + +/** + * tcp_lp_rtt_sample + * + * Implementation or rtt_sample. + * Will take the following action, + * 1. calc OWD, + * 2. record the min/max OWD, + * 3. calc smoothed OWD (SOWD). + * Most ideas come from the original TCP-LP implementation. + */ +static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt) +{ + struct lp *lp = inet_csk_ca(sk); + s64 mowd = tcp_lp_owd_calculator(sk); + + /* sorry that we don't have valid data */ + if (!(lp->flag & LP_VALID_RHZ) || !(lp->flag & LP_VALID_OWD)) + return; + + /* record the next min owd */ + if (mowd < lp->owd_min) + lp->owd_min = mowd; + + /* always forget the max of the max + * we just set owd_max as one below it */ + if (mowd > lp->owd_max) { + if (mowd > lp->owd_max_rsv) { + if (lp->owd_max_rsv == 0) + lp->owd_max = mowd; + else + lp->owd_max = lp->owd_max_rsv; + lp->owd_max_rsv = mowd; + } else + lp->owd_max = mowd; + } + + /* calc for smoothed owd */ + if (lp->sowd != 0) { + mowd -= lp->sowd >> 3; /* m is now error in owd est */ + lp->sowd += mowd; /* owd = 7/8 owd + 1/8 new */ + } else + lp->sowd = mowd << 3; /* take the measured time be owd */ +} + +/** + * tcp_lp_pkts_acked + * + * Implementation of pkts_acked. + * Deal with active drop under Early Congestion Indication. + * Only drop to half and 1 will be handle, because we hope to use back + * newReno in increase case. 
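+ * (The threshold test used below is sowd/8 < owd_min + 15% of
+ * (owd_max - owd_min); e.g. with owd_min 40 and owd_max 140 the
+ * smoothed OWD must stay under 55 to count as "within threshold".)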
+ * We work it out by following the idea from TCP-LP's paper directly + */ +static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct lp *lp = inet_csk_ca(sk); + + if (rtt_us > 0) + tcp_lp_rtt_sample(sk, rtt_us); + + /* calc inference */ + if (tcp_time_stamp > tp->rx_opt.rcv_tsecr) + lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr); + + /* test if within inference */ + if (lp->last_drop && (tcp_time_stamp - lp->last_drop < lp->inference)) + lp->flag |= LP_WITHIN_INF; + else + lp->flag &= ~LP_WITHIN_INF; + + /* test if within threshold */ + if (lp->sowd >> 3 < + lp->owd_min + 15 * (lp->owd_max - lp->owd_min) / 100) + lp->flag |= LP_WITHIN_THR; + else + lp->flag &= ~LP_WITHIN_THR; + + pr_debug("TCP-LP: %05o|%5u|%5u|%15u|%15u|%15u\n", lp->flag, + tp->snd_cwnd, lp->remote_hz, lp->owd_min, lp->owd_max, + lp->sowd >> 3); + + if (lp->flag & LP_WITHIN_THR) + return; + + /* FIXME: try to reset owd_min and owd_max here + * so decrease the chance the min/max is no longer suitable + * and will usually within threshold when whithin inference */ + lp->owd_min = lp->sowd >> 3; + lp->owd_max = lp->sowd >> 2; + lp->owd_max_rsv = lp->sowd >> 2; + + /* happened within inference + * drop snd_cwnd into 1 */ + if (lp->flag & LP_WITHIN_INF) + tp->snd_cwnd = 1U; + + /* happened after inference + * cut snd_cwnd into half */ + else + tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U); + + /* record this drop time */ + lp->last_drop = tcp_time_stamp; +} + +static struct tcp_congestion_ops tcp_lp __read_mostly = { + .flags = TCP_CONG_RTT_STAMP, + .init = tcp_lp_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_lp_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + .pkts_acked = tcp_lp_pkts_acked, + + .owner = THIS_MODULE, + .name = "lp" +}; + +static int __init tcp_lp_register(void) +{ + BUILD_BUG_ON(sizeof(struct lp) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_lp); +} + +static void __exit tcp_lp_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_lp); +} + +module_init(tcp_lp_register); +module_exit(tcp_lp_unregister); + +MODULE_AUTHOR("Wong Hoi Sing Edison, Hung Hing Lun Mike"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Low Priority"); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c new file mode 100644 index 00000000..80b1f807 --- /dev/null +++ b/net/ipv4/tcp_minisocks.c @@ -0,0 +1,786 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Authors: Ross Biro + * Fred N. 
van Kempen, + * Mark Evans, + * Corey Minyard + * Florian La Roche, + * Charles Hedrick, + * Linus Torvalds, + * Alan Cox, + * Matthew Dillon, + * Arnt Gulbrandsen, + * Jorge Cwik, + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +int sysctl_tcp_syncookies __read_mostly = 1; +EXPORT_SYMBOL(sysctl_tcp_syncookies); + +int sysctl_tcp_abort_on_overflow __read_mostly; + +struct inet_timewait_death_row tcp_death_row = { + .sysctl_max_tw_buckets = NR_FILE * 2, + .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, + .death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock), + .hashinfo = &tcp_hashinfo, + .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, + (unsigned long)&tcp_death_row), + .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work, + inet_twdr_twkill_work), +/* Short-time timewait calendar */ + + .twcal_hand = -1, + .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, + (unsigned long)&tcp_death_row), +}; +EXPORT_SYMBOL_GPL(tcp_death_row); + +/* VJ's idea. Save last timestamp seen from this destination + * and hold it at least for normal timewait interval to use for duplicate + * segment detection in subsequent connections, before they enter synchronized + * state. + */ + +static int tcp_remember_stamp(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct inet_peer *peer; + bool release_it; + + peer = icsk->icsk_af_ops->get_peer(sk, &release_it); + if (peer) { + if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || + ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && + peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { + peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; + peer->tcp_ts = tp->rx_opt.ts_recent; + } + if (release_it) + inet_putpeer(peer); + return 1; + } + + return 0; +} + +static int tcp_tw_remember_stamp(struct inet_timewait_sock *tw) +{ + struct sock *sk = (struct sock *) tw; + struct inet_peer *peer; + + peer = twsk_getpeer(sk); + if (peer) { + const struct tcp_timewait_sock *tcptw = tcp_twsk(sk); + + if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || + ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && + peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { + peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; + peer->tcp_ts = tcptw->tw_ts_recent; + } + inet_putpeer(peer); + return 1; + } + return 0; +} + +static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) +{ + if (seq == s_win) + return 1; + if (after(end_seq, s_win) && before(seq, e_win)) + return 1; + return seq == e_win && seq == end_seq; +} + +/* + * * Main purpose of TIME-WAIT state is to close connection gracefully, + * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN + * (and, probably, tail of data) and one or more our ACKs are lost. + * * What is TIME-WAIT timeout? It is associated with maximal packet + * lifetime in the internet, which results in wrong conclusion, that + * it is set to catch "old duplicate segments" wandering out of their path. + * It is not quite correct. This timeout is calculated so that it exceeds + * maximal retransmission timeout enough to allow to lose one (or more) + * segments sent by peer and our ACKs. This time may be calculated from RTO. + * * When TIME-WAIT socket receives RST, it means that another end + * finally closed and we are allowed to kill TIME-WAIT too. + * * Second purpose of TIME-WAIT is catching old duplicate segments. 
+ * Well, certainly it is pure paranoia, but if we load TIME-WAIT + * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs. + * * If we invented some more clever way to catch duplicates + * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs. + * + * The algorithm below is based on FORMAL INTERPRETATION of RFCs. + * When you compare it to RFCs, please, read section SEGMENT ARRIVES + * from the very beginning. + * + * NOTE. With recycling (and later with fin-wait-2) TW bucket + * is _not_ stateless. It means, that strictly speaking we must + * spinlock it. I do not want! Well, probability of misbehaviour + * is ridiculously low and, seems, we could use some mb() tricks + * to avoid misread sequence numbers, states etc. --ANK + */ +enum tcp_tw_status +tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, + const struct tcphdr *th) +{ + struct tcp_options_received tmp_opt; + u8 *hash_location; + struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); + int paws_reject = 0; + + tmp_opt.saw_tstamp = 0; + if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { + tcp_parse_options(skb, &tmp_opt, &hash_location, 0); + + if (tmp_opt.saw_tstamp) { + tmp_opt.ts_recent = tcptw->tw_ts_recent; + tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; + paws_reject = tcp_paws_reject(&tmp_opt, th->rst); + } + } + + if (tw->tw_substate == TCP_FIN_WAIT2) { + /* Just repeat all the checks of tcp_rcv_state_process() */ + + /* Out of window, send ACK */ + if (paws_reject || + !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, + tcptw->tw_rcv_nxt, + tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd)) + return TCP_TW_ACK; + + if (th->rst) + goto kill; + + if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt)) + goto kill_with_rst; + + /* Dup ACK? */ + if (!th->ack || + !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || + TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { + inet_twsk_put(tw); + return TCP_TW_SUCCESS; + } + + /* New data or FIN. If new data arrive after half-duplex close, + * reset. + */ + if (!th->fin || + TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { +kill_with_rst: + inet_twsk_deschedule(tw, &tcp_death_row); + inet_twsk_put(tw); + return TCP_TW_RST; + } + + /* FIN arrived, enter true time-wait state. */ + tw->tw_substate = TCP_TIME_WAIT; + tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if (tmp_opt.saw_tstamp) { + tcptw->tw_ts_recent_stamp = get_seconds(); + tcptw->tw_ts_recent = tmp_opt.rcv_tsval; + } + + if (tcp_death_row.sysctl_tw_recycle && + tcptw->tw_ts_recent_stamp && + tcp_tw_remember_stamp(tw)) + inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, + TCP_TIMEWAIT_LEN); + else + inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, + TCP_TIMEWAIT_LEN); + return TCP_TW_ACK; + } + + /* + * Now real TIME-WAIT state. + * + * RFC 1122: + * "When a connection is [...] on TIME-WAIT state [...] + * [a TCP] MAY accept a new SYN from the remote TCP to + * reopen the connection directly, if it: + * + * (1) assigns its initial sequence number for the new + * connection to be larger than the largest sequence + * number it used on the previous connection incarnation, + * and + * + * (2) returns to TIME-WAIT state if the SYN turns out + * to be an old duplicate". + */ + + if (!paws_reject && + (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt && + (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { + /* In window segment, it may be only reset or bare ack. 
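+ * (The guard above only admits a segment whose seq equals
+ * tw_rcv_nxt and which carries no data, i.e. a bare ACK, or an RST;
+ * anything else is handled by the out-of-window code further down.)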
*/ + + if (th->rst) { + /* This is TIME_WAIT assassination, in two flavors. + * Oh well... nobody has a sufficient solution to this + * protocol bug yet. + */ + if (sysctl_tcp_rfc1337 == 0) { +kill: + inet_twsk_deschedule(tw, &tcp_death_row); + inet_twsk_put(tw); + return TCP_TW_SUCCESS; + } + } + inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, + TCP_TIMEWAIT_LEN); + + if (tmp_opt.saw_tstamp) { + tcptw->tw_ts_recent = tmp_opt.rcv_tsval; + tcptw->tw_ts_recent_stamp = get_seconds(); + } + + inet_twsk_put(tw); + return TCP_TW_SUCCESS; + } + + /* Out of window segment. + + All the segments are ACKed immediately. + + The only exception is new SYN. We accept it, if it is + not old duplicate and we are not in danger to be killed + by delayed old duplicates. RFC check is that it has + newer sequence number works at rates <40Mbit/sec. + However, if paws works, it is reliable AND even more, + we even may relax silly seq space cutoff. + + RED-PEN: we violate main RFC requirement, if this SYN will appear + old duplicate (i.e. we receive RST in reply to SYN-ACK), + we must return socket to time-wait state. It is not good, + but not fatal yet. + */ + + if (th->syn && !th->rst && !th->ack && !paws_reject && + (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) || + (tmp_opt.saw_tstamp && + (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { + u32 isn = tcptw->tw_snd_nxt + 65535 + 2; + if (isn == 0) + isn++; + TCP_SKB_CB(skb)->when = isn; + return TCP_TW_SYN; + } + + if (paws_reject) + NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); + + if (!th->rst) { + /* In this case we must reset the TIMEWAIT timer. + * + * If it is ACKless SYN it may be both old duplicate + * and new good SYN with random sequence number ack) + inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, + TCP_TIMEWAIT_LEN); + + /* Send ACK. Note, we do not put the bucket, + * it will be released by caller. + */ + return TCP_TW_ACK; + } + inet_twsk_put(tw); + return TCP_TW_SUCCESS; +} +EXPORT_SYMBOL(tcp_timewait_state_process); + +/* + * Move a socket to time-wait or dead fin-wait-2 state. + */ +void tcp_time_wait(struct sock *sk, int state, int timeo) +{ + struct inet_timewait_sock *tw = NULL; + const struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_sock *tp = tcp_sk(sk); + int recycle_ok = 0; + + if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) + recycle_ok = tcp_remember_stamp(sk); + + if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) + tw = inet_twsk_alloc(sk, state); + + if (tw != NULL) { + struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); + const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); + + tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; + tcptw->tw_rcv_nxt = tp->rcv_nxt; + tcptw->tw_snd_nxt = tp->snd_nxt; + tcptw->tw_rcv_wnd = tcp_receive_window(tp); + tcptw->tw_ts_recent = tp->rx_opt.ts_recent; + tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (tw->tw_family == PF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet6_timewait_sock *tw6; + + tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); + tw6 = inet6_twsk((struct sock *)tw); + ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr); + ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr); + tw->tw_ipv6only = np->ipv6only; + } +#endif + +#ifdef CONFIG_TCP_MD5SIG + /* + * The timewait bucket does not have the key DB from the + * sock structure. 
We just make a quick copy of the + * md5 key being used (if indeed we are using one) + * so the timewait ack generating code has the key. + */ + do { + struct tcp_md5sig_key *key; + memset(tcptw->tw_md5_key, 0, sizeof(tcptw->tw_md5_key)); + tcptw->tw_md5_keylen = 0; + key = tp->af_specific->md5_lookup(sk, sk); + if (key != NULL) { + memcpy(&tcptw->tw_md5_key, key->key, key->keylen); + tcptw->tw_md5_keylen = key->keylen; + if (tcp_alloc_md5sig_pool(sk) == NULL) + BUG(); + } + } while (0); +#endif + + /* Linkage updates. */ + __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); + + /* Get the TIME_WAIT timeout firing. */ + if (timeo < rto) + timeo = rto; + + if (recycle_ok) { + tw->tw_timeout = rto; + } else { + tw->tw_timeout = TCP_TIMEWAIT_LEN; + if (state == TCP_TIME_WAIT) + timeo = TCP_TIMEWAIT_LEN; + } + + inet_twsk_schedule(tw, &tcp_death_row, timeo, + TCP_TIMEWAIT_LEN); + inet_twsk_put(tw); + } else { + /* Sorry, if we're out of memory, just CLOSE this + * socket up. We've got bigger problems than + * non-graceful socket closings. + */ + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW); + } + + tcp_update_metrics(sk); + tcp_done(sk); +} + +void tcp_twsk_destructor(struct sock *sk) +{ +#ifdef CONFIG_TCP_MD5SIG + struct tcp_timewait_sock *twsk = tcp_twsk(sk); + if (twsk->tw_md5_keylen) + tcp_free_md5sig_pool(); +#endif +} +EXPORT_SYMBOL_GPL(tcp_twsk_destructor); + +static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, + struct request_sock *req) +{ + tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; +} + +/* This is not only more efficient than what we used to do, it eliminates + * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM + * + * Actually, we could lots of memory writes here. tp of listening + * socket contains all necessary default parameters. + */ +struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) +{ + struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); + + if (newsk != NULL) { + const struct inet_request_sock *ireq = inet_rsk(req); + struct tcp_request_sock *treq = tcp_rsk(req); + struct inet_connection_sock *newicsk = inet_csk(newsk); + struct tcp_sock *newtp = tcp_sk(newsk); + struct tcp_sock *oldtp = tcp_sk(sk); + struct tcp_cookie_values *oldcvp = oldtp->cookie_values; + + /* TCP Cookie Transactions require space for the cookie pair, + * as it differs for each connection. There is no need to + * copy any s_data_payload stored at the original socket. + * Failure will prevent resuming the connection. 
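+ * (The allocation below is GFP_ATOMIC; if it fails,
+ * newtp->cookie_values is left NULL and cookie-based resumption is
+ * simply unavailable for this child.)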
+ * + * Presumed copied, in order of appearance: + * cookie_in_always, cookie_out_never + */ + if (oldcvp != NULL) { + struct tcp_cookie_values *newcvp = + kzalloc(sizeof(*newtp->cookie_values), + GFP_ATOMIC); + + if (newcvp != NULL) { + kref_init(&newcvp->kref); + newcvp->cookie_desired = + oldcvp->cookie_desired; + newtp->cookie_values = newcvp; + } else { + /* Not Yet Implemented */ + newtp->cookie_values = NULL; + } + } + + /* Now setup tcp_sock */ + newtp->pred_flags = 0; + + newtp->rcv_wup = newtp->copied_seq = + newtp->rcv_nxt = treq->rcv_isn + 1; + + newtp->snd_sml = newtp->snd_una = + newtp->snd_nxt = newtp->snd_up = + treq->snt_isn + 1 + tcp_s_data_size(oldtp); + + tcp_prequeue_init(newtp); + + tcp_init_wl(newtp, treq->rcv_isn); + + newtp->srtt = 0; + newtp->mdev = TCP_TIMEOUT_INIT; + newicsk->icsk_rto = TCP_TIMEOUT_INIT; + + newtp->packets_out = 0; + newtp->retrans_out = 0; + newtp->sacked_out = 0; + newtp->fackets_out = 0; + newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + + /* So many TCP implementations out there (incorrectly) count the + * initial SYN frame in their delayed-ACK and congestion control + * algorithms that we must have the following bandaid to talk + * efficiently to them. -DaveM + */ + newtp->snd_cwnd = 2; + newtp->snd_cwnd_cnt = 0; + newtp->bytes_acked = 0; + + newtp->frto_counter = 0; + newtp->frto_highmark = 0; + + newicsk->icsk_ca_ops = &tcp_init_congestion_ops; + + tcp_set_ca_state(newsk, TCP_CA_Open); + tcp_init_xmit_timers(newsk); + skb_queue_head_init(&newtp->out_of_order_queue); + newtp->write_seq = newtp->pushed_seq = + treq->snt_isn + 1 + tcp_s_data_size(oldtp); + + newtp->rx_opt.saw_tstamp = 0; + + newtp->rx_opt.dsack = 0; + newtp->rx_opt.num_sacks = 0; + + newtp->urg_data = 0; + + if (sock_flag(newsk, SOCK_KEEPOPEN)) + inet_csk_reset_keepalive_timer(newsk, + keepalive_time_when(newtp)); + + newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; + if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { + if (sysctl_tcp_fack) + tcp_enable_fack(newtp); + } + newtp->window_clamp = req->window_clamp; + newtp->rcv_ssthresh = req->rcv_wnd; + newtp->rcv_wnd = req->rcv_wnd; + newtp->rx_opt.wscale_ok = ireq->wscale_ok; + if (newtp->rx_opt.wscale_ok) { + newtp->rx_opt.snd_wscale = ireq->snd_wscale; + newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; + } else { + newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; + newtp->window_clamp = min(newtp->window_clamp, 65535U); + } + newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) << + newtp->rx_opt.snd_wscale); + newtp->max_window = newtp->snd_wnd; + + if (newtp->rx_opt.tstamp_ok) { + newtp->rx_opt.ts_recent = req->ts_recent; + newtp->rx_opt.ts_recent_stamp = get_seconds(); + newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + } else { + newtp->rx_opt.ts_recent_stamp = 0; + newtp->tcp_header_len = sizeof(struct tcphdr); + } +#ifdef CONFIG_TCP_MD5SIG + newtp->md5sig_info = NULL; /*XXX*/ + if (newtp->af_specific->md5_lookup(sk, newsk)) + newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; +#endif + if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) + newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; + newtp->rx_opt.mss_clamp = req->mss; + TCP_ECN_openreq_child(newtp, req); + + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); + } + return newsk; +} +EXPORT_SYMBOL(tcp_create_openreq_child); + +/* + * Process an incoming packet for SYN_RECV sockets represented + * as a request_sock. 
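+ * Return value, roughly sketched: the new child socket on success,
+ * NULL when the segment is dropped or answered in place, and the
+ * listening sk itself when the ACK is invalid, so that the caller
+ * ends up sending a reset.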
+ */ + +struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct request_sock **prev) +{ + struct tcp_options_received tmp_opt; + u8 *hash_location; + struct sock *child; + const struct tcphdr *th = tcp_hdr(skb); + __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); + int paws_reject = 0; + + tmp_opt.saw_tstamp = 0; + if (th->doff > (sizeof(struct tcphdr)>>2)) { + tcp_parse_options(skb, &tmp_opt, &hash_location, 0); + + if (tmp_opt.saw_tstamp) { + tmp_opt.ts_recent = req->ts_recent; + /* We do not store true stamp, but it is not required, + * it can be estimated (approximately) + * from another data. + */ + tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<retrans); + paws_reject = tcp_paws_reject(&tmp_opt, th->rst); + } + } + + /* Check for pure retransmitted SYN. */ + if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn && + flg == TCP_FLAG_SYN && + !paws_reject) { + /* + * RFC793 draws (Incorrectly! It was fixed in RFC1122) + * this case on figure 6 and figure 8, but formal + * protocol description says NOTHING. + * To be more exact, it says that we should send ACK, + * because this segment (at least, if it has no data) + * is out of window. + * + * CONCLUSION: RFC793 (even with RFC1122) DOES NOT + * describe SYN-RECV state. All the description + * is wrong, we cannot believe to it and should + * rely only on common sense and implementation + * experience. + * + * Enforce "SYN-ACK" according to figure 8, figure 6 + * of RFC793, fixed by RFC1122. + */ + req->rsk_ops->rtx_syn_ack(sk, req, NULL); + return NULL; + } + + /* Further reproduces section "SEGMENT ARRIVES" + for state SYN-RECEIVED of RFC793. + It is broken, however, it does not work only + when SYNs are crossed. + + You would think that SYN crossing is impossible here, since + we should have a SYN_SENT socket (from connect()) on our end, + but this is not true if the crossed SYNs were sent to both + ends by a malicious third party. We must defend against this, + and to do that we first verify the ACK (as per RFC793, page + 36) and reset if it is invalid. Is this a true full defense? + To convince ourselves, let us consider a way in which the ACK + test can still pass in this 'malicious crossed SYNs' case. + Malicious sender sends identical SYNs (and thus identical sequence + numbers) to both A and B: + + A: gets SYN, seq=7 + B: gets SYN, seq=7 + + By our good fortune, both A and B select the same initial + send sequence number of seven :-) + + A: sends SYN|ACK, seq=7, ack_seq=8 + B: sends SYN|ACK, seq=7, ack_seq=8 + + So we are now A eating this SYN|ACK, ACK test passes. So + does sequence test, SYN is truncated, and thus we consider + it a bare ACK. + + If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this + bare ACK. Otherwise, we create an established connection. Both + ends (listening sockets) accept the new incoming connection and try + to talk to each other. 8-) + + Note: This case is both harmless, and rare. Possibility is about the + same as us discovering intelligent life on another plant tomorrow. + + But generally, we should (RFC lies!) to accept ACK + from SYNACK both here and in tcp_rcv_state_process(). + tcp_rcv_state_process() does not, hence, we do not too. + + Note that the case is absolutely generic: + we cannot optimize anything here without + violating protocol. All the checks must be made + before attempt to create socket. + */ + + /* RFC793 page 36: "If the connection is in any non-synchronized state ... 
+ * and the incoming segment acknowledges something not yet + * sent (the segment carries an unacceptable ACK) ... + * a reset is sent." + * + * Invalid ACK: reset will be sent by listening socket + */ + if ((flg & TCP_FLAG_ACK) && + (TCP_SKB_CB(skb)->ack_seq != + tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) + return sk; + + /* Also, it would be not so bad idea to check rcv_tsecr, which + * is essentially ACK extension and too early or too late values + * should cause reset in unsynchronized states. + */ + + /* RFC793: "first check sequence number". */ + + if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, + tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) { + /* Out of window: send ACK and drop. */ + if (!(flg & TCP_FLAG_RST)) + req->rsk_ops->send_ack(sk, skb, req); + if (paws_reject) + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); + return NULL; + } + + /* In sequence, PAWS is OK. */ + + if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1)) + req->ts_recent = tmp_opt.rcv_tsval; + + if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { + /* Truncate SYN, it is out of window starting + at tcp_rsk(req)->rcv_isn + 1. */ + flg &= ~TCP_FLAG_SYN; + } + + /* RFC793: "second check the RST bit" and + * "fourth, check the SYN bit" + */ + if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); + goto embryonic_reset; + } + + /* ACK sequence verified above, just make sure ACK is + * set. If ACK not set, just silently drop the packet. + */ + if (!(flg & TCP_FLAG_ACK)) + return NULL; + + /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ + if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && + TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { + inet_rsk(req)->acked = 1; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); + return NULL; + } + + /* OK, ACK is valid, create big socket and + * feed this segment to it. It will repeat all + * the tests. THIS SEGMENT MUST MOVE SOCKET TO + * ESTABLISHED STATE. If it will be dropped after + * socket is created, wait for troubles. + */ + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); + if (child == NULL) + goto listen_overflow; + + inet_csk_reqsk_queue_unlink(sk, req, prev); + inet_csk_reqsk_queue_removed(sk, req); + + inet_csk_reqsk_queue_add(sk, req, child); + return child; + +listen_overflow: + if (!sysctl_tcp_abort_on_overflow) { + inet_rsk(req)->acked = 1; + return NULL; + } + +embryonic_reset: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); + if (!(flg & TCP_FLAG_RST)) + req->rsk_ops->send_reset(sk, skb); + + inet_csk_reqsk_queue_drop(sk, req, prev); + return NULL; +} +EXPORT_SYMBOL(tcp_check_req); + +/* + * Queue segment on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket. + */ + +int tcp_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb) +{ + int ret = 0; + int state = child->sk_state; + + if (!sock_owned_by_user(child)) { + ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), + skb->len); + /* Wakeup parent, send SIGIO */ + if (state == TCP_SYN_RECV && child->sk_state != state) + parent->sk_data_ready(parent, 0); + } else { + /* Alas, it is possible again, because we do lookup + * in main socket hash table and lock on listening + * socket does not protect us more. 
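To make the two validity checks earlier in tcp_check_req() concrete, here is a simplified standalone sketch (not part of the patch): the only acceptable ACK in SYN-RECV is snt_isn + 1, and the segment must overlap the window [rcv_isn + 1, rcv_isn + 1 + rcv_wnd). The helpers below approximate the kernel's before()/after() and tcp_in_window(); field names and values are illustrative.

#include <stdint.h>
#include <stdio.h>

/* 32-bit sequence-space comparison, same idea as the kernel's before(). */
static int seq_before(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) < 0;
}

/* SYN-RECV ACK test: anything other than snt_isn + 1 earns a reset. */
static int ack_acceptable(uint32_t ack_seq, uint32_t snt_isn)
{
        return ack_seq == snt_isn + 1;
}

/* Simplified in-window test over [rcv_isn + 1, rcv_isn + 1 + rcv_wnd). */
static int segment_in_window(uint32_t seq, uint32_t end_seq,
                             uint32_t rcv_isn, uint32_t rcv_wnd)
{
        uint32_t lo = rcv_isn + 1, hi = lo + rcv_wnd;

        return !seq_before(end_seq, lo) && seq_before(seq, hi);
}

int main(void)
{
        /* A bare ACK for our SYN-ACK (ISN 5000) landing inside the window. */
        printf("ack ok=%d, in window=%d\n",
               ack_acceptable(5001, 5000),
               segment_in_window(1001, 1001, 1000, 65535));
        return 0;
}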
+ */ + __sk_add_backlog(child, skb); + } + + bh_unlock_sock(child); + sock_put(child); + return ret; +} +EXPORT_SYMBOL(tcp_child_process); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c new file mode 100644 index 00000000..faf257b9 --- /dev/null +++ b/net/ipv4/tcp_output.c @@ -0,0 +1,2853 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Authors: Ross Biro + * Fred N. van Kempen, + * Mark Evans, + * Corey Minyard + * Florian La Roche, + * Charles Hedrick, + * Linus Torvalds, + * Alan Cox, + * Matthew Dillon, + * Arnt Gulbrandsen, + * Jorge Cwik, + */ + +/* + * Changes: Pedro Roque : Retransmit queue handled by TCP. + * : Fragmentation on mtu decrease + * : Segment collapse on retransmit + * : AF independence + * + * Linus Torvalds : send_delayed_ack + * David S. Miller : Charge memory using the right skb + * during syn/ack processing. + * David S. Miller : Output engine completely rewritten. + * Andrea Arcangeli: SYNACK carry ts_recent in tsecr. + * Cacophonix Gaul : draft-minshall-nagle-01 + * J Hadi Salim : ECN support + * + */ + +#include + +#include +#include +#include + +/* People can turn this off for buggy TCP's found in printers etc. */ +int sysctl_tcp_retrans_collapse __read_mostly = 1; + +/* People can turn this on to work with those rare, broken TCPs that + * interpret the window field as a signed quantity. + */ +int sysctl_tcp_workaround_signed_windows __read_mostly = 0; + +/* This limits the percentage of the congestion window which we + * will allow a single TSO frame to consume. Building TSO frames + * which are too large can cause TCP streams to be bursty. + */ +int sysctl_tcp_tso_win_divisor __read_mostly = 3; + +int sysctl_tcp_mtu_probing __read_mostly = 0; +int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; + +/* By default, RFC2861 behavior. */ +int sysctl_tcp_slow_start_after_idle __read_mostly = 1; + +int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ +EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); + + +/* Account for new data that has been sent to the network. */ +static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int prior_packets = tp->packets_out; + + tcp_advance_send_head(sk, skb); + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + + /* Don't override Nagle indefinitely with F-RTO */ + if (tp->frto_counter == 2) + tp->frto_counter = 3; + + tp->packets_out += tcp_skb_pcount(skb); + if (!prior_packets) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, TCP_RTO_MAX); +} + +/* SND.NXT, if window was not shrunk. + * If window has been shrunk, what should we make? It is not clear at all. + * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-( + * Anything in between SND.UNA...SND.UNA+SND.WND also can be already + * invalid. OK, let's make this for now: + */ +static inline __u32 tcp_acceptable_seq(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!before(tcp_wnd_end(tp), tp->snd_nxt)) + return tp->snd_nxt; + else + return tcp_wnd_end(tp); +} + +/* Calculate mss to advertise in SYN segment. + * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that: + * + * 1. It is independent of path mtu. + * 2. Ideally, it is maximal possible segment size i.e. 65535-40. + * 3. 
For IPv4 it is reasonable to calculate it from maximal MTU of + * attached devices, because some buggy hosts are confused by + * large MSS. + * 4. We do not make 3, we advertise MSS, calculated from first + * hop device mtu, but allow to raise it to ip_rt_min_advmss. + * This may be overridden via information stored in routing table. + * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible, + * probably even Jumbo". + */ +static __u16 tcp_advertise_mss(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = __sk_dst_get(sk); + int mss = tp->advmss; + + if (dst) { + unsigned int metric = dst_metric_advmss(dst); + + if (metric < mss) { + mss = metric; + tp->advmss = mss; + } + } + + return (__u16)mss; +} + +/* RFC2861. Reset CWND after idle period longer RTO to "restart window". + * This is the first part of cwnd validation mechanism. */ +static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) +{ + struct tcp_sock *tp = tcp_sk(sk); + s32 delta = tcp_time_stamp - tp->lsndtime; + u32 restart_cwnd = tcp_init_cwnd(tp, dst); + u32 cwnd = tp->snd_cwnd; + + tcp_ca_event(sk, CA_EVENT_CWND_RESTART); + + tp->snd_ssthresh = tcp_current_ssthresh(sk); + restart_cwnd = min(restart_cwnd, cwnd); + + while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) + cwnd >>= 1; + tp->snd_cwnd = max(cwnd, restart_cwnd); + tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_used = 0; +} + +/* Congestion state accounting after a packet has been sent. */ +static void tcp_event_data_sent(struct tcp_sock *tp, + struct sk_buff *skb, struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + const u32 now = tcp_time_stamp; + + if (sysctl_tcp_slow_start_after_idle && + (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)) + tcp_cwnd_restart(sk, __sk_dst_get(sk)); + + tp->lsndtime = now; + + /* If it is a reply for ato after last received + * packet, enter pingpong mode. + */ + if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) + icsk->icsk_ack.pingpong = 1; +} + +/* Account for an ACK we sent. */ +static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) +{ + tcp_dec_quickack_mode(sk, pkts); + inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); +} + +/* Determine a window scaling and initial window to offer. + * Based on the assumption that the given amount of space + * will be offered. Store the results in the tp structure. + * NOTE: for smooth operation initial space offering should + * be a multiple of mss if possible. We assume here that mss >= 1. + * This MUST be enforced by all callers. + */ +void tcp_select_initial_window(int __space, __u32 mss, + __u32 *rcv_wnd, __u32 *window_clamp, + int wscale_ok, __u8 *rcv_wscale, + __u32 init_rcv_wnd) +{ + unsigned int space = (__space < 0 ? 0 : __space); + + /* If no clamp set the clamp to the max possible scaled window */ + if (*window_clamp == 0) + (*window_clamp) = (65535 << 14); + space = min(*window_clamp, space); + + /* Quantize space offering to a multiple of mss if possible. */ + if (space > mss) + space = (space / mss) * mss; + + /* NOTE: offering an initial window larger than 32767 + * will break some buggy TCP stacks. If the admin tells us + * it is likely we could be speaking with such a buggy stack + * we will truncate our initial window offering to 32K-1 + * unless the remote has sent us a window scaling option, + * which we interpret as a sign the remote TCP is not + * misinterpreting the window field as a signed quantity. 
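The window-scale selection that tcp_select_initial_window() performs a little further down can be seen in isolation in the sketch below; it is not part of the patch, and the 1 MiB buffer is just an illustrative value.

#include <stdio.h>

int main(void)
{
        unsigned int space = 1u << 20;  /* e.g. a 1 MiB receive buffer */
        unsigned int wscale = 0;

        /* Same shape as the loop in tcp_select_initial_window(): shift the
         * space until it fits in the 16-bit window field, but never use a
         * scale larger than 14 (the RFC 1323 limit).
         */
        while (space > 65535 && wscale < 14) {
                space >>= 1;
                wscale++;
        }

        printf("advertise %u with wscale %u\n", space, wscale);
        return 0;
}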
+ */ + if (sysctl_tcp_workaround_signed_windows) + (*rcv_wnd) = min(space, MAX_TCP_WINDOW); + else + (*rcv_wnd) = space; + + (*rcv_wscale) = 0; + if (wscale_ok) { + /* Set window scaling on max possible window + * See RFC1323 for an explanation of the limit to 14 + */ + space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max); + space = min_t(u32, space, *window_clamp); + while (space > 65535 && (*rcv_wscale) < 14) { + space >>= 1; + (*rcv_wscale)++; + } + } + + /* Set initial window to a value enough for senders starting with + * initial congestion window of TCP_DEFAULT_INIT_RCVWND. Place + * a limit on the initial window when mss is larger than 1460. + */ + if (mss > (1 << *rcv_wscale)) { + int init_cwnd = TCP_DEFAULT_INIT_RCVWND; + if (mss > 1460) + init_cwnd = + max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2); + /* when initializing use the value from init_rcv_wnd + * rather than the default from above + */ + if (init_rcv_wnd) + *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); + else + *rcv_wnd = min(*rcv_wnd, init_cwnd * mss); + } + + /* Set the clamp no higher than max representable value */ + (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp); +} +EXPORT_SYMBOL(tcp_select_initial_window); + +/* Chose a new window to advertise, update state in tcp_sock for the + * socket, and return result with RFC1323 scaling applied. The return + * value can be stuffed directly into th->window for an outgoing + * frame. + */ +static u16 tcp_select_window(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 cur_win = tcp_receive_window(tp); + u32 new_win = __tcp_select_window(sk); + + /* Never shrink the offered window */ + if (new_win < cur_win) { + /* Danger Will Robinson! + * Don't update rcv_wup/rcv_wnd here or else + * we will not be able to advertise a zero + * window in time. --DaveM + * + * Relax Will Robinson. + */ + new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); + } + tp->rcv_wnd = new_win; + tp->rcv_wup = tp->rcv_nxt; + + /* Make sure we do not exceed the maximum possible + * scaled window. + */ + if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows) + new_win = min(new_win, MAX_TCP_WINDOW); + else + new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); + + /* RFC1323 scaling applied */ + new_win >>= tp->rx_opt.rcv_wscale; + + /* If we advertise zero window, disable fast path. */ + if (new_win == 0) + tp->pred_flags = 0; + + return new_win; +} + +/* Packet ECN state for a SYN-ACK */ +static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb) +{ + TCP_SKB_CB(skb)->flags &= ~TCPHDR_CWR; + if (!(tp->ecn_flags & TCP_ECN_OK)) + TCP_SKB_CB(skb)->flags &= ~TCPHDR_ECE; +} + +/* Packet ECN state for a SYN. */ +static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->ecn_flags = 0; + if (sysctl_tcp_ecn == 1) { + TCP_SKB_CB(skb)->flags |= TCPHDR_ECE | TCPHDR_CWR; + tp->ecn_flags = TCP_ECN_OK; + } +} + +static __inline__ void +TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th) +{ + if (inet_rsk(req)->ecn_ok) + th->ece = 1; +} + +/* Set up ECN state for a packet on a ESTABLISHED socket that is about to + * be sent. + */ +static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, + int tcp_header_len) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->ecn_flags & TCP_ECN_OK) { + /* Not-retransmitted data segment: set ECT and inject CWR. 
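Before the ECN helpers continue below, the final shift in tcp_select_window() above is worth spelling out: the 16-bit header field carries the chosen window right-shifted by the negotiated receive scale, and the peer shifts it back up. A small sketch with illustrative numbers (not part of the patch):

#include <stdio.h>

int main(void)
{
        unsigned int new_win = 262144;   /* window chosen in bytes          */
        unsigned int rcv_wscale = 7;     /* scale agreed during the SYN     */

        /* What actually goes into th->window, and what the peer reads back. */
        unsigned short on_wire = (unsigned short)(new_win >> rcv_wscale);

        printf("th->window = %u (peer reconstructs %u bytes)\n",
               on_wire, (unsigned int)on_wire << rcv_wscale);
        return 0;
}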
*/ + if (skb->len != tcp_header_len && + !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { + INET_ECN_xmit(sk); + if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) { + tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; + tcp_hdr(skb)->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } + } else { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } + if (tp->ecn_flags & TCP_ECN_DEMAND_CWR) + tcp_hdr(skb)->ece = 1; + } +} + +/* Constructs common control bits of non-data skb. If SYN/FIN is present, + * auto increment end seqno. + */ +static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) +{ + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum = 0; + + TCP_SKB_CB(skb)->flags = flags; + TCP_SKB_CB(skb)->sacked = 0; + + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; + + TCP_SKB_CB(skb)->seq = seq; + if (flags & (TCPHDR_SYN | TCPHDR_FIN)) + seq++; + TCP_SKB_CB(skb)->end_seq = seq; +} + +static inline int tcp_urg_mode(const struct tcp_sock *tp) +{ + return tp->snd_una != tp->snd_up; +} + +#define OPTION_SACK_ADVERTISE (1 << 0) +#define OPTION_TS (1 << 1) +#define OPTION_MD5 (1 << 2) +#define OPTION_WSCALE (1 << 3) +#define OPTION_COOKIE_EXTENSION (1 << 4) + +struct tcp_out_options { + u8 options; /* bit field of OPTION_* */ + u8 ws; /* window scale, 0 to disable */ + u8 num_sack_blocks; /* number of SACK blocks to include */ + u8 hash_size; /* bytes in hash_location */ + u16 mss; /* 0 to disable */ + __u32 tsval, tsecr; /* need to include OPTION_TS */ + __u8 *hash_location; /* temporary pointer, overloaded */ +}; + +/* The sysctl int routines are generic, so check consistency here. + */ +static u8 tcp_cookie_size_check(u8 desired) +{ + int cookie_size; + + if (desired > 0) + /* previously specified */ + return desired; + + cookie_size = ACCESS_ONCE(sysctl_tcp_cookie_size); + if (cookie_size <= 0) + /* no default specified */ + return 0; + + if (cookie_size <= TCP_COOKIE_MIN) + /* value too small, specify minimum */ + return TCP_COOKIE_MIN; + + if (cookie_size >= TCP_COOKIE_MAX) + /* value too large, specify maximum */ + return TCP_COOKIE_MAX; + + if (cookie_size & 1) + /* 8-bit multiple, illegal, fix it */ + cookie_size++; + + return (u8)cookie_size; +} + +/* Write previously computed TCP options to the packet. + * + * Beware: Something in the Internet is very sensitive to the ordering of + * TCP options, we learned this through the hard way, so be careful here. + * Luckily we can at least blame others for their non-compliance but from + * inter-operatibility perspective it seems that we're somewhat stuck with + * the ordering which we have been using if we want to keep working with + * those broken things (not that it currently hurts anybody as there isn't + * particular reason why the ordering would need to be changed). + * + * At least SACK_PERM as the first option is known to lead to a disaster + * (but it may well be that other scenarios fail similarly). + */ +static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, + struct tcp_out_options *opts) +{ + u8 options = opts->options; /* mungable copy */ + + /* Having both authentication and cookies for security is redundant, + * and there's certainly not enough room. Instead, the cookie-less + * extension variant is proposed. + * + * Consider the pessimal case with authentication. 
The options + * could look like: + * COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40 + */ + if (unlikely(OPTION_MD5 & options)) { + if (unlikely(OPTION_COOKIE_EXTENSION & options)) { + *ptr++ = htonl((TCPOPT_COOKIE << 24) | + (TCPOLEN_COOKIE_BASE << 16) | + (TCPOPT_MD5SIG << 8) | + TCPOLEN_MD5SIG); + } else { + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_MD5SIG << 8) | + TCPOLEN_MD5SIG); + } + options &= ~OPTION_COOKIE_EXTENSION; + /* overload cookie hash location */ + opts->hash_location = (__u8 *)ptr; + ptr += 4; + } + + if (unlikely(opts->mss)) { + *ptr++ = htonl((TCPOPT_MSS << 24) | + (TCPOLEN_MSS << 16) | + opts->mss); + } + + if (likely(OPTION_TS & options)) { + if (unlikely(OPTION_SACK_ADVERTISE & options)) { + *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | + (TCPOLEN_SACK_PERM << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + options &= ~OPTION_SACK_ADVERTISE; + } else { + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + } + *ptr++ = htonl(opts->tsval); + *ptr++ = htonl(opts->tsecr); + } + + /* Specification requires after timestamp, so do it now. + * + * Consider the pessimal case without authentication. The options + * could look like: + * MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40 + */ + if (unlikely(OPTION_COOKIE_EXTENSION & options)) { + __u8 *cookie_copy = opts->hash_location; + u8 cookie_size = opts->hash_size; + + /* 8-bit multiple handled in tcp_cookie_size_check() above, + * and elsewhere. + */ + if (0x2 & cookie_size) { + __u8 *p = (__u8 *)ptr; + + /* 16-bit multiple */ + *p++ = TCPOPT_COOKIE; + *p++ = TCPOLEN_COOKIE_BASE + cookie_size; + *p++ = *cookie_copy++; + *p++ = *cookie_copy++; + ptr++; + cookie_size -= 2; + } else { + /* 32-bit multiple */ + *ptr++ = htonl(((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_COOKIE << 8) | + TCPOLEN_COOKIE_BASE) + + cookie_size); + } + + if (cookie_size > 0) { + memcpy(ptr, cookie_copy, cookie_size); + ptr += (cookie_size / 4); + } + } + + if (unlikely(OPTION_SACK_ADVERTISE & options)) { + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_SACK_PERM << 8) | + TCPOLEN_SACK_PERM); + } + + if (unlikely(OPTION_WSCALE & options)) { + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_WINDOW << 16) | + (TCPOLEN_WINDOW << 8) | + opts->ws); + } + + if (unlikely(opts->num_sack_blocks)) { + struct tcp_sack_block *sp = tp->rx_opt.dsack ? + tp->duplicate_sack : tp->selective_acks; + int this_sack; + + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_SACK << 8) | + (TCPOLEN_SACK_BASE + (opts->num_sack_blocks * + TCPOLEN_SACK_PERBLOCK))); + + for (this_sack = 0; this_sack < opts->num_sack_blocks; + ++this_sack) { + *ptr++ = htonl(sp[this_sack].start_seq); + *ptr++ = htonl(sp[this_sack].end_seq); + } + + tp->rx_opt.dsack = 0; + } +} + +/* Compute TCP options for SYN packets. This is not the final + * network wire format yet. + */ +static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, + struct tcp_out_options *opts, + struct tcp_md5sig_key **md5) { + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_cookie_values *cvp = tp->cookie_values; + unsigned remaining = MAX_TCP_OPTION_SPACE; + u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? 
+ tcp_cookie_size_check(cvp->cookie_desired) : + 0; + +#ifdef CONFIG_TCP_MD5SIG + *md5 = tp->af_specific->md5_lookup(sk, sk); + if (*md5) { + opts->options |= OPTION_MD5; + remaining -= TCPOLEN_MD5SIG_ALIGNED; + } +#else + *md5 = NULL; +#endif + + /* We always get an MSS option. The option bytes which will be seen in + * normal data packets should timestamps be used, must be in the MSS + * advertised. But we subtract them from tp->mss_cache so that + * calculations in tcp_sendmsg are simpler etc. So account for this + * fact here if necessary. If we don't do this correctly, as a + * receiver we won't recognize data packets as being full sized when we + * should, and thus we won't abide by the delayed ACK rules correctly. + * SACKs don't matter, we never delay an ACK when we have any of those + * going out. */ + opts->mss = tcp_advertise_mss(sk); + remaining -= TCPOLEN_MSS_ALIGNED; + + if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { + opts->options |= OPTION_TS; + opts->tsval = TCP_SKB_CB(skb)->when; + opts->tsecr = tp->rx_opt.ts_recent; + remaining -= TCPOLEN_TSTAMP_ALIGNED; + } + if (likely(sysctl_tcp_window_scaling)) { + opts->ws = tp->rx_opt.rcv_wscale; + opts->options |= OPTION_WSCALE; + remaining -= TCPOLEN_WSCALE_ALIGNED; + } + if (likely(sysctl_tcp_sack)) { + opts->options |= OPTION_SACK_ADVERTISE; + if (unlikely(!(OPTION_TS & opts->options))) + remaining -= TCPOLEN_SACKPERM_ALIGNED; + } + + /* Note that timestamps are required by the specification. + * + * Odd numbers of bytes are prohibited by the specification, ensuring + * that the cookie is 16-bit aligned, and the resulting cookie pair is + * 32-bit aligned. + */ + if (*md5 == NULL && + (OPTION_TS & opts->options) && + cookie_size > 0) { + int need = TCPOLEN_COOKIE_BASE + cookie_size; + + if (0x2 & need) { + /* 32-bit multiple */ + need += 2; /* NOPs */ + + if (need > remaining) { + /* try shrinking cookie to fit */ + cookie_size -= 2; + need -= 4; + } + } + while (need > remaining && TCP_COOKIE_MIN <= cookie_size) { + cookie_size -= 4; + need -= 4; + } + if (TCP_COOKIE_MIN <= cookie_size) { + opts->options |= OPTION_COOKIE_EXTENSION; + opts->hash_location = (__u8 *)&cvp->cookie_pair[0]; + opts->hash_size = cookie_size; + + /* Remember for future incarnations. */ + cvp->cookie_desired = cookie_size; + + if (cvp->cookie_desired != cvp->cookie_pair_size) { + /* Currently use random bytes as a nonce, + * assuming these are completely unpredictable + * by hostile users of the same system. + */ + get_random_bytes(&cvp->cookie_pair[0], + cookie_size); + cvp->cookie_pair_size = cookie_size; + } + + remaining -= need; + } + } + return MAX_TCP_OPTION_SPACE - remaining; +} + +/* Set up TCP options for SYN-ACKs. */ +static unsigned tcp_synack_options(struct sock *sk, + struct request_sock *req, + unsigned mss, struct sk_buff *skb, + struct tcp_out_options *opts, + struct tcp_md5sig_key **md5, + struct tcp_extend_values *xvp) +{ + struct inet_request_sock *ireq = inet_rsk(req); + unsigned remaining = MAX_TCP_OPTION_SPACE; + u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ? + xvp->cookie_plus : + 0; + +#ifdef CONFIG_TCP_MD5SIG + *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); + if (*md5) { + opts->options |= OPTION_MD5; + remaining -= TCPOLEN_MD5SIG_ALIGNED; + + /* We can't fit any SACK blocks in a packet with MD5 + TS + * options. There was discussion about disabling SACK + * rather than TS in order to fit in better with old, + * buggy kernels, but that was deemed to be unnecessary. 
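The option-space accounting in tcp_syn_options() above is simple arithmetic over the 40-byte TCP option area. The sketch below is not part of the patch; the locally defined sizes mirror the kernel's TCPOLEN_*_ALIGNED constants.

#include <stdio.h>

#define OPTION_SPACE     40   /* MAX_TCP_OPTION_SPACE                    */
#define MSS_ALIGNED       4
#define TSTAMP_ALIGNED   12   /* timestamps plus the two leading NOPs    */
#define WSCALE_ALIGNED    4
#define SACKPERM_ALIGNED  4   /* only charged when timestamps are absent */

int main(void)
{
        int remaining = OPTION_SPACE;

        remaining -= MSS_ALIGNED;     /* MSS is always sent on a SYN        */
        remaining -= TSTAMP_ALIGNED;  /* SACK-permitted rides inside TS     */
        remaining -= WSCALE_ALIGNED;

        /* 20 bytes remain, matching the COOKIE|MD5(20) budget quoted in the
         * tcp_options_write() comment earlier in this file.
         */
        printf("bytes left for MD5/cookie options: %d\n", remaining);
        return 0;
}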
+ */ + ireq->tstamp_ok &= !ireq->sack_ok; + } +#else + *md5 = NULL; +#endif + + /* We always send an MSS option. */ + opts->mss = mss; + remaining -= TCPOLEN_MSS_ALIGNED; + + if (likely(ireq->wscale_ok)) { + opts->ws = ireq->rcv_wscale; + opts->options |= OPTION_WSCALE; + remaining -= TCPOLEN_WSCALE_ALIGNED; + } + if (likely(ireq->tstamp_ok)) { + opts->options |= OPTION_TS; + opts->tsval = TCP_SKB_CB(skb)->when; + opts->tsecr = req->ts_recent; + remaining -= TCPOLEN_TSTAMP_ALIGNED; + } + if (likely(ireq->sack_ok)) { + opts->options |= OPTION_SACK_ADVERTISE; + if (unlikely(!ireq->tstamp_ok)) + remaining -= TCPOLEN_SACKPERM_ALIGNED; + } + + /* Similar rationale to tcp_syn_options() applies here, too. + * If the options fit, the same options should fit now! + */ + if (*md5 == NULL && + ireq->tstamp_ok && + cookie_plus > TCPOLEN_COOKIE_BASE) { + int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */ + + if (0x2 & need) { + /* 32-bit multiple */ + need += 2; /* NOPs */ + } + if (need <= remaining) { + opts->options |= OPTION_COOKIE_EXTENSION; + opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE; + remaining -= need; + } else { + /* There's no error return, so flag it. */ + xvp->cookie_out_never = 1; /* true */ + opts->hash_size = 0; + } + } + return MAX_TCP_OPTION_SPACE - remaining; +} + +/* Compute TCP options for ESTABLISHED sockets. This is not the + * final wire format yet. + */ +static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, + struct tcp_out_options *opts, + struct tcp_md5sig_key **md5) { + struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; + struct tcp_sock *tp = tcp_sk(sk); + unsigned size = 0; + unsigned int eff_sacks; + +#ifdef CONFIG_TCP_MD5SIG + *md5 = tp->af_specific->md5_lookup(sk, sk); + if (unlikely(*md5)) { + opts->options |= OPTION_MD5; + size += TCPOLEN_MD5SIG_ALIGNED; + } +#else + *md5 = NULL; +#endif + + if (likely(tp->rx_opt.tstamp_ok)) { + opts->options |= OPTION_TS; + opts->tsval = tcb ? tcb->when : 0; + opts->tsecr = tp->rx_opt.ts_recent; + size += TCPOLEN_TSTAMP_ALIGNED; + } + + eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; + if (unlikely(eff_sacks)) { + const unsigned remaining = MAX_TCP_OPTION_SPACE - size; + opts->num_sack_blocks = + min_t(unsigned, eff_sacks, + (remaining - TCPOLEN_SACK_BASE_ALIGNED) / + TCPOLEN_SACK_PERBLOCK); + size += TCPOLEN_SACK_BASE_ALIGNED + + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; + } + + return size; +} + +/* This routine actually transmits TCP packets queued in by + * tcp_do_sendmsg(). This is used by both the initial + * transmission and possible later retransmissions. + * All SKB's seen here are completely headerless. It is our + * job to build the TCP header, and pass the packet down to + * IP so it can do the same plus pass the packet off to the + * device. + * + * We are working here with either a clone of the original + * SKB, or a fresh unique copy made by the retransmit engine. + */ +static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, + gfp_t gfp_mask) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_sock *inet; + struct tcp_sock *tp; + struct tcp_skb_cb *tcb; + struct tcp_out_options opts; + unsigned tcp_options_size, tcp_header_size; + struct tcp_md5sig_key *md5; + struct tcphdr *th; + int err; + + BUG_ON(!skb || !tcp_skb_pcount(skb)); + + /* If congestion control is doing timestamping, we must + * take such a timestamp before we potentially clone/copy. 
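Stepping back to tcp_established_options() above, the SACK-block clamp is another small piece of arithmetic worth showing on its own. The sketch below is illustrative only; the 4- and 8-byte figures mirror TCPOLEN_SACK_BASE_ALIGNED and TCPOLEN_SACK_PERBLOCK.

#include <stdio.h>

int main(void)
{
        int option_space = 40;
        int size = 12;           /* timestamps already accounted for       */
        int eff_sacks = 5;       /* queued SACK + DSACK blocks             */

        int remaining = option_space - size;
        int fit = (remaining - 4) / 8;   /* base header + 8 bytes per block */
        int blocks = eff_sacks < fit ? eff_sacks : fit;

        /* With timestamps present only 3 blocks fit: 12 + 4 + 3*8 = 40. */
        printf("SACK blocks that fit: %d\n", blocks);
        return 0;
}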
+ */ + if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP) + __net_timestamp(skb); + + if (likely(clone_it)) { + if (unlikely(skb_cloned(skb))) + skb = pskb_copy(skb, gfp_mask); + else + skb = skb_clone(skb, gfp_mask); + if (unlikely(!skb)) + return -ENOBUFS; + } + + inet = inet_sk(sk); + tp = tcp_sk(sk); + tcb = TCP_SKB_CB(skb); + memset(&opts, 0, sizeof(opts)); + + if (unlikely(tcb->flags & TCPHDR_SYN)) + tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); + else + tcp_options_size = tcp_established_options(sk, skb, &opts, + &md5); + tcp_header_size = tcp_options_size + sizeof(struct tcphdr); + + if (tcp_packets_in_flight(tp) == 0) { + tcp_ca_event(sk, CA_EVENT_TX_START); + skb->ooo_okay = 1; + } else + skb->ooo_okay = 0; + + skb_push(skb, tcp_header_size); + skb_reset_transport_header(skb); + skb_set_owner_w(skb, sk); + + /* Build TCP header and checksum it. */ + th = tcp_hdr(skb); + th->source = inet->inet_sport; + th->dest = inet->inet_dport; + th->seq = htonl(tcb->seq); + th->ack_seq = htonl(tp->rcv_nxt); + *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | + tcb->flags); + + if (unlikely(tcb->flags & TCPHDR_SYN)) { + /* RFC1323: The window in SYN & SYN/ACK segments + * is never scaled. + */ + th->window = htons(min(tp->rcv_wnd, 65535U)); + } else { + th->window = htons(tcp_select_window(sk)); + } + th->check = 0; + th->urg_ptr = 0; + + /* The urg_mode check is necessary during a below snd_una win probe */ + if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) { + if (before(tp->snd_up, tcb->seq + 0x10000)) { + th->urg_ptr = htons(tp->snd_up - tcb->seq); + th->urg = 1; + } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) { + th->urg_ptr = htons(0xFFFF); + th->urg = 1; + } + } + + tcp_options_write((__be32 *)(th + 1), tp, &opts); + if (likely((tcb->flags & TCPHDR_SYN) == 0)) + TCP_ECN_send(sk, skb, tcp_header_size); + +#ifdef CONFIG_TCP_MD5SIG + /* Calculate the MD5 hash, as we have all we need now */ + if (md5) { + sk_nocaps_add(sk, NETIF_F_GSO_MASK); + tp->af_specific->calc_md5_hash(opts.hash_location, + md5, sk, NULL, skb); + } +#endif + + icsk->icsk_af_ops->send_check(sk, skb); + + if (likely(tcb->flags & TCPHDR_ACK)) + tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); + + if (skb->len != tcp_header_size) + tcp_event_data_sent(tp, skb, sk); + + if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) + TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, + tcp_skb_pcount(skb)); + + err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl); + if (likely(err <= 0)) + return err; + + tcp_enter_cwr(sk, 1); + + return net_xmit_eval(err); +} + +/* This routine just queues the buffer for sending. + * + * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, + * otherwise socket can stall. + */ +static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Advance write_seq and place onto the write_queue. */ + tp->write_seq = TCP_SKB_CB(skb)->end_seq; + skb_header_release(skb); + tcp_add_write_queue_tail(sk, skb); + sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(sk, skb->truesize); +} + +/* Initialize TSO segments for a packet. */ +static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, + unsigned int mss_now) +{ + if (skb->len <= mss_now || !sk_can_gso(sk) || + skb->ip_summed == CHECKSUM_NONE) { + /* Avoid the costly divide in the normal + * non-TSO case. 
+ */ + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; + } else { + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now); + skb_shinfo(skb)->gso_size = mss_now; + skb_shinfo(skb)->gso_type = sk->sk_gso_type; + } +} + +/* When a modification to fackets out becomes necessary, we need to check + * skb is counted to fackets_out or not. + */ +static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb, + int decr) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!tp->sacked_out || tcp_is_reno(tp)) + return; + + if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq)) + tp->fackets_out -= decr; +} + +/* Pcount in the middle of the write queue got changed, we need to do various + * tweaks to fix counters + */ +static void tcp_adjust_pcount(struct sock *sk, struct sk_buff *skb, int decr) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->packets_out -= decr; + + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) + tp->sacked_out -= decr; + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) + tp->retrans_out -= decr; + if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) + tp->lost_out -= decr; + + /* Reno case is special. Sigh... */ + if (tcp_is_reno(tp) && decr > 0) + tp->sacked_out -= min_t(u32, tp->sacked_out, decr); + + tcp_adjust_fackets_out(sk, skb, decr); + + if (tp->lost_skb_hint && + before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && + (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))) + tp->lost_cnt_hint -= decr; + + tcp_verify_left_out(tp); +} + +/* Function to create two new TCP segments. Shrinks the given segment + * to the specified size and appends a new segment with the rest of the + * packet to the list. This won't be called frequently, I hope. + * Remember, these are still headerless SKBs at this point. + */ +int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, + unsigned int mss_now) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; + int nsize, old_factor; + int nlen; + u8 flags; + + if (WARN_ON(len > skb->len)) + return -EINVAL; + + nsize = skb_headlen(skb) - len; + if (nsize < 0) + nsize = 0; + + if (skb_cloned(skb) && + skb_is_nonlinear(skb) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + return -ENOMEM; + + /* Get a new skb... force flag on. */ + buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); + if (buff == NULL) + return -ENOMEM; /* We'll just try again later. */ + + sk->sk_wmem_queued += buff->truesize; + sk_mem_charge(sk, buff->truesize); + nlen = skb->len - len - nsize; + buff->truesize += nlen; + skb->truesize -= nlen; + + /* Correct the sequence numbers. */ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; + + /* PSH and FIN should only be set in the second packet. */ + flags = TCP_SKB_CB(skb)->flags; + TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); + TCP_SKB_CB(buff)->flags = flags; + TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; + + if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) { + /* Copy and checksum data tail into the new buffer. 
*/ + buff->csum = csum_partial_copy_nocheck(skb->data + len, + skb_put(buff, nsize), + nsize, 0); + + skb_trim(skb, len); + + skb->csum = csum_block_sub(skb->csum, buff->csum, len); + } else { + skb->ip_summed = CHECKSUM_PARTIAL; + skb_split(skb, buff, len); + } + + buff->ip_summed = skb->ip_summed; + + /* Looks stupid, but our code really uses when of + * skbs, which it never sent before. --ANK + */ + TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; + buff->tstamp = skb->tstamp; + + old_factor = tcp_skb_pcount(skb); + + /* Fix up tso_factor for both original and new SKB. */ + tcp_set_skb_tso_segs(sk, skb, mss_now); + tcp_set_skb_tso_segs(sk, buff, mss_now); + + /* If this packet has been sent out already, we must + * adjust the various packet counters. + */ + if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) { + int diff = old_factor - tcp_skb_pcount(skb) - + tcp_skb_pcount(buff); + + if (diff) + tcp_adjust_pcount(sk, skb, diff); + } + + /* Link BUFF into the send queue. */ + skb_header_release(buff); + tcp_insert_write_queue_after(skb, buff, sk); + + return 0; +} + +/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c + * eventually). The difference is that pulled data not copied, but + * immediately discarded. + */ +static void __pskb_trim_head(struct sk_buff *skb, int len) +{ + int i, k, eat; + + eat = len; + k = 0; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + if (skb_shinfo(skb)->frags[i].size <= eat) { + put_page(skb_shinfo(skb)->frags[i].page); + eat -= skb_shinfo(skb)->frags[i].size; + } else { + skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; + if (eat) { + skb_shinfo(skb)->frags[k].page_offset += eat; + skb_shinfo(skb)->frags[k].size -= eat; + eat = 0; + } + k++; + } + } + skb_shinfo(skb)->nr_frags = k; + + skb_reset_tail_pointer(skb); + skb->data_len -= len; + skb->len = skb->data_len; +} + +/* Remove acked data from a packet in the transmit queue. */ +int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) +{ + if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + return -ENOMEM; + + /* If len == headlen, we avoid __skb_pull to preserve alignment. */ + if (unlikely(len < skb_headlen(skb))) + __skb_pull(skb, len); + else + __pskb_trim_head(skb, len - skb_headlen(skb)); + + TCP_SKB_CB(skb)->seq += len; + skb->ip_summed = CHECKSUM_PARTIAL; + + skb->truesize -= len; + sk->sk_wmem_queued -= len; + sk_mem_uncharge(sk, len); + sock_set_flag(sk, SOCK_QUEUE_SHRUNK); + + /* Any change of skb->len requires recalculation of tso factor. */ + if (tcp_skb_pcount(skb) > 1) + tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); + + return 0; +} + +/* Calculate MSS. Not accounting for SACKs here. 
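The MSS derivation done by tcp_mtu_to_mss() just below reduces to familiar header arithmetic. Here is a minimal sketch with the usual IPv4/Ethernet numbers (illustrative, not part of the patch):

#include <stdio.h>

int main(void)
{
        int pmtu = 1500;        /* typical Ethernet path MTU            */
        int ip_header = 20;     /* IPv4 header, no options              */
        int tcp_header = 20;    /* base TCP header                      */
        int tcp_opts = 12;      /* timestamps carried on every segment  */

        int mss = pmtu - ip_header - tcp_header;  /* value advertised in the MSS option */
        int payload = mss - tcp_opts;             /* data actually carried per segment  */

        printf("advertised MSS %d, per-segment payload %d\n", mss, payload);
        return 0;
}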
*/ +int tcp_mtu_to_mss(struct sock *sk, int pmtu) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + int mss_now; + + /* Calculate base mss without TCP options: + It is MMS_S - sizeof(tcphdr) of rfc1122 + */ + mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); + + /* Clamp it (mss_clamp does not include tcp options) */ + if (mss_now > tp->rx_opt.mss_clamp) + mss_now = tp->rx_opt.mss_clamp; + + /* Now subtract optional transport overhead */ + mss_now -= icsk->icsk_ext_hdr_len; + + /* Then reserve room for full set of TCP options and 8 bytes of data */ + if (mss_now < 48) + mss_now = 48; + + /* Now subtract TCP options size, not including SACKs */ + mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); + + return mss_now; +} + +/* Inverse of above */ +int tcp_mss_to_mtu(struct sock *sk, int mss) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + int mtu; + + mtu = mss + + tp->tcp_header_len + + icsk->icsk_ext_hdr_len + + icsk->icsk_af_ops->net_header_len; + + return mtu; +} + +/* MTU probing init per socket */ +void tcp_mtup_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1; + icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + + icsk->icsk_af_ops->net_header_len; + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss); + icsk->icsk_mtup.probe_size = 0; +} +EXPORT_SYMBOL(tcp_mtup_init); + +/* This function synchronize snd mss to current pmtu/exthdr set. + + tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts + for TCP options, but includes only bare TCP header. + + tp->rx_opt.mss_clamp is mss negotiated at connection setup. + It is minimum of user_mss and mss received with SYN. + It also does not include TCP options. + + inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function. + + tp->mss_cache is current effective sending mss, including + all tcp options except for SACKs. It is evaluated, + taking into account current pmtu, but never exceeds + tp->rx_opt.mss_clamp. + + NOTE1. rfc1122 clearly states that advertised MSS + DOES NOT include either tcp or ip options. + + NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache + are READ ONLY outside this function. --ANK (980731) + */ +unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + int mss_now; + + if (icsk->icsk_mtup.search_high > pmtu) + icsk->icsk_mtup.search_high = pmtu; + + mss_now = tcp_mtu_to_mss(sk, pmtu); + mss_now = tcp_bound_to_half_wnd(tp, mss_now); + + /* And store cached results */ + icsk->icsk_pmtu_cookie = pmtu; + if (icsk->icsk_mtup.enabled) + mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)); + tp->mss_cache = mss_now; + + return mss_now; +} +EXPORT_SYMBOL(tcp_sync_mss); + +/* Compute the current effective MSS, taking SACKs and IP options, + * and even PMTU discovery events into account. 
+ */ +unsigned int tcp_current_mss(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = __sk_dst_get(sk); + u32 mss_now; + unsigned header_len; + struct tcp_out_options opts; + struct tcp_md5sig_key *md5; + + mss_now = tp->mss_cache; + + if (dst) { + u32 mtu = dst_mtu(dst); + if (mtu != inet_csk(sk)->icsk_pmtu_cookie) + mss_now = tcp_sync_mss(sk, mtu); + } + + header_len = tcp_established_options(sk, NULL, &opts, &md5) + + sizeof(struct tcphdr); + /* The mss_cache is sized based on tp->tcp_header_len, which assumes + * some common options. If this is an odd packet (because we have SACK + * blocks etc) then our calculated header_len will be different, and + * we have to adjust mss_now correspondingly */ + if (header_len != tp->tcp_header_len) { + int delta = (int) header_len - tp->tcp_header_len; + mss_now -= delta; + } + + return mss_now; +} + +/* Congestion window validation. (RFC2861) */ +static void tcp_cwnd_validate(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->packets_out >= tp->snd_cwnd) { + /* Network is feed fully. */ + tp->snd_cwnd_used = 0; + tp->snd_cwnd_stamp = tcp_time_stamp; + } else { + /* Network starves. */ + if (tp->packets_out > tp->snd_cwnd_used) + tp->snd_cwnd_used = tp->packets_out; + + if (sysctl_tcp_slow_start_after_idle && + (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto) + tcp_cwnd_application_limited(sk); + } +} + +/* Returns the portion of skb which can be sent right away without + * introducing MSS oddities to segment boundaries. In rare cases where + * mss_now != mss_cache, we will request caller to create a small skb + * per input skb which could be mostly avoided here (if desired). + * + * We explicitly want to create a request for splitting write queue tail + * to a small skb for Nagle purposes while avoiding unnecessary modulos, + * thus all the complexity (cwnd_len is always MSS multiple which we + * return whenever allowed by the other factors). Basically we need the + * modulo only when the receiver window alone is the limiting factor or + * when we would be allowed to send the split-due-to-Nagle skb fully. + */ +static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb, + unsigned int mss_now, unsigned int cwnd) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 needed, window, cwnd_len; + + window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; + cwnd_len = mss_now * cwnd; + + if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk))) + return cwnd_len; + + needed = min(skb->len, window); + + if (cwnd_len <= needed) + return cwnd_len; + + return needed - needed % mss_now; +} + +/* Can at least one segment of SKB be sent right now, according to the + * congestion window rules? If so, return how many segments are allowed. + */ +static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, + struct sk_buff *skb) +{ + u32 in_flight, cwnd; + + /* Don't be strict about the congestion window for the final FIN. */ + if ((TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && tcp_skb_pcount(skb) == 1) + return 1; + + in_flight = tcp_packets_in_flight(tp); + cwnd = tp->snd_cwnd; + if (in_flight < cwnd) + return (cwnd - in_flight); + + return 0; +} + +/* Initialize TSO state of a skb. + * This must be invoked the first time we consider transmitting + * SKB onto the wire. 
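The congestion-window quota computed by tcp_cwnd_test() further down is simply the headroom between the congestion window and what is already in flight. A tiny sketch with illustrative numbers (not part of the patch):

#include <stdio.h>

int main(void)
{
        unsigned int snd_cwnd = 10;   /* congestion window, in segments       */
        unsigned int in_flight = 7;   /* sent but not yet ACKed/SACKed/lost   */

        unsigned int quota = in_flight < snd_cwnd ? snd_cwnd - in_flight : 0;

        printf("may put %u more segment(s) on the wire\n", quota);
        return 0;
}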
+ */ +static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, + unsigned int mss_now) +{ + int tso_segs = tcp_skb_pcount(skb); + + if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) { + tcp_set_skb_tso_segs(sk, skb, mss_now); + tso_segs = tcp_skb_pcount(skb); + } + return tso_segs; +} + +/* Minshall's variant of the Nagle send check. */ +static inline int tcp_minshall_check(const struct tcp_sock *tp) +{ + return after(tp->snd_sml, tp->snd_una) && + !after(tp->snd_sml, tp->snd_nxt); +} + +/* Return 0, if packet can be sent now without violation Nagle's rules: + * 1. It is full sized. + * 2. Or it contains FIN. (already checked by caller) + * 3. Or TCP_NODELAY was set. + * 4. Or TCP_CORK is not set, and all sent packets are ACKed. + * With Minshall's modification: all sent small packets are ACKed. + */ +static inline int tcp_nagle_check(const struct tcp_sock *tp, + const struct sk_buff *skb, + unsigned mss_now, int nonagle) +{ + return skb->len < mss_now && + ((nonagle & TCP_NAGLE_CORK) || + (!nonagle && tp->packets_out && tcp_minshall_check(tp))); +} + +/* Return non-zero if the Nagle test allows this packet to be + * sent now. + */ +static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, + unsigned int cur_mss, int nonagle) +{ + /* Nagle rule does not apply to frames, which sit in the middle of the + * write_queue (they have no chances to get new data). + * + * This is implemented in the callers, where they modify the 'nonagle' + * argument based upon the location of SKB in the send queue. + */ + if (nonagle & TCP_NAGLE_PUSH) + return 1; + + /* Don't use the nagle rule for urgent data (or for the final FIN). + * Nagle can be ignored during F-RTO too (see RFC4138). + */ + if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || + (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)) + return 1; + + if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) + return 1; + + return 0; +} + +/* Does at least the first segment of SKB fit into the send window? */ +static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, + unsigned int cur_mss) +{ + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + if (skb->len > cur_mss) + end_seq = TCP_SKB_CB(skb)->seq + cur_mss; + + return !after(end_seq, tcp_wnd_end(tp)); +} + +/* This checks if the data bearing packet SKB (usually tcp_send_head(sk)) + * should be put on the wire right now. If so, it returns the number of + * packets allowed by the congestion window. + */ +static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, + unsigned int cur_mss, int nonagle) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int cwnd_quota; + + tcp_init_tso_segs(sk, skb, cur_mss); + + if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) + return 0; + + cwnd_quota = tcp_cwnd_test(tp, skb); + if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss)) + cwnd_quota = 0; + + return cwnd_quota; +} + +/* Test if sending is allowed right now. */ +int tcp_may_send_now(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb = tcp_send_head(sk); + + return skb && + tcp_snd_test(sk, skb, tcp_current_mss(sk), + (tcp_skb_is_last(sk, skb) ? + tp->nonagle : TCP_NAGLE_PUSH)); +} + +/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet + * which is put after SKB on the list. It is very much like + * tcp_fragment() except that it may make several kinds of assumptions + * in order to speed up the splitting operation. 
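The Minshall variant of the Nagle test above can be reproduced as a standalone predicate: a new sub-MSS segment is held back while an earlier small segment is still unacknowledged. The sketch below is simplified and its values are illustrative; it is not part of the patch.

#include <stdint.h>
#include <stdio.h>

static int seq_after(uint32_t a, uint32_t b)
{
        return (int32_t)(b - a) < 0;   /* a follows b modulo 2^32 */
}

/* snd_sml is the end sequence of the last small segment we sent. */
static int minshall_blocks(uint32_t snd_sml, uint32_t snd_una, uint32_t snd_nxt)
{
        return seq_after(snd_sml, snd_una) && !seq_after(snd_sml, snd_nxt);
}

int main(void)
{
        /* A 100-byte runt ending at 2100 is still unacked (snd_una = 1500),
         * so another small write must wait for that ACK first.
         */
        printf("defer new small segment: %d\n",
               minshall_blocks(2100, 1500, 2100));
        return 0;
}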
In particular, we + * know that all the data is in scatter-gather pages, and that the + * packet has never been sent out before (and thus is not cloned). + */ +static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, + unsigned int mss_now, gfp_t gfp) +{ + struct sk_buff *buff; + int nlen = skb->len - len; + u8 flags; + + /* All of a TSO frame must be composed of paged data. */ + if (skb->len != skb->data_len) + return tcp_fragment(sk, skb, len, mss_now); + + buff = sk_stream_alloc_skb(sk, 0, gfp); + if (unlikely(buff == NULL)) + return -ENOMEM; + + sk->sk_wmem_queued += buff->truesize; + sk_mem_charge(sk, buff->truesize); + buff->truesize += nlen; + skb->truesize -= nlen; + + /* Correct the sequence numbers. */ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; + + /* PSH and FIN should only be set in the second packet. */ + flags = TCP_SKB_CB(skb)->flags; + TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); + TCP_SKB_CB(buff)->flags = flags; + + /* This packet was never sent out yet, so no SACK bits. */ + TCP_SKB_CB(buff)->sacked = 0; + + buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL; + skb_split(skb, buff, len); + + /* Fix up tso_factor for both original and new SKB. */ + tcp_set_skb_tso_segs(sk, skb, mss_now); + tcp_set_skb_tso_segs(sk, buff, mss_now); + + /* Link BUFF into the send queue. */ + skb_header_release(buff); + tcp_insert_write_queue_after(skb, buff, sk); + + return 0; +} + +/* Try to defer sending, if possible, in order to minimize the amount + * of TSO splitting we do. View it as a kind of TSO Nagle test. + * + * This algorithm is from John Heffner. + */ +static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + u32 send_win, cong_win, limit, in_flight; + int win_divisor; + + if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) + goto send_now; + + if (icsk->icsk_ca_state != TCP_CA_Open) + goto send_now; + + /* Defer for less than two clock ticks. */ + if (tp->tso_deferred && + (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1) + goto send_now; + + in_flight = tcp_packets_in_flight(tp); + + BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight)); + + send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; + + /* From in_flight test above, we know that cwnd > in_flight. */ + cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache; + + limit = min(send_win, cong_win); + + /* If a full-sized TSO skb can be sent, do it. */ + if (limit >= sk->sk_gso_max_size) + goto send_now; + + /* Middle in queue won't get any more data, full sendable already? */ + if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) + goto send_now; + + win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor); + if (win_divisor) { + u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); + + /* If at least some fraction of a window is available, + * just use it. + */ + chunk /= win_divisor; + if (limit >= chunk) + goto send_now; + } else { + /* Different approach, try not to defer past a single + * ACK. Receiver should ACK every other full sized + * frame, so if we have space for more than 3 frames + * then send now. + */ + if (limit > tcp_max_burst(tp) * tp->mss_cache) + goto send_now; + } + + /* Ok, it looks like it is advisable to defer. 
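The core of the TSO deferral decision in tcp_tso_should_defer() above is the comparison of the sendable limit against a full-sized TSO frame and against the skb itself. The sketch below is a simplified version (it omits the tcp_tso_win_divisor branch); all numbers are illustrative and it is not part of the patch.

#include <stdio.h>

int main(void)
{
        unsigned int mss = 1448;
        unsigned int send_win = 64 * 1024;         /* receiver window left      */
        unsigned int cong_win = (10 - 6) * mss;    /* (cwnd - in_flight) * mss  */
        unsigned int gso_max = 65536;              /* full-sized TSO frame      */
        unsigned int skb_len = 16 * 1024;

        unsigned int limit = send_win < cong_win ? send_win : cong_win;

        if (limit >= gso_max || limit >= skb_len)
                printf("send now (limit %u)\n", limit);
        else
                printf("defer: limit %u is smaller than the %u-byte skb\n",
                       limit, skb_len);
        return 0;
}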
*/ + tp->tso_deferred = 1 | (jiffies << 1); + + return 1; + +send_now: + tp->tso_deferred = 0; + return 0; +} + +/* Create a new MTU probe if we are ready. + * MTU probe is regularly attempting to increase the path MTU by + * deliberately sending larger packets. This discovers routing + * changes resulting in larger path MTUs. + * + * Returns 0 if we should wait to probe (no cwnd available), + * 1 if a probe was sent, + * -1 otherwise + */ +static int tcp_mtu_probe(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *skb, *nskb, *next; + int len; + int probe_size; + int size_needed; + int copy; + int mss_now; + + /* Not currently probing/verifying, + * not in recovery, + * have enough cwnd, and + * not SACKing (the variable headers throw things off) */ + if (!icsk->icsk_mtup.enabled || + icsk->icsk_mtup.probe_size || + inet_csk(sk)->icsk_ca_state != TCP_CA_Open || + tp->snd_cwnd < 11 || + tp->rx_opt.num_sacks || tp->rx_opt.dsack) + return -1; + + /* Very simple search strategy: just double the MSS. */ + mss_now = tcp_current_mss(sk); + probe_size = 2 * tp->mss_cache; + size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; + if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) { + /* TODO: set timer for probe_converge_event */ + return -1; + } + + /* Have enough data in the send queue to probe? */ + if (tp->write_seq - tp->snd_nxt < size_needed) + return -1; + + if (tp->snd_wnd < size_needed) + return -1; + if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp))) + return 0; + + /* Do we need to wait to drain cwnd? With none in flight, don't stall */ + if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) { + if (!tcp_packets_in_flight(tp)) + return -1; + else + return 0; + } + + /* We're allowed to probe. Build it now. */ + if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL) + return -1; + sk->sk_wmem_queued += nskb->truesize; + sk_mem_charge(sk, nskb->truesize); + + skb = tcp_send_head(sk); + + TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; + TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; + TCP_SKB_CB(nskb)->flags = TCPHDR_ACK; + TCP_SKB_CB(nskb)->sacked = 0; + nskb->csum = 0; + nskb->ip_summed = skb->ip_summed; + + tcp_insert_write_queue_before(nskb, skb, sk); + + len = 0; + tcp_for_write_queue_from_safe(skb, next, sk) { + copy = min_t(int, skb->len, probe_size - len); + if (nskb->ip_summed) + skb_copy_bits(skb, 0, skb_put(nskb, copy), copy); + else + nskb->csum = skb_copy_and_csum_bits(skb, 0, + skb_put(nskb, copy), + copy, nskb->csum); + + if (skb->len <= copy) { + /* We've eaten all the data from this skb. + * Throw it away. */ + TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags; + tcp_unlink_write_queue(skb, sk); + sk_wmem_free_skb(sk, skb); + } else { + TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags & + ~(TCPHDR_FIN|TCPHDR_PSH); + if (!skb_shinfo(skb)->nr_frags) { + skb_pull(skb, copy); + if (skb->ip_summed != CHECKSUM_PARTIAL) + skb->csum = csum_partial(skb->data, + skb->len, 0); + } else { + __pskb_trim_head(skb, copy); + tcp_set_skb_tso_segs(sk, skb, mss_now); + } + TCP_SKB_CB(skb)->seq += copy; + } + + len += copy; + + if (len >= probe_size) + break; + } + tcp_init_tso_segs(sk, nskb, nskb->len); + + /* We're ready to send. If this fails, the probe will + * be resegmented into mss-sized pieces by tcp_write_xmit(). 
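The probe sizing used by tcp_mtu_probe() above is worth seeing in isolation: the probe carries two current-MSS worth of data, and the send queue must hold enough extra beyond it to keep the pipe busy if the probe is lost. Values below are illustrative; this is not part of the patch.

#include <stdio.h>

int main(void)
{
        int mss_cache = 1448;
        int reordering = 3;      /* tp->reordering                         */

        int probe_size = 2 * mss_cache;
        int size_needed = probe_size + (reordering + 1) * mss_cache;

        printf("probe of %d bytes; need %d bytes queued and within the window\n",
               probe_size, size_needed);
        return 0;
}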
*/ + TCP_SKB_CB(nskb)->when = tcp_time_stamp; + if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { + /* Decrement cwnd here because we are sending + * effectively two packets. */ + tp->snd_cwnd--; + tcp_event_new_data_sent(sk, nskb); + + icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len); + tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq; + tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq; + + return 1; + } + + return -1; +} + +/* This routine writes packets to the network. It advances the + * send_head. This happens as incoming acks open up the remote + * window for us. + * + * LARGESEND note: !tcp_urg_mode is overkill, only frames between + * snd_up-64k-mss .. snd_up cannot be large. However, taking into + * account rare use of URG, this is not a big flaw. + * + * Returns 1, if no segments are in flight and we have queued segments, but + * cannot send anything now because of SWS or another problem. + */ +static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + int push_one, gfp_t gfp) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + unsigned int tso_segs, sent_pkts; + int cwnd_quota; + int result; + + sent_pkts = 0; + + if (!push_one) { + /* Do MTU probing. */ + result = tcp_mtu_probe(sk); + if (!result) { + return 0; + } else if (result > 0) { + sent_pkts = 1; + } + } + + while ((skb = tcp_send_head(sk))) { + unsigned int limit; + + tso_segs = tcp_init_tso_segs(sk, skb, mss_now); + BUG_ON(!tso_segs); + + cwnd_quota = tcp_cwnd_test(tp, skb); + if (!cwnd_quota) + break; + + if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) + break; + + if (tso_segs == 1) { + if (unlikely(!tcp_nagle_test(tp, skb, mss_now, + (tcp_skb_is_last(sk, skb) ? + nonagle : TCP_NAGLE_PUSH)))) + break; + } else { + if (!push_one && tcp_tso_should_defer(sk, skb)) + break; + } + + limit = mss_now; + if (tso_segs > 1 && !tcp_urg_mode(tp)) + limit = tcp_mss_split_point(sk, skb, mss_now, + cwnd_quota); + + if (skb->len > limit && + unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) + break; + + TCP_SKB_CB(skb)->when = tcp_time_stamp; + + if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) + break; + + /* Advance the send_head. This one is sent out. + * This call will increment packets_out. + */ + tcp_event_new_data_sent(sk, skb); + + tcp_minshall_update(tp, mss_now, skb); + sent_pkts++; + + if (push_one) + break; + } + + if (likely(sent_pkts)) { + tcp_cwnd_validate(sk); + return 0; + } + return !tp->packets_out && tcp_send_head(sk); +} + +/* Push out any pending frames which were held back due to + * TCP_CORK or attempt at coalescing tiny packets. + * The socket must be locked by the caller. + */ +void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, + int nonagle) +{ + /* If we are closed, the bytes will have to remain here. + * In time closedown will finish, we empty the write queue and + * all will be happy. + */ + if (unlikely(sk->sk_state == TCP_CLOSE)) + return; + + if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC)) + tcp_check_probe_timer(sk); +} + +/* Send _single_ skb sitting at the send head. This function requires + * true push pending frames to setup probe timer etc. + */ +void tcp_push_one(struct sock *sk, unsigned int mss_now) +{ + struct sk_buff *skb = tcp_send_head(sk); + + BUG_ON(!skb || skb->len < mss_now); + + tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation); +} + +/* This function returns the amount that we can raise the + * usable window based on the following constraints + * + * 1. 
The window can never be shrunk once it is offered (RFC 793) + * 2. We limit memory per socket + * + * RFC 1122: + * "the suggested [SWS] avoidance algorithm for the receiver is to keep + * RECV.NEXT + RCV.WIN fixed until: + * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)" + * + * i.e. don't raise the right edge of the window until you can raise + * it at least MSS bytes. + * + * Unfortunately, the recommended algorithm breaks header prediction, + * since header prediction assumes th->window stays fixed. + * + * Strictly speaking, keeping th->window fixed violates the receiver + * side SWS prevention criteria. The problem is that under this rule + * a stream of single byte packets will cause the right side of the + * window to always advance by a single byte. + * + * Of course, if the sender implements sender side SWS prevention + * then this will not be a problem. + * + * BSD seems to make the following compromise: + * + * If the free space is less than the 1/4 of the maximum + * space available and the free space is less than 1/2 mss, + * then set the window to 0. + * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ] + * Otherwise, just prevent the window from shrinking + * and from being larger than the largest representable value. + * + * This prevents incremental opening of the window in the regime + * where TCP is limited by the speed of the reader side taking + * data out of the TCP receive queue. It does nothing about + * those cases where the window is constrained on the sender side + * because the pipeline is full. + * + * BSD also seems to "accidentally" limit itself to windows that are a + * multiple of MSS, at least until the free space gets quite small. + * This would appear to be a side effect of the mbuf implementation. + * Combining these two algorithms results in the observed behavior + * of having a fixed window size at almost all times. + * + * Below we obtain similar behavior by forcing the offered window to + * a multiple of the mss when it is feasible to do so. + * + * Note, we don't "adjust" for TIMESTAMP or SACK option bytes. + * Regular options like TIMESTAMP are taken into account. + */ +u32 __tcp_select_window(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + /* MSS for the peer's data. Previous versions used mss_clamp + * here. I don't know if the value based on our guesses + * of peer's MSS is better for the performance. It's more correct + * but may be worse for the performance because of rcv_mss + * fluctuations. --SAW 1998/11/1 + */ + int mss = icsk->icsk_ack.rcv_mss; + int free_space = tcp_space(sk); + int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); + int window; + + if (mss > full_space) + mss = full_space; + + if (free_space < (full_space >> 1)) { + icsk->icsk_ack.quick = 0; + + if (tcp_memory_pressure) + tp->rcv_ssthresh = min(tp->rcv_ssthresh, + 4U * tp->advmss); + + if (free_space < mss) + return 0; + } + + if (free_space > tp->rcv_ssthresh) + free_space = tp->rcv_ssthresh; + + /* Don't do rounding if we are using window scaling, since the + * scaled window will not line up with the MSS boundary anyway. + */ + window = tp->rcv_wnd; + if (tp->rx_opt.rcv_wscale) { + window = free_space; + + /* Advertise enough space so that it won't get scaled away. + * Import case: prevent zero window announcement if + * 1< mss. 
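+ * (That is, when 1 << rcv_wscale > mss a straight round-down could hide a + * small but non-zero window: with rcv_wscale = 7, for example, a free space + * of 100 bytes would be advertised as 100 >> 7 = 0, whereas rounding up to + * 128 below advertises a window of 1.)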
+ */ + if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window) + window = (((window >> tp->rx_opt.rcv_wscale) + 1) + << tp->rx_opt.rcv_wscale); + } else { + /* Get the largest window that is a nice multiple of mss. + * Window clamp already applied above. + * If our current window offering is within 1 mss of the + * free space we just keep it. This prevents the divide + * and multiply from happening most of the time. + * We also don't do any window rounding when the free space + * is too small. + */ + if (window <= free_space - mss || window > free_space) + window = (free_space / mss) * mss; + else if (mss == full_space && + free_space > window + (full_space >> 1)) + window = free_space; + } + + return window; +} + +/* Collapses two adjacent SKB's during retransmission. */ +static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); + int skb_size, next_skb_size; + + skb_size = skb->len; + next_skb_size = next_skb->len; + + BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); + + tcp_highest_sack_combine(sk, next_skb, skb); + + tcp_unlink_write_queue(next_skb, sk); + + skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size), + next_skb_size); + + if (next_skb->ip_summed == CHECKSUM_PARTIAL) + skb->ip_summed = CHECKSUM_PARTIAL; + + if (skb->ip_summed != CHECKSUM_PARTIAL) + skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size); + + /* Update sequence range on original skb. */ + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; + + /* Merge over control information. This moves PSH/FIN etc. over */ + TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(next_skb)->flags; + + /* All done, get rid of second SKB and account for it so + * packet counting does not break. + */ + TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS; + + /* changed transmit queue under us so clear hints */ + tcp_clear_retrans_hints_partial(tp); + if (next_skb == tp->retransmit_skb_hint) + tp->retransmit_skb_hint = skb; + + tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb)); + + sk_wmem_free_skb(sk, next_skb); +} + +/* Check if coalescing SKBs is legal. */ +static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb) +{ + if (tcp_skb_pcount(skb) > 1) + return 0; + /* TODO: SACK collapsing could be used to remove this condition */ + if (skb_shinfo(skb)->nr_frags != 0) + return 0; + if (skb_cloned(skb)) + return 0; + if (skb == tcp_send_head(sk)) + return 0; + /* Some heurestics for collapsing over SACK'd could be invented */ + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) + return 0; + + return 1; +} + +/* Collapse packets in the retransmit queue to make to create + * less packets on the wire. This is only done on retransmission. 
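+ * For example (illustrative sizes): two linear 400-byte segments left over + * from an earlier split can be merged back into one 800-byte segment before + * retransmission, provided the data fits in skb_tailroom() of the first skb + * and the combined segment stays inside the receive window.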
+ */ +static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, + int space) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb = to, *tmp; + int first = 1; + + if (!sysctl_tcp_retrans_collapse) + return; + if (TCP_SKB_CB(skb)->flags & TCPHDR_SYN) + return; + + tcp_for_write_queue_from_safe(skb, tmp, sk) { + if (!tcp_can_collapse(sk, skb)) + break; + + space -= skb->len; + + if (first) { + first = 0; + continue; + } + + if (space < 0) + break; + /* Punt if not enough space exists in the first SKB for + * the data in the second + */ + if (skb->len > skb_tailroom(to)) + break; + + if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) + break; + + tcp_collapse_retrans(sk, to); + } +} + +/* This retransmits one SKB. Policy decisions and retransmit queue + * state updates are done by the caller. Returns non-zero if an + * error occurred which prevented the send. + */ +int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + unsigned int cur_mss; + int err; + + /* Inconslusive MTU probe */ + if (icsk->icsk_mtup.probe_size) { + icsk->icsk_mtup.probe_size = 0; + } + + /* Do not sent more than we queued. 1/4 is reserved for possible + * copying overhead: fragmentation, tunneling, mangling etc. + */ + if (atomic_read(&sk->sk_wmem_alloc) > + min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) + return -EAGAIN; + + if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { + if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) + BUG(); + if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) + return -ENOMEM; + } + + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) + return -EHOSTUNREACH; /* Routing failure or similar. */ + + cur_mss = tcp_current_mss(sk); + + /* If receiver has shrunk his window, and skb is out of + * new window, do not retransmit it. The exception is the + * case, when window is shrunk to zero. In this case + * our retransmit serves as a zero window probe. + */ + if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) && + TCP_SKB_CB(skb)->seq != tp->snd_una) + return -EAGAIN; + + if (skb->len > cur_mss) { + if (tcp_fragment(sk, skb, cur_mss, cur_mss)) + return -ENOMEM; /* We'll try again later. */ + } else { + int oldpcount = tcp_skb_pcount(skb); + + if (unlikely(oldpcount > 1)) { + tcp_init_tso_segs(sk, skb, cur_mss); + tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb)); + } + } + + tcp_retrans_try_collapse(sk, skb, cur_mss); + + /* Some Solaris stacks overoptimize and ignore the FIN on a + * retransmit when old data is attached. So strip it off + * since it is cheap to do so and saves bytes on the network. + */ + if (skb->len > 0 && + (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && + tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { + if (!pskb_trim(skb, 0)) { + /* Reuse, even though it does some unnecessary work */ + tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1, + TCP_SKB_CB(skb)->flags); + skb->ip_summed = CHECKSUM_NONE; + } + } + + /* Make a copy, if the first transmission SKB clone we made + * is still in somebody's hands, else make a clone. + */ + TCP_SKB_CB(skb)->when = tcp_time_stamp; + + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); + + if (err == 0) { + /* Update global TCP statistics. 
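+ * (TCP_MIB_RETRANSSEGS is the SNMP-level count of retransmitted segments; + * tp->total_retrans below is the per-connection total that getsockopt() + * exposes as tcpi_total_retrans via TCP_INFO.)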
*/ + TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); + + tp->total_retrans++; + +#if FASTRETRANS_DEBUG > 0 + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { + if (net_ratelimit()) + printk(KERN_DEBUG "retrans_out leaked.\n"); + } +#endif + if (!tp->retrans_out) + tp->lost_retrans_low = tp->snd_nxt; + TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; + tp->retrans_out += tcp_skb_pcount(skb); + + /* Save stamp of the first retransmit. */ + if (!tp->retrans_stamp) + tp->retrans_stamp = TCP_SKB_CB(skb)->when; + + tp->undo_retrans += tcp_skb_pcount(skb); + + /* snd_nxt is stored to detect loss of retransmitted segment, + * see tcp_input.c tcp_sacktag_write_queue(). + */ + TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; + } + return err; +} + +/* Check if we forward retransmits are possible in the current + * window/congestion state. + */ +static int tcp_can_forward_retransmit(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + + /* Forward retransmissions are possible only during Recovery. */ + if (icsk->icsk_ca_state != TCP_CA_Recovery) + return 0; + + /* No forward retransmissions in Reno are possible. */ + if (tcp_is_reno(tp)) + return 0; + + /* Yeah, we have to make difficult choice between forward transmission + * and retransmission... Both ways have their merits... + * + * For now we do not retransmit anything, while we have some new + * segments to send. In the other cases, follow rule 3 for + * NextSeg() specified in RFC3517. + */ + + if (tcp_may_send_now(sk)) + return 0; + + return 1; +} + +/* This gets called after a retransmit timeout, and the initially + * retransmitted data is acknowledged. It tries to continue + * resending the rest of the retransmit queue, until either + * we've sent it all or the congestion window limit is reached. + * If doing SACK, the first ACK which comes back for a timeout + * based retransmit packet might feed us FACK information again. + * If so, we use it to avoid unnecessarily retransmissions. + */ +void tcp_xmit_retransmit_queue(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + struct sk_buff *hole = NULL; + u32 last_lost; + int mib_idx; + int fwd_rexmitting = 0; + + if (!tp->packets_out) + return; + + if (!tp->lost_out) + tp->retransmit_high = tp->snd_una; + + if (tp->retransmit_skb_hint) { + skb = tp->retransmit_skb_hint; + last_lost = TCP_SKB_CB(skb)->end_seq; + if (after(last_lost, tp->retransmit_high)) + last_lost = tp->retransmit_high; + } else { + skb = tcp_write_queue_head(sk); + last_lost = tp->snd_una; + } + + tcp_for_write_queue_from(skb, sk) { + __u8 sacked = TCP_SKB_CB(skb)->sacked; + + if (skb == tcp_send_head(sk)) + break; + /* we could do better than to assign each time */ + if (hole == NULL) + tp->retransmit_skb_hint = skb; + + /* Assume this retransmit will generate + * only one packet for congestion window + * calculation purposes. This works because + * tcp_retransmit_skb() will chop up the + * packet to be MSS sized and all the + * packet counting works out. 
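+ * For instance, with snd_cwnd = 10 and 9 packets already in flight, the + * check below lets exactly one more retransmission out before returning.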
+ */ + if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) + return; + + if (fwd_rexmitting) { +begin_fwd: + if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) + break; + mib_idx = LINUX_MIB_TCPFORWARDRETRANS; + + } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) { + tp->retransmit_high = last_lost; + if (!tcp_can_forward_retransmit(sk)) + break; + /* Backtrack if necessary to non-L'ed skb */ + if (hole != NULL) { + skb = hole; + hole = NULL; + } + fwd_rexmitting = 1; + goto begin_fwd; + + } else if (!(sacked & TCPCB_LOST)) { + if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) + hole = skb; + continue; + + } else { + last_lost = TCP_SKB_CB(skb)->end_seq; + if (icsk->icsk_ca_state != TCP_CA_Loss) + mib_idx = LINUX_MIB_TCPFASTRETRANS; + else + mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; + } + + if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) + continue; + + if (tcp_retransmit_skb(sk, skb)) + return; + NET_INC_STATS_BH(sock_net(sk), mib_idx); + + if (skb == tcp_write_queue_head(sk)) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, + TCP_RTO_MAX); + } +} + +/* Send a fin. The caller locks the socket for us. This cannot be + * allowed to fail queueing a FIN frame under any circumstances. + */ +void tcp_send_fin(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb = tcp_write_queue_tail(sk); + int mss_now; + + /* Optimization, tack on the FIN if we have a queue of + * unsent frames. But be careful about outgoing SACKS + * and IP options. + */ + mss_now = tcp_current_mss(sk); + + if (tcp_send_head(sk) != NULL) { + TCP_SKB_CB(skb)->flags |= TCPHDR_FIN; + TCP_SKB_CB(skb)->end_seq++; + tp->write_seq++; + } else { + /* Socket is locked, keep trying until memory is available. */ + for (;;) { + skb = alloc_skb_fclone(MAX_TCP_HEADER, + sk->sk_allocation); + if (skb) + break; + yield(); + } + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, MAX_TCP_HEADER); + /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ + tcp_init_nondata_skb(skb, tp->write_seq, + TCPHDR_ACK | TCPHDR_FIN); + tcp_queue_skb(sk, skb); + } + __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); +} + +/* We get here when a process closes a file descriptor (either due to + * an explicit close() or as a byproduct of exit()'ing) and there + * was unread data in the receive queue. This behavior is recommended + * by RFC 2525, section 2.17. -DaveM + */ +void tcp_send_active_reset(struct sock *sk, gfp_t priority) +{ + struct sk_buff *skb; + + /* NOTE: No TCP options attached and we never retransmit this. */ + skb = alloc_skb(MAX_TCP_HEADER, priority); + if (!skb) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); + return; + } + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, MAX_TCP_HEADER); + tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), + TCPHDR_ACK | TCPHDR_RST); + /* Send it off. */ + TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (tcp_transmit_skb(sk, skb, 0, priority)) + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); + + TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS); +} + +/* Send a crossed SYN-ACK during socket establishment. + * WARNING: This routine must only be called when we have already sent + * a SYN packet that crossed the incoming SYN that caused this routine + * to get called. If this assumption fails then the initial rcv_wnd + * and rcv_wscale values will not be correct. 
+ */ +int tcp_send_synack(struct sock *sk) +{ + struct sk_buff *skb; + + skb = tcp_write_queue_head(sk); + if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPHDR_SYN)) { + printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n"); + return -EFAULT; + } + if (!(TCP_SKB_CB(skb)->flags & TCPHDR_ACK)) { + if (skb_cloned(skb)) { + struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); + if (nskb == NULL) + return -ENOMEM; + tcp_unlink_write_queue(skb, sk); + skb_header_release(nskb); + __tcp_add_write_queue_head(sk, nskb); + sk_wmem_free_skb(sk, skb); + sk->sk_wmem_queued += nskb->truesize; + sk_mem_charge(sk, nskb->truesize); + skb = nskb; + } + + TCP_SKB_CB(skb)->flags |= TCPHDR_ACK; + TCP_ECN_send_synack(tcp_sk(sk), skb); + } + TCP_SKB_CB(skb)->when = tcp_time_stamp; + return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); +} + +/* Prepare a SYN-ACK. */ +struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, + struct request_sock *req, + struct request_values *rvp) +{ + struct tcp_out_options opts; + struct tcp_extend_values *xvp = tcp_xv(rvp); + struct inet_request_sock *ireq = inet_rsk(req); + struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_cookie_values *cvp = tp->cookie_values; + struct tcphdr *th; + struct sk_buff *skb; + struct tcp_md5sig_key *md5; + int tcp_header_size; + int mss; + int s_data_desired = 0; + + if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) + s_data_desired = cvp->s_data_desired; + skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC); + if (skb == NULL) + return NULL; + + /* Reserve space for headers. */ + skb_reserve(skb, MAX_TCP_HEADER); + + skb_dst_set(skb, dst_clone(dst)); + + mss = dst_metric_advmss(dst); + if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) + mss = tp->rx_opt.user_mss; + + if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ + __u8 rcv_wscale; + /* Set this up on the first call only */ + req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); + + /* limit the window selection if the user enforce a smaller rx buffer */ + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) + req->window_clamp = tcp_full_space(sk); + + /* tcp_full_space because it is guaranteed to be the first packet */ + tcp_select_initial_window(tcp_full_space(sk), + mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), + &req->rcv_wnd, + &req->window_clamp, + ireq->wscale_ok, + &rcv_wscale, + dst_metric(dst, RTAX_INITRWND)); + ireq->rcv_wscale = rcv_wscale; + } + + memset(&opts, 0, sizeof(opts)); +#ifdef CONFIG_SYN_COOKIES + if (unlikely(req->cookie_ts)) + TCP_SKB_CB(skb)->when = cookie_init_timestamp(req); + else +#endif + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_header_size = tcp_synack_options(sk, req, mss, + skb, &opts, &md5, xvp) + + sizeof(*th); + + skb_push(skb, tcp_header_size); + skb_reset_transport_header(skb); + + th = tcp_hdr(skb); + memset(th, 0, sizeof(struct tcphdr)); + th->syn = 1; + th->ack = 1; + TCP_ECN_make_synack(req, th); + th->source = ireq->loc_port; + th->dest = ireq->rmt_port; + /* Setting of flags are superfluous here for callers (and ECE is + * not even correctly set) + */ + tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, + TCPHDR_SYN | TCPHDR_ACK); + + if (OPTION_COOKIE_EXTENSION & opts.options) { + if (s_data_desired) { + u8 *buf = skb_put(skb, s_data_desired); + + /* copy data directly from the listening socket. 
*/ + memcpy(buf, cvp->s_data_payload, s_data_desired); + TCP_SKB_CB(skb)->end_seq += s_data_desired; + } + + if (opts.hash_size > 0) { + __u32 workspace[SHA_WORKSPACE_WORDS]; + u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS]; + u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1]; + + /* Secret recipe depends on the Timestamp, (future) + * Sequence and Acknowledgment Numbers, Initiator + * Cookie, and others handled by IP variant caller. + */ + *tail-- ^= opts.tsval; + *tail-- ^= tcp_rsk(req)->rcv_isn + 1; + *tail-- ^= TCP_SKB_CB(skb)->seq + 1; + + /* recommended */ + *tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source); + *tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */ + + sha_transform((__u32 *)&xvp->cookie_bakery[0], + (char *)mess, + &workspace[0]); + opts.hash_location = + (__u8 *)&xvp->cookie_bakery[0]; + } + } + + th->seq = htonl(TCP_SKB_CB(skb)->seq); + th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); + + /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ + th->window = htons(min(req->rcv_wnd, 65535U)); + tcp_options_write((__be32 *)(th + 1), tp, &opts); + th->doff = (tcp_header_size >> 2); + TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); + +#ifdef CONFIG_TCP_MD5SIG + /* Okay, we have all we need - do the md5 hash if needed */ + if (md5) { + tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location, + md5, NULL, req, skb); + } +#endif + + return skb; +} +EXPORT_SYMBOL(tcp_make_synack); + +/* Do all connect socket setups that can be done AF independent. */ +static void tcp_connect_init(struct sock *sk) +{ + struct dst_entry *dst = __sk_dst_get(sk); + struct tcp_sock *tp = tcp_sk(sk); + __u8 rcv_wscale; + + /* We'll fix this up when we get a response from the other end. + * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. + */ + tp->tcp_header_len = sizeof(struct tcphdr) + + (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0); + +#ifdef CONFIG_TCP_MD5SIG + if (tp->af_specific->md5_lookup(sk, sk) != NULL) + tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; +#endif + + /* If user gave his TCP_MAXSEG, record it to clamp */ + if (tp->rx_opt.user_mss) + tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; + tp->max_window = 0; + tcp_mtup_init(sk); + tcp_sync_mss(sk, dst_mtu(dst)); + + if (!tp->window_clamp) + tp->window_clamp = dst_metric(dst, RTAX_WINDOW); + tp->advmss = dst_metric_advmss(dst); + if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) + tp->advmss = tp->rx_opt.user_mss; + + tcp_initialize_rcv_mss(sk); + + /* limit the window selection if the user enforce a smaller rx buffer */ + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) + tp->window_clamp = tcp_full_space(sk); + + tcp_select_initial_window(tcp_full_space(sk), + tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), + &tp->rcv_wnd, + &tp->window_clamp, + sysctl_tcp_window_scaling, + &rcv_wscale, + dst_metric(dst, RTAX_INITRWND)); + + tp->rx_opt.rcv_wscale = rcv_wscale; + tp->rcv_ssthresh = tp->rcv_wnd; + + sk->sk_err = 0; + sock_reset_flag(sk, SOCK_DONE); + tp->snd_wnd = 0; + tcp_init_wl(tp, 0); + tp->snd_una = tp->write_seq; + tp->snd_sml = tp->write_seq; + tp->snd_up = tp->write_seq; + tp->rcv_nxt = 0; + tp->rcv_wup = 0; + tp->copied_seq = 0; + + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; + inet_csk(sk)->icsk_retransmits = 0; + tcp_clear_retrans(tp); +} + +/* Build a SYN and send it off. 
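+ * The SYN consumes one sequence number (write_seq is advanced), a clone of + * the skb is what actually goes out so the original stays on the write queue + * for retransmission, and the retransmit timer is armed with the initial RTO.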
*/ +int tcp_connect(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; + int err; + + tcp_connect_init(sk); + + buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); + if (unlikely(buff == NULL)) + return -ENOBUFS; + + /* Reserve space for headers. */ + skb_reserve(buff, MAX_TCP_HEADER); + + tp->snd_nxt = tp->write_seq; + tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); + TCP_ECN_send_syn(sk, buff); + + /* Send it off. */ + TCP_SKB_CB(buff)->when = tcp_time_stamp; + tp->retrans_stamp = TCP_SKB_CB(buff)->when; + skb_header_release(buff); + __tcp_add_write_queue_tail(sk, buff); + sk->sk_wmem_queued += buff->truesize; + sk_mem_charge(sk, buff->truesize); + tp->packets_out += tcp_skb_pcount(buff); + err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); + if (err == -ECONNREFUSED) + return err; + + /* We change tp->snd_nxt after the tcp_transmit_skb() call + * in order to make this packet get counted in tcpOutSegs. + */ + tp->snd_nxt = tp->write_seq; + tp->pushed_seq = tp->write_seq; + TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); + + /* Timer for repeating the SYN until an answer. */ + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + return 0; +} +EXPORT_SYMBOL(tcp_connect); + +/* Send out a delayed ack, the caller does the policy checking + * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check() + * for details. + */ +void tcp_send_delayed_ack(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + int ato = icsk->icsk_ack.ato; + unsigned long timeout; + + if (ato > TCP_DELACK_MIN) { + const struct tcp_sock *tp = tcp_sk(sk); + int max_ato = HZ / 2; + + if (icsk->icsk_ack.pingpong || + (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) + max_ato = TCP_DELACK_MAX; + + /* Slow path, intersegment interval is "high". */ + + /* If some rtt estimate is known, use it to bound delayed ack. + * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements + * directly. + */ + if (tp->srtt) { + int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN); + + if (rtt < max_ato) + max_ato = rtt; + } + + ato = min(ato, max_ato); + } + + /* Stay within the limit we were given */ + timeout = jiffies + ato; + + /* Use new timeout only if there wasn't a older one earlier. */ + if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { + /* If delack timer was blocked or is about to expire, + * send ACK now. + */ + if (icsk->icsk_ack.blocked || + time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { + tcp_send_ack(sk); + return; + } + + if (!time_before(timeout, icsk->icsk_ack.timeout)) + timeout = icsk->icsk_ack.timeout; + } + icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; + icsk->icsk_ack.timeout = timeout; + sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); +} + +/* This routine sends an ack and also updates the window. */ +void tcp_send_ack(struct sock *sk) +{ + struct sk_buff *buff; + + /* If we have been reset, we may not send again. */ + if (sk->sk_state == TCP_CLOSE) + return; + + /* We are not putting this on the write queue, so + * tcp_transmit_skb() will set the ownership to this + * sock. + */ + buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); + if (buff == NULL) { + inet_csk_schedule_ack(sk); + inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MAX, TCP_RTO_MAX); + return; + } + + /* Reserve space for headers and prepare control bits. 
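+ * (MAX_TCP_HEADER is sized for the TCP header plus a full set of options + * plus the lower-layer headers, so the later header pushes never need to + * reallocate the skb.)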
*/ + skb_reserve(buff, MAX_TCP_HEADER); + tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); + + /* Send it off, this clears delayed acks for us. */ + TCP_SKB_CB(buff)->when = tcp_time_stamp; + tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); +} + +/* This routine sends a packet with an out of date sequence + * number. It assumes the other end will try to ack it. + * + * Question: what should we make while urgent mode? + * 4.4BSD forces sending single byte of data. We cannot send + * out of window data, because we have SND.NXT==SND.MAX... + * + * Current solution: to send TWO zero-length segments in urgent mode: + * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is + * out-of-date with SND.UNA-1 to probe window. + */ +static int tcp_xmit_probe_skb(struct sock *sk, int urgent) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + /* We don't queue it, tcp_transmit_skb() sets ownership. */ + skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); + if (skb == NULL) + return -1; + + /* Reserve space for headers and set control bits. */ + skb_reserve(skb, MAX_TCP_HEADER); + /* Use a previous sequence. This should cause the other + * end to send an ack. Don't queue or clone SKB, just + * send it. + */ + tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); + TCP_SKB_CB(skb)->when = tcp_time_stamp; + return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); +} + +/* Initiate keepalive or window probe from timer. */ +int tcp_write_wakeup(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + if (sk->sk_state == TCP_CLOSE) + return -1; + + if ((skb = tcp_send_head(sk)) != NULL && + before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { + int err; + unsigned int mss = tcp_current_mss(sk); + unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; + + if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) + tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; + + /* We are probing the opening of a window + * but the window size is != 0 + * must have been a result SWS avoidance ( sender ) + */ + if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || + skb->len > mss) { + seg_size = min(seg_size, mss); + TCP_SKB_CB(skb)->flags |= TCPHDR_PSH; + if (tcp_fragment(sk, skb, seg_size, mss)) + return -1; + } else if (!tcp_skb_pcount(skb)) + tcp_set_skb_tso_segs(sk, skb, mss); + + TCP_SKB_CB(skb)->flags |= TCPHDR_PSH; + TCP_SKB_CB(skb)->when = tcp_time_stamp; + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); + if (!err) + tcp_event_new_data_sent(sk, skb); + return err; + } else { + if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) + tcp_xmit_probe_skb(sk, 1); + return tcp_xmit_probe_skb(sk, 0); + } +} + +/* A window probe timeout has occurred. If window is not closed send + * a partial packet else a zero probe. + */ +void tcp_send_probe0(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + int err; + + err = tcp_write_wakeup(sk); + + if (tp->packets_out || !tcp_send_head(sk)) { + /* Cancel probe timer, if it is not required. */ + icsk->icsk_probes_out = 0; + icsk->icsk_backoff = 0; + return; + } + + if (err <= 0) { + if (icsk->icsk_backoff < sysctl_tcp_retries2) + icsk->icsk_backoff++; + icsk->icsk_probes_out++; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), + TCP_RTO_MAX); + } else { + /* If packet was not sent due to local congestion, + * do not backoff and do not remember icsk_probes_out. 
+ * Let local senders to fight for local resources. + * + * Use accumulated backoff yet. + */ + if (!icsk->icsk_probes_out) + icsk->icsk_probes_out = 1; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + min(icsk->icsk_rto << icsk->icsk_backoff, + TCP_RESOURCE_PROBE_INTERVAL), + TCP_RTO_MAX); + } +} diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c new file mode 100644 index 00000000..85ee7eb7 --- /dev/null +++ b/net/ipv4/tcp_probe.c @@ -0,0 +1,258 @@ +/* + * tcpprobe - Observe the TCP flow with kprobes. + * + * The idea for this came from Werner Almesberger's umlsim + * Copyright (C) 2004, Stephen Hemminger + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +MODULE_AUTHOR("Stephen Hemminger "); +MODULE_DESCRIPTION("TCP cwnd snooper"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.1"); + +static int port __read_mostly = 0; +MODULE_PARM_DESC(port, "Port to match (0=all)"); +module_param(port, int, 0); + +static unsigned int bufsize __read_mostly = 4096; +MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)"); +module_param(bufsize, uint, 0); + +static int full __read_mostly; +MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)"); +module_param(full, int, 0); + +static const char procname[] = "tcpprobe"; + +struct tcp_log { + ktime_t tstamp; + __be32 saddr, daddr; + __be16 sport, dport; + u16 length; + u32 snd_nxt; + u32 snd_una; + u32 snd_wnd; + u32 snd_cwnd; + u32 ssthresh; + u32 srtt; +}; + +static struct { + spinlock_t lock; + wait_queue_head_t wait; + ktime_t start; + u32 lastcwnd; + + unsigned long head, tail; + struct tcp_log *log; +} tcp_probe; + + +static inline int tcp_probe_used(void) +{ + return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1); +} + +static inline int tcp_probe_avail(void) +{ + return bufsize - tcp_probe_used() - 1; +} + +/* + * Hook inserted to be called before each receive packet. + * Note: arguments must match tcp_rcv_established()! 
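+ * A jprobe handler is entered with a copy of the probed function's argument + * list before the function body runs; it should only read state and must end + * in jprobe_return(), after which the real tcp_rcv_established() executes.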
+ */ +static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, unsigned len) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct inet_sock *inet = inet_sk(sk); + + /* Only update if port matches */ + if ((port == 0 || ntohs(inet->inet_dport) == port || + ntohs(inet->inet_sport) == port) && + (full || tp->snd_cwnd != tcp_probe.lastcwnd)) { + + spin_lock(&tcp_probe.lock); + /* If log fills, just silently drop */ + if (tcp_probe_avail() > 1) { + struct tcp_log *p = tcp_probe.log + tcp_probe.head; + + p->tstamp = ktime_get(); + p->saddr = inet->inet_saddr; + p->sport = inet->inet_sport; + p->daddr = inet->inet_daddr; + p->dport = inet->inet_dport; + p->length = skb->len; + p->snd_nxt = tp->snd_nxt; + p->snd_una = tp->snd_una; + p->snd_cwnd = tp->snd_cwnd; + p->snd_wnd = tp->snd_wnd; + p->ssthresh = tcp_current_ssthresh(sk); + p->srtt = tp->srtt >> 3; + + tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1); + } + tcp_probe.lastcwnd = tp->snd_cwnd; + spin_unlock(&tcp_probe.lock); + + wake_up(&tcp_probe.wait); + } + + jprobe_return(); + return 0; +} + +static struct jprobe tcp_jprobe = { + .kp = { + .symbol_name = "tcp_rcv_established", + }, + .entry = jtcp_rcv_established, +}; + +static int tcpprobe_open(struct inode * inode, struct file * file) +{ + /* Reset (empty) log */ + spin_lock_bh(&tcp_probe.lock); + tcp_probe.head = tcp_probe.tail = 0; + tcp_probe.start = ktime_get(); + spin_unlock_bh(&tcp_probe.lock); + + return 0; +} + +static int tcpprobe_sprint(char *tbuf, int n) +{ + const struct tcp_log *p + = tcp_probe.log + tcp_probe.tail; + struct timespec tv + = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); + + return scnprintf(tbuf, n, + "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n", + (unsigned long) tv.tv_sec, + (unsigned long) tv.tv_nsec, + &p->saddr, ntohs(p->sport), + &p->daddr, ntohs(p->dport), + p->length, p->snd_nxt, p->snd_una, + p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt); +} + +static ssize_t tcpprobe_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + int error = 0; + size_t cnt = 0; + + if (!buf) + return -EINVAL; + + while (cnt < len) { + char tbuf[164]; + int width; + + /* Wait for data in buffer */ + error = wait_event_interruptible(tcp_probe.wait, + tcp_probe_used() > 0); + if (error) + break; + + spin_lock_bh(&tcp_probe.lock); + if (tcp_probe.head == tcp_probe.tail) { + /* multiple readers race? */ + spin_unlock_bh(&tcp_probe.lock); + continue; + } + + width = tcpprobe_sprint(tbuf, sizeof(tbuf)); + + if (cnt + width < len) + tcp_probe.tail = (tcp_probe.tail + 1) & (bufsize - 1); + + spin_unlock_bh(&tcp_probe.lock); + + /* if record greater than space available + return partial buffer (so far) */ + if (cnt + width >= len) + break; + + if (copy_to_user(buf + cnt, tbuf, width)) + return -EFAULT; + cnt += width; + } + + return cnt == 0 ? 
error : cnt; +} + +static const struct file_operations tcpprobe_fops = { + .owner = THIS_MODULE, + .open = tcpprobe_open, + .read = tcpprobe_read, + .llseek = noop_llseek, +}; + +static __init int tcpprobe_init(void) +{ + int ret = -ENOMEM; + + init_waitqueue_head(&tcp_probe.wait); + spin_lock_init(&tcp_probe.lock); + + if (bufsize == 0) + return -EINVAL; + + bufsize = roundup_pow_of_two(bufsize); + tcp_probe.log = kcalloc(bufsize, sizeof(struct tcp_log), GFP_KERNEL); + if (!tcp_probe.log) + goto err0; + + if (!proc_net_fops_create(&init_net, procname, S_IRUSR, &tcpprobe_fops)) + goto err0; + + ret = register_jprobe(&tcp_jprobe); + if (ret) + goto err1; + + pr_info("TCP probe registered (port=%d) bufsize=%u\n", port, bufsize); + return 0; + err1: + proc_net_remove(&init_net, procname); + err0: + kfree(tcp_probe.log); + return ret; +} +module_init(tcpprobe_init); + +static __exit void tcpprobe_exit(void) +{ + proc_net_remove(&init_net, procname); + unregister_jprobe(&tcp_jprobe); + kfree(tcp_probe.log); +} +module_exit(tcpprobe_exit); diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c new file mode 100644 index 00000000..8ce55b8a --- /dev/null +++ b/net/ipv4/tcp_scalable.c @@ -0,0 +1,62 @@ +/* Tom Kelly's Scalable TCP + * + * See http://www.deneholme.net/tom/scalable/ + * + * John Heffner + */ + +#include +#include + +/* These factors derived from the recommended values in the aer: + * .01 and and 7/8. We use 50 instead of 100 to account for + * delayed ack. + */ +#define TCP_SCALABLE_AI_CNT 50U +#define TCP_SCALABLE_MD_SCALE 3 + +static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else + tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)); +} + +static u32 tcp_scalable_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); +} + + +static struct tcp_congestion_ops tcp_scalable __read_mostly = { + .ssthresh = tcp_scalable_ssthresh, + .cong_avoid = tcp_scalable_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + + .owner = THIS_MODULE, + .name = "scalable", +}; + +static int __init tcp_scalable_register(void) +{ + return tcp_register_congestion_control(&tcp_scalable); +} + +static void __exit tcp_scalable_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_scalable); +} + +module_init(tcp_scalable_register); +module_exit(tcp_scalable_unregister); + +MODULE_AUTHOR("John Heffner"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Scalable TCP"); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c new file mode 100644 index 00000000..ecd44b0c --- /dev/null +++ b/net/ipv4/tcp_timer.c @@ -0,0 +1,602 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Authors: Ross Biro + * Fred N. 
van Kempen, + * Mark Evans, + * Corey Minyard + * Florian La Roche, + * Charles Hedrick, + * Linus Torvalds, + * Alan Cox, + * Matthew Dillon, + * Arnt Gulbrandsen, + * Jorge Cwik, + */ + +#include +#include +#include + +int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; +int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES; +int sysctl_tcp_keepalive_time __read_mostly = TCP_KEEPALIVE_TIME; +int sysctl_tcp_keepalive_probes __read_mostly = TCP_KEEPALIVE_PROBES; +int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL; +int sysctl_tcp_retries1 __read_mostly = TCP_RETR1; +int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; +int sysctl_tcp_orphan_retries __read_mostly; +int sysctl_tcp_thin_linear_timeouts __read_mostly; + +static void tcp_write_timer(unsigned long); +static void tcp_delack_timer(unsigned long); +static void tcp_keepalive_timer (unsigned long data); + +void tcp_init_xmit_timers(struct sock *sk) +{ + inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, + &tcp_keepalive_timer); +} +EXPORT_SYMBOL(tcp_init_xmit_timers); + +static void tcp_write_err(struct sock *sk) +{ + sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; + sk->sk_error_report(sk); + + tcp_done(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); +} + +/* Do not allow orphaned sockets to eat all our resources. + * This is direct violation of TCP specs, but it is required + * to prevent DoS attacks. It is called when a retransmission timeout + * or zero probe timeout occurs on orphaned socket. + * + * Criteria is still not confirmed experimentally and may change. + * We kill the socket, if: + * 1. If number of orphaned sockets exceeds an administratively configured + * limit. + * 2. If we have strong memory pressure. + */ +static int tcp_out_of_resources(struct sock *sk, int do_reset) +{ + struct tcp_sock *tp = tcp_sk(sk); + int shift = 0; + + /* If peer does not open window for long time, or did not transmit + * anything for long time, penalize it. */ + if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) + shift++; + + /* If some dubious ICMP arrived, penalize even more. */ + if (sk->sk_err_soft) + shift++; + + if (tcp_too_many_orphans(sk, shift)) { + if (net_ratelimit()) + printk(KERN_INFO "Out of socket memory\n"); + + /* Catch exceptional cases, when connection requires reset. + * 1. Last segment was sent recently. */ + if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || + /* 2. Window is closed. */ + (!tp->snd_wnd && !tp->packets_out)) + do_reset = 1; + if (do_reset) + tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_done(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); + return 1; + } + return 0; +} + +/* Calculate maximal number or retries on an orphaned socket. */ +static int tcp_orphan_retries(struct sock *sk, int alive) +{ + int retries = sysctl_tcp_orphan_retries; /* May be zero. */ + + /* We know from an ICMP that something is wrong. */ + if (sk->sk_err_soft && !alive) + retries = 0; + + /* However, if socket sent something recently, select some safe + * number of retries. 8 corresponds to >100 seconds with minimal + * RTO of 200msec. 
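+ * (With a 200 ms base and exponential backoff this matches + * retransmits_timed_out() below: ((2 << 8) - 1) * 0.2 s, i.e. about 102 s + * for the initial attempt plus 8 retries.)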
*/ + if (retries == 0 && alive) + retries = 8; + return retries; +} + +static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) +{ + /* Black hole detection */ + if (sysctl_tcp_mtu_probing) { + if (!icsk->icsk_mtup.enabled) { + icsk->icsk_mtup.enabled = 1; + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); + } else { + struct tcp_sock *tp = tcp_sk(sk); + int mss; + + mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; + mss = min(sysctl_tcp_base_mss, mss); + mss = max(mss, 68 - tp->tcp_header_len); + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); + } + } +} + +/* This function calculates a "timeout" which is equivalent to the timeout of a + * TCP connection after "boundary" unsuccessful, exponentially backed-off + * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if + * syn_set flag is set. + */ +static bool retransmits_timed_out(struct sock *sk, + unsigned int boundary, + unsigned int timeout, + bool syn_set) +{ + unsigned int linear_backoff_thresh, start_ts; + unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; + + if (!inet_csk(sk)->icsk_retransmits) + return false; + + if (unlikely(!tcp_sk(sk)->retrans_stamp)) + start_ts = TCP_SKB_CB(tcp_write_queue_head(sk))->when; + else + start_ts = tcp_sk(sk)->retrans_stamp; + + if (likely(timeout == 0)) { + linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); + + if (boundary <= linear_backoff_thresh) + timeout = ((2 << boundary) - 1) * rto_base; + else + timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + } + return (tcp_time_stamp - start_ts) >= timeout; +} + +/* A write timeout has occurred. Process the after effects. */ +static int tcp_write_timeout(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + int retry_until; + bool do_reset, syn_set = 0; + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + if (icsk->icsk_retransmits) + dst_negative_advice(sk); + retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; + syn_set = 1; + } else { + if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) { + /* Black hole detection */ + tcp_mtu_probing(icsk, sk); + + dst_negative_advice(sk); + } + + retry_until = sysctl_tcp_retries2; + if (sock_flag(sk, SOCK_DEAD)) { + const int alive = (icsk->icsk_rto < TCP_RTO_MAX); + + retry_until = tcp_orphan_retries(sk, alive); + do_reset = alive || + !retransmits_timed_out(sk, retry_until, 0, 0); + + if (tcp_out_of_resources(sk, do_reset)) + return 1; + } + } + + if (retransmits_timed_out(sk, retry_until, + syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) { + /* Has it gone just too far? */ + tcp_write_err(sk); + return 1; + } + return 0; +} + +static void tcp_delack_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. 
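+ * The socket is locked by process context, so mark the ACK as blocked, + * count LINUX_MIB_DELAYEDACKLOCKED and re-arm the timer TCP_DELACK_MIN + * (HZ/25, i.e. about 40 ms) from now.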
*/ + icsk->icsk_ack.blocked = 1; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); + sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN); + goto out_unlock; + } + + sk_mem_reclaim_partial(sk); + + if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) + goto out; + + if (time_after(icsk->icsk_ack.timeout, jiffies)) { + sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); + goto out; + } + icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; + + if (!skb_queue_empty(&tp->ucopy.prequeue)) { + struct sk_buff *skb; + + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED); + + while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) + sk_backlog_rcv(sk, skb); + + tp->ucopy.memory = 0; + } + + if (inet_csk_ack_scheduled(sk)) { + if (!icsk->icsk_ack.pingpong) { + /* Delayed ACK missed: inflate ATO. */ + icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto); + } else { + /* Delayed ACK missed: leave pingpong mode and + * deflate ATO. + */ + icsk->icsk_ack.pingpong = 0; + icsk->icsk_ack.ato = TCP_ATO_MIN; + } + tcp_send_ack(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS); + } + +out: + if (tcp_memory_pressure) + sk_mem_reclaim(sk); +out_unlock: + bh_unlock_sock(sk); + sock_put(sk); +} + +static void tcp_probe_timer(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + int max_probes; + + if (tp->packets_out || !tcp_send_head(sk)) { + icsk->icsk_probes_out = 0; + return; + } + + /* *WARNING* RFC 1122 forbids this + * + * It doesn't AFAIK, because we kill the retransmit timer -AK + * + * FIXME: We ought not to do it, Solaris 2.5 actually has fixing + * this behaviour in Solaris down as a bug fix. [AC] + * + * Let me to explain. icsk_probes_out is zeroed by incoming ACKs + * even if they advertise zero window. Hence, connection is killed only + * if we received no ACKs for normal connection timeout. It is not killed + * only because window stays zero for some time, window may be zero + * until armageddon and even later. We are in full accordance + * with RFCs, only probe timer combines both retransmission timeout + * and probe timeout in one bottle. --ANK + */ + max_probes = sysctl_tcp_retries2; + + if (sock_flag(sk, SOCK_DEAD)) { + const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX); + + max_probes = tcp_orphan_retries(sk, alive); + + if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes)) + return; + } + + if (icsk->icsk_probes_out > max_probes) { + tcp_write_err(sk); + } else { + /* Only send another probe if we didn't close things up. */ + tcp_send_probe0(sk); + } +} + +/* + * The TCP retransmit timer. + */ + +void tcp_retransmit_timer(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + if (!tp->packets_out) + goto out; + + WARN_ON(tcp_write_queue_empty(sk)); + + if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && + !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { + /* Receiver dastardly shrinks window. Our retransmits + * become zero probes, but we should not timeout this + * connection. If the socket is an orphan, time it out, + * we cannot allow such beasts to hang infinitely. 
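+ * (The check below only gives up once more than TCP_RTO_MAX (120 s) has + * passed since the last ACK from the peer, tracked in tp->rcv_tstamp; until + * then the retransmissions simply keep serving as zero-window probes.)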
+ */ +#ifdef TCP_DEBUG + struct inet_sock *inet = inet_sk(sk); + if (sk->sk_family == AF_INET) { + LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", + &inet->inet_daddr, ntohs(inet->inet_dport), + inet->inet_num, tp->snd_una, tp->snd_nxt); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", + &np->daddr, ntohs(inet->inet_dport), + inet->inet_num, tp->snd_una, tp->snd_nxt); + } +#endif +#endif + if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) { + tcp_write_err(sk); + goto out; + } + tcp_enter_loss(sk, 0); + tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); + __sk_dst_reset(sk); + goto out_reset_timer; + } + + if (tcp_write_timeout(sk)) + goto out; + + if (icsk->icsk_retransmits == 0) { + int mib_idx; + + if (icsk->icsk_ca_state == TCP_CA_Recovery) { + if (tcp_is_sack(tp)) + mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL; + else + mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL; + } else if (icsk->icsk_ca_state == TCP_CA_Loss) { + mib_idx = LINUX_MIB_TCPLOSSFAILURES; + } else if ((icsk->icsk_ca_state == TCP_CA_Disorder) || + tp->sacked_out) { + if (tcp_is_sack(tp)) + mib_idx = LINUX_MIB_TCPSACKFAILURES; + else + mib_idx = LINUX_MIB_TCPRENOFAILURES; + } else { + mib_idx = LINUX_MIB_TCPTIMEOUTS; + } + NET_INC_STATS_BH(sock_net(sk), mib_idx); + } + + if (tcp_use_frto(sk)) { + tcp_enter_frto(sk); + } else { + tcp_enter_loss(sk, 0); + } + + if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) { + /* Retransmission failed because of local congestion, + * do not backoff. + */ + if (!icsk->icsk_retransmits) + icsk->icsk_retransmits = 1; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), + TCP_RTO_MAX); + goto out; + } + + /* Increase the timeout each time we retransmit. Note that + * we do not increase the rtt estimate. rto is initialized + * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests + * that doubling rto each time is the least we can get away with. + * In KA9Q, Karn uses this for the first few times, and then + * goes to quadratic. netBSD doubles, but only goes up to *64, + * and clamps at 1 to 64 sec afterwards. Note that 120 sec is + * defined in the protocol as the maximum possible RTT. I guess + * we'll have to use something other than TCP to talk to the + * University of Mars. + * + * PAWS allows us longer timeouts and large windows, so once + * implemented ftp to mars will work nicely. We will have to fix + * the 120 second clamps though! + */ + icsk->icsk_backoff++; + icsk->icsk_retransmits++; + +out_reset_timer: + /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is + * used to reset timer, set to 0. Recalculate 'icsk_rto' as this + * might be increased if the stream oscillates between thin and thick, + * thus the old value might already be too high compared to the value + * set by 'tcp_set_rto' in tcp_input.c which resets the rto without + * backoff. 
Limit to TCP_THIN_LINEAR_RETRIES before initiating + * exponential backoff behaviour to avoid continue hammering + * linear-timeout retransmissions into a black hole + */ + if (sk->sk_state == TCP_ESTABLISHED && + (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) && + tcp_stream_is_thin(tp) && + icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { + icsk->icsk_backoff = 0; + icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX); + } else { + /* Use normal (exponential) backoff */ + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + } + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); + if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) + __sk_dst_reset(sk); + +out:; +} + +static void tcp_write_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + struct inet_connection_sock *icsk = inet_csk(sk); + int event; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later */ + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20)); + goto out_unlock; + } + + if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) + goto out; + + if (time_after(icsk->icsk_timeout, jiffies)) { + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); + goto out; + } + + event = icsk->icsk_pending; + icsk->icsk_pending = 0; + + switch (event) { + case ICSK_TIME_RETRANS: + tcp_retransmit_timer(sk); + break; + case ICSK_TIME_PROBE0: + tcp_probe_timer(sk); + break; + } + +out: + sk_mem_reclaim(sk); +out_unlock: + bh_unlock_sock(sk); + sock_put(sk); +} + +/* + * Timer for listening sockets + */ + +static void tcp_synack_timer(struct sock *sk) +{ + inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, + TCP_TIMEOUT_INIT, TCP_RTO_MAX); +} + +void tcp_syn_ack_timeout(struct sock *sk, struct request_sock *req) +{ + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEOUTS); +} +EXPORT_SYMBOL(tcp_syn_ack_timeout); + +void tcp_set_keepalive(struct sock *sk, int val) +{ + if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) + return; + + if (val && !sock_flag(sk, SOCK_KEEPOPEN)) + inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); + else if (!val) + inet_csk_delete_keepalive_timer(sk); +} + + +static void tcp_keepalive_timer (unsigned long data) +{ + struct sock *sk = (struct sock *) data; + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + u32 elapsed; + + /* Only process if socket is not in use. */ + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. */ + inet_csk_reset_keepalive_timer (sk, HZ/20); + goto out; + } + + if (sk->sk_state == TCP_LISTEN) { + tcp_synack_timer(sk); + goto out; + } + + if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { + if (tp->linger2 >= 0) { + const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; + + if (tmo > 0) { + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto out; + } + } + tcp_send_active_reset(sk, GFP_ATOMIC); + goto death; + } + + if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE) + goto out; + + elapsed = keepalive_time_when(tp); + + /* It is alive without keepalive 8) */ + if (tp->packets_out || tcp_send_head(sk)) + goto resched; + + elapsed = keepalive_time_elapsed(tp); + + if (elapsed >= keepalive_time_when(tp)) { + /* If the TCP_USER_TIMEOUT option is enabled, use that + * to determine when to timeout instead. 
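+ * With the defaults declared above (7200 s keepalive time, 75 s interval, + * 9 probes) an idle peer that never answers is reset roughly + * 7200 + 9 * 75 = 7875 s after the last activity.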
+ */ + if ((icsk->icsk_user_timeout != 0 && + elapsed >= icsk->icsk_user_timeout && + icsk->icsk_probes_out > 0) || + (icsk->icsk_user_timeout == 0 && + icsk->icsk_probes_out >= keepalive_probes(tp))) { + tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_write_err(sk); + goto out; + } + if (tcp_write_wakeup(sk) <= 0) { + icsk->icsk_probes_out++; + elapsed = keepalive_intvl_when(tp); + } else { + /* If keepalive was lost due to local congestion, + * try harder. + */ + elapsed = TCP_RESOURCE_PROBE_INTERVAL; + } + } else { + /* It is tp->rcv_tstamp + keepalive_time_when(tp) */ + elapsed = keepalive_time_when(tp) - elapsed; + } + + sk_mem_reclaim(sk); + +resched: + inet_csk_reset_keepalive_timer (sk, elapsed); + goto out; + +death: + tcp_done(sk); + +out: + bh_unlock_sock(sk); + sock_put(sk); +} diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c new file mode 100644 index 00000000..80fa2bfd --- /dev/null +++ b/net/ipv4/tcp_vegas.c @@ -0,0 +1,339 @@ +/* + * TCP Vegas congestion control + * + * This is based on the congestion detection/avoidance scheme described in + * Lawrence S. Brakmo and Larry L. Peterson. + * "TCP Vegas: End to end congestion avoidance on a global internet." + * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, + * October 1995. Available from: + * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps + * + * See http://www.cs.arizona.edu/xkernel/ for their implementation. + * The main aspects that distinguish this implementation from the + * Arizona Vegas implementation are: + * o We do not change the loss detection or recovery mechanisms of + * Linux in any way. Linux already recovers from losses quite well, + * using fine-grained timers, NewReno, and FACK. + * o To avoid the performance penalty imposed by increasing cwnd + * only every-other RTT during slow start, we increase during + * every RTT during slow start, just like Reno. + * o Largely to allow continuous cwnd growth during slow start, + * we use the rate at which ACKs come back as the "actual" + * rate, rather than the rate at which data is sent. + * o To speed convergence to the right rate, we set the cwnd + * to achieve the right ("actual") rate when we exit slow start. + * o To filter out the noise caused by delayed ACKs, we use the + * minimum RTT sample observed during the last RTT to calculate + * the actual rate. + * o When the sender re-starts from idle, it waits until it has + * received ACKs for an entire flight of new data before making + * a cwnd adjustment decision. The original Vegas implementation + * assumed senders never went idle. + */ + +#include +#include +#include +#include + +#include + +#include "tcp_vegas.h" + +static int alpha = 2; +static int beta = 4; +static int gamma = 1; + +module_param(alpha, int, 0644); +MODULE_PARM_DESC(alpha, "lower bound of packets in network"); +module_param(beta, int, 0644); +MODULE_PARM_DESC(beta, "upper bound of packets in network"); +module_param(gamma, int, 0644); +MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)"); + + +/* There are several situations when we must "re-start" Vegas: + * + * o when a connection is established + * o after an RTO + * o after fast recovery + * o when we send a packet and there is no outstanding + * unacknowledged data (restarting an idle connection) + * + * In these circumstances we cannot do a Vegas calculation at the + * end of the first RTT, because any calculation we do is using + * stale info -- both the saved cwnd and congestion feedback are + * stale. 
+ * + * Instead we must wait until the completion of an RTT during + * which we actually receive ACKs. + */ +static void vegas_enable(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct vegas *vegas = inet_csk_ca(sk); + + /* Begin taking Vegas samples next time we send something. */ + vegas->doing_vegas_now = 1; + + /* Set the beginning of the next send window. */ + vegas->beg_snd_nxt = tp->snd_nxt; + + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; +} + +/* Stop taking Vegas samples for now. */ +static inline void vegas_disable(struct sock *sk) +{ + struct vegas *vegas = inet_csk_ca(sk); + + vegas->doing_vegas_now = 0; +} + +void tcp_vegas_init(struct sock *sk) +{ + struct vegas *vegas = inet_csk_ca(sk); + + vegas->baseRTT = 0x7fffffff; + vegas_enable(sk); +} +EXPORT_SYMBOL_GPL(tcp_vegas_init); + +/* Do RTT sampling needed for Vegas. + * Basically we: + * o min-filter RTT samples from within an RTT to get the current + * propagation delay + queuing delay (we are min-filtering to try to + * avoid the effects of delayed ACKs) + * o min-filter RTT samples from a much longer window (forever for now) + * to find the propagation delay (baseRTT) + */ +void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us) +{ + struct vegas *vegas = inet_csk_ca(sk); + u32 vrtt; + + if (rtt_us < 0) + return; + + /* Never allow zero rtt or baseRTT */ + vrtt = rtt_us + 1; + + /* Filter to find propagation delay: */ + if (vrtt < vegas->baseRTT) + vegas->baseRTT = vrtt; + + /* Find the min RTT during the last RTT to find + * the current prop. delay + queuing delay: + */ + vegas->minRTT = min(vegas->minRTT, vrtt); + vegas->cntRTT++; +} +EXPORT_SYMBOL_GPL(tcp_vegas_pkts_acked); + +void tcp_vegas_state(struct sock *sk, u8 ca_state) +{ + + if (ca_state == TCP_CA_Open) + vegas_enable(sk); + else + vegas_disable(sk); +} +EXPORT_SYMBOL_GPL(tcp_vegas_state); + +/* + * If the connection is idle and we are restarting, + * then we don't want to do any Vegas calculations + * until we get fresh RTT samples. So when we + * restart, we reset our Vegas state to a clean + * slate. After we get acks for this flight of + * packets, _then_ we can make Vegas calculations + * again. + */ +void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + if (event == CA_EVENT_CWND_RESTART || + event == CA_EVENT_TX_START) + tcp_vegas_init(sk); +} +EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event); + +static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp) +{ + return min(tp->snd_ssthresh, tp->snd_cwnd-1); +} + +static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct vegas *vegas = inet_csk_ca(sk); + + if (!vegas->doing_vegas_now) { + tcp_reno_cong_avoid(sk, ack, in_flight); + return; + } + + if (after(ack, vegas->beg_snd_nxt)) { + /* Do the Vegas once-per-RTT cwnd adjustment. */ + + /* Save the extent of the current window so we can use this + * at the end of the next RTT. + */ + vegas->beg_snd_nxt = tp->snd_nxt; + + /* We do the Vegas calculations only if we got enough RTT + * samples that we can be reasonably sure that we got + * at least one RTT sample that wasn't from a delayed ACK. + * If we only had 2 samples total, + * then that means we're getting only 1 ACK per RTT, which + * means they're almost certainly delayed ACKs. + * If we have 3 samples, we should be OK. + */ + + if (vegas->cntRTT <= 2) { + /* We don't have enough RTT samples to do the Vegas + * calculation, so we'll behave like Reno. 
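tcp_vegas_pkts_acked() above keeps two minima: baseRTT, the smallest RTT ever seen (a proxy for the propagation delay), and minRTT, the smallest sample within the current RTT (which damps delayed-ACK noise). A standalone sketch of that filtering, with made-up microsecond samples:

#include <stdio.h>
#include <stdint.h>

struct vegas_rtt {
	uint32_t base_rtt;	/* min of all samples (usec)  */
	uint32_t min_rtt;	/* min within the current RTT */
	uint32_t cnt_rtt;	/* samples in the current RTT */
};

static void rtt_sample(struct vegas_rtt *v, int32_t rtt_us)
{
	if (rtt_us < 0)
		return;
	uint32_t vrtt = (uint32_t)rtt_us + 1;	/* never allow a zero RTT */
	if (vrtt < v->base_rtt)
		v->base_rtt = vrtt;
	if (vrtt < v->min_rtt)
		v->min_rtt = vrtt;
	v->cnt_rtt++;
}

int main(void)
{
	struct vegas_rtt v = { 0x7fffffff, 0x7fffffff, 0 };
	int32_t samples[] = { 41000, 39000, 45000, 38000 };
	for (int i = 0; i < 4; i++)
		rtt_sample(&v, samples[i]);
	printf("baseRTT=%u minRTT=%u cnt=%u\n", v.base_rtt, v.min_rtt, v.cnt_rtt);
	return 0;
}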
+ */ + tcp_reno_cong_avoid(sk, ack, in_flight); + } else { + u32 rtt, diff; + u64 target_cwnd; + + /* We have enough RTT samples, so, using the Vegas + * algorithm, we determine if we should increase or + * decrease cwnd, and by how much. + */ + + /* Pluck out the RTT we are using for the Vegas + * calculations. This is the min RTT seen during the + * last RTT. Taking the min filters out the effects + * of delayed ACKs, at the cost of noticing congestion + * a bit later. + */ + rtt = vegas->minRTT; + + /* Calculate the cwnd we should have, if we weren't + * going too fast. + * + * This is: + * (actual rate in segments) * baseRTT + */ + target_cwnd = tp->snd_cwnd * vegas->baseRTT / rtt; + + /* Calculate the difference between the window we had, + * and the window we would like to have. This quantity + * is the "Diff" from the Arizona Vegas papers. + */ + diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT; + + if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) { + /* Going too fast. Time to slow down + * and switch to congestion avoidance. + */ + + /* Set cwnd to match the actual rate + * exactly: + * cwnd = (actual rate) * baseRTT + * Then we add 1 because the integer + * truncation robs us of full link + * utilization. + */ + tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1); + tp->snd_ssthresh = tcp_vegas_ssthresh(tp); + + } else if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* Slow start. */ + tcp_slow_start(tp); + } else { + /* Congestion avoidance. */ + + /* Figure out where we would like cwnd + * to be. + */ + if (diff > beta) { + /* The old window was too fast, so + * we slow down. + */ + tp->snd_cwnd--; + tp->snd_ssthresh + = tcp_vegas_ssthresh(tp); + } else if (diff < alpha) { + /* We don't have enough extra packets + * in the network, so speed up. + */ + tp->snd_cwnd++; + } else { + /* Sending just as fast as we + * should be. + */ + } + } + + if (tp->snd_cwnd < 2) + tp->snd_cwnd = 2; + else if (tp->snd_cwnd > tp->snd_cwnd_clamp) + tp->snd_cwnd = tp->snd_cwnd_clamp; + + tp->snd_ssthresh = tcp_current_ssthresh(sk); + } + + /* Wipe the slate clean for the next RTT. */ + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; + } + /* Use normal slow start */ + else if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + +} + +/* Extract info for Tcp socket info provided via netlink. 
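The once-per-RTT block above is the heart of Vegas: target_cwnd is the window that would match the measured rate at baseRTT, and diff estimates how many extra segments are sitting in queues. A worked example with invented numbers (alpha and beta mirror the module parameters; the slow-start/gamma branch is omitted):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t cwnd = 40, base_rtt = 50000, rtt = 60000;	/* usec */
	uint32_t alpha = 2, beta = 4;

	uint64_t target_cwnd = (uint64_t)cwnd * base_rtt / rtt;	/* rate-matched cwnd */
	uint32_t diff = cwnd * (rtt - base_rtt) / base_rtt;	/* queued segments   */

	printf("target_cwnd=%llu diff=%u\n",
	       (unsigned long long)target_cwnd, diff);

	if (diff > beta)
		cwnd--;			/* too much queueing: back off by one */
	else if (diff < alpha)
		cwnd++;			/* too little in flight: grow by one  */
	/* else: keep cwnd where it is */

	printf("new cwnd=%u\n", cwnd);	/* diff = 40*10000/50000 = 8 > beta, so 39 */
	return 0;
}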
*/ +void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) +{ + const struct vegas *ca = inet_csk_ca(sk); + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct tcpvegas_info info = { + .tcpv_enabled = ca->doing_vegas_now, + .tcpv_rttcnt = ca->cntRTT, + .tcpv_rtt = ca->baseRTT, + .tcpv_minrtt = ca->minRTT, + }; + + nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); + } +} +EXPORT_SYMBOL_GPL(tcp_vegas_get_info); + +static struct tcp_congestion_ops tcp_vegas __read_mostly = { + .flags = TCP_CONG_RTT_STAMP, + .init = tcp_vegas_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_vegas_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + .pkts_acked = tcp_vegas_pkts_acked, + .set_state = tcp_vegas_state, + .cwnd_event = tcp_vegas_cwnd_event, + .get_info = tcp_vegas_get_info, + + .owner = THIS_MODULE, + .name = "vegas", +}; + +static int __init tcp_vegas_register(void) +{ + BUILD_BUG_ON(sizeof(struct vegas) > ICSK_CA_PRIV_SIZE); + tcp_register_congestion_control(&tcp_vegas); + return 0; +} + +static void __exit tcp_vegas_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_vegas); +} + +module_init(tcp_vegas_register); +module_exit(tcp_vegas_unregister); + +MODULE_AUTHOR("Stephen Hemminger"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Vegas"); diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h new file mode 100644 index 00000000..6c0eea2f --- /dev/null +++ b/net/ipv4/tcp_vegas.h @@ -0,0 +1,24 @@ +/* + * TCP Vegas congestion control interface + */ +#ifndef __TCP_VEGAS_H +#define __TCP_VEGAS_H 1 + +/* Vegas variables */ +struct vegas { + u32 beg_snd_nxt; /* right edge during last RTT */ + u32 beg_snd_una; /* left edge during last RTT */ + u32 beg_snd_cwnd; /* saves the size of the cwnd */ + u8 doing_vegas_now;/* if true, do vegas for this RTT */ + u16 cntRTT; /* # of RTTs measured within last RTT */ + u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ + u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ +}; + +extern void tcp_vegas_init(struct sock *sk); +extern void tcp_vegas_state(struct sock *sk, u8 ca_state); +extern void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us); +extern void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); +extern void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb); + +#endif /* __TCP_VEGAS_H */ diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c new file mode 100644 index 00000000..ac43cd74 --- /dev/null +++ b/net/ipv4/tcp_veno.c @@ -0,0 +1,234 @@ +/* + * TCP Veno congestion control + * + * This is based on the congestion detection/avoidance scheme described in + * C. P. Fu, S. C. Liew. + * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." + * IEEE Journal on Selected Areas in Communication, + * Feb. 2003. + * See http://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf + */ + +#include +#include +#include +#include + +#include + +/* Default values of the Veno variables, in fixed-point representation + * with V_PARAM_SHIFT bits to the right of the binary point. 
+ */ +#define V_PARAM_SHIFT 1 +static const int beta = 3 << V_PARAM_SHIFT; + +/* Veno variables */ +struct veno { + u8 doing_veno_now; /* if true, do veno for this rtt */ + u16 cntrtt; /* # of rtts measured within last rtt */ + u32 minrtt; /* min of rtts measured within last rtt (in usec) */ + u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */ + u32 inc; /* decide whether to increase cwnd */ + u32 diff; /* calculate the diff rate */ +}; + +/* There are several situations when we must "re-start" Veno: + * + * o when a connection is established + * o after an RTO + * o after fast recovery + * o when we send a packet and there is no outstanding + * unacknowledged data (restarting an idle connection) + * + */ +static inline void veno_enable(struct sock *sk) +{ + struct veno *veno = inet_csk_ca(sk); + + /* turn on Veno */ + veno->doing_veno_now = 1; + + veno->minrtt = 0x7fffffff; +} + +static inline void veno_disable(struct sock *sk) +{ + struct veno *veno = inet_csk_ca(sk); + + /* turn off Veno */ + veno->doing_veno_now = 0; +} + +static void tcp_veno_init(struct sock *sk) +{ + struct veno *veno = inet_csk_ca(sk); + + veno->basertt = 0x7fffffff; + veno->inc = 1; + veno_enable(sk); +} + +/* Do rtt sampling needed for Veno. */ +static void tcp_veno_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us) +{ + struct veno *veno = inet_csk_ca(sk); + u32 vrtt; + + if (rtt_us < 0) + return; + + /* Never allow zero rtt or baseRTT */ + vrtt = rtt_us + 1; + + /* Filter to find propagation delay: */ + if (vrtt < veno->basertt) + veno->basertt = vrtt; + + /* Find the min rtt during the last rtt to find + * the current prop. delay + queuing delay: + */ + veno->minrtt = min(veno->minrtt, vrtt); + veno->cntrtt++; +} + +static void tcp_veno_state(struct sock *sk, u8 ca_state) +{ + if (ca_state == TCP_CA_Open) + veno_enable(sk); + else + veno_disable(sk); +} + +/* + * If the connection is idle and we are restarting, + * then we don't want to do any Veno calculations + * until we get fresh rtt samples. So when we + * restart, we reset our Veno state to a clean + * state. After we get acks for this flight of + * packets, _then_ we can make Veno calculations + * again. + */ +static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) + tcp_veno_init(sk); +} + +static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct veno *veno = inet_csk_ca(sk); + + if (!veno->doing_veno_now) { + tcp_reno_cong_avoid(sk, ack, in_flight); + return; + } + + /* limited by applications */ + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + /* We do the Veno calculations only if we got enough rtt samples */ + if (veno->cntrtt <= 2) { + /* We don't have enough rtt samples to do the Veno + * calculation, so we'll behave like Reno. + */ + tcp_reno_cong_avoid(sk, ack, in_flight); + } else { + u64 target_cwnd; + u32 rtt; + + /* We have enough rtt samples, so, using the Veno + * algorithm, we determine the state of the network. + */ + + rtt = veno->minrtt; + + target_cwnd = (tp->snd_cwnd * veno->basertt); + target_cwnd <<= V_PARAM_SHIFT; + do_div(target_cwnd, rtt); + + veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* Slow start. */ + tcp_slow_start(tp); + } else { + /* Congestion avoidance. */ + if (veno->diff < beta) { + /* In the "non-congestive state", increase cwnd + * every rtt. 
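Veno keeps its backlog estimate in fixed point: target_cwnd and diff carry V_PARAM_SHIFT fractional bits so that the later comparison against beta (also shifted) compares like with like. A small sketch with illustrative RTT values:

#include <stdio.h>
#include <stdint.h>

#define V_PARAM_SHIFT 1
static const uint32_t beta = 3 << V_PARAM_SHIFT;	/* 3 segments, fixed point */

int main(void)
{
	uint32_t cwnd = 32, basertt = 40000, rtt = 48000;	/* usec */

	uint64_t target = ((uint64_t)cwnd * basertt) << V_PARAM_SHIFT;
	target /= rtt;					/* do_div() in the kernel code */

	uint32_t diff = (cwnd << V_PARAM_SHIFT) - (uint32_t)target;

	printf("diff=%u (%.1f segments), beta=%u\n",
	       diff, diff / (double)(1 << V_PARAM_SHIFT), beta);
	printf("state: %s\n", diff < beta ? "non-congestive" : "congestive");
	return 0;
}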
+ */ + tcp_cong_avoid_ai(tp, tp->snd_cwnd); + } else { + /* In the "congestive state", increase cwnd + * every other rtt. + */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (veno->inc && + tp->snd_cwnd < tp->snd_cwnd_clamp) { + tp->snd_cwnd++; + veno->inc = 0; + } else + veno->inc = 1; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } + + } + if (tp->snd_cwnd < 2) + tp->snd_cwnd = 2; + else if (tp->snd_cwnd > tp->snd_cwnd_clamp) + tp->snd_cwnd = tp->snd_cwnd_clamp; + } + /* Wipe the slate clean for the next rtt. */ + /* veno->cntrtt = 0; */ + veno->minrtt = 0x7fffffff; +} + +/* Veno MD phase */ +static u32 tcp_veno_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct veno *veno = inet_csk_ca(sk); + + if (veno->diff < beta) + /* in "non-congestive state", cut cwnd by 1/5 */ + return max(tp->snd_cwnd * 4 / 5, 2U); + else + /* in "congestive state", cut cwnd by 1/2 */ + return max(tp->snd_cwnd >> 1U, 2U); +} + +static struct tcp_congestion_ops tcp_veno __read_mostly = { + .flags = TCP_CONG_RTT_STAMP, + .init = tcp_veno_init, + .ssthresh = tcp_veno_ssthresh, + .cong_avoid = tcp_veno_cong_avoid, + .pkts_acked = tcp_veno_pkts_acked, + .set_state = tcp_veno_state, + .cwnd_event = tcp_veno_cwnd_event, + + .owner = THIS_MODULE, + .name = "veno", +}; + +static int __init tcp_veno_register(void) +{ + BUILD_BUG_ON(sizeof(struct veno) > ICSK_CA_PRIV_SIZE); + tcp_register_congestion_control(&tcp_veno); + return 0; +} + +static void __exit tcp_veno_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_veno); +} + +module_init(tcp_veno_register); +module_exit(tcp_veno_unregister); + +MODULE_AUTHOR("Bin Zhou, Cheng Peng Fu"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Veno"); diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c new file mode 100644 index 00000000..1b91bf48 --- /dev/null +++ b/net/ipv4/tcp_westwood.c @@ -0,0 +1,304 @@ +/* + * TCP Westwood+: end-to-end bandwidth estimation for TCP + * + * Angelo Dell'Aera: author of the first version of TCP Westwood+ in Linux 2.4 + * + * Support at http://c3lab.poliba.it/index.php/Westwood + * Main references in literature: + * + * - Mascolo S, Casetti, M. Gerla et al. + * "TCP Westwood: bandwidth estimation for TCP" Proc. ACM Mobicom 2001 + * + * - A. Grieco, s. Mascolo + * "Performance evaluation of New Reno, Vegas, Westwood+ TCP" ACM Computer + * Comm. Review, 2004 + * + * - A. Dell'Aera, L. Grieco, S. Mascolo. + * "Linux 2.4 Implementation of Westwood+ TCP with Rate-Halving : + * A Performance Evaluation Over the Internet" (ICC 2004), Paris, June 2004 + * + * Westwood+ employs end-to-end bandwidth measurement to set cwnd and + * ssthresh after packet loss. The probing phase is as the original Reno. + */ + +#include +#include +#include +#include +#include + +/* TCP Westwood structure */ +struct westwood { + u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */ + u32 bw_est; /* bandwidth estimate */ + u32 rtt_win_sx; /* here starts a new evaluation... */ + u32 bk; + u32 snd_una; /* used for evaluating the number of acked bytes */ + u32 cumul_ack; + u32 accounted; + u32 rtt; + u32 rtt_min; /* minimum observed RTT */ + u8 first_ack; /* flag which infers that this is the first ack */ + u8 reset_rtt_min; /* Reset RTT min to next RTT sample*/ +}; + + +/* TCP Westwood functions and constants */ +#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ +#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! 
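tcp_veno_ssthresh() above is where Veno differs most visibly from Reno: a loss that arrives while the backlog estimate is below beta is treated as probably random (for example a wireless error) and cwnd is cut by only one fifth, while a loss in the congestive state still halves it. A sketch of that choice:

#include <stdio.h>
#include <stdint.h>

static uint32_t veno_ssthresh(uint32_t cwnd, uint32_t diff, uint32_t beta)
{
	uint32_t ss = (diff < beta) ? cwnd * 4 / 5	/* non-congestive: mild cut */
				    : cwnd / 2;		/* congestive: halve        */
	return ss < 2 ? 2 : ss;
}

int main(void)
{
	printf("non-congestive: %u\n", veno_ssthresh(50, 2, 6));	/* 40 */
	printf("congestive:     %u\n", veno_ssthresh(50, 9, 6));	/* 25 */
	return 0;
}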
*/ + +/* + * @tcp_westwood_create + * This function initializes fields used in TCP Westwood+, + * it is called after the initial SYN, so the sequence numbers + * are correct but new passive connections we have no + * information about RTTmin at this time so we simply set it to + * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative + * since in this way we're sure it will be updated in a consistent + * way as soon as possible. It will reasonably happen within the first + * RTT period of the connection lifetime. + */ +static void tcp_westwood_init(struct sock *sk) +{ + struct westwood *w = inet_csk_ca(sk); + + w->bk = 0; + w->bw_ns_est = 0; + w->bw_est = 0; + w->accounted = 0; + w->cumul_ack = 0; + w->reset_rtt_min = 1; + w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; + w->rtt_win_sx = tcp_time_stamp; + w->snd_una = tcp_sk(sk)->snd_una; + w->first_ack = 1; +} + +/* + * @westwood_do_filter + * Low-pass filter. Implemented using constant coefficients. + */ +static inline u32 westwood_do_filter(u32 a, u32 b) +{ + return ((7 * a) + b) >> 3; +} + +static void westwood_filter(struct westwood *w, u32 delta) +{ + /* If the filter is empty fill it with the first sample of bandwidth */ + if (w->bw_ns_est == 0 && w->bw_est == 0) { + w->bw_ns_est = w->bk / delta; + w->bw_est = w->bw_ns_est; + } else { + w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); + w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); + } +} + +/* + * @westwood_pkts_acked + * Called after processing group of packets. + * but all westwood needs is the last sample of srtt. + */ +static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt, s32 rtt) +{ + struct westwood *w = inet_csk_ca(sk); + + if (rtt > 0) + w->rtt = usecs_to_jiffies(rtt); +} + +/* + * @westwood_update_window + * It updates RTT evaluation window if it is the right moment to do + * it. If so it calls filter for evaluating bandwidth. + */ +static void westwood_update_window(struct sock *sk) +{ + struct westwood *w = inet_csk_ca(sk); + s32 delta = tcp_time_stamp - w->rtt_win_sx; + + /* Initialize w->snd_una with the first acked sequence number in order + * to fix mismatch between tp->snd_una and w->snd_una for the first + * bandwidth sample + */ + if (w->first_ack) { + w->snd_una = tcp_sk(sk)->snd_una; + w->first_ack = 0; + } + + /* + * See if a RTT-window has passed. + * Be careful since if RTT is less than + * 50ms we don't filter but we continue 'building the sample'. + * This minimum limit was chosen since an estimation on small + * time intervals is better to avoid... + * Obviously on a LAN we reasonably will always have + * right_bound = left_bound + WESTWOOD_RTT_MIN + */ + if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) { + westwood_filter(w, delta); + + w->bk = 0; + w->rtt_win_sx = tcp_time_stamp; + } +} + +static inline void update_rtt_min(struct westwood *w) +{ + if (w->reset_rtt_min) { + w->rtt_min = w->rtt; + w->reset_rtt_min = 0; + } else + w->rtt_min = min(w->rtt, w->rtt_min); +} + + +/* + * @westwood_fast_bw + * It is called when we are in fast path. In particular it is called when + * header prediction is successful. In such case in fact update is + * straight forward and doesn't need any particular care. 
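westwood_filter() above turns each RTT-long window into a raw bandwidth sample (bk / delta) and smooths it twice with the constant-coefficient low-pass filter (7*old + new) / 8. The sketch below runs the same filter over a few invented samples:

#include <stdio.h>
#include <stdint.h>

static uint32_t lp_filter(uint32_t old, uint32_t sample)
{
	return (7 * old + sample) >> 3;		/* same shape as westwood_do_filter() */
}

int main(void)
{
	uint32_t bw_ns_est = 0, bw_est = 0;
	uint32_t samples[] = { 120, 130, 90, 140 };	/* bytes per jiffy, made up */

	for (int i = 0; i < 4; i++) {
		if (bw_ns_est == 0 && bw_est == 0) {
			bw_ns_est = samples[i];		/* first sample primes the filter */
			bw_est = bw_ns_est;
		} else {
			bw_ns_est = lp_filter(bw_ns_est, samples[i]);
			bw_est = lp_filter(bw_est, bw_ns_est);
		}
		printf("sample=%u bw_ns_est=%u bw_est=%u\n",
		       samples[i], bw_ns_est, bw_est);
	}
	return 0;
}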
+ */ +static inline void westwood_fast_bw(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct westwood *w = inet_csk_ca(sk); + + westwood_update_window(sk); + + w->bk += tp->snd_una - w->snd_una; + w->snd_una = tp->snd_una; + update_rtt_min(w); +} + +/* + * @westwood_acked_count + * This function evaluates cumul_ack for evaluating bk in case of + * delayed or partial acks. + */ +static inline u32 westwood_acked_count(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct westwood *w = inet_csk_ca(sk); + + w->cumul_ack = tp->snd_una - w->snd_una; + + /* If cumul_ack is 0 this is a dupack since it's not moving + * tp->snd_una. + */ + if (!w->cumul_ack) { + w->accounted += tp->mss_cache; + w->cumul_ack = tp->mss_cache; + } + + if (w->cumul_ack > tp->mss_cache) { + /* Partial or delayed ack */ + if (w->accounted >= w->cumul_ack) { + w->accounted -= w->cumul_ack; + w->cumul_ack = tp->mss_cache; + } else { + w->cumul_ack -= w->accounted; + w->accounted = 0; + } + } + + w->snd_una = tp->snd_una; + + return w->cumul_ack; +} + + +/* + * TCP Westwood + * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it + * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 + * so avoids ever returning 0. + */ +static u32 tcp_westwood_bw_rttmin(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct westwood *w = inet_csk_ca(sk); + return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); +} + +static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct westwood *w = inet_csk_ca(sk); + + switch (event) { + case CA_EVENT_FAST_ACK: + westwood_fast_bw(sk); + break; + + case CA_EVENT_COMPLETE_CWR: + tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); + break; + + case CA_EVENT_FRTO: + tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); + /* Update RTT_min when next ack arrives */ + w->reset_rtt_min = 1; + break; + + case CA_EVENT_SLOW_ACK: + westwood_update_window(sk); + w->bk += westwood_acked_count(sk); + update_rtt_min(w); + break; + + default: + /* don't care */ + break; + } +} + + +/* Extract info for Tcp socket info provided via netlink. 
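tcp_westwood_bw_rttmin() above is the payoff of the estimator: after a loss, ssthresh (and, on CWR completion, cwnd) is set to the estimated bandwidth-delay product in segments rather than to half the old window. The sketch below does the same arithmetic with explicit units (bytes per second and milliseconds), which differ from the kernel's internal jiffy-based units:

#include <stdio.h>
#include <stdint.h>

static uint32_t bw_rttmin_segments(uint32_t bw_est_bytes_per_s,
				   uint32_t rtt_min_ms,
				   uint32_t mss_bytes)
{
	uint64_t bdp = (uint64_t)bw_est_bytes_per_s * rtt_min_ms / 1000;
	uint32_t segs = (uint32_t)(bdp / mss_bytes);
	return segs < 2 ? 2 : segs;		/* floored at 2, as in the kernel */
}

int main(void)
{
	/* 12.5 MB/s path with a 40 ms minimum RTT and 1460-byte segments */
	printf("ssthresh = %u segments\n",
	       bw_rttmin_segments(12500000, 40, 1460));	/* about 342 */
	return 0;
}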
*/ +static void tcp_westwood_info(struct sock *sk, u32 ext, + struct sk_buff *skb) +{ + const struct westwood *ca = inet_csk_ca(sk); + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct tcpvegas_info info = { + .tcpv_enabled = 1, + .tcpv_rtt = jiffies_to_usecs(ca->rtt), + .tcpv_minrtt = jiffies_to_usecs(ca->rtt_min), + }; + + nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); + } +} + + +static struct tcp_congestion_ops tcp_westwood __read_mostly = { + .init = tcp_westwood_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .min_cwnd = tcp_westwood_bw_rttmin, + .cwnd_event = tcp_westwood_event, + .get_info = tcp_westwood_info, + .pkts_acked = tcp_westwood_pkts_acked, + + .owner = THIS_MODULE, + .name = "westwood" +}; + +static int __init tcp_westwood_register(void) +{ + BUILD_BUG_ON(sizeof(struct westwood) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_westwood); +} + +static void __exit tcp_westwood_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_westwood); +} + +module_init(tcp_westwood_register); +module_exit(tcp_westwood_unregister); + +MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Westwood+"); diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c new file mode 100644 index 00000000..05c3b6f0 --- /dev/null +++ b/net/ipv4/tcp_yeah.c @@ -0,0 +1,260 @@ +/* + * + * YeAH TCP + * + * For further details look at: + * http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf + * + */ +#include +#include +#include +#include + +#include + +#include "tcp_vegas.h" + +#define TCP_YEAH_ALPHA 80 //lin number of packets queued at the bottleneck +#define TCP_YEAH_GAMMA 1 //lin fraction of queue to be removed per rtt +#define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss +#define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion +#define TCP_YEAH_PHY 8 //lin maximum delta from base +#define TCP_YEAH_RHO 16 //lin minimum number of consecutive rtt to consider competition on loss +#define TCP_YEAH_ZETA 50 //lin minimum number of state switchs to reset reno_count + +#define TCP_SCALABLE_AI_CNT 100U + +/* YeAH variables */ +struct yeah { + struct vegas vegas; /* must be first */ + + /* YeAH */ + u32 lastQ; + u32 doing_reno_now; + + u32 reno_count; + u32 fast_count; + + u32 pkts_acked; +}; + +static void tcp_yeah_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct yeah *yeah = inet_csk_ca(sk); + + tcp_vegas_init(sk); + + yeah->doing_reno_now = 0; + yeah->lastQ = 0; + + yeah->reno_count = 2; + + /* Ensure the MD arithmetic works. This is somewhat pedantic, + * since I don't think we will see a cwnd this large. 
:) */ + tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); + +} + + +static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct yeah *yeah = inet_csk_ca(sk); + + if (icsk->icsk_ca_state == TCP_CA_Open) + yeah->pkts_acked = pkts_acked; + + tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us); +} + +static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct yeah *yeah = inet_csk_ca(sk); + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + + else if (!yeah->doing_reno_now) { + /* Scalable */ + + tp->snd_cwnd_cnt += yeah->pkts_acked; + if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } + + yeah->pkts_acked = 1; + + } else { + /* Reno */ + tcp_cong_avoid_ai(tp, tp->snd_cwnd); + } + + /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. + * + * These are so named because they represent the approximate values + * of snd_una and snd_nxt at the beginning of the current RTT. More + * precisely, they represent the amount of data sent during the RTT. + * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, + * we will calculate that (v_beg_snd_nxt - v_vegas.beg_snd_una) outstanding + * bytes of data have been ACKed during the course of the RTT, giving + * an "actual" rate of: + * + * (v_beg_snd_nxt - v_vegas.beg_snd_una) / (rtt duration) + * + * Unfortunately, v_vegas.beg_snd_una is not exactly equal to snd_una, + * because delayed ACKs can cover more than one segment, so they + * don't line up yeahly with the boundaries of RTTs. + * + * Another unfortunate fact of life is that delayed ACKs delay the + * advance of the left edge of our send window, so that the number + * of bytes we send in an RTT is often less than our cwnd will allow. + * So we keep track of our cwnd separately, in v_beg_snd_cwnd. + */ + + if (after(ack, yeah->vegas.beg_snd_nxt)) { + + /* We do the Vegas calculations only if we got enough RTT + * samples that we can be reasonably sure that we got + * at least one RTT sample that wasn't from a delayed ACK. + * If we only had 2 samples total, + * then that means we're getting only 1 ACK per RTT, which + * means they're almost certainly delayed ACKs. + * If we have 3 samples, we should be OK. + */ + + if (yeah->vegas.cntRTT > 2) { + u32 rtt, queue; + u64 bw; + + /* We have enough RTT samples, so, using the Vegas + * algorithm, we determine if we should increase or + * decrease cwnd, and by how much. + */ + + /* Pluck out the RTT we are using for the Vegas + * calculations. This is the min RTT seen during the + * last RTT. Taking the min filters out the effects + * of delayed ACKs, at the cost of noticing congestion + * a bit later. + */ + rtt = yeah->vegas.minRTT; + + /* Compute excess number of packets above bandwidth + * Avoid doing full 64 bit divide. 
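When YeAH is not in its Reno fallback, the block above grows cwnd Scalable-TCP style: acked packets accumulate in snd_cwnd_cnt and cwnd is bumped once the counter passes min(cwnd, TCP_SCALABLE_AI_CNT). A sketch of that counter (DEMO_SCALABLE_AI_CNT mirrors the 100-packet define):

#include <stdio.h>
#include <stdint.h>

#define DEMO_SCALABLE_AI_CNT 100u

static void scalable_ai(uint32_t *cwnd, uint32_t *cnt, uint32_t acked,
			uint32_t clamp)
{
	*cnt += acked;
	uint32_t step = *cwnd < DEMO_SCALABLE_AI_CNT ? *cwnd : DEMO_SCALABLE_AI_CNT;
	if (*cnt > step) {
		if (*cwnd < clamp)
			(*cwnd)++;
		*cnt = 0;
	}
}

int main(void)
{
	uint32_t cwnd = 300, cnt = 0;
	for (int i = 0; i < 10; i++)
		scalable_ai(&cwnd, &cnt, 30, 1000);	/* 30 packets acked per call */
	printf("cwnd=%u cnt=%u\n", cwnd, cnt);		/* grows roughly once per 100 acks */
	return 0;
}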
+ */ + bw = tp->snd_cwnd; + bw *= rtt - yeah->vegas.baseRTT; + do_div(bw, rtt); + queue = bw; + + if (queue > TCP_YEAH_ALPHA || + rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) { + if (queue > TCP_YEAH_ALPHA && + tp->snd_cwnd > yeah->reno_count) { + u32 reduction = min(queue / TCP_YEAH_GAMMA , + tp->snd_cwnd >> TCP_YEAH_EPSILON); + + tp->snd_cwnd -= reduction; + + tp->snd_cwnd = max(tp->snd_cwnd, + yeah->reno_count); + + tp->snd_ssthresh = tp->snd_cwnd; + } + + if (yeah->reno_count <= 2) + yeah->reno_count = max(tp->snd_cwnd>>1, 2U); + else + yeah->reno_count++; + + yeah->doing_reno_now = min(yeah->doing_reno_now + 1, + 0xffffffU); + } else { + yeah->fast_count++; + + if (yeah->fast_count > TCP_YEAH_ZETA) { + yeah->reno_count = 2; + yeah->fast_count = 0; + } + + yeah->doing_reno_now = 0; + } + + yeah->lastQ = queue; + + } + + /* Save the extent of the current window so we can use this + * at the end of the next RTT. + */ + yeah->vegas.beg_snd_una = yeah->vegas.beg_snd_nxt; + yeah->vegas.beg_snd_nxt = tp->snd_nxt; + yeah->vegas.beg_snd_cwnd = tp->snd_cwnd; + + /* Wipe the slate clean for the next RTT. */ + yeah->vegas.cntRTT = 0; + yeah->vegas.minRTT = 0x7fffffff; + } +} + +static u32 tcp_yeah_ssthresh(struct sock *sk) { + const struct tcp_sock *tp = tcp_sk(sk); + struct yeah *yeah = inet_csk_ca(sk); + u32 reduction; + + if (yeah->doing_reno_now < TCP_YEAH_RHO) { + reduction = yeah->lastQ; + + reduction = min( reduction, max(tp->snd_cwnd>>1, 2U) ); + + reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA); + } else + reduction = max(tp->snd_cwnd>>1, 2U); + + yeah->fast_count = 0; + yeah->reno_count = max(yeah->reno_count>>1, 2U); + + return tp->snd_cwnd - reduction; +} + +static struct tcp_congestion_ops tcp_yeah __read_mostly = { + .flags = TCP_CONG_RTT_STAMP, + .init = tcp_yeah_init, + .ssthresh = tcp_yeah_ssthresh, + .cong_avoid = tcp_yeah_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + .set_state = tcp_vegas_state, + .cwnd_event = tcp_vegas_cwnd_event, + .get_info = tcp_vegas_get_info, + .pkts_acked = tcp_yeah_pkts_acked, + + .owner = THIS_MODULE, + .name = "yeah", +}; + +static int __init tcp_yeah_register(void) +{ + BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE); + tcp_register_congestion_control(&tcp_yeah); + return 0; +} + +static void __exit tcp_yeah_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_yeah); +} + +module_init(tcp_yeah_register); +module_exit(tcp_yeah_unregister); + +MODULE_AUTHOR("Angelo P. Castellani"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("YeAH TCP"); diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c new file mode 100644 index 00000000..ac3b3ee4 --- /dev/null +++ b/net/ipv4/tunnel4.c @@ -0,0 +1,192 @@ +/* tunnel4.c: Generic IP tunnel transformer. + * + * Copyright (C) 2003 David S. Miller (davem@redhat.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly; +static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly; +static DEFINE_MUTEX(tunnel4_mutex); + +static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family) +{ + return (family == AF_INET) ? 
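The queue estimate computed above, queue = cwnd * (rtt - baseRTT) / rtt, drives YeAH's precautionary decongestion: it kicks in when the estimated backlog exceeds TCP_YEAH_ALPHA packets or the extra delay exceeds baseRTT / TCP_YEAH_PHY. A worked example with invented numbers (the DEMO_* constants mirror the defines):

#include <stdio.h>
#include <stdint.h>

#define DEMO_YEAH_ALPHA 80u
#define DEMO_YEAH_PHY    8u

int main(void)
{
	uint32_t cwnd = 500, base_rtt = 20000, rtt = 26000;	/* usec */

	uint64_t bw = (uint64_t)cwnd * (rtt - base_rtt);
	uint32_t queue = (uint32_t)(bw / rtt);			/* packets queued */

	int decongest = queue > DEMO_YEAH_ALPHA ||
			(rtt - base_rtt) > base_rtt / DEMO_YEAH_PHY;

	printf("queue=%u decongest=%d\n", queue, decongest);	/* 115, 1 */
	return 0;
}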
&tunnel4_handlers : &tunnel64_handlers; +} + +int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) +{ + struct xfrm_tunnel __rcu **pprev; + struct xfrm_tunnel *t; + + int ret = -EEXIST; + int priority = handler->priority; + + mutex_lock(&tunnel4_mutex); + + for (pprev = fam_handlers(family); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&tunnel4_mutex))) != NULL; + pprev = &t->next) { + if (t->priority > priority) + break; + if (t->priority == priority) + goto err; + } + + handler->next = *pprev; + rcu_assign_pointer(*pprev, handler); + + ret = 0; + +err: + mutex_unlock(&tunnel4_mutex); + + return ret; +} +EXPORT_SYMBOL(xfrm4_tunnel_register); + +int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) +{ + struct xfrm_tunnel __rcu **pprev; + struct xfrm_tunnel *t; + int ret = -ENOENT; + + mutex_lock(&tunnel4_mutex); + + for (pprev = fam_handlers(family); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&tunnel4_mutex))) != NULL; + pprev = &t->next) { + if (t == handler) { + *pprev = handler->next; + ret = 0; + break; + } + } + + mutex_unlock(&tunnel4_mutex); + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(xfrm4_tunnel_deregister); + +#define for_each_tunnel_rcu(head, handler) \ + for (handler = rcu_dereference(head); \ + handler != NULL; \ + handler = rcu_dereference(handler->next)) \ + +static int tunnel4_rcv(struct sk_buff *skb) +{ + struct xfrm_tunnel *handler; + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto drop; + + for_each_tunnel_rcu(tunnel4_handlers, handler) + if (!handler->handler(skb)) + return 0; + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + +drop: + kfree_skb(skb); + return 0; +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static int tunnel64_rcv(struct sk_buff *skb) +{ + struct xfrm_tunnel *handler; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto drop; + + for_each_tunnel_rcu(tunnel64_handlers, handler) + if (!handler->handler(skb)) + return 0; + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + +drop: + kfree_skb(skb); + return 0; +} +#endif + +static void tunnel4_err(struct sk_buff *skb, u32 info) +{ + struct xfrm_tunnel *handler; + + for_each_tunnel_rcu(tunnel4_handlers, handler) + if (!handler->err_handler(skb, info)) + break; +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static void tunnel64_err(struct sk_buff *skb, u32 info) +{ + struct xfrm_tunnel *handler; + + for_each_tunnel_rcu(tunnel64_handlers, handler) + if (!handler->err_handler(skb, info)) + break; +} +#endif + +static const struct net_protocol tunnel4_protocol = { + .handler = tunnel4_rcv, + .err_handler = tunnel4_err, + .no_policy = 1, + .netns_ok = 1, +}; + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static const struct net_protocol tunnel64_protocol = { + .handler = tunnel64_rcv, + .err_handler = tunnel64_err, + .no_policy = 1, + .netns_ok = 1, +}; +#endif + +static int __init tunnel4_init(void) +{ + if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) { + printk(KERN_ERR "tunnel4 init: can't add protocol\n"); + return -EAGAIN; + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) { + printk(KERN_ERR "tunnel64 init: can't add protocol\n"); + inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); + return -EAGAIN; + } +#endif + return 0; +} + +static void __exit tunnel4_fini(void) +{ +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if 
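xfrm4_tunnel_register() above keeps each family's handlers in a singly linked list sorted by ascending priority and refuses a second handler with the same priority (-EEXIST). The sketch below reproduces just that insertion logic on a plain userspace list, without the mutex/RCU protection the kernel needs:

#include <stdio.h>

struct demo_handler {
	int priority;
	struct demo_handler *next;
};

static int demo_register(struct demo_handler **head, struct demo_handler *h)
{
	struct demo_handler **pprev, *t;

	for (pprev = head; (t = *pprev) != NULL; pprev = &t->next) {
		if (t->priority > h->priority)
			break;			/* insert before higher priority */
		if (t->priority == h->priority)
			return -1;		/* -EEXIST in the kernel code    */
	}
	h->next = *pprev;
	*pprev = h;
	return 0;
}

int main(void)
{
	struct demo_handler *head = NULL;
	struct demo_handler a = { .priority = 2 }, b = { .priority = 1 },
			    c = { .priority = 2 };

	printf("%d %d %d\n", demo_register(&head, &a),
	       demo_register(&head, &b), demo_register(&head, &c));	/* 0 0 -1 */
	for (struct demo_handler *t = head; t; t = t->next)
		printf("prio %d\n", t->priority);			/* 1 then 2 */
	return 0;
}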
(inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6)) + printk(KERN_ERR "tunnel64 close: can't remove protocol\n"); +#endif + if (inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP)) + printk(KERN_ERR "tunnel4 close: can't remove protocol\n"); +} + +module_init(tunnel4_init); +module_exit(tunnel4_fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c new file mode 100644 index 00000000..198f75b7 --- /dev/null +++ b/net/ipv4/udp.c @@ -0,0 +1,2285 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The User Datagram Protocol (UDP). + * + * Authors: Ross Biro + * Fred N. van Kempen, + * Arnt Gulbrandsen, + * Alan Cox, + * Hirokazu Takahashi, + * + * Fixes: + * Alan Cox : verify_area() calls + * Alan Cox : stopped close while in use off icmp + * messages. Not a fix but a botch that + * for udp at least is 'valid'. + * Alan Cox : Fixed icmp handling properly + * Alan Cox : Correct error for oversized datagrams + * Alan Cox : Tidied select() semantics. + * Alan Cox : udp_err() fixed properly, also now + * select and read wake correctly on errors + * Alan Cox : udp_send verify_area moved to avoid mem leak + * Alan Cox : UDP can count its memory + * Alan Cox : send to an unknown connection causes + * an ECONNREFUSED off the icmp, but + * does NOT close. + * Alan Cox : Switched to new sk_buff handlers. No more backlog! + * Alan Cox : Using generic datagram code. Even smaller and the PEEK + * bug no longer crashes it. + * Fred Van Kempen : Net2e support for sk->broadcast. + * Alan Cox : Uses skb_free_datagram + * Alan Cox : Added get/set sockopt support. + * Alan Cox : Broadcasting without option set returns EACCES. + * Alan Cox : No wakeup calls. Instead we now use the callbacks. + * Alan Cox : Use ip_tos and ip_ttl + * Alan Cox : SNMP Mibs + * Alan Cox : MSG_DONTROUTE, and 0.0.0.0 support. + * Matt Dillon : UDP length checks. + * Alan Cox : Smarter af_inet used properly. + * Alan Cox : Use new kernel side addressing. + * Alan Cox : Incorrect return on truncated datagram receive. + * Arnt Gulbrandsen : New udp_send and stuff + * Alan Cox : Cache last socket + * Alan Cox : Route cache + * Jon Peatfield : Minor efficiency fix to sendto(). + * Mike Shaver : RFC1122 checks. + * Alan Cox : Nonblocking error fix. + * Willy Konynenberg : Transparent proxying support. + * Mike McLagan : Routing by source + * David S. Miller : New socket lookup architecture. + * Last socket cache retained as it + * does have a high hit rate. + * Olaf Kirch : Don't linearise iovec on sendmsg. + * Andi Kleen : Some cleanups, cache destination entry + * for connect. + * Vitaly E. Lavrov : Transparent proxy revived after year coma. + * Melvin Smith : Check msg_name not msg_namelen in sendto(), + * return ENOTCONN for unconnected sockets (POSIX) + * Janos Farkas : don't deliver multi/broadcasts to a different + * bound-to-device socket + * Hirokazu Takahashi : HW checksumming for outgoing UDP + * datagrams. + * Hirokazu Takahashi : sendfile() on UDP works now. + * Arnaldo C. Melo : convert /proc/net/udp to seq_file + * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which + * Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind + * a single port at the same time. + * Derek Atkins : Add Encapulation Support + * James Chapman : Add L2TP encapsulation type. 
+ * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "udp_impl.h" + +struct udp_table udp_table __read_mostly; +EXPORT_SYMBOL(udp_table); + +long sysctl_udp_mem[3] __read_mostly; +EXPORT_SYMBOL(sysctl_udp_mem); + +int sysctl_udp_rmem_min __read_mostly; +EXPORT_SYMBOL(sysctl_udp_rmem_min); + +int sysctl_udp_wmem_min __read_mostly; +EXPORT_SYMBOL(sysctl_udp_wmem_min); + +atomic_long_t udp_memory_allocated; +EXPORT_SYMBOL(udp_memory_allocated); + +#define MAX_UDP_PORTS 65536 +#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) + +static int udp_lib_lport_inuse(struct net *net, __u16 num, + const struct udp_hslot *hslot, + unsigned long *bitmap, + struct sock *sk, + int (*saddr_comp)(const struct sock *sk1, + const struct sock *sk2), + unsigned int log) +{ + struct sock *sk2; + struct hlist_nulls_node *node; + + sk_nulls_for_each(sk2, node, &hslot->head) + if (net_eq(sock_net(sk2), net) && + sk2 != sk && + (bitmap || udp_sk(sk2)->udp_port_hash == num) && + (!sk2->sk_reuse || !sk->sk_reuse) && + (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + (*saddr_comp)(sk, sk2)) { + if (bitmap) + __set_bit(udp_sk(sk2)->udp_port_hash >> log, + bitmap); + else + return 1; + } + return 0; +} + +/* + * Note: we still hold spinlock of primary hash chain, so no other writer + * can insert/delete a socket with local_port == num + */ +static int udp_lib_lport_inuse2(struct net *net, __u16 num, + struct udp_hslot *hslot2, + struct sock *sk, + int (*saddr_comp)(const struct sock *sk1, + const struct sock *sk2)) +{ + struct sock *sk2; + struct hlist_nulls_node *node; + int res = 0; + + spin_lock(&hslot2->lock); + udp_portaddr_for_each_entry(sk2, node, &hslot2->head) + if (net_eq(sock_net(sk2), net) && + sk2 != sk && + (udp_sk(sk2)->udp_port_hash == num) && + (!sk2->sk_reuse || !sk->sk_reuse) && + (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + (*saddr_comp)(sk, sk2)) { + res = 1; + break; + } + spin_unlock(&hslot2->lock); + return res; +} + +/** + * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6 + * + * @sk: socket struct in question + * @snum: port number to look up + * @saddr_comp: AF-dependent comparison of bound local IP addresses + * @hash2_nulladdr: AF-dependent hash value in secondary hash chains, + * with NULL address + */ +int udp_lib_get_port(struct sock *sk, unsigned short snum, + int (*saddr_comp)(const struct sock *sk1, + const struct sock *sk2), + unsigned int hash2_nulladdr) +{ + struct udp_hslot *hslot, *hslot2; + struct udp_table *udptable = sk->sk_prot->h.udp_table; + int error = 1; + struct net *net = sock_net(sk); + + if (!snum) { + int low, high, remaining; + unsigned rand; + unsigned short first, last; + DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); + + inet_get_local_port_range(&low, &high); + remaining = (high - low) + 1; + + rand = net_random(); + first = (((u64)rand * remaining) >> 32) + low; + /* + * force rand to be an odd multiple of UDP_HTABLE_SIZE + */ + rand = (rand 
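udp_lib_lport_inuse() above decides whether a candidate port clashes with an already-bound socket: same port, not both SO_REUSEADDR, not bound to different devices, and local addresses that overlap according to the address comparator. A simplified predicate capturing those conditions (the struct and field names are local to this demo):

#include <stdio.h>
#include <stdint.h>

struct demo_sock {
	uint16_t port;
	uint32_t rcv_saddr;	/* 0 = wildcard */
	int	 reuse;		/* SO_REUSEADDR */
	int	 bound_dev_if;	/* 0 = any      */
};

static int ports_conflict(const struct demo_sock *a, const struct demo_sock *b)
{
	if (a->port != b->port)
		return 0;
	if (a->reuse && b->reuse)
		return 0;
	if (a->bound_dev_if && b->bound_dev_if &&
	    a->bound_dev_if != b->bound_dev_if)
		return 0;
	/* a wildcard (0) local address conflicts with everything on the port */
	if (a->rcv_saddr && b->rcv_saddr && a->rcv_saddr != b->rcv_saddr)
		return 0;
	return 1;
}

int main(void)
{
	struct demo_sock a = { 5353, 0x7f000001, 0, 0 };
	struct demo_sock b = { 5353, 0x7f000002, 0, 0 };
	struct demo_sock c = { 5353, 0,          0, 0 };
	printf("%d %d\n", ports_conflict(&a, &b), ports_conflict(&a, &c));	/* 0 1 */
	return 0;
}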
| 1) * (udptable->mask + 1); + last = first + udptable->mask + 1; + do { + hslot = udp_hashslot(udptable, net, first); + bitmap_zero(bitmap, PORTS_PER_CHAIN); + spin_lock_bh(&hslot->lock); + udp_lib_lport_inuse(net, snum, hslot, bitmap, sk, + saddr_comp, udptable->log); + + snum = first; + /* + * Iterate on all possible values of snum for this hash. + * Using steps of an odd multiple of UDP_HTABLE_SIZE + * give us randomization and full range coverage. + */ + do { + if (low <= snum && snum <= high && + !test_bit(snum >> udptable->log, bitmap) && + !inet_is_reserved_local_port(snum)) + goto found; + snum += rand; + } while (snum != first); + spin_unlock_bh(&hslot->lock); + } while (++first != last); + goto fail; + } else { + hslot = udp_hashslot(udptable, net, snum); + spin_lock_bh(&hslot->lock); + if (hslot->count > 10) { + int exist; + unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum; + + slot2 &= udptable->mask; + hash2_nulladdr &= udptable->mask; + + hslot2 = udp_hashslot2(udptable, slot2); + if (hslot->count < hslot2->count) + goto scan_primary_hash; + + exist = udp_lib_lport_inuse2(net, snum, hslot2, + sk, saddr_comp); + if (!exist && (hash2_nulladdr != slot2)) { + hslot2 = udp_hashslot2(udptable, hash2_nulladdr); + exist = udp_lib_lport_inuse2(net, snum, hslot2, + sk, saddr_comp); + } + if (exist) + goto fail_unlock; + else + goto found; + } +scan_primary_hash: + if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, + saddr_comp, 0)) + goto fail_unlock; + } +found: + inet_sk(sk)->inet_num = snum; + udp_sk(sk)->udp_port_hash = snum; + udp_sk(sk)->udp_portaddr_hash ^= snum; + if (sk_unhashed(sk)) { + sk_nulls_add_node_rcu(sk, &hslot->head); + hslot->count++; + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + + hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); + spin_lock(&hslot2->lock); + hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + &hslot2->head); + hslot2->count++; + spin_unlock(&hslot2->lock); + } + error = 0; +fail_unlock: + spin_unlock_bh(&hslot->lock); +fail: + return error; +} +EXPORT_SYMBOL(udp_lib_get_port); + +static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) +{ + struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); + + return (!ipv6_only_sock(sk2) && + (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr || + inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)); +} + +static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr, + unsigned int port) +{ + return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port; +} + +int udp_v4_get_port(struct sock *sk, unsigned short snum) +{ + unsigned int hash2_nulladdr = + udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum); + unsigned int hash2_partial = + udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0); + + /* precompute partial secondary hash */ + udp_sk(sk)->udp_portaddr_hash = hash2_partial; + return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr); +} + +static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, + unsigned short hnum, + __be16 sport, __be32 daddr, __be16 dport, int dif) +{ + int score = -1; + + if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum && + !ipv6_only_sock(sk)) { + struct inet_sock *inet = inet_sk(sk); + + score = (sk->sk_family == PF_INET ? 
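The port scan above relies on a small number-theory trick: the stride is forced to be an odd multiple of the hash-table size, so successive candidates all fall into the same hash slot and, because the port space is a power of two, every port in that slot is visited exactly once before the walk wraps back to the start. The sketch below checks that property with a 256-entry demo table:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint32_t table_size = 256;	/* udptable->mask + 1 (demo value) */
	uint32_t rand = 0x3d4f;			/* any pseudo-random value         */
	uint16_t first = (uint16_t)rand;

	rand = (rand | 1) * table_size;		/* odd multiple of the table size  */

	uint32_t visited = 0;
	uint16_t snum = first;
	do {
		visited++;			/* candidate port for this slot */
		snum += rand;			/* 16-bit wrap-around           */
	} while (snum != first);

	printf("visited %u of %u ports in the slot\n",
	       visited, 65536 / table_size);	/* prints 256 of 256 */
	return 0;
}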
1 : 0); + if (inet->inet_rcv_saddr) { + if (inet->inet_rcv_saddr != daddr) + return -1; + score += 2; + } + if (inet->inet_daddr) { + if (inet->inet_daddr != saddr) + return -1; + score += 2; + } + if (inet->inet_dport) { + if (inet->inet_dport != sport) + return -1; + score += 2; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score += 2; + } + } + return score; +} + +/* + * In this second variant, we check (daddr, dport) matches (inet_rcv_sadd, inet_num) + */ +#define SCORE2_MAX (1 + 2 + 2 + 2) +static inline int compute_score2(struct sock *sk, struct net *net, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned int hnum, int dif) +{ + int score = -1; + + if (net_eq(sock_net(sk), net) && !ipv6_only_sock(sk)) { + struct inet_sock *inet = inet_sk(sk); + + if (inet->inet_rcv_saddr != daddr) + return -1; + if (inet->inet_num != hnum) + return -1; + + score = (sk->sk_family == PF_INET ? 1 : 0); + if (inet->inet_daddr) { + if (inet->inet_daddr != saddr) + return -1; + score += 2; + } + if (inet->inet_dport) { + if (inet->inet_dport != sport) + return -1; + score += 2; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score += 2; + } + } + return score; +} + + +/* called with read_rcu_lock() */ +static struct sock *udp4_lib_lookup2(struct net *net, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned int hnum, int dif, + struct udp_hslot *hslot2, unsigned int slot2) +{ + struct sock *sk, *result; + struct hlist_nulls_node *node; + int score, badness; + +begin: + result = NULL; + badness = -1; + udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { + score = compute_score2(sk, net, saddr, sport, + daddr, hnum, dif); + if (score > badness) { + result = sk; + badness = score; + if (score == SCORE2_MAX) + goto exact_match; + } + } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != slot2) + goto begin; + + if (result) { +exact_match: + if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) + result = NULL; + else if (unlikely(compute_score2(result, net, saddr, sport, + daddr, hnum, dif) < badness)) { + sock_put(result); + goto begin; + } + } + return result; +} + +/* UDP is nearly always wildcards out the wazoo, it makes no sense to try + * harder than this. 
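compute_score() above ranks candidate sockets for an incoming datagram: an IPv4 socket starts at 1 and earns +2 for each bound or connected field (local address, remote address, remote port, device) that matches the packet, while a bound field that mismatches disqualifies it. A standalone sketch showing why a connected socket beats a wildcard bind:

#include <stdio.h>
#include <stdint.h>

struct demo_udp_sock {
	uint32_t rcv_saddr, daddr;	/* 0 = unbound / unconnected */
	uint16_t dport;
	int	 bound_dev_if;
};

static int compute_score_demo(const struct demo_udp_sock *sk,
			      uint32_t saddr, uint16_t sport,
			      uint32_t daddr, int dif)
{
	int score = 1;						/* AF_INET socket */

	if (sk->rcv_saddr)    { if (sk->rcv_saddr != daddr)    return -1; score += 2; }
	if (sk->daddr)        { if (sk->daddr != saddr)        return -1; score += 2; }
	if (sk->dport)        { if (sk->dport != sport)        return -1; score += 2; }
	if (sk->bound_dev_if) { if (sk->bound_dev_if != dif)   return -1; score += 2; }
	return score;
}

int main(void)
{
	struct demo_udp_sock bound_any = { 0, 0, 0, 0 };
	struct demo_udp_sock connected = { 0x0a000001, 0x0a000002, 4000, 0 };

	/* packet from 10.0.0.2:4000 to 10.0.0.1, arriving on ifindex 3 */
	printf("any=%d connected=%d\n",
	       compute_score_demo(&bound_any, 0x0a000002, 4000, 0x0a000001, 3),
	       compute_score_demo(&connected, 0x0a000002, 4000, 0x0a000001, 3));
	/* prints any=1 connected=7, so the connected socket wins */
	return 0;
}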
-DaveM + */ +static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, + __be16 sport, __be32 daddr, __be16 dport, + int dif, struct udp_table *udptable) +{ + struct sock *sk, *result; + struct hlist_nulls_node *node; + unsigned short hnum = ntohs(dport); + unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); + struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; + int score, badness; + + rcu_read_lock(); + if (hslot->count > 10) { + hash2 = udp4_portaddr_hash(net, daddr, hnum); + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; + if (hslot->count < hslot2->count) + goto begin; + + result = udp4_lib_lookup2(net, saddr, sport, + daddr, hnum, dif, + hslot2, slot2); + if (!result) { + hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum); + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; + if (hslot->count < hslot2->count) + goto begin; + + result = udp4_lib_lookup2(net, saddr, sport, + htonl(INADDR_ANY), hnum, dif, + hslot2, slot2); + } + rcu_read_unlock(); + return result; + } +begin: + result = NULL; + badness = -1; + sk_nulls_for_each_rcu(sk, node, &hslot->head) { + score = compute_score(sk, net, saddr, hnum, sport, + daddr, dport, dif); + if (score > badness) { + result = sk; + badness = score; + } + } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != slot) + goto begin; + + if (result) { + if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) + result = NULL; + else if (unlikely(compute_score(result, net, saddr, hnum, sport, + daddr, dport, dif) < badness)) { + sock_put(result); + goto begin; + } + } + rcu_read_unlock(); + return result; +} + +static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, + __be16 sport, __be16 dport, + struct udp_table *udptable) +{ + struct sock *sk; + const struct iphdr *iph = ip_hdr(skb); + + if (unlikely(sk = skb_steal_sock(skb))) + return sk; + else + return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport, + iph->daddr, dport, inet_iif(skb), + udptable); +} + +struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, + __be32 daddr, __be16 dport, int dif) +{ + return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table); +} +EXPORT_SYMBOL_GPL(udp4_lib_lookup); + +static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk, + __be16 loc_port, __be32 loc_addr, + __be16 rmt_port, __be32 rmt_addr, + int dif) +{ + struct hlist_nulls_node *node; + struct sock *s = sk; + unsigned short hnum = ntohs(loc_port); + + sk_nulls_for_each_from(s, node) { + struct inet_sock *inet = inet_sk(s); + + if (!net_eq(sock_net(s), net) || + udp_sk(s)->udp_port_hash != hnum || + (inet->inet_daddr && inet->inet_daddr != rmt_addr) || + (inet->inet_dport != rmt_port && inet->inet_dport) || + (inet->inet_rcv_saddr && + inet->inet_rcv_saddr != loc_addr) || + ipv6_only_sock(s) || + (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)) + continue; + if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif)) + continue; + goto found; + } + s = NULL; +found: + return s; +} + +/* + * This routine is called by the ICMP module when it gets some + * sort of error condition. If err < 0 then the socket should + * be closed and the error returned to the user. If err > 0 + * it's just the icmp type << 8 | icmp code. + * Header points to the ip header of the error packet. 
We move + * on past this. Then (as it used to claim before adjustment) + * header points to the first 8 bytes of the udp header. We need + * to find the appropriate port. + */ + +void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) +{ + struct inet_sock *inet; + const struct iphdr *iph = (const struct iphdr *)skb->data; + struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2)); + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; + struct sock *sk; + int harderr; + int err; + struct net *net = dev_net(skb->dev); + + sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, + iph->saddr, uh->source, skb->dev->ifindex, udptable); + if (sk == NULL) { + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + return; /* No socket for error */ + } + + err = 0; + harderr = 0; + inet = inet_sk(sk); + + switch (type) { + default: + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + case ICMP_SOURCE_QUENCH: + goto out; + case ICMP_PARAMETERPROB: + err = EPROTO; + harderr = 1; + break; + case ICMP_DEST_UNREACH: + if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ + if (inet->pmtudisc != IP_PMTUDISC_DONT) { + err = EMSGSIZE; + harderr = 1; + break; + } + goto out; + } + err = EHOSTUNREACH; + if (code <= NR_ICMP_UNREACH) { + harderr = icmp_err_convert[code].fatal; + err = icmp_err_convert[code].errno; + } + break; + } + + /* + * RFC1122: OK. Passes ICMP errors back to application, as per + * 4.1.3.3. + */ + if (!inet->recverr) { + if (!harderr || sk->sk_state != TCP_ESTABLISHED) + goto out; + } else + ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1)); + + sk->sk_err = err; + sk->sk_error_report(sk); +out: + sock_put(sk); +} + +void udp_err(struct sk_buff *skb, u32 info) +{ + __udp4_lib_err(skb, info, &udp_table); +} + +/* + * Throw away all pending data and cancel the corking. Socket is locked. + */ +void udp_flush_pending_frames(struct sock *sk) +{ + struct udp_sock *up = udp_sk(sk); + + if (up->pending) { + up->len = 0; + up->pending = 0; + ip_flush_pending_frames(sk); + } +} +EXPORT_SYMBOL(udp_flush_pending_frames); + +/** + * udp4_hwcsum - handle outgoing HW checksumming + * @skb: sk_buff containing the filled-in UDP header + * (checksum field must be zeroed out) + * @src: source IP address + * @dst: destination IP address + */ +static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) +{ + struct udphdr *uh = udp_hdr(skb); + struct sk_buff *frags = skb_shinfo(skb)->frag_list; + int offset = skb_transport_offset(skb); + int len = skb->len - offset; + int hlen = len; + __wsum csum = 0; + + if (!frags) { + /* + * Only one fragment on the socket. 
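__udp4_lib_err() above folds the ICMP type and code into an errno before deciding whether to report it (immediately for hard errors or with IP_RECVERR set, otherwise only on connected sockets). A compact sketch of that mapping; the ICMP numbers are the RFC 792 values, defined locally rather than pulled from a system header:

#include <stdio.h>
#include <errno.h>

/* RFC 792 type/code values, defined locally for the demo */
#define DEMO_ICMP_DEST_UNREACH	 3
#define DEMO_ICMP_SOURCE_QUENCH	 4
#define DEMO_ICMP_TIME_EXCEEDED	 11
#define DEMO_ICMP_PARAMETERPROB	 12
#define DEMO_ICMP_FRAG_NEEDED	 4	/* code under DEST_UNREACH */

static int icmp_to_errno(int type, int code, int pmtu_discovery_on)
{
	switch (type) {
	case DEMO_ICMP_SOURCE_QUENCH:
		return 0;				/* silently ignored */
	case DEMO_ICMP_PARAMETERPROB:
		return EPROTO;				/* hard error       */
	case DEMO_ICMP_DEST_UNREACH:
		if (code == DEMO_ICMP_FRAG_NEEDED)
			return pmtu_discovery_on ? EMSGSIZE : 0;
		return EHOSTUNREACH;			/* kernel refines this via icmp_err_convert */
	case DEMO_ICMP_TIME_EXCEEDED:
	default:
		return EHOSTUNREACH;
	}
}

int main(void)
{
	printf("%d %d\n",
	       icmp_to_errno(DEMO_ICMP_DEST_UNREACH, DEMO_ICMP_FRAG_NEEDED, 1),
	       icmp_to_errno(DEMO_ICMP_PARAMETERPROB, 0, 1));
	return 0;
}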
+ */ + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + uh->check = ~csum_tcpudp_magic(src, dst, len, + IPPROTO_UDP, 0); + } else { + /* + * HW-checksum won't work as there are two or more + * fragments on the socket so that all csums of sk_buffs + * should be together + */ + do { + csum = csum_add(csum, frags->csum); + hlen -= frags->len; + } while ((frags = frags->next)); + + csum = skb_checksum(skb, offset, hlen, csum); + skb->ip_summed = CHECKSUM_NONE; + + uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } +} + +static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) +{ + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(sk); + struct udphdr *uh; + int err = 0; + int is_udplite = IS_UDPLITE(sk); + int offset = skb_transport_offset(skb); + int len = skb->len - offset; + __wsum csum = 0; + + /* + * Create a UDP header + */ + uh = udp_hdr(skb); + uh->source = inet->inet_sport; + uh->dest = fl4->fl4_dport; + uh->len = htons(len); + uh->check = 0; + + if (is_udplite) /* UDP-Lite */ + csum = udplite_csum(skb); + + else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */ + + skb->ip_summed = CHECKSUM_NONE; + goto send; + + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ + + udp4_hwcsum(skb, fl4->saddr, fl4->daddr); + goto send; + + } else + csum = udp_csum(skb); + + /* add protocol-dependent pseudo-header */ + uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len, + sk->sk_protocol, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + +send: + err = ip_send_skb(skb); + if (err) { + if (err == -ENOBUFS && !inet->recverr) { + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_SNDBUFERRORS, is_udplite); + err = 0; + } + } else + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_OUTDATAGRAMS, is_udplite); + return err; +} + +/* + * Push out all pending data as one UDP datagram. Socket is locked. + */ +static int udp_push_pending_frames(struct sock *sk) +{ + struct udp_sock *up = udp_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct flowi4 *fl4 = &inet->cork.fl.u.ip4; + struct sk_buff *skb; + int err = 0; + + skb = ip_finish_skb(sk, fl4); + if (!skb) + goto out; + + err = udp_send_skb(skb, fl4); + +out: + up->len = 0; + up->pending = 0; + return err; +} + +int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len) +{ + struct inet_sock *inet = inet_sk(sk); + struct udp_sock *up = udp_sk(sk); + struct flowi4 fl4_stack; + struct flowi4 *fl4; + int ulen = len; + struct ipcm_cookie ipc; + struct rtable *rt = NULL; + int free = 0; + int connected = 0; + __be32 daddr, faddr, saddr; + __be16 dport; + u8 tos; + int err, is_udplite = IS_UDPLITE(sk); + int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; + int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); + struct sk_buff *skb; + struct ip_options_data opt_copy; + + if (len > 0xFFFF) + return -EMSGSIZE; + + /* + * Check the flags. + */ + + if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */ + return -EOPNOTSUPP; + + ipc.opt = NULL; + ipc.tx_flags = 0; + + getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; + + fl4 = &inet->cork.fl.u.ip4; + if (up->pending) { + /* + * There are pending frames. + * The socket lock must be held while it's corked. 
+ */ + lock_sock(sk); + if (likely(up->pending)) { + if (unlikely(up->pending != AF_INET)) { + release_sock(sk); + return -EINVAL; + } + goto do_append_data; + } + release_sock(sk); + } + ulen += sizeof(struct udphdr); + + /* + * Get and verify the address. + */ + if (msg->msg_name) { + struct sockaddr_in * usin = (struct sockaddr_in *)msg->msg_name; + if (msg->msg_namelen < sizeof(*usin)) + return -EINVAL; + if (usin->sin_family != AF_INET) { + if (usin->sin_family != AF_UNSPEC) + return -EAFNOSUPPORT; + } + + daddr = usin->sin_addr.s_addr; + dport = usin->sin_port; + if (dport == 0) + return -EINVAL; + } else { + if (sk->sk_state != TCP_ESTABLISHED) + return -EDESTADDRREQ; + daddr = inet->inet_daddr; + dport = inet->inet_dport; + /* Open fast path for connected socket. + Route will not be used, if at least one option is set. + */ + connected = 1; + } + ipc.addr = inet->inet_saddr; + + ipc.oif = sk->sk_bound_dev_if; + err = sock_tx_timestamp(sk, &ipc.tx_flags); + if (err) + return err; + if (msg->msg_controllen) { + err = ip_cmsg_send(sock_net(sk), msg, &ipc); + if (err) + return err; + if (ipc.opt) + free = 1; + connected = 0; + } + if (!ipc.opt) { + struct ip_options_rcu *inet_opt; + + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + if (inet_opt) { + memcpy(&opt_copy, inet_opt, + sizeof(*inet_opt) + inet_opt->opt.optlen); + ipc.opt = &opt_copy.opt; + } + rcu_read_unlock(); + } + + saddr = ipc.addr; + ipc.addr = faddr = daddr; + + if (ipc.opt && ipc.opt->opt.srr) { + if (!daddr) + return -EINVAL; + faddr = ipc.opt->opt.faddr; + connected = 0; + } + tos = RT_TOS(inet->tos); + if (sock_flag(sk, SOCK_LOCALROUTE) || + (msg->msg_flags & MSG_DONTROUTE) || + (ipc.opt && ipc.opt->opt.is_strictroute)) { + tos |= RTO_ONLINK; + connected = 0; + } + + if (ipv4_is_multicast(daddr)) { + if (!ipc.oif) + ipc.oif = inet->mc_index; + if (!saddr) + saddr = inet->mc_addr; + connected = 0; + } + + if (connected) + rt = (struct rtable *)sk_dst_check(sk, 0); + + if (rt == NULL) { + struct net *net = sock_net(sk); + + fl4 = &fl4_stack; + flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, + RT_SCOPE_UNIVERSE, sk->sk_protocol, + inet_sk_flowi_flags(sk)|FLOWI_FLAG_CAN_SLEEP, + faddr, saddr, dport, inet->inet_sport); + + security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); + rt = ip_route_output_flow(net, fl4, sk); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; + if (err == -ENETUNREACH) + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); + goto out; + } + + err = -EACCES; + if ((rt->rt_flags & RTCF_BROADCAST) && + !sock_flag(sk, SOCK_BROADCAST)) + goto out; + if (connected) + sk_dst_set(sk, dst_clone(&rt->dst)); + } + + if (msg->msg_flags&MSG_CONFIRM) + goto do_confirm; +back_from_confirm: + + saddr = fl4->saddr; + if (!ipc.addr) + daddr = ipc.addr = fl4->daddr; + + /* Lockless fast path for the non-corking case. */ + if (!corkreq) { + skb = ip_make_skb(sk, fl4, getfrag, msg->msg_iov, ulen, + sizeof(struct udphdr), &ipc, &rt, + msg->msg_flags); + err = PTR_ERR(skb); + if (skb && !IS_ERR(skb)) + err = udp_send_skb(skb, fl4); + goto out; + } + + lock_sock(sk); + if (unlikely(up->pending)) { + /* The socket is already corked while preparing it. */ + /* ... which is an evident application bug. --ANK */ + release_sock(sk); + + LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n"); + err = -EINVAL; + goto out; + } + /* + * Now cork the socket to pend data. 
+ */ + fl4 = &inet->cork.fl.u.ip4; + fl4->daddr = daddr; + fl4->saddr = saddr; + fl4->fl4_dport = dport; + fl4->fl4_sport = inet->inet_sport; + up->pending = AF_INET; + +do_append_data: + up->len += ulen; + err = ip_append_data(sk, fl4, getfrag, msg->msg_iov, ulen, + sizeof(struct udphdr), &ipc, &rt, + corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); + if (err) + udp_flush_pending_frames(sk); + else if (!corkreq) + err = udp_push_pending_frames(sk); + else if (unlikely(skb_queue_empty(&sk->sk_write_queue))) + up->pending = 0; + release_sock(sk); + +out: + ip_rt_put(rt); + if (free) + kfree(ipc.opt); + if (!err) + return len; + /* + * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting + * ENOBUFS might not be good (it's not tunable per se), but otherwise + * we don't have a good statistic (IpOutDiscards but it can be too many + * things). We could add another new stat but at least for now that + * seems like overkill. + */ + if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_SNDBUFERRORS, is_udplite); + } + return err; + +do_confirm: + dst_confirm(&rt->dst); + if (!(msg->msg_flags&MSG_PROBE) || len) + goto back_from_confirm; + err = 0; + goto out; +} +EXPORT_SYMBOL(udp_sendmsg); + +int udp_sendpage(struct sock *sk, struct page *page, int offset, + size_t size, int flags) +{ + struct inet_sock *inet = inet_sk(sk); + struct udp_sock *up = udp_sk(sk); + int ret; + + if (!up->pending) { + struct msghdr msg = { .msg_flags = flags|MSG_MORE }; + + /* Call udp_sendmsg to specify destination address which + * sendpage interface can't pass. + * This will succeed only when the socket is connected. + */ + ret = udp_sendmsg(NULL, sk, &msg, 0); + if (ret < 0) + return ret; + } + + lock_sock(sk); + + if (unlikely(!up->pending)) { + release_sock(sk); + + LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n"); + return -EINVAL; + } + + ret = ip_append_page(sk, &inet->cork.fl.u.ip4, + page, offset, size, flags); + if (ret == -EOPNOTSUPP) { + release_sock(sk); + return sock_no_sendpage(sk->sk_socket, page, offset, + size, flags); + } + if (ret < 0) { + udp_flush_pending_frames(sk); + goto out; + } + + up->len += size; + if (!(up->corkflag || (flags&MSG_MORE))) + ret = udp_push_pending_frames(sk); + if (!ret) + ret = size; +out: + release_sock(sk); + return ret; +} + + +/** + * first_packet_length - return length of first packet in receive queue + * @sk: socket + * + * Drops all bad checksum frames, until a valid one is found. + * Returns the length of found skb, or 0 if none is found. + */ +static unsigned int first_packet_length(struct sock *sk) +{ + struct sk_buff_head list_kill, *rcvq = &sk->sk_receive_queue; + struct sk_buff *skb; + unsigned int res; + + __skb_queue_head_init(&list_kill); + + spin_lock_bh(&rcvq->lock); + while ((skb = skb_peek(rcvq)) != NULL && + udp_lib_checksum_complete(skb)) { + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, + IS_UDPLITE(sk)); + atomic_inc(&sk->sk_drops); + __skb_unlink(skb, rcvq); + __skb_queue_tail(&list_kill, skb); + } + res = skb ? 
skb->len : 0; + spin_unlock_bh(&rcvq->lock); + + if (!skb_queue_empty(&list_kill)) { + bool slow = lock_sock_fast(sk); + + __skb_queue_purge(&list_kill); + sk_mem_reclaim_partial(sk); + unlock_sock_fast(sk, slow); + } + return res; +} + +/* + * IOCTL requests applicable to the UDP protocol + */ + +int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + switch (cmd) { + case SIOCOUTQ: + { + int amount = sk_wmem_alloc_get(sk); + + return put_user(amount, (int __user *)arg); + } + + case SIOCINQ: + { + unsigned int amount = first_packet_length(sk); + + if (amount) + /* + * We will only return the amount + * of this packet since that is all + * that will be read. + */ + amount -= sizeof(struct udphdr); + + return put_user(amount, (int __user *)arg); + } + + default: + return -ENOIOCTLCMD; + } + + return 0; +} +EXPORT_SYMBOL(udp_ioctl); + +/* + * This should be easy, if there is something there we + * return it, otherwise we block. + */ + +int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int noblock, int flags, int *addr_len) +{ + struct inet_sock *inet = inet_sk(sk); + struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + struct sk_buff *skb; + unsigned int ulen; + int peeked; + int err; + int is_udplite = IS_UDPLITE(sk); + bool slow; + + /* + * Check any passed addresses + */ + if (addr_len) + *addr_len = sizeof(*sin); + + if (flags & MSG_ERRQUEUE) + return ip_recv_error(sk, msg, len); + +try_again: + skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), + &peeked, &err); + if (!skb) + goto out; + + ulen = skb->len - sizeof(struct udphdr); + if (len > ulen) + len = ulen; + else if (len < ulen) + msg->msg_flags |= MSG_TRUNC; + + /* + * If checksum is needed at all, try to do it while copying the + * data. If the data is truncated, or if we only want a partial + * coverage checksum (UDP-Lite), do it before the copy. + */ + + if (len < ulen || UDP_SKB_CB(skb)->partial_cov) { + if (udp_lib_checksum_complete(skb)) + goto csum_copy_err; + } + + if (skb_csum_unnecessary(skb)) + err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), + msg->msg_iov, len); + else { + err = skb_copy_and_csum_datagram_iovec(skb, + sizeof(struct udphdr), + msg->msg_iov); + + if (err == -EINVAL) + goto csum_copy_err; + } + + if (err) + goto out_free; + + if (!peeked) + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_INDATAGRAMS, is_udplite); + + sock_recv_ts_and_drops(msg, sk, skb); + + /* Copy the address. */ + if (sin) { + sin->sin_family = AF_INET; + sin->sin_port = udp_hdr(skb)->source; + sin->sin_addr.s_addr = ip_hdr(skb)->saddr; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + } + if (inet->cmsg_flags) + ip_cmsg_recv(msg, skb); + + err = len; + if (flags & MSG_TRUNC) + err = ulen; + +out_free: + skb_free_datagram_locked(sk, skb); +out: + return err; + +csum_copy_err: + slow = lock_sock_fast(sk); + if (!skb_kill_datagram(sk, skb, flags)) + UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + unlock_sock_fast(sk, slow); + + if (noblock) + return -EAGAIN; + + /* starting over for a new packet */ + msg->msg_flags &= ~MSG_TRUNC; + goto try_again; +} + + +int udp_disconnect(struct sock *sk, int flags) +{ + struct inet_sock *inet = inet_sk(sk); + /* + * 1003.1g - break association. 
+ */ + + sk->sk_state = TCP_CLOSE; + inet->inet_daddr = 0; + inet->inet_dport = 0; + sock_rps_save_rxhash(sk, 0); + sk->sk_bound_dev_if = 0; + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); + + if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) { + sk->sk_prot->unhash(sk); + inet->inet_sport = 0; + } + sk_dst_reset(sk); + return 0; +} +EXPORT_SYMBOL(udp_disconnect); + +void udp_lib_unhash(struct sock *sk) +{ + if (sk_hashed(sk)) { + struct udp_table *udptable = sk->sk_prot->h.udp_table; + struct udp_hslot *hslot, *hslot2; + + hslot = udp_hashslot(udptable, sock_net(sk), + udp_sk(sk)->udp_port_hash); + hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); + + spin_lock_bh(&hslot->lock); + if (sk_nulls_del_node_init_rcu(sk)) { + hslot->count--; + inet_sk(sk)->inet_num = 0; + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + + spin_lock(&hslot2->lock); + hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); + hslot2->count--; + spin_unlock(&hslot2->lock); + } + spin_unlock_bh(&hslot->lock); + } +} +EXPORT_SYMBOL(udp_lib_unhash); + +/* + * inet_rcv_saddr was changed, we must rehash secondary hash + */ +void udp_lib_rehash(struct sock *sk, u16 newhash) +{ + if (sk_hashed(sk)) { + struct udp_table *udptable = sk->sk_prot->h.udp_table; + struct udp_hslot *hslot, *hslot2, *nhslot2; + + hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); + nhslot2 = udp_hashslot2(udptable, newhash); + udp_sk(sk)->udp_portaddr_hash = newhash; + if (hslot2 != nhslot2) { + hslot = udp_hashslot(udptable, sock_net(sk), + udp_sk(sk)->udp_port_hash); + /* we must lock primary chain too */ + spin_lock_bh(&hslot->lock); + + spin_lock(&hslot2->lock); + hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); + hslot2->count--; + spin_unlock(&hslot2->lock); + + spin_lock(&nhslot2->lock); + hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + &nhslot2->head); + nhslot2->count++; + spin_unlock(&nhslot2->lock); + + spin_unlock_bh(&hslot->lock); + } + } +} +EXPORT_SYMBOL(udp_lib_rehash); + +static void udp_v4_rehash(struct sock *sk) +{ + u16 new_hash = udp4_portaddr_hash(sock_net(sk), + inet_sk(sk)->inet_rcv_saddr, + inet_sk(sk)->inet_num); + udp_lib_rehash(sk, new_hash); +} + +static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + int rc; + + if (inet_sk(sk)->inet_daddr) + sock_rps_save_rxhash(sk, skb->rxhash); + + rc = ip_queue_rcv_skb(sk, skb); + if (rc < 0) { + int is_udplite = IS_UDPLITE(sk); + + /* Note that an ENOMEM error is charged twice */ + if (rc == -ENOMEM) + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, + is_udplite); + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + kfree_skb(skb); + return -1; + } + + return 0; + +} + +/* returns: + * -1: error + * 0: success + * >0: "udp encap" protocol resubmission + * + * Note that in the success and error cases, the skb is assumed to + * have either been requeued or freed. + */ +int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + struct udp_sock *up = udp_sk(sk); + int rc; + int is_udplite = IS_UDPLITE(sk); + + /* + * Charge it to the socket, dropping if the queue is full. + */ + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) + goto drop; + nf_reset(skb); + + if (up->encap_type) { + /* + * This is an encapsulation socket so pass the skb to + * the socket's udp_encap_rcv() hook. Otherwise, just + * fall through and pass this up the UDP socket. 
+ * up->encap_rcv() returns the following value: + * =0 if skb was successfully passed to the encap + * handler or was discarded by it. + * >0 if skb should be passed on to UDP. + * <0 if skb should be resubmitted as proto -N + */ + + /* if we're overly short, let UDP handle it */ + if (skb->len > sizeof(struct udphdr) && + up->encap_rcv != NULL) { + int ret; + + ret = (*up->encap_rcv)(sk, skb); + if (ret <= 0) { + UDP_INC_STATS_BH(sock_net(sk), + UDP_MIB_INDATAGRAMS, + is_udplite); + return -ret; + } + } + + /* FALLTHROUGH -- it's a UDP Packet */ + } + + /* + * UDP-Lite specific tests, ignored on UDP sockets + */ + if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) { + + /* + * MIB statistics other than incrementing the error count are + * disabled for the following two types of errors: these depend + * on the application settings, not on the functioning of the + * protocol stack as such. + * + * RFC 3828 here recommends (sec 3.3): "There should also be a + * way ... to ... at least let the receiving application block + * delivery of packets with coverage values less than a value + * provided by the application." + */ + if (up->pcrlen == 0) { /* full coverage was set */ + LIMIT_NETDEBUG(KERN_WARNING "UDPLITE: partial coverage " + "%d while full coverage %d requested\n", + UDP_SKB_CB(skb)->cscov, skb->len); + goto drop; + } + /* The next case involves violating the min. coverage requested + * by the receiver. This is subtle: if receiver wants x and x is + * greater than the buffersize/MTU then receiver will complain + * that it wants x while sender emits packets of smaller size y. + * Therefore the above ...()->partial_cov statement is essential. + */ + if (UDP_SKB_CB(skb)->cscov < up->pcrlen) { + LIMIT_NETDEBUG(KERN_WARNING + "UDPLITE: coverage %d too small, need min %d\n", + UDP_SKB_CB(skb)->cscov, up->pcrlen); + goto drop; + } + } + + if (rcu_dereference_raw(sk->sk_filter)) { + if (udp_lib_checksum_complete(skb)) + goto drop; + } + + + if (sk_rcvqueues_full(sk, skb)) + goto drop; + + rc = 0; + + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) + rc = __udp_queue_rcv_skb(sk, skb); + else if (sk_add_backlog(sk, skb)) { + bh_unlock_sock(sk); + goto drop; + } + bh_unlock_sock(sk); + + return rc; + +drop: + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + atomic_inc(&sk->sk_drops); + kfree_skb(skb); + return -1; +} + + +static void flush_stack(struct sock **stack, unsigned int count, + struct sk_buff *skb, unsigned int final) +{ + unsigned int i; + struct sk_buff *skb1 = NULL; + struct sock *sk; + + for (i = 0; i < count; i++) { + sk = stack[i]; + if (likely(skb1 == NULL)) + skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC); + + if (!skb1) { + atomic_inc(&sk->sk_drops); + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, + IS_UDPLITE(sk)); + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, + IS_UDPLITE(sk)); + } + + if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0) + skb1 = NULL; + } + if (unlikely(skb1)) + kfree_skb(skb1); +} + +/* + * Multicasts and broadcasts go to each listener. + * + * Note: called only from the BH handler context. 
+ */ +static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, + struct udphdr *uh, + __be32 saddr, __be32 daddr, + struct udp_table *udptable) +{ + struct sock *sk, *stack[256 / sizeof(struct sock *)]; + struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest)); + int dif; + unsigned int i, count = 0; + + spin_lock(&hslot->lock); + sk = sk_nulls_head(&hslot->head); + dif = skb->dev->ifindex; + sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); + while (sk) { + stack[count++] = sk; + sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest, + daddr, uh->source, saddr, dif); + if (unlikely(count == ARRAY_SIZE(stack))) { + if (!sk) + break; + flush_stack(stack, count, skb, ~0); + count = 0; + } + } + /* + * before releasing chain lock, we must take a reference on sockets + */ + for (i = 0; i < count; i++) + sock_hold(stack[i]); + + spin_unlock(&hslot->lock); + + /* + * do the slow work with no lock held + */ + if (count) { + flush_stack(stack, count, skb, count - 1); + + for (i = 0; i < count; i++) + sock_put(stack[i]); + } else { + kfree_skb(skb); + } + return 0; +} + +/* Initialize UDP checksum. If exited with zero value (success), + * CHECKSUM_UNNECESSARY means, that no more checks are required. + * Otherwise, csum completion requires chacksumming packet body, + * including udp header and folding it to skb->csum. + */ +static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, + int proto) +{ + const struct iphdr *iph; + int err; + + UDP_SKB_CB(skb)->partial_cov = 0; + UDP_SKB_CB(skb)->cscov = skb->len; + + if (proto == IPPROTO_UDPLITE) { + err = udplite_checksum_init(skb, uh); + if (err) + return err; + } + + iph = ip_hdr(skb); + if (uh->check == 0) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else if (skb->ip_summed == CHECKSUM_COMPLETE) { + if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, + proto, skb->csum)) + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + if (!skb_csum_unnecessary(skb)) + skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, + skb->len, proto, 0); + /* Probably, we should checksum udp header (it should be in cache + * in any case) and data in tiny packets (< rx copybreak). + */ + + return 0; +} + +/* + * All we need to do is get the socket, and then do a checksum. + */ + +int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, + int proto) +{ + struct sock *sk; + struct udphdr *uh; + unsigned short ulen; + struct rtable *rt = skb_rtable(skb); + __be32 saddr, daddr; + struct net *net = dev_net(skb->dev); + + /* + * Validate the packet. + */ + if (!pskb_may_pull(skb, sizeof(struct udphdr))) + goto drop; /* No space for header. */ + + uh = udp_hdr(skb); + ulen = ntohs(uh->len); + saddr = ip_hdr(skb)->saddr; + daddr = ip_hdr(skb)->daddr; + + if (ulen > skb->len) + goto short_packet; + + if (proto == IPPROTO_UDP) { + /* UDP validates ulen. 
*/ + if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen)) + goto short_packet; + uh = udp_hdr(skb); + } + + if (udp4_csum_init(skb, uh, proto)) + goto csum_error; + + if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) + return __udp4_lib_mcast_deliver(net, skb, uh, + saddr, daddr, udptable); + + sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); + + if (sk != NULL) { + int ret = udp_queue_rcv_skb(sk, skb); + sock_put(sk); + + /* a return value > 0 means to resubmit the input, but + * it wants the return to be -protocol, or 0 + */ + if (ret > 0) + return -ret; + return 0; + } + + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto drop; + nf_reset(skb); + + /* No socket. Drop packet silently, if checksum is wrong */ + if (udp_lib_checksum_complete(skb)) + goto csum_error; + + UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + /* + * Hmm. We got an UDP packet to a port to which we + * don't wanna listen. Ignore it. + */ + kfree_skb(skb); + return 0; + +short_packet: + LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n", + proto == IPPROTO_UDPLITE ? "-Lite" : "", + &saddr, + ntohs(uh->source), + ulen, + skb->len, + &daddr, + ntohs(uh->dest)); + goto drop; + +csum_error: + /* + * RFC1122: OK. Discards the bad packet silently (as far as + * the network is concerned, anyway) as per 4.1.3.4 (MUST). + */ + LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n", + proto == IPPROTO_UDPLITE ? "-Lite" : "", + &saddr, + ntohs(uh->source), + &daddr, + ntohs(uh->dest), + ulen); +drop: + UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); + kfree_skb(skb); + return 0; +} + +int udp_rcv(struct sk_buff *skb) +{ + return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP); +} + +void udp_destroy_sock(struct sock *sk) +{ + bool slow = lock_sock_fast(sk); + udp_flush_pending_frames(sk); + unlock_sock_fast(sk, slow); +} + +/* + * Socket option code for UDP + */ +int udp_lib_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen, + int (*push_pending_frames)(struct sock *)) +{ + struct udp_sock *up = udp_sk(sk); + int val; + int err = 0; + int is_udplite = IS_UDPLITE(sk); + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + switch (optname) { + case UDP_CORK: + if (val != 0) { + up->corkflag = 1; + } else { + up->corkflag = 0; + lock_sock(sk); + (*push_pending_frames)(sk); + release_sock(sk); + } + break; + + case UDP_ENCAP: + switch (val) { + case 0: + case UDP_ENCAP_ESPINUDP: + case UDP_ENCAP_ESPINUDP_NON_IKE: + up->encap_rcv = xfrm4_udp_encap_rcv; + /* FALLTHROUGH */ + case UDP_ENCAP_L2TPINUDP: + up->encap_type = val; + break; + default: + err = -ENOPROTOOPT; + break; + } + break; + + /* + * UDP-Lite's partial checksum coverage (RFC 3828). + */ + /* The sender sets actual checksum coverage length via this option. + * The case coverage > packet length is handled by send module. */ + case UDPLITE_SEND_CSCOV: + if (!is_udplite) /* Disable the option on UDP sockets */ + return -ENOPROTOOPT; + if (val != 0 && val < 8) /* Illegal coverage: use default (8) */ + val = 8; + else if (val > USHRT_MAX) + val = USHRT_MAX; + up->pcslen = val; + up->pcflag |= UDPLITE_SEND_CC; + break; + + /* The receiver specifies a minimum checksum coverage value. To make + * sense, this should be set to at least 8 (as done below). 
If zero is + * used, this again means full checksum coverage. */ + case UDPLITE_RECV_CSCOV: + if (!is_udplite) /* Disable the option on UDP sockets */ + return -ENOPROTOOPT; + if (val != 0 && val < 8) /* Avoid silly minimal values. */ + val = 8; + else if (val > USHRT_MAX) + val = USHRT_MAX; + up->pcrlen = val; + up->pcflag |= UDPLITE_RECV_CC; + break; + + default: + err = -ENOPROTOOPT; + break; + } + + return err; +} +EXPORT_SYMBOL(udp_lib_setsockopt); + +int udp_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + if (level == SOL_UDP || level == SOL_UDPLITE) + return udp_lib_setsockopt(sk, level, optname, optval, optlen, + udp_push_pending_frames); + return ip_setsockopt(sk, level, optname, optval, optlen); +} + +#ifdef CONFIG_COMPAT +int compat_udp_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + if (level == SOL_UDP || level == SOL_UDPLITE) + return udp_lib_setsockopt(sk, level, optname, optval, optlen, + udp_push_pending_frames); + return compat_ip_setsockopt(sk, level, optname, optval, optlen); +} +#endif + +int udp_lib_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct udp_sock *up = udp_sk(sk); + int val, len; + + if (get_user(len, optlen)) + return -EFAULT; + + len = min_t(unsigned int, len, sizeof(int)); + + if (len < 0) + return -EINVAL; + + switch (optname) { + case UDP_CORK: + val = up->corkflag; + break; + + case UDP_ENCAP: + val = up->encap_type; + break; + + /* The following two cannot be changed on UDP sockets, the return is + * always 0 (which corresponds to the full checksum coverage of UDP). */ + case UDPLITE_SEND_CSCOV: + val = up->pcslen; + break; + + case UDPLITE_RECV_CSCOV: + val = up->pcrlen; + break; + + default: + return -ENOPROTOOPT; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL(udp_lib_getsockopt); + +int udp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + if (level == SOL_UDP || level == SOL_UDPLITE) + return udp_lib_getsockopt(sk, level, optname, optval, optlen); + return ip_getsockopt(sk, level, optname, optval, optlen); +} + +#ifdef CONFIG_COMPAT +int compat_udp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + if (level == SOL_UDP || level == SOL_UDPLITE) + return udp_lib_getsockopt(sk, level, optname, optval, optlen); + return compat_ip_getsockopt(sk, level, optname, optval, optlen); +} +#endif +/** + * udp_poll - wait for a UDP event. + * @file - file struct + * @sock - socket + * @wait - poll table + * + * This is same as datagram poll, except for the special case of + * blocking sockets. If application is using a blocking fd + * and a packet with checksum error is in the queue; + * then it could get return from select indicating data available + * but then block when reading it. Add special case code + * to work around these arguably broken applications. 
+ */ +unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) +{ + unsigned int mask = datagram_poll(file, sock, wait); + struct sock *sk = sock->sk; + + /* Check for false positives due to checksum errors */ + if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) && + !(sk->sk_shutdown & RCV_SHUTDOWN) && !first_packet_length(sk)) + mask &= ~(POLLIN | POLLRDNORM); + + return mask; + +} +EXPORT_SYMBOL(udp_poll); + +struct proto udp_prot = { + .name = "UDP", + .owner = THIS_MODULE, + .close = udp_lib_close, + .connect = ip4_datagram_connect, + .disconnect = udp_disconnect, + .ioctl = udp_ioctl, + .destroy = udp_destroy_sock, + .setsockopt = udp_setsockopt, + .getsockopt = udp_getsockopt, + .sendmsg = udp_sendmsg, + .recvmsg = udp_recvmsg, + .sendpage = udp_sendpage, + .backlog_rcv = __udp_queue_rcv_skb, + .hash = udp_lib_hash, + .unhash = udp_lib_unhash, + .rehash = udp_v4_rehash, + .get_port = udp_v4_get_port, + .memory_allocated = &udp_memory_allocated, + .sysctl_mem = sysctl_udp_mem, + .sysctl_wmem = &sysctl_udp_wmem_min, + .sysctl_rmem = &sysctl_udp_rmem_min, + .obj_size = sizeof(struct udp_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, + .h.udp_table = &udp_table, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_udp_setsockopt, + .compat_getsockopt = compat_udp_getsockopt, +#endif + .clear_sk = sk_prot_clear_portaddr_nulls, +}; +EXPORT_SYMBOL(udp_prot); + +/* ------------------------------------------------------------------------ */ +#ifdef CONFIG_PROC_FS + +static struct sock *udp_get_first(struct seq_file *seq, int start) +{ + struct sock *sk; + struct udp_iter_state *state = seq->private; + struct net *net = seq_file_net(seq); + + for (state->bucket = start; state->bucket <= state->udp_table->mask; + ++state->bucket) { + struct hlist_nulls_node *node; + struct udp_hslot *hslot = &state->udp_table->hash[state->bucket]; + + if (hlist_nulls_empty(&hslot->head)) + continue; + + spin_lock_bh(&hslot->lock); + sk_nulls_for_each(sk, node, &hslot->head) { + if (!net_eq(sock_net(sk), net)) + continue; + if (sk->sk_family == state->family) + goto found; + } + spin_unlock_bh(&hslot->lock); + } + sk = NULL; +found: + return sk; +} + +static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) +{ + struct udp_iter_state *state = seq->private; + struct net *net = seq_file_net(seq); + + do { + sk = sk_nulls_next(sk); + } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); + + if (!sk) { + if (state->bucket <= state->udp_table->mask) + spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); + return udp_get_first(seq, state->bucket + 1); + } + return sk; +} + +static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos) +{ + struct sock *sk = udp_get_first(seq, 0); + + if (sk) + while (pos && (sk = udp_get_next(seq, sk)) != NULL) + --pos; + return pos ? NULL : sk; +} + +static void *udp_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct udp_iter_state *state = seq->private; + state->bucket = MAX_UDP_PORTS; + + return *pos ? 
udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN; +} + +static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock *sk; + + if (v == SEQ_START_TOKEN) + sk = udp_get_idx(seq, 0); + else + sk = udp_get_next(seq, v); + + ++*pos; + return sk; +} + +static void udp_seq_stop(struct seq_file *seq, void *v) +{ + struct udp_iter_state *state = seq->private; + + if (state->bucket <= state->udp_table->mask) + spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); +} + +static int udp_seq_open(struct inode *inode, struct file *file) +{ + struct udp_seq_afinfo *afinfo = PDE(inode)->data; + struct udp_iter_state *s; + int err; + + err = seq_open_net(inode, file, &afinfo->seq_ops, + sizeof(struct udp_iter_state)); + if (err < 0) + return err; + + s = ((struct seq_file *)file->private_data)->private; + s->family = afinfo->family; + s->udp_table = afinfo->udp_table; + return err; +} + +/* ------------------------------------------------------------------------ */ +int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo) +{ + struct proc_dir_entry *p; + int rc = 0; + + afinfo->seq_fops.open = udp_seq_open; + afinfo->seq_fops.read = seq_read; + afinfo->seq_fops.llseek = seq_lseek; + afinfo->seq_fops.release = seq_release_net; + + afinfo->seq_ops.start = udp_seq_start; + afinfo->seq_ops.next = udp_seq_next; + afinfo->seq_ops.stop = udp_seq_stop; + + p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, + &afinfo->seq_fops, afinfo); + if (!p) + rc = -ENOMEM; + return rc; +} +EXPORT_SYMBOL(udp_proc_register); + +void udp_proc_unregister(struct net *net, struct udp_seq_afinfo *afinfo) +{ + proc_net_remove(net, afinfo->name); +} +EXPORT_SYMBOL(udp_proc_unregister); + +/* ------------------------------------------------------------------------ */ +static void udp4_format_sock(struct sock *sp, struct seq_file *f, + int bucket, int *len) +{ + struct inet_sock *inet = inet_sk(sp); + __be32 dest = inet->inet_daddr; + __be32 src = inet->inet_rcv_saddr; + __u16 destp = ntohs(inet->inet_dport); + __u16 srcp = ntohs(inet->inet_sport); + + seq_printf(f, "%5d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n", + bucket, src, srcp, dest, destp, sp->sk_state, + sk_wmem_alloc_get(sp), + sk_rmem_alloc_get(sp), + 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp, + atomic_read(&sp->sk_drops), len); +} + +int udp4_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, "%-127s\n", + " sl local_address rem_address st tx_queue " + "rx_queue tr tm->when retrnsmt uid timeout " + "inode ref pointer drops"); + else { + struct udp_iter_state *state = seq->private; + int len; + + udp4_format_sock(v, seq, state->bucket, &len); + seq_printf(seq, "%*s\n", 127 - len, ""); + } + return 0; +} + +/* ------------------------------------------------------------------------ */ +static struct udp_seq_afinfo udp4_seq_afinfo = { + .name = "udp", + .family = AF_INET, + .udp_table = &udp_table, + .seq_fops = { + .owner = THIS_MODULE, + }, + .seq_ops = { + .show = udp4_seq_show, + }, +}; + +static int __net_init udp4_proc_init_net(struct net *net) +{ + return udp_proc_register(net, &udp4_seq_afinfo); +} + +static void __net_exit udp4_proc_exit_net(struct net *net) +{ + udp_proc_unregister(net, &udp4_seq_afinfo); +} + +static struct pernet_operations udp4_net_ops = { + .init = udp4_proc_init_net, + .exit = udp4_proc_exit_net, +}; + +int __init udp4_proc_init(void) +{ + return register_pernet_subsys(&udp4_net_ops); +} 
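The checksum fixups in udp_send_skb(), udp4_hwcsum() and udp4_csum_init() above all come down to the standard Internet checksum computed over the IPv4 pseudo-header, the UDP header and the payload, with a result of zero transmitted as 0xFFFF (CSUM_MANGLED_0) because an all-zero checksum field means "checksum disabled" for UDP over IPv4. The following self-contained userspace sketch is not part of the patch; it only illustrates that arithmetic, and the addresses, ports and payload in main() are made-up values.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Fold a 32-bit running sum into a 16-bit one's-complement sum. */
static uint16_t csum_fold32(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* Accumulate 'len' bytes as big-endian 16-bit words, padding an odd tail. */
static uint32_t sum_bytes(const uint8_t *p, size_t len, uint32_t sum)
{
	while (len > 1) {
		sum += ((uint32_t)p[0] << 8) | p[1];
		p += 2;
		len -= 2;
	}
	if (len)
		sum += (uint32_t)p[0] << 8;
	return sum;
}

/* UDP checksum over the IPv4 pseudo-header, UDP header and payload.
 * Addresses and ports are passed in host byte order for simplicity. */
static uint16_t udp4_checksum(uint32_t saddr, uint32_t daddr,
			      uint16_t sport, uint16_t dport,
			      const uint8_t *payload, uint16_t paylen)
{
	uint16_t udplen = 8 + paylen;	/* UDP header + data */
	uint32_t sum = 0;
	uint16_t check;

	/* pseudo-header: saddr, daddr, zero byte, protocol 17, UDP length */
	sum += (saddr >> 16) + (saddr & 0xffff);
	sum += (daddr >> 16) + (daddr & 0xffff);
	sum += 17;
	sum += udplen;

	/* UDP header, with the checksum field counted as zero */
	sum += sport;
	sum += dport;
	sum += udplen;

	sum = sum_bytes(payload, paylen, sum);

	check = (uint16_t)~csum_fold32(sum);
	/* zero means "no checksum" on the wire, so send 0xFFFF instead,
	 * mirroring the CSUM_MANGLED_0 substitution above */
	return check ? check : 0xffff;
}

int main(void)
{
	const uint8_t payload[] = { 'h', 'i' };

	printf("udp check: 0x%04x\n",
	       udp4_checksum(0x7f000001, 0x7f000001, 40000, 53, payload, 2));
	return 0;
}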
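The up->pending / do_append_data path in udp_sendmsg() above, together with the UDP_CORK case in udp_lib_setsockopt(), is what an application exercises when it corks a UDP socket: while corked, each send() only appends via ip_append_data(), and uncorking pushes everything out as one datagram through udp_push_pending_frames(). A minimal userspace sketch of that usage follows; it is not part of the patch, and the receiver at 127.0.0.1:9999 is hypothetical.

#include <stdio.h>
#include <unistd.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>

#ifndef UDP_CORK
#define UDP_CORK 1		/* socket option value from include/linux/udp.h */
#endif

int main(void)
{
	int one = 1, zero = 0;
	struct sockaddr_in dst = { 0 };
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	dst.sin_family = AF_INET;
	dst.sin_port = htons(9999);		/* hypothetical receiver */
	inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr);
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
		perror("connect");
		return 1;
	}

	/* cork: the two send()s below are appended to one pending datagram */
	setsockopt(fd, IPPROTO_UDP, UDP_CORK, &one, sizeof(one));
	send(fd, "hello ", 6, 0);
	send(fd, "world", 5, 0);

	/* uncork: udp_lib_setsockopt() pushes the pending frames, so exactly
	 * one 11-byte UDP datagram leaves the socket */
	setsockopt(fd, IPPROTO_UDP, UDP_CORK, &zero, sizeof(zero));

	close(fd);
	return 0;
}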
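The UDPLITE_SEND_CSCOV and UDPLITE_RECV_CSCOV cases above implement the partial checksum coverage of RFC 3828: the sender chooses how many bytes of each datagram the checksum covers, the receiver sets the minimum coverage it will accept, values 1-7 are rounded up to 8, and 0 keeps full coverage. A userspace sketch of setting both options on a UDP-Lite socket follows; it is not part of the patch, the option values are copied from include/linux/udp.h in case the libc headers do not provide them, and the coverage numbers are arbitrary.

#include <stdio.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>

#ifndef IPPROTO_UDPLITE
#define IPPROTO_UDPLITE 136
#endif
#ifndef UDPLITE_SEND_CSCOV
#define UDPLITE_SEND_CSCOV 10	/* from include/linux/udp.h */
#define UDPLITE_RECV_CSCOV 11
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDPLITE);
	int send_cov = 20;	/* checksum covers the first 20 bytes sent */
	int recv_cov = 12;	/* require at least 12 covered bytes on receive */

	if (fd < 0) {
		perror("socket(IPPROTO_UDPLITE)");
		return 1;
	}
	if (setsockopt(fd, IPPROTO_UDPLITE, UDPLITE_SEND_CSCOV,
		       &send_cov, sizeof(send_cov)) < 0)
		perror("UDPLITE_SEND_CSCOV");
	if (setsockopt(fd, IPPROTO_UDPLITE, UDPLITE_RECV_CSCOV,
		       &recv_cov, sizeof(recv_cov)) < 0)
		perror("UDPLITE_RECV_CSCOV");

	close(fd);
	return 0;
}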
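udp4_format_sock() and udp4_seq_show() above fix the text layout of /proc/net/udp: one header line, then a "%5d: %08X:%04X %08X:%04X ..." line per socket. The short userspace sketch below is not part of the patch; it reads that file and prints each socket's local address and port. Because the kernel prints the raw __be32 address through %08X, parsing the hex back into an unsigned int and storing it unchanged in s_addr on the same machine restores the original byte order.

#include <stdio.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
	FILE *f = fopen("/proc/net/udp", "r");
	char line[512];
	unsigned int laddr, lport;

	if (!f) {
		perror("/proc/net/udp");
		return 1;
	}
	/* skip the header line emitted for SEQ_START_TOKEN */
	if (!fgets(line, sizeof(line), f)) {
		fclose(f);
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* per-socket line: bucket, then %08X local addr, %04X port */
		if (sscanf(line, "%*d: %8X:%4X", &laddr, &lport) == 2) {
			struct in_addr a = { .s_addr = laddr };

			printf("%s:%u\n", inet_ntoa(a), lport);
		}
	}
	fclose(f);
	return 0;
}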
+ +void udp4_proc_exit(void) +{ + unregister_pernet_subsys(&udp4_net_ops); +} +#endif /* CONFIG_PROC_FS */ + +static __initdata unsigned long uhash_entries; +static int __init set_uhash_entries(char *str) +{ + if (!str) + return 0; + uhash_entries = simple_strtoul(str, &str, 0); + if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN) + uhash_entries = UDP_HTABLE_SIZE_MIN; + return 1; +} +__setup("uhash_entries=", set_uhash_entries); + +void __init udp_table_init(struct udp_table *table, const char *name) +{ + unsigned int i; + + if (!CONFIG_BASE_SMALL) + table->hash = alloc_large_system_hash(name, + 2 * sizeof(struct udp_hslot), + uhash_entries, + 21, /* one slot per 2 MB */ + 0, + &table->log, + &table->mask, + 64 * 1024); + /* + * Make sure hash table has the minimum size + */ + if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) { + table->hash = kmalloc(UDP_HTABLE_SIZE_MIN * + 2 * sizeof(struct udp_hslot), GFP_KERNEL); + if (!table->hash) + panic(name); + table->log = ilog2(UDP_HTABLE_SIZE_MIN); + table->mask = UDP_HTABLE_SIZE_MIN - 1; + } + table->hash2 = table->hash + (table->mask + 1); + for (i = 0; i <= table->mask; i++) { + INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); + table->hash[i].count = 0; + spin_lock_init(&table->hash[i].lock); + } + for (i = 0; i <= table->mask; i++) { + INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i); + table->hash2[i].count = 0; + spin_lock_init(&table->hash2[i].lock); + } +} + +void __init udp_init(void) +{ + unsigned long limit; + + udp_table_init(&udp_table, "UDP"); + limit = nr_free_buffer_pages() / 8; + limit = max(limit, 128UL); + sysctl_udp_mem[0] = limit / 4 * 3; + sysctl_udp_mem[1] = limit; + sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2; + + sysctl_udp_rmem_min = SK_MEM_QUANTUM; + sysctl_udp_wmem_min = SK_MEM_QUANTUM; +} + +int udp4_ufo_send_check(struct sk_buff *skb) +{ + const struct iphdr *iph; + struct udphdr *uh; + + if (!pskb_may_pull(skb, sizeof(*uh))) + return -EINVAL; + + iph = ip_hdr(skb); + uh = udp_hdr(skb); + + uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, + IPPROTO_UDP, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + return 0; +} + +struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + unsigned int mss; + int offset; + __wsum csum; + + mss = skb_shinfo(skb)->gso_size; + if (unlikely(skb->len <= mss)) + goto out; + + if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { + /* Packet is from an untrusted source, reset gso_segs. */ + int type = skb_shinfo(skb)->gso_type; + + if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) || + !(type & (SKB_GSO_UDP)))) + goto out; + + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); + + segs = NULL; + goto out; + } + + /* Do software UFO. Complete and fill in the UDP checksum as HW cannot + * do checksum of UDP packets sent as multiple IP fragments. + */ + offset = skb_checksum_start_offset(skb); + csum = skb_checksum(skb, offset, skb->len - offset, 0); + offset += skb->csum_offset; + *(__sum16 *)(skb->data + offset) = csum_fold(csum); + skb->ip_summed = CHECKSUM_NONE; + + /* Fragment the skb. 
IP headers of the fragments are updated in + * inet_gso_segment() + */ + segs = skb_segment(skb, features); +out: + return segs; +} + diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h new file mode 100644 index 00000000..aaad650d --- /dev/null +++ b/net/ipv4/udp_impl.h @@ -0,0 +1,34 @@ +#ifndef _UDP4_IMPL_H +#define _UDP4_IMPL_H +#include +#include +#include +#include + +extern int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int ); +extern void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); + +extern int udp_v4_get_port(struct sock *sk, unsigned short snum); + +extern int udp_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen); +extern int udp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen); + +#ifdef CONFIG_COMPAT +extern int compat_udp_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen); +extern int compat_udp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen); +#endif +extern int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int noblock, int flags, int *addr_len); +extern int udp_sendpage(struct sock *sk, struct page *page, int offset, + size_t size, int flags); +extern int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb); +extern void udp_destroy_sock(struct sock *sk); + +#ifdef CONFIG_PROC_FS +extern int udp4_seq_show(struct seq_file *seq, void *v); +#endif +#endif /* _UDP4_IMPL_H */ diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c new file mode 100644 index 00000000..aee9963f --- /dev/null +++ b/net/ipv4/udplite.c @@ -0,0 +1,131 @@ +/* + * UDPLITE An implementation of the UDP-Lite protocol (RFC 3828). + * + * Authors: Gerrit Renker + * + * Changes: + * Fixes: + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include "udp_impl.h" + +struct udp_table udplite_table __read_mostly; +EXPORT_SYMBOL(udplite_table); + +static int udplite_rcv(struct sk_buff *skb) +{ + return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); +} + +static void udplite_err(struct sk_buff *skb, u32 info) +{ + __udp4_lib_err(skb, info, &udplite_table); +} + +static const struct net_protocol udplite_protocol = { + .handler = udplite_rcv, + .err_handler = udplite_err, + .no_policy = 1, + .netns_ok = 1, +}; + +struct proto udplite_prot = { + .name = "UDP-Lite", + .owner = THIS_MODULE, + .close = udp_lib_close, + .connect = ip4_datagram_connect, + .disconnect = udp_disconnect, + .ioctl = udp_ioctl, + .init = udplite_sk_init, + .destroy = udp_destroy_sock, + .setsockopt = udp_setsockopt, + .getsockopt = udp_getsockopt, + .sendmsg = udp_sendmsg, + .recvmsg = udp_recvmsg, + .sendpage = udp_sendpage, + .backlog_rcv = udp_queue_rcv_skb, + .hash = udp_lib_hash, + .unhash = udp_lib_unhash, + .get_port = udp_v4_get_port, + .obj_size = sizeof(struct udp_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, + .h.udp_table = &udplite_table, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_udp_setsockopt, + .compat_getsockopt = compat_udp_getsockopt, +#endif + .clear_sk = sk_prot_clear_portaddr_nulls, +}; +EXPORT_SYMBOL(udplite_prot); + +static struct inet_protosw udplite4_protosw = { + .type = SOCK_DGRAM, + .protocol = IPPROTO_UDPLITE, + .prot = &udplite_prot, + .ops = &inet_dgram_ops, + .no_check = 0, /* must checksum (RFC 3828) */ + .flags = INET_PROTOSW_PERMANENT, +}; + +#ifdef CONFIG_PROC_FS +static struct udp_seq_afinfo udplite4_seq_afinfo = { + .name = "udplite", + .family = AF_INET, + .udp_table = &udplite_table, + .seq_fops = { + .owner = THIS_MODULE, + }, + .seq_ops = { + .show = udp4_seq_show, + }, +}; + +static int __net_init udplite4_proc_init_net(struct net *net) +{ + return udp_proc_register(net, &udplite4_seq_afinfo); +} + +static void __net_exit udplite4_proc_exit_net(struct net *net) +{ + udp_proc_unregister(net, &udplite4_seq_afinfo); +} + +static struct pernet_operations udplite4_net_ops = { + .init = udplite4_proc_init_net, + .exit = udplite4_proc_exit_net, +}; + +static __init int udplite4_proc_init(void) +{ + return register_pernet_subsys(&udplite4_net_ops); +} +#else +static inline int udplite4_proc_init(void) +{ + return 0; +} +#endif + +void __init udplite4_register(void) +{ + udp_table_init(&udplite_table, "UDP-Lite"); + if (proto_register(&udplite_prot, 1)) + goto out_register_err; + + if (inet_add_protocol(&udplite_protocol, IPPROTO_UDPLITE) < 0) + goto out_unregister_proto; + + inet_register_protosw(&udplite4_protosw); + + if (udplite4_proc_init()) + printk(KERN_ERR "%s: Cannot register /proc!\n", __func__); + return; + +out_unregister_proto: + proto_unregister(&udplite_prot); +out_register_err: + printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__); +} diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c new file mode 100644 index 00000000..06814b62 --- /dev/null +++ b/net/ipv4/xfrm4_input.c @@ -0,0 +1,166 @@ +/* + * xfrm4_input.c + * + * Changes: + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific portion + * Derek Atkins + * Add Encapsulation support + * + */ + +#include +#include +#include +#include +#include +#include +#include + +int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb) +{ + return xfrm4_extract_header(skb); +} + +static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb) +{ + if (skb_dst(skb) == NULL) { + const struct iphdr *iph = ip_hdr(skb); + + 
if (ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev)) + goto drop; + } + return dst_input(skb); +drop: + kfree_skb(skb); + return NET_RX_DROP; +} + +int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type) +{ + XFRM_SPI_SKB_CB(skb)->family = AF_INET; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + return xfrm_input(skb, nexthdr, spi, encap_type); +} +EXPORT_SYMBOL(xfrm4_rcv_encap); + +int xfrm4_transport_finish(struct sk_buff *skb, int async) +{ + struct iphdr *iph = ip_hdr(skb); + + iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol; + +#ifndef CONFIG_NETFILTER + if (!async) + return -iph->protocol; +#endif + + __skb_push(skb, skb->data - skb_network_header(skb)); + iph->tot_len = htons(skb->len); + ip_send_check(iph); + + NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, + xfrm4_rcv_encap_finish); + return 0; +} + +/* If it's a keepalive packet, then just eat it. + * If it's an encapsulated packet, then pass it to the + * IPsec xfrm input. + * Returns 0 if skb passed to xfrm or was dropped. + * Returns >0 if skb should be passed to UDP. + * Returns <0 if skb should be resubmitted (-ret is protocol) + */ +int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct udp_sock *up = udp_sk(sk); + struct udphdr *uh; + struct iphdr *iph; + int iphlen, len; + + __u8 *udpdata; + __be32 *udpdata32; + __u16 encap_type = up->encap_type; + + /* if this is not encapsulated socket, then just return now */ + if (!encap_type) + return 1; + + /* If this is a paged skb, make sure we pull up + * whatever data we need to look at. */ + len = skb->len - sizeof(struct udphdr); + if (!pskb_may_pull(skb, sizeof(struct udphdr) + min(len, 8))) + return 1; + + /* Now we can get the pointers */ + uh = udp_hdr(skb); + udpdata = (__u8 *)uh + sizeof(struct udphdr); + udpdata32 = (__be32 *)udpdata; + + switch (encap_type) { + default: + case UDP_ENCAP_ESPINUDP: + /* Check if this is a keepalive packet. If so, eat it. */ + if (len == 1 && udpdata[0] == 0xff) { + goto drop; + } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) { + /* ESP Packet without Non-ESP header */ + len = sizeof(struct udphdr); + } else + /* Must be an IKE packet.. pass it through */ + return 1; + break; + case UDP_ENCAP_ESPINUDP_NON_IKE: + /* Check if this is a keepalive packet. If so, eat it. */ + if (len == 1 && udpdata[0] == 0xff) { + goto drop; + } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) && + udpdata32[0] == 0 && udpdata32[1] == 0) { + + /* ESP Packet with Non-IKE marker */ + len = sizeof(struct udphdr) + 2 * sizeof(u32); + } else + /* Must be an IKE packet.. pass it through */ + return 1; + break; + } + + /* At this point we are sure that this is an ESPinUDP packet, + * so we need to remove 'len' bytes from the packet (the UDP + * header and optional ESP marker bytes) and then modify the + * protocol to ESP, and then call into the transform receiver. + */ + if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto drop; + + /* Now we can update and verify the packet length... */ + iph = ip_hdr(skb); + iphlen = iph->ihl << 2; + iph->tot_len = htons(ntohs(iph->tot_len) - len); + if (skb->len < iphlen + len) { + /* packet is too small!?! */ + goto drop; + } + + /* pull the data buffer up to the ESP header and set the + * transport header to point to ESP. Keep UDP on the stack + * for later. 
+ */ + __skb_pull(skb, len); + skb_reset_transport_header(skb); + + /* process ESP */ + return xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, encap_type); + +drop: + kfree_skb(skb); + return 0; +} + +int xfrm4_rcv(struct sk_buff *skb) +{ + return xfrm4_rcv_spi(skb, ip_hdr(skb)->protocol, 0); +} +EXPORT_SYMBOL(xfrm4_rcv); diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c new file mode 100644 index 00000000..e3db3f91 --- /dev/null +++ b/net/ipv4/xfrm4_mode_beet.c @@ -0,0 +1,156 @@ +/* + * xfrm4_mode_beet.c - BEET mode encapsulation for IPv4. + * + * Copyright (c) 2006 Diego Beltrami + * Miika Komu + * Herbert Xu + * Abhinav Pathak + * Jeff Ahrenholz + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static void xfrm4_beet_make_header(struct sk_buff *skb) +{ + struct iphdr *iph = ip_hdr(skb); + + iph->ihl = 5; + iph->version = 4; + + iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol; + iph->tos = XFRM_MODE_SKB_CB(skb)->tos; + + iph->id = XFRM_MODE_SKB_CB(skb)->id; + iph->frag_off = XFRM_MODE_SKB_CB(skb)->frag_off; + iph->ttl = XFRM_MODE_SKB_CB(skb)->ttl; +} + +/* Add encapsulation header. + * + * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt. + */ +static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ip_beet_phdr *ph; + struct iphdr *top_iph; + int hdrlen, optlen; + + hdrlen = 0; + optlen = XFRM_MODE_SKB_CB(skb)->optlen; + if (unlikely(optlen)) + hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4); + + skb_set_network_header(skb, -x->props.header_len - + hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph))); + if (x->sel.family != AF_INET6) + skb->network_header += IPV4_BEET_PHMAXLEN; + skb->mac_header = skb->network_header + + offsetof(struct iphdr, protocol); + skb->transport_header = skb->network_header + sizeof(*top_iph); + + xfrm4_beet_make_header(skb); + + ph = (struct ip_beet_phdr *) + __skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl - hdrlen); + + top_iph = ip_hdr(skb); + + if (unlikely(optlen)) { + BUG_ON(optlen < 0); + + ph->padlen = 4 - (optlen & 4); + ph->hdrlen = optlen / 8; + ph->nexthdr = top_iph->protocol; + if (ph->padlen) + memset(ph + 1, IPOPT_NOP, ph->padlen); + + top_iph->protocol = IPPROTO_BEETPH; + top_iph->ihl = sizeof(struct iphdr) / 4; + } + + top_iph->saddr = x->props.saddr.a4; + top_iph->daddr = x->id.daddr.a4; + + return 0; +} + +static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb) +{ + struct iphdr *iph; + int optlen = 0; + int err = -EINVAL; + + if (unlikely(XFRM_MODE_SKB_CB(skb)->protocol == IPPROTO_BEETPH)) { + struct ip_beet_phdr *ph; + int phlen; + + if (!pskb_may_pull(skb, sizeof(*ph))) + goto out; + + ph = (struct ip_beet_phdr *)skb->data; + + phlen = sizeof(*ph) + ph->padlen; + optlen = ph->hdrlen * 8 + (IPV4_BEET_PHMAXLEN - phlen); + if (optlen < 0 || optlen & 3 || optlen > 250) + goto out; + + XFRM_MODE_SKB_CB(skb)->protocol = ph->nexthdr; + + if (!pskb_may_pull(skb, phlen)) + goto out; + __skb_pull(skb, phlen); + } + + skb_push(skb, sizeof(*iph)); + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + + xfrm4_beet_make_header(skb); + + iph = ip_hdr(skb); + + iph->ihl += optlen / 4; + iph->tot_len = htons(skb->len); + iph->daddr = x->sel.daddr.a4; + iph->saddr = x->sel.saddr.a4; + iph->check = 0; + iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); + err = 0; +out: + return err; +} + +static struct xfrm_mode xfrm4_beet_mode = { + .input2 = xfrm4_beet_input, + .input = xfrm_prepare_input, + .output2 = 
xfrm4_beet_output, + .output = xfrm4_prepare_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_BEET, + .flags = XFRM_MODE_FLAG_TUNNEL, +}; + +static int __init xfrm4_beet_init(void) +{ + return xfrm_register_mode(&xfrm4_beet_mode, AF_INET); +} + +static void __exit xfrm4_beet_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm4_beet_mode, AF_INET); + BUG_ON(err); +} + +module_init(xfrm4_beet_init); +module_exit(xfrm4_beet_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_BEET); diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c new file mode 100644 index 00000000..fd840c7d --- /dev/null +++ b/net/ipv4/xfrm4_mode_transport.c @@ -0,0 +1,80 @@ +/* + * xfrm4_mode_transport.c - Transport mode encapsulation for IPv4. + * + * Copyright (c) 2004-2006 Herbert Xu + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Add encapsulation header. + * + * The IP header will be moved forward to make space for the encapsulation + * header. + */ +static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) +{ + struct iphdr *iph = ip_hdr(skb); + int ihl = iph->ihl * 4; + + skb_set_network_header(skb, -x->props.header_len); + skb->mac_header = skb->network_header + + offsetof(struct iphdr, protocol); + skb->transport_header = skb->network_header + ihl; + __skb_pull(skb, ihl); + memmove(skb_network_header(skb), iph, ihl); + return 0; +} + +/* Remove encapsulation header. + * + * The IP header will be moved over the top of the encapsulation header. + * + * On entry, skb->h shall point to where the IP header should be and skb->nh + * shall be set to where the IP header currently is. skb->data shall point + * to the start of the payload. + */ +static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) +{ + int ihl = skb->data - skb_transport_header(skb); + + if (skb->transport_header != skb->network_header) { + memmove(skb_transport_header(skb), + skb_network_header(skb), ihl); + skb->network_header = skb->transport_header; + } + ip_hdr(skb)->tot_len = htons(skb->len + ihl); + skb_reset_transport_header(skb); + return 0; +} + +static struct xfrm_mode xfrm4_transport_mode = { + .input = xfrm4_transport_input, + .output = xfrm4_transport_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TRANSPORT, +}; + +static int __init xfrm4_transport_init(void) +{ + return xfrm_register_mode(&xfrm4_transport_mode, AF_INET); +} + +static void __exit xfrm4_transport_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm4_transport_mode, AF_INET); + BUG_ON(err); +} + +module_init(xfrm4_transport_init); +module_exit(xfrm4_transport_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT); diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c new file mode 100644 index 00000000..ed4bf11e --- /dev/null +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -0,0 +1,121 @@ +/* + * xfrm4_mode_tunnel.c - Tunnel mode encapsulation for IPv4. + * + * Copyright (c) 2004-2006 Herbert Xu + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline void ipip_ecn_decapsulate(struct sk_buff *skb) +{ + struct iphdr *inner_iph = ipip_hdr(skb); + + if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) + IP_ECN_set_ce(inner_iph); +} + +/* Add encapsulation header. + * + * The top IP header will be constructed per RFC 2401. 
+ */ +static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct iphdr *top_iph; + int flags; + + skb_set_network_header(skb, -x->props.header_len); + skb->mac_header = skb->network_header + + offsetof(struct iphdr, protocol); + skb->transport_header = skb->network_header + sizeof(*top_iph); + top_iph = ip_hdr(skb); + + top_iph->ihl = 5; + top_iph->version = 4; + + top_iph->protocol = xfrm_af2proto(skb_dst(skb)->ops->family); + + /* DS disclosed */ + top_iph->tos = INET_ECN_encapsulate(XFRM_MODE_SKB_CB(skb)->tos, + XFRM_MODE_SKB_CB(skb)->tos); + + flags = x->props.flags; + if (flags & XFRM_STATE_NOECN) + IP_ECN_clear(top_iph); + + top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? + 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); + ip_select_ident(top_iph, dst->child, NULL); + + top_iph->ttl = ip4_dst_hoplimit(dst->child); + + top_iph->saddr = x->props.saddr.a4; + top_iph->daddr = x->id.daddr.a4; + + return 0; +} + +static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) +{ + int err = -EINVAL; + + if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) + goto out; + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto out; + + if (skb_cloned(skb) && + (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + goto out; + + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipip_hdr(skb)); + if (!(x->props.flags & XFRM_STATE_NOECN)) + ipip_ecn_decapsulate(skb); + + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + + err = 0; + +out: + return err; +} + +static struct xfrm_mode xfrm4_tunnel_mode = { + .input2 = xfrm4_mode_tunnel_input, + .input = xfrm_prepare_input, + .output2 = xfrm4_mode_tunnel_output, + .output = xfrm4_prepare_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TUNNEL, + .flags = XFRM_MODE_FLAG_TUNNEL, +}; + +static int __init xfrm4_mode_tunnel_init(void) +{ + return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET); +} + +static void __exit xfrm4_mode_tunnel_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm4_tunnel_mode, AF_INET); + BUG_ON(err); +} + +module_init(xfrm4_mode_tunnel_init); +module_exit(xfrm4_mode_tunnel_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c new file mode 100644 index 00000000..327a617d --- /dev/null +++ b/net/ipv4/xfrm4_output.c @@ -0,0 +1,101 @@ +/* + * xfrm4_output.c - Common IPsec encapsulation code for IPv4. + * Copyright (c) 2004 Herbert Xu + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int xfrm4_tunnel_check_size(struct sk_buff *skb) +{ + int mtu, ret = 0; + struct dst_entry *dst; + + if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) + goto out; + + if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) + goto out; + + dst = skb_dst(skb); + mtu = dst_mtu(dst); + if (skb->len > mtu) { + if (skb->sk) + ip_local_error(skb->sk, EMSGSIZE, ip_hdr(skb)->daddr, + inet_sk(skb->sk)->inet_dport, mtu); + else + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_FRAG_NEEDED, htonl(mtu)); + ret = -EMSGSIZE; + } +out: + return ret; +} + +int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + + err = xfrm4_tunnel_check_size(skb); + if (err) + return err; + + XFRM_MODE_SKB_CB(skb)->protocol = ip_hdr(skb)->protocol; + + return xfrm4_extract_header(skb); +} + +int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + + err = xfrm_inner_extract_output(x, skb); + if (err) + return err; + + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED; + + skb->protocol = htons(ETH_P_IP); + + return x->outer_mode->output2(x, skb); +} +EXPORT_SYMBOL(xfrm4_prepare_output); + +int xfrm4_output_finish(struct sk_buff *skb) +{ +#ifdef CONFIG_NETFILTER + if (!skb_dst(skb)->xfrm) { + IPCB(skb)->flags |= IPSKB_REROUTED; + return dst_output(skb); + } + + IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; +#endif + + skb->protocol = htons(ETH_P_IP); + return xfrm_output(skb); +} + +int xfrm4_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; + + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, + NULL, dst->dev, + x->outer_mode->afinfo->output_finish, + !(IPCB(skb)->flags & IPSKB_REROUTED)); +} diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c new file mode 100644 index 00000000..581fe0ab --- /dev/null +++ b/net/ipv4/xfrm4_policy.c @@ -0,0 +1,305 @@ +/* + * xfrm4_policy.c + * + * Changes: + * Kazunori MIYAZAWA @USAGI + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific portion + * + */ + +#include +#include +#include +#include +#include +#include +#include + +static struct xfrm_policy_afinfo xfrm4_policy_afinfo; + +static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, + int tos, + const xfrm_address_t *saddr, + const xfrm_address_t *daddr) +{ + struct rtable *rt; + + memset(fl4, 0, sizeof(*fl4)); + fl4->daddr = daddr->a4; + fl4->flowi4_tos = tos; + if (saddr) + fl4->saddr = saddr->a4; + + rt = __ip_route_output_key(net, fl4); + if (!IS_ERR(rt)) + return &rt->dst; + + return ERR_CAST(rt); +} + +static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, + const xfrm_address_t *saddr, + const xfrm_address_t *daddr) +{ + struct flowi4 fl4; + + return __xfrm4_dst_lookup(net, &fl4, tos, saddr, daddr); +} + +static int xfrm4_get_saddr(struct net *net, + xfrm_address_t *saddr, xfrm_address_t *daddr) +{ + struct dst_entry *dst; + struct flowi4 fl4; + + dst = __xfrm4_dst_lookup(net, &fl4, 0, NULL, daddr); + if (IS_ERR(dst)) + return -EHOSTUNREACH; + + saddr->a4 = fl4.saddr; + dst_release(dst); + return 0; +} + +static int xfrm4_get_tos(const struct flowi *fl) +{ + return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; /* Strip ECN bits */ +} + +static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, + int nfheader_len) +{ + return 0; +} + +static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct 
net_device *dev, + const struct flowi *fl) +{ + struct rtable *rt = (struct rtable *)xdst->route; + const struct flowi4 *fl4 = &fl->u.ip4; + + xdst->u.rt.rt_key_dst = fl4->daddr; + xdst->u.rt.rt_key_src = fl4->saddr; + xdst->u.rt.rt_key_tos = fl4->flowi4_tos; + xdst->u.rt.rt_route_iif = fl4->flowi4_iif; + xdst->u.rt.rt_iif = fl4->flowi4_iif; + xdst->u.rt.rt_oif = fl4->flowi4_oif; + xdst->u.rt.rt_mark = fl4->flowi4_mark; + + xdst->u.dst.dev = dev; + dev_hold(dev); + + xdst->u.rt.peer = rt->peer; + if (rt->peer) + atomic_inc(&rt->peer->refcnt); + + /* Sheit... I remember I did this right. Apparently, + * it was magically lost, so this code needs audit */ + xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | + RTCF_LOCAL); + xdst->u.rt.rt_type = rt->rt_type; + xdst->u.rt.rt_src = rt->rt_src; + xdst->u.rt.rt_dst = rt->rt_dst; + xdst->u.rt.rt_gateway = rt->rt_gateway; + xdst->u.rt.rt_spec_dst = rt->rt_spec_dst; + + return 0; +} + +static void +_decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) +{ + const struct iphdr *iph = ip_hdr(skb); + u8 *xprth = skb_network_header(skb) + iph->ihl * 4; + struct flowi4 *fl4 = &fl->u.ip4; + + memset(fl4, 0, sizeof(struct flowi4)); + fl4->flowi4_mark = skb->mark; + + if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { + switch (iph->protocol) { + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + case IPPROTO_TCP: + case IPPROTO_SCTP: + case IPPROTO_DCCP: + if (xprth + 4 < skb->data || + pskb_may_pull(skb, xprth + 4 - skb->data)) { + __be16 *ports = (__be16 *)xprth; + + fl4->fl4_sport = ports[!!reverse]; + fl4->fl4_dport = ports[!reverse]; + } + break; + + case IPPROTO_ICMP: + if (pskb_may_pull(skb, xprth + 2 - skb->data)) { + u8 *icmp = xprth; + + fl4->fl4_icmp_type = icmp[0]; + fl4->fl4_icmp_code = icmp[1]; + } + break; + + case IPPROTO_ESP: + if (pskb_may_pull(skb, xprth + 4 - skb->data)) { + __be32 *ehdr = (__be32 *)xprth; + + fl4->fl4_ipsec_spi = ehdr[0]; + } + break; + + case IPPROTO_AH: + if (pskb_may_pull(skb, xprth + 8 - skb->data)) { + __be32 *ah_hdr = (__be32*)xprth; + + fl4->fl4_ipsec_spi = ah_hdr[1]; + } + break; + + case IPPROTO_COMP: + if (pskb_may_pull(skb, xprth + 4 - skb->data)) { + __be16 *ipcomp_hdr = (__be16 *)xprth; + + fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); + } + break; + + case IPPROTO_GRE: + if (pskb_may_pull(skb, xprth + 12 - skb->data)) { + __be16 *greflags = (__be16 *)xprth; + __be32 *gre_hdr = (__be32 *)xprth; + + if (greflags[0] & GRE_KEY) { + if (greflags[0] & GRE_CSUM) + gre_hdr++; + fl4->fl4_gre_key = gre_hdr[1]; + } + } + break; + + default: + fl4->fl4_ipsec_spi = 0; + break; + } + } + fl4->flowi4_proto = iph->protocol; + fl4->daddr = reverse ? iph->saddr : iph->daddr; + fl4->saddr = reverse ? 
iph->daddr : iph->saddr; + fl4->flowi4_tos = iph->tos; +} + +static inline int xfrm4_garbage_collect(struct dst_ops *ops) +{ + struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops); + + xfrm4_policy_afinfo.garbage_collect(net); + return (dst_entries_get_slow(ops) > ops->gc_thresh * 2); +} + +static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) +{ + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + struct dst_entry *path = xdst->route; + + path->ops->update_pmtu(path, mtu); +} + +static void xfrm4_dst_destroy(struct dst_entry *dst) +{ + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + + dst_destroy_metrics_generic(dst); + + if (likely(xdst->u.rt.peer)) + inet_putpeer(xdst->u.rt.peer); + + xfrm_dst_destroy(xdst); +} + +static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, + int unregister) +{ + if (!unregister) + return; + + xfrm_dst_ifdown(dst, dev); +} + +static struct dst_ops xfrm4_dst_ops = { + .family = AF_INET, + .protocol = cpu_to_be16(ETH_P_IP), + .gc = xfrm4_garbage_collect, + .update_pmtu = xfrm4_update_pmtu, + .cow_metrics = dst_cow_metrics_generic, + .destroy = xfrm4_dst_destroy, + .ifdown = xfrm4_dst_ifdown, + .local_out = __ip_local_out, + .gc_thresh = 1024, +}; + +static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { + .family = AF_INET, + .dst_ops = &xfrm4_dst_ops, + .dst_lookup = xfrm4_dst_lookup, + .get_saddr = xfrm4_get_saddr, + .decode_session = _decode_session4, + .get_tos = xfrm4_get_tos, + .init_path = xfrm4_init_path, + .fill_dst = xfrm4_fill_dst, + .blackhole_route = ipv4_blackhole_route, +}; + +#ifdef CONFIG_SYSCTL +static struct ctl_table xfrm4_policy_table[] = { + { + .procname = "xfrm4_gc_thresh", + .data = &init_net.xfrm.xfrm4_dst_ops.gc_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static struct ctl_table_header *sysctl_hdr; +#endif + +static void __init xfrm4_policy_init(void) +{ + xfrm_policy_register_afinfo(&xfrm4_policy_afinfo); +} + +static void __exit xfrm4_policy_fini(void) +{ +#ifdef CONFIG_SYSCTL + if (sysctl_hdr) + unregister_net_sysctl_table(sysctl_hdr); +#endif + xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo); +} + +void __init xfrm4_init(int rt_max_size) +{ + /* + * Select a default value for the gc_thresh based on the main route + * table hash size. It seems to me the worst case scenario is when + * we have ipsec operating in transport mode, in which we create a + * dst_entry per socket. The xfrm gc algorithm starts trying to remove + * entries at gc_thresh, and prevents new allocations as 2*gc_thresh + * so lets set an initial xfrm gc_thresh value at the rt_max_size/2. 
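+ * (For example, a route cache sized at rt_max_size = 4096 gives an xfrm
+ * gc_thresh of 2048: garbage collection starts once 2048 bundles are
+ * cached, and new allocations are refused past 2 * gc_thresh = 4096.)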
+ * That will let us store an ipsec connection per route table entry, + * and start cleaning when were 1/2 full + */ + xfrm4_dst_ops.gc_thresh = rt_max_size/2; + dst_entries_init(&xfrm4_dst_ops); + + xfrm4_state_init(); + xfrm4_policy_init(); +#ifdef CONFIG_SYSCTL + sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv4_ctl_path, + xfrm4_policy_table); +#endif +} + diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c new file mode 100644 index 00000000..d9ac0a00 --- /dev/null +++ b/net/ipv4/xfrm4_state.c @@ -0,0 +1,98 @@ +/* + * xfrm4_state.c + * + * Changes: + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific portion + * + */ + +#include +#include +#include +#include +#include + +static int xfrm4_init_flags(struct xfrm_state *x) +{ + if (ipv4_config.no_pmtu_disc) + x->props.flags |= XFRM_STATE_NOPMTUDISC; + return 0; +} + +static void +__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) +{ + const struct flowi4 *fl4 = &fl->u.ip4; + + sel->daddr.a4 = fl4->daddr; + sel->saddr.a4 = fl4->saddr; + sel->dport = xfrm_flowi_dport(fl, &fl4->uli); + sel->dport_mask = htons(0xffff); + sel->sport = xfrm_flowi_sport(fl, &fl4->uli); + sel->sport_mask = htons(0xffff); + sel->family = AF_INET; + sel->prefixlen_d = 32; + sel->prefixlen_s = 32; + sel->proto = fl4->flowi4_proto; + sel->ifindex = fl4->flowi4_oif; +} + +static void +xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl, + const xfrm_address_t *daddr, const xfrm_address_t *saddr) +{ + x->id = tmpl->id; + if (x->id.daddr.a4 == 0) + x->id.daddr.a4 = daddr->a4; + x->props.saddr = tmpl->saddr; + if (x->props.saddr.a4 == 0) + x->props.saddr.a4 = saddr->a4; + x->props.mode = tmpl->mode; + x->props.reqid = tmpl->reqid; + x->props.family = AF_INET; +} + +int xfrm4_extract_header(struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + + XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); + XFRM_MODE_SKB_CB(skb)->id = iph->id; + XFRM_MODE_SKB_CB(skb)->frag_off = iph->frag_off; + XFRM_MODE_SKB_CB(skb)->tos = iph->tos; + XFRM_MODE_SKB_CB(skb)->ttl = iph->ttl; + XFRM_MODE_SKB_CB(skb)->optlen = iph->ihl * 4 - sizeof(*iph); + memset(XFRM_MODE_SKB_CB(skb)->flow_lbl, 0, + sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); + + return 0; +} + +static struct xfrm_state_afinfo xfrm4_state_afinfo = { + .family = AF_INET, + .proto = IPPROTO_IPIP, + .eth_proto = htons(ETH_P_IP), + .owner = THIS_MODULE, + .init_flags = xfrm4_init_flags, + .init_tempsel = __xfrm4_init_tempsel, + .init_temprop = xfrm4_init_temprop, + .output = xfrm4_output, + .output_finish = xfrm4_output_finish, + .extract_input = xfrm4_extract_input, + .extract_output = xfrm4_extract_output, + .transport_finish = xfrm4_transport_finish, +}; + +void __init xfrm4_state_init(void) +{ + xfrm_state_register_afinfo(&xfrm4_state_afinfo); +} + +#if 0 +void __exit xfrm4_state_fini(void) +{ + xfrm_state_unregister_afinfo(&xfrm4_state_afinfo); +} +#endif /* 0 */ + diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c new file mode 100644 index 00000000..82806455 --- /dev/null +++ b/net/ipv4/xfrm4_tunnel.c @@ -0,0 +1,113 @@ +/* xfrm4_tunnel.c: Generic IP tunnel transformer. + * + * Copyright (C) 2003 David S. 
Miller (davem@redhat.com) + */ + +#include +#include +#include +#include +#include +#include + +static int ipip_output(struct xfrm_state *x, struct sk_buff *skb) +{ + skb_push(skb, -skb_network_offset(skb)); + return 0; +} + +static int ipip_xfrm_rcv(struct xfrm_state *x, struct sk_buff *skb) +{ + return ip_hdr(skb)->protocol; +} + +static int ipip_init_state(struct xfrm_state *x) +{ + if (x->props.mode != XFRM_MODE_TUNNEL) + return -EINVAL; + + if (x->encap) + return -EINVAL; + + x->props.header_len = sizeof(struct iphdr); + + return 0; +} + +static void ipip_destroy(struct xfrm_state *x) +{ +} + +static const struct xfrm_type ipip_type = { + .description = "IPIP", + .owner = THIS_MODULE, + .proto = IPPROTO_IPIP, + .init_state = ipip_init_state, + .destructor = ipip_destroy, + .input = ipip_xfrm_rcv, + .output = ipip_output +}; + +static int xfrm_tunnel_rcv(struct sk_buff *skb) +{ + return xfrm4_rcv_spi(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr); +} + +static int xfrm_tunnel_err(struct sk_buff *skb, u32 info) +{ + return -ENOENT; +} + +static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = { + .handler = xfrm_tunnel_rcv, + .err_handler = xfrm_tunnel_err, + .priority = 2, +}; + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = { + .handler = xfrm_tunnel_rcv, + .err_handler = xfrm_tunnel_err, + .priority = 2, +}; +#endif + +static int __init ipip_init(void) +{ + if (xfrm_register_type(&ipip_type, AF_INET) < 0) { + printk(KERN_INFO "ipip init: can't add xfrm type\n"); + return -EAGAIN; + } + + if (xfrm4_tunnel_register(&xfrm_tunnel_handler, AF_INET)) { + printk(KERN_INFO "ipip init: can't add xfrm handler for AF_INET\n"); + xfrm_unregister_type(&ipip_type, AF_INET); + return -EAGAIN; + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (xfrm4_tunnel_register(&xfrm64_tunnel_handler, AF_INET6)) { + printk(KERN_INFO "ipip init: can't add xfrm handler for AF_INET6\n"); + xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET); + xfrm_unregister_type(&ipip_type, AF_INET); + return -EAGAIN; + } +#endif + return 0; +} + +static void __exit ipip_fini(void) +{ +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (xfrm4_tunnel_deregister(&xfrm64_tunnel_handler, AF_INET6)) + printk(KERN_INFO "ipip close: can't remove xfrm handler for AF_INET6\n"); +#endif + if (xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET)) + printk(KERN_INFO "ipip close: can't remove xfrm handler for AF_INET\n"); + if (xfrm_unregister_type(&ipip_type, AF_INET) < 0) + printk(KERN_INFO "ipip close: can't remove xfrm type\n"); +} + +module_init(ipip_init); +module_exit(ipip_fini); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_IPIP); diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig new file mode 100644 index 00000000..36d7437a --- /dev/null +++ b/net/ipv6/Kconfig @@ -0,0 +1,253 @@ +# +# IPv6 configuration +# + +# IPv6 as module will cause a CRASH if you try to unload it +menuconfig IPV6 + tristate "The IPv6 protocol" + default m + ---help--- + This is complemental support for the IP version 6. + You will still be able to do traditional IPv4 networking as well. + + For general information about IPv6, see + . + For Linux IPv6 development information, see . + For specific information about IPv6 under Linux, read the HOWTO at + . + + To compile this protocol support as a module, choose M here: the + module will be called ipv6. 
+ +if IPV6 + +config IPV6_PRIVACY + bool "IPv6: Privacy Extensions (RFC 3041) support" + ---help--- + Privacy Extensions for Stateless Address Autoconfiguration in IPv6 + support. With this option, additional periodically-altered + pseudo-random global-scope unicast address(es) will be assigned to + your interface(s). + + We use our standard pseudo-random algorithm to generate the + randomized interface identifier, instead of one described in RFC 3041. + + By default the kernel does not generate temporary addresses. + To use temporary addresses, do + + echo 2 >/proc/sys/net/ipv6/conf/all/use_tempaddr + + See for details. + +config IPV6_ROUTER_PREF + bool "IPv6: Router Preference (RFC 4191) support" + ---help--- + Router Preference is an optional extension to the Router + Advertisement message which improves the ability of hosts + to pick an appropriate router, especially when the hosts + are placed in a multi-homed network. + + If unsure, say N. + +config IPV6_ROUTE_INFO + bool "IPv6: Route Information (RFC 4191) support (EXPERIMENTAL)" + depends on IPV6_ROUTER_PREF && EXPERIMENTAL + ---help--- + This is experimental support of Route Information. + + If unsure, say N. + +config IPV6_OPTIMISTIC_DAD + bool "IPv6: Enable RFC 4429 Optimistic DAD (EXPERIMENTAL)" + depends on EXPERIMENTAL + ---help--- + This is experimental support for optimistic Duplicate + Address Detection. It allows for autoconfigured addresses + to be used more quickly. + + If unsure, say N. + +config INET6_AH + tristate "IPv6: AH transformation" + select XFRM + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_MD5 + select CRYPTO_SHA1 + ---help--- + Support for IPsec AH. + + If unsure, say Y. + +config INET6_ESP + tristate "IPv6: ESP transformation" + select XFRM + select CRYPTO + select CRYPTO_AUTHENC + select CRYPTO_HMAC + select CRYPTO_MD5 + select CRYPTO_CBC + select CRYPTO_SHA1 + select CRYPTO_DES + ---help--- + Support for IPsec ESP. + + If unsure, say Y. + +config INET6_IPCOMP + tristate "IPv6: IPComp transformation" + select INET6_XFRM_TUNNEL + select XFRM_IPCOMP + ---help--- + Support for IP Payload Compression Protocol (IPComp) (RFC3173), + typically needed for IPsec. + + If unsure, say Y. + +config IPV6_MIP6 + tristate "IPv6: Mobility (EXPERIMENTAL)" + depends on EXPERIMENTAL + select XFRM + ---help--- + Support for IPv6 Mobility described in RFC 3775. + + If unsure, say N. + +config INET6_XFRM_TUNNEL + tristate + select INET6_TUNNEL + default n + +config INET6_TUNNEL + tristate + default n + +config INET6_XFRM_MODE_TRANSPORT + tristate "IPv6: IPsec transport mode" + default IPV6 + select XFRM + ---help--- + Support for IPsec transport mode. + + If unsure, say Y. + +config INET6_XFRM_MODE_TUNNEL + tristate "IPv6: IPsec tunnel mode" + default IPV6 + select XFRM + ---help--- + Support for IPsec tunnel mode. + + If unsure, say Y. + +config INET6_XFRM_MODE_BEET + tristate "IPv6: IPsec BEET mode" + default IPV6 + select XFRM + ---help--- + Support for IPsec BEET mode. + + If unsure, say Y. + +config INET6_XFRM_MODE_ROUTEOPTIMIZATION + tristate "IPv6: MIPv6 route optimization mode (EXPERIMENTAL)" + depends on EXPERIMENTAL + select XFRM + ---help--- + Support for MIPv6 route optimization mode. + +config IPV6_SIT + tristate "IPv6: IPv6-in-IPv4 tunnel (SIT driver)" + select INET_TUNNEL + select IPV6_NDISC_NODETYPE + default y + ---help--- + Tunneling means encapsulating data of one protocol type within + another protocol and sending it over a channel that understands the + encapsulating protocol. 
This driver implements encapsulation of IPv6 + into IPv4 packets. This is useful if you want to connect two IPv6 + networks over an IPv4-only path. + + Saying M here will produce a module called sit. If unsure, say Y. + +config IPV6_SIT_6RD + bool "IPv6: IPv6 Rapid Deployment (6RD) (EXPERIMENTAL)" + depends on IPV6_SIT && EXPERIMENTAL + default n + ---help--- + IPv6 Rapid Deployment (6rd; draft-ietf-softwire-ipv6-6rd) builds upon + mechanisms of 6to4 (RFC3056) to enable a service provider to rapidly + deploy IPv6 unicast service to IPv4 sites to which it provides + customer premise equipment. Like 6to4, it utilizes stateless IPv6 in + IPv4 encapsulation in order to transit IPv4-only network + infrastructure. Unlike 6to4, a 6rd service provider uses an IPv6 + prefix of its own in place of the fixed 6to4 prefix. + + With this option enabled, the SIT driver offers 6rd functionality by + providing additional ioctl API to configure the IPv6 Prefix for in + stead of static 2002::/16 for 6to4. + + If unsure, say N. + +config IPV6_NDISC_NODETYPE + bool + +config IPV6_TUNNEL + tristate "IPv6: IP-in-IPv6 tunnel (RFC2473)" + select INET6_TUNNEL + ---help--- + Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in + RFC 2473. + + If unsure, say N. + +config IPV6_MULTIPLE_TABLES + bool "IPv6: Multiple Routing Tables" + depends on EXPERIMENTAL + select FIB_RULES + ---help--- + Support multiple routing tables. + +config IPV6_SUBTREES + bool "IPv6: source address based routing" + depends on IPV6_MULTIPLE_TABLES + ---help--- + Enable routing by source address or prefix. + + The destination address is still the primary routing key, so mixing + normal and source prefix specific routes in the same routing table + may sometimes lead to unintended routing behavior. This can be + avoided by defining different routing tables for the normal and + source prefix specific routes. + + If unsure, say N. + +config IPV6_MROUTE + bool "IPv6: multicast routing (EXPERIMENTAL)" + depends on IPV6 && EXPERIMENTAL + ---help--- + Experimental support for IPv6 multicast forwarding. + If unsure, say N. + +config IPV6_MROUTE_MULTIPLE_TABLES + bool "IPv6: multicast policy routing" + depends on IPV6_MROUTE + select FIB_RULES + help + Normally, a multicast router runs a userspace daemon and decides + what to do with a multicast packet based on the source and + destination addresses. If you say Y here, the multicast router + will also be able to take interfaces and packet marks into + account and run multiple instances of userspace daemons + simultaneously, each one handling a single table. + + If unsure, say N. + +config IPV6_PIMSM_V2 + bool "IPv6: PIM-SM version 2 support (EXPERIMENTAL)" + depends on IPV6_MROUTE + ---help--- + Support for IPv6 PIM multicast routing protocol PIM-SMv2. + If unsure, say N. + +endif # IPV6 diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile new file mode 100644 index 00000000..686934ac --- /dev/null +++ b/net/ipv6/Makefile @@ -0,0 +1,42 @@ +# +# Makefile for the Linux TCP/IP (INET6) layer. 
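+# Conditionally built objects are collected in the ipv6-y list and merged
+# into ipv6-objs below.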
+# + +obj-$(CONFIG_IPV6) += ipv6.o + +ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \ + addrlabel.o \ + route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ + raw.o protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ + exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o + +ipv6-$(CONFIG_SYSCTL) = sysctl_net_ipv6.o +ipv6-$(CONFIG_IPV6_MROUTE) += ip6mr.o + +ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ + xfrm6_output.o +ipv6-$(CONFIG_NETFILTER) += netfilter.o +ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o +ipv6-$(CONFIG_PROC_FS) += proc.o +ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o + +ipv6-objs += $(ipv6-y) + +obj-$(CONFIG_INET6_AH) += ah6.o +obj-$(CONFIG_INET6_ESP) += esp6.o +obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o +obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o +obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o +obj-$(CONFIG_INET6_XFRM_MODE_TRANSPORT) += xfrm6_mode_transport.o +obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o +obj-$(CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION) += xfrm6_mode_ro.o +obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o +obj-$(CONFIG_IPV6_MIP6) += mip6.o +obj-$(CONFIG_NETFILTER) += netfilter/ + +obj-$(CONFIG_IPV6_SIT) += sit.o +obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o + +obj-y += addrconf_core.o exthdrs_core.o + +obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c new file mode 100644 index 00000000..8a4bf719 --- /dev/null +++ b/net/ipv6/addrconf.c @@ -0,0 +1,4783 @@ +/* + * IPv6 Address [auto]configuration + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * Alexey Kuznetsov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * + * Janos Farkas : delete timer on ifdown + * + * Andi Kleen : kill double kfree on module + * unload. + * Maciej W. Rozycki : FDDI support + * sekiya@USAGI : Don't send too many RS + * packets. + * yoshfuji@USAGI : Fixed interval between DAD + * packets. + * YOSHIFUJI Hideaki @USAGI : improved accuracy of + * address validation timer. + * YOSHIFUJI Hideaki @USAGI : Privacy Extensions (RFC3041) + * support. + * Yuji SEKIYA @USAGI : Don't assign a same IPv6 + * address on a same interface. + * YOSHIFUJI Hideaki @USAGI : ARCnet support + * YOSHIFUJI Hideaki @USAGI : convert /proc/net/if_inet6 to + * seq_file. + * YOSHIFUJI Hideaki @USAGI : improved source address + * selection; consider scope, + * status etc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_SYSCTL +#include +#endif +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_IPV6_PRIVACY +#include +#endif + +#include +#include + +#include +#include + +/* Set to 3 to get tracing... */ +#define ACONF_DEBUG 2 + +#if ACONF_DEBUG >= 3 +#define ADBG(x) printk x +#else +#define ADBG(x) +#endif + +#define INFINITY_LIFE_TIME 0xFFFFFFFF + +static inline u32 cstamp_delta(unsigned long cstamp) +{ + return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; +} + +#define ADDRCONF_TIMER_FUZZ_MINUS (HZ > 50 ? 
HZ/50 : 1) +#define ADDRCONF_TIMER_FUZZ (HZ / 4) +#define ADDRCONF_TIMER_FUZZ_MAX (HZ) + +#ifdef CONFIG_SYSCTL +static void addrconf_sysctl_register(struct inet6_dev *idev); +static void addrconf_sysctl_unregister(struct inet6_dev *idev); +#else +static inline void addrconf_sysctl_register(struct inet6_dev *idev) +{ +} + +static inline void addrconf_sysctl_unregister(struct inet6_dev *idev) +{ +} +#endif + +#ifdef CONFIG_IPV6_PRIVACY +static int __ipv6_regen_rndid(struct inet6_dev *idev); +static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr); +static void ipv6_regen_rndid(unsigned long data); +#endif + +static int ipv6_generate_eui64(u8 *eui, struct net_device *dev); +static int ipv6_count_addresses(struct inet6_dev *idev); + +/* + * Configured unicast address hash table + */ +static struct hlist_head inet6_addr_lst[IN6_ADDR_HSIZE]; +static DEFINE_SPINLOCK(addrconf_hash_lock); + +static void addrconf_verify(unsigned long); + +static DEFINE_TIMER(addr_chk_timer, addrconf_verify, 0, 0); +static DEFINE_SPINLOCK(addrconf_verify_lock); + +static void addrconf_join_anycast(struct inet6_ifaddr *ifp); +static void addrconf_leave_anycast(struct inet6_ifaddr *ifp); + +static void addrconf_type_change(struct net_device *dev, + unsigned long event); +static int addrconf_ifdown(struct net_device *dev, int how); + +static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags); +static void addrconf_dad_timer(unsigned long data); +static void addrconf_dad_completed(struct inet6_ifaddr *ifp); +static void addrconf_dad_run(struct inet6_dev *idev); +static void addrconf_rs_timer(unsigned long data); +static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); +static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); + +static void inet6_prefix_notify(int event, struct inet6_dev *idev, + struct prefix_info *pinfo); +static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, + struct net_device *dev); + +static ATOMIC_NOTIFIER_HEAD(inet6addr_chain); + +static struct ipv6_devconf ipv6_devconf __read_mostly = { + .forwarding = 0, + .hop_limit = IPV6_DEFAULT_HOPLIMIT, + .mtu6 = IPV6_MIN_MTU, + .accept_ra = 1, + .accept_redirects = 1, + .autoconf = 1, + .force_mld_version = 0, + .dad_transmits = 1, + .rtr_solicits = MAX_RTR_SOLICITATIONS, + .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, + .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, +#ifdef CONFIG_IPV6_PRIVACY + .use_tempaddr = 0, + .temp_valid_lft = TEMP_VALID_LIFETIME, + .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, + .regen_max_retry = REGEN_MAX_RETRY, + .max_desync_factor = MAX_DESYNC_FACTOR, +#endif + .max_addresses = IPV6_MAX_ADDRESSES, + .accept_ra_defrtr = 1, + .accept_ra_pinfo = 1, +#ifdef CONFIG_IPV6_ROUTER_PREF + .accept_ra_rtr_pref = 1, + .rtr_probe_interval = 60 * HZ, +#ifdef CONFIG_IPV6_ROUTE_INFO + .accept_ra_rt_info_max_plen = 0, +#endif +#endif + .proxy_ndp = 0, + .accept_source_route = 0, /* we do not accept RH0 by default. 
*/ + .disable_ipv6 = 0, + .accept_dad = 1, +}; + +static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { + .forwarding = 0, + .hop_limit = IPV6_DEFAULT_HOPLIMIT, + .mtu6 = IPV6_MIN_MTU, + .accept_ra = 1, + .accept_redirects = 1, + .autoconf = 1, + .dad_transmits = 1, + .rtr_solicits = MAX_RTR_SOLICITATIONS, + .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, + .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, +#ifdef CONFIG_IPV6_PRIVACY + .use_tempaddr = 0, + .temp_valid_lft = TEMP_VALID_LIFETIME, + .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, + .regen_max_retry = REGEN_MAX_RETRY, + .max_desync_factor = MAX_DESYNC_FACTOR, +#endif + .max_addresses = IPV6_MAX_ADDRESSES, + .accept_ra_defrtr = 1, + .accept_ra_pinfo = 1, +#ifdef CONFIG_IPV6_ROUTER_PREF + .accept_ra_rtr_pref = 1, + .rtr_probe_interval = 60 * HZ, +#ifdef CONFIG_IPV6_ROUTE_INFO + .accept_ra_rt_info_max_plen = 0, +#endif +#endif + .proxy_ndp = 0, + .accept_source_route = 0, /* we do not accept RH0 by default. */ + .disable_ipv6 = 0, + .accept_dad = 1, +}; + +/* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ +const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; +const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; +const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; +const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; + +/* Check if a valid qdisc is available */ +static inline bool addrconf_qdisc_ok(const struct net_device *dev) +{ + return !qdisc_tx_is_noop(dev); +} + +/* Check if a route is valid prefix route */ +static inline int addrconf_is_prefix_route(const struct rt6_info *rt) +{ + return (rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0; +} + +static void addrconf_del_timer(struct inet6_ifaddr *ifp) +{ + if (del_timer(&ifp->timer)) + __in6_ifa_put(ifp); +} + +enum addrconf_timer_t { + AC_NONE, + AC_DAD, + AC_RS, +}; + +static void addrconf_mod_timer(struct inet6_ifaddr *ifp, + enum addrconf_timer_t what, + unsigned long when) +{ + if (!del_timer(&ifp->timer)) + in6_ifa_hold(ifp); + + switch (what) { + case AC_DAD: + ifp->timer.function = addrconf_dad_timer; + break; + case AC_RS: + ifp->timer.function = addrconf_rs_timer; + break; + default: + break; + } + ifp->timer.expires = jiffies + when; + add_timer(&ifp->timer); +} + +static int snmp6_alloc_dev(struct inet6_dev *idev) +{ + if (snmp_mib_init((void __percpu **)idev->stats.ipv6, + sizeof(struct ipstats_mib), + __alignof__(struct ipstats_mib)) < 0) + goto err_ip; + idev->stats.icmpv6dev = kzalloc(sizeof(struct icmpv6_mib_device), + GFP_KERNEL); + if (!idev->stats.icmpv6dev) + goto err_icmp; + idev->stats.icmpv6msgdev = kzalloc(sizeof(struct icmpv6msg_mib_device), + GFP_KERNEL); + if (!idev->stats.icmpv6msgdev) + goto err_icmpmsg; + + return 0; + +err_icmpmsg: + kfree(idev->stats.icmpv6dev); +err_icmp: + snmp_mib_free((void __percpu **)idev->stats.ipv6); +err_ip: + return -ENOMEM; +} + +static void snmp6_free_dev(struct inet6_dev *idev) +{ + kfree(idev->stats.icmpv6msgdev); + kfree(idev->stats.icmpv6dev); + snmp_mib_free((void __percpu **)idev->stats.ipv6); +} + +/* Nobody refers to this device, we may destroy it. */ + +void in6_dev_finish_destroy(struct inet6_dev *idev) +{ + struct net_device *dev = idev->dev; + + WARN_ON(!list_empty(&idev->addr_list)); + WARN_ON(idev->mc_list != NULL); + +#ifdef NET_REFCNT_DEBUG + printk(KERN_DEBUG "in6_dev_finish_destroy: %s\n", dev ? 
dev->name : "NIL"); +#endif + dev_put(dev); + if (!idev->dead) { + pr_warning("Freeing alive inet6 device %p\n", idev); + return; + } + snmp6_free_dev(idev); + kfree_rcu(idev, rcu); +} + +EXPORT_SYMBOL(in6_dev_finish_destroy); + +static struct inet6_dev * ipv6_add_dev(struct net_device *dev) +{ + struct inet6_dev *ndev; + + ASSERT_RTNL(); + + if (dev->mtu < IPV6_MIN_MTU) + return NULL; + + ndev = kzalloc(sizeof(struct inet6_dev), GFP_KERNEL); + + if (ndev == NULL) + return NULL; + + rwlock_init(&ndev->lock); + ndev->dev = dev; + INIT_LIST_HEAD(&ndev->addr_list); + + memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf)); + ndev->cnf.mtu6 = dev->mtu; + ndev->cnf.sysctl = NULL; + ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); + if (ndev->nd_parms == NULL) { + kfree(ndev); + return NULL; + } + if (ndev->cnf.forwarding) + dev_disable_lro(dev); + /* We refer to the device */ + dev_hold(dev); + + if (snmp6_alloc_dev(ndev) < 0) { + ADBG((KERN_WARNING + "%s(): cannot allocate memory for statistics; dev=%s.\n", + __func__, dev->name)); + neigh_parms_release(&nd_tbl, ndev->nd_parms); + ndev->dead = 1; + in6_dev_finish_destroy(ndev); + return NULL; + } + + if (snmp6_register_dev(ndev) < 0) { + ADBG((KERN_WARNING + "%s(): cannot create /proc/net/dev_snmp6/%s\n", + __func__, dev->name)); + neigh_parms_release(&nd_tbl, ndev->nd_parms); + ndev->dead = 1; + in6_dev_finish_destroy(ndev); + return NULL; + } + + /* One reference from device. We must do this before + * we invoke __ipv6_regen_rndid(). + */ + in6_dev_hold(ndev); + + if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) + ndev->cnf.accept_dad = -1; + +#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) + if (dev->type == ARPHRD_SIT && (dev->priv_flags & IFF_ISATAP)) { + printk(KERN_INFO + "%s: Disabled Multicast RS\n", + dev->name); + ndev->cnf.rtr_solicits = 0; + } +#endif + +#ifdef CONFIG_IPV6_PRIVACY + INIT_LIST_HEAD(&ndev->tempaddr_list); + setup_timer(&ndev->regen_timer, ipv6_regen_rndid, (unsigned long)ndev); + if ((dev->flags&IFF_LOOPBACK) || + dev->type == ARPHRD_TUNNEL || + dev->type == ARPHRD_TUNNEL6 || + dev->type == ARPHRD_SIT || + dev->type == ARPHRD_NONE) { + ndev->cnf.use_tempaddr = -1; + } else { + in6_dev_hold(ndev); + ipv6_regen_rndid((unsigned long) ndev); + } +#endif + + if (netif_running(dev) && addrconf_qdisc_ok(dev)) + ndev->if_flags |= IF_READY; + + ipv6_mc_init_dev(ndev); + ndev->tstamp = jiffies; + addrconf_sysctl_register(ndev); + /* protected by rtnl_lock */ + rcu_assign_pointer(dev->ip6_ptr, ndev); + + /* Join all-node multicast group */ + ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes); + + /* Join all-router multicast group if forwarding is set */ + if (ndev->cnf.forwarding && dev && (dev->flags & IFF_MULTICAST)) + ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters); + + return ndev; +} + +static struct inet6_dev * ipv6_find_idev(struct net_device *dev) +{ + struct inet6_dev *idev; + + ASSERT_RTNL(); + + idev = __in6_dev_get(dev); + if (!idev) { + idev = ipv6_add_dev(dev); + if (!idev) + return NULL; + } + + if (dev->flags&IFF_UP) + ipv6_mc_up(idev); + return idev; +} + +#ifdef CONFIG_SYSCTL +static void dev_forward_change(struct inet6_dev *idev) +{ + struct net_device *dev; + struct inet6_ifaddr *ifa; + + if (!idev) + return; + dev = idev->dev; + if (idev->cnf.forwarding) + dev_disable_lro(dev); + if (dev && (dev->flags & IFF_MULTICAST)) { + if (idev->cnf.forwarding) + ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters); + else + ipv6_dev_mc_dec(dev, &in6addr_linklocal_allrouters); + } + + 
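+	/* Re-sync anycast membership of each non-tentative address
+	 * with the new forwarding setting. */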
list_for_each_entry(ifa, &idev->addr_list, if_list) { + if (ifa->flags&IFA_F_TENTATIVE) + continue; + if (idev->cnf.forwarding) + addrconf_join_anycast(ifa); + else + addrconf_leave_anycast(ifa); + } +} + + +static void addrconf_forward_change(struct net *net, __s32 newf) +{ + struct net_device *dev; + struct inet6_dev *idev; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + idev = __in6_dev_get(dev); + if (idev) { + int changed = (!idev->cnf.forwarding) ^ (!newf); + idev->cnf.forwarding = newf; + if (changed) + dev_forward_change(idev); + } + } + rcu_read_unlock(); +} + +static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old) +{ + struct net *net; + + net = (struct net *)table->extra2; + if (p == &net->ipv6.devconf_dflt->forwarding) + return 0; + + if (!rtnl_trylock()) { + /* Restore the original values before restarting */ + *p = old; + return restart_syscall(); + } + + if (p == &net->ipv6.devconf_all->forwarding) { + __s32 newf = net->ipv6.devconf_all->forwarding; + net->ipv6.devconf_dflt->forwarding = newf; + addrconf_forward_change(net, newf); + } else if ((!*p) ^ (!old)) + dev_forward_change((struct inet6_dev *)table->extra1); + rtnl_unlock(); + + if (*p) + rt6_purge_dflt_routers(net); + return 1; +} +#endif + +/* Nobody refers to this ifaddr, destroy it */ +void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) +{ + WARN_ON(!hlist_unhashed(&ifp->addr_lst)); + +#ifdef NET_REFCNT_DEBUG + printk(KERN_DEBUG "inet6_ifa_finish_destroy\n"); +#endif + + in6_dev_put(ifp->idev); + + if (del_timer(&ifp->timer)) + pr_notice("Timer is still running, when freeing ifa=%p\n", ifp); + + if (ifp->state != INET6_IFADDR_STATE_DEAD) { + pr_warning("Freeing alive inet6 address %p\n", ifp); + return; + } + dst_release(&ifp->rt->dst); + + kfree_rcu(ifp, rcu); +} + +static void +ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) +{ + struct list_head *p; + int ifp_scope = ipv6_addr_src_scope(&ifp->addr); + + /* + * Each device address list is sorted in order of scope - + * global before linklocal. + */ + list_for_each(p, &idev->addr_list) { + struct inet6_ifaddr *ifa + = list_entry(p, struct inet6_ifaddr, if_list); + if (ifp_scope >= ipv6_addr_src_scope(&ifa->addr)) + break; + } + + list_add_tail(&ifp->if_list, p); +} + +static u32 ipv6_addr_hash(const struct in6_addr *addr) +{ + /* + * We perform the hash function over the last 64 bits of the address + * This will include the IEEE address token on links that support it. 
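+ * Only the low 64 bits feed jhash_2words(); the result is masked down
+ * to the IN6_ADDR_HSIZE hash buckets.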
+ */ + return jhash_2words((__force u32)addr->s6_addr32[2], + (__force u32)addr->s6_addr32[3], 0) + & (IN6_ADDR_HSIZE - 1); +} + +/* On success it returns ifp with increased reference count */ + +static struct inet6_ifaddr * +ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, + int scope, u32 flags) +{ + struct inet6_ifaddr *ifa = NULL; + struct rt6_info *rt; + unsigned int hash; + int err = 0; + int addr_type = ipv6_addr_type(addr); + + if (addr_type == IPV6_ADDR_ANY || + addr_type & IPV6_ADDR_MULTICAST || + (!(idev->dev->flags & IFF_LOOPBACK) && + addr_type & IPV6_ADDR_LOOPBACK)) + return ERR_PTR(-EADDRNOTAVAIL); + + rcu_read_lock_bh(); + if (idev->dead) { + err = -ENODEV; /*XXX*/ + goto out2; + } + + if (idev->cnf.disable_ipv6) { + err = -EACCES; + goto out2; + } + + spin_lock(&addrconf_hash_lock); + + /* Ignore adding duplicate addresses on an interface */ + if (ipv6_chk_same_addr(dev_net(idev->dev), addr, idev->dev)) { + ADBG(("ipv6_add_addr: already assigned\n")); + err = -EEXIST; + goto out; + } + + ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC); + + if (ifa == NULL) { + ADBG(("ipv6_add_addr: malloc failed\n")); + err = -ENOBUFS; + goto out; + } + + rt = addrconf_dst_alloc(idev, addr, 0); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto out; + } + + ipv6_addr_copy(&ifa->addr, addr); + + spin_lock_init(&ifa->lock); + spin_lock_init(&ifa->state_lock); + init_timer(&ifa->timer); + INIT_HLIST_NODE(&ifa->addr_lst); + ifa->timer.data = (unsigned long) ifa; + ifa->scope = scope; + ifa->prefix_len = pfxlen; + ifa->flags = flags | IFA_F_TENTATIVE; + ifa->cstamp = ifa->tstamp = jiffies; + + ifa->rt = rt; + + /* + * part one of RFC 4429, section 3.3 + * We should not configure an address as + * optimistic if we do not yet know the link + * layer address of our nexhop router + */ + + if (dst_get_neighbour_raw(&rt->dst) == NULL) + ifa->flags &= ~IFA_F_OPTIMISTIC; + + ifa->idev = idev; + in6_dev_hold(idev); + /* For caller */ + in6_ifa_hold(ifa); + + /* Add to big hash table */ + hash = ipv6_addr_hash(addr); + + hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]); + spin_unlock(&addrconf_hash_lock); + + write_lock(&idev->lock); + /* Add to inet6_dev unicast addr list. 
*/ + ipv6_link_dev_addr(idev, ifa); + +#ifdef CONFIG_IPV6_PRIVACY + if (ifa->flags&IFA_F_TEMPORARY) { + list_add(&ifa->tmp_list, &idev->tempaddr_list); + in6_ifa_hold(ifa); + } +#endif + + in6_ifa_hold(ifa); + write_unlock(&idev->lock); +out2: + rcu_read_unlock_bh(); + + if (likely(err == 0)) + atomic_notifier_call_chain(&inet6addr_chain, NETDEV_UP, ifa); + else { + kfree(ifa); + ifa = ERR_PTR(err); + } + + return ifa; +out: + spin_unlock(&addrconf_hash_lock); + goto out2; +} + +/* This function wants to get referenced ifp and releases it before return */ + +static void ipv6_del_addr(struct inet6_ifaddr *ifp) +{ + struct inet6_ifaddr *ifa, *ifn; + struct inet6_dev *idev = ifp->idev; + int state; + int deleted = 0, onlink = 0; + unsigned long expires = jiffies; + + spin_lock_bh(&ifp->state_lock); + state = ifp->state; + ifp->state = INET6_IFADDR_STATE_DEAD; + spin_unlock_bh(&ifp->state_lock); + + if (state == INET6_IFADDR_STATE_DEAD) + goto out; + + spin_lock_bh(&addrconf_hash_lock); + hlist_del_init_rcu(&ifp->addr_lst); + spin_unlock_bh(&addrconf_hash_lock); + + write_lock_bh(&idev->lock); +#ifdef CONFIG_IPV6_PRIVACY + if (ifp->flags&IFA_F_TEMPORARY) { + list_del(&ifp->tmp_list); + if (ifp->ifpub) { + in6_ifa_put(ifp->ifpub); + ifp->ifpub = NULL; + } + __in6_ifa_put(ifp); + } +#endif + + list_for_each_entry_safe(ifa, ifn, &idev->addr_list, if_list) { + if (ifa == ifp) { + list_del_init(&ifp->if_list); + __in6_ifa_put(ifp); + + if (!(ifp->flags & IFA_F_PERMANENT) || onlink > 0) + break; + deleted = 1; + continue; + } else if (ifp->flags & IFA_F_PERMANENT) { + if (ipv6_prefix_equal(&ifa->addr, &ifp->addr, + ifp->prefix_len)) { + if (ifa->flags & IFA_F_PERMANENT) { + onlink = 1; + if (deleted) + break; + } else { + unsigned long lifetime; + + if (!onlink) + onlink = -1; + + spin_lock(&ifa->lock); + + lifetime = addrconf_timeout_fixup(ifa->valid_lft, HZ); + /* + * Note: Because this address is + * not permanent, lifetime < + * LONG_MAX / HZ here. + */ + if (time_before(expires, + ifa->tstamp + lifetime * HZ)) + expires = ifa->tstamp + lifetime * HZ; + spin_unlock(&ifa->lock); + } + } + } + } + write_unlock_bh(&idev->lock); + + addrconf_del_timer(ifp); + + ipv6_ifa_notify(RTM_DELADDR, ifp); + + atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifp); + + /* + * Purge or update corresponding prefix + * + * 1) we don't purge prefix here if address was not permanent. + * prefix is managed by its own lifetime. + * 2) if there're no addresses, delete prefix. + * 3) if there're still other permanent address(es), + * corresponding prefix is still permanent. + * 4) otherwise, update prefix lifetime to the + * longest valid lifetime among the corresponding + * addresses on the device. + * Note: subsequent RA will update lifetime. 
+ * + * --yoshfuji + */ + if ((ifp->flags & IFA_F_PERMANENT) && onlink < 1) { + struct in6_addr prefix; + struct rt6_info *rt; + struct net *net = dev_net(ifp->idev->dev); + ipv6_addr_prefix(&prefix, &ifp->addr, ifp->prefix_len); + rt = rt6_lookup(net, &prefix, NULL, ifp->idev->dev->ifindex, 1); + + if (rt && addrconf_is_prefix_route(rt)) { + if (onlink == 0) { + ip6_del_rt(rt); + rt = NULL; + } else if (!(rt->rt6i_flags & RTF_EXPIRES)) { + rt->rt6i_expires = expires; + rt->rt6i_flags |= RTF_EXPIRES; + } + } + dst_release(&rt->dst); + } + + /* clean up prefsrc entries */ + rt6_remove_prefsrc(ifp); +out: + in6_ifa_put(ifp); +} + +#ifdef CONFIG_IPV6_PRIVACY +static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *ift) +{ + struct inet6_dev *idev = ifp->idev; + struct in6_addr addr, *tmpaddr; + unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_tstamp, age; + unsigned long regen_advance; + int tmp_plen; + int ret = 0; + int max_addresses; + u32 addr_flags; + unsigned long now = jiffies; + + write_lock(&idev->lock); + if (ift) { + spin_lock_bh(&ift->lock); + memcpy(&addr.s6_addr[8], &ift->addr.s6_addr[8], 8); + spin_unlock_bh(&ift->lock); + tmpaddr = &addr; + } else { + tmpaddr = NULL; + } +retry: + in6_dev_hold(idev); + if (idev->cnf.use_tempaddr <= 0) { + write_unlock(&idev->lock); + printk(KERN_INFO + "ipv6_create_tempaddr(): use_tempaddr is disabled.\n"); + in6_dev_put(idev); + ret = -1; + goto out; + } + spin_lock_bh(&ifp->lock); + if (ifp->regen_count++ >= idev->cnf.regen_max_retry) { + idev->cnf.use_tempaddr = -1; /*XXX*/ + spin_unlock_bh(&ifp->lock); + write_unlock(&idev->lock); + printk(KERN_WARNING + "ipv6_create_tempaddr(): regeneration time exceeded. disabled temporary address support.\n"); + in6_dev_put(idev); + ret = -1; + goto out; + } + in6_ifa_hold(ifp); + memcpy(addr.s6_addr, ifp->addr.s6_addr, 8); + if (__ipv6_try_regen_rndid(idev, tmpaddr) < 0) { + spin_unlock_bh(&ifp->lock); + write_unlock(&idev->lock); + printk(KERN_WARNING + "ipv6_create_tempaddr(): regeneration of randomized interface id failed.\n"); + in6_ifa_put(ifp); + in6_dev_put(idev); + ret = -1; + goto out; + } + memcpy(&addr.s6_addr[8], idev->rndid, 8); + age = (now - ifp->tstamp) / HZ; + tmp_valid_lft = min_t(__u32, + ifp->valid_lft, + idev->cnf.temp_valid_lft + age); + tmp_prefered_lft = min_t(__u32, + ifp->prefered_lft, + idev->cnf.temp_prefered_lft + age - + idev->cnf.max_desync_factor); + tmp_plen = ifp->prefix_len; + max_addresses = idev->cnf.max_addresses; + tmp_tstamp = ifp->tstamp; + spin_unlock_bh(&ifp->lock); + + regen_advance = idev->cnf.regen_max_retry * + idev->cnf.dad_transmits * + idev->nd_parms->retrans_time / HZ; + write_unlock(&idev->lock); + + /* A temporary address is created only if this calculated Preferred + * Lifetime is greater than REGEN_ADVANCE time units. In particular, + * an implementation must not create a temporary address with a zero + * Preferred Lifetime. + */ + if (tmp_prefered_lft <= regen_advance) { + in6_ifa_put(ifp); + in6_dev_put(idev); + ret = -1; + goto out; + } + + addr_flags = IFA_F_TEMPORARY; + /* set in addrconf_prefix_rcv() */ + if (ifp->flags & IFA_F_OPTIMISTIC) + addr_flags |= IFA_F_OPTIMISTIC; + + ift = !max_addresses || + ipv6_count_addresses(idev) < max_addresses ? 
+ ipv6_add_addr(idev, &addr, tmp_plen, + ipv6_addr_type(&addr)&IPV6_ADDR_SCOPE_MASK, + addr_flags) : NULL; + if (!ift || IS_ERR(ift)) { + in6_ifa_put(ifp); + in6_dev_put(idev); + printk(KERN_INFO + "ipv6_create_tempaddr(): retry temporary address regeneration.\n"); + tmpaddr = &addr; + write_lock(&idev->lock); + goto retry; + } + + spin_lock_bh(&ift->lock); + ift->ifpub = ifp; + ift->valid_lft = tmp_valid_lft; + ift->prefered_lft = tmp_prefered_lft; + ift->cstamp = now; + ift->tstamp = tmp_tstamp; + spin_unlock_bh(&ift->lock); + + addrconf_dad_start(ift, 0); + in6_ifa_put(ift); + in6_dev_put(idev); +out: + return ret; +} +#endif + +/* + * Choose an appropriate source address (RFC3484) + */ +enum { + IPV6_SADDR_RULE_INIT = 0, + IPV6_SADDR_RULE_LOCAL, + IPV6_SADDR_RULE_SCOPE, + IPV6_SADDR_RULE_PREFERRED, +#ifdef CONFIG_IPV6_MIP6 + IPV6_SADDR_RULE_HOA, +#endif + IPV6_SADDR_RULE_OIF, + IPV6_SADDR_RULE_LABEL, +#ifdef CONFIG_IPV6_PRIVACY + IPV6_SADDR_RULE_PRIVACY, +#endif + IPV6_SADDR_RULE_ORCHID, + IPV6_SADDR_RULE_PREFIX, + IPV6_SADDR_RULE_MAX +}; + +struct ipv6_saddr_score { + int rule; + int addr_type; + struct inet6_ifaddr *ifa; + DECLARE_BITMAP(scorebits, IPV6_SADDR_RULE_MAX); + int scopedist; + int matchlen; +}; + +struct ipv6_saddr_dst { + const struct in6_addr *addr; + int ifindex; + int scope; + int label; + unsigned int prefs; +}; + +static inline int ipv6_saddr_preferred(int type) +{ + if (type & (IPV6_ADDR_MAPPED|IPV6_ADDR_COMPATv4|IPV6_ADDR_LOOPBACK)) + return 1; + return 0; +} + +static int ipv6_get_saddr_eval(struct net *net, + struct ipv6_saddr_score *score, + struct ipv6_saddr_dst *dst, + int i) +{ + int ret; + + if (i <= score->rule) { + switch (i) { + case IPV6_SADDR_RULE_SCOPE: + ret = score->scopedist; + break; + case IPV6_SADDR_RULE_PREFIX: + ret = score->matchlen; + break; + default: + ret = !!test_bit(i, score->scorebits); + } + goto out; + } + + switch (i) { + case IPV6_SADDR_RULE_INIT: + /* Rule 0: remember if hiscore is not ready yet */ + ret = !!score->ifa; + break; + case IPV6_SADDR_RULE_LOCAL: + /* Rule 1: Prefer same address */ + ret = ipv6_addr_equal(&score->ifa->addr, dst->addr); + break; + case IPV6_SADDR_RULE_SCOPE: + /* Rule 2: Prefer appropriate scope + * + * ret + * ^ + * -1 | d 15 + * ---+--+-+---> scope + * | + * | d is scope of the destination. + * B-d | \ + * | \ <- smaller scope is better if + * B-15 | \ if scope is enough for destinaion. + * | ret = B - scope (-1 <= scope >= d <= 15). + * d-C-1 | / + * |/ <- greater is better + * -C / if scope is not enough for destination. + * /| ret = scope - C (-1 <= d < scope <= 15). + * + * d - C - 1 < B -15 (for all -1 <= d <= 15). + * C > d + 14 - B >= 15 + 14 - B = 29 - B. + * Assume B = 0 and we get C > 29. 
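+ * Concretely: for a global destination (d = 14) a global candidate
+ * scores -14 while a link-local candidate scores 2 - 128 = -126, so
+ * the higher value selects the global source.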
+ */ + ret = __ipv6_addr_src_scope(score->addr_type); + if (ret >= dst->scope) + ret = -ret; + else + ret -= 128; /* 30 is enough */ + score->scopedist = ret; + break; + case IPV6_SADDR_RULE_PREFERRED: + /* Rule 3: Avoid deprecated and optimistic addresses */ + ret = ipv6_saddr_preferred(score->addr_type) || + !(score->ifa->flags & (IFA_F_DEPRECATED|IFA_F_OPTIMISTIC)); + break; +#ifdef CONFIG_IPV6_MIP6 + case IPV6_SADDR_RULE_HOA: + { + /* Rule 4: Prefer home address */ + int prefhome = !(dst->prefs & IPV6_PREFER_SRC_COA); + ret = !(score->ifa->flags & IFA_F_HOMEADDRESS) ^ prefhome; + break; + } +#endif + case IPV6_SADDR_RULE_OIF: + /* Rule 5: Prefer outgoing interface */ + ret = (!dst->ifindex || + dst->ifindex == score->ifa->idev->dev->ifindex); + break; + case IPV6_SADDR_RULE_LABEL: + /* Rule 6: Prefer matching label */ + ret = ipv6_addr_label(net, + &score->ifa->addr, score->addr_type, + score->ifa->idev->dev->ifindex) == dst->label; + break; +#ifdef CONFIG_IPV6_PRIVACY + case IPV6_SADDR_RULE_PRIVACY: + { + /* Rule 7: Prefer public address + * Note: prefer temporary address if use_tempaddr >= 2 + */ + int preftmp = dst->prefs & (IPV6_PREFER_SRC_PUBLIC|IPV6_PREFER_SRC_TMP) ? + !!(dst->prefs & IPV6_PREFER_SRC_TMP) : + score->ifa->idev->cnf.use_tempaddr >= 2; + ret = (!(score->ifa->flags & IFA_F_TEMPORARY)) ^ preftmp; + break; + } +#endif + case IPV6_SADDR_RULE_ORCHID: + /* Rule 8-: Prefer ORCHID vs ORCHID or + * non-ORCHID vs non-ORCHID + */ + ret = !(ipv6_addr_orchid(&score->ifa->addr) ^ + ipv6_addr_orchid(dst->addr)); + break; + case IPV6_SADDR_RULE_PREFIX: + /* Rule 8: Use longest matching prefix */ + score->matchlen = ret = ipv6_addr_diff(&score->ifa->addr, + dst->addr); + break; + default: + ret = 0; + } + + if (ret) + __set_bit(i, score->scorebits); + score->rule = i; +out: + return ret; +} + +int ipv6_dev_get_saddr(struct net *net, struct net_device *dst_dev, + const struct in6_addr *daddr, unsigned int prefs, + struct in6_addr *saddr) +{ + struct ipv6_saddr_score scores[2], + *score = &scores[0], *hiscore = &scores[1]; + struct ipv6_saddr_dst dst; + struct net_device *dev; + int dst_type; + + dst_type = __ipv6_addr_type(daddr); + dst.addr = daddr; + dst.ifindex = dst_dev ? dst_dev->ifindex : 0; + dst.scope = __ipv6_addr_src_scope(dst_type); + dst.label = ipv6_addr_label(net, daddr, dst_type, dst.ifindex); + dst.prefs = prefs; + + hiscore->rule = -1; + hiscore->ifa = NULL; + + rcu_read_lock(); + + for_each_netdev_rcu(net, dev) { + struct inet6_dev *idev; + + /* Candidate Source Address (section 4) + * - multicast and link-local destination address, + * the set of candidate source address MUST only + * include addresses assigned to interfaces + * belonging to the same link as the outgoing + * interface. + * (- For site-local destination addresses, the + * set of candidate source addresses MUST only + * include addresses assigned to interfaces + * belonging to the same site as the outgoing + * interface.) + */ + if (((dst_type & IPV6_ADDR_MULTICAST) || + dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL) && + dst.ifindex && dev->ifindex != dst.ifindex) + continue; + + idev = __in6_dev_get(dev); + if (!idev) + continue; + + read_lock_bh(&idev->lock); + list_for_each_entry(score->ifa, &idev->addr_list, if_list) { + int i; + + /* + * - Tentative Address (RFC2462 section 5.4) + * - A tentative address is not considered + * "assigned to an interface" in the traditional + * sense, unless it is also flagged as optimistic. 
+ * - Candidate Source Address (section 4) + * - In any case, anycast addresses, multicast + * addresses, and the unspecified address MUST + * NOT be included in a candidate set. + */ + if ((score->ifa->flags & IFA_F_TENTATIVE) && + (!(score->ifa->flags & IFA_F_OPTIMISTIC))) + continue; + + score->addr_type = __ipv6_addr_type(&score->ifa->addr); + + if (unlikely(score->addr_type == IPV6_ADDR_ANY || + score->addr_type & IPV6_ADDR_MULTICAST)) { + LIMIT_NETDEBUG(KERN_DEBUG + "ADDRCONF: unspecified / multicast address " + "assigned as unicast address on %s", + dev->name); + continue; + } + + score->rule = -1; + bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX); + + for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) { + int minihiscore, miniscore; + + minihiscore = ipv6_get_saddr_eval(net, hiscore, &dst, i); + miniscore = ipv6_get_saddr_eval(net, score, &dst, i); + + if (minihiscore > miniscore) { + if (i == IPV6_SADDR_RULE_SCOPE && + score->scopedist > 0) { + /* + * special case: + * each remaining entry + * has too small (not enough) + * scope, because ifa entries + * are sorted by their scope + * values. + */ + goto try_nextdev; + } + break; + } else if (minihiscore < miniscore) { + if (hiscore->ifa) + in6_ifa_put(hiscore->ifa); + + in6_ifa_hold(score->ifa); + + swap(hiscore, score); + + /* restore our iterator */ + score->ifa = hiscore->ifa; + + break; + } + } + } +try_nextdev: + read_unlock_bh(&idev->lock); + } + rcu_read_unlock(); + + if (!hiscore->ifa) + return -EADDRNOTAVAIL; + + ipv6_addr_copy(saddr, &hiscore->ifa->addr); + in6_ifa_put(hiscore->ifa); + return 0; +} +EXPORT_SYMBOL(ipv6_dev_get_saddr); + +int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, + unsigned char banned_flags) +{ + struct inet6_dev *idev; + int err = -EADDRNOTAVAIL; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev) { + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifp, &idev->addr_list, if_list) { + if (ifp->scope == IFA_LINK && + !(ifp->flags & banned_flags)) { + ipv6_addr_copy(addr, &ifp->addr); + err = 0; + break; + } + } + read_unlock_bh(&idev->lock); + } + rcu_read_unlock(); + return err; +} + +static int ipv6_count_addresses(struct inet6_dev *idev) +{ + int cnt = 0; + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifp, &idev->addr_list, if_list) + cnt++; + read_unlock_bh(&idev->lock); + return cnt; +} + +int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, + struct net_device *dev, int strict) +{ + struct inet6_ifaddr *ifp; + struct hlist_node *node; + unsigned int hash = ipv6_addr_hash(addr); + + rcu_read_lock_bh(); + hlist_for_each_entry_rcu(ifp, node, &inet6_addr_lst[hash], addr_lst) { + if (!net_eq(dev_net(ifp->idev->dev), net)) + continue; + if (ipv6_addr_equal(&ifp->addr, addr) && + !(ifp->flags&IFA_F_TENTATIVE) && + (dev == NULL || ifp->idev->dev == dev || + !(ifp->scope&(IFA_LINK|IFA_HOST) || strict))) { + rcu_read_unlock_bh(); + return 1; + } + } + + rcu_read_unlock_bh(); + return 0; +} +EXPORT_SYMBOL(ipv6_chk_addr); + +static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, + struct net_device *dev) +{ + unsigned int hash = ipv6_addr_hash(addr); + struct inet6_ifaddr *ifp; + struct hlist_node *node; + + hlist_for_each_entry(ifp, node, &inet6_addr_lst[hash], addr_lst) { + if (!net_eq(dev_net(ifp->idev->dev), net)) + continue; + if (ipv6_addr_equal(&ifp->addr, addr)) { + if (dev == NULL || ifp->idev->dev == dev) + return true; + } + } + return false; +} + +int ipv6_chk_prefix(const 
struct in6_addr *addr, struct net_device *dev) +{ + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + int onlink; + + onlink = 0; + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + list_for_each_entry(ifa, &idev->addr_list, if_list) { + onlink = ipv6_prefix_equal(addr, &ifa->addr, + ifa->prefix_len); + if (onlink) + break; + } + read_unlock_bh(&idev->lock); + } + rcu_read_unlock(); + return onlink; +} + +EXPORT_SYMBOL(ipv6_chk_prefix); + +struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr, + struct net_device *dev, int strict) +{ + struct inet6_ifaddr *ifp, *result = NULL; + unsigned int hash = ipv6_addr_hash(addr); + struct hlist_node *node; + + rcu_read_lock_bh(); + hlist_for_each_entry_rcu_bh(ifp, node, &inet6_addr_lst[hash], addr_lst) { + if (!net_eq(dev_net(ifp->idev->dev), net)) + continue; + if (ipv6_addr_equal(&ifp->addr, addr)) { + if (dev == NULL || ifp->idev->dev == dev || + !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) { + result = ifp; + in6_ifa_hold(ifp); + break; + } + } + } + rcu_read_unlock_bh(); + + return result; +} + +/* Gets referenced address, destroys ifaddr */ + +static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed) +{ + if (ifp->flags&IFA_F_PERMANENT) { + spin_lock_bh(&ifp->lock); + addrconf_del_timer(ifp); + ifp->flags |= IFA_F_TENTATIVE; + if (dad_failed) + ifp->flags |= IFA_F_DADFAILED; + spin_unlock_bh(&ifp->lock); + if (dad_failed) + ipv6_ifa_notify(0, ifp); + in6_ifa_put(ifp); +#ifdef CONFIG_IPV6_PRIVACY + } else if (ifp->flags&IFA_F_TEMPORARY) { + struct inet6_ifaddr *ifpub; + spin_lock_bh(&ifp->lock); + ifpub = ifp->ifpub; + if (ifpub) { + in6_ifa_hold(ifpub); + spin_unlock_bh(&ifp->lock); + ipv6_create_tempaddr(ifpub, ifp); + in6_ifa_put(ifpub); + } else { + spin_unlock_bh(&ifp->lock); + } + ipv6_del_addr(ifp); +#endif + } else + ipv6_del_addr(ifp); +} + +static int addrconf_dad_end(struct inet6_ifaddr *ifp) +{ + int err = -ENOENT; + + spin_lock(&ifp->state_lock); + if (ifp->state == INET6_IFADDR_STATE_DAD) { + ifp->state = INET6_IFADDR_STATE_POSTDAD; + err = 0; + } + spin_unlock(&ifp->state_lock); + + return err; +} + +void addrconf_dad_failure(struct inet6_ifaddr *ifp) +{ + struct inet6_dev *idev = ifp->idev; + + if (addrconf_dad_end(ifp)) { + in6_ifa_put(ifp); + return; + } + + if (net_ratelimit()) + printk(KERN_INFO "%s: IPv6 duplicate address %pI6c detected!\n", + ifp->idev->dev->name, &ifp->addr); + + if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6) { + struct in6_addr addr; + + addr.s6_addr32[0] = htonl(0xfe800000); + addr.s6_addr32[1] = 0; + + if (!ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) && + ipv6_addr_equal(&ifp->addr, &addr)) { + /* DAD failed for link-local based on MAC address */ + idev->cnf.disable_ipv6 = 1; + + printk(KERN_INFO "%s: IPv6 being disabled!\n", + ifp->idev->dev->name); + } + } + + addrconf_dad_stop(ifp, 1); +} + +/* Join to solicited addr multicast group. 
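+ * The group is ff02::1:ffXX:XXXX, formed from the low 24 bits of the
+ * address.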
*/ + +void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr) +{ + struct in6_addr maddr; + + if (dev->flags&(IFF_LOOPBACK|IFF_NOARP)) + return; + + addrconf_addr_solict_mult(addr, &maddr); + ipv6_dev_mc_inc(dev, &maddr); +} + +void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr) +{ + struct in6_addr maddr; + + if (idev->dev->flags&(IFF_LOOPBACK|IFF_NOARP)) + return; + + addrconf_addr_solict_mult(addr, &maddr); + __ipv6_dev_mc_dec(idev, &maddr); +} + +static void addrconf_join_anycast(struct inet6_ifaddr *ifp) +{ + struct in6_addr addr; + ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); + if (ipv6_addr_any(&addr)) + return; + ipv6_dev_ac_inc(ifp->idev->dev, &addr); +} + +static void addrconf_leave_anycast(struct inet6_ifaddr *ifp) +{ + struct in6_addr addr; + ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); + if (ipv6_addr_any(&addr)) + return; + __ipv6_dev_ac_dec(ifp->idev, &addr); +} + +static int addrconf_ifid_eui48(u8 *eui, struct net_device *dev) +{ + if (dev->addr_len != ETH_ALEN) + return -1; + memcpy(eui, dev->dev_addr, 3); + memcpy(eui + 5, dev->dev_addr + 3, 3); + + /* + * The zSeries OSA network cards can be shared among various + * OS instances, but the OSA cards have only one MAC address. + * This leads to duplicate address conflicts in conjunction + * with IPv6 if more than one instance uses the same card. + * + * The driver for these cards can deliver a unique 16-bit + * identifier for each instance sharing the same card. It is + * placed instead of 0xFFFE in the interface identifier. The + * "u" bit of the interface identifier is not inverted in this + * case. Hence the resulting interface identifier has local + * scope according to RFC2373. + */ + if (dev->dev_id) { + eui[3] = (dev->dev_id >> 8) & 0xFF; + eui[4] = dev->dev_id & 0xFF; + } else { + eui[3] = 0xFF; + eui[4] = 0xFE; + eui[0] ^= 2; + } + return 0; +} + +static int addrconf_ifid_arcnet(u8 *eui, struct net_device *dev) +{ + /* XXX: inherit EUI-64 from other interface -- yoshfuji */ + if (dev->addr_len != ARCNET_ALEN) + return -1; + memset(eui, 0, 7); + eui[7] = *(u8*)dev->dev_addr; + return 0; +} + +static int addrconf_ifid_infiniband(u8 *eui, struct net_device *dev) +{ + if (dev->addr_len != INFINIBAND_ALEN) + return -1; + memcpy(eui, dev->dev_addr + 12, 8); + eui[0] |= 2; + return 0; +} + +static int __ipv6_isatap_ifid(u8 *eui, __be32 addr) +{ + if (addr == 0) + return -1; + eui[0] = (ipv4_is_zeronet(addr) || ipv4_is_private_10(addr) || + ipv4_is_loopback(addr) || ipv4_is_linklocal_169(addr) || + ipv4_is_private_172(addr) || ipv4_is_test_192(addr) || + ipv4_is_anycast_6to4(addr) || ipv4_is_private_192(addr) || + ipv4_is_test_198(addr) || ipv4_is_multicast(addr) || + ipv4_is_lbcast(addr)) ? 
0x00 : 0x02; + eui[1] = 0; + eui[2] = 0x5E; + eui[3] = 0xFE; + memcpy(eui + 4, &addr, 4); + return 0; +} + +static int addrconf_ifid_sit(u8 *eui, struct net_device *dev) +{ + if (dev->priv_flags & IFF_ISATAP) + return __ipv6_isatap_ifid(eui, *(__be32 *)dev->dev_addr); + return -1; +} + +static int ipv6_generate_eui64(u8 *eui, struct net_device *dev) +{ + switch (dev->type) { + case ARPHRD_ETHER: + case ARPHRD_FDDI: + case ARPHRD_IEEE802_TR: + return addrconf_ifid_eui48(eui, dev); + case ARPHRD_ARCNET: + return addrconf_ifid_arcnet(eui, dev); + case ARPHRD_INFINIBAND: + return addrconf_ifid_infiniband(eui, dev); + case ARPHRD_SIT: + return addrconf_ifid_sit(eui, dev); + } + return -1; +} + +static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev) +{ + int err = -1; + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifp, &idev->addr_list, if_list) { + if (ifp->scope == IFA_LINK && !(ifp->flags&IFA_F_TENTATIVE)) { + memcpy(eui, ifp->addr.s6_addr+8, 8); + err = 0; + break; + } + } + read_unlock_bh(&idev->lock); + return err; +} + +#ifdef CONFIG_IPV6_PRIVACY +/* (re)generation of randomized interface identifier (RFC 3041 3.2, 3.5) */ +static int __ipv6_regen_rndid(struct inet6_dev *idev) +{ +regen: + get_random_bytes(idev->rndid, sizeof(idev->rndid)); + idev->rndid[0] &= ~0x02; + + /* + * : + * check if generated address is not inappropriate + * + * - Reserved subnet anycast (RFC 2526) + * 11111101 11....11 1xxxxxxx + * - ISATAP (RFC4214) 6.1 + * 00-00-5E-FE-xx-xx-xx-xx + * - value 0 + * - XXX: already assigned to an address on the device + */ + if (idev->rndid[0] == 0xfd && + (idev->rndid[1]&idev->rndid[2]&idev->rndid[3]&idev->rndid[4]&idev->rndid[5]&idev->rndid[6]) == 0xff && + (idev->rndid[7]&0x80)) + goto regen; + if ((idev->rndid[0]|idev->rndid[1]) == 0) { + if (idev->rndid[2] == 0x5e && idev->rndid[3] == 0xfe) + goto regen; + if ((idev->rndid[2]|idev->rndid[3]|idev->rndid[4]|idev->rndid[5]|idev->rndid[6]|idev->rndid[7]) == 0x00) + goto regen; + } + + return 0; +} + +static void ipv6_regen_rndid(unsigned long data) +{ + struct inet6_dev *idev = (struct inet6_dev *) data; + unsigned long expires; + + rcu_read_lock_bh(); + write_lock_bh(&idev->lock); + + if (idev->dead) + goto out; + + if (__ipv6_regen_rndid(idev) < 0) + goto out; + + expires = jiffies + + idev->cnf.temp_prefered_lft * HZ - + idev->cnf.regen_max_retry * idev->cnf.dad_transmits * idev->nd_parms->retrans_time - + idev->cnf.max_desync_factor * HZ; + if (time_before(expires, jiffies)) { + printk(KERN_WARNING + "ipv6_regen_rndid(): too short regeneration interval; timer disabled for %s.\n", + idev->dev->name); + goto out; + } + + if (!mod_timer(&idev->regen_timer, expires)) + in6_dev_hold(idev); + +out: + write_unlock_bh(&idev->lock); + rcu_read_unlock_bh(); + in6_dev_put(idev); +} + +static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr) { + int ret = 0; + + if (tmpaddr && memcmp(idev->rndid, &tmpaddr->s6_addr[8], 8) == 0) + ret = __ipv6_regen_rndid(idev); + return ret; +} +#endif + +/* + * Add prefix route. 
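+ * The route is installed in RT6_TABLE_PREFIX with protocol RTPROT_KERNEL
+ * and metric IP6_RT_PRIO_ADDRCONF; on point-to-point SIT links it is
+ * additionally marked RTF_NONEXTHOP.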
+ */ + +static void +addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, + unsigned long expires, u32 flags) +{ + struct fib6_config cfg = { + .fc_table = RT6_TABLE_PREFIX, + .fc_metric = IP6_RT_PRIO_ADDRCONF, + .fc_ifindex = dev->ifindex, + .fc_expires = expires, + .fc_dst_len = plen, + .fc_flags = RTF_UP | flags, + .fc_nlinfo.nl_net = dev_net(dev), + .fc_protocol = RTPROT_KERNEL, + }; + + ipv6_addr_copy(&cfg.fc_dst, pfx); + + /* Prevent useless cloning on PtP SIT. + This thing is done here expecting that the whole + class of non-broadcast devices need not cloning. + */ +#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) + if (dev->type == ARPHRD_SIT && (dev->flags & IFF_POINTOPOINT)) + cfg.fc_flags |= RTF_NONEXTHOP; +#endif + + ip6_route_add(&cfg); +} + +/* Create "default" multicast route to the interface */ + +static void addrconf_add_mroute(struct net_device *dev) +{ + struct fib6_config cfg = { + .fc_table = RT6_TABLE_LOCAL, + .fc_metric = IP6_RT_PRIO_ADDRCONF, + .fc_ifindex = dev->ifindex, + .fc_dst_len = 8, + .fc_flags = RTF_UP, + .fc_nlinfo.nl_net = dev_net(dev), + }; + + ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0); + + ip6_route_add(&cfg); +} + +#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) +static void sit_route_add(struct net_device *dev) +{ + struct fib6_config cfg = { + .fc_table = RT6_TABLE_MAIN, + .fc_metric = IP6_RT_PRIO_ADDRCONF, + .fc_ifindex = dev->ifindex, + .fc_dst_len = 96, + .fc_flags = RTF_UP | RTF_NONEXTHOP, + .fc_nlinfo.nl_net = dev_net(dev), + }; + + /* prefix length - 96 bits "::d.d.d.d" */ + ip6_route_add(&cfg); +} +#endif + +static void addrconf_add_lroute(struct net_device *dev) +{ + struct in6_addr addr; + + ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); + addrconf_prefix_route(&addr, 64, dev, 0, 0); +} + +static struct inet6_dev *addrconf_add_dev(struct net_device *dev) +{ + struct inet6_dev *idev; + + ASSERT_RTNL(); + + idev = ipv6_find_idev(dev); + if (!idev) + return ERR_PTR(-ENOBUFS); + + if (idev->cnf.disable_ipv6) + return ERR_PTR(-EACCES); + + /* Add default multicast route */ + addrconf_add_mroute(dev); + + /* Add link local route */ + addrconf_add_lroute(dev); + return idev; +} + +void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) +{ + struct prefix_info *pinfo; + __u32 valid_lft; + __u32 prefered_lft; + int addr_type; + struct inet6_dev *in6_dev; + struct net *net = dev_net(dev); + + pinfo = (struct prefix_info *) opt; + + if (len < sizeof(struct prefix_info)) { + ADBG(("addrconf: prefix option too short\n")); + return; + } + + /* + * Validation checks ([ADDRCONF], page 19) + */ + + addr_type = ipv6_addr_type(&pinfo->prefix); + + if (addr_type & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)) + return; + + valid_lft = ntohl(pinfo->valid); + prefered_lft = ntohl(pinfo->prefered); + + if (prefered_lft > valid_lft) { + if (net_ratelimit()) + printk(KERN_WARNING "addrconf: prefix option has invalid lifetime\n"); + return; + } + + in6_dev = in6_dev_get(dev); + + if (in6_dev == NULL) { + if (net_ratelimit()) + printk(KERN_DEBUG "addrconf: device %s not configured\n", dev->name); + return; + } + + /* + * Two things going on here: + * 1) Add routes for on-link prefixes + * 2) Configure prefixes with the auto flag set + */ + + if (pinfo->onlink) { + struct rt6_info *rt; + unsigned long rt_expires; + + /* Avoid arithmetic overflow. Really, we could + * save rt_expires in seconds, likely valid_lft, + * but it would require division in fib gc, that it + * not good. 
+ */ + if (HZ > USER_HZ) + rt_expires = addrconf_timeout_fixup(valid_lft, HZ); + else + rt_expires = addrconf_timeout_fixup(valid_lft, USER_HZ); + + if (addrconf_finite_timeout(rt_expires)) + rt_expires *= HZ; + + rt = rt6_lookup(net, &pinfo->prefix, NULL, + dev->ifindex, 1); + + if (rt && addrconf_is_prefix_route(rt)) { + /* Autoconf prefix route */ + if (valid_lft == 0) { + ip6_del_rt(rt); + rt = NULL; + } else if (addrconf_finite_timeout(rt_expires)) { + /* not infinity */ + rt->rt6i_expires = jiffies + rt_expires; + rt->rt6i_flags |= RTF_EXPIRES; + } else { + rt->rt6i_flags &= ~RTF_EXPIRES; + rt->rt6i_expires = 0; + } + } else if (valid_lft) { + clock_t expires = 0; + int flags = RTF_ADDRCONF | RTF_PREFIX_RT; + if (addrconf_finite_timeout(rt_expires)) { + /* not infinity */ + flags |= RTF_EXPIRES; + expires = jiffies_to_clock_t(rt_expires); + } + addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, + dev, expires, flags); + } + if (rt) + dst_release(&rt->dst); + } + + /* Try to figure out our local address for this prefix */ + + if (pinfo->autoconf && in6_dev->cnf.autoconf) { + struct inet6_ifaddr * ifp; + struct in6_addr addr; + int create = 0, update_lft = 0; + + if (pinfo->prefix_len == 64) { + memcpy(&addr, &pinfo->prefix, 8); + if (ipv6_generate_eui64(addr.s6_addr + 8, dev) && + ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) { + in6_dev_put(in6_dev); + return; + } + goto ok; + } + if (net_ratelimit()) + printk(KERN_DEBUG "IPv6 addrconf: prefix with wrong length %d\n", + pinfo->prefix_len); + in6_dev_put(in6_dev); + return; + +ok: + + ifp = ipv6_get_ifaddr(net, &addr, dev, 1); + + if (ifp == NULL && valid_lft) { + int max_addresses = in6_dev->cnf.max_addresses; + u32 addr_flags = 0; + +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + if (in6_dev->cnf.optimistic_dad && + !net->ipv6.devconf_all->forwarding) + addr_flags = IFA_F_OPTIMISTIC; +#endif + + /* Do not allow to create too much of autoconfigured + * addresses; this would be too easy way to crash kernel. + */ + if (!max_addresses || + ipv6_count_addresses(in6_dev) < max_addresses) + ifp = ipv6_add_addr(in6_dev, &addr, pinfo->prefix_len, + addr_type&IPV6_ADDR_SCOPE_MASK, + addr_flags); + + if (!ifp || IS_ERR(ifp)) { + in6_dev_put(in6_dev); + return; + } + + update_lft = create = 1; + ifp->cstamp = jiffies; + addrconf_dad_start(ifp, RTF_ADDRCONF|RTF_PREFIX_RT); + } + + if (ifp) { + int flags; + unsigned long now; +#ifdef CONFIG_IPV6_PRIVACY + struct inet6_ifaddr *ift; +#endif + u32 stored_lft; + + /* update lifetime (RFC2462 5.5.3 e) */ + spin_lock(&ifp->lock); + now = jiffies; + if (ifp->valid_lft > (now - ifp->tstamp) / HZ) + stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; + else + stored_lft = 0; + if (!update_lft && stored_lft) { + if (valid_lft > MIN_VALID_LIFETIME || + valid_lft > stored_lft) + update_lft = 1; + else if (stored_lft <= MIN_VALID_LIFETIME) { + /* valid_lft <= stored_lft is always true */ + /* + * RFC 4862 Section 5.5.3e: + * "Note that the preferred lifetime of + * the corresponding address is always + * reset to the Preferred Lifetime in + * the received Prefix Information + * option, regardless of whether the + * valid lifetime is also reset or + * ignored." + * + * So if the preferred lifetime in + * this advertisement is different + * than what we have stored, but the + * valid lifetime is invalid, just + * reset prefered_lft. + * + * We must set the valid lifetime + * to the stored lifetime since we'll + * be updating the timestamp below, + * else we'll set it back to the + * minimum. 
+ */ + if (prefered_lft != ifp->prefered_lft) { + valid_lft = stored_lft; + update_lft = 1; + } + } else { + valid_lft = MIN_VALID_LIFETIME; + if (valid_lft < prefered_lft) + prefered_lft = valid_lft; + update_lft = 1; + } + } + + if (update_lft) { + ifp->valid_lft = valid_lft; + ifp->prefered_lft = prefered_lft; + ifp->tstamp = now; + flags = ifp->flags; + ifp->flags &= ~IFA_F_DEPRECATED; + spin_unlock(&ifp->lock); + + if (!(flags&IFA_F_TENTATIVE)) + ipv6_ifa_notify(0, ifp); + } else + spin_unlock(&ifp->lock); + +#ifdef CONFIG_IPV6_PRIVACY + read_lock_bh(&in6_dev->lock); + /* update all temporary addresses in the list */ + list_for_each_entry(ift, &in6_dev->tempaddr_list, + tmp_list) { + int age, max_valid, max_prefered; + + if (ifp != ift->ifpub) + continue; + + /* + * RFC 4941 section 3.3: + * If a received option will extend the lifetime + * of a public address, the lifetimes of + * temporary addresses should be extended, + * subject to the overall constraint that no + * temporary addresses should ever remain + * "valid" or "preferred" for a time longer than + * (TEMP_VALID_LIFETIME) or + * (TEMP_PREFERRED_LIFETIME - DESYNC_FACTOR), + * respectively. + */ + age = (now - ift->cstamp) / HZ; + max_valid = in6_dev->cnf.temp_valid_lft - age; + if (max_valid < 0) + max_valid = 0; + + max_prefered = in6_dev->cnf.temp_prefered_lft - + in6_dev->cnf.max_desync_factor - + age; + if (max_prefered < 0) + max_prefered = 0; + + if (valid_lft > max_valid) + valid_lft = max_valid; + + if (prefered_lft > max_prefered) + prefered_lft = max_prefered; + + spin_lock(&ift->lock); + flags = ift->flags; + ift->valid_lft = valid_lft; + ift->prefered_lft = prefered_lft; + ift->tstamp = now; + if (prefered_lft > 0) + ift->flags &= ~IFA_F_DEPRECATED; + + spin_unlock(&ift->lock); + if (!(flags&IFA_F_TENTATIVE)) + ipv6_ifa_notify(0, ift); + } + + if ((create || list_empty(&in6_dev->tempaddr_list)) && in6_dev->cnf.use_tempaddr > 0) { + /* + * When a new public address is created as + * described in [ADDRCONF], also create a new + * temporary address. Also create a temporary + * address if it's enabled but no temporary + * address currently exists. + */ + read_unlock_bh(&in6_dev->lock); + ipv6_create_tempaddr(ifp, NULL); + } else { + read_unlock_bh(&in6_dev->lock); + } +#endif + in6_ifa_put(ifp); + addrconf_verify(0); + } + } + inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo); + in6_dev_put(in6_dev); +} + +/* + * Set destination address. + * Special case for SIT interfaces where we create a new "virtual" + * device. 
+ */ +int addrconf_set_dstaddr(struct net *net, void __user *arg) +{ + struct in6_ifreq ireq; + struct net_device *dev; + int err = -EINVAL; + + rtnl_lock(); + + err = -EFAULT; + if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) + goto err_exit; + + dev = __dev_get_by_index(net, ireq.ifr6_ifindex); + + err = -ENODEV; + if (dev == NULL) + goto err_exit; + +#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) + if (dev->type == ARPHRD_SIT) { + const struct net_device_ops *ops = dev->netdev_ops; + struct ifreq ifr; + struct ip_tunnel_parm p; + + err = -EADDRNOTAVAIL; + if (!(ipv6_addr_type(&ireq.ifr6_addr) & IPV6_ADDR_COMPATv4)) + goto err_exit; + + memset(&p, 0, sizeof(p)); + p.iph.daddr = ireq.ifr6_addr.s6_addr32[3]; + p.iph.saddr = 0; + p.iph.version = 4; + p.iph.ihl = 5; + p.iph.protocol = IPPROTO_IPV6; + p.iph.ttl = 64; + ifr.ifr_ifru.ifru_data = (__force void __user *)&p; + + if (ops->ndo_do_ioctl) { + mm_segment_t oldfs = get_fs(); + + set_fs(KERNEL_DS); + err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); + set_fs(oldfs); + } else + err = -EOPNOTSUPP; + + if (err == 0) { + err = -ENOBUFS; + dev = __dev_get_by_name(net, p.name); + if (!dev) + goto err_exit; + err = dev_open(dev); + } + } +#endif + +err_exit: + rtnl_unlock(); + return err; +} + +/* + * Manual configuration of address on an interface + */ +static int inet6_addr_add(struct net *net, int ifindex, const struct in6_addr *pfx, + unsigned int plen, __u8 ifa_flags, __u32 prefered_lft, + __u32 valid_lft) +{ + struct inet6_ifaddr *ifp; + struct inet6_dev *idev; + struct net_device *dev; + int scope; + u32 flags; + clock_t expires; + unsigned long timeout; + + ASSERT_RTNL(); + + if (plen > 128) + return -EINVAL; + + /* check the lifetime */ + if (!valid_lft || prefered_lft > valid_lft) + return -EINVAL; + + dev = __dev_get_by_index(net, ifindex); + if (!dev) + return -ENODEV; + + idev = addrconf_add_dev(dev); + if (IS_ERR(idev)) + return PTR_ERR(idev); + + scope = ipv6_addr_scope(pfx); + + timeout = addrconf_timeout_fixup(valid_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + expires = jiffies_to_clock_t(timeout * HZ); + valid_lft = timeout; + flags = RTF_EXPIRES; + } else { + expires = 0; + flags = 0; + ifa_flags |= IFA_F_PERMANENT; + } + + timeout = addrconf_timeout_fixup(prefered_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + if (timeout == 0) + ifa_flags |= IFA_F_DEPRECATED; + prefered_lft = timeout; + } + + ifp = ipv6_add_addr(idev, pfx, plen, scope, ifa_flags); + + if (!IS_ERR(ifp)) { + spin_lock_bh(&ifp->lock); + ifp->valid_lft = valid_lft; + ifp->prefered_lft = prefered_lft; + ifp->tstamp = jiffies; + spin_unlock_bh(&ifp->lock); + + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, + expires, flags); + /* + * Note that section 3.1 of RFC 4429 indicates + * that the Optimistic flag should not be set for + * manually configured addresses + */ + addrconf_dad_start(ifp, 0); + in6_ifa_put(ifp); + addrconf_verify(0); + return 0; + } + + return PTR_ERR(ifp); +} + +static int inet6_addr_del(struct net *net, int ifindex, const struct in6_addr *pfx, + unsigned int plen) +{ + struct inet6_ifaddr *ifp; + struct inet6_dev *idev; + struct net_device *dev; + + if (plen > 128) + return -EINVAL; + + dev = __dev_get_by_index(net, ifindex); + if (!dev) + return -ENODEV; + + if ((idev = __in6_dev_get(dev)) == NULL) + return -ENXIO; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifp, &idev->addr_list, if_list) { + if (ifp->prefix_len == plen && + ipv6_addr_equal(pfx, &ifp->addr)) { + 
in6_ifa_hold(ifp); + read_unlock_bh(&idev->lock); + + ipv6_del_addr(ifp); + + /* If the last address is deleted administratively, + disable IPv6 on this interface. + */ + if (list_empty(&idev->addr_list)) + addrconf_ifdown(idev->dev, 1); + return 0; + } + } + read_unlock_bh(&idev->lock); + return -EADDRNOTAVAIL; +} + + +int addrconf_add_ifaddr(struct net *net, void __user *arg) +{ + struct in6_ifreq ireq; + int err; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) + return -EFAULT; + + rtnl_lock(); + err = inet6_addr_add(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, + ireq.ifr6_prefixlen, IFA_F_PERMANENT, + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); + rtnl_unlock(); + return err; +} + +int addrconf_del_ifaddr(struct net *net, void __user *arg) +{ + struct in6_ifreq ireq; + int err; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) + return -EFAULT; + + rtnl_lock(); + err = inet6_addr_del(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, + ireq.ifr6_prefixlen); + rtnl_unlock(); + return err; +} + +static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, + int plen, int scope) +{ + struct inet6_ifaddr *ifp; + + ifp = ipv6_add_addr(idev, addr, plen, scope, IFA_F_PERMANENT); + if (!IS_ERR(ifp)) { + spin_lock_bh(&ifp->lock); + ifp->flags &= ~IFA_F_TENTATIVE; + spin_unlock_bh(&ifp->lock); + ipv6_ifa_notify(RTM_NEWADDR, ifp); + in6_ifa_put(ifp); + } +} + +#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) +static void sit_add_v4_addrs(struct inet6_dev *idev) +{ + struct in6_addr addr; + struct net_device *dev; + struct net *net = dev_net(idev->dev); + int scope; + + ASSERT_RTNL(); + + memset(&addr, 0, sizeof(struct in6_addr)); + memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4); + + if (idev->dev->flags&IFF_POINTOPOINT) { + addr.s6_addr32[0] = htonl(0xfe800000); + scope = IFA_LINK; + } else { + scope = IPV6_ADDR_COMPATv4; + } + + if (addr.s6_addr32[3]) { + add_addr(idev, &addr, 128, scope); + return; + } + + for_each_netdev(net, dev) { + struct in_device * in_dev = __in_dev_get_rtnl(dev); + if (in_dev && (dev->flags & IFF_UP)) { + struct in_ifaddr * ifa; + + int flag = scope; + + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + int plen; + + addr.s6_addr32[3] = ifa->ifa_local; + + if (ifa->ifa_scope == RT_SCOPE_LINK) + continue; + if (ifa->ifa_scope >= RT_SCOPE_HOST) { + if (idev->dev->flags&IFF_POINTOPOINT) + continue; + flag |= IFA_HOST; + } + if (idev->dev->flags&IFF_POINTOPOINT) + plen = 64; + else + plen = 96; + + add_addr(idev, &addr, plen, flag); + } + } + } +} +#endif + +static void init_loopback(struct net_device *dev) +{ + struct inet6_dev *idev; + + /* ::1 */ + + ASSERT_RTNL(); + + if ((idev = ipv6_find_idev(dev)) == NULL) { + printk(KERN_DEBUG "init loopback: add_dev failed\n"); + return; + } + + add_addr(idev, &in6addr_loopback, 128, IFA_HOST); +} + +static void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr *addr) +{ + struct inet6_ifaddr * ifp; + u32 addr_flags = IFA_F_PERMANENT; + +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + if (idev->cnf.optimistic_dad && + !dev_net(idev->dev)->ipv6.devconf_all->forwarding) + addr_flags |= IFA_F_OPTIMISTIC; +#endif + + + ifp = ipv6_add_addr(idev, addr, 64, IFA_LINK, addr_flags); + if (!IS_ERR(ifp)) { + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0); + addrconf_dad_start(ifp, 0); + in6_ifa_put(ifp); + } +} + +static void addrconf_dev_config(struct net_device *dev) 
+{ + struct in6_addr addr; + struct inet6_dev * idev; + + ASSERT_RTNL(); + + if ((dev->type != ARPHRD_ETHER) && + (dev->type != ARPHRD_FDDI) && + (dev->type != ARPHRD_IEEE802_TR) && + (dev->type != ARPHRD_ARCNET) && + (dev->type != ARPHRD_INFINIBAND)) { + /* Alas, we support only Ethernet autoconfiguration. */ + return; + } + + idev = addrconf_add_dev(dev); + if (IS_ERR(idev)) + return; + + memset(&addr, 0, sizeof(struct in6_addr)); + addr.s6_addr32[0] = htonl(0xFE800000); + + if (ipv6_generate_eui64(addr.s6_addr + 8, dev) == 0) + addrconf_add_linklocal(idev, &addr); +} + +#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) +static void addrconf_sit_config(struct net_device *dev) +{ + struct inet6_dev *idev; + + ASSERT_RTNL(); + + /* + * Configure the tunnel with one of our IPv4 + * addresses... we should configure all of + * our v4 addrs in the tunnel + */ + + if ((idev = ipv6_find_idev(dev)) == NULL) { + printk(KERN_DEBUG "init sit: add_dev failed\n"); + return; + } + + if (dev->priv_flags & IFF_ISATAP) { + struct in6_addr addr; + + ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); + addrconf_prefix_route(&addr, 64, dev, 0, 0); + if (!ipv6_generate_eui64(addr.s6_addr + 8, dev)) + addrconf_add_linklocal(idev, &addr); + return; + } + + sit_add_v4_addrs(idev); + + if (dev->flags&IFF_POINTOPOINT) { + addrconf_add_mroute(dev); + addrconf_add_lroute(dev); + } else + sit_route_add(dev); +} +#endif + +static inline int +ipv6_inherit_linklocal(struct inet6_dev *idev, struct net_device *link_dev) +{ + struct in6_addr lladdr; + + if (!ipv6_get_lladdr(link_dev, &lladdr, IFA_F_TENTATIVE)) { + addrconf_add_linklocal(idev, &lladdr); + return 0; + } + return -1; +} + +static void ip6_tnl_add_linklocal(struct inet6_dev *idev) +{ + struct net_device *link_dev; + struct net *net = dev_net(idev->dev); + + /* first try to inherit the link-local address from the link device */ + if (idev->dev->iflink && + (link_dev = __dev_get_by_index(net, idev->dev->iflink))) { + if (!ipv6_inherit_linklocal(idev, link_dev)) + return; + } + /* then try to inherit it from any device */ + for_each_netdev(net, link_dev) { + if (!ipv6_inherit_linklocal(idev, link_dev)) + return; + } + printk(KERN_DEBUG "init ip6-ip6: add_linklocal failed\n"); +} + +/* + * Autoconfigure tunnel with a link-local address so routing protocols, + * DHCPv6, MLD etc. can be run over the virtual link + */ + +static void addrconf_ip6_tnl_config(struct net_device *dev) +{ + struct inet6_dev *idev; + + ASSERT_RTNL(); + + idev = addrconf_add_dev(dev); + if (IS_ERR(idev)) { + printk(KERN_DEBUG "init ip6-ip6: add_dev failed\n"); + return; + } + ip6_tnl_add_linklocal(idev); +} + +static int addrconf_notify(struct notifier_block *this, unsigned long event, + void * data) +{ + struct net_device *dev = (struct net_device *) data; + struct inet6_dev *idev = __in6_dev_get(dev); + int run_pending = 0; + int err; + + switch (event) { + case NETDEV_REGISTER: + if (!idev && dev->mtu >= IPV6_MIN_MTU) { + idev = ipv6_add_dev(dev); + if (!idev) + return notifier_from_errno(-ENOMEM); + } + break; + + case NETDEV_UP: + case NETDEV_CHANGE: + if (dev->flags & IFF_SLAVE) + break; + + if (event == NETDEV_UP) { + if (!addrconf_qdisc_ok(dev)) { + /* device is not ready yet. 
*/ + printk(KERN_INFO + "ADDRCONF(NETDEV_UP): %s: " + "link is not ready\n", + dev->name); + break; + } + + if (!idev && dev->mtu >= IPV6_MIN_MTU) + idev = ipv6_add_dev(dev); + + if (idev) { + idev->if_flags |= IF_READY; + run_pending = 1; + } + } else { + if (!addrconf_qdisc_ok(dev)) { + /* device is still not ready. */ + break; + } + + if (idev) { + if (idev->if_flags & IF_READY) + /* device is already configured. */ + break; + idev->if_flags |= IF_READY; + } + + printk(KERN_INFO + "ADDRCONF(NETDEV_CHANGE): %s: " + "link becomes ready\n", + dev->name); + + run_pending = 1; + } + + switch (dev->type) { +#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) + case ARPHRD_SIT: + addrconf_sit_config(dev); + break; +#endif + case ARPHRD_TUNNEL6: + addrconf_ip6_tnl_config(dev); + break; + case ARPHRD_LOOPBACK: + init_loopback(dev); + break; + + default: + addrconf_dev_config(dev); + break; + } + + if (idev) { + if (run_pending) + addrconf_dad_run(idev); + + /* + * If the MTU changed during the interface down, + * when the interface up, the changed MTU must be + * reflected in the idev as well as routers. + */ + if (idev->cnf.mtu6 != dev->mtu && + dev->mtu >= IPV6_MIN_MTU) { + rt6_mtu_change(dev, dev->mtu); + idev->cnf.mtu6 = dev->mtu; + } + idev->tstamp = jiffies; + inet6_ifinfo_notify(RTM_NEWLINK, idev); + + /* + * If the changed mtu during down is lower than + * IPV6_MIN_MTU stop IPv6 on this interface. + */ + if (dev->mtu < IPV6_MIN_MTU) + addrconf_ifdown(dev, 1); + } + break; + + case NETDEV_CHANGEMTU: + if (idev && dev->mtu >= IPV6_MIN_MTU) { + rt6_mtu_change(dev, dev->mtu); + idev->cnf.mtu6 = dev->mtu; + break; + } + + if (!idev && dev->mtu >= IPV6_MIN_MTU) { + idev = ipv6_add_dev(dev); + if (idev) + break; + } + + /* + * MTU falled under IPV6_MIN_MTU. + * Stop IPv6 on this interface. + */ + + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + /* + * Remove all addresses from this interface. + */ + addrconf_ifdown(dev, event != NETDEV_DOWN); + break; + + case NETDEV_CHANGENAME: + if (idev) { + snmp6_unregister_dev(idev); + addrconf_sysctl_unregister(idev); + addrconf_sysctl_register(idev); + err = snmp6_register_dev(idev); + if (err) + return notifier_from_errno(err); + } + break; + + case NETDEV_PRE_TYPE_CHANGE: + case NETDEV_POST_TYPE_CHANGE: + addrconf_type_change(dev, event); + break; + } + + return NOTIFY_OK; +} + +/* + * addrconf module should be notified of a device going up + */ +static struct notifier_block ipv6_dev_notf = { + .notifier_call = addrconf_notify, +}; + +static void addrconf_type_change(struct net_device *dev, unsigned long event) +{ + struct inet6_dev *idev; + ASSERT_RTNL(); + + idev = __in6_dev_get(dev); + + if (event == NETDEV_POST_TYPE_CHANGE) + ipv6_mc_remap(idev); + else if (event == NETDEV_PRE_TYPE_CHANGE) + ipv6_mc_unmap(idev); +} + +static int addrconf_ifdown(struct net_device *dev, int how) +{ + struct net *net = dev_net(dev); + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + int state, i; + + ASSERT_RTNL(); + + rt6_ifdown(net, dev); + neigh_ifdown(&nd_tbl, dev); + + idev = __in6_dev_get(dev); + if (idev == NULL) + return -ENODEV; + + /* + * Step 1: remove reference to ipv6 device from parent device. + * Do not dev_put! 
+ */ + if (how) { + idev->dead = 1; + + /* protected by rtnl_lock */ + rcu_assign_pointer(dev->ip6_ptr, NULL); + + /* Step 1.5: remove snmp6 entry */ + snmp6_unregister_dev(idev); + + } + + /* Step 2: clear hash table */ + for (i = 0; i < IN6_ADDR_HSIZE; i++) { + struct hlist_head *h = &inet6_addr_lst[i]; + struct hlist_node *n; + + spin_lock_bh(&addrconf_hash_lock); + restart: + hlist_for_each_entry_rcu(ifa, n, h, addr_lst) { + if (ifa->idev == idev) { + hlist_del_init_rcu(&ifa->addr_lst); + addrconf_del_timer(ifa); + goto restart; + } + } + spin_unlock_bh(&addrconf_hash_lock); + } + + write_lock_bh(&idev->lock); + + /* Step 2: clear flags for stateless addrconf */ + if (!how) + idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY); + +#ifdef CONFIG_IPV6_PRIVACY + if (how && del_timer(&idev->regen_timer)) + in6_dev_put(idev); + + /* Step 3: clear tempaddr list */ + while (!list_empty(&idev->tempaddr_list)) { + ifa = list_first_entry(&idev->tempaddr_list, + struct inet6_ifaddr, tmp_list); + list_del(&ifa->tmp_list); + write_unlock_bh(&idev->lock); + spin_lock_bh(&ifa->lock); + + if (ifa->ifpub) { + in6_ifa_put(ifa->ifpub); + ifa->ifpub = NULL; + } + spin_unlock_bh(&ifa->lock); + in6_ifa_put(ifa); + write_lock_bh(&idev->lock); + } +#endif + + while (!list_empty(&idev->addr_list)) { + ifa = list_first_entry(&idev->addr_list, + struct inet6_ifaddr, if_list); + addrconf_del_timer(ifa); + + list_del(&ifa->if_list); + + write_unlock_bh(&idev->lock); + + spin_lock_bh(&ifa->state_lock); + state = ifa->state; + ifa->state = INET6_IFADDR_STATE_DEAD; + spin_unlock_bh(&ifa->state_lock); + + if (state != INET6_IFADDR_STATE_DEAD) { + __ipv6_ifa_notify(RTM_DELADDR, ifa); + atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifa); + } + in6_ifa_put(ifa); + + write_lock_bh(&idev->lock); + } + + write_unlock_bh(&idev->lock); + + /* Step 5: Discard multicast list */ + if (how) + ipv6_mc_destroy_dev(idev); + else + ipv6_mc_down(idev); + + idev->tstamp = jiffies; + + /* Last: Shot the device (if unregistered) */ + if (how) { + addrconf_sysctl_unregister(idev); + neigh_parms_release(&nd_tbl, idev->nd_parms); + neigh_ifdown(&nd_tbl, dev); + in6_dev_put(idev); + } + return 0; +} + +static void addrconf_rs_timer(unsigned long data) +{ + struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; + struct inet6_dev *idev = ifp->idev; + + read_lock(&idev->lock); + if (idev->dead || !(idev->if_flags & IF_READY)) + goto out; + + if (idev->cnf.forwarding) + goto out; + + /* Announcement received after solicitation was sent */ + if (idev->if_flags & IF_RA_RCVD) + goto out; + + spin_lock(&ifp->lock); + if (ifp->probes++ < idev->cnf.rtr_solicits) { + /* The wait after the last probe can be shorter */ + addrconf_mod_timer(ifp, AC_RS, + (ifp->probes == idev->cnf.rtr_solicits) ? + idev->cnf.rtr_solicit_delay : + idev->cnf.rtr_solicit_interval); + spin_unlock(&ifp->lock); + + ndisc_send_rs(idev->dev, &ifp->addr, &in6addr_linklocal_allrouters); + } else { + spin_unlock(&ifp->lock); + /* + * Note: we do not support deprecated "all on-link" + * assumption any longer. + */ + printk(KERN_DEBUG "%s: no IPv6 routers present\n", + idev->dev->name); + } + +out: + read_unlock(&idev->lock); + in6_ifa_put(ifp); +} + +/* + * Duplicate Address Detection + */ +static void addrconf_dad_kick(struct inet6_ifaddr *ifp) +{ + unsigned long rand_num; + struct inet6_dev *idev = ifp->idev; + + if (ifp->flags & IFA_F_OPTIMISTIC) + rand_num = 0; + else + rand_num = net_random() % (idev->cnf.rtr_solicit_delay ? 
: 1); + + ifp->probes = idev->cnf.dad_transmits; + addrconf_mod_timer(ifp, AC_DAD, rand_num); +} + +static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) +{ + struct inet6_dev *idev = ifp->idev; + struct net_device *dev = idev->dev; + + addrconf_join_solict(dev, &ifp->addr); + + net_srandom(ifp->addr.s6_addr32[3]); + + read_lock_bh(&idev->lock); + spin_lock(&ifp->lock); + if (ifp->state == INET6_IFADDR_STATE_DEAD) + goto out; + + if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || + idev->cnf.accept_dad < 1 || + !(ifp->flags&IFA_F_TENTATIVE) || + ifp->flags & IFA_F_NODAD) { + ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED); + spin_unlock(&ifp->lock); + read_unlock_bh(&idev->lock); + + addrconf_dad_completed(ifp); + return; + } + + if (!(idev->if_flags & IF_READY)) { + spin_unlock(&ifp->lock); + read_unlock_bh(&idev->lock); + /* + * If the device is not ready: + * - keep it tentative if it is a permanent address. + * - otherwise, kill it. + */ + in6_ifa_hold(ifp); + addrconf_dad_stop(ifp, 0); + return; + } + + /* + * Optimistic nodes can start receiving + * Frames right away + */ + if (ifp->flags & IFA_F_OPTIMISTIC) + ip6_ins_rt(ifp->rt); + + addrconf_dad_kick(ifp); +out: + spin_unlock(&ifp->lock); + read_unlock_bh(&idev->lock); +} + +static void addrconf_dad_timer(unsigned long data) +{ + struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; + struct inet6_dev *idev = ifp->idev; + struct in6_addr mcaddr; + + if (!ifp->probes && addrconf_dad_end(ifp)) + goto out; + + read_lock(&idev->lock); + if (idev->dead || !(idev->if_flags & IF_READY)) { + read_unlock(&idev->lock); + goto out; + } + + spin_lock(&ifp->lock); + if (ifp->state == INET6_IFADDR_STATE_DEAD) { + spin_unlock(&ifp->lock); + read_unlock(&idev->lock); + goto out; + } + + if (ifp->probes == 0) { + /* + * DAD was successful + */ + + ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED); + spin_unlock(&ifp->lock); + read_unlock(&idev->lock); + + addrconf_dad_completed(ifp); + + goto out; + } + + ifp->probes--; + addrconf_mod_timer(ifp, AC_DAD, ifp->idev->nd_parms->retrans_time); + spin_unlock(&ifp->lock); + read_unlock(&idev->lock); + + /* send a neighbour solicitation for our addr */ + addrconf_addr_solict_mult(&ifp->addr, &mcaddr); + ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any); +out: + in6_ifa_put(ifp); +} + +static void addrconf_dad_completed(struct inet6_ifaddr *ifp) +{ + struct net_device *dev = ifp->idev->dev; + + /* + * Configure the address for reception. Now it is valid. + */ + + ipv6_ifa_notify(RTM_NEWADDR, ifp); + + /* If added prefix is link local and forwarding is off, + start sending router solicitations. + */ + + if ((ifp->idev->cnf.forwarding == 0 || + ifp->idev->cnf.forwarding == 2) && + ifp->idev->cnf.rtr_solicits > 0 && + (dev->flags&IFF_LOOPBACK) == 0 && + (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) { + /* + * If a host as already performed a random delay + * [...] as part of DAD [...] 
there is no need + * to delay again before sending the first RS + */ + ndisc_send_rs(ifp->idev->dev, &ifp->addr, &in6addr_linklocal_allrouters); + + spin_lock_bh(&ifp->lock); + ifp->probes = 1; + ifp->idev->if_flags |= IF_RS_SENT; + addrconf_mod_timer(ifp, AC_RS, ifp->idev->cnf.rtr_solicit_interval); + spin_unlock_bh(&ifp->lock); + } +} + +static void addrconf_dad_run(struct inet6_dev *idev) +{ + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifp, &idev->addr_list, if_list) { + spin_lock(&ifp->lock); + if (ifp->flags & IFA_F_TENTATIVE && + ifp->state == INET6_IFADDR_STATE_DAD) + addrconf_dad_kick(ifp); + spin_unlock(&ifp->lock); + } + read_unlock_bh(&idev->lock); +} + +#ifdef CONFIG_PROC_FS +struct if6_iter_state { + struct seq_net_private p; + int bucket; +}; + +static struct inet6_ifaddr *if6_get_first(struct seq_file *seq) +{ + struct inet6_ifaddr *ifa = NULL; + struct if6_iter_state *state = seq->private; + struct net *net = seq_file_net(seq); + + for (state->bucket = 0; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) { + struct hlist_node *n; + hlist_for_each_entry_rcu_bh(ifa, n, &inet6_addr_lst[state->bucket], + addr_lst) + if (net_eq(dev_net(ifa->idev->dev), net)) + return ifa; + } + return NULL; +} + +static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, + struct inet6_ifaddr *ifa) +{ + struct if6_iter_state *state = seq->private; + struct net *net = seq_file_net(seq); + struct hlist_node *n = &ifa->addr_lst; + + hlist_for_each_entry_continue_rcu_bh(ifa, n, addr_lst) + if (net_eq(dev_net(ifa->idev->dev), net)) + return ifa; + + while (++state->bucket < IN6_ADDR_HSIZE) { + hlist_for_each_entry_rcu_bh(ifa, n, + &inet6_addr_lst[state->bucket], addr_lst) { + if (net_eq(dev_net(ifa->idev->dev), net)) + return ifa; + } + } + + return NULL; +} + +static struct inet6_ifaddr *if6_get_idx(struct seq_file *seq, loff_t pos) +{ + struct inet6_ifaddr *ifa = if6_get_first(seq); + + if (ifa) + while (pos && (ifa = if6_get_next(seq, ifa)) != NULL) + --pos; + return pos ? 
NULL : ifa; +} + +static void *if6_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(rcu_bh) +{ + rcu_read_lock_bh(); + return if6_get_idx(seq, *pos); +} + +static void *if6_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct inet6_ifaddr *ifa; + + ifa = if6_get_next(seq, v); + ++*pos; + return ifa; +} + +static void if6_seq_stop(struct seq_file *seq, void *v) + __releases(rcu_bh) +{ + rcu_read_unlock_bh(); +} + +static int if6_seq_show(struct seq_file *seq, void *v) +{ + struct inet6_ifaddr *ifp = (struct inet6_ifaddr *)v; + seq_printf(seq, "%pi6 %02x %02x %02x %02x %8s\n", + &ifp->addr, + ifp->idev->dev->ifindex, + ifp->prefix_len, + ifp->scope, + ifp->flags, + ifp->idev->dev->name); + return 0; +} + +static const struct seq_operations if6_seq_ops = { + .start = if6_seq_start, + .next = if6_seq_next, + .show = if6_seq_show, + .stop = if6_seq_stop, +}; + +static int if6_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &if6_seq_ops, + sizeof(struct if6_iter_state)); +} + +static const struct file_operations if6_fops = { + .owner = THIS_MODULE, + .open = if6_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int __net_init if6_proc_net_init(struct net *net) +{ + if (!proc_net_fops_create(net, "if_inet6", S_IRUGO, &if6_fops)) + return -ENOMEM; + return 0; +} + +static void __net_exit if6_proc_net_exit(struct net *net) +{ + proc_net_remove(net, "if_inet6"); +} + +static struct pernet_operations if6_proc_net_ops = { + .init = if6_proc_net_init, + .exit = if6_proc_net_exit, +}; + +int __init if6_proc_init(void) +{ + return register_pernet_subsys(&if6_proc_net_ops); +} + +void if6_proc_exit(void) +{ + unregister_pernet_subsys(&if6_proc_net_ops); +} +#endif /* CONFIG_PROC_FS */ + +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +/* Check if address is a home address configured on any interface. */ +int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr) +{ + int ret = 0; + struct inet6_ifaddr *ifp = NULL; + struct hlist_node *n; + unsigned int hash = ipv6_addr_hash(addr); + + rcu_read_lock_bh(); + hlist_for_each_entry_rcu_bh(ifp, n, &inet6_addr_lst[hash], addr_lst) { + if (!net_eq(dev_net(ifp->idev->dev), net)) + continue; + if (ipv6_addr_equal(&ifp->addr, addr) && + (ifp->flags & IFA_F_HOMEADDRESS)) { + ret = 1; + break; + } + } + rcu_read_unlock_bh(); + return ret; +} +#endif + +/* + * Periodic address status verification + */ + +static void addrconf_verify(unsigned long foo) +{ + unsigned long now, next, next_sec, next_sched; + struct inet6_ifaddr *ifp; + struct hlist_node *node; + int i; + + rcu_read_lock_bh(); + spin_lock(&addrconf_verify_lock); + now = jiffies; + next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY); + + del_timer(&addr_chk_timer); + + for (i = 0; i < IN6_ADDR_HSIZE; i++) { +restart: + hlist_for_each_entry_rcu_bh(ifp, node, + &inet6_addr_lst[i], addr_lst) { + unsigned long age; + + if (ifp->flags & IFA_F_PERMANENT) + continue; + + spin_lock(&ifp->lock); + /* We try to batch several events at once. 
*/ + age = (now - ifp->tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; + + if (ifp->valid_lft != INFINITY_LIFE_TIME && + age >= ifp->valid_lft) { + spin_unlock(&ifp->lock); + in6_ifa_hold(ifp); + ipv6_del_addr(ifp); + goto restart; + } else if (ifp->prefered_lft == INFINITY_LIFE_TIME) { + spin_unlock(&ifp->lock); + continue; + } else if (age >= ifp->prefered_lft) { + /* jiffies - ifp->tstamp > age >= ifp->prefered_lft */ + int deprecate = 0; + + if (!(ifp->flags&IFA_F_DEPRECATED)) { + deprecate = 1; + ifp->flags |= IFA_F_DEPRECATED; + } + + if (time_before(ifp->tstamp + ifp->valid_lft * HZ, next)) + next = ifp->tstamp + ifp->valid_lft * HZ; + + spin_unlock(&ifp->lock); + + if (deprecate) { + in6_ifa_hold(ifp); + + ipv6_ifa_notify(0, ifp); + in6_ifa_put(ifp); + goto restart; + } +#ifdef CONFIG_IPV6_PRIVACY + } else if ((ifp->flags&IFA_F_TEMPORARY) && + !(ifp->flags&IFA_F_TENTATIVE)) { + unsigned long regen_advance = ifp->idev->cnf.regen_max_retry * + ifp->idev->cnf.dad_transmits * + ifp->idev->nd_parms->retrans_time / HZ; + + if (age >= ifp->prefered_lft - regen_advance) { + struct inet6_ifaddr *ifpub = ifp->ifpub; + if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next)) + next = ifp->tstamp + ifp->prefered_lft * HZ; + if (!ifp->regen_count && ifpub) { + ifp->regen_count++; + in6_ifa_hold(ifp); + in6_ifa_hold(ifpub); + spin_unlock(&ifp->lock); + + spin_lock(&ifpub->lock); + ifpub->regen_count = 0; + spin_unlock(&ifpub->lock); + ipv6_create_tempaddr(ifpub, ifp); + in6_ifa_put(ifpub); + in6_ifa_put(ifp); + goto restart; + } + } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next)) + next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ; + spin_unlock(&ifp->lock); +#endif + } else { + /* ifp->prefered_lft <= ifp->valid_lft */ + if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next)) + next = ifp->tstamp + ifp->prefered_lft * HZ; + spin_unlock(&ifp->lock); + } + } + } + + next_sec = round_jiffies_up(next); + next_sched = next; + + /* If rounded timeout is accurate enough, accept it. */ + if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ)) + next_sched = next_sec; + + /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. 
*/ + if (time_before(next_sched, jiffies + ADDRCONF_TIMER_FUZZ_MAX)) + next_sched = jiffies + ADDRCONF_TIMER_FUZZ_MAX; + + ADBG((KERN_DEBUG "now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n", + now, next, next_sec, next_sched)); + + addr_chk_timer.expires = next_sched; + add_timer(&addr_chk_timer); + spin_unlock(&addrconf_verify_lock); + rcu_read_unlock_bh(); +} + +static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local) +{ + struct in6_addr *pfx = NULL; + + if (addr) + pfx = nla_data(addr); + + if (local) { + if (pfx && nla_memcmp(local, pfx, sizeof(*pfx))) + pfx = NULL; + else + pfx = nla_data(local); + } + + return pfx; +} + +static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = { + [IFA_ADDRESS] = { .len = sizeof(struct in6_addr) }, + [IFA_LOCAL] = { .len = sizeof(struct in6_addr) }, + [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, +}; + +static int +inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct ifaddrmsg *ifm; + struct nlattr *tb[IFA_MAX+1]; + struct in6_addr *pfx; + int err; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy); + if (err < 0) + return err; + + ifm = nlmsg_data(nlh); + pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]); + if (pfx == NULL) + return -EINVAL; + + return inet6_addr_del(net, ifm->ifa_index, pfx, ifm->ifa_prefixlen); +} + +static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags, + u32 prefered_lft, u32 valid_lft) +{ + u32 flags; + clock_t expires; + unsigned long timeout; + + if (!valid_lft || (prefered_lft > valid_lft)) + return -EINVAL; + + timeout = addrconf_timeout_fixup(valid_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + expires = jiffies_to_clock_t(timeout * HZ); + valid_lft = timeout; + flags = RTF_EXPIRES; + } else { + expires = 0; + flags = 0; + ifa_flags |= IFA_F_PERMANENT; + } + + timeout = addrconf_timeout_fixup(prefered_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + if (timeout == 0) + ifa_flags |= IFA_F_DEPRECATED; + prefered_lft = timeout; + } + + spin_lock_bh(&ifp->lock); + ifp->flags = (ifp->flags & ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD | IFA_F_HOMEADDRESS)) | ifa_flags; + ifp->tstamp = jiffies; + ifp->valid_lft = valid_lft; + ifp->prefered_lft = prefered_lft; + + spin_unlock_bh(&ifp->lock); + if (!(ifp->flags&IFA_F_TENTATIVE)) + ipv6_ifa_notify(0, ifp); + + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev, + expires, flags); + addrconf_verify(0); + + return 0; +} + +static int +inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct ifaddrmsg *ifm; + struct nlattr *tb[IFA_MAX+1]; + struct in6_addr *pfx; + struct inet6_ifaddr *ifa; + struct net_device *dev; + u32 valid_lft = INFINITY_LIFE_TIME, preferred_lft = INFINITY_LIFE_TIME; + u8 ifa_flags; + int err; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy); + if (err < 0) + return err; + + ifm = nlmsg_data(nlh); + pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]); + if (pfx == NULL) + return -EINVAL; + + if (tb[IFA_CACHEINFO]) { + struct ifa_cacheinfo *ci; + + ci = nla_data(tb[IFA_CACHEINFO]); + valid_lft = ci->ifa_valid; + preferred_lft = ci->ifa_prefered; + } else { + preferred_lft = INFINITY_LIFE_TIME; + valid_lft = INFINITY_LIFE_TIME; + } + + dev = __dev_get_by_index(net, ifm->ifa_index); + if (dev == NULL) + return -ENODEV; + + /* We ignore other flags so far. 
*/ + ifa_flags = ifm->ifa_flags & (IFA_F_NODAD | IFA_F_HOMEADDRESS); + + ifa = ipv6_get_ifaddr(net, pfx, dev, 1); + if (ifa == NULL) { + /* + * It would be best to check for !NLM_F_CREATE here but + * userspace alreay relies on not having to provide this. + */ + return inet6_addr_add(net, ifm->ifa_index, pfx, + ifm->ifa_prefixlen, ifa_flags, + preferred_lft, valid_lft); + } + + if (nlh->nlmsg_flags & NLM_F_EXCL || + !(nlh->nlmsg_flags & NLM_F_REPLACE)) + err = -EEXIST; + else + err = inet6_addr_modify(ifa, ifa_flags, preferred_lft, valid_lft); + + in6_ifa_put(ifa); + + return err; +} + +static void put_ifaddrmsg(struct nlmsghdr *nlh, u8 prefixlen, u8 flags, + u8 scope, int ifindex) +{ + struct ifaddrmsg *ifm; + + ifm = nlmsg_data(nlh); + ifm->ifa_family = AF_INET6; + ifm->ifa_prefixlen = prefixlen; + ifm->ifa_flags = flags; + ifm->ifa_scope = scope; + ifm->ifa_index = ifindex; +} + +static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp, + unsigned long tstamp, u32 preferred, u32 valid) +{ + struct ifa_cacheinfo ci; + + ci.cstamp = cstamp_delta(cstamp); + ci.tstamp = cstamp_delta(tstamp); + ci.ifa_prefered = preferred; + ci.ifa_valid = valid; + + return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci); +} + +static inline int rt_scope(int ifa_scope) +{ + if (ifa_scope & IFA_HOST) + return RT_SCOPE_HOST; + else if (ifa_scope & IFA_LINK) + return RT_SCOPE_LINK; + else if (ifa_scope & IFA_SITE) + return RT_SCOPE_SITE; + else + return RT_SCOPE_UNIVERSE; +} + +static inline int inet6_ifaddr_msgsize(void) +{ + return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + + nla_total_size(16) /* IFA_ADDRESS */ + + nla_total_size(sizeof(struct ifa_cacheinfo)); +} + +static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, + u32 pid, u32 seq, int event, unsigned int flags) +{ + struct nlmsghdr *nlh; + u32 preferred, valid; + + nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags); + if (nlh == NULL) + return -EMSGSIZE; + + put_ifaddrmsg(nlh, ifa->prefix_len, ifa->flags, rt_scope(ifa->scope), + ifa->idev->dev->ifindex); + + if (!(ifa->flags&IFA_F_PERMANENT)) { + preferred = ifa->prefered_lft; + valid = ifa->valid_lft; + if (preferred != INFINITY_LIFE_TIME) { + long tval = (jiffies - ifa->tstamp)/HZ; + if (preferred > tval) + preferred -= tval; + else + preferred = 0; + if (valid != INFINITY_LIFE_TIME) { + if (valid > tval) + valid -= tval; + else + valid = 0; + } + } + } else { + preferred = INFINITY_LIFE_TIME; + valid = INFINITY_LIFE_TIME; + } + + if (nla_put(skb, IFA_ADDRESS, 16, &ifa->addr) < 0 || + put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + + return nlmsg_end(skb, nlh); +} + +static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, + u32 pid, u32 seq, int event, u16 flags) +{ + struct nlmsghdr *nlh; + u8 scope = RT_SCOPE_UNIVERSE; + int ifindex = ifmca->idev->dev->ifindex; + + if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE) + scope = RT_SCOPE_SITE; + + nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags); + if (nlh == NULL) + return -EMSGSIZE; + + put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex); + if (nla_put(skb, IFA_MULTICAST, 16, &ifmca->mca_addr) < 0 || + put_cacheinfo(skb, ifmca->mca_cstamp, ifmca->mca_tstamp, + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + + return nlmsg_end(skb, nlh); +} + +static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, + u32 pid, u32 
seq, int event, unsigned int flags) +{ + struct nlmsghdr *nlh; + u8 scope = RT_SCOPE_UNIVERSE; + int ifindex = ifaca->aca_idev->dev->ifindex; + + if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE) + scope = RT_SCOPE_SITE; + + nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags); + if (nlh == NULL) + return -EMSGSIZE; + + put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex); + if (nla_put(skb, IFA_ANYCAST, 16, &ifaca->aca_addr) < 0 || + put_cacheinfo(skb, ifaca->aca_cstamp, ifaca->aca_tstamp, + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + + return nlmsg_end(skb, nlh); +} + +enum addr_type_t { + UNICAST_ADDR, + MULTICAST_ADDR, + ANYCAST_ADDR, +}; + +/* called with rcu_read_lock() */ +static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, + struct netlink_callback *cb, enum addr_type_t type, + int s_ip_idx, int *p_ip_idx) +{ + struct ifmcaddr6 *ifmca; + struct ifacaddr6 *ifaca; + int err = 1; + int ip_idx = *p_ip_idx; + + read_lock_bh(&idev->lock); + switch (type) { + case UNICAST_ADDR: { + struct inet6_ifaddr *ifa; + + /* unicast address incl. temp addr */ + list_for_each_entry(ifa, &idev->addr_list, if_list) { + if (++ip_idx < s_ip_idx) + continue; + err = inet6_fill_ifaddr(skb, ifa, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWADDR, + NLM_F_MULTI); + if (err <= 0) + break; + } + break; + } + case MULTICAST_ADDR: + /* multicast address */ + for (ifmca = idev->mc_list; ifmca; + ifmca = ifmca->next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + err = inet6_fill_ifmcaddr(skb, ifmca, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_GETMULTICAST, + NLM_F_MULTI); + if (err <= 0) + break; + } + break; + case ANYCAST_ADDR: + /* anycast address */ + for (ifaca = idev->ac_list; ifaca; + ifaca = ifaca->aca_next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + err = inet6_fill_ifacaddr(skb, ifaca, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_GETANYCAST, + NLM_F_MULTI); + if (err <= 0) + break; + } + break; + default: + break; + } + read_unlock_bh(&idev->lock); + *p_ip_idx = ip_idx; + return err; +} + +static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, + enum addr_type_t type) +{ + struct net *net = sock_net(skb->sk); + int h, s_h; + int idx, ip_idx; + int s_idx, s_ip_idx; + struct net_device *dev; + struct inet6_dev *idev; + struct hlist_head *head; + struct hlist_node *node; + + s_h = cb->args[0]; + s_idx = idx = cb->args[1]; + s_ip_idx = ip_idx = cb->args[2]; + + rcu_read_lock(); + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + hlist_for_each_entry_rcu(dev, node, head, index_hlist) { + if (idx < s_idx) + goto cont; + if (h > s_h || idx > s_idx) + s_ip_idx = 0; + ip_idx = 0; + idev = __in6_dev_get(dev); + if (!idev) + goto cont; + + if (in6_dump_addrs(idev, skb, cb, type, + s_ip_idx, &ip_idx) <= 0) + goto done; +cont: + idx++; + } + } +done: + rcu_read_unlock(); + cb->args[0] = h; + cb->args[1] = idx; + cb->args[2] = ip_idx; + + return skb->len; +} + +static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + enum addr_type_t type = UNICAST_ADDR; + + return inet6_dump_addr(skb, cb, type); +} + +static int inet6_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + enum addr_type_t type = MULTICAST_ADDR; + + return inet6_dump_addr(skb, cb, type); +} + + +static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + enum addr_type_t 
type = ANYCAST_ADDR; + + return inet6_dump_addr(skb, cb, type); +} + +static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr* nlh, + void *arg) +{ + struct net *net = sock_net(in_skb->sk); + struct ifaddrmsg *ifm; + struct nlattr *tb[IFA_MAX+1]; + struct in6_addr *addr = NULL; + struct net_device *dev = NULL; + struct inet6_ifaddr *ifa; + struct sk_buff *skb; + int err; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy); + if (err < 0) + goto errout; + + addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]); + if (addr == NULL) { + err = -EINVAL; + goto errout; + } + + ifm = nlmsg_data(nlh); + if (ifm->ifa_index) + dev = __dev_get_by_index(net, ifm->ifa_index); + + ifa = ipv6_get_ifaddr(net, addr, dev, 1); + if (!ifa) { + err = -EADDRNOTAVAIL; + goto errout; + } + + skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_KERNEL); + if (!skb) { + err = -ENOBUFS; + goto errout_ifa; + } + + err = inet6_fill_ifaddr(skb, ifa, NETLINK_CB(in_skb).pid, + nlh->nlmsg_seq, RTM_NEWADDR, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout_ifa; + } + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); +errout_ifa: + in6_ifa_put(ifa); +errout: + return err; +} + +static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) +{ + struct sk_buff *skb; + struct net *net = dev_net(ifa->idev->dev); + int err = -ENOBUFS; + + skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC); + if (skb == NULL) + goto errout; + + err = inet6_fill_ifaddr(skb, ifa, 0, 0, event, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err); +} + +static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, + __s32 *array, int bytes) +{ + BUG_ON(bytes < (DEVCONF_MAX * 4)); + + memset(array, 0, bytes); + array[DEVCONF_FORWARDING] = cnf->forwarding; + array[DEVCONF_HOPLIMIT] = cnf->hop_limit; + array[DEVCONF_MTU6] = cnf->mtu6; + array[DEVCONF_ACCEPT_RA] = cnf->accept_ra; + array[DEVCONF_ACCEPT_REDIRECTS] = cnf->accept_redirects; + array[DEVCONF_AUTOCONF] = cnf->autoconf; + array[DEVCONF_DAD_TRANSMITS] = cnf->dad_transmits; + array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits; + array[DEVCONF_RTR_SOLICIT_INTERVAL] = + jiffies_to_msecs(cnf->rtr_solicit_interval); + array[DEVCONF_RTR_SOLICIT_DELAY] = + jiffies_to_msecs(cnf->rtr_solicit_delay); + array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version; +#ifdef CONFIG_IPV6_PRIVACY + array[DEVCONF_USE_TEMPADDR] = cnf->use_tempaddr; + array[DEVCONF_TEMP_VALID_LFT] = cnf->temp_valid_lft; + array[DEVCONF_TEMP_PREFERED_LFT] = cnf->temp_prefered_lft; + array[DEVCONF_REGEN_MAX_RETRY] = cnf->regen_max_retry; + array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor; +#endif + array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses; + array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr; + array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo; +#ifdef CONFIG_IPV6_ROUTER_PREF + array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref; + array[DEVCONF_RTR_PROBE_INTERVAL] = + jiffies_to_msecs(cnf->rtr_probe_interval); +#ifdef CONFIG_IPV6_ROUTE_INFO + array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen; +#endif +#endif + array[DEVCONF_PROXY_NDP] = cnf->proxy_ndp; + array[DEVCONF_ACCEPT_SOURCE_ROUTE] = cnf->accept_source_route; +#ifdef 
CONFIG_IPV6_OPTIMISTIC_DAD + array[DEVCONF_OPTIMISTIC_DAD] = cnf->optimistic_dad; +#endif +#ifdef CONFIG_IPV6_MROUTE + array[DEVCONF_MC_FORWARDING] = cnf->mc_forwarding; +#endif + array[DEVCONF_DISABLE_IPV6] = cnf->disable_ipv6; + array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad; + array[DEVCONF_FORCE_TLLAO] = cnf->force_tllao; +} + +static inline size_t inet6_ifla6_size(void) +{ + return nla_total_size(4) /* IFLA_INET6_FLAGS */ + + nla_total_size(sizeof(struct ifla_cacheinfo)) + + nla_total_size(DEVCONF_MAX * 4) /* IFLA_INET6_CONF */ + + nla_total_size(IPSTATS_MIB_MAX * 8) /* IFLA_INET6_STATS */ + + nla_total_size(ICMP6_MIB_MAX * 8); /* IFLA_INET6_ICMP6STATS */ +} + +static inline size_t inet6_if_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + + nla_total_size(4) /* IFLA_MTU */ + + nla_total_size(4) /* IFLA_LINK */ + + nla_total_size(inet6_ifla6_size()); /* IFLA_PROTINFO */ +} + +static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib, + int items, int bytes) +{ + int i; + int pad = bytes - sizeof(u64) * items; + BUG_ON(pad < 0); + + /* Use put_unaligned() because stats may not be aligned for u64. */ + put_unaligned(items, &stats[0]); + for (i = 1; i < items; i++) + put_unaligned(atomic_long_read(&mib[i]), &stats[i]); + + memset(&stats[items], 0, pad); +} + +static inline void __snmp6_fill_stats64(u64 *stats, void __percpu **mib, + int items, int bytes, size_t syncpoff) +{ + int i; + int pad = bytes - sizeof(u64) * items; + BUG_ON(pad < 0); + + /* Use put_unaligned() because stats may not be aligned for u64. */ + put_unaligned(items, &stats[0]); + for (i = 1; i < items; i++) + put_unaligned(snmp_fold_field64(mib, i, syncpoff), &stats[i]); + + memset(&stats[items], 0, pad); +} + +static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, + int bytes) +{ + switch (attrtype) { + case IFLA_INET6_STATS: + __snmp6_fill_stats64(stats, (void __percpu **)idev->stats.ipv6, + IPSTATS_MIB_MAX, bytes, offsetof(struct ipstats_mib, syncp)); + break; + case IFLA_INET6_ICMP6STATS: + __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, ICMP6_MIB_MAX, bytes); + break; + } +} + +static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev) +{ + struct nlattr *nla; + struct ifla_cacheinfo ci; + + NLA_PUT_U32(skb, IFLA_INET6_FLAGS, idev->if_flags); + + ci.max_reasm_len = IPV6_MAXPLEN; + ci.tstamp = cstamp_delta(idev->tstamp); + ci.reachable_time = jiffies_to_msecs(idev->nd_parms->reachable_time); + ci.retrans_time = jiffies_to_msecs(idev->nd_parms->retrans_time); + NLA_PUT(skb, IFLA_INET6_CACHEINFO, sizeof(ci), &ci); + + nla = nla_reserve(skb, IFLA_INET6_CONF, DEVCONF_MAX * sizeof(s32)); + if (nla == NULL) + goto nla_put_failure; + ipv6_store_devconf(&idev->cnf, nla_data(nla), nla_len(nla)); + + /* XXX - MC not implemented */ + + nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64)); + if (nla == NULL) + goto nla_put_failure; + snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla)); + + nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64)); + if (nla == NULL) + goto nla_put_failure; + snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla)); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static size_t inet6_get_link_af_size(const struct net_device *dev) +{ + if (!__in6_dev_get(dev)) + return 0; + + return inet6_ifla6_size(); +} + +static int 
inet6_fill_link_af(struct sk_buff *skb, const struct net_device *dev) +{ + struct inet6_dev *idev = __in6_dev_get(dev); + + if (!idev) + return -ENODATA; + + if (inet6_fill_ifla6_attrs(skb, idev) < 0) + return -EMSGSIZE; + + return 0; +} + +static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, + u32 pid, u32 seq, int event, unsigned int flags) +{ + struct net_device *dev = idev->dev; + struct ifinfomsg *hdr; + struct nlmsghdr *nlh; + void *protoinfo; + + nlh = nlmsg_put(skb, pid, seq, event, sizeof(*hdr), flags); + if (nlh == NULL) + return -EMSGSIZE; + + hdr = nlmsg_data(nlh); + hdr->ifi_family = AF_INET6; + hdr->__ifi_pad = 0; + hdr->ifi_type = dev->type; + hdr->ifi_index = dev->ifindex; + hdr->ifi_flags = dev_get_flags(dev); + hdr->ifi_change = 0; + + NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name); + + if (dev->addr_len) + NLA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); + + NLA_PUT_U32(skb, IFLA_MTU, dev->mtu); + if (dev->ifindex != dev->iflink) + NLA_PUT_U32(skb, IFLA_LINK, dev->iflink); + + protoinfo = nla_nest_start(skb, IFLA_PROTINFO); + if (protoinfo == NULL) + goto nla_put_failure; + + if (inet6_fill_ifla6_attrs(skb, idev) < 0) + goto nla_put_failure; + + nla_nest_end(skb, protoinfo); + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + int h, s_h; + int idx = 0, s_idx; + struct net_device *dev; + struct inet6_dev *idev; + struct hlist_head *head; + struct hlist_node *node; + + s_h = cb->args[0]; + s_idx = cb->args[1]; + + rcu_read_lock(); + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + hlist_for_each_entry_rcu(dev, node, head, index_hlist) { + if (idx < s_idx) + goto cont; + idev = __in6_dev_get(dev); + if (!idev) + goto cont; + if (inet6_fill_ifinfo(skb, idev, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWLINK, NLM_F_MULTI) <= 0) + goto out; +cont: + idx++; + } + } +out: + rcu_read_unlock(); + cb->args[1] = idx; + cb->args[0] = h; + + return skb->len; +} + +void inet6_ifinfo_notify(int event, struct inet6_dev *idev) +{ + struct sk_buff *skb; + struct net *net = dev_net(idev->dev); + int err = -ENOBUFS; + + skb = nlmsg_new(inet6_if_nlmsg_size(), GFP_ATOMIC); + if (skb == NULL) + goto errout; + + err = inet6_fill_ifinfo(skb, idev, 0, 0, event, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in inet6_if_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFINFO, NULL, GFP_ATOMIC); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_IPV6_IFINFO, err); +} + +static inline size_t inet6_prefix_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct prefixmsg)) + + nla_total_size(sizeof(struct in6_addr)) + + nla_total_size(sizeof(struct prefix_cacheinfo)); +} + +static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, + struct prefix_info *pinfo, u32 pid, u32 seq, + int event, unsigned int flags) +{ + struct prefixmsg *pmsg; + struct nlmsghdr *nlh; + struct prefix_cacheinfo ci; + + nlh = nlmsg_put(skb, pid, seq, event, sizeof(*pmsg), flags); + if (nlh == NULL) + return -EMSGSIZE; + + pmsg = nlmsg_data(nlh); + pmsg->prefix_family = AF_INET6; + pmsg->prefix_pad1 = 0; + pmsg->prefix_pad2 = 0; + pmsg->prefix_ifindex = idev->dev->ifindex; + pmsg->prefix_len = pinfo->prefix_len; + pmsg->prefix_type = pinfo->type; + pmsg->prefix_pad3 
= 0; + pmsg->prefix_flags = 0; + if (pinfo->onlink) + pmsg->prefix_flags |= IF_PREFIX_ONLINK; + if (pinfo->autoconf) + pmsg->prefix_flags |= IF_PREFIX_AUTOCONF; + + NLA_PUT(skb, PREFIX_ADDRESS, sizeof(pinfo->prefix), &pinfo->prefix); + + ci.preferred_time = ntohl(pinfo->prefered); + ci.valid_time = ntohl(pinfo->valid); + NLA_PUT(skb, PREFIX_CACHEINFO, sizeof(ci), &ci); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static void inet6_prefix_notify(int event, struct inet6_dev *idev, + struct prefix_info *pinfo) +{ + struct sk_buff *skb; + struct net *net = dev_net(idev->dev); + int err = -ENOBUFS; + + skb = nlmsg_new(inet6_prefix_nlmsg_size(), GFP_ATOMIC); + if (skb == NULL) + goto errout; + + err = inet6_fill_prefix(skb, idev, pinfo, 0, 0, event, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in inet6_prefix_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err); +} + +static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) +{ + inet6_ifa_notify(event ? : RTM_NEWADDR, ifp); + + switch (event) { + case RTM_NEWADDR: + /* + * If the address was optimistic + * we inserted the route at the start of + * our DAD process, so we don't need + * to do it again + */ + if (!(ifp->rt->rt6i_node)) + ip6_ins_rt(ifp->rt); + if (ifp->idev->cnf.forwarding) + addrconf_join_anycast(ifp); + break; + case RTM_DELADDR: + if (ifp->idev->cnf.forwarding) + addrconf_leave_anycast(ifp); + addrconf_leave_solict(ifp->idev, &ifp->addr); + dst_hold(&ifp->rt->dst); + + if (ip6_del_rt(ifp->rt)) + dst_free(&ifp->rt->dst); + break; + } +} + +static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) +{ + rcu_read_lock_bh(); + if (likely(ifp->idev->dead == 0)) + __ipv6_ifa_notify(event, ifp); + rcu_read_unlock_bh(); +} + +#ifdef CONFIG_SYSCTL + +static +int addrconf_sysctl_forward(ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + loff_t pos = *ppos; + int ret; + + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + + if (write) + ret = addrconf_fixup_forwarding(ctl, valp, val); + if (ret) + *ppos = pos; + return ret; +} + +static void dev_disable_change(struct inet6_dev *idev) +{ + if (!idev || !idev->dev) + return; + + if (idev->cnf.disable_ipv6) + addrconf_notify(NULL, NETDEV_DOWN, idev->dev); + else + addrconf_notify(NULL, NETDEV_UP, idev->dev); +} + +static void addrconf_disable_change(struct net *net, __s32 newf) +{ + struct net_device *dev; + struct inet6_dev *idev; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + idev = __in6_dev_get(dev); + if (idev) { + int changed = (!idev->cnf.disable_ipv6) ^ (!newf); + idev->cnf.disable_ipv6 = newf; + if (changed) + dev_disable_change(idev); + } + } + rcu_read_unlock(); +} + +static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int old) +{ + struct net *net; + + net = (struct net *)table->extra2; + + if (p == &net->ipv6.devconf_dflt->disable_ipv6) + return 0; + + if (!rtnl_trylock()) { + /* Restore the original values before restarting */ + *p = old; + return restart_syscall(); + } + + if (p == &net->ipv6.devconf_all->disable_ipv6) { + __s32 newf = net->ipv6.devconf_all->disable_ipv6; + net->ipv6.devconf_dflt->disable_ipv6 = newf; + addrconf_disable_change(net, newf); + } else if ((!*p) ^ (!old)) + dev_disable_change((struct inet6_dev 
*)table->extra1); + + rtnl_unlock(); + return 0; +} + +static +int addrconf_sysctl_disable(ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + loff_t pos = *ppos; + int ret; + + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + + if (write) + ret = addrconf_disable_ipv6(ctl, valp, val); + if (ret) + *ppos = pos; + return ret; +} + +static struct addrconf_sysctl_table +{ + struct ctl_table_header *sysctl_header; + ctl_table addrconf_vars[DEVCONF_MAX+1]; + char *dev_name; +} addrconf_sysctl __read_mostly = { + .sysctl_header = NULL, + .addrconf_vars = { + { + .procname = "forwarding", + .data = &ipv6_devconf.forwarding, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_forward, + }, + { + .procname = "hop_limit", + .data = &ipv6_devconf.hop_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "mtu", + .data = &ipv6_devconf.mtu6, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "accept_ra", + .data = &ipv6_devconf.accept_ra, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "accept_redirects", + .data = &ipv6_devconf.accept_redirects, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "autoconf", + .data = &ipv6_devconf.autoconf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "dad_transmits", + .data = &ipv6_devconf.dad_transmits, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "router_solicitations", + .data = &ipv6_devconf.rtr_solicits, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "router_solicitation_interval", + .data = &ipv6_devconf.rtr_solicit_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "router_solicitation_delay", + .data = &ipv6_devconf.rtr_solicit_delay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "force_mld_version", + .data = &ipv6_devconf.force_mld_version, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_IPV6_PRIVACY + { + .procname = "use_tempaddr", + .data = &ipv6_devconf.use_tempaddr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "temp_valid_lft", + .data = &ipv6_devconf.temp_valid_lft, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "temp_prefered_lft", + .data = &ipv6_devconf.temp_prefered_lft, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "regen_max_retry", + .data = &ipv6_devconf.regen_max_retry, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "max_desync_factor", + .data = &ipv6_devconf.max_desync_factor, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "max_addresses", + .data = &ipv6_devconf.max_addresses, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "accept_ra_defrtr", + .data = &ipv6_devconf.accept_ra_defrtr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "accept_ra_pinfo", + .data = 
&ipv6_devconf.accept_ra_pinfo, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_IPV6_ROUTER_PREF + { + .procname = "accept_ra_rtr_pref", + .data = &ipv6_devconf.accept_ra_rtr_pref, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "router_probe_interval", + .data = &ipv6_devconf.rtr_probe_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, +#ifdef CONFIG_IPV6_ROUTE_INFO + { + .procname = "accept_ra_rt_info_max_plen", + .data = &ipv6_devconf.accept_ra_rt_info_max_plen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#endif + { + .procname = "proxy_ndp", + .data = &ipv6_devconf.proxy_ndp, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "accept_source_route", + .data = &ipv6_devconf.accept_source_route, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + { + .procname = "optimistic_dad", + .data = &ipv6_devconf.optimistic_dad, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + + }, +#endif +#ifdef CONFIG_IPV6_MROUTE + { + .procname = "mc_forwarding", + .data = &ipv6_devconf.mc_forwarding, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "disable_ipv6", + .data = &ipv6_devconf.disable_ipv6, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_disable, + }, + { + .procname = "accept_dad", + .data = &ipv6_devconf.accept_dad, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "force_tllao", + .data = &ipv6_devconf.force_tllao, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + /* sentinel */ + } + }, +}; + +static int __addrconf_sysctl_register(struct net *net, char *dev_name, + struct inet6_dev *idev, struct ipv6_devconf *p) +{ + int i; + struct addrconf_sysctl_table *t; + +#define ADDRCONF_CTL_PATH_DEV 3 + + struct ctl_path addrconf_ctl_path[] = { + { .procname = "net", }, + { .procname = "ipv6", }, + { .procname = "conf", }, + { /* to be set */ }, + { }, + }; + + + t = kmemdup(&addrconf_sysctl, sizeof(*t), GFP_KERNEL); + if (t == NULL) + goto out; + + for (i = 0; t->addrconf_vars[i].data; i++) { + t->addrconf_vars[i].data += (char *)p - (char *)&ipv6_devconf; + t->addrconf_vars[i].extra1 = idev; /* embedded; no ref */ + t->addrconf_vars[i].extra2 = net; + } + + /* + * Make a copy of dev_name, because '.procname' is regarded as const + * by sysctl and we wouldn't want anyone to change it under our feet + * (see SIOCSIFNAME). 
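For reference, once register_net_sysctl_table() runs with the ctl_path built above ({ "net", "ipv6", "conf", <dev_name> }), every addrconf_vars[] entry surfaces as a file under /proc/sys/net/ipv6/conf/<dev_name>/. A minimal userspace sketch, with the interface name "eth0" and the "accept_ra" knob chosen purely as examples (writing the 0644 entries requires root), reads and sets one of them:

    #include <stdio.h>

    /* Read and set a per-device IPv6 conf knob through procfs.  The
     * interface name and the knob are illustrative; any entry of the
     * addrconf_vars[] table registered above can be addressed this way. */
    static int read_knob(const char *ifname, const char *knob)
    {
        char path[256];
        FILE *f;
        int val = -1;

        snprintf(path, sizeof(path), "/proc/sys/net/ipv6/conf/%s/%s",
                 ifname, knob);
        f = fopen(path, "r");
        if (!f)
            return -1;
        if (fscanf(f, "%d", &val) != 1)
            val = -1;
        fclose(f);
        return val;
    }

    static int write_knob(const char *ifname, const char *knob, int val)
    {
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path), "/proc/sys/net/ipv6/conf/%s/%s",
                 ifname, knob);
        f = fopen(path, "w");   /* needs root for the 0644 entries */
        if (!f)
            return -1;
        fprintf(f, "%d\n", val);
        fclose(f);
        return 0;
    }

    int main(void)
    {
        printf("eth0 accept_ra = %d\n", read_knob("eth0", "accept_ra"));
        write_knob("eth0", "accept_ra", 1);
        return 0;
    }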
+ */ + t->dev_name = kstrdup(dev_name, GFP_KERNEL); + if (!t->dev_name) + goto free; + + addrconf_ctl_path[ADDRCONF_CTL_PATH_DEV].procname = t->dev_name; + + t->sysctl_header = register_net_sysctl_table(net, addrconf_ctl_path, + t->addrconf_vars); + if (t->sysctl_header == NULL) + goto free_procname; + + p->sysctl = t; + return 0; + +free_procname: + kfree(t->dev_name); +free: + kfree(t); +out: + return -ENOBUFS; +} + +static void __addrconf_sysctl_unregister(struct ipv6_devconf *p) +{ + struct addrconf_sysctl_table *t; + + if (p->sysctl == NULL) + return; + + t = p->sysctl; + p->sysctl = NULL; + unregister_net_sysctl_table(t->sysctl_header); + kfree(t->dev_name); + kfree(t); +} + +static void addrconf_sysctl_register(struct inet6_dev *idev) +{ + neigh_sysctl_register(idev->dev, idev->nd_parms, "ipv6", + &ndisc_ifinfo_sysctl_change); + __addrconf_sysctl_register(dev_net(idev->dev), idev->dev->name, + idev, &idev->cnf); +} + +static void addrconf_sysctl_unregister(struct inet6_dev *idev) +{ + __addrconf_sysctl_unregister(&idev->cnf); + neigh_sysctl_unregister(idev->nd_parms); +} + + +#endif + +static int __net_init addrconf_init_net(struct net *net) +{ + int err; + struct ipv6_devconf *all, *dflt; + + err = -ENOMEM; + all = &ipv6_devconf; + dflt = &ipv6_devconf_dflt; + + if (!net_eq(net, &init_net)) { + all = kmemdup(all, sizeof(ipv6_devconf), GFP_KERNEL); + if (all == NULL) + goto err_alloc_all; + + dflt = kmemdup(dflt, sizeof(ipv6_devconf_dflt), GFP_KERNEL); + if (dflt == NULL) + goto err_alloc_dflt; + } else { + /* these will be inherited by all namespaces */ + dflt->autoconf = ipv6_defaults.autoconf; + dflt->disable_ipv6 = ipv6_defaults.disable_ipv6; + } + + net->ipv6.devconf_all = all; + net->ipv6.devconf_dflt = dflt; + +#ifdef CONFIG_SYSCTL + err = __addrconf_sysctl_register(net, "all", NULL, all); + if (err < 0) + goto err_reg_all; + + err = __addrconf_sysctl_register(net, "default", NULL, dflt); + if (err < 0) + goto err_reg_dflt; +#endif + return 0; + +#ifdef CONFIG_SYSCTL +err_reg_dflt: + __addrconf_sysctl_unregister(all); +err_reg_all: + kfree(dflt); +#endif +err_alloc_dflt: + kfree(all); +err_alloc_all: + return err; +} + +static void __net_exit addrconf_exit_net(struct net *net) +{ +#ifdef CONFIG_SYSCTL + __addrconf_sysctl_unregister(net->ipv6.devconf_dflt); + __addrconf_sysctl_unregister(net->ipv6.devconf_all); +#endif + if (!net_eq(net, &init_net)) { + kfree(net->ipv6.devconf_dflt); + kfree(net->ipv6.devconf_all); + } +} + +static struct pernet_operations addrconf_ops = { + .init = addrconf_init_net, + .exit = addrconf_exit_net, +}; + +/* + * Device notifier + */ + +int register_inet6addr_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&inet6addr_chain, nb); +} +EXPORT_SYMBOL(register_inet6addr_notifier); + +int unregister_inet6addr_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&inet6addr_chain, nb); +} +EXPORT_SYMBOL(unregister_inet6addr_notifier); + +static struct rtnl_af_ops inet6_ops = { + .family = AF_INET6, + .fill_link_af = inet6_fill_link_af, + .get_link_af_size = inet6_get_link_af_size, +}; + +/* + * Init / cleanup code + */ + +int __init addrconf_init(void) +{ + int i, err; + + err = ipv6_addr_label_init(); + if (err < 0) { + printk(KERN_CRIT "IPv6 Addrconf:" + " cannot initialize default policy table: %d.\n", err); + goto out; + } + + err = register_pernet_subsys(&addrconf_ops); + if (err < 0) + goto out_addrlabel; + + /* The addrconf netdev notifier requires that loopback_dev + * has it's ipv6 
private information allocated and setup + * before it can bring up and give link-local addresses + * to other devices which are up. + * + * Unfortunately, loopback_dev is not necessarily the first + * entry in the global dev_base list of net devices. In fact, + * it is likely to be the very last entry on that list. + * So this causes the notifier registry below to try and + * give link-local addresses to all devices besides loopback_dev + * first, then loopback_dev, which cases all the non-loopback_dev + * devices to fail to get a link-local address. + * + * So, as a temporary fix, allocate the ipv6 structure for + * loopback_dev first by hand. + * Longer term, all of the dependencies ipv6 has upon the loopback + * device and it being up should be removed. + */ + rtnl_lock(); + if (!ipv6_add_dev(init_net.loopback_dev)) + err = -ENOMEM; + rtnl_unlock(); + if (err) + goto errlo; + + for (i = 0; i < IN6_ADDR_HSIZE; i++) + INIT_HLIST_HEAD(&inet6_addr_lst[i]); + + register_netdevice_notifier(&ipv6_dev_notf); + + addrconf_verify(0); + + err = rtnl_af_register(&inet6_ops); + if (err < 0) + goto errout_af; + + err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo); + if (err < 0) + goto errout; + + /* Only the first call to __rtnl_register can fail */ + __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL); + __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL); + __rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr, inet6_dump_ifaddr); + __rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL, inet6_dump_ifmcaddr); + __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL, inet6_dump_ifacaddr); + + ipv6_addr_label_rtnl_register(); + + return 0; +errout: + rtnl_af_unregister(&inet6_ops); +errout_af: + unregister_netdevice_notifier(&ipv6_dev_notf); +errlo: + unregister_pernet_subsys(&addrconf_ops); +out_addrlabel: + ipv6_addr_label_cleanup(); +out: + return err; +} + +void addrconf_cleanup(void) +{ + struct net_device *dev; + int i; + + unregister_netdevice_notifier(&ipv6_dev_notf); + unregister_pernet_subsys(&addrconf_ops); + ipv6_addr_label_cleanup(); + + rtnl_lock(); + + __rtnl_af_unregister(&inet6_ops); + + /* clean dev list */ + for_each_netdev(&init_net, dev) { + if (__in6_dev_get(dev) == NULL) + continue; + addrconf_ifdown(dev, 1); + } + addrconf_ifdown(init_net.loopback_dev, 2); + + /* + * Check hash table. + */ + spin_lock_bh(&addrconf_hash_lock); + for (i = 0; i < IN6_ADDR_HSIZE; i++) + WARN_ON(!hlist_empty(&inet6_addr_lst[i])); + spin_unlock_bh(&addrconf_hash_lock); + + del_timer(&addr_chk_timer); + rtnl_unlock(); +} diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c new file mode 100644 index 00000000..6b038265 --- /dev/null +++ b/net/ipv6/addrconf_core.c @@ -0,0 +1,79 @@ +/* + * IPv6 library code, needed by static components when full IPv6 support is + * not configured or static. 
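addrconf_init() above wires inet6_dump_ifaddr() to RTM_GETADDR (and inet6_dump_ifinfo() to RTM_GETLINK), while inet6_ifinfo_notify() and inet6_prefix_notify() broadcast to the RTNLGRP_IPV6_* groups. The dump side can be exercised from userspace with a plain rtnetlink request; a minimal sketch (error handling mostly omitted):

    #include <stdio.h>
    #include <unistd.h>
    #include <arpa/inet.h>
    #include <sys/socket.h>
    #include <linux/netlink.h>
    #include <linux/rtnetlink.h>
    #include <linux/if_addr.h>

    /* Dump all IPv6 addresses: the kernel side of this request is
     * inet6_dump_ifaddr(), registered for RTM_GETADDR above. */
    int main(void)
    {
        struct {
            struct nlmsghdr nlh;
            struct ifaddrmsg ifa;
        } req = {
            .nlh = {
                .nlmsg_len   = NLMSG_LENGTH(sizeof(struct ifaddrmsg)),
                .nlmsg_type  = RTM_GETADDR,
                .nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
                .nlmsg_seq   = 1,
            },
            .ifa = { .ifa_family = AF_INET6 },
        };
        struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
        char buf[16384];
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

        sendto(fd, &req, req.nlh.nlmsg_len, 0,
               (struct sockaddr *)&sa, sizeof(sa));

        for (;;) {
            ssize_t len = recv(fd, buf, sizeof(buf), 0);
            struct nlmsghdr *nlh;

            if (len <= 0)
                break;
            for (nlh = (struct nlmsghdr *)buf; NLMSG_OK(nlh, len);
                 nlh = NLMSG_NEXT(nlh, len)) {
                struct ifaddrmsg *ifa;
                struct rtattr *rta;
                int alen;

                if (nlh->nlmsg_type == NLMSG_DONE)
                    goto out;
                if (nlh->nlmsg_type != RTM_NEWADDR)
                    continue;
                ifa = NLMSG_DATA(nlh);
                rta = IFA_RTA(ifa);
                alen = IFA_PAYLOAD(nlh);
                for (; RTA_OK(rta, alen); rta = RTA_NEXT(rta, alen)) {
                    char addr[INET6_ADDRSTRLEN];

                    if (rta->rta_type != IFA_ADDRESS)
                        continue;
                    inet_ntop(AF_INET6, RTA_DATA(rta), addr, sizeof(addr));
                    printf("if %u: %s/%d\n", ifa->ifa_index,
                           addr, ifa->ifa_prefixlen);
                }
            }
        }
    out:
        close(fd);
        return 0;
    }

The same NETLINK_ROUTE socket, bound to the RTNLGRP_IPV6_IFADDR or RTNLGRP_IPV6_PREFIX group, would receive the notifications emitted by the notify helpers above.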
+ */ + +#include + +#define IPV6_ADDR_SCOPE_TYPE(scope) ((scope) << 16) + +static inline unsigned ipv6_addr_scope2type(unsigned scope) +{ + switch(scope) { + case IPV6_ADDR_SCOPE_NODELOCAL: + return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_NODELOCAL) | + IPV6_ADDR_LOOPBACK); + case IPV6_ADDR_SCOPE_LINKLOCAL: + return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL) | + IPV6_ADDR_LINKLOCAL); + case IPV6_ADDR_SCOPE_SITELOCAL: + return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL) | + IPV6_ADDR_SITELOCAL); + } + return IPV6_ADDR_SCOPE_TYPE(scope); +} + +int __ipv6_addr_type(const struct in6_addr *addr) +{ + __be32 st; + + st = addr->s6_addr32[0]; + + /* Consider all addresses with the first three bits different of + 000 and 111 as unicasts. + */ + if ((st & htonl(0xE0000000)) != htonl(0x00000000) && + (st & htonl(0xE0000000)) != htonl(0xE0000000)) + return (IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); + + if ((st & htonl(0xFF000000)) == htonl(0xFF000000)) { + /* multicast */ + /* addr-select 3.1 */ + return (IPV6_ADDR_MULTICAST | + ipv6_addr_scope2type(IPV6_ADDR_MC_SCOPE(addr))); + } + + if ((st & htonl(0xFFC00000)) == htonl(0xFE800000)) + return (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.1 */ + if ((st & htonl(0xFFC00000)) == htonl(0xFEC00000)) + return (IPV6_ADDR_SITELOCAL | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL)); /* addr-select 3.1 */ + if ((st & htonl(0xFE000000)) == htonl(0xFC000000)) + return (IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* RFC 4193 */ + + if ((addr->s6_addr32[0] | addr->s6_addr32[1]) == 0) { + if (addr->s6_addr32[2] == 0) { + if (addr->s6_addr32[3] == 0) + return IPV6_ADDR_ANY; + + if (addr->s6_addr32[3] == htonl(0x00000001)) + return (IPV6_ADDR_LOOPBACK | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.4 */ + + return (IPV6_ADDR_COMPATv4 | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ + } + + if (addr->s6_addr32[2] == htonl(0x0000ffff)) + return (IPV6_ADDR_MAPPED | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ + } + + return (IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.4 */ +} +EXPORT_SYMBOL(__ipv6_addr_type); + diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c new file mode 100644 index 00000000..c8993e5a --- /dev/null +++ b/net/ipv6/addrlabel.c @@ -0,0 +1,599 @@ +/* + * IPv6 Address Label subsystem + * for the IPv6 "Default" Source Address Selection + * + * Copyright (C)2007 USAGI/WIDE Project + */ +/* + * Author: + * YOSHIFUJI Hideaki @ USAGI/WIDE Project + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if 0 +#define ADDRLABEL(x...) printk(x) +#else +#define ADDRLABEL(x...) 
do { ; } while(0) +#endif + +/* + * Policy Table + */ +struct ip6addrlbl_entry +{ +#ifdef CONFIG_NET_NS + struct net *lbl_net; +#endif + struct in6_addr prefix; + int prefixlen; + int ifindex; + int addrtype; + u32 label; + struct hlist_node list; + atomic_t refcnt; + struct rcu_head rcu; +}; + +static struct ip6addrlbl_table +{ + struct hlist_head head; + spinlock_t lock; + u32 seq; +} ip6addrlbl_table; + +static inline +struct net *ip6addrlbl_net(const struct ip6addrlbl_entry *lbl) +{ + return read_pnet(&lbl->lbl_net); +} + +/* + * Default policy table (RFC3484 + extensions) + * + * prefix addr_type label + * ------------------------------------------------------------------------- + * ::1/128 LOOPBACK 0 + * ::/0 N/A 1 + * 2002::/16 N/A 2 + * ::/96 COMPATv4 3 + * ::ffff:0:0/96 V4MAPPED 4 + * fc00::/7 N/A 5 ULA (RFC 4193) + * 2001::/32 N/A 6 Teredo (RFC 4380) + * 2001:10::/28 N/A 7 ORCHID (RFC 4843) + * + * Note: 0xffffffff is used if we do not have any policies. + */ + +#define IPV6_ADDR_LABEL_DEFAULT 0xffffffffUL + +static const __net_initdata struct ip6addrlbl_init_table +{ + const struct in6_addr *prefix; + int prefixlen; + u32 label; +} ip6addrlbl_init_table[] = { + { /* ::/0 */ + .prefix = &in6addr_any, + .label = 1, + },{ /* fc00::/7 */ + .prefix = &(struct in6_addr){{{ 0xfc }}}, + .prefixlen = 7, + .label = 5, + },{ /* 2002::/16 */ + .prefix = &(struct in6_addr){{{ 0x20, 0x02 }}}, + .prefixlen = 16, + .label = 2, + },{ /* 2001::/32 */ + .prefix = &(struct in6_addr){{{ 0x20, 0x01 }}}, + .prefixlen = 32, + .label = 6, + },{ /* 2001:10::/28 */ + .prefix = &(struct in6_addr){{{ 0x20, 0x01, 0x00, 0x10 }}}, + .prefixlen = 28, + .label = 7, + },{ /* ::ffff:0:0 */ + .prefix = &(struct in6_addr){{{ [10] = 0xff, [11] = 0xff }}}, + .prefixlen = 96, + .label = 4, + },{ /* ::/96 */ + .prefix = &in6addr_any, + .prefixlen = 96, + .label = 3, + },{ /* ::1/128 */ + .prefix = &in6addr_loopback, + .prefixlen = 128, + .label = 0, + } +}; + +/* Object management */ +static inline void ip6addrlbl_free(struct ip6addrlbl_entry *p) +{ +#ifdef CONFIG_NET_NS + release_net(p->lbl_net); +#endif + kfree(p); +} + +static void ip6addrlbl_free_rcu(struct rcu_head *h) +{ + ip6addrlbl_free(container_of(h, struct ip6addrlbl_entry, rcu)); +} + +static inline int ip6addrlbl_hold(struct ip6addrlbl_entry *p) +{ + return atomic_inc_not_zero(&p->refcnt); +} + +static inline void ip6addrlbl_put(struct ip6addrlbl_entry *p) +{ + if (atomic_dec_and_test(&p->refcnt)) + call_rcu(&p->rcu, ip6addrlbl_free_rcu); +} + +/* Find label */ +static int __ip6addrlbl_match(struct net *net, + struct ip6addrlbl_entry *p, + const struct in6_addr *addr, + int addrtype, int ifindex) +{ + if (!net_eq(ip6addrlbl_net(p), net)) + return 0; + if (p->ifindex && p->ifindex != ifindex) + return 0; + if (p->addrtype && p->addrtype != addrtype) + return 0; + if (!ipv6_prefix_equal(addr, &p->prefix, p->prefixlen)) + return 0; + return 1; +} + +static struct ip6addrlbl_entry *__ipv6_addr_label(struct net *net, + const struct in6_addr *addr, + int type, int ifindex) +{ + struct hlist_node *pos; + struct ip6addrlbl_entry *p; + hlist_for_each_entry_rcu(p, pos, &ip6addrlbl_table.head, list) { + if (__ip6addrlbl_match(net, p, addr, type, ifindex)) + return p; + } + return NULL; +} + +u32 ipv6_addr_label(struct net *net, + const struct in6_addr *addr, int type, int ifindex) +{ + u32 label; + struct ip6addrlbl_entry *p; + + type &= IPV6_ADDR_MAPPED | IPV6_ADDR_COMPATv4 | IPV6_ADDR_LOOPBACK; + + rcu_read_lock(); + p = __ipv6_addr_label(net, addr, type, ifindex); 
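__ipv6_addr_type() in addrconf_core.c above classifies an address purely from its leading bits, and ipv6_addr_label() here narrows that type to MAPPED/COMPATv4/LOOPBACK before matching the policy table. A simplified userspace mirror of those bit tests, covering only the cases spelled out above, might look like:

    #include <stdio.h>
    #include <arpa/inet.h>
    #include <netinet/in.h>

    /* Simplified userspace mirror of __ipv6_addr_type(): classify an
     * address by its leading bits, covering only the cases above. */
    static const char *classify(const struct in6_addr *a)
    {
        /* first three bits neither 000 nor 111: plain global unicast */
        if ((a->s6_addr[0] & 0xe0) != 0x00 && (a->s6_addr[0] & 0xe0) != 0xe0)
            return "global unicast";
        if (a->s6_addr[0] == 0xff)
            return "multicast";                 /* ff00::/8 */
        if (a->s6_addr[0] == 0xfe && (a->s6_addr[1] & 0xc0) == 0x80)
            return "link-local unicast";        /* fe80::/10 */
        if (a->s6_addr[0] == 0xfe && (a->s6_addr[1] & 0xc0) == 0xc0)
            return "site-local unicast";        /* fec0::/10 */
        if ((a->s6_addr[0] & 0xfe) == 0xfc)
            return "unique-local (RFC 4193)";   /* fc00::/7 */
        if (IN6_IS_ADDR_UNSPECIFIED(a))
            return "any";
        if (IN6_IS_ADDR_LOOPBACK(a))
            return "loopback";
        if (IN6_IS_ADDR_V4MAPPED(a))
            return "v4-mapped";
        if (IN6_IS_ADDR_V4COMPAT(a))
            return "v4-compatible";
        return "global unicast";
    }

    int main(void)
    {
        const char *samples[] = { "::1", "fe80::1", "ff02::1", "fc00::1",
                                  "::ffff:192.0.2.1", "2001:db8::1" };
        for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
            struct in6_addr a;

            inet_pton(AF_INET6, samples[i], &a);
            printf("%-20s %s\n", samples[i], classify(&a));
        }
        return 0;
    }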
+ label = p ? p->label : IPV6_ADDR_LABEL_DEFAULT; + rcu_read_unlock(); + + ADDRLABEL(KERN_DEBUG "%s(addr=%pI6, type=%d, ifindex=%d) => %08x\n", + __func__, addr, type, ifindex, label); + + return label; +} + +/* allocate one entry */ +static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net, + const struct in6_addr *prefix, + int prefixlen, int ifindex, + u32 label) +{ + struct ip6addrlbl_entry *newp; + int addrtype; + + ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u)\n", + __func__, prefix, prefixlen, ifindex, (unsigned int)label); + + addrtype = ipv6_addr_type(prefix) & (IPV6_ADDR_MAPPED | IPV6_ADDR_COMPATv4 | IPV6_ADDR_LOOPBACK); + + switch (addrtype) { + case IPV6_ADDR_MAPPED: + if (prefixlen > 96) + return ERR_PTR(-EINVAL); + if (prefixlen < 96) + addrtype = 0; + break; + case IPV6_ADDR_COMPATv4: + if (prefixlen != 96) + addrtype = 0; + break; + case IPV6_ADDR_LOOPBACK: + if (prefixlen != 128) + addrtype = 0; + break; + } + + newp = kmalloc(sizeof(*newp), GFP_KERNEL); + if (!newp) + return ERR_PTR(-ENOMEM); + + ipv6_addr_prefix(&newp->prefix, prefix, prefixlen); + newp->prefixlen = prefixlen; + newp->ifindex = ifindex; + newp->addrtype = addrtype; + newp->label = label; + INIT_HLIST_NODE(&newp->list); +#ifdef CONFIG_NET_NS + newp->lbl_net = hold_net(net); +#endif + atomic_set(&newp->refcnt, 1); + return newp; +} + +/* add a label */ +static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) +{ + int ret = 0; + + ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", + __func__, + newp, replace); + + if (hlist_empty(&ip6addrlbl_table.head)) { + hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head); + } else { + struct hlist_node *pos, *n; + struct ip6addrlbl_entry *p = NULL; + hlist_for_each_entry_safe(p, pos, n, + &ip6addrlbl_table.head, list) { + if (p->prefixlen == newp->prefixlen && + net_eq(ip6addrlbl_net(p), ip6addrlbl_net(newp)) && + p->ifindex == newp->ifindex && + ipv6_addr_equal(&p->prefix, &newp->prefix)) { + if (!replace) { + ret = -EEXIST; + goto out; + } + hlist_replace_rcu(&p->list, &newp->list); + ip6addrlbl_put(p); + goto out; + } else if ((p->prefixlen == newp->prefixlen && !p->ifindex) || + (p->prefixlen < newp->prefixlen)) { + hlist_add_before_rcu(&newp->list, &p->list); + goto out; + } + } + hlist_add_after_rcu(&p->list, &newp->list); + } +out: + if (!ret) + ip6addrlbl_table.seq++; + return ret; +} + +/* add a label */ +static int ip6addrlbl_add(struct net *net, + const struct in6_addr *prefix, int prefixlen, + int ifindex, u32 label, int replace) +{ + struct ip6addrlbl_entry *newp; + int ret = 0; + + ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u, replace=%d)\n", + __func__, prefix, prefixlen, ifindex, (unsigned int)label, + replace); + + newp = ip6addrlbl_alloc(net, prefix, prefixlen, ifindex, label); + if (IS_ERR(newp)) + return PTR_ERR(newp); + spin_lock(&ip6addrlbl_table.lock); + ret = __ip6addrlbl_add(newp, replace); + spin_unlock(&ip6addrlbl_table.lock); + if (ret) + ip6addrlbl_free(newp); + return ret; +} + +/* remove a label */ +static int __ip6addrlbl_del(struct net *net, + const struct in6_addr *prefix, int prefixlen, + int ifindex) +{ + struct ip6addrlbl_entry *p = NULL; + struct hlist_node *pos, *n; + int ret = -ESRCH; + + ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", + __func__, prefix, prefixlen, ifindex); + + hlist_for_each_entry_safe(p, pos, n, &ip6addrlbl_table.head, list) { + if (p->prefixlen == prefixlen && + net_eq(ip6addrlbl_net(p), net) && + 
p->ifindex == ifindex && + ipv6_addr_equal(&p->prefix, prefix)) { + hlist_del_rcu(&p->list); + ip6addrlbl_put(p); + ret = 0; + break; + } + } + return ret; +} + +static int ip6addrlbl_del(struct net *net, + const struct in6_addr *prefix, int prefixlen, + int ifindex) +{ + struct in6_addr prefix_buf; + int ret; + + ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", + __func__, prefix, prefixlen, ifindex); + + ipv6_addr_prefix(&prefix_buf, prefix, prefixlen); + spin_lock(&ip6addrlbl_table.lock); + ret = __ip6addrlbl_del(net, &prefix_buf, prefixlen, ifindex); + spin_unlock(&ip6addrlbl_table.lock); + return ret; +} + +/* add default label */ +static int __net_init ip6addrlbl_net_init(struct net *net) +{ + int err = 0; + int i; + + ADDRLABEL(KERN_DEBUG "%s()\n", __func__); + + for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) { + int ret = ip6addrlbl_add(net, + ip6addrlbl_init_table[i].prefix, + ip6addrlbl_init_table[i].prefixlen, + 0, + ip6addrlbl_init_table[i].label, 0); + /* XXX: should we free all rules when we catch an error? */ + if (ret && (!err || err != -ENOMEM)) + err = ret; + } + return err; +} + +static void __net_exit ip6addrlbl_net_exit(struct net *net) +{ + struct ip6addrlbl_entry *p = NULL; + struct hlist_node *pos, *n; + + /* Remove all labels belonging to the exiting net */ + spin_lock(&ip6addrlbl_table.lock); + hlist_for_each_entry_safe(p, pos, n, &ip6addrlbl_table.head, list) { + if (net_eq(ip6addrlbl_net(p), net)) { + hlist_del_rcu(&p->list); + ip6addrlbl_put(p); + } + } + spin_unlock(&ip6addrlbl_table.lock); +} + +static struct pernet_operations ipv6_addr_label_ops = { + .init = ip6addrlbl_net_init, + .exit = ip6addrlbl_net_exit, +}; + +int __init ipv6_addr_label_init(void) +{ + spin_lock_init(&ip6addrlbl_table.lock); + + return register_pernet_subsys(&ipv6_addr_label_ops); +} + +void ipv6_addr_label_cleanup(void) +{ + unregister_pernet_subsys(&ipv6_addr_label_ops); +} + +static const struct nla_policy ifal_policy[IFAL_MAX+1] = { + [IFAL_ADDRESS] = { .len = sizeof(struct in6_addr), }, + [IFAL_LABEL] = { .len = sizeof(u32), }, +}; + +static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh, + void *arg) +{ + struct net *net = sock_net(skb->sk); + struct ifaddrlblmsg *ifal; + struct nlattr *tb[IFAL_MAX+1]; + struct in6_addr *pfx; + u32 label; + int err = 0; + + err = nlmsg_parse(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy); + if (err < 0) + return err; + + ifal = nlmsg_data(nlh); + + if (ifal->ifal_family != AF_INET6 || + ifal->ifal_prefixlen > 128) + return -EINVAL; + + if (!tb[IFAL_ADDRESS]) + return -EINVAL; + + pfx = nla_data(tb[IFAL_ADDRESS]); + if (!pfx) + return -EINVAL; + + if (!tb[IFAL_LABEL]) + return -EINVAL; + label = nla_get_u32(tb[IFAL_LABEL]); + if (label == IPV6_ADDR_LABEL_DEFAULT) + return -EINVAL; + + switch(nlh->nlmsg_type) { + case RTM_NEWADDRLABEL: + if (ifal->ifal_index && + !__dev_get_by_index(net, ifal->ifal_index)) + return -EINVAL; + + err = ip6addrlbl_add(net, pfx, ifal->ifal_prefixlen, + ifal->ifal_index, label, + nlh->nlmsg_flags & NLM_F_REPLACE); + break; + case RTM_DELADDRLABEL: + err = ip6addrlbl_del(net, pfx, ifal->ifal_prefixlen, + ifal->ifal_index); + break; + default: + err = -EOPNOTSUPP; + } + return err; +} + +static inline void ip6addrlbl_putmsg(struct nlmsghdr *nlh, + int prefixlen, int ifindex, u32 lseq) +{ + struct ifaddrlblmsg *ifal = nlmsg_data(nlh); + ifal->ifal_family = AF_INET6; + ifal->ifal_prefixlen = prefixlen; + ifal->ifal_flags = 0; + ifal->ifal_index = ifindex; + ifal->ifal_seq = 
lseq; +}; + +static int ip6addrlbl_fill(struct sk_buff *skb, + struct ip6addrlbl_entry *p, + u32 lseq, + u32 pid, u32 seq, int event, + unsigned int flags) +{ + struct nlmsghdr *nlh = nlmsg_put(skb, pid, seq, event, + sizeof(struct ifaddrlblmsg), flags); + if (!nlh) + return -EMSGSIZE; + + ip6addrlbl_putmsg(nlh, p->prefixlen, p->ifindex, lseq); + + if (nla_put(skb, IFAL_ADDRESS, 16, &p->prefix) < 0 || + nla_put_u32(skb, IFAL_LABEL, p->label) < 0) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + + return nlmsg_end(skb, nlh); +} + +static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct ip6addrlbl_entry *p; + struct hlist_node *pos; + int idx = 0, s_idx = cb->args[0]; + int err; + + rcu_read_lock(); + hlist_for_each_entry_rcu(p, pos, &ip6addrlbl_table.head, list) { + if (idx >= s_idx && + net_eq(ip6addrlbl_net(p), net)) { + if ((err = ip6addrlbl_fill(skb, p, + ip6addrlbl_table.seq, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWADDRLABEL, + NLM_F_MULTI)) <= 0) + break; + } + idx++; + } + rcu_read_unlock(); + cb->args[0] = idx; + return skb->len; +} + +static inline int ip6addrlbl_msgsize(void) +{ + return NLMSG_ALIGN(sizeof(struct ifaddrlblmsg)) + + nla_total_size(16) /* IFAL_ADDRESS */ + + nla_total_size(4); /* IFAL_LABEL */ +} + +static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh, + void *arg) +{ + struct net *net = sock_net(in_skb->sk); + struct ifaddrlblmsg *ifal; + struct nlattr *tb[IFAL_MAX+1]; + struct in6_addr *addr; + u32 lseq; + int err = 0; + struct ip6addrlbl_entry *p; + struct sk_buff *skb; + + err = nlmsg_parse(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy); + if (err < 0) + return err; + + ifal = nlmsg_data(nlh); + + if (ifal->ifal_family != AF_INET6 || + ifal->ifal_prefixlen != 128) + return -EINVAL; + + if (ifal->ifal_index && + !__dev_get_by_index(net, ifal->ifal_index)) + return -EINVAL; + + if (!tb[IFAL_ADDRESS]) + return -EINVAL; + + addr = nla_data(tb[IFAL_ADDRESS]); + if (!addr) + return -EINVAL; + + rcu_read_lock(); + p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); + if (p && ip6addrlbl_hold(p)) + p = NULL; + lseq = ip6addrlbl_table.seq; + rcu_read_unlock(); + + if (!p) { + err = -ESRCH; + goto out; + } + + if (!(skb = nlmsg_new(ip6addrlbl_msgsize(), GFP_KERNEL))) { + ip6addrlbl_put(p); + return -ENOBUFS; + } + + err = ip6addrlbl_fill(skb, p, lseq, + NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, + RTM_NEWADDRLABEL, 0); + + ip6addrlbl_put(p); + + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto out; + } + + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); +out: + return err; +} + +void __init ipv6_addr_label_rtnl_register(void) +{ + __rtnl_register(PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel, NULL); + __rtnl_register(PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel, NULL); + __rtnl_register(PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get, ip6addrlbl_dump); +} + diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c new file mode 100644 index 00000000..7e8340ef --- /dev/null +++ b/net/ipv6/af_inet6.c @@ -0,0 +1,1349 @@ +/* + * PF_INET6 socket protocol family + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * + * Adapted from linux/net/ipv4/af_inet.c + * + * Fixes: + * piggy, Karl Knutson : Socket protocol table + * Hideaki YOSHIFUJI : sin6_scope_id support + * Arnaldo Melo : check proc_net_create return, cleanups + * + * This program is free software; you can redistribute it and/or + * modify it under the 
terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_IPV6_TUNNEL +#include +#endif + +#include +#include +#include + +#ifdef CONFIG_ANDROID_PARANOID_NETWORK +#include + +static inline int current_has_network(void) +{ + return in_egroup_p(AID_INET) || capable(CAP_NET_RAW); +} +#else +static inline int current_has_network(void) +{ + return 1; +} +#endif + +MODULE_AUTHOR("Cast of dozens"); +MODULE_DESCRIPTION("IPv6 protocol stack for Linux"); +MODULE_LICENSE("GPL"); + +/* The inetsw6 table contains everything that inet6_create needs to + * build a new socket. + */ +static struct list_head inetsw6[SOCK_MAX]; +static DEFINE_SPINLOCK(inetsw6_lock); + +struct ipv6_params ipv6_defaults = { + .disable_ipv6 = 0, + .autoconf = 1, +}; + +static int disable_ipv6_mod = 0; + +module_param_named(disable, disable_ipv6_mod, int, 0444); +MODULE_PARM_DESC(disable, "Disable IPv6 module such that it is non-functional"); + +module_param_named(disable_ipv6, ipv6_defaults.disable_ipv6, int, 0444); +MODULE_PARM_DESC(disable_ipv6, "Disable IPv6 on all interfaces"); + +module_param_named(autoconf, ipv6_defaults.autoconf, int, 0444); +MODULE_PARM_DESC(autoconf, "Enable IPv6 address autoconfiguration on all interfaces"); + +static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) +{ + const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo); + + return (struct ipv6_pinfo *)(((u8 *)sk) + offset); +} + +static int inet6_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct inet_sock *inet; + struct ipv6_pinfo *np; + struct sock *sk; + struct inet_protosw *answer; + struct proto *answer_prot; + unsigned char answer_flags; + char answer_no_check; + int try_loading_module = 0; + int err; + + if (!current_has_network()) + return -EACCES; + + if (sock->type != SOCK_RAW && + sock->type != SOCK_DGRAM && + !inet_ehash_secret) + build_ehash_secret(); + + /* Look for the requested type/protocol pair. */ +lookup_protocol: + err = -ESOCKTNOSUPPORT; + rcu_read_lock(); + list_for_each_entry_rcu(answer, &inetsw6[sock->type], list) { + + err = 0; + /* Check the non-wild match. */ + if (protocol == answer->protocol) { + if (protocol != IPPROTO_IP) + break; + } else { + /* Check for the two wild cases. */ + if (IPPROTO_IP == protocol) { + protocol = answer->protocol; + break; + } + if (IPPROTO_IP == answer->protocol) + break; + } + err = -EPROTONOSUPPORT; + } + + if (err) { + if (try_loading_module < 2) { + rcu_read_unlock(); + /* + * Be more specific, e.g. net-pf-10-proto-132-type-1 + * (net-pf-PF_INET6-proto-IPPROTO_SCTP-type-SOCK_STREAM) + */ + if (++try_loading_module == 1) + request_module("net-pf-%d-proto-%d-type-%d", + PF_INET6, protocol, sock->type); + /* + * Fall back to generic, e.g. 
net-pf-10-proto-132 + * (net-pf-PF_INET6-proto-IPPROTO_SCTP) + */ + else + request_module("net-pf-%d-proto-%d", + PF_INET6, protocol); + goto lookup_protocol; + } else + goto out_rcu_unlock; + } + + err = -EPERM; + if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) + goto out_rcu_unlock; + + sock->ops = answer->ops; + answer_prot = answer->prot; + answer_no_check = answer->no_check; + answer_flags = answer->flags; + rcu_read_unlock(); + + WARN_ON(answer_prot->slab == NULL); + + err = -ENOBUFS; + sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot); + if (sk == NULL) + goto out; + + sock_init_data(sock, sk); + + err = 0; + sk->sk_no_check = answer_no_check; + if (INET_PROTOSW_REUSE & answer_flags) + sk->sk_reuse = 1; + + inet = inet_sk(sk); + inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; + + if (SOCK_RAW == sock->type) { + inet->inet_num = protocol; + if (IPPROTO_RAW == protocol) + inet->hdrincl = 1; + } + + sk->sk_destruct = inet_sock_destruct; + sk->sk_family = PF_INET6; + sk->sk_protocol = protocol; + + sk->sk_backlog_rcv = answer->prot->backlog_rcv; + + inet_sk(sk)->pinet6 = np = inet6_sk_generic(sk); + np->hop_limit = -1; + np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; + np->mc_loop = 1; + np->pmtudisc = IPV6_PMTUDISC_WANT; + np->ipv6only = net->ipv6.sysctl.bindv6only; + + /* Init the ipv4 part of the socket since we can have sockets + * using v6 API for ipv4. + */ + inet->uc_ttl = -1; + + inet->mc_loop = 1; + inet->mc_ttl = 1; + inet->mc_index = 0; + inet->mc_list = NULL; + + if (ipv4_config.no_pmtu_disc) + inet->pmtudisc = IP_PMTUDISC_DONT; + else + inet->pmtudisc = IP_PMTUDISC_WANT; + /* + * Increment only the relevant sk_prot->socks debug field, this changes + * the previous behaviour of incrementing both the equivalent to + * answer->prot->socks (inet6_sock_nr) and inet_sock_nr. + * + * This allows better debug granularity as we'll know exactly how many + * UDPv6, TCPv6, etc socks were allocated, not the sum of all IPv6 + * transport protocol socks. -acme + */ + sk_refcnt_debug_inc(sk); + + if (inet->inet_num) { + /* It assumes that any protocol which allows + * the user to assign a number at socket + * creation time automatically shares. + */ + inet->inet_sport = htons(inet->inet_num); + sk->sk_prot->hash(sk); + } + if (sk->sk_prot->init) { + err = sk->sk_prot->init(sk); + if (err) { + sk_common_release(sk); + goto out; + } + } +out: + return err; +out_rcu_unlock: + rcu_read_unlock(); + goto out; +} + + +/* bind for INET6 API */ +int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_in6 *addr=(struct sockaddr_in6 *)uaddr; + struct sock *sk = sock->sk; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); + __be32 v4addr = 0; + unsigned short snum; + int addr_type = 0; + int err = 0; + + /* If the socket has its own bind function then use it. */ + if (sk->sk_prot->bind) + return sk->sk_prot->bind(sk, uaddr, addr_len); + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + if (addr->sin6_family != AF_INET6) + return -EAFNOSUPPORT; + + addr_type = ipv6_addr_type(&addr->sin6_addr); + if ((addr_type & IPV6_ADDR_MULTICAST) && sock->type == SOCK_STREAM) + return -EINVAL; + + snum = ntohs(addr->sin6_port); + if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) + return -EACCES; + + lock_sock(sk); + + /* Check these errors (active socket, double bind). 
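inet6_create() above takes the np->ipv6only default from the per-namespace bindv6only sysctl, and inet6_bind() enforces the family, length, privileged-port, and v4-mapped checks. A minimal userspace counterpart, using only standard socket calls (the port number is an arbitrary example):

    #include <stdio.h>
    #include <unistd.h>
    #include <netinet/in.h>
    #include <sys/socket.h>

    int main(void)
    {
        /* inet6_create() runs for this call; protocol 0 selects TCP for
         * SOCK_STREAM through the inetsw6[] wildcard match shown above. */
        int fd = socket(AF_INET6, SOCK_STREAM, 0);
        int v6only = 1;
        struct sockaddr_in6 sin6 = {
            .sin6_family = AF_INET6,
            .sin6_port   = htons(8080),       /* arbitrary example port */
            .sin6_addr   = IN6ADDR_ANY_INIT,
        };

        if (fd < 0) {
            perror("socket");
            return 1;
        }

        /* Overrides the np->ipv6only default taken from bindv6only;
         * with it set, v4-mapped addresses are refused by inet6_bind(). */
        setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &v6only, sizeof(v6only));

        /* Handled by inet6_bind() above: family and length checks, the
         * CAP_NET_BIND_SERVICE check for ports below 1024, then
         * sk_prot->get_port(). */
        if (bind(fd, (struct sockaddr *)&sin6, sizeof(sin6)) < 0) {
            perror("bind");
            return 1;
        }
        listen(fd, 16);
        close(fd);
        return 0;
    }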
*/ + if (sk->sk_state != TCP_CLOSE || inet->inet_num) { + err = -EINVAL; + goto out; + } + + /* Check if the address belongs to the host. */ + if (addr_type == IPV6_ADDR_MAPPED) { + int chk_addr_ret; + + /* Binding to v4-mapped address on a v6-only socket + * makes no sense + */ + if (np->ipv6only) { + err = -EINVAL; + goto out; + } + + /* Reproduce AF_INET checks to make the bindings consistent */ + v4addr = addr->sin6_addr.s6_addr32[3]; + chk_addr_ret = inet_addr_type(net, v4addr); + if (!sysctl_ip_nonlocal_bind && + !(inet->freebind || inet->transparent) && + v4addr != htonl(INADDR_ANY) && + chk_addr_ret != RTN_LOCAL && + chk_addr_ret != RTN_MULTICAST && + chk_addr_ret != RTN_BROADCAST) { + err = -EADDRNOTAVAIL; + goto out; + } + } else { + if (addr_type != IPV6_ADDR_ANY) { + struct net_device *dev = NULL; + + rcu_read_lock(); + if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (addr_len >= sizeof(struct sockaddr_in6) && + addr->sin6_scope_id) { + /* Override any existing binding, if another one + * is supplied by user. + */ + sk->sk_bound_dev_if = addr->sin6_scope_id; + } + + /* Binding to link-local address requires an interface */ + if (!sk->sk_bound_dev_if) { + err = -EINVAL; + goto out_unlock; + } + dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); + if (!dev) { + err = -ENODEV; + goto out_unlock; + } + } + + /* ipv4 addr of the socket is invalid. Only the + * unspecified and mapped address have a v4 equivalent. + */ + v4addr = LOOPBACK4_IPV6; + if (!(addr_type & IPV6_ADDR_MULTICAST)) { + if (!inet->transparent && + !ipv6_chk_addr(net, &addr->sin6_addr, + dev, 0)) { + err = -EADDRNOTAVAIL; + goto out_unlock; + } + } + rcu_read_unlock(); + } + } + + inet->inet_rcv_saddr = v4addr; + inet->inet_saddr = v4addr; + + ipv6_addr_copy(&np->rcv_saddr, &addr->sin6_addr); + + if (!(addr_type & IPV6_ADDR_MULTICAST)) + ipv6_addr_copy(&np->saddr, &addr->sin6_addr); + + /* Make sure we are allowed to bind here. */ + if (sk->sk_prot->get_port(sk, snum)) { + inet_reset_saddr(sk); + err = -EADDRINUSE; + goto out; + } + + if (addr_type != IPV6_ADDR_ANY) { + sk->sk_userlocks |= SOCK_BINDADDR_LOCK; + if (addr_type != IPV6_ADDR_MAPPED) + np->ipv6only = 1; + } + if (snum) + sk->sk_userlocks |= SOCK_BINDPORT_LOCK; + inet->inet_sport = htons(inet->inet_num); + inet->inet_dport = 0; + inet->inet_daddr = 0; +out: + release_sock(sk); + return err; +out_unlock: + rcu_read_unlock(); + goto out; +} + +EXPORT_SYMBOL(inet6_bind); + +int inet6_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (sk == NULL) + return -EINVAL; + + /* Free mc lists */ + ipv6_sock_mc_close(sk); + + /* Free ac lists */ + ipv6_sock_ac_close(sk); + + return inet_release(sock); +} + +EXPORT_SYMBOL(inet6_release); + +void inet6_destroy_sock(struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *skb; + struct ipv6_txoptions *opt; + + /* Release rx options */ + + if ((skb = xchg(&np->pktoptions, NULL)) != NULL) + kfree_skb(skb); + + if ((skb = xchg(&np->rxpmtu, NULL)) != NULL) + kfree_skb(skb); + + /* Free flowlabels */ + fl6_free_socklist(sk); + + /* Free tx options */ + + if ((opt = xchg(&np->opt, NULL)) != NULL) + sock_kfree_s(sk, opt, opt->tot_len); +} + +EXPORT_SYMBOL_GPL(inet6_destroy_sock); + +/* + * This does both peername and sockname. 
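As the bind path above shows, a link-local address can only be bound once the socket has sk_bound_dev_if set, either through a prior SO_BINDTODEVICE or by passing a non-zero sin6_scope_id. A sketch of the latter, with a placeholder interface name and address:

    #include <stdio.h>
    #include <unistd.h>
    #include <net/if.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>
    #include <sys/socket.h>

    int main(void)
    {
        int fd = socket(AF_INET6, SOCK_DGRAM, 0);
        struct sockaddr_in6 sin6 = {
            .sin6_family = AF_INET6,
            .sin6_port   = htons(5000),        /* arbitrary example port */
        };

        /* Address and interface are placeholders; a real link-local
         * address of the chosen interface must be used. */
        inet_pton(AF_INET6, "fe80::1", &sin6.sin6_addr);
        sin6.sin6_scope_id = if_nametoindex("eth0");

        /* Without a non-zero sin6_scope_id (or a prior SO_BINDTODEVICE),
         * inet6_bind() above rejects link-local binds with EINVAL. */
        if (bind(fd, (struct sockaddr *)&sin6, sizeof(sin6)) < 0)
            perror("bind");
        close(fd);
        return 0;
    }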
+ */ + +int inet6_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sockaddr_in6 *sin=(struct sockaddr_in6 *)uaddr; + struct sock *sk = sock->sk; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + + sin->sin6_family = AF_INET6; + sin->sin6_flowinfo = 0; + sin->sin6_scope_id = 0; + if (peer) { + if (!inet->inet_dport) + return -ENOTCONN; + if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && + peer == 1) + return -ENOTCONN; + sin->sin6_port = inet->inet_dport; + ipv6_addr_copy(&sin->sin6_addr, &np->daddr); + if (np->sndflow) + sin->sin6_flowinfo = np->flow_label; + } else { + if (ipv6_addr_any(&np->rcv_saddr)) + ipv6_addr_copy(&sin->sin6_addr, &np->saddr); + else + ipv6_addr_copy(&sin->sin6_addr, &np->rcv_saddr); + + sin->sin6_port = inet->inet_sport; + } + if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin->sin6_scope_id = sk->sk_bound_dev_if; + *uaddr_len = sizeof(*sin); + return 0; +} + +EXPORT_SYMBOL(inet6_getname); + +int inet6_killaddr_ioctl(struct net *net, void __user *arg) { + struct in6_ifreq ireq; + struct sockaddr_in6 sin6; + + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + + if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) + return -EFAULT; + + sin6.sin6_family = AF_INET6; + ipv6_addr_copy(&sin6.sin6_addr, &ireq.ifr6_addr); + return tcp_nuke_addr(net, (struct sockaddr *) &sin6); +} + +int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + + switch(cmd) + { + case SIOCGSTAMP: + return sock_get_timestamp(sk, (struct timeval __user *)arg); + + case SIOCGSTAMPNS: + return sock_get_timestampns(sk, (struct timespec __user *)arg); + + case SIOCADDRT: + case SIOCDELRT: + + return ipv6_route_ioctl(net, cmd, (void __user *)arg); + + case SIOCSIFADDR: + return addrconf_add_ifaddr(net, (void __user *) arg); + case SIOCDIFADDR: + return addrconf_del_ifaddr(net, (void __user *) arg); + case SIOCSIFDSTADDR: + return addrconf_set_dstaddr(net, (void __user *) arg); + case SIOCKILLADDR: + return inet6_killaddr_ioctl(net, (void __user *) arg); + default: + if (!sk->sk_prot->ioctl) + return -ENOIOCTLCMD; + return sk->sk_prot->ioctl(sk, cmd, arg); + } + /*NOTREACHED*/ + return 0; +} + +EXPORT_SYMBOL(inet6_ioctl); + +const struct proto_ops inet6_stream_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = inet6_bind, + .connect = inet_stream_connect, /* ok */ + .socketpair = sock_no_socketpair, /* a do nothing */ + .accept = inet_accept, /* ok */ + .getname = inet6_getname, + .poll = tcp_poll, /* ok */ + .ioctl = inet6_ioctl, /* must change */ + .listen = inet_listen, /* ok */ + .shutdown = inet_shutdown, /* ok */ + .setsockopt = sock_common_setsockopt, /* ok */ + .getsockopt = sock_common_getsockopt, /* ok */ + .sendmsg = inet_sendmsg, /* ok */ + .recvmsg = inet_recvmsg, /* ok */ + .mmap = sock_no_mmap, + .sendpage = inet_sendpage, + .splice_read = tcp_splice_read, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, +#endif +}; + +const struct proto_ops inet6_dgram_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = inet6_bind, + .connect = inet_dgram_connect, /* ok */ + .socketpair = sock_no_socketpair, /* a do nothing */ + .accept = sock_no_accept, /* a do nothing */ + .getname = inet6_getname, + .poll = udp_poll, /* ok */ + .ioctl = inet6_ioctl, /* 
must change */ + .listen = sock_no_listen, /* ok */ + .shutdown = inet_shutdown, /* ok */ + .setsockopt = sock_common_setsockopt, /* ok */ + .getsockopt = sock_common_getsockopt, /* ok */ + .sendmsg = inet_sendmsg, /* ok */ + .recvmsg = inet_recvmsg, /* ok */ + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, +#endif +}; + +static const struct net_proto_family inet6_family_ops = { + .family = PF_INET6, + .create = inet6_create, + .owner = THIS_MODULE, +}; + +int inet6_register_protosw(struct inet_protosw *p) +{ + struct list_head *lh; + struct inet_protosw *answer; + struct list_head *last_perm; + int protocol = p->protocol; + int ret; + + spin_lock_bh(&inetsw6_lock); + + ret = -EINVAL; + if (p->type >= SOCK_MAX) + goto out_illegal; + + /* If we are trying to override a permanent protocol, bail. */ + answer = NULL; + ret = -EPERM; + last_perm = &inetsw6[p->type]; + list_for_each(lh, &inetsw6[p->type]) { + answer = list_entry(lh, struct inet_protosw, list); + + /* Check only the non-wild match. */ + if (INET_PROTOSW_PERMANENT & answer->flags) { + if (protocol == answer->protocol) + break; + last_perm = lh; + } + + answer = NULL; + } + if (answer) + goto out_permanent; + + /* Add the new entry after the last permanent entry if any, so that + * the new entry does not override a permanent entry when matched with + * a wild-card protocol. But it is allowed to override any existing + * non-permanent entry. This means that when we remove this entry, the + * system automatically returns to the old behavior. + */ + list_add_rcu(&p->list, last_perm); + ret = 0; +out: + spin_unlock_bh(&inetsw6_lock); + return ret; + +out_permanent: + printk(KERN_ERR "Attempt to override permanent protocol %d.\n", + protocol); + goto out; + +out_illegal: + printk(KERN_ERR + "Ignoring attempt to register invalid socket type %d.\n", + p->type); + goto out; +} + +EXPORT_SYMBOL(inet6_register_protosw); + +void +inet6_unregister_protosw(struct inet_protosw *p) +{ + if (INET_PROTOSW_PERMANENT & p->flags) { + printk(KERN_ERR + "Attempt to unregister permanent protocol %d.\n", + p->protocol); + } else { + spin_lock_bh(&inetsw6_lock); + list_del_rcu(&p->list); + spin_unlock_bh(&inetsw6_lock); + + synchronize_net(); + } +} + +EXPORT_SYMBOL(inet6_unregister_protosw); + +int inet6_sk_rebuild_header(struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct dst_entry *dst; + + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + struct inet_sock *inet = inet_sk(sk); + struct in6_addr *final_p, final; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = sk->sk_protocol; + ipv6_addr_copy(&fl6.daddr, &np->daddr); + ipv6_addr_copy(&fl6.saddr, &np->saddr); + fl6.flowlabel = np->flow_label; + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_dport = inet->inet_dport; + fl6.fl6_sport = inet->inet_sport; + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + + final_p = fl6_update_dst(&fl6, np->opt, &final); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false); + if (IS_ERR(dst)) { + sk->sk_route_caps = 0; + sk->sk_err_soft = -PTR_ERR(dst); + return PTR_ERR(dst); + } + + __ip6_dst_store(sk, dst, NULL, NULL); + } + + return 0; +} + +EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header); + +int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet6_skb_parm *opt = IP6CB(skb); + 
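The np->rxopt bits tested just below in ipv6_opt_accepted() are switched on from userspace with the RFC 3542 socket options (the setsockopt side lives in ipv6_sockglue.c, outside this hunk). A sketch that requests hop-by-hop and destination options on a UDP socket and walks the returned ancillary data:

    #include <stdio.h>
    #include <unistd.h>
    #include <netinet/in.h>
    #include <sys/socket.h>

    int main(void)
    {
        int fd = socket(AF_INET6, SOCK_DGRAM, 0);
        int on = 1;
        struct sockaddr_in6 sin6 = {
            .sin6_family = AF_INET6,
            .sin6_port   = htons(5000),        /* arbitrary example port */
            .sin6_addr   = IN6ADDR_ANY_INIT,
        };
        char data[2048], cbuf[512];
        struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
        struct msghdr msg = {
            .msg_iov        = &iov,
            .msg_iovlen     = 1,
            .msg_control    = cbuf,
            .msg_controllen = sizeof(cbuf),
        };
        struct cmsghdr *cmsg;

        /* These set the rxopt bits consulted by ipv6_opt_accepted(). */
        setsockopt(fd, IPPROTO_IPV6, IPV6_RECVHOPOPTS, &on, sizeof(on));
        setsockopt(fd, IPPROTO_IPV6, IPV6_RECVDSTOPTS, &on, sizeof(on));

        bind(fd, (struct sockaddr *)&sin6, sizeof(sin6));

        /* Blocks until a datagram arrives on the example port. */
        if (recvmsg(fd, &msg, 0) < 0) {
            perror("recvmsg");
            return 1;
        }

        for (cmsg = CMSG_FIRSTHDR(&msg); cmsg;
             cmsg = CMSG_NXTHDR(&msg, cmsg)) {
            if (cmsg->cmsg_level != IPPROTO_IPV6)
                continue;
            if (cmsg->cmsg_type == IPV6_HOPOPTS)
                printf("got hop-by-hop options, %zu bytes\n",
                       (size_t)cmsg->cmsg_len);
            else if (cmsg->cmsg_type == IPV6_DSTOPTS)
                printf("got destination options, %zu bytes\n",
                       (size_t)cmsg->cmsg_len);
        }
        close(fd);
        return 0;
    }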
+ if (np->rxopt.all) { + if ((opt->hop && (np->rxopt.bits.hopopts || + np->rxopt.bits.ohopopts)) || + ((IPV6_FLOWINFO_MASK & + *(__be32 *)skb_network_header(skb)) && + np->rxopt.bits.rxflow) || + (opt->srcrt && (np->rxopt.bits.srcrt || + np->rxopt.bits.osrcrt)) || + ((opt->dst1 || opt->dst0) && + (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts))) + return 1; + } + return 0; +} + +EXPORT_SYMBOL_GPL(ipv6_opt_accepted); + +static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto) +{ + const struct inet6_protocol *ops = NULL; + + for (;;) { + struct ipv6_opt_hdr *opth; + int len; + + if (proto != NEXTHDR_HOP) { + ops = rcu_dereference(inet6_protos[proto]); + + if (unlikely(!ops)) + break; + + if (!(ops->flags & INET6_PROTO_GSO_EXTHDR)) + break; + } + + if (unlikely(!pskb_may_pull(skb, 8))) + break; + + opth = (void *)skb->data; + len = ipv6_optlen(opth); + + if (unlikely(!pskb_may_pull(skb, len))) + break; + + proto = opth->nexthdr; + __skb_pull(skb, len); + } + + return proto; +} + +static int ipv6_gso_send_check(struct sk_buff *skb) +{ + const struct ipv6hdr *ipv6h; + const struct inet6_protocol *ops; + int err = -EINVAL; + + if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) + goto out; + + ipv6h = ipv6_hdr(skb); + __skb_pull(skb, sizeof(*ipv6h)); + err = -EPROTONOSUPPORT; + + rcu_read_lock(); + ops = rcu_dereference(inet6_protos[ + ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr)]); + + if (likely(ops && ops->gso_send_check)) { + skb_reset_transport_header(skb); + err = ops->gso_send_check(skb); + } + rcu_read_unlock(); + +out: + return err; +} + +static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, u32 features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct ipv6hdr *ipv6h; + const struct inet6_protocol *ops; + int proto; + struct frag_hdr *fptr; + unsigned int unfrag_ip6hlen; + u8 *prevhdr; + int offset = 0; + + if (!(features & NETIF_F_V6_CSUM)) + features &= ~NETIF_F_SG; + + if (unlikely(skb_shinfo(skb)->gso_type & + ~(SKB_GSO_UDP | + SKB_GSO_DODGY | + SKB_GSO_TCP_ECN | + SKB_GSO_TCPV6 | + 0))) + goto out; + + if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) + goto out; + + ipv6h = ipv6_hdr(skb); + __skb_pull(skb, sizeof(*ipv6h)); + segs = ERR_PTR(-EPROTONOSUPPORT); + + proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr); + rcu_read_lock(); + ops = rcu_dereference(inet6_protos[proto]); + if (likely(ops && ops->gso_segment)) { + skb_reset_transport_header(skb); + segs = ops->gso_segment(skb, features); + } + rcu_read_unlock(); + + if (IS_ERR(segs)) + goto out; + + for (skb = segs; skb; skb = skb->next) { + ipv6h = ipv6_hdr(skb); + ipv6h->payload_len = htons(skb->len - skb->mac_len - + sizeof(*ipv6h)); + if (proto == IPPROTO_UDP) { + unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); + fptr = (struct frag_hdr *)(skb_network_header(skb) + + unfrag_ip6hlen); + fptr->frag_off = htons(offset); + if (skb->next != NULL) + fptr->frag_off |= htons(IP6_MF); + offset += (ntohs(ipv6h->payload_len) - + sizeof(struct frag_hdr)); + } + } + +out: + return segs; +} + +struct ipv6_gro_cb { + struct napi_gro_cb napi; + int proto; +}; + +#define IPV6_GRO_CB(skb) ((struct ipv6_gro_cb *)(skb)->cb) + +static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + const struct inet6_protocol *ops; + struct sk_buff **pp = NULL; + struct sk_buff *p; + struct ipv6hdr *iph; + unsigned int nlen; + unsigned int hlen; + unsigned int off; + int flush = 1; + int proto; + __wsum csum; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*iph); + iph = 
skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + iph = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!iph)) + goto out; + } + + skb_gro_pull(skb, sizeof(*iph)); + skb_set_transport_header(skb, skb_gro_offset(skb)); + + flush += ntohs(iph->payload_len) != skb_gro_len(skb); + + rcu_read_lock(); + proto = iph->nexthdr; + ops = rcu_dereference(inet6_protos[proto]); + if (!ops || !ops->gro_receive) { + __pskb_pull(skb, skb_gro_offset(skb)); + proto = ipv6_gso_pull_exthdrs(skb, proto); + skb_gro_pull(skb, -skb_transport_offset(skb)); + skb_reset_transport_header(skb); + __skb_push(skb, skb_gro_offset(skb)); + + if (!ops || !ops->gro_receive) + goto out_unlock; + + iph = ipv6_hdr(skb); + } + + IPV6_GRO_CB(skb)->proto = proto; + + flush--; + nlen = skb_network_header_len(skb); + + for (p = *head; p; p = p->next) { + struct ipv6hdr *iph2; + + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + iph2 = ipv6_hdr(p); + + /* All fields must match except length. */ + if (nlen != skb_network_header_len(p) || + memcmp(iph, iph2, offsetof(struct ipv6hdr, payload_len)) || + memcmp(&iph->nexthdr, &iph2->nexthdr, + nlen - offsetof(struct ipv6hdr, nexthdr))) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + NAPI_GRO_CB(p)->flush |= flush; + } + + NAPI_GRO_CB(skb)->flush |= flush; + + csum = skb->csum; + skb_postpull_rcsum(skb, iph, skb_network_header_len(skb)); + + pp = ops->gro_receive(head, skb); + + skb->csum = csum; + +out_unlock: + rcu_read_unlock(); + +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} + +static int ipv6_gro_complete(struct sk_buff *skb) +{ + const struct inet6_protocol *ops; + struct ipv6hdr *iph = ipv6_hdr(skb); + int err = -ENOSYS; + + iph->payload_len = htons(skb->len - skb_network_offset(skb) - + sizeof(*iph)); + + rcu_read_lock(); + ops = rcu_dereference(inet6_protos[IPV6_GRO_CB(skb)->proto]); + if (WARN_ON(!ops || !ops->gro_complete)) + goto out_unlock; + + err = ops->gro_complete(skb); + +out_unlock: + rcu_read_unlock(); + + return err; +} + +static struct packet_type ipv6_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_IPV6), + .func = ipv6_rcv, + .gso_send_check = ipv6_gso_send_check, + .gso_segment = ipv6_gso_segment, + .gro_receive = ipv6_gro_receive, + .gro_complete = ipv6_gro_complete, +}; + +static int __init ipv6_packet_init(void) +{ + dev_add_pack(&ipv6_packet_type); + return 0; +} + +static void ipv6_packet_cleanup(void) +{ + dev_remove_pack(&ipv6_packet_type); +} + +static int __net_init ipv6_init_mibs(struct net *net) +{ + if (snmp_mib_init((void __percpu **)net->mib.udp_stats_in6, + sizeof(struct udp_mib), + __alignof__(struct udp_mib)) < 0) + return -ENOMEM; + if (snmp_mib_init((void __percpu **)net->mib.udplite_stats_in6, + sizeof(struct udp_mib), + __alignof__(struct udp_mib)) < 0) + goto err_udplite_mib; + if (snmp_mib_init((void __percpu **)net->mib.ipv6_statistics, + sizeof(struct ipstats_mib), + __alignof__(struct ipstats_mib)) < 0) + goto err_ip_mib; + if (snmp_mib_init((void __percpu **)net->mib.icmpv6_statistics, + sizeof(struct icmpv6_mib), + __alignof__(struct icmpv6_mib)) < 0) + goto err_icmp_mib; + if (snmp_mib_init((void __percpu **)net->mib.icmpv6msg_statistics, + sizeof(struct icmpv6msg_mib), + __alignof__(struct icmpv6msg_mib)) < 0) + goto err_icmpmsg_mib; + return 0; + +err_icmpmsg_mib: + snmp_mib_free((void __percpu **)net->mib.icmpv6_statistics); +err_icmp_mib: + snmp_mib_free((void __percpu **)net->mib.ipv6_statistics); +err_ip_mib: + snmp_mib_free((void __percpu **)net->mib.udplite_stats_in6); 
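The counters allocated above by ipv6_init_mibs() back the per-namespace statistics that net/ipv6/proc.c (not part of this hunk) exports as /proc/net/snmp6; reading a few of them needs nothing more than:

    #include <stdio.h>
    #include <string.h>

    /* Print a few of the per-namespace IPv6 MIB counters allocated by
     * ipv6_init_mibs(); the procfs exporter lives in net/ipv6/proc.c. */
    int main(void)
    {
        char line[256];
        FILE *f = fopen("/proc/net/snmp6", "r");

        if (!f) {
            perror("fopen");
            return 1;
        }
        while (fgets(line, sizeof(line), f)) {
            if (!strncmp(line, "Ip6InReceives", 13) ||
                !strncmp(line, "Udp6InDatagrams", 15) ||
                !strncmp(line, "Icmp6InMsgs", 11))
                fputs(line, stdout);
        }
        fclose(f);
        return 0;
    }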
+err_udplite_mib: + snmp_mib_free((void __percpu **)net->mib.udp_stats_in6); + return -ENOMEM; +} + +static void ipv6_cleanup_mibs(struct net *net) +{ + snmp_mib_free((void __percpu **)net->mib.udp_stats_in6); + snmp_mib_free((void __percpu **)net->mib.udplite_stats_in6); + snmp_mib_free((void __percpu **)net->mib.ipv6_statistics); + snmp_mib_free((void __percpu **)net->mib.icmpv6_statistics); + snmp_mib_free((void __percpu **)net->mib.icmpv6msg_statistics); +} + +static int __net_init inet6_net_init(struct net *net) +{ + int err = 0; + + net->ipv6.sysctl.bindv6only = 0; + net->ipv6.sysctl.icmpv6_time = 1*HZ; + + err = ipv6_init_mibs(net); + if (err) + return err; +#ifdef CONFIG_PROC_FS + err = udp6_proc_init(net); + if (err) + goto out; + err = tcp6_proc_init(net); + if (err) + goto proc_tcp6_fail; + err = ac6_proc_init(net); + if (err) + goto proc_ac6_fail; +#endif + return err; + +#ifdef CONFIG_PROC_FS +proc_ac6_fail: + tcp6_proc_exit(net); +proc_tcp6_fail: + udp6_proc_exit(net); +out: + ipv6_cleanup_mibs(net); + return err; +#endif +} + +static void __net_exit inet6_net_exit(struct net *net) +{ +#ifdef CONFIG_PROC_FS + udp6_proc_exit(net); + tcp6_proc_exit(net); + ac6_proc_exit(net); +#endif + ipv6_cleanup_mibs(net); +} + +static struct pernet_operations inet6_net_ops = { + .init = inet6_net_init, + .exit = inet6_net_exit, +}; + +static int __init inet6_init(void) +{ + struct sk_buff *dummy_skb; + struct list_head *r; + int err = 0; + + BUILD_BUG_ON(sizeof(struct inet6_skb_parm) > sizeof(dummy_skb->cb)); + + /* Register the socket-side information for inet6_create. */ + for(r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r) + INIT_LIST_HEAD(r); + + if (disable_ipv6_mod) { + printk(KERN_INFO + "IPv6: Loaded, but administratively disabled, " + "reboot required to enable\n"); + goto out; + } + + initialize_hashidentrnd(); + + err = proto_register(&tcpv6_prot, 1); + if (err) + goto out; + + err = proto_register(&udpv6_prot, 1); + if (err) + goto out_unregister_tcp_proto; + + err = proto_register(&udplitev6_prot, 1); + if (err) + goto out_unregister_udp_proto; + + err = proto_register(&rawv6_prot, 1); + if (err) + goto out_unregister_udplite_proto; + + + /* We MUST register RAW sockets before we create the ICMP6, + * IGMP6, or NDISC control sockets. + */ + err = rawv6_init(); + if (err) + goto out_unregister_raw_proto; + + /* Register the family here so that the init calls below will + * be able to create sockets. (?? is this dangerous ??) + */ + err = sock_register(&inet6_family_ops); + if (err) + goto out_sock_register_fail; + +#ifdef CONFIG_SYSCTL + err = ipv6_static_sysctl_register(); + if (err) + goto static_sysctl_fail; +#endif + /* + * ipngwg API draft makes clear that the correct semantics + * for TCP and UDP is to consider one TCP and UDP instance + * in a host available by both INET and INET6 APIs and + * able to communicate via both network protocols. + */ + + err = register_pernet_subsys(&inet6_net_ops); + if (err) + goto register_pernet_fail; + err = icmpv6_init(); + if (err) + goto icmp_fail; + err = ip6_mr_init(); + if (err) + goto ipmr_fail; + err = ndisc_init(); + if (err) + goto ndisc_fail; + err = igmp6_init(); + if (err) + goto igmp_fail; + err = ipv6_netfilter_init(); + if (err) + goto netfilter_fail; + /* Create /proc/foo6 entries. 
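+	 * Each *_proc_init() call below is paired with a matching *_proc_exit() + * in the error unwind path and in inet6_exit().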
*/ +#ifdef CONFIG_PROC_FS + err = -ENOMEM; + if (raw6_proc_init()) + goto proc_raw6_fail; + if (udplite6_proc_init()) + goto proc_udplite6_fail; + if (ipv6_misc_proc_init()) + goto proc_misc6_fail; + if (if6_proc_init()) + goto proc_if6_fail; +#endif + err = ip6_route_init(); + if (err) + goto ip6_route_fail; + err = ip6_flowlabel_init(); + if (err) + goto ip6_flowlabel_fail; + err = addrconf_init(); + if (err) + goto addrconf_fail; + + /* Init v6 extension headers. */ + err = ipv6_exthdrs_init(); + if (err) + goto ipv6_exthdrs_fail; + + err = ipv6_frag_init(); + if (err) + goto ipv6_frag_fail; + + /* Init v6 transport protocols. */ + err = udpv6_init(); + if (err) + goto udpv6_fail; + + err = udplitev6_init(); + if (err) + goto udplitev6_fail; + + err = tcpv6_init(); + if (err) + goto tcpv6_fail; + + err = ipv6_packet_init(); + if (err) + goto ipv6_packet_fail; + +#ifdef CONFIG_SYSCTL + err = ipv6_sysctl_register(); + if (err) + goto sysctl_fail; +#endif +out: + return err; + +#ifdef CONFIG_SYSCTL +sysctl_fail: + ipv6_packet_cleanup(); +#endif +ipv6_packet_fail: + tcpv6_exit(); +tcpv6_fail: + udplitev6_exit(); +udplitev6_fail: + udpv6_exit(); +udpv6_fail: + ipv6_frag_exit(); +ipv6_frag_fail: + ipv6_exthdrs_exit(); +ipv6_exthdrs_fail: + addrconf_cleanup(); +addrconf_fail: + ip6_flowlabel_cleanup(); +ip6_flowlabel_fail: + ip6_route_cleanup(); +ip6_route_fail: +#ifdef CONFIG_PROC_FS + if6_proc_exit(); +proc_if6_fail: + ipv6_misc_proc_exit(); +proc_misc6_fail: + udplite6_proc_exit(); +proc_udplite6_fail: + raw6_proc_exit(); +proc_raw6_fail: +#endif + ipv6_netfilter_fini(); +netfilter_fail: + igmp6_cleanup(); +igmp_fail: + ndisc_cleanup(); +ndisc_fail: + ip6_mr_cleanup(); +ipmr_fail: + icmpv6_cleanup(); +icmp_fail: + unregister_pernet_subsys(&inet6_net_ops); +register_pernet_fail: +#ifdef CONFIG_SYSCTL + ipv6_static_sysctl_unregister(); +static_sysctl_fail: +#endif + sock_unregister(PF_INET6); + rtnl_unregister_all(PF_INET6); +out_sock_register_fail: + rawv6_exit(); +out_unregister_raw_proto: + proto_unregister(&rawv6_prot); +out_unregister_udplite_proto: + proto_unregister(&udplitev6_prot); +out_unregister_udp_proto: + proto_unregister(&udpv6_prot); +out_unregister_tcp_proto: + proto_unregister(&tcpv6_prot); + goto out; +} +module_init(inet6_init); + +static void __exit inet6_exit(void) +{ + if (disable_ipv6_mod) + return; + + /* First of all disallow new sockets creation. */ + sock_unregister(PF_INET6); + /* Disallow any further netlink messages */ + rtnl_unregister_all(PF_INET6); + +#ifdef CONFIG_SYSCTL + ipv6_sysctl_unregister(); +#endif + udpv6_exit(); + udplitev6_exit(); + tcpv6_exit(); + + /* Cleanup code parts. */ + ipv6_packet_cleanup(); + ipv6_frag_exit(); + ipv6_exthdrs_exit(); + addrconf_cleanup(); + ip6_flowlabel_cleanup(); + ip6_route_cleanup(); +#ifdef CONFIG_PROC_FS + + /* Cleanup code parts. 
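+	 * The proc entries are removed in the reverse order of their creation + * in inet6_init().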
*/ + if6_proc_exit(); + ipv6_misc_proc_exit(); + udplite6_proc_exit(); + raw6_proc_exit(); +#endif + ipv6_netfilter_fini(); + igmp6_cleanup(); + ndisc_cleanup(); + ip6_mr_cleanup(); + icmpv6_cleanup(); + rawv6_exit(); + + unregister_pernet_subsys(&inet6_net_ops); +#ifdef CONFIG_SYSCTL + ipv6_static_sysctl_unregister(); +#endif + proto_unregister(&rawv6_prot); + proto_unregister(&udplitev6_prot); + proto_unregister(&udpv6_prot); + proto_unregister(&tcpv6_prot); + + rcu_barrier(); /* Wait for completion of call_rcu()'s */ +} +module_exit(inet6_exit); + +MODULE_ALIAS_NETPROTO(PF_INET6); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c new file mode 100644 index 00000000..4c0f894d --- /dev/null +++ b/net/ipv6/ah6.c @@ -0,0 +1,757 @@ +/* + * Copyright (C)2002 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors + * + * Mitsuru KANDA @USAGI : IPv6 Support + * Kazunori MIYAZAWA @USAGI : + * Kunihiro Ishiguro + * + * This file is derived from net/ipv4/ah.c. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IPV6HDR_BASELEN 8 + +struct tmp_ext { +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + struct in6_addr saddr; +#endif + struct in6_addr daddr; + char hdrs[0]; +}; + +struct ah_skb_cb { + struct xfrm_skb_cb xfrm; + void *tmp; +}; + +#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0])) + +static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags, + unsigned int size) +{ + unsigned int len; + + len = size + crypto_ahash_digestsize(ahash) + + (crypto_ahash_alignmask(ahash) & + ~(crypto_tfm_ctx_alignment() - 1)); + + len = ALIGN(len, crypto_tfm_ctx_alignment()); + + len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash); + len = ALIGN(len, __alignof__(struct scatterlist)); + + len += sizeof(struct scatterlist) * nfrags; + + return kmalloc(len, GFP_ATOMIC); +} + +static inline struct tmp_ext *ah_tmp_ext(void *base) +{ + return base + IPV6HDR_BASELEN; +} + +static inline u8 *ah_tmp_auth(u8 *tmp, unsigned int offset) +{ + return tmp + offset; +} + +static inline u8 *ah_tmp_icv(struct crypto_ahash *ahash, void *tmp, + unsigned int offset) +{ + return PTR_ALIGN((u8 *)tmp + offset, crypto_ahash_alignmask(ahash) + 1); +} + +static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash, + u8 *icv) +{ + struct ahash_request *req; + + req = (void *)PTR_ALIGN(icv + crypto_ahash_digestsize(ahash), + crypto_tfm_ctx_alignment()); + + ahash_request_set_tfm(req, ahash); + + return req; +} + +static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash, + struct ahash_request *req) +{ + return (void *)ALIGN((unsigned long)(req + 1) + + crypto_ahash_reqsize(ahash), + __alignof__(struct scatterlist)); +} + +static int zero_out_mutable_opts(struct ipv6_opt_hdr 
*opthdr) +{ + u8 *opt = (u8 *)opthdr; + int len = ipv6_optlen(opthdr); + int off = 0; + int optlen = 0; + + off += 2; + len -= 2; + + while (len > 0) { + + switch (opt[off]) { + + case IPV6_TLV_PAD0: + optlen = 1; + break; + default: + if (len < 2) + goto bad; + optlen = opt[off+1]+2; + if (len < optlen) + goto bad; + if (opt[off] & 0x20) + memset(&opt[off+2], 0, opt[off+1]); + break; + } + + off += optlen; + len -= optlen; + } + if (len == 0) + return 1; + +bad: + return 0; +} + +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +/** + * ipv6_rearrange_destopt - rearrange IPv6 destination options header + * @iph: IPv6 header + * @destopt: destionation options header + */ +static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *destopt) +{ + u8 *opt = (u8 *)destopt; + int len = ipv6_optlen(destopt); + int off = 0; + int optlen = 0; + + off += 2; + len -= 2; + + while (len > 0) { + + switch (opt[off]) { + + case IPV6_TLV_PAD0: + optlen = 1; + break; + default: + if (len < 2) + goto bad; + optlen = opt[off+1]+2; + if (len < optlen) + goto bad; + + /* Rearrange the source address in @iph and the + * addresses in home address option for final source. + * See 11.3.2 of RFC 3775 for details. + */ + if (opt[off] == IPV6_TLV_HAO) { + struct in6_addr final_addr; + struct ipv6_destopt_hao *hao; + + hao = (struct ipv6_destopt_hao *)&opt[off]; + if (hao->length != sizeof(hao->addr)) { + if (net_ratelimit()) + printk(KERN_WARNING "destopt hao: invalid header length: %u\n", hao->length); + goto bad; + } + ipv6_addr_copy(&final_addr, &hao->addr); + ipv6_addr_copy(&hao->addr, &iph->saddr); + ipv6_addr_copy(&iph->saddr, &final_addr); + } + break; + } + + off += optlen; + len -= optlen; + } + /* Note: ok if len == 0 */ +bad: + return; +} +#else +static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *destopt) {} +#endif + +/** + * ipv6_rearrange_rthdr - rearrange IPv6 routing header + * @iph: IPv6 header + * @rthdr: routing header + * + * Rearrange the destination address in @iph and the addresses in @rthdr + * so that they appear in the order they will at the final destination. + * See Appendix A2 of RFC 2402 for details. + */ +static void ipv6_rearrange_rthdr(struct ipv6hdr *iph, struct ipv6_rt_hdr *rthdr) +{ + int segments, segments_left; + struct in6_addr *addrs; + struct in6_addr final_addr; + + segments_left = rthdr->segments_left; + if (segments_left == 0) + return; + rthdr->segments_left = 0; + + /* The value of rthdr->hdrlen has been verified either by the system + * call if it is locally generated, or by ipv6_rthdr_rcv() for incoming + * packets. So we can assume that it is even and that segments is + * greater than or equal to segments_left. + * + * For the same reason we can assume that this option is of type 0. 
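+	 * A type 0 routing header carries one 16-byte address per two units of + * hdrlen, which is why segments is computed as hdrlen / 2 below.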
+ */ + segments = rthdr->hdrlen >> 1; + + addrs = ((struct rt0_hdr *)rthdr)->addr; + ipv6_addr_copy(&final_addr, addrs + segments - 1); + + addrs += segments - segments_left; + memmove(addrs + 1, addrs, (segments_left - 1) * sizeof(*addrs)); + + ipv6_addr_copy(addrs, &iph->daddr); + ipv6_addr_copy(&iph->daddr, &final_addr); +} + +static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir) +{ + union { + struct ipv6hdr *iph; + struct ipv6_opt_hdr *opth; + struct ipv6_rt_hdr *rth; + char *raw; + } exthdr = { .iph = iph }; + char *end = exthdr.raw + len; + int nexthdr = iph->nexthdr; + + exthdr.iph++; + + while (exthdr.raw < end) { + switch (nexthdr) { + case NEXTHDR_DEST: + if (dir == XFRM_POLICY_OUT) + ipv6_rearrange_destopt(iph, exthdr.opth); + case NEXTHDR_HOP: + if (!zero_out_mutable_opts(exthdr.opth)) { + LIMIT_NETDEBUG( + KERN_WARNING "overrun %sopts\n", + nexthdr == NEXTHDR_HOP ? + "hop" : "dest"); + return -EINVAL; + } + break; + + case NEXTHDR_ROUTING: + ipv6_rearrange_rthdr(iph, exthdr.rth); + break; + + default : + return 0; + } + + nexthdr = exthdr.opth->nexthdr; + exthdr.raw += ipv6_optlen(exthdr.opth); + } + + return 0; +} + +static void ah6_output_done(struct crypto_async_request *base, int err) +{ + int extlen; + u8 *iph_base; + u8 *icv; + struct sk_buff *skb = base->data; + struct xfrm_state *x = skb_dst(skb)->xfrm; + struct ah_data *ahp = x->data; + struct ipv6hdr *top_iph = ipv6_hdr(skb); + struct ip_auth_hdr *ah = ip_auth_hdr(skb); + struct tmp_ext *iph_ext; + + extlen = skb_network_header_len(skb) - sizeof(struct ipv6hdr); + if (extlen) + extlen += sizeof(*iph_ext); + + iph_base = AH_SKB_CB(skb)->tmp; + iph_ext = ah_tmp_ext(iph_base); + icv = ah_tmp_icv(ahp->ahash, iph_ext, extlen); + + memcpy(ah->auth_data, icv, ahp->icv_trunc_len); + memcpy(top_iph, iph_base, IPV6HDR_BASELEN); + + if (extlen) { +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + memcpy(&top_iph->saddr, iph_ext, extlen); +#else + memcpy(&top_iph->daddr, iph_ext, extlen); +#endif + } + + kfree(AH_SKB_CB(skb)->tmp); + xfrm_output_resume(skb, err); +} + +static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + int nfrags; + int extlen; + u8 *iph_base; + u8 *icv; + u8 nexthdr; + struct sk_buff *trailer; + struct crypto_ahash *ahash; + struct ahash_request *req; + struct scatterlist *sg; + struct ipv6hdr *top_iph; + struct ip_auth_hdr *ah; + struct ah_data *ahp; + struct tmp_ext *iph_ext; + + ahp = x->data; + ahash = ahp->ahash; + + if ((err = skb_cow_data(skb, 0, &trailer)) < 0) + goto out; + nfrags = err; + + skb_push(skb, -skb_network_offset(skb)); + extlen = skb_network_header_len(skb) - sizeof(struct ipv6hdr); + if (extlen) + extlen += sizeof(*iph_ext); + + err = -ENOMEM; + iph_base = ah_alloc_tmp(ahash, nfrags, IPV6HDR_BASELEN + extlen); + if (!iph_base) + goto out; + + iph_ext = ah_tmp_ext(iph_base); + icv = ah_tmp_icv(ahash, iph_ext, extlen); + req = ah_tmp_req(ahash, icv); + sg = ah_req_sg(ahash, req); + + ah = ip_auth_hdr(skb); + memset(ah->auth_data, 0, ahp->icv_trunc_len); + + top_iph = ipv6_hdr(skb); + top_iph->payload_len = htons(skb->len - sizeof(*top_iph)); + + nexthdr = *skb_mac_header(skb); + *skb_mac_header(skb) = IPPROTO_AH; + + /* When there are no extension headers, we only need to save the first + * 8 bytes of the base IP header. 
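+	 * These 8 bytes (IPV6HDR_BASELEN) include the traffic class, flow label + * and hop limit, which are mutable fields: they are zeroed for the ICV + * calculation and restored from this copy afterwards.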
+ */ + memcpy(iph_base, top_iph, IPV6HDR_BASELEN); + + if (extlen) { +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + memcpy(iph_ext, &top_iph->saddr, extlen); +#else + memcpy(iph_ext, &top_iph->daddr, extlen); +#endif + err = ipv6_clear_mutable_options(top_iph, + extlen - sizeof(*iph_ext) + + sizeof(*top_iph), + XFRM_POLICY_OUT); + if (err) + goto out_free; + } + + ah->nexthdr = nexthdr; + + top_iph->priority = 0; + top_iph->flow_lbl[0] = 0; + top_iph->flow_lbl[1] = 0; + top_iph->flow_lbl[2] = 0; + top_iph->hop_limit = 0; + + ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; + + ah->reserved = 0; + ah->spi = x->id.spi; + ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, 0, skb->len); + + ahash_request_set_crypt(req, sg, icv, skb->len); + ahash_request_set_callback(req, 0, ah6_output_done, skb); + + AH_SKB_CB(skb)->tmp = iph_base; + + err = crypto_ahash_digest(req); + if (err) { + if (err == -EINPROGRESS) + goto out; + + if (err == -EBUSY) + err = NET_XMIT_DROP; + goto out_free; + } + + memcpy(ah->auth_data, icv, ahp->icv_trunc_len); + memcpy(top_iph, iph_base, IPV6HDR_BASELEN); + + if (extlen) { +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + memcpy(&top_iph->saddr, iph_ext, extlen); +#else + memcpy(&top_iph->daddr, iph_ext, extlen); +#endif + } + +out_free: + kfree(iph_base); +out: + return err; +} + +static void ah6_input_done(struct crypto_async_request *base, int err) +{ + u8 *auth_data; + u8 *icv; + u8 *work_iph; + struct sk_buff *skb = base->data; + struct xfrm_state *x = xfrm_input_state(skb); + struct ah_data *ahp = x->data; + struct ip_auth_hdr *ah = ip_auth_hdr(skb); + int hdr_len = skb_network_header_len(skb); + int ah_hlen = (ah->hdrlen + 2) << 2; + + work_iph = AH_SKB_CB(skb)->tmp; + auth_data = ah_tmp_auth(work_iph, hdr_len); + icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len); + + err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0; + if (err) + goto out; + + err = ah->nexthdr; + + skb->network_header += ah_hlen; + memcpy(skb_network_header(skb), work_iph, hdr_len); + __skb_pull(skb, ah_hlen + hdr_len); + skb_set_transport_header(skb, -hdr_len); +out: + kfree(AH_SKB_CB(skb)->tmp); + xfrm_input_resume(skb, err); +} + + + +static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) +{ + /* + * Before process AH + * [IPv6][Ext1][Ext2][AH][Dest][Payload] + * |<-------------->| hdr_len + * + * To erase AH: + * Keeping copy of cleared headers. After AH processing, + * Moving the pointer of skb->network_header by using skb_pull as long + * as AH header length. Then copy back the copy as long as hdr_len + * If destination header following AH exists, copy it into after [Ext2]. + * + * |<>|[IPv6][Ext1][Ext2][Dest][Payload] + * There is offset of AH before IPv6 header after the process. + */ + + u8 *auth_data; + u8 *icv; + u8 *work_iph; + struct sk_buff *trailer; + struct crypto_ahash *ahash; + struct ahash_request *req; + struct scatterlist *sg; + struct ip_auth_hdr *ah; + struct ipv6hdr *ip6h; + struct ah_data *ahp; + u16 hdr_len; + u16 ah_hlen; + int nexthdr; + int nfrags; + int err = -ENOMEM; + + if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr))) + goto out; + + /* We are going to _remove_ AH header to keep sockets happy, + * so... Later this can change. 
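+	 * Because the headers are rewritten in place, a cloned skb must first + * get a private copy of its header area (pskb_expand_head() below).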
*/ + if (skb_cloned(skb) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto out; + + skb->ip_summed = CHECKSUM_NONE; + + hdr_len = skb_network_header_len(skb); + ah = (struct ip_auth_hdr *)skb->data; + ahp = x->data; + ahash = ahp->ahash; + + nexthdr = ah->nexthdr; + ah_hlen = (ah->hdrlen + 2) << 2; + + if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) && + ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len)) + goto out; + + if (!pskb_may_pull(skb, ah_hlen)) + goto out; + + + if ((err = skb_cow_data(skb, 0, &trailer)) < 0) + goto out; + nfrags = err; + + ah = (struct ip_auth_hdr *)skb->data; + ip6h = ipv6_hdr(skb); + + skb_push(skb, hdr_len); + + work_iph = ah_alloc_tmp(ahash, nfrags, hdr_len + ahp->icv_trunc_len); + if (!work_iph) + goto out; + + auth_data = ah_tmp_auth(work_iph, hdr_len); + icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len); + req = ah_tmp_req(ahash, icv); + sg = ah_req_sg(ahash, req); + + memcpy(work_iph, ip6h, hdr_len); + memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); + memset(ah->auth_data, 0, ahp->icv_trunc_len); + + if (ipv6_clear_mutable_options(ip6h, hdr_len, XFRM_POLICY_IN)) + goto out_free; + + ip6h->priority = 0; + ip6h->flow_lbl[0] = 0; + ip6h->flow_lbl[1] = 0; + ip6h->flow_lbl[2] = 0; + ip6h->hop_limit = 0; + + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, 0, skb->len); + + ahash_request_set_crypt(req, sg, icv, skb->len); + ahash_request_set_callback(req, 0, ah6_input_done, skb); + + AH_SKB_CB(skb)->tmp = work_iph; + + err = crypto_ahash_digest(req); + if (err) { + if (err == -EINPROGRESS) + goto out; + + goto out_free; + } + + err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0; + if (err) + goto out_free; + + skb->network_header += ah_hlen; + memcpy(skb_network_header(skb), work_iph, hdr_len); + skb->transport_header = skb->network_header; + __skb_pull(skb, ah_hlen + hdr_len); + + err = nexthdr; + +out_free: + kfree(work_iph); +out: + return err; +} + +static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct net *net = dev_net(skb->dev); + struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; + struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+offset); + struct xfrm_state *x; + + if (type != ICMPV6_DEST_UNREACH && + type != ICMPV6_PKT_TOOBIG) + return; + + x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET6); + if (!x) + return; + + NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/%pI6\n", + ntohl(ah->spi), &iph->daddr); + + xfrm_state_put(x); +} + +static int ah6_init_state(struct xfrm_state *x) +{ + struct ah_data *ahp = NULL; + struct xfrm_algo_desc *aalg_desc; + struct crypto_ahash *ahash; + + if (!x->aalg) + goto error; + + if (x->encap) + goto error; + + ahp = kzalloc(sizeof(*ahp), GFP_KERNEL); + if (ahp == NULL) + return -ENOMEM; + + ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0); + if (IS_ERR(ahash)) + goto error; + + ahp->ahash = ahash; + if (crypto_ahash_setkey(ahash, x->aalg->alg_key, + (x->aalg->alg_key_len + 7) / 8)) + goto error; + + /* + * Lookup the algorithm description maintained by xfrm_algo, + * verify crypto transform properties, and store information + * we need for AH processing. This lookup cannot fail here + * after a successful crypto_alloc_hash(). 
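+	 * That is why the result below is only checked with BUG_ON() rather + * than treated as a runtime error.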
+ */ + aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); + BUG_ON(!aalg_desc); + + if (aalg_desc->uinfo.auth.icv_fullbits/8 != + crypto_ahash_digestsize(ahash)) { + printk(KERN_INFO "AH: %s digestsize %u != %hu\n", + x->aalg->alg_name, crypto_ahash_digestsize(ahash), + aalg_desc->uinfo.auth.icv_fullbits/8); + goto error; + } + + ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; + ahp->icv_trunc_len = x->aalg->alg_trunc_len/8; + + BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); + + x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + + ahp->icv_trunc_len); + switch (x->props.mode) { + case XFRM_MODE_BEET: + case XFRM_MODE_TRANSPORT: + break; + case XFRM_MODE_TUNNEL: + x->props.header_len += sizeof(struct ipv6hdr); + break; + default: + goto error; + } + x->data = ahp; + + return 0; + +error: + if (ahp) { + crypto_free_ahash(ahp->ahash); + kfree(ahp); + } + return -EINVAL; +} + +static void ah6_destroy(struct xfrm_state *x) +{ + struct ah_data *ahp = x->data; + + if (!ahp) + return; + + crypto_free_ahash(ahp->ahash); + kfree(ahp); +} + +static const struct xfrm_type ah6_type = +{ + .description = "AH6", + .owner = THIS_MODULE, + .proto = IPPROTO_AH, + .flags = XFRM_TYPE_REPLAY_PROT, + .init_state = ah6_init_state, + .destructor = ah6_destroy, + .input = ah6_input, + .output = ah6_output, + .hdr_offset = xfrm6_find_1stfragopt, +}; + +static const struct inet6_protocol ah6_protocol = { + .handler = xfrm6_rcv, + .err_handler = ah6_err, + .flags = INET6_PROTO_NOPOLICY, +}; + +static int __init ah6_init(void) +{ + if (xfrm_register_type(&ah6_type, AF_INET6) < 0) { + printk(KERN_INFO "ipv6 ah init: can't add xfrm type\n"); + return -EAGAIN; + } + + if (inet6_add_protocol(&ah6_protocol, IPPROTO_AH) < 0) { + printk(KERN_INFO "ipv6 ah init: can't add protocol\n"); + xfrm_unregister_type(&ah6_type, AF_INET6); + return -EAGAIN; + } + + return 0; +} + +static void __exit ah6_fini(void) +{ + if (inet6_del_protocol(&ah6_protocol, IPPROTO_AH) < 0) + printk(KERN_INFO "ipv6 ah close: can't remove protocol\n"); + + if (xfrm_unregister_type(&ah6_type, AF_INET6) < 0) + printk(KERN_INFO "ipv6 ah close: can't remove xfrm type\n"); + +} + +module_init(ah6_init); +module_exit(ah6_fini); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_AH); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c new file mode 100644 index 00000000..674255f5 --- /dev/null +++ b/net/ipv6/anycast.c @@ -0,0 +1,549 @@ +/* + * Anycast support for IPv6 + * Linux INET6 implementation + * + * Authors: + * David L Stevens (dlstevens@us.ibm.com) + * + * based heavily on net/ipv6/mcast.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr); + +/* Big ac list lock for all the sockets */ +static DEFINE_RWLOCK(ipv6_sk_ac_lock); + + +/* + * socket join an anycast group + */ + +int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct net_device *dev = NULL; + struct inet6_dev *idev; + struct ipv6_ac_socklist *pac; + struct net *net = sock_net(sk); + int ishost = !net->ipv6.devconf_all->forwarding; + int err = 0; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (ipv6_addr_is_multicast(addr)) + return -EINVAL; + if (ipv6_chk_addr(net, addr, NULL, 0)) + return -EINVAL; + + pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL); + if (pac == NULL) + return -ENOMEM; + pac->acl_next = NULL; + ipv6_addr_copy(&pac->acl_addr, addr); + + rcu_read_lock(); + if (ifindex == 0) { + struct rt6_info *rt; + + rt = rt6_lookup(net, addr, NULL, 0, 0); + if (rt) { + dev = rt->rt6i_dev; + dst_release(&rt->dst); + } else if (ishost) { + err = -EADDRNOTAVAIL; + goto error; + } else { + /* router, no matching interface: just pick one */ + dev = dev_get_by_flags_rcu(net, IFF_UP, + IFF_UP | IFF_LOOPBACK); + } + } else + dev = dev_get_by_index_rcu(net, ifindex); + + if (dev == NULL) { + err = -ENODEV; + goto error; + } + + idev = __in6_dev_get(dev); + if (!idev) { + if (ifindex) + err = -ENODEV; + else + err = -EADDRNOTAVAIL; + goto error; + } + /* reset ishost, now that we have a specific device */ + ishost = !idev->cnf.forwarding; + + pac->acl_ifindex = dev->ifindex; + + /* XXX + * For hosts, allow link-local or matching prefix anycasts. + * This obviates the need for propagating anycast routes while + * still allowing some non-router anycast participation. 
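+	 * Routers may join any anycast address; hosts may only join addresses + * that fall within an on-link prefix of the interface.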
+ */ + if (!ipv6_chk_prefix(addr, dev)) { + if (ishost) + err = -EADDRNOTAVAIL; + if (err) + goto error; + } + + err = ipv6_dev_ac_inc(dev, addr); + if (!err) { + write_lock_bh(&ipv6_sk_ac_lock); + pac->acl_next = np->ipv6_ac_list; + np->ipv6_ac_list = pac; + write_unlock_bh(&ipv6_sk_ac_lock); + pac = NULL; + } + +error: + rcu_read_unlock(); + if (pac) + sock_kfree_s(sk, pac, sizeof(*pac)); + return err; +} + +/* + * socket leave an anycast group + */ +int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct net_device *dev; + struct ipv6_ac_socklist *pac, *prev_pac; + struct net *net = sock_net(sk); + + write_lock_bh(&ipv6_sk_ac_lock); + prev_pac = NULL; + for (pac = np->ipv6_ac_list; pac; pac = pac->acl_next) { + if ((ifindex == 0 || pac->acl_ifindex == ifindex) && + ipv6_addr_equal(&pac->acl_addr, addr)) + break; + prev_pac = pac; + } + if (!pac) { + write_unlock_bh(&ipv6_sk_ac_lock); + return -ENOENT; + } + if (prev_pac) + prev_pac->acl_next = pac->acl_next; + else + np->ipv6_ac_list = pac->acl_next; + + write_unlock_bh(&ipv6_sk_ac_lock); + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, pac->acl_ifindex); + if (dev) + ipv6_dev_ac_dec(dev, &pac->acl_addr); + rcu_read_unlock(); + + sock_kfree_s(sk, pac, sizeof(*pac)); + return 0; +} + +void ipv6_sock_ac_close(struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct net_device *dev = NULL; + struct ipv6_ac_socklist *pac; + struct net *net = sock_net(sk); + int prev_index; + + write_lock_bh(&ipv6_sk_ac_lock); + pac = np->ipv6_ac_list; + np->ipv6_ac_list = NULL; + write_unlock_bh(&ipv6_sk_ac_lock); + + prev_index = 0; + rcu_read_lock(); + while (pac) { + struct ipv6_ac_socklist *next = pac->acl_next; + + if (pac->acl_ifindex != prev_index) { + dev = dev_get_by_index_rcu(net, pac->acl_ifindex); + prev_index = pac->acl_ifindex; + } + if (dev) + ipv6_dev_ac_dec(dev, &pac->acl_addr); + sock_kfree_s(sk, pac, sizeof(*pac)); + pac = next; + } + rcu_read_unlock(); +} + +#if 0 +/* The function is not used, which is funny. Apparently, author + * supposed to use it to filter out datagrams inside udp/raw but forgot. + * + * It is OK, anycasts are not special comparing to delivery to unicasts. + */ + +int inet6_ac_check(struct sock *sk, struct in6_addr *addr, int ifindex) +{ + struct ipv6_ac_socklist *pac; + struct ipv6_pinfo *np = inet6_sk(sk); + int found; + + found = 0; + read_lock(&ipv6_sk_ac_lock); + for (pac=np->ipv6_ac_list; pac; pac=pac->acl_next) { + if (ifindex && pac->acl_ifindex != ifindex) + continue; + found = ipv6_addr_equal(&pac->acl_addr, addr); + if (found) + break; + } + read_unlock(&ipv6_sk_ac_lock); + + return found; +} + +#endif + +static void aca_put(struct ifacaddr6 *ac) +{ + if (atomic_dec_and_test(&ac->aca_refcnt)) { + in6_dev_put(ac->aca_idev); + dst_release(&ac->aca_rt->dst); + kfree(ac); + } +} + +/* + * device anycast group inc (add if not found) + */ +int ipv6_dev_ac_inc(struct net_device *dev, const struct in6_addr *addr) +{ + struct ifacaddr6 *aca; + struct inet6_dev *idev; + struct rt6_info *rt; + int err; + + idev = in6_dev_get(dev); + + if (idev == NULL) + return -EINVAL; + + write_lock_bh(&idev->lock); + if (idev->dead) { + err = -ENODEV; + goto out; + } + + for (aca = idev->ac_list; aca; aca = aca->aca_next) { + if (ipv6_addr_equal(&aca->aca_addr, addr)) { + aca->aca_users++; + err = 0; + goto out; + } + } + + /* + * not found: create a new one. 
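+	 * The new entry starts with a reference count of 2: one for the + * idev->ac_list linkage and one for the local reference dropped by + * aca_put() once the route and solicited-node join are in place.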
+ */ + + aca = kzalloc(sizeof(struct ifacaddr6), GFP_ATOMIC); + + if (aca == NULL) { + err = -ENOMEM; + goto out; + } + + rt = addrconf_dst_alloc(idev, addr, 1); + if (IS_ERR(rt)) { + kfree(aca); + err = PTR_ERR(rt); + goto out; + } + + ipv6_addr_copy(&aca->aca_addr, addr); + aca->aca_idev = idev; + aca->aca_rt = rt; + aca->aca_users = 1; + /* aca_tstamp should be updated upon changes */ + aca->aca_cstamp = aca->aca_tstamp = jiffies; + atomic_set(&aca->aca_refcnt, 2); + spin_lock_init(&aca->aca_lock); + + aca->aca_next = idev->ac_list; + idev->ac_list = aca; + write_unlock_bh(&idev->lock); + + ip6_ins_rt(rt); + + addrconf_join_solict(dev, &aca->aca_addr); + + aca_put(aca); + return 0; +out: + write_unlock_bh(&idev->lock); + in6_dev_put(idev); + return err; +} + +/* + * device anycast group decrement + */ +int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) +{ + struct ifacaddr6 *aca, *prev_aca; + + write_lock_bh(&idev->lock); + prev_aca = NULL; + for (aca = idev->ac_list; aca; aca = aca->aca_next) { + if (ipv6_addr_equal(&aca->aca_addr, addr)) + break; + prev_aca = aca; + } + if (!aca) { + write_unlock_bh(&idev->lock); + return -ENOENT; + } + if (--aca->aca_users > 0) { + write_unlock_bh(&idev->lock); + return 0; + } + if (prev_aca) + prev_aca->aca_next = aca->aca_next; + else + idev->ac_list = aca->aca_next; + write_unlock_bh(&idev->lock); + addrconf_leave_solict(idev, &aca->aca_addr); + + dst_hold(&aca->aca_rt->dst); + ip6_del_rt(aca->aca_rt); + + aca_put(aca); + return 0; +} + +/* called with rcu_read_lock() */ +static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr) +{ + struct inet6_dev *idev = __in6_dev_get(dev); + + if (idev == NULL) + return -ENODEV; + return __ipv6_dev_ac_dec(idev, addr); +} + +/* + * check if the interface has this anycast address + * called with rcu_read_lock() + */ +static int ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *addr) +{ + struct inet6_dev *idev; + struct ifacaddr6 *aca; + + idev = __in6_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + for (aca = idev->ac_list; aca; aca = aca->aca_next) + if (ipv6_addr_equal(&aca->aca_addr, addr)) + break; + read_unlock_bh(&idev->lock); + return aca != NULL; + } + return 0; +} + +/* + * check if given interface (or any, if dev==0) has this anycast address + */ +int ipv6_chk_acast_addr(struct net *net, struct net_device *dev, + const struct in6_addr *addr) +{ + int found = 0; + + rcu_read_lock(); + if (dev) + found = ipv6_chk_acast_dev(dev, addr); + else + for_each_netdev_rcu(net, dev) + if (ipv6_chk_acast_dev(dev, addr)) { + found = 1; + break; + } + rcu_read_unlock(); + return found; +} + + +#ifdef CONFIG_PROC_FS +struct ac6_iter_state { + struct seq_net_private p; + struct net_device *dev; + struct inet6_dev *idev; +}; + +#define ac6_seq_private(seq) ((struct ac6_iter_state *)(seq)->private) + +static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq) +{ + struct ifacaddr6 *im = NULL; + struct ac6_iter_state *state = ac6_seq_private(seq); + struct net *net = seq_file_net(seq); + + state->idev = NULL; + for_each_netdev_rcu(net, state->dev) { + struct inet6_dev *idev; + idev = __in6_dev_get(state->dev); + if (!idev) + continue; + read_lock_bh(&idev->lock); + im = idev->ac_list; + if (im) { + state->idev = idev; + break; + } + read_unlock_bh(&idev->lock); + } + return im; +} + +static struct ifacaddr6 *ac6_get_next(struct seq_file *seq, struct ifacaddr6 *im) +{ + struct ac6_iter_state *state = ac6_seq_private(seq); + + im = 
im->aca_next; + while (!im) { + if (likely(state->idev != NULL)) + read_unlock_bh(&state->idev->lock); + + state->dev = next_net_device_rcu(state->dev); + if (!state->dev) { + state->idev = NULL; + break; + } + state->idev = __in6_dev_get(state->dev); + if (!state->idev) + continue; + read_lock_bh(&state->idev->lock); + im = state->idev->ac_list; + } + return im; +} + +static struct ifacaddr6 *ac6_get_idx(struct seq_file *seq, loff_t pos) +{ + struct ifacaddr6 *im = ac6_get_first(seq); + if (im) + while (pos && (im = ac6_get_next(seq, im)) != NULL) + --pos; + return pos ? NULL : im; +} + +static void *ac6_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + rcu_read_lock(); + return ac6_get_idx(seq, *pos); +} + +static void *ac6_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ifacaddr6 *im = ac6_get_next(seq, v); + + ++*pos; + return im; +} + +static void ac6_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + struct ac6_iter_state *state = ac6_seq_private(seq); + + if (likely(state->idev != NULL)) { + read_unlock_bh(&state->idev->lock); + state->idev = NULL; + } + rcu_read_unlock(); +} + +static int ac6_seq_show(struct seq_file *seq, void *v) +{ + struct ifacaddr6 *im = (struct ifacaddr6 *)v; + struct ac6_iter_state *state = ac6_seq_private(seq); + + seq_printf(seq, "%-4d %-15s %pi6 %5d\n", + state->dev->ifindex, state->dev->name, + &im->aca_addr, im->aca_users); + return 0; +} + +static const struct seq_operations ac6_seq_ops = { + .start = ac6_seq_start, + .next = ac6_seq_next, + .stop = ac6_seq_stop, + .show = ac6_seq_show, +}; + +static int ac6_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ac6_seq_ops, + sizeof(struct ac6_iter_state)); +} + +static const struct file_operations ac6_seq_fops = { + .owner = THIS_MODULE, + .open = ac6_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +int __net_init ac6_proc_init(struct net *net) +{ + if (!proc_net_fops_create(net, "anycast6", S_IRUGO, &ac6_seq_fops)) + return -ENOMEM; + + return 0; +} + +void ac6_proc_exit(struct net *net) +{ + proc_net_remove(net, "anycast6"); +} +#endif + diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c new file mode 100644 index 00000000..16560336 --- /dev/null +++ b/net/ipv6/datagram.c @@ -0,0 +1,868 @@ +/* + * common UDP/RAW code + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_addr *daddr, *final_p, final; + struct dst_entry *dst; + struct flowi6 fl6; + struct ip6_flowlabel *flowlabel = NULL; + struct ipv6_txoptions *opt; + int addr_type; + int err; + + if (usin->sin6_family == AF_INET) { + if (__ipv6_only_sock(sk)) + return -EAFNOSUPPORT; + err = ip4_datagram_connect(sk, uaddr, addr_len); + goto ipv4_connected; + } + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + if (usin->sin6_family != AF_INET6) + return -EAFNOSUPPORT; + + memset(&fl6, 0, sizeof(fl6)); + if (np->sndflow) { + fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; + if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); + if (flowlabel == NULL) + return -EINVAL; + ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst); + } + } + + addr_type = ipv6_addr_type(&usin->sin6_addr); + + if (addr_type == IPV6_ADDR_ANY) { + /* + * connect to self + */ + usin->sin6_addr.s6_addr[15] = 0x01; + } + + daddr = &usin->sin6_addr; + + if (addr_type == IPV6_ADDR_MAPPED) { + struct sockaddr_in sin; + + if (__ipv6_only_sock(sk)) { + err = -ENETUNREACH; + goto out; + } + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = daddr->s6_addr32[3]; + sin.sin_port = usin->sin6_port; + + err = ip4_datagram_connect(sk, + (struct sockaddr*) &sin, + sizeof(sin)); + +ipv4_connected: + if (err) + goto out; + + ipv6_addr_set_v4mapped(inet->inet_daddr, &np->daddr); + + if (ipv6_addr_any(&np->saddr)) + ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr); + + if (ipv6_addr_any(&np->rcv_saddr)) { + ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, + &np->rcv_saddr); + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); + } + + goto out; + } + + if (addr_type&IPV6_ADDR_LINKLOCAL) { + if (addr_len >= sizeof(struct sockaddr_in6) && + usin->sin6_scope_id) { + if (sk->sk_bound_dev_if && + sk->sk_bound_dev_if != usin->sin6_scope_id) { + err = -EINVAL; + goto out; + } + sk->sk_bound_dev_if = usin->sin6_scope_id; + } + + if (!sk->sk_bound_dev_if && (addr_type & IPV6_ADDR_MULTICAST)) + sk->sk_bound_dev_if = np->mcast_oif; + + /* Connect to link-local address requires an interface */ + if (!sk->sk_bound_dev_if) { + err = -EINVAL; + goto out; + } + } + + ipv6_addr_copy(&np->daddr, daddr); + np->flow_label = fl6.flowlabel; + + inet->inet_dport = usin->sin6_port; + + /* + * Check for a route to destination an obtain the + * destination cache for it. + */ + + fl6.flowi6_proto = sk->sk_protocol; + ipv6_addr_copy(&fl6.daddr, &np->daddr); + ipv6_addr_copy(&fl6.saddr, &np->saddr); + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_dport = inet->inet_dport; + fl6.fl6_sport = inet->inet_sport; + + if (!fl6.flowi6_oif && (addr_type&IPV6_ADDR_MULTICAST)) + fl6.flowi6_oif = np->mcast_oif; + + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + + opt = flowlabel ? 
flowlabel->opt : np->opt; + final_p = fl6_update_dst(&fl6, opt, &final); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true); + err = 0; + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto out; + } + + /* source address lookup done in ip6_dst_lookup */ + + if (ipv6_addr_any(&np->saddr)) + ipv6_addr_copy(&np->saddr, &fl6.saddr); + + if (ipv6_addr_any(&np->rcv_saddr)) { + ipv6_addr_copy(&np->rcv_saddr, &fl6.saddr); + inet->inet_rcv_saddr = LOOPBACK4_IPV6; + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); + } + + ip6_dst_store(sk, dst, + ipv6_addr_equal(&fl6.daddr, &np->daddr) ? + &np->daddr : NULL, +#ifdef CONFIG_IPV6_SUBTREES + ipv6_addr_equal(&fl6.saddr, &np->saddr) ? + &np->saddr : +#endif + NULL); + + sk->sk_state = TCP_ESTABLISHED; +out: + fl6_sock_release(flowlabel); + return err; +} + +void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct icmp6hdr *icmph = icmp6_hdr(skb); + struct sock_exterr_skb *serr; + + if (!np->recverr) + return; + + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) + return; + + skb->protocol = htons(ETH_P_IPV6); + + serr = SKB_EXT_ERR(skb); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_ICMP6; + serr->ee.ee_type = icmph->icmp6_type; + serr->ee.ee_code = icmph->icmp6_code; + serr->ee.ee_pad = 0; + serr->ee.ee_info = info; + serr->ee.ee_data = 0; + serr->addr_offset = (u8 *)&(((struct ipv6hdr *)(icmph + 1))->daddr) - + skb_network_header(skb); + serr->port = port; + + __skb_pull(skb, payload - skb->data); + skb_reset_transport_header(skb); + + if (sock_queue_err_skb(sk, skb)) + kfree_skb(skb); +} + +void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sock_exterr_skb *serr; + struct ipv6hdr *iph; + struct sk_buff *skb; + + if (!np->recverr) + return; + + skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); + if (!skb) + return; + + skb->protocol = htons(ETH_P_IPV6); + + skb_put(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + iph = ipv6_hdr(skb); + ipv6_addr_copy(&iph->daddr, &fl6->daddr); + + serr = SKB_EXT_ERR(skb); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; + serr->ee.ee_type = 0; + serr->ee.ee_code = 0; + serr->ee.ee_pad = 0; + serr->ee.ee_info = info; + serr->ee.ee_data = 0; + serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb); + serr->port = fl6->fl6_dport; + + __skb_pull(skb, skb_tail_pointer(skb) - skb->data); + skb_reset_transport_header(skb); + + if (sock_queue_err_skb(sk, skb)) + kfree_skb(skb); +} + +void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6hdr *iph; + struct sk_buff *skb; + struct ip6_mtuinfo *mtu_info; + + if (!np->rxopt.bits.rxpmtu) + return; + + skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); + if (!skb) + return; + + skb_put(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + iph = ipv6_hdr(skb); + ipv6_addr_copy(&iph->daddr, &fl6->daddr); + + mtu_info = IP6CBMTU(skb); + if (!mtu_info) { + kfree_skb(skb); + return; + } + + mtu_info->ip6m_mtu = mtu; + mtu_info->ip6m_addr.sin6_family = AF_INET6; + mtu_info->ip6m_addr.sin6_port = 0; + mtu_info->ip6m_addr.sin6_flowinfo = 0; + mtu_info->ip6m_addr.sin6_scope_id = fl6->flowi6_oif; + ipv6_addr_copy(&mtu_info->ip6m_addr.sin6_addr, &ipv6_hdr(skb)->daddr); + + __skb_pull(skb, skb_tail_pointer(skb) - skb->data); + skb_reset_transport_header(skb); + + skb = 
xchg(&np->rxpmtu, skb); + kfree_skb(skb); +} + +/* + * Handle MSG_ERRQUEUE + */ +int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sock_exterr_skb *serr; + struct sk_buff *skb, *skb2; + struct sockaddr_in6 *sin; + struct { + struct sock_extended_err ee; + struct sockaddr_in6 offender; + } errhdr; + int err; + int copied; + + err = -EAGAIN; + skb = skb_dequeue(&sk->sk_error_queue); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free_skb; + + sock_recv_timestamp(msg, sk, skb); + + serr = SKB_EXT_ERR(skb); + + sin = (struct sockaddr_in6 *)msg->msg_name; + if (sin) { + const unsigned char *nh = skb_network_header(skb); + sin->sin6_family = AF_INET6; + sin->sin6_flowinfo = 0; + sin->sin6_port = serr->port; + sin->sin6_scope_id = 0; + if (skb->protocol == htons(ETH_P_IPV6)) { + ipv6_addr_copy(&sin->sin6_addr, + (struct in6_addr *)(nh + serr->addr_offset)); + if (np->sndflow) + sin->sin6_flowinfo = + (*(__be32 *)(nh + serr->addr_offset - 24) & + IPV6_FLOWINFO_MASK); + if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin->sin6_scope_id = IP6CB(skb)->iif; + } else { + ipv6_addr_set_v4mapped(*(__be32 *)(nh + serr->addr_offset), + &sin->sin6_addr); + } + } + + memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); + sin = &errhdr.offender; + sin->sin6_family = AF_UNSPEC; + if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL) { + sin->sin6_family = AF_INET6; + sin->sin6_flowinfo = 0; + sin->sin6_scope_id = 0; + if (skb->protocol == htons(ETH_P_IPV6)) { + ipv6_addr_copy(&sin->sin6_addr, &ipv6_hdr(skb)->saddr); + if (np->rxopt.all) + datagram_recv_ctl(sk, msg, skb); + if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin->sin6_scope_id = IP6CB(skb)->iif; + } else { + struct inet_sock *inet = inet_sk(sk); + + ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, + &sin->sin6_addr); + if (inet->cmsg_flags) + ip_cmsg_recv(msg, skb); + } + } + + put_cmsg(msg, SOL_IPV6, IPV6_RECVERR, sizeof(errhdr), &errhdr); + + /* Now we could try to dump offended packet options */ + + msg->msg_flags |= MSG_ERRQUEUE; + err = copied; + + /* Reset and regenerate socket error */ + spin_lock_bh(&sk->sk_error_queue.lock); + sk->sk_err = 0; + if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { + sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; + spin_unlock_bh(&sk->sk_error_queue.lock); + sk->sk_error_report(sk); + } else { + spin_unlock_bh(&sk->sk_error_queue.lock); + } + +out_free_skb: + kfree_skb(skb); +out: + return err; +} + +/* + * Handle IPV6_RECVPATHMTU + */ +int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *skb; + struct sockaddr_in6 *sin; + struct ip6_mtuinfo mtu_info; + int err; + int copied; + + err = -EAGAIN; + skb = xchg(&np->rxpmtu, NULL); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free_skb; + + sock_recv_timestamp(msg, sk, skb); + + memcpy(&mtu_info, IP6CBMTU(skb), sizeof(mtu_info)); + + sin = (struct sockaddr_in6 *)msg->msg_name; + if (sin) { + sin->sin6_family = AF_INET6; + sin->sin6_flowinfo = 0; + sin->sin6_port = 0; + sin->sin6_scope_id = mtu_info.ip6m_addr.sin6_scope_id; + ipv6_addr_copy(&sin->sin6_addr, 
&mtu_info.ip6m_addr.sin6_addr); + } + + put_cmsg(msg, SOL_IPV6, IPV6_PATHMTU, sizeof(mtu_info), &mtu_info); + + err = copied; + +out_free_skb: + kfree_skb(skb); +out: + return err; +} + + +int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet6_skb_parm *opt = IP6CB(skb); + unsigned char *nh = skb_network_header(skb); + + if (np->rxopt.bits.rxinfo) { + struct in6_pktinfo src_info; + + src_info.ipi6_ifindex = opt->iif; + ipv6_addr_copy(&src_info.ipi6_addr, &ipv6_hdr(skb)->daddr); + put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); + } + + if (np->rxopt.bits.rxhlim) { + int hlim = ipv6_hdr(skb)->hop_limit; + put_cmsg(msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); + } + + if (np->rxopt.bits.rxtclass) { + int tclass = (ntohl(*(__be32 *)ipv6_hdr(skb)) >> 20) & 0xff; + put_cmsg(msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); + } + + if (np->rxopt.bits.rxflow && (*(__be32 *)nh & IPV6_FLOWINFO_MASK)) { + __be32 flowinfo = *(__be32 *)nh & IPV6_FLOWINFO_MASK; + put_cmsg(msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); + } + + /* HbH is allowed only once */ + if (np->rxopt.bits.hopopts && opt->hop) { + u8 *ptr = nh + opt->hop; + put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr); + } + + if (opt->lastopt && + (np->rxopt.bits.dstopts || np->rxopt.bits.srcrt)) { + /* + * Silly enough, but we need to reparse in order to + * report extension headers (except for HbH) + * in order. + * + * Also note that IPV6_RECVRTHDRDSTOPTS is NOT + * (and WILL NOT be) defined because + * IPV6_RECVDSTOPTS is more generic. --yoshfuji + */ + unsigned int off = sizeof(struct ipv6hdr); + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + + while (off <= opt->lastopt) { + unsigned len; + u8 *ptr = nh + off; + + switch(nexthdr) { + case IPPROTO_DSTOPTS: + nexthdr = ptr[0]; + len = (ptr[1] + 1) << 3; + if (np->rxopt.bits.dstopts) + put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, len, ptr); + break; + case IPPROTO_ROUTING: + nexthdr = ptr[0]; + len = (ptr[1] + 1) << 3; + if (np->rxopt.bits.srcrt) + put_cmsg(msg, SOL_IPV6, IPV6_RTHDR, len, ptr); + break; + case IPPROTO_AH: + nexthdr = ptr[0]; + len = (ptr[1] + 2) << 2; + break; + default: + nexthdr = ptr[0]; + len = (ptr[1] + 1) << 3; + break; + } + + off += len; + } + } + + /* socket options in old style */ + if (np->rxopt.bits.rxoinfo) { + struct in6_pktinfo src_info; + + src_info.ipi6_ifindex = opt->iif; + ipv6_addr_copy(&src_info.ipi6_addr, &ipv6_hdr(skb)->daddr); + put_cmsg(msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); + } + if (np->rxopt.bits.rxohlim) { + int hlim = ipv6_hdr(skb)->hop_limit; + put_cmsg(msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); + } + if (np->rxopt.bits.ohopopts && opt->hop) { + u8 *ptr = nh + opt->hop; + put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, (ptr[1]+1)<<3, ptr); + } + if (np->rxopt.bits.odstopts && opt->dst0) { + u8 *ptr = nh + opt->dst0; + put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); + } + if (np->rxopt.bits.osrcrt && opt->srcrt) { + struct ipv6_rt_hdr *rthdr = (struct ipv6_rt_hdr *)(nh + opt->srcrt); + put_cmsg(msg, SOL_IPV6, IPV6_2292RTHDR, (rthdr->hdrlen+1) << 3, rthdr); + } + if (np->rxopt.bits.odstopts && opt->dst1) { + u8 *ptr = nh + opt->dst1; + put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); + } + if (np->rxopt.bits.rxorigdstaddr) { + struct sockaddr_in6 sin6; + u16 *ports = (u16 *) skb_transport_header(skb); + + if (skb_transport_offset(skb) + 4 <= skb->len) { + /* All current 
transport protocols have the port numbers in the + * first four bytes of the transport header and this function is + * written with this assumption in mind. + */ + + sin6.sin6_family = AF_INET6; + ipv6_addr_copy(&sin6.sin6_addr, &ipv6_hdr(skb)->daddr); + sin6.sin6_port = ports[1]; + sin6.sin6_flowinfo = 0; + sin6.sin6_scope_id = 0; + + put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6); + } + } + return 0; +} + +int datagram_send_ctl(struct net *net, + struct msghdr *msg, struct flowi6 *fl6, + struct ipv6_txoptions *opt, + int *hlimit, int *tclass, int *dontfrag) +{ + struct in6_pktinfo *src_info; + struct cmsghdr *cmsg; + struct ipv6_rt_hdr *rthdr; + struct ipv6_opt_hdr *hdr; + int len; + int err = 0; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + int addr_type; + + if (!CMSG_OK(msg, cmsg)) { + err = -EINVAL; + goto exit_f; + } + + if (cmsg->cmsg_level != SOL_IPV6) + continue; + + switch (cmsg->cmsg_type) { + case IPV6_PKTINFO: + case IPV6_2292PKTINFO: + { + struct net_device *dev = NULL; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) { + err = -EINVAL; + goto exit_f; + } + + src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); + + if (src_info->ipi6_ifindex) { + if (fl6->flowi6_oif && + src_info->ipi6_ifindex != fl6->flowi6_oif) + return -EINVAL; + fl6->flowi6_oif = src_info->ipi6_ifindex; + } + + addr_type = __ipv6_addr_type(&src_info->ipi6_addr); + + rcu_read_lock(); + if (fl6->flowi6_oif) { + dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); + if (!dev) { + rcu_read_unlock(); + return -ENODEV; + } + } else if (addr_type & IPV6_ADDR_LINKLOCAL) { + rcu_read_unlock(); + return -EINVAL; + } + + if (addr_type != IPV6_ADDR_ANY) { + int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL; + if (!ipv6_chk_addr(net, &src_info->ipi6_addr, + strict ? 
dev : NULL, 0)) + err = -EINVAL; + else + ipv6_addr_copy(&fl6->saddr, &src_info->ipi6_addr); + } + + rcu_read_unlock(); + + if (err) + goto exit_f; + + break; + } + + case IPV6_FLOWINFO: + if (cmsg->cmsg_len < CMSG_LEN(4)) { + err = -EINVAL; + goto exit_f; + } + + if (fl6->flowlabel&IPV6_FLOWINFO_MASK) { + if ((fl6->flowlabel^*(__be32 *)CMSG_DATA(cmsg))&~IPV6_FLOWINFO_MASK) { + err = -EINVAL; + goto exit_f; + } + } + fl6->flowlabel = IPV6_FLOWINFO_MASK & *(__be32 *)CMSG_DATA(cmsg); + break; + + case IPV6_2292HOPOPTS: + case IPV6_HOPOPTS: + if (opt->hopopt || cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { + err = -EINVAL; + goto exit_f; + } + + hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); + len = ((hdr->hdrlen + 1) << 3); + if (cmsg->cmsg_len < CMSG_LEN(len)) { + err = -EINVAL; + goto exit_f; + } + if (!capable(CAP_NET_RAW)) { + err = -EPERM; + goto exit_f; + } + opt->opt_nflen += len; + opt->hopopt = hdr; + break; + + case IPV6_2292DSTOPTS: + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { + err = -EINVAL; + goto exit_f; + } + + hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); + len = ((hdr->hdrlen + 1) << 3); + if (cmsg->cmsg_len < CMSG_LEN(len)) { + err = -EINVAL; + goto exit_f; + } + if (!capable(CAP_NET_RAW)) { + err = -EPERM; + goto exit_f; + } + if (opt->dst1opt) { + err = -EINVAL; + goto exit_f; + } + opt->opt_flen += len; + opt->dst1opt = hdr; + break; + + case IPV6_DSTOPTS: + case IPV6_RTHDRDSTOPTS: + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { + err = -EINVAL; + goto exit_f; + } + + hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); + len = ((hdr->hdrlen + 1) << 3); + if (cmsg->cmsg_len < CMSG_LEN(len)) { + err = -EINVAL; + goto exit_f; + } + if (!capable(CAP_NET_RAW)) { + err = -EPERM; + goto exit_f; + } + if (cmsg->cmsg_type == IPV6_DSTOPTS) { + opt->opt_flen += len; + opt->dst1opt = hdr; + } else { + opt->opt_nflen += len; + opt->dst0opt = hdr; + } + break; + + case IPV6_2292RTHDR: + case IPV6_RTHDR: + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_rt_hdr))) { + err = -EINVAL; + goto exit_f; + } + + rthdr = (struct ipv6_rt_hdr *)CMSG_DATA(cmsg); + + switch (rthdr->type) { +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + case IPV6_SRCRT_TYPE_2: + if (rthdr->hdrlen != 2 || + rthdr->segments_left != 1) { + err = -EINVAL; + goto exit_f; + } + break; +#endif + default: + err = -EINVAL; + goto exit_f; + } + + len = ((rthdr->hdrlen + 1) << 3); + + if (cmsg->cmsg_len < CMSG_LEN(len)) { + err = -EINVAL; + goto exit_f; + } + + /* segments left must also match */ + if ((rthdr->hdrlen >> 1) != rthdr->segments_left) { + err = -EINVAL; + goto exit_f; + } + + opt->opt_nflen += len; + opt->srcrt = rthdr; + + if (cmsg->cmsg_type == IPV6_2292RTHDR && opt->dst1opt) { + int dsthdrlen = ((opt->dst1opt->hdrlen+1)<<3); + + opt->opt_nflen += dsthdrlen; + opt->dst0opt = opt->dst1opt; + opt->dst1opt = NULL; + opt->opt_flen -= dsthdrlen; + } + + break; + + case IPV6_2292HOPLIMIT: + case IPV6_HOPLIMIT: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) { + err = -EINVAL; + goto exit_f; + } + + *hlimit = *(int *)CMSG_DATA(cmsg); + if (*hlimit < -1 || *hlimit > 0xff) { + err = -EINVAL; + goto exit_f; + } + + break; + + case IPV6_TCLASS: + { + int tc; + + err = -EINVAL; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) { + goto exit_f; + } + + tc = *(int *)CMSG_DATA(cmsg); + if (tc < -1 || tc > 0xff) + goto exit_f; + + err = 0; + *tclass = tc; + + break; + } + + case IPV6_DONTFRAG: + { + int df; + + err = -EINVAL; + if (cmsg->cmsg_len != 
CMSG_LEN(sizeof(int))) { + goto exit_f; + } + + df = *(int *)CMSG_DATA(cmsg); + if (df < 0 || df > 1) + goto exit_f; + + err = 0; + *dontfrag = df; + + break; + } + default: + LIMIT_NETDEBUG(KERN_DEBUG "invalid cmsg type: %d\n", + cmsg->cmsg_type); + err = -EINVAL; + goto exit_f; + } + } + +exit_f: + return err; +} diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c new file mode 100644 index 00000000..65dd5433 --- /dev/null +++ b/net/ipv6/esp6.c @@ -0,0 +1,674 @@ +/* + * Copyright (C)2002 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors + * + * Mitsuru KANDA @USAGI : IPv6 Support + * Kazunori MIYAZAWA @USAGI : + * Kunihiro Ishiguro + * + * This file is derived from net/ipv4/esp.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct esp_skb_cb { + struct xfrm_skb_cb xfrm; + void *tmp; +}; + +#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0])) + +static u32 esp6_get_mtu(struct xfrm_state *x, int mtu); + +/* + * Allocate an AEAD request structure with extra space for SG and IV. + * + * For alignment considerations the upper 32 bits of the sequence number are + * placed at the front, if present. Followed by the IV, the request and finally + * the SG list. + * + * TODO: Use spare space in skb for this where possible. + */ +static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqihlen) +{ + unsigned int len; + + len = seqihlen; + + len += crypto_aead_ivsize(aead); + + if (len) { + len += crypto_aead_alignmask(aead) & + ~(crypto_tfm_ctx_alignment() - 1); + len = ALIGN(len, crypto_tfm_ctx_alignment()); + } + + len += sizeof(struct aead_givcrypt_request) + crypto_aead_reqsize(aead); + len = ALIGN(len, __alignof__(struct scatterlist)); + + len += sizeof(struct scatterlist) * nfrags; + + return kmalloc(len, GFP_ATOMIC); +} + +static inline __be32 *esp_tmp_seqhi(void *tmp) +{ + return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32)); +} + +static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen) +{ + return crypto_aead_ivsize(aead) ? 
+ PTR_ALIGN((u8 *)tmp + seqhilen, + crypto_aead_alignmask(aead) + 1) : tmp + seqhilen; +} + +static inline struct aead_givcrypt_request *esp_tmp_givreq( + struct crypto_aead *aead, u8 *iv) +{ + struct aead_givcrypt_request *req; + + req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead), + crypto_tfm_ctx_alignment()); + aead_givcrypt_set_tfm(req, aead); + return req; +} + +static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv) +{ + struct aead_request *req; + + req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead), + crypto_tfm_ctx_alignment()); + aead_request_set_tfm(req, aead); + return req; +} + +static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, + struct aead_request *req) +{ + return (void *)ALIGN((unsigned long)(req + 1) + + crypto_aead_reqsize(aead), + __alignof__(struct scatterlist)); +} + +static inline struct scatterlist *esp_givreq_sg( + struct crypto_aead *aead, struct aead_givcrypt_request *req) +{ + return (void *)ALIGN((unsigned long)(req + 1) + + crypto_aead_reqsize(aead), + __alignof__(struct scatterlist)); +} + +static void esp_output_done(struct crypto_async_request *base, int err) +{ + struct sk_buff *skb = base->data; + + kfree(ESP_SKB_CB(skb)->tmp); + xfrm_output_resume(skb, err); +} + +static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + struct ip_esp_hdr *esph; + struct crypto_aead *aead; + struct aead_givcrypt_request *req; + struct scatterlist *sg; + struct scatterlist *asg; + struct sk_buff *trailer; + void *tmp; + int blksize; + int clen; + int alen; + int plen; + int tfclen; + int nfrags; + int assoclen; + int sglists; + int seqhilen; + u8 *iv; + u8 *tail; + __be32 *seqhi; + struct esp_data *esp = x->data; + + /* skb is pure payload to encrypt */ + err = -ENOMEM; + + aead = esp->aead; + alen = crypto_aead_authsize(aead); + + tfclen = 0; + if (x->tfcpad) { + struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); + u32 padto; + + padto = min(x->tfcpad, esp6_get_mtu(x, dst->child_mtu_cached)); + if (skb->len < padto) + tfclen = padto - skb->len; + } + blksize = ALIGN(crypto_aead_blocksize(aead), 4); + clen = ALIGN(skb->len + 2 + tfclen, blksize); + if (esp->padlen) + clen = ALIGN(clen, esp->padlen); + plen = clen - skb->len - tfclen; + + err = skb_cow_data(skb, tfclen + plen + alen, &trailer); + if (err < 0) + goto error; + nfrags = err; + + assoclen = sizeof(*esph); + sglists = 1; + seqhilen = 0; + + if (x->props.flags & XFRM_STATE_ESN) { + sglists += 2; + seqhilen += sizeof(__be32); + assoclen += seqhilen; + } + + tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); + if (!tmp) + goto error; + + seqhi = esp_tmp_seqhi(tmp); + iv = esp_tmp_iv(aead, tmp, seqhilen); + req = esp_tmp_givreq(aead, iv); + asg = esp_givreq_sg(aead, req); + sg = asg + sglists; + + /* Fill padding... 
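+ * (The bytes written below form the self-describing ESP trailer:
+ * zeroed TFC padding if any, then pad bytes 1, 2, 3, ..., the
+ * pad-length byte, and the next-header byte saved at
+ * skb_mac_header(); room for the alen-byte ICV is reserved by the
+ * pskb_put() that follows.)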
*/ + tail = skb_tail_pointer(trailer); + if (tfclen) { + memset(tail, 0, tfclen); + tail += tfclen; + } + do { + int i; + for (i = 0; i < plen - 2; i++) + tail[i] = i + 1; + } while (0); + tail[plen - 2] = plen - 2; + tail[plen - 1] = *skb_mac_header(skb); + pskb_put(skb, trailer, clen - skb->len + alen); + + skb_push(skb, -skb_network_offset(skb)); + esph = ip_esp_hdr(skb); + *skb_mac_header(skb) = IPPROTO_ESP; + + esph->spi = x->id.spi; + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, + esph->enc_data + crypto_aead_ivsize(aead) - skb->data, + clen + alen); + + if ((x->props.flags & XFRM_STATE_ESN)) { + sg_init_table(asg, 3); + sg_set_buf(asg, &esph->spi, sizeof(__be32)); + *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + sg_set_buf(asg + 1, seqhi, seqhilen); + sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); + } else + sg_init_one(asg, esph, sizeof(*esph)); + + aead_givcrypt_set_callback(req, 0, esp_output_done, skb); + aead_givcrypt_set_crypt(req, sg, sg, clen, iv); + aead_givcrypt_set_assoc(req, asg, assoclen); + aead_givcrypt_set_giv(req, esph->enc_data, + XFRM_SKB_CB(skb)->seq.output.low); + + ESP_SKB_CB(skb)->tmp = tmp; + err = crypto_aead_givencrypt(req); + if (err == -EINPROGRESS) + goto error; + + if (err == -EBUSY) + err = NET_XMIT_DROP; + + kfree(tmp); + +error: + return err; +} + +static int esp_input_done2(struct sk_buff *skb, int err) +{ + struct xfrm_state *x = xfrm_input_state(skb); + struct esp_data *esp = x->data; + struct crypto_aead *aead = esp->aead; + int alen = crypto_aead_authsize(aead); + int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); + int elen = skb->len - hlen; + int hdr_len = skb_network_header_len(skb); + int padlen; + u8 nexthdr[2]; + + kfree(ESP_SKB_CB(skb)->tmp); + + if (unlikely(err)) + goto out; + + if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2)) + BUG(); + + err = -EINVAL; + padlen = nexthdr[0]; + if (padlen + 2 + alen >= elen) { + LIMIT_NETDEBUG(KERN_WARNING "ipsec esp packet is garbage " + "padlen=%d, elen=%d\n", padlen + 2, elen - alen); + goto out; + } + + /* ... check padding bits here. Silly. 
:-) */ + + pskb_trim(skb, skb->len - alen - padlen - 2); + __skb_pull(skb, hlen); + skb_set_transport_header(skb, -hdr_len); + + err = nexthdr[1]; + + /* RFC4303: Drop dummy packets without any error */ + if (err == IPPROTO_NONE) + err = -EINVAL; + +out: + return err; +} + +static void esp_input_done(struct crypto_async_request *base, int err) +{ + struct sk_buff *skb = base->data; + + xfrm_input_resume(skb, esp_input_done2(skb, err)); +} + +static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ip_esp_hdr *esph; + struct esp_data *esp = x->data; + struct crypto_aead *aead = esp->aead; + struct aead_request *req; + struct sk_buff *trailer; + int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead); + int nfrags; + int assoclen; + int sglists; + int seqhilen; + int ret = 0; + void *tmp; + __be32 *seqhi; + u8 *iv; + struct scatterlist *sg; + struct scatterlist *asg; + + if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) { + ret = -EINVAL; + goto out; + } + + if (elen <= 0) { + ret = -EINVAL; + goto out; + } + + if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0) { + ret = -EINVAL; + goto out; + } + + ret = -ENOMEM; + + assoclen = sizeof(*esph); + sglists = 1; + seqhilen = 0; + + if (x->props.flags & XFRM_STATE_ESN) { + sglists += 2; + seqhilen += sizeof(__be32); + assoclen += seqhilen; + } + + tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); + if (!tmp) + goto out; + + ESP_SKB_CB(skb)->tmp = tmp; + seqhi = esp_tmp_seqhi(tmp); + iv = esp_tmp_iv(aead, tmp, seqhilen); + req = esp_tmp_req(aead, iv); + asg = esp_req_sg(aead, req); + sg = asg + sglists; + + skb->ip_summed = CHECKSUM_NONE; + + esph = (struct ip_esp_hdr *)skb->data; + + /* Get ivec. This can be wrong, check against another impls. */ + iv = esph->enc_data; + + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen); + + if ((x->props.flags & XFRM_STATE_ESN)) { + sg_init_table(asg, 3); + sg_set_buf(asg, &esph->spi, sizeof(__be32)); + *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; + sg_set_buf(asg + 1, seqhi, seqhilen); + sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); + } else + sg_init_one(asg, esph, sizeof(*esph)); + + aead_request_set_callback(req, 0, esp_input_done, skb); + aead_request_set_crypt(req, sg, sg, elen, iv); + aead_request_set_assoc(req, asg, assoclen); + + ret = crypto_aead_decrypt(req); + if (ret == -EINPROGRESS) + goto out; + + ret = esp_input_done2(skb, ret); + +out: + return ret; +} + +static u32 esp6_get_mtu(struct xfrm_state *x, int mtu) +{ + struct esp_data *esp = x->data; + u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4); + u32 align = max_t(u32, blksize, esp->padlen); + unsigned int net_adj; + + if (x->props.mode != XFRM_MODE_TUNNEL) + net_adj = sizeof(struct ipv6hdr); + else + net_adj = 0; + + return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) - + net_adj) & ~(align - 1)) + (net_adj - 2); +} + +static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct net *net = dev_net(skb->dev); + const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; + struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data + offset); + struct xfrm_state *x; + + if (type != ICMPV6_DEST_UNREACH && + type != ICMPV6_PKT_TOOBIG) + return; + + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + esph->spi, IPPROTO_ESP, AF_INET6); + if (!x) + return; + printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%pI6\n", + ntohl(esph->spi), 
&iph->daddr); + xfrm_state_put(x); +} + +static void esp6_destroy(struct xfrm_state *x) +{ + struct esp_data *esp = x->data; + + if (!esp) + return; + + crypto_free_aead(esp->aead); + kfree(esp); +} + +static int esp_init_aead(struct xfrm_state *x) +{ + struct esp_data *esp = x->data; + struct crypto_aead *aead; + int err; + + aead = crypto_alloc_aead(x->aead->alg_name, 0, 0); + err = PTR_ERR(aead); + if (IS_ERR(aead)) + goto error; + + esp->aead = aead; + + err = crypto_aead_setkey(aead, x->aead->alg_key, + (x->aead->alg_key_len + 7) / 8); + if (err) + goto error; + + err = crypto_aead_setauthsize(aead, x->aead->alg_icv_len / 8); + if (err) + goto error; + +error: + return err; +} + +static int esp_init_authenc(struct xfrm_state *x) +{ + struct esp_data *esp = x->data; + struct crypto_aead *aead; + struct crypto_authenc_key_param *param; + struct rtattr *rta; + char *key; + char *p; + char authenc_name[CRYPTO_MAX_ALG_NAME]; + unsigned int keylen; + int err; + + err = -EINVAL; + if (x->ealg == NULL) + goto error; + + err = -ENAMETOOLONG; + + if ((x->props.flags & XFRM_STATE_ESN)) { + if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, + "authencesn(%s,%s)", + x->aalg ? x->aalg->alg_name : "digest_null", + x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + goto error; + } else { + if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, + "authenc(%s,%s)", + x->aalg ? x->aalg->alg_name : "digest_null", + x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + goto error; + } + + aead = crypto_alloc_aead(authenc_name, 0, 0); + err = PTR_ERR(aead); + if (IS_ERR(aead)) + goto error; + + esp->aead = aead; + + keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) + + (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param)); + err = -ENOMEM; + key = kmalloc(keylen, GFP_KERNEL); + if (!key) + goto error; + + p = key; + rta = (void *)p; + rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM; + rta->rta_len = RTA_LENGTH(sizeof(*param)); + param = RTA_DATA(rta); + p += RTA_SPACE(sizeof(*param)); + + if (x->aalg) { + struct xfrm_algo_desc *aalg_desc; + + memcpy(p, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8); + p += (x->aalg->alg_key_len + 7) / 8; + + aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); + BUG_ON(!aalg_desc); + + err = -EINVAL; + if (aalg_desc->uinfo.auth.icv_fullbits/8 != + crypto_aead_authsize(aead)) { + NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n", + x->aalg->alg_name, + crypto_aead_authsize(aead), + aalg_desc->uinfo.auth.icv_fullbits/8); + goto free_key; + } + + err = crypto_aead_setauthsize( + aead, x->aalg->alg_trunc_len / 8); + if (err) + goto free_key; + } + + param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8); + memcpy(p, x->ealg->alg_key, (x->ealg->alg_key_len + 7) / 8); + + err = crypto_aead_setkey(aead, key, keylen); + +free_key: + kfree(key); + +error: + return err; +} + +static int esp6_init_state(struct xfrm_state *x) +{ + struct esp_data *esp; + struct crypto_aead *aead; + u32 align; + int err; + + if (x->encap) + return -EINVAL; + + esp = kzalloc(sizeof(*esp), GFP_KERNEL); + if (esp == NULL) + return -ENOMEM; + + x->data = esp; + + if (x->aead) + err = esp_init_aead(x); + else + err = esp_init_authenc(x); + + if (err) + goto error; + + aead = esp->aead; + + esp->padlen = 0; + + x->props.header_len = sizeof(struct ip_esp_hdr) + + crypto_aead_ivsize(aead); + switch (x->props.mode) { + case XFRM_MODE_BEET: + if (x->sel.family != AF_INET6) + x->props.header_len += IPV4_BEET_PHMAXLEN + + (sizeof(struct ipv6hdr) - sizeof(struct iphdr)); + break; + case XFRM_MODE_TRANSPORT: + break; + 
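/* Tunnel mode below must also account for the outer IPv6 header that
+ * will be prepended. Purely illustrative example (assuming AES-CBC
+ * with a 16-byte IV): header_len in tunnel mode comes to 8 (ESP
+ * header) + 16 (IV) + 40 (outer IPv6) = 64 bytes. */
+ 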
case XFRM_MODE_TUNNEL: + x->props.header_len += sizeof(struct ipv6hdr); + break; + default: + goto error; + } + + align = ALIGN(crypto_aead_blocksize(aead), 4); + if (esp->padlen) + align = max_t(u32, align, esp->padlen); + x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead); + +error: + return err; +} + +static const struct xfrm_type esp6_type = +{ + .description = "ESP6", + .owner = THIS_MODULE, + .proto = IPPROTO_ESP, + .flags = XFRM_TYPE_REPLAY_PROT, + .init_state = esp6_init_state, + .destructor = esp6_destroy, + .get_mtu = esp6_get_mtu, + .input = esp6_input, + .output = esp6_output, + .hdr_offset = xfrm6_find_1stfragopt, +}; + +static const struct inet6_protocol esp6_protocol = { + .handler = xfrm6_rcv, + .err_handler = esp6_err, + .flags = INET6_PROTO_NOPOLICY, +}; + +static int __init esp6_init(void) +{ + if (xfrm_register_type(&esp6_type, AF_INET6) < 0) { + printk(KERN_INFO "ipv6 esp init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet6_add_protocol(&esp6_protocol, IPPROTO_ESP) < 0) { + printk(KERN_INFO "ipv6 esp init: can't add protocol\n"); + xfrm_unregister_type(&esp6_type, AF_INET6); + return -EAGAIN; + } + + return 0; +} + +static void __exit esp6_fini(void) +{ + if (inet6_del_protocol(&esp6_protocol, IPPROTO_ESP) < 0) + printk(KERN_INFO "ipv6 esp close: can't remove protocol\n"); + if (xfrm_unregister_type(&esp6_type, AF_INET6) < 0) + printk(KERN_INFO "ipv6 esp close: can't remove xfrm type\n"); +} + +module_init(esp6_init); +module_exit(esp6_fini); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_ESP); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c new file mode 100644 index 00000000..79a485e8 --- /dev/null +++ b/net/ipv6/exthdrs.c @@ -0,0 +1,898 @@ +/* + * Extension Header handling for IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * Andi Kleen + * Alexey Kuznetsov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* Changes: + * yoshfuji : ensure not to overrun while parsing + * tlv options. + * Mitsuru KANDA @USAGI and: Remove ipv6_parse_exthdrs(). + * YOSHIFUJI Hideaki @USAGI Register inbound extension header + * handlers as inet6_protocol{}. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#include +#endif + +#include + +int ipv6_find_tlv(struct sk_buff *skb, int offset, int type) +{ + const unsigned char *nh = skb_network_header(skb); + int packet_len = skb->tail - skb->network_header; + struct ipv6_opt_hdr *hdr; + int len; + + if (offset + 2 > packet_len) + goto bad; + hdr = (struct ipv6_opt_hdr *)(nh + offset); + len = ((hdr->hdrlen + 1) << 3); + + if (offset + len > packet_len) + goto bad; + + offset += 2; + len -= 2; + + while (len > 0) { + int opttype = nh[offset]; + int optlen; + + if (opttype == type) + return offset; + + switch (opttype) { + case IPV6_TLV_PAD0: + optlen = 1; + break; + default: + optlen = nh[offset + 1] + 2; + if (optlen > len) + goto bad; + break; + } + offset += optlen; + len -= optlen; + } + /* not_found */ + bad: + return -1; +} +EXPORT_SYMBOL_GPL(ipv6_find_tlv); + +/* + * Parsing tlv encoded headers. 
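+ *
+ * Each option is a (type, length, value) triple; the Pad1 option
+ * (IPV6_TLV_PAD0) is a single zero byte with no length field, which
+ * is why the parsers above and below treat it as one byte long.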
+ * + * Parsing function "func" returns 1, if parsing succeed + * and 0, if it failed. + * It MUST NOT touch skb->h. + */ + +struct tlvtype_proc { + int type; + int (*func)(struct sk_buff *skb, int offset); +}; + +/********************* + Generic functions + *********************/ + +/* An unknown option is detected, decide what to do */ + +static int ip6_tlvopt_unknown(struct sk_buff *skb, int optoff) +{ + switch ((skb_network_header(skb)[optoff] & 0xC0) >> 6) { + case 0: /* ignore */ + return 1; + + case 1: /* drop packet */ + break; + + case 3: /* Send ICMP if not a multicast address and drop packet */ + /* Actually, it is redundant check. icmp_send + will recheck in any case. + */ + if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) + break; + case 2: /* send ICMP PARM PROB regardless and drop packet */ + icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, optoff); + return 0; + } + + kfree_skb(skb); + return 0; +} + +/* Parse tlv encoded option header (hop-by-hop or destination) */ + +static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff *skb) +{ + struct tlvtype_proc *curr; + const unsigned char *nh = skb_network_header(skb); + int off = skb_network_header_len(skb); + int len = (skb_transport_header(skb)[1] + 1) << 3; + + if (skb_transport_offset(skb) + len > skb_headlen(skb)) + goto bad; + + off += 2; + len -= 2; + + while (len > 0) { + int optlen = nh[off + 1] + 2; + + switch (nh[off]) { + case IPV6_TLV_PAD0: + optlen = 1; + break; + + case IPV6_TLV_PADN: + break; + + default: /* Other TLV code so scan list */ + if (optlen > len) + goto bad; + for (curr=procs; curr->type >= 0; curr++) { + if (curr->type == nh[off]) { + /* type specific length/alignment + checks will be performed in the + func(). */ + if (curr->func(skb, off) == 0) + return 0; + break; + } + } + if (curr->type < 0) { + if (ip6_tlvopt_unknown(skb, off) == 0) + return 0; + } + break; + } + off += optlen; + len -= optlen; + } + if (len == 0) + return 1; +bad: + kfree_skb(skb); + return 0; +} + +/***************************** + Destination options header. 
+ *****************************/ + +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +static int ipv6_dest_hao(struct sk_buff *skb, int optoff) +{ + struct ipv6_destopt_hao *hao; + struct inet6_skb_parm *opt = IP6CB(skb); + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct in6_addr tmp_addr; + int ret; + + if (opt->dsthao) { + LIMIT_NETDEBUG(KERN_DEBUG "hao duplicated\n"); + goto discard; + } + opt->dsthao = opt->dst1; + opt->dst1 = 0; + + hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) + optoff); + + if (hao->length != 16) { + LIMIT_NETDEBUG( + KERN_DEBUG "hao invalid option length = %d\n", hao->length); + goto discard; + } + + if (!(ipv6_addr_type(&hao->addr) & IPV6_ADDR_UNICAST)) { + LIMIT_NETDEBUG( + KERN_DEBUG "hao is not an unicast addr: %pI6\n", &hao->addr); + goto discard; + } + + ret = xfrm6_input_addr(skb, (xfrm_address_t *)&ipv6h->daddr, + (xfrm_address_t *)&hao->addr, IPPROTO_DSTOPTS); + if (unlikely(ret < 0)) + goto discard; + + if (skb_cloned(skb)) { + if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto discard; + + /* update all variable using below by copied skbuff */ + hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) + + optoff); + ipv6h = ipv6_hdr(skb); + } + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + + ipv6_addr_copy(&tmp_addr, &ipv6h->saddr); + ipv6_addr_copy(&ipv6h->saddr, &hao->addr); + ipv6_addr_copy(&hao->addr, &tmp_addr); + + if (skb->tstamp.tv64 == 0) + __net_timestamp(skb); + + return 1; + + discard: + kfree_skb(skb); + return 0; +} +#endif + +static struct tlvtype_proc tlvprocdestopt_lst[] = { +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + { + .type = IPV6_TLV_HAO, + .func = ipv6_dest_hao, + }, +#endif + {-1, NULL} +}; + +static int ipv6_destopt_rcv(struct sk_buff *skb) +{ + struct inet6_skb_parm *opt = IP6CB(skb); +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + __u16 dstbuf; +#endif + struct dst_entry *dst; + + if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || + !pskb_may_pull(skb, (skb_transport_offset(skb) + + ((skb_transport_header(skb)[1] + 1) << 3)))) { + IP6_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); + kfree_skb(skb); + return -1; + } + + opt->lastopt = opt->dst1 = skb_network_header_len(skb); +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + dstbuf = opt->dst1; +#endif + + dst = dst_clone(skb_dst(skb)); + if (ip6_parse_tlv(tlvprocdestopt_lst, skb)) { + dst_release(dst); + skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3; + opt = IP6CB(skb); +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + opt->nhoff = dstbuf; +#else + opt->nhoff = opt->dst1; +#endif + return 1; + } + + IP6_INC_STATS_BH(dev_net(dst->dev), + ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); + dst_release(dst); + return -1; +} + +/******************************** + Routing header. 
+ ********************************/ + +/* called with rcu_read_lock() */ +static int ipv6_rthdr_rcv(struct sk_buff *skb) +{ + struct inet6_skb_parm *opt = IP6CB(skb); + struct in6_addr *addr = NULL; + struct in6_addr daddr; + struct inet6_dev *idev; + int n, i; + struct ipv6_rt_hdr *hdr; + struct rt0_hdr *rthdr; + struct net *net = dev_net(skb->dev); + int accept_source_route = net->ipv6.devconf_all->accept_source_route; + + idev = __in6_dev_get(skb->dev); + if (idev && accept_source_route > idev->cnf.accept_source_route) + accept_source_route = idev->cnf.accept_source_route; + + if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || + !pskb_may_pull(skb, (skb_transport_offset(skb) + + ((skb_transport_header(skb)[1] + 1) << 3)))) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); + kfree_skb(skb); + return -1; + } + + hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb); + + if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) || + skb->pkt_type != PACKET_HOST) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INADDRERRORS); + kfree_skb(skb); + return -1; + } + +looped_back: + if (hdr->segments_left == 0) { + switch (hdr->type) { +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + case IPV6_SRCRT_TYPE_2: + /* Silently discard type 2 header unless it was + * processed by own + */ + if (!addr) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INADDRERRORS); + kfree_skb(skb); + return -1; + } + break; +#endif + default: + break; + } + + opt->lastopt = opt->srcrt = skb_network_header_len(skb); + skb->transport_header += (hdr->hdrlen + 1) << 3; + opt->dst0 = opt->dst1; + opt->dst1 = 0; + opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb); + return 1; + } + + switch (hdr->type) { +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + case IPV6_SRCRT_TYPE_2: + if (accept_source_route < 0) + goto unknown_rh; + /* Silently discard invalid RTH type 2 */ + if (hdr->hdrlen != 2 || hdr->segments_left != 1) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); + kfree_skb(skb); + return -1; + } + break; +#endif + default: + goto unknown_rh; + } + + /* + * This is the routing header forwarding algorithm from + * RFC 2460, page 16. + */ + + n = hdr->hdrlen >> 1; + + if (hdr->segments_left > n) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, + ((&hdr->segments_left) - + skb_network_header(skb))); + return -1; + } + + /* We are about to mangle packet header. Be careful! + Do not damage packets queued somewhere. 
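+ A cloned skb still shares its data area with other holders, so it
+ is copied via pskb_expand_head() before the addresses are swapped.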
+ */ + if (skb_cloned(skb)) { + /* the copy is a forwarded packet */ + if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + return -1; + } + hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb); + } + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + + i = n - --hdr->segments_left; + + rthdr = (struct rt0_hdr *) hdr; + addr = rthdr->addr; + addr += i - 1; + + switch (hdr->type) { +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + case IPV6_SRCRT_TYPE_2: + if (xfrm6_input_addr(skb, (xfrm_address_t *)addr, + (xfrm_address_t *)&ipv6_hdr(skb)->saddr, + IPPROTO_ROUTING) < 0) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INADDRERRORS); + kfree_skb(skb); + return -1; + } + if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INADDRERRORS); + kfree_skb(skb); + return -1; + } + break; +#endif + default: + break; + } + + if (ipv6_addr_is_multicast(addr)) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INADDRERRORS); + kfree_skb(skb); + return -1; + } + + ipv6_addr_copy(&daddr, addr); + ipv6_addr_copy(addr, &ipv6_hdr(skb)->daddr); + ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &daddr); + + skb_dst_drop(skb); + ip6_route_input(skb); + if (skb_dst(skb)->error) { + skb_push(skb, skb->data - skb_network_header(skb)); + dst_input(skb); + return -1; + } + + if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) { + if (ipv6_hdr(skb)->hop_limit <= 1) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); + icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, + 0); + kfree_skb(skb); + return -1; + } + ipv6_hdr(skb)->hop_limit--; + goto looped_back; + } + + skb_push(skb, skb->data - skb_network_header(skb)); + dst_input(skb); + return -1; + +unknown_rh: + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, + (&hdr->type) - skb_network_header(skb)); + return -1; +} + +static const struct inet6_protocol rthdr_protocol = { + .handler = ipv6_rthdr_rcv, + .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_GSO_EXTHDR, +}; + +static const struct inet6_protocol destopt_protocol = { + .handler = ipv6_destopt_rcv, + .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_GSO_EXTHDR, +}; + +static const struct inet6_protocol nodata_protocol = { + .handler = dst_discard, + .flags = INET6_PROTO_NOPOLICY, +}; + +int __init ipv6_exthdrs_init(void) +{ + int ret; + + ret = inet6_add_protocol(&rthdr_protocol, IPPROTO_ROUTING); + if (ret) + goto out; + + ret = inet6_add_protocol(&destopt_protocol, IPPROTO_DSTOPTS); + if (ret) + goto out_rthdr; + + ret = inet6_add_protocol(&nodata_protocol, IPPROTO_NONE); + if (ret) + goto out_destopt; + +out: + return ret; +out_rthdr: + inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING); +out_destopt: + inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS); + goto out; +}; + +void ipv6_exthdrs_exit(void) +{ + inet6_del_protocol(&nodata_protocol, IPPROTO_NONE); + inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS); + inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING); +} + +/********************************** + Hop-by-hop options. + **********************************/ + +/* + * Note: we cannot rely on skb_dst(skb) before we assign it in ip6_route_input(). + */ +static inline struct inet6_dev *ipv6_skb_idev(struct sk_buff *skb) +{ + return skb_dst(skb) ? 
ip6_dst_idev(skb_dst(skb)) : __in6_dev_get(skb->dev); +} + +static inline struct net *ipv6_skb_net(struct sk_buff *skb) +{ + return skb_dst(skb) ? dev_net(skb_dst(skb)->dev) : dev_net(skb->dev); +} + +/* Router Alert as of RFC 2711 */ + +static int ipv6_hop_ra(struct sk_buff *skb, int optoff) +{ + const unsigned char *nh = skb_network_header(skb); + + if (nh[optoff + 1] == 2) { + IP6CB(skb)->ra = optoff; + return 1; + } + LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", + nh[optoff + 1]); + kfree_skb(skb); + return 0; +} + +/* Jumbo payload */ + +static int ipv6_hop_jumbo(struct sk_buff *skb, int optoff) +{ + const unsigned char *nh = skb_network_header(skb); + struct net *net = ipv6_skb_net(skb); + u32 pkt_len; + + if (nh[optoff + 1] != 4 || (optoff & 3) != 2) { + LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", + nh[optoff+1]); + IP6_INC_STATS_BH(net, ipv6_skb_idev(skb), + IPSTATS_MIB_INHDRERRORS); + goto drop; + } + + pkt_len = ntohl(*(__be32 *)(nh + optoff + 2)); + if (pkt_len <= IPV6_MAXPLEN) { + IP6_INC_STATS_BH(net, ipv6_skb_idev(skb), + IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2); + return 0; + } + if (ipv6_hdr(skb)->payload_len) { + IP6_INC_STATS_BH(net, ipv6_skb_idev(skb), + IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff); + return 0; + } + + if (pkt_len > skb->len - sizeof(struct ipv6hdr)) { + IP6_INC_STATS_BH(net, ipv6_skb_idev(skb), + IPSTATS_MIB_INTRUNCATEDPKTS); + goto drop; + } + + if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) + goto drop; + + return 1; + +drop: + kfree_skb(skb); + return 0; +} + +static struct tlvtype_proc tlvprochopopt_lst[] = { + { + .type = IPV6_TLV_ROUTERALERT, + .func = ipv6_hop_ra, + }, + { + .type = IPV6_TLV_JUMBO, + .func = ipv6_hop_jumbo, + }, + { -1, } +}; + +int ipv6_parse_hopopts(struct sk_buff *skb) +{ + struct inet6_skb_parm *opt = IP6CB(skb); + + /* + * skb_network_header(skb) is equal to skb->data, and + * skb_network_header_len(skb) is always equal to + * sizeof(struct ipv6hdr) by definition of + * hop-by-hop options. + */ + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + 8) || + !pskb_may_pull(skb, (sizeof(struct ipv6hdr) + + ((skb_transport_header(skb)[1] + 1) << 3)))) { + kfree_skb(skb); + return -1; + } + + opt->hop = sizeof(struct ipv6hdr); + if (ip6_parse_tlv(tlvprochopopt_lst, skb)) { + skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3; + opt = IP6CB(skb); + opt->nhoff = sizeof(struct ipv6hdr); + return 1; + } + return -1; +} + +/* + * Creating outbound headers. + * + * "build" functions work when skb is filled from head to tail (datagram) + * "push" functions work when headers are added from tail to head (tcp) + * + * In both cases we assume, that caller reserved enough room + * for headers. 
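+ *
+ * Since skb_push() prepends, ipv6_push_nfrag_opts() below pushes in
+ * reverse wire order (routing header, then the destination options
+ * that precede it, then hop-by-hop) so that hop-by-hop ends up first.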
+ */ + +static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto, + struct ipv6_rt_hdr *opt, + struct in6_addr **addr_p) +{ + struct rt0_hdr *phdr, *ihdr; + int hops; + + ihdr = (struct rt0_hdr *) opt; + + phdr = (struct rt0_hdr *) skb_push(skb, (ihdr->rt_hdr.hdrlen + 1) << 3); + memcpy(phdr, ihdr, sizeof(struct rt0_hdr)); + + hops = ihdr->rt_hdr.hdrlen >> 1; + + if (hops > 1) + memcpy(phdr->addr, ihdr->addr + 1, + (hops - 1) * sizeof(struct in6_addr)); + + ipv6_addr_copy(phdr->addr + (hops - 1), *addr_p); + *addr_p = ihdr->addr; + + phdr->rt_hdr.nexthdr = *proto; + *proto = NEXTHDR_ROUTING; +} + +static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv6_opt_hdr *opt) +{ + struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb_push(skb, ipv6_optlen(opt)); + + memcpy(h, opt, ipv6_optlen(opt)); + h->nexthdr = *proto; + *proto = type; +} + +void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, + u8 *proto, + struct in6_addr **daddr) +{ + if (opt->srcrt) { + ipv6_push_rthdr(skb, proto, opt->srcrt, daddr); + /* + * IPV6_RTHDRDSTOPTS is ignored + * unless IPV6_RTHDR is set (RFC3542). + */ + if (opt->dst0opt) + ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt); + } + if (opt->hopopt) + ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt); +} + +EXPORT_SYMBOL(ipv6_push_nfrag_opts); + +void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto) +{ + if (opt->dst1opt) + ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt); +} + +struct ipv6_txoptions * +ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) +{ + struct ipv6_txoptions *opt2; + + opt2 = sock_kmalloc(sk, opt->tot_len, GFP_ATOMIC); + if (opt2) { + long dif = (char*)opt2 - (char*)opt; + memcpy(opt2, opt, opt->tot_len); + if (opt2->hopopt) + *((char**)&opt2->hopopt) += dif; + if (opt2->dst0opt) + *((char**)&opt2->dst0opt) += dif; + if (opt2->dst1opt) + *((char**)&opt2->dst1opt) += dif; + if (opt2->srcrt) + *((char**)&opt2->srcrt) += dif; + } + return opt2; +} + +EXPORT_SYMBOL_GPL(ipv6_dup_options); + +static int ipv6_renew_option(void *ohdr, + struct ipv6_opt_hdr __user *newopt, int newoptlen, + int inherit, + struct ipv6_opt_hdr **hdr, + char **p) +{ + if (inherit) { + if (ohdr) { + memcpy(*p, ohdr, ipv6_optlen((struct ipv6_opt_hdr *)ohdr)); + *hdr = (struct ipv6_opt_hdr *)*p; + *p += CMSG_ALIGN(ipv6_optlen(*(struct ipv6_opt_hdr **)hdr)); + } + } else { + if (newopt) { + if (copy_from_user(*p, newopt, newoptlen)) + return -EFAULT; + *hdr = (struct ipv6_opt_hdr *)*p; + if (ipv6_optlen(*(struct ipv6_opt_hdr **)hdr) > newoptlen) + return -EINVAL; + *p += CMSG_ALIGN(newoptlen); + } + } + return 0; +} + +struct ipv6_txoptions * +ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, + int newtype, + struct ipv6_opt_hdr __user *newopt, int newoptlen) +{ + int tot_len = 0; + char *p; + struct ipv6_txoptions *opt2; + int err; + + if (opt) { + if (newtype != IPV6_HOPOPTS && opt->hopopt) + tot_len += CMSG_ALIGN(ipv6_optlen(opt->hopopt)); + if (newtype != IPV6_RTHDRDSTOPTS && opt->dst0opt) + tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst0opt)); + if (newtype != IPV6_RTHDR && opt->srcrt) + tot_len += CMSG_ALIGN(ipv6_optlen(opt->srcrt)); + if (newtype != IPV6_DSTOPTS && opt->dst1opt) + tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst1opt)); + } + + if (newopt && newoptlen) + tot_len += CMSG_ALIGN(newoptlen); + + if (!tot_len) + return NULL; + + tot_len += sizeof(*opt2); + opt2 = sock_kmalloc(sk, tot_len, GFP_ATOMIC); + if (!opt2) + return ERR_PTR(-ENOBUFS); + + 
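/* The kept and renewed headers are packed back to back right after
+ * the ipv6_txoptions struct itself; p walks through that area in the
+ * ipv6_renew_option() calls below. */
+ 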
memset(opt2, 0, tot_len); + + opt2->tot_len = tot_len; + p = (char *)(opt2 + 1); + + err = ipv6_renew_option(opt ? opt->hopopt : NULL, newopt, newoptlen, + newtype != IPV6_HOPOPTS, + &opt2->hopopt, &p); + if (err) + goto out; + + err = ipv6_renew_option(opt ? opt->dst0opt : NULL, newopt, newoptlen, + newtype != IPV6_RTHDRDSTOPTS, + &opt2->dst0opt, &p); + if (err) + goto out; + + err = ipv6_renew_option(opt ? opt->srcrt : NULL, newopt, newoptlen, + newtype != IPV6_RTHDR, + (struct ipv6_opt_hdr **)&opt2->srcrt, &p); + if (err) + goto out; + + err = ipv6_renew_option(opt ? opt->dst1opt : NULL, newopt, newoptlen, + newtype != IPV6_DSTOPTS, + &opt2->dst1opt, &p); + if (err) + goto out; + + opt2->opt_nflen = (opt2->hopopt ? ipv6_optlen(opt2->hopopt) : 0) + + (opt2->dst0opt ? ipv6_optlen(opt2->dst0opt) : 0) + + (opt2->srcrt ? ipv6_optlen(opt2->srcrt) : 0); + opt2->opt_flen = (opt2->dst1opt ? ipv6_optlen(opt2->dst1opt) : 0); + + return opt2; +out: + sock_kfree_s(sk, opt2, opt2->tot_len); + return ERR_PTR(err); +} + +struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, + struct ipv6_txoptions *opt) +{ + /* + * ignore the dest before srcrt unless srcrt is being included. + * --yoshfuji + */ + if (opt && opt->dst0opt && !opt->srcrt) { + if (opt_space != opt) { + memcpy(opt_space, opt, sizeof(*opt_space)); + opt = opt_space; + } + opt->opt_nflen -= ipv6_optlen(opt->dst0opt); + opt->dst0opt = NULL; + } + + return opt; +} + +/** + * fl6_update_dst - update flowi destination address with info given + * by srcrt option, if any. + * + * @fl6: flowi6 for which daddr is to be updated + * @opt: struct ipv6_txoptions in which to look for srcrt opt + * @orig: copy of original daddr address if modified + * + * Returns NULL if no txoptions or no srcrt, otherwise returns orig + * and initial value of fl6->daddr set in orig + */ +struct in6_addr *fl6_update_dst(struct flowi6 *fl6, + const struct ipv6_txoptions *opt, + struct in6_addr *orig) +{ + if (!opt || !opt->srcrt) + return NULL; + + ipv6_addr_copy(orig, &fl6->daddr); + ipv6_addr_copy(&fl6->daddr, ((struct rt0_hdr *)opt->srcrt)->addr); + return orig; +} + +EXPORT_SYMBOL_GPL(fl6_update_dst); diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c new file mode 100644 index 00000000..14ed0a95 --- /dev/null +++ b/net/ipv6/exthdrs_core.c @@ -0,0 +1,106 @@ +/* + * IPv6 library code, needed by static components when full IPv6 support is + * not configured or static. + */ +#include + +/* + * find out if nexthdr is a well-known extension header or a protocol + */ + +int ipv6_ext_hdr(u8 nexthdr) +{ + /* + * find out if nexthdr is an extension header or a protocol + */ + return (nexthdr == NEXTHDR_HOP) || + (nexthdr == NEXTHDR_ROUTING) || + (nexthdr == NEXTHDR_FRAGMENT) || + (nexthdr == NEXTHDR_AUTH) || + (nexthdr == NEXTHDR_NONE) || + (nexthdr == NEXTHDR_DEST); +} + +/* + * Skip any extension headers. This is used by the ICMP module. + * + * Note that strictly speaking this conflicts with RFC 2460 4.0: + * ...The contents and semantics of each extension header determine whether + * or not to proceed to the next header. Therefore, extension headers must + * be processed strictly in the order they appear in the packet; a + * receiver must not, for example, scan through a packet looking for a + * particular kind of extension header and process that header prior to + * processing all preceding ones. + * + * We do exactly this. This is a protocol bug. 
We can't decide after a + * seeing an unknown discard-with-error flavour TLV option if it's a + * ICMP error message or not (errors should never be send in reply to + * ICMP error messages). + * + * But I see no other way to do this. This might need to be reexamined + * when Linux implements ESP (and maybe AUTH) headers. + * --AK + * + * This function parses (probably truncated) exthdr set "hdr". + * "nexthdrp" initially points to some place, + * where type of the first header can be found. + * + * It skips all well-known exthdrs, and returns pointer to the start + * of unparsable area i.e. the first header with unknown type. + * If it is not NULL *nexthdr is updated by type/protocol of this header. + * + * NOTES: - if packet terminated with NEXTHDR_NONE it returns NULL. + * - it may return pointer pointing beyond end of packet, + * if the last recognized header is truncated in the middle. + * - if packet is truncated, so that all parsed headers are skipped, + * it returns NULL. + * - First fragment header is skipped, not-first ones + * are considered as unparsable. + * - ESP is unparsable for now and considered like + * normal payload protocol. + * - Note also special handling of AUTH header. Thanks to IPsec wizards. + * + * --ANK (980726) + */ + +int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp) +{ + u8 nexthdr = *nexthdrp; + + while (ipv6_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr _hdr, *hp; + int hdrlen; + + if (nexthdr == NEXTHDR_NONE) + return -1; + hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); + if (hp == NULL) + return -1; + if (nexthdr == NEXTHDR_FRAGMENT) { + __be16 _frag_off, *fp; + fp = skb_header_pointer(skb, + start+offsetof(struct frag_hdr, + frag_off), + sizeof(_frag_off), + &_frag_off); + if (fp == NULL) + return -1; + + if (ntohs(*fp) & ~0x7) + break; + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hp->hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(hp); + + nexthdr = hp->nexthdr; + start += hdrlen; + } + + *nexthdrp = nexthdr; + return start; +} + +EXPORT_SYMBOL(ipv6_ext_hdr); +EXPORT_SYMBOL(ipv6_skip_exthdr); diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c new file mode 100644 index 00000000..34d244df --- /dev/null +++ b/net/ipv6/fib6_rules.c @@ -0,0 +1,307 @@ +/* + * net/ipv6/fib6_rules.c IPv6 Routing Policy Rules + * + * Copyright (C)2003-2006 Helsinki University of Technology + * Copyright (C)2003-2006 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2. 
+ * + * Authors + * Thomas Graf + * Ville Nuorvala + */ + +#include + +#include +#include +#include +#include +#include + +struct fib6_rule +{ + struct fib_rule common; + struct rt6key src; + struct rt6key dst; + u8 tclass; +}; + +struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, + int flags, pol_lookup_t lookup) +{ + struct fib_lookup_arg arg = { + .lookup_ptr = lookup, + .flags = FIB_LOOKUP_NOREF, + }; + + fib_rules_lookup(net->ipv6.fib6_rules_ops, + flowi6_to_flowi(fl6), flags, &arg); + + if (arg.result) + return arg.result; + + dst_hold(&net->ipv6.ip6_null_entry->dst); + return &net->ipv6.ip6_null_entry->dst; +} + +static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, + int flags, struct fib_lookup_arg *arg) +{ + struct flowi6 *flp6 = &flp->u.ip6; + struct rt6_info *rt = NULL; + struct fib6_table *table; + struct net *net = rule->fr_net; + pol_lookup_t lookup = arg->lookup_ptr; + + switch (rule->action) { + case FR_ACT_TO_TBL: + break; + case FR_ACT_UNREACHABLE: + rt = net->ipv6.ip6_null_entry; + goto discard_pkt; + default: + case FR_ACT_BLACKHOLE: + rt = net->ipv6.ip6_blk_hole_entry; + goto discard_pkt; + case FR_ACT_PROHIBIT: + rt = net->ipv6.ip6_prohibit_entry; + goto discard_pkt; + } + + table = fib6_get_table(net, rule->table); + if (table) + rt = lookup(net, table, flp6, flags); + + if (rt != net->ipv6.ip6_null_entry) { + struct fib6_rule *r = (struct fib6_rule *)rule; + + /* + * If we need to find a source address for this traffic, + * we check the result if it meets requirement of the rule. + */ + if ((rule->flags & FIB_RULE_FIND_SADDR) && + r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) { + struct in6_addr saddr; + + if (ipv6_dev_get_saddr(net, + ip6_dst_idev(&rt->dst)->dev, + &flp6->daddr, + rt6_flags2srcprefs(flags), + &saddr)) + goto again; + if (!ipv6_prefix_equal(&saddr, &r->src.addr, + r->src.plen)) + goto again; + ipv6_addr_copy(&flp6->saddr, &saddr); + } + goto out; + } +again: + dst_release(&rt->dst); + rt = NULL; + goto out; + +discard_pkt: + dst_hold(&rt->dst); +out: + arg->result = rt; + return rt == NULL ? -EAGAIN : 0; +} + + +static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) +{ + struct fib6_rule *r = (struct fib6_rule *) rule; + struct flowi6 *fl6 = &fl->u.ip6; + + if (r->dst.plen && + !ipv6_prefix_equal(&fl6->daddr, &r->dst.addr, r->dst.plen)) + return 0; + + /* + * If FIB_RULE_FIND_SADDR is set and we do not have a + * source address for the traffic, we defer check for + * source address. 
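+ * (The deferred check is then done in fib6_rule_action() above, once
+ * a source address has been selected via ipv6_dev_get_saddr().)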
+ */ + if (r->src.plen) { + if (flags & RT6_LOOKUP_F_HAS_SADDR) { + if (!ipv6_prefix_equal(&fl6->saddr, &r->src.addr, + r->src.plen)) + return 0; + } else if (!(r->common.flags & FIB_RULE_FIND_SADDR)) + return 0; + } + + if (r->tclass && r->tclass != ((ntohl(fl6->flowlabel) >> 20) & 0xff)) + return 0; + + return 1; +} + +static const struct nla_policy fib6_rule_policy[FRA_MAX+1] = { + FRA_GENERIC_POLICY, +}; + +static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, + struct fib_rule_hdr *frh, + struct nlattr **tb) +{ + int err = -EINVAL; + struct net *net = sock_net(skb->sk); + struct fib6_rule *rule6 = (struct fib6_rule *) rule; + + if (rule->action == FR_ACT_TO_TBL) { + if (rule->table == RT6_TABLE_UNSPEC) + goto errout; + + if (fib6_new_table(net, rule->table) == NULL) { + err = -ENOBUFS; + goto errout; + } + } + + if (frh->src_len) + nla_memcpy(&rule6->src.addr, tb[FRA_SRC], + sizeof(struct in6_addr)); + + if (frh->dst_len) + nla_memcpy(&rule6->dst.addr, tb[FRA_DST], + sizeof(struct in6_addr)); + + rule6->src.plen = frh->src_len; + rule6->dst.plen = frh->dst_len; + rule6->tclass = frh->tos; + + err = 0; +errout: + return err; +} + +static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, + struct nlattr **tb) +{ + struct fib6_rule *rule6 = (struct fib6_rule *) rule; + + if (frh->src_len && (rule6->src.plen != frh->src_len)) + return 0; + + if (frh->dst_len && (rule6->dst.plen != frh->dst_len)) + return 0; + + if (frh->tos && (rule6->tclass != frh->tos)) + return 0; + + if (frh->src_len && + nla_memcmp(tb[FRA_SRC], &rule6->src.addr, sizeof(struct in6_addr))) + return 0; + + if (frh->dst_len && + nla_memcmp(tb[FRA_DST], &rule6->dst.addr, sizeof(struct in6_addr))) + return 0; + + return 1; +} + +static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb, + struct fib_rule_hdr *frh) +{ + struct fib6_rule *rule6 = (struct fib6_rule *) rule; + + frh->dst_len = rule6->dst.plen; + frh->src_len = rule6->src.plen; + frh->tos = rule6->tclass; + + if (rule6->dst.plen) + NLA_PUT(skb, FRA_DST, sizeof(struct in6_addr), + &rule6->dst.addr); + + if (rule6->src.plen) + NLA_PUT(skb, FRA_SRC, sizeof(struct in6_addr), + &rule6->src.addr); + + return 0; + +nla_put_failure: + return -ENOBUFS; +} + +static u32 fib6_rule_default_pref(struct fib_rules_ops *ops) +{ + return 0x3FFF; +} + +static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule) +{ + return nla_total_size(16) /* dst */ + + nla_total_size(16); /* src */ +} + +static const struct fib_rules_ops __net_initdata fib6_rules_ops_template = { + .family = AF_INET6, + .rule_size = sizeof(struct fib6_rule), + .addr_size = sizeof(struct in6_addr), + .action = fib6_rule_action, + .match = fib6_rule_match, + .configure = fib6_rule_configure, + .compare = fib6_rule_compare, + .fill = fib6_rule_fill, + .default_pref = fib6_rule_default_pref, + .nlmsg_payload = fib6_rule_nlmsg_payload, + .nlgroup = RTNLGRP_IPV6_RULE, + .policy = fib6_rule_policy, + .owner = THIS_MODULE, + .fro_net = &init_net, +}; + +static int __net_init fib6_rules_net_init(struct net *net) +{ + struct fib_rules_ops *ops; + int err = -ENOMEM; + + ops = fib_rules_register(&fib6_rules_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + net->ipv6.fib6_rules_ops = ops; + + + err = fib_default_rule_add(net->ipv6.fib6_rules_ops, 0, + RT6_TABLE_LOCAL, 0); + if (err) + goto out_fib6_rules_ops; + + err = fib_default_rule_add(net->ipv6.fib6_rules_ops, + 0x7FFE, RT6_TABLE_MAIN, 0); + if (err) + goto out_fib6_rules_ops; + +out: + return err; + 
+out_fib6_rules_ops: + fib_rules_unregister(ops); + goto out; +} + +static void __net_exit fib6_rules_net_exit(struct net *net) +{ + fib_rules_unregister(net->ipv6.fib6_rules_ops); +} + +static struct pernet_operations fib6_rules_net_ops = { + .init = fib6_rules_net_init, + .exit = fib6_rules_net_exit, +}; + +int __init fib6_rules_init(void) +{ + return register_pernet_subsys(&fib6_rules_net_ops); +} + + +void fib6_rules_cleanup(void) +{ + unregister_pernet_subsys(&fib6_rules_net_ops); +} diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c new file mode 100644 index 00000000..11900417 --- /dev/null +++ b/net/ipv6/icmp.c @@ -0,0 +1,982 @@ +/* + * Internet Control Message Protocol (ICMPv6) + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * + * Based on net/ipv4/icmp.c + * + * RFC 1885 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * + * Andi Kleen : exception handling + * Andi Kleen add rate limits. never reply to a icmp. + * add more length checks and other fixes. + * yoshfuji : ensure to sent parameter problem for + * fragments. + * YOSHIFUJI Hideaki @USAGI: added sysctl for icmp rate limit. + * Randy Dunlap and + * YOSHIFUJI Hideaki @USAGI: Per-interface statistics support + * Kazunori MIYAZAWA @USAGI: change output process to use ip6_append_data + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_SYSCTL +#include +#endif + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * The ICMP socket(s). This is the most convenient way to flow control + * our ICMP output as well as maintain a clean interface throughout + * all layers. All Socketless IP sends will soon be gone. + * + * On SMP we have one ICMP socket per-cpu. + */ +static inline struct sock *icmpv6_sk(struct net *net) +{ + return net->ipv6.icmp_sk[smp_processor_id()]; +} + +static int icmpv6_rcv(struct sk_buff *skb); + +static const struct inet6_protocol icmpv6_protocol = { + .handler = icmpv6_rcv, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + +static __inline__ struct sock *icmpv6_xmit_lock(struct net *net) +{ + struct sock *sk; + + local_bh_disable(); + + sk = icmpv6_sk(net); + if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { + /* This can happen if the output path (f.e. SIT or + * ip6ip6 tunnel) signals dst_link_failure() for an + * outgoing ICMP6 packet. + */ + local_bh_enable(); + return NULL; + } + return sk; +} + +static __inline__ void icmpv6_xmit_unlock(struct sock *sk) +{ + spin_unlock_bh(&sk->sk_lock.slock); +} + +/* + * Slightly more convenient version of icmpv6_send. + */ +void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) +{ + icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos); + kfree_skb(skb); +} + +/* + * Figure out, may we reply to this packet with icmp error. + * + * We do not reply, if: + * - it was icmp error message. + * - it is truncated, so that it is known, that protocol is ICMPV6 + * (i.e. 
in the middle of some exthdr) + * + * --ANK (980726) + */ + +static int is_ineligible(struct sk_buff *skb) +{ + int ptr = (u8 *)(ipv6_hdr(skb) + 1) - skb->data; + int len = skb->len - ptr; + __u8 nexthdr = ipv6_hdr(skb)->nexthdr; + + if (len < 0) + return 1; + + ptr = ipv6_skip_exthdr(skb, ptr, &nexthdr); + if (ptr < 0) + return 0; + if (nexthdr == IPPROTO_ICMPV6) { + u8 _type, *tp; + tp = skb_header_pointer(skb, + ptr+offsetof(struct icmp6hdr, icmp6_type), + sizeof(_type), &_type); + if (tp == NULL || + !(*tp & ICMPV6_INFOMSG_MASK)) + return 1; + } + return 0; +} + +/* + * Check the ICMP output rate limit + */ +static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type, + struct flowi6 *fl6) +{ + struct dst_entry *dst; + struct net *net = sock_net(sk); + bool res = false; + + /* Informational messages are not limited. */ + if (type & ICMPV6_INFOMSG_MASK) + return true; + + /* Do not limit pmtu discovery, it would break it. */ + if (type == ICMPV6_PKT_TOOBIG) + return true; + + /* + * Look up the output route. + * XXX: perhaps the expire for routing entries cloned by + * this lookup should be more aggressive (not longer than timeout). + */ + dst = ip6_route_output(net, sk, fl6); + if (dst->error) { + IP6_INC_STATS(net, ip6_dst_idev(dst), + IPSTATS_MIB_OUTNOROUTES); + } else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) { + res = true; + } else { + struct rt6_info *rt = (struct rt6_info *)dst; + int tmo = net->ipv6.sysctl.icmpv6_time; + + /* Give more bandwidth to wider prefixes. */ + if (rt->rt6i_dst.plen < 128) + tmo >>= ((128 - rt->rt6i_dst.plen)>>5); + + if (!rt->rt6i_peer) + rt6_bind_peer(rt, 1); + res = inet_peer_xrlim_allow(rt->rt6i_peer, tmo); + } + dst_release(dst); + return res; +} + +/* + * an inline helper for the "simple" if statement below + * checks if parameter problem report is caused by an + * unrecognized IPv6 option that has the Option Type + * highest-order two bits set to 10 + */ + +static __inline__ int opt_unrec(struct sk_buff *skb, __u32 offset) +{ + u8 _optval, *op; + + offset += skb_network_offset(skb); + op = skb_header_pointer(skb, offset, sizeof(_optval), &_optval); + if (op == NULL) + return 1; + return (*op & 0xC0) == 0x80; +} + +static int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct icmp6hdr *thdr, int len) +{ + struct sk_buff *skb; + struct icmp6hdr *icmp6h; + int err = 0; + + if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) + goto out; + + icmp6h = icmp6_hdr(skb); + memcpy(icmp6h, thdr, sizeof(struct icmp6hdr)); + icmp6h->icmp6_cksum = 0; + + if (skb_queue_len(&sk->sk_write_queue) == 1) { + skb->csum = csum_partial(icmp6h, + sizeof(struct icmp6hdr), skb->csum); + icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr, + &fl6->daddr, + len, fl6->flowi6_proto, + skb->csum); + } else { + __wsum tmp_csum = 0; + + skb_queue_walk(&sk->sk_write_queue, skb) { + tmp_csum = csum_add(tmp_csum, skb->csum); + } + + tmp_csum = csum_partial(icmp6h, + sizeof(struct icmp6hdr), tmp_csum); + icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr, + &fl6->daddr, + len, fl6->flowi6_proto, + tmp_csum); + } + ip6_push_pending_frames(sk); +out: + return err; +} + +struct icmpv6_msg { + struct sk_buff *skb; + int offset; + uint8_t type; +}; + +static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) +{ + struct icmpv6_msg *msg = (struct icmpv6_msg *) from; + struct sk_buff *org_skb = msg->skb; + __wsum csum = 0; + + csum = skb_copy_and_csum_bits(org_skb, msg->offset + offset, + to, len, csum); + skb->csum = 
csum_block_add(skb->csum, csum, odd); + if (!(msg->type & ICMPV6_INFOMSG_MASK)) + nf_ct_attach(skb, org_skb); + return 0; +} + +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +static void mip6_addr_swap(struct sk_buff *skb) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + struct inet6_skb_parm *opt = IP6CB(skb); + struct ipv6_destopt_hao *hao; + struct in6_addr tmp; + int off; + + if (opt->dsthao) { + off = ipv6_find_tlv(skb, opt->dsthao, IPV6_TLV_HAO); + if (likely(off >= 0)) { + hao = (struct ipv6_destopt_hao *) + (skb_network_header(skb) + off); + ipv6_addr_copy(&tmp, &iph->saddr); + ipv6_addr_copy(&iph->saddr, &hao->addr); + ipv6_addr_copy(&hao->addr, &tmp); + } + } +} +#else +static inline void mip6_addr_swap(struct sk_buff *skb) {} +#endif + +static struct dst_entry *icmpv6_route_lookup(struct net *net, struct sk_buff *skb, + struct sock *sk, struct flowi6 *fl6) +{ + struct dst_entry *dst, *dst2; + struct flowi6 fl2; + int err; + + err = ip6_dst_lookup(sk, &dst, fl6); + if (err) + return ERR_PTR(err); + + /* + * We won't send icmp if the destination is known + * anycast. + */ + if (((struct rt6_info *)dst)->rt6i_flags & RTF_ANYCAST) { + LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: acast source\n"); + dst_release(dst); + return ERR_PTR(-EINVAL); + } + + /* No need to clone since we're just using its address. */ + dst2 = dst; + + dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), sk, 0); + if (!IS_ERR(dst)) { + if (dst != dst2) + return dst; + } else { + if (PTR_ERR(dst) == -EPERM) + dst = NULL; + else + return dst; + } + + err = xfrm_decode_session_reverse(skb, flowi6_to_flowi(&fl2), AF_INET6); + if (err) + goto relookup_failed; + + err = ip6_dst_lookup(sk, &dst2, &fl2); + if (err) + goto relookup_failed; + + dst2 = xfrm_lookup(net, dst2, flowi6_to_flowi(&fl2), sk, XFRM_LOOKUP_ICMP); + if (!IS_ERR(dst2)) { + dst_release(dst); + dst = dst2; + } else { + err = PTR_ERR(dst2); + if (err == -EPERM) { + dst_release(dst); + return dst2; + } else + goto relookup_failed; + } + +relookup_failed: + if (dst) + return dst; + return ERR_PTR(err); +} + +/* + * Send an ICMP message in response to a packet in error + */ +void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) +{ + struct net *net = dev_net(skb->dev); + struct inet6_dev *idev = NULL; + struct ipv6hdr *hdr = ipv6_hdr(skb); + struct sock *sk; + struct ipv6_pinfo *np; + const struct in6_addr *saddr = NULL; + struct dst_entry *dst; + struct icmp6hdr tmp_hdr; + struct flowi6 fl6; + struct icmpv6_msg msg; + int iif = 0; + int addr_type = 0; + int len; + int hlimit; + int err = 0; + + if ((u8 *)hdr < skb->head || + (skb->network_header + sizeof(*hdr)) > skb->tail) + return; + + /* + * Make sure we respect the rules + * i.e. RFC 1885 2.4(e) + * Rule (e.1) is enforced by not using icmpv6_send + * in any code that processes icmp errors. + */ + addr_type = ipv6_addr_type(&hdr->daddr); + + if (ipv6_chk_addr(net, &hdr->daddr, skb->dev, 0)) + saddr = &hdr->daddr; + + /* + * Dest addr check + */ + + if ((addr_type & IPV6_ADDR_MULTICAST || skb->pkt_type != PACKET_HOST)) { + if (type != ICMPV6_PKT_TOOBIG && + !(type == ICMPV6_PARAMPROB && + code == ICMPV6_UNK_OPTION && + (opt_unrec(skb, info)))) + return; + + saddr = NULL; + } + + addr_type = ipv6_addr_type(&hdr->saddr); + + /* + * Source addr check + */ + + if (addr_type & IPV6_ADDR_LINKLOCAL) + iif = skb->dev->ifindex; + + /* + * Must not send error if the source does not uniquely + * identify a single node (RFC2463 Section 2.4). 
+ * We check unspecified / multicast addresses here, + * and anycast addresses will be checked later. + */ + if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { + LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n"); + return; + } + + /* + * Never answer to a ICMP packet. + */ + if (is_ineligible(skb)) { + LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: no reply to icmp error\n"); + return; + } + + mip6_addr_swap(skb); + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_ICMPV6; + ipv6_addr_copy(&fl6.daddr, &hdr->saddr); + if (saddr) + ipv6_addr_copy(&fl6.saddr, saddr); + fl6.flowi6_oif = iif; + fl6.fl6_icmp_type = type; + fl6.fl6_icmp_code = code; + security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + + sk = icmpv6_xmit_lock(net); + if (sk == NULL) + return; + np = inet6_sk(sk); + + if (!icmpv6_xrlim_allow(sk, type, &fl6)) + goto out; + + tmp_hdr.icmp6_type = type; + tmp_hdr.icmp6_code = code; + tmp_hdr.icmp6_cksum = 0; + tmp_hdr.icmp6_pointer = htonl(info); + + if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) + fl6.flowi6_oif = np->mcast_oif; + + dst = icmpv6_route_lookup(net, skb, sk, &fl6); + if (IS_ERR(dst)) + goto out; + + if (ipv6_addr_is_multicast(&fl6.daddr)) + hlimit = np->mcast_hops; + else + hlimit = np->hop_limit; + if (hlimit < 0) + hlimit = ip6_dst_hoplimit(dst); + + msg.skb = skb; + msg.offset = skb_network_offset(skb); + msg.type = type; + + len = skb->len - msg.offset; + len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) -sizeof(struct icmp6hdr)); + if (len < 0) { + LIMIT_NETDEBUG(KERN_DEBUG "icmp: len problem\n"); + goto out_dst_release; + } + + idev = in6_dev_get(skb->dev); + + err = ip6_append_data(sk, icmpv6_getfrag, &msg, + len + sizeof(struct icmp6hdr), + sizeof(struct icmp6hdr), hlimit, + np->tclass, NULL, &fl6, (struct rt6_info*)dst, + MSG_DONTWAIT, np->dontfrag); + if (err) { + ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTERRORS); + ip6_flush_pending_frames(sk); + goto out_put; + } + err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, len + sizeof(struct icmp6hdr)); + +out_put: + if (likely(idev != NULL)) + in6_dev_put(idev); +out_dst_release: + dst_release(dst); +out: + icmpv6_xmit_unlock(sk); +} + +EXPORT_SYMBOL(icmpv6_send); + +static void icmpv6_echo_reply(struct sk_buff *skb) +{ + struct net *net = dev_net(skb->dev); + struct sock *sk; + struct inet6_dev *idev; + struct ipv6_pinfo *np; + const struct in6_addr *saddr = NULL; + struct icmp6hdr *icmph = icmp6_hdr(skb); + struct icmp6hdr tmp_hdr; + struct flowi6 fl6; + struct icmpv6_msg msg; + struct dst_entry *dst; + int err = 0; + int hlimit; + + saddr = &ipv6_hdr(skb)->daddr; + + if (!ipv6_unicast_destination(skb)) + saddr = NULL; + + memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr)); + tmp_hdr.icmp6_type = ICMPV6_ECHO_REPLY; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_ICMPV6; + ipv6_addr_copy(&fl6.daddr, &ipv6_hdr(skb)->saddr); + if (saddr) + ipv6_addr_copy(&fl6.saddr, saddr); + fl6.flowi6_oif = skb->dev->ifindex; + fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; + security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + + sk = icmpv6_xmit_lock(net); + if (sk == NULL) + return; + np = inet6_sk(sk); + + if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) + fl6.flowi6_oif = np->mcast_oif; + + err = ip6_dst_lookup(sk, &dst, &fl6); + if (err) + goto out; + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0); + if (IS_ERR(dst)) + goto out; + + if (ipv6_addr_is_multicast(&fl6.daddr)) + hlimit = np->mcast_hops; + else + hlimit = 
np->hop_limit; + if (hlimit < 0) + hlimit = ip6_dst_hoplimit(dst); + + idev = in6_dev_get(skb->dev); + + msg.skb = skb; + msg.offset = 0; + msg.type = ICMPV6_ECHO_REPLY; + + err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), + sizeof(struct icmp6hdr), hlimit, np->tclass, NULL, &fl6, + (struct rt6_info*)dst, MSG_DONTWAIT, + np->dontfrag); + + if (err) { + ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTERRORS); + ip6_flush_pending_frames(sk); + goto out_put; + } + err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, skb->len + sizeof(struct icmp6hdr)); + +out_put: + if (likely(idev != NULL)) + in6_dev_put(idev); + dst_release(dst); +out: + icmpv6_xmit_unlock(sk); +} + +static void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) +{ + const struct inet6_protocol *ipprot; + int inner_offset; + int hash; + u8 nexthdr; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + return; + + nexthdr = ((struct ipv6hdr *)skb->data)->nexthdr; + if (ipv6_ext_hdr(nexthdr)) { + /* now skip over extension headers */ + inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr); + if (inner_offset<0) + return; + } else { + inner_offset = sizeof(struct ipv6hdr); + } + + /* Checkin header including 8 bytes of inner protocol header. */ + if (!pskb_may_pull(skb, inner_offset+8)) + return; + + /* BUGGG_FUTURE: we should try to parse exthdrs in this packet. + Without this we will not able f.e. to make source routed + pmtu discovery. + Corresponding argument (opt) to notifiers is already added. + --ANK (980726) + */ + + hash = nexthdr & (MAX_INET_PROTOS - 1); + + rcu_read_lock(); + ipprot = rcu_dereference(inet6_protos[hash]); + if (ipprot && ipprot->err_handler) + ipprot->err_handler(skb, NULL, type, code, inner_offset, info); + rcu_read_unlock(); + + raw6_icmp_error(skb, nexthdr, type, code, inner_offset, info); +} + +/* + * Handle icmp messages + */ + +static int icmpv6_rcv(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct inet6_dev *idev = __in6_dev_get(dev); + const struct in6_addr *saddr, *daddr; + const struct ipv6hdr *orig_hdr; + struct icmp6hdr *hdr; + u8 type; + + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + struct sec_path *sp = skb_sec_path(skb); + int nh; + + if (!(sp && sp->xvec[sp->len - 1]->props.flags & + XFRM_STATE_ICMP)) + goto drop_no_count; + + if (!pskb_may_pull(skb, sizeof(*hdr) + sizeof(*orig_hdr))) + goto drop_no_count; + + nh = skb_network_offset(skb); + skb_set_network_header(skb, sizeof(*hdr)); + + if (!xfrm6_policy_check_reverse(NULL, XFRM_POLICY_IN, skb)) + goto drop_no_count; + + skb_set_network_header(skb, nh); + } + + ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_INMSGS); + + saddr = &ipv6_hdr(skb)->saddr; + daddr = &ipv6_hdr(skb)->daddr; + + /* Perform checksum. 
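/*
 * Sketch of the extension-header walk that icmpv6_notify() above relies
 * on (the kernel uses ipv6_skip_exthdr()): start after the fixed 40-byte
 * IPv6 header and hop from header to header until a transport protocol
 * is reached.  Plain userspace C over a raw buffer; fragment and AH
 * length rules are deliberately elided from this sketch.
 */
#include <stdint.h>
#include <stddef.h>

static int skip_exthdrs_sketch(const uint8_t *pkt, size_t len, uint8_t *proto)
{
	size_t off = 40;			/* sizeof(struct ipv6hdr) */
	uint8_t nexthdr;

	if (len < 40)
		return -1;			/* truncated packet */
	nexthdr = pkt[6];			/* ipv6hdr->nexthdr */

	/* 0 = hop-by-hop, 43 = routing, 60 = destination options */
	while (nexthdr == 0 || nexthdr == 43 || nexthdr == 60) {
		if (off + 2 > len)
			return -1;		/* truncated extension header */
		uint8_t hdr_next = pkt[off];
		size_t hdr_len = ((size_t)pkt[off + 1] + 1) * 8;

		off += hdr_len;
		nexthdr = hdr_next;
	}

	*proto = nexthdr;			/* e.g. TCP, UDP, ICMPv6 */
	return (int)off;			/* offset of the inner header */
}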
*/ + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (!csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, + skb->csum)) + break; + /* fall through */ + case CHECKSUM_NONE: + skb->csum = ~csum_unfold(csum_ipv6_magic(saddr, daddr, skb->len, + IPPROTO_ICMPV6, 0)); + if (__skb_checksum_complete(skb)) { + LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%pI6 > %pI6]\n", + saddr, daddr); + goto discard_it; + } + } + + if (!pskb_pull(skb, sizeof(*hdr))) + goto discard_it; + + hdr = icmp6_hdr(skb); + + type = hdr->icmp6_type; + + ICMP6MSGIN_INC_STATS_BH(dev_net(dev), idev, type); + + switch (type) { + case ICMPV6_ECHO_REQUEST: + icmpv6_echo_reply(skb); + break; + + case ICMPV6_ECHO_REPLY: + /* we couldn't care less */ + break; + + case ICMPV6_PKT_TOOBIG: + /* BUGGG_FUTURE: if packet contains rthdr, we cannot update + standard destination cache. Seems, only "advanced" + destination cache will allow to solve this problem + --ANK (980726) + */ + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto discard_it; + hdr = icmp6_hdr(skb); + orig_hdr = (struct ipv6hdr *) (hdr + 1); + rt6_pmtu_discovery(&orig_hdr->daddr, &orig_hdr->saddr, dev, + ntohl(hdr->icmp6_mtu)); + + /* + * Drop through to notify + */ + + case ICMPV6_DEST_UNREACH: + case ICMPV6_TIME_EXCEED: + case ICMPV6_PARAMPROB: + icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu); + break; + + case NDISC_ROUTER_SOLICITATION: + case NDISC_ROUTER_ADVERTISEMENT: + case NDISC_NEIGHBOUR_SOLICITATION: + case NDISC_NEIGHBOUR_ADVERTISEMENT: + case NDISC_REDIRECT: + ndisc_rcv(skb); + break; + + case ICMPV6_MGM_QUERY: + igmp6_event_query(skb); + break; + + case ICMPV6_MGM_REPORT: + igmp6_event_report(skb); + break; + + case ICMPV6_MGM_REDUCTION: + case ICMPV6_NI_QUERY: + case ICMPV6_NI_REPLY: + case ICMPV6_MLD2_REPORT: + case ICMPV6_DHAAD_REQUEST: + case ICMPV6_DHAAD_REPLY: + case ICMPV6_MOBILE_PREFIX_SOL: + case ICMPV6_MOBILE_PREFIX_ADV: + break; + + default: + LIMIT_NETDEBUG(KERN_DEBUG "icmpv6: msg of unknown type\n"); + + /* informational */ + if (type & ICMPV6_INFOMSG_MASK) + break; + + /* + * error of unknown type. 
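/*
 * Sketch of the checksum the CHECKSUM_NONE/CHECKSUM_COMPLETE branches
 * above verify: a 16-bit one's-complement sum over the IPv6
 * pseudo-header (source, destination, upper-layer length, next-header
 * 58) followed by the ICMPv6 message itself.  Plain userspace C, not
 * the kernel's csum_ipv6_magic()/csum_fold() helpers.
 */
#include <stdint.h>
#include <stddef.h>

static uint32_t sum16be_sketch(const uint8_t *p, size_t len, uint32_t sum)
{
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += ((uint32_t)p[i] << 8) | p[i + 1];
	if (len & 1)
		sum += (uint32_t)p[len - 1] << 8;	/* zero-pad odd byte */
	return sum;
}

static uint16_t icmp6_cksum_sketch(const uint8_t saddr[16],
				   const uint8_t daddr[16],
				   const uint8_t *icmp, size_t len)
{
	uint32_t sum = 0;

	sum = sum16be_sketch(saddr, 16, sum);	/* pseudo-header: source */
	sum = sum16be_sketch(daddr, 16, sum);	/* pseudo-header: destination */
	sum += (uint32_t)(len >> 16) + (uint32_t)(len & 0xffff);
	sum += 58;				/* pseudo-header: next header = ICMPv6 */
	sum = sum16be_sketch(icmp, len, sum);	/* the ICMPv6 message itself */

	while (sum >> 16)			/* fold the carries */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;			/* 0 when the embedded checksum is valid */
}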
+ * must pass to upper level + */ + + icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu); + } + + kfree_skb(skb); + return 0; + +discard_it: + ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_INERRORS); +drop_no_count: + kfree_skb(skb); + return 0; +} + +void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6, + u8 type, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + int oif) +{ + memset(fl6, 0, sizeof(*fl6)); + ipv6_addr_copy(&fl6->saddr, saddr); + ipv6_addr_copy(&fl6->daddr, daddr); + fl6->flowi6_proto = IPPROTO_ICMPV6; + fl6->fl6_icmp_type = type; + fl6->fl6_icmp_code = 0; + fl6->flowi6_oif = oif; + security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); +} + +/* + * Special lock-class for __icmpv6_sk: + */ +static struct lock_class_key icmpv6_socket_sk_dst_lock_key; + +static int __net_init icmpv6_sk_init(struct net *net) +{ + struct sock *sk; + int err, i, j; + + net->ipv6.icmp_sk = + kzalloc(nr_cpu_ids * sizeof(struct sock *), GFP_KERNEL); + if (net->ipv6.icmp_sk == NULL) + return -ENOMEM; + + for_each_possible_cpu(i) { + err = inet_ctl_sock_create(&sk, PF_INET6, + SOCK_RAW, IPPROTO_ICMPV6, net); + if (err < 0) { + printk(KERN_ERR + "Failed to initialize the ICMP6 control socket " + "(err %d).\n", + err); + goto fail; + } + + net->ipv6.icmp_sk[i] = sk; + + /* + * Split off their lock-class, because sk->sk_dst_lock + * gets used from softirqs, which is safe for + * __icmpv6_sk (because those never get directly used + * via userspace syscalls), but unsafe for normal sockets. + */ + lockdep_set_class(&sk->sk_dst_lock, + &icmpv6_socket_sk_dst_lock_key); + + /* Enough space for 2 64K ICMP packets, including + * sk_buff struct overhead. + */ + sk->sk_sndbuf = + (2 * ((64 * 1024) + sizeof(struct sk_buff))); + } + return 0; + + fail: + for (j = 0; j < i; j++) + inet_ctl_sock_destroy(net->ipv6.icmp_sk[j]); + kfree(net->ipv6.icmp_sk); + return err; +} + +static void __net_exit icmpv6_sk_exit(struct net *net) +{ + int i; + + for_each_possible_cpu(i) { + inet_ctl_sock_destroy(net->ipv6.icmp_sk[i]); + } + kfree(net->ipv6.icmp_sk); +} + +static struct pernet_operations icmpv6_sk_ops = { + .init = icmpv6_sk_init, + .exit = icmpv6_sk_exit, +}; + +int __init icmpv6_init(void) +{ + int err; + + err = register_pernet_subsys(&icmpv6_sk_ops); + if (err < 0) + return err; + + err = -EAGAIN; + if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0) + goto fail; + return 0; + +fail: + printk(KERN_ERR "Failed to register ICMP6 protocol\n"); + unregister_pernet_subsys(&icmpv6_sk_ops); + return err; +} + +void icmpv6_cleanup(void) +{ + unregister_pernet_subsys(&icmpv6_sk_ops); + inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6); +} + + +static const struct icmp6_err { + int err; + int fatal; +} tab_unreach[] = { + { /* NOROUTE */ + .err = ENETUNREACH, + .fatal = 0, + }, + { /* ADM_PROHIBITED */ + .err = EACCES, + .fatal = 1, + }, + { /* Was NOT_NEIGHBOUR, now reserved */ + .err = EHOSTUNREACH, + .fatal = 0, + }, + { /* ADDR_UNREACH */ + .err = EHOSTUNREACH, + .fatal = 0, + }, + { /* PORT_UNREACH */ + .err = ECONNREFUSED, + .fatal = 1, + }, +}; + +int icmpv6_err_convert(u8 type, u8 code, int *err) +{ + int fatal = 0; + + *err = EPROTO; + + switch (type) { + case ICMPV6_DEST_UNREACH: + fatal = 1; + if (code <= ICMPV6_PORT_UNREACH) { + *err = tab_unreach[code].err; + fatal = tab_unreach[code].fatal; + } + break; + + case ICMPV6_PKT_TOOBIG: + *err = EMSGSIZE; + break; + + case ICMPV6_PARAMPROB: + *err = EPROTO; + fatal = 1; + break; + + case ICMPV6_TIME_EXCEED: + *err = 
EHOSTUNREACH; + break; + } + + return fatal; +} + +EXPORT_SYMBOL(icmpv6_err_convert); + +#ifdef CONFIG_SYSCTL +ctl_table ipv6_icmp_table_template[] = { + { + .procname = "ratelimit", + .data = &init_net.ipv6.sysctl.icmpv6_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { }, +}; + +struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) +{ + struct ctl_table *table; + + table = kmemdup(ipv6_icmp_table_template, + sizeof(ipv6_icmp_table_template), + GFP_KERNEL); + + if (table) + table[0].data = &net->ipv6.sysctl.icmpv6_time; + + return table; +} +#endif + diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c new file mode 100644 index 00000000..8a58e8cf --- /dev/null +++ b/net/ipv6/inet6_connection_sock.c @@ -0,0 +1,252 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Support for INET6 connection oriented protocols. + * + * Authors: See the TCPv6 sources + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or(at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +int inet6_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb) +{ + const struct sock *sk2; + const struct hlist_node *node; + + /* We must walk the whole port owner list in this case. -DaveM */ + /* + * See comment in inet_csk_bind_conflict about sock lookup + * vs net namespaces issues. + */ + sk_for_each_bound(sk2, node, &tb->owners) { + if (sk != sk2 && + (!sk->sk_bound_dev_if || + !sk2->sk_bound_dev_if || + sk->sk_bound_dev_if == sk2->sk_bound_dev_if) && + (!sk->sk_reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) && + ipv6_rcv_saddr_equal(sk, sk2)) + break; + } + + return node != NULL; +} + +EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict); + +struct dst_entry *inet6_csk_route_req(struct sock *sk, + const struct request_sock *req) +{ + struct inet6_request_sock *treq = inet6_rsk(req); + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_addr *final_p, final; + struct dst_entry *dst; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + ipv6_addr_copy(&fl6.daddr, &treq->rmt_addr); + final_p = fl6_update_dst(&fl6, np->opt, &final); + ipv6_addr_copy(&fl6.saddr, &treq->loc_addr); + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_dport = inet_rsk(req)->rmt_port; + fl6.fl6_sport = inet_rsk(req)->loc_port; + security_req_classify_flow(req, flowi6_to_flowi(&fl6)); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false); + if (IS_ERR(dst)) + return NULL; + + return dst; +} + +/* + * request_sock (formerly open request) hash tables. 
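/*
 * Standalone sketch of the code-to-errno mapping that tab_unreach[] and
 * icmpv6_err_convert() above perform for destination-unreachable
 * messages, usable in userspace tests.  The values mirror the table in
 * this file; unknown codes fall back to a fatal EPROTO, as above.
 */
#include <errno.h>
#include <stdint.h>

struct unreach_map_sketch { int err; int fatal; };

static const struct unreach_map_sketch unreach_sketch[] = {
	{ ENETUNREACH,	0 },	/* 0: no route to destination */
	{ EACCES,	1 },	/* 1: administratively prohibited */
	{ EHOSTUNREACH,	0 },	/* 2: reserved (was not-a-neighbour) */
	{ EHOSTUNREACH,	0 },	/* 3: address unreachable */
	{ ECONNREFUSED,	1 },	/* 4: port unreachable */
};

static int dest_unreach_to_errno_sketch(uint8_t code, int *err)
{
	if (code < sizeof(unreach_sketch) / sizeof(unreach_sketch[0])) {
		*err = unreach_sketch[code].err;
		return unreach_sketch[code].fatal;
	}
	*err = EPROTO;		/* unknown code: treat as fatal */
	return 1;
}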
+ */ +static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport, + const u32 rnd, const u16 synq_hsize) +{ + u32 c; + + c = jhash_3words((__force u32)raddr->s6_addr32[0], + (__force u32)raddr->s6_addr32[1], + (__force u32)raddr->s6_addr32[2], + rnd); + + c = jhash_2words((__force u32)raddr->s6_addr32[3], + (__force u32)rport, + c); + + return c & (synq_hsize - 1); +} + +struct request_sock *inet6_csk_search_req(const struct sock *sk, + struct request_sock ***prevp, + const __be16 rport, + const struct in6_addr *raddr, + const struct in6_addr *laddr, + const int iif) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + struct request_sock *req, **prev; + + for (prev = &lopt->syn_table[inet6_synq_hash(raddr, rport, + lopt->hash_rnd, + lopt->nr_table_entries)]; + (req = *prev) != NULL; + prev = &req->dl_next) { + const struct inet6_request_sock *treq = inet6_rsk(req); + + if (inet_rsk(req)->rmt_port == rport && + req->rsk_ops->family == AF_INET6 && + ipv6_addr_equal(&treq->rmt_addr, raddr) && + ipv6_addr_equal(&treq->loc_addr, laddr) && + (!treq->iif || treq->iif == iif)) { + WARN_ON(req->sk != NULL); + *prevp = prev; + return req; + } + } + + return NULL; +} + +EXPORT_SYMBOL_GPL(inet6_csk_search_req); + +void inet6_csk_reqsk_queue_hash_add(struct sock *sk, + struct request_sock *req, + const unsigned long timeout) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + const u32 h = inet6_synq_hash(&inet6_rsk(req)->rmt_addr, + inet_rsk(req)->rmt_port, + lopt->hash_rnd, lopt->nr_table_entries); + + reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); + inet_csk_reqsk_queue_added(sk, timeout); +} + +EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add); + +void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; + + sin6->sin6_family = AF_INET6; + ipv6_addr_copy(&sin6->sin6_addr, &np->daddr); + sin6->sin6_port = inet_sk(sk)->inet_dport; + /* We do not store received flowlabel for TCP */ + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = 0; + if (sk->sk_bound_dev_if && + ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6->sin6_scope_id = sk->sk_bound_dev_if; +} + +EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr); + +static inline +void __inet6_csk_dst_store(struct sock *sk, struct dst_entry *dst, + struct in6_addr *daddr, struct in6_addr *saddr) +{ + __ip6_dst_store(sk, dst, daddr, saddr); + +#ifdef CONFIG_XFRM + { + struct rt6_info *rt = (struct rt6_info *)dst; + rt->rt6i_flow_cache_genid = atomic_read(&flow_cache_genid); + } +#endif +} + +static inline +struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) +{ + struct dst_entry *dst; + + dst = __sk_dst_check(sk, cookie); + +#ifdef CONFIG_XFRM + if (dst) { + struct rt6_info *rt = (struct rt6_info *)dst; + if (rt->rt6i_flow_cache_genid != atomic_read(&flow_cache_genid)) { + __sk_dst_reset(sk); + dst = NULL; + } + } +#endif + + return dst; +} + +int inet6_csk_xmit(struct sk_buff *skb, struct flowi *fl_unused) +{ + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct flowi6 fl6; + struct dst_entry *dst; + struct in6_addr *final_p, final; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = sk->sk_protocol; + ipv6_addr_copy(&fl6.daddr, &np->daddr); + ipv6_addr_copy(&fl6.saddr, 
&np->saddr); + fl6.flowlabel = np->flow_label; + IP6_ECN_flow_xmit(sk, fl6.flowlabel); + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_sport = inet->inet_sport; + fl6.fl6_dport = inet->inet_dport; + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + + final_p = fl6_update_dst(&fl6, np->opt, &final); + + dst = __inet6_csk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false); + + if (IS_ERR(dst)) { + sk->sk_err_soft = -PTR_ERR(dst); + sk->sk_route_caps = 0; + kfree_skb(skb); + return PTR_ERR(dst); + } + + __inet6_csk_dst_store(sk, dst, NULL, NULL); + } + + skb_dst_set(skb, dst_clone(dst)); + + /* Restore final destination back after routing done */ + ipv6_addr_copy(&fl6.daddr, &np->daddr); + + return ip6_xmit(sk, skb, &fl6, np->opt); +} + +EXPORT_SYMBOL_GPL(inet6_csk_xmit); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c new file mode 100644 index 00000000..73f1a00a --- /dev/null +++ b/net/ipv6/inet6_hashtables.c @@ -0,0 +1,304 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Generic INET6 transport hashtables + * + * Authors: Lotsa people, from code originally in tcp, generalised here + * by Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include + +#include +#include +#include +#include +#include + +int __inet6_hash(struct sock *sk, struct inet_timewait_sock *tw) +{ + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + int twrefcnt = 0; + + WARN_ON(!sk_unhashed(sk)); + + if (sk->sk_state == TCP_LISTEN) { + struct inet_listen_hashbucket *ilb; + + ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + spin_lock(&ilb->lock); + __sk_nulls_add_node_rcu(sk, &ilb->head); + spin_unlock(&ilb->lock); + } else { + unsigned int hash; + struct hlist_nulls_head *list; + spinlock_t *lock; + + sk->sk_hash = hash = inet6_sk_ehashfn(sk); + list = &inet_ehash_bucket(hashinfo, hash)->chain; + lock = inet_ehash_lockp(hashinfo, hash); + spin_lock(lock); + __sk_nulls_add_node_rcu(sk, list); + if (tw) { + WARN_ON(sk->sk_hash != tw->tw_hash); + twrefcnt = inet_twsk_unhash(tw); + } + spin_unlock(lock); + } + + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + return twrefcnt; +} +EXPORT_SYMBOL(__inet6_hash); + +/* + * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so + * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM + * + * The sockhash lock must be held as a reader here. + */ +struct sock *__inet6_lookup_established(struct net *net, + struct inet_hashinfo *hashinfo, + const struct in6_addr *saddr, + const __be16 sport, + const struct in6_addr *daddr, + const u16 hnum, + const int dif) +{ + struct sock *sk; + const struct hlist_nulls_node *node; + const __portpair ports = INET_COMBINED_PORTS(sport, hnum); + /* Optimize here for direct hit, only listening connections can + * have wildcards anyways. 
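/*
 * Minimal sketch of the bucket selection used by the established-hash
 * lookup that begins just below: hash the 4-tuple, then mask with
 * (table_size - 1).  The mixing here is an arbitrary stand-in, not the
 * kernel's inet6_ehashfn(); the point is only the power-of-two masking
 * via ehash_mask.
 */
#include <stdint.h>

static unsigned int ehash_bucket_sketch(const uint32_t saddr[4], uint16_t sport,
					const uint32_t daddr[4], uint16_t dport,
					unsigned int ehash_mask)
{
	uint32_t h = 0x9e3779b9u;		/* arbitrary seed */
	int i;

	for (i = 0; i < 4; i++) {
		h ^= saddr[i] + 0x9e3779b9u + (h << 6) + (h >> 2);
		h ^= daddr[i] + 0x9e3779b9u + (h << 6) + (h >> 2);
	}
	h ^= ((uint32_t)sport << 16) | dport;

	/* ehash_mask is table_size - 1 with table_size a power of two,
	 * so this is h % table_size without the division. */
	return h & ehash_mask;
}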
+ */ + unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); + unsigned int slot = hash & hashinfo->ehash_mask; + struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; + + + rcu_read_lock(); +begin: + sk_nulls_for_each_rcu(sk, node, &head->chain) { + /* For IPV6 do the cheaper port and family tests first. */ + if (INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) { + if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) + goto begintw; + if (!INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) { + sock_put(sk); + goto begin; + } + goto out; + } + } + if (get_nulls_value(node) != slot) + goto begin; + +begintw: + /* Must check for a TIME_WAIT'er before going to listener hash. */ + sk_nulls_for_each_rcu(sk, node, &head->twchain) { + if (INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) { + if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) { + sk = NULL; + goto out; + } + if (!INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) { + sock_put(sk); + goto begintw; + } + goto out; + } + } + if (get_nulls_value(node) != slot) + goto begintw; + sk = NULL; +out: + rcu_read_unlock(); + return sk; +} +EXPORT_SYMBOL(__inet6_lookup_established); + +static inline int compute_score(struct sock *sk, struct net *net, + const unsigned short hnum, + const struct in6_addr *daddr, + const int dif) +{ + int score = -1; + + if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum && + sk->sk_family == PF_INET6) { + const struct ipv6_pinfo *np = inet6_sk(sk); + + score = 1; + if (!ipv6_addr_any(&np->rcv_saddr)) { + if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) + return -1; + score++; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score++; + } + } + return score; +} + +struct sock *inet6_lookup_listener(struct net *net, + struct inet_hashinfo *hashinfo, const struct in6_addr *daddr, + const unsigned short hnum, const int dif) +{ + struct sock *sk; + const struct hlist_nulls_node *node; + struct sock *result; + int score, hiscore; + unsigned int hash = inet_lhashfn(net, hnum); + struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; + + rcu_read_lock(); +begin: + result = NULL; + hiscore = -1; + sk_nulls_for_each(sk, node, &ilb->head) { + score = compute_score(sk, net, hnum, daddr, dif); + if (score > hiscore) { + hiscore = score; + result = sk; + } + } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. 
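/*
 * Sketch of the best-match selection in inet6_lookup_listener() above:
 * every listener bound to the destination port gets a score, and the
 * more specific its binding (local address, then device), the higher
 * the score.  Simplified local type; the real code also revalidates
 * the winner under RCU before returning it.
 */
#include <stdbool.h>

struct listener_sketch {
	unsigned short port;
	bool has_saddr;		/* bound to a specific local address? */
	bool saddr_matches;	/* ...and does it equal the packet's daddr? */
	int bound_dev_if;	/* 0 = not bound to a device */
};

static int listener_score_sketch(const struct listener_sketch *l,
				 unsigned short hnum, int dif)
{
	int score;

	if (l->port != hnum)
		return -1;
	score = 1;
	if (l->has_saddr) {
		if (!l->saddr_matches)
			return -1;
		score++;
	}
	if (l->bound_dev_if) {
		if (l->bound_dev_if != dif)
			return -1;
		score++;
	}
	return score;	/* the caller keeps the listener with the highest score */
}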
+ */ + if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) + goto begin; + if (result) { + if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) + result = NULL; + else if (unlikely(compute_score(result, net, hnum, daddr, + dif) < hiscore)) { + sock_put(result); + goto begin; + } + } + rcu_read_unlock(); + return result; +} + +EXPORT_SYMBOL_GPL(inet6_lookup_listener); + +struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, + const struct in6_addr *saddr, const __be16 sport, + const struct in6_addr *daddr, const __be16 dport, + const int dif) +{ + struct sock *sk; + + local_bh_disable(); + sk = __inet6_lookup(net, hashinfo, saddr, sport, daddr, ntohs(dport), dif); + local_bh_enable(); + + return sk; +} + +EXPORT_SYMBOL_GPL(inet6_lookup); + +static int __inet6_check_established(struct inet_timewait_death_row *death_row, + struct sock *sk, const __u16 lport, + struct inet_timewait_sock **twp) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); + const struct in6_addr *daddr = &np->rcv_saddr; + const struct in6_addr *saddr = &np->daddr; + const int dif = sk->sk_bound_dev_if; + const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); + struct net *net = sock_net(sk); + const unsigned int hash = inet6_ehashfn(net, daddr, lport, saddr, + inet->inet_dport); + struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); + spinlock_t *lock = inet_ehash_lockp(hinfo, hash); + struct sock *sk2; + const struct hlist_nulls_node *node; + struct inet_timewait_sock *tw; + int twrefcnt = 0; + + spin_lock(lock); + + /* Check TIME-WAIT sockets first. */ + sk_nulls_for_each(sk2, node, &head->twchain) { + tw = inet_twsk(sk2); + + if (INET6_TW_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) { + if (twsk_unique(sk, sk2, twp)) + goto unique; + else + goto not_unique; + } + } + tw = NULL; + + /* And established part... */ + sk_nulls_for_each(sk2, node, &head->chain) { + if (INET6_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) + goto not_unique; + } + +unique: + /* Must record num and sport now. Otherwise we will see + * in hash table socket with a funny identity. */ + inet->inet_num = lport; + inet->inet_sport = htons(lport); + sk->sk_hash = hash; + WARN_ON(!sk_unhashed(sk)); + __sk_nulls_add_node_rcu(sk, &head->chain); + if (tw) { + twrefcnt = inet_twsk_unhash(tw); + NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); + } + spin_unlock(lock); + if (twrefcnt) + inet_twsk_put(tw); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + + if (twp) { + *twp = tw; + } else if (tw) { + /* Silly. Should hash-dance instead... 
*/ + inet_twsk_deschedule(tw, death_row); + + inet_twsk_put(tw); + } + return 0; + +not_unique: + spin_unlock(lock); + return -EADDRNOTAVAIL; +} + +static inline u32 inet6_sk_port_offset(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); + return secure_ipv6_port_ephemeral(np->rcv_saddr.s6_addr32, + np->daddr.s6_addr32, + inet->inet_dport); +} + +int inet6_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk) +{ + return __inet_hash_connect(death_row, sk, inet6_sk_port_offset(sk), + __inet6_check_established, __inet6_hash); +} + +EXPORT_SYMBOL_GPL(inet6_hash_connect); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c new file mode 100644 index 00000000..0f9b37a1 --- /dev/null +++ b/net/ipv6/ip6_fib.c @@ -0,0 +1,1606 @@ +/* + * Linux INET6 implementation + * Forwarding Information Database + * + * Authors: + * Pedro Roque + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * Yuji SEKIYA @USAGI: Support default route on router node; + * remove ip6_null_entry from the top of + * routing table. + * Ville Nuorvala: Fixed routing subtrees. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PROC_FS +#include +#endif + +#include +#include +#include + +#include +#include + +#define RT6_DEBUG 2 + +#if RT6_DEBUG >= 3 +#define RT6_TRACE(x...) printk(KERN_DEBUG x) +#else +#define RT6_TRACE(x...) do { ; } while (0) +#endif + +static struct kmem_cache * fib6_node_kmem __read_mostly; + +enum fib_walk_state_t +{ +#ifdef CONFIG_IPV6_SUBTREES + FWS_S, +#endif + FWS_L, + FWS_R, + FWS_C, + FWS_U +}; + +struct fib6_cleaner_t +{ + struct fib6_walker_t w; + struct net *net; + int (*func)(struct rt6_info *, void *arg); + void *arg; +}; + +static DEFINE_RWLOCK(fib6_walker_lock); + +#ifdef CONFIG_IPV6_SUBTREES +#define FWS_INIT FWS_S +#else +#define FWS_INIT FWS_L +#endif + +static void fib6_prune_clones(struct net *net, struct fib6_node *fn, + struct rt6_info *rt); +static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); +static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); +static int fib6_walk(struct fib6_walker_t *w); +static int fib6_walk_continue(struct fib6_walker_t *w); + +/* + * A routing update causes an increase of the serial number on the + * affected subtree. This allows for cached routes to be asynchronously + * tested when modifications are made to the destination cache as a + * result of redirects, path MTU changes, etc. + */ + +static __u32 rt_sernum; + +static void fib6_gc_timer_cb(unsigned long arg); + +static LIST_HEAD(fib6_walkers); +#define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh) + +static inline void fib6_walker_link(struct fib6_walker_t *w) +{ + write_lock_bh(&fib6_walker_lock); + list_add(&w->lh, &fib6_walkers); + write_unlock_bh(&fib6_walker_lock); +} + +static inline void fib6_walker_unlink(struct fib6_walker_t *w) +{ + write_lock_bh(&fib6_walker_lock); + list_del(&w->lh); + write_unlock_bh(&fib6_walker_lock); +} +static __inline__ u32 fib6_new_sernum(void) +{ + u32 n = ++rt_sernum; + if ((__s32)n <= 0) + rt_sernum = n = 1; + return n; +} + +/* + * Auxiliary address test functions for the radix tree. 
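/*
 * The swizzled expression in addr_bit_set() just below is an optimised
 * form of "test bit fn_bit of the address, counting from the most
 * significant bit".  A plain byte-wise equivalent, for reference only:
 */
#include <stdint.h>

static int addr_bit_set_plain_sketch(const uint8_t addr[16], int fn_bit)
{
	/* bit 0 is the most significant bit of the first address byte */
	return (addr[fn_bit >> 3] >> (7 - (fn_bit & 7))) & 1;
}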
+ * + * These assume a 32bit processor (although it will work on + * 64bit processors) + */ + +/* + * test bit + */ +#if defined(__LITTLE_ENDIAN) +# define BITOP_BE32_SWIZZLE (0x1F & ~7) +#else +# define BITOP_BE32_SWIZZLE 0 +#endif + +static __inline__ __be32 addr_bit_set(const void *token, int fn_bit) +{ + const __be32 *addr = token; + /* + * Here, + * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f) + * is optimized version of + * htonl(1 << ((~fn_bit)&0x1F)) + * See include/asm-generic/bitops/le.h. + */ + return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) & + addr[fn_bit >> 5]; +} + +static __inline__ struct fib6_node * node_alloc(void) +{ + struct fib6_node *fn; + + fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); + + return fn; +} + +static __inline__ void node_free(struct fib6_node * fn) +{ + kmem_cache_free(fib6_node_kmem, fn); +} + +static __inline__ void rt6_release(struct rt6_info *rt) +{ + if (atomic_dec_and_test(&rt->rt6i_ref)) + dst_free(&rt->dst); +} + +static void fib6_link_table(struct net *net, struct fib6_table *tb) +{ + unsigned int h; + + /* + * Initialize table lock at a single place to give lockdep a key, + * tables aren't visible prior to being linked to the list. + */ + rwlock_init(&tb->tb6_lock); + + h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); + + /* + * No protection necessary, this is the only list mutatation + * operation, tables never disappear once they exist. + */ + hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]); +} + +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + +static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) +{ + struct fib6_table *table; + + table = kzalloc(sizeof(*table), GFP_ATOMIC); + if (table != NULL) { + table->tb6_id = id; + table->tb6_root.leaf = net->ipv6.ip6_null_entry; + table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; + } + + return table; +} + +struct fib6_table *fib6_new_table(struct net *net, u32 id) +{ + struct fib6_table *tb; + + if (id == 0) + id = RT6_TABLE_MAIN; + tb = fib6_get_table(net, id); + if (tb) + return tb; + + tb = fib6_alloc_table(net, id); + if (tb != NULL) + fib6_link_table(net, tb); + + return tb; +} + +struct fib6_table *fib6_get_table(struct net *net, u32 id) +{ + struct fib6_table *tb; + struct hlist_head *head; + struct hlist_node *node; + unsigned int h; + + if (id == 0) + id = RT6_TABLE_MAIN; + h = id & (FIB6_TABLE_HASHSZ - 1); + rcu_read_lock(); + head = &net->ipv6.fib_table_hash[h]; + hlist_for_each_entry_rcu(tb, node, head, tb6_hlist) { + if (tb->tb6_id == id) { + rcu_read_unlock(); + return tb; + } + } + rcu_read_unlock(); + + return NULL; +} + +static void __net_init fib6_tables_init(struct net *net) +{ + fib6_link_table(net, net->ipv6.fib6_main_tbl); + fib6_link_table(net, net->ipv6.fib6_local_tbl); +} +#else + +struct fib6_table *fib6_new_table(struct net *net, u32 id) +{ + return fib6_get_table(net, id); +} + +struct fib6_table *fib6_get_table(struct net *net, u32 id) +{ + return net->ipv6.fib6_main_tbl; +} + +struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, + int flags, pol_lookup_t lookup) +{ + return (struct dst_entry *) lookup(net, net->ipv6.fib6_main_tbl, fl6, flags); +} + +static void __net_init fib6_tables_init(struct net *net) +{ + fib6_link_table(net, net->ipv6.fib6_main_tbl); +} + +#endif + +static int fib6_dump_node(struct fib6_walker_t *w) +{ + int res; + struct rt6_info *rt; + + for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { + res = rt6_dump_route(rt, w->args); + if (res < 0) { + /* Frame is full, suspend 
walking */ + w->leaf = rt; + return 1; + } + WARN_ON(res == 0); + } + w->leaf = NULL; + return 0; +} + +static void fib6_dump_end(struct netlink_callback *cb) +{ + struct fib6_walker_t *w = (void*)cb->args[2]; + + if (w) { + if (cb->args[4]) { + cb->args[4] = 0; + fib6_walker_unlink(w); + } + cb->args[2] = 0; + kfree(w); + } + cb->done = (void*)cb->args[3]; + cb->args[1] = 3; +} + +static int fib6_dump_done(struct netlink_callback *cb) +{ + fib6_dump_end(cb); + return cb->done ? cb->done(cb) : 0; +} + +static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct fib6_walker_t *w; + int res; + + w = (void *)cb->args[2]; + w->root = &table->tb6_root; + + if (cb->args[4] == 0) { + w->count = 0; + w->skip = 0; + + read_lock_bh(&table->tb6_lock); + res = fib6_walk(w); + read_unlock_bh(&table->tb6_lock); + if (res > 0) { + cb->args[4] = 1; + cb->args[5] = w->root->fn_sernum; + } + } else { + if (cb->args[5] != w->root->fn_sernum) { + /* Begin at the root if the tree changed */ + cb->args[5] = w->root->fn_sernum; + w->state = FWS_INIT; + w->node = w->root; + w->skip = w->count; + } else + w->skip = 0; + + read_lock_bh(&table->tb6_lock); + res = fib6_walk_continue(w); + read_unlock_bh(&table->tb6_lock); + if (res <= 0) { + fib6_walker_unlink(w); + cb->args[4] = 0; + } + } + + return res; +} + +static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + unsigned int h, s_h; + unsigned int e = 0, s_e; + struct rt6_rtnl_dump_arg arg; + struct fib6_walker_t *w; + struct fib6_table *tb; + struct hlist_node *node; + struct hlist_head *head; + int res = 0; + + s_h = cb->args[0]; + s_e = cb->args[1]; + + w = (void *)cb->args[2]; + if (w == NULL) { + /* New dump: + * + * 1. hook callback destructor. + */ + cb->args[3] = (long)cb->done; + cb->done = fib6_dump_done; + + /* + * 2. allocate and initialize walker. + */ + w = kzalloc(sizeof(*w), GFP_ATOMIC); + if (w == NULL) + return -ENOMEM; + w->func = fib6_dump_node; + cb->args[2] = (long)w; + } + + arg.skb = skb; + arg.cb = cb; + arg.net = net; + w->args = &arg; + + rcu_read_lock(); + for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { + e = 0; + head = &net->ipv6.fib_table_hash[h]; + hlist_for_each_entry_rcu(tb, node, head, tb6_hlist) { + if (e < s_e) + goto next; + res = fib6_dump_table(tb, skb, cb); + if (res != 0) + goto out; +next: + e++; + } + } +out: + rcu_read_unlock(); + cb->args[1] = e; + cb->args[0] = h; + + res = res < 0 ? res : skb->len; + if (res <= 0) + fib6_dump_end(cb); + return res; +} + +/* + * Routing Table + * + * return the appropriate node for a routing tree "add" operation + * by either creating and inserting or by returning an existing + * node. + */ + +static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, + int addrlen, int plen, + int offset) +{ + struct fib6_node *fn, *in, *ln; + struct fib6_node *pn = NULL; + struct rt6key *key; + int bit; + __be32 dir = 0; + __u32 sernum = fib6_new_sernum(); + + RT6_TRACE("fib6_add_1\n"); + + /* insert node in tree */ + + fn = root; + + do { + key = (struct rt6key *)((u8 *)fn->leaf + offset); + + /* + * Prefix match + */ + if (plen < fn->fn_bit || + !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) + goto insert_above; + + /* + * Exact match ? 
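/*
 * Sketch of the suspend/resume contract used by fib6_dump_node() and
 * fib6_dump_table() above: the walker callback returns a positive value
 * when the netlink buffer fills up, and when the walk has to restart
 * from the root (the tree changed) setting skip = count makes the
 * entries already delivered be passed over.  Array-based stand-in for
 * the tree walk, with emit() playing the role of rt6_dump_route().
 */
struct dump_state_sketch {
	unsigned int skip;	/* entries emitted by earlier passes */
	unsigned int count;	/* entries seen during this pass */
};

static int dump_entries_sketch(struct dump_state_sketch *st,
			       int (*emit)(int entry, void *arg), void *arg,
			       const int *entries, unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (st->count < st->skip) {	/* already sent previously */
			st->count++;
			continue;
		}
		if (emit(entries[i], arg) < 0)
			return 1;		/* buffer full: suspend, keep state */
		st->count++;
	}
	return 0;				/* dump complete */
}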
+ */ + + if (plen == fn->fn_bit) { + /* clean up an intermediate node */ + if ((fn->fn_flags & RTN_RTINFO) == 0) { + rt6_release(fn->leaf); + fn->leaf = NULL; + } + + fn->fn_sernum = sernum; + + return fn; + } + + /* + * We have more bits to go + */ + + /* Try to walk down on tree. */ + fn->fn_sernum = sernum; + dir = addr_bit_set(addr, fn->fn_bit); + pn = fn; + fn = dir ? fn->right: fn->left; + } while (fn); + + /* + * We walked to the bottom of tree. + * Create new leaf node without children. + */ + + ln = node_alloc(); + + if (ln == NULL) + return NULL; + ln->fn_bit = plen; + + ln->parent = pn; + ln->fn_sernum = sernum; + + if (dir) + pn->right = ln; + else + pn->left = ln; + + return ln; + + +insert_above: + /* + * split since we don't have a common prefix anymore or + * we have a less significant route. + * we've to insert an intermediate node on the list + * this new node will point to the one we need to create + * and the current + */ + + pn = fn->parent; + + /* find 1st bit in difference between the 2 addrs. + + See comment in __ipv6_addr_diff: bit may be an invalid value, + but if it is >= plen, the value is ignored in any case. + */ + + bit = __ipv6_addr_diff(addr, &key->addr, addrlen); + + /* + * (intermediate)[in] + * / \ + * (new leaf node)[ln] (old node)[fn] + */ + if (plen > bit) { + in = node_alloc(); + ln = node_alloc(); + + if (in == NULL || ln == NULL) { + if (in) + node_free(in); + if (ln) + node_free(ln); + return NULL; + } + + /* + * new intermediate node. + * RTN_RTINFO will + * be off since that an address that chooses one of + * the branches would not match less specific routes + * in the other branch + */ + + in->fn_bit = bit; + + in->parent = pn; + in->leaf = fn->leaf; + atomic_inc(&in->leaf->rt6i_ref); + + in->fn_sernum = sernum; + + /* update parent pointer */ + if (dir) + pn->right = in; + else + pn->left = in; + + ln->fn_bit = plen; + + ln->parent = in; + fn->parent = in; + + ln->fn_sernum = sernum; + + if (addr_bit_set(addr, bit)) { + in->right = ln; + in->left = fn; + } else { + in->left = ln; + in->right = fn; + } + } else { /* plen <= bit */ + + /* + * (new leaf node)[ln] + * / \ + * (old node)[fn] NULL + */ + + ln = node_alloc(); + + if (ln == NULL) + return NULL; + + ln->fn_bit = plen; + + ln->parent = pn; + + ln->fn_sernum = sernum; + + if (dir) + pn->right = ln; + else + pn->left = ln; + + if (addr_bit_set(&key->addr, plen)) + ln->right = fn; + else + ln->left = fn; + + fn->parent = ln; + } + return ln; +} + +/* + * Insert routing information in a node. 
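/*
 * Sketch of the "first differing bit" computation that the insert_above
 * path of fib6_add_1() above obtains from __ipv6_addr_diff(): compare
 * the two addresses and return the index, counted from the most
 * significant bit, of the first bit where they differ (128 if equal).
 */
#include <stdint.h>

static int ipv6_addr_diff_sketch(const uint8_t a[16], const uint8_t b[16])
{
	int i, bit;

	for (i = 0; i < 16; i++) {
		uint8_t x = a[i] ^ b[i];

		if (x) {
			for (bit = 7; bit >= 0; bit--)
				if (x & (1 << bit))
					return i * 8 + (7 - bit);
		}
	}
	return 128;		/* identical addresses */
}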
+ */ + +static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, + struct nl_info *info) +{ + struct rt6_info *iter = NULL; + struct rt6_info **ins; + + ins = &fn->leaf; + + for (iter = fn->leaf; iter; iter=iter->dst.rt6_next) { + /* + * Search for duplicates + */ + + if (iter->rt6i_metric == rt->rt6i_metric) { + /* + * Same priority level + */ + + if (iter->rt6i_dev == rt->rt6i_dev && + iter->rt6i_idev == rt->rt6i_idev && + ipv6_addr_equal(&iter->rt6i_gateway, + &rt->rt6i_gateway)) { + if (!(iter->rt6i_flags&RTF_EXPIRES)) + return -EEXIST; + iter->rt6i_expires = rt->rt6i_expires; + if (!(rt->rt6i_flags&RTF_EXPIRES)) { + iter->rt6i_flags &= ~RTF_EXPIRES; + iter->rt6i_expires = 0; + } + return -EEXIST; + } + } + + if (iter->rt6i_metric > rt->rt6i_metric) + break; + + ins = &iter->dst.rt6_next; + } + + /* Reset round-robin state, if necessary */ + if (ins == &fn->leaf) + fn->rr_ptr = NULL; + + /* + * insert node + */ + + rt->dst.rt6_next = iter; + *ins = rt; + rt->rt6i_node = fn; + atomic_inc(&rt->rt6i_ref); + inet6_rt_notify(RTM_NEWROUTE, rt, info); + info->nl_net->ipv6.rt6_stats->fib_rt_entries++; + + if ((fn->fn_flags & RTN_RTINFO) == 0) { + info->nl_net->ipv6.rt6_stats->fib_route_nodes++; + fn->fn_flags |= RTN_RTINFO; + } + + return 0; +} + +static __inline__ void fib6_start_gc(struct net *net, struct rt6_info *rt) +{ + if (!timer_pending(&net->ipv6.ip6_fib_timer) && + (rt->rt6i_flags & (RTF_EXPIRES|RTF_CACHE))) + mod_timer(&net->ipv6.ip6_fib_timer, + jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); +} + +void fib6_force_start_gc(struct net *net) +{ + if (!timer_pending(&net->ipv6.ip6_fib_timer)) + mod_timer(&net->ipv6.ip6_fib_timer, + jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); +} + +/* + * Add routing information to the routing tree. + * / + * with source addr info in sub-trees + */ + +int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) +{ + struct fib6_node *fn, *pn = NULL; + int err = -ENOMEM; + + fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr), + rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst)); + + if (fn == NULL) + goto out; + + pn = fn; + +#ifdef CONFIG_IPV6_SUBTREES + if (rt->rt6i_src.plen) { + struct fib6_node *sn; + + if (fn->subtree == NULL) { + struct fib6_node *sfn; + + /* + * Create subtree. + * + * fn[main tree] + * | + * sfn[subtree root] + * \ + * sn[new leaf node] + */ + + /* Create subtree root node */ + sfn = node_alloc(); + if (sfn == NULL) + goto st_failure; + + sfn->leaf = info->nl_net->ipv6.ip6_null_entry; + atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); + sfn->fn_flags = RTN_ROOT; + sfn->fn_sernum = fib6_new_sernum(); + + /* Now add the first leaf node to new subtree */ + + sn = fib6_add_1(sfn, &rt->rt6i_src.addr, + sizeof(struct in6_addr), rt->rt6i_src.plen, + offsetof(struct rt6_info, rt6i_src)); + + if (sn == NULL) { + /* If it is failed, discard just allocated + root, and then (in st_failure) stale node + in main tree. 
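/*
 * Sketch of the metric-ordered insertion performed by fib6_add_rt2node()
 * above, using a pointer-to-pointer so that inserting at the head and in
 * the middle is the same operation.  Minimal local type; duplicate and
 * RTF_EXPIRES handling are omitted.
 */
struct rt_metric_sketch {
	int metric;
	struct rt_metric_sketch *next;
};

static void insert_by_metric_sketch(struct rt_metric_sketch **head,
				    struct rt_metric_sketch *rt)
{
	struct rt_metric_sketch **ins = head;
	struct rt_metric_sketch *iter;

	for (iter = *head; iter; iter = iter->next) {
		if (iter->metric > rt->metric)
			break;			/* keep the list sorted by metric */
		ins = &iter->next;
	}
	rt->next = iter;			/* may be NULL at the tail */
	*ins = rt;
}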
+ */ + node_free(sfn); + goto st_failure; + } + + /* Now link new subtree to main tree */ + sfn->parent = fn; + fn->subtree = sfn; + } else { + sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, + sizeof(struct in6_addr), rt->rt6i_src.plen, + offsetof(struct rt6_info, rt6i_src)); + + if (sn == NULL) + goto st_failure; + } + + if (fn->leaf == NULL) { + fn->leaf = rt; + atomic_inc(&rt->rt6i_ref); + } + fn = sn; + } +#endif + + err = fib6_add_rt2node(fn, rt, info); + + if (err == 0) { + fib6_start_gc(info->nl_net, rt); + if (!(rt->rt6i_flags&RTF_CACHE)) + fib6_prune_clones(info->nl_net, pn, rt); + } + +out: + if (err) { +#ifdef CONFIG_IPV6_SUBTREES + /* + * If fib6_add_1 has cleared the old leaf pointer in the + * super-tree leaf node we have to find a new one for it. + */ + if (pn != fn && pn->leaf == rt) { + pn->leaf = NULL; + atomic_dec(&rt->rt6i_ref); + } + if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO)) { + pn->leaf = fib6_find_prefix(info->nl_net, pn); +#if RT6_DEBUG >= 2 + if (!pn->leaf) { + WARN_ON(pn->leaf == NULL); + pn->leaf = info->nl_net->ipv6.ip6_null_entry; + } +#endif + atomic_inc(&pn->leaf->rt6i_ref); + } +#endif + dst_free(&rt->dst); + } + return err; + +#ifdef CONFIG_IPV6_SUBTREES + /* Subtree creation failed, probably main tree node + is orphan. If it is, shoot it. + */ +st_failure: + if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) + fib6_repair_tree(info->nl_net, fn); + dst_free(&rt->dst); + return err; +#endif +} + +/* + * Routing tree lookup + * + */ + +struct lookup_args { + int offset; /* key offset on rt6_info */ + const struct in6_addr *addr; /* search key */ +}; + +static struct fib6_node * fib6_lookup_1(struct fib6_node *root, + struct lookup_args *args) +{ + struct fib6_node *fn; + __be32 dir; + + if (unlikely(args->offset == 0)) + return NULL; + + /* + * Descend on a tree + */ + + fn = root; + + for (;;) { + struct fib6_node *next; + + dir = addr_bit_set(args->addr, fn->fn_bit); + + next = dir ? fn->right : fn->left; + + if (next) { + fn = next; + continue; + } + + break; + } + + while(fn) { + if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) { + struct rt6key *key; + + key = (struct rt6key *) ((u8 *) fn->leaf + + args->offset); + + if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { +#ifdef CONFIG_IPV6_SUBTREES + if (fn->subtree) + fn = fib6_lookup_1(fn->subtree, args + 1); +#endif + if (!fn || fn->fn_flags & RTN_RTINFO) + return fn; + } + } + + if (fn->fn_flags & RTN_ROOT) + break; + + fn = fn->parent; + } + + return NULL; +} + +struct fib6_node * fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr, + const struct in6_addr *saddr) +{ + struct fib6_node *fn; + struct lookup_args args[] = { + { + .offset = offsetof(struct rt6_info, rt6i_dst), + .addr = daddr, + }, +#ifdef CONFIG_IPV6_SUBTREES + { + .offset = offsetof(struct rt6_info, rt6i_src), + .addr = saddr, + }, +#endif + { + .offset = 0, /* sentinel */ + } + }; + + fn = fib6_lookup_1(root, daddr ? 
args : args + 1); + + if (fn == NULL || fn->fn_flags & RTN_TL_ROOT) + fn = root; + + return fn; +} + +/* + * Get node with specified destination prefix (and source prefix, + * if subtrees are used) + */ + + +static struct fib6_node * fib6_locate_1(struct fib6_node *root, + const struct in6_addr *addr, + int plen, int offset) +{ + struct fib6_node *fn; + + for (fn = root; fn ; ) { + struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset); + + /* + * Prefix match + */ + if (plen < fn->fn_bit || + !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) + return NULL; + + if (plen == fn->fn_bit) + return fn; + + /* + * We have more bits to go + */ + if (addr_bit_set(addr, fn->fn_bit)) + fn = fn->right; + else + fn = fn->left; + } + return NULL; +} + +struct fib6_node * fib6_locate(struct fib6_node *root, + const struct in6_addr *daddr, int dst_len, + const struct in6_addr *saddr, int src_len) +{ + struct fib6_node *fn; + + fn = fib6_locate_1(root, daddr, dst_len, + offsetof(struct rt6_info, rt6i_dst)); + +#ifdef CONFIG_IPV6_SUBTREES + if (src_len) { + WARN_ON(saddr == NULL); + if (fn && fn->subtree) + fn = fib6_locate_1(fn->subtree, saddr, src_len, + offsetof(struct rt6_info, rt6i_src)); + } +#endif + + if (fn && fn->fn_flags&RTN_RTINFO) + return fn; + + return NULL; +} + + +/* + * Deletion + * + */ + +static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn) +{ + if (fn->fn_flags&RTN_ROOT) + return net->ipv6.ip6_null_entry; + + while(fn) { + if(fn->left) + return fn->left->leaf; + + if(fn->right) + return fn->right->leaf; + + fn = FIB6_SUBTREE(fn); + } + return NULL; +} + +/* + * Called to trim the tree of intermediate nodes when possible. "fn" + * is the node we want to try and remove. + */ + +static struct fib6_node *fib6_repair_tree(struct net *net, + struct fib6_node *fn) +{ + int children; + int nstate; + struct fib6_node *child, *pn; + struct fib6_walker_t *w; + int iter = 0; + + for (;;) { + RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); + iter++; + + WARN_ON(fn->fn_flags & RTN_RTINFO); + WARN_ON(fn->fn_flags & RTN_TL_ROOT); + WARN_ON(fn->leaf != NULL); + + children = 0; + child = NULL; + if (fn->right) child = fn->right, children |= 1; + if (fn->left) child = fn->left, children |= 2; + + if (children == 3 || FIB6_SUBTREE(fn) +#ifdef CONFIG_IPV6_SUBTREES + /* Subtree root (i.e. 
fn) may have one child */ + || (children && fn->fn_flags&RTN_ROOT) +#endif + ) { + fn->leaf = fib6_find_prefix(net, fn); +#if RT6_DEBUG >= 2 + if (fn->leaf==NULL) { + WARN_ON(!fn->leaf); + fn->leaf = net->ipv6.ip6_null_entry; + } +#endif + atomic_inc(&fn->leaf->rt6i_ref); + return fn->parent; + } + + pn = fn->parent; +#ifdef CONFIG_IPV6_SUBTREES + if (FIB6_SUBTREE(pn) == fn) { + WARN_ON(!(fn->fn_flags & RTN_ROOT)); + FIB6_SUBTREE(pn) = NULL; + nstate = FWS_L; + } else { + WARN_ON(fn->fn_flags & RTN_ROOT); +#endif + if (pn->right == fn) pn->right = child; + else if (pn->left == fn) pn->left = child; +#if RT6_DEBUG >= 2 + else + WARN_ON(1); +#endif + if (child) + child->parent = pn; + nstate = FWS_R; +#ifdef CONFIG_IPV6_SUBTREES + } +#endif + + read_lock(&fib6_walker_lock); + FOR_WALKERS(w) { + if (child == NULL) { + if (w->root == fn) { + w->root = w->node = NULL; + RT6_TRACE("W %p adjusted by delroot 1\n", w); + } else if (w->node == fn) { + RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); + w->node = pn; + w->state = nstate; + } + } else { + if (w->root == fn) { + w->root = child; + RT6_TRACE("W %p adjusted by delroot 2\n", w); + } + if (w->node == fn) { + w->node = child; + if (children&2) { + RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); + w->state = w->state>=FWS_R ? FWS_U : FWS_INIT; + } else { + RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); + w->state = w->state>=FWS_C ? FWS_U : FWS_INIT; + } + } + } + } + read_unlock(&fib6_walker_lock); + + node_free(fn); + if (pn->fn_flags&RTN_RTINFO || FIB6_SUBTREE(pn)) + return pn; + + rt6_release(pn->leaf); + pn->leaf = NULL; + fn = pn; + } +} + +static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, + struct nl_info *info) +{ + struct fib6_walker_t *w; + struct rt6_info *rt = *rtp; + struct net *net = info->nl_net; + + RT6_TRACE("fib6_del_route\n"); + + /* Unlink it */ + *rtp = rt->dst.rt6_next; + rt->rt6i_node = NULL; + net->ipv6.rt6_stats->fib_rt_entries--; + net->ipv6.rt6_stats->fib_discarded_routes++; + + /* Reset round-robin state, if necessary */ + if (fn->rr_ptr == rt) + fn->rr_ptr = NULL; + + /* Adjust walkers */ + read_lock(&fib6_walker_lock); + FOR_WALKERS(w) { + if (w->state == FWS_C && w->leaf == rt) { + RT6_TRACE("walker %p adjusted by delroute\n", w); + w->leaf = rt->dst.rt6_next; + if (w->leaf == NULL) + w->state = FWS_U; + } + } + read_unlock(&fib6_walker_lock); + + rt->dst.rt6_next = NULL; + + /* If it was last route, expunge its radix tree node */ + if (fn->leaf == NULL) { + fn->fn_flags &= ~RTN_RTINFO; + net->ipv6.rt6_stats->fib_route_nodes--; + fn = fib6_repair_tree(net, fn); + } + + if (atomic_read(&rt->rt6i_ref) != 1) { + /* This route is used as dummy address holder in some split + * nodes. It is not leaked, but it still holds other resources, + * which must be released in time. So, scan ascendant nodes + * and replace dummy references to this route with references + * to still alive ones. + */ + while (fn) { + if (!(fn->fn_flags&RTN_RTINFO) && fn->leaf == rt) { + fn->leaf = fib6_find_prefix(net, fn); + atomic_inc(&fn->leaf->rt6i_ref); + rt6_release(rt); + } + fn = fn->parent; + } + /* No more references are possible at this point. 
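/*
 * Sketch of the reference counting that the deletion path above leans
 * on (rt6i_ref with rt6_release()): an object is only freed on the last
 * 1 -> 0 drop.  C11 atomics stand in for the kernel's atomic_t helpers,
 * and free() stands in for dst_free().
 */
#include <stdatomic.h>
#include <stdlib.h>

struct refcounted_rt_sketch {
	atomic_int ref;
	/* ... route data ... */
};

static void rt_hold_sketch(struct refcounted_rt_sketch *rt)
{
	atomic_fetch_add(&rt->ref, 1);
}

static void rt_release_sketch(struct refcounted_rt_sketch *rt)
{
	/* atomic_dec_and_test() equivalent: free only when the previous
	 * value was 1, i.e. this drop took the count to zero. */
	if (atomic_fetch_sub(&rt->ref, 1) == 1)
		free(rt);
}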
*/ + BUG_ON(atomic_read(&rt->rt6i_ref) != 1); + } + + inet6_rt_notify(RTM_DELROUTE, rt, info); + rt6_release(rt); +} + +int fib6_del(struct rt6_info *rt, struct nl_info *info) +{ + struct net *net = info->nl_net; + struct fib6_node *fn = rt->rt6i_node; + struct rt6_info **rtp; + +#if RT6_DEBUG >= 2 + if (rt->dst.obsolete>0) { + WARN_ON(fn != NULL); + return -ENOENT; + } +#endif + if (fn == NULL || rt == net->ipv6.ip6_null_entry) + return -ENOENT; + + WARN_ON(!(fn->fn_flags & RTN_RTINFO)); + + if (!(rt->rt6i_flags&RTF_CACHE)) { + struct fib6_node *pn = fn; +#ifdef CONFIG_IPV6_SUBTREES + /* clones of this route might be in another subtree */ + if (rt->rt6i_src.plen) { + while (!(pn->fn_flags&RTN_ROOT)) + pn = pn->parent; + pn = pn->parent; + } +#endif + fib6_prune_clones(info->nl_net, pn, rt); + } + + /* + * Walk the leaf entries looking for ourself + */ + + for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) { + if (*rtp == rt) { + fib6_del_route(fn, rtp, info); + return 0; + } + } + return -ENOENT; +} + +/* + * Tree traversal function. + * + * Certainly, it is not interrupt safe. + * However, it is internally reenterable wrt itself and fib6_add/fib6_del. + * It means, that we can modify tree during walking + * and use this function for garbage collection, clone pruning, + * cleaning tree when a device goes down etc. etc. + * + * It guarantees that every node will be traversed, + * and that it will be traversed only once. + * + * Callback function w->func may return: + * 0 -> continue walking. + * positive value -> walking is suspended (used by tree dumps, + * and probably by gc, if it will be split to several slices) + * negative value -> terminate walking. + * + * The function itself returns: + * 0 -> walk is complete. + * >0 -> walk is incomplete (i.e. suspended) + * <0 -> walk is terminated by an error. 
+ */ + +static int fib6_walk_continue(struct fib6_walker_t *w) +{ + struct fib6_node *fn, *pn; + + for (;;) { + fn = w->node; + if (fn == NULL) + return 0; + + if (w->prune && fn != w->root && + fn->fn_flags&RTN_RTINFO && w->state < FWS_C) { + w->state = FWS_C; + w->leaf = fn->leaf; + } + switch (w->state) { +#ifdef CONFIG_IPV6_SUBTREES + case FWS_S: + if (FIB6_SUBTREE(fn)) { + w->node = FIB6_SUBTREE(fn); + continue; + } + w->state = FWS_L; +#endif + case FWS_L: + if (fn->left) { + w->node = fn->left; + w->state = FWS_INIT; + continue; + } + w->state = FWS_R; + case FWS_R: + if (fn->right) { + w->node = fn->right; + w->state = FWS_INIT; + continue; + } + w->state = FWS_C; + w->leaf = fn->leaf; + case FWS_C: + if (w->leaf && fn->fn_flags&RTN_RTINFO) { + int err; + + if (w->count < w->skip) { + w->count++; + continue; + } + + err = w->func(w); + if (err) + return err; + + w->count++; + continue; + } + w->state = FWS_U; + case FWS_U: + if (fn == w->root) + return 0; + pn = fn->parent; + w->node = pn; +#ifdef CONFIG_IPV6_SUBTREES + if (FIB6_SUBTREE(pn) == fn) { + WARN_ON(!(fn->fn_flags & RTN_ROOT)); + w->state = FWS_L; + continue; + } +#endif + if (pn->left == fn) { + w->state = FWS_R; + continue; + } + if (pn->right == fn) { + w->state = FWS_C; + w->leaf = w->node->leaf; + continue; + } +#if RT6_DEBUG >= 2 + WARN_ON(1); +#endif + } + } +} + +static int fib6_walk(struct fib6_walker_t *w) +{ + int res; + + w->state = FWS_INIT; + w->node = w->root; + + fib6_walker_link(w); + res = fib6_walk_continue(w); + if (res <= 0) + fib6_walker_unlink(w); + return res; +} + +static int fib6_clean_node(struct fib6_walker_t *w) +{ + int res; + struct rt6_info *rt; + struct fib6_cleaner_t *c = container_of(w, struct fib6_cleaner_t, w); + struct nl_info info = { + .nl_net = c->net, + }; + + for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { + res = c->func(rt, c->arg); + if (res < 0) { + w->leaf = rt; + res = fib6_del(rt, &info); + if (res) { +#if RT6_DEBUG >= 2 + printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res); +#endif + continue; + } + return 0; + } + WARN_ON(res != 0); + } + w->leaf = rt; + return 0; +} + +/* + * Convenient frontend to tree walker. + * + * func is called on each route. + * It may return -1 -> delete this route. + * 0 -> continue walking + * + * prune==1 -> only immediate children of node (certainly, + * ignoring pure split nodes) will be scanned. 
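/*
 * Hypothetical example of a cleaner callback following the convention
 * documented above (return -1 to delete the route, 0 to keep walking);
 * the in-tree users are fib6_prune_clone() and fib6_age() further
 * below.  The struct is a simplified stand-in for struct rt6_info.
 */
struct rt6_clean_sketch {
	int ifindex;		/* stand-in for rt->rt6i_dev->ifindex */
};

static int drop_routes_on_dev_sketch(struct rt6_clean_sketch *rt, void *arg)
{
	int dying_ifindex = *(int *)arg;

	if (rt->ifindex == dying_ifindex)
		return -1;	/* the tree walker deletes this route */
	return 0;		/* keep it, continue walking */
}

/* Invocation shape, mirroring how fib6_clean_all() is used (real
 * callbacks take a struct rt6_info *):
 *	fib6_clean_all(net, callback, 0, &ifindex);
 */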
+ */ + +static void fib6_clean_tree(struct net *net, struct fib6_node *root, + int (*func)(struct rt6_info *, void *arg), + int prune, void *arg) +{ + struct fib6_cleaner_t c; + + c.w.root = root; + c.w.func = fib6_clean_node; + c.w.prune = prune; + c.w.count = 0; + c.w.skip = 0; + c.func = func; + c.arg = arg; + c.net = net; + + fib6_walk(&c.w); +} + +void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), + int prune, void *arg) +{ + struct fib6_table *table; + struct hlist_node *node; + struct hlist_head *head; + unsigned int h; + + rcu_read_lock(); + for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { + head = &net->ipv6.fib_table_hash[h]; + hlist_for_each_entry_rcu(table, node, head, tb6_hlist) { + write_lock_bh(&table->tb6_lock); + fib6_clean_tree(net, &table->tb6_root, + func, prune, arg); + write_unlock_bh(&table->tb6_lock); + } + } + rcu_read_unlock(); +} + +static int fib6_prune_clone(struct rt6_info *rt, void *arg) +{ + if (rt->rt6i_flags & RTF_CACHE) { + RT6_TRACE("pruning clone %p\n", rt); + return -1; + } + + return 0; +} + +static void fib6_prune_clones(struct net *net, struct fib6_node *fn, + struct rt6_info *rt) +{ + fib6_clean_tree(net, fn, fib6_prune_clone, 1, rt); +} + +/* + * Garbage collection + */ + +static struct fib6_gc_args +{ + int timeout; + int more; +} gc_args; + +static int fib6_age(struct rt6_info *rt, void *arg) +{ + unsigned long now = jiffies; + + /* + * check addrconf expiration here. + * Routes are expired even if they are in use. + * + * Also age clones. Note, that clones are aged out + * only if they are not in use now. + */ + + if (rt->rt6i_flags&RTF_EXPIRES && rt->rt6i_expires) { + if (time_after(now, rt->rt6i_expires)) { + RT6_TRACE("expiring %p\n", rt); + return -1; + } + gc_args.more++; + } else if (rt->rt6i_flags & RTF_CACHE) { + if (atomic_read(&rt->dst.__refcnt) == 0 && + time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) { + RT6_TRACE("aging clone %p\n", rt); + return -1; + } else if ((rt->rt6i_flags & RTF_GATEWAY) && + (!(dst_get_neighbour_raw(&rt->dst)->flags & NTF_ROUTER))) { + RT6_TRACE("purging route %p via non-router but gateway\n", + rt); + return -1; + } + gc_args.more++; + } + + return 0; +} + +static DEFINE_SPINLOCK(fib6_gc_lock); + +void fib6_run_gc(unsigned long expires, struct net *net) +{ + if (expires != ~0UL) { + spin_lock_bh(&fib6_gc_lock); + gc_args.timeout = expires ? 
(int)expires : + net->ipv6.sysctl.ip6_rt_gc_interval; + } else { + if (!spin_trylock_bh(&fib6_gc_lock)) { + mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); + return; + } + gc_args.timeout = net->ipv6.sysctl.ip6_rt_gc_interval; + } + + gc_args.more = icmp6_dst_gc(); + + fib6_clean_all(net, fib6_age, 0, NULL); + + if (gc_args.more) + mod_timer(&net->ipv6.ip6_fib_timer, + round_jiffies(jiffies + + net->ipv6.sysctl.ip6_rt_gc_interval)); + else + del_timer(&net->ipv6.ip6_fib_timer); + spin_unlock_bh(&fib6_gc_lock); +} + +static void fib6_gc_timer_cb(unsigned long arg) +{ + fib6_run_gc(0, (struct net *)arg); +} + +static int __net_init fib6_net_init(struct net *net) +{ + size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; + + setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net); + + net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); + if (!net->ipv6.rt6_stats) + goto out_timer; + + /* Avoid false sharing : Use at least a full cache line */ + size = max_t(size_t, size, L1_CACHE_BYTES); + + net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL); + if (!net->ipv6.fib_table_hash) + goto out_rt6_stats; + + net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), + GFP_KERNEL); + if (!net->ipv6.fib6_main_tbl) + goto out_fib_table_hash; + + net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; + net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; + net->ipv6.fib6_main_tbl->tb6_root.fn_flags = + RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; + +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl), + GFP_KERNEL); + if (!net->ipv6.fib6_local_tbl) + goto out_fib6_main_tbl; + net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; + net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; + net->ipv6.fib6_local_tbl->tb6_root.fn_flags = + RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; +#endif + fib6_tables_init(net); + + return 0; + +#ifdef CONFIG_IPV6_MULTIPLE_TABLES +out_fib6_main_tbl: + kfree(net->ipv6.fib6_main_tbl); +#endif +out_fib_table_hash: + kfree(net->ipv6.fib_table_hash); +out_rt6_stats: + kfree(net->ipv6.rt6_stats); +out_timer: + return -ENOMEM; + } + +static void fib6_net_exit(struct net *net) +{ + rt6_ifdown(net, NULL); + del_timer_sync(&net->ipv6.ip6_fib_timer); + +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + kfree(net->ipv6.fib6_local_tbl); +#endif + kfree(net->ipv6.fib6_main_tbl); + kfree(net->ipv6.fib_table_hash); + kfree(net->ipv6.rt6_stats); +} + +static struct pernet_operations fib6_net_ops = { + .init = fib6_net_init, + .exit = fib6_net_exit, +}; + +int __init fib6_init(void) +{ + int ret = -ENOMEM; + + fib6_node_kmem = kmem_cache_create("fib6_nodes", + sizeof(struct fib6_node), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (!fib6_node_kmem) + goto out; + + ret = register_pernet_subsys(&fib6_net_ops); + if (ret) + goto out_kmem_cache_create; + + ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib); + if (ret) + goto out_unregister_subsys; +out: + return ret; + +out_unregister_subsys: + unregister_pernet_subsys(&fib6_net_ops); +out_kmem_cache_create: + kmem_cache_destroy(fib6_node_kmem); + goto out; +} + +void fib6_gc_cleanup(void) +{ + unregister_pernet_subsys(&fib6_net_ops); + kmem_cache_destroy(fib6_node_kmem); +} diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c new file mode 100644 index 00000000..f3caf1b8 --- /dev/null +++ b/net/ipv6/ip6_flowlabel.c @@ -0,0 +1,782 @@ +/* + * ip6_flowlabel.c IPv6 flowlabel manager. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define FL_MIN_LINGER 6 /* Minimal linger. It is set to 6sec specified + in old IPv6 RFC. Well, it was reasonable value. + */ +#define FL_MAX_LINGER 60 /* Maximal linger timeout */ + +/* FL hash table */ + +#define FL_MAX_PER_SOCK 32 +#define FL_MAX_SIZE 4096 +#define FL_HASH_MASK 255 +#define FL_HASH(l) (ntohl(l)&FL_HASH_MASK) + +static atomic_t fl_size = ATOMIC_INIT(0); +static struct ip6_flowlabel *fl_ht[FL_HASH_MASK+1]; + +static void ip6_fl_gc(unsigned long dummy); +static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc, 0, 0); + +/* FL hash table lock: it protects only of GC */ + +static DEFINE_RWLOCK(ip6_fl_lock); + +/* Big socket sock */ + +static DEFINE_RWLOCK(ip6_sk_fl_lock); + + +static inline struct ip6_flowlabel *__fl_lookup(struct net *net, __be32 label) +{ + struct ip6_flowlabel *fl; + + for (fl=fl_ht[FL_HASH(label)]; fl; fl = fl->next) { + if (fl->label == label && net_eq(fl->fl_net, net)) + return fl; + } + return NULL; +} + +static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label) +{ + struct ip6_flowlabel *fl; + + read_lock_bh(&ip6_fl_lock); + fl = __fl_lookup(net, label); + if (fl) + atomic_inc(&fl->users); + read_unlock_bh(&ip6_fl_lock); + return fl; +} + + +static void fl_free(struct ip6_flowlabel *fl) +{ + if (fl) { + release_net(fl->fl_net); + kfree(fl->opt); + } + kfree(fl); +} + +static void fl_release(struct ip6_flowlabel *fl) +{ + write_lock_bh(&ip6_fl_lock); + + fl->lastuse = jiffies; + if (atomic_dec_and_test(&fl->users)) { + unsigned long ttd = fl->lastuse + fl->linger; + if (time_after(ttd, fl->expires)) + fl->expires = ttd; + ttd = fl->expires; + if (fl->opt && fl->share == IPV6_FL_S_EXCL) { + struct ipv6_txoptions *opt = fl->opt; + fl->opt = NULL; + kfree(opt); + } + if (!timer_pending(&ip6_fl_gc_timer) || + time_after(ip6_fl_gc_timer.expires, ttd)) + mod_timer(&ip6_fl_gc_timer, ttd); + } + write_unlock_bh(&ip6_fl_lock); +} + +static void ip6_fl_gc(unsigned long dummy) +{ + int i; + unsigned long now = jiffies; + unsigned long sched = 0; + + write_lock(&ip6_fl_lock); + + for (i=0; i<=FL_HASH_MASK; i++) { + struct ip6_flowlabel *fl, **flp; + flp = &fl_ht[i]; + while ((fl=*flp) != NULL) { + if (atomic_read(&fl->users) == 0) { + unsigned long ttd = fl->lastuse + fl->linger; + if (time_after(ttd, fl->expires)) + fl->expires = ttd; + ttd = fl->expires; + if (time_after_eq(now, ttd)) { + *flp = fl->next; + fl_free(fl); + atomic_dec(&fl_size); + continue; + } + if (!sched || time_before(ttd, sched)) + sched = ttd; + } + flp = &fl->next; + } + } + if (!sched && atomic_read(&fl_size)) + sched = now + FL_MAX_LINGER; + if (sched) { + mod_timer(&ip6_fl_gc_timer, sched); + } + write_unlock(&ip6_fl_lock); +} + +static void __net_exit ip6_fl_purge(struct net *net) +{ + int i; + + write_lock(&ip6_fl_lock); + for (i = 0; i <= FL_HASH_MASK; i++) { + struct ip6_flowlabel *fl, **flp; + flp = &fl_ht[i]; + while ((fl = *flp) != NULL) { + if (net_eq(fl->fl_net, net) && + atomic_read(&fl->users) == 0) { + *flp = fl->next; + fl_free(fl); + 
atomic_dec(&fl_size); + continue; + } + flp = &fl->next; + } + } + write_unlock(&ip6_fl_lock); +} + +static struct ip6_flowlabel *fl_intern(struct net *net, + struct ip6_flowlabel *fl, __be32 label) +{ + struct ip6_flowlabel *lfl; + + fl->label = label & IPV6_FLOWLABEL_MASK; + + write_lock_bh(&ip6_fl_lock); + if (label == 0) { + for (;;) { + fl->label = htonl(net_random())&IPV6_FLOWLABEL_MASK; + if (fl->label) { + lfl = __fl_lookup(net, fl->label); + if (lfl == NULL) + break; + } + } + } else { + /* + * we dropper the ip6_fl_lock, so this entry could reappear + * and we need to recheck with it. + * + * OTOH no need to search the active socket first, like it is + * done in ipv6_flowlabel_opt - sock is locked, so new entry + * with the same label can only appear on another sock + */ + lfl = __fl_lookup(net, fl->label); + if (lfl != NULL) { + atomic_inc(&lfl->users); + write_unlock_bh(&ip6_fl_lock); + return lfl; + } + } + + fl->lastuse = jiffies; + fl->next = fl_ht[FL_HASH(fl->label)]; + fl_ht[FL_HASH(fl->label)] = fl; + atomic_inc(&fl_size); + write_unlock_bh(&ip6_fl_lock); + return NULL; +} + + + +/* Socket flowlabel lists */ + +struct ip6_flowlabel * fl6_sock_lookup(struct sock *sk, __be32 label) +{ + struct ipv6_fl_socklist *sfl; + struct ipv6_pinfo *np = inet6_sk(sk); + + label &= IPV6_FLOWLABEL_MASK; + + read_lock_bh(&ip6_sk_fl_lock); + for (sfl=np->ipv6_fl_list; sfl; sfl = sfl->next) { + struct ip6_flowlabel *fl = sfl->fl; + if (fl->label == label) { + fl->lastuse = jiffies; + atomic_inc(&fl->users); + read_unlock_bh(&ip6_sk_fl_lock); + return fl; + } + } + read_unlock_bh(&ip6_sk_fl_lock); + return NULL; +} + +EXPORT_SYMBOL_GPL(fl6_sock_lookup); + +void fl6_free_socklist(struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_fl_socklist *sfl; + + while ((sfl = np->ipv6_fl_list) != NULL) { + np->ipv6_fl_list = sfl->next; + fl_release(sfl->fl); + kfree(sfl); + } +} + +/* Service routines */ + + +/* + It is the only difficult place. flowlabel enforces equal headers + before and including routing header, however user may supply options + following rthdr. 
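+ When both the label and the caller supply options, the hop-by-hop, destination-0 and routing headers are taken from the label's stored options and only dst1opt/opt_flen from the caller's.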
+ */ + +struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space, + struct ip6_flowlabel * fl, + struct ipv6_txoptions * fopt) +{ + struct ipv6_txoptions * fl_opt = fl->opt; + + if (fopt == NULL || fopt->opt_flen == 0) + return fl_opt; + + if (fl_opt != NULL) { + opt_space->hopopt = fl_opt->hopopt; + opt_space->dst0opt = fl_opt->dst0opt; + opt_space->srcrt = fl_opt->srcrt; + opt_space->opt_nflen = fl_opt->opt_nflen; + } else { + if (fopt->opt_nflen == 0) + return fopt; + opt_space->hopopt = NULL; + opt_space->dst0opt = NULL; + opt_space->srcrt = NULL; + opt_space->opt_nflen = 0; + } + opt_space->dst1opt = fopt->dst1opt; + opt_space->opt_flen = fopt->opt_flen; + return opt_space; +} + +static unsigned long check_linger(unsigned long ttl) +{ + if (ttl < FL_MIN_LINGER) + return FL_MIN_LINGER*HZ; + if (ttl > FL_MAX_LINGER && !capable(CAP_NET_ADMIN)) + return 0; + return ttl*HZ; +} + +static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned long expires) +{ + linger = check_linger(linger); + if (!linger) + return -EPERM; + expires = check_linger(expires); + if (!expires) + return -EPERM; + fl->lastuse = jiffies; + if (time_before(fl->linger, linger)) + fl->linger = linger; + if (time_before(expires, fl->linger)) + expires = fl->linger; + if (time_before(fl->expires, fl->lastuse + expires)) + fl->expires = fl->lastuse + expires; + return 0; +} + +static struct ip6_flowlabel * +fl_create(struct net *net, struct in6_flowlabel_req *freq, char __user *optval, + int optlen, int *err_p) +{ + struct ip6_flowlabel *fl = NULL; + int olen; + int addr_type; + int err; + + olen = optlen - CMSG_ALIGN(sizeof(*freq)); + err = -EINVAL; + if (olen > 64 * 1024) + goto done; + + err = -ENOMEM; + fl = kzalloc(sizeof(*fl), GFP_KERNEL); + if (fl == NULL) + goto done; + + if (olen > 0) { + struct msghdr msg; + struct flowi6 flowi6; + int junk; + + err = -ENOMEM; + fl->opt = kmalloc(sizeof(*fl->opt) + olen, GFP_KERNEL); + if (fl->opt == NULL) + goto done; + + memset(fl->opt, 0, sizeof(*fl->opt)); + fl->opt->tot_len = sizeof(*fl->opt) + olen; + err = -EFAULT; + if (copy_from_user(fl->opt+1, optval+CMSG_ALIGN(sizeof(*freq)), olen)) + goto done; + + msg.msg_controllen = olen; + msg.msg_control = (void*)(fl->opt+1); + memset(&flowi6, 0, sizeof(flowi6)); + + err = datagram_send_ctl(net, &msg, &flowi6, fl->opt, &junk, + &junk, &junk); + if (err) + goto done; + err = -EINVAL; + if (fl->opt->opt_flen) + goto done; + if (fl->opt->opt_nflen == 0) { + kfree(fl->opt); + fl->opt = NULL; + } + } + + fl->fl_net = hold_net(net); + fl->expires = jiffies; + err = fl6_renew(fl, freq->flr_linger, freq->flr_expires); + if (err) + goto done; + fl->share = freq->flr_share; + addr_type = ipv6_addr_type(&freq->flr_dst); + if ((addr_type & IPV6_ADDR_MAPPED) || + addr_type == IPV6_ADDR_ANY) { + err = -EINVAL; + goto done; + } + ipv6_addr_copy(&fl->dst, &freq->flr_dst); + atomic_set(&fl->users, 1); + switch (fl->share) { + case IPV6_FL_S_EXCL: + case IPV6_FL_S_ANY: + break; + case IPV6_FL_S_PROCESS: + fl->owner = current->pid; + break; + case IPV6_FL_S_USER: + fl->owner = current_euid(); + break; + default: + err = -EINVAL; + goto done; + } + return fl; + +done: + fl_free(fl); + *err_p = err; + return NULL; +} + +static int mem_check(struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_fl_socklist *sfl; + int room = FL_MAX_SIZE - atomic_read(&fl_size); + int count = 0; + + if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK) + return 0; + + for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) + 
count++; + + if (room <= 0 || + ((count >= FL_MAX_PER_SOCK || + (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) && + !capable(CAP_NET_ADMIN))) + return -ENOBUFS; + + return 0; +} + +static int ipv6_hdr_cmp(struct ipv6_opt_hdr *h1, struct ipv6_opt_hdr *h2) +{ + if (h1 == h2) + return 0; + if (h1 == NULL || h2 == NULL) + return 1; + if (h1->hdrlen != h2->hdrlen) + return 1; + return memcmp(h1+1, h2+1, ((h1->hdrlen+1)<<3) - sizeof(*h1)); +} + +static int ipv6_opt_cmp(struct ipv6_txoptions *o1, struct ipv6_txoptions *o2) +{ + if (o1 == o2) + return 0; + if (o1 == NULL || o2 == NULL) + return 1; + if (o1->opt_nflen != o2->opt_nflen) + return 1; + if (ipv6_hdr_cmp(o1->hopopt, o2->hopopt)) + return 1; + if (ipv6_hdr_cmp(o1->dst0opt, o2->dst0opt)) + return 1; + if (ipv6_hdr_cmp((struct ipv6_opt_hdr *)o1->srcrt, (struct ipv6_opt_hdr *)o2->srcrt)) + return 1; + return 0; +} + +static inline void fl_link(struct ipv6_pinfo *np, struct ipv6_fl_socklist *sfl, + struct ip6_flowlabel *fl) +{ + write_lock_bh(&ip6_sk_fl_lock); + sfl->fl = fl; + sfl->next = np->ipv6_fl_list; + np->ipv6_fl_list = sfl; + write_unlock_bh(&ip6_sk_fl_lock); +} + +int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) +{ + int uninitialized_var(err); + struct net *net = sock_net(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_flowlabel_req freq; + struct ipv6_fl_socklist *sfl1=NULL; + struct ipv6_fl_socklist *sfl, **sflp; + struct ip6_flowlabel *fl, *fl1 = NULL; + + + if (optlen < sizeof(freq)) + return -EINVAL; + + if (copy_from_user(&freq, optval, sizeof(freq))) + return -EFAULT; + + switch (freq.flr_action) { + case IPV6_FL_A_PUT: + write_lock_bh(&ip6_sk_fl_lock); + for (sflp = &np->ipv6_fl_list; (sfl=*sflp)!=NULL; sflp = &sfl->next) { + if (sfl->fl->label == freq.flr_label) { + if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK)) + np->flow_label &= ~IPV6_FLOWLABEL_MASK; + *sflp = sfl->next; + write_unlock_bh(&ip6_sk_fl_lock); + fl_release(sfl->fl); + kfree(sfl); + return 0; + } + } + write_unlock_bh(&ip6_sk_fl_lock); + return -ESRCH; + + case IPV6_FL_A_RENEW: + read_lock_bh(&ip6_sk_fl_lock); + for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) { + if (sfl->fl->label == freq.flr_label) { + err = fl6_renew(sfl->fl, freq.flr_linger, freq.flr_expires); + read_unlock_bh(&ip6_sk_fl_lock); + return err; + } + } + read_unlock_bh(&ip6_sk_fl_lock); + + if (freq.flr_share == IPV6_FL_S_NONE && capable(CAP_NET_ADMIN)) { + fl = fl_lookup(net, freq.flr_label); + if (fl) { + err = fl6_renew(fl, freq.flr_linger, freq.flr_expires); + fl_release(fl); + return err; + } + } + return -ESRCH; + + case IPV6_FL_A_GET: + if (freq.flr_label & ~IPV6_FLOWLABEL_MASK) + return -EINVAL; + + fl = fl_create(net, &freq, optval, optlen, &err); + if (fl == NULL) + return err; + sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL); + + if (freq.flr_label) { + err = -EEXIST; + read_lock_bh(&ip6_sk_fl_lock); + for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) { + if (sfl->fl->label == freq.flr_label) { + if (freq.flr_flags&IPV6_FL_F_EXCL) { + read_unlock_bh(&ip6_sk_fl_lock); + goto done; + } + fl1 = sfl->fl; + atomic_inc(&fl1->users); + break; + } + } + read_unlock_bh(&ip6_sk_fl_lock); + + if (fl1 == NULL) + fl1 = fl_lookup(net, freq.flr_label); + if (fl1) { +recheck: + err = -EEXIST; + if (freq.flr_flags&IPV6_FL_F_EXCL) + goto release; + err = -EPERM; + if (fl1->share == IPV6_FL_S_EXCL || + fl1->share != fl->share || + fl1->owner != fl->owner) + goto release; + + err = -EINVAL; + if (!ipv6_addr_equal(&fl1->dst, &fl->dst) || + 
ipv6_opt_cmp(fl1->opt, fl->opt)) + goto release; + + err = -ENOMEM; + if (sfl1 == NULL) + goto release; + if (fl->linger > fl1->linger) + fl1->linger = fl->linger; + if ((long)(fl->expires - fl1->expires) > 0) + fl1->expires = fl->expires; + fl_link(np, sfl1, fl1); + fl_free(fl); + return 0; + +release: + fl_release(fl1); + goto done; + } + } + err = -ENOENT; + if (!(freq.flr_flags&IPV6_FL_F_CREATE)) + goto done; + + err = -ENOMEM; + if (sfl1 == NULL || (err = mem_check(sk)) != 0) + goto done; + + fl1 = fl_intern(net, fl, freq.flr_label); + if (fl1 != NULL) + goto recheck; + + if (!freq.flr_label) { + if (copy_to_user(&((struct in6_flowlabel_req __user *) optval)->flr_label, + &fl->label, sizeof(fl->label))) { + /* Intentionally ignore fault. */ + } + } + + fl_link(np, sfl1, fl); + return 0; + + default: + return -EINVAL; + } + +done: + fl_free(fl); + kfree(sfl1); + return err; +} + +#ifdef CONFIG_PROC_FS + +struct ip6fl_iter_state { + struct seq_net_private p; + int bucket; +}; + +#define ip6fl_seq_private(seq) ((struct ip6fl_iter_state *)(seq)->private) + +static struct ip6_flowlabel *ip6fl_get_first(struct seq_file *seq) +{ + struct ip6_flowlabel *fl = NULL; + struct ip6fl_iter_state *state = ip6fl_seq_private(seq); + struct net *net = seq_file_net(seq); + + for (state->bucket = 0; state->bucket <= FL_HASH_MASK; ++state->bucket) { + fl = fl_ht[state->bucket]; + + while (fl && !net_eq(fl->fl_net, net)) + fl = fl->next; + if (fl) + break; + } + return fl; +} + +static struct ip6_flowlabel *ip6fl_get_next(struct seq_file *seq, struct ip6_flowlabel *fl) +{ + struct ip6fl_iter_state *state = ip6fl_seq_private(seq); + struct net *net = seq_file_net(seq); + + fl = fl->next; +try_again: + while (fl && !net_eq(fl->fl_net, net)) + fl = fl->next; + + while (!fl) { + if (++state->bucket <= FL_HASH_MASK) { + fl = fl_ht[state->bucket]; + goto try_again; + } else + break; + } + return fl; +} + +static struct ip6_flowlabel *ip6fl_get_idx(struct seq_file *seq, loff_t pos) +{ + struct ip6_flowlabel *fl = ip6fl_get_first(seq); + if (fl) + while (pos && (fl = ip6fl_get_next(seq, fl)) != NULL) + --pos; + return pos ? NULL : fl; +} + +static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(ip6_fl_lock) +{ + read_lock_bh(&ip6_fl_lock); + return *pos ? ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *ip6fl_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip6_flowlabel *fl; + + if (v == SEQ_START_TOKEN) + fl = ip6fl_get_first(seq); + else + fl = ip6fl_get_next(seq, v); + ++*pos; + return fl; +} + +static void ip6fl_seq_stop(struct seq_file *seq, void *v) + __releases(ip6_fl_lock) +{ + read_unlock_bh(&ip6_fl_lock); +} + +static int ip6fl_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, "%-5s %-1s %-6s %-6s %-6s %-8s %-32s %s\n", + "Label", "S", "Owner", "Users", "Linger", "Expires", "Dst", "Opt"); + else { + struct ip6_flowlabel *fl = v; + seq_printf(seq, + "%05X %-1d %-6d %-6d %-6ld %-8ld %pi6 %-4d\n", + (unsigned)ntohl(fl->label), + fl->share, + (unsigned)fl->owner, + atomic_read(&fl->users), + fl->linger/HZ, + (long)(fl->expires - jiffies)/HZ, + &fl->dst, + fl->opt ? 
fl->opt->opt_nflen : 0); + } + return 0; +} + +static const struct seq_operations ip6fl_seq_ops = { + .start = ip6fl_seq_start, + .next = ip6fl_seq_next, + .stop = ip6fl_seq_stop, + .show = ip6fl_seq_show, +}; + +static int ip6fl_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ip6fl_seq_ops, + sizeof(struct ip6fl_iter_state)); +} + +static const struct file_operations ip6fl_seq_fops = { + .owner = THIS_MODULE, + .open = ip6fl_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int __net_init ip6_flowlabel_proc_init(struct net *net) +{ + if (!proc_net_fops_create(net, "ip6_flowlabel", + S_IRUGO, &ip6fl_seq_fops)) + return -ENOMEM; + return 0; +} + +static void __net_exit ip6_flowlabel_proc_fini(struct net *net) +{ + proc_net_remove(net, "ip6_flowlabel"); +} +#else +static inline int ip6_flowlabel_proc_init(struct net *net) +{ + return 0; +} +static inline void ip6_flowlabel_proc_fini(struct net *net) +{ +} +#endif + +static void __net_exit ip6_flowlabel_net_exit(struct net *net) +{ + ip6_fl_purge(net); + ip6_flowlabel_proc_fini(net); +} + +static struct pernet_operations ip6_flowlabel_net_ops = { + .init = ip6_flowlabel_proc_init, + .exit = ip6_flowlabel_net_exit, +}; + +int ip6_flowlabel_init(void) +{ + return register_pernet_subsys(&ip6_flowlabel_net_ops); +} + +void ip6_flowlabel_cleanup(void) +{ + del_timer(&ip6_fl_gc_timer); + unregister_pernet_subsys(&ip6_flowlabel_net_ops); +} diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c new file mode 100644 index 00000000..027c7ff6 --- /dev/null +++ b/net/ipv6/ip6_input.c @@ -0,0 +1,335 @@ +/* + * IPv6 input + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * Ian P. Morris + * + * Based in linux/net/ipv4/ip_input.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +/* Changes + * + * Mitsuru KANDA @USAGI and + * YOSHIFUJI Hideaki @USAGI: Remove ipv6_parse_exthdrs(). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + + + +inline int ip6_rcv_finish( struct sk_buff *skb) +{ + if (skb_dst(skb) == NULL) + ip6_route_input(skb); + + return dst_input(skb); +} + +int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +{ + const struct ipv6hdr *hdr; + u32 pkt_len; + struct inet6_dev *idev; + struct net *net = dev_net(skb->dev); + + if (skb->pkt_type == PACKET_OTHERHOST) { + kfree_skb(skb); + return NET_RX_DROP; + } + + rcu_read_lock(); + + idev = __in6_dev_get(skb->dev); + + IP6_UPD_PO_STATS_BH(net, idev, IPSTATS_MIB_IN, skb->len); + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL || + !idev || unlikely(idev->cnf.disable_ipv6)) { + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDISCARDS); + goto drop; + } + + memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); + + /* + * Store incoming device index. When the packet will + * be queued, we cannot refer to skb->dev anymore. + * + * BTW, when we send a packet for our own local address on a + * non-loopback interface (e.g. ethX), it is being delivered + * via the loopback interface (lo) here; skb->dev = loopback_dev. 
+ * It, however, should be considered as if it is being + * arrived via the sending interface (ethX), because of the + * nature of scoping architecture. --yoshfuji + */ + IP6CB(skb)->iif = skb_dst(skb) ? ip6_dst_idev(skb_dst(skb))->dev->ifindex : dev->ifindex; + + if (unlikely(!pskb_may_pull(skb, sizeof(*hdr)))) + goto err; + + hdr = ipv6_hdr(skb); + + if (hdr->version != 6) + goto err; + + /* + * RFC4291 2.5.3 + * A packet received on an interface with a destination address + * of loopback must be dropped. + */ + if (!(dev->flags & IFF_LOOPBACK) && + ipv6_addr_loopback(&hdr->daddr)) + goto err; + + skb->transport_header = skb->network_header + sizeof(*hdr); + IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); + + pkt_len = ntohs(hdr->payload_len); + + /* pkt_len may be zero if Jumbo payload option is present */ + if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { + if (pkt_len + sizeof(struct ipv6hdr) > skb->len) { + IP6_INC_STATS_BH(net, + idev, IPSTATS_MIB_INTRUNCATEDPKTS); + goto drop; + } + if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) { + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS); + goto drop; + } + hdr = ipv6_hdr(skb); + } + + if (hdr->nexthdr == NEXTHDR_HOP) { + if (ipv6_parse_hopopts(skb) < 0) { + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS); + rcu_read_unlock(); + return NET_RX_DROP; + } + } + + rcu_read_unlock(); + + /* Must drop socket now because of tproxy. */ + skb_orphan(skb); + + return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, dev, NULL, + ip6_rcv_finish); +err: + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS); +drop: + rcu_read_unlock(); + kfree_skb(skb); + return NET_RX_DROP; +} + +/* + * Deliver the packet to the host + */ + + +static int ip6_input_finish(struct sk_buff *skb) +{ + const struct inet6_protocol *ipprot; + unsigned int nhoff; + int nexthdr, raw; + u8 hash; + struct inet6_dev *idev; + struct net *net = dev_net(skb_dst(skb)->dev); + + /* + * Parse extension headers + */ + + rcu_read_lock(); +resubmit: + idev = ip6_dst_idev(skb_dst(skb)); + if (!pskb_pull(skb, skb_transport_offset(skb))) + goto discard; + nhoff = IP6CB(skb)->nhoff; + nexthdr = skb_network_header(skb)[nhoff]; + + raw = raw6_local_deliver(skb, nexthdr); + + hash = nexthdr & (MAX_INET_PROTOS - 1); + if ((ipprot = rcu_dereference(inet6_protos[hash])) != NULL) { + int ret; + + if (ipprot->flags & INET6_PROTO_FINAL) { + const struct ipv6hdr *hdr; + + /* Free reference early: we don't need it any more, + and it may hold ip_conntrack module loaded + indefinitely. 
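+ nf_reset() below drops that conntrack reference from the skb.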
*/ + nf_reset(skb); + + skb_postpull_rcsum(skb, skb_network_header(skb), + skb_network_header_len(skb)); + hdr = ipv6_hdr(skb); + if (ipv6_addr_is_multicast(&hdr->daddr) && + !ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, + &hdr->saddr) && + !ipv6_is_mld(skb, nexthdr)) + goto discard; + } + if (!(ipprot->flags & INET6_PROTO_NOPOLICY) && + !xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard; + + ret = ipprot->handler(skb); + if (ret > 0) + goto resubmit; + else if (ret == 0) + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS); + } else { + if (!raw) { + if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + IP6_INC_STATS_BH(net, idev, + IPSTATS_MIB_INUNKNOWNPROTOS); + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_UNK_NEXTHDR, nhoff); + } + } else + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS); + kfree_skb(skb); + } + rcu_read_unlock(); + return 0; + +discard: + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDISCARDS); + rcu_read_unlock(); + kfree_skb(skb); + return 0; +} + + +int ip6_input(struct sk_buff *skb) +{ + return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, skb, skb->dev, NULL, + ip6_input_finish); +} + +int ip6_mc_input(struct sk_buff *skb) +{ + const struct ipv6hdr *hdr; + int deliver; + + IP6_UPD_PO_STATS_BH(dev_net(skb_dst(skb)->dev), + ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INMCAST, + skb->len); + + hdr = ipv6_hdr(skb); + deliver = ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, NULL); + +#ifdef CONFIG_IPV6_MROUTE + /* + * IPv6 multicast router mode is now supported ;) + */ + if (dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding && + !(ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) && + likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) { + /* + * Okay, we try to forward - split and duplicate + * packets. + */ + struct sk_buff *skb2; + struct inet6_skb_parm *opt = IP6CB(skb); + + /* Check for MLD */ + if (unlikely(opt->ra)) { + /* Check if this is a mld message */ + u8 *ptr = skb_network_header(skb) + opt->ra; + struct icmp6hdr *icmp6; + u8 nexthdr = hdr->nexthdr; + int offset; + + /* Check if the value of Router Alert + * is for MLD (0x0000). 
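+ * The Router Alert option is type 5, length 2, followed by a 16-bit value; ptr points at the option, so ptr[2] and ptr[3] hold that value and zero means MLD.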
+ */ + if ((ptr[2] | ptr[3]) == 0) { + deliver = 0; + + if (!ipv6_ext_hdr(nexthdr)) { + /* BUG */ + goto out; + } + offset = ipv6_skip_exthdr(skb, sizeof(*hdr), + &nexthdr); + if (offset < 0) + goto out; + + if (nexthdr != IPPROTO_ICMPV6) + goto out; + + if (!pskb_may_pull(skb, (skb_network_header(skb) + + offset + 1 - skb->data))) + goto out; + + icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset); + + switch (icmp6->icmp6_type) { + case ICMPV6_MGM_QUERY: + case ICMPV6_MGM_REPORT: + case ICMPV6_MGM_REDUCTION: + case ICMPV6_MLD2_REPORT: + deliver = 1; + break; + } + goto out; + } + /* unknown RA - process it normally */ + } + + if (deliver) + skb2 = skb_clone(skb, GFP_ATOMIC); + else { + skb2 = skb; + skb = NULL; + } + + if (skb2) { + ip6_mr_input(skb2); + } + } +out: +#endif + if (likely(deliver)) + ip6_input(skb); + else { + /* discard */ + kfree_skb(skb); + } + + return 0; +} diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c new file mode 100644 index 00000000..ae9f6d43 --- /dev/null +++ b/net/ipv6/ip6_output.c @@ -0,0 +1,1690 @@ +/* + * IPv6 output functions + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * + * Based on linux/net/ipv4/ip_output.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * A.N.Kuznetsov : airthmetics in fragmentation. + * extension headers are implemented. + * route changes now work. + * ip6_forward does not confuse sniffers. + * etc. + * + * H. von Brand : Added missing #include + * Imran Patel : frag id should be in NBO + * Kazunori MIYAZAWA @USAGI + * : add ip6_append_data and related functions + * for datagram xmit + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)); + +int __ip6_local_out(struct sk_buff *skb) +{ + int len; + + len = skb->len - sizeof(struct ipv6hdr); + if (len > IPV6_MAXPLEN) + len = 0; + ipv6_hdr(skb)->payload_len = htons(len); + + return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, + skb_dst(skb)->dev, dst_output); +} + +int ip6_local_out(struct sk_buff *skb) +{ + int err; + + err = __ip6_local_out(skb); + if (likely(err == 1)) + err = dst_output(skb); + + return err; +} +EXPORT_SYMBOL_GPL(ip6_local_out); + +/* dev_loopback_xmit for use with netfilter. 
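+ Used as the okfn of the NF_INET_POST_ROUTING hook for multicast clones in ip6_finish_output2(); the copy is reinjected locally via netif_rx_ni().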
*/ +static int ip6_dev_loopback_xmit(struct sk_buff *newskb) +{ + skb_reset_mac_header(newskb); + __skb_pull(newskb, skb_network_offset(newskb)); + newskb->pkt_type = PACKET_LOOPBACK; + newskb->ip_summed = CHECKSUM_UNNECESSARY; + WARN_ON(!skb_dst(newskb)); + + netif_rx_ni(newskb); + return 0; +} + +static int ip6_finish_output2(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct net_device *dev = dst->dev; + struct neighbour *neigh; + int res; + + skb->protocol = htons(ETH_P_IPV6); + skb->dev = dev; + + if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) { + struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); + + if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) && + ((mroute6_socket(dev_net(dev), skb) && + !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) || + ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr, + &ipv6_hdr(skb)->saddr))) { + struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); + + /* Do not check for IFF_ALLMULTI; multicast routing + is not supported in any case. + */ + if (newskb) + NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, + newskb, NULL, newskb->dev, + ip6_dev_loopback_xmit); + + if (ipv6_hdr(skb)->hop_limit == 0) { + IP6_INC_STATS(dev_net(dev), idev, + IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + return 0; + } + } + + IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST, + skb->len); + } + + rcu_read_lock(); + if (dst->hh) { + res = neigh_hh_output(dst->hh, skb); + + rcu_read_unlock(); + return res; + } else { + neigh = dst_get_neighbour(dst); + if (neigh) { + res = neigh->output(skb); + + rcu_read_unlock(); + return res; + } + rcu_read_unlock(); + } + + IP6_INC_STATS_BH(dev_net(dst->dev), + ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); + kfree_skb(skb); + return -EINVAL; +} + +static int ip6_finish_output(struct sk_buff *skb) +{ + if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || + dst_allfrag(skb_dst(skb))) + return ip6_fragment(skb, ip6_finish_output2); + else + return ip6_finish_output2(skb); +} + +int ip6_output(struct sk_buff *skb) +{ + struct net_device *dev = skb_dst(skb)->dev; + struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); + if (unlikely(idev->cnf.disable_ipv6)) { + IP6_INC_STATS(dev_net(dev), idev, + IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + return 0; + } + + return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev, + ip6_finish_output, + !(IP6CB(skb)->flags & IP6SKB_REROUTED)); +} + +/* + * xmit an sk_buff (used by TCP, SCTP and DCCP) + */ + +int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, + struct ipv6_txoptions *opt) +{ + struct net *net = sock_net(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_addr *first_hop = &fl6->daddr; + struct dst_entry *dst = skb_dst(skb); + struct ipv6hdr *hdr; + u8 proto = fl6->flowi6_proto; + int seg_len = skb->len; + int hlimit = -1; + int tclass = 0; + u32 mtu; + + if (opt) { + unsigned int head_room; + + /* First: exthdrs may take lots of space (~8K for now) + MAX_HEADER is not enough. 
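+ If the skb is short of headroom it is reallocated below with skb_realloc_headroom() and re-charged to the socket with skb_set_owner_w().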
+ */ + head_room = opt->opt_nflen + opt->opt_flen; + seg_len += head_room; + head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); + + if (skb_headroom(skb) < head_room) { + struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); + if (skb2 == NULL) { + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + return -ENOBUFS; + } + kfree_skb(skb); + skb = skb2; + skb_set_owner_w(skb, sk); + } + if (opt->opt_flen) + ipv6_push_frag_opts(skb, opt, &proto); + if (opt->opt_nflen) + ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop); + } + + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + hdr = ipv6_hdr(skb); + + /* + * Fill in the IPv6 header + */ + if (np) { + tclass = np->tclass; + hlimit = np->hop_limit; + } + if (hlimit < 0) + hlimit = ip6_dst_hoplimit(dst); + + *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel; + + hdr->payload_len = htons(seg_len); + hdr->nexthdr = proto; + hdr->hop_limit = hlimit; + + ipv6_addr_copy(&hdr->saddr, &fl6->saddr); + ipv6_addr_copy(&hdr->daddr, first_hop); + + skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; + + mtu = dst_mtu(dst); + if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) { + IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUT, skb->len); + return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, + dst->dev, dst_output); + } + + if (net_ratelimit()) + printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n"); + skb->dev = dst->dev; + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); + kfree_skb(skb); + return -EMSGSIZE; +} + +EXPORT_SYMBOL(ip6_xmit); + +/* + * To avoid extra problems ND packets are send through this + * routine. 
It's code duplication but I really want to avoid + * extra checks since ipv6_build_header is used by TCP (which + * is for us performance critical) + */ + +int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev, + const struct in6_addr *saddr, const struct in6_addr *daddr, + int proto, int len) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6hdr *hdr; + + skb->protocol = htons(ETH_P_IPV6); + skb->dev = dev; + + skb_reset_network_header(skb); + skb_put(skb, sizeof(struct ipv6hdr)); + hdr = ipv6_hdr(skb); + + *(__be32*)hdr = htonl(0x60000000); + + hdr->payload_len = htons(len); + hdr->nexthdr = proto; + hdr->hop_limit = np->hop_limit; + + ipv6_addr_copy(&hdr->saddr, saddr); + ipv6_addr_copy(&hdr->daddr, daddr); + + return 0; +} + +static int ip6_call_ra_chain(struct sk_buff *skb, int sel) +{ + struct ip6_ra_chain *ra; + struct sock *last = NULL; + + read_lock(&ip6_ra_lock); + for (ra = ip6_ra_chain; ra; ra = ra->next) { + struct sock *sk = ra->sk; + if (sk && ra->sel == sel && + (!sk->sk_bound_dev_if || + sk->sk_bound_dev_if == skb->dev->ifindex)) { + if (last) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + rawv6_rcv(last, skb2); + } + last = sk; + } + } + + if (last) { + rawv6_rcv(last, skb); + read_unlock(&ip6_ra_lock); + return 1; + } + read_unlock(&ip6_ra_lock); + return 0; +} + +static int ip6_forward_proxy_check(struct sk_buff *skb) +{ + struct ipv6hdr *hdr = ipv6_hdr(skb); + u8 nexthdr = hdr->nexthdr; + int offset; + + if (ipv6_ext_hdr(nexthdr)) { + offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr); + if (offset < 0) + return 0; + } else + offset = sizeof(struct ipv6hdr); + + if (nexthdr == IPPROTO_ICMPV6) { + struct icmp6hdr *icmp6; + + if (!pskb_may_pull(skb, (skb_network_header(skb) + + offset + 1 - skb->data))) + return 0; + + icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset); + + switch (icmp6->icmp6_type) { + case NDISC_ROUTER_SOLICITATION: + case NDISC_ROUTER_ADVERTISEMENT: + case NDISC_NEIGHBOUR_SOLICITATION: + case NDISC_NEIGHBOUR_ADVERTISEMENT: + case NDISC_REDIRECT: + /* For reaction involving unicast neighbor discovery + * message destined to the proxied address, pass it to + * input function. + */ + return 1; + default: + break; + } + } + + /* + * The proxying router can't forward traffic sent to a link-local + * address, so signal the sender and discard the packet. This + * behavior is clarified by the MIPv6 specification. + */ + if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) { + dst_link_failure(skb); + return -1; + } + + return 0; +} + +static inline int ip6_forward_finish(struct sk_buff *skb) +{ + return dst_output(skb); +} + +int ip6_forward(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct ipv6hdr *hdr = ipv6_hdr(skb); + struct inet6_skb_parm *opt = IP6CB(skb); + struct net *net = dev_net(dst->dev); + struct neighbour *n; + u32 mtu; + + if (net->ipv6.devconf_all->forwarding == 0) + goto error; + + if (skb_warn_if_lro(skb)) + goto drop; + + if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { + IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS); + goto drop; + } + + if (skb->pkt_type != PACKET_HOST) + goto drop; + + skb_forward_csum(skb); + + /* + * We DO NOT make any processing on + * RA packets, pushing them to user level AS IS + * without ane WARRANTY that application will be able + * to interpret them. The reason is that we + * cannot make anything clever here. + * + * We are not end-node, so that if packet contains + * AH/ESP, we cannot make anything. 
+ * Defragmentation also would be mistake, RA packets + * cannot be fragmented, because there is no warranty + * that different fragments will go along one path. --ANK + */ + if (opt->ra) { + u8 *ptr = skb_network_header(skb) + opt->ra; + if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3])) + return 0; + } + + /* + * check and decrement ttl + */ + if (hdr->hop_limit <= 1) { + /* Force OUTPUT device used as source address */ + skb->dev = dst->dev; + icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); + IP6_INC_STATS_BH(net, + ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); + + kfree_skb(skb); + return -ETIMEDOUT; + } + + /* XXX: idev->cnf.proxy_ndp? */ + if (net->ipv6.devconf_all->proxy_ndp && + pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { + int proxied = ip6_forward_proxy_check(skb); + if (proxied > 0) + return ip6_input(skb); + else if (proxied < 0) { + IP6_INC_STATS(net, ip6_dst_idev(dst), + IPSTATS_MIB_INDISCARDS); + goto drop; + } + } + + if (!xfrm6_route_forward(skb)) { + IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS); + goto drop; + } + dst = skb_dst(skb); + + /* IPv6 specs say nothing about it, but it is clear that we cannot + send redirects to source routed frames. + We don't send redirects to frames decapsulated from IPsec. + */ + n = dst_get_neighbour(dst); + if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) { + struct in6_addr *target = NULL; + struct rt6_info *rt; + + /* + * incoming and outgoing devices are the same + * send a redirect. + */ + + rt = (struct rt6_info *) dst; + if ((rt->rt6i_flags & RTF_GATEWAY)) + target = (struct in6_addr*)&n->primary_key; + else + target = &hdr->daddr; + + if (!rt->rt6i_peer) + rt6_bind_peer(rt, 1); + + /* Limit redirects both by destination (here) + and by source (inside ndisc_send_redirect) + */ + if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ)) + ndisc_send_redirect(skb, n, target); + } else { + int addrtype = ipv6_addr_type(&hdr->saddr); + + /* This check is security critical. 
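+ * Forwarding packets whose source address is unspecified, loopback or multicast would let hosts bounce spoofed traffic through this router, so they are counted as address errors and dropped.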
*/ + if (addrtype == IPV6_ADDR_ANY || + addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK)) + goto error; + if (addrtype & IPV6_ADDR_LINKLOCAL) { + icmpv6_send(skb, ICMPV6_DEST_UNREACH, + ICMPV6_NOT_NEIGHBOUR, 0); + goto error; + } + } + + mtu = dst_mtu(dst); + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + + if (skb->len > mtu && !skb_is_gso(skb)) { + /* Again, force OUTPUT device used as source address */ + skb->dev = dst->dev; + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP6_INC_STATS_BH(net, + ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS); + IP6_INC_STATS_BH(net, + ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS); + kfree_skb(skb); + return -EMSGSIZE; + } + + if (skb_cow(skb, dst->dev->hard_header_len)) { + IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS); + goto drop; + } + + hdr = ipv6_hdr(skb); + + /* Mangling hops number delayed to point after skb COW */ + + hdr->hop_limit--; + + IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); + return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev, + ip6_forward_finish); + +error: + IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS); +drop: + kfree_skb(skb); + return -EINVAL; +} + +static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) +{ + to->pkt_type = from->pkt_type; + to->priority = from->priority; + to->protocol = from->protocol; + skb_dst_drop(to); + skb_dst_set(to, dst_clone(skb_dst(from))); + to->dev = from->dev; + to->mark = from->mark; + +#ifdef CONFIG_NET_SCHED + to->tc_index = from->tc_index; +#endif + nf_copy(to, from); +#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ + defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) + to->nf_trace = from->nf_trace; +#endif + skb_copy_secmark(to, from); +} + +int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) +{ + u16 offset = sizeof(struct ipv6hdr); + struct ipv6_opt_hdr *exthdr = + (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); + unsigned int packet_len = skb->tail - skb->network_header; + int found_rhdr = 0; + *nexthdr = &ipv6_hdr(skb)->nexthdr; + + while (offset + 1 <= packet_len) { + + switch (**nexthdr) { + + case NEXTHDR_HOP: + break; + case NEXTHDR_ROUTING: + found_rhdr = 1; + break; + case NEXTHDR_DEST: +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) + break; +#endif + if (found_rhdr) + return offset; + break; + default : + return offset; + } + + offset += ipv6_optlen(exthdr); + *nexthdr = &exthdr->nexthdr; + exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + + offset); + } + + return offset; +} + +static u32 hashidentrnd __read_mostly; +#define FID_HASH_SZ 16 +static u32 ipv6_fragmentation_id[FID_HASH_SZ]; + +void __init initialize_hashidentrnd(void) +{ + get_random_bytes(&hashidentrnd, sizeof(hashidentrnd)); +} + +static u32 __ipv6_select_ident(const struct in6_addr *addr) +{ + u32 newid, oldid, hash = jhash2((u32 *)addr, 4, hashidentrnd); + u32 *pid = &ipv6_fragmentation_id[hash % FID_HASH_SZ]; + + do { + oldid = *pid; + newid = oldid + 1; + if (!(hash + newid)) + newid++; + } while (cmpxchg(pid, oldid, newid) != oldid); + + return hash + newid; +} + +void ipv6_select_ident(struct frag_hdr *fhdr, struct in6_addr *addr) +{ + fhdr->identification = htonl(__ipv6_select_ident(addr)); +} + +int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) +{ + struct sk_buff *frag; + struct rt6_info *rt = (struct rt6_info*)skb_dst(skb); + struct ipv6_pinfo *np = skb->sk ? 
inet6_sk(skb->sk) : NULL; + struct ipv6hdr *tmp_hdr; + struct frag_hdr *fh; + unsigned int mtu, hlen, left, len; + __be32 frag_id = 0; + int ptr, offset = 0, err=0; + u8 *prevhdr, nexthdr = 0; + struct net *net = dev_net(skb_dst(skb)->dev); + + hlen = ip6_find_1stfragopt(skb, &prevhdr); + nexthdr = *prevhdr; + + mtu = ip6_skb_dst_mtu(skb); + + /* We must not fragment if the socket is set to force MTU discovery + * or if the skb it not generated by a local socket. + */ + if (!skb->local_df && skb->len > mtu) { + skb->dev = skb_dst(skb)->dev; + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_FRAGFAILS); + kfree_skb(skb); + return -EMSGSIZE; + } + + if (np && np->frag_size < mtu) { + if (np->frag_size) + mtu = np->frag_size; + } + mtu -= hlen + sizeof(struct frag_hdr); + + if (skb_has_frag_list(skb)) { + int first_len = skb_pagelen(skb); + struct sk_buff *frag2; + + if (first_len - hlen > mtu || + ((first_len - hlen) & 7) || + skb_cloned(skb)) + goto slow_path; + + skb_walk_frags(skb, frag) { + /* Correct geometry. */ + if (frag->len > mtu || + ((frag->len & 7) && frag->next) || + skb_headroom(frag) < hlen) + goto slow_path_clean; + + /* Partially cloned skb? */ + if (skb_shared(frag)) + goto slow_path_clean; + + BUG_ON(frag->sk); + if (skb->sk) { + frag->sk = skb->sk; + frag->destructor = sock_wfree; + } + skb->truesize -= frag->truesize; + } + + err = 0; + offset = 0; + frag = skb_shinfo(skb)->frag_list; + skb_frag_list_init(skb); + /* BUILD HEADER */ + + *prevhdr = NEXTHDR_FRAGMENT; + tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); + if (!tmp_hdr) { + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_FRAGFAILS); + return -ENOMEM; + } + + __skb_pull(skb, hlen); + fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr)); + __skb_push(skb, hlen); + skb_reset_network_header(skb); + memcpy(skb_network_header(skb), tmp_hdr, hlen); + + ipv6_select_ident(fh, &rt->rt6i_dst.addr); + fh->nexthdr = nexthdr; + fh->reserved = 0; + fh->frag_off = htons(IP6_MF); + frag_id = fh->identification; + + first_len = skb_pagelen(skb); + skb->data_len = first_len - skb_headlen(skb); + skb->len = first_len; + ipv6_hdr(skb)->payload_len = htons(first_len - + sizeof(struct ipv6hdr)); + + dst_hold(&rt->dst); + + for (;;) { + /* Prepare header of the next frame, + * before previous one went down. 
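+ * Each skb on the frag_list already holds its payload; only a Fragment header plus the unfragmentable headers saved in tmp_hdr are pushed in front of it and the fragment offset filled in.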
*/ + if (frag) { + frag->ip_summed = CHECKSUM_NONE; + skb_reset_transport_header(frag); + fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr)); + __skb_push(frag, hlen); + skb_reset_network_header(frag); + memcpy(skb_network_header(frag), tmp_hdr, + hlen); + offset += skb->len - hlen - sizeof(struct frag_hdr); + fh->nexthdr = nexthdr; + fh->reserved = 0; + fh->frag_off = htons(offset); + if (frag->next != NULL) + fh->frag_off |= htons(IP6_MF); + fh->identification = frag_id; + ipv6_hdr(frag)->payload_len = + htons(frag->len - + sizeof(struct ipv6hdr)); + ip6_copy_metadata(frag, skb); + } + + err = output(skb); + if(!err) + IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), + IPSTATS_MIB_FRAGCREATES); + + if (err || !frag) + break; + + skb = frag; + frag = skb->next; + skb->next = NULL; + } + + kfree(tmp_hdr); + + if (err == 0) { + IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), + IPSTATS_MIB_FRAGOKS); + dst_release(&rt->dst); + return 0; + } + + while (frag) { + skb = frag->next; + kfree_skb(frag); + frag = skb; + } + + IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), + IPSTATS_MIB_FRAGFAILS); + dst_release(&rt->dst); + return err; + +slow_path_clean: + skb_walk_frags(skb, frag2) { + if (frag2 == frag) + break; + frag2->sk = NULL; + frag2->destructor = NULL; + skb->truesize += frag2->truesize; + } + } + +slow_path: + left = skb->len - hlen; /* Space per frame */ + ptr = hlen; /* Where to start from */ + + /* + * Fragment the datagram. + */ + + *prevhdr = NEXTHDR_FRAGMENT; + + /* + * Keep copying data until we run out. + */ + while(left > 0) { + len = left; + /* IF: it doesn't fit, use 'mtu' - the data space left */ + if (len > mtu) + len = mtu; + /* IF: we are not sending up to and including the packet end + then align the next start on an eight byte boundary */ + if (len < left) { + len &= ~7; + } + /* + * Allocate buffer. + */ + + if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) { + NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n"); + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_FRAGFAILS); + err = -ENOMEM; + goto fail; + } + + /* + * Set up data on packet + */ + + ip6_copy_metadata(frag, skb); + skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev)); + skb_put(frag, len + hlen + sizeof(struct frag_hdr)); + skb_reset_network_header(frag); + fh = (struct frag_hdr *)(skb_network_header(frag) + hlen); + frag->transport_header = (frag->network_header + hlen + + sizeof(struct frag_hdr)); + + /* + * Charge the memory for the fragment to any owner + * it might possess + */ + if (skb->sk) + skb_set_owner_w(frag, skb->sk); + + /* + * Copy the packet header into the new buffer. + */ + skb_copy_from_linear_data(skb, skb_network_header(frag), hlen); + + /* + * Build fragment header. + */ + fh->nexthdr = nexthdr; + fh->reserved = 0; + if (!frag_id) { + ipv6_select_ident(fh, &rt->rt6i_dst.addr); + frag_id = fh->identification; + } else + fh->identification = frag_id; + + /* + * Copy a block of the IP datagram. + */ + if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len)) + BUG(); + left -= len; + + fh->frag_off = htons(offset); + if (left > 0) + fh->frag_off |= htons(IP6_MF); + ipv6_hdr(frag)->payload_len = htons(frag->len - + sizeof(struct ipv6hdr)); + + ptr += len; + offset += len; + + /* + * Put this fragment into the sending queue. 
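+ * output() is the callback supplied by the caller (ip6_finish_output2 on the normal output path); an error here aborts the remaining fragments via the fail label.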
+ */ + err = output(frag); + if (err) + goto fail; + + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_FRAGCREATES); + } + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_FRAGOKS); + kfree_skb(skb); + return err; + +fail: + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_FRAGFAILS); + kfree_skb(skb); + return err; +} + +static inline int ip6_rt_check(const struct rt6key *rt_key, + const struct in6_addr *fl_addr, + const struct in6_addr *addr_cache) +{ + return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) && + (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)); +} + +static struct dst_entry *ip6_sk_dst_check(struct sock *sk, + struct dst_entry *dst, + const struct flowi6 *fl6) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct rt6_info *rt = (struct rt6_info *)dst; + + if (!dst) + goto out; + + /* Yes, checking route validity in not connected + * case is not very simple. Take into account, + * that we do not support routing by source, TOS, + * and MSG_DONTROUTE --ANK (980726) + * + * 1. ip6_rt_check(): If route was host route, + * check that cached destination is current. + * If it is network route, we still may + * check its validity using saved pointer + * to the last used address: daddr_cache. + * We do not want to save whole address now, + * (because main consumer of this service + * is tcp, which has not this problem), + * so that the last trick works only on connected + * sockets. + * 2. oif also should be the same. + */ + if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || +#ifdef CONFIG_IPV6_SUBTREES + ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || +#endif + (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) { + dst_release(dst); + dst = NULL; + } + +out: + return dst; +} + +static int ip6_dst_lookup_tail(struct sock *sk, + struct dst_entry **dst, struct flowi6 *fl6) +{ + struct net *net = sock_net(sk); +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + struct neighbour *n; +#endif + int err; + + if (*dst == NULL) + *dst = ip6_route_output(net, sk, fl6); + + if ((err = (*dst)->error)) + goto out_err_release; + + if (ipv6_addr_any(&fl6->saddr)) { + struct rt6_info *rt = (struct rt6_info *) *dst; + err = ip6_route_get_saddr(net, rt, &fl6->daddr, + sk ? 
inet6_sk(sk)->srcprefs : 0, + &fl6->saddr); + if (err) + goto out_err_release; + } + +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + /* + * Here if the dst entry we've looked up + * has a neighbour entry that is in the INCOMPLETE + * state and the src address from the flow is + * marked as OPTIMISTIC, we release the found + * dst entry and replace it instead with the + * dst entry of the nexthop router + */ + rcu_read_lock(); + n = dst_get_neighbour(*dst); + if (n && !(n->nud_state & NUD_VALID)) { + struct inet6_ifaddr *ifp; + struct flowi6 fl_gw6; + int redirect; + + rcu_read_unlock(); + ifp = ipv6_get_ifaddr(net, &fl6->saddr, + (*dst)->dev, 1); + + redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC); + if (ifp) + in6_ifa_put(ifp); + + if (redirect) { + /* + * We need to get the dst entry for the + * default router instead + */ + dst_release(*dst); + memcpy(&fl_gw6, fl6, sizeof(struct flowi6)); + memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr)); + *dst = ip6_route_output(net, sk, &fl_gw6); + if ((err = (*dst)->error)) + goto out_err_release; + } + } else { + rcu_read_unlock(); + } +#endif + + return 0; + +out_err_release: + if (err == -ENETUNREACH) + IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES); + dst_release(*dst); + *dst = NULL; + return err; +} + +/** + * ip6_dst_lookup - perform route lookup on flow + * @sk: socket which provides route info + * @dst: pointer to dst_entry * for result + * @fl6: flow to lookup + * + * This function performs a route lookup on the given flow. + * + * It returns zero on success, or a standard errno code on error. + */ +int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6) +{ + *dst = NULL; + return ip6_dst_lookup_tail(sk, dst, fl6); +} +EXPORT_SYMBOL_GPL(ip6_dst_lookup); + +/** + * ip6_dst_lookup_flow - perform route lookup on flow with ipsec + * @sk: socket which provides route info + * @fl6: flow to lookup + * @final_dst: final destination address for ipsec lookup + * @can_sleep: we are in a sleepable context + * + * This function performs a route lookup on the given flow. + * + * It returns a valid dst pointer on success, or a pointer encoded + * error code. + */ +struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, + const struct in6_addr *final_dst, + bool can_sleep) +{ + struct dst_entry *dst = NULL; + int err; + + err = ip6_dst_lookup_tail(sk, &dst, fl6); + if (err) + return ERR_PTR(err); + if (final_dst) + ipv6_addr_copy(&fl6->daddr, final_dst); + if (can_sleep) + fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP; + + return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); +} +EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); + +/** + * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow + * @sk: socket which provides the dst cache and route info + * @fl6: flow to lookup + * @final_dst: final destination address for ipsec lookup + * @can_sleep: we are in a sleepable context + * + * This function performs a route lookup on the given flow with the + * possibility of using the cached route in the socket if it is valid. + * It will take the socket dst lock when operating on the dst cache. + * As a result, this function can only be used in process context. + * + * It returns a valid dst pointer on success, or a pointer encoded + * error code. 
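+ * Callers should test the result with IS_ERR()/PTR_ERR() rather than comparing against NULL.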
+ */ +struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, + const struct in6_addr *final_dst, + bool can_sleep) +{ + struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie); + int err; + + dst = ip6_sk_dst_check(sk, dst, fl6); + + err = ip6_dst_lookup_tail(sk, &dst, fl6); + if (err) + return ERR_PTR(err); + if (final_dst) + ipv6_addr_copy(&fl6->daddr, final_dst); + if (can_sleep) + fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP; + + return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); +} +EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); + +static inline int ip6_ufo_append_data(struct sock *sk, + int getfrag(void *from, char *to, int offset, int len, + int odd, struct sk_buff *skb), + void *from, int length, int hh_len, int fragheaderlen, + int transhdrlen, int mtu,unsigned int flags, + struct rt6_info *rt) + +{ + struct sk_buff *skb; + int err; + + /* There is support for UDP large send offload by network + * device, so create one single skb packet containing complete + * udp datagram + */ + if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { + skb = sock_alloc_send_skb(sk, + hh_len + fragheaderlen + transhdrlen + 20, + (flags & MSG_DONTWAIT), &err); + if (skb == NULL) + return -ENOMEM; + + /* reserve space for Hardware header */ + skb_reserve(skb, hh_len); + + /* create space for UDP/IP header */ + skb_put(skb,fragheaderlen + transhdrlen); + + /* initialize network header pointer */ + skb_reset_network_header(skb); + + /* initialize protocol header pointer */ + skb->transport_header = skb->network_header + fragheaderlen; + + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum = 0; + } + + err = skb_append_datato_frags(sk,skb, getfrag, from, + (length - transhdrlen)); + if (!err) { + struct frag_hdr fhdr; + + /* Specify the length of each IPv6 datagram fragment. + * It has to be a multiple of 8. + */ + skb_shinfo(skb)->gso_size = (mtu - fragheaderlen - + sizeof(struct frag_hdr)) & ~7; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + ipv6_select_ident(&fhdr, &rt->rt6i_dst.addr); + skb_shinfo(skb)->ip6_frag_id = fhdr.identification; + __skb_queue_tail(&sk->sk_write_queue, skb); + + return 0; + } + /* There is not enough support do UPD LSO, + * so follow normal path + */ + kfree_skb(skb); + + return err; +} + +static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src, + gfp_t gfp) +{ + return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; +} + +static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src, + gfp_t gfp) +{ + return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; +} + +static void ip6_append_data_mtu(int *mtu, + int *maxfraglen, + unsigned int fragheaderlen, + struct sk_buff *skb, + struct rt6_info *rt) +{ + if (!(rt->dst.flags & DST_XFRM_TUNNEL)) { + if (skb == NULL) { + /* first fragment, reserve header_len */ + *mtu = *mtu - rt->dst.header_len; + + } else { + /* + * this fragment is not first, the headers + * space is regarded as data space. 
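+ * From the second fragment onwards the MTU is therefore taken straight from the path dst below instead of being reduced by rt->dst.header_len.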
+ */ + *mtu = dst_mtu(rt->dst.path); + } + *maxfraglen = ((*mtu - fragheaderlen) & ~7) + + fragheaderlen - sizeof(struct frag_hdr); + } +} + +int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, + int offset, int len, int odd, struct sk_buff *skb), + void *from, int length, int transhdrlen, + int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6, + struct rt6_info *rt, unsigned int flags, int dontfrag) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet_cork *cork; + struct sk_buff *skb, *skb_prev = NULL; + unsigned int maxfraglen, fragheaderlen; + int exthdrlen; + int hh_len; + int mtu; + int copy; + int err; + int offset = 0; + int csummode = CHECKSUM_NONE; + __u8 tx_flags = 0; + + if (flags&MSG_PROBE) + return 0; + cork = &inet->cork.base; + if (skb_queue_empty(&sk->sk_write_queue)) { + /* + * setup for corking + */ + if (opt) { + if (WARN_ON(np->cork.opt)) + return -EINVAL; + + np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation); + if (unlikely(np->cork.opt == NULL)) + return -ENOBUFS; + + np->cork.opt->tot_len = opt->tot_len; + np->cork.opt->opt_flen = opt->opt_flen; + np->cork.opt->opt_nflen = opt->opt_nflen; + + np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt, + sk->sk_allocation); + if (opt->dst0opt && !np->cork.opt->dst0opt) + return -ENOBUFS; + + np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt, + sk->sk_allocation); + if (opt->dst1opt && !np->cork.opt->dst1opt) + return -ENOBUFS; + + np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt, + sk->sk_allocation); + if (opt->hopopt && !np->cork.opt->hopopt) + return -ENOBUFS; + + np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt, + sk->sk_allocation); + if (opt->srcrt && !np->cork.opt->srcrt) + return -ENOBUFS; + + /* need source address above miyazawa*/ + } + dst_hold(&rt->dst); + cork->dst = &rt->dst; + inet->cork.fl.u.ip6 = *fl6; + np->cork.hop_limit = hlimit; + np->cork.tclass = tclass; + if (rt->dst.flags & DST_XFRM_TUNNEL) + mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ? + rt->dst.dev->mtu : dst_mtu(&rt->dst); + else + mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ? + rt->dst.dev->mtu : dst_mtu(rt->dst.path); + if (np->frag_size < mtu) { + if (np->frag_size) + mtu = np->frag_size; + } + cork->fragsize = mtu; + if (dst_allfrag(rt->dst.path)) + cork->flags |= IPCORK_ALLFRAG; + cork->length = 0; + sk->sk_sndmsg_page = NULL; + sk->sk_sndmsg_off = 0; + exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) - + rt->rt6i_nfheader_len; + length += exthdrlen; + transhdrlen += exthdrlen; + } else { + rt = (struct rt6_info *)cork->dst; + fl6 = &inet->cork.fl.u.ip6; + opt = np->cork.opt; + transhdrlen = 0; + exthdrlen = 0; + mtu = cork->fragsize; + } + + hh_len = LL_RESERVED_SPACE(rt->dst.dev); + + fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + + (opt ? opt->opt_nflen : 0); + maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr); + + if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) { + if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) { + ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen); + return -EMSGSIZE; + } + } + + /* For UDP, check if TX timestamp is enabled */ + if (sk->sk_type == SOCK_DGRAM) { + err = sock_tx_timestamp(sk, &tx_flags); + if (err) + goto error; + } + + /* + * Let's try using as much space as possible. + * Use MTU if total length of the message fits into the MTU. + * Otherwise, we need to reserve fragment header and + * fragment alignment (= 8-15 octects, in total). 
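+ * (The Fragment header itself is 8 octets and every non-final fragment must carry a multiple of 8 octets of payload, hence the extra 0-7 octets of alignment.)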
+ * + * Note that we may need to "move" the data from the tail of + * of the buffer to the new fragment when we split + * the message. + * + * FIXME: It may be fragmented into multiple chunks + * at once if non-fragmentable extension headers + * are too large. + * --yoshfuji + */ + + cork->length += length; + if (length > mtu) { + int proto = sk->sk_protocol; + if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){ + ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen); + return -EMSGSIZE; + } + + if (proto == IPPROTO_UDP && + (rt->dst.dev->features & NETIF_F_UFO)) { + + err = ip6_ufo_append_data(sk, getfrag, from, length, + hh_len, fragheaderlen, + transhdrlen, mtu, flags, rt); + if (err) + goto error; + return 0; + } + } + + if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) + goto alloc_new_skb; + + while (length > 0) { + /* Check if the remaining data fits into current packet. */ + copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len; + if (copy < length) + copy = maxfraglen - skb->len; + + if (copy <= 0) { + char *data; + unsigned int datalen; + unsigned int fraglen; + unsigned int fraggap; + unsigned int alloclen; +alloc_new_skb: + /* There's no room in the current skb */ + if (skb) + fraggap = skb->len - maxfraglen; + else + fraggap = 0; + /* update mtu and maxfraglen if necessary */ + if (skb == NULL || skb_prev == NULL) + ip6_append_data_mtu(&mtu, &maxfraglen, + fragheaderlen, skb, rt); + + skb_prev = skb; + + /* + * If remaining data exceeds the mtu, + * we know we need more fragment(s). + */ + datalen = length + fraggap; + + if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen) + datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; + if ((flags & MSG_MORE) && + !(rt->dst.dev->features&NETIF_F_SG)) + alloclen = mtu; + else + alloclen = datalen + fragheaderlen; + + if (datalen != length + fraggap) { + /* + * this is not the last fragment, the trailer + * space is regarded as data space. + */ + datalen += rt->dst.trailer_len; + } + + alloclen += rt->dst.trailer_len; + fraglen = datalen + fragheaderlen; + + /* + * We just reserve space for fragment header. + * Note: this may be overallocation if the message + * (without MSG_MORE) fits into the MTU. + */ + alloclen += sizeof(struct frag_hdr); + + if (transhdrlen) { + skb = sock_alloc_send_skb(sk, + alloclen + hh_len, + (flags & MSG_DONTWAIT), &err); + } else { + skb = NULL; + if (atomic_read(&sk->sk_wmem_alloc) <= + 2 * sk->sk_sndbuf) + skb = sock_wmalloc(sk, + alloclen + hh_len, 1, + sk->sk_allocation); + if (unlikely(skb == NULL)) + err = -ENOBUFS; + else { + /* Only the initial fragment + * is time stamped. 
+ */ + tx_flags = 0; + } + } + if (skb == NULL) + goto error; + /* + * Fill in the control structures + */ + skb->ip_summed = csummode; + skb->csum = 0; + /* reserve for fragmentation */ + skb_reserve(skb, hh_len+sizeof(struct frag_hdr)); + + if (sk->sk_type == SOCK_DGRAM) + skb_shinfo(skb)->tx_flags = tx_flags; + + /* + * Find where to start putting bytes + */ + data = skb_put(skb, fraglen); + skb_set_network_header(skb, exthdrlen); + data += fragheaderlen; + skb->transport_header = (skb->network_header + + fragheaderlen); + if (fraggap) { + skb->csum = skb_copy_and_csum_bits( + skb_prev, maxfraglen, + data + transhdrlen, fraggap, 0); + skb_prev->csum = csum_sub(skb_prev->csum, + skb->csum); + data += fraggap; + pskb_trim_unique(skb_prev, maxfraglen); + } + copy = datalen - transhdrlen - fraggap; + if (copy < 0) { + err = -EINVAL; + kfree_skb(skb); + goto error; + } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { + err = -EFAULT; + kfree_skb(skb); + goto error; + } + + offset += copy; + length -= datalen - fraggap; + transhdrlen = 0; + exthdrlen = 0; + csummode = CHECKSUM_NONE; + + /* + * Put the packet on the pending queue + */ + __skb_queue_tail(&sk->sk_write_queue, skb); + continue; + } + + if (copy > length) + copy = length; + + if (!(rt->dst.dev->features&NETIF_F_SG)) { + unsigned int off; + + off = skb->len; + if (getfrag(from, skb_put(skb, copy), + offset, copy, off, skb) < 0) { + __skb_trim(skb, off); + err = -EFAULT; + goto error; + } + } else { + int i = skb_shinfo(skb)->nr_frags; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; + struct page *page = sk->sk_sndmsg_page; + int off = sk->sk_sndmsg_off; + unsigned int left; + + if (page && (left = PAGE_SIZE - off) > 0) { + if (copy >= left) + copy = left; + if (page != frag->page) { + if (i == MAX_SKB_FRAGS) { + err = -EMSGSIZE; + goto error; + } + get_page(page); + skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); + frag = &skb_shinfo(skb)->frags[i]; + } + } else if(i < MAX_SKB_FRAGS) { + if (copy > PAGE_SIZE) + copy = PAGE_SIZE; + page = alloc_pages(sk->sk_allocation, 0); + if (page == NULL) { + err = -ENOMEM; + goto error; + } + sk->sk_sndmsg_page = page; + sk->sk_sndmsg_off = 0; + + skb_fill_page_desc(skb, i, page, 0, 0); + frag = &skb_shinfo(skb)->frags[i]; + } else { + err = -EMSGSIZE; + goto error; + } + if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) { + err = -EFAULT; + goto error; + } + sk->sk_sndmsg_off += copy; + frag->size += copy; + skb->len += copy; + skb->data_len += copy; + skb->truesize += copy; + atomic_add(copy, &sk->sk_wmem_alloc); + } + offset += copy; + length -= copy; + } + return 0; +error: + cork->length -= length; + IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); + return err; +} + +static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np) +{ + if (np->cork.opt) { + kfree(np->cork.opt->dst0opt); + kfree(np->cork.opt->dst1opt); + kfree(np->cork.opt->hopopt); + kfree(np->cork.opt->srcrt); + kfree(np->cork.opt); + np->cork.opt = NULL; + } + + if (inet->cork.base.dst) { + dst_release(inet->cork.base.dst); + inet->cork.base.dst = NULL; + inet->cork.base.flags &= ~IPCORK_ALLFRAG; + } + memset(&inet->cork.fl, 0, sizeof(inet->cork.fl)); +} + +int ip6_push_pending_frames(struct sock *sk) +{ + struct sk_buff *skb, *tmp_skb; + struct sk_buff **tail_skb; + struct in6_addr final_dst_buf, *final_dst = &final_dst_buf; + struct inet_sock *inet = inet_sk(sk); + struct 
ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); + struct ipv6hdr *hdr; + struct ipv6_txoptions *opt = np->cork.opt; + struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst; + struct flowi6 *fl6 = &inet->cork.fl.u.ip6; + unsigned char proto = fl6->flowi6_proto; + int err = 0; + + if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) + goto out; + tail_skb = &(skb_shinfo(skb)->frag_list); + + /* move skb->data to ip header from ext header */ + if (skb->data < skb_network_header(skb)) + __skb_pull(skb, skb_network_offset(skb)); + while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { + __skb_pull(tmp_skb, skb_network_header_len(skb)); + *tail_skb = tmp_skb; + tail_skb = &(tmp_skb->next); + skb->len += tmp_skb->len; + skb->data_len += tmp_skb->len; + skb->truesize += tmp_skb->truesize; + tmp_skb->destructor = NULL; + tmp_skb->sk = NULL; + } + + /* Allow local fragmentation. */ + if (np->pmtudisc < IPV6_PMTUDISC_DO) + skb->local_df = 1; + + ipv6_addr_copy(final_dst, &fl6->daddr); + __skb_pull(skb, skb_network_header_len(skb)); + if (opt && opt->opt_flen) + ipv6_push_frag_opts(skb, opt, &proto); + if (opt && opt->opt_nflen) + ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst); + + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + hdr = ipv6_hdr(skb); + + *(__be32*)hdr = fl6->flowlabel | + htonl(0x60000000 | ((int)np->cork.tclass << 20)); + + hdr->hop_limit = np->cork.hop_limit; + hdr->nexthdr = proto; + ipv6_addr_copy(&hdr->saddr, &fl6->saddr); + ipv6_addr_copy(&hdr->daddr, final_dst); + + skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; + + skb_dst_set(skb, dst_clone(&rt->dst)); + IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); + if (proto == IPPROTO_ICMPV6) { + struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); + + ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type); + ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS); + } + + err = ip6_local_out(skb); + if (err) { + if (err > 0) + err = net_xmit_errno(err); + if (err) + goto error; + } + +out: + ip6_cork_release(inet, np); + return err; +error: + IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); + goto out; +} + +void ip6_flush_pending_frames(struct sock *sk) +{ + struct sk_buff *skb; + + while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) { + if (skb_dst(skb)) + IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + } + + ip6_cork_release(inet_sk(sk), inet6_sk(sk)); +} diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c new file mode 100644 index 00000000..848e494f --- /dev/null +++ b/net/ipv6/ip6_tunnel.c @@ -0,0 +1,1582 @@ +/* + * IPv6 tunneling device + * Linux INET6 implementation + * + * Authors: + * Ville Nuorvala + * Yasuyuki Kozakai + * + * Based on: + * linux/net/ipv6/sit.c and linux/net/ipv4/ipip.c + * + * RFC 2473 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Ville Nuorvala"); +MODULE_DESCRIPTION("IPv6 tunneling device"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETDEV("ip6tnl0"); + +#ifdef IP6_TNL_DEBUG +#define IP6_TNL_TRACE(x...) printk(KERN_DEBUG "%s:" x "\n", __func__) +#else +#define IP6_TNL_TRACE(x...) do {;} while(0) +#endif + +#define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK) +#define IPV6_TCLASS_SHIFT 20 + +#define HASH_SIZE 32 + +#define HASH(addr) ((__force u32)((addr)->s6_addr32[0] ^ (addr)->s6_addr32[1] ^ \ + (addr)->s6_addr32[2] ^ (addr)->s6_addr32[3]) & \ + (HASH_SIZE - 1)) + +static int ip6_tnl_dev_init(struct net_device *dev); +static void ip6_tnl_dev_setup(struct net_device *dev); + +static int ip6_tnl_net_id __read_mostly; +struct ip6_tnl_net { + /* the IPv6 tunnel fallback device */ + struct net_device *fb_tnl_dev; + /* lists for storing tunnels in use */ + struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE]; + struct ip6_tnl __rcu *tnls_wc[1]; + struct ip6_tnl __rcu **tnls[2]; +}; + +/* often modified stats are per cpu, other are shared (netdev->stats) */ +struct pcpu_tstats { + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long tx_packets; + unsigned long tx_bytes; +}; + +static struct net_device_stats *ip6_get_stats(struct net_device *dev) +{ + struct pcpu_tstats sum = { 0 }; + int i; + + for_each_possible_cpu(i) { + const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); + + sum.rx_packets += tstats->rx_packets; + sum.rx_bytes += tstats->rx_bytes; + sum.tx_packets += tstats->tx_packets; + sum.tx_bytes += tstats->tx_bytes; + } + dev->stats.rx_packets = sum.rx_packets; + dev->stats.rx_bytes = sum.rx_bytes; + dev->stats.tx_packets = sum.tx_packets; + dev->stats.tx_bytes = sum.tx_bytes; + return &dev->stats; +} + +/* + * Locking : hash tables are protected by RCU and RTNL + */ + +static inline struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) +{ + struct dst_entry *dst = t->dst_cache; + + if (dst && dst->obsolete && + dst->ops->check(dst, t->dst_cookie) == NULL) { + t->dst_cache = NULL; + dst_release(dst); + return NULL; + } + + return dst; +} + +static inline void ip6_tnl_dst_reset(struct ip6_tnl *t) +{ + dst_release(t->dst_cache); + t->dst_cache = NULL; +} + +static inline void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) +{ + struct rt6_info *rt = (struct rt6_info *) dst; + t->dst_cookie = rt->rt6i_node ? 
rt->rt6i_node->fn_sernum : 0; + dst_release(t->dst_cache); + t->dst_cache = dst; +} + +/** + * ip6_tnl_lookup - fetch tunnel matching the end-point addresses + * @remote: the address of the tunnel exit-point + * @local: the address of the tunnel entry-point + * + * Return: + * tunnel matching given end-points if found, + * else fallback tunnel if its device is up, + * else %NULL + **/ + +#define for_each_ip6_tunnel_rcu(start) \ + for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) + +static struct ip6_tnl * +ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_addr *local) +{ + unsigned int h0 = HASH(remote); + unsigned int h1 = HASH(local); + struct ip6_tnl *t; + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + + for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[h0 ^ h1]) { + if (ipv6_addr_equal(local, &t->parms.laddr) && + ipv6_addr_equal(remote, &t->parms.raddr) && + (t->dev->flags & IFF_UP)) + return t; + } + t = rcu_dereference(ip6n->tnls_wc[0]); + if (t && (t->dev->flags & IFF_UP)) + return t; + + return NULL; +} + +/** + * ip6_tnl_bucket - get head of list matching given tunnel parameters + * @p: parameters containing tunnel end-points + * + * Description: + * ip6_tnl_bucket() returns the head of the list matching the + * &struct in6_addr entries laddr and raddr in @p. + * + * Return: head of IPv6 tunnel list + **/ + +static struct ip6_tnl __rcu ** +ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct ip6_tnl_parm *p) +{ + const struct in6_addr *remote = &p->raddr; + const struct in6_addr *local = &p->laddr; + unsigned h = 0; + int prio = 0; + + if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { + prio = 1; + h = HASH(remote) ^ HASH(local); + } + return &ip6n->tnls[prio][h]; +} + +/** + * ip6_tnl_link - add tunnel to hash table + * @t: tunnel to be added + **/ + +static void +ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) +{ + struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms); + + rcu_assign_pointer(t->next , rtnl_dereference(*tp)); + rcu_assign_pointer(*tp, t); +} + +/** + * ip6_tnl_unlink - remove tunnel from hash table + * @t: tunnel to be removed + **/ + +static void +ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) +{ + struct ip6_tnl __rcu **tp; + struct ip6_tnl *iter; + + for (tp = ip6_tnl_bucket(ip6n, &t->parms); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); + break; + } + } +} + +static void ip6_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + +/** + * ip6_tnl_create() - create a new tunnel + * @p: tunnel parameters + * @pt: pointer to new tunnel + * + * Description: + * Create tunnel matching given parameters. 
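+ * If @p->name is empty the "ip6tnl%d" template is used, so the core
+ * picks the first free index when the device is registered; the final
+ * device name is copied back into the tunnel parameters.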
+ * + * Return: + * created tunnel or NULL + **/ + +static struct ip6_tnl *ip6_tnl_create(struct net *net, struct ip6_tnl_parm *p) +{ + struct net_device *dev; + struct ip6_tnl *t; + char name[IFNAMSIZ]; + int err; + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + + if (p->name[0]) + strlcpy(name, p->name, IFNAMSIZ); + else + sprintf(name, "ip6tnl%%d"); + + dev = alloc_netdev(sizeof (*t), name, ip6_tnl_dev_setup); + if (dev == NULL) + goto failed; + + dev_net_set(dev, net); + + t = netdev_priv(dev); + t->parms = *p; + err = ip6_tnl_dev_init(dev); + if (err < 0) + goto failed_free; + + if ((err = register_netdevice(dev)) < 0) + goto failed_free; + + strcpy(t->parms.name, dev->name); + + dev_hold(dev); + ip6_tnl_link(ip6n, t); + return t; + +failed_free: + ip6_dev_free(dev); +failed: + return NULL; +} + +/** + * ip6_tnl_locate - find or create tunnel matching given parameters + * @p: tunnel parameters + * @create: != 0 if allowed to create new tunnel if no match found + * + * Description: + * ip6_tnl_locate() first tries to locate an existing tunnel + * based on @parms. If this is unsuccessful, but @create is set a new + * tunnel device is created and registered for use. + * + * Return: + * matching tunnel or NULL + **/ + +static struct ip6_tnl *ip6_tnl_locate(struct net *net, + struct ip6_tnl_parm *p, int create) +{ + const struct in6_addr *remote = &p->raddr; + const struct in6_addr *local = &p->laddr; + struct ip6_tnl __rcu **tp; + struct ip6_tnl *t; + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + + for (tp = ip6_tnl_bucket(ip6n, p); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) { + if (ipv6_addr_equal(local, &t->parms.laddr) && + ipv6_addr_equal(remote, &t->parms.raddr)) + return t; + } + if (!create) + return NULL; + return ip6_tnl_create(net, p); +} + +/** + * ip6_tnl_dev_uninit - tunnel device uninitializer + * @dev: the device to be destroyed + * + * Description: + * ip6_tnl_dev_uninit() removes tunnel from its list + **/ + +static void +ip6_tnl_dev_uninit(struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct net *net = dev_net(dev); + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + + if (dev == ip6n->fb_tnl_dev) + rcu_assign_pointer(ip6n->tnls_wc[0], NULL); + else + ip6_tnl_unlink(ip6n, t); + ip6_tnl_dst_reset(t); + dev_put(dev); +} + +/** + * parse_tvl_tnl_enc_lim - handle encapsulation limit option + * @skb: received socket buffer + * + * Return: + * 0 if none was found, + * else index to encapsulation limit + **/ + +static __u16 +parse_tlv_tnl_enc_lim(struct sk_buff *skb, __u8 * raw) +{ + const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) raw; + __u8 nexthdr = ipv6h->nexthdr; + __u16 off = sizeof (*ipv6h); + + while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { + __u16 optlen = 0; + struct ipv6_opt_hdr *hdr; + if (raw + off + sizeof (*hdr) > skb->data && + !pskb_may_pull(skb, raw - skb->data + off + sizeof (*hdr))) + break; + + hdr = (struct ipv6_opt_hdr *) (raw + off); + if (nexthdr == NEXTHDR_FRAGMENT) { + struct frag_hdr *frag_hdr = (struct frag_hdr *) hdr; + if (frag_hdr->frag_off) + break; + optlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) { + optlen = (hdr->hdrlen + 2) << 2; + } else { + optlen = ipv6_optlen(hdr); + } + if (nexthdr == NEXTHDR_DEST) { + __u16 i = off + 2; + while (1) { + struct ipv6_tlv_tnl_enc_lim *tel; + + /* No more room for encapsulation limit */ + if (i + sizeof (*tel) > off + optlen) + break; + + tel = (struct ipv6_tlv_tnl_enc_lim *) &raw[i]; + /* return index of 
option if found and valid */ + if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT && + tel->length == 1) + return i; + /* else jump to next option */ + if (tel->type) + i += tel->length + 2; + else + i++; + } + } + nexthdr = hdr->nexthdr; + off += optlen; + } + return 0; +} + +/** + * ip6_tnl_err - tunnel error handler + * + * Description: + * ip6_tnl_err() should handle errors in the tunnel according + * to the specifications in RFC 2473. + **/ + +static int +ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, + u8 *type, u8 *code, int *msg, __u32 *info, int offset) +{ + const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) skb->data; + struct ip6_tnl *t; + int rel_msg = 0; + u8 rel_type = ICMPV6_DEST_UNREACH; + u8 rel_code = ICMPV6_ADDR_UNREACH; + __u32 rel_info = 0; + __u16 len; + int err = -ENOENT; + + /* If the packet doesn't contain the original IPv6 header we are + in trouble since we might need the source address for further + processing of the error. */ + + rcu_read_lock(); + if ((t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->daddr, + &ipv6h->saddr)) == NULL) + goto out; + + if (t->parms.proto != ipproto && t->parms.proto != 0) + goto out; + + err = 0; + + switch (*type) { + __u32 teli; + struct ipv6_tlv_tnl_enc_lim *tel; + __u32 mtu; + case ICMPV6_DEST_UNREACH: + if (net_ratelimit()) + printk(KERN_WARNING + "%s: Path to destination invalid " + "or inactive!\n", t->parms.name); + rel_msg = 1; + break; + case ICMPV6_TIME_EXCEED: + if ((*code) == ICMPV6_EXC_HOPLIMIT) { + if (net_ratelimit()) + printk(KERN_WARNING + "%s: Too small hop limit or " + "routing loop in tunnel!\n", + t->parms.name); + rel_msg = 1; + } + break; + case ICMPV6_PARAMPROB: + teli = 0; + if ((*code) == ICMPV6_HDR_FIELD) + teli = parse_tlv_tnl_enc_lim(skb, skb->data); + + if (teli && teli == *info - 2) { + tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; + if (tel->encap_limit == 0) { + if (net_ratelimit()) + printk(KERN_WARNING + "%s: Too small encapsulation " + "limit or routing loop in " + "tunnel!\n", t->parms.name); + rel_msg = 1; + } + } else if (net_ratelimit()) { + printk(KERN_WARNING + "%s: Recipient unable to parse tunneled " + "packet!\n ", t->parms.name); + } + break; + case ICMPV6_PKT_TOOBIG: + mtu = *info - offset; + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + t->dev->mtu = mtu; + + if ((len = sizeof (*ipv6h) + ntohs(ipv6h->payload_len)) > mtu) { + rel_type = ICMPV6_PKT_TOOBIG; + rel_code = 0; + rel_info = mtu; + rel_msg = 1; + } + break; + } + + *type = rel_type; + *code = rel_code; + *info = rel_info; + *msg = rel_msg; + +out: + rcu_read_unlock(); + return err; +} + +static int +ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + int rel_msg = 0; + u8 rel_type = type; + u8 rel_code = code; + __u32 rel_info = ntohl(info); + int err; + struct sk_buff *skb2; + const struct iphdr *eiph; + struct rtable *rt; + struct flowi4 fl4; + + err = ip6_tnl_err(skb, IPPROTO_IPIP, opt, &rel_type, &rel_code, + &rel_msg, &rel_info, offset); + if (err < 0) + return err; + + if (rel_msg == 0) + return 0; + + switch (rel_type) { + case ICMPV6_DEST_UNREACH: + if (rel_code != ICMPV6_ADDR_UNREACH) + return 0; + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_HOST_UNREACH; + break; + case ICMPV6_PKT_TOOBIG: + if (rel_code != 0) + return 0; + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_FRAG_NEEDED; + break; + default: + return 0; + } + + if (!pskb_may_pull(skb, offset + sizeof(struct iphdr))) + return 0; + + skb2 = skb_clone(skb, 
GFP_ATOMIC); + if (!skb2) + return 0; + + skb_dst_drop(skb2); + + skb_pull(skb2, offset); + skb_reset_network_header(skb2); + eiph = ip_hdr(skb2); + + /* Try to guess incoming interface */ + rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, + eiph->saddr, 0, + 0, 0, + IPPROTO_IPIP, RT_TOS(eiph->tos), 0); + if (IS_ERR(rt)) + goto out; + + skb2->dev = rt->dst.dev; + + /* route "incoming" packet */ + if (rt->rt_flags & RTCF_LOCAL) { + ip_rt_put(rt); + rt = NULL; + rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, + eiph->daddr, eiph->saddr, + 0, 0, + IPPROTO_IPIP, + RT_TOS(eiph->tos), 0); + if (IS_ERR(rt) || + rt->dst.dev->type != ARPHRD_TUNNEL) { + if (!IS_ERR(rt)) + ip_rt_put(rt); + goto out; + } + skb_dst_set(skb2, &rt->dst); + } else { + ip_rt_put(rt); + if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, + skb2->dev) || + skb_dst(skb2)->dev->type != ARPHRD_TUNNEL) + goto out; + } + + /* change mtu on this route */ + if (rel_type == ICMP_DEST_UNREACH && rel_code == ICMP_FRAG_NEEDED) { + if (rel_info > dst_mtu(skb_dst(skb2))) + goto out; + + skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), rel_info); + } + + icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); + +out: + kfree_skb(skb2); + return 0; +} + +static int +ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + int rel_msg = 0; + u8 rel_type = type; + u8 rel_code = code; + __u32 rel_info = ntohl(info); + int err; + + err = ip6_tnl_err(skb, IPPROTO_IPV6, opt, &rel_type, &rel_code, + &rel_msg, &rel_info, offset); + if (err < 0) + return err; + + if (rel_msg && pskb_may_pull(skb, offset + sizeof(struct ipv6hdr))) { + struct rt6_info *rt; + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + + if (!skb2) + return 0; + + skb_dst_drop(skb2); + skb_pull(skb2, offset); + skb_reset_network_header(skb2); + + /* Try to guess incoming interface */ + rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, + NULL, 0, 0); + + if (rt && rt->rt6i_dev) + skb2->dev = rt->rt6i_dev; + + icmpv6_send(skb2, rel_type, rel_code, rel_info); + + if (rt) + dst_release(&rt->dst); + + kfree_skb(skb2); + } + + return 0; +} + +static void ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, + const struct ipv6hdr *ipv6h, + struct sk_buff *skb) +{ + __u8 dsfield = ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK; + + if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) + ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, dsfield); + + if (INET_ECN_is_ce(dsfield)) + IP_ECN_set_ce(ip_hdr(skb)); +} + +static void ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, + const struct ipv6hdr *ipv6h, + struct sk_buff *skb) +{ + if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) + ipv6_copy_dscp(ipv6_get_dsfield(ipv6h), ipv6_hdr(skb)); + + if (INET_ECN_is_ce(ipv6_get_dsfield(ipv6h))) + IP6_ECN_set_ce(ipv6_hdr(skb)); +} + +/* called with rcu_read_lock() */ +static inline int ip6_tnl_rcv_ctl(struct ip6_tnl *t) +{ + struct ip6_tnl_parm *p = &t->parms; + int ret = 0; + struct net *net = dev_net(t->dev); + + if (p->flags & IP6_TNL_F_CAP_RCV) { + struct net_device *ldev = NULL; + + if (p->link) + ldev = dev_get_by_index_rcu(net, p->link); + + if ((ipv6_addr_is_multicast(&p->laddr) || + likely(ipv6_chk_addr(net, &p->laddr, ldev, 0))) && + likely(!ipv6_chk_addr(net, &p->raddr, NULL, 0))) + ret = 1; + + } + return ret; +} + +/** + * ip6_tnl_rcv - decapsulate IPv6 packet and retransmit it locally + * @skb: received socket buffer + * @protocol: ethernet protocol ID + * @dscp_ecn_decapsulate: the function to decapsulate DSCP 
code and ECN + * + * Return: 0 + **/ + +static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, + __u8 ipproto, + void (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, + const struct ipv6hdr *ipv6h, + struct sk_buff *skb)) +{ + struct ip6_tnl *t; + const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + + rcu_read_lock(); + + if ((t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, + &ipv6h->daddr)) != NULL) { + struct pcpu_tstats *tstats; + + if (t->parms.proto != ipproto && t->parms.proto != 0) { + rcu_read_unlock(); + goto discard; + } + + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + rcu_read_unlock(); + goto discard; + } + + if (!ip6_tnl_rcv_ctl(t)) { + t->dev->stats.rx_dropped++; + rcu_read_unlock(); + goto discard; + } + secpath_reset(skb); + skb->mac_header = skb->network_header; + skb_reset_network_header(skb); + skb->protocol = htons(protocol); + skb->pkt_type = PACKET_HOST; + memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); + + tstats = this_cpu_ptr(t->dev->tstats); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + + __skb_tunnel_rx(skb, t->dev); + + dscp_ecn_decapsulate(t, ipv6h, skb); + + netif_rx(skb); + + rcu_read_unlock(); + return 0; + } + rcu_read_unlock(); + return 1; + +discard: + kfree_skb(skb); + return 0; +} + +static int ip4ip6_rcv(struct sk_buff *skb) +{ + return ip6_tnl_rcv(skb, ETH_P_IP, IPPROTO_IPIP, + ip4ip6_dscp_ecn_decapsulate); +} + +static int ip6ip6_rcv(struct sk_buff *skb) +{ + return ip6_tnl_rcv(skb, ETH_P_IPV6, IPPROTO_IPV6, + ip6ip6_dscp_ecn_decapsulate); +} + +struct ipv6_tel_txoption { + struct ipv6_txoptions ops; + __u8 dst_opt[8]; +}; + +static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit) +{ + memset(opt, 0, sizeof(struct ipv6_tel_txoption)); + + opt->dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT; + opt->dst_opt[3] = 1; + opt->dst_opt[4] = encap_limit; + opt->dst_opt[5] = IPV6_TLV_PADN; + opt->dst_opt[6] = 1; + + opt->ops.dst0opt = (struct ipv6_opt_hdr *) opt->dst_opt; + opt->ops.opt_nflen = 8; +} + +/** + * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own + * @t: the outgoing tunnel device + * @hdr: IPv6 header from the incoming packet + * + * Description: + * Avoid trivial tunneling loop by checking that tunnel exit-point + * doesn't match source of incoming packet. + * + * Return: + * 1 if conflict, + * 0 else + **/ + +static inline int +ip6_tnl_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr) +{ + return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); +} + +static inline int ip6_tnl_xmit_ctl(struct ip6_tnl *t) +{ + struct ip6_tnl_parm *p = &t->parms; + int ret = 0; + struct net *net = dev_net(t->dev); + + if (p->flags & IP6_TNL_F_CAP_XMIT) { + struct net_device *ldev = NULL; + + rcu_read_lock(); + if (p->link) + ldev = dev_get_by_index_rcu(net, p->link); + + if (unlikely(!ipv6_chk_addr(net, &p->laddr, ldev, 0))) + printk(KERN_WARNING + "%s xmit: Local address not yet configured!\n", + p->name); + else if (!ipv6_addr_is_multicast(&p->raddr) && + unlikely(ipv6_chk_addr(net, &p->raddr, NULL, 0))) + printk(KERN_WARNING + "%s xmit: Routing loop! 
" + "Remote address found on this node!\n", + p->name); + else + ret = 1; + rcu_read_unlock(); + } + return ret; +} +/** + * ip6_tnl_xmit2 - encapsulate packet and send + * @skb: the outgoing socket buffer + * @dev: the outgoing tunnel device + * @dsfield: dscp code for outer header + * @fl: flow of tunneled packet + * @encap_limit: encapsulation limit + * @pmtu: Path MTU is stored if packet is too big + * + * Description: + * Build new header and do some sanity checks on the packet before sending + * it. + * + * Return: + * 0 on success + * -1 fail + * %-EMSGSIZE message too big. return mtu in this case. + **/ + +static int ip6_tnl_xmit2(struct sk_buff *skb, + struct net_device *dev, + __u8 dsfield, + struct flowi6 *fl6, + int encap_limit, + __u32 *pmtu) +{ + struct net *net = dev_net(dev); + struct ip6_tnl *t = netdev_priv(dev); + struct net_device_stats *stats = &t->dev->stats; + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct ipv6_tel_txoption opt; + struct dst_entry *dst; + struct net_device *tdev; + int mtu; + unsigned int max_headroom = sizeof(struct ipv6hdr); + u8 proto; + int err = -1; + int pkt_len; + + if ((dst = ip6_tnl_dst_check(t)) != NULL) + dst_hold(dst); + else { + dst = ip6_route_output(net, NULL, fl6); + + if (dst->error) + goto tx_err_link_failure; + dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; + goto tx_err_link_failure; + } + } + + tdev = dst->dev; + + if (tdev == dev) { + stats->collisions++; + if (net_ratelimit()) + printk(KERN_WARNING + "%s: Local routing loop detected!\n", + t->parms.name); + goto tx_err_dst_release; + } + mtu = dst_mtu(dst) - sizeof (*ipv6h); + if (encap_limit >= 0) { + max_headroom += 8; + mtu -= 8; + } + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + if (skb->len > mtu) { + *pmtu = mtu; + err = -EMSGSIZE; + goto tx_err_dst_release; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. 
+ */ + max_headroom += LL_RESERVED_SPACE(tdev); + + if (skb_headroom(skb) < max_headroom || skb_shared(skb) || + (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { + struct sk_buff *new_skb; + + if (!(new_skb = skb_realloc_headroom(skb, max_headroom))) + goto tx_err_dst_release; + + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); + kfree_skb(skb); + skb = new_skb; + } + skb_dst_drop(skb); + skb_dst_set(skb, dst_clone(dst)); + + skb->transport_header = skb->network_header; + + proto = fl6->flowi6_proto; + if (encap_limit >= 0) { + init_tel_txopt(&opt, encap_limit); + ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL); + } + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + ipv6h = ipv6_hdr(skb); + *(__be32*)ipv6h = fl6->flowlabel | htonl(0x60000000); + dsfield = INET_ECN_encapsulate(0, dsfield); + ipv6_change_dsfield(ipv6h, ~INET_ECN_MASK, dsfield); + ipv6h->hop_limit = t->parms.hop_limit; + ipv6h->nexthdr = proto; + ipv6_addr_copy(&ipv6h->saddr, &fl6->saddr); + ipv6_addr_copy(&ipv6h->daddr, &fl6->daddr); + nf_reset(skb); + pkt_len = skb->len; + err = ip6_local_out(skb); + + if (net_xmit_eval(err) == 0) { + struct pcpu_tstats *tstats = this_cpu_ptr(t->dev->tstats); + + tstats->tx_bytes += pkt_len; + tstats->tx_packets++; + } else { + stats->tx_errors++; + stats->tx_aborted_errors++; + } + ip6_tnl_dst_store(t, dst); + return 0; +tx_err_link_failure: + stats->tx_carrier_errors++; + dst_link_failure(skb); +tx_err_dst_release: + dst_release(dst); + return err; +} + +static inline int +ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + const struct iphdr *iph = ip_hdr(skb); + int encap_limit = -1; + struct flowi6 fl6; + __u8 dsfield; + __u32 mtu; + int err; + + if ((t->parms.proto != IPPROTO_IPIP && t->parms.proto != 0) || + !ip6_tnl_xmit_ctl(t)) + return -1; + + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + encap_limit = t->parms.encap_limit; + + memcpy(&fl6, &t->fl.u.ip6, sizeof (fl6)); + fl6.flowi6_proto = IPPROTO_IPIP; + + dsfield = ipv4_get_dsfield(iph); + + if ((t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)) + fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) + & IPV6_TCLASS_MASK; + + err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); + if (err != 0) { + /* XXX: send ICMP error even if DF is not set. 
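+ * When the tunnel path cannot take the packet, ICMP_FRAG_NEEDED is
+ * sent back to the inner IPv4 sender with the tunnel MTU, so its
+ * path MTU discovery can adapt to the encapsulation overhead.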
*/ + if (err == -EMSGSIZE) + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + return -1; + } + + return 0; +} + +static inline int +ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + int encap_limit = -1; + __u16 offset; + struct flowi6 fl6; + __u8 dsfield; + __u32 mtu; + int err; + + if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) || + !ip6_tnl_xmit_ctl(t) || ip6_tnl_addr_conflict(t, ipv6h)) + return -1; + + offset = parse_tlv_tnl_enc_lim(skb, skb_network_header(skb)); + if (offset > 0) { + struct ipv6_tlv_tnl_enc_lim *tel; + tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; + if (tel->encap_limit == 0) { + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_HDR_FIELD, offset + 2); + return -1; + } + encap_limit = tel->encap_limit - 1; + } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + encap_limit = t->parms.encap_limit; + + memcpy(&fl6, &t->fl.u.ip6, sizeof (fl6)); + fl6.flowi6_proto = IPPROTO_IPV6; + + dsfield = ipv6_get_dsfield(ipv6h); + if ((t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)) + fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK); + if ((t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)) + fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_FLOWLABEL_MASK); + + err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); + if (err != 0) { + if (err == -EMSGSIZE) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + return -1; + } + + return 0; +} + +static netdev_tx_t +ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct net_device_stats *stats = &t->dev->stats; + int ret; + + switch (skb->protocol) { + case htons(ETH_P_IP): + ret = ip4ip6_tnl_xmit(skb, dev); + break; + case htons(ETH_P_IPV6): + ret = ip6ip6_tnl_xmit(skb, dev); + break; + default: + goto tx_err; + } + + if (ret < 0) + goto tx_err; + + return NETDEV_TX_OK; + +tx_err: + stats->tx_errors++; + stats->tx_dropped++; + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static void ip6_tnl_set_cap(struct ip6_tnl *t) +{ + struct ip6_tnl_parm *p = &t->parms; + int ltype = ipv6_addr_type(&p->laddr); + int rtype = ipv6_addr_type(&p->raddr); + + p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV); + + if (ltype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && + rtype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && + !((ltype|rtype) & IPV6_ADDR_LOOPBACK) && + (!((ltype|rtype) & IPV6_ADDR_LINKLOCAL) || p->link)) { + if (ltype&IPV6_ADDR_UNICAST) + p->flags |= IP6_TNL_F_CAP_XMIT; + if (rtype&IPV6_ADDR_UNICAST) + p->flags |= IP6_TNL_F_CAP_RCV; + } +} + +static void ip6_tnl_link_config(struct ip6_tnl *t) +{ + struct net_device *dev = t->dev; + struct ip6_tnl_parm *p = &t->parms; + struct flowi6 *fl6 = &t->fl.u.ip6; + + memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); + memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); + + /* Set up flowi template */ + ipv6_addr_copy(&fl6->saddr, &p->laddr); + ipv6_addr_copy(&fl6->daddr, &p->raddr); + fl6->flowi6_oif = p->link; + fl6->flowlabel = 0; + + if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) + fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; + if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL)) + fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; + + ip6_tnl_set_cap(t); + + if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV) + dev->flags |= IFF_POINTOPOINT; + else + dev->flags &= ~IFF_POINTOPOINT; + + dev->iflink = p->link; + + if (p->flags & IP6_TNL_F_CAP_XMIT) { + int strict = 
(ipv6_addr_type(&p->raddr) & + (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); + + struct rt6_info *rt = rt6_lookup(dev_net(dev), + &p->raddr, &p->laddr, + p->link, strict); + + if (rt == NULL) + return; + + if (rt->rt6i_dev) { + dev->hard_header_len = rt->rt6i_dev->hard_header_len + + sizeof (struct ipv6hdr); + + dev->mtu = rt->rt6i_dev->mtu - sizeof (struct ipv6hdr); + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu-=8; + + if (dev->mtu < IPV6_MIN_MTU) + dev->mtu = IPV6_MIN_MTU; + } + dst_release(&rt->dst); + } +} + +/** + * ip6_tnl_change - update the tunnel parameters + * @t: tunnel to be changed + * @p: tunnel configuration parameters + * + * Description: + * ip6_tnl_change() updates the tunnel parameters + **/ + +static int +ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p) +{ + ipv6_addr_copy(&t->parms.laddr, &p->laddr); + ipv6_addr_copy(&t->parms.raddr, &p->raddr); + t->parms.flags = p->flags; + t->parms.hop_limit = p->hop_limit; + t->parms.encap_limit = p->encap_limit; + t->parms.flowinfo = p->flowinfo; + t->parms.link = p->link; + t->parms.proto = p->proto; + ip6_tnl_dst_reset(t); + ip6_tnl_link_config(t); + return 0; +} + +/** + * ip6_tnl_ioctl - configure ipv6 tunnels from userspace + * @dev: virtual device associated with tunnel + * @ifr: parameters passed from userspace + * @cmd: command to be performed + * + * Description: + * ip6_tnl_ioctl() is used for managing IPv6 tunnels + * from userspace. + * + * The possible commands are the following: + * %SIOCGETTUNNEL: get tunnel parameters for device + * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters + * %SIOCCHGTUNNEL: change tunnel parameters to those given + * %SIOCDELTUNNEL: delete tunnel + * + * The fallback device "ip6tnl0", created during module + * initialization, can be used for creating other tunnel devices. 
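+ * %SIOCCHGTUNNEL and %SIOCDELTUNNEL issued on any other tunnel
+ * device act on that device itself.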
+ * + * Return: + * 0 on success, + * %-EFAULT if unable to copy data to or from userspace, + * %-EPERM if current process hasn't %CAP_NET_ADMIN set + * %-EINVAL if passed tunnel parameters are invalid, + * %-EEXIST if changing a tunnel's parameters would cause a conflict + * %-ENODEV if attempting to change or delete a nonexisting device + **/ + +static int +ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + struct ip6_tnl_parm p; + struct ip6_tnl *t = NULL; + struct net *net = dev_net(dev); + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + + switch (cmd) { + case SIOCGETTUNNEL: + if (dev == ip6n->fb_tnl_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) { + err = -EFAULT; + break; + } + t = ip6_tnl_locate(net, &p, 0); + } + if (t == NULL) + t = netdev_priv(dev); + memcpy(&p, &t->parms, sizeof (p)); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof (p))) { + err = -EFAULT; + } + break; + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) + break; + err = -EINVAL; + if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP && + p.proto != 0) + break; + t = ip6_tnl_locate(net, &p, cmd == SIOCADDTUNNEL); + if (dev != ip6n->fb_tnl_dev && cmd == SIOCCHGTUNNEL) { + if (t != NULL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + } else + t = netdev_priv(dev); + + ip6_tnl_unlink(ip6n, t); + synchronize_net(); + err = ip6_tnl_change(t, &p); + ip6_tnl_link(ip6n, t); + netdev_state_change(dev); + } + if (t) { + err = 0; + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof (p))) + err = -EFAULT; + + } else + err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); + break; + case SIOCDELTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + + if (dev == ip6n->fb_tnl_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) + break; + err = -ENOENT; + if ((t = ip6_tnl_locate(net, &p, 0)) == NULL) + break; + err = -EPERM; + if (t->dev == ip6n->fb_tnl_dev) + break; + dev = t->dev; + } + err = 0; + unregister_netdevice(dev); + break; + default: + err = -EINVAL; + } + return err; +} + +/** + * ip6_tnl_change_mtu - change mtu manually for tunnel device + * @dev: virtual device associated with tunnel + * @new_mtu: the new mtu + * + * Return: + * 0 on success, + * %-EINVAL if mtu too small + **/ + +static int +ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) +{ + if (new_mtu < IPV6_MIN_MTU) { + return -EINVAL; + } + dev->mtu = new_mtu; + return 0; +} + + +static const struct net_device_ops ip6_tnl_netdev_ops = { + .ndo_uninit = ip6_tnl_dev_uninit, + .ndo_start_xmit = ip6_tnl_xmit, + .ndo_do_ioctl = ip6_tnl_ioctl, + .ndo_change_mtu = ip6_tnl_change_mtu, + .ndo_get_stats = ip6_get_stats, +}; + + +/** + * ip6_tnl_dev_setup - setup virtual tunnel device + * @dev: virtual device associated with tunnel + * + * Description: + * Initialize function pointers and device parameters + **/ + +static void ip6_tnl_dev_setup(struct net_device *dev) +{ + struct ip6_tnl *t; + + dev->netdev_ops = &ip6_tnl_netdev_ops; + dev->destructor = ip6_dev_free; + + dev->type = ARPHRD_TUNNEL6; + dev->hard_header_len = LL_MAX_HEADER + sizeof (struct ipv6hdr); + dev->mtu = ETH_DATA_LEN - sizeof (struct ipv6hdr); + t = netdev_priv(dev); + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu-=8; + dev->flags |= IFF_NOARP; + dev->addr_len = sizeof(struct in6_addr); + dev->features |= 
NETIF_F_NETNS_LOCAL; + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; +} + + +/** + * ip6_tnl_dev_init_gen - general initializer for all tunnel devices + * @dev: virtual device associated with tunnel + **/ + +static inline int +ip6_tnl_dev_init_gen(struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + + t->dev = dev; + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + return 0; +} + +/** + * ip6_tnl_dev_init - initializer for all non fallback tunnel devices + * @dev: virtual device associated with tunnel + **/ + +static int ip6_tnl_dev_init(struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + int err = ip6_tnl_dev_init_gen(dev); + + if (err) + return err; + ip6_tnl_link_config(t); + return 0; +} + +/** + * ip6_fb_tnl_dev_init - initializer for fallback tunnel device + * @dev: fallback device + * + * Return: 0 + **/ + +static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct net *net = dev_net(dev); + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + int err = ip6_tnl_dev_init_gen(dev); + + if (err) + return err; + + t->parms.proto = IPPROTO_IPV6; + dev_hold(dev); + rcu_assign_pointer(ip6n->tnls_wc[0], t); + return 0; +} + +static struct xfrm6_tunnel ip4ip6_handler __read_mostly = { + .handler = ip4ip6_rcv, + .err_handler = ip4ip6_err, + .priority = 1, +}; + +static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { + .handler = ip6ip6_rcv, + .err_handler = ip6ip6_err, + .priority = 1, +}; + +static void __net_exit ip6_tnl_destroy_tunnels(struct ip6_tnl_net *ip6n) +{ + int h; + struct ip6_tnl *t; + LIST_HEAD(list); + + for (h = 0; h < HASH_SIZE; h++) { + t = rtnl_dereference(ip6n->tnls_r_l[h]); + while (t != NULL) { + unregister_netdevice_queue(t->dev, &list); + t = rtnl_dereference(t->next); + } + } + + t = rtnl_dereference(ip6n->tnls_wc[0]); + unregister_netdevice_queue(t->dev, &list); + unregister_netdevice_many(&list); +} + +static int __net_init ip6_tnl_init_net(struct net *net) +{ + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + struct ip6_tnl *t = NULL; + int err; + + ip6n->tnls[0] = ip6n->tnls_wc; + ip6n->tnls[1] = ip6n->tnls_r_l; + + err = -ENOMEM; + ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0", + ip6_tnl_dev_setup); + + if (!ip6n->fb_tnl_dev) + goto err_alloc_dev; + dev_net_set(ip6n->fb_tnl_dev, net); + + err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); + if (err < 0) + goto err_register; + + err = register_netdev(ip6n->fb_tnl_dev); + if (err < 0) + goto err_register; + + t = netdev_priv(ip6n->fb_tnl_dev); + + strcpy(t->parms.name, ip6n->fb_tnl_dev->name); + return 0; + +err_register: + ip6_dev_free(ip6n->fb_tnl_dev); +err_alloc_dev: + return err; +} + +static void __net_exit ip6_tnl_exit_net(struct net *net) +{ + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + + rtnl_lock(); + ip6_tnl_destroy_tunnels(ip6n); + rtnl_unlock(); +} + +static struct pernet_operations ip6_tnl_net_ops = { + .init = ip6_tnl_init_net, + .exit = ip6_tnl_exit_net, + .id = &ip6_tnl_net_id, + .size = sizeof(struct ip6_tnl_net), +}; + +/** + * ip6_tunnel_init - register protocol and reserve needed resources + * + * Return: 0 on success + **/ + +static int __init ip6_tunnel_init(void) +{ + int err; + + err = register_pernet_device(&ip6_tnl_net_ops); + if (err < 0) + goto out_pernet; + + err = xfrm6_tunnel_register(&ip4ip6_handler, AF_INET); + if (err < 0) { + printk(KERN_ERR "ip6_tunnel init: can't register ip4ip6\n"); + goto 
out_ip4ip6; + } + + err = xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6); + if (err < 0) { + printk(KERN_ERR "ip6_tunnel init: can't register ip6ip6\n"); + goto out_ip6ip6; + } + + return 0; + +out_ip6ip6: + xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET); +out_ip4ip6: + unregister_pernet_device(&ip6_tnl_net_ops); +out_pernet: + return err; +} + +/** + * ip6_tunnel_cleanup - free resources and unregister protocol + **/ + +static void __exit ip6_tunnel_cleanup(void) +{ + if (xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET)) + printk(KERN_INFO "ip6_tunnel close: can't deregister ip4ip6\n"); + + if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6)) + printk(KERN_INFO "ip6_tunnel close: can't deregister ip6ip6\n"); + + unregister_pernet_device(&ip6_tnl_net_ops); +} + +module_init(ip6_tunnel_init); +module_exit(ip6_tunnel_cleanup); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c new file mode 100644 index 00000000..86e3cc10 --- /dev/null +++ b/net/ipv6/ip6mr.c @@ -0,0 +1,2278 @@ +/* + * Linux IPv6 multicast routing support for BSD pim6sd + * Based on net/ipv4/ipmr.c. + * + * (c) 2004 Mickael Hoerdt, + * LSIIT Laboratory, Strasbourg, France + * (c) 2004 Jean-Philippe Andriot, + * 6WIND, Paris, France + * Copyright (C)2007,2008 USAGI/WIDE Project + * YOSHIFUJI Hideaki + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +struct mr6_table { + struct list_head list; +#ifdef CONFIG_NET_NS + struct net *net; +#endif + u32 id; + struct sock *mroute6_sk; + struct timer_list ipmr_expire_timer; + struct list_head mfc6_unres_queue; + struct list_head mfc6_cache_array[MFC6_LINES]; + struct mif_device vif6_table[MAXMIFS]; + int maxvif; + atomic_t cache_resolve_queue_len; + int mroute_do_assert; + int mroute_do_pim; +#ifdef CONFIG_IPV6_PIMSM_V2 + int mroute_reg_vif_num; +#endif +}; + +struct ip6mr_rule { + struct fib_rule common; +}; + +struct ip6mr_result { + struct mr6_table *mrt; +}; + +/* Big lock, protecting vif table, mrt cache and mroute socket state. + Note that the changes are semaphored via rtnl_lock. + */ + +static DEFINE_RWLOCK(mrt_lock); + +/* + * Multicast router control variables + */ + +#define MIF_EXISTS(_mrt, _idx) ((_mrt)->vif6_table[_idx].dev != NULL) + +/* Special spinlock for queue of unresolved entries */ +static DEFINE_SPINLOCK(mfc_unres_lock); + +/* We return to original Alan's scheme. Hash table of resolved + entries is changed only in process context and protected + with weak lock mrt_lock. Queue of unresolved entries is protected + with strong spinlock mfc_unres_lock. + + In this case data path is free of exclusive locks at all. 
+ */ + +static struct kmem_cache *mrt_cachep __read_mostly; + +static struct mr6_table *ip6mr_new_table(struct net *net, u32 id); +static void ip6mr_free_table(struct mr6_table *mrt); + +static int ip6_mr_forward(struct net *net, struct mr6_table *mrt, + struct sk_buff *skb, struct mfc6_cache *cache); +static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt, + mifi_t mifi, int assert); +static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, + struct mfc6_cache *c, struct rtmsg *rtm); +static int ip6mr_rtm_dumproute(struct sk_buff *skb, + struct netlink_callback *cb); +static void mroute_clean_tables(struct mr6_table *mrt); +static void ipmr_expire_process(unsigned long arg); + +#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES +#define ip6mr_for_each_table(mrt, net) \ + list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list) + +static struct mr6_table *ip6mr_get_table(struct net *net, u32 id) +{ + struct mr6_table *mrt; + + ip6mr_for_each_table(mrt, net) { + if (mrt->id == id) + return mrt; + } + return NULL; +} + +static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, + struct mr6_table **mrt) +{ + struct ip6mr_result res; + struct fib_lookup_arg arg = { .result = &res, }; + int err; + + err = fib_rules_lookup(net->ipv6.mr6_rules_ops, + flowi6_to_flowi(flp6), 0, &arg); + if (err < 0) + return err; + *mrt = res.mrt; + return 0; +} + +static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp, + int flags, struct fib_lookup_arg *arg) +{ + struct ip6mr_result *res = arg->result; + struct mr6_table *mrt; + + switch (rule->action) { + case FR_ACT_TO_TBL: + break; + case FR_ACT_UNREACHABLE: + return -ENETUNREACH; + case FR_ACT_PROHIBIT: + return -EACCES; + case FR_ACT_BLACKHOLE: + default: + return -EINVAL; + } + + mrt = ip6mr_get_table(rule->fr_net, rule->table); + if (mrt == NULL) + return -EAGAIN; + res->mrt = mrt; + return 0; +} + +static int ip6mr_rule_match(struct fib_rule *rule, struct flowi *flp, int flags) +{ + return 1; +} + +static const struct nla_policy ip6mr_rule_policy[FRA_MAX + 1] = { + FRA_GENERIC_POLICY, +}; + +static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb, + struct fib_rule_hdr *frh, struct nlattr **tb) +{ + return 0; +} + +static int ip6mr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, + struct nlattr **tb) +{ + return 1; +} + +static int ip6mr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, + struct fib_rule_hdr *frh) +{ + frh->dst_len = 0; + frh->src_len = 0; + frh->tos = 0; + return 0; +} + +static const struct fib_rules_ops __net_initdata ip6mr_rules_ops_template = { + .family = RTNL_FAMILY_IP6MR, + .rule_size = sizeof(struct ip6mr_rule), + .addr_size = sizeof(struct in6_addr), + .action = ip6mr_rule_action, + .match = ip6mr_rule_match, + .configure = ip6mr_rule_configure, + .compare = ip6mr_rule_compare, + .default_pref = fib_default_rule_pref, + .fill = ip6mr_rule_fill, + .nlgroup = RTNLGRP_IPV6_RULE, + .policy = ip6mr_rule_policy, + .owner = THIS_MODULE, +}; + +static int __net_init ip6mr_rules_init(struct net *net) +{ + struct fib_rules_ops *ops; + struct mr6_table *mrt; + int err; + + ops = fib_rules_register(&ip6mr_rules_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + + INIT_LIST_HEAD(&net->ipv6.mr6_tables); + + mrt = ip6mr_new_table(net, RT6_TABLE_DFLT); + if (mrt == NULL) { + err = -ENOMEM; + goto err1; + } + + err = fib_default_rule_add(ops, 0x7fff, RT6_TABLE_DFLT, 0); + if (err < 0) + goto err2; + + net->ipv6.mr6_rules_ops = ops; + return 0; + 
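+ /* error unwind: free the default table, then unregister the rules ops */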
+err2: + kfree(mrt); +err1: + fib_rules_unregister(ops); + return err; +} + +static void __net_exit ip6mr_rules_exit(struct net *net) +{ + struct mr6_table *mrt, *next; + + list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) { + list_del(&mrt->list); + ip6mr_free_table(mrt); + } + fib_rules_unregister(net->ipv6.mr6_rules_ops); +} +#else +#define ip6mr_for_each_table(mrt, net) \ + for (mrt = net->ipv6.mrt6; mrt; mrt = NULL) + +static struct mr6_table *ip6mr_get_table(struct net *net, u32 id) +{ + return net->ipv6.mrt6; +} + +static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, + struct mr6_table **mrt) +{ + *mrt = net->ipv6.mrt6; + return 0; +} + +static int __net_init ip6mr_rules_init(struct net *net) +{ + net->ipv6.mrt6 = ip6mr_new_table(net, RT6_TABLE_DFLT); + return net->ipv6.mrt6 ? 0 : -ENOMEM; +} + +static void __net_exit ip6mr_rules_exit(struct net *net) +{ + ip6mr_free_table(net->ipv6.mrt6); +} +#endif + +static struct mr6_table *ip6mr_new_table(struct net *net, u32 id) +{ + struct mr6_table *mrt; + unsigned int i; + + mrt = ip6mr_get_table(net, id); + if (mrt != NULL) + return mrt; + + mrt = kzalloc(sizeof(*mrt), GFP_KERNEL); + if (mrt == NULL) + return NULL; + mrt->id = id; + write_pnet(&mrt->net, net); + + /* Forwarding cache */ + for (i = 0; i < MFC6_LINES; i++) + INIT_LIST_HEAD(&mrt->mfc6_cache_array[i]); + + INIT_LIST_HEAD(&mrt->mfc6_unres_queue); + + setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process, + (unsigned long)mrt); + +#ifdef CONFIG_IPV6_PIMSM_V2 + mrt->mroute_reg_vif_num = -1; +#endif +#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES + list_add_tail_rcu(&mrt->list, &net->ipv6.mr6_tables); +#endif + return mrt; +} + +static void ip6mr_free_table(struct mr6_table *mrt) +{ + del_timer(&mrt->ipmr_expire_timer); + mroute_clean_tables(mrt); + kfree(mrt); +} + +#ifdef CONFIG_PROC_FS + +struct ipmr_mfc_iter { + struct seq_net_private p; + struct mr6_table *mrt; + struct list_head *cache; + int ct; +}; + + +static struct mfc6_cache *ipmr_mfc_seq_idx(struct net *net, + struct ipmr_mfc_iter *it, loff_t pos) +{ + struct mr6_table *mrt = it->mrt; + struct mfc6_cache *mfc; + + read_lock(&mrt_lock); + for (it->ct = 0; it->ct < MFC6_LINES; it->ct++) { + it->cache = &mrt->mfc6_cache_array[it->ct]; + list_for_each_entry(mfc, it->cache, list) + if (pos-- == 0) + return mfc; + } + read_unlock(&mrt_lock); + + spin_lock_bh(&mfc_unres_lock); + it->cache = &mrt->mfc6_unres_queue; + list_for_each_entry(mfc, it->cache, list) + if (pos-- == 0) + return mfc; + spin_unlock_bh(&mfc_unres_lock); + + it->cache = NULL; + return NULL; +} + +/* + * The /proc interfaces to multicast routing /proc/ip6_mr_cache /proc/ip6_mr_vif + */ + +struct ipmr_vif_iter { + struct seq_net_private p; + struct mr6_table *mrt; + int ct; +}; + +static struct mif_device *ip6mr_vif_seq_idx(struct net *net, + struct ipmr_vif_iter *iter, + loff_t pos) +{ + struct mr6_table *mrt = iter->mrt; + + for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) { + if (!MIF_EXISTS(mrt, iter->ct)) + continue; + if (pos-- == 0) + return &mrt->vif6_table[iter->ct]; + } + return NULL; +} + +static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(mrt_lock) +{ + struct ipmr_vif_iter *iter = seq->private; + struct net *net = seq_file_net(seq); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); + if (mrt == NULL) + return ERR_PTR(-ENOENT); + + iter->mrt = mrt; + + read_lock(&mrt_lock); + return *pos ? 
ip6mr_vif_seq_idx(net, seq->private, *pos - 1) + : SEQ_START_TOKEN; +} + +static void *ip6mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ipmr_vif_iter *iter = seq->private; + struct net *net = seq_file_net(seq); + struct mr6_table *mrt = iter->mrt; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip6mr_vif_seq_idx(net, iter, 0); + + while (++iter->ct < mrt->maxvif) { + if (!MIF_EXISTS(mrt, iter->ct)) + continue; + return &mrt->vif6_table[iter->ct]; + } + return NULL; +} + +static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v) + __releases(mrt_lock) +{ + read_unlock(&mrt_lock); +} + +static int ip6mr_vif_seq_show(struct seq_file *seq, void *v) +{ + struct ipmr_vif_iter *iter = seq->private; + struct mr6_table *mrt = iter->mrt; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, + "Interface BytesIn PktsIn BytesOut PktsOut Flags\n"); + } else { + const struct mif_device *vif = v; + const char *name = vif->dev ? vif->dev->name : "none"; + + seq_printf(seq, + "%2td %-10s %8ld %7ld %8ld %7ld %05X\n", + vif - mrt->vif6_table, + name, vif->bytes_in, vif->pkt_in, + vif->bytes_out, vif->pkt_out, + vif->flags); + } + return 0; +} + +static const struct seq_operations ip6mr_vif_seq_ops = { + .start = ip6mr_vif_seq_start, + .next = ip6mr_vif_seq_next, + .stop = ip6mr_vif_seq_stop, + .show = ip6mr_vif_seq_show, +}; + +static int ip6mr_vif_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ip6mr_vif_seq_ops, + sizeof(struct ipmr_vif_iter)); +} + +static const struct file_operations ip6mr_vif_fops = { + .owner = THIS_MODULE, + .open = ip6mr_vif_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct ipmr_mfc_iter *it = seq->private; + struct net *net = seq_file_net(seq); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); + if (mrt == NULL) + return ERR_PTR(-ENOENT); + + it->mrt = mrt; + return *pos ? 
ipmr_mfc_seq_idx(net, seq->private, *pos - 1) + : SEQ_START_TOKEN; +} + +static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct mfc6_cache *mfc = v; + struct ipmr_mfc_iter *it = seq->private; + struct net *net = seq_file_net(seq); + struct mr6_table *mrt = it->mrt; + + ++*pos; + + if (v == SEQ_START_TOKEN) + return ipmr_mfc_seq_idx(net, seq->private, 0); + + if (mfc->list.next != it->cache) + return list_entry(mfc->list.next, struct mfc6_cache, list); + + if (it->cache == &mrt->mfc6_unres_queue) + goto end_of_list; + + BUG_ON(it->cache != &mrt->mfc6_cache_array[it->ct]); + + while (++it->ct < MFC6_LINES) { + it->cache = &mrt->mfc6_cache_array[it->ct]; + if (list_empty(it->cache)) + continue; + return list_first_entry(it->cache, struct mfc6_cache, list); + } + + /* exhausted cache_array, show unresolved */ + read_unlock(&mrt_lock); + it->cache = &mrt->mfc6_unres_queue; + it->ct = 0; + + spin_lock_bh(&mfc_unres_lock); + if (!list_empty(it->cache)) + return list_first_entry(it->cache, struct mfc6_cache, list); + + end_of_list: + spin_unlock_bh(&mfc_unres_lock); + it->cache = NULL; + + return NULL; +} + +static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) +{ + struct ipmr_mfc_iter *it = seq->private; + struct mr6_table *mrt = it->mrt; + + if (it->cache == &mrt->mfc6_unres_queue) + spin_unlock_bh(&mfc_unres_lock); + else if (it->cache == mrt->mfc6_cache_array) + read_unlock(&mrt_lock); +} + +static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) +{ + int n; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, + "Group " + "Origin " + "Iif Pkts Bytes Wrong Oifs\n"); + } else { + const struct mfc6_cache *mfc = v; + const struct ipmr_mfc_iter *it = seq->private; + struct mr6_table *mrt = it->mrt; + + seq_printf(seq, "%pI6 %pI6 %-3hd", + &mfc->mf6c_mcastgrp, &mfc->mf6c_origin, + mfc->mf6c_parent); + + if (it->cache != &mrt->mfc6_unres_queue) { + seq_printf(seq, " %8lu %8lu %8lu", + mfc->mfc_un.res.pkt, + mfc->mfc_un.res.bytes, + mfc->mfc_un.res.wrong_if); + for (n = mfc->mfc_un.res.minvif; + n < mfc->mfc_un.res.maxvif; n++) { + if (MIF_EXISTS(mrt, n) && + mfc->mfc_un.res.ttls[n] < 255) + seq_printf(seq, + " %2d:%-3d", + n, mfc->mfc_un.res.ttls[n]); + } + } else { + /* unresolved mfc_caches don't contain + * pkt, bytes and wrong_if values + */ + seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul); + } + seq_putc(seq, '\n'); + } + return 0; +} + +static const struct seq_operations ipmr_mfc_seq_ops = { + .start = ipmr_mfc_seq_start, + .next = ipmr_mfc_seq_next, + .stop = ipmr_mfc_seq_stop, + .show = ipmr_mfc_seq_show, +}; + +static int ipmr_mfc_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ipmr_mfc_seq_ops, + sizeof(struct ipmr_mfc_iter)); +} + +static const struct file_operations ip6mr_mfc_fops = { + .owner = THIS_MODULE, + .open = ipmr_mfc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; +#endif + +#ifdef CONFIG_IPV6_PIMSM_V2 + +static int pim6_rcv(struct sk_buff *skb) +{ + struct pimreghdr *pim; + struct ipv6hdr *encap; + struct net_device *reg_dev = NULL; + struct net *net = dev_net(skb->dev); + struct mr6_table *mrt; + struct flowi6 fl6 = { + .flowi6_iif = skb->dev->ifindex, + .flowi6_mark = skb->mark, + }; + int reg_vif_num; + + if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) + goto drop; + + pim = (struct pimreghdr *)skb_transport_header(skb); + if (pim->type != ((PIM_VERSION << 4) | PIM_REGISTER) || + (pim->flags & PIM_NULL_REGISTER) || + 
(csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, + sizeof(*pim), IPPROTO_PIM, + csum_partial((void *)pim, sizeof(*pim), 0)) && + csum_fold(skb_checksum(skb, 0, skb->len, 0)))) + goto drop; + + /* check if the inner packet is destined to mcast group */ + encap = (struct ipv6hdr *)(skb_transport_header(skb) + + sizeof(*pim)); + + if (!ipv6_addr_is_multicast(&encap->daddr) || + encap->payload_len == 0 || + ntohs(encap->payload_len) + sizeof(*pim) > skb->len) + goto drop; + + if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) + goto drop; + reg_vif_num = mrt->mroute_reg_vif_num; + + read_lock(&mrt_lock); + if (reg_vif_num >= 0) + reg_dev = mrt->vif6_table[reg_vif_num].dev; + if (reg_dev) + dev_hold(reg_dev); + read_unlock(&mrt_lock); + + if (reg_dev == NULL) + goto drop; + + skb->mac_header = skb->network_header; + skb_pull(skb, (u8 *)encap - skb->data); + skb_reset_network_header(skb); + skb->protocol = htons(ETH_P_IPV6); + skb->ip_summed = CHECKSUM_NONE; + skb->pkt_type = PACKET_HOST; + + skb_tunnel_rx(skb, reg_dev); + + netif_rx(skb); + + dev_put(reg_dev); + return 0; + drop: + kfree_skb(skb); + return 0; +} + +static const struct inet6_protocol pim6_protocol = { + .handler = pim6_rcv, +}; + +/* Service routines creating virtual interfaces: PIMREG */ + +static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct net *net = dev_net(dev); + struct mr6_table *mrt; + struct flowi6 fl6 = { + .flowi6_oif = dev->ifindex, + .flowi6_iif = skb->skb_iif, + .flowi6_mark = skb->mark, + }; + int err; + + err = ip6mr_fib_lookup(net, &fl6, &mrt); + if (err < 0) { + kfree_skb(skb); + return err; + } + + read_lock(&mrt_lock); + dev->stats.tx_bytes += skb->len; + dev->stats.tx_packets++; + ip6mr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, MRT6MSG_WHOLEPKT); + read_unlock(&mrt_lock); + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static const struct net_device_ops reg_vif_netdev_ops = { + .ndo_start_xmit = reg_vif_xmit, +}; + +static void reg_vif_setup(struct net_device *dev) +{ + dev->type = ARPHRD_PIMREG; + dev->mtu = 1500 - sizeof(struct ipv6hdr) - 8; + dev->flags = IFF_NOARP; + dev->netdev_ops = &reg_vif_netdev_ops; + dev->destructor = free_netdev; + dev->features |= NETIF_F_NETNS_LOCAL; +} + +static struct net_device *ip6mr_reg_vif(struct net *net, struct mr6_table *mrt) +{ + struct net_device *dev; + char name[IFNAMSIZ]; + + if (mrt->id == RT6_TABLE_DFLT) + sprintf(name, "pim6reg"); + else + sprintf(name, "pim6reg%u", mrt->id); + + dev = alloc_netdev(0, name, reg_vif_setup); + if (dev == NULL) + return NULL; + + dev_net_set(dev, net); + + if (register_netdevice(dev)) { + free_netdev(dev); + return NULL; + } + dev->iflink = 0; + + if (dev_open(dev)) + goto failure; + + dev_hold(dev); + return dev; + +failure: + /* allow the register to be completed before unregistering.
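For orientation, a minimal userspace sketch (editor's illustration, not part of this patch) of how a PIM daemon would cause the kernel to create the pim6reg register device set up by ip6mr_reg_vif() above: it opens the ICMPv6 raw socket, claims the multicast-routing role with MRT6_INIT, enables PIM with MRT6_PIM, and adds a MIFF_REGISTER interface. The MIF index is made up and error handling is trimmed.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/mroute6.h>

int main(void)
{
	int one = 1;
	struct mif6ctl mc;
	int s = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);  // needs CAP_NET_ADMIN

	if (s < 0)
		return 1;
	// Become the multicast routing daemon (mroute6_sk) for this namespace.
	if (setsockopt(s, IPPROTO_IPV6, MRT6_INIT, &one, sizeof(one)) < 0)
		return 1;
	// Turn on PIM processing (mroute_do_pim / mroute_do_assert).
	setsockopt(s, IPPROTO_IPV6, MRT6_PIM, &one, sizeof(one));

	// MIFF_REGISTER makes mif6_add() call ip6mr_reg_vif() and bring up pim6reg.
	memset(&mc, 0, sizeof(mc));
	mc.mif6c_mifi = 0;
	mc.mif6c_flags = MIFF_REGISTER;
	if (setsockopt(s, IPPROTO_IPV6, MRT6_ADD_MIF, &mc, sizeof(mc)) < 0)
		perror("MRT6_ADD_MIF");
	return 0;
}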
*/ + rtnl_unlock(); + rtnl_lock(); + + unregister_netdevice(dev); + return NULL; +} +#endif + +/* + * Delete a VIF entry + */ + +static int mif6_delete(struct mr6_table *mrt, int vifi, struct list_head *head) +{ + struct mif_device *v; + struct net_device *dev; + struct inet6_dev *in6_dev; + + if (vifi < 0 || vifi >= mrt->maxvif) + return -EADDRNOTAVAIL; + + v = &mrt->vif6_table[vifi]; + + write_lock_bh(&mrt_lock); + dev = v->dev; + v->dev = NULL; + + if (!dev) { + write_unlock_bh(&mrt_lock); + return -EADDRNOTAVAIL; + } + +#ifdef CONFIG_IPV6_PIMSM_V2 + if (vifi == mrt->mroute_reg_vif_num) + mrt->mroute_reg_vif_num = -1; +#endif + + if (vifi + 1 == mrt->maxvif) { + int tmp; + for (tmp = vifi - 1; tmp >= 0; tmp--) { + if (MIF_EXISTS(mrt, tmp)) + break; + } + mrt->maxvif = tmp + 1; + } + + write_unlock_bh(&mrt_lock); + + dev_set_allmulti(dev, -1); + + in6_dev = __in6_dev_get(dev); + if (in6_dev) + in6_dev->cnf.mc_forwarding--; + + if (v->flags & MIFF_REGISTER) + unregister_netdevice_queue(dev, head); + + dev_put(dev); + return 0; +} + +static inline void ip6mr_cache_free(struct mfc6_cache *c) +{ + kmem_cache_free(mrt_cachep, c); +} + +/* Destroy an unresolved cache entry, killing queued skbs + and reporting error to netlink readers. + */ + +static void ip6mr_destroy_unres(struct mr6_table *mrt, struct mfc6_cache *c) +{ + struct net *net = read_pnet(&mrt->net); + struct sk_buff *skb; + + atomic_dec(&mrt->cache_resolve_queue_len); + + while((skb = skb_dequeue(&c->mfc_un.unres.unresolved)) != NULL) { + if (ipv6_hdr(skb)->version == 0) { + struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr)); + nlh->nlmsg_type = NLMSG_ERROR; + nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + skb_trim(skb, nlh->nlmsg_len); + ((struct nlmsgerr *)NLMSG_DATA(nlh))->error = -ETIMEDOUT; + rtnl_unicast(skb, net, NETLINK_CB(skb).pid); + } else + kfree_skb(skb); + } + + ip6mr_cache_free(c); +} + + +/* Timer process for all the unresolved queue. */ + +static void ipmr_do_expire_process(struct mr6_table *mrt) +{ + unsigned long now = jiffies; + unsigned long expires = 10 * HZ; + struct mfc6_cache *c, *next; + + list_for_each_entry_safe(c, next, &mrt->mfc6_unres_queue, list) { + if (time_after(c->mfc_un.unres.expires, now)) { + /* not yet... */ + unsigned long interval = c->mfc_un.unres.expires - now; + if (interval < expires) + expires = interval; + continue; + } + + list_del(&c->list); + ip6mr_destroy_unres(mrt, c); + } + + if (!list_empty(&mrt->mfc6_unres_queue)) + mod_timer(&mrt->ipmr_expire_timer, jiffies + expires); +} + +static void ipmr_expire_process(unsigned long arg) +{ + struct mr6_table *mrt = (struct mr6_table *)arg; + + if (!spin_trylock(&mfc_unres_lock)) { + mod_timer(&mrt->ipmr_expire_timer, jiffies + 1); + return; + } + + if (!list_empty(&mrt->mfc6_unres_queue)) + ipmr_do_expire_process(mrt); + + spin_unlock(&mfc_unres_lock); +} + +/* Fill oifs list. It is called under write locked mrt_lock. 
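As a side note to the ip6mr_update_thresholds() helper that follows, this small standalone snippet (editor's illustration, using a made-up 4-entry array rather than MAXMIFS) walks the same min/max computation: only MIFs whose TTL threshold is non-zero and below 255 take part in forwarding.

#include <stdio.h>

#define EXAMPLE_MIFS 4

int main(void)
{
	// ttls[i] < 255 means "MIF i is an outgoing interface; forward a packet
	// on it only if the packet's hop limit is greater than ttls[i]".
	unsigned char ttls[EXAMPLE_MIFS] = { 255, 1, 255, 3 };
	int minvif = EXAMPLE_MIFS, maxvif = 0, i;

	for (i = 0; i < EXAMPLE_MIFS; i++) {
		if (ttls[i] && ttls[i] < 255) {
			if (minvif > i)
				minvif = i;
			if (maxvif <= i)
				maxvif = i + 1;
		}
	}
	// Prints "minvif=1 maxvif=4": ip6_mr_forward() will scan MIFs 1..3 only.
	printf("minvif=%d maxvif=%d\n", minvif, maxvif);
	return 0;
}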
*/ + +static void ip6mr_update_thresholds(struct mr6_table *mrt, struct mfc6_cache *cache, + unsigned char *ttls) +{ + int vifi; + + cache->mfc_un.res.minvif = MAXMIFS; + cache->mfc_un.res.maxvif = 0; + memset(cache->mfc_un.res.ttls, 255, MAXMIFS); + + for (vifi = 0; vifi < mrt->maxvif; vifi++) { + if (MIF_EXISTS(mrt, vifi) && + ttls[vifi] && ttls[vifi] < 255) { + cache->mfc_un.res.ttls[vifi] = ttls[vifi]; + if (cache->mfc_un.res.minvif > vifi) + cache->mfc_un.res.minvif = vifi; + if (cache->mfc_un.res.maxvif <= vifi) + cache->mfc_un.res.maxvif = vifi + 1; + } + } +} + +static int mif6_add(struct net *net, struct mr6_table *mrt, + struct mif6ctl *vifc, int mrtsock) +{ + int vifi = vifc->mif6c_mifi; + struct mif_device *v = &mrt->vif6_table[vifi]; + struct net_device *dev; + struct inet6_dev *in6_dev; + int err; + + /* Is vif busy ? */ + if (MIF_EXISTS(mrt, vifi)) + return -EADDRINUSE; + + switch (vifc->mif6c_flags) { +#ifdef CONFIG_IPV6_PIMSM_V2 + case MIFF_REGISTER: + /* + * Special Purpose VIF in PIM + * All the packets will be sent to the daemon + */ + if (mrt->mroute_reg_vif_num >= 0) + return -EADDRINUSE; + dev = ip6mr_reg_vif(net, mrt); + if (!dev) + return -ENOBUFS; + err = dev_set_allmulti(dev, 1); + if (err) { + unregister_netdevice(dev); + dev_put(dev); + return err; + } + break; +#endif + case 0: + dev = dev_get_by_index(net, vifc->mif6c_pifi); + if (!dev) + return -EADDRNOTAVAIL; + err = dev_set_allmulti(dev, 1); + if (err) { + dev_put(dev); + return err; + } + break; + default: + return -EINVAL; + } + + in6_dev = __in6_dev_get(dev); + if (in6_dev) + in6_dev->cnf.mc_forwarding++; + + /* + * Fill in the VIF structures + */ + v->rate_limit = vifc->vifc_rate_limit; + v->flags = vifc->mif6c_flags; + if (!mrtsock) + v->flags |= VIFF_STATIC; + v->threshold = vifc->vifc_threshold; + v->bytes_in = 0; + v->bytes_out = 0; + v->pkt_in = 0; + v->pkt_out = 0; + v->link = dev->ifindex; + if (v->flags & MIFF_REGISTER) + v->link = dev->iflink; + + /* And finish update writing critical data */ + write_lock_bh(&mrt_lock); + v->dev = dev; +#ifdef CONFIG_IPV6_PIMSM_V2 + if (v->flags & MIFF_REGISTER) + mrt->mroute_reg_vif_num = vifi; +#endif + if (vifi + 1 > mrt->maxvif) + mrt->maxvif = vifi + 1; + write_unlock_bh(&mrt_lock); + return 0; +} + +static struct mfc6_cache *ip6mr_cache_find(struct mr6_table *mrt, + const struct in6_addr *origin, + const struct in6_addr *mcastgrp) +{ + int line = MFC6_HASH(mcastgrp, origin); + struct mfc6_cache *c; + + list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) { + if (ipv6_addr_equal(&c->mf6c_origin, origin) && + ipv6_addr_equal(&c->mf6c_mcastgrp, mcastgrp)) + return c; + } + return NULL; +} + +/* + * Allocate a multicast cache entry + */ +static struct mfc6_cache *ip6mr_cache_alloc(void) +{ + struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); + if (c == NULL) + return NULL; + c->mfc_un.res.minvif = MAXMIFS; + return c; +} + +static struct mfc6_cache *ip6mr_cache_alloc_unres(void) +{ + struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); + if (c == NULL) + return NULL; + skb_queue_head_init(&c->mfc_un.unres.unresolved); + c->mfc_un.unres.expires = jiffies + 10 * HZ; + return c; +} + +/* + * A cache entry has gone into a resolved state from queued + */ + +static void ip6mr_cache_resolve(struct net *net, struct mr6_table *mrt, + struct mfc6_cache *uc, struct mfc6_cache *c) +{ + struct sk_buff *skb; + + /* + * Play the pending entries through our router + */ + + while((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { + if 
(ipv6_hdr(skb)->version == 0) { + struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr)); + + if (__ip6mr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) { + nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; + } else { + nlh->nlmsg_type = NLMSG_ERROR; + nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + skb_trim(skb, nlh->nlmsg_len); + ((struct nlmsgerr *)NLMSG_DATA(nlh))->error = -EMSGSIZE; + } + rtnl_unicast(skb, net, NETLINK_CB(skb).pid); + } else + ip6_mr_forward(net, mrt, skb, c); + } +} + +/* + * Bounce a cache query up to pim6sd. We could use netlink for this but pim6sd + * expects the following bizarre scheme. + * + * Called under mrt_lock. + */ + +static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt, + mifi_t mifi, int assert) +{ + struct sk_buff *skb; + struct mrt6msg *msg; + int ret; + +#ifdef CONFIG_IPV6_PIMSM_V2 + if (assert == MRT6MSG_WHOLEPKT) + skb = skb_realloc_headroom(pkt, -skb_network_offset(pkt) + +sizeof(*msg)); + else +#endif + skb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(*msg), GFP_ATOMIC); + + if (!skb) + return -ENOBUFS; + + /* I suppose that internal messages + * do not require checksums */ + + skb->ip_summed = CHECKSUM_UNNECESSARY; + +#ifdef CONFIG_IPV6_PIMSM_V2 + if (assert == MRT6MSG_WHOLEPKT) { + /* Ugly, but we have no choice with this interface. + Duplicate old header, fix length etc. + And all this only to mangle msg->im6_msgtype and + to set msg->im6_mbz to "mbz" :-) + */ + skb_push(skb, -skb_network_offset(pkt)); + + skb_push(skb, sizeof(*msg)); + skb_reset_transport_header(skb); + msg = (struct mrt6msg *)skb_transport_header(skb); + msg->im6_mbz = 0; + msg->im6_msgtype = MRT6MSG_WHOLEPKT; + msg->im6_mif = mrt->mroute_reg_vif_num; + msg->im6_pad = 0; + ipv6_addr_copy(&msg->im6_src, &ipv6_hdr(pkt)->saddr); + ipv6_addr_copy(&msg->im6_dst, &ipv6_hdr(pkt)->daddr); + + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else +#endif + { + /* + * Copy the IP header + */ + + skb_put(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + skb_copy_to_linear_data(skb, ipv6_hdr(pkt), sizeof(struct ipv6hdr)); + + /* + * Add our header + */ + skb_put(skb, sizeof(*msg)); + skb_reset_transport_header(skb); + msg = (struct mrt6msg *)skb_transport_header(skb); + + msg->im6_mbz = 0; + msg->im6_msgtype = assert; + msg->im6_mif = mifi; + msg->im6_pad = 0; + ipv6_addr_copy(&msg->im6_src, &ipv6_hdr(pkt)->saddr); + ipv6_addr_copy(&msg->im6_dst, &ipv6_hdr(pkt)->daddr); + + skb_dst_set(skb, dst_clone(skb_dst(pkt))); + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + + if (mrt->mroute6_sk == NULL) { + kfree_skb(skb); + return -EINVAL; + } + + /* + * Deliver to user space multicast routing algorithms + */ + ret = sock_queue_rcv_skb(mrt->mroute6_sk, skb); + if (ret < 0) { + if (net_ratelimit()) + printk(KERN_WARNING "mroute6: pending queue full, dropping entries.\n"); + kfree_skb(skb); + } + + return ret; +} + +/* + * Queue a packet for resolution. It gets locked cache entry! 
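To make the ip6mr_cache_report() upcall concrete, here is a hedged userspace sketch of how a daemon such as pim6sd typically reads the MRT6MSG_NOCACHE / MRT6MSG_WRONGMIF / MRT6MSG_WHOLEPKT notifications from the multicast-routing socket. It assumes the conventional layout in which a struct mrt6msg sits at the start of the buffer, with im6_mbz == 0 distinguishing upcalls from real ICMPv6 traffic.

#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <linux/mroute6.h>

static void read_one_upcall(int mrt_sock)
{
	unsigned char buf[2048];
	char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
	struct mrt6msg *msg;
	ssize_t n = read(mrt_sock, buf, sizeof(buf));

	if (n < (ssize_t)sizeof(*msg))
		return;
	msg = (struct mrt6msg *)buf;
	if (msg->im6_mbz != 0)		// not a kernel upcall
		return;
	// im6_msgtype, im6_mif, im6_src and im6_dst are filled in by ip6mr_cache_report().
	inet_ntop(AF_INET6, &msg->im6_src, src, sizeof(src));
	inet_ntop(AF_INET6, &msg->im6_dst, dst, sizeof(dst));
	printf("upcall type=%u mif=%u %s -> %s\n",
	       msg->im6_msgtype, msg->im6_mif, src, dst);
}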
+ */ + +static int +ip6mr_cache_unresolved(struct mr6_table *mrt, mifi_t mifi, struct sk_buff *skb) +{ + bool found = false; + int err; + struct mfc6_cache *c; + + spin_lock_bh(&mfc_unres_lock); + list_for_each_entry(c, &mrt->mfc6_unres_queue, list) { + if (ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) && + ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr)) { + found = true; + break; + } + } + + if (!found) { + /* + * Create a new entry if allowable + */ + + if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || + (c = ip6mr_cache_alloc_unres()) == NULL) { + spin_unlock_bh(&mfc_unres_lock); + + kfree_skb(skb); + return -ENOBUFS; + } + + /* + * Fill in the new cache entry + */ + c->mf6c_parent = -1; + c->mf6c_origin = ipv6_hdr(skb)->saddr; + c->mf6c_mcastgrp = ipv6_hdr(skb)->daddr; + + /* + * Reflect first query at pim6sd + */ + err = ip6mr_cache_report(mrt, skb, mifi, MRT6MSG_NOCACHE); + if (err < 0) { + /* If the report failed throw the cache entry + out - Brad Parker + */ + spin_unlock_bh(&mfc_unres_lock); + + ip6mr_cache_free(c); + kfree_skb(skb); + return err; + } + + atomic_inc(&mrt->cache_resolve_queue_len); + list_add(&c->list, &mrt->mfc6_unres_queue); + + ipmr_do_expire_process(mrt); + } + + /* + * See if we can append the packet + */ + if (c->mfc_un.unres.unresolved.qlen > 3) { + kfree_skb(skb); + err = -ENOBUFS; + } else { + skb_queue_tail(&c->mfc_un.unres.unresolved, skb); + err = 0; + } + + spin_unlock_bh(&mfc_unres_lock); + return err; +} + +/* + * MFC6 cache manipulation by user space + */ + +static int ip6mr_mfc_delete(struct mr6_table *mrt, struct mf6cctl *mfc) +{ + int line; + struct mfc6_cache *c, *next; + + line = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr); + + list_for_each_entry_safe(c, next, &mrt->mfc6_cache_array[line], list) { + if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) && + ipv6_addr_equal(&c->mf6c_mcastgrp, &mfc->mf6cc_mcastgrp.sin6_addr)) { + write_lock_bh(&mrt_lock); + list_del(&c->list); + write_unlock_bh(&mrt_lock); + + ip6mr_cache_free(c); + return 0; + } + } + return -ENOENT; +} + +static int ip6mr_device_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct net *net = dev_net(dev); + struct mr6_table *mrt; + struct mif_device *v; + int ct; + LIST_HEAD(list); + + if (event != NETDEV_UNREGISTER) + return NOTIFY_DONE; + + ip6mr_for_each_table(mrt, net) { + v = &mrt->vif6_table[0]; + for (ct = 0; ct < mrt->maxvif; ct++, v++) { + if (v->dev == dev) + mif6_delete(mrt, ct, &list); + } + } + unregister_netdevice_many(&list); + + return NOTIFY_DONE; +} + +static struct notifier_block ip6_mr_notifier = { + .notifier_call = ip6mr_device_event +}; + +/* + * Setup for IP multicast routing + */ + +static int __net_init ip6mr_net_init(struct net *net) +{ + int err; + + err = ip6mr_rules_init(net); + if (err < 0) + goto fail; + +#ifdef CONFIG_PROC_FS + err = -ENOMEM; + if (!proc_net_fops_create(net, "ip6_mr_vif", 0, &ip6mr_vif_fops)) + goto proc_vif_fail; + if (!proc_net_fops_create(net, "ip6_mr_cache", 0, &ip6mr_mfc_fops)) + goto proc_cache_fail; +#endif + + return 0; + +#ifdef CONFIG_PROC_FS +proc_cache_fail: + proc_net_remove(net, "ip6_mr_vif"); +proc_vif_fail: + ip6mr_rules_exit(net); +#endif +fail: + return err; +} + +static void __net_exit ip6mr_net_exit(struct net *net) +{ +#ifdef CONFIG_PROC_FS + proc_net_remove(net, "ip6_mr_cache"); + proc_net_remove(net, "ip6_mr_vif"); +#endif + ip6mr_rules_exit(net); +} + +static struct 
pernet_operations ip6mr_net_ops = { + .init = ip6mr_net_init, + .exit = ip6mr_net_exit, +}; + +int __init ip6_mr_init(void) +{ + int err; + + mrt_cachep = kmem_cache_create("ip6_mrt_cache", + sizeof(struct mfc6_cache), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (!mrt_cachep) + return -ENOMEM; + + err = register_pernet_subsys(&ip6mr_net_ops); + if (err) + goto reg_pernet_fail; + + err = register_netdevice_notifier(&ip6_mr_notifier); + if (err) + goto reg_notif_fail; +#ifdef CONFIG_IPV6_PIMSM_V2 + if (inet6_add_protocol(&pim6_protocol, IPPROTO_PIM) < 0) { + printk(KERN_ERR "ip6_mr_init: can't add PIM protocol\n"); + err = -EAGAIN; + goto add_proto_fail; + } +#endif + rtnl_register(RTNL_FAMILY_IP6MR, RTM_GETROUTE, NULL, ip6mr_rtm_dumproute); + return 0; +#ifdef CONFIG_IPV6_PIMSM_V2 +add_proto_fail: + unregister_netdevice_notifier(&ip6_mr_notifier); +#endif +reg_notif_fail: + unregister_pernet_subsys(&ip6mr_net_ops); +reg_pernet_fail: + kmem_cache_destroy(mrt_cachep); + return err; +} + +void ip6_mr_cleanup(void) +{ + unregister_netdevice_notifier(&ip6_mr_notifier); + unregister_pernet_subsys(&ip6mr_net_ops); + kmem_cache_destroy(mrt_cachep); +} + +static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, + struct mf6cctl *mfc, int mrtsock) +{ + bool found = false; + int line; + struct mfc6_cache *uc, *c; + unsigned char ttls[MAXMIFS]; + int i; + + if (mfc->mf6cc_parent >= MAXMIFS) + return -ENFILE; + + memset(ttls, 255, MAXMIFS); + for (i = 0; i < MAXMIFS; i++) { + if (IF_ISSET(i, &mfc->mf6cc_ifset)) + ttls[i] = 1; + + } + + line = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr); + + list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) { + if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) && + ipv6_addr_equal(&c->mf6c_mcastgrp, &mfc->mf6cc_mcastgrp.sin6_addr)) { + found = true; + break; + } + } + + if (found) { + write_lock_bh(&mrt_lock); + c->mf6c_parent = mfc->mf6cc_parent; + ip6mr_update_thresholds(mrt, c, ttls); + if (!mrtsock) + c->mfc_flags |= MFC_STATIC; + write_unlock_bh(&mrt_lock); + return 0; + } + + if (!ipv6_addr_is_multicast(&mfc->mf6cc_mcastgrp.sin6_addr)) + return -EINVAL; + + c = ip6mr_cache_alloc(); + if (c == NULL) + return -ENOMEM; + + c->mf6c_origin = mfc->mf6cc_origin.sin6_addr; + c->mf6c_mcastgrp = mfc->mf6cc_mcastgrp.sin6_addr; + c->mf6c_parent = mfc->mf6cc_parent; + ip6mr_update_thresholds(mrt, c, ttls); + if (!mrtsock) + c->mfc_flags |= MFC_STATIC; + + write_lock_bh(&mrt_lock); + list_add(&c->list, &mrt->mfc6_cache_array[line]); + write_unlock_bh(&mrt_lock); + + /* + * Check to see if we resolved a queued list. If so we + * need to send on the frames and tidy up. 
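For context, a hedged userspace sketch (not part of the patch) of the MRT6_ADD_MFC call that lands in ip6mr_mfc_add() below; the addresses and MIF numbers are placeholders, and mrt_sock is assumed to be the socket that already issued MRT6_INIT.

#include <string.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/mroute6.h>

static int add_example_mfc(int mrt_sock)
{
	struct mf6cctl mfc;

	memset(&mfc, 0, sizeof(mfc));
	mfc.mf6cc_origin.sin6_family = AF_INET6;
	mfc.mf6cc_mcastgrp.sin6_family = AF_INET6;
	inet_pton(AF_INET6, "2001:db8::1", &mfc.mf6cc_origin.sin6_addr);
	inet_pton(AF_INET6, "ff3e::4321", &mfc.mf6cc_mcastgrp.sin6_addr);
	mfc.mf6cc_parent = 0;		// incoming MIF (becomes mf6c_parent)
	IF_SET(1, &mfc.mf6cc_ifset);	// forward out of MIF 1 (ttls[1] = 1)
	IF_SET(2, &mfc.mf6cc_ifset);	// ... and MIF 2

	return setsockopt(mrt_sock, IPPROTO_IPV6, MRT6_ADD_MFC,
			  &mfc, sizeof(mfc));
}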
+ */ + found = false; + spin_lock_bh(&mfc_unres_lock); + list_for_each_entry(uc, &mrt->mfc6_unres_queue, list) { + if (ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) && + ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) { + list_del(&uc->list); + atomic_dec(&mrt->cache_resolve_queue_len); + found = true; + break; + } + } + if (list_empty(&mrt->mfc6_unres_queue)) + del_timer(&mrt->ipmr_expire_timer); + spin_unlock_bh(&mfc_unres_lock); + + if (found) { + ip6mr_cache_resolve(net, mrt, uc, c); + ip6mr_cache_free(uc); + } + return 0; +} + +/* + * Close the multicast socket, and clear the vif tables etc + */ + +static void mroute_clean_tables(struct mr6_table *mrt) +{ + int i; + LIST_HEAD(list); + struct mfc6_cache *c, *next; + + /* + * Shut down all active vif entries + */ + for (i = 0; i < mrt->maxvif; i++) { + if (!(mrt->vif6_table[i].flags & VIFF_STATIC)) + mif6_delete(mrt, i, &list); + } + unregister_netdevice_many(&list); + + /* + * Wipe the cache + */ + for (i = 0; i < MFC6_LINES; i++) { + list_for_each_entry_safe(c, next, &mrt->mfc6_cache_array[i], list) { + if (c->mfc_flags & MFC_STATIC) + continue; + write_lock_bh(&mrt_lock); + list_del(&c->list); + write_unlock_bh(&mrt_lock); + + ip6mr_cache_free(c); + } + } + + if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { + spin_lock_bh(&mfc_unres_lock); + list_for_each_entry_safe(c, next, &mrt->mfc6_unres_queue, list) { + list_del(&c->list); + ip6mr_destroy_unres(mrt, c); + } + spin_unlock_bh(&mfc_unres_lock); + } +} + +static int ip6mr_sk_init(struct mr6_table *mrt, struct sock *sk) +{ + int err = 0; + struct net *net = sock_net(sk); + + rtnl_lock(); + write_lock_bh(&mrt_lock); + if (likely(mrt->mroute6_sk == NULL)) { + mrt->mroute6_sk = sk; + net->ipv6.devconf_all->mc_forwarding++; + } + else + err = -EADDRINUSE; + write_unlock_bh(&mrt_lock); + + rtnl_unlock(); + + return err; +} + +int ip6mr_sk_done(struct sock *sk) +{ + int err = -EACCES; + struct net *net = sock_net(sk); + struct mr6_table *mrt; + + rtnl_lock(); + ip6mr_for_each_table(mrt, net) { + if (sk == mrt->mroute6_sk) { + write_lock_bh(&mrt_lock); + mrt->mroute6_sk = NULL; + net->ipv6.devconf_all->mc_forwarding--; + write_unlock_bh(&mrt_lock); + + mroute_clean_tables(mrt); + err = 0; + break; + } + } + rtnl_unlock(); + + return err; +} + +struct sock *mroute6_socket(struct net *net, struct sk_buff *skb) +{ + struct mr6_table *mrt; + struct flowi6 fl6 = { + .flowi6_iif = skb->skb_iif, + .flowi6_oif = skb->dev->ifindex, + .flowi6_mark = skb->mark, + }; + + if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) + return NULL; + + return mrt->mroute6_sk; +} + +/* + * Socket options and virtual interface manipulation. The whole + * virtual interface system is a complete heap, but unfortunately + * that's how BSD mrouted happens to think. Maybe one day with a proper + * MOSPF/PIM router set up we can clean this up. + */ + +int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen) +{ + int ret; + struct mif6ctl vif; + struct mf6cctl mfc; + mifi_t mifi; + struct net *net = sock_net(sk); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? 
: RT6_TABLE_DFLT); + if (mrt == NULL) + return -ENOENT; + + if (optname != MRT6_INIT) { + if (sk != mrt->mroute6_sk && !capable(CAP_NET_ADMIN)) + return -EACCES; + } + + switch (optname) { + case MRT6_INIT: + if (sk->sk_type != SOCK_RAW || + inet_sk(sk)->inet_num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + if (optlen < sizeof(int)) + return -EINVAL; + + return ip6mr_sk_init(mrt, sk); + + case MRT6_DONE: + return ip6mr_sk_done(sk); + + case MRT6_ADD_MIF: + if (optlen < sizeof(vif)) + return -EINVAL; + if (copy_from_user(&vif, optval, sizeof(vif))) + return -EFAULT; + if (vif.mif6c_mifi >= MAXMIFS) + return -ENFILE; + rtnl_lock(); + ret = mif6_add(net, mrt, &vif, sk == mrt->mroute6_sk); + rtnl_unlock(); + return ret; + + case MRT6_DEL_MIF: + if (optlen < sizeof(mifi_t)) + return -EINVAL; + if (copy_from_user(&mifi, optval, sizeof(mifi_t))) + return -EFAULT; + rtnl_lock(); + ret = mif6_delete(mrt, mifi, NULL); + rtnl_unlock(); + return ret; + + /* + * Manipulate the forwarding caches. These live + * in a sort of kernel/user symbiosis. + */ + case MRT6_ADD_MFC: + case MRT6_DEL_MFC: + if (optlen < sizeof(mfc)) + return -EINVAL; + if (copy_from_user(&mfc, optval, sizeof(mfc))) + return -EFAULT; + rtnl_lock(); + if (optname == MRT6_DEL_MFC) + ret = ip6mr_mfc_delete(mrt, &mfc); + else + ret = ip6mr_mfc_add(net, mrt, &mfc, sk == mrt->mroute6_sk); + rtnl_unlock(); + return ret; + + /* + * Control PIM assert (to activate pim will activate assert) + */ + case MRT6_ASSERT: + { + int v; + if (get_user(v, (int __user *)optval)) + return -EFAULT; + mrt->mroute_do_assert = !!v; + return 0; + } + +#ifdef CONFIG_IPV6_PIMSM_V2 + case MRT6_PIM: + { + int v; + if (get_user(v, (int __user *)optval)) + return -EFAULT; + v = !!v; + rtnl_lock(); + ret = 0; + if (v != mrt->mroute_do_pim) { + mrt->mroute_do_pim = v; + mrt->mroute_do_assert = v; + } + rtnl_unlock(); + return ret; + } + +#endif +#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES + case MRT6_TABLE: + { + u32 v; + + if (optlen != sizeof(u32)) + return -EINVAL; + if (get_user(v, (u32 __user *)optval)) + return -EFAULT; + if (sk == mrt->mroute6_sk) + return -EBUSY; + + rtnl_lock(); + ret = 0; + if (!ip6mr_new_table(net, v)) + ret = -ENOMEM; + raw6_sk(sk)->ip6mr_table = v; + rtnl_unlock(); + return ret; + } +#endif + /* + * Spurious command, or MRT6_VERSION which you cannot + * set. + */ + default: + return -ENOPROTOOPT; + } +} + +/* + * Getsock opt support for the multicast routing system. + */ + +int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, + int __user *optlen) +{ + int olr; + int val; + struct net *net = sock_net(sk); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); + if (mrt == NULL) + return -ENOENT; + + switch (optname) { + case MRT6_VERSION: + val = 0x0305; + break; +#ifdef CONFIG_IPV6_PIMSM_V2 + case MRT6_PIM: + val = mrt->mroute_do_pim; + break; +#endif + case MRT6_ASSERT: + val = mrt->mroute_do_assert; + break; + default: + return -ENOPROTOOPT; + } + + if (get_user(olr, optlen)) + return -EFAULT; + + olr = min_t(int, olr, sizeof(int)); + if (olr < 0) + return -EINVAL; + + if (put_user(olr, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, olr)) + return -EFAULT; + return 0; +} + +/* + * The IP multicast ioctl support routines. 
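Before the ioctl handlers, a brief hedged illustration of the userspace side: reading the per-(S,G) packet, byte and wrong-interface counters that ip6mr_ioctl() copies out of mfc_un.res. The addresses are placeholders and mrt_sock is assumed to be the MRT6_INIT socket.

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <arpa/inet.h>
#include <linux/mroute6.h>

static void dump_sg_counters(int mrt_sock)
{
	struct sioc_sg_req6 sr;

	memset(&sr, 0, sizeof(sr));
	sr.src.sin6_family = AF_INET6;
	sr.grp.sin6_family = AF_INET6;
	inet_pton(AF_INET6, "2001:db8::1", &sr.src.sin6_addr);
	inet_pton(AF_INET6, "ff3e::4321", &sr.grp.sin6_addr);

	// Filled in from the matching mfc6_cache entry, or -EADDRNOTAVAIL if none.
	if (ioctl(mrt_sock, SIOCGETSGCNT_IN6, &sr) == 0)
		printf("pkts=%lu bytes=%lu wrong_if=%lu\n",
		       sr.pktcnt, sr.bytecnt, sr.wrong_if);
}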
+ */ + +int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) +{ + struct sioc_sg_req6 sr; + struct sioc_mif_req6 vr; + struct mif_device *vif; + struct mfc6_cache *c; + struct net *net = sock_net(sk); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); + if (mrt == NULL) + return -ENOENT; + + switch (cmd) { + case SIOCGETMIFCNT_IN6: + if (copy_from_user(&vr, arg, sizeof(vr))) + return -EFAULT; + if (vr.mifi >= mrt->maxvif) + return -EINVAL; + read_lock(&mrt_lock); + vif = &mrt->vif6_table[vr.mifi]; + if (MIF_EXISTS(mrt, vr.mifi)) { + vr.icount = vif->pkt_in; + vr.ocount = vif->pkt_out; + vr.ibytes = vif->bytes_in; + vr.obytes = vif->bytes_out; + read_unlock(&mrt_lock); + + if (copy_to_user(arg, &vr, sizeof(vr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + case SIOCGETSGCNT_IN6: + if (copy_from_user(&sr, arg, sizeof(sr))) + return -EFAULT; + + read_lock(&mrt_lock); + c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); + if (c) { + sr.pktcnt = c->mfc_un.res.pkt; + sr.bytecnt = c->mfc_un.res.bytes; + sr.wrong_if = c->mfc_un.res.wrong_if; + read_unlock(&mrt_lock); + + if (copy_to_user(arg, &sr, sizeof(sr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + default: + return -ENOIOCTLCMD; + } +} + +#ifdef CONFIG_COMPAT +struct compat_sioc_sg_req6 { + struct sockaddr_in6 src; + struct sockaddr_in6 grp; + compat_ulong_t pktcnt; + compat_ulong_t bytecnt; + compat_ulong_t wrong_if; +}; + +struct compat_sioc_mif_req6 { + mifi_t mifi; + compat_ulong_t icount; + compat_ulong_t ocount; + compat_ulong_t ibytes; + compat_ulong_t obytes; +}; + +int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) +{ + struct compat_sioc_sg_req6 sr; + struct compat_sioc_mif_req6 vr; + struct mif_device *vif; + struct mfc6_cache *c; + struct net *net = sock_net(sk); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? 
: RT6_TABLE_DFLT); + if (mrt == NULL) + return -ENOENT; + + switch (cmd) { + case SIOCGETMIFCNT_IN6: + if (copy_from_user(&vr, arg, sizeof(vr))) + return -EFAULT; + if (vr.mifi >= mrt->maxvif) + return -EINVAL; + read_lock(&mrt_lock); + vif = &mrt->vif6_table[vr.mifi]; + if (MIF_EXISTS(mrt, vr.mifi)) { + vr.icount = vif->pkt_in; + vr.ocount = vif->pkt_out; + vr.ibytes = vif->bytes_in; + vr.obytes = vif->bytes_out; + read_unlock(&mrt_lock); + + if (copy_to_user(arg, &vr, sizeof(vr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + case SIOCGETSGCNT_IN6: + if (copy_from_user(&sr, arg, sizeof(sr))) + return -EFAULT; + + read_lock(&mrt_lock); + c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); + if (c) { + sr.pktcnt = c->mfc_un.res.pkt; + sr.bytecnt = c->mfc_un.res.bytes; + sr.wrong_if = c->mfc_un.res.wrong_if; + read_unlock(&mrt_lock); + + if (copy_to_user(arg, &sr, sizeof(sr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + default: + return -ENOIOCTLCMD; + } +} +#endif + +static inline int ip6mr_forward2_finish(struct sk_buff *skb) +{ + IP6_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTFORWDATAGRAMS); + return dst_output(skb); +} + +/* + * Processing handlers for ip6mr_forward + */ + +static int ip6mr_forward2(struct net *net, struct mr6_table *mrt, + struct sk_buff *skb, struct mfc6_cache *c, int vifi) +{ + struct ipv6hdr *ipv6h; + struct mif_device *vif = &mrt->vif6_table[vifi]; + struct net_device *dev; + struct dst_entry *dst; + struct flowi6 fl6; + + if (vif->dev == NULL) + goto out_free; + +#ifdef CONFIG_IPV6_PIMSM_V2 + if (vif->flags & MIFF_REGISTER) { + vif->pkt_out++; + vif->bytes_out += skb->len; + vif->dev->stats.tx_bytes += skb->len; + vif->dev->stats.tx_packets++; + ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT); + goto out_free; + } +#endif + + ipv6h = ipv6_hdr(skb); + + fl6 = (struct flowi6) { + .flowi6_oif = vif->link, + .daddr = ipv6h->daddr, + }; + + dst = ip6_route_output(net, NULL, &fl6); + if (!dst) + goto out_free; + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + /* + * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally + * not only before forwarding, but after forwarding on all output + * interfaces. It is clear, if mrouter runs a multicasting + * program, it should receive packets not depending to what interface + * program is joined. + * If we will not make it, the program will have to join on all + * interfaces. On the other hand, multihoming host (or router, but + * not mrouter) cannot join to more than one interface - it will + * result in receiving multiple packets. + */ + dev = vif->dev; + skb->dev = dev; + vif->pkt_out++; + vif->bytes_out += skb->len; + + /* We are about to write */ + /* XXX: extension headers? 
*/ + if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(dev))) + goto out_free; + + ipv6h = ipv6_hdr(skb); + ipv6h->hop_limit--; + + IP6CB(skb)->flags |= IP6SKB_FORWARDED; + + return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dev, + ip6mr_forward2_finish); + +out_free: + kfree_skb(skb); + return 0; +} + +static int ip6mr_find_vif(struct mr6_table *mrt, struct net_device *dev) +{ + int ct; + + for (ct = mrt->maxvif - 1; ct >= 0; ct--) { + if (mrt->vif6_table[ct].dev == dev) + break; + } + return ct; +} + +static int ip6_mr_forward(struct net *net, struct mr6_table *mrt, + struct sk_buff *skb, struct mfc6_cache *cache) +{ + int psend = -1; + int vif, ct; + + vif = cache->mf6c_parent; + cache->mfc_un.res.pkt++; + cache->mfc_un.res.bytes += skb->len; + + /* + * Wrong interface: drop packet and (maybe) send PIM assert. + */ + if (mrt->vif6_table[vif].dev != skb->dev) { + int true_vifi; + + cache->mfc_un.res.wrong_if++; + true_vifi = ip6mr_find_vif(mrt, skb->dev); + + if (true_vifi >= 0 && mrt->mroute_do_assert && + /* pimsm uses asserts, when switching from RPT to SPT, + so that we cannot check that packet arrived on an oif. + It is bad, but otherwise we would need to move pretty + large chunk of pimd to kernel. Ough... --ANK + */ + (mrt->mroute_do_pim || + cache->mfc_un.res.ttls[true_vifi] < 255) && + time_after(jiffies, + cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { + cache->mfc_un.res.last_assert = jiffies; + ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF); + } + goto dont_forward; + } + + mrt->vif6_table[vif].pkt_in++; + mrt->vif6_table[vif].bytes_in += skb->len; + + /* + * Forward the frame + */ + for (ct = cache->mfc_un.res.maxvif - 1; ct >= cache->mfc_un.res.minvif; ct--) { + if (ipv6_hdr(skb)->hop_limit > cache->mfc_un.res.ttls[ct]) { + if (psend != -1) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + ip6mr_forward2(net, mrt, skb2, cache, psend); + } + psend = ct; + } + } + if (psend != -1) { + ip6mr_forward2(net, mrt, skb, cache, psend); + return 0; + } + +dont_forward: + kfree_skb(skb); + return 0; +} + + +/* + * Multicast packets for forwarding arrive here + */ + +int ip6_mr_input(struct sk_buff *skb) +{ + struct mfc6_cache *cache; + struct net *net = dev_net(skb->dev); + struct mr6_table *mrt; + struct flowi6 fl6 = { + .flowi6_iif = skb->dev->ifindex, + .flowi6_mark = skb->mark, + }; + int err; + + err = ip6mr_fib_lookup(net, &fl6, &mrt); + if (err < 0) { + kfree_skb(skb); + return err; + } + + read_lock(&mrt_lock); + cache = ip6mr_cache_find(mrt, + &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); + + /* + * No usable cache entry + */ + if (cache == NULL) { + int vif; + + vif = ip6mr_find_vif(mrt, skb->dev); + if (vif >= 0) { + int err = ip6mr_cache_unresolved(mrt, vif, skb); + read_unlock(&mrt_lock); + + return err; + } + read_unlock(&mrt_lock); + kfree_skb(skb); + return -ENODEV; + } + + ip6_mr_forward(net, mrt, skb, cache); + + read_unlock(&mrt_lock); + + return 0; +} + + +static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, + struct mfc6_cache *c, struct rtmsg *rtm) +{ + int ct; + struct rtnexthop *nhp; + u8 *b = skb_tail_pointer(skb); + struct rtattr *mp_head; + + /* If cache is unresolved, don't try to parse IIF and OIF */ + if (c->mf6c_parent >= MAXMIFS) + return -ENOENT; + + if (MIF_EXISTS(mrt, c->mf6c_parent)) + RTA_PUT(skb, RTA_IIF, 4, &mrt->vif6_table[c->mf6c_parent].dev->ifindex); + + mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0)); + + for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; 
ct++) { + if (MIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { + if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) + goto rtattr_failure; + nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); + nhp->rtnh_flags = 0; + nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; + nhp->rtnh_ifindex = mrt->vif6_table[ct].dev->ifindex; + nhp->rtnh_len = sizeof(*nhp); + } + } + mp_head->rta_type = RTA_MULTIPATH; + mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head; + rtm->rtm_type = RTN_MULTICAST; + return 1; + +rtattr_failure: + nlmsg_trim(skb, b); + return -EMSGSIZE; +} + +int ip6mr_get_route(struct net *net, + struct sk_buff *skb, struct rtmsg *rtm, int nowait) +{ + int err; + struct mr6_table *mrt; + struct mfc6_cache *cache; + struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + + mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); + if (mrt == NULL) + return -ENOENT; + + read_lock(&mrt_lock); + cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr); + + if (!cache) { + struct sk_buff *skb2; + struct ipv6hdr *iph; + struct net_device *dev; + int vif; + + if (nowait) { + read_unlock(&mrt_lock); + return -EAGAIN; + } + + dev = skb->dev; + if (dev == NULL || (vif = ip6mr_find_vif(mrt, dev)) < 0) { + read_unlock(&mrt_lock); + return -ENODEV; + } + + /* really correct? */ + skb2 = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); + if (!skb2) { + read_unlock(&mrt_lock); + return -ENOMEM; + } + + skb_reset_transport_header(skb2); + + skb_put(skb2, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb2); + + iph = ipv6_hdr(skb2); + iph->version = 0; + iph->priority = 0; + iph->flow_lbl[0] = 0; + iph->flow_lbl[1] = 0; + iph->flow_lbl[2] = 0; + iph->payload_len = 0; + iph->nexthdr = IPPROTO_NONE; + iph->hop_limit = 0; + ipv6_addr_copy(&iph->saddr, &rt->rt6i_src.addr); + ipv6_addr_copy(&iph->daddr, &rt->rt6i_dst.addr); + + err = ip6mr_cache_unresolved(mrt, vif, skb2); + read_unlock(&mrt_lock); + + return err; + } + + if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) + cache->mfc_flags |= MFC_NOTIFY; + + err = __ip6mr_fill_mroute(mrt, skb, cache, rtm); + read_unlock(&mrt_lock); + return err; +} + +static int ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, + u32 pid, u32 seq, struct mfc6_cache *c) +{ + struct nlmsghdr *nlh; + struct rtmsg *rtm; + + nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI); + if (nlh == NULL) + return -EMSGSIZE; + + rtm = nlmsg_data(nlh); + rtm->rtm_family = RTNL_FAMILY_IPMR; + rtm->rtm_dst_len = 128; + rtm->rtm_src_len = 128; + rtm->rtm_tos = 0; + rtm->rtm_table = mrt->id; + NLA_PUT_U32(skb, RTA_TABLE, mrt->id); + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + rtm->rtm_protocol = RTPROT_UNSPEC; + rtm->rtm_flags = 0; + + NLA_PUT(skb, RTA_SRC, 16, &c->mf6c_origin); + NLA_PUT(skb, RTA_DST, 16, &c->mf6c_mcastgrp); + + if (__ip6mr_fill_mroute(mrt, skb, c, rtm) < 0) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct mr6_table *mrt; + struct mfc6_cache *mfc; + unsigned int t = 0, s_t; + unsigned int h = 0, s_h; + unsigned int e = 0, s_e; + + s_t = cb->args[0]; + s_h = cb->args[1]; + s_e = cb->args[2]; + + read_lock(&mrt_lock); + ip6mr_for_each_table(mrt, net) { + if (t < s_t) + goto next_table; + if (t > s_t) + s_h = 0; + for (h = s_h; h < MFC6_LINES; h++) { + list_for_each_entry(mfc, &mrt->mfc6_cache_array[h], list) { + if (e 
< s_e) + goto next_entry; + if (ip6mr_fill_mroute(mrt, skb, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + mfc) < 0) + goto done; +next_entry: + e++; + } + e = s_e = 0; + } + s_h = 0; +next_table: + t++; + } +done: + read_unlock(&mrt_lock); + + cb->args[2] = e; + cb->args[1] = h; + cb->args[0] = t; + + return skb->len; +} diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c new file mode 100644 index 00000000..bba658d9 --- /dev/null +++ b/net/ipv6/ipcomp6.c @@ -0,0 +1,218 @@ +/* + * IP Payload Compression Protocol (IPComp) for IPv6 - RFC3173 + * + * Copyright (C)2003 USAGI/WIDE Project + * + * Author Mitsuru KANDA + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * [Memo] + * + * Outbound: + * The compression of IP datagram MUST be done before AH/ESP processing, + * fragmentation, and the addition of Hop-by-Hop/Routing header. + * + * Inbound: + * The decompression of IP datagram MUST be done after the reassembly, + * AH/ESP processing. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct net *net = dev_net(skb->dev); + __be32 spi; + const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; + struct ip_comp_hdr *ipcomph = + (struct ip_comp_hdr *)(skb->data + offset); + struct xfrm_state *x; + + if (type != ICMPV6_DEST_UNREACH && type != ICMPV6_PKT_TOOBIG) + return; + + spi = htonl(ntohs(ipcomph->cpi)); + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + spi, IPPROTO_COMP, AF_INET6); + if (!x) + return; + + printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI6\n", + spi, &iph->daddr); + xfrm_state_put(x); +} + +static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) +{ + struct net *net = xs_net(x); + struct xfrm_state *t = NULL; + + t = xfrm_state_alloc(net); + if (!t) + goto out; + + t->id.proto = IPPROTO_IPV6; + t->id.spi = xfrm6_tunnel_alloc_spi(net, (xfrm_address_t *)&x->props.saddr); + if (!t->id.spi) + goto error; + + memcpy(t->id.daddr.a6, x->id.daddr.a6, sizeof(struct in6_addr)); + memcpy(&t->sel, &x->sel, sizeof(t->sel)); + t->props.family = AF_INET6; + t->props.mode = x->props.mode; + memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr)); + memcpy(&t->mark, &x->mark, sizeof(t->mark)); + + if (xfrm_init_state(t)) + goto error; + + atomic_set(&t->tunnel_users, 1); + +out: + return t; + +error: + t->km.state = XFRM_STATE_DEAD; + xfrm_state_put(t); + t = NULL; + goto out; +} + +static int ipcomp6_tunnel_attach(struct xfrm_state *x) +{ + struct net *net = xs_net(x); + int err = 0; + struct xfrm_state *t = NULL; + __be32 spi; + u32 mark = x->mark.m & x->mark.v; + + spi = 
xfrm6_tunnel_spi_lookup(net, (xfrm_address_t *)&x->props.saddr); + if (spi) + t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr, + spi, IPPROTO_IPV6, AF_INET6); + if (!t) { + t = ipcomp6_tunnel_create(x); + if (!t) { + err = -EINVAL; + goto out; + } + xfrm_state_insert(t); + xfrm_state_hold(t); + } + x->tunnel = t; + atomic_inc(&t->tunnel_users); + +out: + return err; +} + +static int ipcomp6_init_state(struct xfrm_state *x) +{ + int err = -EINVAL; + + x->props.header_len = 0; + switch (x->props.mode) { + case XFRM_MODE_TRANSPORT: + break; + case XFRM_MODE_TUNNEL: + x->props.header_len += sizeof(struct ipv6hdr); + break; + default: + goto out; + } + + err = ipcomp_init_state(x); + if (err) + goto out; + + if (x->props.mode == XFRM_MODE_TUNNEL) { + err = ipcomp6_tunnel_attach(x); + if (err) + goto out; + } + + err = 0; +out: + return err; +} + +static const struct xfrm_type ipcomp6_type = +{ + .description = "IPCOMP6", + .owner = THIS_MODULE, + .proto = IPPROTO_COMP, + .init_state = ipcomp6_init_state, + .destructor = ipcomp_destroy, + .input = ipcomp_input, + .output = ipcomp_output, + .hdr_offset = xfrm6_find_1stfragopt, +}; + +static const struct inet6_protocol ipcomp6_protocol = +{ + .handler = xfrm6_rcv, + .err_handler = ipcomp6_err, + .flags = INET6_PROTO_NOPOLICY, +}; + +static int __init ipcomp6_init(void) +{ + if (xfrm_register_type(&ipcomp6_type, AF_INET6) < 0) { + printk(KERN_INFO "ipcomp6 init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet6_add_protocol(&ipcomp6_protocol, IPPROTO_COMP) < 0) { + printk(KERN_INFO "ipcomp6 init: can't add protocol\n"); + xfrm_unregister_type(&ipcomp6_type, AF_INET6); + return -EAGAIN; + } + return 0; +} + +static void __exit ipcomp6_fini(void) +{ + if (inet6_del_protocol(&ipcomp6_protocol, IPPROTO_COMP) < 0) + printk(KERN_INFO "ipv6 ipcomp close: can't remove protocol\n"); + if (xfrm_unregister_type(&ipcomp6_type, AF_INET6) < 0) + printk(KERN_INFO "ipv6 ipcomp close: can't remove xfrm type\n"); +} + +module_init(ipcomp6_init); +module_exit(ipcomp6_fini); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) for IPv6 - RFC3173"); +MODULE_AUTHOR("Mitsuru KANDA "); + +MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_COMP); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c new file mode 100644 index 00000000..147ede38 --- /dev/null +++ b/net/ipv6/ipv6_sockglue.c @@ -0,0 +1,1290 @@ +/* + * IPv6 BSD socket options interface + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * + * Based on linux/net/ipv4/ip_sockglue.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * FIXME: Make the setsockopt code POSIX compliant: That is + * + * o Truncate getsockopt returns + * o Return an optlen of the truncated length if need be + * + * Changes: + * David L Stevens : + * - added multicast source filtering API for MLDv2 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct ip6_ra_chain *ip6_ra_chain; +DEFINE_RWLOCK(ip6_ra_lock); + +int ip6_ra_control(struct sock *sk, int sel) +{ + struct ip6_ra_chain *ra, *new_ra, **rap; + + /* RA packet may be delivered ONLY to IPPROTO_RAW socket */ + if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_RAW) + return -ENOPROTOOPT; + + new_ra = (sel>=0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; + + write_lock_bh(&ip6_ra_lock); + for (rap = &ip6_ra_chain; (ra=*rap) != NULL; rap = &ra->next) { + if (ra->sk == sk) { + if (sel>=0) { + write_unlock_bh(&ip6_ra_lock); + kfree(new_ra); + return -EADDRINUSE; + } + + *rap = ra->next; + write_unlock_bh(&ip6_ra_lock); + + sock_put(sk); + kfree(ra); + return 0; + } + } + if (new_ra == NULL) { + write_unlock_bh(&ip6_ra_lock); + return -ENOBUFS; + } + new_ra->sk = sk; + new_ra->sel = sel; + new_ra->next = ra; + *rap = new_ra; + sock_hold(sk); + write_unlock_bh(&ip6_ra_lock); + return 0; +} + +static +struct ipv6_txoptions *ipv6_update_options(struct sock *sk, + struct ipv6_txoptions *opt) +{ + if (inet_sk(sk)->is_icsk) { + if (opt && + !((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && + inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) { + struct inet_connection_sock *icsk = inet_csk(sk); + icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); + } + opt = xchg(&inet6_sk(sk)->opt, opt); + } else { + spin_lock(&sk->sk_dst_lock); + opt = xchg(&inet6_sk(sk)->opt, opt); + spin_unlock(&sk->sk_dst_lock); + } + sk_dst_reset(sk); + + return opt; +} + +static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); + int val, valbool; + int retv = -ENOPROTOOPT; + + if (optval == NULL) + val=0; + else { + if (optlen >= sizeof(int)) { + if (get_user(val, (int __user *) optval)) + return -EFAULT; + } else + val = 0; + } + + valbool = (val!=0); + + if (ip6_mroute_opt(optname)) + return ip6_mroute_setsockopt(sk, optname, optval, optlen); + + lock_sock(sk); + + switch (optname) { + + case IPV6_ADDRFORM: + if (optlen < sizeof(int)) + goto e_inval; + if (val == PF_INET) { + struct ipv6_txoptions *opt; + struct sk_buff *pktopt; + + if (sk->sk_type == SOCK_RAW) + break; + + if (sk->sk_protocol == IPPROTO_UDP || + sk->sk_protocol == IPPROTO_UDPLITE) { + struct udp_sock *up = udp_sk(sk); + if (up->pending == AF_INET6) { + retv = -EBUSY; + break; + } + } else if (sk->sk_protocol != IPPROTO_TCP) + break; + + if (sk->sk_state != TCP_ESTABLISHED) { + retv = -ENOTCONN; + break; + } + + if (ipv6_only_sock(sk) || + !ipv6_addr_v4mapped(&np->daddr)) { + retv = -EADDRNOTAVAIL; + break; + } + + fl6_free_socklist(sk); + ipv6_sock_mc_close(sk); + + /* + * Sock is moving from IPv6 to IPv4 (sk_prot), so + * remove it from the refcnt debug socks count in the + * original family... 
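As a quick aside on the IPV6_ADDRFORM case being handled here, this is what the conversion looks like from userspace (editor's sketch, not part of the patch); it only succeeds on a connected TCP or UDP socket whose peer address is v4-mapped, exactly as the checks above require.

#include <sys/socket.h>
#include <netinet/in.h>

static int downgrade_to_ipv4(int fd)
{
	int af = AF_INET;

	// After this succeeds the socket behaves as an AF_INET socket:
	// sk_prot, icsk_af_ops and sk_family are switched in the code below.
	return setsockopt(fd, IPPROTO_IPV6, IPV6_ADDRFORM, &af, sizeof(af));
}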
+ */ + sk_refcnt_debug_dec(sk); + + if (sk->sk_protocol == IPPROTO_TCP) { + struct inet_connection_sock *icsk = inet_csk(sk); + local_bh_disable(); + sock_prot_inuse_add(net, sk->sk_prot, -1); + sock_prot_inuse_add(net, &tcp_prot, 1); + local_bh_enable(); + sk->sk_prot = &tcp_prot; + icsk->icsk_af_ops = &ipv4_specific; + sk->sk_socket->ops = &inet_stream_ops; + sk->sk_family = PF_INET; + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); + } else { + struct proto *prot = &udp_prot; + + if (sk->sk_protocol == IPPROTO_UDPLITE) + prot = &udplite_prot; + local_bh_disable(); + sock_prot_inuse_add(net, sk->sk_prot, -1); + sock_prot_inuse_add(net, prot, 1); + local_bh_enable(); + sk->sk_prot = prot; + sk->sk_socket->ops = &inet_dgram_ops; + sk->sk_family = PF_INET; + } + opt = xchg(&np->opt, NULL); + if (opt) + sock_kfree_s(sk, opt, opt->tot_len); + pktopt = xchg(&np->pktoptions, NULL); + kfree_skb(pktopt); + + sk->sk_destruct = inet_sock_destruct; + /* + * ... and add it to the refcnt debug socks count + * in the new family. -acme + */ + sk_refcnt_debug_inc(sk); + module_put(THIS_MODULE); + retv = 0; + break; + } + goto e_inval; + + case IPV6_V6ONLY: + if (optlen < sizeof(int) || + inet_sk(sk)->inet_num) + goto e_inval; + np->ipv6only = valbool; + retv = 0; + break; + + case IPV6_RECVPKTINFO: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.rxinfo = valbool; + retv = 0; + break; + + case IPV6_2292PKTINFO: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.rxoinfo = valbool; + retv = 0; + break; + + case IPV6_RECVHOPLIMIT: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.rxhlim = valbool; + retv = 0; + break; + + case IPV6_2292HOPLIMIT: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.rxohlim = valbool; + retv = 0; + break; + + case IPV6_RECVRTHDR: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.srcrt = valbool; + retv = 0; + break; + + case IPV6_2292RTHDR: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.osrcrt = valbool; + retv = 0; + break; + + case IPV6_RECVHOPOPTS: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.hopopts = valbool; + retv = 0; + break; + + case IPV6_2292HOPOPTS: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.ohopopts = valbool; + retv = 0; + break; + + case IPV6_RECVDSTOPTS: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.dstopts = valbool; + retv = 0; + break; + + case IPV6_2292DSTOPTS: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.odstopts = valbool; + retv = 0; + break; + + case IPV6_TCLASS: + if (optlen < sizeof(int)) + goto e_inval; + if (val < -1 || val > 0xff) + goto e_inval; + /* RFC 3542, 6.5: default traffic class of 0x0 */ + if (val == -1) + val = 0; + np->tclass = val; + retv = 0; + break; + + case IPV6_RECVTCLASS: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.rxtclass = valbool; + retv = 0; + break; + + case IPV6_FLOWINFO: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.rxflow = valbool; + retv = 0; + break; + + case IPV6_RECVPATHMTU: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.rxpmtu = valbool; + retv = 0; + break; + + case IPV6_TRANSPARENT: + if (!capable(CAP_NET_ADMIN)) { + retv = -EPERM; + break; + } + if (optlen < sizeof(int)) + goto e_inval; + /* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */ + inet_sk(sk)->transparent = valbool; + retv = 0; + break; + + case IPV6_RECVORIGDSTADDR: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.rxorigdstaddr = 
valbool; + retv = 0; + break; + + case IPV6_HOPOPTS: + case IPV6_RTHDRDSTOPTS: + case IPV6_RTHDR: + case IPV6_DSTOPTS: + { + struct ipv6_txoptions *opt; + + /* remove any sticky options header with a zero option + * length, per RFC3542. + */ + if (optlen == 0) + optval = NULL; + else if (optval == NULL) + goto e_inval; + else if (optlen < sizeof(struct ipv6_opt_hdr) || + optlen & 0x7 || optlen > 8 * 255) + goto e_inval; + + /* hop-by-hop / destination options are privileged option */ + retv = -EPERM; + if (optname != IPV6_RTHDR && !capable(CAP_NET_RAW)) + break; + + opt = ipv6_renew_options(sk, np->opt, optname, + (struct ipv6_opt_hdr __user *)optval, + optlen); + if (IS_ERR(opt)) { + retv = PTR_ERR(opt); + break; + } + + /* routing header option needs extra check */ + retv = -EINVAL; + if (optname == IPV6_RTHDR && opt && opt->srcrt) { + struct ipv6_rt_hdr *rthdr = opt->srcrt; + switch (rthdr->type) { +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + case IPV6_SRCRT_TYPE_2: + if (rthdr->hdrlen != 2 || + rthdr->segments_left != 1) + goto sticky_done; + + break; +#endif + default: + goto sticky_done; + } + } + + retv = 0; + opt = ipv6_update_options(sk, opt); +sticky_done: + if (opt) + sock_kfree_s(sk, opt, opt->tot_len); + break; + } + + case IPV6_PKTINFO: + { + struct in6_pktinfo pkt; + + if (optlen == 0) + goto e_inval; + else if (optlen < sizeof(struct in6_pktinfo) || optval == NULL) + goto e_inval; + + if (copy_from_user(&pkt, optval, sizeof(struct in6_pktinfo))) { + retv = -EFAULT; + break; + } + if (sk->sk_bound_dev_if && pkt.ipi6_ifindex != sk->sk_bound_dev_if) + goto e_inval; + + np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex; + ipv6_addr_copy(&np->sticky_pktinfo.ipi6_addr, &pkt.ipi6_addr); + retv = 0; + break; + } + + case IPV6_2292PKTOPTIONS: + { + struct ipv6_txoptions *opt = NULL; + struct msghdr msg; + struct flowi6 fl6; + int junk; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; + + if (optlen == 0) + goto update; + + /* 1K is probably excessive + * 1K is surely not enough, 2K per standard header is 16K. 
+ */ + retv = -EINVAL; + if (optlen > 64*1024) + break; + + opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL); + retv = -ENOBUFS; + if (opt == NULL) + break; + + memset(opt, 0, sizeof(*opt)); + opt->tot_len = sizeof(*opt) + optlen; + retv = -EFAULT; + if (copy_from_user(opt+1, optval, optlen)) + goto done; + + msg.msg_controllen = optlen; + msg.msg_control = (void*)(opt+1); + + retv = datagram_send_ctl(net, &msg, &fl6, opt, &junk, &junk, + &junk); + if (retv) + goto done; +update: + retv = 0; + opt = ipv6_update_options(sk, opt); +done: + if (opt) + sock_kfree_s(sk, opt, opt->tot_len); + break; + } + case IPV6_UNICAST_HOPS: + if (optlen < sizeof(int)) + goto e_inval; + if (val > 255 || val < -1) + goto e_inval; + np->hop_limit = val; + retv = 0; + break; + + case IPV6_MULTICAST_HOPS: + if (sk->sk_type == SOCK_STREAM) + break; + if (optlen < sizeof(int)) + goto e_inval; + if (val > 255 || val < -1) + goto e_inval; + np->mcast_hops = val; + retv = 0; + break; + + case IPV6_MULTICAST_LOOP: + if (optlen < sizeof(int)) + goto e_inval; + if (val != valbool) + goto e_inval; + np->mc_loop = valbool; + retv = 0; + break; + + case IPV6_MULTICAST_IF: + if (sk->sk_type == SOCK_STREAM) + break; + if (optlen < sizeof(int)) + goto e_inval; + + if (val) { + struct net_device *dev; + + if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != val) + goto e_inval; + + dev = dev_get_by_index(net, val); + if (!dev) { + retv = -ENODEV; + break; + } + dev_put(dev); + } + np->mcast_oif = val; + retv = 0; + break; + case IPV6_ADD_MEMBERSHIP: + case IPV6_DROP_MEMBERSHIP: + { + struct ipv6_mreq mreq; + + if (optlen < sizeof(struct ipv6_mreq)) + goto e_inval; + + retv = -EPROTO; + if (inet_sk(sk)->is_icsk) + break; + + retv = -EFAULT; + if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq))) + break; + + if (optname == IPV6_ADD_MEMBERSHIP) + retv = ipv6_sock_mc_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); + else + retv = ipv6_sock_mc_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); + break; + } + case IPV6_JOIN_ANYCAST: + case IPV6_LEAVE_ANYCAST: + { + struct ipv6_mreq mreq; + + if (optlen < sizeof(struct ipv6_mreq)) + goto e_inval; + + retv = -EFAULT; + if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq))) + break; + + if (optname == IPV6_JOIN_ANYCAST) + retv = ipv6_sock_ac_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); + else + retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); + break; + } + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + { + struct group_req greq; + struct sockaddr_in6 *psin6; + + if (optlen < sizeof(struct group_req)) + goto e_inval; + + retv = -EFAULT; + if (copy_from_user(&greq, optval, sizeof(struct group_req))) + break; + if (greq.gr_group.ss_family != AF_INET6) { + retv = -EADDRNOTAVAIL; + break; + } + psin6 = (struct sockaddr_in6 *)&greq.gr_group; + if (optname == MCAST_JOIN_GROUP) + retv = ipv6_sock_mc_join(sk, greq.gr_interface, + &psin6->sin6_addr); + else + retv = ipv6_sock_mc_drop(sk, greq.gr_interface, + &psin6->sin6_addr); + break; + } + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + { + struct group_source_req greqs; + int omode, add; + + if (optlen < sizeof(struct group_source_req)) + goto e_inval; + if (copy_from_user(&greqs, optval, sizeof(greqs))) { + retv = -EFAULT; + break; + } + if (greqs.gsr_group.ss_family != AF_INET6 || + greqs.gsr_source.ss_family != AF_INET6) { + retv = -EADDRNOTAVAIL; + break; + } + if (optname == MCAST_BLOCK_SOURCE) { + 
omode = MCAST_EXCLUDE; + add = 1; + } else if (optname == MCAST_UNBLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 0; + } else if (optname == MCAST_JOIN_SOURCE_GROUP) { + struct sockaddr_in6 *psin6; + + psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; + retv = ipv6_sock_mc_join(sk, greqs.gsr_interface, + &psin6->sin6_addr); + /* prior join w/ different source is ok */ + if (retv && retv != -EADDRINUSE) + break; + omode = MCAST_INCLUDE; + add = 1; + } else /* MCAST_LEAVE_SOURCE_GROUP */ { + omode = MCAST_INCLUDE; + add = 0; + } + retv = ip6_mc_source(add, omode, sk, &greqs); + break; + } + case MCAST_MSFILTER: + { + extern int sysctl_mld_max_msf; + struct group_filter *gsf; + + if (optlen < GROUP_FILTER_SIZE(0)) + goto e_inval; + if (optlen > sysctl_optmem_max) { + retv = -ENOBUFS; + break; + } + gsf = kmalloc(optlen,GFP_KERNEL); + if (!gsf) { + retv = -ENOBUFS; + break; + } + retv = -EFAULT; + if (copy_from_user(gsf, optval, optlen)) { + kfree(gsf); + break; + } + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + if (gsf->gf_numsrc >= 0x1ffffffU || + gsf->gf_numsrc > sysctl_mld_max_msf) { + kfree(gsf); + retv = -ENOBUFS; + break; + } + if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) { + kfree(gsf); + retv = -EINVAL; + break; + } + retv = ip6_mc_msfilter(sk, gsf); + kfree(gsf); + + break; + } + case IPV6_ROUTER_ALERT: + if (optlen < sizeof(int)) + goto e_inval; + retv = ip6_ra_control(sk, val); + break; + case IPV6_MTU_DISCOVER: + if (optlen < sizeof(int)) + goto e_inval; + if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE) + goto e_inval; + np->pmtudisc = val; + retv = 0; + break; + case IPV6_MTU: + if (optlen < sizeof(int)) + goto e_inval; + if (val && val < IPV6_MIN_MTU) + goto e_inval; + np->frag_size = val; + retv = 0; + break; + case IPV6_RECVERR: + if (optlen < sizeof(int)) + goto e_inval; + np->recverr = valbool; + if (!val) + skb_queue_purge(&sk->sk_error_queue); + retv = 0; + break; + case IPV6_FLOWINFO_SEND: + if (optlen < sizeof(int)) + goto e_inval; + np->sndflow = valbool; + retv = 0; + break; + case IPV6_FLOWLABEL_MGR: + retv = ipv6_flowlabel_opt(sk, optval, optlen); + break; + case IPV6_IPSEC_POLICY: + case IPV6_XFRM_POLICY: + retv = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + retv = xfrm_user_policy(sk, optname, optval, optlen); + break; + + case IPV6_ADDR_PREFERENCES: + { + unsigned int pref = 0; + unsigned int prefmask = ~0; + + if (optlen < sizeof(int)) + goto e_inval; + + retv = -EINVAL; + + /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */ + switch (val & (IPV6_PREFER_SRC_PUBLIC| + IPV6_PREFER_SRC_TMP| + IPV6_PREFER_SRC_PUBTMP_DEFAULT)) { + case IPV6_PREFER_SRC_PUBLIC: + pref |= IPV6_PREFER_SRC_PUBLIC; + break; + case IPV6_PREFER_SRC_TMP: + pref |= IPV6_PREFER_SRC_TMP; + break; + case IPV6_PREFER_SRC_PUBTMP_DEFAULT: + break; + case 0: + goto pref_skip_pubtmp; + default: + goto e_inval; + } + + prefmask &= ~(IPV6_PREFER_SRC_PUBLIC| + IPV6_PREFER_SRC_TMP); +pref_skip_pubtmp: + + /* check HOME/COA conflicts */ + switch (val & (IPV6_PREFER_SRC_HOME|IPV6_PREFER_SRC_COA)) { + case IPV6_PREFER_SRC_HOME: + break; + case IPV6_PREFER_SRC_COA: + pref |= IPV6_PREFER_SRC_COA; + case 0: + goto pref_skip_coa; + default: + goto e_inval; + } + + prefmask &= ~IPV6_PREFER_SRC_COA; +pref_skip_coa: + + /* check CGA/NONCGA conflicts */ + switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) { + case IPV6_PREFER_SRC_CGA: + case IPV6_PREFER_SRC_NONCGA: + case 0: + break; + default: + goto e_inval; + } + + np->srcprefs = (np->srcprefs & prefmask) | pref; + retv = 0; + + break; + 
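As an aside, the IPV6_ADDR_PREFERENCES logic above implements the RFC 5014 source-address selection flags and rejects conflicting PUBLIC/TMP and HOME/COA combinations with EINVAL. A minimal userspace sketch that drives this path might look as follows (assuming a libc that exposes IPV6_ADDR_PREFERENCES and the IPV6_PREFER_SRC_* constants; on older toolchains they come from linux/in6.h):

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
        int fd = socket(AF_INET6, SOCK_DGRAM, 0);
        /* prefer temporary (privacy) source addresses; a conflicting
         * PUBLIC|TMP combination would be rejected by the switch above */
        int pref = IPV6_PREFER_SRC_TMP;

        if (fd < 0 || setsockopt(fd, IPPROTO_IPV6, IPV6_ADDR_PREFERENCES,
                                 &pref, sizeof(pref)) < 0)
                perror("IPV6_ADDR_PREFERENCES");
        return 0;
}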
} + case IPV6_MINHOPCOUNT: + if (optlen < sizeof(int)) + goto e_inval; + if (val < 0 || val > 255) + goto e_inval; + np->min_hopcount = val; + break; + case IPV6_DONTFRAG: + np->dontfrag = valbool; + retv = 0; + break; + } + + release_sock(sk); + + return retv; + +e_inval: + release_sock(sk); + return -EINVAL; +} + +int ipv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + int err; + + if (level == SOL_IP && sk->sk_type != SOCK_RAW) + return udp_prot.setsockopt(sk, level, optname, optval, optlen); + + if (level != SOL_IPV6) + return -ENOPROTOOPT; + + err = do_ipv6_setsockopt(sk, level, optname, optval, optlen); +#ifdef CONFIG_NETFILTER + /* we need to exclude all possible ENOPROTOOPTs except default case */ + if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY && + optname != IPV6_XFRM_POLICY) { + lock_sock(sk); + err = nf_setsockopt(sk, PF_INET6, optname, optval, + optlen); + release_sock(sk); + } +#endif + return err; +} + +EXPORT_SYMBOL(ipv6_setsockopt); + +#ifdef CONFIG_COMPAT +int compat_ipv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + int err; + + if (level == SOL_IP && sk->sk_type != SOCK_RAW) { + if (udp_prot.compat_setsockopt != NULL) + return udp_prot.compat_setsockopt(sk, level, optname, + optval, optlen); + return udp_prot.setsockopt(sk, level, optname, optval, optlen); + } + + if (level != SOL_IPV6) + return -ENOPROTOOPT; + + if (optname >= MCAST_JOIN_GROUP && optname <= MCAST_MSFILTER) + return compat_mc_setsockopt(sk, level, optname, optval, optlen, + ipv6_setsockopt); + + err = do_ipv6_setsockopt(sk, level, optname, optval, optlen); +#ifdef CONFIG_NETFILTER + /* we need to exclude all possible ENOPROTOOPTs except default case */ + if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY && + optname != IPV6_XFRM_POLICY) { + lock_sock(sk); + err = compat_nf_setsockopt(sk, PF_INET6, optname, + optval, optlen); + release_sock(sk); + } +#endif + return err; +} + +EXPORT_SYMBOL(compat_ipv6_setsockopt); +#endif + +static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, + int optname, char __user *optval, int len) +{ + struct ipv6_opt_hdr *hdr; + + if (!opt) + return 0; + + switch(optname) { + case IPV6_HOPOPTS: + hdr = opt->hopopt; + break; + case IPV6_RTHDRDSTOPTS: + hdr = opt->dst0opt; + break; + case IPV6_RTHDR: + hdr = (struct ipv6_opt_hdr *)opt->srcrt; + break; + case IPV6_DSTOPTS: + hdr = opt->dst1opt; + break; + default: + return -EINVAL; /* should not happen */ + } + + if (!hdr) + return 0; + + len = min_t(unsigned int, len, ipv6_optlen(hdr)); + if (copy_to_user(optval, hdr, len)) + return -EFAULT; + return len; +} + +static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen, unsigned flags) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + int len; + int val; + + if (ip6_mroute_opt(optname)) + return ip6_mroute_getsockopt(sk, optname, optval, optlen); + + if (get_user(len, optlen)) + return -EFAULT; + switch (optname) { + case IPV6_ADDRFORM: + if (sk->sk_protocol != IPPROTO_UDP && + sk->sk_protocol != IPPROTO_UDPLITE && + sk->sk_protocol != IPPROTO_TCP) + return -ENOPROTOOPT; + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + val = sk->sk_family; + break; + case MCAST_MSFILTER: + { + struct group_filter gsf; + int err; + + if (len < GROUP_FILTER_SIZE(0)) + return -EINVAL; + if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) + return -EFAULT; + if (gsf.gf_group.ss_family != 
AF_INET6) + return -EADDRNOTAVAIL; + lock_sock(sk); + err = ip6_mc_msfget(sk, &gsf, + (struct group_filter __user *)optval, optlen); + release_sock(sk); + return err; + } + + case IPV6_2292PKTOPTIONS: + { + struct msghdr msg; + struct sk_buff *skb; + + if (sk->sk_type != SOCK_STREAM) + return -ENOPROTOOPT; + + msg.msg_control = optval; + msg.msg_controllen = len; + msg.msg_flags = flags; + + lock_sock(sk); + skb = np->pktoptions; + if (skb) + atomic_inc(&skb->users); + release_sock(sk); + + if (skb) { + int err = datagram_recv_ctl(sk, &msg, skb); + kfree_skb(skb); + if (err) + return err; + } else { + if (np->rxopt.bits.rxinfo) { + struct in6_pktinfo src_info; + src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif : + np->sticky_pktinfo.ipi6_ifindex; + np->mcast_oif? ipv6_addr_copy(&src_info.ipi6_addr, &np->daddr) : + ipv6_addr_copy(&src_info.ipi6_addr, &(np->sticky_pktinfo.ipi6_addr)); + put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); + } + if (np->rxopt.bits.rxhlim) { + int hlim = np->mcast_hops; + put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); + } + if (np->rxopt.bits.rxoinfo) { + struct in6_pktinfo src_info; + src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif : + np->sticky_pktinfo.ipi6_ifindex; + np->mcast_oif? ipv6_addr_copy(&src_info.ipi6_addr, &np->daddr) : + ipv6_addr_copy(&src_info.ipi6_addr, &(np->sticky_pktinfo.ipi6_addr)); + put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); + } + if (np->rxopt.bits.rxohlim) { + int hlim = np->mcast_hops; + put_cmsg(&msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); + } + } + len -= msg.msg_controllen; + return put_user(len, optlen); + } + case IPV6_MTU: + { + struct dst_entry *dst; + + val = 0; + rcu_read_lock(); + dst = __sk_dst_get(sk); + if (dst) + val = dst_mtu(dst); + rcu_read_unlock(); + if (!val) + return -ENOTCONN; + break; + } + + case IPV6_V6ONLY: + val = np->ipv6only; + break; + + case IPV6_RECVPKTINFO: + val = np->rxopt.bits.rxinfo; + break; + + case IPV6_2292PKTINFO: + val = np->rxopt.bits.rxoinfo; + break; + + case IPV6_RECVHOPLIMIT: + val = np->rxopt.bits.rxhlim; + break; + + case IPV6_2292HOPLIMIT: + val = np->rxopt.bits.rxohlim; + break; + + case IPV6_RECVRTHDR: + val = np->rxopt.bits.srcrt; + break; + + case IPV6_2292RTHDR: + val = np->rxopt.bits.osrcrt; + break; + + case IPV6_HOPOPTS: + case IPV6_RTHDRDSTOPTS: + case IPV6_RTHDR: + case IPV6_DSTOPTS: + { + + lock_sock(sk); + len = ipv6_getsockopt_sticky(sk, np->opt, + optname, optval, len); + release_sock(sk); + /* check if ipv6_getsockopt_sticky() returns err code */ + if (len < 0) + return len; + return put_user(len, optlen); + } + + case IPV6_RECVHOPOPTS: + val = np->rxopt.bits.hopopts; + break; + + case IPV6_2292HOPOPTS: + val = np->rxopt.bits.ohopopts; + break; + + case IPV6_RECVDSTOPTS: + val = np->rxopt.bits.dstopts; + break; + + case IPV6_2292DSTOPTS: + val = np->rxopt.bits.odstopts; + break; + + case IPV6_TCLASS: + val = np->tclass; + break; + + case IPV6_RECVTCLASS: + val = np->rxopt.bits.rxtclass; + break; + + case IPV6_FLOWINFO: + val = np->rxopt.bits.rxflow; + break; + + case IPV6_RECVPATHMTU: + val = np->rxopt.bits.rxpmtu; + break; + + case IPV6_PATHMTU: + { + struct dst_entry *dst; + struct ip6_mtuinfo mtuinfo; + + if (len < sizeof(mtuinfo)) + return -EINVAL; + + len = sizeof(mtuinfo); + memset(&mtuinfo, 0, sizeof(mtuinfo)); + + rcu_read_lock(); + dst = __sk_dst_get(sk); + if (dst) + mtuinfo.ip6m_mtu = dst_mtu(dst); + rcu_read_unlock(); + if (!mtuinfo.ip6m_mtu) + return -ENOTCONN; + + if 
(put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &mtuinfo, len)) + return -EFAULT; + + return 0; + break; + } + + case IPV6_TRANSPARENT: + val = inet_sk(sk)->transparent; + break; + + case IPV6_RECVORIGDSTADDR: + val = np->rxopt.bits.rxorigdstaddr; + break; + + case IPV6_UNICAST_HOPS: + case IPV6_MULTICAST_HOPS: + { + struct dst_entry *dst; + + if (optname == IPV6_UNICAST_HOPS) + val = np->hop_limit; + else + val = np->mcast_hops; + + if (val < 0) { + rcu_read_lock(); + dst = __sk_dst_get(sk); + if (dst) + val = ip6_dst_hoplimit(dst); + rcu_read_unlock(); + } + + if (val < 0) + val = sock_net(sk)->ipv6.devconf_all->hop_limit; + break; + } + + case IPV6_MULTICAST_LOOP: + val = np->mc_loop; + break; + + case IPV6_MULTICAST_IF: + val = np->mcast_oif; + break; + + case IPV6_MTU_DISCOVER: + val = np->pmtudisc; + break; + + case IPV6_RECVERR: + val = np->recverr; + break; + + case IPV6_FLOWINFO_SEND: + val = np->sndflow; + break; + + case IPV6_ADDR_PREFERENCES: + val = 0; + + if (np->srcprefs & IPV6_PREFER_SRC_TMP) + val |= IPV6_PREFER_SRC_TMP; + else if (np->srcprefs & IPV6_PREFER_SRC_PUBLIC) + val |= IPV6_PREFER_SRC_PUBLIC; + else { + /* XXX: should we return system default? */ + val |= IPV6_PREFER_SRC_PUBTMP_DEFAULT; + } + + if (np->srcprefs & IPV6_PREFER_SRC_COA) + val |= IPV6_PREFER_SRC_COA; + else + val |= IPV6_PREFER_SRC_HOME; + break; + + case IPV6_MINHOPCOUNT: + val = np->min_hopcount; + break; + + case IPV6_DONTFRAG: + val = np->dontfrag; + break; + + default: + return -ENOPROTOOPT; + } + len = min_t(unsigned int, sizeof(int), len); + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval,&val,len)) + return -EFAULT; + return 0; +} + +int ipv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + int err; + + if (level == SOL_IP && sk->sk_type != SOCK_RAW) + return udp_prot.getsockopt(sk, level, optname, optval, optlen); + + if(level != SOL_IPV6) + return -ENOPROTOOPT; + + err = do_ipv6_getsockopt(sk, level, optname, optval, optlen, 0); +#ifdef CONFIG_NETFILTER + /* we need to exclude all possible ENOPROTOOPTs except default case */ + if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) { + int len; + + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + err = nf_getsockopt(sk, PF_INET6, optname, optval, + &len); + release_sock(sk); + if (err >= 0) + err = put_user(len, optlen); + } +#endif + return err; +} + +EXPORT_SYMBOL(ipv6_getsockopt); + +#ifdef CONFIG_COMPAT +int compat_ipv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + int err; + + if (level == SOL_IP && sk->sk_type != SOCK_RAW) { + if (udp_prot.compat_getsockopt != NULL) + return udp_prot.compat_getsockopt(sk, level, optname, + optval, optlen); + return udp_prot.getsockopt(sk, level, optname, optval, optlen); + } + + if (level != SOL_IPV6) + return -ENOPROTOOPT; + + if (optname == MCAST_MSFILTER) + return compat_mc_getsockopt(sk, level, optname, optval, optlen, + ipv6_getsockopt); + + err = do_ipv6_getsockopt(sk, level, optname, optval, optlen, + MSG_CMSG_COMPAT); +#ifdef CONFIG_NETFILTER + /* we need to exclude all possible ENOPROTOOPTs except default case */ + if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) { + int len; + + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + err = compat_nf_getsockopt(sk, PF_INET6, + optname, optval, &len); + release_sock(sk); + if (err >= 0) + err = put_user(len, optlen); + } +#endif + return err; +} + 
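For reference, the setsockopt/getsockopt entry points above pair scalar options through the common int copy-out at the bottom of do_ipv6_getsockopt(). A small, hypothetical userspace round trip through the IPV6_UNICAST_HOPS cases could look like this:

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
        int fd = socket(AF_INET6, SOCK_DGRAM, 0);
        int hops = 32;            /* valid range is -1..255 per the check above */
        int got = 0;
        socklen_t len = sizeof(got);

        if (fd < 0)
                return 1;
        /* handled by the IPV6_UNICAST_HOPS case in do_ipv6_setsockopt() */
        if (setsockopt(fd, IPPROTO_IPV6, IPV6_UNICAST_HOPS, &hops, sizeof(hops)) < 0)
                perror("setsockopt");
        /* read back through the IPV6_UNICAST_HOPS case in do_ipv6_getsockopt() */
        if (getsockopt(fd, IPPROTO_IPV6, IPV6_UNICAST_HOPS, &got, &len) < 0)
                perror("getsockopt");
        printf("hop limit: %d\n", got);
        return 0;
}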
+EXPORT_SYMBOL(compat_ipv6_getsockopt); +#endif + diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c new file mode 100644 index 00000000..f2d74ea1 --- /dev/null +++ b/net/ipv6/mcast.c @@ -0,0 +1,2678 @@ +/* + * Multicast support for IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * + * Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* Changes: + * + * yoshfuji : fix format of router-alert option + * YOSHIFUJI Hideaki @USAGI: + * Fixed source address for MLD message based on + * . + * YOSHIFUJI Hideaki @USAGI: + * - Ignore Queries for invalid addresses. + * - MLD for link-local addresses. + * David L Stevens : + * - MLDv2 support + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +/* Set to 3 to get tracing... */ +#define MCAST_DEBUG 2 + +#if MCAST_DEBUG >= 3 +#define MDBG(x) printk x +#else +#define MDBG(x) +#endif + +/* Ensure that we have struct in6_addr aligned on 32bit word. */ +static void *__mld2_query_bugs[] __attribute__((__unused__)) = { + BUILD_BUG_ON_NULL(offsetof(struct mld2_query, mld2q_srcs) % 4), + BUILD_BUG_ON_NULL(offsetof(struct mld2_report, mld2r_grec) % 4), + BUILD_BUG_ON_NULL(offsetof(struct mld2_grec, grec_mca) % 4) +}; + +static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT; + +/* Big mc list lock for all the sockets */ +static DEFINE_SPINLOCK(ipv6_sk_mc_lock); + +static void igmp6_join_group(struct ifmcaddr6 *ma); +static void igmp6_leave_group(struct ifmcaddr6 *ma); +static void igmp6_timer_handler(unsigned long data); + +static void mld_gq_timer_expire(unsigned long data); +static void mld_ifc_timer_expire(unsigned long data); +static void mld_ifc_event(struct inet6_dev *idev); +static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc); +static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *addr); +static void mld_clear_delrec(struct inet6_dev *idev); +static int sf_setstate(struct ifmcaddr6 *pmc); +static void sf_markstate(struct ifmcaddr6 *pmc); +static void ip6_mc_clear_src(struct ifmcaddr6 *pmc); +static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, + int sfmode, int sfcount, const struct in6_addr *psfsrc, + int delta); +static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, + int sfmode, int sfcount, const struct in6_addr *psfsrc, + int delta); +static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, + struct inet6_dev *idev); + + +#define IGMP6_UNSOLICITED_IVAL (10*HZ) +#define MLD_QRV_DEFAULT 2 + +#define MLD_V1_SEEN(idev) (dev_net((idev)->dev)->ipv6.devconf_all->force_mld_version == 1 || \ + (idev)->cnf.force_mld_version == 1 || \ + ((idev)->mc_v1_seen && \ + time_before(jiffies, (idev)->mc_v1_seen))) + +#define IPV6_MLD_MAX_MSF 64 + +int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF; + +/* + * socket join on multicast group + */ + +#define for_each_pmc_rcu(np, pmc) \ + for (pmc = rcu_dereference(np->ipv6_mc_list); \ + pmc != NULL; \ + pmc = rcu_dereference(pmc->next)) + 
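ipv6_sock_mc_join() below is the kernel half of the standard IPV6_JOIN_GROUP (a.k.a. IPV6_ADD_MEMBERSHIP) socket option handled in do_ipv6_setsockopt(). A minimal userspace counterpart, with placeholder group address and interface name, might be:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <net/if.h>

int main(void)
{
        int fd = socket(AF_INET6, SOCK_DGRAM, 0);
        struct ipv6_mreq mreq;

        memset(&mreq, 0, sizeof(mreq));
        inet_pton(AF_INET6, "ff02::1:3", &mreq.ipv6mr_multiaddr); /* example group */
        /* placeholder; 0 would let the kernel pick the device via a route lookup */
        mreq.ipv6mr_interface = if_nametoindex("eth0");

        /* ends up in ipv6_sock_mc_join() via the IPV6_ADD_MEMBERSHIP case */
        if (fd < 0 || setsockopt(fd, IPPROTO_IPV6, IPV6_JOIN_GROUP,
                                 &mreq, sizeof(mreq)) < 0)
                perror("IPV6_JOIN_GROUP");
        return 0;
}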
+int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) +{ + struct net_device *dev = NULL; + struct ipv6_mc_socklist *mc_lst; + struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); + int err; + + if (!ipv6_addr_is_multicast(addr)) + return -EINVAL; + + rcu_read_lock(); + for_each_pmc_rcu(np, mc_lst) { + if ((ifindex == 0 || mc_lst->ifindex == ifindex) && + ipv6_addr_equal(&mc_lst->addr, addr)) { + rcu_read_unlock(); + return -EADDRINUSE; + } + } + rcu_read_unlock(); + + mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL); + + if (mc_lst == NULL) + return -ENOMEM; + + mc_lst->next = NULL; + ipv6_addr_copy(&mc_lst->addr, addr); + + rcu_read_lock(); + if (ifindex == 0) { + struct rt6_info *rt; + rt = rt6_lookup(net, addr, NULL, 0, 0); + if (rt) { + dev = rt->rt6i_dev; + dst_release(&rt->dst); + } + } else + dev = dev_get_by_index_rcu(net, ifindex); + + if (dev == NULL) { + rcu_read_unlock(); + sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); + return -ENODEV; + } + + mc_lst->ifindex = dev->ifindex; + mc_lst->sfmode = MCAST_EXCLUDE; + rwlock_init(&mc_lst->sflock); + mc_lst->sflist = NULL; + + /* + * now add/increase the group membership on the device + */ + + err = ipv6_dev_mc_inc(dev, addr); + + if (err) { + rcu_read_unlock(); + sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); + return err; + } + + spin_lock(&ipv6_sk_mc_lock); + mc_lst->next = np->ipv6_mc_list; + rcu_assign_pointer(np->ipv6_mc_list, mc_lst); + spin_unlock(&ipv6_sk_mc_lock); + + rcu_read_unlock(); + + return 0; +} + +/* + * socket leave on multicast group + */ +int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_mc_socklist *mc_lst; + struct ipv6_mc_socklist __rcu **lnk; + struct net *net = sock_net(sk); + + spin_lock(&ipv6_sk_mc_lock); + for (lnk = &np->ipv6_mc_list; + (mc_lst = rcu_dereference_protected(*lnk, + lockdep_is_held(&ipv6_sk_mc_lock))) !=NULL ; + lnk = &mc_lst->next) { + if ((ifindex == 0 || mc_lst->ifindex == ifindex) && + ipv6_addr_equal(&mc_lst->addr, addr)) { + struct net_device *dev; + + *lnk = mc_lst->next; + spin_unlock(&ipv6_sk_mc_lock); + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, mc_lst->ifindex); + if (dev != NULL) { + struct inet6_dev *idev = __in6_dev_get(dev); + + (void) ip6_mc_leave_src(sk, mc_lst, idev); + if (idev) + __ipv6_dev_mc_dec(idev, &mc_lst->addr); + } else + (void) ip6_mc_leave_src(sk, mc_lst, NULL); + rcu_read_unlock(); + atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); + kfree_rcu(mc_lst, rcu); + return 0; + } + } + spin_unlock(&ipv6_sk_mc_lock); + + return -EADDRNOTAVAIL; +} + +/* called with rcu_read_lock() */ +static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net, + const struct in6_addr *group, + int ifindex) +{ + struct net_device *dev = NULL; + struct inet6_dev *idev = NULL; + + if (ifindex == 0) { + struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, 0); + + if (rt) { + dev = rt->rt6i_dev; + dst_release(&rt->dst); + } + } else + dev = dev_get_by_index_rcu(net, ifindex); + + if (!dev) + return NULL; + idev = __in6_dev_get(dev); + if (!idev) + return NULL; + read_lock_bh(&idev->lock); + if (idev->dead) { + read_unlock_bh(&idev->lock); + return NULL; + } + return idev; +} + +void ipv6_sock_mc_close(struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_mc_socklist *mc_lst; + struct net *net = sock_net(sk); + + spin_lock(&ipv6_sk_mc_lock); + while ((mc_lst = rcu_dereference_protected(np->ipv6_mc_list, + 
lockdep_is_held(&ipv6_sk_mc_lock))) != NULL) { + struct net_device *dev; + + np->ipv6_mc_list = mc_lst->next; + spin_unlock(&ipv6_sk_mc_lock); + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, mc_lst->ifindex); + if (dev) { + struct inet6_dev *idev = __in6_dev_get(dev); + + (void) ip6_mc_leave_src(sk, mc_lst, idev); + if (idev) + __ipv6_dev_mc_dec(idev, &mc_lst->addr); + } else + (void) ip6_mc_leave_src(sk, mc_lst, NULL); + rcu_read_unlock(); + + atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); + kfree_rcu(mc_lst, rcu); + + spin_lock(&ipv6_sk_mc_lock); + } + spin_unlock(&ipv6_sk_mc_lock); +} + +int ip6_mc_source(int add, int omode, struct sock *sk, + struct group_source_req *pgsr) +{ + struct in6_addr *source, *group; + struct ipv6_mc_socklist *pmc; + struct inet6_dev *idev; + struct ipv6_pinfo *inet6 = inet6_sk(sk); + struct ip6_sf_socklist *psl; + struct net *net = sock_net(sk); + int i, j, rv; + int leavegroup = 0; + int pmclocked = 0; + int err; + + source = &((struct sockaddr_in6 *)&pgsr->gsr_source)->sin6_addr; + group = &((struct sockaddr_in6 *)&pgsr->gsr_group)->sin6_addr; + + if (!ipv6_addr_is_multicast(group)) + return -EINVAL; + + rcu_read_lock(); + idev = ip6_mc_find_dev_rcu(net, group, pgsr->gsr_interface); + if (!idev) { + rcu_read_unlock(); + return -ENODEV; + } + + err = -EADDRNOTAVAIL; + + for_each_pmc_rcu(inet6, pmc) { + if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface) + continue; + if (ipv6_addr_equal(&pmc->addr, group)) + break; + } + if (!pmc) { /* must have a prior join */ + err = -EINVAL; + goto done; + } + /* if a source filter was set, must be the same mode as before */ + if (pmc->sflist) { + if (pmc->sfmode != omode) { + err = -EINVAL; + goto done; + } + } else if (pmc->sfmode != omode) { + /* allow mode switches for empty-set filters */ + ip6_mc_add_src(idev, group, omode, 0, NULL, 0); + ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); + pmc->sfmode = omode; + } + + write_lock(&pmc->sflock); + pmclocked = 1; + + psl = pmc->sflist; + if (!add) { + if (!psl) + goto done; /* err = -EADDRNOTAVAIL */ + rv = !0; + for (i=0; isl_count; i++) { + rv = memcmp(&psl->sl_addr[i], source, + sizeof(struct in6_addr)); + if (rv == 0) + break; + } + if (rv) /* source not found */ + goto done; /* err = -EADDRNOTAVAIL */ + + /* special case - (INCLUDE, empty) == LEAVE_GROUP */ + if (psl->sl_count == 1 && omode == MCAST_INCLUDE) { + leavegroup = 1; + goto done; + } + + /* update the interface filter */ + ip6_mc_del_src(idev, group, omode, 1, source, 1); + + for (j=i+1; jsl_count; j++) + psl->sl_addr[j-1] = psl->sl_addr[j]; + psl->sl_count--; + err = 0; + goto done; + } + /* else, add a new source to the filter */ + + if (psl && psl->sl_count >= sysctl_mld_max_msf) { + err = -ENOBUFS; + goto done; + } + if (!psl || psl->sl_count == psl->sl_max) { + struct ip6_sf_socklist *newpsl; + int count = IP6_SFBLOCK; + + if (psl) + count += psl->sl_max; + newpsl = sock_kmalloc(sk, IP6_SFLSIZE(count), GFP_ATOMIC); + if (!newpsl) { + err = -ENOBUFS; + goto done; + } + newpsl->sl_max = count; + newpsl->sl_count = count - IP6_SFBLOCK; + if (psl) { + for (i=0; isl_count; i++) + newpsl->sl_addr[i] = psl->sl_addr[i]; + sock_kfree_s(sk, psl, IP6_SFLSIZE(psl->sl_max)); + } + pmc->sflist = psl = newpsl; + } + rv = 1; /* > 0 for insert logic below if sl_count is 0 */ + for (i=0; isl_count; i++) { + rv = memcmp(&psl->sl_addr[i], source, sizeof(struct in6_addr)); + if (rv == 0) + break; + } + if (rv == 0) /* address already there is an error */ + goto done; + for (j=psl->sl_count-1; 
j>=i; j--) + psl->sl_addr[j+1] = psl->sl_addr[j]; + psl->sl_addr[i] = *source; + psl->sl_count++; + err = 0; + /* update the interface list */ + ip6_mc_add_src(idev, group, omode, 1, source, 1); +done: + if (pmclocked) + write_unlock(&pmc->sflock); + read_unlock_bh(&idev->lock); + rcu_read_unlock(); + if (leavegroup) + return ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group); + return err; +} + +int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) +{ + const struct in6_addr *group; + struct ipv6_mc_socklist *pmc; + struct inet6_dev *idev; + struct ipv6_pinfo *inet6 = inet6_sk(sk); + struct ip6_sf_socklist *newpsl, *psl; + struct net *net = sock_net(sk); + int leavegroup = 0; + int i, err; + + group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; + + if (!ipv6_addr_is_multicast(group)) + return -EINVAL; + if (gsf->gf_fmode != MCAST_INCLUDE && + gsf->gf_fmode != MCAST_EXCLUDE) + return -EINVAL; + + rcu_read_lock(); + idev = ip6_mc_find_dev_rcu(net, group, gsf->gf_interface); + + if (!idev) { + rcu_read_unlock(); + return -ENODEV; + } + + err = 0; + + if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) { + leavegroup = 1; + goto done; + } + + for_each_pmc_rcu(inet6, pmc) { + if (pmc->ifindex != gsf->gf_interface) + continue; + if (ipv6_addr_equal(&pmc->addr, group)) + break; + } + if (!pmc) { /* must have a prior join */ + err = -EINVAL; + goto done; + } + if (gsf->gf_numsrc) { + newpsl = sock_kmalloc(sk, IP6_SFLSIZE(gsf->gf_numsrc), + GFP_ATOMIC); + if (!newpsl) { + err = -ENOBUFS; + goto done; + } + newpsl->sl_max = newpsl->sl_count = gsf->gf_numsrc; + for (i=0; isl_count; ++i) { + struct sockaddr_in6 *psin6; + + psin6 = (struct sockaddr_in6 *)&gsf->gf_slist[i]; + newpsl->sl_addr[i] = psin6->sin6_addr; + } + err = ip6_mc_add_src(idev, group, gsf->gf_fmode, + newpsl->sl_count, newpsl->sl_addr, 0); + if (err) { + sock_kfree_s(sk, newpsl, IP6_SFLSIZE(newpsl->sl_max)); + goto done; + } + } else { + newpsl = NULL; + (void) ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0); + } + + write_lock(&pmc->sflock); + psl = pmc->sflist; + if (psl) { + (void) ip6_mc_del_src(idev, group, pmc->sfmode, + psl->sl_count, psl->sl_addr, 0); + sock_kfree_s(sk, psl, IP6_SFLSIZE(psl->sl_max)); + } else + (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); + pmc->sflist = newpsl; + pmc->sfmode = gsf->gf_fmode; + write_unlock(&pmc->sflock); + err = 0; +done: + read_unlock_bh(&idev->lock); + rcu_read_unlock(); + if (leavegroup) + err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group); + return err; +} + +int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, + struct group_filter __user *optval, int __user *optlen) +{ + int err, i, count, copycount; + const struct in6_addr *group; + struct ipv6_mc_socklist *pmc; + struct inet6_dev *idev; + struct ipv6_pinfo *inet6 = inet6_sk(sk); + struct ip6_sf_socklist *psl; + struct net *net = sock_net(sk); + + group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; + + if (!ipv6_addr_is_multicast(group)) + return -EINVAL; + + rcu_read_lock(); + idev = ip6_mc_find_dev_rcu(net, group, gsf->gf_interface); + + if (!idev) { + rcu_read_unlock(); + return -ENODEV; + } + + err = -EADDRNOTAVAIL; + /* + * changes to the ipv6_mc_list require the socket lock and + * a read lock on ip6_sk_mc_lock. We have the socket lock, + * so reading the list is safe. 
+ */ + + for_each_pmc_rcu(inet6, pmc) { + if (pmc->ifindex != gsf->gf_interface) + continue; + if (ipv6_addr_equal(group, &pmc->addr)) + break; + } + if (!pmc) /* must have a prior join */ + goto done; + gsf->gf_fmode = pmc->sfmode; + psl = pmc->sflist; + count = psl ? psl->sl_count : 0; + read_unlock_bh(&idev->lock); + rcu_read_unlock(); + + copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; + gsf->gf_numsrc = count; + if (put_user(GROUP_FILTER_SIZE(copycount), optlen) || + copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { + return -EFAULT; + } + /* changes to psl require the socket lock, a read lock on + * on ipv6_sk_mc_lock and a write lock on pmc->sflock. We + * have the socket lock, so reading here is safe. + */ + for (i=0; isin6_family = AF_INET6; + psin6->sin6_addr = psl->sl_addr[i]; + if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss))) + return -EFAULT; + } + return 0; +done: + read_unlock_bh(&idev->lock); + rcu_read_unlock(); + return err; +} + +int inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, + const struct in6_addr *src_addr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_mc_socklist *mc; + struct ip6_sf_socklist *psl; + int rv = 1; + + rcu_read_lock(); + for_each_pmc_rcu(np, mc) { + if (ipv6_addr_equal(&mc->addr, mc_addr)) + break; + } + if (!mc) { + rcu_read_unlock(); + return 1; + } + read_lock(&mc->sflock); + psl = mc->sflist; + if (!psl) { + rv = mc->sfmode == MCAST_EXCLUDE; + } else { + int i; + + for (i=0; isl_count; i++) { + if (ipv6_addr_equal(&psl->sl_addr[i], src_addr)) + break; + } + if (mc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) + rv = 0; + if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) + rv = 0; + } + read_unlock(&mc->sflock); + rcu_read_unlock(); + + return rv; +} + +static void ma_put(struct ifmcaddr6 *mc) +{ + if (atomic_dec_and_test(&mc->mca_refcnt)) { + in6_dev_put(mc->idev); + kfree(mc); + } +} + +static void igmp6_group_added(struct ifmcaddr6 *mc) +{ + struct net_device *dev = mc->idev->dev; + char buf[MAX_ADDR_LEN]; + + spin_lock_bh(&mc->mca_lock); + if (!(mc->mca_flags&MAF_LOADED)) { + mc->mca_flags |= MAF_LOADED; + if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) + dev_mc_add(dev, buf); + } + spin_unlock_bh(&mc->mca_lock); + + if (!(dev->flags & IFF_UP) || (mc->mca_flags & MAF_NOREPORT)) + return; + + if (MLD_V1_SEEN(mc->idev)) { + igmp6_join_group(mc); + return; + } + /* else v2 */ + + mc->mca_crcount = mc->idev->mc_qrv; + mld_ifc_event(mc->idev); +} + +static void igmp6_group_dropped(struct ifmcaddr6 *mc) +{ + struct net_device *dev = mc->idev->dev; + char buf[MAX_ADDR_LEN]; + + spin_lock_bh(&mc->mca_lock); + if (mc->mca_flags&MAF_LOADED) { + mc->mca_flags &= ~MAF_LOADED; + if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) + dev_mc_del(dev, buf); + } + + if (mc->mca_flags & MAF_NOREPORT) + goto done; + spin_unlock_bh(&mc->mca_lock); + + if (!mc->idev->dead) + igmp6_leave_group(mc); + + spin_lock_bh(&mc->mca_lock); + if (del_timer(&mc->mca_timer)) + atomic_dec(&mc->mca_refcnt); +done: + ip6_mc_clear_src(mc); + spin_unlock_bh(&mc->mca_lock); +} + +/* + * deleted ifmcaddr6 manipulation + */ +static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) +{ + struct ifmcaddr6 *pmc; + + /* this is an "ifmcaddr6" for convenience; only the fields below + * are actually used. In particular, the refcnt and users are not + * used for management of the delete list. 
Using the same structure + * for deleted items allows change reports to use common code with + * non-deleted or query-response MCA's. + */ + pmc = kzalloc(sizeof(*pmc), GFP_ATOMIC); + if (!pmc) + return; + + spin_lock_bh(&im->mca_lock); + spin_lock_init(&pmc->mca_lock); + pmc->idev = im->idev; + in6_dev_hold(idev); + pmc->mca_addr = im->mca_addr; + pmc->mca_crcount = idev->mc_qrv; + pmc->mca_sfmode = im->mca_sfmode; + if (pmc->mca_sfmode == MCAST_INCLUDE) { + struct ip6_sf_list *psf; + + pmc->mca_tomb = im->mca_tomb; + pmc->mca_sources = im->mca_sources; + im->mca_tomb = im->mca_sources = NULL; + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) + psf->sf_crcount = pmc->mca_crcount; + } + spin_unlock_bh(&im->mca_lock); + + spin_lock_bh(&idev->mc_lock); + pmc->next = idev->mc_tomb; + idev->mc_tomb = pmc; + spin_unlock_bh(&idev->mc_lock); +} + +static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *pmca) +{ + struct ifmcaddr6 *pmc, *pmc_prev; + struct ip6_sf_list *psf, *psf_next; + + spin_lock_bh(&idev->mc_lock); + pmc_prev = NULL; + for (pmc=idev->mc_tomb; pmc; pmc=pmc->next) { + if (ipv6_addr_equal(&pmc->mca_addr, pmca)) + break; + pmc_prev = pmc; + } + if (pmc) { + if (pmc_prev) + pmc_prev->next = pmc->next; + else + idev->mc_tomb = pmc->next; + } + spin_unlock_bh(&idev->mc_lock); + + if (pmc) { + for (psf=pmc->mca_tomb; psf; psf=psf_next) { + psf_next = psf->sf_next; + kfree(psf); + } + in6_dev_put(pmc->idev); + kfree(pmc); + } +} + +static void mld_clear_delrec(struct inet6_dev *idev) +{ + struct ifmcaddr6 *pmc, *nextpmc; + + spin_lock_bh(&idev->mc_lock); + pmc = idev->mc_tomb; + idev->mc_tomb = NULL; + spin_unlock_bh(&idev->mc_lock); + + for (; pmc; pmc = nextpmc) { + nextpmc = pmc->next; + ip6_mc_clear_src(pmc); + in6_dev_put(pmc->idev); + kfree(pmc); + } + + /* clear dead sources, too */ + read_lock_bh(&idev->lock); + for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + struct ip6_sf_list *psf, *psf_next; + + spin_lock_bh(&pmc->mca_lock); + psf = pmc->mca_tomb; + pmc->mca_tomb = NULL; + spin_unlock_bh(&pmc->mca_lock); + for (; psf; psf=psf_next) { + psf_next = psf->sf_next; + kfree(psf); + } + } + read_unlock_bh(&idev->lock); +} + + +/* + * device multicast group inc (add if not found) + */ +int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) +{ + struct ifmcaddr6 *mc; + struct inet6_dev *idev; + + /* we need to take a reference on idev */ + idev = in6_dev_get(dev); + + if (idev == NULL) + return -EINVAL; + + write_lock_bh(&idev->lock); + if (idev->dead) { + write_unlock_bh(&idev->lock); + in6_dev_put(idev); + return -ENODEV; + } + + for (mc = idev->mc_list; mc; mc = mc->next) { + if (ipv6_addr_equal(&mc->mca_addr, addr)) { + mc->mca_users++; + write_unlock_bh(&idev->lock); + ip6_mc_add_src(idev, &mc->mca_addr, MCAST_EXCLUDE, 0, + NULL, 0); + in6_dev_put(idev); + return 0; + } + } + + /* + * not found: create a new one. 
+ */ + + mc = kzalloc(sizeof(struct ifmcaddr6), GFP_ATOMIC); + + if (mc == NULL) { + write_unlock_bh(&idev->lock); + in6_dev_put(idev); + return -ENOMEM; + } + + setup_timer(&mc->mca_timer, igmp6_timer_handler, (unsigned long)mc); + + ipv6_addr_copy(&mc->mca_addr, addr); + mc->idev = idev; /* (reference taken) */ + mc->mca_users = 1; + /* mca_stamp should be updated upon changes */ + mc->mca_cstamp = mc->mca_tstamp = jiffies; + atomic_set(&mc->mca_refcnt, 2); + spin_lock_init(&mc->mca_lock); + + /* initial mode is (EX, empty) */ + mc->mca_sfmode = MCAST_EXCLUDE; + mc->mca_sfcount[MCAST_EXCLUDE] = 1; + + if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) || + IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) + mc->mca_flags |= MAF_NOREPORT; + + mc->next = idev->mc_list; + idev->mc_list = mc; + write_unlock_bh(&idev->lock); + + mld_del_delrec(idev, &mc->mca_addr); + igmp6_group_added(mc); + ma_put(mc); + return 0; +} + +/* + * device multicast group del + */ +int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) +{ + struct ifmcaddr6 *ma, **map; + + write_lock_bh(&idev->lock); + for (map = &idev->mc_list; (ma=*map) != NULL; map = &ma->next) { + if (ipv6_addr_equal(&ma->mca_addr, addr)) { + if (--ma->mca_users == 0) { + *map = ma->next; + write_unlock_bh(&idev->lock); + + igmp6_group_dropped(ma); + + ma_put(ma); + return 0; + } + write_unlock_bh(&idev->lock); + return 0; + } + } + write_unlock_bh(&idev->lock); + + return -ENOENT; +} + +int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr) +{ + struct inet6_dev *idev; + int err; + + rcu_read_lock(); + + idev = __in6_dev_get(dev); + if (!idev) + err = -ENODEV; + else + err = __ipv6_dev_mc_dec(idev, addr); + + rcu_read_unlock(); + return err; +} + +/* + * identify MLD packets for MLD filter exceptions + */ +int ipv6_is_mld(struct sk_buff *skb, int nexthdr) +{ + struct icmp6hdr *pic; + + if (nexthdr != IPPROTO_ICMPV6) + return 0; + + if (!pskb_may_pull(skb, sizeof(struct icmp6hdr))) + return 0; + + pic = icmp6_hdr(skb); + + switch (pic->icmp6_type) { + case ICMPV6_MGM_QUERY: + case ICMPV6_MGM_REPORT: + case ICMPV6_MGM_REDUCTION: + case ICMPV6_MLD2_REPORT: + return 1; + default: + break; + } + return 0; +} + +/* + * check if the interface/address pair is valid + */ +int ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, + const struct in6_addr *src_addr) +{ + struct inet6_dev *idev; + struct ifmcaddr6 *mc; + int rv = 0; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + for (mc = idev->mc_list; mc; mc=mc->next) { + if (ipv6_addr_equal(&mc->mca_addr, group)) + break; + } + if (mc) { + if (src_addr && !ipv6_addr_any(src_addr)) { + struct ip6_sf_list *psf; + + spin_lock_bh(&mc->mca_lock); + for (psf=mc->mca_sources;psf;psf=psf->sf_next) { + if (ipv6_addr_equal(&psf->sf_addr, src_addr)) + break; + } + if (psf) + rv = psf->sf_count[MCAST_INCLUDE] || + psf->sf_count[MCAST_EXCLUDE] != + mc->mca_sfcount[MCAST_EXCLUDE]; + else + rv = mc->mca_sfcount[MCAST_EXCLUDE] !=0; + spin_unlock_bh(&mc->mca_lock); + } else + rv = 1; /* don't filter unspecified source */ + } + read_unlock_bh(&idev->lock); + } + rcu_read_unlock(); + return rv; +} + +static void mld_gq_start_timer(struct inet6_dev *idev) +{ + int tv = net_random() % idev->mc_maxdelay; + + idev->mc_gq_running = 1; + if (!mod_timer(&idev->mc_gq_timer, jiffies+tv+2)) + in6_dev_hold(idev); +} + +static void mld_ifc_start_timer(struct inet6_dev *idev, int delay) +{ + int tv = net_random() % delay; 
+ + if (!mod_timer(&idev->mc_ifc_timer, jiffies+tv+2)) + in6_dev_hold(idev); +} + +/* + * IGMP handling (alias multicast ICMPv6 messages) + */ + +static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) +{ + unsigned long delay = resptime; + + /* Do not start timer for these addresses */ + if (ipv6_addr_is_ll_all_nodes(&ma->mca_addr) || + IPV6_ADDR_MC_SCOPE(&ma->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) + return; + + if (del_timer(&ma->mca_timer)) { + atomic_dec(&ma->mca_refcnt); + delay = ma->mca_timer.expires - jiffies; + } + + if (delay >= resptime) { + if (resptime) + delay = net_random() % resptime; + else + delay = 1; + } + ma->mca_timer.expires = jiffies + delay; + if (!mod_timer(&ma->mca_timer, jiffies + delay)) + atomic_inc(&ma->mca_refcnt); + ma->mca_flags |= MAF_TIMER_RUNNING; +} + +/* mark EXCLUDE-mode sources */ +static int mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, + const struct in6_addr *srcs) +{ + struct ip6_sf_list *psf; + int i, scount; + + scount = 0; + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + if (scount == nsrcs) + break; + for (i=0; isf_count[MCAST_INCLUDE] || + pmc->mca_sfcount[MCAST_EXCLUDE] != + psf->sf_count[MCAST_EXCLUDE]) + continue; + if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) { + scount++; + break; + } + } + } + pmc->mca_flags &= ~MAF_GSQUERY; + if (scount == nsrcs) /* all sources excluded */ + return 0; + return 1; +} + +static int mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, + const struct in6_addr *srcs) +{ + struct ip6_sf_list *psf; + int i, scount; + + if (pmc->mca_sfmode == MCAST_EXCLUDE) + return mld_xmarksources(pmc, nsrcs, srcs); + + /* mark INCLUDE-mode sources */ + + scount = 0; + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + if (scount == nsrcs) + break; + for (i=0; isf_addr)) { + psf->sf_gsresp = 1; + scount++; + break; + } + } + } + if (!scount) { + pmc->mca_flags &= ~MAF_GSQUERY; + return 0; + } + pmc->mca_flags |= MAF_GSQUERY; + return 1; +} + +/* called with rcu_read_lock() */ +int igmp6_event_query(struct sk_buff *skb) +{ + struct mld2_query *mlh2 = NULL; + struct ifmcaddr6 *ma; + const struct in6_addr *group; + unsigned long max_delay; + struct inet6_dev *idev; + struct mld_msg *mld; + int group_type; + int mark = 0; + int len; + + if (!pskb_may_pull(skb, sizeof(struct in6_addr))) + return -EINVAL; + + /* compute payload length excluding extension headers */ + len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); + len -= skb_network_header_len(skb); + + /* Drop queries with not link local source */ + if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) + return -EINVAL; + + idev = __in6_dev_get(skb->dev); + + if (idev == NULL) + return 0; + + mld = (struct mld_msg *)icmp6_hdr(skb); + group = &mld->mld_mca; + group_type = ipv6_addr_type(group); + + if (group_type != IPV6_ADDR_ANY && + !(group_type&IPV6_ADDR_MULTICAST)) + return -EINVAL; + + if (len == 24) { + int switchback; + /* MLDv1 router present */ + + /* Translate milliseconds to jiffies */ + max_delay = (ntohs(mld->mld_maxdelay)*HZ)/1000; + + switchback = (idev->mc_qrv + 1) * max_delay; + idev->mc_v1_seen = jiffies + switchback; + + /* cancel the interface change timer */ + idev->mc_ifc_count = 0; + if (del_timer(&idev->mc_ifc_timer)) + __in6_dev_put(idev); + /* clear deleted report items */ + mld_clear_delrec(idev); + } else if (len >= 28) { + int srcs_offset = sizeof(struct mld2_query) - + sizeof(struct icmp6hdr); + if (!pskb_may_pull(skb, srcs_offset)) + return -EINVAL; + + mlh2 = (struct mld2_query 
*)skb_transport_header(skb); + max_delay = (MLDV2_MRC(ntohs(mlh2->mld2q_mrc))*HZ)/1000; + if (!max_delay) + max_delay = 1; + idev->mc_maxdelay = max_delay; + if (mlh2->mld2q_qrv) + idev->mc_qrv = mlh2->mld2q_qrv; + if (group_type == IPV6_ADDR_ANY) { /* general query */ + if (mlh2->mld2q_nsrcs) + return -EINVAL; /* no sources allowed */ + + mld_gq_start_timer(idev); + return 0; + } + /* mark sources to include, if group & source-specific */ + if (mlh2->mld2q_nsrcs != 0) { + if (!pskb_may_pull(skb, srcs_offset + + ntohs(mlh2->mld2q_nsrcs) * sizeof(struct in6_addr))) + return -EINVAL; + + mlh2 = (struct mld2_query *)skb_transport_header(skb); + mark = 1; + } + } else + return -EINVAL; + + read_lock_bh(&idev->lock); + if (group_type == IPV6_ADDR_ANY) { + for (ma = idev->mc_list; ma; ma=ma->next) { + spin_lock_bh(&ma->mca_lock); + igmp6_group_queried(ma, max_delay); + spin_unlock_bh(&ma->mca_lock); + } + } else { + for (ma = idev->mc_list; ma; ma=ma->next) { + if (!ipv6_addr_equal(group, &ma->mca_addr)) + continue; + spin_lock_bh(&ma->mca_lock); + if (ma->mca_flags & MAF_TIMER_RUNNING) { + /* gsquery <- gsquery && mark */ + if (!mark) + ma->mca_flags &= ~MAF_GSQUERY; + } else { + /* gsquery <- mark */ + if (mark) + ma->mca_flags |= MAF_GSQUERY; + else + ma->mca_flags &= ~MAF_GSQUERY; + } + if (!(ma->mca_flags & MAF_GSQUERY) || + mld_marksources(ma, ntohs(mlh2->mld2q_nsrcs), mlh2->mld2q_srcs)) + igmp6_group_queried(ma, max_delay); + spin_unlock_bh(&ma->mca_lock); + break; + } + } + read_unlock_bh(&idev->lock); + + return 0; +} + +/* called with rcu_read_lock() */ +int igmp6_event_report(struct sk_buff *skb) +{ + struct ifmcaddr6 *ma; + struct inet6_dev *idev; + struct mld_msg *mld; + int addr_type; + + /* Our own report looped back. Ignore it. */ + if (skb->pkt_type == PACKET_LOOPBACK) + return 0; + + /* send our report if the MC router may not have heard this report */ + if (skb->pkt_type != PACKET_MULTICAST && + skb->pkt_type != PACKET_BROADCAST) + return 0; + + if (!pskb_may_pull(skb, sizeof(*mld) - sizeof(struct icmp6hdr))) + return -EINVAL; + + mld = (struct mld_msg *)icmp6_hdr(skb); + + /* Drop reports with not link local source */ + addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr); + if (addr_type != IPV6_ADDR_ANY && + !(addr_type&IPV6_ADDR_LINKLOCAL)) + return -EINVAL; + + idev = __in6_dev_get(skb->dev); + if (idev == NULL) + return -ENODEV; + + /* + * Cancel the timer for this group + */ + + read_lock_bh(&idev->lock); + for (ma = idev->mc_list; ma; ma=ma->next) { + if (ipv6_addr_equal(&ma->mca_addr, &mld->mld_mca)) { + spin_lock(&ma->mca_lock); + if (del_timer(&ma->mca_timer)) + atomic_dec(&ma->mca_refcnt); + ma->mca_flags &= ~(MAF_LAST_REPORTER|MAF_TIMER_RUNNING); + spin_unlock(&ma->mca_lock); + break; + } + } + read_unlock_bh(&idev->lock); + return 0; +} + +static int is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type, + int gdeleted, int sdeleted) +{ + switch (type) { + case MLD2_MODE_IS_INCLUDE: + case MLD2_MODE_IS_EXCLUDE: + if (gdeleted || sdeleted) + return 0; + if (!((pmc->mca_flags & MAF_GSQUERY) && !psf->sf_gsresp)) { + if (pmc->mca_sfmode == MCAST_INCLUDE) + return 1; + /* don't include if this source is excluded + * in all filters + */ + if (psf->sf_count[MCAST_INCLUDE]) + return type == MLD2_MODE_IS_INCLUDE; + return pmc->mca_sfcount[MCAST_EXCLUDE] == + psf->sf_count[MCAST_EXCLUDE]; + } + return 0; + case MLD2_CHANGE_TO_INCLUDE: + if (gdeleted || sdeleted) + return 0; + return psf->sf_count[MCAST_INCLUDE] != 0; + case MLD2_CHANGE_TO_EXCLUDE: + if (gdeleted || 
sdeleted) + return 0; + if (pmc->mca_sfcount[MCAST_EXCLUDE] == 0 || + psf->sf_count[MCAST_INCLUDE]) + return 0; + return pmc->mca_sfcount[MCAST_EXCLUDE] == + psf->sf_count[MCAST_EXCLUDE]; + case MLD2_ALLOW_NEW_SOURCES: + if (gdeleted || !psf->sf_crcount) + return 0; + return (pmc->mca_sfmode == MCAST_INCLUDE) ^ sdeleted; + case MLD2_BLOCK_OLD_SOURCES: + if (pmc->mca_sfmode == MCAST_INCLUDE) + return gdeleted || (psf->sf_crcount && sdeleted); + return psf->sf_crcount && !gdeleted && !sdeleted; + } + return 0; +} + +static int +mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted) +{ + struct ip6_sf_list *psf; + int scount = 0; + + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + if (!is_in(pmc, psf, type, gdeleted, sdeleted)) + continue; + scount++; + } + return scount; +} + +static struct sk_buff *mld_newpack(struct net_device *dev, int size) +{ + struct net *net = dev_net(dev); + struct sock *sk = net->ipv6.igmp_sk; + struct sk_buff *skb; + struct mld2_report *pmr; + struct in6_addr addr_buf; + const struct in6_addr *saddr; + int err; + u8 ra[8] = { IPPROTO_ICMPV6, 0, + IPV6_TLV_ROUTERALERT, 2, 0, 0, + IPV6_TLV_PADN, 0 }; + + /* we assume size > sizeof(ra) here */ + size += LL_ALLOCATED_SPACE(dev); + /* limit our allocations to order-0 page */ + size = min_t(int, size, SKB_MAX_ORDER(0, 0)); + skb = sock_alloc_send_skb(sk, size, 1, &err); + + if (!skb) + return NULL; + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + + if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { + /* : + * use unspecified address as the source address + * when a valid link-local address is not available. + */ + saddr = &in6addr_any; + } else + saddr = &addr_buf; + + ip6_nd_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0); + + memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra)); + + skb_set_transport_header(skb, skb_tail_pointer(skb) - skb->data); + skb_put(skb, sizeof(*pmr)); + pmr = (struct mld2_report *)skb_transport_header(skb); + pmr->mld2r_type = ICMPV6_MLD2_REPORT; + pmr->mld2r_resv1 = 0; + pmr->mld2r_cksum = 0; + pmr->mld2r_resv2 = 0; + pmr->mld2r_ngrec = 0; + return skb; +} + +static void mld_sendpack(struct sk_buff *skb) +{ + struct ipv6hdr *pip6 = ipv6_hdr(skb); + struct mld2_report *pmr = + (struct mld2_report *)skb_transport_header(skb); + int payload_len, mldlen; + struct inet6_dev *idev; + struct net *net = dev_net(skb->dev); + int err; + struct flowi6 fl6; + struct dst_entry *dst; + + rcu_read_lock(); + idev = __in6_dev_get(skb->dev); + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); + + payload_len = (skb->tail - skb->network_header) - sizeof(*pip6); + mldlen = skb->tail - skb->transport_header; + pip6->payload_len = htons(payload_len); + + pmr->mld2r_cksum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen, + IPPROTO_ICMPV6, + csum_partial(skb_transport_header(skb), + mldlen, 0)); + + dst = icmp6_dst_alloc(skb->dev, NULL, &ipv6_hdr(skb)->daddr); + + if (!dst) { + err = -ENOMEM; + goto err_out; + } + + icmpv6_flow_init(net->ipv6.igmp_sk, &fl6, ICMPV6_MLD2_REPORT, + &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, + skb->dev->ifindex); + + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + err = 0; + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; + } + skb_dst_set(skb, dst); + if (err) + goto err_out; + + payload_len = skb->len; + + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, skb->dev, + dst_output); +out: + if (!err) { + ICMP6MSGOUT_INC_STATS_BH(net, idev, ICMPV6_MLD2_REPORT); + ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS); + 
IP6_UPD_PO_STATS_BH(net, idev, IPSTATS_MIB_OUTMCAST, payload_len); + } else + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_OUTDISCARDS); + + rcu_read_unlock(); + return; + +err_out: + kfree_skb(skb); + goto out; +} + +static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel) +{ + return sizeof(struct mld2_grec) + 16 * mld_scount(pmc,type,gdel,sdel); +} + +static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, + int type, struct mld2_grec **ppgr) +{ + struct net_device *dev = pmc->idev->dev; + struct mld2_report *pmr; + struct mld2_grec *pgr; + + if (!skb) + skb = mld_newpack(dev, dev->mtu); + if (!skb) + return NULL; + pgr = (struct mld2_grec *)skb_put(skb, sizeof(struct mld2_grec)); + pgr->grec_type = type; + pgr->grec_auxwords = 0; + pgr->grec_nsrcs = 0; + pgr->grec_mca = pmc->mca_addr; /* structure copy */ + pmr = (struct mld2_report *)skb_transport_header(skb); + pmr->mld2r_ngrec = htons(ntohs(pmr->mld2r_ngrec)+1); + *ppgr = pgr; + return skb; +} + +#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \ + skb_tailroom(skb)) : 0) + +static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, + int type, int gdeleted, int sdeleted) +{ + struct net_device *dev = pmc->idev->dev; + struct mld2_report *pmr; + struct mld2_grec *pgr = NULL; + struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list; + int scount, stotal, first, isquery, truncate; + + if (pmc->mca_flags & MAF_NOREPORT) + return skb; + + isquery = type == MLD2_MODE_IS_INCLUDE || + type == MLD2_MODE_IS_EXCLUDE; + truncate = type == MLD2_MODE_IS_EXCLUDE || + type == MLD2_CHANGE_TO_EXCLUDE; + + stotal = scount = 0; + + psf_list = sdeleted ? &pmc->mca_tomb : &pmc->mca_sources; + + if (!*psf_list) + goto empty_source; + + pmr = skb ? 
(struct mld2_report *)skb_transport_header(skb) : NULL; + + /* EX and TO_EX get a fresh packet, if needed */ + if (truncate) { + if (pmr && pmr->mld2r_ngrec && + AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { + if (skb) + mld_sendpack(skb); + skb = mld_newpack(dev, dev->mtu); + } + } + first = 1; + psf_prev = NULL; + for (psf=*psf_list; psf; psf=psf_next) { + struct in6_addr *psrc; + + psf_next = psf->sf_next; + + if (!is_in(pmc, psf, type, gdeleted, sdeleted)) { + psf_prev = psf; + continue; + } + + /* clear marks on query responses */ + if (isquery) + psf->sf_gsresp = 0; + + if (AVAILABLE(skb) < sizeof(*psrc) + + first*sizeof(struct mld2_grec)) { + if (truncate && !first) + break; /* truncate these */ + if (pgr) + pgr->grec_nsrcs = htons(scount); + if (skb) + mld_sendpack(skb); + skb = mld_newpack(dev, dev->mtu); + first = 1; + scount = 0; + } + if (first) { + skb = add_grhead(skb, pmc, type, &pgr); + first = 0; + } + if (!skb) + return NULL; + psrc = (struct in6_addr *)skb_put(skb, sizeof(*psrc)); + *psrc = psf->sf_addr; + scount++; stotal++; + if ((type == MLD2_ALLOW_NEW_SOURCES || + type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) { + psf->sf_crcount--; + if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { + if (psf_prev) + psf_prev->sf_next = psf->sf_next; + else + *psf_list = psf->sf_next; + kfree(psf); + continue; + } + } + psf_prev = psf; + } + +empty_source: + if (!stotal) { + if (type == MLD2_ALLOW_NEW_SOURCES || + type == MLD2_BLOCK_OLD_SOURCES) + return skb; + if (pmc->mca_crcount || isquery) { + /* make sure we have room for group header */ + if (skb && AVAILABLE(skb) < sizeof(struct mld2_grec)) { + mld_sendpack(skb); + skb = NULL; /* add_grhead will get a new one */ + } + skb = add_grhead(skb, pmc, type, &pgr); + } + } + if (pgr) + pgr->grec_nsrcs = htons(scount); + + if (isquery) + pmc->mca_flags &= ~MAF_GSQUERY; /* clear query state */ + return skb; +} + +static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) +{ + struct sk_buff *skb = NULL; + int type; + + if (!pmc) { + read_lock_bh(&idev->lock); + for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + if (pmc->mca_flags & MAF_NOREPORT) + continue; + spin_lock_bh(&pmc->mca_lock); + if (pmc->mca_sfcount[MCAST_EXCLUDE]) + type = MLD2_MODE_IS_EXCLUDE; + else + type = MLD2_MODE_IS_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0); + spin_unlock_bh(&pmc->mca_lock); + } + read_unlock_bh(&idev->lock); + } else { + spin_lock_bh(&pmc->mca_lock); + if (pmc->mca_sfcount[MCAST_EXCLUDE]) + type = MLD2_MODE_IS_EXCLUDE; + else + type = MLD2_MODE_IS_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0); + spin_unlock_bh(&pmc->mca_lock); + } + if (skb) + mld_sendpack(skb); +} + +/* + * remove zero-count source records from a source filter list + */ +static void mld_clear_zeros(struct ip6_sf_list **ppsf) +{ + struct ip6_sf_list *psf_prev, *psf_next, *psf; + + psf_prev = NULL; + for (psf=*ppsf; psf; psf = psf_next) { + psf_next = psf->sf_next; + if (psf->sf_crcount == 0) { + if (psf_prev) + psf_prev->sf_next = psf->sf_next; + else + *ppsf = psf->sf_next; + kfree(psf); + } else + psf_prev = psf; + } +} + +static void mld_send_cr(struct inet6_dev *idev) +{ + struct ifmcaddr6 *pmc, *pmc_prev, *pmc_next; + struct sk_buff *skb = NULL; + int type, dtype; + + read_lock_bh(&idev->lock); + spin_lock(&idev->mc_lock); + + /* deleted MCA's */ + pmc_prev = NULL; + for (pmc=idev->mc_tomb; pmc; pmc=pmc_next) { + pmc_next = pmc->next; + if (pmc->mca_sfmode == MCAST_INCLUDE) { + type = MLD2_BLOCK_OLD_SOURCES; + dtype = 
MLD2_BLOCK_OLD_SOURCES; + skb = add_grec(skb, pmc, type, 1, 0); + skb = add_grec(skb, pmc, dtype, 1, 1); + } + if (pmc->mca_crcount) { + if (pmc->mca_sfmode == MCAST_EXCLUDE) { + type = MLD2_CHANGE_TO_INCLUDE; + skb = add_grec(skb, pmc, type, 1, 0); + } + pmc->mca_crcount--; + if (pmc->mca_crcount == 0) { + mld_clear_zeros(&pmc->mca_tomb); + mld_clear_zeros(&pmc->mca_sources); + } + } + if (pmc->mca_crcount == 0 && !pmc->mca_tomb && + !pmc->mca_sources) { + if (pmc_prev) + pmc_prev->next = pmc_next; + else + idev->mc_tomb = pmc_next; + in6_dev_put(pmc->idev); + kfree(pmc); + } else + pmc_prev = pmc; + } + spin_unlock(&idev->mc_lock); + + /* change recs */ + for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + spin_lock_bh(&pmc->mca_lock); + if (pmc->mca_sfcount[MCAST_EXCLUDE]) { + type = MLD2_BLOCK_OLD_SOURCES; + dtype = MLD2_ALLOW_NEW_SOURCES; + } else { + type = MLD2_ALLOW_NEW_SOURCES; + dtype = MLD2_BLOCK_OLD_SOURCES; + } + skb = add_grec(skb, pmc, type, 0, 0); + skb = add_grec(skb, pmc, dtype, 0, 1); /* deleted sources */ + + /* filter mode changes */ + if (pmc->mca_crcount) { + if (pmc->mca_sfmode == MCAST_EXCLUDE) + type = MLD2_CHANGE_TO_EXCLUDE; + else + type = MLD2_CHANGE_TO_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0); + pmc->mca_crcount--; + } + spin_unlock_bh(&pmc->mca_lock); + } + read_unlock_bh(&idev->lock); + if (!skb) + return; + (void) mld_sendpack(skb); +} + +static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) +{ + struct net *net = dev_net(dev); + struct sock *sk = net->ipv6.igmp_sk; + struct inet6_dev *idev; + struct sk_buff *skb; + struct mld_msg *hdr; + const struct in6_addr *snd_addr, *saddr; + struct in6_addr addr_buf; + int err, len, payload_len, full_len; + u8 ra[8] = { IPPROTO_ICMPV6, 0, + IPV6_TLV_ROUTERALERT, 2, 0, 0, + IPV6_TLV_PADN, 0 }; + struct flowi6 fl6; + struct dst_entry *dst; + + if (type == ICMPV6_MGM_REDUCTION) + snd_addr = &in6addr_linklocal_allrouters; + else + snd_addr = addr; + + len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); + payload_len = len + sizeof(ra); + full_len = sizeof(struct ipv6hdr) + payload_len; + + rcu_read_lock(); + IP6_UPD_PO_STATS(net, __in6_dev_get(dev), + IPSTATS_MIB_OUT, full_len); + rcu_read_unlock(); + + skb = sock_alloc_send_skb(sk, LL_ALLOCATED_SPACE(dev) + full_len, 1, &err); + + if (skb == NULL) { + rcu_read_lock(); + IP6_INC_STATS(net, __in6_dev_get(dev), + IPSTATS_MIB_OUTDISCARDS); + rcu_read_unlock(); + return; + } + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + + if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { + /* : + * use unspecified address as the source address + * when a valid link-local address is not available. 
+ */ + saddr = &in6addr_any; + } else + saddr = &addr_buf; + + ip6_nd_hdr(sk, skb, dev, saddr, snd_addr, NEXTHDR_HOP, payload_len); + + memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra)); + + hdr = (struct mld_msg *) skb_put(skb, sizeof(struct mld_msg)); + memset(hdr, 0, sizeof(struct mld_msg)); + hdr->mld_type = type; + ipv6_addr_copy(&hdr->mld_mca, addr); + + hdr->mld_cksum = csum_ipv6_magic(saddr, snd_addr, len, + IPPROTO_ICMPV6, + csum_partial(hdr, len, 0)); + + rcu_read_lock(); + idev = __in6_dev_get(skb->dev); + + dst = icmp6_dst_alloc(skb->dev, NULL, &ipv6_hdr(skb)->daddr); + if (!dst) { + err = -ENOMEM; + goto err_out; + } + + icmpv6_flow_init(sk, &fl6, type, + &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, + skb->dev->ifindex); + + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto err_out; + } + + skb_dst_set(skb, dst); + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, skb->dev, + dst_output); +out: + if (!err) { + ICMP6MSGOUT_INC_STATS(net, idev, type); + ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, full_len); + } else + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); + + rcu_read_unlock(); + return; + +err_out: + kfree_skb(skb); + goto out; +} + +static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, + const struct in6_addr *psfsrc) +{ + struct ip6_sf_list *psf, *psf_prev; + int rv = 0; + + psf_prev = NULL; + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) + break; + psf_prev = psf; + } + if (!psf || psf->sf_count[sfmode] == 0) { + /* source filter not found, or count wrong => bug */ + return -ESRCH; + } + psf->sf_count[sfmode]--; + if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { + struct inet6_dev *idev = pmc->idev; + + /* no more filters for this source */ + if (psf_prev) + psf_prev->sf_next = psf->sf_next; + else + pmc->mca_sources = psf->sf_next; + if (psf->sf_oldin && !(pmc->mca_flags & MAF_NOREPORT) && + !MLD_V1_SEEN(idev)) { + psf->sf_crcount = idev->mc_qrv; + psf->sf_next = pmc->mca_tomb; + pmc->mca_tomb = psf; + rv = 1; + } else + kfree(psf); + } + return rv; +} + +static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, + int sfmode, int sfcount, const struct in6_addr *psfsrc, + int delta) +{ + struct ifmcaddr6 *pmc; + int changerec = 0; + int i, err; + + if (!idev) + return -ENODEV; + read_lock_bh(&idev->lock); + for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + if (ipv6_addr_equal(pmca, &pmc->mca_addr)) + break; + } + if (!pmc) { + /* MCA not found?? 
bug */ + read_unlock_bh(&idev->lock); + return -ESRCH; + } + spin_lock_bh(&pmc->mca_lock); + sf_markstate(pmc); + if (!delta) { + if (!pmc->mca_sfcount[sfmode]) { + spin_unlock_bh(&pmc->mca_lock); + read_unlock_bh(&idev->lock); + return -EINVAL; + } + pmc->mca_sfcount[sfmode]--; + } + err = 0; + for (i=0; i<sfcount; i++) { + int rv = ip6_mc_del1_src(pmc, sfmode, &psfsrc[i]); + + changerec |= rv > 0; + if (!err && rv < 0) + err = rv; + } + if (pmc->mca_sfmode == MCAST_EXCLUDE && + pmc->mca_sfcount[MCAST_EXCLUDE] == 0 && + pmc->mca_sfcount[MCAST_INCLUDE]) { + struct ip6_sf_list *psf; + + /* filter mode change */ + pmc->mca_sfmode = MCAST_INCLUDE; + pmc->mca_crcount = idev->mc_qrv; + idev->mc_ifc_count = pmc->mca_crcount; + for (psf=pmc->mca_sources; psf; psf = psf->sf_next) + psf->sf_crcount = 0; + mld_ifc_event(pmc->idev); + } else if (sf_setstate(pmc) || changerec) + mld_ifc_event(pmc->idev); + spin_unlock_bh(&pmc->mca_lock); + read_unlock_bh(&idev->lock); + return err; +} + +/* + * Add multicast single-source filter to the interface list + */ +static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, + const struct in6_addr *psfsrc, int delta) +{ + struct ip6_sf_list *psf, *psf_prev; + + psf_prev = NULL; + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) + break; + psf_prev = psf; + } + if (!psf) { + psf = kzalloc(sizeof(*psf), GFP_ATOMIC); + if (!psf) + return -ENOBUFS; + + psf->sf_addr = *psfsrc; + if (psf_prev) { + psf_prev->sf_next = psf; + } else + pmc->mca_sources = psf; + } + psf->sf_count[sfmode]++; + return 0; +} + +static void sf_markstate(struct ifmcaddr6 *pmc) +{ + struct ip6_sf_list *psf; + int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE]; + + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) + if (pmc->mca_sfcount[MCAST_EXCLUDE]) { + psf->sf_oldin = mca_xcount == + psf->sf_count[MCAST_EXCLUDE] && + !psf->sf_count[MCAST_INCLUDE]; + } else + psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0; +} + +static int sf_setstate(struct ifmcaddr6 *pmc) +{ + struct ip6_sf_list *psf, *dpsf; + int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE]; + int qrv = pmc->idev->mc_qrv; + int new_in, rv; + + rv = 0; + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + if (pmc->mca_sfcount[MCAST_EXCLUDE]) { + new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && + !psf->sf_count[MCAST_INCLUDE]; + } else + new_in = psf->sf_count[MCAST_INCLUDE] != 0; + if (new_in) { + if (!psf->sf_oldin) { + struct ip6_sf_list *prev = NULL; + + for (dpsf=pmc->mca_tomb; dpsf; + dpsf=dpsf->sf_next) { + if (ipv6_addr_equal(&dpsf->sf_addr, + &psf->sf_addr)) + break; + prev = dpsf; + } + if (dpsf) { + if (prev) + prev->sf_next = dpsf->sf_next; + else + pmc->mca_tomb = dpsf->sf_next; + kfree(dpsf); + } + psf->sf_crcount = qrv; + rv++; + } + } else if (psf->sf_oldin) { + psf->sf_crcount = 0; + /* + * add or update "delete" records if an active filter + * is now inactive + */ + for (dpsf=pmc->mca_tomb; dpsf; dpsf=dpsf->sf_next) + if (ipv6_addr_equal(&dpsf->sf_addr, + &psf->sf_addr)) + break; + if (!dpsf) { + dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC); + if (!dpsf) + continue; + *dpsf = *psf; + /* pmc->mca_lock held by callers */ + dpsf->sf_next = pmc->mca_tomb; + pmc->mca_tomb = dpsf; + } + dpsf->sf_crcount = qrv; + rv++; + } + } + return rv; +} + +/* + * Add multicast source filter list to the interface list + */ +static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, + int sfmode, int sfcount, const struct in6_addr *psfsrc, + int delta) +{ + struct ifmcaddr6 *pmc; + int isexclude; + int i, err; + + if (!idev) + return -ENODEV; +
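/*
 * Editor's illustrative sketch, not part of the patch above: the
 * "is this source in the interface-level filter?" predicate that
 * sf_markstate()/sf_setstate() compute as sf_oldin/new_in.  With the
 * group in EXCLUDE mode a source counts as excluded only while every
 * EXCLUDE-mode listener blocks it (its per-source EXCLUDE count equals
 * the group's EXCLUDE count) and no INCLUDE-mode listener requests it;
 * in INCLUDE mode a single INCLUDE reference is enough.  The types are
 * simplified stand-ins for ip6_sf_list/ifmcaddr6 and the sketch builds
 * as an ordinary userspace program.
 */
#include <stdio.h>

enum { MCAST_INCLUDE = 0, MCAST_EXCLUDE = 1 };

struct sample_src {
        int count[2];                   /* stand-in for psf->sf_count[] */
};

static int source_in_filter(const struct sample_src *s, int group_mode,
                            int group_exclude_refs)
{
        if (group_mode == MCAST_EXCLUDE)
                return s->count[MCAST_EXCLUDE] == group_exclude_refs &&
                       !s->count[MCAST_INCLUDE];
        return s->count[MCAST_INCLUDE] != 0;
}

int main(void)
{
        /* two EXCLUDE-mode listeners, both blocking the source */
        struct sample_src blocked = { .count = { 0, 2 } };
        /* one of the two listeners also asks for the source via INCLUDE */
        struct sample_src wanted  = { .count = { 1, 1 } };

        printf("blocked source in filter: %d\n",
               source_in_filter(&blocked, MCAST_EXCLUDE, 2));
        printf("wanted source in filter:  %d\n",
               source_in_filter(&wanted, MCAST_EXCLUDE, 2));
        return 0;
}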
read_lock_bh(&idev->lock); + for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + if (ipv6_addr_equal(pmca, &pmc->mca_addr)) + break; + } + if (!pmc) { + /* MCA not found?? bug */ + read_unlock_bh(&idev->lock); + return -ESRCH; + } + spin_lock_bh(&pmc->mca_lock); + + sf_markstate(pmc); + isexclude = pmc->mca_sfmode == MCAST_EXCLUDE; + if (!delta) + pmc->mca_sfcount[sfmode]++; + err = 0; + for (i=0; i<sfcount; i++) { + err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i], delta); + if (err) + break; + } + if (err) { + int j; + + if (!delta) + pmc->mca_sfcount[sfmode]--; + for (j=0; j<i; j++) + ip6_mc_del1_src(pmc, sfmode, &psfsrc[j]); + } else if (isexclude != (pmc->mca_sfcount[MCAST_EXCLUDE] != 0)) { + struct ip6_sf_list *psf; + + /* filter mode change */ + if (pmc->mca_sfcount[MCAST_EXCLUDE]) + pmc->mca_sfmode = MCAST_EXCLUDE; + else if (pmc->mca_sfcount[MCAST_INCLUDE]) + pmc->mca_sfmode = MCAST_INCLUDE; + /* else no filters; keep old mode for reports */ + + pmc->mca_crcount = idev->mc_qrv; + idev->mc_ifc_count = pmc->mca_crcount; + for (psf=pmc->mca_sources; psf; psf = psf->sf_next) + psf->sf_crcount = 0; + mld_ifc_event(idev); + } else if (sf_setstate(pmc)) + mld_ifc_event(idev); + spin_unlock_bh(&pmc->mca_lock); + read_unlock_bh(&idev->lock); + return err; +} + +static void ip6_mc_clear_src(struct ifmcaddr6 *pmc) +{ + struct ip6_sf_list *psf, *nextpsf; + + for (psf=pmc->mca_tomb; psf; psf=nextpsf) { + nextpsf = psf->sf_next; + kfree(psf); + } + pmc->mca_tomb = NULL; + for (psf=pmc->mca_sources; psf; psf=nextpsf) { + nextpsf = psf->sf_next; + kfree(psf); + } + pmc->mca_sources = NULL; + pmc->mca_sfmode = MCAST_EXCLUDE; + pmc->mca_sfcount[MCAST_INCLUDE] = 0; + pmc->mca_sfcount[MCAST_EXCLUDE] = 1; +} + + +static void igmp6_join_group(struct ifmcaddr6 *ma) +{ + unsigned long delay; + + if (ma->mca_flags & MAF_NOREPORT) + return; + + igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); + + delay = net_random() % IGMP6_UNSOLICITED_IVAL; + + spin_lock_bh(&ma->mca_lock); + if (del_timer(&ma->mca_timer)) { + atomic_dec(&ma->mca_refcnt); + delay = ma->mca_timer.expires - jiffies; + } + + if (!mod_timer(&ma->mca_timer, jiffies + delay)) + atomic_inc(&ma->mca_refcnt); + ma->mca_flags |= MAF_TIMER_RUNNING | MAF_LAST_REPORTER; + spin_unlock_bh(&ma->mca_lock); +} + +static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, + struct inet6_dev *idev) +{ + int err; + + /* callers have the socket lock and a write lock on ipv6_sk_mc_lock, + * so no other readers or writers of iml or its sflist + */ + if (!iml->sflist) { + /* any-source empty exclude case */ + return ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0); + } + err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, + iml->sflist->sl_count, iml->sflist->sl_addr, 0); + sock_kfree_s(sk, iml->sflist, IP6_SFLSIZE(iml->sflist->sl_max)); + iml->sflist = NULL; + return err; +} + +static void igmp6_leave_group(struct ifmcaddr6 *ma) +{ + if (MLD_V1_SEEN(ma->idev)) { + if (ma->mca_flags & MAF_LAST_REPORTER) + igmp6_send(&ma->mca_addr, ma->idev->dev, + ICMPV6_MGM_REDUCTION); + } else { + mld_add_delrec(ma->idev, ma); + mld_ifc_event(ma->idev); + } +} + +static void mld_gq_timer_expire(unsigned long data) +{ + struct inet6_dev *idev = (struct inet6_dev *)data; + + idev->mc_gq_running = 0; + mld_send_report(idev, NULL); + __in6_dev_put(idev); +} + +static void mld_ifc_timer_expire(unsigned long data) +{ + struct inet6_dev *idev = (struct inet6_dev *)data; + + mld_send_cr(idev); + if (idev->mc_ifc_count) { + idev->mc_ifc_count--; + if (idev->mc_ifc_count) + mld_ifc_start_timer(idev, idev->mc_maxdelay); + } + __in6_dev_put(idev); +} + +static void mld_ifc_event(struct inet6_dev *idev) +{ + if (MLD_V1_SEEN(idev)) + return; + idev->mc_ifc_count = idev->mc_qrv; +
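/*
 * Editor's illustrative sketch, not part of the patch above: the
 * retransmission pattern behind mld_ifc_event() and
 * mld_ifc_timer_expire().  A filter-change report is sent once per
 * timer expiry and repeated until the countdown seeded from the
 * querier's robustness variable (idev->mc_qrv) reaches zero, so each
 * change is announced mc_qrv times in total.  Kernel timers are
 * replaced by a plain loop, and the value 2 is only an assumed sample
 * for the robustness variable.
 */
#include <stdio.h>

#define SAMPLE_QRV 2                    /* stand-in for idev->mc_qrv */

static void send_change_report(int round)
{
        printf("MLDv2 filter-change report, transmission %d\n", round);
}

int main(void)
{
        int ifc_count = SAMPLE_QRV;     /* seeded as in mld_ifc_event() */
        int round = 0;

        do {                            /* one iteration == one timer expiry */
                send_change_report(++round);
                ifc_count--;
        } while (ifc_count);            /* timer is re-armed while non-zero */

        return 0;
}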
mld_ifc_start_timer(idev, 1); +} + + +static void igmp6_timer_handler(unsigned long data) +{ + struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data; + + if (MLD_V1_SEEN(ma->idev)) + igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); + else + mld_send_report(ma->idev, ma); + + spin_lock(&ma->mca_lock); + ma->mca_flags |= MAF_LAST_REPORTER; + ma->mca_flags &= ~MAF_TIMER_RUNNING; + spin_unlock(&ma->mca_lock); + ma_put(ma); +} + +/* Device changing type */ + +void ipv6_mc_unmap(struct inet6_dev *idev) +{ + struct ifmcaddr6 *i; + + /* Install multicast list, except for all-nodes (already installed) */ + + read_lock_bh(&idev->lock); + for (i = idev->mc_list; i; i = i->next) + igmp6_group_dropped(i); + read_unlock_bh(&idev->lock); +} + +void ipv6_mc_remap(struct inet6_dev *idev) +{ + ipv6_mc_up(idev); +} + +/* Device going down */ + +void ipv6_mc_down(struct inet6_dev *idev) +{ + struct ifmcaddr6 *i; + + /* Withdraw multicast list */ + + read_lock_bh(&idev->lock); + idev->mc_ifc_count = 0; + if (del_timer(&idev->mc_ifc_timer)) + __in6_dev_put(idev); + idev->mc_gq_running = 0; + if (del_timer(&idev->mc_gq_timer)) + __in6_dev_put(idev); + + for (i = idev->mc_list; i; i=i->next) + igmp6_group_dropped(i); + read_unlock_bh(&idev->lock); + + mld_clear_delrec(idev); +} + + +/* Device going up */ + +void ipv6_mc_up(struct inet6_dev *idev) +{ + struct ifmcaddr6 *i; + + /* Install multicast list, except for all-nodes (already installed) */ + + read_lock_bh(&idev->lock); + for (i = idev->mc_list; i; i=i->next) + igmp6_group_added(i); + read_unlock_bh(&idev->lock); +} + +/* IPv6 device initialization. */ + +void ipv6_mc_init_dev(struct inet6_dev *idev) +{ + write_lock_bh(&idev->lock); + spin_lock_init(&idev->mc_lock); + idev->mc_gq_running = 0; + setup_timer(&idev->mc_gq_timer, mld_gq_timer_expire, + (unsigned long)idev); + idev->mc_tomb = NULL; + idev->mc_ifc_count = 0; + setup_timer(&idev->mc_ifc_timer, mld_ifc_timer_expire, + (unsigned long)idev); + idev->mc_qrv = MLD_QRV_DEFAULT; + idev->mc_maxdelay = IGMP6_UNSOLICITED_IVAL; + idev->mc_v1_seen = 0; + write_unlock_bh(&idev->lock); +} + +/* + * Device is about to be destroyed: clean up. + */ + +void ipv6_mc_destroy_dev(struct inet6_dev *idev) +{ + struct ifmcaddr6 *i; + + /* Deactivate timers */ + ipv6_mc_down(idev); + + /* Delete all-nodes address. */ + /* We cannot call ipv6_dev_mc_dec() directly, our caller in + * addrconf.c has NULL'd out dev->ip6_ptr so in6_dev_get() will + * fail. 
+ */ + __ipv6_dev_mc_dec(idev, &in6addr_linklocal_allnodes); + + if (idev->cnf.forwarding) + __ipv6_dev_mc_dec(idev, &in6addr_linklocal_allrouters); + + write_lock_bh(&idev->lock); + while ((i = idev->mc_list) != NULL) { + idev->mc_list = i->next; + write_unlock_bh(&idev->lock); + + igmp6_group_dropped(i); + ma_put(i); + + write_lock_bh(&idev->lock); + } + write_unlock_bh(&idev->lock); +} + +#ifdef CONFIG_PROC_FS +struct igmp6_mc_iter_state { + struct seq_net_private p; + struct net_device *dev; + struct inet6_dev *idev; +}; + +#define igmp6_mc_seq_private(seq) ((struct igmp6_mc_iter_state *)(seq)->private) + +static inline struct ifmcaddr6 *igmp6_mc_get_first(struct seq_file *seq) +{ + struct ifmcaddr6 *im = NULL; + struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); + struct net *net = seq_file_net(seq); + + state->idev = NULL; + for_each_netdev_rcu(net, state->dev) { + struct inet6_dev *idev; + idev = __in6_dev_get(state->dev); + if (!idev) + continue; + read_lock_bh(&idev->lock); + im = idev->mc_list; + if (im) { + state->idev = idev; + break; + } + read_unlock_bh(&idev->lock); + } + return im; +} + +static struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr6 *im) +{ + struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); + + im = im->next; + while (!im) { + if (likely(state->idev != NULL)) + read_unlock_bh(&state->idev->lock); + + state->dev = next_net_device_rcu(state->dev); + if (!state->dev) { + state->idev = NULL; + break; + } + state->idev = __in6_dev_get(state->dev); + if (!state->idev) + continue; + read_lock_bh(&state->idev->lock); + im = state->idev->mc_list; + } + return im; +} + +static struct ifmcaddr6 *igmp6_mc_get_idx(struct seq_file *seq, loff_t pos) +{ + struct ifmcaddr6 *im = igmp6_mc_get_first(seq); + if (im) + while (pos && (im = igmp6_mc_get_next(seq, im)) != NULL) + --pos; + return pos ? NULL : im; +} + +static void *igmp6_mc_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + rcu_read_lock(); + return igmp6_mc_get_idx(seq, *pos); +} + +static void *igmp6_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ifmcaddr6 *im = igmp6_mc_get_next(seq, v); + + ++*pos; + return im; +} + +static void igmp6_mc_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); + + if (likely(state->idev != NULL)) { + read_unlock_bh(&state->idev->lock); + state->idev = NULL; + } + state->dev = NULL; + rcu_read_unlock(); +} + +static int igmp6_mc_seq_show(struct seq_file *seq, void *v) +{ + struct ifmcaddr6 *im = (struct ifmcaddr6 *)v; + struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); + + seq_printf(seq, + "%-4d %-15s %pi6 %5d %08X %ld\n", + state->dev->ifindex, state->dev->name, + &im->mca_addr, + im->mca_users, im->mca_flags, + (im->mca_flags&MAF_TIMER_RUNNING) ? 
+ jiffies_to_clock_t(im->mca_timer.expires-jiffies) : 0); + return 0; +} + +static const struct seq_operations igmp6_mc_seq_ops = { + .start = igmp6_mc_seq_start, + .next = igmp6_mc_seq_next, + .stop = igmp6_mc_seq_stop, + .show = igmp6_mc_seq_show, +}; + +static int igmp6_mc_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &igmp6_mc_seq_ops, + sizeof(struct igmp6_mc_iter_state)); +} + +static const struct file_operations igmp6_mc_seq_fops = { + .owner = THIS_MODULE, + .open = igmp6_mc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +struct igmp6_mcf_iter_state { + struct seq_net_private p; + struct net_device *dev; + struct inet6_dev *idev; + struct ifmcaddr6 *im; +}; + +#define igmp6_mcf_seq_private(seq) ((struct igmp6_mcf_iter_state *)(seq)->private) + +static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq) +{ + struct ip6_sf_list *psf = NULL; + struct ifmcaddr6 *im = NULL; + struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); + struct net *net = seq_file_net(seq); + + state->idev = NULL; + state->im = NULL; + for_each_netdev_rcu(net, state->dev) { + struct inet6_dev *idev; + idev = __in6_dev_get(state->dev); + if (unlikely(idev == NULL)) + continue; + read_lock_bh(&idev->lock); + im = idev->mc_list; + if (likely(im != NULL)) { + spin_lock_bh(&im->mca_lock); + psf = im->mca_sources; + if (likely(psf != NULL)) { + state->im = im; + state->idev = idev; + break; + } + spin_unlock_bh(&im->mca_lock); + } + read_unlock_bh(&idev->lock); + } + return psf; +} + +static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_sf_list *psf) +{ + struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); + + psf = psf->sf_next; + while (!psf) { + spin_unlock_bh(&state->im->mca_lock); + state->im = state->im->next; + while (!state->im) { + if (likely(state->idev != NULL)) + read_unlock_bh(&state->idev->lock); + + state->dev = next_net_device_rcu(state->dev); + if (!state->dev) { + state->idev = NULL; + goto out; + } + state->idev = __in6_dev_get(state->dev); + if (!state->idev) + continue; + read_lock_bh(&state->idev->lock); + state->im = state->idev->mc_list; + } + if (!state->im) + break; + spin_lock_bh(&state->im->mca_lock); + psf = state->im->mca_sources; + } +out: + return psf; +} + +static struct ip6_sf_list *igmp6_mcf_get_idx(struct seq_file *seq, loff_t pos) +{ + struct ip6_sf_list *psf = igmp6_mcf_get_first(seq); + if (psf) + while (pos && (psf = igmp6_mcf_get_next(seq, psf)) != NULL) + --pos; + return pos ? NULL : psf; +} + +static void *igmp6_mcf_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + rcu_read_lock(); + return *pos ? 
igmp6_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *igmp6_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip6_sf_list *psf; + if (v == SEQ_START_TOKEN) + psf = igmp6_mcf_get_first(seq); + else + psf = igmp6_mcf_get_next(seq, v); + ++*pos; + return psf; +} + +static void igmp6_mcf_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); + if (likely(state->im != NULL)) { + spin_unlock_bh(&state->im->mca_lock); + state->im = NULL; + } + if (likely(state->idev != NULL)) { + read_unlock_bh(&state->idev->lock); + state->idev = NULL; + } + state->dev = NULL; + rcu_read_unlock(); +} + +static int igmp6_mcf_seq_show(struct seq_file *seq, void *v) +{ + struct ip6_sf_list *psf = (struct ip6_sf_list *)v; + struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, + "%3s %6s " + "%32s %32s %6s %6s\n", "Idx", + "Device", "Multicast Address", + "Source Address", "INC", "EXC"); + } else { + seq_printf(seq, + "%3d %6.6s %pi6 %pi6 %6lu %6lu\n", + state->dev->ifindex, state->dev->name, + &state->im->mca_addr, + &psf->sf_addr, + psf->sf_count[MCAST_INCLUDE], + psf->sf_count[MCAST_EXCLUDE]); + } + return 0; +} + +static const struct seq_operations igmp6_mcf_seq_ops = { + .start = igmp6_mcf_seq_start, + .next = igmp6_mcf_seq_next, + .stop = igmp6_mcf_seq_stop, + .show = igmp6_mcf_seq_show, +}; + +static int igmp6_mcf_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &igmp6_mcf_seq_ops, + sizeof(struct igmp6_mcf_iter_state)); +} + +static const struct file_operations igmp6_mcf_seq_fops = { + .owner = THIS_MODULE, + .open = igmp6_mcf_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int __net_init igmp6_proc_init(struct net *net) +{ + int err; + + err = -ENOMEM; + if (!proc_net_fops_create(net, "igmp6", S_IRUGO, &igmp6_mc_seq_fops)) + goto out; + if (!proc_net_fops_create(net, "mcfilter6", S_IRUGO, + &igmp6_mcf_seq_fops)) + goto out_proc_net_igmp6; + + err = 0; +out: + return err; + +out_proc_net_igmp6: + proc_net_remove(net, "igmp6"); + goto out; +} + +static void __net_exit igmp6_proc_exit(struct net *net) +{ + proc_net_remove(net, "mcfilter6"); + proc_net_remove(net, "igmp6"); +} +#else +static inline int igmp6_proc_init(struct net *net) +{ + return 0; +} +static inline void igmp6_proc_exit(struct net *net) +{ +} +#endif + +static int __net_init igmp6_net_init(struct net *net) +{ + int err; + + err = inet_ctl_sock_create(&net->ipv6.igmp_sk, PF_INET6, + SOCK_RAW, IPPROTO_ICMPV6, net); + if (err < 0) { + printk(KERN_ERR + "Failed to initialize the IGMP6 control socket (err %d).\n", + err); + goto out; + } + + inet6_sk(net->ipv6.igmp_sk)->hop_limit = 1; + + err = igmp6_proc_init(net); + if (err) + goto out_sock_create; +out: + return err; + +out_sock_create: + inet_ctl_sock_destroy(net->ipv6.igmp_sk); + goto out; +} + +static void __net_exit igmp6_net_exit(struct net *net) +{ + inet_ctl_sock_destroy(net->ipv6.igmp_sk); + igmp6_proc_exit(net); +} + +static struct pernet_operations igmp6_net_ops = { + .init = igmp6_net_init, + .exit = igmp6_net_exit, +}; + +int __init igmp6_init(void) +{ + return register_pernet_subsys(&igmp6_net_ops); +} + +void igmp6_cleanup(void) +{ + unregister_pernet_subsys(&igmp6_net_ops); +} diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c new file mode 100644 index 00000000..43242e6e --- /dev/null +++ b/net/ipv6/mip6.c @@ -0,0 +1,525 @@ +/* + * 
Copyright (C)2003-2006 Helsinki University of Technology + * Copyright (C)2003-2006 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * Authors: + * Noriaki TAKAMIYA @USAGI + * Masahide NAKAMURA @USAGI + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline unsigned int calc_padlen(unsigned int len, unsigned int n) +{ + return (n - len + 16) & 0x7; +} + +static inline void *mip6_padn(__u8 *data, __u8 padlen) +{ + if (!data) + return NULL; + if (padlen == 1) { + data[0] = IPV6_TLV_PAD0; + } else if (padlen > 1) { + data[0] = IPV6_TLV_PADN; + data[1] = padlen - 2; + if (padlen > 2) + memset(data+2, 0, data[1]); + } + return data + padlen; +} + +static inline void mip6_param_prob(struct sk_buff *skb, u8 code, int pos) +{ + icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos); +} + +static int mip6_mh_len(int type) +{ + int len = 0; + + switch (type) { + case IP6_MH_TYPE_BRR: + len = 0; + break; + case IP6_MH_TYPE_HOTI: + case IP6_MH_TYPE_COTI: + case IP6_MH_TYPE_BU: + case IP6_MH_TYPE_BACK: + len = 1; + break; + case IP6_MH_TYPE_HOT: + case IP6_MH_TYPE_COT: + case IP6_MH_TYPE_BERROR: + len = 2; + break; + } + return len; +} + +static int mip6_mh_filter(struct sock *sk, struct sk_buff *skb) +{ + struct ip6_mh *mh; + + if (!pskb_may_pull(skb, (skb_transport_offset(skb)) + 8) || + !pskb_may_pull(skb, (skb_transport_offset(skb) + + ((skb_transport_header(skb)[1] + 1) << 3)))) + return -1; + + mh = (struct ip6_mh *)skb_transport_header(skb); + + if (mh->ip6mh_hdrlen < mip6_mh_len(mh->ip6mh_type)) { + LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH message too short: %d vs >=%d\n", + mh->ip6mh_hdrlen, mip6_mh_len(mh->ip6mh_type)); + mip6_param_prob(skb, 0, ((&mh->ip6mh_hdrlen) - + skb_network_header(skb))); + return -1; + } + + if (mh->ip6mh_proto != IPPROTO_NONE) { + LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH invalid payload proto = %d\n", + mh->ip6mh_proto); + mip6_param_prob(skb, 0, ((&mh->ip6mh_proto) - + skb_network_header(skb))); + return -1; + } + + return 0; +} + +struct mip6_report_rate_limiter { + spinlock_t lock; + struct timeval stamp; + int iif; + struct in6_addr src; + struct in6_addr dst; +}; + +static struct mip6_report_rate_limiter mip6_report_rl = { + .lock = __SPIN_LOCK_UNLOCKED(mip6_report_rl.lock) +}; + +static int mip6_destopt_input(struct xfrm_state *x, struct sk_buff *skb) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct ipv6_destopt_hdr *destopt = (struct ipv6_destopt_hdr *)skb->data; + int err = destopt->nexthdr; + + spin_lock(&x->lock); + if (!ipv6_addr_equal(&iph->saddr, (struct in6_addr *)x->coaddr) && + !ipv6_addr_any((struct in6_addr *)x->coaddr)) + err = -ENOENT; + spin_unlock(&x->lock); + + return err; +} + +/* Destination Option Header is inserted. 
+ * IP Header's src address is replaced with Home Address Option in + * Destination Option Header. + */ +static int mip6_destopt_output(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ipv6hdr *iph; + struct ipv6_destopt_hdr *dstopt; + struct ipv6_destopt_hao *hao; + u8 nexthdr; + int len; + + skb_push(skb, -skb_network_offset(skb)); + iph = ipv6_hdr(skb); + + nexthdr = *skb_mac_header(skb); + *skb_mac_header(skb) = IPPROTO_DSTOPTS; + + dstopt = (struct ipv6_destopt_hdr *)skb_transport_header(skb); + dstopt->nexthdr = nexthdr; + + hao = mip6_padn((char *)(dstopt + 1), + calc_padlen(sizeof(*dstopt), 6)); + + hao->type = IPV6_TLV_HAO; + BUILD_BUG_ON(sizeof(*hao) != 18); + hao->length = sizeof(*hao) - 2; + + len = ((char *)hao - (char *)dstopt) + sizeof(*hao); + + memcpy(&hao->addr, &iph->saddr, sizeof(hao->addr)); + spin_lock_bh(&x->lock); + memcpy(&iph->saddr, x->coaddr, sizeof(iph->saddr)); + spin_unlock_bh(&x->lock); + + WARN_ON(len != x->props.header_len); + dstopt->hdrlen = (x->props.header_len >> 3) - 1; + + return 0; +} + +static inline int mip6_report_rl_allow(struct timeval *stamp, + const struct in6_addr *dst, + const struct in6_addr *src, int iif) +{ + int allow = 0; + + spin_lock_bh(&mip6_report_rl.lock); + if (mip6_report_rl.stamp.tv_sec != stamp->tv_sec || + mip6_report_rl.stamp.tv_usec != stamp->tv_usec || + mip6_report_rl.iif != iif || + !ipv6_addr_equal(&mip6_report_rl.src, src) || + !ipv6_addr_equal(&mip6_report_rl.dst, dst)) { + mip6_report_rl.stamp.tv_sec = stamp->tv_sec; + mip6_report_rl.stamp.tv_usec = stamp->tv_usec; + mip6_report_rl.iif = iif; + ipv6_addr_copy(&mip6_report_rl.src, src); + ipv6_addr_copy(&mip6_report_rl.dst, dst); + allow = 1; + } + spin_unlock_bh(&mip6_report_rl.lock); + return allow; +} + +static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb, + const struct flowi *fl) +{ + struct net *net = xs_net(x); + struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb; + const struct flowi6 *fl6 = &fl->u.ip6; + struct ipv6_destopt_hao *hao = NULL; + struct xfrm_selector sel; + int offset; + struct timeval stamp; + int err = 0; + + if (unlikely(fl6->flowi6_proto == IPPROTO_MH && + fl6->fl6_mh_type <= IP6_MH_TYPE_MAX)) + goto out; + + if (likely(opt->dsthao)) { + offset = ipv6_find_tlv(skb, opt->dsthao, IPV6_TLV_HAO); + if (likely(offset >= 0)) + hao = (struct ipv6_destopt_hao *) + (skb_network_header(skb) + offset); + } + + skb_get_timestamp(skb, &stamp); + + if (!mip6_report_rl_allow(&stamp, &ipv6_hdr(skb)->daddr, + hao ? &hao->addr : &ipv6_hdr(skb)->saddr, + opt->iif)) + goto out; + + memset(&sel, 0, sizeof(sel)); + memcpy(&sel.daddr, (xfrm_address_t *)&ipv6_hdr(skb)->daddr, + sizeof(sel.daddr)); + sel.prefixlen_d = 128; + memcpy(&sel.saddr, (xfrm_address_t *)&ipv6_hdr(skb)->saddr, + sizeof(sel.saddr)); + sel.prefixlen_s = 128; + sel.family = AF_INET6; + sel.proto = fl6->flowi6_proto; + sel.dport = xfrm_flowi_dport(fl, &fl6->uli); + if (sel.dport) + sel.dport_mask = htons(~0); + sel.sport = xfrm_flowi_sport(fl, &fl6->uli); + if (sel.sport) + sel.sport_mask = htons(~0); + sel.ifindex = fl6->flowi6_oif; + + err = km_report(net, IPPROTO_DSTOPTS, &sel, + (hao ? 
(xfrm_address_t *)&hao->addr : NULL)); + + out: + return err; +} + +static int mip6_destopt_offset(struct xfrm_state *x, struct sk_buff *skb, + u8 **nexthdr) +{ + u16 offset = sizeof(struct ipv6hdr); + struct ipv6_opt_hdr *exthdr = + (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); + const unsigned char *nh = skb_network_header(skb); + unsigned int packet_len = skb->tail - skb->network_header; + int found_rhdr = 0; + + *nexthdr = &ipv6_hdr(skb)->nexthdr; + + while (offset + 1 <= packet_len) { + + switch (**nexthdr) { + case NEXTHDR_HOP: + break; + case NEXTHDR_ROUTING: + found_rhdr = 1; + break; + case NEXTHDR_DEST: + /* + * HAO MUST NOT appear more than once. + * XXX: It is better to try to find by the end of + * XXX: packet if HAO exists. + */ + if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) { + LIMIT_NETDEBUG(KERN_WARNING "mip6: hao exists already, override\n"); + return offset; + } + + if (found_rhdr) + return offset; + + break; + default: + return offset; + } + + offset += ipv6_optlen(exthdr); + *nexthdr = &exthdr->nexthdr; + exthdr = (struct ipv6_opt_hdr *)(nh + offset); + } + + return offset; +} + +static int mip6_destopt_init_state(struct xfrm_state *x) +{ + if (x->id.spi) { + printk(KERN_INFO "%s: spi is not 0: %u\n", __func__, + x->id.spi); + return -EINVAL; + } + if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) { + printk(KERN_INFO "%s: state's mode is not %u: %u\n", + __func__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode); + return -EINVAL; + } + + x->props.header_len = sizeof(struct ipv6_destopt_hdr) + + calc_padlen(sizeof(struct ipv6_destopt_hdr), 6) + + sizeof(struct ipv6_destopt_hao); + WARN_ON(x->props.header_len != 24); + + return 0; +} + +/* + * Do nothing about destroying since it has no specific operation for + * destination options header unlike IPsec protocols. + */ +static void mip6_destopt_destroy(struct xfrm_state *x) +{ +} + +static const struct xfrm_type mip6_destopt_type = +{ + .description = "MIP6DESTOPT", + .owner = THIS_MODULE, + .proto = IPPROTO_DSTOPTS, + .flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_LOCAL_COADDR, + .init_state = mip6_destopt_init_state, + .destructor = mip6_destopt_destroy, + .input = mip6_destopt_input, + .output = mip6_destopt_output, + .reject = mip6_destopt_reject, + .hdr_offset = mip6_destopt_offset, +}; + +static int mip6_rthdr_input(struct xfrm_state *x, struct sk_buff *skb) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct rt2_hdr *rt2 = (struct rt2_hdr *)skb->data; + int err = rt2->rt_hdr.nexthdr; + + spin_lock(&x->lock); + if (!ipv6_addr_equal(&iph->daddr, (struct in6_addr *)x->coaddr) && + !ipv6_addr_any((struct in6_addr *)x->coaddr)) + err = -ENOENT; + spin_unlock(&x->lock); + + return err; +} + +/* Routing Header type 2 is inserted. + * IP Header's dst address is replaced with Routing Header's Home Address. 
+ */ +static int mip6_rthdr_output(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ipv6hdr *iph; + struct rt2_hdr *rt2; + u8 nexthdr; + + skb_push(skb, -skb_network_offset(skb)); + iph = ipv6_hdr(skb); + + nexthdr = *skb_mac_header(skb); + *skb_mac_header(skb) = IPPROTO_ROUTING; + + rt2 = (struct rt2_hdr *)skb_transport_header(skb); + rt2->rt_hdr.nexthdr = nexthdr; + rt2->rt_hdr.hdrlen = (x->props.header_len >> 3) - 1; + rt2->rt_hdr.type = IPV6_SRCRT_TYPE_2; + rt2->rt_hdr.segments_left = 1; + memset(&rt2->reserved, 0, sizeof(rt2->reserved)); + + WARN_ON(rt2->rt_hdr.hdrlen != 2); + + memcpy(&rt2->addr, &iph->daddr, sizeof(rt2->addr)); + spin_lock_bh(&x->lock); + memcpy(&iph->daddr, x->coaddr, sizeof(iph->daddr)); + spin_unlock_bh(&x->lock); + + return 0; +} + +static int mip6_rthdr_offset(struct xfrm_state *x, struct sk_buff *skb, + u8 **nexthdr) +{ + u16 offset = sizeof(struct ipv6hdr); + struct ipv6_opt_hdr *exthdr = + (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); + const unsigned char *nh = skb_network_header(skb); + unsigned int packet_len = skb->tail - skb->network_header; + int found_rhdr = 0; + + *nexthdr = &ipv6_hdr(skb)->nexthdr; + + while (offset + 1 <= packet_len) { + + switch (**nexthdr) { + case NEXTHDR_HOP: + break; + case NEXTHDR_ROUTING: + if (offset + 3 <= packet_len) { + struct ipv6_rt_hdr *rt; + rt = (struct ipv6_rt_hdr *)(nh + offset); + if (rt->type != 0) + return offset; + } + found_rhdr = 1; + break; + case NEXTHDR_DEST: + if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) + return offset; + + if (found_rhdr) + return offset; + + break; + default: + return offset; + } + + offset += ipv6_optlen(exthdr); + *nexthdr = &exthdr->nexthdr; + exthdr = (struct ipv6_opt_hdr *)(nh + offset); + } + + return offset; +} + +static int mip6_rthdr_init_state(struct xfrm_state *x) +{ + if (x->id.spi) { + printk(KERN_INFO "%s: spi is not 0: %u\n", __func__, + x->id.spi); + return -EINVAL; + } + if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) { + printk(KERN_INFO "%s: state's mode is not %u: %u\n", + __func__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode); + return -EINVAL; + } + + x->props.header_len = sizeof(struct rt2_hdr); + + return 0; +} + +/* + * Do nothing about destroying since it has no specific operation for routing + * header type 2 unlike IPsec protocols. 
+ */ +static void mip6_rthdr_destroy(struct xfrm_state *x) +{ +} + +static const struct xfrm_type mip6_rthdr_type = +{ + .description = "MIP6RT", + .owner = THIS_MODULE, + .proto = IPPROTO_ROUTING, + .flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_REMOTE_COADDR, + .init_state = mip6_rthdr_init_state, + .destructor = mip6_rthdr_destroy, + .input = mip6_rthdr_input, + .output = mip6_rthdr_output, + .hdr_offset = mip6_rthdr_offset, +}; + +static int __init mip6_init(void) +{ + printk(KERN_INFO "Mobile IPv6\n"); + + if (xfrm_register_type(&mip6_destopt_type, AF_INET6) < 0) { + printk(KERN_INFO "%s: can't add xfrm type(destopt)\n", __func__); + goto mip6_destopt_xfrm_fail; + } + if (xfrm_register_type(&mip6_rthdr_type, AF_INET6) < 0) { + printk(KERN_INFO "%s: can't add xfrm type(rthdr)\n", __func__); + goto mip6_rthdr_xfrm_fail; + } + if (rawv6_mh_filter_register(mip6_mh_filter) < 0) { + printk(KERN_INFO "%s: can't add rawv6 mh filter\n", __func__); + goto mip6_rawv6_mh_fail; + } + + + return 0; + + mip6_rawv6_mh_fail: + xfrm_unregister_type(&mip6_rthdr_type, AF_INET6); + mip6_rthdr_xfrm_fail: + xfrm_unregister_type(&mip6_destopt_type, AF_INET6); + mip6_destopt_xfrm_fail: + return -EAGAIN; +} + +static void __exit mip6_fini(void) +{ + if (rawv6_mh_filter_unregister(mip6_mh_filter) < 0) + printk(KERN_INFO "%s: can't remove rawv6 mh filter\n", __func__); + if (xfrm_unregister_type(&mip6_rthdr_type, AF_INET6) < 0) + printk(KERN_INFO "%s: can't remove xfrm type(rthdr)\n", __func__); + if (xfrm_unregister_type(&mip6_destopt_type, AF_INET6) < 0) + printk(KERN_INFO "%s: can't remove xfrm type(destopt)\n", __func__); +} + +module_init(mip6_init); +module_exit(mip6_fini); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_DSTOPTS); +MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_ROUTING); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c new file mode 100644 index 00000000..10a8d411 --- /dev/null +++ b/net/ipv6/ndisc.c @@ -0,0 +1,1896 @@ +/* + * Neighbour Discovery for IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * Mike Shaver + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * + * Pierre Ynard : export userland ND options + * through netlink (RDNSS support) + * Lars Fenneberg : fixed MTU setting on receipt + * of an RA. + * Janos Farkas : kmalloc failure checks + * Alexey Kuznetsov : state machine reworked + * and moved to net/core. + * Pekka Savola : RFC2461 validation + * YOSHIFUJI Hideaki @USAGI : Verify ND options properly + */ + +/* Set to 3 to get tracing... */ +#define ND_DEBUG 1 + +#define ND_PRINTK(fmt, args...) do { if (net_ratelimit()) { printk(fmt, ## args); } } while(0) +#define ND_NOPRINTK(x...) 
do { ; } while(0) +#define ND_PRINTK0 ND_PRINTK +#define ND_PRINTK1 ND_NOPRINTK +#define ND_PRINTK2 ND_NOPRINTK +#define ND_PRINTK3 ND_NOPRINTK +#if ND_DEBUG >= 1 +#undef ND_PRINTK1 +#define ND_PRINTK1 ND_PRINTK +#endif +#if ND_DEBUG >= 2 +#undef ND_PRINTK2 +#define ND_PRINTK2 ND_PRINTK +#endif +#if ND_DEBUG >= 3 +#undef ND_PRINTK3 +#define ND_PRINTK3 ND_PRINTK +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_SYSCTL +#include +#endif + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include + +static u32 ndisc_hash(const void *pkey, + const struct net_device *dev, + __u32 rnd); +static int ndisc_constructor(struct neighbour *neigh); +static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb); +static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb); +static int pndisc_constructor(struct pneigh_entry *n); +static void pndisc_destructor(struct pneigh_entry *n); +static void pndisc_redo(struct sk_buff *skb); + +static const struct neigh_ops ndisc_generic_ops = { + .family = AF_INET6, + .solicit = ndisc_solicit, + .error_report = ndisc_error_report, + .output = neigh_resolve_output, + .connected_output = neigh_connected_output, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + +static const struct neigh_ops ndisc_hh_ops = { + .family = AF_INET6, + .solicit = ndisc_solicit, + .error_report = ndisc_error_report, + .output = neigh_resolve_output, + .connected_output = neigh_resolve_output, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + + +static const struct neigh_ops ndisc_direct_ops = { + .family = AF_INET6, + .output = dev_queue_xmit, + .connected_output = dev_queue_xmit, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + +struct neigh_table nd_tbl = { + .family = AF_INET6, + .entry_size = sizeof(struct neighbour) + sizeof(struct in6_addr), + .key_len = sizeof(struct in6_addr), + .hash = ndisc_hash, + .constructor = ndisc_constructor, + .pconstructor = pndisc_constructor, + .pdestructor = pndisc_destructor, + .proxy_redo = pndisc_redo, + .id = "ndisc_cache", + .parms = { + .tbl = &nd_tbl, + .base_reachable_time = ND_REACHABLE_TIME, + .retrans_time = ND_RETRANS_TIMER, + .gc_staletime = 60 * HZ, + .reachable_time = ND_REACHABLE_TIME, + .delay_probe_time = 5 * HZ, + .queue_len = 3, + .ucast_probes = 3, + .mcast_probes = 3, + .anycast_delay = 1 * HZ, + .proxy_delay = (8 * HZ) / 10, + .proxy_qlen = 64, + }, + .gc_interval = 30 * HZ, + .gc_thresh1 = 128, + .gc_thresh2 = 512, + .gc_thresh3 = 1024, +}; + +/* ND options */ +struct ndisc_options { + struct nd_opt_hdr *nd_opt_array[__ND_OPT_ARRAY_MAX]; +#ifdef CONFIG_IPV6_ROUTE_INFO + struct nd_opt_hdr *nd_opts_ri; + struct nd_opt_hdr *nd_opts_ri_end; +#endif + struct nd_opt_hdr *nd_useropts; + struct nd_opt_hdr *nd_useropts_end; +}; + +#define nd_opts_src_lladdr nd_opt_array[ND_OPT_SOURCE_LL_ADDR] +#define nd_opts_tgt_lladdr nd_opt_array[ND_OPT_TARGET_LL_ADDR] +#define nd_opts_pi nd_opt_array[ND_OPT_PREFIX_INFO] +#define nd_opts_pi_end nd_opt_array[__ND_OPT_PREFIX_INFO_END] +#define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR] +#define nd_opts_mtu nd_opt_array[ND_OPT_MTU] + +#define NDISC_OPT_SPACE(len) (((len)+2+7)&~7) + +/* + * Return the padding between the option length and the start of the + * link 
addr. Currently only IP-over-InfiniBand needs this, although + * if RFC 3831 IPv6-over-Fibre Channel is ever implemented it may + * also need a pad of 2. + */ +static int ndisc_addr_option_pad(unsigned short type) +{ + switch (type) { + case ARPHRD_INFINIBAND: return 2; + default: return 0; + } +} + +static inline int ndisc_opt_addr_space(struct net_device *dev) +{ + return NDISC_OPT_SPACE(dev->addr_len + ndisc_addr_option_pad(dev->type)); +} + +static u8 *ndisc_fill_addr_option(u8 *opt, int type, void *data, int data_len, + unsigned short addr_type) +{ + int space = NDISC_OPT_SPACE(data_len); + int pad = ndisc_addr_option_pad(addr_type); + + opt[0] = type; + opt[1] = space>>3; + + memset(opt + 2, 0, pad); + opt += pad; + space -= pad; + + memcpy(opt+2, data, data_len); + data_len += 2; + opt += data_len; + if ((space -= data_len) > 0) + memset(opt, 0, space); + return opt + space; +} + +static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, + struct nd_opt_hdr *end) +{ + int type; + if (!cur || !end || cur >= end) + return NULL; + type = cur->nd_opt_type; + do { + cur = ((void *)cur) + (cur->nd_opt_len << 3); + } while(cur < end && cur->nd_opt_type != type); + return cur <= end && cur->nd_opt_type == type ? cur : NULL; +} + +static inline int ndisc_is_useropt(struct nd_opt_hdr *opt) +{ + return opt->nd_opt_type == ND_OPT_RDNSS; +} + +static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur, + struct nd_opt_hdr *end) +{ + if (!cur || !end || cur >= end) + return NULL; + do { + cur = ((void *)cur) + (cur->nd_opt_len << 3); + } while(cur < end && !ndisc_is_useropt(cur)); + return cur <= end && ndisc_is_useropt(cur) ? cur : NULL; +} + +static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, + struct ndisc_options *ndopts) +{ + struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt; + + if (!nd_opt || opt_len < 0 || !ndopts) + return NULL; + memset(ndopts, 0, sizeof(*ndopts)); + while (opt_len) { + int l; + if (opt_len < sizeof(struct nd_opt_hdr)) + return NULL; + l = nd_opt->nd_opt_len << 3; + if (opt_len < l || l == 0) + return NULL; + switch (nd_opt->nd_opt_type) { + case ND_OPT_SOURCE_LL_ADDR: + case ND_OPT_TARGET_LL_ADDR: + case ND_OPT_MTU: + case ND_OPT_REDIRECT_HDR: + if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { + ND_PRINTK2(KERN_WARNING + "%s(): duplicated ND6 option found: type=%d\n", + __func__, + nd_opt->nd_opt_type); + } else { + ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; + } + break; + case ND_OPT_PREFIX_INFO: + ndopts->nd_opts_pi_end = nd_opt; + if (!ndopts->nd_opt_array[nd_opt->nd_opt_type]) + ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; + break; +#ifdef CONFIG_IPV6_ROUTE_INFO + case ND_OPT_ROUTE_INFO: + ndopts->nd_opts_ri_end = nd_opt; + if (!ndopts->nd_opts_ri) + ndopts->nd_opts_ri = nd_opt; + break; +#endif + default: + if (ndisc_is_useropt(nd_opt)) { + ndopts->nd_useropts_end = nd_opt; + if (!ndopts->nd_useropts) + ndopts->nd_useropts = nd_opt; + } else { + /* + * Unknown options must be silently ignored, + * to accommodate future extension to the + * protocol. 
+ */ + ND_PRINTK2(KERN_NOTICE + "%s(): ignored unsupported option; type=%d, len=%d\n", + __func__, + nd_opt->nd_opt_type, nd_opt->nd_opt_len); + } + } + opt_len -= l; + nd_opt = ((void *)nd_opt) + l; + } + return ndopts; +} + +static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p, + struct net_device *dev) +{ + u8 *lladdr = (u8 *)(p + 1); + int lladdrlen = p->nd_opt_len << 3; + int prepad = ndisc_addr_option_pad(dev->type); + if (lladdrlen != NDISC_OPT_SPACE(dev->addr_len + prepad)) + return NULL; + return lladdr + prepad; +} + +int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, int dir) +{ + switch (dev->type) { + case ARPHRD_ETHER: + case ARPHRD_IEEE802: /* Not sure. Check it later. --ANK */ + case ARPHRD_FDDI: + ipv6_eth_mc_map(addr, buf); + return 0; + case ARPHRD_IEEE802_TR: + ipv6_tr_mc_map(addr,buf); + return 0; + case ARPHRD_ARCNET: + ipv6_arcnet_mc_map(addr, buf); + return 0; + case ARPHRD_INFINIBAND: + ipv6_ib_mc_map(addr, dev->broadcast, buf); + return 0; + case ARPHRD_IPGRE: + return ipv6_ipgre_mc_map(addr, dev->broadcast, buf); + default: + if (dir) { + memcpy(buf, dev->broadcast, dev->addr_len); + return 0; + } + } + return -EINVAL; +} + +EXPORT_SYMBOL(ndisc_mc_map); + +static u32 ndisc_hash(const void *pkey, + const struct net_device *dev, + __u32 hash_rnd) +{ + const u32 *p32 = pkey; + u32 addr_hash, i; + + addr_hash = 0; + for (i = 0; i < (sizeof(struct in6_addr) / sizeof(u32)); i++) + addr_hash ^= *p32++; + + return jhash_2words(addr_hash, dev->ifindex, hash_rnd); +} + +static int ndisc_constructor(struct neighbour *neigh) +{ + struct in6_addr *addr = (struct in6_addr*)&neigh->primary_key; + struct net_device *dev = neigh->dev; + struct inet6_dev *in6_dev; + struct neigh_parms *parms; + int is_multicast = ipv6_addr_is_multicast(addr); + + rcu_read_lock(); + in6_dev = in6_dev_get(dev); + if (in6_dev == NULL) { + rcu_read_unlock(); + return -EINVAL; + } + + parms = in6_dev->nd_parms; + __neigh_parms_put(neigh->parms); + neigh->parms = neigh_parms_clone(parms); + rcu_read_unlock(); + + neigh->type = is_multicast ? 
RTN_MULTICAST : RTN_UNICAST; + if (!dev->header_ops) { + neigh->nud_state = NUD_NOARP; + neigh->ops = &ndisc_direct_ops; + neigh->output = neigh->ops->queue_xmit; + } else { + if (is_multicast) { + neigh->nud_state = NUD_NOARP; + ndisc_mc_map(addr, neigh->ha, dev, 1); + } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { + neigh->nud_state = NUD_NOARP; + memcpy(neigh->ha, dev->dev_addr, dev->addr_len); + if (dev->flags&IFF_LOOPBACK) + neigh->type = RTN_LOCAL; + } else if (dev->flags&IFF_POINTOPOINT) { + neigh->nud_state = NUD_NOARP; + memcpy(neigh->ha, dev->broadcast, dev->addr_len); + } + if (dev->header_ops->cache) + neigh->ops = &ndisc_hh_ops; + else + neigh->ops = &ndisc_generic_ops; + if (neigh->nud_state&NUD_VALID) + neigh->output = neigh->ops->connected_output; + else + neigh->output = neigh->ops->output; + } + in6_dev_put(in6_dev); + return 0; +} + +static int pndisc_constructor(struct pneigh_entry *n) +{ + struct in6_addr *addr = (struct in6_addr*)&n->key; + struct in6_addr maddr; + struct net_device *dev = n->dev; + + if (dev == NULL || __in6_dev_get(dev) == NULL) + return -EINVAL; + addrconf_addr_solict_mult(addr, &maddr); + ipv6_dev_mc_inc(dev, &maddr); + return 0; +} + +static void pndisc_destructor(struct pneigh_entry *n) +{ + struct in6_addr *addr = (struct in6_addr*)&n->key; + struct in6_addr maddr; + struct net_device *dev = n->dev; + + if (dev == NULL || __in6_dev_get(dev) == NULL) + return; + addrconf_addr_solict_mult(addr, &maddr); + ipv6_dev_mc_dec(dev, &maddr); +} + +struct sk_buff *ndisc_build_skb(struct net_device *dev, + const struct in6_addr *daddr, + const struct in6_addr *saddr, + struct icmp6hdr *icmp6h, + const struct in6_addr *target, + int llinfo) +{ + struct net *net = dev_net(dev); + struct sock *sk = net->ipv6.ndisc_sk; + struct sk_buff *skb; + struct icmp6hdr *hdr; + int len; + int err; + u8 *opt; + + if (!dev->addr_len) + llinfo = 0; + + len = sizeof(struct icmp6hdr) + (target ? 
sizeof(*target) : 0); + if (llinfo) + len += ndisc_opt_addr_space(dev); + + skb = sock_alloc_send_skb(sk, + (MAX_HEADER + sizeof(struct ipv6hdr) + + len + LL_ALLOCATED_SPACE(dev)), + 1, &err); + if (!skb) { + ND_PRINTK0(KERN_ERR + "ICMPv6 ND: %s() failed to allocate an skb, err=%d.\n", + __func__, err); + return NULL; + } + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len); + + skb->transport_header = skb->tail; + skb_put(skb, len); + + hdr = (struct icmp6hdr *)skb_transport_header(skb); + memcpy(hdr, icmp6h, sizeof(*hdr)); + + opt = skb_transport_header(skb) + sizeof(struct icmp6hdr); + if (target) { + ipv6_addr_copy((struct in6_addr *)opt, target); + opt += sizeof(*target); + } + + if (llinfo) + ndisc_fill_addr_option(opt, llinfo, dev->dev_addr, + dev->addr_len, dev->type); + + hdr->icmp6_cksum = csum_ipv6_magic(saddr, daddr, len, + IPPROTO_ICMPV6, + csum_partial(hdr, + len, 0)); + + return skb; +} + +EXPORT_SYMBOL(ndisc_build_skb); + +void ndisc_send_skb(struct sk_buff *skb, + struct net_device *dev, + struct neighbour *neigh, + const struct in6_addr *daddr, + const struct in6_addr *saddr, + struct icmp6hdr *icmp6h) +{ + struct flowi6 fl6; + struct dst_entry *dst; + struct net *net = dev_net(dev); + struct sock *sk = net->ipv6.ndisc_sk; + struct inet6_dev *idev; + int err; + u8 type; + + type = icmp6h->icmp6_type; + + icmpv6_flow_init(sk, &fl6, type, saddr, daddr, dev->ifindex); + + dst = icmp6_dst_alloc(dev, neigh, daddr); + if (!dst) { + kfree_skb(skb); + return; + } + + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) { + kfree_skb(skb); + return; + } + + skb_dst_set(skb, dst); + + idev = in6_dev_get(dst->dev); + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); + + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev, + dst_output); + if (!err) { + ICMP6MSGOUT_INC_STATS(net, idev, type); + ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); + } + + if (likely(idev != NULL)) + in6_dev_put(idev); +} + +EXPORT_SYMBOL(ndisc_send_skb); + +/* + * Send a Neighbour Discover packet + */ +static void __ndisc_send(struct net_device *dev, + struct neighbour *neigh, + const struct in6_addr *daddr, + const struct in6_addr *saddr, + struct icmp6hdr *icmp6h, const struct in6_addr *target, + int llinfo) +{ + struct sk_buff *skb; + + skb = ndisc_build_skb(dev, daddr, saddr, icmp6h, target, llinfo); + if (!skb) + return; + + ndisc_send_skb(skb, dev, neigh, daddr, saddr, icmp6h); +} + +static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, + const struct in6_addr *daddr, + const struct in6_addr *solicited_addr, + int router, int solicited, int override, int inc_opt) +{ + struct in6_addr tmpaddr; + struct inet6_ifaddr *ifp; + const struct in6_addr *src_addr; + struct icmp6hdr icmp6h = { + .icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT, + }; + + /* for anycast or proxy, solicited_addr != src_addr */ + ifp = ipv6_get_ifaddr(dev_net(dev), solicited_addr, dev, 1); + if (ifp) { + src_addr = solicited_addr; + if (ifp->flags & IFA_F_OPTIMISTIC) + override = 0; + inc_opt |= ifp->idev->cnf.force_tllao; + in6_ifa_put(ifp); + } else { + if (ipv6_dev_get_saddr(dev_net(dev), dev, daddr, + inet6_sk(dev_net(dev)->ipv6.ndisc_sk)->srcprefs, + &tmpaddr)) + return; + src_addr = &tmpaddr; + } + + icmp6h.icmp6_router = router; + icmp6h.icmp6_solicited = solicited; + icmp6h.icmp6_override = override; + + __ndisc_send(dev, neigh, daddr, src_addr, + &icmp6h, solicited_addr, + inc_opt ? 
ND_OPT_TARGET_LL_ADDR : 0); +} + +static void ndisc_send_unsol_na(struct net_device *dev) +{ + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + struct in6_addr mcaddr; + + idev = in6_dev_get(dev); + if (!idev) + return; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifa, &idev->addr_list, if_list) { + addrconf_addr_solict_mult(&ifa->addr, &mcaddr); + ndisc_send_na(dev, NULL, &mcaddr, &ifa->addr, + /*router=*/ !!idev->cnf.forwarding, + /*solicited=*/ false, /*override=*/ true, + /*inc_opt=*/ true); + } + read_unlock_bh(&idev->lock); + + in6_dev_put(idev); +} + +void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, + const struct in6_addr *solicit, + const struct in6_addr *daddr, const struct in6_addr *saddr) +{ + struct in6_addr addr_buf; + struct icmp6hdr icmp6h = { + .icmp6_type = NDISC_NEIGHBOUR_SOLICITATION, + }; + + if (saddr == NULL) { + if (ipv6_get_lladdr(dev, &addr_buf, + (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC))) + return; + saddr = &addr_buf; + } + + __ndisc_send(dev, neigh, daddr, saddr, + &icmp6h, solicit, + !ipv6_addr_any(saddr) ? ND_OPT_SOURCE_LL_ADDR : 0); +} + +void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, + const struct in6_addr *daddr) +{ + struct icmp6hdr icmp6h = { + .icmp6_type = NDISC_ROUTER_SOLICITATION, + }; + int send_sllao = dev->addr_len; + +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + /* + * According to section 2.2 of RFC 4429, we must not + * send router solicitations with a sllao from + * optimistic addresses, but we may send the solicitation + * if we don't include the sllao. So here we check + * if our address is optimistic, and if so, we + * suppress the inclusion of the sllao. + */ + if (send_sllao) { + struct inet6_ifaddr *ifp = ipv6_get_ifaddr(dev_net(dev), saddr, + dev, 1); + if (ifp) { + if (ifp->flags & IFA_F_OPTIMISTIC) { + send_sllao = 0; + } + in6_ifa_put(ifp); + } else { + send_sllao = 0; + } + } +#endif + __ndisc_send(dev, NULL, daddr, saddr, + &icmp6h, NULL, + send_sllao ? 
ND_OPT_SOURCE_LL_ADDR : 0); +} + + +static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb) +{ + /* + * "The sender MUST return an ICMP + * destination unreachable" + */ + dst_link_failure(skb); + kfree_skb(skb); +} + +/* Called with locked neigh: either read or both */ + +static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) +{ + struct in6_addr *saddr = NULL; + struct in6_addr mcaddr; + struct net_device *dev = neigh->dev; + struct in6_addr *target = (struct in6_addr *)&neigh->primary_key; + int probes = atomic_read(&neigh->probes); + + if (skb && ipv6_chk_addr(dev_net(dev), &ipv6_hdr(skb)->saddr, dev, 1)) + saddr = &ipv6_hdr(skb)->saddr; + + if ((probes -= neigh->parms->ucast_probes) < 0) { + if (!(neigh->nud_state & NUD_VALID)) { + ND_PRINTK1(KERN_DEBUG "%s(): trying to ucast probe in NUD_INVALID: %pI6\n", + __func__, target); + } + ndisc_send_ns(dev, neigh, target, target, saddr); + } else if ((probes -= neigh->parms->app_probes) < 0) { +#ifdef CONFIG_ARPD + neigh_app_ns(neigh); +#endif + } else { + addrconf_addr_solict_mult(target, &mcaddr); + ndisc_send_ns(dev, NULL, target, &mcaddr, saddr); + } +} + +static int pndisc_is_router(const void *pkey, + struct net_device *dev) +{ + struct pneigh_entry *n; + int ret = -1; + + read_lock_bh(&nd_tbl.lock); + n = __pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev); + if (n) + ret = !!(n->flags & NTF_ROUTER); + read_unlock_bh(&nd_tbl.lock); + + return ret; +} + +static void ndisc_recv_ns(struct sk_buff *skb) +{ + struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); + const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; + const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; + u8 *lladdr = NULL; + u32 ndoptlen = skb->tail - (skb->transport_header + + offsetof(struct nd_msg, opt)); + struct ndisc_options ndopts; + struct net_device *dev = skb->dev; + struct inet6_ifaddr *ifp; + struct inet6_dev *idev = NULL; + struct neighbour *neigh; + int dad = ipv6_addr_any(saddr); + int inc; + int is_router = -1; + + if (ipv6_addr_is_multicast(&msg->target)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: multicast target address"); + return; + } + + /* + * RFC2461 7.1.1: + * DAD has to be destined for solicited node multicast address. + */ + if (dad && + !(daddr->s6_addr32[0] == htonl(0xff020000) && + daddr->s6_addr32[1] == htonl(0x00000000) && + daddr->s6_addr32[2] == htonl(0x00000001) && + daddr->s6_addr [12] == 0xff )) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: bad DAD packet (wrong destination)\n"); + return; + } + + if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: invalid ND options\n"); + return; + } + + if (ndopts.nd_opts_src_lladdr) { + lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev); + if (!lladdr) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: invalid link-layer address length\n"); + return; + } + + /* RFC2461 7.1.1: + * If the IP source address is the unspecified address, + * there MUST NOT be source link-layer address option + * in the message. 
+ */ + if (dad) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: bad DAD packet (link-layer address option)\n"); + return; + } + } + + inc = ipv6_addr_is_multicast(daddr); + + ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); + if (ifp) { + + if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) { + if (dad) { + if (dev->type == ARPHRD_IEEE802_TR) { + const unsigned char *sadr; + sadr = skb_mac_header(skb); + if (((sadr[8] ^ dev->dev_addr[0]) & 0x7f) == 0 && + sadr[9] == dev->dev_addr[1] && + sadr[10] == dev->dev_addr[2] && + sadr[11] == dev->dev_addr[3] && + sadr[12] == dev->dev_addr[4] && + sadr[13] == dev->dev_addr[5]) { + /* looped-back to us */ + goto out; + } + } + + /* + * We are colliding with another node + * who is doing DAD + * so fail our DAD process + */ + addrconf_dad_failure(ifp); + return; + } else { + /* + * This is not a dad solicitation. + * If we are an optimistic node, + * we should respond. + * Otherwise, we should ignore it. + */ + if (!(ifp->flags & IFA_F_OPTIMISTIC)) + goto out; + } + } + + idev = ifp->idev; + } else { + struct net *net = dev_net(dev); + + idev = in6_dev_get(dev); + if (!idev) { + /* XXX: count this drop? */ + return; + } + + if (ipv6_chk_acast_addr(net, dev, &msg->target) || + (idev->cnf.forwarding && + (net->ipv6.devconf_all->proxy_ndp || idev->cnf.proxy_ndp) && + (is_router = pndisc_is_router(&msg->target, dev)) >= 0)) { + if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) && + skb->pkt_type != PACKET_HOST && + inc != 0 && + idev->nd_parms->proxy_delay != 0) { + /* + * for anycast or proxy, + * sender should delay its response + * by a random time between 0 and + * MAX_ANYCAST_DELAY_TIME seconds. + * (RFC2461) -- yoshfuji + */ + struct sk_buff *n = skb_clone(skb, GFP_ATOMIC); + if (n) + pneigh_enqueue(&nd_tbl, idev->nd_parms, n); + goto out; + } + } else + goto out; + } + + if (is_router < 0) + is_router = !!idev->cnf.forwarding; + + if (dad) { + ndisc_send_na(dev, NULL, &in6addr_linklocal_allnodes, &msg->target, + is_router, 0, (ifp != NULL), 1); + goto out; + } + + if (inc) + NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_mcast); + else + NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_ucast); + + /* + * update / create cache entry + * for the source address + */ + neigh = __neigh_lookup(&nd_tbl, saddr, dev, + !inc || lladdr || !dev->addr_len); + if (neigh) + neigh_update(neigh, lladdr, NUD_STALE, + NEIGH_UPDATE_F_WEAK_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE); + if (neigh || !dev->header_ops) { + ndisc_send_na(dev, neigh, saddr, &msg->target, + is_router, + 1, (ifp != NULL && inc), inc); + if (neigh) + neigh_release(neigh); + } + +out: + if (ifp) + in6_ifa_put(ifp); + else + in6_dev_put(idev); +} + +static void ndisc_recv_na(struct sk_buff *skb) +{ + struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); + const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; + const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; + u8 *lladdr = NULL; + u32 ndoptlen = skb->tail - (skb->transport_header + + offsetof(struct nd_msg, opt)); + struct ndisc_options ndopts; + struct net_device *dev = skb->dev; + struct inet6_ifaddr *ifp; + struct neighbour *neigh; + + if (skb->len < sizeof(struct nd_msg)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NA: packet too short\n"); + return; + } + + if (ipv6_addr_is_multicast(&msg->target)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NA: target address is multicast.\n"); + return; + } + + if (ipv6_addr_is_multicast(daddr) && + msg->icmph.icmp6_solicited) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NA: solicited NA is multicasted.\n"); + return; + } + + 
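/*
 * Editor's illustrative sketch, not part of the patch above: the
 * solicited-node multicast mapping that the DAD test in
 * ndisc_recv_ns() checks for (ff02::1:ffXX:XXXX) and that
 * addrconf_addr_solict_mult() computes inside the kernel.  The group
 * address is the fixed ff02::1:ff00:0/104 prefix followed by the low
 * 24 bits of the unicast target.  Userspace socket headers replace the
 * kernel in6_addr helpers here, and the sample address is made up.
 */
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

static void solicited_node_mcast(const struct in6_addr *unicast,
                                 struct in6_addr *mcast)
{
        memset(mcast, 0, sizeof(*mcast));
        mcast->s6_addr[0]  = 0xff;      /* ff02::1:ff00:0/104 prefix */
        mcast->s6_addr[1]  = 0x02;
        mcast->s6_addr[11] = 0x01;
        mcast->s6_addr[12] = 0xff;
        /* append the low 24 bits of the unicast address */
        memcpy(&mcast->s6_addr[13], &unicast->s6_addr[13], 3);
}

int main(void)
{
        struct in6_addr uni, mc;
        char buf[INET6_ADDRSTRLEN];

        inet_pton(AF_INET6, "2001:db8::a1b2:c3d4", &uni);
        solicited_node_mcast(&uni, &mc);
        printf("solicited-node group: %s\n",
               inet_ntop(AF_INET6, &mc, buf, sizeof(buf)));
        return 0;
}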
if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: invalid ND option\n"); + return; + } + if (ndopts.nd_opts_tgt_lladdr) { + lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev); + if (!lladdr) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NA: invalid link-layer address length\n"); + return; + } + } + ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); + if (ifp) { + if (skb->pkt_type != PACKET_LOOPBACK + && (ifp->flags & IFA_F_TENTATIVE)) { + addrconf_dad_failure(ifp); + return; + } + /* What should we make now? The advertisement + is invalid, but ndisc specs say nothing + about it. It could be misconfiguration, or + an smart proxy agent tries to help us :-) + + We should not print the error if NA has been + received from loopback - it is just our own + unsolicited advertisement. + */ + if (skb->pkt_type != PACKET_LOOPBACK) + ND_PRINTK1(KERN_WARNING + "ICMPv6 NA: someone advertises our address %pI6 on %s!\n", + &ifp->addr, ifp->idev->dev->name); + in6_ifa_put(ifp); + return; + } + neigh = neigh_lookup(&nd_tbl, &msg->target, dev); + + if (neigh) { + u8 old_flags = neigh->flags; + struct net *net = dev_net(dev); + + if (neigh->nud_state & NUD_FAILED) + goto out; + + /* + * Don't update the neighbor cache entry on a proxy NA from + * ourselves because either the proxied node is off link or it + * has already sent a NA to us. + */ + if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) && + net->ipv6.devconf_all->forwarding && net->ipv6.devconf_all->proxy_ndp && + pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) { + /* XXX: idev->cnf.prixy_ndp */ + goto out; + } + + neigh_update(neigh, lladdr, + msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE, + NEIGH_UPDATE_F_WEAK_OVERRIDE| + (msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)| + NEIGH_UPDATE_F_OVERRIDE_ISROUTER| + (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0)); + + if ((old_flags & ~neigh->flags) & NTF_ROUTER) { + /* + * Change: router to host + */ + struct rt6_info *rt; + rt = rt6_get_dflt_router(saddr, dev); + if (rt) + ip6_del_rt(rt); + } + +out: + neigh_release(neigh); + } +} + +static void ndisc_recv_rs(struct sk_buff *skb) +{ + struct rs_msg *rs_msg = (struct rs_msg *)skb_transport_header(skb); + unsigned long ndoptlen = skb->len - sizeof(*rs_msg); + struct neighbour *neigh; + struct inet6_dev *idev; + const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; + struct ndisc_options ndopts; + u8 *lladdr = NULL; + + if (skb->len < sizeof(*rs_msg)) + return; + + idev = in6_dev_get(skb->dev); + if (!idev) { + if (net_ratelimit()) + ND_PRINTK1("ICMP6 RS: can't find in6 device\n"); + return; + } + + /* Don't accept RS if we're not in router mode */ + if (!idev->cnf.forwarding) + goto out; + + /* + * Don't update NCE if src = ::; + * this implies that the source node has no ip address assigned yet. 
+ */ + if (ipv6_addr_any(saddr)) + goto out; + + /* Parse ND options */ + if (!ndisc_parse_options(rs_msg->opt, ndoptlen, &ndopts)) { + if (net_ratelimit()) + ND_PRINTK2("ICMP6 NS: invalid ND option, ignored\n"); + goto out; + } + + if (ndopts.nd_opts_src_lladdr) { + lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, + skb->dev); + if (!lladdr) + goto out; + } + + neigh = __neigh_lookup(&nd_tbl, saddr, skb->dev, 1); + if (neigh) { + neigh_update(neigh, lladdr, NUD_STALE, + NEIGH_UPDATE_F_WEAK_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE_ISROUTER); + neigh_release(neigh); + } +out: + in6_dev_put(idev); +} + +static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt) +{ + struct icmp6hdr *icmp6h = (struct icmp6hdr *)skb_transport_header(ra); + struct sk_buff *skb; + struct nlmsghdr *nlh; + struct nduseroptmsg *ndmsg; + struct net *net = dev_net(ra->dev); + int err; + int base_size = NLMSG_ALIGN(sizeof(struct nduseroptmsg) + + (opt->nd_opt_len << 3)); + size_t msg_size = base_size + nla_total_size(sizeof(struct in6_addr)); + + skb = nlmsg_new(msg_size, GFP_ATOMIC); + if (skb == NULL) { + err = -ENOBUFS; + goto errout; + } + + nlh = nlmsg_put(skb, 0, 0, RTM_NEWNDUSEROPT, base_size, 0); + if (nlh == NULL) { + goto nla_put_failure; + } + + ndmsg = nlmsg_data(nlh); + ndmsg->nduseropt_family = AF_INET6; + ndmsg->nduseropt_ifindex = ra->dev->ifindex; + ndmsg->nduseropt_icmp_type = icmp6h->icmp6_type; + ndmsg->nduseropt_icmp_code = icmp6h->icmp6_code; + ndmsg->nduseropt_opts_len = opt->nd_opt_len << 3; + + memcpy(ndmsg + 1, opt, opt->nd_opt_len << 3); + + NLA_PUT(skb, NDUSEROPT_SRCADDR, sizeof(struct in6_addr), + &ipv6_hdr(ra)->saddr); + nlmsg_end(skb, nlh); + + rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, GFP_ATOMIC); + return; + +nla_put_failure: + nlmsg_free(skb); + err = -EMSGSIZE; +errout: + rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err); +} + +static inline int accept_ra(struct inet6_dev *in6_dev) +{ + /* + * If forwarding is enabled, RA are not accepted unless the special + * hybrid mode (accept_ra=2) is enabled. 
+ */ + if (in6_dev->cnf.forwarding && in6_dev->cnf.accept_ra < 2) + return 0; + + return in6_dev->cnf.accept_ra; +} + +static void ndisc_router_discovery(struct sk_buff *skb) +{ + struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb); + struct neighbour *neigh = NULL; + struct inet6_dev *in6_dev; + struct rt6_info *rt = NULL; + int lifetime; + struct ndisc_options ndopts; + int optlen; + unsigned int pref = 0; + + __u8 * opt = (__u8 *)(ra_msg + 1); + + optlen = (skb->tail - skb->transport_header) - sizeof(struct ra_msg); + + if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 RA: source address is not link-local.\n"); + return; + } + if (optlen < 0) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 RA: packet too short\n"); + return; + } + +#ifdef CONFIG_IPV6_NDISC_NODETYPE + if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 RA: from host or unauthorized router\n"); + return; + } +#endif + + /* + * set the RA_RECV flag in the interface + */ + + in6_dev = in6_dev_get(skb->dev); + if (in6_dev == NULL) { + ND_PRINTK0(KERN_ERR + "ICMPv6 RA: can't find inet6 device for %s.\n", + skb->dev->name); + return; + } + + if (!ndisc_parse_options(opt, optlen, &ndopts)) { + in6_dev_put(in6_dev); + ND_PRINTK2(KERN_WARNING + "ICMP6 RA: invalid ND options\n"); + return; + } + + if (!accept_ra(in6_dev)) + goto skip_linkparms; + +#ifdef CONFIG_IPV6_NDISC_NODETYPE + /* skip link-specific parameters from interior routers */ + if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) + goto skip_linkparms; +#endif + + if (in6_dev->if_flags & IF_RS_SENT) { + /* + * flag that an RA was received after an RS was sent + * out on this interface. + */ + in6_dev->if_flags |= IF_RA_RCVD; + } + + /* + * Remember the managed/otherconf flags from most recently + * received RA message (RFC 2462) -- yoshfuji + */ + in6_dev->if_flags = (in6_dev->if_flags & ~(IF_RA_MANAGED | + IF_RA_OTHERCONF)) | + (ra_msg->icmph.icmp6_addrconf_managed ? + IF_RA_MANAGED : 0) | + (ra_msg->icmph.icmp6_addrconf_other ? 
+ IF_RA_OTHERCONF : 0); + + if (!in6_dev->cnf.accept_ra_defrtr) + goto skip_defrtr; + + lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); + +#ifdef CONFIG_IPV6_ROUTER_PREF + pref = ra_msg->icmph.icmp6_router_pref; + /* 10b is handled as if it were 00b (medium) */ + if (pref == ICMPV6_ROUTER_PREF_INVALID || + !in6_dev->cnf.accept_ra_rtr_pref) + pref = ICMPV6_ROUTER_PREF_MEDIUM; +#endif + + rt = rt6_get_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev); + + if (rt) + neigh = dst_get_neighbour(&rt->dst); + + if (rt && lifetime == 0) { + neigh_clone(neigh); + ip6_del_rt(rt); + rt = NULL; + } + + if (rt == NULL && lifetime) { + ND_PRINTK3(KERN_DEBUG + "ICMPv6 RA: adding default router.\n"); + + rt = rt6_add_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev, pref); + if (rt == NULL) { + ND_PRINTK0(KERN_ERR + "ICMPv6 RA: %s() failed to add default route.\n", + __func__); + in6_dev_put(in6_dev); + return; + } + + neigh = dst_get_neighbour(&rt->dst); + if (neigh == NULL) { + ND_PRINTK0(KERN_ERR + "ICMPv6 RA: %s() got default router without neighbour.\n", + __func__); + dst_release(&rt->dst); + in6_dev_put(in6_dev); + return; + } + neigh->flags |= NTF_ROUTER; + } else if (rt) { + rt->rt6i_flags = (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); + } + + if (rt) + rt->rt6i_expires = jiffies + (HZ * lifetime); + + if (ra_msg->icmph.icmp6_hop_limit) { + in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; + if (rt) + dst_metric_set(&rt->dst, RTAX_HOPLIMIT, + ra_msg->icmph.icmp6_hop_limit); + } + +skip_defrtr: + + /* + * Update Reachable Time and Retrans Timer + */ + + if (in6_dev->nd_parms) { + unsigned long rtime = ntohl(ra_msg->retrans_timer); + + if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/HZ) { + rtime = (rtime*HZ)/1000; + if (rtime < HZ/10) + rtime = HZ/10; + in6_dev->nd_parms->retrans_time = rtime; + in6_dev->tstamp = jiffies; + inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); + } + + rtime = ntohl(ra_msg->reachable_time); + if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/(3*HZ)) { + rtime = (rtime*HZ)/1000; + + if (rtime < HZ/10) + rtime = HZ/10; + + if (rtime != in6_dev->nd_parms->base_reachable_time) { + in6_dev->nd_parms->base_reachable_time = rtime; + in6_dev->nd_parms->gc_staletime = 3 * rtime; + in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime); + in6_dev->tstamp = jiffies; + inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); + } + } + } + +skip_linkparms: + + /* + * Process options. 
+ */ + + if (!neigh) + neigh = __neigh_lookup(&nd_tbl, &ipv6_hdr(skb)->saddr, + skb->dev, 1); + if (neigh) { + u8 *lladdr = NULL; + if (ndopts.nd_opts_src_lladdr) { + lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, + skb->dev); + if (!lladdr) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 RA: invalid link-layer address length\n"); + goto out; + } + } + neigh_update(neigh, lladdr, NUD_STALE, + NEIGH_UPDATE_F_WEAK_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE_ISROUTER| + NEIGH_UPDATE_F_ISROUTER); + } + + if (!accept_ra(in6_dev)) + goto out; + +#ifdef CONFIG_IPV6_ROUTE_INFO + if (in6_dev->cnf.accept_ra_rtr_pref && ndopts.nd_opts_ri) { + struct nd_opt_hdr *p; + for (p = ndopts.nd_opts_ri; + p; + p = ndisc_next_option(p, ndopts.nd_opts_ri_end)) { + struct route_info *ri = (struct route_info *)p; +#ifdef CONFIG_IPV6_NDISC_NODETYPE + if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT && + ri->prefix_len == 0) + continue; +#endif + if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen) + continue; + rt6_route_rcv(skb->dev, (u8*)p, (p->nd_opt_len) << 3, + &ipv6_hdr(skb)->saddr); + } + } +#endif + +#ifdef CONFIG_IPV6_NDISC_NODETYPE + /* skip link-specific ndopts from interior routers */ + if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) + goto out; +#endif + + if (in6_dev->cnf.accept_ra_pinfo && ndopts.nd_opts_pi) { + struct nd_opt_hdr *p; + for (p = ndopts.nd_opts_pi; + p; + p = ndisc_next_option(p, ndopts.nd_opts_pi_end)) { + addrconf_prefix_rcv(skb->dev, (u8*)p, (p->nd_opt_len) << 3); + } + } + + if (ndopts.nd_opts_mtu) { + __be32 n; + u32 mtu; + + memcpy(&n, ((u8*)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu)); + mtu = ntohl(n); + + if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 RA: invalid mtu: %d\n", + mtu); + } else if (in6_dev->cnf.mtu6 != mtu) { + in6_dev->cnf.mtu6 = mtu; + + if (rt) + dst_metric_set(&rt->dst, RTAX_MTU, mtu); + + rt6_mtu_change(skb->dev, mtu); + } + } + + if (ndopts.nd_useropts) { + struct nd_opt_hdr *p; + for (p = ndopts.nd_useropts; + p; + p = ndisc_next_useropt(p, ndopts.nd_useropts_end)) { + ndisc_ra_useropt(skb, p); + } + } + + if (ndopts.nd_opts_tgt_lladdr || ndopts.nd_opts_rh) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 RA: invalid RA options"); + } +out: + if (rt) + dst_release(&rt->dst); + else if (neigh) + neigh_release(neigh); + in6_dev_put(in6_dev); +} + +static void ndisc_redirect_rcv(struct sk_buff *skb) +{ + struct inet6_dev *in6_dev; + struct icmp6hdr *icmph; + const struct in6_addr *dest; + const struct in6_addr *target; /* new first hop to destination */ + struct neighbour *neigh; + int on_link = 0; + struct ndisc_options ndopts; + int optlen; + u8 *lladdr = NULL; + +#ifdef CONFIG_IPV6_NDISC_NODETYPE + switch (skb->ndisc_nodetype) { + case NDISC_NODETYPE_HOST: + case NDISC_NODETYPE_NODEFAULT: + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: from host or unauthorized router\n"); + return; + } +#endif + + if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: source address is not link-local.\n"); + return; + } + + optlen = skb->tail - skb->transport_header; + optlen -= sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr); + + if (optlen < 0) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: packet too short\n"); + return; + } + + icmph = icmp6_hdr(skb); + target = (const struct in6_addr *) (icmph + 1); + dest = target + 1; + + if (ipv6_addr_is_multicast(dest)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: destination address is multicast.\n"); + 
return; + } + + if (ipv6_addr_equal(dest, target)) { + on_link = 1; + } else if (ipv6_addr_type(target) != + (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: target address is not link-local unicast.\n"); + return; + } + + in6_dev = in6_dev_get(skb->dev); + if (!in6_dev) + return; + if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) { + in6_dev_put(in6_dev); + return; + } + + /* RFC2461 8.1: + * The IP source address of the Redirect MUST be the same as the current + * first-hop router for the specified ICMP Destination Address. + */ + + if (!ndisc_parse_options((u8*)(dest + 1), optlen, &ndopts)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: invalid ND options\n"); + in6_dev_put(in6_dev); + return; + } + if (ndopts.nd_opts_tgt_lladdr) { + lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, + skb->dev); + if (!lladdr) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: invalid link-layer address length\n"); + in6_dev_put(in6_dev); + return; + } + } + + neigh = __neigh_lookup(&nd_tbl, target, skb->dev, 1); + if (neigh) { + rt6_redirect(dest, &ipv6_hdr(skb)->daddr, + &ipv6_hdr(skb)->saddr, neigh, lladdr, + on_link); + neigh_release(neigh); + } + in6_dev_put(in6_dev); +} + +void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, + const struct in6_addr *target) +{ + struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); + struct sock *sk = net->ipv6.ndisc_sk; + int len = sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr); + struct sk_buff *buff; + struct icmp6hdr *icmph; + struct in6_addr saddr_buf; + struct in6_addr *addrp; + struct rt6_info *rt; + struct dst_entry *dst; + struct inet6_dev *idev; + struct flowi6 fl6; + u8 *opt; + int rd_len; + int err; + u8 ha_buf[MAX_ADDR_LEN], *ha = NULL; + + if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: no link-local address on %s\n", + dev->name); + return; + } + + if (!ipv6_addr_equal(&ipv6_hdr(skb)->daddr, target) && + ipv6_addr_type(target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: target address is not link-local unicast.\n"); + return; + } + + icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT, + &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex); + + dst = ip6_route_output(net, NULL, &fl6); + if (dst == NULL) + return; + + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) + return; + + rt = (struct rt6_info *) dst; + + if (rt->rt6i_flags & RTF_GATEWAY) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: destination is not a neighbour.\n"); + goto release; + } + if (!rt->rt6i_peer) + rt6_bind_peer(rt, 1); + if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ)) + goto release; + + if (dev->addr_len) { + read_lock_bh(&neigh->lock); + if (neigh->nud_state & NUD_VALID) { + memcpy(ha_buf, neigh->ha, dev->addr_len); + read_unlock_bh(&neigh->lock); + ha = ha_buf; + len += ndisc_opt_addr_space(dev); + } else + read_unlock_bh(&neigh->lock); + } + + rd_len = min_t(unsigned int, + IPV6_MIN_MTU-sizeof(struct ipv6hdr)-len, skb->len + 8); + rd_len &= ~0x7; + len += rd_len; + + buff = sock_alloc_send_skb(sk, + (MAX_HEADER + sizeof(struct ipv6hdr) + + len + LL_ALLOCATED_SPACE(dev)), + 1, &err); + if (buff == NULL) { + ND_PRINTK0(KERN_ERR + "ICMPv6 Redirect: %s() failed to allocate an skb, err=%d.\n", + __func__, err); + goto release; + } + + skb_reserve(buff, LL_RESERVED_SPACE(dev)); + ip6_nd_hdr(sk, buff, dev, &saddr_buf, &ipv6_hdr(skb)->saddr, + IPPROTO_ICMPV6, len); 
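+ /* Editor's note (descriptive comment, not in the original patch): the IPv6 header has been written by ip6_nd_hdr() above; the code that follows fills in the ICMPv6 redirect body: the target and destination addresses, an optional target link-layer address option, and as much of the offending packet as fits into the redirected-header option. */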
+ + skb_set_transport_header(buff, skb_tail_pointer(buff) - buff->data); + skb_put(buff, len); + icmph = icmp6_hdr(buff); + + memset(icmph, 0, sizeof(struct icmp6hdr)); + icmph->icmp6_type = NDISC_REDIRECT; + + /* + * copy target and destination addresses + */ + + addrp = (struct in6_addr *)(icmph + 1); + ipv6_addr_copy(addrp, target); + addrp++; + ipv6_addr_copy(addrp, &ipv6_hdr(skb)->daddr); + + opt = (u8*) (addrp + 1); + + /* + * include target_address option + */ + + if (ha) + opt = ndisc_fill_addr_option(opt, ND_OPT_TARGET_LL_ADDR, ha, + dev->addr_len, dev->type); + + /* + * build redirect option and copy skb over to the new packet. + */ + + memset(opt, 0, 8); + *(opt++) = ND_OPT_REDIRECT_HDR; + *(opt++) = (rd_len >> 3); + opt += 6; + + memcpy(opt, ipv6_hdr(skb), rd_len - 8); + + icmph->icmp6_cksum = csum_ipv6_magic(&saddr_buf, &ipv6_hdr(skb)->saddr, + len, IPPROTO_ICMPV6, + csum_partial(icmph, len, 0)); + + skb_dst_set(buff, dst); + idev = in6_dev_get(dst->dev); + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, buff, NULL, dst->dev, + dst_output); + if (!err) { + ICMP6MSGOUT_INC_STATS(net, idev, NDISC_REDIRECT); + ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); + } + + if (likely(idev != NULL)) + in6_dev_put(idev); + return; + +release: + dst_release(dst); +} + +static void pndisc_redo(struct sk_buff *skb) +{ + ndisc_recv_ns(skb); + kfree_skb(skb); +} + +int ndisc_rcv(struct sk_buff *skb) +{ + struct nd_msg *msg; + + if (!pskb_may_pull(skb, skb->len)) + return 0; + + msg = (struct nd_msg *)skb_transport_header(skb); + + __skb_push(skb, skb->data - skb_transport_header(skb)); + + if (ipv6_hdr(skb)->hop_limit != 255) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NDISC: invalid hop-limit: %d\n", + ipv6_hdr(skb)->hop_limit); + return 0; + } + + if (msg->icmph.icmp6_code != 0) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NDISC: invalid ICMPv6 code: %d\n", + msg->icmph.icmp6_code); + return 0; + } + + memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); + + switch (msg->icmph.icmp6_type) { + case NDISC_NEIGHBOUR_SOLICITATION: + ndisc_recv_ns(skb); + break; + + case NDISC_NEIGHBOUR_ADVERTISEMENT: + ndisc_recv_na(skb); + break; + + case NDISC_ROUTER_SOLICITATION: + ndisc_recv_rs(skb); + break; + + case NDISC_ROUTER_ADVERTISEMENT: + ndisc_router_discovery(skb); + break; + + case NDISC_REDIRECT: + ndisc_redirect_rcv(skb); + break; + } + + return 0; +} + +static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct net *net = dev_net(dev); + + switch (event) { + case NETDEV_CHANGEADDR: + neigh_changeaddr(&nd_tbl, dev); + fib6_run_gc(~0UL, net); + break; + case NETDEV_DOWN: + neigh_ifdown(&nd_tbl, dev); + fib6_run_gc(~0UL, net); + break; + case NETDEV_NOTIFY_PEERS: + ndisc_send_unsol_na(dev); + break; + default: + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block ndisc_netdev_notifier = { + .notifier_call = ndisc_netdev_event, +}; + +#ifdef CONFIG_SYSCTL +static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, + const char *func, const char *dev_name) +{ + static char warncomm[TASK_COMM_LEN]; + static int warned; + if (strcmp(warncomm, current->comm) && warned < 5) { + strcpy(warncomm, current->comm); + printk(KERN_WARNING + "process `%s' is using deprecated sysctl (%s) " + "net.ipv6.neigh.%s.%s; " + "Use net.ipv6.neigh.%s.%s_ms " + "instead.\n", + warncomm, func, + dev_name, ctl->procname, + dev_name, ctl->procname); + warned++; + } +} + +int 
ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct net_device *dev = ctl->extra1; + struct inet6_dev *idev; + int ret; + + if ((strcmp(ctl->procname, "retrans_time") == 0) || + (strcmp(ctl->procname, "base_reachable_time") == 0)) + ndisc_warn_deprecated_sysctl(ctl, "syscall", dev ? dev->name : "default"); + + if (strcmp(ctl->procname, "retrans_time") == 0) + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + + else if (strcmp(ctl->procname, "base_reachable_time") == 0) + ret = proc_dointvec_jiffies(ctl, write, + buffer, lenp, ppos); + + else if ((strcmp(ctl->procname, "retrans_time_ms") == 0) || + (strcmp(ctl->procname, "base_reachable_time_ms") == 0)) + ret = proc_dointvec_ms_jiffies(ctl, write, + buffer, lenp, ppos); + else + ret = -1; + + if (write && ret == 0 && dev && (idev = in6_dev_get(dev)) != NULL) { + if (ctl->data == &idev->nd_parms->base_reachable_time) + idev->nd_parms->reachable_time = neigh_rand_reach_time(idev->nd_parms->base_reachable_time); + idev->tstamp = jiffies; + inet6_ifinfo_notify(RTM_NEWLINK, idev); + in6_dev_put(idev); + } + return ret; +} + + +#endif + +static int __net_init ndisc_net_init(struct net *net) +{ + struct ipv6_pinfo *np; + struct sock *sk; + int err; + + err = inet_ctl_sock_create(&sk, PF_INET6, + SOCK_RAW, IPPROTO_ICMPV6, net); + if (err < 0) { + ND_PRINTK0(KERN_ERR + "ICMPv6 NDISC: Failed to initialize the control socket (err %d).\n", + err); + return err; + } + + net->ipv6.ndisc_sk = sk; + + np = inet6_sk(sk); + np->hop_limit = 255; + /* Do not loopback ndisc messages */ + np->mc_loop = 0; + + return 0; +} + +static void __net_exit ndisc_net_exit(struct net *net) +{ + inet_ctl_sock_destroy(net->ipv6.ndisc_sk); +} + +static struct pernet_operations ndisc_net_ops = { + .init = ndisc_net_init, + .exit = ndisc_net_exit, +}; + +int __init ndisc_init(void) +{ + int err; + + err = register_pernet_subsys(&ndisc_net_ops); + if (err) + return err; + /* + * Initialize the neighbour table + */ + neigh_table_init(&nd_tbl); + +#ifdef CONFIG_SYSCTL + err = neigh_sysctl_register(NULL, &nd_tbl.parms, "ipv6", + &ndisc_ifinfo_sysctl_change); + if (err) + goto out_unregister_pernet; +#endif + err = register_netdevice_notifier(&ndisc_netdev_notifier); + if (err) + goto out_unregister_sysctl; +out: + return err; + +out_unregister_sysctl: +#ifdef CONFIG_SYSCTL + neigh_sysctl_unregister(&nd_tbl.parms); +out_unregister_pernet: +#endif + unregister_pernet_subsys(&ndisc_net_ops); + goto out; +} + +void ndisc_cleanup(void) +{ + unregister_netdevice_notifier(&ndisc_netdev_notifier); +#ifdef CONFIG_SYSCTL + neigh_sysctl_unregister(&nd_tbl.parms); +#endif + neigh_table_clear(&nd_tbl); + unregister_pernet_subsys(&ndisc_net_ops); +} diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c new file mode 100644 index 00000000..30fcee46 --- /dev/null +++ b/net/ipv6/netfilter.c @@ -0,0 +1,188 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int ip6_route_me_harder(struct sk_buff *skb) +{ + struct net *net = dev_net(skb_dst(skb)->dev); + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct dst_entry *dst; + struct flowi6 fl6 = { + .flowi6_oif = skb->sk ? 
skb->sk->sk_bound_dev_if : 0, + .flowi6_mark = skb->mark, + .daddr = iph->daddr, + .saddr = iph->saddr, + }; + + dst = ip6_route_output(net, skb->sk, &fl6); + if (dst->error) { + IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); + LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n"); + dst_release(dst); + return -EINVAL; + } + + /* Drop old route. */ + skb_dst_drop(skb); + + skb_dst_set(skb, dst); + +#ifdef CONFIG_XFRM + if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && + xfrm_decode_session(skb, flowi6_to_flowi(&fl6), AF_INET6) == 0) { + skb_dst_set(skb, NULL); + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), skb->sk, 0); + if (IS_ERR(dst)) + return -1; + skb_dst_set(skb, dst); + } +#endif + + return 0; +} +EXPORT_SYMBOL(ip6_route_me_harder); + +/* + * Extra routing may needed on local out, as the QUEUE target never + * returns control to the table. + */ + +struct ip6_rt_info { + struct in6_addr daddr; + struct in6_addr saddr; + u_int32_t mark; +}; + +static void nf_ip6_saveroute(const struct sk_buff *skb, + struct nf_queue_entry *entry) +{ + struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); + + if (entry->hook == NF_INET_LOCAL_OUT) { + const struct ipv6hdr *iph = ipv6_hdr(skb); + + rt_info->daddr = iph->daddr; + rt_info->saddr = iph->saddr; + rt_info->mark = skb->mark; + } +} + +static int nf_ip6_reroute(struct sk_buff *skb, + const struct nf_queue_entry *entry) +{ + struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); + + if (entry->hook == NF_INET_LOCAL_OUT) { + const struct ipv6hdr *iph = ipv6_hdr(skb); + if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) || + !ipv6_addr_equal(&iph->saddr, &rt_info->saddr) || + skb->mark != rt_info->mark) + return ip6_route_me_harder(skb); + } + return 0; +} + +static int nf_ip6_route(struct net *net, struct dst_entry **dst, + struct flowi *fl, bool strict) +{ + static const struct ipv6_pinfo fake_pinfo; + static const struct inet_sock fake_sk = { + /* makes ip6_route_output set RT6_LOOKUP_F_IFACE: */ + .sk.sk_bound_dev_if = 1, + .pinet6 = (struct ipv6_pinfo *) &fake_pinfo, + }; + const void *sk = strict ? 
&fake_sk : NULL; + + *dst = ip6_route_output(net, sk, &fl->u.ip6); + return (*dst)->error; +} + +__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, u_int8_t protocol) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + __sum16 csum = 0; + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) + break; + if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + skb->len - dataoff, protocol, + csum_sub(skb->csum, + skb_checksum(skb, 0, + dataoff, 0)))) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; + } + /* fall through */ + case CHECKSUM_NONE: + skb->csum = ~csum_unfold( + csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + skb->len - dataoff, + protocol, + csum_sub(0, + skb_checksum(skb, 0, + dataoff, 0)))); + csum = __skb_checksum_complete(skb); + } + return csum; +} +EXPORT_SYMBOL(nf_ip6_checksum); + +static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, unsigned int len, + u_int8_t protocol) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + __wsum hsum; + __sum16 csum = 0; + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (len == skb->len - dataoff) + return nf_ip6_checksum(skb, hook, dataoff, protocol); + /* fall through */ + case CHECKSUM_NONE: + hsum = skb_checksum(skb, 0, dataoff, 0); + skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr, + &ip6h->daddr, + skb->len - dataoff, + protocol, + csum_sub(0, hsum))); + skb->ip_summed = CHECKSUM_NONE; + return __skb_checksum_complete_head(skb, dataoff + len); + } + return csum; +}; + +static const struct nf_afinfo nf_ip6_afinfo = { + .family = AF_INET6, + .checksum = nf_ip6_checksum, + .checksum_partial = nf_ip6_checksum_partial, + .route = nf_ip6_route, + .saveroute = nf_ip6_saveroute, + .reroute = nf_ip6_reroute, + .route_key_size = sizeof(struct ip6_rt_info), +}; + +int __init ipv6_netfilter_init(void) +{ + return nf_register_afinfo(&nf_ip6_afinfo); +} + +/* This can be called from inet6_init() on errors, so it cannot + * be marked __exit. -DaveM + */ +void ipv6_netfilter_fini(void) +{ + nf_unregister_afinfo(&nf_ip6_afinfo); +} diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig new file mode 100644 index 00000000..5bbf5316 --- /dev/null +++ b/net/ipv6/netfilter/Kconfig @@ -0,0 +1,224 @@ +# +# IP netfilter configuration +# + +menu "IPv6: Netfilter Configuration" + depends on INET && IPV6 && NETFILTER + +config NF_DEFRAG_IPV6 + tristate + default n + +config NF_CONNTRACK_IPV6 + tristate "IPv6 connection tracking support" + depends on INET && IPV6 && NF_CONNTRACK + default m if NETFILTER_ADVANCED=n + select NF_DEFRAG_IPV6 + ---help--- + Connection tracking keeps a record of what packets have passed + through your machine, in order to figure out how they are related + into connections. + + This is IPv6 support on Layer 3 independent connection tracking. + Layer 3 independent connection tracking is experimental scheme + which generalize ip_conntrack to support other layer 3 protocols. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_QUEUE + tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)" + depends on INET && IPV6 && NETFILTER + depends on NETFILTER_ADVANCED + ---help--- + + This option adds a queue handler to the kernel for IPv6 + packets which enables users to receive the filtered packets + with QUEUE target using libipq. 
+ + This option enables the old IPv6-only "ip6_queue" implementation + which has been obsoleted by the new "nfnetlink_queue" code (see + CONFIG_NETFILTER_NETLINK_QUEUE). + + (C) Fernando Anton 2001 + IPv64 Project - Work based in IPv64 draft by Arturo Azcorra. + Universidad Carlos III de Madrid + Universidad Politecnica de Alcala de Henares + email: . + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_IPTABLES + tristate "IP6 tables support (required for filtering)" + depends on INET && IPV6 + select NETFILTER_XTABLES + default m if NETFILTER_ADVANCED=n + help + ip6tables is a general, extensible packet identification framework. + Currently only the packet filtering and packet mangling subsystem + for IPv6 use this, but connection tracking is going to follow. + Say 'Y' or 'M' here if you want to use either of those. + + To compile it as a module, choose M here. If unsure, say N. + +if IP6_NF_IPTABLES + +# The simple matches. +config IP6_NF_MATCH_AH + tristate '"ah" match support' + depends on NETFILTER_ADVANCED + help + This module allows one to match AH packets. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_EUI64 + tristate '"eui64" address check' + depends on NETFILTER_ADVANCED + help + This module performs checking on the IPv6 source address + Compares the last 64 bits with the EUI64 (delivered + from the MAC address) address + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_FRAG + tristate '"frag" Fragmentation header match support' + depends on NETFILTER_ADVANCED + help + frag matching allows you to match packets based on the fragmentation + header of the packet. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_OPTS + tristate '"hbh" hop-by-hop and "dst" opts header match support' + depends on NETFILTER_ADVANCED + help + This allows one to match packets based on the hop-by-hop + and destination options headers of a packet. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_HL + tristate '"hl" hoplimit match support' + depends on NETFILTER_ADVANCED + select NETFILTER_XT_MATCH_HL + ---help--- + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_MATCH_HL. + +config IP6_NF_MATCH_IPV6HEADER + tristate '"ipv6header" IPv6 Extension Headers Match' + default m if NETFILTER_ADVANCED=n + help + This module allows one to match packets based upon + the ipv6 extension headers. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_MH + tristate '"mh" match support' + depends on NETFILTER_ADVANCED + help + This module allows one to match MH packets. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_RT + tristate '"rt" Routing header match support' + depends on NETFILTER_ADVANCED + help + rt matching allows you to match packets based on the routing + header of the packet. + + To compile it as a module, choose M here. If unsure, say N. + +# The targets +config IP6_NF_TARGET_HL + tristate '"HL" hoplimit target support' + depends on NETFILTER_ADVANCED && IP6_NF_MANGLE + select NETFILTER_XT_TARGET_HL + ---help--- + This is a backwards-compatible option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_TARGET_HL. 
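[Editor's note -- illustration, not part of the patch.] The match modules selected by the Kconfig entries above (ah, eui64, frag, hbh, hl, ipv6header, mh, rt) are all small xtables extensions that ip6_tables.c further down drives through acpar.match->match(). The following is a minimal sketch of such an extension for orientation only, written against the struct xt_match / xt_action_param API as it appears elsewhere in this patch; the module name "example", struct ip6t_example_info and all identifiers derived from them are hypothetical, and the hop-limit comparison merely imitates what the real "hl" backend does.

/* Illustrative sketch only -- not part of this patch. */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/ipv6.h>
#include <linux/netfilter/x_tables.h>

/* Hypothetical per-rule data, filled in by a matching userspace plugin. */
struct ip6t_example_info {
	__u8 hop_limit;		/* match packets with exactly this hop limit */
};

static bool example_mt6(const struct sk_buff *skb, struct xt_action_param *par)
{
	const struct ip6t_example_info *info = par->matchinfo;

	/* ip6_packet_match() has already checked addresses and interfaces;
	 * this hook only inspects the hop limit of the IPv6 header. */
	return ipv6_hdr(skb)->hop_limit == info->hop_limit;
}

static struct xt_match example_mt6_reg __read_mostly = {
	.name      = "example",
	.revision  = 0,
	.family    = NFPROTO_IPV6,
	.match     = example_mt6,
	.matchsize = sizeof(struct ip6t_example_info),
	.me        = THIS_MODULE,
};

static int __init example_mt6_init(void)
{
	return xt_register_match(&example_mt6_reg);
}

static void __exit example_mt6_exit(void)
{
	xt_unregister_match(&example_mt6_reg);
}

module_init(example_mt6_init);
module_exit(example_mt6_exit);
MODULE_LICENSE("GPL");

In practice such a kernel module is paired with a userspace ip6tables plugin that parses the rule's command-line options and populates the matchinfo blob; the kernel side only evaluates it per packet, exactly as the modules listed in the Makefile below do.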
+ +config IP6_NF_TARGET_LOG + tristate "LOG target support" + default m if NETFILTER_ADVANCED=n + help + This option adds a `LOG' target, which allows you to create rules in + any iptables table which records the packet header to the syslog. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_FILTER + tristate "Packet filtering" + default m if NETFILTER_ADVANCED=n + help + Packet filtering defines a table `filter', which has a series of + rules for simple packet filtering at local input, forwarding and + local output. See the man page for iptables(8). + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_TARGET_REJECT + tristate "REJECT target support" + depends on IP6_NF_FILTER + default m if NETFILTER_ADVANCED=n + help + The REJECT target allows a filtering rule to specify that an ICMPv6 + error should be issued in response to an incoming packet, rather + than silently being dropped. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_TARGET_REJECT_SKERR + bool "Force socket error when rejecting with icmp*" + depends on IP6_NF_TARGET_REJECT + default n + help + This option enables turning a "--reject-with icmp*" into a matching + socket error also. + The REJECT target normally allows sending an ICMP message. But it + leaves the local socket unaware of any ingress rejects. + + If unsure, say N. + +config IP6_NF_MANGLE + tristate "Packet mangling" + default m if NETFILTER_ADVANCED=n + help + This option adds a `mangle' table to iptables: see the man page for + iptables(8). This table is used for various packet alterations + which can effect how the packet is routed. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_RAW + tristate 'raw table support (required for TRACE)' + depends on NETFILTER_ADVANCED + help + This option adds a `raw' table to ip6tables. This table is the very + first in the netfilter framework and hooks in at the PREROUTING + and OUTPUT chains. + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + +# security table for MAC policy +config IP6_NF_SECURITY + tristate "Security table" + depends on SECURITY + depends on NETFILTER_ADVANCED + help + This option adds a `security' table to iptables, for use + with Mandatory Access Control (MAC) policy. + + If unsure, say N. + +endif # IP6_NF_IPTABLES + +endmenu + diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile new file mode 100644 index 00000000..abfee91c --- /dev/null +++ b/net/ipv6/netfilter/Makefile @@ -0,0 +1,34 @@ +# +# Makefile for the netfilter modules on top of IPv6. +# + +# Link order matters here. 
+obj-$(CONFIG_IP6_NF_IPTABLES) += ip6_tables.o +obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o +obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o +obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o +obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o +obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o + +# objects for l3 independent conntrack +nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o + +# l3 independent conntrack +obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o nf_defrag_ipv6.o + +# defrag +nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o +obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o + +# matches +obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o +obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o +obj-$(CONFIG_IP6_NF_MATCH_FRAG) += ip6t_frag.o +obj-$(CONFIG_IP6_NF_MATCH_IPV6HEADER) += ip6t_ipv6header.o +obj-$(CONFIG_IP6_NF_MATCH_MH) += ip6t_mh.o +obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o +obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o + +# targets +obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o +obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c new file mode 100644 index 00000000..24939486 --- /dev/null +++ b/net/ipv6/netfilter/ip6_queue.c @@ -0,0 +1,640 @@ +/* + * This is a module which is used for queueing IPv6 packets and + * communicating with userspace via netlink. + * + * (C) 2001 Fernando Anton, this code is GPL. + * IPv64 Project - Work based in IPv64 draft by Arturo Azcorra. + * Universidad Carlos III de Madrid - Leganes (Madrid) - Spain + * Universidad Politecnica de Alcala de Henares - Alcala de H. (Madrid) - Spain + * email: fanton@it.uc3m.es + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IPQ_QMAX_DEFAULT 1024 +#define IPQ_PROC_FS_NAME "ip6_queue" +#define NET_IPQ_QMAX_NAME "ip6_queue_maxlen" + +typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long); + +static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE; +static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT; +static DEFINE_SPINLOCK(queue_lock); +static int peer_pid __read_mostly; +static unsigned int copy_range __read_mostly; +static unsigned int queue_total; +static unsigned int queue_dropped = 0; +static unsigned int queue_user_dropped = 0; +static struct sock *ipqnl __read_mostly; +static LIST_HEAD(queue_list); +static DEFINE_MUTEX(ipqnl_mutex); + +static inline void +__ipq_enqueue_entry(struct nf_queue_entry *entry) +{ + list_add_tail(&entry->list, &queue_list); + queue_total++; +} + +static inline int +__ipq_set_mode(unsigned char mode, unsigned int range) +{ + int status = 0; + + switch(mode) { + case IPQ_COPY_NONE: + case IPQ_COPY_META: + copy_mode = mode; + copy_range = 0; + break; + + case IPQ_COPY_PACKET: + if (range > 0xFFFF) + range = 0xFFFF; + copy_range = range; + copy_mode = mode; + break; + + default: + status = -EINVAL; + + } + return status; +} + +static void __ipq_flush(ipq_cmpfn cmpfn, unsigned long data); + +static inline void +__ipq_reset(void) +{ + peer_pid = 0; + net_disable_timestamp(); + __ipq_set_mode(IPQ_COPY_NONE, 0); + __ipq_flush(NULL, 0); +} + +static struct nf_queue_entry * +ipq_find_dequeue_entry(unsigned long id) +{ + struct nf_queue_entry *entry = NULL, *i; + + spin_lock_bh(&queue_lock); + + list_for_each_entry(i, &queue_list, list) { + if ((unsigned long)i == id) { + entry = i; + break; + } + } + + if (entry) { + list_del(&entry->list); + queue_total--; + } + + spin_unlock_bh(&queue_lock); + return entry; +} + +static void +__ipq_flush(ipq_cmpfn cmpfn, unsigned long data) +{ + struct nf_queue_entry *entry, *next; + + list_for_each_entry_safe(entry, next, &queue_list, list) { + if (!cmpfn || cmpfn(entry, data)) { + list_del(&entry->list); + queue_total--; + nf_reinject(entry, NF_DROP); + } + } +} + +static void +ipq_flush(ipq_cmpfn cmpfn, unsigned long data) +{ + spin_lock_bh(&queue_lock); + __ipq_flush(cmpfn, data); + spin_unlock_bh(&queue_lock); +} + +static struct sk_buff * +ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) +{ + sk_buff_data_t old_tail; + size_t size = 0; + size_t data_len = 0; + struct sk_buff *skb; + struct ipq_packet_msg *pmsg; + struct nlmsghdr *nlh; + struct timeval tv; + + switch (ACCESS_ONCE(copy_mode)) { + case IPQ_COPY_META: + case IPQ_COPY_NONE: + size = NLMSG_SPACE(sizeof(*pmsg)); + break; + + case IPQ_COPY_PACKET: + if (entry->skb->ip_summed == CHECKSUM_PARTIAL && + (*errp = skb_checksum_help(entry->skb))) + return NULL; + + data_len = ACCESS_ONCE(copy_range); + if (data_len == 0 || data_len > entry->skb->len) + data_len = entry->skb->len; + + size = NLMSG_SPACE(sizeof(*pmsg) + data_len); + break; + + default: + *errp = -EINVAL; + return NULL; + } + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + goto nlmsg_failure; + + old_tail = skb->tail; + nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh)); + pmsg = NLMSG_DATA(nlh); + memset(pmsg, 0, sizeof(*pmsg)); + + pmsg->packet_id = (unsigned long )entry; + pmsg->data_len = data_len; + tv = 
ktime_to_timeval(entry->skb->tstamp); + pmsg->timestamp_sec = tv.tv_sec; + pmsg->timestamp_usec = tv.tv_usec; + pmsg->mark = entry->skb->mark; + pmsg->hook = entry->hook; + pmsg->hw_protocol = entry->skb->protocol; + + if (entry->indev) + strcpy(pmsg->indev_name, entry->indev->name); + else + pmsg->indev_name[0] = '\0'; + + if (entry->outdev) + strcpy(pmsg->outdev_name, entry->outdev->name); + else + pmsg->outdev_name[0] = '\0'; + + if (entry->indev && entry->skb->dev && + entry->skb->mac_header != entry->skb->network_header) { + pmsg->hw_type = entry->skb->dev->type; + pmsg->hw_addrlen = dev_parse_header(entry->skb, pmsg->hw_addr); + } + + if (data_len) + if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len)) + BUG(); + + nlh->nlmsg_len = skb->tail - old_tail; + return skb; + +nlmsg_failure: + *errp = -EINVAL; + printk(KERN_ERR "ip6_queue: error creating packet message\n"); + return NULL; +} + +static int +ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) +{ + int status = -EINVAL; + struct sk_buff *nskb; + + if (copy_mode == IPQ_COPY_NONE) + return -EAGAIN; + + nskb = ipq_build_packet_message(entry, &status); + if (nskb == NULL) + return status; + + spin_lock_bh(&queue_lock); + + if (!peer_pid) + goto err_out_free_nskb; + + if (queue_total >= queue_maxlen) { + queue_dropped++; + status = -ENOSPC; + if (net_ratelimit()) + printk (KERN_WARNING "ip6_queue: fill at %d entries, " + "dropping packet(s). Dropped: %d\n", queue_total, + queue_dropped); + goto err_out_free_nskb; + } + + /* netlink_unicast will either free the nskb or attach it to a socket */ + status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT); + if (status < 0) { + queue_user_dropped++; + goto err_out_unlock; + } + + __ipq_enqueue_entry(entry); + + spin_unlock_bh(&queue_lock); + return status; + +err_out_free_nskb: + kfree_skb(nskb); + +err_out_unlock: + spin_unlock_bh(&queue_lock); + return status; +} + +static int +ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct nf_queue_entry *e) +{ + int diff; + struct ipv6hdr *user_iph = (struct ipv6hdr *)v->payload; + struct sk_buff *nskb; + + if (v->data_len < sizeof(*user_iph)) + return 0; + diff = v->data_len - e->skb->len; + if (diff < 0) { + if (pskb_trim(e->skb, v->data_len)) + return -ENOMEM; + } else if (diff > 0) { + if (v->data_len > 0xFFFF) + return -EINVAL; + if (diff > skb_tailroom(e->skb)) { + nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), + diff, GFP_ATOMIC); + if (!nskb) { + printk(KERN_WARNING "ip6_queue: OOM " + "in mangle, dropping packet\n"); + return -ENOMEM; + } + kfree_skb(e->skb); + e->skb = nskb; + } + skb_put(e->skb, diff); + } + if (!skb_make_writable(e->skb, v->data_len)) + return -ENOMEM; + skb_copy_to_linear_data(e->skb, v->payload, v->data_len); + e->skb->ip_summed = CHECKSUM_NONE; + + return 0; +} + +static int +ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len) +{ + struct nf_queue_entry *entry; + + if (vmsg->value > NF_MAX_VERDICT) + return -EINVAL; + + entry = ipq_find_dequeue_entry(vmsg->id); + if (entry == NULL) + return -ENOENT; + else { + int verdict = vmsg->value; + + if (vmsg->data_len && vmsg->data_len == len) + if (ipq_mangle_ipv6(vmsg, entry) < 0) + verdict = NF_DROP; + + nf_reinject(entry, verdict); + return 0; + } +} + +static int +ipq_set_mode(unsigned char mode, unsigned int range) +{ + int status; + + spin_lock_bh(&queue_lock); + status = __ipq_set_mode(mode, range); + spin_unlock_bh(&queue_lock); + return status; +} + +static int +ipq_receive_peer(struct ipq_peer_msg *pmsg, + unsigned char 
type, unsigned int len) +{ + int status = 0; + + if (len < sizeof(*pmsg)) + return -EINVAL; + + switch (type) { + case IPQM_MODE: + status = ipq_set_mode(pmsg->msg.mode.value, + pmsg->msg.mode.range); + break; + + case IPQM_VERDICT: + if (pmsg->msg.verdict.value > NF_MAX_VERDICT) + status = -EINVAL; + else + status = ipq_set_verdict(&pmsg->msg.verdict, + len - sizeof(*pmsg)); + break; + default: + status = -EINVAL; + } + return status; +} + +static int +dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) +{ + if (entry->indev) + if (entry->indev->ifindex == ifindex) + return 1; + + if (entry->outdev) + if (entry->outdev->ifindex == ifindex) + return 1; +#ifdef CONFIG_BRIDGE_NETFILTER + if (entry->skb->nf_bridge) { + if (entry->skb->nf_bridge->physindev && + entry->skb->nf_bridge->physindev->ifindex == ifindex) + return 1; + if (entry->skb->nf_bridge->physoutdev && + entry->skb->nf_bridge->physoutdev->ifindex == ifindex) + return 1; + } +#endif + return 0; +} + +static void +ipq_dev_drop(int ifindex) +{ + ipq_flush(dev_cmp, ifindex); +} + +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) + +static inline void +__ipq_rcv_skb(struct sk_buff *skb) +{ + int status, type, pid, flags; + unsigned int nlmsglen, skblen; + struct nlmsghdr *nlh; + + skblen = skb->len; + if (skblen < sizeof(*nlh)) + return; + + nlh = nlmsg_hdr(skb); + nlmsglen = nlh->nlmsg_len; + if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen) + return; + + pid = nlh->nlmsg_pid; + flags = nlh->nlmsg_flags; + + if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI) + RCV_SKB_FAIL(-EINVAL); + + if (flags & MSG_TRUNC) + RCV_SKB_FAIL(-ECOMM); + + type = nlh->nlmsg_type; + if (type < NLMSG_NOOP || type >= IPQM_MAX) + RCV_SKB_FAIL(-EINVAL); + + if (type <= IPQM_BASE) + return; + + if (security_netlink_recv(skb, CAP_NET_ADMIN)) + RCV_SKB_FAIL(-EPERM); + + spin_lock_bh(&queue_lock); + + if (peer_pid) { + if (peer_pid != pid) { + spin_unlock_bh(&queue_lock); + RCV_SKB_FAIL(-EBUSY); + } + } else { + net_enable_timestamp(); + peer_pid = pid; + } + + spin_unlock_bh(&queue_lock); + + status = ipq_receive_peer(NLMSG_DATA(nlh), type, + nlmsglen - NLMSG_LENGTH(0)); + if (status < 0) + RCV_SKB_FAIL(status); + + if (flags & NLM_F_ACK) + netlink_ack(skb, nlh, 0); +} + +static void +ipq_rcv_skb(struct sk_buff *skb) +{ + mutex_lock(&ipqnl_mutex); + __ipq_rcv_skb(skb); + mutex_unlock(&ipqnl_mutex); +} + +static int +ipq_rcv_dev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + /* Drop any packets associated with the downed device */ + if (event == NETDEV_DOWN) + ipq_dev_drop(dev->ifindex); + return NOTIFY_DONE; +} + +static struct notifier_block ipq_dev_notifier = { + .notifier_call = ipq_rcv_dev_event, +}; + +static int +ipq_rcv_nl_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netlink_notify *n = ptr; + + if (event == NETLINK_URELEASE && n->protocol == NETLINK_IP6_FW) { + spin_lock_bh(&queue_lock); + if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid)) + __ipq_reset(); + spin_unlock_bh(&queue_lock); + } + return NOTIFY_DONE; +} + +static struct notifier_block ipq_nl_notifier = { + .notifier_call = ipq_rcv_nl_event, +}; + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *ipq_sysctl_header; + +static ctl_table ipq_table[] = { + { + .procname = NET_IPQ_QMAX_NAME, + .data = &queue_maxlen, + .maxlen = sizeof(queue_maxlen), + .mode = 0644, + 
.proc_handler = proc_dointvec + }, + { } +}; +#endif + +#ifdef CONFIG_PROC_FS +static int ip6_queue_show(struct seq_file *m, void *v) +{ + spin_lock_bh(&queue_lock); + + seq_printf(m, + "Peer PID : %d\n" + "Copy mode : %hu\n" + "Copy range : %u\n" + "Queue length : %u\n" + "Queue max. length : %u\n" + "Queue dropped : %u\n" + "Netfilter dropped : %u\n", + peer_pid, + copy_mode, + copy_range, + queue_total, + queue_maxlen, + queue_dropped, + queue_user_dropped); + + spin_unlock_bh(&queue_lock); + return 0; +} + +static int ip6_queue_open(struct inode *inode, struct file *file) +{ + return single_open(file, ip6_queue_show, NULL); +} + +static const struct file_operations ip6_queue_proc_fops = { + .open = ip6_queue_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .owner = THIS_MODULE, +}; +#endif + +static const struct nf_queue_handler nfqh = { + .name = "ip6_queue", + .outfn = &ipq_enqueue_packet, +}; + +static int __init ip6_queue_init(void) +{ + int status = -ENOMEM; + struct proc_dir_entry *proc __maybe_unused; + + netlink_register_notifier(&ipq_nl_notifier); + ipqnl = netlink_kernel_create(&init_net, NETLINK_IP6_FW, 0, + ipq_rcv_skb, NULL, THIS_MODULE); + if (ipqnl == NULL) { + printk(KERN_ERR "ip6_queue: failed to create netlink socket\n"); + goto cleanup_netlink_notifier; + } + +#ifdef CONFIG_PROC_FS + proc = proc_create(IPQ_PROC_FS_NAME, 0, init_net.proc_net, + &ip6_queue_proc_fops); + if (!proc) { + printk(KERN_ERR "ip6_queue: failed to create proc entry\n"); + goto cleanup_ipqnl; + } +#endif + register_netdevice_notifier(&ipq_dev_notifier); +#ifdef CONFIG_SYSCTL + ipq_sysctl_header = register_sysctl_paths(net_ipv6_ctl_path, ipq_table); +#endif + status = nf_register_queue_handler(NFPROTO_IPV6, &nfqh); + if (status < 0) { + printk(KERN_ERR "ip6_queue: failed to register queue handler\n"); + goto cleanup_sysctl; + } + return status; + +cleanup_sysctl: +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(ipq_sysctl_header); +#endif + unregister_netdevice_notifier(&ipq_dev_notifier); + proc_net_remove(&init_net, IPQ_PROC_FS_NAME); + +cleanup_ipqnl: __maybe_unused + netlink_kernel_release(ipqnl); + mutex_lock(&ipqnl_mutex); + mutex_unlock(&ipqnl_mutex); + +cleanup_netlink_notifier: + netlink_unregister_notifier(&ipq_nl_notifier); + return status; +} + +static void __exit ip6_queue_fini(void) +{ + nf_unregister_queue_handlers(&nfqh); + + ipq_flush(NULL, 0); + +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(ipq_sysctl_header); +#endif + unregister_netdevice_notifier(&ipq_dev_notifier); + proc_net_remove(&init_net, IPQ_PROC_FS_NAME); + + netlink_kernel_release(ipqnl); + mutex_lock(&ipqnl_mutex); + mutex_unlock(&ipqnl_mutex); + + netlink_unregister_notifier(&ipq_nl_notifier); +} + +MODULE_DESCRIPTION("IPv6 packet queue handler"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_IP6_FW); + +module_init(ip6_queue_init); +module_exit(ip6_queue_fini); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c new file mode 100644 index 00000000..14cb3100 --- /dev/null +++ b/net/ipv6/netfilter/ip6_tables.c @@ -0,0 +1,2375 @@ +/* + * Packet matching code. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2005 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "../../netfilter/xt_repldata.h" + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("IPv6 packet filter"); + +/*#define DEBUG_IP_FIREWALL*/ +/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */ +/*#define DEBUG_IP_FIREWALL_USER*/ + +#ifdef DEBUG_IP_FIREWALL +#define dprintf(format, args...) pr_info(format , ## args) +#else +#define dprintf(format, args...) +#endif + +#ifdef DEBUG_IP_FIREWALL_USER +#define duprintf(format, args...) pr_info(format , ## args) +#else +#define duprintf(format, args...) +#endif + +#ifdef CONFIG_NETFILTER_DEBUG +#define IP_NF_ASSERT(x) WARN_ON(!(x)) +#else +#define IP_NF_ASSERT(x) +#endif + +#if 0 +/* All the better to debug you with... */ +#define static +#define inline +#endif + +void *ip6t_alloc_initial_table(const struct xt_table *info) +{ + return xt_alloc_initial_table(ip6t, IP6T); +} +EXPORT_SYMBOL_GPL(ip6t_alloc_initial_table); + +/* + We keep a set of rules for each CPU, so we can avoid write-locking + them in the softirq when updating the counters and therefore + only need to read-lock in the softirq; doing a write_lock_bh() in user + context stops packets coming through and allows user context to read + the counters or update the rules. + + Hence the start of any table is given by get_table() below. */ + +/* Check for an extension */ +int +ip6t_ext_hdr(u8 nexthdr) +{ + return (nexthdr == IPPROTO_HOPOPTS) || + (nexthdr == IPPROTO_ROUTING) || + (nexthdr == IPPROTO_FRAGMENT) || + (nexthdr == IPPROTO_ESP) || + (nexthdr == IPPROTO_AH) || + (nexthdr == IPPROTO_NONE) || + (nexthdr == IPPROTO_DSTOPTS); +} + +/* Returns whether matches rule or not. */ +/* Performance critical - called for every packet */ +static inline bool +ip6_packet_match(const struct sk_buff *skb, + const char *indev, + const char *outdev, + const struct ip6t_ip6 *ip6info, + unsigned int *protoff, + int *fragoff, bool *hotdrop) +{ + unsigned long ret; + const struct ipv6hdr *ipv6 = ipv6_hdr(skb); + +#define FWINV(bool, invflg) ((bool) ^ !!(ip6info->invflags & (invflg))) + + if (FWINV(ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk, + &ip6info->src), IP6T_INV_SRCIP) || + FWINV(ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk, + &ip6info->dst), IP6T_INV_DSTIP)) { + dprintf("Source or dest mismatch.\n"); +/* + dprintf("SRC: %u. Mask: %u. Target: %u.%s\n", ip->saddr, + ipinfo->smsk.s_addr, ipinfo->src.s_addr, + ipinfo->invflags & IP6T_INV_SRCIP ? " (INV)" : ""); + dprintf("DST: %u. Mask: %u. Target: %u.%s\n", ip->daddr, + ipinfo->dmsk.s_addr, ipinfo->dst.s_addr, + ipinfo->invflags & IP6T_INV_DSTIP ? " (INV)" : "");*/ + return false; + } + + ret = ifname_compare_aligned(indev, ip6info->iniface, ip6info->iniface_mask); + + if (FWINV(ret != 0, IP6T_INV_VIA_IN)) { + dprintf("VIA in mismatch (%s vs %s).%s\n", + indev, ip6info->iniface, + ip6info->invflags&IP6T_INV_VIA_IN ?" (INV)":""); + return false; + } + + ret = ifname_compare_aligned(outdev, ip6info->outiface, ip6info->outiface_mask); + + if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) { + dprintf("VIA out mismatch (%s vs %s).%s\n", + outdev, ip6info->outiface, + ip6info->invflags&IP6T_INV_VIA_OUT ?" (INV)":""); + return false; + } + +/* ... might want to do something with class and flowlabel here ... 
*/ + + /* look for the desired protocol header */ + if((ip6info->flags & IP6T_F_PROTO)) { + int protohdr; + unsigned short _frag_off; + + protohdr = ipv6_find_hdr(skb, protoff, -1, &_frag_off); + if (protohdr < 0) { + if (_frag_off == 0) + *hotdrop = true; + return false; + } + *fragoff = _frag_off; + + dprintf("Packet protocol %hi ?= %s%hi.\n", + protohdr, + ip6info->invflags & IP6T_INV_PROTO ? "!":"", + ip6info->proto); + + if (ip6info->proto == protohdr) { + if(ip6info->invflags & IP6T_INV_PROTO) { + return false; + } + return true; + } + + /* We need match for the '-p all', too! */ + if ((ip6info->proto != 0) && + !(ip6info->invflags & IP6T_INV_PROTO)) + return false; + } + return true; +} + +/* should be ip6 safe */ +static bool +ip6_checkentry(const struct ip6t_ip6 *ipv6) +{ + if (ipv6->flags & ~IP6T_F_MASK) { + duprintf("Unknown flag bits set: %08X\n", + ipv6->flags & ~IP6T_F_MASK); + return false; + } + if (ipv6->invflags & ~IP6T_INV_MASK) { + duprintf("Unknown invflag bits set: %08X\n", + ipv6->invflags & ~IP6T_INV_MASK); + return false; + } + return true; +} + +static unsigned int +ip6t_error(struct sk_buff *skb, const struct xt_action_param *par) +{ + if (net_ratelimit()) + pr_info("error: `%s'\n", (const char *)par->targinfo); + + return NF_DROP; +} + +static inline struct ip6t_entry * +get_entry(const void *base, unsigned int offset) +{ + return (struct ip6t_entry *)(base + offset); +} + +/* All zeroes == unconditional rule. */ +/* Mildly perf critical (only if packet tracing is on) */ +static inline bool unconditional(const struct ip6t_ip6 *ipv6) +{ + static const struct ip6t_ip6 uncond; + + return memcmp(ipv6, &uncond, sizeof(uncond)) == 0; +} + +static inline const struct xt_entry_target * +ip6t_get_target_c(const struct ip6t_entry *e) +{ + return ip6t_get_target((struct ip6t_entry *)e); +} + +#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ + defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) +/* This cries for unification! */ +static const char *const hooknames[] = { + [NF_INET_PRE_ROUTING] = "PREROUTING", + [NF_INET_LOCAL_IN] = "INPUT", + [NF_INET_FORWARD] = "FORWARD", + [NF_INET_LOCAL_OUT] = "OUTPUT", + [NF_INET_POST_ROUTING] = "POSTROUTING", +}; + +enum nf_ip_trace_comments { + NF_IP6_TRACE_COMMENT_RULE, + NF_IP6_TRACE_COMMENT_RETURN, + NF_IP6_TRACE_COMMENT_POLICY, +}; + +static const char *const comments[] = { + [NF_IP6_TRACE_COMMENT_RULE] = "rule", + [NF_IP6_TRACE_COMMENT_RETURN] = "return", + [NF_IP6_TRACE_COMMENT_POLICY] = "policy", +}; + +static struct nf_loginfo trace_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 4, + .logflags = NF_LOG_MASK, + }, + }, +}; + +/* Mildly perf critical (only if packet tracing is on) */ +static inline int +get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e, + const char *hookname, const char **chainname, + const char **comment, unsigned int *rulenum) +{ + const struct xt_standard_target *t = (void *)ip6t_get_target_c(s); + + if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) { + /* Head of user chain: ERROR target with chainname */ + *chainname = t->target.data; + (*rulenum) = 0; + } else if (s == e) { + (*rulenum)++; + + if (s->target_offset == sizeof(struct ip6t_entry) && + strcmp(t->target.u.kernel.target->name, + XT_STANDARD_TARGET) == 0 && + t->verdict < 0 && + unconditional(&s->ipv6)) { + /* Tail of chains: STANDARD target (return/policy) */ + *comment = *chainname == hookname + ? 
comments[NF_IP6_TRACE_COMMENT_POLICY] + : comments[NF_IP6_TRACE_COMMENT_RETURN]; + } + return 1; + } else + (*rulenum)++; + + return 0; +} + +static void trace_packet(const struct sk_buff *skb, + unsigned int hook, + const struct net_device *in, + const struct net_device *out, + const char *tablename, + const struct xt_table_info *private, + const struct ip6t_entry *e) +{ + const void *table_base; + const struct ip6t_entry *root; + const char *hookname, *chainname, *comment; + const struct ip6t_entry *iter; + unsigned int rulenum = 0; + + table_base = private->entries[smp_processor_id()]; + root = get_entry(table_base, private->hook_entry[hook]); + + hookname = chainname = hooknames[hook]; + comment = comments[NF_IP6_TRACE_COMMENT_RULE]; + + xt_entry_foreach(iter, root, private->size - private->hook_entry[hook]) + if (get_chainname_rulenum(iter, e, hookname, + &chainname, &comment, &rulenum) != 0) + break; + + nf_log_packet(AF_INET6, hook, skb, in, out, &trace_loginfo, + "TRACE: %s:%s:%s:%u ", + tablename, chainname, comment, rulenum); +} +#endif + +static inline __pure struct ip6t_entry * +ip6t_next_entry(const struct ip6t_entry *entry) +{ + return (void *)entry + entry->next_offset; +} + +/* Returns one of the generic firewall policies, like NF_ACCEPT. */ +unsigned int +ip6t_do_table(struct sk_buff *skb, + unsigned int hook, + const struct net_device *in, + const struct net_device *out, + struct xt_table *table) +{ + static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); + /* Initializing verdict to NF_DROP keeps gcc happy. */ + unsigned int verdict = NF_DROP; + const char *indev, *outdev; + const void *table_base; + struct ip6t_entry *e, **jumpstack; + unsigned int *stackptr, origptr, cpu; + const struct xt_table_info *private; + struct xt_action_param acpar; + unsigned int addend; + + /* Initialization */ + indev = in ? in->name : nulldevname; + outdev = out ? out->name : nulldevname; + /* We handle fragments by dealing with the first fragment as + * if it was a normal packet. All other fragments are treated + * normally, except that they will NEVER match rules that ask + * things we don't know, ie. tcp syn flag or ports). If the + * rule is also a fragment-specific rule, non-fragments won't + * match it. 
*/ + acpar.hotdrop = false; + acpar.in = in; + acpar.out = out; + acpar.family = NFPROTO_IPV6; + acpar.hooknum = hook; + + IP_NF_ASSERT(table->valid_hooks & (1 << hook)); + + local_bh_disable(); + addend = xt_write_recseq_begin(); + private = table->private; + cpu = smp_processor_id(); + table_base = private->entries[cpu]; + jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; + stackptr = per_cpu_ptr(private->stackptr, cpu); + origptr = *stackptr; + + e = get_entry(table_base, private->hook_entry[hook]); + + do { + const struct xt_entry_target *t; + const struct xt_entry_match *ematch; + + IP_NF_ASSERT(e); + if (!ip6_packet_match(skb, indev, outdev, &e->ipv6, + &acpar.thoff, &acpar.fragoff, &acpar.hotdrop)) { + no_match: + e = ip6t_next_entry(e); + continue; + } + + xt_ematch_foreach(ematch, e) { + acpar.match = ematch->u.kernel.match; + acpar.matchinfo = ematch->data; + if (!acpar.match->match(skb, &acpar)) + goto no_match; + } + + ADD_COUNTER(e->counters, skb->len, 1); + + t = ip6t_get_target_c(e); + IP_NF_ASSERT(t->u.kernel.target); + +#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ + defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) + /* The packet is traced: log it */ + if (unlikely(skb->nf_trace)) + trace_packet(skb, hook, in, out, + table->name, private, e); +#endif + /* Standard target? */ + if (!t->u.kernel.target->target) { + int v; + + v = ((struct xt_standard_target *)t)->verdict; + if (v < 0) { + /* Pop from stack? */ + if (v != XT_RETURN) { + verdict = (unsigned)(-v) - 1; + break; + } + if (*stackptr <= origptr) + e = get_entry(table_base, + private->underflow[hook]); + else + e = ip6t_next_entry(jumpstack[--*stackptr]); + continue; + } + if (table_base + v != ip6t_next_entry(e) && + !(e->ipv6.flags & IP6T_F_GOTO)) { + if (*stackptr >= private->stacksize) { + verdict = NF_DROP; + break; + } + jumpstack[(*stackptr)++] = e; + } + + e = get_entry(table_base, v); + continue; + } + + acpar.target = t->u.kernel.target; + acpar.targinfo = t->data; + + verdict = t->u.kernel.target->target(skb, &acpar); + if (verdict == XT_CONTINUE) + e = ip6t_next_entry(e); + else + /* Verdict */ + break; + } while (!acpar.hotdrop); + + *stackptr = origptr; + + xt_write_recseq_end(addend); + local_bh_enable(); + +#ifdef DEBUG_ALLOW_ALL + return NF_ACCEPT; +#else + if (acpar.hotdrop) + return NF_DROP; + else return verdict; +#endif +} + +/* Figures out from what hook each rule can be called: returns 0 if + there are loops. Puts hook bitmask in comefrom. */ +static int +mark_source_chains(const struct xt_table_info *newinfo, + unsigned int valid_hooks, void *entry0) +{ + unsigned int hook; + + /* No recursion; use packet counter to save back ptrs (reset + to 0 as we leave), and comefrom to save source hook bitmask */ + for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) { + unsigned int pos = newinfo->hook_entry[hook]; + struct ip6t_entry *e = (struct ip6t_entry *)(entry0 + pos); + + if (!(valid_hooks & (1 << hook))) + continue; + + /* Set initial back pointer. */ + e->counters.pcnt = pos; + + for (;;) { + const struct xt_standard_target *t + = (void *)ip6t_get_target_c(e); + int visited = e->comefrom & (1 << hook); + + if (e->comefrom & (1 << NF_INET_NUMHOOKS)) { + pr_err("iptables: loop hook %u pos %u %08X.\n", + hook, pos, e->comefrom); + return 0; + } + e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); + + /* Unconditional return/END. 
*/ + if ((e->target_offset == sizeof(struct ip6t_entry) && + (strcmp(t->target.u.user.name, + XT_STANDARD_TARGET) == 0) && + t->verdict < 0 && + unconditional(&e->ipv6)) || visited) { + unsigned int oldpos, size; + + if ((strcmp(t->target.u.user.name, + XT_STANDARD_TARGET) == 0) && + t->verdict < -NF_MAX_VERDICT - 1) { + duprintf("mark_source_chains: bad " + "negative verdict (%i)\n", + t->verdict); + return 0; + } + + /* Return: backtrack through the last + big jump. */ + do { + e->comefrom ^= (1<<NF_INET_NUMHOOKS); +#ifdef DEBUG_IP_FIREWALL_USER + if (e->comefrom + & (1 << NF_INET_NUMHOOKS)) { + duprintf("Back unset " + "on hook %u " + "rule %u\n", + hook, pos); + } +#endif + oldpos = pos; + pos = e->counters.pcnt; + e->counters.pcnt = 0; + + /* We're at the start. */ + if (pos == oldpos) + goto next; + + e = (struct ip6t_entry *) + (entry0 + pos); + } while (oldpos == pos + e->next_offset); + + /* Move along one */ + size = e->next_offset; + e = (struct ip6t_entry *) + (entry0 + pos + size); + e->counters.pcnt = pos; + pos += size; + } else { + int newpos = t->verdict; + + if (strcmp(t->target.u.user.name, + XT_STANDARD_TARGET) == 0 && + newpos >= 0) { + if (newpos > newinfo->size - + sizeof(struct ip6t_entry)) { + duprintf("mark_source_chains: " + "bad verdict (%i)\n", + newpos); + return 0; + } + /* This a jump; chase it. */ + duprintf("Jump rule %u -> %u\n", + pos, newpos); + } else { + /* ... this is a fallthru */ + newpos = pos + e->next_offset; + } + e = (struct ip6t_entry *) + (entry0 + newpos); + e->counters.pcnt = pos; + pos = newpos; + } + } + next: + duprintf("Finished chain %u\n", hook); + } + return 1; +} + +static void cleanup_match(struct xt_entry_match *m, struct net *net) +{ + struct xt_mtdtor_param par; + + par.net = net; + par.match = m->u.kernel.match; + par.matchinfo = m->data; + par.family = NFPROTO_IPV6; + if (par.match->destroy != NULL) + par.match->destroy(&par); + module_put(par.match->me); +} + +static int +check_entry(const struct ip6t_entry *e, const char *name) +{ + const struct xt_entry_target *t; + + if (!ip6_checkentry(&e->ipv6)) { + duprintf("ip_tables: ip check failed %p %s.\n", e, name); + return -EINVAL; + } + + if (e->target_offset + sizeof(struct xt_entry_target) > + e->next_offset) + return -EINVAL; + + t = ip6t_get_target_c(e); + if (e->target_offset + t->u.target_size > e->next_offset) + return -EINVAL; + + return 0; +} + +static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) +{ + const struct ip6t_ip6 *ipv6 = par->entryinfo; + int ret; + + par->match = m->u.kernel.match; + par->matchinfo = m->data; + + ret = xt_check_match(par, m->u.match_size - sizeof(*m), + ipv6->proto, ipv6->invflags & IP6T_INV_PROTO); + if (ret < 0) { + duprintf("ip_tables: check failed for `%s'.\n", + par.match->name); + return ret; + } + return 0; +} + +static int +find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) +{ + struct xt_match *match; + int ret; + + match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name, + m->u.user.revision); + if (IS_ERR(match)) { + duprintf("find_check_match: `%s' not found\n", m->u.user.name); + return PTR_ERR(match); + } + m->u.kernel.match = match; + + ret = check_match(m, par); + if (ret) + goto err; + + return 0; +err: + module_put(m->u.kernel.match->me); + return ret; +} + +static int check_target(struct ip6t_entry *e, struct net *net, const char *name) +{ + struct xt_entry_target *t = ip6t_get_target(e); + struct xt_tgchk_param par = { + .net = net, + .table = name, + .entryinfo = e, + .target = t->u.kernel.target, + .targinfo = t->data, + 
.hook_mask = e->comefrom, + .family = NFPROTO_IPV6, + }; + int ret; + + t = ip6t_get_target(e); + ret = xt_check_target(&par, t->u.target_size - sizeof(*t), + e->ipv6.proto, e->ipv6.invflags & IP6T_INV_PROTO); + if (ret < 0) { + duprintf("ip_tables: check failed for `%s'.\n", + t->u.kernel.target->name); + return ret; + } + return 0; +} + +static int +find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, + unsigned int size) +{ + struct xt_entry_target *t; + struct xt_target *target; + int ret; + unsigned int j; + struct xt_mtchk_param mtpar; + struct xt_entry_match *ematch; + + ret = check_entry(e, name); + if (ret) + return ret; + + j = 0; + mtpar.net = net; + mtpar.table = name; + mtpar.entryinfo = &e->ipv6; + mtpar.hook_mask = e->comefrom; + mtpar.family = NFPROTO_IPV6; + xt_ematch_foreach(ematch, e) { + ret = find_check_match(ematch, &mtpar); + if (ret != 0) + goto cleanup_matches; + ++j; + } + + t = ip6t_get_target(e); + target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, + t->u.user.revision); + if (IS_ERR(target)) { + duprintf("find_check_entry: `%s' not found\n", t->u.user.name); + ret = PTR_ERR(target); + goto cleanup_matches; + } + t->u.kernel.target = target; + + ret = check_target(e, net, name); + if (ret) + goto err; + return 0; + err: + module_put(t->u.kernel.target->me); + cleanup_matches: + xt_ematch_foreach(ematch, e) { + if (j-- == 0) + break; + cleanup_match(ematch, net); + } + return ret; +} + +static bool check_underflow(const struct ip6t_entry *e) +{ + const struct xt_entry_target *t; + unsigned int verdict; + + if (!unconditional(&e->ipv6)) + return false; + t = ip6t_get_target_c(e); + if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) + return false; + verdict = ((struct xt_standard_target *)t)->verdict; + verdict = -verdict - 1; + return verdict == NF_DROP || verdict == NF_ACCEPT; +} + +static int +check_entry_size_and_hooks(struct ip6t_entry *e, + struct xt_table_info *newinfo, + const unsigned char *base, + const unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, + unsigned int valid_hooks) +{ + unsigned int h; + + if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 || + (unsigned char *)e + sizeof(struct ip6t_entry) >= limit) { + duprintf("Bad offset %p\n", e); + return -EINVAL; + } + + if (e->next_offset + < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) { + duprintf("checking: element %p size %u\n", + e, e->next_offset); + return -EINVAL; + } + + /* Check hooks & underflows */ + for (h = 0; h < NF_INET_NUMHOOKS; h++) { + if (!(valid_hooks & (1 << h))) + continue; + if ((unsigned char *)e - base == hook_entries[h]) + newinfo->hook_entry[h] = hook_entries[h]; + if ((unsigned char *)e - base == underflows[h]) { + if (!check_underflow(e)) { + pr_err("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); + return -EINVAL; + } + newinfo->underflow[h] = underflows[h]; + } + } + + /* Clear counters and comefrom */ + e->counters = ((struct xt_counters) { 0, 0 }); + e->comefrom = 0; + return 0; +} + +static void cleanup_entry(struct ip6t_entry *e, struct net *net) +{ + struct xt_tgdtor_param par; + struct xt_entry_target *t; + struct xt_entry_match *ematch; + + /* Cleanup all matches */ + xt_ematch_foreach(ematch, e) + cleanup_match(ematch, net); + t = ip6t_get_target(e); + + par.net = net; + par.target = t->u.kernel.target; + par.targinfo = t->data; + par.family = NFPROTO_IPV6; + if (par.target->destroy != NULL) + 
par.target->destroy(&par); + module_put(par.target->me); +} + +/* Checks and translates the user-supplied table segment (held in + newinfo) */ +static int +translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, + const struct ip6t_replace *repl) +{ + struct ip6t_entry *iter; + unsigned int i; + int ret = 0; + + newinfo->size = repl->size; + newinfo->number = repl->num_entries; + + /* Init all hooks to impossible value. */ + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + newinfo->hook_entry[i] = 0xFFFFFFFF; + newinfo->underflow[i] = 0xFFFFFFFF; + } + + duprintf("translate_table: size %u\n", newinfo->size); + i = 0; + /* Walk through entries, checking offsets. */ + xt_entry_foreach(iter, entry0, newinfo->size) { + ret = check_entry_size_and_hooks(iter, newinfo, entry0, + entry0 + repl->size, + repl->hook_entry, + repl->underflow, + repl->valid_hooks); + if (ret != 0) + return ret; + ++i; + if (strcmp(ip6t_get_target(iter)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; + } + + if (i != repl->num_entries) { + duprintf("translate_table: %u not %u entries\n", + i, repl->num_entries); + return -EINVAL; + } + + /* Check hooks all assigned */ + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + /* Only hooks which are valid */ + if (!(repl->valid_hooks & (1 << i))) + continue; + if (newinfo->hook_entry[i] == 0xFFFFFFFF) { + duprintf("Invalid hook entry %u %u\n", + i, repl->hook_entry[i]); + return -EINVAL; + } + if (newinfo->underflow[i] == 0xFFFFFFFF) { + duprintf("Invalid underflow %u %u\n", + i, repl->underflow[i]); + return -EINVAL; + } + } + + if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) + return -ELOOP; + + /* Finally, each sanity check must pass */ + i = 0; + xt_entry_foreach(iter, entry0, newinfo->size) { + ret = find_check_entry(iter, net, repl->name, repl->size); + if (ret != 0) + break; + ++i; + } + + if (ret != 0) { + xt_entry_foreach(iter, entry0, newinfo->size) { + if (i-- == 0) + break; + cleanup_entry(iter, net); + } + return ret; + } + + /* And one copy for every other CPU */ + for_each_possible_cpu(i) { + if (newinfo->entries[i] && newinfo->entries[i] != entry0) + memcpy(newinfo->entries[i], entry0, newinfo->size); + } + + return ret; +} + +static void +get_counters(const struct xt_table_info *t, + struct xt_counters counters[]) +{ + struct ip6t_entry *iter; + unsigned int cpu; + unsigned int i; + + for_each_possible_cpu(cpu) { + seqcount_t *s = &per_cpu(xt_recseq, cpu); + + i = 0; + xt_entry_foreach(iter, t->entries[cpu], t->size) { + u64 bcnt, pcnt; + unsigned int start; + + do { + start = read_seqcount_begin(s); + bcnt = iter->counters.bcnt; + pcnt = iter->counters.pcnt; + } while (read_seqcount_retry(s, start)); + + ADD_COUNTER(counters[i], bcnt, pcnt); + ++i; + } + } +} + +static struct xt_counters *alloc_counters(const struct xt_table *table) +{ + unsigned int countersize; + struct xt_counters *counters; + const struct xt_table_info *private = table->private; + + /* We need atomic snapshot of counters: rest doesn't change + (other than comefrom, which userspace doesn't care + about). 
*/ + countersize = sizeof(struct xt_counters) * private->number; + counters = vzalloc(countersize); + + if (counters == NULL) + return ERR_PTR(-ENOMEM); + + get_counters(private, counters); + + return counters; +} + +static int +copy_entries_to_user(unsigned int total_size, + const struct xt_table *table, + void __user *userptr) +{ + unsigned int off, num; + const struct ip6t_entry *e; + struct xt_counters *counters; + const struct xt_table_info *private = table->private; + int ret = 0; + const void *loc_cpu_entry; + + counters = alloc_counters(table); + if (IS_ERR(counters)) + return PTR_ERR(counters); + + /* choose the copy that is on our node/cpu, ... + * This choice is lazy (because current thread is + * allowed to migrate to another cpu) + */ + loc_cpu_entry = private->entries[raw_smp_processor_id()]; + if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { + ret = -EFAULT; + goto free_counters; + } + + /* FIXME: use iterator macros --RR */ + /* ... then go back and fix counters and names */ + for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ + unsigned int i; + const struct xt_entry_match *m; + const struct xt_entry_target *t; + + e = (struct ip6t_entry *)(loc_cpu_entry + off); + if (copy_to_user(userptr + off + + offsetof(struct ip6t_entry, counters), + &counters[num], + sizeof(counters[num])) != 0) { + ret = -EFAULT; + goto free_counters; + } + + for (i = sizeof(struct ip6t_entry); + i < e->target_offset; + i += m->u.match_size) { + m = (void *)e + i; + + if (copy_to_user(userptr + off + i + + offsetof(struct xt_entry_match, + u.user.name), + m->u.kernel.match->name, + strlen(m->u.kernel.match->name)+1) + != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + t = ip6t_get_target_c(e); + if (copy_to_user(userptr + off + e->target_offset + + offsetof(struct xt_entry_target, + u.user.name), + t->u.kernel.target->name, + strlen(t->u.kernel.target->name)+1) != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + free_counters: + vfree(counters); + return ret; +} + +#ifdef CONFIG_COMPAT +static void compat_standard_from_user(void *dst, const void *src) +{ + int v = *(compat_int_t *)src; + + if (v > 0) + v += xt_compat_calc_jump(AF_INET6, v); + memcpy(dst, &v, sizeof(v)); +} + +static int compat_standard_to_user(void __user *dst, const void *src) +{ + compat_int_t cv = *(int *)src; + + if (cv > 0) + cv -= xt_compat_calc_jump(AF_INET6, cv); + return copy_to_user(dst, &cv, sizeof(cv)) ? 
-EFAULT : 0; +} + +static int compat_calc_entry(const struct ip6t_entry *e, + const struct xt_table_info *info, + const void *base, struct xt_table_info *newinfo) +{ + const struct xt_entry_match *ematch; + const struct xt_entry_target *t; + unsigned int entry_offset; + int off, i, ret; + + off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); + entry_offset = (void *)e - base; + xt_ematch_foreach(ematch, e) + off += xt_compat_match_offset(ematch->u.kernel.match); + t = ip6t_get_target_c(e); + off += xt_compat_target_offset(t->u.kernel.target); + newinfo->size -= off; + ret = xt_compat_add_offset(AF_INET6, entry_offset, off); + if (ret) + return ret; + + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + if (info->hook_entry[i] && + (e < (struct ip6t_entry *)(base + info->hook_entry[i]))) + newinfo->hook_entry[i] -= off; + if (info->underflow[i] && + (e < (struct ip6t_entry *)(base + info->underflow[i]))) + newinfo->underflow[i] -= off; + } + return 0; +} + +static int compat_table_info(const struct xt_table_info *info, + struct xt_table_info *newinfo) +{ + struct ip6t_entry *iter; + void *loc_cpu_entry; + int ret; + + if (!newinfo || !info) + return -EINVAL; + + /* we dont care about newinfo->entries[] */ + memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); + newinfo->initial_entries = 0; + loc_cpu_entry = info->entries[raw_smp_processor_id()]; + xt_compat_init_offsets(AF_INET6, info->number); + xt_entry_foreach(iter, loc_cpu_entry, info->size) { + ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); + if (ret != 0) + return ret; + } + return 0; +} +#endif + +static int get_info(struct net *net, void __user *user, + const int *len, int compat) +{ + char name[XT_TABLE_MAXNAMELEN]; + struct xt_table *t; + int ret; + + if (*len != sizeof(struct ip6t_getinfo)) { + duprintf("length %u != %zu\n", *len, + sizeof(struct ip6t_getinfo)); + return -EINVAL; + } + + if (copy_from_user(name, user, sizeof(name)) != 0) + return -EFAULT; + + name[XT_TABLE_MAXNAMELEN-1] = '\0'; +#ifdef CONFIG_COMPAT + if (compat) + xt_compat_lock(AF_INET6); +#endif + t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name), + "ip6table_%s", name); + if (t && !IS_ERR(t)) { + struct ip6t_getinfo info; + const struct xt_table_info *private = t->private; +#ifdef CONFIG_COMPAT + struct xt_table_info tmp; + + if (compat) { + ret = compat_table_info(private, &tmp); + xt_compat_flush_offsets(AF_INET6); + private = &tmp; + } +#endif + memset(&info, 0, sizeof(info)); + info.valid_hooks = t->valid_hooks; + memcpy(info.hook_entry, private->hook_entry, + sizeof(info.hook_entry)); + memcpy(info.underflow, private->underflow, + sizeof(info.underflow)); + info.num_entries = private->number; + info.size = private->size; + strcpy(info.name, name); + + if (copy_to_user(user, &info, *len) != 0) + ret = -EFAULT; + else + ret = 0; + + xt_table_unlock(t); + module_put(t->me); + } else + ret = t ? 
PTR_ERR(t) : -ENOENT; +#ifdef CONFIG_COMPAT + if (compat) + xt_compat_unlock(AF_INET6); +#endif + return ret; +} + +static int +get_entries(struct net *net, struct ip6t_get_entries __user *uptr, + const int *len) +{ + int ret; + struct ip6t_get_entries get; + struct xt_table *t; + + if (*len < sizeof(get)) { + duprintf("get_entries: %u < %zu\n", *len, sizeof(get)); + return -EINVAL; + } + if (copy_from_user(&get, uptr, sizeof(get)) != 0) + return -EFAULT; + if (*len != sizeof(struct ip6t_get_entries) + get.size) { + duprintf("get_entries: %u != %zu\n", + *len, sizeof(get) + get.size); + return -EINVAL; + } + + t = xt_find_table_lock(net, AF_INET6, get.name); + if (t && !IS_ERR(t)) { + struct xt_table_info *private = t->private; + duprintf("t->private->number = %u\n", private->number); + if (get.size == private->size) + ret = copy_entries_to_user(private->size, + t, uptr->entrytable); + else { + duprintf("get_entries: I've got %u not %u!\n", + private->size, get.size); + ret = -EAGAIN; + } + module_put(t->me); + xt_table_unlock(t); + } else + ret = t ? PTR_ERR(t) : -ENOENT; + + return ret; +} + +static int +__do_replace(struct net *net, const char *name, unsigned int valid_hooks, + struct xt_table_info *newinfo, unsigned int num_counters, + void __user *counters_ptr) +{ + int ret; + struct xt_table *t; + struct xt_table_info *oldinfo; + struct xt_counters *counters; + const void *loc_cpu_old_entry; + struct ip6t_entry *iter; + + ret = 0; + counters = vzalloc(num_counters * sizeof(struct xt_counters)); + if (!counters) { + ret = -ENOMEM; + goto out; + } + + t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name), + "ip6table_%s", name); + if (!t || IS_ERR(t)) { + ret = t ? PTR_ERR(t) : -ENOENT; + goto free_newinfo_counters_untrans; + } + + /* You lied! 
*/ + if (valid_hooks != t->valid_hooks) { + duprintf("Valid hook crap: %08X vs %08X\n", + valid_hooks, t->valid_hooks); + ret = -EINVAL; + goto put_module; + } + + oldinfo = xt_replace_table(t, num_counters, newinfo, &ret); + if (!oldinfo) + goto put_module; + + /* Update module usage count based on number of rules */ + duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", + oldinfo->number, oldinfo->initial_entries, newinfo->number); + if ((oldinfo->number > oldinfo->initial_entries) || + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + if ((oldinfo->number > oldinfo->initial_entries) && + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + + /* Get the old counters, and synchronize with replace */ + get_counters(oldinfo, counters); + + /* Decrease module usage counts and free resource */ + loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) + cleanup_entry(iter, net); + + xt_free_table_info(oldinfo); + if (copy_to_user(counters_ptr, counters, + sizeof(struct xt_counters) * num_counters) != 0) + ret = -EFAULT; + vfree(counters); + xt_table_unlock(t); + return ret; + + put_module: + module_put(t->me); + xt_table_unlock(t); + free_newinfo_counters_untrans: + vfree(counters); + out: + return ret; +} + +static int +do_replace(struct net *net, const void __user *user, unsigned int len) +{ + int ret; + struct ip6t_replace tmp; + struct xt_table_info *newinfo; + void *loc_cpu_entry; + struct ip6t_entry *iter; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + /* overflow check */ + if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) + return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; + + newinfo = xt_alloc_table_info(tmp.size); + if (!newinfo) + return -ENOMEM; + + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), + tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + ret = translate_table(net, newinfo, loc_cpu_entry, &tmp); + if (ret != 0) + goto free_newinfo; + + duprintf("ip_tables: Translated table\n"); + + ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, + tmp.num_counters, tmp.counters); + if (ret) + goto free_newinfo_untrans; + return 0; + + free_newinfo_untrans: + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter, net); + free_newinfo: + xt_free_table_info(newinfo); + return ret; +} + +static int +do_add_counters(struct net *net, const void __user *user, unsigned int len, + int compat) +{ + unsigned int i, curcpu; + struct xt_counters_info tmp; + struct xt_counters *paddc; + unsigned int num_counters; + char *name; + int size; + void *ptmp; + struct xt_table *t; + const struct xt_table_info *private; + int ret = 0; + const void *loc_cpu_entry; + struct ip6t_entry *iter; + unsigned int addend; +#ifdef CONFIG_COMPAT + struct compat_xt_counters_info compat_tmp; + + if (compat) { + ptmp = &compat_tmp; + size = sizeof(struct compat_xt_counters_info); + } else +#endif + { + ptmp = &tmp; + size = sizeof(struct xt_counters_info); + } + + if (copy_from_user(ptmp, user, size) != 0) + return -EFAULT; + +#ifdef CONFIG_COMPAT + if (compat) { + num_counters = compat_tmp.num_counters; + name = compat_tmp.name; + } else +#endif + { + num_counters = tmp.num_counters; + name = tmp.name; + } + + if (len != size + num_counters * sizeof(struct xt_counters)) + return -EINVAL; + + paddc = vmalloc(len - 
size); + if (!paddc) + return -ENOMEM; + + if (copy_from_user(paddc, user + size, len - size) != 0) { + ret = -EFAULT; + goto free; + } + + t = xt_find_table_lock(net, AF_INET6, name); + if (!t || IS_ERR(t)) { + ret = t ? PTR_ERR(t) : -ENOENT; + goto free; + } + + + local_bh_disable(); + private = t->private; + if (private->number != num_counters) { + ret = -EINVAL; + goto unlock_up_free; + } + + i = 0; + /* Choose the copy that is on our node */ + curcpu = smp_processor_id(); + addend = xt_write_recseq_begin(); + loc_cpu_entry = private->entries[curcpu]; + xt_entry_foreach(iter, loc_cpu_entry, private->size) { + ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + ++i; + } + xt_write_recseq_end(addend); + + unlock_up_free: + local_bh_enable(); + xt_table_unlock(t); + module_put(t->me); + free: + vfree(paddc); + + return ret; +} + +#ifdef CONFIG_COMPAT +struct compat_ip6t_replace { + char name[XT_TABLE_MAXNAMELEN]; + u32 valid_hooks; + u32 num_entries; + u32 size; + u32 hook_entry[NF_INET_NUMHOOKS]; + u32 underflow[NF_INET_NUMHOOKS]; + u32 num_counters; + compat_uptr_t counters; /* struct xt_counters * */ + struct compat_ip6t_entry entries[0]; +}; + +static int +compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr, + unsigned int *size, struct xt_counters *counters, + unsigned int i) +{ + struct xt_entry_target *t; + struct compat_ip6t_entry __user *ce; + u_int16_t target_offset, next_offset; + compat_uint_t origsize; + const struct xt_entry_match *ematch; + int ret = 0; + + origsize = *size; + ce = (struct compat_ip6t_entry __user *)*dstptr; + if (copy_to_user(ce, e, sizeof(struct ip6t_entry)) != 0 || + copy_to_user(&ce->counters, &counters[i], + sizeof(counters[i])) != 0) + return -EFAULT; + + *dstptr += sizeof(struct compat_ip6t_entry); + *size -= sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); + + xt_ematch_foreach(ematch, e) { + ret = xt_compat_match_to_user(ematch, dstptr, size); + if (ret != 0) + return ret; + } + target_offset = e->target_offset - (origsize - *size); + t = ip6t_get_target(e); + ret = xt_compat_target_to_user(t, dstptr, size); + if (ret) + return ret; + next_offset = e->next_offset - (origsize - *size); + if (put_user(target_offset, &ce->target_offset) != 0 || + put_user(next_offset, &ce->next_offset) != 0) + return -EFAULT; + return 0; +} + +static int +compat_find_calc_match(struct xt_entry_match *m, + const char *name, + const struct ip6t_ip6 *ipv6, + unsigned int hookmask, + int *size) +{ + struct xt_match *match; + + match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name, + m->u.user.revision); + if (IS_ERR(match)) { + duprintf("compat_check_calc_match: `%s' not found\n", + m->u.user.name); + return PTR_ERR(match); + } + m->u.kernel.match = match; + *size += xt_compat_match_offset(match); + return 0; +} + +static void compat_release_entry(struct compat_ip6t_entry *e) +{ + struct xt_entry_target *t; + struct xt_entry_match *ematch; + + /* Cleanup all matches */ + xt_ematch_foreach(ematch, e) + module_put(ematch->u.kernel.match->me); + t = compat_ip6t_get_target(e); + module_put(t->u.kernel.target->me); +} + +static int +check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, + struct xt_table_info *newinfo, + unsigned int *size, + const unsigned char *base, + const unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, + const char *name) +{ + struct xt_entry_match *ematch; + struct xt_entry_target *t; + struct xt_target *target; + unsigned int entry_offset; + unsigned int j; + 
int ret, off, h; + + duprintf("check_compat_entry_size_and_hooks %p\n", e); + if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 || + (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit) { + duprintf("Bad offset %p, limit = %p\n", e, limit); + return -EINVAL; + } + + if (e->next_offset < sizeof(struct compat_ip6t_entry) + + sizeof(struct compat_xt_entry_target)) { + duprintf("checking: element %p size %u\n", + e, e->next_offset); + return -EINVAL; + } + + /* For purposes of check_entry casting the compat entry is fine */ + ret = check_entry((struct ip6t_entry *)e, name); + if (ret) + return ret; + + off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); + entry_offset = (void *)e - (void *)base; + j = 0; + xt_ematch_foreach(ematch, e) { + ret = compat_find_calc_match(ematch, name, + &e->ipv6, e->comefrom, &off); + if (ret != 0) + goto release_matches; + ++j; + } + + t = compat_ip6t_get_target(e); + target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, + t->u.user.revision); + if (IS_ERR(target)) { + duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", + t->u.user.name); + ret = PTR_ERR(target); + goto release_matches; + } + t->u.kernel.target = target; + + off += xt_compat_target_offset(target); + *size += off; + ret = xt_compat_add_offset(AF_INET6, entry_offset, off); + if (ret) + goto out; + + /* Check hooks & underflows */ + for (h = 0; h < NF_INET_NUMHOOKS; h++) { + if ((unsigned char *)e - base == hook_entries[h]) + newinfo->hook_entry[h] = hook_entries[h]; + if ((unsigned char *)e - base == underflows[h]) + newinfo->underflow[h] = underflows[h]; + } + + /* Clear counters and comefrom */ + memset(&e->counters, 0, sizeof(e->counters)); + e->comefrom = 0; + return 0; + +out: + module_put(t->u.kernel.target->me); +release_matches: + xt_ematch_foreach(ematch, e) { + if (j-- == 0) + break; + module_put(ematch->u.kernel.match->me); + } + return ret; +} + +static int +compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr, + unsigned int *size, const char *name, + struct xt_table_info *newinfo, unsigned char *base) +{ + struct xt_entry_target *t; + struct ip6t_entry *de; + unsigned int origsize; + int ret, h; + struct xt_entry_match *ematch; + + ret = 0; + origsize = *size; + de = (struct ip6t_entry *)*dstptr; + memcpy(de, e, sizeof(struct ip6t_entry)); + memcpy(&de->counters, &e->counters, sizeof(e->counters)); + + *dstptr += sizeof(struct ip6t_entry); + *size += sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); + + xt_ematch_foreach(ematch, e) { + ret = xt_compat_match_from_user(ematch, dstptr, size); + if (ret != 0) + return ret; + } + de->target_offset = e->target_offset - (origsize - *size); + t = compat_ip6t_get_target(e); + xt_compat_target_from_user(t, dstptr, size); + + de->next_offset = e->next_offset - (origsize - *size); + for (h = 0; h < NF_INET_NUMHOOKS; h++) { + if ((unsigned char *)de - base < newinfo->hook_entry[h]) + newinfo->hook_entry[h] -= origsize - *size; + if ((unsigned char *)de - base < newinfo->underflow[h]) + newinfo->underflow[h] -= origsize - *size; + } + return ret; +} + +static int compat_check_entry(struct ip6t_entry *e, struct net *net, + const char *name) +{ + unsigned int j; + int ret = 0; + struct xt_mtchk_param mtpar; + struct xt_entry_match *ematch; + + j = 0; + mtpar.net = net; + mtpar.table = name; + mtpar.entryinfo = &e->ipv6; + mtpar.hook_mask = e->comefrom; + mtpar.family = NFPROTO_IPV6; + xt_ematch_foreach(ematch, e) { + ret = check_match(ematch, &mtpar); + if 
(ret != 0) + goto cleanup_matches; + ++j; + } + + ret = check_target(e, net, name); + if (ret) + goto cleanup_matches; + return 0; + + cleanup_matches: + xt_ematch_foreach(ematch, e) { + if (j-- == 0) + break; + cleanup_match(ematch, net); + } + return ret; +} + +static int +translate_compat_table(struct net *net, + const char *name, + unsigned int valid_hooks, + struct xt_table_info **pinfo, + void **pentry0, + unsigned int total_size, + unsigned int number, + unsigned int *hook_entries, + unsigned int *underflows) +{ + unsigned int i, j; + struct xt_table_info *newinfo, *info; + void *pos, *entry0, *entry1; + struct compat_ip6t_entry *iter0; + struct ip6t_entry *iter1; + unsigned int size; + int ret = 0; + + info = *pinfo; + entry0 = *pentry0; + size = total_size; + info->number = number; + + /* Init all hooks to impossible value. */ + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + info->hook_entry[i] = 0xFFFFFFFF; + info->underflow[i] = 0xFFFFFFFF; + } + + duprintf("translate_compat_table: size %u\n", info->size); + j = 0; + xt_compat_lock(AF_INET6); + xt_compat_init_offsets(AF_INET6, number); + /* Walk through entries, checking offsets. */ + xt_entry_foreach(iter0, entry0, total_size) { + ret = check_compat_entry_size_and_hooks(iter0, info, &size, + entry0, + entry0 + total_size, + hook_entries, + underflows, + name); + if (ret != 0) + goto out_unlock; + ++j; + } + + ret = -EINVAL; + if (j != number) { + duprintf("translate_compat_table: %u not %u entries\n", + j, number); + goto out_unlock; + } + + /* Check hooks all assigned */ + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + /* Only hooks which are valid */ + if (!(valid_hooks & (1 << i))) + continue; + if (info->hook_entry[i] == 0xFFFFFFFF) { + duprintf("Invalid hook entry %u %u\n", + i, hook_entries[i]); + goto out_unlock; + } + if (info->underflow[i] == 0xFFFFFFFF) { + duprintf("Invalid underflow %u %u\n", + i, underflows[i]); + goto out_unlock; + } + } + + ret = -ENOMEM; + newinfo = xt_alloc_table_info(size); + if (!newinfo) + goto out_unlock; + + newinfo->number = number; + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + newinfo->hook_entry[i] = info->hook_entry[i]; + newinfo->underflow[i] = info->underflow[i]; + } + entry1 = newinfo->entries[raw_smp_processor_id()]; + pos = entry1; + size = total_size; + xt_entry_foreach(iter0, entry0, total_size) { + ret = compat_copy_entry_from_user(iter0, &pos, &size, + name, newinfo, entry1); + if (ret != 0) + break; + } + xt_compat_flush_offsets(AF_INET6); + xt_compat_unlock(AF_INET6); + if (ret) + goto free_newinfo; + + ret = -ELOOP; + if (!mark_source_chains(newinfo, valid_hooks, entry1)) + goto free_newinfo; + + i = 0; + xt_entry_foreach(iter1, entry1, newinfo->size) { + ret = compat_check_entry(iter1, net, name); + if (ret != 0) + break; + ++i; + if (strcmp(ip6t_get_target(iter1)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; + } + if (ret) { + /* + * The first i matches need cleanup_entry (calls ->destroy) + * because they had called ->check already. The other j-i + * entries need only release. 
+ */ + int skip = i; + j -= i; + xt_entry_foreach(iter0, entry0, newinfo->size) { + if (skip-- > 0) + continue; + if (j-- == 0) + break; + compat_release_entry(iter0); + } + xt_entry_foreach(iter1, entry1, newinfo->size) { + if (i-- == 0) + break; + cleanup_entry(iter1, net); + } + xt_free_table_info(newinfo); + return ret; + } + + /* And one copy for every other CPU */ + for_each_possible_cpu(i) + if (newinfo->entries[i] && newinfo->entries[i] != entry1) + memcpy(newinfo->entries[i], entry1, newinfo->size); + + *pinfo = newinfo; + *pentry0 = entry1; + xt_free_table_info(info); + return 0; + +free_newinfo: + xt_free_table_info(newinfo); +out: + xt_entry_foreach(iter0, entry0, total_size) { + if (j-- == 0) + break; + compat_release_entry(iter0); + } + return ret; +out_unlock: + xt_compat_flush_offsets(AF_INET6); + xt_compat_unlock(AF_INET6); + goto out; +} + +static int +compat_do_replace(struct net *net, void __user *user, unsigned int len) +{ + int ret; + struct compat_ip6t_replace tmp; + struct xt_table_info *newinfo; + void *loc_cpu_entry; + struct ip6t_entry *iter; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + /* overflow check */ + if (tmp.size >= INT_MAX / num_possible_cpus()) + return -ENOMEM; + if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) + return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; + + newinfo = xt_alloc_table_info(tmp.size); + if (!newinfo) + return -ENOMEM; + + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), + tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + ret = translate_compat_table(net, tmp.name, tmp.valid_hooks, + &newinfo, &loc_cpu_entry, tmp.size, + tmp.num_entries, tmp.hook_entry, + tmp.underflow); + if (ret != 0) + goto free_newinfo; + + duprintf("compat_do_replace: Translated table\n"); + + ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, + tmp.num_counters, compat_ptr(tmp.counters)); + if (ret) + goto free_newinfo_untrans; + return 0; + + free_newinfo_untrans: + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter, net); + free_newinfo: + xt_free_table_info(newinfo); + return ret; +} + +static int +compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, + unsigned int len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IP6T_SO_SET_REPLACE: + ret = compat_do_replace(sock_net(sk), user, len); + break; + + case IP6T_SO_SET_ADD_COUNTERS: + ret = do_add_counters(sock_net(sk), user, len, 1); + break; + + default: + duprintf("do_ip6t_set_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +struct compat_ip6t_get_entries { + char name[XT_TABLE_MAXNAMELEN]; + compat_uint_t size; + struct compat_ip6t_entry entrytable[0]; +}; + +static int +compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, + void __user *userptr) +{ + struct xt_counters *counters; + const struct xt_table_info *private = table->private; + void __user *pos; + unsigned int size; + int ret = 0; + const void *loc_cpu_entry; + unsigned int i = 0; + struct ip6t_entry *iter; + + counters = alloc_counters(table); + if (IS_ERR(counters)) + return PTR_ERR(counters); + + /* choose the copy that is on our node/cpu, ... 
+ * This choice is lazy (because current thread is + * allowed to migrate to another cpu) + */ + loc_cpu_entry = private->entries[raw_smp_processor_id()]; + pos = userptr; + size = total_size; + xt_entry_foreach(iter, loc_cpu_entry, total_size) { + ret = compat_copy_entry_to_user(iter, &pos, + &size, counters, i++); + if (ret != 0) + break; + } + + vfree(counters); + return ret; +} + +static int +compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, + int *len) +{ + int ret; + struct compat_ip6t_get_entries get; + struct xt_table *t; + + if (*len < sizeof(get)) { + duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get)); + return -EINVAL; + } + + if (copy_from_user(&get, uptr, sizeof(get)) != 0) + return -EFAULT; + + if (*len != sizeof(struct compat_ip6t_get_entries) + get.size) { + duprintf("compat_get_entries: %u != %zu\n", + *len, sizeof(get) + get.size); + return -EINVAL; + } + + xt_compat_lock(AF_INET6); + t = xt_find_table_lock(net, AF_INET6, get.name); + if (t && !IS_ERR(t)) { + const struct xt_table_info *private = t->private; + struct xt_table_info info; + duprintf("t->private->number = %u\n", private->number); + ret = compat_table_info(private, &info); + if (!ret && get.size == info.size) { + ret = compat_copy_entries_to_user(private->size, + t, uptr->entrytable); + } else if (!ret) { + duprintf("compat_get_entries: I've got %u not %u!\n", + private->size, get.size); + ret = -EAGAIN; + } + xt_compat_flush_offsets(AF_INET6); + module_put(t->me); + xt_table_unlock(t); + } else + ret = t ? PTR_ERR(t) : -ENOENT; + + xt_compat_unlock(AF_INET6); + return ret; +} + +static int do_ip6t_get_ctl(struct sock *, int, void __user *, int *); + +static int +compat_do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IP6T_SO_GET_INFO: + ret = get_info(sock_net(sk), user, len, 1); + break; + case IP6T_SO_GET_ENTRIES: + ret = compat_get_entries(sock_net(sk), user, len); + break; + default: + ret = do_ip6t_get_ctl(sk, cmd, user, len); + } + return ret; +} +#endif + +static int +do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IP6T_SO_SET_REPLACE: + ret = do_replace(sock_net(sk), user, len); + break; + + case IP6T_SO_SET_ADD_COUNTERS: + ret = do_add_counters(sock_net(sk), user, len, 0); + break; + + default: + duprintf("do_ip6t_set_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +static int +do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IP6T_SO_GET_INFO: + ret = get_info(sock_net(sk), user, len, 0); + break; + + case IP6T_SO_GET_ENTRIES: + ret = get_entries(sock_net(sk), user, len); + break; + + case IP6T_SO_GET_REVISION_MATCH: + case IP6T_SO_GET_REVISION_TARGET: { + struct xt_get_revision rev; + int target; + + if (*len != sizeof(rev)) { + ret = -EINVAL; + break; + } + if (copy_from_user(&rev, user, sizeof(rev)) != 0) { + ret = -EFAULT; + break; + } + rev.name[sizeof(rev.name)-1] = 0; + + if (cmd == IP6T_SO_GET_REVISION_TARGET) + target = 1; + else + target = 0; + + try_then_request_module(xt_find_revision(AF_INET6, rev.name, + rev.revision, + target, &ret), + "ip6t_%s", rev.name); + break; + } + + default: + duprintf("do_ip6t_get_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; 
+} + +struct xt_table *ip6t_register_table(struct net *net, + const struct xt_table *table, + const struct ip6t_replace *repl) +{ + int ret; + struct xt_table_info *newinfo; + struct xt_table_info bootstrap = {0}; + void *loc_cpu_entry; + struct xt_table *new_table; + + newinfo = xt_alloc_table_info(repl->size); + if (!newinfo) { + ret = -ENOMEM; + goto out; + } + + /* choose the copy on our node/cpu, but dont care about preemption */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + memcpy(loc_cpu_entry, repl->entries, repl->size); + + ret = translate_table(net, newinfo, loc_cpu_entry, repl); + if (ret != 0) + goto out_free; + + new_table = xt_register_table(net, table, &bootstrap, newinfo); + if (IS_ERR(new_table)) { + ret = PTR_ERR(new_table); + goto out_free; + } + return new_table; + +out_free: + xt_free_table_info(newinfo); +out: + return ERR_PTR(ret); +} + +void ip6t_unregister_table(struct net *net, struct xt_table *table) +{ + struct xt_table_info *private; + void *loc_cpu_entry; + struct module *table_owner = table->me; + struct ip6t_entry *iter; + + private = xt_unregister_table(table); + + /* Decrease module usage counts and free resources */ + loc_cpu_entry = private->entries[raw_smp_processor_id()]; + xt_entry_foreach(iter, loc_cpu_entry, private->size) + cleanup_entry(iter, net); + if (private->number > private->initial_entries) + module_put(table_owner); + xt_free_table_info(private); +} + +/* Returns 1 if the type and code is matched by the range, 0 otherwise */ +static inline bool +icmp6_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, + u_int8_t type, u_int8_t code, + bool invert) +{ + return (type == test_type && code >= min_code && code <= max_code) + ^ invert; +} + +static bool +icmp6_match(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct icmp6hdr *ic; + struct icmp6hdr _icmph; + const struct ip6t_icmp *icmpinfo = par->matchinfo; + + /* Must not be a fragment. */ + if (par->fragoff != 0) + return false; + + ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph); + if (ic == NULL) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. + */ + duprintf("Dropping evil ICMP tinygram.\n"); + par->hotdrop = true; + return false; + } + + return icmp6_type_code_match(icmpinfo->type, + icmpinfo->code[0], + icmpinfo->code[1], + ic->icmp6_type, ic->icmp6_code, + !!(icmpinfo->invflags&IP6T_ICMP_INV)); +} + +/* Called when user tries to insert an entry of this type. */ +static int icmp6_checkentry(const struct xt_mtchk_param *par) +{ + const struct ip6t_icmp *icmpinfo = par->matchinfo; + + /* Must specify no unknown invflags */ + return (icmpinfo->invflags & ~IP6T_ICMP_INV) ? -EINVAL : 0; +} + +/* The built-in targets: standard (NULL) and error. 
*/ +static struct xt_target ip6t_builtin_tg[] __read_mostly = { + { + .name = XT_STANDARD_TARGET, + .targetsize = sizeof(int), + .family = NFPROTO_IPV6, +#ifdef CONFIG_COMPAT + .compatsize = sizeof(compat_int_t), + .compat_from_user = compat_standard_from_user, + .compat_to_user = compat_standard_to_user, +#endif + }, + { + .name = XT_ERROR_TARGET, + .target = ip6t_error, + .targetsize = XT_FUNCTION_MAXNAMELEN, + .family = NFPROTO_IPV6, + }, +}; + +static struct nf_sockopt_ops ip6t_sockopts = { + .pf = PF_INET6, + .set_optmin = IP6T_BASE_CTL, + .set_optmax = IP6T_SO_SET_MAX+1, + .set = do_ip6t_set_ctl, +#ifdef CONFIG_COMPAT + .compat_set = compat_do_ip6t_set_ctl, +#endif + .get_optmin = IP6T_BASE_CTL, + .get_optmax = IP6T_SO_GET_MAX+1, + .get = do_ip6t_get_ctl, +#ifdef CONFIG_COMPAT + .compat_get = compat_do_ip6t_get_ctl, +#endif + .owner = THIS_MODULE, +}; + +static struct xt_match ip6t_builtin_mt[] __read_mostly = { + { + .name = "icmp6", + .match = icmp6_match, + .matchsize = sizeof(struct ip6t_icmp), + .checkentry = icmp6_checkentry, + .proto = IPPROTO_ICMPV6, + .family = NFPROTO_IPV6, + }, +}; + +static int __net_init ip6_tables_net_init(struct net *net) +{ + return xt_proto_init(net, NFPROTO_IPV6); +} + +static void __net_exit ip6_tables_net_exit(struct net *net) +{ + xt_proto_fini(net, NFPROTO_IPV6); +} + +static struct pernet_operations ip6_tables_net_ops = { + .init = ip6_tables_net_init, + .exit = ip6_tables_net_exit, +}; + +static int __init ip6_tables_init(void) +{ + int ret; + + ret = register_pernet_subsys(&ip6_tables_net_ops); + if (ret < 0) + goto err1; + + /* No one else will be downing sem now, so we won't sleep */ + ret = xt_register_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); + if (ret < 0) + goto err2; + ret = xt_register_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt)); + if (ret < 0) + goto err4; + + /* Register setsockopt */ + ret = nf_register_sockopt(&ip6t_sockopts); + if (ret < 0) + goto err5; + + pr_info("(C) 2000-2006 Netfilter Core Team\n"); + return 0; + +err5: + xt_unregister_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt)); +err4: + xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); +err2: + unregister_pernet_subsys(&ip6_tables_net_ops); +err1: + return ret; +} + +static void __exit ip6_tables_fini(void) +{ + nf_unregister_sockopt(&ip6t_sockopts); + + xt_unregister_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt)); + xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); + unregister_pernet_subsys(&ip6_tables_net_ops); +} + +/* + * find the offset to specified header or the protocol number of last header + * if target < 0. "last header" is transport protocol header, ESP, or + * "No next header". + * + * If target header is found, its offset is set in *offset and return protocol + * number. Otherwise, return -ENOENT or -EBADMSG. + * + * If the first fragment doesn't contain the final protocol header or + * NEXTHDR_NONE it is considered invalid. + * + * Note that non-1st fragment is special case that "the protocol number + * of last header" is "next header" field in Fragment header. In this case, + * *offset is meaningless. If fragoff is not NULL, the fragment offset is + * stored in *fragoff; if it is NULL, return -EINVAL. 
+ */ +int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, + int target, unsigned short *fragoff) +{ + unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + unsigned int len = skb->len - start; + + if (fragoff) + *fragoff = 0; + + while (nexthdr != target) { + struct ipv6_opt_hdr _hdr, *hp; + unsigned int hdrlen; + + if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) { + if (target < 0) + break; + return -ENOENT; + } + + hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); + if (hp == NULL) + return -EBADMSG; + if (nexthdr == NEXTHDR_FRAGMENT) { + unsigned short _frag_off; + __be16 *fp; + fp = skb_header_pointer(skb, + start+offsetof(struct frag_hdr, + frag_off), + sizeof(_frag_off), + &_frag_off); + if (fp == NULL) + return -EBADMSG; + + _frag_off = ntohs(*fp) & ~0x7; + if (_frag_off) { + if (target < 0 && + ((!ipv6_ext_hdr(hp->nexthdr)) || + hp->nexthdr == NEXTHDR_NONE)) { + if (fragoff) { + *fragoff = _frag_off; + return hp->nexthdr; + } else { + return -EINVAL; + } + } + return -ENOENT; + } + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hp->hdrlen + 2) << 2; + else + hdrlen = ipv6_optlen(hp); + + nexthdr = hp->nexthdr; + len -= hdrlen; + start += hdrlen; + } + + *offset = start; + return nexthdr; +} + +EXPORT_SYMBOL(ip6t_register_table); +EXPORT_SYMBOL(ip6t_unregister_table); +EXPORT_SYMBOL(ip6t_do_table); +EXPORT_SYMBOL(ip6t_ext_hdr); +EXPORT_SYMBOL(ipv6_find_hdr); + +module_init(ip6_tables_init); +module_exit(ip6_tables_fini); diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c new file mode 100644 index 00000000..e6af8d72 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -0,0 +1,527 @@ +/* + * This is a module which is used for logging packets. + */ + +/* (C) 2001 Jan Rekorajski + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Jan Rekorajski "); +MODULE_DESCRIPTION("Xtables: IPv6 packet logging to syslog"); +MODULE_LICENSE("GPL"); + +struct in_device; +#include +#include + +/* One level of recursion won't kill us */ +static void dump_packet(struct sbuff *m, + const struct nf_loginfo *info, + const struct sk_buff *skb, unsigned int ip6hoff, + int recurse) +{ + u_int8_t currenthdr; + int fragment; + struct ipv6hdr _ip6h; + const struct ipv6hdr *ih; + unsigned int ptr; + unsigned int hdrlen = 0; + unsigned int logflags; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_MASK; + + ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); + if (ih == NULL) { + sb_add(m, "TRUNCATED"); + return; + } + + /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */ + sb_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); + + /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ + sb_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", + ntohs(ih->payload_len) + sizeof(struct ipv6hdr), + (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, + ih->hop_limit, + (ntohl(*(__be32 *)ih) & 0x000fffff)); + + fragment = 0; + ptr = ip6hoff + sizeof(struct ipv6hdr); + currenthdr = ih->nexthdr; + while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) { + struct ipv6_opt_hdr _hdr; + const struct ipv6_opt_hdr *hp; + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + if (hp == NULL) { + sb_add(m, "TRUNCATED"); + return; + } + + /* Max length: 48 "OPT (...) " */ + if (logflags & IP6T_LOG_IPOPT) + sb_add(m, "OPT ( "); + + switch (currenthdr) { + case IPPROTO_FRAGMENT: { + struct frag_hdr _fhdr; + const struct frag_hdr *fh; + + sb_add(m, "FRAG:"); + fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), + &_fhdr); + if (fh == NULL) { + sb_add(m, "TRUNCATED "); + return; + } + + /* Max length: 6 "65535 " */ + sb_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8); + + /* Max length: 11 "INCOMPLETE " */ + if (fh->frag_off & htons(0x0001)) + sb_add(m, "INCOMPLETE "); + + sb_add(m, "ID:%08x ", ntohl(fh->identification)); + + if (ntohs(fh->frag_off) & 0xFFF8) + fragment = 1; + + hdrlen = 8; + + break; + } + case IPPROTO_DSTOPTS: + case IPPROTO_ROUTING: + case IPPROTO_HOPOPTS: + if (fragment) { + if (logflags & IP6T_LOG_IPOPT) + sb_add(m, ")"); + return; + } + hdrlen = ipv6_optlen(hp); + break; + /* Max Length */ + case IPPROTO_AH: + if (logflags & IP6T_LOG_IPOPT) { + struct ip_auth_hdr _ahdr; + const struct ip_auth_hdr *ah; + + /* Max length: 3 "AH " */ + sb_add(m, "AH "); + + if (fragment) { + sb_add(m, ")"); + return; + } + + ah = skb_header_pointer(skb, ptr, sizeof(_ahdr), + &_ahdr); + if (ah == NULL) { + /* + * Max length: 26 "INCOMPLETE [65535 + * bytes] )" + */ + sb_add(m, "INCOMPLETE [%u bytes] )", + skb->len - ptr); + return; + } + + /* Length: 15 "SPI=0xF1234567 */ + sb_add(m, "SPI=0x%x ", ntohl(ah->spi)); + + } + + hdrlen = (hp->hdrlen+2)<<2; + break; + case IPPROTO_ESP: + if (logflags & IP6T_LOG_IPOPT) { + struct ip_esp_hdr _esph; + const struct ip_esp_hdr *eh; + + /* Max length: 4 "ESP " */ + sb_add(m, "ESP "); + + if (fragment) { + sb_add(m, ")"); + return; + } + + /* + * Max length: 26 "INCOMPLETE [65535 bytes] )" + */ + eh = skb_header_pointer(skb, ptr, sizeof(_esph), + &_esph); + if (eh == NULL) { + sb_add(m, 
"INCOMPLETE [%u bytes] )", + skb->len - ptr); + return; + } + + /* Length: 16 "SPI=0xF1234567 )" */ + sb_add(m, "SPI=0x%x )", ntohl(eh->spi) ); + + } + return; + default: + /* Max length: 20 "Unknown Ext Hdr 255" */ + sb_add(m, "Unknown Ext Hdr %u", currenthdr); + return; + } + if (logflags & IP6T_LOG_IPOPT) + sb_add(m, ") "); + + currenthdr = hp->nexthdr; + ptr += hdrlen; + } + + switch (currenthdr) { + case IPPROTO_TCP: { + struct tcphdr _tcph; + const struct tcphdr *th; + + /* Max length: 10 "PROTO=TCP " */ + sb_add(m, "PROTO=TCP "); + + if (fragment) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + th = skb_header_pointer(skb, ptr, sizeof(_tcph), &_tcph); + if (th == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr); + return; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + sb_add(m, "SPT=%u DPT=%u ", + ntohs(th->source), ntohs(th->dest)); + /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ + if (logflags & IP6T_LOG_TCPSEQ) + sb_add(m, "SEQ=%u ACK=%u ", + ntohl(th->seq), ntohl(th->ack_seq)); + /* Max length: 13 "WINDOW=65535 " */ + sb_add(m, "WINDOW=%u ", ntohs(th->window)); + /* Max length: 9 "RES=0x3C " */ + sb_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); + /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ + if (th->cwr) + sb_add(m, "CWR "); + if (th->ece) + sb_add(m, "ECE "); + if (th->urg) + sb_add(m, "URG "); + if (th->ack) + sb_add(m, "ACK "); + if (th->psh) + sb_add(m, "PSH "); + if (th->rst) + sb_add(m, "RST "); + if (th->syn) + sb_add(m, "SYN "); + if (th->fin) + sb_add(m, "FIN "); + /* Max length: 11 "URGP=65535 " */ + sb_add(m, "URGP=%u ", ntohs(th->urg_ptr)); + + if ((logflags & IP6T_LOG_TCPOPT) && + th->doff * 4 > sizeof(struct tcphdr)) { + u_int8_t _opt[60 - sizeof(struct tcphdr)]; + const u_int8_t *op; + unsigned int i; + unsigned int optsize = th->doff * 4 + - sizeof(struct tcphdr); + + op = skb_header_pointer(skb, + ptr + sizeof(struct tcphdr), + optsize, _opt); + if (op == NULL) { + sb_add(m, "OPT (TRUNCATED)"); + return; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + sb_add(m, "OPT ("); + for (i =0; i < optsize; i++) + sb_add(m, "%02X", op[i]); + sb_add(m, ") "); + } + break; + } + case IPPROTO_UDP: + case IPPROTO_UDPLITE: { + struct udphdr _udph; + const struct udphdr *uh; + + if (currenthdr == IPPROTO_UDP) + /* Max length: 10 "PROTO=UDP " */ + sb_add(m, "PROTO=UDP " ); + else /* Max length: 14 "PROTO=UDPLITE " */ + sb_add(m, "PROTO=UDPLITE "); + + if (fragment) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + uh = skb_header_pointer(skb, ptr, sizeof(_udph), &_udph); + if (uh == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr); + return; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + sb_add(m, "SPT=%u DPT=%u LEN=%u ", + ntohs(uh->source), ntohs(uh->dest), + ntohs(uh->len)); + break; + } + case IPPROTO_ICMPV6: { + struct icmp6hdr _icmp6h; + const struct icmp6hdr *ic; + + /* Max length: 13 "PROTO=ICMPv6 " */ + sb_add(m, "PROTO=ICMPv6 "); + + if (fragment) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h); + if (ic == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr); + return; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ + sb_add(m, "TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code); + + switch (ic->icmp6_type) { + case ICMPV6_ECHO_REQUEST: + case ICMPV6_ECHO_REPLY: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + sb_add(m, "ID=%u SEQ=%u ", + 
ntohs(ic->icmp6_identifier), + ntohs(ic->icmp6_sequence)); + break; + case ICMPV6_MGM_QUERY: + case ICMPV6_MGM_REPORT: + case ICMPV6_MGM_REDUCTION: + break; + + case ICMPV6_PARAMPROB: + /* Max length: 17 "POINTER=ffffffff " */ + sb_add(m, "POINTER=%08x ", ntohl(ic->icmp6_pointer)); + /* Fall through */ + case ICMPV6_DEST_UNREACH: + case ICMPV6_PKT_TOOBIG: + case ICMPV6_TIME_EXCEED: + /* Max length: 3+maxlen */ + if (recurse) { + sb_add(m, "["); + dump_packet(m, info, skb, + ptr + sizeof(_icmp6h), 0); + sb_add(m, "] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) + sb_add(m, "MTU=%u ", ntohl(ic->icmp6_mtu)); + } + break; + } + /* Max length: 10 "PROTO=255 " */ + default: + sb_add(m, "PROTO=%u ", currenthdr); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) { + read_lock_bh(&skb->sk->sk_callback_lock); + if (skb->sk->sk_socket && skb->sk->sk_socket->file) + sb_add(m, "UID=%u GID=%u ", + skb->sk->sk_socket->file->f_cred->fsuid, + skb->sk->sk_socket->file->f_cred->fsgid); + read_unlock_bh(&skb->sk->sk_callback_lock); + } + + /* Max length: 16 "MARK=0xFFFFFFFF " */ + if (!recurse && skb->mark) + sb_add(m, "MARK=0x%x ", skb->mark); +} + +static void dump_mac_header(struct sbuff *m, + const struct nf_loginfo *info, + const struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + unsigned int logflags = 0; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + + if (!(logflags & IP6T_LOG_MACDECODE)) + goto fallback; + + switch (dev->type) { + case ARPHRD_ETHER: + sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + ntohs(eth_hdr(skb)->h_proto)); + return; + default: + break; + } + +fallback: + sb_add(m, "MAC="); + if (dev->hard_header_len && + skb->mac_header != skb->network_header) { + const unsigned char *p = skb_mac_header(skb); + unsigned int len = dev->hard_header_len; + unsigned int i; + + if (dev->type == ARPHRD_SIT && + (p -= ETH_HLEN) < skb->head) + p = NULL; + + if (p != NULL) { + sb_add(m, "%02x", *p++); + for (i = 1; i < len; i++) + sb_add(m, ":%02x", *p++); + } + sb_add(m, " "); + + if (dev->type == ARPHRD_SIT) { + const struct iphdr *iph = + (struct iphdr *)skb_mac_header(skb); + sb_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr, &iph->daddr); + } + } else + sb_add(m, " "); +} + +static struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 5, + .logflags = NF_LOG_MASK, + }, + }, +}; + +static void +ip6t_log_packet(u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + struct sbuff *m = sb_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, + prefix, + in ? in->name : "", + out ? 
out->name : ""); + + if (in != NULL) + dump_mac_header(m, loginfo, skb); + + dump_packet(m, loginfo, skb, skb_network_offset(skb), 1); + + sb_close(m); +} + +static unsigned int +log_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct ip6t_log_info *loginfo = par->targinfo; + struct nf_loginfo li; + + li.type = NF_LOG_TYPE_LOG; + li.u.log.level = loginfo->level; + li.u.log.logflags = loginfo->logflags; + + ip6t_log_packet(NFPROTO_IPV6, par->hooknum, skb, par->in, par->out, + &li, loginfo->prefix); + return XT_CONTINUE; +} + + +static int log_tg6_check(const struct xt_tgchk_param *par) +{ + const struct ip6t_log_info *loginfo = par->targinfo; + + if (loginfo->level >= 8) { + pr_debug("level %u >= 8\n", loginfo->level); + return -EINVAL; + } + if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') { + pr_debug("prefix not null-terminated\n"); + return -EINVAL; + } + return 0; +} + +static struct xt_target log_tg6_reg __read_mostly = { + .name = "LOG", + .family = NFPROTO_IPV6, + .target = log_tg6, + .targetsize = sizeof(struct ip6t_log_info), + .checkentry = log_tg6_check, + .me = THIS_MODULE, +}; + +static struct nf_logger ip6t_logger __read_mostly = { + .name = "ip6t_LOG", + .logfn = &ip6t_log_packet, + .me = THIS_MODULE, +}; + +static int __init log_tg6_init(void) +{ + int ret; + + ret = xt_register_target(&log_tg6_reg); + if (ret < 0) + return ret; + nf_log_register(NFPROTO_IPV6, &ip6t_logger); + return 0; +} + +static void __exit log_tg6_exit(void) +{ + nf_log_unregister(&ip6t_logger); + xt_unregister_target(&log_tg6_reg); +} + +module_init(log_tg6_init); +module_exit(log_tg6_exit); diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c new file mode 100644 index 00000000..09d30498 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -0,0 +1,271 @@ +/* + * IP6 tables REJECT target module + * Linux INET6 implementation + * + * Copyright (C)2003 USAGI/WIDE Project + * + * Authors: + * Yasuyuki Kozakai + * + * Based on net/ipv4/netfilter/ipt_REJECT.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Yasuyuki KOZAKAI "); +MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv6"); +MODULE_LICENSE("GPL"); + +/* Send RST reply */ +static void send_reset(struct net *net, struct sk_buff *oldskb) +{ + struct sk_buff *nskb; + struct tcphdr otcph, *tcph; + unsigned int otcplen, hh_len; + int tcphoff, needs_ack; + const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); + struct ipv6hdr *ip6h; +#define DEFAULT_TOS_VALUE 0x0U + const __u8 tclass = DEFAULT_TOS_VALUE; + struct dst_entry *dst = NULL; + u8 proto; + struct flowi6 fl6; + + if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) || + (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) { + pr_debug("addr is not unicast.\n"); + return; + } + + proto = oip6h->nexthdr; + tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto); + + if ((tcphoff < 0) || (tcphoff > oldskb->len)) { + pr_debug("Cannot get TCP header.\n"); + return; + } + + otcplen = oldskb->len - tcphoff; + + /* IP header checks: fragment, too short. 
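+ * Only a non-fragmented TCP segment with a complete TCP header is
+ * answered with a RST; anything else returns here and no reset is sent.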
*/ + if (proto != IPPROTO_TCP || otcplen < sizeof(struct tcphdr)) { + pr_debug("proto(%d) != IPPROTO_TCP, " + "or too short. otcplen = %d\n", + proto, otcplen); + return; + } + + if (skb_copy_bits(oldskb, tcphoff, &otcph, sizeof(struct tcphdr))) + BUG(); + + /* No RST for RST. */ + if (otcph.rst) { + pr_debug("RST is set\n"); + return; + } + + /* Check checksum. */ + if (csum_ipv6_magic(&oip6h->saddr, &oip6h->daddr, otcplen, IPPROTO_TCP, + skb_checksum(oldskb, tcphoff, otcplen, 0))) { + pr_debug("TCP checksum is invalid\n"); + return; + } + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + ipv6_addr_copy(&fl6.saddr, &oip6h->daddr); + ipv6_addr_copy(&fl6.daddr, &oip6h->saddr); + fl6.fl6_sport = otcph.dest; + fl6.fl6_dport = otcph.source; + security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6)); + dst = ip6_route_output(net, NULL, &fl6); + if (dst == NULL || dst->error) { + dst_release(dst); + return; + } + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) + return; + + hh_len = (dst->dev->hard_header_len + 15)&~15; + nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr) + + sizeof(struct tcphdr) + dst->trailer_len, + GFP_ATOMIC); + + if (!nskb) { + if (net_ratelimit()) + pr_debug("cannot alloc skb\n"); + dst_release(dst); + return; + } + + skb_dst_set(nskb, dst); + + skb_reserve(nskb, hh_len + dst->header_len); + + skb_put(nskb, sizeof(struct ipv6hdr)); + skb_reset_network_header(nskb); + ip6h = ipv6_hdr(nskb); + *(__be32 *)ip6h = htonl(0x60000000 | (tclass << 20)); + ip6h->hop_limit = ip6_dst_hoplimit(dst); + ip6h->nexthdr = IPPROTO_TCP; + ipv6_addr_copy(&ip6h->saddr, &oip6h->daddr); + ipv6_addr_copy(&ip6h->daddr, &oip6h->saddr); + + tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); + /* Truncate to length (no data) */ + tcph->doff = sizeof(struct tcphdr)/4; + tcph->source = otcph.dest; + tcph->dest = otcph.source; + + if (otcph.ack) { + needs_ack = 0; + tcph->seq = otcph.ack_seq; + tcph->ack_seq = 0; + } else { + needs_ack = 1; + tcph->ack_seq = htonl(ntohl(otcph.seq) + otcph.syn + otcph.fin + + otcplen - (otcph.doff<<2)); + tcph->seq = 0; + } + + /* Reset flags */ + ((u_int8_t *)tcph)[13] = 0; + tcph->rst = 1; + tcph->ack = needs_ack; + tcph->window = 0; + tcph->urg_ptr = 0; + tcph->check = 0; + + /* Adjust TCP checksum */ + tcph->check = csum_ipv6_magic(&ipv6_hdr(nskb)->saddr, + &ipv6_hdr(nskb)->daddr, + sizeof(struct tcphdr), IPPROTO_TCP, + csum_partial(tcph, + sizeof(struct tcphdr), 0)); + + nf_ct_attach(nskb, oldskb); + + ip6_local_out(nskb); +} + +static inline void +send_unreach(struct net *net, struct sk_buff *skb_in, unsigned char code, + unsigned int hooknum) +{ + if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL) + skb_in->dev = net->loopback_dev; + + icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0); +#ifdef CONFIG_IP6_NF_TARGET_REJECT_SKERR + if (skb_in->sk) { + icmpv6_err_convert(ICMPV6_DEST_UNREACH, code, + &skb_in->sk->sk_err); + skb_in->sk->sk_error_report(skb_in->sk); + pr_debug("ip6t_REJECT: sk_err=%d for skb=%p sk=%p\n", + skb_in->sk->sk_err, skb_in, skb_in->sk); + } +#endif +} + +static unsigned int +reject_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct ip6t_reject_info *reject = par->targinfo; + struct net *net = dev_net((par->in != NULL) ? 
par->in : par->out); + + pr_debug("%s: medium point\n", __func__); + switch (reject->with) { + case IP6T_ICMP6_NO_ROUTE: + send_unreach(net, skb, ICMPV6_NOROUTE, par->hooknum); + break; + case IP6T_ICMP6_ADM_PROHIBITED: + send_unreach(net, skb, ICMPV6_ADM_PROHIBITED, par->hooknum); + break; + case IP6T_ICMP6_NOT_NEIGHBOUR: + send_unreach(net, skb, ICMPV6_NOT_NEIGHBOUR, par->hooknum); + break; + case IP6T_ICMP6_ADDR_UNREACH: + send_unreach(net, skb, ICMPV6_ADDR_UNREACH, par->hooknum); + break; + case IP6T_ICMP6_PORT_UNREACH: + send_unreach(net, skb, ICMPV6_PORT_UNREACH, par->hooknum); + break; + case IP6T_ICMP6_ECHOREPLY: + /* Do nothing */ + break; + case IP6T_TCP_RESET: + send_reset(net, skb); + break; + default: + if (net_ratelimit()) + pr_info("case %u not handled yet\n", reject->with); + break; + } + + return NF_DROP; +} + +static int reject_tg6_check(const struct xt_tgchk_param *par) +{ + const struct ip6t_reject_info *rejinfo = par->targinfo; + const struct ip6t_entry *e = par->entryinfo; + + if (rejinfo->with == IP6T_ICMP6_ECHOREPLY) { + pr_info("ECHOREPLY is not supported.\n"); + return -EINVAL; + } else if (rejinfo->with == IP6T_TCP_RESET) { + /* Must specify that it's a TCP packet */ + if (e->ipv6.proto != IPPROTO_TCP || + (e->ipv6.invflags & XT_INV_PROTO)) { + pr_info("TCP_RESET illegal for non-tcp\n"); + return -EINVAL; + } + } + return 0; +} + +static struct xt_target reject_tg6_reg __read_mostly = { + .name = "REJECT", + .family = NFPROTO_IPV6, + .target = reject_tg6, + .targetsize = sizeof(struct ip6t_reject_info), + .table = "filter", + .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT), + .checkentry = reject_tg6_check, + .me = THIS_MODULE +}; + +static int __init reject_tg6_init(void) +{ + return xt_register_target(&reject_tg6_reg); +} + +static void __exit reject_tg6_exit(void) +{ + xt_unregister_target(&reject_tg6_reg); +} + +module_init(reject_tg6_init); +module_exit(reject_tg6_exit); diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c new file mode 100644 index 00000000..89cccc5a --- /dev/null +++ b/net/ipv6/netfilter/ip6t_ah.c @@ -0,0 +1,121 @@ +/* Kernel module to match AH parameters. */ + +/* (C) 2001-2002 Andras Kis-Szabo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: IPv6 IPsec-AH match"); +MODULE_AUTHOR("Andras Kis-Szabo "); + +/* Returns 1 if the spi is matched by the range, 0 otherwise */ +static inline bool +spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) +{ + bool r; + + pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n", + invert ? '!' : ' ', min, spi, max); + r = (spi >= min && spi <= max) ^ invert; + pr_debug(" result %s\n", r ? 
"PASS" : "FAILED"); + return r; +} + +static bool ah_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct ip_auth_hdr _ah; + const struct ip_auth_hdr *ah; + const struct ip6t_ah *ahinfo = par->matchinfo; + unsigned int ptr; + unsigned int hdrlen = 0; + int err; + + err = ipv6_find_hdr(skb, &ptr, NEXTHDR_AUTH, NULL); + if (err < 0) { + if (err != -ENOENT) + par->hotdrop = true; + return false; + } + + ah = skb_header_pointer(skb, ptr, sizeof(_ah), &_ah); + if (ah == NULL) { + par->hotdrop = true; + return false; + } + + hdrlen = (ah->hdrlen + 2) << 2; + + pr_debug("IPv6 AH LEN %u %u ", hdrlen, ah->hdrlen); + pr_debug("RES %04X ", ah->reserved); + pr_debug("SPI %u %08X\n", ntohl(ah->spi), ntohl(ah->spi)); + + pr_debug("IPv6 AH spi %02X ", + spi_match(ahinfo->spis[0], ahinfo->spis[1], + ntohl(ah->spi), + !!(ahinfo->invflags & IP6T_AH_INV_SPI))); + pr_debug("len %02X %04X %02X ", + ahinfo->hdrlen, hdrlen, + (!ahinfo->hdrlen || + (ahinfo->hdrlen == hdrlen) ^ + !!(ahinfo->invflags & IP6T_AH_INV_LEN))); + pr_debug("res %02X %04X %02X\n", + ahinfo->hdrres, ah->reserved, + !(ahinfo->hdrres && ah->reserved)); + + return (ah != NULL) && + spi_match(ahinfo->spis[0], ahinfo->spis[1], + ntohl(ah->spi), + !!(ahinfo->invflags & IP6T_AH_INV_SPI)) && + (!ahinfo->hdrlen || + (ahinfo->hdrlen == hdrlen) ^ + !!(ahinfo->invflags & IP6T_AH_INV_LEN)) && + !(ahinfo->hdrres && ah->reserved); +} + +static int ah_mt6_check(const struct xt_mtchk_param *par) +{ + const struct ip6t_ah *ahinfo = par->matchinfo; + + if (ahinfo->invflags & ~IP6T_AH_INV_MASK) { + pr_debug("unknown flags %X\n", ahinfo->invflags); + return -EINVAL; + } + return 0; +} + +static struct xt_match ah_mt6_reg __read_mostly = { + .name = "ah", + .family = NFPROTO_IPV6, + .match = ah_mt6, + .matchsize = sizeof(struct ip6t_ah), + .checkentry = ah_mt6_check, + .me = THIS_MODULE, +}; + +static int __init ah_mt6_init(void) +{ + return xt_register_match(&ah_mt6_reg); +} + +static void __exit ah_mt6_exit(void) +{ + xt_unregister_match(&ah_mt6_reg); +} + +module_init(ah_mt6_init); +module_exit(ah_mt6_exit); diff --git a/net/ipv6/netfilter/ip6t_eui64.c b/net/ipv6/netfilter/ip6t_eui64.c new file mode 100644 index 00000000..aab07069 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_eui64.c @@ -0,0 +1,74 @@ +/* Kernel module to match EUI64 address parameters. */ + +/* (C) 2001-2002 Andras Kis-Szabo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include + +#include +#include + +MODULE_DESCRIPTION("Xtables: IPv6 EUI64 address match"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Andras Kis-Szabo "); + +static bool +eui64_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + unsigned char eui64[8]; + + if (!(skb_mac_header(skb) >= skb->head && + skb_mac_header(skb) + ETH_HLEN <= skb->data) && + par->fragoff != 0) { + par->hotdrop = true; + return false; + } + + memset(eui64, 0, sizeof(eui64)); + + if (eth_hdr(skb)->h_proto == htons(ETH_P_IPV6)) { + if (ipv6_hdr(skb)->version == 0x6) { + memcpy(eui64, eth_hdr(skb)->h_source, 3); + memcpy(eui64 + 5, eth_hdr(skb)->h_source + 3, 3); + eui64[3] = 0xff; + eui64[4] = 0xfe; + eui64[0] ^= 0x02; + + if (!memcmp(ipv6_hdr(skb)->saddr.s6_addr + 8, eui64, + sizeof(eui64))) + return true; + } + } + + return false; +} + +static struct xt_match eui64_mt6_reg __read_mostly = { + .name = "eui64", + .family = NFPROTO_IPV6, + .match = eui64_mt6, + .matchsize = sizeof(int), + .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD), + .me = THIS_MODULE, +}; + +static int __init eui64_mt6_init(void) +{ + return xt_register_match(&eui64_mt6_reg); +} + +static void __exit eui64_mt6_exit(void) +{ + xt_unregister_match(&eui64_mt6_reg); +} + +module_init(eui64_mt6_init); +module_exit(eui64_mt6_exit); diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c new file mode 100644 index 00000000..eda898fd --- /dev/null +++ b/net/ipv6/netfilter/ip6t_frag.c @@ -0,0 +1,136 @@ +/* Kernel module to match FRAG parameters. */ + +/* (C) 2001-2002 Andras Kis-Szabo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: IPv6 fragment match"); +MODULE_AUTHOR("Andras Kis-Szabo "); + +/* Returns 1 if the id is matched by the range, 0 otherwise */ +static inline bool +id_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert) +{ + bool r; + pr_debug("id_match:%c 0x%x <= 0x%x <= 0x%x\n", invert ? '!' : ' ', + min, id, max); + r = (id >= min && id <= max) ^ invert; + pr_debug(" result %s\n", r ? 
"PASS" : "FAILED"); + return r; +} + +static bool +frag_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct frag_hdr _frag; + const struct frag_hdr *fh; + const struct ip6t_frag *fraginfo = par->matchinfo; + unsigned int ptr; + int err; + + err = ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT, NULL); + if (err < 0) { + if (err != -ENOENT) + par->hotdrop = true; + return false; + } + + fh = skb_header_pointer(skb, ptr, sizeof(_frag), &_frag); + if (fh == NULL) { + par->hotdrop = true; + return false; + } + + pr_debug("INFO %04X ", fh->frag_off); + pr_debug("OFFSET %04X ", ntohs(fh->frag_off) & ~0x7); + pr_debug("RES %02X %04X", fh->reserved, ntohs(fh->frag_off) & 0x6); + pr_debug("MF %04X ", fh->frag_off & htons(IP6_MF)); + pr_debug("ID %u %08X\n", ntohl(fh->identification), + ntohl(fh->identification)); + + pr_debug("IPv6 FRAG id %02X ", + id_match(fraginfo->ids[0], fraginfo->ids[1], + ntohl(fh->identification), + !!(fraginfo->invflags & IP6T_FRAG_INV_IDS))); + pr_debug("res %02X %02X%04X %02X ", + fraginfo->flags & IP6T_FRAG_RES, fh->reserved, + ntohs(fh->frag_off) & 0x6, + !((fraginfo->flags & IP6T_FRAG_RES) && + (fh->reserved || (ntohs(fh->frag_off) & 0x06)))); + pr_debug("first %02X %02X %02X ", + fraginfo->flags & IP6T_FRAG_FST, + ntohs(fh->frag_off) & ~0x7, + !((fraginfo->flags & IP6T_FRAG_FST) && + (ntohs(fh->frag_off) & ~0x7))); + pr_debug("mf %02X %02X %02X ", + fraginfo->flags & IP6T_FRAG_MF, + ntohs(fh->frag_off) & IP6_MF, + !((fraginfo->flags & IP6T_FRAG_MF) && + !((ntohs(fh->frag_off) & IP6_MF)))); + pr_debug("last %02X %02X %02X\n", + fraginfo->flags & IP6T_FRAG_NMF, + ntohs(fh->frag_off) & IP6_MF, + !((fraginfo->flags & IP6T_FRAG_NMF) && + (ntohs(fh->frag_off) & IP6_MF))); + + return (fh != NULL) && + id_match(fraginfo->ids[0], fraginfo->ids[1], + ntohl(fh->identification), + !!(fraginfo->invflags & IP6T_FRAG_INV_IDS)) && + !((fraginfo->flags & IP6T_FRAG_RES) && + (fh->reserved || (ntohs(fh->frag_off) & 0x6))) && + !((fraginfo->flags & IP6T_FRAG_FST) && + (ntohs(fh->frag_off) & ~0x7)) && + !((fraginfo->flags & IP6T_FRAG_MF) && + !(ntohs(fh->frag_off) & IP6_MF)) && + !((fraginfo->flags & IP6T_FRAG_NMF) && + (ntohs(fh->frag_off) & IP6_MF)); +} + +static int frag_mt6_check(const struct xt_mtchk_param *par) +{ + const struct ip6t_frag *fraginfo = par->matchinfo; + + if (fraginfo->invflags & ~IP6T_FRAG_INV_MASK) { + pr_debug("unknown flags %X\n", fraginfo->invflags); + return -EINVAL; + } + return 0; +} + +static struct xt_match frag_mt6_reg __read_mostly = { + .name = "frag", + .family = NFPROTO_IPV6, + .match = frag_mt6, + .matchsize = sizeof(struct ip6t_frag), + .checkentry = frag_mt6_check, + .me = THIS_MODULE, +}; + +static int __init frag_mt6_init(void) +{ + return xt_register_match(&frag_mt6_reg); +} + +static void __exit frag_mt6_exit(void) +{ + xt_unregister_match(&frag_mt6_reg); +} + +module_init(frag_mt6_init); +module_exit(frag_mt6_exit); diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c new file mode 100644 index 00000000..59df051e --- /dev/null +++ b/net/ipv6/netfilter/ip6t_hbh.c @@ -0,0 +1,215 @@ +/* Kernel module to match Hop-by-Hop and Destination parameters. */ + +/* (C) 2001-2002 Andras Kis-Szabo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: IPv6 Hop-By-Hop and Destination Header match"); +MODULE_AUTHOR("Andras Kis-Szabo "); +MODULE_ALIAS("ip6t_dst"); + +/* + * (Type & 0xC0) >> 6 + * 0 -> ignorable + * 1 -> must drop the packet + * 2 -> send ICMP PARM PROB regardless and drop packet + * 3 -> Send ICMP if not a multicast address and drop packet + * (Type & 0x20) >> 5 + * 0 -> invariant + * 1 -> can change the routing + * (Type & 0x1F) Type + * 0 -> Pad1 (only 1 byte!) + * 1 -> PadN LENGTH info (total length = length + 2) + * C0 | 2 -> JUMBO 4 x x x x ( xxxx > 64k ) + * 5 -> RTALERT 2 x x + */ + +static struct xt_match hbh_mt6_reg[] __read_mostly; + +static bool +hbh_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct ipv6_opt_hdr _optsh; + const struct ipv6_opt_hdr *oh; + const struct ip6t_opts *optinfo = par->matchinfo; + unsigned int temp; + unsigned int ptr; + unsigned int hdrlen = 0; + bool ret = false; + u8 _opttype; + u8 _optlen; + const u_int8_t *tp = NULL; + const u_int8_t *lp = NULL; + unsigned int optlen; + int err; + + err = ipv6_find_hdr(skb, &ptr, + (par->match == &hbh_mt6_reg[0]) ? + NEXTHDR_HOP : NEXTHDR_DEST, NULL); + if (err < 0) { + if (err != -ENOENT) + par->hotdrop = true; + return false; + } + + oh = skb_header_pointer(skb, ptr, sizeof(_optsh), &_optsh); + if (oh == NULL) { + par->hotdrop = true; + return false; + } + + hdrlen = ipv6_optlen(oh); + if (skb->len - ptr < hdrlen) { + /* Packet smaller than it's length field */ + return false; + } + + pr_debug("IPv6 OPTS LEN %u %u ", hdrlen, oh->hdrlen); + + pr_debug("len %02X %04X %02X ", + optinfo->hdrlen, hdrlen, + (!(optinfo->flags & IP6T_OPTS_LEN) || + ((optinfo->hdrlen == hdrlen) ^ + !!(optinfo->invflags & IP6T_OPTS_INV_LEN)))); + + ret = (oh != NULL) && + (!(optinfo->flags & IP6T_OPTS_LEN) || + ((optinfo->hdrlen == hdrlen) ^ + !!(optinfo->invflags & IP6T_OPTS_INV_LEN))); + + ptr += 2; + hdrlen -= 2; + if (!(optinfo->flags & IP6T_OPTS_OPTS)) { + return ret; + } else { + pr_debug("Strict "); + pr_debug("#%d ", optinfo->optsnr); + for (temp = 0; temp < optinfo->optsnr; temp++) { + /* type field exists ? */ + if (hdrlen < 1) + break; + tp = skb_header_pointer(skb, ptr, sizeof(_opttype), + &_opttype); + if (tp == NULL) + break; + + /* Type check */ + if (*tp != (optinfo->opts[temp] & 0xFF00) >> 8) { + pr_debug("Tbad %02X %02X\n", *tp, + (optinfo->opts[temp] & 0xFF00) >> 8); + return false; + } else { + pr_debug("Tok "); + } + /* Length check */ + if (*tp) { + u16 spec_len; + + /* length field exists ? 
*/ + if (hdrlen < 2) + break; + lp = skb_header_pointer(skb, ptr + 1, + sizeof(_optlen), + &_optlen); + if (lp == NULL) + break; + spec_len = optinfo->opts[temp] & 0x00FF; + + if (spec_len != 0x00FF && spec_len != *lp) { + pr_debug("Lbad %02X %04X\n", *lp, + spec_len); + return false; + } + pr_debug("Lok "); + optlen = *lp + 2; + } else { + pr_debug("Pad1\n"); + optlen = 1; + } + + /* Step to the next */ + pr_debug("len%04X\n", optlen); + + if ((ptr > skb->len - optlen || hdrlen < optlen) && + temp < optinfo->optsnr - 1) { + pr_debug("new pointer is too large!\n"); + break; + } + ptr += optlen; + hdrlen -= optlen; + } + if (temp == optinfo->optsnr) + return ret; + else + return false; + } + + return false; +} + +static int hbh_mt6_check(const struct xt_mtchk_param *par) +{ + const struct ip6t_opts *optsinfo = par->matchinfo; + + if (optsinfo->invflags & ~IP6T_OPTS_INV_MASK) { + pr_debug("unknown flags %X\n", optsinfo->invflags); + return -EINVAL; + } + + if (optsinfo->flags & IP6T_OPTS_NSTRICT) { + pr_debug("Not strict - not implemented"); + return -EINVAL; + } + + return 0; +} + +static struct xt_match hbh_mt6_reg[] __read_mostly = { + { + /* Note, hbh_mt6 relies on the order of hbh_mt6_reg */ + .name = "hbh", + .family = NFPROTO_IPV6, + .match = hbh_mt6, + .matchsize = sizeof(struct ip6t_opts), + .checkentry = hbh_mt6_check, + .me = THIS_MODULE, + }, + { + .name = "dst", + .family = NFPROTO_IPV6, + .match = hbh_mt6, + .matchsize = sizeof(struct ip6t_opts), + .checkentry = hbh_mt6_check, + .me = THIS_MODULE, + }, +}; + +static int __init hbh_mt6_init(void) +{ + return xt_register_matches(hbh_mt6_reg, ARRAY_SIZE(hbh_mt6_reg)); +} + +static void __exit hbh_mt6_exit(void) +{ + xt_unregister_matches(hbh_mt6_reg, ARRAY_SIZE(hbh_mt6_reg)); +} + +module_init(hbh_mt6_init); +module_exit(hbh_mt6_exit); diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c new file mode 100644 index 00000000..54bd9790 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_ipv6header.c @@ -0,0 +1,154 @@ +/* ipv6header match - matches IPv6 packets based + on whether they contain certain headers */ + +/* Original idea: Brad Chapman + * Rewritten by: Andras Kis-Szabo */ + +/* (C) 2001-2002 Andras Kis-Szabo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: IPv6 header types match"); +MODULE_AUTHOR("Andras Kis-Szabo "); + +static bool +ipv6header_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ip6t_ipv6header_info *info = par->matchinfo; + unsigned int temp; + int len; + u8 nexthdr; + unsigned int ptr; + + /* Make sure this isn't an evil packet */ + + /* type of the 1st exthdr */ + nexthdr = ipv6_hdr(skb)->nexthdr; + /* pointer to the 1st exthdr */ + ptr = sizeof(struct ipv6hdr); + /* available length */ + len = skb->len - ptr; + temp = 0; + + while (ip6t_ext_hdr(nexthdr)) { + const struct ipv6_opt_hdr *hp; + struct ipv6_opt_hdr _hdr; + int hdrlen; + + /* No more exthdr -> evaluate */ + if (nexthdr == NEXTHDR_NONE) { + temp |= MASK_NONE; + break; + } + /* Is there enough space for the next ext header? 
*/ + if (len < (int)sizeof(struct ipv6_opt_hdr)) + return false; + /* ESP -> evaluate */ + if (nexthdr == NEXTHDR_ESP) { + temp |= MASK_ESP; + break; + } + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + BUG_ON(hp == NULL); + + /* Calculate the header length */ + if (nexthdr == NEXTHDR_FRAGMENT) + hdrlen = 8; + else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hp->hdrlen + 2) << 2; + else + hdrlen = ipv6_optlen(hp); + + /* set the flag */ + switch (nexthdr) { + case NEXTHDR_HOP: + temp |= MASK_HOPOPTS; + break; + case NEXTHDR_ROUTING: + temp |= MASK_ROUTING; + break; + case NEXTHDR_FRAGMENT: + temp |= MASK_FRAGMENT; + break; + case NEXTHDR_AUTH: + temp |= MASK_AH; + break; + case NEXTHDR_DEST: + temp |= MASK_DSTOPTS; + break; + default: + return false; + break; + } + + nexthdr = hp->nexthdr; + len -= hdrlen; + ptr += hdrlen; + if (ptr > skb->len) + break; + } + + if (nexthdr != NEXTHDR_NONE && nexthdr != NEXTHDR_ESP) + temp |= MASK_PROTO; + + if (info->modeflag) + return !((temp ^ info->matchflags ^ info->invflags) + & info->matchflags); + else { + if (info->invflags) + return temp != info->matchflags; + else + return temp == info->matchflags; + } +} + +static int ipv6header_mt6_check(const struct xt_mtchk_param *par) +{ + const struct ip6t_ipv6header_info *info = par->matchinfo; + + /* invflags is 0 or 0xff in hard mode */ + if ((!info->modeflag) && info->invflags != 0x00 && + info->invflags != 0xFF) + return -EINVAL; + + return 0; +} + +static struct xt_match ipv6header_mt6_reg __read_mostly = { + .name = "ipv6header", + .family = NFPROTO_IPV6, + .match = ipv6header_mt6, + .matchsize = sizeof(struct ip6t_ipv6header_info), + .checkentry = ipv6header_mt6_check, + .destroy = NULL, + .me = THIS_MODULE, +}; + +static int __init ipv6header_mt6_init(void) +{ + return xt_register_match(&ipv6header_mt6_reg); +} + +static void __exit ipv6header_mt6_exit(void) +{ + xt_unregister_match(&ipv6header_mt6_reg); +} + +module_init(ipv6header_mt6_init); +module_exit(ipv6header_mt6_exit); diff --git a/net/ipv6/netfilter/ip6t_mh.c b/net/ipv6/netfilter/ip6t_mh.c new file mode 100644 index 00000000..0c90c66b --- /dev/null +++ b/net/ipv6/netfilter/ip6t_mh.c @@ -0,0 +1,94 @@ +/* + * Copyright (C)2006 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Author: + * Masahide NAKAMURA @USAGI + * + * Based on net/netfilter/xt_tcpudp.c + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_DESCRIPTION("Xtables: IPv6 Mobility Header match"); +MODULE_LICENSE("GPL"); + +/* Returns 1 if the type is matched by the range, 0 otherwise */ +static inline bool +type_match(u_int8_t min, u_int8_t max, u_int8_t type, bool invert) +{ + return (type >= min && type <= max) ^ invert; +} + +static bool mh_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct ip6_mh _mh; + const struct ip6_mh *mh; + const struct ip6t_mh *mhinfo = par->matchinfo; + + /* Must not be a fragment. */ + if (par->fragoff != 0) + return false; + + mh = skb_header_pointer(skb, par->thoff, sizeof(_mh), &_mh); + if (mh == NULL) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. 
*/ + pr_debug("Dropping evil MH tinygram.\n"); + par->hotdrop = true; + return false; + } + + if (mh->ip6mh_proto != IPPROTO_NONE) { + pr_debug("Dropping invalid MH Payload Proto: %u\n", + mh->ip6mh_proto); + par->hotdrop = true; + return false; + } + + return type_match(mhinfo->types[0], mhinfo->types[1], mh->ip6mh_type, + !!(mhinfo->invflags & IP6T_MH_INV_TYPE)); +} + +static int mh_mt6_check(const struct xt_mtchk_param *par) +{ + const struct ip6t_mh *mhinfo = par->matchinfo; + + /* Must specify no unknown invflags */ + return (mhinfo->invflags & ~IP6T_MH_INV_MASK) ? -EINVAL : 0; +} + +static struct xt_match mh_mt6_reg __read_mostly = { + .name = "mh", + .family = NFPROTO_IPV6, + .checkentry = mh_mt6_check, + .match = mh_mt6, + .matchsize = sizeof(struct ip6t_mh), + .proto = IPPROTO_MH, + .me = THIS_MODULE, +}; + +static int __init mh_mt6_init(void) +{ + return xt_register_match(&mh_mt6_reg); +} + +static void __exit mh_mt6_exit(void) +{ + xt_unregister_match(&mh_mt6_reg); +} + +module_init(mh_mt6_init); +module_exit(mh_mt6_exit); diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c new file mode 100644 index 00000000..d8488c50 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_rt.c @@ -0,0 +1,225 @@ +/* Kernel module to match ROUTING parameters. */ + +/* (C) 2001-2002 Andras Kis-Szabo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: IPv6 Routing Header match"); +MODULE_AUTHOR("Andras Kis-Szabo "); + +/* Returns 1 if the id is matched by the range, 0 otherwise */ +static inline bool +segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert) +{ + bool r; + pr_debug("segsleft_match:%c 0x%x <= 0x%x <= 0x%x\n", + invert ? '!' : ' ', min, id, max); + r = (id >= min && id <= max) ^ invert; + pr_debug(" result %s\n", r ? 
"PASS" : "FAILED"); + return r; +} + +static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct ipv6_rt_hdr _route; + const struct ipv6_rt_hdr *rh; + const struct ip6t_rt *rtinfo = par->matchinfo; + unsigned int temp; + unsigned int ptr; + unsigned int hdrlen = 0; + bool ret = false; + struct in6_addr _addr; + const struct in6_addr *ap; + int err; + + err = ipv6_find_hdr(skb, &ptr, NEXTHDR_ROUTING, NULL); + if (err < 0) { + if (err != -ENOENT) + par->hotdrop = true; + return false; + } + + rh = skb_header_pointer(skb, ptr, sizeof(_route), &_route); + if (rh == NULL) { + par->hotdrop = true; + return false; + } + + hdrlen = ipv6_optlen(rh); + if (skb->len - ptr < hdrlen) { + /* Pcket smaller than its length field */ + return false; + } + + pr_debug("IPv6 RT LEN %u %u ", hdrlen, rh->hdrlen); + pr_debug("TYPE %04X ", rh->type); + pr_debug("SGS_LEFT %u %02X\n", rh->segments_left, rh->segments_left); + + pr_debug("IPv6 RT segsleft %02X ", + segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1], + rh->segments_left, + !!(rtinfo->invflags & IP6T_RT_INV_SGS))); + pr_debug("type %02X %02X %02X ", + rtinfo->rt_type, rh->type, + (!(rtinfo->flags & IP6T_RT_TYP) || + ((rtinfo->rt_type == rh->type) ^ + !!(rtinfo->invflags & IP6T_RT_INV_TYP)))); + pr_debug("len %02X %04X %02X ", + rtinfo->hdrlen, hdrlen, + !(rtinfo->flags & IP6T_RT_LEN) || + ((rtinfo->hdrlen == hdrlen) ^ + !!(rtinfo->invflags & IP6T_RT_INV_LEN))); + pr_debug("res %02X %02X %02X ", + rtinfo->flags & IP6T_RT_RES, + ((const struct rt0_hdr *)rh)->reserved, + !((rtinfo->flags & IP6T_RT_RES) && + (((const struct rt0_hdr *)rh)->reserved))); + + ret = (rh != NULL) && + (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1], + rh->segments_left, + !!(rtinfo->invflags & IP6T_RT_INV_SGS))) && + (!(rtinfo->flags & IP6T_RT_LEN) || + ((rtinfo->hdrlen == hdrlen) ^ + !!(rtinfo->invflags & IP6T_RT_INV_LEN))) && + (!(rtinfo->flags & IP6T_RT_TYP) || + ((rtinfo->rt_type == rh->type) ^ + !!(rtinfo->invflags & IP6T_RT_INV_TYP))); + + if (ret && (rtinfo->flags & IP6T_RT_RES)) { + const u_int32_t *rp; + u_int32_t _reserved; + rp = skb_header_pointer(skb, + ptr + offsetof(struct rt0_hdr, + reserved), + sizeof(_reserved), + &_reserved); + + ret = (*rp == 0); + } + + pr_debug("#%d ", rtinfo->addrnr); + if (!(rtinfo->flags & IP6T_RT_FST)) { + return ret; + } else if (rtinfo->flags & IP6T_RT_FST_NSTRICT) { + pr_debug("Not strict "); + if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) { + pr_debug("There isn't enough space\n"); + return false; + } else { + unsigned int i = 0; + + pr_debug("#%d ", rtinfo->addrnr); + for (temp = 0; + temp < (unsigned int)((hdrlen - 8) / 16); + temp++) { + ap = skb_header_pointer(skb, + ptr + + sizeof(struct rt0_hdr) + + temp * sizeof(_addr), + sizeof(_addr), + &_addr); + + BUG_ON(ap == NULL); + + if (ipv6_addr_equal(ap, &rtinfo->addrs[i])) { + pr_debug("i=%d temp=%d;\n", i, temp); + i++; + } + if (i == rtinfo->addrnr) + break; + } + pr_debug("i=%d #%d\n", i, rtinfo->addrnr); + if (i == rtinfo->addrnr) + return ret; + else + return false; + } + } else { + pr_debug("Strict "); + if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) { + pr_debug("There isn't enough space\n"); + return false; + } else { + pr_debug("#%d ", rtinfo->addrnr); + for (temp = 0; temp < rtinfo->addrnr; temp++) { + ap = skb_header_pointer(skb, + ptr + + sizeof(struct rt0_hdr) + + temp * sizeof(_addr), + sizeof(_addr), + &_addr); + BUG_ON(ap == NULL); + + if (!ipv6_addr_equal(ap, &rtinfo->addrs[temp])) + break; + } + 
pr_debug("temp=%d #%d\n", temp, rtinfo->addrnr); + if (temp == rtinfo->addrnr && + temp == (unsigned int)((hdrlen - 8) / 16)) + return ret; + else + return false; + } + } + + return false; +} + +static int rt_mt6_check(const struct xt_mtchk_param *par) +{ + const struct ip6t_rt *rtinfo = par->matchinfo; + + if (rtinfo->invflags & ~IP6T_RT_INV_MASK) { + pr_debug("unknown flags %X\n", rtinfo->invflags); + return -EINVAL; + } + if ((rtinfo->flags & (IP6T_RT_RES | IP6T_RT_FST_MASK)) && + (!(rtinfo->flags & IP6T_RT_TYP) || + (rtinfo->rt_type != 0) || + (rtinfo->invflags & IP6T_RT_INV_TYP))) { + pr_debug("`--rt-type 0' required before `--rt-0-*'"); + return -EINVAL; + } + + return 0; +} + +static struct xt_match rt_mt6_reg __read_mostly = { + .name = "rt", + .family = NFPROTO_IPV6, + .match = rt_mt6, + .matchsize = sizeof(struct ip6t_rt), + .checkentry = rt_mt6_check, + .me = THIS_MODULE, +}; + +static int __init rt_mt6_init(void) +{ + return xt_register_match(&rt_mt6_reg); +} + +static void __exit rt_mt6_exit(void) +{ + xt_unregister_match(&rt_mt6_reg); +} + +module_init(rt_mt6_init); +module_exit(rt_mt6_exit); diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c new file mode 100644 index 00000000..c9e37c8f --- /dev/null +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -0,0 +1,113 @@ +/* + * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("ip6tables filter table"); + +#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \ + (1 << NF_INET_FORWARD) | \ + (1 << NF_INET_LOCAL_OUT)) + +static const struct xt_table packet_filter = { + .name = "filter", + .valid_hooks = FILTER_VALID_HOOKS, + .me = THIS_MODULE, + .af = NFPROTO_IPV6, + .priority = NF_IP6_PRI_FILTER, +}; + +/* The work comes in here from netfilter.c. */ +static unsigned int +ip6table_filter_hook(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + const struct net *net = dev_net((in != NULL) ? in : out); + + return ip6t_do_table(skb, hook, in, out, net->ipv6.ip6table_filter); +} + +static struct nf_hook_ops *filter_ops __read_mostly; + +/* Default to forward because I got too much mail already. 
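+ * The "forward" module parameter sets the initial policy of the FORWARD
+ * chain: NF_ACCEPT by default, NF_DROP when loaded with forward=0.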
*/ +static int forward = NF_ACCEPT; +module_param(forward, bool, 0000); + +static int __net_init ip6table_filter_net_init(struct net *net) +{ + struct ip6t_replace *repl; + + repl = ip6t_alloc_initial_table(&packet_filter); + if (repl == NULL) + return -ENOMEM; + /* Entry 1 is the FORWARD hook */ + ((struct ip6t_standard *)repl->entries)[1].target.verdict = + -forward - 1; + + net->ipv6.ip6table_filter = + ip6t_register_table(net, &packet_filter, repl); + kfree(repl); + if (IS_ERR(net->ipv6.ip6table_filter)) + return PTR_ERR(net->ipv6.ip6table_filter); + return 0; +} + +static void __net_exit ip6table_filter_net_exit(struct net *net) +{ + ip6t_unregister_table(net, net->ipv6.ip6table_filter); +} + +static struct pernet_operations ip6table_filter_net_ops = { + .init = ip6table_filter_net_init, + .exit = ip6table_filter_net_exit, +}; + +static int __init ip6table_filter_init(void) +{ + int ret; + + if (forward < 0 || forward > NF_MAX_VERDICT) { + pr_err("iptables forward must be 0 or 1\n"); + return -EINVAL; + } + + ret = register_pernet_subsys(&ip6table_filter_net_ops); + if (ret < 0) + return ret; + + /* Register hooks */ + filter_ops = xt_hook_link(&packet_filter, ip6table_filter_hook); + if (IS_ERR(filter_ops)) { + ret = PTR_ERR(filter_ops); + goto cleanup_table; + } + + return ret; + + cleanup_table: + unregister_pernet_subsys(&ip6table_filter_net_ops); + return ret; +} + +static void __exit ip6table_filter_fini(void) +{ + xt_hook_unlink(&packet_filter, filter_ops); + unregister_pernet_subsys(&ip6table_filter_net_ops); +} + +module_init(ip6table_filter_init); +module_exit(ip6table_filter_fini); diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c new file mode 100644 index 00000000..00d19173 --- /dev/null +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -0,0 +1,145 @@ +/* + * IPv6 packet mangling table, a port of the IPv4 mangle table to IPv6 + * + * Copyright (C) 2000-2001 by Harald Welte + * Copyright (C) 2000-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("ip6tables mangle table"); + +#define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \ + (1 << NF_INET_LOCAL_IN) | \ + (1 << NF_INET_FORWARD) | \ + (1 << NF_INET_LOCAL_OUT) | \ + (1 << NF_INET_POST_ROUTING)) + +static const struct xt_table packet_mangler = { + .name = "mangle", + .valid_hooks = MANGLE_VALID_HOOKS, + .me = THIS_MODULE, + .af = NFPROTO_IPV6, + .priority = NF_IP6_PRI_MANGLE, +}; + +static unsigned int +ip6t_mangle_out(struct sk_buff *skb, const struct net_device *out) +{ + unsigned int ret; + struct in6_addr saddr, daddr; + u_int8_t hop_limit; + u_int32_t flowlabel, mark; + +#if 0 + /* root is playing with raw sockets. 
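+ * (This sanity check is compiled out and still refers to the IPv4 iphdr;
+ * it was carried over from the IPv4 mangle table.)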
*/ + if (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr)) { + if (net_ratelimit()) + pr_warning("ip6t_hook: happy cracking.\n"); + return NF_ACCEPT; + } +#endif + + /* save source/dest address, mark, hoplimit, flowlabel, priority, */ + memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); + memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr)); + mark = skb->mark; + hop_limit = ipv6_hdr(skb)->hop_limit; + + /* flowlabel and prio (includes version, which shouldn't change either */ + flowlabel = *((u_int32_t *)ipv6_hdr(skb)); + + ret = ip6t_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, + dev_net(out)->ipv6.ip6table_mangle); + + if (ret != NF_DROP && ret != NF_STOLEN && + (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) || + memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) || + skb->mark != mark || + ipv6_hdr(skb)->hop_limit != hop_limit || + flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) + return ip6_route_me_harder(skb) == 0 ? ret : NF_DROP; + + return ret; +} + +/* The work comes in here from netfilter.c. */ +static unsigned int +ip6table_mangle_hook(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + if (hook == NF_INET_LOCAL_OUT) + return ip6t_mangle_out(skb, out); + if (hook == NF_INET_POST_ROUTING) + return ip6t_do_table(skb, hook, in, out, + dev_net(out)->ipv6.ip6table_mangle); + /* INPUT/FORWARD */ + return ip6t_do_table(skb, hook, in, out, + dev_net(in)->ipv6.ip6table_mangle); +} + +static struct nf_hook_ops *mangle_ops __read_mostly; +static int __net_init ip6table_mangle_net_init(struct net *net) +{ + struct ip6t_replace *repl; + + repl = ip6t_alloc_initial_table(&packet_mangler); + if (repl == NULL) + return -ENOMEM; + net->ipv6.ip6table_mangle = + ip6t_register_table(net, &packet_mangler, repl); + kfree(repl); + if (IS_ERR(net->ipv6.ip6table_mangle)) + return PTR_ERR(net->ipv6.ip6table_mangle); + return 0; +} + +static void __net_exit ip6table_mangle_net_exit(struct net *net) +{ + ip6t_unregister_table(net, net->ipv6.ip6table_mangle); +} + +static struct pernet_operations ip6table_mangle_net_ops = { + .init = ip6table_mangle_net_init, + .exit = ip6table_mangle_net_exit, +}; + +static int __init ip6table_mangle_init(void) +{ + int ret; + + ret = register_pernet_subsys(&ip6table_mangle_net_ops); + if (ret < 0) + return ret; + + /* Register hooks */ + mangle_ops = xt_hook_link(&packet_mangler, ip6table_mangle_hook); + if (IS_ERR(mangle_ops)) { + ret = PTR_ERR(mangle_ops); + goto cleanup_table; + } + + return ret; + + cleanup_table: + unregister_pernet_subsys(&ip6table_mangle_net_ops); + return ret; +} + +static void __exit ip6table_mangle_fini(void) +{ + xt_hook_unlink(&packet_mangler, mangle_ops); + unregister_pernet_subsys(&ip6table_mangle_net_ops); +} + +module_init(ip6table_mangle_init); +module_exit(ip6table_mangle_fini); diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c new file mode 100644 index 00000000..5b9926a0 --- /dev/null +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -0,0 +1,88 @@ +/* + * IPv6 raw table, a port of the IPv4 raw table to IPv6 + * + * Copyright (C) 2003 Jozsef Kadlecsik + */ +#include +#include +#include + +#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) + +static const struct xt_table packet_raw = { + .name = "raw", + .valid_hooks = RAW_VALID_HOOKS, + .me = THIS_MODULE, + .af = NFPROTO_IPV6, + .priority = NF_IP6_PRI_RAW, +}; + +/* The work comes in here from netfilter.c. 
*/ +static unsigned int +ip6table_raw_hook(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + const struct net *net = dev_net((in != NULL) ? in : out); + + return ip6t_do_table(skb, hook, in, out, net->ipv6.ip6table_raw); +} + +static struct nf_hook_ops *rawtable_ops __read_mostly; + +static int __net_init ip6table_raw_net_init(struct net *net) +{ + struct ip6t_replace *repl; + + repl = ip6t_alloc_initial_table(&packet_raw); + if (repl == NULL) + return -ENOMEM; + net->ipv6.ip6table_raw = + ip6t_register_table(net, &packet_raw, repl); + kfree(repl); + if (IS_ERR(net->ipv6.ip6table_raw)) + return PTR_ERR(net->ipv6.ip6table_raw); + return 0; +} + +static void __net_exit ip6table_raw_net_exit(struct net *net) +{ + ip6t_unregister_table(net, net->ipv6.ip6table_raw); +} + +static struct pernet_operations ip6table_raw_net_ops = { + .init = ip6table_raw_net_init, + .exit = ip6table_raw_net_exit, +}; + +static int __init ip6table_raw_init(void) +{ + int ret; + + ret = register_pernet_subsys(&ip6table_raw_net_ops); + if (ret < 0) + return ret; + + /* Register hooks */ + rawtable_ops = xt_hook_link(&packet_raw, ip6table_raw_hook); + if (IS_ERR(rawtable_ops)) { + ret = PTR_ERR(rawtable_ops); + goto cleanup_table; + } + + return ret; + + cleanup_table: + unregister_pernet_subsys(&ip6table_raw_net_ops); + return ret; +} + +static void __exit ip6table_raw_fini(void) +{ + xt_hook_unlink(&packet_raw, rawtable_ops); + unregister_pernet_subsys(&ip6table_raw_net_ops); +} + +module_init(ip6table_raw_init); +module_exit(ip6table_raw_fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c new file mode 100644 index 00000000..91aa2b4d --- /dev/null +++ b/net/ipv6/netfilter/ip6table_security.c @@ -0,0 +1,105 @@ +/* + * "security" table for IPv6 + * + * This is for use by Mandatory Access Control (MAC) security models, + * which need to be able to manage security policy in separate context + * to DAC. + * + * Based on iptable_mangle.c + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2004 Netfilter Core Team netfilter.org> + * Copyright (C) 2008 Red Hat, Inc., James Morris redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris redhat.com>"); +MODULE_DESCRIPTION("ip6tables security table, for MAC rules"); + +#define SECURITY_VALID_HOOKS (1 << NF_INET_LOCAL_IN) | \ + (1 << NF_INET_FORWARD) | \ + (1 << NF_INET_LOCAL_OUT) + +static const struct xt_table security_table = { + .name = "security", + .valid_hooks = SECURITY_VALID_HOOKS, + .me = THIS_MODULE, + .af = NFPROTO_IPV6, + .priority = NF_IP6_PRI_SECURITY, +}; + +static unsigned int +ip6table_security_hook(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + const struct net *net = dev_net((in != NULL) ? 
in : out); + + return ip6t_do_table(skb, hook, in, out, net->ipv6.ip6table_security); +} + +static struct nf_hook_ops *sectbl_ops __read_mostly; + +static int __net_init ip6table_security_net_init(struct net *net) +{ + struct ip6t_replace *repl; + + repl = ip6t_alloc_initial_table(&security_table); + if (repl == NULL) + return -ENOMEM; + net->ipv6.ip6table_security = + ip6t_register_table(net, &security_table, repl); + kfree(repl); + if (IS_ERR(net->ipv6.ip6table_security)) + return PTR_ERR(net->ipv6.ip6table_security); + + return 0; +} + +static void __net_exit ip6table_security_net_exit(struct net *net) +{ + ip6t_unregister_table(net, net->ipv6.ip6table_security); +} + +static struct pernet_operations ip6table_security_net_ops = { + .init = ip6table_security_net_init, + .exit = ip6table_security_net_exit, +}; + +static int __init ip6table_security_init(void) +{ + int ret; + + ret = register_pernet_subsys(&ip6table_security_net_ops); + if (ret < 0) + return ret; + + sectbl_ops = xt_hook_link(&security_table, ip6table_security_hook); + if (IS_ERR(sectbl_ops)) { + ret = PTR_ERR(sectbl_ops); + goto cleanup_table; + } + + return ret; + +cleanup_table: + unregister_pernet_subsys(&ip6table_security_net_ops); + return ret; +} + +static void __exit ip6table_security_fini(void) +{ + xt_hook_unlink(&security_table, sectbl_ops); + unregister_pernet_subsys(&ip6table_security_net_ops); +} + +module_init(ip6table_security_init); +module_exit(ip6table_security_fini); diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c new file mode 100644 index 00000000..4111050a --- /dev/null +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -0,0 +1,398 @@ +/* + * Copyright (C)2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Author: + * Yasuyuki Kozakai @USAGI + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, + struct nf_conntrack_tuple *tuple) +{ + const u_int32_t *ap; + u_int32_t _addrs[8]; + + ap = skb_header_pointer(skb, nhoff + offsetof(struct ipv6hdr, saddr), + sizeof(_addrs), _addrs); + if (ap == NULL) + return false; + + memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6)); + memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6)); + + return true; +} + +static bool ipv6_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + memcpy(tuple->src.u3.ip6, orig->dst.u3.ip6, sizeof(tuple->src.u3.ip6)); + memcpy(tuple->dst.u3.ip6, orig->src.u3.ip6, sizeof(tuple->dst.u3.ip6)); + + return true; +} + +static int ipv6_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "src=%pI6 dst=%pI6 ", + tuple->src.u3.ip6, tuple->dst.u3.ip6); +} + +/* + * Based on ipv6_skip_exthdr() in net/ipv6/exthdr.c + * + * This function parses (probably truncated) exthdr set "hdr" + * of length "len". "nexthdrp" initially points to some place, + * where type of the first header can be found. + * + * It skips all well-known exthdrs, and returns pointer to the start + * of unparsable area i.e. the first header with unknown type. 
+ * if success, *nexthdr is updated by type/protocol of this header. + * + * NOTES: - it may return pointer pointing beyond end of packet, + * if the last recognized header is truncated in the middle. + * - if packet is truncated, so that all parsed headers are skipped, + * it returns -1. + * - if packet is fragmented, return pointer of the fragment header. + * - ESP is unparsable for now and considered like + * normal payload protocol. + * - Note also special handling of AUTH header. Thanks to IPsec wizards. + */ + +static int nf_ct_ipv6_skip_exthdr(const struct sk_buff *skb, int start, + u8 *nexthdrp, int len) +{ + u8 nexthdr = *nexthdrp; + + while (ipv6_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr hdr; + int hdrlen; + + if (len < (int)sizeof(struct ipv6_opt_hdr)) + return -1; + if (nexthdr == NEXTHDR_NONE) + break; + if (nexthdr == NEXTHDR_FRAGMENT) + break; + if (skb_copy_bits(skb, start, &hdr, sizeof(hdr))) + BUG(); + if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hdr.hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(&hdr); + + nexthdr = hdr.nexthdr; + len -= hdrlen; + start += hdrlen; + } + + *nexthdrp = nexthdr; + return start; +} + +static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, + unsigned int *dataoff, u_int8_t *protonum) +{ + unsigned int extoff = nhoff + sizeof(struct ipv6hdr); + unsigned char pnum; + int protoff; + + if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr), + &pnum, sizeof(pnum)) != 0) { + pr_debug("ip6_conntrack_core: can't get nexthdr\n"); + return -NF_ACCEPT; + } + protoff = nf_ct_ipv6_skip_exthdr(skb, extoff, &pnum, skb->len - extoff); + /* + * (protoff == skb->len) mean that the packet doesn't have no data + * except of IPv6 & ext headers. but it's tracked anyway. - YK + */ + if ((protoff < 0) || (protoff > skb->len)) { + pr_debug("ip6_conntrack_core: can't find proto in pkt\n"); + return -NF_ACCEPT; + } + + *dataoff = protoff; + *protonum = pnum; + return NF_ACCEPT; +} + +static unsigned int ipv6_confirm(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nf_conn *ct; + const struct nf_conn_help *help; + const struct nf_conntrack_helper *helper; + enum ip_conntrack_info ctinfo; + unsigned int ret, protoff; + unsigned int extoff = (u8 *)(ipv6_hdr(skb) + 1) - skb->data; + unsigned char pnum = ipv6_hdr(skb)->nexthdr; + + + /* This is where we call the helper: as the packet goes out. */ + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + goto out; + + help = nfct_help(ct); + if (!help) + goto out; + /* rcu_read_lock()ed by nf_hook_slow */ + helper = rcu_dereference(help->helper); + if (!helper) + goto out; + + protoff = nf_ct_ipv6_skip_exthdr(skb, extoff, &pnum, + skb->len - extoff); + if (protoff > skb->len || pnum == NEXTHDR_FRAGMENT) { + pr_debug("proto header not found\n"); + return NF_ACCEPT; + } + + ret = helper->help(skb, protoff, ct, ctinfo); + if (ret != NF_ACCEPT) { + nf_log_packet(NFPROTO_IPV6, hooknum, skb, in, out, NULL, + "nf_ct_%s: dropping packet", helper->name); + return ret; + } +out: + /* We've seen it coming out the other side: confirm it */ + return nf_conntrack_confirm(skb); +} + +static unsigned int __ipv6_conntrack_in(struct net *net, + unsigned int hooknum, + struct sk_buff *skb, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *reasm = skb->nfct_reasm; + + /* This packet is fragmented and has reassembled packet. */ + if (reasm) { + /* Reassembled packet isn't parsed yet ? 
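+ * Conntrack runs once on the reassembled skb; the resulting nfct
+ * reference is then shared with the individual fragment below.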
*/ + if (!reasm->nfct) { + unsigned int ret; + + ret = nf_conntrack_in(net, PF_INET6, hooknum, reasm); + if (ret != NF_ACCEPT) + return ret; + } + nf_conntrack_get(reasm->nfct); + skb->nfct = reasm->nfct; + skb->nfctinfo = reasm->nfctinfo; + return NF_ACCEPT; + } + + return nf_conntrack_in(net, PF_INET6, hooknum, skb); +} + +static unsigned int ipv6_conntrack_in(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return __ipv6_conntrack_in(dev_net(in), hooknum, skb, okfn); +} + +static unsigned int ipv6_conntrack_local(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* root is playing with raw sockets. */ + if (skb->len < sizeof(struct ipv6hdr)) { + if (net_ratelimit()) + pr_notice("ipv6_conntrack_local: packet too short\n"); + return NF_ACCEPT; + } + return __ipv6_conntrack_in(dev_net(out), hooknum, skb, okfn); +} + +static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = { + { + .hook = ipv6_conntrack_in, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP6_PRI_CONNTRACK, + }, + { + .hook = ipv6_conntrack_local, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP6_PRI_CONNTRACK, + }, + { + .hook = ipv6_confirm, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_LAST, + }, + { + .hook = ipv6_confirm, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP6_PRI_LAST-1, + }, +}; + +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include +#include + +static int ipv6_tuple_to_nlattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) +{ + NLA_PUT(skb, CTA_IP_V6_SRC, sizeof(u_int32_t) * 4, + &tuple->src.u3.ip6); + NLA_PUT(skb, CTA_IP_V6_DST, sizeof(u_int32_t) * 4, + &tuple->dst.u3.ip6); + return 0; + +nla_put_failure: + return -1; +} + +static const struct nla_policy ipv6_nla_policy[CTA_IP_MAX+1] = { + [CTA_IP_V6_SRC] = { .len = sizeof(u_int32_t)*4 }, + [CTA_IP_V6_DST] = { .len = sizeof(u_int32_t)*4 }, +}; + +static int ipv6_nlattr_to_tuple(struct nlattr *tb[], + struct nf_conntrack_tuple *t) +{ + if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST]) + return -EINVAL; + + memcpy(&t->src.u3.ip6, nla_data(tb[CTA_IP_V6_SRC]), + sizeof(u_int32_t) * 4); + memcpy(&t->dst.u3.ip6, nla_data(tb[CTA_IP_V6_DST]), + sizeof(u_int32_t) * 4); + + return 0; +} + +static int ipv6_nlattr_tuple_size(void) +{ + return nla_policy_len(ipv6_nla_policy, CTA_IP_MAX + 1); +} +#endif + +struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = { + .l3proto = PF_INET6, + .name = "ipv6", + .pkt_to_tuple = ipv6_pkt_to_tuple, + .invert_tuple = ipv6_invert_tuple, + .print_tuple = ipv6_print_tuple, + .get_l4proto = ipv6_get_l4proto, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nlattr = ipv6_tuple_to_nlattr, + .nlattr_tuple_size = ipv6_nlattr_tuple_size, + .nlattr_to_tuple = ipv6_nlattr_to_tuple, + .nla_policy = ipv6_nla_policy, +#endif + .me = THIS_MODULE, +}; + +MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6)); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI "); + +static int __init nf_conntrack_l3proto_ipv6_init(void) +{ + int ret = 0; + + need_conntrack(); + nf_defrag_ipv6_enable(); + + ret = 
nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp6); + if (ret < 0) { + pr_err("nf_conntrack_ipv6: can't register tcp.\n"); + return ret; + } + + ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp6); + if (ret < 0) { + pr_err("nf_conntrack_ipv6: can't register udp.\n"); + goto cleanup_tcp; + } + + ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmpv6); + if (ret < 0) { + pr_err("nf_conntrack_ipv6: can't register icmpv6.\n"); + goto cleanup_udp; + } + + ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv6); + if (ret < 0) { + pr_err("nf_conntrack_ipv6: can't register ipv6\n"); + goto cleanup_icmpv6; + } + + ret = nf_register_hooks(ipv6_conntrack_ops, + ARRAY_SIZE(ipv6_conntrack_ops)); + if (ret < 0) { + pr_err("nf_conntrack_ipv6: can't register pre-routing defrag " + "hook.\n"); + goto cleanup_ipv6; + } + return ret; + + cleanup_ipv6: + nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6); + cleanup_icmpv6: + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); + cleanup_udp: + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6); + cleanup_tcp: + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6); + return ret; +} + +static void __exit nf_conntrack_l3proto_ipv6_fini(void) +{ + synchronize_net(); + nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops)); + nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6); + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6); + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6); +} + +module_init(nf_conntrack_l3proto_ipv6_init); +module_exit(nf_conntrack_l3proto_ipv6_fini); diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c new file mode 100644 index 00000000..7c05e7ea --- /dev/null +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -0,0 +1,308 @@ +/* + * Copyright (C)2003,2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Author: + * Yasuyuki Kozakai @USAGI + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned int nf_ct_icmpv6_timeout __read_mostly = 30*HZ; + +static bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + const struct icmp6hdr *hp; + struct icmp6hdr _hdr; + + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return false; + tuple->dst.u.icmp.type = hp->icmp6_type; + tuple->src.u.icmp.id = hp->icmp6_identifier; + tuple->dst.u.icmp.code = hp->icmp6_code; + + return true; +} + +/* Add 1; spaces filled with 0. 
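+ * Each invmap entry stores the inverse ICMPv6 type plus one, so a zero
+ * entry means "no inverse mapping exists".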
*/ +static const u_int8_t invmap[] = { + [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1, + [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1, + [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_REPLY + 1, + [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_QUERY +1 +}; + +static const u_int8_t noct_valid_new[] = { + [ICMPV6_MGM_QUERY - 130] = 1, + [ICMPV6_MGM_REPORT -130] = 1, + [ICMPV6_MGM_REDUCTION - 130] = 1, + [NDISC_ROUTER_SOLICITATION - 130] = 1, + [NDISC_ROUTER_ADVERTISEMENT - 130] = 1, + [NDISC_NEIGHBOUR_SOLICITATION - 130] = 1, + [NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1, + [ICMPV6_MLD2_REPORT - 130] = 1 +}; + +static bool icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + int type = orig->dst.u.icmp.type - 128; + if (type < 0 || type >= sizeof(invmap) || !invmap[type]) + return false; + + tuple->src.u.icmp.id = orig->src.u.icmp.id; + tuple->dst.u.icmp.type = invmap[type] - 1; + tuple->dst.u.icmp.code = orig->dst.u.icmp.code; + return true; +} + +/* Print out the per-protocol part of the tuple. */ +static int icmpv6_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "type=%u code=%u id=%u ", + tuple->dst.u.icmp.type, + tuple->dst.u.icmp.code, + ntohs(tuple->src.u.icmp.id)); +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int icmpv6_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + u_int8_t pf, + unsigned int hooknum) +{ + /* Do not immediately delete the connection after the first + successful reply to avoid excessive conntrackd traffic + and also to handle correctly ICMP echo reply duplicates. */ + nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmpv6_timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff) +{ + static const u_int8_t valid_new[] = { + [ICMPV6_ECHO_REQUEST - 128] = 1, + [ICMPV6_NI_QUERY - 128] = 1 + }; + int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128; + + if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) { + /* Can't create a new ICMPv6 `conn' with this. */ + pr_debug("icmpv6: can't create new conn with type %u\n", + type + 128); + nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple); + if (LOG_INVALID(nf_ct_net(ct), IPPROTO_ICMPV6)) + nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, + "nf_ct_icmpv6: invalid new with type %d ", + type + 128); + return false; + } + return true; +} + +static int +icmpv6_error_message(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, + unsigned int icmp6off, + enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct nf_conntrack_tuple intuple, origtuple; + const struct nf_conntrack_tuple_hash *h; + const struct nf_conntrack_l4proto *inproto; + u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; + + NF_CT_ASSERT(skb->nfct == NULL); + + /* Are they talking about one of our connections? */ + if (!nf_ct_get_tuplepr(skb, + skb_network_offset(skb) + + sizeof(struct ipv6hdr) + + sizeof(struct icmp6hdr), + PF_INET6, &origtuple)) { + pr_debug("icmpv6_error: Can't get tuple\n"); + return -NF_ACCEPT; + } + + /* rcu_read_lock()ed by nf_hook_slow */ + inproto = __nf_ct_l4proto_find(PF_INET6, origtuple.dst.protonum); + + /* Ordinarily, we'd expect the inverted tupleproto, but it's + been preserved inside the ICMP. 
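That is, nf_ct_get_tuplepr() above read the tuple straight out of the embedded copy of the offending packet, which is already in that packet's original direction; a single inversion therefore yields the key for the lookup that follows, and the error is then associated with that connection as IP_CT_RELATED. For reference, the offset handed to nf_ct_get_tuplepr() is simply (illustrative restatement of the call above, not new logic):

        skb_network_offset(skb)
            + sizeof(struct ipv6hdr)      // outer IPv6 header of the error packet
            + sizeof(struct icmp6hdr)     // ICMPv6 error header; the offending packet follows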
*/ + if (!nf_ct_invert_tuple(&intuple, &origtuple, + &nf_conntrack_l3proto_ipv6, inproto)) { + pr_debug("icmpv6_error: Can't invert tuple\n"); + return -NF_ACCEPT; + } + + *ctinfo = IP_CT_RELATED; + + h = nf_conntrack_find_get(net, zone, &intuple); + if (!h) { + pr_debug("icmpv6_error: no match\n"); + return -NF_ACCEPT; + } else { + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + *ctinfo += IP_CT_IS_REPLY; + } + + /* Update skb to refer to this connection */ + skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; + skb->nfctinfo = *ctinfo; + return NF_ACCEPT; +} + +static int +icmpv6_error(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, unsigned int dataoff, + enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) +{ + const struct icmp6hdr *icmp6h; + struct icmp6hdr _ih; + int type; + + icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); + if (icmp6h == NULL) { + if (LOG_INVALID(net, IPPROTO_ICMPV6)) + nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, + "nf_ct_icmpv6: short packet "); + return -NF_ACCEPT; + } + + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && + nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { + if (LOG_INVALID(net, IPPROTO_ICMPV6)) + nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, + "nf_ct_icmpv6: ICMPv6 checksum failed "); + return -NF_ACCEPT; + } + + type = icmp6h->icmp6_type - 130; + if (type >= 0 && type < sizeof(noct_valid_new) && + noct_valid_new[type]) { + skb->nfct = &nf_ct_untracked_get()->ct_general; + skb->nfctinfo = IP_CT_NEW; + nf_conntrack_get(skb->nfct); + return NF_ACCEPT; + } + + /* is not error message ? */ + if (icmp6h->icmp6_type >= 128) + return NF_ACCEPT; + + return icmpv6_error_message(net, tmpl, skb, dataoff, ctinfo, hooknum); +} + +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include +#include +static int icmpv6_tuple_to_nlattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *t) +{ + NLA_PUT_BE16(skb, CTA_PROTO_ICMPV6_ID, t->src.u.icmp.id); + NLA_PUT_U8(skb, CTA_PROTO_ICMPV6_TYPE, t->dst.u.icmp.type); + NLA_PUT_U8(skb, CTA_PROTO_ICMPV6_CODE, t->dst.u.icmp.code); + + return 0; + +nla_put_failure: + return -1; +} + +static const struct nla_policy icmpv6_nla_policy[CTA_PROTO_MAX+1] = { + [CTA_PROTO_ICMPV6_TYPE] = { .type = NLA_U8 }, + [CTA_PROTO_ICMPV6_CODE] = { .type = NLA_U8 }, + [CTA_PROTO_ICMPV6_ID] = { .type = NLA_U16 }, +}; + +static int icmpv6_nlattr_to_tuple(struct nlattr *tb[], + struct nf_conntrack_tuple *tuple) +{ + if (!tb[CTA_PROTO_ICMPV6_TYPE] || + !tb[CTA_PROTO_ICMPV6_CODE] || + !tb[CTA_PROTO_ICMPV6_ID]) + return -EINVAL; + + tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]); + tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]); + tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]); + + if (tuple->dst.u.icmp.type < 128 || + tuple->dst.u.icmp.type - 128 >= sizeof(invmap) || + !invmap[tuple->dst.u.icmp.type - 128]) + return -EINVAL; + + return 0; +} + +static int icmpv6_nlattr_tuple_size(void) +{ + return nla_policy_len(icmpv6_nla_policy, CTA_PROTO_MAX + 1); +} +#endif + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *icmpv6_sysctl_header; +static struct ctl_table icmpv6_sysctl_table[] = { + { + .procname = "nf_conntrack_icmpv6_timeout", + .data = &nf_ct_icmpv6_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif /* CONFIG_SYSCTL */ + +struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly = +{ + .l3proto = 
PF_INET6, + .l4proto = IPPROTO_ICMPV6, + .name = "icmpv6", + .pkt_to_tuple = icmpv6_pkt_to_tuple, + .invert_tuple = icmpv6_invert_tuple, + .print_tuple = icmpv6_print_tuple, + .packet = icmpv6_packet, + .new = icmpv6_new, + .error = icmpv6_error, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nlattr = icmpv6_tuple_to_nlattr, + .nlattr_tuple_size = icmpv6_nlattr_tuple_size, + .nlattr_to_tuple = icmpv6_nlattr_to_tuple, + .nla_policy = icmpv6_nla_policy, +#endif +#ifdef CONFIG_SYSCTL + .ctl_table_header = &icmpv6_sysctl_header, + .ctl_table = icmpv6_sysctl_table, +#endif +}; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c new file mode 100644 index 00000000..08572726 --- /dev/null +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -0,0 +1,651 @@ +/* + * IPv6 fragment reassembly for connection tracking + * + * Copyright (C)2004 USAGI/WIDE Project + * + * Author: + * Yasuyuki Kozakai @USAGI + * + * Based on: net/ipv6/reassembly.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +struct nf_ct_frag6_skb_cb +{ + struct inet6_skb_parm h; + int offset; + struct sk_buff *orig; +}; + +#define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb*)((skb)->cb)) + +struct nf_ct_frag6_queue +{ + struct inet_frag_queue q; + + __be32 id; /* fragment id */ + u32 user; + struct in6_addr saddr; + struct in6_addr daddr; + + unsigned int csum; + __u16 nhoffset; +}; + +static struct inet_frags nf_frags; +static struct netns_frags nf_init_frags; + +#ifdef CONFIG_SYSCTL +static struct ctl_table nf_ct_frag6_sysctl_table[] = { + { + .procname = "nf_conntrack_frag6_timeout", + .data = &nf_init_frags.timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_frag6_low_thresh", + .data = &nf_init_frags.low_thresh, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "nf_conntrack_frag6_high_thresh", + .data = &nf_init_frags.high_thresh, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static struct ctl_table_header *nf_ct_frag6_sysctl_header; +#endif + +static unsigned int nf_hashfn(struct inet_frag_queue *q) +{ + const struct nf_ct_frag6_queue *nq; + + nq = container_of(q, struct nf_ct_frag6_queue, q); + return inet6_hash_frag(nq->id, &nq->saddr, &nq->daddr, nf_frags.rnd); +} + +static void nf_skb_free(struct sk_buff *skb) +{ + if (NFCT_FRAG6_CB(skb)->orig) + kfree_skb(NFCT_FRAG6_CB(skb)->orig); +} + +/* Destruction primitives. */ + +static __inline__ void fq_put(struct nf_ct_frag6_queue *fq) +{ + inet_frag_put(&fq->q, &nf_frags); +} + +/* Kill fq entry. It is not destroyed immediately, + * because caller (and someone more) holds reference count. 
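In practice the two helpers are used as a pair: fq_kill() only unhashes the queue and marks it complete, while the memory is released when the final fq_put() drops the last reference, exactly as the expiry timer below does. The calling pattern, roughly:

        spin_lock(&fq->q.lock);
        fq_kill(fq);                 // unlink from hash and LRU, set INET_FRAG_COMPLETE
        spin_unlock(&fq->q.lock);
        fq_put(fq);                  // drop our reference; freed once nobody else holds one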
+ */ +static __inline__ void fq_kill(struct nf_ct_frag6_queue *fq) +{ + inet_frag_kill(&fq->q, &nf_frags); +} + +static void nf_ct_frag6_evictor(void) +{ + local_bh_disable(); + inet_frag_evictor(&nf_init_frags, &nf_frags); + local_bh_enable(); +} + +static void nf_ct_frag6_expire(unsigned long data) +{ + struct nf_ct_frag6_queue *fq; + + fq = container_of((struct inet_frag_queue *)data, + struct nf_ct_frag6_queue, q); + + spin_lock(&fq->q.lock); + + if (fq->q.last_in & INET_FRAG_COMPLETE) + goto out; + + fq_kill(fq); + +out: + spin_unlock(&fq->q.lock); + fq_put(fq); +} + +/* Creation primitives. */ + +static __inline__ struct nf_ct_frag6_queue * +fq_find(__be32 id, u32 user, struct in6_addr *src, struct in6_addr *dst) +{ + struct inet_frag_queue *q; + struct ip6_create_arg arg; + unsigned int hash; + + arg.id = id; + arg.user = user; + arg.src = src; + arg.dst = dst; + + read_lock_bh(&nf_frags.lock); + hash = inet6_hash_frag(id, src, dst, nf_frags.rnd); + + q = inet_frag_find(&nf_init_frags, &nf_frags, &arg, hash); + local_bh_enable(); + if (q == NULL) + goto oom; + + return container_of(q, struct nf_ct_frag6_queue, q); + +oom: + pr_debug("Can't alloc new queue\n"); + return NULL; +} + + +static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, + const struct frag_hdr *fhdr, int nhoff) +{ + struct sk_buff *prev, *next; + int offset, end; + + if (fq->q.last_in & INET_FRAG_COMPLETE) { + pr_debug("Already completed\n"); + goto err; + } + + offset = ntohs(fhdr->frag_off) & ~0x7; + end = offset + (ntohs(ipv6_hdr(skb)->payload_len) - + ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); + + if ((unsigned int)end > IPV6_MAXPLEN) { + pr_debug("offset is too large.\n"); + return -1; + } + + if (skb->ip_summed == CHECKSUM_COMPLETE) { + const unsigned char *nh = skb_network_header(skb); + skb->csum = csum_sub(skb->csum, + csum_partial(nh, (u8 *)(fhdr + 1) - nh, + 0)); + } + + /* Is this the final fragment? */ + if (!(fhdr->frag_off & htons(IP6_MF))) { + /* If we already have some bits beyond end + * or have different end, the segment is corrupted. + */ + if (end < fq->q.len || + ((fq->q.last_in & INET_FRAG_LAST_IN) && end != fq->q.len)) { + pr_debug("already received last fragment\n"); + goto err; + } + fq->q.last_in |= INET_FRAG_LAST_IN; + fq->q.len = end; + } else { + /* Check if the fragment is rounded to 8 bytes. + * Required by the RFC. + */ + if (end & 0x7) { + /* RFC2460 says always send parameter problem in + * this case. -DaveM + */ + pr_debug("end of fragment not rounded to 8 bytes.\n"); + return -1; + } + if (end > fq->q.len) { + /* Some bits beyond end -> corruption. */ + if (fq->q.last_in & INET_FRAG_LAST_IN) { + pr_debug("last packet already reached.\n"); + goto err; + } + fq->q.len = end; + } + } + + if (end == offset) + goto err; + + /* Point into the IP datagram 'data' part. */ + if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) { + pr_debug("queue: message is too short.\n"); + goto err; + } + if (pskb_trim_rcsum(skb, end - offset)) { + pr_debug("Can't trim\n"); + goto err; + } + + /* Find out which fragments are in front and at the back of us + * in the chain of fragments so far. We must know where to put + * this fragment, right? + */ + prev = fq->q.fragments_tail; + if (!prev || NFCT_FRAG6_CB(prev)->offset < offset) { + next = NULL; + goto found; + } + prev = NULL; + for (next = fq->q.fragments; next != NULL; next = next->next) { + if (NFCT_FRAG6_CB(next)->offset >= offset) + break; /* bingo! 
*/ + prev = next; + } + +found: + /* RFC5722, Section 4: + * When reassembling an IPv6 datagram, if + * one or more its constituent fragments is determined to be an + * overlapping fragment, the entire datagram (and any constituent + * fragments, including those not yet received) MUST be silently + * discarded. + */ + + /* Check for overlap with preceding fragment. */ + if (prev && + (NFCT_FRAG6_CB(prev)->offset + prev->len) > offset) + goto discard_fq; + + /* Look for overlap with succeeding segment. */ + if (next && NFCT_FRAG6_CB(next)->offset < end) + goto discard_fq; + + NFCT_FRAG6_CB(skb)->offset = offset; + + /* Insert this fragment in the chain of fragments. */ + skb->next = next; + if (!next) + fq->q.fragments_tail = skb; + if (prev) + prev->next = skb; + else + fq->q.fragments = skb; + + skb->dev = NULL; + fq->q.stamp = skb->tstamp; + fq->q.meat += skb->len; + atomic_add(skb->truesize, &nf_init_frags.mem); + + /* The first fragment. + * nhoffset is obtained from the first fragment, of course. + */ + if (offset == 0) { + fq->nhoffset = nhoff; + fq->q.last_in |= INET_FRAG_FIRST_IN; + } + write_lock(&nf_frags.lock); + list_move_tail(&fq->q.lru_list, &nf_init_frags.lru_list); + write_unlock(&nf_frags.lock); + return 0; + +discard_fq: + fq_kill(fq); +err: + return -1; +} + +/* + * Check if this packet is complete. + * Returns NULL on failure by any reason, and pointer + * to current nexthdr field in reassembled frame. + * + * It is called with locked fq, and caller must check that + * queue is eligible for reassembly i.e. it is not COMPLETE, + * the last and the first frames arrived and all the bits are here. + */ +static struct sk_buff * +nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) +{ + struct sk_buff *fp, *op, *head = fq->q.fragments; + int payload_len; + + fq_kill(fq); + + WARN_ON(head == NULL); + WARN_ON(NFCT_FRAG6_CB(head)->offset != 0); + + /* Unfragmented part is taken from the first segment. */ + payload_len = ((head->data - skb_network_header(head)) - + sizeof(struct ipv6hdr) + fq->q.len - + sizeof(struct frag_hdr)); + if (payload_len > IPV6_MAXPLEN) { + pr_debug("payload len is too large.\n"); + goto out_oversize; + } + + /* Head of list must not be cloned. */ + if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) { + pr_debug("skb is cloned but can't expand head"); + goto out_oom; + } + + /* If the first fragment is fragmented itself, we split + * it to two chunks: the first with data and paged part + * and the second, holding only fragments. */ + if (skb_has_frag_list(head)) { + struct sk_buff *clone; + int i, plen = 0; + + if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL) { + pr_debug("Can't alloc skb\n"); + goto out_oom; + } + clone->next = head->next; + head->next = clone; + skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; + skb_frag_list_init(head); + for (i=0; inr_frags; i++) + plen += skb_shinfo(head)->frags[i].size; + clone->len = clone->data_len = head->data_len - plen; + head->data_len -= clone->len; + head->len -= clone->len; + clone->csum = 0; + clone->ip_summed = head->ip_summed; + + NFCT_FRAG6_CB(clone)->orig = NULL; + atomic_add(clone->truesize, &nf_init_frags.mem); + } + + /* We have to remove fragment header from datagram and to relocate + * header in order to calculate ICV correctly. 
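Concretely, the lines that follow shift everything that sits in front of the fragment header (link-layer bytes, the IPv6 header and any preceding extension headers) forward by sizeof(struct frag_hdr), i.e. 8 bytes, so the fragment header disappears and the reassembled payload directly follows the unfragmentable part; the previous header's Next Header byte saved at fq->nhoffset is rewritten to the value the fragment header carried, and the mac/network header offsets are bumped by the same 8 bytes. payload_len is fixed up further down to cover the whole reassembled datagram.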
*/ + skb_network_header(head)[fq->nhoffset] = skb_transport_header(head)[0]; + memmove(head->head + sizeof(struct frag_hdr), head->head, + (head->data - head->head) - sizeof(struct frag_hdr)); + head->mac_header += sizeof(struct frag_hdr); + head->network_header += sizeof(struct frag_hdr); + + skb_shinfo(head)->frag_list = head->next; + skb_reset_transport_header(head); + skb_push(head, head->data - skb_network_header(head)); + + for (fp=head->next; fp; fp = fp->next) { + head->data_len += fp->len; + head->len += fp->len; + if (head->ip_summed != fp->ip_summed) + head->ip_summed = CHECKSUM_NONE; + else if (head->ip_summed == CHECKSUM_COMPLETE) + head->csum = csum_add(head->csum, fp->csum); + head->truesize += fp->truesize; + } + atomic_sub(head->truesize, &nf_init_frags.mem); + + head->next = NULL; + head->dev = dev; + head->tstamp = fq->q.stamp; + ipv6_hdr(head)->payload_len = htons(payload_len); + + /* Yes, and fold redundant checksum back. 8) */ + if (head->ip_summed == CHECKSUM_COMPLETE) + head->csum = csum_partial(skb_network_header(head), + skb_network_header_len(head), + head->csum); + + fq->q.fragments = NULL; + fq->q.fragments_tail = NULL; + + /* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */ + fp = skb_shinfo(head)->frag_list; + if (fp && NFCT_FRAG6_CB(fp)->orig == NULL) + /* at above code, head skb is divided into two skbs. */ + fp = fp->next; + + op = NFCT_FRAG6_CB(head)->orig; + for (; fp; fp = fp->next) { + struct sk_buff *orig = NFCT_FRAG6_CB(fp)->orig; + + op->next = orig; + op = orig; + NFCT_FRAG6_CB(fp)->orig = NULL; + } + + return head; + +out_oversize: + if (net_ratelimit()) + printk(KERN_DEBUG "nf_ct_frag6_reasm: payload len = %d\n", payload_len); + goto out_fail; +out_oom: + if (net_ratelimit()) + printk(KERN_DEBUG "nf_ct_frag6_reasm: no memory for reassembly\n"); +out_fail: + return NULL; +} + +/* + * find the header just before Fragment Header. + * + * if success return 0 and set ... + * (*prevhdrp): the value of "Next Header Field" in the header + * just before Fragment Header. + * (*prevhoff): the offset of "Next Header Field" in the header + * just before Fragment Header. + * (*fhoff) : the offset of Fragment Header. 
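The walk below sizes each extension header the way RFC 2460 and RFC 4302 define it: a generic extension header states its length in 8-octet units not counting the first 8 octets, while an Authentication Header states it in 4-octet units not counting the first two. Worked out (illustration only; this is what the two branches below compute):

        // hop-by-hop, routing, destination options (ipv6_optlen()):
        hdrlen = (hdr.hdrlen + 1) << 3;    // e.g. a hdrlen field of 1 means 16 bytes
        // authentication header (NEXTHDR_AUTH):
        hdrlen = (hdr.hdrlen + 2) << 2;    // e.g. a hdrlen field of 4 means 24 bytes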
+ * + * Based on ipv6_skip_hdr() in net/ipv6/exthdr.c + * + */ +static int +find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) +{ + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + const int netoff = skb_network_offset(skb); + u8 prev_nhoff = netoff + offsetof(struct ipv6hdr, nexthdr); + int start = netoff + sizeof(struct ipv6hdr); + int len = skb->len - start; + u8 prevhdr = NEXTHDR_IPV6; + + while (nexthdr != NEXTHDR_FRAGMENT) { + struct ipv6_opt_hdr hdr; + int hdrlen; + + if (!ipv6_ext_hdr(nexthdr)) { + return -1; + } + if (nexthdr == NEXTHDR_NONE) { + pr_debug("next header is none\n"); + return -1; + } + if (len < (int)sizeof(struct ipv6_opt_hdr)) { + pr_debug("too short\n"); + return -1; + } + if (skb_copy_bits(skb, start, &hdr, sizeof(hdr))) + BUG(); + if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hdr.hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(&hdr); + + prevhdr = nexthdr; + prev_nhoff = start; + + nexthdr = hdr.nexthdr; + len -= hdrlen; + start += hdrlen; + } + + if (len < 0) + return -1; + + *prevhdrp = prevhdr; + *prevhoff = prev_nhoff; + *fhoff = start; + + return 0; +} + +struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user) +{ + struct sk_buff *clone; + struct net_device *dev = skb->dev; + struct frag_hdr *fhdr; + struct nf_ct_frag6_queue *fq; + struct ipv6hdr *hdr; + int fhoff, nhoff; + u8 prevhdr; + struct sk_buff *ret_skb = NULL; + + /* Jumbo payload inhibits frag. header */ + if (ipv6_hdr(skb)->payload_len == 0) { + pr_debug("payload len = 0\n"); + return skb; + } + + if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0) + return skb; + + clone = skb_clone(skb, GFP_ATOMIC); + if (clone == NULL) { + pr_debug("Can't clone skb\n"); + return skb; + } + + NFCT_FRAG6_CB(clone)->orig = skb; + + if (!pskb_may_pull(clone, fhoff + sizeof(*fhdr))) { + pr_debug("message is too short.\n"); + goto ret_orig; + } + + skb_set_transport_header(clone, fhoff); + hdr = ipv6_hdr(clone); + fhdr = (struct frag_hdr *)skb_transport_header(clone); + + if (atomic_read(&nf_init_frags.mem) > nf_init_frags.high_thresh) + nf_ct_frag6_evictor(); + + fq = fq_find(fhdr->identification, user, &hdr->saddr, &hdr->daddr); + if (fq == NULL) { + pr_debug("Can't find and can't create new queue\n"); + goto ret_orig; + } + + spin_lock_bh(&fq->q.lock); + + if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) { + spin_unlock_bh(&fq->q.lock); + pr_debug("Can't insert skb to queue\n"); + fq_put(fq); + goto ret_orig; + } + + if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && + fq->q.meat == fq->q.len) { + ret_skb = nf_ct_frag6_reasm(fq, dev); + if (ret_skb == NULL) + pr_debug("Can't reassemble fragmented packets\n"); + } + spin_unlock_bh(&fq->q.lock); + + fq_put(fq); + return ret_skb; + +ret_orig: + kfree_skb(clone); + return skb; +} + +void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb, + struct net_device *in, struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *s, *s2; + + for (s = NFCT_FRAG6_CB(skb)->orig; s;) { + nf_conntrack_put_reasm(s->nfct_reasm); + nf_conntrack_get_reasm(skb); + s->nfct_reasm = skb; + + s2 = s->next; + s->next = NULL; + + NF_HOOK_THRESH(NFPROTO_IPV6, hooknum, s, in, out, okfn, + NF_IP6_PRI_CONNTRACK_DEFRAG + 1); + s = s2; + } + nf_conntrack_put_reasm(skb); +} + +int nf_ct_frag6_init(void) +{ + nf_frags.hashfn = nf_hashfn; + nf_frags.constructor = ip6_frag_init; + nf_frags.destructor = NULL; + nf_frags.skb_free = nf_skb_free; + nf_frags.qsize = sizeof(struct nf_ct_frag6_queue); + nf_frags.match = ip6_frag_match; + 
nf_frags.frag_expire = nf_ct_frag6_expire; + nf_frags.secret_interval = 10 * 60 * HZ; + nf_init_frags.timeout = IPV6_FRAG_TIMEOUT; + nf_init_frags.high_thresh = IPV6_FRAG_HIGH_THRESH; + nf_init_frags.low_thresh = IPV6_FRAG_LOW_THRESH; + inet_frags_init_net(&nf_init_frags); + inet_frags_init(&nf_frags); + +#ifdef CONFIG_SYSCTL + nf_ct_frag6_sysctl_header = register_sysctl_paths(nf_net_netfilter_sysctl_path, + nf_ct_frag6_sysctl_table); + if (!nf_ct_frag6_sysctl_header) { + inet_frags_fini(&nf_frags); + return -ENOMEM; + } +#endif + + return 0; +} + +void nf_ct_frag6_cleanup(void) +{ +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(nf_ct_frag6_sysctl_header); + nf_ct_frag6_sysctl_header = NULL; +#endif + inet_frags_fini(&nf_frags); + + nf_init_frags.low_thresh = 0; + nf_ct_frag6_evictor(); +} diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c new file mode 100644 index 00000000..cdd6d045 --- /dev/null +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -0,0 +1,137 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#include +#include +#include +#include +#include +#include +#endif +#include +#include + +static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, + struct sk_buff *skb) +{ + u16 zone = NF_CT_DEFAULT_ZONE; + +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + if (skb->nfct) + zone = nf_ct_zone((struct nf_conn *)skb->nfct); +#endif + +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge && + skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) + return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone; +#endif + if (hooknum == NF_INET_PRE_ROUTING) + return IP6_DEFRAG_CONNTRACK_IN + zone; + else + return IP6_DEFRAG_CONNTRACK_OUT + zone; + +} + +static unsigned int ipv6_defrag(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *reasm; + +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + /* Previously seen (loopback)? 
*/ + if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) + return NF_ACCEPT; +#endif + + reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(hooknum, skb)); + /* queued */ + if (reasm == NULL) + return NF_STOLEN; + + /* error occurred or not fragmented */ + if (reasm == skb) + return NF_ACCEPT; + + nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in, + (struct net_device *)out, okfn); + + return NF_STOLEN; +} + +static struct nf_hook_ops ipv6_defrag_ops[] = { + { + .hook = ipv6_defrag, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, + }, + { + .hook = ipv6_defrag, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, + }, +}; + +static int __init nf_defrag_init(void) +{ + int ret = 0; + + ret = nf_ct_frag6_init(); + if (ret < 0) { + pr_err("nf_defrag_ipv6: can't initialize frag6.\n"); + return ret; + } + ret = nf_register_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); + if (ret < 0) { + pr_err("nf_defrag_ipv6: can't register hooks\n"); + goto cleanup_frag6; + } + return ret; + +cleanup_frag6: + nf_ct_frag6_cleanup(); + return ret; + +} + +static void __exit nf_defrag_fini(void) +{ + nf_unregister_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); + nf_ct_frag6_cleanup(); +} + +void nf_defrag_ipv6_enable(void) +{ +} +EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable); + +module_init(nf_defrag_init); +module_exit(nf_defrag_fini); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c new file mode 100644 index 00000000..18ff5df7 --- /dev/null +++ b/net/ipv6/proc.c @@ -0,0 +1,342 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * This file implements the various access functions for the + * PROC file system. This is very similar to the IPv4 version, + * except it reports the sockets in the INET6 address family. + * + * Authors: David S. Miller (davem@caip.rutgers.edu) + * YOSHIFUJI Hideaki + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
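The code below creates three read-only views per network namespace: /proc/net/sockstat6 (socket counts), /proc/net/snmp6 (the RFC 2465/2466 MIB counters) and /proc/net/dev_snmp6/<interface> (the same counters per device). All of them are plain text with one counter per line, so consuming them needs nothing more than (illustrative userspace sketch; file name and buffer size arbitrary):

        #include <stdio.h>

        FILE *f = fopen("/proc/net/snmp6", "r");
        char line[128];

        while (f && fgets(line, sizeof(line), f))
                fputs(line, stdout);      // one counter per line, e.g. "Ip6InReceives   12345"
        if (f)
                fclose(f);

The per-type ICMPv6 counters in that output ("Icmp6InType133", "Icmp6OutEchoReplies", ...) come from snmp6_seq_show_icmpv6msg() further down, which encodes the MIB index as: low byte = ICMPv6 type, bit 8 set = outgoing direction.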
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int sockstat6_seq_show(struct seq_file *seq, void *v) +{ + struct net *net = seq->private; + + seq_printf(seq, "TCP6: inuse %d\n", + sock_prot_inuse_get(net, &tcpv6_prot)); + seq_printf(seq, "UDP6: inuse %d\n", + sock_prot_inuse_get(net, &udpv6_prot)); + seq_printf(seq, "UDPLITE6: inuse %d\n", + sock_prot_inuse_get(net, &udplitev6_prot)); + seq_printf(seq, "RAW6: inuse %d\n", + sock_prot_inuse_get(net, &rawv6_prot)); + seq_printf(seq, "FRAG6: inuse %d memory %d\n", + ip6_frag_nqueues(net), ip6_frag_mem(net)); + return 0; +} + +static int sockstat6_seq_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, sockstat6_seq_show); +} + +static const struct file_operations sockstat6_seq_fops = { + .owner = THIS_MODULE, + .open = sockstat6_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; + +static const struct snmp_mib snmp6_ipstats_list[] = { +/* ipv6 mib according to RFC 2465 */ + SNMP_MIB_ITEM("Ip6InReceives", IPSTATS_MIB_INPKTS), + SNMP_MIB_ITEM("Ip6InHdrErrors", IPSTATS_MIB_INHDRERRORS), + SNMP_MIB_ITEM("Ip6InTooBigErrors", IPSTATS_MIB_INTOOBIGERRORS), + SNMP_MIB_ITEM("Ip6InNoRoutes", IPSTATS_MIB_INNOROUTES), + SNMP_MIB_ITEM("Ip6InAddrErrors", IPSTATS_MIB_INADDRERRORS), + SNMP_MIB_ITEM("Ip6InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS), + SNMP_MIB_ITEM("Ip6InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS), + SNMP_MIB_ITEM("Ip6InDiscards", IPSTATS_MIB_INDISCARDS), + SNMP_MIB_ITEM("Ip6InDelivers", IPSTATS_MIB_INDELIVERS), + SNMP_MIB_ITEM("Ip6OutForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS), + SNMP_MIB_ITEM("Ip6OutRequests", IPSTATS_MIB_OUTPKTS), + SNMP_MIB_ITEM("Ip6OutDiscards", IPSTATS_MIB_OUTDISCARDS), + SNMP_MIB_ITEM("Ip6OutNoRoutes", IPSTATS_MIB_OUTNOROUTES), + SNMP_MIB_ITEM("Ip6ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT), + SNMP_MIB_ITEM("Ip6ReasmReqds", IPSTATS_MIB_REASMREQDS), + SNMP_MIB_ITEM("Ip6ReasmOKs", IPSTATS_MIB_REASMOKS), + SNMP_MIB_ITEM("Ip6ReasmFails", IPSTATS_MIB_REASMFAILS), + SNMP_MIB_ITEM("Ip6FragOKs", IPSTATS_MIB_FRAGOKS), + SNMP_MIB_ITEM("Ip6FragFails", IPSTATS_MIB_FRAGFAILS), + SNMP_MIB_ITEM("Ip6FragCreates", IPSTATS_MIB_FRAGCREATES), + SNMP_MIB_ITEM("Ip6InMcastPkts", IPSTATS_MIB_INMCASTPKTS), + SNMP_MIB_ITEM("Ip6OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS), + SNMP_MIB_ITEM("Ip6InOctets", IPSTATS_MIB_INOCTETS), + SNMP_MIB_ITEM("Ip6OutOctets", IPSTATS_MIB_OUTOCTETS), + SNMP_MIB_ITEM("Ip6InMcastOctets", IPSTATS_MIB_INMCASTOCTETS), + SNMP_MIB_ITEM("Ip6OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), + SNMP_MIB_ITEM("Ip6InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), + SNMP_MIB_ITEM("Ip6OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), + SNMP_MIB_SENTINEL +}; + +static const struct snmp_mib snmp6_icmp6_list[] = { +/* icmpv6 mib according to RFC 2466 */ + SNMP_MIB_ITEM("Icmp6InMsgs", ICMP6_MIB_INMSGS), + SNMP_MIB_ITEM("Icmp6InErrors", ICMP6_MIB_INERRORS), + SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS), + SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS), + SNMP_MIB_SENTINEL +}; + +/* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */ +static const char *const icmp6type2name[256] = { + [ICMPV6_DEST_UNREACH] = "DestUnreachs", + [ICMPV6_PKT_TOOBIG] = "PktTooBigs", + [ICMPV6_TIME_EXCEED] = "TimeExcds", + [ICMPV6_PARAMPROB] = "ParmProblems", + [ICMPV6_ECHO_REQUEST] = "Echos", + [ICMPV6_ECHO_REPLY] = "EchoReplies", + [ICMPV6_MGM_QUERY] = "GroupMembQueries", + 
[ICMPV6_MGM_REPORT] = "GroupMembResponses", + [ICMPV6_MGM_REDUCTION] = "GroupMembReductions", + [ICMPV6_MLD2_REPORT] = "MLDv2Reports", + [NDISC_ROUTER_ADVERTISEMENT] = "RouterAdvertisements", + [NDISC_ROUTER_SOLICITATION] = "RouterSolicits", + [NDISC_NEIGHBOUR_ADVERTISEMENT] = "NeighborAdvertisements", + [NDISC_NEIGHBOUR_SOLICITATION] = "NeighborSolicits", + [NDISC_REDIRECT] = "Redirects", +}; + + +static const struct snmp_mib snmp6_udp6_list[] = { + SNMP_MIB_ITEM("Udp6InDatagrams", UDP_MIB_INDATAGRAMS), + SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS), + SNMP_MIB_ITEM("Udp6InErrors", UDP_MIB_INERRORS), + SNMP_MIB_ITEM("Udp6OutDatagrams", UDP_MIB_OUTDATAGRAMS), + SNMP_MIB_ITEM("Udp6RcvbufErrors", UDP_MIB_RCVBUFERRORS), + SNMP_MIB_ITEM("Udp6SndbufErrors", UDP_MIB_SNDBUFERRORS), + SNMP_MIB_SENTINEL +}; + +static const struct snmp_mib snmp6_udplite6_list[] = { + SNMP_MIB_ITEM("UdpLite6InDatagrams", UDP_MIB_INDATAGRAMS), + SNMP_MIB_ITEM("UdpLite6NoPorts", UDP_MIB_NOPORTS), + SNMP_MIB_ITEM("UdpLite6InErrors", UDP_MIB_INERRORS), + SNMP_MIB_ITEM("UdpLite6OutDatagrams", UDP_MIB_OUTDATAGRAMS), + SNMP_MIB_ITEM("UdpLite6RcvbufErrors", UDP_MIB_RCVBUFERRORS), + SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS), + SNMP_MIB_SENTINEL +}; + +/* can be called either with percpu mib (pcpumib != NULL), + * or shared one (smib != NULL) + */ +static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, void __percpu **pcpumib, + atomic_long_t *smib) +{ + char name[32]; + int i; + + /* print by name -- deprecated items */ + for (i = 0; i < ICMP6MSG_MIB_MAX; i++) { + int icmptype; + const char *p; + + icmptype = i & 0xff; + p = icmp6type2name[icmptype]; + if (!p) /* don't print un-named types here */ + continue; + snprintf(name, sizeof(name), "Icmp6%s%s", + i & 0x100 ? "Out" : "In", p); + seq_printf(seq, "%-32s\t%lu\n", name, + pcpumib ? snmp_fold_field(pcpumib, i) : atomic_long_read(smib + i)); + } + + /* print by number (nonzero only) - ICMPMsgStat format */ + for (i = 0; i < ICMP6MSG_MIB_MAX; i++) { + unsigned long val; + + val = pcpumib ? snmp_fold_field(pcpumib, i) : atomic_long_read(smib + i); + if (!val) + continue; + snprintf(name, sizeof(name), "Icmp6%sType%u", + i & 0x100 ? "Out" : "In", i & 0xff); + seq_printf(seq, "%-32s\t%lu\n", name, val); + } +} + +/* can be called either with percpu mib (pcpumib != NULL), + * or shared one (smib != NULL) + */ +static void snmp6_seq_show_item(struct seq_file *seq, void __percpu **pcpumib, + atomic_long_t *smib, + const struct snmp_mib *itemlist) +{ + int i; + unsigned long val; + + for (i = 0; itemlist[i].name; i++) { + val = pcpumib ? 
+ snmp_fold_field(pcpumib, itemlist[i].entry) : + atomic_long_read(smib + itemlist[i].entry); + seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, val); + } +} + +static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu **mib, + const struct snmp_mib *itemlist, size_t syncpoff) +{ + int i; + + for (i = 0; itemlist[i].name; i++) + seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, + snmp_fold_field64(mib, itemlist[i].entry, syncpoff)); +} + +static int snmp6_seq_show(struct seq_file *seq, void *v) +{ + struct net *net = (struct net *)seq->private; + + snmp6_seq_show_item64(seq, (void __percpu **)net->mib.ipv6_statistics, + snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); + snmp6_seq_show_item(seq, (void __percpu **)net->mib.icmpv6_statistics, + NULL, snmp6_icmp6_list); + snmp6_seq_show_icmpv6msg(seq, + (void __percpu **)net->mib.icmpv6msg_statistics, NULL); + snmp6_seq_show_item(seq, (void __percpu **)net->mib.udp_stats_in6, + NULL, snmp6_udp6_list); + snmp6_seq_show_item(seq, (void __percpu **)net->mib.udplite_stats_in6, + NULL, snmp6_udplite6_list); + return 0; +} + +static int snmp6_seq_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, snmp6_seq_show); +} + +static const struct file_operations snmp6_seq_fops = { + .owner = THIS_MODULE, + .open = snmp6_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; + +static int snmp6_dev_seq_show(struct seq_file *seq, void *v) +{ + struct inet6_dev *idev = (struct inet6_dev *)seq->private; + + seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex); + snmp6_seq_show_item(seq, (void __percpu **)idev->stats.ipv6, NULL, + snmp6_ipstats_list); + snmp6_seq_show_item(seq, NULL, idev->stats.icmpv6dev->mibs, + snmp6_icmp6_list); + snmp6_seq_show_icmpv6msg(seq, NULL, idev->stats.icmpv6msgdev->mibs); + return 0; +} + +static int snmp6_dev_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, snmp6_dev_seq_show, PDE(inode)->data); +} + +static const struct file_operations snmp6_dev_seq_fops = { + .owner = THIS_MODULE, + .open = snmp6_dev_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +int snmp6_register_dev(struct inet6_dev *idev) +{ + struct proc_dir_entry *p; + struct net *net; + + if (!idev || !idev->dev) + return -EINVAL; + + net = dev_net(idev->dev); + if (!net->mib.proc_net_devsnmp6) + return -ENOENT; + + p = proc_create_data(idev->dev->name, S_IRUGO, + net->mib.proc_net_devsnmp6, + &snmp6_dev_seq_fops, idev); + if (!p) + return -ENOMEM; + + idev->stats.proc_dir_entry = p; + return 0; +} + +int snmp6_unregister_dev(struct inet6_dev *idev) +{ + struct net *net = dev_net(idev->dev); + if (!net->mib.proc_net_devsnmp6) + return -ENOENT; + if (!idev->stats.proc_dir_entry) + return -EINVAL; + remove_proc_entry(idev->stats.proc_dir_entry->name, + net->mib.proc_net_devsnmp6); + idev->stats.proc_dir_entry = NULL; + return 0; +} + +static int __net_init ipv6_proc_init_net(struct net *net) +{ + if (!proc_net_fops_create(net, "sockstat6", S_IRUGO, + &sockstat6_seq_fops)) + return -ENOMEM; + + if (!proc_net_fops_create(net, "snmp6", S_IRUGO, &snmp6_seq_fops)) + goto proc_snmp6_fail; + + net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net); + if (!net->mib.proc_net_devsnmp6) + goto proc_dev_snmp6_fail; + return 0; + +proc_snmp6_fail: + proc_net_remove(net, "sockstat6"); +proc_dev_snmp6_fail: + proc_net_remove(net, "dev_snmp6"); + return -ENOMEM; +} + +static void __net_exit 
ipv6_proc_exit_net(struct net *net) +{ + proc_net_remove(net, "sockstat6"); + proc_net_remove(net, "dev_snmp6"); + proc_net_remove(net, "snmp6"); +} + +static struct pernet_operations ipv6_proc_ops = { + .init = ipv6_proc_init_net, + .exit = ipv6_proc_exit_net, +}; + +int __init ipv6_misc_proc_init(void) +{ + return register_pernet_subsys(&ipv6_proc_ops); +} + +void ipv6_misc_proc_exit(void) +{ + unregister_pernet_subsys(&ipv6_proc_ops); +} + diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c new file mode 100644 index 00000000..9a7978fd --- /dev/null +++ b/net/ipv6/protocol.c @@ -0,0 +1,54 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * PF_INET6 protocol dispatch tables. + * + * Authors: Pedro Roque + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * + * Vince Laviano (vince@cs.stanford.edu) 16 May 2001 + * - Removed unused variable 'inet6_protocol_base' + * - Modified inet6_del_protocol() to correctly maintain copy bit. + */ +#include +#include +#include +#include + +const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly; + +int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol) +{ + int hash = protocol & (MAX_INET_PROTOS - 1); + + return !cmpxchg((const struct inet6_protocol **)&inet6_protos[hash], + NULL, prot) ? 0 : -1; +} +EXPORT_SYMBOL(inet6_add_protocol); + +/* + * Remove a protocol from the hash tables. + */ + +int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char protocol) +{ + int ret, hash = protocol & (MAX_INET_PROTOS - 1); + + ret = (cmpxchg((const struct inet6_protocol **)&inet6_protos[hash], + prot, NULL) == prot) ? 0 : -1; + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(inet6_del_protocol); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c new file mode 100644 index 00000000..cc7313b8 --- /dev/null +++ b/net/ipv6/raw.c @@ -0,0 +1,1370 @@ +/* + * RAW sockets for IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * + * Adapted from linux/net/ipv4/raw.c + * + * Fixes: + * Hideaki YOSHIFUJI : sin6_scope_id support + * YOSHIFUJI,H.@USAGI : raw checksum (RFC2292(bis) compliance) + * Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
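From userspace this file is what sits behind socket(AF_INET6, SOCK_RAW, protocol). One detail worth stating up front: the 256-bit per-socket ICMPv6 type filter consulted by icmpv6_filter() below treats a set bit as "block", which matches the RFC 3542 macros shipped in <netinet/icmp6.h>. A minimal illustrative user, a ping-style socket that only wants echo replies (error checking omitted):

        int fd = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);
        struct icmp6_filter f;

        ICMP6_FILTER_SETBLOCKALL(&f);                   // block every ICMPv6 type...
        ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &f);     // ...except echo replies
        setsockopt(fd, IPPROTO_ICMPV6, ICMP6_FILTER, &f, sizeof(f));

rawv6_seticmpfilter() further down is the kernel half of that setsockopt().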
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#include +#endif +#include + +#include +#include +#include + +#include +#include + +static struct raw_hashinfo raw_v6_hashinfo = { + .lock = __RW_LOCK_UNLOCKED(raw_v6_hashinfo.lock), +}; + +static struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, + unsigned short num, const struct in6_addr *loc_addr, + const struct in6_addr *rmt_addr, int dif) +{ + struct hlist_node *node; + int is_multicast = ipv6_addr_is_multicast(loc_addr); + + sk_for_each_from(sk, node) + if (inet_sk(sk)->inet_num == num) { + struct ipv6_pinfo *np = inet6_sk(sk); + + if (!net_eq(sock_net(sk), net)) + continue; + + if (!ipv6_addr_any(&np->daddr) && + !ipv6_addr_equal(&np->daddr, rmt_addr)) + continue; + + if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) + continue; + + if (!ipv6_addr_any(&np->rcv_saddr)) { + if (ipv6_addr_equal(&np->rcv_saddr, loc_addr)) + goto found; + if (is_multicast && + inet6_mc_check(sk, loc_addr, rmt_addr)) + goto found; + continue; + } + goto found; + } + sk = NULL; +found: + return sk; +} + +/* + * 0 - deliver + * 1 - block + */ +static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb) +{ + struct icmp6hdr *icmph; + struct raw6_sock *rp = raw6_sk(sk); + + if (pskb_may_pull(skb, sizeof(struct icmp6hdr))) { + __u32 *data = &rp->filter.data[0]; + int bit_nr; + + icmph = (struct icmp6hdr *) skb->data; + bit_nr = icmph->icmp6_type; + + return (data[bit_nr >> 5] & (1 << (bit_nr & 31))) != 0; + } + return 0; +} + +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +typedef int mh_filter_t(struct sock *sock, struct sk_buff *skb); + +static mh_filter_t __rcu *mh_filter __read_mostly; + +int rawv6_mh_filter_register(mh_filter_t filter) +{ + rcu_assign_pointer(mh_filter, filter); + return 0; +} +EXPORT_SYMBOL(rawv6_mh_filter_register); + +int rawv6_mh_filter_unregister(mh_filter_t filter) +{ + rcu_assign_pointer(mh_filter, NULL); + synchronize_rcu(); + return 0; +} +EXPORT_SYMBOL(rawv6_mh_filter_unregister); + +#endif + +/* + * demultiplex raw sockets. + * (should consider queueing the skb in the sock receive_queue + * without calling rawv6.c) + * + * Caller owns SKB so we must make clones. + */ +static int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) +{ + const struct in6_addr *saddr; + const struct in6_addr *daddr; + struct sock *sk; + int delivered = 0; + __u8 hash; + struct net *net; + + saddr = &ipv6_hdr(skb)->saddr; + daddr = saddr + 1; + + hash = nexthdr & (MAX_INET_PROTOS - 1); + + read_lock(&raw_v6_hashinfo.lock); + sk = sk_head(&raw_v6_hashinfo.ht[hash]); + + if (sk == NULL) + goto out; + + net = dev_net(skb->dev); + sk = __raw_v6_lookup(net, sk, nexthdr, daddr, saddr, IP6CB(skb)->iif); + + while (sk) { + int filtered; + + delivered = 1; + switch (nexthdr) { + case IPPROTO_ICMPV6: + filtered = icmpv6_filter(sk, skb); + break; + +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + case IPPROTO_MH: + { + /* XXX: To validate MH only once for each packet, + * this is placed here. It should be after checking + * xfrm policy, however it doesn't. The checking xfrm + * policy is placed in rawv6_rcv() because it is + * required for each socket. 
+ */ + mh_filter_t *filter; + + filter = rcu_dereference(mh_filter); + filtered = filter ? (*filter)(sk, skb) : 0; + break; + } +#endif + default: + filtered = 0; + break; + } + + if (filtered < 0) + break; + if (filtered == 0) { + struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); + + /* Not releasing hash table! */ + if (clone) { + nf_reset(clone); + rawv6_rcv(sk, clone); + } + } + sk = __raw_v6_lookup(net, sk_next(sk), nexthdr, daddr, saddr, + IP6CB(skb)->iif); + } +out: + read_unlock(&raw_v6_hashinfo.lock); + return delivered; +} + +int raw6_local_deliver(struct sk_buff *skb, int nexthdr) +{ + struct sock *raw_sk; + + raw_sk = sk_head(&raw_v6_hashinfo.ht[nexthdr & (MAX_INET_PROTOS - 1)]); + if (raw_sk && !ipv6_raw_deliver(skb, nexthdr)) + raw_sk = NULL; + + return raw_sk != NULL; +} + +/* This cleans up af_inet6 a bit. -DaveM */ +static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr; + __be32 v4addr = 0; + int addr_type; + int err; + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + addr_type = ipv6_addr_type(&addr->sin6_addr); + + /* Raw sockets are IPv6 only */ + if (addr_type == IPV6_ADDR_MAPPED) + return -EADDRNOTAVAIL; + + lock_sock(sk); + + err = -EINVAL; + if (sk->sk_state != TCP_CLOSE) + goto out; + + rcu_read_lock(); + /* Check if the address belongs to the host. */ + if (addr_type != IPV6_ADDR_ANY) { + struct net_device *dev = NULL; + + if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (addr_len >= sizeof(struct sockaddr_in6) && + addr->sin6_scope_id) { + /* Override any existing binding, if another + * one is supplied by user. + */ + sk->sk_bound_dev_if = addr->sin6_scope_id; + } + + /* Binding to link-local address requires an interface */ + if (!sk->sk_bound_dev_if) + goto out_unlock; + + err = -ENODEV; + dev = dev_get_by_index_rcu(sock_net(sk), + sk->sk_bound_dev_if); + if (!dev) + goto out_unlock; + } + + /* ipv4 addr of the socket is invalid. Only the + * unspecified and mapped address have a v4 equivalent. + */ + v4addr = LOOPBACK4_IPV6; + if (!(addr_type & IPV6_ADDR_MULTICAST)) { + err = -EADDRNOTAVAIL; + if (!ipv6_chk_addr(sock_net(sk), &addr->sin6_addr, + dev, 0)) { + goto out_unlock; + } + } + } + + inet->inet_rcv_saddr = inet->inet_saddr = v4addr; + ipv6_addr_copy(&np->rcv_saddr, &addr->sin6_addr); + if (!(addr_type & IPV6_ADDR_MULTICAST)) + ipv6_addr_copy(&np->saddr, &addr->sin6_addr); + err = 0; +out_unlock: + rcu_read_unlock(); +out: + release_sock(sk); + return err; +} + +static void rawv6_err(struct sock *sk, struct sk_buff *skb, + struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + int err; + int harderr; + + /* Report error on raw socket, if: + 1. User requested recverr. + 2. Socket is connected (otherwise the error indication + is useless without recverr and error is hard. 
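Put differently: without IPV6_RECVERR only connected sockets hear about errors, and then only when the error is hard; once the application opts in, every error is reported and a copy is queued so it can be read back later. The userspace side is the standard pattern (illustrative):

        int on = 1;

        setsockopt(fd, IPPROTO_IPV6, IPV6_RECVERR, &on, sizeof(on));
        // later: recvmsg(fd, &msg, MSG_ERRQUEUE) returns the queued error
        // together with the offending payload that ipv6_icmp_error() saved below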
+ */ + if (!np->recverr && sk->sk_state != TCP_ESTABLISHED) + return; + + harderr = icmpv6_err_convert(type, code, &err); + if (type == ICMPV6_PKT_TOOBIG) + harderr = (np->pmtudisc == IPV6_PMTUDISC_DO); + + if (np->recverr) { + u8 *payload = skb->data; + if (!inet->hdrincl) + payload += offset; + ipv6_icmp_error(sk, skb, err, 0, ntohl(info), payload); + } + + if (np->recverr || harderr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } +} + +void raw6_icmp_error(struct sk_buff *skb, int nexthdr, + u8 type, u8 code, int inner_offset, __be32 info) +{ + struct sock *sk; + int hash; + const struct in6_addr *saddr, *daddr; + struct net *net; + + hash = nexthdr & (RAW_HTABLE_SIZE - 1); + + read_lock(&raw_v6_hashinfo.lock); + sk = sk_head(&raw_v6_hashinfo.ht[hash]); + if (sk != NULL) { + /* Note: ipv6_hdr(skb) != skb->data */ + const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data; + saddr = &ip6h->saddr; + daddr = &ip6h->daddr; + net = dev_net(skb->dev); + + while ((sk = __raw_v6_lookup(net, sk, nexthdr, saddr, daddr, + IP6CB(skb)->iif))) { + rawv6_err(sk, skb, NULL, type, code, + inner_offset, info); + sk = sk_next(sk); + } + } + read_unlock(&raw_v6_hashinfo.lock); +} + +static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) +{ + if ((raw6_sk(sk)->checksum || rcu_dereference_raw(sk->sk_filter)) && + skb_checksum_complete(skb)) { + atomic_inc(&sk->sk_drops); + kfree_skb(skb); + return NET_RX_DROP; + } + + /* Charge it to the socket. */ + if (ip_queue_rcv_skb(sk, skb) < 0) { + kfree_skb(skb); + return NET_RX_DROP; + } + + return 0; +} + +/* + * This is next to useless... + * if we demultiplex in network layer we don't need the extra call + * just to queue the skb... + * maybe we could have the network decide upon a hint if it + * should call raw_rcv for demultiplexing + */ +int rawv6_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct inet_sock *inet = inet_sk(sk); + struct raw6_sock *rp = raw6_sk(sk); + + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { + atomic_inc(&sk->sk_drops); + kfree_skb(skb); + return NET_RX_DROP; + } + + if (!rp->checksum) + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (skb->ip_summed == CHECKSUM_COMPLETE) { + skb_postpull_rcsum(skb, skb_network_header(skb), + skb_network_header_len(skb)); + if (!csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len, inet->inet_num, skb->csum)) + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + if (!skb_csum_unnecessary(skb)) + skb->csum = ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len, + inet->inet_num, 0)); + + if (inet->hdrincl) { + if (skb_checksum_complete(skb)) { + atomic_inc(&sk->sk_drops); + kfree_skb(skb); + return NET_RX_DROP; + } + } + + rawv6_rcv_skb(sk, skb); + return 0; +} + + +/* + * This should be easy, if there is something there + * we return it, otherwise we block. 
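A behavioural note that is easy to miss here: unlike IPv4 raw sockets, the data queued for an IPv6 raw socket starts at the transport payload; per RFC 3542 the application never receives the IPv6 header itself and has to ask for addressing details via ancillary data (the np->rxopt handling below) or the returned source address. Illustrative receive call from userspace (buffer size arbitrary):

        char buf[2048];
        struct sockaddr_in6 from;
        socklen_t fromlen = sizeof(from);

        ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
                             (struct sockaddr *)&from, &fromlen);
        // buf[0..n) is e.g. the bare ICMPv6 message, not the IPv6 header in front of it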
+ */ + +static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)msg->msg_name; + struct sk_buff *skb; + size_t copied; + int err; + + if (flags & MSG_OOB) + return -EOPNOTSUPP; + + if (addr_len) + *addr_len=sizeof(*sin6); + + if (flags & MSG_ERRQUEUE) + return ipv6_recv_error(sk, msg, len); + + if (np->rxpmtu && np->rxopt.bits.rxpmtu) + return ipv6_recv_rxpmtu(sk, msg, len); + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; + + copied = skb->len; + if (copied > len) { + copied = len; + msg->msg_flags |= MSG_TRUNC; + } + + if (skb_csum_unnecessary(skb)) { + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + } else if (msg->msg_flags&MSG_TRUNC) { + if (__skb_checksum_complete(skb)) + goto csum_copy_err; + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + } else { + err = skb_copy_and_csum_datagram_iovec(skb, 0, msg->msg_iov); + if (err == -EINVAL) + goto csum_copy_err; + } + if (err) + goto out_free; + + /* Copy the address. */ + if (sin6) { + sin6->sin6_family = AF_INET6; + sin6->sin6_port = 0; + ipv6_addr_copy(&sin6->sin6_addr, &ipv6_hdr(skb)->saddr); + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = 0; + if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6->sin6_scope_id = IP6CB(skb)->iif; + } + + sock_recv_ts_and_drops(msg, sk, skb); + + if (np->rxopt.all) + datagram_recv_ctl(sk, msg, skb); + + err = copied; + if (flags & MSG_TRUNC) + err = skb->len; + +out_free: + skb_free_datagram(sk, skb); +out: + return err; + +csum_copy_err: + skb_kill_datagram(sk, skb, flags); + + /* Error for blocking case is chosen to masquerade + as some normal condition. + */ + err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH; + goto out; +} + +static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, + struct raw6_sock *rp) +{ + struct sk_buff *skb; + int err = 0; + int offset; + int len; + int total_len; + __wsum tmp_csum; + __sum16 csum; + + if (!rp->checksum) + goto send; + + if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) + goto out; + + offset = rp->offset; + total_len = inet_sk(sk)->cork.base.length - (skb_network_header(skb) - + skb->data); + if (offset >= total_len - 1) { + err = -EINVAL; + ip6_flush_pending_frames(sk); + goto out; + } + + /* should be check HW csum miyazawa */ + if (skb_queue_len(&sk->sk_write_queue) == 1) { + /* + * Only one fragment on the socket. 
+ */ + tmp_csum = skb->csum; + } else { + struct sk_buff *csum_skb = NULL; + tmp_csum = 0; + + skb_queue_walk(&sk->sk_write_queue, skb) { + tmp_csum = csum_add(tmp_csum, skb->csum); + + if (csum_skb) + continue; + + len = skb->len - skb_transport_offset(skb); + if (offset >= len) { + offset -= len; + continue; + } + + csum_skb = skb; + } + + skb = csum_skb; + } + + offset += skb_transport_offset(skb); + if (skb_copy_bits(skb, offset, &csum, 2)) + BUG(); + + /* in case cksum was not initialized */ + if (unlikely(csum)) + tmp_csum = csum_sub(tmp_csum, csum_unfold(csum)); + + csum = csum_ipv6_magic(&fl6->saddr, &fl6->daddr, + total_len, fl6->flowi6_proto, tmp_csum); + + if (csum == 0 && fl6->flowi6_proto == IPPROTO_UDP) + csum = CSUM_MANGLED_0; + + if (skb_store_bits(skb, offset, &csum, 2)) + BUG(); + +send: + err = ip6_push_pending_frames(sk); +out: + return err; +} + +static int rawv6_send_hdrinc(struct sock *sk, void *from, int length, + struct flowi6 *fl6, struct dst_entry **dstp, + unsigned int flags) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6hdr *iph; + struct sk_buff *skb; + int err; + struct rt6_info *rt = (struct rt6_info *)*dstp; + + if (length > rt->dst.dev->mtu) { + ipv6_local_error(sk, EMSGSIZE, fl6, rt->dst.dev->mtu); + return -EMSGSIZE; + } + if (flags&MSG_PROBE) + goto out; + + skb = sock_alloc_send_skb(sk, + length + LL_ALLOCATED_SPACE(rt->dst.dev) + 15, + flags & MSG_DONTWAIT, &err); + if (skb == NULL) + goto error; + skb_reserve(skb, LL_RESERVED_SPACE(rt->dst.dev)); + + skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; + skb_dst_set(skb, &rt->dst); + *dstp = NULL; + + skb_put(skb, length); + skb_reset_network_header(skb); + iph = ipv6_hdr(skb); + + skb->ip_summed = CHECKSUM_NONE; + + skb->transport_header = skb->network_header; + err = memcpy_fromiovecend((void *)iph, from, 0, length); + if (err) + goto error_fault; + + IP6_UPD_PO_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, + rt->dst.dev, dst_output); + if (err > 0) + err = net_xmit_errno(err); + if (err) + goto error; +out: + return 0; + +error_fault: + err = -EFAULT; + kfree_skb(skb); +error: + IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); + if (err == -ENOBUFS && !np->recverr) + err = 0; + return err; +} + +static int rawv6_probe_proto_opt(struct flowi6 *fl6, struct msghdr *msg) +{ + struct iovec *iov; + u8 __user *type = NULL; + u8 __user *code = NULL; + u8 len = 0; + int probed = 0; + int i; + + if (!msg->msg_iov) + return 0; + + for (i = 0; i < msg->msg_iovlen; i++) { + iov = &msg->msg_iov[i]; + if (!iov) + continue; + + switch (fl6->flowi6_proto) { + case IPPROTO_ICMPV6: + /* check if one-byte field is readable or not. */ + if (iov->iov_base && iov->iov_len < 1) + break; + + if (!type) { + type = iov->iov_base; + /* check if code field is readable or not. */ + if (iov->iov_len > 1) + code = type + 1; + } else if (!code) + code = iov->iov_base; + + if (type && code) { + if (get_user(fl6->fl6_icmp_type, type) || + get_user(fl6->fl6_icmp_code, code)) + return -EFAULT; + probed = 1; + } + break; + case IPPROTO_MH: + if (iov->iov_base && iov->iov_len < 1) + break; + /* check if type field is readable or not. 
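The constant 2 here is the offset of the MH Type field inside the Mobility Header (Payload Proto and Header Len come first), and len counts how many of those leading bytes earlier iovec entries already covered, so the probe still finds the type when the header is split across iovecs; in the common single-iovec case it simply get_user()s byte 2 of the user buffer into fl6->fl6_mh_type.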
*/ + if (iov->iov_len > 2 - len) { + u8 __user *p = iov->iov_base; + if (get_user(fl6->fl6_mh_type, &p[2 - len])) + return -EFAULT; + probed = 1; + } else + len += iov->iov_len; + + break; + default: + probed = 1; + break; + } + if (probed) + break; + } + return 0; +} + +static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len) +{ + struct ipv6_txoptions opt_space; + struct sockaddr_in6 * sin6 = (struct sockaddr_in6 *) msg->msg_name; + struct in6_addr *daddr, *final_p, final; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct raw6_sock *rp = raw6_sk(sk); + struct ipv6_txoptions *opt = NULL; + struct ip6_flowlabel *flowlabel = NULL; + struct dst_entry *dst = NULL; + struct flowi6 fl6; + int addr_len = msg->msg_namelen; + int hlimit = -1; + int tclass = -1; + int dontfrag = -1; + u16 proto; + int err; + + /* Rough check on arithmetic overflow, + better check is made in ip6_append_data(). + */ + if (len > INT_MAX) + return -EMSGSIZE; + + /* Mirror BSD error message compatibility */ + if (msg->msg_flags & MSG_OOB) + return -EOPNOTSUPP; + + /* + * Get and verify the address. + */ + memset(&fl6, 0, sizeof(fl6)); + + fl6.flowi6_mark = sk->sk_mark; + + if (sin6) { + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + if (sin6->sin6_family && sin6->sin6_family != AF_INET6) + return -EAFNOSUPPORT; + + /* port is the proto value [0..255] carried in nexthdr */ + proto = ntohs(sin6->sin6_port); + + if (!proto) + proto = inet->inet_num; + else if (proto != inet->inet_num) + return -EINVAL; + + if (proto > 255) + return -EINVAL; + + daddr = &sin6->sin6_addr; + if (np->sndflow) { + fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; + if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); + if (flowlabel == NULL) + return -EINVAL; + daddr = &flowlabel->dst; + } + } + + /* + * Otherwise it will be difficult to maintain + * sk->sk_dst_cache. 
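(That is: when the socket is connected and the caller names the same destination, reuse np->daddr itself so the route cached against it stays usable.) The address handling above also restates the raw-socket convention that sin6_port is not a port at all; it must either be zero or repeat the protocol number the socket was created with. A minimal illustrative send from userspace (fd is a raw socket as in the earlier sketch, pkt/pktlen are whatever message the application built, and the address is just the documentation prefix):

        struct sockaddr_in6 dst = { .sin6_family = AF_INET6 };   // .sin6_port left at 0
        inet_pton(AF_INET6, "2001:db8::1", &dst.sin6_addr);
        sendto(fd, pkt, pktlen, 0, (struct sockaddr *)&dst, sizeof(dst));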
+ */ + if (sk->sk_state == TCP_ESTABLISHED && + ipv6_addr_equal(daddr, &np->daddr)) + daddr = &np->daddr; + + if (addr_len >= sizeof(struct sockaddr_in6) && + sin6->sin6_scope_id && + ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL) + fl6.flowi6_oif = sin6->sin6_scope_id; + } else { + if (sk->sk_state != TCP_ESTABLISHED) + return -EDESTADDRREQ; + + proto = inet->inet_num; + daddr = &np->daddr; + fl6.flowlabel = np->flow_label; + } + + if (fl6.flowi6_oif == 0) + fl6.flowi6_oif = sk->sk_bound_dev_if; + + if (msg->msg_controllen) { + opt = &opt_space; + memset(opt, 0, sizeof(struct ipv6_txoptions)); + opt->tot_len = sizeof(struct ipv6_txoptions); + + err = datagram_send_ctl(sock_net(sk), msg, &fl6, opt, &hlimit, + &tclass, &dontfrag); + if (err < 0) { + fl6_sock_release(flowlabel); + return err; + } + if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); + if (flowlabel == NULL) + return -EINVAL; + } + if (!(opt->opt_nflen|opt->opt_flen)) + opt = NULL; + } + if (opt == NULL) + opt = np->opt; + if (flowlabel) + opt = fl6_merge_options(&opt_space, flowlabel, opt); + opt = ipv6_fixup_options(&opt_space, opt); + + fl6.flowi6_proto = proto; + err = rawv6_probe_proto_opt(&fl6, msg); + if (err) + goto out; + + if (!ipv6_addr_any(daddr)) + ipv6_addr_copy(&fl6.daddr, daddr); + else + fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ + if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr)) + ipv6_addr_copy(&fl6.saddr, &np->saddr); + + final_p = fl6_update_dst(&fl6, opt, &final); + + if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) + fl6.flowi6_oif = np->mcast_oif; + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto out; + } + if (hlimit < 0) { + if (ipv6_addr_is_multicast(&fl6.daddr)) + hlimit = np->mcast_hops; + else + hlimit = np->hop_limit; + if (hlimit < 0) + hlimit = ip6_dst_hoplimit(dst); + } + + if (tclass < 0) + tclass = np->tclass; + + if (dontfrag < 0) + dontfrag = np->dontfrag; + + if (msg->msg_flags&MSG_CONFIRM) + goto do_confirm; + +back_from_confirm: + if (inet->hdrincl) + err = rawv6_send_hdrinc(sk, msg->msg_iov, len, &fl6, &dst, msg->msg_flags); + else { + lock_sock(sk); + err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov, + len, 0, hlimit, tclass, opt, &fl6, (struct rt6_info*)dst, + msg->msg_flags, dontfrag); + + if (err) + ip6_flush_pending_frames(sk); + else if (!(msg->msg_flags & MSG_MORE)) + err = rawv6_push_pending_frames(sk, &fl6, rp); + release_sock(sk); + } +done: + dst_release(dst); +out: + fl6_sock_release(flowlabel); + return err<0?err:len; +do_confirm: + dst_confirm(dst); + if (!(msg->msg_flags & MSG_PROBE) || len) + goto back_from_confirm; + err = 0; + goto done; +} + +static int rawv6_seticmpfilter(struct sock *sk, int level, int optname, + char __user *optval, int optlen) +{ + switch (optname) { + case ICMPV6_FILTER: + if (optlen > sizeof(struct icmp6_filter)) + optlen = sizeof(struct icmp6_filter); + if (copy_from_user(&raw6_sk(sk)->filter, optval, optlen)) + return -EFAULT; + return 0; + default: + return -ENOPROTOOPT; + } + + return 0; +} + +static int rawv6_geticmpfilter(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + int len; + + switch (optname) { + case ICMPV6_FILTER: + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + if (len > sizeof(struct icmp6_filter)) + len = sizeof(struct icmp6_filter); + if 
(put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &raw6_sk(sk)->filter, len)) + return -EFAULT; + return 0; + default: + return -ENOPROTOOPT; + } + + return 0; +} + + +static int do_rawv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct raw6_sock *rp = raw6_sk(sk); + int val; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + switch (optname) { + case IPV6_CHECKSUM: + if (inet_sk(sk)->inet_num == IPPROTO_ICMPV6 && + level == IPPROTO_IPV6) { + /* + * RFC3542 tells that IPV6_CHECKSUM socket + * option in the IPPROTO_IPV6 level is not + * allowed on ICMPv6 sockets. + * If you want to set it, use IPPROTO_RAW + * level IPV6_CHECKSUM socket option + * (Linux extension). + */ + return -EINVAL; + } + + /* You may get strange result with a positive odd offset; + RFC2292bis agrees with me. */ + if (val > 0 && (val&1)) + return -EINVAL; + if (val < 0) { + rp->checksum = 0; + } else { + rp->checksum = 1; + rp->offset = val; + } + + return 0; + break; + + default: + return -ENOPROTOOPT; + } +} + +static int rawv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + switch(level) { + case SOL_RAW: + break; + + case SOL_ICMPV6: + if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + return rawv6_seticmpfilter(sk, level, optname, optval, + optlen); + case SOL_IPV6: + if (optname == IPV6_CHECKSUM) + break; + default: + return ipv6_setsockopt(sk, level, optname, optval, + optlen); + } + + return do_rawv6_setsockopt(sk, level, optname, optval, optlen); +} + +#ifdef CONFIG_COMPAT +static int compat_rawv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + switch (level) { + case SOL_RAW: + break; + case SOL_ICMPV6: + if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + return rawv6_seticmpfilter(sk, level, optname, optval, optlen); + case SOL_IPV6: + if (optname == IPV6_CHECKSUM) + break; + default: + return compat_ipv6_setsockopt(sk, level, optname, + optval, optlen); + } + return do_rawv6_setsockopt(sk, level, optname, optval, optlen); +} +#endif + +static int do_rawv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct raw6_sock *rp = raw6_sk(sk); + int val, len; + + if (get_user(len,optlen)) + return -EFAULT; + + switch (optname) { + case IPV6_CHECKSUM: + /* + * We allow getsockopt() for IPPROTO_IPV6-level + * IPV6_CHECKSUM socket option on ICMPv6 sockets + * since RFC3542 is silent about it. 
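+ *
+ * Editorial aside (illustrative, not part of the original patch): from
+ * user space the current setting can be read back as sketched below; a
+ * result of -1 mirrors the "checksumming disabled" convention used just
+ * after this comment, and the helper name is hypothetical:
+ *
+ *     #include <netinet/in.h>
+ *     #include <sys/socket.h>
+ *
+ *     static int checksum_offset(int fd)
+ *     {
+ *             int val = 0;
+ *             socklen_t len = sizeof(val);
+ *
+ *             if (getsockopt(fd, IPPROTO_IPV6, IPV6_CHECKSUM, &val, &len) < 0)
+ *                     return -1;
+ *             return val;
+ *     }
+ *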
+ */ + if (rp->checksum == 0) + val = -1; + else + val = rp->offset; + break; + + default: + return -ENOPROTOOPT; + } + + len = min_t(unsigned int, sizeof(int), len); + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval,&val,len)) + return -EFAULT; + return 0; +} + +static int rawv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + switch(level) { + case SOL_RAW: + break; + + case SOL_ICMPV6: + if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + return rawv6_geticmpfilter(sk, level, optname, optval, + optlen); + case SOL_IPV6: + if (optname == IPV6_CHECKSUM) + break; + default: + return ipv6_getsockopt(sk, level, optname, optval, + optlen); + } + + return do_rawv6_getsockopt(sk, level, optname, optval, optlen); +} + +#ifdef CONFIG_COMPAT +static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + switch (level) { + case SOL_RAW: + break; + case SOL_ICMPV6: + if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + return rawv6_geticmpfilter(sk, level, optname, optval, optlen); + case SOL_IPV6: + if (optname == IPV6_CHECKSUM) + break; + default: + return compat_ipv6_getsockopt(sk, level, optname, + optval, optlen); + } + return do_rawv6_getsockopt(sk, level, optname, optval, optlen); +} +#endif + +static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + switch(cmd) { + case SIOCOUTQ: + { + int amount = sk_wmem_alloc_get(sk); + + return put_user(amount, (int __user *)arg); + } + case SIOCINQ: + { + struct sk_buff *skb; + int amount = 0; + + spin_lock_bh(&sk->sk_receive_queue.lock); + skb = skb_peek(&sk->sk_receive_queue); + if (skb != NULL) + amount = skb->tail - skb->transport_header; + spin_unlock_bh(&sk->sk_receive_queue.lock); + return put_user(amount, (int __user *)arg); + } + + default: +#ifdef CONFIG_IPV6_MROUTE + return ip6mr_ioctl(sk, cmd, (void __user *)arg); +#else + return -ENOIOCTLCMD; +#endif + } +} + +#ifdef CONFIG_COMPAT +static int compat_rawv6_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case SIOCOUTQ: + case SIOCINQ: + return -ENOIOCTLCMD; + default: +#ifdef CONFIG_IPV6_MROUTE + return ip6mr_compat_ioctl(sk, cmd, compat_ptr(arg)); +#else + return -ENOIOCTLCMD; +#endif + } +} +#endif + +static void rawv6_close(struct sock *sk, long timeout) +{ + if (inet_sk(sk)->inet_num == IPPROTO_RAW) + ip6_ra_control(sk, -1); + ip6mr_sk_done(sk); + sk_common_release(sk); +} + +static void raw6_destroy(struct sock *sk) +{ + lock_sock(sk); + ip6_flush_pending_frames(sk); + release_sock(sk); + + inet6_destroy_sock(sk); +} + +static int rawv6_init_sk(struct sock *sk) +{ + struct raw6_sock *rp = raw6_sk(sk); + + switch (inet_sk(sk)->inet_num) { + case IPPROTO_ICMPV6: + rp->checksum = 1; + rp->offset = 2; + break; + case IPPROTO_MH: + rp->checksum = 1; + rp->offset = 4; + break; + default: + break; + } + return 0; +} + +struct proto rawv6_prot = { + .name = "RAWv6", + .owner = THIS_MODULE, + .close = rawv6_close, + .destroy = raw6_destroy, + .connect = ip6_datagram_connect, + .disconnect = udp_disconnect, + .ioctl = rawv6_ioctl, + .init = rawv6_init_sk, + .setsockopt = rawv6_setsockopt, + .getsockopt = rawv6_getsockopt, + .sendmsg = rawv6_sendmsg, + .recvmsg = rawv6_recvmsg, + .bind = rawv6_bind, + .backlog_rcv = rawv6_rcv_skb, + .hash = raw_hash_sk, + .unhash = raw_unhash_sk, + .obj_size = sizeof(struct raw6_sock), + .h.raw_hash = &raw_v6_hashinfo, +#ifdef CONFIG_COMPAT + 
.compat_setsockopt = compat_rawv6_setsockopt, + .compat_getsockopt = compat_rawv6_getsockopt, + .compat_ioctl = compat_rawv6_ioctl, +#endif +}; + +#ifdef CONFIG_PROC_FS +static void raw6_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) +{ + struct ipv6_pinfo *np = inet6_sk(sp); + const struct in6_addr *dest, *src; + __u16 destp, srcp; + + dest = &np->daddr; + src = &np->rcv_saddr; + destp = 0; + srcp = inet_sk(sp)->inet_num; + seq_printf(seq, + "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n", + i, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], srcp, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], destp, + sp->sk_state, + sk_wmem_alloc_get(sp), + sk_rmem_alloc_get(sp), + 0, 0L, 0, + sock_i_uid(sp), 0, + sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); +} + +static int raw6_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, + " sl " + "local_address " + "remote_address " + "st tx_queue rx_queue tr tm->when retrnsmt" + " uid timeout inode ref pointer drops\n"); + else + raw6_sock_seq_show(seq, v, raw_seq_private(seq)->bucket); + return 0; +} + +static const struct seq_operations raw6_seq_ops = { + .start = raw_seq_start, + .next = raw_seq_next, + .stop = raw_seq_stop, + .show = raw6_seq_show, +}; + +static int raw6_seq_open(struct inode *inode, struct file *file) +{ + return raw_seq_open(inode, file, &raw_v6_hashinfo, &raw6_seq_ops); +} + +static const struct file_operations raw6_seq_fops = { + .owner = THIS_MODULE, + .open = raw6_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int __net_init raw6_init_net(struct net *net) +{ + if (!proc_net_fops_create(net, "raw6", S_IRUGO, &raw6_seq_fops)) + return -ENOMEM; + + return 0; +} + +static void __net_exit raw6_exit_net(struct net *net) +{ + proc_net_remove(net, "raw6"); +} + +static struct pernet_operations raw6_net_ops = { + .init = raw6_init_net, + .exit = raw6_exit_net, +}; + +int __init raw6_proc_init(void) +{ + return register_pernet_subsys(&raw6_net_ops); +} + +void raw6_proc_exit(void) +{ + unregister_pernet_subsys(&raw6_net_ops); +} +#endif /* CONFIG_PROC_FS */ + +/* Same as inet6_dgram_ops, sans udp_poll. 
*/ +static const struct proto_ops inet6_sockraw_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = inet6_bind, + .connect = inet_dgram_connect, /* ok */ + .socketpair = sock_no_socketpair, /* a do nothing */ + .accept = sock_no_accept, /* a do nothing */ + .getname = inet6_getname, + .poll = datagram_poll, /* ok */ + .ioctl = inet6_ioctl, /* must change */ + .listen = sock_no_listen, /* ok */ + .shutdown = inet_shutdown, /* ok */ + .setsockopt = sock_common_setsockopt, /* ok */ + .getsockopt = sock_common_getsockopt, /* ok */ + .sendmsg = inet_sendmsg, /* ok */ + .recvmsg = sock_common_recvmsg, /* ok */ + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, +#endif +}; + +static struct inet_protosw rawv6_protosw = { + .type = SOCK_RAW, + .protocol = IPPROTO_IP, /* wild card */ + .prot = &rawv6_prot, + .ops = &inet6_sockraw_ops, + .no_check = UDP_CSUM_DEFAULT, + .flags = INET_PROTOSW_REUSE, +}; + +int __init rawv6_init(void) +{ + int ret; + + ret = inet6_register_protosw(&rawv6_protosw); + if (ret) + goto out; +out: + return ret; +} + +void rawv6_exit(void) +{ + inet6_unregister_protosw(&rawv6_protosw); +} diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c new file mode 100644 index 00000000..7b954e25 --- /dev/null +++ b/net/ipv6/reassembly.c @@ -0,0 +1,769 @@ +/* + * IPv6 fragment reassembly + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * + * Based on: net/ipv4/ip_fragment.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Fixes: + * Andi Kleen Make it work with multiple hosts. + * More RFC compliance. + * + * Horst von Brand Add missing #include + * Alexey Kuznetsov SMP races, threading, cleanup. + * Patrick McHardy LRU queue of frag heads for evictor. + * Mitsuru KANDA @USAGI Register inet6_protocol{}. + * David Stevens and + * YOSHIFUJI,H. @USAGI Always remove fragment header to + * calculate ICV correctly. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +struct ip6frag_skb_cb +{ + struct inet6_skb_parm h; + int offset; +}; + +#define FRAG6_CB(skb) ((struct ip6frag_skb_cb*)((skb)->cb)) + + +/* + * Equivalent of ipv4 struct ipq + */ + +struct frag_queue +{ + struct inet_frag_queue q; + + __be32 id; /* fragment id */ + u32 user; + struct in6_addr saddr; + struct in6_addr daddr; + + int iif; + unsigned int csum; + __u16 nhoffset; +}; + +static struct inet_frags ip6_frags; + +int ip6_frag_nqueues(struct net *net) +{ + return net->ipv6.frags.nqueues; +} + +int ip6_frag_mem(struct net *net) +{ + return atomic_read(&net->ipv6.frags.mem); +} + +static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, + struct net_device *dev); + +/* + * callers should be careful not to use the hash value outside the ipfrag_lock + * as doing so could race with ipfrag_hash_rnd being recalculated. 
+ */ +unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr, + const struct in6_addr *daddr, u32 rnd) +{ + u32 c; + + c = jhash_3words((__force u32)saddr->s6_addr32[0], + (__force u32)saddr->s6_addr32[1], + (__force u32)saddr->s6_addr32[2], + rnd); + + c = jhash_3words((__force u32)saddr->s6_addr32[3], + (__force u32)daddr->s6_addr32[0], + (__force u32)daddr->s6_addr32[1], + c); + + c = jhash_3words((__force u32)daddr->s6_addr32[2], + (__force u32)daddr->s6_addr32[3], + (__force u32)id, + c); + + return c & (INETFRAGS_HASHSZ - 1); +} +EXPORT_SYMBOL_GPL(inet6_hash_frag); + +static unsigned int ip6_hashfn(struct inet_frag_queue *q) +{ + struct frag_queue *fq; + + fq = container_of(q, struct frag_queue, q); + return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr, ip6_frags.rnd); +} + +int ip6_frag_match(struct inet_frag_queue *q, void *a) +{ + struct frag_queue *fq; + struct ip6_create_arg *arg = a; + + fq = container_of(q, struct frag_queue, q); + return (fq->id == arg->id && fq->user == arg->user && + ipv6_addr_equal(&fq->saddr, arg->src) && + ipv6_addr_equal(&fq->daddr, arg->dst)); +} +EXPORT_SYMBOL(ip6_frag_match); + +void ip6_frag_init(struct inet_frag_queue *q, void *a) +{ + struct frag_queue *fq = container_of(q, struct frag_queue, q); + struct ip6_create_arg *arg = a; + + fq->id = arg->id; + fq->user = arg->user; + ipv6_addr_copy(&fq->saddr, arg->src); + ipv6_addr_copy(&fq->daddr, arg->dst); +} +EXPORT_SYMBOL(ip6_frag_init); + +/* Destruction primitives. */ + +static __inline__ void fq_put(struct frag_queue *fq) +{ + inet_frag_put(&fq->q, &ip6_frags); +} + +/* Kill fq entry. It is not destroyed immediately, + * because caller (and someone more) holds reference count. + */ +static __inline__ void fq_kill(struct frag_queue *fq) +{ + inet_frag_kill(&fq->q, &ip6_frags); +} + +static void ip6_evictor(struct net *net, struct inet6_dev *idev) +{ + int evicted; + + evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags); + if (evicted) + IP6_ADD_STATS_BH(net, idev, IPSTATS_MIB_REASMFAILS, evicted); +} + +static void ip6_frag_expire(unsigned long data) +{ + struct frag_queue *fq; + struct net_device *dev = NULL; + struct net *net; + + fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + + spin_lock(&fq->q.lock); + + if (fq->q.last_in & INET_FRAG_COMPLETE) + goto out; + + fq_kill(fq); + + net = container_of(fq->q.net, struct net, ipv6.frags); + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, fq->iif); + if (!dev) + goto out_rcu_unlock; + + IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); + IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); + + /* Don't send error if the first segment did not arrive. */ + if (!(fq->q.last_in & INET_FRAG_FIRST_IN) || !fq->q.fragments) + goto out_rcu_unlock; + + /* + But use as source device on which LAST ARRIVED + segment was received. And do not use fq->dev + pointer directly, device might already disappeared. 
+ */ + fq->q.fragments->dev = dev; + icmpv6_send(fq->q.fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); +out_rcu_unlock: + rcu_read_unlock(); +out: + spin_unlock(&fq->q.lock); + fq_put(fq); +} + +static __inline__ struct frag_queue * +fq_find(struct net *net, __be32 id, const struct in6_addr *src, const struct in6_addr *dst) +{ + struct inet_frag_queue *q; + struct ip6_create_arg arg; + unsigned int hash; + + arg.id = id; + arg.user = IP6_DEFRAG_LOCAL_DELIVER; + arg.src = src; + arg.dst = dst; + + read_lock(&ip6_frags.lock); + hash = inet6_hash_frag(id, src, dst, ip6_frags.rnd); + + q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash); + if (q == NULL) + return NULL; + + return container_of(q, struct frag_queue, q); +} + +static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, + struct frag_hdr *fhdr, int nhoff) +{ + struct sk_buff *prev, *next; + struct net_device *dev; + int offset, end; + struct net *net = dev_net(skb_dst(skb)->dev); + + if (fq->q.last_in & INET_FRAG_COMPLETE) + goto err; + + offset = ntohs(fhdr->frag_off) & ~0x7; + end = offset + (ntohs(ipv6_hdr(skb)->payload_len) - + ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); + + if ((unsigned int)end > IPV6_MAXPLEN) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, + ((u8 *)&fhdr->frag_off - + skb_network_header(skb))); + return -1; + } + + if (skb->ip_summed == CHECKSUM_COMPLETE) { + const unsigned char *nh = skb_network_header(skb); + skb->csum = csum_sub(skb->csum, + csum_partial(nh, (u8 *)(fhdr + 1) - nh, + 0)); + } + + /* Is this the final fragment? */ + if (!(fhdr->frag_off & htons(IP6_MF))) { + /* If we already have some bits beyond end + * or have different end, the segment is corrupted. + */ + if (end < fq->q.len || + ((fq->q.last_in & INET_FRAG_LAST_IN) && end != fq->q.len)) + goto err; + fq->q.last_in |= INET_FRAG_LAST_IN; + fq->q.len = end; + } else { + /* Check if the fragment is rounded to 8 bytes. + * Required by the RFC. + */ + if (end & 0x7) { + /* RFC2460 says always send parameter problem in + * this case. -DaveM + */ + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, + offsetof(struct ipv6hdr, payload_len)); + return -1; + } + if (end > fq->q.len) { + /* Some bits beyond end -> corruption. */ + if (fq->q.last_in & INET_FRAG_LAST_IN) + goto err; + fq->q.len = end; + } + } + + if (end == offset) + goto err; + + /* Point into the IP datagram 'data' part. */ + if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) + goto err; + + if (pskb_trim_rcsum(skb, end - offset)) + goto err; + + /* Find out which fragments are in front and at the back of us + * in the chain of fragments so far. We must know where to put + * this fragment, right? + */ + prev = fq->q.fragments_tail; + if (!prev || FRAG6_CB(prev)->offset < offset) { + next = NULL; + goto found; + } + prev = NULL; + for(next = fq->q.fragments; next != NULL; next = next->next) { + if (FRAG6_CB(next)->offset >= offset) + break; /* bingo! */ + prev = next; + } + +found: + /* RFC5722, Section 4: + * When reassembling an IPv6 datagram, if + * one or more its constituent fragments is determined to be an + * overlapping fragment, the entire datagram (and any constituent + * fragments, including those not yet received) MUST be silently + * discarded. + */ + + /* Check for overlap with preceding fragment. 
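+ *
+ * Editorial aside: the two overlap tests below, rewritten as a
+ * stand-alone predicate purely for illustration (hypothetical helper,
+ * not part of this patch). Here prev_end stands for
+ * FRAG6_CB(prev)->offset + prev->len, or 0 when there is no preceding
+ * fragment, and next_offset for FRAG6_CB(next)->offset, or -1 when
+ * there is no succeeding fragment:
+ *
+ *     #include <stdbool.h>
+ *
+ *     static bool frag6_overlaps(int prev_end, int offset, int end,
+ *                                int next_offset)
+ *     {
+ *             if (prev_end > offset)
+ *                     return true;
+ *             if (next_offset >= 0 && next_offset < end)
+ *                     return true;
+ *             return false;
+ *     }
+ *
+ * Any overlap sends the queue to discard_fq, dropping the whole
+ * datagram as RFC 5722 requires.
+ *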
+ */
+	if (prev &&
+	    (FRAG6_CB(prev)->offset + prev->len) > offset)
+		goto discard_fq;
+
+	/* Look for overlap with succeeding segment. */
+	if (next && FRAG6_CB(next)->offset < end)
+		goto discard_fq;
+
+	FRAG6_CB(skb)->offset = offset;
+
+	/* Insert this fragment in the chain of fragments. */
+	skb->next = next;
+	if (!next)
+		fq->q.fragments_tail = skb;
+	if (prev)
+		prev->next = skb;
+	else
+		fq->q.fragments = skb;
+
+	dev = skb->dev;
+	if (dev) {
+		fq->iif = dev->ifindex;
+		skb->dev = NULL;
+	}
+	fq->q.stamp = skb->tstamp;
+	fq->q.meat += skb->len;
+	atomic_add(skb->truesize, &fq->q.net->mem);
+
+	/* The first fragment.
+	 * nhoffset is obtained from the first fragment, of course.
+	 */
+	if (offset == 0) {
+		fq->nhoffset = nhoff;
+		fq->q.last_in |= INET_FRAG_FIRST_IN;
+	}
+
+	if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+	    fq->q.meat == fq->q.len)
+		return ip6_frag_reasm(fq, prev, dev);
+
+	write_lock(&ip6_frags.lock);
+	list_move_tail(&fq->q.lru_list, &fq->q.net->lru_list);
+	write_unlock(&ip6_frags.lock);
+	return -1;
+
+discard_fq:
+	fq_kill(fq);
+err:
+	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+		      IPSTATS_MIB_REASMFAILS);
+	kfree_skb(skb);
+	return -1;
+}
+
+/*
+ *	Check if this packet is complete.
+ *	Returns NULL on failure by any reason, and pointer
+ *	to current nexthdr field in reassembled frame.
+ *
+ *	It is called with locked fq, and caller must check that
+ *	queue is eligible for reassembly i.e. it is not COMPLETE,
+ *	the last and the first frames arrived and all the bits are here.
+ */
+static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
+			  struct net_device *dev)
+{
+	struct net *net = container_of(fq->q.net, struct net, ipv6.frags);
+	struct sk_buff *fp, *head = fq->q.fragments;
+	int payload_len;
+	unsigned int nhoff;
+
+	fq_kill(fq);
+
+	/* Make the one we just received the head. */
+	if (prev) {
+		head = prev->next;
+		fp = skb_clone(head, GFP_ATOMIC);
+
+		if (!fp)
+			goto out_oom;
+
+		fp->next = head->next;
+		if (!fp->next)
+			fq->q.fragments_tail = fp;
+		prev->next = fp;
+
+		skb_morph(head, fq->q.fragments);
+		head->next = fq->q.fragments->next;
+
+		kfree_skb(fq->q.fragments);
+		fq->q.fragments = head;
+	}
+
+	WARN_ON(head == NULL);
+	WARN_ON(FRAG6_CB(head)->offset != 0);
+
+	/* Unfragmented part is taken from the first segment. */
+	payload_len = ((head->data - skb_network_header(head)) -
+		       sizeof(struct ipv6hdr) + fq->q.len -
+		       sizeof(struct frag_hdr));
+	if (payload_len > IPV6_MAXPLEN)
+		goto out_oversize;
+
+	/* Head of list must not be cloned. */
+	if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
+		goto out_oom;
+
+	/* If the first fragment is fragmented itself, we split
+	 * it to two chunks: the first with data and paged part
+	 * and the second, holding only fragments. */
+	if (skb_has_frag_list(head)) {
+		struct sk_buff *clone;
+		int i, plen = 0;
+
+		if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL)
+			goto out_oom;
+		clone->next = head->next;
+		head->next = clone;
+		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
+		skb_frag_list_init(head);
+		for (i=0; i<skb_shinfo(head)->nr_frags; i++)
+			plen += skb_shinfo(head)->frags[i].size;
+		clone->len = clone->data_len = head->data_len - plen;
+		head->data_len -= clone->len;
+		head->len -= clone->len;
+		clone->csum = 0;
+		clone->ip_summed = head->ip_summed;
+		atomic_add(clone->truesize, &fq->q.net->mem);
+	}
+
+	/* We have to remove fragment header from datagram and to relocate
+	 * header in order to calculate ICV correctly.
*/ + nhoff = fq->nhoffset; + skb_network_header(head)[nhoff] = skb_transport_header(head)[0]; + memmove(head->head + sizeof(struct frag_hdr), head->head, + (head->data - head->head) - sizeof(struct frag_hdr)); + head->mac_header += sizeof(struct frag_hdr); + head->network_header += sizeof(struct frag_hdr); + + skb_shinfo(head)->frag_list = head->next; + skb_reset_transport_header(head); + skb_push(head, head->data - skb_network_header(head)); + + for (fp=head->next; fp; fp = fp->next) { + head->data_len += fp->len; + head->len += fp->len; + if (head->ip_summed != fp->ip_summed) + head->ip_summed = CHECKSUM_NONE; + else if (head->ip_summed == CHECKSUM_COMPLETE) + head->csum = csum_add(head->csum, fp->csum); + head->truesize += fp->truesize; + } + atomic_sub(head->truesize, &fq->q.net->mem); + + head->next = NULL; + head->dev = dev; + head->tstamp = fq->q.stamp; + ipv6_hdr(head)->payload_len = htons(payload_len); + IP6CB(head)->nhoff = nhoff; + + /* Yes, and fold redundant checksum back. 8) */ + if (head->ip_summed == CHECKSUM_COMPLETE) + head->csum = csum_partial(skb_network_header(head), + skb_network_header_len(head), + head->csum); + + rcu_read_lock(); + IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); + rcu_read_unlock(); + fq->q.fragments = NULL; + fq->q.fragments_tail = NULL; + return 1; + +out_oversize: + if (net_ratelimit()) + printk(KERN_DEBUG "ip6_frag_reasm: payload len = %d\n", payload_len); + goto out_fail; +out_oom: + if (net_ratelimit()) + printk(KERN_DEBUG "ip6_frag_reasm: no memory for reassembly\n"); +out_fail: + rcu_read_lock(); + IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); + rcu_read_unlock(); + return -1; +} + +static int ipv6_frag_rcv(struct sk_buff *skb) +{ + struct frag_hdr *fhdr; + struct frag_queue *fq; + const struct ipv6hdr *hdr = ipv6_hdr(skb); + struct net *net = dev_net(skb_dst(skb)->dev); + + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS); + + /* Jumbo payload inhibits frag. 
header */ + if (hdr->payload_len==0) + goto fail_hdr; + + if (!pskb_may_pull(skb, (skb_transport_offset(skb) + + sizeof(struct frag_hdr)))) + goto fail_hdr; + + hdr = ipv6_hdr(skb); + fhdr = (struct frag_hdr *)skb_transport_header(skb); + + if (!(fhdr->frag_off & htons(0xFFF9))) { + /* It is not a fragmented frame */ + skb->transport_header += sizeof(struct frag_hdr); + IP6_INC_STATS_BH(net, + ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMOKS); + + IP6CB(skb)->nhoff = (u8 *)fhdr - skb_network_header(skb); + return 1; + } + + if (atomic_read(&net->ipv6.frags.mem) > net->ipv6.frags.high_thresh) + ip6_evictor(net, ip6_dst_idev(skb_dst(skb))); + + fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr); + if (fq != NULL) { + int ret; + + spin_lock(&fq->q.lock); + + ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff); + + spin_unlock(&fq->q.lock); + fq_put(fq); + return ret; + } + + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); + kfree_skb(skb); + return -1; + +fail_hdr: + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb)); + return -1; +} + +static const struct inet6_protocol frag_protocol = +{ + .handler = ipv6_frag_rcv, + .flags = INET6_PROTO_NOPOLICY, +}; + +#ifdef CONFIG_SYSCTL +static struct ctl_table ip6_frags_ns_ctl_table[] = { + { + .procname = "ip6frag_high_thresh", + .data = &init_net.ipv6.frags.high_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ip6frag_low_thresh", + .data = &init_net.ipv6.frags.low_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ip6frag_time", + .data = &init_net.ipv6.frags.timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; + +static struct ctl_table ip6_frags_ctl_table[] = { + { + .procname = "ip6frag_secret_interval", + .data = &ip6_frags.secret_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; + +static int __net_init ip6_frags_ns_sysctl_register(struct net *net) +{ + struct ctl_table *table; + struct ctl_table_header *hdr; + + table = ip6_frags_ns_ctl_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(table, sizeof(ip6_frags_ns_ctl_table), GFP_KERNEL); + if (table == NULL) + goto err_alloc; + + table[0].data = &net->ipv6.frags.high_thresh; + table[1].data = &net->ipv6.frags.low_thresh; + table[2].data = &net->ipv6.frags.timeout; + } + + hdr = register_net_sysctl_table(net, net_ipv6_ctl_path, table); + if (hdr == NULL) + goto err_reg; + + net->ipv6.sysctl.frags_hdr = hdr; + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void __net_exit ip6_frags_ns_sysctl_unregister(struct net *net) +{ + struct ctl_table *table; + + table = net->ipv6.sysctl.frags_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->ipv6.sysctl.frags_hdr); + if (!net_eq(net, &init_net)) + kfree(table); +} + +static struct ctl_table_header *ip6_ctl_header; + +static int ip6_frags_sysctl_register(void) +{ + ip6_ctl_header = register_net_sysctl_rotable(net_ipv6_ctl_path, + ip6_frags_ctl_table); + return ip6_ctl_header == NULL ? 
-ENOMEM : 0; +} + +static void ip6_frags_sysctl_unregister(void) +{ + unregister_net_sysctl_table(ip6_ctl_header); +} +#else +static inline int ip6_frags_ns_sysctl_register(struct net *net) +{ + return 0; +} + +static inline void ip6_frags_ns_sysctl_unregister(struct net *net) +{ +} + +static inline int ip6_frags_sysctl_register(void) +{ + return 0; +} + +static inline void ip6_frags_sysctl_unregister(void) +{ +} +#endif + +static int __net_init ipv6_frags_init_net(struct net *net) +{ + net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; + net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; + net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; + + inet_frags_init_net(&net->ipv6.frags); + + return ip6_frags_ns_sysctl_register(net); +} + +static void __net_exit ipv6_frags_exit_net(struct net *net) +{ + ip6_frags_ns_sysctl_unregister(net); + inet_frags_exit_net(&net->ipv6.frags, &ip6_frags); +} + +static struct pernet_operations ip6_frags_ops = { + .init = ipv6_frags_init_net, + .exit = ipv6_frags_exit_net, +}; + +int __init ipv6_frag_init(void) +{ + int ret; + + ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT); + if (ret) + goto out; + + ret = ip6_frags_sysctl_register(); + if (ret) + goto err_sysctl; + + ret = register_pernet_subsys(&ip6_frags_ops); + if (ret) + goto err_pernet; + + ip6_frags.hashfn = ip6_hashfn; + ip6_frags.constructor = ip6_frag_init; + ip6_frags.destructor = NULL; + ip6_frags.skb_free = NULL; + ip6_frags.qsize = sizeof(struct frag_queue); + ip6_frags.match = ip6_frag_match; + ip6_frags.frag_expire = ip6_frag_expire; + ip6_frags.secret_interval = 10 * 60 * HZ; + inet_frags_init(&ip6_frags); +out: + return ret; + +err_pernet: + ip6_frags_sysctl_unregister(); +err_sysctl: + inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); + goto out; +} + +void ipv6_frag_exit(void) +{ + inet_frags_fini(&ip6_frags); + ip6_frags_sysctl_unregister(); + unregister_pernet_subsys(&ip6_frags_ops); + inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); +} diff --git a/net/ipv6/route.c b/net/ipv6/route.c new file mode 100644 index 00000000..8e600f82 --- /dev/null +++ b/net/ipv6/route.c @@ -0,0 +1,2976 @@ +/* + * Linux INET6 implementation + * FIB front-end. + * + * Authors: + * Pedro Roque + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* Changes: + * + * YOSHIFUJI Hideaki @USAGI + * reworked default router selection. + * - respect outgoing interface + * - select from (probably) reachable routers (i.e. + * routers in REACHABLE, STALE, DELAY or PROBE states). + * - always select the same router if it is (probably) + * reachable. otherwise, round-robin the list. + * Ville Nuorvala + * Fixed routing subtrees. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_SYSCTL +#include +#endif + +/* Set to 3 to get tracing. */ +#define RT6_DEBUG 2 + +#if RT6_DEBUG >= 3 +#define RDBG(x) printk x +#define RT6_TRACE(x...) printk(KERN_DEBUG x) +#else +#define RDBG(x) +#define RT6_TRACE(x...) 
do { ; } while (0) +#endif + +static struct rt6_info * ip6_rt_copy(struct rt6_info *ort); +static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); +static unsigned int ip6_default_advmss(const struct dst_entry *dst); +static unsigned int ip6_default_mtu(const struct dst_entry *dst); +static struct dst_entry *ip6_negative_advice(struct dst_entry *); +static void ip6_dst_destroy(struct dst_entry *); +static void ip6_dst_ifdown(struct dst_entry *, + struct net_device *dev, int how); +static int ip6_dst_gc(struct dst_ops *ops); + +static int ip6_pkt_discard(struct sk_buff *skb); +static int ip6_pkt_discard_out(struct sk_buff *skb); +static void ip6_link_failure(struct sk_buff *skb); +static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu); + +#ifdef CONFIG_IPV6_ROUTE_INFO +static struct rt6_info *rt6_add_route_info(struct net *net, + const struct in6_addr *prefix, int prefixlen, + const struct in6_addr *gwaddr, int ifindex, + unsigned pref); +static struct rt6_info *rt6_get_route_info(struct net *net, + const struct in6_addr *prefix, int prefixlen, + const struct in6_addr *gwaddr, int ifindex); +#endif + +static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) +{ + struct rt6_info *rt = (struct rt6_info *) dst; + struct inet_peer *peer; + u32 *p = NULL; + + if (!rt->rt6i_peer) + rt6_bind_peer(rt, 1); + + peer = rt->rt6i_peer; + if (peer) { + u32 *old_p = __DST_METRICS_PTR(old); + unsigned long prev, new; + + p = peer->metrics; + if (inet_metrics_new(peer)) + memcpy(p, old_p, sizeof(u32) * RTAX_MAX); + + new = (unsigned long) p; + prev = cmpxchg(&dst->_metrics, old, new); + + if (prev != old) { + p = __DST_METRICS_PTR(prev); + if (prev & DST_METRICS_READ_ONLY) + p = NULL; + } + } + return p; +} + +static struct dst_ops ip6_dst_ops_template = { + .family = AF_INET6, + .protocol = cpu_to_be16(ETH_P_IPV6), + .gc = ip6_dst_gc, + .gc_thresh = 1024, + .check = ip6_dst_check, + .default_advmss = ip6_default_advmss, + .default_mtu = ip6_default_mtu, + .cow_metrics = ipv6_cow_metrics, + .destroy = ip6_dst_destroy, + .ifdown = ip6_dst_ifdown, + .negative_advice = ip6_negative_advice, + .link_failure = ip6_link_failure, + .update_pmtu = ip6_rt_update_pmtu, + .local_out = __ip6_local_out, +}; + +static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst) +{ + return 0; +} + +static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) +{ +} + +static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst, + unsigned long old) +{ + return NULL; +} + +static struct dst_ops ip6_dst_blackhole_ops = { + .family = AF_INET6, + .protocol = cpu_to_be16(ETH_P_IPV6), + .destroy = ip6_dst_destroy, + .check = ip6_dst_check, + .default_mtu = ip6_blackhole_default_mtu, + .default_advmss = ip6_default_advmss, + .update_pmtu = ip6_rt_blackhole_update_pmtu, + .cow_metrics = ip6_rt_blackhole_cow_metrics, +}; + +static const u32 ip6_template_metrics[RTAX_MAX] = { + [RTAX_HOPLIMIT - 1] = 255, +}; + +static struct rt6_info ip6_null_entry_template = { + .dst = { + .__refcnt = ATOMIC_INIT(1), + .__use = 1, + .obsolete = -1, + .error = -ENETUNREACH, + .input = ip6_pkt_discard, + .output = ip6_pkt_discard_out, + }, + .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), + .rt6i_protocol = RTPROT_KERNEL, + .rt6i_metric = ~(u32) 0, + .rt6i_ref = ATOMIC_INIT(1), +}; + +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + +static int ip6_pkt_prohibit(struct sk_buff *skb); +static int ip6_pkt_prohibit_out(struct sk_buff *skb); + +static struct rt6_info ip6_prohibit_entry_template = { + .dst = { 
+ .__refcnt = ATOMIC_INIT(1), + .__use = 1, + .obsolete = -1, + .error = -EACCES, + .input = ip6_pkt_prohibit, + .output = ip6_pkt_prohibit_out, + }, + .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), + .rt6i_protocol = RTPROT_KERNEL, + .rt6i_metric = ~(u32) 0, + .rt6i_ref = ATOMIC_INIT(1), +}; + +static struct rt6_info ip6_blk_hole_entry_template = { + .dst = { + .__refcnt = ATOMIC_INIT(1), + .__use = 1, + .obsolete = -1, + .error = -EINVAL, + .input = dst_discard, + .output = dst_discard, + }, + .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), + .rt6i_protocol = RTPROT_KERNEL, + .rt6i_metric = ~(u32) 0, + .rt6i_ref = ATOMIC_INIT(1), +}; + +#endif + +/* allocate dst with ip6_dst_ops */ +static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops, + struct net_device *dev, + int flags) +{ + struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags); + + memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry)); + + return rt; +} + +static void ip6_dst_destroy(struct dst_entry *dst) +{ + struct rt6_info *rt = (struct rt6_info *)dst; + struct inet6_dev *idev = rt->rt6i_idev; + struct inet_peer *peer = rt->rt6i_peer; + + if (idev != NULL) { + rt->rt6i_idev = NULL; + in6_dev_put(idev); + } + if (peer) { + rt->rt6i_peer = NULL; + inet_putpeer(peer); + } +} + +static atomic_t __rt6_peer_genid = ATOMIC_INIT(0); + +static u32 rt6_peer_genid(void) +{ + return atomic_read(&__rt6_peer_genid); +} + +void rt6_bind_peer(struct rt6_info *rt, int create) +{ + struct inet_peer *peer; + + peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create); + if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL) + inet_putpeer(peer); + else + rt->rt6i_peer_genid = rt6_peer_genid(); +} + +static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, + int how) +{ + struct rt6_info *rt = (struct rt6_info *)dst; + struct inet6_dev *idev = rt->rt6i_idev; + struct net_device *loopback_dev = + dev_net(dev)->loopback_dev; + + if (dev != loopback_dev && idev != NULL && idev->dev == dev) { + struct inet6_dev *loopback_idev = + in6_dev_get(loopback_dev); + if (loopback_idev != NULL) { + rt->rt6i_idev = loopback_idev; + in6_dev_put(idev); + } + } +} + +static __inline__ int rt6_check_expired(const struct rt6_info *rt) +{ + return (rt->rt6i_flags & RTF_EXPIRES) && + time_after(jiffies, rt->rt6i_expires); +} + +static inline int rt6_need_strict(const struct in6_addr *daddr) +{ + return ipv6_addr_type(daddr) & + (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); +} + +/* + * Route lookup. Any table->tb6_lock is implied. 
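+ *
+ * Editorial aside: a hedged sketch (not from the original source) of how
+ * other in-kernel code might use the rt6_lookup() helper exported
+ * further down; the function name is hypothetical and error handling is
+ * deliberately minimal:
+ *
+ *     #include <net/dst.h>
+ *     #include <net/ip6_route.h>
+ *
+ *     static int sketch_route_mtu(struct net *net,
+ *                                 const struct in6_addr *daddr)
+ *     {
+ *             struct rt6_info *rt = rt6_lookup(net, daddr, NULL, 0, 0);
+ *             int mtu = -1;
+ *
+ *             if (rt) {
+ *                     mtu = dst_mtu(&rt->dst);
+ *                     dst_release(&rt->dst);
+ *             }
+ *             return mtu;
+ *     }
+ *
+ * rt6_lookup() hands back its result with a reference held (via
+ * dst_use() in ip6_pol_route_lookup()), so the caller must balance it
+ * with dst_release().
+ *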
+ */ + +static inline struct rt6_info *rt6_device_match(struct net *net, + struct rt6_info *rt, + const struct in6_addr *saddr, + int oif, + int flags) +{ + struct rt6_info *local = NULL; + struct rt6_info *sprt; + + if (!oif && ipv6_addr_any(saddr)) + goto out; + + for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) { + struct net_device *dev = sprt->rt6i_dev; + + if (oif) { + if (dev->ifindex == oif) + return sprt; + if (dev->flags & IFF_LOOPBACK) { + if (sprt->rt6i_idev == NULL || + sprt->rt6i_idev->dev->ifindex != oif) { + if (flags & RT6_LOOKUP_F_IFACE && oif) + continue; + if (local && (!oif || + local->rt6i_idev->dev->ifindex == oif)) + continue; + } + local = sprt; + } + } else { + if (ipv6_chk_addr(net, saddr, dev, + flags & RT6_LOOKUP_F_IFACE)) + return sprt; + } + } + + if (oif) { + if (local) + return local; + + if (flags & RT6_LOOKUP_F_IFACE) + return net->ipv6.ip6_null_entry; + } +out: + return rt; +} + +#ifdef CONFIG_IPV6_ROUTER_PREF +static void rt6_probe(struct rt6_info *rt) +{ + struct neighbour *neigh; + /* + * Okay, this does not seem to be appropriate + * for now, however, we need to check if it + * is really so; aka Router Reachability Probing. + * + * Router Reachability Probe MUST be rate-limited + * to no more than one per minute. + */ + rcu_read_lock(); + neigh = rt ? dst_get_neighbour(&rt->dst) : NULL; + if (!neigh || (neigh->nud_state & NUD_VALID)) + goto out; + read_lock_bh(&neigh->lock); + if (!(neigh->nud_state & NUD_VALID) && + time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) { + struct in6_addr mcaddr; + struct in6_addr *target; + + neigh->updated = jiffies; + read_unlock_bh(&neigh->lock); + + target = (struct in6_addr *)&neigh->primary_key; + addrconf_addr_solict_mult(target, &mcaddr); + ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL); + } else { + read_unlock_bh(&neigh->lock); + } +out: + rcu_read_unlock(); +} +#else +static inline void rt6_probe(struct rt6_info *rt) +{ +} +#endif + +/* + * Default Router Selection (RFC 2461 6.3.6) + */ +static inline int rt6_check_dev(struct rt6_info *rt, int oif) +{ + struct net_device *dev = rt->rt6i_dev; + if (!oif || dev->ifindex == oif) + return 2; + if ((dev->flags & IFF_LOOPBACK) && + rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif) + return 1; + return 0; +} + +static inline int rt6_check_neigh(struct rt6_info *rt) +{ + struct neighbour *neigh; + int m; + + rcu_read_lock(); + neigh = dst_get_neighbour(&rt->dst); + if (rt->rt6i_flags & RTF_NONEXTHOP || + !(rt->rt6i_flags & RTF_GATEWAY)) + m = 1; + else if (neigh) { + read_lock_bh(&neigh->lock); + if (neigh->nud_state & NUD_VALID) + m = 2; +#ifdef CONFIG_IPV6_ROUTER_PREF + else if (neigh->nud_state & NUD_FAILED) + m = 0; +#endif + else + m = 1; + read_unlock_bh(&neigh->lock); + } else + m = 0; + rcu_read_unlock(); + return m; +} + +static int rt6_score_route(struct rt6_info *rt, int oif, + int strict) +{ + int m, n; + + m = rt6_check_dev(rt, oif); + if (!m && (strict & RT6_LOOKUP_F_IFACE)) + return -1; +#ifdef CONFIG_IPV6_ROUTER_PREF + m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2; +#endif + n = rt6_check_neigh(rt); + if (!n && (strict & RT6_LOOKUP_F_REACHABLE)) + return -1; + return m; +} + +static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, + int *mpri, struct rt6_info *match) +{ + int m; + + if (rt6_check_expired(rt)) + goto out; + + m = rt6_score_route(rt, oif, strict); + if (m < 0) + goto out; + + if (m > *mpri) { + if (strict & RT6_LOOKUP_F_REACHABLE) + rt6_probe(match); + *mpri = 
m; + match = rt; + } else if (strict & RT6_LOOKUP_F_REACHABLE) { + rt6_probe(rt); + } + +out: + return match; +} + +static struct rt6_info *find_rr_leaf(struct fib6_node *fn, + struct rt6_info *rr_head, + u32 metric, int oif, int strict) +{ + struct rt6_info *rt, *match; + int mpri = -1; + + match = NULL; + for (rt = rr_head; rt && rt->rt6i_metric == metric; + rt = rt->dst.rt6_next) + match = find_match(rt, oif, strict, &mpri, match); + for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric; + rt = rt->dst.rt6_next) + match = find_match(rt, oif, strict, &mpri, match); + + return match; +} + +static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict) +{ + struct rt6_info *match, *rt0; + struct net *net; + + RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n", + __func__, fn->leaf, oif); + + rt0 = fn->rr_ptr; + if (!rt0) + fn->rr_ptr = rt0 = fn->leaf; + + match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict); + + if (!match && + (strict & RT6_LOOKUP_F_REACHABLE)) { + struct rt6_info *next = rt0->dst.rt6_next; + + /* no entries matched; do round-robin */ + if (!next || next->rt6i_metric != rt0->rt6i_metric) + next = fn->leaf; + + if (next != rt0) + fn->rr_ptr = next; + } + + RT6_TRACE("%s() => %p\n", + __func__, match); + + net = dev_net(rt0->rt6i_dev); + return match ? match : net->ipv6.ip6_null_entry; +} + +#ifdef CONFIG_IPV6_ROUTE_INFO +int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, + const struct in6_addr *gwaddr) +{ + struct net *net = dev_net(dev); + struct route_info *rinfo = (struct route_info *) opt; + struct in6_addr prefix_buf, *prefix; + unsigned int pref; + unsigned long lifetime; + struct rt6_info *rt; + + if (len < sizeof(struct route_info)) { + return -EINVAL; + } + + /* Sanity check for prefix_len and length */ + if (rinfo->length > 3) { + return -EINVAL; + } else if (rinfo->prefix_len > 128) { + return -EINVAL; + } else if (rinfo->prefix_len > 64) { + if (rinfo->length < 2) { + return -EINVAL; + } + } else if (rinfo->prefix_len > 0) { + if (rinfo->length < 1) { + return -EINVAL; + } + } + + pref = rinfo->route_pref; + if (pref == ICMPV6_ROUTER_PREF_INVALID) + return -EINVAL; + + lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ); + + if (rinfo->length == 3) + prefix = (struct in6_addr *)rinfo->prefix; + else { + /* this function is safe */ + ipv6_addr_prefix(&prefix_buf, + (struct in6_addr *)rinfo->prefix, + rinfo->prefix_len); + prefix = &prefix_buf; + } + + rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr, + dev->ifindex); + + if (rt && !lifetime) { + ip6_del_rt(rt); + rt = NULL; + } + + if (!rt && lifetime) + rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex, + pref); + else if (rt) + rt->rt6i_flags = RTF_ROUTEINFO | + (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); + + if (rt) { + if (!addrconf_finite_timeout(lifetime)) { + rt->rt6i_flags &= ~RTF_EXPIRES; + } else { + rt->rt6i_expires = jiffies + HZ * lifetime; + rt->rt6i_flags |= RTF_EXPIRES; + } + dst_release(&rt->dst); + } + return 0; +} +#endif + +#define BACKTRACK(__net, saddr) \ +do { \ + if (rt == __net->ipv6.ip6_null_entry) { \ + struct fib6_node *pn; \ + while (1) { \ + if (fn->fn_flags & RTN_TL_ROOT) \ + goto out; \ + pn = fn->parent; \ + if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \ + fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \ + else \ + fn = pn; \ + if (fn->fn_flags & RTN_RTINFO) \ + goto restart; \ + } \ + } \ +} while(0) + +static struct rt6_info *ip6_pol_route_lookup(struct net *net, + struct 
fib6_table *table, + struct flowi6 *fl6, int flags) +{ + struct fib6_node *fn; + struct rt6_info *rt; + + read_lock_bh(&table->tb6_lock); + fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); +restart: + rt = fn->leaf; + rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); + BACKTRACK(net, &fl6->saddr); +out: + dst_use(&rt->dst, jiffies); + read_unlock_bh(&table->tb6_lock); + return rt; + +} + +struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, + const struct in6_addr *saddr, int oif, int strict) +{ + struct flowi6 fl6 = { + .flowi6_oif = oif, + .daddr = *daddr, + }; + struct dst_entry *dst; + int flags = strict ? RT6_LOOKUP_F_IFACE : 0; + + if (saddr) { + memcpy(&fl6.saddr, saddr, sizeof(*saddr)); + flags |= RT6_LOOKUP_F_HAS_SADDR; + } + + dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup); + if (dst->error == 0) + return (struct rt6_info *) dst; + + dst_release(dst); + + return NULL; +} + +EXPORT_SYMBOL(rt6_lookup); + +/* ip6_ins_rt is called with FREE table->tb6_lock. + It takes new route entry, the addition fails by any reason the + route is freed. In any case, if caller does not hold it, it may + be destroyed. + */ + +static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info) +{ + int err; + struct fib6_table *table; + + table = rt->rt6i_table; + write_lock_bh(&table->tb6_lock); + err = fib6_add(&table->tb6_root, rt, info); + write_unlock_bh(&table->tb6_lock); + + return err; +} + +int ip6_ins_rt(struct rt6_info *rt) +{ + struct nl_info info = { + .nl_net = dev_net(rt->rt6i_dev), + }; + return __ip6_ins_rt(rt, &info); +} + +static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, const struct in6_addr *daddr, + const struct in6_addr *saddr) +{ + struct rt6_info *rt; + + /* + * Clone the route. 
+ */ + + rt = ip6_rt_copy(ort); + + if (rt) { + struct neighbour *neigh; + int attempts = !in_softirq(); + + if (!(rt->rt6i_flags&RTF_GATEWAY)) { + if (rt->rt6i_dst.plen != 128 && + ipv6_addr_equal(&rt->rt6i_dst.addr, daddr)) + rt->rt6i_flags |= RTF_ANYCAST; + ipv6_addr_copy(&rt->rt6i_gateway, daddr); + } + + ipv6_addr_copy(&rt->rt6i_dst.addr, daddr); + rt->rt6i_dst.plen = 128; + rt->rt6i_flags |= RTF_CACHE; + rt->dst.flags |= DST_HOST; + +#ifdef CONFIG_IPV6_SUBTREES + if (rt->rt6i_src.plen && saddr) { + ipv6_addr_copy(&rt->rt6i_src.addr, saddr); + rt->rt6i_src.plen = 128; + } +#endif + + retry: + neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); + if (IS_ERR(neigh)) { + struct net *net = dev_net(rt->rt6i_dev); + int saved_rt_min_interval = + net->ipv6.sysctl.ip6_rt_gc_min_interval; + int saved_rt_elasticity = + net->ipv6.sysctl.ip6_rt_gc_elasticity; + + if (attempts-- > 0) { + net->ipv6.sysctl.ip6_rt_gc_elasticity = 1; + net->ipv6.sysctl.ip6_rt_gc_min_interval = 0; + + ip6_dst_gc(&net->ipv6.ip6_dst_ops); + + net->ipv6.sysctl.ip6_rt_gc_elasticity = + saved_rt_elasticity; + net->ipv6.sysctl.ip6_rt_gc_min_interval = + saved_rt_min_interval; + goto retry; + } + + if (net_ratelimit()) + printk(KERN_WARNING + "ipv6: Neighbour table overflow.\n"); + dst_free(&rt->dst); + return NULL; + } + dst_set_neighbour(&rt->dst, neigh); + } + + return rt; +} + +static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, const struct in6_addr *daddr) +{ + struct rt6_info *rt = ip6_rt_copy(ort); + if (rt) { + ipv6_addr_copy(&rt->rt6i_dst.addr, daddr); + rt->rt6i_dst.plen = 128; + rt->rt6i_flags |= RTF_CACHE; + rt->dst.flags |= DST_HOST; + dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst))); + } + return rt; +} + +static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, + struct flowi6 *fl6, int flags) +{ + struct fib6_node *fn; + struct rt6_info *rt, *nrt; + int strict = 0; + int attempts = 3; + int err; + int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE; + + strict |= flags & RT6_LOOKUP_F_IFACE; + +relookup: + read_lock_bh(&table->tb6_lock); + +restart_2: + fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); + +restart: + rt = rt6_select(fn, oif, strict | reachable); + + BACKTRACK(net, &fl6->saddr); + if (rt == net->ipv6.ip6_null_entry || + rt->rt6i_flags & RTF_CACHE) + goto out; + + dst_hold(&rt->dst); + read_unlock_bh(&table->tb6_lock); + + if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP)) + nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr); + else if (!(rt->dst.flags & DST_HOST)) + nrt = rt6_alloc_clone(rt, &fl6->daddr); + else + goto out2; + + dst_release(&rt->dst); + rt = nrt ? : net->ipv6.ip6_null_entry; + + dst_hold(&rt->dst); + if (nrt) { + err = ip6_ins_rt(nrt); + if (!err) + goto out2; + } + + if (--attempts <= 0) + goto out2; + + /* + * Race condition! In the gap, when table->tb6_lock was + * released someone could insert this route. Relookup. 
+ */ + dst_release(&rt->dst); + goto relookup; + +out: + if (reachable) { + reachable = 0; + goto restart_2; + } + dst_hold(&rt->dst); + read_unlock_bh(&table->tb6_lock); +out2: + rt->dst.lastuse = jiffies; + rt->dst.__use++; + + return rt; +} + +static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, + struct flowi6 *fl6, int flags) +{ + return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags); +} + +void ip6_route_input(struct sk_buff *skb) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct net *net = dev_net(skb->dev); + int flags = RT6_LOOKUP_F_HAS_SADDR; + struct flowi6 fl6 = { + .flowi6_iif = skb->dev->ifindex, + .daddr = iph->daddr, + .saddr = iph->saddr, + .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK, + .flowi6_mark = skb->mark, + .flowi6_proto = iph->nexthdr, + }; + + if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG) + flags |= RT6_LOOKUP_F_IFACE; + + skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input)); +} + +static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table, + struct flowi6 *fl6, int flags) +{ + return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags); +} + +struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk, + struct flowi6 *fl6) +{ + int flags = 0; + + if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr)) + flags |= RT6_LOOKUP_F_IFACE; + + if (!ipv6_addr_any(&fl6->saddr)) + flags |= RT6_LOOKUP_F_HAS_SADDR; + else if (sk) + flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); + + return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output); +} + +EXPORT_SYMBOL(ip6_route_output); + +struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) +{ + struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; + struct dst_entry *new = NULL; + + rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0); + if (rt) { + memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry)); + + new = &rt->dst; + + new->__use = 1; + new->input = dst_discard; + new->output = dst_discard; + + dst_copy_metrics(new, &ort->dst); + rt->rt6i_idev = ort->rt6i_idev; + if (rt->rt6i_idev) + in6_dev_hold(rt->rt6i_idev); + rt->rt6i_expires = 0; + + ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway); + rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES; + rt->rt6i_metric = 0; + + memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); +#ifdef CONFIG_IPV6_SUBTREES + memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); +#endif + + dst_free(new); + } + + dst_release(dst_orig); + return new ? 
new : ERR_PTR(-ENOMEM); +} + +/* + * Destination cache support functions + */ + +static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) +{ + struct rt6_info *rt; + + rt = (struct rt6_info *) dst; + + if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) { + if (rt->rt6i_peer_genid != rt6_peer_genid()) { + if (!rt->rt6i_peer) + rt6_bind_peer(rt, 0); + rt->rt6i_peer_genid = rt6_peer_genid(); + } + return dst; + } + return NULL; +} + +static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) +{ + struct rt6_info *rt = (struct rt6_info *) dst; + + if (rt) { + if (rt->rt6i_flags & RTF_CACHE) { + if (rt6_check_expired(rt)) { + ip6_del_rt(rt); + dst = NULL; + } + } else { + dst_release(dst); + dst = NULL; + } + } + return dst; +} + +static void ip6_link_failure(struct sk_buff *skb) +{ + struct rt6_info *rt; + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); + + rt = (struct rt6_info *) skb_dst(skb); + if (rt) { + if (rt->rt6i_flags&RTF_CACHE) { + dst_set_expires(&rt->dst, 0); + rt->rt6i_flags |= RTF_EXPIRES; + } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) + rt->rt6i_node->fn_sernum = -1; + } +} + +static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu) +{ + struct rt6_info *rt6 = (struct rt6_info*)dst; + + if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) { + rt6->rt6i_flags |= RTF_MODIFIED; + if (mtu < IPV6_MIN_MTU) { + u32 features = dst_metric(dst, RTAX_FEATURES); + mtu = IPV6_MIN_MTU; + features |= RTAX_FEATURE_ALLFRAG; + dst_metric_set(dst, RTAX_FEATURES, features); + } + dst_metric_set(dst, RTAX_MTU, mtu); + } +} + +static unsigned int ip6_default_advmss(const struct dst_entry *dst) +{ + struct net_device *dev = dst->dev; + unsigned int mtu = dst_mtu(dst); + struct net *net = dev_net(dev); + + mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); + + if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) + mtu = net->ipv6.sysctl.ip6_rt_min_advmss; + + /* + * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and + * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
+ * IPV6_MAXPLEN is also valid and means: "any MSS, + * rely only on pmtu discovery" + */ + if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) + mtu = IPV6_MAXPLEN; + return mtu; +} + +static unsigned int ip6_default_mtu(const struct dst_entry *dst) +{ + unsigned int mtu = IPV6_MIN_MTU; + struct inet6_dev *idev; + + rcu_read_lock(); + idev = __in6_dev_get(dst->dev); + if (idev) + mtu = idev->cnf.mtu6; + rcu_read_unlock(); + + return mtu; +} + +static struct dst_entry *icmp6_dst_gc_list; +static DEFINE_SPINLOCK(icmp6_dst_lock); + +struct dst_entry *icmp6_dst_alloc(struct net_device *dev, + struct neighbour *neigh, + const struct in6_addr *addr) +{ + struct rt6_info *rt; + struct inet6_dev *idev = in6_dev_get(dev); + struct net *net = dev_net(dev); + + if (unlikely(idev == NULL)) + return NULL; + + rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0); + if (unlikely(rt == NULL)) { + in6_dev_put(idev); + goto out; + } + + if (neigh) + neigh_hold(neigh); + else { + neigh = ndisc_get_neigh(dev, addr); + if (IS_ERR(neigh)) + neigh = NULL; + } + + rt->rt6i_idev = idev; + dst_set_neighbour(&rt->dst, neigh); + atomic_set(&rt->dst.__refcnt, 1); + dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255); + rt->dst.output = ip6_output; + + spin_lock_bh(&icmp6_dst_lock); + rt->dst.next = icmp6_dst_gc_list; + icmp6_dst_gc_list = &rt->dst; + spin_unlock_bh(&icmp6_dst_lock); + + fib6_force_start_gc(net); + +out: + return &rt->dst; +} + +int icmp6_dst_gc(void) +{ + struct dst_entry *dst, **pprev; + int more = 0; + + spin_lock_bh(&icmp6_dst_lock); + pprev = &icmp6_dst_gc_list; + + while ((dst = *pprev) != NULL) { + if (!atomic_read(&dst->__refcnt)) { + *pprev = dst->next; + dst_free(dst); + } else { + pprev = &dst->next; + ++more; + } + } + + spin_unlock_bh(&icmp6_dst_lock); + + return more; +} + +static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg), + void *arg) +{ + struct dst_entry *dst, **pprev; + + spin_lock_bh(&icmp6_dst_lock); + pprev = &icmp6_dst_gc_list; + while ((dst = *pprev) != NULL) { + struct rt6_info *rt = (struct rt6_info *) dst; + if (func(rt, arg)) { + *pprev = dst->next; + dst_free(dst); + } else { + pprev = &dst->next; + } + } + spin_unlock_bh(&icmp6_dst_lock); +} + +static int ip6_dst_gc(struct dst_ops *ops) +{ + unsigned long now = jiffies; + struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); + int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; + int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; + int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; + int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; + unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; + int entries; + + entries = dst_entries_get_fast(ops); + if (time_after(rt_last_gc + rt_min_interval, now) && + entries <= rt_max_size) + goto out; + + net->ipv6.ip6_rt_gc_expire++; + fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net); + net->ipv6.ip6_rt_last_gc = now; + entries = dst_entries_get_slow(ops); + if (entries < ops->gc_thresh) + net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; +out: + net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; + return entries > rt_max_size; +} + +/* Clean host part of a prefix. Not necessary in radix tree, + but results in cleaner routing tables. + + Remove it only when all the things will work! 
+ */ + +int ip6_dst_hoplimit(struct dst_entry *dst) +{ + int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); + if (hoplimit == 0) { + struct net_device *dev = dst->dev; + struct inet6_dev *idev; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev) + hoplimit = idev->cnf.hop_limit; + else + hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit; + rcu_read_unlock(); + } + return hoplimit; +} +EXPORT_SYMBOL(ip6_dst_hoplimit); + +/* + * + */ + +int ip6_route_add(struct fib6_config *cfg) +{ + int err; + struct net *net = cfg->fc_nlinfo.nl_net; + struct rt6_info *rt = NULL; + struct net_device *dev = NULL; + struct inet6_dev *idev = NULL; + struct fib6_table *table; + int addr_type; + + if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128) + return -EINVAL; +#ifndef CONFIG_IPV6_SUBTREES + if (cfg->fc_src_len) + return -EINVAL; +#endif + if (cfg->fc_ifindex) { + err = -ENODEV; + dev = dev_get_by_index(net, cfg->fc_ifindex); + if (!dev) + goto out; + idev = in6_dev_get(dev); + if (!idev) + goto out; + } + + if (cfg->fc_metric == 0) + cfg->fc_metric = IP6_RT_PRIO_USER; + + table = fib6_new_table(net, cfg->fc_table); + if (table == NULL) { + err = -ENOBUFS; + goto out; + } + + rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT); + + if (rt == NULL) { + err = -ENOMEM; + goto out; + } + + rt->dst.obsolete = -1; + rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ? + jiffies + clock_t_to_jiffies(cfg->fc_expires) : + 0; + + if (cfg->fc_protocol == RTPROT_UNSPEC) + cfg->fc_protocol = RTPROT_BOOT; + rt->rt6i_protocol = cfg->fc_protocol; + + addr_type = ipv6_addr_type(&cfg->fc_dst); + + if (addr_type & IPV6_ADDR_MULTICAST) + rt->dst.input = ip6_mc_input; + else if (cfg->fc_flags & RTF_LOCAL) + rt->dst.input = ip6_input; + else + rt->dst.input = ip6_forward; + + rt->dst.output = ip6_output; + + ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); + rt->rt6i_dst.plen = cfg->fc_dst_len; + if (rt->rt6i_dst.plen == 128) + rt->dst.flags |= DST_HOST; + +#ifdef CONFIG_IPV6_SUBTREES + ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); + rt->rt6i_src.plen = cfg->fc_src_len; +#endif + + rt->rt6i_metric = cfg->fc_metric; + + /* We cannot add true routes via loopback here, + they would result in kernel looping; promote them to reject routes + */ + if ((cfg->fc_flags & RTF_REJECT) || + (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK) + && !(cfg->fc_flags&RTF_LOCAL))) { + /* hold loopback dev/idev if we haven't done so. */ + if (dev != net->loopback_dev) { + if (dev) { + dev_put(dev); + in6_dev_put(idev); + } + dev = net->loopback_dev; + dev_hold(dev); + idev = in6_dev_get(dev); + if (!idev) { + err = -ENODEV; + goto out; + } + } + rt->dst.output = ip6_pkt_discard_out; + rt->dst.input = ip6_pkt_discard; + rt->dst.error = -ENETUNREACH; + rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP; + goto install_route; + } + + if (cfg->fc_flags & RTF_GATEWAY) { + const struct in6_addr *gw_addr; + int gwa_type; + + gw_addr = &cfg->fc_gateway; + ipv6_addr_copy(&rt->rt6i_gateway, gw_addr); + gwa_type = ipv6_addr_type(gw_addr); + + if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { + struct rt6_info *grt; + + /* IPv6 strictly inhibits using not link-local + addresses as nexthop address. + Otherwise, router will not able to send redirects. + It is very good, but in some (rare!) circumstances + (SIT, PtP, NBMA NOARP links) it is handy to allow + some exceptions. 
--ANK + */ + err = -EINVAL; + if (!(gwa_type&IPV6_ADDR_UNICAST)) + goto out; + + grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); + + err = -EHOSTUNREACH; + if (grt == NULL) + goto out; + if (dev) { + if (dev != grt->rt6i_dev) { + dst_release(&grt->dst); + goto out; + } + } else { + dev = grt->rt6i_dev; + idev = grt->rt6i_idev; + dev_hold(dev); + in6_dev_hold(grt->rt6i_idev); + } + if (!(grt->rt6i_flags&RTF_GATEWAY)) + err = 0; + dst_release(&grt->dst); + + if (err) + goto out; + } + err = -EINVAL; + if (dev == NULL || (dev->flags&IFF_LOOPBACK)) + goto out; + } + + err = -ENODEV; + if (dev == NULL) + goto out; + + if (!ipv6_addr_any(&cfg->fc_prefsrc)) { + if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { + err = -EINVAL; + goto out; + } + ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc); + rt->rt6i_prefsrc.plen = 128; + } else + rt->rt6i_prefsrc.plen = 0; + + if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) { + struct neighbour *neigh = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev); + if (IS_ERR(neigh)) { + err = PTR_ERR(neigh); + goto out; + } + dst_set_neighbour(&rt->dst, neigh); + } + + rt->rt6i_flags = cfg->fc_flags; + +install_route: + if (cfg->fc_mx) { + struct nlattr *nla; + int remaining; + + nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { + int type = nla_type(nla); + + if (type) { + if (type > RTAX_MAX) { + err = -EINVAL; + goto out; + } + + dst_metric_set(&rt->dst, type, nla_get_u32(nla)); + } + } + } + + rt->dst.dev = dev; + rt->rt6i_idev = idev; + rt->rt6i_table = table; + + cfg->fc_nlinfo.nl_net = dev_net(dev); + + return __ip6_ins_rt(rt, &cfg->fc_nlinfo); + +out: + if (dev) + dev_put(dev); + if (idev) + in6_dev_put(idev); + if (rt) + dst_free(&rt->dst); + return err; +} + +static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) +{ + int err; + struct fib6_table *table; + struct net *net = dev_net(rt->rt6i_dev); + + if (rt == net->ipv6.ip6_null_entry) + return -ENOENT; + + table = rt->rt6i_table; + write_lock_bh(&table->tb6_lock); + + err = fib6_del(rt, info); + dst_release(&rt->dst); + + write_unlock_bh(&table->tb6_lock); + + return err; +} + +int ip6_del_rt(struct rt6_info *rt) +{ + struct nl_info info = { + .nl_net = dev_net(rt->rt6i_dev), + }; + return __ip6_del_rt(rt, &info); +} + +static int ip6_route_del(struct fib6_config *cfg) +{ + struct fib6_table *table; + struct fib6_node *fn; + struct rt6_info *rt; + int err = -ESRCH; + + table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); + if (table == NULL) + return err; + + read_lock_bh(&table->tb6_lock); + + fn = fib6_locate(&table->tb6_root, + &cfg->fc_dst, cfg->fc_dst_len, + &cfg->fc_src, cfg->fc_src_len); + + if (fn) { + for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + if (cfg->fc_ifindex && + (rt->rt6i_dev == NULL || + rt->rt6i_dev->ifindex != cfg->fc_ifindex)) + continue; + if (cfg->fc_flags & RTF_GATEWAY && + !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) + continue; + if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric) + continue; + dst_hold(&rt->dst); + read_unlock_bh(&table->tb6_lock); + + return __ip6_del_rt(rt, &cfg->fc_nlinfo); + } + } + read_unlock_bh(&table->tb6_lock); + + return err; +} + +/* + * Handle redirects + */ +struct ip6rd_flowi { + struct flowi6 fl6; + struct in6_addr gateway; +}; + +static struct rt6_info *__ip6_route_redirect(struct net *net, + struct fib6_table *table, + struct flowi6 *fl6, + int flags) +{ + struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; + struct rt6_info *rt; + struct fib6_node *fn; + + /* + 
* Get the "current" route for this destination and + * check if the redirect has come from approriate router. + * + * RFC 2461 specifies that redirects should only be + * accepted if they come from the nexthop to the target. + * Due to the way the routes are chosen, this notion + * is a bit fuzzy and one might need to check all possible + * routes. + */ + + read_lock_bh(&table->tb6_lock); + fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); +restart: + for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + /* + * Current route is on-link; redirect is always invalid. + * + * Seems, previous statement is not true. It could + * be node, which looks for us as on-link (f.e. proxy ndisc) + * But then router serving it might decide, that we should + * know truth 8)8) --ANK (980726). + */ + if (rt6_check_expired(rt)) + continue; + if (!(rt->rt6i_flags & RTF_GATEWAY)) + continue; + if (fl6->flowi6_oif != rt->rt6i_dev->ifindex) + continue; + if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) + continue; + break; + } + + if (!rt) + rt = net->ipv6.ip6_null_entry; + BACKTRACK(net, &fl6->saddr); +out: + dst_hold(&rt->dst); + + read_unlock_bh(&table->tb6_lock); + + return rt; +}; + +static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest, + const struct in6_addr *src, + const struct in6_addr *gateway, + struct net_device *dev) +{ + int flags = RT6_LOOKUP_F_HAS_SADDR; + struct net *net = dev_net(dev); + struct ip6rd_flowi rdfl = { + .fl6 = { + .flowi6_oif = dev->ifindex, + .daddr = *dest, + .saddr = *src, + }, + }; + + ipv6_addr_copy(&rdfl.gateway, gateway); + + if (rt6_need_strict(dest)) + flags |= RT6_LOOKUP_F_IFACE; + + return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6, + flags, __ip6_route_redirect); +} + +void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src, + const struct in6_addr *saddr, + struct neighbour *neigh, u8 *lladdr, int on_link) +{ + struct rt6_info *rt, *nrt = NULL; + struct netevent_redirect netevent; + struct net *net = dev_net(neigh->dev); + + rt = ip6_route_redirect(dest, src, saddr, neigh->dev); + + if (rt == net->ipv6.ip6_null_entry) { + if (net_ratelimit()) + printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop " + "for redirect target\n"); + goto out; + } + + /* + * We have finally decided to accept it. + */ + + neigh_update(neigh, lladdr, NUD_STALE, + NEIGH_UPDATE_F_WEAK_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE| + (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| + NEIGH_UPDATE_F_ISROUTER)) + ); + + /* + * Redirect received -> path was valid. + * Look, redirects are sent only in response to data packets, + * so that this nexthop apparently is reachable. --ANK + */ + dst_confirm(&rt->dst); + + /* Duplicate redirect: silently ignore. */ + if (neigh == dst_get_neighbour_raw(&rt->dst)) + goto out; + + nrt = ip6_rt_copy(rt); + if (nrt == NULL) + goto out; + + nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; + if (on_link) + nrt->rt6i_flags &= ~RTF_GATEWAY; + + ipv6_addr_copy(&nrt->rt6i_dst.addr, dest); + nrt->rt6i_dst.plen = 128; + nrt->dst.flags |= DST_HOST; + + ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key); + dst_set_neighbour(&nrt->dst, neigh_clone(neigh)); + + if (ip6_ins_rt(nrt)) + goto out; + + netevent.old = &rt->dst; + netevent.new = &nrt->dst; + call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); + + if (rt->rt6i_flags&RTF_CACHE) { + ip6_del_rt(rt); + return; + } + +out: + dst_release(&rt->dst); +} + +/* + * Handle ICMP "packet too big" messages + * i.e. 
Path MTU discovery + */ + +static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr, + struct net *net, u32 pmtu, int ifindex) +{ + struct rt6_info *rt, *nrt; + int allfrag = 0; +again: + rt = rt6_lookup(net, daddr, saddr, ifindex, 0); + if (rt == NULL) + return; + + if (rt6_check_expired(rt)) { + ip6_del_rt(rt); + goto again; + } + + if (pmtu >= dst_mtu(&rt->dst)) + goto out; + + if (pmtu < IPV6_MIN_MTU) { + /* + * According to RFC 2460, PMTU is set to the IPv6 Minimum Link + * MTU (1280) and a fragment header should always be included + * after a node receives a Packet Too Big message reporting a PMTU + * less than the IPv6 Minimum Link MTU. + */ + pmtu = IPV6_MIN_MTU; + allfrag = 1; + } + + /* New MTU received -> path was valid. + Too Big messages are sent only in response to data packets, + so this nexthop is apparently reachable. --ANK + */ + dst_confirm(&rt->dst); + + /* Host route. If it is static, it would be better + not to override it but to add a new one, so that + when the cache entry expires the old pmtu + is restored automatically. + */ + if (rt->rt6i_flags & RTF_CACHE) { + dst_metric_set(&rt->dst, RTAX_MTU, pmtu); + if (allfrag) { + u32 features = dst_metric(&rt->dst, RTAX_FEATURES); + features |= RTAX_FEATURE_ALLFRAG; + dst_metric_set(&rt->dst, RTAX_FEATURES, features); + } + dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires); + rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES; + goto out; + } + + /* Network route. + Two cases are possible: + 1. It is a connected route. Action: COW. + 2. It is a gatewayed or NONEXTHOP route. Action: clone it. + */ + if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP)) + nrt = rt6_alloc_cow(rt, daddr, saddr); + else + nrt = rt6_alloc_clone(rt, daddr); + + if (nrt) { + dst_metric_set(&nrt->dst, RTAX_MTU, pmtu); + if (allfrag) { + u32 features = dst_metric(&nrt->dst, RTAX_FEATURES); + features |= RTAX_FEATURE_ALLFRAG; + dst_metric_set(&nrt->dst, RTAX_FEATURES, features); + } + + /* According to RFC 1981, a PMTU increase shouldn't be + * detected within 5 minutes; the recommended timer is 10 minutes. + * Here the route expiration time is set to ip6_rt_mtu_expires, + * which defaults to 10 minutes. After that the decreased pmtu + * expires and PMTU increase detection happens automatically. + */ + dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires); + nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES; + + ip6_ins_rt(nrt); + } +out: + dst_release(&rt->dst); +} + +void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr, + struct net_device *dev, u32 pmtu) +{ + struct net *net = dev_net(dev); + + /* + * RFC 1981 states that a node "MUST reduce the size of the packets it + * is sending along the path" that caused the Packet Too Big message. + * Since it's not possible in the general case to determine which + * interface was used to send the original packet, we update the MTU + * on the interface that will be used to send future packets. We also + * update the MTU on the interface that received the Packet Too Big in + * case the original packet was forced out that interface with + * SO_BINDTODEVICE or similar. This is the next best thing to the + * correct behaviour, which would be to update the MTU on all + * interfaces.
+ */ + rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0); + rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex); +} + +/* + * Misc support functions + */ + +static struct rt6_info * ip6_rt_copy(struct rt6_info *ort) +{ + struct net *net = dev_net(ort->rt6i_dev); + struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, + ort->dst.dev, 0); + + if (rt) { + rt->dst.input = ort->dst.input; + rt->dst.output = ort->dst.output; + + dst_copy_metrics(&rt->dst, &ort->dst); + rt->dst.error = ort->dst.error; + rt->rt6i_idev = ort->rt6i_idev; + if (rt->rt6i_idev) + in6_dev_hold(rt->rt6i_idev); + rt->dst.lastuse = jiffies; + rt->rt6i_expires = 0; + + ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway); + rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES; + rt->rt6i_metric = 0; + + memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); +#ifdef CONFIG_IPV6_SUBTREES + memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); +#endif + memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key)); + rt->rt6i_table = ort->rt6i_table; + } + return rt; +} + +#ifdef CONFIG_IPV6_ROUTE_INFO +static struct rt6_info *rt6_get_route_info(struct net *net, + const struct in6_addr *prefix, int prefixlen, + const struct in6_addr *gwaddr, int ifindex) +{ + struct fib6_node *fn; + struct rt6_info *rt = NULL; + struct fib6_table *table; + + table = fib6_get_table(net, RT6_TABLE_INFO); + if (table == NULL) + return NULL; + + write_lock_bh(&table->tb6_lock); + fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0); + if (!fn) + goto out; + + for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + if (rt->rt6i_dev->ifindex != ifindex) + continue; + if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) + continue; + if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr)) + continue; + dst_hold(&rt->dst); + break; + } +out: + write_unlock_bh(&table->tb6_lock); + return rt; +} + +static struct rt6_info *rt6_add_route_info(struct net *net, + const struct in6_addr *prefix, int prefixlen, + const struct in6_addr *gwaddr, int ifindex, + unsigned pref) +{ + struct fib6_config cfg = { + .fc_table = RT6_TABLE_INFO, + .fc_metric = IP6_RT_PRIO_USER, + .fc_ifindex = ifindex, + .fc_dst_len = prefixlen, + .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | + RTF_UP | RTF_PREF(pref), + .fc_nlinfo.pid = 0, + .fc_nlinfo.nlh = NULL, + .fc_nlinfo.nl_net = net, + }; + + ipv6_addr_copy(&cfg.fc_dst, prefix); + ipv6_addr_copy(&cfg.fc_gateway, gwaddr); + + /* We should treat it as a default route if prefix length is 0. 
*/ + if (!prefixlen) + cfg.fc_flags |= RTF_DEFAULT; + + ip6_route_add(&cfg); + + return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex); +} +#endif + +struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev) +{ + struct rt6_info *rt; + struct fib6_table *table; + + table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT); + if (table == NULL) + return NULL; + + write_lock_bh(&table->tb6_lock); + for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) { + if (dev == rt->rt6i_dev && + ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && + ipv6_addr_equal(&rt->rt6i_gateway, addr)) + break; + } + if (rt) + dst_hold(&rt->dst); + write_unlock_bh(&table->tb6_lock); + return rt; +} + +struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, + struct net_device *dev, + unsigned int pref) +{ + struct fib6_config cfg = { + .fc_table = RT6_TABLE_DFLT, + .fc_metric = IP6_RT_PRIO_USER, + .fc_ifindex = dev->ifindex, + .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | + RTF_UP | RTF_EXPIRES | RTF_PREF(pref), + .fc_nlinfo.pid = 0, + .fc_nlinfo.nlh = NULL, + .fc_nlinfo.nl_net = dev_net(dev), + }; + + ipv6_addr_copy(&cfg.fc_gateway, gwaddr); + + ip6_route_add(&cfg); + + return rt6_get_dflt_router(gwaddr, dev); +} + +void rt6_purge_dflt_routers(struct net *net) +{ + struct rt6_info *rt; + struct fib6_table *table; + + /* NOTE: Keep consistent with rt6_get_dflt_router */ + table = fib6_get_table(net, RT6_TABLE_DFLT); + if (table == NULL) + return; + +restart: + read_lock_bh(&table->tb6_lock); + for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { + if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) { + dst_hold(&rt->dst); + read_unlock_bh(&table->tb6_lock); + ip6_del_rt(rt); + goto restart; + } + } + read_unlock_bh(&table->tb6_lock); +} + +static void rtmsg_to_fib6_config(struct net *net, + struct in6_rtmsg *rtmsg, + struct fib6_config *cfg) +{ + memset(cfg, 0, sizeof(*cfg)); + + cfg->fc_table = RT6_TABLE_MAIN; + cfg->fc_ifindex = rtmsg->rtmsg_ifindex; + cfg->fc_metric = rtmsg->rtmsg_metric; + cfg->fc_expires = rtmsg->rtmsg_info; + cfg->fc_dst_len = rtmsg->rtmsg_dst_len; + cfg->fc_src_len = rtmsg->rtmsg_src_len; + cfg->fc_flags = rtmsg->rtmsg_flags; + + cfg->fc_nlinfo.nl_net = net; + + ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst); + ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src); + ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway); +} + +int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) +{ + struct fib6_config cfg; + struct in6_rtmsg rtmsg; + int err; + + switch(cmd) { + case SIOCADDRT: /* Add a route */ + case SIOCDELRT: /* Delete a route */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + err = copy_from_user(&rtmsg, arg, + sizeof(struct in6_rtmsg)); + if (err) + return -EFAULT; + + rtmsg_to_fib6_config(net, &rtmsg, &cfg); + + rtnl_lock(); + switch (cmd) { + case SIOCADDRT: + err = ip6_route_add(&cfg); + break; + case SIOCDELRT: + err = ip6_route_del(&cfg); + break; + default: + err = -EINVAL; + } + rtnl_unlock(); + + return err; + } + + return -EINVAL; +} + +/* + * Drop the packet on the floor + */ + +static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) +{ + int type; + struct dst_entry *dst = skb_dst(skb); + switch (ipstats_mib_noroutes) { + case IPSTATS_MIB_INNOROUTES: + type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); + if (type == IPV6_ADDR_ANY) { + IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), + IPSTATS_MIB_INADDRERRORS); + break; + } + /* 
FALLTHROUGH */ + case IPSTATS_MIB_OUTNOROUTES: + IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), + ipstats_mib_noroutes); + break; + } + icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); + kfree_skb(skb); + return 0; +} + +static int ip6_pkt_discard(struct sk_buff *skb) +{ + return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); +} + +static int ip6_pkt_discard_out(struct sk_buff *skb) +{ + skb->dev = skb_dst(skb)->dev; + return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); +} + +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + +static int ip6_pkt_prohibit(struct sk_buff *skb) +{ + return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); +} + +static int ip6_pkt_prohibit_out(struct sk_buff *skb) +{ + skb->dev = skb_dst(skb)->dev; + return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); +} + +#endif + +/* + * Allocate a dst for local (unicast / anycast) address. + */ + +struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, + const struct in6_addr *addr, + int anycast) +{ + struct net *net = dev_net(idev->dev); + struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, + net->loopback_dev, 0); + struct neighbour *neigh; + + if (rt == NULL) { + if (net_ratelimit()) + pr_warning("IPv6: Maximum number of routes reached," + " consider increasing route/max_size.\n"); + return ERR_PTR(-ENOMEM); + } + + in6_dev_hold(idev); + + rt->dst.flags |= DST_HOST; + rt->dst.input = ip6_input; + rt->dst.output = ip6_output; + rt->rt6i_idev = idev; + rt->dst.obsolete = -1; + + rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; + if (anycast) + rt->rt6i_flags |= RTF_ANYCAST; + else + rt->rt6i_flags |= RTF_LOCAL; + neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); + if (IS_ERR(neigh)) { + dst_free(&rt->dst); + + return ERR_CAST(neigh); + } + dst_set_neighbour(&rt->dst, neigh); + + ipv6_addr_copy(&rt->rt6i_dst.addr, addr); + rt->rt6i_dst.plen = 128; + rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL); + + atomic_set(&rt->dst.__refcnt, 1); + + return rt; +} + +int ip6_route_get_saddr(struct net *net, + struct rt6_info *rt, + const struct in6_addr *daddr, + unsigned int prefs, + struct in6_addr *saddr) +{ + struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt); + int err = 0; + if (rt->rt6i_prefsrc.plen) + ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr); + else + err = ipv6_dev_get_saddr(net, idev ? 
idev->dev : NULL, + daddr, prefs, saddr); + return err; +} + +/* remove deleted ip from prefsrc entries */ +struct arg_dev_net_ip { + struct net_device *dev; + struct net *net; + struct in6_addr *addr; +}; + +static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) +{ + struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; + struct net *net = ((struct arg_dev_net_ip *)arg)->net; + struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; + + if (((void *)rt->rt6i_dev == dev || dev == NULL) && + rt != net->ipv6.ip6_null_entry && + ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { + /* remove prefsrc entry */ + rt->rt6i_prefsrc.plen = 0; + } + return 0; +} + +void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) +{ + struct net *net = dev_net(ifp->idev->dev); + struct arg_dev_net_ip adni = { + .dev = ifp->idev->dev, + .net = net, + .addr = &ifp->addr, + }; + fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni); +} + +struct arg_dev_net { + struct net_device *dev; + struct net *net; +}; + +static int fib6_ifdown(struct rt6_info *rt, void *arg) +{ + const struct arg_dev_net *adn = arg; + const struct net_device *dev = adn->dev; + + if ((rt->rt6i_dev == dev || dev == NULL) && + rt != adn->net->ipv6.ip6_null_entry) { + RT6_TRACE("deleted by ifdown %p\n", rt); + return -1; + } + return 0; +} + +void rt6_ifdown(struct net *net, struct net_device *dev) +{ + struct arg_dev_net adn = { + .dev = dev, + .net = net, + }; + + fib6_clean_all(net, fib6_ifdown, 0, &adn); + icmp6_clean_all(fib6_ifdown, &adn); +} + +struct rt6_mtu_change_arg +{ + struct net_device *dev; + unsigned mtu; +}; + +static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) +{ + struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; + struct inet6_dev *idev; + + /* In IPv6, pmtu discovery is not optional, + so the RTAX_MTU lock cannot disable it. + We still use this lock to block changes + caused by addrconf/ndisc. + */ + + idev = __in6_dev_get(arg->dev); + if (idev == NULL) + return 0; + + /* For an administrative MTU increase there is no way to discover + the IPv6 PMTU increase, so the PMTU should be updated here. + Since RFC 1981 doesn't cover administrative MTU increases, + updating the PMTU on increase is a MUST (e.g. for jumbo frames). + */ + /* + If the new MTU is less than the route PMTU, the new MTU will be the + lowest MTU in the path; update the route PMTU to reflect the + decrease. If the new MTU is greater than the route PMTU, and the + old MTU was the lowest MTU in the path, update the route PMTU + to reflect the increase. In that case, if another node's MTU is + now the lowest in the path, a Too Big message will lead to + PMTU discovery.
+ */ + if (rt->rt6i_dev == arg->dev && + !dst_metric_locked(&rt->dst, RTAX_MTU) && + (dst_mtu(&rt->dst) >= arg->mtu || + (dst_mtu(&rt->dst) < arg->mtu && + dst_mtu(&rt->dst) == idev->cnf.mtu6))) { + dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); + } + return 0; +} + +void rt6_mtu_change(struct net_device *dev, unsigned mtu) +{ + struct rt6_mtu_change_arg arg = { + .dev = dev, + .mtu = mtu, + }; + + fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg); +} + +static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { + [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, + [RTA_OIF] = { .type = NLA_U32 }, + [RTA_IIF] = { .type = NLA_U32 }, + [RTA_PRIORITY] = { .type = NLA_U32 }, + [RTA_METRICS] = { .type = NLA_NESTED }, +}; + +static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, + struct fib6_config *cfg) +{ + struct rtmsg *rtm; + struct nlattr *tb[RTA_MAX+1]; + int err; + + err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy); + if (err < 0) + goto errout; + + err = -EINVAL; + rtm = nlmsg_data(nlh); + memset(cfg, 0, sizeof(*cfg)); + + cfg->fc_table = rtm->rtm_table; + cfg->fc_dst_len = rtm->rtm_dst_len; + cfg->fc_src_len = rtm->rtm_src_len; + cfg->fc_flags = RTF_UP; + cfg->fc_protocol = rtm->rtm_protocol; + + if (rtm->rtm_type == RTN_UNREACHABLE) + cfg->fc_flags |= RTF_REJECT; + + if (rtm->rtm_type == RTN_LOCAL) + cfg->fc_flags |= RTF_LOCAL; + + cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid; + cfg->fc_nlinfo.nlh = nlh; + cfg->fc_nlinfo.nl_net = sock_net(skb->sk); + + if (tb[RTA_GATEWAY]) { + nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16); + cfg->fc_flags |= RTF_GATEWAY; + } + + if (tb[RTA_DST]) { + int plen = (rtm->rtm_dst_len + 7) >> 3; + + if (nla_len(tb[RTA_DST]) < plen) + goto errout; + + nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); + } + + if (tb[RTA_SRC]) { + int plen = (rtm->rtm_src_len + 7) >> 3; + + if (nla_len(tb[RTA_SRC]) < plen) + goto errout; + + nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); + } + + if (tb[RTA_PREFSRC]) + nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16); + + if (tb[RTA_OIF]) + cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); + + if (tb[RTA_PRIORITY]) + cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); + + if (tb[RTA_METRICS]) { + cfg->fc_mx = nla_data(tb[RTA_METRICS]); + cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); + } + + if (tb[RTA_TABLE]) + cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); + + err = 0; +errout: + return err; +} + +static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct fib6_config cfg; + int err; + + err = rtm_to_fib6_config(skb, nlh, &cfg); + if (err < 0) + return err; + + return ip6_route_del(&cfg); +} + +static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct fib6_config cfg; + int err; + + err = rtm_to_fib6_config(skb, nlh, &cfg); + if (err < 0) + return err; + + return ip6_route_add(&cfg); +} + +static inline size_t rt6_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct rtmsg)) + + nla_total_size(16) /* RTA_SRC */ + + nla_total_size(16) /* RTA_DST */ + + nla_total_size(16) /* RTA_GATEWAY */ + + nla_total_size(16) /* RTA_PREFSRC */ + + nla_total_size(4) /* RTA_TABLE */ + + nla_total_size(4) /* RTA_IIF */ + + nla_total_size(4) /* RTA_OIF */ + + nla_total_size(4) /* RTA_PRIORITY */ + + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ + + nla_total_size(sizeof(struct rta_cacheinfo)); +} + +static int rt6_fill_node(struct net *net, + struct sk_buff *skb, struct rt6_info *rt, + struct in6_addr *dst, struct in6_addr *src, + int iif, int 
type, u32 pid, u32 seq, + int prefix, int nowait, unsigned int flags) +{ + struct rtmsg *rtm; + struct nlmsghdr *nlh; + long expires; + u32 table; + struct neighbour *n; + + if (prefix) { /* user wants prefix routes only */ + if (!(rt->rt6i_flags & RTF_PREFIX_RT)) { + /* success since this is not a prefix route */ + return 1; + } + } + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags); + if (nlh == NULL) + return -EMSGSIZE; + + rtm = nlmsg_data(nlh); + rtm->rtm_family = AF_INET6; + rtm->rtm_dst_len = rt->rt6i_dst.plen; + rtm->rtm_src_len = rt->rt6i_src.plen; + rtm->rtm_tos = 0; + if (rt->rt6i_table) + table = rt->rt6i_table->tb6_id; + else + table = RT6_TABLE_UNSPEC; + rtm->rtm_table = table; + NLA_PUT_U32(skb, RTA_TABLE, table); + if (rt->rt6i_flags&RTF_REJECT) + rtm->rtm_type = RTN_UNREACHABLE; + else if (rt->rt6i_flags&RTF_LOCAL) + rtm->rtm_type = RTN_LOCAL; + else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK)) + rtm->rtm_type = RTN_LOCAL; + else + rtm->rtm_type = RTN_UNICAST; + rtm->rtm_flags = 0; + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + rtm->rtm_protocol = rt->rt6i_protocol; + if (rt->rt6i_flags&RTF_DYNAMIC) + rtm->rtm_protocol = RTPROT_REDIRECT; + else if (rt->rt6i_flags & RTF_ADDRCONF) + rtm->rtm_protocol = RTPROT_KERNEL; + else if (rt->rt6i_flags&RTF_DEFAULT) + rtm->rtm_protocol = RTPROT_RA; + + if (rt->rt6i_flags&RTF_CACHE) + rtm->rtm_flags |= RTM_F_CLONED; + + if (dst) { + NLA_PUT(skb, RTA_DST, 16, dst); + rtm->rtm_dst_len = 128; + } else if (rtm->rtm_dst_len) + NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr); +#ifdef CONFIG_IPV6_SUBTREES + if (src) { + NLA_PUT(skb, RTA_SRC, 16, src); + rtm->rtm_src_len = 128; + } else if (rtm->rtm_src_len) + NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr); +#endif + if (iif) { +#ifdef CONFIG_IPV6_MROUTE + if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) { + int err = ip6mr_get_route(net, skb, rtm, nowait); + if (err <= 0) { + if (!nowait) { + if (err == 0) + return 0; + goto nla_put_failure; + } else { + if (err == -EMSGSIZE) + goto nla_put_failure; + } + } + } else +#endif + NLA_PUT_U32(skb, RTA_IIF, iif); + } else if (dst) { + struct in6_addr saddr_buf; + if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0) + NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf); + } + + if (rt->rt6i_prefsrc.plen) { + struct in6_addr saddr_buf; + ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr); + NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf); + } + + if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) + goto nla_put_failure; + + rcu_read_lock(); + n = dst_get_neighbour(&rt->dst); + if (n) { + if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) { + rcu_read_unlock(); + goto nla_put_failure; + } + } + rcu_read_unlock(); + + if (rt->dst.dev) + NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex); + + NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric); + + if (!(rt->rt6i_flags & RTF_EXPIRES)) + expires = 0; + else if (rt->rt6i_expires - jiffies < INT_MAX) + expires = rt->rt6i_expires - jiffies; + else + expires = INT_MAX; + + if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0, + expires, rt->dst.error) < 0) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +int rt6_dump_route(struct rt6_info *rt, void *p_arg) +{ + struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; + int prefix; + + if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { + struct rtmsg *rtm = nlmsg_data(arg->cb->nlh); + prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0; + } else + prefix = 0; + + 
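+ /* emit this route as a single RTM_NEWROUTE message; NLM_F_MULTI below marks it as part of a multipart netlink dump */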
return rt6_fill_node(arg->net, + arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, + NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq, + prefix, 0, NLM_F_MULTI); +} + +static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) +{ + struct net *net = sock_net(in_skb->sk); + struct nlattr *tb[RTA_MAX+1]; + struct rt6_info *rt; + struct sk_buff *skb; + struct rtmsg *rtm; + struct flowi6 fl6; + int err, iif = 0; + + err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy); + if (err < 0) + goto errout; + + err = -EINVAL; + memset(&fl6, 0, sizeof(fl6)); + + if (tb[RTA_SRC]) { + if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) + goto errout; + + ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC])); + } + + if (tb[RTA_DST]) { + if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) + goto errout; + + ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST])); + } + + if (tb[RTA_IIF]) + iif = nla_get_u32(tb[RTA_IIF]); + + if (tb[RTA_OIF]) + fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]); + + if (iif) { + struct net_device *dev; + dev = __dev_get_by_index(net, iif); + if (!dev) { + err = -ENODEV; + goto errout; + } + } + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) { + err = -ENOBUFS; + goto errout; + } + + /* Reserve room for dummy headers, this skb can pass + through good chunk of routing engine. + */ + skb_reset_mac_header(skb); + skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr)); + + rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6); + skb_dst_set(skb, &rt->dst); + + err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif, + RTM_NEWROUTE, NETLINK_CB(in_skb).pid, + nlh->nlmsg_seq, 0, 0, 0); + if (err < 0) { + kfree_skb(skb); + goto errout; + } + + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); +errout: + return err; +} + +void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) +{ + struct sk_buff *skb; + struct net *net = info->nl_net; + u32 seq; + int err; + + err = -ENOBUFS; + seq = info->nlh != NULL ? 
info->nlh->nlmsg_seq : 0; + + skb = nlmsg_new(rt6_nlmsg_size(), gfp_any()); + if (skb == NULL) + goto errout; + + err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, + event, info->pid, seq, 0, 0, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE, + info->nlh, gfp_any()); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); +} + +static int ip6_route_dev_notify(struct notifier_block *this, + unsigned long event, void *data) +{ + struct net_device *dev = (struct net_device *)data; + struct net *net = dev_net(dev); + + if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) { + net->ipv6.ip6_null_entry->dst.dev = dev; + net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + net->ipv6.ip6_prohibit_entry->dst.dev = dev; + net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); + net->ipv6.ip6_blk_hole_entry->dst.dev = dev; + net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); +#endif + } + + return NOTIFY_OK; +} + +/* + * /proc + */ + +#ifdef CONFIG_PROC_FS + +struct rt6_proc_arg +{ + char *buffer; + int offset; + int length; + int skip; + int len; +}; + +static int rt6_info_route(struct rt6_info *rt, void *p_arg) +{ + struct seq_file *m = p_arg; + struct neighbour *n; + + seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen); + +#ifdef CONFIG_IPV6_SUBTREES + seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen); +#else + seq_puts(m, "00000000000000000000000000000000 00 "); +#endif + rcu_read_lock(); + n = dst_get_neighbour(&rt->dst); + if (n) { + seq_printf(m, "%pi6", n->primary_key); + } else { + seq_puts(m, "00000000000000000000000000000000"); + } + rcu_read_unlock(); + seq_printf(m, " %08x %08x %08x %08x %8s\n", + rt->rt6i_metric, atomic_read(&rt->dst.__refcnt), + rt->dst.__use, rt->rt6i_flags, + rt->rt6i_dev ? 
rt->rt6i_dev->name : ""); + return 0; +} + +static int ipv6_route_show(struct seq_file *m, void *v) +{ + struct net *net = (struct net *)m->private; + fib6_clean_all(net, rt6_info_route, 0, m); + return 0; +} + +static int ipv6_route_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, ipv6_route_show); +} + +static const struct file_operations ipv6_route_proc_fops = { + .owner = THIS_MODULE, + .open = ipv6_route_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; + +static int rt6_stats_seq_show(struct seq_file *seq, void *v) +{ + struct net *net = (struct net *)seq->private; + seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", + net->ipv6.rt6_stats->fib_nodes, + net->ipv6.rt6_stats->fib_route_nodes, + net->ipv6.rt6_stats->fib_rt_alloc, + net->ipv6.rt6_stats->fib_rt_entries, + net->ipv6.rt6_stats->fib_rt_cache, + dst_entries_get_slow(&net->ipv6.ip6_dst_ops), + net->ipv6.rt6_stats->fib_discarded_routes); + + return 0; +} + +static int rt6_stats_seq_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, rt6_stats_seq_show); +} + +static const struct file_operations rt6_stats_seq_fops = { + .owner = THIS_MODULE, + .open = rt6_stats_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_SYSCTL + +static +int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net; + int delay; + if (!write) + return -EINVAL; + + net = (struct net *)ctl->extra1; + delay = net->ipv6.sysctl.flush_delay; + proc_dointvec(ctl, write, buffer, lenp, ppos); + fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net); + return 0; +} + +ctl_table ipv6_route_table_template[] = { + { + .procname = "flush", + .data = &init_net.ipv6.sysctl.flush_delay, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = ipv6_sysctl_rtcache_flush + }, + { + .procname = "gc_thresh", + .data = &ip6_dst_ops_template.gc_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "max_size", + .data = &init_net.ipv6.sysctl.ip6_rt_max_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "gc_min_interval", + .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "gc_timeout", + .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "gc_interval", + .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "gc_elasticity", + .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "mtu_expires", + .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "min_adv_mss", + .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "gc_min_interval_ms", + .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { } +}; + +struct ctl_table * __net_init 
ipv6_route_sysctl_init(struct net *net) +{ + struct ctl_table *table; + + table = kmemdup(ipv6_route_table_template, + sizeof(ipv6_route_table_template), + GFP_KERNEL); + + if (table) { + table[0].data = &net->ipv6.sysctl.flush_delay; + table[0].extra1 = net; + table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; + table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; + table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; + table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; + table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; + table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; + table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; + table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; + table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; + } + + return table; +} +#endif + +static int __net_init ip6_route_net_init(struct net *net) +{ + int ret = -ENOMEM; + + memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, + sizeof(net->ipv6.ip6_dst_ops)); + + if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) + goto out_ip6_dst_ops; + + net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, + sizeof(*net->ipv6.ip6_null_entry), + GFP_KERNEL); + if (!net->ipv6.ip6_null_entry) + goto out_ip6_dst_entries; + net->ipv6.ip6_null_entry->dst.path = + (struct dst_entry *)net->ipv6.ip6_null_entry; + net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; + dst_init_metrics(&net->ipv6.ip6_null_entry->dst, + ip6_template_metrics, true); + +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, + sizeof(*net->ipv6.ip6_prohibit_entry), + GFP_KERNEL); + if (!net->ipv6.ip6_prohibit_entry) + goto out_ip6_null_entry; + net->ipv6.ip6_prohibit_entry->dst.path = + (struct dst_entry *)net->ipv6.ip6_prohibit_entry; + net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; + dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, + ip6_template_metrics, true); + + net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, + sizeof(*net->ipv6.ip6_blk_hole_entry), + GFP_KERNEL); + if (!net->ipv6.ip6_blk_hole_entry) + goto out_ip6_prohibit_entry; + net->ipv6.ip6_blk_hole_entry->dst.path = + (struct dst_entry *)net->ipv6.ip6_blk_hole_entry; + net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; + dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, + ip6_template_metrics, true); +#endif + + net->ipv6.sysctl.flush_delay = 0; + net->ipv6.sysctl.ip6_rt_max_size = 4096; + net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; + net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; + net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; + net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; + net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; + net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; + +#ifdef CONFIG_PROC_FS + proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops); + proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops); +#endif + net->ipv6.ip6_rt_gc_expire = 30*HZ; + + ret = 0; +out: + return ret; + +#ifdef CONFIG_IPV6_MULTIPLE_TABLES +out_ip6_prohibit_entry: + kfree(net->ipv6.ip6_prohibit_entry); +out_ip6_null_entry: + kfree(net->ipv6.ip6_null_entry); +#endif +out_ip6_dst_entries: + dst_entries_destroy(&net->ipv6.ip6_dst_ops); +out_ip6_dst_ops: + goto out; +} + +static void __net_exit ip6_route_net_exit(struct net *net) +{ +#ifdef CONFIG_PROC_FS + proc_net_remove(net, "ipv6_route"); + proc_net_remove(net, "rt6_stats"); +#endif + kfree(net->ipv6.ip6_null_entry); +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + 
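+ /* with policy routing enabled the prohibit and blackhole template routes were also allocated in ip6_route_net_init(); free them as well */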
kfree(net->ipv6.ip6_prohibit_entry); + kfree(net->ipv6.ip6_blk_hole_entry); +#endif + dst_entries_destroy(&net->ipv6.ip6_dst_ops); +} + +static struct pernet_operations ip6_route_net_ops = { + .init = ip6_route_net_init, + .exit = ip6_route_net_exit, +}; + +static struct notifier_block ip6_route_dev_notifier = { + .notifier_call = ip6_route_dev_notify, + .priority = 0, +}; + +int __init ip6_route_init(void) +{ + int ret; + + ret = -ENOMEM; + ip6_dst_ops_template.kmem_cachep = + kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!ip6_dst_ops_template.kmem_cachep) + goto out; + + ret = dst_entries_init(&ip6_dst_blackhole_ops); + if (ret) + goto out_kmem_cache; + + ret = register_pernet_subsys(&ip6_route_net_ops); + if (ret) + goto out_dst_entries; + + ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; + + /* Registering of the loopback is done before this portion of code, + * the loopback reference in rt6_info will not be taken, do it + * manually for init_net */ + init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + #ifdef CONFIG_IPV6_MULTIPLE_TABLES + init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + #endif + ret = fib6_init(); + if (ret) + goto out_register_subsys; + + ret = xfrm6_init(); + if (ret) + goto out_fib6_init; + + ret = fib6_rules_init(); + if (ret) + goto xfrm6_init; + + ret = -ENOBUFS; + if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) || + __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) || + __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL)) + goto fib6_rules_init; + + ret = register_netdevice_notifier(&ip6_route_dev_notifier); + if (ret) + goto fib6_rules_init; + +out: + return ret; + +fib6_rules_init: + fib6_rules_cleanup(); +xfrm6_init: + xfrm6_fini(); +out_fib6_init: + fib6_gc_cleanup(); +out_register_subsys: + unregister_pernet_subsys(&ip6_route_net_ops); +out_dst_entries: + dst_entries_destroy(&ip6_dst_blackhole_ops); +out_kmem_cache: + kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); + goto out; +} + +void ip6_route_cleanup(void) +{ + unregister_netdevice_notifier(&ip6_route_dev_notifier); + fib6_rules_cleanup(); + xfrm6_fini(); + fib6_gc_cleanup(); + unregister_pernet_subsys(&ip6_route_net_ops); + dst_entries_destroy(&ip6_dst_blackhole_ops); + kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); +} diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c new file mode 100644 index 00000000..f56acd09 --- /dev/null +++ b/net/ipv6/sit.c @@ -0,0 +1,1293 @@ +/* + * IPv6 over IPv4 tunnel device - Simple Internet Transition (SIT) + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * Alexey Kuznetsov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Changes: + * Roger Venning : 6to4 support + * Nate Thompson : 6to4 support + * Fred Templin : isatap support + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c + + For comments look at net/ipv4/ip_gre.c --ANK + */ + +#define HASH_SIZE 16 +#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) + +static int ipip6_tunnel_init(struct net_device *dev); +static void ipip6_tunnel_setup(struct net_device *dev); +static void ipip6_dev_free(struct net_device *dev); + +static int sit_net_id __read_mostly; +struct sit_net { + struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_wc[1]; + struct ip_tunnel __rcu **tunnels[4]; + + struct net_device *fb_tunnel_dev; +}; + +/* + * Locking : hash tables are protected by RCU and RTNL + */ + +#define for_each_ip_tunnel_rcu(start) \ + for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) + +/* often modified stats are per cpu, other are shared (netdev->stats) */ +struct pcpu_tstats { + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long tx_packets; + unsigned long tx_bytes; +}; + +static struct net_device_stats *ipip6_get_stats(struct net_device *dev) +{ + struct pcpu_tstats sum = { 0 }; + int i; + + for_each_possible_cpu(i) { + const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); + + sum.rx_packets += tstats->rx_packets; + sum.rx_bytes += tstats->rx_bytes; + sum.tx_packets += tstats->tx_packets; + sum.tx_bytes += tstats->tx_bytes; + } + dev->stats.rx_packets = sum.rx_packets; + dev->stats.rx_bytes = sum.rx_bytes; + dev->stats.tx_packets = sum.tx_packets; + dev->stats.tx_bytes = sum.tx_bytes; + return &dev->stats; +} +/* + * Must be invoked with rcu_read_lock + */ +static struct ip_tunnel * ipip6_tunnel_lookup(struct net *net, + struct net_device *dev, __be32 remote, __be32 local) +{ + unsigned int h0 = HASH(remote); + unsigned int h1 = HASH(local); + struct ip_tunnel *t; + struct sit_net *sitn = net_generic(net, sit_net_id); + + for_each_ip_tunnel_rcu(sitn->tunnels_r_l[h0 ^ h1]) { + if (local == t->parms.iph.saddr && + remote == t->parms.iph.daddr && + (!dev || !t->parms.link || dev->iflink == t->parms.link) && + (t->dev->flags & IFF_UP)) + return t; + } + for_each_ip_tunnel_rcu(sitn->tunnels_r[h0]) { + if (remote == t->parms.iph.daddr && + (!dev || !t->parms.link || dev->iflink == t->parms.link) && + (t->dev->flags & IFF_UP)) + return t; + } + for_each_ip_tunnel_rcu(sitn->tunnels_l[h1]) { + if (local == t->parms.iph.saddr && + (!dev || !t->parms.link || dev->iflink == t->parms.link) && + (t->dev->flags & IFF_UP)) + return t; + } + t = rcu_dereference(sitn->tunnels_wc[0]); + if ((t != NULL) && (t->dev->flags & IFF_UP)) + return t; + return NULL; +} + +static struct ip_tunnel __rcu **__ipip6_bucket(struct sit_net *sitn, + struct ip_tunnel_parm *parms) +{ + __be32 remote = parms->iph.daddr; + __be32 local = parms->iph.saddr; + unsigned int h = 0; + int prio = 0; + + if (remote) { + prio |= 2; + h ^= HASH(remote); + } + if (local) { + prio |= 1; + h ^= HASH(local); + } + return &sitn->tunnels[prio][h]; +} + +static 
inline struct ip_tunnel __rcu **ipip6_bucket(struct sit_net *sitn, + struct ip_tunnel *t) +{ + return __ipip6_bucket(sitn, &t->parms); +} + +static void ipip6_tunnel_unlink(struct sit_net *sitn, struct ip_tunnel *t) +{ + struct ip_tunnel __rcu **tp; + struct ip_tunnel *iter; + + for (tp = ipip6_bucket(sitn, t); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); + break; + } + } +} + +static void ipip6_tunnel_link(struct sit_net *sitn, struct ip_tunnel *t) +{ + struct ip_tunnel __rcu **tp = ipip6_bucket(sitn, t); + + rcu_assign_pointer(t->next, rtnl_dereference(*tp)); + rcu_assign_pointer(*tp, t); +} + +static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn) +{ +#ifdef CONFIG_IPV6_SIT_6RD + struct ip_tunnel *t = netdev_priv(dev); + + if (t->dev == sitn->fb_tunnel_dev) { + ipv6_addr_set(&t->ip6rd.prefix, htonl(0x20020000), 0, 0, 0); + t->ip6rd.relay_prefix = 0; + t->ip6rd.prefixlen = 16; + t->ip6rd.relay_prefixlen = 0; + } else { + struct ip_tunnel *t0 = netdev_priv(sitn->fb_tunnel_dev); + memcpy(&t->ip6rd, &t0->ip6rd, sizeof(t->ip6rd)); + } +#endif +} + +static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, + struct ip_tunnel_parm *parms, int create) +{ + __be32 remote = parms->iph.daddr; + __be32 local = parms->iph.saddr; + struct ip_tunnel *t, *nt; + struct ip_tunnel __rcu **tp; + struct net_device *dev; + char name[IFNAMSIZ]; + struct sit_net *sitn = net_generic(net, sit_net_id); + + for (tp = __ipip6_bucket(sitn, parms); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) { + if (local == t->parms.iph.saddr && + remote == t->parms.iph.daddr && + parms->link == t->parms.link) { + if (create) + return NULL; + else + return t; + } + } + if (!create) + goto failed; + + if (parms->name[0]) + strlcpy(name, parms->name, IFNAMSIZ); + else + strcpy(name, "sit%d"); + + dev = alloc_netdev(sizeof(*t), name, ipip6_tunnel_setup); + if (dev == NULL) + return NULL; + + dev_net_set(dev, net); + + nt = netdev_priv(dev); + + nt->parms = *parms; + if (ipip6_tunnel_init(dev) < 0) + goto failed_free; + ipip6_tunnel_clone_6rd(dev, sitn); + + if (parms->i_flags & SIT_ISATAP) + dev->priv_flags |= IFF_ISATAP; + + if (register_netdevice(dev) < 0) + goto failed_free; + + strcpy(nt->parms.name, dev->name); + + dev_hold(dev); + + ipip6_tunnel_link(sitn, nt); + return nt; + +failed_free: + ipip6_dev_free(dev); +failed: + return NULL; +} + +#define for_each_prl_rcu(start) \ + for (prl = rcu_dereference(start); \ + prl; \ + prl = rcu_dereference(prl->next)) + +static struct ip_tunnel_prl_entry * +__ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr) +{ + struct ip_tunnel_prl_entry *prl; + + for_each_prl_rcu(t->prl) + if (prl->addr == addr) + break; + return prl; + +} + +static int ipip6_tunnel_get_prl(struct ip_tunnel *t, + struct ip_tunnel_prl __user *a) +{ + struct ip_tunnel_prl kprl, *kp; + struct ip_tunnel_prl_entry *prl; + unsigned int cmax, c = 0, ca, len; + int ret = 0; + + if (copy_from_user(&kprl, a, sizeof(kprl))) + return -EFAULT; + cmax = kprl.datalen / sizeof(kprl); + if (cmax > 1 && kprl.addr != htonl(INADDR_ANY)) + cmax = 1; + + /* For simple GET or for root users, + * we try harder to allocate. + */ + kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ? + kcalloc(cmax, sizeof(*kp), GFP_KERNEL) : + NULL; + + rcu_read_lock(); + + ca = t->prl_count < cmax ? t->prl_count : cmax; + + if (!kp) { + /* We don't try hard to allocate much memory for + * non-root users. 
+ * For root users, retry allocating enough memory for + * the answer. + */ + kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC); + if (!kp) { + ret = -ENOMEM; + goto out; + } + } + + c = 0; + for_each_prl_rcu(t->prl) { + if (c >= cmax) + break; + if (kprl.addr != htonl(INADDR_ANY) && prl->addr != kprl.addr) + continue; + kp[c].addr = prl->addr; + kp[c].flags = prl->flags; + c++; + if (kprl.addr != htonl(INADDR_ANY)) + break; + } +out: + rcu_read_unlock(); + + len = sizeof(*kp) * c; + ret = 0; + if ((len && copy_to_user(a + 1, kp, len)) || put_user(len, &a->datalen)) + ret = -EFAULT; + + kfree(kp); + + return ret; +} + +static int +ipip6_tunnel_add_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a, int chg) +{ + struct ip_tunnel_prl_entry *p; + int err = 0; + + if (a->addr == htonl(INADDR_ANY)) + return -EINVAL; + + ASSERT_RTNL(); + + for (p = rtnl_dereference(t->prl); p; p = rtnl_dereference(p->next)) { + if (p->addr == a->addr) { + if (chg) { + p->flags = a->flags; + goto out; + } + err = -EEXIST; + goto out; + } + } + + if (chg) { + err = -ENXIO; + goto out; + } + + p = kzalloc(sizeof(struct ip_tunnel_prl_entry), GFP_KERNEL); + if (!p) { + err = -ENOBUFS; + goto out; + } + + p->next = t->prl; + p->addr = a->addr; + p->flags = a->flags; + t->prl_count++; + rcu_assign_pointer(t->prl, p); +out: + return err; +} + +static void prl_list_destroy_rcu(struct rcu_head *head) +{ + struct ip_tunnel_prl_entry *p, *n; + + p = container_of(head, struct ip_tunnel_prl_entry, rcu_head); + do { + n = rcu_dereference_protected(p->next, 1); + kfree(p); + p = n; + } while (p); +} + +static int +ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a) +{ + struct ip_tunnel_prl_entry *x; + struct ip_tunnel_prl_entry __rcu **p; + int err = 0; + + ASSERT_RTNL(); + + if (a && a->addr != htonl(INADDR_ANY)) { + for (p = &t->prl; + (x = rtnl_dereference(*p)) != NULL; + p = &x->next) { + if (x->addr == a->addr) { + *p = x->next; + kfree_rcu(x, rcu_head); + t->prl_count--; + goto out; + } + } + err = -ENXIO; + } else { + x = rtnl_dereference(t->prl); + if (x) { + t->prl_count = 0; + call_rcu(&x->rcu_head, prl_list_destroy_rcu); + t->prl = NULL; + } + } +out: + return err; +} + +static int +isatap_chksrc(struct sk_buff *skb, const struct iphdr *iph, struct ip_tunnel *t) +{ + struct ip_tunnel_prl_entry *p; + int ok = 1; + + rcu_read_lock(); + p = __ipip6_tunnel_locate_prl(t, iph->saddr); + if (p) { + if (p->flags & PRL_DEFAULT) + skb->ndisc_nodetype = NDISC_NODETYPE_DEFAULT; + else + skb->ndisc_nodetype = NDISC_NODETYPE_NODEFAULT; + } else { + const struct in6_addr *addr6 = &ipv6_hdr(skb)->saddr; + + if (ipv6_addr_is_isatap(addr6) && + (addr6->s6_addr32[3] == iph->saddr) && + ipv6_chk_prefix(addr6, t->dev)) + skb->ndisc_nodetype = NDISC_NODETYPE_HOST; + else + ok = 0; + } + rcu_read_unlock(); + return ok; +} + +static void ipip6_tunnel_uninit(struct net_device *dev) +{ + struct net *net = dev_net(dev); + struct sit_net *sitn = net_generic(net, sit_net_id); + + if (dev == sitn->fb_tunnel_dev) { + rcu_assign_pointer(sitn->tunnels_wc[0], NULL); + } else { + ipip6_tunnel_unlink(sitn, netdev_priv(dev)); + ipip6_tunnel_del_prl(netdev_priv(dev), NULL); + } + dev_put(dev); +} + + +static int ipip6_err(struct sk_buff *skb, u32 info) +{ + +/* All the routers (except for Linux) return only + 8 bytes of packet payload. It means, that precise relaying of + ICMP in the real Internet is absolutely infeasible. 
+ */ + const struct iphdr *iph = (const struct iphdr *)skb->data; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; + struct ip_tunnel *t; + int err; + + switch (type) { + default: + case ICMP_PARAMETERPROB: + return 0; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return 0; + case ICMP_FRAG_NEEDED: + /* Soft state for pmtu is maintained by IP core. */ + return 0; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe they are just ether pollution. --ANK + */ + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return 0; + break; + } + + err = -ENOENT; + + rcu_read_lock(); + t = ipip6_tunnel_lookup(dev_net(skb->dev), + skb->dev, + iph->daddr, + iph->saddr); + if (t == NULL || t->parms.iph.daddr == 0) + goto out; + + err = 0; + if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) + goto out; + + if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) + t->err_count++; + else + t->err_count = 1; + t->err_time = jiffies; +out: + rcu_read_unlock(); + return err; +} + +static inline void ipip6_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb) +{ + if (INET_ECN_is_ce(iph->tos)) + IP6_ECN_set_ce(ipv6_hdr(skb)); +} + +static int ipip6_rcv(struct sk_buff *skb) +{ + const struct iphdr *iph; + struct ip_tunnel *tunnel; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto out; + + iph = ip_hdr(skb); + + rcu_read_lock(); + tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, + iph->saddr, iph->daddr); + if (tunnel != NULL) { + struct pcpu_tstats *tstats; + + secpath_reset(skb); + skb->mac_header = skb->network_header; + skb_reset_network_header(skb); + IPCB(skb)->flags = 0; + skb->protocol = htons(ETH_P_IPV6); + skb->pkt_type = PACKET_HOST; + + if ((tunnel->dev->priv_flags & IFF_ISATAP) && + !isatap_chksrc(skb, iph, tunnel)) { + tunnel->dev->stats.rx_errors++; + rcu_read_unlock(); + kfree_skb(skb); + return 0; + } + + tstats = this_cpu_ptr(tunnel->dev->tstats); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + + __skb_tunnel_rx(skb, tunnel->dev); + + ipip6_ecn_decapsulate(iph, skb); + + netif_rx(skb); + + rcu_read_unlock(); + return 0; + } + + /* no tunnel matched, let upstream know, ipsec may handle it */ + rcu_read_unlock(); + return 1; +out: + kfree_skb(skb); + return 0; +} + +/* + * Returns the embedded IPv4 address if the IPv6 address + * comes from 6rd / 6to4 (RFC 3056) addr space. + */ +static inline +__be32 try_6rd(const struct in6_addr *v6dst, struct ip_tunnel *tunnel) +{ + __be32 dst = 0; + +#ifdef CONFIG_IPV6_SIT_6RD + if (ipv6_prefix_equal(v6dst, &tunnel->ip6rd.prefix, + tunnel->ip6rd.prefixlen)) { + unsigned int pbw0, pbi0; + int pbi1; + u32 d; + + pbw0 = tunnel->ip6rd.prefixlen >> 5; + pbi0 = tunnel->ip6rd.prefixlen & 0x1f; + + d = (ntohl(v6dst->s6_addr32[pbw0]) << pbi0) >> + tunnel->ip6rd.relay_prefixlen; + + pbi1 = pbi0 - tunnel->ip6rd.relay_prefixlen; + if (pbi1 > 0) + d |= ntohl(v6dst->s6_addr32[pbw0 + 1]) >> + (32 - pbi1); + + dst = tunnel->ip6rd.relay_prefix | htonl(d); + } +#else + if (v6dst->s6_addr16[0] == htons(0x2002)) { + /* 6to4 v6 addr has 16 bits prefix, 32 v4addr, 16 SLA, ... */ + memcpy(&dst, &v6dst->s6_addr16[1], 4); + } +#endif + return dst; +} + +/* + * This function assumes it is being called from dev_queue_xmit() + * and that skb is filled properly by that function. 
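+ *
+ * The outer IPv4 destination is picked as follows: an ISATAP interface
+ * always derives it from the IPv4 address embedded in the IPv6 nexthop;
+ * otherwise the tunnel's configured daddr is used, then a 6rd/6to4
+ * mapping of the inner destination via try_6rd(), and finally the low
+ * 32 bits of an IPv4-compatible nexthop.  As an illustrative example of
+ * the plain 6to4 case, an inner destination of 2002:c000:0204::1 maps to
+ * the outer IPv4 destination 192.0.2.4 (the CONFIG_IPV6_SIT_6RD build
+ * uses the configured ip6rd prefix instead).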
+ */ + +static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct pcpu_tstats *tstats; + const struct iphdr *tiph = &tunnel->parms.iph; + const struct ipv6hdr *iph6 = ipv6_hdr(skb); + u8 tos = tunnel->parms.iph.tos; + __be16 df = tiph->frag_off; + struct rtable *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + struct iphdr *iph; /* Our new IP header */ + unsigned int max_headroom; /* The extra header space needed */ + __be32 dst = tiph->daddr; + struct flowi4 fl4; + int mtu; + const struct in6_addr *addr6; + int addr_type; + + if (skb->protocol != htons(ETH_P_IPV6)) + goto tx_error; + + /* ISATAP (RFC4214) - must come before 6to4 */ + if (dev->priv_flags & IFF_ISATAP) { + struct neighbour *neigh = NULL; + + if (skb_dst(skb)) + neigh = dst_get_neighbour(skb_dst(skb)); + + if (neigh == NULL) { + if (net_ratelimit()) + printk(KERN_DEBUG "sit: nexthop == NULL\n"); + goto tx_error; + } + + addr6 = (const struct in6_addr*)&neigh->primary_key; + addr_type = ipv6_addr_type(addr6); + + if ((addr_type & IPV6_ADDR_UNICAST) && + ipv6_addr_is_isatap(addr6)) + dst = addr6->s6_addr32[3]; + else + goto tx_error; + } + + if (!dst) + dst = try_6rd(&iph6->daddr, tunnel); + + if (!dst) { + struct neighbour *neigh = NULL; + + if (skb_dst(skb)) + neigh = dst_get_neighbour(skb_dst(skb)); + + if (neigh == NULL) { + if (net_ratelimit()) + printk(KERN_DEBUG "sit: nexthop == NULL\n"); + goto tx_error; + } + + addr6 = (const struct in6_addr*)&neigh->primary_key; + addr_type = ipv6_addr_type(addr6); + + if (addr_type == IPV6_ADDR_ANY) { + addr6 = &ipv6_hdr(skb)->daddr; + addr_type = ipv6_addr_type(addr6); + } + + if ((addr_type & IPV6_ADDR_COMPATv4) == 0) + goto tx_error_icmp; + + dst = addr6->s6_addr32[3]; + } + + rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, + dst, tiph->saddr, + 0, 0, + IPPROTO_IPV6, RT_TOS(tos), + tunnel->parms.link); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; + } + if (rt->rt_type != RTN_UNICAST) { + ip_rt_put(rt); + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; + } + tdev = rt->dst.dev; + + if (tdev == dev) { + ip_rt_put(rt); + dev->stats.collisions++; + goto tx_error; + } + + if (df) { + mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); + + if (mtu < 68) { + dev->stats.collisions++; + ip_rt_put(rt); + goto tx_error; + } + + if (mtu < IPV6_MIN_MTU) { + mtu = IPV6_MIN_MTU; + df = 0; + } + + if (tunnel->parms.iph.daddr && skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + + if (skb->len > mtu) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + ip_rt_put(rt); + goto tx_error; + } + } + + if (tunnel->err_count > 0) { + if (time_before(jiffies, + tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { + tunnel->err_count--; + dst_link_failure(skb); + } else + tunnel->err_count = 0; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. 
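+	 * We need room for the outgoing device's link-layer header plus the
+	 * 20 byte outer IPv4 header; if the skb lacks that headroom, is
+	 * shared, or is a clone we may not write to, a copy with enough
+	 * headroom is made first and charged to the original socket when
+	 * there is one.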
+ */ + max_headroom = LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr); + + if (skb_headroom(skb) < max_headroom || skb_shared(skb) || + (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { + struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + ip_rt_put(rt); + dev->stats.tx_dropped++; + dev_kfree_skb(skb); + return NETDEV_TX_OK; + } + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); + dev_kfree_skb(skb); + skb = new_skb; + iph6 = ipv6_hdr(skb); + } + + skb->transport_header = skb->network_header; + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags = 0; + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + + /* + * Push down and install the IPIP header. + */ + + iph = ip_hdr(skb); + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; + iph->frag_off = df; + iph->protocol = IPPROTO_IPV6; + iph->tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); + iph->daddr = fl4.daddr; + iph->saddr = fl4.saddr; + + if ((iph->ttl = tiph->ttl) == 0) + iph->ttl = iph6->hop_limit; + + nf_reset(skb); + tstats = this_cpu_ptr(dev->tstats); + __IPTUNNEL_XMIT(tstats, &dev->stats); + return NETDEV_TX_OK; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + dev->stats.tx_errors++; + dev_kfree_skb(skb); + return NETDEV_TX_OK; +} + +static void ipip6_tunnel_bind_dev(struct net_device *dev) +{ + struct net_device *tdev = NULL; + struct ip_tunnel *tunnel; + const struct iphdr *iph; + struct flowi4 fl4; + + tunnel = netdev_priv(dev); + iph = &tunnel->parms.iph; + + if (iph->daddr) { + struct rtable *rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, + iph->daddr, iph->saddr, + 0, 0, + IPPROTO_IPV6, + RT_TOS(iph->tos), + tunnel->parms.link); + + if (!IS_ERR(rt)) { + tdev = rt->dst.dev; + ip_rt_put(rt); + } + dev->flags |= IFF_POINTOPOINT; + } + + if (!tdev && tunnel->parms.link) + tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); + + if (tdev) { + dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); + dev->mtu = tdev->mtu - sizeof(struct iphdr); + if (dev->mtu < IPV6_MIN_MTU) + dev->mtu = IPV6_MIN_MTU; + } + dev->iflink = tunnel->parms.link; +} + +static int +ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + struct ip_tunnel_parm p; + struct ip_tunnel_prl prl; + struct ip_tunnel *t; + struct net *net = dev_net(dev); + struct sit_net *sitn = net_generic(net, sit_net_id); +#ifdef CONFIG_IPV6_SIT_6RD + struct ip_tunnel_6rd ip6rd; +#endif + + switch (cmd) { + case SIOCGETTUNNEL: +#ifdef CONFIG_IPV6_SIT_6RD + case SIOCGET6RD: +#endif + t = NULL; + if (dev == sitn->fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + err = -EFAULT; + break; + } + t = ipip6_tunnel_locate(net, &p, 0); + } + if (t == NULL) + t = netdev_priv(dev); + + err = -EFAULT; + if (cmd == SIOCGETTUNNEL) { + memcpy(&p, &t->parms, sizeof(p)); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, + sizeof(p))) + goto done; +#ifdef CONFIG_IPV6_SIT_6RD + } else { + ipv6_addr_copy(&ip6rd.prefix, &t->ip6rd.prefix); + ip6rd.relay_prefix = t->ip6rd.relay_prefix; + ip6rd.prefixlen = t->ip6rd.prefixlen; + ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen; + if (copy_to_user(ifr->ifr_ifru.ifru_data, &ip6rd, + sizeof(ip6rd))) + goto done; +#endif + } + err = 0; + break; + + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, 
sizeof(p))) + goto done; + + err = -EINVAL; + if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPV6 || + p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) + goto done; + if (p.iph.ttl) + p.iph.frag_off |= htons(IP_DF); + + t = ipip6_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); + + if (dev != sitn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { + if (t != NULL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + } else { + if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) || + (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) { + err = -EINVAL; + break; + } + t = netdev_priv(dev); + ipip6_tunnel_unlink(sitn, t); + synchronize_net(); + t->parms.iph.saddr = p.iph.saddr; + t->parms.iph.daddr = p.iph.daddr; + memcpy(dev->dev_addr, &p.iph.saddr, 4); + memcpy(dev->broadcast, &p.iph.daddr, 4); + ipip6_tunnel_link(sitn, t); + netdev_state_change(dev); + } + } + + if (t) { + err = 0; + if (cmd == SIOCCHGTUNNEL) { + t->parms.iph.ttl = p.iph.ttl; + t->parms.iph.tos = p.iph.tos; + if (t->parms.link != p.link) { + t->parms.link = p.link; + ipip6_tunnel_bind_dev(dev); + netdev_state_change(dev); + } + } + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) + err = -EFAULT; + } else + err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); + break; + + case SIOCDELTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + if (dev == sitn->fb_tunnel_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + err = -ENOENT; + if ((t = ipip6_tunnel_locate(net, &p, 0)) == NULL) + goto done; + err = -EPERM; + if (t == netdev_priv(sitn->fb_tunnel_dev)) + goto done; + dev = t->dev; + } + unregister_netdevice(dev); + err = 0; + break; + + case SIOCGETPRL: + err = -EINVAL; + if (dev == sitn->fb_tunnel_dev) + goto done; + err = -ENOENT; + if (!(t = netdev_priv(dev))) + goto done; + err = ipip6_tunnel_get_prl(t, ifr->ifr_ifru.ifru_data); + break; + + case SIOCADDPRL: + case SIOCDELPRL: + case SIOCCHGPRL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + err = -EINVAL; + if (dev == sitn->fb_tunnel_dev) + goto done; + err = -EFAULT; + if (copy_from_user(&prl, ifr->ifr_ifru.ifru_data, sizeof(prl))) + goto done; + err = -ENOENT; + if (!(t = netdev_priv(dev))) + goto done; + + switch (cmd) { + case SIOCDELPRL: + err = ipip6_tunnel_del_prl(t, &prl); + break; + case SIOCADDPRL: + case SIOCCHGPRL: + err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL); + break; + } + netdev_state_change(dev); + break; + +#ifdef CONFIG_IPV6_SIT_6RD + case SIOCADD6RD: + case SIOCCHG6RD: + case SIOCDEL6RD: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + err = -EFAULT; + if (copy_from_user(&ip6rd, ifr->ifr_ifru.ifru_data, + sizeof(ip6rd))) + goto done; + + t = netdev_priv(dev); + + if (cmd != SIOCDEL6RD) { + struct in6_addr prefix; + __be32 relay_prefix; + + err = -EINVAL; + if (ip6rd.relay_prefixlen > 32 || + ip6rd.prefixlen + (32 - ip6rd.relay_prefixlen) > 64) + goto done; + + ipv6_addr_prefix(&prefix, &ip6rd.prefix, + ip6rd.prefixlen); + if (!ipv6_addr_equal(&prefix, &ip6rd.prefix)) + goto done; + if (ip6rd.relay_prefixlen) + relay_prefix = ip6rd.relay_prefix & + htonl(0xffffffffUL << + (32 - ip6rd.relay_prefixlen)); + else + relay_prefix = 0; + if (relay_prefix != ip6rd.relay_prefix) + goto done; + + ipv6_addr_copy(&t->ip6rd.prefix, &prefix); + t->ip6rd.relay_prefix = relay_prefix; + t->ip6rd.prefixlen = ip6rd.prefixlen; + t->ip6rd.relay_prefixlen = ip6rd.relay_prefixlen; + } else + ipip6_tunnel_clone_6rd(dev, sitn); + + err = 0; + break; +#endif + 
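+	/* Everything else is rejected below with -EINVAL.  The cases above
+	 * cover the tunnel itself (SIOC{GET,ADD,CHG,DEL}TUNNEL), the ISATAP
+	 * potential router list (SIOC{GET,ADD,CHG,DEL}PRL) and, when
+	 * CONFIG_IPV6_SIT_6RD is enabled, the 6rd mapping
+	 * (SIOC{GET,ADD,CHG,DEL}6RD).
+	 */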
+ default: + err = -EINVAL; + } + +done: + return err; +} + +static int ipip6_tunnel_change_mtu(struct net_device *dev, int new_mtu) +{ + if (new_mtu < IPV6_MIN_MTU || new_mtu > 0xFFF8 - sizeof(struct iphdr)) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +static const struct net_device_ops ipip6_netdev_ops = { + .ndo_uninit = ipip6_tunnel_uninit, + .ndo_start_xmit = ipip6_tunnel_xmit, + .ndo_do_ioctl = ipip6_tunnel_ioctl, + .ndo_change_mtu = ipip6_tunnel_change_mtu, + .ndo_get_stats = ipip6_get_stats, +}; + +static void ipip6_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + +static void ipip6_tunnel_setup(struct net_device *dev) +{ + dev->netdev_ops = &ipip6_netdev_ops; + dev->destructor = ipip6_dev_free; + + dev->type = ARPHRD_SIT; + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); + dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr); + dev->flags = IFF_NOARP; + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + dev->iflink = 0; + dev->addr_len = 4; + dev->features |= NETIF_F_NETNS_LOCAL; + dev->features |= NETIF_F_LLTX; +} + +static int ipip6_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + + tunnel->dev = dev; + + memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); + memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); + + ipip6_tunnel_bind_dev(dev); + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + + return 0; +} + +static int __net_init ipip6_fb_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct iphdr *iph = &tunnel->parms.iph; + struct net *net = dev_net(dev); + struct sit_net *sitn = net_generic(net, sit_net_id); + + tunnel->dev = dev; + strcpy(tunnel->parms.name, dev->name); + + iph->version = 4; + iph->protocol = IPPROTO_IPV6; + iph->ihl = 5; + iph->ttl = 64; + + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + dev_hold(dev); + rcu_assign_pointer(sitn->tunnels_wc[0], tunnel); + return 0; +} + +static struct xfrm_tunnel sit_handler __read_mostly = { + .handler = ipip6_rcv, + .err_handler = ipip6_err, + .priority = 1, +}; + +static void __net_exit sit_destroy_tunnels(struct sit_net *sitn, struct list_head *head) +{ + int prio; + + for (prio = 1; prio < 4; prio++) { + int h; + for (h = 0; h < HASH_SIZE; h++) { + struct ip_tunnel *t; + + t = rtnl_dereference(sitn->tunnels[prio][h]); + while (t != NULL) { + unregister_netdevice_queue(t->dev, head); + t = rtnl_dereference(t->next); + } + } + } +} + +static int __net_init sit_init_net(struct net *net) +{ + struct sit_net *sitn = net_generic(net, sit_net_id); + struct ip_tunnel *t; + int err; + + sitn->tunnels[0] = sitn->tunnels_wc; + sitn->tunnels[1] = sitn->tunnels_l; + sitn->tunnels[2] = sitn->tunnels_r; + sitn->tunnels[3] = sitn->tunnels_r_l; + + sitn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0", + ipip6_tunnel_setup); + if (!sitn->fb_tunnel_dev) { + err = -ENOMEM; + goto err_alloc_dev; + } + dev_net_set(sitn->fb_tunnel_dev, net); + + err = ipip6_fb_tunnel_init(sitn->fb_tunnel_dev); + if (err) + goto err_dev_free; + + ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn); + + if ((err = register_netdev(sitn->fb_tunnel_dev))) + goto err_reg_dev; + + t = netdev_priv(sitn->fb_tunnel_dev); + + strcpy(t->parms.name, sitn->fb_tunnel_dev->name); + return 0; + +err_reg_dev: + dev_put(sitn->fb_tunnel_dev); +err_dev_free: + ipip6_dev_free(sitn->fb_tunnel_dev); +err_alloc_dev: + return err; +} + +static void __net_exit 
sit_exit_net(struct net *net) +{ + struct sit_net *sitn = net_generic(net, sit_net_id); + LIST_HEAD(list); + + rtnl_lock(); + sit_destroy_tunnels(sitn, &list); + unregister_netdevice_queue(sitn->fb_tunnel_dev, &list); + unregister_netdevice_many(&list); + rtnl_unlock(); +} + +static struct pernet_operations sit_net_ops = { + .init = sit_init_net, + .exit = sit_exit_net, + .id = &sit_net_id, + .size = sizeof(struct sit_net), +}; + +static void __exit sit_cleanup(void) +{ + xfrm4_tunnel_deregister(&sit_handler, AF_INET6); + + unregister_pernet_device(&sit_net_ops); + rcu_barrier(); /* Wait for completion of call_rcu()'s */ +} + +static int __init sit_init(void) +{ + int err; + + printk(KERN_INFO "IPv6 over IPv4 tunneling driver\n"); + + err = register_pernet_device(&sit_net_ops); + if (err < 0) + return err; + err = xfrm4_tunnel_register(&sit_handler, AF_INET6); + if (err < 0) { + unregister_pernet_device(&sit_net_ops); + printk(KERN_INFO "sit init: Can't add protocol\n"); + } + return err; +} + +module_init(sit_init); +module_exit(sit_cleanup); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETDEV("sit0"); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c new file mode 100644 index 00000000..14b83395 --- /dev/null +++ b/net/ipv6/syncookies.c @@ -0,0 +1,268 @@ +/* + * IPv6 Syncookies implementation for the Linux kernel + * + * Authors: + * Glenn Griffin + * + * Based on IPv4 implementation by Andi Kleen + * linux/net/ipv4/syncookies.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include + +extern int sysctl_tcp_syncookies; +extern __u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS]; + +#define COOKIEBITS 24 /* Upper bits store count */ +#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) + +/* Table must be sorted. */ +static __u16 const msstab[] = { + 64, + 512, + 536, + 1280 - 60, + 1480 - 60, + 1500 - 60, + 4460 - 60, + 9000 - 60, +}; + +/* + * This (misnamed) value is the age of syncookie which is permitted. + * Its ideal value should be dependent on TCP_TIMEOUT_INIT and + * sysctl_tcp_retries1. It's a rather complicated formula (exponential + * backoff) to compute at runtime so it's currently hardcoded here. 
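+ *
+ * The counter fed into the cookie is jiffies / (HZ * 60), i.e. it advances
+ * once per minute (see cookie_v6_init_sequence() below).  The top
+ * 32 - COOKIEBITS bits of the cookie carry that counter, while the low
+ * COOKIEBITS carry the mss index blinded by a second hash.  With
+ * COUNTER_TRIES == 4 a cookie is therefore honoured while it is less than
+ * about four minutes old.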
+ */ +#define COUNTER_TRIES 4 + +static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct sock *child; + + child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); + if (child) + inet_csk_reqsk_queue_add(sk, req, child); + else + reqsk_free(req); + + return child; +} + +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], + ipv6_cookie_scratch); + +static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr, + __be16 sport, __be16 dport, u32 count, int c) +{ + __u32 *tmp = __get_cpu_var(ipv6_cookie_scratch); + + /* + * we have 320 bits of information to hash, copy in the remaining + * 192 bits required for sha_transform, from the syncookie_secret + * and overwrite the digest with the secret + */ + memcpy(tmp + 10, syncookie_secret[c], 44); + memcpy(tmp, saddr, 16); + memcpy(tmp + 4, daddr, 16); + tmp[8] = ((__force u32)sport << 16) + (__force u32)dport; + tmp[9] = count; + sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5); + + return tmp[17]; +} + +static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __be16 sport, __be16 dport, __u32 sseq, + __u32 count, __u32 data) +{ + return (cookie_hash(saddr, daddr, sport, dport, 0, 0) + + sseq + (count << COOKIEBITS) + + ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) + & COOKIEMASK)); +} + +static __u32 check_tcp_syn_cookie(__u32 cookie, const struct in6_addr *saddr, + const struct in6_addr *daddr, __be16 sport, + __be16 dport, __u32 sseq, __u32 count, + __u32 maxdiff) +{ + __u32 diff; + + cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; + + diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS); + if (diff >= maxdiff) + return (__u32)-1; + + return (cookie - + cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) + & COOKIEMASK; +} + +__u32 cookie_v6_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); + int mssind; + const __u16 mss = *mssp; + + tcp_synq_overflow(sk); + + for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) + if (mss >= msstab[mssind]) + break; + + *mssp = msstab[mssind]; + + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); + + return secure_tcp_syn_cookie(&iph->saddr, &iph->daddr, th->source, + th->dest, ntohl(th->seq), + jiffies / (HZ * 60), mssind); +} + +static inline int cookie_check(struct sk_buff *skb, __u32 cookie) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); + __u32 seq = ntohl(th->seq) - 1; + __u32 mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr, + th->source, th->dest, seq, + jiffies / (HZ * 60), COUNTER_TRIES); + + return mssind < ARRAY_SIZE(msstab) ? 
msstab[mssind] : 0; +} + +struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_options_received tcp_opt; + u8 *hash_location; + struct inet_request_sock *ireq; + struct inet6_request_sock *ireq6; + struct tcp_request_sock *treq; + struct ipv6_pinfo *np = inet6_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + const struct tcphdr *th = tcp_hdr(skb); + __u32 cookie = ntohl(th->ack_seq) - 1; + struct sock *ret = sk; + struct request_sock *req; + int mss; + struct dst_entry *dst; + __u8 rcv_wscale; + bool ecn_ok = false; + + if (!sysctl_tcp_syncookies || !th->ack || th->rst) + goto out; + + if (tcp_synq_no_recent_overflow(sk) || + (mss = cookie_check(skb, cookie)) == 0) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); + goto out; + } + + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV); + + /* check for timestamp cookie support */ + memset(&tcp_opt, 0, sizeof(tcp_opt)); + tcp_parse_options(skb, &tcp_opt, &hash_location, 0); + + if (!cookie_check_timestamp(&tcp_opt, &ecn_ok)) + goto out; + + ret = NULL; + req = inet6_reqsk_alloc(&tcp6_request_sock_ops); + if (!req) + goto out; + + ireq = inet_rsk(req); + ireq6 = inet6_rsk(req); + treq = tcp_rsk(req); + + if (security_inet_conn_request(sk, skb, req)) + goto out_free; + + req->mss = mss; + ireq->rmt_port = th->source; + ireq->loc_port = th->dest; + ipv6_addr_copy(&ireq6->rmt_addr, &ipv6_hdr(skb)->saddr); + ipv6_addr_copy(&ireq6->loc_addr, &ipv6_hdr(skb)->daddr); + if (ipv6_opt_accepted(sk, skb) || + np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || + np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { + atomic_inc(&skb->users); + ireq6->pktopts = skb; + } + + ireq6->iif = sk->sk_bound_dev_if; + /* So that link locals have meaning */ + if (!sk->sk_bound_dev_if && + ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL) + ireq6->iif = inet6_iif(skb); + + req->expires = 0UL; + req->retrans = 0; + ireq->ecn_ok = ecn_ok; + ireq->snd_wscale = tcp_opt.snd_wscale; + ireq->sack_ok = tcp_opt.sack_ok; + ireq->wscale_ok = tcp_opt.wscale_ok; + ireq->tstamp_ok = tcp_opt.saw_tstamp; + req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; + treq->rcv_isn = ntohl(th->seq) - 1; + treq->snt_isn = cookie; + + /* + * We need to lookup the dst_entry to get the correct window size. + * This is taken from tcp_v6_syn_recv_sock. Somebody please enlighten + * me if there is a preferred way. + */ + { + struct in6_addr *final_p, final; + struct flowi6 fl6; + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + ipv6_addr_copy(&fl6.daddr, &ireq6->rmt_addr); + final_p = fl6_update_dst(&fl6, np->opt, &final); + ipv6_addr_copy(&fl6.saddr, &ireq6->loc_addr); + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_dport = inet_rsk(req)->rmt_port; + fl6.fl6_sport = inet_sk(sk)->inet_sport; + security_req_classify_flow(req, flowi6_to_flowi(&fl6)); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false); + if (IS_ERR(dst)) + goto out_free; + } + + req->window_clamp = tp->window_clamp ? 
:dst_metric(dst, RTAX_WINDOW); + tcp_select_initial_window(tcp_full_space(sk), req->mss, + &req->rcv_wnd, &req->window_clamp, + ireq->wscale_ok, &rcv_wscale, + dst_metric(dst, RTAX_INITRWND)); + + ireq->rcv_wscale = rcv_wscale; + + ret = get_cookie_sock(sk, skb, req, dst); +out: + return ret; +out_free: + reqsk_free(req); + return NULL; +} + diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c new file mode 100644 index 00000000..6dcf5e7d --- /dev/null +++ b/net/ipv6/sysctl_net_ipv6.c @@ -0,0 +1,176 @@ +/* + * sysctl_net_ipv6.c: sysctl interface to net IPV6 subsystem. + * + * Changes: + * YOSHIFUJI Hideaki @USAGI: added icmp sysctl table. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct ctl_table empty[1]; + +static ctl_table ipv6_static_skeleton[] = { + { + .procname = "neigh", + .maxlen = 0, + .mode = 0555, + .child = empty, + }, + { } +}; + +static ctl_table ipv6_table_template[] = { + { + .procname = "route", + .maxlen = 0, + .mode = 0555, + .child = ipv6_route_table_template + }, + { + .procname = "icmp", + .maxlen = 0, + .mode = 0555, + .child = ipv6_icmp_table_template + }, + { + .procname = "bindv6only", + .data = &init_net.ipv6.sysctl.bindv6only, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { } +}; + +static ctl_table ipv6_rotable[] = { + { + .procname = "mld_max_msf", + .data = &sysctl_mld_max_msf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { } +}; + +struct ctl_path net_ipv6_ctl_path[] = { + { .procname = "net", }, + { .procname = "ipv6", }, + { }, +}; +EXPORT_SYMBOL_GPL(net_ipv6_ctl_path); + +static int __net_init ipv6_sysctl_net_init(struct net *net) +{ + struct ctl_table *ipv6_table; + struct ctl_table *ipv6_route_table; + struct ctl_table *ipv6_icmp_table; + int err; + + err = -ENOMEM; + ipv6_table = kmemdup(ipv6_table_template, sizeof(ipv6_table_template), + GFP_KERNEL); + if (!ipv6_table) + goto out; + + ipv6_route_table = ipv6_route_sysctl_init(net); + if (!ipv6_route_table) + goto out_ipv6_table; + ipv6_table[0].child = ipv6_route_table; + + ipv6_icmp_table = ipv6_icmp_sysctl_init(net); + if (!ipv6_icmp_table) + goto out_ipv6_route_table; + ipv6_table[1].child = ipv6_icmp_table; + + ipv6_table[2].data = &net->ipv6.sysctl.bindv6only; + + net->ipv6.sysctl.table = register_net_sysctl_table(net, net_ipv6_ctl_path, + ipv6_table); + if (!net->ipv6.sysctl.table) + goto out_ipv6_icmp_table; + + err = 0; +out: + return err; + +out_ipv6_icmp_table: + kfree(ipv6_icmp_table); +out_ipv6_route_table: + kfree(ipv6_route_table); +out_ipv6_table: + kfree(ipv6_table); + goto out; +} + +static void __net_exit ipv6_sysctl_net_exit(struct net *net) +{ + struct ctl_table *ipv6_table; + struct ctl_table *ipv6_route_table; + struct ctl_table *ipv6_icmp_table; + + ipv6_table = net->ipv6.sysctl.table->ctl_table_arg; + ipv6_route_table = ipv6_table[0].child; + ipv6_icmp_table = ipv6_table[1].child; + + unregister_net_sysctl_table(net->ipv6.sysctl.table); + + kfree(ipv6_table); + kfree(ipv6_route_table); + kfree(ipv6_icmp_table); +} + +static struct pernet_operations ipv6_sysctl_net_ops = { + .init = ipv6_sysctl_net_init, + .exit = ipv6_sysctl_net_exit, +}; + +static struct ctl_table_header *ip6_header; + +int ipv6_sysctl_register(void) +{ + int err = -ENOMEM; + + ip6_header = register_net_sysctl_rotable(net_ipv6_ctl_path, ipv6_rotable); + if (ip6_header == NULL) + goto out; + + err = register_pernet_subsys(&ipv6_sysctl_net_ops); + if (err) + goto 
err_pernet; +out: + return err; + +err_pernet: + unregister_net_sysctl_table(ip6_header); + goto out; +} + +void ipv6_sysctl_unregister(void) +{ + unregister_net_sysctl_table(ip6_header); + unregister_pernet_subsys(&ipv6_sysctl_net_ops); +} + +static struct ctl_table_header *ip6_base; + +int ipv6_static_sysctl_register(void) +{ + ip6_base = register_sysctl_paths(net_ipv6_ctl_path, ipv6_static_skeleton); + if (ip6_base == NULL) + return -ENOMEM; + return 0; +} + +void ipv6_static_sysctl_unregister(void) +{ + unregister_net_sysctl_table(ip6_base); +} diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c new file mode 100644 index 00000000..848f9634 --- /dev/null +++ b/net/ipv6/tcp_ipv6.c @@ -0,0 +1,2321 @@ +/* + * TCP over IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * + * Based on: + * linux/net/ipv4/tcp.c + * linux/net/ipv4/tcp_input.c + * linux/net/ipv4/tcp_output.c + * + * Fixes: + * Hideaki YOSHIFUJI : sin6_scope_id support + * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which + * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind + * a single port at the same time. + * YOSHIFUJI Hideaki @USAGI: convert /proc/net/tcp6 to seq_file. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include + +static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb); +static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, + struct request_sock *req); + +static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); +static void __tcp_v6_send_check(struct sk_buff *skb, + const struct in6_addr *saddr, + const struct in6_addr *daddr); + +static const struct inet_connection_sock_af_ops ipv6_mapped; +static const struct inet_connection_sock_af_ops ipv6_specific; +#ifdef CONFIG_TCP_MD5SIG +static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; +static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; +#else +static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk, + const struct in6_addr *addr) +{ + return NULL; +} +#endif + +static void tcp_v6_hash(struct sock *sk) +{ + if (sk->sk_state != TCP_CLOSE) { + if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) { + tcp_prot.hash(sk); + return; + } + local_bh_disable(); + __inet6_hash(sk, NULL); + local_bh_enable(); + } +} + +static __inline__ __sum16 tcp_v6_check(int len, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + __wsum base) +{ + return csum_ipv6_magic(saddr, daddr, len, IPPROTO_TCP, base); +} + +static __u32 tcp_v6_init_sequence(struct sk_buff *skb) +{ + return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32, + ipv6_hdr(skb)->saddr.s6_addr32, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source); +} + +static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; + struct inet_sock *inet = inet_sk(sk); + struct 
inet_connection_sock *icsk = inet_csk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct in6_addr *saddr = NULL, *final_p, final; + struct rt6_info *rt; + struct flowi6 fl6; + struct dst_entry *dst; + int addr_type; + int err; + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + if (usin->sin6_family != AF_INET6) + return -EAFNOSUPPORT; + + memset(&fl6, 0, sizeof(fl6)); + + if (np->sndflow) { + fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; + IP6_ECN_flow_init(fl6.flowlabel); + if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { + struct ip6_flowlabel *flowlabel; + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); + if (flowlabel == NULL) + return -EINVAL; + ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst); + fl6_sock_release(flowlabel); + } + } + + /* + * connect() to INADDR_ANY means loopback (BSD'ism). + */ + + if(ipv6_addr_any(&usin->sin6_addr)) + usin->sin6_addr.s6_addr[15] = 0x1; + + addr_type = ipv6_addr_type(&usin->sin6_addr); + + if(addr_type & IPV6_ADDR_MULTICAST) + return -ENETUNREACH; + + if (addr_type&IPV6_ADDR_LINKLOCAL) { + if (addr_len >= sizeof(struct sockaddr_in6) && + usin->sin6_scope_id) { + /* If interface is set while binding, indices + * must coincide. + */ + if (sk->sk_bound_dev_if && + sk->sk_bound_dev_if != usin->sin6_scope_id) + return -EINVAL; + + sk->sk_bound_dev_if = usin->sin6_scope_id; + } + + /* Connect to link-local address requires an interface */ + if (!sk->sk_bound_dev_if) + return -EINVAL; + } + + if (tp->rx_opt.ts_recent_stamp && + !ipv6_addr_equal(&np->daddr, &usin->sin6_addr)) { + tp->rx_opt.ts_recent = 0; + tp->rx_opt.ts_recent_stamp = 0; + tp->write_seq = 0; + } + + ipv6_addr_copy(&np->daddr, &usin->sin6_addr); + np->flow_label = fl6.flowlabel; + + /* + * TCP over IPv4 + */ + + if (addr_type == IPV6_ADDR_MAPPED) { + u32 exthdrlen = icsk->icsk_ext_hdr_len; + struct sockaddr_in sin; + + SOCK_DEBUG(sk, "connect: ipv4 mapped\n"); + + if (__ipv6_only_sock(sk)) + return -ENETUNREACH; + + sin.sin_family = AF_INET; + sin.sin_port = usin->sin6_port; + sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; + + icsk->icsk_af_ops = &ipv6_mapped; + sk->sk_backlog_rcv = tcp_v4_do_rcv; +#ifdef CONFIG_TCP_MD5SIG + tp->af_specific = &tcp_sock_ipv6_mapped_specific; +#endif + + err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); + + if (err) { + icsk->icsk_ext_hdr_len = exthdrlen; + icsk->icsk_af_ops = &ipv6_specific; + sk->sk_backlog_rcv = tcp_v6_do_rcv; +#ifdef CONFIG_TCP_MD5SIG + tp->af_specific = &tcp_sock_ipv6_specific; +#endif + goto failure; + } else { + ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr); + ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, + &np->rcv_saddr); + } + + return err; + } + + if (!ipv6_addr_any(&np->rcv_saddr)) + saddr = &np->rcv_saddr; + + fl6.flowi6_proto = IPPROTO_TCP; + ipv6_addr_copy(&fl6.daddr, &np->daddr); + ipv6_addr_copy(&fl6.saddr, + (saddr ? 
saddr : &np->saddr)); + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_dport = usin->sin6_port; + fl6.fl6_sport = inet->inet_sport; + + final_p = fl6_update_dst(&fl6, np->opt, &final); + + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto failure; + } + + if (saddr == NULL) { + saddr = &fl6.saddr; + ipv6_addr_copy(&np->rcv_saddr, saddr); + } + + /* set the source address */ + ipv6_addr_copy(&np->saddr, saddr); + inet->inet_rcv_saddr = LOOPBACK4_IPV6; + + sk->sk_gso_type = SKB_GSO_TCPV6; + __ip6_dst_store(sk, dst, NULL, NULL); + + rt = (struct rt6_info *) dst; + if (tcp_death_row.sysctl_tw_recycle && + !tp->rx_opt.ts_recent_stamp && + ipv6_addr_equal(&rt->rt6i_dst.addr, &np->daddr)) { + struct inet_peer *peer = rt6_get_peer(rt); + /* + * VJ's idea. We save last timestamp seen from + * the destination in peer table, when entering state + * TIME-WAIT * and initialize rx_opt.ts_recent from it, + * when trying new connection. + */ + if (peer) { + inet_peer_refcheck(peer); + if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { + tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; + tp->rx_opt.ts_recent = peer->tcp_ts; + } + } + } + + icsk->icsk_ext_hdr_len = 0; + if (np->opt) + icsk->icsk_ext_hdr_len = (np->opt->opt_flen + + np->opt->opt_nflen); + + tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); + + inet->inet_dport = usin->sin6_port; + + tcp_set_state(sk, TCP_SYN_SENT); + err = inet6_hash_connect(&tcp_death_row, sk); + if (err) + goto late_failure; + + if (!tp->write_seq) + tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32, + np->daddr.s6_addr32, + inet->inet_sport, + inet->inet_dport); + + err = tcp_connect(sk); + if (err) + goto late_failure; + + return 0; + +late_failure: + tcp_set_state(sk, TCP_CLOSE); + __sk_dst_reset(sk); +failure: + inet->inet_dport = 0; + sk->sk_route_caps = 0; + return err; +} + +static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + const struct ipv6hdr *hdr = (const struct ipv6hdr*)skb->data; + const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); + struct ipv6_pinfo *np; + struct sock *sk; + int err; + struct tcp_sock *tp; + __u32 seq; + struct net *net = dev_net(skb->dev); + + sk = inet6_lookup(net, &tcp_hashinfo, &hdr->daddr, + th->dest, &hdr->saddr, th->source, skb->dev->ifindex); + + if (sk == NULL) { + ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), + ICMP6_MIB_INERRORS); + return; + } + + if (sk->sk_state == TCP_TIME_WAIT) { + inet_twsk_put(inet_twsk(sk)); + return; + } + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); + + if (sk->sk_state == TCP_CLOSE) + goto out; + + if (ipv6_hdr(skb)->hop_limit < inet6_sk(sk)->min_hopcount) { + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + goto out; + } + + tp = tcp_sk(sk); + seq = ntohl(th->seq); + if (sk->sk_state != TCP_LISTEN && + !between(seq, tp->snd_una, tp->snd_nxt)) { + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + np = inet6_sk(sk); + + if (type == ICMPV6_PKT_TOOBIG) { + struct dst_entry *dst; + + if (sock_owned_by_user(sk)) + goto out; + if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) + goto out; + + /* icmp should have updated the destination cache entry */ + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + struct inet_sock *inet 
= inet_sk(sk); + struct flowi6 fl6; + + /* BUGGG_FUTURE: Again, it is not clear how + to handle rthdr case. Ignore this complexity + for now. + */ + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + ipv6_addr_copy(&fl6.daddr, &np->daddr); + ipv6_addr_copy(&fl6.saddr, &np->saddr); + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_dport = inet->inet_dport; + fl6.fl6_sport = inet->inet_sport; + security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + + dst = ip6_dst_lookup_flow(sk, &fl6, NULL, false); + if (IS_ERR(dst)) { + sk->sk_err_soft = -PTR_ERR(dst); + goto out; + } + + } else + dst_hold(dst); + + if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { + tcp_sync_mss(sk, dst_mtu(dst)); + tcp_simple_retransmit(sk); + } /* else let the usual retransmit timer handle it */ + dst_release(dst); + goto out; + } + + icmpv6_err_convert(type, code, &err); + + /* Might be for an request_sock */ + switch (sk->sk_state) { + struct request_sock *req, **prev; + case TCP_LISTEN: + if (sock_owned_by_user(sk)) + goto out; + + req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr, + &hdr->saddr, inet6_iif(skb)); + if (!req) + goto out; + + /* ICMPs are not backlogged, hence we cannot get + * an established socket here. + */ + WARN_ON(req->sk != NULL); + + if (seq != tcp_rsk(req)->snt_isn) { + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + inet_csk_reqsk_queue_drop(sk, req, prev); + goto out; + + case TCP_SYN_SENT: + case TCP_SYN_RECV: /* Cannot happen. + It can, it SYNs are crossed. --ANK */ + if (!sock_owned_by_user(sk)) { + sk->sk_err = err; + sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ + + tcp_done(sk); + } else + sk->sk_err_soft = err; + goto out; + } + + if (!sock_owned_by_user(sk) && np->recverr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } else + sk->sk_err_soft = err; + +out: + bh_unlock_sock(sk); + sock_put(sk); +} + + +static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req, + struct request_values *rvp) +{ + struct inet6_request_sock *treq = inet6_rsk(req); + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff * skb; + struct ipv6_txoptions *opt = NULL; + struct in6_addr * final_p, final; + struct flowi6 fl6; + struct dst_entry *dst; + int err; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + ipv6_addr_copy(&fl6.daddr, &treq->rmt_addr); + ipv6_addr_copy(&fl6.saddr, &treq->loc_addr); + fl6.flowlabel = 0; + fl6.flowi6_oif = treq->iif; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_dport = inet_rsk(req)->rmt_port; + fl6.fl6_sport = inet_rsk(req)->loc_port; + security_req_classify_flow(req, flowi6_to_flowi(&fl6)); + + opt = np->opt; + final_p = fl6_update_dst(&fl6, opt, &final); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; + goto done; + } + skb = tcp_make_synack(sk, dst, req, rvp); + err = -ENOMEM; + if (skb) { + __tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr); + + ipv6_addr_copy(&fl6.daddr, &treq->rmt_addr); + err = ip6_xmit(sk, skb, &fl6, opt); + err = net_xmit_eval(err); + } + +done: + if (opt && opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + dst_release(dst); + return err; +} + +static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req, + struct request_values *rvp) +{ + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); + return tcp_v6_send_synack(sk, req, rvp); +} + +static inline void syn_flood_warning(struct sk_buff *skb) +{ 
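+	/* Called, rate limited via net_ratelimit() at the call site in
+	 * tcp_v6_conn_request(), when the request queue is full; the message
+	 * states whether cookies will be sent or the request dropped,
+	 * depending on CONFIG_SYN_COOKIES and sysctl_tcp_syncookies.
+	 */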
+#ifdef CONFIG_SYN_COOKIES + if (sysctl_tcp_syncookies) + printk(KERN_INFO + "TCPv6: Possible SYN flooding on port %d. " + "Sending cookies.\n", ntohs(tcp_hdr(skb)->dest)); + else +#endif + printk(KERN_INFO + "TCPv6: Possible SYN flooding on port %d. " + "Dropping request.\n", ntohs(tcp_hdr(skb)->dest)); +} + +static void tcp_v6_reqsk_destructor(struct request_sock *req) +{ + kfree_skb(inet6_rsk(req)->pktopts); +} + +#ifdef CONFIG_TCP_MD5SIG +static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk, + const struct in6_addr *addr) +{ + struct tcp_sock *tp = tcp_sk(sk); + int i; + + BUG_ON(tp == NULL); + + if (!tp->md5sig_info || !tp->md5sig_info->entries6) + return NULL; + + for (i = 0; i < tp->md5sig_info->entries6; i++) { + if (ipv6_addr_equal(&tp->md5sig_info->keys6[i].addr, addr)) + return &tp->md5sig_info->keys6[i].base; + } + return NULL; +} + +static struct tcp_md5sig_key *tcp_v6_md5_lookup(struct sock *sk, + struct sock *addr_sk) +{ + return tcp_v6_md5_do_lookup(sk, &inet6_sk(addr_sk)->daddr); +} + +static struct tcp_md5sig_key *tcp_v6_reqsk_md5_lookup(struct sock *sk, + struct request_sock *req) +{ + return tcp_v6_md5_do_lookup(sk, &inet6_rsk(req)->rmt_addr); +} + +static int tcp_v6_md5_do_add(struct sock *sk, const struct in6_addr *peer, + char *newkey, u8 newkeylen) +{ + /* Add key to the list */ + struct tcp_md5sig_key *key; + struct tcp_sock *tp = tcp_sk(sk); + struct tcp6_md5sig_key *keys; + + key = tcp_v6_md5_do_lookup(sk, peer); + if (key) { + /* modify existing entry - just update that one */ + kfree(key->key); + key->key = newkey; + key->keylen = newkeylen; + } else { + /* reallocate new list if current one is full. */ + if (!tp->md5sig_info) { + tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info), GFP_ATOMIC); + if (!tp->md5sig_info) { + kfree(newkey); + return -ENOMEM; + } + sk_nocaps_add(sk, NETIF_F_GSO_MASK); + } + if (tp->md5sig_info->entries6 == 0 && + tcp_alloc_md5sig_pool(sk) == NULL) { + kfree(newkey); + return -ENOMEM; + } + if (tp->md5sig_info->alloced6 == tp->md5sig_info->entries6) { + keys = kmalloc((sizeof (tp->md5sig_info->keys6[0]) * + (tp->md5sig_info->entries6 + 1)), GFP_ATOMIC); + + if (!keys) { + kfree(newkey); + if (tp->md5sig_info->entries6 == 0) + tcp_free_md5sig_pool(); + return -ENOMEM; + } + + if (tp->md5sig_info->entries6) + memmove(keys, tp->md5sig_info->keys6, + (sizeof (tp->md5sig_info->keys6[0]) * + tp->md5sig_info->entries6)); + + kfree(tp->md5sig_info->keys6); + tp->md5sig_info->keys6 = keys; + tp->md5sig_info->alloced6++; + } + + ipv6_addr_copy(&tp->md5sig_info->keys6[tp->md5sig_info->entries6].addr, + peer); + tp->md5sig_info->keys6[tp->md5sig_info->entries6].base.key = newkey; + tp->md5sig_info->keys6[tp->md5sig_info->entries6].base.keylen = newkeylen; + + tp->md5sig_info->entries6++; + } + return 0; +} + +static int tcp_v6_md5_add_func(struct sock *sk, struct sock *addr_sk, + u8 *newkey, __u8 newkeylen) +{ + return tcp_v6_md5_do_add(sk, &inet6_sk(addr_sk)->daddr, + newkey, newkeylen); +} + +static int tcp_v6_md5_do_del(struct sock *sk, const struct in6_addr *peer) +{ + struct tcp_sock *tp = tcp_sk(sk); + int i; + + for (i = 0; i < tp->md5sig_info->entries6; i++) { + if (ipv6_addr_equal(&tp->md5sig_info->keys6[i].addr, peer)) { + /* Free the key */ + kfree(tp->md5sig_info->keys6[i].base.key); + tp->md5sig_info->entries6--; + + if (tp->md5sig_info->entries6 == 0) { + kfree(tp->md5sig_info->keys6); + tp->md5sig_info->keys6 = NULL; + tp->md5sig_info->alloced6 = 0; + tcp_free_md5sig_pool(); + } else { + /* shrink the database */ + 
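+				/* entries6 was decremented above, so
+				 * (entries6 - i) is the number of keys that
+				 * follow the freed slot; slide them down to
+				 * close the gap.
+				 */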
if (tp->md5sig_info->entries6 != i) + memmove(&tp->md5sig_info->keys6[i], + &tp->md5sig_info->keys6[i+1], + (tp->md5sig_info->entries6 - i) + * sizeof (tp->md5sig_info->keys6[0])); + } + return 0; + } + } + return -ENOENT; +} + +static void tcp_v6_clear_md5_list (struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int i; + + if (tp->md5sig_info->entries6) { + for (i = 0; i < tp->md5sig_info->entries6; i++) + kfree(tp->md5sig_info->keys6[i].base.key); + tp->md5sig_info->entries6 = 0; + tcp_free_md5sig_pool(); + } + + kfree(tp->md5sig_info->keys6); + tp->md5sig_info->keys6 = NULL; + tp->md5sig_info->alloced6 = 0; + + if (tp->md5sig_info->entries4) { + for (i = 0; i < tp->md5sig_info->entries4; i++) + kfree(tp->md5sig_info->keys4[i].base.key); + tp->md5sig_info->entries4 = 0; + tcp_free_md5sig_pool(); + } + + kfree(tp->md5sig_info->keys4); + tp->md5sig_info->keys4 = NULL; + tp->md5sig_info->alloced4 = 0; +} + +static int tcp_v6_parse_md5_keys (struct sock *sk, char __user *optval, + int optlen) +{ + struct tcp_md5sig cmd; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr; + u8 *newkey; + + if (optlen < sizeof(cmd)) + return -EINVAL; + + if (copy_from_user(&cmd, optval, sizeof(cmd))) + return -EFAULT; + + if (sin6->sin6_family != AF_INET6) + return -EINVAL; + + if (!cmd.tcpm_keylen) { + if (!tcp_sk(sk)->md5sig_info) + return -ENOENT; + if (ipv6_addr_v4mapped(&sin6->sin6_addr)) + return tcp_v4_md5_do_del(sk, sin6->sin6_addr.s6_addr32[3]); + return tcp_v6_md5_do_del(sk, &sin6->sin6_addr); + } + + if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) + return -EINVAL; + + if (!tcp_sk(sk)->md5sig_info) { + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_md5sig_info *p; + + p = kzalloc(sizeof(struct tcp_md5sig_info), GFP_KERNEL); + if (!p) + return -ENOMEM; + + tp->md5sig_info = p; + sk_nocaps_add(sk, NETIF_F_GSO_MASK); + } + + newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); + if (!newkey) + return -ENOMEM; + if (ipv6_addr_v4mapped(&sin6->sin6_addr)) { + return tcp_v4_md5_do_add(sk, sin6->sin6_addr.s6_addr32[3], + newkey, cmd.tcpm_keylen); + } + return tcp_v6_md5_do_add(sk, &sin6->sin6_addr, newkey, cmd.tcpm_keylen); +} + +static int tcp_v6_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, + const struct in6_addr *daddr, + const struct in6_addr *saddr, int nbytes) +{ + struct tcp6_pseudohdr *bp; + struct scatterlist sg; + + bp = &hp->md5_blk.ip6; + /* 1. 
TCP pseudo-header (RFC2460) */ + ipv6_addr_copy(&bp->saddr, saddr); + ipv6_addr_copy(&bp->daddr, daddr); + bp->protocol = cpu_to_be32(IPPROTO_TCP); + bp->len = cpu_to_be32(nbytes); + + sg_init_one(&sg, bp, sizeof(*bp)); + return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp)); +} + +static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, + const struct in6_addr *daddr, struct in6_addr *saddr, + struct tcphdr *th) +{ + struct tcp_md5sig_pool *hp; + struct hash_desc *desc; + + hp = tcp_get_md5sig_pool(); + if (!hp) + goto clear_hash_noput; + desc = &hp->md5_desc; + + if (crypto_hash_init(desc)) + goto clear_hash; + if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2)) + goto clear_hash; + if (tcp_md5_hash_header(hp, th)) + goto clear_hash; + if (tcp_md5_hash_key(hp, key)) + goto clear_hash; + if (crypto_hash_final(desc, md5_hash)) + goto clear_hash; + + tcp_put_md5sig_pool(); + return 0; + +clear_hash: + tcp_put_md5sig_pool(); +clear_hash_noput: + memset(md5_hash, 0, 16); + return 1; +} + +static int tcp_v6_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, + struct sock *sk, struct request_sock *req, + struct sk_buff *skb) +{ + const struct in6_addr *saddr, *daddr; + struct tcp_md5sig_pool *hp; + struct hash_desc *desc; + struct tcphdr *th = tcp_hdr(skb); + + if (sk) { + saddr = &inet6_sk(sk)->saddr; + daddr = &inet6_sk(sk)->daddr; + } else if (req) { + saddr = &inet6_rsk(req)->loc_addr; + daddr = &inet6_rsk(req)->rmt_addr; + } else { + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + saddr = &ip6h->saddr; + daddr = &ip6h->daddr; + } + + hp = tcp_get_md5sig_pool(); + if (!hp) + goto clear_hash_noput; + desc = &hp->md5_desc; + + if (crypto_hash_init(desc)) + goto clear_hash; + + if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, skb->len)) + goto clear_hash; + if (tcp_md5_hash_header(hp, th)) + goto clear_hash; + if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) + goto clear_hash; + if (tcp_md5_hash_key(hp, key)) + goto clear_hash; + if (crypto_hash_final(desc, md5_hash)) + goto clear_hash; + + tcp_put_md5sig_pool(); + return 0; + +clear_hash: + tcp_put_md5sig_pool(); +clear_hash_noput: + memset(md5_hash, 0, 16); + return 1; +} + +static int tcp_v6_inbound_md5_hash (struct sock *sk, struct sk_buff *skb) +{ + __u8 *hash_location = NULL; + struct tcp_md5sig_key *hash_expected; + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct tcphdr *th = tcp_hdr(skb); + int genhash; + u8 newhash[16]; + + hash_expected = tcp_v6_md5_do_lookup(sk, &ip6h->saddr); + hash_location = tcp_parse_md5sig_option(th); + + /* We've parsed the options - do we have a hash? */ + if (!hash_expected && !hash_location) + return 0; + + if (hash_expected && !hash_location) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); + return 1; + } + + if (!hash_expected && hash_location) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); + return 1; + } + + /* check the signature */ + genhash = tcp_v6_md5_hash_skb(newhash, + hash_expected, + NULL, NULL, skb); + + if (genhash || memcmp(hash_location, newhash, 16) != 0) { + if (net_ratelimit()) { + printk(KERN_INFO "MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u\n", + genhash ? 
"failed" : "mismatch", + &ip6h->saddr, ntohs(th->source), + &ip6h->daddr, ntohs(th->dest)); + } + return 1; + } + return 0; +} +#endif + +struct request_sock_ops tcp6_request_sock_ops __read_mostly = { + .family = AF_INET6, + .obj_size = sizeof(struct tcp6_request_sock), + .rtx_syn_ack = tcp_v6_rtx_synack, + .send_ack = tcp_v6_reqsk_send_ack, + .destructor = tcp_v6_reqsk_destructor, + .send_reset = tcp_v6_send_reset, + .syn_ack_timeout = tcp_syn_ack_timeout, +}; + +#ifdef CONFIG_TCP_MD5SIG +static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { + .md5_lookup = tcp_v6_reqsk_md5_lookup, + .calc_md5_hash = tcp_v6_md5_hash_skb, +}; +#endif + +static void __tcp_v6_send_check(struct sk_buff *skb, + const struct in6_addr *saddr, const struct in6_addr *daddr) +{ + struct tcphdr *th = tcp_hdr(skb); + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + th->check = ~tcp_v6_check(skb->len, saddr, daddr, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); + } else { + th->check = tcp_v6_check(skb->len, saddr, daddr, + csum_partial(th, th->doff << 2, + skb->csum)); + } +} + +static void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + + __tcp_v6_send_check(skb, &np->saddr, &np->daddr); +} + +static int tcp_v6_gso_send_check(struct sk_buff *skb) +{ + const struct ipv6hdr *ipv6h; + struct tcphdr *th; + + if (!pskb_may_pull(skb, sizeof(*th))) + return -EINVAL; + + ipv6h = ipv6_hdr(skb); + th = tcp_hdr(skb); + + th->check = 0; + skb->ip_summed = CHECKSUM_PARTIAL; + __tcp_v6_send_check(skb, &ipv6h->saddr, &ipv6h->daddr); + return 0; +} + +static struct sk_buff **tcp6_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + const struct ipv6hdr *iph = skb_gro_network_header(skb); + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (!tcp_v6_check(skb_gro_len(skb), &iph->saddr, &iph->daddr, + skb->csum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; + } + + /* fall through */ + case CHECKSUM_NONE: + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + + return tcp_gro_receive(head, skb); +} + +static int tcp6_gro_complete(struct sk_buff *skb) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct tcphdr *th = tcp_hdr(skb); + + th->check = ~tcp_v6_check(skb->len - skb_transport_offset(skb), + &iph->saddr, &iph->daddr, 0); + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; + + return tcp_gro_complete(skb); +} + +static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, + u32 ts, struct tcp_md5sig_key *key, int rst) +{ + struct tcphdr *th = tcp_hdr(skb), *t1; + struct sk_buff *buff; + struct flowi6 fl6; + struct net *net = dev_net(skb_dst(skb)->dev); + struct sock *ctl_sk = net->ipv6.tcp_sk; + unsigned int tot_len = sizeof(struct tcphdr); + struct dst_entry *dst; + __be32 *topt; + + if (ts) + tot_len += TCPOLEN_TSTAMP_ALIGNED; +#ifdef CONFIG_TCP_MD5SIG + if (key) + tot_len += TCPOLEN_MD5SIG_ALIGNED; +#endif + + buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len, + GFP_ATOMIC); + if (buff == NULL) + return; + + skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len); + + t1 = (struct tcphdr *) skb_push(buff, tot_len); + skb_reset_transport_header(buff); + + /* Swap the send and the receive. 
*/ + memset(t1, 0, sizeof(*t1)); + t1->dest = th->source; + t1->source = th->dest; + t1->doff = tot_len / 4; + t1->seq = htonl(seq); + t1->ack_seq = htonl(ack); + t1->ack = !rst || !th->ack; + t1->rst = rst; + t1->window = htons(win); + + topt = (__be32 *)(t1 + 1); + + if (ts) { + *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + *topt++ = htonl(tcp_time_stamp); + *topt++ = htonl(ts); + } + +#ifdef CONFIG_TCP_MD5SIG + if (key) { + *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); + tcp_v6_md5_hash_hdr((__u8 *)topt, key, + &ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, t1); + } +#endif + + memset(&fl6, 0, sizeof(fl6)); + ipv6_addr_copy(&fl6.daddr, &ipv6_hdr(skb)->saddr); + ipv6_addr_copy(&fl6.saddr, &ipv6_hdr(skb)->daddr); + + buff->ip_summed = CHECKSUM_PARTIAL; + buff->csum = 0; + + __tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr); + + fl6.flowi6_proto = IPPROTO_TCP; + fl6.flowi6_oif = inet6_iif(skb); + fl6.fl6_dport = t1->dest; + fl6.fl6_sport = t1->source; + security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + + /* Pass a socket to ip6_dst_lookup either it is for RST + * Underlying function will use this to retrieve the network + * namespace + */ + dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL, false); + if (!IS_ERR(dst)) { + skb_dst_set(buff, dst); + ip6_xmit(ctl_sk, buff, &fl6, NULL); + TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); + if (rst) + TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); + return; + } + + kfree_skb(buff); +} + +static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) +{ + struct tcphdr *th = tcp_hdr(skb); + u32 seq = 0, ack_seq = 0; + struct tcp_md5sig_key *key = NULL; + + if (th->rst) + return; + + if (!ipv6_unicast_destination(skb)) + return; + +#ifdef CONFIG_TCP_MD5SIG + if (sk) + key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr); +#endif + + if (th->ack) + seq = ntohl(th->ack_seq); + else + ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len - + (th->doff << 2); + + tcp_v6_send_response(skb, seq, ack_seq, 0, 0, key, 1); +} + +static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts, + struct tcp_md5sig_key *key) +{ + tcp_v6_send_response(skb, seq, ack, win, ts, key, 0); +} + +static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) +{ + struct inet_timewait_sock *tw = inet_twsk(sk); + struct tcp_timewait_sock *tcptw = tcp_twsk(sk); + + tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, + tcptw->tw_ts_recent, tcp_twsk_md5_key(tcptw)); + + inet_twsk_put(tw); +} + +static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, + struct request_sock *req) +{ + tcp_v6_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, req->ts_recent, + tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr)); +} + + +static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) +{ + struct request_sock *req, **prev; + const struct tcphdr *th = tcp_hdr(skb); + struct sock *nsk; + + /* Find possible connection requests. 
*/ + req = inet6_csk_search_req(sk, &prev, th->source, + &ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, inet6_iif(skb)); + if (req) + return tcp_check_req(sk, skb, req, prev); + + nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo, + &ipv6_hdr(skb)->saddr, th->source, + &ipv6_hdr(skb)->daddr, ntohs(th->dest), inet6_iif(skb)); + + if (nsk) { + if (nsk->sk_state != TCP_TIME_WAIT) { + bh_lock_sock(nsk); + return nsk; + } + inet_twsk_put(inet_twsk(nsk)); + return NULL; + } + +#ifdef CONFIG_SYN_COOKIES + if (!th->syn) + sk = cookie_v6_check(sk, skb); +#endif + return sk; +} + +/* FIXME: this is substantially similar to the ipv4 code. + * Can some kind of merge be done? -- erics + */ +static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_extend_values tmp_ext; + struct tcp_options_received tmp_opt; + u8 *hash_location; + struct request_sock *req; + struct inet6_request_sock *treq; + struct ipv6_pinfo *np = inet6_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + __u32 isn = TCP_SKB_CB(skb)->when; + struct dst_entry *dst = NULL; +#ifdef CONFIG_SYN_COOKIES + int want_cookie = 0; +#else +#define want_cookie 0 +#endif + + if (skb->protocol == htons(ETH_P_IP)) + return tcp_v4_conn_request(sk, skb); + + if (!ipv6_unicast_destination(skb)) + goto drop; + + if (inet_csk_reqsk_queue_is_full(sk) && !isn) { + if (net_ratelimit()) + syn_flood_warning(skb); +#ifdef CONFIG_SYN_COOKIES + if (sysctl_tcp_syncookies) + want_cookie = 1; + else +#endif + goto drop; + } + + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) + goto drop; + + req = inet6_reqsk_alloc(&tcp6_request_sock_ops); + if (req == NULL) + goto drop; + +#ifdef CONFIG_TCP_MD5SIG + tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops; +#endif + + tcp_clear_options(&tmp_opt); + tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); + tmp_opt.user_mss = tp->rx_opt.user_mss; + tcp_parse_options(skb, &tmp_opt, &hash_location, 0); + + if (tmp_opt.cookie_plus > 0 && + tmp_opt.saw_tstamp && + !tp->rx_opt.cookie_out_never && + (sysctl_tcp_cookie_size > 0 || + (tp->cookie_values != NULL && + tp->cookie_values->cookie_desired > 0))) { + u8 *c; + u32 *d; + u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS]; + int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE; + + if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0) + goto drop_and_free; + + /* Secret recipe starts with IP addresses */ + d = (__force u32 *)&ipv6_hdr(skb)->daddr.s6_addr32[0]; + *mess++ ^= *d++; + *mess++ ^= *d++; + *mess++ ^= *d++; + *mess++ ^= *d++; + d = (__force u32 *)&ipv6_hdr(skb)->saddr.s6_addr32[0]; + *mess++ ^= *d++; + *mess++ ^= *d++; + *mess++ ^= *d++; + *mess++ ^= *d++; + + /* plus variable length Initiator Cookie */ + c = (u8 *)mess; + while (l-- > 0) + *c++ ^= *hash_location++; + +#ifdef CONFIG_SYN_COOKIES + want_cookie = 0; /* not our kind of cookie */ +#endif + tmp_ext.cookie_out_never = 0; /* false */ + tmp_ext.cookie_plus = tmp_opt.cookie_plus; + } else if (!tp->rx_opt.cookie_in_always) { + /* redundant indications, but ensure initialization. 
*/ + tmp_ext.cookie_out_never = 1; /* true */ + tmp_ext.cookie_plus = 0; + } else { + goto drop_and_free; + } + tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always; + + if (want_cookie && !tmp_opt.saw_tstamp) + tcp_clear_options(&tmp_opt); + + tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; + tcp_openreq_init(req, &tmp_opt, skb); + + treq = inet6_rsk(req); + ipv6_addr_copy(&treq->rmt_addr, &ipv6_hdr(skb)->saddr); + ipv6_addr_copy(&treq->loc_addr, &ipv6_hdr(skb)->daddr); + if (!want_cookie || tmp_opt.tstamp_ok) + TCP_ECN_create_request(req, tcp_hdr(skb)); + + if (!isn) { + struct inet_peer *peer = NULL; + + if (ipv6_opt_accepted(sk, skb) || + np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || + np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { + atomic_inc(&skb->users); + treq->pktopts = skb; + } + treq->iif = sk->sk_bound_dev_if; + + /* So that link locals have meaning */ + if (!sk->sk_bound_dev_if && + ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL) + treq->iif = inet6_iif(skb); + + if (want_cookie) { + isn = cookie_v6_init_sequence(sk, skb, &req->mss); + req->cookie_ts = tmp_opt.tstamp_ok; + goto have_isn; + } + + /* VJ's idea. We save last timestamp seen + * from the destination in peer table, when entering + * state TIME-WAIT, and check against it before + * accepting new connection request. + * + * If "isn" is not zero, this request hit alive + * timewait bucket, so that all the necessary checks + * are made in the function processing timewait state. + */ + if (tmp_opt.saw_tstamp && + tcp_death_row.sysctl_tw_recycle && + (dst = inet6_csk_route_req(sk, req)) != NULL && + (peer = rt6_get_peer((struct rt6_info *)dst)) != NULL && + ipv6_addr_equal((struct in6_addr *)peer->daddr.addr.a6, + &treq->rmt_addr)) { + inet_peer_refcheck(peer); + if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && + (s32)(peer->tcp_ts - req->ts_recent) > + TCP_PAWS_WINDOW) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); + goto drop_and_release; + } + } + /* Kill the following clause, if you dislike this way. */ + else if (!sysctl_tcp_syncookies && + (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < + (sysctl_max_syn_backlog >> 2)) && + (!peer || !peer->tcp_ts_stamp) && + (!dst || !dst_metric(dst, RTAX_RTT))) { + /* Without syncookies last quarter of + * backlog is filled with destinations, + * proven to be alive. + * It means that we continue to communicate + * to destinations, already remembered + * to the moment of synflood. 
+ */ + LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI6/%u\n", + &treq->rmt_addr, ntohs(tcp_hdr(skb)->source)); + goto drop_and_release; + } + + isn = tcp_v6_init_sequence(skb); + } +have_isn: + tcp_rsk(req)->snt_isn = isn; + + security_inet_conn_request(sk, skb, req); + + if (tcp_v6_send_synack(sk, req, + (struct request_values *)&tmp_ext) || + want_cookie) + goto drop_and_free; + + inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + return 0; + +drop_and_release: + dst_release(dst); +drop_and_free: + reqsk_free(req); +drop: + return 0; /* don't send reset */ +} + +static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) +{ + struct inet6_request_sock *treq; + struct ipv6_pinfo *newnp, *np = inet6_sk(sk); + struct tcp6_sock *newtcp6sk; + struct inet_sock *newinet; + struct tcp_sock *newtp; + struct sock *newsk; + struct ipv6_txoptions *opt; +#ifdef CONFIG_TCP_MD5SIG + struct tcp_md5sig_key *key; +#endif + + if (skb->protocol == htons(ETH_P_IP)) { + /* + * v6 mapped + */ + + newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst); + + if (newsk == NULL) + return NULL; + + newtcp6sk = (struct tcp6_sock *)newsk; + inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; + + newinet = inet_sk(newsk); + newnp = inet6_sk(newsk); + newtp = tcp_sk(newsk); + + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + ipv6_addr_set_v4mapped(newinet->inet_daddr, &newnp->daddr); + + ipv6_addr_set_v4mapped(newinet->inet_saddr, &newnp->saddr); + + ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr); + + inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; + newsk->sk_backlog_rcv = tcp_v4_do_rcv; +#ifdef CONFIG_TCP_MD5SIG + newtp->af_specific = &tcp_sock_ipv6_mapped_specific; +#endif + + newnp->ipv6_ac_list = NULL; + newnp->ipv6_fl_list = NULL; + newnp->pktoptions = NULL; + newnp->opt = NULL; + newnp->mcast_oif = inet6_iif(skb); + newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; + + /* + * No need to charge this sock to the relevant IPv6 refcnt debug socks count + * here, tcp_create_openreq_child now does this for us, see the comment in + * that function for the gory details. -acme + */ + + /* It is tricky place. Until this moment IPv4 tcp + worked with IPv6 icsk.icsk_af_ops. + Sync it now. + */ + tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); + + return newsk; + } + + treq = inet6_rsk(req); + opt = np->opt; + + if (sk_acceptq_is_full(sk)) + goto out_overflow; + + if (!dst) { + dst = inet6_csk_route_req(sk, req); + if (!dst) + goto out; + } + + newsk = tcp_create_openreq_child(sk, req, skb); + if (newsk == NULL) + goto out_nonewsk; + + /* + * No need to charge this sock to the relevant IPv6 refcnt debug socks + * count here, tcp_create_openreq_child now does this for us, see the + * comment in that function for the gory details. -acme + */ + + newsk->sk_gso_type = SKB_GSO_TCPV6; + __ip6_dst_store(newsk, dst, NULL, NULL); + + newtcp6sk = (struct tcp6_sock *)newsk; + inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; + + newtp = tcp_sk(newsk); + newinet = inet_sk(newsk); + newnp = inet6_sk(newsk); + + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + ipv6_addr_copy(&newnp->daddr, &treq->rmt_addr); + ipv6_addr_copy(&newnp->saddr, &treq->loc_addr); + ipv6_addr_copy(&newnp->rcv_saddr, &treq->loc_addr); + newsk->sk_bound_dev_if = treq->iif; + + /* Now IPv6 options... + + First: no IPv4 options. 
+ */ + newinet->inet_opt = NULL; + newnp->ipv6_ac_list = NULL; + newnp->ipv6_fl_list = NULL; + + /* Clone RX bits */ + newnp->rxopt.all = np->rxopt.all; + + /* Clone pktoptions received with SYN */ + newnp->pktoptions = NULL; + if (treq->pktopts != NULL) { + newnp->pktoptions = skb_clone(treq->pktopts, GFP_ATOMIC); + kfree_skb(treq->pktopts); + treq->pktopts = NULL; + if (newnp->pktoptions) + skb_set_owner_r(newnp->pktoptions, newsk); + } + newnp->opt = NULL; + newnp->mcast_oif = inet6_iif(skb); + newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; + + /* Clone native IPv6 options from listening socket (if any) + + Yes, keeping reference count would be much more clever, + but we make one more one thing there: reattach optmem + to newsk. + */ + if (opt) { + newnp->opt = ipv6_dup_options(newsk, opt); + if (opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + } + + inet_csk(newsk)->icsk_ext_hdr_len = 0; + if (newnp->opt) + inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + + newnp->opt->opt_flen); + + tcp_mtup_init(newsk); + tcp_sync_mss(newsk, dst_mtu(dst)); + newtp->advmss = dst_metric_advmss(dst); + if (tcp_sk(sk)->rx_opt.user_mss && + tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) + newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; + + tcp_initialize_rcv_mss(newsk); + + newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; + newinet->inet_rcv_saddr = LOOPBACK4_IPV6; + +#ifdef CONFIG_TCP_MD5SIG + /* Copy over the MD5 key from the original socket */ + if ((key = tcp_v6_md5_do_lookup(sk, &newnp->daddr)) != NULL) { + /* We're using one, so create a matching key + * on the newsk structure. If we fail to get + * memory, then we end up not copying the key + * across. Shucks. + */ + char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC); + if (newkey != NULL) + tcp_v6_md5_do_add(newsk, &newnp->daddr, + newkey, key->keylen); + } +#endif + + if (__inet_inherit_port(sk, newsk) < 0) { + sock_put(newsk); + goto out; + } + __inet6_hash(newsk, NULL); + + return newsk; + +out_overflow: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); +out_nonewsk: + if (opt && opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + dst_release(dst); +out: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + return NULL; +} + +static __sum16 tcp_v6_checksum_init(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) { + if (!tcp_v6_check(skb->len, &ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, skb->csum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + return 0; + } + } + + skb->csum = ~csum_unfold(tcp_v6_check(skb->len, + &ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, 0)); + + if (skb->len <= 76) { + return __skb_checksum_complete(skb); + } + return 0; +} + +/* The socket must have it's spinlock held when we get + * here. + * + * We have a potential double-lock case here, so even when + * doing backlog processing we use the BH locking scheme. + * This is because we cannot sleep with the original spinlock + * held. + */ +static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct tcp_sock *tp; + struct sk_buff *opt_skb = NULL; + + /* Imagine: socket is IPv6. IPv4 packet arrives, + goes to IPv4 receive handler and backlogged. + From backlog it always goes here. Kerboom... + Fortunately, tcp_rcv_established and rcv_established + handle them correctly, but it is not case with + tcp_v6_hnd_req and tcp_v6_send_reset(). 
--ANK + */ + + if (skb->protocol == htons(ETH_P_IP)) + return tcp_v4_do_rcv(sk, skb); + +#ifdef CONFIG_TCP_MD5SIG + if (tcp_v6_inbound_md5_hash (sk, skb)) + goto discard; +#endif + + if (sk_filter(sk, skb)) + goto discard; + + /* + * socket locking is here for SMP purposes as backlog rcv + * is currently called with bh processing disabled. + */ + + /* Do Stevens' IPV6_PKTOPTIONS. + + Yes, guys, it is the only place in our code, where we + may make it not affecting IPv4. + The rest of code is protocol independent, + and I do not like idea to uglify IPv4. + + Actually, all the idea behind IPV6_PKTOPTIONS + looks not very well thought. For now we latch + options, received in the last packet, enqueued + by tcp. Feel free to propose better solution. + --ANK (980728) + */ + if (np->rxopt.all) + opt_skb = skb_clone(skb, GFP_ATOMIC); + + if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ + sock_rps_save_rxhash(sk, skb->rxhash); + if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) + goto reset; + if (opt_skb) + goto ipv6_pktoptions; + return 0; + } + + if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) + goto csum_err; + + if (sk->sk_state == TCP_LISTEN) { + struct sock *nsk = tcp_v6_hnd_req(sk, skb); + if (!nsk) + goto discard; + + /* + * Queue it on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket.. + */ + if(nsk != sk) { + sock_rps_save_rxhash(nsk, skb->rxhash); + if (tcp_child_process(sk, nsk, skb)) + goto reset; + if (opt_skb) + __kfree_skb(opt_skb); + return 0; + } + } else + sock_rps_save_rxhash(sk, skb->rxhash); + + if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) + goto reset; + if (opt_skb) + goto ipv6_pktoptions; + return 0; + +reset: + tcp_v6_send_reset(sk, skb); +discard: + if (opt_skb) + __kfree_skb(opt_skb); + kfree_skb(skb); + return 0; +csum_err: + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); + goto discard; + + +ipv6_pktoptions: + /* Do you ask, what is it? + + 1. skb was enqueued by tcp. + 2. skb is added to tail of read queue, rather than out of order. + 3. socket is not in passive state. + 4. Finally, it really contains options, which user wants to receive. + */ + tp = tcp_sk(sk); + if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt && + !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { + if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo) + np->mcast_oif = inet6_iif(opt_skb); + if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) + np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit; + if (ipv6_opt_accepted(sk, opt_skb)) { + skb_set_owner_r(opt_skb, sk); + opt_skb = xchg(&np->pktoptions, opt_skb); + } else { + __kfree_skb(opt_skb); + opt_skb = xchg(&np->pktoptions, NULL); + } + } + + kfree_skb(opt_skb); + return 0; +} + +static int tcp_v6_rcv(struct sk_buff *skb) +{ + struct tcphdr *th; + const struct ipv6hdr *hdr; + struct sock *sk; + int ret; + struct net *net = dev_net(skb->dev); + + if (skb->pkt_type != PACKET_HOST) + goto discard_it; + + /* + * Count it even if it's bad. 
+ */ + TCP_INC_STATS_BH(net, TCP_MIB_INSEGS); + + if (!pskb_may_pull(skb, sizeof(struct tcphdr))) + goto discard_it; + + th = tcp_hdr(skb); + + if (th->doff < sizeof(struct tcphdr)/4) + goto bad_packet; + if (!pskb_may_pull(skb, th->doff*4)) + goto discard_it; + + if (!skb_csum_unnecessary(skb) && tcp_v6_checksum_init(skb)) + goto bad_packet; + + th = tcp_hdr(skb); + hdr = ipv6_hdr(skb); + TCP_SKB_CB(skb)->seq = ntohl(th->seq); + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + + skb->len - th->doff*4); + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + TCP_SKB_CB(skb)->when = 0; + TCP_SKB_CB(skb)->flags = ipv6_get_dsfield(hdr); + TCP_SKB_CB(skb)->sacked = 0; + + sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); + if (!sk) + goto no_tcp_socket; + +process: + if (sk->sk_state == TCP_TIME_WAIT) + goto do_time_wait; + + if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) { + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + goto discard_and_relse; + } + + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) + goto discard_and_relse; + + if (sk_filter(sk, skb)) + goto discard_and_relse; + + skb->dev = NULL; + + bh_lock_sock_nested(sk); + ret = 0; + if (!sock_owned_by_user(sk)) { +#ifdef CONFIG_NET_DMA + struct tcp_sock *tp = tcp_sk(sk); + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) + tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); + if (tp->ucopy.dma_chan) + ret = tcp_v6_do_rcv(sk, skb); + else +#endif + { + if (!tcp_prequeue(sk, skb)) + ret = tcp_v6_do_rcv(sk, skb); + } + } else if (unlikely(sk_add_backlog(sk, skb))) { + bh_unlock_sock(sk); + NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); + goto discard_and_relse; + } + bh_unlock_sock(sk); + + sock_put(sk); + return ret ? -1 : 0; + +no_tcp_socket: + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard_it; + + if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { +bad_packet: + TCP_INC_STATS_BH(net, TCP_MIB_INERRS); + } else { + tcp_v6_send_reset(NULL, skb); + } + +discard_it: + + /* + * Discard frame + */ + + kfree_skb(skb); + return 0; + +discard_and_relse: + sock_put(sk); + goto discard_it; + +do_time_wait: + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + inet_twsk_put(inet_twsk(sk)); + goto discard_it; + } + + if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { + TCP_INC_STATS_BH(net, TCP_MIB_INERRS); + inet_twsk_put(inet_twsk(sk)); + goto discard_it; + } + + switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { + case TCP_TW_SYN: + { + struct sock *sk2; + + sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo, + &ipv6_hdr(skb)->daddr, + ntohs(th->dest), inet6_iif(skb)); + if (sk2 != NULL) { + struct inet_timewait_sock *tw = inet_twsk(sk); + inet_twsk_deschedule(tw, &tcp_death_row); + inet_twsk_put(tw); + sk = sk2; + goto process; + } + /* Fall through to ACK */ + } + case TCP_TW_ACK: + tcp_v6_timewait_ack(sk, skb); + break; + case TCP_TW_RST: + goto no_tcp_socket; + case TCP_TW_SUCCESS:; + } + goto discard_it; +} + +static struct inet_peer *tcp_v6_get_peer(struct sock *sk, bool *release_it) +{ + struct rt6_info *rt = (struct rt6_info *) __sk_dst_get(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet_peer *peer; + + if (!rt || + !ipv6_addr_equal(&np->daddr, &rt->rt6i_dst.addr)) { + peer = inet_getpeer_v6(&np->daddr, 1); + *release_it = true; + } else { + if (!rt->rt6i_peer) + rt6_bind_peer(rt, 1); + peer = rt->rt6i_peer; + *release_it = false; + } + + return peer; +} + +static void *tcp_v6_tw_get_peer(struct sock *sk) +{ + struct 
inet6_timewait_sock *tw6 = inet6_twsk(sk); + struct inet_timewait_sock *tw = inet_twsk(sk); + + if (tw->tw_family == AF_INET) + return tcp_v4_tw_get_peer(sk); + + return inet_getpeer_v6(&tw6->tw_v6_daddr, 1); +} + +static struct timewait_sock_ops tcp6_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct tcp6_timewait_sock), + .twsk_unique = tcp_twsk_unique, + .twsk_destructor= tcp_twsk_destructor, + .twsk_getpeer = tcp_v6_tw_get_peer, +}; + +static const struct inet_connection_sock_af_ops ipv6_specific = { + .queue_xmit = inet6_csk_xmit, + .send_check = tcp_v6_send_check, + .rebuild_header = inet6_sk_rebuild_header, + .conn_request = tcp_v6_conn_request, + .syn_recv_sock = tcp_v6_syn_recv_sock, + .get_peer = tcp_v6_get_peer, + .net_header_len = sizeof(struct ipv6hdr), + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .addr2sockaddr = inet6_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in6), + .bind_conflict = inet6_csk_bind_conflict, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_ipv6_setsockopt, + .compat_getsockopt = compat_ipv6_getsockopt, +#endif +}; + +#ifdef CONFIG_TCP_MD5SIG +static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = { + .md5_lookup = tcp_v6_md5_lookup, + .calc_md5_hash = tcp_v6_md5_hash_skb, + .md5_add = tcp_v6_md5_add_func, + .md5_parse = tcp_v6_parse_md5_keys, +}; +#endif + +/* + * TCP over IPv4 via INET6 API + */ + +static const struct inet_connection_sock_af_ops ipv6_mapped = { + .queue_xmit = ip_queue_xmit, + .send_check = tcp_v4_send_check, + .rebuild_header = inet_sk_rebuild_header, + .conn_request = tcp_v6_conn_request, + .syn_recv_sock = tcp_v6_syn_recv_sock, + .get_peer = tcp_v4_get_peer, + .net_header_len = sizeof(struct iphdr), + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .addr2sockaddr = inet6_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in6), + .bind_conflict = inet6_csk_bind_conflict, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_ipv6_setsockopt, + .compat_getsockopt = compat_ipv6_getsockopt, +#endif +}; + +#ifdef CONFIG_TCP_MD5SIG +static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { + .md5_lookup = tcp_v4_md5_lookup, + .calc_md5_hash = tcp_v4_md5_hash_skb, + .md5_add = tcp_v6_md5_add_func, + .md5_parse = tcp_v6_parse_md5_keys, +}; +#endif + +/* NOTE: A lot of things set to zero explicitly by call to + * sk_alloc() so need not be done here. + */ +static int tcp_v6_init_sock(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + + skb_queue_head_init(&tp->out_of_order_queue); + tcp_init_xmit_timers(sk); + tcp_prequeue_init(tp); + + icsk->icsk_rto = TCP_TIMEOUT_INIT; + tp->mdev = TCP_TIMEOUT_INIT; + + /* So many TCP implementations out there (incorrectly) count the + * initial SYN frame in their delayed-ACK and congestion control + * algorithms that we must have the following bandaid to talk + * efficiently to them. -DaveM + */ + tp->snd_cwnd = 2; + + /* See draft-stevens-tcpca-spec-01 for discussion of the + * initialization of these values. 
+ */ + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + tp->snd_cwnd_clamp = ~0; + tp->mss_cache = TCP_MSS_DEFAULT; + + tp->reordering = sysctl_tcp_reordering; + + sk->sk_state = TCP_CLOSE; + + icsk->icsk_af_ops = &ipv6_specific; + icsk->icsk_ca_ops = &tcp_init_congestion_ops; + icsk->icsk_sync_mss = tcp_sync_mss; + sk->sk_write_space = sk_stream_write_space; + sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + +#ifdef CONFIG_TCP_MD5SIG + tp->af_specific = &tcp_sock_ipv6_specific; +#endif + + /* TCP Cookie Transactions */ + if (sysctl_tcp_cookie_size > 0) { + /* Default, cookies without s_data_payload. */ + tp->cookie_values = + kzalloc(sizeof(*tp->cookie_values), + sk->sk_allocation); + if (tp->cookie_values != NULL) + kref_init(&tp->cookie_values->kref); + } + /* Presumed zeroed, in order of appearance: + * cookie_in_always, cookie_out_never, + * s_data_constant, s_data_in, s_data_out + */ + sk->sk_sndbuf = sysctl_tcp_wmem[1]; + sk->sk_rcvbuf = sysctl_tcp_rmem[1]; + + local_bh_disable(); + percpu_counter_inc(&tcp_sockets_allocated); + local_bh_enable(); + + return 0; +} + +static void tcp_v6_destroy_sock(struct sock *sk) +{ +#ifdef CONFIG_TCP_MD5SIG + /* Clean up the MD5 key list */ + if (tcp_sk(sk)->md5sig_info) + tcp_v6_clear_md5_list(sk); +#endif + tcp_v4_destroy_sock(sk); + inet6_destroy_sock(sk); +} + +#ifdef CONFIG_PROC_FS +/* Proc filesystem TCPv6 sock list dumping. */ +static void get_openreq6(struct seq_file *seq, + struct sock *sk, struct request_sock *req, int i, int uid) +{ + int ttd = req->expires - jiffies; + const struct in6_addr *src = &inet6_rsk(req)->loc_addr; + const struct in6_addr *dest = &inet6_rsk(req)->rmt_addr; + + if (ttd < 0) + ttd = 0; + + seq_printf(seq, + "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n", + i, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], + ntohs(inet_rsk(req)->loc_port), + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], + ntohs(inet_rsk(req)->rmt_port), + TCP_SYN_RECV, + 0,0, /* could print option size, but that is af dependent. 
*/ + 1, /* timers active (only the expire timer) */ + jiffies_to_clock_t(ttd), + req->retrans, + uid, + 0, /* non standard timer */ + 0, /* open_requests have no inode */ + 0, req); +} + +static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) +{ + const struct in6_addr *dest, *src; + __u16 destp, srcp; + int timer_active; + unsigned long timer_expires; + struct inet_sock *inet = inet_sk(sp); + struct tcp_sock *tp = tcp_sk(sp); + const struct inet_connection_sock *icsk = inet_csk(sp); + struct ipv6_pinfo *np = inet6_sk(sp); + + dest = &np->daddr; + src = &np->rcv_saddr; + destp = ntohs(inet->inet_dport); + srcp = ntohs(inet->inet_sport); + + if (icsk->icsk_pending == ICSK_TIME_RETRANS) { + timer_active = 1; + timer_expires = icsk->icsk_timeout; + } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { + timer_active = 4; + timer_expires = icsk->icsk_timeout; + } else if (timer_pending(&sp->sk_timer)) { + timer_active = 2; + timer_expires = sp->sk_timer.expires; + } else { + timer_active = 0; + timer_expires = jiffies; + } + + seq_printf(seq, + "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %lu %lu %u %u %d\n", + i, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], srcp, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], destp, + sp->sk_state, + tp->write_seq-tp->snd_una, + (sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq), + timer_active, + jiffies_to_clock_t(timer_expires - jiffies), + icsk->icsk_retransmits, + sock_i_uid(sp), + icsk->icsk_probes_out, + sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp, + jiffies_to_clock_t(icsk->icsk_rto), + jiffies_to_clock_t(icsk->icsk_ack.ato), + (icsk->icsk_ack.quick << 1 ) | icsk->icsk_ack.pingpong, + tp->snd_cwnd, + tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh + ); +} + +static void get_timewait6_sock(struct seq_file *seq, + struct inet_timewait_sock *tw, int i) +{ + const struct in6_addr *dest, *src; + __u16 destp, srcp; + struct inet6_timewait_sock *tw6 = inet6_twsk((struct sock *)tw); + int ttd = tw->tw_ttd - jiffies; + + if (ttd < 0) + ttd = 0; + + dest = &tw6->tw_v6_daddr; + src = &tw6->tw_v6_rcv_saddr; + destp = ntohs(tw->tw_dport); + srcp = ntohs(tw->tw_sport); + + seq_printf(seq, + "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n", + i, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], srcp, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], destp, + tw->tw_substate, 0, 0, + 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, + atomic_read(&tw->tw_refcnt), tw); +} + +static int tcp6_seq_show(struct seq_file *seq, void *v) +{ + struct tcp_iter_state *st; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, + " sl " + "local_address " + "remote_address " + "st tx_queue rx_queue tr tm->when retrnsmt" + " uid timeout inode\n"); + goto out; + } + st = seq->private; + + switch (st->state) { + case TCP_SEQ_STATE_LISTENING: + case TCP_SEQ_STATE_ESTABLISHED: + get_tcp6_sock(seq, v, st->num); + break; + case TCP_SEQ_STATE_OPENREQ: + get_openreq6(seq, st->syn_wait_sk, v, st->num, st->uid); + break; + case TCP_SEQ_STATE_TIME_WAIT: + get_timewait6_sock(seq, v, st->num); + break; + } +out: + return 0; +} + +static struct tcp_seq_afinfo tcp6_seq_afinfo = { + .name = "tcp6", + .family = AF_INET6, + .seq_fops = { + .owner = THIS_MODULE, + }, + .seq_ops = { + .show = tcp6_seq_show, + }, +}; + +int __net_init tcp6_proc_init(struct net *net) +{ + return tcp_proc_register(net, &tcp6_seq_afinfo); +} + +void tcp6_proc_exit(struct net *net) +{ + tcp_proc_unregister(net, &tcp6_seq_afinfo); +} +#endif + +struct proto tcpv6_prot = { + .name = "TCPv6", + .owner = THIS_MODULE, + .close = tcp_close, + .connect = tcp_v6_connect, + .disconnect = tcp_disconnect, + .accept = inet_csk_accept, + .ioctl = tcp_ioctl, + .init = tcp_v6_init_sock, + .destroy = tcp_v6_destroy_sock, + .shutdown = tcp_shutdown, + .setsockopt = tcp_setsockopt, + .getsockopt = tcp_getsockopt, + .recvmsg = tcp_recvmsg, + .sendmsg = tcp_sendmsg, + .sendpage = tcp_sendpage, + .backlog_rcv = tcp_v6_do_rcv, + .hash = tcp_v6_hash, + .unhash = inet_unhash, + .get_port = inet_csk_get_port, + .enter_memory_pressure = tcp_enter_memory_pressure, + .sockets_allocated = &tcp_sockets_allocated, + .memory_allocated = &tcp_memory_allocated, + .memory_pressure = &tcp_memory_pressure, + .orphan_count = &tcp_orphan_count, + .sysctl_mem = sysctl_tcp_mem, + .sysctl_wmem = sysctl_tcp_wmem, + .sysctl_rmem = sysctl_tcp_rmem, + .max_header = MAX_TCP_HEADER, + .obj_size = sizeof(struct tcp6_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, + .twsk_prot = &tcp6_timewait_sock_ops, + .rsk_prot = &tcp6_request_sock_ops, + .h.hashinfo = &tcp_hashinfo, + .no_autobind = true, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_tcp_setsockopt, + .compat_getsockopt = compat_tcp_getsockopt, +#endif +}; + +static const struct inet6_protocol tcpv6_protocol = { + .handler = tcp_v6_rcv, + .err_handler = tcp_v6_err, + .gso_send_check = tcp_v6_gso_send_check, + .gso_segment = tcp_tso_segment, + .gro_receive = tcp6_gro_receive, + .gro_complete = tcp6_gro_complete, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + +static struct inet_protosw tcpv6_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_TCP, + .prot = 
&tcpv6_prot, + .ops = &inet6_stream_ops, + .no_check = 0, + .flags = INET_PROTOSW_PERMANENT | + INET_PROTOSW_ICSK, +}; + +static int __net_init tcpv6_net_init(struct net *net) +{ + return inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6, + SOCK_RAW, IPPROTO_TCP, net); +} + +static void __net_exit tcpv6_net_exit(struct net *net) +{ + inet_ctl_sock_destroy(net->ipv6.tcp_sk); +} + +static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list) +{ + inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET6); +} + +static struct pernet_operations tcpv6_net_ops = { + .init = tcpv6_net_init, + .exit = tcpv6_net_exit, + .exit_batch = tcpv6_net_exit_batch, +}; + +int __init tcpv6_init(void) +{ + int ret; + + ret = inet6_add_protocol(&tcpv6_protocol, IPPROTO_TCP); + if (ret) + goto out; + + /* register inet6 protocol */ + ret = inet6_register_protosw(&tcpv6_protosw); + if (ret) + goto out_tcpv6_protocol; + + ret = register_pernet_subsys(&tcpv6_net_ops); + if (ret) + goto out_tcpv6_protosw; +out: + return ret; + +out_tcpv6_protocol: + inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP); +out_tcpv6_protosw: + inet6_unregister_protosw(&tcpv6_protosw); + goto out; +} + +void tcpv6_exit(void) +{ + unregister_pernet_subsys(&tcpv6_net_ops); + inet6_unregister_protosw(&tcpv6_protosw); + inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP); +} diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c new file mode 100644 index 00000000..4f3cec12 --- /dev/null +++ b/net/ipv6/tunnel6.c @@ -0,0 +1,184 @@ +/* + * Copyright (C)2003,2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors Mitsuru KANDA + * YOSHIFUJI Hideaki + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct xfrm6_tunnel __rcu *tunnel6_handlers __read_mostly; +static struct xfrm6_tunnel __rcu *tunnel46_handlers __read_mostly; +static DEFINE_MUTEX(tunnel6_mutex); + +int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family) +{ + struct xfrm6_tunnel __rcu **pprev; + struct xfrm6_tunnel *t; + int ret = -EEXIST; + int priority = handler->priority; + + mutex_lock(&tunnel6_mutex); + + for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers; + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&tunnel6_mutex))) != NULL; + pprev = &t->next) { + if (t->priority > priority) + break; + if (t->priority == priority) + goto err; + } + + handler->next = *pprev; + rcu_assign_pointer(*pprev, handler); + + ret = 0; + +err: + mutex_unlock(&tunnel6_mutex); + + return ret; +} + +EXPORT_SYMBOL(xfrm6_tunnel_register); + +int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) +{ + struct xfrm6_tunnel __rcu **pprev; + struct xfrm6_tunnel *t; + int ret = -ENOENT; + + mutex_lock(&tunnel6_mutex); + + for (pprev = (family == AF_INET6) ? 
&tunnel6_handlers : &tunnel46_handlers; + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&tunnel6_mutex))) != NULL; + pprev = &t->next) { + if (t == handler) { + *pprev = handler->next; + ret = 0; + break; + } + } + + mutex_unlock(&tunnel6_mutex); + + synchronize_net(); + + return ret; +} + +EXPORT_SYMBOL(xfrm6_tunnel_deregister); + +#define for_each_tunnel_rcu(head, handler) \ + for (handler = rcu_dereference(head); \ + handler != NULL; \ + handler = rcu_dereference(handler->next)) \ + +static int tunnel6_rcv(struct sk_buff *skb) +{ + struct xfrm6_tunnel *handler; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto drop; + + for_each_tunnel_rcu(tunnel6_handlers, handler) + if (!handler->handler(skb)) + return 0; + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + +drop: + kfree_skb(skb); + return 0; +} + +static int tunnel46_rcv(struct sk_buff *skb) +{ + struct xfrm6_tunnel *handler; + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto drop; + + for_each_tunnel_rcu(tunnel46_handlers, handler) + if (!handler->handler(skb)) + return 0; + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + +drop: + kfree_skb(skb); + return 0; +} + +static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct xfrm6_tunnel *handler; + + for_each_tunnel_rcu(tunnel6_handlers, handler) + if (!handler->err_handler(skb, opt, type, code, offset, info)) + break; +} + +static const struct inet6_protocol tunnel6_protocol = { + .handler = tunnel6_rcv, + .err_handler = tunnel6_err, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + +static const struct inet6_protocol tunnel46_protocol = { + .handler = tunnel46_rcv, + .err_handler = tunnel6_err, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + +static int __init tunnel6_init(void) +{ + if (inet6_add_protocol(&tunnel6_protocol, IPPROTO_IPV6)) { + printk(KERN_ERR "tunnel6 init(): can't add protocol\n"); + return -EAGAIN; + } + if (inet6_add_protocol(&tunnel46_protocol, IPPROTO_IPIP)) { + printk(KERN_ERR "tunnel6 init(): can't add protocol\n"); + inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6); + return -EAGAIN; + } + return 0; +} + +static void __exit tunnel6_fini(void) +{ + if (inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP)) + printk(KERN_ERR "tunnel6 close: can't remove protocol\n"); + if (inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6)) + printk(KERN_ERR "tunnel6 close: can't remove protocol\n"); +} + +module_init(tunnel6_init); +module_exit(tunnel6_fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c new file mode 100644 index 00000000..0d920c58 --- /dev/null +++ b/net/ipv6/udp.c @@ -0,0 +1,1517 @@ +/* + * UDP over IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * + * Based on linux/ipv4/udp.c + * + * Fixes: + * Hideaki YOSHIFUJI : sin6_scope_id support + * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which + * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind + * a single port at the same time. + * Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data + * YOSHIFUJI Hideaki @USAGI: convert /proc/net/udp6 to seq_file. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "udp_impl.h" + +int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) +{ + const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr; + const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); + __be32 sk1_rcv_saddr = sk_rcv_saddr(sk); + __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); + int sk_ipv6only = ipv6_only_sock(sk); + int sk2_ipv6only = inet_v6_ipv6only(sk2); + int addr_type = ipv6_addr_type(sk_rcv_saddr6); + int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; + + /* if both are mapped, treat as IPv4 */ + if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) + return (!sk2_ipv6only && + (!sk1_rcv_saddr || !sk2_rcv_saddr || + sk1_rcv_saddr == sk2_rcv_saddr)); + + if (addr_type2 == IPV6_ADDR_ANY && + !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) + return 1; + + if (addr_type == IPV6_ADDR_ANY && + !(sk_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) + return 1; + + if (sk2_rcv_saddr6 && + ipv6_addr_equal(sk_rcv_saddr6, sk2_rcv_saddr6)) + return 1; + + return 0; +} + +static unsigned int udp6_portaddr_hash(struct net *net, + const struct in6_addr *addr6, + unsigned int port) +{ + unsigned int hash, mix = net_hash_mix(net); + + if (ipv6_addr_any(addr6)) + hash = jhash_1word(0, mix); + else if (ipv6_addr_v4mapped(addr6)) + hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix); + else + hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix); + + return hash ^ port; +} + + +int udp_v6_get_port(struct sock *sk, unsigned short snum) +{ + unsigned int hash2_nulladdr = + udp6_portaddr_hash(sock_net(sk), &in6addr_any, snum); + unsigned int hash2_partial = + udp6_portaddr_hash(sock_net(sk), &inet6_sk(sk)->rcv_saddr, 0); + + /* precompute partial secondary hash */ + udp_sk(sk)->udp_portaddr_hash = hash2_partial; + return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal, hash2_nulladdr); +} + +static void udp_v6_rehash(struct sock *sk) +{ + u16 new_hash = udp6_portaddr_hash(sock_net(sk), + &inet6_sk(sk)->rcv_saddr, + inet_sk(sk)->inet_num); + + udp_lib_rehash(sk, new_hash); +} + +static inline int compute_score(struct sock *sk, struct net *net, + unsigned short hnum, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, + int dif) +{ + int score = -1; + + if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum && + sk->sk_family == PF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet_sock *inet = inet_sk(sk); + + score = 0; + if (inet->inet_dport) { + if (inet->inet_dport != sport) + return -1; + score++; + } + if (!ipv6_addr_any(&np->rcv_saddr)) { + if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) + return -1; + score++; + } + if (!ipv6_addr_any(&np->daddr)) { + if (!ipv6_addr_equal(&np->daddr, saddr)) + return -1; + score++; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score++; + } + } + return score; +} + +#define SCORE2_MAX (1 + 1 + 1) +static inline int compute_score2(struct sock *sk, struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, unsigned short hnum, + int dif) +{ + int score = -1; + + if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum && + sk->sk_family == PF_INET6) { + struct ipv6_pinfo *np = 
inet6_sk(sk); + struct inet_sock *inet = inet_sk(sk); + + if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) + return -1; + score = 0; + if (inet->inet_dport) { + if (inet->inet_dport != sport) + return -1; + score++; + } + if (!ipv6_addr_any(&np->daddr)) { + if (!ipv6_addr_equal(&np->daddr, saddr)) + return -1; + score++; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score++; + } + } + return score; +} + + +/* called with read_rcu_lock() */ +static struct sock *udp6_lib_lookup2(struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, unsigned int hnum, int dif, + struct udp_hslot *hslot2, unsigned int slot2) +{ + struct sock *sk, *result; + struct hlist_nulls_node *node; + int score, badness; + +begin: + result = NULL; + badness = -1; + udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { + score = compute_score2(sk, net, saddr, sport, + daddr, hnum, dif); + if (score > badness) { + result = sk; + badness = score; + if (score == SCORE2_MAX) + goto exact_match; + } + } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != slot2) + goto begin; + + if (result) { +exact_match: + if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) + result = NULL; + else if (unlikely(compute_score2(result, net, saddr, sport, + daddr, hnum, dif) < badness)) { + sock_put(result); + goto begin; + } + } + return result; +} + +static struct sock *__udp6_lib_lookup(struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, + int dif, struct udp_table *udptable) +{ + struct sock *sk, *result; + struct hlist_nulls_node *node; + unsigned short hnum = ntohs(dport); + unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); + struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; + int score, badness; + + rcu_read_lock(); + if (hslot->count > 10) { + hash2 = udp6_portaddr_hash(net, daddr, hnum); + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; + if (hslot->count < hslot2->count) + goto begin; + + result = udp6_lib_lookup2(net, saddr, sport, + daddr, hnum, dif, + hslot2, slot2); + if (!result) { + hash2 = udp6_portaddr_hash(net, &in6addr_any, hnum); + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; + if (hslot->count < hslot2->count) + goto begin; + + result = udp6_lib_lookup2(net, saddr, sport, + &in6addr_any, hnum, dif, + hslot2, slot2); + } + rcu_read_unlock(); + return result; + } +begin: + result = NULL; + badness = -1; + sk_nulls_for_each_rcu(sk, node, &hslot->head) { + score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif); + if (score > badness) { + result = sk; + badness = score; + } + } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. 
+ */ + if (get_nulls_value(node) != slot) + goto begin; + + if (result) { + if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) + result = NULL; + else if (unlikely(compute_score(result, net, hnum, saddr, sport, + daddr, dport, dif) < badness)) { + sock_put(result); + goto begin; + } + } + rcu_read_unlock(); + return result; +} + +static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, + __be16 sport, __be16 dport, + struct udp_table *udptable) +{ + struct sock *sk; + const struct ipv6hdr *iph = ipv6_hdr(skb); + + if (unlikely(sk = skb_steal_sock(skb))) + return sk; + return __udp6_lib_lookup(dev_net(skb_dst(skb)->dev), &iph->saddr, sport, + &iph->daddr, dport, inet6_iif(skb), + udptable); +} + +struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, int dif) +{ + return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table); +} +EXPORT_SYMBOL_GPL(udp6_lib_lookup); + + +/* + * This should be easy, if there is something there we + * return it, otherwise we block. + */ + +int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct sk_buff *skb; + unsigned int ulen; + int peeked; + int err; + int is_udplite = IS_UDPLITE(sk); + int is_udp4; + bool slow; + + if (addr_len) + *addr_len=sizeof(struct sockaddr_in6); + + if (flags & MSG_ERRQUEUE) + return ipv6_recv_error(sk, msg, len); + + if (np->rxpmtu && np->rxopt.bits.rxpmtu) + return ipv6_recv_rxpmtu(sk, msg, len); + +try_again: + skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), + &peeked, &err); + if (!skb) + goto out; + + ulen = skb->len - sizeof(struct udphdr); + if (len > ulen) + len = ulen; + else if (len < ulen) + msg->msg_flags |= MSG_TRUNC; + + is_udp4 = (skb->protocol == htons(ETH_P_IP)); + + /* + * If checksum is needed at all, try to do it while copying the + * data. If the data is truncated, or if we only want a partial + * coverage checksum (UDP-Lite), do it before the copy. + */ + + if (len < ulen || UDP_SKB_CB(skb)->partial_cov) { + if (udp_lib_checksum_complete(skb)) + goto csum_copy_err; + } + + if (skb_csum_unnecessary(skb)) + err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), + msg->msg_iov,len); + else { + err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov); + if (err == -EINVAL) + goto csum_copy_err; + } + if (err) + goto out_free; + + if (!peeked) { + if (is_udp4) + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_INDATAGRAMS, is_udplite); + else + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_INDATAGRAMS, is_udplite); + } + + sock_recv_ts_and_drops(msg, sk, skb); + + /* Copy the address. 
*/ + if (msg->msg_name) { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *) msg->msg_name; + sin6->sin6_family = AF_INET6; + sin6->sin6_port = udp_hdr(skb)->source; + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = 0; + + if (is_udp4) + ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, + &sin6->sin6_addr); + else { + ipv6_addr_copy(&sin6->sin6_addr, + &ipv6_hdr(skb)->saddr); + if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6->sin6_scope_id = IP6CB(skb)->iif; + } + + } + if (is_udp4) { + if (inet->cmsg_flags) + ip_cmsg_recv(msg, skb); + } else { + if (np->rxopt.all) + datagram_recv_ctl(sk, msg, skb); + } + + err = len; + if (flags & MSG_TRUNC) + err = ulen; + +out_free: + skb_free_datagram_locked(sk, skb); +out: + return err; + +csum_copy_err: + slow = lock_sock_fast(sk); + if (!skb_kill_datagram(sk, skb, flags)) { + if (is_udp4) + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_INERRORS, is_udplite); + else + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_INERRORS, is_udplite); + } + unlock_sock_fast(sk, slow); + + if (noblock) + return -EAGAIN; + + /* starting over for a new packet */ + msg->msg_flags &= ~MSG_TRUNC; + goto try_again; +} + +void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info, + struct udp_table *udptable) +{ + struct ipv6_pinfo *np; + const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; + const struct in6_addr *saddr = &hdr->saddr; + const struct in6_addr *daddr = &hdr->daddr; + struct udphdr *uh = (struct udphdr*)(skb->data+offset); + struct sock *sk; + int err; + + sk = __udp6_lib_lookup(dev_net(skb->dev), daddr, uh->dest, + saddr, uh->source, inet6_iif(skb), udptable); + if (sk == NULL) + return; + + np = inet6_sk(sk); + + if (!icmpv6_err_convert(type, code, &err) && !np->recverr) + goto out; + + if (sk->sk_state != TCP_ESTABLISHED && !np->recverr) + goto out; + + if (np->recverr) + ipv6_icmp_error(sk, skb, err, uh->dest, ntohl(info), (u8 *)(uh+1)); + + sk->sk_err = err; + sk->sk_error_report(sk); +out: + sock_put(sk); +} + +static __inline__ void udpv6_err(struct sk_buff *skb, + struct inet6_skb_parm *opt, u8 type, + u8 code, int offset, __be32 info ) +{ + __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table); +} + +int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) +{ + struct udp_sock *up = udp_sk(sk); + int rc; + int is_udplite = IS_UDPLITE(sk); + + if (!ipv6_addr_any(&inet6_sk(sk)->daddr)) + sock_rps_save_rxhash(sk, skb->rxhash); + + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) + goto drop; + + /* + * UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c). 
+ */ + if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) { + + if (up->pcrlen == 0) { /* full coverage was set */ + LIMIT_NETDEBUG(KERN_WARNING "UDPLITE6: partial coverage" + " %d while full coverage %d requested\n", + UDP_SKB_CB(skb)->cscov, skb->len); + goto drop; + } + if (UDP_SKB_CB(skb)->cscov < up->pcrlen) { + LIMIT_NETDEBUG(KERN_WARNING "UDPLITE6: coverage %d " + "too small, need min %d\n", + UDP_SKB_CB(skb)->cscov, up->pcrlen); + goto drop; + } + } + + if (rcu_dereference_raw(sk->sk_filter)) { + if (udp_lib_checksum_complete(skb)) + goto drop; + } + + if ((rc = ip_queue_rcv_skb(sk, skb)) < 0) { + /* Note that an ENOMEM error is charged twice */ + if (rc == -ENOMEM) + UDP6_INC_STATS_BH(sock_net(sk), + UDP_MIB_RCVBUFERRORS, is_udplite); + goto drop_no_sk_drops_inc; + } + + return 0; +drop: + atomic_inc(&sk->sk_drops); +drop_no_sk_drops_inc: + UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + kfree_skb(skb); + return -1; +} + +static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk, + __be16 loc_port, const struct in6_addr *loc_addr, + __be16 rmt_port, const struct in6_addr *rmt_addr, + int dif) +{ + struct hlist_nulls_node *node; + struct sock *s = sk; + unsigned short num = ntohs(loc_port); + + sk_nulls_for_each_from(s, node) { + struct inet_sock *inet = inet_sk(s); + + if (!net_eq(sock_net(s), net)) + continue; + + if (udp_sk(s)->udp_port_hash == num && + s->sk_family == PF_INET6) { + struct ipv6_pinfo *np = inet6_sk(s); + if (inet->inet_dport) { + if (inet->inet_dport != rmt_port) + continue; + } + if (!ipv6_addr_any(&np->daddr) && + !ipv6_addr_equal(&np->daddr, rmt_addr)) + continue; + + if (s->sk_bound_dev_if && s->sk_bound_dev_if != dif) + continue; + + if (!ipv6_addr_any(&np->rcv_saddr)) { + if (!ipv6_addr_equal(&np->rcv_saddr, loc_addr)) + continue; + } + if (!inet6_mc_check(s, loc_addr, rmt_addr)) + continue; + return s; + } + } + return NULL; +} + +static void flush_stack(struct sock **stack, unsigned int count, + struct sk_buff *skb, unsigned int final) +{ + unsigned int i; + struct sock *sk; + struct sk_buff *skb1; + + for (i = 0; i < count; i++) { + skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC); + + sk = stack[i]; + if (skb1) { + if (sk_rcvqueues_full(sk, skb1)) { + kfree_skb(skb1); + goto drop; + } + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) + udpv6_queue_rcv_skb(sk, skb1); + else if (sk_add_backlog(sk, skb1)) { + kfree_skb(skb1); + bh_unlock_sock(sk); + goto drop; + } + bh_unlock_sock(sk); + continue; + } +drop: + atomic_inc(&sk->sk_drops); + UDP6_INC_STATS_BH(sock_net(sk), + UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); + UDP6_INC_STATS_BH(sock_net(sk), + UDP_MIB_INERRORS, IS_UDPLITE(sk)); + } +} +/* + * Note: called only from the BH handler context, + * so we don't need to lock the hashes. 
+ */ +static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, + const struct in6_addr *saddr, const struct in6_addr *daddr, + struct udp_table *udptable) +{ + struct sock *sk, *stack[256 / sizeof(struct sock *)]; + const struct udphdr *uh = udp_hdr(skb); + struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest)); + int dif; + unsigned int i, count = 0; + + spin_lock(&hslot->lock); + sk = sk_nulls_head(&hslot->head); + dif = inet6_iif(skb); + sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); + while (sk) { + stack[count++] = sk; + sk = udp_v6_mcast_next(net, sk_nulls_next(sk), uh->dest, daddr, + uh->source, saddr, dif); + if (unlikely(count == ARRAY_SIZE(stack))) { + if (!sk) + break; + flush_stack(stack, count, skb, ~0); + count = 0; + } + } + /* + * before releasing the lock, we must take reference on sockets + */ + for (i = 0; i < count; i++) + sock_hold(stack[i]); + + spin_unlock(&hslot->lock); + + if (count) { + flush_stack(stack, count, skb, count - 1); + + for (i = 0; i < count; i++) + sock_put(stack[i]); + } else { + kfree_skb(skb); + } + return 0; +} + +static inline int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, + int proto) +{ + int err; + + UDP_SKB_CB(skb)->partial_cov = 0; + UDP_SKB_CB(skb)->cscov = skb->len; + + if (proto == IPPROTO_UDPLITE) { + err = udplite_checksum_init(skb, uh); + if (err) + return err; + } + + if (uh->check == 0) { + /* RFC 2460 section 8.1 says that we SHOULD log + this error. Well, it is reasonable. + */ + LIMIT_NETDEBUG(KERN_INFO "IPv6: udp checksum is 0\n"); + return 1; + } + if (skb->ip_summed == CHECKSUM_COMPLETE && + !csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, + skb->len, proto, skb->csum)) + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (!skb_csum_unnecessary(skb)) + skb->csum = ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len, proto, 0)); + + return 0; +} + +int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, + int proto) +{ + struct net *net = dev_net(skb->dev); + struct sock *sk; + struct udphdr *uh; + const struct in6_addr *saddr, *daddr; + u32 ulen = 0; + + if (!pskb_may_pull(skb, sizeof(struct udphdr))) + goto discard; + + saddr = &ipv6_hdr(skb)->saddr; + daddr = &ipv6_hdr(skb)->daddr; + uh = udp_hdr(skb); + + ulen = ntohs(uh->len); + if (ulen > skb->len) + goto short_packet; + + if (proto == IPPROTO_UDP) { + /* UDP validates ulen. */ + + /* Check for jumbo payload */ + if (ulen == 0) + ulen = skb->len; + + if (ulen < sizeof(*uh)) + goto short_packet; + + if (ulen < skb->len) { + if (pskb_trim_rcsum(skb, ulen)) + goto short_packet; + saddr = &ipv6_hdr(skb)->saddr; + daddr = &ipv6_hdr(skb)->daddr; + uh = udp_hdr(skb); + } + } + + if (udp6_csum_init(skb, uh, proto)) + goto discard; + + /* + * Multicast receive code + */ + if (ipv6_addr_is_multicast(daddr)) + return __udp6_lib_mcast_deliver(net, skb, + saddr, daddr, udptable); + + /* Unicast */ + + /* + * check socket cache ... must talk to Alan about his plans + * for sock caches... i'll skip this for now. 
+ */ + sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable); + + if (sk == NULL) { + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard; + + if (udp_lib_checksum_complete(skb)) + goto discard; + UDP6_INC_STATS_BH(net, UDP_MIB_NOPORTS, + proto == IPPROTO_UDPLITE); + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; + } + + /* deliver */ + + if (sk_rcvqueues_full(sk, skb)) { + sock_put(sk); + goto discard; + } + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) + udpv6_queue_rcv_skb(sk, skb); + else if (sk_add_backlog(sk, skb)) { + atomic_inc(&sk->sk_drops); + bh_unlock_sock(sk); + sock_put(sk); + goto discard; + } + bh_unlock_sock(sk); + sock_put(sk); + return 0; + +short_packet: + LIMIT_NETDEBUG(KERN_DEBUG "UDP%sv6: short packet: From [%pI6c]:%u %d/%d to [%pI6c]:%u\n", + proto == IPPROTO_UDPLITE ? "-Lite" : "", + saddr, + ntohs(uh->source), + ulen, + skb->len, + daddr, + ntohs(uh->dest)); + +discard: + UDP6_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); + kfree_skb(skb); + return 0; +} + +static __inline__ int udpv6_rcv(struct sk_buff *skb) +{ + return __udp6_lib_rcv(skb, &udp_table, IPPROTO_UDP); +} + +/* + * Throw away all pending data and cancel the corking. Socket is locked. + */ +static void udp_v6_flush_pending_frames(struct sock *sk) +{ + struct udp_sock *up = udp_sk(sk); + + if (up->pending == AF_INET) + udp_flush_pending_frames(sk); + else if (up->pending) { + up->len = 0; + up->pending = 0; + ip6_flush_pending_frames(sk); + } +} + +/** + * udp6_hwcsum_outgoing - handle outgoing HW checksumming + * @sk: socket we are sending on + * @skb: sk_buff containing the filled-in UDP header + * (checksum field must be zeroed out) + */ +static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, + const struct in6_addr *saddr, + const struct in6_addr *daddr, int len) +{ + unsigned int offset; + struct udphdr *uh = udp_hdr(skb); + __wsum csum = 0; + + if (skb_queue_len(&sk->sk_write_queue) == 1) { + /* Only one fragment on the socket. */ + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + uh->check = ~csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, 0); + } else { + /* + * HW-checksum won't work as there are two or more + * fragments on the socket so that all csums of sk_buffs + * should be together + */ + offset = skb_transport_offset(skb); + skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); + + skb->ip_summed = CHECKSUM_NONE; + + skb_queue_walk(&sk->sk_write_queue, skb) { + csum = csum_add(csum, skb->csum); + } + + uh->check = csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, + csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } +} + +/* + * Sending + */ + +static int udp_v6_push_pending_frames(struct sock *sk) +{ + struct sk_buff *skb; + struct udphdr *uh; + struct udp_sock *up = udp_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct flowi6 *fl6 = &inet->cork.fl.u.ip6; + int err = 0; + int is_udplite = IS_UDPLITE(sk); + __wsum csum = 0; + + /* Grab the skbuff where UDP header space exists. 
*/ + if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) + goto out; + + /* + * Create a UDP header + */ + uh = udp_hdr(skb); + uh->source = fl6->fl6_sport; + uh->dest = fl6->fl6_dport; + uh->len = htons(up->len); + uh->check = 0; + + if (is_udplite) + csum = udplite_csum_outgoing(sk, skb); + else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ + udp6_hwcsum_outgoing(sk, skb, &fl6->saddr, &fl6->daddr, + up->len); + goto send; + } else + csum = udp_csum_outgoing(sk, skb); + + /* add protocol-dependent pseudo-header */ + uh->check = csum_ipv6_magic(&fl6->saddr, &fl6->daddr, + up->len, fl6->flowi6_proto, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + +send: + err = ip6_push_pending_frames(sk); + if (err) { + if (err == -ENOBUFS && !inet6_sk(sk)->recverr) { + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_SNDBUFERRORS, is_udplite); + err = 0; + } + } else + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_OUTDATAGRAMS, is_udplite); +out: + up->len = 0; + up->pending = 0; + return err; +} + +int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len) +{ + struct ipv6_txoptions opt_space; + struct udp_sock *up = udp_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) msg->msg_name; + struct in6_addr *daddr, *final_p, final; + struct ipv6_txoptions *opt = NULL; + struct ip6_flowlabel *flowlabel = NULL; + struct flowi6 fl6; + struct dst_entry *dst; + int addr_len = msg->msg_namelen; + int ulen = len; + int hlimit = -1; + int tclass = -1; + int dontfrag = -1; + int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; + int err; + int connected = 0; + int is_udplite = IS_UDPLITE(sk); + int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); + + /* destination address check */ + if (sin6) { + if (addr_len < offsetof(struct sockaddr, sa_data)) + return -EINVAL; + + switch (sin6->sin6_family) { + case AF_INET6: + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + daddr = &sin6->sin6_addr; + break; + case AF_INET: + goto do_udp_sendmsg; + case AF_UNSPEC: + msg->msg_name = sin6 = NULL; + msg->msg_namelen = addr_len = 0; + daddr = NULL; + break; + default: + return -EINVAL; + } + } else if (!up->pending) { + if (sk->sk_state != TCP_ESTABLISHED) + return -EDESTADDRREQ; + daddr = &np->daddr; + } else + daddr = NULL; + + if (daddr) { + if (ipv6_addr_v4mapped(daddr)) { + struct sockaddr_in sin; + sin.sin_family = AF_INET; + sin.sin_port = sin6 ? sin6->sin6_port : inet->inet_dport; + sin.sin_addr.s_addr = daddr->s6_addr32[3]; + msg->msg_name = &sin; + msg->msg_namelen = sizeof(sin); +do_udp_sendmsg: + if (__ipv6_only_sock(sk)) + return -ENETUNREACH; + return udp_sendmsg(iocb, sk, msg, len); + } + } + + if (up->pending == AF_INET) + return udp_sendmsg(iocb, sk, msg, len); + + /* Rough check on arithmetic overflow, + better check is made in ip6_append_data(). + */ + if (len > INT_MAX - sizeof(struct udphdr)) + return -EMSGSIZE; + + if (up->pending) { + /* + * There are pending frames. + * The socket lock must be held while it's corked. 
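+ * Any pending family other than AF_INET6 is rejected with -EAFNOSUPPORT + * once the lock is taken; AF_INET corks were already handed off to + * udp_sendmsg() above.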
+ */ + lock_sock(sk); + if (likely(up->pending)) { + if (unlikely(up->pending != AF_INET6)) { + release_sock(sk); + return -EAFNOSUPPORT; + } + dst = NULL; + goto do_append_data; + } + release_sock(sk); + } + ulen += sizeof(struct udphdr); + + memset(&fl6, 0, sizeof(fl6)); + + if (sin6) { + if (sin6->sin6_port == 0) + return -EINVAL; + + fl6.fl6_dport = sin6->sin6_port; + daddr = &sin6->sin6_addr; + + if (np->sndflow) { + fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; + if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); + if (flowlabel == NULL) + return -EINVAL; + daddr = &flowlabel->dst; + } + } + + /* + * Otherwise it will be difficult to maintain + * sk->sk_dst_cache. + */ + if (sk->sk_state == TCP_ESTABLISHED && + ipv6_addr_equal(daddr, &np->daddr)) + daddr = &np->daddr; + + if (addr_len >= sizeof(struct sockaddr_in6) && + sin6->sin6_scope_id && + ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL) + fl6.flowi6_oif = sin6->sin6_scope_id; + } else { + if (sk->sk_state != TCP_ESTABLISHED) + return -EDESTADDRREQ; + + fl6.fl6_dport = inet->inet_dport; + daddr = &np->daddr; + fl6.flowlabel = np->flow_label; + connected = 1; + } + + if (!fl6.flowi6_oif) + fl6.flowi6_oif = sk->sk_bound_dev_if; + + if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; + + fl6.flowi6_mark = sk->sk_mark; + + if (msg->msg_controllen) { + opt = &opt_space; + memset(opt, 0, sizeof(struct ipv6_txoptions)); + opt->tot_len = sizeof(*opt); + + err = datagram_send_ctl(sock_net(sk), msg, &fl6, opt, &hlimit, + &tclass, &dontfrag); + if (err < 0) { + fl6_sock_release(flowlabel); + return err; + } + if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); + if (flowlabel == NULL) + return -EINVAL; + } + if (!(opt->opt_nflen|opt->opt_flen)) + opt = NULL; + connected = 0; + } + if (opt == NULL) + opt = np->opt; + if (flowlabel) + opt = fl6_merge_options(&opt_space, flowlabel, opt); + opt = ipv6_fixup_options(&opt_space, opt); + + fl6.flowi6_proto = sk->sk_protocol; + if (!ipv6_addr_any(daddr)) + ipv6_addr_copy(&fl6.daddr, daddr); + else + fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ + if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr)) + ipv6_addr_copy(&fl6.saddr, &np->saddr); + fl6.fl6_sport = inet->inet_sport; + + final_p = fl6_update_dst(&fl6, opt, &final); + if (final_p) + connected = 0; + + if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) { + fl6.flowi6_oif = np->mcast_oif; + connected = 0; + } + + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + + dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p, true); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; + goto out; + } + + if (hlimit < 0) { + if (ipv6_addr_is_multicast(&fl6.daddr)) + hlimit = np->mcast_hops; + else + hlimit = np->hop_limit; + if (hlimit < 0) + hlimit = ip6_dst_hoplimit(dst); + } + + if (tclass < 0) + tclass = np->tclass; + + if (dontfrag < 0) + dontfrag = np->dontfrag; + + if (msg->msg_flags&MSG_CONFIRM) + goto do_confirm; +back_from_confirm: + + lock_sock(sk); + if (unlikely(up->pending)) { + /* The socket is already corked while preparing it. */ + /* ... which is an evident application bug. --ANK */ + release_sock(sk); + + LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n"); + err = -EINVAL; + goto out; + } + + up->pending = AF_INET6; + +do_append_data: + up->len += ulen; + getfrag = is_udplite ? 
udplite_getfrag : ip_generic_getfrag; + err = ip6_append_data(sk, getfrag, msg->msg_iov, ulen, + sizeof(struct udphdr), hlimit, tclass, opt, &fl6, + (struct rt6_info*)dst, + corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, dontfrag); + if (err) + udp_v6_flush_pending_frames(sk); + else if (!corkreq) + err = udp_v6_push_pending_frames(sk); + else if (unlikely(skb_queue_empty(&sk->sk_write_queue))) + up->pending = 0; + + if (dst) { + if (connected) { + ip6_dst_store(sk, dst, + ipv6_addr_equal(&fl6.daddr, &np->daddr) ? + &np->daddr : NULL, +#ifdef CONFIG_IPV6_SUBTREES + ipv6_addr_equal(&fl6.saddr, &np->saddr) ? + &np->saddr : +#endif + NULL); + } else { + dst_release(dst); + } + dst = NULL; + } + + if (err > 0) + err = np->recverr ? net_xmit_errno(err) : 0; + release_sock(sk); +out: + dst_release(dst); + fl6_sock_release(flowlabel); + if (!err) + return len; + /* + * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting + * ENOBUFS might not be good (it's not tunable per se), but otherwise + * we don't have a good statistic (IpOutDiscards but it can be too many + * things). We could add another new stat but at least for now that + * seems like overkill. + */ + if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_SNDBUFERRORS, is_udplite); + } + return err; + +do_confirm: + dst_confirm(dst); + if (!(msg->msg_flags&MSG_PROBE) || len) + goto back_from_confirm; + err = 0; + goto out; +} + +void udpv6_destroy_sock(struct sock *sk) +{ + lock_sock(sk); + udp_v6_flush_pending_frames(sk); + release_sock(sk); + + inet6_destroy_sock(sk); +} + +/* + * Socket option code for UDP + */ +int udpv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + if (level == SOL_UDP || level == SOL_UDPLITE) + return udp_lib_setsockopt(sk, level, optname, optval, optlen, + udp_v6_push_pending_frames); + return ipv6_setsockopt(sk, level, optname, optval, optlen); +} + +#ifdef CONFIG_COMPAT +int compat_udpv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + if (level == SOL_UDP || level == SOL_UDPLITE) + return udp_lib_setsockopt(sk, level, optname, optval, optlen, + udp_v6_push_pending_frames); + return compat_ipv6_setsockopt(sk, level, optname, optval, optlen); +} +#endif + +int udpv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + if (level == SOL_UDP || level == SOL_UDPLITE) + return udp_lib_getsockopt(sk, level, optname, optval, optlen); + return ipv6_getsockopt(sk, level, optname, optval, optlen); +} + +#ifdef CONFIG_COMPAT +int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + if (level == SOL_UDP || level == SOL_UDPLITE) + return udp_lib_getsockopt(sk, level, optname, optval, optlen); + return compat_ipv6_getsockopt(sk, level, optname, optval, optlen); +} +#endif + +static int udp6_ufo_send_check(struct sk_buff *skb) +{ + const struct ipv6hdr *ipv6h; + struct udphdr *uh; + + if (!pskb_may_pull(skb, sizeof(*uh))) + return -EINVAL; + + ipv6h = ipv6_hdr(skb); + uh = udp_hdr(skb); + + uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len, + IPPROTO_UDP, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + return 0; +} + +static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, u32 features) +{ + struct sk_buff 
*segs = ERR_PTR(-EINVAL); + unsigned int mss; + unsigned int unfrag_ip6hlen, unfrag_len; + struct frag_hdr *fptr; + u8 *mac_start, *prevhdr; + u8 nexthdr; + u8 frag_hdr_sz = sizeof(struct frag_hdr); + int offset; + __wsum csum; + struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + + mss = skb_shinfo(skb)->gso_size; + if (unlikely(skb->len <= mss)) + goto out; + + if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { + /* Packet is from an untrusted source, reset gso_segs. */ + int type = skb_shinfo(skb)->gso_type; + + if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) || + !(type & (SKB_GSO_UDP)))) + goto out; + + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); + + segs = NULL; + goto out; + } + + /* Do software UFO. Complete and fill in the UDP checksum as HW cannot + * do checksum of UDP packets sent as multiple IP fragments. + */ + offset = skb_checksum_start_offset(skb); + csum = skb_checksum(skb, offset, skb->len- offset, 0); + offset += skb->csum_offset; + *(__sum16 *)(skb->data + offset) = csum_fold(csum); + skb->ip_summed = CHECKSUM_NONE; + + /* Check if there is enough headroom to insert fragment header. */ + if ((skb_mac_header(skb) < skb->head + frag_hdr_sz) && + pskb_expand_head(skb, frag_hdr_sz, 0, GFP_ATOMIC)) + goto out; + + /* Find the unfragmentable header and shift it left by frag_hdr_sz + * bytes to insert fragment header. + */ + unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); + nexthdr = *prevhdr; + *prevhdr = NEXTHDR_FRAGMENT; + unfrag_len = skb_network_header(skb) - skb_mac_header(skb) + + unfrag_ip6hlen; + mac_start = skb_mac_header(skb); + memmove(mac_start-frag_hdr_sz, mac_start, unfrag_len); + + skb->mac_header -= frag_hdr_sz; + skb->network_header -= frag_hdr_sz; + + fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen); + fptr->nexthdr = nexthdr; + fptr->reserved = 0; + ipv6_select_ident(fptr, + rt ? &rt->rt6i_dst.addr : &ipv6_hdr(skb)->daddr); + + /* Fragment the skb. 
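skb_segment() splits the payload on gso_size boundaries; the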
ipv6 header and the remaining fields of the + * fragment header are updated in ipv6_gso_segment() + */ + segs = skb_segment(skb, features); + +out: + return segs; +} + +static const struct inet6_protocol udpv6_protocol = { + .handler = udpv6_rcv, + .err_handler = udpv6_err, + .gso_send_check = udp6_ufo_send_check, + .gso_segment = udp6_ufo_fragment, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + +/* ------------------------------------------------------------------------ */ +#ifdef CONFIG_PROC_FS + +static void udp6_sock_seq_show(struct seq_file *seq, struct sock *sp, int bucket) +{ + struct inet_sock *inet = inet_sk(sp); + struct ipv6_pinfo *np = inet6_sk(sp); + const struct in6_addr *dest, *src; + __u16 destp, srcp; + + dest = &np->daddr; + src = &np->rcv_saddr; + destp = ntohs(inet->inet_dport); + srcp = ntohs(inet->inet_sport); + seq_printf(seq, + "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n", + bucket, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], srcp, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], destp, + sp->sk_state, + sk_wmem_alloc_get(sp), + sk_rmem_alloc_get(sp), + 0, 0L, 0, + sock_i_uid(sp), 0, + sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp, + atomic_read(&sp->sk_drops)); +} + +int udp6_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, + " sl " + "local_address " + "remote_address " + "st tx_queue rx_queue tr tm->when retrnsmt" + " uid timeout inode ref pointer drops\n"); + else + udp6_sock_seq_show(seq, v, ((struct udp_iter_state *)seq->private)->bucket); + return 0; +} + +static struct udp_seq_afinfo udp6_seq_afinfo = { + .name = "udp6", + .family = AF_INET6, + .udp_table = &udp_table, + .seq_fops = { + .owner = THIS_MODULE, + }, + .seq_ops = { + .show = udp6_seq_show, + }, +}; + +int __net_init udp6_proc_init(struct net *net) +{ + return udp_proc_register(net, &udp6_seq_afinfo); +} + +void udp6_proc_exit(struct net *net) { + udp_proc_unregister(net, &udp6_seq_afinfo); +} +#endif /* CONFIG_PROC_FS */ + +/* ------------------------------------------------------------------------ */ + +struct proto udpv6_prot = { + .name = "UDPv6", + .owner = THIS_MODULE, + .close = udp_lib_close, + .connect = ip6_datagram_connect, + .disconnect = udp_disconnect, + .ioctl = udp_ioctl, + .destroy = udpv6_destroy_sock, + .setsockopt = udpv6_setsockopt, + .getsockopt = udpv6_getsockopt, + .sendmsg = udpv6_sendmsg, + .recvmsg = udpv6_recvmsg, + .backlog_rcv = udpv6_queue_rcv_skb, + .hash = udp_lib_hash, + .unhash = udp_lib_unhash, + .rehash = udp_v6_rehash, + .get_port = udp_v6_get_port, + .memory_allocated = &udp_memory_allocated, + .sysctl_mem = sysctl_udp_mem, + .sysctl_wmem = &sysctl_udp_wmem_min, + .sysctl_rmem = &sysctl_udp_rmem_min, + .obj_size = sizeof(struct udp6_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, + .h.udp_table = &udp_table, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_udpv6_setsockopt, + .compat_getsockopt = compat_udpv6_getsockopt, +#endif + .clear_sk = sk_prot_clear_portaddr_nulls, +}; + +static struct inet_protosw udpv6_protosw = { + .type = SOCK_DGRAM, + .protocol = IPPROTO_UDP, + .prot = &udpv6_prot, + .ops = &inet6_dgram_ops, + .no_check = UDP_CSUM_DEFAULT, + .flags = INET_PROTOSW_PERMANENT, +}; + + +int __init udpv6_init(void) +{ + int ret; + + ret = inet6_add_protocol(&udpv6_protocol, IPPROTO_UDP); + if (ret) + goto out; + + ret = inet6_register_protosw(&udpv6_protosw); + if 
(ret) + goto out_udpv6_protocol; +out: + return ret; + +out_udpv6_protocol: + inet6_del_protocol(&udpv6_protocol, IPPROTO_UDP); + goto out; +} + +void udpv6_exit(void) +{ + inet6_unregister_protosw(&udpv6_protosw); + inet6_del_protocol(&udpv6_protocol, IPPROTO_UDP); +} diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h new file mode 100644 index 00000000..d7571046 --- /dev/null +++ b/net/ipv6/udp_impl.h @@ -0,0 +1,37 @@ +#ifndef _UDP6_IMPL_H +#define _UDP6_IMPL_H +#include +#include +#include +#include +#include +#include + +extern int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int ); +extern void __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, + u8 , u8 , int , __be32 , struct udp_table *); + +extern int udp_v6_get_port(struct sock *sk, unsigned short snum); + +extern int udpv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen); +extern int udpv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen); +#ifdef CONFIG_COMPAT +extern int compat_udpv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen); +extern int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen); +#endif +extern int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len); +extern int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len); +extern int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb); +extern void udpv6_destroy_sock(struct sock *sk); + +#ifdef CONFIG_PROC_FS +extern int udp6_seq_show(struct seq_file *seq, void *v); +#endif +#endif /* _UDP6_IMPL_H */ diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c new file mode 100644 index 00000000..986c4de5 --- /dev/null +++ b/net/ipv6/udplite.c @@ -0,0 +1,132 @@ +/* + * UDPLITEv6 An implementation of the UDP-Lite protocol over IPv6. + * See also net/ipv4/udplite.c + * + * Authors: Gerrit Renker + * + * Changes: + * Fixes: + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
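+ * + * The socket glue below reuses the shared udp_impl.h entry points and + * mainly differs in steering receive and error handling to udplite_table + * and registering under IPPROTO_UDPLITE.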
+ */ +#include "udp_impl.h" + +static int udplitev6_rcv(struct sk_buff *skb) +{ + return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); +} + +static void udplitev6_err(struct sk_buff *skb, + struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + __udp6_lib_err(skb, opt, type, code, offset, info, &udplite_table); +} + +static const struct inet6_protocol udplitev6_protocol = { + .handler = udplitev6_rcv, + .err_handler = udplitev6_err, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + +struct proto udplitev6_prot = { + .name = "UDPLITEv6", + .owner = THIS_MODULE, + .close = udp_lib_close, + .connect = ip6_datagram_connect, + .disconnect = udp_disconnect, + .ioctl = udp_ioctl, + .init = udplite_sk_init, + .destroy = udpv6_destroy_sock, + .setsockopt = udpv6_setsockopt, + .getsockopt = udpv6_getsockopt, + .sendmsg = udpv6_sendmsg, + .recvmsg = udpv6_recvmsg, + .backlog_rcv = udpv6_queue_rcv_skb, + .hash = udp_lib_hash, + .unhash = udp_lib_unhash, + .get_port = udp_v6_get_port, + .obj_size = sizeof(struct udp6_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, + .h.udp_table = &udplite_table, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_udpv6_setsockopt, + .compat_getsockopt = compat_udpv6_getsockopt, +#endif + .clear_sk = sk_prot_clear_portaddr_nulls, +}; + +static struct inet_protosw udplite6_protosw = { + .type = SOCK_DGRAM, + .protocol = IPPROTO_UDPLITE, + .prot = &udplitev6_prot, + .ops = &inet6_dgram_ops, + .no_check = 0, + .flags = INET_PROTOSW_PERMANENT, +}; + +int __init udplitev6_init(void) +{ + int ret; + + ret = inet6_add_protocol(&udplitev6_protocol, IPPROTO_UDPLITE); + if (ret) + goto out; + + ret = inet6_register_protosw(&udplite6_protosw); + if (ret) + goto out_udplitev6_protocol; +out: + return ret; + +out_udplitev6_protocol: + inet6_del_protocol(&udplitev6_protocol, IPPROTO_UDPLITE); + goto out; +} + +void udplitev6_exit(void) +{ + inet6_unregister_protosw(&udplite6_protosw); + inet6_del_protocol(&udplitev6_protocol, IPPROTO_UDPLITE); +} + +#ifdef CONFIG_PROC_FS +static struct udp_seq_afinfo udplite6_seq_afinfo = { + .name = "udplite6", + .family = AF_INET6, + .udp_table = &udplite_table, + .seq_fops = { + .owner = THIS_MODULE, + }, + .seq_ops = { + .show = udp6_seq_show, + }, +}; + +static int __net_init udplite6_proc_init_net(struct net *net) +{ + return udp_proc_register(net, &udplite6_seq_afinfo); +} + +static void __net_exit udplite6_proc_exit_net(struct net *net) +{ + udp_proc_unregister(net, &udplite6_seq_afinfo); +} + +static struct pernet_operations udplite6_net_ops = { + .init = udplite6_proc_init_net, + .exit = udplite6_proc_exit_net, +}; + +int __init udplite6_proc_init(void) +{ + return register_pernet_subsys(&udplite6_net_ops); +} + +void udplite6_proc_exit(void) +{ + unregister_pernet_subsys(&udplite6_net_ops); +} +#endif diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c new file mode 100644 index 00000000..f8c3cf84 --- /dev/null +++ b/net/ipv6/xfrm6_input.c @@ -0,0 +1,146 @@ +/* + * xfrm6_input.c: based on net/ipv4/xfrm4_input.c + * + * Authors: + * Mitsuru KANDA @USAGI + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro + * YOSHIFUJI Hideaki @USAGI + * IPv6 support + */ + +#include +#include +#include +#include +#include +#include + +int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb) +{ + return xfrm6_extract_header(skb); +} + +int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) +{ + XFRM_SPI_SKB_CB(skb)->family = AF_INET6; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, 
daddr); + return xfrm_input(skb, nexthdr, spi, 0); +} +EXPORT_SYMBOL(xfrm6_rcv_spi); + +int xfrm6_transport_finish(struct sk_buff *skb, int async) +{ + skb_network_header(skb)[IP6CB(skb)->nhoff] = + XFRM_MODE_SKB_CB(skb)->protocol; + +#ifndef CONFIG_NETFILTER + if (!async) + return 1; +#endif + + ipv6_hdr(skb)->payload_len = htons(skb->len); + __skb_push(skb, skb->data - skb_network_header(skb)); + + NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, + ip6_rcv_finish); + return -1; +} + +int xfrm6_rcv(struct sk_buff *skb) +{ + return xfrm6_rcv_spi(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], + 0); +} + +EXPORT_SYMBOL(xfrm6_rcv); + +int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, + xfrm_address_t *saddr, u8 proto) +{ + struct net *net = dev_net(skb->dev); + struct xfrm_state *x = NULL; + int i = 0; + + /* Allocate new secpath or COW existing one. */ + if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { + struct sec_path *sp; + + sp = secpath_dup(skb->sp); + if (!sp) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); + goto drop; + } + if (skb->sp) + secpath_put(skb->sp); + skb->sp = sp; + } + + if (1 + skb->sp->len == XFRM_MAX_DEPTH) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); + goto drop; + } + + for (i = 0; i < 3; i++) { + xfrm_address_t *dst, *src; + + switch (i) { + case 0: + dst = daddr; + src = saddr; + break; + case 1: + /* lookup state with wild-card source address */ + dst = daddr; + src = (xfrm_address_t *)&in6addr_any; + break; + default: + /* lookup state with wild-card addresses */ + dst = (xfrm_address_t *)&in6addr_any; + src = (xfrm_address_t *)&in6addr_any; + break; + } + + x = xfrm_state_lookup_byaddr(net, skb->mark, dst, src, proto, AF_INET6); + if (!x) + continue; + + spin_lock(&x->lock); + + if ((!i || (x->props.flags & XFRM_STATE_WILDRECV)) && + likely(x->km.state == XFRM_STATE_VALID) && + !xfrm_state_check_expire(x)) { + spin_unlock(&x->lock); + if (x->type->input(x, skb) > 0) { + /* found a valid state */ + break; + } + } else + spin_unlock(&x->lock); + + xfrm_state_put(x); + x = NULL; + } + + if (!x) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); + xfrm_audit_state_notfound_simple(skb, AF_INET6); + goto drop; + } + + skb->sp->xvec[skb->sp->len++] = x; + + spin_lock(&x->lock); + + x->curlft.bytes += skb->len; + x->curlft.packets++; + + spin_unlock(&x->lock); + + return 1; + +drop: + return -1; +} + +EXPORT_SYMBOL(xfrm6_input_addr); diff --git a/net/ipv6/xfrm6_mode_beet.c b/net/ipv6/xfrm6_mode_beet.c new file mode 100644 index 00000000..f37cba9e --- /dev/null +++ b/net/ipv6/xfrm6_mode_beet.c @@ -0,0 +1,131 @@ +/* + * xfrm6_mode_beet.c - BEET mode encapsulation for IPv6. + * + * Copyright (c) 2006 Diego Beltrami + * Miika Komu + * Herbert Xu + * Abhinav Pathak + * Jeff Ahrenholz + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void xfrm6_beet_make_header(struct sk_buff *skb) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + + iph->version = 6; + + memcpy(iph->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl, + sizeof(iph->flow_lbl)); + iph->nexthdr = XFRM_MODE_SKB_CB(skb)->protocol; + + ipv6_change_dsfield(iph, 0, XFRM_MODE_SKB_CB(skb)->tos); + iph->hop_limit = XFRM_MODE_SKB_CB(skb)->ttl; +} + +/* Add encapsulation header. + * + * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt. 
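+ * + * When the inner packet carries IPv4 options (optlen != 0), a BEET pseudo + * header (struct ip_beet_phdr) describing the option padding is inserted + * and IPPROTO_BEETPH becomes the next header.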
+ */ +static int xfrm6_beet_output(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ipv6hdr *top_iph; + struct ip_beet_phdr *ph; + int optlen, hdr_len; + + hdr_len = 0; + optlen = XFRM_MODE_SKB_CB(skb)->optlen; + if (unlikely(optlen)) + hdr_len += IPV4_BEET_PHMAXLEN - (optlen & 4); + + skb_set_network_header(skb, -x->props.header_len - hdr_len); + if (x->sel.family != AF_INET6) + skb->network_header += IPV4_BEET_PHMAXLEN; + skb->mac_header = skb->network_header + + offsetof(struct ipv6hdr, nexthdr); + skb->transport_header = skb->network_header + sizeof(*top_iph); + ph = (struct ip_beet_phdr *)__skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl-hdr_len); + + xfrm6_beet_make_header(skb); + + top_iph = ipv6_hdr(skb); + if (unlikely(optlen)) { + + BUG_ON(optlen < 0); + + ph->padlen = 4 - (optlen & 4); + ph->hdrlen = optlen / 8; + ph->nexthdr = top_iph->nexthdr; + if (ph->padlen) + memset(ph + 1, IPOPT_NOP, ph->padlen); + + top_iph->nexthdr = IPPROTO_BEETPH; + } + + ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); + ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); + return 0; +} + +static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ipv6hdr *ip6h; + int size = sizeof(struct ipv6hdr); + int err; + + err = skb_cow_head(skb, size + skb->mac_len); + if (err) + goto out; + + __skb_push(skb, size); + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + + xfrm6_beet_make_header(skb); + + ip6h = ipv6_hdr(skb); + ip6h->payload_len = htons(skb->len - size); + ipv6_addr_copy(&ip6h->daddr, (struct in6_addr *) &x->sel.daddr.a6); + ipv6_addr_copy(&ip6h->saddr, (struct in6_addr *) &x->sel.saddr.a6); + err = 0; +out: + return err; +} + +static struct xfrm_mode xfrm6_beet_mode = { + .input2 = xfrm6_beet_input, + .input = xfrm_prepare_input, + .output2 = xfrm6_beet_output, + .output = xfrm6_prepare_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_BEET, + .flags = XFRM_MODE_FLAG_TUNNEL, +}; + +static int __init xfrm6_beet_init(void) +{ + return xfrm_register_mode(&xfrm6_beet_mode, AF_INET6); +} + +static void __exit xfrm6_beet_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm6_beet_mode, AF_INET6); + BUG_ON(err); +} + +module_init(xfrm6_beet_init); +module_exit(xfrm6_beet_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_BEET); diff --git a/net/ipv6/xfrm6_mode_ro.c b/net/ipv6/xfrm6_mode_ro.c new file mode 100644 index 00000000..63d5d493 --- /dev/null +++ b/net/ipv6/xfrm6_mode_ro.c @@ -0,0 +1,84 @@ +/* + * xfrm6_mode_ro.c - Route optimization mode for IPv6. + * + * Copyright (C)2003-2006 Helsinki University of Technology + * Copyright (C)2003-2006 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * Authors: + * Noriaki TAKAMIYA @USAGI + * Masahide NAKAMURA @USAGI + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Add route optimization header space. + * + * The IP header and mutable extension headers will be moved forward to make + * space for the route optimization header. + */ +static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ipv6hdr *iph; + u8 *prevhdr; + int hdr_len; + + iph = ipv6_hdr(skb); + + hdr_len = x->type->hdr_offset(x, skb, &prevhdr); + skb_set_mac_header(skb, (prevhdr - x->props.header_len) - skb->data); + skb_set_network_header(skb, -x->props.header_len); + skb->transport_header = skb->network_header + hdr_len; + __skb_pull(skb, hdr_len); + memmove(ipv6_hdr(skb), iph, hdr_len); + + x->lastused = get_seconds(); + + return 0; +} + +static struct xfrm_mode xfrm6_ro_mode = { + .output = xfrm6_ro_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_ROUTEOPTIMIZATION, +}; + +static int __init xfrm6_ro_init(void) +{ + return xfrm_register_mode(&xfrm6_ro_mode, AF_INET6); +} + +static void __exit xfrm6_ro_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm6_ro_mode, AF_INET6); + BUG_ON(err); +} + +module_init(xfrm6_ro_init); +module_exit(xfrm6_ro_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_ROUTEOPTIMIZATION); diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c new file mode 100644 index 00000000..4e344105 --- /dev/null +++ b/net/ipv6/xfrm6_mode_transport.c @@ -0,0 +1,85 @@ +/* + * xfrm6_mode_transport.c - Transport mode encapsulation for IPv6. + * + * Copyright (C) 2002 USAGI/WIDE Project + * Copyright (c) 2004-2006 Herbert Xu + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Add encapsulation header. + * + * The IP header and mutable extension headers will be moved forward to make + * space for the encapsulation header. + */ +static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ipv6hdr *iph; + u8 *prevhdr; + int hdr_len; + + iph = ipv6_hdr(skb); + + hdr_len = x->type->hdr_offset(x, skb, &prevhdr); + skb_set_mac_header(skb, (prevhdr - x->props.header_len) - skb->data); + skb_set_network_header(skb, -x->props.header_len); + skb->transport_header = skb->network_header + hdr_len; + __skb_pull(skb, hdr_len); + memmove(ipv6_hdr(skb), iph, hdr_len); + return 0; +} + +/* Remove encapsulation header. + * + * The IP header will be moved over the top of the encapsulation header. + * + * On entry, skb->h shall point to where the IP header should be and skb->nh + * shall be set to where the IP header currently is. skb->data shall point + * to the start of the payload. 
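+ * + * After the move, payload_len is recomputed so the header matches the + * decapsulated packet length.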
+ */ +static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) +{ + int ihl = skb->data - skb_transport_header(skb); + + if (skb->transport_header != skb->network_header) { + memmove(skb_transport_header(skb), + skb_network_header(skb), ihl); + skb->network_header = skb->transport_header; + } + ipv6_hdr(skb)->payload_len = htons(skb->len + ihl - + sizeof(struct ipv6hdr)); + skb_reset_transport_header(skb); + return 0; +} + +static struct xfrm_mode xfrm6_transport_mode = { + .input = xfrm6_transport_input, + .output = xfrm6_transport_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TRANSPORT, +}; + +static int __init xfrm6_transport_init(void) +{ + return xfrm_register_mode(&xfrm6_transport_mode, AF_INET6); +} + +static void __exit xfrm6_transport_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm6_transport_mode, AF_INET6); + BUG_ON(err); +} + +module_init(xfrm6_transport_init); +module_exit(xfrm6_transport_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TRANSPORT); diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c new file mode 100644 index 00000000..23ecd68a --- /dev/null +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -0,0 +1,117 @@ +/* + * xfrm6_mode_tunnel.c - Tunnel mode encapsulation for IPv6. + * + * Copyright (C) 2002 USAGI/WIDE Project + * Copyright (c) 2004-2006 Herbert Xu + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) +{ + const struct ipv6hdr *outer_iph = ipv6_hdr(skb); + struct ipv6hdr *inner_iph = ipipv6_hdr(skb); + + if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph))) + IP6_ECN_set_ce(inner_iph); +} + +/* Add encapsulation header. + * + * The top IP header will be constructed per RFC 2401. 
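+ * + * The outer header inherits the inner flow label and traffic class + * (ECN is re-encapsulated and may be masked off via XFRM_STATE_NOECN), + * takes its hop limit from the child route and is addressed with the + * state's saddr/daddr.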
+ */ +static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct ipv6hdr *top_iph; + int dsfield; + + skb_set_network_header(skb, -x->props.header_len); + skb->mac_header = skb->network_header + + offsetof(struct ipv6hdr, nexthdr); + skb->transport_header = skb->network_header + sizeof(*top_iph); + top_iph = ipv6_hdr(skb); + + top_iph->version = 6; + + memcpy(top_iph->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl, + sizeof(top_iph->flow_lbl)); + top_iph->nexthdr = xfrm_af2proto(skb_dst(skb)->ops->family); + + dsfield = XFRM_MODE_SKB_CB(skb)->tos; + dsfield = INET_ECN_encapsulate(dsfield, dsfield); + if (x->props.flags & XFRM_STATE_NOECN) + dsfield &= ~INET_ECN_MASK; + ipv6_change_dsfield(top_iph, 0, dsfield); + top_iph->hop_limit = ip6_dst_hoplimit(dst->child); + ipv6_addr_copy(&top_iph->saddr, (const struct in6_addr *)&x->props.saddr); + ipv6_addr_copy(&top_iph->daddr, (const struct in6_addr *)&x->id.daddr); + return 0; +} + +static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) +{ + int err = -EINVAL; + + if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPV6) + goto out; + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto out; + + if (skb_cloned(skb) && + (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + goto out; + + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv6_copy_dscp(ipv6_get_dsfield(ipv6_hdr(skb)), + ipipv6_hdr(skb)); + if (!(x->props.flags & XFRM_STATE_NOECN)) + ipip6_ecn_decapsulate(skb); + + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + + err = 0; + +out: + return err; +} + +static struct xfrm_mode xfrm6_tunnel_mode = { + .input2 = xfrm6_mode_tunnel_input, + .input = xfrm_prepare_input, + .output2 = xfrm6_mode_tunnel_output, + .output = xfrm6_prepare_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TUNNEL, + .flags = XFRM_MODE_FLAG_TUNNEL, +}; + +static int __init xfrm6_mode_tunnel_init(void) +{ + return xfrm_register_mode(&xfrm6_tunnel_mode, AF_INET6); +} + +static void __exit xfrm6_mode_tunnel_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm6_tunnel_mode, AF_INET6); + BUG_ON(err); +} + +module_init(xfrm6_mode_tunnel_init); +module_exit(xfrm6_mode_tunnel_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TUNNEL); diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c new file mode 100644 index 00000000..49a91c5f --- /dev/null +++ b/net/ipv6/xfrm6_output.c @@ -0,0 +1,109 @@ +/* + * xfrm6_output.c - Common IPsec encapsulation code for IPv6. + * Copyright (C) 2002 USAGI/WIDE Project + * Copyright (c) 2004 Herbert Xu + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, + u8 **prevhdr) +{ + return ip6_find_1stfragopt(skb, prevhdr); +} + +EXPORT_SYMBOL(xfrm6_find_1stfragopt); + +static int xfrm6_tunnel_check_size(struct sk_buff *skb) +{ + int mtu, ret = 0; + struct dst_entry *dst = skb_dst(skb); + + mtu = dst_mtu(dst); + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + + if (!skb->local_df && skb->len > mtu) { + skb->dev = dst->dev; + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + ret = -EMSGSIZE; + } + + return ret; +} + +int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + + err = xfrm6_tunnel_check_size(skb); + if (err) + return err; + + XFRM_MODE_SKB_CB(skb)->protocol = ipv6_hdr(skb)->nexthdr; + + return xfrm6_extract_header(skb); +} + +int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + + err = xfrm_inner_extract_output(x, skb); + if (err) + return err; + + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); +#ifdef CONFIG_NETFILTER + IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; +#endif + + skb->protocol = htons(ETH_P_IPV6); + skb->local_df = 1; + + return x->outer_mode->output2(x, skb); +} +EXPORT_SYMBOL(xfrm6_prepare_output); + +int xfrm6_output_finish(struct sk_buff *skb) +{ +#ifdef CONFIG_NETFILTER + IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; +#endif + + skb->protocol = htons(ETH_P_IPV6); + return xfrm_output(skb); +} + +static int __xfrm6_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; + + if ((x && x->props.mode == XFRM_MODE_TUNNEL) && + ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || + dst_allfrag(skb_dst(skb)))) { + return ip6_fragment(skb, x->outer_mode->afinfo->output_finish); + } + return x->outer_mode->afinfo->output_finish(skb); +} + +int xfrm6_output(struct sk_buff *skb) +{ + return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, + skb_dst(skb)->dev, __xfrm6_output); +} diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c new file mode 100644 index 00000000..d879f7ef --- /dev/null +++ b/net/ipv6/xfrm6_policy.c @@ -0,0 +1,357 @@ +/* + * xfrm6_policy.c: based on xfrm4_policy.c + * + * Authors: + * Mitsuru KANDA @USAGI + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro + * IPv6 support + * YOSHIFUJI Hideaki + * Split up af-specific portion + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#include +#endif + +static struct xfrm_policy_afinfo xfrm6_policy_afinfo; + +static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, + const xfrm_address_t *saddr, + const xfrm_address_t *daddr) +{ + struct flowi6 fl6; + struct dst_entry *dst; + int err; + + memset(&fl6, 0, sizeof(fl6)); + memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr)); + if (saddr) + memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr)); + + dst = ip6_route_output(net, NULL, &fl6); + + err = dst->error; + if (dst->error) { + dst_release(dst); + dst = ERR_PTR(err); + } + + return dst; +} + +static int xfrm6_get_saddr(struct net *net, + xfrm_address_t *saddr, xfrm_address_t *daddr) +{ + struct dst_entry *dst; + struct net_device *dev; + + dst = xfrm6_dst_lookup(net, 0, NULL, daddr); + if (IS_ERR(dst)) + return -EHOSTUNREACH; + + dev = ip6_dst_idev(dst)->dev; + ipv6_dev_get_saddr(dev_net(dev), dev, + (struct in6_addr *)&daddr->a6, 0, + (struct in6_addr 
*)&saddr->a6); + dst_release(dst); + return 0; +} + +static int xfrm6_get_tos(const struct flowi *fl) +{ + return 0; +} + +static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst, + int nfheader_len) +{ + if (dst->ops->family == AF_INET6) { + struct rt6_info *rt = (struct rt6_info*)dst; + if (rt->rt6i_node) + path->path_cookie = rt->rt6i_node->fn_sernum; + } + + path->u.rt6.rt6i_nfheader_len = nfheader_len; + + return 0; +} + +static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, + const struct flowi *fl) +{ + struct rt6_info *rt = (struct rt6_info*)xdst->route; + + xdst->u.dst.dev = dev; + dev_hold(dev); + + xdst->u.rt6.rt6i_idev = in6_dev_get(dev); + if (!xdst->u.rt6.rt6i_idev) + return -ENODEV; + + xdst->u.rt6.rt6i_peer = rt->rt6i_peer; + if (rt->rt6i_peer) + atomic_inc(&rt->rt6i_peer->refcnt); + + /* Sheit... I remember I did this right. Apparently, + * it was magically lost, so this code needs audit */ + xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST | + RTF_LOCAL); + xdst->u.rt6.rt6i_metric = rt->rt6i_metric; + xdst->u.rt6.rt6i_node = rt->rt6i_node; + if (rt->rt6i_node) + xdst->route_cookie = rt->rt6i_node->fn_sernum; + xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway; + xdst->u.rt6.rt6i_dst = rt->rt6i_dst; + xdst->u.rt6.rt6i_src = rt->rt6i_src; + + return 0; +} + +static inline void +_decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) +{ + struct flowi6 *fl6 = &fl->u.ip6; + int onlyproto = 0; + u16 offset = skb_network_header_len(skb); + const struct ipv6hdr *hdr = ipv6_hdr(skb); + struct ipv6_opt_hdr *exthdr; + const unsigned char *nh = skb_network_header(skb); + u8 nexthdr = nh[IP6CB(skb)->nhoff]; + + memset(fl6, 0, sizeof(struct flowi6)); + fl6->flowi6_mark = skb->mark; + + ipv6_addr_copy(&fl6->daddr, reverse ? &hdr->saddr : &hdr->daddr); + ipv6_addr_copy(&fl6->saddr, reverse ? &hdr->daddr : &hdr->saddr); + + while (nh + offset + 1 < skb->data || + pskb_may_pull(skb, nh + offset + 1 - skb->data)) { + nh = skb_network_header(skb); + exthdr = (struct ipv6_opt_hdr *)(nh + offset); + + switch (nexthdr) { + case NEXTHDR_FRAGMENT: + onlyproto = 1; + case NEXTHDR_ROUTING: + case NEXTHDR_HOP: + case NEXTHDR_DEST: + offset += ipv6_optlen(exthdr); + nexthdr = exthdr->nexthdr; + exthdr = (struct ipv6_opt_hdr *)(nh + offset); + break; + + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + case IPPROTO_TCP: + case IPPROTO_SCTP: + case IPPROTO_DCCP: + if (!onlyproto && (nh + offset + 4 < skb->data || + pskb_may_pull(skb, nh + offset + 4 - skb->data))) { + __be16 *ports = (__be16 *)exthdr; + + fl6->fl6_sport = ports[!!reverse]; + fl6->fl6_dport = ports[!reverse]; + } + fl6->flowi6_proto = nexthdr; + return; + + case IPPROTO_ICMPV6: + if (!onlyproto && pskb_may_pull(skb, nh + offset + 2 - skb->data)) { + u8 *icmp = (u8 *)exthdr; + + fl6->fl6_icmp_type = icmp[0]; + fl6->fl6_icmp_code = icmp[1]; + } + fl6->flowi6_proto = nexthdr; + return; + +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + case IPPROTO_MH: + if (!onlyproto && pskb_may_pull(skb, nh + offset + 3 - skb->data)) { + struct ip6_mh *mh; + mh = (struct ip6_mh *)exthdr; + + fl6->fl6_mh_type = mh->ip6mh_type; + } + fl6->flowi6_proto = nexthdr; + return; +#endif + + /* XXX Why are there these headers? 
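They fall through to the default case: only flowi6_proto is recorded and fl6_ipsec_spi is cleared; no SPI parsing is done here.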
*/ + case IPPROTO_AH: + case IPPROTO_ESP: + case IPPROTO_COMP: + default: + fl6->fl6_ipsec_spi = 0; + fl6->flowi6_proto = nexthdr; + return; + } + } +} + +static inline int xfrm6_garbage_collect(struct dst_ops *ops) +{ + struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops); + + xfrm6_policy_afinfo.garbage_collect(net); + return dst_entries_get_fast(ops) > ops->gc_thresh * 2; +} + +static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu) +{ + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + struct dst_entry *path = xdst->route; + + path->ops->update_pmtu(path, mtu); +} + +static void xfrm6_dst_destroy(struct dst_entry *dst) +{ + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + + if (likely(xdst->u.rt6.rt6i_idev)) + in6_dev_put(xdst->u.rt6.rt6i_idev); + dst_destroy_metrics_generic(dst); + if (likely(xdst->u.rt6.rt6i_peer)) + inet_putpeer(xdst->u.rt6.rt6i_peer); + xfrm_dst_destroy(xdst); +} + +static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, + int unregister) +{ + struct xfrm_dst *xdst; + + if (!unregister) + return; + + xdst = (struct xfrm_dst *)dst; + if (xdst->u.rt6.rt6i_idev->dev == dev) { + struct inet6_dev *loopback_idev = + in6_dev_get(dev_net(dev)->loopback_dev); + BUG_ON(!loopback_idev); + + do { + in6_dev_put(xdst->u.rt6.rt6i_idev); + xdst->u.rt6.rt6i_idev = loopback_idev; + in6_dev_hold(loopback_idev); + xdst = (struct xfrm_dst *)xdst->u.dst.child; + } while (xdst->u.dst.xfrm); + + __in6_dev_put(loopback_idev); + } + + xfrm_dst_ifdown(dst, dev); +} + +static struct dst_ops xfrm6_dst_ops = { + .family = AF_INET6, + .protocol = cpu_to_be16(ETH_P_IPV6), + .gc = xfrm6_garbage_collect, + .update_pmtu = xfrm6_update_pmtu, + .cow_metrics = dst_cow_metrics_generic, + .destroy = xfrm6_dst_destroy, + .ifdown = xfrm6_dst_ifdown, + .local_out = __ip6_local_out, + .gc_thresh = 1024, +}; + +static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { + .family = AF_INET6, + .dst_ops = &xfrm6_dst_ops, + .dst_lookup = xfrm6_dst_lookup, + .get_saddr = xfrm6_get_saddr, + .decode_session = _decode_session6, + .get_tos = xfrm6_get_tos, + .init_path = xfrm6_init_path, + .fill_dst = xfrm6_fill_dst, + .blackhole_route = ip6_blackhole_route, +}; + +static int __init xfrm6_policy_init(void) +{ + return xfrm_policy_register_afinfo(&xfrm6_policy_afinfo); +} + +static void xfrm6_policy_fini(void) +{ + xfrm_policy_unregister_afinfo(&xfrm6_policy_afinfo); +} + +#ifdef CONFIG_SYSCTL +static struct ctl_table xfrm6_policy_table[] = { + { + .procname = "xfrm6_gc_thresh", + .data = &init_net.xfrm.xfrm6_dst_ops.gc_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static struct ctl_table_header *sysctl_hdr; +#endif + +int __init xfrm6_init(void) +{ + int ret; + unsigned int gc_thresh; + + /* + * We need a good default value for the xfrm6 gc threshold. + * In ipv4 we set it to the route hash table size * 8, which + * is half the size of the maximaum route cache for ipv4. It + * would be good to do the same thing for v6, except the table is + * constructed differently here. Here each table for a net namespace + * can have FIB_TABLE_HASHSZ entries, so lets go with the same + * computation that we used for ipv4 here. Also, lets keep the initial + * gc_thresh to a minimum of 1024, since, the ipv6 route cache defaults + * to that as a minimum as well + */ + gc_thresh = FIB6_TABLE_HASHSZ * 8; + xfrm6_dst_ops.gc_thresh = (gc_thresh < 1024) ? 
1024 : gc_thresh; + dst_entries_init(&xfrm6_dst_ops); + + ret = xfrm6_policy_init(); + if (ret) { + dst_entries_destroy(&xfrm6_dst_ops); + goto out; + } + ret = xfrm6_state_init(); + if (ret) + goto out_policy; + +#ifdef CONFIG_SYSCTL + sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv6_ctl_path, + xfrm6_policy_table); +#endif +out: + return ret; +out_policy: + xfrm6_policy_fini(); + goto out; +} + +void xfrm6_fini(void) +{ +#ifdef CONFIG_SYSCTL + if (sysctl_hdr) + unregister_net_sysctl_table(sysctl_hdr); +#endif + //xfrm6_input_fini(); + xfrm6_policy_fini(); + xfrm6_state_fini(); + dst_entries_destroy(&xfrm6_dst_ops); +} diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c new file mode 100644 index 00000000..248f0b2a --- /dev/null +++ b/net/ipv6/xfrm6_state.c @@ -0,0 +1,196 @@ +/* + * xfrm6_state.c: based on xfrm4_state.c + * + * Authors: + * Mitsuru KANDA @USAGI + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro + * IPv6 support + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific portion + * + */ + +#include +#include +#include +#include +#include +#include +#include + +static void +__xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) +{ + const struct flowi6 *fl6 = &fl->u.ip6; + + /* Initialize temporary selector matching only + * to current session. */ + ipv6_addr_copy((struct in6_addr *)&sel->daddr, &fl6->daddr); + ipv6_addr_copy((struct in6_addr *)&sel->saddr, &fl6->saddr); + sel->dport = xfrm_flowi_dport(fl, &fl6->uli); + sel->dport_mask = htons(0xffff); + sel->sport = xfrm_flowi_sport(fl, &fl6->uli); + sel->sport_mask = htons(0xffff); + sel->family = AF_INET6; + sel->prefixlen_d = 128; + sel->prefixlen_s = 128; + sel->proto = fl6->flowi6_proto; + sel->ifindex = fl6->flowi6_oif; +} + +static void +xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl, + const xfrm_address_t *daddr, const xfrm_address_t *saddr) +{ + x->id = tmpl->id; + if (ipv6_addr_any((struct in6_addr*)&x->id.daddr)) + memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr)); + memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr)); + if (ipv6_addr_any((struct in6_addr*)&x->props.saddr)) + memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr)); + x->props.mode = tmpl->mode; + x->props.reqid = tmpl->reqid; + x->props.family = AF_INET6; +} + +/* distribution counting sort function for xfrm_state and xfrm_tmpl */ +static int +__xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass) +{ + int i; + int class[XFRM_MAX_DEPTH]; + int count[maxclass]; + + memset(count, 0, sizeof(count)); + + for (i = 0; i < n; i++) { + int c; + class[i] = c = cmp(src[i]); + count[c]++; + } + + for (i = 2; i < maxclass; i++) + count[i] += count[i - 1]; + + for (i = 0; i < n; i++) { + dst[count[class[i] - 1]++] = src[i]; + src[i] = NULL; + } + + return 0; +} + +/* + * Rule for xfrm_state: + * + * rule 1: select IPsec transport except AH + * rule 2: select MIPv6 RO or inbound trigger + * rule 3: select IPsec transport AH + * rule 4: select IPsec tunnel + * rule 5: others + */ +static int __xfrm6_state_sort_cmp(void *p) +{ + struct xfrm_state *v = p; + + switch (v->props.mode) { + case XFRM_MODE_TRANSPORT: + if (v->id.proto != IPPROTO_AH) + return 1; + else + return 3; +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + case XFRM_MODE_ROUTEOPTIMIZATION: + case XFRM_MODE_IN_TRIGGER: + return 2; +#endif + case XFRM_MODE_TUNNEL: + case XFRM_MODE_BEET: + return 4; + } + return 5; +} + +static int +__xfrm6_state_sort(struct xfrm_state **dst, struct xfrm_state 
**src, int n) +{ + return __xfrm6_sort((void **)dst, (void **)src, n, + __xfrm6_state_sort_cmp, 6); +} + +/* + * Rule for xfrm_tmpl: + * + * rule 1: select IPsec transport + * rule 2: select MIPv6 RO or inbound trigger + * rule 3: select IPsec tunnel + * rule 4: others + */ +static int __xfrm6_tmpl_sort_cmp(void *p) +{ + struct xfrm_tmpl *v = p; + switch (v->mode) { + case XFRM_MODE_TRANSPORT: + return 1; +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + case XFRM_MODE_ROUTEOPTIMIZATION: + case XFRM_MODE_IN_TRIGGER: + return 2; +#endif + case XFRM_MODE_TUNNEL: + case XFRM_MODE_BEET: + return 3; + } + return 4; +} + +static int +__xfrm6_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n) +{ + return __xfrm6_sort((void **)dst, (void **)src, n, + __xfrm6_tmpl_sort_cmp, 5); +} + +int xfrm6_extract_header(struct sk_buff *skb) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + + XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); + XFRM_MODE_SKB_CB(skb)->id = 0; + XFRM_MODE_SKB_CB(skb)->frag_off = htons(IP_DF); + XFRM_MODE_SKB_CB(skb)->tos = ipv6_get_dsfield(iph); + XFRM_MODE_SKB_CB(skb)->ttl = iph->hop_limit; + XFRM_MODE_SKB_CB(skb)->optlen = 0; + memcpy(XFRM_MODE_SKB_CB(skb)->flow_lbl, iph->flow_lbl, + sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); + + return 0; +} + +static struct xfrm_state_afinfo xfrm6_state_afinfo = { + .family = AF_INET6, + .proto = IPPROTO_IPV6, + .eth_proto = htons(ETH_P_IPV6), + .owner = THIS_MODULE, + .init_tempsel = __xfrm6_init_tempsel, + .init_temprop = xfrm6_init_temprop, + .tmpl_sort = __xfrm6_tmpl_sort, + .state_sort = __xfrm6_state_sort, + .output = xfrm6_output, + .output_finish = xfrm6_output_finish, + .extract_input = xfrm6_extract_input, + .extract_output = xfrm6_extract_output, + .transport_finish = xfrm6_transport_finish, +}; + +int __init xfrm6_state_init(void) +{ + return xfrm_state_register_afinfo(&xfrm6_state_afinfo); +} + +void xfrm6_state_fini(void) +{ + xfrm_state_unregister_afinfo(&xfrm6_state_afinfo); +} + diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c new file mode 100644 index 00000000..4fe1db12 --- /dev/null +++ b/net/ipv6/xfrm6_tunnel.c @@ -0,0 +1,402 @@ +/* + * Copyright (C)2003,2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors Mitsuru KANDA + * YOSHIFUJI Hideaki + * + * Based on net/ipv4/xfrm4_tunnel.c + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256 +#define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256 + +#define XFRM6_TUNNEL_SPI_MIN 1 +#define XFRM6_TUNNEL_SPI_MAX 0xffffffff + +struct xfrm6_tunnel_net { + struct hlist_head spi_byaddr[XFRM6_TUNNEL_SPI_BYADDR_HSIZE]; + struct hlist_head spi_byspi[XFRM6_TUNNEL_SPI_BYSPI_HSIZE]; + u32 spi; +}; + +static int xfrm6_tunnel_net_id __read_mostly; +static inline struct xfrm6_tunnel_net *xfrm6_tunnel_pernet(struct net *net) +{ + return net_generic(net, xfrm6_tunnel_net_id); +} + +/* + * xfrm_tunnel_spi things are for allocating unique id ("spi") + * per xfrm_address_t. + */ +struct xfrm6_tunnel_spi { + struct hlist_node list_byaddr; + struct hlist_node list_byspi; + xfrm_address_t addr; + u32 spi; + atomic_t refcnt; + struct rcu_head rcu_head; +}; + +static DEFINE_SPINLOCK(xfrm6_tunnel_spi_lock); + +static struct kmem_cache *xfrm6_tunnel_spi_kmem __read_mostly; + +static inline unsigned xfrm6_tunnel_spi_hash_byaddr(const xfrm_address_t *addr) +{ + unsigned h; + + h = (__force u32)(addr->a6[0] ^ addr->a6[1] ^ addr->a6[2] ^ addr->a6[3]); + h ^= h >> 16; + h ^= h >> 8; + h &= XFRM6_TUNNEL_SPI_BYADDR_HSIZE - 1; + + return h; +} + +static inline unsigned xfrm6_tunnel_spi_hash_byspi(u32 spi) +{ + return spi % XFRM6_TUNNEL_SPI_BYSPI_HSIZE; +} + +static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr) +{ + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); + struct xfrm6_tunnel_spi *x6spi; + struct hlist_node *pos; + + hlist_for_each_entry_rcu(x6spi, pos, + &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], + list_byaddr) { + if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) + return x6spi; + } + + return NULL; +} + +__be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr) +{ + struct xfrm6_tunnel_spi *x6spi; + u32 spi; + + rcu_read_lock_bh(); + x6spi = __xfrm6_tunnel_spi_lookup(net, saddr); + spi = x6spi ? 
x6spi->spi : 0; + rcu_read_unlock_bh(); + return htonl(spi); +} + +EXPORT_SYMBOL(xfrm6_tunnel_spi_lookup); + +static int __xfrm6_tunnel_spi_check(struct net *net, u32 spi) +{ + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); + struct xfrm6_tunnel_spi *x6spi; + int index = xfrm6_tunnel_spi_hash_byspi(spi); + struct hlist_node *pos; + + hlist_for_each_entry(x6spi, pos, + &xfrm6_tn->spi_byspi[index], + list_byspi) { + if (x6spi->spi == spi) + return -1; + } + return index; +} + +static u32 __xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr) +{ + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); + u32 spi; + struct xfrm6_tunnel_spi *x6spi; + int index; + + if (xfrm6_tn->spi < XFRM6_TUNNEL_SPI_MIN || + xfrm6_tn->spi >= XFRM6_TUNNEL_SPI_MAX) + xfrm6_tn->spi = XFRM6_TUNNEL_SPI_MIN; + else + xfrm6_tn->spi++; + + for (spi = xfrm6_tn->spi; spi <= XFRM6_TUNNEL_SPI_MAX; spi++) { + index = __xfrm6_tunnel_spi_check(net, spi); + if (index >= 0) + goto alloc_spi; + } + for (spi = XFRM6_TUNNEL_SPI_MIN; spi < xfrm6_tn->spi; spi++) { + index = __xfrm6_tunnel_spi_check(net, spi); + if (index >= 0) + goto alloc_spi; + } + spi = 0; + goto out; +alloc_spi: + xfrm6_tn->spi = spi; + x6spi = kmem_cache_alloc(xfrm6_tunnel_spi_kmem, GFP_ATOMIC); + if (!x6spi) + goto out; + + memcpy(&x6spi->addr, saddr, sizeof(x6spi->addr)); + x6spi->spi = spi; + atomic_set(&x6spi->refcnt, 1); + + hlist_add_head_rcu(&x6spi->list_byspi, &xfrm6_tn->spi_byspi[index]); + + index = xfrm6_tunnel_spi_hash_byaddr(saddr); + hlist_add_head_rcu(&x6spi->list_byaddr, &xfrm6_tn->spi_byaddr[index]); +out: + return spi; +} + +__be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr) +{ + struct xfrm6_tunnel_spi *x6spi; + u32 spi; + + spin_lock_bh(&xfrm6_tunnel_spi_lock); + x6spi = __xfrm6_tunnel_spi_lookup(net, saddr); + if (x6spi) { + atomic_inc(&x6spi->refcnt); + spi = x6spi->spi; + } else + spi = __xfrm6_tunnel_alloc_spi(net, saddr); + spin_unlock_bh(&xfrm6_tunnel_spi_lock); + + return htonl(spi); +} + +EXPORT_SYMBOL(xfrm6_tunnel_alloc_spi); + +static void x6spi_destroy_rcu(struct rcu_head *head) +{ + kmem_cache_free(xfrm6_tunnel_spi_kmem, + container_of(head, struct xfrm6_tunnel_spi, rcu_head)); +} + +static void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr) +{ + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); + struct xfrm6_tunnel_spi *x6spi; + struct hlist_node *pos, *n; + + spin_lock_bh(&xfrm6_tunnel_spi_lock); + + hlist_for_each_entry_safe(x6spi, pos, n, + &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], + list_byaddr) + { + if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) { + if (atomic_dec_and_test(&x6spi->refcnt)) { + hlist_del_rcu(&x6spi->list_byaddr); + hlist_del_rcu(&x6spi->list_byspi); + call_rcu(&x6spi->rcu_head, x6spi_destroy_rcu); + break; + } + } + } + spin_unlock_bh(&xfrm6_tunnel_spi_lock); +} + +static int xfrm6_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) +{ + skb_push(skb, -skb_network_offset(skb)); + return 0; +} + +static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) +{ + return skb_network_header(skb)[IP6CB(skb)->nhoff]; +} + +static int xfrm6_tunnel_rcv(struct sk_buff *skb) +{ + struct net *net = dev_net(skb->dev); + const struct ipv6hdr *iph = ipv6_hdr(skb); + __be32 spi; + + spi = xfrm6_tunnel_spi_lookup(net, (const xfrm_address_t *)&iph->saddr); + return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi); +} + +static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 
code, int offset, __be32 info) +{ + /* xfrm6_tunnel native err handling */ + switch (type) { + case ICMPV6_DEST_UNREACH: + switch (code) { + case ICMPV6_NOROUTE: + case ICMPV6_ADM_PROHIBITED: + case ICMPV6_NOT_NEIGHBOUR: + case ICMPV6_ADDR_UNREACH: + case ICMPV6_PORT_UNREACH: + default: + break; + } + break; + case ICMPV6_PKT_TOOBIG: + break; + case ICMPV6_TIME_EXCEED: + switch (code) { + case ICMPV6_EXC_HOPLIMIT: + break; + case ICMPV6_EXC_FRAGTIME: + default: + break; + } + break; + case ICMPV6_PARAMPROB: + switch (code) { + case ICMPV6_HDR_FIELD: break; + case ICMPV6_UNK_NEXTHDR: break; + case ICMPV6_UNK_OPTION: break; + } + break; + default: + break; + } + + return 0; +} + +static int xfrm6_tunnel_init_state(struct xfrm_state *x) +{ + if (x->props.mode != XFRM_MODE_TUNNEL) + return -EINVAL; + + if (x->encap) + return -EINVAL; + + x->props.header_len = sizeof(struct ipv6hdr); + + return 0; +} + +static void xfrm6_tunnel_destroy(struct xfrm_state *x) +{ + struct net *net = xs_net(x); + + xfrm6_tunnel_free_spi(net, (xfrm_address_t *)&x->props.saddr); +} + +static const struct xfrm_type xfrm6_tunnel_type = { + .description = "IP6IP6", + .owner = THIS_MODULE, + .proto = IPPROTO_IPV6, + .init_state = xfrm6_tunnel_init_state, + .destructor = xfrm6_tunnel_destroy, + .input = xfrm6_tunnel_input, + .output = xfrm6_tunnel_output, +}; + +static struct xfrm6_tunnel xfrm6_tunnel_handler __read_mostly = { + .handler = xfrm6_tunnel_rcv, + .err_handler = xfrm6_tunnel_err, + .priority = 2, +}; + +static struct xfrm6_tunnel xfrm46_tunnel_handler __read_mostly = { + .handler = xfrm6_tunnel_rcv, + .err_handler = xfrm6_tunnel_err, + .priority = 2, +}; + +static int __net_init xfrm6_tunnel_net_init(struct net *net) +{ + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); + unsigned int i; + + for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) + INIT_HLIST_HEAD(&xfrm6_tn->spi_byaddr[i]); + for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) + INIT_HLIST_HEAD(&xfrm6_tn->spi_byspi[i]); + xfrm6_tn->spi = 0; + + return 0; +} + +static void __net_exit xfrm6_tunnel_net_exit(struct net *net) +{ +} + +static struct pernet_operations xfrm6_tunnel_net_ops = { + .init = xfrm6_tunnel_net_init, + .exit = xfrm6_tunnel_net_exit, + .id = &xfrm6_tunnel_net_id, + .size = sizeof(struct xfrm6_tunnel_net), +}; + +static int __init xfrm6_tunnel_init(void) +{ + int rv; + + xfrm6_tunnel_spi_kmem = kmem_cache_create("xfrm6_tunnel_spi", + sizeof(struct xfrm6_tunnel_spi), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (!xfrm6_tunnel_spi_kmem) + return -ENOMEM; + rv = register_pernet_subsys(&xfrm6_tunnel_net_ops); + if (rv < 0) + goto out_pernet; + rv = xfrm_register_type(&xfrm6_tunnel_type, AF_INET6); + if (rv < 0) + goto out_type; + rv = xfrm6_tunnel_register(&xfrm6_tunnel_handler, AF_INET6); + if (rv < 0) + goto out_xfrm6; + rv = xfrm6_tunnel_register(&xfrm46_tunnel_handler, AF_INET); + if (rv < 0) + goto out_xfrm46; + return 0; + +out_xfrm46: + xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6); +out_xfrm6: + xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); +out_type: + unregister_pernet_subsys(&xfrm6_tunnel_net_ops); +out_pernet: + kmem_cache_destroy(xfrm6_tunnel_spi_kmem); + return rv; +} + +static void __exit xfrm6_tunnel_fini(void) +{ + xfrm6_tunnel_deregister(&xfrm46_tunnel_handler, AF_INET); + xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6); + xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); + unregister_pernet_subsys(&xfrm6_tunnel_net_ops); + kmem_cache_destroy(xfrm6_tunnel_spi_kmem); +} + 
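The SPI allocator above keeps two per-namespace hash tables: spi_byaddr, keyed by the peer address and walked under RCU by __xfrm6_tunnel_spi_lookup(), and spi_byspi, consulted by __xfrm6_tunnel_spi_check() so a candidate SPI that is already in use is never handed out twice. The fragment below is a stand-alone, user-space sketch of the address fold done by xfrm6_tunnel_spi_hash_byaddr(); it is illustrative only, and the names hash_byaddr, SPI_BYADDR_HSIZE and the uint32_t[4] parameter (standing in for xfrm_address_t's a6[] words) are assumptions rather than code from this patch.

#include <stdint.h>

#define SPI_BYADDR_HSIZE 256                    /* power of two, so masking works */

/* Fold the four 32-bit words of an IPv6 address into one of 256 buckets,
 * mirroring xfrm6_tunnel_spi_hash_byaddr() in the hunk above. */
static unsigned int hash_byaddr(const uint32_t a6[4])
{
        uint32_t h = a6[0] ^ a6[1] ^ a6[2] ^ a6[3];

        h ^= h >> 16;                           /* pull high-half entropy down */
        h ^= h >> 8;
        return h & (SPI_BYADDR_HSIZE - 1);      /* bucket index 0..255 */
}

Because the table size is a power of two, the final mask is equivalent to a modulo, which is also why the companion xfrm6_tunnel_spi_hash_byspi() can simply take spi % XFRM6_TUNNEL_SPI_BYSPI_HSIZE.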
+module_init(xfrm6_tunnel_init); +module_exit(xfrm6_tunnel_fini); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_IPV6); diff --git a/net/ipx/Kconfig b/net/ipx/Kconfig new file mode 100644 index 00000000..e9ad0062 --- /dev/null +++ b/net/ipx/Kconfig @@ -0,0 +1,60 @@ +# +# IPX configuration +# +config IPX + tristate "The IPX protocol" + select LLC + ---help--- + This is support for the Novell networking protocol, IPX, commonly + used for local networks of Windows machines. You need it if you + want to access Novell NetWare file or print servers using the Linux + Novell client ncpfs (available from + ) or from + within the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO, + available from ). In order + to do the former, you'll also have to say Y to "NCP file system + support", below. + + IPX is similar in scope to IP, while SPX, which runs on top of IPX, + is similar to TCP. + + To turn your Linux box into a fully featured NetWare file server and + IPX router, say Y here and fetch either lwared from + or + mars_nwe from . For more + information, read the IPX-HOWTO available from + . + + The IPX driver would enlarge your kernel by about 16 KB. To compile + this driver as a module, choose M here: the module will be called ipx. + Unless you want to integrate your Linux box with a local Novell + network, say N. + +config IPX_INTERN + bool "IPX: Full internal IPX network" + depends on IPX + ---help--- + Every IPX network has an address that identifies it. Sometimes it is + useful to give an IPX "network" address to your Linux box as well + (for example if your box is acting as a file server for different + IPX networks: it will then be accessible from everywhere using the + same address). The way this is done is to create a virtual internal + "network" inside your box and to assign an IPX address to this + network. Say Y here if you want to do this; read the IPX-HOWTO at + for details. + + The full internal IPX network enables you to allocate sockets on + different virtual nodes of the internal network. This is done by + evaluating the field sipx_node of the socket address given to the + bind call. So applications should always initialize the node field + to 0 when binding a socket on the primary network. In this case the + socket is assigned the default node that has been given to the + kernel when the internal network was created. By enabling the full + internal IPX network the cross-forwarding of packets targeted at + 'special' sockets to sockets listening on the primary network is + disabled. This might break existing applications, especially RIP/SAP + daemons. A RIP/SAP daemon that works well with the full internal net + can be found on . + + If you don't know what you are doing, say N. + diff --git a/net/ipx/Makefile b/net/ipx/Makefile new file mode 100644 index 00000000..4b95e3ea --- /dev/null +++ b/net/ipx/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the Linux IPX layer. +# + +obj-$(CONFIG_IPX) += ipx.o + +ipx-y := af_ipx.o ipx_route.o ipx_proc.o +ipx-$(CONFIG_SYSCTL) += sysctl_net_ipx.o diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c new file mode 100644 index 00000000..96802266 --- /dev/null +++ b/net/ipx/af_ipx.c @@ -0,0 +1,2092 @@ +/* + * Implements an IPX socket layer. 
+ * + * This code is derived from work by + * Ross Biro : Writing the original IP stack + * Fred Van Kempen : Tidying up the TCP/IP + * + * Many thanks go to Keith Baker, Institute For Industrial Information + * Technology Ltd, Swansea University for allowing me to work on this + * in my own time even though it was in some ways related to commercial + * work I am currently employed to do there. + * + * All the material in this file is subject to the Gnu license version 2. + * Neither Alan Cox nor the Swansea University Computer Society admit + * liability nor provide warranty for any of this software. This material + * is provided as is and at no charge. + * + * Portions Copyright (c) 2000-2003 Conectiva, Inc. + * Neither Arnaldo Carvalho de Melo nor Conectiva, Inc. admit liability nor + * provide warranty for any of this software. This material is provided + * "AS-IS" and at no charge. + * + * Portions Copyright (c) 1995 Caldera, Inc. + * Neither Greg Page nor Caldera, Inc. admit liability nor provide + * warranty for any of this software. This material is provided + * "AS-IS" and at no charge. + * + * See net/ipx/ChangeLog. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_SYSCTL +extern void ipx_register_sysctl(void); +extern void ipx_unregister_sysctl(void); +#else +#define ipx_register_sysctl() +#define ipx_unregister_sysctl() +#endif + +/* Configuration Variables */ +static unsigned char ipxcfg_max_hops = 16; +static char ipxcfg_auto_select_primary; +static char ipxcfg_auto_create_interfaces; +int sysctl_ipx_pprop_broadcasting = 1; + +/* Global Variables */ +static struct datalink_proto *p8022_datalink; +static struct datalink_proto *pEII_datalink; +static struct datalink_proto *p8023_datalink; +static struct datalink_proto *pSNAP_datalink; + +static const struct proto_ops ipx_dgram_ops; + +LIST_HEAD(ipx_interfaces); +DEFINE_SPINLOCK(ipx_interfaces_lock); + +struct ipx_interface *ipx_primary_net; +struct ipx_interface *ipx_internal_net; + +extern int ipxrtr_add_route(__be32 network, struct ipx_interface *intrfc, + unsigned char *node); +extern void ipxrtr_del_routes(struct ipx_interface *intrfc); +extern int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx, + struct iovec *iov, size_t len, int noblock); +extern int ipxrtr_route_skb(struct sk_buff *skb); +extern struct ipx_route *ipxrtr_lookup(__be32 net); +extern int ipxrtr_ioctl(unsigned int cmd, void __user *arg); + +struct ipx_interface *ipx_interfaces_head(void) +{ + struct ipx_interface *rc = NULL; + + if (!list_empty(&ipx_interfaces)) + rc = list_entry(ipx_interfaces.next, + struct ipx_interface, node); + return rc; +} + +static void ipxcfg_set_auto_select(char val) +{ + ipxcfg_auto_select_primary = val; + if (val && !ipx_primary_net) + ipx_primary_net = ipx_interfaces_head(); +} + +static int ipxcfg_get_config_data(struct ipx_config_data __user *arg) +{ + struct ipx_config_data vals; + + vals.ipxcfg_auto_create_interfaces = ipxcfg_auto_create_interfaces; + vals.ipxcfg_auto_select_primary = ipxcfg_auto_select_primary; + + return copy_to_user(arg, &vals, sizeof(vals)) ? -EFAULT : 0; +} + +/* + * Note: Sockets may not be removed _during_ an interrupt or inet_bh + * handler using this technique. They can be added although we do not + * use this facility. 
+ */ + +static void ipx_remove_socket(struct sock *sk) +{ + /* Determine interface with which socket is associated */ + struct ipx_interface *intrfc = ipx_sk(sk)->intrfc; + + if (!intrfc) + goto out; + + ipxitf_hold(intrfc); + spin_lock_bh(&intrfc->if_sklist_lock); + sk_del_node_init(sk); + spin_unlock_bh(&intrfc->if_sklist_lock); + ipxitf_put(intrfc); +out: + return; +} + +static void ipx_destroy_socket(struct sock *sk) +{ + ipx_remove_socket(sk); + skb_queue_purge(&sk->sk_receive_queue); + sk_refcnt_debug_dec(sk); +} + +/* + * The following code is used to support IPX Interfaces (IPXITF). An + * IPX interface is defined by a physical device and a frame type. + */ + +/* ipxitf_clear_primary_net has to be called with ipx_interfaces_lock held */ + +static void ipxitf_clear_primary_net(void) +{ + ipx_primary_net = NULL; + if (ipxcfg_auto_select_primary) + ipx_primary_net = ipx_interfaces_head(); +} + +static struct ipx_interface *__ipxitf_find_using_phys(struct net_device *dev, + __be16 datalink) +{ + struct ipx_interface *i; + + list_for_each_entry(i, &ipx_interfaces, node) + if (i->if_dev == dev && i->if_dlink_type == datalink) + goto out; + i = NULL; +out: + return i; +} + +static struct ipx_interface *ipxitf_find_using_phys(struct net_device *dev, + __be16 datalink) +{ + struct ipx_interface *i; + + spin_lock_bh(&ipx_interfaces_lock); + i = __ipxitf_find_using_phys(dev, datalink); + if (i) + ipxitf_hold(i); + spin_unlock_bh(&ipx_interfaces_lock); + return i; +} + +struct ipx_interface *ipxitf_find_using_net(__be32 net) +{ + struct ipx_interface *i; + + spin_lock_bh(&ipx_interfaces_lock); + if (net) { + list_for_each_entry(i, &ipx_interfaces, node) + if (i->if_netnum == net) + goto hold; + i = NULL; + goto unlock; + } + + i = ipx_primary_net; + if (i) +hold: + ipxitf_hold(i); +unlock: + spin_unlock_bh(&ipx_interfaces_lock); + return i; +} + +/* Sockets are bound to a particular IPX interface. 
*/ +static void ipxitf_insert_socket(struct ipx_interface *intrfc, struct sock *sk) +{ + ipxitf_hold(intrfc); + spin_lock_bh(&intrfc->if_sklist_lock); + ipx_sk(sk)->intrfc = intrfc; + sk_add_node(sk, &intrfc->if_sklist); + spin_unlock_bh(&intrfc->if_sklist_lock); + ipxitf_put(intrfc); +} + +/* caller must hold intrfc->if_sklist_lock */ +static struct sock *__ipxitf_find_socket(struct ipx_interface *intrfc, + __be16 port) +{ + struct sock *s; + struct hlist_node *node; + + sk_for_each(s, node, &intrfc->if_sklist) + if (ipx_sk(s)->port == port) + goto found; + s = NULL; +found: + return s; +} + +/* caller must hold a reference to intrfc */ +static struct sock *ipxitf_find_socket(struct ipx_interface *intrfc, + __be16 port) +{ + struct sock *s; + + spin_lock_bh(&intrfc->if_sklist_lock); + s = __ipxitf_find_socket(intrfc, port); + if (s) + sock_hold(s); + spin_unlock_bh(&intrfc->if_sklist_lock); + + return s; +} + +#ifdef CONFIG_IPX_INTERN +static struct sock *ipxitf_find_internal_socket(struct ipx_interface *intrfc, + unsigned char *ipx_node, + __be16 port) +{ + struct sock *s; + struct hlist_node *node; + + ipxitf_hold(intrfc); + spin_lock_bh(&intrfc->if_sklist_lock); + + sk_for_each(s, node, &intrfc->if_sklist) { + struct ipx_sock *ipxs = ipx_sk(s); + + if (ipxs->port == port && + !memcmp(ipx_node, ipxs->node, IPX_NODE_LEN)) + goto found; + } + s = NULL; +found: + spin_unlock_bh(&intrfc->if_sklist_lock); + ipxitf_put(intrfc); + return s; +} +#endif + +static void __ipxitf_down(struct ipx_interface *intrfc) +{ + struct sock *s; + struct hlist_node *node, *t; + + /* Delete all routes associated with this interface */ + ipxrtr_del_routes(intrfc); + + spin_lock_bh(&intrfc->if_sklist_lock); + /* error sockets */ + sk_for_each_safe(s, node, t, &intrfc->if_sklist) { + struct ipx_sock *ipxs = ipx_sk(s); + + s->sk_err = ENOLINK; + s->sk_error_report(s); + ipxs->intrfc = NULL; + ipxs->port = 0; + sock_set_flag(s, SOCK_ZAPPED); /* Indicates it is no longer bound */ + sk_del_node_init(s); + } + INIT_HLIST_HEAD(&intrfc->if_sklist); + spin_unlock_bh(&intrfc->if_sklist_lock); + + /* remove this interface from list */ + list_del(&intrfc->node); + + /* remove this interface from *special* networks */ + if (intrfc == ipx_primary_net) + ipxitf_clear_primary_net(); + if (intrfc == ipx_internal_net) + ipx_internal_net = NULL; + + if (intrfc->if_dev) + dev_put(intrfc->if_dev); + kfree(intrfc); +} + +void ipxitf_down(struct ipx_interface *intrfc) +{ + spin_lock_bh(&ipx_interfaces_lock); + __ipxitf_down(intrfc); + spin_unlock_bh(&ipx_interfaces_lock); +} + +static __inline__ void __ipxitf_put(struct ipx_interface *intrfc) +{ + if (atomic_dec_and_test(&intrfc->refcnt)) + __ipxitf_down(intrfc); +} + +static int ipxitf_device_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct ipx_interface *i, *tmp; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + if (event != NETDEV_DOWN && event != NETDEV_UP) + goto out; + + spin_lock_bh(&ipx_interfaces_lock); + list_for_each_entry_safe(i, tmp, &ipx_interfaces, node) + if (i->if_dev == dev) { + if (event == NETDEV_UP) + ipxitf_hold(i); + else + __ipxitf_put(i); + } + spin_unlock_bh(&ipx_interfaces_lock); +out: + return NOTIFY_DONE; +} + + +static __exit void ipxitf_cleanup(void) +{ + struct ipx_interface *i, *tmp; + + spin_lock_bh(&ipx_interfaces_lock); + list_for_each_entry_safe(i, tmp, &ipx_interfaces, node) + __ipxitf_put(i); + spin_unlock_bh(&ipx_interfaces_lock); +} + +static void 
ipxitf_def_skb_handler(struct sock *sock, struct sk_buff *skb) +{ + if (sock_queue_rcv_skb(sock, skb) < 0) + kfree_skb(skb); +} + +/* + * On input skb->sk is NULL. Nobody is charged for the memory. + */ + +/* caller must hold a reference to intrfc */ + +#ifdef CONFIG_IPX_INTERN +static int ipxitf_demux_socket(struct ipx_interface *intrfc, + struct sk_buff *skb, int copy) +{ + struct ipxhdr *ipx = ipx_hdr(skb); + int is_broadcast = !memcmp(ipx->ipx_dest.node, ipx_broadcast_node, + IPX_NODE_LEN); + struct sock *s; + struct hlist_node *node; + int rc; + + spin_lock_bh(&intrfc->if_sklist_lock); + + sk_for_each(s, node, &intrfc->if_sklist) { + struct ipx_sock *ipxs = ipx_sk(s); + + if (ipxs->port == ipx->ipx_dest.sock && + (is_broadcast || !memcmp(ipx->ipx_dest.node, + ipxs->node, IPX_NODE_LEN))) { + /* We found a socket to which to send */ + struct sk_buff *skb1; + + if (copy) { + skb1 = skb_clone(skb, GFP_ATOMIC); + rc = -ENOMEM; + if (!skb1) + goto out; + } else { + skb1 = skb; + copy = 1; /* skb may only be used once */ + } + ipxitf_def_skb_handler(s, skb1); + + /* On an external interface, one socket can listen */ + if (intrfc != ipx_internal_net) + break; + } + } + + /* skb was solely for us, and we did not make a copy, so free it. */ + if (!copy) + kfree_skb(skb); + + rc = 0; +out: + spin_unlock_bh(&intrfc->if_sklist_lock); + return rc; +} +#else +static struct sock *ncp_connection_hack(struct ipx_interface *intrfc, + struct ipxhdr *ipx) +{ + /* The packet's target is a NCP connection handler. We want to hand it + * to the correct socket directly within the kernel, so that the + * mars_nwe packet distribution process does not have to do it. Here we + * only care about NCP and BURST packets. + * + * You might call this a hack, but believe me, you do not want a + * complete NCP layer in the kernel, and this is VERY fast as well. */ + struct sock *sk = NULL; + int connection = 0; + u8 *ncphdr = (u8 *)(ipx + 1); + + if (*ncphdr == 0x22 && *(ncphdr + 1) == 0x22) /* NCP request */ + connection = (((int) *(ncphdr + 5)) << 8) | (int) *(ncphdr + 3); + else if (*ncphdr == 0x77 && *(ncphdr + 1) == 0x77) /* BURST packet */ + connection = (((int) *(ncphdr + 9)) << 8) | (int) *(ncphdr + 8); + + if (connection) { + struct hlist_node *node; + /* Now we have to look for a special NCP connection handling + * socket. Only these sockets have ipx_ncp_conn != 0, set by + * SIOCIPXNCPCONN. */ + spin_lock_bh(&intrfc->if_sklist_lock); + sk_for_each(sk, node, &intrfc->if_sklist) + if (ipx_sk(sk)->ipx_ncp_conn == connection) { + sock_hold(sk); + goto found; + } + sk = NULL; + found: + spin_unlock_bh(&intrfc->if_sklist_lock); + } + return sk; +} + +static int ipxitf_demux_socket(struct ipx_interface *intrfc, + struct sk_buff *skb, int copy) +{ + struct ipxhdr *ipx = ipx_hdr(skb); + struct sock *sock1 = NULL, *sock2 = NULL; + struct sk_buff *skb1 = NULL, *skb2 = NULL; + int rc; + + if (intrfc == ipx_primary_net && ntohs(ipx->ipx_dest.sock) == 0x451) + sock1 = ncp_connection_hack(intrfc, ipx); + if (!sock1) + /* No special socket found, forward the packet the normal way */ + sock1 = ipxitf_find_socket(intrfc, ipx->ipx_dest.sock); + + /* + * We need to check if there is a primary net and if + * this is addressed to one of the *SPECIAL* sockets because + * these need to be propagated to the primary net. + * The *SPECIAL* socket list contains: 0x452(SAP), 0x453(RIP) and + * 0x456(Diagnostic). 
+ */ + + if (ipx_primary_net && intrfc != ipx_primary_net) { + const int dsock = ntohs(ipx->ipx_dest.sock); + + if (dsock == 0x452 || dsock == 0x453 || dsock == 0x456) + /* The appropriate thing to do here is to dup the + * packet and route to the primary net interface via + * ipxitf_send; however, we'll cheat and just demux it + * here. */ + sock2 = ipxitf_find_socket(ipx_primary_net, + ipx->ipx_dest.sock); + } + + /* + * If there is nothing to do return. The kfree will cancel any charging. + */ + rc = 0; + if (!sock1 && !sock2) { + if (!copy) + kfree_skb(skb); + goto out; + } + + /* + * This next segment of code is a little awkward, but it sets it up + * so that the appropriate number of copies of the SKB are made and + * that skb1 and skb2 point to it (them) so that it (they) can be + * demuxed to sock1 and/or sock2. If we are unable to make enough + * copies, we do as much as is possible. + */ + + if (copy) + skb1 = skb_clone(skb, GFP_ATOMIC); + else + skb1 = skb; + + rc = -ENOMEM; + if (!skb1) + goto out_put; + + /* Do we need 2 SKBs? */ + if (sock1 && sock2) + skb2 = skb_clone(skb1, GFP_ATOMIC); + else + skb2 = skb1; + + if (sock1) + ipxitf_def_skb_handler(sock1, skb1); + + if (!skb2) + goto out_put; + + if (sock2) + ipxitf_def_skb_handler(sock2, skb2); + + rc = 0; +out_put: + if (sock1) + sock_put(sock1); + if (sock2) + sock_put(sock2); +out: + return rc; +} +#endif /* CONFIG_IPX_INTERN */ + +static struct sk_buff *ipxitf_adjust_skbuff(struct ipx_interface *intrfc, + struct sk_buff *skb) +{ + struct sk_buff *skb2; + int in_offset = (unsigned char *)ipx_hdr(skb) - skb->head; + int out_offset = intrfc->if_ipx_offset; + int len; + + /* Hopefully, most cases */ + if (in_offset >= out_offset) + return skb; + + /* Need new SKB */ + len = skb->len + out_offset; + skb2 = alloc_skb(len, GFP_ATOMIC); + if (skb2) { + skb_reserve(skb2, out_offset); + skb_reset_network_header(skb2); + skb_reset_transport_header(skb2); + skb_put(skb2, skb->len); + memcpy(ipx_hdr(skb2), ipx_hdr(skb), skb->len); + memcpy(skb2->cb, skb->cb, sizeof(skb->cb)); + } + kfree_skb(skb); + return skb2; +} + +/* caller must hold a reference to intrfc and the skb has to be unshared */ +int ipxitf_send(struct ipx_interface *intrfc, struct sk_buff *skb, char *node) +{ + struct ipxhdr *ipx = ipx_hdr(skb); + struct net_device *dev = intrfc->if_dev; + struct datalink_proto *dl = intrfc->if_dlink; + char dest_node[IPX_NODE_LEN]; + int send_to_wire = 1; + int addr_len; + + ipx->ipx_tctrl = IPX_SKB_CB(skb)->ipx_tctrl; + ipx->ipx_dest.net = IPX_SKB_CB(skb)->ipx_dest_net; + ipx->ipx_source.net = IPX_SKB_CB(skb)->ipx_source_net; + + /* see if we need to include the netnum in the route list */ + if (IPX_SKB_CB(skb)->last_hop.index >= 0) { + __be32 *last_hop = (__be32 *)(((u8 *) skb->data) + + sizeof(struct ipxhdr) + + IPX_SKB_CB(skb)->last_hop.index * + sizeof(__be32)); + *last_hop = IPX_SKB_CB(skb)->last_hop.netnum; + IPX_SKB_CB(skb)->last_hop.index = -1; + } + + /* + * We need to know how many skbuffs it will take to send out this + * packet to avoid unnecessary copies. + */ + + if (!dl || !dev || dev->flags & IFF_LOOPBACK) + send_to_wire = 0; /* No non looped */ + + /* + * See if this should be demuxed to sockets on this interface + * + * We want to ensure the original was eaten or that we only use + * up clones. + */ + + if (ipx->ipx_dest.net == intrfc->if_netnum) { + /* + * To our own node, loop and free the original. + * The internal net will receive on all node address. 
+ */ + if (intrfc == ipx_internal_net || + !memcmp(intrfc->if_node, node, IPX_NODE_LEN)) { + /* Don't charge sender */ + skb_orphan(skb); + + /* Will charge receiver */ + return ipxitf_demux_socket(intrfc, skb, 0); + } + + /* Broadcast, loop and possibly keep to send on. */ + if (!memcmp(ipx_broadcast_node, node, IPX_NODE_LEN)) { + if (!send_to_wire) + skb_orphan(skb); + ipxitf_demux_socket(intrfc, skb, send_to_wire); + if (!send_to_wire) + goto out; + } + } + + /* + * If the originating net is not equal to our net; this is routed + * We are still charging the sender. Which is right - the driver + * free will handle this fairly. + */ + if (ipx->ipx_source.net != intrfc->if_netnum) { + /* + * Unshare the buffer before modifying the count in + * case it's a flood or tcpdump + */ + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + goto out; + if (++ipx->ipx_tctrl > ipxcfg_max_hops) + send_to_wire = 0; + } + + if (!send_to_wire) { + kfree_skb(skb); + goto out; + } + + /* Determine the appropriate hardware address */ + addr_len = dev->addr_len; + if (!memcmp(ipx_broadcast_node, node, IPX_NODE_LEN)) + memcpy(dest_node, dev->broadcast, addr_len); + else + memcpy(dest_node, &(node[IPX_NODE_LEN-addr_len]), addr_len); + + /* Make any compensation for differing physical/data link size */ + skb = ipxitf_adjust_skbuff(intrfc, skb); + if (!skb) + goto out; + + /* set up data link and physical headers */ + skb->dev = dev; + skb->protocol = htons(ETH_P_IPX); + + /* Send it out */ + dl->request(dl, skb, dest_node); +out: + return 0; +} + +static int ipxitf_add_local_route(struct ipx_interface *intrfc) +{ + return ipxrtr_add_route(intrfc->if_netnum, intrfc, NULL); +} + +static void ipxitf_discover_netnum(struct ipx_interface *intrfc, + struct sk_buff *skb); +static int ipxitf_pprop(struct ipx_interface *intrfc, struct sk_buff *skb); + +static int ipxitf_rcv(struct ipx_interface *intrfc, struct sk_buff *skb) +{ + struct ipxhdr *ipx = ipx_hdr(skb); + int rc = 0; + + ipxitf_hold(intrfc); + + /* See if we should update our network number */ + if (!intrfc->if_netnum) /* net number of intrfc not known yet */ + ipxitf_discover_netnum(intrfc, skb); + + IPX_SKB_CB(skb)->last_hop.index = -1; + if (ipx->ipx_type == IPX_TYPE_PPROP) { + rc = ipxitf_pprop(intrfc, skb); + if (rc) + goto out_free_skb; + } + + /* local processing follows */ + if (!IPX_SKB_CB(skb)->ipx_dest_net) + IPX_SKB_CB(skb)->ipx_dest_net = intrfc->if_netnum; + if (!IPX_SKB_CB(skb)->ipx_source_net) + IPX_SKB_CB(skb)->ipx_source_net = intrfc->if_netnum; + + /* it doesn't make sense to route a pprop packet, there's no meaning + * in the ipx_dest_net for such packets */ + if (ipx->ipx_type != IPX_TYPE_PPROP && + intrfc->if_netnum != IPX_SKB_CB(skb)->ipx_dest_net) { + /* We only route point-to-point packets. 
*/ + if (skb->pkt_type == PACKET_HOST) { + skb = skb_unshare(skb, GFP_ATOMIC); + if (skb) + rc = ipxrtr_route_skb(skb); + goto out_intrfc; + } + + goto out_free_skb; + } + + /* see if we should keep it */ + if (!memcmp(ipx_broadcast_node, ipx->ipx_dest.node, IPX_NODE_LEN) || + !memcmp(intrfc->if_node, ipx->ipx_dest.node, IPX_NODE_LEN)) { + rc = ipxitf_demux_socket(intrfc, skb, 0); + goto out_intrfc; + } + + /* we couldn't pawn it off so unload it */ +out_free_skb: + kfree_skb(skb); +out_intrfc: + ipxitf_put(intrfc); + return rc; +} + +static void ipxitf_discover_netnum(struct ipx_interface *intrfc, + struct sk_buff *skb) +{ + const struct ipx_cb *cb = IPX_SKB_CB(skb); + + /* see if this is an intra packet: source_net == dest_net */ + if (cb->ipx_source_net == cb->ipx_dest_net && cb->ipx_source_net) { + struct ipx_interface *i = + ipxitf_find_using_net(cb->ipx_source_net); + /* NB: NetWare servers lie about their hop count so we + * dropped the test based on it. This is the best way + * to determine this is a 0 hop count packet. */ + if (!i) { + intrfc->if_netnum = cb->ipx_source_net; + ipxitf_add_local_route(intrfc); + } else { + printk(KERN_WARNING "IPX: Network number collision " + "%lx\n %s %s and %s %s\n", + (unsigned long) ntohl(cb->ipx_source_net), + ipx_device_name(i), + ipx_frame_name(i->if_dlink_type), + ipx_device_name(intrfc), + ipx_frame_name(intrfc->if_dlink_type)); + ipxitf_put(i); + } + } +} + +/** + * ipxitf_pprop - Process packet propagation IPX packet type 0x14, used for + * NetBIOS broadcasts + * @intrfc: IPX interface receiving this packet + * @skb: Received packet + * + * Checks if packet is valid: if its more than %IPX_MAX_PPROP_HOPS hops or if it + * is smaller than a IPX header + the room for %IPX_MAX_PPROP_HOPS hops we drop + * it, not even processing it locally, if it has exact %IPX_MAX_PPROP_HOPS we + * don't broadcast it, but process it locally. See chapter 5 of Novell's "IPX + * RIP and SAP Router Specification", Part Number 107-000029-001. + * + * If it is valid, check if we have pprop broadcasting enabled by the user, + * if not, just return zero for local processing. + * + * If it is enabled check the packet and don't broadcast it if we have already + * seen this packet. + * + * Broadcast: send it to the interfaces that aren't on the packet visited nets + * array, just after the IPX header. + * + * Returns -EINVAL for invalid packets, so that the calling function drops + * the packet without local processing. 0 if packet is to be locally processed. + */ +static int ipxitf_pprop(struct ipx_interface *intrfc, struct sk_buff *skb) +{ + struct ipxhdr *ipx = ipx_hdr(skb); + int i, rc = -EINVAL; + struct ipx_interface *ifcs; + char *c; + __be32 *l; + + /* Illegal packet - too many hops or too short */ + /* We decide to throw it away: no broadcasting, no local processing. + * NetBIOS unaware implementations route them as normal packets - + * tctrl <= 15, any data payload... */ + if (IPX_SKB_CB(skb)->ipx_tctrl > IPX_MAX_PPROP_HOPS || + ntohs(ipx->ipx_pktsize) < sizeof(struct ipxhdr) + + IPX_MAX_PPROP_HOPS * sizeof(u32)) + goto out; + /* are we broadcasting this damn thing? */ + rc = 0; + if (!sysctl_ipx_pprop_broadcasting) + goto out; + /* We do broadcast packet on the IPX_MAX_PPROP_HOPS hop, but we + * process it locally. All previous hops broadcasted it, and process it + * locally. 
*/ + if (IPX_SKB_CB(skb)->ipx_tctrl == IPX_MAX_PPROP_HOPS) + goto out; + + c = ((u8 *) ipx) + sizeof(struct ipxhdr); + l = (__be32 *) c; + + /* Don't broadcast packet if already seen this net */ + for (i = 0; i < IPX_SKB_CB(skb)->ipx_tctrl; i++) + if (*l++ == intrfc->if_netnum) + goto out; + + /* < IPX_MAX_PPROP_HOPS hops && input interface not in list. Save the + * position where we will insert recvd netnum into list, later on, + * in ipxitf_send */ + IPX_SKB_CB(skb)->last_hop.index = i; + IPX_SKB_CB(skb)->last_hop.netnum = intrfc->if_netnum; + /* xmit on all other interfaces... */ + spin_lock_bh(&ipx_interfaces_lock); + list_for_each_entry(ifcs, &ipx_interfaces, node) { + /* Except unconfigured interfaces */ + if (!ifcs->if_netnum) + continue; + + /* That aren't in the list */ + if (ifcs == intrfc) + continue; + l = (__be32 *) c; + /* don't consider the last entry in the packet list, + * it is our netnum, and it is not there yet */ + for (i = 0; i < IPX_SKB_CB(skb)->ipx_tctrl; i++) + if (ifcs->if_netnum == *l++) + break; + if (i == IPX_SKB_CB(skb)->ipx_tctrl) { + struct sk_buff *s = skb_copy(skb, GFP_ATOMIC); + + if (s) { + IPX_SKB_CB(s)->ipx_dest_net = ifcs->if_netnum; + ipxrtr_route_skb(s); + } + } + } + spin_unlock_bh(&ipx_interfaces_lock); +out: + return rc; +} + +static void ipxitf_insert(struct ipx_interface *intrfc) +{ + spin_lock_bh(&ipx_interfaces_lock); + list_add_tail(&intrfc->node, &ipx_interfaces); + spin_unlock_bh(&ipx_interfaces_lock); + + if (ipxcfg_auto_select_primary && !ipx_primary_net) + ipx_primary_net = intrfc; +} + +static struct ipx_interface *ipxitf_alloc(struct net_device *dev, __be32 netnum, + __be16 dlink_type, + struct datalink_proto *dlink, + unsigned char internal, + int ipx_offset) +{ + struct ipx_interface *intrfc = kmalloc(sizeof(*intrfc), GFP_ATOMIC); + + if (intrfc) { + intrfc->if_dev = dev; + intrfc->if_netnum = netnum; + intrfc->if_dlink_type = dlink_type; + intrfc->if_dlink = dlink; + intrfc->if_internal = internal; + intrfc->if_ipx_offset = ipx_offset; + intrfc->if_sknum = IPX_MIN_EPHEMERAL_SOCKET; + INIT_HLIST_HEAD(&intrfc->if_sklist); + atomic_set(&intrfc->refcnt, 1); + spin_lock_init(&intrfc->if_sklist_lock); + } + + return intrfc; +} + +static int ipxitf_create_internal(struct ipx_interface_definition *idef) +{ + struct ipx_interface *intrfc; + int rc = -EEXIST; + + /* Only one primary network allowed */ + if (ipx_primary_net) + goto out; + + /* Must have a valid network number */ + rc = -EADDRNOTAVAIL; + if (!idef->ipx_network) + goto out; + intrfc = ipxitf_find_using_net(idef->ipx_network); + rc = -EADDRINUSE; + if (intrfc) { + ipxitf_put(intrfc); + goto out; + } + intrfc = ipxitf_alloc(NULL, idef->ipx_network, 0, NULL, 1, 0); + rc = -EAGAIN; + if (!intrfc) + goto out; + memcpy((char *)&(intrfc->if_node), idef->ipx_node, IPX_NODE_LEN); + ipx_internal_net = ipx_primary_net = intrfc; + ipxitf_hold(intrfc); + ipxitf_insert(intrfc); + + rc = ipxitf_add_local_route(intrfc); + ipxitf_put(intrfc); +out: + return rc; +} + +static __be16 ipx_map_frame_type(unsigned char type) +{ + __be16 rc = 0; + + switch (type) { + case IPX_FRAME_ETHERII: rc = htons(ETH_P_IPX); break; + case IPX_FRAME_8022: rc = htons(ETH_P_802_2); break; + case IPX_FRAME_SNAP: rc = htons(ETH_P_SNAP); break; + case IPX_FRAME_8023: rc = htons(ETH_P_802_3); break; + } + + return rc; +} + +static int ipxitf_create(struct ipx_interface_definition *idef) +{ + struct net_device *dev; + __be16 dlink_type = 0; + struct datalink_proto *datalink = NULL; + struct ipx_interface *intrfc; + int 
rc; + + if (idef->ipx_special == IPX_INTERNAL) { + rc = ipxitf_create_internal(idef); + goto out; + } + + rc = -EEXIST; + if (idef->ipx_special == IPX_PRIMARY && ipx_primary_net) + goto out; + + intrfc = ipxitf_find_using_net(idef->ipx_network); + rc = -EADDRINUSE; + if (idef->ipx_network && intrfc) { + ipxitf_put(intrfc); + goto out; + } + + if (intrfc) + ipxitf_put(intrfc); + + dev = dev_get_by_name(&init_net, idef->ipx_device); + rc = -ENODEV; + if (!dev) + goto out; + + switch (idef->ipx_dlink_type) { + case IPX_FRAME_TR_8022: + printk(KERN_WARNING "IPX frame type 802.2TR is " + "obsolete Use 802.2 instead.\n"); + /* fall through */ + case IPX_FRAME_8022: + dlink_type = htons(ETH_P_802_2); + datalink = p8022_datalink; + break; + case IPX_FRAME_ETHERII: + if (dev->type != ARPHRD_IEEE802) { + dlink_type = htons(ETH_P_IPX); + datalink = pEII_datalink; + break; + } else + printk(KERN_WARNING "IPX frame type EtherII over " + "token-ring is obsolete. Use SNAP " + "instead.\n"); + /* fall through */ + case IPX_FRAME_SNAP: + dlink_type = htons(ETH_P_SNAP); + datalink = pSNAP_datalink; + break; + case IPX_FRAME_8023: + dlink_type = htons(ETH_P_802_3); + datalink = p8023_datalink; + break; + case IPX_FRAME_NONE: + default: + rc = -EPROTONOSUPPORT; + goto out_dev; + } + + rc = -ENETDOWN; + if (!(dev->flags & IFF_UP)) + goto out_dev; + + /* Check addresses are suitable */ + rc = -EINVAL; + if (dev->addr_len > IPX_NODE_LEN) + goto out_dev; + + intrfc = ipxitf_find_using_phys(dev, dlink_type); + if (!intrfc) { + /* Ok now create */ + intrfc = ipxitf_alloc(dev, idef->ipx_network, dlink_type, + datalink, 0, dev->hard_header_len + + datalink->header_length); + rc = -EAGAIN; + if (!intrfc) + goto out_dev; + /* Setup primary if necessary */ + if (idef->ipx_special == IPX_PRIMARY) + ipx_primary_net = intrfc; + if (!memcmp(idef->ipx_node, "\000\000\000\000\000\000", + IPX_NODE_LEN)) { + memset(intrfc->if_node, 0, IPX_NODE_LEN); + memcpy(intrfc->if_node + IPX_NODE_LEN - dev->addr_len, + dev->dev_addr, dev->addr_len); + } else + memcpy(intrfc->if_node, idef->ipx_node, IPX_NODE_LEN); + ipxitf_hold(intrfc); + ipxitf_insert(intrfc); + } + + + /* If the network number is known, add a route */ + rc = 0; + if (!intrfc->if_netnum) + goto out_intrfc; + + rc = ipxitf_add_local_route(intrfc); +out_intrfc: + ipxitf_put(intrfc); + goto out; +out_dev: + dev_put(dev); +out: + return rc; +} + +static int ipxitf_delete(struct ipx_interface_definition *idef) +{ + struct net_device *dev = NULL; + __be16 dlink_type = 0; + struct ipx_interface *intrfc; + int rc = 0; + + spin_lock_bh(&ipx_interfaces_lock); + if (idef->ipx_special == IPX_INTERNAL) { + if (ipx_internal_net) { + __ipxitf_put(ipx_internal_net); + goto out; + } + rc = -ENOENT; + goto out; + } + + dlink_type = ipx_map_frame_type(idef->ipx_dlink_type); + rc = -EPROTONOSUPPORT; + if (!dlink_type) + goto out; + + dev = __dev_get_by_name(&init_net, idef->ipx_device); + rc = -ENODEV; + if (!dev) + goto out; + + intrfc = __ipxitf_find_using_phys(dev, dlink_type); + rc = -EINVAL; + if (!intrfc) + goto out; + __ipxitf_put(intrfc); + + rc = 0; +out: + spin_unlock_bh(&ipx_interfaces_lock); + return rc; +} + +static struct ipx_interface *ipxitf_auto_create(struct net_device *dev, + __be16 dlink_type) +{ + struct ipx_interface *intrfc = NULL; + struct datalink_proto *datalink; + + if (!dev) + goto out; + + /* Check addresses are suitable */ + if (dev->addr_len > IPX_NODE_LEN) + goto out; + + switch (ntohs(dlink_type)) { + case ETH_P_IPX: datalink = pEII_datalink; break; + case 
ETH_P_802_2: datalink = p8022_datalink; break; + case ETH_P_SNAP: datalink = pSNAP_datalink; break; + case ETH_P_802_3: datalink = p8023_datalink; break; + default: goto out; + } + + intrfc = ipxitf_alloc(dev, 0, dlink_type, datalink, 0, + dev->hard_header_len + datalink->header_length); + + if (intrfc) { + memset(intrfc->if_node, 0, IPX_NODE_LEN); + memcpy((char *)&(intrfc->if_node[IPX_NODE_LEN-dev->addr_len]), + dev->dev_addr, dev->addr_len); + spin_lock_init(&intrfc->if_sklist_lock); + atomic_set(&intrfc->refcnt, 1); + ipxitf_insert(intrfc); + dev_hold(dev); + } + +out: + return intrfc; +} + +static int ipxitf_ioctl(unsigned int cmd, void __user *arg) +{ + int rc = -EINVAL; + struct ifreq ifr; + int val; + + switch (cmd) { + case SIOCSIFADDR: { + struct sockaddr_ipx *sipx; + struct ipx_interface_definition f; + + rc = -EFAULT; + if (copy_from_user(&ifr, arg, sizeof(ifr))) + break; + sipx = (struct sockaddr_ipx *)&ifr.ifr_addr; + rc = -EINVAL; + if (sipx->sipx_family != AF_IPX) + break; + f.ipx_network = sipx->sipx_network; + memcpy(f.ipx_device, ifr.ifr_name, + sizeof(f.ipx_device)); + memcpy(f.ipx_node, sipx->sipx_node, IPX_NODE_LEN); + f.ipx_dlink_type = sipx->sipx_type; + f.ipx_special = sipx->sipx_special; + + if (sipx->sipx_action == IPX_DLTITF) + rc = ipxitf_delete(&f); + else + rc = ipxitf_create(&f); + break; + } + case SIOCGIFADDR: { + struct sockaddr_ipx *sipx; + struct ipx_interface *ipxif; + struct net_device *dev; + + rc = -EFAULT; + if (copy_from_user(&ifr, arg, sizeof(ifr))) + break; + sipx = (struct sockaddr_ipx *)&ifr.ifr_addr; + dev = __dev_get_by_name(&init_net, ifr.ifr_name); + rc = -ENODEV; + if (!dev) + break; + ipxif = ipxitf_find_using_phys(dev, + ipx_map_frame_type(sipx->sipx_type)); + rc = -EADDRNOTAVAIL; + if (!ipxif) + break; + + sipx->sipx_family = AF_IPX; + sipx->sipx_network = ipxif->if_netnum; + memcpy(sipx->sipx_node, ipxif->if_node, + sizeof(sipx->sipx_node)); + rc = -EFAULT; + if (copy_to_user(arg, &ifr, sizeof(ifr))) + break; + ipxitf_put(ipxif); + rc = 0; + break; + } + case SIOCAIPXITFCRT: + rc = -EFAULT; + if (get_user(val, (unsigned char __user *) arg)) + break; + rc = 0; + ipxcfg_auto_create_interfaces = val; + break; + case SIOCAIPXPRISLT: + rc = -EFAULT; + if (get_user(val, (unsigned char __user *) arg)) + break; + rc = 0; + ipxcfg_set_auto_select(val); + break; + } + + return rc; +} + +/* + * Checksum routine for IPX + */ + +/* Note: We assume ipx_tctrl==0 and htons(length)==ipx_pktsize */ +/* This functions should *not* mess with packet contents */ + +__be16 ipx_cksum(struct ipxhdr *packet, int length) +{ + /* + * NOTE: sum is a net byte order quantity, which optimizes the + * loop. This only works on big and little endian machines. (I + * don't know of a machine that isn't.) + */ + /* handle the first 3 words separately; checksum should be skipped + * and ipx_tctrl masked out */ + __u16 *p = (__u16 *)packet; + __u32 sum = p[1] + (p[2] & (__force u16)htons(0x00ff)); + __u32 i = (length >> 1) - 3; /* Number of remaining complete words */ + + /* Loop through them */ + p += 3; + while (i--) + sum += *p++; + + /* Add on the last part word if it exists */ + if (packet->ipx_pktsize & htons(1)) + sum += (__force u16)htons(0xff00) & *p; + + /* Do final fixup */ + sum = (sum & 0xffff) + (sum >> 16); + + /* It's a pity there's no concept of carry in C */ + if (sum >= 0x10000) + sum++; + + /* + * Leave 0 alone; we don't want 0xffff here. 
Note that we can't get + * here with 0x10000, so this check is the same as ((__u16)sum) + */ + if (sum) + sum = ~sum; + + return (__force __be16)sum; +} + +const char *ipx_frame_name(__be16 frame) +{ + char* rc = "None"; + + switch (ntohs(frame)) { + case ETH_P_IPX: rc = "EtherII"; break; + case ETH_P_802_2: rc = "802.2"; break; + case ETH_P_SNAP: rc = "SNAP"; break; + case ETH_P_802_3: rc = "802.3"; break; + case ETH_P_TR_802_2: rc = "802.2TR"; break; + } + + return rc; +} + +const char *ipx_device_name(struct ipx_interface *intrfc) +{ + return intrfc->if_internal ? "Internal" : + intrfc->if_dev ? intrfc->if_dev->name : "Unknown"; +} + +/* Handling for system calls applied via the various interfaces to an IPX + * socket object. */ + +static int ipx_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + int opt; + int rc = -EINVAL; + + lock_sock(sk); + if (optlen != sizeof(int)) + goto out; + + rc = -EFAULT; + if (get_user(opt, (unsigned int __user *)optval)) + goto out; + + rc = -ENOPROTOOPT; + if (!(level == SOL_IPX && optname == IPX_TYPE)) + goto out; + + ipx_sk(sk)->type = opt; + rc = 0; +out: + release_sock(sk); + return rc; +} + +static int ipx_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + int val = 0; + int len; + int rc = -ENOPROTOOPT; + + lock_sock(sk); + if (!(level == SOL_IPX && optname == IPX_TYPE)) + goto out; + + val = ipx_sk(sk)->type; + + rc = -EFAULT; + if (get_user(len, optlen)) + goto out; + + len = min_t(unsigned int, len, sizeof(int)); + rc = -EINVAL; + if(len < 0) + goto out; + + rc = -EFAULT; + if (put_user(len, optlen) || copy_to_user(optval, &val, len)) + goto out; + + rc = 0; +out: + release_sock(sk); + return rc; +} + +static struct proto ipx_proto = { + .name = "IPX", + .owner = THIS_MODULE, + .obj_size = sizeof(struct ipx_sock), +}; + +static int ipx_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + int rc = -ESOCKTNOSUPPORT; + struct sock *sk; + + if (!net_eq(net, &init_net)) + return -EAFNOSUPPORT; + + /* + * SPX support is not anymore in the kernel sources. If you want to + * ressurrect it, completing it and making it understand shared skbs, + * be fully multithreaded, etc, grab the sources in an early 2.5 kernel + * tree. 
+ */ + if (sock->type != SOCK_DGRAM) + goto out; + + rc = -ENOMEM; + sk = sk_alloc(net, PF_IPX, GFP_KERNEL, &ipx_proto); + if (!sk) + goto out; + + sk_refcnt_debug_inc(sk); + sock_init_data(sock, sk); + sk->sk_no_check = 1; /* Checksum off by default */ + sock->ops = &ipx_dgram_ops; + rc = 0; +out: + return rc; +} + +static int ipx_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (!sk) + goto out; + + lock_sock(sk); + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + + sock_set_flag(sk, SOCK_DEAD); + sock->sk = NULL; + sk_refcnt_debug_release(sk); + ipx_destroy_socket(sk); + release_sock(sk); + sock_put(sk); +out: + return 0; +} + +/* caller must hold a reference to intrfc */ + +static __be16 ipx_first_free_socketnum(struct ipx_interface *intrfc) +{ + unsigned short socketNum = intrfc->if_sknum; + + spin_lock_bh(&intrfc->if_sklist_lock); + + if (socketNum < IPX_MIN_EPHEMERAL_SOCKET) + socketNum = IPX_MIN_EPHEMERAL_SOCKET; + + while (__ipxitf_find_socket(intrfc, htons(socketNum))) + if (socketNum > IPX_MAX_EPHEMERAL_SOCKET) + socketNum = IPX_MIN_EPHEMERAL_SOCKET; + else + socketNum++; + + spin_unlock_bh(&intrfc->if_sklist_lock); + intrfc->if_sknum = socketNum; + + return htons(socketNum); +} + +static int __ipx_bind(struct socket *sock, + struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct ipx_sock *ipxs = ipx_sk(sk); + struct ipx_interface *intrfc; + struct sockaddr_ipx *addr = (struct sockaddr_ipx *)uaddr; + int rc = -EINVAL; + + if (!sock_flag(sk, SOCK_ZAPPED) || addr_len != sizeof(struct sockaddr_ipx)) + goto out; + + intrfc = ipxitf_find_using_net(addr->sipx_network); + rc = -EADDRNOTAVAIL; + if (!intrfc) + goto out; + + if (!addr->sipx_port) { + addr->sipx_port = ipx_first_free_socketnum(intrfc); + rc = -EINVAL; + if (!addr->sipx_port) + goto out_put; + } + + /* protect IPX system stuff like routing/sap */ + rc = -EACCES; + if (ntohs(addr->sipx_port) < IPX_MIN_EPHEMERAL_SOCKET && + !capable(CAP_NET_ADMIN)) + goto out_put; + + ipxs->port = addr->sipx_port; + +#ifdef CONFIG_IPX_INTERN + if (intrfc == ipx_internal_net) { + /* The source address is to be set explicitly if the + * socket is to be bound on the internal network. If a + * node number 0 was specified, the default is used. + */ + + rc = -EINVAL; + if (!memcmp(addr->sipx_node, ipx_broadcast_node, IPX_NODE_LEN)) + goto out_put; + if (!memcmp(addr->sipx_node, ipx_this_node, IPX_NODE_LEN)) + memcpy(ipxs->node, intrfc->if_node, IPX_NODE_LEN); + else + memcpy(ipxs->node, addr->sipx_node, IPX_NODE_LEN); + + rc = -EADDRINUSE; + if (ipxitf_find_internal_socket(intrfc, ipxs->node, + ipxs->port)) { + SOCK_DEBUG(sk, + "IPX: bind failed because port %X in use.\n", + ntohs(addr->sipx_port)); + goto out_put; + } + } else { + /* Source addresses are easy. It must be our + * network:node pair for an interface routed to IPX + * with the ipx routing ioctl() + */ + + memcpy(ipxs->node, intrfc->if_node, IPX_NODE_LEN); + + rc = -EADDRINUSE; + if (ipxitf_find_socket(intrfc, addr->sipx_port)) { + SOCK_DEBUG(sk, + "IPX: bind failed because port %X in use.\n", + ntohs(addr->sipx_port)); + goto out_put; + } + } + +#else /* !def CONFIG_IPX_INTERN */ + + /* Source addresses are easy. 
It must be our network:node pair for + an interface routed to IPX with the ipx routing ioctl() */ + + rc = -EADDRINUSE; + if (ipxitf_find_socket(intrfc, addr->sipx_port)) { + SOCK_DEBUG(sk, "IPX: bind failed because port %X in use.\n", + ntohs((int)addr->sipx_port)); + goto out_put; + } + +#endif /* CONFIG_IPX_INTERN */ + + ipxitf_insert_socket(intrfc, sk); + sock_reset_flag(sk, SOCK_ZAPPED); + + rc = 0; +out_put: + ipxitf_put(intrfc); +out: + return rc; +} + +static int ipx_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + int rc; + + lock_sock(sk); + rc = __ipx_bind(sock, uaddr, addr_len); + release_sock(sk); + + return rc; +} + +static int ipx_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + struct ipx_sock *ipxs = ipx_sk(sk); + struct sockaddr_ipx *addr; + int rc = -EINVAL; + struct ipx_route *rt; + + sk->sk_state = TCP_CLOSE; + sock->state = SS_UNCONNECTED; + + lock_sock(sk); + if (addr_len != sizeof(*addr)) + goto out; + addr = (struct sockaddr_ipx *)uaddr; + + /* put the autobinding in */ + if (!ipxs->port) { + struct sockaddr_ipx uaddr; + + uaddr.sipx_port = 0; + uaddr.sipx_network = 0; + +#ifdef CONFIG_IPX_INTERN + rc = -ENETDOWN; + if (!ipxs->intrfc) + goto out; /* Someone zonked the iface */ + memcpy(uaddr.sipx_node, ipxs->intrfc->if_node, + IPX_NODE_LEN); +#endif /* CONFIG_IPX_INTERN */ + + rc = __ipx_bind(sock, (struct sockaddr *)&uaddr, + sizeof(struct sockaddr_ipx)); + if (rc) + goto out; + } + + /* We can either connect to primary network or somewhere + * we can route to */ + rt = ipxrtr_lookup(addr->sipx_network); + rc = -ENETUNREACH; + if (!rt && !(!addr->sipx_network && ipx_primary_net)) + goto out; + + ipxs->dest_addr.net = addr->sipx_network; + ipxs->dest_addr.sock = addr->sipx_port; + memcpy(ipxs->dest_addr.node, addr->sipx_node, IPX_NODE_LEN); + ipxs->type = addr->sipx_type; + + if (sock->type == SOCK_DGRAM) { + sock->state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; + } + + if (rt) + ipxrtr_put(rt); + rc = 0; +out: + release_sock(sk); + return rc; +} + + +static int ipx_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct ipx_address *addr; + struct sockaddr_ipx sipx; + struct sock *sk = sock->sk; + struct ipx_sock *ipxs = ipx_sk(sk); + int rc; + + *uaddr_len = sizeof(struct sockaddr_ipx); + + lock_sock(sk); + if (peer) { + rc = -ENOTCONN; + if (sk->sk_state != TCP_ESTABLISHED) + goto out; + + addr = &ipxs->dest_addr; + sipx.sipx_network = addr->net; + sipx.sipx_port = addr->sock; + memcpy(sipx.sipx_node, addr->node, IPX_NODE_LEN); + } else { + if (ipxs->intrfc) { + sipx.sipx_network = ipxs->intrfc->if_netnum; +#ifdef CONFIG_IPX_INTERN + memcpy(sipx.sipx_node, ipxs->node, IPX_NODE_LEN); +#else + memcpy(sipx.sipx_node, ipxs->intrfc->if_node, + IPX_NODE_LEN); +#endif /* CONFIG_IPX_INTERN */ + + } else { + sipx.sipx_network = 0; + memset(sipx.sipx_node, '\0', IPX_NODE_LEN); + } + + sipx.sipx_port = ipxs->port; + } + + sipx.sipx_family = AF_IPX; + sipx.sipx_type = ipxs->type; + sipx.sipx_zero = 0; + memcpy(uaddr, &sipx, sizeof(sipx)); + + rc = 0; +out: + release_sock(sk); + return rc; +} + +static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +{ + /* NULL here for pt means the packet was looped back */ + struct ipx_interface *intrfc; + struct ipxhdr *ipx; + u16 ipx_pktsize; + int rc = 0; + + if (!net_eq(dev_net(dev), &init_net)) + goto drop; + + /* 
Not ours */ + if (skb->pkt_type == PACKET_OTHERHOST) + goto drop; + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + goto out; + + if (!pskb_may_pull(skb, sizeof(struct ipxhdr))) + goto drop; + + ipx_pktsize = ntohs(ipx_hdr(skb)->ipx_pktsize); + + /* Too small or invalid header? */ + if (ipx_pktsize < sizeof(struct ipxhdr) || + !pskb_may_pull(skb, ipx_pktsize)) + goto drop; + + ipx = ipx_hdr(skb); + if (ipx->ipx_checksum != IPX_NO_CHECKSUM && + ipx->ipx_checksum != ipx_cksum(ipx, ipx_pktsize)) + goto drop; + + IPX_SKB_CB(skb)->ipx_tctrl = ipx->ipx_tctrl; + IPX_SKB_CB(skb)->ipx_dest_net = ipx->ipx_dest.net; + IPX_SKB_CB(skb)->ipx_source_net = ipx->ipx_source.net; + + /* Determine what local ipx endpoint this is */ + intrfc = ipxitf_find_using_phys(dev, pt->type); + if (!intrfc) { + if (ipxcfg_auto_create_interfaces && + IPX_SKB_CB(skb)->ipx_dest_net) { + intrfc = ipxitf_auto_create(dev, pt->type); + if (intrfc) + ipxitf_hold(intrfc); + } + + if (!intrfc) /* Not one of ours */ + /* or invalid packet for auto creation */ + goto drop; + } + + rc = ipxitf_rcv(intrfc, skb); + ipxitf_put(intrfc); + goto out; +drop: + kfree_skb(skb); +out: + return rc; +} + +static int ipx_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct ipx_sock *ipxs = ipx_sk(sk); + struct sockaddr_ipx *usipx = (struct sockaddr_ipx *)msg->msg_name; + struct sockaddr_ipx local_sipx; + int rc = -EINVAL; + int flags = msg->msg_flags; + + lock_sock(sk); + /* Socket gets bound below anyway */ +/* if (sk->sk_zapped) + return -EIO; */ /* Socket not bound */ + if (flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT)) + goto out; + + /* Max possible packet size limited by 16 bit pktsize in header */ + if (len >= 65535 - sizeof(struct ipxhdr)) + goto out; + + if (usipx) { + if (!ipxs->port) { + struct sockaddr_ipx uaddr; + + uaddr.sipx_port = 0; + uaddr.sipx_network = 0; +#ifdef CONFIG_IPX_INTERN + rc = -ENETDOWN; + if (!ipxs->intrfc) + goto out; /* Someone zonked the iface */ + memcpy(uaddr.sipx_node, ipxs->intrfc->if_node, + IPX_NODE_LEN); +#endif + rc = __ipx_bind(sock, (struct sockaddr *)&uaddr, + sizeof(struct sockaddr_ipx)); + if (rc) + goto out; + } + + rc = -EINVAL; + if (msg->msg_namelen < sizeof(*usipx) || + usipx->sipx_family != AF_IPX) + goto out; + } else { + rc = -ENOTCONN; + if (sk->sk_state != TCP_ESTABLISHED) + goto out; + + usipx = &local_sipx; + usipx->sipx_family = AF_IPX; + usipx->sipx_type = ipxs->type; + usipx->sipx_port = ipxs->dest_addr.sock; + usipx->sipx_network = ipxs->dest_addr.net; + memcpy(usipx->sipx_node, ipxs->dest_addr.node, IPX_NODE_LEN); + } + + rc = ipxrtr_route_packet(sk, usipx, msg->msg_iov, len, + flags & MSG_DONTWAIT); + if (rc >= 0) + rc = len; +out: + release_sock(sk); + return rc; +} + + +static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct ipx_sock *ipxs = ipx_sk(sk); + struct sockaddr_ipx *sipx = (struct sockaddr_ipx *)msg->msg_name; + struct ipxhdr *ipx = NULL; + struct sk_buff *skb; + int copied, rc; + + lock_sock(sk); + /* put the autobinding in */ + if (!ipxs->port) { + struct sockaddr_ipx uaddr; + + uaddr.sipx_port = 0; + uaddr.sipx_network = 0; + +#ifdef CONFIG_IPX_INTERN + rc = -ENETDOWN; + if (!ipxs->intrfc) + goto out; /* Someone zonked the iface */ + memcpy(uaddr.sipx_node, ipxs->intrfc->if_node, IPX_NODE_LEN); +#endif /* CONFIG_IPX_INTERN */ + + rc = __ipx_bind(sock, (struct sockaddr *)&uaddr, + sizeof(struct 
sockaddr_ipx)); + if (rc) + goto out; + } + + rc = -ENOTCONN; + if (sock_flag(sk, SOCK_ZAPPED)) + goto out; + + skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, + flags & MSG_DONTWAIT, &rc); + if (!skb) + goto out; + + ipx = ipx_hdr(skb); + copied = ntohs(ipx->ipx_pktsize) - sizeof(struct ipxhdr); + if (copied > size) { + copied = size; + msg->msg_flags |= MSG_TRUNC; + } + + rc = skb_copy_datagram_iovec(skb, sizeof(struct ipxhdr), msg->msg_iov, + copied); + if (rc) + goto out_free; + if (skb->tstamp.tv64) + sk->sk_stamp = skb->tstamp; + + msg->msg_namelen = sizeof(*sipx); + + if (sipx) { + sipx->sipx_family = AF_IPX; + sipx->sipx_port = ipx->ipx_source.sock; + memcpy(sipx->sipx_node, ipx->ipx_source.node, IPX_NODE_LEN); + sipx->sipx_network = IPX_SKB_CB(skb)->ipx_source_net; + sipx->sipx_type = ipx->ipx_type; + sipx->sipx_zero = 0; + } + rc = copied; + +out_free: + skb_free_datagram(sk, skb); +out: + release_sock(sk); + return rc; +} + + +static int ipx_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + int rc = 0; + long amount = 0; + struct sock *sk = sock->sk; + void __user *argp = (void __user *)arg; + + lock_sock(sk); + switch (cmd) { + case TIOCOUTQ: + amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); + if (amount < 0) + amount = 0; + rc = put_user(amount, (int __user *)argp); + break; + case TIOCINQ: { + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + /* These two are safe on a single CPU system as only + * user tasks fiddle here */ + if (skb) + amount = skb->len - sizeof(struct ipxhdr); + rc = put_user(amount, (int __user *)argp); + break; + } + case SIOCADDRT: + case SIOCDELRT: + rc = -EPERM; + if (capable(CAP_NET_ADMIN)) + rc = ipxrtr_ioctl(cmd, argp); + break; + case SIOCSIFADDR: + case SIOCAIPXITFCRT: + case SIOCAIPXPRISLT: + rc = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + case SIOCGIFADDR: + rc = ipxitf_ioctl(cmd, argp); + break; + case SIOCIPXCFGDATA: + rc = ipxcfg_get_config_data(argp); + break; + case SIOCIPXNCPCONN: + /* + * This socket wants to take care of the NCP connection + * handed to us in arg. + */ + rc = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + rc = get_user(ipx_sk(sk)->ipx_ncp_conn, + (const unsigned short __user *)argp); + break; + case SIOCGSTAMP: + rc = -EINVAL; + if (sk) + rc = sock_get_timestamp(sk, argp); + break; + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + rc = -EINVAL; + break; + default: + rc = -ENOIOCTLCMD; + break; + } + release_sock(sk); + + return rc; +} + + +#ifdef CONFIG_COMPAT +static int ipx_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + /* + * These 4 commands use same structure on 32bit and 64bit. Rest of IPX + * commands is handled by generic ioctl code. As these commands are + * SIOCPROTOPRIVATE..SIOCPROTOPRIVATE+3, they cannot be handled by generic + * code. 
+ */ + switch (cmd) { + case SIOCAIPXITFCRT: + case SIOCAIPXPRISLT: + case SIOCIPXCFGDATA: + case SIOCIPXNCPCONN: + return ipx_ioctl(sock, cmd, arg); + default: + return -ENOIOCTLCMD; + } +} +#endif + + +/* + * Socket family declarations + */ + +static const struct net_proto_family ipx_family_ops = { + .family = PF_IPX, + .create = ipx_create, + .owner = THIS_MODULE, +}; + +static const struct proto_ops ipx_dgram_ops = { + .family = PF_IPX, + .owner = THIS_MODULE, + .release = ipx_release, + .bind = ipx_bind, + .connect = ipx_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = ipx_getname, + .poll = datagram_poll, + .ioctl = ipx_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ipx_compat_ioctl, +#endif + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, /* FIXME: support shutdown */ + .setsockopt = ipx_setsockopt, + .getsockopt = ipx_getsockopt, + .sendmsg = ipx_sendmsg, + .recvmsg = ipx_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct packet_type ipx_8023_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_802_3), + .func = ipx_rcv, +}; + +static struct packet_type ipx_dix_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_IPX), + .func = ipx_rcv, +}; + +static struct notifier_block ipx_dev_notifier = { + .notifier_call = ipxitf_device_event, +}; + +extern struct datalink_proto *make_EII_client(void); +extern void destroy_EII_client(struct datalink_proto *); + +static const unsigned char ipx_8022_type = 0xE0; +static const unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 }; +static const char ipx_EII_err_msg[] __initconst = + KERN_CRIT "IPX: Unable to register with Ethernet II\n"; +static const char ipx_8023_err_msg[] __initconst = + KERN_CRIT "IPX: Unable to register with 802.3\n"; +static const char ipx_llc_err_msg[] __initconst = + KERN_CRIT "IPX: Unable to register with 802.2\n"; +static const char ipx_snap_err_msg[] __initconst = + KERN_CRIT "IPX: Unable to register with SNAP\n"; + +static int __init ipx_init(void) +{ + int rc = proto_register(&ipx_proto, 1); + + if (rc != 0) + goto out; + + sock_register(&ipx_family_ops); + + pEII_datalink = make_EII_client(); + if (pEII_datalink) + dev_add_pack(&ipx_dix_packet_type); + else + printk(ipx_EII_err_msg); + + p8023_datalink = make_8023_client(); + if (p8023_datalink) + dev_add_pack(&ipx_8023_packet_type); + else + printk(ipx_8023_err_msg); + + p8022_datalink = register_8022_client(ipx_8022_type, ipx_rcv); + if (!p8022_datalink) + printk(ipx_llc_err_msg); + + pSNAP_datalink = register_snap_client(ipx_snap_id, ipx_rcv); + if (!pSNAP_datalink) + printk(ipx_snap_err_msg); + + register_netdevice_notifier(&ipx_dev_notifier); + ipx_register_sysctl(); + ipx_proc_init(); +out: + return rc; +} + +static void __exit ipx_proto_finito(void) +{ + ipx_proc_exit(); + ipx_unregister_sysctl(); + + unregister_netdevice_notifier(&ipx_dev_notifier); + + ipxitf_cleanup(); + + if (pSNAP_datalink) { + unregister_snap_client(pSNAP_datalink); + pSNAP_datalink = NULL; + } + + if (p8022_datalink) { + unregister_8022_client(p8022_datalink); + p8022_datalink = NULL; + } + + dev_remove_pack(&ipx_8023_packet_type); + if (p8023_datalink) { + destroy_8023_client(p8023_datalink); + p8023_datalink = NULL; + } + + dev_remove_pack(&ipx_dix_packet_type); + if (pEII_datalink) { + destroy_EII_client(pEII_datalink); + pEII_datalink = NULL; + } + + proto_unregister(&ipx_proto); + sock_unregister(ipx_family_ops.family); +} + +module_init(ipx_init); +module_exit(ipx_proto_finito); 
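One note on ipx_cksum() earlier in this file: it computes the IPX header checksum by skipping the checksum word itself, treating the transport-control byte as zero, summing the remaining 16-bit words with end-around carry and returning the complement, with a zero sum left as zero so the result never collides with the 0xffff "no checksum" marker. The kernel routine sums machine-order words and relies on the carry folding to stay endian-independent, as its own comment notes. The stand-alone sketch below restates the same rule over a plain big-endian byte buffer; word_be() and ipx_cksum_sketch() are hypothetical helpers written for illustration, not code from this patch.

#include <stddef.h>
#include <stdint.h>

static uint16_t word_be(const uint8_t *p)       /* big-endian 16-bit load */
{
        return (uint16_t)((p[0] << 8) | p[1]);
}

static uint16_t ipx_cksum_sketch(const uint8_t *pkt, size_t len)
{
        uint32_t sum = 0;
        size_t i;

        if (len < 30)                           /* shorter than an IPX header */
                return 0;

        /* word 0 is the checksum field and is skipped; in the word at offset
         * 4 the transport-control byte is treated as zero, keeping only the
         * packet-type byte. */
        sum += word_be(pkt + 2);                /* ipx_pktsize */
        sum += word_be(pkt + 4) & 0x00ff;       /* type only, tctrl masked out */

        for (i = 6; i + 1 < len; i += 2)        /* remaining complete words */
                sum += word_be(pkt + i);
        if (len & 1)                            /* trailing odd byte, high half */
                sum += (uint32_t)pkt[len - 1] << 8;

        sum = (sum & 0xffff) + (sum >> 16);     /* fold carries back in */
        sum = (sum & 0xffff) + (sum >> 16);

        return sum ? (uint16_t)~sum : 0;        /* keep 0, never emit 0xffff */
}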
+MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_IPX); diff --git a/net/ipx/ipx_proc.c b/net/ipx/ipx_proc.c new file mode 100644 index 00000000..26b5bfcf --- /dev/null +++ b/net/ipx/ipx_proc.c @@ -0,0 +1,339 @@ +/* + * IPX proc routines + * + * Copyright(C) Arnaldo Carvalho de Melo , 2002 + */ + +#include +#ifdef CONFIG_PROC_FS +#include +#include +#include +#include +#include +#include + +static void *ipx_seq_interface_start(struct seq_file *seq, loff_t *pos) +{ + spin_lock_bh(&ipx_interfaces_lock); + return seq_list_start_head(&ipx_interfaces, *pos); +} + +static void *ipx_seq_interface_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return seq_list_next(v, &ipx_interfaces, pos); +} + +static void ipx_seq_interface_stop(struct seq_file *seq, void *v) +{ + spin_unlock_bh(&ipx_interfaces_lock); +} + +static int ipx_seq_interface_show(struct seq_file *seq, void *v) +{ + struct ipx_interface *i; + + if (v == &ipx_interfaces) { + seq_puts(seq, "Network Node_Address Primary Device " + "Frame_Type"); +#ifdef IPX_REFCNT_DEBUG + seq_puts(seq, " refcnt"); +#endif + seq_puts(seq, "\n"); + goto out; + } + + i = list_entry(v, struct ipx_interface, node); + seq_printf(seq, "%08lX ", (unsigned long int)ntohl(i->if_netnum)); + seq_printf(seq, "%02X%02X%02X%02X%02X%02X ", + i->if_node[0], i->if_node[1], i->if_node[2], + i->if_node[3], i->if_node[4], i->if_node[5]); + seq_printf(seq, "%-9s", i == ipx_primary_net ? "Yes" : "No"); + seq_printf(seq, "%-11s", ipx_device_name(i)); + seq_printf(seq, "%-9s", ipx_frame_name(i->if_dlink_type)); +#ifdef IPX_REFCNT_DEBUG + seq_printf(seq, "%6d", atomic_read(&i->refcnt)); +#endif + seq_puts(seq, "\n"); +out: + return 0; +} + +static void *ipx_seq_route_start(struct seq_file *seq, loff_t *pos) +{ + read_lock_bh(&ipx_routes_lock); + return seq_list_start_head(&ipx_routes, *pos); +} + +static void *ipx_seq_route_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return seq_list_next(v, &ipx_routes, pos); +} + +static void ipx_seq_route_stop(struct seq_file *seq, void *v) +{ + read_unlock_bh(&ipx_routes_lock); +} + +static int ipx_seq_route_show(struct seq_file *seq, void *v) +{ + struct ipx_route *rt; + + if (v == &ipx_routes) { + seq_puts(seq, "Network Router_Net Router_Node\n"); + goto out; + } + + rt = list_entry(v, struct ipx_route, node); + + seq_printf(seq, "%08lX ", (unsigned long int)ntohl(rt->ir_net)); + if (rt->ir_routed) + seq_printf(seq, "%08lX %02X%02X%02X%02X%02X%02X\n", + (long unsigned int)ntohl(rt->ir_intrfc->if_netnum), + rt->ir_router_node[0], rt->ir_router_node[1], + rt->ir_router_node[2], rt->ir_router_node[3], + rt->ir_router_node[4], rt->ir_router_node[5]); + else + seq_puts(seq, "Directly Connected\n"); +out: + return 0; +} + +static __inline__ struct sock *ipx_get_socket_idx(loff_t pos) +{ + struct sock *s = NULL; + struct hlist_node *node; + struct ipx_interface *i; + + list_for_each_entry(i, &ipx_interfaces, node) { + spin_lock_bh(&i->if_sklist_lock); + sk_for_each(s, node, &i->if_sklist) { + if (!pos) + break; + --pos; + } + spin_unlock_bh(&i->if_sklist_lock); + if (!pos) { + if (node) + goto found; + break; + } + } + s = NULL; +found: + return s; +} + +static void *ipx_seq_socket_start(struct seq_file *seq, loff_t *pos) +{ + loff_t l = *pos; + + spin_lock_bh(&ipx_interfaces_lock); + return l ? 
ipx_get_socket_idx(--l) : SEQ_START_TOKEN; +} + +static void *ipx_seq_socket_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock* sk, *next; + struct ipx_interface *i; + struct ipx_sock *ipxs; + + ++*pos; + if (v == SEQ_START_TOKEN) { + sk = NULL; + i = ipx_interfaces_head(); + if (!i) + goto out; + sk = sk_head(&i->if_sklist); + if (sk) + spin_lock_bh(&i->if_sklist_lock); + goto out; + } + sk = v; + next = sk_next(sk); + if (next) { + sk = next; + goto out; + } + ipxs = ipx_sk(sk); + i = ipxs->intrfc; + spin_unlock_bh(&i->if_sklist_lock); + sk = NULL; + for (;;) { + if (i->node.next == &ipx_interfaces) + break; + i = list_entry(i->node.next, struct ipx_interface, node); + spin_lock_bh(&i->if_sklist_lock); + if (!hlist_empty(&i->if_sklist)) { + sk = sk_head(&i->if_sklist); + break; + } + spin_unlock_bh(&i->if_sklist_lock); + } +out: + return sk; +} + +static int ipx_seq_socket_show(struct seq_file *seq, void *v) +{ + struct sock *s; + struct ipx_sock *ipxs; + + if (v == SEQ_START_TOKEN) { +#ifdef CONFIG_IPX_INTERN + seq_puts(seq, "Local_Address " + "Remote_Address Tx_Queue " + "Rx_Queue State Uid\n"); +#else + seq_puts(seq, "Local_Address Remote_Address " + "Tx_Queue Rx_Queue State Uid\n"); +#endif + goto out; + } + + s = v; + ipxs = ipx_sk(s); +#ifdef CONFIG_IPX_INTERN + seq_printf(seq, "%08lX:%02X%02X%02X%02X%02X%02X:%04X ", + (unsigned long)ntohl(ipxs->intrfc->if_netnum), + ipxs->node[0], ipxs->node[1], ipxs->node[2], ipxs->node[3], + ipxs->node[4], ipxs->node[5], ntohs(ipxs->port)); +#else + seq_printf(seq, "%08lX:%04X ", (unsigned long) ntohl(ipxs->intrfc->if_netnum), + ntohs(ipxs->port)); +#endif /* CONFIG_IPX_INTERN */ + if (s->sk_state != TCP_ESTABLISHED) + seq_printf(seq, "%-28s", "Not_Connected"); + else { + seq_printf(seq, "%08lX:%02X%02X%02X%02X%02X%02X:%04X ", + (unsigned long)ntohl(ipxs->dest_addr.net), + ipxs->dest_addr.node[0], ipxs->dest_addr.node[1], + ipxs->dest_addr.node[2], ipxs->dest_addr.node[3], + ipxs->dest_addr.node[4], ipxs->dest_addr.node[5], + ntohs(ipxs->dest_addr.sock)); + } + + seq_printf(seq, "%08X %08X %02X %03d\n", + sk_wmem_alloc_get(s), + sk_rmem_alloc_get(s), + s->sk_state, SOCK_INODE(s->sk_socket)->i_uid); +out: + return 0; +} + +static const struct seq_operations ipx_seq_interface_ops = { + .start = ipx_seq_interface_start, + .next = ipx_seq_interface_next, + .stop = ipx_seq_interface_stop, + .show = ipx_seq_interface_show, +}; + +static const struct seq_operations ipx_seq_route_ops = { + .start = ipx_seq_route_start, + .next = ipx_seq_route_next, + .stop = ipx_seq_route_stop, + .show = ipx_seq_route_show, +}; + +static const struct seq_operations ipx_seq_socket_ops = { + .start = ipx_seq_socket_start, + .next = ipx_seq_socket_next, + .stop = ipx_seq_interface_stop, + .show = ipx_seq_socket_show, +}; + +static int ipx_seq_route_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ipx_seq_route_ops); +} + +static int ipx_seq_interface_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ipx_seq_interface_ops); +} + +static int ipx_seq_socket_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ipx_seq_socket_ops); +} + +static const struct file_operations ipx_seq_interface_fops = { + .owner = THIS_MODULE, + .open = ipx_seq_interface_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations ipx_seq_route_fops = { + .owner = THIS_MODULE, + .open = ipx_seq_route_open, + .read = seq_read, + .llseek = seq_lseek, + .release = 
seq_release, +}; + +static const struct file_operations ipx_seq_socket_fops = { + .owner = THIS_MODULE, + .open = ipx_seq_socket_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct proc_dir_entry *ipx_proc_dir; + +int __init ipx_proc_init(void) +{ + struct proc_dir_entry *p; + int rc = -ENOMEM; + + ipx_proc_dir = proc_mkdir("ipx", init_net.proc_net); + + if (!ipx_proc_dir) + goto out; + p = proc_create("interface", S_IRUGO, + ipx_proc_dir, &ipx_seq_interface_fops); + if (!p) + goto out_interface; + + p = proc_create("route", S_IRUGO, ipx_proc_dir, &ipx_seq_route_fops); + if (!p) + goto out_route; + + p = proc_create("socket", S_IRUGO, ipx_proc_dir, &ipx_seq_socket_fops); + if (!p) + goto out_socket; + + rc = 0; +out: + return rc; +out_socket: + remove_proc_entry("route", ipx_proc_dir); +out_route: + remove_proc_entry("interface", ipx_proc_dir); +out_interface: + remove_proc_entry("ipx", init_net.proc_net); + goto out; +} + +void __exit ipx_proc_exit(void) +{ + remove_proc_entry("interface", ipx_proc_dir); + remove_proc_entry("route", ipx_proc_dir); + remove_proc_entry("socket", ipx_proc_dir); + remove_proc_entry("ipx", init_net.proc_net); +} + +#else /* CONFIG_PROC_FS */ + +int __init ipx_proc_init(void) +{ + return 0; +} + +void __exit ipx_proc_exit(void) +{ +} + +#endif /* CONFIG_PROC_FS */ diff --git a/net/ipx/ipx_route.c b/net/ipx/ipx_route.c new file mode 100644 index 00000000..30f4519b --- /dev/null +++ b/net/ipx/ipx_route.c @@ -0,0 +1,295 @@ +/* + * Implements the IPX routing routines. + * Code moved from af_ipx.c. + * + * Arnaldo Carvalho de Melo , 2003 + * + * See net/ipx/ChangeLog. + */ + +#include +#include +#include +#include + +#include +#include + +LIST_HEAD(ipx_routes); +DEFINE_RWLOCK(ipx_routes_lock); + +extern struct ipx_interface *ipx_internal_net; + +extern __be16 ipx_cksum(struct ipxhdr *packet, int length); +extern struct ipx_interface *ipxitf_find_using_net(__be32 net); +extern int ipxitf_demux_socket(struct ipx_interface *intrfc, + struct sk_buff *skb, int copy); +extern int ipxitf_demux_socket(struct ipx_interface *intrfc, + struct sk_buff *skb, int copy); +extern int ipxitf_send(struct ipx_interface *intrfc, struct sk_buff *skb, + char *node); +extern struct ipx_interface *ipxitf_find_using_net(__be32 net); + +struct ipx_route *ipxrtr_lookup(__be32 net) +{ + struct ipx_route *r; + + read_lock_bh(&ipx_routes_lock); + list_for_each_entry(r, &ipx_routes, node) + if (r->ir_net == net) { + ipxrtr_hold(r); + goto unlock; + } + r = NULL; +unlock: + read_unlock_bh(&ipx_routes_lock); + return r; +} + +/* + * Caller must hold a reference to intrfc + */ +int ipxrtr_add_route(__be32 network, struct ipx_interface *intrfc, + unsigned char *node) +{ + struct ipx_route *rt; + int rc; + + /* Get a route structure; either existing or create */ + rt = ipxrtr_lookup(network); + if (!rt) { + rt = kmalloc(sizeof(*rt), GFP_ATOMIC); + rc = -EAGAIN; + if (!rt) + goto out; + + atomic_set(&rt->refcnt, 1); + ipxrtr_hold(rt); + write_lock_bh(&ipx_routes_lock); + list_add(&rt->node, &ipx_routes); + write_unlock_bh(&ipx_routes_lock); + } else { + rc = -EEXIST; + if (intrfc == ipx_internal_net) + goto out_put; + } + + rt->ir_net = network; + rt->ir_intrfc = intrfc; + if (!node) { + memset(rt->ir_router_node, '\0', IPX_NODE_LEN); + rt->ir_routed = 0; + } else { + memcpy(rt->ir_router_node, node, IPX_NODE_LEN); + rt->ir_routed = 1; + } + + rc = 0; +out_put: + ipxrtr_put(rt); +out: + return rc; +} + +void ipxrtr_del_routes(struct ipx_interface *intrfc) +{ 
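	/*
	 * Note: walk ipx_routes under the writer side of ipx_routes_lock and
	 * drop the table's reference on every route that resolves through
	 * @intrfc, typically because that interface is being taken down.
	 */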
+ struct ipx_route *r, *tmp; + + write_lock_bh(&ipx_routes_lock); + list_for_each_entry_safe(r, tmp, &ipx_routes, node) + if (r->ir_intrfc == intrfc) { + list_del(&r->node); + ipxrtr_put(r); + } + write_unlock_bh(&ipx_routes_lock); +} + +static int ipxrtr_create(struct ipx_route_definition *rd) +{ + struct ipx_interface *intrfc; + int rc = -ENETUNREACH; + + /* Find the appropriate interface */ + intrfc = ipxitf_find_using_net(rd->ipx_router_network); + if (!intrfc) + goto out; + rc = ipxrtr_add_route(rd->ipx_network, intrfc, rd->ipx_router_node); + ipxitf_put(intrfc); +out: + return rc; +} + +static int ipxrtr_delete(__be32 net) +{ + struct ipx_route *r, *tmp; + int rc; + + write_lock_bh(&ipx_routes_lock); + list_for_each_entry_safe(r, tmp, &ipx_routes, node) + if (r->ir_net == net) { + /* Directly connected; can't lose route */ + rc = -EPERM; + if (!r->ir_routed) + goto out; + list_del(&r->node); + ipxrtr_put(r); + rc = 0; + goto out; + } + rc = -ENOENT; +out: + write_unlock_bh(&ipx_routes_lock); + return rc; +} + +/* + * The skb has to be unshared, we'll end up calling ipxitf_send, that'll + * modify the packet + */ +int ipxrtr_route_skb(struct sk_buff *skb) +{ + struct ipxhdr *ipx = ipx_hdr(skb); + struct ipx_route *r = ipxrtr_lookup(IPX_SKB_CB(skb)->ipx_dest_net); + + if (!r) { /* no known route */ + kfree_skb(skb); + return 0; + } + + ipxitf_hold(r->ir_intrfc); + ipxitf_send(r->ir_intrfc, skb, r->ir_routed ? + r->ir_router_node : ipx->ipx_dest.node); + ipxitf_put(r->ir_intrfc); + ipxrtr_put(r); + + return 0; +} + +/* + * Route an outgoing frame from a socket. + */ +int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx, + struct iovec *iov, size_t len, int noblock) +{ + struct sk_buff *skb; + struct ipx_sock *ipxs = ipx_sk(sk); + struct ipx_interface *intrfc; + struct ipxhdr *ipx; + size_t size; + int ipx_offset; + struct ipx_route *rt = NULL; + int rc; + + /* Find the appropriate interface on which to send packet */ + if (!usipx->sipx_network && ipx_primary_net) { + usipx->sipx_network = ipx_primary_net->if_netnum; + intrfc = ipx_primary_net; + } else { + rt = ipxrtr_lookup(usipx->sipx_network); + rc = -ENETUNREACH; + if (!rt) + goto out; + intrfc = rt->ir_intrfc; + } + + ipxitf_hold(intrfc); + ipx_offset = intrfc->if_ipx_offset; + size = sizeof(struct ipxhdr) + len + ipx_offset; + + skb = sock_alloc_send_skb(sk, size, noblock, &rc); + if (!skb) + goto out_put; + + skb_reserve(skb, ipx_offset); + skb->sk = sk; + + /* Fill in IPX header */ + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb_put(skb, sizeof(struct ipxhdr)); + ipx = ipx_hdr(skb); + ipx->ipx_pktsize = htons(len + sizeof(struct ipxhdr)); + IPX_SKB_CB(skb)->ipx_tctrl = 0; + ipx->ipx_type = usipx->sipx_type; + + IPX_SKB_CB(skb)->last_hop.index = -1; +#ifdef CONFIG_IPX_INTERN + IPX_SKB_CB(skb)->ipx_source_net = ipxs->intrfc->if_netnum; + memcpy(ipx->ipx_source.node, ipxs->node, IPX_NODE_LEN); +#else + rc = ntohs(ipxs->port); + if (rc == 0x453 || rc == 0x452) { + /* RIP/SAP special handling for mars_nwe */ + IPX_SKB_CB(skb)->ipx_source_net = intrfc->if_netnum; + memcpy(ipx->ipx_source.node, intrfc->if_node, IPX_NODE_LEN); + } else { + IPX_SKB_CB(skb)->ipx_source_net = ipxs->intrfc->if_netnum; + memcpy(ipx->ipx_source.node, ipxs->intrfc->if_node, + IPX_NODE_LEN); + } +#endif /* CONFIG_IPX_INTERN */ + ipx->ipx_source.sock = ipxs->port; + IPX_SKB_CB(skb)->ipx_dest_net = usipx->sipx_network; + memcpy(ipx->ipx_dest.node, usipx->sipx_node, IPX_NODE_LEN); + ipx->ipx_dest.sock = usipx->sipx_port; + 
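	/*
	 * Note: at this point the addressing and type fields of the outgoing
	 * IPX header are complete.  The payload is copied in next, and only
	 * then is ipx_checksum filled: it is forced to 0xFFFF ("no checksum")
	 * when the socket disables checksums or the interface uses 802.3
	 * framing, otherwise ipx_cksum() runs over header plus data.
	 */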
+ rc = memcpy_fromiovec(skb_put(skb, len), iov, len); + if (rc) { + kfree_skb(skb); + goto out_put; + } + + /* Apply checksum. Not allowed on 802.3 links. */ + if (sk->sk_no_check || intrfc->if_dlink_type == htons(IPX_FRAME_8023)) + ipx->ipx_checksum = htons(0xFFFF); + else + ipx->ipx_checksum = ipx_cksum(ipx, len + sizeof(struct ipxhdr)); + + rc = ipxitf_send(intrfc, skb, (rt && rt->ir_routed) ? + rt->ir_router_node : ipx->ipx_dest.node); +out_put: + ipxitf_put(intrfc); + if (rt) + ipxrtr_put(rt); +out: + return rc; +} + +/* + * We use a normal struct rtentry for route handling + */ +int ipxrtr_ioctl(unsigned int cmd, void __user *arg) +{ + struct rtentry rt; /* Use these to behave like 'other' stacks */ + struct sockaddr_ipx *sg, *st; + int rc = -EFAULT; + + if (copy_from_user(&rt, arg, sizeof(rt))) + goto out; + + sg = (struct sockaddr_ipx *)&rt.rt_gateway; + st = (struct sockaddr_ipx *)&rt.rt_dst; + + rc = -EINVAL; + if (!(rt.rt_flags & RTF_GATEWAY) || /* Direct routes are fixed */ + sg->sipx_family != AF_IPX || + st->sipx_family != AF_IPX) + goto out; + + switch (cmd) { + case SIOCDELRT: + rc = ipxrtr_delete(st->sipx_network); + break; + case SIOCADDRT: { + struct ipx_route_definition f; + f.ipx_network = st->sipx_network; + f.ipx_router_network = sg->sipx_network; + memcpy(f.ipx_router_node, sg->sipx_node, IPX_NODE_LEN); + rc = ipxrtr_create(&f); + break; + } + } + +out: + return rc; +} diff --git a/net/ipx/sysctl_net_ipx.c b/net/ipx/sysctl_net_ipx.c new file mode 100644 index 00000000..bd6dca00 --- /dev/null +++ b/net/ipx/sysctl_net_ipx.c @@ -0,0 +1,46 @@ +/* -*- linux-c -*- + * sysctl_net_ipx.c: sysctl interface to net IPX subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/ipx directory entry (empty =) ). [MS] + * Added /proc/sys/net/ipx/ipx_pprop_broadcasting - acme March 4, 2001 + */ + +#include +#include + +#ifndef CONFIG_SYSCTL +#error This file should not be compiled without CONFIG_SYSCTL defined +#endif + +/* From af_ipx.c */ +extern int sysctl_ipx_pprop_broadcasting; + +static struct ctl_table ipx_table[] = { + { + .procname = "ipx_pprop_broadcasting", + .data = &sysctl_ipx_pprop_broadcasting, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { }, +}; + +static struct ctl_path ipx_path[] = { + { .procname = "net", }, + { .procname = "ipx", }, + { } +}; + +static struct ctl_table_header *ipx_table_header; + +void ipx_register_sysctl(void) +{ + ipx_table_header = register_sysctl_paths(ipx_path, ipx_table); +} + +void ipx_unregister_sysctl(void) +{ + unregister_sysctl_table(ipx_table_header); +} diff --git a/net/irda/Kconfig b/net/irda/Kconfig new file mode 100644 index 00000000..c8671a7f --- /dev/null +++ b/net/irda/Kconfig @@ -0,0 +1,96 @@ +# +# IrDA protocol configuration +# + +menuconfig IRDA + depends on NET && !S390 + tristate "IrDA (infrared) subsystem support" + select CRC_CCITT + ---help--- + Say Y here if you want to build support for the IrDA (TM) protocols. + The Infrared Data Associations (tm) specifies standards for wireless + infrared communication and is supported by most laptops and PDA's. + + To use Linux support for the IrDA (tm) protocols, you will also need + some user-space utilities like irattach. For more information, see + the file . You also want to + read the IR-HOWTO, available at + . 
+ + If you want to exchange bits of data (vCal, vCard) with a PDA, you + will need to install some OBEX application, such as OpenObex : + + + To compile this support as a module, choose M here: the module will + be called irda. + +comment "IrDA protocols" + depends on IRDA + +source "net/irda/irlan/Kconfig" + +source "net/irda/irnet/Kconfig" + +source "net/irda/ircomm/Kconfig" + +config IRDA_ULTRA + bool "Ultra (connectionless) protocol" + depends on IRDA + help + Say Y here to support the connectionless Ultra IRDA protocol. + Ultra allows to exchange data over IrDA with really simple devices + (watch, beacon) without the overhead of the IrDA protocol (no handshaking, + no management frames, simple fixed header). + Ultra is available as a special socket : socket(AF_IRDA, SOCK_DGRAM, 1); + +comment "IrDA options" + depends on IRDA + +config IRDA_CACHE_LAST_LSAP + bool "Cache last LSAP" + depends on IRDA + help + Say Y here if you want IrLMP to cache the last LSAP used. This + makes sense since most frames will be sent/received on the same + connection. Enabling this option will save a hash-lookup per frame. + + If unsure, say Y. + +config IRDA_FAST_RR + bool "Fast RRs (low latency)" + depends on IRDA + ---help--- + Say Y here is you want IrLAP to send fast RR (Receive Ready) frames + when acting as a primary station. + Disabling this option will make latency over IrDA very bad. Enabling + this option will make the IrDA stack send more packet than strictly + necessary, thus reduce your battery life (but not that much). + + Fast RR will make IrLAP send out a RR frame immediately when + receiving a frame if its own transmit queue is currently empty. This + will give a lot of speed improvement when receiving much data since + the secondary station will not have to wait the max. turn around + time (usually 500ms) before it is allowed to transmit the next time. + If the transmit queue of the secondary is also empty, the primary will + start backing-off before sending another RR frame, waiting longer + each time until the back-off reaches the max. turn around time. + This back-off increase in controlled via + /proc/sys/net/irda/fast_poll_increase + + If unsure, say Y. + +config IRDA_DEBUG + bool "Debug information" + depends on IRDA + help + Say Y here if you want the IrDA subsystem to write debug information + to your syslog. You can change the debug level in + /proc/sys/net/irda/debug . + When this option is enabled, the IrDA also perform many extra internal + verifications which will usually prevent the kernel to crash in case of + bugs. + + If unsure, say Y (since it makes it easier to find the bugs). + +source "drivers/net/irda/Kconfig" + diff --git a/net/irda/Makefile b/net/irda/Makefile new file mode 100644 index 00000000..187f6c56 --- /dev/null +++ b/net/irda/Makefile @@ -0,0 +1,15 @@ +# +# Makefile for the Linux IrDA protocol layer. 
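For orientation before the af_irda.c code below: a user-space client normally reaches the stack with an AF_IRDA socket and a sockaddr_irda naming the service it wants; leaving sir_addr at zero takes the auto-discovery path, where irda_connect() resolves the name through LM-IAS and picks the peer itself. The sketch below is illustrative only and not part of the IrDA sources; the service name it is given is application-defined.

#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/types.h>
#include <linux/irda.h>

/* Connect an AF_IRDA stream socket by service name; returns the fd or -1. */
static int irda_connect_by_name(const char *name)
{
	struct sockaddr_irda peer;
	int fd = socket(AF_IRDA, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;

	memset(&peer, 0, sizeof(peer));
	peer.sir_family = AF_IRDA;
	peer.sir_addr = 0;			/* no fixed device: auto-discovery */
	strncpy(peer.sir_name, name, sizeof(peer.sir_name) - 1);

	if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}

/*
 * The connectionless Ultra service mentioned in the Kconfig help is reached
 * with socket(AF_IRDA, SOCK_DGRAM, 1); bind() then takes sir_lsap_sel as the
 * Ultra PID (irda_bind() rejects values with the top bit set).
 */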
+# + +obj-$(CONFIG_IRDA) += irda.o +obj-$(CONFIG_IRLAN) += irlan/ +obj-$(CONFIG_IRNET) += irnet/ +obj-$(CONFIG_IRCOMM) += ircomm/ + +irda-y := iriap.o iriap_event.o irlmp.o irlmp_event.o irlmp_frame.o \ + irlap.o irlap_event.o irlap_frame.o timer.o qos.o irqueue.o \ + irttp.o irda_device.o irias_object.o wrapper.o af_irda.o \ + discovery.o parameters.o irnetlink.o irmod.o +irda-$(CONFIG_PROC_FS) += irproc.o +irda-$(CONFIG_SYSCTL) += irsysctl.o diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c new file mode 100644 index 00000000..cc616974 --- /dev/null +++ b/net/irda/af_irda.c @@ -0,0 +1,2746 @@ +/********************************************************************* + * + * Filename: af_irda.c + * Version: 0.9 + * Description: IrDA sockets implementation + * Status: Stable + * Author: Dag Brattli + * Created at: Sun May 31 10:12:43 1998 + * Modified at: Sat Dec 25 21:10:23 1999 + * Modified by: Dag Brattli + * Sources: af_netroom.c, af_ax25.c, af_rose.c, af_x25.c etc. + * + * Copyright (c) 1999 Dag Brattli + * Copyright (c) 1999-2003 Jean Tourrilhes + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + * Linux-IrDA now supports four different types of IrDA sockets: + * + * o SOCK_STREAM: TinyTP connections with SAR disabled. The + * max SDU size is 0 for conn. of this type + * o SOCK_SEQPACKET: TinyTP connections with SAR enabled. TTP may + * fragment the messages, but will preserve + * the message boundaries + * o SOCK_DGRAM: IRDAPROTO_UNITDATA: TinyTP connections with Unitdata + * (unreliable) transfers + * IRDAPROTO_ULTRA: Connectionless and unreliable data + * + ********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* TIOCOUTQ, TIOCINQ */ +#include + +#include +#include + +#include + +static int irda_create(struct net *net, struct socket *sock, int protocol, int kern); + +static const struct proto_ops irda_stream_ops; +static const struct proto_ops irda_seqpacket_ops; +static const struct proto_ops irda_dgram_ops; + +#ifdef CONFIG_IRDA_ULTRA +static const struct proto_ops irda_ultra_ops; +#define ULTRA_MAX_DATA 382 +#endif /* CONFIG_IRDA_ULTRA */ + +#define IRDA_MAX_HEADER (TTP_MAX_HEADER) + +/* + * Function irda_data_indication (instance, sap, skb) + * + * Received some data from TinyTP. 
Just queue it on the receive queue + * + */ +static int irda_data_indication(void *instance, void *sap, struct sk_buff *skb) +{ + struct irda_sock *self; + struct sock *sk; + int err; + + IRDA_DEBUG(3, "%s()\n", __func__); + + self = instance; + sk = instance; + + err = sock_queue_rcv_skb(sk, skb); + if (err) { + IRDA_DEBUG(1, "%s(), error: no more mem!\n", __func__); + self->rx_flow = FLOW_STOP; + + /* When we return error, TTP will need to requeue the skb */ + return err; + } + + return 0; +} + +/* + * Function irda_disconnect_indication (instance, sap, reason, skb) + * + * Connection has been closed. Check reason to find out why + * + */ +static void irda_disconnect_indication(void *instance, void *sap, + LM_REASON reason, struct sk_buff *skb) +{ + struct irda_sock *self; + struct sock *sk; + + self = instance; + + IRDA_DEBUG(2, "%s(%p)\n", __func__, self); + + /* Don't care about it, but let's not leak it */ + if(skb) + dev_kfree_skb(skb); + + sk = instance; + if (sk == NULL) { + IRDA_DEBUG(0, "%s(%p) : BUG : sk is NULL\n", + __func__, self); + return; + } + + /* Prevent race conditions with irda_release() and irda_shutdown() */ + bh_lock_sock(sk); + if (!sock_flag(sk, SOCK_DEAD) && sk->sk_state != TCP_CLOSE) { + sk->sk_state = TCP_CLOSE; + sk->sk_shutdown |= SEND_SHUTDOWN; + + sk->sk_state_change(sk); + + /* Close our TSAP. + * If we leave it open, IrLMP put it back into the list of + * unconnected LSAPs. The problem is that any incoming request + * can then be matched to this socket (and it will be, because + * it is at the head of the list). This would prevent any + * listening socket waiting on the same TSAP to get those + * requests. Some apps forget to close sockets, or hang to it + * a bit too long, so we may stay in this dead state long + * enough to be noticed... + * Note : all socket function do check sk->sk_state, so we are + * safe... + * Jean II + */ + if (self->tsap) { + irttp_close_tsap(self->tsap); + self->tsap = NULL; + } + } + bh_unlock_sock(sk); + + /* Note : once we are there, there is not much you want to do + * with the socket anymore, apart from closing it. + * For example, bind() and connect() won't reset sk->sk_err, + * sk->sk_shutdown and sk->sk_flags to valid values... + * Jean II + */ +} + +/* + * Function irda_connect_confirm (instance, sap, qos, max_sdu_size, skb) + * + * Connections has been confirmed by the remote device + * + */ +static void irda_connect_confirm(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, __u8 max_header_size, + struct sk_buff *skb) +{ + struct irda_sock *self; + struct sock *sk; + + self = instance; + + IRDA_DEBUG(2, "%s(%p)\n", __func__, self); + + sk = instance; + if (sk == NULL) { + dev_kfree_skb(skb); + return; + } + + dev_kfree_skb(skb); + // Should be ??? 
skb_queue_tail(&sk->sk_receive_queue, skb); + + /* How much header space do we need to reserve */ + self->max_header_size = max_header_size; + + /* IrTTP max SDU size in transmit direction */ + self->max_sdu_size_tx = max_sdu_size; + + /* Find out what the largest chunk of data that we can transmit is */ + switch (sk->sk_type) { + case SOCK_STREAM: + if (max_sdu_size != 0) { + IRDA_ERROR("%s: max_sdu_size must be 0\n", + __func__); + return; + } + self->max_data_size = irttp_get_max_seg_size(self->tsap); + break; + case SOCK_SEQPACKET: + if (max_sdu_size == 0) { + IRDA_ERROR("%s: max_sdu_size cannot be 0\n", + __func__); + return; + } + self->max_data_size = max_sdu_size; + break; + default: + self->max_data_size = irttp_get_max_seg_size(self->tsap); + } + + IRDA_DEBUG(2, "%s(), max_data_size=%d\n", __func__, + self->max_data_size); + + memcpy(&self->qos_tx, qos, sizeof(struct qos_info)); + + /* We are now connected! */ + sk->sk_state = TCP_ESTABLISHED; + sk->sk_state_change(sk); +} + +/* + * Function irda_connect_indication(instance, sap, qos, max_sdu_size, userdata) + * + * Incoming connection + * + */ +static void irda_connect_indication(void *instance, void *sap, + struct qos_info *qos, __u32 max_sdu_size, + __u8 max_header_size, struct sk_buff *skb) +{ + struct irda_sock *self; + struct sock *sk; + + self = instance; + + IRDA_DEBUG(2, "%s(%p)\n", __func__, self); + + sk = instance; + if (sk == NULL) { + dev_kfree_skb(skb); + return; + } + + /* How much header space do we need to reserve */ + self->max_header_size = max_header_size; + + /* IrTTP max SDU size in transmit direction */ + self->max_sdu_size_tx = max_sdu_size; + + /* Find out what the largest chunk of data that we can transmit is */ + switch (sk->sk_type) { + case SOCK_STREAM: + if (max_sdu_size != 0) { + IRDA_ERROR("%s: max_sdu_size must be 0\n", + __func__); + kfree_skb(skb); + return; + } + self->max_data_size = irttp_get_max_seg_size(self->tsap); + break; + case SOCK_SEQPACKET: + if (max_sdu_size == 0) { + IRDA_ERROR("%s: max_sdu_size cannot be 0\n", + __func__); + kfree_skb(skb); + return; + } + self->max_data_size = max_sdu_size; + break; + default: + self->max_data_size = irttp_get_max_seg_size(self->tsap); + } + + IRDA_DEBUG(2, "%s(), max_data_size=%d\n", __func__, + self->max_data_size); + + memcpy(&self->qos_tx, qos, sizeof(struct qos_info)); + + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_state_change(sk); +} + +/* + * Function irda_connect_response (handle) + * + * Accept incoming connection + * + */ +static void irda_connect_response(struct irda_sock *self) +{ + struct sk_buff *skb; + + IRDA_DEBUG(2, "%s()\n", __func__); + + skb = alloc_skb(TTP_MAX_HEADER + TTP_SAR_HEADER, + GFP_ATOMIC); + if (skb == NULL) { + IRDA_DEBUG(0, "%s() Unable to allocate sk_buff!\n", + __func__); + return; + } + + /* Reserve space for MUX_CONTROL and LAP header */ + skb_reserve(skb, IRDA_MAX_HEADER); + + irttp_connect_response(self->tsap, self->max_sdu_size_rx, skb); +} + +/* + * Function irda_flow_indication (instance, sap, flow) + * + * Used by TinyTP to tell us if it can accept more data or not + * + */ +static void irda_flow_indication(void *instance, void *sap, LOCAL_FLOW flow) +{ + struct irda_sock *self; + struct sock *sk; + + IRDA_DEBUG(2, "%s()\n", __func__); + + self = instance; + sk = instance; + BUG_ON(sk == NULL); + + switch (flow) { + case FLOW_STOP: + IRDA_DEBUG(1, "%s(), IrTTP wants us to slow down\n", + __func__); + self->tx_flow = flow; + break; + case FLOW_START: + self->tx_flow = flow; + IRDA_DEBUG(1, 
"%s(), IrTTP wants us to start again\n", + __func__); + wake_up_interruptible(sk_sleep(sk)); + break; + default: + IRDA_DEBUG(0, "%s(), Unknown flow command!\n", __func__); + /* Unknown flow command, better stop */ + self->tx_flow = flow; + break; + } +} + +/* + * Function irda_getvalue_confirm (obj_id, value, priv) + * + * Got answer from remote LM-IAS, just pass object to requester... + * + * Note : duplicate from above, but we need our own version that + * doesn't touch the dtsap_sel and save the full value structure... + */ +static void irda_getvalue_confirm(int result, __u16 obj_id, + struct ias_value *value, void *priv) +{ + struct irda_sock *self; + + self = (struct irda_sock *) priv; + if (!self) { + IRDA_WARNING("%s: lost myself!\n", __func__); + return; + } + + IRDA_DEBUG(2, "%s(%p)\n", __func__, self); + + /* We probably don't need to make any more queries */ + iriap_close(self->iriap); + self->iriap = NULL; + + /* Check if request succeeded */ + if (result != IAS_SUCCESS) { + IRDA_DEBUG(1, "%s(), IAS query failed! (%d)\n", __func__, + result); + + self->errno = result; /* We really need it later */ + + /* Wake up any processes waiting for result */ + wake_up_interruptible(&self->query_wait); + + return; + } + + /* Pass the object to the caller (so the caller must delete it) */ + self->ias_result = value; + self->errno = 0; + + /* Wake up any processes waiting for result */ + wake_up_interruptible(&self->query_wait); +} + +/* + * Function irda_selective_discovery_indication (discovery) + * + * Got a selective discovery indication from IrLMP. + * + * IrLMP is telling us that this node is new and matching our hint bit + * filter. Wake up any process waiting for answer... + */ +static void irda_selective_discovery_indication(discinfo_t *discovery, + DISCOVERY_MODE mode, + void *priv) +{ + struct irda_sock *self; + + IRDA_DEBUG(2, "%s()\n", __func__); + + self = (struct irda_sock *) priv; + if (!self) { + IRDA_WARNING("%s: lost myself!\n", __func__); + return; + } + + /* Pass parameter to the caller */ + self->cachedaddr = discovery->daddr; + + /* Wake up process if its waiting for device to be discovered */ + wake_up_interruptible(&self->query_wait); +} + +/* + * Function irda_discovery_timeout (priv) + * + * Timeout in the selective discovery process + * + * We were waiting for a node to be discovered, but nothing has come up + * so far. Wake up the user and tell him that we failed... + */ +static void irda_discovery_timeout(u_long priv) +{ + struct irda_sock *self; + + IRDA_DEBUG(2, "%s()\n", __func__); + + self = (struct irda_sock *) priv; + BUG_ON(self == NULL); + + /* Nothing for the caller */ + self->cachelog = NULL; + self->cachedaddr = 0; + self->errno = -ETIME; + + /* Wake up process if its still waiting... 
*/ + wake_up_interruptible(&self->query_wait); +} + +/* + * Function irda_open_tsap (self) + * + * Open local Transport Service Access Point (TSAP) + * + */ +static int irda_open_tsap(struct irda_sock *self, __u8 tsap_sel, char *name) +{ + notify_t notify; + + if (self->tsap) { + IRDA_WARNING("%s: busy!\n", __func__); + return -EBUSY; + } + + /* Initialize callbacks to be used by the IrDA stack */ + irda_notify_init(¬ify); + notify.connect_confirm = irda_connect_confirm; + notify.connect_indication = irda_connect_indication; + notify.disconnect_indication = irda_disconnect_indication; + notify.data_indication = irda_data_indication; + notify.udata_indication = irda_data_indication; + notify.flow_indication = irda_flow_indication; + notify.instance = self; + strncpy(notify.name, name, NOTIFY_MAX_NAME); + + self->tsap = irttp_open_tsap(tsap_sel, DEFAULT_INITIAL_CREDIT, + ¬ify); + if (self->tsap == NULL) { + IRDA_DEBUG(0, "%s(), Unable to allocate TSAP!\n", + __func__); + return -ENOMEM; + } + /* Remember which TSAP selector we actually got */ + self->stsap_sel = self->tsap->stsap_sel; + + return 0; +} + +/* + * Function irda_open_lsap (self) + * + * Open local Link Service Access Point (LSAP). Used for opening Ultra + * sockets + */ +#ifdef CONFIG_IRDA_ULTRA +static int irda_open_lsap(struct irda_sock *self, int pid) +{ + notify_t notify; + + if (self->lsap) { + IRDA_WARNING("%s(), busy!\n", __func__); + return -EBUSY; + } + + /* Initialize callbacks to be used by the IrDA stack */ + irda_notify_init(¬ify); + notify.udata_indication = irda_data_indication; + notify.instance = self; + strncpy(notify.name, "Ultra", NOTIFY_MAX_NAME); + + self->lsap = irlmp_open_lsap(LSAP_CONNLESS, ¬ify, pid); + if (self->lsap == NULL) { + IRDA_DEBUG( 0, "%s(), Unable to allocate LSAP!\n", __func__); + return -ENOMEM; + } + + return 0; +} +#endif /* CONFIG_IRDA_ULTRA */ + +/* + * Function irda_find_lsap_sel (self, name) + * + * Try to lookup LSAP selector in remote LM-IAS + * + * Basically, we start a IAP query, and then go to sleep. When the query + * return, irda_getvalue_confirm will wake us up, and we can examine the + * result of the query... + * Note that in some case, the query fail even before we go to sleep, + * creating some races... 
+ */ +static int irda_find_lsap_sel(struct irda_sock *self, char *name) +{ + IRDA_DEBUG(2, "%s(%p, %s)\n", __func__, self, name); + + if (self->iriap) { + IRDA_WARNING("%s(): busy with a previous query\n", + __func__); + return -EBUSY; + } + + self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self, + irda_getvalue_confirm); + if(self->iriap == NULL) + return -ENOMEM; + + /* Treat unexpected wakeup as disconnect */ + self->errno = -EHOSTUNREACH; + + /* Query remote LM-IAS */ + iriap_getvaluebyclass_request(self->iriap, self->saddr, self->daddr, + name, "IrDA:TinyTP:LsapSel"); + + /* Wait for answer, if not yet finished (or failed) */ + if (wait_event_interruptible(self->query_wait, (self->iriap==NULL))) + /* Treat signals as disconnect */ + return -EHOSTUNREACH; + + /* Check what happened */ + if (self->errno) + { + /* Requested object/attribute doesn't exist */ + if((self->errno == IAS_CLASS_UNKNOWN) || + (self->errno == IAS_ATTRIB_UNKNOWN)) + return -EADDRNOTAVAIL; + else + return -EHOSTUNREACH; + } + + /* Get the remote TSAP selector */ + switch (self->ias_result->type) { + case IAS_INTEGER: + IRDA_DEBUG(4, "%s() int=%d\n", + __func__, self->ias_result->t.integer); + + if (self->ias_result->t.integer != -1) + self->dtsap_sel = self->ias_result->t.integer; + else + self->dtsap_sel = 0; + break; + default: + self->dtsap_sel = 0; + IRDA_DEBUG(0, "%s(), bad type!\n", __func__); + break; + } + if (self->ias_result) + irias_delete_value(self->ias_result); + + if (self->dtsap_sel) + return 0; + + return -EADDRNOTAVAIL; +} + +/* + * Function irda_discover_daddr_and_lsap_sel (self, name) + * + * This try to find a device with the requested service. + * + * It basically look into the discovery log. For each address in the list, + * it queries the LM-IAS of the device to find if this device offer + * the requested service. + * If there is more than one node supporting the service, we complain + * to the user (it should move devices around). + * The, we set both the destination address and the lsap selector to point + * on the service on the unique device we have found. + * + * Note : this function fails if there is more than one device in range, + * because IrLMP doesn't disconnect the LAP when the last LSAP is closed. + * Moreover, we would need to wait the LAP disconnection... + */ +static int irda_discover_daddr_and_lsap_sel(struct irda_sock *self, char *name) +{ + discinfo_t *discoveries; /* Copy of the discovery log */ + int number; /* Number of nodes in the log */ + int i; + int err = -ENETUNREACH; + __u32 daddr = DEV_ADDR_ANY; /* Address we found the service on */ + __u8 dtsap_sel = 0x0; /* TSAP associated with it */ + + IRDA_DEBUG(2, "%s(), name=%s\n", __func__, name); + + /* Ask lmp for the current discovery log + * Note : we have to use irlmp_get_discoveries(), as opposed + * to play with the cachelog directly, because while we are + * making our ias query, le log might change... */ + discoveries = irlmp_get_discoveries(&number, self->mask.word, + self->nslots); + /* Check if the we got some results */ + if (discoveries == NULL) + return -ENETUNREACH; /* No nodes discovered */ + + /* + * Now, check all discovered devices (if any), and connect + * client only about the services that the client is + * interested in... 
+ */ + for(i = 0; i < number; i++) { + /* Try the address in the log */ + self->daddr = discoveries[i].daddr; + self->saddr = 0x0; + IRDA_DEBUG(1, "%s(), trying daddr = %08x\n", + __func__, self->daddr); + + /* Query remote LM-IAS for this service */ + err = irda_find_lsap_sel(self, name); + switch (err) { + case 0: + /* We found the requested service */ + if(daddr != DEV_ADDR_ANY) { + IRDA_DEBUG(1, "%s(), discovered service ''%s'' in two different devices !!!\n", + __func__, name); + self->daddr = DEV_ADDR_ANY; + kfree(discoveries); + return -ENOTUNIQ; + } + /* First time we found that one, save it ! */ + daddr = self->daddr; + dtsap_sel = self->dtsap_sel; + break; + case -EADDRNOTAVAIL: + /* Requested service simply doesn't exist on this node */ + break; + default: + /* Something bad did happen :-( */ + IRDA_DEBUG(0, "%s(), unexpected IAS query failure\n", __func__); + self->daddr = DEV_ADDR_ANY; + kfree(discoveries); + return -EHOSTUNREACH; + break; + } + } + /* Cleanup our copy of the discovery log */ + kfree(discoveries); + + /* Check out what we found */ + if(daddr == DEV_ADDR_ANY) { + IRDA_DEBUG(1, "%s(), cannot discover service ''%s'' in any device !!!\n", + __func__, name); + self->daddr = DEV_ADDR_ANY; + return -EADDRNOTAVAIL; + } + + /* Revert back to discovered device & service */ + self->daddr = daddr; + self->saddr = 0x0; + self->dtsap_sel = dtsap_sel; + + IRDA_DEBUG(1, "%s(), discovered requested service ''%s'' at address %08x\n", + __func__, name, self->daddr); + + return 0; +} + +/* + * Function irda_getname (sock, uaddr, uaddr_len, peer) + * + * Return the our own, or peers socket address (sockaddr_irda) + * + */ +static int irda_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sockaddr_irda saddr; + struct sock *sk = sock->sk; + struct irda_sock *self = irda_sk(sk); + + memset(&saddr, 0, sizeof(saddr)); + if (peer) { + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + + saddr.sir_family = AF_IRDA; + saddr.sir_lsap_sel = self->dtsap_sel; + saddr.sir_addr = self->daddr; + } else { + saddr.sir_family = AF_IRDA; + saddr.sir_lsap_sel = self->stsap_sel; + saddr.sir_addr = self->saddr; + } + + IRDA_DEBUG(1, "%s(), tsap_sel = %#x\n", __func__, saddr.sir_lsap_sel); + IRDA_DEBUG(1, "%s(), addr = %08x\n", __func__, saddr.sir_addr); + + /* uaddr_len come to us uninitialised */ + *uaddr_len = sizeof (struct sockaddr_irda); + memcpy(uaddr, &saddr, *uaddr_len); + + return 0; +} + +/* + * Function irda_listen (sock, backlog) + * + * Just move to the listen state + * + */ +static int irda_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + int err = -EOPNOTSUPP; + + IRDA_DEBUG(2, "%s()\n", __func__); + + lock_sock(sk); + + if ((sk->sk_type != SOCK_STREAM) && (sk->sk_type != SOCK_SEQPACKET) && + (sk->sk_type != SOCK_DGRAM)) + goto out; + + if (sk->sk_state != TCP_LISTEN) { + sk->sk_max_ack_backlog = backlog; + sk->sk_state = TCP_LISTEN; + + err = 0; + } +out: + release_sock(sk); + + return err; +} + +/* + * Function irda_bind (sock, uaddr, addr_len) + * + * Used by servers to register their well known TSAP + * + */ +static int irda_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct sockaddr_irda *addr = (struct sockaddr_irda *) uaddr; + struct irda_sock *self = irda_sk(sk); + int err; + + IRDA_DEBUG(2, "%s(%p)\n", __func__, self); + + if (addr_len != sizeof(struct sockaddr_irda)) + return -EINVAL; + + lock_sock(sk); +#ifdef CONFIG_IRDA_ULTRA + /* Special care 
for Ultra sockets */ + if ((sk->sk_type == SOCK_DGRAM) && + (sk->sk_protocol == IRDAPROTO_ULTRA)) { + self->pid = addr->sir_lsap_sel; + err = -EOPNOTSUPP; + if (self->pid & 0x80) { + IRDA_DEBUG(0, "%s(), extension in PID not supp!\n", __func__); + goto out; + } + err = irda_open_lsap(self, self->pid); + if (err < 0) + goto out; + + /* Pretend we are connected */ + sock->state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; + err = 0; + + goto out; + } +#endif /* CONFIG_IRDA_ULTRA */ + + self->ias_obj = irias_new_object(addr->sir_name, jiffies); + err = -ENOMEM; + if (self->ias_obj == NULL) + goto out; + + err = irda_open_tsap(self, addr->sir_lsap_sel, addr->sir_name); + if (err < 0) { + irias_delete_object(self->ias_obj); + self->ias_obj = NULL; + goto out; + } + + /* Register with LM-IAS */ + irias_add_integer_attrib(self->ias_obj, "IrDA:TinyTP:LsapSel", + self->stsap_sel, IAS_KERNEL_ATTR); + irias_insert_object(self->ias_obj); + + err = 0; +out: + release_sock(sk); + return err; +} + +/* + * Function irda_accept (sock, newsock, flags) + * + * Wait for incoming connection + * + */ +static int irda_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct sock *sk = sock->sk; + struct irda_sock *new, *self = irda_sk(sk); + struct sock *newsk; + struct sk_buff *skb; + int err; + + IRDA_DEBUG(2, "%s()\n", __func__); + + err = irda_create(sock_net(sk), newsock, sk->sk_protocol, 0); + if (err) + return err; + + err = -EINVAL; + + lock_sock(sk); + if (sock->state != SS_UNCONNECTED) + goto out; + + if ((sk = sock->sk) == NULL) + goto out; + + err = -EOPNOTSUPP; + if ((sk->sk_type != SOCK_STREAM) && (sk->sk_type != SOCK_SEQPACKET) && + (sk->sk_type != SOCK_DGRAM)) + goto out; + + err = -EINVAL; + if (sk->sk_state != TCP_LISTEN) + goto out; + + /* + * The read queue this time is holding sockets ready to use + * hooked into the SABM we saved + */ + + /* + * We can perform the accept only if there is incoming data + * on the listening socket. + * So, we will block the caller until we receive any data. + * If the caller was waiting on select() or poll() before + * calling us, the data is waiting for us ;-) + * Jean II + */ + while (1) { + skb = skb_dequeue(&sk->sk_receive_queue); + if (skb) + break; + + /* Non blocking operation */ + err = -EWOULDBLOCK; + if (flags & O_NONBLOCK) + goto out; + + err = wait_event_interruptible(*(sk_sleep(sk)), + skb_peek(&sk->sk_receive_queue)); + if (err) + goto out; + } + + newsk = newsock->sk; + err = -EIO; + if (newsk == NULL) + goto out; + + newsk->sk_state = TCP_ESTABLISHED; + + new = irda_sk(newsk); + + /* Now attach up the new socket */ + new->tsap = irttp_dup(self->tsap, new); + err = -EPERM; /* value does not seem to make sense. 
-arnd */ + if (!new->tsap) { + IRDA_DEBUG(0, "%s(), dup failed!\n", __func__); + kfree_skb(skb); + goto out; + } + + new->stsap_sel = new->tsap->stsap_sel; + new->dtsap_sel = new->tsap->dtsap_sel; + new->saddr = irttp_get_saddr(new->tsap); + new->daddr = irttp_get_daddr(new->tsap); + + new->max_sdu_size_tx = self->max_sdu_size_tx; + new->max_sdu_size_rx = self->max_sdu_size_rx; + new->max_data_size = self->max_data_size; + new->max_header_size = self->max_header_size; + + memcpy(&new->qos_tx, &self->qos_tx, sizeof(struct qos_info)); + + /* Clean up the original one to keep it in listen state */ + irttp_listen(self->tsap); + + kfree_skb(skb); + sk->sk_ack_backlog--; + + newsock->state = SS_CONNECTED; + + irda_connect_response(new); + err = 0; +out: + release_sock(sk); + return err; +} + +/* + * Function irda_connect (sock, uaddr, addr_len, flags) + * + * Connect to a IrDA device + * + * The main difference with a "standard" connect is that with IrDA we need + * to resolve the service name into a TSAP selector (in TCP, port number + * doesn't have to be resolved). + * Because of this service name resoltion, we can offer "auto-connect", + * where we connect to a service without specifying a destination address. + * + * Note : by consulting "errno", the user space caller may learn the cause + * of the failure. Most of them are visible in the function, others may come + * from subroutines called and are listed here : + * o EBUSY : already processing a connect + * o EHOSTUNREACH : bad addr->sir_addr argument + * o EADDRNOTAVAIL : bad addr->sir_name argument + * o ENOTUNIQ : more than one node has addr->sir_name (auto-connect) + * o ENETUNREACH : no node found on the network (auto-connect) + */ +static int irda_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + struct sockaddr_irda *addr = (struct sockaddr_irda *) uaddr; + struct irda_sock *self = irda_sk(sk); + int err; + + IRDA_DEBUG(2, "%s(%p)\n", __func__, self); + + lock_sock(sk); + /* Don't allow connect for Ultra sockets */ + err = -ESOCKTNOSUPPORT; + if ((sk->sk_type == SOCK_DGRAM) && (sk->sk_protocol == IRDAPROTO_ULTRA)) + goto out; + + if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { + sock->state = SS_CONNECTED; + err = 0; + goto out; /* Connect completed during a ERESTARTSYS event */ + } + + if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) { + sock->state = SS_UNCONNECTED; + err = -ECONNREFUSED; + goto out; + } + + err = -EISCONN; /* No reconnect on a seqpacket socket */ + if (sk->sk_state == TCP_ESTABLISHED) + goto out; + + sk->sk_state = TCP_CLOSE; + sock->state = SS_UNCONNECTED; + + err = -EINVAL; + if (addr_len != sizeof(struct sockaddr_irda)) + goto out; + + /* Check if user supplied any destination device address */ + if ((!addr->sir_addr) || (addr->sir_addr == DEV_ADDR_ANY)) { + /* Try to find one suitable */ + err = irda_discover_daddr_and_lsap_sel(self, addr->sir_name); + if (err) { + IRDA_DEBUG(0, "%s(), auto-connect failed!\n", __func__); + goto out; + } + } else { + /* Use the one provided by the user */ + self->daddr = addr->sir_addr; + IRDA_DEBUG(1, "%s(), daddr = %08x\n", __func__, self->daddr); + + /* If we don't have a valid service name, we assume the + * user want to connect on a specific LSAP. Prevent + * the use of invalid LSAPs (IrLMP 1.1 p10). 
Jean II */ + if((addr->sir_name[0] != '\0') || + (addr->sir_lsap_sel >= 0x70)) { + /* Query remote LM-IAS using service name */ + err = irda_find_lsap_sel(self, addr->sir_name); + if (err) { + IRDA_DEBUG(0, "%s(), connect failed!\n", __func__); + goto out; + } + } else { + /* Directly connect to the remote LSAP + * specified by the sir_lsap field. + * Please use with caution, in IrDA LSAPs are + * dynamic and there is no "well-known" LSAP. */ + self->dtsap_sel = addr->sir_lsap_sel; + } + } + + /* Check if we have opened a local TSAP */ + if (!self->tsap) + irda_open_tsap(self, LSAP_ANY, addr->sir_name); + + /* Move to connecting socket, start sending Connect Requests */ + sock->state = SS_CONNECTING; + sk->sk_state = TCP_SYN_SENT; + + /* Connect to remote device */ + err = irttp_connect_request(self->tsap, self->dtsap_sel, + self->saddr, self->daddr, NULL, + self->max_sdu_size_rx, NULL); + if (err) { + IRDA_DEBUG(0, "%s(), connect failed!\n", __func__); + goto out; + } + + /* Now the loop */ + err = -EINPROGRESS; + if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) + goto out; + + err = -ERESTARTSYS; + if (wait_event_interruptible(*(sk_sleep(sk)), + (sk->sk_state != TCP_SYN_SENT))) + goto out; + + if (sk->sk_state != TCP_ESTABLISHED) { + sock->state = SS_UNCONNECTED; + if (sk->sk_prot->disconnect(sk, flags)) + sock->state = SS_DISCONNECTING; + err = sock_error(sk); + if (!err) + err = -ECONNRESET; + goto out; + } + + sock->state = SS_CONNECTED; + + /* At this point, IrLMP has assigned our source address */ + self->saddr = irttp_get_saddr(self->tsap); + err = 0; +out: + release_sock(sk); + return err; +} + +static struct proto irda_proto = { + .name = "IRDA", + .owner = THIS_MODULE, + .obj_size = sizeof(struct irda_sock), +}; + +/* + * Function irda_create (sock, protocol) + * + * Create IrDA socket + * + */ +static int irda_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + struct irda_sock *self; + + IRDA_DEBUG(2, "%s()\n", __func__); + + if (net != &init_net) + return -EAFNOSUPPORT; + + /* Check for valid socket type */ + switch (sock->type) { + case SOCK_STREAM: /* For TTP connections with SAR disabled */ + case SOCK_SEQPACKET: /* For TTP connections with SAR enabled */ + case SOCK_DGRAM: /* For TTP Unitdata or LMP Ultra transfers */ + break; + default: + return -ESOCKTNOSUPPORT; + } + + /* Allocate networking socket */ + sk = sk_alloc(net, PF_IRDA, GFP_ATOMIC, &irda_proto); + if (sk == NULL) + return -ENOMEM; + + self = irda_sk(sk); + IRDA_DEBUG(2, "%s() : self is %p\n", __func__, self); + + init_waitqueue_head(&self->query_wait); + + switch (sock->type) { + case SOCK_STREAM: + sock->ops = &irda_stream_ops; + self->max_sdu_size_rx = TTP_SAR_DISABLE; + break; + case SOCK_SEQPACKET: + sock->ops = &irda_seqpacket_ops; + self->max_sdu_size_rx = TTP_SAR_UNBOUND; + break; + case SOCK_DGRAM: + switch (protocol) { +#ifdef CONFIG_IRDA_ULTRA + case IRDAPROTO_ULTRA: + sock->ops = &irda_ultra_ops; + /* Initialise now, because we may send on unbound + * sockets. Jean II */ + self->max_data_size = ULTRA_MAX_DATA - LMP_PID_HEADER; + self->max_header_size = IRDA_MAX_HEADER + LMP_PID_HEADER; + break; +#endif /* CONFIG_IRDA_ULTRA */ + case IRDAPROTO_UNITDATA: + sock->ops = &irda_dgram_ops; + /* We let Unitdata conn. be like seqpack conn. 
*/ + self->max_sdu_size_rx = TTP_SAR_UNBOUND; + break; + default: + sk_free(sk); + return -ESOCKTNOSUPPORT; + } + break; + default: + sk_free(sk); + return -ESOCKTNOSUPPORT; + } + + /* Initialise networking socket struct */ + sock_init_data(sock, sk); /* Note : set sk->sk_refcnt to 1 */ + sk->sk_family = PF_IRDA; + sk->sk_protocol = protocol; + + /* Register as a client with IrLMP */ + self->ckey = irlmp_register_client(0, NULL, NULL, NULL); + self->mask.word = 0xffff; + self->rx_flow = self->tx_flow = FLOW_START; + self->nslots = DISCOVERY_DEFAULT_SLOTS; + self->daddr = DEV_ADDR_ANY; /* Until we get connected */ + self->saddr = 0x0; /* so IrLMP assign us any link */ + return 0; +} + +/* + * Function irda_destroy_socket (self) + * + * Destroy socket + * + */ +static void irda_destroy_socket(struct irda_sock *self) +{ + IRDA_DEBUG(2, "%s(%p)\n", __func__, self); + + /* Unregister with IrLMP */ + irlmp_unregister_client(self->ckey); + irlmp_unregister_service(self->skey); + + /* Unregister with LM-IAS */ + if (self->ias_obj) { + irias_delete_object(self->ias_obj); + self->ias_obj = NULL; + } + + if (self->iriap) { + iriap_close(self->iriap); + self->iriap = NULL; + } + + if (self->tsap) { + irttp_disconnect_request(self->tsap, NULL, P_NORMAL); + irttp_close_tsap(self->tsap); + self->tsap = NULL; + } +#ifdef CONFIG_IRDA_ULTRA + if (self->lsap) { + irlmp_close_lsap(self->lsap); + self->lsap = NULL; + } +#endif /* CONFIG_IRDA_ULTRA */ +} + +/* + * Function irda_release (sock) + */ +static int irda_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + IRDA_DEBUG(2, "%s()\n", __func__); + + if (sk == NULL) + return 0; + + lock_sock(sk); + sk->sk_state = TCP_CLOSE; + sk->sk_shutdown |= SEND_SHUTDOWN; + sk->sk_state_change(sk); + + /* Destroy IrDA socket */ + irda_destroy_socket(irda_sk(sk)); + + sock_orphan(sk); + sock->sk = NULL; + release_sock(sk); + + /* Purge queues (see sock_init_data()) */ + skb_queue_purge(&sk->sk_receive_queue); + + /* Destroy networking socket if we are the last reference on it, + * i.e. if(sk->sk_refcnt == 0) -> sk_free(sk) */ + sock_put(sk); + + /* Notes on socket locking and deallocation... - Jean II + * In theory we should put pairs of sock_hold() / sock_put() to + * prevent the socket to be destroyed whenever there is an + * outstanding request or outstanding incoming packet or event. + * + * 1) This may include IAS request, both in connect and getsockopt. + * Unfortunately, the situation is a bit more messy than it looks, + * because we close iriap and kfree(self) above. + * + * 2) This may include selective discovery in getsockopt. + * Same stuff as above, irlmp registration and self are gone. + * + * Probably 1 and 2 may not matter, because it's all triggered + * by a process and the socket layer already prevent the + * socket to go away while a process is holding it, through + * sockfd_put() and fput()... + * + * 3) This may include deferred TSAP closure. In particular, + * we may receive a late irda_disconnect_indication() + * Fortunately, (tsap_cb *)->close_pend should protect us + * from that. + * + * I did some testing on SMP, and it looks solid. And the socket + * memory leak is now gone... - Jean II + */ + + return 0; +} + +/* + * Function irda_sendmsg (iocb, sock, msg, len) + * + * Send message down to TinyTP. This function is used for both STREAM and + * SEQPACK services. 
This is possible since it forces the client to + * fragment the message if necessary + */ +static int irda_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct irda_sock *self; + struct sk_buff *skb; + int err = -EPIPE; + + IRDA_DEBUG(4, "%s(), len=%zd\n", __func__, len); + + /* Note : socket.c set MSG_EOR on SEQPACKET sockets */ + if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_EOR | MSG_CMSG_COMPAT | + MSG_NOSIGNAL)) { + return -EINVAL; + } + + lock_sock(sk); + + if (sk->sk_shutdown & SEND_SHUTDOWN) + goto out_err; + + if (sk->sk_state != TCP_ESTABLISHED) { + err = -ENOTCONN; + goto out; + } + + self = irda_sk(sk); + + /* Check if IrTTP is wants us to slow down */ + + if (wait_event_interruptible(*(sk_sleep(sk)), + (self->tx_flow != FLOW_STOP || sk->sk_state != TCP_ESTABLISHED))) { + err = -ERESTARTSYS; + goto out; + } + + /* Check if we are still connected */ + if (sk->sk_state != TCP_ESTABLISHED) { + err = -ENOTCONN; + goto out; + } + + /* Check that we don't send out too big frames */ + if (len > self->max_data_size) { + IRDA_DEBUG(2, "%s(), Chopping frame from %zd to %d bytes!\n", + __func__, len, self->max_data_size); + len = self->max_data_size; + } + + skb = sock_alloc_send_skb(sk, len + self->max_header_size + 16, + msg->msg_flags & MSG_DONTWAIT, &err); + if (!skb) + goto out_err; + + skb_reserve(skb, self->max_header_size + 16); + skb_reset_transport_header(skb); + skb_put(skb, len); + err = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len); + if (err) { + kfree_skb(skb); + goto out_err; + } + + /* + * Just send the message to TinyTP, and let it deal with possible + * errors. No need to duplicate all that here + */ + err = irttp_data_request(self->tsap, skb); + if (err) { + IRDA_DEBUG(0, "%s(), err=%d\n", __func__, err); + goto out_err; + } + + release_sock(sk); + /* Tell client how much data we actually sent */ + return len; + +out_err: + err = sk_stream_error(sk, msg->msg_flags, err); +out: + release_sock(sk); + return err; + +} + +/* + * Function irda_recvmsg_dgram (iocb, sock, msg, size, flags) + * + * Try to receive message and copy it to user. The frame is discarded + * after being read, regardless of how much the user actually read + */ +static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct irda_sock *self = irda_sk(sk); + struct sk_buff *skb; + size_t copied; + int err; + + IRDA_DEBUG(4, "%s()\n", __func__); + + skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, + flags & MSG_DONTWAIT, &err); + if (!skb) + return err; + + skb_reset_transport_header(skb); + copied = skb->len; + + if (copied > size) { + IRDA_DEBUG(2, "%s(), Received truncated frame (%zd < %zd)!\n", + __func__, copied, size); + copied = size; + msg->msg_flags |= MSG_TRUNC; + } + skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + + skb_free_datagram(sk, skb); + + /* + * Check if we have previously stopped IrTTP and we know + * have more free space in our rx_queue. 
If so tell IrTTP + * to start delivering frames again before our rx_queue gets + * empty + */ + if (self->rx_flow == FLOW_STOP) { + if ((atomic_read(&sk->sk_rmem_alloc) << 2) <= sk->sk_rcvbuf) { + IRDA_DEBUG(2, "%s(), Starting IrTTP\n", __func__); + self->rx_flow = FLOW_START; + irttp_flow_request(self->tsap, FLOW_START); + } + } + + return copied; +} + +/* + * Function irda_recvmsg_stream (iocb, sock, msg, size, flags) + */ +static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct irda_sock *self = irda_sk(sk); + int noblock = flags & MSG_DONTWAIT; + size_t copied = 0; + int target, err; + long timeo; + + IRDA_DEBUG(3, "%s()\n", __func__); + + if ((err = sock_error(sk)) < 0) + return err; + + if (sock->flags & __SO_ACCEPTCON) + return -EINVAL; + + err =-EOPNOTSUPP; + if (flags & MSG_OOB) + return -EOPNOTSUPP; + + err = 0; + target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); + timeo = sock_rcvtimeo(sk, noblock); + + msg->msg_namelen = 0; + + do { + int chunk; + struct sk_buff *skb = skb_dequeue(&sk->sk_receive_queue); + + if (skb == NULL) { + DEFINE_WAIT(wait); + err = 0; + + if (copied >= target) + break; + + prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + + /* + * POSIX 1003.1g mandates this order. + */ + err = sock_error(sk); + if (err) + ; + else if (sk->sk_shutdown & RCV_SHUTDOWN) + ; + else if (noblock) + err = -EAGAIN; + else if (signal_pending(current)) + err = sock_intr_errno(timeo); + else if (sk->sk_state != TCP_ESTABLISHED) + err = -ENOTCONN; + else if (skb_peek(&sk->sk_receive_queue) == NULL) + /* Wait process until data arrives */ + schedule(); + + finish_wait(sk_sleep(sk), &wait); + + if (err) + return err; + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + + continue; + } + + chunk = min_t(unsigned int, skb->len, size); + if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) { + skb_queue_head(&sk->sk_receive_queue, skb); + if (copied == 0) + copied = -EFAULT; + break; + } + copied += chunk; + size -= chunk; + + /* Mark read part of skb as used */ + if (!(flags & MSG_PEEK)) { + skb_pull(skb, chunk); + + /* put the skb back if we didn't use it up.. */ + if (skb->len) { + IRDA_DEBUG(1, "%s(), back on q!\n", + __func__); + skb_queue_head(&sk->sk_receive_queue, skb); + break; + } + + kfree_skb(skb); + } else { + IRDA_DEBUG(0, "%s() questionable!?\n", __func__); + + /* put message back and return */ + skb_queue_head(&sk->sk_receive_queue, skb); + break; + } + } while (size); + + /* + * Check if we have previously stopped IrTTP and we know + * have more free space in our rx_queue. If so tell IrTTP + * to start delivering frames again before our rx_queue gets + * empty + */ + if (self->rx_flow == FLOW_STOP) { + if ((atomic_read(&sk->sk_rmem_alloc) << 2) <= sk->sk_rcvbuf) { + IRDA_DEBUG(2, "%s(), Starting IrTTP\n", __func__); + self->rx_flow = FLOW_START; + irttp_flow_request(self->tsap, FLOW_START); + } + } + + return copied; +} + +/* + * Function irda_sendmsg_dgram (iocb, sock, msg, len) + * + * Send message down to TinyTP for the unreliable sequenced + * packet service... 
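On the SOCK_SEQPACKET side, irda_recvmsg_dgram() above frees the frame after the copy, so any bytes that did not fit in the caller's buffer are lost and MSG_TRUNC is reported. A short userspace sketch (standard recvmsg() API) that surfaces the truncation to the caller:

#include <stddef.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Receive one sequenced packet; *truncated is set when the buffer was
 * too small and the tail of the frame has been discarded. */
static ssize_t irda_recv_seqpacket(int fd, void *buf, size_t buflen,
                                   int *truncated)
{
        struct iovec iov = { .iov_base = buf, .iov_len = buflen };
        struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
        ssize_t n = recvmsg(fd, &msg, 0);

        if (n >= 0)
                *truncated = (msg.msg_flags & MSG_TRUNC) != 0;
        return n;
}
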
+ * + */ +static int irda_sendmsg_dgram(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct irda_sock *self; + struct sk_buff *skb; + int err; + + IRDA_DEBUG(4, "%s(), len=%zd\n", __func__, len); + + if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT)) + return -EINVAL; + + lock_sock(sk); + + if (sk->sk_shutdown & SEND_SHUTDOWN) { + send_sig(SIGPIPE, current, 0); + err = -EPIPE; + goto out; + } + + err = -ENOTCONN; + if (sk->sk_state != TCP_ESTABLISHED) + goto out; + + self = irda_sk(sk); + + /* + * Check that we don't send out too big frames. This is an unreliable + * service, so we have no fragmentation and no coalescence + */ + if (len > self->max_data_size) { + IRDA_DEBUG(0, "%s(), Warning to much data! " + "Chopping frame from %zd to %d bytes!\n", + __func__, len, self->max_data_size); + len = self->max_data_size; + } + + skb = sock_alloc_send_skb(sk, len + self->max_header_size, + msg->msg_flags & MSG_DONTWAIT, &err); + err = -ENOBUFS; + if (!skb) + goto out; + + skb_reserve(skb, self->max_header_size); + skb_reset_transport_header(skb); + + IRDA_DEBUG(4, "%s(), appending user data\n", __func__); + skb_put(skb, len); + err = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len); + if (err) { + kfree_skb(skb); + goto out; + } + + /* + * Just send the message to TinyTP, and let it deal with possible + * errors. No need to duplicate all that here + */ + err = irttp_udata_request(self->tsap, skb); + if (err) { + IRDA_DEBUG(0, "%s(), err=%d\n", __func__, err); + goto out; + } + + release_sock(sk); + return len; + +out: + release_sock(sk); + return err; +} + +/* + * Function irda_sendmsg_ultra (iocb, sock, msg, len) + * + * Send message down to IrLMP for the unreliable Ultra + * packet service... + */ +#ifdef CONFIG_IRDA_ULTRA +static int irda_sendmsg_ultra(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct irda_sock *self; + __u8 pid = 0; + int bound = 0; + struct sk_buff *skb; + int err; + + IRDA_DEBUG(4, "%s(), len=%zd\n", __func__, len); + + err = -EINVAL; + if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT)) + return -EINVAL; + + lock_sock(sk); + + err = -EPIPE; + if (sk->sk_shutdown & SEND_SHUTDOWN) { + send_sig(SIGPIPE, current, 0); + goto out; + } + + self = irda_sk(sk); + + /* Check if an address was specified with sendto. Jean II */ + if (msg->msg_name) { + struct sockaddr_irda *addr = (struct sockaddr_irda *) msg->msg_name; + err = -EINVAL; + /* Check address, extract pid. Jean II */ + if (msg->msg_namelen < sizeof(*addr)) + goto out; + if (addr->sir_family != AF_IRDA) + goto out; + + pid = addr->sir_lsap_sel; + if (pid & 0x80) { + IRDA_DEBUG(0, "%s(), extension in PID not supp!\n", __func__); + err = -EOPNOTSUPP; + goto out; + } + } else { + /* Check that the socket is properly bound to an Ultra + * port. Jean II */ + if ((self->lsap == NULL) || + (sk->sk_state != TCP_ESTABLISHED)) { + IRDA_DEBUG(0, "%s(), socket not bound to Ultra PID.\n", + __func__); + err = -ENOTCONN; + goto out; + } + /* Use PID from socket */ + bound = 1; + } + + /* + * Check that we don't send out too big frames. This is an unreliable + * service, so we have no fragmentation and no coalescence + */ + if (len > self->max_data_size) { + IRDA_DEBUG(0, "%s(), Warning to much data! 
" + "Chopping frame from %zd to %d bytes!\n", + __func__, len, self->max_data_size); + len = self->max_data_size; + } + + skb = sock_alloc_send_skb(sk, len + self->max_header_size, + msg->msg_flags & MSG_DONTWAIT, &err); + err = -ENOBUFS; + if (!skb) + goto out; + + skb_reserve(skb, self->max_header_size); + skb_reset_transport_header(skb); + + IRDA_DEBUG(4, "%s(), appending user data\n", __func__); + skb_put(skb, len); + err = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len); + if (err) { + kfree_skb(skb); + goto out; + } + + err = irlmp_connless_data_request((bound ? self->lsap : NULL), + skb, pid); + if (err) + IRDA_DEBUG(0, "%s(), err=%d\n", __func__, err); +out: + release_sock(sk); + return err ? : len; +} +#endif /* CONFIG_IRDA_ULTRA */ + +/* + * Function irda_shutdown (sk, how) + */ +static int irda_shutdown(struct socket *sock, int how) +{ + struct sock *sk = sock->sk; + struct irda_sock *self = irda_sk(sk); + + IRDA_DEBUG(1, "%s(%p)\n", __func__, self); + + lock_sock(sk); + + sk->sk_state = TCP_CLOSE; + sk->sk_shutdown |= SEND_SHUTDOWN; + sk->sk_state_change(sk); + + if (self->iriap) { + iriap_close(self->iriap); + self->iriap = NULL; + } + + if (self->tsap) { + irttp_disconnect_request(self->tsap, NULL, P_NORMAL); + irttp_close_tsap(self->tsap); + self->tsap = NULL; + } + + /* A few cleanup so the socket look as good as new... */ + self->rx_flow = self->tx_flow = FLOW_START; /* needed ??? */ + self->daddr = DEV_ADDR_ANY; /* Until we get re-connected */ + self->saddr = 0x0; /* so IrLMP assign us any link */ + + release_sock(sk); + + return 0; +} + +/* + * Function irda_poll (file, sock, wait) + */ +static unsigned int irda_poll(struct file * file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + struct irda_sock *self = irda_sk(sk); + unsigned int mask; + + IRDA_DEBUG(4, "%s()\n", __func__); + + poll_wait(file, sk_sleep(sk), wait); + mask = 0; + + /* Exceptional events? */ + if (sk->sk_err) + mask |= POLLERR; + if (sk->sk_shutdown & RCV_SHUTDOWN) { + IRDA_DEBUG(0, "%s(), POLLHUP\n", __func__); + mask |= POLLHUP; + } + + /* Readable? 
*/ + if (!skb_queue_empty(&sk->sk_receive_queue)) { + IRDA_DEBUG(4, "Socket is readable\n"); + mask |= POLLIN | POLLRDNORM; + } + + /* Connection-based need to check for termination and startup */ + switch (sk->sk_type) { + case SOCK_STREAM: + if (sk->sk_state == TCP_CLOSE) { + IRDA_DEBUG(0, "%s(), POLLHUP\n", __func__); + mask |= POLLHUP; + } + + if (sk->sk_state == TCP_ESTABLISHED) { + if ((self->tx_flow == FLOW_START) && + sock_writeable(sk)) + { + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + } + } + break; + case SOCK_SEQPACKET: + if ((self->tx_flow == FLOW_START) && + sock_writeable(sk)) + { + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + } + break; + case SOCK_DGRAM: + if (sock_writeable(sk)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + break; + default: + break; + } + + return mask; +} + +/* + * Function irda_ioctl (sock, cmd, arg) + */ +static int irda_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + int err; + + IRDA_DEBUG(4, "%s(), cmd=%#x\n", __func__, cmd); + + err = -EINVAL; + switch (cmd) { + case TIOCOUTQ: { + long amount; + + amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); + if (amount < 0) + amount = 0; + err = put_user(amount, (unsigned int __user *)arg); + break; + } + + case TIOCINQ: { + struct sk_buff *skb; + long amount = 0L; + /* These two are safe on a single CPU system as only user tasks fiddle here */ + if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) + amount = skb->len; + err = put_user(amount, (unsigned int __user *)arg); + break; + } + + case SIOCGSTAMP: + if (sk != NULL) + err = sock_get_timestamp(sk, (struct timeval __user *)arg); + break; + + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFMETRIC: + case SIOCSIFMETRIC: + break; + default: + IRDA_DEBUG(1, "%s(), doing device ioctl!\n", __func__); + err = -ENOIOCTLCMD; + } + + return err; +} + +#ifdef CONFIG_COMPAT +/* + * Function irda_ioctl (sock, cmd, arg) + */ +static int irda_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + /* + * All IRDA's ioctl are standard ones. + */ + return -ENOIOCTLCMD; +} +#endif + +/* + * Function irda_setsockopt (sock, level, optname, optval, optlen) + * + * Set some options for the socket + * + */ +static int irda_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct irda_sock *self = irda_sk(sk); + struct irda_ias_set *ias_opt; + struct ias_object *ias_obj; + struct ias_attrib * ias_attr; /* Attribute in IAS object */ + int opt, free_ias = 0, err = 0; + + IRDA_DEBUG(2, "%s(%p)\n", __func__, self); + + if (level != SOL_IRLMP) + return -ENOPROTOOPT; + + lock_sock(sk); + + switch (optname) { + case IRLMP_IAS_SET: + /* The user want to add an attribute to an existing IAS object + * (in the IAS database) or to create a new object with this + * attribute. + * We first query IAS to know if the object exist, and then + * create the right attribute... + */ + + if (optlen != sizeof(struct irda_ias_set)) { + err = -EINVAL; + goto out; + } + + ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC); + if (ias_opt == NULL) { + err = -ENOMEM; + goto out; + } + + /* Copy query to the driver. */ + if (copy_from_user(ias_opt, optval, optlen)) { + kfree(ias_opt); + err = -EFAULT; + goto out; + } + + /* Find the object we target. 
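To make the poll semantics above concrete: POLLIN is reported while the receive queue is non-empty, POLLHUP on shutdown or a closed connection, and POLLOUT only while IrTTP flow control is in FLOW_START and the socket is writeable. A small sketch of a userspace wait using the standard poll() API:

#include <poll.h>

/* Wait for the IrDA socket to become readable or writable.  Returns
 * the revents bits, 0 on timeout, or -1 on error/hangup. */
static int irda_wait_io(int fd, int timeout_ms)
{
        struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT };
        int n = poll(&pfd, 1, timeout_ms);

        if (n <= 0)
                return n;
        if (pfd.revents & (POLLERR | POLLHUP))
                return -1;
        return pfd.revents;
}
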
+ * If the user gives us an empty string, we use the object + * associated with this socket. This will workaround + * duplicated class name - Jean II */ + if(ias_opt->irda_class_name[0] == '\0') { + if(self->ias_obj == NULL) { + kfree(ias_opt); + err = -EINVAL; + goto out; + } + ias_obj = self->ias_obj; + } else + ias_obj = irias_find_object(ias_opt->irda_class_name); + + /* Only ROOT can mess with the global IAS database. + * Users can only add attributes to the object associated + * with the socket they own - Jean II */ + if((!capable(CAP_NET_ADMIN)) && + ((ias_obj == NULL) || (ias_obj != self->ias_obj))) { + kfree(ias_opt); + err = -EPERM; + goto out; + } + + /* If the object doesn't exist, create it */ + if(ias_obj == (struct ias_object *) NULL) { + /* Create a new object */ + ias_obj = irias_new_object(ias_opt->irda_class_name, + jiffies); + if (ias_obj == NULL) { + kfree(ias_opt); + err = -ENOMEM; + goto out; + } + free_ias = 1; + } + + /* Do we have the attribute already ? */ + if(irias_find_attrib(ias_obj, ias_opt->irda_attrib_name)) { + kfree(ias_opt); + if (free_ias) { + kfree(ias_obj->name); + kfree(ias_obj); + } + err = -EINVAL; + goto out; + } + + /* Look at the type */ + switch(ias_opt->irda_attrib_type) { + case IAS_INTEGER: + /* Add an integer attribute */ + irias_add_integer_attrib( + ias_obj, + ias_opt->irda_attrib_name, + ias_opt->attribute.irda_attrib_int, + IAS_USER_ATTR); + break; + case IAS_OCT_SEQ: + /* Check length */ + if(ias_opt->attribute.irda_attrib_octet_seq.len > + IAS_MAX_OCTET_STRING) { + kfree(ias_opt); + if (free_ias) { + kfree(ias_obj->name); + kfree(ias_obj); + } + + err = -EINVAL; + goto out; + } + /* Add an octet sequence attribute */ + irias_add_octseq_attrib( + ias_obj, + ias_opt->irda_attrib_name, + ias_opt->attribute.irda_attrib_octet_seq.octet_seq, + ias_opt->attribute.irda_attrib_octet_seq.len, + IAS_USER_ATTR); + break; + case IAS_STRING: + /* Should check charset & co */ + /* Check length */ + /* The length is encoded in a __u8, and + * IAS_MAX_STRING == 256, so there is no way + * userspace can pass us a string too large. + * Jean II */ + /* NULL terminate the string (avoid troubles) */ + ias_opt->attribute.irda_attrib_string.string[ias_opt->attribute.irda_attrib_string.len] = '\0'; + /* Add a string attribute */ + irias_add_string_attrib( + ias_obj, + ias_opt->irda_attrib_name, + ias_opt->attribute.irda_attrib_string.string, + IAS_USER_ATTR); + break; + default : + kfree(ias_opt); + if (free_ias) { + kfree(ias_obj->name); + kfree(ias_obj); + } + err = -EINVAL; + goto out; + } + irias_insert_object(ias_obj); + kfree(ias_opt); + break; + case IRLMP_IAS_DEL: + /* The user want to delete an object from our local IAS + * database. We just need to query the IAS, check is the + * object is not owned by the kernel and delete it. + */ + + if (optlen != sizeof(struct irda_ias_set)) { + err = -EINVAL; + goto out; + } + + ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC); + if (ias_opt == NULL) { + err = -ENOMEM; + goto out; + } + + /* Copy query to the driver. */ + if (copy_from_user(ias_opt, optval, optlen)) { + kfree(ias_opt); + err = -EFAULT; + goto out; + } + + /* Find the object we target. + * If the user gives us an empty string, we use the object + * associated with this socket. 
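A userspace sketch of the IRLMP_IAS_SET path above: add an integer attribute to the IAS object that belongs to this (already bound) socket. The empty class name selects the socket's own object, exactly as the code above handles it; "MyAppVersion" would be a made-up attribute name, and the struct layout is the one from <linux/irda.h>.

#include <string.h>
#include <sys/socket.h>
#include <linux/types.h>
#include <linux/irda.h>

/* Publish an integer attribute on the socket's own IAS object. */
static int irda_ias_set_int(int fd, const char *attrib, unsigned int value)
{
        struct irda_ias_set ias;

        memset(&ias, 0, sizeof(ias));
        ias.irda_class_name[0] = '\0';          /* use this socket's object */
        strncpy(ias.irda_attrib_name, attrib,
                sizeof(ias.irda_attrib_name) - 1);
        ias.irda_attrib_type = IAS_INTEGER;
        ias.attribute.irda_attrib_int = value;

        /* optlen must be exactly sizeof(struct irda_ias_set), see above */
        return setsockopt(fd, SOL_IRLMP, IRLMP_IAS_SET, &ias, sizeof(ias));
}
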
This will workaround + * duplicated class name - Jean II */ + if(ias_opt->irda_class_name[0] == '\0') + ias_obj = self->ias_obj; + else + ias_obj = irias_find_object(ias_opt->irda_class_name); + if(ias_obj == (struct ias_object *) NULL) { + kfree(ias_opt); + err = -EINVAL; + goto out; + } + + /* Only ROOT can mess with the global IAS database. + * Users can only del attributes from the object associated + * with the socket they own - Jean II */ + if((!capable(CAP_NET_ADMIN)) && + ((ias_obj == NULL) || (ias_obj != self->ias_obj))) { + kfree(ias_opt); + err = -EPERM; + goto out; + } + + /* Find the attribute (in the object) we target */ + ias_attr = irias_find_attrib(ias_obj, + ias_opt->irda_attrib_name); + if(ias_attr == (struct ias_attrib *) NULL) { + kfree(ias_opt); + err = -EINVAL; + goto out; + } + + /* Check is the user space own the object */ + if(ias_attr->value->owner != IAS_USER_ATTR) { + IRDA_DEBUG(1, "%s(), attempting to delete a kernel attribute\n", __func__); + kfree(ias_opt); + err = -EPERM; + goto out; + } + + /* Remove the attribute (and maybe the object) */ + irias_delete_attrib(ias_obj, ias_attr, 1); + kfree(ias_opt); + break; + case IRLMP_MAX_SDU_SIZE: + if (optlen < sizeof(int)) { + err = -EINVAL; + goto out; + } + + if (get_user(opt, (int __user *)optval)) { + err = -EFAULT; + goto out; + } + + /* Only possible for a seqpacket service (TTP with SAR) */ + if (sk->sk_type != SOCK_SEQPACKET) { + IRDA_DEBUG(2, "%s(), setting max_sdu_size = %d\n", + __func__, opt); + self->max_sdu_size_rx = opt; + } else { + IRDA_WARNING("%s: not allowed to set MAXSDUSIZE for this socket type!\n", + __func__); + err = -ENOPROTOOPT; + goto out; + } + break; + case IRLMP_HINTS_SET: + if (optlen < sizeof(int)) { + err = -EINVAL; + goto out; + } + + /* The input is really a (__u8 hints[2]), easier as an int */ + if (get_user(opt, (int __user *)optval)) { + err = -EFAULT; + goto out; + } + + /* Unregister any old registration */ + if (self->skey) + irlmp_unregister_service(self->skey); + + self->skey = irlmp_register_service((__u16) opt); + break; + case IRLMP_HINT_MASK_SET: + /* As opposed to the previous case which set the hint bits + * that we advertise, this one set the filter we use when + * making a discovery (nodes which don't match any hint + * bit in the mask are not reported). + */ + if (optlen < sizeof(int)) { + err = -EINVAL; + goto out; + } + + /* The input is really a (__u8 hints[2]), easier as an int */ + if (get_user(opt, (int __user *)optval)) { + err = -EFAULT; + goto out; + } + + /* Set the new hint mask */ + self->mask.word = (__u16) opt; + /* Mask out extension bits */ + self->mask.word &= 0x7f7f; + /* Check if no bits */ + if(!self->mask.word) + self->mask.word = 0xFFFF; + + break; + default: + err = -ENOPROTOOPT; + break; + } + +out: + release_sock(sk); + + return err; +} + +/* + * Function irda_extract_ias_value(ias_opt, ias_value) + * + * Translate internal IAS value structure to the user space representation + * + * The external representation of IAS values, as we exchange them with + * user space program is quite different from the internal representation, + * as stored in the IAS database (because we need a flat structure for + * crossing kernel boundary). + * This function transform the former in the latter. We also check + * that the value type is valid. 
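The hint options above take the two IrLMP hint bytes packed into an int. How hints[0] and hints[1] map onto that int follows the irda_hint_t union, so the packing in this sketch is an assumption that holds on little-endian hosts (hints[0] in the low byte); the HINT_* constants are the ones printed by the discovery /proc code later in this patch.

#include <sys/socket.h>
#include <linux/types.h>
#include <linux/irda.h>

/* Advertise this node as a computer and only discover LAN access
 * points.  Packing assumes hints[0] is the low byte of the int. */
static int irda_set_hints(int fd)
{
        int service = HINT_COMPUTER;    /* what we advertise */
        int filter  = HINT_LAN;         /* what we want to see */

        if (setsockopt(fd, SOL_IRLMP, IRLMP_HINTS_SET,
                       &service, sizeof(service)) < 0)
                return -1;
        return setsockopt(fd, SOL_IRLMP, IRLMP_HINT_MASK_SET,
                          &filter, sizeof(filter));
}
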
+ */ +static int irda_extract_ias_value(struct irda_ias_set *ias_opt, + struct ias_value *ias_value) +{ + /* Look at the type */ + switch (ias_value->type) { + case IAS_INTEGER: + /* Copy the integer */ + ias_opt->attribute.irda_attrib_int = ias_value->t.integer; + break; + case IAS_OCT_SEQ: + /* Set length */ + ias_opt->attribute.irda_attrib_octet_seq.len = ias_value->len; + /* Copy over */ + memcpy(ias_opt->attribute.irda_attrib_octet_seq.octet_seq, + ias_value->t.oct_seq, ias_value->len); + break; + case IAS_STRING: + /* Set length */ + ias_opt->attribute.irda_attrib_string.len = ias_value->len; + ias_opt->attribute.irda_attrib_string.charset = ias_value->charset; + /* Copy over */ + memcpy(ias_opt->attribute.irda_attrib_string.string, + ias_value->t.string, ias_value->len); + /* NULL terminate the string (avoid troubles) */ + ias_opt->attribute.irda_attrib_string.string[ias_value->len] = '\0'; + break; + case IAS_MISSING: + default : + return -EINVAL; + } + + /* Copy type over */ + ias_opt->irda_attrib_type = ias_value->type; + + return 0; +} + +/* + * Function irda_getsockopt (sock, level, optname, optval, optlen) + */ +static int irda_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct irda_sock *self = irda_sk(sk); + struct irda_device_list list; + struct irda_device_info *discoveries; + struct irda_ias_set * ias_opt; /* IAS get/query params */ + struct ias_object * ias_obj; /* Object in IAS */ + struct ias_attrib * ias_attr; /* Attribute in IAS object */ + int daddr = DEV_ADDR_ANY; /* Dest address for IAS queries */ + int val = 0; + int len = 0; + int err = 0; + int offset, total; + + IRDA_DEBUG(2, "%s(%p)\n", __func__, self); + + if (level != SOL_IRLMP) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + + if(len < 0) + return -EINVAL; + + lock_sock(sk); + + switch (optname) { + case IRLMP_ENUMDEVICES: + + /* Offset to first device entry */ + offset = sizeof(struct irda_device_list) - + sizeof(struct irda_device_info); + + if (len < offset) { + err = -EINVAL; + goto out; + } + + /* Ask lmp for the current discovery log */ + discoveries = irlmp_get_discoveries(&list.len, self->mask.word, + self->nslots); + /* Check if the we got some results */ + if (discoveries == NULL) { + err = -EAGAIN; + goto out; /* Didn't find any devices */ + } + + /* Write total list length back to client */ + if (copy_to_user(optval, &list, offset)) + err = -EFAULT; + + /* Copy the list itself - watch for overflow */ + if (list.len > 2048) { + err = -EINVAL; + goto bed; + } + total = offset + (list.len * sizeof(struct irda_device_info)); + if (total > len) + total = len; + if (copy_to_user(optval+offset, discoveries, total - offset)) + err = -EFAULT; + + /* Write total number of bytes used back to client */ + if (put_user(total, optlen)) + err = -EFAULT; +bed: + /* Free up our buffer */ + kfree(discoveries); + break; + case IRLMP_MAX_SDU_SIZE: + val = self->max_data_size; + len = sizeof(int); + if (put_user(len, optlen)) { + err = -EFAULT; + goto out; + } + + if (copy_to_user(optval, &val, len)) { + err = -EFAULT; + goto out; + } + + break; + case IRLMP_IAS_GET: + /* The user want an object from our local IAS database. 
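A userspace sketch of the IRLMP_ENUMDEVICES option implemented above. The option value is a struct irda_device_list whose trailing array is variable length, so the caller reserves room for a few entries and the kernel reports the count in the len field; the dev[] and info member names are assumed to match <linux/irda.h>.

#include <stdio.h>
#include <sys/socket.h>
#include <linux/types.h>
#include <linux/irda.h>

#define MAX_EXTRA_DEVICES 7

/* Dump the current discovery log.  Returns the number of devices, or
 * -1 if nothing has been discovered yet (getsockopt fails with EAGAIN). */
static int irda_enum_devices(int fd)
{
        unsigned char buf[sizeof(struct irda_device_list) +
                          MAX_EXTRA_DEVICES * sizeof(struct irda_device_info)];
        struct irda_device_list *list = (struct irda_device_list *)buf;
        socklen_t len = sizeof(buf);
        unsigned int i;

        if (getsockopt(fd, SOL_IRLMP, IRLMP_ENUMDEVICES, buf, &len) < 0)
                return -1;

        for (i = 0; i < list->len && i <= MAX_EXTRA_DEVICES; i++)
                printf("daddr=%08x nickname=%s\n",
                       (unsigned int)list->dev[i].daddr, list->dev[i].info);

        return (int)list->len;
}
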
+ * We just need to query the IAS and return the value + * that we found */ + + /* Check that the user has allocated the right space for us */ + if (len != sizeof(struct irda_ias_set)) { + err = -EINVAL; + goto out; + } + + ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC); + if (ias_opt == NULL) { + err = -ENOMEM; + goto out; + } + + /* Copy query to the driver. */ + if (copy_from_user(ias_opt, optval, len)) { + kfree(ias_opt); + err = -EFAULT; + goto out; + } + + /* Find the object we target. + * If the user gives us an empty string, we use the object + * associated with this socket. This will workaround + * duplicated class name - Jean II */ + if(ias_opt->irda_class_name[0] == '\0') + ias_obj = self->ias_obj; + else + ias_obj = irias_find_object(ias_opt->irda_class_name); + if(ias_obj == (struct ias_object *) NULL) { + kfree(ias_opt); + err = -EINVAL; + goto out; + } + + /* Find the attribute (in the object) we target */ + ias_attr = irias_find_attrib(ias_obj, + ias_opt->irda_attrib_name); + if(ias_attr == (struct ias_attrib *) NULL) { + kfree(ias_opt); + err = -EINVAL; + goto out; + } + + /* Translate from internal to user structure */ + err = irda_extract_ias_value(ias_opt, ias_attr->value); + if(err) { + kfree(ias_opt); + goto out; + } + + /* Copy reply to the user */ + if (copy_to_user(optval, ias_opt, + sizeof(struct irda_ias_set))) { + kfree(ias_opt); + err = -EFAULT; + goto out; + } + /* Note : don't need to put optlen, we checked it */ + kfree(ias_opt); + break; + case IRLMP_IAS_QUERY: + /* The user want an object from a remote IAS database. + * We need to use IAP to query the remote database and + * then wait for the answer to come back. */ + + /* Check that the user has allocated the right space for us */ + if (len != sizeof(struct irda_ias_set)) { + err = -EINVAL; + goto out; + } + + ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC); + if (ias_opt == NULL) { + err = -ENOMEM; + goto out; + } + + /* Copy query to the driver. */ + if (copy_from_user(ias_opt, optval, len)) { + kfree(ias_opt); + err = -EFAULT; + goto out; + } + + /* At this point, there are two cases... + * 1) the socket is connected - that's the easy case, we + * just query the device we are connected to... + * 2) the socket is not connected - the user doesn't want + * to connect and/or may not have a valid service name + * (so can't create a fake connection). In this case, + * we assume that the user pass us a valid destination + * address in the requesting structure... 
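A userspace sketch of the IRLMP_IAS_QUERY path above: ask the remote LM-IAS for the LSAP selector of a service before connecting. "IrDA:TinyTP:LsapSel" is the conventional attribute name for TinyTP services; the struct fields are the same ones the kernel code above reads.

#include <string.h>
#include <sys/socket.h>
#include <linux/types.h>
#include <linux/irda.h>

/* Query the remote IAS database; returns the LSAP selector or -1. */
static int irda_query_lsap(int fd, __u32 daddr, const char *class_name)
{
        struct irda_ias_set q;
        socklen_t len = sizeof(q);

        memset(&q, 0, sizeof(q));
        q.daddr = daddr;                /* used when not yet connected */
        strncpy(q.irda_class_name, class_name, sizeof(q.irda_class_name) - 1);
        strncpy(q.irda_attrib_name, "IrDA:TinyTP:LsapSel",
                sizeof(q.irda_attrib_name) - 1);

        if (getsockopt(fd, SOL_IRLMP, IRLMP_IAS_QUERY, &q, &len) < 0)
                return -1;              /* EADDRNOTAVAIL: no such attribute */
        if (q.irda_attrib_type != IAS_INTEGER)
                return -1;
        return (int)q.attribute.irda_attrib_int;
}
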
+ */ + if(self->daddr != DEV_ADDR_ANY) { + /* We are connected - reuse known daddr */ + daddr = self->daddr; + } else { + /* We are not connected, we must specify a valid + * destination address */ + daddr = ias_opt->daddr; + if((!daddr) || (daddr == DEV_ADDR_ANY)) { + kfree(ias_opt); + err = -EINVAL; + goto out; + } + } + + /* Check that we can proceed with IAP */ + if (self->iriap) { + IRDA_WARNING("%s: busy with a previous query\n", + __func__); + kfree(ias_opt); + err = -EBUSY; + goto out; + } + + self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self, + irda_getvalue_confirm); + + if (self->iriap == NULL) { + kfree(ias_opt); + err = -ENOMEM; + goto out; + } + + /* Treat unexpected wakeup as disconnect */ + self->errno = -EHOSTUNREACH; + + /* Query remote LM-IAS */ + iriap_getvaluebyclass_request(self->iriap, + self->saddr, daddr, + ias_opt->irda_class_name, + ias_opt->irda_attrib_name); + + /* Wait for answer, if not yet finished (or failed) */ + if (wait_event_interruptible(self->query_wait, + (self->iriap == NULL))) { + /* pending request uses copy of ias_opt-content + * we can free it regardless! */ + kfree(ias_opt); + /* Treat signals as disconnect */ + err = -EHOSTUNREACH; + goto out; + } + + /* Check what happened */ + if (self->errno) + { + kfree(ias_opt); + /* Requested object/attribute doesn't exist */ + if((self->errno == IAS_CLASS_UNKNOWN) || + (self->errno == IAS_ATTRIB_UNKNOWN)) + err = -EADDRNOTAVAIL; + else + err = -EHOSTUNREACH; + + goto out; + } + + /* Translate from internal to user structure */ + err = irda_extract_ias_value(ias_opt, self->ias_result); + if (self->ias_result) + irias_delete_value(self->ias_result); + if (err) { + kfree(ias_opt); + goto out; + } + + /* Copy reply to the user */ + if (copy_to_user(optval, ias_opt, + sizeof(struct irda_ias_set))) { + kfree(ias_opt); + err = -EFAULT; + goto out; + } + /* Note : don't need to put optlen, we checked it */ + kfree(ias_opt); + break; + case IRLMP_WAITDEVICE: + /* This function is just another way of seeing life ;-) + * IRLMP_ENUMDEVICES assumes that you have a static network, + * and that you just want to pick one of the devices present. + * On the other hand, in here we assume that no device is + * present and that at some point in the future a device will + * come into range. When this device arrive, we just wake + * up the caller, so that he has time to connect to it before + * the device goes away... + * Note : once the node has been discovered for more than a + * few second, it won't trigger this function, unless it + * goes away and come back changes its hint bits (so we + * might call it IRLMP_WAITNEWDEVICE). + */ + + /* Check that the user is passing us an int */ + if (len != sizeof(int)) { + err = -EINVAL; + goto out; + } + /* Get timeout in ms (max time we block the caller) */ + if (get_user(val, (int __user *)optval)) { + err = -EFAULT; + goto out; + } + + /* Tell IrLMP we want to be notified */ + irlmp_update_client(self->ckey, self->mask.word, + irda_selective_discovery_indication, + NULL, (void *) self); + + /* Do some discovery (and also return cached results) */ + irlmp_discovery_request(self->nslots); + + /* Wait until a node is discovered */ + if (!self->cachedaddr) { + IRDA_DEBUG(1, "%s(), nothing discovered yet, going to sleep...\n", __func__); + + /* Set watchdog timer to expire in ms. 
*/ + self->errno = 0; + setup_timer(&self->watchdog, irda_discovery_timeout, + (unsigned long)self); + self->watchdog.expires = jiffies + (val * HZ/1000); + add_timer(&(self->watchdog)); + + /* Wait for IR-LMP to call us back */ + __wait_event_interruptible(self->query_wait, + (self->cachedaddr != 0 || self->errno == -ETIME), + err); + + /* If watchdog is still activated, kill it! */ + if(timer_pending(&(self->watchdog))) + del_timer(&(self->watchdog)); + + IRDA_DEBUG(1, "%s(), ...waking up !\n", __func__); + + if (err != 0) + goto out; + } + else + IRDA_DEBUG(1, "%s(), found immediately !\n", + __func__); + + /* Tell IrLMP that we have been notified */ + irlmp_update_client(self->ckey, self->mask.word, + NULL, NULL, NULL); + + /* Check if the we got some results */ + if (!self->cachedaddr) + return -EAGAIN; /* Didn't find any devices */ + daddr = self->cachedaddr; + /* Cleanup */ + self->cachedaddr = 0; + + /* We return the daddr of the device that trigger the + * wakeup. As irlmp pass us only the new devices, we + * are sure that it's not an old device. + * If the user want more details, he should query + * the whole discovery log and pick one device... + */ + if (put_user(daddr, (int __user *)optval)) { + err = -EFAULT; + goto out; + } + + break; + default: + err = -ENOPROTOOPT; + } + +out: + + release_sock(sk); + + return err; +} + +static const struct net_proto_family irda_family_ops = { + .family = PF_IRDA, + .create = irda_create, + .owner = THIS_MODULE, +}; + +static const struct proto_ops irda_stream_ops = { + .family = PF_IRDA, + .owner = THIS_MODULE, + .release = irda_release, + .bind = irda_bind, + .connect = irda_connect, + .socketpair = sock_no_socketpair, + .accept = irda_accept, + .getname = irda_getname, + .poll = irda_poll, + .ioctl = irda_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = irda_compat_ioctl, +#endif + .listen = irda_listen, + .shutdown = irda_shutdown, + .setsockopt = irda_setsockopt, + .getsockopt = irda_getsockopt, + .sendmsg = irda_sendmsg, + .recvmsg = irda_recvmsg_stream, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static const struct proto_ops irda_seqpacket_ops = { + .family = PF_IRDA, + .owner = THIS_MODULE, + .release = irda_release, + .bind = irda_bind, + .connect = irda_connect, + .socketpair = sock_no_socketpair, + .accept = irda_accept, + .getname = irda_getname, + .poll = datagram_poll, + .ioctl = irda_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = irda_compat_ioctl, +#endif + .listen = irda_listen, + .shutdown = irda_shutdown, + .setsockopt = irda_setsockopt, + .getsockopt = irda_getsockopt, + .sendmsg = irda_sendmsg, + .recvmsg = irda_recvmsg_dgram, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static const struct proto_ops irda_dgram_ops = { + .family = PF_IRDA, + .owner = THIS_MODULE, + .release = irda_release, + .bind = irda_bind, + .connect = irda_connect, + .socketpair = sock_no_socketpair, + .accept = irda_accept, + .getname = irda_getname, + .poll = datagram_poll, + .ioctl = irda_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = irda_compat_ioctl, +#endif + .listen = irda_listen, + .shutdown = irda_shutdown, + .setsockopt = irda_setsockopt, + .getsockopt = irda_getsockopt, + .sendmsg = irda_sendmsg_dgram, + .recvmsg = irda_recvmsg_dgram, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +#ifdef CONFIG_IRDA_ULTRA +static const struct proto_ops irda_ultra_ops = { + .family = PF_IRDA, + .owner = THIS_MODULE, + .release = irda_release, + .bind = irda_bind, + .connect = sock_no_connect, + 
.socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = irda_getname, + .poll = datagram_poll, + .ioctl = irda_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = irda_compat_ioctl, +#endif + .listen = sock_no_listen, + .shutdown = irda_shutdown, + .setsockopt = irda_setsockopt, + .getsockopt = irda_getsockopt, + .sendmsg = irda_sendmsg_ultra, + .recvmsg = irda_recvmsg_dgram, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; +#endif /* CONFIG_IRDA_ULTRA */ + +/* + * Function irsock_init (pro) + * + * Initialize IrDA protocol + * + */ +int __init irsock_init(void) +{ + int rc = proto_register(&irda_proto, 0); + + if (rc == 0) + rc = sock_register(&irda_family_ops); + + return rc; +} + +/* + * Function irsock_cleanup (void) + * + * Remove IrDA protocol + * + */ +void irsock_cleanup(void) +{ + sock_unregister(PF_IRDA); + proto_unregister(&irda_proto); +} diff --git a/net/irda/discovery.c b/net/irda/discovery.c new file mode 100644 index 00000000..36c3f037 --- /dev/null +++ b/net/irda/discovery.c @@ -0,0 +1,422 @@ +/********************************************************************* + * + * Filename: discovery.c + * Version: 0.1 + * Description: Routines for handling discoveries at the IrLMP layer + * Status: Experimental. + * Author: Dag Brattli + * Created at: Tue Apr 6 15:33:50 1999 + * Modified at: Sat Oct 9 17:11:31 1999 + * Modified by: Dag Brattli + * Modified at: Fri May 28 3:11 CST 1999 + * Modified by: Horst von Brand + * + * Copyright (c) 1999 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include + +/* + * Function irlmp_add_discovery (cachelog, discovery) + * + * Add a new discovery to the cachelog, and remove any old discoveries + * from the same device + * + * Note : we try to preserve the time this device was *first* discovered + * (as opposed to the time of last discovery used for cleanup). This is + * used by clients waiting for discovery events to tell if the device + * discovered is "new" or just the same old one. They can't rely there + * on a binary flag (new/old), because not all discovery events are + * propagated to them, and they might not always listen, so they would + * miss some new devices popping up... + * Jean II + */ +void irlmp_add_discovery(hashbin_t *cachelog, discovery_t *new) +{ + discovery_t *discovery, *node; + unsigned long flags; + + /* Set time of first discovery if node is new (see below) */ + new->firststamp = new->timestamp; + + spin_lock_irqsave(&cachelog->hb_spinlock, flags); + + /* + * Remove all discoveries of devices that has previously been + * discovered on the same link with the same name (info), or the + * same daddr. 
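The replacement rule described above reduces to one predicate: an existing cachelog entry refers to the same device when it was seen on the same link and either the device address or the nickname matches. A hypothetical helper (not part of the patch) expressing just that test:

/* True when 'old' is an earlier sighting of the device in 'new':
 * same IrLAP link (saddr) and either the same daddr or the same
 * nickname, matching the removal loop below. */
static int discovery_same_device(const discovery_t *old, const discovery_t *new)
{
        return old->data.saddr == new->data.saddr &&
               (old->data.daddr == new->data.daddr ||
                strcmp(old->data.info, new->data.info) == 0);
}
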
We do this since some devices (mostly PDAs) change + * their device address between every discovery. + */ + discovery = (discovery_t *) hashbin_get_first(cachelog); + while (discovery != NULL ) { + node = discovery; + + /* Be sure to stay one item ahead */ + discovery = (discovery_t *) hashbin_get_next(cachelog); + + if ((node->data.saddr == new->data.saddr) && + ((node->data.daddr == new->data.daddr) || + (strcmp(node->data.info, new->data.info) == 0))) + { + /* This discovery is a previous discovery + * from the same device, so just remove it + */ + hashbin_remove_this(cachelog, (irda_queue_t *) node); + /* Check if hints bits are unchanged */ + if (get_unaligned((__u16 *)node->data.hints) == get_unaligned((__u16 *)new->data.hints)) + /* Set time of first discovery for this node */ + new->firststamp = node->firststamp; + kfree(node); + } + } + + /* Insert the new and updated version */ + hashbin_insert(cachelog, (irda_queue_t *) new, new->data.daddr, NULL); + + spin_unlock_irqrestore(&cachelog->hb_spinlock, flags); +} + +/* + * Function irlmp_add_discovery_log (cachelog, log) + * + * Merge a disovery log into the cachelog. + * + */ +void irlmp_add_discovery_log(hashbin_t *cachelog, hashbin_t *log) +{ + discovery_t *discovery; + + IRDA_DEBUG(4, "%s()\n", __func__); + + /* + * If log is missing this means that IrLAP was unable to perform the + * discovery, so restart discovery again with just the half timeout + * of the normal one. + */ + /* Well... It means that there was nobody out there - Jean II */ + if (log == NULL) { + /* irlmp_start_discovery_timer(irlmp, 150); */ + return; + } + + /* + * Locking : we are the only owner of this discovery log, so + * no need to lock it. + * We just need to lock the global log in irlmp_add_discovery(). + */ + discovery = (discovery_t *) hashbin_remove_first(log); + while (discovery != NULL) { + irlmp_add_discovery(cachelog, discovery); + + discovery = (discovery_t *) hashbin_remove_first(log); + } + + /* Delete the now empty log */ + hashbin_delete(log, (FREE_FUNC) kfree); +} + +/* + * Function irlmp_expire_discoveries (log, saddr, force) + * + * Go through all discoveries and expire all that has stayed too long + * + * Note : this assume that IrLAP won't change its saddr, which + * currently is a valid assumption... + */ +void irlmp_expire_discoveries(hashbin_t *log, __u32 saddr, int force) +{ + discovery_t * discovery; + discovery_t * curr; + unsigned long flags; + discinfo_t * buffer = NULL; + int n; /* Size of the full log */ + int i = 0; /* How many we expired */ + + IRDA_ASSERT(log != NULL, return;); + IRDA_DEBUG(4, "%s()\n", __func__); + + spin_lock_irqsave(&log->hb_spinlock, flags); + + discovery = (discovery_t *) hashbin_get_first(log); + while (discovery != NULL) { + /* Be sure to be one item ahead */ + curr = discovery; + discovery = (discovery_t *) hashbin_get_next(log); + + /* Test if it's time to expire this discovery */ + if ((curr->data.saddr == saddr) && + (force || + ((jiffies - curr->timestamp) > DISCOVERY_EXPIRE_TIMEOUT))) + { + /* Create buffer as needed. + * As this function get called a lot and most time + * we don't have anything to put in the log (we are + * quite picky), we can save a lot of overhead + * by not calling kmalloc. 
Jean II */ + if(buffer == NULL) { + /* Create the client specific buffer */ + n = HASHBIN_GET_SIZE(log); + buffer = kmalloc(n * sizeof(struct irda_device_info), GFP_ATOMIC); + if (buffer == NULL) { + spin_unlock_irqrestore(&log->hb_spinlock, flags); + return; + } + + } + + /* Copy discovery information */ + memcpy(&(buffer[i]), &(curr->data), + sizeof(discinfo_t)); + i++; + + /* Remove it from the log */ + curr = hashbin_remove_this(log, (irda_queue_t *) curr); + kfree(curr); + } + } + + /* Drop the spinlock before calling the higher layers, as + * we can't guarantee they won't call us back and create a + * deadlock. We will work on our own private data, so we + * don't care to be interrupted. - Jean II */ + spin_unlock_irqrestore(&log->hb_spinlock, flags); + + if(buffer == NULL) + return; + + /* Tell IrLMP and registered clients about it */ + irlmp_discovery_expiry(buffer, i); + + /* Free up our buffer */ + kfree(buffer); +} + +#if 0 +/* + * Function irlmp_dump_discoveries (log) + * + * Print out all discoveries in log + * + */ +void irlmp_dump_discoveries(hashbin_t *log) +{ + discovery_t *discovery; + + IRDA_ASSERT(log != NULL, return;); + + discovery = (discovery_t *) hashbin_get_first(log); + while (discovery != NULL) { + IRDA_DEBUG(0, "Discovery:\n"); + IRDA_DEBUG(0, " daddr=%08x\n", discovery->data.daddr); + IRDA_DEBUG(0, " saddr=%08x\n", discovery->data.saddr); + IRDA_DEBUG(0, " nickname=%s\n", discovery->data.info); + + discovery = (discovery_t *) hashbin_get_next(log); + } +} +#endif + +/* + * Function irlmp_copy_discoveries (log, pn, mask) + * + * Copy all discoveries in a buffer + * + * This function implement a safe way for lmp clients to access the + * discovery log. The basic problem is that we don't want the log + * to change (add/remove) while the client is reading it. If the + * lmp client manipulate directly the hashbin, he is sure to get + * into troubles... + * The idea is that we copy all the current discovery log in a buffer + * which is specific to the client and pass this copy to him. As we + * do this operation with the spinlock grabbed, we are safe... + * Note : we don't want those clients to grab the spinlock, because + * we have no control on how long they will hold it... + * Note : we choose to copy the log in "struct irda_device_info" to + * save space... + * Note : the client must kfree himself() the log... + * Jean II + */ +struct irda_device_info *irlmp_copy_discoveries(hashbin_t *log, int *pn, + __u16 mask, int old_entries) +{ + discovery_t * discovery; + unsigned long flags; + discinfo_t * buffer = NULL; + int j_timeout = (sysctl_discovery_timeout * HZ); + int n; /* Size of the full log */ + int i = 0; /* How many we picked */ + + IRDA_ASSERT(pn != NULL, return NULL;); + IRDA_ASSERT(log != NULL, return NULL;); + + /* Save spin lock */ + spin_lock_irqsave(&log->hb_spinlock, flags); + + discovery = (discovery_t *) hashbin_get_first(log); + while (discovery != NULL) { + /* Mask out the ones we don't want : + * We want to match the discovery mask, and to get only + * the most recent one (unless we want old ones) */ + if ((get_unaligned((__u16 *)discovery->data.hints) & mask) && + ((old_entries) || + ((jiffies - discovery->firststamp) < j_timeout))) { + /* Create buffer as needed. + * As this function get called a lot and most time + * we don't have anything to put in the log (we are + * quite picky), we can save a lot of overhead + * by not calling kmalloc. 
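As the comment above says, the returned snapshot belongs to the caller. A hypothetical LMP-client sketch of the intended usage of irlmp_copy_discoveries(): take the private copy, walk it without any lock held, then kfree() it.

/* Walk a private snapshot of the discovery log without holding the
 * hashbin spinlock, then release it as required. */
static void client_scan_log(hashbin_t *log, __u16 mask)
{
        struct irda_device_info *snap;
        int i, n = 0;

        snap = irlmp_copy_discoveries(log, &n, mask, 1 /* old entries too */);
        if (snap == NULL)
                return;

        for (i = 0; i < n; i++)
                IRDA_DEBUG(1, "%s(), daddr=%08x nickname=%s\n",
                           __func__, snap[i].daddr, snap[i].info);

        kfree(snap);    /* the copy is owned by the caller */
}
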
Jean II */ + if(buffer == NULL) { + /* Create the client specific buffer */ + n = HASHBIN_GET_SIZE(log); + buffer = kmalloc(n * sizeof(struct irda_device_info), GFP_ATOMIC); + if (buffer == NULL) { + spin_unlock_irqrestore(&log->hb_spinlock, flags); + return NULL; + } + + } + + /* Copy discovery information */ + memcpy(&(buffer[i]), &(discovery->data), + sizeof(discinfo_t)); + i++; + } + discovery = (discovery_t *) hashbin_get_next(log); + } + + spin_unlock_irqrestore(&log->hb_spinlock, flags); + + /* Get the actual number of device in the buffer and return */ + *pn = i; + return buffer; +} + +#ifdef CONFIG_PROC_FS +static inline discovery_t *discovery_seq_idx(loff_t pos) + +{ + discovery_t *discovery; + + for (discovery = (discovery_t *) hashbin_get_first(irlmp->cachelog); + discovery != NULL; + discovery = (discovery_t *) hashbin_get_next(irlmp->cachelog)) { + if (pos-- == 0) + break; + } + + return discovery; +} + +static void *discovery_seq_start(struct seq_file *seq, loff_t *pos) +{ + spin_lock_irq(&irlmp->cachelog->hb_spinlock); + return *pos ? discovery_seq_idx(*pos - 1) : SEQ_START_TOKEN; +} + +static void *discovery_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return (v == SEQ_START_TOKEN) + ? (void *) hashbin_get_first(irlmp->cachelog) + : (void *) hashbin_get_next(irlmp->cachelog); +} + +static void discovery_seq_stop(struct seq_file *seq, void *v) +{ + spin_unlock_irq(&irlmp->cachelog->hb_spinlock); +} + +static int discovery_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "IrLMP: Discovery log:\n\n"); + else { + const discovery_t *discovery = v; + + seq_printf(seq, "nickname: %s, hint: 0x%02x%02x", + discovery->data.info, + discovery->data.hints[0], + discovery->data.hints[1]); +#if 0 + if ( discovery->data.hints[0] & HINT_PNP) + seq_puts(seq, "PnP Compatible "); + if ( discovery->data.hints[0] & HINT_PDA) + seq_puts(seq, "PDA/Palmtop "); + if ( discovery->data.hints[0] & HINT_COMPUTER) + seq_puts(seq, "Computer "); + if ( discovery->data.hints[0] & HINT_PRINTER) + seq_puts(seq, "Printer "); + if ( discovery->data.hints[0] & HINT_MODEM) + seq_puts(seq, "Modem "); + if ( discovery->data.hints[0] & HINT_FAX) + seq_puts(seq, "Fax "); + if ( discovery->data.hints[0] & HINT_LAN) + seq_puts(seq, "LAN Access "); + + if ( discovery->data.hints[1] & HINT_TELEPHONY) + seq_puts(seq, "Telephony "); + if ( discovery->data.hints[1] & HINT_FILE_SERVER) + seq_puts(seq, "File Server "); + if ( discovery->data.hints[1] & HINT_COMM) + seq_puts(seq, "IrCOMM "); + if ( discovery->data.hints[1] & HINT_OBEX) + seq_puts(seq, "IrOBEX "); +#endif + seq_printf(seq,", saddr: 0x%08x, daddr: 0x%08x\n\n", + discovery->data.saddr, + discovery->data.daddr); + + seq_putc(seq, '\n'); + } + return 0; +} + +static const struct seq_operations discovery_seq_ops = { + .start = discovery_seq_start, + .next = discovery_seq_next, + .stop = discovery_seq_stop, + .show = discovery_seq_show, +}; + +static int discovery_seq_open(struct inode *inode, struct file *file) +{ + IRDA_ASSERT(irlmp != NULL, return -EINVAL;); + + return seq_open(file, &discovery_seq_ops); +} + +const struct file_operations discovery_seq_fops = { + .owner = THIS_MODULE, + .open = discovery_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif diff --git a/net/irda/ircomm/Kconfig b/net/irda/ircomm/Kconfig new file mode 100644 index 00000000..2d4c6b4a --- /dev/null +++ b/net/irda/ircomm/Kconfig @@ -0,0 +1,12 @@ +config IRCOMM + tristate "IrCOMM protocol" 
+ depends on IRDA + help + Say Y here if you want to build support for the IrCOMM protocol. + To compile it as modules, choose M here: the modules will be + called ircomm and ircomm_tty. + IrCOMM implements serial port emulation, and makes it possible to + use all existing applications that understands TTY's with an + infrared link. Thus you should be able to use application like PPP, + minicom and others. + diff --git a/net/irda/ircomm/Makefile b/net/irda/ircomm/Makefile new file mode 100644 index 00000000..ab23b5ba --- /dev/null +++ b/net/irda/ircomm/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the Linux IrDA IrCOMM protocol layer. +# + +obj-$(CONFIG_IRCOMM) += ircomm.o ircomm-tty.o + +ircomm-y := ircomm_core.o ircomm_event.o ircomm_lmp.o ircomm_ttp.o +ircomm-tty-y := ircomm_tty.o ircomm_tty_attach.o ircomm_tty_ioctl.o ircomm_param.o diff --git a/net/irda/ircomm/ircomm_core.c b/net/irda/ircomm/ircomm_core.c new file mode 100644 index 00000000..52079f19 --- /dev/null +++ b/net/irda/ircomm/ircomm_core.c @@ -0,0 +1,592 @@ +/********************************************************************* + * + * Filename: ircomm_core.c + * Version: 1.0 + * Description: IrCOMM service interface + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sun Jun 6 20:37:34 1999 + * Modified at: Tue Dec 21 13:26:41 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1999 Dag Brattli, All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static int __ircomm_close(struct ircomm_cb *self); +static void ircomm_control_indication(struct ircomm_cb *self, + struct sk_buff *skb, int clen); + +#ifdef CONFIG_PROC_FS +extern struct proc_dir_entry *proc_irda; +static int ircomm_seq_open(struct inode *, struct file *); + +static const struct file_operations ircomm_proc_fops = { + .owner = THIS_MODULE, + .open = ircomm_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif /* CONFIG_PROC_FS */ + +hashbin_t *ircomm = NULL; + +static int __init ircomm_init(void) +{ + ircomm = hashbin_new(HB_LOCK); + if (ircomm == NULL) { + IRDA_ERROR("%s(), can't allocate hashbin!\n", __func__); + return -ENOMEM; + } + +#ifdef CONFIG_PROC_FS + { struct proc_dir_entry *ent; + ent = proc_create("ircomm", 0, proc_irda, &ircomm_proc_fops); + if (!ent) { + printk(KERN_ERR "ircomm_init: can't create /proc entry!\n"); + return -ENODEV; + } + } +#endif /* CONFIG_PROC_FS */ + + IRDA_MESSAGE("IrCOMM protocol (Dag Brattli)\n"); + + return 0; +} + +static void __exit ircomm_cleanup(void) +{ + IRDA_DEBUG(2, "%s()\n", __func__ ); + + hashbin_delete(ircomm, (FREE_FUNC) __ircomm_close); + +#ifdef CONFIG_PROC_FS + remove_proc_entry("ircomm", proc_irda); +#endif /* CONFIG_PROC_FS */ +} + +/* + * Function ircomm_open (client_notify) + * + * Start a new IrCOMM instance + * + */ +struct ircomm_cb *ircomm_open(notify_t *notify, __u8 service_type, int line) +{ + struct ircomm_cb *self = NULL; + int ret; + + IRDA_DEBUG(2, "%s(), service_type=0x%02x\n", __func__ , + service_type); + + IRDA_ASSERT(ircomm != NULL, return NULL;); + + self = kzalloc(sizeof(struct ircomm_cb), GFP_ATOMIC); + if (self == NULL) + return NULL; + + self->notify = *notify; + self->magic = IRCOMM_MAGIC; + + /* Check if we should use IrLMP or IrTTP */ + if (service_type & IRCOMM_3_WIRE_RAW) { + self->flow_status = FLOW_START; + ret = ircomm_open_lsap(self); + } else + ret = ircomm_open_tsap(self); + + if (ret < 0) { + kfree(self); + return NULL; + } + + self->service_type = service_type; + self->line = line; + + hashbin_insert(ircomm, (irda_queue_t *) self, line, NULL); + + ircomm_next_state(self, IRCOMM_IDLE); + + return self; +} + +EXPORT_SYMBOL(ircomm_open); + +/* + * Function ircomm_close_instance (self) + * + * Remove IrCOMM instance + * + */ +static int __ircomm_close(struct ircomm_cb *self) +{ + IRDA_DEBUG(2, "%s()\n", __func__ ); + + /* Disconnect link if any */ + ircomm_do_event(self, IRCOMM_DISCONNECT_REQUEST, NULL, NULL); + + /* Remove TSAP */ + if (self->tsap) { + irttp_close_tsap(self->tsap); + self->tsap = NULL; + } + + /* Remove LSAP */ + if (self->lsap) { + irlmp_close_lsap(self->lsap); + self->lsap = NULL; + } + self->magic = 0; + + kfree(self); + + return 0; +} + +/* + * Function ircomm_close (self) + * + * Closes and removes the specified IrCOMM instance + * + */ +int ircomm_close(struct ircomm_cb *self) +{ + struct ircomm_cb *entry; + + IRDA_ASSERT(self != NULL, return -EIO;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -EIO;); + + IRDA_DEBUG(0, "%s()\n", __func__ ); + + entry = 
hashbin_remove(ircomm, self->line, NULL); + + IRDA_ASSERT(entry == self, return -1;); + + return __ircomm_close(self); +} + +EXPORT_SYMBOL(ircomm_close); + +/* + * Function ircomm_connect_request (self, service_type) + * + * Impl. of this function is differ from one of the reference. This + * function does discovery as well as sending connect request + * + */ +int ircomm_connect_request(struct ircomm_cb *self, __u8 dlsap_sel, + __u32 saddr, __u32 daddr, struct sk_buff *skb, + __u8 service_type) +{ + struct ircomm_info info; + int ret; + + IRDA_DEBUG(2 , "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -1;); + + self->service_type= service_type; + + info.dlsap_sel = dlsap_sel; + info.saddr = saddr; + info.daddr = daddr; + + ret = ircomm_do_event(self, IRCOMM_CONNECT_REQUEST, skb, &info); + + return ret; +} + +EXPORT_SYMBOL(ircomm_connect_request); + +/* + * Function ircomm_connect_indication (self, qos, skb) + * + * Notify user layer about the incoming connection + * + */ +void ircomm_connect_indication(struct ircomm_cb *self, struct sk_buff *skb, + struct ircomm_info *info) +{ + IRDA_DEBUG(2, "%s()\n", __func__ ); + + /* + * If there are any data hiding in the control channel, we must + * deliver it first. The side effect is that the control channel + * will be removed from the skb + */ + if (self->notify.connect_indication) + self->notify.connect_indication(self->notify.instance, self, + info->qos, info->max_data_size, + info->max_header_size, skb); + else { + IRDA_DEBUG(0, "%s(), missing handler\n", __func__ ); + } +} + +/* + * Function ircomm_connect_response (self, userdata, max_sdu_size) + * + * User accepts connection + * + */ +int ircomm_connect_response(struct ircomm_cb *self, struct sk_buff *userdata) +{ + int ret; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -1;); + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + ret = ircomm_do_event(self, IRCOMM_CONNECT_RESPONSE, userdata, NULL); + + return ret; +} + +EXPORT_SYMBOL(ircomm_connect_response); + +/* + * Function connect_confirm (self, skb) + * + * Notify user layer that the link is now connected + * + */ +void ircomm_connect_confirm(struct ircomm_cb *self, struct sk_buff *skb, + struct ircomm_info *info) +{ + IRDA_DEBUG(4, "%s()\n", __func__ ); + + if (self->notify.connect_confirm ) + self->notify.connect_confirm(self->notify.instance, + self, info->qos, + info->max_data_size, + info->max_header_size, skb); + else { + IRDA_DEBUG(0, "%s(), missing handler\n", __func__ ); + } +} + +/* + * Function ircomm_data_request (self, userdata) + * + * Send IrCOMM data to peer device + * + */ +int ircomm_data_request(struct ircomm_cb *self, struct sk_buff *skb) +{ + int ret; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -EFAULT;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -EFAULT;); + IRDA_ASSERT(skb != NULL, return -EFAULT;); + + ret = ircomm_do_event(self, IRCOMM_DATA_REQUEST, skb, NULL); + + return ret; +} + +EXPORT_SYMBOL(ircomm_data_request); + +/* + * Function ircomm_data_indication (self, skb) + * + * Data arrived, so deliver it to user + * + */ +void ircomm_data_indication(struct ircomm_cb *self, struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(skb->len > 0, return;); + + if (self->notify.data_indication) + self->notify.data_indication(self->notify.instance, self, skb); + else { + IRDA_DEBUG(0, "%s(), missing handler\n", __func__ ); + } +} + +/* + * Function 
ircomm_process_data (self, skb) + * + * Data arrived which may contain control channel data + * + */ +void ircomm_process_data(struct ircomm_cb *self, struct sk_buff *skb) +{ + int clen; + + IRDA_ASSERT(skb->len > 0, return;); + + clen = skb->data[0]; + + /* + * Input validation check: a stir4200/mcp2150 combinations sometimes + * results in frames with clen > remaining packet size. These are + * illegal; if we throw away just this frame then it seems to carry on + * fine + */ + if (unlikely(skb->len < (clen + 1))) { + IRDA_DEBUG(2, "%s() throwing away illegal frame\n", + __func__ ); + return; + } + + /* + * If there are any data hiding in the control channel, we must + * deliver it first. The side effect is that the control channel + * will be removed from the skb + */ + if (clen > 0) + ircomm_control_indication(self, skb, clen); + + /* Remove control channel from data channel */ + skb_pull(skb, clen+1); + + if (skb->len) + ircomm_data_indication(self, skb); + else { + IRDA_DEBUG(4, "%s(), data was control info only!\n", + __func__ ); + } +} + +/* + * Function ircomm_control_request (self, params) + * + * Send control data to peer device + * + */ +int ircomm_control_request(struct ircomm_cb *self, struct sk_buff *skb) +{ + int ret; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -EFAULT;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -EFAULT;); + IRDA_ASSERT(skb != NULL, return -EFAULT;); + + ret = ircomm_do_event(self, IRCOMM_CONTROL_REQUEST, skb, NULL); + + return ret; +} + +EXPORT_SYMBOL(ircomm_control_request); + +/* + * Function ircomm_control_indication (self, skb) + * + * Data has arrived on the control channel + * + */ +static void ircomm_control_indication(struct ircomm_cb *self, + struct sk_buff *skb, int clen) +{ + IRDA_DEBUG(2, "%s()\n", __func__ ); + + /* Use udata for delivering data on the control channel */ + if (self->notify.udata_indication) { + struct sk_buff *ctrl_skb; + + /* We don't own the skb, so clone it */ + ctrl_skb = skb_clone(skb, GFP_ATOMIC); + if (!ctrl_skb) + return; + + /* Remove data channel from control channel */ + skb_trim(ctrl_skb, clen+1); + + self->notify.udata_indication(self->notify.instance, self, + ctrl_skb); + + /* Drop reference count - + * see ircomm_tty_control_indication(). 
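For reference, the frame layout handled by ircomm_process_data() and ircomm_control_indication() above is: one length octet clen, then clen octets of control parameters, then the user payload. A hypothetical standalone parser for that framing, mirroring the same validation (no skb plumbing):

/* Split an IrCOMM frame into control and payload parts.  Frames whose
 * declared control length exceeds the frame are rejected, just as
 * ircomm_process_data() drops them. */
static int ircomm_split_frame(const unsigned char *frame, size_t len,
                              const unsigned char **ctrl, size_t *clen,
                              const unsigned char **data, size_t *dlen)
{
        if (len < 1 || (size_t)frame[0] + 1 > len)
                return -EINVAL;

        *clen = frame[0];
        *ctrl = frame + 1;              /* control parameters (may be empty) */
        *data = frame + 1 + *clen;      /* user data handed to the TTY layer */
        *dlen = len - 1 - *clen;
        return 0;
}
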
*/ + dev_kfree_skb(ctrl_skb); + } else { + IRDA_DEBUG(0, "%s(), missing handler\n", __func__ ); + } +} + +/* + * Function ircomm_disconnect_request (self, userdata, priority) + * + * User layer wants to disconnect the IrCOMM connection + * + */ +int ircomm_disconnect_request(struct ircomm_cb *self, struct sk_buff *userdata) +{ + struct ircomm_info info; + int ret; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -1;); + + ret = ircomm_do_event(self, IRCOMM_DISCONNECT_REQUEST, userdata, + &info); + return ret; +} + +EXPORT_SYMBOL(ircomm_disconnect_request); + +/* + * Function disconnect_indication (self, skb) + * + * Tell user that the link has been disconnected + * + */ +void ircomm_disconnect_indication(struct ircomm_cb *self, struct sk_buff *skb, + struct ircomm_info *info) +{ + IRDA_DEBUG(2, "%s()\n", __func__ ); + + IRDA_ASSERT(info != NULL, return;); + + if (self->notify.disconnect_indication) { + self->notify.disconnect_indication(self->notify.instance, self, + info->reason, skb); + } else { + IRDA_DEBUG(0, "%s(), missing handler\n", __func__ ); + } +} + +/* + * Function ircomm_flow_request (self, flow) + * + * + * + */ +void ircomm_flow_request(struct ircomm_cb *self, LOCAL_FLOW flow) +{ + IRDA_DEBUG(2, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;); + + if (self->service_type == IRCOMM_3_WIRE_RAW) + return; + + irttp_flow_request(self->tsap, flow); +} + +EXPORT_SYMBOL(ircomm_flow_request); + +#ifdef CONFIG_PROC_FS +static void *ircomm_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct ircomm_cb *self; + loff_t off = 0; + + spin_lock_irq(&ircomm->hb_spinlock); + + for (self = (struct ircomm_cb *) hashbin_get_first(ircomm); + self != NULL; + self = (struct ircomm_cb *) hashbin_get_next(ircomm)) { + if (off++ == *pos) + break; + + } + return self; +} + +static void *ircomm_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + + return (void *) hashbin_get_next(ircomm); +} + +static void ircomm_seq_stop(struct seq_file *seq, void *v) +{ + spin_unlock_irq(&ircomm->hb_spinlock); +} + +static int ircomm_seq_show(struct seq_file *seq, void *v) +{ + const struct ircomm_cb *self = v; + + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -EINVAL; ); + + if(self->line < 0x10) + seq_printf(seq, "ircomm%d", self->line); + else + seq_printf(seq, "irlpt%d", self->line - 0x10); + + seq_printf(seq, + " state: %s, slsap_sel: %#02x, dlsap_sel: %#02x, mode:", + ircomm_state[ self->state], + self->slsap_sel, self->dlsap_sel); + + if(self->service_type & IRCOMM_3_WIRE_RAW) + seq_printf(seq, " 3-wire-raw"); + if(self->service_type & IRCOMM_3_WIRE) + seq_printf(seq, " 3-wire"); + if(self->service_type & IRCOMM_9_WIRE) + seq_printf(seq, " 9-wire"); + if(self->service_type & IRCOMM_CENTRONICS) + seq_printf(seq, " Centronics"); + seq_putc(seq, '\n'); + + return 0; +} + +static const struct seq_operations ircomm_seq_ops = { + .start = ircomm_seq_start, + .next = ircomm_seq_next, + .stop = ircomm_seq_stop, + .show = ircomm_seq_show, +}; + +static int ircomm_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ircomm_seq_ops); +} +#endif /* CONFIG_PROC_FS */ + +MODULE_AUTHOR("Dag Brattli "); +MODULE_DESCRIPTION("IrCOMM protocol"); +MODULE_LICENSE("GPL"); + +module_init(ircomm_init); +module_exit(ircomm_cleanup); diff --git a/net/irda/ircomm/ircomm_event.c b/net/irda/ircomm/ircomm_event.c new file mode 100644 index 
00000000..d78554fe --- /dev/null +++ b/net/irda/ircomm/ircomm_event.c @@ -0,0 +1,250 @@ +/********************************************************************* + * + * Filename: ircomm_event.c + * Version: 1.0 + * Description: IrCOMM layer state machine + * Status: Stable + * Author: Dag Brattli + * Created at: Sun Jun 6 20:33:11 1999 + * Modified at: Sun Dec 12 13:44:32 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1999 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +static int ircomm_state_idle(struct ircomm_cb *self, IRCOMM_EVENT event, + struct sk_buff *skb, struct ircomm_info *info); +static int ircomm_state_waiti(struct ircomm_cb *self, IRCOMM_EVENT event, + struct sk_buff *skb, struct ircomm_info *info); +static int ircomm_state_waitr(struct ircomm_cb *self, IRCOMM_EVENT event, + struct sk_buff *skb, struct ircomm_info *info); +static int ircomm_state_conn(struct ircomm_cb *self, IRCOMM_EVENT event, + struct sk_buff *skb, struct ircomm_info *info); + +const char *const ircomm_state[] = { + "IRCOMM_IDLE", + "IRCOMM_WAITI", + "IRCOMM_WAITR", + "IRCOMM_CONN", +}; + +#ifdef CONFIG_IRDA_DEBUG +static const char *const ircomm_event[] = { + "IRCOMM_CONNECT_REQUEST", + "IRCOMM_CONNECT_RESPONSE", + "IRCOMM_TTP_CONNECT_INDICATION", + "IRCOMM_LMP_CONNECT_INDICATION", + "IRCOMM_TTP_CONNECT_CONFIRM", + "IRCOMM_LMP_CONNECT_CONFIRM", + + "IRCOMM_LMP_DISCONNECT_INDICATION", + "IRCOMM_TTP_DISCONNECT_INDICATION", + "IRCOMM_DISCONNECT_REQUEST", + + "IRCOMM_TTP_DATA_INDICATION", + "IRCOMM_LMP_DATA_INDICATION", + "IRCOMM_DATA_REQUEST", + "IRCOMM_CONTROL_REQUEST", + "IRCOMM_CONTROL_INDICATION", +}; +#endif /* CONFIG_IRDA_DEBUG */ + +static int (*state[])(struct ircomm_cb *self, IRCOMM_EVENT event, + struct sk_buff *skb, struct ircomm_info *info) = +{ + ircomm_state_idle, + ircomm_state_waiti, + ircomm_state_waitr, + ircomm_state_conn, +}; + +/* + * Function ircomm_state_idle (self, event, skb) + * + * IrCOMM is currently idle + * + */ +static int ircomm_state_idle(struct ircomm_cb *self, IRCOMM_EVENT event, + struct sk_buff *skb, struct ircomm_info *info) +{ + int ret = 0; + + switch (event) { + case IRCOMM_CONNECT_REQUEST: + ircomm_next_state(self, IRCOMM_WAITI); + ret = self->issue.connect_request(self, skb, info); + break; + case IRCOMM_TTP_CONNECT_INDICATION: + case IRCOMM_LMP_CONNECT_INDICATION: + ircomm_next_state(self, IRCOMM_WAITR); + ircomm_connect_indication(self, skb, info); + break; + default: + IRDA_DEBUG(4, "%s(), unknown event: %s\n", __func__ , + ircomm_event[event]); + ret = -EINVAL; + } + return ret; +} + +/* + * Function ircomm_state_waiti (self, event, skb) + * + * The IrCOMM user has requested an 
IrCOMM connection to the remote + * device and is awaiting confirmation + */ +static int ircomm_state_waiti(struct ircomm_cb *self, IRCOMM_EVENT event, + struct sk_buff *skb, struct ircomm_info *info) +{ + int ret = 0; + + switch (event) { + case IRCOMM_TTP_CONNECT_CONFIRM: + case IRCOMM_LMP_CONNECT_CONFIRM: + ircomm_next_state(self, IRCOMM_CONN); + ircomm_connect_confirm(self, skb, info); + break; + case IRCOMM_TTP_DISCONNECT_INDICATION: + case IRCOMM_LMP_DISCONNECT_INDICATION: + ircomm_next_state(self, IRCOMM_IDLE); + ircomm_disconnect_indication(self, skb, info); + break; + default: + IRDA_DEBUG(0, "%s(), unknown event: %s\n", __func__ , + ircomm_event[event]); + ret = -EINVAL; + } + return ret; +} + +/* + * Function ircomm_state_waitr (self, event, skb) + * + * IrCOMM has received an incoming connection request and is awaiting + * response from the user + */ +static int ircomm_state_waitr(struct ircomm_cb *self, IRCOMM_EVENT event, + struct sk_buff *skb, struct ircomm_info *info) +{ + int ret = 0; + + switch (event) { + case IRCOMM_CONNECT_RESPONSE: + ircomm_next_state(self, IRCOMM_CONN); + ret = self->issue.connect_response(self, skb); + break; + case IRCOMM_DISCONNECT_REQUEST: + ircomm_next_state(self, IRCOMM_IDLE); + ret = self->issue.disconnect_request(self, skb, info); + break; + case IRCOMM_TTP_DISCONNECT_INDICATION: + case IRCOMM_LMP_DISCONNECT_INDICATION: + ircomm_next_state(self, IRCOMM_IDLE); + ircomm_disconnect_indication(self, skb, info); + break; + default: + IRDA_DEBUG(0, "%s(), unknown event = %s\n", __func__ , + ircomm_event[event]); + ret = -EINVAL; + } + return ret; +} + +/* + * Function ircomm_state_conn (self, event, skb) + * + * IrCOMM is connected to the peer IrCOMM device + * + */ +static int ircomm_state_conn(struct ircomm_cb *self, IRCOMM_EVENT event, + struct sk_buff *skb, struct ircomm_info *info) +{ + int ret = 0; + + switch (event) { + case IRCOMM_DATA_REQUEST: + ret = self->issue.data_request(self, skb, 0); + break; + case IRCOMM_TTP_DATA_INDICATION: + ircomm_process_data(self, skb); + break; + case IRCOMM_LMP_DATA_INDICATION: + ircomm_data_indication(self, skb); + break; + case IRCOMM_CONTROL_REQUEST: + /* Just send a separate frame for now */ + ret = self->issue.data_request(self, skb, skb->len); + break; + case IRCOMM_TTP_DISCONNECT_INDICATION: + case IRCOMM_LMP_DISCONNECT_INDICATION: + ircomm_next_state(self, IRCOMM_IDLE); + ircomm_disconnect_indication(self, skb, info); + break; + case IRCOMM_DISCONNECT_REQUEST: + ircomm_next_state(self, IRCOMM_IDLE); + ret = self->issue.disconnect_request(self, skb, info); + break; + default: + IRDA_DEBUG(0, "%s(), unknown event = %s\n", __func__ , + ircomm_event[event]); + ret = -EINVAL; + } + return ret; +} + +/* + * Function ircomm_do_event (self, event, skb) + * + * Process event + * + */ +int ircomm_do_event(struct ircomm_cb *self, IRCOMM_EVENT event, + struct sk_buff *skb, struct ircomm_info *info) +{ + IRDA_DEBUG(4, "%s: state=%s, event=%s\n", __func__ , + ircomm_state[self->state], ircomm_event[event]); + + return (*state[self->state])(self, event, skb, info); +} + +/* + * Function ircomm_next_state (self, state) + * + * Switch state + * + */ +void ircomm_next_state(struct ircomm_cb *self, IRCOMM_STATE state) +{ + self->state = state; + + IRDA_DEBUG(4, "%s: next state=%s, service type=%d\n", __func__ , + ircomm_state[self->state], self->service_type); +} diff --git a/net/irda/ircomm/ircomm_lmp.c b/net/irda/ircomm/ircomm_lmp.c new file mode 100644 index 00000000..3b8095c7 --- /dev/null +++ 
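/*
 * The connection state machine above is table-driven: ircomm_do_event()
 * indexes the state[] array with the current state, and each handler
 * switches on the event. A self-contained userspace sketch of the same
 * pattern, reduced to two states and two events; all names here are
 * illustrative, not the kernel's.
 */
#include <stdio.h>

enum demo_state { ST_IDLE, ST_CONN, ST_MAX };
enum demo_event { EV_CONNECT, EV_DISCONNECT };

struct demo_fsm { enum demo_state state; };

static int st_idle(struct demo_fsm *f, enum demo_event ev)
{
	if (ev == EV_CONNECT) {
		f->state = ST_CONN;
		return 0;
	}
	return -1;			/* event not valid in this state */
}

static int st_conn(struct demo_fsm *f, enum demo_event ev)
{
	if (ev == EV_DISCONNECT) {
		f->state = ST_IDLE;
		return 0;
	}
	return -1;
}

/* One handler per state, like the state[] table in ircomm_event.c. */
static int (*const demo_handler[ST_MAX])(struct demo_fsm *, enum demo_event) = {
	[ST_IDLE] = st_idle,
	[ST_CONN] = st_conn,
};

static int demo_do_event(struct demo_fsm *f, enum demo_event ev)
{
	return demo_handler[f->state](f, ev);
}

int main(void)
{
	struct demo_fsm fsm = { ST_IDLE };

	demo_do_event(&fsm, EV_CONNECT);
	printf("after connect   : state=%d\n", fsm.state);	/* 1 */
	demo_do_event(&fsm, EV_DISCONNECT);
	printf("after disconnect: state=%d\n", fsm.state);	/* 0 */
	return 0;
}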
b/net/irda/ircomm/ircomm_lmp.c @@ -0,0 +1,370 @@ +/********************************************************************* + * + * Filename: ircomm_lmp.c + * Version: 1.0 + * Description: Interface between IrCOMM and IrLMP + * Status: Stable + * Author: Dag Brattli + * Created at: Sun Jun 6 20:48:27 1999 + * Modified at: Sun Dec 12 13:44:17 1999 + * Modified by: Dag Brattli + * Sources: Previous IrLPT work by Thomas Davis + * + * Copyright (c) 1999 Dag Brattli, All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include + +#include +#include +#include +#include /* struct irda_skb_cb */ + +#include +#include + + +/* + * Function ircomm_lmp_connect_request (self, userdata) + * + * + * + */ +static int ircomm_lmp_connect_request(struct ircomm_cb *self, + struct sk_buff *userdata, + struct ircomm_info *info) +{ + int ret = 0; + + IRDA_DEBUG(0, "%s()\n", __func__ ); + + /* Don't forget to refcount it - should be NULL anyway */ + if(userdata) + skb_get(userdata); + + ret = irlmp_connect_request(self->lsap, info->dlsap_sel, + info->saddr, info->daddr, NULL, userdata); + return ret; +} + +/* + * Function ircomm_lmp_connect_response (self, skb) + * + * + * + */ +static int ircomm_lmp_connect_response(struct ircomm_cb *self, + struct sk_buff *userdata) +{ + struct sk_buff *tx_skb; + + IRDA_DEBUG(0, "%s()\n", __func__ ); + + /* Any userdata supplied? */ + if (userdata == NULL) { + tx_skb = alloc_skb(LMP_MAX_HEADER, GFP_ATOMIC); + if (!tx_skb) + return -ENOMEM; + + /* Reserve space for MUX and LAP header */ + skb_reserve(tx_skb, LMP_MAX_HEADER); + } else { + /* + * Check that the client has reserved enough space for + * headers + */ + IRDA_ASSERT(skb_headroom(userdata) >= LMP_MAX_HEADER, + return -1;); + + /* Don't forget to refcount it - should be NULL anyway */ + skb_get(userdata); + tx_skb = userdata; + } + + return irlmp_connect_response(self->lsap, tx_skb); +} + +static int ircomm_lmp_disconnect_request(struct ircomm_cb *self, + struct sk_buff *userdata, + struct ircomm_info *info) +{ + struct sk_buff *tx_skb; + int ret; + + IRDA_DEBUG(0, "%s()\n", __func__ ); + + if (!userdata) { + tx_skb = alloc_skb(LMP_MAX_HEADER, GFP_ATOMIC); + if (!tx_skb) + return -ENOMEM; + + /* Reserve space for MUX and LAP header */ + skb_reserve(tx_skb, LMP_MAX_HEADER); + userdata = tx_skb; + } else { + /* Don't forget to refcount it - should be NULL anyway */ + skb_get(userdata); + } + + ret = irlmp_disconnect_request(self->lsap, userdata); + + return ret; +} + +/* + * Function ircomm_lmp_flow_control (skb) + * + * This function is called when a data frame we have sent to IrLAP has + * been deallocated. 
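/*
 * ircomm_lmp_connect_response() above either allocates a reply buffer
 * with LMP_MAX_HEADER reserved up front or asserts that the caller left
 * that much headroom, so lower layers can prepend their headers without
 * copying. A standalone sketch of that reserve-then-push pattern using
 * a plain buffer; struct demo_buf and the sizes are illustrative only.
 */
#include <stdio.h>
#include <string.h>

#define DEMO_HDR_ROOM 16		/* stand-in for LMP_MAX_HEADER */

struct demo_buf {
	unsigned char data[128];
	size_t head;			/* offset of payload start */
	size_t len;			/* payload length */
};

/* like skb_reserve(): keep room for headers added later */
static void demo_reserve(struct demo_buf *b, size_t room)
{
	b->head = room;
}

/* like skb_push(): prepend n bytes inside the reserved area */
static unsigned char *demo_push(struct demo_buf *b, size_t n)
{
	b->head -= n;
	b->len += n;
	return b->data + b->head;
}

int main(void)
{
	struct demo_buf b = { .head = 0, .len = 0 };

	demo_reserve(&b, DEMO_HDR_ROOM);
	memcpy(b.data + b.head, "payload", 7);	/* upper layer writes data */
	b.len = 7;

	memcpy(demo_push(&b, 4), "HDR:", 4);	/* lower layer prepends */
	printf("%.*s\n", (int)b.len, b.data + b.head);	/* HDR:payload */
	return 0;
}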
We do this to make sure we don't flood IrLAP with + * frames, since we are not using the IrTTP flow control mechanism + */ +static void ircomm_lmp_flow_control(struct sk_buff *skb) +{ + struct irda_skb_cb *cb; + struct ircomm_cb *self; + int line; + + IRDA_ASSERT(skb != NULL, return;); + + cb = (struct irda_skb_cb *) skb->cb; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + line = cb->line; + + self = (struct ircomm_cb *) hashbin_lock_find(ircomm, line, NULL); + if (!self) { + IRDA_DEBUG(2, "%s(), didn't find myself\n", __func__ ); + return; + } + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;); + + self->pkt_count--; + + if ((self->pkt_count < 2) && (self->flow_status == FLOW_STOP)) { + IRDA_DEBUG(2, "%s(), asking TTY to start again!\n", __func__ ); + self->flow_status = FLOW_START; + if (self->notify.flow_indication) + self->notify.flow_indication(self->notify.instance, + self, FLOW_START); + } +} + +/* + * Function ircomm_lmp_data_request (self, userdata) + * + * Send data frame to peer device + * + */ +static int ircomm_lmp_data_request(struct ircomm_cb *self, + struct sk_buff *skb, + int not_used) +{ + struct irda_skb_cb *cb; + int ret; + + IRDA_ASSERT(skb != NULL, return -1;); + + cb = (struct irda_skb_cb *) skb->cb; + + cb->line = self->line; + + IRDA_DEBUG(4, "%s(), sending frame\n", __func__ ); + + /* Don't forget to refcount it - see ircomm_tty_do_softint() */ + skb_get(skb); + + skb_orphan(skb); + skb->destructor = ircomm_lmp_flow_control; + + if ((self->pkt_count++ > 7) && (self->flow_status == FLOW_START)) { + IRDA_DEBUG(2, "%s(), asking TTY to slow down!\n", __func__ ); + self->flow_status = FLOW_STOP; + if (self->notify.flow_indication) + self->notify.flow_indication(self->notify.instance, + self, FLOW_STOP); + } + ret = irlmp_data_request(self->lsap, skb); + if (ret) { + IRDA_ERROR("%s(), failed\n", __func__); + /* irlmp_data_request already free the packet */ + } + + return ret; +} + +/* + * Function ircomm_lmp_data_indication (instance, sap, skb) + * + * Incoming data which we must deliver to the state machine, to check + * we are still connected. + */ +static int ircomm_lmp_data_indication(void *instance, void *sap, + struct sk_buff *skb) +{ + struct ircomm_cb *self = (struct ircomm_cb *) instance; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -1;); + IRDA_ASSERT(skb != NULL, return -1;); + + ircomm_do_event(self, IRCOMM_LMP_DATA_INDICATION, skb, NULL); + + /* Drop reference count - see ircomm_tty_data_indication(). */ + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function ircomm_lmp_connect_confirm (instance, sap, qos, max_sdu_size, + * max_header_size, skb) + * + * Connection has been confirmed by peer device + * + */ +static void ircomm_lmp_connect_confirm(void *instance, void *sap, + struct qos_info *qos, + __u32 max_seg_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct ircomm_cb *self = (struct ircomm_cb *) instance; + struct ircomm_info info; + + IRDA_DEBUG(0, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + IRDA_ASSERT(qos != NULL, return;); + + info.max_data_size = max_seg_size; + info.max_header_size = max_header_size; + info.qos = qos; + + ircomm_do_event(self, IRCOMM_LMP_CONNECT_CONFIRM, skb, &info); + + /* Drop reference count - see ircomm_tty_connect_confirm(). 
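/*
 * ircomm_lmp_data_request() above attaches ircomm_lmp_flow_control() as
 * the skb destructor and counts frames handed to IrLAP: past roughly 8
 * outstanding frames the TTY is told to stop, and once fewer than 2
 * remain the destructor restarts it. The watermark logic in isolation,
 * as a standalone userspace sketch (struct demo_link is illustrative;
 * the thresholds are the ones used above).
 */
#include <stdio.h>

enum demo_flow { DEMO_FLOW_START, DEMO_FLOW_STOP };

struct demo_link {
	int pkt_count;			/* frames queued below us */
	enum demo_flow flow_status;	/* last indication to the TTY */
};

/* called when a frame is handed down, cf. ircomm_lmp_data_request() */
static void demo_tx(struct demo_link *l)
{
	if (l->pkt_count++ > 7 && l->flow_status == DEMO_FLOW_START) {
		l->flow_status = DEMO_FLOW_STOP;
		printf("stop TTY, outstanding=%d\n", l->pkt_count);
	}
}

/* called from the frame destructor, cf. ircomm_lmp_flow_control() */
static void demo_tx_done(struct demo_link *l)
{
	l->pkt_count--;
	if (l->pkt_count < 2 && l->flow_status == DEMO_FLOW_STOP) {
		l->flow_status = DEMO_FLOW_START;
		printf("start TTY again, outstanding=%d\n", l->pkt_count);
	}
}

int main(void)
{
	struct demo_link link = { 0, DEMO_FLOW_START };
	int i;

	for (i = 0; i < 10; i++)
		demo_tx(&link);		/* stops once > 7 are outstanding */
	for (i = 0; i < 10; i++)
		demo_tx_done(&link);	/* restarts when nearly drained */
	return 0;
}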
*/ + dev_kfree_skb(skb); +} + +/* + * Function ircomm_lmp_connect_indication (instance, sap, qos, max_sdu_size, + * max_header_size, skb) + * + * Peer device wants to make a connection with us + * + */ +static void ircomm_lmp_connect_indication(void *instance, void *sap, + struct qos_info *qos, + __u32 max_seg_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct ircomm_cb *self = (struct ircomm_cb *)instance; + struct ircomm_info info; + + IRDA_DEBUG(0, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + IRDA_ASSERT(qos != NULL, return;); + + info.max_data_size = max_seg_size; + info.max_header_size = max_header_size; + info.qos = qos; + + ircomm_do_event(self, IRCOMM_LMP_CONNECT_INDICATION, skb, &info); + + /* Drop reference count - see ircomm_tty_connect_indication(). */ + dev_kfree_skb(skb); +} + +/* + * Function ircomm_lmp_disconnect_indication (instance, sap, reason, skb) + * + * Peer device has closed the connection, or the link went down for some + * other reason + */ +static void ircomm_lmp_disconnect_indication(void *instance, void *sap, + LM_REASON reason, + struct sk_buff *skb) +{ + struct ircomm_cb *self = (struct ircomm_cb *) instance; + struct ircomm_info info; + + IRDA_DEBUG(0, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;); + + info.reason = reason; + + ircomm_do_event(self, IRCOMM_LMP_DISCONNECT_INDICATION, skb, &info); + + /* Drop reference count - see ircomm_tty_disconnect_indication(). */ + if(skb) + dev_kfree_skb(skb); +} +/* + * Function ircomm_open_lsap (self) + * + * Open LSAP. This function will only be used when using "raw" services + * + */ +int ircomm_open_lsap(struct ircomm_cb *self) +{ + notify_t notify; + + IRDA_DEBUG(0, "%s()\n", __func__ ); + + /* Register callbacks */ + irda_notify_init(¬ify); + notify.data_indication = ircomm_lmp_data_indication; + notify.connect_confirm = ircomm_lmp_connect_confirm; + notify.connect_indication = ircomm_lmp_connect_indication; + notify.disconnect_indication = ircomm_lmp_disconnect_indication; + notify.instance = self; + strlcpy(notify.name, "IrCOMM", sizeof(notify.name)); + + self->lsap = irlmp_open_lsap(LSAP_ANY, ¬ify, 0); + if (!self->lsap) { + IRDA_DEBUG(0,"%sfailed to allocate tsap\n", __func__ ); + return -1; + } + self->slsap_sel = self->lsap->slsap_sel; + + /* + * Initialize the call-table for issuing commands + */ + self->issue.data_request = ircomm_lmp_data_request; + self->issue.connect_request = ircomm_lmp_connect_request; + self->issue.connect_response = ircomm_lmp_connect_response; + self->issue.disconnect_request = ircomm_lmp_disconnect_request; + + return 0; +} diff --git a/net/irda/ircomm/ircomm_param.c b/net/irda/ircomm/ircomm_param.c new file mode 100644 index 00000000..8b915f3a --- /dev/null +++ b/net/irda/ircomm/ircomm_param.c @@ -0,0 +1,511 @@ +/********************************************************************* + * + * Filename: ircomm_param.c + * Version: 1.0 + * Description: Parameter handling for the IrCOMM protocol + * Status: Experimental. + * Author: Dag Brattli + * Created at: Mon Jun 7 10:25:11 1999 + * Modified at: Sun Jan 30 14:32:03 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include + +static int ircomm_param_service_type(void *instance, irda_param_t *param, + int get); +static int ircomm_param_port_type(void *instance, irda_param_t *param, + int get); +static int ircomm_param_port_name(void *instance, irda_param_t *param, + int get); +static int ircomm_param_service_type(void *instance, irda_param_t *param, + int get); +static int ircomm_param_data_rate(void *instance, irda_param_t *param, + int get); +static int ircomm_param_data_format(void *instance, irda_param_t *param, + int get); +static int ircomm_param_flow_control(void *instance, irda_param_t *param, + int get); +static int ircomm_param_xon_xoff(void *instance, irda_param_t *param, int get); +static int ircomm_param_enq_ack(void *instance, irda_param_t *param, int get); +static int ircomm_param_line_status(void *instance, irda_param_t *param, + int get); +static int ircomm_param_dte(void *instance, irda_param_t *param, int get); +static int ircomm_param_dce(void *instance, irda_param_t *param, int get); +static int ircomm_param_poll(void *instance, irda_param_t *param, int get); + +static pi_minor_info_t pi_minor_call_table_common[] = { + { ircomm_param_service_type, PV_INT_8_BITS }, + { ircomm_param_port_type, PV_INT_8_BITS }, + { ircomm_param_port_name, PV_STRING } +}; +static pi_minor_info_t pi_minor_call_table_non_raw[] = { + { ircomm_param_data_rate, PV_INT_32_BITS | PV_BIG_ENDIAN }, + { ircomm_param_data_format, PV_INT_8_BITS }, + { ircomm_param_flow_control, PV_INT_8_BITS }, + { ircomm_param_xon_xoff, PV_INT_16_BITS }, + { ircomm_param_enq_ack, PV_INT_16_BITS }, + { ircomm_param_line_status, PV_INT_8_BITS } +}; +static pi_minor_info_t pi_minor_call_table_9_wire[] = { + { ircomm_param_dte, PV_INT_8_BITS }, + { ircomm_param_dce, PV_INT_8_BITS }, + { ircomm_param_poll, PV_NO_VALUE }, +}; + +static pi_major_info_t pi_major_call_table[] = { + { pi_minor_call_table_common, 3 }, + { pi_minor_call_table_non_raw, 6 }, + { pi_minor_call_table_9_wire, 3 } +/* { pi_minor_call_table_centronics } */ +}; + +pi_param_info_t ircomm_param_info = { pi_major_call_table, 3, 0x0f, 4 }; + +/* + * Function ircomm_param_request (self, pi, flush) + * + * Queue a parameter for the control channel + * + */ +int ircomm_param_request(struct ircomm_tty_cb *self, __u8 pi, int flush) +{ + struct tty_struct *tty; + unsigned long flags; + struct sk_buff *skb; + int count; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + tty = self->tty; + if (!tty) + return 0; + + /* Make sure we don't send parameters for raw mode */ + if (self->service_type == 
IRCOMM_3_WIRE_RAW) + return 0; + + spin_lock_irqsave(&self->spinlock, flags); + + skb = self->ctrl_skb; + if (!skb) { + skb = alloc_skb(256, GFP_ATOMIC); + if (!skb) { + spin_unlock_irqrestore(&self->spinlock, flags); + return -ENOMEM; + } + + skb_reserve(skb, self->max_header_size); + self->ctrl_skb = skb; + } + /* + * Inserting is a little bit tricky since we don't know how much + * room we will need. But this should hopefully work OK + */ + count = irda_param_insert(self, pi, skb_tail_pointer(skb), + skb_tailroom(skb), &ircomm_param_info); + if (count < 0) { + IRDA_WARNING("%s(), no room for parameter!\n", __func__); + spin_unlock_irqrestore(&self->spinlock, flags); + return -1; + } + skb_put(skb, count); + + spin_unlock_irqrestore(&self->spinlock, flags); + + IRDA_DEBUG(2, "%s(), skb->len=%d\n", __func__ , skb->len); + + if (flush) { + /* ircomm_tty_do_softint will take care of the rest */ + schedule_work(&self->tqueue); + } + + return count; +} + +/* + * Function ircomm_param_service_type (self, buf, len) + * + * Handle service type, this function will both be called after the LM-IAS + * query and then the remote device sends its initial parameters + * + */ +static int ircomm_param_service_type(void *instance, irda_param_t *param, + int get) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + __u8 service_type = (__u8) param->pv.i; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + if (get) { + param->pv.i = self->settings.service_type; + return 0; + } + + /* Find all common service types */ + service_type &= self->service_type; + if (!service_type) { + IRDA_DEBUG(2, + "%s(), No common service type to use!\n", __func__ ); + return -1; + } + IRDA_DEBUG(0, "%s(), services in common=%02x\n", __func__ , + service_type); + + /* + * Now choose a preferred service type of those available + */ + if (service_type & IRCOMM_CENTRONICS) + self->settings.service_type = IRCOMM_CENTRONICS; + else if (service_type & IRCOMM_9_WIRE) + self->settings.service_type = IRCOMM_9_WIRE; + else if (service_type & IRCOMM_3_WIRE) + self->settings.service_type = IRCOMM_3_WIRE; + else if (service_type & IRCOMM_3_WIRE_RAW) + self->settings.service_type = IRCOMM_3_WIRE_RAW; + + IRDA_DEBUG(0, "%s(), resulting service type=0x%02x\n", __func__ , + self->settings.service_type); + + /* + * Now the line is ready for some communication. Check if we are a + * server, and send over some initial parameters. + * Client do it in ircomm_tty_state_setup(). + * Note : we may get called from ircomm_tty_getvalue_confirm(), + * therefore before we even have open any socket. And self->client + * is initialised to TRUE only later. So, we check if the link is + * really initialised. - Jean II + */ + if ((self->max_header_size != IRCOMM_TTY_HDR_UNINITIALISED) && + (!self->client) && + (self->settings.service_type != IRCOMM_3_WIRE_RAW)) + { + /* Init connection */ + ircomm_tty_send_initial_parameters(self); + ircomm_tty_link_established(self); + } + + return 0; +} + +/* + * Function ircomm_param_port_type (self, param) + * + * The port type parameter tells if the devices are serial or parallel. + * Since we only advertise serial service, this parameter should only + * be equal to IRCOMM_SERIAL. 
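/*
 * ircomm_param_service_type() above intersects the peer's advertised
 * service types with our own and then keeps the most capable common
 * mode (Centronics, then 9-wire, then 3-wire, then 3-wire raw). The
 * selection by itself, as a standalone sketch; the bit values below
 * are illustrative, not the IrCOMM on-the-wire encoding.
 */
#include <stdio.h>

#define DEMO_3_WIRE_RAW	0x01
#define DEMO_3_WIRE	0x02
#define DEMO_9_WIRE	0x04
#define DEMO_CENTRONICS	0x08

/* returns 0 when there is no service type in common */
static unsigned char demo_pick_service(unsigned char ours,
				       unsigned char theirs)
{
	unsigned char common = ours & theirs;

	if (common & DEMO_CENTRONICS)
		return DEMO_CENTRONICS;
	if (common & DEMO_9_WIRE)
		return DEMO_9_WIRE;
	if (common & DEMO_3_WIRE)
		return DEMO_3_WIRE;
	if (common & DEMO_3_WIRE_RAW)
		return DEMO_3_WIRE_RAW;
	return 0;
}

int main(void)
{
	/* we offer 3-wire and 9-wire, the peer only 3-wire */
	printf("chosen=0x%02x\n",
	       demo_pick_service(DEMO_3_WIRE | DEMO_9_WIRE, DEMO_3_WIRE));
	return 0;
}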
+ */ +static int ircomm_param_port_type(void *instance, irda_param_t *param, int get) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + if (get) + param->pv.i = IRCOMM_SERIAL; + else { + self->settings.port_type = (__u8) param->pv.i; + + IRDA_DEBUG(0, "%s(), port type=%d\n", __func__ , + self->settings.port_type); + } + return 0; +} + +/* + * Function ircomm_param_port_name (self, param) + * + * Exchange port name + * + */ +static int ircomm_param_port_name(void *instance, irda_param_t *param, int get) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + if (get) { + IRDA_DEBUG(0, "%s(), not imp!\n", __func__ ); + } else { + IRDA_DEBUG(0, "%s(), port-name=%s\n", __func__ , param->pv.c); + strncpy(self->settings.port_name, param->pv.c, 32); + } + + return 0; +} + +/* + * Function ircomm_param_data_rate (self, param) + * + * Exchange data rate to be used in this settings + * + */ +static int ircomm_param_data_rate(void *instance, irda_param_t *param, int get) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + if (get) + param->pv.i = self->settings.data_rate; + else + self->settings.data_rate = param->pv.i; + + IRDA_DEBUG(2, "%s(), data rate = %d\n", __func__ , param->pv.i); + + return 0; +} + +/* + * Function ircomm_param_data_format (self, param) + * + * Exchange data format to be used in this settings + * + */ +static int ircomm_param_data_format(void *instance, irda_param_t *param, + int get) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + if (get) + param->pv.i = self->settings.data_format; + else + self->settings.data_format = (__u8) param->pv.i; + + return 0; +} + +/* + * Function ircomm_param_flow_control (self, param) + * + * Exchange flow control settings to be used in this settings + * + */ +static int ircomm_param_flow_control(void *instance, irda_param_t *param, + int get) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + if (get) + param->pv.i = self->settings.flow_control; + else + self->settings.flow_control = (__u8) param->pv.i; + + IRDA_DEBUG(1, "%s(), flow control = 0x%02x\n", __func__ , (__u8) param->pv.i); + + return 0; +} + +/* + * Function ircomm_param_xon_xoff (self, param) + * + * Exchange XON/XOFF characters + * + */ +static int ircomm_param_xon_xoff(void *instance, irda_param_t *param, int get) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + if (get) { + param->pv.i = self->settings.xonxoff[0]; + param->pv.i |= self->settings.xonxoff[1] << 8; + } else { + self->settings.xonxoff[0] = (__u16) param->pv.i & 0xff; + self->settings.xonxoff[1] = (__u16) param->pv.i >> 8; + } + + IRDA_DEBUG(0, "%s(), XON/XOFF = 0x%02x,0x%02x\n", __func__ , + param->pv.i & 0xff, param->pv.i >> 8); + + return 0; +} + +/* + * Function ircomm_param_enq_ack (self, param) + * + * Exchange ENQ/ACK characters + * + */ +static int ircomm_param_enq_ack(void 
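/*
 * ircomm_param_xon_xoff() above (and ircomm_param_enq_ack(), which
 * continues below) carries a pair of characters in one 16-bit parameter
 * value: the first character in the low byte, the second in the high
 * byte. The packing on its own, as a standalone sketch; demo_pack_pair()
 * and friends are illustrative names.
 */
#include <stdio.h>

static unsigned int demo_pack_pair(unsigned char first, unsigned char second)
{
	return (unsigned int)first | ((unsigned int)second << 8);
}

static void demo_unpack_pair(unsigned int pv, unsigned char pair[2])
{
	pair[0] = pv & 0xff;		/* e.g. XON  */
	pair[1] = (pv >> 8) & 0xff;	/* e.g. XOFF */
}

int main(void)
{
	unsigned char pair[2];
	unsigned int pv = demo_pack_pair(0x11, 0x13);	/* DC1/DC3 */

	demo_unpack_pair(pv, pair);
	printf("pv=0x%04x first=0x%02x second=0x%02x\n", pv, pair[0], pair[1]);
	return 0;
}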
*instance, irda_param_t *param, int get) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + if (get) { + param->pv.i = self->settings.enqack[0]; + param->pv.i |= self->settings.enqack[1] << 8; + } else { + self->settings.enqack[0] = (__u16) param->pv.i & 0xff; + self->settings.enqack[1] = (__u16) param->pv.i >> 8; + } + + IRDA_DEBUG(0, "%s(), ENQ/ACK = 0x%02x,0x%02x\n", __func__ , + param->pv.i & 0xff, param->pv.i >> 8); + + return 0; +} + +/* + * Function ircomm_param_line_status (self, param) + * + * + * + */ +static int ircomm_param_line_status(void *instance, irda_param_t *param, + int get) +{ + IRDA_DEBUG(2, "%s(), not impl.\n", __func__ ); + + return 0; +} + +/* + * Function ircomm_param_dte (instance, param) + * + * If we get here, there must be some sort of null-modem connection, and + * we are probably working in server mode as well. + */ +static int ircomm_param_dte(void *instance, irda_param_t *param, int get) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + __u8 dte; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + if (get) + param->pv.i = self->settings.dte; + else { + dte = (__u8) param->pv.i; + + self->settings.dce = 0; + + if (dte & IRCOMM_DELTA_DTR) + self->settings.dce |= (IRCOMM_DELTA_DSR| + IRCOMM_DELTA_RI | + IRCOMM_DELTA_CD); + if (dte & IRCOMM_DTR) + self->settings.dce |= (IRCOMM_DSR| + IRCOMM_RI | + IRCOMM_CD); + + if (dte & IRCOMM_DELTA_RTS) + self->settings.dce |= IRCOMM_DELTA_CTS; + if (dte & IRCOMM_RTS) + self->settings.dce |= IRCOMM_CTS; + + /* Take appropriate actions */ + ircomm_tty_check_modem_status(self); + + /* Null modem cable emulator */ + self->settings.null_modem = TRUE; + } + + return 0; +} + +/* + * Function ircomm_param_dce (instance, param) + * + * + * + */ +static int ircomm_param_dce(void *instance, irda_param_t *param, int get) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + __u8 dce; + + IRDA_DEBUG(1, "%s(), dce = 0x%02x\n", __func__ , (__u8) param->pv.i); + + dce = (__u8) param->pv.i; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + self->settings.dce = dce; + + /* Check if any of the settings have changed */ + if (dce & 0x0f) { + if (dce & IRCOMM_DELTA_CTS) { + IRDA_DEBUG(2, "%s(), CTS\n", __func__ ); + } + } + + ircomm_tty_check_modem_status(self); + + return 0; +} + +/* + * Function ircomm_param_poll (instance, param) + * + * Called when the peer device is polling for the line settings + * + */ +static int ircomm_param_poll(void *instance, irda_param_t *param, int get) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + /* Poll parameters are always of length 0 (just a signal) */ + if (!get) { + /* Respond with DTE line settings */ + ircomm_param_request(self, IRCOMM_DTE, TRUE); + } + return 0; +} + + + + + diff --git a/net/irda/ircomm/ircomm_ttp.c b/net/irda/ircomm/ircomm_ttp.c new file mode 100644 index 00000000..6e6509f2 --- /dev/null +++ b/net/irda/ircomm/ircomm_ttp.c @@ -0,0 +1,368 @@ +/********************************************************************* + * + * Filename: ircomm_ttp.c + * Version: 1.0 + * Description: Interface between IrCOMM and IrTTP + * Status: Stable + * Author: Dag Brattli + * Created at: Sun Jun 6 20:48:27 
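/*
 * ircomm_param_dte() above emulates a null-modem cable: the peer's DTR
 * is reflected locally as DSR/RI/CD and its RTS as CTS, with the delta
 * bits mirrored the same way. The translation as a standalone sketch;
 * the DEMO_* bit values are illustrative, not the IrCOMM encoding.
 */
#include <stdio.h>

#define DEMO_DELTA_DTR	0x01
#define DEMO_DELTA_RTS	0x02
#define DEMO_DTR	0x04
#define DEMO_RTS	0x08

#define DEMO_DELTA_CTS	0x01
#define DEMO_DELTA_DSR	0x02
#define DEMO_DELTA_RI	0x04
#define DEMO_DELTA_CD	0x08
#define DEMO_CTS	0x10
#define DEMO_DSR	0x20
#define DEMO_RI		0x40
#define DEMO_CD		0x80

static unsigned char demo_dte_to_dce(unsigned char dte)
{
	unsigned char dce = 0;

	if (dte & DEMO_DELTA_DTR)
		dce |= DEMO_DELTA_DSR | DEMO_DELTA_RI | DEMO_DELTA_CD;
	if (dte & DEMO_DTR)
		dce |= DEMO_DSR | DEMO_RI | DEMO_CD;
	if (dte & DEMO_DELTA_RTS)
		dce |= DEMO_DELTA_CTS;
	if (dte & DEMO_RTS)
		dce |= DEMO_CTS;
	return dce;
}

int main(void)
{
	/* peer asserts DTR and RTS: we should see DSR/RI/CD and CTS */
	printf("dce=0x%02x\n", demo_dte_to_dce(DEMO_DTR | DEMO_RTS));
	return 0;
}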
1999 + * Modified at: Mon Dec 13 11:35:13 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1999 Dag Brattli, All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include + +#include +#include +#include +#include + +#include +#include + +static int ircomm_ttp_data_indication(void *instance, void *sap, + struct sk_buff *skb); +static void ircomm_ttp_connect_confirm(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb); +static void ircomm_ttp_connect_indication(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb); +static void ircomm_ttp_flow_indication(void *instance, void *sap, + LOCAL_FLOW cmd); +static void ircomm_ttp_disconnect_indication(void *instance, void *sap, + LM_REASON reason, + struct sk_buff *skb); +static int ircomm_ttp_data_request(struct ircomm_cb *self, + struct sk_buff *skb, + int clen); +static int ircomm_ttp_connect_request(struct ircomm_cb *self, + struct sk_buff *userdata, + struct ircomm_info *info); +static int ircomm_ttp_connect_response(struct ircomm_cb *self, + struct sk_buff *userdata); +static int ircomm_ttp_disconnect_request(struct ircomm_cb *self, + struct sk_buff *userdata, + struct ircomm_info *info); + +/* + * Function ircomm_open_tsap (self) + * + * + * + */ +int ircomm_open_tsap(struct ircomm_cb *self) +{ + notify_t notify; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + /* Register callbacks */ + irda_notify_init(¬ify); + notify.data_indication = ircomm_ttp_data_indication; + notify.connect_confirm = ircomm_ttp_connect_confirm; + notify.connect_indication = ircomm_ttp_connect_indication; + notify.flow_indication = ircomm_ttp_flow_indication; + notify.disconnect_indication = ircomm_ttp_disconnect_indication; + notify.instance = self; + strlcpy(notify.name, "IrCOMM", sizeof(notify.name)); + + self->tsap = irttp_open_tsap(LSAP_ANY, DEFAULT_INITIAL_CREDIT, + ¬ify); + if (!self->tsap) { + IRDA_DEBUG(0, "%sfailed to allocate tsap\n", __func__ ); + return -1; + } + self->slsap_sel = self->tsap->stsap_sel; + + /* + * Initialize the call-table for issuing commands + */ + self->issue.data_request = ircomm_ttp_data_request; + self->issue.connect_request = ircomm_ttp_connect_request; + self->issue.connect_response = ircomm_ttp_connect_response; + self->issue.disconnect_request = ircomm_ttp_disconnect_request; + + return 0; +} + +/* + * Function ircomm_ttp_connect_request (self, userdata) + * + * + * + */ +static int ircomm_ttp_connect_request(struct ircomm_cb *self, + struct sk_buff *userdata, + struct ircomm_info *info) +{ + int ret = 0; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + /* Don't forget to refcount it - should be 
NULL anyway */ + if(userdata) + skb_get(userdata); + + ret = irttp_connect_request(self->tsap, info->dlsap_sel, + info->saddr, info->daddr, NULL, + TTP_SAR_DISABLE, userdata); + + return ret; +} + +/* + * Function ircomm_ttp_connect_response (self, skb) + * + * + * + */ +static int ircomm_ttp_connect_response(struct ircomm_cb *self, + struct sk_buff *userdata) +{ + int ret; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + /* Don't forget to refcount it - should be NULL anyway */ + if(userdata) + skb_get(userdata); + + ret = irttp_connect_response(self->tsap, TTP_SAR_DISABLE, userdata); + + return ret; +} + +/* + * Function ircomm_ttp_data_request (self, userdata) + * + * Send IrCOMM data to IrTTP layer. Currently we do not try to combine + * control data with pure data, so they will be sent as separate frames. + * Should not be a big problem though, since control frames are rare. But + * some of them are sent after connection establishment, so this can + * increase the latency a bit. + */ +static int ircomm_ttp_data_request(struct ircomm_cb *self, + struct sk_buff *skb, + int clen) +{ + int ret; + + IRDA_ASSERT(skb != NULL, return -1;); + + IRDA_DEBUG(2, "%s(), clen=%d\n", __func__ , clen); + + /* + * Insert clen field, currently we either send data only, or control + * only frames, to make things easier and avoid queueing + */ + IRDA_ASSERT(skb_headroom(skb) >= IRCOMM_HEADER_SIZE, return -1;); + + /* Don't forget to refcount it - see ircomm_tty_do_softint() */ + skb_get(skb); + + skb_push(skb, IRCOMM_HEADER_SIZE); + + skb->data[0] = clen; + + ret = irttp_data_request(self->tsap, skb); + if (ret) { + IRDA_ERROR("%s(), failed\n", __func__); + /* irttp_data_request already free the packet */ + } + + return ret; +} + +/* + * Function ircomm_ttp_data_indication (instance, sap, skb) + * + * Incoming data + * + */ +static int ircomm_ttp_data_indication(void *instance, void *sap, + struct sk_buff *skb) +{ + struct ircomm_cb *self = (struct ircomm_cb *) instance; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -1;); + IRDA_ASSERT(skb != NULL, return -1;); + + ircomm_do_event(self, IRCOMM_TTP_DATA_INDICATION, skb, NULL); + + /* Drop reference count - see ircomm_tty_data_indication(). */ + dev_kfree_skb(skb); + + return 0; +} + +static void ircomm_ttp_connect_confirm(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct ircomm_cb *self = (struct ircomm_cb *) instance; + struct ircomm_info info; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + IRDA_ASSERT(qos != NULL, goto out;); + + if (max_sdu_size != TTP_SAR_DISABLE) { + IRDA_ERROR("%s(), SAR not allowed for IrCOMM!\n", + __func__); + goto out; + } + + info.max_data_size = irttp_get_max_seg_size(self->tsap) + - IRCOMM_HEADER_SIZE; + info.max_header_size = max_header_size + IRCOMM_HEADER_SIZE; + info.qos = qos; + + ircomm_do_event(self, IRCOMM_TTP_CONNECT_CONFIRM, skb, &info); + +out: + /* Drop reference count - see ircomm_tty_connect_confirm(). 
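/*
 * ircomm_ttp_data_request() above pushes a single clen byte
 * (IRCOMM_HEADER_SIZE) in front of every frame, which is why the
 * connect handlers report max_data_size as the TTP segment size minus
 * that byte and max_header_size as the lower layer's header plus it.
 * The arithmetic in isolation, as a standalone sketch; the numbers in
 * main() are examples only.
 */
#include <stdio.h>

#define DEMO_IRCOMM_HDR 1	/* the clen byte */

struct demo_link_info {
	unsigned int max_data_size;	/* payload the TTY may hand us */
	unsigned int max_header_size;	/* headroom the TTY must keep  */
};

static struct demo_link_info demo_sizes(unsigned int max_seg_size,
					unsigned int lower_header_size)
{
	struct demo_link_info info = {
		.max_data_size = max_seg_size - DEMO_IRCOMM_HDR,
		.max_header_size = lower_header_size + DEMO_IRCOMM_HDR,
	};
	return info;
}

int main(void)
{
	struct demo_link_info info = demo_sizes(2048, 18);

	printf("max_data=%u max_header=%u\n",
	       info.max_data_size, info.max_header_size);
	return 0;
}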
*/ + dev_kfree_skb(skb); +} + +/* + * Function ircomm_ttp_connect_indication (instance, sap, qos, max_sdu_size, + * max_header_size, skb) + * + * + * + */ +static void ircomm_ttp_connect_indication(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct ircomm_cb *self = (struct ircomm_cb *)instance; + struct ircomm_info info; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + IRDA_ASSERT(qos != NULL, goto out;); + + if (max_sdu_size != TTP_SAR_DISABLE) { + IRDA_ERROR("%s(), SAR not allowed for IrCOMM!\n", + __func__); + goto out; + } + + info.max_data_size = irttp_get_max_seg_size(self->tsap) + - IRCOMM_HEADER_SIZE; + info.max_header_size = max_header_size + IRCOMM_HEADER_SIZE; + info.qos = qos; + + ircomm_do_event(self, IRCOMM_TTP_CONNECT_INDICATION, skb, &info); + +out: + /* Drop reference count - see ircomm_tty_connect_indication(). */ + dev_kfree_skb(skb); +} + +/* + * Function ircomm_ttp_disconnect_request (self, userdata, info) + * + * + * + */ +static int ircomm_ttp_disconnect_request(struct ircomm_cb *self, + struct sk_buff *userdata, + struct ircomm_info *info) +{ + int ret; + + /* Don't forget to refcount it - should be NULL anyway */ + if(userdata) + skb_get(userdata); + + ret = irttp_disconnect_request(self->tsap, userdata, P_NORMAL); + + return ret; +} + +/* + * Function ircomm_ttp_disconnect_indication (instance, sap, reason, skb) + * + * + * + */ +static void ircomm_ttp_disconnect_indication(void *instance, void *sap, + LM_REASON reason, + struct sk_buff *skb) +{ + struct ircomm_cb *self = (struct ircomm_cb *) instance; + struct ircomm_info info; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;); + + info.reason = reason; + + ircomm_do_event(self, IRCOMM_TTP_DISCONNECT_INDICATION, skb, &info); + + /* Drop reference count - see ircomm_tty_disconnect_indication(). */ + if(skb) + dev_kfree_skb(skb); +} + +/* + * Function ircomm_ttp_flow_indication (instance, sap, cmd) + * + * Layer below is telling us to start or stop the flow of data + * + */ +static void ircomm_ttp_flow_indication(void *instance, void *sap, + LOCAL_FLOW cmd) +{ + struct ircomm_cb *self = (struct ircomm_cb *) instance; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;); + + if (self->notify.flow_indication) + self->notify.flow_indication(self->notify.instance, self, cmd); +} + + diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c new file mode 100644 index 00000000..b3cc8b39 --- /dev/null +++ b/net/irda/ircomm/ircomm_tty.c @@ -0,0 +1,1411 @@ +/********************************************************************* + * + * Filename: ircomm_tty.c + * Version: 1.0 + * Description: IrCOMM serial TTY driver + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sun Jun 6 21:00:56 1999 + * Modified at: Wed Feb 23 00:09:02 2000 + * Modified by: Dag Brattli + * Sources: serial.c and previous IrCOMM work by Takahide Higuchi + * + * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved. 
+ * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for MODULE_ALIAS_CHARDEV_MAJOR */ + +#include + +#include +#include + +#include +#include +#include +#include + +static int ircomm_tty_open(struct tty_struct *tty, struct file *filp); +static void ircomm_tty_close(struct tty_struct * tty, struct file *filp); +static int ircomm_tty_write(struct tty_struct * tty, + const unsigned char *buf, int count); +static int ircomm_tty_write_room(struct tty_struct *tty); +static void ircomm_tty_throttle(struct tty_struct *tty); +static void ircomm_tty_unthrottle(struct tty_struct *tty); +static int ircomm_tty_chars_in_buffer(struct tty_struct *tty); +static void ircomm_tty_flush_buffer(struct tty_struct *tty); +static void ircomm_tty_send_xchar(struct tty_struct *tty, char ch); +static void ircomm_tty_wait_until_sent(struct tty_struct *tty, int timeout); +static void ircomm_tty_hangup(struct tty_struct *tty); +static void ircomm_tty_do_softint(struct work_struct *work); +static void ircomm_tty_shutdown(struct ircomm_tty_cb *self); +static void ircomm_tty_stop(struct tty_struct *tty); + +static int ircomm_tty_data_indication(void *instance, void *sap, + struct sk_buff *skb); +static int ircomm_tty_control_indication(void *instance, void *sap, + struct sk_buff *skb); +static void ircomm_tty_flow_indication(void *instance, void *sap, + LOCAL_FLOW cmd); +#ifdef CONFIG_PROC_FS +static const struct file_operations ircomm_tty_proc_fops; +#endif /* CONFIG_PROC_FS */ +static struct tty_driver *driver; + +static hashbin_t *ircomm_tty = NULL; + +static const struct tty_operations ops = { + .open = ircomm_tty_open, + .close = ircomm_tty_close, + .write = ircomm_tty_write, + .write_room = ircomm_tty_write_room, + .chars_in_buffer = ircomm_tty_chars_in_buffer, + .flush_buffer = ircomm_tty_flush_buffer, + .ioctl = ircomm_tty_ioctl, /* ircomm_tty_ioctl.c */ + .tiocmget = ircomm_tty_tiocmget, /* ircomm_tty_ioctl.c */ + .tiocmset = ircomm_tty_tiocmset, /* ircomm_tty_ioctl.c */ + .throttle = ircomm_tty_throttle, + .unthrottle = ircomm_tty_unthrottle, + .send_xchar = ircomm_tty_send_xchar, + .set_termios = ircomm_tty_set_termios, + .stop = ircomm_tty_stop, + .start = ircomm_tty_start, + .hangup = ircomm_tty_hangup, + .wait_until_sent = ircomm_tty_wait_until_sent, +#ifdef CONFIG_PROC_FS + .proc_fops = &ircomm_tty_proc_fops, +#endif /* CONFIG_PROC_FS */ +}; + +/* + * Function ircomm_tty_init() + * + * Init IrCOMM TTY layer/driver + * + */ +static int __init ircomm_tty_init(void) +{ + driver = alloc_tty_driver(IRCOMM_TTY_PORTS); + if (!driver) + return -ENOMEM; + ircomm_tty = hashbin_new(HB_LOCK); + if (ircomm_tty == NULL) { + 
IRDA_ERROR("%s(), can't allocate hashbin!\n", __func__); + put_tty_driver(driver); + return -ENOMEM; + } + + driver->owner = THIS_MODULE; + driver->driver_name = "ircomm"; + driver->name = "ircomm"; + driver->major = IRCOMM_TTY_MAJOR; + driver->minor_start = IRCOMM_TTY_MINOR; + driver->type = TTY_DRIVER_TYPE_SERIAL; + driver->subtype = SERIAL_TYPE_NORMAL; + driver->init_termios = tty_std_termios; + driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL; + driver->flags = TTY_DRIVER_REAL_RAW; + tty_set_operations(driver, &ops); + if (tty_register_driver(driver)) { + IRDA_ERROR("%s(): Couldn't register serial driver\n", + __func__); + put_tty_driver(driver); + return -1; + } + return 0; +} + +static void __exit __ircomm_tty_cleanup(struct ircomm_tty_cb *self) +{ + IRDA_DEBUG(0, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + ircomm_tty_shutdown(self); + + self->magic = 0; + kfree(self); +} + +/* + * Function ircomm_tty_cleanup () + * + * Remove IrCOMM TTY layer/driver + * + */ +static void __exit ircomm_tty_cleanup(void) +{ + int ret; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + ret = tty_unregister_driver(driver); + if (ret) { + IRDA_ERROR("%s(), failed to unregister driver\n", + __func__); + return; + } + + hashbin_delete(ircomm_tty, (FREE_FUNC) __ircomm_tty_cleanup); + put_tty_driver(driver); +} + +/* + * Function ircomm_startup (self) + * + * + * + */ +static int ircomm_tty_startup(struct ircomm_tty_cb *self) +{ + notify_t notify; + int ret = -ENODEV; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + /* Check if already open */ + if (test_and_set_bit(ASYNC_B_INITIALIZED, &self->flags)) { + IRDA_DEBUG(2, "%s(), already open so break out!\n", __func__ ); + return 0; + } + + /* Register with IrCOMM */ + irda_notify_init(¬ify); + /* These callbacks we must handle ourselves */ + notify.data_indication = ircomm_tty_data_indication; + notify.udata_indication = ircomm_tty_control_indication; + notify.flow_indication = ircomm_tty_flow_indication; + + /* Use the ircomm_tty interface for these ones */ + notify.disconnect_indication = ircomm_tty_disconnect_indication; + notify.connect_confirm = ircomm_tty_connect_confirm; + notify.connect_indication = ircomm_tty_connect_indication; + strlcpy(notify.name, "ircomm_tty", sizeof(notify.name)); + notify.instance = self; + + if (!self->ircomm) { + self->ircomm = ircomm_open(¬ify, self->service_type, + self->line); + } + if (!self->ircomm) + goto err; + + self->slsap_sel = self->ircomm->slsap_sel; + + /* Connect IrCOMM link with remote device */ + ret = ircomm_tty_attach_cable(self); + if (ret < 0) { + IRDA_ERROR("%s(), error attaching cable!\n", __func__); + goto err; + } + + return 0; +err: + clear_bit(ASYNC_B_INITIALIZED, &self->flags); + return ret; +} + +/* + * Function ircomm_block_til_ready (self, filp) + * + * + * + */ +static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self, + struct file *filp) +{ + DECLARE_WAITQUEUE(wait, current); + int retval; + int do_clocal = 0, extra_count = 0; + unsigned long flags; + struct tty_struct *tty; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + tty = self->tty; + + /* + * If non-blocking mode is set, or the port is not enabled, + * then make the check up front and then exit. 
+ */ + if (filp->f_flags & O_NONBLOCK || tty->flags & (1 << TTY_IO_ERROR)){ + /* nonblock mode is set or port is not enabled */ + self->flags |= ASYNC_NORMAL_ACTIVE; + IRDA_DEBUG(1, "%s(), O_NONBLOCK requested!\n", __func__ ); + return 0; + } + + if (tty->termios->c_cflag & CLOCAL) { + IRDA_DEBUG(1, "%s(), doing CLOCAL!\n", __func__ ); + do_clocal = 1; + } + + /* Wait for carrier detect and the line to become + * free (i.e., not in use by the callout). While we are in + * this loop, self->open_count is dropped by one, so that + * mgsl_close() knows when to free things. We restore it upon + * exit, either normal or abnormal. + */ + + retval = 0; + add_wait_queue(&self->open_wait, &wait); + + IRDA_DEBUG(2, "%s(%d):block_til_ready before block on %s open_count=%d\n", + __FILE__,__LINE__, tty->driver->name, self->open_count ); + + /* As far as I can see, we protect open_count - Jean II */ + spin_lock_irqsave(&self->spinlock, flags); + if (!tty_hung_up_p(filp)) { + extra_count = 1; + self->open_count--; + } + spin_unlock_irqrestore(&self->spinlock, flags); + self->blocked_open++; + + while (1) { + if (tty->termios->c_cflag & CBAUD) { + /* Here, we use to lock those two guys, but + * as ircomm_param_request() does it itself, + * I don't see the point (and I see the deadlock). + * Jean II */ + self->settings.dte |= IRCOMM_RTS + IRCOMM_DTR; + + ircomm_param_request(self, IRCOMM_DTE, TRUE); + } + + current->state = TASK_INTERRUPTIBLE; + + if (tty_hung_up_p(filp) || + !test_bit(ASYNC_B_INITIALIZED, &self->flags)) { + retval = (self->flags & ASYNC_HUP_NOTIFY) ? + -EAGAIN : -ERESTARTSYS; + break; + } + + /* + * Check if link is ready now. Even if CLOCAL is + * specified, we cannot return before the IrCOMM link is + * ready + */ + if (!test_bit(ASYNC_B_CLOSING, &self->flags) && + (do_clocal || (self->settings.dce & IRCOMM_CD)) && + self->state == IRCOMM_TTY_READY) + { + break; + } + + if (signal_pending(current)) { + retval = -ERESTARTSYS; + break; + } + + IRDA_DEBUG(1, "%s(%d):block_til_ready blocking on %s open_count=%d\n", + __FILE__,__LINE__, tty->driver->name, self->open_count ); + + schedule(); + } + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&self->open_wait, &wait); + + if (extra_count) { + /* ++ is not atomic, so this should be protected - Jean II */ + spin_lock_irqsave(&self->spinlock, flags); + self->open_count++; + spin_unlock_irqrestore(&self->spinlock, flags); + } + self->blocked_open--; + + IRDA_DEBUG(1, "%s(%d):block_til_ready after blocking on %s open_count=%d\n", + __FILE__,__LINE__, tty->driver->name, self->open_count); + + if (!retval) + self->flags |= ASYNC_NORMAL_ACTIVE; + + return retval; +} + +/* + * Function ircomm_tty_open (tty, filp) + * + * This routine is called when a particular tty device is opened. This + * routine is mandatory; if this routine is not filled in, the attempted + * open will fail with ENODEV. 
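/*
 * ircomm_tty_block_til_ready(), shown above, returns at once for
 * O_NONBLOCK or error-flagged ports and otherwise sleeps until the port
 * is not closing, the IrCOMM link has reached the READY state, and
 * either CLOCAL is set or carrier (CD) is present. The wake-up
 * condition written as a pure predicate, as a standalone sketch with
 * illustrative names.
 */
#include <stdbool.h>
#include <stdio.h>

struct demo_port {
	bool closing;		/* ASYNC_B_CLOSING set       */
	bool clocal;		/* termios CLOCAL            */
	bool carrier;		/* peer asserted CD          */
	bool link_ready;	/* state == IRCOMM_TTY_READY */
};

/* true when the blocking open may stop waiting; note that even with
 * CLOCAL the IrCOMM link itself must be up first */
static bool demo_open_may_proceed(const struct demo_port *p)
{
	return !p->closing && (p->clocal || p->carrier) && p->link_ready;
}

int main(void)
{
	struct demo_port p = { .closing = false, .clocal = true,
			       .carrier = false, .link_ready = false };

	printf("before link up: %d\n", demo_open_may_proceed(&p));	/* 0 */
	p.link_ready = true;
	printf("after link up : %d\n", demo_open_may_proceed(&p));	/* 1 */
	return 0;
}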
+ */ +static int ircomm_tty_open(struct tty_struct *tty, struct file *filp) +{ + struct ircomm_tty_cb *self; + unsigned int line; + unsigned long flags; + int ret; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + line = tty->index; + if (line >= IRCOMM_TTY_PORTS) + return -ENODEV; + + /* Check if instance already exists */ + self = hashbin_lock_find(ircomm_tty, line, NULL); + if (!self) { + /* No, so make new instance */ + self = kzalloc(sizeof(struct ircomm_tty_cb), GFP_KERNEL); + if (self == NULL) { + IRDA_ERROR("%s(), kmalloc failed!\n", __func__); + return -ENOMEM; + } + + self->magic = IRCOMM_TTY_MAGIC; + self->flow = FLOW_STOP; + + self->line = line; + INIT_WORK(&self->tqueue, ircomm_tty_do_softint); + self->max_header_size = IRCOMM_TTY_HDR_UNINITIALISED; + self->max_data_size = IRCOMM_TTY_DATA_UNINITIALISED; + self->close_delay = 5*HZ/10; + self->closing_wait = 30*HZ; + + /* Init some important stuff */ + init_timer(&self->watchdog_timer); + init_waitqueue_head(&self->open_wait); + init_waitqueue_head(&self->close_wait); + spin_lock_init(&self->spinlock); + + /* + * Force TTY into raw mode by default which is usually what + * we want for IrCOMM and IrLPT. This way applications will + * not have to twiddle with printcap etc. + * + * Note this is completely usafe and doesn't work properly + */ + tty->termios->c_iflag = 0; + tty->termios->c_oflag = 0; + + /* Insert into hash */ + hashbin_insert(ircomm_tty, (irda_queue_t *) self, line, NULL); + } + /* ++ is not atomic, so this should be protected - Jean II */ + spin_lock_irqsave(&self->spinlock, flags); + self->open_count++; + + tty->driver_data = self; + self->tty = tty; + spin_unlock_irqrestore(&self->spinlock, flags); + + IRDA_DEBUG(1, "%s(), %s%d, count = %d\n", __func__ , tty->driver->name, + self->line, self->open_count); + + /* Not really used by us, but lets do it anyway */ + self->tty->low_latency = (self->flags & ASYNC_LOW_LATENCY) ? 1 : 0; + + /* + * If the port is the middle of closing, bail out now + */ + if (tty_hung_up_p(filp) || + test_bit(ASYNC_B_CLOSING, &self->flags)) { + + /* Hm, why are we blocking on ASYNC_CLOSING if we + * do return -EAGAIN/-ERESTARTSYS below anyway? + * IMHO it's either not needed in the first place + * or for some reason we need to make sure the async + * closing has been finished - if so, wouldn't we + * probably better sleep uninterruptible? + */ + + if (wait_event_interruptible(self->close_wait, !test_bit(ASYNC_B_CLOSING, &self->flags))) { + IRDA_WARNING("%s - got signal while blocking on ASYNC_CLOSING!\n", + __func__); + return -ERESTARTSYS; + } + +#ifdef SERIAL_DO_RESTART + return (self->flags & ASYNC_HUP_NOTIFY) ? 
+ -EAGAIN : -ERESTARTSYS; +#else + return -EAGAIN; +#endif + } + + /* Check if this is a "normal" ircomm device, or an irlpt device */ + if (line < 0x10) { + self->service_type = IRCOMM_3_WIRE | IRCOMM_9_WIRE; + self->settings.service_type = IRCOMM_9_WIRE; /* 9 wire as default */ + /* Jan Kiszka -> add DSR/RI -> Conform to IrCOMM spec */ + self->settings.dce = IRCOMM_CTS | IRCOMM_CD | IRCOMM_DSR | IRCOMM_RI; /* Default line settings */ + IRDA_DEBUG(2, "%s(), IrCOMM device\n", __func__ ); + } else { + IRDA_DEBUG(2, "%s(), IrLPT device\n", __func__ ); + self->service_type = IRCOMM_3_WIRE_RAW; + self->settings.service_type = IRCOMM_3_WIRE_RAW; /* Default */ + } + + ret = ircomm_tty_startup(self); + if (ret) + return ret; + + ret = ircomm_tty_block_til_ready(self, filp); + if (ret) { + IRDA_DEBUG(2, + "%s(), returning after block_til_ready with %d\n", __func__ , + ret); + + return ret; + } + return 0; +} + +/* + * Function ircomm_tty_close (tty, filp) + * + * This routine is called when a particular tty device is closed. + * + */ +static void ircomm_tty_close(struct tty_struct *tty, struct file *filp) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + unsigned long flags; + + IRDA_DEBUG(0, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + spin_lock_irqsave(&self->spinlock, flags); + + if (tty_hung_up_p(filp)) { + spin_unlock_irqrestore(&self->spinlock, flags); + + IRDA_DEBUG(0, "%s(), returning 1\n", __func__ ); + return; + } + + if ((tty->count == 1) && (self->open_count != 1)) { + /* + * Uh, oh. tty->count is 1, which means that the tty + * structure will be freed. state->count should always + * be one in these conditions. If it's greater than + * one, we've got real problems, since it means the + * serial port won't be shutdown. + */ + IRDA_DEBUG(0, "%s(), bad serial port count; " + "tty->count is 1, state->count is %d\n", __func__ , + self->open_count); + self->open_count = 1; + } + + if (--self->open_count < 0) { + IRDA_ERROR("%s(), bad serial port count for ttys%d: %d\n", + __func__, self->line, self->open_count); + self->open_count = 0; + } + if (self->open_count) { + spin_unlock_irqrestore(&self->spinlock, flags); + + IRDA_DEBUG(0, "%s(), open count > 0\n", __func__ ); + return; + } + + /* Hum... Should be test_and_set_bit ??? - Jean II */ + set_bit(ASYNC_B_CLOSING, &self->flags); + + /* We need to unlock here (we were unlocking at the end of this + * function), because tty_wait_until_sent() may schedule. + * I don't know if the rest should be protected somehow, + * so someone should check. - Jean II */ + spin_unlock_irqrestore(&self->spinlock, flags); + + /* + * Now we wait for the transmit buffer to clear; and we notify + * the line discipline to only process XON/XOFF characters. 
+ */ + tty->closing = 1; + if (self->closing_wait != ASYNC_CLOSING_WAIT_NONE) + tty_wait_until_sent(tty, self->closing_wait); + + ircomm_tty_shutdown(self); + + tty_driver_flush_buffer(tty); + tty_ldisc_flush(tty); + + tty->closing = 0; + self->tty = NULL; + + if (self->blocked_open) { + if (self->close_delay) + schedule_timeout_interruptible(self->close_delay); + wake_up_interruptible(&self->open_wait); + } + + self->flags &= ~(ASYNC_NORMAL_ACTIVE|ASYNC_CLOSING); + wake_up_interruptible(&self->close_wait); +} + +/* + * Function ircomm_tty_flush_buffer (tty) + * + * + * + */ +static void ircomm_tty_flush_buffer(struct tty_struct *tty) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + /* + * Let do_softint() do this to avoid race condition with + * do_softint() ;-) + */ + schedule_work(&self->tqueue); +} + +/* + * Function ircomm_tty_do_softint (work) + * + * We use this routine to give the write wakeup to the user at at a + * safe time (as fast as possible after write have completed). This + * can be compared to the Tx interrupt. + */ +static void ircomm_tty_do_softint(struct work_struct *work) +{ + struct ircomm_tty_cb *self = + container_of(work, struct ircomm_tty_cb, tqueue); + struct tty_struct *tty; + unsigned long flags; + struct sk_buff *skb, *ctrl_skb; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + if (!self || self->magic != IRCOMM_TTY_MAGIC) + return; + + tty = self->tty; + if (!tty) + return; + + /* Unlink control buffer */ + spin_lock_irqsave(&self->spinlock, flags); + + ctrl_skb = self->ctrl_skb; + self->ctrl_skb = NULL; + + spin_unlock_irqrestore(&self->spinlock, flags); + + /* Flush control buffer if any */ + if(ctrl_skb) { + if(self->flow == FLOW_START) + ircomm_control_request(self->ircomm, ctrl_skb); + /* Drop reference count - see ircomm_ttp_data_request(). */ + dev_kfree_skb(ctrl_skb); + } + + if (tty->hw_stopped) + return; + + /* Unlink transmit buffer */ + spin_lock_irqsave(&self->spinlock, flags); + + skb = self->tx_skb; + self->tx_skb = NULL; + + spin_unlock_irqrestore(&self->spinlock, flags); + + /* Flush transmit buffer if any */ + if (skb) { + ircomm_tty_do_event(self, IRCOMM_TTY_DATA_REQUEST, skb, NULL); + /* Drop reference count - see ircomm_ttp_data_request(). */ + dev_kfree_skb(skb); + } + + /* Check if user (still) wants to be waken up */ + tty_wakeup(tty); +} + +/* + * Function ircomm_tty_write (tty, buf, count) + * + * This routine is called by the kernel to write a series of characters + * to the tty device. The characters may come from user space or kernel + * space. This routine will return the number of characters actually + * accepted for writing. This routine is mandatory. + */ +static int ircomm_tty_write(struct tty_struct *tty, + const unsigned char *buf, int count) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + unsigned long flags; + struct sk_buff *skb; + int tailroom = 0; + int len = 0; + int size; + + IRDA_DEBUG(2, "%s(), count=%d, hw_stopped=%d\n", __func__ , count, + tty->hw_stopped); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + /* We may receive packets from the TTY even before we have finished + * our setup. Not cool. + * The problem is that we don't know the final header and data size + * to create the proper skb, so any skb we would create would have + * bogus header and data size, so need care. 
+ * We use a bogus header size to safely detect this condition. + * Another problem is that hw_stopped was set to 0 way before it + * should be, so we would drop this skb. It should now be fixed. + * One option is to not accept data until we are properly setup. + * But, I suspect that when it happens, the ppp line discipline + * just "drops" the data, which might screw up connect scripts. + * The second option is to create a "safe skb", with large header + * and small size (see ircomm_tty_open() for values). + * We just need to make sure that when the real values get filled, + * we don't mess up the original "safe skb" (see tx_data_size). + * Jean II */ + if (self->max_header_size == IRCOMM_TTY_HDR_UNINITIALISED) { + IRDA_DEBUG(1, "%s() : not initialised\n", __func__); +#ifdef IRCOMM_NO_TX_BEFORE_INIT + /* We didn't consume anything, TTY will retry */ + return 0; +#endif + } + + if (count < 1) + return 0; + + /* Protect our manipulation of self->tx_skb and related */ + spin_lock_irqsave(&self->spinlock, flags); + + /* Fetch current transmit buffer */ + skb = self->tx_skb; + + /* + * Send out all the data we get, possibly as multiple fragmented + * frames, but this will only happen if the data is larger than the + * max data size. The normal case however is just the opposite, and + * this function may be called multiple times, and will then actually + * defragment the data and send it out as one packet as soon as + * possible, but at a safer point in time + */ + while (count) { + size = count; + + /* Adjust data size to the max data size */ + if (size > self->max_data_size) + size = self->max_data_size; + + /* + * Do we already have a buffer ready for transmit, or do + * we need to allocate a new frame + */ + if (skb) { + /* + * Any room for more data at the end of the current + * transmit buffer? Cannot use skb_tailroom, since + * dev_alloc_skb gives us a larger skb than we + * requested + * Note : use tx_data_size, because max_data_size + * may have changed and we don't want to overwrite + * the skb. - Jean II + */ + if ((tailroom = (self->tx_data_size - skb->len)) > 0) { + /* Adjust data to tailroom */ + if (size > tailroom) + size = tailroom; + } else { + /* + * Current transmit frame is full, so break + * out, so we can send it as soon as possible + */ + break; + } + } else { + /* Prepare a full sized frame */ + skb = alloc_skb(self->max_data_size+ + self->max_header_size, + GFP_ATOMIC); + if (!skb) { + spin_unlock_irqrestore(&self->spinlock, flags); + return -ENOBUFS; + } + skb_reserve(skb, self->max_header_size); + self->tx_skb = skb; + /* Remember skb size because max_data_size may + * change later on - Jean II */ + self->tx_data_size = self->max_data_size; + } + + /* Copy data */ + memcpy(skb_put(skb,size), buf + len, size); + + count -= size; + len += size; + } + + spin_unlock_irqrestore(&self->spinlock, flags); + + /* + * Schedule a new thread which will transmit the frame as soon + * as possible, but at a safe point in time. We do this so the + * "user" can give us data multiple times, as PPP does (because of + * its 256 byte tx buffer). We will then defragment and send out + * all this data as one single packet. + */ + schedule_work(&self->tqueue); + + return len; +} + +/* + * Function ircomm_tty_write_room (tty) + * + * This routine returns the numbers of characters the tty driver will + * accept for queuing to be written. This number is subject to change as + * output buffers get emptied, or if the output flow control is acted. 
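+ *
+ * For this driver that boils down to (matching the code below):
+ *
+ *	room = 0                             if tty->hw_stopped
+ *	     = tx_data_size - tx_skb->len    if a frame is being filled
+ *	     = max_data_size                 otherwise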
+ */ +static int ircomm_tty_write_room(struct tty_struct *tty) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + unsigned long flags; + int ret; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + +#ifdef IRCOMM_NO_TX_BEFORE_INIT + /* max_header_size tells us if the channel is initialised or not. */ + if (self->max_header_size == IRCOMM_TTY_HDR_UNINITIALISED) + /* Don't bother us yet */ + return 0; +#endif + + /* Check if we are allowed to transmit any data. + * hw_stopped is the regular flow control. + * Jean II */ + if (tty->hw_stopped) + ret = 0; + else { + spin_lock_irqsave(&self->spinlock, flags); + if (self->tx_skb) + ret = self->tx_data_size - self->tx_skb->len; + else + ret = self->max_data_size; + spin_unlock_irqrestore(&self->spinlock, flags); + } + IRDA_DEBUG(2, "%s(), ret=%d\n", __func__ , ret); + + return ret; +} + +/* + * Function ircomm_tty_wait_until_sent (tty, timeout) + * + * This routine waits until the device has written out all of the + * characters in its transmitter FIFO. + */ +static void ircomm_tty_wait_until_sent(struct tty_struct *tty, int timeout) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + unsigned long orig_jiffies, poll_time; + unsigned long flags; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + orig_jiffies = jiffies; + + /* Set poll time to 200 ms */ + poll_time = IRDA_MIN(timeout, msecs_to_jiffies(200)); + + spin_lock_irqsave(&self->spinlock, flags); + while (self->tx_skb && self->tx_skb->len) { + spin_unlock_irqrestore(&self->spinlock, flags); + schedule_timeout_interruptible(poll_time); + spin_lock_irqsave(&self->spinlock, flags); + if (signal_pending(current)) + break; + if (timeout && time_after(jiffies, orig_jiffies + timeout)) + break; + } + spin_unlock_irqrestore(&self->spinlock, flags); + current->state = TASK_RUNNING; +} + +/* + * Function ircomm_tty_throttle (tty) + * + * This routine notifies the tty driver that input buffers for the line + * discipline are close to full, and it should somehow signal that no + * more characters should be sent to the tty. + */ +static void ircomm_tty_throttle(struct tty_struct *tty) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + /* Software flow control? */ + if (I_IXOFF(tty)) + ircomm_tty_send_xchar(tty, STOP_CHAR(tty)); + + /* Hardware flow control? */ + if (tty->termios->c_cflag & CRTSCTS) { + self->settings.dte &= ~IRCOMM_RTS; + self->settings.dte |= IRCOMM_DELTA_RTS; + + ircomm_param_request(self, IRCOMM_DTE, TRUE); + } + + ircomm_flow_request(self->ircomm, FLOW_STOP); +} + +/* + * Function ircomm_tty_unthrottle (tty) + * + * This routine notifies the tty drivers that it should signals that + * characters can now be sent to the tty without fear of overrunning the + * input buffers of the line disciplines. + */ +static void ircomm_tty_unthrottle(struct tty_struct *tty) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + /* Using software flow control? */ + if (I_IXOFF(tty)) { + ircomm_tty_send_xchar(tty, START_CHAR(tty)); + } + + /* Using hardware flow control? 
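+ * If so, raise RTS again and mark it as changed (IRCOMM_DELTA_RTS), so
+ * the IRCOMM_DTE parameter pushed to the peer below reports the
+ * transition - the mirror image of ircomm_tty_throttle() above.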
*/ + if (tty->termios->c_cflag & CRTSCTS) { + self->settings.dte |= (IRCOMM_RTS|IRCOMM_DELTA_RTS); + + ircomm_param_request(self, IRCOMM_DTE, TRUE); + IRDA_DEBUG(1, "%s(), FLOW_START\n", __func__ ); + } + ircomm_flow_request(self->ircomm, FLOW_START); +} + +/* + * Function ircomm_tty_chars_in_buffer (tty) + * + * Indicates if there are any data in the buffer + * + */ +static int ircomm_tty_chars_in_buffer(struct tty_struct *tty) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + unsigned long flags; + int len = 0; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + spin_lock_irqsave(&self->spinlock, flags); + + if (self->tx_skb) + len = self->tx_skb->len; + + spin_unlock_irqrestore(&self->spinlock, flags); + + return len; +} + +static void ircomm_tty_shutdown(struct ircomm_tty_cb *self) +{ + unsigned long flags; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + IRDA_DEBUG(0, "%s()\n", __func__ ); + + if (!test_and_clear_bit(ASYNC_B_INITIALIZED, &self->flags)) + return; + + ircomm_tty_detach_cable(self); + + spin_lock_irqsave(&self->spinlock, flags); + + del_timer(&self->watchdog_timer); + + /* Free parameter buffer */ + if (self->ctrl_skb) { + dev_kfree_skb(self->ctrl_skb); + self->ctrl_skb = NULL; + } + + /* Free transmit buffer */ + if (self->tx_skb) { + dev_kfree_skb(self->tx_skb); + self->tx_skb = NULL; + } + + if (self->ircomm) { + ircomm_close(self->ircomm); + self->ircomm = NULL; + } + + spin_unlock_irqrestore(&self->spinlock, flags); +} + +/* + * Function ircomm_tty_hangup (tty) + * + * This routine notifies the tty driver that it should hangup the tty + * device. + * + */ +static void ircomm_tty_hangup(struct tty_struct *tty) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + unsigned long flags; + + IRDA_DEBUG(0, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + /* ircomm_tty_flush_buffer(tty); */ + ircomm_tty_shutdown(self); + + /* I guess we need to lock here - Jean II */ + spin_lock_irqsave(&self->spinlock, flags); + self->flags &= ~ASYNC_NORMAL_ACTIVE; + self->tty = NULL; + self->open_count = 0; + spin_unlock_irqrestore(&self->spinlock, flags); + + wake_up_interruptible(&self->open_wait); +} + +/* + * Function ircomm_tty_send_xchar (tty, ch) + * + * This routine is used to send a high-priority XON/XOFF character to + * the device. + */ +static void ircomm_tty_send_xchar(struct tty_struct *tty, char ch) +{ + IRDA_DEBUG(0, "%s(), not impl\n", __func__ ); +} + +/* + * Function ircomm_tty_start (tty) + * + * This routine notifies the tty driver that it resume sending + * characters to the tty device. + */ +void ircomm_tty_start(struct tty_struct *tty) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + + ircomm_flow_request(self->ircomm, FLOW_START); +} + +/* + * Function ircomm_tty_stop (tty) + * + * This routine notifies the tty driver that it should stop outputting + * characters to the tty device. + */ +static void ircomm_tty_stop(struct tty_struct *tty) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + ircomm_flow_request(self->ircomm, FLOW_STOP); +} + +/* + * Function ircomm_check_modem_status (self) + * + * Check for any changes in the DCE's line settings. 
This function should + * be called whenever the dce parameter settings changes, to update the + * flow control settings and other things + */ +void ircomm_tty_check_modem_status(struct ircomm_tty_cb *self) +{ + struct tty_struct *tty; + int status; + + IRDA_DEBUG(0, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + tty = self->tty; + + status = self->settings.dce; + + if (status & IRCOMM_DCE_DELTA_ANY) { + /*wake_up_interruptible(&self->delta_msr_wait);*/ + } + if ((self->flags & ASYNC_CHECK_CD) && (status & IRCOMM_DELTA_CD)) { + IRDA_DEBUG(2, + "%s(), ircomm%d CD now %s...\n", __func__ , self->line, + (status & IRCOMM_CD) ? "on" : "off"); + + if (status & IRCOMM_CD) { + wake_up_interruptible(&self->open_wait); + } else { + IRDA_DEBUG(2, + "%s(), Doing serial hangup..\n", __func__ ); + if (tty) + tty_hangup(tty); + + /* Hangup will remote the tty, so better break out */ + return; + } + } + if (self->flags & ASYNC_CTS_FLOW) { + if (tty->hw_stopped) { + if (status & IRCOMM_CTS) { + IRDA_DEBUG(2, + "%s(), CTS tx start...\n", __func__ ); + tty->hw_stopped = 0; + + /* Wake up processes blocked on open */ + wake_up_interruptible(&self->open_wait); + + schedule_work(&self->tqueue); + return; + } + } else { + if (!(status & IRCOMM_CTS)) { + IRDA_DEBUG(2, + "%s(), CTS tx stop...\n", __func__ ); + tty->hw_stopped = 1; + } + } + } +} + +/* + * Function ircomm_tty_data_indication (instance, sap, skb) + * + * Handle incoming data, and deliver it to the line discipline + * + */ +static int ircomm_tty_data_indication(void *instance, void *sap, + struct sk_buff *skb) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + IRDA_ASSERT(skb != NULL, return -1;); + + if (!self->tty) { + IRDA_DEBUG(0, "%s(), no tty!\n", __func__ ); + return 0; + } + + /* + * If we receive data when hardware is stopped then something is wrong. + * We try to poll the peers line settings to check if we are up todate. + * Devices like WinCE can do this, and since they don't send any + * params, we can just as well declare the hardware for running. + */ + if (self->tty->hw_stopped && (self->flow == FLOW_START)) { + IRDA_DEBUG(0, "%s(), polling for line settings!\n", __func__ ); + ircomm_param_request(self, IRCOMM_POLL, TRUE); + + /* We can just as well declare the hardware for running */ + ircomm_tty_send_initial_parameters(self); + ircomm_tty_link_established(self); + } + + /* + * Use flip buffer functions since the code may be called from interrupt + * context + */ + tty_insert_flip_string(self->tty, skb->data, skb->len); + tty_flip_buffer_push(self->tty); + + /* No need to kfree_skb - see ircomm_ttp_data_indication() */ + + return 0; +} + +/* + * Function ircomm_tty_control_indication (instance, sap, skb) + * + * Parse all incoming parameters (easy!) 
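+ *
+ * Layout of the control frame handled below (the PI/PL/PV naming follows
+ * the IrCOMM spec; this file only sees it through irda_param_extract_all):
+ *
+ *	skb->data[0]          clen - number of parameter octets that follow
+ *	skb->data[1..clen]    sequence of { PI, PL, PV[PL] } parameter tuples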
+ * + */ +static int ircomm_tty_control_indication(void *instance, void *sap, + struct sk_buff *skb) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + int clen; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + IRDA_ASSERT(skb != NULL, return -1;); + + clen = skb->data[0]; + + irda_param_extract_all(self, skb->data+1, IRDA_MIN(skb->len-1, clen), + &ircomm_param_info); + + /* No need to kfree_skb - see ircomm_control_indication() */ + + return 0; +} + +/* + * Function ircomm_tty_flow_indication (instance, sap, cmd) + * + * This function is called by IrTTP when it wants us to slow down the + * transmission of data. We just mark the hardware as stopped, and wait + * for IrTTP to notify us that things are OK again. + */ +static void ircomm_tty_flow_indication(void *instance, void *sap, + LOCAL_FLOW cmd) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + struct tty_struct *tty; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + tty = self->tty; + + switch (cmd) { + case FLOW_START: + IRDA_DEBUG(2, "%s(), hw start!\n", __func__ ); + tty->hw_stopped = 0; + + /* ircomm_tty_do_softint will take care of the rest */ + schedule_work(&self->tqueue); + break; + default: /* If we get here, something is very wrong, better stop */ + case FLOW_STOP: + IRDA_DEBUG(2, "%s(), hw stopped!\n", __func__ ); + tty->hw_stopped = 1; + break; + } + self->flow = cmd; +} + +#ifdef CONFIG_PROC_FS +static void ircomm_tty_line_info(struct ircomm_tty_cb *self, struct seq_file *m) +{ + char sep; + + seq_printf(m, "State: %s\n", ircomm_tty_state[self->state]); + + seq_puts(m, "Service type: "); + if (self->service_type & IRCOMM_9_WIRE) + seq_puts(m, "9_WIRE"); + else if (self->service_type & IRCOMM_3_WIRE) + seq_puts(m, "3_WIRE"); + else if (self->service_type & IRCOMM_3_WIRE_RAW) + seq_puts(m, "3_WIRE_RAW"); + else + seq_puts(m, "No common service type!\n"); + seq_putc(m, '\n'); + + seq_printf(m, "Port name: %s\n", self->settings.port_name); + + seq_printf(m, "DTE status:"); + sep = ' '; + if (self->settings.dte & IRCOMM_RTS) { + seq_printf(m, "%cRTS", sep); + sep = '|'; + } + if (self->settings.dte & IRCOMM_DTR) { + seq_printf(m, "%cDTR", sep); + sep = '|'; + } + seq_putc(m, '\n'); + + seq_puts(m, "DCE status:"); + sep = ' '; + if (self->settings.dce & IRCOMM_CTS) { + seq_printf(m, "%cCTS", sep); + sep = '|'; + } + if (self->settings.dce & IRCOMM_DSR) { + seq_printf(m, "%cDSR", sep); + sep = '|'; + } + if (self->settings.dce & IRCOMM_CD) { + seq_printf(m, "%cCD", sep); + sep = '|'; + } + if (self->settings.dce & IRCOMM_RI) { + seq_printf(m, "%cRI", sep); + sep = '|'; + } + seq_putc(m, '\n'); + + seq_puts(m, "Configuration: "); + if (!self->settings.null_modem) + seq_puts(m, "DTE <-> DCE\n"); + else + seq_puts(m, "DTE <-> DTE (null modem emulation)\n"); + + seq_printf(m, "Data rate: %d\n", self->settings.data_rate); + + seq_puts(m, "Flow control:"); + sep = ' '; + if (self->settings.flow_control & IRCOMM_XON_XOFF_IN) { + seq_printf(m, "%cXON_XOFF_IN", sep); + sep = '|'; + } + if (self->settings.flow_control & IRCOMM_XON_XOFF_OUT) { + seq_printf(m, "%cXON_XOFF_OUT", sep); + sep = '|'; + } + if (self->settings.flow_control & IRCOMM_RTS_CTS_IN) { + seq_printf(m, "%cRTS_CTS_IN", sep); + sep = '|'; + } + if (self->settings.flow_control & IRCOMM_RTS_CTS_OUT) { + seq_printf(m, "%cRTS_CTS_OUT", sep); + sep = '|'; + } + if (self->settings.flow_control 
& IRCOMM_DSR_DTR_IN) { + seq_printf(m, "%cDSR_DTR_IN", sep); + sep = '|'; + } + if (self->settings.flow_control & IRCOMM_DSR_DTR_OUT) { + seq_printf(m, "%cDSR_DTR_OUT", sep); + sep = '|'; + } + if (self->settings.flow_control & IRCOMM_ENQ_ACK_IN) { + seq_printf(m, "%cENQ_ACK_IN", sep); + sep = '|'; + } + if (self->settings.flow_control & IRCOMM_ENQ_ACK_OUT) { + seq_printf(m, "%cENQ_ACK_OUT", sep); + sep = '|'; + } + seq_putc(m, '\n'); + + seq_puts(m, "Flags:"); + sep = ' '; + if (self->flags & ASYNC_CTS_FLOW) { + seq_printf(m, "%cASYNC_CTS_FLOW", sep); + sep = '|'; + } + if (self->flags & ASYNC_CHECK_CD) { + seq_printf(m, "%cASYNC_CHECK_CD", sep); + sep = '|'; + } + if (self->flags & ASYNC_INITIALIZED) { + seq_printf(m, "%cASYNC_INITIALIZED", sep); + sep = '|'; + } + if (self->flags & ASYNC_LOW_LATENCY) { + seq_printf(m, "%cASYNC_LOW_LATENCY", sep); + sep = '|'; + } + if (self->flags & ASYNC_CLOSING) { + seq_printf(m, "%cASYNC_CLOSING", sep); + sep = '|'; + } + if (self->flags & ASYNC_NORMAL_ACTIVE) { + seq_printf(m, "%cASYNC_NORMAL_ACTIVE", sep); + sep = '|'; + } + seq_putc(m, '\n'); + + seq_printf(m, "Role: %s\n", self->client ? "client" : "server"); + seq_printf(m, "Open count: %d\n", self->open_count); + seq_printf(m, "Max data size: %d\n", self->max_data_size); + seq_printf(m, "Max header size: %d\n", self->max_header_size); + + if (self->tty) + seq_printf(m, "Hardware: %s\n", + self->tty->hw_stopped ? "Stopped" : "Running"); +} + +static int ircomm_tty_proc_show(struct seq_file *m, void *v) +{ + struct ircomm_tty_cb *self; + unsigned long flags; + + spin_lock_irqsave(&ircomm_tty->hb_spinlock, flags); + + self = (struct ircomm_tty_cb *) hashbin_get_first(ircomm_tty); + while (self != NULL) { + if (self->magic != IRCOMM_TTY_MAGIC) + break; + + ircomm_tty_line_info(self, m); + self = (struct ircomm_tty_cb *) hashbin_get_next(ircomm_tty); + } + spin_unlock_irqrestore(&ircomm_tty->hb_spinlock, flags); + return 0; +} + +static int ircomm_tty_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ircomm_tty_proc_show, NULL); +} + +static const struct file_operations ircomm_tty_proc_fops = { + .owner = THIS_MODULE, + .open = ircomm_tty_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_PROC_FS */ + +MODULE_AUTHOR("Dag Brattli "); +MODULE_DESCRIPTION("IrCOMM serial TTY driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_CHARDEV_MAJOR(IRCOMM_TTY_MAJOR); + +module_init(ircomm_tty_init); +module_exit(ircomm_tty_cleanup); diff --git a/net/irda/ircomm/ircomm_tty_attach.c b/net/irda/ircomm/ircomm_tty_attach.c new file mode 100644 index 00000000..3c175402 --- /dev/null +++ b/net/irda/ircomm/ircomm_tty_attach.c @@ -0,0 +1,997 @@ +/********************************************************************* + * + * Filename: ircomm_tty_attach.c + * Version: + * Description: Code for attaching the serial driver to IrCOMM + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sat Jun 5 17:42:00 1999 + * Modified at: Tue Jan 4 14:20:49 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +static void ircomm_tty_ias_register(struct ircomm_tty_cb *self); +static void ircomm_tty_discovery_indication(discinfo_t *discovery, + DISCOVERY_MODE mode, + void *priv); +static void ircomm_tty_getvalue_confirm(int result, __u16 obj_id, + struct ias_value *value, void *priv); +static void ircomm_tty_start_watchdog_timer(struct ircomm_tty_cb *self, + int timeout); +static void ircomm_tty_watchdog_timer_expired(void *data); + +static int ircomm_tty_state_idle(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info); +static int ircomm_tty_state_search(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info); +static int ircomm_tty_state_query_parameters(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info); +static int ircomm_tty_state_query_lsap_sel(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info); +static int ircomm_tty_state_setup(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info); +static int ircomm_tty_state_ready(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info); + +const char *const ircomm_tty_state[] = { + "IRCOMM_TTY_IDLE", + "IRCOMM_TTY_SEARCH", + "IRCOMM_TTY_QUERY_PARAMETERS", + "IRCOMM_TTY_QUERY_LSAP_SEL", + "IRCOMM_TTY_SETUP", + "IRCOMM_TTY_READY", + "*** ERROR *** ", +}; + +#ifdef CONFIG_IRDA_DEBUG +static const char *const ircomm_tty_event[] = { + "IRCOMM_TTY_ATTACH_CABLE", + "IRCOMM_TTY_DETACH_CABLE", + "IRCOMM_TTY_DATA_REQUEST", + "IRCOMM_TTY_DATA_INDICATION", + "IRCOMM_TTY_DISCOVERY_REQUEST", + "IRCOMM_TTY_DISCOVERY_INDICATION", + "IRCOMM_TTY_CONNECT_CONFIRM", + "IRCOMM_TTY_CONNECT_INDICATION", + "IRCOMM_TTY_DISCONNECT_REQUEST", + "IRCOMM_TTY_DISCONNECT_INDICATION", + "IRCOMM_TTY_WD_TIMER_EXPIRED", + "IRCOMM_TTY_GOT_PARAMETERS", + "IRCOMM_TTY_GOT_LSAPSEL", + "*** ERROR ****", +}; +#endif /* CONFIG_IRDA_DEBUG */ + +static int (*state[])(struct ircomm_tty_cb *self, IRCOMM_TTY_EVENT event, + struct sk_buff *skb, struct ircomm_tty_info *info) = +{ + ircomm_tty_state_idle, + ircomm_tty_state_search, + ircomm_tty_state_query_parameters, + ircomm_tty_state_query_lsap_sel, + ircomm_tty_state_setup, + ircomm_tty_state_ready, +}; + +/* + * Function ircomm_tty_attach_cable (driver) + * + * Try to attach cable (IrCOMM link). This function will only return + * when the link has been connected, or if an error condition occurs. + * If success, the return value is the resulting service type. 
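+ *
+ * In practice the sequence below is: bail out if the peer has already
+ * connected to us, raise tty->hw_stopped so nothing gets transmitted yet,
+ * register our IAS entries and hint bits, and feed IRCOMM_TTY_ATTACH_CABLE
+ * into the state machine, which kicks off discovery (see
+ * ircomm_tty_state_idle() further down).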
+ */ +int ircomm_tty_attach_cable(struct ircomm_tty_cb *self) +{ + IRDA_DEBUG(0, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + /* Check if somebody has already connected to us */ + if (ircomm_is_connected(self->ircomm)) { + IRDA_DEBUG(0, "%s(), already connected!\n", __func__ ); + return 0; + } + + /* Make sure nobody tries to write before the link is up */ + self->tty->hw_stopped = 1; + + ircomm_tty_ias_register(self); + + ircomm_tty_do_event(self, IRCOMM_TTY_ATTACH_CABLE, NULL, NULL); + + return 0; +} + +/* + * Function ircomm_detach_cable (driver) + * + * Detach cable, or cable has been detached by peer + * + */ +void ircomm_tty_detach_cable(struct ircomm_tty_cb *self) +{ + IRDA_DEBUG(0, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + del_timer(&self->watchdog_timer); + + /* Remove discovery handler */ + if (self->ckey) { + irlmp_unregister_client(self->ckey); + self->ckey = NULL; + } + /* Remove IrCOMM hint bits */ + if (self->skey) { + irlmp_unregister_service(self->skey); + self->skey = NULL; + } + + if (self->iriap) { + iriap_close(self->iriap); + self->iriap = NULL; + } + + /* Remove LM-IAS object */ + if (self->obj) { + irias_delete_object(self->obj); + self->obj = NULL; + } + + ircomm_tty_do_event(self, IRCOMM_TTY_DETACH_CABLE, NULL, NULL); + + /* Reset some values */ + self->daddr = self->saddr = 0; + self->dlsap_sel = self->slsap_sel = 0; + + memset(&self->settings, 0, sizeof(struct ircomm_params)); +} + +/* + * Function ircomm_tty_ias_register (self) + * + * Register with LM-IAS depending on which service type we are + * + */ +static void ircomm_tty_ias_register(struct ircomm_tty_cb *self) +{ + __u8 oct_seq[6]; + __u16 hints; + + IRDA_DEBUG(0, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + /* Compute hint bits based on service */ + hints = irlmp_service_to_hint(S_COMM); + if (self->service_type & IRCOMM_3_WIRE_RAW) + hints |= irlmp_service_to_hint(S_PRINTER); + + /* Advertise IrCOMM hint bit in discovery */ + if (!self->skey) + self->skey = irlmp_register_service(hints); + /* Set up a discovery handler */ + if (!self->ckey) + self->ckey = irlmp_register_client(hints, + ircomm_tty_discovery_indication, + NULL, (void *) self); + + /* If already done, no need to do it again */ + if (self->obj) + return; + + if (self->service_type & IRCOMM_3_WIRE_RAW) { + /* Register IrLPT with LM-IAS */ + self->obj = irias_new_object("IrLPT", IAS_IRLPT_ID); + irias_add_integer_attrib(self->obj, "IrDA:IrLMP:LsapSel", + self->slsap_sel, IAS_KERNEL_ATTR); + } else { + /* Register IrCOMM with LM-IAS */ + self->obj = irias_new_object("IrDA:IrCOMM", IAS_IRCOMM_ID); + irias_add_integer_attrib(self->obj, "IrDA:TinyTP:LsapSel", + self->slsap_sel, IAS_KERNEL_ATTR); + + /* Code the parameters into the buffer */ + irda_param_pack(oct_seq, "bbbbbb", + IRCOMM_SERVICE_TYPE, 1, self->service_type, + IRCOMM_PORT_TYPE, 1, IRCOMM_SERIAL); + + /* Register parameters with LM-IAS */ + irias_add_octseq_attrib(self->obj, "Parameters", oct_seq, 6, + IAS_KERNEL_ATTR); + } + irias_insert_object(self->obj); +} + +/* + * Function ircomm_tty_ias_unregister (self) + * + * Remove our IAS object and client hook while connected. + * + */ +static void ircomm_tty_ias_unregister(struct ircomm_tty_cb *self) +{ + /* Remove LM-IAS object now so it is not reused. 
+ * IrCOMM deals very poorly with multiple incoming connections. + * It should looks a lot more like IrNET, and "dup" a server TSAP + * to the application TSAP (based on various rules). + * This is a cheap workaround allowing multiple clients to + * connect to us. It will not always work. + * Each IrCOMM socket has an IAS entry. Incoming connection will + * pick the first one found. So, when we are fully connected, + * we remove our IAS entries so that the next IAS entry is used. + * We do that for *both* client and server, because a server + * can also create client instances. + * Jean II */ + if (self->obj) { + irias_delete_object(self->obj); + self->obj = NULL; + } + +#if 0 + /* Remove discovery handler. + * While we are connected, we no longer need to receive + * discovery events. This would be the case if there is + * multiple IrLAP interfaces. Jean II */ + if (self->ckey) { + irlmp_unregister_client(self->ckey); + self->ckey = NULL; + } +#endif +} + +/* + * Function ircomm_send_initial_parameters (self) + * + * Send initial parameters to the remote IrCOMM device. These parameters + * must be sent before any data. + */ +int ircomm_tty_send_initial_parameters(struct ircomm_tty_cb *self) +{ + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + if (self->service_type & IRCOMM_3_WIRE_RAW) + return 0; + + /* + * Set default values, but only if the application for some reason + * haven't set them already + */ + IRDA_DEBUG(2, "%s(), data-rate = %d\n", __func__ , + self->settings.data_rate); + if (!self->settings.data_rate) + self->settings.data_rate = 9600; + IRDA_DEBUG(2, "%s(), data-format = %d\n", __func__ , + self->settings.data_format); + if (!self->settings.data_format) + self->settings.data_format = IRCOMM_WSIZE_8; /* 8N1 */ + + IRDA_DEBUG(2, "%s(), flow-control = %d\n", __func__ , + self->settings.flow_control); + /*self->settings.flow_control = IRCOMM_RTS_CTS_IN|IRCOMM_RTS_CTS_OUT;*/ + + /* Do not set delta values for the initial parameters */ + self->settings.dte = IRCOMM_DTR | IRCOMM_RTS; + + /* Only send service type parameter when we are the client */ + if (self->client) + ircomm_param_request(self, IRCOMM_SERVICE_TYPE, FALSE); + ircomm_param_request(self, IRCOMM_DATA_RATE, FALSE); + ircomm_param_request(self, IRCOMM_DATA_FORMAT, FALSE); + + /* For a 3 wire service, we just flush the last parameter and return */ + if (self->settings.service_type == IRCOMM_3_WIRE) { + ircomm_param_request(self, IRCOMM_FLOW_CONTROL, TRUE); + return 0; + } + + /* Only 9-wire service types continue here */ + ircomm_param_request(self, IRCOMM_FLOW_CONTROL, FALSE); +#if 0 + ircomm_param_request(self, IRCOMM_XON_XOFF, FALSE); + ircomm_param_request(self, IRCOMM_ENQ_ACK, FALSE); +#endif + /* Notify peer that we are ready to receive data */ + ircomm_param_request(self, IRCOMM_DTE, TRUE); + + return 0; +} + +/* + * Function ircomm_tty_discovery_indication (discovery) + * + * Remote device is discovered, try query the remote IAS to see which + * device it is, and which services it has. + * + */ +static void ircomm_tty_discovery_indication(discinfo_t *discovery, + DISCOVERY_MODE mode, + void *priv) +{ + struct ircomm_tty_cb *self; + struct ircomm_tty_info info; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + /* Important note : + * We need to drop all passive discoveries. + * The LSAP management of IrComm is deficient and doesn't deal + * with the case of two instance connecting to each other + * simultaneously (it will deadlock in LMP). 
+ * The proper fix would be to use the same technique as in IrNET, + * to have one server socket and separate instances for the + * connecting/connected socket. + * The workaround is to drop passive discovery, which drastically + * reduce the probability of this happening. + * Jean II */ + if(mode == DISCOVERY_PASSIVE) + return; + + info.daddr = discovery->daddr; + info.saddr = discovery->saddr; + + self = (struct ircomm_tty_cb *) priv; + ircomm_tty_do_event(self, IRCOMM_TTY_DISCOVERY_INDICATION, + NULL, &info); +} + +/* + * Function ircomm_tty_disconnect_indication (instance, sap, reason, skb) + * + * Link disconnected + * + */ +void ircomm_tty_disconnect_indication(void *instance, void *sap, + LM_REASON reason, + struct sk_buff *skb) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + if (!self->tty) + return; + + /* This will stop control data transfers */ + self->flow = FLOW_STOP; + + /* Stop data transfers */ + self->tty->hw_stopped = 1; + + ircomm_tty_do_event(self, IRCOMM_TTY_DISCONNECT_INDICATION, NULL, + NULL); +} + +/* + * Function ircomm_tty_getvalue_confirm (result, obj_id, value, priv) + * + * Got result from the IAS query we make + * + */ +static void ircomm_tty_getvalue_confirm(int result, __u16 obj_id, + struct ias_value *value, + void *priv) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) priv; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + /* We probably don't need to make any more queries */ + iriap_close(self->iriap); + self->iriap = NULL; + + /* Check if request succeeded */ + if (result != IAS_SUCCESS) { + IRDA_DEBUG(4, "%s(), got NULL value!\n", __func__ ); + return; + } + + switch (value->type) { + case IAS_OCT_SEQ: + IRDA_DEBUG(2, "%s(), got octet sequence\n", __func__ ); + + irda_param_extract_all(self, value->t.oct_seq, value->len, + &ircomm_param_info); + + ircomm_tty_do_event(self, IRCOMM_TTY_GOT_PARAMETERS, NULL, + NULL); + break; + case IAS_INTEGER: + /* Got LSAP selector */ + IRDA_DEBUG(2, "%s(), got lsapsel = %d\n", __func__ , + value->t.integer); + + if (value->t.integer == -1) { + IRDA_DEBUG(0, "%s(), invalid value!\n", __func__ ); + } else + self->dlsap_sel = value->t.integer; + + ircomm_tty_do_event(self, IRCOMM_TTY_GOT_LSAPSEL, NULL, NULL); + break; + case IAS_MISSING: + IRDA_DEBUG(0, "%s(), got IAS_MISSING\n", __func__ ); + break; + default: + IRDA_DEBUG(0, "%s(), got unknown type!\n", __func__ ); + break; + } + irias_delete_value(value); +} + +/* + * Function ircomm_tty_connect_confirm (instance, sap, qos, max_sdu_size, skb) + * + * Connection confirmed + * + */ +void ircomm_tty_connect_confirm(void *instance, void *sap, + struct qos_info *qos, + __u32 max_data_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + self->client = TRUE; + self->max_data_size = max_data_size; + self->max_header_size = max_header_size; + self->flow = FLOW_START; + + ircomm_tty_do_event(self, IRCOMM_TTY_CONNECT_CONFIRM, NULL, NULL); + + /* No need to kfree_skb - see ircomm_ttp_connect_confirm() */ +} + +/* + * Function ircomm_tty_connect_indication (instance, sap, qos, max_sdu_size, + * skb) + * + * we are 
discovered and being requested to connect by remote device ! + * + */ +void ircomm_tty_connect_indication(void *instance, void *sap, + struct qos_info *qos, + __u32 max_data_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + int clen; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + self->client = FALSE; + self->max_data_size = max_data_size; + self->max_header_size = max_header_size; + self->flow = FLOW_START; + + clen = skb->data[0]; + if (clen) + irda_param_extract_all(self, skb->data+1, + IRDA_MIN(skb->len, clen), + &ircomm_param_info); + + ircomm_tty_do_event(self, IRCOMM_TTY_CONNECT_INDICATION, NULL, NULL); + + /* No need to kfree_skb - see ircomm_ttp_connect_indication() */ +} + +/* + * Function ircomm_tty_link_established (self) + * + * Called when the IrCOMM link is established + * + */ +void ircomm_tty_link_established(struct ircomm_tty_cb *self) +{ + IRDA_DEBUG(2, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + if (!self->tty) + return; + + del_timer(&self->watchdog_timer); + + /* + * IrCOMM link is now up, and if we are not using hardware + * flow-control, then declare the hardware as running. Otherwise we + * will have to wait for the peer device (DCE) to raise the CTS + * line. + */ + if ((self->flags & ASYNC_CTS_FLOW) && ((self->settings.dce & IRCOMM_CTS) == 0)) { + IRDA_DEBUG(0, "%s(), waiting for CTS ...\n", __func__ ); + return; + } else { + IRDA_DEBUG(1, "%s(), starting hardware!\n", __func__ ); + + self->tty->hw_stopped = 0; + + /* Wake up processes blocked on open */ + wake_up_interruptible(&self->open_wait); + } + + schedule_work(&self->tqueue); +} + +/* + * Function ircomm_tty_start_watchdog_timer (self, timeout) + * + * Start the watchdog timer. This timer is used to make sure that any + * connection attempt is successful, and if not, we will retry after + * the timeout + */ +static void ircomm_tty_start_watchdog_timer(struct ircomm_tty_cb *self, + int timeout) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + irda_start_timer(&self->watchdog_timer, timeout, (void *) self, + ircomm_tty_watchdog_timer_expired); +} + +/* + * Function ircomm_tty_watchdog_timer_expired (data) + * + * Called when the connect procedure have taken to much time. 
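+ *
+ * Every step of the connect procedure (discovery, the two IAS queries and
+ * the connect itself) re-arms this watchdog with a 3*HZ timeout. On expiry
+ * IRCOMM_TTY_WD_TIMER_EXPIRED is fed through ircomm_tty_do_event(), and the
+ * per-state handlers below typically drop back to the SEARCH state, or give
+ * up when we are already idle or searching.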
+ * + */ +static void ircomm_tty_watchdog_timer_expired(void *data) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) data; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + ircomm_tty_do_event(self, IRCOMM_TTY_WD_TIMER_EXPIRED, NULL, NULL); +} + + +/* + * Function ircomm_tty_do_event (self, event, skb) + * + * Process event + * + */ +int ircomm_tty_do_event(struct ircomm_tty_cb *self, IRCOMM_TTY_EVENT event, + struct sk_buff *skb, struct ircomm_tty_info *info) +{ + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + IRDA_DEBUG(2, "%s: state=%s, event=%s\n", __func__ , + ircomm_tty_state[self->state], ircomm_tty_event[event]); + + return (*state[self->state])(self, event, skb, info); +} + +/* + * Function ircomm_tty_next_state (self, state) + * + * Switch state + * + */ +static inline void ircomm_tty_next_state(struct ircomm_tty_cb *self, IRCOMM_TTY_STATE state) +{ + /* + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + IRDA_DEBUG(2, "%s: next state=%s, service type=%d\n", __func__ , + ircomm_tty_state[self->state], self->service_type); + */ + self->state = state; +} + +/* + * Function ircomm_tty_state_idle (self, event, skb, info) + * + * Just hanging around + * + */ +static int ircomm_tty_state_idle(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info) +{ + int ret = 0; + + IRDA_DEBUG(2, "%s: state=%s, event=%s\n", __func__ , + ircomm_tty_state[self->state], ircomm_tty_event[event]); + switch (event) { + case IRCOMM_TTY_ATTACH_CABLE: + /* Try to discover any remote devices */ + ircomm_tty_start_watchdog_timer(self, 3*HZ); + ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH); + + irlmp_discovery_request(DISCOVERY_DEFAULT_SLOTS); + break; + case IRCOMM_TTY_DISCOVERY_INDICATION: + self->daddr = info->daddr; + self->saddr = info->saddr; + + if (self->iriap) { + IRDA_WARNING("%s(), busy with a previous query\n", + __func__); + return -EBUSY; + } + + self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self, + ircomm_tty_getvalue_confirm); + + iriap_getvaluebyclass_request(self->iriap, + self->saddr, self->daddr, + "IrDA:IrCOMM", "Parameters"); + + ircomm_tty_start_watchdog_timer(self, 3*HZ); + ircomm_tty_next_state(self, IRCOMM_TTY_QUERY_PARAMETERS); + break; + case IRCOMM_TTY_CONNECT_INDICATION: + del_timer(&self->watchdog_timer); + + /* Accept connection */ + ircomm_connect_response(self->ircomm, NULL); + ircomm_tty_next_state(self, IRCOMM_TTY_READY); + break; + case IRCOMM_TTY_WD_TIMER_EXPIRED: + /* Just stay idle */ + break; + case IRCOMM_TTY_DETACH_CABLE: + ircomm_tty_next_state(self, IRCOMM_TTY_IDLE); + break; + default: + IRDA_DEBUG(2, "%s(), unknown event: %s\n", __func__ , + ircomm_tty_event[event]); + ret = -EINVAL; + } + return ret; +} + +/* + * Function ircomm_tty_state_search (self, event, skb, info) + * + * Trying to discover an IrCOMM device + * + */ +static int ircomm_tty_state_search(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info) +{ + int ret = 0; + + IRDA_DEBUG(2, "%s: state=%s, event=%s\n", __func__ , + ircomm_tty_state[self->state], ircomm_tty_event[event]); + + switch (event) { + case IRCOMM_TTY_DISCOVERY_INDICATION: + self->daddr = info->daddr; + self->saddr = info->saddr; + + if (self->iriap) { + IRDA_WARNING("%s(), busy with a previous query\n", + __func__); + return 
-EBUSY; + } + + self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self, + ircomm_tty_getvalue_confirm); + + if (self->service_type == IRCOMM_3_WIRE_RAW) { + iriap_getvaluebyclass_request(self->iriap, self->saddr, + self->daddr, "IrLPT", + "IrDA:IrLMP:LsapSel"); + ircomm_tty_next_state(self, IRCOMM_TTY_QUERY_LSAP_SEL); + } else { + iriap_getvaluebyclass_request(self->iriap, self->saddr, + self->daddr, + "IrDA:IrCOMM", + "Parameters"); + + ircomm_tty_next_state(self, IRCOMM_TTY_QUERY_PARAMETERS); + } + ircomm_tty_start_watchdog_timer(self, 3*HZ); + break; + case IRCOMM_TTY_CONNECT_INDICATION: + del_timer(&self->watchdog_timer); + ircomm_tty_ias_unregister(self); + + /* Accept connection */ + ircomm_connect_response(self->ircomm, NULL); + ircomm_tty_next_state(self, IRCOMM_TTY_READY); + break; + case IRCOMM_TTY_WD_TIMER_EXPIRED: +#if 1 + /* Give up */ +#else + /* Try to discover any remote devices */ + ircomm_tty_start_watchdog_timer(self, 3*HZ); + irlmp_discovery_request(DISCOVERY_DEFAULT_SLOTS); +#endif + break; + case IRCOMM_TTY_DETACH_CABLE: + ircomm_tty_next_state(self, IRCOMM_TTY_IDLE); + break; + default: + IRDA_DEBUG(2, "%s(), unknown event: %s\n", __func__ , + ircomm_tty_event[event]); + ret = -EINVAL; + } + return ret; +} + +/* + * Function ircomm_tty_state_query (self, event, skb, info) + * + * Querying the remote LM-IAS for IrCOMM parameters + * + */ +static int ircomm_tty_state_query_parameters(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info) +{ + int ret = 0; + + IRDA_DEBUG(2, "%s: state=%s, event=%s\n", __func__ , + ircomm_tty_state[self->state], ircomm_tty_event[event]); + + switch (event) { + case IRCOMM_TTY_GOT_PARAMETERS: + if (self->iriap) { + IRDA_WARNING("%s(), busy with a previous query\n", + __func__); + return -EBUSY; + } + + self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self, + ircomm_tty_getvalue_confirm); + + iriap_getvaluebyclass_request(self->iriap, self->saddr, + self->daddr, "IrDA:IrCOMM", + "IrDA:TinyTP:LsapSel"); + + ircomm_tty_start_watchdog_timer(self, 3*HZ); + ircomm_tty_next_state(self, IRCOMM_TTY_QUERY_LSAP_SEL); + break; + case IRCOMM_TTY_WD_TIMER_EXPIRED: + /* Go back to search mode */ + ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH); + ircomm_tty_start_watchdog_timer(self, 3*HZ); + break; + case IRCOMM_TTY_CONNECT_INDICATION: + del_timer(&self->watchdog_timer); + ircomm_tty_ias_unregister(self); + + /* Accept connection */ + ircomm_connect_response(self->ircomm, NULL); + ircomm_tty_next_state(self, IRCOMM_TTY_READY); + break; + case IRCOMM_TTY_DETACH_CABLE: + ircomm_tty_next_state(self, IRCOMM_TTY_IDLE); + break; + default: + IRDA_DEBUG(2, "%s(), unknown event: %s\n", __func__ , + ircomm_tty_event[event]); + ret = -EINVAL; + } + return ret; +} + +/* + * Function ircomm_tty_state_query_lsap_sel (self, event, skb, info) + * + * Query remote LM-IAS for the LSAP selector which we can connect to + * + */ +static int ircomm_tty_state_query_lsap_sel(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info) +{ + int ret = 0; + + IRDA_DEBUG(2, "%s: state=%s, event=%s\n", __func__ , + ircomm_tty_state[self->state], ircomm_tty_event[event]); + + switch (event) { + case IRCOMM_TTY_GOT_LSAPSEL: + /* Connect to remote device */ + ret = ircomm_connect_request(self->ircomm, self->dlsap_sel, + self->saddr, self->daddr, + NULL, self->service_type); + ircomm_tty_start_watchdog_timer(self, 3*HZ); + ircomm_tty_next_state(self, IRCOMM_TTY_SETUP); + break; + case 
IRCOMM_TTY_WD_TIMER_EXPIRED: + /* Go back to search mode */ + ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH); + ircomm_tty_start_watchdog_timer(self, 3*HZ); + break; + case IRCOMM_TTY_CONNECT_INDICATION: + del_timer(&self->watchdog_timer); + ircomm_tty_ias_unregister(self); + + /* Accept connection */ + ircomm_connect_response(self->ircomm, NULL); + ircomm_tty_next_state(self, IRCOMM_TTY_READY); + break; + case IRCOMM_TTY_DETACH_CABLE: + ircomm_tty_next_state(self, IRCOMM_TTY_IDLE); + break; + default: + IRDA_DEBUG(2, "%s(), unknown event: %s\n", __func__ , + ircomm_tty_event[event]); + ret = -EINVAL; + } + return ret; +} + +/* + * Function ircomm_tty_state_setup (self, event, skb, info) + * + * Trying to connect + * + */ +static int ircomm_tty_state_setup(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info) +{ + int ret = 0; + + IRDA_DEBUG(2, "%s: state=%s, event=%s\n", __func__ , + ircomm_tty_state[self->state], ircomm_tty_event[event]); + + switch (event) { + case IRCOMM_TTY_CONNECT_CONFIRM: + del_timer(&self->watchdog_timer); + ircomm_tty_ias_unregister(self); + + /* + * Send initial parameters. This will also send out queued + * parameters waiting for the connection to come up + */ + ircomm_tty_send_initial_parameters(self); + ircomm_tty_link_established(self); + ircomm_tty_next_state(self, IRCOMM_TTY_READY); + break; + case IRCOMM_TTY_CONNECT_INDICATION: + del_timer(&self->watchdog_timer); + ircomm_tty_ias_unregister(self); + + /* Accept connection */ + ircomm_connect_response(self->ircomm, NULL); + ircomm_tty_next_state(self, IRCOMM_TTY_READY); + break; + case IRCOMM_TTY_WD_TIMER_EXPIRED: + /* Go back to search mode */ + ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH); + ircomm_tty_start_watchdog_timer(self, 3*HZ); + break; + case IRCOMM_TTY_DETACH_CABLE: + /* ircomm_disconnect_request(self->ircomm, NULL); */ + ircomm_tty_next_state(self, IRCOMM_TTY_IDLE); + break; + default: + IRDA_DEBUG(2, "%s(), unknown event: %s\n", __func__ , + ircomm_tty_event[event]); + ret = -EINVAL; + } + return ret; +} + +/* + * Function ircomm_tty_state_ready (self, event, skb, info) + * + * IrCOMM is now connected + * + */ +static int ircomm_tty_state_ready(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info) +{ + int ret = 0; + + switch (event) { + case IRCOMM_TTY_DATA_REQUEST: + ret = ircomm_data_request(self->ircomm, skb); + break; + case IRCOMM_TTY_DETACH_CABLE: + ircomm_disconnect_request(self->ircomm, NULL); + ircomm_tty_next_state(self, IRCOMM_TTY_IDLE); + break; + case IRCOMM_TTY_DISCONNECT_INDICATION: + ircomm_tty_ias_register(self); + ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH); + ircomm_tty_start_watchdog_timer(self, 3*HZ); + + if (self->flags & ASYNC_CHECK_CD) { + /* Drop carrier */ + self->settings.dce = IRCOMM_DELTA_CD; + ircomm_tty_check_modem_status(self); + } else { + IRDA_DEBUG(0, "%s(), hanging up!\n", __func__ ); + if (self->tty) + tty_hangup(self->tty); + } + break; + default: + IRDA_DEBUG(2, "%s(), unknown event: %s\n", __func__ , + ircomm_tty_event[event]); + ret = -EINVAL; + } + return ret; +} + diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c new file mode 100644 index 00000000..77c5e649 --- /dev/null +++ b/net/irda/ircomm/ircomm_tty_ioctl.c @@ -0,0 +1,427 @@ +/********************************************************************* + * + * Filename: ircomm_tty_ioctl.c + * Version: + * Description: + * Status: Experimental. 
+ * Author: Dag Brattli + * Created at: Thu Jun 10 14:39:09 1999 + * Modified at: Wed Jan 5 14:45:43 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include + +#define RELEVANT_IFLAG(iflag) (iflag & (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK)) + +/* + * Function ircomm_tty_change_speed (driver) + * + * Change speed of the driver. If the remote device is a DCE, then this + * should make it change the speed of its serial port + */ +static void ircomm_tty_change_speed(struct ircomm_tty_cb *self) +{ + unsigned cflag, cval; + int baud; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + if (!self->tty || !self->tty->termios || !self->ircomm) + return; + + cflag = self->tty->termios->c_cflag; + + /* byte size and parity */ + switch (cflag & CSIZE) { + case CS5: cval = IRCOMM_WSIZE_5; break; + case CS6: cval = IRCOMM_WSIZE_6; break; + case CS7: cval = IRCOMM_WSIZE_7; break; + case CS8: cval = IRCOMM_WSIZE_8; break; + default: cval = IRCOMM_WSIZE_5; break; + } + if (cflag & CSTOPB) + cval |= IRCOMM_2_STOP_BIT; + + if (cflag & PARENB) + cval |= IRCOMM_PARITY_ENABLE; + if (!(cflag & PARODD)) + cval |= IRCOMM_PARITY_EVEN; + + /* Determine divisor based on baud rate */ + baud = tty_get_baud_rate(self->tty); + if (!baud) + baud = 9600; /* B0 transition handled in rs_set_termios */ + + self->settings.data_rate = baud; + ircomm_param_request(self, IRCOMM_DATA_RATE, FALSE); + + /* CTS flow control flag and modem status interrupts */ + if (cflag & CRTSCTS) { + self->flags |= ASYNC_CTS_FLOW; + self->settings.flow_control |= IRCOMM_RTS_CTS_IN; + /* This got me. Bummer. Jean II */ + if (self->service_type == IRCOMM_3_WIRE_RAW) + IRDA_WARNING("%s(), enabling RTS/CTS on link that doesn't support it (3-wire-raw)\n", __func__); + } else { + self->flags &= ~ASYNC_CTS_FLOW; + self->settings.flow_control &= ~IRCOMM_RTS_CTS_IN; + } + if (cflag & CLOCAL) + self->flags &= ~ASYNC_CHECK_CD; + else + self->flags |= ASYNC_CHECK_CD; +#if 0 + /* + * Set up parity check flag + */ + + if (I_INPCK(self->tty)) + driver->read_status_mask |= LSR_FE | LSR_PE; + if (I_BRKINT(driver->tty) || I_PARMRK(driver->tty)) + driver->read_status_mask |= LSR_BI; + + /* + * Characters to ignore + */ + driver->ignore_status_mask = 0; + if (I_IGNPAR(driver->tty)) + driver->ignore_status_mask |= LSR_PE | LSR_FE; + + if (I_IGNBRK(self->tty)) { + self->ignore_status_mask |= LSR_BI; + /* + * If we're ignore parity and break indicators, ignore + * overruns too. (For real raw support). 
+ */ + if (I_IGNPAR(self->tty)) + self->ignore_status_mask |= LSR_OE; + } +#endif + self->settings.data_format = cval; + + ircomm_param_request(self, IRCOMM_DATA_FORMAT, FALSE); + ircomm_param_request(self, IRCOMM_FLOW_CONTROL, TRUE); +} + +/* + * Function ircomm_tty_set_termios (tty, old_termios) + * + * This routine allows the tty driver to be notified when device's + * termios settings have changed. Note that a well-designed tty driver + * should be prepared to accept the case where old == NULL, and try to + * do something rational. + */ +void ircomm_tty_set_termios(struct tty_struct *tty, + struct ktermios *old_termios) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + unsigned int cflag = tty->termios->c_cflag; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + if ((cflag == old_termios->c_cflag) && + (RELEVANT_IFLAG(tty->termios->c_iflag) == + RELEVANT_IFLAG(old_termios->c_iflag))) + { + return; + } + + ircomm_tty_change_speed(self); + + /* Handle transition to B0 status */ + if ((old_termios->c_cflag & CBAUD) && + !(cflag & CBAUD)) { + self->settings.dte &= ~(IRCOMM_DTR|IRCOMM_RTS); + ircomm_param_request(self, IRCOMM_DTE, TRUE); + } + + /* Handle transition away from B0 status */ + if (!(old_termios->c_cflag & CBAUD) && + (cflag & CBAUD)) { + self->settings.dte |= IRCOMM_DTR; + if (!(tty->termios->c_cflag & CRTSCTS) || + !test_bit(TTY_THROTTLED, &tty->flags)) { + self->settings.dte |= IRCOMM_RTS; + } + ircomm_param_request(self, IRCOMM_DTE, TRUE); + } + + /* Handle turning off CRTSCTS */ + if ((old_termios->c_cflag & CRTSCTS) && + !(tty->termios->c_cflag & CRTSCTS)) + { + tty->hw_stopped = 0; + ircomm_tty_start(tty); + } +} + +/* + * Function ircomm_tty_tiocmget (tty) + * + * + * + */ +int ircomm_tty_tiocmget(struct tty_struct *tty) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + unsigned int result; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + if (tty->flags & (1 << TTY_IO_ERROR)) + return -EIO; + + result = ((self->settings.dte & IRCOMM_RTS) ? TIOCM_RTS : 0) + | ((self->settings.dte & IRCOMM_DTR) ? TIOCM_DTR : 0) + | ((self->settings.dce & IRCOMM_CD) ? TIOCM_CAR : 0) + | ((self->settings.dce & IRCOMM_RI) ? TIOCM_RNG : 0) + | ((self->settings.dce & IRCOMM_DSR) ? TIOCM_DSR : 0) + | ((self->settings.dce & IRCOMM_CTS) ? 
TIOCM_CTS : 0); + return result; +} + +/* + * Function ircomm_tty_tiocmset (tty, set, clear) + * + * + * + */ +int ircomm_tty_tiocmset(struct tty_struct *tty, + unsigned int set, unsigned int clear) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + if (tty->flags & (1 << TTY_IO_ERROR)) + return -EIO; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + if (set & TIOCM_RTS) + self->settings.dte |= IRCOMM_RTS; + if (set & TIOCM_DTR) + self->settings.dte |= IRCOMM_DTR; + + if (clear & TIOCM_RTS) + self->settings.dte &= ~IRCOMM_RTS; + if (clear & TIOCM_DTR) + self->settings.dte &= ~IRCOMM_DTR; + + if ((set|clear) & TIOCM_RTS) + self->settings.dte |= IRCOMM_DELTA_RTS; + if ((set|clear) & TIOCM_DTR) + self->settings.dte |= IRCOMM_DELTA_DTR; + + ircomm_param_request(self, IRCOMM_DTE, TRUE); + + return 0; +} + +/* + * Function get_serial_info (driver, retinfo) + * + * + * + */ +static int ircomm_tty_get_serial_info(struct ircomm_tty_cb *self, + struct serial_struct __user *retinfo) +{ + struct serial_struct info; + + if (!retinfo) + return -EFAULT; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + memset(&info, 0, sizeof(info)); + info.line = self->line; + info.flags = self->flags; + info.baud_base = self->settings.data_rate; + info.close_delay = self->close_delay; + info.closing_wait = self->closing_wait; + + /* For compatibility */ + info.type = PORT_16550A; + info.port = 0; + info.irq = 0; + info.xmit_fifo_size = 0; + info.hub6 = 0; + info.custom_divisor = 0; + + if (copy_to_user(retinfo, &info, sizeof(*retinfo))) + return -EFAULT; + + return 0; +} + +/* + * Function set_serial_info (driver, new_info) + * + * + * + */ +static int ircomm_tty_set_serial_info(struct ircomm_tty_cb *self, + struct serial_struct __user *new_info) +{ +#if 0 + struct serial_struct new_serial; + struct ircomm_tty_cb old_state, *state; + + IRDA_DEBUG(0, "%s()\n", __func__ ); + + if (copy_from_user(&new_serial,new_info,sizeof(new_serial))) + return -EFAULT; + + + state = self + old_state = *self; + + if (!capable(CAP_SYS_ADMIN)) { + if ((new_serial.baud_base != state->settings.data_rate) || + (new_serial.close_delay != state->close_delay) || + ((new_serial.flags & ~ASYNC_USR_MASK) != + (self->flags & ~ASYNC_USR_MASK))) + return -EPERM; + state->flags = ((state->flags & ~ASYNC_USR_MASK) | + (new_serial.flags & ASYNC_USR_MASK)); + self->flags = ((self->flags & ~ASYNC_USR_MASK) | + (new_serial.flags & ASYNC_USR_MASK)); + /* self->custom_divisor = new_serial.custom_divisor; */ + goto check_and_exit; + } + + /* + * OK, past this point, all the error checking has been done. + * At this point, we start making changes..... + */ + + if (self->settings.data_rate != new_serial.baud_base) { + self->settings.data_rate = new_serial.baud_base; + ircomm_param_request(self, IRCOMM_DATA_RATE, TRUE); + } + + self->close_delay = new_serial.close_delay * HZ/100; + self->closing_wait = new_serial.closing_wait * HZ/100; + /* self->custom_divisor = new_serial.custom_divisor; */ + + self->flags = ((self->flags & ~ASYNC_FLAGS) | + (new_serial.flags & ASYNC_FLAGS)); + self->tty->low_latency = (self->flags & ASYNC_LOW_LATENCY) ? 
1 : 0; + + check_and_exit: + + if (self->flags & ASYNC_INITIALIZED) { + if (((old_state.flags & ASYNC_SPD_MASK) != + (self->flags & ASYNC_SPD_MASK)) || + (old_driver.custom_divisor != driver->custom_divisor)) { + if ((driver->flags & ASYNC_SPD_MASK) == ASYNC_SPD_HI) + driver->tty->alt_speed = 57600; + if ((driver->flags & ASYNC_SPD_MASK) == ASYNC_SPD_VHI) + driver->tty->alt_speed = 115200; + if ((driver->flags & ASYNC_SPD_MASK) == ASYNC_SPD_SHI) + driver->tty->alt_speed = 230400; + if ((driver->flags & ASYNC_SPD_MASK) == ASYNC_SPD_WARP) + driver->tty->alt_speed = 460800; + ircomm_tty_change_speed(driver); + } + } +#endif + return 0; +} + +/* + * Function ircomm_tty_ioctl (tty, cmd, arg) + * + * + * + */ +int ircomm_tty_ioctl(struct tty_struct *tty, + unsigned int cmd, unsigned long arg) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + int ret = 0; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) && + (cmd != TIOCSERCONFIG) && (cmd != TIOCSERGSTRUCT) && + (cmd != TIOCMIWAIT) && (cmd != TIOCGICOUNT)) { + if (tty->flags & (1 << TTY_IO_ERROR)) + return -EIO; + } + + switch (cmd) { + case TIOCGSERIAL: + ret = ircomm_tty_get_serial_info(self, (struct serial_struct __user *) arg); + break; + case TIOCSSERIAL: + ret = ircomm_tty_set_serial_info(self, (struct serial_struct __user *) arg); + break; + case TIOCMIWAIT: + IRDA_DEBUG(0, "(), TIOCMIWAIT, not impl!\n"); + break; + + case TIOCGICOUNT: + IRDA_DEBUG(0, "%s(), TIOCGICOUNT not impl!\n", __func__ ); +#if 0 + save_flags(flags); cli(); + cnow = driver->icount; + restore_flags(flags); + p_cuser = (struct serial_icounter_struct __user *) arg; + if (put_user(cnow.cts, &p_cuser->cts) || + put_user(cnow.dsr, &p_cuser->dsr) || + put_user(cnow.rng, &p_cuser->rng) || + put_user(cnow.dcd, &p_cuser->dcd) || + put_user(cnow.rx, &p_cuser->rx) || + put_user(cnow.tx, &p_cuser->tx) || + put_user(cnow.frame, &p_cuser->frame) || + put_user(cnow.overrun, &p_cuser->overrun) || + put_user(cnow.parity, &p_cuser->parity) || + put_user(cnow.brk, &p_cuser->brk) || + put_user(cnow.buf_overrun, &p_cuser->buf_overrun)) + return -EFAULT; +#endif + return 0; + default: + ret = -ENOIOCTLCMD; /* ioctls which we must ignore */ + } + return ret; +} + + + diff --git a/net/irda/irda_device.c b/net/irda/irda_device.c new file mode 100644 index 00000000..25cc2e69 --- /dev/null +++ b/net/irda/irda_device.c @@ -0,0 +1,324 @@ +/********************************************************************* + * + * Filename: irda_device.c + * Version: 0.9 + * Description: Utility functions used by the device drivers + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sat Oct 9 09:22:27 1999 + * Modified at: Sun Jan 23 17:41:24 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved. + * Copyright (c) 2000-2001 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +static void __irda_task_delete(struct irda_task *task); + +static hashbin_t *dongles = NULL; +static hashbin_t *tasks = NULL; + +static void irda_task_timer_expired(void *data); + +int __init irda_device_init( void) +{ + dongles = hashbin_new(HB_NOLOCK); + if (dongles == NULL) { + IRDA_WARNING("IrDA: Can't allocate dongles hashbin!\n"); + return -ENOMEM; + } + spin_lock_init(&dongles->hb_spinlock); + + tasks = hashbin_new(HB_LOCK); + if (tasks == NULL) { + IRDA_WARNING("IrDA: Can't allocate tasks hashbin!\n"); + hashbin_delete(dongles, NULL); + return -ENOMEM; + } + + /* We no longer initialise the driver ourselves here, we let + * the system do it for us... - Jean II */ + + return 0; +} + +static void leftover_dongle(void *arg) +{ + struct dongle_reg *reg = arg; + IRDA_WARNING("IrDA: Dongle type %x not unregistered\n", + reg->type); +} + +void irda_device_cleanup(void) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + + hashbin_delete(tasks, (FREE_FUNC) __irda_task_delete); + + hashbin_delete(dongles, leftover_dongle); +} + +/* + * Function irda_device_set_media_busy (self, status) + * + * Called when we have detected that another station is transmitting + * in contention mode. + */ +void irda_device_set_media_busy(struct net_device *dev, int status) +{ + struct irlap_cb *self; + + IRDA_DEBUG(4, "%s(%s)\n", __func__, status ? "TRUE" : "FALSE"); + + self = (struct irlap_cb *) dev->atalk_ptr; + + /* Some drivers may enable the receive interrupt before calling + * irlap_open(), or they may disable the receive interrupt + * after calling irlap_close(). + * The IrDA stack is protected from this in irlap_driver_rcv(). + * However, the driver calls directly the wrapper, that calls + * us directly. Make sure we protect ourselves. + * Jean II */ + if (!self || self->magic != LAP_MAGIC) + return; + + if (status) { + self->media_busy = TRUE; + if (status == SMALL) + irlap_start_mbusy_timer(self, SMALLBUSY_TIMEOUT); + else + irlap_start_mbusy_timer(self, MEDIABUSY_TIMEOUT); + IRDA_DEBUG( 4, "Media busy!\n"); + } else { + self->media_busy = FALSE; + irlap_stop_mbusy_timer(self); + } +} +EXPORT_SYMBOL(irda_device_set_media_busy); + + +/* + * Function irda_device_is_receiving (dev) + * + * Check if the device driver is currently receiving data + * + */ +int irda_device_is_receiving(struct net_device *dev) +{ + struct if_irda_req req; + int ret; + + IRDA_DEBUG(2, "%s()\n", __func__); + + if (!dev->netdev_ops->ndo_do_ioctl) { + IRDA_ERROR("%s: do_ioctl not impl. 
by device driver\n", + __func__); + return -1; + } + + ret = (dev->netdev_ops->ndo_do_ioctl)(dev, (struct ifreq *) &req, + SIOCGRECEIVING); + if (ret < 0) + return ret; + + return req.ifr_receiving; +} + +static void __irda_task_delete(struct irda_task *task) +{ + del_timer(&task->timer); + + kfree(task); +} + +static void irda_task_delete(struct irda_task *task) +{ + /* Unregister task */ + hashbin_remove(tasks, (long) task, NULL); + + __irda_task_delete(task); +} + +/* + * Function irda_task_kick (task) + * + * Tries to execute a task possible multiple times until the task is either + * finished, or askes for a timeout. When a task is finished, we do post + * processing, and notify the parent task, that is waiting for this task + * to complete. + */ +static int irda_task_kick(struct irda_task *task) +{ + int finished = TRUE; + int count = 0; + int timeout; + + IRDA_DEBUG(2, "%s()\n", __func__); + + IRDA_ASSERT(task != NULL, return -1;); + IRDA_ASSERT(task->magic == IRDA_TASK_MAGIC, return -1;); + + /* Execute task until it's finished, or askes for a timeout */ + do { + timeout = task->function(task); + if (count++ > 100) { + IRDA_ERROR("%s: error in task handler!\n", + __func__); + irda_task_delete(task); + return TRUE; + } + } while ((timeout == 0) && (task->state != IRDA_TASK_DONE)); + + if (timeout < 0) { + IRDA_ERROR("%s: Error executing task!\n", __func__); + irda_task_delete(task); + return TRUE; + } + + /* Check if we are finished */ + if (task->state == IRDA_TASK_DONE) { + del_timer(&task->timer); + + /* Do post processing */ + if (task->finished) + task->finished(task); + + /* Notify parent */ + if (task->parent) { + /* Check if parent is waiting for us to complete */ + if (task->parent->state == IRDA_TASK_CHILD_WAIT) { + task->parent->state = IRDA_TASK_CHILD_DONE; + + /* Stop timer now that we are here */ + del_timer(&task->parent->timer); + + /* Kick parent task */ + irda_task_kick(task->parent); + } + } + irda_task_delete(task); + } else if (timeout > 0) { + irda_start_timer(&task->timer, timeout, (void *) task, + irda_task_timer_expired); + finished = FALSE; + } else { + IRDA_DEBUG(0, "%s(), not finished, and no timeout!\n", + __func__); + finished = FALSE; + } + + return finished; +} + +/* + * Function irda_task_timer_expired (data) + * + * Task time has expired. We now try to execute task (again), and restart + * the timer if the task has not finished yet + */ +static void irda_task_timer_expired(void *data) +{ + struct irda_task *task; + + IRDA_DEBUG(2, "%s()\n", __func__); + + task = (struct irda_task *) data; + + irda_task_kick(task); +} + +/* + * Function irda_device_setup (dev) + * + * This function should be used by low level device drivers in a similar way + * as ether_setup() is used by normal network device drivers + */ +static void irda_device_setup(struct net_device *dev) +{ + dev->hard_header_len = 0; + dev->addr_len = LAP_ALEN; + + dev->type = ARPHRD_IRDA; + dev->tx_queue_len = 8; /* Window size + 1 s-frame */ + + memset(dev->broadcast, 0xff, LAP_ALEN); + + dev->mtu = 2048; + dev->flags = IFF_NOARP; +} + +/* + * Funciton alloc_irdadev + * Allocates and sets up an IRDA device in a manner similar to + * alloc_etherdev. + */ +struct net_device *alloc_irdadev(int sizeof_priv) +{ + return alloc_netdev(sizeof_priv, "irda%d", irda_device_setup); +} +EXPORT_SYMBOL(alloc_irdadev); + +#ifdef CONFIG_ISA_DMA_API +/* + * Function setup_dma (idev, buffer, count, mode) + * + * Setup the DMA channel. 
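+ *
+ * A FIR driver typically calls this from its transmit or receive path
+ * once the frame sits in a DMA-safe buffer, for instance (the
+ * self->io / self->tx_buff fields and DMA_TX_MODE below are
+ * driver-side names, shown only for illustration):
+ *
+ *	irda_setup_dma(self->io.dma, self->tx_buff_dma,
+ *		       self->tx_buff.len, DMA_TX_MODE);
+ *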
Commonly used by LPC FIR drivers + * + */ +void irda_setup_dma(int channel, dma_addr_t buffer, int count, int mode) +{ + unsigned long flags; + + flags = claim_dma_lock(); + + disable_dma(channel); + clear_dma_ff(channel); + set_dma_mode(channel, mode); + set_dma_addr(channel, buffer); + set_dma_count(channel, count); + enable_dma(channel); + + release_dma_lock(flags); +} +EXPORT_SYMBOL(irda_setup_dma); +#endif diff --git a/net/irda/iriap.c b/net/irda/iriap.c new file mode 100644 index 00000000..f876eed7 --- /dev/null +++ b/net/irda/iriap.c @@ -0,0 +1,1104 @@ +/********************************************************************* + * + * Filename: iriap.c + * Version: 0.8 + * Description: Information Access Protocol (IAP) + * Status: Experimental. + * Author: Dag Brattli + * Created at: Thu Aug 21 00:02:07 1997 + * Modified at: Sat Dec 25 16:42:42 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_IRDA_DEBUG +/* FIXME: This one should go in irlmp.c */ +static const char *const ias_charset_types[] = { + "CS_ASCII", + "CS_ISO_8859_1", + "CS_ISO_8859_2", + "CS_ISO_8859_3", + "CS_ISO_8859_4", + "CS_ISO_8859_5", + "CS_ISO_8859_6", + "CS_ISO_8859_7", + "CS_ISO_8859_8", + "CS_ISO_8859_9", + "CS_UNICODE" +}; +#endif /* CONFIG_IRDA_DEBUG */ + +static hashbin_t *iriap = NULL; +static void *service_handle; + +static void __iriap_close(struct iriap_cb *self); +static int iriap_register_lsap(struct iriap_cb *self, __u8 slsap_sel, int mode); +static void iriap_disconnect_indication(void *instance, void *sap, + LM_REASON reason, struct sk_buff *skb); +static void iriap_connect_indication(void *instance, void *sap, + struct qos_info *qos, __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb); +static void iriap_connect_confirm(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, __u8 max_header_size, + struct sk_buff *skb); +static int iriap_data_indication(void *instance, void *sap, + struct sk_buff *skb); + +static void iriap_watchdog_timer_expired(void *data); + +static inline void iriap_start_watchdog_timer(struct iriap_cb *self, + int timeout) +{ + irda_start_timer(&self->watchdog_timer, timeout, self, + iriap_watchdog_timer_expired); +} + +static struct lock_class_key irias_objects_key; + +/* + * Function iriap_init (void) + * + * Initializes the IrIAP layer, called by the module initialization code + * in irmod.c + */ +int __init iriap_init(void) +{ + struct ias_object *obj; + struct iriap_cb *server; + __u8 oct_seq[6]; + __u16 hints; + + /* Allocate master array */ + iriap = hashbin_new(HB_LOCK); + if (!iriap) + return -ENOMEM; + + /* Object repository - defined in irias_object.c */ + irias_objects = hashbin_new(HB_LOCK); + if (!irias_objects) { + IRDA_WARNING("%s: Can't allocate 
irias_objects hashbin!\n", + __func__); + hashbin_delete(iriap, NULL); + return -ENOMEM; + } + + lockdep_set_class_and_name(&irias_objects->hb_spinlock, &irias_objects_key, + "irias_objects"); + + /* + * Register some default services for IrLMP + */ + hints = irlmp_service_to_hint(S_COMPUTER); + service_handle = irlmp_register_service(hints); + + /* Register the Device object with LM-IAS */ + obj = irias_new_object("Device", IAS_DEVICE_ID); + irias_add_string_attrib(obj, "DeviceName", "Linux", IAS_KERNEL_ATTR); + + oct_seq[0] = 0x01; /* Version 1 */ + oct_seq[1] = 0x00; /* IAS support bits */ + oct_seq[2] = 0x00; /* LM-MUX support bits */ +#ifdef CONFIG_IRDA_ULTRA + oct_seq[2] |= 0x04; /* Connectionless Data support */ +#endif + irias_add_octseq_attrib(obj, "IrLMPSupport", oct_seq, 3, + IAS_KERNEL_ATTR); + irias_insert_object(obj); + + /* + * Register server support with IrLMP so we can accept incoming + * connections + */ + server = iriap_open(LSAP_IAS, IAS_SERVER, NULL, NULL); + if (!server) { + IRDA_DEBUG(0, "%s(), unable to open server\n", __func__); + return -1; + } + iriap_register_lsap(server, LSAP_IAS, IAS_SERVER); + + return 0; +} + +/* + * Function iriap_cleanup (void) + * + * Initializes the IrIAP layer, called by the module cleanup code in + * irmod.c + */ +void iriap_cleanup(void) +{ + irlmp_unregister_service(service_handle); + + hashbin_delete(iriap, (FREE_FUNC) __iriap_close); + hashbin_delete(irias_objects, (FREE_FUNC) __irias_delete_object); +} + +/* + * Function iriap_open (void) + * + * Opens an instance of the IrIAP layer, and registers with IrLMP + */ +struct iriap_cb *iriap_open(__u8 slsap_sel, int mode, void *priv, + CONFIRM_CALLBACK callback) +{ + struct iriap_cb *self; + + IRDA_DEBUG(2, "%s()\n", __func__); + + self = kzalloc(sizeof(*self), GFP_ATOMIC); + if (!self) { + IRDA_WARNING("%s: Unable to kmalloc!\n", __func__); + return NULL; + } + + /* + * Initialize instance + */ + + self->magic = IAS_MAGIC; + self->mode = mode; + if (mode == IAS_CLIENT) + iriap_register_lsap(self, slsap_sel, mode); + + self->confirm = callback; + self->priv = priv; + + /* iriap_getvaluebyclass_request() will construct packets before + * we connect, so this must have a sane value... 
Jean II */ + self->max_header_size = LMP_MAX_HEADER; + + init_timer(&self->watchdog_timer); + + hashbin_insert(iriap, (irda_queue_t *) self, (long) self, NULL); + + /* Initialize state machines */ + iriap_next_client_state(self, S_DISCONNECT); + iriap_next_call_state(self, S_MAKE_CALL); + iriap_next_server_state(self, R_DISCONNECT); + iriap_next_r_connect_state(self, R_WAITING); + + return self; +} +EXPORT_SYMBOL(iriap_open); + +/* + * Function __iriap_close (self) + * + * Removes (deallocates) the IrIAP instance + * + */ +static void __iriap_close(struct iriap_cb *self) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + del_timer(&self->watchdog_timer); + + if (self->request_skb) + dev_kfree_skb(self->request_skb); + + self->magic = 0; + + kfree(self); +} + +/* + * Function iriap_close (void) + * + * Closes IrIAP and deregisters with IrLMP + */ +void iriap_close(struct iriap_cb *self) +{ + struct iriap_cb *entry; + + IRDA_DEBUG(2, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + if (self->lsap) { + irlmp_close_lsap(self->lsap); + self->lsap = NULL; + } + + entry = (struct iriap_cb *) hashbin_remove(iriap, (long) self, NULL); + IRDA_ASSERT(entry == self, return;); + + __iriap_close(self); +} +EXPORT_SYMBOL(iriap_close); + +static int iriap_register_lsap(struct iriap_cb *self, __u8 slsap_sel, int mode) +{ + notify_t notify; + + IRDA_DEBUG(2, "%s()\n", __func__); + + irda_notify_init(¬ify); + notify.connect_confirm = iriap_connect_confirm; + notify.connect_indication = iriap_connect_indication; + notify.disconnect_indication = iriap_disconnect_indication; + notify.data_indication = iriap_data_indication; + notify.instance = self; + if (mode == IAS_CLIENT) + strcpy(notify.name, "IrIAS cli"); + else + strcpy(notify.name, "IrIAS srv"); + + self->lsap = irlmp_open_lsap(slsap_sel, ¬ify, 0); + if (self->lsap == NULL) { + IRDA_ERROR("%s: Unable to allocated LSAP!\n", __func__); + return -1; + } + self->slsap_sel = self->lsap->slsap_sel; + + return 0; +} + +/* + * Function iriap_disconnect_indication (handle, reason) + * + * Got disconnect, so clean up everything associated with this connection + * + */ +static void iriap_disconnect_indication(void *instance, void *sap, + LM_REASON reason, + struct sk_buff *skb) +{ + struct iriap_cb *self; + + IRDA_DEBUG(4, "%s(), reason=%s\n", __func__, irlmp_reasons[reason]); + + self = (struct iriap_cb *) instance; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + IRDA_ASSERT(iriap != NULL, return;); + + del_timer(&self->watchdog_timer); + + /* Not needed */ + if (skb) + dev_kfree_skb(skb); + + if (self->mode == IAS_CLIENT) { + IRDA_DEBUG(4, "%s(), disconnect as client\n", __func__); + + + iriap_do_client_event(self, IAP_LM_DISCONNECT_INDICATION, + NULL); + /* + * Inform service user that the request failed by sending + * it a NULL value. 
Warning, the client might close us, so + * remember no to use self anymore after calling confirm + */ + if (self->confirm) + self->confirm(IAS_DISCONNECT, 0, NULL, self->priv); + } else { + IRDA_DEBUG(4, "%s(), disconnect as server\n", __func__); + iriap_do_server_event(self, IAP_LM_DISCONNECT_INDICATION, + NULL); + iriap_close(self); + } +} + +/* + * Function iriap_disconnect_request (handle) + */ +static void iriap_disconnect_request(struct iriap_cb *self) +{ + struct sk_buff *tx_skb; + + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + tx_skb = alloc_skb(LMP_MAX_HEADER, GFP_ATOMIC); + if (tx_skb == NULL) { + IRDA_DEBUG(0, + "%s(), Could not allocate an sk_buff of length %d\n", + __func__, LMP_MAX_HEADER); + return; + } + + /* + * Reserve space for MUX control and LAP header + */ + skb_reserve(tx_skb, LMP_MAX_HEADER); + + irlmp_disconnect_request(self->lsap, tx_skb); +} + +/* + * Function iriap_getvaluebyclass (addr, name, attr) + * + * Retrieve all values from attribute in all objects with given class + * name + */ +int iriap_getvaluebyclass_request(struct iriap_cb *self, + __u32 saddr, __u32 daddr, + char *name, char *attr) +{ + struct sk_buff *tx_skb; + int name_len, attr_len, skb_len; + __u8 *frame; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return -1;); + + /* Client must supply the destination device address */ + if (!daddr) + return -1; + + self->daddr = daddr; + self->saddr = saddr; + + /* + * Save operation, so we know what the later indication is about + */ + self->operation = GET_VALUE_BY_CLASS; + + /* Give ourselves 10 secs to finish this operation */ + iriap_start_watchdog_timer(self, 10*HZ); + + name_len = strlen(name); /* Up to IAS_MAX_CLASSNAME = 60 */ + attr_len = strlen(attr); /* Up to IAS_MAX_ATTRIBNAME = 60 */ + + skb_len = self->max_header_size+2+name_len+1+attr_len+4; + tx_skb = alloc_skb(skb_len, GFP_ATOMIC); + if (!tx_skb) + return -ENOMEM; + + /* Reserve space for MUX and LAP header */ + skb_reserve(tx_skb, self->max_header_size); + skb_put(tx_skb, 3+name_len+attr_len); + frame = tx_skb->data; + + /* Build frame */ + frame[0] = IAP_LST | GET_VALUE_BY_CLASS; + frame[1] = name_len; /* Insert length of name */ + memcpy(frame+2, name, name_len); /* Insert name */ + frame[2+name_len] = attr_len; /* Insert length of attr */ + memcpy(frame+3+name_len, attr, attr_len); /* Insert attr */ + + iriap_do_client_event(self, IAP_CALL_REQUEST_GVBC, tx_skb); + + /* Drop reference count - see state_s_disconnect(). */ + dev_kfree_skb(tx_skb); + + return 0; +} +EXPORT_SYMBOL(iriap_getvaluebyclass_request); + +/* + * Function iriap_getvaluebyclass_confirm (self, skb) + * + * Got result from GetValueByClass command. Parse it and return result + * to service user. 
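+ *
+ * The reply handled here is laid out roughly as follows (offsets as
+ * walked by the parser below, which expects a single-entry result
+ * list):
+ *
+ *	fp[0]	IAP control byte (IAP_LST | GET_VALUE_BY_CLASS)
+ *	fp[1]	return code (IAS_SUCCESS when we end up here)
+ *	fp[2]	list length, 2 bytes, MSB first
+ *	fp[4]	object identifier, 2 bytes, MSB first
+ *	fp[6]	value type (IAS_INTEGER, IAS_STRING or IAS_OCT_SEQ)
+ *	fp[7]	the value itself, encoded according to its type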
+ * + */ +static void iriap_getvaluebyclass_confirm(struct iriap_cb *self, + struct sk_buff *skb) +{ + struct ias_value *value; + int charset; + __u32 value_len; + __u32 tmp_cpu32; + __u16 obj_id; + __u16 len; + __u8 type; + __u8 *fp; + int n; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + /* Initialize variables */ + fp = skb->data; + n = 2; + + /* Get length, MSB first */ + len = get_unaligned_be16(fp + n); + n += 2; + + IRDA_DEBUG(4, "%s(), len=%d\n", __func__, len); + + /* Get object ID, MSB first */ + obj_id = get_unaligned_be16(fp + n); + n += 2; + + type = fp[n++]; + IRDA_DEBUG(4, "%s(), Value type = %d\n", __func__, type); + + switch (type) { + case IAS_INTEGER: + memcpy(&tmp_cpu32, fp+n, 4); n += 4; + be32_to_cpus(&tmp_cpu32); + value = irias_new_integer_value(tmp_cpu32); + + /* Legal values restricted to 0x01-0x6f, page 15 irttp */ + IRDA_DEBUG(4, "%s(), lsap=%d\n", __func__, value->t.integer); + break; + case IAS_STRING: + charset = fp[n++]; + + switch (charset) { + case CS_ASCII: + break; +/* case CS_ISO_8859_1: */ +/* case CS_ISO_8859_2: */ +/* case CS_ISO_8859_3: */ +/* case CS_ISO_8859_4: */ +/* case CS_ISO_8859_5: */ +/* case CS_ISO_8859_6: */ +/* case CS_ISO_8859_7: */ +/* case CS_ISO_8859_8: */ +/* case CS_ISO_8859_9: */ +/* case CS_UNICODE: */ + default: + IRDA_DEBUG(0, "%s(), charset %s, not supported\n", + __func__, ias_charset_types[charset]); + + /* Aborting, close connection! */ + iriap_disconnect_request(self); + return; + /* break; */ + } + value_len = fp[n++]; + IRDA_DEBUG(4, "%s(), strlen=%d\n", __func__, value_len); + + /* Make sure the string is null-terminated */ + if (n + value_len < skb->len) + fp[n + value_len] = 0x00; + IRDA_DEBUG(4, "Got string %s\n", fp+n); + + /* Will truncate to IAS_MAX_STRING bytes */ + value = irias_new_string_value(fp+n); + break; + case IAS_OCT_SEQ: + value_len = get_unaligned_be16(fp + n); + n += 2; + + /* Will truncate to IAS_MAX_OCTET_STRING bytes */ + value = irias_new_octseq_value(fp+n, value_len); + break; + default: + value = irias_new_missing_value(); + break; + } + + /* Finished, close connection! */ + iriap_disconnect_request(self); + + /* Warning, the client might close us, so remember no to use self + * anymore after calling confirm + */ + if (self->confirm) + self->confirm(IAS_SUCCESS, obj_id, value, self->priv); + else { + IRDA_DEBUG(0, "%s(), missing handler!\n", __func__); + irias_delete_value(value); + } +} + +/* + * Function iriap_getvaluebyclass_response () + * + * Send answer back to remote LM-IAS + * + */ +static void iriap_getvaluebyclass_response(struct iriap_cb *self, + __u16 obj_id, + __u8 ret_code, + struct ias_value *value) +{ + struct sk_buff *tx_skb; + int n; + __be32 tmp_be32; + __be16 tmp_be16; + __u8 *fp; + + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + IRDA_ASSERT(value != NULL, return;); + IRDA_ASSERT(value->len <= 1024, return;); + + /* Initialize variables */ + n = 0; + + /* + * We must adjust the size of the response after the length of the + * value. We add 32 bytes because of the 6 bytes for the frame and + * max 5 bytes for the value coding. 
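+ * For example, a string value of value->len bytes takes
+ * 6 + 3 + value->len bytes of frame below (control byte, return code,
+ * list length and object id, then type, charset and length octets
+ * before the characters), so value->len + 32 leaves ample slack.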
+ */ + tx_skb = alloc_skb(value->len + self->max_header_size + 32, + GFP_ATOMIC); + if (!tx_skb) + return; + + /* Reserve space for MUX and LAP header */ + skb_reserve(tx_skb, self->max_header_size); + skb_put(tx_skb, 6); + + fp = tx_skb->data; + + /* Build frame */ + fp[n++] = GET_VALUE_BY_CLASS | IAP_LST; + fp[n++] = ret_code; + + /* Insert list length (MSB first) */ + tmp_be16 = htons(0x0001); + memcpy(fp+n, &tmp_be16, 2); n += 2; + + /* Insert object identifier ( MSB first) */ + tmp_be16 = cpu_to_be16(obj_id); + memcpy(fp+n, &tmp_be16, 2); n += 2; + + switch (value->type) { + case IAS_STRING: + skb_put(tx_skb, 3 + value->len); + fp[n++] = value->type; + fp[n++] = 0; /* ASCII */ + fp[n++] = (__u8) value->len; + memcpy(fp+n, value->t.string, value->len); n+=value->len; + break; + case IAS_INTEGER: + skb_put(tx_skb, 5); + fp[n++] = value->type; + + tmp_be32 = cpu_to_be32(value->t.integer); + memcpy(fp+n, &tmp_be32, 4); n += 4; + break; + case IAS_OCT_SEQ: + skb_put(tx_skb, 3 + value->len); + fp[n++] = value->type; + + tmp_be16 = cpu_to_be16(value->len); + memcpy(fp+n, &tmp_be16, 2); n += 2; + memcpy(fp+n, value->t.oct_seq, value->len); n+=value->len; + break; + case IAS_MISSING: + IRDA_DEBUG( 3, "%s: sending IAS_MISSING\n", __func__); + skb_put(tx_skb, 1); + fp[n++] = value->type; + break; + default: + IRDA_DEBUG(0, "%s(), type not implemented!\n", __func__); + break; + } + iriap_do_r_connect_event(self, IAP_CALL_RESPONSE, tx_skb); + + /* Drop reference count - see state_r_execute(). */ + dev_kfree_skb(tx_skb); +} + +/* + * Function iriap_getvaluebyclass_indication (self, skb) + * + * getvaluebyclass is requested from peer LM-IAS + * + */ +static void iriap_getvaluebyclass_indication(struct iriap_cb *self, + struct sk_buff *skb) +{ + struct ias_object *obj; + struct ias_attrib *attrib; + int name_len; + int attr_len; + char name[IAS_MAX_CLASSNAME + 1]; /* 60 bytes */ + char attr[IAS_MAX_ATTRIBNAME + 1]; /* 60 bytes */ + __u8 *fp; + int n; + + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + fp = skb->data; + n = 1; + + name_len = fp[n++]; + + IRDA_ASSERT(name_len < IAS_MAX_CLASSNAME + 1, return;); + + memcpy(name, fp+n, name_len); n+=name_len; + name[name_len] = '\0'; + + attr_len = fp[n++]; + + IRDA_ASSERT(attr_len < IAS_MAX_ATTRIBNAME + 1, return;); + + memcpy(attr, fp+n, attr_len); n+=attr_len; + attr[attr_len] = '\0'; + + IRDA_DEBUG(4, "LM-IAS: Looking up %s: %s\n", name, attr); + obj = irias_find_object(name); + + if (obj == NULL) { + IRDA_DEBUG(2, "LM-IAS: Object %s not found\n", name); + iriap_getvaluebyclass_response(self, 0x1235, IAS_CLASS_UNKNOWN, + &irias_missing); + return; + } + IRDA_DEBUG(4, "LM-IAS: found %s, id=%d\n", obj->name, obj->id); + + attrib = irias_find_attrib(obj, attr); + if (attrib == NULL) { + IRDA_DEBUG(2, "LM-IAS: Attribute %s not found\n", attr); + iriap_getvaluebyclass_response(self, obj->id, + IAS_ATTRIB_UNKNOWN, + &irias_missing); + return; + } + + /* We have a match; send the value. 
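+ * The objects matched here are the ones local services registered
+ * earlier, along the lines of the sketch below, where the object
+ * name, id and lsap_sel value are placeholders:
+ *
+ *	obj = irias_new_object("MyService", jiffies);
+ *	irias_add_integer_attrib(obj, "IrDA:TinyTP:LsapSel",
+ *				 lsap_sel, IAS_KERNEL_ATTR);
+ *	irias_insert_object(obj);
+ *
+ * so a remote GetValueByClass("MyService", "IrDA:TinyTP:LsapSel")
+ * ends up answered below with that integer value.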
*/ + iriap_getvaluebyclass_response(self, obj->id, IAS_SUCCESS, + attrib->value); +} + +/* + * Function iriap_send_ack (void) + * + * Currently not used + * + */ +void iriap_send_ack(struct iriap_cb *self) +{ + struct sk_buff *tx_skb; + __u8 *frame; + + IRDA_DEBUG(2, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + tx_skb = alloc_skb(LMP_MAX_HEADER + 1, GFP_ATOMIC); + if (!tx_skb) + return; + + /* Reserve space for MUX and LAP header */ + skb_reserve(tx_skb, self->max_header_size); + skb_put(tx_skb, 1); + frame = tx_skb->data; + + /* Build frame */ + frame[0] = IAP_LST | IAP_ACK | self->operation; + + irlmp_data_request(self->lsap, tx_skb); +} + +void iriap_connect_request(struct iriap_cb *self) +{ + int ret; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + ret = irlmp_connect_request(self->lsap, LSAP_IAS, + self->saddr, self->daddr, + NULL, NULL); + if (ret < 0) { + IRDA_DEBUG(0, "%s(), connect failed!\n", __func__); + self->confirm(IAS_DISCONNECT, 0, NULL, self->priv); + } +} + +/* + * Function iriap_connect_confirm (handle, skb) + * + * LSAP connection confirmed! + * + */ +static void iriap_connect_confirm(void *instance, void *sap, + struct qos_info *qos, __u32 max_seg_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct iriap_cb *self; + + self = (struct iriap_cb *) instance; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + self->max_data_size = max_seg_size; + self->max_header_size = max_header_size; + + del_timer(&self->watchdog_timer); + + iriap_do_client_event(self, IAP_LM_CONNECT_CONFIRM, skb); + + /* Drop reference count - see state_s_make_call(). */ + dev_kfree_skb(skb); +} + +/* + * Function iriap_connect_indication ( handle, skb) + * + * Remote LM-IAS is requesting connection + * + */ +static void iriap_connect_indication(void *instance, void *sap, + struct qos_info *qos, __u32 max_seg_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct iriap_cb *self, *new; + + IRDA_DEBUG(1, "%s()\n", __func__); + + self = (struct iriap_cb *) instance; + + IRDA_ASSERT(skb != NULL, return;); + IRDA_ASSERT(self != NULL, goto out;); + IRDA_ASSERT(self->magic == IAS_MAGIC, goto out;); + + /* Start new server */ + new = iriap_open(LSAP_IAS, IAS_SERVER, NULL, NULL); + if (!new) { + IRDA_DEBUG(0, "%s(), open failed\n", __func__); + goto out; + } + + /* Now attach up the new "socket" */ + new->lsap = irlmp_dup(self->lsap, new); + if (!new->lsap) { + IRDA_DEBUG(0, "%s(), dup failed!\n", __func__); + goto out; + } + + new->max_data_size = max_seg_size; + new->max_header_size = max_header_size; + + /* Clean up the original one to keep it in listen state */ + irlmp_listen(self->lsap); + + iriap_do_server_event(new, IAP_LM_CONNECT_INDICATION, skb); + +out: + /* Drop reference count - see state_r_disconnect(). 
*/ + dev_kfree_skb(skb); +} + +/* + * Function iriap_data_indication (handle, skb) + * + * Receives data from connection identified by handle from IrLMP + * + */ +static int iriap_data_indication(void *instance, void *sap, + struct sk_buff *skb) +{ + struct iriap_cb *self; + __u8 *frame; + __u8 opcode; + + IRDA_DEBUG(3, "%s()\n", __func__); + + self = (struct iriap_cb *) instance; + + IRDA_ASSERT(skb != NULL, return 0;); + IRDA_ASSERT(self != NULL, goto out;); + IRDA_ASSERT(self->magic == IAS_MAGIC, goto out;); + + frame = skb->data; + + if (self->mode == IAS_SERVER) { + /* Call server */ + IRDA_DEBUG(4, "%s(), Calling server!\n", __func__); + iriap_do_r_connect_event(self, IAP_RECV_F_LST, skb); + goto out; + } + opcode = frame[0]; + if (~opcode & IAP_LST) { + IRDA_WARNING("%s:, IrIAS multiframe commands or " + "results is not implemented yet!\n", + __func__); + goto out; + } + + /* Check for ack frames since they don't contain any data */ + if (opcode & IAP_ACK) { + IRDA_DEBUG(0, "%s() Got ack frame!\n", __func__); + goto out; + } + + opcode &= ~IAP_LST; /* Mask away LST bit */ + + switch (opcode) { + case GET_INFO_BASE: + IRDA_DEBUG(0, "IrLMP GetInfoBaseDetails not implemented!\n"); + break; + case GET_VALUE_BY_CLASS: + iriap_do_call_event(self, IAP_RECV_F_LST, NULL); + + switch (frame[1]) { + case IAS_SUCCESS: + iriap_getvaluebyclass_confirm(self, skb); + break; + case IAS_CLASS_UNKNOWN: + IRDA_DEBUG(1, "%s(), No such class!\n", __func__); + /* Finished, close connection! */ + iriap_disconnect_request(self); + + /* + * Warning, the client might close us, so remember + * no to use self anymore after calling confirm + */ + if (self->confirm) + self->confirm(IAS_CLASS_UNKNOWN, 0, NULL, + self->priv); + break; + case IAS_ATTRIB_UNKNOWN: + IRDA_DEBUG(1, "%s(), No such attribute!\n", __func__); + /* Finished, close connection! */ + iriap_disconnect_request(self); + + /* + * Warning, the client might close us, so remember + * no to use self anymore after calling confirm + */ + if (self->confirm) + self->confirm(IAS_ATTRIB_UNKNOWN, 0, NULL, + self->priv); + break; + } + break; + default: + IRDA_DEBUG(0, "%s(), Unknown op-code: %02x\n", __func__, + opcode); + break; + } + +out: + /* Cleanup - sub-calls will have done skb_get() as needed. 
*/ + dev_kfree_skb(skb); + return 0; +} + +/* + * Function iriap_call_indication (self, skb) + * + * Received call to server from peer LM-IAS + * + */ +void iriap_call_indication(struct iriap_cb *self, struct sk_buff *skb) +{ + __u8 *fp; + __u8 opcode; + + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + fp = skb->data; + + opcode = fp[0]; + if (~opcode & 0x80) { + IRDA_WARNING("%s: IrIAS multiframe commands or results " + "is not implemented yet!\n", __func__); + return; + } + opcode &= 0x7f; /* Mask away LST bit */ + + switch (opcode) { + case GET_INFO_BASE: + IRDA_WARNING("%s: GetInfoBaseDetails not implemented yet!\n", + __func__); + break; + case GET_VALUE_BY_CLASS: + iriap_getvaluebyclass_indication(self, skb); + break; + } + /* skb will be cleaned up in iriap_data_indication */ +} + +/* + * Function iriap_watchdog_timer_expired (data) + * + * Query has taken too long time, so abort + * + */ +static void iriap_watchdog_timer_expired(void *data) +{ + struct iriap_cb *self = (struct iriap_cb *) data; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + /* iriap_close(self); */ +} + +#ifdef CONFIG_PROC_FS + +static const char *const ias_value_types[] = { + "IAS_MISSING", + "IAS_INTEGER", + "IAS_OCT_SEQ", + "IAS_STRING" +}; + +static inline struct ias_object *irias_seq_idx(loff_t pos) +{ + struct ias_object *obj; + + for (obj = (struct ias_object *) hashbin_get_first(irias_objects); + obj; obj = (struct ias_object *) hashbin_get_next(irias_objects)) { + if (pos-- == 0) + break; + } + + return obj; +} + +static void *irias_seq_start(struct seq_file *seq, loff_t *pos) +{ + spin_lock_irq(&irias_objects->hb_spinlock); + + return *pos ? irias_seq_idx(*pos - 1) : SEQ_START_TOKEN; +} + +static void *irias_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + + return (v == SEQ_START_TOKEN) + ? (void *) hashbin_get_first(irias_objects) + : (void *) hashbin_get_next(irias_objects); +} + +static void irias_seq_stop(struct seq_file *seq, void *v) +{ + spin_unlock_irq(&irias_objects->hb_spinlock); +} + +static int irias_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "LM-IAS Objects:\n"); + else { + struct ias_object *obj = v; + struct ias_attrib *attrib; + + IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return -EINVAL;); + + seq_printf(seq, "name: %s, id=%d\n", + obj->name, obj->id); + + /* Careful for priority inversions here ! + * All other uses of attrib spinlock are independent of + * the object spinlock, so we are safe. 
Jean II */ + spin_lock(&obj->attribs->hb_spinlock); + + /* List all attributes for this object */ + for (attrib = (struct ias_attrib *) hashbin_get_first(obj->attribs); + attrib != NULL; + attrib = (struct ias_attrib *) hashbin_get_next(obj->attribs)) { + + IRDA_ASSERT(attrib->magic == IAS_ATTRIB_MAGIC, + goto outloop; ); + + seq_printf(seq, " - Attribute name: \"%s\", ", + attrib->name); + seq_printf(seq, "value[%s]: ", + ias_value_types[attrib->value->type]); + + switch (attrib->value->type) { + case IAS_INTEGER: + seq_printf(seq, "%d\n", + attrib->value->t.integer); + break; + case IAS_STRING: + seq_printf(seq, "\"%s\"\n", + attrib->value->t.string); + break; + case IAS_OCT_SEQ: + seq_printf(seq, "octet sequence (%d bytes)\n", + attrib->value->len); + break; + case IAS_MISSING: + seq_puts(seq, "missing\n"); + break; + default: + seq_printf(seq, "type %d?\n", + attrib->value->type); + } + seq_putc(seq, '\n'); + + } + IRDA_ASSERT_LABEL(outloop:) + spin_unlock(&obj->attribs->hb_spinlock); + } + + return 0; +} + +static const struct seq_operations irias_seq_ops = { + .start = irias_seq_start, + .next = irias_seq_next, + .stop = irias_seq_stop, + .show = irias_seq_show, +}; + +static int irias_seq_open(struct inode *inode, struct file *file) +{ + IRDA_ASSERT( irias_objects != NULL, return -EINVAL;); + + return seq_open(file, &irias_seq_ops); +} + +const struct file_operations irias_seq_fops = { + .owner = THIS_MODULE, + .open = irias_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif /* PROC_FS */ diff --git a/net/irda/iriap_event.c b/net/irda/iriap_event.c new file mode 100644 index 00000000..703774e2 --- /dev/null +++ b/net/irda/iriap_event.c @@ -0,0 +1,504 @@ +/********************************************************************* + * + * Filename: iriap_event.c + * Version: 0.1 + * Description: IAP Finite State Machine + * Status: Experimental. + * Author: Dag Brattli + * Created at: Thu Aug 21 00:02:07 1997 + * Modified at: Wed Mar 1 11:28:34 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1997, 1999-2000 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
+ * + ********************************************************************/ + +#include + +#include +#include +#include +#include + +static void state_s_disconnect (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_s_connecting (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_s_call (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); + +static void state_s_make_call (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_s_calling (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_s_outstanding (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_s_replying (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_s_wait_for_call(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_s_wait_active (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); + +static void state_r_disconnect (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_r_call (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_r_waiting (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_r_wait_active (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_r_receiving (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_r_execute (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_r_returning (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); + +static void (*iriap_state[])(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) = { + /* Client FSM */ + state_s_disconnect, + state_s_connecting, + state_s_call, + + /* S-Call FSM */ + state_s_make_call, + state_s_calling, + state_s_outstanding, + state_s_replying, + state_s_wait_for_call, + state_s_wait_active, + + /* Server FSM */ + state_r_disconnect, + state_r_call, + + /* R-Connect FSM */ + state_r_waiting, + state_r_wait_active, + state_r_receiving, + state_r_execute, + state_r_returning, +}; + +void iriap_next_client_state(struct iriap_cb *self, IRIAP_STATE state) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + self->client_state = state; +} + +void iriap_next_call_state(struct iriap_cb *self, IRIAP_STATE state) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + self->call_state = state; +} + +void iriap_next_server_state(struct iriap_cb *self, IRIAP_STATE state) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + self->server_state = state; +} + +void iriap_next_r_connect_state(struct iriap_cb *self, IRIAP_STATE state) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + self->r_connect_state = state; +} + +void iriap_do_client_event(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + (*iriap_state[ self->client_state]) (self, event, skb); +} + +void iriap_do_call_event(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + (*iriap_state[ self->call_state]) (self, event, skb); +} + +void iriap_do_server_event(struct 
iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + (*iriap_state[ self->server_state]) (self, event, skb); +} + +void iriap_do_r_connect_event(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + (*iriap_state[ self->r_connect_state]) (self, event, skb); +} + + +/* + * Function state_s_disconnect (event, skb) + * + * S-Disconnect, The device has no LSAP connection to a particular + * remote device. + */ +static void state_s_disconnect(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + switch (event) { + case IAP_CALL_REQUEST_GVBC: + iriap_next_client_state(self, S_CONNECTING); + IRDA_ASSERT(self->request_skb == NULL, return;); + /* Don't forget to refcount it - + * see iriap_getvaluebyclass_request(). */ + skb_get(skb); + self->request_skb = skb; + iriap_connect_request(self); + break; + case IAP_LM_DISCONNECT_INDICATION: + break; + default: + IRDA_DEBUG(0, "%s(), Unknown event %d\n", __func__, event); + break; + } +} + +/* + * Function state_s_connecting (self, event, skb) + * + * S-Connecting + * + */ +static void state_s_connecting(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + switch (event) { + case IAP_LM_CONNECT_CONFIRM: + /* + * Jump to S-Call FSM + */ + iriap_do_call_event(self, IAP_CALL_REQUEST, skb); + /* iriap_call_request(self, 0,0,0); */ + iriap_next_client_state(self, S_CALL); + break; + case IAP_LM_DISCONNECT_INDICATION: + /* Abort calls */ + iriap_next_call_state(self, S_MAKE_CALL); + iriap_next_client_state(self, S_DISCONNECT); + break; + default: + IRDA_DEBUG(0, "%s(), Unknown event %d\n", __func__, event); + break; + } +} + +/* + * Function state_s_call (self, event, skb) + * + * S-Call, The device can process calls to a specific remote + * device. 
Whenever the LSAP connection is disconnected, this state + * catches that event and clears up + */ +static void state_s_call(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + + switch (event) { + case IAP_LM_DISCONNECT_INDICATION: + /* Abort calls */ + iriap_next_call_state(self, S_MAKE_CALL); + iriap_next_client_state(self, S_DISCONNECT); + break; + default: + IRDA_DEBUG(0, "state_s_call: Unknown event %d\n", event); + break; + } +} + +/* + * Function state_s_make_call (event, skb) + * + * S-Make-Call + * + */ +static void state_s_make_call(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + struct sk_buff *tx_skb; + + IRDA_ASSERT(self != NULL, return;); + + switch (event) { + case IAP_CALL_REQUEST: + /* Already refcounted - see state_s_disconnect() */ + tx_skb = self->request_skb; + self->request_skb = NULL; + + irlmp_data_request(self->lsap, tx_skb); + iriap_next_call_state(self, S_OUTSTANDING); + break; + default: + IRDA_DEBUG(0, "%s(), Unknown event %d\n", __func__, event); + break; + } +} + +/* + * Function state_s_calling (event, skb) + * + * S-Calling + * + */ +static void state_s_calling(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(0, "%s(), Not implemented\n", __func__); +} + +/* + * Function state_s_outstanding (event, skb) + * + * S-Outstanding, The device is waiting for a response to a command + * + */ +static void state_s_outstanding(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + + switch (event) { + case IAP_RECV_F_LST: + /*iriap_send_ack(self);*/ + /*LM_Idle_request(idle); */ + + iriap_next_call_state(self, S_WAIT_FOR_CALL); + break; + default: + IRDA_DEBUG(0, "%s(), Unknown event %d\n", __func__, event); + break; + } +} + +/* + * Function state_s_replying (event, skb) + * + * S-Replying, The device is collecting a multiple part response + */ +static void state_s_replying(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(0, "%s(), Not implemented\n", __func__); +} + +/* + * Function state_s_wait_for_call (event, skb) + * + * S-Wait-for-Call + * + */ +static void state_s_wait_for_call(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(0, "%s(), Not implemented\n", __func__); +} + + +/* + * Function state_s_wait_active (event, skb) + * + * S-Wait-Active + * + */ +static void state_s_wait_active(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(0, "%s(), Not implemented\n", __func__); +} + +/************************************************************************** + * + * Server FSM + * + **************************************************************************/ + +/* + * Function state_r_disconnect (self, event, skb) + * + * LM-IAS server is disconnected (not processing any requests!) 
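+ *
+ * The server-side flow implemented by the handlers below is roughly:
+ *
+ *	R_DISCONNECT --IAP_LM_CONNECT_INDICATION--> R_CALL
+ *	   (the R-Connect FSM jumps straight to R_RECEIVING,
+ *	    R_WAITING is skipped)
+ *	R_RECEIVING  --IAP_RECV_F_LST-------------> R_EXECUTE
+ *	R_EXECUTE    --IAP_CALL_RESPONSE----------> R_RECEIVING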
+ * + */ +static void state_r_disconnect(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + struct sk_buff *tx_skb; + + switch (event) { + case IAP_LM_CONNECT_INDICATION: + tx_skb = alloc_skb(LMP_MAX_HEADER, GFP_ATOMIC); + if (tx_skb == NULL) { + IRDA_WARNING("%s: unable to malloc!\n", __func__); + return; + } + + /* Reserve space for MUX_CONTROL and LAP header */ + skb_reserve(tx_skb, LMP_MAX_HEADER); + + irlmp_connect_response(self->lsap, tx_skb); + /*LM_Idle_request(idle); */ + + iriap_next_server_state(self, R_CALL); + + /* + * Jump to R-Connect FSM, we skip R-Waiting since we do not + * care about LM_Idle_request()! + */ + iriap_next_r_connect_state(self, R_RECEIVING); + break; + default: + IRDA_DEBUG(0, "%s(), unknown event %d\n", __func__, event); + break; + } +} + +/* + * Function state_r_call (self, event, skb) + */ +static void state_r_call(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + + switch (event) { + case IAP_LM_DISCONNECT_INDICATION: + /* Abort call */ + iriap_next_server_state(self, R_DISCONNECT); + iriap_next_r_connect_state(self, R_WAITING); + break; + default: + IRDA_DEBUG(0, "%s(), unknown event!\n", __func__); + break; + } +} + +/* + * R-Connect FSM + */ + +/* + * Function state_r_waiting (self, event, skb) + */ +static void state_r_waiting(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(0, "%s(), Not implemented\n", __func__); +} + +static void state_r_wait_active(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(0, "%s(), Not implemented\n", __func__); +} + +/* + * Function state_r_receiving (self, event, skb) + * + * We are receiving a command + * + */ +static void state_r_receiving(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + + switch (event) { + case IAP_RECV_F_LST: + iriap_next_r_connect_state(self, R_EXECUTE); + + iriap_call_indication(self, skb); + break; + default: + IRDA_DEBUG(0, "%s(), unknown event!\n", __func__); + break; + } +} + +/* + * Function state_r_execute (self, event, skb) + * + * The server is processing the request + * + */ +static void state_r_execute(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(skb != NULL, return;); + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + switch (event) { + case IAP_CALL_RESPONSE: + /* + * Since we don't implement the Waiting state, we return + * to state Receiving instead, DB. + */ + iriap_next_r_connect_state(self, R_RECEIVING); + + /* Don't forget to refcount it - see + * iriap_getvaluebyclass_response(). */ + skb_get(skb); + + irlmp_data_request(self->lsap, skb); + break; + default: + IRDA_DEBUG(0, "%s(), unknown event!\n", __func__); + break; + } +} + +static void state_r_returning(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(0, "%s(), event=%d\n", __func__, event); + + switch (event) { + case IAP_RECV_F_LST: + break; + default: + break; + } +} diff --git a/net/irda/irias_object.c b/net/irda/irias_object.c new file mode 100644 index 00000000..f07ed9fd --- /dev/null +++ b/net/irda/irias_object.c @@ -0,0 +1,567 @@ +/********************************************************************* + * + * Filename: irias_object.c + * Version: 0.3 + * Description: IAS object database and functions + * Status: Experimental. 
+ * Author: Dag Brattli + * Created at: Thu Oct 1 22:50:04 1998 + * Modified at: Wed Dec 15 11:23:16 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#include +#include +#include +#include + +#include +#include + +hashbin_t *irias_objects; + +/* + * Used when a missing value needs to be returned + */ +struct ias_value irias_missing = { IAS_MISSING, 0, 0, 0, {0}}; + + +/* + * Function ias_new_object (name, id) + * + * Create a new IAS object + * + */ +struct ias_object *irias_new_object( char *name, int id) +{ + struct ias_object *obj; + + IRDA_DEBUG( 4, "%s()\n", __func__); + + obj = kzalloc(sizeof(struct ias_object), GFP_ATOMIC); + if (obj == NULL) { + IRDA_WARNING("%s(), Unable to allocate object!\n", + __func__); + return NULL; + } + + obj->magic = IAS_OBJECT_MAGIC; + obj->name = kstrndup(name, IAS_MAX_CLASSNAME, GFP_ATOMIC); + if (!obj->name) { + IRDA_WARNING("%s(), Unable to allocate name!\n", + __func__); + kfree(obj); + return NULL; + } + obj->id = id; + + /* Locking notes : the attrib spinlock has lower precendence + * than the objects spinlock. Never grap the objects spinlock + * while holding any attrib spinlock (risk of deadlock). Jean II */ + obj->attribs = hashbin_new(HB_LOCK); + + if (obj->attribs == NULL) { + IRDA_WARNING("%s(), Unable to allocate attribs!\n", + __func__); + kfree(obj->name); + kfree(obj); + return NULL; + } + + return obj; +} +EXPORT_SYMBOL(irias_new_object); + +/* + * Function irias_delete_attrib (attrib) + * + * Delete given attribute and deallocate all its memory + * + */ +static void __irias_delete_attrib(struct ias_attrib *attrib) +{ + IRDA_ASSERT(attrib != NULL, return;); + IRDA_ASSERT(attrib->magic == IAS_ATTRIB_MAGIC, return;); + + kfree(attrib->name); + + irias_delete_value(attrib->value); + attrib->magic = ~IAS_ATTRIB_MAGIC; + + kfree(attrib); +} + +void __irias_delete_object(struct ias_object *obj) +{ + IRDA_ASSERT(obj != NULL, return;); + IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;); + + kfree(obj->name); + + hashbin_delete(obj->attribs, (FREE_FUNC) __irias_delete_attrib); + + obj->magic = ~IAS_OBJECT_MAGIC; + + kfree(obj); +} + +/* + * Function irias_delete_object (obj) + * + * Remove object from hashbin and deallocate all attributes associated with + * with this object and the object itself + * + */ +int irias_delete_object(struct ias_object *obj) +{ + struct ias_object *node; + + IRDA_ASSERT(obj != NULL, return -1;); + IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return -1;); + + /* Remove from list */ + node = hashbin_remove_this(irias_objects, (irda_queue_t *) obj); + if (!node) + IRDA_DEBUG( 0, "%s(), object already removed!\n", + __func__); + + /* Destroy */ + __irias_delete_object(obj); + + return 0; +} +EXPORT_SYMBOL(irias_delete_object); + +/* + * Function irias_delete_attrib (obj) + * + * Remove attribute from hashbin and, if it was the last attribute of + * the object, remove the object as well. 
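+ *
+ * Note that the object is only removed when the cleanobject argument
+ * is TRUE; with FALSE an attribute-less object stays in the database.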
+ * + */ +int irias_delete_attrib(struct ias_object *obj, struct ias_attrib *attrib, + int cleanobject) +{ + struct ias_attrib *node; + + IRDA_ASSERT(obj != NULL, return -1;); + IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return -1;); + IRDA_ASSERT(attrib != NULL, return -1;); + + /* Remove attribute from object */ + node = hashbin_remove_this(obj->attribs, (irda_queue_t *) attrib); + if (!node) + return 0; /* Already removed or non-existent */ + + /* Deallocate attribute */ + __irias_delete_attrib(node); + + /* Check if object has still some attributes, destroy it if none. + * At first glance, this look dangerous, as the kernel reference + * various IAS objects. However, we only use this function on + * user attributes, not kernel attributes, so there is no risk + * of deleting a kernel object this way. Jean II */ + node = (struct ias_attrib *) hashbin_get_first(obj->attribs); + if (cleanobject && !node) + irias_delete_object(obj); + + return 0; +} + +/* + * Function irias_insert_object (obj) + * + * Insert an object into the LM-IAS database + * + */ +void irias_insert_object(struct ias_object *obj) +{ + IRDA_ASSERT(obj != NULL, return;); + IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;); + + hashbin_insert(irias_objects, (irda_queue_t *) obj, 0, obj->name); +} +EXPORT_SYMBOL(irias_insert_object); + +/* + * Function irias_find_object (name) + * + * Find object with given name + * + */ +struct ias_object *irias_find_object(char *name) +{ + IRDA_ASSERT(name != NULL, return NULL;); + + /* Unsafe (locking), object might change */ + return hashbin_lock_find(irias_objects, 0, name); +} +EXPORT_SYMBOL(irias_find_object); + +/* + * Function irias_find_attrib (obj, name) + * + * Find named attribute in object + * + */ +struct ias_attrib *irias_find_attrib(struct ias_object *obj, char *name) +{ + struct ias_attrib *attrib; + + IRDA_ASSERT(obj != NULL, return NULL;); + IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return NULL;); + IRDA_ASSERT(name != NULL, return NULL;); + + attrib = hashbin_lock_find(obj->attribs, 0, name); + if (attrib == NULL) + return NULL; + + /* Unsafe (locking), attrib might change */ + return attrib; +} + +/* + * Function irias_add_attribute (obj, attrib) + * + * Add attribute to object + * + */ +static void irias_add_attrib(struct ias_object *obj, struct ias_attrib *attrib, + int owner) +{ + IRDA_ASSERT(obj != NULL, return;); + IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;); + + IRDA_ASSERT(attrib != NULL, return;); + IRDA_ASSERT(attrib->magic == IAS_ATTRIB_MAGIC, return;); + + /* Set if attrib is owned by kernel or user space */ + attrib->value->owner = owner; + + hashbin_insert(obj->attribs, (irda_queue_t *) attrib, 0, attrib->name); +} + +/* + * Function irias_object_change_attribute (obj_name, attrib_name, new_value) + * + * Change the value of an objects attribute. 
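+ *
+ * The caller allocates the replacement value and hands it over, for
+ * instance (the object and attribute names here are just examples):
+ *
+ *	struct ias_value *val = irias_new_string_value("NewName");
+ *
+ *	if (val)
+ *		irias_object_change_attribute("Device", "DeviceName", val);
+ *
+ * The old value is freed below and the new one must be of the same
+ * type as the value it replaces.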
+ * + */ +int irias_object_change_attribute(char *obj_name, char *attrib_name, + struct ias_value *new_value) +{ + struct ias_object *obj; + struct ias_attrib *attrib; + unsigned long flags; + + /* Find object */ + obj = hashbin_lock_find(irias_objects, 0, obj_name); + if (obj == NULL) { + IRDA_WARNING("%s: Unable to find object: %s\n", __func__, + obj_name); + return -1; + } + + /* Slightly unsafe (obj might get removed under us) */ + spin_lock_irqsave(&obj->attribs->hb_spinlock, flags); + + /* Find attribute */ + attrib = hashbin_find(obj->attribs, 0, attrib_name); + if (attrib == NULL) { + IRDA_WARNING("%s: Unable to find attribute: %s\n", + __func__, attrib_name); + spin_unlock_irqrestore(&obj->attribs->hb_spinlock, flags); + return -1; + } + + if ( attrib->value->type != new_value->type) { + IRDA_DEBUG( 0, "%s(), changing value type not allowed!\n", + __func__); + spin_unlock_irqrestore(&obj->attribs->hb_spinlock, flags); + return -1; + } + + /* Delete old value */ + irias_delete_value(attrib->value); + + /* Insert new value */ + attrib->value = new_value; + + /* Success */ + spin_unlock_irqrestore(&obj->attribs->hb_spinlock, flags); + return 0; +} +EXPORT_SYMBOL(irias_object_change_attribute); + +/* + * Function irias_object_add_integer_attrib (obj, name, value) + * + * Add an integer attribute to an LM-IAS object + * + */ +void irias_add_integer_attrib(struct ias_object *obj, char *name, int value, + int owner) +{ + struct ias_attrib *attrib; + + IRDA_ASSERT(obj != NULL, return;); + IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;); + IRDA_ASSERT(name != NULL, return;); + + attrib = kzalloc(sizeof(struct ias_attrib), GFP_ATOMIC); + if (attrib == NULL) { + IRDA_WARNING("%s: Unable to allocate attribute!\n", + __func__); + return; + } + + attrib->magic = IAS_ATTRIB_MAGIC; + attrib->name = kstrndup(name, IAS_MAX_ATTRIBNAME, GFP_ATOMIC); + + /* Insert value */ + attrib->value = irias_new_integer_value(value); + if (!attrib->name || !attrib->value) { + IRDA_WARNING("%s: Unable to allocate attribute!\n", + __func__); + if (attrib->value) + irias_delete_value(attrib->value); + kfree(attrib->name); + kfree(attrib); + return; + } + + irias_add_attrib(obj, attrib, owner); +} +EXPORT_SYMBOL(irias_add_integer_attrib); + + /* + * Function irias_add_octseq_attrib (obj, name, octet_seq, len) + * + * Add a octet sequence attribute to an LM-IAS object + * + */ + +void irias_add_octseq_attrib(struct ias_object *obj, char *name, __u8 *octets, + int len, int owner) +{ + struct ias_attrib *attrib; + + IRDA_ASSERT(obj != NULL, return;); + IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;); + + IRDA_ASSERT(name != NULL, return;); + IRDA_ASSERT(octets != NULL, return;); + + attrib = kzalloc(sizeof(struct ias_attrib), GFP_ATOMIC); + if (attrib == NULL) { + IRDA_WARNING("%s: Unable to allocate attribute!\n", + __func__); + return; + } + + attrib->magic = IAS_ATTRIB_MAGIC; + attrib->name = kstrndup(name, IAS_MAX_ATTRIBNAME, GFP_ATOMIC); + + attrib->value = irias_new_octseq_value( octets, len); + if (!attrib->name || !attrib->value) { + IRDA_WARNING("%s: Unable to allocate attribute!\n", + __func__); + if (attrib->value) + irias_delete_value(attrib->value); + kfree(attrib->name); + kfree(attrib); + return; + } + + irias_add_attrib(obj, attrib, owner); +} +EXPORT_SYMBOL(irias_add_octseq_attrib); + +/* + * Function irias_object_add_string_attrib (obj, string) + * + * Add a string attribute to an LM-IAS object + * + */ +void irias_add_string_attrib(struct ias_object *obj, char *name, char *value, + int 
owner) +{ + struct ias_attrib *attrib; + + IRDA_ASSERT(obj != NULL, return;); + IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;); + + IRDA_ASSERT(name != NULL, return;); + IRDA_ASSERT(value != NULL, return;); + + attrib = kzalloc(sizeof( struct ias_attrib), GFP_ATOMIC); + if (attrib == NULL) { + IRDA_WARNING("%s: Unable to allocate attribute!\n", + __func__); + return; + } + + attrib->magic = IAS_ATTRIB_MAGIC; + attrib->name = kstrndup(name, IAS_MAX_ATTRIBNAME, GFP_ATOMIC); + + attrib->value = irias_new_string_value(value); + if (!attrib->name || !attrib->value) { + IRDA_WARNING("%s: Unable to allocate attribute!\n", + __func__); + if (attrib->value) + irias_delete_value(attrib->value); + kfree(attrib->name); + kfree(attrib); + return; + } + + irias_add_attrib(obj, attrib, owner); +} +EXPORT_SYMBOL(irias_add_string_attrib); + +/* + * Function irias_new_integer_value (integer) + * + * Create new IAS integer value + * + */ +struct ias_value *irias_new_integer_value(int integer) +{ + struct ias_value *value; + + value = kzalloc(sizeof(struct ias_value), GFP_ATOMIC); + if (value == NULL) { + IRDA_WARNING("%s: Unable to kmalloc!\n", __func__); + return NULL; + } + + value->type = IAS_INTEGER; + value->len = 4; + value->t.integer = integer; + + return value; +} +EXPORT_SYMBOL(irias_new_integer_value); + +/* + * Function irias_new_string_value (string) + * + * Create new IAS string value + * + * Per IrLMP 1.1, 4.3.3.2, strings are up to 256 chars - Jean II + */ +struct ias_value *irias_new_string_value(char *string) +{ + struct ias_value *value; + + value = kzalloc(sizeof(struct ias_value), GFP_ATOMIC); + if (value == NULL) { + IRDA_WARNING("%s: Unable to kmalloc!\n", __func__); + return NULL; + } + + value->type = IAS_STRING; + value->charset = CS_ASCII; + value->t.string = kstrndup(string, IAS_MAX_STRING, GFP_ATOMIC); + if (!value->t.string) { + IRDA_WARNING("%s: Unable to kmalloc!\n", __func__); + kfree(value); + return NULL; + } + + value->len = strlen(value->t.string); + + return value; +} + +/* + * Function irias_new_octseq_value (octets, len) + * + * Create new IAS octet-sequence value + * + * Per IrLMP 1.1, 4.3.3.2, octet-sequence are up to 1024 bytes - Jean II + */ +struct ias_value *irias_new_octseq_value(__u8 *octseq , int len) +{ + struct ias_value *value; + + value = kzalloc(sizeof(struct ias_value), GFP_ATOMIC); + if (value == NULL) { + IRDA_WARNING("%s: Unable to kmalloc!\n", __func__); + return NULL; + } + + value->type = IAS_OCT_SEQ; + /* Check length */ + if(len > IAS_MAX_OCTET_STRING) + len = IAS_MAX_OCTET_STRING; + value->len = len; + + value->t.oct_seq = kmemdup(octseq, len, GFP_ATOMIC); + if (value->t.oct_seq == NULL){ + IRDA_WARNING("%s: Unable to kmalloc!\n", __func__); + kfree(value); + return NULL; + } + return value; +} + +struct ias_value *irias_new_missing_value(void) +{ + struct ias_value *value; + + value = kzalloc(sizeof(struct ias_value), GFP_ATOMIC); + if (value == NULL) { + IRDA_WARNING("%s: Unable to kmalloc!\n", __func__); + return NULL; + } + + value->type = IAS_MISSING; + + return value; +} + +/* + * Function irias_delete_value (value) + * + * Delete IAS value + * + */ +void irias_delete_value(struct ias_value *value) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(value != NULL, return;); + + switch (value->type) { + case IAS_INTEGER: /* Fallthrough */ + case IAS_MISSING: + /* No need to deallocate */ + break; + case IAS_STRING: + /* Deallocate string */ + kfree(value->t.string); + break; + case IAS_OCT_SEQ: + /* Deallocate byte stream */ + 
kfree(value->t.oct_seq); + break; + default: + IRDA_DEBUG(0, "%s(), Unknown value type!\n", __func__); + break; + } + kfree(value); +} +EXPORT_SYMBOL(irias_delete_value); diff --git a/net/irda/irlan/Kconfig b/net/irda/irlan/Kconfig new file mode 100644 index 00000000..951abc2e --- /dev/null +++ b/net/irda/irlan/Kconfig @@ -0,0 +1,14 @@ +config IRLAN + tristate "IrLAN protocol" + depends on IRDA + help + Say Y here if you want to build support for the IrLAN protocol. + To compile it as a module, choose M here: the module will be called + irlan. IrLAN emulates an Ethernet and makes it possible to put up + a wireless LAN using infrared beams. + + The IrLAN protocol can be used to talk with infrared access points + like the HP NetbeamIR, or the ESI JetEye NET. You can also connect + to another Linux machine running the IrLAN protocol for ad-hoc + networking! + diff --git a/net/irda/irlan/Makefile b/net/irda/irlan/Makefile new file mode 100644 index 00000000..94eefbc8 --- /dev/null +++ b/net/irda/irlan/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux IrDA IrLAN protocol layer. +# + +obj-$(CONFIG_IRLAN) += irlan.o + +irlan-y := irlan_common.o irlan_eth.o irlan_event.o irlan_client.o irlan_provider.o irlan_filter.o irlan_provider_event.o irlan_client_event.o diff --git a/net/irda/irlan/irlan_client.c b/net/irda/irlan/irlan_client.c new file mode 100644 index 00000000..7ed3af95 --- /dev/null +++ b/net/irda/irlan/irlan_client.c @@ -0,0 +1,576 @@ +/********************************************************************* + * + * Filename: irlan_client.c + * Version: 0.9 + * Description: IrDA LAN Access Protocol (IrLAN) Client + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sun Aug 31 20:14:37 1997 + * Modified at: Tue Dec 14 15:47:02 1999 + * Modified by: Dag Brattli + * Sources: skeleton.c by Donald Becker + * slip.c by Laurence Culhane, + * Fred N. van Kempen, + * + * Copyright (c) 1998-1999 Dag Brattli , + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
+ * + ********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#undef CONFIG_IRLAN_GRATUITOUS_ARP + +static void irlan_client_ctrl_disconnect_indication(void *instance, void *sap, + LM_REASON reason, + struct sk_buff *); +static int irlan_client_ctrl_data_indication(void *instance, void *sap, + struct sk_buff *skb); +static void irlan_client_ctrl_connect_confirm(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *); +static void irlan_check_response_param(struct irlan_cb *self, char *param, + char *value, int val_len); +static void irlan_client_open_ctrl_tsap(struct irlan_cb *self); + +static void irlan_client_kick_timer_expired(void *data) +{ + struct irlan_cb *self = (struct irlan_cb *) data; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + /* + * If we are in peer mode, the client may not have got the discovery + * indication it needs to make progress. If the client is still in + * IDLE state, we must kick it to, but only if the provider is not IDLE + */ + if ((self->provider.access_type == ACCESS_PEER) && + (self->client.state == IRLAN_IDLE) && + (self->provider.state != IRLAN_IDLE)) { + irlan_client_wakeup(self, self->saddr, self->daddr); + } +} + +static void irlan_client_start_kick_timer(struct irlan_cb *self, int timeout) +{ + IRDA_DEBUG(4, "%s()\n", __func__ ); + + irda_start_timer(&self->client.kick_timer, timeout, (void *) self, + irlan_client_kick_timer_expired); +} + +/* + * Function irlan_client_wakeup (self, saddr, daddr) + * + * Wake up client + * + */ +void irlan_client_wakeup(struct irlan_cb *self, __u32 saddr, __u32 daddr) +{ + IRDA_DEBUG(1, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + /* + * Check if we are already awake, or if we are a provider in direct + * mode (in that case we must leave the client idle + */ + if ((self->client.state != IRLAN_IDLE) || + (self->provider.access_type == ACCESS_DIRECT)) + { + IRDA_DEBUG(0, "%s(), already awake!\n", __func__ ); + return; + } + + /* Addresses may have changed! */ + self->saddr = saddr; + self->daddr = daddr; + + if (self->disconnect_reason == LM_USER_REQUEST) { + IRDA_DEBUG(0, "%s(), still stopped by user\n", __func__ ); + return; + } + + /* Open TSAPs */ + irlan_client_open_ctrl_tsap(self); + irlan_open_data_tsap(self); + + irlan_do_client_event(self, IRLAN_DISCOVERY_INDICATION, NULL); + + /* Start kick timer */ + irlan_client_start_kick_timer(self, 2*HZ); +} + +/* + * Function irlan_discovery_indication (daddr) + * + * Remote device with IrLAN server support discovered + * + */ +void irlan_client_discovery_indication(discinfo_t *discovery, + DISCOVERY_MODE mode, + void *priv) +{ + struct irlan_cb *self; + __u32 saddr, daddr; + + IRDA_DEBUG(1, "%s()\n", __func__ ); + + IRDA_ASSERT(discovery != NULL, return;); + + /* + * I didn't check it, but I bet that IrLAN suffer from the same + * deficiency as IrComm and doesn't handle two instances + * simultaneously connecting to each other. + * Same workaround, drop passive discoveries. 
+ * Jean II */ + if(mode == DISCOVERY_PASSIVE) + return; + + saddr = discovery->saddr; + daddr = discovery->daddr; + + /* Find instance */ + rcu_read_lock(); + self = irlan_get_any(); + if (self) { + IRDA_ASSERT(self->magic == IRLAN_MAGIC, goto out;); + + IRDA_DEBUG(1, "%s(), Found instance (%08x)!\n", __func__ , + daddr); + + irlan_client_wakeup(self, saddr, daddr); + } +IRDA_ASSERT_LABEL(out:) + rcu_read_unlock(); +} + +/* + * Function irlan_client_data_indication (handle, skb) + * + * This function gets the data that is received on the control channel + * + */ +static int irlan_client_ctrl_data_indication(void *instance, void *sap, + struct sk_buff *skb) +{ + struct irlan_cb *self; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + self = (struct irlan_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;); + IRDA_ASSERT(skb != NULL, return -1;); + + irlan_do_client_event(self, IRLAN_DATA_INDICATION, skb); + + /* Ready for a new command */ + IRDA_DEBUG(2, "%s(), clearing tx_busy\n", __func__ ); + self->client.tx_busy = FALSE; + + /* Check if we have some queued commands waiting to be sent */ + irlan_run_ctrl_tx_queue(self); + + return 0; +} + +static void irlan_client_ctrl_disconnect_indication(void *instance, void *sap, + LM_REASON reason, + struct sk_buff *userdata) +{ + struct irlan_cb *self; + struct tsap_cb *tsap; + struct sk_buff *skb; + + IRDA_DEBUG(4, "%s(), reason=%d\n", __func__ , reason); + + self = (struct irlan_cb *) instance; + tsap = (struct tsap_cb *) sap; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + IRDA_ASSERT(tsap != NULL, return;); + IRDA_ASSERT(tsap->magic == TTP_TSAP_MAGIC, return;); + + IRDA_ASSERT(tsap == self->client.tsap_ctrl, return;); + + /* Remove frames queued on the control channel */ + while ((skb = skb_dequeue(&self->client.txq)) != NULL) { + dev_kfree_skb(skb); + } + self->client.tx_busy = FALSE; + + irlan_do_client_event(self, IRLAN_LMP_DISCONNECT, NULL); +} + +/* + * Function irlan_client_open_tsaps (self) + * + * Initialize callbacks and open IrTTP TSAPs + * + */ +static void irlan_client_open_ctrl_tsap(struct irlan_cb *self) +{ + struct tsap_cb *tsap; + notify_t notify; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + /* Check if already open */ + if (self->client.tsap_ctrl) + return; + + irda_notify_init(¬ify); + + /* Set up callbacks */ + notify.data_indication = irlan_client_ctrl_data_indication; + notify.connect_confirm = irlan_client_ctrl_connect_confirm; + notify.disconnect_indication = irlan_client_ctrl_disconnect_indication; + notify.instance = self; + strlcpy(notify.name, "IrLAN ctrl (c)", sizeof(notify.name)); + + tsap = irttp_open_tsap(LSAP_ANY, DEFAULT_INITIAL_CREDIT, ¬ify); + if (!tsap) { + IRDA_DEBUG(2, "%s(), Got no tsap!\n", __func__ ); + return; + } + self->client.tsap_ctrl = tsap; +} + +/* + * Function irlan_client_connect_confirm (handle, skb) + * + * Connection to peer IrLAN laye confirmed + * + */ +static void irlan_client_ctrl_connect_confirm(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct irlan_cb *self; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + self = (struct irlan_cb *) instance; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + self->client.max_sdu_size = max_sdu_size; + self->client.max_header_size = 
max_header_size; + + /* TODO: we could set the MTU depending on the max_sdu_size */ + + irlan_do_client_event(self, IRLAN_CONNECT_COMPLETE, NULL); +} + +/* + * Function print_ret_code (code) + * + * Print return code of request to peer IrLAN layer. + * + */ +static void print_ret_code(__u8 code) +{ + switch(code) { + case 0: + printk(KERN_INFO "Success\n"); + break; + case 1: + IRDA_WARNING("IrLAN: Insufficient resources\n"); + break; + case 2: + IRDA_WARNING("IrLAN: Invalid command format\n"); + break; + case 3: + IRDA_WARNING("IrLAN: Command not supported\n"); + break; + case 4: + IRDA_WARNING("IrLAN: Parameter not supported\n"); + break; + case 5: + IRDA_WARNING("IrLAN: Value not supported\n"); + break; + case 6: + IRDA_WARNING("IrLAN: Not open\n"); + break; + case 7: + IRDA_WARNING("IrLAN: Authentication required\n"); + break; + case 8: + IRDA_WARNING("IrLAN: Invalid password\n"); + break; + case 9: + IRDA_WARNING("IrLAN: Protocol error\n"); + break; + case 255: + IRDA_WARNING("IrLAN: Asynchronous status\n"); + break; + } +} + +/* + * Function irlan_client_parse_response (self, skb) + * + * Extract all parameters from received buffer, then feed them to + * check_params for parsing + */ +void irlan_client_parse_response(struct irlan_cb *self, struct sk_buff *skb) +{ + __u8 *frame; + __u8 *ptr; + int count; + int ret; + __u16 val_len; + int i; + char *name; + char *value; + + IRDA_ASSERT(skb != NULL, return;); + + IRDA_DEBUG(4, "%s() skb->len=%d\n", __func__ , (int) skb->len); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + if (!skb) { + IRDA_ERROR("%s(), Got NULL skb!\n", __func__); + return; + } + frame = skb->data; + + /* + * Check return code and print it if not success + */ + if (frame[0]) { + print_ret_code(frame[0]); + return; + } + + name = kmalloc(255, GFP_ATOMIC); + if (!name) + return; + value = kmalloc(1016, GFP_ATOMIC); + if (!value) { + kfree(name); + return; + } + + /* How many parameters? 
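The response body is then walked one name/value pair at a time. A minimal, illustrative sketch of that walk, using the same 255/1016 byte scratch buffer sizes as above and irlan_extract_param() from irlan_common.c (the helper name is hypothetical):

static void example_walk_response(__u8 *frame)
{
	char *name, *value;
	__u8 *ptr;
	__u16 val_len;
	int count, i, ret;

	if (frame[0])		/* non-zero return code, command failed */
		return;

	name = kmalloc(255, GFP_ATOMIC);
	value = kmalloc(1016, GFP_ATOMIC);
	if (!name || !value)
		goto out;

	count = frame[1];	/* number of name/value pairs that follow */
	ptr = frame + 2;

	for (i = 0; i < count; i++) {
		ret = irlan_extract_param(ptr, name, value, &val_len);
		if (ret < 0)	/* -RSP_INVALID_COMMAND_FORMAT on bad data */
			break;
		ptr += ret;	/* step past this name/value pair */
		/* each pair is then matched against the parameters handled
		 * by irlan_check_response_param() (MEDIA, DATA_CHAN, ...) */
	}
out:
	kfree(name);
	kfree(value);
}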
*/ + count = frame[1]; + + IRDA_DEBUG(4, "%s(), got %d parameters\n", __func__ , count); + + ptr = frame+2; + + /* For all parameters */ + for (i=0; imagic == IRLAN_MAGIC, return;); + + /* Media type */ + if (strcmp(param, "MEDIA") == 0) { + if (strcmp(value, "802.3") == 0) + self->media = MEDIA_802_3; + else + self->media = MEDIA_802_5; + return; + } + if (strcmp(param, "FILTER_TYPE") == 0) { + if (strcmp(value, "DIRECTED") == 0) + self->client.filter_type |= IRLAN_DIRECTED; + else if (strcmp(value, "FUNCTIONAL") == 0) + self->client.filter_type |= IRLAN_FUNCTIONAL; + else if (strcmp(value, "GROUP") == 0) + self->client.filter_type |= IRLAN_GROUP; + else if (strcmp(value, "MAC_FRAME") == 0) + self->client.filter_type |= IRLAN_MAC_FRAME; + else if (strcmp(value, "MULTICAST") == 0) + self->client.filter_type |= IRLAN_MULTICAST; + else if (strcmp(value, "BROADCAST") == 0) + self->client.filter_type |= IRLAN_BROADCAST; + else if (strcmp(value, "IPX_SOCKET") == 0) + self->client.filter_type |= IRLAN_IPX_SOCKET; + + } + if (strcmp(param, "ACCESS_TYPE") == 0) { + if (strcmp(value, "DIRECT") == 0) + self->client.access_type = ACCESS_DIRECT; + else if (strcmp(value, "PEER") == 0) + self->client.access_type = ACCESS_PEER; + else if (strcmp(value, "HOSTED") == 0) + self->client.access_type = ACCESS_HOSTED; + else { + IRDA_DEBUG(2, "%s(), unknown access type!\n", __func__ ); + } + } + /* IRLAN version */ + if (strcmp(param, "IRLAN_VER") == 0) { + IRDA_DEBUG(4, "IrLAN version %d.%d\n", (__u8) value[0], + (__u8) value[1]); + + self->version[0] = value[0]; + self->version[1] = value[1]; + return; + } + /* Which remote TSAP to use for data channel */ + if (strcmp(param, "DATA_CHAN") == 0) { + self->dtsap_sel_data = value[0]; + IRDA_DEBUG(4, "Data TSAP = %02x\n", self->dtsap_sel_data); + return; + } + if (strcmp(param, "CON_ARB") == 0) { + memcpy(&tmp_cpu, value, 2); /* Align value */ + le16_to_cpus(&tmp_cpu); /* Convert to host order */ + self->client.recv_arb_val = tmp_cpu; + IRDA_DEBUG(2, "%s(), receive arb val=%d\n", __func__ , + self->client.recv_arb_val); + } + if (strcmp(param, "MAX_FRAME") == 0) { + memcpy(&tmp_cpu, value, 2); /* Align value */ + le16_to_cpus(&tmp_cpu); /* Convert to host order */ + self->client.max_frame = tmp_cpu; + IRDA_DEBUG(4, "%s(), max frame=%d\n", __func__ , + self->client.max_frame); + } + + /* RECONNECT_KEY, in case the link goes down! */ + if (strcmp(param, "RECONNECT_KEY") == 0) { + IRDA_DEBUG(4, "Got reconnect key: "); + /* for (i = 0; i < val_len; i++) */ +/* printk("%02x", value[i]); */ + memcpy(self->client.reconnect_key, value, val_len); + self->client.key_len = val_len; + IRDA_DEBUG(4, "\n"); + } + /* FILTER_ENTRY, have we got an ethernet address? 
*/ + if (strcmp(param, "FILTER_ENTRY") == 0) { + bytes = value; + IRDA_DEBUG(4, "Ethernet address = %pM\n", bytes); + for (i = 0; i < 6; i++) + self->dev->dev_addr[i] = bytes[i]; + } +} + +/* + * Function irlan_client_get_value_confirm (obj_id, value) + * + * Got results from remote LM-IAS + * + */ +void irlan_client_get_value_confirm(int result, __u16 obj_id, + struct ias_value *value, void *priv) +{ + struct irlan_cb *self; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(priv != NULL, return;); + + self = (struct irlan_cb *) priv; + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + /* We probably don't need to make any more queries */ + iriap_close(self->client.iriap); + self->client.iriap = NULL; + + /* Check if request succeeded */ + if (result != IAS_SUCCESS) { + IRDA_DEBUG(2, "%s(), got NULL value!\n", __func__ ); + irlan_do_client_event(self, IRLAN_IAS_PROVIDER_NOT_AVAIL, + NULL); + return; + } + + switch (value->type) { + case IAS_INTEGER: + self->dtsap_sel_ctrl = value->t.integer; + + if (value->t.integer != -1) { + irlan_do_client_event(self, IRLAN_IAS_PROVIDER_AVAIL, + NULL); + return; + } + irias_delete_value(value); + break; + default: + IRDA_DEBUG(2, "%s(), unknown type!\n", __func__ ); + break; + } + irlan_do_client_event(self, IRLAN_IAS_PROVIDER_NOT_AVAIL, NULL); +} diff --git a/net/irda/irlan/irlan_client_event.c b/net/irda/irlan/irlan_client_event.c new file mode 100644 index 00000000..8d5a8ebc --- /dev/null +++ b/net/irda/irlan/irlan_client_event.c @@ -0,0 +1,533 @@ +/********************************************************************* + * + * Filename: irlan_client_event.c + * Version: 0.9 + * Description: IrLAN client state machine + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sun Aug 31 20:14:37 1997 + * Modified at: Sun Dec 26 21:52:24 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli , + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
+ * + ********************************************************************/ + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static int irlan_client_state_idle (struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_client_state_query(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_client_state_conn (struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_client_state_info (struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_client_state_media(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_client_state_open (struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_client_state_wait (struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_client_state_arb (struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_client_state_data (struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_client_state_close(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_client_state_sync (struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); + +static int (*state[])(struct irlan_cb *, IRLAN_EVENT event, struct sk_buff *) = +{ + irlan_client_state_idle, + irlan_client_state_query, + irlan_client_state_conn, + irlan_client_state_info, + irlan_client_state_media, + irlan_client_state_open, + irlan_client_state_wait, + irlan_client_state_arb, + irlan_client_state_data, + irlan_client_state_close, + irlan_client_state_sync +}; + +void irlan_do_client_event(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + (*state[ self->client.state]) (self, event, skb); +} + +/* + * Function irlan_client_state_idle (event, skb, info) + * + * IDLE, We are waiting for an indication that there is a provider + * available. + */ +static int irlan_client_state_idle(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;); + + switch (event) { + case IRLAN_DISCOVERY_INDICATION: + if (self->client.iriap) { + IRDA_WARNING("%s(), busy with a previous query\n", + __func__); + return -EBUSY; + } + + self->client.iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self, + irlan_client_get_value_confirm); + /* Get some values from peer IAS */ + irlan_next_client_state(self, IRLAN_QUERY); + iriap_getvaluebyclass_request(self->client.iriap, + self->saddr, self->daddr, + "IrLAN", "IrDA:TinyTP:LsapSel"); + break; + case IRLAN_WATCHDOG_TIMEOUT: + IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__ ); + break; + default: + IRDA_DEBUG(4, "%s(), Unknown event %d\n", __func__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_client_state_query (event, skb, info) + * + * QUERY, We have queryed the remote IAS and is ready to connect + * to provider, just waiting for the confirm. 
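 *
 *     Note: the ("IrLAN", "IrDA:TinyTP:LsapSel") pair queried from the
 *     IDLE state above is the entry the provider publishes through
 *     irlan_ias_register() in irlan_common.c; the integer that comes back
 *     is stored by irlan_client_get_value_confirm() as dtsap_sel_ctrl and
 *     is what this state connects to on IRLAN_IAS_PROVIDER_AVAIL.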
+ * + */ +static int irlan_client_state_query(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;); + + switch(event) { + case IRLAN_IAS_PROVIDER_AVAIL: + IRDA_ASSERT(self->dtsap_sel_ctrl != 0, return -1;); + + self->client.open_retries = 0; + + irttp_connect_request(self->client.tsap_ctrl, + self->dtsap_sel_ctrl, + self->saddr, self->daddr, NULL, + IRLAN_MTU, NULL); + irlan_next_client_state(self, IRLAN_CONN); + break; + case IRLAN_IAS_PROVIDER_NOT_AVAIL: + IRDA_DEBUG(2, "%s(), IAS_PROVIDER_NOT_AVAIL\n", __func__ ); + irlan_next_client_state(self, IRLAN_IDLE); + + /* Give the client a kick! */ + if ((self->provider.access_type == ACCESS_PEER) && + (self->provider.state != IRLAN_IDLE)) + irlan_client_wakeup(self, self->saddr, self->daddr); + break; + case IRLAN_LMP_DISCONNECT: + case IRLAN_LAP_DISCONNECT: + irlan_next_client_state(self, IRLAN_IDLE); + break; + case IRLAN_WATCHDOG_TIMEOUT: + IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__ ); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %d\n", __func__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_client_state_conn (event, skb, info) + * + * CONN, We have connected to a provider but has not issued any + * commands yet. + * + */ +static int irlan_client_state_conn(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -1;); + + switch (event) { + case IRLAN_CONNECT_COMPLETE: + /* Send getinfo cmd */ + irlan_get_provider_info(self); + irlan_next_client_state(self, IRLAN_INFO); + break; + case IRLAN_LMP_DISCONNECT: + case IRLAN_LAP_DISCONNECT: + irlan_next_client_state(self, IRLAN_IDLE); + break; + case IRLAN_WATCHDOG_TIMEOUT: + IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__ ); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %d\n", __func__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_client_state_info (self, event, skb, info) + * + * INFO, We have issued a GetInfo command and is awaiting a reply. + */ +static int irlan_client_state_info(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -1;); + + switch (event) { + case IRLAN_DATA_INDICATION: + IRDA_ASSERT(skb != NULL, return -1;); + + irlan_client_parse_response(self, skb); + + irlan_next_client_state(self, IRLAN_MEDIA); + + irlan_get_media_char(self); + break; + + case IRLAN_LMP_DISCONNECT: + case IRLAN_LAP_DISCONNECT: + irlan_next_client_state(self, IRLAN_IDLE); + break; + case IRLAN_WATCHDOG_TIMEOUT: + IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__ ); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %d\n", __func__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_client_state_media (self, event, skb, info) + * + * MEDIA, The irlan_client has issued a GetMedia command and is awaiting a + * reply. 
+ * + */ +static int irlan_client_state_media(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -1;); + + switch(event) { + case IRLAN_DATA_INDICATION: + irlan_client_parse_response(self, skb); + irlan_open_data_channel(self); + irlan_next_client_state(self, IRLAN_OPEN); + break; + case IRLAN_LMP_DISCONNECT: + case IRLAN_LAP_DISCONNECT: + irlan_next_client_state(self, IRLAN_IDLE); + break; + case IRLAN_WATCHDOG_TIMEOUT: + IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__ ); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %d\n", __func__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_client_state_open (self, event, skb, info) + * + * OPEN, The irlan_client has issued a OpenData command and is awaiting a + * reply + * + */ +static int irlan_client_state_open(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + struct qos_info qos; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -1;); + + switch(event) { + case IRLAN_DATA_INDICATION: + irlan_client_parse_response(self, skb); + + /* + * Check if we have got the remote TSAP for data + * communications + */ + IRDA_ASSERT(self->dtsap_sel_data != 0, return -1;); + + /* Check which access type we are dealing with */ + switch (self->client.access_type) { + case ACCESS_PEER: + if (self->provider.state == IRLAN_OPEN) { + + irlan_next_client_state(self, IRLAN_ARB); + irlan_do_client_event(self, IRLAN_CHECK_CON_ARB, + NULL); + } else { + + irlan_next_client_state(self, IRLAN_WAIT); + } + break; + case ACCESS_DIRECT: + case ACCESS_HOSTED: + qos.link_disc_time.bits = 0x01; /* 3 secs */ + + irttp_connect_request(self->tsap_data, + self->dtsap_sel_data, + self->saddr, self->daddr, &qos, + IRLAN_MTU, NULL); + + irlan_next_client_state(self, IRLAN_DATA); + break; + default: + IRDA_DEBUG(2, "%s(), unknown access type!\n", __func__ ); + break; + } + break; + case IRLAN_LMP_DISCONNECT: + case IRLAN_LAP_DISCONNECT: + irlan_next_client_state(self, IRLAN_IDLE); + break; + case IRLAN_WATCHDOG_TIMEOUT: + IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__ ); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %d\n", __func__ , event); + break; + } + + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_client_state_wait (self, event, skb, info) + * + * WAIT, The irlan_client is waiting for the local provider to enter the + * provider OPEN state. 
+ * + */ +static int irlan_client_state_wait(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -1;); + + switch(event) { + case IRLAN_PROVIDER_SIGNAL: + irlan_next_client_state(self, IRLAN_ARB); + irlan_do_client_event(self, IRLAN_CHECK_CON_ARB, NULL); + break; + case IRLAN_LMP_DISCONNECT: + case IRLAN_LAP_DISCONNECT: + irlan_next_client_state(self, IRLAN_IDLE); + break; + case IRLAN_WATCHDOG_TIMEOUT: + IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__ ); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %d\n", __func__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +static int irlan_client_state_arb(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + struct qos_info qos; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -1;); + + switch(event) { + case IRLAN_CHECK_CON_ARB: + if (self->client.recv_arb_val == self->provider.send_arb_val) { + irlan_next_client_state(self, IRLAN_CLOSE); + irlan_close_data_channel(self); + } else if (self->client.recv_arb_val < + self->provider.send_arb_val) + { + qos.link_disc_time.bits = 0x01; /* 3 secs */ + + irlan_next_client_state(self, IRLAN_DATA); + irttp_connect_request(self->tsap_data, + self->dtsap_sel_data, + self->saddr, self->daddr, &qos, + IRLAN_MTU, NULL); + } else if (self->client.recv_arb_val > + self->provider.send_arb_val) + { + IRDA_DEBUG(2, "%s(), lost the battle :-(\n", __func__ ); + } + break; + case IRLAN_DATA_CONNECT_INDICATION: + irlan_next_client_state(self, IRLAN_DATA); + break; + case IRLAN_LMP_DISCONNECT: + case IRLAN_LAP_DISCONNECT: + irlan_next_client_state(self, IRLAN_IDLE); + break; + case IRLAN_WATCHDOG_TIMEOUT: + IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__ ); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %d\n", __func__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_client_state_data (self, event, skb, info) + * + * DATA, The data channel is connected, allowing data transfers between + * the local and remote machines. 
+ * + */ +static int irlan_client_state_data(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;); + + switch(event) { + case IRLAN_DATA_INDICATION: + irlan_client_parse_response(self, skb); + break; + case IRLAN_LMP_DISCONNECT: /* FALLTHROUGH */ + case IRLAN_LAP_DISCONNECT: + irlan_next_client_state(self, IRLAN_IDLE); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %d\n", __func__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_client_state_close (self, event, skb, info) + * + * + * + */ +static int irlan_client_state_close(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(2, "%s()\n", __func__ ); + + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_client_state_sync (self, event, skb, info) + * + * + * + */ +static int irlan_client_state_sync(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(2, "%s()\n", __func__ ); + + if (skb) + dev_kfree_skb(skb); + + return 0; +} + + + + + + + + + + + + + diff --git a/net/irda/irlan/irlan_common.c b/net/irda/irlan/irlan_common.c new file mode 100644 index 00000000..6130f9d9 --- /dev/null +++ b/net/irda/irlan/irlan_common.c @@ -0,0 +1,1214 @@ +/********************************************************************* + * + * Filename: irlan_common.c + * Version: 0.9 + * Description: IrDA LAN Access Protocol Implementation + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sun Aug 31 20:14:37 1997 + * Modified at: Sun Dec 26 21:53:10 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1997, 1999 Dag Brattli , + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
+ * + ********************************************************************/ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + +/* extern char sysctl_devname[]; */ + +/* + * Master structure + */ +static LIST_HEAD(irlans); + +static void *ckey; +static void *skey; + +/* Module parameters */ +static int eth; /* Use "eth" or "irlan" name for devices */ +static int access = ACCESS_PEER; /* PEER, DIRECT or HOSTED */ + +#ifdef CONFIG_PROC_FS +static const char *const irlan_access[] = { + "UNKNOWN", + "DIRECT", + "PEER", + "HOSTED" +}; + +static const char *const irlan_media[] = { + "UNKNOWN", + "802.3", + "802.5" +}; + +extern struct proc_dir_entry *proc_irda; + +static int irlan_seq_open(struct inode *inode, struct file *file); + +static const struct file_operations irlan_fops = { + .owner = THIS_MODULE, + .open = irlan_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +extern struct proc_dir_entry *proc_irda; +#endif /* CONFIG_PROC_FS */ + +static struct irlan_cb *irlan_open(__u32 saddr, __u32 daddr); +static void __irlan_close(struct irlan_cb *self); +static int __irlan_insert_param(struct sk_buff *skb, char *param, int type, + __u8 value_byte, __u16 value_short, + __u8 *value_array, __u16 value_len); +static void irlan_open_unicast_addr(struct irlan_cb *self); +static void irlan_get_unicast_addr(struct irlan_cb *self); +void irlan_close_tsaps(struct irlan_cb *self); + +/* + * Function irlan_init (void) + * + * Initialize IrLAN layer + * + */ +static int __init irlan_init(void) +{ + struct irlan_cb *new; + __u16 hints; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + +#ifdef CONFIG_PROC_FS + { struct proc_dir_entry *proc; + proc = proc_create("irlan", 0, proc_irda, &irlan_fops); + if (!proc) { + printk(KERN_ERR "irlan_init: can't create /proc entry!\n"); + return -ENODEV; + } + } +#endif /* CONFIG_PROC_FS */ + + IRDA_DEBUG(4, "%s()\n", __func__ ); + hints = irlmp_service_to_hint(S_LAN); + + /* Register with IrLMP as a client */ + ckey = irlmp_register_client(hints, &irlan_client_discovery_indication, + NULL, NULL); + if (!ckey) + goto err_ckey; + + /* Register with IrLMP as a service */ + skey = irlmp_register_service(hints); + if (!skey) + goto err_skey; + + /* Start the master IrLAN instance (the only one for now) */ + new = irlan_open(DEV_ADDR_ANY, DEV_ADDR_ANY); + if (!new) + goto err_open; + + /* The master will only open its (listen) control TSAP */ + irlan_provider_open_ctrl_tsap(new); + + /* Do some fast discovery! 
*/ + irlmp_discovery_request(DISCOVERY_DEFAULT_SLOTS); + + return 0; + +err_open: + irlmp_unregister_service(skey); +err_skey: + irlmp_unregister_client(ckey); +err_ckey: +#ifdef CONFIG_PROC_FS + remove_proc_entry("irlan", proc_irda); +#endif /* CONFIG_PROC_FS */ + + return -ENOMEM; +} + +static void __exit irlan_cleanup(void) +{ + struct irlan_cb *self, *next; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + irlmp_unregister_client(ckey); + irlmp_unregister_service(skey); + +#ifdef CONFIG_PROC_FS + remove_proc_entry("irlan", proc_irda); +#endif /* CONFIG_PROC_FS */ + + /* Cleanup any leftover network devices */ + rtnl_lock(); + list_for_each_entry_safe(self, next, &irlans, dev_list) { + __irlan_close(self); + } + rtnl_unlock(); +} + +/* + * Function irlan_open (void) + * + * Open new instance of a client/provider, we should only register the + * network device if this instance is ment for a particular client/provider + */ +static struct irlan_cb *irlan_open(__u32 saddr, __u32 daddr) +{ + struct net_device *dev; + struct irlan_cb *self; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + /* Create network device with irlan */ + dev = alloc_irlandev(eth ? "eth%d" : "irlan%d"); + if (!dev) + return NULL; + + self = netdev_priv(dev); + self->dev = dev; + + /* + * Initialize local device structure + */ + self->magic = IRLAN_MAGIC; + self->saddr = saddr; + self->daddr = daddr; + + /* Provider access can only be PEER, DIRECT, or HOSTED */ + self->provider.access_type = access; + if (access == ACCESS_DIRECT) { + /* + * Since we are emulating an IrLAN sever we will have to + * give ourself an ethernet address! + */ + dev->dev_addr[0] = 0x40; + dev->dev_addr[1] = 0x00; + dev->dev_addr[2] = 0x00; + dev->dev_addr[3] = 0x00; + get_random_bytes(dev->dev_addr+4, 1); + get_random_bytes(dev->dev_addr+5, 1); + } + + self->media = MEDIA_802_3; + self->disconnect_reason = LM_USER_REQUEST; + init_timer(&self->watchdog_timer); + init_timer(&self->client.kick_timer); + init_waitqueue_head(&self->open_wait); + + skb_queue_head_init(&self->client.txq); + + irlan_next_client_state(self, IRLAN_IDLE); + irlan_next_provider_state(self, IRLAN_IDLE); + + if (register_netdev(dev)) { + IRDA_DEBUG(2, "%s(), register_netdev() failed!\n", + __func__ ); + self = NULL; + free_netdev(dev); + } else { + rtnl_lock(); + list_add_rcu(&self->dev_list, &irlans); + rtnl_unlock(); + } + + return self; +} +/* + * Function __irlan_close (self) + * + * This function closes and deallocates the IrLAN client instances. Be + * aware that other functions which calls client_close() must + * remove self from irlans list first. 
+ */ +static void __irlan_close(struct irlan_cb *self) +{ + IRDA_DEBUG(2, "%s()\n", __func__ ); + + ASSERT_RTNL(); + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + del_timer_sync(&self->watchdog_timer); + del_timer_sync(&self->client.kick_timer); + + /* Close all open connections and remove TSAPs */ + irlan_close_tsaps(self); + + if (self->client.iriap) + iriap_close(self->client.iriap); + + /* Remove frames queued on the control channel */ + skb_queue_purge(&self->client.txq); + + /* Unregister and free self via destructor */ + unregister_netdevice(self->dev); +} + +/* Find any instance of irlan, used for client discovery wakeup */ +struct irlan_cb *irlan_get_any(void) +{ + struct irlan_cb *self; + + list_for_each_entry_rcu(self, &irlans, dev_list) { + return self; + } + return NULL; +} + +/* + * Function irlan_connect_indication (instance, sap, qos, max_sdu_size, skb) + * + * Here we receive the connect indication for the data channel + * + */ +static void irlan_connect_indication(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct irlan_cb *self; + struct tsap_cb *tsap; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + self = (struct irlan_cb *) instance; + tsap = (struct tsap_cb *) sap; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + IRDA_ASSERT(tsap == self->tsap_data,return;); + + self->max_sdu_size = max_sdu_size; + self->max_header_size = max_header_size; + + IRDA_DEBUG(0, "%s: We are now connected!\n", __func__); + + del_timer(&self->watchdog_timer); + + /* If you want to pass the skb to *both* state machines, you will + * need to skb_clone() it, so that you don't free it twice. + * As the state machines don't need it, git rid of it here... + * Jean II */ + if (skb) + dev_kfree_skb(skb); + + irlan_do_provider_event(self, IRLAN_DATA_CONNECT_INDICATION, NULL); + irlan_do_client_event(self, IRLAN_DATA_CONNECT_INDICATION, NULL); + + if (self->provider.access_type == ACCESS_PEER) { + /* + * Data channel is open, so we are now allowed to + * configure the remote filter + */ + irlan_get_unicast_addr(self); + irlan_open_unicast_addr(self); + } + /* Ready to transfer Ethernet frames (at last) */ + netif_start_queue(self->dev); /* Clear reason */ +} + +static void irlan_connect_confirm(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct irlan_cb *self; + + self = (struct irlan_cb *) instance; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + self->max_sdu_size = max_sdu_size; + self->max_header_size = max_header_size; + + /* TODO: we could set the MTU depending on the max_sdu_size */ + + IRDA_DEBUG(0, "%s: We are now connected!\n", __func__); + del_timer(&self->watchdog_timer); + + /* + * Data channel is open, so we are now allowed to configure the remote + * filter + */ + irlan_get_unicast_addr(self); + irlan_open_unicast_addr(self); + + /* Open broadcast and multicast filter by default */ + irlan_set_broadcast_filter(self, TRUE); + irlan_set_multicast_filter(self, TRUE); + + /* Ready to transfer Ethernet frames */ + netif_start_queue(self->dev); + self->disconnect_reason = 0; /* Clear reason */ + wake_up_interruptible(&self->open_wait); +} + +/* + * Function irlan_client_disconnect_indication (handle) + * + * Callback function for the IrTTP layer. 
Indicates a disconnection of + * the specified connection (handle) + */ +static void irlan_disconnect_indication(void *instance, + void *sap, LM_REASON reason, + struct sk_buff *userdata) +{ + struct irlan_cb *self; + struct tsap_cb *tsap; + + IRDA_DEBUG(0, "%s(), reason=%d\n", __func__ , reason); + + self = (struct irlan_cb *) instance; + tsap = (struct tsap_cb *) sap; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + IRDA_ASSERT(tsap != NULL, return;); + IRDA_ASSERT(tsap->magic == TTP_TSAP_MAGIC, return;); + + IRDA_ASSERT(tsap == self->tsap_data, return;); + + IRDA_DEBUG(2, "IrLAN, data channel disconnected by peer!\n"); + + /* Save reason so we know if we should try to reconnect or not */ + self->disconnect_reason = reason; + + switch (reason) { + case LM_USER_REQUEST: /* User request */ + IRDA_DEBUG(2, "%s(), User requested\n", __func__ ); + break; + case LM_LAP_DISCONNECT: /* Unexpected IrLAP disconnect */ + IRDA_DEBUG(2, "%s(), Unexpected IrLAP disconnect\n", __func__ ); + break; + case LM_CONNECT_FAILURE: /* Failed to establish IrLAP connection */ + IRDA_DEBUG(2, "%s(), IrLAP connect failed\n", __func__ ); + break; + case LM_LAP_RESET: /* IrLAP reset */ + IRDA_DEBUG(2, "%s(), IrLAP reset\n", __func__ ); + break; + case LM_INIT_DISCONNECT: + IRDA_DEBUG(2, "%s(), IrLMP connect failed\n", __func__ ); + break; + default: + IRDA_ERROR("%s(), Unknown disconnect reason\n", __func__); + break; + } + + /* If you want to pass the skb to *both* state machines, you will + * need to skb_clone() it, so that you don't free it twice. + * As the state machines don't need it, git rid of it here... + * Jean II */ + if (userdata) + dev_kfree_skb(userdata); + + irlan_do_client_event(self, IRLAN_LMP_DISCONNECT, NULL); + irlan_do_provider_event(self, IRLAN_LMP_DISCONNECT, NULL); + + wake_up_interruptible(&self->open_wait); +} + +void irlan_open_data_tsap(struct irlan_cb *self) +{ + struct tsap_cb *tsap; + notify_t notify; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + /* Check if already open */ + if (self->tsap_data) + return; + + irda_notify_init(¬ify); + + notify.data_indication = irlan_eth_receive; + notify.udata_indication = irlan_eth_receive; + notify.connect_indication = irlan_connect_indication; + notify.connect_confirm = irlan_connect_confirm; + notify.flow_indication = irlan_eth_flow_indication; + notify.disconnect_indication = irlan_disconnect_indication; + notify.instance = self; + strlcpy(notify.name, "IrLAN data", sizeof(notify.name)); + + tsap = irttp_open_tsap(LSAP_ANY, DEFAULT_INITIAL_CREDIT, ¬ify); + if (!tsap) { + IRDA_DEBUG(2, "%s(), Got no tsap!\n", __func__ ); + return; + } + self->tsap_data = tsap; + + /* + * This is the data TSAP selector which we will pass to the client + * when the client ask for it. 
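 *
 * (On the client side this value arrives in the DATA_CHAN parameter and
 * is stored as dtsap_sel_data by irlan_check_response_param(), which is
 * what the client's OPEN and ARB states later connect to.)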
+ */ + self->stsap_sel_data = self->tsap_data->stsap_sel; +} + +void irlan_close_tsaps(struct irlan_cb *self) +{ + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + /* Disconnect and close all open TSAP connections */ + if (self->tsap_data) { + irttp_disconnect_request(self->tsap_data, NULL, P_NORMAL); + irttp_close_tsap(self->tsap_data); + self->tsap_data = NULL; + } + if (self->client.tsap_ctrl) { + irttp_disconnect_request(self->client.tsap_ctrl, NULL, + P_NORMAL); + irttp_close_tsap(self->client.tsap_ctrl); + self->client.tsap_ctrl = NULL; + } + if (self->provider.tsap_ctrl) { + irttp_disconnect_request(self->provider.tsap_ctrl, NULL, + P_NORMAL); + irttp_close_tsap(self->provider.tsap_ctrl); + self->provider.tsap_ctrl = NULL; + } + self->disconnect_reason = LM_USER_REQUEST; +} + +/* + * Function irlan_ias_register (self, tsap_sel) + * + * Register with LM-IAS + * + */ +void irlan_ias_register(struct irlan_cb *self, __u8 tsap_sel) +{ + struct ias_object *obj; + struct ias_value *new_value; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + /* + * Check if object has already been registered by a previous provider. + * If that is the case, we just change the value of the attribute + */ + if (!irias_find_object("IrLAN")) { + obj = irias_new_object("IrLAN", IAS_IRLAN_ID); + irias_add_integer_attrib(obj, "IrDA:TinyTP:LsapSel", tsap_sel, + IAS_KERNEL_ATTR); + irias_insert_object(obj); + } else { + new_value = irias_new_integer_value(tsap_sel); + irias_object_change_attribute("IrLAN", "IrDA:TinyTP:LsapSel", + new_value); + } + + /* Register PnP object only if not registered before */ + if (!irias_find_object("PnP")) { + obj = irias_new_object("PnP", IAS_PNP_ID); +#if 0 + irias_add_string_attrib(obj, "Name", sysctl_devname, + IAS_KERNEL_ATTR); +#else + irias_add_string_attrib(obj, "Name", "Linux", IAS_KERNEL_ATTR); +#endif + irias_add_string_attrib(obj, "DeviceID", "HWP19F0", + IAS_KERNEL_ATTR); + irias_add_integer_attrib(obj, "CompCnt", 1, IAS_KERNEL_ATTR); + if (self->provider.access_type == ACCESS_PEER) + irias_add_string_attrib(obj, "Comp#01", "PNP8389", + IAS_KERNEL_ATTR); + else + irias_add_string_attrib(obj, "Comp#01", "PNP8294", + IAS_KERNEL_ATTR); + + irias_add_string_attrib(obj, "Manufacturer", + "Linux-IrDA Project", IAS_KERNEL_ATTR); + irias_insert_object(obj); + } +} + +/* + * Function irlan_run_ctrl_tx_queue (self) + * + * Try to send the next command in the control transmit queue + * + */ +int irlan_run_ctrl_tx_queue(struct irlan_cb *self) +{ + struct sk_buff *skb; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + if (irda_lock(&self->client.tx_busy) == FALSE) + return -EBUSY; + + skb = skb_dequeue(&self->client.txq); + if (!skb) { + self->client.tx_busy = FALSE; + return 0; + } + + /* Check that it's really possible to send commands */ + if ((self->client.tsap_ctrl == NULL) || + (self->client.state == IRLAN_IDLE)) + { + self->client.tx_busy = FALSE; + dev_kfree_skb(skb); + return -1; + } + IRDA_DEBUG(2, "%s(), sending ...\n", __func__ ); + + return irttp_data_request(self->client.tsap_ctrl, skb); +} + +/* + * Function irlan_ctrl_data_request (self, skb) + * + * This function makes sure that commands on the control channel is being + * sent in a command/response fashion + */ +static void irlan_ctrl_data_request(struct irlan_cb *self, struct sk_buff *skb) +{ + IRDA_DEBUG(2, "%s()\n", __func__ ); + + /* Queue command */ + skb_queue_tail(&self->client.txq, skb); 
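	/*
	 * Commands are serialized one at a time: irlan_run_ctrl_tx_queue()
	 * only dequeues when it wins the tx_busy flag via irda_lock(), and
	 * the flag is cleared again in irlan_client_ctrl_data_indication()
	 * once the peer's response arrives, which re-runs this queue.
	 */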
+ + /* Try to send command */ + irlan_run_ctrl_tx_queue(self); +} + +/* + * Function irlan_get_provider_info (self) + * + * Send Get Provider Information command to peer IrLAN layer + * + */ +void irlan_get_provider_info(struct irlan_cb *self) +{ + struct sk_buff *skb; + __u8 *frame; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER, + GFP_ATOMIC); + if (!skb) + return; + + /* Reserve space for TTP, LMP, and LAP header */ + skb_reserve(skb, self->client.max_header_size); + skb_put(skb, 2); + + frame = skb->data; + + frame[0] = CMD_GET_PROVIDER_INFO; + frame[1] = 0x00; /* Zero parameters */ + + irlan_ctrl_data_request(self, skb); +} + +/* + * Function irlan_open_data_channel (self) + * + * Send an Open Data Command to provider + * + */ +void irlan_open_data_channel(struct irlan_cb *self) +{ + struct sk_buff *skb; + __u8 *frame; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER + + IRLAN_STRING_PARAMETER_LEN("MEDIA", "802.3") + + IRLAN_STRING_PARAMETER_LEN("ACCESS_TYPE", "DIRECT"), + GFP_ATOMIC); + if (!skb) + return; + + skb_reserve(skb, self->client.max_header_size); + skb_put(skb, 2); + + frame = skb->data; + + /* Build frame */ + frame[0] = CMD_OPEN_DATA_CHANNEL; + frame[1] = 0x02; /* Two parameters */ + + irlan_insert_string_param(skb, "MEDIA", "802.3"); + irlan_insert_string_param(skb, "ACCESS_TYPE", "DIRECT"); + /* irlan_insert_string_param(skb, "MODE", "UNRELIABLE"); */ + +/* self->use_udata = TRUE; */ + + irlan_ctrl_data_request(self, skb); +} + +void irlan_close_data_channel(struct irlan_cb *self) +{ + struct sk_buff *skb; + __u8 *frame; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + /* Check if the TSAP is still there */ + if (self->client.tsap_ctrl == NULL) + return; + + skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER + + IRLAN_BYTE_PARAMETER_LEN("DATA_CHAN"), + GFP_ATOMIC); + if (!skb) + return; + + skb_reserve(skb, self->client.max_header_size); + skb_put(skb, 2); + + frame = skb->data; + + /* Build frame */ + frame[0] = CMD_CLOSE_DATA_CHAN; + frame[1] = 0x01; /* One parameter */ + + irlan_insert_byte_param(skb, "DATA_CHAN", self->dtsap_sel_data); + + irlan_ctrl_data_request(self, skb); +} + +/* + * Function irlan_open_unicast_addr (self) + * + * Make IrLAN provider accept ethernet frames addressed to the unicast + * address. 
+ * + */ +static void irlan_open_unicast_addr(struct irlan_cb *self) +{ + struct sk_buff *skb; + __u8 *frame; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER + + IRLAN_BYTE_PARAMETER_LEN("DATA_CHAN") + + IRLAN_STRING_PARAMETER_LEN("FILTER_TYPE", "DIRECTED") + + IRLAN_STRING_PARAMETER_LEN("FILTER_MODE", "FILTER"), + GFP_ATOMIC); + if (!skb) + return; + + /* Reserve space for TTP, LMP, and LAP header */ + skb_reserve(skb, self->max_header_size); + skb_put(skb, 2); + + frame = skb->data; + + frame[0] = CMD_FILTER_OPERATION; + frame[1] = 0x03; /* Three parameters */ + irlan_insert_byte_param(skb, "DATA_CHAN" , self->dtsap_sel_data); + irlan_insert_string_param(skb, "FILTER_TYPE", "DIRECTED"); + irlan_insert_string_param(skb, "FILTER_MODE", "FILTER"); + + irlan_ctrl_data_request(self, skb); +} + +/* + * Function irlan_set_broadcast_filter (self, status) + * + * Make IrLAN provider accept ethernet frames addressed to the broadcast + * address. Be careful with the use of this one, since there may be a lot + * of broadcast traffic out there. We can still function without this + * one but then _we_ have to initiate all communication with other + * hosts, since ARP request for this host will not be answered. + */ +void irlan_set_broadcast_filter(struct irlan_cb *self, int status) +{ + struct sk_buff *skb; + __u8 *frame; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER + + IRLAN_BYTE_PARAMETER_LEN("DATA_CHAN") + + IRLAN_STRING_PARAMETER_LEN("FILTER_TYPE", "BROADCAST") + + /* We may waste one byte here...*/ + IRLAN_STRING_PARAMETER_LEN("FILTER_MODE", "FILTER"), + GFP_ATOMIC); + if (!skb) + return; + + /* Reserve space for TTP, LMP, and LAP header */ + skb_reserve(skb, self->client.max_header_size); + skb_put(skb, 2); + + frame = skb->data; + + frame[0] = CMD_FILTER_OPERATION; + frame[1] = 0x03; /* Three parameters */ + irlan_insert_byte_param(skb, "DATA_CHAN", self->dtsap_sel_data); + irlan_insert_string_param(skb, "FILTER_TYPE", "BROADCAST"); + if (status) + irlan_insert_string_param(skb, "FILTER_MODE", "FILTER"); + else + irlan_insert_string_param(skb, "FILTER_MODE", "NONE"); + + irlan_ctrl_data_request(self, skb); +} + +/* + * Function irlan_set_multicast_filter (self, status) + * + * Make IrLAN provider accept ethernet frames addressed to the multicast + * address. 
+ * + */ +void irlan_set_multicast_filter(struct irlan_cb *self, int status) +{ + struct sk_buff *skb; + __u8 *frame; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER + + IRLAN_BYTE_PARAMETER_LEN("DATA_CHAN") + + IRLAN_STRING_PARAMETER_LEN("FILTER_TYPE", "MULTICAST") + + /* We may waste one byte here...*/ + IRLAN_STRING_PARAMETER_LEN("FILTER_MODE", "NONE"), + GFP_ATOMIC); + if (!skb) + return; + + /* Reserve space for TTP, LMP, and LAP header */ + skb_reserve(skb, self->client.max_header_size); + skb_put(skb, 2); + + frame = skb->data; + + frame[0] = CMD_FILTER_OPERATION; + frame[1] = 0x03; /* Three parameters */ + irlan_insert_byte_param(skb, "DATA_CHAN", self->dtsap_sel_data); + irlan_insert_string_param(skb, "FILTER_TYPE", "MULTICAST"); + if (status) + irlan_insert_string_param(skb, "FILTER_MODE", "ALL"); + else + irlan_insert_string_param(skb, "FILTER_MODE", "NONE"); + + irlan_ctrl_data_request(self, skb); +} + +/* + * Function irlan_get_unicast_addr (self) + * + * Retrieves the unicast address from the IrLAN provider. This address + * will be inserted into the devices structure, so the ethernet layer + * can construct its packets. + * + */ +static void irlan_get_unicast_addr(struct irlan_cb *self) +{ + struct sk_buff *skb; + __u8 *frame; + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER + + IRLAN_BYTE_PARAMETER_LEN("DATA_CHAN") + + IRLAN_STRING_PARAMETER_LEN("FILTER_TYPE", "DIRECTED") + + IRLAN_STRING_PARAMETER_LEN("FILTER_OPERATION", + "DYNAMIC"), + GFP_ATOMIC); + if (!skb) + return; + + /* Reserve space for TTP, LMP, and LAP header */ + skb_reserve(skb, self->client.max_header_size); + skb_put(skb, 2); + + frame = skb->data; + + frame[0] = CMD_FILTER_OPERATION; + frame[1] = 0x03; /* Three parameters */ + irlan_insert_byte_param(skb, "DATA_CHAN", self->dtsap_sel_data); + irlan_insert_string_param(skb, "FILTER_TYPE", "DIRECTED"); + irlan_insert_string_param(skb, "FILTER_OPERATION", "DYNAMIC"); + + irlan_ctrl_data_request(self, skb); +} + +/* + * Function irlan_get_media_char (self) + * + * + * + */ +void irlan_get_media_char(struct irlan_cb *self) +{ + struct sk_buff *skb; + __u8 *frame; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER + + IRLAN_STRING_PARAMETER_LEN("MEDIA", "802.3"), + GFP_ATOMIC); + + if (!skb) + return; + + /* Reserve space for TTP, LMP, and LAP header */ + skb_reserve(skb, self->client.max_header_size); + skb_put(skb, 2); + + frame = skb->data; + + /* Build frame */ + frame[0] = CMD_GET_MEDIA_CHAR; + frame[1] = 0x01; /* One parameter */ + + irlan_insert_string_param(skb, "MEDIA", "802.3"); + irlan_ctrl_data_request(self, skb); +} + +/* + * Function insert_byte_param (skb, param, value) + * + * Insert byte parameter into frame + * + */ +int irlan_insert_byte_param(struct sk_buff *skb, char *param, __u8 value) +{ + return __irlan_insert_param(skb, param, IRLAN_BYTE, value, 0, NULL, 0); +} + +int irlan_insert_short_param(struct sk_buff *skb, char *param, __u16 value) +{ + return __irlan_insert_param(skb, param, IRLAN_SHORT, 0, value, NULL, 0); +} + +/* + * Function insert_string (skb, param, value) + * + * Insert string parameter into frame + * + */ +int 
irlan_insert_string_param(struct sk_buff *skb, char *param, char *string) +{ + int string_len = strlen(string); + + return __irlan_insert_param(skb, param, IRLAN_ARRAY, 0, 0, string, + string_len); +} + +/* + * Function insert_array_param(skb, param, value, len_value) + * + * Insert array parameter into frame + * + */ +int irlan_insert_array_param(struct sk_buff *skb, char *name, __u8 *array, + __u16 array_len) +{ + return __irlan_insert_param(skb, name, IRLAN_ARRAY, 0, 0, array, + array_len); +} + +/* + * Function insert_param (skb, param, value, byte) + * + * Insert parameter at end of buffer, structure of a parameter is: + * + * ----------------------------------------------------------------------- + * | Name Length[1] | Param Name[1..255] | Val Length[2] | Value[0..1016]| + * ----------------------------------------------------------------------- + */ +static int __irlan_insert_param(struct sk_buff *skb, char *param, int type, + __u8 value_byte, __u16 value_short, + __u8 *value_array, __u16 value_len) +{ + __u8 *frame; + __u8 param_len; + __le16 tmp_le; /* Temporary value in little endian format */ + int n=0; + + if (skb == NULL) { + IRDA_DEBUG(2, "%s(), Got NULL skb\n", __func__ ); + return 0; + } + + param_len = strlen(param); + switch (type) { + case IRLAN_BYTE: + value_len = 1; + break; + case IRLAN_SHORT: + value_len = 2; + break; + case IRLAN_ARRAY: + IRDA_ASSERT(value_array != NULL, return 0;); + IRDA_ASSERT(value_len > 0, return 0;); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown parameter type!\n", __func__ ); + return 0; + break; + } + + /* Insert at end of sk-buffer */ + frame = skb_tail_pointer(skb); + + /* Make space for data */ + if (skb_tailroom(skb) < (param_len+value_len+3)) { + IRDA_DEBUG(2, "%s(), No more space at end of skb\n", __func__ ); + return 0; + } + skb_put(skb, param_len+value_len+3); + + /* Insert parameter length */ + frame[n++] = param_len; + + /* Insert parameter */ + memcpy(frame+n, param, param_len); n += param_len; + + /* Insert value length (2 byte little endian format, LSB first) */ + tmp_le = cpu_to_le16(value_len); + memcpy(frame+n, &tmp_le, 2); n += 2; /* To avoid alignment problems */ + + /* Insert value */ + switch (type) { + case IRLAN_BYTE: + frame[n++] = value_byte; + break; + case IRLAN_SHORT: + tmp_le = cpu_to_le16(value_short); + memcpy(frame+n, &tmp_le, 2); n += 2; + break; + case IRLAN_ARRAY: + memcpy(frame+n, value_array, value_len); n+=value_len; + break; + default: + break; + } + IRDA_ASSERT(n == (param_len+value_len+3), return 0;); + + return param_len+value_len+3; +} + +/* + * Function irlan_extract_param (buf, name, value, len) + * + * Extracts a single parameter name/value pair from buffer and updates + * the buffer pointer to point to the next name/value pair. 
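[Editor's example] A minimal sketch of the parameter layout documented above, for the IRLAN_SHORT case: the 2-byte value-length field and the short value itself both go out LSB first, and the total is name_len + value_len + 3, which is exactly what __irlan_insert_param asserts before returning. The ex_ helper is illustrative only; the kernel code uses cpu_to_le16 plus memcpy to avoid unaligned stores.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Encode one IRLAN_SHORT parameter:
 * | name len (1) | name | value len (2, LE) | value (2, LE) | */
static int ex_put_short(uint8_t *buf, const char *name, uint16_t value)
{
    uint8_t name_len = (uint8_t)strlen(name);
    int n = 0;

    buf[n++] = name_len;                 /* name length (1 byte)            */
    memcpy(buf + n, name, name_len);     /* parameter name                  */
    n += name_len;
    buf[n++] = 2;                        /* value length, little endian     */
    buf[n++] = 0;
    buf[n++] = value & 0xff;             /* value, LSB first                */
    buf[n++] = (value >> 8) & 0xff;
    return n;                            /* == name_len + value_len + 3     */
}

int main(void)
{
    uint8_t buf[64];
    int n = ex_put_short(buf, "IRLAN_VER", 0x0101);

    printf("encoded %d bytes (expected %zu)\n", n, strlen("IRLAN_VER") + 2 + 3);
    for (int i = 0; i < n; i++)
        printf("%02x ", buf[i]);
    printf("\n");
    return 0;
}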
+ */ +int irlan_extract_param(__u8 *buf, char *name, char *value, __u16 *len) +{ + __u8 name_len; + __u16 val_len; + int n=0; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + /* get length of parameter name (1 byte) */ + name_len = buf[n++]; + + if (name_len > 254) { + IRDA_DEBUG(2, "%s(), name_len > 254\n", __func__ ); + return -RSP_INVALID_COMMAND_FORMAT; + } + + /* get parameter name */ + memcpy(name, buf+n, name_len); + name[name_len] = '\0'; + n+=name_len; + + /* + * Get length of parameter value (2 bytes in little endian + * format) + */ + memcpy(&val_len, buf+n, 2); /* To avoid alignment problems */ + le16_to_cpus(&val_len); n+=2; + + if (val_len >= 1016) { + IRDA_DEBUG(2, "%s(), parameter length to long\n", __func__ ); + return -RSP_INVALID_COMMAND_FORMAT; + } + *len = val_len; + + /* get parameter value */ + memcpy(value, buf+n, val_len); + value[val_len] = '\0'; + n+=val_len; + + IRDA_DEBUG(4, "Parameter: %s ", name); + IRDA_DEBUG(4, "Value: %s\n", value); + + return n; +} + +#ifdef CONFIG_PROC_FS + +/* + * Start of reading /proc entries. + * Return entry at pos, + * or start_token to indicate print header line + * or NULL if end of file + */ +static void *irlan_seq_start(struct seq_file *seq, loff_t *pos) +{ + rcu_read_lock(); + return seq_list_start_head(&irlans, *pos); +} + +/* Return entry after v, and increment pos */ +static void *irlan_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return seq_list_next(v, &irlans, pos); +} + +/* End of reading /proc file */ +static void irlan_seq_stop(struct seq_file *seq, void *v) +{ + rcu_read_unlock(); +} + + +/* + * Show one entry in /proc file. + */ +static int irlan_seq_show(struct seq_file *seq, void *v) +{ + if (v == &irlans) + seq_puts(seq, "IrLAN instances:\n"); + else { + struct irlan_cb *self = list_entry(v, struct irlan_cb, dev_list); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;); + + seq_printf(seq,"ifname: %s,\n", + self->dev->name); + seq_printf(seq,"client state: %s, ", + irlan_state[ self->client.state]); + seq_printf(seq,"provider state: %s,\n", + irlan_state[ self->provider.state]); + seq_printf(seq,"saddr: %#08x, ", + self->saddr); + seq_printf(seq,"daddr: %#08x\n", + self->daddr); + seq_printf(seq,"version: %d.%d,\n", + self->version[1], self->version[0]); + seq_printf(seq,"access type: %s\n", + irlan_access[self->client.access_type]); + seq_printf(seq,"media: %s\n", + irlan_media[self->media]); + + seq_printf(seq,"local filter:\n"); + seq_printf(seq,"remote filter: "); + irlan_print_filter(seq, self->client.filter_type); + seq_printf(seq,"tx busy: %s\n", + netif_queue_stopped(self->dev) ? 
"TRUE" : "FALSE"); + + seq_putc(seq,'\n'); + } + return 0; +} + +static const struct seq_operations irlan_seq_ops = { + .start = irlan_seq_start, + .next = irlan_seq_next, + .stop = irlan_seq_stop, + .show = irlan_seq_show, +}; + +static int irlan_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &irlan_seq_ops); +} +#endif + +MODULE_AUTHOR("Dag Brattli "); +MODULE_DESCRIPTION("The Linux IrDA LAN protocol"); +MODULE_LICENSE("GPL"); + +module_param(eth, bool, 0); +MODULE_PARM_DESC(eth, "Name devices ethX (0) or irlanX (1)"); +module_param(access, int, 0); +MODULE_PARM_DESC(access, "Access type DIRECT=1, PEER=2, HOSTED=3"); + +module_init(irlan_init); +module_exit(irlan_cleanup); + diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c new file mode 100644 index 00000000..8ee1ff6c --- /dev/null +++ b/net/irda/irlan/irlan_eth.c @@ -0,0 +1,349 @@ +/********************************************************************* + * + * Filename: irlan_eth.c + * Version: + * Description: + * Status: Experimental. + * Author: Dag Brattli + * Created at: Thu Oct 15 08:37:58 1998 + * Modified at: Tue Mar 21 09:06:41 2000 + * Modified by: Dag Brattli + * Sources: skeleton.c by Donald Becker + * slip.c by Laurence Culhane, + * Fred N. van Kempen, + * + * Copyright (c) 1998-2000 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static int irlan_eth_open(struct net_device *dev); +static int irlan_eth_close(struct net_device *dev); +static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb, + struct net_device *dev); +static void irlan_eth_set_multicast_list( struct net_device *dev); + +static const struct net_device_ops irlan_eth_netdev_ops = { + .ndo_open = irlan_eth_open, + .ndo_stop = irlan_eth_close, + .ndo_start_xmit = irlan_eth_xmit, + .ndo_set_multicast_list = irlan_eth_set_multicast_list, + .ndo_change_mtu = eth_change_mtu, + .ndo_validate_addr = eth_validate_addr, +}; + +/* + * Function irlan_eth_setup (dev) + * + * The network device initialization function. + * + */ +static void irlan_eth_setup(struct net_device *dev) +{ + ether_setup(dev); + + dev->netdev_ops = &irlan_eth_netdev_ops; + dev->destructor = free_netdev; + + + /* + * Lets do all queueing in IrTTP instead of this device driver. + * Queueing here as well can introduce some strange latency + * problems, which we will avoid by setting the queue size to 0. + */ + /* + * The bugs in IrTTP and IrLAN that created this latency issue + * have now been fixed, and we can propagate flow control properly + * to the network layer. However, this requires a minimal queue of + * packets for the device. + * Without flow control, the Tx Queue is 14 (ttp) + 0 (dev) = 14 + * With flow control, the Tx Queue is 7 (ttp) + 4 (dev) = 11 + * See irlan_eth_flow_indication()... + * Note : this number was randomly selected and would need to + * be adjusted. 
+ * Jean II */ + dev->tx_queue_len = 4; +} + +/* + * Function alloc_irlandev + * + * Allocate network device and control block + * + */ +struct net_device *alloc_irlandev(const char *name) +{ + return alloc_netdev(sizeof(struct irlan_cb), name, + irlan_eth_setup); +} + +/* + * Function irlan_eth_open (dev) + * + * Network device has been opened by user + * + */ +static int irlan_eth_open(struct net_device *dev) +{ + struct irlan_cb *self = netdev_priv(dev); + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + /* Ready to play! */ + netif_stop_queue(dev); /* Wait until data link is ready */ + + /* We are now open, so time to do some work */ + self->disconnect_reason = 0; + irlan_client_wakeup(self, self->saddr, self->daddr); + + /* Make sure we have a hardware address before we return, + so DHCP clients gets happy */ + return wait_event_interruptible(self->open_wait, + !self->tsap_data->connected); +} + +/* + * Function irlan_eth_close (dev) + * + * Stop the ether network device, his function will usually be called by + * ifconfig down. We should now disconnect the link, We start the + * close timer, so that the instance will be removed if we are unable + * to discover the remote device after the disconnect. + */ +static int irlan_eth_close(struct net_device *dev) +{ + struct irlan_cb *self = netdev_priv(dev); + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + /* Stop device */ + netif_stop_queue(dev); + + irlan_close_data_channel(self); + irlan_close_tsaps(self); + + irlan_do_client_event(self, IRLAN_LMP_DISCONNECT, NULL); + irlan_do_provider_event(self, IRLAN_LMP_DISCONNECT, NULL); + + /* Remove frames queued on the control channel */ + skb_queue_purge(&self->client.txq); + + self->client.tx_busy = 0; + + return 0; +} + +/* + * Function irlan_eth_tx (skb) + * + * Transmits ethernet frames over IrDA link. + * + */ +static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct irlan_cb *self = netdev_priv(dev); + int ret; + unsigned int len; + + /* skb headroom large enough to contain all IrDA-headers? */ + if ((skb_headroom(skb) < self->max_header_size) || (skb_shared(skb))) { + struct sk_buff *new_skb = + skb_realloc_headroom(skb, self->max_header_size); + + /* We have to free the original skb anyway */ + dev_kfree_skb(skb); + + /* Did the realloc succeed? */ + if (new_skb == NULL) + return NETDEV_TX_OK; + + /* Use the new skb instead */ + skb = new_skb; + } + + dev->trans_start = jiffies; + + len = skb->len; + /* Now queue the packet in the transport layer */ + if (self->use_udata) + ret = irttp_udata_request(self->tsap_data, skb); + else + ret = irttp_data_request(self->tsap_data, skb); + + if (ret < 0) { + /* + * IrTTPs tx queue is full, so we just have to + * drop the frame! You might think that we should + * just return -1 and don't deallocate the frame, + * but that is dangerous since it's possible that + * we have replaced the original skb with a new + * one with larger headroom, and that would really + * confuse do_dev_queue_xmit() in dev.c! 
I have + * tried :-) DB + */ + /* irttp_data_request already free the packet */ + dev->stats.tx_dropped++; + } else { + dev->stats.tx_packets++; + dev->stats.tx_bytes += len; + } + + return NETDEV_TX_OK; +} + +/* + * Function irlan_eth_receive (handle, skb) + * + * This function gets the data that is received on the data channel + * + */ +int irlan_eth_receive(void *instance, void *sap, struct sk_buff *skb) +{ + struct irlan_cb *self = instance; + struct net_device *dev = self->dev; + + if (skb == NULL) { + dev->stats.rx_dropped++; + return 0; + } + if (skb->len < ETH_HLEN) { + IRDA_DEBUG(0, "%s() : IrLAN frame too short (%d)\n", + __func__, skb->len); + dev->stats.rx_dropped++; + dev_kfree_skb(skb); + return 0; + } + + /* + * Adopt this frame! Important to set all these fields since they + * might have been previously set by the low level IrDA network + * device driver + */ + skb->protocol = eth_type_trans(skb, dev); /* Remove eth header */ + + dev->stats.rx_packets++; + dev->stats.rx_bytes += skb->len; + + netif_rx(skb); /* Eat it! */ + + return 0; +} + +/* + * Function irlan_eth_flow (status) + * + * Do flow control between IP/Ethernet and IrLAN/IrTTP. This is done by + * controlling the queue stop/start. + * + * The IrDA link layer has the advantage to have flow control, and + * IrTTP now properly handles that. Flow controlling the higher layers + * prevent us to drop Tx packets in here (up to 15% for a TCP socket, + * more for UDP socket). + * Also, this allow us to reduce the overall transmit queue, which means + * less latency in case of mixed traffic. + * Jean II + */ +void irlan_eth_flow_indication(void *instance, void *sap, LOCAL_FLOW flow) +{ + struct irlan_cb *self; + struct net_device *dev; + + self = (struct irlan_cb *) instance; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + dev = self->dev; + + IRDA_ASSERT(dev != NULL, return;); + + IRDA_DEBUG(0, "%s() : flow %s ; running %d\n", __func__, + flow == FLOW_STOP ? "FLOW_STOP" : "FLOW_START", + netif_running(dev)); + + switch (flow) { + case FLOW_STOP: + /* IrTTP is full, stop higher layers */ + netif_stop_queue(dev); + break; + case FLOW_START: + default: + /* Tell upper layers that its time to transmit frames again */ + /* Schedule network layer */ + netif_wake_queue(dev); + break; + } +} + +/* + * Function set_multicast_list (dev) + * + * Configure the filtering of the device + * + */ +#define HW_MAX_ADDRS 4 /* Must query to get it! */ +static void irlan_eth_set_multicast_list(struct net_device *dev) +{ + struct irlan_cb *self = netdev_priv(dev); + + IRDA_DEBUG(2, "%s()\n", __func__ ); + + /* Check if data channel has been connected yet */ + if (self->client.state != IRLAN_DATA) { + IRDA_DEBUG(1, "%s(), delaying!\n", __func__ ); + return; + } + + if (dev->flags & IFF_PROMISC) { + /* Enable promiscuous mode */ + IRDA_WARNING("Promiscuous mode not implemented by IrLAN!\n"); + } + else if ((dev->flags & IFF_ALLMULTI) || + netdev_mc_count(dev) > HW_MAX_ADDRS) { + /* Disable promiscuous mode, use normal mode. 
*/ + IRDA_DEBUG(4, "%s(), Setting multicast filter\n", __func__ ); + /* hardware_set_filter(NULL); */ + + irlan_set_multicast_filter(self, TRUE); + } + else if (!netdev_mc_empty(dev)) { + IRDA_DEBUG(4, "%s(), Setting multicast filter\n", __func__ ); + /* Walk the address list, and load the filter */ + /* hardware_set_filter(dev->mc_list); */ + + irlan_set_multicast_filter(self, TRUE); + } + else { + IRDA_DEBUG(4, "%s(), Clearing multicast filter\n", __func__ ); + irlan_set_multicast_filter(self, FALSE); + } + + if (dev->flags & IFF_BROADCAST) + irlan_set_broadcast_filter(self, TRUE); + else + irlan_set_broadcast_filter(self, FALSE); +} diff --git a/net/irda/irlan/irlan_event.c b/net/irda/irlan/irlan_event.c new file mode 100644 index 00000000..43f16040 --- /dev/null +++ b/net/irda/irlan/irlan_event.c @@ -0,0 +1,60 @@ +/********************************************************************* + * + * Filename: irlan_event.c + * Version: + * Description: + * Status: Experimental. + * Author: Dag Brattli + * Created at: Tue Oct 20 09:10:16 1998 + * Modified at: Sat Oct 30 12:59:01 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#include + +const char * const irlan_state[] = { + "IRLAN_IDLE", + "IRLAN_QUERY", + "IRLAN_CONN", + "IRLAN_INFO", + "IRLAN_MEDIA", + "IRLAN_OPEN", + "IRLAN_WAIT", + "IRLAN_ARB", + "IRLAN_DATA", + "IRLAN_CLOSE", + "IRLAN_SYNC", +}; + +void irlan_next_client_state(struct irlan_cb *self, IRLAN_STATE state) +{ + IRDA_DEBUG(2, "%s(), %s\n", __func__ , irlan_state[state]); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + self->client.state = state; +} + +void irlan_next_provider_state(struct irlan_cb *self, IRLAN_STATE state) +{ + IRDA_DEBUG(2, "%s(), %s\n", __func__ , irlan_state[state]); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + self->provider.state = state; +} + diff --git a/net/irda/irlan/irlan_filter.c b/net/irda/irlan/irlan_filter.c new file mode 100644 index 00000000..7977be7c --- /dev/null +++ b/net/irda/irlan/irlan_filter.c @@ -0,0 +1,243 @@ +/********************************************************************* + * + * Filename: irlan_filter.c + * Version: + * Description: + * Status: Experimental. + * Author: Dag Brattli + * Created at: Fri Jan 29 11:16:38 1999 + * Modified at: Sat Oct 30 12:58:45 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
+ * + ********************************************************************/ + +#include +#include +#include + +#include +#include + +/* + * Function irlan_filter_request (self, skb) + * + * Handle filter request from client peer device + * + */ +void irlan_filter_request(struct irlan_cb *self, struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + if ((self->provider.filter_type == IRLAN_DIRECTED) && + (self->provider.filter_operation == DYNAMIC)) + { + IRDA_DEBUG(0, "Giving peer a dynamic Ethernet address\n"); + self->provider.mac_address[0] = 0x40; + self->provider.mac_address[1] = 0x00; + self->provider.mac_address[2] = 0x00; + self->provider.mac_address[3] = 0x00; + + /* Use arbitration value to generate MAC address */ + if (self->provider.access_type == ACCESS_PEER) { + self->provider.mac_address[4] = + self->provider.send_arb_val & 0xff; + self->provider.mac_address[5] = + (self->provider.send_arb_val >> 8) & 0xff; + } else { + /* Just generate something for now */ + get_random_bytes(self->provider.mac_address+4, 1); + get_random_bytes(self->provider.mac_address+5, 1); + } + + skb->data[0] = 0x00; /* Success */ + skb->data[1] = 0x03; + irlan_insert_string_param(skb, "FILTER_MODE", "NONE"); + irlan_insert_short_param(skb, "MAX_ENTRY", 0x0001); + irlan_insert_array_param(skb, "FILTER_ENTRY", + self->provider.mac_address, 6); + return; + } + + if ((self->provider.filter_type == IRLAN_DIRECTED) && + (self->provider.filter_mode == FILTER)) + { + IRDA_DEBUG(0, "Directed filter on\n"); + skb->data[0] = 0x00; /* Success */ + skb->data[1] = 0x00; + return; + } + if ((self->provider.filter_type == IRLAN_DIRECTED) && + (self->provider.filter_mode == NONE)) + { + IRDA_DEBUG(0, "Directed filter off\n"); + skb->data[0] = 0x00; /* Success */ + skb->data[1] = 0x00; + return; + } + + if ((self->provider.filter_type == IRLAN_BROADCAST) && + (self->provider.filter_mode == FILTER)) + { + IRDA_DEBUG(0, "Broadcast filter on\n"); + skb->data[0] = 0x00; /* Success */ + skb->data[1] = 0x00; + return; + } + if ((self->provider.filter_type == IRLAN_BROADCAST) && + (self->provider.filter_mode == NONE)) + { + IRDA_DEBUG(0, "Broadcast filter off\n"); + skb->data[0] = 0x00; /* Success */ + skb->data[1] = 0x00; + return; + } + if ((self->provider.filter_type == IRLAN_MULTICAST) && + (self->provider.filter_mode == FILTER)) + { + IRDA_DEBUG(0, "Multicast filter on\n"); + skb->data[0] = 0x00; /* Success */ + skb->data[1] = 0x00; + return; + } + if ((self->provider.filter_type == IRLAN_MULTICAST) && + (self->provider.filter_mode == NONE)) + { + IRDA_DEBUG(0, "Multicast filter off\n"); + skb->data[0] = 0x00; /* Success */ + skb->data[1] = 0x00; + return; + } + if ((self->provider.filter_type == IRLAN_MULTICAST) && + (self->provider.filter_operation == GET)) + { + IRDA_DEBUG(0, "Multicast filter get\n"); + skb->data[0] = 0x00; /* Success? 
*/ + skb->data[1] = 0x02; + irlan_insert_string_param(skb, "FILTER_MODE", "NONE"); + irlan_insert_short_param(skb, "MAX_ENTRY", 16); + return; + } + skb->data[0] = 0x00; /* Command not supported */ + skb->data[1] = 0x00; + + IRDA_DEBUG(0, "Not implemented!\n"); +} + +/* + * Function check_request_param (self, param, value) + * + * Check parameters in request from peer device + * + */ +void irlan_check_command_param(struct irlan_cb *self, char *param, char *value) +{ + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + IRDA_DEBUG(4, "%s, %s\n", param, value); + + /* + * This is experimental!! DB. + */ + if (strcmp(param, "MODE") == 0) { + IRDA_DEBUG(0, "%s()\n", __func__ ); + self->use_udata = TRUE; + return; + } + + /* + * FILTER_TYPE + */ + if (strcmp(param, "FILTER_TYPE") == 0) { + if (strcmp(value, "DIRECTED") == 0) { + self->provider.filter_type = IRLAN_DIRECTED; + return; + } + if (strcmp(value, "MULTICAST") == 0) { + self->provider.filter_type = IRLAN_MULTICAST; + return; + } + if (strcmp(value, "BROADCAST") == 0) { + self->provider.filter_type = IRLAN_BROADCAST; + return; + } + } + /* + * FILTER_MODE + */ + if (strcmp(param, "FILTER_MODE") == 0) { + if (strcmp(value, "ALL") == 0) { + self->provider.filter_mode = ALL; + return; + } + if (strcmp(value, "FILTER") == 0) { + self->provider.filter_mode = FILTER; + return; + } + if (strcmp(value, "NONE") == 0) { + self->provider.filter_mode = FILTER; + return; + } + } + /* + * FILTER_OPERATION + */ + if (strcmp(param, "FILTER_OPERATION") == 0) { + if (strcmp(value, "DYNAMIC") == 0) { + self->provider.filter_operation = DYNAMIC; + return; + } + if (strcmp(value, "GET") == 0) { + self->provider.filter_operation = GET; + return; + } + } +} + +/* + * Function irlan_print_filter (filter_type, buf) + * + * Print status of filter. Used by /proc file system + * + */ +#ifdef CONFIG_PROC_FS +#define MASK2STR(m,s) { .mask = m, .str = s } + +void irlan_print_filter(struct seq_file *seq, int filter_type) +{ + static struct { + int mask; + const char *str; + } filter_mask2str[] = { + MASK2STR(IRLAN_DIRECTED, "DIRECTED"), + MASK2STR(IRLAN_FUNCTIONAL, "FUNCTIONAL"), + MASK2STR(IRLAN_GROUP, "GROUP"), + MASK2STR(IRLAN_MAC_FRAME, "MAC_FRAME"), + MASK2STR(IRLAN_MULTICAST, "MULTICAST"), + MASK2STR(IRLAN_BROADCAST, "BROADCAST"), + MASK2STR(IRLAN_IPX_SOCKET, "IPX_SOCKET"), + MASK2STR(0, NULL) + }, *p; + + for (p = filter_mask2str; p->str; p++) { + if (filter_type & p->mask) + seq_printf(seq, "%s ", p->str); + } + seq_putc(seq, '\n'); +} +#undef MASK2STR +#endif diff --git a/net/irda/irlan/irlan_provider.c b/net/irda/irlan/irlan_provider.c new file mode 100644 index 00000000..b8af74ab --- /dev/null +++ b/net/irda/irlan/irlan_provider.c @@ -0,0 +1,417 @@ +/********************************************************************* + * + * Filename: irlan_provider.c + * Version: 0.9 + * Description: IrDA LAN Access Protocol Implementation + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sun Aug 31 20:14:37 1997 + * Modified at: Sat Oct 30 12:52:10 1999 + * Modified by: Dag Brattli + * Sources: skeleton.c by Donald Becker + * slip.c by Laurence Culhane, + * Fred N. van Kempen, + * + * Copyright (c) 1998-1999 Dag Brattli , + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static void irlan_provider_connect_indication(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb); + +/* + * Function irlan_provider_control_data_indication (handle, skb) + * + * This function gets the data that is received on the control channel + * + */ +static int irlan_provider_data_indication(void *instance, void *sap, + struct sk_buff *skb) +{ + struct irlan_cb *self; + __u8 code; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + self = (struct irlan_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;); + + IRDA_ASSERT(skb != NULL, return -1;); + + code = skb->data[0]; + switch(code) { + case CMD_GET_PROVIDER_INFO: + IRDA_DEBUG(4, "Got GET_PROVIDER_INFO command!\n"); + irlan_do_provider_event(self, IRLAN_GET_INFO_CMD, skb); + break; + + case CMD_GET_MEDIA_CHAR: + IRDA_DEBUG(4, "Got GET_MEDIA_CHAR command!\n"); + irlan_do_provider_event(self, IRLAN_GET_MEDIA_CMD, skb); + break; + case CMD_OPEN_DATA_CHANNEL: + IRDA_DEBUG(4, "Got OPEN_DATA_CHANNEL command!\n"); + irlan_do_provider_event(self, IRLAN_OPEN_DATA_CMD, skb); + break; + case CMD_FILTER_OPERATION: + IRDA_DEBUG(4, "Got FILTER_OPERATION command!\n"); + irlan_do_provider_event(self, IRLAN_FILTER_CONFIG_CMD, skb); + break; + case CMD_RECONNECT_DATA_CHAN: + IRDA_DEBUG(2, "%s(), Got RECONNECT_DATA_CHAN command\n", __func__ ); + IRDA_DEBUG(2, "%s(), NOT IMPLEMENTED\n", __func__ ); + break; + case CMD_CLOSE_DATA_CHAN: + IRDA_DEBUG(2, "Got CLOSE_DATA_CHAN command!\n"); + IRDA_DEBUG(2, "%s(), NOT IMPLEMENTED\n", __func__ ); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown command!\n", __func__ ); + break; + } + return 0; +} + +/* + * Function irlan_provider_connect_indication (handle, skb, priv) + * + * Got connection from peer IrLAN client + * + */ +static void irlan_provider_connect_indication(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct irlan_cb *self; + struct tsap_cb *tsap; + + IRDA_DEBUG(0, "%s()\n", __func__ ); + + self = (struct irlan_cb *) instance; + tsap = (struct tsap_cb *) sap; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + IRDA_ASSERT(tsap == self->provider.tsap_ctrl,return;); + IRDA_ASSERT(self->provider.state == IRLAN_IDLE, return;); + + self->provider.max_sdu_size = max_sdu_size; + self->provider.max_header_size = max_header_size; + + irlan_do_provider_event(self, IRLAN_CONNECT_INDICATION, NULL); + + /* + * If we are in peer mode, the client may not have got the discovery + * indication it needs to make progress. If the client is still in + * IDLE state, we must kick it. 
+ */ + if ((self->provider.access_type == ACCESS_PEER) && + (self->client.state == IRLAN_IDLE)) + { + irlan_client_wakeup(self, self->saddr, self->daddr); + } +} + +/* + * Function irlan_provider_connect_response (handle) + * + * Accept incoming connection + * + */ +void irlan_provider_connect_response(struct irlan_cb *self, + struct tsap_cb *tsap) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + /* Just accept */ + irttp_connect_response(tsap, IRLAN_MTU, NULL); +} + +static void irlan_provider_disconnect_indication(void *instance, void *sap, + LM_REASON reason, + struct sk_buff *userdata) +{ + struct irlan_cb *self; + struct tsap_cb *tsap; + + IRDA_DEBUG(4, "%s(), reason=%d\n", __func__ , reason); + + self = (struct irlan_cb *) instance; + tsap = (struct tsap_cb *) sap; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + IRDA_ASSERT(tsap != NULL, return;); + IRDA_ASSERT(tsap->magic == TTP_TSAP_MAGIC, return;); + + IRDA_ASSERT(tsap == self->provider.tsap_ctrl, return;); + + irlan_do_provider_event(self, IRLAN_LMP_DISCONNECT, NULL); +} + +/* + * Function irlan_parse_open_data_cmd (self, skb) + * + * + * + */ +int irlan_parse_open_data_cmd(struct irlan_cb *self, struct sk_buff *skb) +{ + int ret; + + ret = irlan_provider_parse_command(self, CMD_OPEN_DATA_CHANNEL, skb); + + /* Open data channel */ + irlan_open_data_tsap(self); + + return ret; +} + +/* + * Function parse_command (skb) + * + * Extract all parameters from received buffer, then feed them to + * check_params for parsing + * + */ +int irlan_provider_parse_command(struct irlan_cb *self, int cmd, + struct sk_buff *skb) +{ + __u8 *frame; + __u8 *ptr; + int count; + __u16 val_len; + int i; + char *name; + char *value; + int ret = RSP_SUCCESS; + + IRDA_ASSERT(skb != NULL, return -RSP_PROTOCOL_ERROR;); + + IRDA_DEBUG(4, "%s(), skb->len=%d\n", __func__ , (int)skb->len); + + IRDA_ASSERT(self != NULL, return -RSP_PROTOCOL_ERROR;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -RSP_PROTOCOL_ERROR;); + + if (!skb) + return -RSP_PROTOCOL_ERROR; + + frame = skb->data; + + name = kmalloc(255, GFP_ATOMIC); + if (!name) + return -RSP_INSUFFICIENT_RESOURCES; + value = kmalloc(1016, GFP_ATOMIC); + if (!value) { + kfree(name); + return -RSP_INSUFFICIENT_RESOURCES; + } + + /* How many parameters? 
*/ + count = frame[1]; + + IRDA_DEBUG(4, "Got %d parameters\n", count); + + ptr = frame+2; + + /* For all parameters */ + for (i=0; imagic == IRLAN_MAGIC, return;); + + skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER + + /* Bigger param length comes from CMD_GET_MEDIA_CHAR */ + IRLAN_STRING_PARAMETER_LEN("FILTER_TYPE", "DIRECTED") + + IRLAN_STRING_PARAMETER_LEN("FILTER_TYPE", "BORADCAST") + + IRLAN_STRING_PARAMETER_LEN("FILTER_TYPE", "MULTICAST") + + IRLAN_STRING_PARAMETER_LEN("ACCESS_TYPE", "HOSTED"), + GFP_ATOMIC); + + if (!skb) + return; + + /* Reserve space for TTP, LMP, and LAP header */ + skb_reserve(skb, self->provider.max_header_size); + skb_put(skb, 2); + + switch (command) { + case CMD_GET_PROVIDER_INFO: + skb->data[0] = 0x00; /* Success */ + skb->data[1] = 0x02; /* 2 parameters */ + switch (self->media) { + case MEDIA_802_3: + irlan_insert_string_param(skb, "MEDIA", "802.3"); + break; + case MEDIA_802_5: + irlan_insert_string_param(skb, "MEDIA", "802.5"); + break; + default: + IRDA_DEBUG(2, "%s(), unknown media type!\n", __func__ ); + break; + } + irlan_insert_short_param(skb, "IRLAN_VER", 0x0101); + break; + + case CMD_GET_MEDIA_CHAR: + skb->data[0] = 0x00; /* Success */ + skb->data[1] = 0x05; /* 5 parameters */ + irlan_insert_string_param(skb, "FILTER_TYPE", "DIRECTED"); + irlan_insert_string_param(skb, "FILTER_TYPE", "BROADCAST"); + irlan_insert_string_param(skb, "FILTER_TYPE", "MULTICAST"); + + switch (self->provider.access_type) { + case ACCESS_DIRECT: + irlan_insert_string_param(skb, "ACCESS_TYPE", "DIRECT"); + break; + case ACCESS_PEER: + irlan_insert_string_param(skb, "ACCESS_TYPE", "PEER"); + break; + case ACCESS_HOSTED: + irlan_insert_string_param(skb, "ACCESS_TYPE", "HOSTED"); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown access type\n", __func__ ); + break; + } + irlan_insert_short_param(skb, "MAX_FRAME", 0x05ee); + break; + case CMD_OPEN_DATA_CHANNEL: + skb->data[0] = 0x00; /* Success */ + if (self->provider.send_arb_val) { + skb->data[1] = 0x03; /* 3 parameters */ + irlan_insert_short_param(skb, "CON_ARB", + self->provider.send_arb_val); + } else + skb->data[1] = 0x02; /* 2 parameters */ + irlan_insert_byte_param(skb, "DATA_CHAN", self->stsap_sel_data); + irlan_insert_string_param(skb, "RECONNECT_KEY", "LINUX RULES!"); + break; + case CMD_FILTER_OPERATION: + irlan_filter_request(self, skb); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown command!\n", __func__ ); + break; + } + + irttp_data_request(self->provider.tsap_ctrl, skb); +} + +/* + * Function irlan_provider_register(void) + * + * Register provider support so we can accept incoming connections. 
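[Editor's example] irlan_provider_parse_command walks a whole control frame: command byte, parameter count, then that many name/value pairs in the layout shown earlier. The user-space parser below sketches that loop over a hand-built frame; it is illustrative only and does minimal bounds checking, unlike the kernel code, which caps names at 255 and values at 1016 bytes.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Walk one IrLAN control frame: command code, count, then count pairs of
 * "name len (1) | name | value len (2, LE) | value". */
static void ex_parse_frame(const uint8_t *frame, size_t frame_len)
{
    size_t pos = 2;                    /* skip command code + count */
    int count = frame[1];

    printf("command 0x%02x, %d parameter(s)\n", frame[0], count);

    for (int i = 0; i < count && pos + 3 <= frame_len; i++) {
        char name[256], value[1024];
        uint8_t name_len = frame[pos++];
        uint16_t val_len;

        memcpy(name, frame + pos, name_len);
        name[name_len] = '\0';
        pos += name_len;

        val_len = frame[pos] | (frame[pos + 1] << 8);   /* LSB first */
        pos += 2;
        memcpy(value, frame + pos, val_len);
        value[val_len] = '\0';
        pos += val_len;

        printf("  %s = %s\n", name, value);
    }
}

int main(void)
{
    /* A made-up reply carrying one string parameter, MEDIA = "802.3" */
    const uint8_t frame[] = {
        0x00, 0x01,
        5, 'M','E','D','I','A', 5, 0, '8','0','2','.','3'
    };

    ex_parse_frame(frame, sizeof(frame));
    return 0;
}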
+ * + */ +int irlan_provider_open_ctrl_tsap(struct irlan_cb *self) +{ + struct tsap_cb *tsap; + notify_t notify; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;); + + /* Check if already open */ + if (self->provider.tsap_ctrl) + return -1; + + /* + * First register well known control TSAP + */ + irda_notify_init(¬ify); + notify.data_indication = irlan_provider_data_indication; + notify.connect_indication = irlan_provider_connect_indication; + notify.disconnect_indication = irlan_provider_disconnect_indication; + notify.instance = self; + strlcpy(notify.name, "IrLAN ctrl (p)", sizeof(notify.name)); + + tsap = irttp_open_tsap(LSAP_ANY, 1, ¬ify); + if (!tsap) { + IRDA_DEBUG(2, "%s(), Got no tsap!\n", __func__ ); + return -1; + } + self->provider.tsap_ctrl = tsap; + + /* Register with LM-IAS */ + irlan_ias_register(self, tsap->stsap_sel); + + return 0; +} + diff --git a/net/irda/irlan/irlan_provider_event.c b/net/irda/irlan/irlan_provider_event.c new file mode 100644 index 00000000..01a9d7c9 --- /dev/null +++ b/net/irda/irlan/irlan_provider_event.c @@ -0,0 +1,241 @@ +/********************************************************************* + * + * Filename: irlan_provider_event.c + * Version: 0.9 + * Description: IrLAN provider state machine) + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sun Aug 31 20:14:37 1997 + * Modified at: Sat Oct 30 12:52:41 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli , All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#include +#include +#include +#include + +#include +#include + +static int irlan_provider_state_idle(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_provider_state_info(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_provider_state_open(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_provider_state_data(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); + +static int (*state[])(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) = +{ + irlan_provider_state_idle, + NULL, /* Query */ + NULL, /* Info */ + irlan_provider_state_info, + NULL, /* Media */ + irlan_provider_state_open, + NULL, /* Wait */ + NULL, /* Arb */ + irlan_provider_state_data, + NULL, /* Close */ + NULL, /* Sync */ +}; + +void irlan_do_provider_event(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_ASSERT(*state[ self->provider.state] != NULL, return;); + + (*state[self->provider.state]) (self, event, skb); +} + +/* + * Function irlan_provider_state_idle (event, skb, info) + * + * IDLE, We are waiting for an indication that there is a provider + * available. 
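[Editor's example] The provider state machine is driven the same way throughout this code: the current state indexes a table of handler functions and irlan_do_provider_event simply calls through it. The self-contained sketch below reproduces only that dispatch pattern; the states, events, and ex_ names are invented for illustration and are not the IrLAN state set.

#include <stdio.h>

enum ex_state { EX_IDLE, EX_OPEN, EX_DATA, EX_NSTATES };
enum ex_event { EX_CONNECT, EX_DISCONNECT };

struct ex_machine { enum ex_state state; };

static void ex_idle(struct ex_machine *m, enum ex_event ev)
{
    if (ev == EX_CONNECT)
        m->state = EX_OPEN;
}

static void ex_open(struct ex_machine *m, enum ex_event ev)
{
    m->state = (ev == EX_DISCONNECT) ? EX_IDLE : EX_DATA;
}

static void ex_data(struct ex_machine *m, enum ex_event ev)
{
    if (ev == EX_DISCONNECT)
        m->state = EX_IDLE;
}

/* State-indexed handler table, the same shape as state[] above */
static void (*ex_table[EX_NSTATES])(struct ex_machine *, enum ex_event) = {
    ex_idle, ex_open, ex_data,
};

static void ex_do_event(struct ex_machine *m, enum ex_event ev)
{
    ex_table[m->state](m, ev);       /* dispatch on current state */
}

int main(void)
{
    struct ex_machine m = { EX_IDLE };

    ex_do_event(&m, EX_CONNECT);
    ex_do_event(&m, EX_CONNECT);
    ex_do_event(&m, EX_DISCONNECT);
    printf("final state %d\n", m.state);   /* back to EX_IDLE */
    return 0;
}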
+ */ +static int irlan_provider_state_idle(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -1;); + + switch(event) { + case IRLAN_CONNECT_INDICATION: + irlan_provider_connect_response( self, self->provider.tsap_ctrl); + irlan_next_provider_state( self, IRLAN_INFO); + break; + default: + IRDA_DEBUG(4, "%s(), Unknown event %d\n", __func__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_provider_state_info (self, event, skb, info) + * + * INFO, We have issued a GetInfo command and is awaiting a reply. + */ +static int irlan_provider_state_info(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + int ret; + + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -1;); + + switch(event) { + case IRLAN_GET_INFO_CMD: + /* Be sure to use 802.3 in case of peer mode */ + if (self->provider.access_type == ACCESS_PEER) { + self->media = MEDIA_802_3; + + /* Check if client has started yet */ + if (self->client.state == IRLAN_IDLE) { + /* This should get the client going */ + irlmp_discovery_request(8); + } + } + + irlan_provider_send_reply(self, CMD_GET_PROVIDER_INFO, + RSP_SUCCESS); + /* Keep state */ + break; + case IRLAN_GET_MEDIA_CMD: + irlan_provider_send_reply(self, CMD_GET_MEDIA_CHAR, + RSP_SUCCESS); + /* Keep state */ + break; + case IRLAN_OPEN_DATA_CMD: + ret = irlan_parse_open_data_cmd(self, skb); + if (self->provider.access_type == ACCESS_PEER) { + /* FIXME: make use of random functions! */ + self->provider.send_arb_val = (jiffies & 0xffff); + } + irlan_provider_send_reply(self, CMD_OPEN_DATA_CHANNEL, ret); + + if (ret == RSP_SUCCESS) { + irlan_next_provider_state(self, IRLAN_OPEN); + + /* Signal client that we are now open */ + irlan_do_client_event(self, IRLAN_PROVIDER_SIGNAL, NULL); + } + break; + case IRLAN_LMP_DISCONNECT: /* FALLTHROUGH */ + case IRLAN_LAP_DISCONNECT: + irlan_next_provider_state(self, IRLAN_IDLE); + break; + default: + IRDA_DEBUG( 0, "%s(), Unknown event %d\n", __func__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_provider_state_open (self, event, skb, info) + * + * OPEN, The client has issued a OpenData command and is awaiting a + * reply + * + */ +static int irlan_provider_state_open(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -1;); + + switch(event) { + case IRLAN_FILTER_CONFIG_CMD: + irlan_provider_parse_command(self, CMD_FILTER_OPERATION, skb); + irlan_provider_send_reply(self, CMD_FILTER_OPERATION, + RSP_SUCCESS); + /* Keep state */ + break; + case IRLAN_DATA_CONNECT_INDICATION: + irlan_next_provider_state(self, IRLAN_DATA); + irlan_provider_connect_response(self, self->tsap_data); + break; + case IRLAN_LMP_DISCONNECT: /* FALLTHROUGH */ + case IRLAN_LAP_DISCONNECT: + irlan_next_provider_state(self, IRLAN_IDLE); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %d\n", __func__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_provider_state_data (self, event, skb, info) + * + * DATA, The data channel is connected, allowing data transfers between + * the local and remote machines. 
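[Editor's example] The INFO handler above derives the peer-mode arbitration value from jiffies and carries a FIXME asking for a real random source; in the kernel that would presumably be get_random_bytes(), which this same patch already uses in irlan_filter.c. The user-space stand-in below just reads two bytes from /dev/urandom; the ex_ name and the approach are assumptions, not the author's fix.

#include <stdint.h>
#include <stdio.h>

/* Derive a 16-bit CON_ARB value from an entropy source instead of jiffies. */
static uint16_t ex_arb_value(void)
{
    uint16_t arb = 0;
    FILE *f = fopen("/dev/urandom", "rb");

    if (f) {
        if (fread(&arb, sizeof(arb), 1, f) != 1)
            arb = 0;
        fclose(f);
    }
    return arb;
}

int main(void)
{
    printf("CON_ARB = 0x%04x\n", ex_arb_value());
    return 0;
}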
+ * + */ +static int irlan_provider_state_data(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __func__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;); + + switch(event) { + case IRLAN_FILTER_CONFIG_CMD: + irlan_provider_parse_command(self, CMD_FILTER_OPERATION, skb); + irlan_provider_send_reply(self, CMD_FILTER_OPERATION, + RSP_SUCCESS); + break; + case IRLAN_LMP_DISCONNECT: /* FALLTHROUGH */ + case IRLAN_LAP_DISCONNECT: + irlan_next_provider_state(self, IRLAN_IDLE); + break; + default: + IRDA_DEBUG( 0, "%s(), Unknown event %d\n", __func__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + + + + + + + + + + diff --git a/net/irda/irlap.c b/net/irda/irlap.c new file mode 100644 index 00000000..005b4244 --- /dev/null +++ b/net/irda/irlap.c @@ -0,0 +1,1237 @@ +/********************************************************************* + * + * Filename: irlap.c + * Version: 1.0 + * Description: IrLAP implementation for Linux + * Status: Stable + * Author: Dag Brattli + * Created at: Mon Aug 4 20:40:53 1997 + * Modified at: Tue Dec 14 09:26:44 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli, All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static hashbin_t *irlap = NULL; +int sysctl_slot_timeout = SLOT_TIMEOUT * 1000 / HZ; + +/* This is the delay of missed pf period before generating an event + * to the application. The spec mandate 3 seconds, but in some cases + * it's way too long. - Jean II */ +int sysctl_warn_noreply_time = 3; + +extern void irlap_queue_xmit(struct irlap_cb *self, struct sk_buff *skb); +static void __irlap_close(struct irlap_cb *self); +static void irlap_init_qos_capabilities(struct irlap_cb *self, + struct qos_info *qos_user); + +#ifdef CONFIG_IRDA_DEBUG +static const char *const lap_reasons[] = { + "ERROR, NOT USED", + "LAP_DISC_INDICATION", + "LAP_NO_RESPONSE", + "LAP_RESET_INDICATION", + "LAP_FOUND_NONE", + "LAP_MEDIA_BUSY", + "LAP_PRIMARY_CONFLICT", + "ERROR, NOT USED", +}; +#endif /* CONFIG_IRDA_DEBUG */ + +int __init irlap_init(void) +{ + /* Check if the compiler did its job properly. + * May happen on some ARM configuration, check with Russell King. 
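[Editor's example] The size checks that follow in irlap_init guard against the compiler padding the on-the-wire frame structs (the comment points at certain ARM ABIs). The same guarantee can be expressed at compile time, as sketched below with C11 _Static_assert and the GCC/clang packed attribute; the struct here is a made-up example, not the kernel's xid_frame or snrm_frame, whose definitions are not in this hunk.

#include <stdint.h>
#include <stdio.h>

/* Illustrative wire header; 1 + 1 + 4 + 4 = 10 bytes when left unpadded. */
struct ex_wire_hdr {
    uint8_t  caddr;     /* connection address  */
    uint8_t  control;   /* frame type          */
    uint32_t saddr;     /* source address      */
    uint32_t daddr;     /* destination address */
} __attribute__((packed));

_Static_assert(sizeof(struct ex_wire_hdr) == 10,
               "wire header must not be padded");

int main(void)
{
    printf("sizeof(struct ex_wire_hdr) = %zu\n", sizeof(struct ex_wire_hdr));
    return 0;
}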
*/ + IRDA_ASSERT(sizeof(struct xid_frame) == 14, ;); + IRDA_ASSERT(sizeof(struct test_frame) == 10, ;); + IRDA_ASSERT(sizeof(struct ua_frame) == 10, ;); + IRDA_ASSERT(sizeof(struct snrm_frame) == 11, ;); + + /* Allocate master array */ + irlap = hashbin_new(HB_LOCK); + if (irlap == NULL) { + IRDA_ERROR("%s: can't allocate irlap hashbin!\n", + __func__); + return -ENOMEM; + } + + return 0; +} + +void irlap_cleanup(void) +{ + IRDA_ASSERT(irlap != NULL, return;); + + hashbin_delete(irlap, (FREE_FUNC) __irlap_close); +} + +/* + * Function irlap_open (driver) + * + * Initialize IrLAP layer + * + */ +struct irlap_cb *irlap_open(struct net_device *dev, struct qos_info *qos, + const char *hw_name) +{ + struct irlap_cb *self; + + IRDA_DEBUG(4, "%s()\n", __func__); + + /* Initialize the irlap structure. */ + self = kzalloc(sizeof(struct irlap_cb), GFP_KERNEL); + if (self == NULL) + return NULL; + + self->magic = LAP_MAGIC; + + /* Make a binding between the layers */ + self->netdev = dev; + self->qos_dev = qos; + /* Copy hardware name */ + if(hw_name != NULL) { + strlcpy(self->hw_name, hw_name, sizeof(self->hw_name)); + } else { + self->hw_name[0] = '\0'; + } + + /* FIXME: should we get our own field? */ + dev->atalk_ptr = self; + + self->state = LAP_OFFLINE; + + /* Initialize transmit queue */ + skb_queue_head_init(&self->txq); + skb_queue_head_init(&self->txq_ultra); + skb_queue_head_init(&self->wx_list); + + /* My unique IrLAP device address! */ + /* We don't want the broadcast address, neither the NULL address + * (most often used to signify "invalid"), and we don't want an + * address already in use (otherwise connect won't be able + * to select the proper link). - Jean II */ + do { + get_random_bytes(&self->saddr, sizeof(self->saddr)); + } while ((self->saddr == 0x0) || (self->saddr == BROADCAST) || + (hashbin_lock_find(irlap, self->saddr, NULL)) ); + /* Copy to the driver */ + memcpy(dev->dev_addr, &self->saddr, 4); + + init_timer(&self->slot_timer); + init_timer(&self->query_timer); + init_timer(&self->discovery_timer); + init_timer(&self->final_timer); + init_timer(&self->poll_timer); + init_timer(&self->wd_timer); + init_timer(&self->backoff_timer); + init_timer(&self->media_busy_timer); + + irlap_apply_default_connection_parameters(self); + + self->N3 = 3; /* # connections attempts to try before giving up */ + + self->state = LAP_NDM; + + hashbin_insert(irlap, (irda_queue_t *) self, self->saddr, NULL); + + irlmp_register_link(self, self->saddr, &self->notify); + + return self; +} +EXPORT_SYMBOL(irlap_open); + +/* + * Function __irlap_close (self) + * + * Remove IrLAP and all allocated memory. Stop any pending timers. + * + */ +static void __irlap_close(struct irlap_cb *self) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* Stop timers */ + del_timer(&self->slot_timer); + del_timer(&self->query_timer); + del_timer(&self->discovery_timer); + del_timer(&self->final_timer); + del_timer(&self->poll_timer); + del_timer(&self->wd_timer); + del_timer(&self->backoff_timer); + del_timer(&self->media_busy_timer); + + irlap_flush_all_queues(self); + + self->magic = 0; + + kfree(self); +} + +/* + * Function irlap_close (self) + * + * Remove IrLAP instance + * + */ +void irlap_close(struct irlap_cb *self) +{ + struct irlap_cb *lap; + + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* We used to send a LAP_DISC_INDICATION here, but this was + * racy. 
This has been move within irlmp_unregister_link() + * itself. Jean II */ + + /* Kill the LAP and all LSAPs on top of it */ + irlmp_unregister_link(self->saddr); + self->notify.instance = NULL; + + /* Be sure that we manage to remove ourself from the hash */ + lap = hashbin_remove(irlap, self->saddr, NULL); + if (!lap) { + IRDA_DEBUG(1, "%s(), Didn't find myself!\n", __func__); + return; + } + __irlap_close(lap); +} +EXPORT_SYMBOL(irlap_close); + +/* + * Function irlap_connect_indication (self, skb) + * + * Another device is attempting to make a connection + * + */ +void irlap_connect_indication(struct irlap_cb *self, struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + irlap_init_qos_capabilities(self, NULL); /* No user QoS! */ + + irlmp_link_connect_indication(self->notify.instance, self->saddr, + self->daddr, &self->qos_tx, skb); +} + +/* + * Function irlap_connect_response (self, skb) + * + * Service user has accepted incoming connection + * + */ +void irlap_connect_response(struct irlap_cb *self, struct sk_buff *userdata) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + + irlap_do_event(self, CONNECT_RESPONSE, userdata, NULL); +} + +/* + * Function irlap_connect_request (self, daddr, qos_user, sniff) + * + * Request connection with another device, sniffing is not implemented + * yet. + * + */ +void irlap_connect_request(struct irlap_cb *self, __u32 daddr, + struct qos_info *qos_user, int sniff) +{ + IRDA_DEBUG(3, "%s(), daddr=0x%08x\n", __func__, daddr); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + self->daddr = daddr; + + /* + * If the service user specifies QoS values for this connection, + * then use them + */ + irlap_init_qos_capabilities(self, qos_user); + + if ((self->state == LAP_NDM) && !self->media_busy) + irlap_do_event(self, CONNECT_REQUEST, NULL, NULL); + else + self->connect_pending = TRUE; +} + +/* + * Function irlap_connect_confirm (self, skb) + * + * Connection request has been accepted + * + */ +void irlap_connect_confirm(struct irlap_cb *self, struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + irlmp_link_connect_confirm(self->notify.instance, &self->qos_tx, skb); +} + +/* + * Function irlap_data_indication (self, skb) + * + * Received data frames from IR-port, so we just pass them up to + * IrLMP for further processing + * + */ +void irlap_data_indication(struct irlap_cb *self, struct sk_buff *skb, + int unreliable) +{ + /* Hide LAP header from IrLMP layer */ + skb_pull(skb, LAP_ADDR_HEADER+LAP_CTRL_HEADER); + + irlmp_link_data_indication(self->notify.instance, skb, unreliable); +} + + +/* + * Function irlap_data_request (self, skb) + * + * Queue data for transmission, must wait until XMIT state + * + */ +void irlap_data_request(struct irlap_cb *self, struct sk_buff *skb, + int unreliable) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + IRDA_DEBUG(3, "%s()\n", __func__); + + IRDA_ASSERT(skb_headroom(skb) >= (LAP_ADDR_HEADER+LAP_CTRL_HEADER), + return;); + skb_push(skb, LAP_ADDR_HEADER+LAP_CTRL_HEADER); + + /* + * Must set frame format now so that the rest of the code knows + * if its dealing with an I or an UI frame + */ + if (unreliable) + skb->data[1] = UI_FRAME; + else + skb->data[1] = I_FRAME; + + /* Don't forget to refcount it - see irlmp_connect_request(). 
*/ + skb_get(skb); + + /* Add at the end of the queue (keep ordering) - Jean II */ + skb_queue_tail(&self->txq, skb); + + /* + * Send event if this frame only if we are in the right state + * FIXME: udata should be sent first! (skb_queue_head?) + */ + if ((self->state == LAP_XMIT_P) || (self->state == LAP_XMIT_S)) { + /* If we are not already processing the Tx queue, trigger + * transmission immediately - Jean II */ + if((skb_queue_len(&self->txq) <= 1) && (!self->local_busy)) + irlap_do_event(self, DATA_REQUEST, skb, NULL); + /* Otherwise, the packets will be sent normally at the + * next pf-poll - Jean II */ + } +} + +/* + * Function irlap_unitdata_request (self, skb) + * + * Send Ultra data. This is data that must be sent outside any connection + * + */ +#ifdef CONFIG_IRDA_ULTRA +void irlap_unitdata_request(struct irlap_cb *self, struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + IRDA_DEBUG(3, "%s()\n", __func__); + + IRDA_ASSERT(skb_headroom(skb) >= (LAP_ADDR_HEADER+LAP_CTRL_HEADER), + return;); + skb_push(skb, LAP_ADDR_HEADER+LAP_CTRL_HEADER); + + skb->data[0] = CBROADCAST; + skb->data[1] = UI_FRAME; + + /* Don't need to refcount, see irlmp_connless_data_request() */ + + skb_queue_tail(&self->txq_ultra, skb); + + irlap_do_event(self, SEND_UI_FRAME, NULL, NULL); +} +#endif /*CONFIG_IRDA_ULTRA */ + +/* + * Function irlap_udata_indication (self, skb) + * + * Receive Ultra data. This is data that is received outside any connection + * + */ +#ifdef CONFIG_IRDA_ULTRA +void irlap_unitdata_indication(struct irlap_cb *self, struct sk_buff *skb) +{ + IRDA_DEBUG(1, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + /* Hide LAP header from IrLMP layer */ + skb_pull(skb, LAP_ADDR_HEADER+LAP_CTRL_HEADER); + + irlmp_link_unitdata_indication(self->notify.instance, skb); +} +#endif /* CONFIG_IRDA_ULTRA */ + +/* + * Function irlap_disconnect_request (void) + * + * Request to disconnect connection by service user + */ +void irlap_disconnect_request(struct irlap_cb *self) +{ + IRDA_DEBUG(3, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* Don't disconnect until all data frames are successfully sent */ + if (!skb_queue_empty(&self->txq)) { + self->disconnect_pending = TRUE; + return; + } + + /* Check if we are in the right state for disconnecting */ + switch (self->state) { + case LAP_XMIT_P: /* FALLTHROUGH */ + case LAP_XMIT_S: /* FALLTHROUGH */ + case LAP_CONN: /* FALLTHROUGH */ + case LAP_RESET_WAIT: /* FALLTHROUGH */ + case LAP_RESET_CHECK: + irlap_do_event(self, DISCONNECT_REQUEST, NULL, NULL); + break; + default: + IRDA_DEBUG(2, "%s(), disconnect pending!\n", __func__); + self->disconnect_pending = TRUE; + break; + } +} + +/* + * Function irlap_disconnect_indication (void) + * + * Disconnect request from other device + * + */ +void irlap_disconnect_indication(struct irlap_cb *self, LAP_REASON reason) +{ + IRDA_DEBUG(1, "%s(), reason=%s\n", __func__, lap_reasons[reason]); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* Flush queues */ + irlap_flush_all_queues(self); + + switch (reason) { + case LAP_RESET_INDICATION: + IRDA_DEBUG(1, "%s(), Sending reset request!\n", __func__); + irlap_do_event(self, RESET_REQUEST, NULL, NULL); + break; + case LAP_NO_RESPONSE: /* FALLTHROUGH */ + case LAP_DISC_INDICATION: /* FALLTHROUGH */ 
+ case LAP_FOUND_NONE: /* FALLTHROUGH */ + case LAP_MEDIA_BUSY: + irlmp_link_disconnect_indication(self->notify.instance, self, + reason, NULL); + break; + default: + IRDA_ERROR("%s: Unknown reason %d\n", __func__, reason); + } +} + +/* + * Function irlap_discovery_request (gen_addr_bit) + * + * Start one single discovery operation. + * + */ +void irlap_discovery_request(struct irlap_cb *self, discovery_t *discovery) +{ + struct irlap_info info; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + IRDA_ASSERT(discovery != NULL, return;); + + IRDA_DEBUG(4, "%s(), nslots = %d\n", __func__, discovery->nslots); + + IRDA_ASSERT((discovery->nslots == 1) || (discovery->nslots == 6) || + (discovery->nslots == 8) || (discovery->nslots == 16), + return;); + + /* Discovery is only possible in NDM mode */ + if (self->state != LAP_NDM) { + IRDA_DEBUG(4, "%s(), discovery only possible in NDM mode\n", + __func__); + irlap_discovery_confirm(self, NULL); + /* Note : in theory, if we are not in NDM, we could postpone + * the discovery like we do for connection request. + * In practice, it's not worth it. If the media was busy, + * it's likely next time around it won't be busy. If we are + * in REPLY state, we will get passive discovery info & event. + * Jean II */ + return; + } + + /* Check if last discovery request finished in time, or if + * it was aborted due to the media busy flag. */ + if (self->discovery_log != NULL) { + hashbin_delete(self->discovery_log, (FREE_FUNC) kfree); + self->discovery_log = NULL; + } + + /* All operations will occur at predictable time, no need to lock */ + self->discovery_log = hashbin_new(HB_NOLOCK); + + if (self->discovery_log == NULL) { + IRDA_WARNING("%s(), Unable to allocate discovery log!\n", + __func__); + return; + } + + info.S = discovery->nslots; /* Number of slots */ + info.s = 0; /* Current slot */ + + self->discovery_cmd = discovery; + info.discovery = discovery; + + /* sysctl_slot_timeout bounds are checked in irsysctl.c - Jean II */ + self->slot_timeout = sysctl_slot_timeout * HZ / 1000; + + irlap_do_event(self, DISCOVERY_REQUEST, NULL, &info); +} + +/* + * Function irlap_discovery_confirm (log) + * + * A device has been discovered in front of this station, we + * report directly to LMP. + */ +void irlap_discovery_confirm(struct irlap_cb *self, hashbin_t *discovery_log) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + IRDA_ASSERT(self->notify.instance != NULL, return;); + + /* + * Check for successful discovery, since we are then allowed to clear + * the media busy condition (IrLAP 6.13.4 - p.94). This should allow + * us to make connection attempts much faster and easier (i.e. no + * collisions). + * Setting media busy to false will also generate an event allowing + * to process pending events in NDM state machine. + * Note : the spec doesn't define what's a successful discovery is. + * If we want Ultra to work, it's successful even if there is + * nobody discovered - Jean II + */ + if (discovery_log) + irda_device_set_media_busy(self->netdev, FALSE); + + /* Inform IrLMP */ + irlmp_link_discovery_confirm(self->notify.instance, discovery_log); +} + +/* + * Function irlap_discovery_indication (log) + * + * Somebody is trying to discover us! 
+ * + */ +void irlap_discovery_indication(struct irlap_cb *self, discovery_t *discovery) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + IRDA_ASSERT(discovery != NULL, return;); + + IRDA_ASSERT(self->notify.instance != NULL, return;); + + /* A device is very likely to connect immediately after it performs + * a successful discovery. This means that in our case, we are much + * more likely to receive a connection request over the medium. + * So, we backoff to avoid collisions. + * IrLAP spec 6.13.4 suggest 100ms... + * Note : this little trick actually make a *BIG* difference. If I set + * my Linux box with discovery enabled and one Ultra frame sent every + * second, my Palm has no trouble connecting to it every time ! + * Jean II */ + irda_device_set_media_busy(self->netdev, SMALL); + + irlmp_link_discovery_indication(self->notify.instance, discovery); +} + +/* + * Function irlap_status_indication (quality_of_link) + */ +void irlap_status_indication(struct irlap_cb *self, int quality_of_link) +{ + switch (quality_of_link) { + case STATUS_NO_ACTIVITY: + IRDA_MESSAGE("IrLAP, no activity on link!\n"); + break; + case STATUS_NOISY: + IRDA_MESSAGE("IrLAP, noisy link!\n"); + break; + default: + break; + } + irlmp_status_indication(self->notify.instance, + quality_of_link, LOCK_NO_CHANGE); +} + +/* + * Function irlap_reset_indication (void) + */ +void irlap_reset_indication(struct irlap_cb *self) +{ + IRDA_DEBUG(1, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + if (self->state == LAP_RESET_WAIT) + irlap_do_event(self, RESET_REQUEST, NULL, NULL); + else + irlap_do_event(self, RESET_RESPONSE, NULL, NULL); +} + +/* + * Function irlap_reset_confirm (void) + */ +void irlap_reset_confirm(void) +{ + IRDA_DEBUG(1, "%s()\n", __func__); +} + +/* + * Function irlap_generate_rand_time_slot (S, s) + * + * Generate a random time slot between s and S-1 where + * S = Number of slots (0 -> S-1) + * s = Current slot + */ +int irlap_generate_rand_time_slot(int S, int s) +{ + static int rand; + int slot; + + IRDA_ASSERT((S - s) > 0, return 0;); + + rand += jiffies; + rand ^= (rand << 12); + rand ^= (rand >> 20); + + slot = s + rand % (S-s); + + IRDA_ASSERT((slot >= s) || (slot < S), return 0;); + + return slot; +} + +/* + * Function irlap_update_nr_received (nr) + * + * Remove all acknowledged frames in current window queue. This code is + * not intuitive and you should not try to change it. If you think it + * contains bugs, please mail a patch to the author instead. + */ +void irlap_update_nr_received(struct irlap_cb *self, int nr) +{ + struct sk_buff *skb = NULL; + int count = 0; + + /* + * Remove all the ack-ed frames from the window queue. + */ + + /* + * Optimize for the common case. It is most likely that the receiver + * will acknowledge all the frames we have sent! So in that case we + * delete all frames stored in window. 
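+	 * As an illustration (example values only): with va = 2 and vs = 5,
+	 * a received nr of 5 acks frames 3 and 4 in one go and va becomes 4,
+	 * while a received nr of 4 takes the else branch below instead,
+	 * freeing only frame 3 and leaving va at 3.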
+ */ + if (nr == self->vs) { + while ((skb = skb_dequeue(&self->wx_list)) != NULL) { + dev_kfree_skb(skb); + } + /* The last acked frame is the next to send minus one */ + self->va = nr - 1; + } else { + /* Remove all acknowledged frames in current window */ + while ((skb_peek(&self->wx_list) != NULL) && + (((self->va+1) % 8) != nr)) + { + skb = skb_dequeue(&self->wx_list); + dev_kfree_skb(skb); + + self->va = (self->va + 1) % 8; + count++; + } + } + + /* Advance window */ + self->window = self->window_size - skb_queue_len(&self->wx_list); +} + +/* + * Function irlap_validate_ns_received (ns) + * + * Validate the next to send (ns) field from received frame. + */ +int irlap_validate_ns_received(struct irlap_cb *self, int ns) +{ + /* ns as expected? */ + if (ns == self->vr) + return NS_EXPECTED; + /* + * Stations are allowed to treat invalid NS as unexpected NS + * IrLAP, Recv ... with-invalid-Ns. p. 84 + */ + return NS_UNEXPECTED; + + /* return NR_INVALID; */ +} +/* + * Function irlap_validate_nr_received (nr) + * + * Validate the next to receive (nr) field from received frame. + * + */ +int irlap_validate_nr_received(struct irlap_cb *self, int nr) +{ + /* nr as expected? */ + if (nr == self->vs) { + IRDA_DEBUG(4, "%s(), expected!\n", __func__); + return NR_EXPECTED; + } + + /* + * unexpected nr? (but within current window), first we check if the + * ns numbers of the frames in the current window wrap. + */ + if (self->va < self->vs) { + if ((nr >= self->va) && (nr <= self->vs)) + return NR_UNEXPECTED; + } else { + if ((nr >= self->va) || (nr <= self->vs)) + return NR_UNEXPECTED; + } + + /* Invalid nr! */ + return NR_INVALID; +} + +/* + * Function irlap_initiate_connection_state () + * + * Initialize the connection state parameters + * + */ +void irlap_initiate_connection_state(struct irlap_cb *self) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* Next to send and next to receive */ + self->vs = self->vr = 0; + + /* Last frame which got acked (0 - 1) % 8 */ + self->va = 7; + + self->window = 1; + + self->remote_busy = FALSE; + self->retry_count = 0; +} + +/* + * Function irlap_wait_min_turn_around (self, qos) + * + * Wait negotiated minimum turn around time, this function actually sets + * the number of BOS's that must be sent before the next transmitted + * frame in order to delay for the specified amount of time. This is + * done to avoid using timers, and the forbidden udelay! + */ +void irlap_wait_min_turn_around(struct irlap_cb *self, struct qos_info *qos) +{ + __u32 min_turn_time; + __u32 speed; + + /* Get QoS values. 
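+	 * (Illustrative figures only: at 9600 baud with 10 bits per
+	 * character the link carries roughly 960 chars/s, so a 10 ms min
+	 * turn time comes out at about 10 XBOF characters of delay below.)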
*/ + speed = qos->baud_rate.value; + min_turn_time = qos->min_turn_time.value; + + /* No need to calculate XBOFs for speeds over 115200 bps */ + if (speed > 115200) { + self->mtt_required = min_turn_time; + return; + } + + /* + * Send additional BOF's for the next frame for the requested + * min turn time, so now we must calculate how many chars (XBOF's) we + * must send for the requested time period (min turn time) + */ + self->xbofs_delay = irlap_min_turn_time_in_bytes(speed, min_turn_time); +} + +/* + * Function irlap_flush_all_queues (void) + * + * Flush all queues + * + */ +void irlap_flush_all_queues(struct irlap_cb *self) +{ + struct sk_buff* skb; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* Free transmission queue */ + while ((skb = skb_dequeue(&self->txq)) != NULL) + dev_kfree_skb(skb); + + while ((skb = skb_dequeue(&self->txq_ultra)) != NULL) + dev_kfree_skb(skb); + + /* Free sliding window buffered packets */ + while ((skb = skb_dequeue(&self->wx_list)) != NULL) + dev_kfree_skb(skb); +} + +/* + * Function irlap_setspeed (self, speed) + * + * Change the speed of the IrDA port + * + */ +static void irlap_change_speed(struct irlap_cb *self, __u32 speed, int now) +{ + struct sk_buff *skb; + + IRDA_DEBUG(0, "%s(), setting speed to %d\n", __func__, speed); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + self->speed = speed; + + /* Change speed now, or just piggyback speed on frames */ + if (now) { + /* Send down empty frame to trigger speed change */ + skb = alloc_skb(0, GFP_ATOMIC); + if (skb) + irlap_queue_xmit(self, skb); + } +} + +/* + * Function irlap_init_qos_capabilities (self, qos) + * + * Initialize QoS for this IrLAP session, What we do is to compute the + * intersection of the QoS capabilities for the user, driver and for + * IrLAP itself. Normally, IrLAP will not specify any values, but it can + * be used to restrict certain values. + */ +static void irlap_init_qos_capabilities(struct irlap_cb *self, + struct qos_info *qos_user) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + IRDA_ASSERT(self->netdev != NULL, return;); + + /* Start out with the maximum QoS support possible */ + irda_init_max_qos_capabilies(&self->qos_rx); + + /* Apply drivers QoS capabilities */ + irda_qos_compute_intersection(&self->qos_rx, self->qos_dev); + + /* + * Check for user supplied QoS parameters. The service user is only + * allowed to supply these values. We check each parameter since the + * user may not have set all of them. 
+ */ + if (qos_user) { + IRDA_DEBUG(1, "%s(), Found user specified QoS!\n", __func__); + + if (qos_user->baud_rate.bits) + self->qos_rx.baud_rate.bits &= qos_user->baud_rate.bits; + + if (qos_user->max_turn_time.bits) + self->qos_rx.max_turn_time.bits &= qos_user->max_turn_time.bits; + if (qos_user->data_size.bits) + self->qos_rx.data_size.bits &= qos_user->data_size.bits; + + if (qos_user->link_disc_time.bits) + self->qos_rx.link_disc_time.bits &= qos_user->link_disc_time.bits; + } + + /* Use 500ms in IrLAP for now */ + self->qos_rx.max_turn_time.bits &= 0x01; + + /* Set data size */ + /*self->qos_rx.data_size.bits &= 0x03;*/ + + irda_qos_bits_to_value(&self->qos_rx); +} + +/* + * Function irlap_apply_default_connection_parameters (void, now) + * + * Use the default connection and transmission parameters + */ +void irlap_apply_default_connection_parameters(struct irlap_cb *self) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* xbofs : Default value in NDM */ + self->next_bofs = 12; + self->bofs_count = 12; + + /* NDM Speed is 9600 */ + irlap_change_speed(self, 9600, TRUE); + + /* Set mbusy when going to NDM state */ + irda_device_set_media_busy(self->netdev, TRUE); + + /* + * Generate random connection address for this session, which must + * be 7 bits wide and different from 0x00 and 0xfe + */ + while ((self->caddr == 0x00) || (self->caddr == 0xfe)) { + get_random_bytes(&self->caddr, sizeof(self->caddr)); + self->caddr &= 0xfe; + } + + /* Use default values until connection has been negitiated */ + self->slot_timeout = sysctl_slot_timeout; + self->final_timeout = FINAL_TIMEOUT; + self->poll_timeout = POLL_TIMEOUT; + self->wd_timeout = WD_TIMEOUT; + + /* Set some default values */ + self->qos_tx.baud_rate.value = 9600; + self->qos_rx.baud_rate.value = 9600; + self->qos_tx.max_turn_time.value = 0; + self->qos_rx.max_turn_time.value = 0; + self->qos_tx.min_turn_time.value = 0; + self->qos_rx.min_turn_time.value = 0; + self->qos_tx.data_size.value = 64; + self->qos_rx.data_size.value = 64; + self->qos_tx.window_size.value = 1; + self->qos_rx.window_size.value = 1; + self->qos_tx.additional_bofs.value = 12; + self->qos_rx.additional_bofs.value = 12; + self->qos_tx.link_disc_time.value = 0; + self->qos_rx.link_disc_time.value = 0; + + irlap_flush_all_queues(self); + + self->disconnect_pending = FALSE; + self->connect_pending = FALSE; +} + +/* + * Function irlap_apply_connection_parameters (qos, now) + * + * Initialize IrLAP with the negotiated QoS values + * + * If 'now' is false, the speed and xbofs will be changed after the next + * frame is sent. 
+ * If 'now' is true, the speed and xbofs is changed immediately + */ +void irlap_apply_connection_parameters(struct irlap_cb *self, int now) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* Set the negotiated xbofs value */ + self->next_bofs = self->qos_tx.additional_bofs.value; + if (now) + self->bofs_count = self->next_bofs; + + /* Set the negotiated link speed (may need the new xbofs value) */ + irlap_change_speed(self, self->qos_tx.baud_rate.value, now); + + self->window_size = self->qos_tx.window_size.value; + self->window = self->qos_tx.window_size.value; + +#ifdef CONFIG_IRDA_DYNAMIC_WINDOW + /* + * Calculate how many bytes it is possible to transmit before the + * link must be turned around + */ + self->line_capacity = + irlap_max_line_capacity(self->qos_tx.baud_rate.value, + self->qos_tx.max_turn_time.value); + self->bytes_left = self->line_capacity; +#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ + + + /* + * Initialize timeout values, some of the rules are listed on + * page 92 in IrLAP. + */ + IRDA_ASSERT(self->qos_tx.max_turn_time.value != 0, return;); + IRDA_ASSERT(self->qos_rx.max_turn_time.value != 0, return;); + /* The poll timeout applies only to the primary station. + * It defines the maximum time the primary stay in XMIT mode + * before timeout and turning the link around (sending a RR). + * Or, this is how much we can keep the pf bit in primary mode. + * Therefore, it must be lower or equal than our *OWN* max turn around. + * Jean II */ + self->poll_timeout = self->qos_tx.max_turn_time.value * HZ / 1000; + /* The Final timeout applies only to the primary station. + * It defines the maximum time the primary wait (mostly in RECV mode) + * for an answer from the secondary station before polling it again. + * Therefore, it must be greater or equal than our *PARTNER* + * max turn around time - Jean II */ + self->final_timeout = self->qos_rx.max_turn_time.value * HZ / 1000; + /* The Watchdog Bit timeout applies only to the secondary station. + * It defines the maximum time the secondary wait (mostly in RECV mode) + * for poll from the primary station before getting annoyed. + * Therefore, it must be greater or equal than our *PARTNER* + * max turn around time - Jean II */ + self->wd_timeout = self->final_timeout * 2; + + /* + * N1 and N2 are maximum retry count for *both* the final timer + * and the wd timer (with a factor 2) as defined above. + * After N1 retry of a timer, we give a warning to the user. + * After N2 retry, we consider the link dead and disconnect it. + * Jean II + */ + + /* + * Set N1 to 0 if Link Disconnect/Threshold Time = 3 and set it to + * 3 seconds otherwise. See page 71 in IrLAP for more details. + * Actually, it's not always 3 seconds, as we allow to set + * it via sysctl... Max maxtt is 500ms, and N1 need to be multiple + * of 2, so 1 second is minimum we can allow. - Jean II + */ + if (self->qos_tx.link_disc_time.value == sysctl_warn_noreply_time) + /* + * If we set N1 to 0, it will trigger immediately, which is + * not what we want. 
What we really want is to disable it, + * Jean II + */ + self->N1 = -2; /* Disable - Need to be multiple of 2*/ + else + self->N1 = sysctl_warn_noreply_time * 1000 / + self->qos_rx.max_turn_time.value; + + IRDA_DEBUG(4, "Setting N1 = %d\n", self->N1); + + /* Set N2 to match our own disconnect time */ + self->N2 = self->qos_tx.link_disc_time.value * 1000 / + self->qos_rx.max_turn_time.value; + IRDA_DEBUG(4, "Setting N2 = %d\n", self->N2); +} + +#ifdef CONFIG_PROC_FS +struct irlap_iter_state { + int id; +}; + +static void *irlap_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct irlap_iter_state *iter = seq->private; + struct irlap_cb *self; + + /* Protect our access to the tsap list */ + spin_lock_irq(&irlap->hb_spinlock); + iter->id = 0; + + for (self = (struct irlap_cb *) hashbin_get_first(irlap); + self; self = (struct irlap_cb *) hashbin_get_next(irlap)) { + if (iter->id == *pos) + break; + ++iter->id; + } + + return self; +} + +static void *irlap_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct irlap_iter_state *iter = seq->private; + + ++*pos; + ++iter->id; + return (void *) hashbin_get_next(irlap); +} + +static void irlap_seq_stop(struct seq_file *seq, void *v) +{ + spin_unlock_irq(&irlap->hb_spinlock); +} + +static int irlap_seq_show(struct seq_file *seq, void *v) +{ + const struct irlap_iter_state *iter = seq->private; + const struct irlap_cb *self = v; + + IRDA_ASSERT(self->magic == LAP_MAGIC, return -EINVAL;); + + seq_printf(seq, "irlap%d ", iter->id); + seq_printf(seq, "state: %s\n", + irlap_state[self->state]); + + seq_printf(seq, " device name: %s, ", + (self->netdev) ? self->netdev->name : "bug"); + seq_printf(seq, "hardware name: %s\n", self->hw_name); + + seq_printf(seq, " caddr: %#02x, ", self->caddr); + seq_printf(seq, "saddr: %#08x, ", self->saddr); + seq_printf(seq, "daddr: %#08x\n", self->daddr); + + seq_printf(seq, " win size: %d, ", + self->window_size); + seq_printf(seq, "win: %d, ", self->window); +#ifdef CONFIG_IRDA_DYNAMIC_WINDOW + seq_printf(seq, "line capacity: %d, ", + self->line_capacity); + seq_printf(seq, "bytes left: %d\n", self->bytes_left); +#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ + seq_printf(seq, " tx queue len: %d ", + skb_queue_len(&self->txq)); + seq_printf(seq, "win queue len: %d ", + skb_queue_len(&self->wx_list)); + seq_printf(seq, "rbusy: %s", self->remote_busy ? + "TRUE" : "FALSE"); + seq_printf(seq, " mbusy: %s\n", self->media_busy ? 
+ "TRUE" : "FALSE"); + + seq_printf(seq, " retrans: %d ", self->retry_count); + seq_printf(seq, "vs: %d ", self->vs); + seq_printf(seq, "vr: %d ", self->vr); + seq_printf(seq, "va: %d\n", self->va); + + seq_printf(seq, " qos\tbps\tmaxtt\tdsize\twinsize\taddbofs\tmintt\tldisc\tcomp\n"); + + seq_printf(seq, " tx\t%d\t", + self->qos_tx.baud_rate.value); + seq_printf(seq, "%d\t", + self->qos_tx.max_turn_time.value); + seq_printf(seq, "%d\t", + self->qos_tx.data_size.value); + seq_printf(seq, "%d\t", + self->qos_tx.window_size.value); + seq_printf(seq, "%d\t", + self->qos_tx.additional_bofs.value); + seq_printf(seq, "%d\t", + self->qos_tx.min_turn_time.value); + seq_printf(seq, "%d\t", + self->qos_tx.link_disc_time.value); + seq_printf(seq, "\n"); + + seq_printf(seq, " rx\t%d\t", + self->qos_rx.baud_rate.value); + seq_printf(seq, "%d\t", + self->qos_rx.max_turn_time.value); + seq_printf(seq, "%d\t", + self->qos_rx.data_size.value); + seq_printf(seq, "%d\t", + self->qos_rx.window_size.value); + seq_printf(seq, "%d\t", + self->qos_rx.additional_bofs.value); + seq_printf(seq, "%d\t", + self->qos_rx.min_turn_time.value); + seq_printf(seq, "%d\n", + self->qos_rx.link_disc_time.value); + + return 0; +} + +static const struct seq_operations irlap_seq_ops = { + .start = irlap_seq_start, + .next = irlap_seq_next, + .stop = irlap_seq_stop, + .show = irlap_seq_show, +}; + +static int irlap_seq_open(struct inode *inode, struct file *file) +{ + if (irlap == NULL) + return -EINVAL; + + return seq_open_private(file, &irlap_seq_ops, + sizeof(struct irlap_iter_state)); +} + +const struct file_operations irlap_seq_fops = { + .owner = THIS_MODULE, + .open = irlap_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif /* CONFIG_PROC_FS */ diff --git a/net/irda/irlap_event.c b/net/irda/irlap_event.c new file mode 100644 index 00000000..ccd214f9 --- /dev/null +++ b/net/irda/irlap_event.c @@ -0,0 +1,2330 @@ +/********************************************************************* + * + * Filename: irlap_event.c + * Version: 0.9 + * Description: IrLAP state machine implementation + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sat Aug 16 00:59:29 1997 + * Modified at: Sat Dec 25 21:07:57 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-2000 Dag Brattli , + * Copyright (c) 1998 Thomas Davis + * All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include /* irlmp_flow_indication(), ... 
*/ + +#include + +#ifdef CONFIG_IRDA_FAST_RR +int sysctl_fast_poll_increase = 50; +#endif + +static int irlap_state_ndm (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_query (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_reply (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_conn (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_setup (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_offline(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_xmit_p (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_pclose (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_nrm_p (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_reset_wait(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_reset (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_nrm_s (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_xmit_s (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_sclose (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_reset_check(struct irlap_cb *, IRLAP_EVENT event, + struct sk_buff *, struct irlap_info *); + +#ifdef CONFIG_IRDA_DEBUG +static const char *const irlap_event[] = { + "DISCOVERY_REQUEST", + "CONNECT_REQUEST", + "CONNECT_RESPONSE", + "DISCONNECT_REQUEST", + "DATA_REQUEST", + "RESET_REQUEST", + "RESET_RESPONSE", + "SEND_I_CMD", + "SEND_UI_FRAME", + "RECV_DISCOVERY_XID_CMD", + "RECV_DISCOVERY_XID_RSP", + "RECV_SNRM_CMD", + "RECV_TEST_CMD", + "RECV_TEST_RSP", + "RECV_UA_RSP", + "RECV_DM_RSP", + "RECV_RD_RSP", + "RECV_I_CMD", + "RECV_I_RSP", + "RECV_UI_FRAME", + "RECV_FRMR_RSP", + "RECV_RR_CMD", + "RECV_RR_RSP", + "RECV_RNR_CMD", + "RECV_RNR_RSP", + "RECV_REJ_CMD", + "RECV_REJ_RSP", + "RECV_SREJ_CMD", + "RECV_SREJ_RSP", + "RECV_DISC_CMD", + "SLOT_TIMER_EXPIRED", + "QUERY_TIMER_EXPIRED", + "FINAL_TIMER_EXPIRED", + "POLL_TIMER_EXPIRED", + "DISCOVERY_TIMER_EXPIRED", + "WD_TIMER_EXPIRED", + "BACKOFF_TIMER_EXPIRED", + "MEDIA_BUSY_TIMER_EXPIRED", +}; +#endif /* CONFIG_IRDA_DEBUG */ + +const char *const irlap_state[] = { + "LAP_NDM", + "LAP_QUERY", + "LAP_REPLY", + "LAP_CONN", + "LAP_SETUP", + "LAP_OFFLINE", + "LAP_XMIT_P", + "LAP_PCLOSE", + "LAP_NRM_P", + "LAP_RESET_WAIT", + "LAP_RESET", + "LAP_NRM_S", + "LAP_XMIT_S", + "LAP_SCLOSE", + "LAP_RESET_CHECK", +}; + +static int (*state[])(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) = +{ + irlap_state_ndm, + irlap_state_query, + irlap_state_reply, + irlap_state_conn, + irlap_state_setup, + irlap_state_offline, + irlap_state_xmit_p, + irlap_state_pclose, + irlap_state_nrm_p, + irlap_state_reset_wait, + irlap_state_reset, + irlap_state_nrm_s, + irlap_state_xmit_s, + irlap_state_sclose, + irlap_state_reset_check, +}; + 
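+/*
+ * Editor's note, illustrative only and not part of the original driver:
+ * the array above is a table-driven state machine. The current state is
+ * used as an index, and irlap_do_event() dispatches with
+ * (*state[self->state])(self, event, skb, info). A minimal self-contained
+ * sketch of the same pattern, using hypothetical names and kept under
+ * #if 0 so it is never compiled, looks like this:
+ */
+#if 0
+enum demo_state { DEMO_IDLE, DEMO_BUSY, DEMO_NUM_STATES };
+enum demo_event { DEMO_START, DEMO_STOP };
+
+struct demo_ctx {
+	enum demo_state state;
+};
+
+static int demo_idle(struct demo_ctx *ctx, enum demo_event event)
+{
+	if (event == DEMO_START)
+		ctx->state = DEMO_BUSY;	/* transition, like irlap_next_state() */
+	return 0;
+}
+
+static int demo_busy(struct demo_ctx *ctx, enum demo_event event)
+{
+	if (event == DEMO_STOP)
+		ctx->state = DEMO_IDLE;
+	return 0;
+}
+
+/* One handler per state, indexed by the state value, as in state[] above */
+static int (*demo_table[DEMO_NUM_STATES])(struct demo_ctx *, enum demo_event) = {
+	demo_idle,
+	demo_busy,
+};
+
+static int demo_do_event(struct demo_ctx *ctx, enum demo_event event)
+{
+	/* Dispatch through the table, exactly as irlap_do_event() does */
+	return (*demo_table[ctx->state])(ctx, event);
+}
+#endif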
+/* + * Function irda_poll_timer_expired (data) + * + * Poll timer has expired. Normally we must now send a RR frame to the + * remote device + */ +static void irlap_poll_timer_expired(void *data) +{ + struct irlap_cb *self = (struct irlap_cb *) data; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + irlap_do_event(self, POLL_TIMER_EXPIRED, NULL, NULL); +} + +/* + * Calculate and set time before we will have to send back the pf bit + * to the peer. Use in primary. + * Make sure that state is XMIT_P/XMIT_S when calling this function + * (and that nobody messed up with the state). - Jean II + */ +static void irlap_start_poll_timer(struct irlap_cb *self, int timeout) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + +#ifdef CONFIG_IRDA_FAST_RR + /* + * Send out the RR frames faster if our own transmit queue is empty, or + * if the peer is busy. The effect is a much faster conversation + */ + if (skb_queue_empty(&self->txq) || self->remote_busy) { + if (self->fast_RR == TRUE) { + /* + * Assert that the fast poll timer has not reached the + * normal poll timer yet + */ + if (self->fast_RR_timeout < timeout) { + /* + * FIXME: this should be a more configurable + * function + */ + self->fast_RR_timeout += + (sysctl_fast_poll_increase * HZ/1000); + + /* Use this fast(er) timeout instead */ + timeout = self->fast_RR_timeout; + } + } else { + self->fast_RR = TRUE; + + /* Start with just 0 ms */ + self->fast_RR_timeout = 0; + timeout = 0; + } + } else + self->fast_RR = FALSE; + + IRDA_DEBUG(3, "%s(), timeout=%d (%ld)\n", __func__, timeout, jiffies); +#endif /* CONFIG_IRDA_FAST_RR */ + + if (timeout == 0) + irlap_do_event(self, POLL_TIMER_EXPIRED, NULL, NULL); + else + irda_start_timer(&self->poll_timer, timeout, self, + irlap_poll_timer_expired); +} + +/* + * Function irlap_do_event (event, skb, info) + * + * Rushes through the state machine without any delay. If state == XMIT + * then send queued data frames. + */ +void irlap_do_event(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + int ret; + + if (!self || self->magic != LAP_MAGIC) + return; + + IRDA_DEBUG(3, "%s(), event = %s, state = %s\n", __func__, + irlap_event[event], irlap_state[self->state]); + + ret = (*state[self->state])(self, event, skb, info); + + /* + * Check if there are any pending events that needs to be executed + */ + switch (self->state) { + case LAP_XMIT_P: /* FALLTHROUGH */ + case LAP_XMIT_S: + /* + * We just received the pf bit and are at the beginning + * of a new LAP transmit window. + * Check if there are any queued data frames, and do not + * try to disconnect link if we send any data frames, since + * that will change the state away form XMIT + */ + IRDA_DEBUG(2, "%s() : queue len = %d\n", __func__, + skb_queue_len(&self->txq)); + + if (!skb_queue_empty(&self->txq)) { + /* Prevent race conditions with irlap_data_request() */ + self->local_busy = TRUE; + + /* Theory of operation. + * We send frames up to when we fill the window or + * reach line capacity. Those frames will queue up + * in the device queue, and the driver will slowly + * send them. + * After each frame that we send, we poll the higher + * layer for more data. It's the right time to do + * that because the link layer need to perform the mtt + * and then send the first frame, so we can afford + * to send a bit of time in kernel space. 
+ * The explicit flow indication allow to minimise + * buffers (== lower latency), to avoid higher layer + * polling via timers (== less context switches) and + * to implement a crude scheduler - Jean II */ + + /* Try to send away all queued data frames */ + while ((skb = skb_dequeue(&self->txq)) != NULL) { + /* Send one frame */ + ret = (*state[self->state])(self, SEND_I_CMD, + skb, NULL); + /* Drop reference count. + * It will be increase as needed in + * irlap_send_data_xxx() */ + kfree_skb(skb); + + /* Poll the higher layers for one more frame */ + irlmp_flow_indication(self->notify.instance, + FLOW_START); + + if (ret == -EPROTO) + break; /* Try again later! */ + } + /* Finished transmitting */ + self->local_busy = FALSE; + } else if (self->disconnect_pending) { + self->disconnect_pending = FALSE; + + ret = (*state[self->state])(self, DISCONNECT_REQUEST, + NULL, NULL); + } + break; +/* case LAP_NDM: */ +/* case LAP_CONN: */ +/* case LAP_RESET_WAIT: */ +/* case LAP_RESET_CHECK: */ + default: + break; + } +} + +/* + * Function irlap_state_ndm (event, skb, frame) + * + * NDM (Normal Disconnected Mode) state + * + */ +static int irlap_state_ndm(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + discovery_t *discovery_rsp; + int ret = 0; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + switch (event) { + case CONNECT_REQUEST: + IRDA_ASSERT(self->netdev != NULL, return -1;); + + if (self->media_busy) { + /* Note : this will never happen, because we test + * media busy in irlap_connect_request() and + * postpone the event... - Jean II */ + IRDA_DEBUG(0, "%s(), CONNECT_REQUEST: media busy!\n", + __func__); + + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + irlap_disconnect_indication(self, LAP_MEDIA_BUSY); + } else { + irlap_send_snrm_frame(self, &self->qos_rx); + + /* Start Final-bit timer */ + irlap_start_final_timer(self, self->final_timeout); + + self->retry_count = 0; + irlap_next_state(self, LAP_SETUP); + } + break; + case RECV_SNRM_CMD: + /* Check if the frame contains and I field */ + if (info) { + self->daddr = info->daddr; + self->caddr = info->caddr; + + irlap_next_state(self, LAP_CONN); + + irlap_connect_indication(self, skb); + } else { + IRDA_DEBUG(0, "%s(), SNRM frame does not " + "contain an I field!\n", __func__); + } + break; + case DISCOVERY_REQUEST: + IRDA_ASSERT(info != NULL, return -1;); + + if (self->media_busy) { + IRDA_DEBUG(1, "%s(), DISCOVERY_REQUEST: media busy!\n", + __func__); + /* irlap->log.condition = MEDIA_BUSY; */ + + /* This will make IrLMP try again */ + irlap_discovery_confirm(self, NULL); + /* Note : the discovery log is not cleaned up here, + * it will be done in irlap_discovery_request() + * Jean II */ + return 0; + } + + self->S = info->S; + self->s = info->s; + irlap_send_discovery_xid_frame(self, info->S, info->s, TRUE, + info->discovery); + self->frame_sent = FALSE; + self->s++; + + irlap_start_slot_timer(self, self->slot_timeout); + irlap_next_state(self, LAP_QUERY); + break; + case RECV_DISCOVERY_XID_CMD: + IRDA_ASSERT(info != NULL, return -1;); + + /* Assert that this is not the final slot */ + if (info->s <= info->S) { + self->slot = irlap_generate_rand_time_slot(info->S, + info->s); + if (self->slot == info->s) { + discovery_rsp = irlmp_get_discovery_response(); + discovery_rsp->data.daddr = info->daddr; + + irlap_send_discovery_xid_frame(self, info->S, + self->slot, + FALSE, + discovery_rsp); + 
self->frame_sent = TRUE; + } else + self->frame_sent = FALSE; + + /* + * Go to reply state until end of discovery to + * inhibit our own transmissions. Set the timer + * to not stay forever there... Jean II + */ + irlap_start_query_timer(self, info->S, info->s); + irlap_next_state(self, LAP_REPLY); + } else { + /* This is the final slot. How is it possible ? + * This would happen is both discoveries are just slightly + * offset (if they are in sync, all packets are lost). + * Most often, all the discovery requests will be received + * in QUERY state (see my comment there), except for the + * last frame that will come here. + * The big trouble when it happen is that active discovery + * doesn't happen, because nobody answer the discoveries + * frame of the other guy, so the log shows up empty. + * What should we do ? + * Not much. It's too late to answer those discovery frames, + * so we just pass the info to IrLMP who will put it in the + * log (and post an event). + * Another cause would be devices that do discovery much + * slower than us, however the latest fixes should minimise + * those cases... + * Jean II + */ + IRDA_DEBUG(1, "%s(), Receiving final discovery request, missed the discovery slots :-(\n", __func__); + + /* Last discovery request -> in the log */ + irlap_discovery_indication(self, info->discovery); + } + break; + case MEDIA_BUSY_TIMER_EXPIRED: + /* A bunch of events may be postponed because the media is + * busy (usually immediately after we close a connection), + * or while we are doing discovery (state query/reply). + * In all those cases, the media busy flag will be cleared + * when it's OK for us to process those postponed events. + * This event is not mentioned in the state machines in the + * IrLAP spec. It's because they didn't consider Ultra and + * postponing connection request is optional. + * Jean II */ +#ifdef CONFIG_IRDA_ULTRA + /* Send any pending Ultra frames if any */ + if (!skb_queue_empty(&self->txq_ultra)) { + /* We don't send the frame, just post an event. + * Also, previously this code was in timer.c... + * Jean II */ + ret = (*state[self->state])(self, SEND_UI_FRAME, + NULL, NULL); + } +#endif /* CONFIG_IRDA_ULTRA */ + /* Check if we should try to connect. + * This code was previously in irlap_do_event() */ + if (self->connect_pending) { + self->connect_pending = FALSE; + + /* This one *should* not pend in this state, except + * if a socket try to connect and immediately + * disconnect. - clear - Jean II */ + if (self->disconnect_pending) + irlap_disconnect_indication(self, LAP_DISC_INDICATION); + else + ret = (*state[self->state])(self, + CONNECT_REQUEST, + NULL, NULL); + self->disconnect_pending = FALSE; + } + /* Note : one way to test if this code works well (including + * media busy and small busy) is to create a user space + * application generating an Ultra packet every 3.05 sec (or + * 2.95 sec) and to see how it interact with discovery. + * It's fairly easy to check that no packet is lost, that the + * packets are postponed during discovery and that after + * discovery indication you have a 100ms "gap". + * As connection request and Ultra are now processed the same + * way, this avoid the tedious job of trying IrLAP connection + * in all those cases... 
+ * Jean II */ + break; +#ifdef CONFIG_IRDA_ULTRA + case SEND_UI_FRAME: + { + int i; + /* Only allowed to repeat an operation twice */ + for (i=0; ((i<2) && (self->media_busy == FALSE)); i++) { + skb = skb_dequeue(&self->txq_ultra); + if (skb) + irlap_send_ui_frame(self, skb, CBROADCAST, + CMD_FRAME); + else + break; + /* irlap_send_ui_frame() won't increase skb reference + * count, so no dev_kfree_skb() - Jean II */ + } + if (i == 2) { + /* Force us to listen 500 ms again */ + irda_device_set_media_busy(self->netdev, TRUE); + } + break; + } + case RECV_UI_FRAME: + /* Only accept broadcast frames in NDM mode */ + if (info->caddr != CBROADCAST) { + IRDA_DEBUG(0, "%s(), not a broadcast frame!\n", + __func__); + } else + irlap_unitdata_indication(self, skb); + break; +#endif /* CONFIG_IRDA_ULTRA */ + case RECV_TEST_CMD: + /* Remove test frame header */ + skb_pull(skb, sizeof(struct test_frame)); + + /* + * Send response. This skb will not be sent out again, and + * will only be used to send out the same info as the cmd + */ + irlap_send_test_frame(self, CBROADCAST, info->daddr, skb); + break; + case RECV_TEST_RSP: + IRDA_DEBUG(0, "%s() not implemented!\n", __func__); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %s\n", __func__, + irlap_event[event]); + + ret = -1; + break; + } + return ret; +} + +/* + * Function irlap_state_query (event, skb, info) + * + * QUERY state + * + */ +static int irlap_state_query(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + int ret = 0; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + switch (event) { + case RECV_DISCOVERY_XID_RSP: + IRDA_ASSERT(info != NULL, return -1;); + IRDA_ASSERT(info->discovery != NULL, return -1;); + + IRDA_DEBUG(4, "%s(), daddr=%08x\n", __func__, + info->discovery->data.daddr); + + if (!self->discovery_log) { + IRDA_WARNING("%s: discovery log is gone! " + "maybe the discovery timeout has been set" + " too short?\n", __func__); + break; + } + hashbin_insert(self->discovery_log, + (irda_queue_t *) info->discovery, + info->discovery->data.daddr, NULL); + + /* Keep state */ + /* irlap_next_state(self, LAP_QUERY); */ + + break; + case RECV_DISCOVERY_XID_CMD: + /* Yes, it is possible to receive those frames in this mode. + * Note that most often the last discovery request won't + * occur here but in NDM state (see my comment there). + * What should we do ? + * Not much. We are currently performing our own discovery, + * therefore we can't answer those frames. We don't want + * to change state either. We just pass the info to + * IrLMP who will put it in the log (and post an event). + * Jean II + */ + + IRDA_ASSERT(info != NULL, return -1;); + + IRDA_DEBUG(1, "%s(), Receiving discovery request (s = %d) while performing discovery :-(\n", __func__, info->s); + + /* Last discovery request ? */ + if (info->s == 0xff) + irlap_discovery_indication(self, info->discovery); + break; + case SLOT_TIMER_EXPIRED: + /* + * Wait a little longer if we detect an incoming frame. This + * is not mentioned in the spec, but is a good thing to do, + * since we want to work even with devices that violate the + * timing requirements. 
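+		 * The extra wait granted below is a single additional 10 ms
+		 * slot timer, gated by the add_wait flag so that it can be
+		 * taken at most once before we move on.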
+ */ + if (irda_device_is_receiving(self->netdev) && !self->add_wait) { + IRDA_DEBUG(2, "%s(), device is slow to answer, " + "waiting some more!\n", __func__); + irlap_start_slot_timer(self, msecs_to_jiffies(10)); + self->add_wait = TRUE; + return ret; + } + self->add_wait = FALSE; + + if (self->s < self->S) { + irlap_send_discovery_xid_frame(self, self->S, + self->s, TRUE, + self->discovery_cmd); + self->s++; + irlap_start_slot_timer(self, self->slot_timeout); + + /* Keep state */ + irlap_next_state(self, LAP_QUERY); + } else { + /* This is the final slot! */ + irlap_send_discovery_xid_frame(self, self->S, 0xff, + TRUE, + self->discovery_cmd); + + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + /* + * We are now finished with the discovery procedure, + * so now we must return the results + */ + irlap_discovery_confirm(self, self->discovery_log); + + /* IrLMP should now have taken care of the log */ + self->discovery_log = NULL; + } + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %s\n", __func__, + irlap_event[event]); + + ret = -1; + break; + } + return ret; +} + +/* + * Function irlap_state_reply (self, event, skb, info) + * + * REPLY, we have received a XID discovery frame from a device and we + * are waiting for the right time slot to send a response XID frame + * + */ +static int irlap_state_reply(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + discovery_t *discovery_rsp; + int ret=0; + + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + switch (event) { + case QUERY_TIMER_EXPIRED: + IRDA_DEBUG(0, "%s(), QUERY_TIMER_EXPIRED <%ld>\n", + __func__, jiffies); + irlap_next_state(self, LAP_NDM); + break; + case RECV_DISCOVERY_XID_CMD: + IRDA_ASSERT(info != NULL, return -1;); + /* Last frame? */ + if (info->s == 0xff) { + del_timer(&self->query_timer); + + /* info->log.condition = REMOTE; */ + + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + irlap_discovery_indication(self, info->discovery); + } else { + /* If it's our slot, send our reply */ + if ((info->s >= self->slot) && (!self->frame_sent)) { + discovery_rsp = irlmp_get_discovery_response(); + discovery_rsp->data.daddr = info->daddr; + + irlap_send_discovery_xid_frame(self, info->S, + self->slot, + FALSE, + discovery_rsp); + + self->frame_sent = TRUE; + } + /* Readjust our timer to accommodate devices + * doing faster or slower discovery than us... 
+ * Jean II */ + irlap_start_query_timer(self, info->S, info->s); + + /* Keep state */ + //irlap_next_state(self, LAP_REPLY); + } + break; + default: + IRDA_DEBUG(1, "%s(), Unknown event %d, %s\n", __func__, + event, irlap_event[event]); + + ret = -1; + break; + } + return ret; +} + +/* + * Function irlap_state_conn (event, skb, info) + * + * CONN, we have received a SNRM command and is waiting for the upper + * layer to accept or refuse connection + * + */ +static int irlap_state_conn(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + int ret = 0; + + IRDA_DEBUG(4, "%s(), event=%s\n", __func__, irlap_event[ event]); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + switch (event) { + case CONNECT_RESPONSE: + skb_pull(skb, sizeof(struct snrm_frame)); + + IRDA_ASSERT(self->netdev != NULL, return -1;); + + irlap_qos_negotiate(self, skb); + + irlap_initiate_connection_state(self); + + /* + * Applying the parameters now will make sure we change speed + * *after* we have sent the next frame + */ + irlap_apply_connection_parameters(self, FALSE); + + /* + * Sending this frame will force a speed change after it has + * been sent (i.e. the frame will be sent at 9600). + */ + irlap_send_ua_response_frame(self, &self->qos_rx); + +#if 0 + /* + * We are allowed to send two frames, but this may increase + * the connect latency, so lets not do it for now. + */ + /* This is full of good intentions, but doesn't work in + * practice. + * After sending the first UA response, we switch the + * dongle to the negotiated speed, which is usually + * different than 9600 kb/s. + * From there, there is two solutions : + * 1) The other end has received the first UA response : + * it will set up the connection, move to state LAP_NRM_P, + * and will ignore and drop the second UA response. + * Actually, it's even worse : the other side will almost + * immediately send a RR that will likely collide with the + * UA response (depending on negotiated turnaround). + * 2) The other end has not received the first UA response, + * will stay at 9600 and will never see the second UA response. + * Jean II */ + irlap_send_ua_response_frame(self, &self->qos_rx); +#endif + + /* + * The WD-timer could be set to the duration of the P-timer + * for this case, but it is recommended to use twice the + * value (note 3 IrLAP p. 60). + */ + irlap_start_wd_timer(self, self->wd_timeout); + irlap_next_state(self, LAP_NRM_S); + + break; + case RECV_DISCOVERY_XID_CMD: + IRDA_DEBUG(3, "%s(), event RECV_DISCOVER_XID_CMD!\n", + __func__); + irlap_next_state(self, LAP_NDM); + + break; + case DISCONNECT_REQUEST: + IRDA_DEBUG(0, "%s(), Disconnect request!\n", __func__); + irlap_send_dm_frame(self); + irlap_next_state( self, LAP_NDM); + irlap_disconnect_indication(self, LAP_DISC_INDICATION); + break; + default: + IRDA_DEBUG(1, "%s(), Unknown event %d, %s\n", __func__, + event, irlap_event[event]); + + ret = -1; + break; + } + + return ret; +} + +/* + * Function irlap_state_setup (event, skb, frame) + * + * SETUP state, The local layer has transmitted a SNRM command frame to + * a remote peer layer and is awaiting a reply . 
+ * + */ +static int irlap_state_setup(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + int ret = 0; + + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + switch (event) { + case FINAL_TIMER_EXPIRED: + if (self->retry_count < self->N3) { +/* + * Perform random backoff, Wait a random number of time units, minimum + * duration half the time taken to transmitt a SNRM frame, maximum duration + * 1.5 times the time taken to transmit a SNRM frame. So this time should + * between 15 msecs and 45 msecs. + */ + irlap_start_backoff_timer(self, msecs_to_jiffies(20 + + (jiffies % 30))); + } else { + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + irlap_disconnect_indication(self, LAP_FOUND_NONE); + } + break; + case BACKOFF_TIMER_EXPIRED: + irlap_send_snrm_frame(self, &self->qos_rx); + irlap_start_final_timer(self, self->final_timeout); + self->retry_count++; + break; + case RECV_SNRM_CMD: + IRDA_DEBUG(4, "%s(), SNRM battle!\n", __func__); + + IRDA_ASSERT(skb != NULL, return 0;); + IRDA_ASSERT(info != NULL, return 0;); + + /* + * The device with the largest device address wins the battle + * (both have sent a SNRM command!) + */ + if (info &&(info->daddr > self->saddr)) { + del_timer(&self->final_timer); + irlap_initiate_connection_state(self); + + IRDA_ASSERT(self->netdev != NULL, return -1;); + + skb_pull(skb, sizeof(struct snrm_frame)); + + irlap_qos_negotiate(self, skb); + + /* Send UA frame and then change link settings */ + irlap_apply_connection_parameters(self, FALSE); + irlap_send_ua_response_frame(self, &self->qos_rx); + + irlap_next_state(self, LAP_NRM_S); + irlap_connect_confirm(self, skb); + + /* + * The WD-timer could be set to the duration of the + * P-timer for this case, but it is recommended + * to use twice the value (note 3 IrLAP p. 60). + */ + irlap_start_wd_timer(self, self->wd_timeout); + } else { + /* We just ignore the other device! */ + irlap_next_state(self, LAP_SETUP); + } + break; + case RECV_UA_RSP: + /* Stop F-timer */ + del_timer(&self->final_timer); + + /* Initiate connection state */ + irlap_initiate_connection_state(self); + + /* Negotiate connection parameters */ + IRDA_ASSERT(skb->len > 10, return -1;); + + skb_pull(skb, sizeof(struct ua_frame)); + + IRDA_ASSERT(self->netdev != NULL, return -1;); + + irlap_qos_negotiate(self, skb); + + /* Set the new link setting *now* (before the rr frame) */ + irlap_apply_connection_parameters(self, TRUE); + self->retry_count = 0; + + /* Wait for turnaround time to give a chance to the other + * device to be ready to receive us. + * Note : the time to switch speed is typically larger + * than the turnaround time, but as we don't have the other + * side speed switch time, that's our best guess... + * Jean II */ + irlap_wait_min_turn_around(self, &self->qos_tx); + + /* This frame will actually be sent at the new speed */ + irlap_send_rr_frame(self, CMD_FRAME); + + /* The timer is set to half the normal timer to quickly + * detect a failure to negotiate the new connection + * parameters. IrLAP 6.11.3.2, note 3. + * Note that currently we don't process this failure + * properly, as we should do a quick disconnect. 
+ * Jean II */ + irlap_start_final_timer(self, self->final_timeout/2); + irlap_next_state(self, LAP_NRM_P); + + irlap_connect_confirm(self, skb); + break; + case RECV_DM_RSP: /* FALLTHROUGH */ + case RECV_DISC_CMD: + del_timer(&self->final_timer); + irlap_next_state(self, LAP_NDM); + + irlap_disconnect_indication(self, LAP_DISC_INDICATION); + break; + default: + IRDA_DEBUG(1, "%s(), Unknown event %d, %s\n", __func__, + event, irlap_event[event]); + + ret = -1; + break; + } + return ret; +} + +/* + * Function irlap_state_offline (self, event, skb, info) + * + * OFFLINE state, not used for now! + * + */ +static int irlap_state_offline(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + IRDA_DEBUG( 0, "%s(), Unknown event\n", __func__); + + return -1; +} + +/* + * Function irlap_state_xmit_p (self, event, skb, info) + * + * XMIT, Only the primary station has right to transmit, and we + * therefore do not expect to receive any transmissions from other + * stations. + * + */ +static int irlap_state_xmit_p(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + int ret = 0; + + switch (event) { + case SEND_I_CMD: + /* + * Only send frame if send-window > 0. + */ + if ((self->window > 0) && (!self->remote_busy)) { + int nextfit; +#ifdef CONFIG_IRDA_DYNAMIC_WINDOW + struct sk_buff *skb_next; + + /* With DYNAMIC_WINDOW, we keep the window size + * maximum, and adapt on the packets we are sending. + * At 115k, we can send only 2 packets of 2048 bytes + * in a 500 ms turnaround. Without this option, we + * would always limit the window to 2. With this + * option, if we send smaller packets, we can send + * up to 7 of them (always depending on QoS). + * Jean II */ + + /* Look at the next skb. This is safe, as we are + * the only consumer of the Tx queue (if we are not, + * we have other problems) - Jean II */ + skb_next = skb_peek(&self->txq); + + /* Check if a subsequent skb exist and would fit in + * the current window (with respect to turnaround + * time). + * This allow us to properly mark the current packet + * with the pf bit, to avoid falling back on the + * second test below, and avoid waiting the + * end of the window and sending a extra RR. + * Note : (skb_next != NULL) <=> (skb_queue_len() > 0) + * Jean II */ + nextfit = ((skb_next != NULL) && + ((skb_next->len + skb->len) <= + self->bytes_left)); + + /* + * The current packet may not fit ! Because of test + * above, this should not happen any more !!! + * Test if we have transmitted more bytes over the + * link than its possible to do with the current + * speed and turn-around-time. + */ + if((!nextfit) && (skb->len > self->bytes_left)) { + IRDA_DEBUG(0, "%s(), Not allowed to transmit" + " more bytes!\n", __func__); + /* Requeue the skb */ + skb_queue_head(&self->txq, skb_get(skb)); + /* + * We should switch state to LAP_NRM_P, but + * that is not possible since we must be sure + * that we poll the other side. Since we have + * used up our time, the poll timer should + * trigger anyway now, so we just wait for it + * DB + */ + /* + * Sorry, but that's not totally true. If + * we send 2000B packets, we may wait another + * 1000B until our turnaround expire. That's + * why we need to be proactive in avoiding + * coming here. - Jean II + */ + return -EPROTO; + } + + /* Subtract space used by this skb */ + self->bytes_left -= skb->len; +#else /* CONFIG_IRDA_DYNAMIC_WINDOW */ + /* Window has been adjusted for the max packet + * size, so much simpler... 
- Jean II */ + nextfit = !skb_queue_empty(&self->txq); +#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ + /* + * Send data with poll bit cleared only if window > 1 + * and there is more frames after this one to be sent + */ + if ((self->window > 1) && (nextfit)) { + /* More packet to send in current window */ + irlap_send_data_primary(self, skb); + irlap_next_state(self, LAP_XMIT_P); + } else { + /* Final packet of window */ + irlap_send_data_primary_poll(self, skb); + + /* + * Make sure state machine does not try to send + * any more frames + */ + ret = -EPROTO; + } +#ifdef CONFIG_IRDA_FAST_RR + /* Peer may want to reply immediately */ + self->fast_RR = FALSE; +#endif /* CONFIG_IRDA_FAST_RR */ + } else { + IRDA_DEBUG(4, "%s(), Unable to send! remote busy?\n", + __func__); + skb_queue_head(&self->txq, skb_get(skb)); + + /* + * The next ret is important, because it tells + * irlap_next_state _not_ to deliver more frames + */ + ret = -EPROTO; + } + break; + case POLL_TIMER_EXPIRED: + IRDA_DEBUG(3, "%s(), POLL_TIMER_EXPIRED <%ld>\n", + __func__, jiffies); + irlap_send_rr_frame(self, CMD_FRAME); + /* Return to NRM properly - Jean II */ + self->window = self->window_size; +#ifdef CONFIG_IRDA_DYNAMIC_WINDOW + /* Allowed to transmit a maximum number of bytes again. */ + self->bytes_left = self->line_capacity; +#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ + irlap_start_final_timer(self, self->final_timeout); + irlap_next_state(self, LAP_NRM_P); + break; + case DISCONNECT_REQUEST: + del_timer(&self->poll_timer); + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_disc_frame(self); + irlap_flush_all_queues(self); + irlap_start_final_timer(self, self->final_timeout); + self->retry_count = 0; + irlap_next_state(self, LAP_PCLOSE); + break; + case DATA_REQUEST: + /* Nothing to do, irlap_do_event() will send the packet + * when we return... 
- Jean II */ + break; + default: + IRDA_DEBUG(0, "%s(), Unknown event %s\n", + __func__, irlap_event[event]); + + ret = -EINVAL; + break; + } + return ret; +} + +/* + * Function irlap_state_pclose (event, skb, info) + * + * PCLOSE state + */ +static int irlap_state_pclose(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + int ret = 0; + + IRDA_DEBUG(1, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + switch (event) { + case RECV_UA_RSP: /* FALLTHROUGH */ + case RECV_DM_RSP: + del_timer(&self->final_timer); + + /* Set new link parameters */ + irlap_apply_default_connection_parameters(self); + + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + irlap_disconnect_indication(self, LAP_DISC_INDICATION); + break; + case FINAL_TIMER_EXPIRED: + if (self->retry_count < self->N3) { + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_disc_frame(self); + irlap_start_final_timer(self, self->final_timeout); + self->retry_count++; + /* Keep state */ + } else { + irlap_apply_default_connection_parameters(self); + + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + irlap_disconnect_indication(self, LAP_NO_RESPONSE); + } + break; + default: + IRDA_DEBUG(1, "%s(), Unknown event %d\n", __func__, event); + + ret = -1; + break; + } + return ret; +} + +/* + * Function irlap_state_nrm_p (self, event, skb, info) + * + * NRM_P (Normal Response Mode as Primary), The primary station has given + * permissions to a secondary station to transmit IrLAP resonse frames + * (by sending a frame with the P bit set). The primary station will not + * transmit any frames and is expecting to receive frames only from the + * secondary to which transmission permissions has been given. + */ +static int irlap_state_nrm_p(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + int ret = 0; + int ns_status; + int nr_status; + + switch (event) { + case RECV_I_RSP: /* Optimize for the common case */ + if (unlikely(skb->len <= LAP_ADDR_HEADER + LAP_CTRL_HEADER)) { + /* + * Input validation check: a stir4200/mcp2150 + * combination sometimes results in an empty i:rsp. + * This makes no sense; we can just ignore the frame + * and send an rr:cmd immediately. This happens before + * changing nr or ns so triggers a retransmit + */ + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_rr_frame(self, CMD_FRAME); + /* Keep state */ + break; + } + /* FIXME: must check for remote_busy below */ +#ifdef CONFIG_IRDA_FAST_RR + /* + * Reset the fast_RR so we can use the fast RR code with + * full speed the next time since peer may have more frames + * to transmitt + */ + self->fast_RR = FALSE; +#endif /* CONFIG_IRDA_FAST_RR */ + IRDA_ASSERT( info != NULL, return -1;); + + ns_status = irlap_validate_ns_received(self, info->ns); + nr_status = irlap_validate_nr_received(self, info->nr); + + /* + * Check for expected I(nformation) frame + */ + if ((ns_status == NS_EXPECTED) && (nr_status == NR_EXPECTED)) { + + /* Update Vr (next frame for us to receive) */ + self->vr = (self->vr + 1) % 8; + + /* Update Nr received, cleanup our retry queue */ + irlap_update_nr_received(self, info->nr); + + /* + * Got expected NR, so reset the + * retry_count. This is not done by IrLAP spec, + * which is strange! + */ + self->retry_count = 0; + self->ack_required = TRUE; + + /* poll bit cleared? 
*/ + if (!info->pf) { + /* Keep state, do not move this line */ + irlap_next_state(self, LAP_NRM_P); + + irlap_data_indication(self, skb, FALSE); + } else { + /* No longer waiting for pf */ + del_timer(&self->final_timer); + + irlap_wait_min_turn_around(self, &self->qos_tx); + + /* Call higher layer *before* changing state + * to give them a chance to send data in the + * next LAP frame. + * Jean II */ + irlap_data_indication(self, skb, FALSE); + + /* XMIT states are the most dangerous state + * to be in, because user requests are + * processed directly and may change state. + * On the other hand, in NDM_P, those + * requests are queued and we will process + * them when we return to irlap_do_event(). + * Jean II + */ + irlap_next_state(self, LAP_XMIT_P); + + /* This is the last frame. + * Make sure it's always called in XMIT state. + * - Jean II */ + irlap_start_poll_timer(self, self->poll_timeout); + } + break; + + } + /* Unexpected next to send (Ns) */ + if ((ns_status == NS_UNEXPECTED) && (nr_status == NR_EXPECTED)) + { + if (!info->pf) { + irlap_update_nr_received(self, info->nr); + + /* + * Wait until the last frame before doing + * anything + */ + + /* Keep state */ + irlap_next_state(self, LAP_NRM_P); + } else { + IRDA_DEBUG(4, + "%s(), missing or duplicate frame!\n", + __func__); + + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_rr_frame(self, CMD_FRAME); + + self->ack_required = FALSE; + + irlap_start_final_timer(self, self->final_timeout); + irlap_next_state(self, LAP_NRM_P); + } + break; + } + /* + * Unexpected next to receive (Nr) + */ + if ((ns_status == NS_EXPECTED) && (nr_status == NR_UNEXPECTED)) + { + if (info->pf) { + self->vr = (self->vr + 1) % 8; + + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + + /* Resend rejected frames */ + irlap_resend_rejected_frames(self, CMD_FRAME); + + self->ack_required = FALSE; + + /* Make sure we account for the time + * to transmit our frames. See comemnts + * in irlap_send_data_primary_poll(). + * Jean II */ + irlap_start_final_timer(self, 2 * self->final_timeout); + + /* Keep state, do not move this line */ + irlap_next_state(self, LAP_NRM_P); + + irlap_data_indication(self, skb, FALSE); + } else { + /* + * Do not resend frames until the last + * frame has arrived from the other + * device. This is not documented in + * IrLAP!! + */ + self->vr = (self->vr + 1) % 8; + + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + + self->ack_required = FALSE; + + /* Keep state, do not move this line!*/ + irlap_next_state(self, LAP_NRM_P); + + irlap_data_indication(self, skb, FALSE); + } + break; + } + /* + * Unexpected next to send (Ns) and next to receive (Nr) + * Not documented by IrLAP! + */ + if ((ns_status == NS_UNEXPECTED) && + (nr_status == NR_UNEXPECTED)) + { + IRDA_DEBUG(4, "%s(), unexpected nr and ns!\n", + __func__); + if (info->pf) { + /* Resend rejected frames */ + irlap_resend_rejected_frames(self, CMD_FRAME); + + /* Give peer some time to retransmit! + * But account for our own Tx. 
*/ + irlap_start_final_timer(self, 2 * self->final_timeout); + + /* Keep state, do not move this line */ + irlap_next_state(self, LAP_NRM_P); + } else { + /* Update Nr received */ + /* irlap_update_nr_received( info->nr); */ + + self->ack_required = FALSE; + } + break; + } + + /* + * Invalid NR or NS + */ + if ((nr_status == NR_INVALID) || (ns_status == NS_INVALID)) { + if (info->pf) { + del_timer(&self->final_timer); + + irlap_next_state(self, LAP_RESET_WAIT); + + irlap_disconnect_indication(self, LAP_RESET_INDICATION); + self->xmitflag = TRUE; + } else { + del_timer(&self->final_timer); + + irlap_disconnect_indication(self, LAP_RESET_INDICATION); + + self->xmitflag = FALSE; + } + break; + } + IRDA_DEBUG(1, "%s(), Not implemented!\n", __func__); + IRDA_DEBUG(1, "%s(), event=%s, ns_status=%d, nr_status=%d\n", + __func__, irlap_event[event], ns_status, nr_status); + break; + case RECV_UI_FRAME: + /* Poll bit cleared? */ + if (!info->pf) { + irlap_data_indication(self, skb, TRUE); + irlap_next_state(self, LAP_NRM_P); + } else { + del_timer(&self->final_timer); + irlap_data_indication(self, skb, TRUE); + irlap_next_state(self, LAP_XMIT_P); + IRDA_DEBUG(1, "%s: RECV_UI_FRAME: next state %s\n", __func__, irlap_state[self->state]); + irlap_start_poll_timer(self, self->poll_timeout); + } + break; + case RECV_RR_RSP: + /* + * If you get a RR, the remote isn't busy anymore, + * no matter what the NR + */ + self->remote_busy = FALSE; + + /* Stop final timer */ + del_timer(&self->final_timer); + + /* + * Nr as expected? + */ + ret = irlap_validate_nr_received(self, info->nr); + if (ret == NR_EXPECTED) { + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + + /* + * Got expected NR, so reset the retry_count. This + * is not done by the IrLAP standard , which is + * strange! DB. + */ + self->retry_count = 0; + irlap_wait_min_turn_around(self, &self->qos_tx); + + irlap_next_state(self, LAP_XMIT_P); + + /* Start poll timer */ + irlap_start_poll_timer(self, self->poll_timeout); + } else if (ret == NR_UNEXPECTED) { + IRDA_ASSERT(info != NULL, return -1;); + /* + * Unexpected nr! + */ + + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + + IRDA_DEBUG(4, "RECV_RR_FRAME: Retrans:%d, nr=%d, va=%d, " + "vs=%d, vr=%d\n", + self->retry_count, info->nr, self->va, + self->vs, self->vr); + + /* Resend rejected frames */ + irlap_resend_rejected_frames(self, CMD_FRAME); + irlap_start_final_timer(self, self->final_timeout * 2); + + irlap_next_state(self, LAP_NRM_P); + } else if (ret == NR_INVALID) { + IRDA_DEBUG(1, "%s(), Received RR with " + "invalid nr !\n", __func__); + + irlap_next_state(self, LAP_RESET_WAIT); + + irlap_disconnect_indication(self, LAP_RESET_INDICATION); + self->xmitflag = TRUE; + } + break; + case RECV_RNR_RSP: + IRDA_ASSERT(info != NULL, return -1;); + + /* Stop final timer */ + del_timer(&self->final_timer); + self->remote_busy = TRUE; + + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + irlap_next_state(self, LAP_XMIT_P); + + /* Start poll timer */ + irlap_start_poll_timer(self, self->poll_timeout); + break; + case RECV_FRMR_RSP: + del_timer(&self->final_timer); + self->xmitflag = TRUE; + irlap_next_state(self, LAP_RESET_WAIT); + irlap_reset_indication(self); + break; + case FINAL_TIMER_EXPIRED: + /* + * We are allowed to wait for additional 300 ms if + * final timer expires when we are in the middle + * of receiving a frame (page 45, IrLAP). Check that + * we only do this once for each frame. 
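+ * The add_wait flag below is that guard: it prevents granting the
+ * extra 300 ms twice in a row.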
+ */ + if (irda_device_is_receiving(self->netdev) && !self->add_wait) { + IRDA_DEBUG(1, "FINAL_TIMER_EXPIRED when receiving a " + "frame! Waiting a little bit more!\n"); + irlap_start_final_timer(self, msecs_to_jiffies(300)); + + /* + * Don't allow this to happen one more time in a row, + * or else we can get a pretty tight loop here if + * if we only receive half a frame. DB. + */ + self->add_wait = TRUE; + break; + } + self->add_wait = FALSE; + + /* N2 is the disconnect timer. Until we reach it, we retry */ + if (self->retry_count < self->N2) { + if (skb_peek(&self->wx_list) == NULL) { + /* Retry sending the pf bit to the secondary */ + IRDA_DEBUG(4, "nrm_p: resending rr"); + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_rr_frame(self, CMD_FRAME); + } else { + IRDA_DEBUG(4, "nrm_p: resend frames"); + irlap_resend_rejected_frames(self, CMD_FRAME); + } + + irlap_start_final_timer(self, self->final_timeout); + self->retry_count++; + IRDA_DEBUG(4, "irlap_state_nrm_p: FINAL_TIMER_EXPIRED:" + " retry_count=%d\n", self->retry_count); + + /* Early warning event. I'm using a pretty liberal + * interpretation of the spec and generate an event + * every time the timer is multiple of N1 (and not + * only the first time). This allow application + * to know precisely if connectivity restart... + * Jean II */ + if((self->retry_count % self->N1) == 0) + irlap_status_indication(self, + STATUS_NO_ACTIVITY); + + /* Keep state */ + } else { + irlap_apply_default_connection_parameters(self); + + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + irlap_disconnect_indication(self, LAP_NO_RESPONSE); + } + break; + case RECV_REJ_RSP: + irlap_update_nr_received(self, info->nr); + if (self->remote_busy) { + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_rr_frame(self, CMD_FRAME); + } else + irlap_resend_rejected_frames(self, CMD_FRAME); + irlap_start_final_timer(self, 2 * self->final_timeout); + break; + case RECV_SREJ_RSP: + irlap_update_nr_received(self, info->nr); + if (self->remote_busy) { + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_rr_frame(self, CMD_FRAME); + } else + irlap_resend_rejected_frame(self, CMD_FRAME); + irlap_start_final_timer(self, 2 * self->final_timeout); + break; + case RECV_RD_RSP: + IRDA_DEBUG(1, "%s(), RECV_RD_RSP\n", __func__); + + irlap_flush_all_queues(self); + irlap_next_state(self, LAP_XMIT_P); + /* Call back the LAP state machine to do a proper disconnect */ + irlap_disconnect_request(self); + break; + default: + IRDA_DEBUG(1, "%s(), Unknown event %s\n", + __func__, irlap_event[event]); + + ret = -1; + break; + } + return ret; +} + +/* + * Function irlap_state_reset_wait (event, skb, info) + * + * We have informed the service user of a reset condition, and is + * awaiting reset of disconnect request. 
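+ * (i.e. we expect either a RESET_REQUEST or a DISCONNECT_REQUEST
+ * from the upper layer, handled in the switch below)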
+ * + */ +static int irlap_state_reset_wait(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + int ret = 0; + + IRDA_DEBUG(3, "%s(), event = %s\n", __func__, irlap_event[event]); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + switch (event) { + case RESET_REQUEST: + if (self->xmitflag) { + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_snrm_frame(self, NULL); + irlap_start_final_timer(self, self->final_timeout); + irlap_next_state(self, LAP_RESET); + } else { + irlap_start_final_timer(self, self->final_timeout); + irlap_next_state(self, LAP_RESET); + } + break; + case DISCONNECT_REQUEST: + irlap_wait_min_turn_around( self, &self->qos_tx); + irlap_send_disc_frame( self); + irlap_flush_all_queues( self); + irlap_start_final_timer( self, self->final_timeout); + self->retry_count = 0; + irlap_next_state( self, LAP_PCLOSE); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %s\n", __func__, + irlap_event[event]); + + ret = -1; + break; + } + return ret; +} + +/* + * Function irlap_state_reset (self, event, skb, info) + * + * We have sent a SNRM reset command to the peer layer, and is awaiting + * reply. + * + */ +static int irlap_state_reset(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + int ret = 0; + + IRDA_DEBUG(3, "%s(), event = %s\n", __func__, irlap_event[event]); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + switch (event) { + case RECV_DISC_CMD: + del_timer(&self->final_timer); + + irlap_apply_default_connection_parameters(self); + + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + irlap_disconnect_indication(self, LAP_NO_RESPONSE); + + break; + case RECV_UA_RSP: + del_timer(&self->final_timer); + + /* Initiate connection state */ + irlap_initiate_connection_state(self); + + irlap_reset_confirm(); + + self->remote_busy = FALSE; + + irlap_next_state(self, LAP_XMIT_P); + + irlap_start_poll_timer(self, self->poll_timeout); + + break; + case FINAL_TIMER_EXPIRED: + if (self->retry_count < 3) { + irlap_wait_min_turn_around(self, &self->qos_tx); + + IRDA_ASSERT(self->netdev != NULL, return -1;); + irlap_send_snrm_frame(self, self->qos_dev); + + self->retry_count++; /* Experimental!! 
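+ * (hard-coded limit of three SNRM retries; the N3 check in the
+ * else-if branch below handles the final give-up)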
*/ + + irlap_start_final_timer(self, self->final_timeout); + irlap_next_state(self, LAP_RESET); + } else if (self->retry_count >= self->N3) { + irlap_apply_default_connection_parameters(self); + + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + irlap_disconnect_indication(self, LAP_NO_RESPONSE); + } + break; + case RECV_SNRM_CMD: + /* + * SNRM frame is not allowed to contain an I-field in this + * state + */ + if (!info) { + IRDA_DEBUG(3, "%s(), RECV_SNRM_CMD\n", __func__); + irlap_initiate_connection_state(self); + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_ua_response_frame(self, &self->qos_rx); + irlap_reset_confirm(); + irlap_start_wd_timer(self, self->wd_timeout); + irlap_next_state(self, LAP_NDM); + } else { + IRDA_DEBUG(0, + "%s(), SNRM frame contained an I field!\n", + __func__); + } + break; + default: + IRDA_DEBUG(1, "%s(), Unknown event %s\n", + __func__, irlap_event[event]); + + ret = -1; + break; + } + return ret; +} + +/* + * Function irlap_state_xmit_s (event, skb, info) + * + * XMIT_S, The secondary station has been given the right to transmit, + * and we therefore do not expect to receive any transmissions from other + * stations. + */ +static int irlap_state_xmit_s(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + int ret = 0; + + IRDA_DEBUG(4, "%s(), event=%s\n", __func__, irlap_event[event]); + + IRDA_ASSERT(self != NULL, return -ENODEV;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -EBADR;); + + switch (event) { + case SEND_I_CMD: + /* + * Send frame only if send window > 0 + */ + if ((self->window > 0) && (!self->remote_busy)) { + int nextfit; +#ifdef CONFIG_IRDA_DYNAMIC_WINDOW + struct sk_buff *skb_next; + + /* + * Same deal as in irlap_state_xmit_p(), so see + * the comments at that point. + * We are the secondary, so there are only subtle + * differences. - Jean II + */ + + /* Check if a subsequent skb exist and would fit in + * the current window (with respect to turnaround + * time). - Jean II */ + skb_next = skb_peek(&self->txq); + nextfit = ((skb_next != NULL) && + ((skb_next->len + skb->len) <= + self->bytes_left)); + + /* + * Test if we have transmitted more bytes over the + * link than its possible to do with the current + * speed and turn-around-time. + */ + if((!nextfit) && (skb->len > self->bytes_left)) { + IRDA_DEBUG(0, "%s(), Not allowed to transmit" + " more bytes!\n", __func__); + /* Requeue the skb */ + skb_queue_head(&self->txq, skb_get(skb)); + + /* + * Switch to NRM_S, this is only possible + * when we are in secondary mode, since we + * must be sure that we don't miss any RR + * frames + */ + self->window = self->window_size; + self->bytes_left = self->line_capacity; + irlap_start_wd_timer(self, self->wd_timeout); + + irlap_next_state(self, LAP_NRM_S); + /* Slight difference with primary : + * here we would wait for the other side to + * expire the turnaround. - Jean II */ + + return -EPROTO; /* Try again later */ + } + /* Subtract space used by this skb */ + self->bytes_left -= skb->len; +#else /* CONFIG_IRDA_DYNAMIC_WINDOW */ + /* Window has been adjusted for the max packet + * size, so much simpler... 
- Jean II */ + nextfit = !skb_queue_empty(&self->txq); +#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ + /* + * Send data with final bit cleared only if window > 1 + * and there is more frames to be sent + */ + if ((self->window > 1) && (nextfit)) { + irlap_send_data_secondary(self, skb); + irlap_next_state(self, LAP_XMIT_S); + } else { + irlap_send_data_secondary_final(self, skb); + irlap_next_state(self, LAP_NRM_S); + + /* + * Make sure state machine does not try to send + * any more frames + */ + ret = -EPROTO; + } + } else { + IRDA_DEBUG(2, "%s(), Unable to send!\n", __func__); + skb_queue_head(&self->txq, skb_get(skb)); + ret = -EPROTO; + } + break; + case DISCONNECT_REQUEST: + irlap_send_rd_frame(self); + irlap_flush_all_queues(self); + irlap_start_wd_timer(self, self->wd_timeout); + irlap_next_state(self, LAP_SCLOSE); + break; + case DATA_REQUEST: + /* Nothing to do, irlap_do_event() will send the packet + * when we return... - Jean II */ + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %s\n", __func__, + irlap_event[event]); + + ret = -EINVAL; + break; + } + return ret; +} + +/* + * Function irlap_state_nrm_s (event, skb, info) + * + * NRM_S (Normal Response Mode as Secondary) state, in this state we are + * expecting to receive frames from the primary station + * + */ +static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + int ns_status; + int nr_status; + int ret = 0; + + IRDA_DEBUG(4, "%s(), event=%s\n", __func__, irlap_event[ event]); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + switch (event) { + case RECV_I_CMD: /* Optimize for the common case */ + /* FIXME: must check for remote_busy below */ + IRDA_DEBUG(4, "%s(), event=%s nr=%d, vs=%d, ns=%d, " + "vr=%d, pf=%d\n", __func__, + irlap_event[event], info->nr, + self->vs, info->ns, self->vr, info->pf); + + self->retry_count = 0; + + ns_status = irlap_validate_ns_received(self, info->ns); + nr_status = irlap_validate_nr_received(self, info->nr); + /* + * Check for expected I(nformation) frame + */ + if ((ns_status == NS_EXPECTED) && (nr_status == NR_EXPECTED)) { + + /* Update Vr (next frame for us to receive) */ + self->vr = (self->vr + 1) % 8; + + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + + /* + * poll bit cleared? + */ + if (!info->pf) { + + self->ack_required = TRUE; + + /* + * Starting WD-timer here is optional, but + * not recommended. Note 6 IrLAP p. 83 + */ +#if 0 + irda_start_timer(WD_TIMER, self->wd_timeout); +#endif + /* Keep state, do not move this line */ + irlap_next_state(self, LAP_NRM_S); + + irlap_data_indication(self, skb, FALSE); + break; + } else { + /* + * We should wait before sending RR, and + * also before changing to XMIT_S + * state. (note 1, IrLAP p. 82) + */ + irlap_wait_min_turn_around(self, &self->qos_tx); + + /* + * Give higher layers a chance to + * immediately reply with some data before + * we decide if we should send a RR frame + * or not + */ + irlap_data_indication(self, skb, FALSE); + + /* Any pending data requests? 
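+ * If the upper layer queued data in txq during the indication above
+ * and the window allows it, take the token and go to XMIT_S;
+ * otherwise answer with an RR and stay in NRM_S.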
*/ + if (!skb_queue_empty(&self->txq) && + (self->window > 0)) + { + self->ack_required = TRUE; + + del_timer(&self->wd_timer); + + irlap_next_state(self, LAP_XMIT_S); + } else { + irlap_send_rr_frame(self, RSP_FRAME); + irlap_start_wd_timer(self, + self->wd_timeout); + + /* Keep the state */ + irlap_next_state(self, LAP_NRM_S); + } + break; + } + } + /* + * Check for Unexpected next to send (Ns) + */ + if ((ns_status == NS_UNEXPECTED) && (nr_status == NR_EXPECTED)) + { + /* Unexpected next to send, with final bit cleared */ + if (!info->pf) { + irlap_update_nr_received(self, info->nr); + + irlap_start_wd_timer(self, self->wd_timeout); + } else { + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_rr_frame(self, RSP_FRAME); + + irlap_start_wd_timer(self, self->wd_timeout); + } + break; + } + + /* + * Unexpected Next to Receive(NR) ? + */ + if ((ns_status == NS_EXPECTED) && (nr_status == NR_UNEXPECTED)) + { + if (info->pf) { + IRDA_DEBUG(4, "RECV_I_RSP: frame(s) lost\n"); + + self->vr = (self->vr + 1) % 8; + + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + + /* Resend rejected frames */ + irlap_resend_rejected_frames(self, RSP_FRAME); + + /* Keep state, do not move this line */ + irlap_next_state(self, LAP_NRM_S); + + irlap_data_indication(self, skb, FALSE); + irlap_start_wd_timer(self, self->wd_timeout); + break; + } + /* + * This is not documented in IrLAP!! Unexpected NR + * with poll bit cleared + */ + if (!info->pf) { + self->vr = (self->vr + 1) % 8; + + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + + /* Keep state, do not move this line */ + irlap_next_state(self, LAP_NRM_S); + + irlap_data_indication(self, skb, FALSE); + irlap_start_wd_timer(self, self->wd_timeout); + } + break; + } + + if (ret == NR_INVALID) { + IRDA_DEBUG(0, "NRM_S, NR_INVALID not implemented!\n"); + } + if (ret == NS_INVALID) { + IRDA_DEBUG(0, "NRM_S, NS_INVALID not implemented!\n"); + } + break; + case RECV_UI_FRAME: + /* + * poll bit cleared? + */ + if (!info->pf) { + irlap_data_indication(self, skb, TRUE); + irlap_next_state(self, LAP_NRM_S); /* Keep state */ + } else { + /* + * Any pending data requests? + */ + if (!skb_queue_empty(&self->txq) && + (self->window > 0) && !self->remote_busy) + { + irlap_data_indication(self, skb, TRUE); + + del_timer(&self->wd_timer); + + irlap_next_state(self, LAP_XMIT_S); + } else { + irlap_data_indication(self, skb, TRUE); + + irlap_wait_min_turn_around(self, &self->qos_tx); + + irlap_send_rr_frame(self, RSP_FRAME); + self->ack_required = FALSE; + + irlap_start_wd_timer(self, self->wd_timeout); + + /* Keep the state */ + irlap_next_state(self, LAP_NRM_S); + } + } + break; + case RECV_RR_CMD: + self->retry_count = 0; + + /* + * Nr as expected? 
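+ * An RR command from the primary also hands us the token: with a
+ * valid nr and data queued we move to XMIT_S, otherwise we answer
+ * (or resend rejected frames) and rearm the wd-timer.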
+ */ + nr_status = irlap_validate_nr_received(self, info->nr); + if (nr_status == NR_EXPECTED) { + if (!skb_queue_empty(&self->txq) && + (self->window > 0)) { + self->remote_busy = FALSE; + + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + del_timer(&self->wd_timer); + + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_next_state(self, LAP_XMIT_S); + } else { + self->remote_busy = FALSE; + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_start_wd_timer(self, self->wd_timeout); + + /* Note : if the link is idle (this case), + * we never go in XMIT_S, so we never get a + * chance to process any DISCONNECT_REQUEST. + * Do it now ! - Jean II */ + if (self->disconnect_pending) { + /* Disconnect */ + irlap_send_rd_frame(self); + irlap_flush_all_queues(self); + + irlap_next_state(self, LAP_SCLOSE); + } else { + /* Just send back pf bit */ + irlap_send_rr_frame(self, RSP_FRAME); + + irlap_next_state(self, LAP_NRM_S); + } + } + } else if (nr_status == NR_UNEXPECTED) { + self->remote_busy = FALSE; + irlap_update_nr_received(self, info->nr); + irlap_resend_rejected_frames(self, RSP_FRAME); + + irlap_start_wd_timer(self, self->wd_timeout); + + /* Keep state */ + irlap_next_state(self, LAP_NRM_S); + } else { + IRDA_DEBUG(1, "%s(), invalid nr not implemented!\n", + __func__); + } + break; + case RECV_SNRM_CMD: + /* SNRM frame is not allowed to contain an I-field */ + if (!info) { + del_timer(&self->wd_timer); + IRDA_DEBUG(1, "%s(), received SNRM cmd\n", __func__); + irlap_next_state(self, LAP_RESET_CHECK); + + irlap_reset_indication(self); + } else { + IRDA_DEBUG(0, + "%s(), SNRM frame contained an I-field!\n", + __func__); + + } + break; + case RECV_REJ_CMD: + irlap_update_nr_received(self, info->nr); + if (self->remote_busy) { + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_rr_frame(self, RSP_FRAME); + } else + irlap_resend_rejected_frames(self, RSP_FRAME); + irlap_start_wd_timer(self, self->wd_timeout); + break; + case RECV_SREJ_CMD: + irlap_update_nr_received(self, info->nr); + if (self->remote_busy) { + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_rr_frame(self, RSP_FRAME); + } else + irlap_resend_rejected_frame(self, RSP_FRAME); + irlap_start_wd_timer(self, self->wd_timeout); + break; + case WD_TIMER_EXPIRED: + /* + * Wait until retry_count * n matches negotiated threshold/ + * disconnect time (note 2 in IrLAP p. 82) + * + * Similar to irlap_state_nrm_p() -> FINAL_TIMER_EXPIRED + * Note : self->wd_timeout = (self->final_timeout * 2), + * which explain why we use (self->N2 / 2) here !!! 
+ * Jean II + */ + IRDA_DEBUG(1, "%s(), retry_count = %d\n", __func__, + self->retry_count); + + if (self->retry_count < (self->N2 / 2)) { + /* No retry, just wait for primary */ + irlap_start_wd_timer(self, self->wd_timeout); + self->retry_count++; + + if((self->retry_count % (self->N1 / 2)) == 0) + irlap_status_indication(self, + STATUS_NO_ACTIVITY); + } else { + irlap_apply_default_connection_parameters(self); + + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + irlap_disconnect_indication(self, LAP_NO_RESPONSE); + } + break; + case RECV_DISC_CMD: + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + /* Send disconnect response */ + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_ua_response_frame(self, NULL); + + del_timer(&self->wd_timer); + irlap_flush_all_queues(self); + /* Set default link parameters */ + irlap_apply_default_connection_parameters(self); + + irlap_disconnect_indication(self, LAP_DISC_INDICATION); + break; + case RECV_DISCOVERY_XID_CMD: + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_rr_frame(self, RSP_FRAME); + self->ack_required = TRUE; + irlap_start_wd_timer(self, self->wd_timeout); + irlap_next_state(self, LAP_NRM_S); + + break; + case RECV_TEST_CMD: + /* Remove test frame header (only LAP header in NRM) */ + skb_pull(skb, LAP_ADDR_HEADER + LAP_CTRL_HEADER); + + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_start_wd_timer(self, self->wd_timeout); + + /* Send response (info will be copied) */ + irlap_send_test_frame(self, self->caddr, info->daddr, skb); + break; + default: + IRDA_DEBUG(1, "%s(), Unknown event %d, (%s)\n", __func__, + event, irlap_event[event]); + + ret = -EINVAL; + break; + } + return ret; +} + +/* + * Function irlap_state_sclose (self, event, skb, info) + */ +static int irlap_state_sclose(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + IRDA_DEBUG(1, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return -ENODEV;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -EBADR;); + + switch (event) { + case RECV_DISC_CMD: + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + /* Send disconnect response */ + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_ua_response_frame(self, NULL); + + del_timer(&self->wd_timer); + /* Set default link parameters */ + irlap_apply_default_connection_parameters(self); + + irlap_disconnect_indication(self, LAP_DISC_INDICATION); + break; + case RECV_DM_RSP: + /* IrLAP-1.1 p.82: in SCLOSE, S and I type RSP frames + * shall take us down into default NDM state, like DM_RSP + */ + case RECV_RR_RSP: + case RECV_RNR_RSP: + case RECV_REJ_RSP: + case RECV_SREJ_RSP: + case RECV_I_RSP: + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + del_timer(&self->wd_timer); + irlap_apply_default_connection_parameters(self); + + irlap_disconnect_indication(self, LAP_DISC_INDICATION); + break; + case WD_TIMER_EXPIRED: + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + irlap_apply_default_connection_parameters(self); + + irlap_disconnect_indication(self, LAP_DISC_INDICATION); + break; + default: + /* IrLAP-1.1 p.82: in SCLOSE, basically any received frame + * with pf=1 shall restart the wd-timer and resend the rd:rsp + */ + if (info != NULL && info->pf) { + del_timer(&self->wd_timer); + irlap_wait_min_turn_around(self, &self->qos_tx); + 
irlap_send_rd_frame(self); + irlap_start_wd_timer(self, self->wd_timeout); + break; /* stay in SCLOSE */ + } + + IRDA_DEBUG(1, "%s(), Unknown event %d, (%s)\n", __func__, + event, irlap_event[event]); + + break; + } + + return -1; +} + +static int irlap_state_reset_check( struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, + struct irlap_info *info) +{ + int ret = 0; + + IRDA_DEBUG(1, "%s(), event=%s\n", __func__, irlap_event[event]); + + IRDA_ASSERT(self != NULL, return -ENODEV;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -EBADR;); + + switch (event) { + case RESET_RESPONSE: + irlap_send_ua_response_frame(self, &self->qos_rx); + irlap_initiate_connection_state(self); + irlap_start_wd_timer(self, WD_TIMEOUT); + irlap_flush_all_queues(self); + + irlap_next_state(self, LAP_NRM_S); + break; + case DISCONNECT_REQUEST: + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_rd_frame(self); + irlap_start_wd_timer(self, WD_TIMEOUT); + irlap_next_state(self, LAP_SCLOSE); + break; + default: + IRDA_DEBUG(1, "%s(), Unknown event %d, (%s)\n", __func__, + event, irlap_event[event]); + + ret = -EINVAL; + break; + } + return ret; +} diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c new file mode 100644 index 00000000..8c004161 --- /dev/null +++ b/net/irda/irlap_frame.c @@ -0,0 +1,1434 @@ +/********************************************************************* + * + * Filename: irlap_frame.c + * Version: 1.0 + * Description: Build and transmit IrLAP frames + * Status: Stable + * Author: Dag Brattli + * Created at: Tue Aug 19 10:27:26 1997 + * Modified at: Wed Jan 5 08:59:04 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-2000 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +static void irlap_send_i_frame(struct irlap_cb *self, struct sk_buff *skb, + int command); + +/* + * Function irlap_insert_info (self, skb) + * + * Insert minimum turnaround time and speed information into the skb. We + * need to do this since it's per packet relevant information. Safe to + * have this function inlined since it's only called from one place + */ +static inline void irlap_insert_info(struct irlap_cb *self, + struct sk_buff *skb) +{ + struct irda_skb_cb *cb = (struct irda_skb_cb *) skb->cb; + + /* + * Insert MTT (min. 
turn time) and speed into skb, so that the + * device driver knows which settings to use + */ + cb->magic = LAP_MAGIC; + cb->mtt = self->mtt_required; + cb->next_speed = self->speed; + + /* Reset */ + self->mtt_required = 0; + + /* + * Delay equals negotiated BOFs count, plus the number of BOFs to + * force the negotiated minimum turnaround time + */ + cb->xbofs = self->bofs_count; + cb->next_xbofs = self->next_bofs; + cb->xbofs_delay = self->xbofs_delay; + + /* Reset XBOF's delay (used only for getting min turn time) */ + self->xbofs_delay = 0; + /* Put the correct xbofs value for the next packet */ + self->bofs_count = self->next_bofs; +} + +/* + * Function irlap_queue_xmit (self, skb) + * + * A little wrapper for dev_queue_xmit, so we can insert some common + * code into it. + */ +void irlap_queue_xmit(struct irlap_cb *self, struct sk_buff *skb) +{ + /* Some common init stuff */ + skb->dev = self->netdev; + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb->protocol = htons(ETH_P_IRDA); + skb->priority = TC_PRIO_BESTEFFORT; + + irlap_insert_info(self, skb); + + if (unlikely(self->mode & IRDA_MODE_MONITOR)) { + IRDA_DEBUG(3, "%s(): %s is in monitor mode\n", __func__, + self->netdev->name); + dev_kfree_skb(skb); + return; + } + + dev_queue_xmit(skb); +} + +/* + * Function irlap_send_snrm_cmd (void) + * + * Transmits a connect SNRM command frame + */ +void irlap_send_snrm_frame(struct irlap_cb *self, struct qos_info *qos) +{ + struct sk_buff *tx_skb; + struct snrm_frame *frame; + int ret; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* Allocate frame */ + tx_skb = alloc_skb(sizeof(struct snrm_frame) + + IRLAP_NEGOCIATION_PARAMS_LEN, + GFP_ATOMIC); + if (!tx_skb) + return; + + frame = (struct snrm_frame *) skb_put(tx_skb, 2); + + /* Insert connection address field */ + if (qos) + frame->caddr = CMD_FRAME | CBROADCAST; + else + frame->caddr = CMD_FRAME | self->caddr; + + /* Insert control field */ + frame->control = SNRM_CMD | PF_BIT; + + /* + * If we are establishing a connection then insert QoS parameters + */ + if (qos) { + skb_put(tx_skb, 9); /* 25 left */ + frame->saddr = cpu_to_le32(self->saddr); + frame->daddr = cpu_to_le32(self->daddr); + + frame->ncaddr = self->caddr; + + ret = irlap_insert_qos_negotiation_params(self, tx_skb); + if (ret < 0) { + dev_kfree_skb(tx_skb); + return; + } + } + irlap_queue_xmit(self, tx_skb); +} + +/* + * Function irlap_recv_snrm_cmd (skb, info) + * + * Received SNRM (Set Normal Response Mode) command frame + * + */ +static void irlap_recv_snrm_cmd(struct irlap_cb *self, struct sk_buff *skb, + struct irlap_info *info) +{ + struct snrm_frame *frame; + + if (pskb_may_pull(skb,sizeof(struct snrm_frame))) { + frame = (struct snrm_frame *) skb->data; + + /* Copy the new connection address ignoring the C/R bit */ + info->caddr = frame->ncaddr & 0xFE; + + /* Check if the new connection address is valid */ + if ((info->caddr == 0x00) || (info->caddr == 0xfe)) { + IRDA_DEBUG(3, "%s(), invalid connection address!\n", + __func__); + return; + } + + /* Copy peer device address */ + info->daddr = le32_to_cpu(frame->saddr); + info->saddr = le32_to_cpu(frame->daddr); + + /* Only accept if addressed directly to us */ + if (info->saddr != self->saddr) { + IRDA_DEBUG(2, "%s(), not addressed to us!\n", + __func__); + return; + } + irlap_do_event(self, RECV_SNRM_CMD, skb, info); + } else { + /* Signal that this SNRM frame does not contain and I-field */ + irlap_do_event(self, 
RECV_SNRM_CMD, skb, NULL); + } +} + +/* + * Function irlap_send_ua_response_frame (qos) + * + * Send UA (Unnumbered Acknowledgement) frame + * + */ +void irlap_send_ua_response_frame(struct irlap_cb *self, struct qos_info *qos) +{ + struct sk_buff *tx_skb; + struct ua_frame *frame; + int ret; + + IRDA_DEBUG(2, "%s() <%ld>\n", __func__, jiffies); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* Allocate frame */ + tx_skb = alloc_skb(sizeof(struct ua_frame) + + IRLAP_NEGOCIATION_PARAMS_LEN, + GFP_ATOMIC); + if (!tx_skb) + return; + + frame = (struct ua_frame *) skb_put(tx_skb, 10); + + /* Build UA response */ + frame->caddr = self->caddr; + frame->control = UA_RSP | PF_BIT; + + frame->saddr = cpu_to_le32(self->saddr); + frame->daddr = cpu_to_le32(self->daddr); + + /* Should we send QoS negotiation parameters? */ + if (qos) { + ret = irlap_insert_qos_negotiation_params(self, tx_skb); + if (ret < 0) { + dev_kfree_skb(tx_skb); + return; + } + } + + irlap_queue_xmit(self, tx_skb); +} + + +/* + * Function irlap_send_dm_frame (void) + * + * Send disconnected mode (DM) frame + * + */ +void irlap_send_dm_frame( struct irlap_cb *self) +{ + struct sk_buff *tx_skb = NULL; + struct dm_frame *frame; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + tx_skb = alloc_skb(sizeof(struct dm_frame), GFP_ATOMIC); + if (!tx_skb) + return; + + frame = (struct dm_frame *)skb_put(tx_skb, 2); + + if (self->state == LAP_NDM) + frame->caddr = CBROADCAST; + else + frame->caddr = self->caddr; + + frame->control = DM_RSP | PF_BIT; + + irlap_queue_xmit(self, tx_skb); +} + +/* + * Function irlap_send_disc_frame (void) + * + * Send disconnect (DISC) frame + * + */ +void irlap_send_disc_frame(struct irlap_cb *self) +{ + struct sk_buff *tx_skb = NULL; + struct disc_frame *frame; + + IRDA_DEBUG(3, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + tx_skb = alloc_skb(sizeof(struct disc_frame), GFP_ATOMIC); + if (!tx_skb) + return; + + frame = (struct disc_frame *)skb_put(tx_skb, 2); + + frame->caddr = self->caddr | CMD_FRAME; + frame->control = DISC_CMD | PF_BIT; + + irlap_queue_xmit(self, tx_skb); +} + +/* + * Function irlap_send_discovery_xid_frame (S, s, command) + * + * Build and transmit a XID (eXchange station IDentifier) discovery + * frame. 
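+ * S is the total number of discovery slots (1, 6, 8 or 16, encoded
+ * in the low two bits of the flags field), s is the slot number
+ * being announced, and s == 0xff denotes the final slot.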
+ */ +void irlap_send_discovery_xid_frame(struct irlap_cb *self, int S, __u8 s, + __u8 command, discovery_t *discovery) +{ + struct sk_buff *tx_skb = NULL; + struct xid_frame *frame; + __u32 bcast = BROADCAST; + __u8 *info; + + IRDA_DEBUG(4, "%s(), s=%d, S=%d, command=%d\n", __func__, + s, S, command); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + IRDA_ASSERT(discovery != NULL, return;); + + tx_skb = alloc_skb(sizeof(struct xid_frame) + IRLAP_DISCOVERY_INFO_LEN, + GFP_ATOMIC); + if (!tx_skb) + return; + + skb_put(tx_skb, 14); + frame = (struct xid_frame *) tx_skb->data; + + if (command) { + frame->caddr = CBROADCAST | CMD_FRAME; + frame->control = XID_CMD | PF_BIT; + } else { + frame->caddr = CBROADCAST; + frame->control = XID_RSP | PF_BIT; + } + frame->ident = XID_FORMAT; + + frame->saddr = cpu_to_le32(self->saddr); + + if (command) + frame->daddr = cpu_to_le32(bcast); + else + frame->daddr = cpu_to_le32(discovery->data.daddr); + + switch (S) { + case 1: + frame->flags = 0x00; + break; + case 6: + frame->flags = 0x01; + break; + case 8: + frame->flags = 0x02; + break; + case 16: + frame->flags = 0x03; + break; + default: + frame->flags = 0x02; + break; + } + + frame->slotnr = s; + frame->version = 0x00; + + /* + * Provide info for final slot only in commands, and for all + * responses. Send the second byte of the hint only if the + * EXTENSION bit is set in the first byte. + */ + if (!command || (frame->slotnr == 0xff)) { + int len; + + if (discovery->data.hints[0] & HINT_EXTENSION) { + info = skb_put(tx_skb, 2); + info[0] = discovery->data.hints[0]; + info[1] = discovery->data.hints[1]; + } else { + info = skb_put(tx_skb, 1); + info[0] = discovery->data.hints[0]; + } + info = skb_put(tx_skb, 1); + info[0] = discovery->data.charset; + + len = IRDA_MIN(discovery->name_len, skb_tailroom(tx_skb)); + info = skb_put(tx_skb, len); + memcpy(info, discovery->data.info, len); + } + irlap_queue_xmit(self, tx_skb); +} + +/* + * Function irlap_recv_discovery_xid_rsp (skb, info) + * + * Received a XID discovery response + * + */ +static void irlap_recv_discovery_xid_rsp(struct irlap_cb *self, + struct sk_buff *skb, + struct irlap_info *info) +{ + struct xid_frame *xid; + discovery_t *discovery = NULL; + __u8 *discovery_info; + char *text; + + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + if (!pskb_may_pull(skb, sizeof(struct xid_frame))) { + IRDA_ERROR("%s: frame too short!\n", __func__); + return; + } + + xid = (struct xid_frame *) skb->data; + + info->daddr = le32_to_cpu(xid->saddr); + info->saddr = le32_to_cpu(xid->daddr); + + /* Make sure frame is addressed to us */ + if ((info->saddr != self->saddr) && (info->saddr != BROADCAST)) { + IRDA_DEBUG(0, "%s(), frame is not addressed to us!\n", + __func__); + return; + } + + if ((discovery = kzalloc(sizeof(discovery_t), GFP_ATOMIC)) == NULL) { + IRDA_WARNING("%s: kmalloc failed!\n", __func__); + return; + } + + discovery->data.daddr = info->daddr; + discovery->data.saddr = self->saddr; + discovery->timestamp = jiffies; + + IRDA_DEBUG(4, "%s(), daddr=%08x\n", __func__, + discovery->data.daddr); + + discovery_info = skb_pull(skb, sizeof(struct xid_frame)); + + /* Get info returned from peer */ + discovery->data.hints[0] = discovery_info[0]; + if (discovery_info[0] & HINT_EXTENSION) { + IRDA_DEBUG(4, "EXTENSION\n"); + discovery->data.hints[1] = discovery_info[1]; + discovery->data.charset = discovery_info[2]; + text = (char *) 
&discovery_info[3]; + } else { + discovery->data.hints[1] = 0; + discovery->data.charset = discovery_info[1]; + text = (char *) &discovery_info[2]; + } + /* + * Terminate info string, should be safe since this is where the + * FCS bytes resides. + */ + skb->data[skb->len] = '\0'; + strncpy(discovery->data.info, text, NICKNAME_MAX_LEN); + discovery->name_len = strlen(discovery->data.info); + + info->discovery = discovery; + + irlap_do_event(self, RECV_DISCOVERY_XID_RSP, skb, info); +} + +/* + * Function irlap_recv_discovery_xid_cmd (skb, info) + * + * Received a XID discovery command + * + */ +static void irlap_recv_discovery_xid_cmd(struct irlap_cb *self, + struct sk_buff *skb, + struct irlap_info *info) +{ + struct xid_frame *xid; + discovery_t *discovery = NULL; + __u8 *discovery_info; + char *text; + + if (!pskb_may_pull(skb, sizeof(struct xid_frame))) { + IRDA_ERROR("%s: frame too short!\n", __func__); + return; + } + + xid = (struct xid_frame *) skb->data; + + info->daddr = le32_to_cpu(xid->saddr); + info->saddr = le32_to_cpu(xid->daddr); + + /* Make sure frame is addressed to us */ + if ((info->saddr != self->saddr) && (info->saddr != BROADCAST)) { + IRDA_DEBUG(0, "%s(), frame is not addressed to us!\n", + __func__); + return; + } + + switch (xid->flags & 0x03) { + case 0x00: + info->S = 1; + break; + case 0x01: + info->S = 6; + break; + case 0x02: + info->S = 8; + break; + case 0x03: + info->S = 16; + break; + default: + /* Error!! */ + return; + } + info->s = xid->slotnr; + + discovery_info = skb_pull(skb, sizeof(struct xid_frame)); + + /* + * Check if last frame + */ + if (info->s == 0xff) { + /* Check if things are sane at this point... */ + if((discovery_info == NULL) || + !pskb_may_pull(skb, 3)) { + IRDA_ERROR("%s: discovery frame too short!\n", + __func__); + return; + } + + /* + * We now have some discovery info to deliver! + */ + discovery = kmalloc(sizeof(discovery_t), GFP_ATOMIC); + if (!discovery) { + IRDA_WARNING("%s: unable to malloc!\n", __func__); + return; + } + + discovery->data.daddr = info->daddr; + discovery->data.saddr = self->saddr; + discovery->timestamp = jiffies; + + discovery->data.hints[0] = discovery_info[0]; + if (discovery_info[0] & HINT_EXTENSION) { + discovery->data.hints[1] = discovery_info[1]; + discovery->data.charset = discovery_info[2]; + text = (char *) &discovery_info[3]; + } else { + discovery->data.hints[1] = 0; + discovery->data.charset = discovery_info[1]; + text = (char *) &discovery_info[2]; + } + /* + * Terminate string, should be safe since this is where the + * FCS bytes resides. + */ + skb->data[skb->len] = '\0'; + strncpy(discovery->data.info, text, NICKNAME_MAX_LEN); + discovery->name_len = strlen(discovery->data.info); + + info->discovery = discovery; + } else + info->discovery = NULL; + + irlap_do_event(self, RECV_DISCOVERY_XID_CMD, skb, info); +} + +/* + * Function irlap_send_rr_frame (self, command) + * + * Build and transmit RR (Receive Ready) frame. Notice that it is currently + * only possible to send RR frames with the poll bit set. + */ +void irlap_send_rr_frame(struct irlap_cb *self, int command) +{ + struct sk_buff *tx_skb; + struct rr_frame *frame; + + tx_skb = alloc_skb(sizeof(struct rr_frame), GFP_ATOMIC); + if (!tx_skb) + return; + + frame = (struct rr_frame *)skb_put(tx_skb, 2); + + frame->caddr = self->caddr; + frame->caddr |= (command) ? CMD_FRAME : 0; + + frame->control = RR | PF_BIT | (self->vr << 5); + + irlap_queue_xmit(self, tx_skb); +} + +/* + * Function irlap_send_rd_frame (self) + * + * Request disconnect. 
Used by a secondary station to request the + * disconnection of the link. + */ +void irlap_send_rd_frame(struct irlap_cb *self) +{ + struct sk_buff *tx_skb; + struct rd_frame *frame; + + tx_skb = alloc_skb(sizeof(struct rd_frame), GFP_ATOMIC); + if (!tx_skb) + return; + + frame = (struct rd_frame *)skb_put(tx_skb, 2); + + frame->caddr = self->caddr; + frame->caddr = RD_RSP | PF_BIT; + + irlap_queue_xmit(self, tx_skb); +} + +/* + * Function irlap_recv_rr_frame (skb, info) + * + * Received RR (Receive Ready) frame from peer station, no harm in + * making it inline since its called only from one single place + * (irlap_driver_rcv). + */ +static inline void irlap_recv_rr_frame(struct irlap_cb *self, + struct sk_buff *skb, + struct irlap_info *info, int command) +{ + info->nr = skb->data[1] >> 5; + + /* Check if this is a command or a response frame */ + if (command) + irlap_do_event(self, RECV_RR_CMD, skb, info); + else + irlap_do_event(self, RECV_RR_RSP, skb, info); +} + +/* + * Function irlap_recv_rnr_frame (self, skb, info) + * + * Received RNR (Receive Not Ready) frame from peer station + * + */ +static void irlap_recv_rnr_frame(struct irlap_cb *self, struct sk_buff *skb, + struct irlap_info *info, int command) +{ + info->nr = skb->data[1] >> 5; + + IRDA_DEBUG(4, "%s(), nr=%d, %ld\n", __func__, info->nr, jiffies); + + if (command) + irlap_do_event(self, RECV_RNR_CMD, skb, info); + else + irlap_do_event(self, RECV_RNR_RSP, skb, info); +} + +static void irlap_recv_rej_frame(struct irlap_cb *self, struct sk_buff *skb, + struct irlap_info *info, int command) +{ + IRDA_DEBUG(0, "%s()\n", __func__); + + info->nr = skb->data[1] >> 5; + + /* Check if this is a command or a response frame */ + if (command) + irlap_do_event(self, RECV_REJ_CMD, skb, info); + else + irlap_do_event(self, RECV_REJ_RSP, skb, info); +} + +static void irlap_recv_srej_frame(struct irlap_cb *self, struct sk_buff *skb, + struct irlap_info *info, int command) +{ + IRDA_DEBUG(0, "%s()\n", __func__); + + info->nr = skb->data[1] >> 5; + + /* Check if this is a command or a response frame */ + if (command) + irlap_do_event(self, RECV_SREJ_CMD, skb, info); + else + irlap_do_event(self, RECV_SREJ_RSP, skb, info); +} + +static void irlap_recv_disc_frame(struct irlap_cb *self, struct sk_buff *skb, + struct irlap_info *info, int command) +{ + IRDA_DEBUG(2, "%s()\n", __func__); + + /* Check if this is a command or a response frame */ + if (command) + irlap_do_event(self, RECV_DISC_CMD, skb, info); + else + irlap_do_event(self, RECV_RD_RSP, skb, info); +} + +/* + * Function irlap_recv_ua_frame (skb, frame) + * + * Received UA (Unnumbered Acknowledgement) frame + * + */ +static inline void irlap_recv_ua_frame(struct irlap_cb *self, + struct sk_buff *skb, + struct irlap_info *info) +{ + irlap_do_event(self, RECV_UA_RSP, skb, info); +} + +/* + * Function irlap_send_data_primary(self, skb) + * + * Send I-frames as the primary station but without the poll bit set + * + */ +void irlap_send_data_primary(struct irlap_cb *self, struct sk_buff *skb) +{ + struct sk_buff *tx_skb; + + if (skb->data[1] == I_FRAME) { + + /* + * Insert frame sequence number (Vs) in control field before + * inserting into transmit window queue. 
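+ * (The control byte becomes I_FRAME | (vs << 1); nr is filled in
+ * later by irlap_send_i_frame(), and what we actually transmit is a
+ * clone of the skb kept on wx_list for possible retransmission.)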
+ */ + skb->data[1] = I_FRAME | (self->vs << 1); + + /* + * Insert frame in store, in case of retransmissions + * Increase skb reference count, see irlap_do_event() + */ + skb_get(skb); + skb_queue_tail(&self->wx_list, skb); + + /* Copy buffer */ + tx_skb = skb_clone(skb, GFP_ATOMIC); + if (tx_skb == NULL) { + return; + } + + self->vs = (self->vs + 1) % 8; + self->ack_required = FALSE; + self->window -= 1; + + irlap_send_i_frame( self, tx_skb, CMD_FRAME); + } else { + IRDA_DEBUG(4, "%s(), sending unreliable frame\n", __func__); + irlap_send_ui_frame(self, skb_get(skb), self->caddr, CMD_FRAME); + self->window -= 1; + } +} +/* + * Function irlap_send_data_primary_poll (self, skb) + * + * Send I(nformation) frame as primary with poll bit set + */ +void irlap_send_data_primary_poll(struct irlap_cb *self, struct sk_buff *skb) +{ + struct sk_buff *tx_skb; + int transmission_time; + + /* Stop P timer */ + del_timer(&self->poll_timer); + + /* Is this reliable or unreliable data? */ + if (skb->data[1] == I_FRAME) { + + /* + * Insert frame sequence number (Vs) in control field before + * inserting into transmit window queue. + */ + skb->data[1] = I_FRAME | (self->vs << 1); + + /* + * Insert frame in store, in case of retransmissions + * Increase skb reference count, see irlap_do_event() + */ + skb_get(skb); + skb_queue_tail(&self->wx_list, skb); + + /* Copy buffer */ + tx_skb = skb_clone(skb, GFP_ATOMIC); + if (tx_skb == NULL) { + return; + } + + /* + * Set poll bit if necessary. We do this to the copied + * skb, since retransmitted need to set or clear the poll + * bit depending on when they are sent. + */ + tx_skb->data[1] |= PF_BIT; + + self->vs = (self->vs + 1) % 8; + self->ack_required = FALSE; + + irlap_next_state(self, LAP_NRM_P); + irlap_send_i_frame(self, tx_skb, CMD_FRAME); + } else { + IRDA_DEBUG(4, "%s(), sending unreliable frame\n", __func__); + + if (self->ack_required) { + irlap_send_ui_frame(self, skb_get(skb), self->caddr, CMD_FRAME); + irlap_next_state(self, LAP_NRM_P); + irlap_send_rr_frame(self, CMD_FRAME); + self->ack_required = FALSE; + } else { + skb->data[1] |= PF_BIT; + irlap_next_state(self, LAP_NRM_P); + irlap_send_ui_frame(self, skb_get(skb), self->caddr, CMD_FRAME); + } + } + + /* How much time we took for transmission of all frames. + * We don't know, so let assume we used the full window. Jean II */ + transmission_time = self->final_timeout; + + /* Reset parameter so that we can fill next window */ + self->window = self->window_size; + +#ifdef CONFIG_IRDA_DYNAMIC_WINDOW + /* Remove what we have not used. Just do a prorata of the + * bytes left in window to window capacity. + * See max_line_capacities[][] in qos.c for details. Jean II */ + transmission_time -= (self->final_timeout * self->bytes_left + / self->line_capacity); + IRDA_DEBUG(4, "%s() adjusting transmission_time : ft=%d, bl=%d, lc=%d -> tt=%d\n", __func__, self->final_timeout, self->bytes_left, self->line_capacity, transmission_time); + + /* We are allowed to transmit a maximum number of bytes again. */ + self->bytes_left = self->line_capacity; +#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ + + /* + * The network layer has a intermediate buffer between IrLAP + * and the IrDA driver which can contain 8 frames. So, even + * though IrLAP is currently sending the *last* frame of the + * tx-window, the driver most likely has only just started + * sending the *first* frame of the same tx-window. + * I.e. we are always at the very beginning of or Tx window. 
+ * Now, we are supposed to set the final timer from the end + * of our tx-window to let the other peer reply. So, we need + * to add extra time to compensate for the fact that we + * are really at the start of tx-window, otherwise the final timer + * might expire before he can answer... + * Jean II + */ + irlap_start_final_timer(self, self->final_timeout + transmission_time); + + /* + * The clever amongst you might ask why we do this adjustement + * only here, and not in all the other cases in irlap_event.c. + * In all those other case, we only send a very short management + * frame (few bytes), so the adjustement would be lost in the + * noise... + * The exception of course is irlap_resend_rejected_frame(). + * Jean II */ +} + +/* + * Function irlap_send_data_secondary_final (self, skb) + * + * Send I(nformation) frame as secondary with final bit set + * + */ +void irlap_send_data_secondary_final(struct irlap_cb *self, + struct sk_buff *skb) +{ + struct sk_buff *tx_skb = NULL; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + /* Is this reliable or unreliable data? */ + if (skb->data[1] == I_FRAME) { + + /* + * Insert frame sequence number (Vs) in control field before + * inserting into transmit window queue. + */ + skb->data[1] = I_FRAME | (self->vs << 1); + + /* + * Insert frame in store, in case of retransmissions + * Increase skb reference count, see irlap_do_event() + */ + skb_get(skb); + skb_queue_tail(&self->wx_list, skb); + + tx_skb = skb_clone(skb, GFP_ATOMIC); + if (tx_skb == NULL) { + return; + } + + tx_skb->data[1] |= PF_BIT; + + self->vs = (self->vs + 1) % 8; + self->ack_required = FALSE; + + irlap_send_i_frame(self, tx_skb, RSP_FRAME); + } else { + if (self->ack_required) { + irlap_send_ui_frame(self, skb_get(skb), self->caddr, RSP_FRAME); + irlap_send_rr_frame(self, RSP_FRAME); + self->ack_required = FALSE; + } else { + skb->data[1] |= PF_BIT; + irlap_send_ui_frame(self, skb_get(skb), self->caddr, RSP_FRAME); + } + } + + self->window = self->window_size; +#ifdef CONFIG_IRDA_DYNAMIC_WINDOW + /* We are allowed to transmit a maximum number of bytes again. */ + self->bytes_left = self->line_capacity; +#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ + + irlap_start_wd_timer(self, self->wd_timeout); +} + +/* + * Function irlap_send_data_secondary (self, skb) + * + * Send I(nformation) frame as secondary without final bit set + * + */ +void irlap_send_data_secondary(struct irlap_cb *self, struct sk_buff *skb) +{ + struct sk_buff *tx_skb = NULL; + + /* Is this reliable or unreliable data? */ + if (skb->data[1] == I_FRAME) { + + /* + * Insert frame sequence number (Vs) in control field before + * inserting into transmit window queue. + */ + skb->data[1] = I_FRAME | (self->vs << 1); + + /* + * Insert frame in store, in case of retransmissions + * Increase skb reference count, see irlap_do_event() + */ + skb_get(skb); + skb_queue_tail(&self->wx_list, skb); + + tx_skb = skb_clone(skb, GFP_ATOMIC); + if (tx_skb == NULL) { + return; + } + + self->vs = (self->vs + 1) % 8; + self->ack_required = FALSE; + self->window -= 1; + + irlap_send_i_frame(self, tx_skb, RSP_FRAME); + } else { + irlap_send_ui_frame(self, skb_get(skb), self->caddr, RSP_FRAME); + self->window -= 1; + } +} + +/* + * Function irlap_resend_rejected_frames (nr) + * + * Resend frames which has not been acknowledged. 
Should be safe to + * traverse the list without locking it since this function will only be + * called from interrupt context (BH) + */ +void irlap_resend_rejected_frames(struct irlap_cb *self, int command) +{ + struct sk_buff *tx_skb; + struct sk_buff *skb; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* Resend unacknowledged frame(s) */ + skb_queue_walk(&self->wx_list, skb) { + irlap_wait_min_turn_around(self, &self->qos_tx); + + /* We copy the skb to be retransmitted since we will have to + * modify it. Cloning will confuse packet sniffers + */ + /* tx_skb = skb_clone( skb, GFP_ATOMIC); */ + tx_skb = skb_copy(skb, GFP_ATOMIC); + if (!tx_skb) { + IRDA_DEBUG(0, "%s(), unable to copy\n", __func__); + return; + } + + /* Clear old Nr field + poll bit */ + tx_skb->data[1] &= 0x0f; + + /* + * Set poll bit on the last frame retransmitted + */ + if (skb_queue_is_last(&self->wx_list, skb)) + tx_skb->data[1] |= PF_BIT; /* Set p/f bit */ + else + tx_skb->data[1] &= ~PF_BIT; /* Clear p/f bit */ + + irlap_send_i_frame(self, tx_skb, command); + } +#if 0 /* Not yet */ + /* + * We can now fill the window with additional data frames + */ + while (!skb_queue_empty(&self->txq)) { + + IRDA_DEBUG(0, "%s(), sending additional frames!\n", __func__); + if (self->window > 0) { + skb = skb_dequeue( &self->txq); + IRDA_ASSERT(skb != NULL, return;); + + /* + * If send window > 1 then send frame with pf + * bit cleared + */ + if ((self->window > 1) && + !skb_queue_empty(&self->txq)) { + irlap_send_data_primary(self, skb); + } else { + irlap_send_data_primary_poll(self, skb); + } + kfree_skb(skb); + } + } +#endif +} + +void irlap_resend_rejected_frame(struct irlap_cb *self, int command) +{ + struct sk_buff *tx_skb; + struct sk_buff *skb; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* Resend unacknowledged frame(s) */ + skb = skb_peek(&self->wx_list); + if (skb != NULL) { + irlap_wait_min_turn_around(self, &self->qos_tx); + + /* We copy the skb to be retransmitted since we will have to + * modify it. Cloning will confuse packet sniffers + */ + /* tx_skb = skb_clone( skb, GFP_ATOMIC); */ + tx_skb = skb_copy(skb, GFP_ATOMIC); + if (!tx_skb) { + IRDA_DEBUG(0, "%s(), unable to copy\n", __func__); + return; + } + + /* Clear old Nr field + poll bit */ + tx_skb->data[1] &= 0x0f; + + /* Set poll/final bit */ + tx_skb->data[1] |= PF_BIT; /* Set p/f bit */ + + irlap_send_i_frame(self, tx_skb, command); + } +} + +/* + * Function irlap_send_ui_frame (self, skb, command) + * + * Contruct and transmit an Unnumbered Information (UI) frame + * + */ +void irlap_send_ui_frame(struct irlap_cb *self, struct sk_buff *skb, + __u8 caddr, int command) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + /* Insert connection address */ + skb->data[0] = caddr | ((command) ? CMD_FRAME : 0); + + irlap_queue_xmit(self, skb); +} + +/* + * Function irlap_send_i_frame (skb) + * + * Contruct and transmit Information (I) frame + */ +static void irlap_send_i_frame(struct irlap_cb *self, struct sk_buff *skb, + int command) +{ + /* Insert connection address */ + skb->data[0] = self->caddr; + skb->data[0] |= (command) ? 
CMD_FRAME : 0; + + /* Insert next to receive (Vr) */ + skb->data[1] |= (self->vr << 5); /* insert nr */ + + irlap_queue_xmit(self, skb); +} + +/* + * Function irlap_recv_i_frame (skb, frame) + * + * Receive and parse an I (Information) frame, no harm in making it inline + * since it's called only from one single place (irlap_driver_rcv). + */ +static inline void irlap_recv_i_frame(struct irlap_cb *self, + struct sk_buff *skb, + struct irlap_info *info, int command) +{ + info->nr = skb->data[1] >> 5; /* Next to receive */ + info->pf = skb->data[1] & PF_BIT; /* Final bit */ + info->ns = (skb->data[1] >> 1) & 0x07; /* Next to send */ + + /* Check if this is a command or a response frame */ + if (command) + irlap_do_event(self, RECV_I_CMD, skb, info); + else + irlap_do_event(self, RECV_I_RSP, skb, info); +} + +/* + * Function irlap_recv_ui_frame (self, skb, info) + * + * Receive and parse an Unnumbered Information (UI) frame + * + */ +static void irlap_recv_ui_frame(struct irlap_cb *self, struct sk_buff *skb, + struct irlap_info *info) +{ + IRDA_DEBUG( 4, "%s()\n", __func__); + + info->pf = skb->data[1] & PF_BIT; /* Final bit */ + + irlap_do_event(self, RECV_UI_FRAME, skb, info); +} + +/* + * Function irlap_recv_frmr_frame (skb, frame) + * + * Received Frame Reject response. + * + */ +static void irlap_recv_frmr_frame(struct irlap_cb *self, struct sk_buff *skb, + struct irlap_info *info) +{ + __u8 *frame; + int w, x, y, z; + + IRDA_DEBUG(0, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + IRDA_ASSERT(info != NULL, return;); + + if (!pskb_may_pull(skb, 4)) { + IRDA_ERROR("%s: frame too short!\n", __func__); + return; + } + + frame = skb->data; + + info->nr = frame[2] >> 5; /* Next to receive */ + info->pf = frame[2] & PF_BIT; /* Final bit */ + info->ns = (frame[2] >> 1) & 0x07; /* Next to send */ + + w = frame[3] & 0x01; + x = frame[3] & 0x02; + y = frame[3] & 0x04; + z = frame[3] & 0x08; + + if (w) { + IRDA_DEBUG(0, "Rejected control field is undefined or not " + "implemented.\n"); + } + if (x) { + IRDA_DEBUG(0, "Rejected control field was invalid because it " + "contained a non permitted I field.\n"); + } + if (y) { + IRDA_DEBUG(0, "Received I field exceeded the maximum negotiated " + "for the existing connection or exceeded the maximum " + "this station supports if no connection exists.\n"); + } + if (z) { + IRDA_DEBUG(0, "Rejected control field control field contained an " + "invalid Nr count.\n"); + } + irlap_do_event(self, RECV_FRMR_RSP, skb, info); +} + +/* + * Function irlap_send_test_frame (self, daddr) + * + * Send a test frame response + * + */ +void irlap_send_test_frame(struct irlap_cb *self, __u8 caddr, __u32 daddr, + struct sk_buff *cmd) +{ + struct sk_buff *tx_skb; + struct test_frame *frame; + __u8 *info; + + tx_skb = alloc_skb(cmd->len + sizeof(struct test_frame), GFP_ATOMIC); + if (!tx_skb) + return; + + /* Broadcast frames must include saddr and daddr fields */ + if (caddr == CBROADCAST) { + frame = (struct test_frame *) + skb_put(tx_skb, sizeof(struct test_frame)); + + /* Insert the swapped addresses */ + frame->saddr = cpu_to_le32(self->saddr); + frame->daddr = cpu_to_le32(daddr); + } else + frame = (struct test_frame *) skb_put(tx_skb, LAP_ADDR_HEADER + LAP_CTRL_HEADER); + + frame->caddr = caddr; + frame->control = TEST_RSP | PF_BIT; + + /* Copy info */ + info = skb_put(tx_skb, cmd->len); + memcpy(info, cmd->data, cmd->len); + + /* Return to sender */ + 
irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_queue_xmit(self, tx_skb); +} + +/* + * Function irlap_recv_test_frame (self, skb) + * + * Receive a test frame + * + */ +static void irlap_recv_test_frame(struct irlap_cb *self, struct sk_buff *skb, + struct irlap_info *info, int command) +{ + struct test_frame *frame; + + IRDA_DEBUG(2, "%s()\n", __func__); + + if (!pskb_may_pull(skb, sizeof(*frame))) { + IRDA_ERROR("%s: frame too short!\n", __func__); + return; + } + frame = (struct test_frame *) skb->data; + + /* Broadcast frames must carry saddr and daddr fields */ + if (info->caddr == CBROADCAST) { + if (skb->len < sizeof(struct test_frame)) { + IRDA_DEBUG(0, "%s() test frame too short!\n", + __func__); + return; + } + + /* Read and swap addresses */ + info->daddr = le32_to_cpu(frame->saddr); + info->saddr = le32_to_cpu(frame->daddr); + + /* Make sure frame is addressed to us */ + if ((info->saddr != self->saddr) && + (info->saddr != BROADCAST)) { + return; + } + } + + if (command) + irlap_do_event(self, RECV_TEST_CMD, skb, info); + else + irlap_do_event(self, RECV_TEST_RSP, skb, info); +} + +/* + * Function irlap_driver_rcv (skb, netdev, ptype) + * + * Called when a frame is received. Dispatches the right receive function + * for processing of the frame. + * + * Note on skb management : + * After calling the higher layers of the IrDA stack, we always + * kfree() the skb, which drop the reference count (and potentially + * destroy it). + * If a higher layer of the stack want to keep the skb around (to put + * in a queue or pass it to the higher layer), it will need to use + * skb_get() to keep a reference on it. This is usually done at the + * LMP level in irlmp.c. + * Jean II + */ +int irlap_driver_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *ptype, struct net_device *orig_dev) +{ + struct irlap_info info; + struct irlap_cb *self; + int command; + __u8 control; + int ret = -1; + + if (!net_eq(dev_net(dev), &init_net)) + goto out; + + /* FIXME: should we get our own field? */ + self = (struct irlap_cb *) dev->atalk_ptr; + + /* If the net device is down, then IrLAP is gone! */ + if (!self || self->magic != LAP_MAGIC) + goto err; + + /* We are no longer an "old" protocol, so we need to handle + * share and non linear skbs. This should never happen, so + * we don't need to be clever about it. Jean II */ + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) { + IRDA_ERROR("%s: can't clone shared skb!\n", __func__); + goto err; + } + + /* Check if frame is large enough for parsing */ + if (!pskb_may_pull(skb, 2)) { + IRDA_ERROR("%s: frame too short!\n", __func__); + goto err; + } + + command = skb->data[0] & CMD_FRAME; + info.caddr = skb->data[0] & CBROADCAST; + + info.pf = skb->data[1] & PF_BIT; + info.control = skb->data[1] & ~PF_BIT; /* Mask away poll/final bit */ + + control = info.control; + + /* First we check if this frame has a valid connection address */ + if ((info.caddr != self->caddr) && (info.caddr != CBROADCAST)) { + IRDA_DEBUG(0, "%s(), wrong connection address!\n", + __func__); + goto out; + } + /* + * Optimize for the common case and check if the frame is an + * I(nformation) frame. Only I-frames have bit 0 set to 0 + */ + if (~control & 0x01) { + irlap_recv_i_frame(self, skb, &info, command); + goto out; + } + /* + * We now check is the frame is an S(upervisory) frame. 
Only + * S-frames have bit 0 set to 1 and bit 1 set to 0 + */ + if (~control & 0x02) { + /* + * Received S(upervisory) frame, check which frame type it is + * only the first nibble is of interest + */ + switch (control & 0x0f) { + case RR: + irlap_recv_rr_frame(self, skb, &info, command); + break; + case RNR: + irlap_recv_rnr_frame(self, skb, &info, command); + break; + case REJ: + irlap_recv_rej_frame(self, skb, &info, command); + break; + case SREJ: + irlap_recv_srej_frame(self, skb, &info, command); + break; + default: + IRDA_WARNING("%s: Unknown S-frame %02x received!\n", + __func__, info.control); + break; + } + goto out; + } + /* + * This must be a C(ontrol) frame + */ + switch (control) { + case XID_RSP: + irlap_recv_discovery_xid_rsp(self, skb, &info); + break; + case XID_CMD: + irlap_recv_discovery_xid_cmd(self, skb, &info); + break; + case SNRM_CMD: + irlap_recv_snrm_cmd(self, skb, &info); + break; + case DM_RSP: + irlap_do_event(self, RECV_DM_RSP, skb, &info); + break; + case DISC_CMD: /* And RD_RSP since they have the same value */ + irlap_recv_disc_frame(self, skb, &info, command); + break; + case TEST_CMD: + irlap_recv_test_frame(self, skb, &info, command); + break; + case UA_RSP: + irlap_recv_ua_frame(self, skb, &info); + break; + case FRMR_RSP: + irlap_recv_frmr_frame(self, skb, &info); + break; + case UI_FRAME: + irlap_recv_ui_frame(self, skb, &info); + break; + default: + IRDA_WARNING("%s: Unknown frame %02x received!\n", + __func__, info.control); + break; + } +out: + ret = 0; +err: + /* Always drop our reference on the skb */ + dev_kfree_skb(skb); + return ret; +} diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c new file mode 100644 index 00000000..6115a44c --- /dev/null +++ b/net/irda/irlmp.c @@ -0,0 +1,2025 @@ +/********************************************************************* + * + * Filename: irlmp.c + * Version: 1.0 + * Description: IrDA Link Management Protocol (LMP) layer + * Status: Stable. + * Author: Dag Brattli + * Created at: Sun Aug 17 20:54:32 1997 + * Modified at: Wed Jan 5 11:26:03 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-2000 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
+ * + ********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +static __u8 irlmp_find_free_slsap(void); +static int irlmp_slsap_inuse(__u8 slsap_sel); + +/* Master structure */ +struct irlmp_cb *irlmp = NULL; + +/* These can be altered by the sysctl interface */ +int sysctl_discovery = 0; +int sysctl_discovery_timeout = 3; /* 3 seconds by default */ +int sysctl_discovery_slots = 6; /* 6 slots by default */ +int sysctl_lap_keepalive_time = LM_IDLE_TIMEOUT * 1000 / HZ; +char sysctl_devname[65]; + +const char *irlmp_reasons[] = { + "ERROR, NOT USED", + "LM_USER_REQUEST", + "LM_LAP_DISCONNECT", + "LM_CONNECT_FAILURE", + "LM_LAP_RESET", + "LM_INIT_DISCONNECT", + "ERROR, NOT USED", +}; + +/* + * Function irlmp_init (void) + * + * Create (allocate) the main IrLMP structure + * + */ +int __init irlmp_init(void) +{ + IRDA_DEBUG(1, "%s()\n", __func__); + /* Initialize the irlmp structure. */ + irlmp = kzalloc( sizeof(struct irlmp_cb), GFP_KERNEL); + if (irlmp == NULL) + return -ENOMEM; + + irlmp->magic = LMP_MAGIC; + + irlmp->clients = hashbin_new(HB_LOCK); + irlmp->services = hashbin_new(HB_LOCK); + irlmp->links = hashbin_new(HB_LOCK); + irlmp->unconnected_lsaps = hashbin_new(HB_LOCK); + irlmp->cachelog = hashbin_new(HB_NOLOCK); + + if ((irlmp->clients == NULL) || + (irlmp->services == NULL) || + (irlmp->links == NULL) || + (irlmp->unconnected_lsaps == NULL) || + (irlmp->cachelog == NULL)) { + return -ENOMEM; + } + + spin_lock_init(&irlmp->cachelog->hb_spinlock); + + irlmp->last_lsap_sel = 0x0f; /* Reserved 0x00-0x0f */ + strcpy(sysctl_devname, "Linux"); + + init_timer(&irlmp->discovery_timer); + + /* Do discovery every 3 seconds, conditionally */ + if (sysctl_discovery) + irlmp_start_discovery_timer(irlmp, + sysctl_discovery_timeout*HZ); + + return 0; +} + +/* + * Function irlmp_cleanup (void) + * + * Remove IrLMP layer + * + */ +void irlmp_cleanup(void) +{ + /* Check for main structure */ + IRDA_ASSERT(irlmp != NULL, return;); + IRDA_ASSERT(irlmp->magic == LMP_MAGIC, return;); + + del_timer(&irlmp->discovery_timer); + + hashbin_delete(irlmp->links, (FREE_FUNC) kfree); + hashbin_delete(irlmp->unconnected_lsaps, (FREE_FUNC) kfree); + hashbin_delete(irlmp->clients, (FREE_FUNC) kfree); + hashbin_delete(irlmp->services, (FREE_FUNC) kfree); + hashbin_delete(irlmp->cachelog, (FREE_FUNC) kfree); + + /* De-allocate main structure */ + kfree(irlmp); + irlmp = NULL; +} + +/* + * Function irlmp_open_lsap (slsap, notify) + * + * Register with IrLMP and create a local LSAP, + * returns handle to LSAP. + */ +struct lsap_cb *irlmp_open_lsap(__u8 slsap_sel, notify_t *notify, __u8 pid) +{ + struct lsap_cb *self; + + IRDA_ASSERT(notify != NULL, return NULL;); + IRDA_ASSERT(irlmp != NULL, return NULL;); + IRDA_ASSERT(irlmp->magic == LMP_MAGIC, return NULL;); + IRDA_ASSERT(notify->instance != NULL, return NULL;); + + /* Does the client care which Source LSAP selector it gets? 
*/ + if (slsap_sel == LSAP_ANY) { + slsap_sel = irlmp_find_free_slsap(); + if (!slsap_sel) + return NULL; + } else if (irlmp_slsap_inuse(slsap_sel)) + return NULL; + + /* Allocate new instance of a LSAP connection */ + self = kzalloc(sizeof(struct lsap_cb), GFP_ATOMIC); + if (self == NULL) { + IRDA_ERROR("%s: can't allocate memory\n", __func__); + return NULL; + } + + self->magic = LMP_LSAP_MAGIC; + self->slsap_sel = slsap_sel; + + /* Fix connectionless LSAP's */ + if (slsap_sel == LSAP_CONNLESS) { +#ifdef CONFIG_IRDA_ULTRA + self->dlsap_sel = LSAP_CONNLESS; + self->pid = pid; +#endif /* CONFIG_IRDA_ULTRA */ + } else + self->dlsap_sel = LSAP_ANY; + /* self->connected = FALSE; -> already NULL via memset() */ + + init_timer(&self->watchdog_timer); + + self->notify = *notify; + + self->lsap_state = LSAP_DISCONNECTED; + + /* Insert into queue of unconnected LSAPs */ + hashbin_insert(irlmp->unconnected_lsaps, (irda_queue_t *) self, + (long) self, NULL); + + return self; +} +EXPORT_SYMBOL(irlmp_open_lsap); + +/* + * Function __irlmp_close_lsap (self) + * + * Remove an instance of LSAP + */ +static void __irlmp_close_lsap(struct lsap_cb *self) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;); + + /* + * Set some of the variables to preset values + */ + self->magic = 0; + del_timer(&self->watchdog_timer); /* Important! */ + + if (self->conn_skb) + dev_kfree_skb(self->conn_skb); + + kfree(self); +} + +/* + * Function irlmp_close_lsap (self) + * + * Close and remove LSAP + * + */ +void irlmp_close_lsap(struct lsap_cb *self) +{ + struct lap_cb *lap; + struct lsap_cb *lsap = NULL; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;); + + /* + * Find out if we should remove this LSAP from a link or from the + * list of unconnected lsaps (not associated with a link) + */ + lap = self->lap; + if (lap) { + IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, return;); + /* We might close a LSAP before it has completed the + * connection setup. In those case, higher layers won't + * send a proper disconnect request. Harmless, except + * that we will forget to close LAP... - Jean II */ + if(self->lsap_state != LSAP_DISCONNECTED) { + self->lsap_state = LSAP_DISCONNECTED; + irlmp_do_lap_event(self->lap, + LM_LAP_DISCONNECT_REQUEST, NULL); + } + /* Now, remove from the link */ + lsap = hashbin_remove(lap->lsaps, (long) self, NULL); +#ifdef CONFIG_IRDA_CACHE_LAST_LSAP + lap->cache.valid = FALSE; +#endif + } + self->lap = NULL; + /* Check if we found the LSAP! If not then try the unconnected lsaps */ + if (!lsap) { + lsap = hashbin_remove(irlmp->unconnected_lsaps, (long) self, + NULL); + } + if (!lsap) { + IRDA_DEBUG(0, + "%s(), Looks like somebody has removed me already!\n", + __func__); + return; + } + __irlmp_close_lsap(self); +} +EXPORT_SYMBOL(irlmp_close_lsap); + +/* + * Function irlmp_register_irlap (saddr, notify) + * + * Register IrLAP layer with IrLMP. 
There is possible to have multiple + * instances of the IrLAP layer, each connected to different IrDA ports + * + */ +void irlmp_register_link(struct irlap_cb *irlap, __u32 saddr, notify_t *notify) +{ + struct lap_cb *lap; + + IRDA_ASSERT(irlmp != NULL, return;); + IRDA_ASSERT(irlmp->magic == LMP_MAGIC, return;); + IRDA_ASSERT(notify != NULL, return;); + + /* + * Allocate new instance of a LSAP connection + */ + lap = kzalloc(sizeof(struct lap_cb), GFP_KERNEL); + if (lap == NULL) { + IRDA_ERROR("%s: unable to kmalloc\n", __func__); + return; + } + + lap->irlap = irlap; + lap->magic = LMP_LAP_MAGIC; + lap->saddr = saddr; + lap->daddr = DEV_ADDR_ANY; +#ifdef CONFIG_IRDA_CACHE_LAST_LSAP + lap->cache.valid = FALSE; +#endif + lap->lsaps = hashbin_new(HB_LOCK); + if (lap->lsaps == NULL) { + IRDA_WARNING("%s(), unable to kmalloc lsaps\n", __func__); + kfree(lap); + return; + } + + lap->lap_state = LAP_STANDBY; + + init_timer(&lap->idle_timer); + + /* + * Insert into queue of LMP links + */ + hashbin_insert(irlmp->links, (irda_queue_t *) lap, lap->saddr, NULL); + + /* + * We set only this variable so IrLAP can tell us on which link the + * different events happened on + */ + irda_notify_init(notify); + notify->instance = lap; +} + +/* + * Function irlmp_unregister_irlap (saddr) + * + * IrLAP layer has been removed! + * + */ +void irlmp_unregister_link(__u32 saddr) +{ + struct lap_cb *link; + + IRDA_DEBUG(4, "%s()\n", __func__); + + /* We must remove ourselves from the hashbin *first*. This ensure + * that no more LSAPs will be open on this link and no discovery + * will be triggered anymore. Jean II */ + link = hashbin_remove(irlmp->links, saddr, NULL); + if (link) { + IRDA_ASSERT(link->magic == LMP_LAP_MAGIC, return;); + + /* Kill all the LSAPs on this link. Jean II */ + link->reason = LAP_DISC_INDICATION; + link->daddr = DEV_ADDR_ANY; + irlmp_do_lap_event(link, LM_LAP_DISCONNECT_INDICATION, NULL); + + /* Remove all discoveries discovered at this link */ + irlmp_expire_discoveries(irlmp->cachelog, link->saddr, TRUE); + + /* Final cleanup */ + del_timer(&link->idle_timer); + link->magic = 0; + hashbin_delete(link->lsaps, (FREE_FUNC) __irlmp_close_lsap); + kfree(link); + } +} + +/* + * Function irlmp_connect_request (handle, dlsap, userdata) + * + * Connect with a peer LSAP + * + */ +int irlmp_connect_request(struct lsap_cb *self, __u8 dlsap_sel, + __u32 saddr, __u32 daddr, + struct qos_info *qos, struct sk_buff *userdata) +{ + struct sk_buff *tx_skb = userdata; + struct lap_cb *lap; + struct lsap_cb *lsap; + int ret; + + IRDA_ASSERT(self != NULL, return -EBADR;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -EBADR;); + + IRDA_DEBUG(2, + "%s(), slsap_sel=%02x, dlsap_sel=%02x, saddr=%08x, daddr=%08x\n", + __func__, self->slsap_sel, dlsap_sel, saddr, daddr); + + if (test_bit(0, &self->connected)) { + ret = -EISCONN; + goto err; + } + + /* Client must supply destination device address */ + if (!daddr) { + ret = -EINVAL; + goto err; + } + + /* Any userdata? */ + if (tx_skb == NULL) { + tx_skb = alloc_skb(LMP_MAX_HEADER, GFP_ATOMIC); + if (!tx_skb) + return -ENOMEM; + + skb_reserve(tx_skb, LMP_MAX_HEADER); + } + + /* Make room for MUX control header (3 bytes) */ + IRDA_ASSERT(skb_headroom(tx_skb) >= LMP_CONTROL_HEADER, return -1;); + skb_push(tx_skb, LMP_CONTROL_HEADER); + + self->dlsap_sel = dlsap_sel; + + /* + * Find the link to where we should try to connect since there may + * be more than one IrDA port on this machine. 
If the client has + * passed us the saddr (and already knows which link to use), then + * we use that to find the link, if not then we have to look in the + * discovery log and check if any of the links has discovered a + * device with the given daddr + */ + if ((!saddr) || (saddr == DEV_ADDR_ANY)) { + discovery_t *discovery; + unsigned long flags; + + spin_lock_irqsave(&irlmp->cachelog->hb_spinlock, flags); + if (daddr != DEV_ADDR_ANY) + discovery = hashbin_find(irlmp->cachelog, daddr, NULL); + else { + IRDA_DEBUG(2, "%s(), no daddr\n", __func__); + discovery = (discovery_t *) + hashbin_get_first(irlmp->cachelog); + } + + if (discovery) { + saddr = discovery->data.saddr; + daddr = discovery->data.daddr; + } + spin_unlock_irqrestore(&irlmp->cachelog->hb_spinlock, flags); + } + lap = hashbin_lock_find(irlmp->links, saddr, NULL); + if (lap == NULL) { + IRDA_DEBUG(1, "%s(), Unable to find a usable link!\n", __func__); + ret = -EHOSTUNREACH; + goto err; + } + + /* Check if LAP is disconnected or already connected */ + if (lap->daddr == DEV_ADDR_ANY) + lap->daddr = daddr; + else if (lap->daddr != daddr) { + /* Check if some LSAPs are active on this LAP */ + if (HASHBIN_GET_SIZE(lap->lsaps) == 0) { + /* No active connection, but LAP hasn't been + * disconnected yet (waiting for timeout in LAP). + * Maybe we could give LAP a bit of help in this case. + */ + IRDA_DEBUG(0, "%s(), sorry, but I'm waiting for LAP to timeout!\n", __func__); + ret = -EAGAIN; + goto err; + } + + /* LAP is already connected to a different node, and LAP + * can only talk to one node at a time */ + IRDA_DEBUG(0, "%s(), sorry, but link is busy!\n", __func__); + ret = -EBUSY; + goto err; + } + + self->lap = lap; + + /* + * Remove LSAP from list of unconnected LSAPs and insert it into the + * list of connected LSAPs for the particular link + */ + lsap = hashbin_remove(irlmp->unconnected_lsaps, (long) self, NULL); + + IRDA_ASSERT(lsap != NULL, return -1;); + IRDA_ASSERT(lsap->magic == LMP_LSAP_MAGIC, return -1;); + IRDA_ASSERT(lsap->lap != NULL, return -1;); + IRDA_ASSERT(lsap->lap->magic == LMP_LAP_MAGIC, return -1;); + + hashbin_insert(self->lap->lsaps, (irda_queue_t *) self, (long) self, + NULL); + + set_bit(0, &self->connected); /* TRUE */ + + /* + * User supplied qos specifications? + */ + if (qos) + self->qos = *qos; + + irlmp_do_lsap_event(self, LM_CONNECT_REQUEST, tx_skb); + + /* Drop reference count - see irlap_data_request(). */ + dev_kfree_skb(tx_skb); + + return 0; + +err: + /* Cleanup */ + if(tx_skb) + dev_kfree_skb(tx_skb); + return ret; +} +EXPORT_SYMBOL(irlmp_connect_request); + +/* + * Function irlmp_connect_indication (self) + * + * Incoming connection + * + */ +void irlmp_connect_indication(struct lsap_cb *self, struct sk_buff *skb) +{ + int max_seg_size; + int lap_header_size; + int max_header_size; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + IRDA_ASSERT(self->lap != NULL, return;); + + IRDA_DEBUG(2, "%s(), slsap_sel=%02x, dlsap_sel=%02x\n", + __func__, self->slsap_sel, self->dlsap_sel); + + /* Note : self->lap is set in irlmp_link_data_indication(), + * (case CONNECT_CMD:) because we have no way to set it here. + * Similarly, self->dlsap_sel is usually set in irlmp_find_lsap(). 
+ * Jean II */ + + self->qos = *self->lap->qos; + + max_seg_size = self->lap->qos->data_size.value-LMP_HEADER; + lap_header_size = IRLAP_GET_HEADER_SIZE(self->lap->irlap); + max_header_size = LMP_HEADER + lap_header_size; + + /* Hide LMP_CONTROL_HEADER header from layer above */ + skb_pull(skb, LMP_CONTROL_HEADER); + + if (self->notify.connect_indication) { + /* Don't forget to refcount it - see irlap_driver_rcv(). */ + skb_get(skb); + self->notify.connect_indication(self->notify.instance, self, + &self->qos, max_seg_size, + max_header_size, skb); + } +} + +/* + * Function irlmp_connect_response (handle, userdata) + * + * Service user is accepting connection + * + */ +int irlmp_connect_response(struct lsap_cb *self, struct sk_buff *userdata) +{ + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;); + IRDA_ASSERT(userdata != NULL, return -1;); + + /* We set the connected bit and move the lsap to the connected list + * in the state machine itself. Jean II */ + + IRDA_DEBUG(2, "%s(), slsap_sel=%02x, dlsap_sel=%02x\n", + __func__, self->slsap_sel, self->dlsap_sel); + + /* Make room for MUX control header (3 bytes) */ + IRDA_ASSERT(skb_headroom(userdata) >= LMP_CONTROL_HEADER, return -1;); + skb_push(userdata, LMP_CONTROL_HEADER); + + irlmp_do_lsap_event(self, LM_CONNECT_RESPONSE, userdata); + + /* Drop reference count - see irlap_data_request(). */ + dev_kfree_skb(userdata); + + return 0; +} +EXPORT_SYMBOL(irlmp_connect_response); + +/* + * Function irlmp_connect_confirm (handle, skb) + * + * LSAP connection confirmed peer device! + */ +void irlmp_connect_confirm(struct lsap_cb *self, struct sk_buff *skb) +{ + int max_header_size; + int lap_header_size; + int max_seg_size; + + IRDA_DEBUG(3, "%s()\n", __func__); + + IRDA_ASSERT(skb != NULL, return;); + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;); + IRDA_ASSERT(self->lap != NULL, return;); + + self->qos = *self->lap->qos; + + max_seg_size = self->lap->qos->data_size.value-LMP_HEADER; + lap_header_size = IRLAP_GET_HEADER_SIZE(self->lap->irlap); + max_header_size = LMP_HEADER + lap_header_size; + + IRDA_DEBUG(2, "%s(), max_header_size=%d\n", + __func__, max_header_size); + + /* Hide LMP_CONTROL_HEADER header from layer above */ + skb_pull(skb, LMP_CONTROL_HEADER); + + if (self->notify.connect_confirm) { + /* Don't forget to refcount it - see irlap_driver_rcv() */ + skb_get(skb); + self->notify.connect_confirm(self->notify.instance, self, + &self->qos, max_seg_size, + max_header_size, skb); + } +} + +/* + * Function irlmp_dup (orig, instance) + * + * Duplicate LSAP, can be used by servers to confirm a connection on a + * new LSAP so it can keep listening on the old one. + * + */ +struct lsap_cb *irlmp_dup(struct lsap_cb *orig, void *instance) +{ + struct lsap_cb *new; + unsigned long flags; + + IRDA_DEBUG(1, "%s()\n", __func__); + + spin_lock_irqsave(&irlmp->unconnected_lsaps->hb_spinlock, flags); + + /* Only allowed to duplicate unconnected LSAP's, and only LSAPs + * that have received a connect indication. 
Jean II */ + if ((!hashbin_find(irlmp->unconnected_lsaps, (long) orig, NULL)) || + (orig->lap == NULL)) { + IRDA_DEBUG(0, "%s(), invalid LSAP (wrong state)\n", + __func__); + spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock, + flags); + return NULL; + } + + /* Allocate a new instance */ + new = kmemdup(orig, sizeof(*new), GFP_ATOMIC); + if (!new) { + IRDA_DEBUG(0, "%s(), unable to kmalloc\n", __func__); + spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock, + flags); + return NULL; + } + /* new->lap = orig->lap; => done in the memcpy() */ + /* new->slsap_sel = orig->slsap_sel; => done in the memcpy() */ + new->conn_skb = NULL; + + spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock, flags); + + /* Not everything is the same */ + new->notify.instance = instance; + + init_timer(&new->watchdog_timer); + + hashbin_insert(irlmp->unconnected_lsaps, (irda_queue_t *) new, + (long) new, NULL); + +#ifdef CONFIG_IRDA_CACHE_LAST_LSAP + /* Make sure that we invalidate the LSAP cache */ + new->lap->cache.valid = FALSE; +#endif /* CONFIG_IRDA_CACHE_LAST_LSAP */ + + return new; +} + +/* + * Function irlmp_disconnect_request (handle, userdata) + * + * The service user is requesting disconnection, this will not remove the + * LSAP, but only mark it as disconnected + */ +int irlmp_disconnect_request(struct lsap_cb *self, struct sk_buff *userdata) +{ + struct lsap_cb *lsap; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;); + IRDA_ASSERT(userdata != NULL, return -1;); + + /* Already disconnected ? + * There is a race condition between irlmp_disconnect_indication() + * and us that might mess up the hashbins below. This fixes it. + * Jean II */ + if (! test_and_clear_bit(0, &self->connected)) { + IRDA_DEBUG(0, "%s(), already disconnected!\n", __func__); + dev_kfree_skb(userdata); + return -1; + } + + skb_push(userdata, LMP_CONTROL_HEADER); + + /* + * Do the event before the other stuff since we must know + * which lap layer that the frame should be transmitted on + */ + irlmp_do_lsap_event(self, LM_DISCONNECT_REQUEST, userdata); + + /* Drop reference count - see irlap_data_request(). */ + dev_kfree_skb(userdata); + + /* + * Remove LSAP from list of connected LSAPs for the particular link + * and insert it into the list of unconnected LSAPs + */ + IRDA_ASSERT(self->lap != NULL, return -1;); + IRDA_ASSERT(self->lap->magic == LMP_LAP_MAGIC, return -1;); + IRDA_ASSERT(self->lap->lsaps != NULL, return -1;); + + lsap = hashbin_remove(self->lap->lsaps, (long) self, NULL); +#ifdef CONFIG_IRDA_CACHE_LAST_LSAP + self->lap->cache.valid = FALSE; +#endif + + IRDA_ASSERT(lsap != NULL, return -1;); + IRDA_ASSERT(lsap->magic == LMP_LSAP_MAGIC, return -1;); + IRDA_ASSERT(lsap == self, return -1;); + + hashbin_insert(irlmp->unconnected_lsaps, (irda_queue_t *) self, + (long) self, NULL); + + /* Reset some values */ + self->dlsap_sel = LSAP_ANY; + self->lap = NULL; + + return 0; +} +EXPORT_SYMBOL(irlmp_disconnect_request); + +/* + * Function irlmp_disconnect_indication (reason, userdata) + * + * LSAP is being closed! + */ +void irlmp_disconnect_indication(struct lsap_cb *self, LM_REASON reason, + struct sk_buff *skb) +{ + struct lsap_cb *lsap; + + IRDA_DEBUG(1, "%s(), reason=%s\n", __func__, irlmp_reasons[reason]); + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;); + + IRDA_DEBUG(3, "%s(), slsap_sel=%02x, dlsap_sel=%02x\n", + __func__, self->slsap_sel, self->dlsap_sel); + + /* Already disconnected ? 
+ * There is a race condition between irlmp_disconnect_request() + * and us that might mess up the hashbins below. This fixes it. + * Jean II */ + if (! test_and_clear_bit(0, &self->connected)) { + IRDA_DEBUG(0, "%s(), already disconnected!\n", __func__); + return; + } + + /* + * Remove association between this LSAP and the link it used + */ + IRDA_ASSERT(self->lap != NULL, return;); + IRDA_ASSERT(self->lap->lsaps != NULL, return;); + + lsap = hashbin_remove(self->lap->lsaps, (long) self, NULL); +#ifdef CONFIG_IRDA_CACHE_LAST_LSAP + self->lap->cache.valid = FALSE; +#endif + + IRDA_ASSERT(lsap != NULL, return;); + IRDA_ASSERT(lsap == self, return;); + hashbin_insert(irlmp->unconnected_lsaps, (irda_queue_t *) lsap, + (long) lsap, NULL); + + self->dlsap_sel = LSAP_ANY; + self->lap = NULL; + + /* + * Inform service user + */ + if (self->notify.disconnect_indication) { + /* Don't forget to refcount it - see irlap_driver_rcv(). */ + if(skb) + skb_get(skb); + self->notify.disconnect_indication(self->notify.instance, + self, reason, skb); + } else { + IRDA_DEBUG(0, "%s(), no handler\n", __func__); + } +} + +/* + * Function irlmp_do_expiry (void) + * + * Do a cleanup of the discovery log (remove old entries) + * + * Note : separate from irlmp_do_discovery() so that we can handle + * passive discovery properly. + */ +void irlmp_do_expiry(void) +{ + struct lap_cb *lap; + + /* + * Expire discovery on all links which are *not* connected. + * On links which are connected, we can't do discovery + * anymore and can't refresh the log, so we freeze the + * discovery log to keep info about the device we are + * connected to. + * This info is mandatory if we want irlmp_connect_request() + * to work properly. - Jean II + */ + lap = (struct lap_cb *) hashbin_get_first(irlmp->links); + while (lap != NULL) { + IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, return;); + + if (lap->lap_state == LAP_STANDBY) { + /* Expire discoveries discovered on this link */ + irlmp_expire_discoveries(irlmp->cachelog, lap->saddr, + FALSE); + } + lap = (struct lap_cb *) hashbin_get_next(irlmp->links); + } +} + +/* + * Function irlmp_do_discovery (nslots) + * + * Do some discovery on all links + * + * Note : log expiry is done above. + */ +void irlmp_do_discovery(int nslots) +{ + struct lap_cb *lap; + __u16 *data_hintsp; + + /* Make sure the value is sane */ + if ((nslots != 1) && (nslots != 6) && (nslots != 8) && (nslots != 16)){ + IRDA_WARNING("%s: invalid value for number of slots!\n", + __func__); + nslots = sysctl_discovery_slots = 8; + } + + /* Construct new discovery info to be used by IrLAP, */ + data_hintsp = (__u16 *) irlmp->discovery_cmd.data.hints; + put_unaligned(irlmp->hints.word, data_hintsp); + + /* + * Set character set for device name (we use ASCII), and + * copy device name. 
Remember to make room for a \0 at the + * end + */ + irlmp->discovery_cmd.data.charset = CS_ASCII; + strncpy(irlmp->discovery_cmd.data.info, sysctl_devname, + NICKNAME_MAX_LEN); + irlmp->discovery_cmd.name_len = strlen(irlmp->discovery_cmd.data.info); + irlmp->discovery_cmd.nslots = nslots; + + /* + * Try to send discovery packets on all links + */ + lap = (struct lap_cb *) hashbin_get_first(irlmp->links); + while (lap != NULL) { + IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, return;); + + if (lap->lap_state == LAP_STANDBY) { + /* Try to discover */ + irlmp_do_lap_event(lap, LM_LAP_DISCOVERY_REQUEST, + NULL); + } + lap = (struct lap_cb *) hashbin_get_next(irlmp->links); + } +} + +/* + * Function irlmp_discovery_request (nslots) + * + * Do a discovery of devices in front of the computer + * + * If the caller has registered a client discovery callback, this + * allow him to receive the full content of the discovery log through + * this callback (as normally he will receive only new discoveries). + */ +void irlmp_discovery_request(int nslots) +{ + /* Return current cached discovery log (in full) */ + irlmp_discovery_confirm(irlmp->cachelog, DISCOVERY_LOG); + + /* + * Start a single discovery operation if discovery is not already + * running + */ + if (!sysctl_discovery) { + /* Check if user wants to override the default */ + if (nslots == DISCOVERY_DEFAULT_SLOTS) + nslots = sysctl_discovery_slots; + + irlmp_do_discovery(nslots); + /* Note : we never do expiry here. Expiry will run on the + * discovery timer regardless of the state of sysctl_discovery + * Jean II */ + } +} +EXPORT_SYMBOL(irlmp_discovery_request); + +/* + * Function irlmp_get_discoveries (pn, mask, slots) + * + * Return the current discovery log + * + * If discovery is not enabled, you should call this function again + * after 1 or 2 seconds (i.e. after discovery has been done). + */ +struct irda_device_info *irlmp_get_discoveries(int *pn, __u16 mask, int nslots) +{ + /* If discovery is not enabled, it's likely that the discovery log + * will be empty. So, we trigger a single discovery, so that next + * time the user call us there might be some results in the log. + * Jean II + */ + if (!sysctl_discovery) { + /* Check if user wants to override the default */ + if (nslots == DISCOVERY_DEFAULT_SLOTS) + nslots = sysctl_discovery_slots; + + /* Start discovery - will complete sometime later */ + irlmp_do_discovery(nslots); + /* Note : we never do expiry here. Expiry will run on the + * discovery timer regardless of the state of sysctl_discovery + * Jean II */ + } + + /* Return current cached discovery log */ + return irlmp_copy_discoveries(irlmp->cachelog, pn, mask, TRUE); +} +EXPORT_SYMBOL(irlmp_get_discoveries); + +/* + * Function irlmp_notify_client (log) + * + * Notify all about discovered devices + * + * Clients registered with IrLMP are : + * o IrComm + * o IrLAN + * o Any socket (in any state - ouch, that may be a lot !) + * The client may have defined a callback to be notified in case of + * partial/selective discovery based on the hints that it passed to IrLMP. 
+ */ +static inline void +irlmp_notify_client(irlmp_client_t *client, + hashbin_t *log, DISCOVERY_MODE mode) +{ + discinfo_t *discoveries; /* Copy of the discovery log */ + int number; /* Number of nodes in the log */ + int i; + + IRDA_DEBUG(3, "%s()\n", __func__); + + /* Check if client wants or not partial/selective log (optimisation) */ + if (!client->disco_callback) + return; + + /* + * Locking notes : + * the old code was manipulating the log directly, which was + * very racy. Now, we use copy_discoveries, that protects + * itself while dumping the log for us. + * The overhead of the copy is compensated by the fact that + * we only pass new discoveries in normal mode and don't + * pass the same old entry every 3s to the caller as we used + * to do (virtual function calling is expensive). + * Jean II + */ + + /* + * Now, check all discovered devices (if any), and notify client + * only about the services that the client is interested in + * We also notify only about the new devices unless the caller + * explicitly request a dump of the log. Jean II + */ + discoveries = irlmp_copy_discoveries(log, &number, + client->hint_mask.word, + (mode == DISCOVERY_LOG)); + /* Check if the we got some results */ + if (discoveries == NULL) + return; /* No nodes discovered */ + + /* Pass all entries to the listener */ + for(i = 0; i < number; i++) + client->disco_callback(&(discoveries[i]), mode, client->priv); + + /* Free up our buffer */ + kfree(discoveries); +} + +/* + * Function irlmp_discovery_confirm ( self, log) + * + * Some device(s) answered to our discovery request! Check to see which + * device it is, and give indication to the client(s) + * + */ +void irlmp_discovery_confirm(hashbin_t *log, DISCOVERY_MODE mode) +{ + irlmp_client_t *client; + irlmp_client_t *client_next; + + IRDA_DEBUG(3, "%s()\n", __func__); + + IRDA_ASSERT(log != NULL, return;); + + if (!(HASHBIN_GET_SIZE(log))) + return; + + /* For each client - notify callback may touch client list */ + client = (irlmp_client_t *) hashbin_get_first(irlmp->clients); + while (NULL != hashbin_find_next(irlmp->clients, (long) client, NULL, + (void *) &client_next) ) { + /* Check if we should notify client */ + irlmp_notify_client(client, log, mode); + + client = client_next; + } +} + +/* + * Function irlmp_discovery_expiry (expiry) + * + * This device is no longer been discovered, and therefore it is being + * purged from the discovery log. Inform all clients who have + * registered for this event... + * + * Note : called exclusively from discovery.c + * Note : this is no longer called under discovery spinlock, so the + * client can do whatever he wants in the callback. 
+ */ +void irlmp_discovery_expiry(discinfo_t *expiries, int number) +{ + irlmp_client_t *client; + irlmp_client_t *client_next; + int i; + + IRDA_DEBUG(3, "%s()\n", __func__); + + IRDA_ASSERT(expiries != NULL, return;); + + /* For each client - notify callback may touch client list */ + client = (irlmp_client_t *) hashbin_get_first(irlmp->clients); + while (NULL != hashbin_find_next(irlmp->clients, (long) client, NULL, + (void *) &client_next) ) { + + /* Pass all entries to the listener */ + for(i = 0; i < number; i++) { + /* Check if we should notify client */ + if ((client->expir_callback) && + (client->hint_mask.word & + get_unaligned((__u16 *)expiries[i].hints) + & 0x7f7f) ) + client->expir_callback(&(expiries[i]), + EXPIRY_TIMEOUT, + client->priv); + } + + /* Next client */ + client = client_next; + } +} + +/* + * Function irlmp_get_discovery_response () + * + * Used by IrLAP to get the discovery info it needs when answering + * discovery requests by other devices. + */ +discovery_t *irlmp_get_discovery_response(void) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(irlmp != NULL, return NULL;); + + put_unaligned(irlmp->hints.word, (__u16 *)irlmp->discovery_rsp.data.hints); + + /* + * Set character set for device name (we use ASCII), and + * copy device name. Remember to make room for a \0 at the + * end + */ + irlmp->discovery_rsp.data.charset = CS_ASCII; + + strncpy(irlmp->discovery_rsp.data.info, sysctl_devname, + NICKNAME_MAX_LEN); + irlmp->discovery_rsp.name_len = strlen(irlmp->discovery_rsp.data.info); + + return &irlmp->discovery_rsp; +} + +/* + * Function irlmp_data_request (self, skb) + * + * Send some data to peer device + * + * Note on skb management : + * After calling the lower layers of the IrDA stack, we always + * kfree() the skb, which drop the reference count (and potentially + * destroy it). + * IrLMP and IrLAP may queue the packet, and in those cases will need + * to use skb_get() to keep it around. + * Jean II + */ +int irlmp_data_request(struct lsap_cb *self, struct sk_buff *userdata) +{ + int ret; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;); + + /* Make room for MUX header */ + IRDA_ASSERT(skb_headroom(userdata) >= LMP_HEADER, return -1;); + skb_push(userdata, LMP_HEADER); + + ret = irlmp_do_lsap_event(self, LM_DATA_REQUEST, userdata); + + /* Drop reference count - see irlap_data_request(). */ + dev_kfree_skb(userdata); + + return ret; +} +EXPORT_SYMBOL(irlmp_data_request); + +/* + * Function irlmp_data_indication (handle, skb) + * + * Got data from LAP layer so pass it up to upper layer + * + */ +void irlmp_data_indication(struct lsap_cb *self, struct sk_buff *skb) +{ + /* Hide LMP header from layer above */ + skb_pull(skb, LMP_HEADER); + + if (self->notify.data_indication) { + /* Don't forget to refcount it - see irlap_driver_rcv(). */ + skb_get(skb); + self->notify.data_indication(self->notify.instance, self, skb); + } +} + +/* + * Function irlmp_udata_request (self, skb) + */ +int irlmp_udata_request(struct lsap_cb *self, struct sk_buff *userdata) +{ + int ret; + + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(userdata != NULL, return -1;); + + /* Make room for MUX header */ + IRDA_ASSERT(skb_headroom(userdata) >= LMP_HEADER, return -1;); + skb_push(userdata, LMP_HEADER); + + ret = irlmp_do_lsap_event(self, LM_UDATA_REQUEST, userdata); + + /* Drop reference count - see irlap_data_request(). 
*/ + dev_kfree_skb(userdata); + + return ret; +} + +/* + * Function irlmp_udata_indication (self, skb) + * + * Send unreliable data (but still within the connection) + * + */ +void irlmp_udata_indication(struct lsap_cb *self, struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + /* Hide LMP header from layer above */ + skb_pull(skb, LMP_HEADER); + + if (self->notify.udata_indication) { + /* Don't forget to refcount it - see irlap_driver_rcv(). */ + skb_get(skb); + self->notify.udata_indication(self->notify.instance, self, + skb); + } +} + +/* + * Function irlmp_connless_data_request (self, skb) + */ +#ifdef CONFIG_IRDA_ULTRA +int irlmp_connless_data_request(struct lsap_cb *self, struct sk_buff *userdata, + __u8 pid) +{ + struct sk_buff *clone_skb; + struct lap_cb *lap; + + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(userdata != NULL, return -1;); + + /* Make room for MUX and PID header */ + IRDA_ASSERT(skb_headroom(userdata) >= LMP_HEADER+LMP_PID_HEADER, + return -1;); + + /* Insert protocol identifier */ + skb_push(userdata, LMP_PID_HEADER); + if(self != NULL) + userdata->data[0] = self->pid; + else + userdata->data[0] = pid; + + /* Connectionless sockets must use 0x70 */ + skb_push(userdata, LMP_HEADER); + userdata->data[0] = userdata->data[1] = LSAP_CONNLESS; + + /* Try to send Connectionless packets out on all links */ + lap = (struct lap_cb *) hashbin_get_first(irlmp->links); + while (lap != NULL) { + IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, return -1;); + + clone_skb = skb_clone(userdata, GFP_ATOMIC); + if (!clone_skb) { + dev_kfree_skb(userdata); + return -ENOMEM; + } + + irlap_unitdata_request(lap->irlap, clone_skb); + /* irlap_unitdata_request() don't increase refcount, + * so no dev_kfree_skb() - Jean II */ + + lap = (struct lap_cb *) hashbin_get_next(irlmp->links); + } + dev_kfree_skb(userdata); + + return 0; +} +#endif /* CONFIG_IRDA_ULTRA */ + +/* + * Function irlmp_connless_data_indication (self, skb) + * + * Receive unreliable data outside any connection. Mostly used by Ultra + * + */ +#ifdef CONFIG_IRDA_ULTRA +void irlmp_connless_data_indication(struct lsap_cb *self, struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + /* Hide LMP and PID header from layer above */ + skb_pull(skb, LMP_HEADER+LMP_PID_HEADER); + + if (self->notify.udata_indication) { + /* Don't forget to refcount it - see irlap_driver_rcv(). */ + skb_get(skb); + self->notify.udata_indication(self->notify.instance, self, + skb); + } +} +#endif /* CONFIG_IRDA_ULTRA */ + +/* + * Propagate status indication from LAP to LSAPs (via LMP) + * This don't trigger any change of state in lap_cb, lmp_cb or lsap_cb, + * and the event is stateless, therefore we can bypass both state machines + * and send the event direct to the LSAP user. 
+ * Jean II + */ +void irlmp_status_indication(struct lap_cb *self, + LINK_STATUS link, LOCK_STATUS lock) +{ + struct lsap_cb *next; + struct lsap_cb *curr; + + /* Send status_indication to all LSAPs using this link */ + curr = (struct lsap_cb *) hashbin_get_first( self->lsaps); + while (NULL != hashbin_find_next(self->lsaps, (long) curr, NULL, + (void *) &next) ) { + IRDA_ASSERT(curr->magic == LMP_LSAP_MAGIC, return;); + /* + * Inform service user if he has requested it + */ + if (curr->notify.status_indication != NULL) + curr->notify.status_indication(curr->notify.instance, + link, lock); + else + IRDA_DEBUG(2, "%s(), no handler\n", __func__); + + curr = next; + } +} + +/* + * Receive flow control indication from LAP. + * LAP want us to send it one more frame. We implement a simple round + * robin scheduler between the active sockets so that we get a bit of + * fairness. Note that the round robin is far from perfect, but it's + * better than nothing. + * We then poll the selected socket so that we can do synchronous + * refilling of IrLAP (which allow to minimise the number of buffers). + * Jean II + */ +void irlmp_flow_indication(struct lap_cb *self, LOCAL_FLOW flow) +{ + struct lsap_cb *next; + struct lsap_cb *curr; + int lsap_todo; + + IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;); + IRDA_ASSERT(flow == FLOW_START, return;); + + /* Get the number of lsap. That's the only safe way to know + * that we have looped around... - Jean II */ + lsap_todo = HASHBIN_GET_SIZE(self->lsaps); + IRDA_DEBUG(4, "%s() : %d lsaps to scan\n", __func__, lsap_todo); + + /* Poll lsap in order until the queue is full or until we + * tried them all. + * Most often, the current LSAP will have something to send, + * so we will go through this loop only once. - Jean II */ + while((lsap_todo--) && + (IRLAP_GET_TX_QUEUE_LEN(self->irlap) < LAP_HIGH_THRESHOLD)) { + /* Try to find the next lsap we should poll. */ + next = self->flow_next; + /* If we have no lsap, restart from first one */ + if(next == NULL) + next = (struct lsap_cb *) hashbin_get_first(self->lsaps); + /* Verify current one and find the next one */ + curr = hashbin_find_next(self->lsaps, (long) next, NULL, + (void *) &self->flow_next); + /* Uh-oh... Paranoia */ + if(curr == NULL) + break; + IRDA_DEBUG(4, "%s() : curr is %p, next was %p and is now %p, still %d to go - queue len = %d\n", __func__, curr, next, self->flow_next, lsap_todo, IRLAP_GET_TX_QUEUE_LEN(self->irlap)); + + /* Inform lsap user that it can send one more packet. */ + if (curr->notify.flow_indication != NULL) + curr->notify.flow_indication(curr->notify.instance, + curr, flow); + else + IRDA_DEBUG(1, "%s(), no handler\n", __func__); + } +} + +#if 0 +/* + * Function irlmp_hint_to_service (hint) + * + * Returns a list of all servics contained in the given hint bits. This + * function assumes that the hint bits have the size of two bytes only + */ +__u8 *irlmp_hint_to_service(__u8 *hint) +{ + __u8 *service; + int i = 0; + + /* + * Allocate array to store services in. 
16 entries should be safe + * since we currently only support 2 hint bytes + */ + service = kmalloc(16, GFP_ATOMIC); + if (!service) { + IRDA_DEBUG(1, "%s(), Unable to kmalloc!\n", __func__); + return NULL; + } + + if (!hint[0]) { + IRDA_DEBUG(1, "\n"); + kfree(service); + return NULL; + } + if (hint[0] & HINT_PNP) + IRDA_DEBUG(1, "PnP Compatible "); + if (hint[0] & HINT_PDA) + IRDA_DEBUG(1, "PDA/Palmtop "); + if (hint[0] & HINT_COMPUTER) + IRDA_DEBUG(1, "Computer "); + if (hint[0] & HINT_PRINTER) { + IRDA_DEBUG(1, "Printer "); + service[i++] = S_PRINTER; + } + if (hint[0] & HINT_MODEM) + IRDA_DEBUG(1, "Modem "); + if (hint[0] & HINT_FAX) + IRDA_DEBUG(1, "Fax "); + if (hint[0] & HINT_LAN) { + IRDA_DEBUG(1, "LAN Access "); + service[i++] = S_LAN; + } + /* + * Test if extension byte exists. This byte will usually be + * there, but this is not really required by the standard. + * (IrLMP p. 29) + */ + if (hint[0] & HINT_EXTENSION) { + if (hint[1] & HINT_TELEPHONY) { + IRDA_DEBUG(1, "Telephony "); + service[i++] = S_TELEPHONY; + } if (hint[1] & HINT_FILE_SERVER) + IRDA_DEBUG(1, "File Server "); + + if (hint[1] & HINT_COMM) { + IRDA_DEBUG(1, "IrCOMM "); + service[i++] = S_COMM; + } + if (hint[1] & HINT_OBEX) { + IRDA_DEBUG(1, "IrOBEX "); + service[i++] = S_OBEX; + } + } + IRDA_DEBUG(1, "\n"); + + /* So that client can be notified about any discovery */ + service[i++] = S_ANY; + + service[i] = S_END; + + return service; +} +#endif + +static const __u16 service_hint_mapping[S_END][2] = { + { HINT_PNP, 0 }, /* S_PNP */ + { HINT_PDA, 0 }, /* S_PDA */ + { HINT_COMPUTER, 0 }, /* S_COMPUTER */ + { HINT_PRINTER, 0 }, /* S_PRINTER */ + { HINT_MODEM, 0 }, /* S_MODEM */ + { HINT_FAX, 0 }, /* S_FAX */ + { HINT_LAN, 0 }, /* S_LAN */ + { HINT_EXTENSION, HINT_TELEPHONY }, /* S_TELEPHONY */ + { HINT_EXTENSION, HINT_COMM }, /* S_COMM */ + { HINT_EXTENSION, HINT_OBEX }, /* S_OBEX */ + { 0xFF, 0xFF }, /* S_ANY */ +}; + +/* + * Function irlmp_service_to_hint (service) + * + * Converts a service type, to a hint bit + * + * Returns: a 16 bit hint value, with the service bit set + */ +__u16 irlmp_service_to_hint(int service) +{ + __u16_host_order hint; + + hint.byte[0] = service_hint_mapping[service][0]; + hint.byte[1] = service_hint_mapping[service][1]; + + return hint.word; +} +EXPORT_SYMBOL(irlmp_service_to_hint); + +/* + * Function irlmp_register_service (service) + * + * Register local service with IrLMP + * + */ +void *irlmp_register_service(__u16 hints) +{ + irlmp_service_t *service; + + IRDA_DEBUG(4, "%s(), hints = %04x\n", __func__, hints); + + /* Make a new registration */ + service = kmalloc(sizeof(irlmp_service_t), GFP_ATOMIC); + if (!service) { + IRDA_DEBUG(1, "%s(), Unable to kmalloc!\n", __func__); + return NULL; + } + service->hints.word = hints; + hashbin_insert(irlmp->services, (irda_queue_t *) service, + (long) service, NULL); + + irlmp->hints.word |= hints; + + return (void *)service; +} +EXPORT_SYMBOL(irlmp_register_service); + +/* + * Function irlmp_unregister_service (handle) + * + * Unregister service with IrLMP. 
+ * + * Returns: 0 on success, -1 on error + */ +int irlmp_unregister_service(void *handle) +{ + irlmp_service_t *service; + unsigned long flags; + + IRDA_DEBUG(4, "%s()\n", __func__); + + if (!handle) + return -1; + + /* Caller may call with invalid handle (it's legal) - Jean II */ + service = hashbin_lock_find(irlmp->services, (long) handle, NULL); + if (!service) { + IRDA_DEBUG(1, "%s(), Unknown service!\n", __func__); + return -1; + } + + hashbin_remove_this(irlmp->services, (irda_queue_t *) service); + kfree(service); + + /* Remove old hint bits */ + irlmp->hints.word = 0; + + /* Refresh current hint bits */ + spin_lock_irqsave(&irlmp->services->hb_spinlock, flags); + service = (irlmp_service_t *) hashbin_get_first(irlmp->services); + while (service) { + irlmp->hints.word |= service->hints.word; + + service = (irlmp_service_t *)hashbin_get_next(irlmp->services); + } + spin_unlock_irqrestore(&irlmp->services->hb_spinlock, flags); + return 0; +} +EXPORT_SYMBOL(irlmp_unregister_service); + +/* + * Function irlmp_register_client (hint_mask, callback1, callback2) + * + * Register a local client with IrLMP + * First callback is selective discovery (based on hints) + * Second callback is for selective discovery expiries + * + * Returns: handle > 0 on success, 0 on error + */ +void *irlmp_register_client(__u16 hint_mask, DISCOVERY_CALLBACK1 disco_clb, + DISCOVERY_CALLBACK2 expir_clb, void *priv) +{ + irlmp_client_t *client; + + IRDA_DEBUG(1, "%s()\n", __func__); + IRDA_ASSERT(irlmp != NULL, return NULL;); + + /* Make a new registration */ + client = kmalloc(sizeof(irlmp_client_t), GFP_ATOMIC); + if (!client) { + IRDA_DEBUG( 1, "%s(), Unable to kmalloc!\n", __func__); + return NULL; + } + + /* Register the details */ + client->hint_mask.word = hint_mask; + client->disco_callback = disco_clb; + client->expir_callback = expir_clb; + client->priv = priv; + + hashbin_insert(irlmp->clients, (irda_queue_t *) client, + (long) client, NULL); + + return (void *) client; +} +EXPORT_SYMBOL(irlmp_register_client); + +/* + * Function irlmp_update_client (handle, hint_mask, callback1, callback2) + * + * Updates specified client (handle) with possibly new hint_mask and + * callback + * + * Returns: 0 on success, -1 on error + */ +int irlmp_update_client(void *handle, __u16 hint_mask, + DISCOVERY_CALLBACK1 disco_clb, + DISCOVERY_CALLBACK2 expir_clb, void *priv) +{ + irlmp_client_t *client; + + if (!handle) + return -1; + + client = hashbin_lock_find(irlmp->clients, (long) handle, NULL); + if (!client) { + IRDA_DEBUG(1, "%s(), Unknown client!\n", __func__); + return -1; + } + + client->hint_mask.word = hint_mask; + client->disco_callback = disco_clb; + client->expir_callback = expir_clb; + client->priv = priv; + + return 0; +} +EXPORT_SYMBOL(irlmp_update_client); + +/* + * Function irlmp_unregister_client (handle) + * + * Returns: 0 on success, -1 on error + * + */ +int irlmp_unregister_client(void *handle) +{ + struct irlmp_client *client; + + IRDA_DEBUG(4, "%s()\n", __func__); + + if (!handle) + return -1; + + /* Caller may call with invalid handle (it's legal) - Jean II */ + client = hashbin_lock_find(irlmp->clients, (long) handle, NULL); + if (!client) { + IRDA_DEBUG(1, "%s(), Unknown client!\n", __func__); + return -1; + } + + IRDA_DEBUG(4, "%s(), removing client!\n", __func__); + hashbin_remove_this(irlmp->clients, (irda_queue_t *) client); + kfree(client); + + return 0; +} +EXPORT_SYMBOL(irlmp_unregister_client); + +/* + * Function irlmp_slsap_inuse (slsap) + * + * Check if the given source LSAP 
selector is in use + * + * This function is clearly not very efficient. On the mitigating side, the + * stack make sure that in 99% of the cases, we are called only once + * for each socket allocation. We could probably keep a bitmap + * of the allocated LSAP, but I'm not sure the complexity is worth it. + * Jean II + */ +static int irlmp_slsap_inuse(__u8 slsap_sel) +{ + struct lsap_cb *self; + struct lap_cb *lap; + unsigned long flags; + + IRDA_ASSERT(irlmp != NULL, return TRUE;); + IRDA_ASSERT(irlmp->magic == LMP_MAGIC, return TRUE;); + IRDA_ASSERT(slsap_sel != LSAP_ANY, return TRUE;); + + IRDA_DEBUG(4, "%s()\n", __func__); + +#ifdef CONFIG_IRDA_ULTRA + /* Accept all bindings to the connectionless LSAP */ + if (slsap_sel == LSAP_CONNLESS) + return FALSE; +#endif /* CONFIG_IRDA_ULTRA */ + + /* Valid values are between 0 and 127 (0x0-0x6F) */ + if (slsap_sel > LSAP_MAX) + return TRUE; + + /* + * Check if slsap is already in use. To do this we have to loop over + * every IrLAP connection and check every LSAP associated with each + * the connection. + */ + spin_lock_irqsave_nested(&irlmp->links->hb_spinlock, flags, + SINGLE_DEPTH_NESTING); + lap = (struct lap_cb *) hashbin_get_first(irlmp->links); + while (lap != NULL) { + IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, goto errlap;); + + /* Careful for priority inversions here ! + * irlmp->links is never taken while another IrDA + * spinlock is held, so we are safe. Jean II */ + spin_lock(&lap->lsaps->hb_spinlock); + + /* For this IrLAP, check all the LSAPs */ + self = (struct lsap_cb *) hashbin_get_first(lap->lsaps); + while (self != NULL) { + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, + goto errlsap;); + + if ((self->slsap_sel == slsap_sel)) { + IRDA_DEBUG(4, "Source LSAP selector=%02x in use\n", + self->slsap_sel); + goto errlsap; + } + self = (struct lsap_cb*) hashbin_get_next(lap->lsaps); + } + spin_unlock(&lap->lsaps->hb_spinlock); + + /* Next LAP */ + lap = (struct lap_cb *) hashbin_get_next(irlmp->links); + } + spin_unlock_irqrestore(&irlmp->links->hb_spinlock, flags); + + /* + * Server sockets are typically waiting for connections and + * therefore reside in the unconnected list. We don't want + * to give out their LSAPs for obvious reasons... + * Jean II + */ + spin_lock_irqsave(&irlmp->unconnected_lsaps->hb_spinlock, flags); + + self = (struct lsap_cb *) hashbin_get_first(irlmp->unconnected_lsaps); + while (self != NULL) { + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, goto erruncon;); + if ((self->slsap_sel == slsap_sel)) { + IRDA_DEBUG(4, "Source LSAP selector=%02x in use (unconnected)\n", + self->slsap_sel); + goto erruncon; + } + self = (struct lsap_cb*) hashbin_get_next(irlmp->unconnected_lsaps); + } + spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock, flags); + + return FALSE; + + /* Error exit from within one of the two nested loops. + * Make sure we release the right spinlock in the righ order. + * Jean II */ +errlsap: + spin_unlock(&lap->lsaps->hb_spinlock); +IRDA_ASSERT_LABEL(errlap:) + spin_unlock_irqrestore(&irlmp->links->hb_spinlock, flags); + return TRUE; + + /* Error exit from within the unconnected loop. + * Just one spinlock to release... Jean II */ +erruncon: + spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock, flags); + return TRUE; +} + +/* + * Function irlmp_find_free_slsap () + * + * Find a free source LSAP to use. 
This function is called if the service + * user has requested a source LSAP equal to LM_ANY + */ +static __u8 irlmp_find_free_slsap(void) +{ + __u8 lsap_sel; + int wrapped = 0; + + IRDA_ASSERT(irlmp != NULL, return -1;); + IRDA_ASSERT(irlmp->magic == LMP_MAGIC, return -1;); + + /* Most users don't really care which LSAPs they are given, + * and therefore we automatically give them a free LSAP. + * This function try to find a suitable LSAP, i.e. which is + * not in use and is within the acceptable range. Jean II */ + + do { + /* Always increment to LSAP number before using it. + * In theory, we could reuse the last LSAP number, as long + * as it is no longer in use. Some IrDA stack do that. + * However, the previous socket may be half closed, i.e. + * we closed it, we think it's no longer in use, but the + * other side did not receive our close and think it's + * active and still send data on it. + * This is similar to what is done with PIDs and TCP ports. + * Also, this reduce the number of calls to irlmp_slsap_inuse() + * which is an expensive function to call. + * Jean II */ + irlmp->last_lsap_sel++; + + /* Check if we need to wraparound (0x70-0x7f are reserved) */ + if (irlmp->last_lsap_sel > LSAP_MAX) { + /* 0x00-0x10 are also reserved for well know ports */ + irlmp->last_lsap_sel = 0x10; + + /* Make sure we terminate the loop */ + if (wrapped++) { + IRDA_ERROR("%s: no more free LSAPs !\n", + __func__); + return 0; + } + } + + /* If the LSAP is in use, try the next one. + * Despite the autoincrement, we need to check if the lsap + * is really in use or not, first because LSAP may be + * directly allocated in irlmp_open_lsap(), and also because + * we may wraparound on old sockets. Jean II */ + } while (irlmp_slsap_inuse(irlmp->last_lsap_sel)); + + /* Got it ! 
*/ + lsap_sel = irlmp->last_lsap_sel; + IRDA_DEBUG(4, "%s(), found free lsap_sel=%02x\n", + __func__, lsap_sel); + + return lsap_sel; +} + +/* + * Function irlmp_convert_lap_reason (lap_reason) + * + * Converts IrLAP disconnect reason codes to IrLMP disconnect reason + * codes + * + */ +LM_REASON irlmp_convert_lap_reason( LAP_REASON lap_reason) +{ + int reason = LM_LAP_DISCONNECT; + + switch (lap_reason) { + case LAP_DISC_INDICATION: /* Received a disconnect request from peer */ + IRDA_DEBUG( 1, "%s(), LAP_DISC_INDICATION\n", __func__); + reason = LM_USER_REQUEST; + break; + case LAP_NO_RESPONSE: /* To many retransmits without response */ + IRDA_DEBUG( 1, "%s(), LAP_NO_RESPONSE\n", __func__); + reason = LM_LAP_DISCONNECT; + break; + case LAP_RESET_INDICATION: + IRDA_DEBUG( 1, "%s(), LAP_RESET_INDICATION\n", __func__); + reason = LM_LAP_RESET; + break; + case LAP_FOUND_NONE: + case LAP_MEDIA_BUSY: + case LAP_PRIMARY_CONFLICT: + IRDA_DEBUG(1, "%s(), LAP_FOUND_NONE, LAP_MEDIA_BUSY or LAP_PRIMARY_CONFLICT\n", __func__); + reason = LM_CONNECT_FAILURE; + break; + default: + IRDA_DEBUG(1, "%s(), Unknown IrLAP disconnect reason %d!\n", + __func__, lap_reason); + reason = LM_LAP_DISCONNECT; + break; + } + + return reason; +} + +#ifdef CONFIG_PROC_FS + +struct irlmp_iter_state { + hashbin_t *hashbin; +}; + +#define LSAP_START_TOKEN ((void *)1) +#define LINK_START_TOKEN ((void *)2) + +static void *irlmp_seq_hb_idx(struct irlmp_iter_state *iter, loff_t *off) +{ + void *element; + + spin_lock_irq(&iter->hashbin->hb_spinlock); + for (element = hashbin_get_first(iter->hashbin); + element != NULL; + element = hashbin_get_next(iter->hashbin)) { + if (!off || *off-- == 0) { + /* NB: hashbin left locked */ + return element; + } + } + spin_unlock_irq(&iter->hashbin->hb_spinlock); + iter->hashbin = NULL; + return NULL; +} + + +static void *irlmp_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct irlmp_iter_state *iter = seq->private; + void *v; + loff_t off = *pos; + + iter->hashbin = NULL; + if (off-- == 0) + return LSAP_START_TOKEN; + + iter->hashbin = irlmp->unconnected_lsaps; + v = irlmp_seq_hb_idx(iter, &off); + if (v) + return v; + + if (off-- == 0) + return LINK_START_TOKEN; + + iter->hashbin = irlmp->links; + return irlmp_seq_hb_idx(iter, &off); +} + +static void *irlmp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct irlmp_iter_state *iter = seq->private; + + ++*pos; + + if (v == LSAP_START_TOKEN) { /* start of list of lsaps */ + iter->hashbin = irlmp->unconnected_lsaps; + v = irlmp_seq_hb_idx(iter, NULL); + return v ? 
v : LINK_START_TOKEN; + } + + if (v == LINK_START_TOKEN) { /* start of list of links */ + iter->hashbin = irlmp->links; + return irlmp_seq_hb_idx(iter, NULL); + } + + v = hashbin_get_next(iter->hashbin); + + if (v == NULL) { /* no more in this hash bin */ + spin_unlock_irq(&iter->hashbin->hb_spinlock); + + if (iter->hashbin == irlmp->unconnected_lsaps) + v = LINK_START_TOKEN; + + iter->hashbin = NULL; + } + return v; +} + +static void irlmp_seq_stop(struct seq_file *seq, void *v) +{ + struct irlmp_iter_state *iter = seq->private; + + if (iter->hashbin) + spin_unlock_irq(&iter->hashbin->hb_spinlock); +} + +static int irlmp_seq_show(struct seq_file *seq, void *v) +{ + const struct irlmp_iter_state *iter = seq->private; + struct lsap_cb *self = v; + + if (v == LSAP_START_TOKEN) + seq_puts(seq, "Unconnected LSAPs:\n"); + else if (v == LINK_START_TOKEN) + seq_puts(seq, "\nRegistered Link Layers:\n"); + else if (iter->hashbin == irlmp->unconnected_lsaps) { + self = v; + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -EINVAL; ); + seq_printf(seq, "lsap state: %s, ", + irlsap_state[ self->lsap_state]); + seq_printf(seq, + "slsap_sel: %#02x, dlsap_sel: %#02x, ", + self->slsap_sel, self->dlsap_sel); + seq_printf(seq, "(%s)", self->notify.name); + seq_printf(seq, "\n"); + } else if (iter->hashbin == irlmp->links) { + struct lap_cb *lap = v; + + seq_printf(seq, "lap state: %s, ", + irlmp_state[lap->lap_state]); + + seq_printf(seq, "saddr: %#08x, daddr: %#08x, ", + lap->saddr, lap->daddr); + seq_printf(seq, "num lsaps: %d", + HASHBIN_GET_SIZE(lap->lsaps)); + seq_printf(seq, "\n"); + + /* Careful for priority inversions here ! + * All other uses of attrib spinlock are independent of + * the object spinlock, so we are safe. Jean II */ + spin_lock(&lap->lsaps->hb_spinlock); + + seq_printf(seq, "\n Connected LSAPs:\n"); + for (self = (struct lsap_cb *) hashbin_get_first(lap->lsaps); + self != NULL; + self = (struct lsap_cb *)hashbin_get_next(lap->lsaps)) { + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, + goto outloop;); + seq_printf(seq, " lsap state: %s, ", + irlsap_state[ self->lsap_state]); + seq_printf(seq, + "slsap_sel: %#02x, dlsap_sel: %#02x, ", + self->slsap_sel, self->dlsap_sel); + seq_printf(seq, "(%s)", self->notify.name); + seq_putc(seq, '\n'); + + } + IRDA_ASSERT_LABEL(outloop:) + spin_unlock(&lap->lsaps->hb_spinlock); + seq_putc(seq, '\n'); + } else + return -EINVAL; + + return 0; +} + +static const struct seq_operations irlmp_seq_ops = { + .start = irlmp_seq_start, + .next = irlmp_seq_next, + .stop = irlmp_seq_stop, + .show = irlmp_seq_show, +}; + +static int irlmp_seq_open(struct inode *inode, struct file *file) +{ + IRDA_ASSERT(irlmp != NULL, return -EINVAL;); + + return seq_open_private(file, &irlmp_seq_ops, + sizeof(struct irlmp_iter_state)); +} + +const struct file_operations irlmp_seq_fops = { + .owner = THIS_MODULE, + .open = irlmp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif /* PROC_FS */ diff --git a/net/irda/irlmp_event.c b/net/irda/irlmp_event.c new file mode 100644 index 00000000..9505a7d0 --- /dev/null +++ b/net/irda/irlmp_event.c @@ -0,0 +1,909 @@ +/********************************************************************* + * + * Filename: irlmp_event.c + * Version: 0.8 + * Description: An IrDA LMP event driver for Linux + * Status: Experimental. 
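Editor's aside: for readers of irlmp_find_free_slsap() in irlmp.c above, the following is a minimal user-space sketch of the same wraparound search. The bounds, the in-use table and the names are illustrative stand-ins, not the kernel's definitions.

/* Editor's sketch (not part of the patch): wraparound search for a free
 * selector, mirroring the shape of irlmp_find_free_slsap() above.
 * LSAP_MAX_SKETCH, LSAP_FIRST_SKETCH and in_use[] are stand-ins. */
#include <stdio.h>

#define LSAP_MAX_SKETCH   0x6f   /* higher selectors treated as reserved */
#define LSAP_FIRST_SKETCH 0x10   /* low selectors reserved for well-known use */

static unsigned char in_use[0x80];               /* 1 if selector is taken */
static unsigned char last_sel = LSAP_FIRST_SKETCH;

static int find_free_selector(void)
{
	int wrapped = 0;

	do {
		/* Always move forward first, so a half-closed selector is
		 * not reused immediately (same rationale as PIDs/TCP ports). */
		last_sel++;
		if (last_sel > LSAP_MAX_SKETCH) {
			last_sel = LSAP_FIRST_SKETCH;
			if (wrapped++)          /* second wrap: nothing is free */
				return -1;
		}
	} while (in_use[last_sel]);

	in_use[last_sel] = 1;
	return last_sel;
}

int main(void)
{
	in_use[0x11] = in_use[0x12] = 1;         /* pretend two LSAPs are taken */
	printf("allocated selector 0x%02x\n", find_free_selector());
	return 0;
}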
+ * Author: Dag Brattli + * Created at: Mon Aug 4 20:40:53 1997 + * Modified at: Tue Dec 14 23:04:16 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#include + +#include +#include +#include +#include +#include +#include + +const char *const irlmp_state[] = { + "LAP_STANDBY", + "LAP_U_CONNECT", + "LAP_ACTIVE", +}; + +const char *const irlsap_state[] = { + "LSAP_DISCONNECTED", + "LSAP_CONNECT", + "LSAP_CONNECT_PEND", + "LSAP_DATA_TRANSFER_READY", + "LSAP_SETUP", + "LSAP_SETUP_PEND", +}; + +#ifdef CONFIG_IRDA_DEBUG +static const char *const irlmp_event[] = { + "LM_CONNECT_REQUEST", + "LM_CONNECT_CONFIRM", + "LM_CONNECT_RESPONSE", + "LM_CONNECT_INDICATION", + + "LM_DISCONNECT_INDICATION", + "LM_DISCONNECT_REQUEST", + + "LM_DATA_REQUEST", + "LM_UDATA_REQUEST", + "LM_DATA_INDICATION", + "LM_UDATA_INDICATION", + + "LM_WATCHDOG_TIMEOUT", + + /* IrLAP events */ + "LM_LAP_CONNECT_REQUEST", + "LM_LAP_CONNECT_INDICATION", + "LM_LAP_CONNECT_CONFIRM", + "LM_LAP_DISCONNECT_INDICATION", + "LM_LAP_DISCONNECT_REQUEST", + "LM_LAP_DISCOVERY_REQUEST", + "LM_LAP_DISCOVERY_CONFIRM", + "LM_LAP_IDLE_TIMEOUT", +}; +#endif /* CONFIG_IRDA_DEBUG */ + +/* LAP Connection control proto declarations */ +static void irlmp_state_standby (struct lap_cb *, IRLMP_EVENT, + struct sk_buff *); +static void irlmp_state_u_connect(struct lap_cb *, IRLMP_EVENT, + struct sk_buff *); +static void irlmp_state_active (struct lap_cb *, IRLMP_EVENT, + struct sk_buff *); + +/* LSAP Connection control proto declarations */ +static int irlmp_state_disconnected(struct lsap_cb *, IRLMP_EVENT, + struct sk_buff *); +static int irlmp_state_connect (struct lsap_cb *, IRLMP_EVENT, + struct sk_buff *); +static int irlmp_state_connect_pend(struct lsap_cb *, IRLMP_EVENT, + struct sk_buff *); +static int irlmp_state_dtr (struct lsap_cb *, IRLMP_EVENT, + struct sk_buff *); +static int irlmp_state_setup (struct lsap_cb *, IRLMP_EVENT, + struct sk_buff *); +static int irlmp_state_setup_pend (struct lsap_cb *, IRLMP_EVENT, + struct sk_buff *); + +static void (*lap_state[]) (struct lap_cb *, IRLMP_EVENT, struct sk_buff *) = +{ + irlmp_state_standby, + irlmp_state_u_connect, + irlmp_state_active, +}; + +static int (*lsap_state[])( struct lsap_cb *, IRLMP_EVENT, struct sk_buff *) = +{ + irlmp_state_disconnected, + irlmp_state_connect, + irlmp_state_connect_pend, + irlmp_state_dtr, + irlmp_state_setup, + irlmp_state_setup_pend +}; + +static inline void irlmp_next_lap_state(struct lap_cb *self, + IRLMP_STATE state) +{ + /* + IRDA_DEBUG(4, "%s(), LMP LAP = %s\n", __func__, irlmp_state[state]); + */ + self->lap_state = state; +} + +static inline void irlmp_next_lsap_state(struct lsap_cb *self, + LSAP_STATE state) +{ + /* + IRDA_ASSERT(self != NULL, return;); + IRDA_DEBUG(4, "%s(), LMP LSAP = %s\n", __func__, irlsap_state[state]); + */ + self->lsap_state = state; +} + +/* Do connection control events */ +int irlmp_do_lsap_event(struct lsap_cb *self, IRLMP_EVENT 
event, + struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;); + + IRDA_DEBUG(4, "%s(), EVENT = %s, STATE = %s\n", + __func__, irlmp_event[event], irlsap_state[ self->lsap_state]); + + return (*lsap_state[self->lsap_state]) (self, event, skb); +} + +/* + * Function do_lap_event (event, skb, info) + * + * Do IrLAP control events + * + */ +void irlmp_do_lap_event(struct lap_cb *self, IRLMP_EVENT event, + struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;); + + IRDA_DEBUG(4, "%s(), EVENT = %s, STATE = %s\n", __func__, + irlmp_event[event], + irlmp_state[self->lap_state]); + + (*lap_state[self->lap_state]) (self, event, skb); +} + +void irlmp_discovery_timer_expired(void *data) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + + /* We always cleanup the log (active & passive discovery) */ + irlmp_do_expiry(); + + irlmp_do_discovery(sysctl_discovery_slots); + + /* Restart timer */ + irlmp_start_discovery_timer(irlmp, sysctl_discovery_timeout * HZ); +} + +void irlmp_watchdog_timer_expired(void *data) +{ + struct lsap_cb *self = (struct lsap_cb *) data; + + IRDA_DEBUG(2, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;); + + irlmp_do_lsap_event(self, LM_WATCHDOG_TIMEOUT, NULL); +} + +void irlmp_idle_timer_expired(void *data) +{ + struct lap_cb *self = (struct lap_cb *) data; + + IRDA_DEBUG(2, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;); + + irlmp_do_lap_event(self, LM_LAP_IDLE_TIMEOUT, NULL); +} + +/* + * Send an event on all LSAPs attached to this LAP. + */ +static inline void +irlmp_do_all_lsap_event(hashbin_t * lsap_hashbin, + IRLMP_EVENT event) +{ + struct lsap_cb *lsap; + struct lsap_cb *lsap_next; + + /* Note : this function use the new hashbin_find_next() + * function, instead of the old hashbin_get_next(). + * This make sure that we are always pointing one lsap + * ahead, so that if the current lsap is removed as the + * result of sending the event, we don't care. + * Also, as we store the context ourselves, if an enumeration + * of the same lsap hashbin happens as the result of sending the + * event, we don't care. + * The only problem is if the next lsap is removed. In that case, + * hashbin_find_next() will return NULL and we will abort the + * enumeration. - Jean II */ + + /* Also : we don't accept any skb in input. We can *NOT* pass + * the same skb to multiple clients safely, we would need to + * skb_clone() it. - Jean II */ + + lsap = (struct lsap_cb *) hashbin_get_first(lsap_hashbin); + + while (NULL != hashbin_find_next(lsap_hashbin, + (long) lsap, + NULL, + (void *) &lsap_next) ) { + irlmp_do_lsap_event(lsap, event, NULL); + lsap = lsap_next; + } +} + +/********************************************************************* + * + * LAP connection control states + * + ********************************************************************/ + +/* + * Function irlmp_state_standby (event, skb, info) + * + * STANDBY, The IrLAP connection does not exist. 
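Editor's aside: the lap_state[] and lsap_state[] tables above implement a classic table-driven state machine, where the current state indexes a handler and the handler interprets the event. A compressed user-space sketch of that dispatch shape follows; the states, events and handler bodies are invented for illustration only.

/* Editor's sketch (not part of the patch): state-indexed handler table,
 * the same dispatch shape as lap_state[] / lsap_state[] above. */
#include <stdio.h>

typedef enum { S_IDLE, S_CONNECTING, S_READY, S_NSTATES } state_t;
typedef enum { EV_CONNECT, EV_CONFIRM, EV_DISCONNECT } event_t;

struct conn { state_t state; };

static void idle_handler(struct conn *c, event_t ev)
{
	if (ev == EV_CONNECT)
		c->state = S_CONNECTING;         /* analogue of next_lsap_state() */
}

static void connecting_handler(struct conn *c, event_t ev)
{
	if (ev == EV_CONFIRM)
		c->state = S_READY;
	else if (ev == EV_DISCONNECT)
		c->state = S_IDLE;
}

static void ready_handler(struct conn *c, event_t ev)
{
	if (ev == EV_DISCONNECT)
		c->state = S_IDLE;
}

/* One function pointer per state; do_event() plays the role of
 * irlmp_do_lsap_event()/irlmp_do_lap_event(). */
static void (*const handlers[S_NSTATES])(struct conn *, event_t) = {
	[S_IDLE]       = idle_handler,
	[S_CONNECTING] = connecting_handler,
	[S_READY]      = ready_handler,
};

static void do_event(struct conn *c, event_t ev)
{
	handlers[c->state](c, ev);
}

int main(void)
{
	struct conn c = { S_IDLE };

	do_event(&c, EV_CONNECT);
	do_event(&c, EV_CONFIRM);
	printf("state = %d (expect %d)\n", c.state, S_READY);
	return 0;
}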
+ * + */ +static void irlmp_state_standby(struct lap_cb *self, IRLMP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + IRDA_ASSERT(self->irlap != NULL, return;); + + switch (event) { + case LM_LAP_DISCOVERY_REQUEST: + /* irlmp_next_station_state( LMP_DISCOVER); */ + + irlap_discovery_request(self->irlap, &irlmp->discovery_cmd); + break; + case LM_LAP_CONNECT_INDICATION: + /* It's important to switch state first, to avoid IrLMP to + * think that the link is free since IrLMP may then start + * discovery before the connection is properly set up. DB. + */ + irlmp_next_lap_state(self, LAP_ACTIVE); + + /* Just accept connection TODO, this should be fixed */ + irlap_connect_response(self->irlap, skb); + break; + case LM_LAP_CONNECT_REQUEST: + IRDA_DEBUG(4, "%s() LS_CONNECT_REQUEST\n", __func__); + + irlmp_next_lap_state(self, LAP_U_CONNECT); + + /* FIXME: need to set users requested QoS */ + irlap_connect_request(self->irlap, self->daddr, NULL, 0); + break; + case LM_LAP_DISCONNECT_INDICATION: + IRDA_DEBUG(4, "%s(), Error LM_LAP_DISCONNECT_INDICATION\n", + __func__); + + irlmp_next_lap_state(self, LAP_STANDBY); + break; + default: + IRDA_DEBUG(0, "%s(), Unknown event %s\n", + __func__, irlmp_event[event]); + break; + } +} + +/* + * Function irlmp_state_u_connect (event, skb, info) + * + * U_CONNECT, The layer above has tried to open an LSAP connection but + * since the IrLAP connection does not exist, we must first start an + * IrLAP connection. We are now waiting response from IrLAP. + * */ +static void irlmp_state_u_connect(struct lap_cb *self, IRLMP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(2, "%s(), event=%s\n", __func__, irlmp_event[event]); + + switch (event) { + case LM_LAP_CONNECT_INDICATION: + /* It's important to switch state first, to avoid IrLMP to + * think that the link is free since IrLMP may then start + * discovery before the connection is properly set up. DB. + */ + irlmp_next_lap_state(self, LAP_ACTIVE); + + /* Just accept connection TODO, this should be fixed */ + irlap_connect_response(self->irlap, skb); + + /* Tell LSAPs that they can start sending data */ + irlmp_do_all_lsap_event(self->lsaps, LM_LAP_CONNECT_CONFIRM); + + /* Note : by the time we get there (LAP retries and co), + * the lsaps may already have gone. This avoid getting stuck + * forever in LAP_ACTIVE state - Jean II */ + if (HASHBIN_GET_SIZE(self->lsaps) == 0) { + IRDA_DEBUG(0, "%s() NO LSAPs !\n", __func__); + irlmp_start_idle_timer(self, LM_IDLE_TIMEOUT); + } + break; + case LM_LAP_CONNECT_REQUEST: + /* Already trying to connect */ + break; + case LM_LAP_CONNECT_CONFIRM: + /* For all lsap_ce E Associated do LS_Connect_confirm */ + irlmp_next_lap_state(self, LAP_ACTIVE); + + /* Tell LSAPs that they can start sending data */ + irlmp_do_all_lsap_event(self->lsaps, LM_LAP_CONNECT_CONFIRM); + + /* Note : by the time we get there (LAP retries and co), + * the lsaps may already have gone. 
This avoid getting stuck + * forever in LAP_ACTIVE state - Jean II */ + if (HASHBIN_GET_SIZE(self->lsaps) == 0) { + IRDA_DEBUG(0, "%s() NO LSAPs !\n", __func__); + irlmp_start_idle_timer(self, LM_IDLE_TIMEOUT); + } + break; + case LM_LAP_DISCONNECT_INDICATION: + IRDA_DEBUG(4, "%s(), LM_LAP_DISCONNECT_INDICATION\n", __func__); + irlmp_next_lap_state(self, LAP_STANDBY); + + /* Send disconnect event to all LSAPs using this link */ + irlmp_do_all_lsap_event(self->lsaps, + LM_LAP_DISCONNECT_INDICATION); + break; + case LM_LAP_DISCONNECT_REQUEST: + IRDA_DEBUG(4, "%s(), LM_LAP_DISCONNECT_REQUEST\n", __func__); + + /* One of the LSAP did timeout or was closed, if it was + * the last one, try to get out of here - Jean II */ + if (HASHBIN_GET_SIZE(self->lsaps) <= 1) { + irlap_disconnect_request(self->irlap); + } + break; + default: + IRDA_DEBUG(0, "%s(), Unknown event %s\n", + __func__, irlmp_event[event]); + break; + } +} + +/* + * Function irlmp_state_active (event, skb, info) + * + * ACTIVE, IrLAP connection is active + * + */ +static void irlmp_state_active(struct lap_cb *self, IRLMP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + + switch (event) { + case LM_LAP_CONNECT_REQUEST: + IRDA_DEBUG(4, "%s(), LS_CONNECT_REQUEST\n", __func__); + + /* + * IrLAP may have a pending disconnect. We tried to close + * IrLAP, but it was postponed because the link was + * busy or we were still sending packets. As we now + * need it, make sure it stays on. Jean II + */ + irlap_clear_disconnect(self->irlap); + + /* + * LAP connection already active, just bounce back! Since we + * don't know which LSAP that tried to do this, we have to + * notify all LSAPs using this LAP, but that should be safe to + * do anyway. + */ + irlmp_do_all_lsap_event(self->lsaps, LM_LAP_CONNECT_CONFIRM); + + /* Needed by connect indication */ + irlmp_do_all_lsap_event(irlmp->unconnected_lsaps, + LM_LAP_CONNECT_CONFIRM); + /* Keep state */ + break; + case LM_LAP_DISCONNECT_REQUEST: + /* + * Need to find out if we should close IrLAP or not. If there + * is only one LSAP connection left on this link, that LSAP + * must be the one that tries to close IrLAP. It will be + * removed later and moved to the list of unconnected LSAPs + */ + if (HASHBIN_GET_SIZE(self->lsaps) > 0) { + /* Timer value is checked in irsysctl - Jean II */ + irlmp_start_idle_timer(self, sysctl_lap_keepalive_time * HZ / 1000); + } else { + /* No more connections, so close IrLAP */ + + /* We don't want to change state just yet, because + * we want to reflect accurately the real state of + * the LAP, not the state we wish it was in, + * so that we don't lose LM_LAP_CONNECT_REQUEST. + * In some cases, IrLAP won't close the LAP + * immediately. For example, it might still be + * retrying packets or waiting for the pf bit. + * As the LAP always send a DISCONNECT_INDICATION + * in PCLOSE or SCLOSE, just change state on that. + * Jean II */ + irlap_disconnect_request(self->irlap); + } + break; + case LM_LAP_IDLE_TIMEOUT: + if (HASHBIN_GET_SIZE(self->lsaps) == 0) { + /* Same reasoning as above - keep state */ + irlap_disconnect_request(self->irlap); + } + break; + case LM_LAP_DISCONNECT_INDICATION: + irlmp_next_lap_state(self, LAP_STANDBY); + + /* In some case, at this point our side has already closed + * all lsaps, and we are waiting for the idle_timer to + * expire. If another device reconnect immediately, the + * idle timer will expire in the midle of the connection + * initialisation, screwing up things a lot... 
+ * Therefore, we must stop the timer... */ + irlmp_stop_idle_timer(self); + + /* + * Inform all connected LSAP's using this link + */ + irlmp_do_all_lsap_event(self->lsaps, + LM_LAP_DISCONNECT_INDICATION); + + /* Force an expiry of the discovery log. + * Now that the LAP is free, the system may attempt to + * connect to another device. Unfortunately, our entries + * are stale. There is a small window (<3s) before the + * normal discovery will run and where irlmp_connect_request() + * can get the wrong info, so make sure things get + * cleaned *NOW* ;-) - Jean II */ + irlmp_do_expiry(); + break; + default: + IRDA_DEBUG(0, "%s(), Unknown event %s\n", + __func__, irlmp_event[event]); + break; + } +} + +/********************************************************************* + * + * LSAP connection control states + * + ********************************************************************/ + +/* + * Function irlmp_state_disconnected (event, skb, info) + * + * DISCONNECTED + * + */ +static int irlmp_state_disconnected(struct lsap_cb *self, IRLMP_EVENT event, + struct sk_buff *skb) +{ + int ret = 0; + + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;); + + switch (event) { +#ifdef CONFIG_IRDA_ULTRA + case LM_UDATA_INDICATION: + /* This is most bizarre. Those packets are aka unreliable + * connected, aka IrLPT or SOCK_DGRAM/IRDAPROTO_UNITDATA. + * Why do we pass them as Ultra ??? Jean II */ + irlmp_connless_data_indication(self, skb); + break; +#endif /* CONFIG_IRDA_ULTRA */ + case LM_CONNECT_REQUEST: + IRDA_DEBUG(4, "%s(), LM_CONNECT_REQUEST\n", __func__); + + if (self->conn_skb) { + IRDA_WARNING("%s: busy with another request!\n", + __func__); + return -EBUSY; + } + /* Don't forget to refcount it (see irlmp_connect_request()) */ + skb_get(skb); + self->conn_skb = skb; + + irlmp_next_lsap_state(self, LSAP_SETUP_PEND); + + /* Start watchdog timer (5 secs for now) */ + irlmp_start_watchdog_timer(self, 5*HZ); + + irlmp_do_lap_event(self->lap, LM_LAP_CONNECT_REQUEST, NULL); + break; + case LM_CONNECT_INDICATION: + if (self->conn_skb) { + IRDA_WARNING("%s: busy with another request!\n", + __func__); + return -EBUSY; + } + /* Don't forget to refcount it (see irlap_driver_rcv()) */ + skb_get(skb); + self->conn_skb = skb; + + irlmp_next_lsap_state(self, LSAP_CONNECT_PEND); + + /* Start watchdog timer + * This is not mentionned in the spec, but there is a rare + * race condition that can get the socket stuck. + * If we receive this event while our LAP is closing down, + * the LM_LAP_CONNECT_REQUEST get lost and we get stuck in + * CONNECT_PEND state forever. + * The other cause of getting stuck down there is if the + * higher layer never reply to the CONNECT_INDICATION. + * Anyway, it make sense to make sure that we always have + * a backup plan. 1 second is plenty (should be immediate). 
+ * Jean II */ + irlmp_start_watchdog_timer(self, 1*HZ); + + irlmp_do_lap_event(self->lap, LM_LAP_CONNECT_REQUEST, NULL); + break; + default: + IRDA_DEBUG(1, "%s(), Unknown event %s on LSAP %#02x\n", + __func__, irlmp_event[event], self->slsap_sel); + break; + } + return ret; +} + +/* + * Function irlmp_state_connect (self, event, skb) + * + * CONNECT + * + */ +static int irlmp_state_connect(struct lsap_cb *self, IRLMP_EVENT event, + struct sk_buff *skb) +{ + struct lsap_cb *lsap; + int ret = 0; + + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;); + + switch (event) { + case LM_CONNECT_RESPONSE: + /* + * Bind this LSAP to the IrLAP link where the connect was + * received + */ + lsap = hashbin_remove(irlmp->unconnected_lsaps, (long) self, + NULL); + + IRDA_ASSERT(lsap == self, return -1;); + IRDA_ASSERT(self->lap != NULL, return -1;); + IRDA_ASSERT(self->lap->lsaps != NULL, return -1;); + + hashbin_insert(self->lap->lsaps, (irda_queue_t *) self, + (long) self, NULL); + + set_bit(0, &self->connected); /* TRUE */ + + irlmp_send_lcf_pdu(self->lap, self->dlsap_sel, + self->slsap_sel, CONNECT_CNF, skb); + + del_timer(&self->watchdog_timer); + + irlmp_next_lsap_state(self, LSAP_DATA_TRANSFER_READY); + break; + case LM_WATCHDOG_TIMEOUT: + /* May happen, who knows... + * Jean II */ + IRDA_DEBUG(0, "%s() WATCHDOG_TIMEOUT!\n", __func__); + + /* Disconnect, get out... - Jean II */ + self->lap = NULL; + self->dlsap_sel = LSAP_ANY; + irlmp_next_lsap_state(self, LSAP_DISCONNECTED); + break; + default: + /* LM_LAP_DISCONNECT_INDICATION : Should never happen, we + * are *not* yet bound to the IrLAP link. Jean II */ + IRDA_DEBUG(0, "%s(), Unknown event %s on LSAP %#02x\n", + __func__, irlmp_event[event], self->slsap_sel); + break; + } + return ret; +} + +/* + * Function irlmp_state_connect_pend (event, skb, info) + * + * CONNECT_PEND + * + */ +static int irlmp_state_connect_pend(struct lsap_cb *self, IRLMP_EVENT event, + struct sk_buff *skb) +{ + struct sk_buff *tx_skb; + int ret = 0; + + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;); + + switch (event) { + case LM_CONNECT_REQUEST: + /* Keep state */ + break; + case LM_CONNECT_RESPONSE: + IRDA_DEBUG(0, "%s(), LM_CONNECT_RESPONSE, " + "no indication issued yet\n", __func__); + /* Keep state */ + break; + case LM_DISCONNECT_REQUEST: + IRDA_DEBUG(0, "%s(), LM_DISCONNECT_REQUEST, " + "not yet bound to IrLAP connection\n", __func__); + /* Keep state */ + break; + case LM_LAP_CONNECT_CONFIRM: + IRDA_DEBUG(4, "%s(), LS_CONNECT_CONFIRM\n", __func__); + irlmp_next_lsap_state(self, LSAP_CONNECT); + + tx_skb = self->conn_skb; + self->conn_skb = NULL; + + irlmp_connect_indication(self, tx_skb); + /* Drop reference count - see irlmp_connect_indication(). */ + dev_kfree_skb(tx_skb); + break; + case LM_WATCHDOG_TIMEOUT: + /* Will happen in some rare cases because of a race condition. + * Just make sure we don't stay there forever... + * Jean II */ + IRDA_DEBUG(0, "%s() WATCHDOG_TIMEOUT!\n", __func__); + + /* Go back to disconnected mode, keep the socket waiting */ + self->lap = NULL; + self->dlsap_sel = LSAP_ANY; + if(self->conn_skb) + dev_kfree_skb(self->conn_skb); + self->conn_skb = NULL; + irlmp_next_lsap_state(self, LSAP_DISCONNECTED); + break; + default: + /* LM_LAP_DISCONNECT_INDICATION : Should never happen, we + * are *not* yet bound to the IrLAP link. 
Jean II */ + IRDA_DEBUG(0, "%s(), Unknown event %s on LSAP %#02x\n", + __func__, irlmp_event[event], self->slsap_sel); + break; + } + return ret; +} + +/* + * Function irlmp_state_dtr (self, event, skb) + * + * DATA_TRANSFER_READY + * + */ +static int irlmp_state_dtr(struct lsap_cb *self, IRLMP_EVENT event, + struct sk_buff *skb) +{ + LM_REASON reason; + int ret = 0; + + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;); + IRDA_ASSERT(self->lap != NULL, return -1;); + + switch (event) { + case LM_DATA_REQUEST: /* Optimize for the common case */ + irlmp_send_data_pdu(self->lap, self->dlsap_sel, + self->slsap_sel, FALSE, skb); + break; + case LM_DATA_INDICATION: /* Optimize for the common case */ + irlmp_data_indication(self, skb); + break; + case LM_UDATA_REQUEST: + IRDA_ASSERT(skb != NULL, return -1;); + irlmp_send_data_pdu(self->lap, self->dlsap_sel, + self->slsap_sel, TRUE, skb); + break; + case LM_UDATA_INDICATION: + irlmp_udata_indication(self, skb); + break; + case LM_CONNECT_REQUEST: + IRDA_DEBUG(0, "%s(), LM_CONNECT_REQUEST, " + "error, LSAP already connected\n", __func__); + /* Keep state */ + break; + case LM_CONNECT_RESPONSE: + IRDA_DEBUG(0, "%s(), LM_CONNECT_RESPONSE, " + "error, LSAP already connected\n", __func__); + /* Keep state */ + break; + case LM_DISCONNECT_REQUEST: + irlmp_send_lcf_pdu(self->lap, self->dlsap_sel, self->slsap_sel, + DISCONNECT, skb); + irlmp_next_lsap_state(self, LSAP_DISCONNECTED); + /* Called only from irlmp_disconnect_request(), will + * unbind from LAP over there. Jean II */ + + /* Try to close the LAP connection if its still there */ + if (self->lap) { + IRDA_DEBUG(4, "%s(), trying to close IrLAP\n", + __func__); + irlmp_do_lap_event(self->lap, + LM_LAP_DISCONNECT_REQUEST, + NULL); + } + break; + case LM_LAP_DISCONNECT_INDICATION: + irlmp_next_lsap_state(self, LSAP_DISCONNECTED); + + reason = irlmp_convert_lap_reason(self->lap->reason); + + irlmp_disconnect_indication(self, reason, NULL); + break; + case LM_DISCONNECT_INDICATION: + irlmp_next_lsap_state(self, LSAP_DISCONNECTED); + + IRDA_ASSERT(self->lap != NULL, return -1;); + IRDA_ASSERT(self->lap->magic == LMP_LAP_MAGIC, return -1;); + + IRDA_ASSERT(skb != NULL, return -1;); + IRDA_ASSERT(skb->len > 3, return -1;); + reason = skb->data[3]; + + /* Try to close the LAP connection */ + IRDA_DEBUG(4, "%s(), trying to close IrLAP\n", __func__); + irlmp_do_lap_event(self->lap, LM_LAP_DISCONNECT_REQUEST, NULL); + + irlmp_disconnect_indication(self, reason, skb); + break; + default: + IRDA_DEBUG(0, "%s(), Unknown event %s on LSAP %#02x\n", + __func__, irlmp_event[event], self->slsap_sel); + break; + } + return ret; +} + +/* + * Function irlmp_state_setup (event, skb, info) + * + * SETUP, Station Control has set up the underlying IrLAP connection. + * An LSAP connection request has been transmitted to the peer + * LSAP-Connection Control FSM and we are awaiting reply. 
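Editor's aside: the conn_skb handling above (skb_get() before parking the buffer, dev_kfree_skb() on every path that later consumes or abandons it) is plain reference counting across an asynchronous step. Below is a tiny stand-alone illustration with a made-up buf type; it is not the kernel skb API.

/* Editor's sketch (not part of the patch): hold an extra reference on a
 * buffer parked across an asynchronous step, drop it when the step
 * completes, so the buffer outlives the caller's own reference. */
#include <stdio.h>
#include <stdlib.h>

struct buf { int refs; };

static struct buf *buf_get(struct buf *b) { b->refs++; return b; }

static void buf_put(struct buf *b)
{
	if (--b->refs == 0) {
		printf("buffer freed\n");
		free(b);
	}
}

struct pending { struct buf *conn_buf; };    /* analogue of self->conn_skb */

static void request(struct pending *p, struct buf *b)
{
	/* Take our own reference before parking the buffer: the caller
	 * drops its reference once this call returns. */
	p->conn_buf = buf_get(b);
}

static void complete(struct pending *p)
{
	/* Consume the parked buffer and drop the reference we took. */
	buf_put(p->conn_buf);
	p->conn_buf = NULL;
}

int main(void)
{
	struct buf *b = malloc(sizeof(*b));
	struct pending p = { NULL };

	b->refs = 1;          /* caller's reference */
	request(&p, b);
	buf_put(b);           /* caller is done with it... */
	complete(&p);         /* ...but the parked reference kept it alive */
	return 0;
}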
+ */ +static int irlmp_state_setup(struct lsap_cb *self, IRLMP_EVENT event, + struct sk_buff *skb) +{ + LM_REASON reason; + int ret = 0; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;); + + IRDA_DEBUG(4, "%s()\n", __func__); + + switch (event) { + case LM_CONNECT_CONFIRM: + irlmp_next_lsap_state(self, LSAP_DATA_TRANSFER_READY); + + del_timer(&self->watchdog_timer); + + irlmp_connect_confirm(self, skb); + break; + case LM_DISCONNECT_INDICATION: + irlmp_next_lsap_state(self, LSAP_DISCONNECTED); + + IRDA_ASSERT(self->lap != NULL, return -1;); + IRDA_ASSERT(self->lap->magic == LMP_LAP_MAGIC, return -1;); + + IRDA_ASSERT(skb != NULL, return -1;); + IRDA_ASSERT(skb->len > 3, return -1;); + reason = skb->data[3]; + + /* Try to close the LAP connection */ + IRDA_DEBUG(4, "%s(), trying to close IrLAP\n", __func__); + irlmp_do_lap_event(self->lap, LM_LAP_DISCONNECT_REQUEST, NULL); + + irlmp_disconnect_indication(self, reason, skb); + break; + case LM_LAP_DISCONNECT_INDICATION: + irlmp_next_lsap_state(self, LSAP_DISCONNECTED); + + del_timer(&self->watchdog_timer); + + IRDA_ASSERT(self->lap != NULL, return -1;); + IRDA_ASSERT(self->lap->magic == LMP_LAP_MAGIC, return -1;); + + reason = irlmp_convert_lap_reason(self->lap->reason); + + irlmp_disconnect_indication(self, reason, skb); + break; + case LM_WATCHDOG_TIMEOUT: + IRDA_DEBUG(0, "%s() WATCHDOG_TIMEOUT!\n", __func__); + + IRDA_ASSERT(self->lap != NULL, return -1;); + irlmp_do_lap_event(self->lap, LM_LAP_DISCONNECT_REQUEST, NULL); + irlmp_next_lsap_state(self, LSAP_DISCONNECTED); + + irlmp_disconnect_indication(self, LM_CONNECT_FAILURE, NULL); + break; + default: + IRDA_DEBUG(0, "%s(), Unknown event %s on LSAP %#02x\n", + __func__, irlmp_event[event], self->slsap_sel); + break; + } + return ret; +} + +/* + * Function irlmp_state_setup_pend (event, skb, info) + * + * SETUP_PEND, An LM_CONNECT_REQUEST has been received from the service + * user to set up an LSAP connection. A request has been sent to the + * LAP FSM to set up the underlying IrLAP connection, and we + * are awaiting confirm. + */ +static int irlmp_state_setup_pend(struct lsap_cb *self, IRLMP_EVENT event, + struct sk_buff *skb) +{ + struct sk_buff *tx_skb; + LM_REASON reason; + int ret = 0; + + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(irlmp != NULL, return -1;); + + switch (event) { + case LM_LAP_CONNECT_CONFIRM: + IRDA_ASSERT(self->conn_skb != NULL, return -1;); + + tx_skb = self->conn_skb; + self->conn_skb = NULL; + + irlmp_send_lcf_pdu(self->lap, self->dlsap_sel, + self->slsap_sel, CONNECT_CMD, tx_skb); + /* Drop reference count - see irlap_data_request(). 
*/ + dev_kfree_skb(tx_skb); + + irlmp_next_lsap_state(self, LSAP_SETUP); + break; + case LM_WATCHDOG_TIMEOUT: + IRDA_DEBUG(0, "%s() : WATCHDOG_TIMEOUT !\n", __func__); + + IRDA_ASSERT(self->lap != NULL, return -1;); + irlmp_do_lap_event(self->lap, LM_LAP_DISCONNECT_REQUEST, NULL); + irlmp_next_lsap_state(self, LSAP_DISCONNECTED); + + irlmp_disconnect_indication(self, LM_CONNECT_FAILURE, NULL); + break; + case LM_LAP_DISCONNECT_INDICATION: /* LS_Disconnect.indication */ + del_timer( &self->watchdog_timer); + + irlmp_next_lsap_state(self, LSAP_DISCONNECTED); + + reason = irlmp_convert_lap_reason(self->lap->reason); + + irlmp_disconnect_indication(self, reason, NULL); + break; + default: + IRDA_DEBUG(0, "%s(), Unknown event %s on LSAP %#02x\n", + __func__, irlmp_event[event], self->slsap_sel); + break; + } + return ret; +} diff --git a/net/irda/irlmp_frame.c b/net/irda/irlmp_frame.c new file mode 100644 index 00000000..062e63b1 --- /dev/null +++ b/net/irda/irlmp_frame.c @@ -0,0 +1,490 @@ +/********************************************************************* + * + * Filename: irlmp_frame.c + * Version: 0.9 + * Description: IrLMP frame implementation + * Status: Experimental. + * Author: Dag Brattli + * Created at: Tue Aug 19 02:09:59 1997 + * Modified at: Mon Dec 13 13:41:12 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli + * All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
+ * + ********************************************************************/ + +#include +#include + +#include +#include +#include +#include +#include +#include + +static struct lsap_cb *irlmp_find_lsap(struct lap_cb *self, __u8 dlsap, + __u8 slsap, int status, hashbin_t *); + +inline void irlmp_send_data_pdu(struct lap_cb *self, __u8 dlsap, __u8 slsap, + int expedited, struct sk_buff *skb) +{ + skb->data[0] = dlsap; + skb->data[1] = slsap; + + if (expedited) { + IRDA_DEBUG(4, "%s(), sending expedited data\n", __func__); + irlap_data_request(self->irlap, skb, TRUE); + } else + irlap_data_request(self->irlap, skb, FALSE); +} + +/* + * Function irlmp_send_lcf_pdu (dlsap, slsap, opcode,skb) + * + * Send Link Control Frame to IrLAP + */ +void irlmp_send_lcf_pdu(struct lap_cb *self, __u8 dlsap, __u8 slsap, + __u8 opcode, struct sk_buff *skb) +{ + __u8 *frame; + + IRDA_DEBUG(2, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + frame = skb->data; + + frame[0] = dlsap | CONTROL_BIT; + frame[1] = slsap; + + frame[2] = opcode; + + if (opcode == DISCONNECT) + frame[3] = 0x01; /* Service user request */ + else + frame[3] = 0x00; /* rsvd */ + + irlap_data_request(self->irlap, skb, FALSE); +} + +/* + * Function irlmp_input (skb) + * + * Used by IrLAP to pass received data frames to IrLMP layer + * + */ +void irlmp_link_data_indication(struct lap_cb *self, struct sk_buff *skb, + int unreliable) +{ + struct lsap_cb *lsap; + __u8 slsap_sel; /* Source (this) LSAP address */ + __u8 dlsap_sel; /* Destination LSAP address */ + __u8 *fp; + + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;); + IRDA_ASSERT(skb->len > 2, return;); + + fp = skb->data; + + /* + * The next statements may be confusing, but we do this so that + * destination LSAP of received frame is source LSAP in our view + */ + slsap_sel = fp[0] & LSAP_MASK; + dlsap_sel = fp[1]; + + /* + * Check if this is an incoming connection, since we must deal with + * it in a different way than other established connections. + */ + if ((fp[0] & CONTROL_BIT) && (fp[2] == CONNECT_CMD)) { + IRDA_DEBUG(3, "%s(), incoming connection, " + "source LSAP=%d, dest LSAP=%d\n", + __func__, slsap_sel, dlsap_sel); + + /* Try to find LSAP among the unconnected LSAPs */ + lsap = irlmp_find_lsap(self, dlsap_sel, slsap_sel, CONNECT_CMD, + irlmp->unconnected_lsaps); + + /* Maybe LSAP was already connected, so try one more time */ + if (!lsap) { + IRDA_DEBUG(1, "%s(), incoming connection for LSAP already connected\n", __func__); + lsap = irlmp_find_lsap(self, dlsap_sel, slsap_sel, 0, + self->lsaps); + } + } else + lsap = irlmp_find_lsap(self, dlsap_sel, slsap_sel, 0, + self->lsaps); + + if (lsap == NULL) { + IRDA_DEBUG(2, "IrLMP, Sorry, no LSAP for received frame!\n"); + IRDA_DEBUG(2, "%s(), slsap_sel = %02x, dlsap_sel = %02x\n", + __func__, slsap_sel, dlsap_sel); + if (fp[0] & CONTROL_BIT) { + IRDA_DEBUG(2, "%s(), received control frame %02x\n", + __func__, fp[2]); + } else { + IRDA_DEBUG(2, "%s(), received data frame\n", __func__); + } + return; + } + + /* + * Check if we received a control frame? 
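Editor's aside: a small user-space sketch of the header layout handled by irlmp_send_data_pdu() / irlmp_send_lcf_pdu() and parsed in irlmp_link_data_indication() above. The *_SKETCH constants mirror the idea but are not taken from the kernel headers, and the opcode value is a stand-in.

/* Editor's sketch (not part of the patch): building and parsing the
 * two-byte data header and four-byte link-control header. */
#include <stdio.h>
#include <stdint.h>

#define CONTROL_BIT_SKETCH 0x80
#define LSAP_MASK_SKETCH   0x7f
#define CONNECT_CMD_SKETCH 0x01     /* illustrative opcode value */

/* Data PDU: [dlsap][slsap][payload...] */
static void build_data_hdr(uint8_t *frame, uint8_t dlsap, uint8_t slsap)
{
	frame[0] = dlsap;
	frame[1] = slsap;
}

/* Link control PDU: [dlsap | CONTROL][slsap][opcode][parameter] */
static void build_lcf_hdr(uint8_t *frame, uint8_t dlsap, uint8_t slsap,
			  uint8_t opcode, uint8_t param)
{
	frame[0] = dlsap | CONTROL_BIT_SKETCH;
	frame[1] = slsap;
	frame[2] = opcode;
	frame[3] = param;
}

int main(void)
{
	uint8_t ctrl[4], data[2];

	build_lcf_hdr(ctrl, 0x2a, 0x10, CONNECT_CMD_SKETCH, 0x00);
	build_data_hdr(data, 0x2a, 0x10);

	/* Receive side: swap the point of view, as the comment in
	 * irlmp_link_data_indication() explains -- the sender's dest
	 * LSAP is our source LSAP. */
	uint8_t slsap_sel = ctrl[0] & LSAP_MASK_SKETCH;
	uint8_t dlsap_sel = ctrl[1];

	printf("control? %s  opcode %#x  slsap %#x  dlsap %#x\n",
	       (ctrl[0] & CONTROL_BIT_SKETCH) ? "yes" : "no",
	       ctrl[2], slsap_sel, dlsap_sel);
	printf("data frame: dlsap %#x slsap %#x\n", data[0], data[1]);
	return 0;
}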
+ */ + if (fp[0] & CONTROL_BIT) { + switch (fp[2]) { + case CONNECT_CMD: + lsap->lap = self; + irlmp_do_lsap_event(lsap, LM_CONNECT_INDICATION, skb); + break; + case CONNECT_CNF: + irlmp_do_lsap_event(lsap, LM_CONNECT_CONFIRM, skb); + break; + case DISCONNECT: + IRDA_DEBUG(4, "%s(), Disconnect indication!\n", + __func__); + irlmp_do_lsap_event(lsap, LM_DISCONNECT_INDICATION, + skb); + break; + case ACCESSMODE_CMD: + IRDA_DEBUG(0, "Access mode cmd not implemented!\n"); + break; + case ACCESSMODE_CNF: + IRDA_DEBUG(0, "Access mode cnf not implemented!\n"); + break; + default: + IRDA_DEBUG(0, "%s(), Unknown control frame %02x\n", + __func__, fp[2]); + break; + } + } else if (unreliable) { + /* Optimize and bypass the state machine if possible */ + if (lsap->lsap_state == LSAP_DATA_TRANSFER_READY) + irlmp_udata_indication(lsap, skb); + else + irlmp_do_lsap_event(lsap, LM_UDATA_INDICATION, skb); + } else { + /* Optimize and bypass the state machine if possible */ + if (lsap->lsap_state == LSAP_DATA_TRANSFER_READY) + irlmp_data_indication(lsap, skb); + else + irlmp_do_lsap_event(lsap, LM_DATA_INDICATION, skb); + } +} + +/* + * Function irlmp_link_unitdata_indication (self, skb) + * + * + * + */ +#ifdef CONFIG_IRDA_ULTRA +void irlmp_link_unitdata_indication(struct lap_cb *self, struct sk_buff *skb) +{ + struct lsap_cb *lsap; + __u8 slsap_sel; /* Source (this) LSAP address */ + __u8 dlsap_sel; /* Destination LSAP address */ + __u8 pid; /* Protocol identifier */ + __u8 *fp; + unsigned long flags; + + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;); + IRDA_ASSERT(skb->len > 2, return;); + + fp = skb->data; + + /* + * The next statements may be confusing, but we do this so that + * destination LSAP of received frame is source LSAP in our view + */ + slsap_sel = fp[0] & LSAP_MASK; + dlsap_sel = fp[1]; + pid = fp[2]; + + if (pid & 0x80) { + IRDA_DEBUG(0, "%s(), extension in PID not supp!\n", + __func__); + return; + } + + /* Check if frame is addressed to the connectionless LSAP */ + if ((slsap_sel != LSAP_CONNLESS) || (dlsap_sel != LSAP_CONNLESS)) { + IRDA_DEBUG(0, "%s(), dropping frame!\n", __func__); + return; + } + + /* Search the connectionless LSAP */ + spin_lock_irqsave(&irlmp->unconnected_lsaps->hb_spinlock, flags); + lsap = (struct lsap_cb *) hashbin_get_first(irlmp->unconnected_lsaps); + while (lsap != NULL) { + /* + * Check if source LSAP and dest LSAP selectors and PID match. 
+ */ + if ((lsap->slsap_sel == slsap_sel) && + (lsap->dlsap_sel == dlsap_sel) && + (lsap->pid == pid)) + { + break; + } + lsap = (struct lsap_cb *) hashbin_get_next(irlmp->unconnected_lsaps); + } + spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock, flags); + + if (lsap) + irlmp_connless_data_indication(lsap, skb); + else { + IRDA_DEBUG(0, "%s(), found no matching LSAP!\n", __func__); + } +} +#endif /* CONFIG_IRDA_ULTRA */ + +/* + * Function irlmp_link_disconnect_indication (reason, userdata) + * + * IrLAP has disconnected + * + */ +void irlmp_link_disconnect_indication(struct lap_cb *lap, + struct irlap_cb *irlap, + LAP_REASON reason, + struct sk_buff *skb) +{ + IRDA_DEBUG(2, "%s()\n", __func__); + + IRDA_ASSERT(lap != NULL, return;); + IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, return;); + + lap->reason = reason; + lap->daddr = DEV_ADDR_ANY; + + /* FIXME: must do something with the skb if any */ + + /* + * Inform station state machine + */ + irlmp_do_lap_event(lap, LM_LAP_DISCONNECT_INDICATION, NULL); +} + +/* + * Function irlmp_link_connect_indication (qos) + * + * Incoming LAP connection! + * + */ +void irlmp_link_connect_indication(struct lap_cb *self, __u32 saddr, + __u32 daddr, struct qos_info *qos, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + + /* Copy QoS settings for this session */ + self->qos = qos; + + /* Update destination device address */ + self->daddr = daddr; + IRDA_ASSERT(self->saddr == saddr, return;); + + irlmp_do_lap_event(self, LM_LAP_CONNECT_INDICATION, skb); +} + +/* + * Function irlmp_link_connect_confirm (qos) + * + * LAP connection confirmed! + * + */ +void irlmp_link_connect_confirm(struct lap_cb *self, struct qos_info *qos, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;); + IRDA_ASSERT(qos != NULL, return;); + + /* Don't need use the skb for now */ + + /* Copy QoS settings for this session */ + self->qos = qos; + + irlmp_do_lap_event(self, LM_LAP_CONNECT_CONFIRM, NULL); +} + +/* + * Function irlmp_link_discovery_indication (self, log) + * + * Device is discovering us + * + * It's not an answer to our own discoveries, just another device trying + * to perform discovery, but we don't want to miss the opportunity + * to exploit this information, because : + * o We may not actively perform discovery (just passive discovery) + * o This type of discovery is much more reliable. In some cases, it + * seem that less than 50% of our discoveries get an answer, while + * we always get ~100% of these. + * o Make faster discovery, statistically divide time of discovery + * events by 2 (important for the latency aspect and user feel) + * o Even is we do active discovery, the other node might not + * answer our discoveries (ex: Palm). The Palm will just perform + * one active discovery and connect directly to us. + * + * However, when both devices discover each other, they might attempt to + * connect to each other following the discovery event, and it would create + * collisions on the medium (SNRM battle). + * The "fix" for that is to disable all connection requests in IrLAP + * for 100ms after a discovery indication by setting the media_busy flag. + * Previously, we used to postpone the event which was quite ugly. Now + * that IrLAP takes care of this problem, just pass the event up... 
+ * + * Jean II + */ +void irlmp_link_discovery_indication(struct lap_cb *self, + discovery_t *discovery) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;); + + /* Add to main log, cleanup */ + irlmp_add_discovery(irlmp->cachelog, discovery); + + /* Just handle it the same way as a discovery confirm, + * bypass the LM_LAP state machine (see below) */ + irlmp_discovery_confirm(irlmp->cachelog, DISCOVERY_PASSIVE); +} + +/* + * Function irlmp_link_discovery_confirm (self, log) + * + * Called by IrLAP with a list of discoveries after the discovery + * request has been carried out. A NULL log is received if IrLAP + * was unable to carry out the discovery request + * + */ +void irlmp_link_discovery_confirm(struct lap_cb *self, hashbin_t *log) +{ + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;); + + /* Add to main log, cleanup */ + irlmp_add_discovery_log(irlmp->cachelog, log); + + /* Propagate event to various LSAPs registered for it. + * We bypass the LM_LAP state machine because + * 1) We do it regardless of the LM_LAP state + * 2) It doesn't affect the LM_LAP state + * 3) Faster, slimer, simpler, ... + * Jean II */ + irlmp_discovery_confirm(irlmp->cachelog, DISCOVERY_ACTIVE); +} + +#ifdef CONFIG_IRDA_CACHE_LAST_LSAP +static inline void irlmp_update_cache(struct lap_cb *lap, + struct lsap_cb *lsap) +{ + /* Prevent concurrent read to get garbage */ + lap->cache.valid = FALSE; + /* Update cache entry */ + lap->cache.dlsap_sel = lsap->dlsap_sel; + lap->cache.slsap_sel = lsap->slsap_sel; + lap->cache.lsap = lsap; + lap->cache.valid = TRUE; +} +#endif + +/* + * Function irlmp_find_handle (self, dlsap_sel, slsap_sel, status, queue) + * + * Find handle associated with destination and source LSAP + * + * Any IrDA connection (LSAP/TSAP) is uniquely identified by + * 3 parameters, the local lsap, the remote lsap and the remote address. + * We may initiate multiple connections to the same remote service + * (they will have different local lsap), a remote device may initiate + * multiple connections to the same local service (they will have + * different remote lsap), or multiple devices may connect to the same + * service and may use the same remote lsap (and they will have + * different remote address). + * So, where is the remote address ? Each LAP connection is made with + * a single remote device, so imply a specific remote address. + * Jean II + */ +static struct lsap_cb *irlmp_find_lsap(struct lap_cb *self, __u8 dlsap_sel, + __u8 slsap_sel, int status, + hashbin_t *queue) +{ + struct lsap_cb *lsap; + unsigned long flags; + + /* + * Optimize for the common case. We assume that the last frame + * received is in the same connection as the last one, so check in + * cache first to avoid the linear search + */ +#ifdef CONFIG_IRDA_CACHE_LAST_LSAP + if ((self->cache.valid) && + (self->cache.slsap_sel == slsap_sel) && + (self->cache.dlsap_sel == dlsap_sel)) + { + return self->cache.lsap; + } +#endif + + spin_lock_irqsave(&queue->hb_spinlock, flags); + + lsap = (struct lsap_cb *) hashbin_get_first(queue); + while (lsap != NULL) { + /* + * If this is an incoming connection, then the destination + * LSAP selector may have been specified as LM_ANY so that + * any client can connect. In that case we only need to check + * if the source LSAP (in our view!) match! 
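Editor's aside: irlmp_find_lsap(), whose matching branches complete just below, checks a one-entry cache first, then scans the queue, treating a listener whose destination selector is LSAP_ANY as a wildcard that gets bound on an incoming connect. A stand-alone sketch of that lookup strategy follows; the names, the fixed-size table and ANY_SEL are stand-ins for the hashbin and kernel types.

/* Editor's sketch (not part of the patch): cached lookup with a wildcard
 * destination for incoming connects, as in irlmp_find_lsap() below. */
#include <stdio.h>

#define ANY_SEL 0xff                        /* stand-in for LSAP_ANY */

struct ep { unsigned char slsap, dlsap; };  /* one endpoint */

static struct ep table[2] = {
	{ 0x10, ANY_SEL },                  /* listening endpoint */
	{ 0x11, 0x23 },                     /* established endpoint */
};
static struct ep *cache;                    /* last endpoint that matched */

static struct ep *find_ep(unsigned char slsap, unsigned char dlsap,
			  int incoming_connect)
{
	/* Fast path: most frames belong to the same conversation as the
	 * previous one. */
	if (cache && cache->slsap == slsap && cache->dlsap == dlsap)
		return cache;

	for (int i = 0; i < 2; i++) {
		struct ep *e = &table[i];

		if (incoming_connect && e->slsap == slsap &&
		    e->dlsap == ANY_SEL) {
			e->dlsap = dlsap;   /* bind the listener to the peer */
			cache = e;
			return e;
		}
		if (e->slsap == slsap && e->dlsap == dlsap) {
			cache = e;
			return e;
		}
	}
	return NULL;
}

int main(void)
{
	struct ep *e = find_ep(0x10, 0x42, 1);  /* incoming connect hits listener */

	printf("matched slsap %#x, now bound to dlsap %#x\n", e->slsap, e->dlsap);
	printf("cached repeat: %s\n", find_ep(0x10, 0x42, 0) == e ? "hit" : "miss");
	return 0;
}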
+ */ + if ((status == CONNECT_CMD) && + (lsap->slsap_sel == slsap_sel) && + (lsap->dlsap_sel == LSAP_ANY)) { + /* This is where the dest lsap sel is set on incoming + * lsaps */ + lsap->dlsap_sel = dlsap_sel; + break; + } + /* + * Check if source LSAP and dest LSAP selectors match. + */ + if ((lsap->slsap_sel == slsap_sel) && + (lsap->dlsap_sel == dlsap_sel)) + break; + + lsap = (struct lsap_cb *) hashbin_get_next(queue); + } +#ifdef CONFIG_IRDA_CACHE_LAST_LSAP + if(lsap) + irlmp_update_cache(self, lsap); +#endif + spin_unlock_irqrestore(&queue->hb_spinlock, flags); + + /* Return what we've found or NULL */ + return lsap; +} diff --git a/net/irda/irmod.c b/net/irda/irmod.c new file mode 100644 index 00000000..303a68d9 --- /dev/null +++ b/net/irda/irmod.c @@ -0,0 +1,211 @@ +/********************************************************************* + * + * Filename: irmod.c + * Version: 0.9 + * Description: IrDA stack main entry points + * Status: Experimental. + * Author: Dag Brattli + * Created at: Mon Dec 15 13:55:39 1997 + * Modified at: Wed Jan 5 15:12:41 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1997, 1999-2000 Dag Brattli, All Rights Reserved. + * Copyright (c) 2000-2004 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +/* + * This file contains the main entry points of the IrDA stack. + * They are in this file and not af_irda.c because some developpers + * are using the IrDA stack without the socket API (compiling out + * af_irda.c). + * Jean II + */ + +#include +#include + +#include +#include /* notify_t */ +#include /* irlap_init */ +#include /* irlmp_init */ +#include /* iriap_init */ +#include /* irttp_init */ +#include /* irda_device_init */ + +/* + * Module parameters + */ +#ifdef CONFIG_IRDA_DEBUG +unsigned int irda_debug = IRDA_DEBUG_LEVEL; +module_param_named(debug, irda_debug, uint, 0); +MODULE_PARM_DESC(debug, "IRDA debugging level"); +EXPORT_SYMBOL(irda_debug); +#endif + +/* Packet type handler. + * Tell the kernel how IrDA packets should be handled. + */ +static struct packet_type irda_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_IRDA), + .func = irlap_driver_rcv, /* Packet type handler irlap_frame.c */ +}; + +/* + * Function irda_notify_init (notify) + * + * Used for initializing the notify structure + * + */ +void irda_notify_init(notify_t *notify) +{ + notify->data_indication = NULL; + notify->udata_indication = NULL; + notify->connect_confirm = NULL; + notify->connect_indication = NULL; + notify->disconnect_indication = NULL; + notify->flow_indication = NULL; + notify->status_indication = NULL; + notify->instance = NULL; + strlcpy(notify->name, "Unknown", sizeof(notify->name)); +} +EXPORT_SYMBOL(irda_notify_init); + +/* + * Function irda_init (void) + * + * Protocol stack initialisation entry point. 
+ * Initialise the various components of the IrDA stack + */ +static int __init irda_init(void) +{ + int ret = 0; + + IRDA_DEBUG(0, "%s()\n", __func__); + + /* Lower layer of the stack */ + irlmp_init(); + irlap_init(); + + /* Driver/dongle support */ + irda_device_init(); + + /* Higher layers of the stack */ + iriap_init(); + irttp_init(); + ret = irsock_init(); + if (ret < 0) + goto out_err_1; + + /* Add IrDA packet type (Start receiving packets) */ + dev_add_pack(&irda_packet_type); + + /* External APIs */ +#ifdef CONFIG_PROC_FS + irda_proc_register(); +#endif +#ifdef CONFIG_SYSCTL + ret = irda_sysctl_register(); + if (ret < 0) + goto out_err_2; +#endif + + ret = irda_nl_register(); + if (ret < 0) + goto out_err_3; + + return 0; + + out_err_3: +#ifdef CONFIG_SYSCTL + irda_sysctl_unregister(); + out_err_2: +#endif +#ifdef CONFIG_PROC_FS + irda_proc_unregister(); +#endif + + /* Remove IrDA packet type (stop receiving packets) */ + dev_remove_pack(&irda_packet_type); + + /* Remove higher layers */ + irsock_cleanup(); + out_err_1: + irttp_cleanup(); + iriap_cleanup(); + + /* Remove lower layers */ + irda_device_cleanup(); + irlap_cleanup(); /* Must be done before irlmp_cleanup()! DB */ + + /* Remove middle layer */ + irlmp_cleanup(); + + + return ret; +} + +/* + * Function irda_cleanup (void) + * + * Protocol stack cleanup/removal entry point. + * Cleanup the various components of the IrDA stack + */ +static void __exit irda_cleanup(void) +{ + /* Remove External APIs */ + irda_nl_unregister(); + +#ifdef CONFIG_SYSCTL + irda_sysctl_unregister(); +#endif +#ifdef CONFIG_PROC_FS + irda_proc_unregister(); +#endif + + /* Remove IrDA packet type (stop receiving packets) */ + dev_remove_pack(&irda_packet_type); + + /* Remove higher layers */ + irsock_cleanup(); + irttp_cleanup(); + iriap_cleanup(); + + /* Remove lower layers */ + irda_device_cleanup(); + irlap_cleanup(); /* Must be done before irlmp_cleanup()! DB */ + + /* Remove middle layer */ + irlmp_cleanup(); +} + +/* + * The IrDA stack must be initialised *before* drivers get initialised, + * and *before* higher protocols (IrLAN/IrCOMM/IrNET) get initialised, + * otherwise bad things will happen (hashbins will be NULL for example). + * Those modules are at module_init()/device_initcall() level. + * + * On the other hand, it needs to be initialised *after* the basic + * networking, the /proc/net filesystem and sysctl module. Those are + * currently initialised in .../init/main.c (before initcalls). + * Also, IrDA drivers needs to be initialised *after* the random number + * generator (main stack and higher layer init don't need it anymore). + * + * Jean II + */ +subsys_initcall(irda_init); +module_exit(irda_cleanup); + +MODULE_AUTHOR("Dag Brattli & Jean Tourrilhes "); +MODULE_DESCRIPTION("The Linux IrDA Protocol Stack"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_IRDA); diff --git a/net/irda/irnet/Kconfig b/net/irda/irnet/Kconfig new file mode 100644 index 00000000..28c557f0 --- /dev/null +++ b/net/irda/irnet/Kconfig @@ -0,0 +1,13 @@ +config IRNET + tristate "IrNET protocol" + depends on IRDA && PPP + help + Say Y here if you want to build support for the IrNET protocol. + To compile it as a module, choose M here: the module will be + called irnet. IrNET is a PPP driver, so you will also need a + working PPP subsystem (driver, daemon and config)... + + IrNET is an alternate way to transfer TCP/IP traffic over IrDA. It + uses synchronous PPP over a set of point to point IrDA sockets. You + can use it between Linux machine or with W2k. 
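Editor's aside: the error handling in irda_init() above follows the usual reverse-order goto ladder, where each label undoes only the steps that had already succeeded before the failing one. A compressed stand-alone illustration with made-up step names:

/* Editor's sketch (not part of the patch): reverse-order error unwinding
 * of the kind used by irda_init() above. */
#include <stdio.h>

static int step_a(void) { printf("A up\n");    return 0; }
static int step_b(void) { printf("B up\n");    return 0; }
static int step_c(void) { printf("C fails\n"); return -1; }

static void undo_a(void) { printf("A down\n"); }
static void undo_b(void) { printf("B down\n"); }

static int init_all(void)
{
	int ret;

	ret = step_a();
	if (ret < 0)
		goto out;
	ret = step_b();
	if (ret < 0)
		goto out_undo_a;
	ret = step_c();
	if (ret < 0)
		goto out_undo_b;
	return 0;

out_undo_b:
	undo_b();                 /* undo only what came up before step_c */
out_undo_a:
	undo_a();
out:
	return ret;
}

int main(void)
{
	printf("init_all() = %d\n", init_all());
	return 0;
}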
+ diff --git a/net/irda/irnet/Makefile b/net/irda/irnet/Makefile new file mode 100644 index 00000000..61c365c8 --- /dev/null +++ b/net/irda/irnet/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux IrDA IrNET protocol layer. +# + +obj-$(CONFIG_IRNET) += irnet.o + +irnet-y := irnet_ppp.o irnet_irda.o diff --git a/net/irda/irnet/irnet.h b/net/irda/irnet/irnet.h new file mode 100644 index 00000000..979ecb24 --- /dev/null +++ b/net/irda/irnet/irnet.h @@ -0,0 +1,528 @@ +/* + * IrNET protocol module : Synchronous PPP over an IrDA socket. + * + * Jean II - HPL `00 - + * + * This file contains definitions and declarations global to the IrNET module, + * all grouped in one place... + * This file is a *private* header, so other modules don't want to know + * what's in there... + * + * Note : as most part of the Linux kernel, this module is available + * under the GNU General Public License (GPL). + */ + +#ifndef IRNET_H +#define IRNET_H + +/************************** DOCUMENTATION ***************************/ +/* + * What is IrNET + * ------------- + * IrNET is a protocol allowing to carry TCP/IP traffic between two + * IrDA peers in an efficient fashion. It is a thin layer, passing PPP + * packets to IrTTP and vice versa. It uses PPP in synchronous mode, + * because IrTTP offer a reliable sequenced packet service (as opposed + * to a byte stream). In fact, you could see IrNET as carrying TCP/IP + * in a IrDA socket, using PPP to provide the glue. + * + * The main difference with traditional PPP over IrCOMM is that we + * avoid the framing and serial emulation which are a performance + * bottleneck. It also allows multipoint communications in a sensible + * fashion. + * + * The main difference with IrLAN is that we use PPP for the link + * management, which is more standard, interoperable and flexible than + * the IrLAN protocol. For example, PPP adds authentication, + * encryption, compression, header compression and automated routing + * setup. And, as IrNET let PPP do the hard work, the implementation + * is much simpler than IrLAN. + * + * The Linux implementation + * ------------------------ + * IrNET is written on top of the Linux-IrDA stack, and interface with + * the generic Linux PPP driver. Because IrNET depend on recent + * changes of the PPP driver interface, IrNET will work only with very + * recent kernel (2.3.99-pre6 and up). + * + * The present implementation offer the following features : + * o simple user interface using pppd + * o efficient implementation (interface directly to PPP and IrTTP) + * o addressing (you can specify the name of the IrNET recipient) + * o multipoint operation (limited by IrLAP specification) + * o information in /proc/net/irda/irnet + * o IrNET events on /dev/irnet (for user space daemon) + * o IrNET daemon (irnetd) to automatically handle incoming requests + * o Windows 2000 compatibility (tested, but need more work) + * Currently missing : + * o Lot's of testing (that's your job) + * o Connection retries (may be too hard to do) + * o Check pppd persist mode + * o User space daemon (to automatically handle incoming requests) + * + * The setup is not currently the most easy, but this should get much + * better when everything will get integrated... + * + * Acknowledgements + * ---------------- + * This module is based on : + * o The PPP driver (ppp_synctty/ppp_generic) by Paul Mackerras + * o The IrLAN protocol (irlan_common/XXX) by Dag Brattli + * o The IrSock interface (af_irda) by Dag Brattli + * o Some other bits from the kernel and my drivers... 
+ * Infinite thanks to those brave souls for providing the infrastructure + * upon which IrNET is built. + * + * Thanks to all my colleagues in HP for helping me. In particular, + * thanks to Salil Pradhan and Bill Serra for W2k testing... + * Thanks to Luiz Magalhaes for irnetd and much testing... + * + * Thanks to Alan Cox for answering lot's of my stupid questions, and + * to Paul Mackerras answering my questions on how to best integrate + * IrNET and pppd. + * + * Jean II + * + * Note on some implementations choices... + * ------------------------------------ + * 1) Direct interface vs tty/socket + * I could have used a tty interface to hook to ppp and use the full + * socket API to connect to IrDA. The code would have been easier to + * maintain, and maybe the code would have been smaller... + * Instead, we hook directly to ppp_generic and to IrTTP, which make + * things more complicated... + * + * The first reason is flexibility : this allow us to create IrNET + * instances on demand (no /dev/ircommX crap) and to allow linkname + * specification on pppd command line... + * + * Second reason is speed optimisation. If you look closely at the + * transmit and receive paths, you will notice that they are "super lean" + * (that's why they look ugly), with no function calls and as little data + * copy and modification as I could... + * + * 2) irnetd in user space + * irnetd is implemented in user space, which is necessary to call pppd. + * This also give maximum benefits in term of flexibility and customability, + * and allow to offer the event channel, useful for other stuff like debug. + * + * On the other hand, this require a loose coordination between the + * present module and irnetd. One critical area is how incoming request + * are handled. + * When irnet receive an incoming request, it send an event to irnetd and + * drop the incoming IrNET socket. + * irnetd start a pppd instance, which create a new IrNET socket. This new + * socket is then connected in the originating node to the pppd instance. + * At this point, in the originating node, the first socket is closed. + * + * I admit, this is a bit messy and waste some resources. The alternative + * is caching incoming socket, and that's also quite messy and waste + * resources. + * We also make connection time slower. For example, on a 115 kb/s link it + * adds 60ms to the connection time (770 ms). However, this is slower than + * the time it takes to fire up pppd on my P133... + * + * + * History : + * ------- + * + * v1 - 15.5.00 - Jean II + * o Basic IrNET (hook to ppp_generic & IrTTP - incl. multipoint) + * o control channel on /dev/irnet (set name/address) + * o event channel on /dev/irnet (for user space daemon) + * + * v2 - 5.6.00 - Jean II + * o Enable DROP_NOT_READY to avoid PPP timeouts & other weirdness... + * o Add DISCONNECT_TO event and rename DISCONNECT_FROM. + * o Set official device number alloaction on /dev/irnet + * + * v3 - 30.8.00 - Jean II + * o Update to latest Linux-IrDA changes : + * - queue_t => irda_queue_t + * o Update to ppp-2.4.0 : + * - move irda_irnet_connect from PPPIOCATTACH to TIOCSETD + * o Add EXPIRE event (depend on new IrDA-Linux patch) + * o Switch from `hashbin_remove' to `hashbin_remove_this' to fix + * a multilink bug... (depend on new IrDA-Linux patch) + * o fix a self->daddr to self->raddr in irda_irnet_connect to fix + * another multilink bug (darn !) 
+ * o Remove LINKNAME_IOCTL cruft + * + * v3b - 31.8.00 - Jean II + * o Dump discovery log at event channel startup + * + * v4 - 28.9.00 - Jean II + * o Fix interaction between poll/select and dump discovery log + * o Add IRNET_BLOCKED_LINK event (depend on new IrDA-Linux patch) + * o Add IRNET_NOANSWER_FROM event (mostly to help support) + * o Release flow control in disconnect_indication + * o Block packets while connecting (speed up connections) + * + * v5 - 11.01.01 - Jean II + * o Init self->max_header_size, just in case... + * o Set up ap->chan.hdrlen, to get zero copy on tx side working. + * o avoid tx->ttp->flow->ppp->tx->... loop, by checking flow state + * Thanks to Christian Gennerat for finding this bug ! + * --- + * o Declare the proper MTU/MRU that we can support + * (but PPP doesn't read the MTU value :-() + * o Declare hashbin HB_NOLOCK instead of HB_LOCAL to avoid + * disabling and enabling irq twice + * + * v6 - 31.05.01 - Jean II + * o Print source address in Found, Discovery, Expiry & Request events + * o Print requested source address in /proc/net/irnet + * o Change control channel input. Allow multiple commands in one line. + * o Add saddr command to change ap->rsaddr (and use that in IrDA) + * --- + * o Make the IrDA connection procedure totally asynchronous. + * Heavy rewrite of the IAS query code and the whole connection + * procedure. Now, irnet_connect() no longer need to be called from + * a process context... + * o Enable IrDA connect retries in ppp_irnet_send(). The good thing + * is that IrDA connect retries are directly driven by PPP LCP + * retries (we retry for each LCP packet), so that everything + * is transparently controlled from pppd lcp-max-configure. + * o Add ttp_connect flag to prevent rentry on the connect procedure + * o Test and fixups to eliminate side effects of retries + * + * v7 - 22.08.01 - Jean II + * o Cleanup : Change "saddr = 0x0" to "saddr = DEV_ADDR_ANY" + * o Fix bug in BLOCK_WHEN_CONNECT introduced in v6 : due to the + * asynchronous IAS query, self->tsap is NULL when PPP send the + * first packet. This was preventing "connect-delay 0" to work. + * Change the test in ppp_irnet_send() to self->ttp_connect. + * + * v8 - 1.11.01 - Jean II + * o Tighten the use of self->ttp_connect and self->ttp_open to + * prevent various race conditions. + * o Avoid leaking discovery log and skb + * o Replace "self" with "server" in irnet_connect_indication() to + * better detect cut'n'paste error ;-) + * + * v9 - 29.11.01 - Jean II + * o Fix event generation in disconnect indication that I broke in v8 + * It was always generation "No-Answer" because I was testing ttp_open + * just after clearing it. *blush*. + * o Use newly created irttp_listen() to fix potential crash when LAP + * destroyed before irnet module removed. + * + * v10 - 4.3.2 - Jean II + * o When receiving a disconnect indication, don't reenable the + * PPP Tx queue, this will trigger a reconnect. Instead, close + * the channel, which will kill pppd... + * + * v11 - 20.3.02 - Jean II + * o Oops ! v10 fix disabled IrNET retries and passive behaviour. + * Better fix in irnet_disconnect_indication() : + * - if connected, kill pppd via hangup. + * - if not connected, reenable ppp Tx, which trigger IrNET retry. + * + * v12 - 10.4.02 - Jean II + * o Fix race condition in irnet_connect_indication(). + * If the socket was already trying to connect, drop old connection + * and use new one only if acting as primary. See comments. 
+ *
+ * v13 - 30.5.02 - Jean II
+ *	o Update module init code
+ *
+ * v14 - 20.2.03 - Jean II
+ *	o Add discovery hint bits in the control channel.
+ *	o Remove obsolete MOD_INC/DEC_USE_COUNT in favor of .owner
+ *
+ * v15 - 7.4.03 - Jean II
+ *	o Replace spin_lock_irqsave() with spin_lock_bh() so that we can
+ *	  use ppp_unit_number(). It's probably also better overall...
+ *	o Disable call to ppp_unregister_channel(), because we can't do it.
+ */
+
+/***************************** INCLUDES *****************************/
+
+#include <linux/module.h>
+
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/tty.h>
+#include <linux/proc_fs.h>
+#include <linux/netdevice.h>
+#include <linux/miscdevice.h>
+#include <linux/poll.h>
+#include <linux/capability.h>
+#include <linux/ctype.h>	/* isspace() */
+#include <linux/string.h>	/* skip_spaces() */
+#include <asm/uaccess.h>
+#include <linux/init.h>
+
+#include <linux/ppp_defs.h>
+#include <linux/if_ppp.h>
+#include <linux/ppp_channel.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/iriap.h>
+#include <net/irda/irias_object.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/irttp.h>
+#include <net/irda/discovery.h>
+
+/***************************** OPTIONS *****************************/
+/*
+ * Define or undefine to compile or not some optional part of the
+ * IrNET driver...
+ * Note : the present defaults make sense, play with that at your
+ * own risk...
+ */
+/* IrDA side of the business... */
+#define DISCOVERY_NOMASK	/* To enable W2k compatibility... */
+#define ADVERTISE_HINT		/* Advertise IrLAN hint bit */
+#define ALLOW_SIMULT_CONNECT	/* This seem to work, cross fingers... */
+#define DISCOVERY_EVENTS	/* Query the discovery log to post events */
+#define INITIAL_DISCOVERY	/* Dump current discovery log as events */
+#undef STREAM_COMPAT		/* Not needed - potentially messy */
+#undef CONNECT_INDIC_KICK	/* Might mess IrDA, not needed */
+#undef FAIL_SEND_DISCONNECT	/* Might mess IrDA, not needed */
+#undef PASS_CONNECT_PACKETS	/* Not needed ? Safe */
+#undef MISSING_PPP_API		/* Stuff I wish I could do */
+
+/* PPP side of the business */
+#define BLOCK_WHEN_CONNECT	/* Block packets when connecting */
+#define CONNECT_IN_SEND	/* Retry IrDA connection procedure */
+#undef FLUSH_TO_PPP		/* Not sure about this one, let's play safe */
+#undef SECURE_DEVIRNET	/* Bah... */
+
+/****************************** DEBUG ******************************/
+
+/*
+ * This set of flags enable and disable all the various warning,
+ * error and debug message of this driver.
+ * Each section can be enabled and disabled independently + */ +/* In the PPP part */ +#define DEBUG_CTRL_TRACE 0 /* Control channel */ +#define DEBUG_CTRL_INFO 0 /* various info */ +#define DEBUG_CTRL_ERROR 1 /* problems */ +#define DEBUG_FS_TRACE 0 /* filesystem callbacks */ +#define DEBUG_FS_INFO 0 /* various info */ +#define DEBUG_FS_ERROR 1 /* problems */ +#define DEBUG_PPP_TRACE 0 /* PPP related functions */ +#define DEBUG_PPP_INFO 0 /* various info */ +#define DEBUG_PPP_ERROR 1 /* problems */ +#define DEBUG_MODULE_TRACE 0 /* module insertion/removal */ +#define DEBUG_MODULE_ERROR 1 /* problems */ + +/* In the IrDA part */ +#define DEBUG_IRDA_SR_TRACE 0 /* IRDA subroutines */ +#define DEBUG_IRDA_SR_INFO 0 /* various info */ +#define DEBUG_IRDA_SR_ERROR 1 /* problems */ +#define DEBUG_IRDA_SOCK_TRACE 0 /* IRDA main socket functions */ +#define DEBUG_IRDA_SOCK_INFO 0 /* various info */ +#define DEBUG_IRDA_SOCK_ERROR 1 /* problems */ +#define DEBUG_IRDA_SERV_TRACE 0 /* The IrNET server */ +#define DEBUG_IRDA_SERV_INFO 0 /* various info */ +#define DEBUG_IRDA_SERV_ERROR 1 /* problems */ +#define DEBUG_IRDA_TCB_TRACE 0 /* IRDA IrTTP callbacks */ +#define DEBUG_IRDA_CB_INFO 0 /* various info */ +#define DEBUG_IRDA_CB_ERROR 1 /* problems */ +#define DEBUG_IRDA_OCB_TRACE 0 /* IRDA other callbacks */ +#define DEBUG_IRDA_OCB_INFO 0 /* various info */ +#define DEBUG_IRDA_OCB_ERROR 1 /* problems */ + +#define DEBUG_ASSERT 0 /* Verify all assertions */ + +/* + * These are the macros we are using to actually print the debug + * statements. Don't look at it, it's ugly... + * + * One of the trick is that, as the DEBUG_XXX are constant, the + * compiler will optimise away the if() in all cases. + */ +/* All error messages (will show up in the normal logs) */ +#define DERROR(dbg, format, args...) \ + {if(DEBUG_##dbg) \ + printk(KERN_INFO "irnet: %s(): " format, __func__ , ##args);} + +/* Normal debug message (will show up in /var/log/debug) */ +#define DEBUG(dbg, format, args...) \ + {if(DEBUG_##dbg) \ + printk(KERN_DEBUG "irnet: %s(): " format, __func__ , ##args);} + +/* Entering a function (trace) */ +#define DENTER(dbg, format, args...) \ + {if(DEBUG_##dbg) \ + printk(KERN_DEBUG "irnet: -> %s" format, __func__ , ##args);} + +/* Entering and exiting a function in one go (trace) */ +#define DPASS(dbg, format, args...) \ + {if(DEBUG_##dbg) \ + printk(KERN_DEBUG "irnet: <>%s" format, __func__ , ##args);} + +/* Exiting a function (trace) */ +#define DEXIT(dbg, format, args...) \ + {if(DEBUG_##dbg) \ + printk(KERN_DEBUG "irnet: <-%s()" format, __func__ , ##args);} + +/* Exit a function with debug */ +#define DRETURN(ret, dbg, args...) \ + {DEXIT(dbg, ": " args);\ + return ret; } + +/* Exit a function on failed condition */ +#define DABORT(cond, ret, dbg, args...) \ + {if(cond) {\ + DERROR(dbg, args);\ + return ret; }} + +/* Invalid assertion, print out an error and exit... */ +#define DASSERT(cond, ret, dbg, args...) \ + {if((DEBUG_ASSERT) && !(cond)) {\ + DERROR(dbg, "Invalid assertion: " args);\ + return ret; }} + +/************************ CONSTANTS & MACROS ************************/ + +/* Paranoia */ +#define IRNET_MAGIC 0xB00754 + +/* Number of control events in the control channel buffer... */ +#define IRNET_MAX_EVENTS 8 /* Should be more than enough... */ + +/****************************** TYPES ******************************/ + +/* + * This is the main structure where we store all the data pertaining to + * one instance of irnet. 
+ * Note : in irnet functions, a pointer this structure is usually called + * "ap" or "self". If the code is borrowed from the IrDA stack, it tend + * to be called "self", and if it is borrowed from the PPP driver it is + * "ap". Apart from that, it's exactly the same structure ;-) + */ +typedef struct irnet_socket +{ + /* ------------------- Instance management ------------------- */ + /* We manage a linked list of IrNET socket instances */ + irda_queue_t q; /* Must be first - for hasbin */ + int magic; /* Paranoia */ + + /* --------------------- FileSystem part --------------------- */ + /* "pppd" interact directly with us on a /dev/ file */ + struct file * file; /* File descriptor of this instance */ + /* TTY stuff - to keep "pppd" happy */ + struct ktermios termios; /* Various tty flags */ + /* Stuff for the control channel */ + int event_index; /* Last read in the event log */ + + /* ------------------------- PPP part ------------------------- */ + /* We interface directly to the ppp_generic driver in the kernel */ + int ppp_open; /* registered with ppp_generic */ + struct ppp_channel chan; /* Interface to generic ppp layer */ + + int mru; /* Max size of PPP payload */ + u32 xaccm[8]; /* Asynchronous character map (just */ + u32 raccm; /* to please pppd - dummy) */ + unsigned int flags; /* PPP flags (compression, ...) */ + unsigned int rbits; /* Unused receive flags ??? */ + struct work_struct disconnect_work; /* Process context disconnection */ + /* ------------------------ IrTTP part ------------------------ */ + /* We create a pseudo "socket" over the IrDA tranport */ + unsigned long ttp_open; /* Set when IrTTP is ready */ + unsigned long ttp_connect; /* Set when IrTTP is connecting */ + struct tsap_cb * tsap; /* IrTTP instance (the connection) */ + + char rname[NICKNAME_MAX_LEN + 1]; + /* IrDA nickname of destination */ + __u32 rdaddr; /* Requested peer IrDA address */ + __u32 rsaddr; /* Requested local IrDA address */ + __u32 daddr; /* actual peer IrDA address */ + __u32 saddr; /* my local IrDA address */ + __u8 dtsap_sel; /* Remote TSAP selector */ + __u8 stsap_sel; /* Local TSAP selector */ + + __u32 max_sdu_size_rx;/* Socket parameters used for IrTTP */ + __u32 max_sdu_size_tx; + __u32 max_data_size; + __u8 max_header_size; + LOCAL_FLOW tx_flow; /* State of the Tx path in IrTTP */ + + /* ------------------- IrLMP and IrIAS part ------------------- */ + /* Used for IrDA Discovery and socket name resolution */ + void * ckey; /* IrLMP client handle */ + __u16 mask; /* Hint bits mask (filter discov.)*/ + int nslots; /* Number of slots for discovery */ + + struct iriap_cb * iriap; /* Used to query remote IAS */ + int errno; /* status of the IAS query */ + + /* -------------------- Discovery log part -------------------- */ + /* Used by initial discovery on the control channel + * and by irnet_discover_daddr_and_lsap_sel() */ + struct irda_device_info *discoveries; /* Copy of the discovery log */ + int disco_index; /* Last read in the discovery log */ + int disco_number; /* Size of the discovery log */ + + struct mutex lock; + +} irnet_socket; + +/* + * This is the various event that we will generate on the control channel + */ +typedef enum irnet_event +{ + IRNET_DISCOVER, /* New IrNET node discovered */ + IRNET_EXPIRE, /* IrNET node expired */ + IRNET_CONNECT_TO, /* IrNET socket has connected to other node */ + IRNET_CONNECT_FROM, /* Other node has connected to IrNET socket */ + IRNET_REQUEST_FROM, /* Non satisfied connection request */ + IRNET_NOANSWER_FROM, /* Failed connection 
request */ + IRNET_BLOCKED_LINK, /* Link (IrLAP) is blocked for > 3s */ + IRNET_DISCONNECT_FROM, /* IrNET socket has disconnected */ + IRNET_DISCONNECT_TO /* Closing IrNET socket */ +} irnet_event; + +/* + * This is the storage for an event and its arguments + */ +typedef struct irnet_log +{ + irnet_event event; + int unit; + __u32 saddr; + __u32 daddr; + char name[NICKNAME_MAX_LEN + 1]; /* 21 + 1 */ + __u16_host_order hints; /* Discovery hint bits */ +} irnet_log; + +/* + * This is the storage for all events and related stuff... + */ +typedef struct irnet_ctrl_channel +{ + irnet_log log[IRNET_MAX_EVENTS]; /* Event log */ + int index; /* Current index in log */ + spinlock_t spinlock; /* Serialize access to the event log */ + wait_queue_head_t rwait; /* processes blocked on read (or poll) */ +} irnet_ctrl_channel; + +/**************************** PROTOTYPES ****************************/ +/* + * Global functions of the IrNET module + * Note : we list here also functions called from one file to the other. + */ + +/* -------------------------- IRDA PART -------------------------- */ +extern int + irda_irnet_create(irnet_socket *); /* Initialise a IrNET socket */ +extern int + irda_irnet_connect(irnet_socket *); /* Try to connect over IrDA */ +extern void + irda_irnet_destroy(irnet_socket *); /* Teardown a IrNET socket */ +extern int + irda_irnet_init(void); /* Initialise IrDA part of IrNET */ +extern void + irda_irnet_cleanup(void); /* Teardown IrDA part of IrNET */ + +/**************************** VARIABLES ****************************/ + +/* Control channel stuff - allocated in irnet_irda.h */ +extern struct irnet_ctrl_channel irnet_events; + +#endif /* IRNET_H */ diff --git a/net/irda/irnet/irnet_irda.c b/net/irda/irnet/irnet_irda.c new file mode 100644 index 00000000..7f17a802 --- /dev/null +++ b/net/irda/irnet/irnet_irda.c @@ -0,0 +1,1885 @@ +/* + * IrNET protocol module : Synchronous PPP over an IrDA socket. + * + * Jean II - HPL `00 - + * + * This file implement the IRDA interface of IrNET. + * Basically, we sit on top of IrTTP. We set up IrTTP, IrIAS properly, + * and exchange frames with IrTTP. + */ + +#include "irnet_irda.h" /* Private header */ +#include +#include +#include +#include + +/* + * PPP disconnect work: we need to make sure we're in + * process context when calling ppp_unregister_channel(). + */ +static void irnet_ppp_disconnect(struct work_struct *work) +{ + irnet_socket * self = + container_of(work, irnet_socket, disconnect_work); + + if (self == NULL) + return; + /* + * If we were connected, cleanup & close the PPP + * channel, which will kill pppd (hangup) and the rest. + */ + if (self->ppp_open && !self->ttp_open && !self->ttp_connect) { + ppp_unregister_channel(&self->chan); + self->ppp_open = 0; + } +} + +/************************* CONTROL CHANNEL *************************/ +/* + * When ppp is not active, /dev/irnet act as a control channel. + * Writing allow to set up the IrDA destination of the IrNET channel, + * and any application may be read events happening on IrNET... + */ + +/*------------------------------------------------------------------*/ +/* + * Post an event to the control channel... + * Put the event in the log, and then wait all process blocked on read + * so they can read the log... 
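+ *
+ * For illustration only, a sketch mirroring the calls made further
+ * down in this file (e.g. irnet_disconnect_server()) :
+ *
+ *	irnet_post_event(NULL, IRNET_REQUEST_FROM,
+ *			 self->saddr, self->daddr, self->rname, 0);
+ *
+ * This appends one irnet_log entry to irnet_events.log[], advances
+ * irnet_events.index, and wakes any process sleeping in a read (or
+ * poll) on /dev/irnet via irnet_events.rwait.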
+ */ +static void +irnet_post_event(irnet_socket * ap, + irnet_event event, + __u32 saddr, + __u32 daddr, + char * name, + __u16 hints) +{ + int index; /* In the log */ + + DENTER(CTRL_TRACE, "(ap=0x%p, event=%d, daddr=%08x, name=``%s'')\n", + ap, event, daddr, name); + + /* Protect this section via spinlock. + * Note : as we are the only event producer, we only need to exclude + * ourself when touching the log, which is nice and easy. + */ + spin_lock_bh(&irnet_events.spinlock); + + /* Copy the event in the log */ + index = irnet_events.index; + irnet_events.log[index].event = event; + irnet_events.log[index].daddr = daddr; + irnet_events.log[index].saddr = saddr; + /* Try to copy IrDA nickname */ + if(name) + strcpy(irnet_events.log[index].name, name); + else + irnet_events.log[index].name[0] = '\0'; + /* Copy hints */ + irnet_events.log[index].hints.word = hints; + /* Try to get ppp unit number */ + if((ap != (irnet_socket *) NULL) && (ap->ppp_open)) + irnet_events.log[index].unit = ppp_unit_number(&ap->chan); + else + irnet_events.log[index].unit = -1; + + /* Increment the index + * Note that we increment the index only after the event is written, + * to make sure that the readers don't get garbage... */ + irnet_events.index = (index + 1) % IRNET_MAX_EVENTS; + + DEBUG(CTRL_INFO, "New event index is %d\n", irnet_events.index); + + /* Spin lock end */ + spin_unlock_bh(&irnet_events.spinlock); + + /* Now : wake up everybody waiting for events... */ + wake_up_interruptible_all(&irnet_events.rwait); + + DEXIT(CTRL_TRACE, "\n"); +} + +/************************* IRDA SUBROUTINES *************************/ +/* + * These are a bunch of subroutines called from other functions + * down there, mostly common code or to improve readability... + * + * Note : we duplicate quite heavily some routines of af_irda.c, + * because our input structure (self) is quite different + * (struct irnet instead of struct irda_sock), which make sharing + * the same code impossible (at least, without templates). + */ + +/*------------------------------------------------------------------*/ +/* + * Function irda_open_tsap (self) + * + * Open local Transport Service Access Point (TSAP) + * + * Create a IrTTP instance for us and set all the IrTTP callbacks. 
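+ *
+ * Note (added as a reading aid) : the TSAP is opened on LSAP_ANY, so
+ * IrTTP picks whatever selector is free and we only learn it afterwards
+ * in self->stsap_sel. The server side then publishes that selector in
+ * LM-IAS (see irnet_setup_server()), which is what the peer's IAS query
+ * in irnet_find_lsap_sel() reads to know where to connect.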
+ */ +static inline int +irnet_open_tsap(irnet_socket * self) +{ + notify_t notify; /* Callback structure */ + + DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self); + + DABORT(self->tsap != NULL, -EBUSY, IRDA_SR_ERROR, "Already busy !\n"); + + /* Initialize IrTTP callbacks to be used by the IrDA stack */ + irda_notify_init(¬ify); + notify.connect_confirm = irnet_connect_confirm; + notify.connect_indication = irnet_connect_indication; + notify.disconnect_indication = irnet_disconnect_indication; + notify.data_indication = irnet_data_indication; + /*notify.udata_indication = NULL;*/ + notify.flow_indication = irnet_flow_indication; + notify.status_indication = irnet_status_indication; + notify.instance = self; + strlcpy(notify.name, IRNET_NOTIFY_NAME, sizeof(notify.name)); + + /* Open an IrTTP instance */ + self->tsap = irttp_open_tsap(LSAP_ANY, DEFAULT_INITIAL_CREDIT, + ¬ify); + DABORT(self->tsap == NULL, -ENOMEM, + IRDA_SR_ERROR, "Unable to allocate TSAP !\n"); + + /* Remember which TSAP selector we actually got */ + self->stsap_sel = self->tsap->stsap_sel; + + DEXIT(IRDA_SR_TRACE, " - tsap=0x%p, sel=0x%X\n", + self->tsap, self->stsap_sel); + return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_ias_to_tsap (self, result, value) + * + * Examine an IAS object and extract TSAP + * + * We do an IAP query to find the TSAP associated with the IrNET service. + * When IrIAP pass us the result of the query, this function look at + * the return values to check for failures and extract the TSAP if + * possible. + * Also deallocate value + * The failure is in self->errno + * Return TSAP or -1 + */ +static inline __u8 +irnet_ias_to_tsap(irnet_socket * self, + int result, + struct ias_value * value) +{ + __u8 dtsap_sel = 0; /* TSAP we are looking for */ + + DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self); + + /* By default, no error */ + self->errno = 0; + + /* Check if request succeeded */ + switch(result) + { + /* Standard errors : service not available */ + case IAS_CLASS_UNKNOWN: + case IAS_ATTRIB_UNKNOWN: + DEBUG(IRDA_SR_INFO, "IAS object doesn't exist ! (%d)\n", result); + self->errno = -EADDRNOTAVAIL; + break; + + /* Other errors, most likely IrDA stack failure */ + default : + DEBUG(IRDA_SR_INFO, "IAS query failed ! (%d)\n", result); + self->errno = -EHOSTUNREACH; + break; + + /* Success : we got what we wanted */ + case IAS_SUCCESS: + break; + } + + /* Check what was returned to us */ + if(value != NULL) + { + /* What type of argument have we got ? */ + switch(value->type) + { + case IAS_INTEGER: + DEBUG(IRDA_SR_INFO, "result=%d\n", value->t.integer); + if(value->t.integer != -1) + /* Get the remote TSAP selector */ + dtsap_sel = value->t.integer; + else + self->errno = -EADDRNOTAVAIL; + break; + default: + self->errno = -EADDRNOTAVAIL; + DERROR(IRDA_SR_ERROR, "bad type ! (0x%X)\n", value->type); + break; + } + + /* Cleanup */ + irias_delete_value(value); + } + else /* value == NULL */ + { + /* Nothing returned to us - usually result != SUCCESS */ + if(!(self->errno)) + { + DERROR(IRDA_SR_ERROR, + "IrDA bug : result == SUCCESS && value == NULL\n"); + self->errno = -EHOSTUNREACH; + } + } + DEXIT(IRDA_SR_TRACE, "\n"); + + /* Return the TSAP */ + return dtsap_sel; +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_find_lsap_sel (self) + * + * Try to lookup LSAP selector in remote LM-IAS + * + * Basically, we start a IAP query, and then go to sleep. 
When the query + * return, irnet_getvalue_confirm will wake us up, and we can examine the + * result of the query... + * Note that in some case, the query fail even before we go to sleep, + * creating some races... + */ +static inline int +irnet_find_lsap_sel(irnet_socket * self) +{ + DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self); + + /* This should not happen */ + DABORT(self->iriap, -EBUSY, IRDA_SR_ERROR, "busy with a previous query.\n"); + + /* Create an IAP instance, will be closed in irnet_getvalue_confirm() */ + self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self, + irnet_getvalue_confirm); + + /* Treat unexpected signals as disconnect */ + self->errno = -EHOSTUNREACH; + + /* Query remote LM-IAS */ + iriap_getvaluebyclass_request(self->iriap, self->rsaddr, self->daddr, + IRNET_SERVICE_NAME, IRNET_IAS_VALUE); + + /* The above request is non-blocking. + * After a while, IrDA will call us back in irnet_getvalue_confirm() + * We will then call irnet_ias_to_tsap() and finish the + * connection procedure */ + + DEXIT(IRDA_SR_TRACE, "\n"); + return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_connect_tsap (self) + * + * Initialise the TTP socket and initiate TTP connection + * + */ +static inline int +irnet_connect_tsap(irnet_socket * self) +{ + int err; + + DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self); + + /* Open a local TSAP (an IrTTP instance) */ + err = irnet_open_tsap(self); + if(err != 0) + { + clear_bit(0, &self->ttp_connect); + DERROR(IRDA_SR_ERROR, "connect aborted!\n"); + return err; + } + + /* Connect to remote device */ + err = irttp_connect_request(self->tsap, self->dtsap_sel, + self->rsaddr, self->daddr, NULL, + self->max_sdu_size_rx, NULL); + if(err != 0) + { + clear_bit(0, &self->ttp_connect); + DERROR(IRDA_SR_ERROR, "connect aborted!\n"); + return err; + } + + /* The above call is non-blocking. + * After a while, the IrDA stack will either call us back in + * irnet_connect_confirm() or irnet_disconnect_indication() + * See you there ;-) */ + + DEXIT(IRDA_SR_TRACE, "\n"); + return err; +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_discover_next_daddr (self) + * + * Query the IrNET TSAP of the next device in the log. + * + * Used in the TSAP discovery procedure. + */ +static inline int +irnet_discover_next_daddr(irnet_socket * self) +{ + /* Close the last instance of IrIAP, and open a new one. + * We can't reuse the IrIAP instance in the IrIAP callback */ + if(self->iriap) + { + iriap_close(self->iriap); + self->iriap = NULL; + } + /* Create a new IAP instance */ + self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self, + irnet_discovervalue_confirm); + if(self->iriap == NULL) + return -ENOMEM; + + /* Next discovery - before the call to avoid races */ + self->disco_index++; + + /* Check if we have one more address to try */ + if(self->disco_index < self->disco_number) + { + /* Query remote LM-IAS */ + iriap_getvaluebyclass_request(self->iriap, + self->discoveries[self->disco_index].saddr, + self->discoveries[self->disco_index].daddr, + IRNET_SERVICE_NAME, IRNET_IAS_VALUE); + /* The above request is non-blocking. + * After a while, IrDA will call us back in irnet_discovervalue_confirm() + * We will then call irnet_ias_to_tsap() and come back here again... 
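+ *
+ * Reading aid, summarising the asynchronous chain implemented in this
+ * file (this is only a call map of existing code, not new behaviour) :
+ *
+ *	irda_irnet_connect()
+ *	  no name/address : irnet_discover_daddr_and_lsap_sel()
+ *	     -> irnet_discover_next_daddr()	(this function)
+ *	     <- irnet_discovervalue_confirm()	(IrIAS callback)
+ *	  name or address : irnet_dname_to_daddr() / irnet_find_lsap_sel()
+ *	     <- irnet_getvalue_confirm()	(IrIAS callback)
+ *	  then : irnet_connect_tsap()
+ *	     <- irnet_connect_confirm() or irnet_disconnect_indication()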
*/ + return 0; + } + else + return 1; +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_discover_daddr_and_lsap_sel (self) + * + * This try to find a device with the requested service. + * + * Initiate a TSAP discovery procedure. + * It basically look into the discovery log. For each address in the list, + * it queries the LM-IAS of the device to find if this device offer + * the requested service. + * If there is more than one node supporting the service, we complain + * to the user (it should move devices around). + * If we find one node which have the requested TSAP, we connect to it. + * + * This function just start the whole procedure. It request the discovery + * log and submit the first IAS query. + * The bulk of the job is handled in irnet_discovervalue_confirm() + * + * Note : this procedure fails if there is more than one device in range + * on the same dongle, because IrLMP doesn't disconnect the LAP when the + * last LSAP is closed. Moreover, we would need to wait the LAP + * disconnection... + */ +static inline int +irnet_discover_daddr_and_lsap_sel(irnet_socket * self) +{ + int ret; + + DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self); + + /* Ask lmp for the current discovery log */ + self->discoveries = irlmp_get_discoveries(&self->disco_number, self->mask, + DISCOVERY_DEFAULT_SLOTS); + + /* Check if the we got some results */ + if(self->discoveries == NULL) + { + self->disco_number = -1; + clear_bit(0, &self->ttp_connect); + DRETURN(-ENETUNREACH, IRDA_SR_INFO, "No Cachelog...\n"); + } + DEBUG(IRDA_SR_INFO, "Got the log (0x%p), size is %d\n", + self->discoveries, self->disco_number); + + /* Start with the first discovery */ + self->disco_index = -1; + self->daddr = DEV_ADDR_ANY; + + /* This will fail if the log is empty - this is non-blocking */ + ret = irnet_discover_next_daddr(self); + if(ret) + { + /* Close IAP */ + if(self->iriap) + iriap_close(self->iriap); + self->iriap = NULL; + + /* Cleanup our copy of the discovery log */ + kfree(self->discoveries); + self->discoveries = NULL; + + clear_bit(0, &self->ttp_connect); + DRETURN(-ENETUNREACH, IRDA_SR_INFO, "Cachelog empty...\n"); + } + + /* Follow me in irnet_discovervalue_confirm() */ + + DEXIT(IRDA_SR_TRACE, "\n"); + return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_dname_to_daddr (self) + * + * Convert an IrDA nickname to a valid IrDA address + * + * It basically look into the discovery log until there is a match. + */ +static inline int +irnet_dname_to_daddr(irnet_socket * self) +{ + struct irda_device_info *discoveries; /* Copy of the discovery log */ + int number; /* Number of nodes in the log */ + int i; + + DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self); + + /* Ask lmp for the current discovery log */ + discoveries = irlmp_get_discoveries(&number, 0xffff, + DISCOVERY_DEFAULT_SLOTS); + /* Check if the we got some results */ + if(discoveries == NULL) + DRETURN(-ENETUNREACH, IRDA_SR_INFO, "Cachelog empty...\n"); + + /* + * Now, check all discovered devices (if any), and connect + * client only about the services that the client is + * interested in... + */ + for(i = 0; i < number; i++) + { + /* Does the name match ? */ + if(!strncmp(discoveries[i].info, self->rname, NICKNAME_MAX_LEN)) + { + /* Yes !!! Get it.. 
*/ + self->daddr = discoveries[i].daddr; + DEBUG(IRDA_SR_INFO, "discovered device ``%s'' at address 0x%08x.\n", + self->rname, self->daddr); + kfree(discoveries); + DEXIT(IRDA_SR_TRACE, "\n"); + return 0; + } + } + /* No luck ! */ + DEBUG(IRDA_SR_INFO, "cannot discover device ``%s'' !!!\n", self->rname); + kfree(discoveries); + return -EADDRNOTAVAIL; +} + + +/************************* SOCKET ROUTINES *************************/ +/* + * This are the main operations on IrNET sockets, basically to create + * and destroy IrNET sockets. These are called from the PPP part... + */ + +/*------------------------------------------------------------------*/ +/* + * Create a IrNET instance : just initialise some parameters... + */ +int +irda_irnet_create(irnet_socket * self) +{ + DENTER(IRDA_SOCK_TRACE, "(self=0x%p)\n", self); + + self->magic = IRNET_MAGIC; /* Paranoia */ + + self->ttp_open = 0; /* Prevent higher layer from accessing IrTTP */ + self->ttp_connect = 0; /* Not connecting yet */ + self->rname[0] = '\0'; /* May be set via control channel */ + self->rdaddr = DEV_ADDR_ANY; /* May be set via control channel */ + self->rsaddr = DEV_ADDR_ANY; /* May be set via control channel */ + self->daddr = DEV_ADDR_ANY; /* Until we get connected */ + self->saddr = DEV_ADDR_ANY; /* Until we get connected */ + self->max_sdu_size_rx = TTP_SAR_UNBOUND; + + /* Register as a client with IrLMP */ + self->ckey = irlmp_register_client(0, NULL, NULL, NULL); +#ifdef DISCOVERY_NOMASK + self->mask = 0xffff; /* For W2k compatibility */ +#else /* DISCOVERY_NOMASK */ + self->mask = irlmp_service_to_hint(S_LAN); +#endif /* DISCOVERY_NOMASK */ + self->tx_flow = FLOW_START; /* Flow control from IrTTP */ + + INIT_WORK(&self->disconnect_work, irnet_ppp_disconnect); + + DEXIT(IRDA_SOCK_TRACE, "\n"); + return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Connect to the other side : + * o convert device name to an address + * o find the socket number (dlsap) + * o Establish the connection + * + * Note : We no longer mimic af_irda. The IAS query for finding the TSAP + * is done asynchronously, like the TTP connection. This allow us to + * call this function from any context (not only process). + * The downside is that following what's happening in there is tricky + * because it involve various functions all over the place... + */ +int +irda_irnet_connect(irnet_socket * self) +{ + int err; + + DENTER(IRDA_SOCK_TRACE, "(self=0x%p)\n", self); + + /* Check if we are already trying to connect. + * Because irda_irnet_connect() can be called directly by pppd plus + * packet retries in ppp_generic and connect may take time, plus we may + * race with irnet_connect_indication(), we need to be careful there... */ + if(test_and_set_bit(0, &self->ttp_connect)) + DRETURN(-EBUSY, IRDA_SOCK_INFO, "Already connecting...\n"); + if((self->iriap != NULL) || (self->tsap != NULL)) + DERROR(IRDA_SOCK_ERROR, "Socket not cleaned up...\n"); + + /* Insert ourselves in the hashbin so that the IrNET server can find us. + * Notes : 4th arg is string of 32 char max and must be null terminated + * When 4th arg is used (string), 3rd arg isn't (int) + * Can't re-insert (MUST remove first) so check for that... 
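+ * (Added note : the self->q.q_next == NULL test below is that check.
+ * The queue link is non-NULL only while we sit in the server hashbin,
+ * and irda_irnet_destroy() resets it to NULL when it removes us.)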
*/ + if((irnet_server.running) && (self->q.q_next == NULL)) + { + spin_lock_bh(&irnet_server.spinlock); + hashbin_insert(irnet_server.list, (irda_queue_t *) self, 0, self->rname); + spin_unlock_bh(&irnet_server.spinlock); + DEBUG(IRDA_SOCK_INFO, "Inserted ``%s'' in hashbin...\n", self->rname); + } + + /* If we don't have anything (no address, no name) */ + if((self->rdaddr == DEV_ADDR_ANY) && (self->rname[0] == '\0')) + { + /* Try to find a suitable address */ + if((err = irnet_discover_daddr_and_lsap_sel(self)) != 0) + DRETURN(err, IRDA_SOCK_INFO, "auto-connect failed!\n"); + /* In most cases, the call above is non-blocking */ + } + else + { + /* If we have only the name (no address), try to get an address */ + if(self->rdaddr == DEV_ADDR_ANY) + { + if((err = irnet_dname_to_daddr(self)) != 0) + DRETURN(err, IRDA_SOCK_INFO, "name connect failed!\n"); + } + else + /* Use the requested destination address */ + self->daddr = self->rdaddr; + + /* Query remote LM-IAS to find LSAP selector */ + irnet_find_lsap_sel(self); + /* The above call is non blocking */ + } + + /* At this point, we are waiting for the IrDA stack to call us back, + * or we have already failed. + * We will finish the connection procedure in irnet_connect_tsap(). + */ + DEXIT(IRDA_SOCK_TRACE, "\n"); + return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Function irda_irnet_destroy(self) + * + * Destroy irnet instance + * + * Note : this need to be called from a process context. + */ +void +irda_irnet_destroy(irnet_socket * self) +{ + DENTER(IRDA_SOCK_TRACE, "(self=0x%p)\n", self); + if(self == NULL) + return; + + /* Remove ourselves from hashbin (if we are queued in hashbin) + * Note : `irnet_server.running' protect us from calls in hashbin_delete() */ + if((irnet_server.running) && (self->q.q_next != NULL)) + { + struct irnet_socket * entry; + DEBUG(IRDA_SOCK_INFO, "Removing from hash..\n"); + spin_lock_bh(&irnet_server.spinlock); + entry = hashbin_remove_this(irnet_server.list, (irda_queue_t *) self); + self->q.q_next = NULL; + spin_unlock_bh(&irnet_server.spinlock); + DASSERT(entry == self, , IRDA_SOCK_ERROR, "Can't remove from hash.\n"); + } + + /* If we were connected, post a message */ + if(test_bit(0, &self->ttp_open)) + { + /* Note : as the disconnect comes from ppp_generic, the unit number + * doesn't exist anymore when we post the event, so we need to pass + * NULL as the first arg... 
*/ + irnet_post_event(NULL, IRNET_DISCONNECT_TO, + self->saddr, self->daddr, self->rname, 0); + } + + /* Prevent various IrDA callbacks from messing up things + * Need to be first */ + clear_bit(0, &self->ttp_connect); + + /* Prevent higher layer from accessing IrTTP */ + clear_bit(0, &self->ttp_open); + + /* Unregister with IrLMP */ + irlmp_unregister_client(self->ckey); + + /* Unregister with LM-IAS */ + if(self->iriap) + { + iriap_close(self->iriap); + self->iriap = NULL; + } + + /* Cleanup eventual discoveries from connection attempt or control channel */ + if(self->discoveries != NULL) + { + /* Cleanup our copy of the discovery log */ + kfree(self->discoveries); + self->discoveries = NULL; + } + + /* Close our IrTTP connection */ + if(self->tsap) + { + DEBUG(IRDA_SOCK_INFO, "Closing our TTP connection.\n"); + irttp_disconnect_request(self->tsap, NULL, P_NORMAL); + irttp_close_tsap(self->tsap); + self->tsap = NULL; + } + self->stsap_sel = 0; + + DEXIT(IRDA_SOCK_TRACE, "\n"); +} + + +/************************** SERVER SOCKET **************************/ +/* + * The IrNET service is composed of one server socket and a variable + * number of regular IrNET sockets. The server socket is supposed to + * handle incoming connections and redirect them to one IrNET sockets. + * It's a superset of the regular IrNET socket, but has a very distinct + * behaviour... + */ + +/*------------------------------------------------------------------*/ +/* + * Function irnet_daddr_to_dname (self) + * + * Convert an IrDA address to a IrDA nickname + * + * It basically look into the discovery log until there is a match. + */ +static inline int +irnet_daddr_to_dname(irnet_socket * self) +{ + struct irda_device_info *discoveries; /* Copy of the discovery log */ + int number; /* Number of nodes in the log */ + int i; + + DENTER(IRDA_SERV_TRACE, "(self=0x%p)\n", self); + + /* Ask lmp for the current discovery log */ + discoveries = irlmp_get_discoveries(&number, 0xffff, + DISCOVERY_DEFAULT_SLOTS); + /* Check if the we got some results */ + if (discoveries == NULL) + DRETURN(-ENETUNREACH, IRDA_SERV_INFO, "Cachelog empty...\n"); + + /* Now, check all discovered devices (if any) */ + for(i = 0; i < number; i++) + { + /* Does the name match ? */ + if(discoveries[i].daddr == self->daddr) + { + /* Yes !!! Get it.. */ + strlcpy(self->rname, discoveries[i].info, sizeof(self->rname)); + self->rname[sizeof(self->rname) - 1] = '\0'; + DEBUG(IRDA_SERV_INFO, "Device 0x%08x is in fact ``%s''.\n", + self->daddr, self->rname); + kfree(discoveries); + DEXIT(IRDA_SERV_TRACE, "\n"); + return 0; + } + } + /* No luck ! */ + DEXIT(IRDA_SERV_INFO, ": cannot discover device 0x%08x !!!\n", self->daddr); + kfree(discoveries); + return -EADDRNOTAVAIL; +} + +/*------------------------------------------------------------------*/ +/* + * Function irda_find_socket (self) + * + * Find the correct IrNET socket + * + * Look into the list of IrNET sockets and finds one with the right + * properties... 
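+ *
+ * In practice (this only summarises the code below), the match is
+ * attempted in three passes :
+ *	1) a socket whose requested nickname (rname) equals the one we
+ *	   just resolved for the connecting peer,
+ *	2) a socket whose requested or current address (rdaddr or daddr)
+ *	   equals the peer's address,
+ *	3) otherwise, the first idle socket : ttp_open clear, no rname,
+ *	   rdaddr == DEV_ADDR_ANY, but ppp_open set.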
+ */ +static inline irnet_socket * +irnet_find_socket(irnet_socket * self) +{ + irnet_socket * new = (irnet_socket *) NULL; + int err; + + DENTER(IRDA_SERV_TRACE, "(self=0x%p)\n", self); + + /* Get the addresses of the requester */ + self->daddr = irttp_get_daddr(self->tsap); + self->saddr = irttp_get_saddr(self->tsap); + + /* Try to get the IrDA nickname of the requester */ + err = irnet_daddr_to_dname(self); + + /* Protect access to the instance list */ + spin_lock_bh(&irnet_server.spinlock); + + /* So now, try to get an socket having specifically + * requested that nickname */ + if(err == 0) + { + new = (irnet_socket *) hashbin_find(irnet_server.list, + 0, self->rname); + if(new) + DEBUG(IRDA_SERV_INFO, "Socket 0x%p matches rname ``%s''.\n", + new, new->rname); + } + + /* If no name matches, try to find an socket by the destination address */ + /* It can be either the requested destination address (set via the + * control channel), or the current destination address if the + * socket is in the middle of a connection request */ + if(new == (irnet_socket *) NULL) + { + new = (irnet_socket *) hashbin_get_first(irnet_server.list); + while(new !=(irnet_socket *) NULL) + { + /* Does it have the same address ? */ + if((new->rdaddr == self->daddr) || (new->daddr == self->daddr)) + { + /* Yes !!! Get it.. */ + DEBUG(IRDA_SERV_INFO, "Socket 0x%p matches daddr %#08x.\n", + new, self->daddr); + break; + } + new = (irnet_socket *) hashbin_get_next(irnet_server.list); + } + } + + /* If we don't have any socket, get the first unconnected socket */ + if(new == (irnet_socket *) NULL) + { + new = (irnet_socket *) hashbin_get_first(irnet_server.list); + while(new !=(irnet_socket *) NULL) + { + /* Is it available ? */ + if(!(test_bit(0, &new->ttp_open)) && (new->rdaddr == DEV_ADDR_ANY) && + (new->rname[0] == '\0') && (new->ppp_open)) + { + /* Yes !!! Get it.. */ + DEBUG(IRDA_SERV_INFO, "Socket 0x%p is free.\n", + new); + break; + } + new = (irnet_socket *) hashbin_get_next(irnet_server.list); + } + } + + /* Spin lock end */ + spin_unlock_bh(&irnet_server.spinlock); + + DEXIT(IRDA_SERV_TRACE, " - new = 0x%p\n", new); + return new; +} + +/*------------------------------------------------------------------*/ +/* + * Function irda_connect_socket (self) + * + * Connect an incoming connection to the socket + * + */ +static inline int +irnet_connect_socket(irnet_socket * server, + irnet_socket * new, + struct qos_info * qos, + __u32 max_sdu_size, + __u8 max_header_size) +{ + DENTER(IRDA_SERV_TRACE, "(server=0x%p, new=0x%p)\n", + server, new); + + /* Now attach up the new socket */ + new->tsap = irttp_dup(server->tsap, new); + DABORT(new->tsap == NULL, -1, IRDA_SERV_ERROR, "dup failed!\n"); + + /* Set up all the relevant parameters on the new socket */ + new->stsap_sel = new->tsap->stsap_sel; + new->dtsap_sel = new->tsap->dtsap_sel; + new->saddr = irttp_get_saddr(new->tsap); + new->daddr = irttp_get_daddr(new->tsap); + + new->max_header_size = max_header_size; + new->max_sdu_size_tx = max_sdu_size; + new->max_data_size = max_sdu_size; +#ifdef STREAM_COMPAT + /* If we want to receive "stream sockets" */ + if(max_sdu_size == 0) + new->max_data_size = irttp_get_max_seg_size(new->tsap); +#endif /* STREAM_COMPAT */ + + /* Clean up the original one to keep it in listen state */ + irttp_listen(server->tsap); + + /* Send a connection response on the new socket */ + irttp_connect_response(new->tsap, new->max_sdu_size_rx, NULL); + + /* Allow PPP to send its junk over the new socket... 
*/ + set_bit(0, &new->ttp_open); + + /* Not connecting anymore, and clean up last possible remains + * of connection attempts on the socket */ + clear_bit(0, &new->ttp_connect); + if(new->iriap) + { + iriap_close(new->iriap); + new->iriap = NULL; + } + if(new->discoveries != NULL) + { + kfree(new->discoveries); + new->discoveries = NULL; + } + +#ifdef CONNECT_INDIC_KICK + /* As currently we don't block packets in ppp_irnet_send() while passive, + * this is not really needed... + * Also, not doing it give IrDA a chance to finish the setup properly + * before being swamped with packets... */ + ppp_output_wakeup(&new->chan); +#endif /* CONNECT_INDIC_KICK */ + + /* Notify the control channel */ + irnet_post_event(new, IRNET_CONNECT_FROM, + new->saddr, new->daddr, server->rname, 0); + + DEXIT(IRDA_SERV_TRACE, "\n"); + return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Function irda_disconnect_server (self) + * + * Cleanup the server socket when the incoming connection abort + * + */ +static inline void +irnet_disconnect_server(irnet_socket * self, + struct sk_buff *skb) +{ + DENTER(IRDA_SERV_TRACE, "(self=0x%p)\n", self); + + /* Put the received packet in the black hole */ + kfree_skb(skb); + +#ifdef FAIL_SEND_DISCONNECT + /* Tell the other party we don't want to be connected */ + /* Hum... Is it the right thing to do ? And do we need to send + * a connect response before ? It looks ok without this... */ + irttp_disconnect_request(self->tsap, NULL, P_NORMAL); +#endif /* FAIL_SEND_DISCONNECT */ + + /* Notify the control channel (see irnet_find_socket()) */ + irnet_post_event(NULL, IRNET_REQUEST_FROM, + self->saddr, self->daddr, self->rname, 0); + + /* Clean up the server to keep it in listen state */ + irttp_listen(self->tsap); + + DEXIT(IRDA_SERV_TRACE, "\n"); +} + +/*------------------------------------------------------------------*/ +/* + * Function irda_setup_server (self) + * + * Create a IrTTP server and set it up... + * + * Register the IrLAN hint bit, create a IrTTP instance for us, + * set all the IrTTP callbacks and create an IrIAS entry... + */ +static inline int +irnet_setup_server(void) +{ + __u16 hints; + + DENTER(IRDA_SERV_TRACE, "()\n"); + + /* Initialise the regular socket part of the server */ + irda_irnet_create(&irnet_server.s); + + /* Open a local TSAP (an IrTTP instance) for the server */ + irnet_open_tsap(&irnet_server.s); + + /* PPP part setup */ + irnet_server.s.ppp_open = 0; + irnet_server.s.chan.private = NULL; + irnet_server.s.file = NULL; + + /* Get the hint bit corresponding to IrLAN */ + /* Note : we overload the IrLAN hint bit. As it is only a "hint", and as + * we provide roughly the same functionality as IrLAN, this is ok. 
+ * In fact, the situation is similar as JetSend overloading the Obex hint + */ + hints = irlmp_service_to_hint(S_LAN); + +#ifdef ADVERTISE_HINT + /* Register with IrLMP as a service (advertise our hint bit) */ + irnet_server.skey = irlmp_register_service(hints); +#endif /* ADVERTISE_HINT */ + + /* Register with LM-IAS (so that people can connect to us) */ + irnet_server.ias_obj = irias_new_object(IRNET_SERVICE_NAME, jiffies); + irias_add_integer_attrib(irnet_server.ias_obj, IRNET_IAS_VALUE, + irnet_server.s.stsap_sel, IAS_KERNEL_ATTR); + irias_insert_object(irnet_server.ias_obj); + +#ifdef DISCOVERY_EVENTS + /* Tell IrLMP we want to be notified of newly discovered nodes */ + irlmp_update_client(irnet_server.s.ckey, hints, + irnet_discovery_indication, irnet_expiry_indication, + (void *) &irnet_server.s); +#endif + + DEXIT(IRDA_SERV_TRACE, " - self=0x%p\n", &irnet_server.s); + return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Function irda_destroy_server (self) + * + * Destroy the IrTTP server... + * + * Reverse of the previous function... + */ +static inline void +irnet_destroy_server(void) +{ + DENTER(IRDA_SERV_TRACE, "()\n"); + +#ifdef ADVERTISE_HINT + /* Unregister with IrLMP */ + irlmp_unregister_service(irnet_server.skey); +#endif /* ADVERTISE_HINT */ + + /* Unregister with LM-IAS */ + if(irnet_server.ias_obj) + irias_delete_object(irnet_server.ias_obj); + + /* Cleanup the socket part */ + irda_irnet_destroy(&irnet_server.s); + + DEXIT(IRDA_SERV_TRACE, "\n"); +} + + +/************************ IRDA-TTP CALLBACKS ************************/ +/* + * When we create a IrTTP instance, we pass to it a set of callbacks + * that IrTTP will call in case of various events. + * We take care of those events here. + */ + +/*------------------------------------------------------------------*/ +/* + * Function irnet_data_indication (instance, sap, skb) + * + * Received some data from TinyTP. Just queue it on the receive queue + * + */ +static int +irnet_data_indication(void * instance, + void * sap, + struct sk_buff *skb) +{ + irnet_socket * ap = (irnet_socket *) instance; + unsigned char * p; + int code = 0; + + DENTER(IRDA_TCB_TRACE, "(self/ap=0x%p, skb=0x%p)\n", + ap, skb); + DASSERT(skb != NULL, 0, IRDA_CB_ERROR, "skb is NULL !!!\n"); + + /* Check is ppp is ready to receive our packet */ + if(!ap->ppp_open) + { + DERROR(IRDA_CB_ERROR, "PPP not ready, dropping packet...\n"); + /* When we return error, TTP will need to requeue the skb and + * will stop the sender. IrTTP will stall until we send it a + * flow control request... */ + return -ENOMEM; + } + + /* strip address/control field if present */ + p = skb->data; + if((p[0] == PPP_ALLSTATIONS) && (p[1] == PPP_UI)) + { + /* chop off address/control */ + if(skb->len < 3) + goto err_exit; + p = skb_pull(skb, 2); + } + + /* decompress protocol field if compressed */ + if(p[0] & 1) + { + /* protocol is compressed */ + skb_push(skb, 1)[0] = 0; + } + else + if(skb->len < 2) + goto err_exit; + + /* pass to generic ppp layer */ + /* Note : how do I know if ppp can accept or not the packet ? This is + * essential if I want to manage flow control smoothly... */ + ppp_input(&ap->chan, skb); + + DEXIT(IRDA_TCB_TRACE, "\n"); + return 0; + + err_exit: + DERROR(IRDA_CB_ERROR, "Packet too small, dropping...\n"); + kfree_skb(skb); + ppp_input_error(&ap->chan, code); + return 0; /* Don't return an error code, only for flow control... 
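+ *
+ * For reference, a worked example of the header munging done above on
+ * the receive path (byte values are only illustrative, an LCP and an
+ * IP frame, not taken from a real trace) :
+ *
+ *	ff 03 c0 21 ...  ->  c0 21 ...     address/control stripped,
+ *	                                   protocol 0xc021 (LCP) has an
+ *	                                   even first byte, left as is
+ *	ff 03 21 45 ...  ->  00 21 45 ...  compressed protocol field
+ *	                                   (odd first byte 0x21) gets its
+ *	                                   leading 0x00 pushed back before
+ *	                                   ppp_input()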
*/ +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_disconnect_indication (instance, sap, reason, skb) + * + * Connection has been closed. Chech reason to find out why + * + * Note : there are many cases where we come here : + * o attempted to connect, timeout + * o connected, link is broken, LAP has timeout + * o connected, other side close the link + * o connection request on the server not handled + */ +static void +irnet_disconnect_indication(void * instance, + void * sap, + LM_REASON reason, + struct sk_buff *skb) +{ + irnet_socket * self = (irnet_socket *) instance; + int test_open; + int test_connect; + + DENTER(IRDA_TCB_TRACE, "(self=0x%p)\n", self); + DASSERT(self != NULL, , IRDA_CB_ERROR, "Self is NULL !!!\n"); + + /* Don't care about it, but let's not leak it */ + if(skb) + dev_kfree_skb(skb); + + /* Prevent higher layer from accessing IrTTP */ + test_open = test_and_clear_bit(0, &self->ttp_open); + /* Not connecting anymore... + * (note : TSAP is open, so IAP callbacks are no longer pending...) */ + test_connect = test_and_clear_bit(0, &self->ttp_connect); + + /* If both self->ttp_open and self->ttp_connect are NULL, it mean that we + * have a race condition with irda_irnet_destroy() or + * irnet_connect_indication(), so don't mess up tsap... + */ + if(!(test_open || test_connect)) + { + DERROR(IRDA_CB_ERROR, "Race condition detected...\n"); + return; + } + + /* If we were active, notify the control channel */ + if(test_open) + irnet_post_event(self, IRNET_DISCONNECT_FROM, + self->saddr, self->daddr, self->rname, 0); + else + /* If we were trying to connect, notify the control channel */ + if((self->tsap) && (self != &irnet_server.s)) + irnet_post_event(self, IRNET_NOANSWER_FROM, + self->saddr, self->daddr, self->rname, 0); + + /* Close our IrTTP connection, cleanup tsap */ + if((self->tsap) && (self != &irnet_server.s)) + { + DEBUG(IRDA_CB_INFO, "Closing our TTP connection.\n"); + irttp_close_tsap(self->tsap); + self->tsap = NULL; + } + /* Cleanup the socket in case we want to reconnect in ppp_output_wakeup() */ + self->stsap_sel = 0; + self->daddr = DEV_ADDR_ANY; + self->tx_flow = FLOW_START; + + /* Deal with the ppp instance if it's still alive */ + if(self->ppp_open) + { + if(test_open) + { + /* ppp_unregister_channel() wants a user context. */ + schedule_work(&self->disconnect_work); + } + else + { + /* If we were trying to connect, flush (drain) ppp_generic + * Tx queue (most often we have blocked it), which will + * trigger an other attempt to connect. If we are passive, + * this will empty the Tx queue after last try. */ + ppp_output_wakeup(&self->chan); + } + } + + DEXIT(IRDA_TCB_TRACE, "\n"); +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_connect_confirm (instance, sap, qos, max_sdu_size, skb) + * + * Connections has been confirmed by the remote device + * + */ +static void +irnet_connect_confirm(void * instance, + void * sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + irnet_socket * self = (irnet_socket *) instance; + + DENTER(IRDA_TCB_TRACE, "(self=0x%p)\n", self); + + /* Check if socket is closing down (via irda_irnet_destroy()) */ + if(! test_bit(0, &self->ttp_connect)) + { + DERROR(IRDA_CB_ERROR, "Socket no longer connecting. 
Ouch !\n"); + return; + } + + /* How much header space do we need to reserve */ + self->max_header_size = max_header_size; + + /* IrTTP max SDU size in transmit direction */ + self->max_sdu_size_tx = max_sdu_size; + self->max_data_size = max_sdu_size; +#ifdef STREAM_COMPAT + if(max_sdu_size == 0) + self->max_data_size = irttp_get_max_seg_size(self->tsap); +#endif /* STREAM_COMPAT */ + + /* At this point, IrLMP has assigned our source address */ + self->saddr = irttp_get_saddr(self->tsap); + + /* Allow higher layer to access IrTTP */ + set_bit(0, &self->ttp_open); + clear_bit(0, &self->ttp_connect); /* Not racy, IrDA traffic is serial */ + /* Give a kick in the ass of ppp_generic so that he sends us some data */ + ppp_output_wakeup(&self->chan); + + /* Check size of received packet */ + if(skb->len > 0) + { +#ifdef PASS_CONNECT_PACKETS + DEBUG(IRDA_CB_INFO, "Passing connect packet to PPP.\n"); + /* Try to pass it to PPP */ + irnet_data_indication(instance, sap, skb); +#else /* PASS_CONNECT_PACKETS */ + DERROR(IRDA_CB_ERROR, "Dropping non empty packet.\n"); + kfree_skb(skb); /* Note : will be optimised with other kfree... */ +#endif /* PASS_CONNECT_PACKETS */ + } + else + kfree_skb(skb); + + /* Notify the control channel */ + irnet_post_event(self, IRNET_CONNECT_TO, + self->saddr, self->daddr, self->rname, 0); + + DEXIT(IRDA_TCB_TRACE, "\n"); +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_flow_indication (instance, sap, flow) + * + * Used by TinyTP to tell us if it can accept more data or not + * + */ +static void +irnet_flow_indication(void * instance, + void * sap, + LOCAL_FLOW flow) +{ + irnet_socket * self = (irnet_socket *) instance; + LOCAL_FLOW oldflow = self->tx_flow; + + DENTER(IRDA_TCB_TRACE, "(self=0x%p, flow=%d)\n", self, flow); + + /* Update our state */ + self->tx_flow = flow; + + /* Check what IrTTP want us to do... */ + switch(flow) + { + case FLOW_START: + DEBUG(IRDA_CB_INFO, "IrTTP wants us to start again\n"); + /* Check if we really need to wake up PPP */ + if(oldflow == FLOW_STOP) + ppp_output_wakeup(&self->chan); + else + DEBUG(IRDA_CB_INFO, "But we were already transmitting !!!\n"); + break; + case FLOW_STOP: + DEBUG(IRDA_CB_INFO, "IrTTP wants us to slow down\n"); + break; + default: + DEBUG(IRDA_CB_INFO, "Unknown flow command!\n"); + break; + } + + DEXIT(IRDA_TCB_TRACE, "\n"); +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_status_indication (instance, sap, reason, skb) + * + * Link (IrLAP) status report. + * + */ +static void +irnet_status_indication(void * instance, + LINK_STATUS link, + LOCK_STATUS lock) +{ + irnet_socket * self = (irnet_socket *) instance; + + DENTER(IRDA_TCB_TRACE, "(self=0x%p)\n", self); + DASSERT(self != NULL, , IRDA_CB_ERROR, "Self is NULL !!!\n"); + + /* We can only get this event if we are connected */ + switch(link) + { + case STATUS_NO_ACTIVITY: + irnet_post_event(self, IRNET_BLOCKED_LINK, + self->saddr, self->daddr, self->rname, 0); + break; + default: + DEBUG(IRDA_CB_INFO, "Unknown status...\n"); + } + + DEXIT(IRDA_TCB_TRACE, "\n"); +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_connect_indication(instance, sap, qos, max_sdu_size, userdata) + * + * Incoming connection + * + * In theory, this function is called only on the server socket. + * Some other node is attempting to connect to the IrNET service, and has + * sent a connection request on our server socket. 
+ * We just redirect the connection to the relevant IrNET socket. + * + * Note : we also make sure that between 2 irnet nodes, there can + * exist only one irnet connection. + */ +static void +irnet_connect_indication(void * instance, + void * sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + irnet_socket * server = &irnet_server.s; + irnet_socket * new = (irnet_socket *) NULL; + + DENTER(IRDA_TCB_TRACE, "(server=0x%p)\n", server); + DASSERT(instance == &irnet_server, , IRDA_CB_ERROR, + "Invalid instance (0x%p) !!!\n", instance); + DASSERT(sap == irnet_server.s.tsap, , IRDA_CB_ERROR, "Invalid sap !!!\n"); + + /* Try to find the most appropriate IrNET socket */ + new = irnet_find_socket(server); + + /* After all this hard work, do we have an socket ? */ + if(new == (irnet_socket *) NULL) + { + DEXIT(IRDA_CB_INFO, ": No socket waiting for this connection.\n"); + irnet_disconnect_server(server, skb); + return; + } + + /* Is the socket already busy ? */ + if(test_bit(0, &new->ttp_open)) + { + DEXIT(IRDA_CB_INFO, ": Socket already connected.\n"); + irnet_disconnect_server(server, skb); + return; + } + + /* The following code is a bit tricky, so need comments ;-) + */ + /* If ttp_connect is set, the socket is trying to connect to the other + * end and may have sent a IrTTP connection request and is waiting for + * a connection response (that may never come). + * Now, the pain is that the socket may have opened a tsap and is + * waiting on it, while the other end is trying to connect to it on + * another tsap. + * Because IrNET can be peer to peer, we need to workaround this. + * Furthermore, the way the irnetd script is implemented, the + * target will create a second IrNET connection back to the + * originator and expect the originator to bind this new connection + * to the original PPPD instance. + * And of course, if we don't use irnetd, we can have a race when + * both side try to connect simultaneously, which could leave both + * connections half closed (yuck). + * Conclusions : + * 1) The "originator" must accept the new connection and get rid + * of the old one so that irnetd works + * 2) One side must deny the new connection to avoid races, + * but both side must agree on which side it is... + * Most often, the originator is primary at the LAP layer. + * Jean II + */ + /* Now, let's look at the way I wrote the test... + * We need to clear up the ttp_connect flag atomically to prevent + * irnet_disconnect_indication() to mess up the tsap we are going to close. + * We want to clear the ttp_connect flag only if we close the tsap, + * otherwise we will never close it, so we need to check for primary + * *before* doing the test on the flag. + * And of course, ALLOW_SIMULT_CONNECT can disable this entirely... + * Jean II + */ + + /* Socket already connecting ? On primary ? */ + if(0 +#ifdef ALLOW_SIMULT_CONNECT + || ((irttp_is_primary(server->tsap) == 1) && /* primary */ + (test_and_clear_bit(0, &new->ttp_connect))) +#endif /* ALLOW_SIMULT_CONNECT */ + ) + { + DERROR(IRDA_CB_ERROR, "Socket already connecting, but going to reuse it !\n"); + + /* Cleanup the old TSAP if necessary - IrIAP will be cleaned up later */ + if(new->tsap != NULL) + { + /* Close the old connection the new socket was attempting, + * so that we can hook it up to the new connection. + * It's now safe to do it... 
*/ + irttp_close_tsap(new->tsap); + new->tsap = NULL; + } + } + else + { + /* Three options : + * 1) socket was not connecting or connected : ttp_connect should be 0. + * 2) we don't want to connect the socket because we are secondary or + * ALLOW_SIMULT_CONNECT is undefined. ttp_connect should be 1. + * 3) we are half way in irnet_disconnect_indication(), and it's a + * nice race condition... Fortunately, we can detect that by checking + * if tsap is still alive. On the other hand, we can't be in + * irda_irnet_destroy() otherwise we would not have found this + * socket in the hashbin. + * Jean II */ + if((test_bit(0, &new->ttp_connect)) || (new->tsap != NULL)) + { + /* Don't mess this socket, somebody else in in charge... */ + DERROR(IRDA_CB_ERROR, "Race condition detected, socket in use, abort connect...\n"); + irnet_disconnect_server(server, skb); + return; + } + } + + /* So : at this point, we have a socket, and it is idle. Good ! */ + irnet_connect_socket(server, new, qos, max_sdu_size, max_header_size); + + /* Check size of received packet */ + if(skb->len > 0) + { +#ifdef PASS_CONNECT_PACKETS + DEBUG(IRDA_CB_INFO, "Passing connect packet to PPP.\n"); + /* Try to pass it to PPP */ + irnet_data_indication(new, new->tsap, skb); +#else /* PASS_CONNECT_PACKETS */ + DERROR(IRDA_CB_ERROR, "Dropping non empty packet.\n"); + kfree_skb(skb); /* Note : will be optimised with other kfree... */ +#endif /* PASS_CONNECT_PACKETS */ + } + else + kfree_skb(skb); + + DEXIT(IRDA_TCB_TRACE, "\n"); +} + + +/********************** IRDA-IAS/LMP CALLBACKS **********************/ +/* + * These are the callbacks called by other layers of the IrDA stack, + * mainly LMP for discovery and IAS for name queries. + */ + +/*------------------------------------------------------------------*/ +/* + * Function irnet_getvalue_confirm (result, obj_id, value, priv) + * + * Got answer from remote LM-IAS, just connect + * + * This is the reply to a IAS query we were doing to find the TSAP of + * the device we want to connect to. + * If we have found a valid TSAP, just initiate the TTP connection + * on this TSAP. + */ +static void +irnet_getvalue_confirm(int result, + __u16 obj_id, + struct ias_value *value, + void * priv) +{ + irnet_socket * self = (irnet_socket *) priv; + + DENTER(IRDA_OCB_TRACE, "(self=0x%p)\n", self); + DASSERT(self != NULL, , IRDA_OCB_ERROR, "Self is NULL !!!\n"); + + /* Check if already connected (via irnet_connect_socket()) + * or socket is closing down (via irda_irnet_destroy()) */ + if(! test_bit(0, &self->ttp_connect)) + { + DERROR(IRDA_OCB_ERROR, "Socket no longer connecting. Ouch !\n"); + return; + } + + /* We probably don't need to make any more queries */ + iriap_close(self->iriap); + self->iriap = NULL; + + /* Post process the IAS reply */ + self->dtsap_sel = irnet_ias_to_tsap(self, result, value); + + /* If error, just go out */ + if(self->errno) + { + clear_bit(0, &self->ttp_connect); + DERROR(IRDA_OCB_ERROR, "IAS connect failed ! (0x%X)\n", self->errno); + return; + } + + DEBUG(IRDA_OCB_INFO, "daddr = %08x, lsap = %d, starting IrTTP connection\n", + self->daddr, self->dtsap_sel); + + /* Start up TTP - non blocking */ + irnet_connect_tsap(self); + + DEXIT(IRDA_OCB_TRACE, "\n"); +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_discovervalue_confirm (result, obj_id, value, priv) + * + * Handle the TSAP discovery procedure state machine. 
+ * Got answer from remote LM-IAS, try next device + * + * We are doing a TSAP discovery procedure, and we got an answer to + * a IAS query we were doing to find the TSAP on one of the address + * in the discovery log. + * + * If we have found a valid TSAP for the first time, save it. If it's + * not the first time we found one, complain. + * + * If we have more addresses in the log, just initiate a new query. + * Note that those query may fail (see irnet_discover_daddr_and_lsap_sel()) + * + * Otherwise, wrap up the procedure (cleanup), check if we have found + * any device and connect to it. + */ +static void +irnet_discovervalue_confirm(int result, + __u16 obj_id, + struct ias_value *value, + void * priv) +{ + irnet_socket * self = (irnet_socket *) priv; + __u8 dtsap_sel; /* TSAP we are looking for */ + + DENTER(IRDA_OCB_TRACE, "(self=0x%p)\n", self); + DASSERT(self != NULL, , IRDA_OCB_ERROR, "Self is NULL !!!\n"); + + /* Check if already connected (via irnet_connect_socket()) + * or socket is closing down (via irda_irnet_destroy()) */ + if(! test_bit(0, &self->ttp_connect)) + { + DERROR(IRDA_OCB_ERROR, "Socket no longer connecting. Ouch !\n"); + return; + } + + /* Post process the IAS reply */ + dtsap_sel = irnet_ias_to_tsap(self, result, value); + + /* Have we got something ? */ + if(self->errno == 0) + { + /* We found the requested service */ + if(self->daddr != DEV_ADDR_ANY) + { + DERROR(IRDA_OCB_ERROR, "More than one device in range supports IrNET...\n"); + } + else + { + /* First time we found that one, save it ! */ + self->daddr = self->discoveries[self->disco_index].daddr; + self->dtsap_sel = dtsap_sel; + } + } + + /* If no failure */ + if((self->errno == -EADDRNOTAVAIL) || (self->errno == 0)) + { + int ret; + + /* Search the next node */ + ret = irnet_discover_next_daddr(self); + if(!ret) + { + /* In this case, the above request was non-blocking. + * We will return here after a while... */ + return; + } + /* In this case, we have processed the last discovery item */ + } + + /* No more queries to be done (failure or last one) */ + + /* We probably don't need to make any more queries */ + iriap_close(self->iriap); + self->iriap = NULL; + + /* No more items : remove the log and signal termination */ + DEBUG(IRDA_OCB_INFO, "Cleaning up log (0x%p)\n", + self->discoveries); + if(self->discoveries != NULL) + { + /* Cleanup our copy of the discovery log */ + kfree(self->discoveries); + self->discoveries = NULL; + } + self->disco_number = -1; + + /* Check out what we found */ + if(self->daddr == DEV_ADDR_ANY) + { + self->daddr = DEV_ADDR_ANY; + clear_bit(0, &self->ttp_connect); + DEXIT(IRDA_OCB_TRACE, ": cannot discover IrNET in any device !!!\n"); + return; + } + + /* We have a valid address - just connect */ + + DEBUG(IRDA_OCB_INFO, "daddr = %08x, lsap = %d, starting IrTTP connection\n", + self->daddr, self->dtsap_sel); + + /* Start up TTP - non blocking */ + irnet_connect_tsap(self); + + DEXIT(IRDA_OCB_TRACE, "\n"); +} + +#ifdef DISCOVERY_EVENTS +/*------------------------------------------------------------------*/ +/* + * Function irnet_discovery_indication (discovery) + * + * Got a discovery indication from IrLMP, post an event + * + * Note : IrLMP take care of matching the hint mask for us, and also + * check if it is a "new" node for us... + * + * As IrLMP filter on the IrLAN hint bit, we get both IrLAN and IrNET + * nodes, so it's only at connection time that we will know if the + * node support IrNET, IrLAN or both. 
The other solution is to check + * in IAS the PNP ids and service name. + * Note : even if a node support IrNET (or IrLAN), it's no guarantee + * that we will be able to connect to it, the node might already be + * busy... + * + * One last thing : in some case, this function will trigger duplicate + * discovery events. On the other hand, we should catch all + * discoveries properly (i.e. not miss one). Filtering duplicate here + * is to messy, so we leave that to user space... + */ +static void +irnet_discovery_indication(discinfo_t * discovery, + DISCOVERY_MODE mode, + void * priv) +{ + irnet_socket * self = &irnet_server.s; + + DENTER(IRDA_OCB_TRACE, "(self=0x%p)\n", self); + DASSERT(priv == &irnet_server, , IRDA_OCB_ERROR, + "Invalid instance (0x%p) !!!\n", priv); + + DEBUG(IRDA_OCB_INFO, "Discovered new IrNET/IrLAN node %s...\n", + discovery->info); + + /* Notify the control channel */ + irnet_post_event(NULL, IRNET_DISCOVER, + discovery->saddr, discovery->daddr, discovery->info, + get_unaligned((__u16 *)discovery->hints)); + + DEXIT(IRDA_OCB_TRACE, "\n"); +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_expiry_indication (expiry) + * + * Got a expiry indication from IrLMP, post an event + * + * Note : IrLMP take care of matching the hint mask for us, we only + * check if it is a "new" node... + */ +static void +irnet_expiry_indication(discinfo_t * expiry, + DISCOVERY_MODE mode, + void * priv) +{ + irnet_socket * self = &irnet_server.s; + + DENTER(IRDA_OCB_TRACE, "(self=0x%p)\n", self); + DASSERT(priv == &irnet_server, , IRDA_OCB_ERROR, + "Invalid instance (0x%p) !!!\n", priv); + + DEBUG(IRDA_OCB_INFO, "IrNET/IrLAN node %s expired...\n", + expiry->info); + + /* Notify the control channel */ + irnet_post_event(NULL, IRNET_EXPIRE, + expiry->saddr, expiry->daddr, expiry->info, + get_unaligned((__u16 *)expiry->hints)); + + DEXIT(IRDA_OCB_TRACE, "\n"); +} +#endif /* DISCOVERY_EVENTS */ + + +/*********************** PROC ENTRY CALLBACKS ***********************/ +/* + * We create a instance in the /proc filesystem, and here we take care + * of that... + */ + +#ifdef CONFIG_PROC_FS +static int +irnet_proc_show(struct seq_file *m, void *v) +{ + irnet_socket * self; + char * state; + int i = 0; + + /* Get the IrNET server information... */ + seq_printf(m, "IrNET server - "); + seq_printf(m, "IrDA state: %s, ", + (irnet_server.running ? "running" : "dead")); + seq_printf(m, "stsap_sel: %02x, ", irnet_server.s.stsap_sel); + seq_printf(m, "dtsap_sel: %02x\n", irnet_server.s.dtsap_sel); + + /* Do we need to continue ? */ + if(!irnet_server.running) + return 0; + + /* Protect access to the instance list */ + spin_lock_bh(&irnet_server.spinlock); + + /* Get the sockets one by one... */ + self = (irnet_socket *) hashbin_get_first(irnet_server.list); + while(self != NULL) + { + /* Start printing info about the socket. */ + seq_printf(m, "\nIrNET socket %d - ", i++); + + /* First, get the requested configuration */ + seq_printf(m, "Requested IrDA name: \"%s\", ", self->rname); + seq_printf(m, "daddr: %08x, ", self->rdaddr); + seq_printf(m, "saddr: %08x\n", self->rsaddr); + + /* Second, get all the PPP info */ + seq_printf(m, " PPP state: %s", + (self->ppp_open ? "registered" : "unregistered")); + if(self->ppp_open) + { + seq_printf(m, ", unit: ppp%d", + ppp_unit_number(&self->chan)); + seq_printf(m, ", channel: %d", + ppp_channel_index(&self->chan)); + seq_printf(m, ", mru: %d", + self->mru); + /* Maybe add self->flags ? Later... 
*/ + } + + /* Then, get all the IrDA specific info... */ + if(self->ttp_open) + state = "connected"; + else + if(self->tsap != NULL) + state = "connecting"; + else + if(self->iriap != NULL) + state = "searching"; + else + if(self->ttp_connect) + state = "weird"; + else + state = "idle"; + seq_printf(m, "\n IrDA state: %s, ", state); + seq_printf(m, "daddr: %08x, ", self->daddr); + seq_printf(m, "stsap_sel: %02x, ", self->stsap_sel); + seq_printf(m, "dtsap_sel: %02x\n", self->dtsap_sel); + + /* Next socket, please... */ + self = (irnet_socket *) hashbin_get_next(irnet_server.list); + } + + /* Spin lock end */ + spin_unlock_bh(&irnet_server.spinlock); + + return 0; +} + +static int irnet_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irnet_proc_show, NULL); +} + +static const struct file_operations irnet_proc_fops = { + .owner = THIS_MODULE, + .open = irnet_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* PROC_FS */ + + +/********************** CONFIGURATION/CLEANUP **********************/ +/* + * Initialisation and teardown of the IrDA part, called at module + * insertion and removal... + */ + +/*------------------------------------------------------------------*/ +/* + * Prepare the IrNET layer for operation... + */ +int __init +irda_irnet_init(void) +{ + int err = 0; + + DENTER(MODULE_TRACE, "()\n"); + + /* Pure paranoia - should be redundant */ + memset(&irnet_server, 0, sizeof(struct irnet_root)); + + /* Setup start of irnet instance list */ + irnet_server.list = hashbin_new(HB_NOLOCK); + DABORT(irnet_server.list == NULL, -ENOMEM, + MODULE_ERROR, "Can't allocate hashbin!\n"); + /* Init spinlock for instance list */ + spin_lock_init(&irnet_server.spinlock); + + /* Initialise control channel */ + init_waitqueue_head(&irnet_events.rwait); + irnet_events.index = 0; + /* Init spinlock for event logging */ + spin_lock_init(&irnet_events.spinlock); + +#ifdef CONFIG_PROC_FS + /* Add a /proc file for irnet infos */ + proc_create("irnet", 0, proc_irda, &irnet_proc_fops); +#endif /* CONFIG_PROC_FS */ + + /* Setup the IrNET server */ + err = irnet_setup_server(); + + if(!err) + /* We are no longer functional... */ + irnet_server.running = 1; + + DEXIT(MODULE_TRACE, "\n"); + return err; +} + +/*------------------------------------------------------------------*/ +/* + * Cleanup at exit... + */ +void __exit +irda_irnet_cleanup(void) +{ + DENTER(MODULE_TRACE, "()\n"); + + /* We are no longer there... */ + irnet_server.running = 0; + +#ifdef CONFIG_PROC_FS + /* Remove our /proc file */ + remove_proc_entry("irnet", proc_irda); +#endif /* CONFIG_PROC_FS */ + + /* Remove our IrNET server from existence */ + irnet_destroy_server(); + + /* Remove all instances of IrNET socket still present */ + hashbin_delete(irnet_server.list, (FREE_FUNC) irda_irnet_destroy); + + DEXIT(MODULE_TRACE, "\n"); +} diff --git a/net/irda/irnet/irnet_irda.h b/net/irda/irnet/irnet_irda.h new file mode 100644 index 00000000..3e408952 --- /dev/null +++ b/net/irda/irnet/irnet_irda.h @@ -0,0 +1,178 @@ +/* + * IrNET protocol module : Synchronous PPP over an IrDA socket. + * + * Jean II - HPL `00 - + * + * This file contains all definitions and declarations necessary for the + * IRDA part of the IrNET module (dealing with IrTTP, IrIAS and co). + * This file is a private header, so other modules don't want to know + * what's in there... 
+ */ + +#ifndef IRNET_IRDA_H +#define IRNET_IRDA_H + +/***************************** INCLUDES *****************************/ +/* Please add other headers in irnet.h */ + +#include "irnet.h" /* Module global include */ + +/************************ CONSTANTS & MACROS ************************/ + +/* + * Name of the service (socket name) used by IrNET + */ +/* IAS object name (or part of it) */ +#define IRNET_SERVICE_NAME "IrNetv1" +/* IAS attribute */ +#define IRNET_IAS_VALUE "IrDA:TinyTP:LsapSel" +/* LMP notify name for client (only for /proc/net/irda/irlmp) */ +#define IRNET_NOTIFY_NAME "IrNET socket" +/* LMP notify name for server (only for /proc/net/irda/irlmp) */ +#define IRNET_NOTIFY_NAME_SERV "IrNET server" + +/****************************** TYPES ******************************/ + +/* + * This is the main structure where we store all the data pertaining to + * the IrNET server (listen for connection requests) and the root + * of the IrNET socket list + */ +typedef struct irnet_root +{ + irnet_socket s; /* To pretend we are a client... */ + + /* Generic stuff */ + int magic; /* Paranoia */ + int running; /* Are we operational ? */ + + /* Link list of all IrNET instances opened */ + hashbin_t * list; + spinlock_t spinlock; /* Serialize access to the list */ + /* Note : the way hashbin has been designed is absolutely not + * reentrant, beware... So, we blindly protect all with spinlock */ + + /* Handle for the hint bit advertised in IrLMP */ + void * skey; + + /* Server socket part */ + struct ias_object * ias_obj; /* Our service name + lsap in IAS */ + +} irnet_root; + + +/**************************** PROTOTYPES ****************************/ + +/* ----------------------- CONTROL CHANNEL ----------------------- */ +static void + irnet_post_event(irnet_socket *, + irnet_event, + __u32, + __u32, + char *, + __u16); +/* ----------------------- IRDA SUBROUTINES ----------------------- */ +static inline int + irnet_open_tsap(irnet_socket *); +static inline __u8 + irnet_ias_to_tsap(irnet_socket *, + int, + struct ias_value *); +static inline int + irnet_find_lsap_sel(irnet_socket *); +static inline int + irnet_connect_tsap(irnet_socket *); +static inline int + irnet_discover_next_daddr(irnet_socket *); +static inline int + irnet_discover_daddr_and_lsap_sel(irnet_socket *); +static inline int + irnet_dname_to_daddr(irnet_socket *); +/* ------------------------ SERVER SOCKET ------------------------ */ +static inline int + irnet_daddr_to_dname(irnet_socket *); +static inline irnet_socket * + irnet_find_socket(irnet_socket *); +static inline int + irnet_connect_socket(irnet_socket *, + irnet_socket *, + struct qos_info *, + __u32, + __u8); +static inline void + irnet_disconnect_server(irnet_socket *, + struct sk_buff *); +static inline int + irnet_setup_server(void); +static inline void + irnet_destroy_server(void); +/* ---------------------- IRDA-TTP CALLBACKS ---------------------- */ +static int + irnet_data_indication(void *, /* instance */ + void *, /* sap */ + struct sk_buff *); +static void + irnet_disconnect_indication(void *, + void *, + LM_REASON, + struct sk_buff *); +static void + irnet_connect_confirm(void *, + void *, + struct qos_info *, + __u32, + __u8, + struct sk_buff *); +static void + irnet_flow_indication(void *, + void *, + LOCAL_FLOW); +static void + irnet_status_indication(void *, + LINK_STATUS, + LOCK_STATUS); +static void + irnet_connect_indication(void *, + void *, + struct qos_info *, + __u32, + __u8, + struct sk_buff *); +/* -------------------- IRDA-IAS/LMP CALLBACKS 
-------------------- */ +static void + irnet_getvalue_confirm(int, + __u16, + struct ias_value *, + void *); +static void + irnet_discovervalue_confirm(int, + __u16, + struct ias_value *, + void *); +#ifdef DISCOVERY_EVENTS +static void + irnet_discovery_indication(discinfo_t *, + DISCOVERY_MODE, + void *); +static void + irnet_expiry_indication(discinfo_t *, + DISCOVERY_MODE, + void *); +#endif + +/**************************** VARIABLES ****************************/ + +/* + * The IrNET server. Listen to connection requests and co... + */ +static struct irnet_root irnet_server; + +/* Control channel stuff (note : extern) */ +struct irnet_ctrl_channel irnet_events; + +/* The /proc/net/irda directory, defined elsewhere... */ +#ifdef CONFIG_PROC_FS +extern struct proc_dir_entry *proc_irda; +#endif /* CONFIG_PROC_FS */ + +#endif /* IRNET_IRDA_H */ diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c new file mode 100644 index 00000000..2bb2beb6 --- /dev/null +++ b/net/irda/irnet/irnet_ppp.c @@ -0,0 +1,1189 @@ +/* + * IrNET protocol module : Synchronous PPP over an IrDA socket. + * + * Jean II - HPL `00 - + * + * This file implement the PPP interface and /dev/irnet character device. + * The PPP interface hook to the ppp_generic module, handle all our + * relationship to the PPP code in the kernel (and by extension to pppd), + * and exchange PPP frames with this module (send/receive). + * The /dev/irnet device is used primarily for 2 functions : + * 1) as a stub for pppd (the ppp daemon), so that we can appropriately + * generate PPP sessions (we pretend we are a tty). + * 2) as a control channel (write commands, read events) + */ + +#include +#include +#include "irnet_ppp.h" /* Private header */ +/* Please put other headers in irnet.h - Thanks */ + +/* Generic PPP callbacks (to call us) */ +static const struct ppp_channel_ops irnet_ppp_ops = { + .start_xmit = ppp_irnet_send, + .ioctl = ppp_irnet_ioctl +}; + +/************************* CONTROL CHANNEL *************************/ +/* + * When a pppd instance is not active on /dev/irnet, it acts as a control + * channel. + * Writing allow to set up the IrDA destination of the IrNET channel, + * and any application may be read events happening in IrNET... + */ + +/*------------------------------------------------------------------*/ +/* + * Write is used to send a command to configure a IrNET channel + * before it is open by pppd. The syntax is : "command argument" + * Currently there is only two defined commands : + * o name : set the requested IrDA nickname of the IrNET peer. + * o addr : set the requested IrDA address of the IrNET peer. + * Note : the code is crude, but effective... + */ +static inline ssize_t +irnet_ctrl_write(irnet_socket * ap, + const char __user *buf, + size_t count) +{ + char command[IRNET_MAX_COMMAND]; + char * start; /* Current command being processed */ + char * next; /* Next command to process */ + int length; /* Length of current command */ + + DENTER(CTRL_TRACE, "(ap=0x%p, count=%Zd)\n", ap, count); + + /* Check for overflow... 
*/ + DABORT(count >= IRNET_MAX_COMMAND, -ENOMEM, + CTRL_ERROR, "Too much data !!!\n"); + + /* Get the data in the driver */ + if(copy_from_user(command, buf, count)) + { + DERROR(CTRL_ERROR, "Invalid user space pointer.\n"); + return -EFAULT; + } + + /* Safe terminate the string */ + command[count] = '\0'; + DEBUG(CTRL_INFO, "Command line received is ``%s'' (%Zd).\n", + command, count); + + /* Check every commands in the command line */ + next = command; + while(next != NULL) + { + /* Look at the next command */ + start = next; + + /* Scrap whitespaces before the command */ + start = skip_spaces(start); + + /* ',' is our command separator */ + next = strchr(start, ','); + if(next) + { + *next = '\0'; /* Terminate command */ + length = next - start; /* Length */ + next++; /* Skip the '\0' */ + } + else + length = strlen(start); + + DEBUG(CTRL_INFO, "Found command ``%s'' (%d).\n", start, length); + + /* Check if we recognised one of the known command + * We can't use "switch" with strings, so hack with "continue" */ + + /* First command : name -> Requested IrDA nickname */ + if(!strncmp(start, "name", 4)) + { + /* Copy the name only if is included and not "any" */ + if((length > 5) && (strcmp(start + 5, "any"))) + { + /* Strip out trailing whitespaces */ + while(isspace(start[length - 1])) + length--; + + DABORT(length < 5 || length > NICKNAME_MAX_LEN + 5, + -EINVAL, CTRL_ERROR, "Invalid nickname.\n"); + + /* Copy the name for later reuse */ + memcpy(ap->rname, start + 5, length - 5); + ap->rname[length - 5] = '\0'; + } + else + ap->rname[0] = '\0'; + DEBUG(CTRL_INFO, "Got rname = ``%s''\n", ap->rname); + + /* Restart the loop */ + continue; + } + + /* Second command : addr, daddr -> Requested IrDA destination address + * Also process : saddr -> Requested IrDA source address */ + if((!strncmp(start, "addr", 4)) || + (!strncmp(start, "daddr", 5)) || + (!strncmp(start, "saddr", 5))) + { + __u32 addr = DEV_ADDR_ANY; + + /* Copy the address only if is included and not "any" */ + if((length > 5) && (strcmp(start + 5, "any"))) + { + char * begp = start + 5; + char * endp; + + /* Scrap whitespaces before the command */ + begp = skip_spaces(begp); + + /* Convert argument to a number (last arg is the base) */ + addr = simple_strtoul(begp, &endp, 16); + /* Has it worked ? (endp should be start + length) */ + DABORT(endp <= (start + 5), -EINVAL, + CTRL_ERROR, "Invalid address.\n"); + } + /* Which type of address ? */ + if(start[0] == 's') + { + /* Save it */ + ap->rsaddr = addr; + DEBUG(CTRL_INFO, "Got rsaddr = %08x\n", ap->rsaddr); + } + else + { + /* Save it */ + ap->rdaddr = addr; + DEBUG(CTRL_INFO, "Got rdaddr = %08x\n", ap->rdaddr); + } + + /* Restart the loop */ + continue; + } + + /* Other possible command : connect N (number of retries) */ + + /* No command matched -> Failed... */ + DABORT(1, -EINVAL, CTRL_ERROR, "Not a recognised IrNET command.\n"); + } + + /* Success : we have parsed all commands successfully */ + return count; +} + +#ifdef INITIAL_DISCOVERY +/*------------------------------------------------------------------*/ +/* + * Function irnet_get_discovery_log (self) + * + * Query the content on the discovery log if not done + * + * This function query the current content of the discovery log + * at the startup of the event channel and save it in the internal struct. 
+ */ +static void +irnet_get_discovery_log(irnet_socket * ap) +{ + __u16 mask = irlmp_service_to_hint(S_LAN); + + /* Ask IrLMP for the current discovery log */ + ap->discoveries = irlmp_get_discoveries(&ap->disco_number, mask, + DISCOVERY_DEFAULT_SLOTS); + + /* Check if the we got some results */ + if(ap->discoveries == NULL) + ap->disco_number = -1; + + DEBUG(CTRL_INFO, "Got the log (0x%p), size is %d\n", + ap->discoveries, ap->disco_number); +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_read_discovery_log (self, event) + * + * Read the content on the discovery log + * + * This function dump the current content of the discovery log + * at the startup of the event channel. + * Return 1 if wrote an event on the control channel... + * + * State of the ap->disco_XXX variables : + * Socket creation : discoveries = NULL ; disco_index = 0 ; disco_number = 0 + * While reading : discoveries = ptr ; disco_index = X ; disco_number = Y + * After reading : discoveries = NULL ; disco_index = Y ; disco_number = -1 + */ +static inline int +irnet_read_discovery_log(irnet_socket * ap, + char * event) +{ + int done_event = 0; + + DENTER(CTRL_TRACE, "(ap=0x%p, event=0x%p)\n", + ap, event); + + /* Test if we have some work to do or we have already finished */ + if(ap->disco_number == -1) + { + DEBUG(CTRL_INFO, "Already done\n"); + return 0; + } + + /* Test if it's the first time and therefore we need to get the log */ + if(ap->discoveries == NULL) + irnet_get_discovery_log(ap); + + /* Check if we have more item to dump */ + if(ap->disco_index < ap->disco_number) + { + /* Write an event */ + sprintf(event, "Found %08x (%s) behind %08x {hints %02X-%02X}\n", + ap->discoveries[ap->disco_index].daddr, + ap->discoveries[ap->disco_index].info, + ap->discoveries[ap->disco_index].saddr, + ap->discoveries[ap->disco_index].hints[0], + ap->discoveries[ap->disco_index].hints[1]); + DEBUG(CTRL_INFO, "Writing discovery %d : %s\n", + ap->disco_index, ap->discoveries[ap->disco_index].info); + + /* We have an event */ + done_event = 1; + /* Next discovery */ + ap->disco_index++; + } + + /* Check if we have done the last item */ + if(ap->disco_index >= ap->disco_number) + { + /* No more items : remove the log and signal termination */ + DEBUG(CTRL_INFO, "Cleaning up log (0x%p)\n", + ap->discoveries); + if(ap->discoveries != NULL) + { + /* Cleanup our copy of the discovery log */ + kfree(ap->discoveries); + ap->discoveries = NULL; + } + ap->disco_number = -1; + } + + return done_event; +} +#endif /* INITIAL_DISCOVERY */ + +/*------------------------------------------------------------------*/ +/* + * Read is used to get IrNET events + */ +static inline ssize_t +irnet_ctrl_read(irnet_socket * ap, + struct file * file, + char __user * buf, + size_t count) +{ + DECLARE_WAITQUEUE(wait, current); + char event[64]; /* Max event is 61 char */ + ssize_t ret = 0; + + DENTER(CTRL_TRACE, "(ap=0x%p, count=%Zd)\n", ap, count); + + /* Check if we can write an event out in one go */ + DABORT(count < sizeof(event), -EOVERFLOW, CTRL_ERROR, "Buffer to small.\n"); + +#ifdef INITIAL_DISCOVERY + /* Check if we have read the log */ + if(irnet_read_discovery_log(ap, event)) + { + /* We have an event !!! 
Copy it to the user */ + if(copy_to_user(buf, event, strlen(event))) + { + DERROR(CTRL_ERROR, "Invalid user space pointer.\n"); + return -EFAULT; + } + + DEXIT(CTRL_TRACE, "\n"); + return strlen(event); + } +#endif /* INITIAL_DISCOVERY */ + + /* Put ourselves on the wait queue to be woken up */ + add_wait_queue(&irnet_events.rwait, &wait); + current->state = TASK_INTERRUPTIBLE; + for(;;) + { + /* If there is unread events */ + ret = 0; + if(ap->event_index != irnet_events.index) + break; + ret = -EAGAIN; + if(file->f_flags & O_NONBLOCK) + break; + ret = -ERESTARTSYS; + if(signal_pending(current)) + break; + /* Yield and wait to be woken up */ + schedule(); + } + current->state = TASK_RUNNING; + remove_wait_queue(&irnet_events.rwait, &wait); + + /* Did we got it ? */ + if(ret != 0) + { + /* No, return the error code */ + DEXIT(CTRL_TRACE, " - ret %Zd\n", ret); + return ret; + } + + /* Which event is it ? */ + switch(irnet_events.log[ap->event_index].event) + { + case IRNET_DISCOVER: + sprintf(event, "Discovered %08x (%s) behind %08x {hints %02X-%02X}\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].saddr, + irnet_events.log[ap->event_index].hints.byte[0], + irnet_events.log[ap->event_index].hints.byte[1]); + break; + case IRNET_EXPIRE: + sprintf(event, "Expired %08x (%s) behind %08x {hints %02X-%02X}\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].saddr, + irnet_events.log[ap->event_index].hints.byte[0], + irnet_events.log[ap->event_index].hints.byte[1]); + break; + case IRNET_CONNECT_TO: + sprintf(event, "Connected to %08x (%s) on ppp%d\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].unit); + break; + case IRNET_CONNECT_FROM: + sprintf(event, "Connection from %08x (%s) on ppp%d\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].unit); + break; + case IRNET_REQUEST_FROM: + sprintf(event, "Request from %08x (%s) behind %08x\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].saddr); + break; + case IRNET_NOANSWER_FROM: + sprintf(event, "No-answer from %08x (%s) on ppp%d\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].unit); + break; + case IRNET_BLOCKED_LINK: + sprintf(event, "Blocked link with %08x (%s) on ppp%d\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].unit); + break; + case IRNET_DISCONNECT_FROM: + sprintf(event, "Disconnection from %08x (%s) on ppp%d\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].unit); + break; + case IRNET_DISCONNECT_TO: + sprintf(event, "Disconnected to %08x (%s)\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name); + break; + default: + sprintf(event, "Bug\n"); + } + /* Increment our event index */ + ap->event_index = (ap->event_index + 1) % IRNET_MAX_EVENTS; + + DEBUG(CTRL_INFO, "Event is :%s", event); + + /* Copy it to the user */ + if(copy_to_user(buf, event, strlen(event))) + { + DERROR(CTRL_ERROR, "Invalid user space pointer.\n"); + return -EFAULT; + } + + DEXIT(CTRL_TRACE, "\n"); + return strlen(event); +} + 
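To make the control-channel protocol above concrete, here is a minimal user-space sketch (an illustration only, not part of the module): it writes a "name" command, as parsed by irnet_ctrl_write(), and then reads back one event line as formatted by irnet_ctrl_read(). The device path follows the misc registration below, the peer nickname is a placeholder, and the read buffer is deliberately larger than 64 bytes since irnet_ctrl_read() rejects shorter reads with -EOVERFLOW. With INITIAL_DISCOVERY enabled, the first reads return the current discovery log before live events.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char event[128];	/* irnet_ctrl_read() requires count >= 64 */
	ssize_t len;
	int fd = open("/dev/irnet", O_RDWR);

	if (fd < 0)
		return 1;

	/* Record the requested IrDA nickname for this IrNET instance */
	if (write(fd, "name laptop", strlen("name laptop")) < 0)
		perror("name command");

	/* Block until IrNET posts an event (discovery, connection, expiry...) */
	len = read(fd, event, sizeof(event));
	if (len > 0)
		printf("IrNET event: %.*s", (int)len, event);

	close(fd);
	return 0;
}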
+/*------------------------------------------------------------------*/ +/* + * Poll : called when someone do a select on /dev/irnet. + * Just check if there are new events... + */ +static inline unsigned int +irnet_ctrl_poll(irnet_socket * ap, + struct file * file, + poll_table * wait) +{ + unsigned int mask; + + DENTER(CTRL_TRACE, "(ap=0x%p)\n", ap); + + poll_wait(file, &irnet_events.rwait, wait); + mask = POLLOUT | POLLWRNORM; + /* If there is unread events */ + if(ap->event_index != irnet_events.index) + mask |= POLLIN | POLLRDNORM; +#ifdef INITIAL_DISCOVERY + if(ap->disco_number != -1) + { + /* Test if it's the first time and therefore we need to get the log */ + if(ap->discoveries == NULL) + irnet_get_discovery_log(ap); + /* Recheck */ + if(ap->disco_number != -1) + mask |= POLLIN | POLLRDNORM; + } +#endif /* INITIAL_DISCOVERY */ + + DEXIT(CTRL_TRACE, " - mask=0x%X\n", mask); + return mask; +} + + +/*********************** FILESYSTEM CALLBACKS ***********************/ +/* + * Implement the usual open, read, write functions that will be called + * by the file system when some action is performed on /dev/irnet. + * Most of those actions will in fact be performed by "pppd" or + * the control channel, we just act as a redirector... + */ + +/*------------------------------------------------------------------*/ +/* + * Open : when somebody open /dev/irnet + * We basically create a new instance of irnet and initialise it. + */ +static int +dev_irnet_open(struct inode * inode, + struct file * file) +{ + struct irnet_socket * ap; + int err; + + DENTER(FS_TRACE, "(file=0x%p)\n", file); + +#ifdef SECURE_DEVIRNET + /* This could (should?) be enforced by the permissions on /dev/irnet. */ + if(!capable(CAP_NET_ADMIN)) + return -EPERM; +#endif /* SECURE_DEVIRNET */ + + /* Allocate a private structure for this IrNET instance */ + ap = kzalloc(sizeof(*ap), GFP_KERNEL); + DABORT(ap == NULL, -ENOMEM, FS_ERROR, "Can't allocate struct irnet...\n"); + + /* initialize the irnet structure */ + ap->file = file; + + /* PPP channel setup */ + ap->ppp_open = 0; + ap->chan.private = ap; + ap->chan.ops = &irnet_ppp_ops; + ap->chan.mtu = (2048 - TTP_MAX_HEADER - 2 - PPP_HDRLEN); + ap->chan.hdrlen = 2 + TTP_MAX_HEADER; /* for A/C + Max IrDA hdr */ + /* PPP parameters */ + ap->mru = (2048 - TTP_MAX_HEADER - 2 - PPP_HDRLEN); + ap->xaccm[0] = ~0U; + ap->xaccm[3] = 0x60000000U; + ap->raccm = ~0U; + + /* Setup the IrDA part... 
*/ + err = irda_irnet_create(ap); + if(err) + { + DERROR(FS_ERROR, "Can't setup IrDA link...\n"); + kfree(ap); + + return err; + } + + /* For the control channel */ + ap->event_index = irnet_events.index; /* Cancel all past events */ + + mutex_init(&ap->lock); + + /* Put our stuff where we will be able to find it later */ + file->private_data = ap; + + DEXIT(FS_TRACE, " - ap=0x%p\n", ap); + + return 0; +} + + +/*------------------------------------------------------------------*/ +/* + * Close : when somebody close /dev/irnet + * Destroy the instance of /dev/irnet + */ +static int +dev_irnet_close(struct inode * inode, + struct file * file) +{ + irnet_socket * ap = file->private_data; + + DENTER(FS_TRACE, "(file=0x%p, ap=0x%p)\n", + file, ap); + DABORT(ap == NULL, 0, FS_ERROR, "ap is NULL !!!\n"); + + /* Detach ourselves */ + file->private_data = NULL; + + /* Close IrDA stuff */ + irda_irnet_destroy(ap); + + /* Disconnect from the generic PPP layer if not already done */ + if(ap->ppp_open) + { + DERROR(FS_ERROR, "Channel still registered - deregistering !\n"); + ap->ppp_open = 0; + ppp_unregister_channel(&ap->chan); + } + + kfree(ap); + + DEXIT(FS_TRACE, "\n"); + return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Write does nothing. + * (we receive packet from ppp_generic through ppp_irnet_send()) + */ +static ssize_t +dev_irnet_write(struct file * file, + const char __user *buf, + size_t count, + loff_t * ppos) +{ + irnet_socket * ap = file->private_data; + + DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n", + file, ap, count); + DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n"); + + /* If we are connected to ppp_generic, let it handle the job */ + if(ap->ppp_open) + return -EAGAIN; + else + return irnet_ctrl_write(ap, buf, count); +} + +/*------------------------------------------------------------------*/ +/* + * Read doesn't do much either. + * (pppd poll us, but ultimately reads through /dev/ppp) + */ +static ssize_t +dev_irnet_read(struct file * file, + char __user * buf, + size_t count, + loff_t * ppos) +{ + irnet_socket * ap = file->private_data; + + DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n", + file, ap, count); + DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n"); + + /* If we are connected to ppp_generic, let it handle the job */ + if(ap->ppp_open) + return -EAGAIN; + else + return irnet_ctrl_read(ap, file, buf, count); +} + +/*------------------------------------------------------------------*/ +/* + * Poll : called when someone do a select on /dev/irnet + */ +static unsigned int +dev_irnet_poll(struct file * file, + poll_table * wait) +{ + irnet_socket * ap = file->private_data; + unsigned int mask; + + DENTER(FS_TRACE, "(file=0x%p, ap=0x%p)\n", + file, ap); + + mask = POLLOUT | POLLWRNORM; + DABORT(ap == NULL, mask, FS_ERROR, "ap is NULL !!!\n"); + + /* If we are connected to ppp_generic, let it handle the job */ + if(!ap->ppp_open) + mask |= irnet_ctrl_poll(ap, file, wait); + + DEXIT(FS_TRACE, " - mask=0x%X\n", mask); + return mask; +} + +/*------------------------------------------------------------------*/ +/* + * IOCtl : Called when someone does some ioctls on /dev/irnet + * This is the way pppd configure us and control us while the PPP + * instance is active. 
+ */ +static long +dev_irnet_ioctl( + struct file * file, + unsigned int cmd, + unsigned long arg) +{ + irnet_socket * ap = file->private_data; + int err; + int val; + void __user *argp = (void __user *)arg; + + DENTER(FS_TRACE, "(file=0x%p, ap=0x%p, cmd=0x%X)\n", + file, ap, cmd); + + /* Basic checks... */ + DASSERT(ap != NULL, -ENXIO, PPP_ERROR, "ap is NULL...\n"); +#ifdef SECURE_DEVIRNET + if(!capable(CAP_NET_ADMIN)) + return -EPERM; +#endif /* SECURE_DEVIRNET */ + + err = -EFAULT; + switch(cmd) + { + /* Set discipline (should be N_SYNC_PPP or N_TTY) */ + case TIOCSETD: + if(get_user(val, (int __user *)argp)) + break; + if((val == N_SYNC_PPP) || (val == N_PPP)) + { + DEBUG(FS_INFO, "Entering PPP discipline.\n"); + /* PPP channel setup (ap->chan in configured in dev_irnet_open())*/ + if (mutex_lock_interruptible(&ap->lock)) + return -EINTR; + + err = ppp_register_channel(&ap->chan); + if(err == 0) + { + /* Our ppp side is active */ + ap->ppp_open = 1; + + DEBUG(FS_INFO, "Trying to establish a connection.\n"); + /* Setup the IrDA link now - may fail... */ + irda_irnet_connect(ap); + } + else + DERROR(FS_ERROR, "Can't setup PPP channel...\n"); + + mutex_unlock(&ap->lock); + } + else + { + /* In theory, should be N_TTY */ + DEBUG(FS_INFO, "Exiting PPP discipline.\n"); + /* Disconnect from the generic PPP layer */ + if (mutex_lock_interruptible(&ap->lock)) + return -EINTR; + + if(ap->ppp_open) + { + ap->ppp_open = 0; + ppp_unregister_channel(&ap->chan); + } + else + DERROR(FS_ERROR, "Channel not registered !\n"); + err = 0; + + mutex_unlock(&ap->lock); + } + break; + + /* Query PPP channel and unit number */ + case PPPIOCGCHAN: + if (mutex_lock_interruptible(&ap->lock)) + return -EINTR; + + if(ap->ppp_open && !put_user(ppp_channel_index(&ap->chan), + (int __user *)argp)) + err = 0; + + mutex_unlock(&ap->lock); + break; + case PPPIOCGUNIT: + if (mutex_lock_interruptible(&ap->lock)) + return -EINTR; + + if(ap->ppp_open && !put_user(ppp_unit_number(&ap->chan), + (int __user *)argp)) + err = 0; + + mutex_unlock(&ap->lock); + break; + + /* All these ioctls can be passed both directly and from ppp_generic, + * so we just deal with them in one place... 
+ */ + case PPPIOCGFLAGS: + case PPPIOCSFLAGS: + case PPPIOCGASYNCMAP: + case PPPIOCSASYNCMAP: + case PPPIOCGRASYNCMAP: + case PPPIOCSRASYNCMAP: + case PPPIOCGXASYNCMAP: + case PPPIOCSXASYNCMAP: + case PPPIOCGMRU: + case PPPIOCSMRU: + DEBUG(FS_INFO, "Standard PPP ioctl.\n"); + if(!capable(CAP_NET_ADMIN)) + err = -EPERM; + else { + if (mutex_lock_interruptible(&ap->lock)) + return -EINTR; + + err = ppp_irnet_ioctl(&ap->chan, cmd, arg); + + mutex_unlock(&ap->lock); + } + break; + + /* TTY IOCTLs : Pretend that we are a tty, to keep pppd happy */ + /* Get termios */ + case TCGETS: + DEBUG(FS_INFO, "Get termios.\n"); + if (mutex_lock_interruptible(&ap->lock)) + return -EINTR; + +#ifndef TCGETS2 + if(!kernel_termios_to_user_termios((struct termios __user *)argp, &ap->termios)) + err = 0; +#else + if(kernel_termios_to_user_termios_1((struct termios __user *)argp, &ap->termios)) + err = 0; +#endif + + mutex_unlock(&ap->lock); + break; + /* Set termios */ + case TCSETSF: + DEBUG(FS_INFO, "Set termios.\n"); + if (mutex_lock_interruptible(&ap->lock)) + return -EINTR; + +#ifndef TCGETS2 + if(!user_termios_to_kernel_termios(&ap->termios, (struct termios __user *)argp)) + err = 0; +#else + if(!user_termios_to_kernel_termios_1(&ap->termios, (struct termios __user *)argp)) + err = 0; +#endif + + mutex_unlock(&ap->lock); + break; + + /* Set DTR/RTS */ + case TIOCMBIS: + case TIOCMBIC: + /* Set exclusive/non-exclusive mode */ + case TIOCEXCL: + case TIOCNXCL: + DEBUG(FS_INFO, "TTY compatibility.\n"); + err = 0; + break; + + case TCGETA: + DEBUG(FS_INFO, "TCGETA\n"); + break; + + case TCFLSH: + DEBUG(FS_INFO, "TCFLSH\n"); + /* Note : this will flush buffers in PPP, so it *must* be done + * We should also worry that we don't accept junk here and that + * we get rid of our own buffers */ +#ifdef FLUSH_TO_PPP + if (mutex_lock_interruptible(&ap->lock)) + return -EINTR; + ppp_output_wakeup(&ap->chan); + mutex_unlock(&ap->lock); +#endif /* FLUSH_TO_PPP */ + err = 0; + break; + + case FIONREAD: + DEBUG(FS_INFO, "FIONREAD\n"); + val = 0; + if(put_user(val, (int __user *)argp)) + break; + err = 0; + break; + + default: + DERROR(FS_ERROR, "Unsupported ioctl (0x%X)\n", cmd); + err = -ENOTTY; + } + + DEXIT(FS_TRACE, " - err = 0x%X\n", err); + return err; +} + +/************************** PPP CALLBACKS **************************/ +/* + * This are the functions that the generic PPP driver in the kernel + * will call to communicate to us. + */ + +/*------------------------------------------------------------------*/ +/* + * Prepare the ppp frame for transmission over the IrDA socket. + * We make sure that the header space is enough, and we change ppp header + * according to flags passed by pppd. + * This is not a callback, but just a helper function used in ppp_irnet_send() + */ +static inline struct sk_buff * +irnet_prepare_skb(irnet_socket * ap, + struct sk_buff * skb) +{ + unsigned char * data; + int proto; /* PPP protocol */ + int islcp; /* Protocol == LCP */ + int needaddr; /* Need PPP address */ + + DENTER(PPP_TRACE, "(ap=0x%p, skb=0x%p)\n", + ap, skb); + + /* Extract PPP protocol from the frame */ + data = skb->data; + proto = (data[0] << 8) + data[1]; + + /* LCP packets with codes between 1 (configure-request) + * and 7 (code-reject) must be sent as though no options + * have been negotiated. 
*/ + islcp = (proto == PPP_LCP) && (1 <= data[2]) && (data[2] <= 7); + + /* compress protocol field if option enabled */ + if((data[0] == 0) && (ap->flags & SC_COMP_PROT) && (!islcp)) + skb_pull(skb,1); + + /* Check if we need address/control fields */ + needaddr = 2*((ap->flags & SC_COMP_AC) == 0 || islcp); + + /* Is the skb headroom large enough to contain all IrDA-headers? */ + if((skb_headroom(skb) < (ap->max_header_size + needaddr)) || + (skb_shared(skb))) + { + struct sk_buff * new_skb; + + DEBUG(PPP_INFO, "Reallocating skb\n"); + + /* Create a new skb */ + new_skb = skb_realloc_headroom(skb, ap->max_header_size + needaddr); + + /* We have to free the original skb anyway */ + dev_kfree_skb(skb); + + /* Did the realloc succeed ? */ + DABORT(new_skb == NULL, NULL, PPP_ERROR, "Could not realloc skb\n"); + + /* Use the new skb instead */ + skb = new_skb; + } + + /* prepend address/control fields if necessary */ + if(needaddr) + { + skb_push(skb, 2); + skb->data[0] = PPP_ALLSTATIONS; + skb->data[1] = PPP_UI; + } + + DEXIT(PPP_TRACE, "\n"); + + return skb; +} + +/*------------------------------------------------------------------*/ +/* + * Send a packet to the peer over the IrTTP connection. + * Returns 1 iff the packet was accepted. + * Returns 0 iff packet was not consumed. + * If the packet was not accepted, we will call ppp_output_wakeup + * at some later time to reactivate flow control in ppp_generic. + */ +static int +ppp_irnet_send(struct ppp_channel * chan, + struct sk_buff * skb) +{ + irnet_socket * self = (struct irnet_socket *) chan->private; + int ret; + + DENTER(PPP_TRACE, "(channel=0x%p, ap/self=0x%p)\n", + chan, self); + + /* Check if things are somewhat valid... */ + DASSERT(self != NULL, 0, PPP_ERROR, "Self is NULL !!!\n"); + + /* Check if we are connected */ + if(!(test_bit(0, &self->ttp_open))) + { +#ifdef CONNECT_IN_SEND + /* Let's try to connect one more time... */ + /* Note : we won't be connected after this call, but we should be + * ready for next packet... */ + /* If we are already connecting, this will fail */ + irda_irnet_connect(self); +#endif /* CONNECT_IN_SEND */ + + DEBUG(PPP_INFO, "IrTTP not ready ! (%ld-%ld)\n", + self->ttp_open, self->ttp_connect); + + /* Note : we can either drop the packet or block the packet. + * + * Blocking the packet allow us a better connection time, + * because by calling ppp_output_wakeup() we can have + * ppp_generic resending the LCP request immediately to us, + * rather than waiting for one of pppd periodic transmission of + * LCP request. + * + * On the other hand, if we block all packet, all those periodic + * transmissions of pppd accumulate in ppp_generic, creating a + * backlog of LCP request. When we eventually connect later on, + * we have to transmit all this backlog before we can connect + * proper (if we don't timeout before). + * + * The current strategy is as follow : + * While we are attempting to connect, we block packets to get + * a better connection time. 
+ * If we fail to connect, we drain the queue and start dropping packets + */ +#ifdef BLOCK_WHEN_CONNECT + /* If we are attempting to connect */ + if(test_bit(0, &self->ttp_connect)) + { + /* Blocking packet, ppp_generic will retry later */ + return 0; + } +#endif /* BLOCK_WHEN_CONNECT */ + + /* Dropping packet, pppd will retry later */ + dev_kfree_skb(skb); + return 1; + } + + /* Check if the queue can accept any packet, otherwise block */ + if(self->tx_flow != FLOW_START) + DRETURN(0, PPP_INFO, "IrTTP queue full (%d skbs)...\n", + skb_queue_len(&self->tsap->tx_queue)); + + /* Prepare ppp frame for transmission */ + skb = irnet_prepare_skb(self, skb); + DABORT(skb == NULL, 1, PPP_ERROR, "Prepare skb for Tx failed.\n"); + + /* Send the packet to IrTTP */ + ret = irttp_data_request(self->tsap, skb); + if(ret < 0) + { + /* + * > IrTTPs tx queue is full, so we just have to + * > drop the frame! You might think that we should + * > just return -1 and don't deallocate the frame, + * > but that is dangerous since it's possible that + * > we have replaced the original skb with a new + * > one with larger headroom, and that would really + * > confuse do_dev_queue_xmit() in dev.c! I have + * > tried :-) DB + * Correction : we verify the flow control above (self->tx_flow), + * so we come here only if IrTTP doesn't like the packet (empty, + * too large, IrTTP not connected). In those rare cases, it's ok + * to drop it, we don't want to see it here again... + * Jean II + */ + DERROR(PPP_ERROR, "IrTTP doesn't like this packet !!! (0x%X)\n", ret); + /* irttp_data_request already free the packet */ + } + + DEXIT(PPP_TRACE, "\n"); + return 1; /* Packet has been consumed */ +} + +/*------------------------------------------------------------------*/ +/* + * Take care of the ioctls that ppp_generic doesn't want to deal with... + * Note : we are also called from dev_irnet_ioctl(). + */ +static int +ppp_irnet_ioctl(struct ppp_channel * chan, + unsigned int cmd, + unsigned long arg) +{ + irnet_socket * ap = (struct irnet_socket *) chan->private; + int err; + int val; + u32 accm[8]; + void __user *argp = (void __user *)arg; + + DENTER(PPP_TRACE, "(channel=0x%p, ap=0x%p, cmd=0x%X)\n", + chan, ap, cmd); + + /* Basic checks... 
*/ + DASSERT(ap != NULL, -ENXIO, PPP_ERROR, "ap is NULL...\n"); + + err = -EFAULT; + switch(cmd) + { + /* PPP flags */ + case PPPIOCGFLAGS: + val = ap->flags | ap->rbits; + if(put_user(val, (int __user *) argp)) + break; + err = 0; + break; + case PPPIOCSFLAGS: + if(get_user(val, (int __user *) argp)) + break; + ap->flags = val & ~SC_RCV_BITS; + ap->rbits = val & SC_RCV_BITS; + err = 0; + break; + + /* Async map stuff - all dummy to please pppd */ + case PPPIOCGASYNCMAP: + if(put_user(ap->xaccm[0], (u32 __user *) argp)) + break; + err = 0; + break; + case PPPIOCSASYNCMAP: + if(get_user(ap->xaccm[0], (u32 __user *) argp)) + break; + err = 0; + break; + case PPPIOCGRASYNCMAP: + if(put_user(ap->raccm, (u32 __user *) argp)) + break; + err = 0; + break; + case PPPIOCSRASYNCMAP: + if(get_user(ap->raccm, (u32 __user *) argp)) + break; + err = 0; + break; + case PPPIOCGXASYNCMAP: + if(copy_to_user(argp, ap->xaccm, sizeof(ap->xaccm))) + break; + err = 0; + break; + case PPPIOCSXASYNCMAP: + if(copy_from_user(accm, argp, sizeof(accm))) + break; + accm[2] &= ~0x40000000U; /* can't escape 0x5e */ + accm[3] |= 0x60000000U; /* must escape 0x7d, 0x7e */ + memcpy(ap->xaccm, accm, sizeof(ap->xaccm)); + err = 0; + break; + + /* Max PPP frame size */ + case PPPIOCGMRU: + if(put_user(ap->mru, (int __user *) argp)) + break; + err = 0; + break; + case PPPIOCSMRU: + if(get_user(val, (int __user *) argp)) + break; + if(val < PPP_MRU) + val = PPP_MRU; + ap->mru = val; + err = 0; + break; + + default: + DEBUG(PPP_INFO, "Unsupported ioctl (0x%X)\n", cmd); + err = -ENOIOCTLCMD; + } + + DEXIT(PPP_TRACE, " - err = 0x%X\n", err); + return err; +} + +/************************** INITIALISATION **************************/ +/* + * Module initialisation and all that jazz... + */ + +/*------------------------------------------------------------------*/ +/* + * Hook our device callbacks in the filesystem, to connect our code + * to /dev/irnet + */ +static inline int __init +ppp_irnet_init(void) +{ + int err = 0; + + DENTER(MODULE_TRACE, "()\n"); + + /* Allocate ourselves as a minor in the misc range */ + err = misc_register(&irnet_misc_device); + + DEXIT(MODULE_TRACE, "\n"); + return err; +} + +/*------------------------------------------------------------------*/ +/* + * Cleanup at exit... + */ +static inline void __exit +ppp_irnet_cleanup(void) +{ + DENTER(MODULE_TRACE, "()\n"); + + /* De-allocate /dev/irnet minor in misc range */ + misc_deregister(&irnet_misc_device); + + DEXIT(MODULE_TRACE, "\n"); +} + +/*------------------------------------------------------------------*/ +/* + * Module main entry point + */ +static int __init +irnet_init(void) +{ + int err; + + /* Initialise both parts... */ + err = irda_irnet_init(); + if(!err) + err = ppp_irnet_init(); + return err; +} + +/*------------------------------------------------------------------*/ +/* + * Module exit + */ +static void __exit +irnet_cleanup(void) +{ + irda_irnet_cleanup(); + ppp_irnet_cleanup(); +} + +/*------------------------------------------------------------------*/ +/* + * Module magic + */ +module_init(irnet_init); +module_exit(irnet_cleanup); +MODULE_AUTHOR("Jean Tourrilhes "); +MODULE_DESCRIPTION("IrNET : Synchronous PPP over IrDA"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_CHARDEV(10, 187); diff --git a/net/irda/irnet/irnet_ppp.h b/net/irda/irnet/irnet_ppp.h new file mode 100644 index 00000000..94022586 --- /dev/null +++ b/net/irda/irnet/irnet_ppp.h @@ -0,0 +1,119 @@ +/* + * IrNET protocol module : Synchronous PPP over an IrDA socket. 
+ * + * Jean II - HPL `00 - + * + * This file contains all definitions and declarations necessary for the + * PPP part of the IrNET module. + * This file is a private header, so other modules don't want to know + * what's in there... + */ + +#ifndef IRNET_PPP_H +#define IRNET_PPP_H + +/***************************** INCLUDES *****************************/ + +#include "irnet.h" /* Module global include */ + +/************************ CONSTANTS & MACROS ************************/ + +/* /dev/irnet file constants */ +#define IRNET_MAJOR 10 /* Misc range */ +#define IRNET_MINOR 187 /* Official allocation */ + +/* IrNET control channel stuff */ +#define IRNET_MAX_COMMAND 256 /* Max length of a command line */ + +/* PPP hardcore stuff */ + +/* Bits in rbits (PPP flags in irnet struct) */ +#define SC_RCV_BITS (SC_RCV_B7_1|SC_RCV_B7_0|SC_RCV_ODDP|SC_RCV_EVNP) + +/* Bit numbers in busy */ +#define XMIT_BUSY 0 +#define RECV_BUSY 1 +#define XMIT_WAKEUP 2 +#define XMIT_FULL 3 + +/* Queue management */ +#define PPPSYNC_MAX_RQLEN 32 /* arbitrary */ + +/****************************** TYPES ******************************/ + + +/**************************** PROTOTYPES ****************************/ + +/* ----------------------- CONTROL CHANNEL ----------------------- */ +static inline ssize_t + irnet_ctrl_write(irnet_socket *, + const char *, + size_t); +static inline ssize_t + irnet_ctrl_read(irnet_socket *, + struct file *, + char *, + size_t); +static inline unsigned int + irnet_ctrl_poll(irnet_socket *, + struct file *, + poll_table *); +/* ----------------------- CHARACTER DEVICE ----------------------- */ +static int + dev_irnet_open(struct inode *, /* fs callback : open */ + struct file *), + dev_irnet_close(struct inode *, + struct file *); +static ssize_t + dev_irnet_write(struct file *, + const char __user *, + size_t, + loff_t *), + dev_irnet_read(struct file *, + char __user *, + size_t, + loff_t *); +static unsigned int + dev_irnet_poll(struct file *, + poll_table *); +static long + dev_irnet_ioctl(struct file *, + unsigned int, + unsigned long); +/* ------------------------ PPP INTERFACE ------------------------ */ +static inline struct sk_buff * + irnet_prepare_skb(irnet_socket *, + struct sk_buff *); +static int + ppp_irnet_send(struct ppp_channel *, + struct sk_buff *); +static int + ppp_irnet_ioctl(struct ppp_channel *, + unsigned int, + unsigned long); + +/**************************** VARIABLES ****************************/ + +/* Filesystem callbacks (to call us) */ +static const struct file_operations irnet_device_fops = +{ + .owner = THIS_MODULE, + .read = dev_irnet_read, + .write = dev_irnet_write, + .poll = dev_irnet_poll, + .unlocked_ioctl = dev_irnet_ioctl, + .open = dev_irnet_open, + .release = dev_irnet_close, + .llseek = noop_llseek, + /* Also : llseek, readdir, mmap, flush, fsync, fasync, lock, readv, writev */ +}; + +/* Structure so that the misc major (drivers/char/misc.c) take care of us... */ +static struct miscdevice irnet_misc_device = +{ + IRNET_MINOR, + "irnet", + &irnet_device_fops +}; + +#endif /* IRNET_PPP_H */ diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c new file mode 100644 index 00000000..6c7c4b92 --- /dev/null +++ b/net/irda/irnetlink.c @@ -0,0 +1,159 @@ +/* + * IrDA netlink layer, for stack configuration. 
+ * + * Copyright (c) 2007 Samuel Ortiz + * + * Partly based on the 802.11 nelink implementation + * (see net/wireless/nl80211.c) which is: + * Copyright 2006 Johannes Berg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + + + +static struct genl_family irda_nl_family = { + .id = GENL_ID_GENERATE, + .name = IRDA_NL_NAME, + .hdrsize = 0, + .version = IRDA_NL_VERSION, + .maxattr = IRDA_NL_CMD_MAX, +}; + +static struct net_device * ifname_to_netdev(struct net *net, struct genl_info *info) +{ + char * ifname; + + if (!info->attrs[IRDA_NL_ATTR_IFNAME]) + return NULL; + + ifname = nla_data(info->attrs[IRDA_NL_ATTR_IFNAME]); + + IRDA_DEBUG(5, "%s(): Looking for %s\n", __func__, ifname); + + return dev_get_by_name(net, ifname); +} + +static int irda_nl_set_mode(struct sk_buff *skb, struct genl_info *info) +{ + struct net_device * dev; + struct irlap_cb * irlap; + u32 mode; + + if (!info->attrs[IRDA_NL_ATTR_MODE]) + return -EINVAL; + + mode = nla_get_u32(info->attrs[IRDA_NL_ATTR_MODE]); + + IRDA_DEBUG(5, "%s(): Switching to mode: %d\n", __func__, mode); + + dev = ifname_to_netdev(&init_net, info); + if (!dev) + return -ENODEV; + + irlap = (struct irlap_cb *)dev->atalk_ptr; + if (!irlap) { + dev_put(dev); + return -ENODEV; + } + + irlap->mode = mode; + + dev_put(dev); + + return 0; +} + +static int irda_nl_get_mode(struct sk_buff *skb, struct genl_info *info) +{ + struct net_device * dev; + struct irlap_cb * irlap; + struct sk_buff *msg; + void *hdr; + int ret = -ENOBUFS; + + dev = ifname_to_netdev(&init_net, info); + if (!dev) + return -ENODEV; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + dev_put(dev); + return -ENOMEM; + } + + irlap = (struct irlap_cb *)dev->atalk_ptr; + if (!irlap) { + ret = -ENODEV; + goto err_out; + } + + hdr = genlmsg_put(msg, info->snd_pid, info->snd_seq, + &irda_nl_family, 0, IRDA_NL_CMD_GET_MODE); + if (hdr == NULL) { + ret = -EMSGSIZE; + goto err_out; + } + + if(nla_put_string(msg, IRDA_NL_ATTR_IFNAME, + dev->name)) + goto err_out; + + if(nla_put_u32(msg, IRDA_NL_ATTR_MODE, irlap->mode)) + goto err_out; + + genlmsg_end(msg, hdr); + + return genlmsg_reply(msg, info); + + err_out: + nlmsg_free(msg); + dev_put(dev); + + return ret; +} + +static const struct nla_policy irda_nl_policy[IRDA_NL_ATTR_MAX + 1] = { + [IRDA_NL_ATTR_IFNAME] = { .type = NLA_NUL_STRING, + .len = IFNAMSIZ-1 }, + [IRDA_NL_ATTR_MODE] = { .type = NLA_U32 }, +}; + +static struct genl_ops irda_nl_ops[] = { + { + .cmd = IRDA_NL_CMD_SET_MODE, + .doit = irda_nl_set_mode, + .policy = irda_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = IRDA_NL_CMD_GET_MODE, + .doit = irda_nl_get_mode, + .policy = irda_nl_policy, + /* can be retrieved by unprivileged users */ + }, + +}; + +int irda_nl_register(void) +{ + return genl_register_family_with_ops(&irda_nl_family, + irda_nl_ops, ARRAY_SIZE(irda_nl_ops)); +} + +void irda_nl_unregister(void) +{ + genl_unregister_family(&irda_nl_family); +} diff --git a/net/irda/irproc.c b/net/irda/irproc.c new file mode 100644 index 00000000..b9ac598e --- /dev/null +++ b/net/irda/irproc.c @@ -0,0 +1,97 @@ +/********************************************************************* + * + * Filename: irproc.c + * Version: 1.0 + * Description: Various entries in the /proc file system + * Status: Experimental. 
+ * Author: Thomas Davis, + * Created at: Sat Feb 21 21:33:24 1998 + * Modified at: Sun Nov 14 08:54:54 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999, Dag Brattli + * Copyright (c) 1998, Thomas Davis, , + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * I, Thomas Davis, provide no warranty for any of this software. + * This material is provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +extern const struct file_operations discovery_seq_fops; +extern const struct file_operations irlap_seq_fops; +extern const struct file_operations irlmp_seq_fops; +extern const struct file_operations irttp_seq_fops; +extern const struct file_operations irias_seq_fops; + +struct irda_entry { + const char *name; + const struct file_operations *fops; +}; + +struct proc_dir_entry *proc_irda; +EXPORT_SYMBOL(proc_irda); + +static const struct irda_entry irda_dirs[] = { + {"discovery", &discovery_seq_fops}, + {"irttp", &irttp_seq_fops}, + {"irlmp", &irlmp_seq_fops}, + {"irlap", &irlap_seq_fops}, + {"irias", &irias_seq_fops}, +}; + +/* + * Function irda_proc_register (void) + * + * Register irda entry in /proc file system + * + */ +void __init irda_proc_register(void) +{ + int i; + + proc_irda = proc_mkdir("irda", init_net.proc_net); + if (proc_irda == NULL) + return; + + for (i = 0; i < ARRAY_SIZE(irda_dirs); i++) + (void) proc_create(irda_dirs[i].name, 0, proc_irda, + irda_dirs[i].fops); +} + +/* + * Function irda_proc_unregister (void) + * + * Unregister irda entry in /proc file system + * + */ +void irda_proc_unregister(void) +{ + int i; + + if (proc_irda) { + for (i=0; i + * Created at: Tue Jun 9 13:29:31 1998 + * Modified at: Sun Dec 12 13:48:22 1999 + * Modified by: Dag Brattli + * Modified at: Thu Jan 4 14:29:10 CET 2001 + * Modified by: Marc Zyngier + * + * Copyright (C) 1998-1999, Aage Kvalnes + * Copyright (C) 1998, Dag Brattli, + * All Rights Reserved. + * + * This code is taken from the Vortex Operating System written by Aage + * Kvalnes. Aage has agreed that this code can use the GPL licence, + * although he does not use that licence in his own code. + * + * This copyright does however _not_ include the ELF hash() function + * which I currently don't know which licence or copyright it + * has. Please inform me if you know. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
+ * + ********************************************************************/ + +/* + * NOTE : + * There are various problems with this package : + * o the hash function for ints is pathetic (but could be changed) + * o locking is sometime suspicious (especially during enumeration) + * o most users have only a few elements (== overhead) + * o most users never use search, so don't benefit from hashing + * Problem already fixed : + * o not 64 bit compliant (most users do hashv = (int) self) + * o hashbin_remove() is broken => use hashbin_remove_this() + * I think most users would be better served by a simple linked list + * (like include/linux/list.h) with a global spinlock per list. + * Jean II + */ + +/* + * Notes on the concurrent access to hashbin and other SMP issues + * ------------------------------------------------------------- + * Hashbins are very often in the IrDA stack a global repository of + * information, and therefore used in a very asynchronous manner following + * various events (driver calls, timers, user calls...). + * Therefore, very often it is highly important to consider the + * management of concurrent access to the hashbin and how to guarantee the + * consistency of the operations on it. + * + * First, we need to define the objective of locking : + * 1) Protect user data (content pointed by the hashbin) + * 2) Protect hashbin structure itself (linked list in each bin) + * + * OLD LOCKING + * ----------- + * + * The previous locking strategy, either HB_LOCAL or HB_GLOBAL were + * both inadequate in *both* aspect. + * o HB_GLOBAL was using a spinlock for each bin (local locking). + * o HB_LOCAL was disabling irq on *all* CPUs, so use a single + * global semaphore. + * The problems were : + * A) Global irq disabling is no longer supported by the kernel + * B) No protection for the hashbin struct global data + * o hashbin_delete() + * o hb_current + * C) No protection for user data in some cases + * + * A) HB_LOCAL use global irq disabling, so doesn't work on kernel + * 2.5.X. Even when it is supported (kernel 2.4.X and earlier), its + * performance is not satisfactory on SMP setups. Most hashbins were + * HB_LOCAL, so (A) definitely need fixing. + * B) HB_LOCAL could be modified to fix (B). However, because HB_GLOBAL + * lock only the individual bins, it will never be able to lock the + * global data, so can't do (B). + * C) Some functions return pointer to data that is still in the + * hashbin : + * o hashbin_find() + * o hashbin_get_first() + * o hashbin_get_next() + * As the data is still in the hashbin, it may be changed or free'd + * while the caller is examinimg the data. In those case, locking can't + * be done within the hashbin, but must include use of the data within + * the caller. + * The caller can easily do this with HB_LOCAL (just disable irqs). + * However, this is impossible with HB_GLOBAL because the caller has no + * way to know the proper bin, so don't know which spinlock to use. + * + * Quick summary : can no longer use HB_LOCAL, and HB_GLOBAL is + * fundamentally broken and will never work. + * + * NEW LOCKING + * ----------- + * + * To fix those problems, I've introduce a few changes in the + * hashbin locking : + * 1) New HB_LOCK scheme + * 2) hashbin->hb_spinlock + * 3) New hashbin usage policy + * + * HB_LOCK : + * ------- + * HB_LOCK is a locking scheme intermediate between the old HB_LOCAL + * and HB_GLOBAL. It uses a single spinlock to protect the whole content + * of the hashbin. 
As it is a single spinlock, it can protect the global + * data of the hashbin and not only the bins themselves. + * HB_LOCK can only protect some of the hashbin calls, so it only lock + * call that can be made 100% safe and leave other call unprotected. + * HB_LOCK in theory is slower than HB_GLOBAL, but as the hashbin + * content is always small contention is not high, so it doesn't matter + * much. HB_LOCK is probably faster than HB_LOCAL. + * + * hashbin->hb_spinlock : + * -------------------- + * The spinlock that HB_LOCK uses is available for caller, so that + * the caller can protect unprotected calls (see below). + * If the caller want to do entirely its own locking (HB_NOLOCK), he + * can do so and may use safely this spinlock. + * Locking is done like this : + * spin_lock_irqsave(&hashbin->hb_spinlock, flags); + * Releasing the lock : + * spin_unlock_irqrestore(&hashbin->hb_spinlock, flags); + * + * Safe & Protected calls : + * ---------------------- + * The following calls are safe or protected via HB_LOCK : + * o hashbin_new() -> safe + * o hashbin_delete() + * o hashbin_insert() + * o hashbin_remove_first() + * o hashbin_remove() + * o hashbin_remove_this() + * o HASHBIN_GET_SIZE() -> atomic + * + * The following calls only protect the hashbin itself : + * o hashbin_lock_find() + * o hashbin_find_next() + * + * Unprotected calls : + * ----------------- + * The following calls need to be protected by the caller : + * o hashbin_find() + * o hashbin_get_first() + * o hashbin_get_next() + * + * Locking Policy : + * -------------- + * If the hashbin is used only in a single thread of execution + * (explicitly or implicitely), you can use HB_NOLOCK + * If the calling module already provide concurrent access protection, + * you may use HB_NOLOCK. + * + * In all other cases, you need to use HB_LOCK and lock the hashbin + * every time before calling one of the unprotected calls. You also must + * use the pointer returned by the unprotected call within the locked + * region. + * + * Extra care for enumeration : + * -------------------------- + * hashbin_get_first() and hashbin_get_next() use the hashbin to + * store the current position, in hb_current. + * As long as the hashbin remains locked, this is safe. If you unlock + * the hashbin, the current position may change if anybody else modify + * or enumerate the hashbin. + * Summary : do the full enumeration while locked. + * + * Alternatively, you may use hashbin_find_next(). But, this will + * be slower, is more complex to use and doesn't protect the hashbin + * content. So, care is needed here as well. + * + * Other issues : + * ------------ + * I believe that we are overdoing it by using spin_lock_irqsave() + * and we should use only spin_lock_bh() or similar. But, I don't have + * the balls to try it out. + * Don't believe that because hashbin are now (somewhat) SMP safe + * that the rest of the code is. Higher layers tend to be safest, + * but LAP and LMP would need some serious dedicated love. + * + * Jean II + */ +#include +#include + +#include +#include + +/************************ QUEUE SUBROUTINES ************************/ + +/* + * Hashbin + */ +#define GET_HASHBIN(x) ( x & HASHBIN_MASK ) + +/* + * Function hash (name) + * + * This function hash the input string 'name' using the ELF hash + * function for strings. 
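+ * The accumulator is shifted up four bits per character; any bits that
+ * reach the top nibble are XORed back in and then cleared, which is the
+ * classic ELF/PJW string hash.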
+ */ +static __u32 hash( const char* name) +{ + __u32 h = 0; + __u32 g; + + while(*name) { + h = (h<<4) + *name++; + if ((g = (h & 0xf0000000))) + h ^=g>>24; + h &=~g; + } + return h; +} + +/* + * Function enqueue_first (queue, proc) + * + * Insert item first in queue. + * + */ +static void enqueue_first(irda_queue_t **queue, irda_queue_t* element) +{ + + IRDA_DEBUG( 4, "%s()\n", __func__); + + /* + * Check if queue is empty. + */ + if ( *queue == NULL ) { + /* + * Queue is empty. Insert one element into the queue. + */ + element->q_next = element->q_prev = *queue = element; + + } else { + /* + * Queue is not empty. Insert element into front of queue. + */ + element->q_next = (*queue); + (*queue)->q_prev->q_next = element; + element->q_prev = (*queue)->q_prev; + (*queue)->q_prev = element; + (*queue) = element; + } +} + + +/* + * Function dequeue (queue) + * + * Remove first entry in queue + * + */ +static irda_queue_t *dequeue_first(irda_queue_t **queue) +{ + irda_queue_t *ret; + + IRDA_DEBUG( 4, "dequeue_first()\n"); + + /* + * Set return value + */ + ret = *queue; + + if ( *queue == NULL ) { + /* + * Queue was empty. + */ + } else if ( (*queue)->q_next == *queue ) { + /* + * Queue only contained a single element. It will now be + * empty. + */ + *queue = NULL; + } else { + /* + * Queue contained several element. Remove the first one. + */ + (*queue)->q_prev->q_next = (*queue)->q_next; + (*queue)->q_next->q_prev = (*queue)->q_prev; + *queue = (*queue)->q_next; + } + + /* + * Return the removed entry (or NULL of queue was empty). + */ + return ret; +} + +/* + * Function dequeue_general (queue, element) + * + * + */ +static irda_queue_t *dequeue_general(irda_queue_t **queue, irda_queue_t* element) +{ + irda_queue_t *ret; + + IRDA_DEBUG( 4, "dequeue_general()\n"); + + /* + * Set return value + */ + ret = *queue; + + if ( *queue == NULL ) { + /* + * Queue was empty. + */ + } else if ( (*queue)->q_next == *queue ) { + /* + * Queue only contained a single element. It will now be + * empty. + */ + *queue = NULL; + + } else { + /* + * Remove specific element. + */ + element->q_prev->q_next = element->q_next; + element->q_next->q_prev = element->q_prev; + if ( (*queue) == element) + (*queue) = element->q_next; + } + + /* + * Return the removed entry (or NULL of queue was empty). + */ + return ret; +} + +/************************ HASHBIN MANAGEMENT ************************/ + +/* + * Function hashbin_create ( type, name ) + * + * Create hashbin! + * + */ +hashbin_t *hashbin_new(int type) +{ + hashbin_t* hashbin; + + /* + * Allocate new hashbin + */ + hashbin = kzalloc(sizeof(*hashbin), GFP_ATOMIC); + if (!hashbin) + return NULL; + + /* + * Initialize structure + */ + hashbin->hb_type = type; + hashbin->magic = HB_MAGIC; + //hashbin->hb_current = NULL; + + /* Make sure all spinlock's are unlocked */ + if ( hashbin->hb_type & HB_LOCK ) { + spin_lock_init(&hashbin->hb_spinlock); + } + + return hashbin; +} +EXPORT_SYMBOL(hashbin_new); + + +/* + * Function hashbin_delete (hashbin, free_func) + * + * Destroy hashbin, the free_func can be a user supplied special routine + * for deallocating this structure if it's complex. If not the user can + * just supply kfree, which should take care of the job. 
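+ * For instance, irttp.c below tears down its TSAP hashbin with
+ * hashbin_delete(irttp->tsaps, (FREE_FUNC) __irttp_close_tsap).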
+ */ +#ifdef CONFIG_LOCKDEP +static int hashbin_lock_depth = 0; +#endif +int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func) +{ + irda_queue_t* queue; + unsigned long flags = 0; + int i; + + IRDA_ASSERT(hashbin != NULL, return -1;); + IRDA_ASSERT(hashbin->magic == HB_MAGIC, return -1;); + + /* Synchronize */ + if ( hashbin->hb_type & HB_LOCK ) { + spin_lock_irqsave_nested(&hashbin->hb_spinlock, flags, + hashbin_lock_depth++); + } + + /* + * Free the entries in the hashbin, TODO: use hashbin_clear when + * it has been shown to work + */ + for (i = 0; i < HASHBIN_SIZE; i ++ ) { + queue = dequeue_first((irda_queue_t**) &hashbin->hb_queue[i]); + while (queue ) { + if (free_func) + (*free_func)(queue); + queue = dequeue_first( + (irda_queue_t**) &hashbin->hb_queue[i]); + } + } + + /* Cleanup local data */ + hashbin->hb_current = NULL; + hashbin->magic = ~HB_MAGIC; + + /* Release lock */ + if ( hashbin->hb_type & HB_LOCK) { + spin_unlock_irqrestore(&hashbin->hb_spinlock, flags); +#ifdef CONFIG_LOCKDEP + hashbin_lock_depth--; +#endif + } + + /* + * Free the hashbin structure + */ + kfree(hashbin); + + return 0; +} +EXPORT_SYMBOL(hashbin_delete); + +/********************* HASHBIN LIST OPERATIONS *********************/ + +/* + * Function hashbin_insert (hashbin, entry, name) + * + * Insert an entry into the hashbin + * + */ +void hashbin_insert(hashbin_t* hashbin, irda_queue_t* entry, long hashv, + const char* name) +{ + unsigned long flags = 0; + int bin; + + IRDA_DEBUG( 4, "%s()\n", __func__); + + IRDA_ASSERT( hashbin != NULL, return;); + IRDA_ASSERT( hashbin->magic == HB_MAGIC, return;); + + /* + * Locate hashbin + */ + if ( name ) + hashv = hash( name ); + bin = GET_HASHBIN( hashv ); + + /* Synchronize */ + if ( hashbin->hb_type & HB_LOCK ) { + spin_lock_irqsave(&hashbin->hb_spinlock, flags); + } /* Default is no-lock */ + + /* + * Store name and key + */ + entry->q_hash = hashv; + if ( name ) + strlcpy( entry->q_name, name, sizeof(entry->q_name)); + + /* + * Insert new entry first + */ + enqueue_first( (irda_queue_t**) &hashbin->hb_queue[ bin ], + entry); + hashbin->hb_size++; + + /* Release lock */ + if ( hashbin->hb_type & HB_LOCK ) { + spin_unlock_irqrestore(&hashbin->hb_spinlock, flags); + } /* Default is no-lock */ +} +EXPORT_SYMBOL(hashbin_insert); + +/* + * Function hashbin_remove_first (hashbin) + * + * Remove first entry of the hashbin + * + * Note : this function no longer use hashbin_remove(), but does things + * similar to hashbin_remove_this(), so can be considered safe. + * Jean II + */ +void *hashbin_remove_first( hashbin_t *hashbin) +{ + unsigned long flags = 0; + irda_queue_t *entry = NULL; + + /* Synchronize */ + if ( hashbin->hb_type & HB_LOCK ) { + spin_lock_irqsave(&hashbin->hb_spinlock, flags); + } /* Default is no-lock */ + + entry = hashbin_get_first( hashbin); + if ( entry != NULL) { + int bin; + long hashv; + /* + * Locate hashbin + */ + hashv = entry->q_hash; + bin = GET_HASHBIN( hashv ); + + /* + * Dequeue the entry... 
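+ * using the hash value stored in the entry itself (entry->q_hash), so
+ * there is no need to recompute it from the name.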
+ */ + dequeue_general( (irda_queue_t**) &hashbin->hb_queue[ bin ], + (irda_queue_t*) entry ); + hashbin->hb_size--; + entry->q_next = NULL; + entry->q_prev = NULL; + + /* + * Check if this item is the currently selected item, and in + * that case we must reset hb_current + */ + if ( entry == hashbin->hb_current) + hashbin->hb_current = NULL; + } + + /* Release lock */ + if ( hashbin->hb_type & HB_LOCK ) { + spin_unlock_irqrestore(&hashbin->hb_spinlock, flags); + } /* Default is no-lock */ + + return entry; +} + + +/* + * Function hashbin_remove (hashbin, hashv, name) + * + * Remove entry with the given name + * + * The use of this function is highly discouraged, because the whole + * concept behind hashbin_remove() is broken. In many cases, it's not + * possible to guarantee the unicity of the index (either hashv or name), + * leading to removing the WRONG entry. + * The only simple safe use is : + * hashbin_remove(hasbin, (int) self, NULL); + * In other case, you must think hard to guarantee unicity of the index. + * Jean II + */ +void* hashbin_remove( hashbin_t* hashbin, long hashv, const char* name) +{ + int bin, found = FALSE; + unsigned long flags = 0; + irda_queue_t* entry; + + IRDA_DEBUG( 4, "%s()\n", __func__); + + IRDA_ASSERT( hashbin != NULL, return NULL;); + IRDA_ASSERT( hashbin->magic == HB_MAGIC, return NULL;); + + /* + * Locate hashbin + */ + if ( name ) + hashv = hash( name ); + bin = GET_HASHBIN( hashv ); + + /* Synchronize */ + if ( hashbin->hb_type & HB_LOCK ) { + spin_lock_irqsave(&hashbin->hb_spinlock, flags); + } /* Default is no-lock */ + + /* + * Search for entry + */ + entry = hashbin->hb_queue[ bin ]; + if ( entry ) { + do { + /* + * Check for key + */ + if ( entry->q_hash == hashv ) { + /* + * Name compare too? + */ + if ( name ) { + if ( strcmp( entry->q_name, name) == 0) + { + found = TRUE; + break; + } + } else { + found = TRUE; + break; + } + } + entry = entry->q_next; + } while ( entry != hashbin->hb_queue[ bin ] ); + } + + /* + * If entry was found, dequeue it + */ + if ( found ) { + dequeue_general( (irda_queue_t**) &hashbin->hb_queue[ bin ], + (irda_queue_t*) entry ); + hashbin->hb_size--; + + /* + * Check if this item is the currently selected item, and in + * that case we must reset hb_current + */ + if ( entry == hashbin->hb_current) + hashbin->hb_current = NULL; + } + + /* Release lock */ + if ( hashbin->hb_type & HB_LOCK ) { + spin_unlock_irqrestore(&hashbin->hb_spinlock, flags); + } /* Default is no-lock */ + + + /* Return */ + if ( found ) + return entry; + else + return NULL; + +} +EXPORT_SYMBOL(hashbin_remove); + +/* + * Function hashbin_remove_this (hashbin, entry) + * + * Remove entry with the given name + * + * In some cases, the user of hashbin can't guarantee the unicity + * of either the hashv or name. + * In those cases, using the above function is guaranteed to cause troubles, + * so we use this one instead... + * And by the way, it's also faster, because we skip the search phase ;-) + */ +void* hashbin_remove_this( hashbin_t* hashbin, irda_queue_t* entry) +{ + unsigned long flags = 0; + int bin; + long hashv; + + IRDA_DEBUG( 4, "%s()\n", __func__); + + IRDA_ASSERT( hashbin != NULL, return NULL;); + IRDA_ASSERT( hashbin->magic == HB_MAGIC, return NULL;); + IRDA_ASSERT( entry != NULL, return NULL;); + + /* Synchronize */ + if ( hashbin->hb_type & HB_LOCK ) { + spin_lock_irqsave(&hashbin->hb_spinlock, flags); + } /* Default is no-lock */ + + /* Check if valid and not already removed... 
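+ * (hashbin_remove_first() and hashbin_remove_this() reset q_next/q_prev
+ * to NULL when they take an entry out)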
*/ + if((entry->q_next == NULL) || (entry->q_prev == NULL)) { + entry = NULL; + goto out; + } + + /* + * Locate hashbin + */ + hashv = entry->q_hash; + bin = GET_HASHBIN( hashv ); + + /* + * Dequeue the entry... + */ + dequeue_general( (irda_queue_t**) &hashbin->hb_queue[ bin ], + (irda_queue_t*) entry ); + hashbin->hb_size--; + entry->q_next = NULL; + entry->q_prev = NULL; + + /* + * Check if this item is the currently selected item, and in + * that case we must reset hb_current + */ + if ( entry == hashbin->hb_current) + hashbin->hb_current = NULL; +out: + /* Release lock */ + if ( hashbin->hb_type & HB_LOCK ) { + spin_unlock_irqrestore(&hashbin->hb_spinlock, flags); + } /* Default is no-lock */ + + return entry; +} +EXPORT_SYMBOL(hashbin_remove_this); + +/*********************** HASHBIN ENUMERATION ***********************/ + +/* + * Function hashbin_common_find (hashbin, hashv, name) + * + * Find item with the given hashv or name + * + */ +void* hashbin_find( hashbin_t* hashbin, long hashv, const char* name ) +{ + int bin; + irda_queue_t* entry; + + IRDA_DEBUG( 4, "hashbin_find()\n"); + + IRDA_ASSERT( hashbin != NULL, return NULL;); + IRDA_ASSERT( hashbin->magic == HB_MAGIC, return NULL;); + + /* + * Locate hashbin + */ + if ( name ) + hashv = hash( name ); + bin = GET_HASHBIN( hashv ); + + /* + * Search for entry + */ + entry = hashbin->hb_queue[ bin]; + if ( entry ) { + do { + /* + * Check for key + */ + if ( entry->q_hash == hashv ) { + /* + * Name compare too? + */ + if ( name ) { + if ( strcmp( entry->q_name, name ) == 0 ) { + return entry; + } + } else { + return entry; + } + } + entry = entry->q_next; + } while ( entry != hashbin->hb_queue[ bin ] ); + } + + return NULL; +} +EXPORT_SYMBOL(hashbin_find); + +/* + * Function hashbin_lock_find (hashbin, hashv, name) + * + * Find item with the given hashv or name + * + * Same, but with spinlock protection... + * I call it safe, but it's only safe with respect to the hashbin, not its + * content. - Jean II + */ +void* hashbin_lock_find( hashbin_t* hashbin, long hashv, const char* name ) +{ + unsigned long flags = 0; + irda_queue_t* entry; + + /* Synchronize */ + spin_lock_irqsave(&hashbin->hb_spinlock, flags); + + /* + * Search for entry + */ + entry = (irda_queue_t* ) hashbin_find( hashbin, hashv, name ); + + /* Release lock */ + spin_unlock_irqrestore(&hashbin->hb_spinlock, flags); + + return entry; +} +EXPORT_SYMBOL(hashbin_lock_find); + +/* + * Function hashbin_find (hashbin, hashv, name, pnext) + * + * Find an item with the given hashv or name, and its successor + * + * This function allow to do concurrent enumerations without the + * need to lock over the whole session, because the caller keep the + * context of the search. On the other hand, it might fail and return + * NULL if the entry is removed. - Jean II + */ +void* hashbin_find_next( hashbin_t* hashbin, long hashv, const char* name, + void ** pnext) +{ + unsigned long flags = 0; + irda_queue_t* entry; + + /* Synchronize */ + spin_lock_irqsave(&hashbin->hb_spinlock, flags); + + /* + * Search for current entry + * This allow to check if the current item is still in the + * hashbin or has been removed. 
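+ * If it is gone, hashbin_find() returns NULL and we report no successor.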
+ */ + entry = (irda_queue_t* ) hashbin_find( hashbin, hashv, name ); + + /* + * Trick hashbin_get_next() to return what we want + */ + if(entry) { + hashbin->hb_current = entry; + *pnext = hashbin_get_next( hashbin ); + } else + *pnext = NULL; + + /* Release lock */ + spin_unlock_irqrestore(&hashbin->hb_spinlock, flags); + + return entry; +} + +/* + * Function hashbin_get_first (hashbin) + * + * Get a pointer to first element in hashbin, this function must be + * called before any calls to hashbin_get_next()! + * + */ +irda_queue_t *hashbin_get_first( hashbin_t* hashbin) +{ + irda_queue_t *entry; + int i; + + IRDA_ASSERT( hashbin != NULL, return NULL;); + IRDA_ASSERT( hashbin->magic == HB_MAGIC, return NULL;); + + if ( hashbin == NULL) + return NULL; + + for ( i = 0; i < HASHBIN_SIZE; i ++ ) { + entry = hashbin->hb_queue[ i]; + if ( entry) { + hashbin->hb_current = entry; + return entry; + } + } + /* + * Did not find any item in hashbin + */ + return NULL; +} +EXPORT_SYMBOL(hashbin_get_first); + +/* + * Function hashbin_get_next (hashbin) + * + * Get next item in hashbin. A series of hashbin_get_next() calls must + * be started by a call to hashbin_get_first(). The function returns + * NULL when all items have been traversed + * + * The context of the search is stored within the hashbin, so you must + * protect yourself from concurrent enumerations. - Jean II + */ +irda_queue_t *hashbin_get_next( hashbin_t *hashbin) +{ + irda_queue_t* entry; + int bin; + int i; + + IRDA_ASSERT( hashbin != NULL, return NULL;); + IRDA_ASSERT( hashbin->magic == HB_MAGIC, return NULL;); + + if ( hashbin->hb_current == NULL) { + IRDA_ASSERT( hashbin->hb_current != NULL, return NULL;); + return NULL; + } + entry = hashbin->hb_current->q_next; + bin = GET_HASHBIN( entry->q_hash); + + /* + * Make sure that we are not back at the beginning of the queue + * again + */ + if ( entry != hashbin->hb_queue[ bin ]) { + hashbin->hb_current = entry; + + return entry; + } + + /* + * Check that this is not the last queue in hashbin + */ + if ( bin >= HASHBIN_SIZE) + return NULL; + + /* + * Move to next queue in hashbin + */ + bin++; + for ( i = bin; i < HASHBIN_SIZE; i++ ) { + entry = hashbin->hb_queue[ i]; + if ( entry) { + hashbin->hb_current = entry; + + return entry; + } + } + return NULL; +} +EXPORT_SYMBOL(hashbin_get_next); diff --git a/net/irda/irsysctl.c b/net/irda/irsysctl.c new file mode 100644 index 00000000..d0b70dad --- /dev/null +++ b/net/irda/irsysctl.c @@ -0,0 +1,273 @@ +/********************************************************************* + * + * Filename: irsysctl.c + * Version: 1.0 + * Description: Sysctl interface for IrDA + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sun May 24 22:12:06 1998 + * Modified at: Fri Jun 4 02:50:15 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1997, 1999 Dag Brattli, All Rights Reserved. + * Copyright (c) 2000-2001 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
+ * + ********************************************************************/ + +#include +#include +#include +#include + +#include /* irda_debug */ +#include +#include +#include + +extern int sysctl_discovery; +extern int sysctl_discovery_slots; +extern int sysctl_discovery_timeout; +extern int sysctl_slot_timeout; +extern int sysctl_fast_poll_increase; +extern char sysctl_devname[]; +extern int sysctl_max_baud_rate; +extern int sysctl_min_tx_turn_time; +extern int sysctl_max_tx_data_size; +extern int sysctl_max_tx_window; +extern int sysctl_max_noreply_time; +extern int sysctl_warn_noreply_time; +extern int sysctl_lap_keepalive_time; + +extern struct irlmp_cb *irlmp; + +/* this is needed for the proc_dointvec_minmax - Jean II */ +static int max_discovery_slots = 16; /* ??? */ +static int min_discovery_slots = 1; +/* IrLAP 6.13.2 says 25ms to 10+70ms - allow higher since some devices + * seems to require it. (from Dag's comment) */ +static int max_slot_timeout = 160; +static int min_slot_timeout = 20; +static int max_max_baud_rate = 16000000; /* See qos.c - IrLAP spec */ +static int min_max_baud_rate = 2400; +static int max_min_tx_turn_time = 10000; /* See qos.c - IrLAP spec */ +static int min_min_tx_turn_time; +static int max_max_tx_data_size = 2048; /* See qos.c - IrLAP spec */ +static int min_max_tx_data_size = 64; +static int max_max_tx_window = 7; /* See qos.c - IrLAP spec */ +static int min_max_tx_window = 1; +static int max_max_noreply_time = 40; /* See qos.c - IrLAP spec */ +static int min_max_noreply_time = 3; +static int max_warn_noreply_time = 3; /* 3s == standard */ +static int min_warn_noreply_time = 1; /* 1s == min WD_TIMER */ +static int max_lap_keepalive_time = 10000; /* 10s */ +static int min_lap_keepalive_time = 100; /* 100us */ +/* For other sysctl, I've no idea of the range. 
Maybe Dag could help + * us on that - Jean II */ + +static int do_devname(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dostring(table, write, buffer, lenp, ppos); + if (ret == 0 && write) { + struct ias_value *val; + + val = irias_new_string_value(sysctl_devname); + if (val) + irias_object_change_attribute("Device", "DeviceName", val); + } + return ret; +} + + +static int do_discovery(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret) + return ret; + + if (irlmp == NULL) + return -ENODEV; + + if (sysctl_discovery) + irlmp_start_discovery_timer(irlmp, sysctl_discovery_timeout*HZ); + else + del_timer_sync(&irlmp->discovery_timer); + + return ret; +} + +/* One file */ +static ctl_table irda_table[] = { + { + .procname = "discovery", + .data = &sysctl_discovery, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = do_discovery, + }, + { + .procname = "devname", + .data = sysctl_devname, + .maxlen = 65, + .mode = 0644, + .proc_handler = do_devname, + }, +#ifdef CONFIG_IRDA_DEBUG + { + .procname = "debug", + .data = &irda_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif +#ifdef CONFIG_IRDA_FAST_RR + { + .procname = "fast_poll_increase", + .data = &sysctl_fast_poll_increase, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif + { + .procname = "discovery_slots", + .data = &sysctl_discovery_slots, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_discovery_slots, + .extra2 = &max_discovery_slots + }, + { + .procname = "discovery_timeout", + .data = &sysctl_discovery_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "slot_timeout", + .data = &sysctl_slot_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_slot_timeout, + .extra2 = &max_slot_timeout + }, + { + .procname = "max_baud_rate", + .data = &sysctl_max_baud_rate, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_max_baud_rate, + .extra2 = &max_max_baud_rate + }, + { + .procname = "min_tx_turn_time", + .data = &sysctl_min_tx_turn_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_min_tx_turn_time, + .extra2 = &max_min_tx_turn_time + }, + { + .procname = "max_tx_data_size", + .data = &sysctl_max_tx_data_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_max_tx_data_size, + .extra2 = &max_max_tx_data_size + }, + { + .procname = "max_tx_window", + .data = &sysctl_max_tx_window, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_max_tx_window, + .extra2 = &max_max_tx_window + }, + { + .procname = "max_noreply_time", + .data = &sysctl_max_noreply_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_max_noreply_time, + .extra2 = &max_max_noreply_time + }, + { + .procname = "warn_noreply_time", + .data = &sysctl_warn_noreply_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_warn_noreply_time, + .extra2 = &max_warn_noreply_time + }, + { + .procname = "lap_keepalive_time", + .data = &sysctl_lap_keepalive_time, + .maxlen = sizeof(int), + 
.mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_lap_keepalive_time, + .extra2 = &max_lap_keepalive_time + }, + { } +}; + +static struct ctl_path irda_path[] = { + { .procname = "net", }, + { .procname = "irda", }, + { } +}; + +static struct ctl_table_header *irda_table_header; + +/* + * Function irda_sysctl_register (void) + * + * Register our sysctl interface + * + */ +int __init irda_sysctl_register(void) +{ + irda_table_header = register_sysctl_paths(irda_path, irda_table); + if (!irda_table_header) + return -ENOMEM; + + return 0; +} + +/* + * Function irda_sysctl_unregister (void) + * + * Unregister our sysctl interface + * + */ +void irda_sysctl_unregister(void) +{ + unregister_sysctl_table(irda_table_header); +} + + + diff --git a/net/irda/irttp.c b/net/irda/irttp.c new file mode 100644 index 00000000..9d9af460 --- /dev/null +++ b/net/irda/irttp.c @@ -0,0 +1,1915 @@ +/********************************************************************* + * + * Filename: irttp.c + * Version: 1.2 + * Description: Tiny Transport Protocol (TTP) implementation + * Status: Stable + * Author: Dag Brattli + * Created at: Sun Aug 31 20:14:31 1997 + * Modified at: Wed Jan 5 11:31:27 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-2000 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
+ * + ********************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +static struct irttp_cb *irttp; + +static void __irttp_close_tsap(struct tsap_cb *self); + +static int irttp_data_indication(void *instance, void *sap, + struct sk_buff *skb); +static int irttp_udata_indication(void *instance, void *sap, + struct sk_buff *skb); +static void irttp_disconnect_indication(void *instance, void *sap, + LM_REASON reason, struct sk_buff *); +static void irttp_connect_indication(void *instance, void *sap, + struct qos_info *qos, __u32 max_sdu_size, + __u8 header_size, struct sk_buff *skb); +static void irttp_connect_confirm(void *instance, void *sap, + struct qos_info *qos, __u32 max_sdu_size, + __u8 header_size, struct sk_buff *skb); +static void irttp_run_tx_queue(struct tsap_cb *self); +static void irttp_run_rx_queue(struct tsap_cb *self); + +static void irttp_flush_queues(struct tsap_cb *self); +static void irttp_fragment_skb(struct tsap_cb *self, struct sk_buff *skb); +static struct sk_buff *irttp_reassemble_skb(struct tsap_cb *self); +static void irttp_todo_expired(unsigned long data); +static int irttp_param_max_sdu_size(void *instance, irda_param_t *param, + int get); + +static void irttp_flow_indication(void *instance, void *sap, LOCAL_FLOW flow); +static void irttp_status_indication(void *instance, + LINK_STATUS link, LOCK_STATUS lock); + +/* Information for parsing parameters in IrTTP */ +static pi_minor_info_t pi_minor_call_table[] = { + { NULL, 0 }, /* 0x00 */ + { irttp_param_max_sdu_size, PV_INTEGER | PV_BIG_ENDIAN } /* 0x01 */ +}; +static pi_major_info_t pi_major_call_table[] = {{ pi_minor_call_table, 2 }}; +static pi_param_info_t param_info = { pi_major_call_table, 1, 0x0f, 4 }; + +/************************ GLOBAL PROCEDURES ************************/ + +/* + * Function irttp_init (void) + * + * Initialize the IrTTP layer. Called by module initialization code + * + */ +int __init irttp_init(void) +{ + irttp = kzalloc(sizeof(struct irttp_cb), GFP_KERNEL); + if (irttp == NULL) + return -ENOMEM; + + irttp->magic = TTP_MAGIC; + + irttp->tsaps = hashbin_new(HB_LOCK); + if (!irttp->tsaps) { + IRDA_ERROR("%s: can't allocate IrTTP hashbin!\n", + __func__); + kfree(irttp); + return -ENOMEM; + } + + return 0; +} + +/* + * Function irttp_cleanup (void) + * + * Called by module destruction/cleanup code + * + */ +void irttp_cleanup(void) +{ + /* Check for main structure */ + IRDA_ASSERT(irttp->magic == TTP_MAGIC, return;); + + /* + * Delete hashbin and close all TSAP instances in it + */ + hashbin_delete(irttp->tsaps, (FREE_FUNC) __irttp_close_tsap); + + irttp->magic = 0; + + /* De-allocate main structure */ + kfree(irttp); + + irttp = NULL; +} + +/*************************** SUBROUTINES ***************************/ + +/* + * Function irttp_start_todo_timer (self, timeout) + * + * Start todo timer. + * + * Made it more effient and unsensitive to race conditions - Jean II + */ +static inline void irttp_start_todo_timer(struct tsap_cb *self, int timeout) +{ + /* Set new value for timer */ + mod_timer(&self->todo_timer, jiffies + timeout); +} + +/* + * Function irttp_todo_expired (data) + * + * Todo timer has expired! + * + * One of the restriction of the timer is that it is run only on the timer + * interrupt which run every 10ms. This mean that even if you set the timer + * with a delay of 0, it may take up to 10ms before it's run. 
+ * So, to minimise latency and keep cache fresh, we try to avoid using + * it as much as possible. + * Note : we can't use tasklets, because they can't be asynchronously + * killed (need user context), and we can't guarantee that here... + * Jean II + */ +static void irttp_todo_expired(unsigned long data) +{ + struct tsap_cb *self = (struct tsap_cb *) data; + + /* Check that we still exist */ + if (!self || self->magic != TTP_TSAP_MAGIC) + return; + + IRDA_DEBUG(4, "%s(instance=%p)\n", __func__, self); + + /* Try to make some progress, especially on Tx side - Jean II */ + irttp_run_rx_queue(self); + irttp_run_tx_queue(self); + + /* Check if time for disconnect */ + if (test_bit(0, &self->disconnect_pend)) { + /* Check if it's possible to disconnect yet */ + if (skb_queue_empty(&self->tx_queue)) { + /* Make sure disconnect is not pending anymore */ + clear_bit(0, &self->disconnect_pend); /* FALSE */ + + /* Note : self->disconnect_skb may be NULL */ + irttp_disconnect_request(self, self->disconnect_skb, + P_NORMAL); + self->disconnect_skb = NULL; + } else { + /* Try again later */ + irttp_start_todo_timer(self, HZ/10); + + /* No reason to try and close now */ + return; + } + } + + /* Check if it's closing time */ + if (self->close_pend) + /* Finish cleanup */ + irttp_close_tsap(self); +} + +/* + * Function irttp_flush_queues (self) + * + * Flushes (removes all frames) in transitt-buffer (tx_list) + */ +static void irttp_flush_queues(struct tsap_cb *self) +{ + struct sk_buff* skb; + + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;); + + /* Deallocate frames waiting to be sent */ + while ((skb = skb_dequeue(&self->tx_queue)) != NULL) + dev_kfree_skb(skb); + + /* Deallocate received frames */ + while ((skb = skb_dequeue(&self->rx_queue)) != NULL) + dev_kfree_skb(skb); + + /* Deallocate received fragments */ + while ((skb = skb_dequeue(&self->rx_fragments)) != NULL) + dev_kfree_skb(skb); +} + +/* + * Function irttp_reassemble (self) + * + * Makes a new (continuous) skb of all the fragments in the fragment + * queue + * + */ +static struct sk_buff *irttp_reassemble_skb(struct tsap_cb *self) +{ + struct sk_buff *skb, *frag; + int n = 0; /* Fragment index */ + + IRDA_ASSERT(self != NULL, return NULL;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return NULL;); + + IRDA_DEBUG(2, "%s(), self->rx_sdu_size=%d\n", __func__, + self->rx_sdu_size); + + skb = dev_alloc_skb(TTP_HEADER + self->rx_sdu_size); + if (!skb) + return NULL; + + /* + * Need to reserve space for TTP header in case this skb needs to + * be requeued in case delivery failes + */ + skb_reserve(skb, TTP_HEADER); + skb_put(skb, self->rx_sdu_size); + + /* + * Copy all fragments to a new buffer + */ + while ((frag = skb_dequeue(&self->rx_fragments)) != NULL) { + skb_copy_to_linear_data_offset(skb, n, frag->data, frag->len); + n += frag->len; + + dev_kfree_skb(frag); + } + + IRDA_DEBUG(2, + "%s(), frame len=%d, rx_sdu_size=%d, rx_max_sdu_size=%d\n", + __func__, n, self->rx_sdu_size, self->rx_max_sdu_size); + /* Note : irttp_run_rx_queue() calculate self->rx_sdu_size + * by summing the size of all fragments, so we should always + * have n == self->rx_sdu_size, except in cases where we + * droped the last fragment (when self->rx_sdu_size exceed + * self->rx_max_sdu_size), where n < self->rx_sdu_size. 
+ * Jean II */ + IRDA_ASSERT(n <= self->rx_sdu_size, n = self->rx_sdu_size;); + + /* Set the new length */ + skb_trim(skb, n); + + self->rx_sdu_size = 0; + + return skb; +} + +/* + * Function irttp_fragment_skb (skb) + * + * Fragments a frame and queues all the fragments for transmission + * + */ +static inline void irttp_fragment_skb(struct tsap_cb *self, + struct sk_buff *skb) +{ + struct sk_buff *frag; + __u8 *frame; + + IRDA_DEBUG(2, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + /* + * Split frame into a number of segments + */ + while (skb->len > self->max_seg_size) { + IRDA_DEBUG(2, "%s(), fragmenting ...\n", __func__); + + /* Make new segment */ + frag = alloc_skb(self->max_seg_size+self->max_header_size, + GFP_ATOMIC); + if (!frag) + return; + + skb_reserve(frag, self->max_header_size); + + /* Copy data from the original skb into this fragment. */ + skb_copy_from_linear_data(skb, skb_put(frag, self->max_seg_size), + self->max_seg_size); + + /* Insert TTP header, with the more bit set */ + frame = skb_push(frag, TTP_HEADER); + frame[0] = TTP_MORE; + + /* Hide the copied data from the original skb */ + skb_pull(skb, self->max_seg_size); + + /* Queue fragment */ + skb_queue_tail(&self->tx_queue, frag); + } + /* Queue what is left of the original skb */ + IRDA_DEBUG(2, "%s(), queuing last segment\n", __func__); + + frame = skb_push(skb, TTP_HEADER); + frame[0] = 0x00; /* Clear more bit */ + + /* Queue fragment */ + skb_queue_tail(&self->tx_queue, skb); +} + +/* + * Function irttp_param_max_sdu_size (self, param) + * + * Handle the MaxSduSize parameter in the connect frames, this function + * will be called both when this parameter needs to be inserted into, and + * extracted from the connect frames + */ +static int irttp_param_max_sdu_size(void *instance, irda_param_t *param, + int get) +{ + struct tsap_cb *self; + + self = (struct tsap_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;); + + if (get) + param->pv.i = self->tx_max_sdu_size; + else + self->tx_max_sdu_size = param->pv.i; + + IRDA_DEBUG(1, "%s(), MaxSduSize=%d\n", __func__, param->pv.i); + + return 0; +} + +/*************************** CLIENT CALLS ***************************/ +/************************** LMP CALLBACKS **************************/ +/* Everything is happily mixed up. Waiting for next clean up - Jean II */ + +/* + * Initialization, that has to be done on new tsap + * instance allocation and on duplication + */ +static void irttp_init_tsap(struct tsap_cb *tsap) +{ + spin_lock_init(&tsap->lock); + init_timer(&tsap->todo_timer); + + skb_queue_head_init(&tsap->rx_queue); + skb_queue_head_init(&tsap->tx_queue); + skb_queue_head_init(&tsap->rx_fragments); +} + +/* + * Function irttp_open_tsap (stsap, notify) + * + * Create TSAP connection endpoint, + */ +struct tsap_cb *irttp_open_tsap(__u8 stsap_sel, int credit, notify_t *notify) +{ + struct tsap_cb *self; + struct lsap_cb *lsap; + notify_t ttp_notify; + + IRDA_ASSERT(irttp->magic == TTP_MAGIC, return NULL;); + + /* The IrLMP spec (IrLMP 1.1 p10) says that we have the right to + * use only 0x01-0x6F. Of course, we can use LSAP_ANY as well. 
+ * JeanII */ + if((stsap_sel != LSAP_ANY) && + ((stsap_sel < 0x01) || (stsap_sel >= 0x70))) { + IRDA_DEBUG(0, "%s(), invalid tsap!\n", __func__); + return NULL; + } + + self = kzalloc(sizeof(struct tsap_cb), GFP_ATOMIC); + if (self == NULL) { + IRDA_DEBUG(0, "%s(), unable to kmalloc!\n", __func__); + return NULL; + } + + /* Initialize internal objects */ + irttp_init_tsap(self); + + /* Initialise todo timer */ + self->todo_timer.data = (unsigned long) self; + self->todo_timer.function = &irttp_todo_expired; + + /* Initialize callbacks for IrLMP to use */ + irda_notify_init(&ttp_notify); + ttp_notify.connect_confirm = irttp_connect_confirm; + ttp_notify.connect_indication = irttp_connect_indication; + ttp_notify.disconnect_indication = irttp_disconnect_indication; + ttp_notify.data_indication = irttp_data_indication; + ttp_notify.udata_indication = irttp_udata_indication; + ttp_notify.flow_indication = irttp_flow_indication; + if(notify->status_indication != NULL) + ttp_notify.status_indication = irttp_status_indication; + ttp_notify.instance = self; + strncpy(ttp_notify.name, notify->name, NOTIFY_MAX_NAME); + + self->magic = TTP_TSAP_MAGIC; + self->connected = FALSE; + + /* + * Create LSAP at IrLMP layer + */ + lsap = irlmp_open_lsap(stsap_sel, &ttp_notify, 0); + if (lsap == NULL) { + IRDA_WARNING("%s: unable to allocate LSAP!!\n", __func__); + return NULL; + } + + /* + * If user specified LSAP_ANY as source TSAP selector, then IrLMP + * will replace it with whatever source selector which is free, so + * the stsap_sel we have might not be valid anymore + */ + self->stsap_sel = lsap->slsap_sel; + IRDA_DEBUG(4, "%s(), stsap_sel=%02x\n", __func__, self->stsap_sel); + + self->notify = *notify; + self->lsap = lsap; + + hashbin_insert(irttp->tsaps, (irda_queue_t *) self, (long) self, NULL); + + if (credit > TTP_RX_MAX_CREDIT) + self->initial_credit = TTP_RX_MAX_CREDIT; + else + self->initial_credit = credit; + + return self; +} +EXPORT_SYMBOL(irttp_open_tsap); + +/* + * Function irttp_close (handle) + * + * Remove an instance of a TSAP. This function should only deal with the + * deallocation of the TSAP, and resetting of the TSAPs values; + * + */ +static void __irttp_close_tsap(struct tsap_cb *self) +{ + /* First make sure we're connected. */ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;); + + irttp_flush_queues(self); + + del_timer(&self->todo_timer); + + /* This one won't be cleaned up if we are disconnect_pend + close_pend + * and we receive a disconnect_indication */ + if (self->disconnect_skb) + dev_kfree_skb(self->disconnect_skb); + + self->connected = FALSE; + self->magic = ~TTP_TSAP_MAGIC; + + kfree(self); +} + +/* + * Function irttp_close (self) + * + * Remove TSAP from list of all TSAPs and then deallocate all resources + * associated with this TSAP + * + * Note : because we *free* the tsap structure, it is the responsibility + * of the caller to make sure we are called only once and to deal with + * possible race conditions. 
- Jean II + */ +int irttp_close_tsap(struct tsap_cb *self) +{ + struct tsap_cb *tsap; + + IRDA_DEBUG(4, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;); + + /* Make sure tsap has been disconnected */ + if (self->connected) { + /* Check if disconnect is not pending */ + if (!test_bit(0, &self->disconnect_pend)) { + IRDA_WARNING("%s: TSAP still connected!\n", + __func__); + irttp_disconnect_request(self, NULL, P_NORMAL); + } + self->close_pend = TRUE; + irttp_start_todo_timer(self, HZ/10); + + return 0; /* Will be back! */ + } + + tsap = hashbin_remove(irttp->tsaps, (long) self, NULL); + + IRDA_ASSERT(tsap == self, return -1;); + + /* Close corresponding LSAP */ + if (self->lsap) { + irlmp_close_lsap(self->lsap); + self->lsap = NULL; + } + + __irttp_close_tsap(self); + + return 0; +} +EXPORT_SYMBOL(irttp_close_tsap); + +/* + * Function irttp_udata_request (self, skb) + * + * Send unreliable data on this TSAP + * + */ +int irttp_udata_request(struct tsap_cb *self, struct sk_buff *skb) +{ + int ret; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;); + IRDA_ASSERT(skb != NULL, return -1;); + + IRDA_DEBUG(4, "%s()\n", __func__); + + /* Take shortcut on zero byte packets */ + if (skb->len == 0) { + ret = 0; + goto err; + } + + /* Check that nothing bad happens */ + if (!self->connected) { + IRDA_WARNING("%s(), Not connected\n", __func__); + ret = -ENOTCONN; + goto err; + } + + if (skb->len > self->max_seg_size) { + IRDA_ERROR("%s(), UData is too large for IrLAP!\n", __func__); + ret = -EMSGSIZE; + goto err; + } + + irlmp_udata_request(self->lsap, skb); + self->stats.tx_packets++; + + return 0; + +err: + dev_kfree_skb(skb); + return ret; +} +EXPORT_SYMBOL(irttp_udata_request); + + +/* + * Function irttp_data_request (handle, skb) + * + * Queue frame for transmission. If SAR is enabled, fragement the frame + * and queue the fragments for transmission + */ +int irttp_data_request(struct tsap_cb *self, struct sk_buff *skb) +{ + __u8 *frame; + int ret; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;); + IRDA_ASSERT(skb != NULL, return -1;); + + IRDA_DEBUG(2, "%s() : queue len = %d\n", __func__, + skb_queue_len(&self->tx_queue)); + + /* Take shortcut on zero byte packets */ + if (skb->len == 0) { + ret = 0; + goto err; + } + + /* Check that nothing bad happens */ + if (!self->connected) { + IRDA_WARNING("%s: Not connected\n", __func__); + ret = -ENOTCONN; + goto err; + } + + /* + * Check if SAR is disabled, and the frame is larger than what fits + * inside an IrLAP frame + */ + if ((self->tx_max_sdu_size == 0) && (skb->len > self->max_seg_size)) { + IRDA_ERROR("%s: SAR disabled, and data is too large for IrLAP!\n", + __func__); + ret = -EMSGSIZE; + goto err; + } + + /* + * Check if SAR is enabled, and the frame is larger than the + * TxMaxSduSize + */ + if ((self->tx_max_sdu_size != 0) && + (self->tx_max_sdu_size != TTP_SAR_UNBOUND) && + (skb->len > self->tx_max_sdu_size)) + { + IRDA_ERROR("%s: SAR enabled, but data is larger than TxMaxSduSize!\n", + __func__); + ret = -EMSGSIZE; + goto err; + } + /* + * Check if transmit queue is full + */ + if (skb_queue_len(&self->tx_queue) >= TTP_TX_MAX_QUEUE) { + /* + * Give it a chance to empty itself + */ + irttp_run_tx_queue(self); + + /* Drop packet. 
This error code should trigger the caller + * to resend the data in the client code - Jean II */ + ret = -ENOBUFS; + goto err; + } + + /* Queue frame, or queue frame segments */ + if ((self->tx_max_sdu_size == 0) || (skb->len < self->max_seg_size)) { + /* Queue frame */ + IRDA_ASSERT(skb_headroom(skb) >= TTP_HEADER, return -1;); + frame = skb_push(skb, TTP_HEADER); + frame[0] = 0x00; /* Clear more bit */ + + skb_queue_tail(&self->tx_queue, skb); + } else { + /* + * Fragment the frame, this function will also queue the + * fragments, we don't care about the fact the transmit + * queue may be overfilled by all the segments for a little + * while + */ + irttp_fragment_skb(self, skb); + } + + /* Check if we can accept more data from client */ + if ((!self->tx_sdu_busy) && + (skb_queue_len(&self->tx_queue) > TTP_TX_HIGH_THRESHOLD)) { + /* Tx queue filling up, so stop client. */ + if (self->notify.flow_indication) { + self->notify.flow_indication(self->notify.instance, + self, FLOW_STOP); + } + /* self->tx_sdu_busy is the state of the client. + * Update state after notifying client to avoid + * race condition with irttp_flow_indication(). + * If the queue empty itself after our test but before + * we set the flag, we will fix ourselves below in + * irttp_run_tx_queue(). + * Jean II */ + self->tx_sdu_busy = TRUE; + } + + /* Try to make some progress */ + irttp_run_tx_queue(self); + + return 0; + +err: + dev_kfree_skb(skb); + return ret; +} +EXPORT_SYMBOL(irttp_data_request); + +/* + * Function irttp_run_tx_queue (self) + * + * Transmit packets queued for transmission (if possible) + * + */ +static void irttp_run_tx_queue(struct tsap_cb *self) +{ + struct sk_buff *skb; + unsigned long flags; + int n; + + IRDA_DEBUG(2, "%s() : send_credit = %d, queue_len = %d\n", + __func__, + self->send_credit, skb_queue_len(&self->tx_queue)); + + /* Get exclusive access to the tx queue, otherwise don't touch it */ + if (irda_lock(&self->tx_queue_lock) == FALSE) + return; + + /* Try to send out frames as long as we have credits + * and as long as LAP is not full. If LAP is full, it will + * poll us through irttp_flow_indication() - Jean II */ + while ((self->send_credit > 0) && + (!irlmp_lap_tx_queue_full(self->lsap)) && + (skb = skb_dequeue(&self->tx_queue))) + { + /* + * Since we can transmit and receive frames concurrently, + * the code below is a critical region and we must assure that + * nobody messes with the credits while we update them. + */ + spin_lock_irqsave(&self->lock, flags); + + n = self->avail_credit; + self->avail_credit = 0; + + /* Only room for 127 credits in frame */ + if (n > 127) { + self->avail_credit = n-127; + n = 127; + } + self->remote_credit += n; + self->send_credit--; + + spin_unlock_irqrestore(&self->lock, flags); + + /* + * More bit must be set by the data_request() or fragment() + * functions + */ + skb->data[0] |= (n & 0x7f); + + /* Detach from socket. + * The current skb has a reference to the socket that sent + * it (skb->sk). When we pass it to IrLMP, the skb will be + * stored in in IrLAP (self->wx_list). When we are within + * IrLAP, we lose the notion of socket, so we should not + * have a reference to a socket. So, we drop it here. + * + * Why does it matter ? + * When the skb is freed (kfree_skb), if it is associated + * with a socket, it release buffer space on the socket + * (through sock_wfree() and sock_def_write_space()). + * If the socket no longer exist, we may crash. Hard. + * When we close a socket, we make sure that associated packets + * in IrTTP are freed. 
However, we have no way to cancel + * the packet that we have passed to IrLAP. So, if a packet + * remains in IrLAP (retry on the link or else) after we + * close the socket, we are dead ! + * Jean II */ + if (skb->sk != NULL) { + /* IrSOCK application, IrOBEX, ... */ + skb_orphan(skb); + } + /* IrCOMM over IrTTP, IrLAN, ... */ + + /* Pass the skb to IrLMP - done */ + irlmp_data_request(self->lsap, skb); + self->stats.tx_packets++; + } + + /* Check if we can accept more frames from client. + * We don't want to wait until the todo timer to do that, and we + * can't use tasklets (grr...), so we are obliged to give control + * to client. That's ok, this test will be true not too often + * (max once per LAP window) and we are called from places + * where we can spend a bit of time doing stuff. - Jean II */ + if ((self->tx_sdu_busy) && + (skb_queue_len(&self->tx_queue) < TTP_TX_LOW_THRESHOLD) && + (!self->close_pend)) + { + if (self->notify.flow_indication) + self->notify.flow_indication(self->notify.instance, + self, FLOW_START); + + /* self->tx_sdu_busy is the state of the client. + * We don't really have a race here, but it's always safer + * to update our state after the client - Jean II */ + self->tx_sdu_busy = FALSE; + } + + /* Reset lock */ + self->tx_queue_lock = 0; +} + +/* + * Function irttp_give_credit (self) + * + * Send a dataless flowdata TTP-PDU and give available credit to peer + * TSAP + */ +static inline void irttp_give_credit(struct tsap_cb *self) +{ + struct sk_buff *tx_skb = NULL; + unsigned long flags; + int n; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;); + + IRDA_DEBUG(4, "%s() send=%d,avail=%d,remote=%d\n", + __func__, + self->send_credit, self->avail_credit, self->remote_credit); + + /* Give credit to peer */ + tx_skb = alloc_skb(TTP_MAX_HEADER, GFP_ATOMIC); + if (!tx_skb) + return; + + /* Reserve space for LMP, and LAP header */ + skb_reserve(tx_skb, LMP_MAX_HEADER); + + /* + * Since we can transmit and receive frames concurrently, + * the code below is a critical region and we must assure that + * nobody messes with the credits while we update them. + */ + spin_lock_irqsave(&self->lock, flags); + + n = self->avail_credit; + self->avail_credit = 0; + + /* Only space for 127 credits in frame */ + if (n > 127) { + self->avail_credit = n - 127; + n = 127; + } + self->remote_credit += n; + + spin_unlock_irqrestore(&self->lock, flags); + + skb_put(tx_skb, 1); + tx_skb->data[0] = (__u8) (n & 0x7f); + + irlmp_data_request(self->lsap, tx_skb); + self->stats.tx_packets++; +} + +/* + * Function irttp_udata_indication (instance, sap, skb) + * + * Received some unit-data (unreliable) + * + */ +static int irttp_udata_indication(void *instance, void *sap, + struct sk_buff *skb) +{ + struct tsap_cb *self; + int err; + + IRDA_DEBUG(4, "%s()\n", __func__); + + self = (struct tsap_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;); + IRDA_ASSERT(skb != NULL, return -1;); + + self->stats.rx_packets++; + + /* Just pass data to layer above */ + if (self->notify.udata_indication) { + err = self->notify.udata_indication(self->notify.instance, + self,skb); + /* Same comment as in irttp_do_data_indication() */ + if (!err) + return 0; + } + /* Either no handler, or handler returns an error */ + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irttp_data_indication (instance, sap, skb) + * + * Receive segment from IrLMP. 
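+ * The first byte of every TTP data PDU carries the credits granted by
+ * the peer; if there is payload beyond that byte, it is queued on
+ * rx_queue for reassembly, otherwise the PDU was a dataless flow-data
+ * frame and is freed here.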
+ * + */ +static int irttp_data_indication(void *instance, void *sap, + struct sk_buff *skb) +{ + struct tsap_cb *self; + unsigned long flags; + int n; + + self = (struct tsap_cb *) instance; + + n = skb->data[0] & 0x7f; /* Extract the credits */ + + self->stats.rx_packets++; + + /* Deal with inbound credit + * Since we can transmit and receive frames concurrently, + * the code below is a critical region and we must assure that + * nobody messes with the credits while we update them. + */ + spin_lock_irqsave(&self->lock, flags); + self->send_credit += n; + if (skb->len > 1) + self->remote_credit--; + spin_unlock_irqrestore(&self->lock, flags); + + /* + * Data or dataless packet? Dataless frames contains only the + * TTP_HEADER. + */ + if (skb->len > 1) { + /* + * We don't remove the TTP header, since we must preserve the + * more bit, so the defragment routing knows what to do + */ + skb_queue_tail(&self->rx_queue, skb); + } else { + /* Dataless flowdata TTP-PDU */ + dev_kfree_skb(skb); + } + + + /* Push data to the higher layer. + * We do it synchronously because running the todo timer for each + * receive packet would be too much overhead and latency. + * By passing control to the higher layer, we run the risk that + * it may take time or grab a lock. Most often, the higher layer + * will only put packet in a queue. + * Anyway, packets are only dripping through the IrDA, so we can + * have time before the next packet. + * Further, we are run from NET_BH, so the worse that can happen is + * us missing the optimal time to send back the PF bit in LAP. + * Jean II */ + irttp_run_rx_queue(self); + + /* We now give credits to peer in irttp_run_rx_queue(). + * We need to send credit *NOW*, otherwise we are going + * to miss the next Tx window. The todo timer may take + * a while before it's run... - Jean II */ + + /* + * If the peer device has given us some credits and we didn't have + * anyone from before, then we need to shedule the tx queue. + * We need to do that because our Tx have stopped (so we may not + * get any LAP flow indication) and the user may be stopped as + * well. - Jean II + */ + if (self->send_credit == n) { + /* Restart pushing stuff to LAP */ + irttp_run_tx_queue(self); + /* Note : we don't want to schedule the todo timer + * because it has horrible latency. No tasklets + * because the tasklet API is broken. - Jean II */ + } + + return 0; +} + +/* + * Function irttp_status_indication (self, reason) + * + * Status_indication, just pass to the higher layer... + * + */ +static void irttp_status_indication(void *instance, + LINK_STATUS link, LOCK_STATUS lock) +{ + struct tsap_cb *self; + + IRDA_DEBUG(4, "%s()\n", __func__); + + self = (struct tsap_cb *) instance; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;); + + /* Check if client has already closed the TSAP and gone away */ + if (self->close_pend) + return; + + /* + * Inform service user if he has requested it + */ + if (self->notify.status_indication != NULL) + self->notify.status_indication(self->notify.instance, + link, lock); + else + IRDA_DEBUG(2, "%s(), no handler\n", __func__); +} + +/* + * Function irttp_flow_indication (self, reason) + * + * Flow_indication : IrLAP tells us to send more data. 
+ * + */ +static void irttp_flow_indication(void *instance, void *sap, LOCAL_FLOW flow) +{ + struct tsap_cb *self; + + self = (struct tsap_cb *) instance; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;); + + IRDA_DEBUG(4, "%s(instance=%p)\n", __func__, self); + + /* We are "polled" directly from LAP, and the LAP want to fill + * its Tx window. We want to do our best to send it data, so that + * we maximise the window. On the other hand, we want to limit the + * amount of work here so that LAP doesn't hang forever waiting + * for packets. - Jean II */ + + /* Try to send some packets. Currently, LAP calls us every time + * there is one free slot, so we will send only one packet. + * This allow the scheduler to do its round robin - Jean II */ + irttp_run_tx_queue(self); + + /* Note regarding the interraction with higher layer. + * irttp_run_tx_queue() may call the client when its queue + * start to empty, via notify.flow_indication(). Initially. + * I wanted this to happen in a tasklet, to avoid client + * grabbing the CPU, but we can't use tasklets safely. And timer + * is definitely too slow. + * This will happen only once per LAP window, and usually at + * the third packet (unless window is smaller). LAP is still + * doing mtt and sending first packet so it's sort of OK + * to do that. Jean II */ + + /* If we need to send disconnect. try to do it now */ + if(self->disconnect_pend) + irttp_start_todo_timer(self, 0); +} + +/* + * Function irttp_flow_request (self, command) + * + * This function could be used by the upper layers to tell IrTTP to stop + * delivering frames if the receive queues are starting to get full, or + * to tell IrTTP to start delivering frames again. + */ +void irttp_flow_request(struct tsap_cb *self, LOCAL_FLOW flow) +{ + IRDA_DEBUG(1, "%s()\n", __func__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;); + + switch (flow) { + case FLOW_STOP: + IRDA_DEBUG(1, "%s(), flow stop\n", __func__); + self->rx_sdu_busy = TRUE; + break; + case FLOW_START: + IRDA_DEBUG(1, "%s(), flow start\n", __func__); + self->rx_sdu_busy = FALSE; + + /* Client say he can accept more data, try to free our + * queues ASAP - Jean II */ + irttp_run_rx_queue(self); + + break; + default: + IRDA_DEBUG(1, "%s(), Unknown flow command!\n", __func__); + } +} +EXPORT_SYMBOL(irttp_flow_request); + +/* + * Function irttp_connect_request (self, dtsap_sel, daddr, qos) + * + * Try to connect to remote destination TSAP selector + * + */ +int irttp_connect_request(struct tsap_cb *self, __u8 dtsap_sel, + __u32 saddr, __u32 daddr, + struct qos_info *qos, __u32 max_sdu_size, + struct sk_buff *userdata) +{ + struct sk_buff *tx_skb; + __u8 *frame; + __u8 n; + + IRDA_DEBUG(4, "%s(), max_sdu_size=%d\n", __func__, max_sdu_size); + + IRDA_ASSERT(self != NULL, return -EBADR;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -EBADR;); + + if (self->connected) { + if(userdata) + dev_kfree_skb(userdata); + return -EISCONN; + } + + /* Any userdata supplied? 
*/ + if (userdata == NULL) { + tx_skb = alloc_skb(TTP_MAX_HEADER + TTP_SAR_HEADER, + GFP_ATOMIC); + if (!tx_skb) + return -ENOMEM; + + /* Reserve space for MUX_CONTROL and LAP header */ + skb_reserve(tx_skb, TTP_MAX_HEADER + TTP_SAR_HEADER); + } else { + tx_skb = userdata; + /* + * Check that the client has reserved enough space for + * headers + */ + IRDA_ASSERT(skb_headroom(userdata) >= TTP_MAX_HEADER, + { dev_kfree_skb(userdata); return -1; } ); + } + + /* Initialize connection parameters */ + self->connected = FALSE; + self->avail_credit = 0; + self->rx_max_sdu_size = max_sdu_size; + self->rx_sdu_size = 0; + self->rx_sdu_busy = FALSE; + self->dtsap_sel = dtsap_sel; + + n = self->initial_credit; + + self->remote_credit = 0; + self->send_credit = 0; + + /* + * Give away max 127 credits for now + */ + if (n > 127) { + self->avail_credit=n-127; + n = 127; + } + + self->remote_credit = n; + + /* SAR enabled? */ + if (max_sdu_size > 0) { + IRDA_ASSERT(skb_headroom(tx_skb) >= (TTP_MAX_HEADER + TTP_SAR_HEADER), + { dev_kfree_skb(tx_skb); return -1; } ); + + /* Insert SAR parameters */ + frame = skb_push(tx_skb, TTP_HEADER+TTP_SAR_HEADER); + + frame[0] = TTP_PARAMETERS | n; + frame[1] = 0x04; /* Length */ + frame[2] = 0x01; /* MaxSduSize */ + frame[3] = 0x02; /* Value length */ + + put_unaligned(cpu_to_be16((__u16) max_sdu_size), + (__be16 *)(frame+4)); + } else { + /* Insert plain TTP header */ + frame = skb_push(tx_skb, TTP_HEADER); + + /* Insert initial credit in frame */ + frame[0] = n & 0x7f; + } + + /* Connect with IrLMP. No QoS parameters for now */ + return irlmp_connect_request(self->lsap, dtsap_sel, saddr, daddr, qos, + tx_skb); +} +EXPORT_SYMBOL(irttp_connect_request); + +/* + * Function irttp_connect_confirm (handle, qos, skb) + * + * Service user confirms TSAP connection with peer. + * + */ +static void irttp_connect_confirm(void *instance, void *sap, + struct qos_info *qos, __u32 max_seg_size, + __u8 max_header_size, struct sk_buff *skb) +{ + struct tsap_cb *self; + int parameters; + int ret; + __u8 plen; + __u8 n; + + IRDA_DEBUG(4, "%s()\n", __func__); + + self = (struct tsap_cb *) instance; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + self->max_seg_size = max_seg_size - TTP_HEADER; + self->max_header_size = max_header_size + TTP_HEADER; + + /* + * Check if we have got some QoS parameters back! This should be the + * negotiated QoS for the link. + */ + if (qos) { + IRDA_DEBUG(4, "IrTTP, Negotiated BAUD_RATE: %02x\n", + qos->baud_rate.bits); + IRDA_DEBUG(4, "IrTTP, Negotiated BAUD_RATE: %d bps.\n", + qos->baud_rate.value); + } + + n = skb->data[0] & 0x7f; + + IRDA_DEBUG(4, "%s(), Initial send_credit=%d\n", __func__, n); + + self->send_credit = n; + self->tx_max_sdu_size = 0; + self->connected = TRUE; + + parameters = skb->data[0] & 0x80; + + IRDA_ASSERT(skb->len >= TTP_HEADER, return;); + skb_pull(skb, TTP_HEADER); + + if (parameters) { + plen = skb->data[0]; + + ret = irda_param_extract_all(self, skb->data+1, + IRDA_MIN(skb->len-1, plen), + ¶m_info); + + /* Any errors in the parameter list? 
*/ + if (ret < 0) { + IRDA_WARNING("%s: error extracting parameters\n", + __func__); + dev_kfree_skb(skb); + + /* Do not accept this connection attempt */ + return; + } + /* Remove parameters */ + skb_pull(skb, IRDA_MIN(skb->len, plen+1)); + } + + IRDA_DEBUG(4, "%s() send=%d,avail=%d,remote=%d\n", __func__, + self->send_credit, self->avail_credit, self->remote_credit); + + IRDA_DEBUG(2, "%s(), MaxSduSize=%d\n", __func__, + self->tx_max_sdu_size); + + if (self->notify.connect_confirm) { + self->notify.connect_confirm(self->notify.instance, self, qos, + self->tx_max_sdu_size, + self->max_header_size, skb); + } else + dev_kfree_skb(skb); +} + +/* + * Function irttp_connect_indication (handle, skb) + * + * Some other device is connecting to this TSAP + * + */ +static void irttp_connect_indication(void *instance, void *sap, + struct qos_info *qos, __u32 max_seg_size, __u8 max_header_size, + struct sk_buff *skb) +{ + struct tsap_cb *self; + struct lsap_cb *lsap; + int parameters; + int ret; + __u8 plen; + __u8 n; + + self = (struct tsap_cb *) instance; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + lsap = (struct lsap_cb *) sap; + + self->max_seg_size = max_seg_size - TTP_HEADER; + self->max_header_size = max_header_size+TTP_HEADER; + + IRDA_DEBUG(4, "%s(), TSAP sel=%02x\n", __func__, self->stsap_sel); + + /* Need to update dtsap_sel if its equal to LSAP_ANY */ + self->dtsap_sel = lsap->dlsap_sel; + + n = skb->data[0] & 0x7f; + + self->send_credit = n; + self->tx_max_sdu_size = 0; + + parameters = skb->data[0] & 0x80; + + IRDA_ASSERT(skb->len >= TTP_HEADER, return;); + skb_pull(skb, TTP_HEADER); + + if (parameters) { + plen = skb->data[0]; + + ret = irda_param_extract_all(self, skb->data+1, + IRDA_MIN(skb->len-1, plen), + ¶m_info); + + /* Any errors in the parameter list? */ + if (ret < 0) { + IRDA_WARNING("%s: error extracting parameters\n", + __func__); + dev_kfree_skb(skb); + + /* Do not accept this connection attempt */ + return; + } + + /* Remove parameters */ + skb_pull(skb, IRDA_MIN(skb->len, plen+1)); + } + + if (self->notify.connect_indication) { + self->notify.connect_indication(self->notify.instance, self, + qos, self->tx_max_sdu_size, + self->max_header_size, skb); + } else + dev_kfree_skb(skb); +} + +/* + * Function irttp_connect_response (handle, userdata) + * + * Service user is accepting the connection, just pass it down to + * IrLMP! + * + */ +int irttp_connect_response(struct tsap_cb *self, __u32 max_sdu_size, + struct sk_buff *userdata) +{ + struct sk_buff *tx_skb; + __u8 *frame; + int ret; + __u8 n; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;); + + IRDA_DEBUG(4, "%s(), Source TSAP selector=%02x\n", __func__, + self->stsap_sel); + + /* Any userdata supplied? 
*/ + if (userdata == NULL) { + tx_skb = alloc_skb(TTP_MAX_HEADER + TTP_SAR_HEADER, + GFP_ATOMIC); + if (!tx_skb) + return -ENOMEM; + + /* Reserve space for MUX_CONTROL and LAP header */ + skb_reserve(tx_skb, TTP_MAX_HEADER + TTP_SAR_HEADER); + } else { + tx_skb = userdata; + /* + * Check that the client has reserved enough space for + * headers + */ + IRDA_ASSERT(skb_headroom(userdata) >= TTP_MAX_HEADER, + { dev_kfree_skb(userdata); return -1; } ); + } + + self->avail_credit = 0; + self->remote_credit = 0; + self->rx_max_sdu_size = max_sdu_size; + self->rx_sdu_size = 0; + self->rx_sdu_busy = FALSE; + + n = self->initial_credit; + + /* Frame has only space for max 127 credits (7 bits) */ + if (n > 127) { + self->avail_credit = n - 127; + n = 127; + } + + self->remote_credit = n; + self->connected = TRUE; + + /* SAR enabled? */ + if (max_sdu_size > 0) { + IRDA_ASSERT(skb_headroom(tx_skb) >= (TTP_MAX_HEADER + TTP_SAR_HEADER), + { dev_kfree_skb(tx_skb); return -1; } ); + + /* Insert TTP header with SAR parameters */ + frame = skb_push(tx_skb, TTP_HEADER+TTP_SAR_HEADER); + + frame[0] = TTP_PARAMETERS | n; + frame[1] = 0x04; /* Length */ + + /* irda_param_insert(self, IRTTP_MAX_SDU_SIZE, frame+1, */ +/* TTP_SAR_HEADER, ¶m_info) */ + + frame[2] = 0x01; /* MaxSduSize */ + frame[3] = 0x02; /* Value length */ + + put_unaligned(cpu_to_be16((__u16) max_sdu_size), + (__be16 *)(frame+4)); + } else { + /* Insert TTP header */ + frame = skb_push(tx_skb, TTP_HEADER); + + frame[0] = n & 0x7f; + } + + ret = irlmp_connect_response(self->lsap, tx_skb); + + return ret; +} +EXPORT_SYMBOL(irttp_connect_response); + +/* + * Function irttp_dup (self, instance) + * + * Duplicate TSAP, can be used by servers to confirm a connection on a + * new TSAP so it can keep listening on the old one. + */ +struct tsap_cb *irttp_dup(struct tsap_cb *orig, void *instance) +{ + struct tsap_cb *new; + unsigned long flags; + + IRDA_DEBUG(1, "%s()\n", __func__); + + /* Protect our access to the old tsap instance */ + spin_lock_irqsave(&irttp->tsaps->hb_spinlock, flags); + + /* Find the old instance */ + if (!hashbin_find(irttp->tsaps, (long) orig, NULL)) { + IRDA_DEBUG(0, "%s(), unable to find TSAP\n", __func__); + spin_unlock_irqrestore(&irttp->tsaps->hb_spinlock, flags); + return NULL; + } + + /* Allocate a new instance */ + new = kmalloc(sizeof(struct tsap_cb), GFP_ATOMIC); + if (!new) { + IRDA_DEBUG(0, "%s(), unable to kmalloc\n", __func__); + spin_unlock_irqrestore(&irttp->tsaps->hb_spinlock, flags); + return NULL; + } + /* Dup */ + memcpy(new, orig, sizeof(struct tsap_cb)); + spin_lock_init(&new->lock); + + /* We don't need the old instance any more */ + spin_unlock_irqrestore(&irttp->tsaps->hb_spinlock, flags); + + /* Try to dup the LSAP (may fail if we were too slow) */ + new->lsap = irlmp_dup(orig->lsap, new); + if (!new->lsap) { + IRDA_DEBUG(0, "%s(), dup failed!\n", __func__); + kfree(new); + return NULL; + } + + /* Not everything should be copied */ + new->notify.instance = instance; + + /* Initialize internal objects */ + irttp_init_tsap(new); + + /* This is locked */ + hashbin_insert(irttp->tsaps, (irda_queue_t *) new, (long) new, NULL); + + return new; +} +EXPORT_SYMBOL(irttp_dup); + +/* + * Function irttp_disconnect_request (self) + * + * Close this connection please! 
If priority is high, the queued data + * segments, if any, will be deallocated first + * + */ +int irttp_disconnect_request(struct tsap_cb *self, struct sk_buff *userdata, + int priority) +{ + int ret; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;); + + /* Already disconnected? */ + if (!self->connected) { + IRDA_DEBUG(4, "%s(), already disconnected!\n", __func__); + if (userdata) + dev_kfree_skb(userdata); + return -1; + } + + /* Disconnect already pending ? + * We need to use an atomic operation to prevent reentry. This + * function may be called from various context, like user, timer + * for following a disconnect_indication() (i.e. net_bh). + * Jean II */ + if(test_and_set_bit(0, &self->disconnect_pend)) { + IRDA_DEBUG(0, "%s(), disconnect already pending\n", + __func__); + if (userdata) + dev_kfree_skb(userdata); + + /* Try to make some progress */ + irttp_run_tx_queue(self); + return -1; + } + + /* + * Check if there is still data segments in the transmit queue + */ + if (!skb_queue_empty(&self->tx_queue)) { + if (priority == P_HIGH) { + /* + * No need to send the queued data, if we are + * disconnecting right now since the data will + * not have any usable connection to be sent on + */ + IRDA_DEBUG(1, "%s(): High priority!!()\n", __func__); + irttp_flush_queues(self); + } else if (priority == P_NORMAL) { + /* + * Must delay disconnect until after all data segments + * have been sent and the tx_queue is empty + */ + /* We'll reuse this one later for the disconnect */ + self->disconnect_skb = userdata; /* May be NULL */ + + irttp_run_tx_queue(self); + + irttp_start_todo_timer(self, HZ/10); + return -1; + } + } + /* Note : we don't need to check if self->rx_queue is full and the + * state of self->rx_sdu_busy because the disconnect response will + * be sent at the LMP level (so even if the peer has its Tx queue + * full of data). - Jean II */ + + IRDA_DEBUG(1, "%s(), Disconnecting ...\n", __func__); + self->connected = FALSE; + + if (!userdata) { + struct sk_buff *tx_skb; + tx_skb = alloc_skb(LMP_MAX_HEADER, GFP_ATOMIC); + if (!tx_skb) + return -ENOMEM; + + /* + * Reserve space for MUX and LAP header + */ + skb_reserve(tx_skb, LMP_MAX_HEADER); + + userdata = tx_skb; + } + ret = irlmp_disconnect_request(self->lsap, userdata); + + /* The disconnect is no longer pending */ + clear_bit(0, &self->disconnect_pend); /* FALSE */ + + return ret; +} +EXPORT_SYMBOL(irttp_disconnect_request); + +/* + * Function irttp_disconnect_indication (self, reason) + * + * Disconnect indication, TSAP disconnected by peer? + * + */ +static void irttp_disconnect_indication(void *instance, void *sap, + LM_REASON reason, struct sk_buff *skb) +{ + struct tsap_cb *self; + + IRDA_DEBUG(4, "%s()\n", __func__); + + self = (struct tsap_cb *) instance; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;); + + /* Prevent higher layer to send more data */ + self->connected = FALSE; + + /* Check if client has already tried to close the TSAP */ + if (self->close_pend) { + /* In this case, the higher layer is probably gone. Don't + * bother it and clean up the remains - Jean II */ + if (skb) + dev_kfree_skb(skb); + irttp_close_tsap(self); + return; + } + + /* If we are here, we assume that is the higher layer is still + * waiting for the disconnect notification and able to process it, + * even if he tried to disconnect. Otherwise, it would have already + * attempted to close the tsap and self->close_pend would be TRUE. 
+ * Jean II */ + + /* No need to notify the client if has already tried to disconnect */ + if(self->notify.disconnect_indication) + self->notify.disconnect_indication(self->notify.instance, self, + reason, skb); + else + if (skb) + dev_kfree_skb(skb); +} + +/* + * Function irttp_do_data_indication (self, skb) + * + * Try to deliver reassembled skb to layer above, and requeue it if that + * for some reason should fail. We mark rx sdu as busy to apply back + * pressure is necessary. + */ +static void irttp_do_data_indication(struct tsap_cb *self, struct sk_buff *skb) +{ + int err; + + /* Check if client has already closed the TSAP and gone away */ + if (self->close_pend) { + dev_kfree_skb(skb); + return; + } + + err = self->notify.data_indication(self->notify.instance, self, skb); + + /* Usually the layer above will notify that it's input queue is + * starting to get filled by using the flow request, but this may + * be difficult, so it can instead just refuse to eat it and just + * give an error back + */ + if (err) { + IRDA_DEBUG(0, "%s() requeueing skb!\n", __func__); + + /* Make sure we take a break */ + self->rx_sdu_busy = TRUE; + + /* Need to push the header in again */ + skb_push(skb, TTP_HEADER); + skb->data[0] = 0x00; /* Make sure MORE bit is cleared */ + + /* Put skb back on queue */ + skb_queue_head(&self->rx_queue, skb); + } +} + +/* + * Function irttp_run_rx_queue (self) + * + * Check if we have any frames to be transmitted, or if we have any + * available credit to give away. + */ +static void irttp_run_rx_queue(struct tsap_cb *self) +{ + struct sk_buff *skb; + int more = 0; + + IRDA_DEBUG(2, "%s() send=%d,avail=%d,remote=%d\n", __func__, + self->send_credit, self->avail_credit, self->remote_credit); + + /* Get exclusive access to the rx queue, otherwise don't touch it */ + if (irda_lock(&self->rx_queue_lock) == FALSE) + return; + + /* + * Reassemble all frames in receive queue and deliver them + */ + while (!self->rx_sdu_busy && (skb = skb_dequeue(&self->rx_queue))) { + /* This bit will tell us if it's the last fragment or not */ + more = skb->data[0] & 0x80; + + /* Remove TTP header */ + skb_pull(skb, TTP_HEADER); + + /* Add the length of the remaining data */ + self->rx_sdu_size += skb->len; + + /* + * If SAR is disabled, or user has requested no reassembly + * of received fragments then we just deliver them + * immediately. This can be requested by clients that + * implements byte streams without any message boundaries + */ + if (self->rx_max_sdu_size == TTP_SAR_DISABLE) { + irttp_do_data_indication(self, skb); + self->rx_sdu_size = 0; + + continue; + } + + /* Check if this is a fragment, and not the last fragment */ + if (more) { + /* + * Queue the fragment if we still are within the + * limits of the maximum size of the rx_sdu + */ + if (self->rx_sdu_size <= self->rx_max_sdu_size) { + IRDA_DEBUG(4, "%s(), queueing frag\n", + __func__); + skb_queue_tail(&self->rx_fragments, skb); + } else { + /* Free the part of the SDU that is too big */ + dev_kfree_skb(skb); + } + continue; + } + /* + * This is the last fragment, so time to reassemble! + */ + if ((self->rx_sdu_size <= self->rx_max_sdu_size) || + (self->rx_max_sdu_size == TTP_SAR_UNBOUND)) + { + /* + * A little optimizing. Only queue the fragment if + * there are other fragments. 
Since if this is the + * last and only fragment, there is no need to + * reassemble :-) + */ + if (!skb_queue_empty(&self->rx_fragments)) { + skb_queue_tail(&self->rx_fragments, + skb); + + skb = irttp_reassemble_skb(self); + } + + /* Now we can deliver the reassembled skb */ + irttp_do_data_indication(self, skb); + } else { + IRDA_DEBUG(1, "%s(), Truncated frame\n", __func__); + + /* Free the part of the SDU that is too big */ + dev_kfree_skb(skb); + + /* Deliver only the valid but truncated part of SDU */ + skb = irttp_reassemble_skb(self); + + irttp_do_data_indication(self, skb); + } + self->rx_sdu_size = 0; + } + + /* + * It's not trivial to keep track of how many credits are available + * by incrementing at each packet, because delivery may fail + * (irttp_do_data_indication() may requeue the frame) and because + * we need to take care of fragmentation. + * We want the other side to send up to initial_credit packets. + * We have some frames in our queues, and we have already allowed it + * to send remote_credit. + * No need to spinlock, write is atomic and self correcting... + * Jean II + */ + self->avail_credit = (self->initial_credit - + (self->remote_credit + + skb_queue_len(&self->rx_queue) + + skb_queue_len(&self->rx_fragments))); + + /* Do we have too much credits to send to peer ? */ + if ((self->remote_credit <= TTP_RX_MIN_CREDIT) && + (self->avail_credit > 0)) { + /* Send explicit credit frame */ + irttp_give_credit(self); + /* Note : do *NOT* check if tx_queue is non-empty, that + * will produce deadlocks. I repeat : send a credit frame + * even if we have something to send in our Tx queue. + * If we have credits, it means that our Tx queue is blocked. + * + * Let's suppose the peer can't keep up with our Tx. He will + * flow control us by not sending us any credits, and we + * will stop Tx and start accumulating credits here. + * Up to the point where the peer will stop its Tx queue, + * for lack of credits. + * Let's assume the peer application is single threaded. + * It will block on Tx and never consume any Rx buffer. + * Deadlock. Guaranteed. - Jean II + */ + } + + /* Reset lock */ + self->rx_queue_lock = 0; +} + +#ifdef CONFIG_PROC_FS +struct irttp_iter_state { + int id; +}; + +static void *irttp_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct irttp_iter_state *iter = seq->private; + struct tsap_cb *self; + + /* Protect our access to the tsap list */ + spin_lock_irq(&irttp->tsaps->hb_spinlock); + iter->id = 0; + + for (self = (struct tsap_cb *) hashbin_get_first(irttp->tsaps); + self != NULL; + self = (struct tsap_cb *) hashbin_get_next(irttp->tsaps)) { + if (iter->id == *pos) + break; + ++iter->id; + } + + return self; +} + +static void *irttp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct irttp_iter_state *iter = seq->private; + + ++*pos; + ++iter->id; + return (void *) hashbin_get_next(irttp->tsaps); +} + +static void irttp_seq_stop(struct seq_file *seq, void *v) +{ + spin_unlock_irq(&irttp->tsaps->hb_spinlock); +} + +static int irttp_seq_show(struct seq_file *seq, void *v) +{ + const struct irttp_iter_state *iter = seq->private; + const struct tsap_cb *self = v; + + seq_printf(seq, "TSAP %d, ", iter->id); + seq_printf(seq, "stsap_sel: %02x, ", + self->stsap_sel); + seq_printf(seq, "dtsap_sel: %02x\n", + self->dtsap_sel); + seq_printf(seq, " connected: %s, ", + self->connected? 
"TRUE":"FALSE"); + seq_printf(seq, "avail credit: %d, ", + self->avail_credit); + seq_printf(seq, "remote credit: %d, ", + self->remote_credit); + seq_printf(seq, "send credit: %d\n", + self->send_credit); + seq_printf(seq, " tx packets: %lu, ", + self->stats.tx_packets); + seq_printf(seq, "rx packets: %lu, ", + self->stats.rx_packets); + seq_printf(seq, "tx_queue len: %u ", + skb_queue_len(&self->tx_queue)); + seq_printf(seq, "rx_queue len: %u\n", + skb_queue_len(&self->rx_queue)); + seq_printf(seq, " tx_sdu_busy: %s, ", + self->tx_sdu_busy? "TRUE":"FALSE"); + seq_printf(seq, "rx_sdu_busy: %s\n", + self->rx_sdu_busy? "TRUE":"FALSE"); + seq_printf(seq, " max_seg_size: %u, ", + self->max_seg_size); + seq_printf(seq, "tx_max_sdu_size: %u, ", + self->tx_max_sdu_size); + seq_printf(seq, "rx_max_sdu_size: %u\n", + self->rx_max_sdu_size); + + seq_printf(seq, " Used by (%s)\n\n", + self->notify.name); + return 0; +} + +static const struct seq_operations irttp_seq_ops = { + .start = irttp_seq_start, + .next = irttp_seq_next, + .stop = irttp_seq_stop, + .show = irttp_seq_show, +}; + +static int irttp_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_private(file, &irttp_seq_ops, + sizeof(struct irttp_iter_state)); +} + +const struct file_operations irttp_seq_fops = { + .owner = THIS_MODULE, + .open = irttp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif /* PROC_FS */ diff --git a/net/irda/parameters.c b/net/irda/parameters.c new file mode 100644 index 00000000..71cd38c1 --- /dev/null +++ b/net/irda/parameters.c @@ -0,0 +1,591 @@ +/********************************************************************* + * + * Filename: parameters.c + * Version: 1.0 + * Description: A more general way to handle (pi,pl,pv) parameters + * Status: Experimental. + * Author: Dag Brattli + * Created at: Mon Jun 7 10:25:11 1999 + * Modified at: Sun Jan 30 14:08:39 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include + +#include +#include + +#include +#include + +static int irda_extract_integer(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func); +static int irda_extract_string(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func); +static int irda_extract_octseq(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func); +static int irda_extract_no_value(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func); + +static int irda_insert_integer(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func); +static int irda_insert_no_value(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func); + +static int irda_param_unpack(__u8 *buf, char *fmt, ...); + +/* Parameter value call table. Must match PV_TYPE */ +static PV_HANDLER pv_extract_table[] = { + irda_extract_integer, /* Handler for any length integers */ + irda_extract_integer, /* Handler for 8 bits integers */ + irda_extract_integer, /* Handler for 16 bits integers */ + irda_extract_string, /* Handler for strings */ + irda_extract_integer, /* Handler for 32 bits integers */ + irda_extract_octseq, /* Handler for octet sequences */ + irda_extract_no_value /* Handler for no value parameters */ +}; + +static PV_HANDLER pv_insert_table[] = { + irda_insert_integer, /* Handler for any length integers */ + irda_insert_integer, /* Handler for 8 bits integers */ + irda_insert_integer, /* Handler for 16 bits integers */ + NULL, /* Handler for strings */ + irda_insert_integer, /* Handler for 32 bits integers */ + NULL, /* Handler for octet sequences */ + irda_insert_no_value /* Handler for no value parameters */ +}; + +/* + * Function irda_insert_no_value (self, buf, len, pi, type, func) + */ +static int irda_insert_no_value(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func) +{ + irda_param_t p; + int ret; + + p.pi = pi; + p.pl = 0; + + /* Call handler for this parameter */ + ret = (*func)(self, &p, PV_GET); + + /* Extract values anyway, since handler may need them */ + irda_param_pack(buf, "bb", p.pi, p.pl); + + if (ret < 0) + return ret; + + return 2; /* Inserted pl+2 bytes */ +} + +/* + * Function irda_extract_no_value (self, buf, len, type, func) + * + * Extracts a parameter without a pv field (pl=0) + * + */ +static int irda_extract_no_value(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func) +{ + irda_param_t p; + int ret; + + /* Extract values anyway, since handler may need them */ + irda_param_unpack(buf, "bb", &p.pi, &p.pl); + + /* Call handler for this parameter */ + ret = (*func)(self, &p, PV_PUT); + + if (ret < 0) + return ret; + + return 2; /* Extracted pl+2 bytes */ +} + +/* + * Function irda_insert_integer (self, buf, len, pi, type, func) + */ +static int irda_insert_integer(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func) +{ + irda_param_t p; + int n = 0; + int err; + + p.pi = pi; /* In case handler needs to know */ + p.pl = type & PV_MASK; /* The integer type codes the length as well */ + p.pv.i = 0; /* Clear value */ + + /* Call handler for this parameter */ + err = (*func)(self, &p, PV_GET); + if (err < 0) + return err; + + /* + * If parameter length is 
still 0, then (1) this is an any length + * integer, and (2) the handler function does not care which length + * we choose to use, so we pick the one the gives the fewest bytes. + */ + if (p.pl == 0) { + if (p.pv.i < 0xff) { + IRDA_DEBUG(2, "%s(), using 1 byte\n", __func__); + p.pl = 1; + } else if (p.pv.i < 0xffff) { + IRDA_DEBUG(2, "%s(), using 2 bytes\n", __func__); + p.pl = 2; + } else { + IRDA_DEBUG(2, "%s(), using 4 bytes\n", __func__); + p.pl = 4; /* Default length */ + } + } + /* Check if buffer is long enough for insertion */ + if (len < (2+p.pl)) { + IRDA_WARNING("%s: buffer too short for insertion!\n", + __func__); + return -1; + } + IRDA_DEBUG(2, "%s(), pi=%#x, pl=%d, pi=%d\n", __func__, + p.pi, p.pl, p.pv.i); + switch (p.pl) { + case 1: + n += irda_param_pack(buf, "bbb", p.pi, p.pl, (__u8) p.pv.i); + break; + case 2: + if (type & PV_BIG_ENDIAN) + p.pv.i = cpu_to_be16((__u16) p.pv.i); + else + p.pv.i = cpu_to_le16((__u16) p.pv.i); + n += irda_param_pack(buf, "bbs", p.pi, p.pl, (__u16) p.pv.i); + break; + case 4: + if (type & PV_BIG_ENDIAN) + cpu_to_be32s(&p.pv.i); + else + cpu_to_le32s(&p.pv.i); + n += irda_param_pack(buf, "bbi", p.pi, p.pl, p.pv.i); + + break; + default: + IRDA_WARNING("%s: length %d not supported\n", + __func__, p.pl); + /* Skip parameter */ + return -1; + } + + return p.pl+2; /* Inserted pl+2 bytes */ +} + +/* + * Function irda_extract integer (self, buf, len, pi, type, func) + * + * Extract a possibly variable length integer from buffer, and call + * handler for processing of the parameter + */ +static int irda_extract_integer(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func) +{ + irda_param_t p; + int n = 0; + int extract_len; /* Real length we extract */ + int err; + + p.pi = pi; /* In case handler needs to know */ + p.pl = buf[1]; /* Extract length of value */ + p.pv.i = 0; /* Clear value */ + extract_len = p.pl; /* Default : extract all */ + + /* Check if buffer is long enough for parsing */ + if (len < (2+p.pl)) { + IRDA_WARNING("%s: buffer too short for parsing! " + "Need %d bytes, but len is only %d\n", + __func__, p.pl, len); + return -1; + } + + /* + * Check that the integer length is what we expect it to be. If the + * handler want a 16 bits integer then a 32 bits is not good enough + * PV_INTEGER means that the handler is flexible. + */ + if (((type & PV_MASK) != PV_INTEGER) && ((type & PV_MASK) != p.pl)) { + IRDA_ERROR("%s: invalid parameter length! " + "Expected %d bytes, but value had %d bytes!\n", + __func__, type & PV_MASK, p.pl); + + /* Most parameters are bit/byte fields or little endian, + * so it's ok to only extract a subset of it (the subset + * that the handler expect). This is necessary, as some + * broken implementations seems to add extra undefined bits. + * If the parameter is shorter than we expect or is big + * endian, we can't play those tricks. 
Jean II */ + if((p.pl < (type & PV_MASK)) || (type & PV_BIG_ENDIAN)) { + /* Skip parameter */ + return p.pl+2; + } else { + /* Extract subset of it, fallthrough */ + extract_len = type & PV_MASK; + } + } + + + switch (extract_len) { + case 1: + n += irda_param_unpack(buf+2, "b", &p.pv.i); + break; + case 2: + n += irda_param_unpack(buf+2, "s", &p.pv.i); + if (type & PV_BIG_ENDIAN) + p.pv.i = be16_to_cpu((__u16) p.pv.i); + else + p.pv.i = le16_to_cpu((__u16) p.pv.i); + break; + case 4: + n += irda_param_unpack(buf+2, "i", &p.pv.i); + if (type & PV_BIG_ENDIAN) + be32_to_cpus(&p.pv.i); + else + le32_to_cpus(&p.pv.i); + break; + default: + IRDA_WARNING("%s: length %d not supported\n", + __func__, p.pl); + + /* Skip parameter */ + return p.pl+2; + } + + IRDA_DEBUG(2, "%s(), pi=%#x, pl=%d, pi=%d\n", __func__, + p.pi, p.pl, p.pv.i); + /* Call handler for this parameter */ + err = (*func)(self, &p, PV_PUT); + if (err < 0) + return err; + + return p.pl+2; /* Extracted pl+2 bytes */ +} + +/* + * Function irda_extract_string (self, buf, len, type, func) + */ +static int irda_extract_string(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func) +{ + char str[33]; + irda_param_t p; + int err; + + IRDA_DEBUG(2, "%s()\n", __func__); + + p.pi = pi; /* In case handler needs to know */ + p.pl = buf[1]; /* Extract length of value */ + if (p.pl > 32) + p.pl = 32; + + IRDA_DEBUG(2, "%s(), pi=%#x, pl=%d\n", __func__, + p.pi, p.pl); + + /* Check if buffer is long enough for parsing */ + if (len < (2+p.pl)) { + IRDA_WARNING("%s: buffer too short for parsing! " + "Need %d bytes, but len is only %d\n", + __func__, p.pl, len); + return -1; + } + + /* Should be safe to copy string like this since we have already + * checked that the buffer is long enough */ + strncpy(str, buf+2, p.pl); + + IRDA_DEBUG(2, "%s(), str=0x%02x 0x%02x\n", __func__, + (__u8) str[0], (__u8) str[1]); + + /* Null terminate string */ + str[p.pl] = '\0'; + + p.pv.c = str; /* Handler will need to take a copy */ + + /* Call handler for this parameter */ + err = (*func)(self, &p, PV_PUT); + if (err < 0) + return err; + + return p.pl+2; /* Extracted pl+2 bytes */ +} + +/* + * Function irda_extract_octseq (self, buf, len, type, func) + */ +static int irda_extract_octseq(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func) +{ + irda_param_t p; + + p.pi = pi; /* In case handler needs to know */ + p.pl = buf[1]; /* Extract length of value */ + + /* Check if buffer is long enough for parsing */ + if (len < (2+p.pl)) { + IRDA_WARNING("%s: buffer too short for parsing! " + "Need %d bytes, but len is only %d\n", + __func__, p.pl, len); + return -1; + } + + IRDA_DEBUG(0, "%s(), not impl\n", __func__); + + return p.pl+2; /* Extracted pl+2 bytes */ +} + +/* + * Function irda_param_pack (skb, fmt, ...) + * + * Format: + * 'i' = 32 bits integer + * 's' = string + * + */ +int irda_param_pack(__u8 *buf, char *fmt, ...) 
+{ + irda_pv_t arg; + va_list args; + char *p; + int n = 0; + + va_start(args, fmt); + + for (p = fmt; *p != '\0'; p++) { + switch (*p) { + case 'b': /* 8 bits unsigned byte */ + buf[n++] = (__u8)va_arg(args, int); + break; + case 's': /* 16 bits unsigned short */ + arg.i = (__u16)va_arg(args, int); + put_unaligned((__u16)arg.i, (__u16 *)(buf+n)); n+=2; + break; + case 'i': /* 32 bits unsigned integer */ + arg.i = va_arg(args, __u32); + put_unaligned(arg.i, (__u32 *)(buf+n)); n+=4; + break; +#if 0 + case 'c': /* \0 terminated string */ + arg.c = va_arg(args, char *); + strcpy(buf+n, arg.c); + n += strlen(arg.c) + 1; + break; +#endif + default: + va_end(args); + return -1; + } + } + va_end(args); + + return 0; +} +EXPORT_SYMBOL(irda_param_pack); + +/* + * Function irda_param_unpack (skb, fmt, ...) + */ +static int irda_param_unpack(__u8 *buf, char *fmt, ...) +{ + irda_pv_t arg; + va_list args; + char *p; + int n = 0; + + va_start(args, fmt); + + for (p = fmt; *p != '\0'; p++) { + switch (*p) { + case 'b': /* 8 bits byte */ + arg.ip = va_arg(args, __u32 *); + *arg.ip = buf[n++]; + break; + case 's': /* 16 bits short */ + arg.ip = va_arg(args, __u32 *); + *arg.ip = get_unaligned((__u16 *)(buf+n)); n+=2; + break; + case 'i': /* 32 bits unsigned integer */ + arg.ip = va_arg(args, __u32 *); + *arg.ip = get_unaligned((__u32 *)(buf+n)); n+=4; + break; +#if 0 + case 'c': /* \0 terminated string */ + arg.c = va_arg(args, char *); + strcpy(arg.c, buf+n); + n += strlen(arg.c) + 1; + break; +#endif + default: + va_end(args); + return -1; + } + + } + va_end(args); + + return 0; +} + +/* + * Function irda_param_insert (self, pi, buf, len, info) + * + * Insert the specified parameter (pi) into buffer. Returns number of + * bytes inserted + */ +int irda_param_insert(void *self, __u8 pi, __u8 *buf, int len, + pi_param_info_t *info) +{ + pi_minor_info_t *pi_minor_info; + __u8 pi_minor; + __u8 pi_major; + int type; + int ret = -1; + int n = 0; + + IRDA_ASSERT(buf != NULL, return ret;); + IRDA_ASSERT(info != NULL, return ret;); + + pi_minor = pi & info->pi_mask; + pi_major = pi >> info->pi_major_offset; + + /* Check if the identifier value (pi) is valid */ + if ((pi_major > info->len-1) || + (pi_minor > info->tables[pi_major].len-1)) + { + IRDA_DEBUG(0, "%s(), no handler for parameter=0x%02x\n", + __func__, pi); + + /* Skip this parameter */ + return -1; + } + + /* Lookup the info on how to parse this parameter */ + pi_minor_info = &info->tables[pi_major].pi_minor_call_table[pi_minor]; + + /* Find expected data type for this parameter identifier (pi)*/ + type = pi_minor_info->type; + + /* Check if handler has been implemented */ + if (!pi_minor_info->func) { + IRDA_MESSAGE("%s: no handler for pi=%#x\n", __func__, pi); + /* Skip this parameter */ + return -1; + } + + /* Insert parameter value */ + ret = (*pv_insert_table[type & PV_MASK])(self, buf+n, len, pi, type, + pi_minor_info->func); + return ret; +} +EXPORT_SYMBOL(irda_param_insert); + +/* + * Function irda_param_extract (self, buf, len, info) + * + * Parse all parameters. If len is correct, then everything should be + * safe. 
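parameters.c treats every negotiation option as a (pi, pl, pv) triplet: a one-byte parameter identifier, a one-byte value length, then the value itself. That is exactly the layout irttp_connect_request() hand-builds for MaxSduSize (pi=0x01, pl=0x02, big-endian value). The standalone walk below is a simplified illustration in the spirit of irda_param_extract_all(), without the pi_param_info_t dispatch tables; the function name and sample buffer are made up for this sketch and are not part of the patch.

#include <stdint.h>
#include <stdio.h>

/* Dump a (pi, pl, pv) parameter list; illustrative only */
static int dump_params(const uint8_t *buf, int len)
{
	int n = 0;

	/* Each parameter needs at least pi and pl, i.e. two bytes */
	while (len - n > 2) {
		uint8_t pi = buf[n];
		uint8_t pl = buf[n + 1];

		if (n + 2 + pl > len)
			return -1;	/* truncated parameter */

		printf("pi=%#x pl=%u pv=", (unsigned) pi, (unsigned) pl);
		for (int i = 0; i < pl; i++)
			printf("%02x", buf[n + 2 + i]);
		printf("\n");

		n += 2 + pl;	/* advance past pi, pl and the value */
	}
	return n;
}

int main(void)
{
	/* MaxSduSize = 0x0800 (2048), encoded big-endian as in the TTP code */
	const uint8_t params[] = { 0x01, 0x02, 0x08, 0x00 };

	dump_params(params, sizeof(params));
	return 0;
}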
Returns the number of bytes that was parsed + * + */ +static int irda_param_extract(void *self, __u8 *buf, int len, + pi_param_info_t *info) +{ + pi_minor_info_t *pi_minor_info; + __u8 pi_minor; + __u8 pi_major; + int type; + int ret = -1; + int n = 0; + + IRDA_ASSERT(buf != NULL, return ret;); + IRDA_ASSERT(info != NULL, return ret;); + + pi_minor = buf[n] & info->pi_mask; + pi_major = buf[n] >> info->pi_major_offset; + + /* Check if the identifier value (pi) is valid */ + if ((pi_major > info->len-1) || + (pi_minor > info->tables[pi_major].len-1)) + { + IRDA_DEBUG(0, "%s(), no handler for parameter=0x%02x\n", + __func__, buf[0]); + + /* Skip this parameter */ + return 2 + buf[n + 1]; /* Continue */ + } + + /* Lookup the info on how to parse this parameter */ + pi_minor_info = &info->tables[pi_major].pi_minor_call_table[pi_minor]; + + /* Find expected data type for this parameter identifier (pi)*/ + type = pi_minor_info->type; + + IRDA_DEBUG(3, "%s(), pi=[%d,%d], type=%d\n", __func__, + pi_major, pi_minor, type); + + /* Check if handler has been implemented */ + if (!pi_minor_info->func) { + IRDA_MESSAGE("%s: no handler for pi=%#x\n", + __func__, buf[n]); + /* Skip this parameter */ + return 2 + buf[n + 1]; /* Continue */ + } + + /* Parse parameter value */ + ret = (*pv_extract_table[type & PV_MASK])(self, buf+n, len, buf[n], + type, pi_minor_info->func); + return ret; +} + +/* + * Function irda_param_extract_all (self, buf, len, info) + * + * Parse all parameters. If len is correct, then everything should be + * safe. Returns the number of bytes that was parsed + * + */ +int irda_param_extract_all(void *self, __u8 *buf, int len, + pi_param_info_t *info) +{ + int ret = -1; + int n = 0; + + IRDA_ASSERT(buf != NULL, return ret;); + IRDA_ASSERT(info != NULL, return ret;); + + /* + * Parse all parameters. Each parameter must be at least two bytes + * long or else there is no point in trying to parse it + */ + while (len > 2) { + ret = irda_param_extract(self, buf+n, len, info); + if (ret < 0) + return ret; + + n += ret; + len -= ret; + } + return n; +} +EXPORT_SYMBOL(irda_param_extract_all); diff --git a/net/irda/qos.c b/net/irda/qos.c new file mode 100644 index 00000000..1b51bcf4 --- /dev/null +++ b/net/irda/qos.c @@ -0,0 +1,774 @@ +/********************************************************************* + * + * Filename: qos.c + * Version: 1.0 + * Description: IrLAP QoS parameter negotiation + * Status: Stable + * Author: Dag Brattli + * Created at: Tue Sep 9 00:00:26 1997 + * Modified at: Sun Jan 30 14:29:16 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-2000 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2001 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include + +#include +#include +#include +#include +#include + +/* + * Maximum values of the baud rate we negotiate with the other end. + * Most often, you don't have to change that, because Linux-IrDA will + * use the maximum offered by the link layer, which usually works fine. + * In some very rare cases, you may want to limit it to lower speeds... + */ +int sysctl_max_baud_rate = 16000000; +/* + * Maximum value of the lap disconnect timer we negotiate with the other end. + * Most often, the value below represent the best compromise, but some user + * may want to keep the LAP alive longer or shorter in case of link failure. + * Remember that the threshold time (early warning) is fixed to 3s... + */ +int sysctl_max_noreply_time = 12; +/* + * Minimum turn time to be applied before transmitting to the peer. + * Nonzero values (usec) are used as lower limit to the per-connection + * mtt value which was announced by the other end during negotiation. + * Might be helpful if the peer device provides too short mtt. + * Default is 10us which means using the unmodified value given by the + * peer except if it's 0 (0 is likely a bug in the other stack). + */ +unsigned sysctl_min_tx_turn_time = 10; +/* + * Maximum data size to be used in transmission in payload of LAP frame. + * There is a bit of confusion in the IrDA spec : + * The LAP spec defines the payload of a LAP frame (I field) to be + * 2048 bytes max (IrLAP 1.1, chapt 6.6.5, p40). + * On the other hand, the PHY mention frames of 2048 bytes max (IrPHY + * 1.2, chapt 5.3.2.1, p41). But, this number includes the LAP header + * (2 bytes), and CRC (32 bits at 4 Mb/s). So, for the I field (LAP + * payload), that's only 2042 bytes. Oups ! + * My nsc-ircc hardware has troubles receiving 2048 bytes frames at 4 Mb/s, + * so adjust to 2042... I don't know if this bug applies only for 2048 + * bytes frames or all negotiated frame sizes, but you can use the sysctl + * to play with this value anyway. + * Jean II */ +unsigned sysctl_max_tx_data_size = 2042; +/* + * Maximum transmit window, i.e. number of LAP frames between turn-around. + * This allow to override what the peer told us. Some peers are buggy and + * don't always support what they tell us. 
+ * Jean II */ +unsigned sysctl_max_tx_window = 7; + +static int irlap_param_baud_rate(void *instance, irda_param_t *param, int get); +static int irlap_param_link_disconnect(void *instance, irda_param_t *parm, + int get); +static int irlap_param_max_turn_time(void *instance, irda_param_t *param, + int get); +static int irlap_param_data_size(void *instance, irda_param_t *param, int get); +static int irlap_param_window_size(void *instance, irda_param_t *param, + int get); +static int irlap_param_additional_bofs(void *instance, irda_param_t *parm, + int get); +static int irlap_param_min_turn_time(void *instance, irda_param_t *param, + int get); + +#ifndef CONFIG_IRDA_DYNAMIC_WINDOW +static __u32 irlap_requested_line_capacity(struct qos_info *qos); +#endif + +static __u32 min_turn_times[] = { 10000, 5000, 1000, 500, 100, 50, 10, 0 }; /* us */ +static __u32 baud_rates[] = { 2400, 9600, 19200, 38400, 57600, 115200, 576000, + 1152000, 4000000, 16000000 }; /* bps */ +static __u32 data_sizes[] = { 64, 128, 256, 512, 1024, 2048 }; /* bytes */ +static __u32 add_bofs[] = { 48, 24, 12, 5, 3, 2, 1, 0 }; /* bytes */ +static __u32 max_turn_times[] = { 500, 250, 100, 50 }; /* ms */ +static __u32 link_disc_times[] = { 3, 8, 12, 16, 20, 25, 30, 40 }; /* secs */ + +static __u32 max_line_capacities[10][4] = { + /* 500 ms 250 ms 100 ms 50 ms (max turn time) */ + { 100, 0, 0, 0 }, /* 2400 bps */ + { 400, 0, 0, 0 }, /* 9600 bps */ + { 800, 0, 0, 0 }, /* 19200 bps */ + { 1600, 0, 0, 0 }, /* 38400 bps */ + { 2360, 0, 0, 0 }, /* 57600 bps */ + { 4800, 2400, 960, 480 }, /* 115200 bps */ + { 28800, 11520, 5760, 2880 }, /* 576000 bps */ + { 57600, 28800, 11520, 5760 }, /* 1152000 bps */ + { 200000, 100000, 40000, 20000 }, /* 4000000 bps */ + { 800000, 400000, 160000, 80000 }, /* 16000000 bps */ +}; + +static pi_minor_info_t pi_minor_call_table_type_0[] = { + { NULL, 0 }, +/* 01 */{ irlap_param_baud_rate, PV_INTEGER | PV_LITTLE_ENDIAN }, + { NULL, 0 }, + { NULL, 0 }, + { NULL, 0 }, + { NULL, 0 }, + { NULL, 0 }, + { NULL, 0 }, +/* 08 */{ irlap_param_link_disconnect, PV_INT_8_BITS } +}; + +static pi_minor_info_t pi_minor_call_table_type_1[] = { + { NULL, 0 }, + { NULL, 0 }, +/* 82 */{ irlap_param_max_turn_time, PV_INT_8_BITS }, +/* 83 */{ irlap_param_data_size, PV_INT_8_BITS }, +/* 84 */{ irlap_param_window_size, PV_INT_8_BITS }, +/* 85 */{ irlap_param_additional_bofs, PV_INT_8_BITS }, +/* 86 */{ irlap_param_min_turn_time, PV_INT_8_BITS }, +}; + +static pi_major_info_t pi_major_call_table[] = { + { pi_minor_call_table_type_0, 9 }, + { pi_minor_call_table_type_1, 7 }, +}; + +static pi_param_info_t irlap_param_info = { pi_major_call_table, 2, 0x7f, 7 }; + +/* ---------------------- LOCAL SUBROUTINES ---------------------- */ +/* Note : we start with a bunch of local subroutines. + * As the compiler is "one pass", this is the only way to get them to + * inline properly... + * Jean II + */ +/* + * Function value_index (value, array, size) + * + * Returns the index to the value in the specified array + */ +static inline int value_index(__u32 value, __u32 *array, int size) +{ + int i; + + for (i=0; i < size; i++) + if (array[i] == value) + break; + return i; +} + +/* + * Function index_value (index, array) + * + * Returns value to index in array, easy! 
+ * + */ +static inline __u32 index_value(int index, __u32 *array) +{ + return array[index]; +} + +/* + * Function msb_index (word) + * + * Returns index to most significant bit (MSB) in word + * + */ +static int msb_index (__u16 word) +{ + __u16 msb = 0x8000; + int index = 15; /* Current MSB */ + + /* Check for buggy peers. + * Note : there is a small probability that it could be us, but I + * would expect driver authors to catch that pretty early and be + * able to check precisely what's going on. If a end user sees this, + * it's very likely the peer. - Jean II */ + if (word == 0) { + IRDA_WARNING("%s(), Detected buggy peer, adjust null PV to 0x1!\n", + __func__); + /* The only safe choice (we don't know the array size) */ + word = 0x1; + } + + while (msb) { + if (word & msb) + break; /* Found it! */ + msb >>=1; + index--; + } + return index; +} + +/* + * Function value_lower_bits (value, array) + * + * Returns a bit field marking all possibility lower than value. + */ +static inline int value_lower_bits(__u32 value, __u32 *array, int size, __u16 *field) +{ + int i; + __u16 mask = 0x1; + __u16 result = 0x0; + + for (i=0; i < size; i++) { + /* Add the current value to the bit field, shift mask */ + result |= mask; + mask <<= 1; + /* Finished ? */ + if (array[i] >= value) + break; + } + /* Send back a valid index */ + if(i >= size) + i = size - 1; /* Last item */ + *field = result; + return i; +} + +/* + * Function value_highest_bit (value, array) + * + * Returns a bit field marking the highest possibility lower than value. + */ +static inline int value_highest_bit(__u32 value, __u32 *array, int size, __u16 *field) +{ + int i; + __u16 mask = 0x1; + __u16 result = 0x0; + + for (i=0; i < size; i++) { + /* Finished ? */ + if (array[i] <= value) + break; + /* Shift mask */ + mask <<= 1; + } + /* Set the current value to the bit field */ + result |= mask; + /* Send back a valid index */ + if(i >= size) + i = size - 1; /* Last item */ + *field = result; + return i; +} + +/* -------------------------- MAIN CALLS -------------------------- */ + +/* + * Function irda_qos_compute_intersection (qos, new) + * + * Compute the intersection of the old QoS capabilities with new ones + * + */ +void irda_qos_compute_intersection(struct qos_info *qos, struct qos_info *new) +{ + IRDA_ASSERT(qos != NULL, return;); + IRDA_ASSERT(new != NULL, return;); + + /* Apply */ + qos->baud_rate.bits &= new->baud_rate.bits; + qos->window_size.bits &= new->window_size.bits; + qos->min_turn_time.bits &= new->min_turn_time.bits; + qos->max_turn_time.bits &= new->max_turn_time.bits; + qos->data_size.bits &= new->data_size.bits; + qos->link_disc_time.bits &= new->link_disc_time.bits; + qos->additional_bofs.bits &= new->additional_bofs.bits; + + irda_qos_bits_to_value(qos); +} + +/* + * Function irda_init_max_qos_capabilies (qos) + * + * The purpose of this function is for layers and drivers to be able to + * set the maximum QoS possible and then "and in" their own limitations + * + */ +void irda_init_max_qos_capabilies(struct qos_info *qos) +{ + int i; + /* + * These are the maximum supported values as specified on pages + * 39-43 in IrLAP + */ + + /* Use sysctl to set some configurable values... 
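Each QoS capability is carried on the wire as a bit field in which every set bit marks one entry of a fixed table (baud_rates[], data_sizes[], and so on): the two peers AND their fields together and the highest surviving bit wins, which is what msb_index() above and irda_qos_bits_to_value() later in this file implement. A minimal standalone sketch of that mapping follows; it is illustrative only (not the kernel code) and reuses the baud_rates table quoted above.

#include <stdint.h>
#include <stdio.h>

static const uint32_t baud_rates[] = {
	2400, 9600, 19200, 38400, 57600, 115200,
	576000, 1152000, 4000000, 16000000
};

/* Index of the most significant set bit; caller must not pass 0 */
static int msb_index(uint16_t word)
{
	uint16_t msb = 0x8000;
	int index = 15;

	while (msb && !(word & msb)) {
		msb >>= 1;
		index--;
	}
	return index;
}

int main(void)
{
	uint16_t ours = 0x03ff;		/* every rate up to 16 Mb/s */
	uint16_t theirs = 0x003f;	/* peer stops at 115200 bps */
	uint16_t common = ours & theirs;	/* as in irda_qos_compute_intersection() */

	printf("negotiated rate: %u bps\n",
	       (unsigned) baud_rates[msb_index(common)]);	/* 115200 */
	return 0;
}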
*/ + /* Set configured max speed */ + i = value_lower_bits(sysctl_max_baud_rate, baud_rates, 10, + &qos->baud_rate.bits); + sysctl_max_baud_rate = index_value(i, baud_rates); + + /* Set configured max disc time */ + i = value_lower_bits(sysctl_max_noreply_time, link_disc_times, 8, + &qos->link_disc_time.bits); + sysctl_max_noreply_time = index_value(i, link_disc_times); + + /* LSB is first byte, MSB is second byte */ + qos->baud_rate.bits &= 0x03ff; + + qos->window_size.bits = 0x7f; + qos->min_turn_time.bits = 0xff; + qos->max_turn_time.bits = 0x0f; + qos->data_size.bits = 0x3f; + qos->link_disc_time.bits &= 0xff; + qos->additional_bofs.bits = 0xff; +} +EXPORT_SYMBOL(irda_init_max_qos_capabilies); + +/* + * Function irlap_adjust_qos_settings (qos) + * + * Adjust QoS settings in case some values are not possible to use because + * of other settings + */ +static void irlap_adjust_qos_settings(struct qos_info *qos) +{ + __u32 line_capacity; + int index; + + IRDA_DEBUG(2, "%s()\n", __func__); + + /* + * Make sure the mintt is sensible. + * Main culprit : Ericsson T39. - Jean II + */ + if (sysctl_min_tx_turn_time > qos->min_turn_time.value) { + int i; + + IRDA_WARNING("%s(), Detected buggy peer, adjust mtt to %dus!\n", + __func__, sysctl_min_tx_turn_time); + + /* We don't really need bits, but easier this way */ + i = value_highest_bit(sysctl_min_tx_turn_time, min_turn_times, + 8, &qos->min_turn_time.bits); + sysctl_min_tx_turn_time = index_value(i, min_turn_times); + qos->min_turn_time.value = sysctl_min_tx_turn_time; + } + + /* + * Not allowed to use a max turn time less than 500 ms if the baudrate + * is less than 115200 + */ + if ((qos->baud_rate.value < 115200) && + (qos->max_turn_time.value < 500)) + { + IRDA_DEBUG(0, + "%s(), adjusting max turn time from %d to 500 ms\n", + __func__, qos->max_turn_time.value); + qos->max_turn_time.value = 500; + } + + /* + * The data size must be adjusted according to the baud rate and max + * turn time + */ + index = value_index(qos->data_size.value, data_sizes, 6); + line_capacity = irlap_max_line_capacity(qos->baud_rate.value, + qos->max_turn_time.value); + +#ifdef CONFIG_IRDA_DYNAMIC_WINDOW + while ((qos->data_size.value > line_capacity) && (index > 0)) { + qos->data_size.value = data_sizes[index--]; + IRDA_DEBUG(2, "%s(), reducing data size to %d\n", + __func__, qos->data_size.value); + } +#else /* Use method described in section 6.6.11 of IrLAP */ + while (irlap_requested_line_capacity(qos) > line_capacity) { + IRDA_ASSERT(index != 0, return;); + + /* Must be able to send at least one frame */ + if (qos->window_size.value > 1) { + qos->window_size.value--; + IRDA_DEBUG(2, "%s(), reducing window size to %d\n", + __func__, qos->window_size.value); + } else if (index > 1) { + qos->data_size.value = data_sizes[index--]; + IRDA_DEBUG(2, "%s(), reducing data size to %d\n", + __func__, qos->data_size.value); + } else { + IRDA_WARNING("%s(), nothing more we can do!\n", + __func__); + } + } +#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ + /* + * Fix tx data size according to user limits - Jean II + */ + if (qos->data_size.value > sysctl_max_tx_data_size) + /* Allow non discrete adjustement to avoid losing capacity */ + qos->data_size.value = sysctl_max_tx_data_size; + /* + * Override Tx window if user request it. 
- Jean II + */ + if (qos->window_size.value > sysctl_max_tx_window) + qos->window_size.value = sysctl_max_tx_window; +} + +/* + * Function irlap_negotiate (qos_device, qos_session, skb) + * + * Negotiate QoS values, not really that much negotiation :-) + * We just set the QoS capabilities for the peer station + * + */ +int irlap_qos_negotiate(struct irlap_cb *self, struct sk_buff *skb) +{ + int ret; + + ret = irda_param_extract_all(self, skb->data, skb->len, + &irlap_param_info); + + /* Convert the negotiated bits to values */ + irda_qos_bits_to_value(&self->qos_tx); + irda_qos_bits_to_value(&self->qos_rx); + + irlap_adjust_qos_settings(&self->qos_tx); + + IRDA_DEBUG(2, "Setting BAUD_RATE to %d bps.\n", + self->qos_tx.baud_rate.value); + IRDA_DEBUG(2, "Setting DATA_SIZE to %d bytes\n", + self->qos_tx.data_size.value); + IRDA_DEBUG(2, "Setting WINDOW_SIZE to %d\n", + self->qos_tx.window_size.value); + IRDA_DEBUG(2, "Setting XBOFS to %d\n", + self->qos_tx.additional_bofs.value); + IRDA_DEBUG(2, "Setting MAX_TURN_TIME to %d ms.\n", + self->qos_tx.max_turn_time.value); + IRDA_DEBUG(2, "Setting MIN_TURN_TIME to %d usecs.\n", + self->qos_tx.min_turn_time.value); + IRDA_DEBUG(2, "Setting LINK_DISC to %d secs.\n", + self->qos_tx.link_disc_time.value); + return ret; +} + +/* + * Function irlap_insert_negotiation_params (qos, fp) + * + * Insert QoS negotiaion pararameters into frame + * + */ +int irlap_insert_qos_negotiation_params(struct irlap_cb *self, + struct sk_buff *skb) +{ + int ret; + + /* Insert data rate */ + ret = irda_param_insert(self, PI_BAUD_RATE, skb_tail_pointer(skb), + skb_tailroom(skb), &irlap_param_info); + if (ret < 0) + return ret; + skb_put(skb, ret); + + /* Insert max turnaround time */ + ret = irda_param_insert(self, PI_MAX_TURN_TIME, skb_tail_pointer(skb), + skb_tailroom(skb), &irlap_param_info); + if (ret < 0) + return ret; + skb_put(skb, ret); + + /* Insert data size */ + ret = irda_param_insert(self, PI_DATA_SIZE, skb_tail_pointer(skb), + skb_tailroom(skb), &irlap_param_info); + if (ret < 0) + return ret; + skb_put(skb, ret); + + /* Insert window size */ + ret = irda_param_insert(self, PI_WINDOW_SIZE, skb_tail_pointer(skb), + skb_tailroom(skb), &irlap_param_info); + if (ret < 0) + return ret; + skb_put(skb, ret); + + /* Insert additional BOFs */ + ret = irda_param_insert(self, PI_ADD_BOFS, skb_tail_pointer(skb), + skb_tailroom(skb), &irlap_param_info); + if (ret < 0) + return ret; + skb_put(skb, ret); + + /* Insert minimum turnaround time */ + ret = irda_param_insert(self, PI_MIN_TURN_TIME, skb_tail_pointer(skb), + skb_tailroom(skb), &irlap_param_info); + if (ret < 0) + return ret; + skb_put(skb, ret); + + /* Insert link disconnect/threshold time */ + ret = irda_param_insert(self, PI_LINK_DISC, skb_tail_pointer(skb), + skb_tailroom(skb), &irlap_param_info); + if (ret < 0) + return ret; + skb_put(skb, ret); + + return 0; +} + +/* + * Function irlap_param_baud_rate (instance, param, get) + * + * Negotiate data-rate + * + */ +static int irlap_param_baud_rate(void *instance, irda_param_t *param, int get) +{ + __u16 final; + + struct irlap_cb *self = (struct irlap_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + if (get) { + param->pv.i = self->qos_rx.baud_rate.bits; + IRDA_DEBUG(2, "%s(), baud rate = 0x%02x\n", + __func__, param->pv.i); + } else { + /* + * Stations must agree on baud rate, so calculate + * intersection + */ + IRDA_DEBUG(2, "Requested BAUD_RATE: 0x%04x\n", (__u16) param->pv.i); + final = 
(__u16) param->pv.i & self->qos_rx.baud_rate.bits; + + IRDA_DEBUG(2, "Final BAUD_RATE: 0x%04x\n", final); + self->qos_tx.baud_rate.bits = final; + self->qos_rx.baud_rate.bits = final; + } + + return 0; +} + +/* + * Function irlap_param_link_disconnect (instance, param, get) + * + * Negotiate link disconnect/threshold time. + * + */ +static int irlap_param_link_disconnect(void *instance, irda_param_t *param, + int get) +{ + __u16 final; + + struct irlap_cb *self = (struct irlap_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + if (get) + param->pv.i = self->qos_rx.link_disc_time.bits; + else { + /* + * Stations must agree on link disconnect/threshold + * time. + */ + IRDA_DEBUG(2, "LINK_DISC: %02x\n", (__u8) param->pv.i); + final = (__u8) param->pv.i & self->qos_rx.link_disc_time.bits; + + IRDA_DEBUG(2, "Final LINK_DISC: %02x\n", final); + self->qos_tx.link_disc_time.bits = final; + self->qos_rx.link_disc_time.bits = final; + } + return 0; +} + +/* + * Function irlap_param_max_turn_time (instance, param, get) + * + * Negotiate the maximum turnaround time. This is a type 1 parameter and + * will be negotiated independently for each station + * + */ +static int irlap_param_max_turn_time(void *instance, irda_param_t *param, + int get) +{ + struct irlap_cb *self = (struct irlap_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + if (get) + param->pv.i = self->qos_rx.max_turn_time.bits; + else + self->qos_tx.max_turn_time.bits = (__u8) param->pv.i; + + return 0; +} + +/* + * Function irlap_param_data_size (instance, param, get) + * + * Negotiate the data size. This is a type 1 parameter and + * will be negotiated independently for each station + * + */ +static int irlap_param_data_size(void *instance, irda_param_t *param, int get) +{ + struct irlap_cb *self = (struct irlap_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + if (get) + param->pv.i = self->qos_rx.data_size.bits; + else + self->qos_tx.data_size.bits = (__u8) param->pv.i; + + return 0; +} + +/* + * Function irlap_param_window_size (instance, param, get) + * + * Negotiate the window size. This is a type 1 parameter and + * will be negotiated independently for each station + * + */ +static int irlap_param_window_size(void *instance, irda_param_t *param, + int get) +{ + struct irlap_cb *self = (struct irlap_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + if (get) + param->pv.i = self->qos_rx.window_size.bits; + else + self->qos_tx.window_size.bits = (__u8) param->pv.i; + + return 0; +} + +/* + * Function irlap_param_additional_bofs (instance, param, get) + * + * Negotiate additional BOF characters. This is a type 1 parameter and + * will be negotiated independently for each station. + */ +static int irlap_param_additional_bofs(void *instance, irda_param_t *param, int get) +{ + struct irlap_cb *self = (struct irlap_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + if (get) + param->pv.i = self->qos_rx.additional_bofs.bits; + else + self->qos_tx.additional_bofs.bits = (__u8) param->pv.i; + + return 0; +} + +/* + * Function irlap_param_min_turn_time (instance, param, get) + * + * Negotiate the minimum turn around time. 
This is a type 1 parameter and + * will be negotiated independently for each station + */ +static int irlap_param_min_turn_time(void *instance, irda_param_t *param, + int get) +{ + struct irlap_cb *self = (struct irlap_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + if (get) + param->pv.i = self->qos_rx.min_turn_time.bits; + else + self->qos_tx.min_turn_time.bits = (__u8) param->pv.i; + + return 0; +} + +/* + * Function irlap_max_line_capacity (speed, max_turn_time, min_turn_time) + * + * Calculate the maximum line capacity + * + */ +__u32 irlap_max_line_capacity(__u32 speed, __u32 max_turn_time) +{ + __u32 line_capacity; + int i,j; + + IRDA_DEBUG(2, "%s(), speed=%d, max_turn_time=%d\n", + __func__, speed, max_turn_time); + + i = value_index(speed, baud_rates, 10); + j = value_index(max_turn_time, max_turn_times, 4); + + IRDA_ASSERT(((i >=0) && (i <10)), return 0;); + IRDA_ASSERT(((j >=0) && (j <4)), return 0;); + + line_capacity = max_line_capacities[i][j]; + + IRDA_DEBUG(2, "%s(), line capacity=%d bytes\n", + __func__, line_capacity); + + return line_capacity; +} + +#ifndef CONFIG_IRDA_DYNAMIC_WINDOW +static __u32 irlap_requested_line_capacity(struct qos_info *qos) +{ + __u32 line_capacity; + + line_capacity = qos->window_size.value * + (qos->data_size.value + 6 + qos->additional_bofs.value) + + irlap_min_turn_time_in_bytes(qos->baud_rate.value, + qos->min_turn_time.value); + + IRDA_DEBUG(2, "%s(), requested line capacity=%d\n", + __func__, line_capacity); + + return line_capacity; +} +#endif + +void irda_qos_bits_to_value(struct qos_info *qos) +{ + int index; + + IRDA_ASSERT(qos != NULL, return;); + + index = msb_index(qos->baud_rate.bits); + qos->baud_rate.value = baud_rates[index]; + + index = msb_index(qos->data_size.bits); + qos->data_size.value = data_sizes[index]; + + index = msb_index(qos->window_size.bits); + qos->window_size.value = index+1; + + index = msb_index(qos->min_turn_time.bits); + qos->min_turn_time.value = min_turn_times[index]; + + index = msb_index(qos->max_turn_time.bits); + qos->max_turn_time.value = max_turn_times[index]; + + index = msb_index(qos->link_disc_time.bits); + qos->link_disc_time.value = link_disc_times[index]; + + index = msb_index(qos->additional_bofs.bits); + qos->additional_bofs.value = add_bofs[index]; +} +EXPORT_SYMBOL(irda_qos_bits_to_value); diff --git a/net/irda/timer.c b/net/irda/timer.c new file mode 100644 index 00000000..f418cb2a --- /dev/null +++ b/net/irda/timer.c @@ -0,0 +1,232 @@ +/********************************************************************* + * + * Filename: timer.c + * Version: + * Description: + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sat Aug 16 00:59:29 1997 + * Modified at: Wed Dec 8 12:50:34 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1997, 1999 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2002 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
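irda_qos_bits_to_value() above turns each negotiated bit field back into a concrete value by taking the most significant set bit and using its position as an index into the parameter's value table (window size is simply index + 1). The sketch below models that step together with the type-0 intersection used for the baud rate; msb_index() is written out here as an assumption about its behaviour, and the rate table holds the standard IrLAP values, which are not shown in this hunk.

/* Minimal model of the bits-to-value step: the negotiated bit field keeps
 * only capabilities both stations support, and the highest set bit selects
 * the entry in the parameter's value table. */
#include <stdio.h>

/* Assumed behaviour of msb_index(): position of the most significant set bit. */
static int msb_index(unsigned short bits)
{
	int index = -1;

	while (bits) {
		bits >>= 1;
		index++;
	}
	return index;
}

int main(void)
{
	/* Standard IrLAP baud-rate table, one value per PI_BAUD_RATE bit
	 * (assumed here; the real table lives earlier in qos.c). */
	static const unsigned int baud_rates[10] = {
		2400, 9600, 19200, 38400, 57600, 115200,
		576000, 1152000, 4000000, 16000000
	};
	unsigned short ours = 0x03ff;		/* we advertise every rate */
	unsigned short theirs = 0x003f;		/* peer only does SIR rates */
	unsigned short common = ours & theirs;	/* type-0 parameter: intersection */

	printf("negotiated rate: %u bps\n", baud_rates[msb_index(common)]);
	return 0;
}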
+ * + ********************************************************************/ + +#include +#include + +#include +#include +#include +#include +#include + +extern int sysctl_slot_timeout; + +static void irlap_slot_timer_expired(void* data); +static void irlap_query_timer_expired(void* data); +static void irlap_final_timer_expired(void* data); +static void irlap_wd_timer_expired(void* data); +static void irlap_backoff_timer_expired(void* data); +static void irlap_media_busy_expired(void* data); + +void irlap_start_slot_timer(struct irlap_cb *self, int timeout) +{ + irda_start_timer(&self->slot_timer, timeout, (void *) self, + irlap_slot_timer_expired); +} + +void irlap_start_query_timer(struct irlap_cb *self, int S, int s) +{ + int timeout; + + /* Calculate when the peer discovery should end. Normally, we + * get the end-of-discovery frame, so this is just in case + * we miss it. + * Basically, we multiply the number of remaining slots by our + * slot time, plus add some extra time to properly receive the last + * discovery packet (which is longer due to extra discovery info), + * to avoid messing with for incomming connections requests and + * to accommodate devices that perform discovery slower than us. + * Jean II */ + timeout = ((sysctl_slot_timeout * HZ / 1000) * (S - s) + + XIDEXTRA_TIMEOUT + SMALLBUSY_TIMEOUT); + + /* Set or re-set the timer. We reset the timer for each received + * discovery query, which allow us to automatically adjust to + * the speed of the peer discovery (faster or slower). Jean II */ + irda_start_timer( &self->query_timer, timeout, (void *) self, + irlap_query_timer_expired); +} + +void irlap_start_final_timer(struct irlap_cb *self, int timeout) +{ + irda_start_timer(&self->final_timer, timeout, (void *) self, + irlap_final_timer_expired); +} + +void irlap_start_wd_timer(struct irlap_cb *self, int timeout) +{ + irda_start_timer(&self->wd_timer, timeout, (void *) self, + irlap_wd_timer_expired); +} + +void irlap_start_backoff_timer(struct irlap_cb *self, int timeout) +{ + irda_start_timer(&self->backoff_timer, timeout, (void *) self, + irlap_backoff_timer_expired); +} + +void irlap_start_mbusy_timer(struct irlap_cb *self, int timeout) +{ + irda_start_timer(&self->media_busy_timer, timeout, + (void *) self, irlap_media_busy_expired); +} + +void irlap_stop_mbusy_timer(struct irlap_cb *self) +{ + /* If timer is activated, kill it! */ + del_timer(&self->media_busy_timer); + + /* If we are in NDM, there is a bunch of events in LAP that + * that be pending due to the media_busy condition, such as + * CONNECT_REQUEST and SEND_UI_FRAME. If we don't generate + * an event, they will wait forever... + * Jean II */ + if (self->state == LAP_NDM) + irlap_do_event(self, MEDIA_BUSY_TIMER_EXPIRED, NULL, NULL); +} + +void irlmp_start_watchdog_timer(struct lsap_cb *self, int timeout) +{ + irda_start_timer(&self->watchdog_timer, timeout, (void *) self, + irlmp_watchdog_timer_expired); +} + +void irlmp_start_discovery_timer(struct irlmp_cb *self, int timeout) +{ + irda_start_timer(&self->discovery_timer, timeout, (void *) self, + irlmp_discovery_timer_expired); +} + +void irlmp_start_idle_timer(struct lap_cb *self, int timeout) +{ + irda_start_timer(&self->idle_timer, timeout, (void *) self, + irlmp_idle_timer_expired); +} + +void irlmp_stop_idle_timer(struct lap_cb *self) +{ + /* If timer is activated, kill it! 
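The comment in irlap_start_query_timer() above explains the timeout as the remaining discovery slots converted to jiffies plus two fudge terms for the final XID frame and a busy medium. Here is a small worked example of that arithmetic on a 100 Hz host; the 90 ms slot time and the two extra terms are illustrative placeholders, since XIDEXTRA_TIMEOUT and SMALLBUSY_TIMEOUT are defined in the IrLAP headers rather than in this hunk.

/* Worked example of the query-timer calculation above.
 * XIDEXTRA_TIMEOUT and SMALLBUSY_TIMEOUT are placeholders here; the real
 * values come from the IrLAP headers. */
#include <stdio.h>

#define HZ			100
#define XIDEXTRA_TIMEOUT	(HZ / 10)	/* assumed: room for the last XID frame */
#define SMALLBUSY_TIMEOUT	(HZ / 10)	/* assumed: slack for a busy medium */

int main(void)
{
	int sysctl_slot_timeout = 90;	/* ms per discovery slot, for illustration */
	int S = 8, s = 2;		/* 8 slots in total, currently in slot 2 */
	int timeout;

	/* Remaining slots converted to jiffies, plus the two fudge terms. */
	timeout = (sysctl_slot_timeout * HZ / 1000) * (S - s)
		  + XIDEXTRA_TIMEOUT + SMALLBUSY_TIMEOUT;

	printf("query timer: %d jiffies (~%d ms)\n", timeout, timeout * 1000 / HZ);
	return 0;
}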
*/ + del_timer(&self->idle_timer); +} + +/* + * Function irlap_slot_timer_expired (data) + * + * IrLAP slot timer has expired + * + */ +static void irlap_slot_timer_expired(void *data) +{ + struct irlap_cb *self = (struct irlap_cb *) data; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + irlap_do_event(self, SLOT_TIMER_EXPIRED, NULL, NULL); +} + +/* + * Function irlap_query_timer_expired (data) + * + * IrLAP query timer has expired + * + */ +static void irlap_query_timer_expired(void *data) +{ + struct irlap_cb *self = (struct irlap_cb *) data; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + irlap_do_event(self, QUERY_TIMER_EXPIRED, NULL, NULL); +} + +/* + * Function irda_final_timer_expired (data) + * + * + * + */ +static void irlap_final_timer_expired(void *data) +{ + struct irlap_cb *self = (struct irlap_cb *) data; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + irlap_do_event(self, FINAL_TIMER_EXPIRED, NULL, NULL); +} + +/* + * Function irda_wd_timer_expired (data) + * + * + * + */ +static void irlap_wd_timer_expired(void *data) +{ + struct irlap_cb *self = (struct irlap_cb *) data; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + irlap_do_event(self, WD_TIMER_EXPIRED, NULL, NULL); +} + +/* + * Function irda_backoff_timer_expired (data) + * + * + * + */ +static void irlap_backoff_timer_expired(void *data) +{ + struct irlap_cb *self = (struct irlap_cb *) data; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + irlap_do_event(self, BACKOFF_TIMER_EXPIRED, NULL, NULL); +} + + +/* + * Function irtty_media_busy_expired (data) + * + * + */ +static void irlap_media_busy_expired(void *data) +{ + struct irlap_cb *self = (struct irlap_cb *) data; + + IRDA_ASSERT(self != NULL, return;); + + irda_device_set_media_busy(self->netdev, FALSE); + /* Note : the LAP event will be send in irlap_stop_mbusy_timer(), + * to catch other cases where the flag is cleared (for example + * after a discovery) - Jean II */ +} diff --git a/net/irda/wrapper.c b/net/irda/wrapper.c new file mode 100644 index 00000000..fd0995b1 --- /dev/null +++ b/net/irda/wrapper.c @@ -0,0 +1,492 @@ +/********************************************************************* + * + * Filename: wrapper.c + * Version: 1.2 + * Description: IrDA SIR async wrapper layer + * Status: Stable + * Author: Dag Brattli + * Created at: Mon Aug 4 20:40:53 1997 + * Modified at: Fri Jan 28 13:21:09 2000 + * Modified by: Dag Brattli + * Modified at: Fri May 28 3:11 CST 1999 + * Modified by: Horst von Brand + * + * Copyright (c) 1998-2000 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2002 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
+ * + ********************************************************************/ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/************************** FRAME WRAPPING **************************/ +/* + * Unwrap and unstuff SIR frames + * + * Note : at FIR and MIR, HDLC framing is used and usually handled + * by the controller, so we come here only for SIR... Jean II + */ + +/* + * Function stuff_byte (byte, buf) + * + * Byte stuff one single byte and put the result in buffer pointed to by + * buf. The buffer must at all times be able to have two bytes inserted. + * + * This is in a tight loop, better inline it, so need to be prior to callers. + * (2000 bytes on P6 200MHz, non-inlined ~370us, inline ~170us) - Jean II + */ +static inline int stuff_byte(__u8 byte, __u8 *buf) +{ + switch (byte) { + case BOF: /* FALLTHROUGH */ + case EOF: /* FALLTHROUGH */ + case CE: + /* Insert transparently coded */ + buf[0] = CE; /* Send link escape */ + buf[1] = byte^IRDA_TRANS; /* Complement bit 5 */ + return 2; + /* break; */ + default: + /* Non-special value, no transparency required */ + buf[0] = byte; + return 1; + /* break; */ + } +} + +/* + * Function async_wrap (skb, *tx_buff, buffsize) + * + * Makes a new buffer with wrapping and stuffing, should check that + * we don't get tx buffer overflow. + */ +int async_wrap_skb(struct sk_buff *skb, __u8 *tx_buff, int buffsize) +{ + struct irda_skb_cb *cb = (struct irda_skb_cb *) skb->cb; + int xbofs; + int i; + int n; + union { + __u16 value; + __u8 bytes[2]; + } fcs; + + /* Initialize variables */ + fcs.value = INIT_FCS; + n = 0; + + /* + * Send XBOF's for required min. turn time and for the negotiated + * additional XBOFS + */ + + if (cb->magic != LAP_MAGIC) { + /* + * This will happen for all frames sent from user-space. + * Nothing to worry about, but we set the default number of + * BOF's + */ + IRDA_DEBUG(1, "%s(), wrong magic in skb!\n", __func__); + xbofs = 10; + } else + xbofs = cb->xbofs + cb->xbofs_delay; + + IRDA_DEBUG(4, "%s(), xbofs=%d\n", __func__, xbofs); + + /* Check that we never use more than 115 + 48 xbofs */ + if (xbofs > 163) { + IRDA_DEBUG(0, "%s(), too many xbofs (%d)\n", __func__, + xbofs); + xbofs = 163; + } + + memset(tx_buff + n, XBOF, xbofs); + n += xbofs; + + /* Start of packet character BOF */ + tx_buff[n++] = BOF; + + /* Insert frame and calc CRC */ + for (i=0; i < skb->len; i++) { + /* + * Check for the possibility of tx buffer overflow. We use + * bufsize-5 since the maximum number of bytes that can be + * transmitted after this point is 5. + */ + if(n >= (buffsize-5)) { + IRDA_ERROR("%s(), tx buffer overflow (n=%d)\n", + __func__, n); + return n; + } + + n += stuff_byte(skb->data[i], tx_buff+n); + fcs.value = irda_fcs(fcs.value, skb->data[i]); + } + + /* Insert CRC in little endian format (LSB first) */ + fcs.value = ~fcs.value; +#ifdef __LITTLE_ENDIAN + n += stuff_byte(fcs.bytes[0], tx_buff+n); + n += stuff_byte(fcs.bytes[1], tx_buff+n); +#else /* ifdef __BIG_ENDIAN */ + n += stuff_byte(fcs.bytes[1], tx_buff+n); + n += stuff_byte(fcs.bytes[0], tx_buff+n); +#endif + tx_buff[n++] = EOF; + + return n; +} +EXPORT_SYMBOL(async_wrap_skb); + +/************************* FRAME UNWRAPPING *************************/ +/* + * Unwrap and unstuff SIR frames + * + * Complete rewrite by Jean II : + * More inline, faster, more compact, more logical. 
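stuff_byte() above escapes the three reserved SIR control characters by emitting CE followed by the byte with bit 5 complemented, and passes every other byte through untouched. The standalone sketch below replays that rule on a tiny payload; the numeric values of BOF, EOF, CE and IRDA_TRANS are assumptions based on the usual IrLAP SIR encoding, since the real definitions live in the IrDA wrapper headers, not in this hunk.

/* Standalone version of the SIR byte-stuffing rule used by stuff_byte().
 * BOF/EOF/CE and IRDA_TRANS values are assumed here (see the IrDA headers). */
#include <stdio.h>
#include <stdint.h>

#define BOF		0xc0	/* begin of frame */
#define EOF_BYTE	0xc1	/* end of frame (EOF clashes with <stdio.h>) */
#define CE		0x7d	/* control escape */
#define IRDA_TRANS	0x20	/* complement bit 5 of the escaped byte */

static int stuff_byte(uint8_t byte, uint8_t *buf)
{
	switch (byte) {
	case BOF:
	case EOF_BYTE:
	case CE:
		buf[0] = CE;			/* send link escape */
		buf[1] = byte ^ IRDA_TRANS;	/* complement bit 5 */
		return 2;
	default:
		buf[0] = byte;			/* no transparency required */
		return 1;
	}
}

int main(void)
{
	const uint8_t payload[] = { 0x01, 0xc0, 0x7d, 0x42 };
	uint8_t out[2 * sizeof(payload)];	/* worst case: every byte escaped */
	int n = 0;

	for (unsigned i = 0; i < sizeof(payload); i++)
		n += stuff_byte(payload[i], out + n);

	for (int i = 0; i < n; i++)
		printf("%02x ", out[i]);
	printf("\n");	/* prints: 01 7d e0 7d 5d 42 */
	return 0;
}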
Jean II + * (16 bytes on P6 200MHz, old 5 to 7 us, new 4 to 6 us) + * (24 bytes on P6 200MHz, old 9 to 10 us, new 7 to 8 us) + * (for reference, 115200 b/s is 1 byte every 69 us) + * And reduce wrapper.o by ~900B in the process ;-) + * + * Then, we have the addition of ZeroCopy, which is optional + * (i.e. the driver must initiate it) and improve final processing. + * (2005 B frame + EOF on P6 200MHz, without 30 to 50 us, with 10 to 25 us) + * + * Note : at FIR and MIR, HDLC framing is used and usually handled + * by the controller, so we come here only for SIR... Jean II + */ + +/* + * We can also choose where we want to do the CRC calculation. We can + * do it "inline", as we receive the bytes, or "postponed", when + * receiving the End-Of-Frame. + * (16 bytes on P6 200MHz, inlined 4 to 6 us, postponed 4 to 5 us) + * (24 bytes on P6 200MHz, inlined 7 to 8 us, postponed 5 to 7 us) + * With ZeroCopy : + * (2005 B frame on P6 200MHz, inlined 10 to 25 us, postponed 140 to 180 us) + * Without ZeroCopy : + * (2005 B frame on P6 200MHz, inlined 30 to 50 us, postponed 150 to 180 us) + * (Note : numbers taken with irq disabled) + * + * From those numbers, it's not clear which is the best strategy, because + * we end up running through a lot of data one way or another (i.e. cache + * misses). I personally prefer to avoid the huge latency spike of the + * "postponed" solution, because it come just at the time when we have + * lot's of protocol processing to do and it will hurt our ability to + * reach low link turnaround times... Jean II + */ +//#define POSTPONE_RX_CRC + +/* + * Function async_bump (buf, len, stats) + * + * Got a frame, make a copy of it, and pass it up the stack! We can try + * to inline it since it's only called from state_inside_frame + */ +static inline void +async_bump(struct net_device *dev, + struct net_device_stats *stats, + iobuff_t *rx_buff) +{ + struct sk_buff *newskb; + struct sk_buff *dataskb; + int docopy; + + /* Check if we need to copy the data to a new skb or not. + * If the driver doesn't use ZeroCopy Rx, we have to do it. + * With ZeroCopy Rx, the rx_buff already point to a valid + * skb. But, if the frame is small, it is more efficient to + * copy it to save memory (copy will be fast anyway - that's + * called Rx-copy-break). Jean II */ + docopy = ((rx_buff->skb == NULL) || + (rx_buff->len < IRDA_RX_COPY_THRESHOLD)); + + /* Allocate a new skb */ + newskb = dev_alloc_skb(docopy ? rx_buff->len + 1 : rx_buff->truesize); + if (!newskb) { + stats->rx_dropped++; + /* We could deliver the current skb if doing ZeroCopy Rx, + * but this would stall the Rx path. Better drop the + * packet... Jean II */ + return; + } + + /* Align IP header to 20 bytes (i.e. increase skb->data) + * Note this is only useful with IrLAN, as PPP has a variable + * header size (2 or 1 bytes) - Jean II */ + skb_reserve(newskb, 1); + + if(docopy) { + /* Copy data without CRC (length already checked) */ + skb_copy_to_linear_data(newskb, rx_buff->data, + rx_buff->len - 2); + /* Deliver this skb */ + dataskb = newskb; + } else { + /* We are using ZeroCopy. 
Deliver old skb */ + dataskb = rx_buff->skb; + /* And hook the new skb to the rx_buff */ + rx_buff->skb = newskb; + rx_buff->head = newskb->data; /* NOT newskb->head */ + //printk(KERN_DEBUG "ZeroCopy : len = %d, dataskb = %p, newskb = %p\n", rx_buff->len, dataskb, newskb); + } + + /* Set proper length on skb (without CRC) */ + skb_put(dataskb, rx_buff->len - 2); + + /* Feed it to IrLAP layer */ + dataskb->dev = dev; + skb_reset_mac_header(dataskb); + dataskb->protocol = htons(ETH_P_IRDA); + + netif_rx(dataskb); + + stats->rx_packets++; + stats->rx_bytes += rx_buff->len; + + /* Clean up rx_buff (redundant with async_unwrap_bof() ???) */ + rx_buff->data = rx_buff->head; + rx_buff->len = 0; +} + +/* + * Function async_unwrap_bof(dev, byte) + * + * Handle Beginning Of Frame character received within a frame + * + */ +static inline void +async_unwrap_bof(struct net_device *dev, + struct net_device_stats *stats, + iobuff_t *rx_buff, __u8 byte) +{ + switch(rx_buff->state) { + case LINK_ESCAPE: + case INSIDE_FRAME: + /* Not supposed to happen, the previous frame is not + * finished - Jean II */ + IRDA_DEBUG(1, "%s(), Discarding incomplete frame\n", + __func__); + stats->rx_errors++; + stats->rx_missed_errors++; + irda_device_set_media_busy(dev, TRUE); + break; + + case OUTSIDE_FRAME: + case BEGIN_FRAME: + default: + /* We may receive multiple BOF at the start of frame */ + break; + } + + /* Now receiving frame */ + rx_buff->state = BEGIN_FRAME; + rx_buff->in_frame = TRUE; + + /* Time to initialize receive buffer */ + rx_buff->data = rx_buff->head; + rx_buff->len = 0; + rx_buff->fcs = INIT_FCS; +} + +/* + * Function async_unwrap_eof(dev, byte) + * + * Handle End Of Frame character received within a frame + * + */ +static inline void +async_unwrap_eof(struct net_device *dev, + struct net_device_stats *stats, + iobuff_t *rx_buff, __u8 byte) +{ +#ifdef POSTPONE_RX_CRC + int i; +#endif + + switch(rx_buff->state) { + case OUTSIDE_FRAME: + /* Probably missed the BOF */ + stats->rx_errors++; + stats->rx_missed_errors++; + irda_device_set_media_busy(dev, TRUE); + break; + + case BEGIN_FRAME: + case LINK_ESCAPE: + case INSIDE_FRAME: + default: + /* Note : in the case of BEGIN_FRAME and LINK_ESCAPE, + * the fcs will most likely not match and generate an + * error, as expected - Jean II */ + rx_buff->state = OUTSIDE_FRAME; + rx_buff->in_frame = FALSE; + +#ifdef POSTPONE_RX_CRC + /* If we haven't done the CRC as we receive bytes, we + * must do it now... Jean II */ + for(i = 0; i < rx_buff->len; i++) + rx_buff->fcs = irda_fcs(rx_buff->fcs, + rx_buff->data[i]); +#endif + + /* Test FCS and signal success if the frame is good */ + if (rx_buff->fcs == GOOD_FCS) { + /* Deliver frame */ + async_bump(dev, stats, rx_buff); + break; + } else { + /* Wrong CRC, discard frame! 
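The receiver above accepts a frame only when running the FCS over the payload plus the two complemented, LSB-first CRC bytes leaves the fixed residue GOOD_FCS. The sketch below demonstrates that property in isolation; it assumes the usual CCITT CRC-16 used for SIR framing (reflected polynomial 0x8408, initial value 0xffff, residue 0xf0b8, the same FCS-16 as PPP), whereas the kernel takes irda_fcs(), INIT_FCS and GOOD_FCS from its IrDA headers.

/* Why the unwrapper can simply compare against GOOD_FCS: if the sender
 * appends the one's complement of its running FCS (LSB first), the
 * receiver's FCS over payload + CRC bytes always lands on a fixed residue.
 * CRC parameters assumed: poly 0x8408 (reflected), init 0xffff, residue 0xf0b8. */
#include <stdio.h>
#include <stdint.h>

static uint16_t fcs16(uint16_t fcs, uint8_t byte)
{
	fcs ^= byte;
	for (int bit = 0; bit < 8; bit++)
		fcs = (fcs & 1) ? (fcs >> 1) ^ 0x8408 : fcs >> 1;
	return fcs;
}

int main(void)
{
	const uint8_t payload[] = { 0xca, 0xfe, 0xba, 0xbe };
	uint16_t tx = 0xffff, rx = 0xffff;
	uint8_t crc_lo, crc_hi;

	/* Sender: FCS over the payload, then append ~FCS little-endian. */
	for (unsigned i = 0; i < sizeof(payload); i++)
		tx = fcs16(tx, payload[i]);
	crc_lo = ~tx & 0xff;
	crc_hi = (~tx >> 8) & 0xff;

	/* Receiver: FCS over payload + CRC bytes, compare with the residue. */
	for (unsigned i = 0; i < sizeof(payload); i++)
		rx = fcs16(rx, payload[i]);
	rx = fcs16(rx, crc_lo);
	rx = fcs16(rx, crc_hi);

	printf("residue 0x%04x -> %s\n", rx, rx == 0xf0b8 ? "good" : "bad");
	return 0;
}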
*/ + irda_device_set_media_busy(dev, TRUE); + + IRDA_DEBUG(1, "%s(), crc error\n", __func__); + stats->rx_errors++; + stats->rx_crc_errors++; + } + break; + } +} + +/* + * Function async_unwrap_ce(dev, byte) + * + * Handle Character Escape character received within a frame + * + */ +static inline void +async_unwrap_ce(struct net_device *dev, + struct net_device_stats *stats, + iobuff_t *rx_buff, __u8 byte) +{ + switch(rx_buff->state) { + case OUTSIDE_FRAME: + /* Activate carrier sense */ + irda_device_set_media_busy(dev, TRUE); + break; + + case LINK_ESCAPE: + IRDA_WARNING("%s: state not defined\n", __func__); + break; + + case BEGIN_FRAME: + case INSIDE_FRAME: + default: + /* Stuffed byte coming */ + rx_buff->state = LINK_ESCAPE; + break; + } +} + +/* + * Function async_unwrap_other(dev, byte) + * + * Handle other characters received within a frame + * + */ +static inline void +async_unwrap_other(struct net_device *dev, + struct net_device_stats *stats, + iobuff_t *rx_buff, __u8 byte) +{ + switch(rx_buff->state) { + /* This is on the critical path, case are ordered by + * probability (most frequent first) - Jean II */ + case INSIDE_FRAME: + /* Must be the next byte of the frame */ + if (rx_buff->len < rx_buff->truesize) { + rx_buff->data[rx_buff->len++] = byte; +#ifndef POSTPONE_RX_CRC + rx_buff->fcs = irda_fcs(rx_buff->fcs, byte); +#endif + } else { + IRDA_DEBUG(1, "%s(), Rx buffer overflow, aborting\n", + __func__); + rx_buff->state = OUTSIDE_FRAME; + } + break; + + case LINK_ESCAPE: + /* + * Stuffed char, complement bit 5 of byte + * following CE, IrLAP p.114 + */ + byte ^= IRDA_TRANS; + if (rx_buff->len < rx_buff->truesize) { + rx_buff->data[rx_buff->len++] = byte; +#ifndef POSTPONE_RX_CRC + rx_buff->fcs = irda_fcs(rx_buff->fcs, byte); +#endif + rx_buff->state = INSIDE_FRAME; + } else { + IRDA_DEBUG(1, "%s(), Rx buffer overflow, aborting\n", + __func__); + rx_buff->state = OUTSIDE_FRAME; + } + break; + + case OUTSIDE_FRAME: + /* Activate carrier sense */ + if(byte != XBOF) + irda_device_set_media_busy(dev, TRUE); + break; + + case BEGIN_FRAME: + default: + rx_buff->data[rx_buff->len++] = byte; +#ifndef POSTPONE_RX_CRC + rx_buff->fcs = irda_fcs(rx_buff->fcs, byte); +#endif + rx_buff->state = INSIDE_FRAME; + break; + } +} + +/* + * Function async_unwrap_char (dev, rx_buff, byte) + * + * Parse and de-stuff frame received from the IrDA-port + * + * This is the main entry point for SIR drivers. + */ +void async_unwrap_char(struct net_device *dev, + struct net_device_stats *stats, + iobuff_t *rx_buff, __u8 byte) +{ + switch(byte) { + case CE: + async_unwrap_ce(dev, stats, rx_buff, byte); + break; + case BOF: + async_unwrap_bof(dev, stats, rx_buff, byte); + break; + case EOF: + async_unwrap_eof(dev, stats, rx_buff, byte); + break; + default: + async_unwrap_other(dev, stats, rx_buff, byte); + break; + } +} +EXPORT_SYMBOL(async_unwrap_char); + diff --git a/net/iucv/Kconfig b/net/iucv/Kconfig new file mode 100644 index 00000000..16ce9cd4 --- /dev/null +++ b/net/iucv/Kconfig @@ -0,0 +1,15 @@ +config IUCV + tristate "IUCV support (S390 - z/VM only)" + depends on S390 + help + Select this option if you want to use inter-user communication + under VM or VIF. If you run on z/VM, say "Y" to enable a fast + communication link between VM guests. + +config AFIUCV + tristate "AF_IUCV support (S390 - z/VM only)" + depends on IUCV + help + Select this option if you want to use inter-user communication under + VM or VIF sockets. 
If you run on z/VM, say "Y" to enable a fast + communication link between VM guests. diff --git a/net/iucv/Makefile b/net/iucv/Makefile new file mode 100644 index 00000000..7bfdc853 --- /dev/null +++ b/net/iucv/Makefile @@ -0,0 +1,6 @@ +# +# Makefile for IUCV +# + +obj-$(CONFIG_IUCV) += iucv.o +obj-$(CONFIG_AFIUCV) += af_iucv.o diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c new file mode 100644 index 00000000..e2013e43 --- /dev/null +++ b/net/iucv/af_iucv.c @@ -0,0 +1,1795 @@ +/* + * IUCV protocol stack for Linux on zSeries + * + * Copyright IBM Corp. 2006, 2009 + * + * Author(s): Jennifer Hunt + * Hendrik Brueckner + * PM functions: + * Ursula Braun + */ + +#define KMSG_COMPONENT "af_iucv" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define VERSION "1.1" + +static char iucv_userid[80]; + +static const struct proto_ops iucv_sock_ops; + +static struct proto iucv_proto = { + .name = "AF_IUCV", + .owner = THIS_MODULE, + .obj_size = sizeof(struct iucv_sock), +}; + +/* special AF_IUCV IPRM messages */ +static const u8 iprm_shutdown[8] = + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}; + +#define TRGCLS_SIZE (sizeof(((struct iucv_message *)0)->class)) + +/* macros to set/get socket control buffer at correct offset */ +#define CB_TAG(skb) ((skb)->cb) /* iucv message tag */ +#define CB_TAG_LEN (sizeof(((struct iucv_message *) 0)->tag)) +#define CB_TRGCLS(skb) ((skb)->cb + CB_TAG_LEN) /* iucv msg target class */ +#define CB_TRGCLS_LEN (TRGCLS_SIZE) + +#define __iucv_sock_wait(sk, condition, timeo, ret) \ +do { \ + DEFINE_WAIT(__wait); \ + long __timeo = timeo; \ + ret = 0; \ + prepare_to_wait(sk_sleep(sk), &__wait, TASK_INTERRUPTIBLE); \ + while (!(condition)) { \ + if (!__timeo) { \ + ret = -EAGAIN; \ + break; \ + } \ + if (signal_pending(current)) { \ + ret = sock_intr_errno(__timeo); \ + break; \ + } \ + release_sock(sk); \ + __timeo = schedule_timeout(__timeo); \ + lock_sock(sk); \ + ret = sock_error(sk); \ + if (ret) \ + break; \ + } \ + finish_wait(sk_sleep(sk), &__wait); \ +} while (0) + +#define iucv_sock_wait(sk, condition, timeo) \ +({ \ + int __ret = 0; \ + if (!(condition)) \ + __iucv_sock_wait(sk, condition, timeo, __ret); \ + __ret; \ +}) + +static void iucv_sock_kill(struct sock *sk); +static void iucv_sock_close(struct sock *sk); + +/* Call Back functions */ +static void iucv_callback_rx(struct iucv_path *, struct iucv_message *); +static void iucv_callback_txdone(struct iucv_path *, struct iucv_message *); +static void iucv_callback_connack(struct iucv_path *, u8 ipuser[16]); +static int iucv_callback_connreq(struct iucv_path *, u8 ipvmid[8], + u8 ipuser[16]); +static void iucv_callback_connrej(struct iucv_path *, u8 ipuser[16]); +static void iucv_callback_shutdown(struct iucv_path *, u8 ipuser[16]); + +static struct iucv_sock_list iucv_sk_list = { + .lock = __RW_LOCK_UNLOCKED(iucv_sk_list.lock), + .autobind_name = ATOMIC_INIT(0) +}; + +static struct iucv_handler af_iucv_handler = { + .path_pending = iucv_callback_connreq, + .path_complete = iucv_callback_connack, + .path_severed = iucv_callback_connrej, + .message_pending = iucv_callback_rx, + .message_complete = iucv_callback_txdone, + .path_quiesced = iucv_callback_shutdown, +}; + +static inline void high_nmcpy(unsigned char *dst, char *src) +{ + memcpy(dst, src, 8); +} + +static inline void low_nmcpy(unsigned char *dst, char *src) +{ + memcpy(&dst[8], 
src, 8); +} + +static int afiucv_pm_prepare(struct device *dev) +{ +#ifdef CONFIG_PM_DEBUG + printk(KERN_WARNING "afiucv_pm_prepare\n"); +#endif + return 0; +} + +static void afiucv_pm_complete(struct device *dev) +{ +#ifdef CONFIG_PM_DEBUG + printk(KERN_WARNING "afiucv_pm_complete\n"); +#endif +} + +/** + * afiucv_pm_freeze() - Freeze PM callback + * @dev: AFIUCV dummy device + * + * Sever all established IUCV communication pathes + */ +static int afiucv_pm_freeze(struct device *dev) +{ + struct iucv_sock *iucv; + struct sock *sk; + struct hlist_node *node; + int err = 0; + +#ifdef CONFIG_PM_DEBUG + printk(KERN_WARNING "afiucv_pm_freeze\n"); +#endif + read_lock(&iucv_sk_list.lock); + sk_for_each(sk, node, &iucv_sk_list.head) { + iucv = iucv_sk(sk); + skb_queue_purge(&iucv->send_skb_q); + skb_queue_purge(&iucv->backlog_skb_q); + switch (sk->sk_state) { + case IUCV_SEVERED: + case IUCV_DISCONN: + case IUCV_CLOSING: + case IUCV_CONNECTED: + if (iucv->path) { + err = iucv_path_sever(iucv->path, NULL); + iucv_path_free(iucv->path); + iucv->path = NULL; + } + break; + case IUCV_OPEN: + case IUCV_BOUND: + case IUCV_LISTEN: + case IUCV_CLOSED: + default: + break; + } + } + read_unlock(&iucv_sk_list.lock); + return err; +} + +/** + * afiucv_pm_restore_thaw() - Thaw and restore PM callback + * @dev: AFIUCV dummy device + * + * socket clean up after freeze + */ +static int afiucv_pm_restore_thaw(struct device *dev) +{ + struct sock *sk; + struct hlist_node *node; + +#ifdef CONFIG_PM_DEBUG + printk(KERN_WARNING "afiucv_pm_restore_thaw\n"); +#endif + read_lock(&iucv_sk_list.lock); + sk_for_each(sk, node, &iucv_sk_list.head) { + switch (sk->sk_state) { + case IUCV_CONNECTED: + sk->sk_err = EPIPE; + sk->sk_state = IUCV_DISCONN; + sk->sk_state_change(sk); + break; + case IUCV_DISCONN: + case IUCV_SEVERED: + case IUCV_CLOSING: + case IUCV_LISTEN: + case IUCV_BOUND: + case IUCV_OPEN: + default: + break; + } + } + read_unlock(&iucv_sk_list.lock); + return 0; +} + +static const struct dev_pm_ops afiucv_pm_ops = { + .prepare = afiucv_pm_prepare, + .complete = afiucv_pm_complete, + .freeze = afiucv_pm_freeze, + .thaw = afiucv_pm_restore_thaw, + .restore = afiucv_pm_restore_thaw, +}; + +static struct device_driver af_iucv_driver = { + .owner = THIS_MODULE, + .name = "afiucv", + .bus = &iucv_bus, + .pm = &afiucv_pm_ops, +}; + +/* dummy device used as trigger for PM functions */ +static struct device *af_iucv_dev; + +/** + * iucv_msg_length() - Returns the length of an iucv message. + * @msg: Pointer to struct iucv_message, MUST NOT be NULL + * + * The function returns the length of the specified iucv message @msg of data + * stored in a buffer and of data stored in the parameter list (PRMDATA). + * + * For IUCV_IPRMDATA, AF_IUCV uses the following convention to transport socket + * data: + * PRMDATA[0..6] socket data (max 7 bytes); + * PRMDATA[7] socket data length value (len is 0xff - PRMDATA[7]) + * + * The socket data length is computed by subtracting the socket data length + * value from 0xFF. + * If the socket data len is greater 7, then PRMDATA can be used for special + * notifications (see iucv_sock_shutdown); and further, + * if the socket data len is > 7, the function returns 8. + * + * Use this function to allocate socket buffers to store iucv message data. + */ +static inline size_t iucv_msg_length(struct iucv_message *msg) +{ + size_t datalen; + + if (msg->flags & IUCV_IPRMDATA) { + datalen = 0xff - msg->rmmsg[7]; + return (datalen < 8) ? 
datalen : 8; + } + return msg->length; +} + +/** + * iucv_sock_in_state() - check for specific states + * @sk: sock structure + * @state: first iucv sk state + * @state: second iucv sk state + * + * Returns true if the socket in either in the first or second state. + */ +static int iucv_sock_in_state(struct sock *sk, int state, int state2) +{ + return (sk->sk_state == state || sk->sk_state == state2); +} + +/** + * iucv_below_msglim() - function to check if messages can be sent + * @sk: sock structure + * + * Returns true if the send queue length is lower than the message limit. + * Always returns true if the socket is not connected (no iucv path for + * checking the message limit). + */ +static inline int iucv_below_msglim(struct sock *sk) +{ + struct iucv_sock *iucv = iucv_sk(sk); + + if (sk->sk_state != IUCV_CONNECTED) + return 1; + return (skb_queue_len(&iucv->send_skb_q) < iucv->path->msglim); +} + +/** + * iucv_sock_wake_msglim() - Wake up thread waiting on msg limit + */ +static void iucv_sock_wake_msglim(struct sock *sk) +{ + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_all(&wq->wait); + sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + rcu_read_unlock(); +} + +/* Timers */ +static void iucv_sock_timeout(unsigned long arg) +{ + struct sock *sk = (struct sock *)arg; + + bh_lock_sock(sk); + sk->sk_err = ETIMEDOUT; + sk->sk_state_change(sk); + bh_unlock_sock(sk); + + iucv_sock_kill(sk); + sock_put(sk); +} + +static void iucv_sock_clear_timer(struct sock *sk) +{ + sk_stop_timer(sk, &sk->sk_timer); +} + +static struct sock *__iucv_get_sock_by_name(char *nm) +{ + struct sock *sk; + struct hlist_node *node; + + sk_for_each(sk, node, &iucv_sk_list.head) + if (!memcmp(&iucv_sk(sk)->src_name, nm, 8)) + return sk; + + return NULL; +} + +static void iucv_sock_destruct(struct sock *sk) +{ + skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge(&sk->sk_write_queue); +} + +/* Cleanup Listen */ +static void iucv_sock_cleanup_listen(struct sock *parent) +{ + struct sock *sk; + + /* Close non-accepted connections */ + while ((sk = iucv_accept_dequeue(parent, NULL))) { + iucv_sock_close(sk); + iucv_sock_kill(sk); + } + + parent->sk_state = IUCV_CLOSED; +} + +/* Kill socket (only if zapped and orphaned) */ +static void iucv_sock_kill(struct sock *sk) +{ + if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket) + return; + + iucv_sock_unlink(&iucv_sk_list, sk); + sock_set_flag(sk, SOCK_DEAD); + sock_put(sk); +} + +/* Close an IUCV socket */ +static void iucv_sock_close(struct sock *sk) +{ + unsigned char user_data[16]; + struct iucv_sock *iucv = iucv_sk(sk); + unsigned long timeo; + + iucv_sock_clear_timer(sk); + lock_sock(sk); + + switch (sk->sk_state) { + case IUCV_LISTEN: + iucv_sock_cleanup_listen(sk); + break; + + case IUCV_CONNECTED: + case IUCV_DISCONN: + sk->sk_state = IUCV_CLOSING; + sk->sk_state_change(sk); + + if (!skb_queue_empty(&iucv->send_skb_q)) { + if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime) + timeo = sk->sk_lingertime; + else + timeo = IUCV_DISCONN_TIMEOUT; + iucv_sock_wait(sk, + iucv_sock_in_state(sk, IUCV_CLOSED, 0), + timeo); + } + + case IUCV_CLOSING: /* fall through */ + sk->sk_state = IUCV_CLOSED; + sk->sk_state_change(sk); + + if (iucv->path) { + low_nmcpy(user_data, iucv->src_name); + high_nmcpy(user_data, iucv->dst_name); + ASCEBC(user_data, sizeof(user_data)); + iucv_path_sever(iucv->path, user_data); + iucv_path_free(iucv->path); + iucv->path = NULL; + } + + sk->sk_err = ECONNRESET; + 
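iucv_msg_length() above decodes the AF_IUCV parameter-list (IPRM) convention spelled out in its comment: PRMDATA[0..6] carry up to 7 bytes of socket data and PRMDATA[7] carries 0xff minus the data length. A small encode/decode round trip of that convention, mirroring iucv_send_iprm() further down, with plain byte arrays standing in for struct iucv_message:

/* The AF_IUCV parameter-list convention: PRMDATA[0..6] hold the socket
 * data, PRMDATA[7] holds 0xff - len.  Plain arrays stand in for
 * struct iucv_message in this sketch. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

static void iprm_encode(uint8_t prmdata[8], const void *data, size_t len)
{
	memcpy(prmdata, data, len);		/* len must be <= 7 */
	prmdata[7] = 0xff - (uint8_t) len;
}

static size_t iprm_length(const uint8_t prmdata[8])
{
	size_t datalen = 0xff - prmdata[7];

	return datalen < 8 ? datalen : 8;	/* > 7 means a special notification */
}

int main(void)
{
	uint8_t prmdata[8];

	iprm_encode(prmdata, "hello", 5);
	printf("decoded length: %zu\n", iprm_length(prmdata));	/* prints 5 */
	return 0;
}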
sk->sk_state_change(sk); + + skb_queue_purge(&iucv->send_skb_q); + skb_queue_purge(&iucv->backlog_skb_q); + break; + + default: + /* nothing to do here */ + break; + } + + /* mark socket for deletion by iucv_sock_kill() */ + sock_set_flag(sk, SOCK_ZAPPED); + + release_sock(sk); +} + +static void iucv_sock_init(struct sock *sk, struct sock *parent) +{ + if (parent) + sk->sk_type = parent->sk_type; +} + +static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio) +{ + struct sock *sk; + + sk = sk_alloc(&init_net, PF_IUCV, prio, &iucv_proto); + if (!sk) + return NULL; + + sock_init_data(sock, sk); + INIT_LIST_HEAD(&iucv_sk(sk)->accept_q); + spin_lock_init(&iucv_sk(sk)->accept_q_lock); + skb_queue_head_init(&iucv_sk(sk)->send_skb_q); + INIT_LIST_HEAD(&iucv_sk(sk)->message_q.list); + spin_lock_init(&iucv_sk(sk)->message_q.lock); + skb_queue_head_init(&iucv_sk(sk)->backlog_skb_q); + iucv_sk(sk)->send_tag = 0; + iucv_sk(sk)->flags = 0; + iucv_sk(sk)->msglimit = IUCV_QUEUELEN_DEFAULT; + iucv_sk(sk)->path = NULL; + memset(&iucv_sk(sk)->src_user_id , 0, 32); + + sk->sk_destruct = iucv_sock_destruct; + sk->sk_sndtimeo = IUCV_CONN_TIMEOUT; + sk->sk_allocation = GFP_DMA; + + sock_reset_flag(sk, SOCK_ZAPPED); + + sk->sk_protocol = proto; + sk->sk_state = IUCV_OPEN; + + setup_timer(&sk->sk_timer, iucv_sock_timeout, (unsigned long)sk); + + iucv_sock_link(&iucv_sk_list, sk); + return sk; +} + +/* Create an IUCV socket */ +static int iucv_sock_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + + if (protocol && protocol != PF_IUCV) + return -EPROTONOSUPPORT; + + sock->state = SS_UNCONNECTED; + + switch (sock->type) { + case SOCK_STREAM: + sock->ops = &iucv_sock_ops; + break; + case SOCK_SEQPACKET: + /* currently, proto ops can handle both sk types */ + sock->ops = &iucv_sock_ops; + break; + default: + return -ESOCKTNOSUPPORT; + } + + sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL); + if (!sk) + return -ENOMEM; + + iucv_sock_init(sk, NULL); + + return 0; +} + +void iucv_sock_link(struct iucv_sock_list *l, struct sock *sk) +{ + write_lock_bh(&l->lock); + sk_add_node(sk, &l->head); + write_unlock_bh(&l->lock); +} + +void iucv_sock_unlink(struct iucv_sock_list *l, struct sock *sk) +{ + write_lock_bh(&l->lock); + sk_del_node_init(sk); + write_unlock_bh(&l->lock); +} + +void iucv_accept_enqueue(struct sock *parent, struct sock *sk) +{ + unsigned long flags; + struct iucv_sock *par = iucv_sk(parent); + + sock_hold(sk); + spin_lock_irqsave(&par->accept_q_lock, flags); + list_add_tail(&iucv_sk(sk)->accept_q, &par->accept_q); + spin_unlock_irqrestore(&par->accept_q_lock, flags); + iucv_sk(sk)->parent = parent; + sk_acceptq_added(parent); +} + +void iucv_accept_unlink(struct sock *sk) +{ + unsigned long flags; + struct iucv_sock *par = iucv_sk(iucv_sk(sk)->parent); + + spin_lock_irqsave(&par->accept_q_lock, flags); + list_del_init(&iucv_sk(sk)->accept_q); + spin_unlock_irqrestore(&par->accept_q_lock, flags); + sk_acceptq_removed(iucv_sk(sk)->parent); + iucv_sk(sk)->parent = NULL; + sock_put(sk); +} + +struct sock *iucv_accept_dequeue(struct sock *parent, struct socket *newsock) +{ + struct iucv_sock *isk, *n; + struct sock *sk; + + list_for_each_entry_safe(isk, n, &iucv_sk(parent)->accept_q, accept_q) { + sk = (struct sock *) isk; + lock_sock(sk); + + if (sk->sk_state == IUCV_CLOSED) { + iucv_accept_unlink(sk); + release_sock(sk); + continue; + } + + if (sk->sk_state == IUCV_CONNECTED || + sk->sk_state == IUCV_SEVERED || + sk->sk_state == IUCV_DISCONN || 
/* due to PM restore */ + !newsock) { + iucv_accept_unlink(sk); + if (newsock) + sock_graft(sk, newsock); + + if (sk->sk_state == IUCV_SEVERED) + sk->sk_state = IUCV_DISCONN; + + release_sock(sk); + return sk; + } + + release_sock(sk); + } + return NULL; +} + +/* Bind an unbound socket */ +static int iucv_sock_bind(struct socket *sock, struct sockaddr *addr, + int addr_len) +{ + struct sockaddr_iucv *sa = (struct sockaddr_iucv *) addr; + struct sock *sk = sock->sk; + struct iucv_sock *iucv; + int err; + + /* Verify the input sockaddr */ + if (!addr || addr->sa_family != AF_IUCV) + return -EINVAL; + + lock_sock(sk); + if (sk->sk_state != IUCV_OPEN) { + err = -EBADFD; + goto done; + } + + write_lock_bh(&iucv_sk_list.lock); + + iucv = iucv_sk(sk); + if (__iucv_get_sock_by_name(sa->siucv_name)) { + err = -EADDRINUSE; + goto done_unlock; + } + if (iucv->path) { + err = 0; + goto done_unlock; + } + + /* Bind the socket */ + memcpy(iucv->src_name, sa->siucv_name, 8); + + /* Copy the user id */ + memcpy(iucv->src_user_id, iucv_userid, 8); + sk->sk_state = IUCV_BOUND; + err = 0; + +done_unlock: + /* Release the socket list lock */ + write_unlock_bh(&iucv_sk_list.lock); +done: + release_sock(sk); + return err; +} + +/* Automatically bind an unbound socket */ +static int iucv_sock_autobind(struct sock *sk) +{ + struct iucv_sock *iucv = iucv_sk(sk); + char query_buffer[80]; + char name[12]; + int err = 0; + + /* Set the userid and name */ + cpcmd("QUERY USERID", query_buffer, sizeof(query_buffer), &err); + if (unlikely(err)) + return -EPROTO; + + memcpy(iucv->src_user_id, query_buffer, 8); + + write_lock_bh(&iucv_sk_list.lock); + + sprintf(name, "%08x", atomic_inc_return(&iucv_sk_list.autobind_name)); + while (__iucv_get_sock_by_name(name)) { + sprintf(name, "%08x", + atomic_inc_return(&iucv_sk_list.autobind_name)); + } + + write_unlock_bh(&iucv_sk_list.lock); + + memcpy(&iucv->src_name, name, 8); + + return err; +} + +/* Connect an unconnected socket */ +static int iucv_sock_connect(struct socket *sock, struct sockaddr *addr, + int alen, int flags) +{ + struct sockaddr_iucv *sa = (struct sockaddr_iucv *) addr; + struct sock *sk = sock->sk; + struct iucv_sock *iucv; + unsigned char user_data[16]; + int err; + + if (addr->sa_family != AF_IUCV || alen < sizeof(struct sockaddr_iucv)) + return -EINVAL; + + if (sk->sk_state != IUCV_OPEN && sk->sk_state != IUCV_BOUND) + return -EBADFD; + + if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_SEQPACKET) + return -EINVAL; + + if (sk->sk_state == IUCV_OPEN) { + err = iucv_sock_autobind(sk); + if (unlikely(err)) + return err; + } + + lock_sock(sk); + + /* Set the destination information */ + memcpy(iucv_sk(sk)->dst_user_id, sa->siucv_user_id, 8); + memcpy(iucv_sk(sk)->dst_name, sa->siucv_name, 8); + + high_nmcpy(user_data, sa->siucv_name); + low_nmcpy(user_data, iucv_sk(sk)->src_name); + ASCEBC(user_data, sizeof(user_data)); + + iucv = iucv_sk(sk); + /* Create path. 
*/ + iucv->path = iucv_path_alloc(iucv->msglimit, + IUCV_IPRMDATA, GFP_KERNEL); + if (!iucv->path) { + err = -ENOMEM; + goto done; + } + err = iucv_path_connect(iucv->path, &af_iucv_handler, + sa->siucv_user_id, NULL, user_data, sk); + if (err) { + iucv_path_free(iucv->path); + iucv->path = NULL; + switch (err) { + case 0x0b: /* Target communicator is not logged on */ + err = -ENETUNREACH; + break; + case 0x0d: /* Max connections for this guest exceeded */ + case 0x0e: /* Max connections for target guest exceeded */ + err = -EAGAIN; + break; + case 0x0f: /* Missing IUCV authorization */ + err = -EACCES; + break; + default: + err = -ECONNREFUSED; + break; + } + goto done; + } + + if (sk->sk_state != IUCV_CONNECTED) { + err = iucv_sock_wait(sk, iucv_sock_in_state(sk, IUCV_CONNECTED, + IUCV_DISCONN), + sock_sndtimeo(sk, flags & O_NONBLOCK)); + } + + if (sk->sk_state == IUCV_DISCONN) { + err = -ECONNREFUSED; + } + + if (err) { + iucv_path_sever(iucv->path, NULL); + iucv_path_free(iucv->path); + iucv->path = NULL; + } + +done: + release_sock(sk); + return err; +} + +/* Move a socket into listening state. */ +static int iucv_sock_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + int err; + + lock_sock(sk); + + err = -EINVAL; + if (sk->sk_state != IUCV_BOUND) + goto done; + + if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) + goto done; + + sk->sk_max_ack_backlog = backlog; + sk->sk_ack_backlog = 0; + sk->sk_state = IUCV_LISTEN; + err = 0; + +done: + release_sock(sk); + return err; +} + +/* Accept a pending connection */ +static int iucv_sock_accept(struct socket *sock, struct socket *newsock, + int flags) +{ + DECLARE_WAITQUEUE(wait, current); + struct sock *sk = sock->sk, *nsk; + long timeo; + int err = 0; + + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); + + if (sk->sk_state != IUCV_LISTEN) { + err = -EBADFD; + goto done; + } + + timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + + /* Wait for an incoming connection */ + add_wait_queue_exclusive(sk_sleep(sk), &wait); + while (!(nsk = iucv_accept_dequeue(sk, newsock))) { + set_current_state(TASK_INTERRUPTIBLE); + if (!timeo) { + err = -EAGAIN; + break; + } + + release_sock(sk); + timeo = schedule_timeout(timeo); + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); + + if (sk->sk_state != IUCV_LISTEN) { + err = -EBADFD; + break; + } + + if (signal_pending(current)) { + err = sock_intr_errno(timeo); + break; + } + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(sk_sleep(sk), &wait); + + if (err) + goto done; + + newsock->state = SS_CONNECTED; + +done: + release_sock(sk); + return err; +} + +static int iucv_sock_getname(struct socket *sock, struct sockaddr *addr, + int *len, int peer) +{ + struct sockaddr_iucv *siucv = (struct sockaddr_iucv *) addr; + struct sock *sk = sock->sk; + + addr->sa_family = AF_IUCV; + *len = sizeof(struct sockaddr_iucv); + + if (peer) { + memcpy(siucv->siucv_user_id, iucv_sk(sk)->dst_user_id, 8); + memcpy(siucv->siucv_name, &iucv_sk(sk)->dst_name, 8); + } else { + memcpy(siucv->siucv_user_id, iucv_sk(sk)->src_user_id, 8); + memcpy(siucv->siucv_name, iucv_sk(sk)->src_name, 8); + } + memset(&siucv->siucv_port, 0, sizeof(siucv->siucv_port)); + memset(&siucv->siucv_addr, 0, sizeof(siucv->siucv_addr)); + memset(siucv->siucv_nodeid, 0, sizeof(siucv->siucv_nodeid)); + + return 0; +} + +/** + * iucv_send_iprm() - Send socket data in parameter list of an iucv message. 
+ * @path: IUCV path + * @msg: Pointer to a struct iucv_message + * @skb: The socket data to send, skb->len MUST BE <= 7 + * + * Send the socket data in the parameter list in the iucv message + * (IUCV_IPRMDATA). The socket data is stored at index 0 to 6 in the parameter + * list and the socket data len at index 7 (last byte). + * See also iucv_msg_length(). + * + * Returns the error code from the iucv_message_send() call. + */ +static int iucv_send_iprm(struct iucv_path *path, struct iucv_message *msg, + struct sk_buff *skb) +{ + u8 prmdata[8]; + + memcpy(prmdata, (void *) skb->data, skb->len); + prmdata[7] = 0xff - (u8) skb->len; + return iucv_message_send(path, msg, IUCV_IPRMDATA, 0, + (void *) prmdata, 8); +} + +static int iucv_sock_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct iucv_sock *iucv = iucv_sk(sk); + struct sk_buff *skb; + struct iucv_message txmsg; + struct cmsghdr *cmsg; + int cmsg_done; + long timeo; + char user_id[9]; + char appl_id[9]; + int err; + int noblock = msg->msg_flags & MSG_DONTWAIT; + + err = sock_error(sk); + if (err) + return err; + + if (msg->msg_flags & MSG_OOB) + return -EOPNOTSUPP; + + /* SOCK_SEQPACKET: we do not support segmented records */ + if (sk->sk_type == SOCK_SEQPACKET && !(msg->msg_flags & MSG_EOR)) + return -EOPNOTSUPP; + + lock_sock(sk); + + if (sk->sk_shutdown & SEND_SHUTDOWN) { + err = -EPIPE; + goto out; + } + + /* Return if the socket is not in connected state */ + if (sk->sk_state != IUCV_CONNECTED) { + err = -ENOTCONN; + goto out; + } + + /* initialize defaults */ + cmsg_done = 0; /* check for duplicate headers */ + txmsg.class = 0; + + /* iterate over control messages */ + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; + cmsg = CMSG_NXTHDR(msg, cmsg)) { + + if (!CMSG_OK(msg, cmsg)) { + err = -EINVAL; + goto out; + } + + if (cmsg->cmsg_level != SOL_IUCV) + continue; + + if (cmsg->cmsg_type & cmsg_done) { + err = -EINVAL; + goto out; + } + cmsg_done |= cmsg->cmsg_type; + + switch (cmsg->cmsg_type) { + case SCM_IUCV_TRGCLS: + if (cmsg->cmsg_len != CMSG_LEN(TRGCLS_SIZE)) { + err = -EINVAL; + goto out; + } + + /* set iucv message target class */ + memcpy(&txmsg.class, + (void *) CMSG_DATA(cmsg), TRGCLS_SIZE); + + break; + + default: + err = -EINVAL; + goto out; + break; + } + } + + /* allocate one skb for each iucv message: + * this is fine for SOCK_SEQPACKET (unless we want to support + * segmented records using the MSG_EOR flag), but + * for SOCK_STREAM we might want to improve it in future */ + skb = sock_alloc_send_skb(sk, len, noblock, &err); + if (!skb) + goto out; + if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { + err = -EFAULT; + goto fail; + } + + /* wait if outstanding messages for iucv path has reached */ + timeo = sock_sndtimeo(sk, noblock); + err = iucv_sock_wait(sk, iucv_below_msglim(sk), timeo); + if (err) + goto fail; + + /* return -ECONNRESET if the socket is no longer connected */ + if (sk->sk_state != IUCV_CONNECTED) { + err = -ECONNRESET; + goto fail; + } + + /* increment and save iucv message tag for msg_completion cbk */ + txmsg.tag = iucv->send_tag++; + memcpy(CB_TAG(skb), &txmsg.tag, CB_TAG_LEN); + skb_queue_tail(&iucv->send_skb_q, skb); + + if (((iucv->path->flags & IUCV_IPRMDATA) & iucv->flags) + && skb->len <= 7) { + err = iucv_send_iprm(iucv->path, &txmsg, skb); + + /* on success: there is no message_complete callback + * for an IPRMDATA msg; remove skb from send queue */ + if (err == 0) { + skb_unlink(skb, &iucv->send_skb_q); + 
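The sendmsg() path above lets the caller choose the IUCV message target class through a SOL_IUCV / SCM_IUCV_TRGCLS control message. Below is a hedged userspace-style sketch of building that ancillary data with the standard CMSG macros; it assumes an already connected AF_IUCV socket descriptor and that SOL_IUCV and SCM_IUCV_TRGCLS are available from the AF_IUCV headers, and it omits socket creation and the struct sockaddr_iucv setup.

/* Sketch: passing the IUCV target class to sendmsg() as ancillary data,
 * matching the SCM_IUCV_TRGCLS handling in iucv_sock_sendmsg() above.
 * Assumes a connected AF_IUCV socket and that SOL_IUCV / SCM_IUCV_TRGCLS
 * come from the AF_IUCV headers. */
#include <string.h>
#include <stdint.h>
#include <sys/socket.h>
#include <sys/uio.h>

static ssize_t send_with_trgcls(int fd, const void *buf, size_t len,
				uint32_t trgcls)
{
	union {
		char buf[CMSG_SPACE(sizeof(uint32_t))];
		struct cmsghdr align;		/* keep the buffer aligned */
	} control;
	struct iovec iov = { .iov_base = (void *) buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = control.buf,
		.msg_controllen = sizeof(control.buf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_IUCV;		/* from the AF_IUCV headers */
	cmsg->cmsg_type = SCM_IUCV_TRGCLS;	/* 32-bit target class follows */
	cmsg->cmsg_len = CMSG_LEN(sizeof(trgcls));
	memcpy(CMSG_DATA(cmsg), &trgcls, sizeof(trgcls));

	return sendmsg(fd, &msg, 0);
}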
kfree_skb(skb); + } + + /* this error should never happen since the + * IUCV_IPRMDATA path flag is set... sever path */ + if (err == 0x15) { + iucv_path_sever(iucv->path, NULL); + skb_unlink(skb, &iucv->send_skb_q); + err = -EPIPE; + goto fail; + } + } else + err = iucv_message_send(iucv->path, &txmsg, 0, 0, + (void *) skb->data, skb->len); + if (err) { + if (err == 3) { + user_id[8] = 0; + memcpy(user_id, iucv->dst_user_id, 8); + appl_id[8] = 0; + memcpy(appl_id, iucv->dst_name, 8); + pr_err("Application %s on z/VM guest %s" + " exceeds message limit\n", + appl_id, user_id); + err = -EAGAIN; + } else + err = -EPIPE; + skb_unlink(skb, &iucv->send_skb_q); + goto fail; + } + + release_sock(sk); + return len; + +fail: + kfree_skb(skb); +out: + release_sock(sk); + return err; +} + +/* iucv_fragment_skb() - Fragment a single IUCV message into multiple skb's + * + * Locking: must be called with message_q.lock held + */ +static int iucv_fragment_skb(struct sock *sk, struct sk_buff *skb, int len) +{ + int dataleft, size, copied = 0; + struct sk_buff *nskb; + + dataleft = len; + while (dataleft) { + if (dataleft >= sk->sk_rcvbuf / 4) + size = sk->sk_rcvbuf / 4; + else + size = dataleft; + + nskb = alloc_skb(size, GFP_ATOMIC | GFP_DMA); + if (!nskb) + return -ENOMEM; + + /* copy target class to control buffer of new skb */ + memcpy(CB_TRGCLS(nskb), CB_TRGCLS(skb), CB_TRGCLS_LEN); + + /* copy data fragment */ + memcpy(nskb->data, skb->data + copied, size); + copied += size; + dataleft -= size; + + skb_reset_transport_header(nskb); + skb_reset_network_header(nskb); + nskb->len = size; + + skb_queue_tail(&iucv_sk(sk)->backlog_skb_q, nskb); + } + + return 0; +} + +/* iucv_process_message() - Receive a single outstanding IUCV message + * + * Locking: must be called with message_q.lock held + */ +static void iucv_process_message(struct sock *sk, struct sk_buff *skb, + struct iucv_path *path, + struct iucv_message *msg) +{ + int rc; + unsigned int len; + + len = iucv_msg_length(msg); + + /* store msg target class in the second 4 bytes of skb ctrl buffer */ + /* Note: the first 4 bytes are reserved for msg tag */ + memcpy(CB_TRGCLS(skb), &msg->class, CB_TRGCLS_LEN); + + /* check for special IPRM messages (e.g. 
iucv_sock_shutdown) */ + if ((msg->flags & IUCV_IPRMDATA) && len > 7) { + if (memcmp(msg->rmmsg, iprm_shutdown, 8) == 0) { + skb->data = NULL; + skb->len = 0; + } + } else { + rc = iucv_message_receive(path, msg, msg->flags & IUCV_IPRMDATA, + skb->data, len, NULL); + if (rc) { + kfree_skb(skb); + return; + } + /* we need to fragment iucv messages for SOCK_STREAM only; + * for SOCK_SEQPACKET, it is only relevant if we support + * record segmentation using MSG_EOR (see also recvmsg()) */ + if (sk->sk_type == SOCK_STREAM && + skb->truesize >= sk->sk_rcvbuf / 4) { + rc = iucv_fragment_skb(sk, skb, len); + kfree_skb(skb); + skb = NULL; + if (rc) { + iucv_path_sever(path, NULL); + return; + } + skb = skb_dequeue(&iucv_sk(sk)->backlog_skb_q); + } else { + skb_reset_transport_header(skb); + skb_reset_network_header(skb); + skb->len = len; + } + } + + if (sock_queue_rcv_skb(sk, skb)) + skb_queue_head(&iucv_sk(sk)->backlog_skb_q, skb); +} + +/* iucv_process_message_q() - Process outstanding IUCV messages + * + * Locking: must be called with message_q.lock held + */ +static void iucv_process_message_q(struct sock *sk) +{ + struct iucv_sock *iucv = iucv_sk(sk); + struct sk_buff *skb; + struct sock_msg_q *p, *n; + + list_for_each_entry_safe(p, n, &iucv->message_q.list, list) { + skb = alloc_skb(iucv_msg_length(&p->msg), GFP_ATOMIC | GFP_DMA); + if (!skb) + break; + iucv_process_message(sk, skb, p->path, &p->msg); + list_del(&p->list); + kfree(p); + if (!skb_queue_empty(&iucv->backlog_skb_q)) + break; + } +} + +static int iucv_sock_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len, int flags) +{ + int noblock = flags & MSG_DONTWAIT; + struct sock *sk = sock->sk; + struct iucv_sock *iucv = iucv_sk(sk); + unsigned int copied, rlen; + struct sk_buff *skb, *rskb, *cskb; + int err = 0; + + if ((sk->sk_state == IUCV_DISCONN || sk->sk_state == IUCV_SEVERED) && + skb_queue_empty(&iucv->backlog_skb_q) && + skb_queue_empty(&sk->sk_receive_queue) && + list_empty(&iucv->message_q.list)) + return 0; + + if (flags & (MSG_OOB)) + return -EOPNOTSUPP; + + /* receive/dequeue next skb: + * the function understands MSG_PEEK and, thus, does not dequeue skb */ + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) { + if (sk->sk_shutdown & RCV_SHUTDOWN) + return 0; + return err; + } + + rlen = skb->len; /* real length of skb */ + copied = min_t(unsigned int, rlen, len); + + cskb = skb; + if (memcpy_toiovec(msg->msg_iov, cskb->data, copied)) { + if (!(flags & MSG_PEEK)) + skb_queue_head(&sk->sk_receive_queue, skb); + return -EFAULT; + } + + /* SOCK_SEQPACKET: set MSG_TRUNC if recv buf size is too small */ + if (sk->sk_type == SOCK_SEQPACKET) { + if (copied < rlen) + msg->msg_flags |= MSG_TRUNC; + /* each iucv message contains a complete record */ + msg->msg_flags |= MSG_EOR; + } + + /* create control message to store iucv msg target class: + * get the trgcls from the control buffer of the skb due to + * fragmentation of original iucv message. 
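iucv_fragment_skb() above slices one large inbound IUCV message into backlog skbs of at most sk_rcvbuf/4 bytes each so a SOCK_STREAM receiver can drain it piecemeal. The following is a minimal arithmetic model of that slicing, with plain sizes standing in for skbs and an illustrative receive buffer size:

/* Model of the iucv_fragment_skb() slicing: chunks of at most rcvbuf/4
 * bytes, the last chunk carrying the remainder. */
#include <stdio.h>

int main(void)
{
	int rcvbuf = 65536;
	int chunk_limit = rcvbuf / 4;	/* 16384 bytes per backlog fragment */
	int dataleft = 40000;		/* one large inbound IUCV message */
	int copied = 0, pieces = 0;

	while (dataleft) {
		int size = dataleft >= chunk_limit ? chunk_limit : dataleft;

		copied += size;
		dataleft -= size;
		pieces++;
	}
	printf("%d bytes queued as %d backlog fragments\n", copied, pieces);
	return 0;
}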
*/ + err = put_cmsg(msg, SOL_IUCV, SCM_IUCV_TRGCLS, + CB_TRGCLS_LEN, CB_TRGCLS(skb)); + if (err) { + if (!(flags & MSG_PEEK)) + skb_queue_head(&sk->sk_receive_queue, skb); + return err; + } + + /* Mark read part of skb as used */ + if (!(flags & MSG_PEEK)) { + + /* SOCK_STREAM: re-queue skb if it contains unreceived data */ + if (sk->sk_type == SOCK_STREAM) { + skb_pull(skb, copied); + if (skb->len) { + skb_queue_head(&sk->sk_receive_queue, skb); + goto done; + } + } + + kfree_skb(skb); + + /* Queue backlog skbs */ + spin_lock_bh(&iucv->message_q.lock); + rskb = skb_dequeue(&iucv->backlog_skb_q); + while (rskb) { + if (sock_queue_rcv_skb(sk, rskb)) { + skb_queue_head(&iucv->backlog_skb_q, + rskb); + break; + } else { + rskb = skb_dequeue(&iucv->backlog_skb_q); + } + } + if (skb_queue_empty(&iucv->backlog_skb_q)) { + if (!list_empty(&iucv->message_q.list)) + iucv_process_message_q(sk); + } + spin_unlock_bh(&iucv->message_q.lock); + } + +done: + /* SOCK_SEQPACKET: return real length if MSG_TRUNC is set */ + if (sk->sk_type == SOCK_SEQPACKET && (flags & MSG_TRUNC)) + copied = rlen; + + return copied; +} + +static inline unsigned int iucv_accept_poll(struct sock *parent) +{ + struct iucv_sock *isk, *n; + struct sock *sk; + + list_for_each_entry_safe(isk, n, &iucv_sk(parent)->accept_q, accept_q) { + sk = (struct sock *) isk; + + if (sk->sk_state == IUCV_CONNECTED) + return POLLIN | POLLRDNORM; + } + + return 0; +} + +unsigned int iucv_sock_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + unsigned int mask = 0; + + sock_poll_wait(file, sk_sleep(sk), wait); + + if (sk->sk_state == IUCV_LISTEN) + return iucv_accept_poll(sk); + + if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + mask |= POLLERR; + + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= POLLRDHUP; + + if (sk->sk_shutdown == SHUTDOWN_MASK) + mask |= POLLHUP; + + if (!skb_queue_empty(&sk->sk_receive_queue) || + (sk->sk_shutdown & RCV_SHUTDOWN)) + mask |= POLLIN | POLLRDNORM; + + if (sk->sk_state == IUCV_CLOSED) + mask |= POLLHUP; + + if (sk->sk_state == IUCV_DISCONN || sk->sk_state == IUCV_SEVERED) + mask |= POLLIN; + + if (sock_writeable(sk)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + else + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + return mask; +} + +static int iucv_sock_shutdown(struct socket *sock, int how) +{ + struct sock *sk = sock->sk; + struct iucv_sock *iucv = iucv_sk(sk); + struct iucv_message txmsg; + int err = 0; + + how++; + + if ((how & ~SHUTDOWN_MASK) || !how) + return -EINVAL; + + lock_sock(sk); + switch (sk->sk_state) { + case IUCV_DISCONN: + case IUCV_CLOSING: + case IUCV_SEVERED: + case IUCV_CLOSED: + err = -ENOTCONN; + goto fail; + + default: + sk->sk_shutdown |= how; + break; + } + + if (how == SEND_SHUTDOWN || how == SHUTDOWN_MASK) { + txmsg.class = 0; + txmsg.tag = 0; + err = iucv_message_send(iucv->path, &txmsg, IUCV_IPRMDATA, 0, + (void *) iprm_shutdown, 8); + if (err) { + switch (err) { + case 1: + err = -ENOTCONN; + break; + case 2: + err = -ECONNRESET; + break; + default: + err = -ENOTCONN; + break; + } + } + } + + if (how == RCV_SHUTDOWN || how == SHUTDOWN_MASK) { + err = iucv_path_quiesce(iucv_sk(sk)->path, NULL); + if (err) + err = -ENOTCONN; + + skb_queue_purge(&sk->sk_receive_queue); + } + + /* Wake up anyone sleeping in poll */ + sk->sk_state_change(sk); + +fail: + release_sock(sk); + return err; +} + +static int iucv_sock_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + int err = 0; + + if (!sk) + return 0; + + 
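iucv_sock_shutdown() above increments the userspace `how` argument by one so that SHUT_RD, SHUT_WR and SHUT_RDWR line up with the kernel's RCV_SHUTDOWN, SEND_SHUTDOWN and SHUTDOWN_MASK bit masks. A short worked mapping, using the standard values for those constants (0/1/2 on the socket-API side, 1/2/3 for the kernel masks):

/* The "how++" trick in iucv_sock_shutdown(): shifting the socket-API values
 * by one turns them into the kernel shutdown bit masks. */
#include <stdio.h>

#define SHUT_RD_VAL	0	/* SHUT_RD   */
#define SHUT_WR_VAL	1	/* SHUT_WR   */
#define SHUT_RDWR_VAL	2	/* SHUT_RDWR */

#define RCV_SHUTDOWN	1
#define SEND_SHUTDOWN	2
#define SHUTDOWN_MASK	3

int main(void)
{
	int requests[] = { SHUT_RD_VAL, SHUT_WR_VAL, SHUT_RDWR_VAL };

	for (unsigned i = 0; i < sizeof(requests) / sizeof(requests[0]); i++) {
		int how = requests[i] + 1;	/* the "how++" in the code above */

		printf("how=%d -> stops rx:%d tx:%d\n", requests[i],
		       !!(how & RCV_SHUTDOWN), !!(how & SEND_SHUTDOWN));
	}
	return 0;
}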
iucv_sock_close(sk); + + /* Unregister with IUCV base support */ + if (iucv_sk(sk)->path) { + iucv_path_sever(iucv_sk(sk)->path, NULL); + iucv_path_free(iucv_sk(sk)->path); + iucv_sk(sk)->path = NULL; + } + + sock_orphan(sk); + iucv_sock_kill(sk); + return err; +} + +/* getsockopt and setsockopt */ +static int iucv_sock_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct iucv_sock *iucv = iucv_sk(sk); + int val; + int rc; + + if (level != SOL_IUCV) + return -ENOPROTOOPT; + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *) optval)) + return -EFAULT; + + rc = 0; + + lock_sock(sk); + switch (optname) { + case SO_IPRMDATA_MSG: + if (val) + iucv->flags |= IUCV_IPRMDATA; + else + iucv->flags &= ~IUCV_IPRMDATA; + break; + case SO_MSGLIMIT: + switch (sk->sk_state) { + case IUCV_OPEN: + case IUCV_BOUND: + if (val < 1 || val > (u16)(~0)) + rc = -EINVAL; + else + iucv->msglimit = val; + break; + default: + rc = -EINVAL; + break; + } + break; + default: + rc = -ENOPROTOOPT; + break; + } + release_sock(sk); + + return rc; +} + +static int iucv_sock_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct iucv_sock *iucv = iucv_sk(sk); + int val, len; + + if (level != SOL_IUCV) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + + if (len < 0) + return -EINVAL; + + len = min_t(unsigned int, len, sizeof(int)); + + switch (optname) { + case SO_IPRMDATA_MSG: + val = (iucv->flags & IUCV_IPRMDATA) ? 1 : 0; + break; + case SO_MSGLIMIT: + lock_sock(sk); + val = (iucv->path != NULL) ? iucv->path->msglim /* connected */ + : iucv->msglimit; /* default */ + release_sock(sk); + break; + default: + return -ENOPROTOOPT; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + + +/* Callback wrappers - called from iucv base support */ +static int iucv_callback_connreq(struct iucv_path *path, + u8 ipvmid[8], u8 ipuser[16]) +{ + unsigned char user_data[16]; + unsigned char nuser_data[16]; + unsigned char src_name[8]; + struct hlist_node *node; + struct sock *sk, *nsk; + struct iucv_sock *iucv, *niucv; + int err; + + memcpy(src_name, ipuser, 8); + EBCASC(src_name, 8); + /* Find out if this path belongs to af_iucv. */ + read_lock(&iucv_sk_list.lock); + iucv = NULL; + sk = NULL; + sk_for_each(sk, node, &iucv_sk_list.head) + if (sk->sk_state == IUCV_LISTEN && + !memcmp(&iucv_sk(sk)->src_name, src_name, 8)) { + /* + * Found a listening socket with + * src_name == ipuser[0-7]. + */ + iucv = iucv_sk(sk); + break; + } + read_unlock(&iucv_sk_list.lock); + if (!iucv) + /* No socket found, not one of our paths. 
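Another hypothetical user-space aside for the socket options handled above: SO_IPRMDATA_MSG allows small messages to be carried in the IUCV parameter list, and SO_MSGLIMIT (1..65535, accepted only while the socket is still in IUCV_OPEN or IUCV_BOUND state) sets the message limit requested when the path is established. The numeric option values below are assumptions copied from the kernel header, which this patch does not export to user space.

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>

#ifndef AF_IUCV
#define AF_IUCV         32      /* assumed, from <linux/socket.h> */
#endif
#ifndef SOL_IUCV
#define SOL_IUCV        277     /* assumed, from <linux/socket.h> */
#endif
#ifndef SO_IPRMDATA_MSG
#define SO_IPRMDATA_MSG 0x0080  /* assumed, from the kernel's af_iucv.h */
#endif
#ifndef SO_MSGLIMIT
#define SO_MSGLIMIT     0x1000  /* assumed, from the kernel's af_iucv.h */
#endif

int main(void)
{
        int one = 1, limit = 128;
        int s = socket(AF_IUCV, SOCK_SEQPACKET, 0);

        if (s < 0) {
                perror("socket(AF_IUCV)");
                return 1;
        }
        /* Let small messages travel in the IUCV parameter list (IPRMDATA). */
        if (setsockopt(s, SOL_IUCV, SO_IPRMDATA_MSG, &one, sizeof(one)) < 0)
                perror("SO_IPRMDATA_MSG");
        /* Per-path message limit; only accepted in IUCV_OPEN/IUCV_BOUND state. */
        if (setsockopt(s, SOL_IUCV, SO_MSGLIMIT, &limit, sizeof(limit)) < 0)
                perror("SO_MSGLIMIT");
        /* bind()/connect() with a struct sockaddr_iucv would follow here. */
        close(s);
        return 0;
}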
*/ + return -EINVAL; + + bh_lock_sock(sk); + + /* Check if parent socket is listening */ + low_nmcpy(user_data, iucv->src_name); + high_nmcpy(user_data, iucv->dst_name); + ASCEBC(user_data, sizeof(user_data)); + if (sk->sk_state != IUCV_LISTEN) { + err = iucv_path_sever(path, user_data); + iucv_path_free(path); + goto fail; + } + + /* Check for backlog size */ + if (sk_acceptq_is_full(sk)) { + err = iucv_path_sever(path, user_data); + iucv_path_free(path); + goto fail; + } + + /* Create the new socket */ + nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC); + if (!nsk) { + err = iucv_path_sever(path, user_data); + iucv_path_free(path); + goto fail; + } + + niucv = iucv_sk(nsk); + iucv_sock_init(nsk, sk); + + /* Set the new iucv_sock */ + memcpy(niucv->dst_name, ipuser + 8, 8); + EBCASC(niucv->dst_name, 8); + memcpy(niucv->dst_user_id, ipvmid, 8); + memcpy(niucv->src_name, iucv->src_name, 8); + memcpy(niucv->src_user_id, iucv->src_user_id, 8); + niucv->path = path; + + /* Call iucv_accept */ + high_nmcpy(nuser_data, ipuser + 8); + memcpy(nuser_data + 8, niucv->src_name, 8); + ASCEBC(nuser_data + 8, 8); + + /* set message limit for path based on msglimit of accepting socket */ + niucv->msglimit = iucv->msglimit; + path->msglim = iucv->msglimit; + err = iucv_path_accept(path, &af_iucv_handler, nuser_data, nsk); + if (err) { + err = iucv_path_sever(path, user_data); + iucv_path_free(path); + iucv_sock_kill(nsk); + goto fail; + } + + iucv_accept_enqueue(sk, nsk); + + /* Wake up accept */ + nsk->sk_state = IUCV_CONNECTED; + sk->sk_data_ready(sk, 1); + err = 0; +fail: + bh_unlock_sock(sk); + return 0; +} + +static void iucv_callback_connack(struct iucv_path *path, u8 ipuser[16]) +{ + struct sock *sk = path->private; + + sk->sk_state = IUCV_CONNECTED; + sk->sk_state_change(sk); +} + +static void iucv_callback_rx(struct iucv_path *path, struct iucv_message *msg) +{ + struct sock *sk = path->private; + struct iucv_sock *iucv = iucv_sk(sk); + struct sk_buff *skb; + struct sock_msg_q *save_msg; + int len; + + if (sk->sk_shutdown & RCV_SHUTDOWN) { + iucv_message_reject(path, msg); + return; + } + + spin_lock(&iucv->message_q.lock); + + if (!list_empty(&iucv->message_q.list) || + !skb_queue_empty(&iucv->backlog_skb_q)) + goto save_message; + + len = atomic_read(&sk->sk_rmem_alloc); + len += iucv_msg_length(msg) + sizeof(struct sk_buff); + if (len > sk->sk_rcvbuf) + goto save_message; + + skb = alloc_skb(iucv_msg_length(msg), GFP_ATOMIC | GFP_DMA); + if (!skb) + goto save_message; + + iucv_process_message(sk, skb, path, msg); + goto out_unlock; + +save_message: + save_msg = kzalloc(sizeof(struct sock_msg_q), GFP_ATOMIC | GFP_DMA); + if (!save_msg) + goto out_unlock; + save_msg->path = path; + save_msg->msg = *msg; + + list_add_tail(&save_msg->list, &iucv->message_q.list); + +out_unlock: + spin_unlock(&iucv->message_q.lock); +} + +static void iucv_callback_txdone(struct iucv_path *path, + struct iucv_message *msg) +{ + struct sock *sk = path->private; + struct sk_buff *this = NULL; + struct sk_buff_head *list = &iucv_sk(sk)->send_skb_q; + struct sk_buff *list_skb = list->next; + unsigned long flags; + + if (!skb_queue_empty(list)) { + spin_lock_irqsave(&list->lock, flags); + + while (list_skb != (struct sk_buff *)list) { + if (!memcmp(&msg->tag, CB_TAG(list_skb), CB_TAG_LEN)) { + this = list_skb; + break; + } + list_skb = list_skb->next; + } + if (this) + __skb_unlink(this, list); + + spin_unlock_irqrestore(&list->lock, flags); + + if (this) { + kfree_skb(this); + /* wake up any process waiting for 
sending */ + iucv_sock_wake_msglim(sk); + } + } + BUG_ON(!this); + + if (sk->sk_state == IUCV_CLOSING) { + if (skb_queue_empty(&iucv_sk(sk)->send_skb_q)) { + sk->sk_state = IUCV_CLOSED; + sk->sk_state_change(sk); + } + } + +} + +static void iucv_callback_connrej(struct iucv_path *path, u8 ipuser[16]) +{ + struct sock *sk = path->private; + + if (!list_empty(&iucv_sk(sk)->accept_q)) + sk->sk_state = IUCV_SEVERED; + else + sk->sk_state = IUCV_DISCONN; + + sk->sk_state_change(sk); +} + +/* called if the other communication side shuts down its RECV direction; + * in turn, the callback sets SEND_SHUTDOWN to disable sending of data. + */ +static void iucv_callback_shutdown(struct iucv_path *path, u8 ipuser[16]) +{ + struct sock *sk = path->private; + + bh_lock_sock(sk); + if (sk->sk_state != IUCV_CLOSED) { + sk->sk_shutdown |= SEND_SHUTDOWN; + sk->sk_state_change(sk); + } + bh_unlock_sock(sk); +} + +static const struct proto_ops iucv_sock_ops = { + .family = PF_IUCV, + .owner = THIS_MODULE, + .release = iucv_sock_release, + .bind = iucv_sock_bind, + .connect = iucv_sock_connect, + .listen = iucv_sock_listen, + .accept = iucv_sock_accept, + .getname = iucv_sock_getname, + .sendmsg = iucv_sock_sendmsg, + .recvmsg = iucv_sock_recvmsg, + .poll = iucv_sock_poll, + .ioctl = sock_no_ioctl, + .mmap = sock_no_mmap, + .socketpair = sock_no_socketpair, + .shutdown = iucv_sock_shutdown, + .setsockopt = iucv_sock_setsockopt, + .getsockopt = iucv_sock_getsockopt, +}; + +static const struct net_proto_family iucv_sock_family_ops = { + .family = AF_IUCV, + .owner = THIS_MODULE, + .create = iucv_sock_create, +}; + +static int __init afiucv_init(void) +{ + int err; + + if (!MACHINE_IS_VM) { + pr_err("The af_iucv module cannot be loaded" + " without z/VM\n"); + err = -EPROTONOSUPPORT; + goto out; + } + cpcmd("QUERY USERID", iucv_userid, sizeof(iucv_userid), &err); + if (unlikely(err)) { + WARN_ON(err); + err = -EPROTONOSUPPORT; + goto out; + } + + err = iucv_register(&af_iucv_handler, 0); + if (err) + goto out; + err = proto_register(&iucv_proto, 0); + if (err) + goto out_iucv; + err = sock_register(&iucv_sock_family_ops); + if (err) + goto out_proto; + /* establish dummy device */ + err = driver_register(&af_iucv_driver); + if (err) + goto out_sock; + af_iucv_dev = kzalloc(sizeof(struct device), GFP_KERNEL); + if (!af_iucv_dev) { + err = -ENOMEM; + goto out_driver; + } + dev_set_name(af_iucv_dev, "af_iucv"); + af_iucv_dev->bus = &iucv_bus; + af_iucv_dev->parent = iucv_root; + af_iucv_dev->release = (void (*)(struct device *))kfree; + af_iucv_dev->driver = &af_iucv_driver; + err = device_register(af_iucv_dev); + if (err) + goto out_driver; + + return 0; + +out_driver: + driver_unregister(&af_iucv_driver); +out_sock: + sock_unregister(PF_IUCV); +out_proto: + proto_unregister(&iucv_proto); +out_iucv: + iucv_unregister(&af_iucv_handler, 0); +out: + return err; +} + +static void __exit afiucv_exit(void) +{ + device_unregister(af_iucv_dev); + driver_unregister(&af_iucv_driver); + sock_unregister(PF_IUCV); + proto_unregister(&iucv_proto); + iucv_unregister(&af_iucv_handler, 0); +} + +module_init(afiucv_init); +module_exit(afiucv_exit); + +MODULE_AUTHOR("Jennifer Hunt "); +MODULE_DESCRIPTION("IUCV Sockets ver " VERSION); +MODULE_VERSION(VERSION); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_IUCV); diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c new file mode 100644 index 00000000..7f912491 --- /dev/null +++ b/net/iucv/iucv.c @@ -0,0 +1,2098 @@ +/* + * IUCV base infrastructure. + * + * Copyright IBM Corp. 
2001, 2009 + * + * Author(s): + * Original source: + * Alan Altmark (Alan_Altmark@us.ibm.com) Sept. 2000 + * Xenia Tkatschow (xenia@us.ibm.com) + * 2Gb awareness and general cleanup: + * Fritz Elfert (elfert@de.ibm.com, felfert@millenux.com) + * Rewritten for af_iucv: + * Martin Schwidefsky + * PM functions: + * Ursula Braun (ursula.braun@de.ibm.com) + * + * Documentation used: + * The original source + * CP Programming Service, IBM document # SC24-5760 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define KMSG_COMPONENT "iucv" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * FLAGS: + * All flags are defined in the field IPFLAGS1 of each function + * and can be found in CP Programming Services. + * IPSRCCLS - Indicates you have specified a source class. + * IPTRGCLS - Indicates you have specified a target class. + * IPFGPID - Indicates you have specified a pathid. + * IPFGMID - Indicates you have specified a message ID. + * IPNORPY - Indicates a one-way message. No reply expected. + * IPALL - Indicates that all paths are affected. + */ +#define IUCV_IPSRCCLS 0x01 +#define IUCV_IPTRGCLS 0x01 +#define IUCV_IPFGPID 0x02 +#define IUCV_IPFGMID 0x04 +#define IUCV_IPNORPY 0x10 +#define IUCV_IPALL 0x80 + +static int iucv_bus_match(struct device *dev, struct device_driver *drv) +{ + return 0; +} + +enum iucv_pm_states { + IUCV_PM_INITIAL = 0, + IUCV_PM_FREEZING = 1, + IUCV_PM_THAWING = 2, + IUCV_PM_RESTORING = 3, +}; +static enum iucv_pm_states iucv_pm_state; + +static int iucv_pm_prepare(struct device *); +static void iucv_pm_complete(struct device *); +static int iucv_pm_freeze(struct device *); +static int iucv_pm_thaw(struct device *); +static int iucv_pm_restore(struct device *); + +static const struct dev_pm_ops iucv_pm_ops = { + .prepare = iucv_pm_prepare, + .complete = iucv_pm_complete, + .freeze = iucv_pm_freeze, + .thaw = iucv_pm_thaw, + .restore = iucv_pm_restore, +}; + +struct bus_type iucv_bus = { + .name = "iucv", + .match = iucv_bus_match, + .pm = &iucv_pm_ops, +}; +EXPORT_SYMBOL(iucv_bus); + +struct device *iucv_root; +EXPORT_SYMBOL(iucv_root); + +static int iucv_available; + +/* General IUCV interrupt structure */ +struct iucv_irq_data { + u16 ippathid; + u8 ipflags1; + u8 iptype; + u32 res2[8]; +}; + +struct iucv_irq_list { + struct list_head list; + struct iucv_irq_data data; +}; + +static struct iucv_irq_data *iucv_irq_data[NR_CPUS]; +static cpumask_t iucv_buffer_cpumask = { CPU_BITS_NONE }; +static cpumask_t iucv_irq_cpumask = { CPU_BITS_NONE }; + +/* + * Queue of interrupt buffers lock for delivery via the tasklet + * (fast but can't call smp_call_function). 
+ */ +static LIST_HEAD(iucv_task_queue); + +/* + * The tasklet for fast delivery of iucv interrupts. + */ +static void iucv_tasklet_fn(unsigned long); +static DECLARE_TASKLET(iucv_tasklet, iucv_tasklet_fn,0); + +/* + * Queue of interrupt buffers for delivery via a work queue + * (slower but can call smp_call_function). + */ +static LIST_HEAD(iucv_work_queue); + +/* + * The work element to deliver path pending interrupts. + */ +static void iucv_work_fn(struct work_struct *work); +static DECLARE_WORK(iucv_work, iucv_work_fn); + +/* + * Spinlock protecting task and work queue. + */ +static DEFINE_SPINLOCK(iucv_queue_lock); + +enum iucv_command_codes { + IUCV_QUERY = 0, + IUCV_RETRIEVE_BUFFER = 2, + IUCV_SEND = 4, + IUCV_RECEIVE = 5, + IUCV_REPLY = 6, + IUCV_REJECT = 8, + IUCV_PURGE = 9, + IUCV_ACCEPT = 10, + IUCV_CONNECT = 11, + IUCV_DECLARE_BUFFER = 12, + IUCV_QUIESCE = 13, + IUCV_RESUME = 14, + IUCV_SEVER = 15, + IUCV_SETMASK = 16, + IUCV_SETCONTROLMASK = 17, +}; + +/* + * Error messages that are used with the iucv_sever function. They get + * converted to EBCDIC. + */ +static char iucv_error_no_listener[16] = "NO LISTENER"; +static char iucv_error_no_memory[16] = "NO MEMORY"; +static char iucv_error_pathid[16] = "INVALID PATHID"; + +/* + * iucv_handler_list: List of registered handlers. + */ +static LIST_HEAD(iucv_handler_list); + +/* + * iucv_path_table: an array of iucv_path structures. + */ +static struct iucv_path **iucv_path_table; +static unsigned long iucv_max_pathid; + +/* + * iucv_lock: spinlock protecting iucv_handler_list and iucv_pathid_table + */ +static DEFINE_SPINLOCK(iucv_table_lock); + +/* + * iucv_active_cpu: contains the number of the cpu executing the tasklet + * or the work handler. Needed for iucv_path_sever called from tasklet. + */ +static int iucv_active_cpu = -1; + +/* + * Mutex and wait queue for iucv_register/iucv_unregister. + */ +static DEFINE_MUTEX(iucv_register_mutex); + +/* + * Counter for number of non-smp capable handlers. + */ +static int iucv_nonsmp_handler; + +/* + * IUCV control data structure. Used by iucv_path_accept, iucv_path_connect, + * iucv_path_quiesce and iucv_path_sever. + */ +struct iucv_cmd_control { + u16 ippathid; + u8 ipflags1; + u8 iprcode; + u16 ipmsglim; + u16 res1; + u8 ipvmid[8]; + u8 ipuser[16]; + u8 iptarget[8]; +} __attribute__ ((packed,aligned(8))); + +/* + * Data in parameter list iucv structure. Used by iucv_message_send, + * iucv_message_send2way and iucv_message_reply. + */ +struct iucv_cmd_dpl { + u16 ippathid; + u8 ipflags1; + u8 iprcode; + u32 ipmsgid; + u32 iptrgcls; + u8 iprmmsg[8]; + u32 ipsrccls; + u32 ipmsgtag; + u32 ipbfadr2; + u32 ipbfln2f; + u32 res; +} __attribute__ ((packed,aligned(8))); + +/* + * Data in buffer iucv structure. Used by iucv_message_receive, + * iucv_message_reject, iucv_message_send, iucv_message_send2way + * and iucv_declare_cpu. + */ +struct iucv_cmd_db { + u16 ippathid; + u8 ipflags1; + u8 iprcode; + u32 ipmsgid; + u32 iptrgcls; + u32 ipbfadr1; + u32 ipbfln1f; + u32 ipsrccls; + u32 ipmsgtag; + u32 ipbfadr2; + u32 ipbfln2f; + u32 res; +} __attribute__ ((packed,aligned(8))); + +/* + * Purge message iucv structure. Used by iucv_message_purge. + */ +struct iucv_cmd_purge { + u16 ippathid; + u8 ipflags1; + u8 iprcode; + u32 ipmsgid; + u8 ipaudit[3]; + u8 res1[5]; + u32 res2; + u32 ipsrccls; + u32 ipmsgtag; + u32 res3[3]; +} __attribute__ ((packed,aligned(8))); + +/* + * Set mask iucv structure. Used by iucv_enable_cpu. 
+ */ +struct iucv_cmd_set_mask { + u8 ipmask; + u8 res1[2]; + u8 iprcode; + u32 res2[9]; +} __attribute__ ((packed,aligned(8))); + +union iucv_param { + struct iucv_cmd_control ctrl; + struct iucv_cmd_dpl dpl; + struct iucv_cmd_db db; + struct iucv_cmd_purge purge; + struct iucv_cmd_set_mask set_mask; +}; + +/* + * Anchor for per-cpu IUCV command parameter block. + */ +static union iucv_param *iucv_param[NR_CPUS]; +static union iucv_param *iucv_param_irq[NR_CPUS]; + +/** + * iucv_call_b2f0 + * @code: identifier of IUCV call to CP. + * @parm: pointer to a struct iucv_parm block + * + * Calls CP to execute IUCV commands. + * + * Returns the result of the CP IUCV call. + */ +static inline int iucv_call_b2f0(int command, union iucv_param *parm) +{ + register unsigned long reg0 asm ("0"); + register unsigned long reg1 asm ("1"); + int ccode; + + reg0 = command; + reg1 = virt_to_phys(parm); + asm volatile( + " .long 0xb2f01000\n" + " ipm %0\n" + " srl %0,28\n" + : "=d" (ccode), "=m" (*parm), "+d" (reg0), "+a" (reg1) + : "m" (*parm) : "cc"); + return (ccode == 1) ? parm->ctrl.iprcode : ccode; +} + +/** + * iucv_query_maxconn + * + * Determines the maximum number of connections that may be established. + * + * Returns the maximum number of connections or -EPERM is IUCV is not + * available. + */ +static int iucv_query_maxconn(void) +{ + register unsigned long reg0 asm ("0"); + register unsigned long reg1 asm ("1"); + void *param; + int ccode; + + param = kzalloc(sizeof(union iucv_param), GFP_KERNEL|GFP_DMA); + if (!param) + return -ENOMEM; + reg0 = IUCV_QUERY; + reg1 = (unsigned long) param; + asm volatile ( + " .long 0xb2f01000\n" + " ipm %0\n" + " srl %0,28\n" + : "=d" (ccode), "+d" (reg0), "+d" (reg1) : : "cc"); + if (ccode == 0) + iucv_max_pathid = reg1; + kfree(param); + return ccode ? -EPERM : 0; +} + +/** + * iucv_allow_cpu + * @data: unused + * + * Allow iucv interrupts on this cpu. + */ +static void iucv_allow_cpu(void *data) +{ + int cpu = smp_processor_id(); + union iucv_param *parm; + + /* + * Enable all iucv interrupts. + * ipmask contains bits for the different interrupts + * 0x80 - Flag to allow nonpriority message pending interrupts + * 0x40 - Flag to allow priority message pending interrupts + * 0x20 - Flag to allow nonpriority message completion interrupts + * 0x10 - Flag to allow priority message completion interrupts + * 0x08 - Flag to allow IUCV control interrupts + */ + parm = iucv_param_irq[cpu]; + memset(parm, 0, sizeof(union iucv_param)); + parm->set_mask.ipmask = 0xf8; + iucv_call_b2f0(IUCV_SETMASK, parm); + + /* + * Enable all iucv control interrupts. + * ipmask contains bits for the different interrupts + * 0x80 - Flag to allow pending connections interrupts + * 0x40 - Flag to allow connection complete interrupts + * 0x20 - Flag to allow connection severed interrupts + * 0x10 - Flag to allow connection quiesced interrupts + * 0x08 - Flag to allow connection resumed interrupts + */ + memset(parm, 0, sizeof(union iucv_param)); + parm->set_mask.ipmask = 0xf8; + iucv_call_b2f0(IUCV_SETCONTROLMASK, parm); + /* Set indication that iucv interrupts are allowed for this cpu. */ + cpumask_set_cpu(cpu, &iucv_irq_cpumask); +} + +/** + * iucv_block_cpu + * @data: unused + * + * Block iucv interrupts on this cpu. + */ +static void iucv_block_cpu(void *data) +{ + int cpu = smp_processor_id(); + union iucv_param *parm; + + /* Disable all iucv interrupts. 
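The 0xf8 written by iucv_allow_cpu() above is simply the OR of the five interrupt classes listed in its comments. Spelled out with illustrative names (these identifiers do not exist in the kernel; a sketch for readability only):

/* Illustrative names only; iucv.c itself uses the literal 0xf8. */
enum {
        IPMASK_NONPRIO_MSG_PENDING  = 0x80,
        IPMASK_PRIO_MSG_PENDING     = 0x40,
        IPMASK_NONPRIO_MSG_COMPLETE = 0x20,
        IPMASK_PRIO_MSG_COMPLETE    = 0x10,
        IPMASK_CONTROL              = 0x08,
};

#define IPMASK_ALL (IPMASK_NONPRIO_MSG_PENDING | IPMASK_PRIO_MSG_PENDING | \
                    IPMASK_NONPRIO_MSG_COMPLETE | IPMASK_PRIO_MSG_COMPLETE | \
                    IPMASK_CONTROL) /* == 0xf8 */

The IUCV_SETCONTROLMASK call reuses the same 0xf8, with the control-interrupt bit meanings given in the second comment block.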
*/ + parm = iucv_param_irq[cpu]; + memset(parm, 0, sizeof(union iucv_param)); + iucv_call_b2f0(IUCV_SETMASK, parm); + + /* Clear indication that iucv interrupts are allowed for this cpu. */ + cpumask_clear_cpu(cpu, &iucv_irq_cpumask); +} + +/** + * iucv_block_cpu_almost + * @data: unused + * + * Allow connection-severed interrupts only on this cpu. + */ +static void iucv_block_cpu_almost(void *data) +{ + int cpu = smp_processor_id(); + union iucv_param *parm; + + /* Allow iucv control interrupts only */ + parm = iucv_param_irq[cpu]; + memset(parm, 0, sizeof(union iucv_param)); + parm->set_mask.ipmask = 0x08; + iucv_call_b2f0(IUCV_SETMASK, parm); + /* Allow iucv-severed interrupt only */ + memset(parm, 0, sizeof(union iucv_param)); + parm->set_mask.ipmask = 0x20; + iucv_call_b2f0(IUCV_SETCONTROLMASK, parm); + + /* Clear indication that iucv interrupts are allowed for this cpu. */ + cpumask_clear_cpu(cpu, &iucv_irq_cpumask); +} + +/** + * iucv_declare_cpu + * @data: unused + * + * Declare a interrupt buffer on this cpu. + */ +static void iucv_declare_cpu(void *data) +{ + int cpu = smp_processor_id(); + union iucv_param *parm; + int rc; + + if (cpumask_test_cpu(cpu, &iucv_buffer_cpumask)) + return; + + /* Declare interrupt buffer. */ + parm = iucv_param_irq[cpu]; + memset(parm, 0, sizeof(union iucv_param)); + parm->db.ipbfadr1 = virt_to_phys(iucv_irq_data[cpu]); + rc = iucv_call_b2f0(IUCV_DECLARE_BUFFER, parm); + if (rc) { + char *err = "Unknown"; + switch (rc) { + case 0x03: + err = "Directory error"; + break; + case 0x0a: + err = "Invalid length"; + break; + case 0x13: + err = "Buffer already exists"; + break; + case 0x3e: + err = "Buffer overlap"; + break; + case 0x5c: + err = "Paging or storage error"; + break; + } + pr_warning("Defining an interrupt buffer on CPU %i" + " failed with 0x%02x (%s)\n", cpu, rc, err); + return; + } + + /* Set indication that an iucv buffer exists for this cpu. */ + cpumask_set_cpu(cpu, &iucv_buffer_cpumask); + + if (iucv_nonsmp_handler == 0 || cpumask_empty(&iucv_irq_cpumask)) + /* Enable iucv interrupts on this cpu. */ + iucv_allow_cpu(NULL); + else + /* Disable iucv interrupts on this cpu. */ + iucv_block_cpu(NULL); +} + +/** + * iucv_retrieve_cpu + * @data: unused + * + * Retrieve interrupt buffer on this cpu. + */ +static void iucv_retrieve_cpu(void *data) +{ + int cpu = smp_processor_id(); + union iucv_param *parm; + + if (!cpumask_test_cpu(cpu, &iucv_buffer_cpumask)) + return; + + /* Block iucv interrupts. */ + iucv_block_cpu(NULL); + + /* Retrieve interrupt buffer. */ + parm = iucv_param_irq[cpu]; + iucv_call_b2f0(IUCV_RETRIEVE_BUFFER, parm); + + /* Clear indication that an iucv buffer exists for this cpu. */ + cpumask_clear_cpu(cpu, &iucv_buffer_cpumask); +} + +/** + * iucv_setmask_smp + * + * Allow iucv interrupts on all cpus. + */ +static void iucv_setmask_mp(void) +{ + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) + /* Enable all cpus with a declared buffer. */ + if (cpumask_test_cpu(cpu, &iucv_buffer_cpumask) && + !cpumask_test_cpu(cpu, &iucv_irq_cpumask)) + smp_call_function_single(cpu, iucv_allow_cpu, + NULL, 1); + put_online_cpus(); +} + +/** + * iucv_setmask_up + * + * Allow iucv interrupts on a single cpu. + */ +static void iucv_setmask_up(void) +{ + cpumask_t cpumask; + int cpu; + + /* Disable all cpu but the first in cpu_irq_cpumask. 
*/ + cpumask_copy(&cpumask, &iucv_irq_cpumask); + cpumask_clear_cpu(cpumask_first(&iucv_irq_cpumask), &cpumask); + for_each_cpu(cpu, &cpumask) + smp_call_function_single(cpu, iucv_block_cpu, NULL, 1); +} + +/** + * iucv_enable + * + * This function makes iucv ready for use. It allocates the pathid + * table, declares an iucv interrupt buffer and enables the iucv + * interrupts. Called when the first user has registered an iucv + * handler. + */ +static int iucv_enable(void) +{ + size_t alloc_size; + int cpu, rc; + + get_online_cpus(); + rc = -ENOMEM; + alloc_size = iucv_max_pathid * sizeof(struct iucv_path); + iucv_path_table = kzalloc(alloc_size, GFP_KERNEL); + if (!iucv_path_table) + goto out; + /* Declare per cpu buffers. */ + rc = -EIO; + for_each_online_cpu(cpu) + smp_call_function_single(cpu, iucv_declare_cpu, NULL, 1); + if (cpumask_empty(&iucv_buffer_cpumask)) + /* No cpu could declare an iucv buffer. */ + goto out; + put_online_cpus(); + return 0; +out: + kfree(iucv_path_table); + iucv_path_table = NULL; + put_online_cpus(); + return rc; +} + +/** + * iucv_disable + * + * This function shuts down iucv. It disables iucv interrupts, retrieves + * the iucv interrupt buffer and frees the pathid table. Called after the + * last user unregister its iucv handler. + */ +static void iucv_disable(void) +{ + get_online_cpus(); + on_each_cpu(iucv_retrieve_cpu, NULL, 1); + kfree(iucv_path_table); + iucv_path_table = NULL; + put_online_cpus(); +} + +static int __cpuinit iucv_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + cpumask_t cpumask; + long cpu = (long) hcpu; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data), + GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); + if (!iucv_irq_data[cpu]) + return notifier_from_errno(-ENOMEM); + + iucv_param[cpu] = kmalloc_node(sizeof(union iucv_param), + GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); + if (!iucv_param[cpu]) { + kfree(iucv_irq_data[cpu]); + iucv_irq_data[cpu] = NULL; + return notifier_from_errno(-ENOMEM); + } + iucv_param_irq[cpu] = kmalloc_node(sizeof(union iucv_param), + GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); + if (!iucv_param_irq[cpu]) { + kfree(iucv_param[cpu]); + iucv_param[cpu] = NULL; + kfree(iucv_irq_data[cpu]); + iucv_irq_data[cpu] = NULL; + return notifier_from_errno(-ENOMEM); + } + break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + kfree(iucv_param_irq[cpu]); + iucv_param_irq[cpu] = NULL; + kfree(iucv_param[cpu]); + iucv_param[cpu] = NULL; + kfree(iucv_irq_data[cpu]); + iucv_irq_data[cpu] = NULL; + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + if (!iucv_path_table) + break; + smp_call_function_single(cpu, iucv_declare_cpu, NULL, 1); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + if (!iucv_path_table) + break; + cpumask_copy(&cpumask, &iucv_buffer_cpumask); + cpumask_clear_cpu(cpu, &cpumask); + if (cpumask_empty(&cpumask)) + /* Can't offline last IUCV enabled cpu. 
*/ + return notifier_from_errno(-EINVAL); + smp_call_function_single(cpu, iucv_retrieve_cpu, NULL, 1); + if (cpumask_empty(&iucv_irq_cpumask)) + smp_call_function_single( + cpumask_first(&iucv_buffer_cpumask), + iucv_allow_cpu, NULL, 1); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __refdata iucv_cpu_notifier = { + .notifier_call = iucv_cpu_notify, +}; + +/** + * iucv_sever_pathid + * @pathid: path identification number. + * @userdata: 16-bytes of user data. + * + * Sever an iucv path to free up the pathid. Used internally. + */ +static int iucv_sever_pathid(u16 pathid, u8 userdata[16]) +{ + union iucv_param *parm; + + parm = iucv_param_irq[smp_processor_id()]; + memset(parm, 0, sizeof(union iucv_param)); + if (userdata) + memcpy(parm->ctrl.ipuser, userdata, sizeof(parm->ctrl.ipuser)); + parm->ctrl.ippathid = pathid; + return iucv_call_b2f0(IUCV_SEVER, parm); +} + +/** + * __iucv_cleanup_queue + * @dummy: unused dummy argument + * + * Nop function called via smp_call_function to force work items from + * pending external iucv interrupts to the work queue. + */ +static void __iucv_cleanup_queue(void *dummy) +{ +} + +/** + * iucv_cleanup_queue + * + * Function called after a path has been severed to find all remaining + * work items for the now stale pathid. The caller needs to hold the + * iucv_table_lock. + */ +static void iucv_cleanup_queue(void) +{ + struct iucv_irq_list *p, *n; + + /* + * When a path is severed, the pathid can be reused immediately + * on a iucv connect or a connection pending interrupt. Remove + * all entries from the task queue that refer to a stale pathid + * (iucv_path_table[ix] == NULL). Only then do the iucv connect + * or deliver the connection pending interrupt. To get all the + * pending interrupts force them to the work queue by calling + * an empty function on all cpus. + */ + smp_call_function(__iucv_cleanup_queue, NULL, 1); + spin_lock_irq(&iucv_queue_lock); + list_for_each_entry_safe(p, n, &iucv_task_queue, list) { + /* Remove stale work items from the task queue. */ + if (iucv_path_table[p->data.ippathid] == NULL) { + list_del(&p->list); + kfree(p); + } + } + spin_unlock_irq(&iucv_queue_lock); +} + +/** + * iucv_register: + * @handler: address of iucv handler structure + * @smp: != 0 indicates that the handler can deal with out of order messages + * + * Registers a driver with IUCV. + * + * Returns 0 on success, -ENOMEM if the memory allocation for the pathid + * table failed, or -EIO if IUCV_DECLARE_BUFFER failed on all cpus. + */ +int iucv_register(struct iucv_handler *handler, int smp) +{ + int rc; + + if (!iucv_available) + return -ENOSYS; + mutex_lock(&iucv_register_mutex); + if (!smp) + iucv_nonsmp_handler++; + if (list_empty(&iucv_handler_list)) { + rc = iucv_enable(); + if (rc) + goto out_mutex; + } else if (!smp && iucv_nonsmp_handler == 1) + iucv_setmask_up(); + INIT_LIST_HEAD(&handler->paths); + + spin_lock_bh(&iucv_table_lock); + list_add_tail(&handler->list, &iucv_handler_list); + spin_unlock_bh(&iucv_table_lock); + rc = 0; +out_mutex: + mutex_unlock(&iucv_register_mutex); + return rc; +} +EXPORT_SYMBOL(iucv_register); + +/** + * iucv_unregister + * @handler: address of iucv handler structure + * @smp: != 0 indicates that the handler can deal with out of order messages + * + * Unregister driver from IUCV. + */ +void iucv_unregister(struct iucv_handler *handler, int smp) +{ + struct iucv_path *p, *n; + + mutex_lock(&iucv_register_mutex); + spin_lock_bh(&iucv_table_lock); + /* Remove handler from the iucv_handler_list. 
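iucv_register() and iucv_unregister() are the entry points for kernel exploiters of this base layer; af_iucv registers itself exactly this way from afiucv_init(). A minimal hypothetical module that accepts every incoming path could look roughly like the sketch below (not part of this patch); the callback prototypes and the smp argument follow the usage visible in af_iucv.

#include <linux/module.h>
#include <net/iucv/iucv.h>

static struct iucv_handler demo_handler;

/* Connection pending: runs from the IUCV work queue with iucv_table_lock
 * held; returning 0 keeps the new path attached to this handler. */
static int demo_path_pending(struct iucv_path *path, u8 ipvmid[8],
                             u8 ipuser[16])
{
        if (path->msglim > 128)         /* optionally lower the msg limit */
                path->msglim = 128;
        return iucv_path_accept(path, &demo_handler, NULL, NULL);
}

static void demo_path_severed(struct iucv_path *path, u8 ipuser[16])
{
        /* Peer went away: give the pathid back and free the path. */
        iucv_path_sever(path, NULL);
        iucv_path_free(path);
}

static void demo_message_pending(struct iucv_path *path,
                                 struct iucv_message *msg)
{
        /* See the receive sketch further below. */
}

static struct iucv_handler demo_handler = {
        .path_pending    = demo_path_pending,
        .path_severed    = demo_path_severed,
        .message_pending = demo_message_pending,
};

static int __init demo_init(void)
{
        /* smp == 0: this handler cannot deal with out-of-order messages,
         * so the base layer keeps IUCV interrupts on a single cpu. */
        return iucv_register(&demo_handler, 0);
}

static void __exit demo_exit(void)
{
        iucv_unregister(&demo_handler, 0);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");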
*/ + list_del_init(&handler->list); + /* Sever all pathids still referring to the handler. */ + list_for_each_entry_safe(p, n, &handler->paths, list) { + iucv_sever_pathid(p->pathid, NULL); + iucv_path_table[p->pathid] = NULL; + list_del(&p->list); + iucv_path_free(p); + } + spin_unlock_bh(&iucv_table_lock); + if (!smp) + iucv_nonsmp_handler--; + if (list_empty(&iucv_handler_list)) + iucv_disable(); + else if (!smp && iucv_nonsmp_handler == 0) + iucv_setmask_mp(); + mutex_unlock(&iucv_register_mutex); +} +EXPORT_SYMBOL(iucv_unregister); + +static int iucv_reboot_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + int i; + + get_online_cpus(); + on_each_cpu(iucv_block_cpu, NULL, 1); + preempt_disable(); + for (i = 0; i < iucv_max_pathid; i++) { + if (iucv_path_table[i]) + iucv_sever_pathid(i, NULL); + } + preempt_enable(); + put_online_cpus(); + iucv_disable(); + return NOTIFY_DONE; +} + +static struct notifier_block iucv_reboot_notifier = { + .notifier_call = iucv_reboot_event, +}; + +/** + * iucv_path_accept + * @path: address of iucv path structure + * @handler: address of iucv handler structure + * @userdata: 16 bytes of data reflected to the communication partner + * @private: private data passed to interrupt handlers for this path + * + * This function is issued after the user received a connection pending + * external interrupt and now wishes to complete the IUCV communication path. + * + * Returns the result of the CP IUCV call. + */ +int iucv_path_accept(struct iucv_path *path, struct iucv_handler *handler, + u8 userdata[16], void *private) +{ + union iucv_param *parm; + int rc; + + local_bh_disable(); + if (cpumask_empty(&iucv_buffer_cpumask)) { + rc = -EIO; + goto out; + } + /* Prepare parameter block. */ + parm = iucv_param[smp_processor_id()]; + memset(parm, 0, sizeof(union iucv_param)); + parm->ctrl.ippathid = path->pathid; + parm->ctrl.ipmsglim = path->msglim; + if (userdata) + memcpy(parm->ctrl.ipuser, userdata, sizeof(parm->ctrl.ipuser)); + parm->ctrl.ipflags1 = path->flags; + + rc = iucv_call_b2f0(IUCV_ACCEPT, parm); + if (!rc) { + path->private = private; + path->msglim = parm->ctrl.ipmsglim; + path->flags = parm->ctrl.ipflags1; + } +out: + local_bh_enable(); + return rc; +} +EXPORT_SYMBOL(iucv_path_accept); + +/** + * iucv_path_connect + * @path: address of iucv path structure + * @handler: address of iucv handler structure + * @userid: 8-byte user identification + * @system: 8-byte target system identification + * @userdata: 16 bytes of data reflected to the communication partner + * @private: private data passed to interrupt handlers for this path + * + * This function establishes an IUCV path. Although the connect may complete + * successfully, you are not able to use the path until you receive an IUCV + * Connection Complete external interrupt. + * + * Returns the result of the CP IUCV call. 
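For outgoing connections the caller allocates a path and hands it to iucv_path_connect(), documented above; the path only becomes usable once the handler's path_complete callback has fired. A hypothetical sketch, using the iucv_path_alloc()/iucv_path_free() helpers that the rest of this patch relies on (guest name, message limit and error handling are made up):

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <net/iucv/iucv.h>

/* Hypothetical: open a path to an application in the z/VM guest "PEERVM".
 * User ids are 8 bytes, blank padded; iucv_path_connect() converts the id
 * to EBCDIC and upper case itself. */
static struct iucv_path *demo_connect(struct iucv_handler *handler, void *priv)
{
        struct iucv_path *path;
        u8 userid[8];
        int rc;

        memset(userid, ' ', sizeof(userid));
        memcpy(userid, "PEERVM", 6);

        path = iucv_path_alloc(10 /* msglim */, 0 /* flags */, GFP_KERNEL);
        if (!path)
                return ERR_PTR(-ENOMEM);

        rc = iucv_path_connect(path, handler, userid, NULL /* system */,
                               NULL /* user data */, priv);
        if (rc) {
                iucv_path_free(path);
                return ERR_PTR(rc);
        }
        /* Do not send yet: wait for the path_complete callback. */
        return path;
}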
+ */ +int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, + u8 userid[8], u8 system[8], u8 userdata[16], + void *private) +{ + union iucv_param *parm; + int rc; + + spin_lock_bh(&iucv_table_lock); + iucv_cleanup_queue(); + if (cpumask_empty(&iucv_buffer_cpumask)) { + rc = -EIO; + goto out; + } + parm = iucv_param[smp_processor_id()]; + memset(parm, 0, sizeof(union iucv_param)); + parm->ctrl.ipmsglim = path->msglim; + parm->ctrl.ipflags1 = path->flags; + if (userid) { + memcpy(parm->ctrl.ipvmid, userid, sizeof(parm->ctrl.ipvmid)); + ASCEBC(parm->ctrl.ipvmid, sizeof(parm->ctrl.ipvmid)); + EBC_TOUPPER(parm->ctrl.ipvmid, sizeof(parm->ctrl.ipvmid)); + } + if (system) { + memcpy(parm->ctrl.iptarget, system, + sizeof(parm->ctrl.iptarget)); + ASCEBC(parm->ctrl.iptarget, sizeof(parm->ctrl.iptarget)); + EBC_TOUPPER(parm->ctrl.iptarget, sizeof(parm->ctrl.iptarget)); + } + if (userdata) + memcpy(parm->ctrl.ipuser, userdata, sizeof(parm->ctrl.ipuser)); + + rc = iucv_call_b2f0(IUCV_CONNECT, parm); + if (!rc) { + if (parm->ctrl.ippathid < iucv_max_pathid) { + path->pathid = parm->ctrl.ippathid; + path->msglim = parm->ctrl.ipmsglim; + path->flags = parm->ctrl.ipflags1; + path->handler = handler; + path->private = private; + list_add_tail(&path->list, &handler->paths); + iucv_path_table[path->pathid] = path; + } else { + iucv_sever_pathid(parm->ctrl.ippathid, + iucv_error_pathid); + rc = -EIO; + } + } +out: + spin_unlock_bh(&iucv_table_lock); + return rc; +} +EXPORT_SYMBOL(iucv_path_connect); + +/** + * iucv_path_quiesce: + * @path: address of iucv path structure + * @userdata: 16 bytes of data reflected to the communication partner + * + * This function temporarily suspends incoming messages on an IUCV path. + * You can later reactivate the path by invoking the iucv_resume function. + * + * Returns the result from the CP IUCV call. + */ +int iucv_path_quiesce(struct iucv_path *path, u8 userdata[16]) +{ + union iucv_param *parm; + int rc; + + local_bh_disable(); + if (cpumask_empty(&iucv_buffer_cpumask)) { + rc = -EIO; + goto out; + } + parm = iucv_param[smp_processor_id()]; + memset(parm, 0, sizeof(union iucv_param)); + if (userdata) + memcpy(parm->ctrl.ipuser, userdata, sizeof(parm->ctrl.ipuser)); + parm->ctrl.ippathid = path->pathid; + rc = iucv_call_b2f0(IUCV_QUIESCE, parm); +out: + local_bh_enable(); + return rc; +} +EXPORT_SYMBOL(iucv_path_quiesce); + +/** + * iucv_path_resume: + * @path: address of iucv path structure + * @userdata: 16 bytes of data reflected to the communication partner + * + * This function resumes incoming messages on an IUCV path that has + * been stopped with iucv_path_quiesce. + * + * Returns the result from the CP IUCV call. + */ +int iucv_path_resume(struct iucv_path *path, u8 userdata[16]) +{ + union iucv_param *parm; + int rc; + + local_bh_disable(); + if (cpumask_empty(&iucv_buffer_cpumask)) { + rc = -EIO; + goto out; + } + parm = iucv_param[smp_processor_id()]; + memset(parm, 0, sizeof(union iucv_param)); + if (userdata) + memcpy(parm->ctrl.ipuser, userdata, sizeof(parm->ctrl.ipuser)); + parm->ctrl.ippathid = path->pathid; + rc = iucv_call_b2f0(IUCV_RESUME, parm); +out: + local_bh_enable(); + return rc; +} + +/** + * iucv_path_sever + * @path: address of iucv path structure + * @userdata: 16 bytes of data reflected to the communication partner + * + * This function terminates an IUCV path. + * + * Returns the result from the CP IUCV call. 
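Teardown is the mirror image: iucv_path_sever(), documented above, releases the pathid and unlinks the path from its handler, but the owner still has to free the structure itself, just as af_iucv does in iucv_sock_release(). A hypothetical sketch:

#include <net/iucv/iucv.h>

/* Hypothetical teardown helper, callable from process context. */
static void demo_disconnect(struct iucv_path *path)
{
        if (!path)
                return;
        iucv_path_sever(path, NULL);    /* the 16 bytes of user data are optional */
        iucv_path_free(path);
}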
+ */ +int iucv_path_sever(struct iucv_path *path, u8 userdata[16]) +{ + int rc; + + preempt_disable(); + if (cpumask_empty(&iucv_buffer_cpumask)) { + rc = -EIO; + goto out; + } + if (iucv_active_cpu != smp_processor_id()) + spin_lock_bh(&iucv_table_lock); + rc = iucv_sever_pathid(path->pathid, userdata); + iucv_path_table[path->pathid] = NULL; + list_del_init(&path->list); + if (iucv_active_cpu != smp_processor_id()) + spin_unlock_bh(&iucv_table_lock); +out: + preempt_enable(); + return rc; +} +EXPORT_SYMBOL(iucv_path_sever); + +/** + * iucv_message_purge + * @path: address of iucv path structure + * @msg: address of iucv msg structure + * @srccls: source class of message + * + * Cancels a message you have sent. + * + * Returns the result from the CP IUCV call. + */ +int iucv_message_purge(struct iucv_path *path, struct iucv_message *msg, + u32 srccls) +{ + union iucv_param *parm; + int rc; + + local_bh_disable(); + if (cpumask_empty(&iucv_buffer_cpumask)) { + rc = -EIO; + goto out; + } + parm = iucv_param[smp_processor_id()]; + memset(parm, 0, sizeof(union iucv_param)); + parm->purge.ippathid = path->pathid; + parm->purge.ipmsgid = msg->id; + parm->purge.ipsrccls = srccls; + parm->purge.ipflags1 = IUCV_IPSRCCLS | IUCV_IPFGMID | IUCV_IPFGPID; + rc = iucv_call_b2f0(IUCV_PURGE, parm); + if (!rc) { + msg->audit = (*(u32 *) &parm->purge.ipaudit) >> 8; + msg->tag = parm->purge.ipmsgtag; + } +out: + local_bh_enable(); + return rc; +} +EXPORT_SYMBOL(iucv_message_purge); + +/** + * iucv_message_receive_iprmdata + * @path: address of iucv path structure + * @msg: address of iucv msg structure + * @flags: how the message is received (IUCV_IPBUFLST) + * @buffer: address of data buffer or address of struct iucv_array + * @size: length of data buffer + * @residual: + * + * Internal function used by iucv_message_receive and __iucv_message_receive + * to receive RMDATA data stored in struct iucv_message. + */ +static int iucv_message_receive_iprmdata(struct iucv_path *path, + struct iucv_message *msg, + u8 flags, void *buffer, + size_t size, size_t *residual) +{ + struct iucv_array *array; + u8 *rmmsg; + size_t copy; + + /* + * Message is 8 bytes long and has been stored to the + * message descriptor itself. + */ + if (residual) + *residual = abs(size - 8); + rmmsg = msg->rmmsg; + if (flags & IUCV_IPBUFLST) { + /* Copy to struct iucv_array. */ + size = (size < 8) ? size : 8; + for (array = buffer; size > 0; array++) { + copy = min_t(size_t, size, array->length); + memcpy((u8 *)(addr_t) array->address, + rmmsg, copy); + rmmsg += copy; + size -= copy; + } + } else { + /* Copy to direct buffer. */ + memcpy(buffer, rmmsg, min_t(size_t, size, 8)); + } + return 0; +} + +/** + * __iucv_message_receive + * @path: address of iucv path structure + * @msg: address of iucv msg structure + * @flags: how the message is received (IUCV_IPBUFLST) + * @buffer: address of data buffer or address of struct iucv_array + * @size: length of data buffer + * @residual: + * + * This function receives messages that are being sent to you over + * established paths. This function will deal with RMDATA messages + * embedded in struct iucv_message as well. + * + * Locking: no locking + * + * Returns the result from the CP IUCV call. 
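The receive primitives documented above are normally driven from a handler's message_pending callback, exactly as af_iucv does in iucv_process_message(). A hypothetical sketch, filling in the demo_message_pending() stub from the earlier register example; GFP_ATOMIC | GFP_DMA mirrors the af_iucv allocations:

#include <linux/printk.h>
#include <linux/slab.h>
#include <net/iucv/iucv.h>

/* Hypothetical message_pending callback: fetch the announced data.
 * IPRMDATA messages are copied straight out of the message descriptor;
 * everything else triggers a real IUCV RECEIVE. */
static void demo_message_pending(struct iucv_path *path,
                                 struct iucv_message *msg)
{
        size_t residual = 0;
        void *buf;
        int rc;

        buf = kmalloc(msg->length, GFP_ATOMIC | GFP_DMA);
        if (!buf) {
                iucv_message_reject(path, msg);
                return;
        }
        rc = iucv_message_receive(path, msg, msg->flags & IUCV_IPRMDATA,
                                  buf, msg->length, &residual);
        if (!rc)
                pr_debug("received a %u byte message\n", msg->length);
        kfree(buf);
}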
+ */ +int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, + u8 flags, void *buffer, size_t size, size_t *residual) +{ + union iucv_param *parm; + int rc; + + if (msg->flags & IUCV_IPRMDATA) + return iucv_message_receive_iprmdata(path, msg, flags, + buffer, size, residual); + if (cpumask_empty(&iucv_buffer_cpumask)) { + rc = -EIO; + goto out; + } + parm = iucv_param[smp_processor_id()]; + memset(parm, 0, sizeof(union iucv_param)); + parm->db.ipbfadr1 = (u32)(addr_t) buffer; + parm->db.ipbfln1f = (u32) size; + parm->db.ipmsgid = msg->id; + parm->db.ippathid = path->pathid; + parm->db.iptrgcls = msg->class; + parm->db.ipflags1 = (flags | IUCV_IPFGPID | + IUCV_IPFGMID | IUCV_IPTRGCLS); + rc = iucv_call_b2f0(IUCV_RECEIVE, parm); + if (!rc || rc == 5) { + msg->flags = parm->db.ipflags1; + if (residual) + *residual = parm->db.ipbfln1f; + } +out: + return rc; +} +EXPORT_SYMBOL(__iucv_message_receive); + +/** + * iucv_message_receive + * @path: address of iucv path structure + * @msg: address of iucv msg structure + * @flags: how the message is received (IUCV_IPBUFLST) + * @buffer: address of data buffer or address of struct iucv_array + * @size: length of data buffer + * @residual: + * + * This function receives messages that are being sent to you over + * established paths. This function will deal with RMDATA messages + * embedded in struct iucv_message as well. + * + * Locking: local_bh_enable/local_bh_disable + * + * Returns the result from the CP IUCV call. + */ +int iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, + u8 flags, void *buffer, size_t size, size_t *residual) +{ + int rc; + + if (msg->flags & IUCV_IPRMDATA) + return iucv_message_receive_iprmdata(path, msg, flags, + buffer, size, residual); + local_bh_disable(); + rc = __iucv_message_receive(path, msg, flags, buffer, size, residual); + local_bh_enable(); + return rc; +} +EXPORT_SYMBOL(iucv_message_receive); + +/** + * iucv_message_reject + * @path: address of iucv path structure + * @msg: address of iucv msg structure + * + * The reject function refuses a specified message. Between the time you + * are notified of a message and the time that you complete the message, + * the message may be rejected. + * + * Returns the result from the CP IUCV call. + */ +int iucv_message_reject(struct iucv_path *path, struct iucv_message *msg) +{ + union iucv_param *parm; + int rc; + + local_bh_disable(); + if (cpumask_empty(&iucv_buffer_cpumask)) { + rc = -EIO; + goto out; + } + parm = iucv_param[smp_processor_id()]; + memset(parm, 0, sizeof(union iucv_param)); + parm->db.ippathid = path->pathid; + parm->db.ipmsgid = msg->id; + parm->db.iptrgcls = msg->class; + parm->db.ipflags1 = (IUCV_IPTRGCLS | IUCV_IPFGMID | IUCV_IPFGPID); + rc = iucv_call_b2f0(IUCV_REJECT, parm); +out: + local_bh_enable(); + return rc; +} +EXPORT_SYMBOL(iucv_message_reject); + +/** + * iucv_message_reply + * @path: address of iucv path structure + * @msg: address of iucv msg structure + * @flags: how the reply is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST) + * @reply: address of reply data buffer or address of struct iucv_array + * @size: length of reply data buffer + * + * This function responds to the two-way messages that you receive. You + * must identify completely the message to which you wish to reply. ie, + * pathid, msgid, and trgcls. Prmmsg signifies the data is moved into + * the parameter list. + * + * Returns the result from the CP IUCV call. 
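iucv_message_reply(), documented above, answers a two-way message; with IUCV_IPRMDATA a reply of up to 8 bytes travels in the parameter list and no reply buffer address is needed. A hypothetical sketch:

#include <net/iucv/iucv.h>

/* Hypothetical: acknowledge a two-way message with a 4-byte status word.
 * msg must still identify the original message (id, class). */
static int demo_reply(struct iucv_path *path, struct iucv_message *msg,
                      u32 status)
{
        return iucv_message_reply(path, msg, IUCV_IPRMDATA,
                                  &status, sizeof(status));
}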
+ */ +int iucv_message_reply(struct iucv_path *path, struct iucv_message *msg, + u8 flags, void *reply, size_t size) +{ + union iucv_param *parm; + int rc; + + local_bh_disable(); + if (cpumask_empty(&iucv_buffer_cpumask)) { + rc = -EIO; + goto out; + } + parm = iucv_param[smp_processor_id()]; + memset(parm, 0, sizeof(union iucv_param)); + if (flags & IUCV_IPRMDATA) { + parm->dpl.ippathid = path->pathid; + parm->dpl.ipflags1 = flags; + parm->dpl.ipmsgid = msg->id; + parm->dpl.iptrgcls = msg->class; + memcpy(parm->dpl.iprmmsg, reply, min_t(size_t, size, 8)); + } else { + parm->db.ipbfadr1 = (u32)(addr_t) reply; + parm->db.ipbfln1f = (u32) size; + parm->db.ippathid = path->pathid; + parm->db.ipflags1 = flags; + parm->db.ipmsgid = msg->id; + parm->db.iptrgcls = msg->class; + } + rc = iucv_call_b2f0(IUCV_REPLY, parm); +out: + local_bh_enable(); + return rc; +} +EXPORT_SYMBOL(iucv_message_reply); + +/** + * __iucv_message_send + * @path: address of iucv path structure + * @msg: address of iucv msg structure + * @flags: how the message is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST) + * @srccls: source class of message + * @buffer: address of send buffer or address of struct iucv_array + * @size: length of send buffer + * + * This function transmits data to another application. Data to be + * transmitted is in a buffer and this is a one-way message and the + * receiver will not reply to the message. + * + * Locking: no locking + * + * Returns the result from the CP IUCV call. + */ +int __iucv_message_send(struct iucv_path *path, struct iucv_message *msg, + u8 flags, u32 srccls, void *buffer, size_t size) +{ + union iucv_param *parm; + int rc; + + if (cpumask_empty(&iucv_buffer_cpumask)) { + rc = -EIO; + goto out; + } + parm = iucv_param[smp_processor_id()]; + memset(parm, 0, sizeof(union iucv_param)); + if (flags & IUCV_IPRMDATA) { + /* Message of 8 bytes can be placed into the parameter list. */ + parm->dpl.ippathid = path->pathid; + parm->dpl.ipflags1 = flags | IUCV_IPNORPY; + parm->dpl.iptrgcls = msg->class; + parm->dpl.ipsrccls = srccls; + parm->dpl.ipmsgtag = msg->tag; + memcpy(parm->dpl.iprmmsg, buffer, 8); + } else { + parm->db.ipbfadr1 = (u32)(addr_t) buffer; + parm->db.ipbfln1f = (u32) size; + parm->db.ippathid = path->pathid; + parm->db.ipflags1 = flags | IUCV_IPNORPY; + parm->db.iptrgcls = msg->class; + parm->db.ipsrccls = srccls; + parm->db.ipmsgtag = msg->tag; + } + rc = iucv_call_b2f0(IUCV_SEND, parm); + if (!rc) + msg->id = parm->db.ipmsgid; +out: + return rc; +} +EXPORT_SYMBOL(__iucv_message_send); + +/** + * iucv_message_send + * @path: address of iucv path structure + * @msg: address of iucv msg structure + * @flags: how the message is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST) + * @srccls: source class of message + * @buffer: address of send buffer or address of struct iucv_array + * @size: length of send buffer + * + * This function transmits data to another application. Data to be + * transmitted is in a buffer and this is a one-way message and the + * receiver will not reply to the message. + * + * Locking: local_bh_enable/local_bh_disable + * + * Returns the result from the CP IUCV call. 
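iucv_message_send(), documented above, is the one-way transmit primitive: the caller chooses a target class and a tag, and the tag is echoed back by the message-complete interrupt so confirmations can be matched to messages (compare iucv_callback_txdone() in af_iucv earlier in this patch). A hypothetical sketch:

#include <net/iucv/iucv.h>

/* Hypothetical one-way send of an arbitrary buffer. */
static int demo_send(struct iucv_path *path, void *data, size_t len, u32 tag)
{
        struct iucv_message msg = {
                .class = 0,     /* target class, application defined */
                .tag   = tag,   /* echoed in the message_complete callback */
        };

        return iucv_message_send(path, &msg, 0 /* flags */, 0 /* srccls */,
                                 data, len);
}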
+ */ +int iucv_message_send(struct iucv_path *path, struct iucv_message *msg, + u8 flags, u32 srccls, void *buffer, size_t size) +{ + int rc; + + local_bh_disable(); + rc = __iucv_message_send(path, msg, flags, srccls, buffer, size); + local_bh_enable(); + return rc; +} +EXPORT_SYMBOL(iucv_message_send); + +/** + * iucv_message_send2way + * @path: address of iucv path structure + * @msg: address of iucv msg structure + * @flags: how the message is sent and the reply is received + * (IUCV_IPRMDATA, IUCV_IPBUFLST, IUCV_IPPRTY, IUCV_ANSLST) + * @srccls: source class of message + * @buffer: address of send buffer or address of struct iucv_array + * @size: length of send buffer + * @ansbuf: address of answer buffer or address of struct iucv_array + * @asize: size of reply buffer + * + * This function transmits data to another application. Data to be + * transmitted is in a buffer. The receiver of the send is expected to + * reply to the message and a buffer is provided into which IUCV moves + * the reply to this message. + * + * Returns the result from the CP IUCV call. + */ +int iucv_message_send2way(struct iucv_path *path, struct iucv_message *msg, + u8 flags, u32 srccls, void *buffer, size_t size, + void *answer, size_t asize, size_t *residual) +{ + union iucv_param *parm; + int rc; + + local_bh_disable(); + if (cpumask_empty(&iucv_buffer_cpumask)) { + rc = -EIO; + goto out; + } + parm = iucv_param[smp_processor_id()]; + memset(parm, 0, sizeof(union iucv_param)); + if (flags & IUCV_IPRMDATA) { + parm->dpl.ippathid = path->pathid; + parm->dpl.ipflags1 = path->flags; /* priority message */ + parm->dpl.iptrgcls = msg->class; + parm->dpl.ipsrccls = srccls; + parm->dpl.ipmsgtag = msg->tag; + parm->dpl.ipbfadr2 = (u32)(addr_t) answer; + parm->dpl.ipbfln2f = (u32) asize; + memcpy(parm->dpl.iprmmsg, buffer, 8); + } else { + parm->db.ippathid = path->pathid; + parm->db.ipflags1 = path->flags; /* priority message */ + parm->db.iptrgcls = msg->class; + parm->db.ipsrccls = srccls; + parm->db.ipmsgtag = msg->tag; + parm->db.ipbfadr1 = (u32)(addr_t) buffer; + parm->db.ipbfln1f = (u32) size; + parm->db.ipbfadr2 = (u32)(addr_t) answer; + parm->db.ipbfln2f = (u32) asize; + } + rc = iucv_call_b2f0(IUCV_SEND, parm); + if (!rc) + msg->id = parm->db.ipmsgid; +out: + local_bh_enable(); + return rc; +} +EXPORT_SYMBOL(iucv_message_send2way); + +/** + * iucv_path_pending + * @data: Pointer to external interrupt buffer + * + * Process connection pending work item. Called from tasklet while holding + * iucv_table_lock. + */ +struct iucv_path_pending { + u16 ippathid; + u8 ipflags1; + u8 iptype; + u16 ipmsglim; + u16 res1; + u8 ipvmid[8]; + u8 ipuser[16]; + u32 res3; + u8 ippollfg; + u8 res4[3]; +} __packed; + +static void iucv_path_pending(struct iucv_irq_data *data) +{ + struct iucv_path_pending *ipp = (void *) data; + struct iucv_handler *handler; + struct iucv_path *path; + char *error; + + BUG_ON(iucv_path_table[ipp->ippathid]); + /* New pathid, handler found. Create a new path struct. */ + error = iucv_error_no_memory; + path = iucv_path_alloc(ipp->ipmsglim, ipp->ipflags1, GFP_ATOMIC); + if (!path) + goto out_sever; + path->pathid = ipp->ippathid; + iucv_path_table[path->pathid] = path; + EBCASC(ipp->ipvmid, 8); + + /* Call registered handler until one is found that wants the path. */ + list_for_each_entry(handler, &iucv_handler_list, list) { + if (!handler->path_pending) + continue; + /* + * Add path to handler to allow a call to iucv_path_sever + * inside the path_pending function. 
If the handler returns + * an error remove the path from the handler again. + */ + list_add(&path->list, &handler->paths); + path->handler = handler; + if (!handler->path_pending(path, ipp->ipvmid, ipp->ipuser)) + return; + list_del(&path->list); + path->handler = NULL; + } + /* No handler wanted the path. */ + iucv_path_table[path->pathid] = NULL; + iucv_path_free(path); + error = iucv_error_no_listener; +out_sever: + iucv_sever_pathid(ipp->ippathid, error); +} + +/** + * iucv_path_complete + * @data: Pointer to external interrupt buffer + * + * Process connection complete work item. Called from tasklet while holding + * iucv_table_lock. + */ +struct iucv_path_complete { + u16 ippathid; + u8 ipflags1; + u8 iptype; + u16 ipmsglim; + u16 res1; + u8 res2[8]; + u8 ipuser[16]; + u32 res3; + u8 ippollfg; + u8 res4[3]; +} __packed; + +static void iucv_path_complete(struct iucv_irq_data *data) +{ + struct iucv_path_complete *ipc = (void *) data; + struct iucv_path *path = iucv_path_table[ipc->ippathid]; + + if (path) + path->flags = ipc->ipflags1; + if (path && path->handler && path->handler->path_complete) + path->handler->path_complete(path, ipc->ipuser); +} + +/** + * iucv_path_severed + * @data: Pointer to external interrupt buffer + * + * Process connection severed work item. Called from tasklet while holding + * iucv_table_lock. + */ +struct iucv_path_severed { + u16 ippathid; + u8 res1; + u8 iptype; + u32 res2; + u8 res3[8]; + u8 ipuser[16]; + u32 res4; + u8 ippollfg; + u8 res5[3]; +} __packed; + +static void iucv_path_severed(struct iucv_irq_data *data) +{ + struct iucv_path_severed *ips = (void *) data; + struct iucv_path *path = iucv_path_table[ips->ippathid]; + + if (!path || !path->handler) /* Already severed */ + return; + if (path->handler->path_severed) + path->handler->path_severed(path, ips->ipuser); + else { + iucv_sever_pathid(path->pathid, NULL); + iucv_path_table[path->pathid] = NULL; + list_del(&path->list); + iucv_path_free(path); + } +} + +/** + * iucv_path_quiesced + * @data: Pointer to external interrupt buffer + * + * Process connection quiesced work item. Called from tasklet while holding + * iucv_table_lock. + */ +struct iucv_path_quiesced { + u16 ippathid; + u8 res1; + u8 iptype; + u32 res2; + u8 res3[8]; + u8 ipuser[16]; + u32 res4; + u8 ippollfg; + u8 res5[3]; +} __packed; + +static void iucv_path_quiesced(struct iucv_irq_data *data) +{ + struct iucv_path_quiesced *ipq = (void *) data; + struct iucv_path *path = iucv_path_table[ipq->ippathid]; + + if (path && path->handler && path->handler->path_quiesced) + path->handler->path_quiesced(path, ipq->ipuser); +} + +/** + * iucv_path_resumed + * @data: Pointer to external interrupt buffer + * + * Process connection resumed work item. Called from tasklet while holding + * iucv_table_lock. + */ +struct iucv_path_resumed { + u16 ippathid; + u8 res1; + u8 iptype; + u32 res2; + u8 res3[8]; + u8 ipuser[16]; + u32 res4; + u8 ippollfg; + u8 res5[3]; +} __packed; + +static void iucv_path_resumed(struct iucv_irq_data *data) +{ + struct iucv_path_resumed *ipr = (void *) data; + struct iucv_path *path = iucv_path_table[ipr->ippathid]; + + if (path && path->handler && path->handler->path_resumed) + path->handler->path_resumed(path, ipr->ipuser); +} + +/** + * iucv_message_complete + * @data: Pointer to external interrupt buffer + * + * Process message complete work item. Called from tasklet while holding + * iucv_table_lock. 
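On the handler side, the message-complete interrupt dispatched below arrives via the message_complete callback, which af_iucv uses to unlink the matching skb from its send queue and wake up blocked senders. A minimal hypothetical callback:

#include <linux/printk.h>
#include <net/iucv/iucv.h>

/* Hypothetical message_complete callback, invoked from the IUCV tasklet.
 * A non-zero audit value reports an abnormal completion. */
static void demo_message_complete(struct iucv_path *path,
                                  struct iucv_message *msg)
{
        pr_debug("message tag %08x completed, audit %06x\n",
                 msg->tag, msg->audit);
}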
+ */ +struct iucv_message_complete { + u16 ippathid; + u8 ipflags1; + u8 iptype; + u32 ipmsgid; + u32 ipaudit; + u8 iprmmsg[8]; + u32 ipsrccls; + u32 ipmsgtag; + u32 res; + u32 ipbfln2f; + u8 ippollfg; + u8 res2[3]; +} __packed; + +static void iucv_message_complete(struct iucv_irq_data *data) +{ + struct iucv_message_complete *imc = (void *) data; + struct iucv_path *path = iucv_path_table[imc->ippathid]; + struct iucv_message msg; + + if (path && path->handler && path->handler->message_complete) { + msg.flags = imc->ipflags1; + msg.id = imc->ipmsgid; + msg.audit = imc->ipaudit; + memcpy(msg.rmmsg, imc->iprmmsg, 8); + msg.class = imc->ipsrccls; + msg.tag = imc->ipmsgtag; + msg.length = imc->ipbfln2f; + path->handler->message_complete(path, &msg); + } +} + +/** + * iucv_message_pending + * @data: Pointer to external interrupt buffer + * + * Process message pending work item. Called from tasklet while holding + * iucv_table_lock. + */ +struct iucv_message_pending { + u16 ippathid; + u8 ipflags1; + u8 iptype; + u32 ipmsgid; + u32 iptrgcls; + union { + u32 iprmmsg1_u32; + u8 iprmmsg1[4]; + } ln1msg1; + union { + u32 ipbfln1f; + u8 iprmmsg2[4]; + } ln1msg2; + u32 res1[3]; + u32 ipbfln2f; + u8 ippollfg; + u8 res2[3]; +} __packed; + +static void iucv_message_pending(struct iucv_irq_data *data) +{ + struct iucv_message_pending *imp = (void *) data; + struct iucv_path *path = iucv_path_table[imp->ippathid]; + struct iucv_message msg; + + if (path && path->handler && path->handler->message_pending) { + msg.flags = imp->ipflags1; + msg.id = imp->ipmsgid; + msg.class = imp->iptrgcls; + if (imp->ipflags1 & IUCV_IPRMDATA) { + memcpy(msg.rmmsg, imp->ln1msg1.iprmmsg1, 8); + msg.length = 8; + } else + msg.length = imp->ln1msg2.ipbfln1f; + msg.reply_size = imp->ipbfln2f; + path->handler->message_pending(path, &msg); + } +} + +/** + * iucv_tasklet_fn: + * + * This tasklet loops over the queue of irq buffers created by + * iucv_external_interrupt, calls the appropriate action handler + * and then frees the buffer. + */ +static void iucv_tasklet_fn(unsigned long ignored) +{ + typedef void iucv_irq_fn(struct iucv_irq_data *); + static iucv_irq_fn *irq_fn[] = { + [0x02] = iucv_path_complete, + [0x03] = iucv_path_severed, + [0x04] = iucv_path_quiesced, + [0x05] = iucv_path_resumed, + [0x06] = iucv_message_complete, + [0x07] = iucv_message_complete, + [0x08] = iucv_message_pending, + [0x09] = iucv_message_pending, + }; + LIST_HEAD(task_queue); + struct iucv_irq_list *p, *n; + + /* Serialize tasklet, iucv_path_sever and iucv_path_connect. */ + if (!spin_trylock(&iucv_table_lock)) { + tasklet_schedule(&iucv_tasklet); + return; + } + iucv_active_cpu = smp_processor_id(); + + spin_lock_irq(&iucv_queue_lock); + list_splice_init(&iucv_task_queue, &task_queue); + spin_unlock_irq(&iucv_queue_lock); + + list_for_each_entry_safe(p, n, &task_queue, list) { + list_del_init(&p->list); + irq_fn[p->data.iptype](&p->data); + kfree(p); + } + + iucv_active_cpu = -1; + spin_unlock(&iucv_table_lock); +} + +/** + * iucv_work_fn: + * + * This work function loops over the queue of path pending irq blocks + * created by iucv_external_interrupt, calls the appropriate action + * handler and then frees the buffer. + */ +static void iucv_work_fn(struct work_struct *work) +{ + LIST_HEAD(work_queue); + struct iucv_irq_list *p, *n; + + /* Serialize tasklet, iucv_path_sever and iucv_path_connect. 
*/ + spin_lock_bh(&iucv_table_lock); + iucv_active_cpu = smp_processor_id(); + + spin_lock_irq(&iucv_queue_lock); + list_splice_init(&iucv_work_queue, &work_queue); + spin_unlock_irq(&iucv_queue_lock); + + iucv_cleanup_queue(); + list_for_each_entry_safe(p, n, &work_queue, list) { + list_del_init(&p->list); + iucv_path_pending(&p->data); + kfree(p); + } + + iucv_active_cpu = -1; + spin_unlock_bh(&iucv_table_lock); +} + +/** + * iucv_external_interrupt + * @code: irq code + * + * Handles external interrupts coming in from CP. + * Places the interrupt buffer on a queue and schedules iucv_tasklet_fn(). + */ +static void iucv_external_interrupt(unsigned int ext_int_code, + unsigned int param32, unsigned long param64) +{ + struct iucv_irq_data *p; + struct iucv_irq_list *work; + + kstat_cpu(smp_processor_id()).irqs[EXTINT_IUC]++; + p = iucv_irq_data[smp_processor_id()]; + if (p->ippathid >= iucv_max_pathid) { + WARN_ON(p->ippathid >= iucv_max_pathid); + iucv_sever_pathid(p->ippathid, iucv_error_no_listener); + return; + } + BUG_ON(p->iptype < 0x01 || p->iptype > 0x09); + work = kmalloc(sizeof(struct iucv_irq_list), GFP_ATOMIC); + if (!work) { + pr_warning("iucv_external_interrupt: out of memory\n"); + return; + } + memcpy(&work->data, p, sizeof(work->data)); + spin_lock(&iucv_queue_lock); + if (p->iptype == 0x01) { + /* Path pending interrupt. */ + list_add_tail(&work->list, &iucv_work_queue); + schedule_work(&iucv_work); + } else { + /* The other interrupts. */ + list_add_tail(&work->list, &iucv_task_queue); + tasklet_schedule(&iucv_tasklet); + } + spin_unlock(&iucv_queue_lock); +} + +static int iucv_pm_prepare(struct device *dev) +{ + int rc = 0; + +#ifdef CONFIG_PM_DEBUG + printk(KERN_INFO "iucv_pm_prepare\n"); +#endif + if (dev->driver && dev->driver->pm && dev->driver->pm->prepare) + rc = dev->driver->pm->prepare(dev); + return rc; +} + +static void iucv_pm_complete(struct device *dev) +{ +#ifdef CONFIG_PM_DEBUG + printk(KERN_INFO "iucv_pm_complete\n"); +#endif + if (dev->driver && dev->driver->pm && dev->driver->pm->complete) + dev->driver->pm->complete(dev); +} + +/** + * iucv_path_table_empty() - determine if iucv path table is empty + * + * Returns 0 if there are still iucv pathes defined + * 1 if there are no iucv pathes defined + */ +int iucv_path_table_empty(void) +{ + int i; + + for (i = 0; i < iucv_max_pathid; i++) { + if (iucv_path_table[i]) + return 0; + } + return 1; +} + +/** + * iucv_pm_freeze() - Freeze PM callback + * @dev: iucv-based device + * + * disable iucv interrupts + * invoke callback function of the iucv-based driver + * shut down iucv, if no iucv-pathes are established anymore + */ +static int iucv_pm_freeze(struct device *dev) +{ + int cpu; + struct iucv_irq_list *p, *n; + int rc = 0; + +#ifdef CONFIG_PM_DEBUG + printk(KERN_WARNING "iucv_pm_freeze\n"); +#endif + if (iucv_pm_state != IUCV_PM_FREEZING) { + for_each_cpu(cpu, &iucv_irq_cpumask) + smp_call_function_single(cpu, iucv_block_cpu_almost, + NULL, 1); + cancel_work_sync(&iucv_work); + list_for_each_entry_safe(p, n, &iucv_work_queue, list) { + list_del_init(&p->list); + iucv_sever_pathid(p->data.ippathid, + iucv_error_no_listener); + kfree(p); + } + } + iucv_pm_state = IUCV_PM_FREEZING; + if (dev->driver && dev->driver->pm && dev->driver->pm->freeze) + rc = dev->driver->pm->freeze(dev); + if (iucv_path_table_empty()) + iucv_disable(); + return rc; +} + +/** + * iucv_pm_thaw() - Thaw PM callback + * @dev: iucv-based device + * + * make iucv ready for use again: allocate path table, declare interrupt buffers + 
* and enable iucv interrupts + * invoke callback function of the iucv-based driver + */ +static int iucv_pm_thaw(struct device *dev) +{ + int rc = 0; + +#ifdef CONFIG_PM_DEBUG + printk(KERN_WARNING "iucv_pm_thaw\n"); +#endif + iucv_pm_state = IUCV_PM_THAWING; + if (!iucv_path_table) { + rc = iucv_enable(); + if (rc) + goto out; + } + if (cpumask_empty(&iucv_irq_cpumask)) { + if (iucv_nonsmp_handler) + /* enable interrupts on one cpu */ + iucv_allow_cpu(NULL); + else + /* enable interrupts on all cpus */ + iucv_setmask_mp(); + } + if (dev->driver && dev->driver->pm && dev->driver->pm->thaw) + rc = dev->driver->pm->thaw(dev); +out: + return rc; +} + +/** + * iucv_pm_restore() - Restore PM callback + * @dev: iucv-based device + * + * make iucv ready for use again: allocate path table, declare interrupt buffers + * and enable iucv interrupts + * invoke callback function of the iucv-based driver + */ +static int iucv_pm_restore(struct device *dev) +{ + int rc = 0; + +#ifdef CONFIG_PM_DEBUG + printk(KERN_WARNING "iucv_pm_restore %p\n", iucv_path_table); +#endif + if ((iucv_pm_state != IUCV_PM_RESTORING) && iucv_path_table) + pr_warning("Suspending Linux did not completely close all IUCV " + "connections\n"); + iucv_pm_state = IUCV_PM_RESTORING; + if (cpumask_empty(&iucv_irq_cpumask)) { + rc = iucv_query_maxconn(); + rc = iucv_enable(); + if (rc) + goto out; + } + if (dev->driver && dev->driver->pm && dev->driver->pm->restore) + rc = dev->driver->pm->restore(dev); +out: + return rc; +} + +/** + * iucv_init + * + * Allocates and initializes various data structures. + */ +static int __init iucv_init(void) +{ + int rc; + int cpu; + + if (!MACHINE_IS_VM) { + rc = -EPROTONOSUPPORT; + goto out; + } + rc = iucv_query_maxconn(); + if (rc) + goto out; + rc = register_external_interrupt(0x4000, iucv_external_interrupt); + if (rc) + goto out; + iucv_root = root_device_register("iucv"); + if (IS_ERR(iucv_root)) { + rc = PTR_ERR(iucv_root); + goto out_int; + } + + for_each_online_cpu(cpu) { + /* Note: GFP_DMA used to get memory below 2G */ + iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data), + GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); + if (!iucv_irq_data[cpu]) { + rc = -ENOMEM; + goto out_free; + } + + /* Allocate parameter blocks. */ + iucv_param[cpu] = kmalloc_node(sizeof(union iucv_param), + GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); + if (!iucv_param[cpu]) { + rc = -ENOMEM; + goto out_free; + } + iucv_param_irq[cpu] = kmalloc_node(sizeof(union iucv_param), + GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); + if (!iucv_param_irq[cpu]) { + rc = -ENOMEM; + goto out_free; + } + + } + rc = register_hotcpu_notifier(&iucv_cpu_notifier); + if (rc) + goto out_free; + rc = register_reboot_notifier(&iucv_reboot_notifier); + if (rc) + goto out_cpu; + ASCEBC(iucv_error_no_listener, 16); + ASCEBC(iucv_error_no_memory, 16); + ASCEBC(iucv_error_pathid, 16); + iucv_available = 1; + rc = bus_register(&iucv_bus); + if (rc) + goto out_reboot; + return 0; + +out_reboot: + unregister_reboot_notifier(&iucv_reboot_notifier); +out_cpu: + unregister_hotcpu_notifier(&iucv_cpu_notifier); +out_free: + for_each_possible_cpu(cpu) { + kfree(iucv_param_irq[cpu]); + iucv_param_irq[cpu] = NULL; + kfree(iucv_param[cpu]); + iucv_param[cpu] = NULL; + kfree(iucv_irq_data[cpu]); + iucv_irq_data[cpu] = NULL; + } + root_device_unregister(iucv_root); +out_int: + unregister_external_interrupt(0x4000, iucv_external_interrupt); +out: + return rc; +} + +/** + * iucv_exit + * + * Frees everything allocated from iucv_init. 
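+ * Both event queues are drained under iucv_queue_lock, the reboot and
+ * hotcpu notifiers are unregistered, and the per-cpu interrupt and
+ * parameter buffers are freed for every possible cpu, mirroring the
+ * unwind path of iucv_init().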
+ */ +static void __exit iucv_exit(void) +{ + struct iucv_irq_list *p, *n; + int cpu; + + spin_lock_irq(&iucv_queue_lock); + list_for_each_entry_safe(p, n, &iucv_task_queue, list) + kfree(p); + list_for_each_entry_safe(p, n, &iucv_work_queue, list) + kfree(p); + spin_unlock_irq(&iucv_queue_lock); + unregister_reboot_notifier(&iucv_reboot_notifier); + unregister_hotcpu_notifier(&iucv_cpu_notifier); + for_each_possible_cpu(cpu) { + kfree(iucv_param_irq[cpu]); + iucv_param_irq[cpu] = NULL; + kfree(iucv_param[cpu]); + iucv_param[cpu] = NULL; + kfree(iucv_irq_data[cpu]); + iucv_irq_data[cpu] = NULL; + } + root_device_unregister(iucv_root); + bus_unregister(&iucv_bus); + unregister_external_interrupt(0x4000, iucv_external_interrupt); +} + +subsys_initcall(iucv_init); +module_exit(iucv_exit); + +MODULE_AUTHOR("(C) 2001 IBM Corp. by Fritz Elfert (felfert@millenux.com)"); +MODULE_DESCRIPTION("Linux for S/390 IUCV lowlevel driver"); +MODULE_LICENSE("GPL"); diff --git a/net/key/Makefile b/net/key/Makefile new file mode 100644 index 00000000..85760804 --- /dev/null +++ b/net/key/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the key AF. +# + +obj-$(CONFIG_NET_KEY) += af_key.o diff --git a/net/key/af_key.c b/net/key/af_key.c new file mode 100644 index 00000000..8f92cf81 --- /dev/null +++ b/net/key/af_key.c @@ -0,0 +1,3818 @@ +/* + * net/key/af_key.c An implementation of PF_KEYv2 sockets. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Maxim Giryaev + * David S. Miller + * Alexey Kuznetsov + * Kunihiro Ishiguro + * Kazunori MIYAZAWA / USAGI Project + * Derek Atkins + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define _X2KEY(x) ((x) == XFRM_INF ? 0 : (x)) +#define _KEY2X(x) ((x) == 0 ? XFRM_INF : (x)) + +static int pfkey_net_id __read_mostly; +struct netns_pfkey { + /* List of all pfkey sockets. 
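+	 * Updates are serialized by pfkey_mutex; readers such as
+	 * pfkey_broadcast() walk the list under rcu_read_lock().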
*/ + struct hlist_head table; + atomic_t socks_nr; +}; +static DEFINE_MUTEX(pfkey_mutex); + +#define DUMMY_MARK 0 +static struct xfrm_mark dummy_mark = {0, 0}; +struct pfkey_sock { + /* struct sock must be the first member of struct pfkey_sock */ + struct sock sk; + int registered; + int promisc; + + struct { + uint8_t msg_version; + uint32_t msg_pid; + int (*dump)(struct pfkey_sock *sk); + void (*done)(struct pfkey_sock *sk); + union { + struct xfrm_policy_walk policy; + struct xfrm_state_walk state; + } u; + struct sk_buff *skb; + } dump; +}; + +static inline struct pfkey_sock *pfkey_sk(struct sock *sk) +{ + return (struct pfkey_sock *)sk; +} + +static int pfkey_can_dump(const struct sock *sk) +{ + if (3 * atomic_read(&sk->sk_rmem_alloc) <= 2 * sk->sk_rcvbuf) + return 1; + return 0; +} + +static void pfkey_terminate_dump(struct pfkey_sock *pfk) +{ + if (pfk->dump.dump) { + if (pfk->dump.skb) { + kfree_skb(pfk->dump.skb); + pfk->dump.skb = NULL; + } + pfk->dump.done(pfk); + pfk->dump.dump = NULL; + pfk->dump.done = NULL; + } +} + +static void pfkey_sock_destruct(struct sock *sk) +{ + struct net *net = sock_net(sk); + struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); + + pfkey_terminate_dump(pfkey_sk(sk)); + skb_queue_purge(&sk->sk_receive_queue); + + if (!sock_flag(sk, SOCK_DEAD)) { + pr_err("Attempt to release alive pfkey socket: %p\n", sk); + return; + } + + WARN_ON(atomic_read(&sk->sk_rmem_alloc)); + WARN_ON(atomic_read(&sk->sk_wmem_alloc)); + + atomic_dec(&net_pfkey->socks_nr); +} + +static const struct proto_ops pfkey_ops; + +static void pfkey_insert(struct sock *sk) +{ + struct net *net = sock_net(sk); + struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); + + mutex_lock(&pfkey_mutex); + sk_add_node_rcu(sk, &net_pfkey->table); + mutex_unlock(&pfkey_mutex); +} + +static void pfkey_remove(struct sock *sk) +{ + mutex_lock(&pfkey_mutex); + sk_del_node_init_rcu(sk); + mutex_unlock(&pfkey_mutex); +} + +static struct proto key_proto = { + .name = "KEY", + .owner = THIS_MODULE, + .obj_size = sizeof(struct pfkey_sock), +}; + +static int pfkey_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); + struct sock *sk; + int err; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (sock->type != SOCK_RAW) + return -ESOCKTNOSUPPORT; + if (protocol != PF_KEY_V2) + return -EPROTONOSUPPORT; + + err = -ENOMEM; + sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto); + if (sk == NULL) + goto out; + + sock->ops = &pfkey_ops; + sock_init_data(sock, sk); + + sk->sk_family = PF_KEY; + sk->sk_destruct = pfkey_sock_destruct; + + atomic_inc(&net_pfkey->socks_nr); + + pfkey_insert(sk); + + return 0; +out: + return err; +} + +static int pfkey_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (!sk) + return 0; + + pfkey_remove(sk); + + sock_orphan(sk); + sock->sk = NULL; + skb_queue_purge(&sk->sk_write_queue); + + synchronize_rcu(); + sock_put(sk); + + return 0; +} + +static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2, + gfp_t allocation, struct sock *sk) +{ + int err = -ENOBUFS; + + sock_hold(sk); + if (*skb2 == NULL) { + if (atomic_read(&skb->users) != 1) { + *skb2 = skb_clone(skb, allocation); + } else { + *skb2 = skb; + atomic_inc(&skb->users); + } + } + if (*skb2 != NULL) { + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) { + skb_orphan(*skb2); + skb_set_owner_r(*skb2, sk); + skb_queue_tail(&sk->sk_receive_queue, *skb2); + sk->sk_data_ready(sk, 
(*skb2)->len); + *skb2 = NULL; + err = 0; + } + } + sock_put(sk); + return err; +} + +/* Send SKB to all pfkey sockets matching selected criteria. */ +#define BROADCAST_ALL 0 +#define BROADCAST_ONE 1 +#define BROADCAST_REGISTERED 2 +#define BROADCAST_PROMISC_ONLY 4 +static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, + int broadcast_flags, struct sock *one_sk, + struct net *net) +{ + struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); + struct sock *sk; + struct hlist_node *node; + struct sk_buff *skb2 = NULL; + int err = -ESRCH; + + /* XXX Do we need something like netlink_overrun? I think + * XXX PF_KEY socket apps will not mind current behavior. + */ + if (!skb) + return -ENOMEM; + + rcu_read_lock(); + sk_for_each_rcu(sk, node, &net_pfkey->table) { + struct pfkey_sock *pfk = pfkey_sk(sk); + int err2; + + /* Yes, it means that if you are meant to receive this + * pfkey message you receive it twice as promiscuous + * socket. + */ + if (pfk->promisc) + pfkey_broadcast_one(skb, &skb2, allocation, sk); + + /* the exact target will be processed later */ + if (sk == one_sk) + continue; + if (broadcast_flags != BROADCAST_ALL) { + if (broadcast_flags & BROADCAST_PROMISC_ONLY) + continue; + if ((broadcast_flags & BROADCAST_REGISTERED) && + !pfk->registered) + continue; + if (broadcast_flags & BROADCAST_ONE) + continue; + } + + err2 = pfkey_broadcast_one(skb, &skb2, allocation, sk); + + /* Error is cleare after succecful sending to at least one + * registered KM */ + if ((broadcast_flags & BROADCAST_REGISTERED) && err) + err = err2; + } + rcu_read_unlock(); + + if (one_sk != NULL) + err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk); + + kfree_skb(skb2); + kfree_skb(skb); + return err; +} + +static int pfkey_do_dump(struct pfkey_sock *pfk) +{ + struct sadb_msg *hdr; + int rc; + + rc = pfk->dump.dump(pfk); + if (rc == -ENOBUFS) + return 0; + + if (pfk->dump.skb) { + if (!pfkey_can_dump(&pfk->sk)) + return 0; + + hdr = (struct sadb_msg *) pfk->dump.skb->data; + hdr->sadb_msg_seq = 0; + hdr->sadb_msg_errno = rc; + pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, + &pfk->sk, sock_net(&pfk->sk)); + pfk->dump.skb = NULL; + } + + pfkey_terminate_dump(pfk); + return rc; +} + +static inline void pfkey_hdr_dup(struct sadb_msg *new, + const struct sadb_msg *orig) +{ + *new = *orig; +} + +static int pfkey_error(const struct sadb_msg *orig, int err, struct sock *sk) +{ + struct sk_buff *skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL); + struct sadb_msg *hdr; + + if (!skb) + return -ENOBUFS; + + /* Woe be to the platform trying to support PFKEY yet + * having normal errnos outside the 1-255 range, inclusive. 
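+ * The code below therefore folds the kernel error into that range: the
+ * ERESTART* values become EINTR, anything >= 512 becomes EINVAL, and the
+ * result is returned in sadb_msg_errno of a copy of the original header,
+ * directed back to the requesting socket via BROADCAST_ONE.  A failed
+ * lookup (-ESRCH), for example, comes back as sadb_msg_errno == 3.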
+ */ + err = -err; + if (err == ERESTARTSYS || + err == ERESTARTNOHAND || + err == ERESTARTNOINTR) + err = EINTR; + if (err >= 512) + err = EINVAL; + BUG_ON(err <= 0 || err >= 256); + + hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg)); + pfkey_hdr_dup(hdr, orig); + hdr->sadb_msg_errno = (uint8_t) err; + hdr->sadb_msg_len = (sizeof(struct sadb_msg) / + sizeof(uint64_t)); + + pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ONE, sk, sock_net(sk)); + + return 0; +} + +static u8 sadb_ext_min_len[] = { + [SADB_EXT_RESERVED] = (u8) 0, + [SADB_EXT_SA] = (u8) sizeof(struct sadb_sa), + [SADB_EXT_LIFETIME_CURRENT] = (u8) sizeof(struct sadb_lifetime), + [SADB_EXT_LIFETIME_HARD] = (u8) sizeof(struct sadb_lifetime), + [SADB_EXT_LIFETIME_SOFT] = (u8) sizeof(struct sadb_lifetime), + [SADB_EXT_ADDRESS_SRC] = (u8) sizeof(struct sadb_address), + [SADB_EXT_ADDRESS_DST] = (u8) sizeof(struct sadb_address), + [SADB_EXT_ADDRESS_PROXY] = (u8) sizeof(struct sadb_address), + [SADB_EXT_KEY_AUTH] = (u8) sizeof(struct sadb_key), + [SADB_EXT_KEY_ENCRYPT] = (u8) sizeof(struct sadb_key), + [SADB_EXT_IDENTITY_SRC] = (u8) sizeof(struct sadb_ident), + [SADB_EXT_IDENTITY_DST] = (u8) sizeof(struct sadb_ident), + [SADB_EXT_SENSITIVITY] = (u8) sizeof(struct sadb_sens), + [SADB_EXT_PROPOSAL] = (u8) sizeof(struct sadb_prop), + [SADB_EXT_SUPPORTED_AUTH] = (u8) sizeof(struct sadb_supported), + [SADB_EXT_SUPPORTED_ENCRYPT] = (u8) sizeof(struct sadb_supported), + [SADB_EXT_SPIRANGE] = (u8) sizeof(struct sadb_spirange), + [SADB_X_EXT_KMPRIVATE] = (u8) sizeof(struct sadb_x_kmprivate), + [SADB_X_EXT_POLICY] = (u8) sizeof(struct sadb_x_policy), + [SADB_X_EXT_SA2] = (u8) sizeof(struct sadb_x_sa2), + [SADB_X_EXT_NAT_T_TYPE] = (u8) sizeof(struct sadb_x_nat_t_type), + [SADB_X_EXT_NAT_T_SPORT] = (u8) sizeof(struct sadb_x_nat_t_port), + [SADB_X_EXT_NAT_T_DPORT] = (u8) sizeof(struct sadb_x_nat_t_port), + [SADB_X_EXT_NAT_T_OA] = (u8) sizeof(struct sadb_address), + [SADB_X_EXT_SEC_CTX] = (u8) sizeof(struct sadb_x_sec_ctx), + [SADB_X_EXT_KMADDRESS] = (u8) sizeof(struct sadb_x_kmaddress), +}; + +/* Verify sadb_address_{len,prefixlen} against sa_family. */ +static int verify_address_len(const void *p) +{ + const struct sadb_address *sp = p; + const struct sockaddr *addr = (const struct sockaddr *)(sp + 1); + const struct sockaddr_in *sin; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + const struct sockaddr_in6 *sin6; +#endif + int len; + + switch (addr->sa_family) { + case AF_INET: + len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin), sizeof(uint64_t)); + if (sp->sadb_address_len != len || + sp->sadb_address_prefixlen > 32) + return -EINVAL; + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin6), sizeof(uint64_t)); + if (sp->sadb_address_len != len || + sp->sadb_address_prefixlen > 128) + return -EINVAL; + break; +#endif + default: + /* It is user using kernel to keep track of security + * associations for another protocol, such as + * OSPF/RSVP/RIPV2/MIP. It is user's job to verify + * lengths. + * + * XXX Actually, association/policy database is not yet + * XXX able to cope with arbitrary sockaddr families. + * XXX When it can, remove this -EINVAL. 
-DaveM + */ + return -EINVAL; + break; + } + + return 0; +} + +static inline int pfkey_sec_ctx_len(const struct sadb_x_sec_ctx *sec_ctx) +{ + return DIV_ROUND_UP(sizeof(struct sadb_x_sec_ctx) + + sec_ctx->sadb_x_ctx_len, + sizeof(uint64_t)); +} + +static inline int verify_sec_ctx_len(const void *p) +{ + const struct sadb_x_sec_ctx *sec_ctx = p; + int len = sec_ctx->sadb_x_ctx_len; + + if (len > PAGE_SIZE) + return -EINVAL; + + len = pfkey_sec_ctx_len(sec_ctx); + + if (sec_ctx->sadb_x_sec_len != len) + return -EINVAL; + + return 0; +} + +static inline struct xfrm_user_sec_ctx *pfkey_sadb2xfrm_user_sec_ctx(const struct sadb_x_sec_ctx *sec_ctx) +{ + struct xfrm_user_sec_ctx *uctx = NULL; + int ctx_size = sec_ctx->sadb_x_ctx_len; + + uctx = kmalloc((sizeof(*uctx)+ctx_size), GFP_KERNEL); + + if (!uctx) + return NULL; + + uctx->len = pfkey_sec_ctx_len(sec_ctx); + uctx->exttype = sec_ctx->sadb_x_sec_exttype; + uctx->ctx_doi = sec_ctx->sadb_x_ctx_doi; + uctx->ctx_alg = sec_ctx->sadb_x_ctx_alg; + uctx->ctx_len = sec_ctx->sadb_x_ctx_len; + memcpy(uctx + 1, sec_ctx + 1, + uctx->ctx_len); + + return uctx; +} + +static int present_and_same_family(const struct sadb_address *src, + const struct sadb_address *dst) +{ + const struct sockaddr *s_addr, *d_addr; + + if (!src || !dst) + return 0; + + s_addr = (const struct sockaddr *)(src + 1); + d_addr = (const struct sockaddr *)(dst + 1); + if (s_addr->sa_family != d_addr->sa_family) + return 0; + if (s_addr->sa_family != AF_INET +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + && s_addr->sa_family != AF_INET6 +#endif + ) + return 0; + + return 1; +} + +static int parse_exthdrs(struct sk_buff *skb, const struct sadb_msg *hdr, void **ext_hdrs) +{ + const char *p = (char *) hdr; + int len = skb->len; + + len -= sizeof(*hdr); + p += sizeof(*hdr); + while (len > 0) { + const struct sadb_ext *ehdr = (const struct sadb_ext *) p; + uint16_t ext_type; + int ext_len; + + ext_len = ehdr->sadb_ext_len; + ext_len *= sizeof(uint64_t); + ext_type = ehdr->sadb_ext_type; + if (ext_len < sizeof(uint64_t) || + ext_len > len || + ext_type == SADB_EXT_RESERVED) + return -EINVAL; + + if (ext_type <= SADB_EXT_MAX) { + int min = (int) sadb_ext_min_len[ext_type]; + if (ext_len < min) + return -EINVAL; + if (ext_hdrs[ext_type-1] != NULL) + return -EINVAL; + if (ext_type == SADB_EXT_ADDRESS_SRC || + ext_type == SADB_EXT_ADDRESS_DST || + ext_type == SADB_EXT_ADDRESS_PROXY || + ext_type == SADB_X_EXT_NAT_T_OA) { + if (verify_address_len(p)) + return -EINVAL; + } + if (ext_type == SADB_X_EXT_SEC_CTX) { + if (verify_sec_ctx_len(p)) + return -EINVAL; + } + ext_hdrs[ext_type-1] = (void *) p; + } + p += ext_len; + len -= ext_len; + } + + return 0; +} + +static uint16_t +pfkey_satype2proto(uint8_t satype) +{ + switch (satype) { + case SADB_SATYPE_UNSPEC: + return IPSEC_PROTO_ANY; + case SADB_SATYPE_AH: + return IPPROTO_AH; + case SADB_SATYPE_ESP: + return IPPROTO_ESP; + case SADB_X_SATYPE_IPCOMP: + return IPPROTO_COMP; + break; + default: + return 0; + } + /* NOTREACHED */ +} + +static uint8_t +pfkey_proto2satype(uint16_t proto) +{ + switch (proto) { + case IPPROTO_AH: + return SADB_SATYPE_AH; + case IPPROTO_ESP: + return SADB_SATYPE_ESP; + case IPPROTO_COMP: + return SADB_X_SATYPE_IPCOMP; + break; + default: + return 0; + } + /* NOTREACHED */ +} + +/* BTW, this scheme means that there is no way with PFKEY2 sockets to + * say specifically 'just raw sockets' as we encode them as 255. + */ + +static uint8_t pfkey_proto_to_xfrm(uint8_t proto) +{ + return proto == IPSEC_PROTO_ANY ? 
0 : proto; +} + +static uint8_t pfkey_proto_from_xfrm(uint8_t proto) +{ + return proto ? proto : IPSEC_PROTO_ANY; +} + +static inline int pfkey_sockaddr_len(sa_family_t family) +{ + switch (family) { + case AF_INET: + return sizeof(struct sockaddr_in); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + return sizeof(struct sockaddr_in6); +#endif + } + return 0; +} + +static +int pfkey_sockaddr_extract(const struct sockaddr *sa, xfrm_address_t *xaddr) +{ + switch (sa->sa_family) { + case AF_INET: + xaddr->a4 = + ((struct sockaddr_in *)sa)->sin_addr.s_addr; + return AF_INET; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + memcpy(xaddr->a6, + &((struct sockaddr_in6 *)sa)->sin6_addr, + sizeof(struct in6_addr)); + return AF_INET6; +#endif + } + return 0; +} + +static +int pfkey_sadb_addr2xfrm_addr(const struct sadb_address *addr, xfrm_address_t *xaddr) +{ + return pfkey_sockaddr_extract((struct sockaddr *)(addr + 1), + xaddr); +} + +static struct xfrm_state *pfkey_xfrm_state_lookup(struct net *net, const struct sadb_msg *hdr, void * const *ext_hdrs) +{ + const struct sadb_sa *sa; + const struct sadb_address *addr; + uint16_t proto; + unsigned short family; + xfrm_address_t *xaddr; + + sa = (const struct sadb_sa *) ext_hdrs[SADB_EXT_SA-1]; + if (sa == NULL) + return NULL; + + proto = pfkey_satype2proto(hdr->sadb_msg_satype); + if (proto == 0) + return NULL; + + /* sadb_address_len should be checked by caller */ + addr = (const struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_DST-1]; + if (addr == NULL) + return NULL; + + family = ((const struct sockaddr *)(addr + 1))->sa_family; + switch (family) { + case AF_INET: + xaddr = (xfrm_address_t *)&((const struct sockaddr_in *)(addr + 1))->sin_addr; + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + xaddr = (xfrm_address_t *)&((const struct sockaddr_in6 *)(addr + 1))->sin6_addr; + break; +#endif + default: + xaddr = NULL; + } + + if (!xaddr) + return NULL; + + return xfrm_state_lookup(net, DUMMY_MARK, xaddr, sa->sadb_sa_spi, proto, family); +} + +#define PFKEY_ALIGN8(a) (1 + (((a) - 1) | (8 - 1))) + +static int +pfkey_sockaddr_size(sa_family_t family) +{ + return PFKEY_ALIGN8(pfkey_sockaddr_len(family)); +} + +static inline int pfkey_mode_from_xfrm(int mode) +{ + switch(mode) { + case XFRM_MODE_TRANSPORT: + return IPSEC_MODE_TRANSPORT; + case XFRM_MODE_TUNNEL: + return IPSEC_MODE_TUNNEL; + case XFRM_MODE_BEET: + return IPSEC_MODE_BEET; + default: + return -1; + } +} + +static inline int pfkey_mode_to_xfrm(int mode) +{ + switch(mode) { + case IPSEC_MODE_ANY: /*XXX*/ + case IPSEC_MODE_TRANSPORT: + return XFRM_MODE_TRANSPORT; + case IPSEC_MODE_TUNNEL: + return XFRM_MODE_TUNNEL; + case IPSEC_MODE_BEET: + return XFRM_MODE_BEET; + default: + return -1; + } +} + +static unsigned int pfkey_sockaddr_fill(const xfrm_address_t *xaddr, __be16 port, + struct sockaddr *sa, + unsigned short family) +{ + switch (family) { + case AF_INET: + { + struct sockaddr_in *sin = (struct sockaddr_in *)sa; + sin->sin_family = AF_INET; + sin->sin_port = port; + sin->sin_addr.s_addr = xaddr->a4; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + return 32; + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa; + sin6->sin6_family = AF_INET6; + sin6->sin6_port = port; + sin6->sin6_flowinfo = 0; + ipv6_addr_copy(&sin6->sin6_addr, (const struct in6_addr *)xaddr->a6); + sin6->sin6_scope_id = 0; + return 128; + } 
+#endif + } + return 0; +} + +static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x, + int add_keys, int hsc) +{ + struct sk_buff *skb; + struct sadb_msg *hdr; + struct sadb_sa *sa; + struct sadb_lifetime *lifetime; + struct sadb_address *addr; + struct sadb_key *key; + struct sadb_x_sa2 *sa2; + struct sadb_x_sec_ctx *sec_ctx; + struct xfrm_sec_ctx *xfrm_ctx; + int ctx_size = 0; + int size; + int auth_key_size = 0; + int encrypt_key_size = 0; + int sockaddr_size; + struct xfrm_encap_tmpl *natt = NULL; + int mode; + + /* address family check */ + sockaddr_size = pfkey_sockaddr_size(x->props.family); + if (!sockaddr_size) + return ERR_PTR(-EINVAL); + + /* base, SA, (lifetime (HSC),) address(SD), (address(P),) + key(AE), (identity(SD),) (sensitivity)> */ + size = sizeof(struct sadb_msg) +sizeof(struct sadb_sa) + + sizeof(struct sadb_lifetime) + + ((hsc & 1) ? sizeof(struct sadb_lifetime) : 0) + + ((hsc & 2) ? sizeof(struct sadb_lifetime) : 0) + + sizeof(struct sadb_address)*2 + + sockaddr_size*2 + + sizeof(struct sadb_x_sa2); + + if ((xfrm_ctx = x->security)) { + ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len); + size += sizeof(struct sadb_x_sec_ctx) + ctx_size; + } + + /* identity & sensitivity */ + if (xfrm_addr_cmp(&x->sel.saddr, &x->props.saddr, x->props.family)) + size += sizeof(struct sadb_address) + sockaddr_size; + + if (add_keys) { + if (x->aalg && x->aalg->alg_key_len) { + auth_key_size = + PFKEY_ALIGN8((x->aalg->alg_key_len + 7) / 8); + size += sizeof(struct sadb_key) + auth_key_size; + } + if (x->ealg && x->ealg->alg_key_len) { + encrypt_key_size = + PFKEY_ALIGN8((x->ealg->alg_key_len+7) / 8); + size += sizeof(struct sadb_key) + encrypt_key_size; + } + } + if (x->encap) + natt = x->encap; + + if (natt && natt->encap_type) { + size += sizeof(struct sadb_x_nat_t_type); + size += sizeof(struct sadb_x_nat_t_port); + size += sizeof(struct sadb_x_nat_t_port); + } + + skb = alloc_skb(size + 16, GFP_ATOMIC); + if (skb == NULL) + return ERR_PTR(-ENOBUFS); + + /* call should fill header later */ + hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg)); + memset(hdr, 0, size); /* XXX do we need this ? */ + hdr->sadb_msg_len = size / sizeof(uint64_t); + + /* sa */ + sa = (struct sadb_sa *) skb_put(skb, sizeof(struct sadb_sa)); + sa->sadb_sa_len = sizeof(struct sadb_sa)/sizeof(uint64_t); + sa->sadb_sa_exttype = SADB_EXT_SA; + sa->sadb_sa_spi = x->id.spi; + sa->sadb_sa_replay = x->props.replay_window; + switch (x->km.state) { + case XFRM_STATE_VALID: + sa->sadb_sa_state = x->km.dying ? + SADB_SASTATE_DYING : SADB_SASTATE_MATURE; + break; + case XFRM_STATE_ACQ: + sa->sadb_sa_state = SADB_SASTATE_LARVAL; + break; + default: + sa->sadb_sa_state = SADB_SASTATE_DEAD; + break; + } + sa->sadb_sa_auth = 0; + if (x->aalg) { + struct xfrm_algo_desc *a = xfrm_aalg_get_byname(x->aalg->alg_name, 0); + sa->sadb_sa_auth = a ? a->desc.sadb_alg_id : 0; + } + sa->sadb_sa_encrypt = 0; + BUG_ON(x->ealg && x->calg); + if (x->ealg) { + struct xfrm_algo_desc *a = xfrm_ealg_get_byname(x->ealg->alg_name, 0); + sa->sadb_sa_encrypt = a ? a->desc.sadb_alg_id : 0; + } + /* KAME compatible: sadb_sa_encrypt is overloaded with calg id */ + if (x->calg) { + struct xfrm_algo_desc *a = xfrm_calg_get_byname(x->calg->alg_name, 0); + sa->sadb_sa_encrypt = a ? 
a->desc.sadb_alg_id : 0; + } + + sa->sadb_sa_flags = 0; + if (x->props.flags & XFRM_STATE_NOECN) + sa->sadb_sa_flags |= SADB_SAFLAGS_NOECN; + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + sa->sadb_sa_flags |= SADB_SAFLAGS_DECAP_DSCP; + if (x->props.flags & XFRM_STATE_NOPMTUDISC) + sa->sadb_sa_flags |= SADB_SAFLAGS_NOPMTUDISC; + + /* hard time */ + if (hsc & 2) { + lifetime = (struct sadb_lifetime *) skb_put(skb, + sizeof(struct sadb_lifetime)); + lifetime->sadb_lifetime_len = + sizeof(struct sadb_lifetime)/sizeof(uint64_t); + lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD; + lifetime->sadb_lifetime_allocations = _X2KEY(x->lft.hard_packet_limit); + lifetime->sadb_lifetime_bytes = _X2KEY(x->lft.hard_byte_limit); + lifetime->sadb_lifetime_addtime = x->lft.hard_add_expires_seconds; + lifetime->sadb_lifetime_usetime = x->lft.hard_use_expires_seconds; + } + /* soft time */ + if (hsc & 1) { + lifetime = (struct sadb_lifetime *) skb_put(skb, + sizeof(struct sadb_lifetime)); + lifetime->sadb_lifetime_len = + sizeof(struct sadb_lifetime)/sizeof(uint64_t); + lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT; + lifetime->sadb_lifetime_allocations = _X2KEY(x->lft.soft_packet_limit); + lifetime->sadb_lifetime_bytes = _X2KEY(x->lft.soft_byte_limit); + lifetime->sadb_lifetime_addtime = x->lft.soft_add_expires_seconds; + lifetime->sadb_lifetime_usetime = x->lft.soft_use_expires_seconds; + } + /* current time */ + lifetime = (struct sadb_lifetime *) skb_put(skb, + sizeof(struct sadb_lifetime)); + lifetime->sadb_lifetime_len = + sizeof(struct sadb_lifetime)/sizeof(uint64_t); + lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT; + lifetime->sadb_lifetime_allocations = x->curlft.packets; + lifetime->sadb_lifetime_bytes = x->curlft.bytes; + lifetime->sadb_lifetime_addtime = x->curlft.add_time; + lifetime->sadb_lifetime_usetime = x->curlft.use_time; + /* src address */ + addr = (struct sadb_address*) skb_put(skb, + sizeof(struct sadb_address)+sockaddr_size); + addr->sadb_address_len = + (sizeof(struct sadb_address)+sockaddr_size)/ + sizeof(uint64_t); + addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; + /* "if the ports are non-zero, then the sadb_address_proto field, + normally zero, MUST be filled in with the transport + protocol's number." 
- RFC2367 */ + addr->sadb_address_proto = 0; + addr->sadb_address_reserved = 0; + + addr->sadb_address_prefixlen = + pfkey_sockaddr_fill(&x->props.saddr, 0, + (struct sockaddr *) (addr + 1), + x->props.family); + if (!addr->sadb_address_prefixlen) + BUG(); + + /* dst address */ + addr = (struct sadb_address*) skb_put(skb, + sizeof(struct sadb_address)+sockaddr_size); + addr->sadb_address_len = + (sizeof(struct sadb_address)+sockaddr_size)/ + sizeof(uint64_t); + addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; + addr->sadb_address_proto = 0; + addr->sadb_address_reserved = 0; + + addr->sadb_address_prefixlen = + pfkey_sockaddr_fill(&x->id.daddr, 0, + (struct sockaddr *) (addr + 1), + x->props.family); + if (!addr->sadb_address_prefixlen) + BUG(); + + if (xfrm_addr_cmp(&x->sel.saddr, &x->props.saddr, + x->props.family)) { + addr = (struct sadb_address*) skb_put(skb, + sizeof(struct sadb_address)+sockaddr_size); + addr->sadb_address_len = + (sizeof(struct sadb_address)+sockaddr_size)/ + sizeof(uint64_t); + addr->sadb_address_exttype = SADB_EXT_ADDRESS_PROXY; + addr->sadb_address_proto = + pfkey_proto_from_xfrm(x->sel.proto); + addr->sadb_address_prefixlen = x->sel.prefixlen_s; + addr->sadb_address_reserved = 0; + + pfkey_sockaddr_fill(&x->sel.saddr, x->sel.sport, + (struct sockaddr *) (addr + 1), + x->props.family); + } + + /* auth key */ + if (add_keys && auth_key_size) { + key = (struct sadb_key *) skb_put(skb, + sizeof(struct sadb_key)+auth_key_size); + key->sadb_key_len = (sizeof(struct sadb_key) + auth_key_size) / + sizeof(uint64_t); + key->sadb_key_exttype = SADB_EXT_KEY_AUTH; + key->sadb_key_bits = x->aalg->alg_key_len; + key->sadb_key_reserved = 0; + memcpy(key + 1, x->aalg->alg_key, (x->aalg->alg_key_len+7)/8); + } + /* encrypt key */ + if (add_keys && encrypt_key_size) { + key = (struct sadb_key *) skb_put(skb, + sizeof(struct sadb_key)+encrypt_key_size); + key->sadb_key_len = (sizeof(struct sadb_key) + + encrypt_key_size) / sizeof(uint64_t); + key->sadb_key_exttype = SADB_EXT_KEY_ENCRYPT; + key->sadb_key_bits = x->ealg->alg_key_len; + key->sadb_key_reserved = 0; + memcpy(key + 1, x->ealg->alg_key, + (x->ealg->alg_key_len+7)/8); + } + + /* sa */ + sa2 = (struct sadb_x_sa2 *) skb_put(skb, sizeof(struct sadb_x_sa2)); + sa2->sadb_x_sa2_len = sizeof(struct sadb_x_sa2)/sizeof(uint64_t); + sa2->sadb_x_sa2_exttype = SADB_X_EXT_SA2; + if ((mode = pfkey_mode_from_xfrm(x->props.mode)) < 0) { + kfree_skb(skb); + return ERR_PTR(-EINVAL); + } + sa2->sadb_x_sa2_mode = mode; + sa2->sadb_x_sa2_reserved1 = 0; + sa2->sadb_x_sa2_reserved2 = 0; + sa2->sadb_x_sa2_sequence = 0; + sa2->sadb_x_sa2_reqid = x->props.reqid; + + if (natt && natt->encap_type) { + struct sadb_x_nat_t_type *n_type; + struct sadb_x_nat_t_port *n_port; + + /* type */ + n_type = (struct sadb_x_nat_t_type*) skb_put(skb, sizeof(*n_type)); + n_type->sadb_x_nat_t_type_len = sizeof(*n_type)/sizeof(uint64_t); + n_type->sadb_x_nat_t_type_exttype = SADB_X_EXT_NAT_T_TYPE; + n_type->sadb_x_nat_t_type_type = natt->encap_type; + n_type->sadb_x_nat_t_type_reserved[0] = 0; + n_type->sadb_x_nat_t_type_reserved[1] = 0; + n_type->sadb_x_nat_t_type_reserved[2] = 0; + + /* source port */ + n_port = (struct sadb_x_nat_t_port*) skb_put(skb, sizeof (*n_port)); + n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); + n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_SPORT; + n_port->sadb_x_nat_t_port_port = natt->encap_sport; + n_port->sadb_x_nat_t_port_reserved = 0; + + /* dest port */ + n_port = (struct sadb_x_nat_t_port*) skb_put(skb, 
sizeof (*n_port)); + n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); + n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_DPORT; + n_port->sadb_x_nat_t_port_port = natt->encap_dport; + n_port->sadb_x_nat_t_port_reserved = 0; + } + + /* security context */ + if (xfrm_ctx) { + sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb, + sizeof(struct sadb_x_sec_ctx) + ctx_size); + sec_ctx->sadb_x_sec_len = + (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t); + sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; + sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; + sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; + sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; + memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, + xfrm_ctx->ctx_len); + } + + return skb; +} + + +static inline struct sk_buff *pfkey_xfrm_state2msg(const struct xfrm_state *x) +{ + struct sk_buff *skb; + + skb = __pfkey_xfrm_state2msg(x, 1, 3); + + return skb; +} + +static inline struct sk_buff *pfkey_xfrm_state2msg_expire(const struct xfrm_state *x, + int hsc) +{ + return __pfkey_xfrm_state2msg(x, 0, hsc); +} + +static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, + const struct sadb_msg *hdr, + void * const *ext_hdrs) +{ + struct xfrm_state *x; + const struct sadb_lifetime *lifetime; + const struct sadb_sa *sa; + const struct sadb_key *key; + const struct sadb_x_sec_ctx *sec_ctx; + uint16_t proto; + int err; + + + sa = (const struct sadb_sa *) ext_hdrs[SADB_EXT_SA-1]; + if (!sa || + !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], + ext_hdrs[SADB_EXT_ADDRESS_DST-1])) + return ERR_PTR(-EINVAL); + if (hdr->sadb_msg_satype == SADB_SATYPE_ESP && + !ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]) + return ERR_PTR(-EINVAL); + if (hdr->sadb_msg_satype == SADB_SATYPE_AH && + !ext_hdrs[SADB_EXT_KEY_AUTH-1]) + return ERR_PTR(-EINVAL); + if (!!ext_hdrs[SADB_EXT_LIFETIME_HARD-1] != + !!ext_hdrs[SADB_EXT_LIFETIME_SOFT-1]) + return ERR_PTR(-EINVAL); + + proto = pfkey_satype2proto(hdr->sadb_msg_satype); + if (proto == 0) + return ERR_PTR(-EINVAL); + + /* default error is no buffer space */ + err = -ENOBUFS; + + /* RFC2367: + + Only SADB_SASTATE_MATURE SAs may be submitted in an SADB_ADD message. + SADB_SASTATE_LARVAL SAs are created by SADB_GETSPI and it is not + sensible to add a new SA in the DYING or SADB_SASTATE_DEAD state. + Therefore, the sadb_sa_state field of all submitted SAs MUST be + SADB_SASTATE_MATURE and the kernel MUST return an error if this is + not true. + + However, KAME setkey always uses SADB_SASTATE_LARVAL. + Hence, we have to _ignore_ sadb_sa_state, which is also reasonable. 
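+
+ The key checks below rely on the wire layout of SADB_EXT_KEY_AUTH and
+ SADB_EXT_KEY_ENCRYPT: an 8-byte sadb_key header followed by the raw key,
+ padded to a multiple of 8 bytes, with sadb_key_bits giving the exact key
+ size in bits and sadb_key_len the padded total in 64-bit units.  A sketch
+ of filling a well-formed extension (helper name is illustrative only):
+
+	void demo_fill_auth_key(struct sadb_key *k, const u8 *key,
+				unsigned int bits)
+	{
+		unsigned int bytes = (bits + 7) / 8;
+
+		k->sadb_key_len = (sizeof(*k) + bytes + 7) / 8;
+		k->sadb_key_exttype = SADB_EXT_KEY_AUTH;
+		k->sadb_key_bits = bits;
+		k->sadb_key_reserved = 0;
+		memcpy(k + 1, key, bytes);
+	}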
+ */ + if (sa->sadb_sa_auth > SADB_AALG_MAX || + (hdr->sadb_msg_satype == SADB_X_SATYPE_IPCOMP && + sa->sadb_sa_encrypt > SADB_X_CALG_MAX) || + sa->sadb_sa_encrypt > SADB_EALG_MAX) + return ERR_PTR(-EINVAL); + key = (const struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1]; + if (key != NULL && + sa->sadb_sa_auth != SADB_X_AALG_NULL && + ((key->sadb_key_bits+7) / 8 == 0 || + (key->sadb_key_bits+7) / 8 > key->sadb_key_len * sizeof(uint64_t))) + return ERR_PTR(-EINVAL); + key = ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]; + if (key != NULL && + sa->sadb_sa_encrypt != SADB_EALG_NULL && + ((key->sadb_key_bits+7) / 8 == 0 || + (key->sadb_key_bits+7) / 8 > key->sadb_key_len * sizeof(uint64_t))) + return ERR_PTR(-EINVAL); + + x = xfrm_state_alloc(net); + if (x == NULL) + return ERR_PTR(-ENOBUFS); + + x->id.proto = proto; + x->id.spi = sa->sadb_sa_spi; + x->props.replay_window = sa->sadb_sa_replay; + if (sa->sadb_sa_flags & SADB_SAFLAGS_NOECN) + x->props.flags |= XFRM_STATE_NOECN; + if (sa->sadb_sa_flags & SADB_SAFLAGS_DECAP_DSCP) + x->props.flags |= XFRM_STATE_DECAP_DSCP; + if (sa->sadb_sa_flags & SADB_SAFLAGS_NOPMTUDISC) + x->props.flags |= XFRM_STATE_NOPMTUDISC; + + lifetime = (const struct sadb_lifetime*) ext_hdrs[SADB_EXT_LIFETIME_HARD-1]; + if (lifetime != NULL) { + x->lft.hard_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); + x->lft.hard_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); + x->lft.hard_add_expires_seconds = lifetime->sadb_lifetime_addtime; + x->lft.hard_use_expires_seconds = lifetime->sadb_lifetime_usetime; + } + lifetime = (const struct sadb_lifetime*) ext_hdrs[SADB_EXT_LIFETIME_SOFT-1]; + if (lifetime != NULL) { + x->lft.soft_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); + x->lft.soft_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); + x->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime; + x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime; + } + + sec_ctx = (const struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1]; + if (sec_ctx != NULL) { + struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); + + if (!uctx) + goto out; + + err = security_xfrm_state_alloc(x, uctx); + kfree(uctx); + + if (err) + goto out; + } + + key = (const struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1]; + if (sa->sadb_sa_auth) { + int keysize = 0; + struct xfrm_algo_desc *a = xfrm_aalg_get_byid(sa->sadb_sa_auth); + if (!a) { + err = -ENOSYS; + goto out; + } + if (key) + keysize = (key->sadb_key_bits + 7) / 8; + x->aalg = kmalloc(sizeof(*x->aalg) + keysize, GFP_KERNEL); + if (!x->aalg) + goto out; + strcpy(x->aalg->alg_name, a->name); + x->aalg->alg_key_len = 0; + if (key) { + x->aalg->alg_key_len = key->sadb_key_bits; + memcpy(x->aalg->alg_key, key+1, keysize); + } + x->aalg->alg_trunc_len = a->uinfo.auth.icv_truncbits; + x->props.aalgo = sa->sadb_sa_auth; + /* x->algo.flags = sa->sadb_sa_flags; */ + } + if (sa->sadb_sa_encrypt) { + if (hdr->sadb_msg_satype == SADB_X_SATYPE_IPCOMP) { + struct xfrm_algo_desc *a = xfrm_calg_get_byid(sa->sadb_sa_encrypt); + if (!a) { + err = -ENOSYS; + goto out; + } + x->calg = kmalloc(sizeof(*x->calg), GFP_KERNEL); + if (!x->calg) + goto out; + strcpy(x->calg->alg_name, a->name); + x->props.calgo = sa->sadb_sa_encrypt; + } else { + int keysize = 0; + struct xfrm_algo_desc *a = xfrm_ealg_get_byid(sa->sadb_sa_encrypt); + if (!a) { + err = -ENOSYS; + goto out; + } + key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]; + if (key) + keysize = (key->sadb_key_bits + 7) / 8; + x->ealg = 
kmalloc(sizeof(*x->ealg) + keysize, GFP_KERNEL); + if (!x->ealg) + goto out; + strcpy(x->ealg->alg_name, a->name); + x->ealg->alg_key_len = 0; + if (key) { + x->ealg->alg_key_len = key->sadb_key_bits; + memcpy(x->ealg->alg_key, key+1, keysize); + } + x->props.ealgo = sa->sadb_sa_encrypt; + } + } + /* x->algo.flags = sa->sadb_sa_flags; */ + + x->props.family = pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_SRC-1], + &x->props.saddr); + if (!x->props.family) { + err = -EAFNOSUPPORT; + goto out; + } + pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_DST-1], + &x->id.daddr); + + if (ext_hdrs[SADB_X_EXT_SA2-1]) { + const struct sadb_x_sa2 *sa2 = ext_hdrs[SADB_X_EXT_SA2-1]; + int mode = pfkey_mode_to_xfrm(sa2->sadb_x_sa2_mode); + if (mode < 0) { + err = -EINVAL; + goto out; + } + x->props.mode = mode; + x->props.reqid = sa2->sadb_x_sa2_reqid; + } + + if (ext_hdrs[SADB_EXT_ADDRESS_PROXY-1]) { + const struct sadb_address *addr = ext_hdrs[SADB_EXT_ADDRESS_PROXY-1]; + + /* Nobody uses this, but we try. */ + x->sel.family = pfkey_sadb_addr2xfrm_addr(addr, &x->sel.saddr); + x->sel.prefixlen_s = addr->sadb_address_prefixlen; + } + + if (!x->sel.family) + x->sel.family = x->props.family; + + if (ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]) { + const struct sadb_x_nat_t_type* n_type; + struct xfrm_encap_tmpl *natt; + + x->encap = kmalloc(sizeof(*x->encap), GFP_KERNEL); + if (!x->encap) + goto out; + + natt = x->encap; + n_type = ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]; + natt->encap_type = n_type->sadb_x_nat_t_type_type; + + if (ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1]) { + const struct sadb_x_nat_t_port *n_port = + ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1]; + natt->encap_sport = n_port->sadb_x_nat_t_port_port; + } + if (ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]) { + const struct sadb_x_nat_t_port *n_port = + ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]; + natt->encap_dport = n_port->sadb_x_nat_t_port_port; + } + memset(&natt->encap_oa, 0, sizeof(natt->encap_oa)); + } + + err = xfrm_init_state(x); + if (err) + goto out; + + x->km.seq = hdr->sadb_msg_seq; + return x; + +out: + x->km.state = XFRM_STATE_DEAD; + xfrm_state_put(x); + return ERR_PTR(err); +} + +static int pfkey_reserved(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) +{ + return -EOPNOTSUPP; +} + +static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) +{ + struct net *net = sock_net(sk); + struct sk_buff *resp_skb; + struct sadb_x_sa2 *sa2; + struct sadb_address *saddr, *daddr; + struct sadb_msg *out_hdr; + struct sadb_spirange *range; + struct xfrm_state *x = NULL; + int mode; + int err; + u32 min_spi, max_spi; + u32 reqid; + u8 proto; + unsigned short family; + xfrm_address_t *xsaddr = NULL, *xdaddr = NULL; + + if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], + ext_hdrs[SADB_EXT_ADDRESS_DST-1])) + return -EINVAL; + + proto = pfkey_satype2proto(hdr->sadb_msg_satype); + if (proto == 0) + return -EINVAL; + + if ((sa2 = ext_hdrs[SADB_X_EXT_SA2-1]) != NULL) { + mode = pfkey_mode_to_xfrm(sa2->sadb_x_sa2_mode); + if (mode < 0) + return -EINVAL; + reqid = sa2->sadb_x_sa2_reqid; + } else { + mode = 0; + reqid = 0; + } + + saddr = ext_hdrs[SADB_EXT_ADDRESS_SRC-1]; + daddr = ext_hdrs[SADB_EXT_ADDRESS_DST-1]; + + family = ((struct sockaddr *)(saddr + 1))->sa_family; + switch (family) { + case AF_INET: + xdaddr = (xfrm_address_t *)&((struct sockaddr_in *)(daddr + 1))->sin_addr.s_addr; + xsaddr = (xfrm_address_t *)&((struct 
sockaddr_in *)(saddr + 1))->sin_addr.s_addr; + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + xdaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(daddr + 1))->sin6_addr; + xsaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(saddr + 1))->sin6_addr; + break; +#endif + } + + if (hdr->sadb_msg_seq) { + x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq); + if (x && xfrm_addr_cmp(&x->id.daddr, xdaddr, family)) { + xfrm_state_put(x); + x = NULL; + } + } + + if (!x) + x = xfrm_find_acq(net, &dummy_mark, mode, reqid, proto, xdaddr, xsaddr, 1, family); + + if (x == NULL) + return -ENOENT; + + min_spi = 0x100; + max_spi = 0x0fffffff; + + range = ext_hdrs[SADB_EXT_SPIRANGE-1]; + if (range) { + min_spi = range->sadb_spirange_min; + max_spi = range->sadb_spirange_max; + } + + err = xfrm_alloc_spi(x, min_spi, max_spi); + resp_skb = err ? ERR_PTR(err) : pfkey_xfrm_state2msg(x); + + if (IS_ERR(resp_skb)) { + xfrm_state_put(x); + return PTR_ERR(resp_skb); + } + + out_hdr = (struct sadb_msg *) resp_skb->data; + out_hdr->sadb_msg_version = hdr->sadb_msg_version; + out_hdr->sadb_msg_type = SADB_GETSPI; + out_hdr->sadb_msg_satype = pfkey_proto2satype(proto); + out_hdr->sadb_msg_errno = 0; + out_hdr->sadb_msg_reserved = 0; + out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; + out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; + + xfrm_state_put(x); + + pfkey_broadcast(resp_skb, GFP_KERNEL, BROADCAST_ONE, sk, net); + + return 0; +} + +static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) +{ + struct net *net = sock_net(sk); + struct xfrm_state *x; + + if (hdr->sadb_msg_len != sizeof(struct sadb_msg)/8) + return -EOPNOTSUPP; + + if (hdr->sadb_msg_seq == 0 || hdr->sadb_msg_errno == 0) + return 0; + + x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq); + if (x == NULL) + return 0; + + spin_lock_bh(&x->lock); + if (x->km.state == XFRM_STATE_ACQ) { + x->km.state = XFRM_STATE_ERROR; + wake_up(&net->xfrm.km_waitq); + } + spin_unlock_bh(&x->lock); + xfrm_state_put(x); + return 0; +} + +static inline int event2poltype(int event) +{ + switch (event) { + case XFRM_MSG_DELPOLICY: + return SADB_X_SPDDELETE; + case XFRM_MSG_NEWPOLICY: + return SADB_X_SPDADD; + case XFRM_MSG_UPDPOLICY: + return SADB_X_SPDUPDATE; + case XFRM_MSG_POLEXPIRE: + // return SADB_X_SPDEXPIRE; + default: + pr_err("pfkey: Unknown policy event %d\n", event); + break; + } + + return 0; +} + +static inline int event2keytype(int event) +{ + switch (event) { + case XFRM_MSG_DELSA: + return SADB_DELETE; + case XFRM_MSG_NEWSA: + return SADB_ADD; + case XFRM_MSG_UPDSA: + return SADB_UPDATE; + case XFRM_MSG_EXPIRE: + return SADB_EXPIRE; + default: + pr_err("pfkey: Unknown SA event %d\n", event); + break; + } + + return 0; +} + +/* ADD/UPD/DEL */ +static int key_notify_sa(struct xfrm_state *x, const struct km_event *c) +{ + struct sk_buff *skb; + struct sadb_msg *hdr; + + skb = pfkey_xfrm_state2msg(x); + + if (IS_ERR(skb)) + return PTR_ERR(skb); + + hdr = (struct sadb_msg *) skb->data; + hdr->sadb_msg_version = PF_KEY_V2; + hdr->sadb_msg_type = event2keytype(c->event); + hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); + hdr->sadb_msg_errno = 0; + hdr->sadb_msg_reserved = 0; + hdr->sadb_msg_seq = c->seq; + hdr->sadb_msg_pid = c->pid; + + pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xs_net(x)); + + return 0; +} + +static int pfkey_add(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) +{ + struct net *net = 
sock_net(sk); + struct xfrm_state *x; + int err; + struct km_event c; + + x = pfkey_msg2xfrm_state(net, hdr, ext_hdrs); + if (IS_ERR(x)) + return PTR_ERR(x); + + xfrm_state_hold(x); + if (hdr->sadb_msg_type == SADB_ADD) + err = xfrm_state_add(x); + else + err = xfrm_state_update(x); + + xfrm_audit_state_add(x, err ? 0 : 1, + audit_get_loginuid(current), + audit_get_sessionid(current), 0); + + if (err < 0) { + x->km.state = XFRM_STATE_DEAD; + __xfrm_state_put(x); + goto out; + } + + if (hdr->sadb_msg_type == SADB_ADD) + c.event = XFRM_MSG_NEWSA; + else + c.event = XFRM_MSG_UPDSA; + c.seq = hdr->sadb_msg_seq; + c.pid = hdr->sadb_msg_pid; + km_state_notify(x, &c); +out: + xfrm_state_put(x); + return err; +} + +static int pfkey_delete(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) +{ + struct net *net = sock_net(sk); + struct xfrm_state *x; + struct km_event c; + int err; + + if (!ext_hdrs[SADB_EXT_SA-1] || + !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], + ext_hdrs[SADB_EXT_ADDRESS_DST-1])) + return -EINVAL; + + x = pfkey_xfrm_state_lookup(net, hdr, ext_hdrs); + if (x == NULL) + return -ESRCH; + + if ((err = security_xfrm_state_delete(x))) + goto out; + + if (xfrm_state_kern(x)) { + err = -EPERM; + goto out; + } + + err = xfrm_state_delete(x); + + if (err < 0) + goto out; + + c.seq = hdr->sadb_msg_seq; + c.pid = hdr->sadb_msg_pid; + c.event = XFRM_MSG_DELSA; + km_state_notify(x, &c); +out: + xfrm_audit_state_delete(x, err ? 0 : 1, + audit_get_loginuid(current), + audit_get_sessionid(current), 0); + xfrm_state_put(x); + + return err; +} + +static int pfkey_get(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) +{ + struct net *net = sock_net(sk); + __u8 proto; + struct sk_buff *out_skb; + struct sadb_msg *out_hdr; + struct xfrm_state *x; + + if (!ext_hdrs[SADB_EXT_SA-1] || + !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], + ext_hdrs[SADB_EXT_ADDRESS_DST-1])) + return -EINVAL; + + x = pfkey_xfrm_state_lookup(net, hdr, ext_hdrs); + if (x == NULL) + return -ESRCH; + + out_skb = pfkey_xfrm_state2msg(x); + proto = x->id.proto; + xfrm_state_put(x); + if (IS_ERR(out_skb)) + return PTR_ERR(out_skb); + + out_hdr = (struct sadb_msg *) out_skb->data; + out_hdr->sadb_msg_version = hdr->sadb_msg_version; + out_hdr->sadb_msg_type = SADB_GET; + out_hdr->sadb_msg_satype = pfkey_proto2satype(proto); + out_hdr->sadb_msg_errno = 0; + out_hdr->sadb_msg_reserved = 0; + out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; + out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk)); + + return 0; +} + +static struct sk_buff *compose_sadb_supported(const struct sadb_msg *orig, + gfp_t allocation) +{ + struct sk_buff *skb; + struct sadb_msg *hdr; + int len, auth_len, enc_len, i; + + auth_len = xfrm_count_auth_supported(); + if (auth_len) { + auth_len *= sizeof(struct sadb_alg); + auth_len += sizeof(struct sadb_supported); + } + + enc_len = xfrm_count_enc_supported(); + if (enc_len) { + enc_len *= sizeof(struct sadb_alg); + enc_len += sizeof(struct sadb_supported); + } + + len = enc_len + auth_len + sizeof(struct sadb_msg); + + skb = alloc_skb(len + 16, allocation); + if (!skb) + goto out_put_algs; + + hdr = (struct sadb_msg *) skb_put(skb, sizeof(*hdr)); + pfkey_hdr_dup(hdr, orig); + hdr->sadb_msg_errno = 0; + hdr->sadb_msg_len = len / sizeof(uint64_t); + + if (auth_len) { + struct sadb_supported *sp; + struct sadb_alg *ap; + + sp = (struct sadb_supported *) 
skb_put(skb, auth_len); + ap = (struct sadb_alg *) (sp + 1); + + sp->sadb_supported_len = auth_len / sizeof(uint64_t); + sp->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH; + + for (i = 0; ; i++) { + struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); + if (!aalg) + break; + if (aalg->available) + *ap++ = aalg->desc; + } + } + + if (enc_len) { + struct sadb_supported *sp; + struct sadb_alg *ap; + + sp = (struct sadb_supported *) skb_put(skb, enc_len); + ap = (struct sadb_alg *) (sp + 1); + + sp->sadb_supported_len = enc_len / sizeof(uint64_t); + sp->sadb_supported_exttype = SADB_EXT_SUPPORTED_ENCRYPT; + + for (i = 0; ; i++) { + struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i); + if (!ealg) + break; + if (ealg->available) + *ap++ = ealg->desc; + } + } + +out_put_algs: + return skb; +} + +static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) +{ + struct pfkey_sock *pfk = pfkey_sk(sk); + struct sk_buff *supp_skb; + + if (hdr->sadb_msg_satype > SADB_SATYPE_MAX) + return -EINVAL; + + if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC) { + if (pfk->registered&(1<sadb_msg_satype)) + return -EEXIST; + pfk->registered |= (1<sadb_msg_satype); + } + + xfrm_probe_algs(); + + supp_skb = compose_sadb_supported(hdr, GFP_KERNEL); + if (!supp_skb) { + if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC) + pfk->registered &= ~(1<sadb_msg_satype); + + return -ENOBUFS; + } + + pfkey_broadcast(supp_skb, GFP_KERNEL, BROADCAST_REGISTERED, sk, sock_net(sk)); + + return 0; +} + +static int unicast_flush_resp(struct sock *sk, const struct sadb_msg *ihdr) +{ + struct sk_buff *skb; + struct sadb_msg *hdr; + + skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC); + if (!skb) + return -ENOBUFS; + + hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg)); + memcpy(hdr, ihdr, sizeof(struct sadb_msg)); + hdr->sadb_msg_errno = (uint8_t) 0; + hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); + + return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk)); +} + +static int key_notify_sa_flush(const struct km_event *c) +{ + struct sk_buff *skb; + struct sadb_msg *hdr; + + skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC); + if (!skb) + return -ENOBUFS; + hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg)); + hdr->sadb_msg_satype = pfkey_proto2satype(c->data.proto); + hdr->sadb_msg_type = SADB_FLUSH; + hdr->sadb_msg_seq = c->seq; + hdr->sadb_msg_pid = c->pid; + hdr->sadb_msg_version = PF_KEY_V2; + hdr->sadb_msg_errno = (uint8_t) 0; + hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); + + pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); + + return 0; +} + +static int pfkey_flush(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) +{ + struct net *net = sock_net(sk); + unsigned proto; + struct km_event c; + struct xfrm_audit audit_info; + int err, err2; + + proto = pfkey_satype2proto(hdr->sadb_msg_satype); + if (proto == 0) + return -EINVAL; + + audit_info.loginuid = audit_get_loginuid(current); + audit_info.sessionid = audit_get_sessionid(current); + audit_info.secid = 0; + err = xfrm_state_flush(net, proto, &audit_info); + err2 = unicast_flush_resp(sk, hdr); + if (err || err2) { + if (err == -ESRCH) /* empty table - go quietly */ + err = 0; + return err ? 
err : err2; + } + + c.data.proto = proto; + c.seq = hdr->sadb_msg_seq; + c.pid = hdr->sadb_msg_pid; + c.event = XFRM_MSG_FLUSHSA; + c.net = net; + km_state_notify(NULL, &c); + + return 0; +} + +static int dump_sa(struct xfrm_state *x, int count, void *ptr) +{ + struct pfkey_sock *pfk = ptr; + struct sk_buff *out_skb; + struct sadb_msg *out_hdr; + + if (!pfkey_can_dump(&pfk->sk)) + return -ENOBUFS; + + out_skb = pfkey_xfrm_state2msg(x); + if (IS_ERR(out_skb)) + return PTR_ERR(out_skb); + + out_hdr = (struct sadb_msg *) out_skb->data; + out_hdr->sadb_msg_version = pfk->dump.msg_version; + out_hdr->sadb_msg_type = SADB_DUMP; + out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); + out_hdr->sadb_msg_errno = 0; + out_hdr->sadb_msg_reserved = 0; + out_hdr->sadb_msg_seq = count + 1; + out_hdr->sadb_msg_pid = pfk->dump.msg_pid; + + if (pfk->dump.skb) + pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, + &pfk->sk, sock_net(&pfk->sk)); + pfk->dump.skb = out_skb; + + return 0; +} + +static int pfkey_dump_sa(struct pfkey_sock *pfk) +{ + struct net *net = sock_net(&pfk->sk); + return xfrm_state_walk(net, &pfk->dump.u.state, dump_sa, (void *) pfk); +} + +static void pfkey_dump_sa_done(struct pfkey_sock *pfk) +{ + xfrm_state_walk_done(&pfk->dump.u.state); +} + +static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) +{ + u8 proto; + struct pfkey_sock *pfk = pfkey_sk(sk); + + if (pfk->dump.dump != NULL) + return -EBUSY; + + proto = pfkey_satype2proto(hdr->sadb_msg_satype); + if (proto == 0) + return -EINVAL; + + pfk->dump.msg_version = hdr->sadb_msg_version; + pfk->dump.msg_pid = hdr->sadb_msg_pid; + pfk->dump.dump = pfkey_dump_sa; + pfk->dump.done = pfkey_dump_sa_done; + xfrm_state_walk_init(&pfk->dump.u.state, proto); + + return pfkey_do_dump(pfk); +} + +static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) +{ + struct pfkey_sock *pfk = pfkey_sk(sk); + int satype = hdr->sadb_msg_satype; + bool reset_errno = false; + + if (hdr->sadb_msg_len == (sizeof(*hdr) / sizeof(uint64_t))) { + reset_errno = true; + if (satype != 0 && satype != 1) + return -EINVAL; + pfk->promisc = satype; + } + if (reset_errno && skb_cloned(skb)) + skb = skb_copy(skb, GFP_KERNEL); + else + skb = skb_clone(skb, GFP_KERNEL); + + if (reset_errno && skb) { + struct sadb_msg *new_hdr = (struct sadb_msg *) skb->data; + new_hdr->sadb_msg_errno = 0; + } + + pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ALL, NULL, sock_net(sk)); + return 0; +} + +static int check_reqid(struct xfrm_policy *xp, int dir, int count, void *ptr) +{ + int i; + u32 reqid = *(u32*)ptr; + + for (i=0; ixfrm_nr; i++) { + if (xp->xfrm_vec[i].reqid == reqid) + return -EEXIST; + } + return 0; +} + +static u32 gen_reqid(struct net *net) +{ + struct xfrm_policy_walk walk; + u32 start; + int rc; + static u32 reqid = IPSEC_MANUAL_REQID_MAX; + + start = reqid; + do { + ++reqid; + if (reqid == 0) + reqid = IPSEC_MANUAL_REQID_MAX+1; + xfrm_policy_walk_init(&walk, XFRM_POLICY_TYPE_MAIN); + rc = xfrm_policy_walk(net, &walk, check_reqid, (void*)&reqid); + xfrm_policy_walk_done(&walk); + if (rc != -EEXIST) + return reqid; + } while (reqid != start); + return 0; +} + +static int +parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_ipsecrequest *rq) +{ + struct net *net = xp_net(xp); + struct xfrm_tmpl *t = xp->xfrm_vec + xp->xfrm_nr; + int mode; + + if (xp->xfrm_nr >= XFRM_MAX_DEPTH) + return -ELOOP; + + if (rq->sadb_x_ipsecrequest_mode == 0) + return 
-EINVAL; + + t->id.proto = rq->sadb_x_ipsecrequest_proto; /* XXX check proto */ + if ((mode = pfkey_mode_to_xfrm(rq->sadb_x_ipsecrequest_mode)) < 0) + return -EINVAL; + t->mode = mode; + if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_USE) + t->optional = 1; + else if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_UNIQUE) { + t->reqid = rq->sadb_x_ipsecrequest_reqid; + if (t->reqid > IPSEC_MANUAL_REQID_MAX) + t->reqid = 0; + if (!t->reqid && !(t->reqid = gen_reqid(net))) + return -ENOBUFS; + } + + /* addresses present only in tunnel mode */ + if (t->mode == XFRM_MODE_TUNNEL) { + u8 *sa = (u8 *) (rq + 1); + int family, socklen; + + family = pfkey_sockaddr_extract((struct sockaddr *)sa, + &t->saddr); + if (!family) + return -EINVAL; + + socklen = pfkey_sockaddr_len(family); + if (pfkey_sockaddr_extract((struct sockaddr *)(sa + socklen), + &t->id.daddr) != family) + return -EINVAL; + t->encap_family = family; + } else + t->encap_family = xp->family; + + /* No way to set this via kame pfkey */ + t->allalgs = 1; + xp->xfrm_nr++; + return 0; +} + +static int +parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol) +{ + int err; + int len = pol->sadb_x_policy_len*8 - sizeof(struct sadb_x_policy); + struct sadb_x_ipsecrequest *rq = (void*)(pol+1); + + while (len >= sizeof(struct sadb_x_ipsecrequest)) { + if ((err = parse_ipsecrequest(xp, rq)) < 0) + return err; + len -= rq->sadb_x_ipsecrequest_len; + rq = (void*)((u8*)rq + rq->sadb_x_ipsecrequest_len); + } + return 0; +} + +static inline int pfkey_xfrm_policy2sec_ctx_size(const struct xfrm_policy *xp) +{ + struct xfrm_sec_ctx *xfrm_ctx = xp->security; + + if (xfrm_ctx) { + int len = sizeof(struct sadb_x_sec_ctx); + len += xfrm_ctx->ctx_len; + return PFKEY_ALIGN8(len); + } + return 0; +} + +static int pfkey_xfrm_policy2msg_size(const struct xfrm_policy *xp) +{ + const struct xfrm_tmpl *t; + int sockaddr_size = pfkey_sockaddr_size(xp->family); + int socklen = 0; + int i; + + for (i=0; ixfrm_nr; i++) { + t = xp->xfrm_vec + i; + socklen += pfkey_sockaddr_len(t->encap_family); + } + + return sizeof(struct sadb_msg) + + (sizeof(struct sadb_lifetime) * 3) + + (sizeof(struct sadb_address) * 2) + + (sockaddr_size * 2) + + sizeof(struct sadb_x_policy) + + (xp->xfrm_nr * sizeof(struct sadb_x_ipsecrequest)) + + (socklen * 2) + + pfkey_xfrm_policy2sec_ctx_size(xp); +} + +static struct sk_buff * pfkey_xfrm_policy2msg_prep(const struct xfrm_policy *xp) +{ + struct sk_buff *skb; + int size; + + size = pfkey_xfrm_policy2msg_size(xp); + + skb = alloc_skb(size + 16, GFP_ATOMIC); + if (skb == NULL) + return ERR_PTR(-ENOBUFS); + + return skb; +} + +static int pfkey_xfrm_policy2msg(struct sk_buff *skb, const struct xfrm_policy *xp, int dir) +{ + struct sadb_msg *hdr; + struct sadb_address *addr; + struct sadb_lifetime *lifetime; + struct sadb_x_policy *pol; + struct sadb_x_sec_ctx *sec_ctx; + struct xfrm_sec_ctx *xfrm_ctx; + int i; + int size; + int sockaddr_size = pfkey_sockaddr_size(xp->family); + int socklen = pfkey_sockaddr_len(xp->family); + + size = pfkey_xfrm_policy2msg_size(xp); + + /* call should fill header later */ + hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg)); + memset(hdr, 0, size); /* XXX do we need this ? 
*/ + + /* src address */ + addr = (struct sadb_address*) skb_put(skb, + sizeof(struct sadb_address)+sockaddr_size); + addr->sadb_address_len = + (sizeof(struct sadb_address)+sockaddr_size)/ + sizeof(uint64_t); + addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; + addr->sadb_address_proto = pfkey_proto_from_xfrm(xp->selector.proto); + addr->sadb_address_prefixlen = xp->selector.prefixlen_s; + addr->sadb_address_reserved = 0; + if (!pfkey_sockaddr_fill(&xp->selector.saddr, + xp->selector.sport, + (struct sockaddr *) (addr + 1), + xp->family)) + BUG(); + + /* dst address */ + addr = (struct sadb_address*) skb_put(skb, + sizeof(struct sadb_address)+sockaddr_size); + addr->sadb_address_len = + (sizeof(struct sadb_address)+sockaddr_size)/ + sizeof(uint64_t); + addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; + addr->sadb_address_proto = pfkey_proto_from_xfrm(xp->selector.proto); + addr->sadb_address_prefixlen = xp->selector.prefixlen_d; + addr->sadb_address_reserved = 0; + + pfkey_sockaddr_fill(&xp->selector.daddr, xp->selector.dport, + (struct sockaddr *) (addr + 1), + xp->family); + + /* hard time */ + lifetime = (struct sadb_lifetime *) skb_put(skb, + sizeof(struct sadb_lifetime)); + lifetime->sadb_lifetime_len = + sizeof(struct sadb_lifetime)/sizeof(uint64_t); + lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD; + lifetime->sadb_lifetime_allocations = _X2KEY(xp->lft.hard_packet_limit); + lifetime->sadb_lifetime_bytes = _X2KEY(xp->lft.hard_byte_limit); + lifetime->sadb_lifetime_addtime = xp->lft.hard_add_expires_seconds; + lifetime->sadb_lifetime_usetime = xp->lft.hard_use_expires_seconds; + /* soft time */ + lifetime = (struct sadb_lifetime *) skb_put(skb, + sizeof(struct sadb_lifetime)); + lifetime->sadb_lifetime_len = + sizeof(struct sadb_lifetime)/sizeof(uint64_t); + lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT; + lifetime->sadb_lifetime_allocations = _X2KEY(xp->lft.soft_packet_limit); + lifetime->sadb_lifetime_bytes = _X2KEY(xp->lft.soft_byte_limit); + lifetime->sadb_lifetime_addtime = xp->lft.soft_add_expires_seconds; + lifetime->sadb_lifetime_usetime = xp->lft.soft_use_expires_seconds; + /* current time */ + lifetime = (struct sadb_lifetime *) skb_put(skb, + sizeof(struct sadb_lifetime)); + lifetime->sadb_lifetime_len = + sizeof(struct sadb_lifetime)/sizeof(uint64_t); + lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT; + lifetime->sadb_lifetime_allocations = xp->curlft.packets; + lifetime->sadb_lifetime_bytes = xp->curlft.bytes; + lifetime->sadb_lifetime_addtime = xp->curlft.add_time; + lifetime->sadb_lifetime_usetime = xp->curlft.use_time; + + pol = (struct sadb_x_policy *) skb_put(skb, sizeof(struct sadb_x_policy)); + pol->sadb_x_policy_len = sizeof(struct sadb_x_policy)/sizeof(uint64_t); + pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY; + pol->sadb_x_policy_type = IPSEC_POLICY_DISCARD; + if (xp->action == XFRM_POLICY_ALLOW) { + if (xp->xfrm_nr) + pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC; + else + pol->sadb_x_policy_type = IPSEC_POLICY_NONE; + } + pol->sadb_x_policy_dir = dir+1; + pol->sadb_x_policy_id = xp->index; + pol->sadb_x_policy_priority = xp->priority; + + for (i=0; ixfrm_nr; i++) { + const struct xfrm_tmpl *t = xp->xfrm_vec + i; + struct sadb_x_ipsecrequest *rq; + int req_size; + int mode; + + req_size = sizeof(struct sadb_x_ipsecrequest); + if (t->mode == XFRM_MODE_TUNNEL) { + socklen = pfkey_sockaddr_len(t->encap_family); + req_size += socklen * 2; + } else { + size -= 2*socklen; + } + rq = (void*)skb_put(skb, req_size); + 
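+ /* One sadb_x_ipsecrequest per template: add its size to the policy extension length, copy proto/mode/level/reqid from the template, and for tunnel mode append the tunnel endpoint addresses below. */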
pol->sadb_x_policy_len += req_size/8; + memset(rq, 0, sizeof(*rq)); + rq->sadb_x_ipsecrequest_len = req_size; + rq->sadb_x_ipsecrequest_proto = t->id.proto; + if ((mode = pfkey_mode_from_xfrm(t->mode)) < 0) + return -EINVAL; + rq->sadb_x_ipsecrequest_mode = mode; + rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_REQUIRE; + if (t->reqid) + rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_UNIQUE; + if (t->optional) + rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_USE; + rq->sadb_x_ipsecrequest_reqid = t->reqid; + + if (t->mode == XFRM_MODE_TUNNEL) { + u8 *sa = (void *)(rq + 1); + pfkey_sockaddr_fill(&t->saddr, 0, + (struct sockaddr *)sa, + t->encap_family); + pfkey_sockaddr_fill(&t->id.daddr, 0, + (struct sockaddr *) (sa + socklen), + t->encap_family); + } + } + + /* security context */ + if ((xfrm_ctx = xp->security)) { + int ctx_size = pfkey_xfrm_policy2sec_ctx_size(xp); + + sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb, ctx_size); + sec_ctx->sadb_x_sec_len = ctx_size / sizeof(uint64_t); + sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; + sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; + sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; + sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; + memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, + xfrm_ctx->ctx_len); + } + + hdr->sadb_msg_len = size / sizeof(uint64_t); + hdr->sadb_msg_reserved = atomic_read(&xp->refcnt); + + return 0; +} + +static int key_notify_policy(struct xfrm_policy *xp, int dir, const struct km_event *c) +{ + struct sk_buff *out_skb; + struct sadb_msg *out_hdr; + int err; + + out_skb = pfkey_xfrm_policy2msg_prep(xp); + if (IS_ERR(out_skb)) + return PTR_ERR(out_skb); + + err = pfkey_xfrm_policy2msg(out_skb, xp, dir); + if (err < 0) + return err; + + out_hdr = (struct sadb_msg *) out_skb->data; + out_hdr->sadb_msg_version = PF_KEY_V2; + + if (c->data.byid && c->event == XFRM_MSG_DELPOLICY) + out_hdr->sadb_msg_type = SADB_X_SPDDELETE2; + else + out_hdr->sadb_msg_type = event2poltype(c->event); + out_hdr->sadb_msg_errno = 0; + out_hdr->sadb_msg_seq = c->seq; + out_hdr->sadb_msg_pid = c->pid; + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xp_net(xp)); + return 0; + +} + +static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) +{ + struct net *net = sock_net(sk); + int err = 0; + struct sadb_lifetime *lifetime; + struct sadb_address *sa; + struct sadb_x_policy *pol; + struct xfrm_policy *xp; + struct km_event c; + struct sadb_x_sec_ctx *sec_ctx; + + if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], + ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || + !ext_hdrs[SADB_X_EXT_POLICY-1]) + return -EINVAL; + + pol = ext_hdrs[SADB_X_EXT_POLICY-1]; + if (pol->sadb_x_policy_type > IPSEC_POLICY_IPSEC) + return -EINVAL; + if (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir >= IPSEC_DIR_MAX) + return -EINVAL; + + xp = xfrm_policy_alloc(net, GFP_KERNEL); + if (xp == NULL) + return -ENOBUFS; + + xp->action = (pol->sadb_x_policy_type == IPSEC_POLICY_DISCARD ? 
+ XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW); + xp->priority = pol->sadb_x_policy_priority; + + sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1], + xp->family = pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.saddr); + if (!xp->family) { + err = -EINVAL; + goto out; + } + xp->selector.family = xp->family; + xp->selector.prefixlen_s = sa->sadb_address_prefixlen; + xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); + xp->selector.sport = ((struct sockaddr_in *)(sa+1))->sin_port; + if (xp->selector.sport) + xp->selector.sport_mask = htons(0xffff); + + sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1], + pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.daddr); + xp->selector.prefixlen_d = sa->sadb_address_prefixlen; + + /* Amusing, we set this twice. KAME apps appear to set same value + * in both addresses. + */ + xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); + + xp->selector.dport = ((struct sockaddr_in *)(sa+1))->sin_port; + if (xp->selector.dport) + xp->selector.dport_mask = htons(0xffff); + + sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1]; + if (sec_ctx != NULL) { + struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); + + if (!uctx) { + err = -ENOBUFS; + goto out; + } + + err = security_xfrm_policy_alloc(&xp->security, uctx); + kfree(uctx); + + if (err) + goto out; + } + + xp->lft.soft_byte_limit = XFRM_INF; + xp->lft.hard_byte_limit = XFRM_INF; + xp->lft.soft_packet_limit = XFRM_INF; + xp->lft.hard_packet_limit = XFRM_INF; + if ((lifetime = ext_hdrs[SADB_EXT_LIFETIME_HARD-1]) != NULL) { + xp->lft.hard_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); + xp->lft.hard_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); + xp->lft.hard_add_expires_seconds = lifetime->sadb_lifetime_addtime; + xp->lft.hard_use_expires_seconds = lifetime->sadb_lifetime_usetime; + } + if ((lifetime = ext_hdrs[SADB_EXT_LIFETIME_SOFT-1]) != NULL) { + xp->lft.soft_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); + xp->lft.soft_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); + xp->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime; + xp->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime; + } + xp->xfrm_nr = 0; + if (pol->sadb_x_policy_type == IPSEC_POLICY_IPSEC && + (err = parse_ipsecrequests(xp, pol)) < 0) + goto out; + + err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp, + hdr->sadb_msg_type != SADB_X_SPDUPDATE); + + xfrm_audit_policy_add(xp, err ? 
0 : 1, + audit_get_loginuid(current), + audit_get_sessionid(current), 0); + + if (err) + goto out; + + if (hdr->sadb_msg_type == SADB_X_SPDUPDATE) + c.event = XFRM_MSG_UPDPOLICY; + else + c.event = XFRM_MSG_NEWPOLICY; + + c.seq = hdr->sadb_msg_seq; + c.pid = hdr->sadb_msg_pid; + + km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c); + xfrm_pol_put(xp); + return 0; + +out: + xp->walk.dead = 1; + xfrm_policy_destroy(xp); + return err; +} + +static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) +{ + struct net *net = sock_net(sk); + int err; + struct sadb_address *sa; + struct sadb_x_policy *pol; + struct xfrm_policy *xp; + struct xfrm_selector sel; + struct km_event c; + struct sadb_x_sec_ctx *sec_ctx; + struct xfrm_sec_ctx *pol_ctx = NULL; + + if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], + ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || + !ext_hdrs[SADB_X_EXT_POLICY-1]) + return -EINVAL; + + pol = ext_hdrs[SADB_X_EXT_POLICY-1]; + if (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir >= IPSEC_DIR_MAX) + return -EINVAL; + + memset(&sel, 0, sizeof(sel)); + + sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1], + sel.family = pfkey_sadb_addr2xfrm_addr(sa, &sel.saddr); + sel.prefixlen_s = sa->sadb_address_prefixlen; + sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); + sel.sport = ((struct sockaddr_in *)(sa+1))->sin_port; + if (sel.sport) + sel.sport_mask = htons(0xffff); + + sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1], + pfkey_sadb_addr2xfrm_addr(sa, &sel.daddr); + sel.prefixlen_d = sa->sadb_address_prefixlen; + sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); + sel.dport = ((struct sockaddr_in *)(sa+1))->sin_port; + if (sel.dport) + sel.dport_mask = htons(0xffff); + + sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1]; + if (sec_ctx != NULL) { + struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); + + if (!uctx) + return -ENOMEM; + + err = security_xfrm_policy_alloc(&pol_ctx, uctx); + kfree(uctx); + if (err) + return err; + } + + xp = xfrm_policy_bysel_ctx(net, DUMMY_MARK, XFRM_POLICY_TYPE_MAIN, + pol->sadb_x_policy_dir - 1, &sel, pol_ctx, + 1, &err); + security_xfrm_policy_free(pol_ctx); + if (xp == NULL) + return -ENOENT; + + xfrm_audit_policy_delete(xp, err ? 
0 : 1, + audit_get_loginuid(current), + audit_get_sessionid(current), 0); + + if (err) + goto out; + + c.seq = hdr->sadb_msg_seq; + c.pid = hdr->sadb_msg_pid; + c.data.byid = 0; + c.event = XFRM_MSG_DELPOLICY; + km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c); + +out: + xfrm_pol_put(xp); + return err; +} + +static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, const struct sadb_msg *hdr, int dir) +{ + int err; + struct sk_buff *out_skb; + struct sadb_msg *out_hdr; + err = 0; + + out_skb = pfkey_xfrm_policy2msg_prep(xp); + if (IS_ERR(out_skb)) { + err = PTR_ERR(out_skb); + goto out; + } + err = pfkey_xfrm_policy2msg(out_skb, xp, dir); + if (err < 0) + goto out; + + out_hdr = (struct sadb_msg *) out_skb->data; + out_hdr->sadb_msg_version = hdr->sadb_msg_version; + out_hdr->sadb_msg_type = hdr->sadb_msg_type; + out_hdr->sadb_msg_satype = 0; + out_hdr->sadb_msg_errno = 0; + out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; + out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, xp_net(xp)); + err = 0; + +out: + return err; +} + +#ifdef CONFIG_NET_KEY_MIGRATE +static int pfkey_sockaddr_pair_size(sa_family_t family) +{ + return PFKEY_ALIGN8(pfkey_sockaddr_len(family) * 2); +} + +static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len, + xfrm_address_t *saddr, xfrm_address_t *daddr, + u16 *family) +{ + int af, socklen; + + if (ext_len < pfkey_sockaddr_pair_size(sa->sa_family)) + return -EINVAL; + + af = pfkey_sockaddr_extract(sa, saddr); + if (!af) + return -EINVAL; + + socklen = pfkey_sockaddr_len(af); + if (pfkey_sockaddr_extract((struct sockaddr *) (((u8 *)sa) + socklen), + daddr) != af) + return -EINVAL; + + *family = af; + return 0; +} + +static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len, + struct xfrm_migrate *m) +{ + int err; + struct sadb_x_ipsecrequest *rq2; + int mode; + + if (len <= sizeof(struct sadb_x_ipsecrequest) || + len < rq1->sadb_x_ipsecrequest_len) + return -EINVAL; + + /* old endoints */ + err = parse_sockaddr_pair((struct sockaddr *)(rq1 + 1), + rq1->sadb_x_ipsecrequest_len, + &m->old_saddr, &m->old_daddr, + &m->old_family); + if (err) + return err; + + rq2 = (struct sadb_x_ipsecrequest *)((u8 *)rq1 + rq1->sadb_x_ipsecrequest_len); + len -= rq1->sadb_x_ipsecrequest_len; + + if (len <= sizeof(struct sadb_x_ipsecrequest) || + len < rq2->sadb_x_ipsecrequest_len) + return -EINVAL; + + /* new endpoints */ + err = parse_sockaddr_pair((struct sockaddr *)(rq2 + 1), + rq2->sadb_x_ipsecrequest_len, + &m->new_saddr, &m->new_daddr, + &m->new_family); + if (err) + return err; + + if (rq1->sadb_x_ipsecrequest_proto != rq2->sadb_x_ipsecrequest_proto || + rq1->sadb_x_ipsecrequest_mode != rq2->sadb_x_ipsecrequest_mode || + rq1->sadb_x_ipsecrequest_reqid != rq2->sadb_x_ipsecrequest_reqid) + return -EINVAL; + + m->proto = rq1->sadb_x_ipsecrequest_proto; + if ((mode = pfkey_mode_to_xfrm(rq1->sadb_x_ipsecrequest_mode)) < 0) + return -EINVAL; + m->mode = mode; + m->reqid = rq1->sadb_x_ipsecrequest_reqid; + + return ((int)(rq1->sadb_x_ipsecrequest_len + + rq2->sadb_x_ipsecrequest_len)); +} + +static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, + const struct sadb_msg *hdr, void * const *ext_hdrs) +{ + int i, len, ret, err = -EINVAL; + u8 dir; + struct sadb_address *sa; + struct sadb_x_kmaddress *kma; + struct sadb_x_policy *pol; + struct sadb_x_ipsecrequest *rq; + struct xfrm_selector sel; + struct xfrm_migrate m[XFRM_MAX_DEPTH]; + struct xfrm_kmaddress k; + + if 
(!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC - 1], + ext_hdrs[SADB_EXT_ADDRESS_DST - 1]) || + !ext_hdrs[SADB_X_EXT_POLICY - 1]) { + err = -EINVAL; + goto out; + } + + kma = ext_hdrs[SADB_X_EXT_KMADDRESS - 1]; + pol = ext_hdrs[SADB_X_EXT_POLICY - 1]; + + if (pol->sadb_x_policy_dir >= IPSEC_DIR_MAX) { + err = -EINVAL; + goto out; + } + + if (kma) { + /* convert sadb_x_kmaddress to xfrm_kmaddress */ + k.reserved = kma->sadb_x_kmaddress_reserved; + ret = parse_sockaddr_pair((struct sockaddr *)(kma + 1), + 8*(kma->sadb_x_kmaddress_len) - sizeof(*kma), + &k.local, &k.remote, &k.family); + if (ret < 0) { + err = ret; + goto out; + } + } + + dir = pol->sadb_x_policy_dir - 1; + memset(&sel, 0, sizeof(sel)); + + /* set source address info of selector */ + sa = ext_hdrs[SADB_EXT_ADDRESS_SRC - 1]; + sel.family = pfkey_sadb_addr2xfrm_addr(sa, &sel.saddr); + sel.prefixlen_s = sa->sadb_address_prefixlen; + sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); + sel.sport = ((struct sockaddr_in *)(sa + 1))->sin_port; + if (sel.sport) + sel.sport_mask = htons(0xffff); + + /* set destination address info of selector */ + sa = ext_hdrs[SADB_EXT_ADDRESS_DST - 1], + pfkey_sadb_addr2xfrm_addr(sa, &sel.daddr); + sel.prefixlen_d = sa->sadb_address_prefixlen; + sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); + sel.dport = ((struct sockaddr_in *)(sa + 1))->sin_port; + if (sel.dport) + sel.dport_mask = htons(0xffff); + + rq = (struct sadb_x_ipsecrequest *)(pol + 1); + + /* extract ipsecrequests */ + i = 0; + len = pol->sadb_x_policy_len * 8 - sizeof(struct sadb_x_policy); + + while (len > 0 && i < XFRM_MAX_DEPTH) { + ret = ipsecrequests_to_migrate(rq, len, &m[i]); + if (ret < 0) { + err = ret; + goto out; + } else { + rq = (struct sadb_x_ipsecrequest *)((u8 *)rq + ret); + len -= ret; + i++; + } + } + + if (!i || len > 0) { + err = -EINVAL; + goto out; + } + + return xfrm_migrate(&sel, dir, XFRM_POLICY_TYPE_MAIN, m, i, + kma ? &k : NULL); + + out: + return err; +} +#else +static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, + const struct sadb_msg *hdr, void * const *ext_hdrs) +{ + return -ENOPROTOOPT; +} +#endif + + +static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) +{ + struct net *net = sock_net(sk); + unsigned int dir; + int err = 0, delete; + struct sadb_x_policy *pol; + struct xfrm_policy *xp; + struct km_event c; + + if ((pol = ext_hdrs[SADB_X_EXT_POLICY-1]) == NULL) + return -EINVAL; + + dir = xfrm_policy_id2dir(pol->sadb_x_policy_id); + if (dir >= XFRM_POLICY_MAX) + return -EINVAL; + + delete = (hdr->sadb_msg_type == SADB_X_SPDDELETE2); + xp = xfrm_policy_byid(net, DUMMY_MARK, XFRM_POLICY_TYPE_MAIN, + dir, pol->sadb_x_policy_id, delete, &err); + if (xp == NULL) + return -ENOENT; + + if (delete) { + xfrm_audit_policy_delete(xp, err ? 
0 : 1, + audit_get_loginuid(current), + audit_get_sessionid(current), 0); + + if (err) + goto out; + c.seq = hdr->sadb_msg_seq; + c.pid = hdr->sadb_msg_pid; + c.data.byid = 1; + c.event = XFRM_MSG_DELPOLICY; + km_policy_notify(xp, dir, &c); + } else { + err = key_pol_get_resp(sk, xp, hdr, dir); + } + +out: + xfrm_pol_put(xp); + return err; +} + +static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr) +{ + struct pfkey_sock *pfk = ptr; + struct sk_buff *out_skb; + struct sadb_msg *out_hdr; + int err; + + if (!pfkey_can_dump(&pfk->sk)) + return -ENOBUFS; + + out_skb = pfkey_xfrm_policy2msg_prep(xp); + if (IS_ERR(out_skb)) + return PTR_ERR(out_skb); + + err = pfkey_xfrm_policy2msg(out_skb, xp, dir); + if (err < 0) + return err; + + out_hdr = (struct sadb_msg *) out_skb->data; + out_hdr->sadb_msg_version = pfk->dump.msg_version; + out_hdr->sadb_msg_type = SADB_X_SPDDUMP; + out_hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC; + out_hdr->sadb_msg_errno = 0; + out_hdr->sadb_msg_seq = count + 1; + out_hdr->sadb_msg_pid = pfk->dump.msg_pid; + + if (pfk->dump.skb) + pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, + &pfk->sk, sock_net(&pfk->sk)); + pfk->dump.skb = out_skb; + + return 0; +} + +static int pfkey_dump_sp(struct pfkey_sock *pfk) +{ + struct net *net = sock_net(&pfk->sk); + return xfrm_policy_walk(net, &pfk->dump.u.policy, dump_sp, (void *) pfk); +} + +static void pfkey_dump_sp_done(struct pfkey_sock *pfk) +{ + xfrm_policy_walk_done(&pfk->dump.u.policy); +} + +static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) +{ + struct pfkey_sock *pfk = pfkey_sk(sk); + + if (pfk->dump.dump != NULL) + return -EBUSY; + + pfk->dump.msg_version = hdr->sadb_msg_version; + pfk->dump.msg_pid = hdr->sadb_msg_pid; + pfk->dump.dump = pfkey_dump_sp; + pfk->dump.done = pfkey_dump_sp_done; + xfrm_policy_walk_init(&pfk->dump.u.policy, XFRM_POLICY_TYPE_MAIN); + + return pfkey_do_dump(pfk); +} + +static int key_notify_policy_flush(const struct km_event *c) +{ + struct sk_buff *skb_out; + struct sadb_msg *hdr; + + skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC); + if (!skb_out) + return -ENOBUFS; + hdr = (struct sadb_msg *) skb_put(skb_out, sizeof(struct sadb_msg)); + hdr->sadb_msg_type = SADB_X_SPDFLUSH; + hdr->sadb_msg_seq = c->seq; + hdr->sadb_msg_pid = c->pid; + hdr->sadb_msg_version = PF_KEY_V2; + hdr->sadb_msg_errno = (uint8_t) 0; + hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); + pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); + return 0; + +} + +static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) +{ + struct net *net = sock_net(sk); + struct km_event c; + struct xfrm_audit audit_info; + int err, err2; + + audit_info.loginuid = audit_get_loginuid(current); + audit_info.sessionid = audit_get_sessionid(current); + audit_info.secid = 0; + err = xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, &audit_info); + err2 = unicast_flush_resp(sk, hdr); + if (err || err2) { + if (err == -ESRCH) /* empty table - old silent behavior */ + return 0; + return err; + } + + c.data.type = XFRM_POLICY_TYPE_MAIN; + c.event = XFRM_MSG_FLUSHPOLICY; + c.pid = hdr->sadb_msg_pid; + c.seq = hdr->sadb_msg_seq; + c.net = net; + km_policy_notify(NULL, 0, &c); + + return 0; +} + +typedef int (*pfkey_handler)(struct sock *sk, struct sk_buff *skb, + const struct sadb_msg *hdr, void * const *ext_hdrs); +static pfkey_handler pfkey_funcs[SADB_MAX + 1] = { 
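+ /* Dispatch table indexed by sadb_msg_type. NULL slots (SADB_EXPIRE, SADB_X_PCHANGE, SADB_X_SPDACQUIRE) are not accepted from user space, so pfkey_process() returns -EOPNOTSUPP for them. */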
+ [SADB_RESERVED] = pfkey_reserved, + [SADB_GETSPI] = pfkey_getspi, + [SADB_UPDATE] = pfkey_add, + [SADB_ADD] = pfkey_add, + [SADB_DELETE] = pfkey_delete, + [SADB_GET] = pfkey_get, + [SADB_ACQUIRE] = pfkey_acquire, + [SADB_REGISTER] = pfkey_register, + [SADB_EXPIRE] = NULL, + [SADB_FLUSH] = pfkey_flush, + [SADB_DUMP] = pfkey_dump, + [SADB_X_PROMISC] = pfkey_promisc, + [SADB_X_PCHANGE] = NULL, + [SADB_X_SPDUPDATE] = pfkey_spdadd, + [SADB_X_SPDADD] = pfkey_spdadd, + [SADB_X_SPDDELETE] = pfkey_spddelete, + [SADB_X_SPDGET] = pfkey_spdget, + [SADB_X_SPDACQUIRE] = NULL, + [SADB_X_SPDDUMP] = pfkey_spddump, + [SADB_X_SPDFLUSH] = pfkey_spdflush, + [SADB_X_SPDSETIDX] = pfkey_spdadd, + [SADB_X_SPDDELETE2] = pfkey_spdget, + [SADB_X_MIGRATE] = pfkey_migrate, +}; + +static int pfkey_process(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr) +{ + void *ext_hdrs[SADB_EXT_MAX]; + int err; + + pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, + BROADCAST_PROMISC_ONLY, NULL, sock_net(sk)); + + memset(ext_hdrs, 0, sizeof(ext_hdrs)); + err = parse_exthdrs(skb, hdr, ext_hdrs); + if (!err) { + err = -EOPNOTSUPP; + if (pfkey_funcs[hdr->sadb_msg_type]) + err = pfkey_funcs[hdr->sadb_msg_type](sk, skb, hdr, ext_hdrs); + } + return err; +} + +static struct sadb_msg *pfkey_get_base_msg(struct sk_buff *skb, int *errp) +{ + struct sadb_msg *hdr = NULL; + + if (skb->len < sizeof(*hdr)) { + *errp = -EMSGSIZE; + } else { + hdr = (struct sadb_msg *) skb->data; + if (hdr->sadb_msg_version != PF_KEY_V2 || + hdr->sadb_msg_reserved != 0 || + (hdr->sadb_msg_type <= SADB_RESERVED || + hdr->sadb_msg_type > SADB_MAX)) { + hdr = NULL; + *errp = -EINVAL; + } else if (hdr->sadb_msg_len != (skb->len / + sizeof(uint64_t)) || + hdr->sadb_msg_len < (sizeof(struct sadb_msg) / + sizeof(uint64_t))) { + hdr = NULL; + *errp = -EMSGSIZE; + } else { + *errp = 0; + } + } + return hdr; +} + +static inline int aalg_tmpl_set(const struct xfrm_tmpl *t, + const struct xfrm_algo_desc *d) +{ + unsigned int id = d->desc.sadb_alg_id; + + if (id >= sizeof(t->aalgos) * 8) + return 0; + + return (t->aalgos >> id) & 1; +} + +static inline int ealg_tmpl_set(const struct xfrm_tmpl *t, + const struct xfrm_algo_desc *d) +{ + unsigned int id = d->desc.sadb_alg_id; + + if (id >= sizeof(t->ealgos) * 8) + return 0; + + return (t->ealgos >> id) & 1; +} + +static int count_ah_combs(const struct xfrm_tmpl *t) +{ + int i, sz = 0; + + for (i = 0; ; i++) { + const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); + if (!aalg) + break; + if (aalg_tmpl_set(t, aalg) && aalg->available) + sz += sizeof(struct sadb_comb); + } + return sz + sizeof(struct sadb_prop); +} + +static int count_esp_combs(const struct xfrm_tmpl *t) +{ + int i, k, sz = 0; + + for (i = 0; ; i++) { + const struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i); + if (!ealg) + break; + + if (!(ealg_tmpl_set(t, ealg) && ealg->available)) + continue; + + for (k = 1; ; k++) { + const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k); + if (!aalg) + break; + + if (aalg_tmpl_set(t, aalg) && aalg->available) + sz += sizeof(struct sadb_comb); + } + } + return sz + sizeof(struct sadb_prop); +} + +static void dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) +{ + struct sadb_prop *p; + int i; + + p = (struct sadb_prop*)skb_put(skb, sizeof(struct sadb_prop)); + p->sadb_prop_len = sizeof(struct sadb_prop)/8; + p->sadb_prop_exttype = SADB_EXT_PROPOSAL; + p->sadb_prop_replay = 32; + memset(p->sadb_prop_reserved, 0, sizeof(p->sadb_prop_reserved)); + + for (i = 0; ; i++) { + const 
struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); + if (!aalg) + break; + + if (aalg_tmpl_set(t, aalg) && aalg->available) { + struct sadb_comb *c; + c = (struct sadb_comb*)skb_put(skb, sizeof(struct sadb_comb)); + memset(c, 0, sizeof(*c)); + p->sadb_prop_len += sizeof(struct sadb_comb)/8; + c->sadb_comb_auth = aalg->desc.sadb_alg_id; + c->sadb_comb_auth_minbits = aalg->desc.sadb_alg_minbits; + c->sadb_comb_auth_maxbits = aalg->desc.sadb_alg_maxbits; + c->sadb_comb_hard_addtime = 24*60*60; + c->sadb_comb_soft_addtime = 20*60*60; + c->sadb_comb_hard_usetime = 8*60*60; + c->sadb_comb_soft_usetime = 7*60*60; + } + } +} + +static void dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) +{ + struct sadb_prop *p; + int i, k; + + p = (struct sadb_prop*)skb_put(skb, sizeof(struct sadb_prop)); + p->sadb_prop_len = sizeof(struct sadb_prop)/8; + p->sadb_prop_exttype = SADB_EXT_PROPOSAL; + p->sadb_prop_replay = 32; + memset(p->sadb_prop_reserved, 0, sizeof(p->sadb_prop_reserved)); + + for (i=0; ; i++) { + const struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i); + if (!ealg) + break; + + if (!(ealg_tmpl_set(t, ealg) && ealg->available)) + continue; + + for (k = 1; ; k++) { + struct sadb_comb *c; + const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k); + if (!aalg) + break; + if (!(aalg_tmpl_set(t, aalg) && aalg->available)) + continue; + c = (struct sadb_comb*)skb_put(skb, sizeof(struct sadb_comb)); + memset(c, 0, sizeof(*c)); + p->sadb_prop_len += sizeof(struct sadb_comb)/8; + c->sadb_comb_auth = aalg->desc.sadb_alg_id; + c->sadb_comb_auth_minbits = aalg->desc.sadb_alg_minbits; + c->sadb_comb_auth_maxbits = aalg->desc.sadb_alg_maxbits; + c->sadb_comb_encrypt = ealg->desc.sadb_alg_id; + c->sadb_comb_encrypt_minbits = ealg->desc.sadb_alg_minbits; + c->sadb_comb_encrypt_maxbits = ealg->desc.sadb_alg_maxbits; + c->sadb_comb_hard_addtime = 24*60*60; + c->sadb_comb_soft_addtime = 20*60*60; + c->sadb_comb_hard_usetime = 8*60*60; + c->sadb_comb_soft_usetime = 7*60*60; + } + } +} + +static int key_notify_policy_expire(struct xfrm_policy *xp, const struct km_event *c) +{ + return 0; +} + +static int key_notify_sa_expire(struct xfrm_state *x, const struct km_event *c) +{ + struct sk_buff *out_skb; + struct sadb_msg *out_hdr; + int hard; + int hsc; + + hard = c->data.hard; + if (hard) + hsc = 2; + else + hsc = 1; + + out_skb = pfkey_xfrm_state2msg_expire(x, hsc); + if (IS_ERR(out_skb)) + return PTR_ERR(out_skb); + + out_hdr = (struct sadb_msg *) out_skb->data; + out_hdr->sadb_msg_version = PF_KEY_V2; + out_hdr->sadb_msg_type = SADB_EXPIRE; + out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); + out_hdr->sadb_msg_errno = 0; + out_hdr->sadb_msg_reserved = 0; + out_hdr->sadb_msg_seq = 0; + out_hdr->sadb_msg_pid = 0; + + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); + return 0; +} + +static int pfkey_send_notify(struct xfrm_state *x, const struct km_event *c) +{ + struct net *net = x ? 
xs_net(x) : c->net; + struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); + + if (atomic_read(&net_pfkey->socks_nr) == 0) + return 0; + + switch (c->event) { + case XFRM_MSG_EXPIRE: + return key_notify_sa_expire(x, c); + case XFRM_MSG_DELSA: + case XFRM_MSG_NEWSA: + case XFRM_MSG_UPDSA: + return key_notify_sa(x, c); + case XFRM_MSG_FLUSHSA: + return key_notify_sa_flush(c); + case XFRM_MSG_NEWAE: /* not yet supported */ + break; + default: + pr_err("pfkey: Unknown SA event %d\n", c->event); + break; + } + + return 0; +} + +static int pfkey_send_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c) +{ + if (xp && xp->type != XFRM_POLICY_TYPE_MAIN) + return 0; + + switch (c->event) { + case XFRM_MSG_POLEXPIRE: + return key_notify_policy_expire(xp, c); + case XFRM_MSG_DELPOLICY: + case XFRM_MSG_NEWPOLICY: + case XFRM_MSG_UPDPOLICY: + return key_notify_policy(xp, dir, c); + case XFRM_MSG_FLUSHPOLICY: + if (c->data.type != XFRM_POLICY_TYPE_MAIN) + break; + return key_notify_policy_flush(c); + default: + pr_err("pfkey: Unknown policy event %d\n", c->event); + break; + } + + return 0; +} + +static u32 get_acqseq(void) +{ + u32 res; + static atomic_t acqseq; + + do { + res = atomic_inc_return(&acqseq); + } while (!res); + return res; +} + +static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *xp, int dir) +{ + struct sk_buff *skb; + struct sadb_msg *hdr; + struct sadb_address *addr; + struct sadb_x_policy *pol; + int sockaddr_size; + int size; + struct sadb_x_sec_ctx *sec_ctx; + struct xfrm_sec_ctx *xfrm_ctx; + int ctx_size = 0; + + sockaddr_size = pfkey_sockaddr_size(x->props.family); + if (!sockaddr_size) + return -EINVAL; + + size = sizeof(struct sadb_msg) + + (sizeof(struct sadb_address) * 2) + + (sockaddr_size * 2) + + sizeof(struct sadb_x_policy); + + if (x->id.proto == IPPROTO_AH) + size += count_ah_combs(t); + else if (x->id.proto == IPPROTO_ESP) + size += count_esp_combs(t); + + if ((xfrm_ctx = x->security)) { + ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len); + size += sizeof(struct sadb_x_sec_ctx) + ctx_size; + } + + skb = alloc_skb(size + 16, GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg)); + hdr->sadb_msg_version = PF_KEY_V2; + hdr->sadb_msg_type = SADB_ACQUIRE; + hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); + hdr->sadb_msg_len = size / sizeof(uint64_t); + hdr->sadb_msg_errno = 0; + hdr->sadb_msg_reserved = 0; + hdr->sadb_msg_seq = x->km.seq = get_acqseq(); + hdr->sadb_msg_pid = 0; + + /* src address */ + addr = (struct sadb_address*) skb_put(skb, + sizeof(struct sadb_address)+sockaddr_size); + addr->sadb_address_len = + (sizeof(struct sadb_address)+sockaddr_size)/ + sizeof(uint64_t); + addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; + addr->sadb_address_proto = 0; + addr->sadb_address_reserved = 0; + addr->sadb_address_prefixlen = + pfkey_sockaddr_fill(&x->props.saddr, 0, + (struct sockaddr *) (addr + 1), + x->props.family); + if (!addr->sadb_address_prefixlen) + BUG(); + + /* dst address */ + addr = (struct sadb_address*) skb_put(skb, + sizeof(struct sadb_address)+sockaddr_size); + addr->sadb_address_len = + (sizeof(struct sadb_address)+sockaddr_size)/ + sizeof(uint64_t); + addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; + addr->sadb_address_proto = 0; + addr->sadb_address_reserved = 0; + addr->sadb_address_prefixlen = + pfkey_sockaddr_fill(&x->id.daddr, 0, + (struct sockaddr *) (addr + 1), + x->props.family); + if 
(!addr->sadb_address_prefixlen) + BUG(); + + pol = (struct sadb_x_policy *) skb_put(skb, sizeof(struct sadb_x_policy)); + pol->sadb_x_policy_len = sizeof(struct sadb_x_policy)/sizeof(uint64_t); + pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY; + pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC; + pol->sadb_x_policy_dir = dir+1; + pol->sadb_x_policy_id = xp->index; + + /* Set sadb_comb's. */ + if (x->id.proto == IPPROTO_AH) + dump_ah_combs(skb, t); + else if (x->id.proto == IPPROTO_ESP) + dump_esp_combs(skb, t); + + /* security context */ + if (xfrm_ctx) { + sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb, + sizeof(struct sadb_x_sec_ctx) + ctx_size); + sec_ctx->sadb_x_sec_len = + (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t); + sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; + sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; + sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; + sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; + memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, + xfrm_ctx->ctx_len); + } + + return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); +} + +static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt, + u8 *data, int len, int *dir) +{ + struct net *net = sock_net(sk); + struct xfrm_policy *xp; + struct sadb_x_policy *pol = (struct sadb_x_policy*)data; + struct sadb_x_sec_ctx *sec_ctx; + + switch (sk->sk_family) { + case AF_INET: + if (opt != IP_IPSEC_POLICY) { + *dir = -EOPNOTSUPP; + return NULL; + } + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + if (opt != IPV6_IPSEC_POLICY) { + *dir = -EOPNOTSUPP; + return NULL; + } + break; +#endif + default: + *dir = -EINVAL; + return NULL; + } + + *dir = -EINVAL; + + if (len < sizeof(struct sadb_x_policy) || + pol->sadb_x_policy_len*8 > len || + pol->sadb_x_policy_type > IPSEC_POLICY_BYPASS || + (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir > IPSEC_DIR_OUTBOUND)) + return NULL; + + xp = xfrm_policy_alloc(net, GFP_ATOMIC); + if (xp == NULL) { + *dir = -ENOBUFS; + return NULL; + } + + xp->action = (pol->sadb_x_policy_type == IPSEC_POLICY_DISCARD ? + XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW); + + xp->lft.soft_byte_limit = XFRM_INF; + xp->lft.hard_byte_limit = XFRM_INF; + xp->lft.soft_packet_limit = XFRM_INF; + xp->lft.hard_packet_limit = XFRM_INF; + xp->family = sk->sk_family; + + xp->xfrm_nr = 0; + if (pol->sadb_x_policy_type == IPSEC_POLICY_IPSEC && + (*dir = parse_ipsecrequests(xp, pol)) < 0) + goto out; + + /* security context too */ + if (len >= (pol->sadb_x_policy_len*8 + + sizeof(struct sadb_x_sec_ctx))) { + char *p = (char *)pol; + struct xfrm_user_sec_ctx *uctx; + + p += pol->sadb_x_policy_len*8; + sec_ctx = (struct sadb_x_sec_ctx *)p; + if (len < pol->sadb_x_policy_len*8 + + sec_ctx->sadb_x_sec_len) { + *dir = -EINVAL; + goto out; + } + if ((*dir = verify_sec_ctx_len(p))) + goto out; + uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); + *dir = security_xfrm_policy_alloc(&xp->security, uctx); + kfree(uctx); + + if (*dir) + goto out; + } + + *dir = pol->sadb_x_policy_dir-1; + return xp; + +out: + xp->walk.dead = 1; + xfrm_policy_destroy(xp); + return NULL; +} + +static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport) +{ + struct sk_buff *skb; + struct sadb_msg *hdr; + struct sadb_sa *sa; + struct sadb_address *addr; + struct sadb_x_nat_t_port *n_port; + int sockaddr_size; + int size; + __u8 satype = (x->id.proto == IPPROTO_ESP ? 
SADB_SATYPE_ESP : 0); + struct xfrm_encap_tmpl *natt = NULL; + + sockaddr_size = pfkey_sockaddr_size(x->props.family); + if (!sockaddr_size) + return -EINVAL; + + if (!satype) + return -EINVAL; + + if (!x->encap) + return -EINVAL; + + natt = x->encap; + + /* Build an SADB_X_NAT_T_NEW_MAPPING message: + * + * HDR | SA | ADDRESS_SRC (old addr) | NAT_T_SPORT (old port) | + * ADDRESS_DST (new addr) | NAT_T_DPORT (new port) + */ + + size = sizeof(struct sadb_msg) + + sizeof(struct sadb_sa) + + (sizeof(struct sadb_address) * 2) + + (sockaddr_size * 2) + + (sizeof(struct sadb_x_nat_t_port) * 2); + + skb = alloc_skb(size + 16, GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg)); + hdr->sadb_msg_version = PF_KEY_V2; + hdr->sadb_msg_type = SADB_X_NAT_T_NEW_MAPPING; + hdr->sadb_msg_satype = satype; + hdr->sadb_msg_len = size / sizeof(uint64_t); + hdr->sadb_msg_errno = 0; + hdr->sadb_msg_reserved = 0; + hdr->sadb_msg_seq = x->km.seq = get_acqseq(); + hdr->sadb_msg_pid = 0; + + /* SA */ + sa = (struct sadb_sa *) skb_put(skb, sizeof(struct sadb_sa)); + sa->sadb_sa_len = sizeof(struct sadb_sa)/sizeof(uint64_t); + sa->sadb_sa_exttype = SADB_EXT_SA; + sa->sadb_sa_spi = x->id.spi; + sa->sadb_sa_replay = 0; + sa->sadb_sa_state = 0; + sa->sadb_sa_auth = 0; + sa->sadb_sa_encrypt = 0; + sa->sadb_sa_flags = 0; + + /* ADDRESS_SRC (old addr) */ + addr = (struct sadb_address*) + skb_put(skb, sizeof(struct sadb_address)+sockaddr_size); + addr->sadb_address_len = + (sizeof(struct sadb_address)+sockaddr_size)/ + sizeof(uint64_t); + addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; + addr->sadb_address_proto = 0; + addr->sadb_address_reserved = 0; + addr->sadb_address_prefixlen = + pfkey_sockaddr_fill(&x->props.saddr, 0, + (struct sockaddr *) (addr + 1), + x->props.family); + if (!addr->sadb_address_prefixlen) + BUG(); + + /* NAT_T_SPORT (old port) */ + n_port = (struct sadb_x_nat_t_port*) skb_put(skb, sizeof (*n_port)); + n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); + n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_SPORT; + n_port->sadb_x_nat_t_port_port = natt->encap_sport; + n_port->sadb_x_nat_t_port_reserved = 0; + + /* ADDRESS_DST (new addr) */ + addr = (struct sadb_address*) + skb_put(skb, sizeof(struct sadb_address)+sockaddr_size); + addr->sadb_address_len = + (sizeof(struct sadb_address)+sockaddr_size)/ + sizeof(uint64_t); + addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; + addr->sadb_address_proto = 0; + addr->sadb_address_reserved = 0; + addr->sadb_address_prefixlen = + pfkey_sockaddr_fill(ipaddr, 0, + (struct sockaddr *) (addr + 1), + x->props.family); + if (!addr->sadb_address_prefixlen) + BUG(); + + /* NAT_T_DPORT (new port) */ + n_port = (struct sadb_x_nat_t_port*) skb_put(skb, sizeof (*n_port)); + n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); + n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_DPORT; + n_port->sadb_x_nat_t_port_port = sport; + n_port->sadb_x_nat_t_port_reserved = 0; + + return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); +} + +#ifdef CONFIG_NET_KEY_MIGRATE +static int set_sadb_address(struct sk_buff *skb, int sasize, int type, + const struct xfrm_selector *sel) +{ + struct sadb_address *addr; + addr = (struct sadb_address *)skb_put(skb, sizeof(struct sadb_address) + sasize); + addr->sadb_address_len = (sizeof(struct sadb_address) + sasize)/8; + addr->sadb_address_exttype = type; + addr->sadb_address_proto = sel->proto; + 
addr->sadb_address_reserved = 0; + + switch (type) { + case SADB_EXT_ADDRESS_SRC: + addr->sadb_address_prefixlen = sel->prefixlen_s; + pfkey_sockaddr_fill(&sel->saddr, 0, + (struct sockaddr *)(addr + 1), + sel->family); + break; + case SADB_EXT_ADDRESS_DST: + addr->sadb_address_prefixlen = sel->prefixlen_d; + pfkey_sockaddr_fill(&sel->daddr, 0, + (struct sockaddr *)(addr + 1), + sel->family); + break; + default: + return -EINVAL; + } + + return 0; +} + + +static int set_sadb_kmaddress(struct sk_buff *skb, const struct xfrm_kmaddress *k) +{ + struct sadb_x_kmaddress *kma; + u8 *sa; + int family = k->family; + int socklen = pfkey_sockaddr_len(family); + int size_req; + + size_req = (sizeof(struct sadb_x_kmaddress) + + pfkey_sockaddr_pair_size(family)); + + kma = (struct sadb_x_kmaddress *)skb_put(skb, size_req); + memset(kma, 0, size_req); + kma->sadb_x_kmaddress_len = size_req / 8; + kma->sadb_x_kmaddress_exttype = SADB_X_EXT_KMADDRESS; + kma->sadb_x_kmaddress_reserved = k->reserved; + + sa = (u8 *)(kma + 1); + if (!pfkey_sockaddr_fill(&k->local, 0, (struct sockaddr *)sa, family) || + !pfkey_sockaddr_fill(&k->remote, 0, (struct sockaddr *)(sa+socklen), family)) + return -EINVAL; + + return 0; +} + +static int set_ipsecrequest(struct sk_buff *skb, + uint8_t proto, uint8_t mode, int level, + uint32_t reqid, uint8_t family, + const xfrm_address_t *src, const xfrm_address_t *dst) +{ + struct sadb_x_ipsecrequest *rq; + u8 *sa; + int socklen = pfkey_sockaddr_len(family); + int size_req; + + size_req = sizeof(struct sadb_x_ipsecrequest) + + pfkey_sockaddr_pair_size(family); + + rq = (struct sadb_x_ipsecrequest *)skb_put(skb, size_req); + memset(rq, 0, size_req); + rq->sadb_x_ipsecrequest_len = size_req; + rq->sadb_x_ipsecrequest_proto = proto; + rq->sadb_x_ipsecrequest_mode = mode; + rq->sadb_x_ipsecrequest_level = level; + rq->sadb_x_ipsecrequest_reqid = reqid; + + sa = (u8 *) (rq + 1); + if (!pfkey_sockaddr_fill(src, 0, (struct sockaddr *)sa, family) || + !pfkey_sockaddr_fill(dst, 0, (struct sockaddr *)(sa + socklen), family)) + return -EINVAL; + + return 0; +} +#endif + +#ifdef CONFIG_NET_KEY_MIGRATE +static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, + const struct xfrm_migrate *m, int num_bundles, + const struct xfrm_kmaddress *k) +{ + int i; + int sasize_sel; + int size = 0; + int size_pol = 0; + struct sk_buff *skb; + struct sadb_msg *hdr; + struct sadb_x_policy *pol; + const struct xfrm_migrate *mp; + + if (type != XFRM_POLICY_TYPE_MAIN) + return 0; + + if (num_bundles <= 0 || num_bundles > XFRM_MAX_DEPTH) + return -EINVAL; + + if (k != NULL) { + /* addresses for KM */ + size += PFKEY_ALIGN8(sizeof(struct sadb_x_kmaddress) + + pfkey_sockaddr_pair_size(k->family)); + } + + /* selector */ + sasize_sel = pfkey_sockaddr_size(sel->family); + if (!sasize_sel) + return -EINVAL; + size += (sizeof(struct sadb_address) + sasize_sel) * 2; + + /* policy info */ + size_pol += sizeof(struct sadb_x_policy); + + /* ipsecrequests */ + for (i = 0, mp = m; i < num_bundles; i++, mp++) { + /* old locator pair */ + size_pol += sizeof(struct sadb_x_ipsecrequest) + + pfkey_sockaddr_pair_size(mp->old_family); + /* new locator pair */ + size_pol += sizeof(struct sadb_x_ipsecrequest) + + pfkey_sockaddr_pair_size(mp->new_family); + } + + size += sizeof(struct sadb_msg) + size_pol; + + /* alloc buffer */ + skb = alloc_skb(size, GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + hdr = (struct sadb_msg *)skb_put(skb, sizeof(struct sadb_msg)); + hdr->sadb_msg_version = PF_KEY_V2; + 
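+ /* Base header for the SADB_X_MIGRATE message; the extensions appended below are an optional KMADDRESS block, the selector's src/dst addresses, an X_POLICY extension and one old/new ipsecrequest pair per migrate bundle. */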
hdr->sadb_msg_type = SADB_X_MIGRATE; + hdr->sadb_msg_satype = pfkey_proto2satype(m->proto); + hdr->sadb_msg_len = size / 8; + hdr->sadb_msg_errno = 0; + hdr->sadb_msg_reserved = 0; + hdr->sadb_msg_seq = 0; + hdr->sadb_msg_pid = 0; + + /* Addresses to be used by KM for negotiation, if ext is available */ + if (k != NULL && (set_sadb_kmaddress(skb, k) < 0)) + return -EINVAL; + + /* selector src */ + set_sadb_address(skb, sasize_sel, SADB_EXT_ADDRESS_SRC, sel); + + /* selector dst */ + set_sadb_address(skb, sasize_sel, SADB_EXT_ADDRESS_DST, sel); + + /* policy information */ + pol = (struct sadb_x_policy *)skb_put(skb, sizeof(struct sadb_x_policy)); + pol->sadb_x_policy_len = size_pol / 8; + pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY; + pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC; + pol->sadb_x_policy_dir = dir + 1; + pol->sadb_x_policy_id = 0; + pol->sadb_x_policy_priority = 0; + + for (i = 0, mp = m; i < num_bundles; i++, mp++) { + /* old ipsecrequest */ + int mode = pfkey_mode_from_xfrm(mp->mode); + if (mode < 0) + goto err; + if (set_ipsecrequest(skb, mp->proto, mode, + (mp->reqid ? IPSEC_LEVEL_UNIQUE : IPSEC_LEVEL_REQUIRE), + mp->reqid, mp->old_family, + &mp->old_saddr, &mp->old_daddr) < 0) + goto err; + + /* new ipsecrequest */ + if (set_ipsecrequest(skb, mp->proto, mode, + (mp->reqid ? IPSEC_LEVEL_UNIQUE : IPSEC_LEVEL_REQUIRE), + mp->reqid, mp->new_family, + &mp->new_saddr, &mp->new_daddr) < 0) + goto err; + } + + /* broadcast migrate message to sockets */ + pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, &init_net); + + return 0; + +err: + kfree_skb(skb); + return -EINVAL; +} +#else +static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, + const struct xfrm_migrate *m, int num_bundles, + const struct xfrm_kmaddress *k) +{ + return -ENOPROTOOPT; +} +#endif + +static int pfkey_sendmsg(struct kiocb *kiocb, + struct socket *sock, struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb = NULL; + struct sadb_msg *hdr = NULL; + int err; + + err = -EOPNOTSUPP; + if (msg->msg_flags & MSG_OOB) + goto out; + + err = -EMSGSIZE; + if ((unsigned)len > sk->sk_sndbuf - 32) + goto out; + + err = -ENOBUFS; + skb = alloc_skb(len, GFP_KERNEL); + if (skb == NULL) + goto out; + + err = -EFAULT; + if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len)) + goto out; + + hdr = pfkey_get_base_msg(skb, &err); + if (!hdr) + goto out; + + mutex_lock(&xfrm_cfg_mutex); + err = pfkey_process(sk, skb, hdr); + mutex_unlock(&xfrm_cfg_mutex); + +out: + if (err && hdr && pfkey_error(hdr, err, sk) == 0) + err = 0; + kfree_skb(skb); + + return err ? : len; +} + +static int pfkey_recvmsg(struct kiocb *kiocb, + struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ + struct sock *sk = sock->sk; + struct pfkey_sock *pfk = pfkey_sk(sk); + struct sk_buff *skb; + int copied, err; + + err = -EINVAL; + if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT)) + goto out; + + msg->msg_namelen = 0; + skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + + skb_reset_transport_header(skb); + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free; + + sock_recv_ts_and_drops(msg, sk, skb); + + err = (flags & MSG_TRUNC) ? 
skb->len : copied; + + if (pfk->dump.dump != NULL && + 3 * atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) + pfkey_do_dump(pfk); + +out_free: + skb_free_datagram(sk, skb); +out: + return err; +} + +static const struct proto_ops pfkey_ops = { + .family = PF_KEY, + .owner = THIS_MODULE, + /* Operations that make no sense on pfkey sockets. */ + .bind = sock_no_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, + + /* Now the operations that really occur. */ + .release = pfkey_release, + .poll = datagram_poll, + .sendmsg = pfkey_sendmsg, + .recvmsg = pfkey_recvmsg, +}; + +static const struct net_proto_family pfkey_family_ops = { + .family = PF_KEY, + .create = pfkey_create, + .owner = THIS_MODULE, +}; + +#ifdef CONFIG_PROC_FS +static int pfkey_seq_show(struct seq_file *f, void *v) +{ + struct sock *s = sk_entry(v); + + if (v == SEQ_START_TOKEN) + seq_printf(f ,"sk RefCnt Rmem Wmem User Inode\n"); + else + seq_printf(f, "%pK %-6d %-6u %-6u %-6u %-6lu\n", + s, + atomic_read(&s->sk_refcnt), + sk_rmem_alloc_get(s), + sk_wmem_alloc_get(s), + sock_i_uid(s), + sock_i_ino(s) + ); + return 0; +} + +static void *pfkey_seq_start(struct seq_file *f, loff_t *ppos) + __acquires(rcu) +{ + struct net *net = seq_file_net(f); + struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); + + rcu_read_lock(); + return seq_hlist_start_head_rcu(&net_pfkey->table, *ppos); +} + +static void *pfkey_seq_next(struct seq_file *f, void *v, loff_t *ppos) +{ + struct net *net = seq_file_net(f); + struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); + + return seq_hlist_next_rcu(v, &net_pfkey->table, ppos); +} + +static void pfkey_seq_stop(struct seq_file *f, void *v) + __releases(rcu) +{ + rcu_read_unlock(); +} + +static const struct seq_operations pfkey_seq_ops = { + .start = pfkey_seq_start, + .next = pfkey_seq_next, + .stop = pfkey_seq_stop, + .show = pfkey_seq_show, +}; + +static int pfkey_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &pfkey_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations pfkey_proc_ops = { + .open = pfkey_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int __net_init pfkey_init_proc(struct net *net) +{ + struct proc_dir_entry *e; + + e = proc_net_fops_create(net, "pfkey", 0, &pfkey_proc_ops); + if (e == NULL) + return -ENOMEM; + + return 0; +} + +static void __net_exit pfkey_exit_proc(struct net *net) +{ + proc_net_remove(net, "pfkey"); +} +#else +static inline int pfkey_init_proc(struct net *net) +{ + return 0; +} + +static inline void pfkey_exit_proc(struct net *net) +{ +} +#endif + +static struct xfrm_mgr pfkeyv2_mgr = +{ + .id = "pfkeyv2", + .notify = pfkey_send_notify, + .acquire = pfkey_send_acquire, + .compile_policy = pfkey_compile_policy, + .new_mapping = pfkey_send_new_mapping, + .notify_policy = pfkey_send_policy_notify, + .migrate = pfkey_send_migrate, +}; + +static int __net_init pfkey_net_init(struct net *net) +{ + struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); + int rv; + + INIT_HLIST_HEAD(&net_pfkey->table); + atomic_set(&net_pfkey->socks_nr, 0); + + rv = pfkey_init_proc(net); + + return rv; +} + +static void __net_exit 
pfkey_net_exit(struct net *net) +{ + struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); + + pfkey_exit_proc(net); + BUG_ON(!hlist_empty(&net_pfkey->table)); +} + +static struct pernet_operations pfkey_net_ops = { + .init = pfkey_net_init, + .exit = pfkey_net_exit, + .id = &pfkey_net_id, + .size = sizeof(struct netns_pfkey), +}; + +static void __exit ipsec_pfkey_exit(void) +{ + xfrm_unregister_km(&pfkeyv2_mgr); + sock_unregister(PF_KEY); + unregister_pernet_subsys(&pfkey_net_ops); + proto_unregister(&key_proto); +} + +static int __init ipsec_pfkey_init(void) +{ + int err = proto_register(&key_proto, 0); + + if (err != 0) + goto out; + + err = register_pernet_subsys(&pfkey_net_ops); + if (err != 0) + goto out_unregister_key_proto; + err = sock_register(&pfkey_family_ops); + if (err != 0) + goto out_unregister_pernet; + err = xfrm_register_km(&pfkeyv2_mgr); + if (err != 0) + goto out_sock_unregister; +out: + return err; + +out_sock_unregister: + sock_unregister(PF_KEY); +out_unregister_pernet: + unregister_pernet_subsys(&pfkey_net_ops); +out_unregister_key_proto: + proto_unregister(&key_proto); + goto out; +} + +module_init(ipsec_pfkey_init); +module_exit(ipsec_pfkey_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_KEY); diff --git a/net/l2tp/Kconfig b/net/l2tp/Kconfig new file mode 100644 index 00000000..4b1e7175 --- /dev/null +++ b/net/l2tp/Kconfig @@ -0,0 +1,107 @@ +# +# Layer Two Tunneling Protocol (L2TP) +# + +menuconfig L2TP + tristate "Layer Two Tunneling Protocol (L2TP)" + depends on INET + ---help--- + Layer Two Tunneling Protocol + + From RFC 2661 . + + L2TP facilitates the tunneling of packets across an + intervening network in a way that is as transparent as + possible to both end-users and applications. + + L2TP is often used to tunnel PPP traffic over IP + tunnels. One IP tunnel may carry thousands of individual PPP + connections. L2TP is also used as a VPN protocol, popular + with home workers to connect to their offices. + + L2TPv3 allows other protocols as well as PPP to be carried + over L2TP tunnels. L2TPv3 is defined in RFC 3931 + . + + The kernel component handles only L2TP data packets: a + userland daemon handles L2TP the control protocol (tunnel + and session setup). One such daemon is OpenL2TP + (http://openl2tp.org/). + + If you don't need L2TP, say N. To compile all L2TP code as + modules, choose M here. + +config L2TP_DEBUGFS + tristate "L2TP debugfs support" + depends on L2TP && DEBUG_FS + help + Support for l2tp directory in debugfs filesystem. This may be + used to dump internal state of the l2tp drivers for problem + analysis. + + If unsure, say 'Y'. + + To compile this driver as a module, choose M here. The module + will be called l2tp_debugfs. + +config L2TP_V3 + bool "L2TPv3 support (EXPERIMENTAL)" + depends on EXPERIMENTAL && L2TP + help + Layer Two Tunneling Protocol Version 3 + + From RFC 3931 . + + The Layer Two Tunneling Protocol (L2TP) provides a dynamic + mechanism for tunneling Layer 2 (L2) "circuits" across a + packet-oriented data network (e.g., over IP). L2TP, as + originally defined in RFC 2661, is a standard method for + tunneling Point-to-Point Protocol (PPP) [RFC1661] sessions. + L2TP has since been adopted for tunneling a number of other + L2 protocols, including ATM, Frame Relay, HDLC and even raw + ethernet frames. + + If you are connecting to L2TPv3 equipment, or you want to + tunnel raw ethernet frames using L2TP, say Y here. If + unsure, say N. 
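Editorial note, not part of the patch: the v2/v3 split described above is made from the version field carried in the low four bits of the first 16-bit word of the L2TP header, which is what the L2TP_HDR_VER_* constants defined later in this patch (l2tp_core.c) encode. The helper below is a minimal illustrative sketch; its name does not exist in the kernel sources.

	/* Illustrative sketch only -- relies on L2TP_HDR_VER_MASK, L2TP_HDR_VER_2
	 * and L2TP_HDR_VER_3 as defined in l2tp_core.c later in this patch. */
	static int l2tp_hdr_version(const unsigned char *buf)
	{
		unsigned int flags_ver = (buf[0] << 8) | buf[1];	/* first 16 bits, network byte order */

		return flags_ver & L2TP_HDR_VER_MASK;	/* L2TP_HDR_VER_2 (2) or L2TP_HDR_VER_3 (3) */
	}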
+ +config L2TP_IP + tristate "L2TP IP encapsulation for L2TPv3" + depends on L2TP_V3 + help + Support for L2TP-over-IP socket family. + + The L2TPv3 protocol defines two possible encapsulations for + L2TP frames, namely UDP and plain IP (without UDP). This + driver provides a new L2TPIP socket family with which + userspace L2TPv3 daemons may create L2TP/IP tunnel sockets + when UDP encapsulation is not required. When L2TP is carried + in IP packets, it used IP protocol number 115, so this port + must be enabled in firewalls. + + To compile this driver as a module, choose M here. The module + will be called l2tp_ip. + +config L2TP_ETH + tristate "L2TP ethernet pseudowire support for L2TPv3" + depends on L2TP_V3 + help + Support for carrying raw ethernet frames over L2TPv3. + + From RFC 4719 . + + The Layer 2 Tunneling Protocol, Version 3 (L2TPv3) can be + used as a control protocol and for data encapsulation to set + up Pseudowires for transporting layer 2 Packet Data Units + across an IP network [RFC3931]. + + This driver provides an ethernet virtual interface for each + L2TP ethernet pseudowire instance. Standard Linux tools may + be used to assign an IP address to the local virtual + interface, or add the interface to a bridge. + + If you are using L2TPv3, you will almost certainly want to + enable this option. + + To compile this driver as a module, choose M here. The module + will be called l2tp_eth. diff --git a/net/l2tp/Makefile b/net/l2tp/Makefile new file mode 100644 index 00000000..110e7bc2 --- /dev/null +++ b/net/l2tp/Makefile @@ -0,0 +1,12 @@ +# +# Makefile for the L2TP. +# + +obj-$(CONFIG_L2TP) += l2tp_core.o + +# Build l2tp as modules if L2TP is M +obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_PPPOL2TP)) += l2tp_ppp.o +obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_L2TP_IP)) += l2tp_ip.o +obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_L2TP_V3)) += l2tp_netlink.o +obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_L2TP_ETH)) += l2tp_eth.o +obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_L2TP_DEBUGFS)) += l2tp_debugfs.o diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c new file mode 100644 index 00000000..71c292e3 --- /dev/null +++ b/net/l2tp/l2tp_core.c @@ -0,0 +1,1703 @@ +/* + * L2TP core. + * + * Copyright (c) 2008,2009,2010 Katalix Systems Ltd + * + * This file contains some code of the original L2TPv2 pppol2tp + * driver, which has the following copyright: + * + * Authors: Martijn van Oosterhout + * James Chapman (jchapman@katalix.com) + * Contributors: + * Michal Ostrowski + * Arnaldo Carvalho de Melo + * David S. Miller (davem@redhat.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "l2tp_core.h" + +#define L2TP_DRV_VERSION "V2.0" + +/* L2TP header constants */ +#define L2TP_HDRFLAG_T 0x8000 +#define L2TP_HDRFLAG_L 0x4000 +#define L2TP_HDRFLAG_S 0x0800 +#define L2TP_HDRFLAG_O 0x0200 +#define L2TP_HDRFLAG_P 0x0100 + +#define L2TP_HDR_VER_MASK 0x000F +#define L2TP_HDR_VER_2 0x0002 +#define L2TP_HDR_VER_3 0x0003 + +/* L2TPv3 default L2-specific sublayer */ +#define L2TP_SLFLAG_S 0x40000000 +#define L2TP_SL_SEQ_MASK 0x00ffffff + +#define L2TP_HDR_SIZE_SEQ 10 +#define L2TP_HDR_SIZE_NOSEQ 6 + +/* Default trace flags */ +#define L2TP_DEFAULT_DEBUG_FLAGS 0 + +#define PRINTK(_mask, _type, _lvl, _fmt, args...) \ + do { \ + if ((_mask) & (_type)) \ + printk(_lvl "L2TP: " _fmt, ##args); \ + } while (0) + +/* Private data stored for received packets in the skb. + */ +struct l2tp_skb_cb { + u32 ns; + u16 has_seq; + u16 length; + unsigned long expires; +}; + +#define L2TP_SKB_CB(skb) ((struct l2tp_skb_cb *) &skb->cb[sizeof(struct inet_skb_parm)]) + +static atomic_t l2tp_tunnel_count; +static atomic_t l2tp_session_count; + +/* per-net private data for this module */ +static unsigned int l2tp_net_id; +struct l2tp_net { + struct list_head l2tp_tunnel_list; + spinlock_t l2tp_tunnel_list_lock; + struct hlist_head l2tp_session_hlist[L2TP_HASH_SIZE_2]; + spinlock_t l2tp_session_hlist_lock; +}; + +static void l2tp_session_set_header_len(struct l2tp_session *session, int version); +static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel); +static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); + +static inline struct l2tp_net *l2tp_pernet(struct net *net) +{ + BUG_ON(!net); + + return net_generic(net, l2tp_net_id); +} + + +/* Tunnel reference counts. Incremented per session that is added to + * the tunnel. + */ +static inline void l2tp_tunnel_inc_refcount_1(struct l2tp_tunnel *tunnel) +{ + atomic_inc(&tunnel->ref_count); +} + +static inline void l2tp_tunnel_dec_refcount_1(struct l2tp_tunnel *tunnel) +{ + if (atomic_dec_and_test(&tunnel->ref_count)) + l2tp_tunnel_free(tunnel); +} +#ifdef L2TP_REFCNT_DEBUG +#define l2tp_tunnel_inc_refcount(_t) do { \ + printk(KERN_DEBUG "l2tp_tunnel_inc_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_t)->name, atomic_read(&_t->ref_count)); \ + l2tp_tunnel_inc_refcount_1(_t); \ + } while (0) +#define l2tp_tunnel_dec_refcount(_t) do { \ + printk(KERN_DEBUG "l2tp_tunnel_dec_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_t)->name, atomic_read(&_t->ref_count)); \ + l2tp_tunnel_dec_refcount_1(_t); \ + } while (0) +#else +#define l2tp_tunnel_inc_refcount(t) l2tp_tunnel_inc_refcount_1(t) +#define l2tp_tunnel_dec_refcount(t) l2tp_tunnel_dec_refcount_1(t) +#endif + +/* Session hash global list for L2TPv3. + * The session_id SHOULD be random according to RFC3931, but several + * L2TP implementations use incrementing session_ids. So we do a real + * hash on the session_id, rather than a simple bitmask. 
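The L2TP_HDRFLAG_* and L2TP_HDR_VER_MASK constants above describe the first 16 bits of an L2TPv2 header: the message type, the presence bits for the optional fields, and the version nibble. A small user-space sketch of decoding that word follows; the sample value is invented for illustration.

#include <stdio.h>
#include <stdint.h>

/* Same bit layout as the L2TP_HDRFLAG_* / L2TP_HDR_VER_MASK constants. */
#define HDRFLAG_T    0x8000      /* control (1) or data (0) message */
#define HDRFLAG_L    0x4000      /* length field present            */
#define HDRFLAG_S    0x0800      /* Ns/Nr sequence fields present   */
#define HDRFLAG_O    0x0200      /* offset field present            */
#define HDR_VER_MASK 0x000F

int main(void)
{
    uint16_t hdrflags = 0x4802;  /* invented example: L and S set, version 2 */

    printf("type    : %s\n", (hdrflags & HDRFLAG_T) ? "control" : "data");
    printf("length  : %s\n", (hdrflags & HDRFLAG_L) ? "present" : "absent");
    printf("sequence: %s\n", (hdrflags & HDRFLAG_S) ? "present" : "absent");
    printf("offset  : %s\n", (hdrflags & HDRFLAG_O) ? "present" : "absent");
    printf("version : %u\n", (unsigned)(hdrflags & HDR_VER_MASK));
    return 0;
}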
+ */ +static inline struct hlist_head * +l2tp_session_id_hash_2(struct l2tp_net *pn, u32 session_id) +{ + return &pn->l2tp_session_hlist[hash_32(session_id, L2TP_HASH_BITS_2)]; + +} + +/* Lookup a session by id in the global session list + */ +static struct l2tp_session *l2tp_session_find_2(struct net *net, u32 session_id) +{ + struct l2tp_net *pn = l2tp_pernet(net); + struct hlist_head *session_list = + l2tp_session_id_hash_2(pn, session_id); + struct l2tp_session *session; + struct hlist_node *walk; + + rcu_read_lock_bh(); + hlist_for_each_entry_rcu(session, walk, session_list, global_hlist) { + if (session->session_id == session_id) { + rcu_read_unlock_bh(); + return session; + } + } + rcu_read_unlock_bh(); + + return NULL; +} + +/* Session hash list. + * The session_id SHOULD be random according to RFC2661, but several + * L2TP implementations (Cisco and Microsoft) use incrementing + * session_ids. So we do a real hash on the session_id, rather than a + * simple bitmask. + */ +static inline struct hlist_head * +l2tp_session_id_hash(struct l2tp_tunnel *tunnel, u32 session_id) +{ + return &tunnel->session_hlist[hash_32(session_id, L2TP_HASH_BITS)]; +} + +/* Lookup a session by id + */ +struct l2tp_session *l2tp_session_find(struct net *net, struct l2tp_tunnel *tunnel, u32 session_id) +{ + struct hlist_head *session_list; + struct l2tp_session *session; + struct hlist_node *walk; + + /* In L2TPv3, session_ids are unique over all tunnels and we + * sometimes need to look them up before we know the + * tunnel. + */ + if (tunnel == NULL) + return l2tp_session_find_2(net, session_id); + + session_list = l2tp_session_id_hash(tunnel, session_id); + read_lock_bh(&tunnel->hlist_lock); + hlist_for_each_entry(session, walk, session_list, hlist) { + if (session->session_id == session_id) { + read_unlock_bh(&tunnel->hlist_lock); + return session; + } + } + read_unlock_bh(&tunnel->hlist_lock); + + return NULL; +} +EXPORT_SYMBOL_GPL(l2tp_session_find); + +struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth) +{ + int hash; + struct hlist_node *walk; + struct l2tp_session *session; + int count = 0; + + read_lock_bh(&tunnel->hlist_lock); + for (hash = 0; hash < L2TP_HASH_SIZE; hash++) { + hlist_for_each_entry(session, walk, &tunnel->session_hlist[hash], hlist) { + if (++count > nth) { + read_unlock_bh(&tunnel->hlist_lock); + return session; + } + } + } + + read_unlock_bh(&tunnel->hlist_lock); + + return NULL; +} +EXPORT_SYMBOL_GPL(l2tp_session_find_nth); + +/* Lookup a session by interface name. + * This is very inefficient but is only used by management interfaces. 
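Both hash helpers above map a session_id to a bucket with hash_32() rather than a plain bitmask, so the incrementing IDs used by some implementations still spread across the table. The sketch below shows the same idea in user space with a multiplicative 32-bit hash; the golden-ratio constant is illustrative and not necessarily the exact value the kernel's hash_32() uses.

#include <stdio.h>
#include <stdint.h>

#define HASH_BITS_2 8                    /* mirrors L2TP_HASH_BITS_2 */
#define HASH_SIZE_2 (1u << HASH_BITS_2)

/* Multiplicative hash in the spirit of hash_32(). */
static unsigned int session_bucket(uint32_t session_id)
{
    return (uint32_t)(session_id * 2654435769u) >> (32 - HASH_BITS_2);
}

int main(void)
{
    /* Even strictly incrementing IDs land in well-spread buckets. */
    for (uint32_t id = 1; id <= 5; id++)
        printf("session %u -> bucket %u of %u\n",
               id, session_bucket(id), HASH_SIZE_2);
    return 0;
}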
+ */ +struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname) +{ + struct l2tp_net *pn = l2tp_pernet(net); + int hash; + struct hlist_node *walk; + struct l2tp_session *session; + + rcu_read_lock_bh(); + for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++) { + hlist_for_each_entry_rcu(session, walk, &pn->l2tp_session_hlist[hash], global_hlist) { + if (!strcmp(session->ifname, ifname)) { + rcu_read_unlock_bh(); + return session; + } + } + } + + rcu_read_unlock_bh(); + + return NULL; +} +EXPORT_SYMBOL_GPL(l2tp_session_find_by_ifname); + +/* Lookup a tunnel by id + */ +struct l2tp_tunnel *l2tp_tunnel_find(struct net *net, u32 tunnel_id) +{ + struct l2tp_tunnel *tunnel; + struct l2tp_net *pn = l2tp_pernet(net); + + rcu_read_lock_bh(); + list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) { + if (tunnel->tunnel_id == tunnel_id) { + rcu_read_unlock_bh(); + return tunnel; + } + } + rcu_read_unlock_bh(); + + return NULL; +} +EXPORT_SYMBOL_GPL(l2tp_tunnel_find); + +struct l2tp_tunnel *l2tp_tunnel_find_nth(struct net *net, int nth) +{ + struct l2tp_net *pn = l2tp_pernet(net); + struct l2tp_tunnel *tunnel; + int count = 0; + + rcu_read_lock_bh(); + list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) { + if (++count > nth) { + rcu_read_unlock_bh(); + return tunnel; + } + } + + rcu_read_unlock_bh(); + + return NULL; +} +EXPORT_SYMBOL_GPL(l2tp_tunnel_find_nth); + +/***************************************************************************** + * Receive data handling + *****************************************************************************/ + +/* Queue a skb in order. We come here only if the skb has an L2TP sequence + * number. + */ +static void l2tp_recv_queue_skb(struct l2tp_session *session, struct sk_buff *skb) +{ + struct sk_buff *skbp; + struct sk_buff *tmp; + u32 ns = L2TP_SKB_CB(skb)->ns; + + spin_lock_bh(&session->reorder_q.lock); + skb_queue_walk_safe(&session->reorder_q, skbp, tmp) { + if (L2TP_SKB_CB(skbp)->ns > ns) { + __skb_queue_before(&session->reorder_q, skbp, skb); + PRINTK(session->debug, L2TP_MSG_SEQ, KERN_DEBUG, + "%s: pkt %hu, inserted before %hu, reorder_q len=%d\n", + session->name, ns, L2TP_SKB_CB(skbp)->ns, + skb_queue_len(&session->reorder_q)); + session->stats.rx_oos_packets++; + goto out; + } + } + + __skb_queue_tail(&session->reorder_q, skb); + +out: + spin_unlock_bh(&session->reorder_q.lock); +} + +/* Dequeue a single skb. + */ +static void l2tp_recv_dequeue_skb(struct l2tp_session *session, struct sk_buff *skb) +{ + struct l2tp_tunnel *tunnel = session->tunnel; + int length = L2TP_SKB_CB(skb)->length; + + /* We're about to requeue the skb, so return resources + * to its current owner (a socket receive buffer). + */ + skb_orphan(skb); + + tunnel->stats.rx_packets++; + tunnel->stats.rx_bytes += length; + session->stats.rx_packets++; + session->stats.rx_bytes += length; + + if (L2TP_SKB_CB(skb)->has_seq) { + /* Bump our Nr */ + session->nr++; + if (tunnel->version == L2TP_HDR_VER_2) + session->nr &= 0xffff; + else + session->nr &= 0xffffff; + + PRINTK(session->debug, L2TP_MSG_SEQ, KERN_DEBUG, + "%s: updated nr to %hu\n", session->name, session->nr); + } + + /* call private receive handler */ + if (session->recv_skb != NULL) + (*session->recv_skb)(session, skb, L2TP_SKB_CB(skb)->length); + else + kfree_skb(skb); + + if (session->deref) + (*session->deref)(session); +} + +/* Dequeue skbs from the session's reorder_q, subject to packet order. + * Skbs that have been in the queue for too long are simply discarded. 
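l2tp_recv_queue_skb() above keeps the reorder queue sorted by walking it and inserting the new packet in front of the first entry with a larger sequence number. Here is a user-space sketch of that ordered insert, with plain heap-allocated records standing in for sk_buffs.

#include <stdio.h>
#include <stdlib.h>

struct pkt {
    unsigned int ns;         /* L2TP sequence number */
    struct pkt *next;
};

/* Insert p before the first queued packet whose ns is larger,
 * or at the tail if there is none. */
static void queue_in_order(struct pkt **head, struct pkt *p)
{
    while (*head && (*head)->ns <= p->ns)
        head = &(*head)->next;
    p->next = *head;
    *head = p;
}

int main(void)
{
    unsigned int arrivals[] = { 1, 4, 3, 2 };    /* 3 and 2 arrive late */
    struct pkt *q = NULL;

    for (int i = 0; i < 4; i++) {
        struct pkt *p = malloc(sizeof(*p));

        if (!p)
            return 1;
        p->ns = arrivals[i];
        queue_in_order(&q, p);
    }

    while (q) {                                  /* print in ns order, then free */
        struct pkt *next = q->next;

        printf("ns=%u\n", q->ns);
        free(q);
        q = next;
    }
    return 0;
}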
+ */ +static void l2tp_recv_dequeue(struct l2tp_session *session) +{ + struct sk_buff *skb; + struct sk_buff *tmp; + + /* If the pkt at the head of the queue has the nr that we + * expect to send up next, dequeue it and any other + * in-sequence packets behind it. + */ + spin_lock_bh(&session->reorder_q.lock); + skb_queue_walk_safe(&session->reorder_q, skb, tmp) { + if (time_after(jiffies, L2TP_SKB_CB(skb)->expires)) { + session->stats.rx_seq_discards++; + session->stats.rx_errors++; + PRINTK(session->debug, L2TP_MSG_SEQ, KERN_DEBUG, + "%s: oos pkt %u len %d discarded (too old), " + "waiting for %u, reorder_q_len=%d\n", + session->name, L2TP_SKB_CB(skb)->ns, + L2TP_SKB_CB(skb)->length, session->nr, + skb_queue_len(&session->reorder_q)); + __skb_unlink(skb, &session->reorder_q); + kfree_skb(skb); + if (session->deref) + (*session->deref)(session); + continue; + } + + if (L2TP_SKB_CB(skb)->has_seq) { + if (L2TP_SKB_CB(skb)->ns != session->nr) { + PRINTK(session->debug, L2TP_MSG_SEQ, KERN_DEBUG, + "%s: holding oos pkt %u len %d, " + "waiting for %u, reorder_q_len=%d\n", + session->name, L2TP_SKB_CB(skb)->ns, + L2TP_SKB_CB(skb)->length, session->nr, + skb_queue_len(&session->reorder_q)); + goto out; + } + } + __skb_unlink(skb, &session->reorder_q); + + /* Process the skb. We release the queue lock while we + * do so to let other contexts process the queue. + */ + spin_unlock_bh(&session->reorder_q.lock); + l2tp_recv_dequeue_skb(session, skb); + spin_lock_bh(&session->reorder_q.lock); + } + +out: + spin_unlock_bh(&session->reorder_q.lock); +} + +static inline int l2tp_verify_udp_checksum(struct sock *sk, + struct sk_buff *skb) +{ + struct udphdr *uh = udp_hdr(skb); + u16 ulen = ntohs(uh->len); + struct inet_sock *inet; + __wsum psum; + + if (sk->sk_no_check || skb_csum_unnecessary(skb) || !uh->check) + return 0; + + inet = inet_sk(sk); + psum = csum_tcpudp_nofold(inet->inet_saddr, inet->inet_daddr, ulen, + IPPROTO_UDP, 0); + + if ((skb->ip_summed == CHECKSUM_COMPLETE) && + !csum_fold(csum_add(psum, skb->csum))) + return 0; + + skb->csum = psum; + + return __skb_checksum_complete(skb); +} + +/* Do receive processing of L2TP data frames. We handle both L2TPv2 + * and L2TPv3 data frames here. + * + * L2TPv2 Data Message Header + * + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |T|L|x|x|S|x|O|P|x|x|x|x| Ver | Length (opt) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Tunnel ID | Session ID | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Ns (opt) | Nr (opt) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Offset Size (opt) | Offset pad... (opt) + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Data frames are marked by T=0. All other fields are the same as + * those in L2TP control frames. + * + * L2TPv3 Data Message Header + * + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | L2TP Session Header | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | L2-Specific Sublayer | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Tunnel Payload ... 
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * L2TPv3 Session Header Over IP + * + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Session ID | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Cookie (optional, maximum 64 bits)... + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * L2TPv3 L2-Specific Sublayer Format + * + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |x|S|x|x|x|x|x|x| Sequence Number | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Cookie value, sublayer format and offset (pad) are negotiated with + * the peer when the session is set up. Unlike L2TPv2, we do not need + * to parse the packet header to determine if optional fields are + * present. + * + * Caller must already have parsed the frame and determined that it is + * a data (not control) frame before coming here. Fields up to the + * session-id have already been parsed and ptr points to the data + * after the session-id. + */ +void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, + unsigned char *ptr, unsigned char *optr, u16 hdrflags, + int length, int (*payload_hook)(struct sk_buff *skb)) +{ + struct l2tp_tunnel *tunnel = session->tunnel; + int offset; + u32 ns, nr; + + /* The ref count is increased since we now hold a pointer to + * the session. Take care to decrement the refcnt when exiting + * this function from now on... + */ + l2tp_session_inc_refcount(session); + if (session->ref) + (*session->ref)(session); + + /* Parse and check optional cookie */ + if (session->peer_cookie_len > 0) { + if (memcmp(ptr, &session->peer_cookie[0], session->peer_cookie_len)) { + PRINTK(tunnel->debug, L2TP_MSG_DATA, KERN_INFO, + "%s: cookie mismatch (%u/%u). Discarding.\n", + tunnel->name, tunnel->tunnel_id, session->session_id); + session->stats.rx_cookie_discards++; + goto discard; + } + ptr += session->peer_cookie_len; + } + + /* Handle the optional sequence numbers. Sequence numbers are + * in different places for L2TPv2 and L2TPv3. + * + * If we are the LAC, enable/disable sequence numbers under + * the control of the LNS. If no sequence numbers present but + * we were expecting them, discard frame. + */ + ns = nr = 0; + L2TP_SKB_CB(skb)->has_seq = 0; + if (tunnel->version == L2TP_HDR_VER_2) { + if (hdrflags & L2TP_HDRFLAG_S) { + ns = ntohs(*(__be16 *) ptr); + ptr += 2; + nr = ntohs(*(__be16 *) ptr); + ptr += 2; + + /* Store L2TP info in the skb */ + L2TP_SKB_CB(skb)->ns = ns; + L2TP_SKB_CB(skb)->has_seq = 1; + + PRINTK(session->debug, L2TP_MSG_SEQ, KERN_DEBUG, + "%s: recv data ns=%u, nr=%u, session nr=%u\n", + session->name, ns, nr, session->nr); + } + } else if (session->l2specific_type == L2TP_L2SPECTYPE_DEFAULT) { + u32 l2h = ntohl(*(__be32 *) ptr); + + if (l2h & 0x40000000) { + ns = l2h & 0x00ffffff; + + /* Store L2TP info in the skb */ + L2TP_SKB_CB(skb)->ns = ns; + L2TP_SKB_CB(skb)->has_seq = 1; + + PRINTK(session->debug, L2TP_MSG_SEQ, KERN_DEBUG, + "%s: recv data ns=%u, session nr=%u\n", + session->name, ns, session->nr); + } + } + + /* Advance past L2-specific header, if present */ + ptr += session->l2specific_len; + + if (L2TP_SKB_CB(skb)->has_seq) { + /* Received a packet with sequence numbers. 
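For L2TPv3 sessions using the default L2-specific sublayer, the receive path above reads a single 32-bit word: bit 30 (0x40000000) is the S flag and the low 24 bits (0x00ffffff) carry the sequence number. A user-space sketch of that extraction follows; the sample word is an arbitrary host-order value.

#include <stdio.h>
#include <stdint.h>

#define SLFLAG_S    0x40000000u      /* mirrors L2TP_SLFLAG_S    */
#define SL_SEQ_MASK 0x00ffffffu      /* mirrors L2TP_SL_SEQ_MASK */

int main(void)
{
    uint32_t l2h = 0x400002a5;       /* invented host-order example */

    if (l2h & SLFLAG_S)
        printf("sequenced frame, ns=%u\n", l2h & SL_SEQ_MASK);
    else
        printf("frame carries no sequence number\n");
    return 0;
}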
If we're the LNS, + * check if we sre sending sequence numbers and if not, + * configure it so. + */ + if ((!session->lns_mode) && (!session->send_seq)) { + PRINTK(session->debug, L2TP_MSG_SEQ, KERN_INFO, + "%s: requested to enable seq numbers by LNS\n", + session->name); + session->send_seq = -1; + l2tp_session_set_header_len(session, tunnel->version); + } + } else { + /* No sequence numbers. + * If user has configured mandatory sequence numbers, discard. + */ + if (session->recv_seq) { + PRINTK(session->debug, L2TP_MSG_SEQ, KERN_WARNING, + "%s: recv data has no seq numbers when required. " + "Discarding\n", session->name); + session->stats.rx_seq_discards++; + goto discard; + } + + /* If we're the LAC and we're sending sequence numbers, the + * LNS has requested that we no longer send sequence numbers. + * If we're the LNS and we're sending sequence numbers, the + * LAC is broken. Discard the frame. + */ + if ((!session->lns_mode) && (session->send_seq)) { + PRINTK(session->debug, L2TP_MSG_SEQ, KERN_INFO, + "%s: requested to disable seq numbers by LNS\n", + session->name); + session->send_seq = 0; + l2tp_session_set_header_len(session, tunnel->version); + } else if (session->send_seq) { + PRINTK(session->debug, L2TP_MSG_SEQ, KERN_WARNING, + "%s: recv data has no seq numbers when required. " + "Discarding\n", session->name); + session->stats.rx_seq_discards++; + goto discard; + } + } + + /* Session data offset is handled differently for L2TPv2 and + * L2TPv3. For L2TPv2, there is an optional 16-bit value in + * the header. For L2TPv3, the offset is negotiated using AVPs + * in the session setup control protocol. + */ + if (tunnel->version == L2TP_HDR_VER_2) { + /* If offset bit set, skip it. */ + if (hdrflags & L2TP_HDRFLAG_O) { + offset = ntohs(*(__be16 *)ptr); + ptr += 2 + offset; + } + } else + ptr += session->offset; + + offset = ptr - optr; + if (!pskb_may_pull(skb, offset)) + goto discard; + + __skb_pull(skb, offset); + + /* If caller wants to process the payload before we queue the + * packet, do so now. + */ + if (payload_hook) + if ((*payload_hook)(skb)) + goto discard; + + /* Prepare skb for adding to the session's reorder_q. Hold + * packets for max reorder_timeout or 1 second if not + * reordering. + */ + L2TP_SKB_CB(skb)->length = length; + L2TP_SKB_CB(skb)->expires = jiffies + + (session->reorder_timeout ? session->reorder_timeout : HZ); + + /* Add packet to the session's receive queue. Reordering is done here, if + * enabled. Saved L2TP protocol info is stored in skb->sb[]. + */ + if (L2TP_SKB_CB(skb)->has_seq) { + if (session->reorder_timeout != 0) { + /* Packet reordering enabled. Add skb to session's + * reorder queue, in order of ns. + */ + l2tp_recv_queue_skb(session, skb); + } else { + /* Packet reordering disabled. Discard out-of-sequence + * packets + */ + if (L2TP_SKB_CB(skb)->ns != session->nr) { + session->stats.rx_seq_discards++; + PRINTK(session->debug, L2TP_MSG_SEQ, KERN_DEBUG, + "%s: oos pkt %u len %d discarded, " + "waiting for %u, reorder_q_len=%d\n", + session->name, L2TP_SKB_CB(skb)->ns, + L2TP_SKB_CB(skb)->length, session->nr, + skb_queue_len(&session->reorder_q)); + goto discard; + } + skb_queue_tail(&session->reorder_q, skb); + } + } else { + /* No sequence numbers. Add the skb to the tail of the + * reorder queue. This ensures that it will be + * delivered after all previous sequenced skbs. + */ + skb_queue_tail(&session->reorder_q, skb); + } + + /* Try to dequeue as many skbs from reorder_q as we can. 
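l2tp_recv_dequeue() only delivers packets while the head of the queue carries the expected nr, and nr wraps at 16 bits for L2TPv2 (24 bits for L2TPv3). The user-space sketch below walks a small, hypothetical queue to show that policy.

#include <stdio.h>

int main(void)
{
    unsigned int queued_ns[] = { 7, 8, 10 };     /* ns 9 has not arrived */
    unsigned int nr = 7;                         /* next expected ns     */

    for (int i = 0; i < 3; i++) {
        if (queued_ns[i] != nr) {
            printf("hold ns=%u, still waiting for %u\n", queued_ns[i], nr);
            break;
        }
        printf("deliver ns=%u\n", queued_ns[i]);
        nr = (nr + 1) & 0xffff;                  /* 16-bit wrap for L2TPv2 */
    }
    return 0;
}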
*/ + l2tp_recv_dequeue(session); + + l2tp_session_dec_refcount(session); + + return; + +discard: + session->stats.rx_errors++; + kfree_skb(skb); + + if (session->deref) + (*session->deref)(session); + + l2tp_session_dec_refcount(session); +} +EXPORT_SYMBOL(l2tp_recv_common); + +/* Internal UDP receive frame. Do the real work of receiving an L2TP data frame + * here. The skb is not on a list when we get here. + * Returns 0 if the packet was a data packet and was successfully passed on. + * Returns 1 if the packet was not a good data packet and could not be + * forwarded. All such packets are passed up to userspace to deal with. + */ +static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, + int (*payload_hook)(struct sk_buff *skb)) +{ + struct l2tp_session *session = NULL; + unsigned char *ptr, *optr; + u16 hdrflags; + u32 tunnel_id, session_id; + int offset; + u16 version; + int length; + + if (tunnel->sock && l2tp_verify_udp_checksum(tunnel->sock, skb)) + goto discard_bad_csum; + + /* UDP always verifies the packet length. */ + __skb_pull(skb, sizeof(struct udphdr)); + + /* Short packet? */ + if (!pskb_may_pull(skb, L2TP_HDR_SIZE_SEQ)) { + PRINTK(tunnel->debug, L2TP_MSG_DATA, KERN_INFO, + "%s: recv short packet (len=%d)\n", tunnel->name, skb->len); + goto error; + } + + /* Point to L2TP header */ + optr = ptr = skb->data; + + /* Trace packet contents, if enabled */ + if (tunnel->debug & L2TP_MSG_DATA) { + length = min(32u, skb->len); + if (!pskb_may_pull(skb, length)) + goto error; + + printk(KERN_DEBUG "%s: recv: ", tunnel->name); + + offset = 0; + do { + printk(" %02X", ptr[offset]); + } while (++offset < length); + + printk("\n"); + } + + /* Get L2TP header flags */ + hdrflags = ntohs(*(__be16 *) ptr); + + /* Check protocol version */ + version = hdrflags & L2TP_HDR_VER_MASK; + if (version != tunnel->version) { + PRINTK(tunnel->debug, L2TP_MSG_DATA, KERN_INFO, + "%s: recv protocol version mismatch: got %d expected %d\n", + tunnel->name, version, tunnel->version); + goto error; + } + + /* Get length of L2TP packet */ + length = skb->len; + + /* If type is control packet, it is handled by userspace. */ + if (hdrflags & L2TP_HDRFLAG_T) { + PRINTK(tunnel->debug, L2TP_MSG_DATA, KERN_DEBUG, + "%s: recv control packet, len=%d\n", tunnel->name, length); + goto error; + } + + /* Skip flags */ + ptr += 2; + + if (tunnel->version == L2TP_HDR_VER_2) { + /* If length is present, skip it */ + if (hdrflags & L2TP_HDRFLAG_L) + ptr += 2; + + /* Extract tunnel and session ID */ + tunnel_id = ntohs(*(__be16 *) ptr); + ptr += 2; + session_id = ntohs(*(__be16 *) ptr); + ptr += 2; + } else { + ptr += 2; /* skip reserved bits */ + tunnel_id = tunnel->tunnel_id; + session_id = ntohl(*(__be32 *) ptr); + ptr += 4; + } + + /* Find the session context */ + session = l2tp_session_find(tunnel->l2tp_net, tunnel, session_id); + if (!session || !session->recv_skb) { + /* Not found? Pass to userspace to deal with */ + PRINTK(tunnel->debug, L2TP_MSG_DATA, KERN_INFO, + "%s: no session found (%u/%u). Passing up.\n", + tunnel->name, tunnel_id, session_id); + goto error; + } + + l2tp_recv_common(session, skb, ptr, optr, hdrflags, length, payload_hook); + + return 0; + +discard_bad_csum: + LIMIT_NETDEBUG("%s: UDP: bad checksum\n", tunnel->name); + UDP_INC_STATS_USER(tunnel->l2tp_net, UDP_MIB_INERRORS, 0); + tunnel->stats.rx_errors++; + kfree_skb(skb); + + return 0; + +error: + /* Put UDP header back */ + __skb_push(skb, sizeof(struct udphdr)); + + return 1; +} + +/* UDP encapsulation receive handler. 
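l2tp_udp_recv_core() above locates the session from IDs in the data header: L2TPv2 carries 16-bit tunnel and session IDs, while L2TPv3 over UDP has a 32-bit session ID after two reserved bytes. A user-space sketch of that parsing is shown here; the byte buffers are invented examples.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

int main(void)
{
    /* Invented L2TPv2 data header: flags/ver=0x0002 (no length field),
     * tunnel id 9, session id 17, all big-endian on the wire. */
    const uint8_t v2hdr[] = { 0x00, 0x02, 0x00, 0x09, 0x00, 0x11 };
    uint16_t tid, sid;

    memcpy(&tid, v2hdr + 2, sizeof(tid));
    memcpy(&sid, v2hdr + 4, sizeof(sid));
    printf("v2: tunnel %u, session %u\n", (unsigned)ntohs(tid), (unsigned)ntohs(sid));

    /* Invented L2TPv3-over-UDP header: flags/ver=0x0003, two reserved
     * bytes, then a 32-bit session id (42). */
    const uint8_t v3hdr[] = { 0x00, 0x03, 0x00, 0x00,
                              0x00, 0x00, 0x00, 0x2a };
    uint32_t sid3;

    memcpy(&sid3, v3hdr + 4, sizeof(sid3));
    printf("v3: session %u\n", ntohl(sid3));
    return 0;
}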
See net/ipv4/udp.c. + * Return codes: + * 0 : success. + * <0: error + * >0: skb should be passed up to userspace as UDP. + */ +int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) +{ + struct l2tp_tunnel *tunnel; + + tunnel = l2tp_sock_to_tunnel(sk); + if (tunnel == NULL) + goto pass_up; + + PRINTK(tunnel->debug, L2TP_MSG_DATA, KERN_DEBUG, + "%s: received %d bytes\n", tunnel->name, skb->len); + + if (l2tp_udp_recv_core(tunnel, skb, tunnel->recv_payload_hook)) + goto pass_up_put; + + sock_put(sk); + return 0; + +pass_up_put: + sock_put(sk); +pass_up: + return 1; +} +EXPORT_SYMBOL_GPL(l2tp_udp_encap_recv); + +/************************************************************************ + * Transmit handling + ***********************************************************************/ + +/* Build an L2TP header for the session into the buffer provided. + */ +static int l2tp_build_l2tpv2_header(struct l2tp_session *session, void *buf) +{ + struct l2tp_tunnel *tunnel = session->tunnel; + __be16 *bufp = buf; + __be16 *optr = buf; + u16 flags = L2TP_HDR_VER_2; + u32 tunnel_id = tunnel->peer_tunnel_id; + u32 session_id = session->peer_session_id; + + if (session->send_seq) + flags |= L2TP_HDRFLAG_S; + + /* Setup L2TP header. */ + *bufp++ = htons(flags); + *bufp++ = htons(tunnel_id); + *bufp++ = htons(session_id); + if (session->send_seq) { + *bufp++ = htons(session->ns); + *bufp++ = 0; + session->ns++; + session->ns &= 0xffff; + PRINTK(session->debug, L2TP_MSG_SEQ, KERN_DEBUG, + "%s: updated ns to %u\n", session->name, session->ns); + } + + return bufp - optr; +} + +static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf) +{ + struct l2tp_tunnel *tunnel = session->tunnel; + char *bufp = buf; + char *optr = bufp; + + /* Setup L2TP header. The header differs slightly for UDP and + * IP encapsulations. For UDP, there is 4 bytes of flags. + */ + if (tunnel->encap == L2TP_ENCAPTYPE_UDP) { + u16 flags = L2TP_HDR_VER_3; + *((__be16 *) bufp) = htons(flags); + bufp += 2; + *((__be16 *) bufp) = 0; + bufp += 2; + } + + *((__be32 *) bufp) = htonl(session->peer_session_id); + bufp += 4; + if (session->cookie_len) { + memcpy(bufp, &session->cookie[0], session->cookie_len); + bufp += session->cookie_len; + } + if (session->l2specific_len) { + if (session->l2specific_type == L2TP_L2SPECTYPE_DEFAULT) { + u32 l2h = 0; + if (session->send_seq) { + l2h = 0x40000000 | session->ns; + session->ns++; + session->ns &= 0xffffff; + PRINTK(session->debug, L2TP_MSG_SEQ, KERN_DEBUG, + "%s: updated ns to %u\n", session->name, session->ns); + } + + *((__be32 *) bufp) = htonl(l2h); + } + bufp += session->l2specific_len; + } + if (session->offset) + bufp += session->offset; + + return bufp - optr; +} + +static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, + struct flowi *fl, size_t data_len) +{ + struct l2tp_tunnel *tunnel = session->tunnel; + unsigned int len = skb->len; + int error; + + /* Debug */ + if (session->send_seq) + PRINTK(session->debug, L2TP_MSG_DATA, KERN_DEBUG, + "%s: send %Zd bytes, ns=%u\n", session->name, + data_len, session->ns - 1); + else + PRINTK(session->debug, L2TP_MSG_DATA, KERN_DEBUG, + "%s: send %Zd bytes\n", session->name, data_len); + + if (session->debug & L2TP_MSG_DATA) { + int i; + int uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? 
sizeof(struct udphdr) : 0; + unsigned char *datap = skb->data + uhlen; + + printk(KERN_DEBUG "%s: xmit:", session->name); + for (i = 0; i < (len - uhlen); i++) { + printk(" %02X", *datap++); + if (i == 31) { + printk(" ..."); + break; + } + } + printk("\n"); + } + + /* Queue the packet to IP for output */ + skb->local_df = 1; + error = ip_queue_xmit(skb, fl); + + /* Update stats */ + if (error >= 0) { + tunnel->stats.tx_packets++; + tunnel->stats.tx_bytes += len; + session->stats.tx_packets++; + session->stats.tx_bytes += len; + } else { + tunnel->stats.tx_errors++; + session->stats.tx_errors++; + } + + return 0; +} + +/* Automatically called when the skb is freed. + */ +static void l2tp_sock_wfree(struct sk_buff *skb) +{ + sock_put(skb->sk); +} + +/* For data skbs that we transmit, we associate with the tunnel socket + * but don't do accounting. + */ +static inline void l2tp_skb_set_owner_w(struct sk_buff *skb, struct sock *sk) +{ + sock_hold(sk); + skb->sk = sk; + skb->destructor = l2tp_sock_wfree; +} + +/* If caller requires the skb to have a ppp header, the header must be + * inserted in the skb data before calling this function. + */ +int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len) +{ + int data_len = skb->len; + struct l2tp_tunnel *tunnel = session->tunnel; + struct sock *sk = tunnel->sock; + struct flowi *fl; + struct udphdr *uh; + struct inet_sock *inet; + __wsum csum; + int old_headroom; + int new_headroom; + int headroom; + int uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0; + int udp_len; + + /* Check that there's enough headroom in the skb to insert IP, + * UDP and L2TP headers. If not enough, expand it to + * make room. Adjust truesize. + */ + headroom = NET_SKB_PAD + sizeof(struct iphdr) + + uhlen + hdr_len; + old_headroom = skb_headroom(skb); + if (skb_cow_head(skb, headroom)) { + dev_kfree_skb(skb); + goto abort; + } + + new_headroom = skb_headroom(skb); + skb_orphan(skb); + skb->truesize += new_headroom - old_headroom; + + /* Setup L2TP header */ + session->build_header(session, __skb_push(skb, hdr_len)); + + /* Reset skb netfilter state */ + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | + IPSKB_REROUTED); + nf_reset(skb); + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + dev_kfree_skb(skb); + goto out_unlock; + } + + /* Get routing info from the tunnel socket */ + skb_dst_drop(skb); + skb_dst_set(skb, dst_clone(__sk_dst_get(sk))); + + inet = inet_sk(sk); + fl = &inet->cork.fl; + switch (tunnel->encap) { + case L2TP_ENCAPTYPE_UDP: + /* Setup UDP header */ + __skb_push(skb, sizeof(*uh)); + skb_reset_transport_header(skb); + uh = udp_hdr(skb); + uh->source = inet->inet_sport; + uh->dest = inet->inet_dport; + udp_len = uhlen + hdr_len + data_len; + uh->len = htons(udp_len); + uh->check = 0; + + /* Calculate UDP checksum if configured to do so */ + if (sk->sk_no_check == UDP_CSUM_NOXMIT) + skb->ip_summed = CHECKSUM_NONE; + else if ((skb_dst(skb) && skb_dst(skb)->dev) && + (!(skb_dst(skb)->dev->features & NETIF_F_V4_CSUM))) { + skb->ip_summed = CHECKSUM_COMPLETE; + csum = skb_checksum(skb, 0, udp_len, 0); + uh->check = csum_tcpudp_magic(inet->inet_saddr, + inet->inet_daddr, + udp_len, IPPROTO_UDP, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } else { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + uh->check = 
~csum_tcpudp_magic(inet->inet_saddr, + inet->inet_daddr, + udp_len, IPPROTO_UDP, 0); + } + break; + + case L2TP_ENCAPTYPE_IP: + break; + } + + l2tp_skb_set_owner_w(skb, sk); + + l2tp_xmit_core(session, skb, fl, data_len); +out_unlock: + bh_unlock_sock(sk); + +abort: + return 0; +} +EXPORT_SYMBOL_GPL(l2tp_xmit_skb); + +/***************************************************************************** + * Tinnel and session create/destroy. + *****************************************************************************/ + +/* Tunnel socket destruct hook. + * The tunnel context is deleted only when all session sockets have been + * closed. + */ +static void l2tp_tunnel_destruct(struct sock *sk) +{ + struct l2tp_tunnel *tunnel; + + tunnel = sk->sk_user_data; + if (tunnel == NULL) + goto end; + + PRINTK(tunnel->debug, L2TP_MSG_CONTROL, KERN_INFO, + "%s: closing...\n", tunnel->name); + + /* Close all sessions */ + l2tp_tunnel_closeall(tunnel); + + switch (tunnel->encap) { + case L2TP_ENCAPTYPE_UDP: + /* No longer an encapsulation socket. See net/ipv4/udp.c */ + (udp_sk(sk))->encap_type = 0; + (udp_sk(sk))->encap_rcv = NULL; + break; + case L2TP_ENCAPTYPE_IP: + break; + } + + /* Remove hooks into tunnel socket */ + tunnel->sock = NULL; + sk->sk_destruct = tunnel->old_sk_destruct; + sk->sk_user_data = NULL; + + /* Call the original destructor */ + if (sk->sk_destruct) + (*sk->sk_destruct)(sk); + + /* We're finished with the socket */ + l2tp_tunnel_dec_refcount(tunnel); + +end: + return; +} + +/* When the tunnel is closed, all the attached sessions need to go too. + */ +static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) +{ + int hash; + struct hlist_node *walk; + struct hlist_node *tmp; + struct l2tp_session *session; + + BUG_ON(tunnel == NULL); + + PRINTK(tunnel->debug, L2TP_MSG_CONTROL, KERN_INFO, + "%s: closing all sessions...\n", tunnel->name); + + write_lock_bh(&tunnel->hlist_lock); + for (hash = 0; hash < L2TP_HASH_SIZE; hash++) { +again: + hlist_for_each_safe(walk, tmp, &tunnel->session_hlist[hash]) { + session = hlist_entry(walk, struct l2tp_session, hlist); + + PRINTK(session->debug, L2TP_MSG_CONTROL, KERN_INFO, + "%s: closing session\n", session->name); + + hlist_del_init(&session->hlist); + + /* Since we should hold the sock lock while + * doing any unbinding, we need to release the + * lock we're holding before taking that lock. + * Hold a reference to the sock so it doesn't + * disappear as we're jumping between locks. + */ + if (session->ref != NULL) + (*session->ref)(session); + + write_unlock_bh(&tunnel->hlist_lock); + + if (tunnel->version != L2TP_HDR_VER_2) { + struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); + + spin_lock_bh(&pn->l2tp_session_hlist_lock); + hlist_del_init_rcu(&session->global_hlist); + spin_unlock_bh(&pn->l2tp_session_hlist_lock); + synchronize_rcu(); + } + + if (session->session_close != NULL) + (*session->session_close)(session); + + if (session->deref != NULL) + (*session->deref)(session); + + write_lock_bh(&tunnel->hlist_lock); + + /* Now restart from the beginning of this hash + * chain. We always remove a session from the + * list so we are guaranteed to make forward + * progress. + */ + goto again; + } + } + write_unlock_bh(&tunnel->hlist_lock); +} + +/* Really kill the tunnel. + * Come here only when all sessions have been cleared from the tunnel. 
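Tunnels and sessions are kept alive by reference counts and are freed only when the count drops to zero, as l2tp_tunnel_free() below requires. This user-space sketch shows the same acquire/release discipline, with C11 atomics standing in for the kernel's atomic_t.

#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>

struct obj {
    atomic_int ref_count;
};

static void obj_get(struct obj *o)
{
    atomic_fetch_add(&o->ref_count, 1);
}

static void obj_put(struct obj *o)
{
    /* Free only when the last reference goes away, mirroring
     * the l2tp_tunnel/session dec_refcount helpers. */
    if (atomic_fetch_sub(&o->ref_count, 1) == 1) {
        printf("last reference dropped, freeing\n");
        free(o);
    }
}

int main(void)
{
    struct obj *o = malloc(sizeof(*o));

    if (!o)
        return 1;
    atomic_init(&o->ref_count, 1);   /* creator holds one reference */
    obj_get(o);                      /* e.g. a session joins        */
    obj_put(o);                      /* the session goes away       */
    obj_put(o);                      /* creator's put frees the obj */
    return 0;
}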
+ */ +static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) +{ + struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); + + BUG_ON(atomic_read(&tunnel->ref_count) != 0); + BUG_ON(tunnel->sock != NULL); + + PRINTK(tunnel->debug, L2TP_MSG_CONTROL, KERN_INFO, + "%s: free...\n", tunnel->name); + + /* Remove from tunnel list */ + spin_lock_bh(&pn->l2tp_tunnel_list_lock); + list_del_rcu(&tunnel->list); + spin_unlock_bh(&pn->l2tp_tunnel_list_lock); + synchronize_rcu(); + + atomic_dec(&l2tp_tunnel_count); + kfree(tunnel); +} + +/* Create a socket for the tunnel, if one isn't set up by + * userspace. This is used for static tunnels where there is no + * managing L2TP daemon. + */ +static int l2tp_tunnel_sock_create(u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct socket **sockp) +{ + int err = -EINVAL; + struct sockaddr_in udp_addr; + struct sockaddr_l2tpip ip_addr; + struct socket *sock = NULL; + + switch (cfg->encap) { + case L2TP_ENCAPTYPE_UDP: + err = sock_create(AF_INET, SOCK_DGRAM, 0, sockp); + if (err < 0) + goto out; + + sock = *sockp; + + memset(&udp_addr, 0, sizeof(udp_addr)); + udp_addr.sin_family = AF_INET; + udp_addr.sin_addr = cfg->local_ip; + udp_addr.sin_port = htons(cfg->local_udp_port); + err = kernel_bind(sock, (struct sockaddr *) &udp_addr, sizeof(udp_addr)); + if (err < 0) + goto out; + + udp_addr.sin_family = AF_INET; + udp_addr.sin_addr = cfg->peer_ip; + udp_addr.sin_port = htons(cfg->peer_udp_port); + err = kernel_connect(sock, (struct sockaddr *) &udp_addr, sizeof(udp_addr), 0); + if (err < 0) + goto out; + + if (!cfg->use_udp_checksums) + sock->sk->sk_no_check = UDP_CSUM_NOXMIT; + + break; + + case L2TP_ENCAPTYPE_IP: + err = sock_create(AF_INET, SOCK_DGRAM, IPPROTO_L2TP, sockp); + if (err < 0) + goto out; + + sock = *sockp; + + memset(&ip_addr, 0, sizeof(ip_addr)); + ip_addr.l2tp_family = AF_INET; + ip_addr.l2tp_addr = cfg->local_ip; + ip_addr.l2tp_conn_id = tunnel_id; + err = kernel_bind(sock, (struct sockaddr *) &ip_addr, sizeof(ip_addr)); + if (err < 0) + goto out; + + ip_addr.l2tp_family = AF_INET; + ip_addr.l2tp_addr = cfg->peer_ip; + ip_addr.l2tp_conn_id = peer_tunnel_id; + err = kernel_connect(sock, (struct sockaddr *) &ip_addr, sizeof(ip_addr), 0); + if (err < 0) + goto out; + + break; + + default: + goto out; + } + +out: + if ((err < 0) && sock) { + sock_release(sock); + *sockp = NULL; + } + + return err; +} + +int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp) +{ + struct l2tp_tunnel *tunnel = NULL; + int err; + struct socket *sock = NULL; + struct sock *sk = NULL; + struct l2tp_net *pn; + enum l2tp_encap_type encap = L2TP_ENCAPTYPE_UDP; + + /* Get the tunnel socket from the fd, which was opened by + * the userspace L2TP daemon. If not specified, create a + * kernel socket. 
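For unmanaged (static) tunnels, l2tp_tunnel_sock_create() above builds the tunnel socket itself: for UDP encapsulation it creates a datagram socket, binds the local address and connects it to the peer. The user-space analogue below uses ordinary BSD socket calls; the addresses and port are placeholders.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
    struct sockaddr_in local, peer;
    int fd = socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0) {
        perror("socket");
        return 1;
    }

    memset(&local, 0, sizeof(local));
    local.sin_family = AF_INET;
    local.sin_addr.s_addr = htonl(INADDR_ANY);
    local.sin_port = htons(1701);                        /* conventional L2TP port */
    if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0)
        perror("bind");

    memset(&peer, 0, sizeof(peer));
    peer.sin_family = AF_INET;
    peer.sin_port = htons(1701);
    inet_pton(AF_INET, "192.0.2.1", &peer.sin_addr);     /* placeholder peer */
    if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0)
        perror("connect");

    close(fd);
    return 0;
}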
+ */ + if (fd < 0) { + err = l2tp_tunnel_sock_create(tunnel_id, peer_tunnel_id, cfg, &sock); + if (err < 0) + goto err; + } else { + err = -EBADF; + sock = sockfd_lookup(fd, &err); + if (!sock) { + printk(KERN_ERR "tunl %hu: sockfd_lookup(fd=%d) returned %d\n", + tunnel_id, fd, err); + goto err; + } + } + + sk = sock->sk; + + if (cfg != NULL) + encap = cfg->encap; + + /* Quick sanity checks */ + switch (encap) { + case L2TP_ENCAPTYPE_UDP: + err = -EPROTONOSUPPORT; + if (sk->sk_protocol != IPPROTO_UDP) { + printk(KERN_ERR "tunl %hu: fd %d wrong protocol, got %d, expected %d\n", + tunnel_id, fd, sk->sk_protocol, IPPROTO_UDP); + goto err; + } + break; + case L2TP_ENCAPTYPE_IP: + err = -EPROTONOSUPPORT; + if (sk->sk_protocol != IPPROTO_L2TP) { + printk(KERN_ERR "tunl %hu: fd %d wrong protocol, got %d, expected %d\n", + tunnel_id, fd, sk->sk_protocol, IPPROTO_L2TP); + goto err; + } + break; + } + + /* Check if this socket has already been prepped */ + tunnel = (struct l2tp_tunnel *)sk->sk_user_data; + if (tunnel != NULL) { + /* This socket has already been prepped */ + err = -EBUSY; + goto err; + } + + tunnel = kzalloc(sizeof(struct l2tp_tunnel), GFP_KERNEL); + if (tunnel == NULL) { + err = -ENOMEM; + goto err; + } + + tunnel->version = version; + tunnel->tunnel_id = tunnel_id; + tunnel->peer_tunnel_id = peer_tunnel_id; + tunnel->debug = L2TP_DEFAULT_DEBUG_FLAGS; + + tunnel->magic = L2TP_TUNNEL_MAGIC; + sprintf(&tunnel->name[0], "tunl %u", tunnel_id); + rwlock_init(&tunnel->hlist_lock); + + /* The net we belong to */ + tunnel->l2tp_net = net; + pn = l2tp_pernet(net); + + if (cfg != NULL) + tunnel->debug = cfg->debug; + + /* Mark socket as an encapsulation socket. See net/ipv4/udp.c */ + tunnel->encap = encap; + if (encap == L2TP_ENCAPTYPE_UDP) { + /* Mark socket as an encapsulation socket. See net/ipv4/udp.c */ + udp_sk(sk)->encap_type = UDP_ENCAP_L2TPINUDP; + udp_sk(sk)->encap_rcv = l2tp_udp_encap_recv; + } + + sk->sk_user_data = tunnel; + + /* Hook on the tunnel socket destructor so that we can cleanup + * if the tunnel socket goes away. + */ + tunnel->old_sk_destruct = sk->sk_destruct; + sk->sk_destruct = &l2tp_tunnel_destruct; + tunnel->sock = sk; + sk->sk_allocation = GFP_ATOMIC; + + /* Add tunnel to our list */ + INIT_LIST_HEAD(&tunnel->list); + atomic_inc(&l2tp_tunnel_count); + + /* Bump the reference count. The tunnel context is deleted + * only when this drops to zero. Must be done before list insertion + */ + l2tp_tunnel_inc_refcount(tunnel); + spin_lock_bh(&pn->l2tp_tunnel_list_lock); + list_add_rcu(&tunnel->list, &pn->l2tp_tunnel_list); + spin_unlock_bh(&pn->l2tp_tunnel_list_lock); + + err = 0; +err: + if (tunnelp) + *tunnelp = tunnel; + + /* If tunnel's socket was created by the kernel, it doesn't + * have a file. + */ + if (sock && sock->file) + sockfd_put(sock); + + return err; +} +EXPORT_SYMBOL_GPL(l2tp_tunnel_create); + +/* This function is used by the netlink TUNNEL_DELETE command. + */ +int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) +{ + int err = 0; + struct socket *sock = tunnel->sock ? tunnel->sock->sk_socket : NULL; + + /* Force the tunnel socket to close. This will eventually + * cause the tunnel to be deleted via the normal socket close + * mechanisms when userspace closes the tunnel socket. + */ + if (sock != NULL) { + err = inet_shutdown(sock, 2); + + /* If the tunnel's socket was created by the kernel, + * close the socket here since the socket was not + * created by userspace. 
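When userspace hands in the tunnel socket, l2tp_tunnel_create() above verifies that the socket's protocol matches the requested encapsulation: IPPROTO_UDP for L2TP-over-UDP, IPPROTO_L2TP (IP protocol 115) for L2TP-over-IP. A small user-space sketch of that check follows; PROTO_L2TP is defined locally since the value may not appear in older system headers.

#include <stdio.h>
#include <netinet/in.h>

enum encap_type { ENCAP_UDP, ENCAP_IP };

#define PROTO_L2TP 115       /* IPPROTO_L2TP */

static int check_tunnel_socket(enum encap_type encap, int sk_protocol)
{
    int expected = (encap == ENCAP_UDP) ? IPPROTO_UDP : PROTO_L2TP;

    if (sk_protocol != expected) {
        fprintf(stderr, "wrong protocol: got %d, expected %d\n",
                sk_protocol, expected);
        return -1;
    }
    return 0;
}

int main(void)
{
    printf("UDP socket, UDP encap: %d\n",
           check_tunnel_socket(ENCAP_UDP, IPPROTO_UDP));
    printf("UDP socket, IP encap : %d\n",
           check_tunnel_socket(ENCAP_IP, IPPROTO_UDP));
    return 0;
}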
+ */ + if (sock->file == NULL) + err = inet_release(sock); + } + + return err; +} +EXPORT_SYMBOL_GPL(l2tp_tunnel_delete); + +/* Really kill the session. + */ +void l2tp_session_free(struct l2tp_session *session) +{ + struct l2tp_tunnel *tunnel; + + BUG_ON(atomic_read(&session->ref_count) != 0); + + tunnel = session->tunnel; + if (tunnel != NULL) { + BUG_ON(tunnel->magic != L2TP_TUNNEL_MAGIC); + + /* Delete the session from the hash */ + write_lock_bh(&tunnel->hlist_lock); + hlist_del_init(&session->hlist); + write_unlock_bh(&tunnel->hlist_lock); + + /* Unlink from the global hash if not L2TPv2 */ + if (tunnel->version != L2TP_HDR_VER_2) { + struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); + + spin_lock_bh(&pn->l2tp_session_hlist_lock); + hlist_del_init_rcu(&session->global_hlist); + spin_unlock_bh(&pn->l2tp_session_hlist_lock); + synchronize_rcu(); + } + + if (session->session_id != 0) + atomic_dec(&l2tp_session_count); + + sock_put(tunnel->sock); + + /* This will delete the tunnel context if this + * is the last session on the tunnel. + */ + session->tunnel = NULL; + l2tp_tunnel_dec_refcount(tunnel); + } + + kfree(session); + + return; +} +EXPORT_SYMBOL_GPL(l2tp_session_free); + +/* This function is used by the netlink SESSION_DELETE command and by + pseudowire modules. + */ +int l2tp_session_delete(struct l2tp_session *session) +{ + if (session->session_close != NULL) + (*session->session_close)(session); + + l2tp_session_dec_refcount(session); + + return 0; +} +EXPORT_SYMBOL_GPL(l2tp_session_delete); + + +/* We come here whenever a session's send_seq, cookie_len or + * l2specific_len parameters are set. + */ +static void l2tp_session_set_header_len(struct l2tp_session *session, int version) +{ + if (version == L2TP_HDR_VER_2) { + session->hdr_len = 6; + if (session->send_seq) + session->hdr_len += 4; + } else { + session->hdr_len = 4 + session->cookie_len + session->l2specific_len + session->offset; + if (session->tunnel->encap == L2TP_ENCAPTYPE_UDP) + session->hdr_len += 4; + } + +} + +struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg) +{ + struct l2tp_session *session; + + session = kzalloc(sizeof(struct l2tp_session) + priv_size, GFP_KERNEL); + if (session != NULL) { + session->magic = L2TP_SESSION_MAGIC; + session->tunnel = tunnel; + + session->session_id = session_id; + session->peer_session_id = peer_session_id; + session->nr = 1; + + sprintf(&session->name[0], "sess %u/%u", + tunnel->tunnel_id, session->session_id); + + skb_queue_head_init(&session->reorder_q); + + INIT_HLIST_NODE(&session->hlist); + INIT_HLIST_NODE(&session->global_hlist); + + /* Inherit debug options from tunnel */ + session->debug = tunnel->debug; + + if (cfg) { + session->pwtype = cfg->pw_type; + session->debug = cfg->debug; + session->mtu = cfg->mtu; + session->mru = cfg->mru; + session->send_seq = cfg->send_seq; + session->recv_seq = cfg->recv_seq; + session->lns_mode = cfg->lns_mode; + session->reorder_timeout = cfg->reorder_timeout; + session->offset = cfg->offset; + session->l2specific_type = cfg->l2specific_type; + session->l2specific_len = cfg->l2specific_len; + session->cookie_len = cfg->cookie_len; + memcpy(&session->cookie[0], &cfg->cookie[0], cfg->cookie_len); + session->peer_cookie_len = cfg->peer_cookie_len; + memcpy(&session->peer_cookie[0], &cfg->peer_cookie[0], cfg->peer_cookie_len); + } + + if (tunnel->version == L2TP_HDR_VER_2) + session->build_header = l2tp_build_l2tpv2_header; + else + 
session->build_header = l2tp_build_l2tpv3_header; + + l2tp_session_set_header_len(session, tunnel->version); + + /* Bump the reference count. The session context is deleted + * only when this drops to zero. + */ + l2tp_session_inc_refcount(session); + l2tp_tunnel_inc_refcount(tunnel); + + /* Ensure tunnel socket isn't deleted */ + sock_hold(tunnel->sock); + + /* Add session to the tunnel's hash list */ + write_lock_bh(&tunnel->hlist_lock); + hlist_add_head(&session->hlist, + l2tp_session_id_hash(tunnel, session_id)); + write_unlock_bh(&tunnel->hlist_lock); + + /* And to the global session list if L2TPv3 */ + if (tunnel->version != L2TP_HDR_VER_2) { + struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); + + spin_lock_bh(&pn->l2tp_session_hlist_lock); + hlist_add_head_rcu(&session->global_hlist, + l2tp_session_id_hash_2(pn, session_id)); + spin_unlock_bh(&pn->l2tp_session_hlist_lock); + } + + /* Ignore management session in session count value */ + if (session->session_id != 0) + atomic_inc(&l2tp_session_count); + } + + return session; +} +EXPORT_SYMBOL_GPL(l2tp_session_create); + +/***************************************************************************** + * Init and cleanup + *****************************************************************************/ + +static __net_init int l2tp_init_net(struct net *net) +{ + struct l2tp_net *pn = net_generic(net, l2tp_net_id); + int hash; + + INIT_LIST_HEAD(&pn->l2tp_tunnel_list); + spin_lock_init(&pn->l2tp_tunnel_list_lock); + + for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++) + INIT_HLIST_HEAD(&pn->l2tp_session_hlist[hash]); + + spin_lock_init(&pn->l2tp_session_hlist_lock); + + return 0; +} + +static struct pernet_operations l2tp_net_ops = { + .init = l2tp_init_net, + .id = &l2tp_net_id, + .size = sizeof(struct l2tp_net), +}; + +static int __init l2tp_init(void) +{ + int rc = 0; + + rc = register_pernet_device(&l2tp_net_ops); + if (rc) + goto out; + + printk(KERN_INFO "L2TP core driver, %s\n", L2TP_DRV_VERSION); + +out: + return rc; +} + +static void __exit l2tp_exit(void) +{ + unregister_pernet_device(&l2tp_net_ops); +} + +module_init(l2tp_init); +module_exit(l2tp_exit); + +MODULE_AUTHOR("James Chapman "); +MODULE_DESCRIPTION("L2TP core"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(L2TP_DRV_VERSION); + diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h new file mode 100644 index 00000000..a16a48e7 --- /dev/null +++ b/net/l2tp/l2tp_core.h @@ -0,0 +1,271 @@ +/* + * L2TP internal definitions. + * + * Copyright (c) 2008,2009 Katalix Systems Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
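l2tp_session_set_header_len(), called from l2tp_session_create() above, sizes the transmit header: 6 bytes for L2TPv2 plus 4 when sequence numbers are sent; for L2TPv3, the 4-byte session ID plus cookie, L2-specific sublayer and offset, plus 4 bytes of flags when carried over UDP. The same arithmetic is restated in this small user-space sketch with illustrative parameter values.

#include <stdio.h>

static int data_header_len(int version, int send_seq, int cookie_len,
                           int l2specific_len, int offset, int over_udp)
{
    if (version == 2)
        return 6 + (send_seq ? 4 : 0);
    /* L2TPv3: session id + cookie + L2-specific sublayer + offset,
     * plus a 4-byte flags word when the tunnel uses UDP encapsulation. */
    return 4 + cookie_len + l2specific_len + offset + (over_udp ? 4 : 0);
}

int main(void)
{
    printf("v2 with sequence numbers: %d bytes\n",
           data_header_len(2, 1, 0, 0, 0, 1));
    printf("v3/UDP, 4-byte cookie   : %d bytes\n",
           data_header_len(3, 0, 4, 4, 0, 1));
    printf("v3/IP,  8-byte cookie   : %d bytes\n",
           data_header_len(3, 0, 8, 4, 0, 0));
    return 0;
}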
+ */ + +#ifndef _L2TP_CORE_H_ +#define _L2TP_CORE_H_ + +/* Just some random numbers */ +#define L2TP_TUNNEL_MAGIC 0x42114DDA +#define L2TP_SESSION_MAGIC 0x0C04EB7D + +/* Per tunnel, session hash table size */ +#define L2TP_HASH_BITS 4 +#define L2TP_HASH_SIZE (1 << L2TP_HASH_BITS) + +/* System-wide, session hash table size */ +#define L2TP_HASH_BITS_2 8 +#define L2TP_HASH_SIZE_2 (1 << L2TP_HASH_BITS_2) + +/* Debug message categories for the DEBUG socket option */ +enum { + L2TP_MSG_DEBUG = (1 << 0), /* verbose debug (if + * compiled in) */ + L2TP_MSG_CONTROL = (1 << 1), /* userspace - kernel + * interface */ + L2TP_MSG_SEQ = (1 << 2), /* sequence numbers */ + L2TP_MSG_DATA = (1 << 3), /* data packets */ +}; + +struct sk_buff; + +struct l2tp_stats { + u64 tx_packets; + u64 tx_bytes; + u64 tx_errors; + u64 rx_packets; + u64 rx_bytes; + u64 rx_seq_discards; + u64 rx_oos_packets; + u64 rx_errors; + u64 rx_cookie_discards; +}; + +struct l2tp_tunnel; + +/* Describes a session. Contains information to determine incoming + * packets and transmit outgoing ones. + */ +struct l2tp_session_cfg { + enum l2tp_pwtype pw_type; + unsigned data_seq:2; /* data sequencing level + * 0 => none, 1 => IP only, + * 2 => all + */ + unsigned recv_seq:1; /* expect receive packets with + * sequence numbers? */ + unsigned send_seq:1; /* send packets with sequence + * numbers? */ + unsigned lns_mode:1; /* behave as LNS? LAC enables + * sequence numbers under + * control of LNS. */ + int debug; /* bitmask of debug message + * categories */ + u16 vlan_id; /* VLAN pseudowire only */ + u16 offset; /* offset to payload */ + u16 l2specific_len; /* Layer 2 specific length */ + u16 l2specific_type; /* Layer 2 specific type */ + u8 cookie[8]; /* optional cookie */ + int cookie_len; /* 0, 4 or 8 bytes */ + u8 peer_cookie[8]; /* peer's cookie */ + int peer_cookie_len; /* 0, 4 or 8 bytes */ + int reorder_timeout; /* configured reorder timeout + * (in jiffies) */ + int mtu; + int mru; + char *ifname; +}; + +struct l2tp_session { + int magic; /* should be + * L2TP_SESSION_MAGIC */ + + struct l2tp_tunnel *tunnel; /* back pointer to tunnel + * context */ + u32 session_id; + u32 peer_session_id; + u8 cookie[8]; + int cookie_len; + u8 peer_cookie[8]; + int peer_cookie_len; + u16 offset; /* offset from end of L2TP header + to beginning of data */ + u16 l2specific_len; + u16 l2specific_type; + u16 hdr_len; + u32 nr; /* session NR state (receive) */ + u32 ns; /* session NR state (send) */ + struct sk_buff_head reorder_q; /* receive reorder queue */ + struct hlist_node hlist; /* Hash list node */ + atomic_t ref_count; + + char name[32]; /* for logging */ + char ifname[IFNAMSIZ]; + unsigned data_seq:2; /* data sequencing level + * 0 => none, 1 => IP only, + * 2 => all + */ + unsigned recv_seq:1; /* expect receive packets with + * sequence numbers? */ + unsigned send_seq:1; /* send packets with sequence + * numbers? */ + unsigned lns_mode:1; /* behave as LNS? LAC enables + * sequence numbers under + * control of LNS. 
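The L2TP_MSG_* values above are independent bits, and the PRINTK() macro in l2tp_core.c emits a message only when its category bit is set in the tunnel or session debug mask. The user-space sketch below mirrors that filtering; the macro and values are local stand-ins, and the ## variadic pasting is the same GNU extension the kernel macro relies on.

#include <stdio.h>

#define MSG_CONTROL (1 << 1)
#define MSG_SEQ     (1 << 2)
#define MSG_DATA    (1 << 3)

/* Emit the message only if its category bit is enabled in the mask. */
#define DBG(mask, type, fmt, ...)                 \
    do {                                          \
        if ((mask) & (type))                      \
            printf("L2TP: " fmt, ##__VA_ARGS__);  \
    } while (0)

int main(void)
{
    int debug = MSG_CONTROL | MSG_SEQ;               /* enabled categories */

    DBG(debug, MSG_SEQ, "updated ns to %u\n", 5u);   /* printed  */
    DBG(debug, MSG_DATA, "xmit %d bytes\n", 64);     /* filtered */
    return 0;
}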
*/ + int debug; /* bitmask of debug message + * categories */ + int reorder_timeout; /* configured reorder timeout + * (in jiffies) */ + int mtu; + int mru; + enum l2tp_pwtype pwtype; + struct l2tp_stats stats; + struct hlist_node global_hlist; /* Global hash list node */ + + int (*build_header)(struct l2tp_session *session, void *buf); + void (*recv_skb)(struct l2tp_session *session, struct sk_buff *skb, int data_len); + void (*session_close)(struct l2tp_session *session); + void (*ref)(struct l2tp_session *session); + void (*deref)(struct l2tp_session *session); +#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE) + void (*show)(struct seq_file *m, void *priv); +#endif + uint8_t priv[0]; /* private data */ +}; + +/* Describes the tunnel. It contains info to track all the associated + * sessions so incoming packets can be sorted out + */ +struct l2tp_tunnel_cfg { + int debug; /* bitmask of debug message + * categories */ + enum l2tp_encap_type encap; + + /* Used only for kernel-created sockets */ + struct in_addr local_ip; + struct in_addr peer_ip; + u16 local_udp_port; + u16 peer_udp_port; + unsigned int use_udp_checksums:1; +}; + +struct l2tp_tunnel { + int magic; /* Should be L2TP_TUNNEL_MAGIC */ + rwlock_t hlist_lock; /* protect session_hlist */ + struct hlist_head session_hlist[L2TP_HASH_SIZE]; + /* hashed list of sessions, + * hashed by id */ + u32 tunnel_id; + u32 peer_tunnel_id; + int version; /* 2=>L2TPv2, 3=>L2TPv3 */ + + char name[20]; /* for logging */ + int debug; /* bitmask of debug message + * categories */ + enum l2tp_encap_type encap; + struct l2tp_stats stats; + + struct list_head list; /* Keep a list of all tunnels */ + struct net *l2tp_net; /* the net we belong to */ + + atomic_t ref_count; +#ifdef CONFIG_DEBUG_FS + void (*show)(struct seq_file *m, void *arg); +#endif + int (*recv_payload_hook)(struct sk_buff *skb); + void (*old_sk_destruct)(struct sock *); + struct sock *sock; /* Parent socket */ + int fd; + + uint8_t priv[0]; /* private data */ +}; + +struct l2tp_nl_cmd_ops { + int (*session_create)(struct net *net, u32 tunnel_id, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg); + int (*session_delete)(struct l2tp_session *session); +}; + +static inline void *l2tp_tunnel_priv(struct l2tp_tunnel *tunnel) +{ + return &tunnel->priv[0]; +} + +static inline void *l2tp_session_priv(struct l2tp_session *session) +{ + return &session->priv[0]; +} + +static inline struct l2tp_tunnel *l2tp_sock_to_tunnel(struct sock *sk) +{ + struct l2tp_tunnel *tunnel; + + if (sk == NULL) + return NULL; + + sock_hold(sk); + tunnel = (struct l2tp_tunnel *)(sk->sk_user_data); + if (tunnel == NULL) { + sock_put(sk); + goto out; + } + + BUG_ON(tunnel->magic != L2TP_TUNNEL_MAGIC); + +out: + return tunnel; +} + +extern struct l2tp_session *l2tp_session_find(struct net *net, struct l2tp_tunnel *tunnel, u32 session_id); +extern struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth); +extern struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname); +extern struct l2tp_tunnel *l2tp_tunnel_find(struct net *net, u32 tunnel_id); +extern struct l2tp_tunnel *l2tp_tunnel_find_nth(struct net *net, int nth); + +extern int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp); +extern int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); +extern struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 
session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg); +extern int l2tp_session_delete(struct l2tp_session *session); +extern void l2tp_session_free(struct l2tp_session *session); +extern void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, int length, int (*payload_hook)(struct sk_buff *skb)); +extern int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb); + +extern int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len); + +extern int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops); +extern void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type); + +/* Session reference counts. Incremented when code obtains a reference + * to a session. + */ +static inline void l2tp_session_inc_refcount_1(struct l2tp_session *session) +{ + atomic_inc(&session->ref_count); +} + +static inline void l2tp_session_dec_refcount_1(struct l2tp_session *session) +{ + if (atomic_dec_and_test(&session->ref_count)) + l2tp_session_free(session); +} + +#ifdef L2TP_REFCNT_DEBUG +#define l2tp_session_inc_refcount(_s) do { \ + printk(KERN_DEBUG "l2tp_session_inc_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_s)->name, atomic_read(&_s->ref_count)); \ + l2tp_session_inc_refcount_1(_s); \ + } while (0) +#define l2tp_session_dec_refcount(_s) do { \ + printk(KERN_DEBUG "l2tp_session_dec_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_s)->name, atomic_read(&_s->ref_count)); \ + l2tp_session_dec_refcount_1(_s); \ + } while (0) +#else +#define l2tp_session_inc_refcount(s) l2tp_session_inc_refcount_1(s) +#define l2tp_session_dec_refcount(s) l2tp_session_dec_refcount_1(s) +#endif + +#endif /* _L2TP_CORE_H_ */ diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c new file mode 100644 index 00000000..76130134 --- /dev/null +++ b/net/l2tp/l2tp_debugfs.c @@ -0,0 +1,341 @@ +/* + * L2TP subsystem debugfs + * + * Copyright (c) 2010 Katalix Systems Ltd + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "l2tp_core.h" + +static struct dentry *rootdir; +static struct dentry *tunnels; + +struct l2tp_dfs_seq_data { + struct net *net; + int tunnel_idx; /* current tunnel */ + int session_idx; /* index of session within current tunnel */ + struct l2tp_tunnel *tunnel; + struct l2tp_session *session; /* NULL means get next tunnel */ +}; + +static void l2tp_dfs_next_tunnel(struct l2tp_dfs_seq_data *pd) +{ + pd->tunnel = l2tp_tunnel_find_nth(pd->net, pd->tunnel_idx); + pd->tunnel_idx++; +} + +static void l2tp_dfs_next_session(struct l2tp_dfs_seq_data *pd) +{ + pd->session = l2tp_session_find_nth(pd->tunnel, pd->session_idx); + pd->session_idx++; + + if (pd->session == NULL) { + pd->session_idx = 0; + l2tp_dfs_next_tunnel(pd); + } + +} + +static void *l2tp_dfs_seq_start(struct seq_file *m, loff_t *offs) +{ + struct l2tp_dfs_seq_data *pd = SEQ_START_TOKEN; + loff_t pos = *offs; + + if (!pos) + goto out; + + BUG_ON(m->private == NULL); + pd = m->private; + + if (pd->tunnel == NULL) + l2tp_dfs_next_tunnel(pd); + else + l2tp_dfs_next_session(pd); + + /* NULL tunnel and session indicates end of list */ + if ((pd->tunnel == NULL) && (pd->session == NULL)) + pd = NULL; + +out: + return pd; +} + + +static void *l2tp_dfs_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return NULL; +} + +static void l2tp_dfs_seq_stop(struct seq_file *p, void *v) +{ + /* nothing to do */ +} + +static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v) +{ + struct l2tp_tunnel *tunnel = v; + int session_count = 0; + int hash; + struct hlist_node *walk; + struct hlist_node *tmp; + + read_lock_bh(&tunnel->hlist_lock); + for (hash = 0; hash < L2TP_HASH_SIZE; hash++) { + hlist_for_each_safe(walk, tmp, &tunnel->session_hlist[hash]) { + struct l2tp_session *session; + + session = hlist_entry(walk, struct l2tp_session, hlist); + if (session->session_id == 0) + continue; + + session_count++; + } + } + read_unlock_bh(&tunnel->hlist_lock); + + seq_printf(m, "\nTUNNEL %u peer %u", tunnel->tunnel_id, tunnel->peer_tunnel_id); + if (tunnel->sock) { + struct inet_sock *inet = inet_sk(tunnel->sock); + seq_printf(m, " from %pI4 to %pI4\n", + &inet->inet_saddr, &inet->inet_daddr); + if (tunnel->encap == L2TP_ENCAPTYPE_UDP) + seq_printf(m, " source port %hu, dest port %hu\n", + ntohs(inet->inet_sport), ntohs(inet->inet_dport)); + } + seq_printf(m, " L2TPv%d, %s\n", tunnel->version, + tunnel->encap == L2TP_ENCAPTYPE_UDP ? "UDP" : + tunnel->encap == L2TP_ENCAPTYPE_IP ? "IP" : + ""); + seq_printf(m, " %d sessions, refcnt %d/%d\n", session_count, + tunnel->sock ? atomic_read(&tunnel->sock->sk_refcnt) : 0, + atomic_read(&tunnel->ref_count)); + + seq_printf(m, " %08x rx %llu/%llu/%llu rx %llu/%llu/%llu\n", + tunnel->debug, + (unsigned long long)tunnel->stats.tx_packets, + (unsigned long long)tunnel->stats.tx_bytes, + (unsigned long long)tunnel->stats.tx_errors, + (unsigned long long)tunnel->stats.rx_packets, + (unsigned long long)tunnel->stats.rx_bytes, + (unsigned long long)tunnel->stats.rx_errors); + + if (tunnel->show != NULL) + tunnel->show(m, tunnel); +} + +static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) +{ + struct l2tp_session *session = v; + + seq_printf(m, " SESSION %u, peer %u, %s\n", session->session_id, + session->peer_session_id, + session->pwtype == L2TP_PWTYPE_ETH ? 
"ETH" : + session->pwtype == L2TP_PWTYPE_PPP ? "PPP" : + ""); + if (session->send_seq || session->recv_seq) + seq_printf(m, " nr %hu, ns %hu\n", session->nr, session->ns); + seq_printf(m, " refcnt %d\n", atomic_read(&session->ref_count)); + seq_printf(m, " config %d/%d/%c/%c/%s/%s %08x %u\n", + session->mtu, session->mru, + session->recv_seq ? 'R' : '-', + session->send_seq ? 'S' : '-', + session->data_seq == 1 ? "IPSEQ" : + session->data_seq == 2 ? "DATASEQ" : "-", + session->lns_mode ? "LNS" : "LAC", + session->debug, + jiffies_to_msecs(session->reorder_timeout)); + seq_printf(m, " offset %hu l2specific %hu/%hu\n", + session->offset, session->l2specific_type, session->l2specific_len); + if (session->cookie_len) { + seq_printf(m, " cookie %02x%02x%02x%02x", + session->cookie[0], session->cookie[1], + session->cookie[2], session->cookie[3]); + if (session->cookie_len == 8) + seq_printf(m, "%02x%02x%02x%02x", + session->cookie[4], session->cookie[5], + session->cookie[6], session->cookie[7]); + seq_printf(m, "\n"); + } + if (session->peer_cookie_len) { + seq_printf(m, " peer cookie %02x%02x%02x%02x", + session->peer_cookie[0], session->peer_cookie[1], + session->peer_cookie[2], session->peer_cookie[3]); + if (session->peer_cookie_len == 8) + seq_printf(m, "%02x%02x%02x%02x", + session->peer_cookie[4], session->peer_cookie[5], + session->peer_cookie[6], session->peer_cookie[7]); + seq_printf(m, "\n"); + } + + seq_printf(m, " %hu/%hu tx %llu/%llu/%llu rx %llu/%llu/%llu\n", + session->nr, session->ns, + (unsigned long long)session->stats.tx_packets, + (unsigned long long)session->stats.tx_bytes, + (unsigned long long)session->stats.tx_errors, + (unsigned long long)session->stats.rx_packets, + (unsigned long long)session->stats.rx_bytes, + (unsigned long long)session->stats.rx_errors); + + if (session->show != NULL) + session->show(m, session); +} + +static int l2tp_dfs_seq_show(struct seq_file *m, void *v) +{ + struct l2tp_dfs_seq_data *pd = v; + + /* display header on line 1 */ + if (v == SEQ_START_TOKEN) { + seq_puts(m, "TUNNEL ID, peer ID from IP to IP\n"); + seq_puts(m, " L2TPv2/L2TPv3, UDP/IP\n"); + seq_puts(m, " sessions session-count, refcnt refcnt/sk->refcnt\n"); + seq_puts(m, " debug tx-pkts/bytes/errs rx-pkts/bytes/errs\n"); + seq_puts(m, " SESSION ID, peer ID, PWTYPE\n"); + seq_puts(m, " refcnt cnt\n"); + seq_puts(m, " offset OFFSET l2specific TYPE/LEN\n"); + seq_puts(m, " [ cookie ]\n"); + seq_puts(m, " [ peer cookie ]\n"); + seq_puts(m, " config mtu/mru/rcvseq/sendseq/dataseq/lns debug reorderto\n"); + seq_puts(m, " nr/ns tx-pkts/bytes/errs rx-pkts/bytes/errs\n"); + goto out; + } + + /* Show the tunnel or session context */ + if (pd->session == NULL) + l2tp_dfs_seq_tunnel_show(m, pd->tunnel); + else + l2tp_dfs_seq_session_show(m, pd->session); + +out: + return 0; +} + +static const struct seq_operations l2tp_dfs_seq_ops = { + .start = l2tp_dfs_seq_start, + .next = l2tp_dfs_seq_next, + .stop = l2tp_dfs_seq_stop, + .show = l2tp_dfs_seq_show, +}; + +static int l2tp_dfs_seq_open(struct inode *inode, struct file *file) +{ + struct l2tp_dfs_seq_data *pd; + struct seq_file *seq; + int rc = -ENOMEM; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (pd == NULL) + goto out; + + /* Derive the network namespace from the pid opening the + * file. 
+ */ + pd->net = get_net_ns_by_pid(current->pid); + if (IS_ERR(pd->net)) { + rc = PTR_ERR(pd->net); + goto err_free_pd; + } + + rc = seq_open(file, &l2tp_dfs_seq_ops); + if (rc) + goto err_free_net; + + seq = file->private_data; + seq->private = pd; + +out: + return rc; + +err_free_net: + put_net(pd->net); +err_free_pd: + kfree(pd); + goto out; +} + +static int l2tp_dfs_seq_release(struct inode *inode, struct file *file) +{ + struct l2tp_dfs_seq_data *pd; + struct seq_file *seq; + + seq = file->private_data; + pd = seq->private; + if (pd->net) + put_net(pd->net); + kfree(pd); + seq_release(inode, file); + + return 0; +} + +static const struct file_operations l2tp_dfs_fops = { + .owner = THIS_MODULE, + .open = l2tp_dfs_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = l2tp_dfs_seq_release, +}; + +static int __init l2tp_debugfs_init(void) +{ + int rc = 0; + + rootdir = debugfs_create_dir("l2tp", NULL); + if (IS_ERR(rootdir)) { + rc = PTR_ERR(rootdir); + rootdir = NULL; + goto out; + } + + tunnels = debugfs_create_file("tunnels", 0600, rootdir, NULL, &l2tp_dfs_fops); + if (tunnels == NULL) + rc = -EIO; + + printk(KERN_INFO "L2TP debugfs support\n"); + +out: + if (rc) + printk(KERN_WARNING "l2tp debugfs: unable to init\n"); + + return rc; +} + +static void __exit l2tp_debugfs_exit(void) +{ + debugfs_remove(tunnels); + debugfs_remove(rootdir); +} + +module_init(l2tp_debugfs_init); +module_exit(l2tp_debugfs_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Chapman "); +MODULE_DESCRIPTION("L2TP debugfs driver"); +MODULE_VERSION("1.0"); diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c new file mode 100644 index 00000000..3c55f633 --- /dev/null +++ b/net/l2tp/l2tp_eth.c @@ -0,0 +1,335 @@ +/* + * L2TPv3 ethernet pseudowire driver + * + * Copyright (c) 2008,2009,2010 Katalix Systems Ltd + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "l2tp_core.h" + +/* Default device name. 
May be overridden by name specified by user */ +#define L2TP_ETH_DEV_NAME "l2tpeth%d" + +/* via netdev_priv() */ +struct l2tp_eth { + struct net_device *dev; + struct sock *tunnel_sock; + struct l2tp_session *session; + struct list_head list; +}; + +/* via l2tp_session_priv() */ +struct l2tp_eth_sess { + struct net_device *dev; +}; + +/* per-net private data for this module */ +static unsigned int l2tp_eth_net_id; +struct l2tp_eth_net { + struct list_head l2tp_eth_dev_list; + spinlock_t l2tp_eth_lock; +}; + +static inline struct l2tp_eth_net *l2tp_eth_pernet(struct net *net) +{ + return net_generic(net, l2tp_eth_net_id); +} + +static int l2tp_eth_dev_init(struct net_device *dev) +{ + struct l2tp_eth *priv = netdev_priv(dev); + + priv->dev = dev; + random_ether_addr(dev->dev_addr); + memset(&dev->broadcast[0], 0xff, 6); + + return 0; +} + +static void l2tp_eth_dev_uninit(struct net_device *dev) +{ + struct l2tp_eth *priv = netdev_priv(dev); + struct l2tp_eth_net *pn = l2tp_eth_pernet(dev_net(dev)); + + spin_lock(&pn->l2tp_eth_lock); + list_del_init(&priv->list); + spin_unlock(&pn->l2tp_eth_lock); + dev_put(dev); +} + +static int l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct l2tp_eth *priv = netdev_priv(dev); + struct l2tp_session *session = priv->session; + + l2tp_xmit_skb(session, skb, session->hdr_len); + + dev->stats.tx_bytes += skb->len; + dev->stats.tx_packets++; + + return 0; +} + +static struct net_device_ops l2tp_eth_netdev_ops = { + .ndo_init = l2tp_eth_dev_init, + .ndo_uninit = l2tp_eth_dev_uninit, + .ndo_start_xmit = l2tp_eth_dev_xmit, +}; + +static void l2tp_eth_dev_setup(struct net_device *dev) +{ + ether_setup(dev); + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->netdev_ops = &l2tp_eth_netdev_ops; + dev->destructor = free_netdev; +} + +static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len) +{ + struct l2tp_eth_sess *spriv = l2tp_session_priv(session); + struct net_device *dev = spriv->dev; + + if (session->debug & L2TP_MSG_DATA) { + unsigned int length; + int offset; + u8 *ptr = skb->data; + + length = min(32u, skb->len); + if (!pskb_may_pull(skb, length)) + goto error; + + printk(KERN_DEBUG "%s: eth recv: ", session->name); + + offset = 0; + do { + printk(" %02X", ptr[offset]); + } while (++offset < length); + + printk("\n"); + } + + if (!pskb_may_pull(skb, sizeof(ETH_HLEN))) + goto error; + + secpath_reset(skb); + + /* checksums verified by L2TP */ + skb->ip_summed = CHECKSUM_NONE; + + skb_dst_drop(skb); + nf_reset(skb); + + if (dev_forward_skb(dev, skb) == NET_RX_SUCCESS) { + dev->stats.rx_packets++; + dev->stats.rx_bytes += data_len; + } else + dev->stats.rx_errors++; + + return; + +error: + dev->stats.rx_errors++; + kfree_skb(skb); +} + +static void l2tp_eth_delete(struct l2tp_session *session) +{ + struct l2tp_eth_sess *spriv; + struct net_device *dev; + + if (session) { + spriv = l2tp_session_priv(session); + dev = spriv->dev; + if (dev) { + unregister_netdev(dev); + spriv->dev = NULL; + module_put(THIS_MODULE); + } + } +} + +#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE) +static void l2tp_eth_show(struct seq_file *m, void *arg) +{ + struct l2tp_session *session = arg; + struct l2tp_eth_sess *spriv = l2tp_session_priv(session); + struct net_device *dev = spriv->dev; + + seq_printf(m, " interface %s\n", dev->name); +} +#endif + +static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg) +{ + struct net_device 
*dev; + char name[IFNAMSIZ]; + struct l2tp_tunnel *tunnel; + struct l2tp_session *session; + struct l2tp_eth *priv; + struct l2tp_eth_sess *spriv; + int rc; + struct l2tp_eth_net *pn; + + tunnel = l2tp_tunnel_find(net, tunnel_id); + if (!tunnel) { + rc = -ENODEV; + goto out; + } + + session = l2tp_session_find(net, tunnel, session_id); + if (session) { + rc = -EEXIST; + goto out; + } + + if (cfg->ifname) { + dev = dev_get_by_name(net, cfg->ifname); + if (dev) { + dev_put(dev); + rc = -EEXIST; + goto out; + } + strlcpy(name, cfg->ifname, IFNAMSIZ); + } else + strcpy(name, L2TP_ETH_DEV_NAME); + + session = l2tp_session_create(sizeof(*spriv), tunnel, session_id, + peer_session_id, cfg); + if (!session) { + rc = -ENOMEM; + goto out; + } + + dev = alloc_netdev(sizeof(*priv), name, l2tp_eth_dev_setup); + if (!dev) { + rc = -ENOMEM; + goto out_del_session; + } + + dev_net_set(dev, net); + if (session->mtu == 0) + session->mtu = dev->mtu - session->hdr_len; + dev->mtu = session->mtu; + dev->needed_headroom += session->hdr_len; + + priv = netdev_priv(dev); + priv->dev = dev; + priv->session = session; + INIT_LIST_HEAD(&priv->list); + + priv->tunnel_sock = tunnel->sock; + session->recv_skb = l2tp_eth_dev_recv; + session->session_close = l2tp_eth_delete; +#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE) + session->show = l2tp_eth_show; +#endif + + spriv = l2tp_session_priv(session); + spriv->dev = dev; + + rc = register_netdev(dev); + if (rc < 0) + goto out_del_dev; + + __module_get(THIS_MODULE); + /* Must be done after register_netdev() */ + strlcpy(session->ifname, dev->name, IFNAMSIZ); + + dev_hold(dev); + pn = l2tp_eth_pernet(dev_net(dev)); + spin_lock(&pn->l2tp_eth_lock); + list_add(&priv->list, &pn->l2tp_eth_dev_list); + spin_unlock(&pn->l2tp_eth_lock); + + return 0; + +out_del_dev: + free_netdev(dev); +out_del_session: + l2tp_session_delete(session); +out: + return rc; +} + +static __net_init int l2tp_eth_init_net(struct net *net) +{ + struct l2tp_eth_net *pn = net_generic(net, l2tp_eth_net_id); + + INIT_LIST_HEAD(&pn->l2tp_eth_dev_list); + spin_lock_init(&pn->l2tp_eth_lock); + + return 0; +} + +static struct pernet_operations l2tp_eth_net_ops = { + .init = l2tp_eth_init_net, + .id = &l2tp_eth_net_id, + .size = sizeof(struct l2tp_eth_net), +}; + + +static const struct l2tp_nl_cmd_ops l2tp_eth_nl_cmd_ops = { + .session_create = l2tp_eth_create, + .session_delete = l2tp_session_delete, +}; + + +static int __init l2tp_eth_init(void) +{ + int err = 0; + + err = l2tp_nl_register_ops(L2TP_PWTYPE_ETH, &l2tp_eth_nl_cmd_ops); + if (err) + goto out; + + err = register_pernet_device(&l2tp_eth_net_ops); + if (err) + goto out_unreg; + + printk(KERN_INFO "L2TP ethernet pseudowire support (L2TPv3)\n"); + + return 0; + +out_unreg: + l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH); +out: + return err; +} + +static void __exit l2tp_eth_exit(void) +{ + unregister_pernet_device(&l2tp_eth_net_ops); + l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH); +} + +module_init(l2tp_eth_init); +module_exit(l2tp_eth_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Chapman "); +MODULE_DESCRIPTION("L2TP ethernet pseudowire driver"); +MODULE_VERSION("1.0"); diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c new file mode 100644 index 00000000..78bc442b --- /dev/null +++ b/net/l2tp/l2tp_ip.c @@ -0,0 +1,707 @@ +/* + * L2TPv3 IP encapsulation support + * + * Copyright (c) 2008,2009,2010 Katalix Systems Ltd + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU 
General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "l2tp_core.h" + +struct l2tp_ip_sock { + /* inet_sock has to be the first member of l2tp_ip_sock */ + struct inet_sock inet; + + __u32 conn_id; + __u32 peer_conn_id; + + __u64 tx_packets; + __u64 tx_bytes; + __u64 tx_errors; + __u64 rx_packets; + __u64 rx_bytes; + __u64 rx_errors; +}; + +static DEFINE_RWLOCK(l2tp_ip_lock); +static struct hlist_head l2tp_ip_table; +static struct hlist_head l2tp_ip_bind_table; + +static inline struct l2tp_ip_sock *l2tp_ip_sk(const struct sock *sk) +{ + return (struct l2tp_ip_sock *)sk; +} + +static struct sock *__l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif, u32 tunnel_id) +{ + struct hlist_node *node; + struct sock *sk; + + sk_for_each_bound(sk, node, &l2tp_ip_bind_table) { + struct inet_sock *inet = inet_sk(sk); + struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk); + + if (l2tp == NULL) + continue; + + if ((l2tp->conn_id == tunnel_id) && + net_eq(sock_net(sk), net) && + !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && + !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + goto found; + } + + sk = NULL; +found: + return sk; +} + +static inline struct sock *l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif, u32 tunnel_id) +{ + struct sock *sk = __l2tp_ip_bind_lookup(net, laddr, dif, tunnel_id); + if (sk) + sock_hold(sk); + + return sk; +} + +/* When processing receive frames, there are two cases to + * consider. Data frames consist of a non-zero session-id and an + * optional cookie. Control frames consist of a regular L2TP header + * preceded by 32-bits of zeros. + * + * L2TPv3 Session Header Over IP + * + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Session ID | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Cookie (optional, maximum 64 bits)... + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * L2TPv3 Control Message Header Over IP + * + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | (32 bits of zeros) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |T|L|x|x|S|x|x|x|x|x|x|x| Ver | Length | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Control Connection ID | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Ns | Nr | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * All control frames are passed to userspace. + */ +static int l2tp_ip_recv(struct sk_buff *skb) +{ + struct sock *sk; + u32 session_id; + u32 tunnel_id; + unsigned char *ptr, *optr; + struct l2tp_session *session; + struct l2tp_tunnel *tunnel = NULL; + int length; + int offset; + + /* Point to L2TP header */ + optr = ptr = skb->data; + + if (!pskb_may_pull(skb, 4)) + goto discard; + + session_id = ntohl(*((__be32 *) ptr)); + ptr += 4; + + /* RFC3931: L2TP/IP packets have the first 4 bytes containing + * the session_id. 
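
The two header layouts above reduce to a single check on receive: the first 32-bit word after the IP header is either a non-zero session ID (data frame) or zero followed by an ordinary L2TP control header (control frame, passed up to userspace). A self-contained sketch of that classification, assuming a buffer that starts at the L2TP header:

	#include <stdint.h>
	#include <stddef.h>
	#include <string.h>
	#include <arpa/inet.h>

	/* Sketch only: returns the session ID for a data packet, or 0 when the
	 * packet is an L2TPv3-over-IP control frame (or too short to tell).
	 */
	static uint32_t l2tp_ip_classify(const uint8_t *buf, size_t len)
	{
		uint32_t word;

		if (len < 4)
			return 0;

		memcpy(&word, buf, 4);		/* may be unaligned, so no cast */
		return ntohl(word);		/* 0 => control, non-zero => data */
	}
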
If it is 0, the packet is a L2TP control + * frame and the session_id value can be discarded. + */ + if (session_id == 0) { + __skb_pull(skb, 4); + goto pass_up; + } + + /* Ok, this is a data packet. Lookup the session. */ + session = l2tp_session_find(&init_net, NULL, session_id); + if (session == NULL) + goto discard; + + tunnel = session->tunnel; + if (tunnel == NULL) + goto discard; + + /* Trace packet contents, if enabled */ + if (tunnel->debug & L2TP_MSG_DATA) { + length = min(32u, skb->len); + if (!pskb_may_pull(skb, length)) + goto discard; + + printk(KERN_DEBUG "%s: ip recv: ", tunnel->name); + + offset = 0; + do { + printk(" %02X", ptr[offset]); + } while (++offset < length); + + printk("\n"); + } + + l2tp_recv_common(session, skb, ptr, optr, 0, skb->len, tunnel->recv_payload_hook); + + return 0; + +pass_up: + /* Get the tunnel_id from the L2TP header */ + if (!pskb_may_pull(skb, 12)) + goto discard; + + if ((skb->data[0] & 0xc0) != 0xc0) + goto discard; + + tunnel_id = ntohl(*(__be32 *) &skb->data[4]); + tunnel = l2tp_tunnel_find(&init_net, tunnel_id); + if (tunnel != NULL) + sk = tunnel->sock; + else { + struct iphdr *iph = (struct iphdr *) skb_network_header(skb); + + read_lock_bh(&l2tp_ip_lock); + sk = __l2tp_ip_bind_lookup(&init_net, iph->daddr, 0, tunnel_id); + read_unlock_bh(&l2tp_ip_lock); + } + + if (sk == NULL) + goto discard; + + sock_hold(sk); + + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) + goto discard_put; + + nf_reset(skb); + + return sk_receive_skb(sk, skb, 1); + +discard_put: + sock_put(sk); + +discard: + kfree_skb(skb); + return 0; +} + +static int l2tp_ip_open(struct sock *sk) +{ + /* Prevent autobind. We don't have ports. */ + inet_sk(sk)->inet_num = IPPROTO_L2TP; + + write_lock_bh(&l2tp_ip_lock); + sk_add_node(sk, &l2tp_ip_table); + write_unlock_bh(&l2tp_ip_lock); + + return 0; +} + +static void l2tp_ip_close(struct sock *sk, long timeout) +{ + write_lock_bh(&l2tp_ip_lock); + hlist_del_init(&sk->sk_bind_node); + hlist_del_init(&sk->sk_node); + write_unlock_bh(&l2tp_ip_lock); + sk_common_release(sk); +} + +static void l2tp_ip_destroy_sock(struct sock *sk) +{ + struct sk_buff *skb; + + while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) + kfree_skb(skb); + + sk_refcnt_debug_dec(sk); +} + +static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct inet_sock *inet = inet_sk(sk); + struct sockaddr_l2tpip *addr = (struct sockaddr_l2tpip *) uaddr; + int ret; + int chk_addr_ret; + + if (!sock_flag(sk, SOCK_ZAPPED)) + return -EINVAL; + if (addr_len < sizeof(struct sockaddr_l2tpip)) + return -EINVAL; + if (addr->l2tp_family != AF_INET) + return -EINVAL; + + ret = -EADDRINUSE; + read_lock_bh(&l2tp_ip_lock); + if (__l2tp_ip_bind_lookup(&init_net, addr->l2tp_addr.s_addr, sk->sk_bound_dev_if, addr->l2tp_conn_id)) + goto out_in_use; + + read_unlock_bh(&l2tp_ip_lock); + + lock_sock(sk); + if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_l2tpip)) + goto out; + + chk_addr_ret = inet_addr_type(&init_net, addr->l2tp_addr.s_addr); + ret = -EADDRNOTAVAIL; + if (addr->l2tp_addr.s_addr && chk_addr_ret != RTN_LOCAL && + chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) + goto out; + + inet->inet_rcv_saddr = inet->inet_saddr = addr->l2tp_addr.s_addr; + if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) + inet->inet_saddr = 0; /* Use device */ + sk_dst_reset(sk); + + l2tp_ip_sk(sk)->conn_id = addr->l2tp_conn_id; + + write_lock_bh(&l2tp_ip_lock); + sk_add_bind_node(sk, 
&l2tp_ip_bind_table); + sk_del_node_init(sk); + write_unlock_bh(&l2tp_ip_lock); + ret = 0; + sock_reset_flag(sk, SOCK_ZAPPED); + +out: + release_sock(sk); + + return ret; + +out_in_use: + read_unlock_bh(&l2tp_ip_lock); + + return ret; +} + +static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *) uaddr; + struct inet_sock *inet = inet_sk(sk); + struct flowi4 *fl4; + struct rtable *rt; + __be32 saddr; + int oif, rc; + + if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */ + return -EINVAL; + + if (addr_len < sizeof(*lsa)) + return -EINVAL; + + if (lsa->l2tp_family != AF_INET) + return -EAFNOSUPPORT; + + lock_sock(sk); + + sk_dst_reset(sk); + + oif = sk->sk_bound_dev_if; + saddr = inet->inet_saddr; + + rc = -EINVAL; + if (ipv4_is_multicast(lsa->l2tp_addr.s_addr)) + goto out; + + fl4 = &inet->cork.fl.u.ip4; + rt = ip_route_connect(fl4, lsa->l2tp_addr.s_addr, saddr, + RT_CONN_FLAGS(sk), oif, + IPPROTO_L2TP, + 0, 0, sk, true); + if (IS_ERR(rt)) { + rc = PTR_ERR(rt); + if (rc == -ENETUNREACH) + IP_INC_STATS_BH(&init_net, IPSTATS_MIB_OUTNOROUTES); + goto out; + } + + rc = -ENETUNREACH; + if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { + ip_rt_put(rt); + goto out; + } + + l2tp_ip_sk(sk)->peer_conn_id = lsa->l2tp_conn_id; + + if (!inet->inet_saddr) + inet->inet_saddr = fl4->saddr; + if (!inet->inet_rcv_saddr) + inet->inet_rcv_saddr = fl4->saddr; + inet->inet_daddr = fl4->daddr; + sk->sk_state = TCP_ESTABLISHED; + inet->inet_id = jiffies; + + sk_dst_set(sk, &rt->dst); + + write_lock_bh(&l2tp_ip_lock); + hlist_del_init(&sk->sk_bind_node); + sk_add_bind_node(sk, &l2tp_ip_bind_table); + write_unlock_bh(&l2tp_ip_lock); + + rc = 0; +out: + release_sock(sk); + return rc; +} + +static int l2tp_ip_disconnect(struct sock *sk, int flags) +{ + if (sock_flag(sk, SOCK_ZAPPED)) + return 0; + + return udp_disconnect(sk, flags); +} + +static int l2tp_ip_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sock *sk = sock->sk; + struct inet_sock *inet = inet_sk(sk); + struct l2tp_ip_sock *lsk = l2tp_ip_sk(sk); + struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *)uaddr; + + memset(lsa, 0, sizeof(*lsa)); + lsa->l2tp_family = AF_INET; + if (peer) { + if (!inet->inet_dport) + return -ENOTCONN; + lsa->l2tp_conn_id = lsk->peer_conn_id; + lsa->l2tp_addr.s_addr = inet->inet_daddr; + } else { + __be32 addr = inet->inet_rcv_saddr; + if (!addr) + addr = inet->inet_saddr; + lsa->l2tp_conn_id = lsk->conn_id; + lsa->l2tp_addr.s_addr = addr; + } + *uaddr_len = sizeof(*lsa); + return 0; +} + +static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb) +{ + int rc; + + /* Charge it to the socket, dropping if the queue is full. */ + rc = sock_queue_rcv_skb(sk, skb); + if (rc < 0) + goto drop; + + return 0; + +drop: + IP_INC_STATS(&init_net, IPSTATS_MIB_INDISCARDS); + kfree_skb(skb); + return -1; +} + +/* Userspace will call sendmsg() on the tunnel socket to send L2TP + * control frames. + */ +static int l2tp_ip_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) +{ + struct sk_buff *skb; + int rc; + struct l2tp_ip_sock *lsa = l2tp_ip_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct rtable *rt = NULL; + struct flowi4 *fl4; + int connected = 0; + __be32 daddr; + + lock_sock(sk); + + rc = -ENOTCONN; + if (sock_flag(sk, SOCK_DEAD)) + goto out; + + /* Get and verify the address. 
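
The bind()/connect() handlers above and the sendmsg() path that follows are driven by an ordinary datagram socket in userspace. A hedged sketch of how such a tunnel socket might be set up, using the sockaddr_l2tpip layout this file expects; the addresses and connection IDs are placeholders and error handling is kept minimal:

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <arpa/inet.h>
	#include <unistd.h>
	#include <linux/l2tp.h>		/* IPPROTO_L2TP, struct sockaddr_l2tpip */

	/* Sketch: open an L2TPv3/IP control socket. The 192.0.2.x addresses and
	 * connection IDs 42/43 are placeholders.
	 */
	static int open_l2tp_ip_tunnel(void)
	{
		struct sockaddr_l2tpip local = { 0 }, peer = { 0 };
		int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_L2TP);

		if (fd < 0)
			return -1;

		local.l2tp_family = AF_INET;
		local.l2tp_addr.s_addr = inet_addr("192.0.2.1");
		local.l2tp_conn_id = 42;		/* local tunnel id */

		peer.l2tp_family = AF_INET;
		peer.l2tp_addr.s_addr = inet_addr("192.0.2.2");
		peer.l2tp_conn_id = 43;			/* peer tunnel id */

		if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0 ||
		    connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0) {
			close(fd);
			return -1;
		}

		/* send()/recv() now carry raw L2TPv3 control messages; the
		 * kernel inserts the leading zero session-ID word on transmit
		 * and strips it on receive.
		 */
		return fd;
	}
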
*/ + if (msg->msg_name) { + struct sockaddr_l2tpip *lip = (struct sockaddr_l2tpip *) msg->msg_name; + rc = -EINVAL; + if (msg->msg_namelen < sizeof(*lip)) + goto out; + + if (lip->l2tp_family != AF_INET) { + rc = -EAFNOSUPPORT; + if (lip->l2tp_family != AF_UNSPEC) + goto out; + } + + daddr = lip->l2tp_addr.s_addr; + } else { + rc = -EDESTADDRREQ; + if (sk->sk_state != TCP_ESTABLISHED) + goto out; + + daddr = inet->inet_daddr; + connected = 1; + } + + /* Allocate a socket buffer */ + rc = -ENOMEM; + skb = sock_wmalloc(sk, 2 + NET_SKB_PAD + sizeof(struct iphdr) + + 4 + len, 0, GFP_KERNEL); + if (!skb) + goto error; + + /* Reserve space for headers, putting IP header on 4-byte boundary. */ + skb_reserve(skb, 2 + NET_SKB_PAD); + skb_reset_network_header(skb); + skb_reserve(skb, sizeof(struct iphdr)); + skb_reset_transport_header(skb); + + /* Insert 0 session_id */ + *((__be32 *) skb_put(skb, 4)) = 0; + + /* Copy user data into skb */ + rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); + if (rc < 0) { + kfree_skb(skb); + goto error; + } + + fl4 = &inet->cork.fl.u.ip4; + if (connected) + rt = (struct rtable *) __sk_dst_check(sk, 0); + + if (rt == NULL) { + struct ip_options_rcu *inet_opt; + + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + + /* Use correct destination address if we have options. */ + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; + + rcu_read_unlock(); + + /* If this fails, retransmit mechanism of transport layer will + * keep trying until route appears or the connection times + * itself out. + */ + rt = ip_route_output_ports(sock_net(sk), fl4, sk, + daddr, inet->inet_saddr, + inet->inet_dport, inet->inet_sport, + sk->sk_protocol, RT_CONN_FLAGS(sk), + sk->sk_bound_dev_if); + if (IS_ERR(rt)) + goto no_route; + sk_setup_caps(sk, &rt->dst); + } + skb_dst_set(skb, dst_clone(&rt->dst)); + + /* Queue the packet to IP for output */ + rc = ip_queue_xmit(skb, &inet->cork.fl); + +error: + /* Update stats */ + if (rc >= 0) { + lsa->tx_packets++; + lsa->tx_bytes += len; + rc = len; + } else { + lsa->tx_errors++; + } + +out: + release_sock(sk); + return rc; + +no_route: + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); + kfree_skb(skb); + rc = -EHOSTUNREACH; + goto out; +} + +static int l2tp_ip_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int noblock, int flags, int *addr_len) +{ + struct inet_sock *inet = inet_sk(sk); + struct l2tp_ip_sock *lsk = l2tp_ip_sk(sk); + size_t copied = 0; + int err = -EOPNOTSUPP; + struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + struct sk_buff *skb; + + if (flags & MSG_OOB) + goto out; + + if (addr_len) + *addr_len = sizeof(*sin); + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; + + copied = skb->len; + if (len < copied) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto done; + + sock_recv_timestamp(msg, sk, skb); + + /* Copy the address. 
*/ + if (sin) { + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = ip_hdr(skb)->saddr; + sin->sin_port = 0; + memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); + } + if (inet->cmsg_flags) + ip_cmsg_recv(msg, skb); + if (flags & MSG_TRUNC) + copied = skb->len; +done: + skb_free_datagram(sk, skb); +out: + if (err) { + lsk->rx_errors++; + return err; + } + + lsk->rx_packets++; + lsk->rx_bytes += copied; + + return copied; +} + +static struct proto l2tp_ip_prot = { + .name = "L2TP/IP", + .owner = THIS_MODULE, + .init = l2tp_ip_open, + .close = l2tp_ip_close, + .bind = l2tp_ip_bind, + .connect = l2tp_ip_connect, + .disconnect = l2tp_ip_disconnect, + .ioctl = udp_ioctl, + .destroy = l2tp_ip_destroy_sock, + .setsockopt = ip_setsockopt, + .getsockopt = ip_getsockopt, + .sendmsg = l2tp_ip_sendmsg, + .recvmsg = l2tp_ip_recvmsg, + .backlog_rcv = l2tp_ip_backlog_recv, + .hash = inet_hash, + .unhash = inet_unhash, + .obj_size = sizeof(struct l2tp_ip_sock), +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_ip_setsockopt, + .compat_getsockopt = compat_ip_getsockopt, +#endif +}; + +static const struct proto_ops l2tp_ip_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = inet_release, + .bind = inet_bind, + .connect = inet_dgram_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = l2tp_ip_getname, + .poll = datagram_poll, + .ioctl = inet_ioctl, + .listen = sock_no_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, +#endif +}; + +static struct inet_protosw l2tp_ip_protosw = { + .type = SOCK_DGRAM, + .protocol = IPPROTO_L2TP, + .prot = &l2tp_ip_prot, + .ops = &l2tp_ip_ops, + .no_check = 0, +}; + +static struct net_protocol l2tp_ip_protocol __read_mostly = { + .handler = l2tp_ip_recv, +}; + +static int __init l2tp_ip_init(void) +{ + int err; + + printk(KERN_INFO "L2TP IP encapsulation support (L2TPv3)\n"); + + err = proto_register(&l2tp_ip_prot, 1); + if (err != 0) + goto out; + + err = inet_add_protocol(&l2tp_ip_protocol, IPPROTO_L2TP); + if (err) + goto out1; + + inet_register_protosw(&l2tp_ip_protosw); + return 0; + +out1: + proto_unregister(&l2tp_ip_prot); +out: + return err; +} + +static void __exit l2tp_ip_exit(void) +{ + inet_unregister_protosw(&l2tp_ip_protosw); + inet_del_protocol(&l2tp_ip_protocol, IPPROTO_L2TP); + proto_unregister(&l2tp_ip_prot); +} + +module_init(l2tp_ip_init); +module_exit(l2tp_ip_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Chapman "); +MODULE_DESCRIPTION("L2TP over IP"); +MODULE_VERSION("1.0"); + +/* Use the value of SOCK_DGRAM (2) directory, because __stringify doesn't like + * enums + */ +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 2, IPPROTO_L2TP); diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c new file mode 100644 index 00000000..93a41a09 --- /dev/null +++ b/net/l2tp/l2tp_netlink.c @@ -0,0 +1,841 @@ +/* + * L2TP netlink layer, for management + * + * Copyright (c) 2008,2009,2010 Katalix Systems Ltd + * + * Partly based on the IrDA nelink implementation + * (see net/irda/irnetlink.c) which is: + * Copyright (c) 2007 Samuel Ortiz + * which is in turn partly based on the wireless netlink code: + * Copyright 2006 Johannes Berg + * + * This program is free software; you can redistribute it and/or 
modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "l2tp_core.h" + + +static struct genl_family l2tp_nl_family = { + .id = GENL_ID_GENERATE, + .name = L2TP_GENL_NAME, + .version = L2TP_GENL_VERSION, + .hdrsize = 0, + .maxattr = L2TP_ATTR_MAX, +}; + +/* Accessed under genl lock */ +static const struct l2tp_nl_cmd_ops *l2tp_nl_cmd_ops[__L2TP_PWTYPE_MAX]; + +static struct l2tp_session *l2tp_nl_session_find(struct genl_info *info) +{ + u32 tunnel_id; + u32 session_id; + char *ifname; + struct l2tp_tunnel *tunnel; + struct l2tp_session *session = NULL; + struct net *net = genl_info_net(info); + + if (info->attrs[L2TP_ATTR_IFNAME]) { + ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]); + session = l2tp_session_find_by_ifname(net, ifname); + } else if ((info->attrs[L2TP_ATTR_SESSION_ID]) && + (info->attrs[L2TP_ATTR_CONN_ID])) { + tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); + session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]); + tunnel = l2tp_tunnel_find(net, tunnel_id); + if (tunnel) + session = l2tp_session_find(net, tunnel, session_id); + } + + return session; +} + +static int l2tp_nl_cmd_noop(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + void *hdr; + int ret = -ENOBUFS; + + msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto out; + } + + hdr = genlmsg_put(msg, info->snd_pid, info->snd_seq, + &l2tp_nl_family, 0, L2TP_CMD_NOOP); + if (IS_ERR(hdr)) { + ret = PTR_ERR(hdr); + goto err_out; + } + + genlmsg_end(msg, hdr); + + return genlmsg_unicast(genl_info_net(info), msg, info->snd_pid); + +err_out: + nlmsg_free(msg); + +out: + return ret; +} + +static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info) +{ + u32 tunnel_id; + u32 peer_tunnel_id; + int proto_version; + int fd; + int ret = 0; + struct l2tp_tunnel_cfg cfg = { 0, }; + struct l2tp_tunnel *tunnel; + struct net *net = genl_info_net(info); + + if (!info->attrs[L2TP_ATTR_CONN_ID]) { + ret = -EINVAL; + goto out; + } + tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); + + if (!info->attrs[L2TP_ATTR_PEER_CONN_ID]) { + ret = -EINVAL; + goto out; + } + peer_tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_PEER_CONN_ID]); + + if (!info->attrs[L2TP_ATTR_PROTO_VERSION]) { + ret = -EINVAL; + goto out; + } + proto_version = nla_get_u8(info->attrs[L2TP_ATTR_PROTO_VERSION]); + + if (!info->attrs[L2TP_ATTR_ENCAP_TYPE]) { + ret = -EINVAL; + goto out; + } + cfg.encap = nla_get_u16(info->attrs[L2TP_ATTR_ENCAP_TYPE]); + + fd = -1; + if (info->attrs[L2TP_ATTR_FD]) { + fd = nla_get_u32(info->attrs[L2TP_ATTR_FD]); + } else { + if (info->attrs[L2TP_ATTR_IP_SADDR]) + cfg.local_ip.s_addr = nla_get_be32(info->attrs[L2TP_ATTR_IP_SADDR]); + if (info->attrs[L2TP_ATTR_IP_DADDR]) + cfg.peer_ip.s_addr = nla_get_be32(info->attrs[L2TP_ATTR_IP_DADDR]); + if (info->attrs[L2TP_ATTR_UDP_SPORT]) + cfg.local_udp_port = nla_get_u16(info->attrs[L2TP_ATTR_UDP_SPORT]); + if (info->attrs[L2TP_ATTR_UDP_DPORT]) + cfg.peer_udp_port = nla_get_u16(info->attrs[L2TP_ATTR_UDP_DPORT]); + if (info->attrs[L2TP_ATTR_UDP_CSUM]) + cfg.use_udp_checksums = nla_get_flag(info->attrs[L2TP_ATTR_UDP_CSUM]); + } + + if (info->attrs[L2TP_ATTR_DEBUG]) + cfg.debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]); + + tunnel = l2tp_tunnel_find(net, tunnel_id); + if (tunnel != NULL) { + ret = -EEXIST; + goto out; + } + + 
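
The tunnel-create handler above requires L2TP_ATTR_CONN_ID, L2TP_ATTR_PEER_CONN_ID, L2TP_ATTR_PROTO_VERSION and L2TP_ATTR_ENCAP_TYPE, and takes either an already-open tunnel socket via L2TP_ATTR_FD or explicit address/port attributes. A sketch of a matching userspace request, assuming the libnl-3 generic netlink library (the library calls, IDs and tunnel_fd are assumptions, not part of this patch):

	#include <netlink/netlink.h>
	#include <netlink/genl/genl.h>
	#include <netlink/genl/ctrl.h>
	#include <linux/l2tp.h>

	/* Sketch: create an L2TPv3 tunnel over an existing, connected UDP
	 * socket (tunnel_fd). Error handling is abbreviated.
	 */
	static int create_tunnel(int tunnel_fd)
	{
		struct nl_sock *nlsk = nl_socket_alloc();
		struct nl_msg *msg;
		int family, err;

		genl_connect(nlsk);
		family = genl_ctrl_resolve(nlsk, L2TP_GENL_NAME);

		msg = nlmsg_alloc();
		genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0,
			    NLM_F_REQUEST | NLM_F_ACK,
			    L2TP_CMD_TUNNEL_CREATE, L2TP_GENL_VERSION);

		nla_put_u32(msg, L2TP_ATTR_CONN_ID, 42);	/* local tunnel id */
		nla_put_u32(msg, L2TP_ATTR_PEER_CONN_ID, 43);	/* peer tunnel id */
		nla_put_u8(msg,  L2TP_ATTR_PROTO_VERSION, 3);
		nla_put_u16(msg, L2TP_ATTR_ENCAP_TYPE, L2TP_ENCAPTYPE_UDP);
		nla_put_u32(msg, L2TP_ATTR_FD, tunnel_fd);

		err = nl_send_auto(nlsk, msg);
		if (err >= 0)
			err = nl_wait_for_ack(nlsk);

		nlmsg_free(msg);
		nl_socket_free(nlsk);
		return err;
	}
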
ret = -EINVAL; + switch (cfg.encap) { + case L2TP_ENCAPTYPE_UDP: + case L2TP_ENCAPTYPE_IP: + ret = l2tp_tunnel_create(net, fd, proto_version, tunnel_id, + peer_tunnel_id, &cfg, &tunnel); + break; + } + +out: + return ret; +} + +static int l2tp_nl_cmd_tunnel_delete(struct sk_buff *skb, struct genl_info *info) +{ + struct l2tp_tunnel *tunnel; + u32 tunnel_id; + int ret = 0; + struct net *net = genl_info_net(info); + + if (!info->attrs[L2TP_ATTR_CONN_ID]) { + ret = -EINVAL; + goto out; + } + tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); + + tunnel = l2tp_tunnel_find(net, tunnel_id); + if (tunnel == NULL) { + ret = -ENODEV; + goto out; + } + + (void) l2tp_tunnel_delete(tunnel); + +out: + return ret; +} + +static int l2tp_nl_cmd_tunnel_modify(struct sk_buff *skb, struct genl_info *info) +{ + struct l2tp_tunnel *tunnel; + u32 tunnel_id; + int ret = 0; + struct net *net = genl_info_net(info); + + if (!info->attrs[L2TP_ATTR_CONN_ID]) { + ret = -EINVAL; + goto out; + } + tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); + + tunnel = l2tp_tunnel_find(net, tunnel_id); + if (tunnel == NULL) { + ret = -ENODEV; + goto out; + } + + if (info->attrs[L2TP_ATTR_DEBUG]) + tunnel->debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]); + +out: + return ret; +} + +static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 pid, u32 seq, int flags, + struct l2tp_tunnel *tunnel) +{ + void *hdr; + struct nlattr *nest; + struct sock *sk = NULL; + struct inet_sock *inet; + + hdr = genlmsg_put(skb, pid, seq, &l2tp_nl_family, flags, + L2TP_CMD_TUNNEL_GET); + if (IS_ERR(hdr)) + return PTR_ERR(hdr); + + NLA_PUT_U8(skb, L2TP_ATTR_PROTO_VERSION, tunnel->version); + NLA_PUT_U32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id); + NLA_PUT_U32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id); + NLA_PUT_U32(skb, L2TP_ATTR_DEBUG, tunnel->debug); + NLA_PUT_U16(skb, L2TP_ATTR_ENCAP_TYPE, tunnel->encap); + + nest = nla_nest_start(skb, L2TP_ATTR_STATS); + if (nest == NULL) + goto nla_put_failure; + + NLA_PUT_U64(skb, L2TP_ATTR_TX_PACKETS, tunnel->stats.tx_packets); + NLA_PUT_U64(skb, L2TP_ATTR_TX_BYTES, tunnel->stats.tx_bytes); + NLA_PUT_U64(skb, L2TP_ATTR_TX_ERRORS, tunnel->stats.tx_errors); + NLA_PUT_U64(skb, L2TP_ATTR_RX_PACKETS, tunnel->stats.rx_packets); + NLA_PUT_U64(skb, L2TP_ATTR_RX_BYTES, tunnel->stats.rx_bytes); + NLA_PUT_U64(skb, L2TP_ATTR_RX_SEQ_DISCARDS, tunnel->stats.rx_seq_discards); + NLA_PUT_U64(skb, L2TP_ATTR_RX_OOS_PACKETS, tunnel->stats.rx_oos_packets); + NLA_PUT_U64(skb, L2TP_ATTR_RX_ERRORS, tunnel->stats.rx_errors); + nla_nest_end(skb, nest); + + sk = tunnel->sock; + if (!sk) + goto out; + + inet = inet_sk(sk); + + switch (tunnel->encap) { + case L2TP_ENCAPTYPE_UDP: + NLA_PUT_U16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)); + NLA_PUT_U16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport)); + NLA_PUT_U8(skb, L2TP_ATTR_UDP_CSUM, (sk->sk_no_check != UDP_CSUM_NOXMIT)); + /* NOBREAK */ + case L2TP_ENCAPTYPE_IP: + NLA_PUT_BE32(skb, L2TP_ATTR_IP_SADDR, inet->inet_saddr); + NLA_PUT_BE32(skb, L2TP_ATTR_IP_DADDR, inet->inet_daddr); + break; + } + +out: + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -1; +} + +static int l2tp_nl_cmd_tunnel_get(struct sk_buff *skb, struct genl_info *info) +{ + struct l2tp_tunnel *tunnel; + struct sk_buff *msg; + u32 tunnel_id; + int ret = -ENOBUFS; + struct net *net = genl_info_net(info); + + if (!info->attrs[L2TP_ATTR_CONN_ID]) { + ret = -EINVAL; + goto out; + } + + tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); + + tunnel = 
l2tp_tunnel_find(net, tunnel_id); + if (tunnel == NULL) { + ret = -ENODEV; + goto out; + } + + msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto out; + } + + ret = l2tp_nl_tunnel_send(msg, info->snd_pid, info->snd_seq, + NLM_F_ACK, tunnel); + if (ret < 0) + goto err_out; + + return genlmsg_unicast(net, msg, info->snd_pid); + +err_out: + nlmsg_free(msg); + +out: + return ret; +} + +static int l2tp_nl_cmd_tunnel_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int ti = cb->args[0]; + struct l2tp_tunnel *tunnel; + struct net *net = sock_net(skb->sk); + + for (;;) { + tunnel = l2tp_tunnel_find_nth(net, ti); + if (tunnel == NULL) + goto out; + + if (l2tp_nl_tunnel_send(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + tunnel) <= 0) + goto out; + + ti++; + } + +out: + cb->args[0] = ti; + + return skb->len; +} + +static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *info) +{ + u32 tunnel_id = 0; + u32 session_id; + u32 peer_session_id; + int ret = 0; + struct l2tp_tunnel *tunnel; + struct l2tp_session *session; + struct l2tp_session_cfg cfg = { 0, }; + struct net *net = genl_info_net(info); + + if (!info->attrs[L2TP_ATTR_CONN_ID]) { + ret = -EINVAL; + goto out; + } + tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); + tunnel = l2tp_tunnel_find(net, tunnel_id); + if (!tunnel) { + ret = -ENODEV; + goto out; + } + + if (!info->attrs[L2TP_ATTR_SESSION_ID]) { + ret = -EINVAL; + goto out; + } + session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]); + session = l2tp_session_find(net, tunnel, session_id); + if (session) { + ret = -EEXIST; + goto out; + } + + if (!info->attrs[L2TP_ATTR_PEER_SESSION_ID]) { + ret = -EINVAL; + goto out; + } + peer_session_id = nla_get_u32(info->attrs[L2TP_ATTR_PEER_SESSION_ID]); + + if (!info->attrs[L2TP_ATTR_PW_TYPE]) { + ret = -EINVAL; + goto out; + } + cfg.pw_type = nla_get_u16(info->attrs[L2TP_ATTR_PW_TYPE]); + if (cfg.pw_type >= __L2TP_PWTYPE_MAX) { + ret = -EINVAL; + goto out; + } + + if (tunnel->version > 2) { + if (info->attrs[L2TP_ATTR_OFFSET]) + cfg.offset = nla_get_u16(info->attrs[L2TP_ATTR_OFFSET]); + + if (info->attrs[L2TP_ATTR_DATA_SEQ]) + cfg.data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]); + + cfg.l2specific_type = L2TP_L2SPECTYPE_DEFAULT; + if (info->attrs[L2TP_ATTR_L2SPEC_TYPE]) + cfg.l2specific_type = nla_get_u8(info->attrs[L2TP_ATTR_L2SPEC_TYPE]); + + cfg.l2specific_len = 4; + if (info->attrs[L2TP_ATTR_L2SPEC_LEN]) + cfg.l2specific_len = nla_get_u8(info->attrs[L2TP_ATTR_L2SPEC_LEN]); + + if (info->attrs[L2TP_ATTR_COOKIE]) { + u16 len = nla_len(info->attrs[L2TP_ATTR_COOKIE]); + if (len > 8) { + ret = -EINVAL; + goto out; + } + cfg.cookie_len = len; + memcpy(&cfg.cookie[0], nla_data(info->attrs[L2TP_ATTR_COOKIE]), len); + } + if (info->attrs[L2TP_ATTR_PEER_COOKIE]) { + u16 len = nla_len(info->attrs[L2TP_ATTR_PEER_COOKIE]); + if (len > 8) { + ret = -EINVAL; + goto out; + } + cfg.peer_cookie_len = len; + memcpy(&cfg.peer_cookie[0], nla_data(info->attrs[L2TP_ATTR_PEER_COOKIE]), len); + } + if (info->attrs[L2TP_ATTR_IFNAME]) + cfg.ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]); + + if (info->attrs[L2TP_ATTR_VLAN_ID]) + cfg.vlan_id = nla_get_u16(info->attrs[L2TP_ATTR_VLAN_ID]); + } + + if (info->attrs[L2TP_ATTR_DEBUG]) + cfg.debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]); + + if (info->attrs[L2TP_ATTR_RECV_SEQ]) + cfg.recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]); + + if (info->attrs[L2TP_ATTR_SEND_SEQ]) + cfg.send_seq = 
nla_get_u8(info->attrs[L2TP_ATTR_SEND_SEQ]); + + if (info->attrs[L2TP_ATTR_LNS_MODE]) + cfg.lns_mode = nla_get_u8(info->attrs[L2TP_ATTR_LNS_MODE]); + + if (info->attrs[L2TP_ATTR_RECV_TIMEOUT]) + cfg.reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]); + + if (info->attrs[L2TP_ATTR_MTU]) + cfg.mtu = nla_get_u16(info->attrs[L2TP_ATTR_MTU]); + + if (info->attrs[L2TP_ATTR_MRU]) + cfg.mru = nla_get_u16(info->attrs[L2TP_ATTR_MRU]); + + if ((l2tp_nl_cmd_ops[cfg.pw_type] == NULL) || + (l2tp_nl_cmd_ops[cfg.pw_type]->session_create == NULL)) { + ret = -EPROTONOSUPPORT; + goto out; + } + + /* Check that pseudowire-specific params are present */ + switch (cfg.pw_type) { + case L2TP_PWTYPE_NONE: + break; + case L2TP_PWTYPE_ETH_VLAN: + if (!info->attrs[L2TP_ATTR_VLAN_ID]) { + ret = -EINVAL; + goto out; + } + break; + case L2TP_PWTYPE_ETH: + break; + case L2TP_PWTYPE_PPP: + case L2TP_PWTYPE_PPP_AC: + break; + case L2TP_PWTYPE_IP: + default: + ret = -EPROTONOSUPPORT; + break; + } + + ret = -EPROTONOSUPPORT; + if (l2tp_nl_cmd_ops[cfg.pw_type]->session_create) + ret = (*l2tp_nl_cmd_ops[cfg.pw_type]->session_create)(net, tunnel_id, + session_id, peer_session_id, &cfg); + +out: + return ret; +} + +static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *info) +{ + int ret = 0; + struct l2tp_session *session; + u16 pw_type; + + session = l2tp_nl_session_find(info); + if (session == NULL) { + ret = -ENODEV; + goto out; + } + + pw_type = session->pwtype; + if (pw_type < __L2TP_PWTYPE_MAX) + if (l2tp_nl_cmd_ops[pw_type] && l2tp_nl_cmd_ops[pw_type]->session_delete) + ret = (*l2tp_nl_cmd_ops[pw_type]->session_delete)(session); + +out: + return ret; +} + +static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *info) +{ + int ret = 0; + struct l2tp_session *session; + + session = l2tp_nl_session_find(info); + if (session == NULL) { + ret = -ENODEV; + goto out; + } + + if (info->attrs[L2TP_ATTR_DEBUG]) + session->debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]); + + if (info->attrs[L2TP_ATTR_DATA_SEQ]) + session->data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]); + + if (info->attrs[L2TP_ATTR_RECV_SEQ]) + session->recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]); + + if (info->attrs[L2TP_ATTR_SEND_SEQ]) + session->send_seq = nla_get_u8(info->attrs[L2TP_ATTR_SEND_SEQ]); + + if (info->attrs[L2TP_ATTR_LNS_MODE]) + session->lns_mode = nla_get_u8(info->attrs[L2TP_ATTR_LNS_MODE]); + + if (info->attrs[L2TP_ATTR_RECV_TIMEOUT]) + session->reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]); + + if (info->attrs[L2TP_ATTR_MTU]) + session->mtu = nla_get_u16(info->attrs[L2TP_ATTR_MTU]); + + if (info->attrs[L2TP_ATTR_MRU]) + session->mru = nla_get_u16(info->attrs[L2TP_ATTR_MRU]); + +out: + return ret; +} + +static int l2tp_nl_session_send(struct sk_buff *skb, u32 pid, u32 seq, int flags, + struct l2tp_session *session) +{ + void *hdr; + struct nlattr *nest; + struct l2tp_tunnel *tunnel = session->tunnel; + struct sock *sk = NULL; + + sk = tunnel->sock; + + hdr = genlmsg_put(skb, pid, seq, &l2tp_nl_family, flags, L2TP_CMD_SESSION_GET); + if (IS_ERR(hdr)) + return PTR_ERR(hdr); + + NLA_PUT_U32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id); + NLA_PUT_U32(skb, L2TP_ATTR_SESSION_ID, session->session_id); + NLA_PUT_U32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id); + NLA_PUT_U32(skb, L2TP_ATTR_PEER_SESSION_ID, session->peer_session_id); + NLA_PUT_U32(skb, L2TP_ATTR_DEBUG, session->debug); + NLA_PUT_U16(skb, L2TP_ATTR_PW_TYPE, session->pwtype); + 
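
Session creation follows the same shape: the handler above demands L2TP_ATTR_CONN_ID, L2TP_ATTR_SESSION_ID, L2TP_ATTR_PEER_SESSION_ID and L2TP_ATTR_PW_TYPE, then dispatches to the registered pseudowire ops (for L2TP_PWTYPE_ETH that is l2tp_eth_create() from l2tp_eth.c above). Continuing the previous libnl-3 sketch with the same headers and setup, a request for an Ethernet pseudowire might look like this; the IDs, interface name and MTU are placeholders:

	/* Sketch: add an Ethernet pseudowire session to the tunnel created
	 * above. nlsk and family come from the same setup as before.
	 */
	static int create_eth_session(struct nl_sock *nlsk, int family)
	{
		struct nl_msg *msg = nlmsg_alloc();
		int err;

		genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0,
			    NLM_F_REQUEST | NLM_F_ACK,
			    L2TP_CMD_SESSION_CREATE, L2TP_GENL_VERSION);

		nla_put_u32(msg, L2TP_ATTR_CONN_ID, 42);	/* tunnel id */
		nla_put_u32(msg, L2TP_ATTR_SESSION_ID, 1);
		nla_put_u32(msg, L2TP_ATTR_PEER_SESSION_ID, 1);
		nla_put_u16(msg, L2TP_ATTR_PW_TYPE, L2TP_PWTYPE_ETH);
		nla_put_string(msg, L2TP_ATTR_IFNAME, "l2tpeth0");	/* optional */
		nla_put_u16(msg, L2TP_ATTR_MTU, 1446);			/* optional */

		err = nl_send_auto(nlsk, msg);
		if (err >= 0)
			err = nl_wait_for_ack(nlsk);

		nlmsg_free(msg);
		return err;
	}
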
NLA_PUT_U16(skb, L2TP_ATTR_MTU, session->mtu); + if (session->mru) + NLA_PUT_U16(skb, L2TP_ATTR_MRU, session->mru); + + if (session->ifname && session->ifname[0]) + NLA_PUT_STRING(skb, L2TP_ATTR_IFNAME, session->ifname); + if (session->cookie_len) + NLA_PUT(skb, L2TP_ATTR_COOKIE, session->cookie_len, &session->cookie[0]); + if (session->peer_cookie_len) + NLA_PUT(skb, L2TP_ATTR_PEER_COOKIE, session->peer_cookie_len, &session->peer_cookie[0]); + NLA_PUT_U8(skb, L2TP_ATTR_RECV_SEQ, session->recv_seq); + NLA_PUT_U8(skb, L2TP_ATTR_SEND_SEQ, session->send_seq); + NLA_PUT_U8(skb, L2TP_ATTR_LNS_MODE, session->lns_mode); +#ifdef CONFIG_XFRM + if ((sk) && (sk->sk_policy[0] || sk->sk_policy[1])) + NLA_PUT_U8(skb, L2TP_ATTR_USING_IPSEC, 1); +#endif + if (session->reorder_timeout) + NLA_PUT_MSECS(skb, L2TP_ATTR_RECV_TIMEOUT, session->reorder_timeout); + + nest = nla_nest_start(skb, L2TP_ATTR_STATS); + if (nest == NULL) + goto nla_put_failure; + NLA_PUT_U64(skb, L2TP_ATTR_TX_PACKETS, session->stats.tx_packets); + NLA_PUT_U64(skb, L2TP_ATTR_TX_BYTES, session->stats.tx_bytes); + NLA_PUT_U64(skb, L2TP_ATTR_TX_ERRORS, session->stats.tx_errors); + NLA_PUT_U64(skb, L2TP_ATTR_RX_PACKETS, session->stats.rx_packets); + NLA_PUT_U64(skb, L2TP_ATTR_RX_BYTES, session->stats.rx_bytes); + NLA_PUT_U64(skb, L2TP_ATTR_RX_SEQ_DISCARDS, session->stats.rx_seq_discards); + NLA_PUT_U64(skb, L2TP_ATTR_RX_OOS_PACKETS, session->stats.rx_oos_packets); + NLA_PUT_U64(skb, L2TP_ATTR_RX_ERRORS, session->stats.rx_errors); + nla_nest_end(skb, nest); + + return genlmsg_end(skb, hdr); + + nla_put_failure: + genlmsg_cancel(skb, hdr); + return -1; +} + +static int l2tp_nl_cmd_session_get(struct sk_buff *skb, struct genl_info *info) +{ + struct l2tp_session *session; + struct sk_buff *msg; + int ret; + + session = l2tp_nl_session_find(info); + if (session == NULL) { + ret = -ENODEV; + goto out; + } + + msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto out; + } + + ret = l2tp_nl_session_send(msg, info->snd_pid, info->snd_seq, + 0, session); + if (ret < 0) + goto err_out; + + return genlmsg_unicast(genl_info_net(info), msg, info->snd_pid); + +err_out: + nlmsg_free(msg); + +out: + return ret; +} + +static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct l2tp_session *session; + struct l2tp_tunnel *tunnel = NULL; + int ti = cb->args[0]; + int si = cb->args[1]; + + for (;;) { + if (tunnel == NULL) { + tunnel = l2tp_tunnel_find_nth(net, ti); + if (tunnel == NULL) + goto out; + } + + session = l2tp_session_find_nth(tunnel, si); + if (session == NULL) { + ti++; + tunnel = NULL; + si = 0; + continue; + } + + if (l2tp_nl_session_send(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + session) <= 0) + break; + + si++; + } + +out: + cb->args[0] = ti; + cb->args[1] = si; + + return skb->len; +} + +static struct nla_policy l2tp_nl_policy[L2TP_ATTR_MAX + 1] = { + [L2TP_ATTR_NONE] = { .type = NLA_UNSPEC, }, + [L2TP_ATTR_PW_TYPE] = { .type = NLA_U16, }, + [L2TP_ATTR_ENCAP_TYPE] = { .type = NLA_U16, }, + [L2TP_ATTR_OFFSET] = { .type = NLA_U16, }, + [L2TP_ATTR_DATA_SEQ] = { .type = NLA_U8, }, + [L2TP_ATTR_L2SPEC_TYPE] = { .type = NLA_U8, }, + [L2TP_ATTR_L2SPEC_LEN] = { .type = NLA_U8, }, + [L2TP_ATTR_PROTO_VERSION] = { .type = NLA_U8, }, + [L2TP_ATTR_CONN_ID] = { .type = NLA_U32, }, + [L2TP_ATTR_PEER_CONN_ID] = { .type = NLA_U32, }, + [L2TP_ATTR_SESSION_ID] = { .type = NLA_U32, }, + [L2TP_ATTR_PEER_SESSION_ID] = { .type = NLA_U32, 
}, + [L2TP_ATTR_UDP_CSUM] = { .type = NLA_U8, }, + [L2TP_ATTR_VLAN_ID] = { .type = NLA_U16, }, + [L2TP_ATTR_DEBUG] = { .type = NLA_U32, }, + [L2TP_ATTR_RECV_SEQ] = { .type = NLA_U8, }, + [L2TP_ATTR_SEND_SEQ] = { .type = NLA_U8, }, + [L2TP_ATTR_LNS_MODE] = { .type = NLA_U8, }, + [L2TP_ATTR_USING_IPSEC] = { .type = NLA_U8, }, + [L2TP_ATTR_RECV_TIMEOUT] = { .type = NLA_MSECS, }, + [L2TP_ATTR_FD] = { .type = NLA_U32, }, + [L2TP_ATTR_IP_SADDR] = { .type = NLA_U32, }, + [L2TP_ATTR_IP_DADDR] = { .type = NLA_U32, }, + [L2TP_ATTR_UDP_SPORT] = { .type = NLA_U16, }, + [L2TP_ATTR_UDP_DPORT] = { .type = NLA_U16, }, + [L2TP_ATTR_MTU] = { .type = NLA_U16, }, + [L2TP_ATTR_MRU] = { .type = NLA_U16, }, + [L2TP_ATTR_STATS] = { .type = NLA_NESTED, }, + [L2TP_ATTR_IFNAME] = { + .type = NLA_NUL_STRING, + .len = IFNAMSIZ - 1, + }, + [L2TP_ATTR_COOKIE] = { + .type = NLA_BINARY, + .len = 8, + }, + [L2TP_ATTR_PEER_COOKIE] = { + .type = NLA_BINARY, + .len = 8, + }, +}; + +static struct genl_ops l2tp_nl_ops[] = { + { + .cmd = L2TP_CMD_NOOP, + .doit = l2tp_nl_cmd_noop, + .policy = l2tp_nl_policy, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = L2TP_CMD_TUNNEL_CREATE, + .doit = l2tp_nl_cmd_tunnel_create, + .policy = l2tp_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = L2TP_CMD_TUNNEL_DELETE, + .doit = l2tp_nl_cmd_tunnel_delete, + .policy = l2tp_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = L2TP_CMD_TUNNEL_MODIFY, + .doit = l2tp_nl_cmd_tunnel_modify, + .policy = l2tp_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = L2TP_CMD_TUNNEL_GET, + .doit = l2tp_nl_cmd_tunnel_get, + .dumpit = l2tp_nl_cmd_tunnel_dump, + .policy = l2tp_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = L2TP_CMD_SESSION_CREATE, + .doit = l2tp_nl_cmd_session_create, + .policy = l2tp_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = L2TP_CMD_SESSION_DELETE, + .doit = l2tp_nl_cmd_session_delete, + .policy = l2tp_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = L2TP_CMD_SESSION_MODIFY, + .doit = l2tp_nl_cmd_session_modify, + .policy = l2tp_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = L2TP_CMD_SESSION_GET, + .doit = l2tp_nl_cmd_session_get, + .dumpit = l2tp_nl_cmd_session_dump, + .policy = l2tp_nl_policy, + .flags = GENL_ADMIN_PERM, + }, +}; + +int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops) +{ + int ret; + + ret = -EINVAL; + if (pw_type >= __L2TP_PWTYPE_MAX) + goto err; + + genl_lock(); + ret = -EBUSY; + if (l2tp_nl_cmd_ops[pw_type]) + goto out; + + l2tp_nl_cmd_ops[pw_type] = ops; + ret = 0; + +out: + genl_unlock(); +err: + return ret; +} +EXPORT_SYMBOL_GPL(l2tp_nl_register_ops); + +void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type) +{ + if (pw_type < __L2TP_PWTYPE_MAX) { + genl_lock(); + l2tp_nl_cmd_ops[pw_type] = NULL; + genl_unlock(); + } +} +EXPORT_SYMBOL_GPL(l2tp_nl_unregister_ops); + +static int l2tp_nl_init(void) +{ + int err; + + printk(KERN_INFO "L2TP netlink interface\n"); + err = genl_register_family_with_ops(&l2tp_nl_family, l2tp_nl_ops, + ARRAY_SIZE(l2tp_nl_ops)); + + return err; +} + +static void l2tp_nl_cleanup(void) +{ + genl_unregister_family(&l2tp_nl_family); +} + +module_init(l2tp_nl_init); +module_exit(l2tp_nl_cleanup); + +MODULE_AUTHOR("James Chapman "); +MODULE_DESCRIPTION("L2TP netlink"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.0"); +MODULE_ALIAS("net-pf-" __stringify(PF_NETLINK) "-proto-" \ + __stringify(NETLINK_GENERIC) "-type-" "l2tp"); diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c new file mode 
100644 index 00000000..13f9868e --- /dev/null +++ b/net/l2tp/l2tp_ppp.c @@ -0,0 +1,1840 @@ +/***************************************************************************** + * Linux PPP over L2TP (PPPoX/PPPoL2TP) Sockets + * + * PPPoX --- Generic PPP encapsulation socket family + * PPPoL2TP --- PPP over L2TP (RFC 2661) + * + * Version: 2.0.0 + * + * Authors: James Chapman (jchapman@katalix.com) + * + * Based on original work by Martijn van Oosterhout + * + * License: + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +/* This driver handles only L2TP data frames; control frames are handled by a + * userspace application. + * + * To send data in an L2TP session, userspace opens a PPPoL2TP socket and + * attaches it to a bound UDP socket with local tunnel_id / session_id and + * peer tunnel_id / session_id set. Data can then be sent or received using + * regular socket sendmsg() / recvmsg() calls. Kernel parameters of the socket + * can be read or modified using ioctl() or [gs]etsockopt() calls. + * + * When a PPPoL2TP socket is connected with local and peer session_id values + * zero, the socket is treated as a special tunnel management socket. + * + * Here's example userspace code to create a socket for sending/receiving data + * over an L2TP session:- + * + * struct sockaddr_pppol2tp sax; + * int fd; + * int session_fd; + * + * fd = socket(AF_PPPOX, SOCK_DGRAM, PX_PROTO_OL2TP); + * + * sax.sa_family = AF_PPPOX; + * sax.sa_protocol = PX_PROTO_OL2TP; + * sax.pppol2tp.fd = tunnel_fd; // bound UDP socket + * sax.pppol2tp.addr.sin_addr.s_addr = addr->sin_addr.s_addr; + * sax.pppol2tp.addr.sin_port = addr->sin_port; + * sax.pppol2tp.addr.sin_family = AF_INET; + * sax.pppol2tp.s_tunnel = tunnel_id; + * sax.pppol2tp.s_session = session_id; + * sax.pppol2tp.d_tunnel = peer_tunnel_id; + * sax.pppol2tp.d_session = peer_session_id; + * + * session_fd = connect(fd, (struct sockaddr *)&sax, sizeof(sax)); + * + * A pppd plugin that allows PPP traffic to be carried over L2TP using + * this driver is available from the OpenL2TP project at + * http://openl2tp.sourceforge.net. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "l2tp_core.h" + +#define PPPOL2TP_DRV_VERSION "V2.0" + +/* Space for UDP, L2TP and PPP headers */ +#define PPPOL2TP_HEADER_OVERHEAD 40 + +#define PRINTK(_mask, _type, _lvl, _fmt, args...) \ + do { \ + if ((_mask) & (_type)) \ + printk(_lvl "PPPOL2TP: " _fmt, ##args); \ + } while (0) + +/* Number of bytes to build transmit L2TP headers. + * Unfortunately the size is different depending on whether sequence numbers + * are enabled. + */ +#define PPPOL2TP_L2TP_HDR_SIZE_SEQ 10 +#define PPPOL2TP_L2TP_HDR_SIZE_NOSEQ 6 + +/* Private data of each session. This data lives at the end of struct + * l2tp_session, referenced via session->priv[]. 
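
The private-data arrangement described below is the same one l2tp_eth.c used earlier: a pseudowire driver asks l2tp_session_create() to append its private struct to the session and later recovers it with l2tp_session_priv(). A minimal sketch of the pattern, with hypothetical names:

	/* Sketch: per-session private data for an imaginary pseudowire. */
	struct my_pw_session {
		struct sock *sock;
		int flags;
	};

	static void my_pw_init(struct l2tp_tunnel *tunnel, u32 sid, u32 peer_sid,
			       struct l2tp_session_cfg *cfg)
	{
		struct l2tp_session *session;
		struct my_pw_session *priv;

		/* l2tp_core reserves sizeof(*priv) bytes after struct l2tp_session */
		session = l2tp_session_create(sizeof(*priv), tunnel, sid,
					      peer_sid, cfg);
		if (!session)
			return;

		priv = l2tp_session_priv(session);	/* -> session->priv[] */
		priv->sock = NULL;
		priv->flags = 0;
	}
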
+ */ +struct pppol2tp_session { + int owner; /* pid that opened the socket */ + + struct sock *sock; /* Pointer to the session + * PPPoX socket */ + struct sock *tunnel_sock; /* Pointer to the tunnel UDP + * socket */ + int flags; /* accessed by PPPIOCGFLAGS. + * Unused. */ +}; + +static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb); + +static const struct ppp_channel_ops pppol2tp_chan_ops = { + .start_xmit = pppol2tp_xmit, +}; + +static const struct proto_ops pppol2tp_ops; + +/* Helpers to obtain tunnel/session contexts from sockets. + */ +static inline struct l2tp_session *pppol2tp_sock_to_session(struct sock *sk) +{ + struct l2tp_session *session; + + if (sk == NULL) + return NULL; + + sock_hold(sk); + session = (struct l2tp_session *)(sk->sk_user_data); + if (session == NULL) { + sock_put(sk); + goto out; + } + + BUG_ON(session->magic != L2TP_SESSION_MAGIC); + +out: + return session; +} + +/***************************************************************************** + * Receive data handling + *****************************************************************************/ + +static int pppol2tp_recv_payload_hook(struct sk_buff *skb) +{ + /* Skip PPP header, if present. In testing, Microsoft L2TP clients + * don't send the PPP header (PPP header compression enabled), but + * other clients can include the header. So we cope with both cases + * here. The PPP header is always FF03 when using L2TP. + * + * Note that skb->data[] isn't dereferenced from a u16 ptr here since + * the field may be unaligned. + */ + if (!pskb_may_pull(skb, 2)) + return 1; + + if ((skb->data[0] == 0xff) && (skb->data[1] == 0x03)) + skb_pull(skb, 2); + + return 0; +} + +/* Receive message. This is the recvmsg for the PPPoL2TP socket. + */ +static int pppol2tp_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len, + int flags) +{ + int err; + struct sk_buff *skb; + struct sock *sk = sock->sk; + + err = -EIO; + if (sk->sk_state & PPPOX_BOUND) + goto end; + + msg->msg_namelen = 0; + + err = 0; + skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, + flags & MSG_DONTWAIT, &err); + if (!skb) + goto end; + + if (len > skb->len) + len = skb->len; + else if (len < skb->len) + msg->msg_flags |= MSG_TRUNC; + + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len); + if (likely(err == 0)) + err = len; + + kfree_skb(skb); +end: + return err; +} + +static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len) +{ + struct pppol2tp_session *ps = l2tp_session_priv(session); + struct sock *sk = NULL; + + /* If the socket is bound, send it in to PPP's input queue. Otherwise + * queue it on the session socket. + */ + sk = ps->sock; + if (sk == NULL) + goto no_sock; + + if (sk->sk_state & PPPOX_BOUND) { + struct pppox_sock *po; + PRINTK(session->debug, PPPOL2TP_MSG_DATA, KERN_DEBUG, + "%s: recv %d byte data frame, passing to ppp\n", + session->name, data_len); + + /* We need to forget all info related to the L2TP packet + * gathered in the skb as we are going to reuse the same + * skb for the inner packet. 
+ * Namely we need to: + * - reset xfrm (IPSec) information as it applies to + * the outer L2TP packet and not to the inner one + * - release the dst to force a route lookup on the inner + * IP packet since skb->dst currently points to the dst + * of the UDP tunnel + * - reset netfilter information as it doesn't apply + * to the inner packet either + */ + secpath_reset(skb); + skb_dst_drop(skb); + nf_reset(skb); + + po = pppox_sk(sk); + ppp_input(&po->chan, skb); + } else { + PRINTK(session->debug, PPPOL2TP_MSG_DATA, KERN_INFO, + "%s: socket not bound\n", session->name); + + /* Not bound. Nothing we can do, so discard. */ + session->stats.rx_errors++; + kfree_skb(skb); + } + + return; + +no_sock: + PRINTK(session->debug, PPPOL2TP_MSG_DATA, KERN_INFO, + "%s: no socket\n", session->name); + kfree_skb(skb); +} + +static void pppol2tp_session_sock_hold(struct l2tp_session *session) +{ + struct pppol2tp_session *ps = l2tp_session_priv(session); + + if (ps->sock) + sock_hold(ps->sock); +} + +static void pppol2tp_session_sock_put(struct l2tp_session *session) +{ + struct pppol2tp_session *ps = l2tp_session_priv(session); + + if (ps->sock) + sock_put(ps->sock); +} + +/************************************************************************ + * Transmit handling + ***********************************************************************/ + +/* This is the sendmsg for the PPPoL2TP pppol2tp_session socket. We come here + * when a user application does a sendmsg() on the session socket. L2TP and + * PPP headers must be inserted into the user's data. + */ +static int pppol2tp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, + size_t total_len) +{ + static const unsigned char ppph[2] = { 0xff, 0x03 }; + struct sock *sk = sock->sk; + struct sk_buff *skb; + int error; + struct l2tp_session *session; + struct l2tp_tunnel *tunnel; + struct pppol2tp_session *ps; + int uhlen; + + error = -ENOTCONN; + if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) + goto error; + + /* Get session and tunnel contexts */ + error = -EBADF; + session = pppol2tp_sock_to_session(sk); + if (session == NULL) + goto error; + + ps = l2tp_session_priv(session); + tunnel = l2tp_sock_to_tunnel(ps->tunnel_sock); + if (tunnel == NULL) + goto error_put_sess; + + uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0; + + /* Allocate a socket buffer */ + error = -ENOMEM; + skb = sock_wmalloc(sk, NET_SKB_PAD + sizeof(struct iphdr) + + uhlen + session->hdr_len + + sizeof(ppph) + total_len, + 0, GFP_KERNEL); + if (!skb) + goto error_put_sess_tun; + + /* Reserve space for headers. */ + skb_reserve(skb, NET_SKB_PAD); + skb_reset_network_header(skb); + skb_reserve(skb, sizeof(struct iphdr)); + skb_reset_transport_header(skb); + skb_reserve(skb, uhlen); + + /* Add PPP header */ + skb->data[0] = ppph[0]; + skb->data[1] = ppph[1]; + skb_put(skb, 2); + + /* Copy user data into skb */ + error = memcpy_fromiovec(skb->data, m->msg_iov, total_len); + if (error < 0) { + kfree_skb(skb); + goto error_put_sess_tun; + } + skb_put(skb, total_len); + + l2tp_xmit_skb(session, skb, session->hdr_len); + + sock_put(ps->tunnel_sock); + + return error; + +error_put_sess_tun: + sock_put(ps->tunnel_sock); +error_put_sess: + sock_put(sk); +error: + return error; +} + +/* Transmit function called by generic PPP driver. Sends PPP frame + * over PPPoL2TP socket. + * + * This is almost the same as pppol2tp_sendmsg(), but rather than + * being called with a msghdr from userspace, it is called with a skb + * from the kernel. 
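
The sendmsg() path above expects the PPP frame from userspace without the ff 03 address/control bytes; those, plus the L2TP header and the UDP/IP headers, are added by the kernel. A minimal, illustrative wrapper around send() on the same hypothetical session_fd:

/* Illustrative sketch: hand one PPP frame to the kernel for transmission. */
#include <sys/types.h>
#include <sys/socket.h>

static ssize_t send_ppp_frame(int session_fd, const void *frame, size_t len)
{
        /* frame starts with the 2-byte PPP protocol field (e.g. c0 21 for LCP);
         * ff 03, the L2TP header and UDP/IP are prepended by the kernel. */
        return send(session_fd, frame, len, 0);
}

In normal operation this path is driven by the PPP generic layer through pppol2tp_xmit(), described next, rather than by direct send() calls.
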
+ * + * The supplied skb from ppp doesn't have enough headroom for the + * insertion of L2TP, UDP and IP headers so we need to allocate more + * headroom in the skb. This will create a cloned skb. But we must be + * careful in the error case because the caller will expect to free + * the skb it supplied, not our cloned skb. So we take care to always + * leave the original skb unfreed if we return an error. + */ +static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) +{ + static const u8 ppph[2] = { 0xff, 0x03 }; + struct sock *sk = (struct sock *) chan->private; + struct sock *sk_tun; + struct l2tp_session *session; + struct l2tp_tunnel *tunnel; + struct pppol2tp_session *ps; + int old_headroom; + int new_headroom; + + if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) + goto abort; + + /* Get session and tunnel contexts from the socket */ + session = pppol2tp_sock_to_session(sk); + if (session == NULL) + goto abort; + + ps = l2tp_session_priv(session); + sk_tun = ps->tunnel_sock; + if (sk_tun == NULL) + goto abort_put_sess; + tunnel = l2tp_sock_to_tunnel(sk_tun); + if (tunnel == NULL) + goto abort_put_sess; + + old_headroom = skb_headroom(skb); + if (skb_cow_head(skb, sizeof(ppph))) + goto abort_put_sess_tun; + + new_headroom = skb_headroom(skb); + skb->truesize += new_headroom - old_headroom; + + /* Setup PPP header */ + __skb_push(skb, sizeof(ppph)); + skb->data[0] = ppph[0]; + skb->data[1] = ppph[1]; + + l2tp_xmit_skb(session, skb, session->hdr_len); + + sock_put(sk_tun); + sock_put(sk); + return 1; + +abort_put_sess_tun: + sock_put(sk_tun); +abort_put_sess: + sock_put(sk); +abort: + /* Free the original skb */ + kfree_skb(skb); + return 1; +} + +/***************************************************************************** + * Session (and tunnel control) socket create/destroy. + *****************************************************************************/ + +/* Called by l2tp_core when a session socket is being closed. + */ +static void pppol2tp_session_close(struct l2tp_session *session) +{ + struct pppol2tp_session *ps = l2tp_session_priv(session); + struct sock *sk = ps->sock; + struct sk_buff *skb; + + BUG_ON(session->magic != L2TP_SESSION_MAGIC); + + if (session->session_id == 0) + goto out; + + if (sk != NULL) { + lock_sock(sk); + + if (sk->sk_state & (PPPOX_CONNECTED | PPPOX_BOUND)) { + pppox_unbind_sock(sk); + sk->sk_state = PPPOX_DEAD; + sk->sk_state_change(sk); + } + + /* Purge any queued data */ + skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge(&sk->sk_write_queue); + while ((skb = skb_dequeue(&session->reorder_q))) { + kfree_skb(skb); + sock_put(sk); + } + + release_sock(sk); + } + +out: + return; +} + +/* Really kill the session socket. (Called from sock_put() if + * refcnt == 0.) + */ +static void pppol2tp_session_destruct(struct sock *sk) +{ + struct l2tp_session *session; + + if (sk->sk_user_data != NULL) { + session = sk->sk_user_data; + if (session == NULL) + goto out; + + sk->sk_user_data = NULL; + BUG_ON(session->magic != L2TP_SESSION_MAGIC); + l2tp_session_dec_refcount(session); + } + +out: + return; +} + +/* Called when the PPPoX socket (session) is closed. + */ +static int pppol2tp_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct l2tp_session *session; + int error; + + if (!sk) + return 0; + + error = -EBADF; + lock_sock(sk); + if (sock_flag(sk, SOCK_DEAD) != 0) + goto error; + + pppox_unbind_sock(sk); + + /* Signal the death of the socket. 
*/ + sk->sk_state = PPPOX_DEAD; + sock_orphan(sk); + sock->sk = NULL; + + session = pppol2tp_sock_to_session(sk); + + /* Purge any queued data */ + skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge(&sk->sk_write_queue); + if (session != NULL) { + struct sk_buff *skb; + while ((skb = skb_dequeue(&session->reorder_q))) { + kfree_skb(skb); + sock_put(sk); + } + sock_put(sk); + } + + release_sock(sk); + + /* This will delete the session context via + * pppol2tp_session_destruct() if the socket's refcnt drops to + * zero. + */ + sock_put(sk); + + return 0; + +error: + release_sock(sk); + return error; +} + +static struct proto pppol2tp_sk_proto = { + .name = "PPPOL2TP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct pppox_sock), +}; + +static int pppol2tp_backlog_recv(struct sock *sk, struct sk_buff *skb) +{ + int rc; + + rc = l2tp_udp_encap_recv(sk, skb); + if (rc) + kfree_skb(skb); + + return NET_RX_SUCCESS; +} + +/* socket() handler. Initialize a new struct sock. + */ +static int pppol2tp_create(struct net *net, struct socket *sock) +{ + int error = -ENOMEM; + struct sock *sk; + + sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto); + if (!sk) + goto out; + + sock_init_data(sock, sk); + + sock->state = SS_UNCONNECTED; + sock->ops = &pppol2tp_ops; + + sk->sk_backlog_rcv = pppol2tp_backlog_recv; + sk->sk_protocol = PX_PROTO_OL2TP; + sk->sk_family = PF_PPPOX; + sk->sk_state = PPPOX_NONE; + sk->sk_type = SOCK_STREAM; + sk->sk_destruct = pppol2tp_session_destruct; + + error = 0; + +out: + return error; +} + +#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE) +static void pppol2tp_show(struct seq_file *m, void *arg) +{ + struct l2tp_session *session = arg; + struct pppol2tp_session *ps = l2tp_session_priv(session); + + if (ps) { + struct pppox_sock *po = pppox_sk(ps->sock); + if (po) + seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan)); + } +} +#endif + +/* connect() handler. Attach a PPPoX socket to a tunnel UDP socket + */ +static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, + int sockaddr_len, int flags) +{ + struct sock *sk = sock->sk; + struct sockaddr_pppol2tp *sp = (struct sockaddr_pppol2tp *) uservaddr; + struct sockaddr_pppol2tpv3 *sp3 = (struct sockaddr_pppol2tpv3 *) uservaddr; + struct pppox_sock *po = pppox_sk(sk); + struct l2tp_session *session = NULL; + struct l2tp_tunnel *tunnel; + struct pppol2tp_session *ps; + struct dst_entry *dst; + struct l2tp_session_cfg cfg = { 0, }; + int error = 0; + u32 tunnel_id, peer_tunnel_id; + u32 session_id, peer_session_id; + int ver = 2; + int fd; + + lock_sock(sk); + + error = -EINVAL; + if (sp->sa_protocol != PX_PROTO_OL2TP) + goto end; + + /* Check for already bound sockets */ + error = -EBUSY; + if (sk->sk_state & PPPOX_CONNECTED) + goto end; + + /* We don't supporting rebinding anyway */ + error = -EALREADY; + if (sk->sk_user_data) + goto end; /* socket is already attached */ + + /* Get params from socket address. 
Handle L2TPv2 and L2TPv3 */ + if (sockaddr_len == sizeof(struct sockaddr_pppol2tp)) { + fd = sp->pppol2tp.fd; + tunnel_id = sp->pppol2tp.s_tunnel; + peer_tunnel_id = sp->pppol2tp.d_tunnel; + session_id = sp->pppol2tp.s_session; + peer_session_id = sp->pppol2tp.d_session; + } else if (sockaddr_len == sizeof(struct sockaddr_pppol2tpv3)) { + ver = 3; + fd = sp3->pppol2tp.fd; + tunnel_id = sp3->pppol2tp.s_tunnel; + peer_tunnel_id = sp3->pppol2tp.d_tunnel; + session_id = sp3->pppol2tp.s_session; + peer_session_id = sp3->pppol2tp.d_session; + } else { + error = -EINVAL; + goto end; /* bad socket address */ + } + + /* Don't bind if tunnel_id is 0 */ + error = -EINVAL; + if (tunnel_id == 0) + goto end; + + tunnel = l2tp_tunnel_find(sock_net(sk), tunnel_id); + + /* Special case: create tunnel context if session_id and + * peer_session_id is 0. Otherwise look up tunnel using supplied + * tunnel id. + */ + if ((session_id == 0) && (peer_session_id == 0)) { + if (tunnel == NULL) { + struct l2tp_tunnel_cfg tcfg = { + .encap = L2TP_ENCAPTYPE_UDP, + .debug = 0, + }; + error = l2tp_tunnel_create(sock_net(sk), fd, ver, tunnel_id, peer_tunnel_id, &tcfg, &tunnel); + if (error < 0) + goto end; + } + } else { + /* Error if we can't find the tunnel */ + error = -ENOENT; + if (tunnel == NULL) + goto end; + + /* Error if socket is not prepped */ + if (tunnel->sock == NULL) + goto end; + } + + if (tunnel->recv_payload_hook == NULL) + tunnel->recv_payload_hook = pppol2tp_recv_payload_hook; + + if (tunnel->peer_tunnel_id == 0) { + if (ver == 2) + tunnel->peer_tunnel_id = sp->pppol2tp.d_tunnel; + else + tunnel->peer_tunnel_id = sp3->pppol2tp.d_tunnel; + } + + /* Create session if it doesn't already exist. We handle the + * case where a session was previously created by the netlink + * interface by checking that the session doesn't already have + * a socket and its tunnel socket are what we expect. If any + * of those checks fail, return EEXIST to the caller. + */ + session = l2tp_session_find(sock_net(sk), tunnel, session_id); + if (session == NULL) { + /* Default MTU must allow space for UDP/L2TP/PPP + * headers. + */ + cfg.mtu = cfg.mru = 1500 - PPPOL2TP_HEADER_OVERHEAD; + + /* Allocate and initialize a new session context. */ + session = l2tp_session_create(sizeof(struct pppol2tp_session), + tunnel, session_id, + peer_session_id, &cfg); + if (session == NULL) { + error = -ENOMEM; + goto end; + } + } else { + ps = l2tp_session_priv(session); + error = -EEXIST; + if (ps->sock != NULL) + goto end; + + /* consistency checks */ + if (ps->tunnel_sock != tunnel->sock) + goto end; + } + + /* Associate session with its PPPoL2TP socket */ + ps = l2tp_session_priv(session); + ps->owner = current->pid; + ps->sock = sk; + ps->tunnel_sock = tunnel->sock; + + session->recv_skb = pppol2tp_recv; + session->session_close = pppol2tp_session_close; +#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE) + session->show = pppol2tp_show; +#endif + + /* We need to know each time a skb is dropped from the reorder + * queue. + */ + session->ref = pppol2tp_session_sock_hold; + session->deref = pppol2tp_session_sock_put; + + /* If PMTU discovery was enabled, use the MTU that was discovered */ + dst = sk_dst_get(sk); + if (dst != NULL) { + u32 pmtu = dst_mtu(__sk_dst_get(sk)); + if (pmtu != 0) + session->mtu = session->mru = pmtu - + PPPOL2TP_HEADER_OVERHEAD; + dst_release(dst); + } + + /* Special case: if source & dest session_id == 0x0000, this + * socket is being created to manage the tunnel. 
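
As described above, connecting with both session IDs set to zero yields the special tunnel management socket. A sketch of that case, following the banner comment's pattern; tunnel_fd (the bound, connected UDP socket), tunnel_id and peer_tunnel_id are assumed to be known, and the UAPI header names are assumptions:

/* Illustrative sketch: create a tunnel management PPPoL2TP socket. */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/if_pppox.h>     /* PX_PROTO_OL2TP (header name assumed) */
#include <linux/if_pppol2tp.h>  /* struct sockaddr_pppol2tp (header name assumed) */

static int open_tunnel_mgmt_socket(int tunnel_fd, unsigned int tunnel_id,
                                   unsigned int peer_tunnel_id)
{
        struct sockaddr_pppol2tp sax;
        int fd = socket(AF_PPPOX, SOCK_DGRAM, PX_PROTO_OL2TP);

        if (fd < 0)
                return -1;

        memset(&sax, 0, sizeof(sax));
        sax.sa_family = AF_PPPOX;
        sax.sa_protocol = PX_PROTO_OL2TP;
        sax.pppol2tp.fd = tunnel_fd;            /* tunnel_id must be non-zero */
        sax.pppol2tp.s_tunnel = tunnel_id;
        sax.pppol2tp.d_tunnel = peer_tunnel_id;
        sax.pppol2tp.s_session = 0;             /* zero session IDs select the */
        sax.pppol2tp.d_session = 0;             /* tunnel management case      */

        if (connect(fd, (struct sockaddr *)&sax, sizeof(sax)) < 0) {
                close(fd);
                return -1;
        }

        return fd;
}

The resulting descriptor carries no data; it exists so that the tunnel-level ioctl() and [gs]etsockopt() handlers later in this file can be reached. The peer address fields may also be filled in, as in the banner example.
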
Just set up + * the internal context for use by ioctl() and sockopt() + * handlers. + */ + if ((session->session_id == 0) && + (session->peer_session_id == 0)) { + error = 0; + goto out_no_ppp; + } + + /* The only header we need to worry about is the L2TP + * header. This size is different depending on whether + * sequence numbers are enabled for the data channel. + */ + po->chan.hdrlen = PPPOL2TP_L2TP_HDR_SIZE_NOSEQ; + + po->chan.private = sk; + po->chan.ops = &pppol2tp_chan_ops; + po->chan.mtu = session->mtu; + + error = ppp_register_net_channel(sock_net(sk), &po->chan); + if (error) + goto end; + +out_no_ppp: + /* This is how we get the session context from the socket. */ + sk->sk_user_data = session; + sk->sk_state = PPPOX_CONNECTED; + PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO, + "%s: created\n", session->name); + +end: + release_sock(sk); + + return error; +} + +#ifdef CONFIG_L2TP_V3 + +/* Called when creating sessions via the netlink interface. + */ +static int pppol2tp_session_create(struct net *net, u32 tunnel_id, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg) +{ + int error; + struct l2tp_tunnel *tunnel; + struct l2tp_session *session; + struct pppol2tp_session *ps; + + tunnel = l2tp_tunnel_find(net, tunnel_id); + + /* Error if we can't find the tunnel */ + error = -ENOENT; + if (tunnel == NULL) + goto out; + + /* Error if tunnel socket is not prepped */ + if (tunnel->sock == NULL) + goto out; + + /* Check that this session doesn't already exist */ + error = -EEXIST; + session = l2tp_session_find(net, tunnel, session_id); + if (session != NULL) + goto out; + + /* Default MTU values. */ + if (cfg->mtu == 0) + cfg->mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD; + if (cfg->mru == 0) + cfg->mru = cfg->mtu; + + /* Allocate and initialize a new session context. */ + error = -ENOMEM; + session = l2tp_session_create(sizeof(struct pppol2tp_session), + tunnel, session_id, + peer_session_id, cfg); + if (session == NULL) + goto out; + + ps = l2tp_session_priv(session); + ps->tunnel_sock = tunnel->sock; + + PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO, + "%s: created\n", session->name); + + error = 0; + +out: + return error; +} + +/* Called when deleting sessions via the netlink interface. + */ +static int pppol2tp_session_delete(struct l2tp_session *session) +{ + struct pppol2tp_session *ps = l2tp_session_priv(session); + + if (ps->sock == NULL) + l2tp_session_dec_refcount(session); + + return 0; +} + +#endif /* CONFIG_L2TP_V3 */ + +/* getname() support. 
+ */ +static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, + int *usockaddr_len, int peer) +{ + int len = 0; + int error = 0; + struct l2tp_session *session; + struct l2tp_tunnel *tunnel; + struct sock *sk = sock->sk; + struct inet_sock *inet; + struct pppol2tp_session *pls; + + error = -ENOTCONN; + if (sk == NULL) + goto end; + if (sk->sk_state != PPPOX_CONNECTED) + goto end; + + error = -EBADF; + session = pppol2tp_sock_to_session(sk); + if (session == NULL) + goto end; + + pls = l2tp_session_priv(session); + tunnel = l2tp_sock_to_tunnel(pls->tunnel_sock); + if (tunnel == NULL) { + error = -EBADF; + goto end_put_sess; + } + + inet = inet_sk(tunnel->sock); + if (tunnel->version == 2) { + struct sockaddr_pppol2tp sp; + len = sizeof(sp); + memset(&sp, 0, len); + sp.sa_family = AF_PPPOX; + sp.sa_protocol = PX_PROTO_OL2TP; + sp.pppol2tp.fd = tunnel->fd; + sp.pppol2tp.pid = pls->owner; + sp.pppol2tp.s_tunnel = tunnel->tunnel_id; + sp.pppol2tp.d_tunnel = tunnel->peer_tunnel_id; + sp.pppol2tp.s_session = session->session_id; + sp.pppol2tp.d_session = session->peer_session_id; + sp.pppol2tp.addr.sin_family = AF_INET; + sp.pppol2tp.addr.sin_port = inet->inet_dport; + sp.pppol2tp.addr.sin_addr.s_addr = inet->inet_daddr; + memcpy(uaddr, &sp, len); + } else if (tunnel->version == 3) { + struct sockaddr_pppol2tpv3 sp; + len = sizeof(sp); + memset(&sp, 0, len); + sp.sa_family = AF_PPPOX; + sp.sa_protocol = PX_PROTO_OL2TP; + sp.pppol2tp.fd = tunnel->fd; + sp.pppol2tp.pid = pls->owner; + sp.pppol2tp.s_tunnel = tunnel->tunnel_id; + sp.pppol2tp.d_tunnel = tunnel->peer_tunnel_id; + sp.pppol2tp.s_session = session->session_id; + sp.pppol2tp.d_session = session->peer_session_id; + sp.pppol2tp.addr.sin_family = AF_INET; + sp.pppol2tp.addr.sin_port = inet->inet_dport; + sp.pppol2tp.addr.sin_addr.s_addr = inet->inet_daddr; + memcpy(uaddr, &sp, len); + } + + *usockaddr_len = len; + + sock_put(pls->tunnel_sock); +end_put_sess: + sock_put(sk); + error = 0; + +end: + return error; +} + +/**************************************************************************** + * ioctl() handlers. + * + * The PPPoX socket is created for L2TP sessions: tunnels have their own UDP + * sockets. However, in order to control kernel tunnel features, we allow + * userspace to create a special "tunnel" PPPoX socket which is used for + * control only. Tunnel PPPoX sockets have session_id == 0 and simply allow + * the user application to issue L2TP setsockopt(), getsockopt() and ioctl() + * calls. + ****************************************************************************/ + +static void pppol2tp_copy_stats(struct pppol2tp_ioc_stats *dest, + struct l2tp_stats *stats) +{ + dest->tx_packets = stats->tx_packets; + dest->tx_bytes = stats->tx_bytes; + dest->tx_errors = stats->tx_errors; + dest->rx_packets = stats->rx_packets; + dest->rx_bytes = stats->rx_bytes; + dest->rx_seq_discards = stats->rx_seq_discards; + dest->rx_oos_packets = stats->rx_oos_packets; + dest->rx_errors = stats->rx_errors; +} + +/* Session ioctl helper. 
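
Since getname() above reports the tunnel and session IDs together with the peer address of the tunnel socket, userspace can recover them with an ordinary getsockname() call. A minimal sketch for the L2TPv2 address family; session_fd is the connected socket from the earlier examples and the header name is an assumption:

/* Illustrative sketch: query the IDs bound to a connected PPPoL2TP socket. */
#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_pppol2tp.h>  /* struct sockaddr_pppol2tp (header name assumed) */

static void show_ids(int session_fd)
{
        struct sockaddr_pppol2tp sp;
        socklen_t len = sizeof(sp);

        if (getsockname(session_fd, (struct sockaddr *)&sp, &len) < 0) {
                perror("getsockname");
                return;
        }

        printf("tunnel %u -> %u, session %u -> %u\n",
               sp.pppol2tp.s_tunnel, sp.pppol2tp.d_tunnel,
               sp.pppol2tp.s_session, sp.pppol2tp.d_session);
}
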
+ */ +static int pppol2tp_session_ioctl(struct l2tp_session *session, + unsigned int cmd, unsigned long arg) +{ + struct ifreq ifr; + int err = 0; + struct sock *sk; + int val = (int) arg; + struct pppol2tp_session *ps = l2tp_session_priv(session); + struct l2tp_tunnel *tunnel = session->tunnel; + struct pppol2tp_ioc_stats stats; + + PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_DEBUG, + "%s: pppol2tp_session_ioctl(cmd=%#x, arg=%#lx)\n", + session->name, cmd, arg); + + sk = ps->sock; + sock_hold(sk); + + switch (cmd) { + case SIOCGIFMTU: + err = -ENXIO; + if (!(sk->sk_state & PPPOX_CONNECTED)) + break; + + err = -EFAULT; + if (copy_from_user(&ifr, (void __user *) arg, sizeof(struct ifreq))) + break; + ifr.ifr_mtu = session->mtu; + if (copy_to_user((void __user *) arg, &ifr, sizeof(struct ifreq))) + break; + + PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO, + "%s: get mtu=%d\n", session->name, session->mtu); + err = 0; + break; + + case SIOCSIFMTU: + err = -ENXIO; + if (!(sk->sk_state & PPPOX_CONNECTED)) + break; + + err = -EFAULT; + if (copy_from_user(&ifr, (void __user *) arg, sizeof(struct ifreq))) + break; + + session->mtu = ifr.ifr_mtu; + + PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO, + "%s: set mtu=%d\n", session->name, session->mtu); + err = 0; + break; + + case PPPIOCGMRU: + err = -ENXIO; + if (!(sk->sk_state & PPPOX_CONNECTED)) + break; + + err = -EFAULT; + if (put_user(session->mru, (int __user *) arg)) + break; + + PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO, + "%s: get mru=%d\n", session->name, session->mru); + err = 0; + break; + + case PPPIOCSMRU: + err = -ENXIO; + if (!(sk->sk_state & PPPOX_CONNECTED)) + break; + + err = -EFAULT; + if (get_user(val, (int __user *) arg)) + break; + + session->mru = val; + PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO, + "%s: set mru=%d\n", session->name, session->mru); + err = 0; + break; + + case PPPIOCGFLAGS: + err = -EFAULT; + if (put_user(ps->flags, (int __user *) arg)) + break; + + PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO, + "%s: get flags=%d\n", session->name, ps->flags); + err = 0; + break; + + case PPPIOCSFLAGS: + err = -EFAULT; + if (get_user(val, (int __user *) arg)) + break; + ps->flags = val; + PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO, + "%s: set flags=%d\n", session->name, ps->flags); + err = 0; + break; + + case PPPIOCGL2TPSTATS: + err = -ENXIO; + if (!(sk->sk_state & PPPOX_CONNECTED)) + break; + + memset(&stats, 0, sizeof(stats)); + stats.tunnel_id = tunnel->tunnel_id; + stats.session_id = session->session_id; + pppol2tp_copy_stats(&stats, &session->stats); + if (copy_to_user((void __user *) arg, &stats, + sizeof(stats))) + break; + PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO, + "%s: get L2TP stats\n", session->name); + err = 0; + break; + + default: + err = -ENOSYS; + break; + } + + sock_put(sk); + + return err; +} + +/* Tunnel ioctl helper. + * + * Note the special handling for PPPIOCGL2TPSTATS below. If the ioctl data + * specifies a session_id, the session ioctl handler is called. This allows an + * application to retrieve session stats via a tunnel socket. 
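
In practice an application can hold only the tunnel management socket and set session_id in the request to reach a particular session, exactly as the note above describes. A sketch, with the struct and ioctl taken from the kernel UAPI headers (header names are assumptions):

/* Illustrative sketch: fetch per-session counters through a tunnel socket. */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/if_ppp.h>       /* PPPIOCGL2TPSTATS (header name assumed) */
#include <linux/if_pppol2tp.h>  /* struct pppol2tp_ioc_stats (header name assumed) */

static void show_session_stats(int tunnel_mgmt_fd, unsigned int session_id)
{
        struct pppol2tp_ioc_stats stats;

        memset(&stats, 0, sizeof(stats));
        stats.session_id = session_id;  /* non-zero: forwarded to the session handler */

        if (ioctl(tunnel_mgmt_fd, PPPIOCGL2TPSTATS, &stats) < 0) {
                perror("PPPIOCGL2TPSTATS");
                return;
        }

        printf("tx %llu pkts / %llu bytes, rx %llu pkts / %llu bytes\n",
               (unsigned long long)stats.tx_packets,
               (unsigned long long)stats.tx_bytes,
               (unsigned long long)stats.rx_packets,
               (unsigned long long)stats.rx_bytes);
}
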
+ */ +static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel, + unsigned int cmd, unsigned long arg) +{ + int err = 0; + struct sock *sk; + struct pppol2tp_ioc_stats stats; + + PRINTK(tunnel->debug, PPPOL2TP_MSG_CONTROL, KERN_DEBUG, + "%s: pppol2tp_tunnel_ioctl(cmd=%#x, arg=%#lx)\n", + tunnel->name, cmd, arg); + + sk = tunnel->sock; + sock_hold(sk); + + switch (cmd) { + case PPPIOCGL2TPSTATS: + err = -ENXIO; + if (!(sk->sk_state & PPPOX_CONNECTED)) + break; + + if (copy_from_user(&stats, (void __user *) arg, + sizeof(stats))) { + err = -EFAULT; + break; + } + if (stats.session_id != 0) { + /* resend to session ioctl handler */ + struct l2tp_session *session = + l2tp_session_find(sock_net(sk), tunnel, stats.session_id); + if (session != NULL) + err = pppol2tp_session_ioctl(session, cmd, arg); + else + err = -EBADR; + break; + } +#ifdef CONFIG_XFRM + stats.using_ipsec = (sk->sk_policy[0] || sk->sk_policy[1]) ? 1 : 0; +#endif + pppol2tp_copy_stats(&stats, &tunnel->stats); + if (copy_to_user((void __user *) arg, &stats, sizeof(stats))) { + err = -EFAULT; + break; + } + PRINTK(tunnel->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO, + "%s: get L2TP stats\n", tunnel->name); + err = 0; + break; + + default: + err = -ENOSYS; + break; + } + + sock_put(sk); + + return err; +} + +/* Main ioctl() handler. + * Dispatch to tunnel or session helpers depending on the socket. + */ +static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + struct sock *sk = sock->sk; + struct l2tp_session *session; + struct l2tp_tunnel *tunnel; + struct pppol2tp_session *ps; + int err; + + if (!sk) + return 0; + + err = -EBADF; + if (sock_flag(sk, SOCK_DEAD) != 0) + goto end; + + err = -ENOTCONN; + if ((sk->sk_user_data == NULL) || + (!(sk->sk_state & (PPPOX_CONNECTED | PPPOX_BOUND)))) + goto end; + + /* Get session context from the socket */ + err = -EBADF; + session = pppol2tp_sock_to_session(sk); + if (session == NULL) + goto end; + + /* Special case: if session's session_id is zero, treat ioctl as a + * tunnel ioctl + */ + ps = l2tp_session_priv(session); + if ((session->session_id == 0) && + (session->peer_session_id == 0)) { + err = -EBADF; + tunnel = l2tp_sock_to_tunnel(ps->tunnel_sock); + if (tunnel == NULL) + goto end_put_sess; + + err = pppol2tp_tunnel_ioctl(tunnel, cmd, arg); + sock_put(ps->tunnel_sock); + goto end_put_sess; + } + + err = pppol2tp_session_ioctl(session, cmd, arg); + +end_put_sess: + sock_put(sk); +end: + return err; +} + +/***************************************************************************** + * setsockopt() / getsockopt() support. + * + * The PPPoX socket is created for L2TP sessions: tunnels have their own UDP + * sockets. In order to control kernel tunnel features, we allow userspace to + * create a special "tunnel" PPPoX socket which is used for control only. + * Tunnel PPPoX sockets have session_id == 0 and simply allow the user + * application to issue L2TP setsockopt(), getsockopt() and ioctl() calls. + *****************************************************************************/ + +/* Tunnel setsockopt() helper. + */ +static int pppol2tp_tunnel_setsockopt(struct sock *sk, + struct l2tp_tunnel *tunnel, + int optname, int val) +{ + int err = 0; + + switch (optname) { + case PPPOL2TP_SO_DEBUG: + tunnel->debug = val; + PRINTK(tunnel->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO, + "%s: set debug=%x\n", tunnel->name, tunnel->debug); + break; + + default: + err = -ENOPROTOOPT; + break; + } + + return err; +} + +/* Session setsockopt helper. 
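
The tunnel and session options handled in this section are driven from userspace with ordinary setsockopt() calls at the SOL_PPPOL2TP level. A short sketch that enables transmit sequence numbers and sets the debug mask on a session socket; the constant names are assumed to come from the L2TP UAPI header:

/* Illustrative sketch: tune a PPPoL2TP session socket. */
#include <sys/socket.h>
#include <linux/if_pppol2tp.h>  /* SOL_PPPOL2TP, PPPOL2TP_SO_* (header name assumed) */

static int tune_session(int session_fd, int debug_mask)
{
        int one = 1;

        if (setsockopt(session_fd, SOL_PPPOL2TP, PPPOL2TP_SO_SENDSEQ,
                       &one, sizeof(one)) < 0)
                return -1;

        return setsockopt(session_fd, SOL_PPPOL2TP, PPPOL2TP_SO_DEBUG,
                          &debug_mask, sizeof(debug_mask));
}

The same options can be read back with getsockopt(), which is dispatched to the tunnel or session helper in the same way.
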
+ */ +static int pppol2tp_session_setsockopt(struct sock *sk, + struct l2tp_session *session, + int optname, int val) +{ + int err = 0; + struct pppol2tp_session *ps = l2tp_session_priv(session); + + switch (optname) { + case PPPOL2TP_SO_RECVSEQ: + if ((val != 0) && (val != 1)) { + err = -EINVAL; + break; + } + session->recv_seq = val ? -1 : 0; + PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO, + "%s: set recv_seq=%d\n", session->name, session->recv_seq); + break; + + case PPPOL2TP_SO_SENDSEQ: + if ((val != 0) && (val != 1)) { + err = -EINVAL; + break; + } + session->send_seq = val ? -1 : 0; + { + struct sock *ssk = ps->sock; + struct pppox_sock *po = pppox_sk(ssk); + po->chan.hdrlen = val ? PPPOL2TP_L2TP_HDR_SIZE_SEQ : + PPPOL2TP_L2TP_HDR_SIZE_NOSEQ; + } + PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO, + "%s: set send_seq=%d\n", session->name, session->send_seq); + break; + + case PPPOL2TP_SO_LNSMODE: + if ((val != 0) && (val != 1)) { + err = -EINVAL; + break; + } + session->lns_mode = val ? -1 : 0; + PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO, + "%s: set lns_mode=%d\n", session->name, session->lns_mode); + break; + + case PPPOL2TP_SO_DEBUG: + session->debug = val; + PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO, + "%s: set debug=%x\n", session->name, session->debug); + break; + + case PPPOL2TP_SO_REORDERTO: + session->reorder_timeout = msecs_to_jiffies(val); + PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO, + "%s: set reorder_timeout=%d\n", session->name, session->reorder_timeout); + break; + + default: + err = -ENOPROTOOPT; + break; + } + + return err; +} + +/* Main setsockopt() entry point. + * Does API checks, then calls either the tunnel or session setsockopt + * handler, according to whether the PPPoL2TP socket is a for a regular + * session or the special tunnel type. + */ +static int pppol2tp_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct l2tp_session *session; + struct l2tp_tunnel *tunnel; + struct pppol2tp_session *ps; + int val; + int err; + + if (level != SOL_PPPOL2TP) + return udp_prot.setsockopt(sk, level, optname, optval, optlen); + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + err = -ENOTCONN; + if (sk->sk_user_data == NULL) + goto end; + + /* Get session context from the socket */ + err = -EBADF; + session = pppol2tp_sock_to_session(sk); + if (session == NULL) + goto end; + + /* Special case: if session_id == 0x0000, treat as operation on tunnel + */ + ps = l2tp_session_priv(session); + if ((session->session_id == 0) && + (session->peer_session_id == 0)) { + err = -EBADF; + tunnel = l2tp_sock_to_tunnel(ps->tunnel_sock); + if (tunnel == NULL) + goto end_put_sess; + + err = pppol2tp_tunnel_setsockopt(sk, tunnel, optname, val); + sock_put(ps->tunnel_sock); + } else + err = pppol2tp_session_setsockopt(sk, session, optname, val); + + err = 0; + +end_put_sess: + sock_put(sk); +end: + return err; +} + +/* Tunnel getsockopt helper. Called with sock locked. + */ +static int pppol2tp_tunnel_getsockopt(struct sock *sk, + struct l2tp_tunnel *tunnel, + int optname, int *val) +{ + int err = 0; + + switch (optname) { + case PPPOL2TP_SO_DEBUG: + *val = tunnel->debug; + PRINTK(tunnel->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO, + "%s: get debug=%x\n", tunnel->name, tunnel->debug); + break; + + default: + err = -ENOPROTOOPT; + break; + } + + return err; +} + +/* Session getsockopt helper. 
Called with sock locked. + */ +static int pppol2tp_session_getsockopt(struct sock *sk, + struct l2tp_session *session, + int optname, int *val) +{ + int err = 0; + + switch (optname) { + case PPPOL2TP_SO_RECVSEQ: + *val = session->recv_seq; + PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO, + "%s: get recv_seq=%d\n", session->name, *val); + break; + + case PPPOL2TP_SO_SENDSEQ: + *val = session->send_seq; + PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO, + "%s: get send_seq=%d\n", session->name, *val); + break; + + case PPPOL2TP_SO_LNSMODE: + *val = session->lns_mode; + PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO, + "%s: get lns_mode=%d\n", session->name, *val); + break; + + case PPPOL2TP_SO_DEBUG: + *val = session->debug; + PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO, + "%s: get debug=%d\n", session->name, *val); + break; + + case PPPOL2TP_SO_REORDERTO: + *val = (int) jiffies_to_msecs(session->reorder_timeout); + PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO, + "%s: get reorder_timeout=%d\n", session->name, *val); + break; + + default: + err = -ENOPROTOOPT; + } + + return err; +} + +/* Main getsockopt() entry point. + * Does API checks, then calls either the tunnel or session getsockopt + * handler, according to whether the PPPoX socket is a for a regular session + * or the special tunnel type. + */ +static int pppol2tp_getsockopt(struct socket *sock, int level, + int optname, char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct l2tp_session *session; + struct l2tp_tunnel *tunnel; + int val, len; + int err; + struct pppol2tp_session *ps; + + if (level != SOL_PPPOL2TP) + return udp_prot.getsockopt(sk, level, optname, optval, optlen); + + if (get_user(len, (int __user *) optlen)) + return -EFAULT; + + len = min_t(unsigned int, len, sizeof(int)); + + if (len < 0) + return -EINVAL; + + err = -ENOTCONN; + if (sk->sk_user_data == NULL) + goto end; + + /* Get the session context */ + err = -EBADF; + session = pppol2tp_sock_to_session(sk); + if (session == NULL) + goto end; + + /* Special case: if session_id == 0x0000, treat as operation on tunnel */ + ps = l2tp_session_priv(session); + if ((session->session_id == 0) && + (session->peer_session_id == 0)) { + err = -EBADF; + tunnel = l2tp_sock_to_tunnel(ps->tunnel_sock); + if (tunnel == NULL) + goto end_put_sess; + + err = pppol2tp_tunnel_getsockopt(sk, tunnel, optname, &val); + sock_put(ps->tunnel_sock); + } else + err = pppol2tp_session_getsockopt(sk, session, optname, &val); + + err = -EFAULT; + if (put_user(len, (int __user *) optlen)) + goto end_put_sess; + + if (copy_to_user((void __user *) optval, &val, len)) + goto end_put_sess; + + err = 0; + +end_put_sess: + sock_put(sk); +end: + return err; +} + +/***************************************************************************** + * /proc filesystem for debug + * Since the original pppol2tp driver provided /proc/net/pppol2tp for + * L2TPv2, we dump only L2TPv2 tunnels and sessions here. 
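
The file registered by this section, /proc/net/pppol2tp, is plain text whose layout is emitted by pppol2tp_seq_show() below; any file reader will do. A trivial sketch for completeness:

/* Illustrative sketch: dump the pppol2tp debug file. */
#include <stdio.h>

static void dump_pppol2tp_proc(void)
{
        char line[256];
        FILE *f = fopen("/proc/net/pppol2tp", "r");

        if (!f) {
                perror("/proc/net/pppol2tp");
                return;
        }

        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);

        fclose(f);
}
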
+ *****************************************************************************/ + +static unsigned int pppol2tp_net_id; + +#ifdef CONFIG_PROC_FS + +struct pppol2tp_seq_data { + struct seq_net_private p; + int tunnel_idx; /* current tunnel */ + int session_idx; /* index of session within current tunnel */ + struct l2tp_tunnel *tunnel; + struct l2tp_session *session; /* NULL means get next tunnel */ +}; + +static void pppol2tp_next_tunnel(struct net *net, struct pppol2tp_seq_data *pd) +{ + for (;;) { + pd->tunnel = l2tp_tunnel_find_nth(net, pd->tunnel_idx); + pd->tunnel_idx++; + + if (pd->tunnel == NULL) + break; + + /* Ignore L2TPv3 tunnels */ + if (pd->tunnel->version < 3) + break; + } +} + +static void pppol2tp_next_session(struct net *net, struct pppol2tp_seq_data *pd) +{ + pd->session = l2tp_session_find_nth(pd->tunnel, pd->session_idx); + pd->session_idx++; + + if (pd->session == NULL) { + pd->session_idx = 0; + pppol2tp_next_tunnel(net, pd); + } +} + +static void *pppol2tp_seq_start(struct seq_file *m, loff_t *offs) +{ + struct pppol2tp_seq_data *pd = SEQ_START_TOKEN; + loff_t pos = *offs; + struct net *net; + + if (!pos) + goto out; + + BUG_ON(m->private == NULL); + pd = m->private; + net = seq_file_net(m); + + if (pd->tunnel == NULL) + pppol2tp_next_tunnel(net, pd); + else + pppol2tp_next_session(net, pd); + + /* NULL tunnel and session indicates end of list */ + if ((pd->tunnel == NULL) && (pd->session == NULL)) + pd = NULL; + +out: + return pd; +} + +static void *pppol2tp_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return NULL; +} + +static void pppol2tp_seq_stop(struct seq_file *p, void *v) +{ + /* nothing to do */ +} + +static void pppol2tp_seq_tunnel_show(struct seq_file *m, void *v) +{ + struct l2tp_tunnel *tunnel = v; + + seq_printf(m, "\nTUNNEL '%s', %c %d\n", + tunnel->name, + (tunnel == tunnel->sock->sk_user_data) ? 'Y' : 'N', + atomic_read(&tunnel->ref_count) - 1); + seq_printf(m, " %08x %llu/%llu/%llu %llu/%llu/%llu\n", + tunnel->debug, + (unsigned long long)tunnel->stats.tx_packets, + (unsigned long long)tunnel->stats.tx_bytes, + (unsigned long long)tunnel->stats.tx_errors, + (unsigned long long)tunnel->stats.rx_packets, + (unsigned long long)tunnel->stats.rx_bytes, + (unsigned long long)tunnel->stats.rx_errors); +} + +static void pppol2tp_seq_session_show(struct seq_file *m, void *v) +{ + struct l2tp_session *session = v; + struct l2tp_tunnel *tunnel = session->tunnel; + struct pppol2tp_session *ps = l2tp_session_priv(session); + struct pppox_sock *po = pppox_sk(ps->sock); + u32 ip = 0; + u16 port = 0; + + if (tunnel->sock) { + struct inet_sock *inet = inet_sk(tunnel->sock); + ip = ntohl(inet->inet_saddr); + port = ntohs(inet->inet_sport); + } + + seq_printf(m, " SESSION '%s' %08X/%d %04X/%04X -> " + "%04X/%04X %d %c\n", + session->name, ip, port, + tunnel->tunnel_id, + session->session_id, + tunnel->peer_tunnel_id, + session->peer_session_id, + ps->sock->sk_state, + (session == ps->sock->sk_user_data) ? + 'Y' : 'N'); + seq_printf(m, " %d/%d/%c/%c/%s %08x %u\n", + session->mtu, session->mru, + session->recv_seq ? 'R' : '-', + session->send_seq ? 'S' : '-', + session->lns_mode ? 
"LNS" : "LAC", + session->debug, + jiffies_to_msecs(session->reorder_timeout)); + seq_printf(m, " %hu/%hu %llu/%llu/%llu %llu/%llu/%llu\n", + session->nr, session->ns, + (unsigned long long)session->stats.tx_packets, + (unsigned long long)session->stats.tx_bytes, + (unsigned long long)session->stats.tx_errors, + (unsigned long long)session->stats.rx_packets, + (unsigned long long)session->stats.rx_bytes, + (unsigned long long)session->stats.rx_errors); + + if (po) + seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan)); +} + +static int pppol2tp_seq_show(struct seq_file *m, void *v) +{ + struct pppol2tp_seq_data *pd = v; + + /* display header on line 1 */ + if (v == SEQ_START_TOKEN) { + seq_puts(m, "PPPoL2TP driver info, " PPPOL2TP_DRV_VERSION "\n"); + seq_puts(m, "TUNNEL name, user-data-ok session-count\n"); + seq_puts(m, " debug tx-pkts/bytes/errs rx-pkts/bytes/errs\n"); + seq_puts(m, " SESSION name, addr/port src-tid/sid " + "dest-tid/sid state user-data-ok\n"); + seq_puts(m, " mtu/mru/rcvseq/sendseq/lns debug reorderto\n"); + seq_puts(m, " nr/ns tx-pkts/bytes/errs rx-pkts/bytes/errs\n"); + goto out; + } + + /* Show the tunnel or session context. + */ + if (pd->session == NULL) + pppol2tp_seq_tunnel_show(m, pd->tunnel); + else + pppol2tp_seq_session_show(m, pd->session); + +out: + return 0; +} + +static const struct seq_operations pppol2tp_seq_ops = { + .start = pppol2tp_seq_start, + .next = pppol2tp_seq_next, + .stop = pppol2tp_seq_stop, + .show = pppol2tp_seq_show, +}; + +/* Called when our /proc file is opened. We allocate data for use when + * iterating our tunnel / session contexts and store it in the private + * data of the seq_file. + */ +static int pppol2tp_proc_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &pppol2tp_seq_ops, + sizeof(struct pppol2tp_seq_data)); +} + +static const struct file_operations pppol2tp_proc_fops = { + .owner = THIS_MODULE, + .open = pppol2tp_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +#endif /* CONFIG_PROC_FS */ + +/***************************************************************************** + * Network namespace + *****************************************************************************/ + +static __net_init int pppol2tp_init_net(struct net *net) +{ + struct proc_dir_entry *pde; + int err = 0; + + pde = proc_net_fops_create(net, "pppol2tp", S_IRUGO, &pppol2tp_proc_fops); + if (!pde) { + err = -ENOMEM; + goto out; + } + +out: + return err; +} + +static __net_exit void pppol2tp_exit_net(struct net *net) +{ + proc_net_remove(net, "pppol2tp"); +} + +static struct pernet_operations pppol2tp_net_ops = { + .init = pppol2tp_init_net, + .exit = pppol2tp_exit_net, + .id = &pppol2tp_net_id, +}; + +/***************************************************************************** + * Init and cleanup + *****************************************************************************/ + +static const struct proto_ops pppol2tp_ops = { + .family = AF_PPPOX, + .owner = THIS_MODULE, + .release = pppol2tp_release, + .bind = sock_no_bind, + .connect = pppol2tp_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = pppol2tp_getname, + .poll = datagram_poll, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = pppol2tp_setsockopt, + .getsockopt = pppol2tp_getsockopt, + .sendmsg = pppol2tp_sendmsg, + .recvmsg = pppol2tp_recvmsg, + .mmap = sock_no_mmap, + .ioctl = pppox_ioctl, +}; + +static const struct pppox_proto pppol2tp_proto = { + 
.create = pppol2tp_create, + .ioctl = pppol2tp_ioctl +}; + +#ifdef CONFIG_L2TP_V3 + +static const struct l2tp_nl_cmd_ops pppol2tp_nl_cmd_ops = { + .session_create = pppol2tp_session_create, + .session_delete = pppol2tp_session_delete, +}; + +#endif /* CONFIG_L2TP_V3 */ + +static int __init pppol2tp_init(void) +{ + int err; + + err = register_pernet_device(&pppol2tp_net_ops); + if (err) + goto out; + + err = proto_register(&pppol2tp_sk_proto, 0); + if (err) + goto out_unregister_pppol2tp_pernet; + + err = register_pppox_proto(PX_PROTO_OL2TP, &pppol2tp_proto); + if (err) + goto out_unregister_pppol2tp_proto; + +#ifdef CONFIG_L2TP_V3 + err = l2tp_nl_register_ops(L2TP_PWTYPE_PPP, &pppol2tp_nl_cmd_ops); + if (err) + goto out_unregister_pppox; +#endif + + printk(KERN_INFO "PPPoL2TP kernel driver, %s\n", + PPPOL2TP_DRV_VERSION); + +out: + return err; + +#ifdef CONFIG_L2TP_V3 +out_unregister_pppox: + unregister_pppox_proto(PX_PROTO_OL2TP); +#endif +out_unregister_pppol2tp_proto: + proto_unregister(&pppol2tp_sk_proto); +out_unregister_pppol2tp_pernet: + unregister_pernet_device(&pppol2tp_net_ops); + goto out; +} + +static void __exit pppol2tp_exit(void) +{ +#ifdef CONFIG_L2TP_V3 + l2tp_nl_unregister_ops(L2TP_PWTYPE_PPP); +#endif + unregister_pppox_proto(PX_PROTO_OL2TP); + proto_unregister(&pppol2tp_sk_proto); + unregister_pernet_device(&pppol2tp_net_ops); +} + +module_init(pppol2tp_init); +module_exit(pppol2tp_exit); + +MODULE_AUTHOR("James Chapman "); +MODULE_DESCRIPTION("PPP over L2TP over UDP"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(PPPOL2TP_DRV_VERSION); diff --git a/net/lapb/Kconfig b/net/lapb/Kconfig new file mode 100644 index 00000000..f0b5efb3 --- /dev/null +++ b/net/lapb/Kconfig @@ -0,0 +1,22 @@ +# +# LAPB Data Link Drive +# + +config LAPB + tristate "LAPB Data Link Driver (EXPERIMENTAL)" + depends on EXPERIMENTAL + ---help--- + Link Access Procedure, Balanced (LAPB) is the data link layer (i.e. + the lower) part of the X.25 protocol. It offers a reliable + connection service to exchange data frames with one other host, and + it is used to transport higher level protocols (mostly X.25 Packet + Layer, the higher part of X.25, but others are possible as well). + Usually, LAPB is used with specialized X.21 network cards, but Linux + currently supports LAPB only over Ethernet connections. If you want + to use LAPB connections over Ethernet, say Y here and to "LAPB over + Ethernet driver" below. Read + for technical + details. + + To compile this driver as a module, choose M here: the + module will be called lapb. If unsure, say N. diff --git a/net/lapb/Makefile b/net/lapb/Makefile new file mode 100644 index 00000000..fff797df --- /dev/null +++ b/net/lapb/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux LAPB layer. +# + +obj-$(CONFIG_LAPB) += lapb.o + +lapb-y := lapb_in.o lapb_out.o lapb_subr.o lapb_timer.o lapb_iface.o diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c new file mode 100644 index 00000000..d5d8d555 --- /dev/null +++ b/net/lapb/lapb_iface.c @@ -0,0 +1,450 @@ +/* + * LAPB release 002 + * + * This code REQUIRES 2.1.15 or higher/ NET3.038 + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * LAPB 001 Jonathan Naylor Started Coding + * LAPB 002 Jonathan Naylor New timer architecture. 
+ * 2000-10-29 Henner Eisen lapb_data_indication() return status. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static LIST_HEAD(lapb_list); +static DEFINE_RWLOCK(lapb_list_lock); + +/* + * Free an allocated lapb control block. + */ +static void lapb_free_cb(struct lapb_cb *lapb) +{ + kfree(lapb); +} + +static __inline__ void lapb_hold(struct lapb_cb *lapb) +{ + atomic_inc(&lapb->refcnt); +} + +static __inline__ void lapb_put(struct lapb_cb *lapb) +{ + if (atomic_dec_and_test(&lapb->refcnt)) + lapb_free_cb(lapb); +} + +/* + * Socket removal during an interrupt is now safe. + */ +static void __lapb_remove_cb(struct lapb_cb *lapb) +{ + if (lapb->node.next) { + list_del(&lapb->node); + lapb_put(lapb); + } +} + +/* + * Add a socket to the bound sockets list. + */ +static void __lapb_insert_cb(struct lapb_cb *lapb) +{ + list_add(&lapb->node, &lapb_list); + lapb_hold(lapb); +} + +static struct lapb_cb *__lapb_devtostruct(struct net_device *dev) +{ + struct list_head *entry; + struct lapb_cb *lapb, *use = NULL; + + list_for_each(entry, &lapb_list) { + lapb = list_entry(entry, struct lapb_cb, node); + if (lapb->dev == dev) { + use = lapb; + break; + } + } + + if (use) + lapb_hold(use); + + return use; +} + +static struct lapb_cb *lapb_devtostruct(struct net_device *dev) +{ + struct lapb_cb *rc; + + read_lock_bh(&lapb_list_lock); + rc = __lapb_devtostruct(dev); + read_unlock_bh(&lapb_list_lock); + + return rc; +} +/* + * Create an empty LAPB control block. + */ +static struct lapb_cb *lapb_create_cb(void) +{ + struct lapb_cb *lapb = kzalloc(sizeof(*lapb), GFP_ATOMIC); + + + if (!lapb) + goto out; + + skb_queue_head_init(&lapb->write_queue); + skb_queue_head_init(&lapb->ack_queue); + + init_timer(&lapb->t1timer); + init_timer(&lapb->t2timer); + + lapb->t1 = LAPB_DEFAULT_T1; + lapb->t2 = LAPB_DEFAULT_T2; + lapb->n2 = LAPB_DEFAULT_N2; + lapb->mode = LAPB_DEFAULT_MODE; + lapb->window = LAPB_DEFAULT_WINDOW; + lapb->state = LAPB_STATE_0; + atomic_set(&lapb->refcnt, 1); +out: + return lapb; +} + +int lapb_register(struct net_device *dev, struct lapb_register_struct *callbacks) +{ + struct lapb_cb *lapb; + int rc = LAPB_BADTOKEN; + + write_lock_bh(&lapb_list_lock); + + lapb = __lapb_devtostruct(dev); + if (lapb) { + lapb_put(lapb); + goto out; + } + + lapb = lapb_create_cb(); + rc = LAPB_NOMEM; + if (!lapb) + goto out; + + lapb->dev = dev; + lapb->callbacks = *callbacks; + + __lapb_insert_cb(lapb); + + lapb_start_t1timer(lapb); + + rc = LAPB_OK; +out: + write_unlock_bh(&lapb_list_lock); + return rc; +} + +int lapb_unregister(struct net_device *dev) +{ + struct lapb_cb *lapb; + int rc = LAPB_BADTOKEN; + + write_lock_bh(&lapb_list_lock); + lapb = __lapb_devtostruct(dev); + if (!lapb) + goto out; + + lapb_stop_t1timer(lapb); + lapb_stop_t2timer(lapb); + + lapb_clear_queues(lapb); + + __lapb_remove_cb(lapb); + + lapb_put(lapb); + rc = LAPB_OK; +out: + write_unlock_bh(&lapb_list_lock); + return rc; +} + +int lapb_getparms(struct net_device *dev, struct lapb_parms_struct *parms) +{ + int rc = LAPB_BADTOKEN; + struct lapb_cb *lapb = lapb_devtostruct(dev); + + if (!lapb) + goto out; + + parms->t1 = lapb->t1 / HZ; + parms->t2 = lapb->t2 / HZ; + parms->n2 = lapb->n2; + parms->n2count = lapb->n2count; + parms->state = lapb->state; + parms->window = lapb->window; + parms->mode = lapb->mode; + + if 
(!timer_pending(&lapb->t1timer)) + parms->t1timer = 0; + else + parms->t1timer = (lapb->t1timer.expires - jiffies) / HZ; + + if (!timer_pending(&lapb->t2timer)) + parms->t2timer = 0; + else + parms->t2timer = (lapb->t2timer.expires - jiffies) / HZ; + + lapb_put(lapb); + rc = LAPB_OK; +out: + return rc; +} + +int lapb_setparms(struct net_device *dev, struct lapb_parms_struct *parms) +{ + int rc = LAPB_BADTOKEN; + struct lapb_cb *lapb = lapb_devtostruct(dev); + + if (!lapb) + goto out; + + rc = LAPB_INVALUE; + if (parms->t1 < 1 || parms->t2 < 1 || parms->n2 < 1) + goto out_put; + + if (lapb->state == LAPB_STATE_0) { + if (parms->mode & LAPB_EXTENDED) { + if (parms->window < 1 || parms->window > 127) + goto out_put; + } else { + if (parms->window < 1 || parms->window > 7) + goto out_put; + } + lapb->mode = parms->mode; + lapb->window = parms->window; + } + + lapb->t1 = parms->t1 * HZ; + lapb->t2 = parms->t2 * HZ; + lapb->n2 = parms->n2; + + rc = LAPB_OK; +out_put: + lapb_put(lapb); +out: + return rc; +} + +int lapb_connect_request(struct net_device *dev) +{ + struct lapb_cb *lapb = lapb_devtostruct(dev); + int rc = LAPB_BADTOKEN; + + if (!lapb) + goto out; + + rc = LAPB_OK; + if (lapb->state == LAPB_STATE_1) + goto out_put; + + rc = LAPB_CONNECTED; + if (lapb->state == LAPB_STATE_3 || lapb->state == LAPB_STATE_4) + goto out_put; + + lapb_establish_data_link(lapb); + +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S0 -> S1\n", lapb->dev); +#endif + lapb->state = LAPB_STATE_1; + + rc = LAPB_OK; +out_put: + lapb_put(lapb); +out: + return rc; +} + +int lapb_disconnect_request(struct net_device *dev) +{ + struct lapb_cb *lapb = lapb_devtostruct(dev); + int rc = LAPB_BADTOKEN; + + if (!lapb) + goto out; + + switch (lapb->state) { + case LAPB_STATE_0: + rc = LAPB_NOTCONNECTED; + goto out_put; + + case LAPB_STATE_1: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 TX DISC(1)\n", lapb->dev); +#endif +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S1 -> S0\n", lapb->dev); +#endif + lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND); + lapb->state = LAPB_STATE_0; + lapb_start_t1timer(lapb); + rc = LAPB_NOTCONNECTED; + goto out_put; + + case LAPB_STATE_2: + rc = LAPB_OK; + goto out_put; + } + + lapb_clear_queues(lapb); + lapb->n2count = 0; + lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND); + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_2; + +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 DISC(1)\n", lapb->dev); +#endif +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S3 -> S2\n", lapb->dev); +#endif + + rc = LAPB_OK; +out_put: + lapb_put(lapb); +out: + return rc; +} + +int lapb_data_request(struct net_device *dev, struct sk_buff *skb) +{ + struct lapb_cb *lapb = lapb_devtostruct(dev); + int rc = LAPB_BADTOKEN; + + if (!lapb) + goto out; + + rc = LAPB_NOTCONNECTED; + if (lapb->state != LAPB_STATE_3 && lapb->state != LAPB_STATE_4) + goto out_put; + + skb_queue_tail(&lapb->write_queue, skb); + lapb_kick(lapb); + rc = LAPB_OK; +out_put: + lapb_put(lapb); +out: + return rc; +} + +int lapb_data_received(struct net_device *dev, struct sk_buff *skb) +{ + struct lapb_cb *lapb = lapb_devtostruct(dev); + int rc = LAPB_BADTOKEN; + + if (lapb) { + lapb_data_input(lapb, skb); + lapb_put(lapb); + rc = LAPB_OK; + } + + return rc; +} + +void lapb_connect_confirmation(struct lapb_cb *lapb, int reason) +{ + if (lapb->callbacks.connect_confirmation) + lapb->callbacks.connect_confirmation(lapb->dev, reason); +} + +void 
lapb_connect_indication(struct lapb_cb *lapb, int reason) +{ + if (lapb->callbacks.connect_indication) + lapb->callbacks.connect_indication(lapb->dev, reason); +} + +void lapb_disconnect_confirmation(struct lapb_cb *lapb, int reason) +{ + if (lapb->callbacks.disconnect_confirmation) + lapb->callbacks.disconnect_confirmation(lapb->dev, reason); +} + +void lapb_disconnect_indication(struct lapb_cb *lapb, int reason) +{ + if (lapb->callbacks.disconnect_indication) + lapb->callbacks.disconnect_indication(lapb->dev, reason); +} + +int lapb_data_indication(struct lapb_cb *lapb, struct sk_buff *skb) +{ + if (lapb->callbacks.data_indication) + return lapb->callbacks.data_indication(lapb->dev, skb); + + kfree_skb(skb); + return NET_RX_SUCCESS; /* For now; must be != NET_RX_DROP */ +} + +int lapb_data_transmit(struct lapb_cb *lapb, struct sk_buff *skb) +{ + int used = 0; + + if (lapb->callbacks.data_transmit) { + lapb->callbacks.data_transmit(lapb->dev, skb); + used = 1; + } + + return used; +} + +EXPORT_SYMBOL(lapb_register); +EXPORT_SYMBOL(lapb_unregister); +EXPORT_SYMBOL(lapb_getparms); +EXPORT_SYMBOL(lapb_setparms); +EXPORT_SYMBOL(lapb_connect_request); +EXPORT_SYMBOL(lapb_disconnect_request); +EXPORT_SYMBOL(lapb_data_request); +EXPORT_SYMBOL(lapb_data_received); + +static int __init lapb_init(void) +{ + return 0; +} + +static void __exit lapb_exit(void) +{ + WARN_ON(!list_empty(&lapb_list)); +} + +MODULE_AUTHOR("Jonathan Naylor "); +MODULE_DESCRIPTION("The X.25 Link Access Procedure B link layer protocol"); +MODULE_LICENSE("GPL"); + +module_init(lapb_init); +module_exit(lapb_exit); diff --git a/net/lapb/lapb_in.c b/net/lapb/lapb_in.c new file mode 100644 index 00000000..21904a00 --- /dev/null +++ b/net/lapb/lapb_in.c @@ -0,0 +1,724 @@ +/* + * LAPB release 002 + * + * This code REQUIRES 2.1.15 or higher/ NET3.038 + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * LAPB 001 Jonathan Naulor Started Coding + * LAPB 002 Jonathan Naylor New timer architecture. + * 2000-10-29 Henner Eisen lapb_data_indication() return status. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * State machine for state 0, Disconnected State. + * The handling of the timer(s) is in file lapb_timer.c. 
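
The exported interface above (lapb_register() and the lapb_register_struct callbacks) is how a device driver attaches a LAPB instance to its net_device. A kernel-side sketch with hypothetical callback bodies; it is illustrative only, and the header name is an assumption:

/* Illustrative sketch: a driver wiring itself up to the LAPB layer. */
#include <linux/lapb.h>         /* lapb_register(), LAPB_OK (header name assumed) */
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/errno.h>

static int my_data_indication(struct net_device *dev, struct sk_buff *skb)
{
        /* Hand the received packet to the upper layer, e.g. X.25. */
        return netif_rx(skb);
}

static void my_data_transmit(struct net_device *dev, struct sk_buff *skb)
{
        /* Queue the LAPB frame on the physical device. */
        dev_queue_xmit(skb);
}

static int my_attach_lapb(struct net_device *dev)
{
        struct lapb_register_struct cb = {
                .data_indication = my_data_indication,
                .data_transmit   = my_data_transmit,
                /* the connect/disconnect notification hooks may also be set */
        };

        return lapb_register(dev, &cb) == LAPB_OK ? 0 : -ENODEV;
}

Timer, retry and window parameters can then be adjusted with lapb_getparms()/lapb_setparms() before lapb_connect_request() is called to bring the link up.
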
+ */ +static void lapb_state0_machine(struct lapb_cb *lapb, struct sk_buff *skb, + struct lapb_frame *frame) +{ + switch (frame->type) { + case LAPB_SABM: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S0 RX SABM(%d)\n", + lapb->dev, frame->pf); +#endif + if (lapb->mode & LAPB_EXTENDED) { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S0 TX DM(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_DM, frame->pf, + LAPB_RESPONSE); + } else { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S0 TX UA(%d)\n", + lapb->dev, frame->pf); +#endif +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S0 -> S3\n", + lapb->dev); +#endif + lapb_send_control(lapb, LAPB_UA, frame->pf, + LAPB_RESPONSE); + lapb_stop_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_3; + lapb->condition = 0x00; + lapb->n2count = 0; + lapb->vs = 0; + lapb->vr = 0; + lapb->va = 0; + lapb_connect_indication(lapb, LAPB_OK); + } + break; + + case LAPB_SABME: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S0 RX SABME(%d)\n", + lapb->dev, frame->pf); +#endif + if (lapb->mode & LAPB_EXTENDED) { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S0 TX UA(%d)\n", + lapb->dev, frame->pf); +#endif +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S0 -> S3\n", + lapb->dev); +#endif + lapb_send_control(lapb, LAPB_UA, frame->pf, + LAPB_RESPONSE); + lapb_stop_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_3; + lapb->condition = 0x00; + lapb->n2count = 0; + lapb->vs = 0; + lapb->vr = 0; + lapb->va = 0; + lapb_connect_indication(lapb, LAPB_OK); + } else { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S0 TX DM(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_DM, frame->pf, + LAPB_RESPONSE); + } + break; + + case LAPB_DISC: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S0 RX DISC(%d)\n", + lapb->dev, frame->pf); + printk(KERN_DEBUG "lapb: (%p) S0 TX UA(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_UA, frame->pf, + LAPB_RESPONSE); + break; + + default: + break; + } + + kfree_skb(skb); +} + +/* + * State machine for state 1, Awaiting Connection State. + * The handling of the timer(s) is in file lapb_timer.c. 
+ */ +static void lapb_state1_machine(struct lapb_cb *lapb, struct sk_buff *skb, + struct lapb_frame *frame) +{ + switch (frame->type) { + case LAPB_SABM: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 RX SABM(%d)\n", + lapb->dev, frame->pf); +#endif + if (lapb->mode & LAPB_EXTENDED) { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 TX DM(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_DM, frame->pf, + LAPB_RESPONSE); + } else { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 TX UA(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_UA, frame->pf, + LAPB_RESPONSE); + } + break; + + case LAPB_SABME: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 RX SABME(%d)\n", + lapb->dev, frame->pf); +#endif + if (lapb->mode & LAPB_EXTENDED) { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 TX UA(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_UA, frame->pf, + LAPB_RESPONSE); + } else { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 TX DM(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_DM, frame->pf, + LAPB_RESPONSE); + } + break; + + case LAPB_DISC: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 RX DISC(%d)\n", + lapb->dev, frame->pf); + printk(KERN_DEBUG "lapb: (%p) S1 TX DM(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_DM, frame->pf, + LAPB_RESPONSE); + break; + + case LAPB_UA: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 RX UA(%d)\n", + lapb->dev, frame->pf); +#endif + if (frame->pf) { +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S1 -> S3\n", + lapb->dev); +#endif + lapb_stop_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_3; + lapb->condition = 0x00; + lapb->n2count = 0; + lapb->vs = 0; + lapb->vr = 0; + lapb->va = 0; + lapb_connect_confirmation(lapb, LAPB_OK); + } + break; + + case LAPB_DM: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 RX DM(%d)\n", + lapb->dev, frame->pf); +#endif + if (frame->pf) { +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S1 -> S0\n", + lapb->dev); +#endif + lapb_clear_queues(lapb); + lapb->state = LAPB_STATE_0; + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb_disconnect_indication(lapb, LAPB_REFUSED); + } + break; + } + + kfree_skb(skb); +} + +/* + * State machine for state 2, Awaiting Release State. 
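
Taken together, the state 0 and state 1 handlers above implement both ends of link establishment. A compact summary for orientation, standard (modulo 8) mode, assuming lapb_establish_data_link() transmits SABM with the poll bit set as its callers imply:

        initiator: lapb_connect_request()   TX SABM(P=1)   S0 -> S1
        responder: S0 RX SABM               TX UA          S0 -> S3, lapb_connect_indication(LAPB_OK)
        initiator: S1 RX UA(F=1)                           S1 -> S3, lapb_connect_confirmation(LAPB_OK)
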
+ * The handling of the timer(s) is in file lapb_timer.c + */ +static void lapb_state2_machine(struct lapb_cb *lapb, struct sk_buff *skb, + struct lapb_frame *frame) +{ + switch (frame->type) { + case LAPB_SABM: + case LAPB_SABME: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S2 RX {SABM,SABME}(%d)\n", + lapb->dev, frame->pf); + printk(KERN_DEBUG "lapb: (%p) S2 TX DM(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_DM, frame->pf, + LAPB_RESPONSE); + break; + + case LAPB_DISC: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S2 RX DISC(%d)\n", + lapb->dev, frame->pf); + printk(KERN_DEBUG "lapb: (%p) S2 TX UA(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_UA, frame->pf, + LAPB_RESPONSE); + break; + + case LAPB_UA: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S2 RX UA(%d)\n", + lapb->dev, frame->pf); +#endif + if (frame->pf) { +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S2 -> S0\n", + lapb->dev); +#endif + lapb->state = LAPB_STATE_0; + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb_disconnect_confirmation(lapb, LAPB_OK); + } + break; + + case LAPB_DM: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S2 RX DM(%d)\n", + lapb->dev, frame->pf); +#endif + if (frame->pf) { +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S2 -> S0\n", + lapb->dev); +#endif + lapb->state = LAPB_STATE_0; + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb_disconnect_confirmation(lapb, + LAPB_NOTCONNECTED); + } + break; + + case LAPB_I: + case LAPB_REJ: + case LAPB_RNR: + case LAPB_RR: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S2 RX {I,REJ,RNR,RR}" + "(%d)\n", lapb->dev, frame->pf); + printk(KERN_DEBUG "lapb: (%p) S2 RX DM(%d)\n", + lapb->dev, frame->pf); +#endif + if (frame->pf) + lapb_send_control(lapb, LAPB_DM, frame->pf, + LAPB_RESPONSE); + break; + } + + kfree_skb(skb); +} + +/* + * State machine for state 3, Connected State. + * The handling of the timer(s) is in file lapb_timer.c + */ +static void lapb_state3_machine(struct lapb_cb *lapb, struct sk_buff *skb, + struct lapb_frame *frame) +{ + int queued = 0; + int modulus = (lapb->mode & LAPB_EXTENDED) ? 
LAPB_EMODULUS : + LAPB_SMODULUS; + + switch (frame->type) { + case LAPB_SABM: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 RX SABM(%d)\n", + lapb->dev, frame->pf); +#endif + if (lapb->mode & LAPB_EXTENDED) { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 TX DM(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_DM, frame->pf, + LAPB_RESPONSE); + } else { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 TX UA(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_UA, frame->pf, + LAPB_RESPONSE); + lapb_stop_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->condition = 0x00; + lapb->n2count = 0; + lapb->vs = 0; + lapb->vr = 0; + lapb->va = 0; + lapb_requeue_frames(lapb); + } + break; + + case LAPB_SABME: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 RX SABME(%d)\n", + lapb->dev, frame->pf); +#endif + if (lapb->mode & LAPB_EXTENDED) { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 TX UA(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_UA, frame->pf, + LAPB_RESPONSE); + lapb_stop_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->condition = 0x00; + lapb->n2count = 0; + lapb->vs = 0; + lapb->vr = 0; + lapb->va = 0; + lapb_requeue_frames(lapb); + } else { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 TX DM(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_DM, frame->pf, + LAPB_RESPONSE); + } + break; + + case LAPB_DISC: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 RX DISC(%d)\n", + lapb->dev, frame->pf); +#endif +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S3 -> S0\n", + lapb->dev); +#endif + lapb_clear_queues(lapb); + lapb_send_control(lapb, LAPB_UA, frame->pf, + LAPB_RESPONSE); + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_0; + lapb_disconnect_indication(lapb, LAPB_OK); + break; + + case LAPB_DM: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 RX DM(%d)\n", + lapb->dev, frame->pf); +#endif +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S3 -> S0\n", + lapb->dev); +#endif + lapb_clear_queues(lapb); + lapb->state = LAPB_STATE_0; + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb_disconnect_indication(lapb, LAPB_NOTCONNECTED); + break; + + case LAPB_RNR: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 RX RNR(%d) R%d\n", + lapb->dev, frame->pf, frame->nr); +#endif + lapb->condition |= LAPB_PEER_RX_BUSY_CONDITION; + lapb_check_need_response(lapb, frame->cr, frame->pf); + if (lapb_validate_nr(lapb, frame->nr)) { + lapb_check_iframes_acked(lapb, frame->nr); + } else { + lapb->frmr_data = *frame; + lapb->frmr_type = LAPB_FRMR_Z; + lapb_transmit_frmr(lapb); +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S3 -> S4\n", + lapb->dev); +#endif + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_4; + lapb->n2count = 0; + } + break; + + case LAPB_RR: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 RX RR(%d) R%d\n", + lapb->dev, frame->pf, frame->nr); +#endif + lapb->condition &= ~LAPB_PEER_RX_BUSY_CONDITION; + lapb_check_need_response(lapb, frame->cr, frame->pf); + if (lapb_validate_nr(lapb, frame->nr)) { + lapb_check_iframes_acked(lapb, frame->nr); + } else { + lapb->frmr_data = *frame; + lapb->frmr_type = LAPB_FRMR_Z; + lapb_transmit_frmr(lapb); +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S3 -> S4\n", + lapb->dev); +#endif + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_4; + lapb->n2count = 0; + } + break; + + case 
LAPB_REJ: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 RX REJ(%d) R%d\n", + lapb->dev, frame->pf, frame->nr); +#endif + lapb->condition &= ~LAPB_PEER_RX_BUSY_CONDITION; + lapb_check_need_response(lapb, frame->cr, frame->pf); + if (lapb_validate_nr(lapb, frame->nr)) { + lapb_frames_acked(lapb, frame->nr); + lapb_stop_t1timer(lapb); + lapb->n2count = 0; + lapb_requeue_frames(lapb); + } else { + lapb->frmr_data = *frame; + lapb->frmr_type = LAPB_FRMR_Z; + lapb_transmit_frmr(lapb); +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S3 -> S4\n", + lapb->dev); +#endif + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_4; + lapb->n2count = 0; + } + break; + + case LAPB_I: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 RX I(%d) S%d R%d\n", + lapb->dev, frame->pf, frame->ns, frame->nr); +#endif + if (!lapb_validate_nr(lapb, frame->nr)) { + lapb->frmr_data = *frame; + lapb->frmr_type = LAPB_FRMR_Z; + lapb_transmit_frmr(lapb); +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S3 -> S4\n", + lapb->dev); +#endif + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_4; + lapb->n2count = 0; + break; + } + if (lapb->condition & LAPB_PEER_RX_BUSY_CONDITION) + lapb_frames_acked(lapb, frame->nr); + else + lapb_check_iframes_acked(lapb, frame->nr); + + if (frame->ns == lapb->vr) { + int cn; + cn = lapb_data_indication(lapb, skb); + queued = 1; + /* + * If upper layer has dropped the frame, we + * basically ignore any further protocol + * processing. This will cause the peer + * to re-transmit the frame later like + * a frame lost on the wire. + */ + if (cn == NET_RX_DROP) { + printk(KERN_DEBUG + "LAPB: rx congestion\n"); + break; + } + lapb->vr = (lapb->vr + 1) % modulus; + lapb->condition &= ~LAPB_REJECT_CONDITION; + if (frame->pf) + lapb_enquiry_response(lapb); + else { + if (!(lapb->condition & + LAPB_ACK_PENDING_CONDITION)) { + lapb->condition |= LAPB_ACK_PENDING_CONDITION; + lapb_start_t2timer(lapb); + } + } + } else { + if (lapb->condition & LAPB_REJECT_CONDITION) { + if (frame->pf) + lapb_enquiry_response(lapb); + } else { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG + "lapb: (%p) S3 TX REJ(%d) R%d\n", + lapb->dev, frame->pf, lapb->vr); +#endif + lapb->condition |= LAPB_REJECT_CONDITION; + lapb_send_control(lapb, LAPB_REJ, + frame->pf, + LAPB_RESPONSE); + lapb->condition &= ~LAPB_ACK_PENDING_CONDITION; + } + } + break; + + case LAPB_FRMR: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 RX FRMR(%d) %02X " + "%02X %02X %02X %02X\n", lapb->dev, frame->pf, + skb->data[0], skb->data[1], skb->data[2], + skb->data[3], skb->data[4]); +#endif + lapb_establish_data_link(lapb); +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S3 -> S1\n", + lapb->dev); +#endif + lapb_requeue_frames(lapb); + lapb->state = LAPB_STATE_1; + break; + + case LAPB_ILLEGAL: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 RX ILLEGAL(%d)\n", + lapb->dev, frame->pf); +#endif + lapb->frmr_data = *frame; + lapb->frmr_type = LAPB_FRMR_W; + lapb_transmit_frmr(lapb); +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S3 -> S4\n", lapb->dev); +#endif + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_4; + lapb->n2count = 0; + break; + } + + if (!queued) + kfree_skb(skb); +} + +/* + * State machine for state 4, Frame Reject State. + * The handling of the timer(s) is in file lapb_timer.c. 
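The state 3 handler above leans on lapb_validate_nr(), defined in lapb_subr.c later in this patch: an incoming N(R) is accepted only if it lies on the cyclic path from V(A) to V(S); anything else triggers an FRMR and the move to state 4. A standalone illustration of that window check:

#include <stdio.h>

/* Illustrative only: the same cyclic check lapb_validate_nr() performs. */
static int nr_in_window(int va, int vs, int nr, int modulus)
{
	int vc = va;

	while (vc != vs) {
		if (nr == vc)
			return 1;
		vc = (vc + 1) % modulus;
	}
	return nr == vs;
}

int main(void)
{
	/* With V(A)=6 and V(S)=1 under modulo 8, the peer may acknowledge
	 * 6, 7, 0 or 1; every other N(R) is a protocol error. */
	for (int nr = 0; nr < 8; nr++)
		printf("N(R)=%d -> %s\n", nr,
		       nr_in_window(6, 1, nr, 8) ? "valid" : "invalid");
	return 0;
}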
+ */ +static void lapb_state4_machine(struct lapb_cb *lapb, struct sk_buff *skb, + struct lapb_frame *frame) +{ + switch (frame->type) { + case LAPB_SABM: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S4 RX SABM(%d)\n", + lapb->dev, frame->pf); +#endif + if (lapb->mode & LAPB_EXTENDED) { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S4 TX DM(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_DM, frame->pf, + LAPB_RESPONSE); + } else { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S4 TX UA(%d)\n", + lapb->dev, frame->pf); +#endif +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S4 -> S3\n", + lapb->dev); +#endif + lapb_send_control(lapb, LAPB_UA, frame->pf, + LAPB_RESPONSE); + lapb_stop_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_3; + lapb->condition = 0x00; + lapb->n2count = 0; + lapb->vs = 0; + lapb->vr = 0; + lapb->va = 0; + lapb_connect_indication(lapb, LAPB_OK); + } + break; + + case LAPB_SABME: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S4 RX SABME(%d)\n", + lapb->dev, frame->pf); +#endif + if (lapb->mode & LAPB_EXTENDED) { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S4 TX UA(%d)\n", + lapb->dev, frame->pf); +#endif +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S4 -> S3\n", + lapb->dev); +#endif + lapb_send_control(lapb, LAPB_UA, frame->pf, + LAPB_RESPONSE); + lapb_stop_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_3; + lapb->condition = 0x00; + lapb->n2count = 0; + lapb->vs = 0; + lapb->vr = 0; + lapb->va = 0; + lapb_connect_indication(lapb, LAPB_OK); + } else { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S4 TX DM(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_DM, frame->pf, + LAPB_RESPONSE); + } + break; + } + + kfree_skb(skb); +} + +/* + * Process an incoming LAPB frame + */ +void lapb_data_input(struct lapb_cb *lapb, struct sk_buff *skb) +{ + struct lapb_frame frame; + + if (lapb_decode(lapb, skb, &frame) < 0) { + kfree_skb(skb); + return; + } + + switch (lapb->state) { + case LAPB_STATE_0: + lapb_state0_machine(lapb, skb, &frame); break; + case LAPB_STATE_1: + lapb_state1_machine(lapb, skb, &frame); break; + case LAPB_STATE_2: + lapb_state2_machine(lapb, skb, &frame); break; + case LAPB_STATE_3: + lapb_state3_machine(lapb, skb, &frame); break; + case LAPB_STATE_4: + lapb_state4_machine(lapb, skb, &frame); break; + } + + lapb_kick(lapb); +} diff --git a/net/lapb/lapb_out.c b/net/lapb/lapb_out.c new file mode 100644 index 00000000..c75a7954 --- /dev/null +++ b/net/lapb/lapb_out.c @@ -0,0 +1,224 @@ +/* + * LAPB release 002 + * + * This code REQUIRES 2.1.15 or higher/ NET3.038 + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * LAPB 001 Jonathan Naylor Started Coding + * LAPB 002 Jonathan Naylor New timer architecture. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This procedure is passed a buffer descriptor for an iframe. It builds + * the rest of the control part of the frame and then writes it out. 
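As a reference for the I-frame builder below, here is a minimal standalone sketch (not part of the patch) of the standard, modulo-8 control byte it produces: bit 0 clear marks an I frame, N(S) sits in bits 1-3, P/F in bit 4 and N(R) in bits 5-7. The numeric values of LAPB_I and LAPB_SPF are assumptions taken from the usual LAPB header definitions.

#include <stdio.h>

/* Assumed constants: LAPB_I (I-frame marker) and LAPB_SPF (modulo-8 P/F bit). */
#define LAPB_I   0x00
#define LAPB_SPF 0x10

/* Mirrors the non-extended branch of lapb_send_iframe(). */
static unsigned char build_i_control(int vs, int vr, int poll)
{
	unsigned char c = LAPB_I;

	c |= (vs & 0x07) << 1;		/* N(S) */
	c |= poll ? LAPB_SPF : 0;	/* P/F  */
	c |= (vr & 0x07) << 5;		/* N(R) */
	return c;
}

int main(void)
{
	/* V(S)=2, V(R)=5, P/F set -> 0xB4 */
	printf("control = 0x%02X\n", build_i_control(2, 5, 1));
	return 0;
}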
+ */ +static void lapb_send_iframe(struct lapb_cb *lapb, struct sk_buff *skb, int poll_bit) +{ + unsigned char *frame; + + if (!skb) + return; + + if (lapb->mode & LAPB_EXTENDED) { + frame = skb_push(skb, 2); + + frame[0] = LAPB_I; + frame[0] |= lapb->vs << 1; + frame[1] = poll_bit ? LAPB_EPF : 0; + frame[1] |= lapb->vr << 1; + } else { + frame = skb_push(skb, 1); + + *frame = LAPB_I; + *frame |= poll_bit ? LAPB_SPF : 0; + *frame |= lapb->vr << 5; + *frame |= lapb->vs << 1; + } + +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S%d TX I(%d) S%d R%d\n", + lapb->dev, lapb->state, poll_bit, lapb->vs, lapb->vr); +#endif + + lapb_transmit_buffer(lapb, skb, LAPB_COMMAND); +} + +void lapb_kick(struct lapb_cb *lapb) +{ + struct sk_buff *skb, *skbn; + unsigned short modulus, start, end; + + modulus = (lapb->mode & LAPB_EXTENDED) ? LAPB_EMODULUS : LAPB_SMODULUS; + start = !skb_peek(&lapb->ack_queue) ? lapb->va : lapb->vs; + end = (lapb->va + lapb->window) % modulus; + + if (!(lapb->condition & LAPB_PEER_RX_BUSY_CONDITION) && + start != end && skb_peek(&lapb->write_queue)) { + lapb->vs = start; + + /* + * Dequeue the frame and copy it. + */ + skb = skb_dequeue(&lapb->write_queue); + + do { + if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { + skb_queue_head(&lapb->write_queue, skb); + break; + } + + if (skb->sk) + skb_set_owner_w(skbn, skb->sk); + + /* + * Transmit the frame copy. + */ + lapb_send_iframe(lapb, skbn, LAPB_POLLOFF); + + lapb->vs = (lapb->vs + 1) % modulus; + + /* + * Requeue the original data frame. + */ + skb_queue_tail(&lapb->ack_queue, skb); + + } while (lapb->vs != end && (skb = skb_dequeue(&lapb->write_queue)) != NULL); + + lapb->condition &= ~LAPB_ACK_PENDING_CONDITION; + + if (!lapb_t1timer_running(lapb)) + lapb_start_t1timer(lapb); + } +} + +void lapb_transmit_buffer(struct lapb_cb *lapb, struct sk_buff *skb, int type) +{ + unsigned char *ptr; + + ptr = skb_push(skb, 1); + + if (lapb->mode & LAPB_MLP) { + if (lapb->mode & LAPB_DCE) { + if (type == LAPB_COMMAND) + *ptr = LAPB_ADDR_C; + if (type == LAPB_RESPONSE) + *ptr = LAPB_ADDR_D; + } else { + if (type == LAPB_COMMAND) + *ptr = LAPB_ADDR_D; + if (type == LAPB_RESPONSE) + *ptr = LAPB_ADDR_C; + } + } else { + if (lapb->mode & LAPB_DCE) { + if (type == LAPB_COMMAND) + *ptr = LAPB_ADDR_A; + if (type == LAPB_RESPONSE) + *ptr = LAPB_ADDR_B; + } else { + if (type == LAPB_COMMAND) + *ptr = LAPB_ADDR_B; + if (type == LAPB_RESPONSE) + *ptr = LAPB_ADDR_A; + } + } + +#if LAPB_DEBUG > 2 + printk(KERN_DEBUG "lapb: (%p) S%d TX %02X %02X %02X\n", + lapb->dev, lapb->state, + skb->data[0], skb->data[1], skb->data[2]); +#endif + + if (!lapb_data_transmit(lapb, skb)) + kfree_skb(skb); +} + +void lapb_establish_data_link(struct lapb_cb *lapb) +{ + lapb->condition = 0x00; + lapb->n2count = 0; + + if (lapb->mode & LAPB_EXTENDED) { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S%d TX SABME(1)\n", + lapb->dev, lapb->state); +#endif + lapb_send_control(lapb, LAPB_SABME, LAPB_POLLON, LAPB_COMMAND); + } else { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S%d TX SABM(1)\n", + lapb->dev, lapb->state); +#endif + lapb_send_control(lapb, LAPB_SABM, LAPB_POLLON, LAPB_COMMAND); + } + + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); +} + +void lapb_enquiry_response(struct lapb_cb *lapb) +{ +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S%d TX RR(1) R%d\n", + lapb->dev, lapb->state, lapb->vr); +#endif + + lapb_send_control(lapb, LAPB_RR, LAPB_POLLON, LAPB_RESPONSE); + + lapb->condition &= ~LAPB_ACK_PENDING_CONDITION; +} + +void 
lapb_timeout_response(struct lapb_cb *lapb) +{ +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S%d TX RR(0) R%d\n", + lapb->dev, lapb->state, lapb->vr); +#endif + lapb_send_control(lapb, LAPB_RR, LAPB_POLLOFF, LAPB_RESPONSE); + + lapb->condition &= ~LAPB_ACK_PENDING_CONDITION; +} + +void lapb_check_iframes_acked(struct lapb_cb *lapb, unsigned short nr) +{ + if (lapb->vs == nr) { + lapb_frames_acked(lapb, nr); + lapb_stop_t1timer(lapb); + lapb->n2count = 0; + } else if (lapb->va != nr) { + lapb_frames_acked(lapb, nr); + lapb_start_t1timer(lapb); + } +} + +void lapb_check_need_response(struct lapb_cb *lapb, int type, int pf) +{ + if (type == LAPB_COMMAND && pf) + lapb_enquiry_response(lapb); +} diff --git a/net/lapb/lapb_subr.c b/net/lapb/lapb_subr.c new file mode 100644 index 00000000..43a2a7fb --- /dev/null +++ b/net/lapb/lapb_subr.c @@ -0,0 +1,313 @@ +/* + * LAPB release 002 + * + * This code REQUIRES 2.1.15 or higher/ NET3.038 + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * LAPB 001 Jonathan Naylor Started Coding + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This routine purges all the queues of frames. + */ +void lapb_clear_queues(struct lapb_cb *lapb) +{ + skb_queue_purge(&lapb->write_queue); + skb_queue_purge(&lapb->ack_queue); +} + +/* + * This routine purges the input queue of those frames that have been + * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the + * SDL diagram. + */ +void lapb_frames_acked(struct lapb_cb *lapb, unsigned short nr) +{ + struct sk_buff *skb; + int modulus; + + modulus = (lapb->mode & LAPB_EXTENDED) ? LAPB_EMODULUS : LAPB_SMODULUS; + + /* + * Remove all the ack-ed frames from the ack queue. + */ + if (lapb->va != nr) + while (skb_peek(&lapb->ack_queue) && lapb->va != nr) { + skb = skb_dequeue(&lapb->ack_queue); + kfree_skb(skb); + lapb->va = (lapb->va + 1) % modulus; + } +} + +void lapb_requeue_frames(struct lapb_cb *lapb) +{ + struct sk_buff *skb, *skb_prev = NULL; + + /* + * Requeue all the un-ack-ed frames on the output queue to be picked + * up by lapb_kick called from the timer. This arrangement handles the + * possibility of an empty output queue. + */ + while ((skb = skb_dequeue(&lapb->ack_queue)) != NULL) { + if (!skb_prev) + skb_queue_head(&lapb->write_queue, skb); + else + skb_append(skb_prev, skb, &lapb->write_queue); + skb_prev = skb; + } +} + +/* + * Validate that the value of nr is between va and vs. Return true or + * false for testing. + */ +int lapb_validate_nr(struct lapb_cb *lapb, unsigned short nr) +{ + unsigned short vc = lapb->va; + int modulus; + + modulus = (lapb->mode & LAPB_EXTENDED) ? LAPB_EMODULUS : LAPB_SMODULUS; + + while (vc != lapb->vs) { + if (nr == vc) + return 1; + vc = (vc + 1) % modulus; + } + + return nr == lapb->vs; +} + +/* + * This routine is the centralised routine for parsing the control + * information for the different frame formats. 
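Complementing the encoder sketched earlier, a hedged standalone illustration of how the decoder below classifies a standard-modulus control byte; the LAPB_S, LAPB_U and LAPB_SPF mask values are assumptions based on the usual LAPB header definitions, the bit tests mirror lapb_decode().

#include <stdio.h>

/* Assumed mask values for the standard (modulo-8) control byte. */
#define LAPB_S   0x01
#define LAPB_U   0x03
#define LAPB_SPF 0x10

/* Mirrors the non-extended branch of lapb_decode(). */
static void classify(unsigned char c)
{
	if (!(c & LAPB_S))
		printf("I frame: N(S)=%d N(R)=%d P/F=%d\n",
		       (c >> 1) & 0x07, (c >> 5) & 0x07, !!(c & LAPB_SPF));
	else if ((c & LAPB_U) == 1)
		printf("S frame: type=0x%02X N(R)=%d P/F=%d\n",
		       c & 0x0F, (c >> 5) & 0x07, !!(c & LAPB_SPF));
	else if ((c & LAPB_U) == 3)
		printf("U frame: type=0x%02X P/F=%d\n",
		       c & ~LAPB_SPF, !!(c & LAPB_SPF));
}

int main(void)
{
	classify(0xB4);	/* the I-frame control byte built in the earlier sketch */
	classify(0x11);	/* a supervisory frame with N(R)=0 and P/F set */
	return 0;
}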
+ */ +int lapb_decode(struct lapb_cb *lapb, struct sk_buff *skb, + struct lapb_frame *frame) +{ + frame->type = LAPB_ILLEGAL; + +#if LAPB_DEBUG > 2 + printk(KERN_DEBUG "lapb: (%p) S%d RX %02X %02X %02X\n", + lapb->dev, lapb->state, + skb->data[0], skb->data[1], skb->data[2]); +#endif + + /* We always need to look at 2 bytes, sometimes we need + * to look at 3 and those cases are handled below. + */ + if (!pskb_may_pull(skb, 2)) + return -1; + + if (lapb->mode & LAPB_MLP) { + if (lapb->mode & LAPB_DCE) { + if (skb->data[0] == LAPB_ADDR_D) + frame->cr = LAPB_COMMAND; + if (skb->data[0] == LAPB_ADDR_C) + frame->cr = LAPB_RESPONSE; + } else { + if (skb->data[0] == LAPB_ADDR_C) + frame->cr = LAPB_COMMAND; + if (skb->data[0] == LAPB_ADDR_D) + frame->cr = LAPB_RESPONSE; + } + } else { + if (lapb->mode & LAPB_DCE) { + if (skb->data[0] == LAPB_ADDR_B) + frame->cr = LAPB_COMMAND; + if (skb->data[0] == LAPB_ADDR_A) + frame->cr = LAPB_RESPONSE; + } else { + if (skb->data[0] == LAPB_ADDR_A) + frame->cr = LAPB_COMMAND; + if (skb->data[0] == LAPB_ADDR_B) + frame->cr = LAPB_RESPONSE; + } + } + + skb_pull(skb, 1); + + if (lapb->mode & LAPB_EXTENDED) { + if (!(skb->data[0] & LAPB_S)) { + if (!pskb_may_pull(skb, 2)) + return -1; + /* + * I frame - carries NR/NS/PF + */ + frame->type = LAPB_I; + frame->ns = (skb->data[0] >> 1) & 0x7F; + frame->nr = (skb->data[1] >> 1) & 0x7F; + frame->pf = skb->data[1] & LAPB_EPF; + frame->control[0] = skb->data[0]; + frame->control[1] = skb->data[1]; + skb_pull(skb, 2); + } else if ((skb->data[0] & LAPB_U) == 1) { + if (!pskb_may_pull(skb, 2)) + return -1; + /* + * S frame - take out PF/NR + */ + frame->type = skb->data[0] & 0x0F; + frame->nr = (skb->data[1] >> 1) & 0x7F; + frame->pf = skb->data[1] & LAPB_EPF; + frame->control[0] = skb->data[0]; + frame->control[1] = skb->data[1]; + skb_pull(skb, 2); + } else if ((skb->data[0] & LAPB_U) == 3) { + /* + * U frame - take out PF + */ + frame->type = skb->data[0] & ~LAPB_SPF; + frame->pf = skb->data[0] & LAPB_SPF; + frame->control[0] = skb->data[0]; + frame->control[1] = 0x00; + skb_pull(skb, 1); + } + } else { + if (!(skb->data[0] & LAPB_S)) { + /* + * I frame - carries NR/NS/PF + */ + frame->type = LAPB_I; + frame->ns = (skb->data[0] >> 1) & 0x07; + frame->nr = (skb->data[0] >> 5) & 0x07; + frame->pf = skb->data[0] & LAPB_SPF; + } else if ((skb->data[0] & LAPB_U) == 1) { + /* + * S frame - take out PF/NR + */ + frame->type = skb->data[0] & 0x0F; + frame->nr = (skb->data[0] >> 5) & 0x07; + frame->pf = skb->data[0] & LAPB_SPF; + } else if ((skb->data[0] & LAPB_U) == 3) { + /* + * U frame - take out PF + */ + frame->type = skb->data[0] & ~LAPB_SPF; + frame->pf = skb->data[0] & LAPB_SPF; + } + + frame->control[0] = skb->data[0]; + + skb_pull(skb, 1); + } + + return 0; +} + +/* + * This routine is called when the HDLC layer internally generates a + * command or response for the remote machine ( eg. RR, UA etc. ). + * Only supervisory or unnumbered frames are processed, FRMRs are handled + * by lapb_transmit_frmr below. + */ +void lapb_send_control(struct lapb_cb *lapb, int frametype, + int poll_bit, int type) +{ + struct sk_buff *skb; + unsigned char *dptr; + + if ((skb = alloc_skb(LAPB_HEADER_LEN + 3, GFP_ATOMIC)) == NULL) + return; + + skb_reserve(skb, LAPB_HEADER_LEN + 1); + + if (lapb->mode & LAPB_EXTENDED) { + if ((frametype & LAPB_U) == LAPB_U) { + dptr = skb_put(skb, 1); + *dptr = frametype; + *dptr |= poll_bit ? 
LAPB_SPF : 0; + } else { + dptr = skb_put(skb, 2); + dptr[0] = frametype; + dptr[1] = (lapb->vr << 1); + dptr[1] |= poll_bit ? LAPB_EPF : 0; + } + } else { + dptr = skb_put(skb, 1); + *dptr = frametype; + *dptr |= poll_bit ? LAPB_SPF : 0; + if ((frametype & LAPB_U) == LAPB_S) /* S frames carry NR */ + *dptr |= (lapb->vr << 5); + } + + lapb_transmit_buffer(lapb, skb, type); +} + +/* + * This routine generates FRMRs based on information previously stored in + * the LAPB control block. + */ +void lapb_transmit_frmr(struct lapb_cb *lapb) +{ + struct sk_buff *skb; + unsigned char *dptr; + + if ((skb = alloc_skb(LAPB_HEADER_LEN + 7, GFP_ATOMIC)) == NULL) + return; + + skb_reserve(skb, LAPB_HEADER_LEN + 1); + + if (lapb->mode & LAPB_EXTENDED) { + dptr = skb_put(skb, 6); + *dptr++ = LAPB_FRMR; + *dptr++ = lapb->frmr_data.control[0]; + *dptr++ = lapb->frmr_data.control[1]; + *dptr++ = (lapb->vs << 1) & 0xFE; + *dptr = (lapb->vr << 1) & 0xFE; + if (lapb->frmr_data.cr == LAPB_RESPONSE) + *dptr |= 0x01; + dptr++; + *dptr++ = lapb->frmr_type; + +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S%d TX FRMR %02X %02X %02X %02X %02X\n", + lapb->dev, lapb->state, + skb->data[1], skb->data[2], skb->data[3], + skb->data[4], skb->data[5]); +#endif + } else { + dptr = skb_put(skb, 4); + *dptr++ = LAPB_FRMR; + *dptr++ = lapb->frmr_data.control[0]; + *dptr = (lapb->vs << 1) & 0x0E; + *dptr |= (lapb->vr << 5) & 0xE0; + if (lapb->frmr_data.cr == LAPB_RESPONSE) + *dptr |= 0x10; + dptr++; + *dptr++ = lapb->frmr_type; + +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S%d TX FRMR %02X %02X %02X\n", + lapb->dev, lapb->state, skb->data[1], + skb->data[2], skb->data[3]); +#endif + } + + lapb_transmit_buffer(lapb, skb, LAPB_RESPONSE); +} diff --git a/net/lapb/lapb_timer.c b/net/lapb/lapb_timer.c new file mode 100644 index 00000000..af6d14b4 --- /dev/null +++ b/net/lapb/lapb_timer.c @@ -0,0 +1,189 @@ +/* + * LAPB release 002 + * + * This code REQUIRES 2.1.15 or higher/ NET3.038 + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * LAPB 001 Jonathan Naylor Started Coding + * LAPB 002 Jonathan Naylor New timer architecture. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void lapb_t1timer_expiry(unsigned long); +static void lapb_t2timer_expiry(unsigned long); + +void lapb_start_t1timer(struct lapb_cb *lapb) +{ + del_timer(&lapb->t1timer); + + lapb->t1timer.data = (unsigned long)lapb; + lapb->t1timer.function = &lapb_t1timer_expiry; + lapb->t1timer.expires = jiffies + lapb->t1; + + add_timer(&lapb->t1timer); +} + +void lapb_start_t2timer(struct lapb_cb *lapb) +{ + del_timer(&lapb->t2timer); + + lapb->t2timer.data = (unsigned long)lapb; + lapb->t2timer.function = &lapb_t2timer_expiry; + lapb->t2timer.expires = jiffies + lapb->t2; + + add_timer(&lapb->t2timer); +} + +void lapb_stop_t1timer(struct lapb_cb *lapb) +{ + del_timer(&lapb->t1timer); +} + +void lapb_stop_t2timer(struct lapb_cb *lapb) +{ + del_timer(&lapb->t2timer); +} + +int lapb_t1timer_running(struct lapb_cb *lapb) +{ + return timer_pending(&lapb->t1timer); +} + +static void lapb_t2timer_expiry(unsigned long param) +{ + struct lapb_cb *lapb = (struct lapb_cb *)param; + + if (lapb->condition & LAPB_ACK_PENDING_CONDITION) { + lapb->condition &= ~LAPB_ACK_PENDING_CONDITION; + lapb_timeout_response(lapb); + } +} + +static void lapb_t1timer_expiry(unsigned long param) +{ + struct lapb_cb *lapb = (struct lapb_cb *)param; + + switch (lapb->state) { + + /* + * If we are a DCE, keep going DM .. DM .. DM + */ + case LAPB_STATE_0: + if (lapb->mode & LAPB_DCE) + lapb_send_control(lapb, LAPB_DM, LAPB_POLLOFF, LAPB_RESPONSE); + break; + + /* + * Awaiting connection state, send SABM(E), up to N2 times. + */ + case LAPB_STATE_1: + if (lapb->n2count == lapb->n2) { + lapb_clear_queues(lapb); + lapb->state = LAPB_STATE_0; + lapb_disconnect_indication(lapb, LAPB_TIMEDOUT); +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S1 -> S0\n", lapb->dev); +#endif + return; + } else { + lapb->n2count++; + if (lapb->mode & LAPB_EXTENDED) { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 TX SABME(1)\n", lapb->dev); +#endif + lapb_send_control(lapb, LAPB_SABME, LAPB_POLLON, LAPB_COMMAND); + } else { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 TX SABM(1)\n", lapb->dev); +#endif + lapb_send_control(lapb, LAPB_SABM, LAPB_POLLON, LAPB_COMMAND); + } + } + break; + + /* + * Awaiting disconnection state, send DISC, up to N2 times. + */ + case LAPB_STATE_2: + if (lapb->n2count == lapb->n2) { + lapb_clear_queues(lapb); + lapb->state = LAPB_STATE_0; + lapb_disconnect_confirmation(lapb, LAPB_TIMEDOUT); +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S2 -> S0\n", lapb->dev); +#endif + return; + } else { + lapb->n2count++; +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S2 TX DISC(1)\n", lapb->dev); +#endif + lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND); + } + break; + + /* + * Data transfer state, restransmit I frames, up to N2 times. + */ + case LAPB_STATE_3: + if (lapb->n2count == lapb->n2) { + lapb_clear_queues(lapb); + lapb->state = LAPB_STATE_0; + lapb_stop_t2timer(lapb); + lapb_disconnect_indication(lapb, LAPB_TIMEDOUT); +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S3 -> S0\n", lapb->dev); +#endif + return; + } else { + lapb->n2count++; + lapb_requeue_frames(lapb); + } + break; + + /* + * Frame reject state, restransmit FRMR frames, up to N2 times. 
+ */ + case LAPB_STATE_4: + if (lapb->n2count == lapb->n2) { + lapb_clear_queues(lapb); + lapb->state = LAPB_STATE_0; + lapb_disconnect_indication(lapb, LAPB_TIMEDOUT); +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S4 -> S0\n", lapb->dev); +#endif + return; + } else { + lapb->n2count++; + lapb_transmit_frmr(lapb); + } + break; + } + + lapb_start_t1timer(lapb); +} diff --git a/net/llc/Kconfig b/net/llc/Kconfig new file mode 100644 index 00000000..b91c6510 --- /dev/null +++ b/net/llc/Kconfig @@ -0,0 +1,10 @@ +config LLC + tristate + depends on NET + +config LLC2 + tristate "ANSI/IEEE 802.2 LLC type 2 Support" + select LLC + help + This is a Logical Link Layer type 2, connection oriented support. + Select this if you want to have support for PF_LLC sockets. diff --git a/net/llc/Makefile b/net/llc/Makefile new file mode 100644 index 00000000..4e260cff --- /dev/null +++ b/net/llc/Makefile @@ -0,0 +1,25 @@ +########################################################################### +# Makefile for the Linux 802.2 LLC (fully-functional) layer. +# +# Copyright (c) 1997 by Procom Technology,Inc. +# 2001-2003 by Arnaldo Carvalho de Melo +# +# This program can be redistributed or modified under the terms of the +# GNU General Public License as published by the Free Software Foundation. +# This program is distributed without any warranty or implied warranty +# of merchantability or fitness for a particular purpose. +# +# See the GNU General Public License for more details. +########################################################################### + +obj-$(CONFIG_LLC) += llc.o + +llc-y := llc_core.o llc_input.o llc_output.o + +obj-$(CONFIG_LLC2) += llc2.o + +llc2-y := llc_if.o llc_c_ev.o llc_c_ac.o llc_conn.o llc_c_st.o llc_pdu.o \ + llc_sap.o llc_s_ac.o llc_s_ev.o llc_s_st.o af_llc.o llc_station.o + +llc2-$(CONFIG_PROC_FS) += llc_proc.o +llc2-$(CONFIG_SYSCTL) += sysctl_net_llc.o diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c new file mode 100644 index 00000000..a18e6c3d --- /dev/null +++ b/net/llc/af_llc.c @@ -0,0 +1,1248 @@ +/* + * af_llc.c - LLC User Interface SAPs + * Description: + * Functions in this module are implementation of socket based llc + * communications for the Linux operating system. Support of llc class + * one and class two is provided via SOCK_DGRAM and SOCK_STREAM + * respectively. + * + * An llc2 connection is (mac + sap), only one llc2 sap connection + * is allowed per mac. Though one sap may have multiple mac + sap + * connections. + * + * Copyright (c) 2001 by Jay Schulist + * 2002-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* remember: uninitialized global data is zeroed because its in .bss */ +static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START; +static u16 llc_ui_sap_link_no_max[256]; +static struct sockaddr_llc llc_ui_addrnull; +static const struct proto_ops llc_ui_ops; + +static int llc_ui_wait_for_conn(struct sock *sk, long timeout); +static int llc_ui_wait_for_disc(struct sock *sk, long timeout); +static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout); + +#if 0 +#define dprintk(args...) 
printk(KERN_DEBUG args) +#else +#define dprintk(args...) +#endif + +/* Maybe we'll add some more in the future. */ +#define LLC_CMSG_PKTINFO 1 + + +/** + * llc_ui_next_link_no - return the next unused link number for a sap + * @sap: Address of sap to get link number from. + * + * Return the next unused link number for a given sap. + */ +static inline u16 llc_ui_next_link_no(int sap) +{ + return llc_ui_sap_link_no_max[sap]++; +} + +/** + * llc_proto_type - return eth protocol for ARP header type + * @arphrd: ARP header type. + * + * Given an ARP header type return the corresponding ethernet protocol. + */ +static inline __be16 llc_proto_type(u16 arphrd) +{ + return arphrd == ARPHRD_IEEE802_TR ? + htons(ETH_P_TR_802_2) : htons(ETH_P_802_2); +} + +/** + * llc_ui_addr_null - determines if a address structure is null + * @addr: Address to test if null. + */ +static inline u8 llc_ui_addr_null(struct sockaddr_llc *addr) +{ + return !memcmp(addr, &llc_ui_addrnull, sizeof(*addr)); +} + +/** + * llc_ui_header_len - return length of llc header based on operation + * @sk: Socket which contains a valid llc socket type. + * @addr: Complete sockaddr_llc structure received from the user. + * + * Provide the length of the llc header depending on what kind of + * operation the user would like to perform and the type of socket. + * Returns the correct llc header length. + */ +static inline u8 llc_ui_header_len(struct sock *sk, struct sockaddr_llc *addr) +{ + u8 rc = LLC_PDU_LEN_U; + + if (addr->sllc_test || addr->sllc_xid) + rc = LLC_PDU_LEN_U; + else if (sk->sk_type == SOCK_STREAM) + rc = LLC_PDU_LEN_I; + return rc; +} + +/** + * llc_ui_send_data - send data via reliable llc2 connection + * @sk: Connection the socket is using. + * @skb: Data the user wishes to send. + * @noblock: can we block waiting for data? + * + * Send data via reliable llc2 connection. + * Returns 0 upon success, non-zero if action did not succeed. + */ +static int llc_ui_send_data(struct sock* sk, struct sk_buff *skb, int noblock) +{ + struct llc_sock* llc = llc_sk(sk); + int rc = 0; + + if (unlikely(llc_data_accept_state(llc->state) || + llc->remote_busy_flag || + llc->p_flag)) { + long timeout = sock_sndtimeo(sk, noblock); + + rc = llc_ui_wait_for_busy_core(sk, timeout); + } + if (unlikely(!rc)) + rc = llc_build_and_send_pkt(sk, skb); + return rc; +} + +static void llc_ui_sk_init(struct socket *sock, struct sock *sk) +{ + sock_graft(sk, sock); + sk->sk_type = sock->type; + sock->ops = &llc_ui_ops; +} + +static struct proto llc_proto = { + .name = "LLC", + .owner = THIS_MODULE, + .obj_size = sizeof(struct llc_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, +}; + +/** + * llc_ui_create - alloc and init a new llc_ui socket + * @net: network namespace (must be default network) + * @sock: Socket to initialize and attach allocated sk to. + * @protocol: Unused. + * @kern: on behalf of kernel or userspace + * + * Allocate and initialize a new llc_ui socket, validate the user wants a + * socket type we have available. + * Returns 0 upon success, negative upon failure. 
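From user space, llc_ui_create() and llc_ui_bind() below are reached through the ordinary sockets API. A hedged sketch follows; the interface MAC and the SAP 0x42 are placeholders, the uapi definitions are assumed to come from <linux/llc.h>, and CAP_NET_RAW is required to open the socket.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/if.h>		/* IFHWADDRLEN */
#include <linux/llc.h>		/* struct sockaddr_llc, AF_LLC */
#include <net/if_arp.h>		/* ARPHRD_ETHER */

int main(void)
{
	/* Placeholder MAC: use the address of a real local interface. */
	unsigned char mac[IFHWADDRLEN] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };
	struct sockaddr_llc laddr;
	int fd;

	fd = socket(PF_LLC, SOCK_DGRAM, 0);	/* llc_ui_create() */
	if (fd < 0) {
		perror("socket(PF_LLC)");
		return 1;
	}

	memset(&laddr, 0, sizeof(laddr));
	laddr.sllc_family = AF_LLC;
	laddr.sllc_arphrd = ARPHRD_ETHER;
	laddr.sllc_sap    = 0x42;		/* a SAP of 0 would be autoported */
	memcpy(laddr.sllc_mac, mac, sizeof(mac));

	if (bind(fd, (struct sockaddr *)&laddr, sizeof(laddr)) < 0)
		perror("bind");			/* llc_ui_bind() */
	return 0;
}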
+ */ +static int llc_ui_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + int rc = -ESOCKTNOSUPPORT; + + if (!capable(CAP_NET_RAW)) + return -EPERM; + + if (!net_eq(net, &init_net)) + return -EAFNOSUPPORT; + + if (likely(sock->type == SOCK_DGRAM || sock->type == SOCK_STREAM)) { + rc = -ENOMEM; + sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto); + if (sk) { + rc = 0; + llc_ui_sk_init(sock, sk); + } + } + return rc; +} + +/** + * llc_ui_release - shutdown socket + * @sock: Socket to release. + * + * Shutdown and deallocate an existing socket. + */ +static int llc_ui_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct llc_sock *llc; + + if (unlikely(sk == NULL)) + goto out; + sock_hold(sk); + lock_sock(sk); + llc = llc_sk(sk); + dprintk("%s: closing local(%02X) remote(%02X)\n", __func__, + llc->laddr.lsap, llc->daddr.lsap); + if (!llc_send_disc(sk)) + llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo); + if (!sock_flag(sk, SOCK_ZAPPED)) + llc_sap_remove_socket(llc->sap, sk); + release_sock(sk); + if (llc->dev) + dev_put(llc->dev); + sock_put(sk); + llc_sk_free(sk); +out: + return 0; +} + +/** + * llc_ui_autoport - provide dynamically allocate SAP number + * + * Provide the caller with a dynamically allocated SAP number according + * to the rules that are set in this function. Returns: 0, upon failure, + * SAP number otherwise. + */ +static int llc_ui_autoport(void) +{ + struct llc_sap *sap; + int i, tries = 0; + + while (tries < LLC_SAP_DYN_TRIES) { + for (i = llc_ui_sap_last_autoport; + i < LLC_SAP_DYN_STOP; i += 2) { + sap = llc_sap_find(i); + if (!sap) { + llc_ui_sap_last_autoport = i + 2; + goto out; + } + llc_sap_put(sap); + } + llc_ui_sap_last_autoport = LLC_SAP_DYN_START; + tries++; + } + i = 0; +out: + return i; +} + +/** + * llc_ui_autobind - automatically bind a socket to a sap + * @sock: socket to bind + * @addr: address to connect to + * + * Used by llc_ui_connect and llc_ui_sendmsg when the user hasn't + * specifically used llc_ui_bind to bind to an specific address/sap + * + * Returns: 0 upon success, negative otherwise. + */ +static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr) +{ + struct sock *sk = sock->sk; + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap; + int rc = -EINVAL; + + if (!sock_flag(sk, SOCK_ZAPPED)) + goto out; + rc = -ENODEV; + if (sk->sk_bound_dev_if) { + llc->dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if); + if (llc->dev && addr->sllc_arphrd != llc->dev->type) { + dev_put(llc->dev); + llc->dev = NULL; + } + } else + llc->dev = dev_getfirstbyhwtype(&init_net, addr->sllc_arphrd); + if (!llc->dev) + goto out; + rc = -EUSERS; + llc->laddr.lsap = llc_ui_autoport(); + if (!llc->laddr.lsap) + goto out; + rc = -EBUSY; /* some other network layer is using the sap */ + sap = llc_sap_open(llc->laddr.lsap, NULL); + if (!sap) + goto out; + memcpy(llc->laddr.mac, llc->dev->dev_addr, IFHWADDRLEN); + memcpy(&llc->addr, addr, sizeof(llc->addr)); + /* assign new connection to its SAP */ + llc_sap_add_socket(sap, sk); + sock_reset_flag(sk, SOCK_ZAPPED); + rc = 0; +out: + return rc; +} + +/** + * llc_ui_bind - bind a socket to a specific address. + * @sock: Socket to bind an address to. + * @uaddr: Address the user wants the socket bound to. + * @addrlen: Length of the uaddr structure. + * + * Bind a socket to a specific address. For llc a user is able to bind to + * a specific sap only or mac + sap. 
+ * If the user desires to bind to a specific mac + sap, it is possible to + * have multiple sap connections via multiple macs. + * Bind and autobind for that matter must enforce the correct sap usage + * otherwise all hell will break loose. + * Returns: 0 upon success, negative otherwise. + */ +static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) +{ + struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr; + struct sock *sk = sock->sk; + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap; + int rc = -EINVAL; + + dprintk("%s: binding %02X\n", __func__, addr->sllc_sap); + if (unlikely(!sock_flag(sk, SOCK_ZAPPED) || addrlen != sizeof(*addr))) + goto out; + rc = -EAFNOSUPPORT; + if (unlikely(addr->sllc_family != AF_LLC)) + goto out; + rc = -ENODEV; + rcu_read_lock(); + if (sk->sk_bound_dev_if) { + llc->dev = dev_get_by_index_rcu(&init_net, sk->sk_bound_dev_if); + if (llc->dev) { + if (!addr->sllc_arphrd) + addr->sllc_arphrd = llc->dev->type; + if (llc_mac_null(addr->sllc_mac)) + memcpy(addr->sllc_mac, llc->dev->dev_addr, + IFHWADDRLEN); + if (addr->sllc_arphrd != llc->dev->type || + !llc_mac_match(addr->sllc_mac, + llc->dev->dev_addr)) { + rc = -EINVAL; + llc->dev = NULL; + } + } + } else + llc->dev = dev_getbyhwaddr_rcu(&init_net, addr->sllc_arphrd, + addr->sllc_mac); + if (llc->dev) + dev_hold(llc->dev); + rcu_read_unlock(); + if (!llc->dev) + goto out; + if (!addr->sllc_sap) { + rc = -EUSERS; + addr->sllc_sap = llc_ui_autoport(); + if (!addr->sllc_sap) + goto out; + } + sap = llc_sap_find(addr->sllc_sap); + if (!sap) { + sap = llc_sap_open(addr->sllc_sap, NULL); + rc = -EBUSY; /* some other network layer is using the sap */ + if (!sap) + goto out; + } else { + struct llc_addr laddr, daddr; + struct sock *ask; + + memset(&laddr, 0, sizeof(laddr)); + memset(&daddr, 0, sizeof(daddr)); + /* + * FIXME: check if the address is multicast, + * only SOCK_DGRAM can do this. + */ + memcpy(laddr.mac, addr->sllc_mac, IFHWADDRLEN); + laddr.lsap = addr->sllc_sap; + rc = -EADDRINUSE; /* mac + sap clash. */ + ask = llc_lookup_established(sap, &daddr, &laddr); + if (ask) { + sock_put(ask); + goto out_put; + } + } + llc->laddr.lsap = addr->sllc_sap; + memcpy(llc->laddr.mac, addr->sllc_mac, IFHWADDRLEN); + memcpy(&llc->addr, addr, sizeof(llc->addr)); + /* assign new connection to its SAP */ + llc_sap_add_socket(sap, sk); + sock_reset_flag(sk, SOCK_ZAPPED); + rc = 0; +out_put: + llc_sap_put(sap); +out: + return rc; +} + +/** + * llc_ui_shutdown - shutdown a connect llc2 socket. + * @sock: Socket to shutdown. + * @how: What part of the socket to shutdown. + * + * Shutdown a connected llc2 socket. Currently this function only supports + * shutting down both sends and receives (2), we could probably make this + * function such that a user can shutdown only half the connection but not + * right now. + * Returns: 0 upon success, negative otherwise. + */ +static int llc_ui_shutdown(struct socket *sock, int how) +{ + struct sock *sk = sock->sk; + int rc = -ENOTCONN; + + lock_sock(sk); + if (unlikely(sk->sk_state != TCP_ESTABLISHED)) + goto out; + rc = -EINVAL; + if (how != 2) + goto out; + rc = llc_send_disc(sk); + if (!rc) + rc = llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo); + /* Wake up anyone sleeping in poll */ + sk->sk_state_change(sk); +out: + release_sock(sk); + return rc; +} + +/** + * llc_ui_connect - Connect to a remote llc2 mac + sap. + * @sock: Socket which will be connected to the remote destination. + * @uaddr: Remote and possibly the local address of the new connection. 
+ * @addrlen: Size of uaddr structure. + * @flags: Operational flags specified by the user. + * + * Connect to a remote llc2 mac + sap. The caller must specify the + * destination mac and address to connect to. If the user hasn't previously + * called bind(2) with a smac the address of the first interface of the + * specified arp type will be used. + * This function will autobind if user did not previously call bind. + * Returns: 0 upon success, negative otherwise. + */ +static int llc_ui_connect(struct socket *sock, struct sockaddr *uaddr, + int addrlen, int flags) +{ + struct sock *sk = sock->sk; + struct llc_sock *llc = llc_sk(sk); + struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr; + int rc = -EINVAL; + + lock_sock(sk); + if (unlikely(addrlen != sizeof(*addr))) + goto out; + rc = -EAFNOSUPPORT; + if (unlikely(addr->sllc_family != AF_LLC)) + goto out; + if (unlikely(sk->sk_type != SOCK_STREAM)) + goto out; + rc = -EALREADY; + if (unlikely(sock->state == SS_CONNECTING)) + goto out; + /* bind connection to sap if user hasn't done it. */ + if (sock_flag(sk, SOCK_ZAPPED)) { + /* bind to sap with null dev, exclusive */ + rc = llc_ui_autobind(sock, addr); + if (rc) + goto out; + } + llc->daddr.lsap = addr->sllc_sap; + memcpy(llc->daddr.mac, addr->sllc_mac, IFHWADDRLEN); + sock->state = SS_CONNECTING; + sk->sk_state = TCP_SYN_SENT; + llc->link = llc_ui_next_link_no(llc->sap->laddr.lsap); + rc = llc_establish_connection(sk, llc->dev->dev_addr, + addr->sllc_mac, addr->sllc_sap); + if (rc) { + dprintk("%s: llc_ui_send_conn failed :-(\n", __func__); + sock->state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; + goto out; + } + + if (sk->sk_state == TCP_SYN_SENT) { + const long timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); + + if (!timeo || !llc_ui_wait_for_conn(sk, timeo)) + goto out; + + rc = sock_intr_errno(timeo); + if (signal_pending(current)) + goto out; + } + + if (sk->sk_state == TCP_CLOSE) + goto sock_error; + + sock->state = SS_CONNECTED; + rc = 0; +out: + release_sock(sk); + return rc; +sock_error: + rc = sock_error(sk) ? : -ECONNABORTED; + sock->state = SS_UNCONNECTED; + goto out; +} + +/** + * llc_ui_listen - allow a normal socket to accept incoming connections + * @sock: Socket to allow incoming connections on. + * @backlog: Number of connections to queue. + * + * Allow a normal socket to accept incoming connections. + * Returns 0 upon success, negative otherwise. 
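On the server side, llc_ui_listen() and llc_ui_accept() below behave much like their TCP counterparts: each accepted socket represents one established LLC2 (mac + sap) link. A hedged sketch, reusing the placeholder address from the earlier bind example:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/if.h>
#include <linux/llc.h>
#include <net/if_arp.h>

int main(void)
{
	/* Placeholder MAC and SAP, as in the earlier example. */
	unsigned char mac[IFHWADDRLEN] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };
	struct sockaddr_llc laddr;
	int fd, conn;

	fd = socket(PF_LLC, SOCK_STREAM, 0);	/* LLC class 2, connection oriented */
	if (fd < 0) {
		perror("socket(PF_LLC)");
		return 1;
	}

	memset(&laddr, 0, sizeof(laddr));
	laddr.sllc_family = AF_LLC;
	laddr.sllc_arphrd = ARPHRD_ETHER;
	laddr.sllc_sap    = 0x42;
	memcpy(laddr.sllc_mac, mac, sizeof(mac));

	if (bind(fd, (struct sockaddr *)&laddr, sizeof(laddr)) < 0 ||
	    listen(fd, 5) < 0) {		/* llc_ui_bind() + llc_ui_listen() */
		perror("bind/listen");
		return 1;
	}
	conn = accept(fd, NULL, NULL);		/* llc_ui_accept(): one new LLC2 link */
	if (conn < 0)
		perror("accept");
	return 0;
}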
+ */ +static int llc_ui_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + int rc = -EINVAL; + + lock_sock(sk); + if (unlikely(sock->state != SS_UNCONNECTED)) + goto out; + rc = -EOPNOTSUPP; + if (unlikely(sk->sk_type != SOCK_STREAM)) + goto out; + rc = -EAGAIN; + if (sock_flag(sk, SOCK_ZAPPED)) + goto out; + rc = 0; + if (!(unsigned)backlog) /* BSDism */ + backlog = 1; + sk->sk_max_ack_backlog = backlog; + if (sk->sk_state != TCP_LISTEN) { + sk->sk_ack_backlog = 0; + sk->sk_state = TCP_LISTEN; + } + sk->sk_socket->flags |= __SO_ACCEPTCON; +out: + release_sock(sk); + return rc; +} + +static int llc_ui_wait_for_disc(struct sock *sk, long timeout) +{ + DEFINE_WAIT(wait); + int rc = 0; + + while (1) { + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + if (sk_wait_event(sk, &timeout, sk->sk_state == TCP_CLOSE)) + break; + rc = -ERESTARTSYS; + if (signal_pending(current)) + break; + rc = -EAGAIN; + if (!timeout) + break; + rc = 0; + } + finish_wait(sk_sleep(sk), &wait); + return rc; +} + +static int llc_ui_wait_for_conn(struct sock *sk, long timeout) +{ + DEFINE_WAIT(wait); + + while (1) { + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + if (sk_wait_event(sk, &timeout, sk->sk_state != TCP_SYN_SENT)) + break; + if (signal_pending(current) || !timeout) + break; + } + finish_wait(sk_sleep(sk), &wait); + return timeout; +} + +static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout) +{ + DEFINE_WAIT(wait); + struct llc_sock *llc = llc_sk(sk); + int rc; + + while (1) { + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + rc = 0; + if (sk_wait_event(sk, &timeout, + (sk->sk_shutdown & RCV_SHUTDOWN) || + (!llc_data_accept_state(llc->state) && + !llc->remote_busy_flag && + !llc->p_flag))) + break; + rc = -ERESTARTSYS; + if (signal_pending(current)) + break; + rc = -EAGAIN; + if (!timeout) + break; + } + finish_wait(sk_sleep(sk), &wait); + return rc; +} + +static int llc_wait_data(struct sock *sk, long timeo) +{ + int rc; + + while (1) { + /* + * POSIX 1003.1g mandates this order. + */ + rc = sock_error(sk); + if (rc) + break; + rc = 0; + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + rc = -EAGAIN; + if (!timeo) + break; + rc = sock_intr_errno(timeo); + if (signal_pending(current)) + break; + rc = 0; + if (sk_wait_data(sk, &timeo)) + break; + } + return rc; +} + +static void llc_cmsg_rcv(struct msghdr *msg, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(skb->sk); + + if (llc->cmsg_flags & LLC_CMSG_PKTINFO) { + struct llc_pktinfo info; + + info.lpi_ifindex = llc_sk(skb->sk)->dev->ifindex; + llc_pdu_decode_dsap(skb, &info.lpi_sap); + llc_pdu_decode_da(skb, info.lpi_mac); + put_cmsg(msg, SOL_LLC, LLC_OPT_PKTINFO, sizeof(info), &info); + } +} + +/** + * llc_ui_accept - accept a new incoming connection. + * @sock: Socket which connections arrive on. + * @newsock: Socket to move incoming connection to. + * @flags: User specified operational flags. + * + * Accept a new incoming connection. + * Returns 0 upon success, negative otherwise. + */ +static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct sock *sk = sock->sk, *newsk; + struct llc_sock *llc, *newllc; + struct sk_buff *skb; + int rc = -EOPNOTSUPP; + + dprintk("%s: accepting on %02X\n", __func__, + llc_sk(sk)->laddr.lsap); + lock_sock(sk); + if (unlikely(sk->sk_type != SOCK_STREAM)) + goto out; + rc = -EINVAL; + if (unlikely(sock->state != SS_UNCONNECTED || + sk->sk_state != TCP_LISTEN)) + goto out; + /* wait for a connection to arrive. 
*/ + if (skb_queue_empty(&sk->sk_receive_queue)) { + rc = llc_wait_data(sk, sk->sk_rcvtimeo); + if (rc) + goto out; + } + dprintk("%s: got a new connection on %02X\n", __func__, + llc_sk(sk)->laddr.lsap); + skb = skb_dequeue(&sk->sk_receive_queue); + rc = -EINVAL; + if (!skb->sk) + goto frees; + rc = 0; + newsk = skb->sk; + /* attach connection to a new socket. */ + llc_ui_sk_init(newsock, newsk); + sock_reset_flag(newsk, SOCK_ZAPPED); + newsk->sk_state = TCP_ESTABLISHED; + newsock->state = SS_CONNECTED; + llc = llc_sk(sk); + newllc = llc_sk(newsk); + memcpy(&newllc->addr, &llc->addr, sizeof(newllc->addr)); + newllc->link = llc_ui_next_link_no(newllc->laddr.lsap); + + /* put original socket back into a clean listen state. */ + sk->sk_state = TCP_LISTEN; + sk->sk_ack_backlog--; + dprintk("%s: ok success on %02X, client on %02X\n", __func__, + llc_sk(sk)->addr.sllc_sap, newllc->daddr.lsap); +frees: + kfree_skb(skb); +out: + release_sock(sk); + return rc; +} + +/** + * llc_ui_recvmsg - copy received data to the socket user. + * @sock: Socket to copy data from. + * @msg: Various user space related information. + * @len: Size of user buffer. + * @flags: User specified flags. + * + * Copy received data to the socket user. + * Returns non-negative upon success, negative otherwise. + */ +static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len, int flags) +{ + struct sockaddr_llc *uaddr = (struct sockaddr_llc *)msg->msg_name; + const int nonblock = flags & MSG_DONTWAIT; + struct sk_buff *skb = NULL; + struct sock *sk = sock->sk; + struct llc_sock *llc = llc_sk(sk); + size_t copied = 0; + u32 peek_seq = 0; + u32 *seq; + unsigned long used; + int target; /* Read at least this many bytes */ + long timeo; + + lock_sock(sk); + copied = -ENOTCONN; + if (unlikely(sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN)) + goto out; + + timeo = sock_rcvtimeo(sk, nonblock); + + seq = &llc->copied_seq; + if (flags & MSG_PEEK) { + peek_seq = llc->copied_seq; + seq = &peek_seq; + } + + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + copied = 0; + + do { + u32 offset; + + /* + * We need to check signals first, to get correct SIGURG + * handling. FIXME: Need to check this doesn't impact 1003.1g + * and move it down to the bottom of the loop + */ + if (signal_pending(current)) { + if (copied) + break; + copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; + break; + } + + /* Next get a buffer. */ + + skb = skb_peek(&sk->sk_receive_queue); + if (skb) { + offset = *seq; + goto found_ok_skb; + } + /* Well, if we have backlog, try to process it now yet. */ + + if (copied >= target && !sk->sk_backlog.tail) + break; + + if (copied) { + if (sk->sk_err || + sk->sk_state == TCP_CLOSE || + (sk->sk_shutdown & RCV_SHUTDOWN) || + !timeo || + (flags & MSG_PEEK)) + break; + } else { + if (sock_flag(sk, SOCK_DONE)) + break; + + if (sk->sk_err) { + copied = sock_error(sk); + break; + } + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + + if (sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_CLOSE) { + if (!sock_flag(sk, SOCK_DONE)) { + /* + * This occurs when user tries to read + * from never connected socket. + */ + copied = -ENOTCONN; + break; + } + break; + } + if (!timeo) { + copied = -EAGAIN; + break; + } + } + + if (copied >= target) { /* Do not sleep, just process backlog. 
*/ + release_sock(sk); + lock_sock(sk); + } else + sk_wait_data(sk, &timeo); + + if ((flags & MSG_PEEK) && peek_seq != llc->copied_seq) { + if (net_ratelimit()) + printk(KERN_DEBUG "LLC(%s:%d): Application " + "bug, race in MSG_PEEK.\n", + current->comm, task_pid_nr(current)); + peek_seq = llc->copied_seq; + } + continue; + found_ok_skb: + /* Ok so how much can we use? */ + used = skb->len - offset; + if (len < used) + used = len; + + if (!(flags & MSG_TRUNC)) { + int rc = skb_copy_datagram_iovec(skb, offset, + msg->msg_iov, used); + if (rc) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } + } + + *seq += used; + copied += used; + len -= used; + + /* For non stream protcols we get one packet per recvmsg call */ + if (sk->sk_type != SOCK_STREAM) + goto copy_uaddr; + + if (!(flags & MSG_PEEK)) { + sk_eat_skb(sk, skb, 0); + *seq = 0; + } + + /* Partial read */ + if (used + offset < skb->len) + continue; + } while (len > 0); + +out: + release_sock(sk); + return copied; +copy_uaddr: + if (uaddr != NULL && skb != NULL) { + memcpy(uaddr, llc_ui_skb_cb(skb), sizeof(*uaddr)); + msg->msg_namelen = sizeof(*uaddr); + } + if (llc_sk(sk)->cmsg_flags) + llc_cmsg_rcv(msg, skb); + + if (!(flags & MSG_PEEK)) { + sk_eat_skb(sk, skb, 0); + *seq = 0; + } + + goto out; +} + +/** + * llc_ui_sendmsg - Transmit data provided by the socket user. + * @sock: Socket to transmit data from. + * @msg: Various user related information. + * @len: Length of data to transmit. + * + * Transmit data provided by the socket user. + * Returns non-negative upon success, negative otherwise. + */ +static int llc_ui_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct llc_sock *llc = llc_sk(sk); + struct sockaddr_llc *addr = (struct sockaddr_llc *)msg->msg_name; + int flags = msg->msg_flags; + int noblock = flags & MSG_DONTWAIT; + struct sk_buff *skb; + size_t size = 0; + int rc = -EINVAL, copied = 0, hdrlen; + + dprintk("%s: sending from %02X to %02X\n", __func__, + llc->laddr.lsap, llc->daddr.lsap); + lock_sock(sk); + if (addr) { + if (msg->msg_namelen < sizeof(*addr)) + goto release; + } else { + if (llc_ui_addr_null(&llc->addr)) + goto release; + addr = &llc->addr; + } + /* must bind connection to sap if user hasn't done it. */ + if (sock_flag(sk, SOCK_ZAPPED)) { + /* bind to sap with null dev, exclusive. 
*/ + rc = llc_ui_autobind(sock, addr); + if (rc) + goto release; + } + hdrlen = llc->dev->hard_header_len + llc_ui_header_len(sk, addr); + size = hdrlen + len; + if (size > llc->dev->mtu) + size = llc->dev->mtu; + copied = size - hdrlen; + release_sock(sk); + skb = sock_alloc_send_skb(sk, size, noblock, &rc); + lock_sock(sk); + if (!skb) + goto release; + skb->dev = llc->dev; + skb->protocol = llc_proto_type(addr->sllc_arphrd); + skb_reserve(skb, hdrlen); + rc = memcpy_fromiovec(skb_put(skb, copied), msg->msg_iov, copied); + if (rc) + goto out; + if (sk->sk_type == SOCK_DGRAM || addr->sllc_ua) { + llc_build_and_send_ui_pkt(llc->sap, skb, addr->sllc_mac, + addr->sllc_sap); + goto out; + } + if (addr->sllc_test) { + llc_build_and_send_test_pkt(llc->sap, skb, addr->sllc_mac, + addr->sllc_sap); + goto out; + } + if (addr->sllc_xid) { + llc_build_and_send_xid_pkt(llc->sap, skb, addr->sllc_mac, + addr->sllc_sap); + goto out; + } + rc = -ENOPROTOOPT; + if (!(sk->sk_type == SOCK_STREAM && !addr->sllc_ua)) + goto out; + rc = llc_ui_send_data(sk, skb, noblock); +out: + if (rc) { + kfree_skb(skb); +release: + dprintk("%s: failed sending from %02X to %02X: %d\n", + __func__, llc->laddr.lsap, llc->daddr.lsap, rc); + } + release_sock(sk); + return rc ? : copied; +} + +/** + * llc_ui_getname - return the address info of a socket + * @sock: Socket to get address of. + * @uaddr: Address structure to return information. + * @uaddrlen: Length of address structure. + * @peer: Does user want local or remote address information. + * + * Return the address information of a socket. + */ +static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddrlen, int peer) +{ + struct sockaddr_llc sllc; + struct sock *sk = sock->sk; + struct llc_sock *llc = llc_sk(sk); + int rc = 0; + + memset(&sllc, 0, sizeof(sllc)); + lock_sock(sk); + if (sock_flag(sk, SOCK_ZAPPED)) + goto out; + *uaddrlen = sizeof(sllc); + memset(uaddr, 0, *uaddrlen); + if (peer) { + rc = -ENOTCONN; + if (sk->sk_state != TCP_ESTABLISHED) + goto out; + if(llc->dev) + sllc.sllc_arphrd = llc->dev->type; + sllc.sllc_sap = llc->daddr.lsap; + memcpy(&sllc.sllc_mac, &llc->daddr.mac, IFHWADDRLEN); + } else { + rc = -EINVAL; + if (!llc->sap) + goto out; + sllc.sllc_sap = llc->sap->laddr.lsap; + + if (llc->dev) { + sllc.sllc_arphrd = llc->dev->type; + memcpy(&sllc.sllc_mac, llc->dev->dev_addr, + IFHWADDRLEN); + } + } + rc = 0; + sllc.sllc_family = AF_LLC; + memcpy(uaddr, &sllc, sizeof(sllc)); +out: + release_sock(sk); + return rc; +} + +/** + * llc_ui_ioctl - io controls for PF_LLC + * @sock: Socket to get/set info + * @cmd: command + * @arg: optional argument for cmd + * + * get/set info on llc sockets + */ +static int llc_ui_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + return -ENOIOCTLCMD; +} + +/** + * llc_ui_setsockopt - set various connection specific parameters. + * @sock: Socket to set options on. + * @level: Socket level user is requesting operations on. + * @optname: Operation name. + * @optval User provided operation data. + * @optlen: Length of optval. + * + * Set various connection specific parameters. 
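From user space these parameters map onto ordinary setsockopt() calls at level SOL_LLC with int-sized values, exactly as the handler below enforces. A hedged sketch setting the retransmission count N2; the socket is a fresh, unconnected one purely for illustration.

#include <stdio.h>
#include <sys/socket.h>
#include <linux/if.h>
#include <linux/llc.h>		/* LLC_OPT_* socket options */

int main(void)
{
	int fd = socket(PF_LLC, SOCK_STREAM, 0);
	int n2 = 4;		/* retransmission count, capped by LLC_OPT_MAX_RETRY */

	if (fd < 0) {
		perror("socket(PF_LLC)");
		return 1;
	}
	if (setsockopt(fd, SOL_LLC, LLC_OPT_RETRY, &n2, sizeof(n2)) < 0)
		perror("setsockopt(LLC_OPT_RETRY)");
	return 0;
}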
+ */ +static int llc_ui_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct llc_sock *llc = llc_sk(sk); + unsigned int opt; + int rc = -EINVAL; + + lock_sock(sk); + if (unlikely(level != SOL_LLC || optlen != sizeof(int))) + goto out; + rc = get_user(opt, (int __user *)optval); + if (rc) + goto out; + rc = -EINVAL; + switch (optname) { + case LLC_OPT_RETRY: + if (opt > LLC_OPT_MAX_RETRY) + goto out; + llc->n2 = opt; + break; + case LLC_OPT_SIZE: + if (opt > LLC_OPT_MAX_SIZE) + goto out; + llc->n1 = opt; + break; + case LLC_OPT_ACK_TMR_EXP: + if (opt > LLC_OPT_MAX_ACK_TMR_EXP) + goto out; + llc->ack_timer.expire = opt * HZ; + break; + case LLC_OPT_P_TMR_EXP: + if (opt > LLC_OPT_MAX_P_TMR_EXP) + goto out; + llc->pf_cycle_timer.expire = opt * HZ; + break; + case LLC_OPT_REJ_TMR_EXP: + if (opt > LLC_OPT_MAX_REJ_TMR_EXP) + goto out; + llc->rej_sent_timer.expire = opt * HZ; + break; + case LLC_OPT_BUSY_TMR_EXP: + if (opt > LLC_OPT_MAX_BUSY_TMR_EXP) + goto out; + llc->busy_state_timer.expire = opt * HZ; + break; + case LLC_OPT_TX_WIN: + if (opt > LLC_OPT_MAX_WIN) + goto out; + llc->k = opt; + break; + case LLC_OPT_RX_WIN: + if (opt > LLC_OPT_MAX_WIN) + goto out; + llc->rw = opt; + break; + case LLC_OPT_PKTINFO: + if (opt) + llc->cmsg_flags |= LLC_CMSG_PKTINFO; + else + llc->cmsg_flags &= ~LLC_CMSG_PKTINFO; + break; + default: + rc = -ENOPROTOOPT; + goto out; + } + rc = 0; +out: + release_sock(sk); + return rc; +} + +/** + * llc_ui_getsockopt - get connection specific socket info + * @sock: Socket to get information from. + * @level: Socket level user is requesting operations on. + * @optname: Operation name. + * @optval: Variable to return operation data in. + * @optlen: Length of optval. + * + * Get connection specific socket information. 
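+ *
+ * A matching user-space sketch, assuming the same connected PF_LLC
+ * socket "fd" as in the setsockopt example above (illustrative only):
+ *
+ *	int rw = 0;
+ *	socklen_t len = sizeof(rw);
+ *	if (!getsockopt(fd, SOL_LLC, LLC_OPT_RX_WIN, &rw, &len))
+ *		printf("rx window: %d\n", rw);
+ *
+ * As with setsockopt, the passed length must be sizeof(int).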
+ */ +static int llc_ui_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct llc_sock *llc = llc_sk(sk); + int val = 0, len = 0, rc = -EINVAL; + + lock_sock(sk); + if (unlikely(level != SOL_LLC)) + goto out; + rc = get_user(len, optlen); + if (rc) + goto out; + rc = -EINVAL; + if (len != sizeof(int)) + goto out; + switch (optname) { + case LLC_OPT_RETRY: + val = llc->n2; break; + case LLC_OPT_SIZE: + val = llc->n1; break; + case LLC_OPT_ACK_TMR_EXP: + val = llc->ack_timer.expire / HZ; break; + case LLC_OPT_P_TMR_EXP: + val = llc->pf_cycle_timer.expire / HZ; break; + case LLC_OPT_REJ_TMR_EXP: + val = llc->rej_sent_timer.expire / HZ; break; + case LLC_OPT_BUSY_TMR_EXP: + val = llc->busy_state_timer.expire / HZ; break; + case LLC_OPT_TX_WIN: + val = llc->k; break; + case LLC_OPT_RX_WIN: + val = llc->rw; break; + case LLC_OPT_PKTINFO: + val = (llc->cmsg_flags & LLC_CMSG_PKTINFO) != 0; + break; + default: + rc = -ENOPROTOOPT; + goto out; + } + rc = 0; + if (put_user(len, optlen) || copy_to_user(optval, &val, len)) + rc = -EFAULT; +out: + release_sock(sk); + return rc; +} + +static const struct net_proto_family llc_ui_family_ops = { + .family = PF_LLC, + .create = llc_ui_create, + .owner = THIS_MODULE, +}; + +static const struct proto_ops llc_ui_ops = { + .family = PF_LLC, + .owner = THIS_MODULE, + .release = llc_ui_release, + .bind = llc_ui_bind, + .connect = llc_ui_connect, + .socketpair = sock_no_socketpair, + .accept = llc_ui_accept, + .getname = llc_ui_getname, + .poll = datagram_poll, + .ioctl = llc_ui_ioctl, + .listen = llc_ui_listen, + .shutdown = llc_ui_shutdown, + .setsockopt = llc_ui_setsockopt, + .getsockopt = llc_ui_getsockopt, + .sendmsg = llc_ui_sendmsg, + .recvmsg = llc_ui_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static const char llc_proc_err_msg[] __initconst = + KERN_CRIT "LLC: Unable to register the proc_fs entries\n"; +static const char llc_sysctl_err_msg[] __initconst = + KERN_CRIT "LLC: Unable to register the sysctl entries\n"; +static const char llc_sock_err_msg[] __initconst = + KERN_CRIT "LLC: Unable to register the network family\n"; + +static int __init llc2_init(void) +{ + int rc = proto_register(&llc_proto, 0); + + if (rc != 0) + goto out; + + llc_build_offset_table(); + llc_station_init(); + llc_ui_sap_last_autoport = LLC_SAP_DYN_START; + rc = llc_proc_init(); + if (rc != 0) { + printk(llc_proc_err_msg); + goto out_unregister_llc_proto; + } + rc = llc_sysctl_init(); + if (rc) { + printk(llc_sysctl_err_msg); + goto out_proc; + } + rc = sock_register(&llc_ui_family_ops); + if (rc) { + printk(llc_sock_err_msg); + goto out_sysctl; + } + llc_add_pack(LLC_DEST_SAP, llc_sap_handler); + llc_add_pack(LLC_DEST_CONN, llc_conn_handler); +out: + return rc; +out_sysctl: + llc_sysctl_exit(); +out_proc: + llc_proc_exit(); +out_unregister_llc_proto: + proto_unregister(&llc_proto); + goto out; +} + +static void __exit llc2_exit(void) +{ + llc_station_exit(); + llc_remove_pack(LLC_DEST_SAP); + llc_remove_pack(LLC_DEST_CONN); + sock_unregister(PF_LLC); + llc_proc_exit(); + llc_sysctl_exit(); + proto_unregister(&llc_proto); +} + +module_init(llc2_init); +module_exit(llc2_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Procom 1997, Jay Schullist 2001, Arnaldo C. 
Melo 2001-2003"); +MODULE_DESCRIPTION("IEEE 802.2 PF_LLC support"); +MODULE_ALIAS_NETPROTO(PF_LLC); diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c new file mode 100644 index 00000000..ea225bd2 --- /dev/null +++ b/net/llc/llc_c_ac.c @@ -0,0 +1,1444 @@ +/* + * llc_c_ac.c - actions performed during connection state transition. + * + * Description: + * Functions in this module are implementation of connection component actions + * Details of actions can be found in IEEE-802.2 standard document. + * All functions have one connection and one event as input argument. All of + * them return 0 On success and 1 otherwise. + * + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +static int llc_conn_ac_inc_vs_by_1(struct sock *sk, struct sk_buff *skb); +static void llc_process_tmr_ev(struct sock *sk, struct sk_buff *skb); +static int llc_conn_ac_data_confirm(struct sock *sk, struct sk_buff *ev); + +static int llc_conn_ac_inc_npta_value(struct sock *sk, struct sk_buff *skb); + +static int llc_conn_ac_send_rr_rsp_f_set_ackpf(struct sock *sk, + struct sk_buff *skb); + +static int llc_conn_ac_set_p_flag_1(struct sock *sk, struct sk_buff *skb); + +#define INCORRECT 0 + +int llc_conn_ac_clear_remote_busy(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + if (llc->remote_busy_flag) { + u8 nr; + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + llc->remote_busy_flag = 0; + del_timer(&llc->busy_state_timer.timer); + nr = LLC_I_GET_NR(pdu); + llc_conn_resend_i_pdu_as_cmd(sk, nr, 0); + } + return 0; +} + +int llc_conn_ac_conn_ind(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->ind_prim = LLC_CONN_PRIM; + return 0; +} + +int llc_conn_ac_conn_confirm(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->cfm_prim = LLC_CONN_PRIM; + return 0; +} + +static int llc_conn_ac_data_confirm(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->cfm_prim = LLC_DATA_PRIM; + return 0; +} + +int llc_conn_ac_data_ind(struct sock *sk, struct sk_buff *skb) +{ + llc_conn_rtn_pdu(sk, skb); + return 0; +} + +int llc_conn_ac_disc_ind(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + u8 reason = 0; + int rc = 0; + + if (ev->type == LLC_CONN_EV_TYPE_PDU) { + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + if (LLC_PDU_IS_RSP(pdu) && + LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_DM) + reason = LLC_DISC_REASON_RX_DM_RSP_PDU; + else if (LLC_PDU_IS_CMD(pdu) && + LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_DISC) + reason = LLC_DISC_REASON_RX_DISC_CMD_PDU; + } else if (ev->type == LLC_CONN_EV_TYPE_ACK_TMR) + reason = LLC_DISC_REASON_ACK_TMR_EXP; + else + rc = -EINVAL; + if (!rc) { + ev->reason = reason; + ev->ind_prim = LLC_DISC_PRIM; + } + return rc; +} + +int llc_conn_ac_disc_confirm(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->reason = 
ev->status; + ev->cfm_prim = LLC_DISC_PRIM; + return 0; +} + +int llc_conn_ac_rst_ind(struct sock *sk, struct sk_buff *skb) +{ + u8 reason = 0; + int rc = 1; + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + struct llc_sock *llc = llc_sk(sk); + + switch (ev->type) { + case LLC_CONN_EV_TYPE_PDU: + if (LLC_PDU_IS_RSP(pdu) && + LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_FRMR) { + reason = LLC_RESET_REASON_LOCAL; + rc = 0; + } else if (LLC_PDU_IS_CMD(pdu) && + LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_SABME) { + reason = LLC_RESET_REASON_REMOTE; + rc = 0; + } + break; + case LLC_CONN_EV_TYPE_ACK_TMR: + case LLC_CONN_EV_TYPE_P_TMR: + case LLC_CONN_EV_TYPE_REJ_TMR: + case LLC_CONN_EV_TYPE_BUSY_TMR: + if (llc->retry_count > llc->n2) { + reason = LLC_RESET_REASON_LOCAL; + rc = 0; + } + break; + } + if (!rc) { + ev->reason = reason; + ev->ind_prim = LLC_RESET_PRIM; + } + return rc; +} + +int llc_conn_ac_rst_confirm(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->reason = 0; + ev->cfm_prim = LLC_RESET_PRIM; + return 0; +} + +int llc_conn_ac_clear_remote_busy_if_f_eq_1(struct sock *sk, + struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + if (LLC_PDU_IS_RSP(pdu) && + LLC_PDU_TYPE_IS_I(pdu) && + LLC_I_PF_IS_1(pdu) && llc_sk(sk)->ack_pf) + llc_conn_ac_clear_remote_busy(sk, skb); + return 0; +} + +int llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2(struct sock *sk, + struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + if (llc->data_flag == 2) + del_timer(&llc->rej_sent_timer.timer); + return 0; +} + +int llc_conn_ac_send_disc_cmd_p_set_x(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct llc_sock *llc = llc_sk(sk); + struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0); + + if (nskb) { + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_CMD); + llc_pdu_init_as_disc_cmd(nskb, 1); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (unlikely(rc)) + goto free; + llc_conn_send_pdu(sk, nskb); + llc_conn_ac_set_p_flag_1(sk, skb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_dm_rsp_f_set_p(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct llc_sock *llc = llc_sk(sk); + struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0); + + if (nskb) { + struct llc_sap *sap = llc->sap; + u8 f_bit; + + llc_pdu_decode_pf_bit(skb, &f_bit); + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_dm_rsp(nskb, f_bit); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (unlikely(rc)) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_dm_rsp_f_set_1(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct llc_sock *llc = llc_sk(sk); + struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0); + + if (nskb) { + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_dm_rsp(nskb, 1); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (unlikely(rc)) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + 
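+/*
+ * Note on the pattern used by the PDU-sending actions in this file: each
+ * one allocates a frame, builds the LLC header, builds the MAC header and
+ * then either queues the frame or frees it on error. A hypothetical
+ * helper factoring out that sequence for U-type responses could look
+ * roughly like the sketch below (kept inline in the real actions because
+ * most of them also touch timers or connection flags around the send):
+ *
+ *	static int send_u_rsp(struct sock *sk,
+ *			      void (*init)(struct sk_buff *, u8), u8 pf_bit)
+ *	{
+ *		struct llc_sock *llc = llc_sk(sk);
+ *		struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev,
+ *						       LLC_PDU_TYPE_U, 0);
+ *		int rc = -ENOBUFS;
+ *
+ *		if (nskb) {
+ *			llc_pdu_header_init(nskb, LLC_PDU_TYPE_U,
+ *					    llc->sap->laddr.lsap,
+ *					    llc->daddr.lsap, LLC_PDU_RSP);
+ *			init(nskb, pf_bit);
+ *			rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr,
+ *					      llc->daddr.mac);
+ *			if (rc)
+ *				kfree_skb(nskb);
+ *			else
+ *				llc_conn_send_pdu(sk, nskb);
+ *		}
+ *		return rc;
+ *	}
+ */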
+int llc_conn_ac_send_frmr_rsp_f_set_x(struct sock *sk, struct sk_buff *skb) +{ + u8 f_bit; + int rc = -ENOBUFS; + struct sk_buff *nskb; + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + struct llc_sock *llc = llc_sk(sk); + + llc->rx_pdu_hdr = *((u32 *)pdu); + if (LLC_PDU_IS_CMD(pdu)) + llc_pdu_decode_pf_bit(skb, &f_bit); + else + f_bit = 0; + nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, + sizeof(struct llc_frmr_info)); + if (nskb) { + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_frmr_rsp(nskb, pdu, f_bit, llc->vS, + llc->vR, INCORRECT); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (unlikely(rc)) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_resend_frmr_rsp_f_set_0(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct llc_sock *llc = llc_sk(sk); + struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, + sizeof(struct llc_frmr_info)); + + if (nskb) { + struct llc_sap *sap = llc->sap; + struct llc_pdu_sn *pdu = (struct llc_pdu_sn *)&llc->rx_pdu_hdr; + + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_frmr_rsp(nskb, pdu, 0, llc->vS, + llc->vR, INCORRECT); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (unlikely(rc)) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_resend_frmr_rsp_f_set_p(struct sock *sk, struct sk_buff *skb) +{ + u8 f_bit; + int rc = -ENOBUFS; + struct sk_buff *nskb; + struct llc_sock *llc = llc_sk(sk); + + llc_pdu_decode_pf_bit(skb, &f_bit); + nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, + sizeof(struct llc_frmr_info)); + if (nskb) { + struct llc_sap *sap = llc->sap; + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_frmr_rsp(nskb, pdu, f_bit, llc->vS, + llc->vR, INCORRECT); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (unlikely(rc)) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_i_cmd_p_set_1(struct sock *sk, struct sk_buff *skb) +{ + int rc; + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(skb, LLC_PDU_TYPE_I, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_CMD); + llc_pdu_init_as_i_cmd(skb, 1, llc->vS, llc->vR); + rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); + if (likely(!rc)) { + llc_conn_send_pdu(sk, skb); + llc_conn_ac_inc_vs_by_1(sk, skb); + } + return rc; +} + +static int llc_conn_ac_send_i_cmd_p_set_0(struct sock *sk, struct sk_buff *skb) +{ + int rc; + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(skb, LLC_PDU_TYPE_I, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_CMD); + llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR); + rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); + if (likely(!rc)) { + llc_conn_send_pdu(sk, skb); + llc_conn_ac_inc_vs_by_1(sk, skb); + } + return rc; +} + +int llc_conn_ac_send_i_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) +{ + int rc; + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(skb, LLC_PDU_TYPE_I, sap->laddr.lsap, + llc->daddr.lsap, 
LLC_PDU_CMD); + llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR); + rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); + if (likely(!rc)) { + llc_conn_send_pdu(sk, skb); + llc_conn_ac_inc_vs_by_1(sk, skb); + } + return 0; +} + +int llc_conn_ac_resend_i_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + u8 nr = LLC_I_GET_NR(pdu); + + llc_conn_resend_i_pdu_as_cmd(sk, nr, 0); + return 0; +} + +int llc_conn_ac_resend_i_xxx_x_set_0_or_send_rr(struct sock *sk, + struct sk_buff *skb) +{ + u8 nr; + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + int rc = -ENOBUFS; + struct llc_sock *llc = llc_sk(sk); + struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0); + + if (nskb) { + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_rr_rsp(nskb, 0, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (likely(!rc)) + llc_conn_send_pdu(sk, nskb); + else + kfree_skb(skb); + } + if (rc) { + nr = LLC_I_GET_NR(pdu); + rc = 0; + llc_conn_resend_i_pdu_as_cmd(sk, nr, 0); + } + return rc; +} + +int llc_conn_ac_resend_i_rsp_f_set_1(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + u8 nr = LLC_I_GET_NR(pdu); + + llc_conn_resend_i_pdu_as_rsp(sk, nr, 1); + return 0; +} + +int llc_conn_ac_send_rej_cmd_p_set_1(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct llc_sock *llc = llc_sk(sk); + struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); + + if (nskb) { + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_CMD); + llc_pdu_init_as_rej_cmd(nskb, 1, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (unlikely(rc)) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_rej_rsp_f_set_1(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct llc_sock *llc = llc_sk(sk); + struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); + + if (nskb) { + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_rej_rsp(nskb, 1, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (unlikely(rc)) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_rej_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct llc_sock *llc = llc_sk(sk); + struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); + + if (nskb) { + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_rej_rsp(nskb, 0, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (unlikely(rc)) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_rnr_cmd_p_set_1(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct llc_sock *llc = llc_sk(sk); + struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); + + if (nskb) { + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, 
LLC_PDU_CMD); + llc_pdu_init_as_rnr_cmd(nskb, 1, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (unlikely(rc)) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_rnr_rsp_f_set_1(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct llc_sock *llc = llc_sk(sk); + struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); + + if (nskb) { + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_rnr_rsp(nskb, 1, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (unlikely(rc)) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_rnr_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct llc_sock *llc = llc_sk(sk); + struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); + + if (nskb) { + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_rnr_rsp(nskb, 0, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (unlikely(rc)) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_set_remote_busy(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + if (!llc->remote_busy_flag) { + llc->remote_busy_flag = 1; + mod_timer(&llc->busy_state_timer.timer, + jiffies + llc->busy_state_timer.expire); + } + return 0; +} + +int llc_conn_ac_opt_send_rnr_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct llc_sock *llc = llc_sk(sk); + struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); + + if (nskb) { + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_rnr_rsp(nskb, 0, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (unlikely(rc)) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_rr_cmd_p_set_1(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct llc_sock *llc = llc_sk(sk); + struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); + + if (nskb) { + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_CMD); + llc_pdu_init_as_rr_cmd(nskb, 1, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (unlikely(rc)) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_rr_rsp_f_set_1(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct llc_sock *llc = llc_sk(sk); + struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); + + if (nskb) { + struct llc_sap *sap = llc->sap; + u8 f_bit = 1; + + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_rr_rsp(nskb, f_bit, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (unlikely(rc)) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} 
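+/*
+ * How these actions get run: the connection state machine (llc_conn.c,
+ * not part of this hunk) matches an incoming event against a transition
+ * in the state tables (llc_c_st.c) and then walks that transition's
+ * NULL-terminated action array, calling each action with the socket and
+ * the event skb. Roughly (a sketch, not the exact code):
+ *
+ *	const llc_conn_action_t *a;
+ *
+ *	for (a = trans->ev_actions; a && *a; a++)
+ *		(*a)(sk, skb);
+ *
+ * which is why every action here takes (sk, skb) and returns an int.
+ */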
+ +int llc_conn_ac_send_ack_rsp_f_set_1(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct llc_sock *llc = llc_sk(sk); + struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); + + if (nskb) { + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_rr_rsp(nskb, 1, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (unlikely(rc)) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_rr_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct llc_sock *llc = llc_sk(sk); + struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); + + if (nskb) { + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_rr_rsp(nskb, 0, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (unlikely(rc)) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_ack_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct llc_sock *llc = llc_sk(sk); + struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); + + if (nskb) { + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_rr_rsp(nskb, 0, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (unlikely(rc)) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +void llc_conn_set_p_flag(struct sock *sk, u8 value) +{ + int state_changed = llc_sk(sk)->p_flag && !value; + + llc_sk(sk)->p_flag = value; + + if (state_changed) + sk->sk_state_change(sk); +} + +int llc_conn_ac_send_sabme_cmd_p_set_x(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct llc_sock *llc = llc_sk(sk); + struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0); + + if (nskb) { + struct llc_sap *sap = llc->sap; + u8 *dmac = llc->daddr.mac; + + if (llc->dev->flags & IFF_LOOPBACK) + dmac = llc->dev->dev_addr; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_CMD); + llc_pdu_init_as_sabme_cmd(nskb, 1); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, dmac); + if (unlikely(rc)) + goto free; + llc_conn_send_pdu(sk, nskb); + llc_conn_set_p_flag(sk, 1); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_ua_rsp_f_set_p(struct sock *sk, struct sk_buff *skb) +{ + u8 f_bit; + int rc = -ENOBUFS; + struct llc_sock *llc = llc_sk(sk); + struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0); + + llc_pdu_decode_pf_bit(skb, &f_bit); + if (nskb) { + struct llc_sap *sap = llc->sap; + + nskb->dev = llc->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_ua_rsp(nskb, f_bit); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (unlikely(rc)) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_set_s_flag_0(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->s_flag = 0; + return 0; +} + +int llc_conn_ac_set_s_flag_1(struct sock *sk, struct sk_buff *skb) 
+{ + llc_sk(sk)->s_flag = 1; + return 0; +} + +int llc_conn_ac_start_p_timer(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + llc_conn_set_p_flag(sk, 1); + mod_timer(&llc->pf_cycle_timer.timer, + jiffies + llc->pf_cycle_timer.expire); + return 0; +} + +/** + * llc_conn_ac_send_ack_if_needed - check if ack is needed + * @sk: current connection structure + * @skb: current event + * + * Checks number of received PDUs which have not been acknowledged, yet, + * If number of them reaches to "npta"(Number of PDUs To Acknowledge) then + * sends an RR response as acknowledgement for them. Returns 0 for + * success, 1 otherwise. + */ +int llc_conn_ac_send_ack_if_needed(struct sock *sk, struct sk_buff *skb) +{ + u8 pf_bit; + struct llc_sock *llc = llc_sk(sk); + + llc_pdu_decode_pf_bit(skb, &pf_bit); + llc->ack_pf |= pf_bit & 1; + if (!llc->ack_must_be_send) { + llc->first_pdu_Ns = llc->vR; + llc->ack_must_be_send = 1; + llc->ack_pf = pf_bit & 1; + } + if (((llc->vR - llc->first_pdu_Ns + 1 + LLC_2_SEQ_NBR_MODULO) + % LLC_2_SEQ_NBR_MODULO) >= llc->npta) { + llc_conn_ac_send_rr_rsp_f_set_ackpf(sk, skb); + llc->ack_must_be_send = 0; + llc->ack_pf = 0; + llc_conn_ac_inc_npta_value(sk, skb); + } + return 0; +} + +/** + * llc_conn_ac_rst_sendack_flag - resets ack_must_be_send flag + * @sk: current connection structure + * @skb: current event + * + * This action resets ack_must_be_send flag of given connection, this flag + * indicates if there is any PDU which has not been acknowledged yet. + * Returns 0 for success, 1 otherwise. + */ +int llc_conn_ac_rst_sendack_flag(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->ack_must_be_send = llc_sk(sk)->ack_pf = 0; + return 0; +} + +/** + * llc_conn_ac_send_i_rsp_f_set_ackpf - acknowledge received PDUs + * @sk: current connection structure + * @skb: current event + * + * Sends an I response PDU with f-bit set to ack_pf flag as acknowledge to + * all received PDUs which have not been acknowledged, yet. ack_pf flag is + * set to one if one PDU with p-bit set to one is received. Returns 0 for + * success, 1 otherwise. + */ +static int llc_conn_ac_send_i_rsp_f_set_ackpf(struct sock *sk, + struct sk_buff *skb) +{ + int rc; + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(skb, LLC_PDU_TYPE_I, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_i_cmd(skb, llc->ack_pf, llc->vS, llc->vR); + rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); + if (likely(!rc)) { + llc_conn_send_pdu(sk, skb); + llc_conn_ac_inc_vs_by_1(sk, skb); + } + return rc; +} + +/** + * llc_conn_ac_send_i_as_ack - sends an I-format PDU to acknowledge rx PDUs + * @sk: current connection structure. + * @skb: current event. + * + * This action sends an I-format PDU as acknowledge to received PDUs which + * have not been acknowledged, yet, if there is any. By using of this + * action number of acknowledgements decreases, this technic is called + * piggy backing. Returns 0 for success, 1 otherwise. + */ +int llc_conn_ac_send_i_as_ack(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + if (llc->ack_must_be_send) { + llc_conn_ac_send_i_rsp_f_set_ackpf(sk, skb); + llc->ack_must_be_send = 0 ; + llc->ack_pf = 0; + } else + llc_conn_ac_send_i_cmd_p_set_0(sk, skb); + return 0; +} + +/** + * llc_conn_ac_send_rr_rsp_f_set_ackpf - ack all rx PDUs not yet acked + * @sk: current connection structure. + * @skb: current event. 
+ * + * This action sends an RR response with f-bit set to ack_pf flag as + * acknowledge to all received PDUs which have not been acknowledged, yet, + * if there is any. ack_pf flag indicates if a PDU has been received with + * p-bit set to one. Returns 0 for success, 1 otherwise. + */ +static int llc_conn_ac_send_rr_rsp_f_set_ackpf(struct sock *sk, + struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct llc_sock *llc = llc_sk(sk); + struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); + + if (nskb) { + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_rr_rsp(nskb, llc->ack_pf, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (unlikely(rc)) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +/** + * llc_conn_ac_inc_npta_value - tries to make value of npta greater + * @sk: current connection structure. + * @skb: current event. + * + * After "inc_cntr" times calling of this action, "npta" increase by one. + * this action tries to make vale of "npta" greater as possible; number of + * acknowledgements decreases by increasing of "npta". Returns 0 for + * success, 1 otherwise. + */ +static int llc_conn_ac_inc_npta_value(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + if (!llc->inc_cntr) { + llc->dec_step = 0; + llc->dec_cntr = llc->inc_cntr = 2; + ++llc->npta; + if (llc->npta > (u8) ~LLC_2_SEQ_NBR_MODULO) + llc->npta = (u8) ~LLC_2_SEQ_NBR_MODULO; + } else + --llc->inc_cntr; + return 0; +} + +/** + * llc_conn_ac_adjust_npta_by_rr - decreases "npta" by one + * @sk: current connection structure. + * @skb: current event. + * + * After receiving "dec_cntr" times RR command, this action decreases + * "npta" by one. Returns 0 for success, 1 otherwise. + */ +int llc_conn_ac_adjust_npta_by_rr(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + if (!llc->connect_step && !llc->remote_busy_flag) { + if (!llc->dec_step) { + if (!llc->dec_cntr) { + llc->inc_cntr = llc->dec_cntr = 2; + if (llc->npta > 0) + llc->npta = llc->npta - 1; + } else + llc->dec_cntr -=1; + } + } else + llc->connect_step = 0 ; + return 0; +} + +/** + * llc_conn_ac_adjust_npta_by_rnr - decreases "npta" by one + * @sk: current connection structure. + * @skb: current event. + * + * After receiving "dec_cntr" times RNR command, this action decreases + * "npta" by one. Returns 0 for success, 1 otherwise. + */ +int llc_conn_ac_adjust_npta_by_rnr(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + if (llc->remote_busy_flag) + if (!llc->dec_step) { + if (!llc->dec_cntr) { + llc->inc_cntr = llc->dec_cntr = 2; + if (llc->npta > 0) + --llc->npta; + } else + --llc->dec_cntr; + } + return 0; +} + +/** + * llc_conn_ac_dec_tx_win_size - decreases tx window size + * @sk: current connection structure. + * @skb: current event. + * + * After receiving of a REJ command or response, transmit window size is + * decreased by number of PDUs which are outstanding yet. Returns 0 for + * success, 1 otherwise. 
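+ *
+ * For example, with a transmit window of k == 7 and three I PDUs still
+ * waiting in pdu_unack_q, the window drops to 4; if at least k PDUs are
+ * outstanding, k is clamped to 1.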
+ */ +int llc_conn_ac_dec_tx_win_size(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + u8 unacked_pdu = skb_queue_len(&llc->pdu_unack_q); + + if (llc->k - unacked_pdu < 1) + llc->k = 1; + else + llc->k -= unacked_pdu; + return 0; +} + +/** + * llc_conn_ac_inc_tx_win_size - tx window size is inc by 1 + * @sk: current connection structure. + * @skb: current event. + * + * After receiving an RR response with f-bit set to one, transmit window + * size is increased by one. Returns 0 for success, 1 otherwise. + */ +int llc_conn_ac_inc_tx_win_size(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + llc->k += 1; + if (llc->k > (u8) ~LLC_2_SEQ_NBR_MODULO) + llc->k = (u8) ~LLC_2_SEQ_NBR_MODULO; + return 0; +} + +int llc_conn_ac_stop_all_timers(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + del_timer(&llc->pf_cycle_timer.timer); + del_timer(&llc->ack_timer.timer); + del_timer(&llc->rej_sent_timer.timer); + del_timer(&llc->busy_state_timer.timer); + llc->ack_must_be_send = 0; + llc->ack_pf = 0; + return 0; +} + +int llc_conn_ac_stop_other_timers(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + del_timer(&llc->rej_sent_timer.timer); + del_timer(&llc->pf_cycle_timer.timer); + del_timer(&llc->busy_state_timer.timer); + llc->ack_must_be_send = 0; + llc->ack_pf = 0; + return 0; +} + +int llc_conn_ac_start_ack_timer(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + mod_timer(&llc->ack_timer.timer, jiffies + llc->ack_timer.expire); + return 0; +} + +int llc_conn_ac_start_rej_timer(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + mod_timer(&llc->rej_sent_timer.timer, + jiffies + llc->rej_sent_timer.expire); + return 0; +} + +int llc_conn_ac_start_ack_tmr_if_not_running(struct sock *sk, + struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + if (!timer_pending(&llc->ack_timer.timer)) + mod_timer(&llc->ack_timer.timer, + jiffies + llc->ack_timer.expire); + return 0; +} + +int llc_conn_ac_stop_ack_timer(struct sock *sk, struct sk_buff *skb) +{ + del_timer(&llc_sk(sk)->ack_timer.timer); + return 0; +} + +int llc_conn_ac_stop_p_timer(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + del_timer(&llc->pf_cycle_timer.timer); + llc_conn_set_p_flag(sk, 0); + return 0; +} + +int llc_conn_ac_stop_rej_timer(struct sock *sk, struct sk_buff *skb) +{ + del_timer(&llc_sk(sk)->rej_sent_timer.timer); + return 0; +} + +int llc_conn_ac_upd_nr_received(struct sock *sk, struct sk_buff *skb) +{ + int acked; + u16 unacked = 0; + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + struct llc_sock *llc = llc_sk(sk); + + llc->last_nr = PDU_SUPV_GET_Nr(pdu); + acked = llc_conn_remove_acked_pdus(sk, llc->last_nr, &unacked); + /* On loopback we don't queue I frames in unack_pdu_q queue. */ + if (acked > 0 || (llc->dev->flags & IFF_LOOPBACK)) { + llc->retry_count = 0; + del_timer(&llc->ack_timer.timer); + if (llc->failed_data_req) { + /* already, we did not accept data from upper layer + * (tx_window full or unacceptable state). Now, we + * can send data and must inform to upper layer. 
+ */ + llc->failed_data_req = 0; + llc_conn_ac_data_confirm(sk, skb); + } + if (unacked) + mod_timer(&llc->ack_timer.timer, + jiffies + llc->ack_timer.expire); + } else if (llc->failed_data_req) { + u8 f_bit; + + llc_pdu_decode_pf_bit(skb, &f_bit); + if (f_bit == 1) { + llc->failed_data_req = 0; + llc_conn_ac_data_confirm(sk, skb); + } + } + return 0; +} + +int llc_conn_ac_upd_p_flag(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + if (LLC_PDU_IS_RSP(pdu)) { + u8 f_bit; + + llc_pdu_decode_pf_bit(skb, &f_bit); + if (f_bit) { + llc_conn_set_p_flag(sk, 0); + llc_conn_ac_stop_p_timer(sk, skb); + } + } + return 0; +} + +int llc_conn_ac_set_data_flag_2(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->data_flag = 2; + return 0; +} + +int llc_conn_ac_set_data_flag_0(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->data_flag = 0; + return 0; +} + +int llc_conn_ac_set_data_flag_1(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->data_flag = 1; + return 0; +} + +int llc_conn_ac_set_data_flag_1_if_data_flag_eq_0(struct sock *sk, + struct sk_buff *skb) +{ + if (!llc_sk(sk)->data_flag) + llc_sk(sk)->data_flag = 1; + return 0; +} + +int llc_conn_ac_set_p_flag_0(struct sock *sk, struct sk_buff *skb) +{ + llc_conn_set_p_flag(sk, 0); + return 0; +} + +static int llc_conn_ac_set_p_flag_1(struct sock *sk, struct sk_buff *skb) +{ + llc_conn_set_p_flag(sk, 1); + return 0; +} + +int llc_conn_ac_set_remote_busy_0(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->remote_busy_flag = 0; + return 0; +} + +int llc_conn_ac_set_cause_flag_0(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->cause_flag = 0; + return 0; +} + +int llc_conn_ac_set_cause_flag_1(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->cause_flag = 1; + return 0; +} + +int llc_conn_ac_set_retry_cnt_0(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->retry_count = 0; + return 0; +} + +int llc_conn_ac_inc_retry_cnt_by_1(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->retry_count++; + return 0; +} + +int llc_conn_ac_set_vr_0(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->vR = 0; + return 0; +} + +int llc_conn_ac_inc_vr_by_1(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->vR = PDU_GET_NEXT_Vr(llc_sk(sk)->vR); + return 0; +} + +int llc_conn_ac_set_vs_0(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->vS = 0; + return 0; +} + +int llc_conn_ac_set_vs_nr(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->vS = llc_sk(sk)->last_nr; + return 0; +} + +static int llc_conn_ac_inc_vs_by_1(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->vS = (llc_sk(sk)->vS + 1) % LLC_2_SEQ_NBR_MODULO; + return 0; +} + +static void llc_conn_tmr_common_cb(unsigned long timeout_data, u8 type) +{ + struct sock *sk = (struct sock *)timeout_data; + struct sk_buff *skb = alloc_skb(0, GFP_ATOMIC); + + bh_lock_sock(sk); + if (skb) { + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + skb_set_owner_r(skb, sk); + ev->type = type; + llc_process_tmr_ev(sk, skb); + } + bh_unlock_sock(sk); +} + +void llc_conn_pf_cycle_tmr_cb(unsigned long timeout_data) +{ + llc_conn_tmr_common_cb(timeout_data, LLC_CONN_EV_TYPE_P_TMR); +} + +void llc_conn_busy_tmr_cb(unsigned long timeout_data) +{ + llc_conn_tmr_common_cb(timeout_data, LLC_CONN_EV_TYPE_BUSY_TMR); +} + +void llc_conn_ack_tmr_cb(unsigned long timeout_data) +{ + llc_conn_tmr_common_cb(timeout_data, LLC_CONN_EV_TYPE_ACK_TMR); +} + +void llc_conn_rej_tmr_cb(unsigned long timeout_data) +{ + llc_conn_tmr_common_cb(timeout_data, 
LLC_CONN_EV_TYPE_REJ_TMR); +} + +int llc_conn_ac_rst_vs(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->X = llc_sk(sk)->vS; + llc_conn_ac_set_vs_nr(sk, skb); + return 0; +} + +int llc_conn_ac_upd_vs(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + u8 nr = PDU_SUPV_GET_Nr(pdu); + + if (llc_circular_between(llc_sk(sk)->vS, nr, llc_sk(sk)->X)) + llc_conn_ac_set_vs_nr(sk, skb); + return 0; +} + +/* + * Non-standard actions; these not contained in IEEE specification; for + * our own usage + */ +/** + * llc_conn_disc - removes connection from SAP list and frees it + * @sk: closed connection + * @skb: occurred event + */ +int llc_conn_disc(struct sock *sk, struct sk_buff *skb) +{ + /* FIXME: this thing seems to want to die */ + return 0; +} + +/** + * llc_conn_reset - resets connection + * @sk : reseting connection. + * @skb: occurred event. + * + * Stop all timers, empty all queues and reset all flags. + */ +int llc_conn_reset(struct sock *sk, struct sk_buff *skb) +{ + llc_sk_reset(sk); + return 0; +} + +/** + * llc_circular_between - designates that b is between a and c or not + * @a: lower bound + * @b: element to see if is between a and b + * @c: upper bound + * + * This function designates that b is between a and c or not (for example, + * 0 is between 127 and 1). Returns 1 if b is between a and c, 0 + * otherwise. + */ +u8 llc_circular_between(u8 a, u8 b, u8 c) +{ + b = b - a; + c = c - a; + return b <= c; +} + +/** + * llc_process_tmr_ev - timer backend + * @sk: active connection + * @skb: occurred event + * + * This function is called from timer callback functions. When connection + * is busy (during sending a data frame) timer expiration event must be + * queued. Otherwise this event can be sent to connection state machine. + * Queued events will process by llc_backlog_rcv function after sending + * data frame. + */ +static void llc_process_tmr_ev(struct sock *sk, struct sk_buff *skb) +{ + if (llc_sk(sk)->state == LLC_CONN_OUT_OF_SVC) { + printk(KERN_WARNING "%s: timer called on closed connection\n", + __func__); + kfree_skb(skb); + } else { + if (!sock_owned_by_user(sk)) + llc_conn_state_process(sk, skb); + else { + llc_set_backlog_type(skb, LLC_EVENT); + __sk_add_backlog(sk, skb); + } + } +} diff --git a/net/llc/llc_c_ev.c b/net/llc/llc_c_ev.c new file mode 100644 index 00000000..523fdd1c --- /dev/null +++ b/net/llc/llc_c_ev.c @@ -0,0 +1,748 @@ +/* + * llc_c_ev.c - Connection component state transition event qualifiers + * + * A 'state' consists of a number of possible event matching functions, + * the actions associated with each being executed when that event is + * matched; a 'state machine' accepts events in a serial fashion from an + * event queue. Each event is passed to each successive event matching + * function until a match is made (the event matching function returns + * success, or '0') or the list of event matching functions is exhausted. + * If a match is made, the actions associated with the event are executed + * and the state is changed to that event's transition state. Before some + * events are recognized, even after a match has been made, a certain + * number of 'event qualifier' functions must also be executed. If these + * all execute successfully, then the event is finally executed. + * + * These event functions must return 0 for success, to show a matched + * event, of 1 if the event does not match. Event qualifier functions + * must return a 0 for success or a non-zero for failure. 
Each function + * is simply responsible for verifying one single thing and returning + * either a success or failure. + * + * All of followed event functions are described in 802.2 LLC Protocol + * standard document except two functions that we added that will explain + * in their comments, at below. + * + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include + +#if 1 +#define dprintk(args...) printk(KERN_DEBUG args) +#else +#define dprintk(args...) +#endif + +/** + * llc_util_ns_inside_rx_window - check if sequence number is in rx window + * @ns: sequence number of received pdu. + * @vr: sequence number which receiver expects to receive. + * @rw: receive window size of receiver. + * + * Checks if sequence number of received PDU is in range of receive + * window. Returns 0 for success, 1 otherwise + */ +static u16 llc_util_ns_inside_rx_window(u8 ns, u8 vr, u8 rw) +{ + return !llc_circular_between(vr, ns, + (vr + rw - 1) % LLC_2_SEQ_NBR_MODULO); +} + +/** + * llc_util_nr_inside_tx_window - check if sequence number is in tx window + * @sk: current connection. + * @nr: N(R) of received PDU. + * + * This routine checks if N(R) of received PDU is in range of transmit + * window; on the other hand checks if received PDU acknowledges some + * outstanding PDUs that are in transmit window. Returns 0 for success, 1 + * otherwise. + */ +static u16 llc_util_nr_inside_tx_window(struct sock *sk, u8 nr) +{ + u8 nr1, nr2; + struct sk_buff *skb; + struct llc_pdu_sn *pdu; + struct llc_sock *llc = llc_sk(sk); + int rc = 0; + + if (llc->dev->flags & IFF_LOOPBACK) + goto out; + rc = 1; + if (skb_queue_empty(&llc->pdu_unack_q)) + goto out; + skb = skb_peek(&llc->pdu_unack_q); + pdu = llc_pdu_sn_hdr(skb); + nr1 = LLC_I_GET_NS(pdu); + skb = skb_peek_tail(&llc->pdu_unack_q); + pdu = llc_pdu_sn_hdr(skb); + nr2 = LLC_I_GET_NS(pdu); + rc = !llc_circular_between(nr1, nr, (nr2 + 1) % LLC_2_SEQ_NBR_MODULO); +out: + return rc; +} + +int llc_conn_ev_conn_req(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + return ev->prim == LLC_CONN_PRIM && + ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; +} + +int llc_conn_ev_data_req(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + return ev->prim == LLC_DATA_PRIM && + ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; +} + +int llc_conn_ev_disc_req(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + return ev->prim == LLC_DISC_PRIM && + ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; +} + +int llc_conn_ev_rst_req(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + return ev->prim == LLC_RESET_PRIM && + ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; +} + +int llc_conn_ev_local_busy_detected(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + return ev->type == LLC_CONN_EV_TYPE_SIMPLE && + ev->prim_type == LLC_CONN_EV_LOCAL_BUSY_DETECTED ? 
0 : 1; +} + +int llc_conn_ev_local_busy_cleared(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + return ev->type == LLC_CONN_EV_TYPE_SIMPLE && + ev->prim_type == LLC_CONN_EV_LOCAL_BUSY_CLEARED ? 0 : 1; +} + +int llc_conn_ev_rx_bad_pdu(struct sock *sk, struct sk_buff *skb) +{ + return 1; +} + +int llc_conn_ev_rx_disc_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_DISC ? 0 : 1; +} + +int llc_conn_ev_rx_dm_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_DM ? 0 : 1; +} + +int llc_conn_ev_rx_frmr_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_FRMR ? 0 : 1; +} + +int llc_conn_ev_rx_i_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return llc_conn_space(sk, skb) && + LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && + LLC_I_PF_IS_0(pdu) && + LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; +} + +int llc_conn_ev_rx_i_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return llc_conn_space(sk, skb) && + LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && + LLC_I_PF_IS_1(pdu) && + LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; +} + +int llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns(struct sock *sk, + struct sk_buff *skb) +{ + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + const u8 vr = llc_sk(sk)->vR; + const u8 ns = LLC_I_GET_NS(pdu); + + return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && + LLC_I_PF_IS_0(pdu) && ns != vr && + !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; +} + +int llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns(struct sock *sk, + struct sk_buff *skb) +{ + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + const u8 vr = llc_sk(sk)->vR; + const u8 ns = LLC_I_GET_NS(pdu); + + return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && + LLC_I_PF_IS_1(pdu) && ns != vr && + !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; +} + +int llc_conn_ev_rx_i_cmd_pbit_set_x_inval_ns(struct sock *sk, + struct sk_buff *skb) +{ + const struct llc_pdu_sn * pdu = llc_pdu_sn_hdr(skb); + const u8 vr = llc_sk(sk)->vR; + const u8 ns = LLC_I_GET_NS(pdu); + const u16 rc = LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && + ns != vr && + llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; + if (!rc) + dprintk("%s: matched, state=%d, ns=%d, vr=%d\n", + __func__, llc_sk(sk)->state, ns, vr); + return rc; +} + +int llc_conn_ev_rx_i_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return llc_conn_space(sk, skb) && + LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && + LLC_I_PF_IS_0(pdu) && + LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; +} + +int llc_conn_ev_rx_i_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && + LLC_I_PF_IS_1(pdu) && + LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 
0 : 1; +} + +int llc_conn_ev_rx_i_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return llc_conn_space(sk, skb) && + LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && + LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; +} + +int llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns(struct sock *sk, + struct sk_buff *skb) +{ + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + const u8 vr = llc_sk(sk)->vR; + const u8 ns = LLC_I_GET_NS(pdu); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && + LLC_I_PF_IS_0(pdu) && ns != vr && + !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; +} + +int llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns(struct sock *sk, + struct sk_buff *skb) +{ + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + const u8 vr = llc_sk(sk)->vR; + const u8 ns = LLC_I_GET_NS(pdu); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && + LLC_I_PF_IS_1(pdu) && ns != vr && + !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; +} + +int llc_conn_ev_rx_i_rsp_fbit_set_x_unexpd_ns(struct sock *sk, + struct sk_buff *skb) +{ + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + const u8 vr = llc_sk(sk)->vR; + const u8 ns = LLC_I_GET_NS(pdu); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && ns != vr && + !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; +} + +int llc_conn_ev_rx_i_rsp_fbit_set_x_inval_ns(struct sock *sk, + struct sk_buff *skb) +{ + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + const u8 vr = llc_sk(sk)->vR; + const u8 ns = LLC_I_GET_NS(pdu); + const u16 rc = LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && + ns != vr && + llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; + if (!rc) + dprintk("%s: matched, state=%d, ns=%d, vr=%d\n", + __func__, llc_sk(sk)->state, ns, vr); + return rc; +} + +int llc_conn_ev_rx_rej_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_0(pdu) && + LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_REJ ? 0 : 1; +} + +int llc_conn_ev_rx_rej_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_1(pdu) && + LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_REJ ? 0 : 1; +} + +int llc_conn_ev_rx_rej_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_0(pdu) && + LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ ? 0 : 1; +} + +int llc_conn_ev_rx_rej_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_1(pdu) && + LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ ? 0 : 1; +} + +int llc_conn_ev_rx_rej_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ ? 0 : 1; +} + +int llc_conn_ev_rx_rnr_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_0(pdu) && + LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RNR ? 
0 : 1; +} + +int llc_conn_ev_rx_rnr_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_1(pdu) && + LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RNR ? 0 : 1; +} + +int llc_conn_ev_rx_rnr_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_0(pdu) && + LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RNR ? 0 : 1; +} + +int llc_conn_ev_rx_rnr_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_1(pdu) && + LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RNR ? 0 : 1; +} + +int llc_conn_ev_rx_rr_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_0(pdu) && + LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RR ? 0 : 1; +} + +int llc_conn_ev_rx_rr_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_1(pdu) && + LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RR ? 0 : 1; +} + +int llc_conn_ev_rx_rr_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return llc_conn_space(sk, skb) && + LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_0(pdu) && + LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RR ? 0 : 1; +} + +int llc_conn_ev_rx_rr_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return llc_conn_space(sk, skb) && + LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_1(pdu) && + LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RR ? 0 : 1; +} + +int llc_conn_ev_rx_sabme_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_SABME ? 0 : 1; +} + +int llc_conn_ev_rx_ua_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_UA ? 
0 : 1; +} + +int llc_conn_ev_rx_xxx_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) +{ + u16 rc = 1; + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + if (LLC_PDU_IS_CMD(pdu)) { + if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) { + if (LLC_I_PF_IS_1(pdu)) + rc = 0; + } else if (LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PF_IS_1(pdu)) + rc = 0; + } + return rc; +} + +int llc_conn_ev_rx_xxx_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb) +{ + u16 rc = 1; + const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + if (LLC_PDU_IS_CMD(pdu)) { + if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) + rc = 0; + else if (LLC_PDU_TYPE_IS_U(pdu)) + switch (LLC_U_PDU_CMD(pdu)) { + case LLC_2_PDU_CMD_SABME: + case LLC_2_PDU_CMD_DISC: + rc = 0; + break; + } + } + return rc; +} + +int llc_conn_ev_rx_xxx_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) +{ + u16 rc = 1; + const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + if (LLC_PDU_IS_RSP(pdu)) { + if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) + rc = 0; + else if (LLC_PDU_TYPE_IS_U(pdu)) + switch (LLC_U_PDU_RSP(pdu)) { + case LLC_2_PDU_RSP_UA: + case LLC_2_PDU_RSP_DM: + case LLC_2_PDU_RSP_FRMR: + rc = 0; + break; + } + } + + return rc; +} + +int llc_conn_ev_rx_zzz_cmd_pbit_set_x_inval_nr(struct sock *sk, + struct sk_buff *skb) +{ + u16 rc = 1; + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + const u8 vs = llc_sk(sk)->vS; + const u8 nr = LLC_I_GET_NR(pdu); + + if (LLC_PDU_IS_CMD(pdu) && + (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) && + nr != vs && llc_util_nr_inside_tx_window(sk, nr)) { + dprintk("%s: matched, state=%d, vs=%d, nr=%d\n", + __func__, llc_sk(sk)->state, vs, nr); + rc = 0; + } + return rc; +} + +int llc_conn_ev_rx_zzz_rsp_fbit_set_x_inval_nr(struct sock *sk, + struct sk_buff *skb) +{ + u16 rc = 1; + const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + const u8 vs = llc_sk(sk)->vS; + const u8 nr = LLC_I_GET_NR(pdu); + + if (LLC_PDU_IS_RSP(pdu) && + (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) && + nr != vs && llc_util_nr_inside_tx_window(sk, nr)) { + rc = 0; + dprintk("%s: matched, state=%d, vs=%d, nr=%d\n", + __func__, llc_sk(sk)->state, vs, nr); + } + return rc; +} + +int llc_conn_ev_rx_any_frame(struct sock *sk, struct sk_buff *skb) +{ + return 0; +} + +int llc_conn_ev_p_tmr_exp(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + return ev->type != LLC_CONN_EV_TYPE_P_TMR; +} + +int llc_conn_ev_ack_tmr_exp(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + return ev->type != LLC_CONN_EV_TYPE_ACK_TMR; +} + +int llc_conn_ev_rej_tmr_exp(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + return ev->type != LLC_CONN_EV_TYPE_REJ_TMR; +} + +int llc_conn_ev_busy_tmr_exp(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + return ev->type != LLC_CONN_EV_TYPE_BUSY_TMR; +} + +int llc_conn_ev_init_p_f_cycle(struct sock *sk, struct sk_buff *skb) +{ + return 1; +} + +int llc_conn_ev_tx_buffer_full(struct sock *sk, struct sk_buff *skb) +{ + const struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + return ev->type == LLC_CONN_EV_TYPE_SIMPLE && + ev->prim_type == LLC_CONN_EV_TX_BUFF_FULL ? 
0 : 1;
+}
+
+/* Event qualifier functions
+ *
+ * These functions verify the value of a state flag associated with the
+ * connection and return 0 when the qualifier holds or a non-zero value
+ * when it does not.
+ */
+int llc_conn_ev_qlfy_data_flag_eq_1(struct sock *sk, struct sk_buff *skb)
+{
+	return llc_sk(sk)->data_flag != 1;
+}
+
+int llc_conn_ev_qlfy_data_flag_eq_0(struct sock *sk, struct sk_buff *skb)
+{
+	return llc_sk(sk)->data_flag;
+}
+
+int llc_conn_ev_qlfy_data_flag_eq_2(struct sock *sk, struct sk_buff *skb)
+{
+	return llc_sk(sk)->data_flag != 2;
+}
+
+int llc_conn_ev_qlfy_p_flag_eq_1(struct sock *sk, struct sk_buff *skb)
+{
+	return llc_sk(sk)->p_flag != 1;
+}
+
+/**
+ * llc_conn_ev_qlfy_last_frame_eq_1 - checks if frame is last in tx window
+ * @sk: current connection structure.
+ * @skb: current event.
+ *
+ * Checks whether the frame being sent is the last frame of the transmit
+ * window. It is used to send the last frame of the transmit window as an
+ * I-format command with the P bit set to one. Returns 0 if the frame is
+ * the last one in the window, 1 otherwise.
+ */
+int llc_conn_ev_qlfy_last_frame_eq_1(struct sock *sk, struct sk_buff *skb)
+{
+	return !(skb_queue_len(&llc_sk(sk)->pdu_unack_q) + 1 == llc_sk(sk)->k);
+}
+
+/**
+ * llc_conn_ev_qlfy_last_frame_eq_0 - checks if frame isn't last in tx window
+ * @sk: current connection structure.
+ * @skb: current event.
+ *
+ * Checks whether the frame being sent is not the last frame of the
+ * transmit window. Returns 0 if the frame is not the last one in the
+ * window, 1 otherwise.
+ */
+int llc_conn_ev_qlfy_last_frame_eq_0(struct sock *sk, struct sk_buff *skb)
+{
+	return skb_queue_len(&llc_sk(sk)->pdu_unack_q) + 1 == llc_sk(sk)->k;
+}
+
+int llc_conn_ev_qlfy_p_flag_eq_0(struct sock *sk, struct sk_buff *skb)
+{
+	return llc_sk(sk)->p_flag;
+}
+
+int llc_conn_ev_qlfy_p_flag_eq_f(struct sock *sk, struct sk_buff *skb)
+{
+	u8 f_bit;
+
+	llc_pdu_decode_pf_bit(skb, &f_bit);
+	return llc_sk(sk)->p_flag == f_bit ?
0 : 1; +} + +int llc_conn_ev_qlfy_remote_busy_eq_0(struct sock *sk, struct sk_buff *skb) +{ + return llc_sk(sk)->remote_busy_flag; +} + +int llc_conn_ev_qlfy_remote_busy_eq_1(struct sock *sk, struct sk_buff *skb) +{ + return !llc_sk(sk)->remote_busy_flag; +} + +int llc_conn_ev_qlfy_retry_cnt_lt_n2(struct sock *sk, struct sk_buff *skb) +{ + return !(llc_sk(sk)->retry_count < llc_sk(sk)->n2); +} + +int llc_conn_ev_qlfy_retry_cnt_gte_n2(struct sock *sk, struct sk_buff *skb) +{ + return !(llc_sk(sk)->retry_count >= llc_sk(sk)->n2); +} + +int llc_conn_ev_qlfy_s_flag_eq_1(struct sock *sk, struct sk_buff *skb) +{ + return !llc_sk(sk)->s_flag; +} + +int llc_conn_ev_qlfy_s_flag_eq_0(struct sock *sk, struct sk_buff *skb) +{ + return llc_sk(sk)->s_flag; +} + +int llc_conn_ev_qlfy_cause_flag_eq_1(struct sock *sk, struct sk_buff *skb) +{ + return !llc_sk(sk)->cause_flag; +} + +int llc_conn_ev_qlfy_cause_flag_eq_0(struct sock *sk, struct sk_buff *skb) +{ + return llc_sk(sk)->cause_flag; +} + +int llc_conn_ev_qlfy_set_status_conn(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->status = LLC_STATUS_CONN; + return 0; +} + +int llc_conn_ev_qlfy_set_status_disc(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->status = LLC_STATUS_DISC; + return 0; +} + +int llc_conn_ev_qlfy_set_status_failed(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->status = LLC_STATUS_FAILED; + return 0; +} + +int llc_conn_ev_qlfy_set_status_remote_busy(struct sock *sk, + struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->status = LLC_STATUS_REMOTE_BUSY; + return 0; +} + +int llc_conn_ev_qlfy_set_status_refuse(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->status = LLC_STATUS_REFUSE; + return 0; +} + +int llc_conn_ev_qlfy_set_status_conflict(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->status = LLC_STATUS_CONFLICT; + return 0; +} + +int llc_conn_ev_qlfy_set_status_rst_done(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->status = LLC_STATUS_RESET_DONE; + return 0; +} diff --git a/net/llc/llc_c_st.c b/net/llc/llc_c_st.c new file mode 100644 index 00000000..818a9428 --- /dev/null +++ b/net/llc/llc_c_st.c @@ -0,0 +1,4946 @@ +/* + * llc_c_st.c - This module contains state transition of connection component. + * + * Description of event functions and actions there is in 802.2 LLC standard, + * or in "llc_c_ac.c" and "llc_c_ev.c" modules. + * + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. 
+ */
+#include <linux/types.h>
+#include <net/llc_if.h>
+#include <net/llc_sap.h>
+#include <net/llc_c_ev.h>
+#include <net/llc_c_ac.h>
+#include <net/llc_c_st.h>
+
+#define NONE NULL
+
+/* COMMON CONNECTION STATE transitions
+ * Common transitions for
+ * LLC_CONN_STATE_NORMAL,
+ * LLC_CONN_STATE_BUSY,
+ * LLC_CONN_STATE_REJ,
+ * LLC_CONN_STATE_AWAIT,
+ * LLC_CONN_STATE_AWAIT_BUSY and
+ * LLC_CONN_STATE_AWAIT_REJ states
+ */
+/* State transitions for LLC_CONN_EV_DISC_REQ event */
+static llc_conn_action_t llc_common_actions_1[] = {
+	[0] = llc_conn_ac_send_disc_cmd_p_set_x,
+	[1] = llc_conn_ac_start_ack_timer,
+	[2] = llc_conn_ac_stop_other_timers,
+	[3] = llc_conn_ac_set_retry_cnt_0,
+	[4] = llc_conn_ac_set_cause_flag_1,
+	[5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_1 = {
+	.ev = llc_conn_ev_disc_req,
+	.next_state = LLC_CONN_STATE_D_CONN,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_common_actions_1,
+};
+
+/* State transitions for LLC_CONN_EV_RESET_REQ event */
+static llc_conn_action_t llc_common_actions_2[] = {
+	[0] = llc_conn_ac_send_sabme_cmd_p_set_x,
+	[1] = llc_conn_ac_start_ack_timer,
+	[2] = llc_conn_ac_stop_other_timers,
+	[3] = llc_conn_ac_set_retry_cnt_0,
+	[4] = llc_conn_ac_set_cause_flag_1,
+	[5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_2 = {
+	.ev = llc_conn_ev_rst_req,
+	.next_state = LLC_CONN_STATE_RESET,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_common_actions_2,
+};
+
+/* State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event */
+static llc_conn_action_t llc_common_actions_3[] = {
+	[0] = llc_conn_ac_stop_all_timers,
+	[1] = llc_conn_ac_set_vs_0,
+	[2] = llc_conn_ac_set_vr_0,
+	[3] = llc_conn_ac_send_ua_rsp_f_set_p,
+	[4] = llc_conn_ac_rst_ind,
+	[5] = llc_conn_ac_set_p_flag_0,
+	[6] = llc_conn_ac_set_remote_busy_0,
+	[7] = llc_conn_reset,
+	[8] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_3 = {
+	.ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x,
+	.next_state = LLC_CONN_STATE_NORMAL,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_common_actions_3,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event */
+static llc_conn_action_t llc_common_actions_4[] = {
+	[0] = llc_conn_ac_stop_all_timers,
+	[1] = llc_conn_ac_send_ua_rsp_f_set_p,
+	[2] = llc_conn_ac_disc_ind,
+	[3] = llc_conn_disc,
+	[4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_4 = {
+	.ev = llc_conn_ev_rx_disc_cmd_pbit_set_x,
+	.next_state = LLC_CONN_STATE_ADM,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_common_actions_4,
+};
+
+/* State transitions for LLC_CONN_EV_RX_FRMR_RSP_Fbit_SET_X event */
+static llc_conn_action_t llc_common_actions_5[] = {
+	[0] = llc_conn_ac_send_sabme_cmd_p_set_x,
+	[1] = llc_conn_ac_start_ack_timer,
+	[2] = llc_conn_ac_stop_other_timers,
+	[3] = llc_conn_ac_set_retry_cnt_0,
+	[4] = llc_conn_ac_rst_ind,
+	[5] = llc_conn_ac_set_cause_flag_0,
+	[6] = llc_conn_reset,
+	[7] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_5 = {
+	.ev = llc_conn_ev_rx_frmr_rsp_fbit_set_x,
+	.next_state = LLC_CONN_STATE_RESET,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_common_actions_5,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event */
+static llc_conn_action_t llc_common_actions_6[] = {
+	[0] = llc_conn_ac_disc_ind,
+	[1] = llc_conn_ac_stop_all_timers,
+	[2] = llc_conn_disc,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_6 = {
+	.ev = llc_conn_ev_rx_dm_rsp_fbit_set_x,
+	.next_state = LLC_CONN_STATE_ADM,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_common_actions_6,
+};
+
+/*
State transitions for LLC_CONN_EV_RX_ZZZ_CMD_Pbit_SET_X_INVAL_Nr event */ +static llc_conn_action_t llc_common_actions_7a[] = { + [0] = llc_conn_ac_send_frmr_rsp_f_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_7a = { + .ev = llc_conn_ev_rx_zzz_cmd_pbit_set_x_inval_nr, + .next_state = LLC_CONN_STATE_ERROR, + .ev_qualifiers = NONE, + .ev_actions = llc_common_actions_7a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_X_INVAL_Ns event */ +static llc_conn_action_t llc_common_actions_7b[] = { + [0] = llc_conn_ac_send_frmr_rsp_f_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_7b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_x_inval_ns, + .next_state = LLC_CONN_STATE_ERROR, + .ev_qualifiers = NONE, + .ev_actions = llc_common_actions_7b, +}; + +/* State transitions for LLC_CONN_EV_RX_ZZZ_RSP_Fbit_SET_X_INVAL_Nr event */ +static llc_conn_action_t llc_common_actions_8a[] = { + [0] = llc_conn_ac_send_frmr_rsp_f_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_8a = { + .ev = llc_conn_ev_rx_zzz_rsp_fbit_set_x_inval_nr, + .next_state = LLC_CONN_STATE_ERROR, + .ev_qualifiers = NONE, + .ev_actions = llc_common_actions_8a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_X_INVAL_Ns event */ +static llc_conn_action_t llc_common_actions_8b[] = { + [0] = llc_conn_ac_send_frmr_rsp_f_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_8b = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_x_inval_ns, + .next_state = LLC_CONN_STATE_ERROR, + .ev_qualifiers = NONE, + .ev_actions = llc_common_actions_8b, +}; + +/* State transitions for LLC_CONN_EV_RX_BAD_PDU event */ +static llc_conn_action_t llc_common_actions_8c[] = { + [0] = llc_conn_ac_send_frmr_rsp_f_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_8c = { + .ev = llc_conn_ev_rx_bad_pdu, + .next_state = LLC_CONN_STATE_ERROR, + .ev_qualifiers = NONE, + .ev_actions = llc_common_actions_8c, +}; + +/* State transitions for LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X event */ +static llc_conn_action_t llc_common_actions_9[] = { + [0] = llc_conn_ac_send_frmr_rsp_f_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_9 = { + .ev = llc_conn_ev_rx_ua_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_ERROR, + .ev_qualifiers = NONE, + .ev_actions = llc_common_actions_9, +}; + +/* State transitions for LLC_CONN_EV_RX_XXX_RSP_Fbit_SET_1 event */ +#if 0 +static llc_conn_ev_qfyr_t llc_common_ev_qfyrs_10[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_common_actions_10[] = { + [0] = llc_conn_ac_send_frmr_rsp_f_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = NULL, +}; + 
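Editorial sketch, not part of the original patch: every table in this file pairs an event recognizer with an optional NULL-terminated qualifier list and a NULL-terminated action list, all using the llc_c_ev.c / llc_c_ac.c convention of returning 0 for "matches / succeeded" and non-zero otherwise. The helper below, whose name sketch_try_transition is invented for illustration, shows how one llc_conn_state_trans entry could be evaluated under those assumptions; the kernel's real dispatcher lives in llc_conn.c (llc_conn_service()) and handles more detail than this sketch.

/* Editorial sketch, assuming the types declared in the LLC headers above. */
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/llc_conn.h>
#include <net/llc_c_ac.h>
#include <net/llc_c_ev.h>
#include <net/llc_c_st.h>

static int sketch_try_transition(struct sock *sk, struct sk_buff *skb,
				 const struct llc_conn_state_trans *trans)
{
	const llc_conn_ev_qfyr_t *qfyr;
	const llc_conn_action_t *action;
	int rc;

	/* A zeroed sentinel entry has a NULL .ev recognizer: nothing to do.
	 * Otherwise the recognizer returns 0 when the event matches. */
	if (!trans->ev || trans->ev(sk, skb))
		return -EINVAL;

	/* NONE (NULL) means "no qualifiers"; otherwise each must return 0. */
	for (qfyr = trans->ev_qualifiers; qfyr && *qfyr; qfyr++)
		if ((*qfyr)(sk, skb))
			return -EINVAL;

	/* Run the NULL-terminated action list, then move to the next state. */
	for (action = trans->ev_actions; action && *action; action++) {
		rc = (*action)(sk, skb);
		if (rc)
			return rc;
	}
	llc_sk(sk)->state = trans->next_state;
	return 0;
}

The zeroed llc_common_state_trans_end entry defined just below is what makes the !trans->ev test meaningful: it terminates every per-state array of transition pointers.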
+static struct llc_conn_state_trans llc_common_state_trans_10 = { + .ev = llc_conn_ev_rx_xxx_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_ERROR, + .ev_qualifiers = llc_common_ev_qfyrs_10, + .ev_actions = llc_common_actions_10, +}; +#endif + +/* State transitions for LLC_CONN_EV_P_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_common_ev_qfyrs_11a[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_common_actions_11a[] = { + [0] = llc_conn_ac_send_sabme_cmd_p_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = llc_conn_ac_set_cause_flag_0, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_11a = { + .ev = llc_conn_ev_p_tmr_exp, + .next_state = LLC_CONN_STATE_RESET, + .ev_qualifiers = llc_common_ev_qfyrs_11a, + .ev_actions = llc_common_actions_11a, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_common_ev_qfyrs_11b[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_common_actions_11b[] = { + [0] = llc_conn_ac_send_sabme_cmd_p_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = llc_conn_ac_set_cause_flag_0, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_11b = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_RESET, + .ev_qualifiers = llc_common_ev_qfyrs_11b, + .ev_actions = llc_common_actions_11b, +}; + +/* State transitions for LLC_CONN_EV_REJ_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_common_ev_qfyrs_11c[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_common_actions_11c[] = { + [0] = llc_conn_ac_send_sabme_cmd_p_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = llc_conn_ac_set_cause_flag_0, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_11c = { + .ev = llc_conn_ev_rej_tmr_exp, + .next_state = LLC_CONN_STATE_RESET, + .ev_qualifiers = llc_common_ev_qfyrs_11c, + .ev_actions = llc_common_actions_11c, +}; + +/* State transitions for LLC_CONN_EV_BUSY_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_common_ev_qfyrs_11d[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_common_actions_11d[] = { + [0] = llc_conn_ac_send_sabme_cmd_p_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = llc_conn_ac_set_cause_flag_0, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_11d = { + .ev = llc_conn_ev_busy_tmr_exp, + .next_state = LLC_CONN_STATE_RESET, + .ev_qualifiers = llc_common_ev_qfyrs_11d, + .ev_actions = llc_common_actions_11d, +}; + +/* + * Common dummy state transition; must be last entry for all state + * transition groups - it'll be on .bss, so will be zeroed. 
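+ * Being zeroed means its .ev recognizer is NULL, which is how the
+ * state-machine core can tell it has reached the end of a table.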
+ */ +static struct llc_conn_state_trans llc_common_state_trans_end; + +/* LLC_CONN_STATE_ADM transitions */ +/* State transitions for LLC_CONN_EV_CONN_REQ event */ +static llc_conn_action_t llc_adm_actions_1[] = { + [0] = llc_conn_ac_send_sabme_cmd_p_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_set_retry_cnt_0, + [3] = llc_conn_ac_set_s_flag_0, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_adm_state_trans_1 = { + .ev = llc_conn_ev_conn_req, + .next_state = LLC_CONN_STATE_SETUP, + .ev_qualifiers = NONE, + .ev_actions = llc_adm_actions_1, +}; + +/* State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event */ +static llc_conn_action_t llc_adm_actions_2[] = { + [0] = llc_conn_ac_send_ua_rsp_f_set_p, + [1] = llc_conn_ac_set_vs_0, + [2] = llc_conn_ac_set_vr_0, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = llc_conn_ac_set_p_flag_0, + [5] = llc_conn_ac_set_remote_busy_0, + [6] = llc_conn_ac_conn_ind, + [7] = NULL, +}; + +static struct llc_conn_state_trans llc_adm_state_trans_2 = { + .ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_adm_actions_2, +}; + +/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event */ +static llc_conn_action_t llc_adm_actions_3[] = { + [0] = llc_conn_ac_send_dm_rsp_f_set_p, + [1] = llc_conn_disc, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_adm_state_trans_3 = { + .ev = llc_conn_ev_rx_disc_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = NONE, + .ev_actions = llc_adm_actions_3, +}; + +/* State transitions for LLC_CONN_EV_RX_XXX_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_adm_actions_4[] = { + [0] = llc_conn_ac_send_dm_rsp_f_set_1, + [1] = llc_conn_disc, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_adm_state_trans_4 = { + .ev = llc_conn_ev_rx_xxx_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = NONE, + .ev_actions = llc_adm_actions_4, +}; + +/* State transitions for LLC_CONN_EV_RX_XXX_YYY event */ +static llc_conn_action_t llc_adm_actions_5[] = { + [0] = llc_conn_disc, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_adm_state_trans_5 = { + .ev = llc_conn_ev_rx_any_frame, + .next_state = LLC_CONN_OUT_OF_SVC, + .ev_qualifiers = NONE, + .ev_actions = llc_adm_actions_5, +}; + +/* + * Array of pointers; + * one to each transition + */ +static struct llc_conn_state_trans *llc_adm_state_transitions[] = { + [0] = &llc_adm_state_trans_1, /* Request */ + [1] = &llc_common_state_trans_end, + [2] = &llc_common_state_trans_end, /* local_busy */ + [3] = &llc_common_state_trans_end, /* init_pf_cycle */ + [4] = &llc_common_state_trans_end, /* timer */ + [5] = &llc_adm_state_trans_2, /* Receive frame */ + [6] = &llc_adm_state_trans_3, + [7] = &llc_adm_state_trans_4, + [8] = &llc_adm_state_trans_5, + [9] = &llc_common_state_trans_end, +}; + +/* LLC_CONN_STATE_SETUP transitions */ +/* State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event */ +static llc_conn_action_t llc_setup_actions_1[] = { + [0] = llc_conn_ac_send_ua_rsp_f_set_p, + [1] = llc_conn_ac_set_vs_0, + [2] = llc_conn_ac_set_vr_0, + [3] = llc_conn_ac_set_s_flag_1, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_setup_state_trans_1 = { + .ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_SETUP, + .ev_qualifiers = NONE, + .ev_actions = llc_setup_actions_1, +}; + +/* State transitions for LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X event */ +static llc_conn_ev_qfyr_t 
llc_setup_ev_qfyrs_2[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = llc_conn_ev_qlfy_set_status_conn, + [2] = NULL, +}; + +static llc_conn_action_t llc_setup_actions_2[] = { + [0] = llc_conn_ac_stop_ack_timer, + [1] = llc_conn_ac_set_vs_0, + [2] = llc_conn_ac_set_vr_0, + [3] = llc_conn_ac_upd_p_flag, + [4] = llc_conn_ac_set_remote_busy_0, + [5] = llc_conn_ac_conn_confirm, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_setup_state_trans_2 = { + .ev = llc_conn_ev_rx_ua_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_setup_ev_qfyrs_2, + .ev_actions = llc_setup_actions_2, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_setup_ev_qfyrs_3[] = { + [0] = llc_conn_ev_qlfy_s_flag_eq_1, + [1] = llc_conn_ev_qlfy_set_status_conn, + [2] = NULL, +}; + +static llc_conn_action_t llc_setup_actions_3[] = { + [0] = llc_conn_ac_set_p_flag_0, + [1] = llc_conn_ac_set_remote_busy_0, + [2] = llc_conn_ac_conn_confirm, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_setup_state_trans_3 = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_setup_ev_qfyrs_3, + .ev_actions = llc_setup_actions_3, +}; + +/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event */ +static llc_conn_ev_qfyr_t llc_setup_ev_qfyrs_4[] = { + [0] = llc_conn_ev_qlfy_set_status_disc, + [1] = NULL, +}; + +static llc_conn_action_t llc_setup_actions_4[] = { + [0] = llc_conn_ac_send_dm_rsp_f_set_p, + [1] = llc_conn_ac_stop_ack_timer, + [2] = llc_conn_ac_conn_confirm, + [3] = llc_conn_disc, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_setup_state_trans_4 = { + .ev = llc_conn_ev_rx_disc_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_setup_ev_qfyrs_4, + .ev_actions = llc_setup_actions_4, +}; + +/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event */ +static llc_conn_ev_qfyr_t llc_setup_ev_qfyrs_5[] = { + [0] = llc_conn_ev_qlfy_set_status_disc, + [1] = NULL, +}; + +static llc_conn_action_t llc_setup_actions_5[] = { + [0] = llc_conn_ac_stop_ack_timer, + [1] = llc_conn_ac_conn_confirm, + [2] = llc_conn_disc, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_setup_state_trans_5 = { + .ev = llc_conn_ev_rx_dm_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_setup_ev_qfyrs_5, + .ev_actions = llc_setup_actions_5, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_setup_ev_qfyrs_7[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [1] = llc_conn_ev_qlfy_s_flag_eq_0, + [2] = NULL, +}; + +static llc_conn_action_t llc_setup_actions_7[] = { + [0] = llc_conn_ac_send_sabme_cmd_p_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_inc_retry_cnt_by_1, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_setup_state_trans_7 = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_SETUP, + .ev_qualifiers = llc_setup_ev_qfyrs_7, + .ev_actions = llc_setup_actions_7, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_setup_ev_qfyrs_8[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2, + [1] = llc_conn_ev_qlfy_s_flag_eq_0, + [2] = llc_conn_ev_qlfy_set_status_failed, + [3] = NULL, +}; + +static llc_conn_action_t llc_setup_actions_8[] = { + [0] = llc_conn_ac_conn_confirm, + [1] = llc_conn_disc, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_setup_state_trans_8 = { + .ev = llc_conn_ev_ack_tmr_exp, + 
.next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_setup_ev_qfyrs_8, + .ev_actions = llc_setup_actions_8, +}; + +/* + * Array of pointers; + * one to each transition + */ +static struct llc_conn_state_trans *llc_setup_state_transitions[] = { + [0] = &llc_common_state_trans_end, /* Request */ + [1] = &llc_common_state_trans_end, /* local busy */ + [2] = &llc_common_state_trans_end, /* init_pf_cycle */ + [3] = &llc_setup_state_trans_3, /* Timer */ + [4] = &llc_setup_state_trans_7, + [5] = &llc_setup_state_trans_8, + [6] = &llc_common_state_trans_end, + [7] = &llc_setup_state_trans_1, /* Receive frame */ + [8] = &llc_setup_state_trans_2, + [9] = &llc_setup_state_trans_4, + [10] = &llc_setup_state_trans_5, + [11] = &llc_common_state_trans_end, +}; + +/* LLC_CONN_STATE_NORMAL transitions */ +/* State transitions for LLC_CONN_EV_DATA_REQ event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_1[] = { + [0] = llc_conn_ev_qlfy_remote_busy_eq_0, + [1] = llc_conn_ev_qlfy_p_flag_eq_0, + [2] = llc_conn_ev_qlfy_last_frame_eq_0, + [3] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_1[] = { + [0] = llc_conn_ac_send_i_as_ack, + [1] = llc_conn_ac_start_ack_tmr_if_not_running, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_1 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_1, + .ev_actions = llc_normal_actions_1, +}; + +/* State transitions for LLC_CONN_EV_DATA_REQ event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_2[] = { + [0] = llc_conn_ev_qlfy_remote_busy_eq_0, + [1] = llc_conn_ev_qlfy_p_flag_eq_0, + [2] = llc_conn_ev_qlfy_last_frame_eq_1, + [3] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_2[] = { + [0] = llc_conn_ac_send_i_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_2 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_2, + .ev_actions = llc_normal_actions_2, +}; + +/* State transitions for LLC_CONN_EV_DATA_REQ event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_2_1[] = { + [0] = llc_conn_ev_qlfy_remote_busy_eq_1, + [1] = llc_conn_ev_qlfy_set_status_remote_busy, + [2] = NULL, +}; + +/* just one member, NULL, .bss zeroes it */ +static llc_conn_action_t llc_normal_actions_2_1[1]; + +static struct llc_conn_state_trans llc_normal_state_trans_2_1 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_2_1, + .ev_actions = llc_normal_actions_2_1, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_DETECTED event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_3[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_3[] = { + [0] = llc_conn_ac_rst_sendack_flag, + [1] = llc_conn_ac_send_rnr_xxx_x_set_0, + [2] = llc_conn_ac_set_data_flag_0, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_3 = { + .ev = llc_conn_ev_local_busy_detected, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_normal_ev_qfyrs_3, + .ev_actions = llc_normal_actions_3, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_DETECTED event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_4[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_4[] = { + [0] = llc_conn_ac_rst_sendack_flag, + [1] = llc_conn_ac_send_rnr_xxx_x_set_0, + [2] = 
llc_conn_ac_set_data_flag_0, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_4 = { + .ev = llc_conn_ev_local_busy_detected, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_normal_ev_qfyrs_4, + .ev_actions = llc_normal_actions_4, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_5a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_5a[] = { + [0] = llc_conn_ac_rst_sendack_flag, + [1] = llc_conn_ac_send_rej_xxx_x_set_0, + [2] = llc_conn_ac_upd_nr_received, + [3] = llc_conn_ac_upd_p_flag, + [4] = llc_conn_ac_start_rej_timer, + [5] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_5a = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_normal_ev_qfyrs_5a, + .ev_actions = llc_normal_actions_5a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_5b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_5b[] = { + [0] = llc_conn_ac_rst_sendack_flag, + [1] = llc_conn_ac_send_rej_xxx_x_set_0, + [2] = llc_conn_ac_upd_nr_received, + [3] = llc_conn_ac_upd_p_flag, + [4] = llc_conn_ac_start_rej_timer, + [5] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_5b = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_normal_ev_qfyrs_5b, + .ev_actions = llc_normal_actions_5b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1_UNEXPD_Ns event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_5c[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_5c[] = { + [0] = llc_conn_ac_rst_sendack_flag, + [1] = llc_conn_ac_send_rej_xxx_x_set_0, + [2] = llc_conn_ac_upd_nr_received, + [3] = llc_conn_ac_upd_p_flag, + [4] = llc_conn_ac_start_rej_timer, + [5] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_5c = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_normal_ev_qfyrs_5c, + .ev_actions = llc_normal_actions_5c, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_6a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_6a[] = { + [0] = llc_conn_ac_rst_sendack_flag, + [1] = llc_conn_ac_send_rej_xxx_x_set_0, + [2] = llc_conn_ac_upd_nr_received, + [3] = llc_conn_ac_start_rej_timer, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_6a = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_normal_ev_qfyrs_6a, + .ev_actions = llc_normal_actions_6a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_6b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_6b[] = { + [0] = llc_conn_ac_rst_sendack_flag, + [1] = llc_conn_ac_send_rej_xxx_x_set_0, + [2] = llc_conn_ac_upd_nr_received, + [3] = 
llc_conn_ac_start_rej_timer, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_6b = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_normal_ev_qfyrs_6b, + .ev_actions = llc_normal_actions_6b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns event */ +static llc_conn_action_t llc_normal_actions_7[] = { + [0] = llc_conn_ac_rst_sendack_flag, + [1] = llc_conn_ac_send_rej_rsp_f_set_1, + [2] = llc_conn_ac_upd_nr_received, + [3] = llc_conn_ac_start_rej_timer, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_7 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_normal_actions_7, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_X event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_8a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_8[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_upd_p_flag, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [5] = llc_conn_ac_send_ack_if_needed, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_8a = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_8a, + .ev_actions = llc_normal_actions_8, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_8b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_8b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_8b, + .ev_actions = llc_normal_actions_8, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_9a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_9a[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_data_ind, + [3] = llc_conn_ac_send_ack_if_needed, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_9a = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_9a, + .ev_actions = llc_normal_actions_9a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_9b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_9b[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_data_ind, + [3] = llc_conn_ac_send_ack_if_needed, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_9b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_9b, + .ev_actions = llc_normal_actions_9b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_normal_actions_10[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_send_ack_rsp_f_set_1, + [2] = llc_conn_ac_rst_sendack_flag, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_data_ind, + [5] = NULL, +}; + +static struct llc_conn_state_trans 
llc_normal_state_trans_10 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_normal_actions_10, +}; + +/* State transitions for * LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_normal_actions_11a[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_11a = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_normal_actions_11a, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_normal_actions_11b[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_11b = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_normal_actions_11b, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1 event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_11c[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_11c[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_inc_tx_win_size, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_11c = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_11c, + .ev_actions = llc_normal_actions_11c, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_normal_actions_12[] = { + [0] = llc_conn_ac_send_ack_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_adjust_npta_by_rr, + [3] = llc_conn_ac_rst_sendack_flag, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_12 = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_normal_actions_12, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_normal_actions_13a[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_13a = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_normal_actions_13a, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_normal_actions_13b[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_13b = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_normal_actions_13b, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1 event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_13c[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_13c[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = 
llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_13c = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_13c, + .ev_actions = llc_normal_actions_13c, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_normal_actions_14[] = { + [0] = llc_conn_ac_send_rr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_adjust_npta_by_rnr, + [3] = llc_conn_ac_rst_sendack_flag, + [4] = llc_conn_ac_set_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_14 = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_normal_actions_14, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_15a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_15a[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_p_flag, + [3] = llc_conn_ac_dec_tx_win_size, + [4] = llc_conn_ac_resend_i_xxx_x_set_0, + [5] = llc_conn_ac_clear_remote_busy, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_15a = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_15a, + .ev_actions = llc_normal_actions_15a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_X event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_15b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_15b[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_p_flag, + [3] = llc_conn_ac_dec_tx_win_size, + [4] = llc_conn_ac_resend_i_xxx_x_set_0, + [5] = llc_conn_ac_clear_remote_busy, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_15b = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_15b, + .ev_actions = llc_normal_actions_15b, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_16a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_16a[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_dec_tx_win_size, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_16a = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_16a, + .ev_actions = llc_normal_actions_16a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_16b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_16b[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_dec_tx_win_size, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_16b = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_0, + 
.next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_16b, + .ev_actions = llc_normal_actions_16b, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_normal_actions_17[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_dec_tx_win_size, + [3] = llc_conn_ac_resend_i_rsp_f_set_1, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_17 = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_normal_actions_17, +}; + +/* State transitions for LLC_CONN_EV_INIT_P_F_CYCLE event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_18[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_18[] = { + [0] = llc_conn_ac_send_rr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_18 = { + .ev = llc_conn_ev_init_p_f_cycle, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_18, + .ev_actions = llc_normal_actions_18, +}; + +/* State transitions for LLC_CONN_EV_P_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_19[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_19[] = { + [0] = llc_conn_ac_rst_sendack_flag, + [1] = llc_conn_ac_send_rr_cmd_p_set_1, + [2] = llc_conn_ac_rst_vs, + [3] = llc_conn_ac_start_p_timer, + [4] = llc_conn_ac_inc_retry_cnt_by_1, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_19 = { + .ev = llc_conn_ev_p_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = llc_normal_ev_qfyrs_19, + .ev_actions = llc_normal_actions_19, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_20a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [2] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_20a[] = { + [0] = llc_conn_ac_rst_sendack_flag, + [1] = llc_conn_ac_send_rr_cmd_p_set_1, + [2] = llc_conn_ac_rst_vs, + [3] = llc_conn_ac_start_p_timer, + [4] = llc_conn_ac_inc_retry_cnt_by_1, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_20a = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = llc_normal_ev_qfyrs_20a, + .ev_actions = llc_normal_actions_20a, +}; + +/* State transitions for LLC_CONN_EV_BUSY_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_20b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [2] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_20b[] = { + [0] = llc_conn_ac_rst_sendack_flag, + [1] = llc_conn_ac_send_rr_cmd_p_set_1, + [2] = llc_conn_ac_rst_vs, + [3] = llc_conn_ac_start_p_timer, + [4] = llc_conn_ac_inc_retry_cnt_by_1, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_20b = { + .ev = llc_conn_ev_busy_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = llc_normal_ev_qfyrs_20b, + .ev_actions = llc_normal_actions_20b, +}; + +/* State transitions for LLC_CONN_EV_TX_BUFF_FULL event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_21[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_21[] = { + [0] = 
llc_conn_ac_send_rr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_21 = { + .ev = llc_conn_ev_tx_buffer_full, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_21, + .ev_actions = llc_normal_actions_21, +}; + +/* + * Array of pointers; + * one to each transition + */ +static struct llc_conn_state_trans *llc_normal_state_transitions[] = { + [0] = &llc_normal_state_trans_1, /* Requests */ + [1] = &llc_normal_state_trans_2, + [2] = &llc_normal_state_trans_2_1, + [3] = &llc_common_state_trans_1, + [4] = &llc_common_state_trans_2, + [5] = &llc_common_state_trans_end, + [6] = &llc_normal_state_trans_21, + [7] = &llc_normal_state_trans_3, /* Local busy */ + [8] = &llc_normal_state_trans_4, + [9] = &llc_common_state_trans_end, + [10] = &llc_normal_state_trans_18, /* Init pf cycle */ + [11] = &llc_common_state_trans_end, + [12] = &llc_common_state_trans_11a, /* Timers */ + [13] = &llc_common_state_trans_11b, + [14] = &llc_common_state_trans_11c, + [15] = &llc_common_state_trans_11d, + [16] = &llc_normal_state_trans_19, + [17] = &llc_normal_state_trans_20a, + [18] = &llc_normal_state_trans_20b, + [19] = &llc_common_state_trans_end, + [20] = &llc_normal_state_trans_8b, /* Receive frames */ + [21] = &llc_normal_state_trans_9b, + [22] = &llc_normal_state_trans_10, + [23] = &llc_normal_state_trans_11b, + [24] = &llc_normal_state_trans_11c, + [25] = &llc_normal_state_trans_5a, + [26] = &llc_normal_state_trans_5b, + [27] = &llc_normal_state_trans_5c, + [28] = &llc_normal_state_trans_6a, + [29] = &llc_normal_state_trans_6b, + [30] = &llc_normal_state_trans_7, + [31] = &llc_normal_state_trans_8a, + [32] = &llc_normal_state_trans_9a, + [33] = &llc_normal_state_trans_11a, + [34] = &llc_normal_state_trans_12, + [35] = &llc_normal_state_trans_13a, + [36] = &llc_normal_state_trans_13b, + [37] = &llc_normal_state_trans_13c, + [38] = &llc_normal_state_trans_14, + [39] = &llc_normal_state_trans_15a, + [40] = &llc_normal_state_trans_15b, + [41] = &llc_normal_state_trans_16a, + [42] = &llc_normal_state_trans_16b, + [43] = &llc_normal_state_trans_17, + [44] = &llc_common_state_trans_3, + [45] = &llc_common_state_trans_4, + [46] = &llc_common_state_trans_5, + [47] = &llc_common_state_trans_6, + [48] = &llc_common_state_trans_7a, + [49] = &llc_common_state_trans_7b, + [50] = &llc_common_state_trans_8a, + [51] = &llc_common_state_trans_8b, + [52] = &llc_common_state_trans_8c, + [53] = &llc_common_state_trans_9, + /* [54] = &llc_common_state_trans_10, */ + [54] = &llc_common_state_trans_end, +}; + +/* LLC_CONN_STATE_BUSY transitions */ +/* State transitions for LLC_CONN_EV_DATA_REQ event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_1[] = { + [0] = llc_conn_ev_qlfy_remote_busy_eq_0, + [1] = llc_conn_ev_qlfy_p_flag_eq_0, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_1[] = { + [0] = llc_conn_ac_send_i_xxx_x_set_0, + [1] = llc_conn_ac_start_ack_tmr_if_not_running, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_1 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_1, + .ev_actions = llc_busy_actions_1, +}; + +/* State transitions for LLC_CONN_EV_DATA_REQ event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_2[] = { + [0] = llc_conn_ev_qlfy_remote_busy_eq_0, + [1] = llc_conn_ev_qlfy_p_flag_eq_1, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_2[] = { + [0] = llc_conn_ac_send_i_xxx_x_set_0, + [1] = 
llc_conn_ac_start_ack_tmr_if_not_running, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_2 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_2, + .ev_actions = llc_busy_actions_2, +}; + +/* State transitions for LLC_CONN_EV_DATA_REQ event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_2_1[] = { + [0] = llc_conn_ev_qlfy_remote_busy_eq_1, + [1] = llc_conn_ev_qlfy_set_status_remote_busy, + [2] = NULL, +}; + +/* just one member, NULL, .bss zeroes it */ +static llc_conn_action_t llc_busy_actions_2_1[1]; + +static struct llc_conn_state_trans llc_busy_state_trans_2_1 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_2_1, + .ev_actions = llc_busy_actions_2_1, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_3[] = { + [0] = llc_conn_ev_qlfy_data_flag_eq_1, + [1] = llc_conn_ev_qlfy_p_flag_eq_0, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_3[] = { + [0] = llc_conn_ac_send_rej_xxx_x_set_0, + [1] = llc_conn_ac_start_rej_timer, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_3 = { + .ev = llc_conn_ev_local_busy_cleared, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_busy_ev_qfyrs_3, + .ev_actions = llc_busy_actions_3, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_4[] = { + [0] = llc_conn_ev_qlfy_data_flag_eq_1, + [1] = llc_conn_ev_qlfy_p_flag_eq_1, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_4[] = { + [0] = llc_conn_ac_send_rej_xxx_x_set_0, + [1] = llc_conn_ac_start_rej_timer, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_4 = { + .ev = llc_conn_ev_local_busy_cleared, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_busy_ev_qfyrs_4, + .ev_actions = llc_busy_actions_4, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_5[] = { + [0] = llc_conn_ev_qlfy_data_flag_eq_0, + [1] = llc_conn_ev_qlfy_p_flag_eq_0, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_5[] = { + [0] = llc_conn_ac_send_rr_xxx_x_set_0, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_5 = { + .ev = llc_conn_ev_local_busy_cleared, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_busy_ev_qfyrs_5, + .ev_actions = llc_busy_actions_5, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_6[] = { + [0] = llc_conn_ev_qlfy_data_flag_eq_0, + [1] = llc_conn_ev_qlfy_p_flag_eq_1, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_6[] = { + [0] = llc_conn_ac_send_rr_xxx_x_set_0, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_6 = { + .ev = llc_conn_ev_local_busy_cleared, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_busy_ev_qfyrs_6, + .ev_actions = llc_busy_actions_6, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_7[] = { + [0] = llc_conn_ev_qlfy_data_flag_eq_2, + [1] = llc_conn_ev_qlfy_p_flag_eq_0, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_7[] = { + [0] = llc_conn_ac_send_rr_xxx_x_set_0, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_7 = { + .ev = llc_conn_ev_local_busy_cleared, + .next_state = 
LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_busy_ev_qfyrs_7, + .ev_actions = llc_busy_actions_7, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_8[] = { + [0] = llc_conn_ev_qlfy_data_flag_eq_2, + [1] = llc_conn_ev_qlfy_p_flag_eq_1, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_8[] = { + [0] = llc_conn_ac_send_rr_xxx_x_set_0, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_8 = { + .ev = llc_conn_ev_local_busy_cleared, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_busy_ev_qfyrs_8, + .ev_actions = llc_busy_actions_8, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_X_UNEXPD_Ns event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_9a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_9a[] = { + [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_upd_p_flag, + [2] = llc_conn_ac_upd_nr_received, + [3] = llc_conn_ac_set_data_flag_1_if_data_flag_eq_0, + [4] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_9a = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_x_unexpd_ns, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_9a, + .ev_actions = llc_busy_actions_9a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_9b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_9b[] = { + [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_upd_p_flag, + [2] = llc_conn_ac_upd_nr_received, + [3] = llc_conn_ac_set_data_flag_1_if_data_flag_eq_0, + [4] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_9b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_9b, + .ev_actions = llc_busy_actions_9b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_10a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_10a[] = { + [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_data_flag_1_if_data_flag_eq_0, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_10a = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_10a, + .ev_actions = llc_busy_actions_10a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_10b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_10b[] = { + [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_data_flag_1_if_data_flag_eq_0, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_10b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_10b, + .ev_actions = llc_busy_actions_10b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns event */ +static llc_conn_action_t llc_busy_actions_11[] = { + [0] = llc_conn_ac_send_rnr_rsp_f_set_1, + [1] = 
llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_data_flag_1_if_data_flag_eq_0, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_11 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_busy_actions_11, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_busy_actions_12[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_send_rnr_rsp_f_set_1, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2, + [5] = llc_conn_ac_set_data_flag_0, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_12 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_busy_actions_12, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_X event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_13a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_13a[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_upd_p_flag, + [3] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [4] = llc_conn_ac_upd_nr_received, + [5] = llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2, + [6] = llc_conn_ac_set_data_flag_0, + [7] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [8] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_13a = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_13a, + .ev_actions = llc_busy_actions_13a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_13b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_13b[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_upd_p_flag, + [3] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [4] = llc_conn_ac_upd_nr_received, + [5] = llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2, + [6] = llc_conn_ac_set_data_flag_0, + [7] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [8] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_13b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_13b, + .ev_actions = llc_busy_actions_13b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_14a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_14a[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2, + [5] = llc_conn_ac_set_data_flag_0, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_14a = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_14a, + .ev_actions = llc_busy_actions_14a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_14b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_14b[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = 
llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2, + [5] = llc_conn_ac_set_data_flag_0, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_14b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_14b, + .ev_actions = llc_busy_actions_14b, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_busy_actions_15a[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_15a = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_busy_actions_15a, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_busy_actions_15b[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_15b = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_busy_actions_15b, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1 event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_15c[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_15c[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_15c = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_15c, + .ev_actions = llc_busy_actions_15c, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_busy_actions_16[] = { + [0] = llc_conn_ac_send_rnr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_16 = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_busy_actions_16, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_busy_actions_17a[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_17a = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_busy_actions_17a, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_busy_actions_17b[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_17b = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_busy_actions_17b, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1 event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_17c[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_17c[] = 
{ + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_17c = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_17c, + .ev_actions = llc_busy_actions_17c, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_busy_actions_18[] = { + [0] = llc_conn_ac_send_rnr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_18 = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_busy_actions_18, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_19a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_19a[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_p_flag, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_19a = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_19a, + .ev_actions = llc_busy_actions_19a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_X event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_19b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_19b[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_p_flag, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_19b = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_19b, + .ev_actions = llc_busy_actions_19b, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_20a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_20a[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_resend_i_xxx_x_set_0, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_20a = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_20a, + .ev_actions = llc_busy_actions_20a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_20b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_20b[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_resend_i_xxx_x_set_0, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_20b = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_20b, + .ev_actions = llc_busy_actions_20b, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_busy_actions_21[] = { + 
[0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_send_rnr_rsp_f_set_1, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_21 = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_busy_actions_21, +}; + +/* State transitions for LLC_CONN_EV_INIT_P_F_CYCLE event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_22[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_22[] = { + [0] = llc_conn_ac_send_rnr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_22 = { + .ev = llc_conn_ev_init_p_f_cycle, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_22, + .ev_actions = llc_busy_actions_22, +}; + +/* State transitions for LLC_CONN_EV_P_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_23[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_23[] = { + [0] = llc_conn_ac_send_rnr_cmd_p_set_1, + [1] = llc_conn_ac_rst_vs, + [2] = llc_conn_ac_start_p_timer, + [3] = llc_conn_ac_inc_retry_cnt_by_1, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_23 = { + .ev = llc_conn_ev_p_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_23, + .ev_actions = llc_busy_actions_23, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_24a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_24a[] = { + [0] = llc_conn_ac_send_rnr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = llc_conn_ac_inc_retry_cnt_by_1, + [3] = llc_conn_ac_rst_vs, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_24a = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_24a, + .ev_actions = llc_busy_actions_24a, +}; + +/* State transitions for LLC_CONN_EV_BUSY_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_24b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_24b[] = { + [0] = llc_conn_ac_send_rnr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = llc_conn_ac_inc_retry_cnt_by_1, + [3] = llc_conn_ac_rst_vs, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_24b = { + .ev = llc_conn_ev_busy_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_24b, + .ev_actions = llc_busy_actions_24b, +}; + +/* State transitions for LLC_CONN_EV_REJ_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_25[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_25[] = { + [0] = llc_conn_ac_send_rnr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = llc_conn_ac_inc_retry_cnt_by_1, + [3] = llc_conn_ac_rst_vs, + [4] = llc_conn_ac_set_data_flag_1, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_25 = { + .ev = llc_conn_ev_rej_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_25, + 
.ev_actions = llc_busy_actions_25, +}; + +/* State transitions for LLC_CONN_EV_REJ_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_26[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_26[] = { + [0] = llc_conn_ac_set_data_flag_1, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_26 = { + .ev = llc_conn_ev_rej_tmr_exp, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_26, + .ev_actions = llc_busy_actions_26, +}; + +/* + * Array of pointers; + * one to each transition + */ +static struct llc_conn_state_trans *llc_busy_state_transitions[] = { + [0] = &llc_common_state_trans_1, /* Request */ + [1] = &llc_common_state_trans_2, + [2] = &llc_busy_state_trans_1, + [3] = &llc_busy_state_trans_2, + [4] = &llc_busy_state_trans_2_1, + [5] = &llc_common_state_trans_end, + [6] = &llc_busy_state_trans_3, /* Local busy */ + [7] = &llc_busy_state_trans_4, + [8] = &llc_busy_state_trans_5, + [9] = &llc_busy_state_trans_6, + [10] = &llc_busy_state_trans_7, + [11] = &llc_busy_state_trans_8, + [12] = &llc_common_state_trans_end, + [13] = &llc_busy_state_trans_22, /* Initiate PF cycle */ + [14] = &llc_common_state_trans_end, + [15] = &llc_common_state_trans_11a, /* Timer */ + [16] = &llc_common_state_trans_11b, + [17] = &llc_common_state_trans_11c, + [18] = &llc_common_state_trans_11d, + [19] = &llc_busy_state_trans_23, + [20] = &llc_busy_state_trans_24a, + [21] = &llc_busy_state_trans_24b, + [22] = &llc_busy_state_trans_25, + [23] = &llc_busy_state_trans_26, + [24] = &llc_common_state_trans_end, + [25] = &llc_busy_state_trans_9a, /* Receive frame */ + [26] = &llc_busy_state_trans_9b, + [27] = &llc_busy_state_trans_10a, + [28] = &llc_busy_state_trans_10b, + [29] = &llc_busy_state_trans_11, + [30] = &llc_busy_state_trans_12, + [31] = &llc_busy_state_trans_13a, + [32] = &llc_busy_state_trans_13b, + [33] = &llc_busy_state_trans_14a, + [34] = &llc_busy_state_trans_14b, + [35] = &llc_busy_state_trans_15a, + [36] = &llc_busy_state_trans_15b, + [37] = &llc_busy_state_trans_15c, + [38] = &llc_busy_state_trans_16, + [39] = &llc_busy_state_trans_17a, + [40] = &llc_busy_state_trans_17b, + [41] = &llc_busy_state_trans_17c, + [42] = &llc_busy_state_trans_18, + [43] = &llc_busy_state_trans_19a, + [44] = &llc_busy_state_trans_19b, + [45] = &llc_busy_state_trans_20a, + [46] = &llc_busy_state_trans_20b, + [47] = &llc_busy_state_trans_21, + [48] = &llc_common_state_trans_3, + [49] = &llc_common_state_trans_4, + [50] = &llc_common_state_trans_5, + [51] = &llc_common_state_trans_6, + [52] = &llc_common_state_trans_7a, + [53] = &llc_common_state_trans_7b, + [54] = &llc_common_state_trans_8a, + [55] = &llc_common_state_trans_8b, + [56] = &llc_common_state_trans_8c, + [57] = &llc_common_state_trans_9, + /* [58] = &llc_common_state_trans_10, */ + [58] = &llc_common_state_trans_end, +}; + +/* LLC_CONN_STATE_REJ transitions */ +/* State transitions for LLC_CONN_EV_DATA_REQ event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_1[] = { + [0] = llc_conn_ev_qlfy_remote_busy_eq_0, + [1] = llc_conn_ev_qlfy_p_flag_eq_0, + [2] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_1[] = { + [0] = llc_conn_ac_send_i_xxx_x_set_0, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_1 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_1, + .ev_actions = llc_reject_actions_1, +}; + +/* State 
transitions for LLC_CONN_EV_DATA_REQ event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_2[] = { + [0] = llc_conn_ev_qlfy_remote_busy_eq_0, + [1] = llc_conn_ev_qlfy_p_flag_eq_1, + [2] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_2[] = { + [0] = llc_conn_ac_send_i_xxx_x_set_0, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_2 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_2, + .ev_actions = llc_reject_actions_2, +}; + +/* State transitions for LLC_CONN_EV_DATA_REQ event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_2_1[] = { + [0] = llc_conn_ev_qlfy_remote_busy_eq_1, + [1] = llc_conn_ev_qlfy_set_status_remote_busy, + [2] = NULL, +}; + +/* just one member, NULL, .bss zeroes it */ +static llc_conn_action_t llc_reject_actions_2_1[1]; + +static struct llc_conn_state_trans llc_reject_state_trans_2_1 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_2_1, + .ev_actions = llc_reject_actions_2_1, +}; + + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_DETECTED event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_3[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_3[] = { + [0] = llc_conn_ac_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_set_data_flag_2, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_3 = { + .ev = llc_conn_ev_local_busy_detected, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_reject_ev_qfyrs_3, + .ev_actions = llc_reject_actions_3, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_DETECTED event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_4[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_4[] = { + [0] = llc_conn_ac_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_set_data_flag_2, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_4 = { + .ev = llc_conn_ev_local_busy_detected, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_reject_ev_qfyrs_4, + .ev_actions = llc_reject_actions_4, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */ +static llc_conn_action_t llc_reject_actions_5a[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_p_flag, + [2] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_5a = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_reject_actions_5a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns event */ +static llc_conn_action_t llc_reject_actions_5b[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_p_flag, + [2] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_5b = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_reject_actions_5b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1_UNEXPD_Ns event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_5c[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_5c[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_p_flag, + [2] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + 
[3] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_5c = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_5c, + .ev_actions = llc_reject_actions_5c, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns event */ +static llc_conn_action_t llc_reject_actions_6[] = { + [0] = llc_conn_ac_send_rr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_6 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_reject_actions_6, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_X event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_7a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_7a[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_upd_p_flag, + [3] = llc_conn_ac_send_ack_xxx_x_set_0, + [4] = llc_conn_ac_upd_nr_received, + [5] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [6] = llc_conn_ac_stop_rej_timer, + [7] = NULL, + +}; + +static struct llc_conn_state_trans llc_reject_state_trans_7a = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_reject_ev_qfyrs_7a, + .ev_actions = llc_reject_actions_7a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_7b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_7b[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_upd_p_flag, + [3] = llc_conn_ac_send_ack_xxx_x_set_0, + [4] = llc_conn_ac_upd_nr_received, + [5] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [6] = llc_conn_ac_stop_rej_timer, + [7] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_7b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_reject_ev_qfyrs_7b, + .ev_actions = llc_reject_actions_7b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_8a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_8a[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_send_ack_xxx_x_set_0, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_stop_rej_timer, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_8a = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_reject_ev_qfyrs_8a, + .ev_actions = llc_reject_actions_8a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_8b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_8b[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_send_ack_xxx_x_set_0, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_stop_rej_timer, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_8b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_reject_ev_qfyrs_8b, + .ev_actions = llc_reject_actions_8b, +}; + +/* State 
transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_reject_actions_9[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_send_ack_rsp_f_set_1, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_stop_rej_timer, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_9 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_reject_actions_9, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_reject_actions_10a[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_10a = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_reject_actions_10a, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_reject_actions_10b[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_10b = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_reject_actions_10b, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1 event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_10c[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_10c[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_10c = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_10c, + .ev_actions = llc_reject_actions_10c, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_reject_actions_11[] = { + [0] = llc_conn_ac_send_ack_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_11 = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_reject_actions_11, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_reject_actions_12a[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_12a = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_reject_actions_12a, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_reject_actions_12b[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_12b = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_reject_actions_12b, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1 event */ +static 
llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_12c[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_12c[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_12c = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_12c, + .ev_actions = llc_reject_actions_12c, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_reject_actions_13[] = { + [0] = llc_conn_ac_send_rr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_13 = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_reject_actions_13, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_14a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_14a[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_p_flag, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_14a = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_14a, + .ev_actions = llc_reject_actions_14a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_X event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_14b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_14b[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_p_flag, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_14b = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_14b, + .ev_actions = llc_reject_actions_14b, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_15a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_15a[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_resend_i_xxx_x_set_0, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_15a = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_15a, + .ev_actions = llc_reject_actions_15a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_15b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_15b[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_resend_i_xxx_x_set_0, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_15b = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_REJ, + 
.ev_qualifiers = llc_reject_ev_qfyrs_15b, + .ev_actions = llc_reject_actions_15b, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_reject_actions_16[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_resend_i_rsp_f_set_1, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_16 = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_reject_actions_16, +}; + +/* State transitions for LLC_CONN_EV_INIT_P_F_CYCLE event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_17[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_17[] = { + [0] = llc_conn_ac_send_rr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_17 = { + .ev = llc_conn_ev_init_p_f_cycle, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_17, + .ev_actions = llc_reject_actions_17, +}; + +/* State transitions for LLC_CONN_EV_REJ_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_18[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [2] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_18[] = { + [0] = llc_conn_ac_send_rej_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = llc_conn_ac_start_rej_timer, + [3] = llc_conn_ac_inc_retry_cnt_by_1, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_18 = { + .ev = llc_conn_ev_rej_tmr_exp, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_18, + .ev_actions = llc_reject_actions_18, +}; + +/* State transitions for LLC_CONN_EV_P_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_19[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_19[] = { + [0] = llc_conn_ac_send_rr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = llc_conn_ac_start_rej_timer, + [3] = llc_conn_ac_inc_retry_cnt_by_1, + [4] = llc_conn_ac_rst_vs, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_19 = { + .ev = llc_conn_ev_p_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_19, + .ev_actions = llc_reject_actions_19, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_20a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [2] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_20a[] = { + [0] = llc_conn_ac_send_rr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = llc_conn_ac_start_rej_timer, + [3] = llc_conn_ac_inc_retry_cnt_by_1, + [4] = llc_conn_ac_rst_vs, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_20a = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_20a, + .ev_actions = llc_reject_actions_20a, +}; + +/* State transitions for LLC_CONN_EV_BUSY_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_20b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [2] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_20b[] = { + [0] = llc_conn_ac_send_rr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = 
llc_conn_ac_start_rej_timer, + [3] = llc_conn_ac_inc_retry_cnt_by_1, + [4] = llc_conn_ac_rst_vs, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_20b = { + .ev = llc_conn_ev_busy_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_20b, + .ev_actions = llc_reject_actions_20b, +}; + +/* + * Array of pointers; + * one to each transition + */ +static struct llc_conn_state_trans *llc_reject_state_transitions[] = { + [0] = &llc_common_state_trans_1, /* Request */ + [1] = &llc_common_state_trans_2, + [2] = &llc_common_state_trans_end, + [3] = &llc_reject_state_trans_1, + [4] = &llc_reject_state_trans_2, + [5] = &llc_reject_state_trans_2_1, + [6] = &llc_reject_state_trans_3, /* Local busy */ + [7] = &llc_reject_state_trans_4, + [8] = &llc_common_state_trans_end, + [9] = &llc_reject_state_trans_17, /* Initiate PF cycle */ + [10] = &llc_common_state_trans_end, + [11] = &llc_common_state_trans_11a, /* Timer */ + [12] = &llc_common_state_trans_11b, + [13] = &llc_common_state_trans_11c, + [14] = &llc_common_state_trans_11d, + [15] = &llc_reject_state_trans_18, + [16] = &llc_reject_state_trans_19, + [17] = &llc_reject_state_trans_20a, + [18] = &llc_reject_state_trans_20b, + [19] = &llc_common_state_trans_end, + [20] = &llc_common_state_trans_3, /* Receive frame */ + [21] = &llc_common_state_trans_4, + [22] = &llc_common_state_trans_5, + [23] = &llc_common_state_trans_6, + [24] = &llc_common_state_trans_7a, + [25] = &llc_common_state_trans_7b, + [26] = &llc_common_state_trans_8a, + [27] = &llc_common_state_trans_8b, + [28] = &llc_common_state_trans_8c, + [29] = &llc_common_state_trans_9, + /* [30] = &llc_common_state_trans_10, */ + [30] = &llc_reject_state_trans_5a, + [31] = &llc_reject_state_trans_5b, + [32] = &llc_reject_state_trans_5c, + [33] = &llc_reject_state_trans_6, + [34] = &llc_reject_state_trans_7a, + [35] = &llc_reject_state_trans_7b, + [36] = &llc_reject_state_trans_8a, + [37] = &llc_reject_state_trans_8b, + [38] = &llc_reject_state_trans_9, + [39] = &llc_reject_state_trans_10a, + [40] = &llc_reject_state_trans_10b, + [41] = &llc_reject_state_trans_10c, + [42] = &llc_reject_state_trans_11, + [43] = &llc_reject_state_trans_12a, + [44] = &llc_reject_state_trans_12b, + [45] = &llc_reject_state_trans_12c, + [46] = &llc_reject_state_trans_13, + [47] = &llc_reject_state_trans_14a, + [48] = &llc_reject_state_trans_14b, + [49] = &llc_reject_state_trans_15a, + [50] = &llc_reject_state_trans_15b, + [51] = &llc_reject_state_trans_16, + [52] = &llc_common_state_trans_end, +}; + +/* LLC_CONN_STATE_AWAIT transitions */ +/* State transitions for LLC_CONN_EV_DATA_REQ event */ +static llc_conn_ev_qfyr_t llc_await_ev_qfyrs_1_0[] = { + [0] = llc_conn_ev_qlfy_set_status_refuse, + [1] = NULL, +}; + +/* just one member, NULL, .bss zeroes it */ +static llc_conn_action_t llc_await_actions_1_0[1]; + +static struct llc_conn_state_trans llc_await_state_trans_1_0 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = llc_await_ev_qfyrs_1_0, + .ev_actions = llc_await_actions_1_0, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_DETECTED event */ +static llc_conn_action_t llc_await_actions_1[] = { + [0] = llc_conn_ac_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_set_data_flag_0, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_1 = { + .ev = llc_conn_ev_local_busy_detected, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_1, +}; + +/* 
State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_actions_2[] = { + [0] = llc_conn_ac_send_rej_xxx_x_set_0, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_stop_p_timer, + [4] = llc_conn_ac_resend_i_xxx_x_set_0, + [5] = llc_conn_ac_start_rej_timer, + [6] = llc_conn_ac_clear_remote_busy, + [7] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_2 = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_2, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_actions_3a[] = { + [0] = llc_conn_ac_send_rej_xxx_x_set_0, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_start_rej_timer, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_3a = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_3a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_actions_3b[] = { + [0] = llc_conn_ac_send_rej_xxx_x_set_0, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_start_rej_timer, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_3b = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_3b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_actions_4[] = { + [0] = llc_conn_ac_send_rej_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_start_rej_timer, + [4] = llc_conn_ac_start_p_timer, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_4 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_4, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_actions_5[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_upd_vs, + [5] = llc_conn_ac_resend_i_xxx_x_set_0_or_send_rr, + [6] = llc_conn_ac_clear_remote_busy, + [7] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_5 = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_5, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_await_actions_6a[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_send_rr_xxx_x_set_0, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_upd_vs, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_6a = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_6a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_actions_6b[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = 
llc_conn_ac_send_rr_xxx_x_set_0, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_upd_vs, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_6b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_6b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_actions_7[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_send_rr_rsp_f_set_1, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_upd_vs, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_7 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_7, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_actions_8a[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_8a = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_8a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_actions_8b[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_8b = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_8b, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_actions_9a[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_9a = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_9a, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_await_actions_9b[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_9b = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_9b, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_actions_9c[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_9c = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_9c, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_await_actions_9d[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + 
+static struct llc_conn_state_trans llc_await_state_trans_9d = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_9d, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_actions_10a[] = { + [0] = llc_conn_ac_send_rr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_10a = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_10a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_actions_10b[] = { + [0] = llc_conn_ac_send_rr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_10b = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_10b, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_actions_11[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_set_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_11 = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_11, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_actions_12a[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_12a = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_12a, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_await_actions_12b[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_12b = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_12b, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_actions_13[] = { + [0] = llc_conn_ac_send_rr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_set_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_13 = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_13, +}; + +/* State transitions for LLC_CONN_EV_P_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_await_ev_qfyrs_14[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_await_actions_14[] = { + [0] = llc_conn_ac_send_rr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = llc_conn_ac_inc_retry_cnt_by_1, + [3] = NULL, +}; + +static struct llc_conn_state_trans 
llc_await_state_trans_14 = { + .ev = llc_conn_ev_p_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = llc_await_ev_qfyrs_14, + .ev_actions = llc_await_actions_14, +}; + +/* + * Array of pointers; + * one to each transition + */ +static struct llc_conn_state_trans *llc_await_state_transitions[] = { + [0] = &llc_common_state_trans_1, /* Request */ + [1] = &llc_common_state_trans_2, + [2] = &llc_await_state_trans_1_0, + [3] = &llc_common_state_trans_end, + [4] = &llc_await_state_trans_1, /* Local busy */ + [5] = &llc_common_state_trans_end, + [6] = &llc_common_state_trans_end, /* Initiate PF Cycle */ + [7] = &llc_common_state_trans_11a, /* Timer */ + [8] = &llc_common_state_trans_11b, + [9] = &llc_common_state_trans_11c, + [10] = &llc_common_state_trans_11d, + [11] = &llc_await_state_trans_14, + [12] = &llc_common_state_trans_end, + [13] = &llc_common_state_trans_3, /* Receive frame */ + [14] = &llc_common_state_trans_4, + [15] = &llc_common_state_trans_5, + [16] = &llc_common_state_trans_6, + [17] = &llc_common_state_trans_7a, + [18] = &llc_common_state_trans_7b, + [19] = &llc_common_state_trans_8a, + [20] = &llc_common_state_trans_8b, + [21] = &llc_common_state_trans_8c, + [22] = &llc_common_state_trans_9, + /* [23] = &llc_common_state_trans_10, */ + [23] = &llc_await_state_trans_2, + [24] = &llc_await_state_trans_3a, + [25] = &llc_await_state_trans_3b, + [26] = &llc_await_state_trans_4, + [27] = &llc_await_state_trans_5, + [28] = &llc_await_state_trans_6a, + [29] = &llc_await_state_trans_6b, + [30] = &llc_await_state_trans_7, + [31] = &llc_await_state_trans_8a, + [32] = &llc_await_state_trans_8b, + [33] = &llc_await_state_trans_9a, + [34] = &llc_await_state_trans_9b, + [35] = &llc_await_state_trans_9c, + [36] = &llc_await_state_trans_9d, + [37] = &llc_await_state_trans_10a, + [38] = &llc_await_state_trans_10b, + [39] = &llc_await_state_trans_11, + [40] = &llc_await_state_trans_12a, + [41] = &llc_await_state_trans_12b, + [42] = &llc_await_state_trans_13, + [43] = &llc_common_state_trans_end, +}; + +/* LLC_CONN_STATE_AWAIT_BUSY transitions */ +/* State transitions for LLC_CONN_EV_DATA_CONN_REQ event */ +static llc_conn_ev_qfyr_t llc_await_busy_ev_qfyrs_1_0[] = { + [0] = llc_conn_ev_qlfy_set_status_refuse, + [1] = NULL, +}; + +/* just one member, NULL, .bss zeroes it */ +static llc_conn_action_t llc_await_busy_actions_1_0[1]; + +static struct llc_conn_state_trans llc_await_busy_state_trans_1_0 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = llc_await_busy_ev_qfyrs_1_0, + .ev_actions = llc_await_busy_actions_1_0, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */ +static llc_conn_ev_qfyr_t llc_await_busy_ev_qfyrs_1[] = { + [0] = llc_conn_ev_qlfy_data_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_await_busy_actions_1[] = { + [0] = llc_conn_ac_send_rej_xxx_x_set_0, + [1] = llc_conn_ac_start_rej_timer, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_1 = { + .ev = llc_conn_ev_local_busy_cleared, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = llc_await_busy_ev_qfyrs_1, + .ev_actions = llc_await_busy_actions_1, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */ +static llc_conn_ev_qfyr_t llc_await_busy_ev_qfyrs_2[] = { + [0] = llc_conn_ev_qlfy_data_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_await_busy_actions_2[] = { + [0] = llc_conn_ac_send_rr_xxx_x_set_0, + [1] = NULL, +}; + +static struct llc_conn_state_trans 
llc_await_busy_state_trans_2 = { + .ev = llc_conn_ev_local_busy_cleared, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = llc_await_busy_ev_qfyrs_2, + .ev_actions = llc_await_busy_actions_2, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */ +static llc_conn_ev_qfyr_t llc_await_busy_ev_qfyrs_3[] = { + [0] = llc_conn_ev_qlfy_data_flag_eq_2, + [1] = NULL, +}; + +static llc_conn_action_t llc_await_busy_actions_3[] = { + [0] = llc_conn_ac_send_rr_xxx_x_set_0, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_3 = { + .ev = llc_conn_ev_local_busy_cleared, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = llc_await_busy_ev_qfyrs_3, + .ev_actions = llc_await_busy_actions_3, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_busy_actions_4[] = { + [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_stop_p_timer, + [4] = llc_conn_ac_set_data_flag_1, + [5] = llc_conn_ac_clear_remote_busy, + [6] = llc_conn_ac_resend_i_xxx_x_set_0, + [7] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_4 = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_4, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_busy_actions_5a[] = { + [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_set_data_flag_1, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_5a = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_5a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_busy_actions_5b[] = { + [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_set_data_flag_1, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_5b = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_5b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_busy_actions_6[] = { + [0] = llc_conn_ac_send_rnr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_set_data_flag_1, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_6 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_6, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_busy_actions_7[] = { + [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_inc_vr_by_1, + [2] = llc_conn_ac_data_ind, + [3] = llc_conn_ac_stop_p_timer, + [4] = llc_conn_ac_upd_nr_received, + [5] = llc_conn_ac_upd_vs, + [6] = llc_conn_ac_set_data_flag_0, + [7] = llc_conn_ac_clear_remote_busy, + [8] = llc_conn_ac_resend_i_xxx_x_set_0, + [9] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_7 = 
{ + .ev = llc_conn_ev_rx_i_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_7, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_await_busy_actions_8a[] = { + [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_inc_vr_by_1, + [2] = llc_conn_ac_data_ind, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_upd_vs, + [5] = llc_conn_ac_set_data_flag_0, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_8a = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_8a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_busy_actions_8b[] = { + [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_inc_vr_by_1, + [2] = llc_conn_ac_data_ind, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_upd_vs, + [5] = llc_conn_ac_set_data_flag_0, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_8b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_8b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_busy_actions_9[] = { + [0] = llc_conn_ac_send_rnr_rsp_f_set_1, + [1] = llc_conn_ac_inc_vr_by_1, + [2] = llc_conn_ac_data_ind, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_upd_vs, + [5] = llc_conn_ac_set_data_flag_0, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_9 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_9, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_busy_actions_10a[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_10a = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_10a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_busy_actions_10b[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_10b = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_10b, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_busy_actions_11a[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_11a = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_11a, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0 event */ 
+static llc_conn_action_t llc_await_busy_actions_11b[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_11b = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_11b, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_busy_actions_11c[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_11c = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_11c, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_await_busy_actions_11d[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_11d = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_11d, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_busy_actions_12a[] = { + [0] = llc_conn_ac_send_rnr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_12a = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_12a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_busy_actions_12b[] = { + [0] = llc_conn_ac_send_rnr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_12b = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_12b, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_busy_actions_13[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_set_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_13 = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_13, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_busy_actions_14a[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_14a = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_14a, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_await_busy_actions_14b[] = { + [0] 
= llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_14b = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_14b, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_busy_actions_15[] = { + [0] = llc_conn_ac_send_rnr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_set_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_15 = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_15, +}; + +/* State transitions for LLC_CONN_EV_P_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_await_busy_ev_qfyrs_16[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_await_busy_actions_16[] = { + [0] = llc_conn_ac_send_rnr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = llc_conn_ac_inc_retry_cnt_by_1, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_16 = { + .ev = llc_conn_ev_p_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = llc_await_busy_ev_qfyrs_16, + .ev_actions = llc_await_busy_actions_16, +}; + +/* + * Array of pointers; + * one to each transition + */ +static struct llc_conn_state_trans *llc_await_busy_state_transitions[] = { + [0] = &llc_common_state_trans_1, /* Request */ + [1] = &llc_common_state_trans_2, + [2] = &llc_await_busy_state_trans_1_0, + [3] = &llc_common_state_trans_end, + [4] = &llc_await_busy_state_trans_1, /* Local busy */ + [5] = &llc_await_busy_state_trans_2, + [6] = &llc_await_busy_state_trans_3, + [7] = &llc_common_state_trans_end, + [8] = &llc_common_state_trans_end, /* Initiate PF cycle */ + [9] = &llc_common_state_trans_11a, /* Timer */ + [10] = &llc_common_state_trans_11b, + [11] = &llc_common_state_trans_11c, + [12] = &llc_common_state_trans_11d, + [13] = &llc_await_busy_state_trans_16, + [14] = &llc_common_state_trans_end, + [15] = &llc_await_busy_state_trans_4, /* Receive frame */ + [16] = &llc_await_busy_state_trans_5a, + [17] = &llc_await_busy_state_trans_5b, + [18] = &llc_await_busy_state_trans_6, + [19] = &llc_await_busy_state_trans_7, + [20] = &llc_await_busy_state_trans_8a, + [21] = &llc_await_busy_state_trans_8b, + [22] = &llc_await_busy_state_trans_9, + [23] = &llc_await_busy_state_trans_10a, + [24] = &llc_await_busy_state_trans_10b, + [25] = &llc_await_busy_state_trans_11a, + [26] = &llc_await_busy_state_trans_11b, + [27] = &llc_await_busy_state_trans_11c, + [28] = &llc_await_busy_state_trans_11d, + [29] = &llc_await_busy_state_trans_12a, + [30] = &llc_await_busy_state_trans_12b, + [31] = &llc_await_busy_state_trans_13, + [32] = &llc_await_busy_state_trans_14a, + [33] = &llc_await_busy_state_trans_14b, + [34] = &llc_await_busy_state_trans_15, + [35] = &llc_common_state_trans_3, + [36] = &llc_common_state_trans_4, + [37] = &llc_common_state_trans_5, + [38] = &llc_common_state_trans_6, + [39] = &llc_common_state_trans_7a, + [40] = &llc_common_state_trans_7b, + [41] = &llc_common_state_trans_8a, + [42] = &llc_common_state_trans_8b, + [43] = &llc_common_state_trans_8c, + [44] = &llc_common_state_trans_9, + /* [45] = &llc_common_state_trans_10, */ + [45] = &llc_common_state_trans_end, +}; + 
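Every state in llc_c_st.c is driven the same way: a NULL-terminated array of struct llc_conn_state_trans pointers, where each row carries an event-match function (.ev), an optional NULL-terminated qualifier list, a NULL-terminated action list, and the state to enter if the row fires. The real walker is llc_conn_service()/llc_qualify_conn_ev() in llc_conn.c further down in this patch; purely as an illustration (not part of the patch, and every demo_* name below is a hypothetical stand-in for the kernel's llc_conn_* types), a minimal model of how one such row is consumed looks like this:

/*
 * Editorial sketch only: simplified model of the table-driven state machine
 * pattern used by the transition arrays above.
 */
#include <stddef.h>

typedef int (*demo_pred_t)(void *ctx);    /* event match / qualifier; 0 means "passes" */
typedef int (*demo_action_t)(void *ctx);  /* action; 0 means success */

struct demo_trans {
	demo_pred_t	ev;		/* which event this row handles */
	int		next_state;	/* state entered if the row fires */
	demo_pred_t	*qualifiers;	/* NULL-terminated; all must pass */
	demo_action_t	*actions;	/* NULL-terminated; run in order */
};

/* Try one row: match the event, AND together the qualifiers, then run the actions. */
int demo_try_trans(const struct demo_trans *t, void *ctx, int *state)
{
	demo_pred_t *q;
	demo_action_t *a;

	if (t->ev(ctx))
		return -1;		/* not this event */
	for (q = t->qualifiers; q && *q; q++)
		if ((*q)(ctx))
			return -1;	/* a qualifier rejected it */
	for (a = t->actions; a && *a; a++)
		if ((*a)(ctx))
			return 1;	/* an action failed */
	*state = t->next_state;
	return 0;
}

Rows such as llc_await_busy_state_trans_11b keep .next_state equal to the state that owns them, so a successful match simply re-arms the same state, and the &llc_common_state_trans_end entries appear to act as the per-event-category terminators that llc_build_offset_table() (later in this patch) counts up to.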
+/* ----------------- LLC_CONN_STATE_AWAIT_REJ transitions --------------- */ +/* State transitions for LLC_CONN_EV_DATA_CONN_REQ event */ +static llc_conn_ev_qfyr_t llc_await_reject_ev_qfyrs_1_0[] = { + [0] = llc_conn_ev_qlfy_set_status_refuse, + [1] = NULL, +}; + +/* just one member, NULL, .bss zeroes it */ +static llc_conn_action_t llc_await_reject_actions_1_0[1]; + +static struct llc_conn_state_trans llc_await_reject_state_trans_1_0 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = llc_await_reject_ev_qfyrs_1_0, + .ev_actions = llc_await_reject_actions_1_0, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_DETECTED event */ +static llc_conn_action_t llc_await_rejct_actions_1[] = { + [0] = llc_conn_ac_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_set_data_flag_2, + [2] = NULL +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_1 = { + .ev = llc_conn_ev_local_busy_detected, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_1, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_rejct_actions_2a[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = NULL +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_2a = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_2a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_rejct_actions_2b[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = NULL +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_2b = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_2b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_rejct_actions_3[] = { + [0] = llc_conn_ac_send_rr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = NULL +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_3 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_3, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_rejct_actions_4[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_stop_rej_timer, + [4] = llc_conn_ac_upd_nr_received, + [5] = llc_conn_ac_upd_vs, + [6] = llc_conn_ac_resend_i_xxx_x_set_0_or_send_rr, + [7] = llc_conn_ac_clear_remote_busy, + [8] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_4 = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_4, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_await_rejct_actions_5a[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_send_rr_xxx_x_set_0, + [3] = llc_conn_ac_stop_rej_timer, + [4] = llc_conn_ac_upd_nr_received, + [5] = llc_conn_ac_upd_vs, + [6] = NULL, +}; + +static struct llc_conn_state_trans 
llc_await_rejct_state_trans_5a = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_5a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_rejct_actions_5b[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_send_rr_xxx_x_set_0, + [3] = llc_conn_ac_stop_rej_timer, + [4] = llc_conn_ac_upd_nr_received, + [5] = llc_conn_ac_upd_vs, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_5b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_5b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_rejct_actions_6[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_send_rr_rsp_f_set_1, + [3] = llc_conn_ac_stop_rej_timer, + [4] = llc_conn_ac_upd_nr_received, + [5] = llc_conn_ac_upd_vs, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_6 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_6, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_rejct_actions_7a[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_7a = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_7a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_rejct_actions_7b[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_7b = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_7b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_rejct_actions_7c[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_7c = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_7c, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_rejct_actions_8a[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_8a = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_8a, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0 
event */ +static llc_conn_action_t llc_await_rejct_actions_8b[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_8b = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_8b, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_rejct_actions_8c[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_8c = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_8c, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_await_rejct_actions_8d[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_8d = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_8d, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_rejct_actions_9a[] = { + [0] = llc_conn_ac_send_rr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_9a = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_9a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_rejct_actions_9b[] = { + [0] = llc_conn_ac_send_rr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_9b = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_9b, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_rejct_actions_10[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_set_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_10 = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_10, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_rejct_actions_11a[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_11a = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_11a, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_await_rejct_actions_11b[] = 
{ + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_11b = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_11b, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_rejct_actions_12[] = { + [0] = llc_conn_ac_send_rr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_set_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_12 = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_12, +}; + +/* State transitions for LLC_CONN_EV_P_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_await_rejct_ev_qfyrs_13[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_await_rejct_actions_13[] = { + [0] = llc_conn_ac_send_rej_cmd_p_set_1, + [1] = llc_conn_ac_stop_p_timer, + [2] = llc_conn_ac_inc_retry_cnt_by_1, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_13 = { + .ev = llc_conn_ev_p_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = llc_await_rejct_ev_qfyrs_13, + .ev_actions = llc_await_rejct_actions_13, +}; + +/* + * Array of pointers; + * one to each transition + */ +static struct llc_conn_state_trans *llc_await_rejct_state_transitions[] = { + [0] = &llc_await_reject_state_trans_1_0, + [1] = &llc_common_state_trans_1, /* requests */ + [2] = &llc_common_state_trans_2, + [3] = &llc_common_state_trans_end, + [4] = &llc_await_rejct_state_trans_1, /* local busy */ + [5] = &llc_common_state_trans_end, + [6] = &llc_common_state_trans_end, /* Initiate PF cycle */ + [7] = &llc_await_rejct_state_trans_13, /* timers */ + [8] = &llc_common_state_trans_11a, + [9] = &llc_common_state_trans_11b, + [10] = &llc_common_state_trans_11c, + [11] = &llc_common_state_trans_11d, + [12] = &llc_common_state_trans_end, + [13] = &llc_await_rejct_state_trans_2a, /* receive frames */ + [14] = &llc_await_rejct_state_trans_2b, + [15] = &llc_await_rejct_state_trans_3, + [16] = &llc_await_rejct_state_trans_4, + [17] = &llc_await_rejct_state_trans_5a, + [18] = &llc_await_rejct_state_trans_5b, + [19] = &llc_await_rejct_state_trans_6, + [20] = &llc_await_rejct_state_trans_7a, + [21] = &llc_await_rejct_state_trans_7b, + [22] = &llc_await_rejct_state_trans_7c, + [23] = &llc_await_rejct_state_trans_8a, + [24] = &llc_await_rejct_state_trans_8b, + [25] = &llc_await_rejct_state_trans_8c, + [26] = &llc_await_rejct_state_trans_8d, + [27] = &llc_await_rejct_state_trans_9a, + [28] = &llc_await_rejct_state_trans_9b, + [29] = &llc_await_rejct_state_trans_10, + [30] = &llc_await_rejct_state_trans_11a, + [31] = &llc_await_rejct_state_trans_11b, + [32] = &llc_await_rejct_state_trans_12, + [33] = &llc_common_state_trans_3, + [34] = &llc_common_state_trans_4, + [35] = &llc_common_state_trans_5, + [36] = &llc_common_state_trans_6, + [37] = &llc_common_state_trans_7a, + [38] = &llc_common_state_trans_7b, + [39] = &llc_common_state_trans_8a, + [40] = &llc_common_state_trans_8b, + [41] = &llc_common_state_trans_8c, + [42] = &llc_common_state_trans_9, + /* [43] = &llc_common_state_trans_10, */ + [43] = &llc_common_state_trans_end, +}; + +/* LLC_CONN_STATE_D_CONN transitions */ +/* 
State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event, + * cause_flag = 1 */ +static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_1[] = { + [0] = llc_conn_ev_qlfy_cause_flag_eq_1, + [1] = llc_conn_ev_qlfy_set_status_conflict, + [2] = NULL, +}; + +static llc_conn_action_t llc_d_conn_actions_1[] = { + [0] = llc_conn_ac_send_dm_rsp_f_set_p, + [1] = llc_conn_ac_stop_ack_timer, + [2] = llc_conn_ac_disc_confirm, + [3] = llc_conn_disc, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_d_conn_state_trans_1 = { + .ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_d_conn_ev_qfyrs_1, + .ev_actions = llc_d_conn_actions_1, +}; + +/* State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event, + * cause_flag = 0 + */ +static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_1_1[] = { + [0] = llc_conn_ev_qlfy_cause_flag_eq_0, + [1] = llc_conn_ev_qlfy_set_status_conflict, + [2] = NULL, +}; + +static llc_conn_action_t llc_d_conn_actions_1_1[] = { + [0] = llc_conn_ac_send_dm_rsp_f_set_p, + [1] = llc_conn_ac_stop_ack_timer, + [2] = llc_conn_disc, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_d_conn_state_trans_1_1 = { + .ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_d_conn_ev_qfyrs_1_1, + .ev_actions = llc_d_conn_actions_1_1, +}; + +/* State transitions for LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X event, + * cause_flag = 1 + */ +static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_2[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = llc_conn_ev_qlfy_cause_flag_eq_1, + [2] = llc_conn_ev_qlfy_set_status_disc, + [3] = NULL, +}; + +static llc_conn_action_t llc_d_conn_actions_2[] = { + [0] = llc_conn_ac_stop_ack_timer, + [1] = llc_conn_ac_disc_confirm, + [2] = llc_conn_disc, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_d_conn_state_trans_2 = { + .ev = llc_conn_ev_rx_ua_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_d_conn_ev_qfyrs_2, + .ev_actions = llc_d_conn_actions_2, +}; + +/* State transitions for LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X event, + * cause_flag = 0 + */ +static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_2_1[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = llc_conn_ev_qlfy_cause_flag_eq_0, + [2] = llc_conn_ev_qlfy_set_status_disc, + [3] = NULL, +}; + +static llc_conn_action_t llc_d_conn_actions_2_1[] = { + [0] = llc_conn_ac_stop_ack_timer, + [1] = llc_conn_disc, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_d_conn_state_trans_2_1 = { + .ev = llc_conn_ev_rx_ua_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_d_conn_ev_qfyrs_2_1, + .ev_actions = llc_d_conn_actions_2_1, +}; + +/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event */ +static llc_conn_action_t llc_d_conn_actions_3[] = { + [0] = llc_conn_ac_send_ua_rsp_f_set_p, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_d_conn_state_trans_3 = { + .ev = llc_conn_ev_rx_disc_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_D_CONN, + .ev_qualifiers = NONE, + .ev_actions = llc_d_conn_actions_3, +}; + +/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event, + * cause_flag = 1 + */ +static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_4[] = { + [0] = llc_conn_ev_qlfy_cause_flag_eq_1, + [1] = llc_conn_ev_qlfy_set_status_disc, + [2] = NULL, +}; + +static llc_conn_action_t llc_d_conn_actions_4[] = { + [0] = llc_conn_ac_stop_ack_timer, + [1] = llc_conn_ac_disc_confirm, + [2] = llc_conn_disc, + [3] = NULL, +}; + +static struct llc_conn_state_trans 
llc_d_conn_state_trans_4 = { + .ev = llc_conn_ev_rx_dm_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_d_conn_ev_qfyrs_4, + .ev_actions = llc_d_conn_actions_4, +}; + +/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event, + * cause_flag = 0 + */ +static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_4_1[] = { + [0] = llc_conn_ev_qlfy_cause_flag_eq_0, + [1] = llc_conn_ev_qlfy_set_status_disc, + [2] = NULL, +}; + +static llc_conn_action_t llc_d_conn_actions_4_1[] = { + [0] = llc_conn_ac_stop_ack_timer, + [1] = llc_conn_disc, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_d_conn_state_trans_4_1 = { + .ev = llc_conn_ev_rx_dm_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_d_conn_ev_qfyrs_4_1, + .ev_actions = llc_d_conn_actions_4_1, +}; + +/* + * State transition for + * LLC_CONN_EV_DATA_CONN_REQ event + */ +static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_5[] = { + [0] = llc_conn_ev_qlfy_set_status_refuse, + [1] = NULL, +}; + +/* just one member, NULL, .bss zeroes it */ +static llc_conn_action_t llc_d_conn_actions_5[1]; + +static struct llc_conn_state_trans llc_d_conn_state_trans_5 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_D_CONN, + .ev_qualifiers = llc_d_conn_ev_qfyrs_5, + .ev_actions = llc_d_conn_actions_5, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_6[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_d_conn_actions_6[] = { + [0] = llc_conn_ac_send_disc_cmd_p_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_inc_retry_cnt_by_1, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_d_conn_state_trans_6 = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_D_CONN, + .ev_qualifiers = llc_d_conn_ev_qfyrs_6, + .ev_actions = llc_d_conn_actions_6, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event, cause_flag = 1 */ +static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_7[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2, + [1] = llc_conn_ev_qlfy_cause_flag_eq_1, + [2] = llc_conn_ev_qlfy_set_status_failed, + [3] = NULL, +}; + +static llc_conn_action_t llc_d_conn_actions_7[] = { + [0] = llc_conn_ac_disc_confirm, + [1] = llc_conn_disc, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_d_conn_state_trans_7 = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_d_conn_ev_qfyrs_7, + .ev_actions = llc_d_conn_actions_7, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event, cause_flag = 0 */ +static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_8[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2, + [1] = llc_conn_ev_qlfy_cause_flag_eq_0, + [2] = llc_conn_ev_qlfy_set_status_failed, + [3] = NULL, +}; + +static llc_conn_action_t llc_d_conn_actions_8[] = { + [0] = llc_conn_disc, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_d_conn_state_trans_8 = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_d_conn_ev_qfyrs_8, + .ev_actions = llc_d_conn_actions_8, +}; + +/* + * Array of pointers; + * one to each transition + */ +static struct llc_conn_state_trans *llc_d_conn_state_transitions[] = { + [0] = &llc_d_conn_state_trans_5, /* Request */ + [1] = &llc_common_state_trans_end, + [2] = &llc_common_state_trans_end, /* Local busy */ + [3] = &llc_common_state_trans_end, /* Initiate PF cycle */ + [4] = &llc_d_conn_state_trans_6, /* Timer */ + [5] = &llc_d_conn_state_trans_7, + 
[6] = &llc_d_conn_state_trans_8, + [7] = &llc_common_state_trans_end, + [8] = &llc_d_conn_state_trans_1, /* Receive frame */ + [9] = &llc_d_conn_state_trans_1_1, + [10] = &llc_d_conn_state_trans_2, + [11] = &llc_d_conn_state_trans_2_1, + [12] = &llc_d_conn_state_trans_3, + [13] = &llc_d_conn_state_trans_4, + [14] = &llc_d_conn_state_trans_4_1, + [15] = &llc_common_state_trans_end, +}; + +/* LLC_CONN_STATE_RESET transitions */ +/* State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event */ +static llc_conn_action_t llc_rst_actions_1[] = { + [0] = llc_conn_ac_set_vs_0, + [1] = llc_conn_ac_set_vr_0, + [2] = llc_conn_ac_set_s_flag_1, + [3] = llc_conn_ac_send_ua_rsp_f_set_p, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_rst_state_trans_1 = { + .ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_RESET, + .ev_qualifiers = NONE, + .ev_actions = llc_rst_actions_1, +}; + +/* State transitions for LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X event, + * cause_flag = 1 + */ +static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_2[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = llc_conn_ev_qlfy_cause_flag_eq_1, + [2] = llc_conn_ev_qlfy_set_status_conn, + [3] = NULL, +}; + +static llc_conn_action_t llc_rst_actions_2[] = { + [0] = llc_conn_ac_stop_ack_timer, + [1] = llc_conn_ac_set_vs_0, + [2] = llc_conn_ac_set_vr_0, + [3] = llc_conn_ac_upd_p_flag, + [4] = llc_conn_ac_rst_confirm, + [5] = llc_conn_ac_set_remote_busy_0, + [6] = llc_conn_reset, + [7] = NULL, +}; + +static struct llc_conn_state_trans llc_rst_state_trans_2 = { + .ev = llc_conn_ev_rx_ua_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_rst_ev_qfyrs_2, + .ev_actions = llc_rst_actions_2, +}; + +/* State transitions for LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X event, + * cause_flag = 0 + */ +static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_2_1[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = llc_conn_ev_qlfy_cause_flag_eq_0, + [2] = llc_conn_ev_qlfy_set_status_rst_done, + [3] = NULL, +}; + +static llc_conn_action_t llc_rst_actions_2_1[] = { + [0] = llc_conn_ac_stop_ack_timer, + [1] = llc_conn_ac_set_vs_0, + [2] = llc_conn_ac_set_vr_0, + [3] = llc_conn_ac_upd_p_flag, + [4] = llc_conn_ac_rst_confirm, + [5] = llc_conn_ac_set_remote_busy_0, + [6] = llc_conn_reset, + [7] = NULL, +}; + +static struct llc_conn_state_trans llc_rst_state_trans_2_1 = { + .ev = llc_conn_ev_rx_ua_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_rst_ev_qfyrs_2_1, + .ev_actions = llc_rst_actions_2_1, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_3[] = { + [0] = llc_conn_ev_qlfy_s_flag_eq_1, + [1] = llc_conn_ev_qlfy_set_status_rst_done, + [2] = NULL, +}; + +static llc_conn_action_t llc_rst_actions_3[] = { + [0] = llc_conn_ac_set_p_flag_0, + [1] = llc_conn_ac_set_remote_busy_0, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_rst_state_trans_3 = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_rst_ev_qfyrs_3, + .ev_actions = llc_rst_actions_3, +}; + +/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event, + * cause_flag = 1 + */ +static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_4[] = { + [0] = llc_conn_ev_qlfy_cause_flag_eq_1, + [1] = llc_conn_ev_qlfy_set_status_disc, + [2] = NULL, +}; +static llc_conn_action_t llc_rst_actions_4[] = { + [0] = llc_conn_ac_send_dm_rsp_f_set_p, + [1] = llc_conn_ac_disc_ind, + [2] = llc_conn_ac_stop_ack_timer, + [3] = llc_conn_disc, + [4] = NULL, +}; + 
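The rows around this point come in cause_flag pairs: llc_rst_state_trans_4, qualified by llc_conn_ev_qlfy_cause_flag_eq_1, runs llc_conn_ac_disc_ind in addition to the DM response and disconnect, while its twin llc_rst_state_trans_4_1 (qualified by llc_conn_ev_qlfy_cause_flag_eq_0 and carrying ..._set_status_refuse instead of ..._set_status_disc) handles the same received DISC command without the indication. The D_CONN transitions above and llc_rst_state_trans_5/_5_1 below follow the same split. A rough, self-contained sketch of that mechanism (not part of the patch; rst_ctx, pick_row and the qlfy_* helpers are hypothetical simplifications of the llc_conn_ev_qlfy_* predicates):

/* Editorial sketch only: a flag qualifier selecting between two rows for one event. */
#include <stddef.h>

struct rst_ctx { int cause_flag; };

typedef int (*rst_qfyr_t)(struct rst_ctx *ctx);	/* 0 means the qualifier passes */

int qlfy_cause_eq_1(struct rst_ctx *ctx) { return ctx->cause_flag != 1; }
int qlfy_cause_eq_0(struct rst_ctx *ctx) { return ctx->cause_flag != 0; }

struct rst_row {
	rst_qfyr_t *qualifiers;		/* NULL-terminated, like ev_qualifiers */
	const char *outcome;		/* stand-in for the action list */
};

static rst_qfyr_t row_cause_1[] = { qlfy_cause_eq_1, NULL };
static rst_qfyr_t row_cause_0[] = { qlfy_cause_eq_0, NULL };

static struct rst_row rows[] = {
	{ row_cause_1, "DM response, disconnect indication, then disconnect" },
	{ row_cause_0, "DM response, disconnect without an indication" },
};

/* Return the first row whose qualifiers all pass, as llc_qualify_conn_ev() does. */
const char *pick_row(struct rst_ctx *ctx)
{
	size_t i;
	rst_qfyr_t *q;

	for (i = 0; i < sizeof(rows) / sizeof(rows[0]); i++) {
		for (q = rows[i].qualifiers; *q; q++)
			if ((*q)(ctx))
				break;
		if (!*q)
			return rows[i].outcome;
	}
	return NULL;
}

Because a row fires only when every qualifier returns 0, at most one row of such a pair can match for a given connection, which is what lets the state machine register the same event twice with different outcomes.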
+static struct llc_conn_state_trans llc_rst_state_trans_4 = { + .ev = llc_conn_ev_rx_disc_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_rst_ev_qfyrs_4, + .ev_actions = llc_rst_actions_4, +}; + +/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event, + * cause_flag = 0 + */ +static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_4_1[] = { + [0] = llc_conn_ev_qlfy_cause_flag_eq_0, + [1] = llc_conn_ev_qlfy_set_status_refuse, + [2] = NULL, +}; + +static llc_conn_action_t llc_rst_actions_4_1[] = { + [0] = llc_conn_ac_send_dm_rsp_f_set_p, + [1] = llc_conn_ac_stop_ack_timer, + [2] = llc_conn_disc, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_rst_state_trans_4_1 = { + .ev = llc_conn_ev_rx_disc_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_rst_ev_qfyrs_4_1, + .ev_actions = llc_rst_actions_4_1, +}; + +/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event, + * cause_flag = 1 + */ +static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_5[] = { + [0] = llc_conn_ev_qlfy_cause_flag_eq_1, + [1] = llc_conn_ev_qlfy_set_status_disc, + [2] = NULL, +}; + +static llc_conn_action_t llc_rst_actions_5[] = { + [0] = llc_conn_ac_disc_ind, + [1] = llc_conn_ac_stop_ack_timer, + [2] = llc_conn_disc, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_rst_state_trans_5 = { + .ev = llc_conn_ev_rx_dm_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_rst_ev_qfyrs_5, + .ev_actions = llc_rst_actions_5, +}; + +/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event, + * cause_flag = 0 + */ +static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_5_1[] = { + [0] = llc_conn_ev_qlfy_cause_flag_eq_0, + [1] = llc_conn_ev_qlfy_set_status_refuse, + [2] = NULL, +}; + +static llc_conn_action_t llc_rst_actions_5_1[] = { + [0] = llc_conn_ac_stop_ack_timer, + [1] = llc_conn_disc, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_rst_state_trans_5_1 = { + .ev = llc_conn_ev_rx_dm_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_rst_ev_qfyrs_5_1, + .ev_actions = llc_rst_actions_5_1, +}; + +/* State transitions for DATA_CONN_REQ event */ +static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_6[] = { + [0] = llc_conn_ev_qlfy_set_status_refuse, + [1] = NULL, +}; + +/* just one member, NULL, .bss zeroes it */ +static llc_conn_action_t llc_rst_actions_6[1]; + +static struct llc_conn_state_trans llc_rst_state_trans_6 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_RESET, + .ev_qualifiers = llc_rst_ev_qfyrs_6, + .ev_actions = llc_rst_actions_6, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_7[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [1] = llc_conn_ev_qlfy_s_flag_eq_0, + [2] = NULL, +}; + +static llc_conn_action_t llc_rst_actions_7[] = { + [0] = llc_conn_ac_send_sabme_cmd_p_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_inc_retry_cnt_by_1, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_rst_state_trans_7 = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_RESET, + .ev_qualifiers = llc_rst_ev_qfyrs_7, + .ev_actions = llc_rst_actions_7, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_8[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2, + [1] = llc_conn_ev_qlfy_s_flag_eq_0, + [2] = llc_conn_ev_qlfy_cause_flag_eq_1, + [3] = llc_conn_ev_qlfy_set_status_failed, + [4] = NULL, +}; +static llc_conn_action_t llc_rst_actions_8[] = { + [0] = 
llc_conn_ac_disc_ind, + [1] = llc_conn_disc, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_rst_state_trans_8 = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_rst_ev_qfyrs_8, + .ev_actions = llc_rst_actions_8, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_8_1[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2, + [1] = llc_conn_ev_qlfy_s_flag_eq_0, + [2] = llc_conn_ev_qlfy_cause_flag_eq_0, + [3] = llc_conn_ev_qlfy_set_status_failed, + [4] = NULL, +}; +static llc_conn_action_t llc_rst_actions_8_1[] = { + [0] = llc_conn_ac_disc_ind, + [1] = llc_conn_disc, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_rst_state_trans_8_1 = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_rst_ev_qfyrs_8_1, + .ev_actions = llc_rst_actions_8_1, +}; + +/* + * Array of pointers; + * one to each transition + */ +static struct llc_conn_state_trans *llc_rst_state_transitions[] = { + [0] = &llc_rst_state_trans_6, /* Request */ + [1] = &llc_common_state_trans_end, + [2] = &llc_common_state_trans_end, /* Local busy */ + [3] = &llc_common_state_trans_end, /* Initiate PF cycle */ + [4] = &llc_rst_state_trans_3, /* Timer */ + [5] = &llc_rst_state_trans_7, + [6] = &llc_rst_state_trans_8, + [7] = &llc_rst_state_trans_8_1, + [8] = &llc_common_state_trans_end, + [9] = &llc_rst_state_trans_1, /* Receive frame */ + [10] = &llc_rst_state_trans_2, + [11] = &llc_rst_state_trans_2_1, + [12] = &llc_rst_state_trans_4, + [13] = &llc_rst_state_trans_4_1, + [14] = &llc_rst_state_trans_5, + [15] = &llc_rst_state_trans_5_1, + [16] = &llc_common_state_trans_end, +}; + +/* LLC_CONN_STATE_ERROR transitions */ +/* State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event */ +static llc_conn_action_t llc_error_actions_1[] = { + [0] = llc_conn_ac_set_vs_0, + [1] = llc_conn_ac_set_vr_0, + [2] = llc_conn_ac_send_ua_rsp_f_set_p, + [3] = llc_conn_ac_rst_ind, + [4] = llc_conn_ac_set_p_flag_0, + [5] = llc_conn_ac_set_remote_busy_0, + [6] = llc_conn_ac_stop_ack_timer, + [7] = llc_conn_reset, + [8] = NULL, +}; + +static struct llc_conn_state_trans llc_error_state_trans_1 = { + .ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_error_actions_1, +}; + +/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event */ +static llc_conn_action_t llc_error_actions_2[] = { + [0] = llc_conn_ac_send_ua_rsp_f_set_p, + [1] = llc_conn_ac_disc_ind, + [2] = llc_conn_ac_stop_ack_timer, + [3] = llc_conn_disc, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_error_state_trans_2 = { + .ev = llc_conn_ev_rx_disc_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = NONE, + .ev_actions = llc_error_actions_2, +}; + +/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event */ +static llc_conn_action_t llc_error_actions_3[] = { + [0] = llc_conn_ac_disc_ind, + [1] = llc_conn_ac_stop_ack_timer, + [2] = llc_conn_disc, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_error_state_trans_3 = { + .ev = llc_conn_ev_rx_dm_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = NONE, + .ev_actions = llc_error_actions_3, +}; + +/* State transitions for LLC_CONN_EV_RX_FRMR_RSP_Fbit_SET_X event */ +static llc_conn_action_t llc_error_actions_4[] = { + [0] = llc_conn_ac_send_sabme_cmd_p_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_set_retry_cnt_0, + [3] = 
llc_conn_ac_set_cause_flag_0, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_error_state_trans_4 = { + .ev = llc_conn_ev_rx_frmr_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_RESET, + .ev_qualifiers = NONE, + .ev_actions = llc_error_actions_4, +}; + +/* State transitions for LLC_CONN_EV_RX_XXX_CMD_Pbit_SET_X event */ +static llc_conn_action_t llc_error_actions_5[] = { + [0] = llc_conn_ac_resend_frmr_rsp_f_set_p, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_error_state_trans_5 = { + .ev = llc_conn_ev_rx_xxx_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_ERROR, + .ev_qualifiers = NONE, + .ev_actions = llc_error_actions_5, +}; + +/* State transitions for LLC_CONN_EV_RX_XXX_RSP_Fbit_SET_X event */ +static struct llc_conn_state_trans llc_error_state_trans_6 = { + .ev = llc_conn_ev_rx_xxx_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_ERROR, + .ev_qualifiers = NONE, + .ev_actions = NONE, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_error_ev_qfyrs_7[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_error_actions_7[] = { + [0] = llc_conn_ac_resend_frmr_rsp_f_set_0, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_inc_retry_cnt_by_1, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_error_state_trans_7 = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_ERROR, + .ev_qualifiers = llc_error_ev_qfyrs_7, + .ev_actions = llc_error_actions_7, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_error_ev_qfyrs_8[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_error_actions_8[] = { + [0] = llc_conn_ac_send_sabme_cmd_p_set_x, + [1] = llc_conn_ac_set_s_flag_0, + [2] = llc_conn_ac_start_ack_timer, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = llc_conn_ac_set_cause_flag_0, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_error_state_trans_8 = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_RESET, + .ev_qualifiers = llc_error_ev_qfyrs_8, + .ev_actions = llc_error_actions_8, +}; + +/* State transitions for LLC_CONN_EV_DATA_CONN_REQ event */ +static llc_conn_ev_qfyr_t llc_error_ev_qfyrs_9[] = { + [0] = llc_conn_ev_qlfy_set_status_refuse, + [1] = NULL, +}; + +/* just one member, NULL, .bss zeroes it */ +static llc_conn_action_t llc_error_actions_9[1]; + +static struct llc_conn_state_trans llc_error_state_trans_9 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_ERROR, + .ev_qualifiers = llc_error_ev_qfyrs_9, + .ev_actions = llc_error_actions_9, +}; + +/* + * Array of pointers; + * one to each transition + */ +static struct llc_conn_state_trans *llc_error_state_transitions[] = { + [0] = &llc_error_state_trans_9, /* Request */ + [1] = &llc_common_state_trans_end, + [2] = &llc_common_state_trans_end, /* Local busy */ + [3] = &llc_common_state_trans_end, /* Initiate PF cycle */ + [4] = &llc_error_state_trans_7, /* Timer */ + [5] = &llc_error_state_trans_8, + [6] = &llc_common_state_trans_end, + [7] = &llc_error_state_trans_1, /* Receive frame */ + [8] = &llc_error_state_trans_2, + [9] = &llc_error_state_trans_3, + [10] = &llc_error_state_trans_4, + [11] = &llc_error_state_trans_5, + [12] = &llc_error_state_trans_6, + [13] = &llc_common_state_trans_end, +}; + +/* LLC_CONN_STATE_TEMP transitions */ +/* State transitions for LLC_CONN_EV_DISC_REQ event */ +static llc_conn_action_t llc_temp_actions_1[] = { + [0] = 
llc_conn_ac_stop_all_timers,
+	[1] = llc_conn_ac_send_disc_cmd_p_set_x,
+	[2] = llc_conn_disc,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_temp_state_trans_1 = {
+	.ev = llc_conn_ev_disc_req,
+	.next_state = LLC_CONN_STATE_ADM,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_temp_actions_1,
+};
+
+/*
+ * Array of pointers;
+ * one to each transition
+ */
+static struct llc_conn_state_trans *llc_temp_state_transitions[] = {
+	[0] = &llc_temp_state_trans_1, /* requests */
+	[1] = &llc_common_state_trans_end,
+	[2] = &llc_common_state_trans_end, /* local busy */
+	[3] = &llc_common_state_trans_end, /* init_pf_cycle */
+	[4] = &llc_common_state_trans_end, /* timer */
+	[5] = &llc_common_state_trans_end, /* receive */
+};
+
+/* Connection State Transition Table */
+struct llc_conn_state llc_conn_state_table[NBR_CONN_STATES] = {
+	[LLC_CONN_STATE_ADM - 1] = {
+		.current_state = LLC_CONN_STATE_ADM,
+		.transitions = llc_adm_state_transitions,
+	},
+	[LLC_CONN_STATE_SETUP - 1] = {
+		.current_state = LLC_CONN_STATE_SETUP,
+		.transitions = llc_setup_state_transitions,
+	},
+	[LLC_CONN_STATE_NORMAL - 1] = {
+		.current_state = LLC_CONN_STATE_NORMAL,
+		.transitions = llc_normal_state_transitions,
+	},
+	[LLC_CONN_STATE_BUSY - 1] = {
+		.current_state = LLC_CONN_STATE_BUSY,
+		.transitions = llc_busy_state_transitions,
+	},
+	[LLC_CONN_STATE_REJ - 1] = {
+		.current_state = LLC_CONN_STATE_REJ,
+		.transitions = llc_reject_state_transitions,
+	},
+	[LLC_CONN_STATE_AWAIT - 1] = {
+		.current_state = LLC_CONN_STATE_AWAIT,
+		.transitions = llc_await_state_transitions,
+	},
+	[LLC_CONN_STATE_AWAIT_BUSY - 1] = {
+		.current_state = LLC_CONN_STATE_AWAIT_BUSY,
+		.transitions = llc_await_busy_state_transitions,
+	},
+	[LLC_CONN_STATE_AWAIT_REJ - 1] = {
+		.current_state = LLC_CONN_STATE_AWAIT_REJ,
+		.transitions = llc_await_rejct_state_transitions,
+	},
+	[LLC_CONN_STATE_D_CONN - 1] = {
+		.current_state = LLC_CONN_STATE_D_CONN,
+		.transitions = llc_d_conn_state_transitions,
+	},
+	[LLC_CONN_STATE_RESET - 1] = {
+		.current_state = LLC_CONN_STATE_RESET,
+		.transitions = llc_rst_state_transitions,
+	},
+	[LLC_CONN_STATE_ERROR - 1] = {
+		.current_state = LLC_CONN_STATE_ERROR,
+		.transitions = llc_error_state_transitions,
+	},
+	[LLC_CONN_STATE_TEMP - 1] = {
+		.current_state = LLC_CONN_STATE_TEMP,
+		.transitions = llc_temp_state_transitions,
+	},
+};
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
new file mode 100644
index 00000000..ba137a6a
--- /dev/null
+++ b/net/llc/llc_conn.c
@@ -0,0 +1,1016 @@
+/*
+ * llc_conn.c - Driver routines for connection component.
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ *		 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/llc_sap.h>
+#include <net/llc_conn.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/llc_c_ev.h>
+#include <net/llc_c_ac.h>
+#include <net/llc_c_st.h>
+#include <net/llc_pdu.h>
+
+#if 0
+#define dprintk(args...) printk(KERN_DEBUG args)
+#else
+#define dprintk(args...)
+#endif + +static int llc_find_offset(int state, int ev_type); +static void llc_conn_send_pdus(struct sock *sk); +static int llc_conn_service(struct sock *sk, struct sk_buff *skb); +static int llc_exec_conn_trans_actions(struct sock *sk, + struct llc_conn_state_trans *trans, + struct sk_buff *ev); +static struct llc_conn_state_trans *llc_qualify_conn_ev(struct sock *sk, + struct sk_buff *skb); + +/* Offset table on connection states transition diagram */ +static int llc_offset_table[NBR_CONN_STATES][NBR_CONN_EV]; + +int sysctl_llc2_ack_timeout = LLC2_ACK_TIME * HZ; +int sysctl_llc2_p_timeout = LLC2_P_TIME * HZ; +int sysctl_llc2_rej_timeout = LLC2_REJ_TIME * HZ; +int sysctl_llc2_busy_timeout = LLC2_BUSY_TIME * HZ; + +/** + * llc_conn_state_process - sends event to connection state machine + * @sk: connection + * @skb: occurred event + * + * Sends an event to connection state machine. After processing event + * (executing it's actions and changing state), upper layer will be + * indicated or confirmed, if needed. Returns 0 for success, 1 for + * failure. The socket lock has to be held before calling this function. + */ +int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) +{ + int rc; + struct llc_sock *llc = llc_sk(skb->sk); + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + /* + * We have to hold the skb, because llc_conn_service will kfree it in + * the sending path and we need to look at the skb->cb, where we encode + * llc_conn_state_ev. + */ + skb_get(skb); + ev->ind_prim = ev->cfm_prim = 0; + /* + * Send event to state machine + */ + rc = llc_conn_service(skb->sk, skb); + if (unlikely(rc != 0)) { + printk(KERN_ERR "%s: llc_conn_service failed\n", __func__); + goto out_kfree_skb; + } + + if (unlikely(!ev->ind_prim && !ev->cfm_prim)) { + /* indicate or confirm not required */ + if (!skb->next) + goto out_kfree_skb; + goto out_skb_put; + } + + if (unlikely(ev->ind_prim && ev->cfm_prim)) /* Paranoia */ + skb_get(skb); + + switch (ev->ind_prim) { + case LLC_DATA_PRIM: + llc_save_primitive(sk, skb, LLC_DATA_PRIM); + if (unlikely(sock_queue_rcv_skb(sk, skb))) { + /* + * shouldn't happen + */ + printk(KERN_ERR "%s: sock_queue_rcv_skb failed!\n", + __func__); + kfree_skb(skb); + } + break; + case LLC_CONN_PRIM: + /* + * Can't be sock_queue_rcv_skb, because we have to leave the + * skb->sk pointing to the newly created struct sock in + * llc_conn_handler. 
-acme + */ + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_state_change(sk); + break; + case LLC_DISC_PRIM: + sock_hold(sk); + if (sk->sk_type == SOCK_STREAM && + sk->sk_state == TCP_ESTABLISHED) { + sk->sk_shutdown = SHUTDOWN_MASK; + sk->sk_socket->state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; + if (!sock_flag(sk, SOCK_DEAD)) { + sock_set_flag(sk, SOCK_DEAD); + sk->sk_state_change(sk); + } + } + kfree_skb(skb); + sock_put(sk); + break; + case LLC_RESET_PRIM: + /* + * FIXME: + * RESET is not being notified to upper layers for now + */ + printk(KERN_INFO "%s: received a reset ind!\n", __func__); + kfree_skb(skb); + break; + default: + if (ev->ind_prim) { + printk(KERN_INFO "%s: received unknown %d prim!\n", + __func__, ev->ind_prim); + kfree_skb(skb); + } + /* No indication */ + break; + } + + switch (ev->cfm_prim) { + case LLC_DATA_PRIM: + if (!llc_data_accept_state(llc->state)) + sk->sk_write_space(sk); + else + rc = llc->failed_data_req = 1; + break; + case LLC_CONN_PRIM: + if (sk->sk_type == SOCK_STREAM && + sk->sk_state == TCP_SYN_SENT) { + if (ev->status) { + sk->sk_socket->state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; + } else { + sk->sk_socket->state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; + } + sk->sk_state_change(sk); + } + break; + case LLC_DISC_PRIM: + sock_hold(sk); + if (sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_CLOSING) { + sk->sk_socket->state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; + sk->sk_state_change(sk); + } + sock_put(sk); + break; + case LLC_RESET_PRIM: + /* + * FIXME: + * RESET is not being notified to upper layers for now + */ + printk(KERN_INFO "%s: received a reset conf!\n", __func__); + break; + default: + if (ev->cfm_prim) { + printk(KERN_INFO "%s: received unknown %d prim!\n", + __func__, ev->cfm_prim); + break; + } + goto out_skb_put; /* No confirmation */ + } +out_kfree_skb: + kfree_skb(skb); +out_skb_put: + kfree_skb(skb); + return rc; +} + +void llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb) +{ + /* queue PDU to send to MAC layer */ + skb_queue_tail(&sk->sk_write_queue, skb); + llc_conn_send_pdus(sk); +} + +/** + * llc_conn_rtn_pdu - sends received data pdu to upper layer + * @sk: Active connection + * @skb: Received data frame + * + * Sends received data pdu to upper layer (by using indicate function). + * Prepares service parameters (prim and prim_data). calling indication + * function will be done in llc_conn_state_process. + */ +void llc_conn_rtn_pdu(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->ind_prim = LLC_DATA_PRIM; +} + +/** + * llc_conn_resend_i_pdu_as_cmd - resend all all unacknowledged I PDUs + * @sk: active connection + * @nr: NR + * @first_p_bit: p_bit value of first pdu + * + * Resend all unacknowledged I PDUs, starting with the NR; send first as + * command PDU with P bit equal first_p_bit; if more than one send + * subsequent as command PDUs with P bit equal zero (0). + */ +void llc_conn_resend_i_pdu_as_cmd(struct sock *sk, u8 nr, u8 first_p_bit) +{ + struct sk_buff *skb; + struct llc_pdu_sn *pdu; + u16 nbr_unack_pdus; + struct llc_sock *llc; + u8 howmany_resend = 0; + + llc_conn_remove_acked_pdus(sk, nr, &nbr_unack_pdus); + if (!nbr_unack_pdus) + goto out; + /* + * Process unack PDUs only if unack queue is not empty; remove + * appropriate PDUs, fix them up, and put them on mac_pdu_q. 
+ */ + llc = llc_sk(sk); + + while ((skb = skb_dequeue(&llc->pdu_unack_q)) != NULL) { + pdu = llc_pdu_sn_hdr(skb); + llc_pdu_set_cmd_rsp(skb, LLC_PDU_CMD); + llc_pdu_set_pf_bit(skb, first_p_bit); + skb_queue_tail(&sk->sk_write_queue, skb); + first_p_bit = 0; + llc->vS = LLC_I_GET_NS(pdu); + howmany_resend++; + } + if (howmany_resend > 0) + llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO; + /* any PDUs to re-send are queued up; start sending to MAC */ + llc_conn_send_pdus(sk); +out:; +} + +/** + * llc_conn_resend_i_pdu_as_rsp - Resend all unacknowledged I PDUs + * @sk: active connection. + * @nr: NR + * @first_f_bit: f_bit value of first pdu. + * + * Resend all unacknowledged I PDUs, starting with the NR; send first as + * response PDU with F bit equal first_f_bit; if more than one send + * subsequent as response PDUs with F bit equal zero (0). + */ +void llc_conn_resend_i_pdu_as_rsp(struct sock *sk, u8 nr, u8 first_f_bit) +{ + struct sk_buff *skb; + u16 nbr_unack_pdus; + struct llc_sock *llc = llc_sk(sk); + u8 howmany_resend = 0; + + llc_conn_remove_acked_pdus(sk, nr, &nbr_unack_pdus); + if (!nbr_unack_pdus) + goto out; + /* + * Process unack PDUs only if unack queue is not empty; remove + * appropriate PDUs, fix them up, and put them on mac_pdu_q + */ + while ((skb = skb_dequeue(&llc->pdu_unack_q)) != NULL) { + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + llc_pdu_set_cmd_rsp(skb, LLC_PDU_RSP); + llc_pdu_set_pf_bit(skb, first_f_bit); + skb_queue_tail(&sk->sk_write_queue, skb); + first_f_bit = 0; + llc->vS = LLC_I_GET_NS(pdu); + howmany_resend++; + } + if (howmany_resend > 0) + llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO; + /* any PDUs to re-send are queued up; start sending to MAC */ + llc_conn_send_pdus(sk); +out:; +} + +/** + * llc_conn_remove_acked_pdus - Removes acknowledged pdus from tx queue + * @sk: active connection + * nr: NR + * how_many_unacked: size of pdu_unack_q after removing acked pdus + * + * Removes acknowledged pdus from transmit queue (pdu_unack_q). Returns + * the number of pdus that removed from queue. + */ +int llc_conn_remove_acked_pdus(struct sock *sk, u8 nr, u16 *how_many_unacked) +{ + int pdu_pos, i; + struct sk_buff *skb; + struct llc_pdu_sn *pdu; + int nbr_acked = 0; + struct llc_sock *llc = llc_sk(sk); + int q_len = skb_queue_len(&llc->pdu_unack_q); + + if (!q_len) + goto out; + skb = skb_peek(&llc->pdu_unack_q); + pdu = llc_pdu_sn_hdr(skb); + + /* finding position of last acked pdu in queue */ + pdu_pos = ((int)LLC_2_SEQ_NBR_MODULO + (int)nr - + (int)LLC_I_GET_NS(pdu)) % LLC_2_SEQ_NBR_MODULO; + + for (i = 0; i < pdu_pos && i < q_len; i++) { + skb = skb_dequeue(&llc->pdu_unack_q); + kfree_skb(skb); + nbr_acked++; + } +out: + *how_many_unacked = skb_queue_len(&llc->pdu_unack_q); + return nbr_acked; +} + +/** + * llc_conn_send_pdus - Sends queued PDUs + * @sk: active connection + * + * Sends queued pdus to MAC layer for transmission. 
+ */ +static void llc_conn_send_pdus(struct sock *sk) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(&sk->sk_write_queue)) != NULL) { + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + if (LLC_PDU_TYPE_IS_I(pdu) && + !(skb->dev->flags & IFF_LOOPBACK)) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + + skb_queue_tail(&llc_sk(sk)->pdu_unack_q, skb); + if (!skb2) + break; + skb = skb2; + } + dev_queue_xmit(skb); + } +} + +/** + * llc_conn_service - finds transition and changes state of connection + * @sk: connection + * @skb: happened event + * + * This function finds transition that matches with happened event, then + * executes related actions and finally changes state of connection. + * Returns 0 for success, 1 for failure. + */ +static int llc_conn_service(struct sock *sk, struct sk_buff *skb) +{ + int rc = 1; + struct llc_sock *llc = llc_sk(sk); + struct llc_conn_state_trans *trans; + + if (llc->state > NBR_CONN_STATES) + goto out; + rc = 0; + trans = llc_qualify_conn_ev(sk, skb); + if (trans) { + rc = llc_exec_conn_trans_actions(sk, trans, skb); + if (!rc && trans->next_state != NO_STATE_CHANGE) { + llc->state = trans->next_state; + if (!llc_data_accept_state(llc->state)) + sk->sk_state_change(sk); + } + } +out: + return rc; +} + +/** + * llc_qualify_conn_ev - finds transition for event + * @sk: connection + * @skb: happened event + * + * This function finds transition that matches with happened event. + * Returns pointer to found transition on success, %NULL otherwise. + */ +static struct llc_conn_state_trans *llc_qualify_conn_ev(struct sock *sk, + struct sk_buff *skb) +{ + struct llc_conn_state_trans **next_trans; + llc_conn_ev_qfyr_t *next_qualifier; + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + struct llc_sock *llc = llc_sk(sk); + struct llc_conn_state *curr_state = + &llc_conn_state_table[llc->state - 1]; + + /* search thru events for this state until + * list exhausted or until no more + */ + for (next_trans = curr_state->transitions + + llc_find_offset(llc->state - 1, ev->type); + (*next_trans)->ev; next_trans++) { + if (!((*next_trans)->ev)(sk, skb)) { + /* got POSSIBLE event match; the event may require + * qualification based on the values of a number of + * state flags; if all qualifications are met (i.e., + * if all qualifying functions return success, or 0, + * then this is THE event we're looking for + */ + for (next_qualifier = (*next_trans)->ev_qualifiers; + next_qualifier && *next_qualifier && + !(*next_qualifier)(sk, skb); next_qualifier++) + /* nothing */; + if (!next_qualifier || !*next_qualifier) + /* all qualifiers executed successfully; this is + * our transition; return it so we can perform + * the associated actions & change the state + */ + return *next_trans; + } + } + return NULL; +} + +/** + * llc_exec_conn_trans_actions - executes related actions + * @sk: connection + * @trans: transition that it's actions must be performed + * @skb: event + * + * Executes actions that is related to happened event. Returns 0 for + * success, 1 to indicate failure of at least one action. 
+ */ +static int llc_exec_conn_trans_actions(struct sock *sk, + struct llc_conn_state_trans *trans, + struct sk_buff *skb) +{ + int rc = 0; + llc_conn_action_t *next_action; + + for (next_action = trans->ev_actions; + next_action && *next_action; next_action++) { + int rc2 = (*next_action)(sk, skb); + + if (rc2 == 2) { + rc = rc2; + break; + } else if (rc2) + rc = 1; + } + return rc; +} + +static inline bool llc_estab_match(const struct llc_sap *sap, + const struct llc_addr *daddr, + const struct llc_addr *laddr, + const struct sock *sk) +{ + struct llc_sock *llc = llc_sk(sk); + + return llc->laddr.lsap == laddr->lsap && + llc->daddr.lsap == daddr->lsap && + llc_mac_match(llc->laddr.mac, laddr->mac) && + llc_mac_match(llc->daddr.mac, daddr->mac); +} + +/** + * __llc_lookup_established - Finds connection for the remote/local sap/mac + * @sap: SAP + * @daddr: address of remote LLC (MAC + SAP) + * @laddr: address of local LLC (MAC + SAP) + * + * Search connection list of the SAP and finds connection using the remote + * mac, remote sap, local mac, and local sap. Returns pointer for + * connection found, %NULL otherwise. + * Caller has to make sure local_bh is disabled. + */ +static struct sock *__llc_lookup_established(struct llc_sap *sap, + struct llc_addr *daddr, + struct llc_addr *laddr) +{ + struct sock *rc; + struct hlist_nulls_node *node; + int slot = llc_sk_laddr_hashfn(sap, laddr); + struct hlist_nulls_head *laddr_hb = &sap->sk_laddr_hash[slot]; + + rcu_read_lock(); +again: + sk_nulls_for_each_rcu(rc, node, laddr_hb) { + if (llc_estab_match(sap, daddr, laddr, rc)) { + /* Extra checks required by SLAB_DESTROY_BY_RCU */ + if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) + goto again; + if (unlikely(llc_sk(rc)->sap != sap || + !llc_estab_match(sap, daddr, laddr, rc))) { + sock_put(rc); + continue; + } + goto found; + } + } + rc = NULL; + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (unlikely(get_nulls_value(node) != slot)) + goto again; +found: + rcu_read_unlock(); + return rc; +} + +struct sock *llc_lookup_established(struct llc_sap *sap, + struct llc_addr *daddr, + struct llc_addr *laddr) +{ + struct sock *sk; + + local_bh_disable(); + sk = __llc_lookup_established(sap, daddr, laddr); + local_bh_enable(); + return sk; +} + +static inline bool llc_listener_match(const struct llc_sap *sap, + const struct llc_addr *laddr, + const struct sock *sk) +{ + struct llc_sock *llc = llc_sk(sk); + + return sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN && + llc->laddr.lsap == laddr->lsap && + llc_mac_match(llc->laddr.mac, laddr->mac); +} + +static struct sock *__llc_lookup_listener(struct llc_sap *sap, + struct llc_addr *laddr) +{ + struct sock *rc; + struct hlist_nulls_node *node; + int slot = llc_sk_laddr_hashfn(sap, laddr); + struct hlist_nulls_head *laddr_hb = &sap->sk_laddr_hash[slot]; + + rcu_read_lock(); +again: + sk_nulls_for_each_rcu(rc, node, laddr_hb) { + if (llc_listener_match(sap, laddr, rc)) { + /* Extra checks required by SLAB_DESTROY_BY_RCU */ + if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) + goto again; + if (unlikely(llc_sk(rc)->sap != sap || + !llc_listener_match(sap, laddr, rc))) { + sock_put(rc); + continue; + } + goto found; + } + } + rc = NULL; + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. 
+ */ + if (unlikely(get_nulls_value(node) != slot)) + goto again; +found: + rcu_read_unlock(); + return rc; +} + +/** + * llc_lookup_listener - Finds listener for local MAC + SAP + * @sap: SAP + * @laddr: address of local LLC (MAC + SAP) + * + * Search connection list of the SAP and finds connection listening on + * local mac, and local sap. Returns pointer for parent socket found, + * %NULL otherwise. + * Caller has to make sure local_bh is disabled. + */ +static struct sock *llc_lookup_listener(struct llc_sap *sap, + struct llc_addr *laddr) +{ + static struct llc_addr null_addr; + struct sock *rc = __llc_lookup_listener(sap, laddr); + + if (!rc) + rc = __llc_lookup_listener(sap, &null_addr); + + return rc; +} + +static struct sock *__llc_lookup(struct llc_sap *sap, + struct llc_addr *daddr, + struct llc_addr *laddr) +{ + struct sock *sk = __llc_lookup_established(sap, daddr, laddr); + + return sk ? : llc_lookup_listener(sap, laddr); +} + +/** + * llc_data_accept_state - designates if in this state data can be sent. + * @state: state of connection. + * + * Returns 0 if data can be sent, 1 otherwise. + */ +u8 llc_data_accept_state(u8 state) +{ + return state != LLC_CONN_STATE_NORMAL && state != LLC_CONN_STATE_BUSY && + state != LLC_CONN_STATE_REJ; +} + +/** + * llc_find_next_offset - finds offset for next category of transitions + * @state: state table. + * @offset: start offset. + * + * Finds offset of next category of transitions in transition table. + * Returns the start index of next category. + */ +static u16 __init llc_find_next_offset(struct llc_conn_state *state, u16 offset) +{ + u16 cnt = 0; + struct llc_conn_state_trans **next_trans; + + for (next_trans = state->transitions + offset; + (*next_trans)->ev; next_trans++) + ++cnt; + return cnt; +} + +/** + * llc_build_offset_table - builds offset table of connection + * + * Fills offset table of connection state transition table + * (llc_offset_table). + */ +void __init llc_build_offset_table(void) +{ + struct llc_conn_state *curr_state; + int state, ev_type, next_offset; + + for (state = 0; state < NBR_CONN_STATES; state++) { + curr_state = &llc_conn_state_table[state]; + next_offset = 0; + for (ev_type = 0; ev_type < NBR_CONN_EV; ev_type++) { + llc_offset_table[state][ev_type] = next_offset; + next_offset += llc_find_next_offset(curr_state, + next_offset) + 1; + } + } +} + +/** + * llc_find_offset - finds start offset of category of transitions + * @state: state of connection + * @ev_type: type of happened event + * + * Finds start offset of desired category of transitions. Returns the + * desired start offset. + */ +static int llc_find_offset(int state, int ev_type) +{ + int rc = 0; + /* at this stage, llc_offset_table[..][2] is not important. it is for + * init_pf_cycle and I don't know what is it. + */ + switch (ev_type) { + case LLC_CONN_EV_TYPE_PRIM: + rc = llc_offset_table[state][0]; break; + case LLC_CONN_EV_TYPE_PDU: + rc = llc_offset_table[state][4]; break; + case LLC_CONN_EV_TYPE_SIMPLE: + rc = llc_offset_table[state][1]; break; + case LLC_CONN_EV_TYPE_P_TMR: + case LLC_CONN_EV_TYPE_ACK_TMR: + case LLC_CONN_EV_TYPE_REJ_TMR: + case LLC_CONN_EV_TYPE_BUSY_TMR: + rc = llc_offset_table[state][3]; break; + } + return rc; +} + +/** + * llc_sap_add_socket - adds a socket to a SAP + * @sap: SAP + * @sk: socket + * + * This function adds a socket to the hash tables of a SAP. 
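+ *
+ * Illustrative sketch (mirrors llc_create_incoming_sock() later in this
+ * patch): llc->laddr and llc->dev must already be set up, because the
+ * two hash functions used below key off them:
+ *
+ *	memcpy(&newllc->laddr, daddr, sizeof(newllc->laddr));
+ *	newllc->dev = dev;
+ *	dev_hold(dev);
+ *	llc_sap_add_socket(llc->sap, newsk);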
+ */ +void llc_sap_add_socket(struct llc_sap *sap, struct sock *sk) +{ + struct llc_sock *llc = llc_sk(sk); + struct hlist_head *dev_hb = llc_sk_dev_hash(sap, llc->dev->ifindex); + struct hlist_nulls_head *laddr_hb = llc_sk_laddr_hash(sap, &llc->laddr); + + llc_sap_hold(sap); + llc_sk(sk)->sap = sap; + + spin_lock_bh(&sap->sk_lock); + sap->sk_count++; + sk_nulls_add_node_rcu(sk, laddr_hb); + hlist_add_head(&llc->dev_hash_node, dev_hb); + spin_unlock_bh(&sap->sk_lock); +} + +/** + * llc_sap_remove_socket - removes a socket from SAP + * @sap: SAP + * @sk: socket + * + * This function removes a connection from the hash tables of a SAP if + * the connection was in this list. + */ +void llc_sap_remove_socket(struct llc_sap *sap, struct sock *sk) +{ + struct llc_sock *llc = llc_sk(sk); + + spin_lock_bh(&sap->sk_lock); + sk_nulls_del_node_init_rcu(sk); + hlist_del(&llc->dev_hash_node); + sap->sk_count--; + spin_unlock_bh(&sap->sk_lock); + llc_sap_put(sap); +} + +/** + * llc_conn_rcv - sends received pdus to the connection state machine + * @sk: current connection structure. + * @skb: received frame. + * + * Sends received pdus to the connection state machine. + */ +static int llc_conn_rcv(struct sock* sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->type = LLC_CONN_EV_TYPE_PDU; + ev->reason = 0; + return llc_conn_state_process(sk, skb); +} + +static struct sock *llc_create_incoming_sock(struct sock *sk, + struct net_device *dev, + struct llc_addr *saddr, + struct llc_addr *daddr) +{ + struct sock *newsk = llc_sk_alloc(sock_net(sk), sk->sk_family, GFP_ATOMIC, + sk->sk_prot); + struct llc_sock *newllc, *llc = llc_sk(sk); + + if (!newsk) + goto out; + newllc = llc_sk(newsk); + memcpy(&newllc->laddr, daddr, sizeof(newllc->laddr)); + memcpy(&newllc->daddr, saddr, sizeof(newllc->daddr)); + newllc->dev = dev; + dev_hold(dev); + llc_sap_add_socket(llc->sap, newsk); + llc_sap_hold(llc->sap); +out: + return newsk; +} + +void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_addr saddr, daddr; + struct sock *sk; + + llc_pdu_decode_sa(skb, saddr.mac); + llc_pdu_decode_ssap(skb, &saddr.lsap); + llc_pdu_decode_da(skb, daddr.mac); + llc_pdu_decode_dsap(skb, &daddr.lsap); + + sk = __llc_lookup(sap, &saddr, &daddr); + if (!sk) + goto drop; + + bh_lock_sock(sk); + /* + * This has to be done here and not at the upper layer ->accept + * method because of the way the PROCOM state machine works: + * it needs to set several state variables (see, for instance, + * llc_adm_actions_2 in net/llc/llc_c_st.c) and send a packet to + * the originator of the new connection, and this state has to be + * in the newly created struct sock private area. -acme + */ + if (unlikely(sk->sk_state == TCP_LISTEN)) { + struct sock *newsk = llc_create_incoming_sock(sk, skb->dev, + &saddr, &daddr); + if (!newsk) + goto drop_unlock; + skb_set_owner_r(skb, newsk); + } else { + /* + * Can't be skb_set_owner_r, this will be done at the + * llc_conn_state_process function, later on, when we will use + * skb_queue_rcv_skb to send it to upper layers, this is + * another trick required to cope with how the PROCOM state + * machine works. 
-acme + */ + skb->sk = sk; + } + if (!sock_owned_by_user(sk)) + llc_conn_rcv(sk, skb); + else { + dprintk("%s: adding to backlog...\n", __func__); + llc_set_backlog_type(skb, LLC_PACKET); + if (sk_add_backlog(sk, skb)) + goto drop_unlock; + } +out: + bh_unlock_sock(sk); + sock_put(sk); + return; +drop: + kfree_skb(skb); + return; +drop_unlock: + kfree_skb(skb); + goto out; +} + +#undef LLC_REFCNT_DEBUG +#ifdef LLC_REFCNT_DEBUG +static atomic_t llc_sock_nr; +#endif + +/** + * llc_backlog_rcv - Processes rx frames and expired timers. + * @sk: LLC sock (p8022 connection) + * @skb: queued rx frame or event + * + * This function processes frames that has received and timers that has + * expired during sending an I pdu (refer to data_req_handler). frames + * queue by llc_rcv function (llc_mac.c) and timers queue by timer + * callback functions(llc_c_ac.c). + */ +static int llc_backlog_rcv(struct sock *sk, struct sk_buff *skb) +{ + int rc = 0; + struct llc_sock *llc = llc_sk(sk); + + if (likely(llc_backlog_type(skb) == LLC_PACKET)) { + if (likely(llc->state > 1)) /* not closed */ + rc = llc_conn_rcv(sk, skb); + else + goto out_kfree_skb; + } else if (llc_backlog_type(skb) == LLC_EVENT) { + /* timer expiration event */ + if (likely(llc->state > 1)) /* not closed */ + rc = llc_conn_state_process(sk, skb); + else + goto out_kfree_skb; + } else { + printk(KERN_ERR "%s: invalid skb in backlog\n", __func__); + goto out_kfree_skb; + } +out: + return rc; +out_kfree_skb: + kfree_skb(skb); + goto out; +} + +/** + * llc_sk_init - Initializes a socket with default llc values. + * @sk: socket to initialize. + * + * Initializes a socket with default llc values. + */ +static void llc_sk_init(struct sock* sk) +{ + struct llc_sock *llc = llc_sk(sk); + + llc->state = LLC_CONN_STATE_ADM; + llc->inc_cntr = llc->dec_cntr = 2; + llc->dec_step = llc->connect_step = 1; + + setup_timer(&llc->ack_timer.timer, llc_conn_ack_tmr_cb, + (unsigned long)sk); + llc->ack_timer.expire = sysctl_llc2_ack_timeout; + + setup_timer(&llc->pf_cycle_timer.timer, llc_conn_pf_cycle_tmr_cb, + (unsigned long)sk); + llc->pf_cycle_timer.expire = sysctl_llc2_p_timeout; + + setup_timer(&llc->rej_sent_timer.timer, llc_conn_rej_tmr_cb, + (unsigned long)sk); + llc->rej_sent_timer.expire = sysctl_llc2_rej_timeout; + + setup_timer(&llc->busy_state_timer.timer, llc_conn_busy_tmr_cb, + (unsigned long)sk); + llc->busy_state_timer.expire = sysctl_llc2_busy_timeout; + + llc->n2 = 2; /* max retransmit */ + llc->k = 2; /* tx win size, will adjust dynam */ + llc->rw = 128; /* rx win size (opt and equal to + * tx_win of remote LLC) */ + skb_queue_head_init(&llc->pdu_unack_q); + sk->sk_backlog_rcv = llc_backlog_rcv; +} + +/** + * llc_sk_alloc - Allocates LLC sock + * @family: upper layer protocol family + * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * + * Allocates a LLC sock and initializes it. 
Returns the new LLC sock + * or %NULL if there's no memory available for one + */ +struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot) +{ + struct sock *sk = sk_alloc(net, family, priority, prot); + + if (!sk) + goto out; + llc_sk_init(sk); + sock_init_data(NULL, sk); +#ifdef LLC_REFCNT_DEBUG + atomic_inc(&llc_sock_nr); + printk(KERN_DEBUG "LLC socket %p created in %s, now we have %d alive\n", sk, + __func__, atomic_read(&llc_sock_nr)); +#endif +out: + return sk; +} + +/** + * llc_sk_free - Frees a LLC socket + * @sk - socket to free + * + * Frees a LLC socket + */ +void llc_sk_free(struct sock *sk) +{ + struct llc_sock *llc = llc_sk(sk); + + llc->state = LLC_CONN_OUT_OF_SVC; + /* Stop all (possibly) running timers */ + llc_conn_ac_stop_all_timers(sk, NULL); +#ifdef DEBUG_LLC_CONN_ALLOC + printk(KERN_INFO "%s: unackq=%d, txq=%d\n", __func__, + skb_queue_len(&llc->pdu_unack_q), + skb_queue_len(&sk->sk_write_queue)); +#endif + skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge(&sk->sk_write_queue); + skb_queue_purge(&llc->pdu_unack_q); +#ifdef LLC_REFCNT_DEBUG + if (atomic_read(&sk->sk_refcnt) != 1) { + printk(KERN_DEBUG "Destruction of LLC sock %p delayed in %s, cnt=%d\n", + sk, __func__, atomic_read(&sk->sk_refcnt)); + printk(KERN_DEBUG "%d LLC sockets are still alive\n", + atomic_read(&llc_sock_nr)); + } else { + atomic_dec(&llc_sock_nr); + printk(KERN_DEBUG "LLC socket %p released in %s, %d are still alive\n", sk, + __func__, atomic_read(&llc_sock_nr)); + } +#endif + sock_put(sk); +} + +/** + * llc_sk_reset - resets a connection + * @sk: LLC socket to reset + * + * Resets a connection to the out of service state. Stops its timers + * and frees any frames in the queues of the connection. + */ +void llc_sk_reset(struct sock *sk) +{ + struct llc_sock *llc = llc_sk(sk); + + llc_conn_ac_stop_all_timers(sk, NULL); + skb_queue_purge(&sk->sk_write_queue); + skb_queue_purge(&llc->pdu_unack_q); + llc->remote_busy_flag = 0; + llc->cause_flag = 0; + llc->retry_count = 0; + llc_conn_set_p_flag(sk, 0); + llc->f_flag = 0; + llc->s_flag = 0; + llc->ack_pf = 0; + llc->first_pdu_Ns = 0; + llc->ack_must_be_send = 0; + llc->dec_step = 1; + llc->inc_cntr = 2; + llc->dec_cntr = 2; + llc->X = 0; + llc->failed_data_req = 0 ; + llc->last_nr = 0; +} diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c new file mode 100644 index 00000000..2bb0ddff --- /dev/null +++ b/net/llc/llc_core.c @@ -0,0 +1,169 @@ +/* + * llc_core.c - Minimum needed routines for sap handling and module init/exit + * + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +LIST_HEAD(llc_sap_list); +DEFINE_SPINLOCK(llc_sap_list_lock); + +/** + * llc_sap_alloc - allocates and initializes sap. + * + * Allocates and initializes sap. 
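+ *
+ * A SAP is normally obtained through llc_sap_open() below rather than by
+ * calling this helper directly. Illustrative sketch (SAP value and rcv
+ * callback are hypothetical):
+ *
+ *	sap = llc_sap_open(0xE0, my_datalink_rcv);
+ *	if (!sap)
+ *		return -EBUSY;
+ *	...
+ *	llc_sap_close(sap);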
+ */ +static struct llc_sap *llc_sap_alloc(void) +{ + struct llc_sap *sap = kzalloc(sizeof(*sap), GFP_ATOMIC); + int i; + + if (sap) { + /* sap->laddr.mac - leave as a null, it's filled by bind */ + sap->state = LLC_SAP_STATE_ACTIVE; + spin_lock_init(&sap->sk_lock); + for (i = 0; i < LLC_SK_LADDR_HASH_ENTRIES; i++) + INIT_HLIST_NULLS_HEAD(&sap->sk_laddr_hash[i], i); + atomic_set(&sap->refcnt, 1); + } + return sap; +} + +static struct llc_sap *__llc_sap_find(unsigned char sap_value) +{ + struct llc_sap* sap; + + list_for_each_entry(sap, &llc_sap_list, node) + if (sap->laddr.lsap == sap_value) + goto out; + sap = NULL; +out: + return sap; +} + +/** + * llc_sap_find - searchs a SAP in station + * @sap_value: sap to be found + * + * Searchs for a sap in the sap list of the LLC's station upon the sap ID. + * If the sap is found it will be refcounted and the user will have to do + * a llc_sap_put after use. + * Returns the sap or %NULL if not found. + */ +struct llc_sap *llc_sap_find(unsigned char sap_value) +{ + struct llc_sap *sap; + + rcu_read_lock_bh(); + sap = __llc_sap_find(sap_value); + if (sap) + llc_sap_hold(sap); + rcu_read_unlock_bh(); + return sap; +} + +/** + * llc_sap_open - open interface to the upper layers. + * @lsap: SAP number. + * @func: rcv func for datalink protos + * + * Interface function to upper layer. Each one who wants to get a SAP + * (for example NetBEUI) should call this function. Returns the opened + * SAP for success, NULL for failure. + */ +struct llc_sap *llc_sap_open(unsigned char lsap, + int (*func)(struct sk_buff *skb, + struct net_device *dev, + struct packet_type *pt, + struct net_device *orig_dev)) +{ + struct llc_sap *sap = NULL; + + spin_lock_bh(&llc_sap_list_lock); + if (__llc_sap_find(lsap)) /* SAP already exists */ + goto out; + sap = llc_sap_alloc(); + if (!sap) + goto out; + sap->laddr.lsap = lsap; + sap->rcv_func = func; + list_add_tail_rcu(&sap->node, &llc_sap_list); +out: + spin_unlock_bh(&llc_sap_list_lock); + return sap; +} + +/** + * llc_sap_close - close interface for upper layers. + * @sap: SAP to be closed. + * + * Close interface function to upper layer. Each one who wants to + * close an open SAP (for example NetBEUI) should call this function. + * Removes this sap from the list of saps in the station and then + * frees the memory for this sap. + */ +void llc_sap_close(struct llc_sap *sap) +{ + WARN_ON(sap->sk_count); + + spin_lock_bh(&llc_sap_list_lock); + list_del_rcu(&sap->node); + spin_unlock_bh(&llc_sap_list_lock); + + synchronize_rcu(); + + kfree(sap); +} + +static struct packet_type llc_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_802_2), + .func = llc_rcv, +}; + +static struct packet_type llc_tr_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_TR_802_2), + .func = llc_rcv, +}; + +static int __init llc_init(void) +{ + dev_add_pack(&llc_packet_type); + dev_add_pack(&llc_tr_packet_type); + return 0; +} + +static void __exit llc_exit(void) +{ + dev_remove_pack(&llc_packet_type); + dev_remove_pack(&llc_tr_packet_type); +} + +module_init(llc_init); +module_exit(llc_exit); + +EXPORT_SYMBOL(llc_sap_list); +EXPORT_SYMBOL(llc_sap_list_lock); +EXPORT_SYMBOL(llc_sap_find); +EXPORT_SYMBOL(llc_sap_open); +EXPORT_SYMBOL(llc_sap_close); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Procom 1997, Jay Schullist 2001, Arnaldo C. 
Melo 2001-2003"); +MODULE_DESCRIPTION("LLC IEEE 802.2 core support"); diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c new file mode 100644 index 00000000..25c31c0a --- /dev/null +++ b/net/llc/llc_if.c @@ -0,0 +1,154 @@ +/* + * llc_if.c - Defines LLC interface to upper layer + * + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * llc_build_and_send_pkt - Connection data sending for upper layers. + * @sk: connection + * @skb: packet to send + * + * This function is called when upper layer wants to send data using + * connection oriented communication mode. During sending data, connection + * will be locked and received frames and expired timers will be queued. + * Returns 0 for success, -ECONNABORTED when the connection already + * closed and -EBUSY when sending data is not permitted in this state or + * LLC has send an I pdu with p bit set to 1 and is waiting for it's + * response. + */ +int llc_build_and_send_pkt(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev; + int rc = -ECONNABORTED; + struct llc_sock *llc = llc_sk(sk); + + if (unlikely(llc->state == LLC_CONN_STATE_ADM)) + goto out; + rc = -EBUSY; + if (unlikely(llc_data_accept_state(llc->state) || /* data_conn_refuse */ + llc->p_flag)) { + llc->failed_data_req = 1; + goto out; + } + ev = llc_conn_ev(skb); + ev->type = LLC_CONN_EV_TYPE_PRIM; + ev->prim = LLC_DATA_PRIM; + ev->prim_type = LLC_PRIM_TYPE_REQ; + skb->dev = llc->dev; + rc = llc_conn_state_process(sk, skb); +out: + return rc; +} + +/** + * llc_establish_connection - Called by upper layer to establish a conn + * @sk: connection + * @lmac: local mac address + * @dmac: destination mac address + * @dsap: destination sap + * + * Upper layer calls this to establish an LLC connection with a remote + * machine. This function packages a proper event and sends it connection + * component state machine. Success or failure of connection + * establishment will inform to upper layer via calling it's confirm + * function and passing proper information. 
+ */ +int llc_establish_connection(struct sock *sk, u8 *lmac, u8 *dmac, u8 dsap) +{ + int rc = -EISCONN; + struct llc_addr laddr, daddr; + struct sk_buff *skb; + struct llc_sock *llc = llc_sk(sk); + struct sock *existing; + + laddr.lsap = llc->sap->laddr.lsap; + daddr.lsap = dsap; + memcpy(daddr.mac, dmac, sizeof(daddr.mac)); + memcpy(laddr.mac, lmac, sizeof(laddr.mac)); + existing = llc_lookup_established(llc->sap, &daddr, &laddr); + if (existing) { + if (existing->sk_state == TCP_ESTABLISHED) { + sk = existing; + goto out_put; + } else + sock_put(existing); + } + sock_hold(sk); + rc = -ENOMEM; + skb = alloc_skb(0, GFP_ATOMIC); + if (skb) { + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->type = LLC_CONN_EV_TYPE_PRIM; + ev->prim = LLC_CONN_PRIM; + ev->prim_type = LLC_PRIM_TYPE_REQ; + skb_set_owner_w(skb, sk); + rc = llc_conn_state_process(sk, skb); + } +out_put: + sock_put(sk); + return rc; +} + +/** + * llc_send_disc - Called by upper layer to close a connection + * @sk: connection to be closed + * + * Upper layer calls this when it wants to close an established LLC + * connection with a remote machine. This function packages a proper event + * and sends it to connection component state machine. Returns 0 for + * success, 1 otherwise. + */ +int llc_send_disc(struct sock *sk) +{ + u16 rc = 1; + struct llc_conn_state_ev *ev; + struct sk_buff *skb; + + sock_hold(sk); + if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_ESTABLISHED || + llc_sk(sk)->state == LLC_CONN_STATE_ADM || + llc_sk(sk)->state == LLC_CONN_OUT_OF_SVC) + goto out; + /* + * Postpone unassigning the connection from its SAP and returning the + * connection until all ACTIONs have been completely executed + */ + skb = alloc_skb(0, GFP_ATOMIC); + if (!skb) + goto out; + skb_set_owner_w(skb, sk); + sk->sk_state = TCP_CLOSING; + ev = llc_conn_ev(skb); + ev->type = LLC_CONN_EV_TYPE_PRIM; + ev->prim = LLC_DISC_PRIM; + ev->prim_type = LLC_PRIM_TYPE_REQ; + rc = llc_conn_state_process(sk, skb); +out: + sock_put(sk); + return rc; +} + diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c new file mode 100644 index 00000000..90324211 --- /dev/null +++ b/net/llc/llc_input.c @@ -0,0 +1,212 @@ +/* + * llc_input.c - Minimal input path for LLC + * + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. + */ +#include +#include +#include +#include +#include +#include + +#if 0 +#define dprintk(args...) printk(KERN_DEBUG args) +#else +#define dprintk(args...) +#endif + +/* + * Packet handler for the station, registerable because in the minimal + * LLC core that is taking shape only the very minimal subset of LLC that + * is needed for things like IPX, Appletalk, etc will stay, with all the + * rest in the llc1 and llc2 modules. + */ +static void (*llc_station_handler)(struct sk_buff *skb); + +/* + * Packet handlers for LLC_DEST_SAP and LLC_DEST_CONN. 
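+ * They are registered via llc_add_pack() below; illustrative sketch of
+ * what the llc2 socket layer does at init time (handler names as used
+ * elsewhere in this patch set):
+ *
+ *	llc_add_pack(LLC_DEST_SAP, llc_sap_handler);
+ *	llc_add_pack(LLC_DEST_CONN, llc_conn_handler);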
+ */ +static void (*llc_type_handlers[2])(struct llc_sap *sap, + struct sk_buff *skb); + +void llc_add_pack(int type, void (*handler)(struct llc_sap *sap, + struct sk_buff *skb)) +{ + if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) + llc_type_handlers[type - 1] = handler; +} + +void llc_remove_pack(int type) +{ + if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) + llc_type_handlers[type - 1] = NULL; +} + +void llc_set_station_handler(void (*handler)(struct sk_buff *skb)) +{ + llc_station_handler = handler; +} + +/** + * llc_pdu_type - returns which LLC component must handle for PDU + * @skb: input skb + * + * This function returns which LLC component must handle this PDU. + */ +static __inline__ int llc_pdu_type(struct sk_buff *skb) +{ + int type = LLC_DEST_CONN; /* I-PDU or S-PDU type */ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + if ((pdu->ctrl_1 & LLC_PDU_TYPE_MASK) != LLC_PDU_TYPE_U) + goto out; + switch (LLC_U_PDU_CMD(pdu)) { + case LLC_1_PDU_CMD_XID: + case LLC_1_PDU_CMD_UI: + case LLC_1_PDU_CMD_TEST: + type = LLC_DEST_SAP; + break; + case LLC_2_PDU_CMD_SABME: + case LLC_2_PDU_CMD_DISC: + case LLC_2_PDU_RSP_UA: + case LLC_2_PDU_RSP_DM: + case LLC_2_PDU_RSP_FRMR: + break; + default: + type = LLC_DEST_INVALID; + break; + } +out: + return type; +} + +/** + * llc_fixup_skb - initializes skb pointers + * @skb: This argument points to incoming skb + * + * Initializes internal skb pointer to start of network layer by deriving + * length of LLC header; finds length of LLC control field in LLC header + * by looking at the two lowest-order bits of the first control field + * byte; field is either 3 or 4 bytes long. + */ +static inline int llc_fixup_skb(struct sk_buff *skb) +{ + u8 llc_len = 2; + struct llc_pdu_un *pdu; + + if (unlikely(!pskb_may_pull(skb, sizeof(*pdu)))) + return 0; + + pdu = (struct llc_pdu_un *)skb->data; + if ((pdu->ctrl_1 & LLC_PDU_TYPE_MASK) == LLC_PDU_TYPE_U) + llc_len = 1; + llc_len += 2; + + if (unlikely(!pskb_may_pull(skb, llc_len))) + return 0; + + skb->transport_header += llc_len; + skb_pull(skb, llc_len); + if (skb->protocol == htons(ETH_P_802_2)) { + __be16 pdulen = eth_hdr(skb)->h_proto; + s32 data_size = ntohs(pdulen) - llc_len; + + if (data_size < 0 || + !pskb_may_pull(skb, data_size)) + return 0; + if (unlikely(pskb_trim_rcsum(skb, data_size))) + return 0; + } + return 1; +} + +/** + * llc_rcv - 802.2 entry point from net lower layers + * @skb: received pdu + * @dev: device that receive pdu + * @pt: packet type + * + * When the system receives a 802.2 frame this function is called. It + * checks SAP and connection of received pdu and passes frame to + * llc_{station,sap,conn}_rcv for sending to proper state machine. If + * the frame is related to a busy connection (a connection is sending + * data now), it queues this frame in the connection's backlog. + */ +int llc_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct llc_sap *sap; + struct llc_pdu_sn *pdu; + int dest; + int (*rcv)(struct sk_buff *, struct net_device *, + struct packet_type *, struct net_device *); + + if (!net_eq(dev_net(dev), &init_net)) + goto drop; + + /* + * When the interface is in promisc. mode, drop all the crap that it + * receives, do not try to analyse it. 
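+ * Concretely this is the PACKET_OTHERHOST check right below: frames whose
+ * destination MAC belongs to some other station, which we only saw
+ * because the interface is in promiscuous mode.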
+ */ + if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) { + dprintk("%s: PACKET_OTHERHOST\n", __func__); + goto drop; + } + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(!skb)) + goto out; + if (unlikely(!llc_fixup_skb(skb))) + goto drop; + pdu = llc_pdu_sn_hdr(skb); + if (unlikely(!pdu->dsap)) /* NULL DSAP, refer to station */ + goto handle_station; + sap = llc_sap_find(pdu->dsap); + if (unlikely(!sap)) {/* unknown SAP */ + dprintk("%s: llc_sap_find(%02X) failed!\n", __func__, + pdu->dsap); + goto drop; + } + /* + * First the upper layer protocols that don't need the full + * LLC functionality + */ + rcv = rcu_dereference(sap->rcv_func); + dest = llc_pdu_type(skb); + if (unlikely(!dest || !llc_type_handlers[dest - 1])) { + if (rcv) + rcv(skb, dev, pt, orig_dev); + else + kfree_skb(skb); + } else { + if (rcv) { + struct sk_buff *cskb = skb_clone(skb, GFP_ATOMIC); + if (cskb) + rcv(cskb, dev, pt, orig_dev); + } + llc_type_handlers[dest - 1](sap, skb); + } + llc_sap_put(sap); +out: + return 0; +drop: + kfree_skb(skb); + goto out; +handle_station: + if (!llc_station_handler) + goto drop; + llc_station_handler(skb); + goto out; +} + +EXPORT_SYMBOL(llc_add_pack); +EXPORT_SYMBOL(llc_remove_pack); +EXPORT_SYMBOL(llc_set_station_handler); diff --git a/net/llc/llc_output.c b/net/llc/llc_output.c new file mode 100644 index 00000000..b38a1079 --- /dev/null +++ b/net/llc/llc_output.c @@ -0,0 +1,81 @@ +/* + * llc_output.c - LLC minimal output path + * + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License version 2 for more details. + */ + +#include +#include +#include +#include +#include +#include +#include + +/** + * llc_mac_hdr_init - fills MAC header fields + * @skb: Address of the frame to initialize its MAC header + * @sa: The MAC source address + * @da: The MAC destination address + * + * Fills MAC header fields, depending on MAC type. Returns 0, If MAC type + * is a valid type and initialization completes correctly 1, otherwise. + */ +int llc_mac_hdr_init(struct sk_buff *skb, + const unsigned char *sa, const unsigned char *da) +{ + int rc = -EINVAL; + + switch (skb->dev->type) { + case ARPHRD_IEEE802_TR: + case ARPHRD_ETHER: + case ARPHRD_LOOPBACK: + rc = dev_hard_header(skb, skb->dev, ETH_P_802_2, da, sa, + skb->len); + if (rc > 0) + rc = 0; + break; + default: + WARN(1, "device type not supported: %d\n", skb->dev->type); + } + return rc; +} + +/** + * llc_build_and_send_ui_pkt - unitdata request interface for upper layers + * @sap: sap to use + * @skb: packet to send + * @dmac: destination mac address + * @dsap: destination sap + * + * Upper layers calls this function when upper layer wants to send data + * using connection-less mode communication (UI pdu). 
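+ *
+ * Illustrative sketch (sizing done by hand here; real callers such as the
+ * llc2 socket layer use their own helpers): reserve room for the MAC
+ * header plus the 3-byte U-format LLC header, append the payload, then
+ * hand the frame over:
+ *
+ *	skb = alloc_skb(dev->hard_header_len + 3 + len, GFP_ATOMIC);
+ *	skb_reserve(skb, dev->hard_header_len + 3);
+ *	memcpy(skb_put(skb, len), data, len);
+ *	skb->dev = dev;
+ *	rc = llc_build_and_send_ui_pkt(sap, skb, dmac, dsap);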
+ * + * Accept data frame from network layer to be sent using connection- + * less mode communication; timeout/retries handled by network layer; + * package primitive as an event and send to SAP event handler + */ +int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb, + unsigned char *dmac, unsigned char dsap) +{ + int rc; + llc_pdu_header_init(skb, LLC_PDU_TYPE_U, sap->laddr.lsap, + dsap, LLC_PDU_CMD); + llc_pdu_init_as_ui_cmd(skb); + rc = llc_mac_hdr_init(skb, skb->dev->dev_addr, dmac); + if (likely(!rc)) + rc = dev_queue_xmit(skb); + return rc; +} + +EXPORT_SYMBOL(llc_mac_hdr_init); +EXPORT_SYMBOL(llc_build_and_send_ui_pkt); diff --git a/net/llc/llc_pdu.c b/net/llc/llc_pdu.c new file mode 100644 index 00000000..2e6cb791 --- /dev/null +++ b/net/llc/llc_pdu.c @@ -0,0 +1,372 @@ +/* + * llc_pdu.c - access to PDU internals + * + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. + */ + +#include +#include + +static void llc_pdu_decode_pdu_type(struct sk_buff *skb, u8 *type); +static u8 llc_pdu_get_pf_bit(struct llc_pdu_sn *pdu); + +void llc_pdu_set_cmd_rsp(struct sk_buff *skb, u8 pdu_type) +{ + llc_pdu_un_hdr(skb)->ssap |= pdu_type; +} + +/** + * pdu_set_pf_bit - sets poll/final bit in LLC header + * @pdu_frame: input frame that p/f bit must be set into it. + * @bit_value: poll/final bit (0 or 1). + * + * This function sets poll/final bit in LLC header (based on type of PDU). + * in I or S pdus, p/f bit is right bit of fourth byte in header. in U + * pdus p/f bit is fifth bit of third byte. + */ +void llc_pdu_set_pf_bit(struct sk_buff *skb, u8 bit_value) +{ + u8 pdu_type; + struct llc_pdu_sn *pdu; + + llc_pdu_decode_pdu_type(skb, &pdu_type); + pdu = llc_pdu_sn_hdr(skb); + + switch (pdu_type) { + case LLC_PDU_TYPE_I: + case LLC_PDU_TYPE_S: + pdu->ctrl_2 = (pdu->ctrl_2 & 0xFE) | bit_value; + break; + case LLC_PDU_TYPE_U: + pdu->ctrl_1 |= (pdu->ctrl_1 & 0xEF) | (bit_value << 4); + break; + } +} + +/** + * llc_pdu_decode_pf_bit - extracs poll/final bit from LLC header + * @skb: input skb that p/f bit must be extracted from it + * @pf_bit: poll/final bit (0 or 1) + * + * This function extracts poll/final bit from LLC header (based on type of + * PDU). In I or S pdus, p/f bit is right bit of fourth byte in header. In + * U pdus p/f bit is fifth bit of third byte. + */ +void llc_pdu_decode_pf_bit(struct sk_buff *skb, u8 *pf_bit) +{ + u8 pdu_type; + struct llc_pdu_sn *pdu; + + llc_pdu_decode_pdu_type(skb, &pdu_type); + pdu = llc_pdu_sn_hdr(skb); + + switch (pdu_type) { + case LLC_PDU_TYPE_I: + case LLC_PDU_TYPE_S: + *pf_bit = pdu->ctrl_2 & LLC_S_PF_BIT_MASK; + break; + case LLC_PDU_TYPE_U: + *pf_bit = (pdu->ctrl_1 & LLC_U_PF_BIT_MASK) >> 4; + break; + } +} + +/** + * llc_pdu_init_as_disc_cmd - Builds DISC PDU + * @skb: Address of the skb to build + * @p_bit: The P bit to set in the PDU + * + * Builds a pdu frame as a DISC command. 
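+ *
+ * Illustrative sketch of the usual calling pattern in the connection
+ * actions (variable names are hypothetical):
+ *
+ *	llc_pdu_header_init(skb, LLC_PDU_TYPE_U, sap->laddr.lsap,
+ *			    llc->daddr.lsap, LLC_PDU_CMD);
+ *	llc_pdu_init_as_disc_cmd(skb, 1);
+ *	rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac);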
+ */ +void llc_pdu_init_as_disc_cmd(struct sk_buff *skb, u8 p_bit) +{ + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_U; + pdu->ctrl_1 |= LLC_2_PDU_CMD_DISC; + pdu->ctrl_1 |= ((p_bit & 1) << 4) & LLC_U_PF_BIT_MASK; +} + +/** + * llc_pdu_init_as_i_cmd - builds I pdu + * @skb: Address of the skb to build + * @p_bit: The P bit to set in the PDU + * @ns: The sequence number of the data PDU + * @nr: The seq. number of the expected I PDU from the remote + * + * Builds a pdu frame as an I command. + */ +void llc_pdu_init_as_i_cmd(struct sk_buff *skb, u8 p_bit, u8 ns, u8 nr) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_I; + pdu->ctrl_2 = 0; + pdu->ctrl_2 |= (p_bit & LLC_I_PF_BIT_MASK); /* p/f bit */ + pdu->ctrl_1 |= (ns << 1) & 0xFE; /* set N(S) in bits 2..8 */ + pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ +} + +/** + * llc_pdu_init_as_rej_cmd - builds REJ PDU + * @skb: Address of the skb to build + * @p_bit: The P bit to set in the PDU + * @nr: The seq. number of the expected I PDU from the remote + * + * Builds a pdu frame as a REJ command. + */ +void llc_pdu_init_as_rej_cmd(struct sk_buff *skb, u8 p_bit, u8 nr) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_S; + pdu->ctrl_1 |= LLC_2_PDU_CMD_REJ; + pdu->ctrl_2 = 0; + pdu->ctrl_2 |= p_bit & LLC_S_PF_BIT_MASK; + pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ + pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ +} + +/** + * llc_pdu_init_as_rnr_cmd - builds RNR pdu + * @skb: Address of the skb to build + * @p_bit: The P bit to set in the PDU + * @nr: The seq. number of the expected I PDU from the remote + * + * Builds a pdu frame as an RNR command. + */ +void llc_pdu_init_as_rnr_cmd(struct sk_buff *skb, u8 p_bit, u8 nr) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_S; + pdu->ctrl_1 |= LLC_2_PDU_CMD_RNR; + pdu->ctrl_2 = 0; + pdu->ctrl_2 |= p_bit & LLC_S_PF_BIT_MASK; + pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ + pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ +} + +/** + * llc_pdu_init_as_rr_cmd - Builds RR pdu + * @skb: Address of the skb to build + * @p_bit: The P bit to set in the PDU + * @nr: The seq. number of the expected I PDU from the remote + * + * Builds a pdu frame as an RR command. + */ +void llc_pdu_init_as_rr_cmd(struct sk_buff *skb, u8 p_bit, u8 nr) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_S; + pdu->ctrl_1 |= LLC_2_PDU_CMD_RR; + pdu->ctrl_2 = p_bit & LLC_S_PF_BIT_MASK; + pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ + pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ +} + +/** + * llc_pdu_init_as_sabme_cmd - builds SABME pdu + * @skb: Address of the skb to build + * @p_bit: The P bit to set in the PDU + * + * Builds a pdu frame as an SABME command. + */ +void llc_pdu_init_as_sabme_cmd(struct sk_buff *skb, u8 p_bit) +{ + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_U; + pdu->ctrl_1 |= LLC_2_PDU_CMD_SABME; + pdu->ctrl_1 |= ((p_bit & 1) << 4) & LLC_U_PF_BIT_MASK; +} + +/** + * llc_pdu_init_as_dm_rsp - builds DM response pdu + * @skb: Address of the skb to build + * @f_bit: The F bit to set in the PDU + * + * Builds a pdu frame as a DM response. 
+ */ +void llc_pdu_init_as_dm_rsp(struct sk_buff *skb, u8 f_bit) +{ + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_U; + pdu->ctrl_1 |= LLC_2_PDU_RSP_DM; + pdu->ctrl_1 |= ((f_bit & 1) << 4) & LLC_U_PF_BIT_MASK; +} + +/** + * llc_pdu_init_as_frmr_rsp - builds FRMR response PDU + * @skb: Address of the frame to build + * @prev_pdu: The rejected PDU frame + * @f_bit: The F bit to set in the PDU + * @vs: tx state vari value for the data link conn at the rejecting LLC + * @vr: rx state var value for the data link conn at the rejecting LLC + * @vzyxw: completely described in the IEEE Std 802.2 document (Pg 55) + * + * Builds a pdu frame as a FRMR response. + */ +void llc_pdu_init_as_frmr_rsp(struct sk_buff *skb, struct llc_pdu_sn *prev_pdu, + u8 f_bit, u8 vs, u8 vr, u8 vzyxw) +{ + struct llc_frmr_info *frmr_info; + u8 prev_pf = 0; + u8 *ctrl; + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_U; + pdu->ctrl_1 |= LLC_2_PDU_RSP_FRMR; + pdu->ctrl_1 |= ((f_bit & 1) << 4) & LLC_U_PF_BIT_MASK; + + frmr_info = (struct llc_frmr_info *)&pdu->ctrl_2; + ctrl = (u8 *)&prev_pdu->ctrl_1; + FRMR_INFO_SET_REJ_CNTRL(frmr_info,ctrl); + FRMR_INFO_SET_Vs(frmr_info, vs); + FRMR_INFO_SET_Vr(frmr_info, vr); + prev_pf = llc_pdu_get_pf_bit(prev_pdu); + FRMR_INFO_SET_C_R_BIT(frmr_info, prev_pf); + FRMR_INFO_SET_INVALID_PDU_CTRL_IND(frmr_info, vzyxw); + FRMR_INFO_SET_INVALID_PDU_INFO_IND(frmr_info, vzyxw); + FRMR_INFO_SET_PDU_INFO_2LONG_IND(frmr_info, vzyxw); + FRMR_INFO_SET_PDU_INVALID_Nr_IND(frmr_info, vzyxw); + FRMR_INFO_SET_PDU_INVALID_Ns_IND(frmr_info, vzyxw); + skb_put(skb, sizeof(struct llc_frmr_info)); +} + +/** + * llc_pdu_init_as_rr_rsp - builds RR response pdu + * @skb: Address of the skb to build + * @f_bit: The F bit to set in the PDU + * @nr: The seq. number of the expected data PDU from the remote + * + * Builds a pdu frame as an RR response. + */ +void llc_pdu_init_as_rr_rsp(struct sk_buff *skb, u8 f_bit, u8 nr) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_S; + pdu->ctrl_1 |= LLC_2_PDU_RSP_RR; + pdu->ctrl_2 = 0; + pdu->ctrl_2 |= f_bit & LLC_S_PF_BIT_MASK; + pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ + pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ +} + +/** + * llc_pdu_init_as_rej_rsp - builds REJ response pdu + * @skb: Address of the skb to build + * @f_bit: The F bit to set in the PDU + * @nr: The seq. number of the expected data PDU from the remote + * + * Builds a pdu frame as a REJ response. + */ +void llc_pdu_init_as_rej_rsp(struct sk_buff *skb, u8 f_bit, u8 nr) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_S; + pdu->ctrl_1 |= LLC_2_PDU_RSP_REJ; + pdu->ctrl_2 = 0; + pdu->ctrl_2 |= f_bit & LLC_S_PF_BIT_MASK; + pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ + pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ +} + +/** + * llc_pdu_init_as_rnr_rsp - builds RNR response pdu + * @skb: Address of the frame to build + * @f_bit: The F bit to set in the PDU + * @nr: The seq. number of the expected data PDU from the remote + * + * Builds a pdu frame as an RNR response. 
+ */ +void llc_pdu_init_as_rnr_rsp(struct sk_buff *skb, u8 f_bit, u8 nr) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_S; + pdu->ctrl_1 |= LLC_2_PDU_RSP_RNR; + pdu->ctrl_2 = 0; + pdu->ctrl_2 |= f_bit & LLC_S_PF_BIT_MASK; + pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ + pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ +} + +/** + * llc_pdu_init_as_ua_rsp - builds UA response pdu + * @skb: Address of the frame to build + * @f_bit: The F bit to set in the PDU + * + * Builds a pdu frame as a UA response. + */ +void llc_pdu_init_as_ua_rsp(struct sk_buff *skb, u8 f_bit) +{ + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_U; + pdu->ctrl_1 |= LLC_2_PDU_RSP_UA; + pdu->ctrl_1 |= ((f_bit & 1) << 4) & LLC_U_PF_BIT_MASK; +} + +/** + * llc_pdu_decode_pdu_type - designates PDU type + * @skb: input skb that type of it must be designated. + * @type: type of PDU (output argument). + * + * This function designates type of PDU (I, S or U). + */ +static void llc_pdu_decode_pdu_type(struct sk_buff *skb, u8 *type) +{ + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + if (pdu->ctrl_1 & 1) { + if ((pdu->ctrl_1 & LLC_PDU_TYPE_U) == LLC_PDU_TYPE_U) + *type = LLC_PDU_TYPE_U; + else + *type = LLC_PDU_TYPE_S; + } else + *type = LLC_PDU_TYPE_I; +} + +/** + * llc_pdu_get_pf_bit - extracts p/f bit of input PDU + * @pdu: pointer to LLC header. + * + * This function extracts p/f bit of input PDU. at first examines type of + * PDU and then extracts p/f bit. Returns the p/f bit. + */ +static u8 llc_pdu_get_pf_bit(struct llc_pdu_sn *pdu) +{ + u8 pdu_type; + u8 pf_bit = 0; + + if (pdu->ctrl_1 & 1) { + if ((pdu->ctrl_1 & LLC_PDU_TYPE_U) == LLC_PDU_TYPE_U) + pdu_type = LLC_PDU_TYPE_U; + else + pdu_type = LLC_PDU_TYPE_S; + } else + pdu_type = LLC_PDU_TYPE_I; + switch (pdu_type) { + case LLC_PDU_TYPE_I: + case LLC_PDU_TYPE_S: + pf_bit = pdu->ctrl_2 & LLC_S_PF_BIT_MASK; + break; + case LLC_PDU_TYPE_U: + pf_bit = (pdu->ctrl_1 & LLC_U_PF_BIT_MASK) >> 4; + break; + } + return pf_bit; +} diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c new file mode 100644 index 00000000..7af1ff2d --- /dev/null +++ b/net/llc/llc_proc.c @@ -0,0 +1,276 @@ +/* + * proc_llc.c - proc interface for LLC + * + * Copyright (c) 2001 by Jay Schulist + * 2002-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. 
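+ *
+ * This file implements the two seq_file interfaces registered below as
+ * /proc/net/llc/socket and /proc/net/llc/core.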
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void llc_ui_format_mac(struct seq_file *seq, u8 *addr) +{ + seq_printf(seq, "%pM", addr); +} + +static struct sock *llc_get_sk_idx(loff_t pos) +{ + struct llc_sap *sap; + struct sock *sk = NULL; + int i; + + list_for_each_entry_rcu(sap, &llc_sap_list, node) { + spin_lock_bh(&sap->sk_lock); + for (i = 0; i < LLC_SK_LADDR_HASH_ENTRIES; i++) { + struct hlist_nulls_head *head = &sap->sk_laddr_hash[i]; + struct hlist_nulls_node *node; + + sk_nulls_for_each(sk, node, head) { + if (!pos) + goto found; /* keep the lock */ + --pos; + } + } + spin_unlock_bh(&sap->sk_lock); + } + sk = NULL; +found: + return sk; +} + +static void *llc_seq_start(struct seq_file *seq, loff_t *pos) +{ + loff_t l = *pos; + + rcu_read_lock_bh(); + return l ? llc_get_sk_idx(--l) : SEQ_START_TOKEN; +} + +static struct sock *laddr_hash_next(struct llc_sap *sap, int bucket) +{ + struct hlist_nulls_node *node; + struct sock *sk = NULL; + + while (++bucket < LLC_SK_LADDR_HASH_ENTRIES) + sk_nulls_for_each(sk, node, &sap->sk_laddr_hash[bucket]) + goto out; + +out: + return sk; +} + +static void *llc_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock* sk, *next; + struct llc_sock *llc; + struct llc_sap *sap; + + ++*pos; + if (v == SEQ_START_TOKEN) { + sk = llc_get_sk_idx(0); + goto out; + } + sk = v; + next = sk_nulls_next(sk); + if (next) { + sk = next; + goto out; + } + llc = llc_sk(sk); + sap = llc->sap; + sk = laddr_hash_next(sap, llc_sk_laddr_hashfn(sap, &llc->laddr)); + if (sk) + goto out; + spin_unlock_bh(&sap->sk_lock); + list_for_each_entry_continue_rcu(sap, &llc_sap_list, node) { + spin_lock_bh(&sap->sk_lock); + sk = laddr_hash_next(sap, -1); + if (sk) + break; /* keep the lock */ + spin_unlock_bh(&sap->sk_lock); + } +out: + return sk; +} + +static void llc_seq_stop(struct seq_file *seq, void *v) +{ + if (v && v != SEQ_START_TOKEN) { + struct sock *sk = v; + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + + spin_unlock_bh(&sap->sk_lock); + } + rcu_read_unlock_bh(); +} + +static int llc_seq_socket_show(struct seq_file *seq, void *v) +{ + struct sock* sk; + struct llc_sock *llc; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "SKt Mc local_mac_sap remote_mac_sap " + " tx_queue rx_queue st uid link\n"); + goto out; + } + sk = v; + llc = llc_sk(sk); + + /* FIXME: check if the address is multicast */ + seq_printf(seq, "%2X %2X ", sk->sk_type, 0); + + if (llc->dev) + llc_ui_format_mac(seq, llc->dev->dev_addr); + else { + u8 addr[6] = {0,0,0,0,0,0}; + llc_ui_format_mac(seq, addr); + } + seq_printf(seq, "@%02X ", llc->sap->laddr.lsap); + llc_ui_format_mac(seq, llc->daddr.mac); + seq_printf(seq, "@%02X %8d %8d %2d %3d %4d\n", llc->daddr.lsap, + sk_wmem_alloc_get(sk), + sk_rmem_alloc_get(sk) - llc->copied_seq, + sk->sk_state, + sk->sk_socket ? 
SOCK_INODE(sk->sk_socket)->i_uid : -1, + llc->link); +out: + return 0; +} + +static const char *const llc_conn_state_names[] = { + [LLC_CONN_STATE_ADM] = "adm", + [LLC_CONN_STATE_SETUP] = "setup", + [LLC_CONN_STATE_NORMAL] = "normal", + [LLC_CONN_STATE_BUSY] = "busy", + [LLC_CONN_STATE_REJ] = "rej", + [LLC_CONN_STATE_AWAIT] = "await", + [LLC_CONN_STATE_AWAIT_BUSY] = "await_busy", + [LLC_CONN_STATE_AWAIT_REJ] = "await_rej", + [LLC_CONN_STATE_D_CONN] = "d_conn", + [LLC_CONN_STATE_RESET] = "reset", + [LLC_CONN_STATE_ERROR] = "error", + [LLC_CONN_STATE_TEMP] = "temp", +}; + +static int llc_seq_core_show(struct seq_file *seq, void *v) +{ + struct sock* sk; + struct llc_sock *llc; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "Connection list:\n" + "dsap state retr txw rxw pf ff sf df rs cs " + "tack tpfc trs tbs blog busr\n"); + goto out; + } + sk = v; + llc = llc_sk(sk); + + seq_printf(seq, " %02X %-10s %3d %3d %3d %2d %2d %2d %2d %2d %2d " + "%4d %4d %3d %3d %4d %4d\n", + llc->daddr.lsap, llc_conn_state_names[llc->state], + llc->retry_count, llc->k, llc->rw, llc->p_flag, llc->f_flag, + llc->s_flag, llc->data_flag, llc->remote_busy_flag, + llc->cause_flag, timer_pending(&llc->ack_timer.timer), + timer_pending(&llc->pf_cycle_timer.timer), + timer_pending(&llc->rej_sent_timer.timer), + timer_pending(&llc->busy_state_timer.timer), + !!sk->sk_backlog.tail, !!sock_owned_by_user(sk)); +out: + return 0; +} + +static const struct seq_operations llc_seq_socket_ops = { + .start = llc_seq_start, + .next = llc_seq_next, + .stop = llc_seq_stop, + .show = llc_seq_socket_show, +}; + +static const struct seq_operations llc_seq_core_ops = { + .start = llc_seq_start, + .next = llc_seq_next, + .stop = llc_seq_stop, + .show = llc_seq_core_show, +}; + +static int llc_seq_socket_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &llc_seq_socket_ops); +} + +static int llc_seq_core_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &llc_seq_core_ops); +} + +static const struct file_operations llc_seq_socket_fops = { + .owner = THIS_MODULE, + .open = llc_seq_socket_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations llc_seq_core_fops = { + .owner = THIS_MODULE, + .open = llc_seq_core_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct proc_dir_entry *llc_proc_dir; + +int __init llc_proc_init(void) +{ + int rc = -ENOMEM; + struct proc_dir_entry *p; + + llc_proc_dir = proc_mkdir("llc", init_net.proc_net); + if (!llc_proc_dir) + goto out; + + p = proc_create("socket", S_IRUGO, llc_proc_dir, &llc_seq_socket_fops); + if (!p) + goto out_socket; + + p = proc_create("core", S_IRUGO, llc_proc_dir, &llc_seq_core_fops); + if (!p) + goto out_core; + + rc = 0; +out: + return rc; +out_core: + remove_proc_entry("socket", llc_proc_dir); +out_socket: + remove_proc_entry("llc", init_net.proc_net); + goto out; +} + +void llc_proc_exit(void) +{ + remove_proc_entry("socket", llc_proc_dir); + remove_proc_entry("core", llc_proc_dir); + remove_proc_entry("llc", init_net.proc_net); +} diff --git a/net/llc/llc_s_ac.c b/net/llc/llc_s_ac.c new file mode 100644 index 00000000..a94bd56b --- /dev/null +++ b/net/llc/llc_s_ac.c @@ -0,0 +1,208 @@ +/* + * llc_s_ac.c - actions performed during sap state transition. + * + * Description : + * Functions in this module are implementation of sap component actions. + * Details of actions can be found in IEEE-802.2 standard document. 
+ * All functions have one sap and one event as input argument. All of + * them return 0 On success and 1 otherwise. + * + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include + + +/** + * llc_sap_action_unit_data_ind - forward UI PDU to network layer + * @sap: SAP + * @skb: the event to forward + * + * Received a UI PDU from MAC layer; forward to network layer as a + * UNITDATA INDICATION; verify our event is the kind we expect + */ +int llc_sap_action_unitdata_ind(struct llc_sap *sap, struct sk_buff *skb) +{ + llc_sap_rtn_pdu(sap, skb); + return 0; +} + +/** + * llc_sap_action_send_ui - sends UI PDU resp to UNITDATA REQ to MAC layer + * @sap: SAP + * @skb: the event to send + * + * Sends a UI PDU to the MAC layer in response to a UNITDATA REQUEST + * primitive from the network layer. Verifies event is a primitive type of + * event. Verify the primitive is a UNITDATA REQUEST. + */ +int llc_sap_action_send_ui(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + int rc; + + llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ev->saddr.lsap, + ev->daddr.lsap, LLC_PDU_CMD); + llc_pdu_init_as_ui_cmd(skb); + rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); + if (likely(!rc)) + rc = dev_queue_xmit(skb); + return rc; +} + +/** + * llc_sap_action_send_xid_c - send XID PDU as response to XID REQ + * @sap: SAP + * @skb: the event to send + * + * Send a XID command PDU to MAC layer in response to a XID REQUEST + * primitive from the network layer. Verify event is a primitive type + * event. Verify the primitive is a XID REQUEST. + */ +int llc_sap_action_send_xid_c(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + int rc; + + llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ev->saddr.lsap, + ev->daddr.lsap, LLC_PDU_CMD); + llc_pdu_init_as_xid_cmd(skb, LLC_XID_NULL_CLASS_2, 0); + rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); + if (likely(!rc)) + rc = dev_queue_xmit(skb); + return rc; +} + +/** + * llc_sap_action_send_xid_r - send XID PDU resp to MAC for received XID + * @sap: SAP + * @skb: the event to send + * + * Send XID response PDU to MAC in response to an earlier received XID + * command PDU. 
Verify event is a PDU type event + */ +int llc_sap_action_send_xid_r(struct llc_sap *sap, struct sk_buff *skb) +{ + u8 mac_da[ETH_ALEN], mac_sa[ETH_ALEN], dsap; + int rc = 1; + struct sk_buff *nskb; + + llc_pdu_decode_sa(skb, mac_da); + llc_pdu_decode_da(skb, mac_sa); + llc_pdu_decode_ssap(skb, &dsap); + nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U, + sizeof(struct llc_xid_info)); + if (!nskb) + goto out; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, dsap, + LLC_PDU_RSP); + llc_pdu_init_as_xid_rsp(nskb, LLC_XID_NULL_CLASS_2, 0); + rc = llc_mac_hdr_init(nskb, mac_sa, mac_da); + if (likely(!rc)) + rc = dev_queue_xmit(nskb); +out: + return rc; +} + +/** + * llc_sap_action_send_test_c - send TEST PDU to MAC in resp to TEST REQ + * @sap: SAP + * @skb: the event to send + * + * Send a TEST command PDU to the MAC layer in response to a TEST REQUEST + * primitive from the network layer. Verify event is a primitive type + * event; verify the primitive is a TEST REQUEST. + */ +int llc_sap_action_send_test_c(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + int rc; + + llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ev->saddr.lsap, + ev->daddr.lsap, LLC_PDU_CMD); + llc_pdu_init_as_test_cmd(skb); + rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); + if (likely(!rc)) + rc = dev_queue_xmit(skb); + return rc; +} + +int llc_sap_action_send_test_r(struct llc_sap *sap, struct sk_buff *skb) +{ + u8 mac_da[ETH_ALEN], mac_sa[ETH_ALEN], dsap; + struct sk_buff *nskb; + int rc = 1; + u32 data_size; + + llc_pdu_decode_sa(skb, mac_da); + llc_pdu_decode_da(skb, mac_sa); + llc_pdu_decode_ssap(skb, &dsap); + + /* The test request command is type U (llc_len = 3) */ + data_size = ntohs(eth_hdr(skb)->h_proto) - 3; + nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U, data_size); + if (!nskb) + goto out; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, dsap, + LLC_PDU_RSP); + llc_pdu_init_as_test_rsp(nskb, skb); + rc = llc_mac_hdr_init(nskb, mac_sa, mac_da); + if (likely(!rc)) + rc = dev_queue_xmit(nskb); +out: + return rc; +} + +/** + * llc_sap_action_report_status - report data link status to layer mgmt + * @sap: SAP + * @skb: the event to send + * + * Report data link status to layer management. Verify our event is the + * kind we expect. + */ +int llc_sap_action_report_status(struct llc_sap *sap, struct sk_buff *skb) +{ + return 0; +} + +/** + * llc_sap_action_xid_ind - send XID PDU resp to net layer via XID IND + * @sap: SAP + * @skb: the event to send + * + * Send a XID response PDU to the network layer via a XID INDICATION + * primitive. + */ +int llc_sap_action_xid_ind(struct llc_sap *sap, struct sk_buff *skb) +{ + llc_sap_rtn_pdu(sap, skb); + return 0; +} + +/** + * llc_sap_action_test_ind - send TEST PDU to net layer via TEST IND + * @sap: SAP + * @skb: the event to send + * + * Send a TEST response PDU to the network layer via a TEST INDICATION + * primitive. Verify our event is a PDU type event. + */ +int llc_sap_action_test_ind(struct llc_sap *sap, struct sk_buff *skb) +{ + llc_sap_rtn_pdu(sap, skb); + return 0; +} diff --git a/net/llc/llc_s_ev.c b/net/llc/llc_s_ev.c new file mode 100644 index 00000000..a74d2a1d --- /dev/null +++ b/net/llc/llc_s_ev.c @@ -0,0 +1,115 @@ +/* + * llc_s_ev.c - Defines SAP component events + * + * The followed event functions are SAP component events which are described + * in 802.2 LLC protocol standard document. + * + * Copyright (c) 1997 by Procom Technology, Inc. 
+ * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. + */ +#include +#include +#include +#include +#include + +int llc_sap_ev_activation_req(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + + return ev->type == LLC_SAP_EV_TYPE_SIMPLE && + ev->prim_type == LLC_SAP_EV_ACTIVATION_REQ ? 0 : 1; +} + +int llc_sap_ev_rx_ui(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return ev->type == LLC_SAP_EV_TYPE_PDU && LLC_PDU_IS_CMD(pdu) && + LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_UI ? 0 : 1; +} + +int llc_sap_ev_unitdata_req(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + + return ev->type == LLC_SAP_EV_TYPE_PRIM && + ev->prim == LLC_DATAUNIT_PRIM && + ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; + +} + +int llc_sap_ev_xid_req(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + + return ev->type == LLC_SAP_EV_TYPE_PRIM && + ev->prim == LLC_XID_PRIM && + ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; +} + +int llc_sap_ev_rx_xid_c(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return ev->type == LLC_SAP_EV_TYPE_PDU && LLC_PDU_IS_CMD(pdu) && + LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_XID ? 0 : 1; +} + +int llc_sap_ev_rx_xid_r(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return ev->type == LLC_SAP_EV_TYPE_PDU && LLC_PDU_IS_RSP(pdu) && + LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_RSP(pdu) == LLC_1_PDU_CMD_XID ? 0 : 1; +} + +int llc_sap_ev_test_req(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + + return ev->type == LLC_SAP_EV_TYPE_PRIM && + ev->prim == LLC_TEST_PRIM && + ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; +} + +int llc_sap_ev_rx_test_c(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return ev->type == LLC_SAP_EV_TYPE_PDU && LLC_PDU_IS_CMD(pdu) && + LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_TEST ? 0 : 1; +} + +int llc_sap_ev_rx_test_r(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return ev->type == LLC_SAP_EV_TYPE_PDU && LLC_PDU_IS_RSP(pdu) && + LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_RSP(pdu) == LLC_1_PDU_CMD_TEST ? 0 : 1; +} + +int llc_sap_ev_deactivation_req(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + + return ev->type == LLC_SAP_EV_TYPE_SIMPLE && + ev->prim_type == LLC_SAP_EV_DEACTIVATION_REQ ? 0 : 1; +} diff --git a/net/llc/llc_s_st.c b/net/llc/llc_s_st.c new file mode 100644 index 00000000..135f7d80 --- /dev/null +++ b/net/llc/llc_s_st.c @@ -0,0 +1,183 @@ +/* + * llc_s_st.c - Defines SAP component state machine transitions. 
+ * + * The followed transitions are SAP component state machine transitions + * which are described in 802.2 LLC protocol standard document. + * + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. + */ +#include +#include +#include +#include +#include + +/* dummy last-transition indicator; common to all state transition groups + * last entry for this state + * all members are zeros, .bss zeroes it + */ +static struct llc_sap_state_trans llc_sap_state_trans_end; + +/* state LLC_SAP_STATE_INACTIVE transition for + * LLC_SAP_EV_ACTIVATION_REQ event + */ +static llc_sap_action_t llc_sap_inactive_state_actions_1[] = { + [0] = llc_sap_action_report_status, + [1] = NULL, +}; + +static struct llc_sap_state_trans llc_sap_inactive_state_trans_1 = { + .ev = llc_sap_ev_activation_req, + .next_state = LLC_SAP_STATE_ACTIVE, + .ev_actions = llc_sap_inactive_state_actions_1, +}; + +/* array of pointers; one to each transition */ +static struct llc_sap_state_trans *llc_sap_inactive_state_transitions[] = { + [0] = &llc_sap_inactive_state_trans_1, + [1] = &llc_sap_state_trans_end, +}; + +/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_RX_UI event */ +static llc_sap_action_t llc_sap_active_state_actions_1[] = { + [0] = llc_sap_action_unitdata_ind, + [1] = NULL, +}; + +static struct llc_sap_state_trans llc_sap_active_state_trans_1 = { + .ev = llc_sap_ev_rx_ui, + .next_state = LLC_SAP_STATE_ACTIVE, + .ev_actions = llc_sap_active_state_actions_1, +}; + +/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_UNITDATA_REQ event */ +static llc_sap_action_t llc_sap_active_state_actions_2[] = { + [0] = llc_sap_action_send_ui, + [1] = NULL, +}; + +static struct llc_sap_state_trans llc_sap_active_state_trans_2 = { + .ev = llc_sap_ev_unitdata_req, + .next_state = LLC_SAP_STATE_ACTIVE, + .ev_actions = llc_sap_active_state_actions_2, +}; + +/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_XID_REQ event */ +static llc_sap_action_t llc_sap_active_state_actions_3[] = { + [0] = llc_sap_action_send_xid_c, + [1] = NULL, +}; + +static struct llc_sap_state_trans llc_sap_active_state_trans_3 = { + .ev = llc_sap_ev_xid_req, + .next_state = LLC_SAP_STATE_ACTIVE, + .ev_actions = llc_sap_active_state_actions_3, +}; + +/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_RX_XID_C event */ +static llc_sap_action_t llc_sap_active_state_actions_4[] = { + [0] = llc_sap_action_send_xid_r, + [1] = NULL, +}; + +static struct llc_sap_state_trans llc_sap_active_state_trans_4 = { + .ev = llc_sap_ev_rx_xid_c, + .next_state = LLC_SAP_STATE_ACTIVE, + .ev_actions = llc_sap_active_state_actions_4, +}; + +/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_RX_XID_R event */ +static llc_sap_action_t llc_sap_active_state_actions_5[] = { + [0] = llc_sap_action_xid_ind, + [1] = NULL, +}; + +static struct llc_sap_state_trans llc_sap_active_state_trans_5 = { + .ev = llc_sap_ev_rx_xid_r, + .next_state = LLC_SAP_STATE_ACTIVE, + .ev_actions = llc_sap_active_state_actions_5, +}; + +/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_TEST_REQ event */ +static llc_sap_action_t llc_sap_active_state_actions_6[] = { + [0] = llc_sap_action_send_test_c, + [1] 
= NULL, +}; + +static struct llc_sap_state_trans llc_sap_active_state_trans_6 = { + .ev = llc_sap_ev_test_req, + .next_state = LLC_SAP_STATE_ACTIVE, + .ev_actions = llc_sap_active_state_actions_6, +}; + +/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_RX_TEST_C event */ +static llc_sap_action_t llc_sap_active_state_actions_7[] = { + [0] = llc_sap_action_send_test_r, + [1] = NULL, +}; + +static struct llc_sap_state_trans llc_sap_active_state_trans_7 = { + .ev = llc_sap_ev_rx_test_c, + .next_state = LLC_SAP_STATE_ACTIVE, + .ev_actions = llc_sap_active_state_actions_7 +}; + +/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_RX_TEST_R event */ +static llc_sap_action_t llc_sap_active_state_actions_8[] = { + [0] = llc_sap_action_test_ind, + [1] = NULL, +}; + +static struct llc_sap_state_trans llc_sap_active_state_trans_8 = { + .ev = llc_sap_ev_rx_test_r, + .next_state = LLC_SAP_STATE_ACTIVE, + .ev_actions = llc_sap_active_state_actions_8, +}; + +/* state LLC_SAP_STATE_ACTIVE transition for + * LLC_SAP_EV_DEACTIVATION_REQ event + */ +static llc_sap_action_t llc_sap_active_state_actions_9[] = { + [0] = llc_sap_action_report_status, + [1] = NULL, +}; + +static struct llc_sap_state_trans llc_sap_active_state_trans_9 = { + .ev = llc_sap_ev_deactivation_req, + .next_state = LLC_SAP_STATE_INACTIVE, + .ev_actions = llc_sap_active_state_actions_9 +}; + +/* array of pointers; one to each transition */ +static struct llc_sap_state_trans *llc_sap_active_state_transitions[] = { + [0] = &llc_sap_active_state_trans_2, + [1] = &llc_sap_active_state_trans_1, + [2] = &llc_sap_active_state_trans_3, + [3] = &llc_sap_active_state_trans_4, + [4] = &llc_sap_active_state_trans_5, + [5] = &llc_sap_active_state_trans_6, + [6] = &llc_sap_active_state_trans_7, + [7] = &llc_sap_active_state_trans_8, + [8] = &llc_sap_active_state_trans_9, + [9] = &llc_sap_state_trans_end, +}; + +/* SAP state transition table */ +struct llc_sap_state llc_sap_state_table[LLC_NR_SAP_STATES] = { + [LLC_SAP_STATE_INACTIVE - 1] = { + .curr_state = LLC_SAP_STATE_INACTIVE, + .transitions = llc_sap_inactive_state_transitions, + }, + [LLC_SAP_STATE_ACTIVE - 1] = { + .curr_state = LLC_SAP_STATE_ACTIVE, + .transitions = llc_sap_active_state_transitions, + }, +}; diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c new file mode 100644 index 00000000..94e7fca7 --- /dev/null +++ b/net/llc/llc_sap.c @@ -0,0 +1,444 @@ +/* + * llc_sap.c - driver routines for SAP component. + * + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. 
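
llc_s_st.c above encodes the SAP state machine as data: each state owns an array of transition pointers terminated by an all-zero sentinel, each transition pairs an event predicate (0 means match) with a NULL-terminated action list, and the state table is indexed by the state number minus one. A compressed standalone sketch of the same dispatch pattern, with made-up states, events and actions:

/*
 * Standalone sketch (simplified, invented states/events) of the
 * table-driven dispatch used by llc_find_sap_trans()/llc_sap_next_state():
 * find the first transition whose predicate returns 0, run its actions,
 * then move to that transition's next state.
 */
#include <stdio.h>

#define ST_INACTIVE     1
#define ST_ACTIVE       2
#define NR_STATES       2

struct ctx { int state; };

typedef int (*ev_t)(struct ctx *c, int event);
typedef int (*action_t)(struct ctx *c, int event);

struct trans {
        ev_t ev;                /* returns 0 on match, like llc_sap_ev_*() */
        int next_state;
        action_t *actions;      /* NULL-terminated */
};

static int ev_activate(struct ctx *c, int event) { return event == 1 ? 0 : 1; }

static int ac_report(struct ctx *c, int event)
{
        printf("action: report status (event %d)\n", event);
        return 0;
}

static action_t inactive_actions_1[] = { ac_report, NULL };

static struct trans inactive_trans_1 = {
        .ev         = ev_activate,
        .next_state = ST_ACTIVE,
        .actions    = inactive_actions_1,
};

/* all-zero sentinel terminates each per-state transition array */
static struct trans trans_end;

static struct trans *inactive_transitions[] = { &inactive_trans_1, &trans_end };
static struct trans *active_transitions[]   = { &trans_end };

static struct trans **state_table[NR_STATES] = {
        [ST_INACTIVE - 1] = inactive_transitions,
        [ST_ACTIVE - 1]   = active_transitions,
};

static int next_state(struct ctx *c, int event)
{
        struct trans **t = state_table[c->state - 1];
        int i;

        for (i = 0; t[i]->ev; i++) {
                if (t[i]->ev(c, event))
                        continue;       /* predicate did not match */
                for (action_t *a = t[i]->actions; a && *a; a++)
                        if ((*a)(c, event))
                                return 1;
                c->state = t[i]->next_state;
                return 0;
        }
        return 1;                       /* no transition for this event */
}

int main(void)
{
        struct ctx c = { .state = ST_INACTIVE };

        next_state(&c, 1);
        printf("new state: %d\n", c.state);
        return 0;
}
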
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int llc_mac_header_len(unsigned short devtype) +{ + switch (devtype) { + case ARPHRD_ETHER: + case ARPHRD_LOOPBACK: + return sizeof(struct ethhdr); +#if defined(CONFIG_TR) || defined(CONFIG_TR_MODULE) + case ARPHRD_IEEE802_TR: + return sizeof(struct trh_hdr); +#endif + } + return 0; +} + +/** + * llc_alloc_frame - allocates sk_buff for frame + * @dev: network device this skb will be sent over + * @type: pdu type to allocate + * @data_size: data size to allocate + * + * Allocates an sk_buff for frame and initializes sk_buff fields. + * Returns allocated skb or %NULL when out of memory. + */ +struct sk_buff *llc_alloc_frame(struct sock *sk, struct net_device *dev, + u8 type, u32 data_size) +{ + int hlen = type == LLC_PDU_TYPE_U ? 3 : 4; + struct sk_buff *skb; + + hlen += llc_mac_header_len(dev->type); + skb = alloc_skb(hlen + data_size, GFP_ATOMIC); + + if (skb) { + skb_reset_mac_header(skb); + skb_reserve(skb, hlen); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb->protocol = htons(ETH_P_802_2); + skb->dev = dev; + if (sk != NULL) + skb_set_owner_w(skb, sk); + } + return skb; +} + +void llc_save_primitive(struct sock *sk, struct sk_buff* skb, u8 prim) +{ + struct sockaddr_llc *addr; + + /* save primitive for use by the user. */ + addr = llc_ui_skb_cb(skb); + + memset(addr, 0, sizeof(*addr)); + addr->sllc_family = sk->sk_family; + addr->sllc_arphrd = skb->dev->type; + addr->sllc_test = prim == LLC_TEST_PRIM; + addr->sllc_xid = prim == LLC_XID_PRIM; + addr->sllc_ua = prim == LLC_DATAUNIT_PRIM; + llc_pdu_decode_sa(skb, addr->sllc_mac); + llc_pdu_decode_ssap(skb, &addr->sllc_sap); +} + +/** + * llc_sap_rtn_pdu - Informs upper layer on rx of an UI, XID or TEST pdu. + * @sap: pointer to SAP + * @skb: received pdu + */ +void llc_sap_rtn_pdu(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + switch (LLC_U_PDU_RSP(pdu)) { + case LLC_1_PDU_CMD_TEST: + ev->prim = LLC_TEST_PRIM; break; + case LLC_1_PDU_CMD_XID: + ev->prim = LLC_XID_PRIM; break; + case LLC_1_PDU_CMD_UI: + ev->prim = LLC_DATAUNIT_PRIM; break; + } + ev->ind_cfm_flag = LLC_IND; +} + +/** + * llc_find_sap_trans - finds transition for event + * @sap: pointer to SAP + * @skb: happened event + * + * This function finds transition that matches with happened event. + * Returns the pointer to found transition on success or %NULL for + * failure. + */ +static struct llc_sap_state_trans *llc_find_sap_trans(struct llc_sap *sap, + struct sk_buff* skb) +{ + int i = 0; + struct llc_sap_state_trans *rc = NULL; + struct llc_sap_state_trans **next_trans; + struct llc_sap_state *curr_state = &llc_sap_state_table[sap->state - 1]; + /* + * Search thru events for this state until list exhausted or until + * its obvious the event is not valid for the current state + */ + for (next_trans = curr_state->transitions; next_trans[i]->ev; i++) + if (!next_trans[i]->ev(sap, skb)) { + rc = next_trans[i]; /* got event match; return it */ + break; + } + return rc; +} + +/** + * llc_exec_sap_trans_actions - execute actions related to event + * @sap: pointer to SAP + * @trans: pointer to transition that it's actions must be performed + * @skb: happened event. + * + * This function executes actions that is related to happened event. + * Returns 0 for success and 1 for failure of at least one action. 
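
The hlen arithmetic in llc_alloc_frame() above reserves room in front of the payload for the MAC header plus a 3- or 4-octet LLC header, so the later header-init calls can fill them in without moving the data. A standalone sketch of that layout, with plain malloc standing in for alloc_skb()/skb_reserve(); sizes assume Ethernet and a U-format PDU:

/*
 * Standalone sketch: size the buffer for MAC header + LLC header +
 * payload and start the data pointer past the header area, so headers
 * can later be written in front of the payload.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
        const size_t mac_hlen = 14;             /* Ethernet header */
        const size_t llc_u_hlen = 3;            /* U PDU: DSAP, SSAP, control */
        const size_t data_size = 64;            /* hypothetical payload size */
        const size_t hlen = mac_hlen + llc_u_hlen;

        unsigned char *head = malloc(hlen + data_size);
        unsigned char *data;

        if (!head)
                return 1;
        data = head + hlen;                     /* the skb_reserve() step */
        memset(data, 0xab, data_size);          /* payload goes here */
        printf("headroom %zu bytes, payload at offset %zu\n",
               hlen, (size_t)(data - head));
        free(head);
        return 0;
}
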
+ */ +static int llc_exec_sap_trans_actions(struct llc_sap *sap, + struct llc_sap_state_trans *trans, + struct sk_buff *skb) +{ + int rc = 0; + llc_sap_action_t *next_action = trans->ev_actions; + + for (; next_action && *next_action; next_action++) + if ((*next_action)(sap, skb)) + rc = 1; + return rc; +} + +/** + * llc_sap_next_state - finds transition, execs actions & change SAP state + * @sap: pointer to SAP + * @skb: happened event + * + * This function finds transition that matches with happened event, then + * executes related actions and finally changes state of SAP. It returns + * 0 on success and 1 for failure. + */ +static int llc_sap_next_state(struct llc_sap *sap, struct sk_buff *skb) +{ + int rc = 1; + struct llc_sap_state_trans *trans; + + if (sap->state > LLC_NR_SAP_STATES) + goto out; + trans = llc_find_sap_trans(sap, skb); + if (!trans) + goto out; + /* + * Got the state to which we next transition; perform the actions + * associated with this transition before actually transitioning to the + * next state + */ + rc = llc_exec_sap_trans_actions(sap, trans, skb); + if (rc) + goto out; + /* + * Transition SAP to next state if all actions execute successfully + */ + sap->state = trans->next_state; +out: + return rc; +} + +/** + * llc_sap_state_process - sends event to SAP state machine + * @sap: sap to use + * @skb: pointer to occurred event + * + * After executing actions of the event, upper layer will be indicated + * if needed(on receiving an UI frame). sk can be null for the + * datalink_proto case. + */ +static void llc_sap_state_process(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + + /* + * We have to hold the skb, because llc_sap_next_state + * will kfree it in the sending path and we need to + * look at the skb->cb, where we encode llc_sap_state_ev. + */ + skb_get(skb); + ev->ind_cfm_flag = 0; + llc_sap_next_state(sap, skb); + if (ev->ind_cfm_flag == LLC_IND) { + if (skb->sk->sk_state == TCP_LISTEN) + kfree_skb(skb); + else { + llc_save_primitive(skb->sk, skb, ev->prim); + + /* queue skb to the user. */ + if (sock_queue_rcv_skb(skb->sk, skb)) + kfree_skb(skb); + } + } + kfree_skb(skb); +} + +/** + * llc_build_and_send_test_pkt - TEST interface for upper layers. + * @sap: sap to use + * @skb: packet to send + * @dmac: destination mac address + * @dsap: destination sap + * + * This function is called when upper layer wants to send a TEST pdu. + * Returns 0 for success, 1 otherwise. + */ +void llc_build_and_send_test_pkt(struct llc_sap *sap, + struct sk_buff *skb, u8 *dmac, u8 dsap) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + + ev->saddr.lsap = sap->laddr.lsap; + ev->daddr.lsap = dsap; + memcpy(ev->saddr.mac, skb->dev->dev_addr, IFHWADDRLEN); + memcpy(ev->daddr.mac, dmac, IFHWADDRLEN); + + ev->type = LLC_SAP_EV_TYPE_PRIM; + ev->prim = LLC_TEST_PRIM; + ev->prim_type = LLC_PRIM_TYPE_REQ; + llc_sap_state_process(sap, skb); +} + +/** + * llc_build_and_send_xid_pkt - XID interface for upper layers + * @sap: sap to use + * @skb: packet to send + * @dmac: destination mac address + * @dsap: destination sap + * + * This function is called when upper layer wants to send a XID pdu. + * Returns 0 for success, 1 otherwise. 
+ */ +void llc_build_and_send_xid_pkt(struct llc_sap *sap, struct sk_buff *skb, + u8 *dmac, u8 dsap) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + + ev->saddr.lsap = sap->laddr.lsap; + ev->daddr.lsap = dsap; + memcpy(ev->saddr.mac, skb->dev->dev_addr, IFHWADDRLEN); + memcpy(ev->daddr.mac, dmac, IFHWADDRLEN); + + ev->type = LLC_SAP_EV_TYPE_PRIM; + ev->prim = LLC_XID_PRIM; + ev->prim_type = LLC_PRIM_TYPE_REQ; + llc_sap_state_process(sap, skb); +} + +/** + * llc_sap_rcv - sends received pdus to the sap state machine + * @sap: current sap component structure. + * @skb: received frame. + * + * Sends received pdus to the sap state machine. + */ +static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb, + struct sock *sk) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + + ev->type = LLC_SAP_EV_TYPE_PDU; + ev->reason = 0; + skb->sk = sk; + llc_sap_state_process(sap, skb); +} + +static inline bool llc_dgram_match(const struct llc_sap *sap, + const struct llc_addr *laddr, + const struct sock *sk) +{ + struct llc_sock *llc = llc_sk(sk); + + return sk->sk_type == SOCK_DGRAM && + llc->laddr.lsap == laddr->lsap && + llc_mac_match(llc->laddr.mac, laddr->mac); +} + +/** + * llc_lookup_dgram - Finds dgram socket for the local sap/mac + * @sap: SAP + * @laddr: address of local LLC (MAC + SAP) + * + * Search socket list of the SAP and finds connection using the local + * mac, and local sap. Returns pointer for socket found, %NULL otherwise. + */ +static struct sock *llc_lookup_dgram(struct llc_sap *sap, + const struct llc_addr *laddr) +{ + struct sock *rc; + struct hlist_nulls_node *node; + int slot = llc_sk_laddr_hashfn(sap, laddr); + struct hlist_nulls_head *laddr_hb = &sap->sk_laddr_hash[slot]; + + rcu_read_lock_bh(); +again: + sk_nulls_for_each_rcu(rc, node, laddr_hb) { + if (llc_dgram_match(sap, laddr, rc)) { + /* Extra checks required by SLAB_DESTROY_BY_RCU */ + if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) + goto again; + if (unlikely(llc_sk(rc)->sap != sap || + !llc_dgram_match(sap, laddr, rc))) { + sock_put(rc); + continue; + } + goto found; + } + } + rc = NULL; + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (unlikely(get_nulls_value(node) != slot)) + goto again; +found: + rcu_read_unlock_bh(); + return rc; +} + +static inline bool llc_mcast_match(const struct llc_sap *sap, + const struct llc_addr *laddr, + const struct sk_buff *skb, + const struct sock *sk) +{ + struct llc_sock *llc = llc_sk(sk); + + return sk->sk_type == SOCK_DGRAM && + llc->laddr.lsap == laddr->lsap && + llc->dev == skb->dev; +} + +static void llc_do_mcast(struct llc_sap *sap, struct sk_buff *skb, + struct sock **stack, int count) +{ + struct sk_buff *skb1; + int i; + + for (i = 0; i < count; i++) { + skb1 = skb_clone(skb, GFP_ATOMIC); + if (!skb1) { + sock_put(stack[i]); + continue; + } + + llc_sap_rcv(sap, skb1, stack[i]); + sock_put(stack[i]); + } +} + +/** + * llc_sap_mcast - Deliver multicast PDU's to all matching datagram sockets. + * @sap: SAP + * @laddr: address of local LLC (MAC + SAP) + * + * Search socket list of the SAP and finds connections with same sap. + * Deliver clone to each. 
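
llc_lookup_dgram() above runs lockless against sockets that may be freed and reused concurrently (SLAB_DESTROY_BY_RCU), so it takes a reference only via atomic_inc_not_zero() and then re-checks the SAP and address once the reference is held. The single-threaded mock below only demonstrates the ordering of those checks, not the RCU machinery itself; the structures are invented for the sketch:

/*
 * Standalone sketch of the lookup discipline: (1) take a reference only
 * if the refcount is still non-zero, (2) re-validate the key after the
 * reference is held, dropping it if the entry was recycled.
 */
#include <stdio.h>
#include <stdbool.h>
#include <stdatomic.h>

struct entry {
        atomic_int refcnt;
        int key;
};

static bool get_ref_not_zero(struct entry *e)
{
        int old = atomic_load(&e->refcnt);

        while (old != 0)
                if (atomic_compare_exchange_weak(&e->refcnt, &old, old + 1))
                        return true;
        return false;
}

static void put_ref(struct entry *e)
{
        atomic_fetch_sub(&e->refcnt, 1);
}

static struct entry *lookup(struct entry *candidate, int key)
{
        if (candidate->key != key)
                return NULL;
        if (!get_ref_not_zero(candidate))       /* being freed: skip it */
                return NULL;
        if (candidate->key != key) {            /* recycled after we matched */
                put_ref(candidate);
                return NULL;
        }
        return candidate;                       /* caller owns a reference */
}

int main(void)
{
        struct entry e = { .refcnt = 1, .key = 42 };
        struct entry *hit = lookup(&e, 42);

        printf("found: %s\n", hit ? "yes" : "no");
        if (hit)
                put_ref(hit);
        return 0;
}
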
+ */ +static void llc_sap_mcast(struct llc_sap *sap, + const struct llc_addr *laddr, + struct sk_buff *skb) +{ + int i = 0, count = 256 / sizeof(struct sock *); + struct sock *sk, *stack[count]; + struct hlist_node *node; + struct llc_sock *llc; + struct hlist_head *dev_hb = llc_sk_dev_hash(sap, skb->dev->ifindex); + + spin_lock_bh(&sap->sk_lock); + hlist_for_each_entry(llc, node, dev_hb, dev_hash_node) { + + sk = &llc->sk; + + if (!llc_mcast_match(sap, laddr, skb, sk)) + continue; + + sock_hold(sk); + if (i < count) + stack[i++] = sk; + else { + llc_do_mcast(sap, skb, stack, i); + i = 0; + } + } + spin_unlock_bh(&sap->sk_lock); + + llc_do_mcast(sap, skb, stack, i); +} + + +void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_addr laddr; + + llc_pdu_decode_da(skb, laddr.mac); + llc_pdu_decode_dsap(skb, &laddr.lsap); + + if (llc_mac_multicast(laddr.mac)) { + llc_sap_mcast(sap, &laddr, skb); + kfree_skb(skb); + } else { + struct sock *sk = llc_lookup_dgram(sap, &laddr); + if (sk) { + llc_sap_rcv(sap, skb, sk); + sock_put(sk); + } else + kfree_skb(skb); + } +} diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c new file mode 100644 index 00000000..cf4aea3b --- /dev/null +++ b/net/llc/llc_station.c @@ -0,0 +1,722 @@ +/* + * llc_station.c - station component of LLC + * + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * struct llc_station - LLC station component + * + * SAP and connection resource manager, one per adapter. + * + * @state - state of station + * @xid_r_count - XID response PDU counter + * @mac_sa - MAC source address + * @sap_list - list of related SAPs + * @ev_q - events entering state mach. 
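
llc_sap_mcast() above parks matching sockets in a small on-stack array while scanning under the SAP lock and hands each batch to llc_do_mcast() for cloning and delivery. A simplified standalone sketch of the batching idea; integers stand in for sockets and the details differ from the kernel code:

/*
 * Standalone sketch: collect matches into a fixed-size batch, flush the
 * batch when it fills up, and flush the remainder after the scan.
 */
#include <stdio.h>

#define BATCH 4

static void deliver_batch(const int *batch, int n)
{
        for (int i = 0; i < n; i++)
                printf("deliver copy to receiver %d\n", batch[i]);
}

int main(void)
{
        int receivers[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9 };
        int stack[BATCH];
        int i = 0;

        for (size_t r = 0; r < sizeof(receivers) / sizeof(receivers[0]); r++) {
                stack[i++] = receivers[r];
                if (i == BATCH) {               /* batch full: flush it */
                        deliver_batch(stack, i);
                        i = 0;
                }
        }
        deliver_batch(stack, i);                /* flush the remainder */
        return 0;
}
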
+ * @mac_pdu_q - PDUs ready to send to MAC + */ +struct llc_station { + u8 state; + u8 xid_r_count; + struct timer_list ack_timer; + u8 retry_count; + u8 maximum_retry; + struct { + struct sk_buff_head list; + spinlock_t lock; + } ev_q; + struct sk_buff_head mac_pdu_q; +}; + +#define LLC_STATION_ACK_TIME (3 * HZ) + +int sysctl_llc_station_ack_timeout = LLC_STATION_ACK_TIME; + +/* Types of events (possible values in 'ev->type') */ +#define LLC_STATION_EV_TYPE_SIMPLE 1 +#define LLC_STATION_EV_TYPE_CONDITION 2 +#define LLC_STATION_EV_TYPE_PRIM 3 +#define LLC_STATION_EV_TYPE_PDU 4 /* command/response PDU */ +#define LLC_STATION_EV_TYPE_ACK_TMR 5 +#define LLC_STATION_EV_TYPE_RPT_STATUS 6 + +/* Events */ +#define LLC_STATION_EV_ENABLE_WITH_DUP_ADDR_CHECK 1 +#define LLC_STATION_EV_ENABLE_WITHOUT_DUP_ADDR_CHECK 2 +#define LLC_STATION_EV_ACK_TMR_EXP_LT_RETRY_CNT_MAX_RETRY 3 +#define LLC_STATION_EV_ACK_TMR_EXP_EQ_RETRY_CNT_MAX_RETRY 4 +#define LLC_STATION_EV_RX_NULL_DSAP_XID_C 5 +#define LLC_STATION_EV_RX_NULL_DSAP_0_XID_R_XID_R_CNT_EQ 6 +#define LLC_STATION_EV_RX_NULL_DSAP_1_XID_R_XID_R_CNT_EQ 7 +#define LLC_STATION_EV_RX_NULL_DSAP_TEST_C 8 +#define LLC_STATION_EV_DISABLE_REQ 9 + +struct llc_station_state_ev { + u8 type; + u8 prim; + u8 prim_type; + u8 reason; + struct list_head node; /* node in station->ev_q.list */ +}; + +static __inline__ struct llc_station_state_ev * + llc_station_ev(struct sk_buff *skb) +{ + return (struct llc_station_state_ev *)skb->cb; +} + +typedef int (*llc_station_ev_t)(struct sk_buff *skb); + +#define LLC_STATION_STATE_DOWN 1 /* initial state */ +#define LLC_STATION_STATE_DUP_ADDR_CHK 2 +#define LLC_STATION_STATE_UP 3 + +#define LLC_NBR_STATION_STATES 3 /* size of state table */ + +typedef int (*llc_station_action_t)(struct sk_buff *skb); + +/* Station component state table structure */ +struct llc_station_state_trans { + llc_station_ev_t ev; + u8 next_state; + llc_station_action_t *ev_actions; +}; + +struct llc_station_state { + u8 curr_state; + struct llc_station_state_trans **transitions; +}; + +static struct llc_station llc_main_station; + +static int llc_stat_ev_enable_with_dup_addr_check(struct sk_buff *skb) +{ + struct llc_station_state_ev *ev = llc_station_ev(skb); + + return ev->type == LLC_STATION_EV_TYPE_SIMPLE && + ev->prim_type == + LLC_STATION_EV_ENABLE_WITH_DUP_ADDR_CHECK ? 0 : 1; +} + +static int llc_stat_ev_enable_without_dup_addr_check(struct sk_buff *skb) +{ + struct llc_station_state_ev *ev = llc_station_ev(skb); + + return ev->type == LLC_STATION_EV_TYPE_SIMPLE && + ev->prim_type == + LLC_STATION_EV_ENABLE_WITHOUT_DUP_ADDR_CHECK ? 0 : 1; +} + +static int llc_stat_ev_ack_tmr_exp_lt_retry_cnt_max_retry(struct sk_buff *skb) +{ + struct llc_station_state_ev *ev = llc_station_ev(skb); + + return ev->type == LLC_STATION_EV_TYPE_ACK_TMR && + llc_main_station.retry_count < + llc_main_station.maximum_retry ? 0 : 1; +} + +static int llc_stat_ev_ack_tmr_exp_eq_retry_cnt_max_retry(struct sk_buff *skb) +{ + struct llc_station_state_ev *ev = llc_station_ev(skb); + + return ev->type == LLC_STATION_EV_TYPE_ACK_TMR && + llc_main_station.retry_count == + llc_main_station.maximum_retry ? 0 : 1; +} + +static int llc_stat_ev_rx_null_dsap_xid_c(struct sk_buff *skb) +{ + struct llc_station_state_ev *ev = llc_station_ev(skb); + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return ev->type == LLC_STATION_EV_TYPE_PDU && + LLC_PDU_IS_CMD(pdu) && /* command PDU */ + LLC_PDU_TYPE_IS_U(pdu) && /* U type PDU */ + LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_XID && + !pdu->dsap ? 
0 : 1; /* NULL DSAP value */ +} + +static int llc_stat_ev_rx_null_dsap_0_xid_r_xid_r_cnt_eq(struct sk_buff *skb) +{ + struct llc_station_state_ev *ev = llc_station_ev(skb); + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return ev->type == LLC_STATION_EV_TYPE_PDU && + LLC_PDU_IS_RSP(pdu) && /* response PDU */ + LLC_PDU_TYPE_IS_U(pdu) && /* U type PDU */ + LLC_U_PDU_RSP(pdu) == LLC_1_PDU_CMD_XID && + !pdu->dsap && /* NULL DSAP value */ + !llc_main_station.xid_r_count ? 0 : 1; +} + +static int llc_stat_ev_rx_null_dsap_1_xid_r_xid_r_cnt_eq(struct sk_buff *skb) +{ + struct llc_station_state_ev *ev = llc_station_ev(skb); + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return ev->type == LLC_STATION_EV_TYPE_PDU && + LLC_PDU_IS_RSP(pdu) && /* response PDU */ + LLC_PDU_TYPE_IS_U(pdu) && /* U type PDU */ + LLC_U_PDU_RSP(pdu) == LLC_1_PDU_CMD_XID && + !pdu->dsap && /* NULL DSAP value */ + llc_main_station.xid_r_count == 1 ? 0 : 1; +} + +static int llc_stat_ev_rx_null_dsap_test_c(struct sk_buff *skb) +{ + struct llc_station_state_ev *ev = llc_station_ev(skb); + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return ev->type == LLC_STATION_EV_TYPE_PDU && + LLC_PDU_IS_CMD(pdu) && /* command PDU */ + LLC_PDU_TYPE_IS_U(pdu) && /* U type PDU */ + LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_TEST && + !pdu->dsap ? 0 : 1; /* NULL DSAP */ +} + +static int llc_stat_ev_disable_req(struct sk_buff *skb) +{ + struct llc_station_state_ev *ev = llc_station_ev(skb); + + return ev->type == LLC_STATION_EV_TYPE_PRIM && + ev->prim == LLC_DISABLE_PRIM && + ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; +} + +/** + * llc_station_send_pdu - queues PDU to send + * @skb: Address of the PDU + * + * Queues a PDU to send to the MAC layer. + */ +static void llc_station_send_pdu(struct sk_buff *skb) +{ + skb_queue_tail(&llc_main_station.mac_pdu_q, skb); + while ((skb = skb_dequeue(&llc_main_station.mac_pdu_q)) != NULL) + if (dev_queue_xmit(skb)) + break; +} + +static int llc_station_ac_start_ack_timer(struct sk_buff *skb) +{ + mod_timer(&llc_main_station.ack_timer, + jiffies + sysctl_llc_station_ack_timeout); + return 0; +} + +static int llc_station_ac_set_retry_cnt_0(struct sk_buff *skb) +{ + llc_main_station.retry_count = 0; + return 0; +} + +static int llc_station_ac_inc_retry_cnt_by_1(struct sk_buff *skb) +{ + llc_main_station.retry_count++; + return 0; +} + +static int llc_station_ac_set_xid_r_cnt_0(struct sk_buff *skb) +{ + llc_main_station.xid_r_count = 0; + return 0; +} + +static int llc_station_ac_inc_xid_r_cnt_by_1(struct sk_buff *skb) +{ + llc_main_station.xid_r_count++; + return 0; +} + +static int llc_station_ac_send_null_dsap_xid_c(struct sk_buff *skb) +{ + int rc = 1; + struct sk_buff *nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U, + sizeof(struct llc_xid_info)); + + if (!nskb) + goto out; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, 0, 0, LLC_PDU_CMD); + llc_pdu_init_as_xid_cmd(nskb, LLC_XID_NULL_CLASS_2, 127); + rc = llc_mac_hdr_init(nskb, skb->dev->dev_addr, skb->dev->dev_addr); + if (unlikely(rc)) + goto free; + llc_station_send_pdu(nskb); +out: + return rc; +free: + kfree_skb(skb); + goto out; +} + +static int llc_station_ac_send_xid_r(struct sk_buff *skb) +{ + u8 mac_da[ETH_ALEN], dsap; + int rc = 1; + struct sk_buff *nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U, + sizeof(struct llc_xid_info)); + + if (!nskb) + goto out; + rc = 0; + llc_pdu_decode_sa(skb, mac_da); + llc_pdu_decode_ssap(skb, &dsap); + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, 0, dsap, LLC_PDU_RSP); + llc_pdu_init_as_xid_rsp(nskb, 
LLC_XID_NULL_CLASS_2, 127); + rc = llc_mac_hdr_init(nskb, skb->dev->dev_addr, mac_da); + if (unlikely(rc)) + goto free; + llc_station_send_pdu(nskb); +out: + return rc; +free: + kfree_skb(skb); + goto out; +} + +static int llc_station_ac_send_test_r(struct sk_buff *skb) +{ + u8 mac_da[ETH_ALEN], dsap; + int rc = 1; + u32 data_size; + struct sk_buff *nskb; + + /* The test request command is type U (llc_len = 3) */ + data_size = ntohs(eth_hdr(skb)->h_proto) - 3; + nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U, data_size); + + if (!nskb) + goto out; + rc = 0; + llc_pdu_decode_sa(skb, mac_da); + llc_pdu_decode_ssap(skb, &dsap); + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, 0, dsap, LLC_PDU_RSP); + llc_pdu_init_as_test_rsp(nskb, skb); + rc = llc_mac_hdr_init(nskb, skb->dev->dev_addr, mac_da); + if (unlikely(rc)) + goto free; + llc_station_send_pdu(nskb); +out: + return rc; +free: + kfree_skb(skb); + goto out; +} + +static int llc_station_ac_report_status(struct sk_buff *skb) +{ + return 0; +} + +/* COMMON STATION STATE transitions */ + +/* dummy last-transition indicator; common to all state transition groups + * last entry for this state + * all members are zeros, .bss zeroes it + */ +static struct llc_station_state_trans llc_stat_state_trans_end; + +/* DOWN STATE transitions */ + +/* state transition for LLC_STATION_EV_ENABLE_WITH_DUP_ADDR_CHECK event */ +static llc_station_action_t llc_stat_down_state_actions_1[] = { + [0] = llc_station_ac_start_ack_timer, + [1] = llc_station_ac_set_retry_cnt_0, + [2] = llc_station_ac_set_xid_r_cnt_0, + [3] = llc_station_ac_send_null_dsap_xid_c, + [4] = NULL, +}; + +static struct llc_station_state_trans llc_stat_down_state_trans_1 = { + .ev = llc_stat_ev_enable_with_dup_addr_check, + .next_state = LLC_STATION_STATE_DUP_ADDR_CHK, + .ev_actions = llc_stat_down_state_actions_1, +}; + +/* state transition for LLC_STATION_EV_ENABLE_WITHOUT_DUP_ADDR_CHECK event */ +static llc_station_action_t llc_stat_down_state_actions_2[] = { + [0] = llc_station_ac_report_status, /* STATION UP */ + [1] = NULL, +}; + +static struct llc_station_state_trans llc_stat_down_state_trans_2 = { + .ev = llc_stat_ev_enable_without_dup_addr_check, + .next_state = LLC_STATION_STATE_UP, + .ev_actions = llc_stat_down_state_actions_2, +}; + +/* array of pointers; one to each transition */ +static struct llc_station_state_trans *llc_stat_dwn_state_trans[] = { + [0] = &llc_stat_down_state_trans_1, + [1] = &llc_stat_down_state_trans_2, + [2] = &llc_stat_state_trans_end, +}; + +/* UP STATE transitions */ +/* state transition for LLC_STATION_EV_DISABLE_REQ event */ +static llc_station_action_t llc_stat_up_state_actions_1[] = { + [0] = llc_station_ac_report_status, /* STATION DOWN */ + [1] = NULL, +}; + +static struct llc_station_state_trans llc_stat_up_state_trans_1 = { + .ev = llc_stat_ev_disable_req, + .next_state = LLC_STATION_STATE_DOWN, + .ev_actions = llc_stat_up_state_actions_1, +}; + +/* state transition for LLC_STATION_EV_RX_NULL_DSAP_XID_C event */ +static llc_station_action_t llc_stat_up_state_actions_2[] = { + [0] = llc_station_ac_send_xid_r, + [1] = NULL, +}; + +static struct llc_station_state_trans llc_stat_up_state_trans_2 = { + .ev = llc_stat_ev_rx_null_dsap_xid_c, + .next_state = LLC_STATION_STATE_UP, + .ev_actions = llc_stat_up_state_actions_2, +}; + +/* state transition for LLC_STATION_EV_RX_NULL_DSAP_TEST_C event */ +static llc_station_action_t llc_stat_up_state_actions_3[] = { + [0] = llc_station_ac_send_test_r, + [1] = NULL, +}; + +static struct llc_station_state_trans 
llc_stat_up_state_trans_3 = { + .ev = llc_stat_ev_rx_null_dsap_test_c, + .next_state = LLC_STATION_STATE_UP, + .ev_actions = llc_stat_up_state_actions_3, +}; + +/* array of pointers; one to each transition */ +static struct llc_station_state_trans *llc_stat_up_state_trans [] = { + [0] = &llc_stat_up_state_trans_1, + [1] = &llc_stat_up_state_trans_2, + [2] = &llc_stat_up_state_trans_3, + [3] = &llc_stat_state_trans_end, +}; + +/* DUP ADDR CHK STATE transitions */ +/* state transition for LLC_STATION_EV_RX_NULL_DSAP_0_XID_R_XID_R_CNT_EQ + * event + */ +static llc_station_action_t llc_stat_dupaddr_state_actions_1[] = { + [0] = llc_station_ac_inc_xid_r_cnt_by_1, + [1] = NULL, +}; + +static struct llc_station_state_trans llc_stat_dupaddr_state_trans_1 = { + .ev = llc_stat_ev_rx_null_dsap_0_xid_r_xid_r_cnt_eq, + .next_state = LLC_STATION_STATE_DUP_ADDR_CHK, + .ev_actions = llc_stat_dupaddr_state_actions_1, +}; + +/* state transition for LLC_STATION_EV_RX_NULL_DSAP_1_XID_R_XID_R_CNT_EQ + * event + */ +static llc_station_action_t llc_stat_dupaddr_state_actions_2[] = { + [0] = llc_station_ac_report_status, /* DUPLICATE ADDRESS FOUND */ + [1] = NULL, +}; + +static struct llc_station_state_trans llc_stat_dupaddr_state_trans_2 = { + .ev = llc_stat_ev_rx_null_dsap_1_xid_r_xid_r_cnt_eq, + .next_state = LLC_STATION_STATE_DOWN, + .ev_actions = llc_stat_dupaddr_state_actions_2, +}; + +/* state transition for LLC_STATION_EV_RX_NULL_DSAP_XID_C event */ +static llc_station_action_t llc_stat_dupaddr_state_actions_3[] = { + [0] = llc_station_ac_send_xid_r, + [1] = NULL, +}; + +static struct llc_station_state_trans llc_stat_dupaddr_state_trans_3 = { + .ev = llc_stat_ev_rx_null_dsap_xid_c, + .next_state = LLC_STATION_STATE_DUP_ADDR_CHK, + .ev_actions = llc_stat_dupaddr_state_actions_3, +}; + +/* state transition for LLC_STATION_EV_ACK_TMR_EXP_LT_RETRY_CNT_MAX_RETRY + * event + */ +static llc_station_action_t llc_stat_dupaddr_state_actions_4[] = { + [0] = llc_station_ac_start_ack_timer, + [1] = llc_station_ac_inc_retry_cnt_by_1, + [2] = llc_station_ac_set_xid_r_cnt_0, + [3] = llc_station_ac_send_null_dsap_xid_c, + [4] = NULL, +}; + +static struct llc_station_state_trans llc_stat_dupaddr_state_trans_4 = { + .ev = llc_stat_ev_ack_tmr_exp_lt_retry_cnt_max_retry, + .next_state = LLC_STATION_STATE_DUP_ADDR_CHK, + .ev_actions = llc_stat_dupaddr_state_actions_4, +}; + +/* state transition for LLC_STATION_EV_ACK_TMR_EXP_EQ_RETRY_CNT_MAX_RETRY + * event + */ +static llc_station_action_t llc_stat_dupaddr_state_actions_5[] = { + [0] = llc_station_ac_report_status, /* STATION UP */ + [1] = NULL, +}; + +static struct llc_station_state_trans llc_stat_dupaddr_state_trans_5 = { + .ev = llc_stat_ev_ack_tmr_exp_eq_retry_cnt_max_retry, + .next_state = LLC_STATION_STATE_UP, + .ev_actions = llc_stat_dupaddr_state_actions_5, +}; + +/* state transition for LLC_STATION_EV_DISABLE_REQ event */ +static llc_station_action_t llc_stat_dupaddr_state_actions_6[] = { + [0] = llc_station_ac_report_status, /* STATION DOWN */ + [1] = NULL, +}; + +static struct llc_station_state_trans llc_stat_dupaddr_state_trans_6 = { + .ev = llc_stat_ev_disable_req, + .next_state = LLC_STATION_STATE_DOWN, + .ev_actions = llc_stat_dupaddr_state_actions_6, +}; + +/* array of pointers; one to each transition */ +static struct llc_station_state_trans *llc_stat_dupaddr_state_trans[] = { + [0] = &llc_stat_dupaddr_state_trans_6, /* Request */ + [1] = &llc_stat_dupaddr_state_trans_4, /* Timer */ + [2] = &llc_stat_dupaddr_state_trans_5, + [3] = 
&llc_stat_dupaddr_state_trans_1, /* Receive frame */ + [4] = &llc_stat_dupaddr_state_trans_2, + [5] = &llc_stat_dupaddr_state_trans_3, + [6] = &llc_stat_state_trans_end, +}; + +static struct llc_station_state + llc_station_state_table[LLC_NBR_STATION_STATES] = { + [LLC_STATION_STATE_DOWN - 1] = { + .curr_state = LLC_STATION_STATE_DOWN, + .transitions = llc_stat_dwn_state_trans, + }, + [LLC_STATION_STATE_DUP_ADDR_CHK - 1] = { + .curr_state = LLC_STATION_STATE_DUP_ADDR_CHK, + .transitions = llc_stat_dupaddr_state_trans, + }, + [LLC_STATION_STATE_UP - 1] = { + .curr_state = LLC_STATION_STATE_UP, + .transitions = llc_stat_up_state_trans, + }, +}; + +/** + * llc_exec_station_trans_actions - executes actions for transition + * @trans: Address of the transition + * @skb: Address of the event that caused the transition + * + * Executes actions of a transition of the station state machine. Returns + * 0 if all actions complete successfully, nonzero otherwise. + */ +static u16 llc_exec_station_trans_actions(struct llc_station_state_trans *trans, + struct sk_buff *skb) +{ + u16 rc = 0; + llc_station_action_t *next_action = trans->ev_actions; + + for (; next_action && *next_action; next_action++) + if ((*next_action)(skb)) + rc = 1; + return rc; +} + +/** + * llc_find_station_trans - finds transition for this event + * @skb: Address of the event + * + * Search thru events of the current state of the station until list + * exhausted or it's obvious that the event is not valid for the current + * state. Returns the address of the transition if cound, %NULL otherwise. + */ +static struct llc_station_state_trans * + llc_find_station_trans(struct sk_buff *skb) +{ + int i = 0; + struct llc_station_state_trans *rc = NULL; + struct llc_station_state_trans **next_trans; + struct llc_station_state *curr_state = + &llc_station_state_table[llc_main_station.state - 1]; + + for (next_trans = curr_state->transitions; next_trans[i]->ev; i++) + if (!next_trans[i]->ev(skb)) { + rc = next_trans[i]; + break; + } + return rc; +} + +/** + * llc_station_free_ev - frees an event + * @skb: Address of the event + * + * Frees an event. + */ +static void llc_station_free_ev(struct sk_buff *skb) +{ + struct llc_station_state_ev *ev = llc_station_ev(skb); + + if (ev->type == LLC_STATION_EV_TYPE_PDU) + kfree_skb(skb); +} + +/** + * llc_station_next_state - processes event and goes to the next state + * @skb: Address of the event + * + * Processes an event, executes any transitions related to that event and + * updates the state of the station. 
+ */ +static u16 llc_station_next_state(struct sk_buff *skb) +{ + u16 rc = 1; + struct llc_station_state_trans *trans; + + if (llc_main_station.state > LLC_NBR_STATION_STATES) + goto out; + trans = llc_find_station_trans(skb); + if (trans) { + /* got the state to which we next transition; perform the + * actions associated with this transition before actually + * transitioning to the next state + */ + rc = llc_exec_station_trans_actions(trans, skb); + if (!rc) + /* transition station to next state if all actions + * execute successfully; done; wait for next event + */ + llc_main_station.state = trans->next_state; + } else + /* event not recognized in current state; re-queue it for + * processing again at a later time; return failure + */ + rc = 0; +out: + llc_station_free_ev(skb); + return rc; +} + +/** + * llc_station_service_events - service events in the queue + * + * Get an event from the station event queue (if any); attempt to service + * the event; if event serviced, get the next event (if any) on the event + * queue; if event not service, re-queue the event on the event queue and + * attempt to service the next event; when serviced all events in queue, + * finished; if don't transition to different state, just service all + * events once; if transition to new state, service all events again. + * Caller must hold llc_main_station.ev_q.lock. + */ +static void llc_station_service_events(void) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(&llc_main_station.ev_q.list)) != NULL) + llc_station_next_state(skb); +} + +/** + * llc_station_state_process: queue event and try to process queue. + * @skb: Address of the event + * + * Queues an event (on the station event queue) for handling by the + * station state machine and attempts to process any queued-up events. + */ +static void llc_station_state_process(struct sk_buff *skb) +{ + spin_lock_bh(&llc_main_station.ev_q.lock); + skb_queue_tail(&llc_main_station.ev_q.list, skb); + llc_station_service_events(); + spin_unlock_bh(&llc_main_station.ev_q.lock); +} + +static void llc_station_ack_tmr_cb(unsigned long timeout_data) +{ + struct sk_buff *skb = alloc_skb(0, GFP_ATOMIC); + + if (skb) { + struct llc_station_state_ev *ev = llc_station_ev(skb); + + ev->type = LLC_STATION_EV_TYPE_ACK_TMR; + llc_station_state_process(skb); + } +} + +/* + * llc_station_rcv - send received pdu to the station state machine + * @skb: received frame. + * + * Sends data unit to station state machine. 
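
llc_station_state_process() above appends the event to the station queue and drains the whole backlog while ev_q.lock is held. A standalone sketch of that queue-then-drain pattern, with a pthread mutex and an integer ring standing in for the kernel spinlock and sk_buff queue:

/*
 * Standalone sketch: queue the new event under the lock, then service
 * the entire backlog before releasing the lock.  Build with -pthread.
 */
#include <stdio.h>
#include <pthread.h>

#define QLEN 16

static pthread_mutex_t ev_lock = PTHREAD_MUTEX_INITIALIZER;
static int ev_q[QLEN];
static unsigned int ev_head, ev_tail;

static void service_events(void)
{
        while (ev_head != ev_tail) {
                int ev = ev_q[ev_head++ % QLEN];

                printf("servicing event %d\n", ev);
        }
}

static void state_process(int ev)
{
        pthread_mutex_lock(&ev_lock);
        ev_q[ev_tail++ % QLEN] = ev;    /* queue the new event */
        service_events();               /* drain everything queued */
        pthread_mutex_unlock(&ev_lock);
}

int main(void)
{
        state_process(1);
        state_process(2);
        return 0;
}
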
+ */ +static void llc_station_rcv(struct sk_buff *skb) +{ + struct llc_station_state_ev *ev = llc_station_ev(skb); + + ev->type = LLC_STATION_EV_TYPE_PDU; + ev->reason = 0; + llc_station_state_process(skb); +} + +int __init llc_station_init(void) +{ + int rc = -ENOBUFS; + struct sk_buff *skb; + struct llc_station_state_ev *ev; + + skb_queue_head_init(&llc_main_station.mac_pdu_q); + skb_queue_head_init(&llc_main_station.ev_q.list); + spin_lock_init(&llc_main_station.ev_q.lock); + setup_timer(&llc_main_station.ack_timer, llc_station_ack_tmr_cb, + (unsigned long)&llc_main_station); + llc_main_station.ack_timer.expires = jiffies + + sysctl_llc_station_ack_timeout; + skb = alloc_skb(0, GFP_ATOMIC); + if (!skb) + goto out; + rc = 0; + llc_set_station_handler(llc_station_rcv); + ev = llc_station_ev(skb); + memset(ev, 0, sizeof(*ev)); + llc_main_station.maximum_retry = 1; + llc_main_station.state = LLC_STATION_STATE_DOWN; + ev->type = LLC_STATION_EV_TYPE_SIMPLE; + ev->prim_type = LLC_STATION_EV_ENABLE_WITHOUT_DUP_ADDR_CHECK; + rc = llc_station_next_state(skb); +out: + return rc; +} + +void __exit llc_station_exit(void) +{ + llc_set_station_handler(NULL); +} diff --git a/net/llc/sysctl_net_llc.c b/net/llc/sysctl_net_llc.c new file mode 100644 index 00000000..e2ebe358 --- /dev/null +++ b/net/llc/sysctl_net_llc.c @@ -0,0 +1,103 @@ +/* + * sysctl_net_llc.c: sysctl interface to LLC net subsystem. + * + * Arnaldo Carvalho de Melo + */ + +#include +#include +#include +#include + +#ifndef CONFIG_SYSCTL +#error This file should not be compiled without CONFIG_SYSCTL defined +#endif + +static struct ctl_table llc2_timeout_table[] = { + { + .procname = "ack", + .data = &sysctl_llc2_ack_timeout, + .maxlen = sizeof(long), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "busy", + .data = &sysctl_llc2_busy_timeout, + .maxlen = sizeof(long), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "p", + .data = &sysctl_llc2_p_timeout, + .maxlen = sizeof(long), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "rej", + .data = &sysctl_llc2_rej_timeout, + .maxlen = sizeof(long), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { }, +}; + +static struct ctl_table llc_station_table[] = { + { + .procname = "ack_timeout", + .data = &sysctl_llc_station_ack_timeout, + .maxlen = sizeof(long), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { }, +}; + +static struct ctl_table llc2_dir_timeout_table[] = { + { + .procname = "timeout", + .mode = 0555, + .child = llc2_timeout_table, + }, + { }, +}; + +static struct ctl_table llc_table[] = { + { + .procname = "llc2", + .mode = 0555, + .child = llc2_dir_timeout_table, + }, + { + .procname = "station", + .mode = 0555, + .child = llc_station_table, + }, + { }, +}; + +static struct ctl_path llc_path[] = { + { .procname = "net", }, + { .procname = "llc", }, + { } +}; + +static struct ctl_table_header *llc_table_header; + +int __init llc_sysctl_init(void) +{ + llc_table_header = register_sysctl_paths(llc_path, llc_table); + + return llc_table_header ? 
0 : -ENOMEM; +} + +void llc_sysctl_exit(void) +{ + if (llc_table_header) { + unregister_sysctl_table(llc_table_header); + llc_table_header = NULL; + } +} diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig new file mode 100644 index 00000000..f5fdfcbf --- /dev/null +++ b/net/mac80211/Kconfig @@ -0,0 +1,236 @@ +config MAC80211 + tristate "Generic IEEE 802.11 Networking Stack (mac80211)" + depends on CFG80211 + select CRYPTO + select CRYPTO_ARC4 + select CRYPTO_AES + select CRC32 + select AVERAGE + ---help--- + This option enables the hardware independent IEEE 802.11 + networking stack. + +comment "CFG80211 needs to be enabled for MAC80211" + depends on CFG80211=n + +if MAC80211 != n + +config MAC80211_HAS_RC + bool + +config MAC80211_RC_PID + bool "PID controller based rate control algorithm" if EXPERT + select MAC80211_HAS_RC + ---help--- + This option enables a TX rate control algorithm for + mac80211 that uses a PID controller to select the TX + rate. + +config MAC80211_RC_MINSTREL + bool "Minstrel" if EXPERT + select MAC80211_HAS_RC + default y + ---help--- + This option enables the 'minstrel' TX rate control algorithm + +config MAC80211_RC_MINSTREL_HT + bool "Minstrel 802.11n support" if EXPERT + depends on MAC80211_RC_MINSTREL + default y + ---help--- + This option enables the 'minstrel_ht' TX rate control algorithm + +choice + prompt "Default rate control algorithm" + depends on MAC80211_HAS_RC + default MAC80211_RC_DEFAULT_MINSTREL + ---help--- + This option selects the default rate control algorithm + mac80211 will use. Note that this default can still be + overridden through the ieee80211_default_rc_algo module + parameter if different algorithms are available. + +config MAC80211_RC_DEFAULT_PID + bool "PID controller based rate control algorithm" + depends on MAC80211_RC_PID + ---help--- + Select the PID controller based rate control as the + default rate control algorithm. You should choose + this unless you know what you are doing. + +config MAC80211_RC_DEFAULT_MINSTREL + bool "Minstrel" + depends on MAC80211_RC_MINSTREL + ---help--- + Select Minstrel as the default rate control algorithm. + + +endchoice + +config MAC80211_RC_DEFAULT + string + default "minstrel_ht" if MAC80211_RC_DEFAULT_MINSTREL && MAC80211_RC_MINSTREL_HT + default "minstrel" if MAC80211_RC_DEFAULT_MINSTREL + default "pid" if MAC80211_RC_DEFAULT_PID + default "" + +endif + +comment "Some wireless drivers require a rate control algorithm" + depends on MAC80211 && MAC80211_HAS_RC=n + +config MAC80211_MESH + bool "Enable mac80211 mesh networking (pre-802.11s) support" + depends on MAC80211 && EXPERIMENTAL + ---help--- + This options enables support of Draft 802.11s mesh networking. + The implementation is based on Draft 2.08 of the Mesh Networking + amendment. However, no compliance with that draft is claimed or even + possible, as drafts leave a number of identifiers to be defined after + ratification. For more information visit http://o11s.org/. + +config MAC80211_LEDS + bool "Enable LED triggers" + depends on MAC80211 + depends on LEDS_CLASS + select LEDS_TRIGGERS + ---help--- + This option enables a few LED triggers for different + packet receive/transmit events. + +config MAC80211_DEBUGFS + bool "Export mac80211 internals in DebugFS" + depends on MAC80211 && DEBUG_FS + ---help--- + Select this to see extensive information about + the internal state of mac80211 in debugfs. + + Say N unless you know you need this. 
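
As a concrete illustration of how the MAC80211_RC_DEFAULT string above is derived, leaving the Minstrel options at their defaults resolves the default rate-control algorithm to minstrel_ht. The fragment below only illustrates that selection logic with the symbols defined above; it is not a recommended configuration:

# Illustrative .config fragment
CONFIG_MAC80211=m
CONFIG_MAC80211_HAS_RC=y
CONFIG_MAC80211_RC_MINSTREL=y
CONFIG_MAC80211_RC_MINSTREL_HT=y
CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y
# CONFIG_MAC80211_RC_PID is not set
CONFIG_MAC80211_RC_DEFAULT="minstrel_ht"
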
+ +menuconfig MAC80211_DEBUG_MENU + bool "Select mac80211 debugging features" + depends on MAC80211 + ---help--- + This option collects various mac80211 debug settings. + +config MAC80211_NOINLINE + bool "Do not inline TX/RX handlers" + depends on MAC80211_DEBUG_MENU + ---help--- + This option affects code generation in mac80211, when + selected some functions are marked "noinline" to allow + easier debugging of problems in the transmit and receive + paths. + + This option increases code size a bit and inserts a lot + of function calls in the code, but is otherwise safe to + enable. + + If unsure, say N unless you expect to be finding problems + in mac80211. + +config MAC80211_VERBOSE_DEBUG + bool "Verbose debugging output" + depends on MAC80211_DEBUG_MENU + ---help--- + Selecting this option causes mac80211 to print out + many debugging messages. It should not be selected + on production systems as some of the messages are + remotely triggerable. + + Do not select this option. + +config MAC80211_HT_DEBUG + bool "Verbose HT debugging" + depends on MAC80211_DEBUG_MENU + ---help--- + This option enables 802.11n High Throughput features + debug tracing output. + + It should not be selected on production systems as some + of the messages are remotely triggerable. + + Do not select this option. + +config MAC80211_TKIP_DEBUG + bool "Verbose TKIP debugging" + depends on MAC80211_DEBUG_MENU + ---help--- + Selecting this option causes mac80211 to print out + very verbose TKIP debugging messages. It should not + be selected on production systems as those messages + are remotely triggerable. + + Do not select this option. + +config MAC80211_IBSS_DEBUG + bool "Verbose IBSS debugging" + depends on MAC80211_DEBUG_MENU + ---help--- + Selecting this option causes mac80211 to print out + very verbose IBSS debugging messages. It should not + be selected on production systems as those messages + are remotely triggerable. + + Do not select this option. + +config MAC80211_VERBOSE_PS_DEBUG + bool "Verbose powersave mode debugging" + depends on MAC80211_DEBUG_MENU + ---help--- + Selecting this option causes mac80211 to print out very + verbose power save mode debugging messages (when mac80211 + is an AP and has power saving stations.) + It should not be selected on production systems as those + messages are remotely triggerable. + + Do not select this option. + +config MAC80211_VERBOSE_MPL_DEBUG + bool "Verbose mesh peer link debugging" + depends on MAC80211_DEBUG_MENU + depends on MAC80211_MESH + ---help--- + Selecting this option causes mac80211 to print out very + verbose mesh peer link debugging messages (when mac80211 + is taking part in a mesh network). + It should not be selected on production systems as those + messages are remotely triggerable. + + Do not select this option. + +config MAC80211_VERBOSE_MHWMP_DEBUG + bool "Verbose mesh HWMP routing debugging" + depends on MAC80211_DEBUG_MENU + depends on MAC80211_MESH + ---help--- + Selecting this option causes mac80211 to print out very + verbose mesh routing (HWMP) debugging messages (when mac80211 + is taking part in a mesh network). + It should not be selected on production systems as those + messages are remotely triggerable. + + Do not select this option. 
+ +config MAC80211_DEBUG_COUNTERS + bool "Extra statistics for TX/RX debugging" + depends on MAC80211_DEBUG_MENU + depends on MAC80211_DEBUGFS + ---help--- + Selecting this option causes mac80211 to keep additional + and very verbose statistics about TX and RX handler use + and show them in debugfs. + + If unsure, say N. + +config MAC80211_DRIVER_API_TRACER + bool "Driver API tracer" + depends on MAC80211_DEBUG_MENU + depends on EVENT_TRACING + help + Say Y here to make mac80211 register with the ftrace + framework for the driver API -- you can then see which + driver methods it is calling and which API functions + drivers are calling by looking at the trace. + + If unsure, say Y. diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile new file mode 100644 index 00000000..fdb54e61 --- /dev/null +++ b/net/mac80211/Makefile @@ -0,0 +1,61 @@ +obj-$(CONFIG_MAC80211) += mac80211.o + +# mac80211 objects +mac80211-y := \ + main.o status.o \ + sta_info.o \ + wep.o \ + wpa.o \ + scan.o offchannel.o \ + ht.o agg-tx.o agg-rx.o \ + ibss.o \ + mlme.o work.o \ + iface.o \ + rate.o \ + michael.o \ + tkip.o \ + aes_ccm.o \ + aes_cmac.o \ + cfg.o \ + rx.o \ + spectmgmt.o \ + tx.o \ + key.o \ + util.o \ + wme.o \ + event.o \ + chan.o + +mac80211-$(CONFIG_MAC80211_LEDS) += led.o +mac80211-$(CONFIG_MAC80211_DEBUGFS) += \ + debugfs.o \ + debugfs_sta.o \ + debugfs_netdev.o \ + debugfs_key.o + +mac80211-$(CONFIG_MAC80211_MESH) += \ + mesh.o \ + mesh_pathtbl.o \ + mesh_plink.o \ + mesh_hwmp.o + +mac80211-$(CONFIG_PM) += pm.o + +mac80211-$(CONFIG_MAC80211_DRIVER_API_TRACER) += driver-trace.o +CFLAGS_driver-trace.o := -I$(src) + +# objects for PID algorithm +rc80211_pid-y := rc80211_pid_algo.o +rc80211_pid-$(CONFIG_MAC80211_DEBUGFS) += rc80211_pid_debugfs.o + +rc80211_minstrel-y := rc80211_minstrel.o +rc80211_minstrel-$(CONFIG_MAC80211_DEBUGFS) += rc80211_minstrel_debugfs.o + +rc80211_minstrel_ht-y := rc80211_minstrel_ht.o +rc80211_minstrel_ht-$(CONFIG_MAC80211_DEBUGFS) += rc80211_minstrel_ht_debugfs.o + +mac80211-$(CONFIG_MAC80211_RC_PID) += $(rc80211_pid-y) +mac80211-$(CONFIG_MAC80211_RC_MINSTREL) += $(rc80211_minstrel-y) +mac80211-$(CONFIG_MAC80211_RC_MINSTREL_HT) += $(rc80211_minstrel_ht-y) + +ccflags-y += -D__CHECK_ENDIAN__ diff --git a/net/mac80211/aes_ccm.c b/net/mac80211/aes_ccm.c new file mode 100644 index 00000000..b9b595c0 --- /dev/null +++ b/net/mac80211/aes_ccm.c @@ -0,0 +1,149 @@ +/* + * Copyright 2003-2004, Instant802 Networks, Inc. + * Copyright 2005-2006, Devicescape Software, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include + +#include +#include "key.h" +#include "aes_ccm.h" + +static void aes_ccm_prepare(struct crypto_cipher *tfm, u8 *scratch, u8 *a) +{ + int i; + u8 *b_0, *aad, *b, *s_0; + + b_0 = scratch + 3 * AES_BLOCK_LEN; + aad = scratch + 4 * AES_BLOCK_LEN; + b = scratch; + s_0 = scratch + AES_BLOCK_LEN; + + crypto_cipher_encrypt_one(tfm, b, b_0); + + /* Extra Authenticate-only data (always two AES blocks) */ + for (i = 0; i < AES_BLOCK_LEN; i++) + aad[i] ^= b[i]; + crypto_cipher_encrypt_one(tfm, b, aad); + + aad += AES_BLOCK_LEN; + + for (i = 0; i < AES_BLOCK_LEN; i++) + aad[i] ^= b[i]; + crypto_cipher_encrypt_one(tfm, a, aad); + + /* Mask out bits from auth-only-b_0 */ + b_0[0] &= 0x07; + + /* S_0 is used to encrypt T (= MIC) */ + b_0[14] = 0; + b_0[15] = 0; + crypto_cipher_encrypt_one(tfm, s_0, b_0); +} + + +void ieee80211_aes_ccm_encrypt(struct crypto_cipher *tfm, u8 *scratch, + u8 *data, size_t data_len, + u8 *cdata, u8 *mic) +{ + int i, j, last_len, num_blocks; + u8 *pos, *cpos, *b, *s_0, *e, *b_0; + + b = scratch; + s_0 = scratch + AES_BLOCK_LEN; + e = scratch + 2 * AES_BLOCK_LEN; + b_0 = scratch + 3 * AES_BLOCK_LEN; + + num_blocks = DIV_ROUND_UP(data_len, AES_BLOCK_LEN); + last_len = data_len % AES_BLOCK_LEN; + aes_ccm_prepare(tfm, scratch, b); + + /* Process payload blocks */ + pos = data; + cpos = cdata; + for (j = 1; j <= num_blocks; j++) { + int blen = (j == num_blocks && last_len) ? + last_len : AES_BLOCK_LEN; + + /* Authentication followed by encryption */ + for (i = 0; i < blen; i++) + b[i] ^= pos[i]; + crypto_cipher_encrypt_one(tfm, b, b); + + b_0[14] = (j >> 8) & 0xff; + b_0[15] = j & 0xff; + crypto_cipher_encrypt_one(tfm, e, b_0); + for (i = 0; i < blen; i++) + *cpos++ = *pos++ ^ e[i]; + } + + for (i = 0; i < CCMP_MIC_LEN; i++) + mic[i] = b[i] ^ s_0[i]; +} + + +int ieee80211_aes_ccm_decrypt(struct crypto_cipher *tfm, u8 *scratch, + u8 *cdata, size_t data_len, u8 *mic, u8 *data) +{ + int i, j, last_len, num_blocks; + u8 *pos, *cpos, *b, *s_0, *a, *b_0; + + b = scratch; + s_0 = scratch + AES_BLOCK_LEN; + a = scratch + 2 * AES_BLOCK_LEN; + b_0 = scratch + 3 * AES_BLOCK_LEN; + + num_blocks = DIV_ROUND_UP(data_len, AES_BLOCK_LEN); + last_len = data_len % AES_BLOCK_LEN; + aes_ccm_prepare(tfm, scratch, a); + + /* Process payload blocks */ + cpos = cdata; + pos = data; + for (j = 1; j <= num_blocks; j++) { + int blen = (j == num_blocks && last_len) ? + last_len : AES_BLOCK_LEN; + + /* Decryption followed by authentication */ + b_0[14] = (j >> 8) & 0xff; + b_0[15] = j & 0xff; + crypto_cipher_encrypt_one(tfm, b, b_0); + for (i = 0; i < blen; i++) { + *pos = *cpos++ ^ b[i]; + a[i] ^= *pos++; + } + crypto_cipher_encrypt_one(tfm, a, a); + } + + for (i = 0; i < CCMP_MIC_LEN; i++) { + if ((mic[i] ^ s_0[i]) != a[i]) + return -1; + } + + return 0; +} + + +struct crypto_cipher *ieee80211_aes_key_setup_encrypt(const u8 key[]) +{ + struct crypto_cipher *tfm; + + tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); + if (!IS_ERR(tfm)) + crypto_cipher_setkey(tfm, key, ALG_CCMP_KEY_LEN); + + return tfm; +} + + +void ieee80211_aes_key_free(struct crypto_cipher *tfm) +{ + crypto_free_cipher(tfm); +} diff --git a/net/mac80211/aes_ccm.h b/net/mac80211/aes_ccm.h new file mode 100644 index 00000000..6e7820ef --- /dev/null +++ b/net/mac80211/aes_ccm.h @@ -0,0 +1,26 @@ +/* + * Copyright 2003-2004, Instant802 Networks, Inc. + * Copyright 2006, Devicescape Software, Inc. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef AES_CCM_H +#define AES_CCM_H + +#include + +#define AES_BLOCK_LEN 16 + +struct crypto_cipher *ieee80211_aes_key_setup_encrypt(const u8 key[]); +void ieee80211_aes_ccm_encrypt(struct crypto_cipher *tfm, u8 *scratch, + u8 *data, size_t data_len, + u8 *cdata, u8 *mic); +int ieee80211_aes_ccm_decrypt(struct crypto_cipher *tfm, u8 *scratch, + u8 *cdata, size_t data_len, + u8 *mic, u8 *data); +void ieee80211_aes_key_free(struct crypto_cipher *tfm); + +#endif /* AES_CCM_H */ diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c new file mode 100644 index 00000000..d502b268 --- /dev/null +++ b/net/mac80211/aes_cmac.c @@ -0,0 +1,132 @@ +/* + * AES-128-CMAC with TLen 16 for IEEE 802.11w BIP + * Copyright 2008, Jouni Malinen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#include +#include "key.h" +#include "aes_cmac.h" + +#define AES_BLOCK_SIZE 16 +#define AES_CMAC_KEY_LEN 16 +#define CMAC_TLEN 8 /* CMAC TLen = 64 bits (8 octets) */ +#define AAD_LEN 20 + + +static void gf_mulx(u8 *pad) +{ + int i, carry; + + carry = pad[0] & 0x80; + for (i = 0; i < AES_BLOCK_SIZE - 1; i++) + pad[i] = (pad[i] << 1) | (pad[i + 1] >> 7); + pad[AES_BLOCK_SIZE - 1] <<= 1; + if (carry) + pad[AES_BLOCK_SIZE - 1] ^= 0x87; +} + + +static void aes_128_cmac_vector(struct crypto_cipher *tfm, u8 *scratch, + size_t num_elem, + const u8 *addr[], const size_t *len, u8 *mac) +{ + u8 *cbc, *pad; + const u8 *pos, *end; + size_t i, e, left, total_len; + + cbc = scratch; + pad = scratch + AES_BLOCK_SIZE; + + memset(cbc, 0, AES_BLOCK_SIZE); + + total_len = 0; + for (e = 0; e < num_elem; e++) + total_len += len[e]; + left = total_len; + + e = 0; + pos = addr[0]; + end = pos + len[0]; + + while (left >= AES_BLOCK_SIZE) { + for (i = 0; i < AES_BLOCK_SIZE; i++) { + cbc[i] ^= *pos++; + if (pos >= end) { + e++; + pos = addr[e]; + end = pos + len[e]; + } + } + if (left > AES_BLOCK_SIZE) + crypto_cipher_encrypt_one(tfm, cbc, cbc); + left -= AES_BLOCK_SIZE; + } + + memset(pad, 0, AES_BLOCK_SIZE); + crypto_cipher_encrypt_one(tfm, pad, pad); + gf_mulx(pad); + + if (left || total_len == 0) { + for (i = 0; i < left; i++) { + cbc[i] ^= *pos++; + if (pos >= end) { + e++; + pos = addr[e]; + end = pos + len[e]; + } + } + cbc[left] ^= 0x80; + gf_mulx(pad); + } + + for (i = 0; i < AES_BLOCK_SIZE; i++) + pad[i] ^= cbc[i]; + crypto_cipher_encrypt_one(tfm, pad, pad); + memcpy(mac, pad, CMAC_TLEN); +} + + +void ieee80211_aes_cmac(struct crypto_cipher *tfm, u8 *scratch, const u8 *aad, + const u8 *data, size_t data_len, u8 *mic) +{ + const u8 *addr[3]; + size_t len[3]; + u8 zero[CMAC_TLEN]; + + memset(zero, 0, CMAC_TLEN); + addr[0] = aad; + len[0] = AAD_LEN; + addr[1] = data; + len[1] = data_len - CMAC_TLEN; + addr[2] = zero; + len[2] = CMAC_TLEN; + + aes_128_cmac_vector(tfm, scratch, 3, addr, len, mic); +} + + +struct crypto_cipher * ieee80211_aes_cmac_key_setup(const u8 key[]) +{ + struct crypto_cipher *tfm; + + tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); + if (!IS_ERR(tfm)) + crypto_cipher_setkey(tfm, key, AES_CMAC_KEY_LEN); + + return tfm; +} + + +void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm) +{ + crypto_free_cipher(tfm); +} 
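
gf_mulx() above is the doubling step in GF(2^128) used when deriving the CMAC subkey: shift the 16-byte block left by one bit and, if the top bit fell off, XOR the reduction constant 0x87 into the last byte. A standalone copy of the helper with a hand-checkable input:

/*
 * Standalone copy of the GF(2^128) doubling used by the CMAC code above,
 * applied to a block with only the top bit set so the reduction is easy
 * to verify by hand (output is 0000...0087).
 */
#include <stdio.h>

#define BLOCK 16

static void gf_mulx(unsigned char *pad)
{
        int i, carry;

        carry = pad[0] & 0x80;
        for (i = 0; i < BLOCK - 1; i++)
                pad[i] = (pad[i] << 1) | (pad[i + 1] >> 7);
        pad[BLOCK - 1] <<= 1;
        if (carry)
                pad[BLOCK - 1] ^= 0x87;
}

int main(void)
{
        unsigned char b[BLOCK] = { 0x80 };      /* only the top bit set */

        gf_mulx(b);
        for (int i = 0; i < BLOCK; i++)
                printf("%02x", b[i]);
        printf("\n");
        return 0;
}
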
diff --git a/net/mac80211/aes_cmac.h b/net/mac80211/aes_cmac.h new file mode 100644 index 00000000..0eb9a483 --- /dev/null +++ b/net/mac80211/aes_cmac.h @@ -0,0 +1,19 @@ +/* + * Copyright 2008, Jouni Malinen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef AES_CMAC_H +#define AES_CMAC_H + +#include + +struct crypto_cipher * ieee80211_aes_cmac_key_setup(const u8 key[]); +void ieee80211_aes_cmac(struct crypto_cipher *tfm, u8 *scratch, const u8 *aad, + const u8 *data, size_t data_len, u8 *mic); +void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm); + +#endif /* AES_CMAC_H */ diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c new file mode 100644 index 00000000..1a41b142 --- /dev/null +++ b/net/mac80211/agg-rx.c @@ -0,0 +1,328 @@ +/* + * HT handling + * + * Copyright 2003, Jouni Malinen + * Copyright 2002-2005, Instant802 Networks, Inc. + * Copyright 2005-2006, Devicescape Software, Inc. + * Copyright 2006-2007 Jiri Benc + * Copyright 2007, Michael Wu + * Copyright 2007-2010, Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/** + * DOC: RX A-MPDU aggregation + * + * Aggregation on the RX side requires only implementing the + * @ampdu_action callback that is invoked to start/stop any + * block-ack sessions for RX aggregation. + * + * When RX aggregation is started by the peer, the driver is + * notified via @ampdu_action function, with the + * %IEEE80211_AMPDU_RX_START action, and may reject the request + * in which case a negative response is sent to the peer, if it + * accepts it a positive response is sent. + * + * While the session is active, the device/driver are required + * to de-aggregate frames and pass them up one by one to mac80211, + * which will handle the reorder buffer. + * + * When the aggregation session is stopped again by the peer or + * ourselves, the driver's @ampdu_action function will be called + * with the action %IEEE80211_AMPDU_RX_STOP. In this case, the + * call must not fail. 
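+ *
+ * As an illustration only (the driver fragment is hypothetical; the
+ * action values and the return convention are the ones described
+ * above), a driver whose hardware needs no per-session setup can
+ * simply accept both RX actions:
+ *
+ *	case IEEE80211_AMPDU_RX_START:
+ *	case IEEE80211_AMPDU_RX_STOP:
+ *		return 0;	/* accept; mac80211 handles the reorder buffer */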
+ */ + +#include +#include +#include +#include "ieee80211_i.h" +#include "driver-ops.h" + +static void ieee80211_free_tid_rx(struct rcu_head *h) +{ + struct tid_ampdu_rx *tid_rx = + container_of(h, struct tid_ampdu_rx, rcu_head); + int i; + + del_timer_sync(&tid_rx->reorder_timer); + + for (i = 0; i < tid_rx->buf_size; i++) + dev_kfree_skb(tid_rx->reorder_buf[i]); + kfree(tid_rx->reorder_buf); + kfree(tid_rx->reorder_time); + kfree(tid_rx); +} + +void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, + u16 initiator, u16 reason, bool tx) +{ + struct ieee80211_local *local = sta->local; + struct tid_ampdu_rx *tid_rx; + + lockdep_assert_held(&sta->ampdu_mlme.mtx); + + tid_rx = rcu_dereference_protected(sta->ampdu_mlme.tid_rx[tid], + lockdep_is_held(&sta->ampdu_mlme.mtx)); + + if (!tid_rx) + return; + + rcu_assign_pointer(sta->ampdu_mlme.tid_rx[tid], NULL); + +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Rx BA session stop requested for %pM tid %u\n", + sta->sta.addr, tid); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + + if (drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_STOP, + &sta->sta, tid, NULL, 0)) + printk(KERN_DEBUG "HW problem - can not stop rx " + "aggregation for tid %d\n", tid); + + /* check if this is a self generated aggregation halt */ + if (initiator == WLAN_BACK_RECIPIENT && tx) + ieee80211_send_delba(sta->sdata, sta->sta.addr, + tid, 0, reason); + + del_timer_sync(&tid_rx->session_timer); + + call_rcu(&tid_rx->rcu_head, ieee80211_free_tid_rx); +} + +void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, + u16 initiator, u16 reason, bool tx) +{ + mutex_lock(&sta->ampdu_mlme.mtx); + ___ieee80211_stop_rx_ba_session(sta, tid, initiator, reason, tx); + mutex_unlock(&sta->ampdu_mlme.mtx); +} + +/* + * After accepting the AddBA Request we activated a timer, + * resetting it after each frame that arrives from the originator. 
+ */ +static void sta_rx_agg_session_timer_expired(unsigned long data) +{ + /* not an elegant detour, but there is no choice as the timer passes + * only one argument, and various sta_info are needed here, so init + * flow in sta_info_create gives the TID as data, while the timer_to_id + * array gives the sta through container_of */ + u8 *ptid = (u8 *)data; + u8 *timer_to_id = ptid - *ptid; + struct sta_info *sta = container_of(timer_to_id, struct sta_info, + timer_to_tid[0]); + +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "rx session timer expired on tid %d\n", (u16)*ptid); +#endif + set_bit(*ptid, sta->ampdu_mlme.tid_rx_timer_expired); + ieee80211_queue_work(&sta->local->hw, &sta->ampdu_mlme.work); +} + +static void sta_rx_agg_reorder_timer_expired(unsigned long data) +{ + u8 *ptid = (u8 *)data; + u8 *timer_to_id = ptid - *ptid; + struct sta_info *sta = container_of(timer_to_id, struct sta_info, + timer_to_tid[0]); + + rcu_read_lock(); + ieee80211_release_reorder_timeout(sta, *ptid); + rcu_read_unlock(); +} + +static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid, + u8 dialog_token, u16 status, u16 policy, + u16 buf_size, u16 timeout) +{ + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct ieee80211_mgmt *mgmt; + u16 capab; + + skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom); + + if (!skb) { + printk(KERN_DEBUG "%s: failed to allocate buffer " + "for addba resp frame\n", sdata->name); + return; + } + + skb_reserve(skb, local->hw.extra_tx_headroom); + mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24); + memset(mgmt, 0, 24); + memcpy(mgmt->da, da, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + if (sdata->vif.type == NL80211_IFTYPE_AP || + sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); + else if (sdata->vif.type == NL80211_IFTYPE_STATION) + memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN); + + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + + skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_resp)); + mgmt->u.action.category = WLAN_CATEGORY_BACK; + mgmt->u.action.u.addba_resp.action_code = WLAN_ACTION_ADDBA_RESP; + mgmt->u.action.u.addba_resp.dialog_token = dialog_token; + + capab = (u16)(policy << 1); /* bit 1 aggregation policy */ + capab |= (u16)(tid << 2); /* bit 5:2 TID number */ + capab |= (u16)(buf_size << 6); /* bit 15:6 max size of aggregation */ + + mgmt->u.action.u.addba_resp.capab = cpu_to_le16(capab); + mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout); + mgmt->u.action.u.addba_resp.status = cpu_to_le16(status); + + ieee80211_tx_skb(sdata, skb); +} + +void ieee80211_process_addba_request(struct ieee80211_local *local, + struct sta_info *sta, + struct ieee80211_mgmt *mgmt, + size_t len) +{ + struct tid_ampdu_rx *tid_agg_rx; + u16 capab, tid, timeout, ba_policy, buf_size, start_seq_num, status; + u8 dialog_token; + int ret = -EOPNOTSUPP; + + /* extract session parameters from addba request frame */ + dialog_token = mgmt->u.action.u.addba_req.dialog_token; + timeout = le16_to_cpu(mgmt->u.action.u.addba_req.timeout); + start_seq_num = + le16_to_cpu(mgmt->u.action.u.addba_req.start_seq_num) >> 4; + + capab = le16_to_cpu(mgmt->u.action.u.addba_req.capab); + ba_policy = (capab & IEEE80211_ADDBA_PARAM_POLICY_MASK) >> 1; + tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; + buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6; + + status = WLAN_STATUS_REQUEST_DECLINED; + + if 
(test_sta_flags(sta, WLAN_STA_BLOCK_BA)) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Suspend in progress. " + "Denying ADDBA request\n"); +#endif + goto end_no_lock; + } + + /* sanity check for incoming parameters: + * check if configuration can support the BA policy + * and if buffer size does not exceeds max value */ + /* XXX: check own ht delayed BA capability?? */ + if (((ba_policy != 1) && + (!(sta->sta.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA))) || + (buf_size > IEEE80211_MAX_AMPDU_BUF)) { + status = WLAN_STATUS_INVALID_QOS_PARAM; +#ifdef CONFIG_MAC80211_HT_DEBUG + if (net_ratelimit()) + printk(KERN_DEBUG "AddBA Req with bad params from " + "%pM on tid %u. policy %d, buffer size %d\n", + mgmt->sa, tid, ba_policy, + buf_size); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + goto end_no_lock; + } + /* determine default buffer size */ + if (buf_size == 0) + buf_size = IEEE80211_MAX_AMPDU_BUF; + + /* make sure the size doesn't exceed the maximum supported by the hw */ + if (buf_size > local->hw.max_rx_aggregation_subframes) + buf_size = local->hw.max_rx_aggregation_subframes; + + /* examine state machine */ + mutex_lock(&sta->ampdu_mlme.mtx); + + if (sta->ampdu_mlme.tid_rx[tid]) { +#ifdef CONFIG_MAC80211_HT_DEBUG + if (net_ratelimit()) + printk(KERN_DEBUG "unexpected AddBA Req from " + "%pM on tid %u\n", + mgmt->sa, tid); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + goto end; + } + + /* prepare A-MPDU MLME for Rx aggregation */ + tid_agg_rx = kmalloc(sizeof(struct tid_ampdu_rx), GFP_KERNEL); + if (!tid_agg_rx) { +#ifdef CONFIG_MAC80211_HT_DEBUG + if (net_ratelimit()) + printk(KERN_ERR "allocate rx mlme to tid %d failed\n", + tid); +#endif + goto end; + } + + spin_lock_init(&tid_agg_rx->reorder_lock); + + /* rx timer */ + tid_agg_rx->session_timer.function = sta_rx_agg_session_timer_expired; + tid_agg_rx->session_timer.data = (unsigned long)&sta->timer_to_tid[tid]; + init_timer(&tid_agg_rx->session_timer); + + /* rx reorder timer */ + tid_agg_rx->reorder_timer.function = sta_rx_agg_reorder_timer_expired; + tid_agg_rx->reorder_timer.data = (unsigned long)&sta->timer_to_tid[tid]; + init_timer(&tid_agg_rx->reorder_timer); + + /* prepare reordering buffer */ + tid_agg_rx->reorder_buf = + kcalloc(buf_size, sizeof(struct sk_buff *), GFP_KERNEL); + tid_agg_rx->reorder_time = + kcalloc(buf_size, sizeof(unsigned long), GFP_KERNEL); + if (!tid_agg_rx->reorder_buf || !tid_agg_rx->reorder_time) { +#ifdef CONFIG_MAC80211_HT_DEBUG + if (net_ratelimit()) + printk(KERN_ERR "can not allocate reordering buffer " + "to tid %d\n", tid); +#endif + kfree(tid_agg_rx->reorder_buf); + kfree(tid_agg_rx->reorder_time); + kfree(tid_agg_rx); + goto end; + } + + ret = drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_START, + &sta->sta, tid, &start_seq_num, 0); +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Rx A-MPDU request on tid %d result %d\n", tid, ret); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + + if (ret) { + kfree(tid_agg_rx->reorder_buf); + kfree(tid_agg_rx->reorder_time); + kfree(tid_agg_rx); + goto end; + } + + /* update data */ + tid_agg_rx->dialog_token = dialog_token; + tid_agg_rx->ssn = start_seq_num; + tid_agg_rx->head_seq_num = start_seq_num; + tid_agg_rx->buf_size = buf_size; + tid_agg_rx->timeout = timeout; + tid_agg_rx->stored_mpdu_num = 0; + status = WLAN_STATUS_SUCCESS; + + /* activate it for RX */ + rcu_assign_pointer(sta->ampdu_mlme.tid_rx[tid], tid_agg_rx); + + if (timeout) + mod_timer(&tid_agg_rx->session_timer, TU_TO_EXP_TIME(timeout)); + +end: + mutex_unlock(&sta->ampdu_mlme.mtx); 
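+	/* fall through: the ADDBA response is sent without holding the mutex */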
+ +end_no_lock: + ieee80211_send_addba_resp(sta->sdata, sta->sta.addr, tid, + dialog_token, status, 1, buf_size, timeout); +} diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c new file mode 100644 index 00000000..b7f4f5c1 --- /dev/null +++ b/net/mac80211/agg-tx.c @@ -0,0 +1,842 @@ +/* + * HT handling + * + * Copyright 2003, Jouni Malinen + * Copyright 2002-2005, Instant802 Networks, Inc. + * Copyright 2005-2006, Devicescape Software, Inc. + * Copyright 2006-2007 Jiri Benc + * Copyright 2007, Michael Wu + * Copyright 2007-2010, Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include "ieee80211_i.h" +#include "driver-ops.h" +#include "wme.h" + +/** + * DOC: TX A-MPDU aggregation + * + * Aggregation on the TX side requires setting the hardware flag + * %IEEE80211_HW_AMPDU_AGGREGATION. The driver will then be handed + * packets with a flag indicating A-MPDU aggregation. The driver + * or device is responsible for actually aggregating the frames, + * as well as deciding how many and which to aggregate. + * + * When TX aggregation is started by some subsystem (usually the rate + * control algorithm would be appropriate) by calling the + * ieee80211_start_tx_ba_session() function, the driver will be + * notified via its @ampdu_action function, with the + * %IEEE80211_AMPDU_TX_START action. + * + * In response to that, the driver is later required to call the + * ieee80211_start_tx_ba_cb_irqsafe() function, which will really + * start the aggregation session after the peer has also responded. + * If the peer responds negatively, the session will be stopped + * again right away. Note that it is possible for the aggregation + * session to be stopped before the driver has indicated that it + * is done setting it up, in which case it must not indicate the + * setup completion. + * + * Also note that, since we also need to wait for a response from + * the peer, the driver is notified of the completion of the + * handshake by the %IEEE80211_AMPDU_TX_OPERATIONAL action to the + * @ampdu_action callback. + * + * Similarly, when the aggregation session is stopped by the peer + * or something calling ieee80211_stop_tx_ba_session(), the driver's + * @ampdu_action function will be called with the action + * %IEEE80211_AMPDU_TX_STOP. In this case, the call must not fail, + * and the driver must later call ieee80211_stop_tx_ba_cb_irqsafe(). 
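+ *
+ * Purely as an illustration (the switch fragment below is a
+ * hypothetical driver; only the action values and the two
+ * _cb_irqsafe() calls are real mac80211 interfaces), the minimal
+ * TX-side handling looks like:
+ *
+ *	case IEEE80211_AMPDU_TX_START:
+ *		... prepare hardware state for the session ...
+ *		ieee80211_start_tx_ba_cb_irqsafe(vif, sta->addr, tid);
+ *		return 0;
+ *	case IEEE80211_AMPDU_TX_OPERATIONAL:
+ *		... enable aggregation for this TID, using buf_size ...
+ *		return 0;
+ *	case IEEE80211_AMPDU_TX_STOP:
+ *		... flush pending frames, tear the session down ...
+ *		ieee80211_stop_tx_ba_cb_irqsafe(vif, sta->addr, tid);
+ *		return 0;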
+ */ + +static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, + const u8 *da, u16 tid, + u8 dialog_token, u16 start_seq_num, + u16 agg_size, u16 timeout) +{ + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct ieee80211_mgmt *mgmt; + u16 capab; + + skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom); + + if (!skb) { + printk(KERN_ERR "%s: failed to allocate buffer " + "for addba request frame\n", sdata->name); + return; + } + skb_reserve(skb, local->hw.extra_tx_headroom); + mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24); + memset(mgmt, 0, 24); + memcpy(mgmt->da, da, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + if (sdata->vif.type == NL80211_IFTYPE_AP || + sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); + else if (sdata->vif.type == NL80211_IFTYPE_STATION) + memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN); + + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + + skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_req)); + + mgmt->u.action.category = WLAN_CATEGORY_BACK; + mgmt->u.action.u.addba_req.action_code = WLAN_ACTION_ADDBA_REQ; + + mgmt->u.action.u.addba_req.dialog_token = dialog_token; + capab = (u16)(1 << 1); /* bit 1 aggregation policy */ + capab |= (u16)(tid << 2); /* bit 5:2 TID number */ + capab |= (u16)(agg_size << 6); /* bit 15:6 max size of aggergation */ + + mgmt->u.action.u.addba_req.capab = cpu_to_le16(capab); + + mgmt->u.action.u.addba_req.timeout = cpu_to_le16(timeout); + mgmt->u.action.u.addba_req.start_seq_num = + cpu_to_le16(start_seq_num << 4); + + ieee80211_tx_skb(sdata, skb); +} + +void ieee80211_send_bar(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid, u16 ssn) +{ + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct ieee80211_bar *bar; + u16 bar_control = 0; + + skb = dev_alloc_skb(sizeof(*bar) + local->hw.extra_tx_headroom); + if (!skb) { + printk(KERN_ERR "%s: failed to allocate buffer for " + "bar frame\n", sdata->name); + return; + } + skb_reserve(skb, local->hw.extra_tx_headroom); + bar = (struct ieee80211_bar *)skb_put(skb, sizeof(*bar)); + memset(bar, 0, sizeof(*bar)); + bar->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL | + IEEE80211_STYPE_BACK_REQ); + memcpy(bar->ra, ra, ETH_ALEN); + memcpy(bar->ta, sdata->vif.addr, ETH_ALEN); + bar_control |= (u16)IEEE80211_BAR_CTRL_ACK_POLICY_NORMAL; + bar_control |= (u16)IEEE80211_BAR_CTRL_CBMTID_COMPRESSED_BA; + bar_control |= (u16)(tid << 12); + bar->control = cpu_to_le16(bar_control); + bar->start_seq_num = cpu_to_le16(ssn); + + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + ieee80211_tx_skb(sdata, skb); +} + +void ieee80211_assign_tid_tx(struct sta_info *sta, int tid, + struct tid_ampdu_tx *tid_tx) +{ + lockdep_assert_held(&sta->ampdu_mlme.mtx); + lockdep_assert_held(&sta->lock); + rcu_assign_pointer(sta->ampdu_mlme.tid_tx[tid], tid_tx); +} + +int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, + enum ieee80211_back_parties initiator, + bool tx) +{ + struct ieee80211_local *local = sta->local; + struct tid_ampdu_tx *tid_tx; + int ret; + + lockdep_assert_held(&sta->ampdu_mlme.mtx); + + spin_lock_bh(&sta->lock); + + tid_tx = rcu_dereference_protected_tid_tx(sta, tid); + if (!tid_tx) { + spin_unlock_bh(&sta->lock); + return -ENOENT; + } + + /* if we're already stopping ignore any new requests to stop */ + if (test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) { + spin_unlock_bh(&sta->lock); + 
return -EALREADY; + } + + if (test_bit(HT_AGG_STATE_WANT_START, &tid_tx->state)) { + /* not even started yet! */ + ieee80211_assign_tid_tx(sta, tid, NULL); + spin_unlock_bh(&sta->lock); + kfree_rcu(tid_tx, rcu_head); + return 0; + } + + set_bit(HT_AGG_STATE_STOPPING, &tid_tx->state); + + spin_unlock_bh(&sta->lock); + +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Tx BA session stop requested for %pM tid %u\n", + sta->sta.addr, tid); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + + del_timer_sync(&tid_tx->addba_resp_timer); + + /* + * After this packets are no longer handed right through + * to the driver but are put onto tid_tx->pending instead, + * with locking to ensure proper access. + */ + clear_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state); + + /* + * There might be a few packets being processed right now (on + * another CPU) that have already gotten past the aggregation + * check when it was still OPERATIONAL and consequently have + * IEEE80211_TX_CTL_AMPDU set. In that case, this code might + * call into the driver at the same time or even before the + * TX paths calls into it, which could confuse the driver. + * + * Wait for all currently running TX paths to finish before + * telling the driver. New packets will not go through since + * the aggregation session is no longer OPERATIONAL. + */ + synchronize_net(); + + tid_tx->stop_initiator = initiator; + tid_tx->tx_stop = tx; + + ret = drv_ampdu_action(local, sta->sdata, + IEEE80211_AMPDU_TX_STOP, + &sta->sta, tid, NULL, 0); + + /* HW shall not deny going back to legacy */ + if (WARN_ON(ret)) { + /* + * We may have pending packets get stuck in this case... + * Not bothering with a workaround for now. + */ + } + + return ret; +} + +/* + * After sending add Block Ack request we activated a timer until + * add Block Ack response will arrive from the recipient. + * If this timer expires sta_addba_resp_timer_expired will be executed. + */ +static void sta_addba_resp_timer_expired(unsigned long data) +{ + /* not an elegant detour, but there is no choice as the timer passes + * only one argument, and both sta_info and TID are needed, so init + * flow in sta_info_create gives the TID as data, while the timer_to_id + * array gives the sta through container_of */ + u16 tid = *(u8 *)data; + struct sta_info *sta = container_of((void *)data, + struct sta_info, timer_to_tid[tid]); + struct tid_ampdu_tx *tid_tx; + + /* check if the TID waits for addBA response */ + rcu_read_lock(); + tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); + if (!tid_tx || + test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state)) { + rcu_read_unlock(); +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "timer expired on tid %d but we are not " + "(or no longer) expecting addBA response there\n", + tid); +#endif + return; + } + +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "addBA response timer expired on tid %d\n", tid); +#endif + + ieee80211_stop_tx_ba_session(&sta->sta, tid); + rcu_read_unlock(); +} + +static inline int ieee80211_ac_from_tid(int tid) +{ + return ieee802_1d_to_ac[tid & 7]; +} + +/* + * When multiple aggregation sessions on multiple stations + * are being created/destroyed simultaneously, we need to + * refcount the global queue stop caused by that in order + * to not get into a situation where one of the aggregation + * setup or teardown re-enables queues before the other is + * ready to handle that. + * + * These two functions take care of this issue by keeping + * a global "agg_queue_stop" refcount. 
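+ *
+ * For example, if sessions on two stations stop the same AC queue in
+ * quick succession, the queue is woken again only after both have
+ * dropped their reference and the counter returns to zero.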
+ */ +static void __acquires(agg_queue) +ieee80211_stop_queue_agg(struct ieee80211_local *local, int tid) +{ + int queue = ieee80211_ac_from_tid(tid); + + if (atomic_inc_return(&local->agg_queue_stop[queue]) == 1) + ieee80211_stop_queue_by_reason( + &local->hw, queue, + IEEE80211_QUEUE_STOP_REASON_AGGREGATION); + __acquire(agg_queue); +} + +static void __releases(agg_queue) +ieee80211_wake_queue_agg(struct ieee80211_local *local, int tid) +{ + int queue = ieee80211_ac_from_tid(tid); + + if (atomic_dec_return(&local->agg_queue_stop[queue]) == 0) + ieee80211_wake_queue_by_reason( + &local->hw, queue, + IEEE80211_QUEUE_STOP_REASON_AGGREGATION); + __release(agg_queue); +} + +/* + * splice packets from the STA's pending to the local pending, + * requires a call to ieee80211_agg_splice_finish later + */ +static void __acquires(agg_queue) +ieee80211_agg_splice_packets(struct ieee80211_local *local, + struct tid_ampdu_tx *tid_tx, u16 tid) +{ + int queue = ieee80211_ac_from_tid(tid); + unsigned long flags; + + ieee80211_stop_queue_agg(local, tid); + + if (WARN(!tid_tx, "TID %d gone but expected when splicing aggregates" + " from the pending queue\n", tid)) + return; + + if (!skb_queue_empty(&tid_tx->pending)) { + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + /* copy over remaining packets */ + skb_queue_splice_tail_init(&tid_tx->pending, + &local->pending[queue]); + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); + } +} + +static void __releases(agg_queue) +ieee80211_agg_splice_finish(struct ieee80211_local *local, u16 tid) +{ + ieee80211_wake_queue_agg(local, tid); +} + +void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) +{ + struct tid_ampdu_tx *tid_tx; + struct ieee80211_local *local = sta->local; + struct ieee80211_sub_if_data *sdata = sta->sdata; + u16 start_seq_num; + int ret; + + tid_tx = rcu_dereference_protected_tid_tx(sta, tid); + + /* + * Start queuing up packets for this aggregation session. + * We're going to release them once the driver is OK with + * that. + */ + clear_bit(HT_AGG_STATE_WANT_START, &tid_tx->state); + + /* + * Make sure no packets are being processed. This ensures that + * we have a valid starting sequence number and that in-flight + * packets have been flushed out and no packets for this TID + * will go into the driver during the ampdu_action call. 
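+ *
+ * (synchronize_net() below provides that: it waits out any TX path
+ * still inside its RCU read-side critical section before we call
+ * the driver.)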
+ */ + synchronize_net(); + + start_seq_num = sta->tid_seq[tid] >> 4; + + ret = drv_ampdu_action(local, sdata, IEEE80211_AMPDU_TX_START, + &sta->sta, tid, &start_seq_num, 0); + if (ret) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "BA request denied - HW unavailable for" + " tid %d\n", tid); +#endif + spin_lock_bh(&sta->lock); + ieee80211_agg_splice_packets(local, tid_tx, tid); + ieee80211_assign_tid_tx(sta, tid, NULL); + ieee80211_agg_splice_finish(local, tid); + spin_unlock_bh(&sta->lock); + + kfree_rcu(tid_tx, rcu_head); + return; + } + + /* activate the timer for the recipient's addBA response */ + mod_timer(&tid_tx->addba_resp_timer, jiffies + ADDBA_RESP_INTERVAL); +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "activated addBA response timer on tid %d\n", tid); +#endif + + spin_lock_bh(&sta->lock); + sta->ampdu_mlme.addba_req_num[tid]++; + spin_unlock_bh(&sta->lock); + + /* send AddBA request */ + ieee80211_send_addba_request(sdata, sta->sta.addr, tid, + tid_tx->dialog_token, start_seq_num, + local->hw.max_tx_aggregation_subframes, + tid_tx->timeout); +} + +int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, + u16 timeout) +{ + struct sta_info *sta = container_of(pubsta, struct sta_info, sta); + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_local *local = sdata->local; + struct tid_ampdu_tx *tid_tx; + int ret = 0; + + trace_api_start_tx_ba_session(pubsta, tid); + + if (WARN_ON(!local->ops->ampdu_action)) + return -EINVAL; + + if ((tid >= STA_TID_NUM) || + !(local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION)) + return -EINVAL; + +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Open BA session requested for %pM tid %u\n", + pubsta->addr, tid); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + + /* + * The aggregation code is not prepared to handle + * anything but STA/AP due to the BSSID handling. + * IBSS could work in the code but isn't supported + * by drivers or the standard. + */ + if (sdata->vif.type != NL80211_IFTYPE_STATION && + sdata->vif.type != NL80211_IFTYPE_AP_VLAN && + sdata->vif.type != NL80211_IFTYPE_AP) + return -EINVAL; + + if (test_sta_flags(sta, WLAN_STA_BLOCK_BA)) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "BA sessions blocked. 
" + "Denying BA session request\n"); +#endif + return -EINVAL; + } + + spin_lock_bh(&sta->lock); + + /* we have tried too many times, receiver does not want A-MPDU */ + if (sta->ampdu_mlme.addba_req_num[tid] > HT_AGG_MAX_RETRIES) { + ret = -EBUSY; + goto err_unlock_sta; + } + + tid_tx = rcu_dereference_protected_tid_tx(sta, tid); + /* check if the TID is not in aggregation flow already */ + if (tid_tx || sta->ampdu_mlme.tid_start_tx[tid]) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "BA request denied - session is not " + "idle on tid %u\n", tid); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + ret = -EAGAIN; + goto err_unlock_sta; + } + + /* prepare A-MPDU MLME for Tx aggregation */ + tid_tx = kzalloc(sizeof(struct tid_ampdu_tx), GFP_ATOMIC); + if (!tid_tx) { +#ifdef CONFIG_MAC80211_HT_DEBUG + if (net_ratelimit()) + printk(KERN_ERR "allocate tx mlme to tid %d failed\n", + tid); +#endif + ret = -ENOMEM; + goto err_unlock_sta; + } + + skb_queue_head_init(&tid_tx->pending); + __set_bit(HT_AGG_STATE_WANT_START, &tid_tx->state); + + tid_tx->timeout = timeout; + + /* Tx timer */ + tid_tx->addba_resp_timer.function = sta_addba_resp_timer_expired; + tid_tx->addba_resp_timer.data = (unsigned long)&sta->timer_to_tid[tid]; + init_timer(&tid_tx->addba_resp_timer); + + /* assign a dialog token */ + sta->ampdu_mlme.dialog_token_allocator++; + tid_tx->dialog_token = sta->ampdu_mlme.dialog_token_allocator; + + /* + * Finally, assign it to the start array; the work item will + * collect it and move it to the normal array. + */ + sta->ampdu_mlme.tid_start_tx[tid] = tid_tx; + + ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work); + + /* this flow continues off the work */ + err_unlock_sta: + spin_unlock_bh(&sta->lock); + return ret; +} +EXPORT_SYMBOL(ieee80211_start_tx_ba_session); + +static void ieee80211_agg_tx_operational(struct ieee80211_local *local, + struct sta_info *sta, u16 tid) +{ + struct tid_ampdu_tx *tid_tx; + + lockdep_assert_held(&sta->ampdu_mlme.mtx); + + tid_tx = rcu_dereference_protected_tid_tx(sta, tid); + +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Aggregation is on for tid %d\n", tid); +#endif + + drv_ampdu_action(local, sta->sdata, + IEEE80211_AMPDU_TX_OPERATIONAL, + &sta->sta, tid, NULL, tid_tx->buf_size); + + /* + * synchronize with TX path, while splicing the TX path + * should block so it won't put more packets onto pending. + */ + spin_lock_bh(&sta->lock); + + ieee80211_agg_splice_packets(local, tid_tx, tid); + /* + * Now mark as operational. This will be visible + * in the TX path, and lets it go lock-free in + * the common case. 
+ */ + set_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state); + ieee80211_agg_splice_finish(local, tid); + + spin_unlock_bh(&sta->lock); +} + +void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + struct tid_ampdu_tx *tid_tx; + + trace_api_start_tx_ba_cb(sdata, ra, tid); + + if (tid >= STA_TID_NUM) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Bad TID value: tid = %d (>= %d)\n", + tid, STA_TID_NUM); +#endif + return; + } + + mutex_lock(&local->sta_mtx); + sta = sta_info_get(sdata, ra); + if (!sta) { + mutex_unlock(&local->sta_mtx); +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Could not find station: %pM\n", ra); +#endif + return; + } + + mutex_lock(&sta->ampdu_mlme.mtx); + tid_tx = rcu_dereference_protected_tid_tx(sta, tid); + + if (WARN_ON(!tid_tx)) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "addBA was not requested!\n"); +#endif + goto unlock; + } + + if (WARN_ON(test_and_set_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state))) + goto unlock; + + if (test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state)) + ieee80211_agg_tx_operational(local, sta, tid); + + unlock: + mutex_unlock(&sta->ampdu_mlme.mtx); + mutex_unlock(&local->sta_mtx); +} + +void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, + const u8 *ra, u16 tid) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_local *local = sdata->local; + struct ieee80211_ra_tid *ra_tid; + struct sk_buff *skb = dev_alloc_skb(0); + + if (unlikely(!skb)) { +#ifdef CONFIG_MAC80211_HT_DEBUG + if (net_ratelimit()) + printk(KERN_WARNING "%s: Not enough memory, " + "dropping start BA session", sdata->name); +#endif + return; + } + ra_tid = (struct ieee80211_ra_tid *) &skb->cb; + memcpy(&ra_tid->ra, ra, ETH_ALEN); + ra_tid->tid = tid; + + skb->pkt_type = IEEE80211_SDATA_QUEUE_AGG_START; + skb_queue_tail(&sdata->skb_queue, skb); + ieee80211_queue_work(&local->hw, &sdata->work); +} +EXPORT_SYMBOL(ieee80211_start_tx_ba_cb_irqsafe); + +int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, + enum ieee80211_back_parties initiator, + bool tx) +{ + int ret; + + mutex_lock(&sta->ampdu_mlme.mtx); + + ret = ___ieee80211_stop_tx_ba_session(sta, tid, initiator, tx); + + mutex_unlock(&sta->ampdu_mlme.mtx); + + return ret; +} + +int ieee80211_stop_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid) +{ + struct sta_info *sta = container_of(pubsta, struct sta_info, sta); + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_local *local = sdata->local; + struct tid_ampdu_tx *tid_tx; + int ret = 0; + + trace_api_stop_tx_ba_session(pubsta, tid); + + if (!local->ops->ampdu_action) + return -EINVAL; + + if (tid >= STA_TID_NUM) + return -EINVAL; + + spin_lock_bh(&sta->lock); + tid_tx = rcu_dereference_protected_tid_tx(sta, tid); + + if (!tid_tx) { + ret = -ENOENT; + goto unlock; + } + + if (test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) { + /* already in progress stopping it */ + ret = 0; + goto unlock; + } + + set_bit(HT_AGG_STATE_WANT_STOP, &tid_tx->state); + ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work); + + unlock: + spin_unlock_bh(&sta->lock); + return ret; +} +EXPORT_SYMBOL(ieee80211_stop_tx_ba_session); + +void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + 
struct tid_ampdu_tx *tid_tx; + + trace_api_stop_tx_ba_cb(sdata, ra, tid); + + if (tid >= STA_TID_NUM) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Bad TID value: tid = %d (>= %d)\n", + tid, STA_TID_NUM); +#endif + return; + } + +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Stopping Tx BA session for %pM tid %d\n", + ra, tid); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + + mutex_lock(&local->sta_mtx); + + sta = sta_info_get(sdata, ra); + if (!sta) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Could not find station: %pM\n", ra); +#endif + goto unlock; + } + + mutex_lock(&sta->ampdu_mlme.mtx); + spin_lock_bh(&sta->lock); + tid_tx = rcu_dereference_protected_tid_tx(sta, tid); + + if (!tid_tx || !test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "unexpected callback to A-MPDU stop\n"); +#endif + goto unlock_sta; + } + + if (tid_tx->stop_initiator == WLAN_BACK_INITIATOR && tid_tx->tx_stop) + ieee80211_send_delba(sta->sdata, ra, tid, + WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE); + + /* + * When we get here, the TX path will not be lockless any more wrt. + * aggregation, since the OPERATIONAL bit has long been cleared. + * Thus it will block on getting the lock, if it occurs. So if we + * stop the queue now, we will not get any more packets, and any + * that might be being processed will wait for us here, thereby + * guaranteeing that no packets go to the tid_tx pending queue any + * more. + */ + + ieee80211_agg_splice_packets(local, tid_tx, tid); + + /* future packets must not find the tid_tx struct any more */ + ieee80211_assign_tid_tx(sta, tid, NULL); + + ieee80211_agg_splice_finish(local, tid); + + kfree_rcu(tid_tx, rcu_head); + + unlock_sta: + spin_unlock_bh(&sta->lock); + mutex_unlock(&sta->ampdu_mlme.mtx); + unlock: + mutex_unlock(&local->sta_mtx); +} + +void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, + const u8 *ra, u16 tid) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_local *local = sdata->local; + struct ieee80211_ra_tid *ra_tid; + struct sk_buff *skb = dev_alloc_skb(0); + + if (unlikely(!skb)) { +#ifdef CONFIG_MAC80211_HT_DEBUG + if (net_ratelimit()) + printk(KERN_WARNING "%s: Not enough memory, " + "dropping stop BA session", sdata->name); +#endif + return; + } + ra_tid = (struct ieee80211_ra_tid *) &skb->cb; + memcpy(&ra_tid->ra, ra, ETH_ALEN); + ra_tid->tid = tid; + + skb->pkt_type = IEEE80211_SDATA_QUEUE_AGG_STOP; + skb_queue_tail(&sdata->skb_queue, skb); + ieee80211_queue_work(&local->hw, &sdata->work); +} +EXPORT_SYMBOL(ieee80211_stop_tx_ba_cb_irqsafe); + + +void ieee80211_process_addba_resp(struct ieee80211_local *local, + struct sta_info *sta, + struct ieee80211_mgmt *mgmt, + size_t len) +{ + struct tid_ampdu_tx *tid_tx; + u16 capab, tid; + u8 buf_size; + + capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab); + tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; + buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6; + + mutex_lock(&sta->ampdu_mlme.mtx); + + tid_tx = rcu_dereference_protected_tid_tx(sta, tid); + if (!tid_tx) + goto out; + + if (mgmt->u.action.u.addba_resp.dialog_token != tid_tx->dialog_token) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "wrong addBA response token, tid %d\n", tid); +#endif + goto out; + } + + del_timer_sync(&tid_tx->addba_resp_timer); + +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "switched off addBA timer for tid %d\n", tid); +#endif + + /* + * addba_resp_timer may have fired 
before we got here, and + * caused WANT_STOP to be set. If the stop then was already + * processed further, STOPPING might be set. + */ + if (test_bit(HT_AGG_STATE_WANT_STOP, &tid_tx->state) || + test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) { +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG + "got addBA resp for tid %d but we already gave up\n", + tid); +#endif + goto out; + } + + if (le16_to_cpu(mgmt->u.action.u.addba_resp.status) + == WLAN_STATUS_SUCCESS) { + /* + * IEEE 802.11-2007 7.3.1.14: + * In an ADDBA Response frame, when the Status Code field + * is set to 0, the Buffer Size subfield is set to a value + * of at least 1. + */ + if (!buf_size) + goto out; + + if (test_and_set_bit(HT_AGG_STATE_RESPONSE_RECEIVED, + &tid_tx->state)) { + /* ignore duplicate response */ + goto out; + } + + tid_tx->buf_size = buf_size; + + if (test_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state)) + ieee80211_agg_tx_operational(local, sta, tid); + + sta->ampdu_mlme.addba_req_num[tid] = 0; + } else { + ___ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_INITIATOR, + true); + } + + out: + mutex_unlock(&sta->ampdu_mlme.mtx); +} diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c new file mode 100644 index 00000000..143a0064 --- /dev/null +++ b/net/mac80211/cfg.c @@ -0,0 +1,2149 @@ +/* + * mac80211 configuration hooks for cfg80211 + * + * Copyright 2006-2010 Johannes Berg + * + * This file is GPLv2 as found in COPYING. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "ieee80211_i.h" +#include "driver-ops.h" +#include "cfg.h" +#include "rate.h" +#include "mesh.h" + +static struct net_device *ieee80211_add_iface(struct wiphy *wiphy, char *name, + enum nl80211_iftype type, + u32 *flags, + struct vif_params *params) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + struct net_device *dev; + struct ieee80211_sub_if_data *sdata; + int err; + + err = ieee80211_if_add(local, name, &dev, type, params); + if (err) + return ERR_PTR(err); + + if (type == NL80211_IFTYPE_MONITOR && flags) { + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + sdata->u.mntr_flags = *flags; + } + + return dev; +} + +static int ieee80211_del_iface(struct wiphy *wiphy, struct net_device *dev) +{ + ieee80211_if_remove(IEEE80211_DEV_TO_SUB_IF(dev)); + + return 0; +} + +static int ieee80211_change_iface(struct wiphy *wiphy, + struct net_device *dev, + enum nl80211_iftype type, u32 *flags, + struct vif_params *params) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + int ret; + + ret = ieee80211_if_change_type(sdata, type); + if (ret) + return ret; + + if (type == NL80211_IFTYPE_AP_VLAN && + params && params->use_4addr == 0) + rcu_assign_pointer(sdata->u.vlan.sta, NULL); + else if (type == NL80211_IFTYPE_STATION && + params && params->use_4addr >= 0) + sdata->u.mgd.use_4addr = params->use_4addr; + + if (sdata->vif.type == NL80211_IFTYPE_MONITOR && flags) { + struct ieee80211_local *local = sdata->local; + + if (ieee80211_sdata_running(sdata)) { + /* + * Prohibit MONITOR_FLAG_COOK_FRAMES to be + * changed while the interface is up. 
+ * Else we would need to add a lot of cruft + * to update everything: + * cooked_mntrs, monitor and all fif_* counters + * reconfigure hardware + */ + if ((*flags & MONITOR_FLAG_COOK_FRAMES) != + (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES)) + return -EBUSY; + + ieee80211_adjust_monitor_flags(sdata, -1); + sdata->u.mntr_flags = *flags; + ieee80211_adjust_monitor_flags(sdata, 1); + + ieee80211_configure_filter(local); + } else { + /* + * Because the interface is down, ieee80211_do_stop + * and ieee80211_do_open take care of "everything" + * mentioned in the comment above. + */ + sdata->u.mntr_flags = *flags; + } + } + + return 0; +} + +static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, + u8 key_idx, bool pairwise, const u8 *mac_addr, + struct key_params *params) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct sta_info *sta = NULL; + struct ieee80211_key *key; + int err; + + if (!ieee80211_sdata_running(sdata)) + return -ENETDOWN; + + /* reject WEP and TKIP keys if WEP failed to initialize */ + switch (params->cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_TKIP: + case WLAN_CIPHER_SUITE_WEP104: + if (IS_ERR(sdata->local->wep_tx_tfm)) + return -EINVAL; + break; + default: + break; + } + + key = ieee80211_key_alloc(params->cipher, key_idx, params->key_len, + params->key, params->seq_len, params->seq); + if (IS_ERR(key)) + return PTR_ERR(key); + + if (pairwise) + key->conf.flags |= IEEE80211_KEY_FLAG_PAIRWISE; + + mutex_lock(&sdata->local->sta_mtx); + + if (mac_addr) { + if (ieee80211_vif_is_mesh(&sdata->vif)) + sta = sta_info_get(sdata, mac_addr); + else + sta = sta_info_get_bss(sdata, mac_addr); + if (!sta) { + ieee80211_key_free(sdata->local, key); + err = -ENOENT; + goto out_unlock; + } + } + + err = ieee80211_key_link(key, sdata, sta); + if (err) + ieee80211_key_free(sdata->local, key); + + out_unlock: + mutex_unlock(&sdata->local->sta_mtx); + + return err; +} + +static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev, + u8 key_idx, bool pairwise, const u8 *mac_addr) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + struct ieee80211_key *key = NULL; + int ret; + + mutex_lock(&local->sta_mtx); + mutex_lock(&local->key_mtx); + + if (mac_addr) { + ret = -ENOENT; + + sta = sta_info_get_bss(sdata, mac_addr); + if (!sta) + goto out_unlock; + + if (pairwise) + key = key_mtx_dereference(local, sta->ptk); + else + key = key_mtx_dereference(local, sta->gtk[key_idx]); + } else + key = key_mtx_dereference(local, sdata->keys[key_idx]); + + if (!key) { + ret = -ENOENT; + goto out_unlock; + } + + __ieee80211_key_free(key); + + ret = 0; + out_unlock: + mutex_unlock(&local->key_mtx); + mutex_unlock(&local->sta_mtx); + + return ret; +} + +static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, + u8 key_idx, bool pairwise, const u8 *mac_addr, + void *cookie, + void (*callback)(void *cookie, + struct key_params *params)) +{ + struct ieee80211_sub_if_data *sdata; + struct sta_info *sta = NULL; + u8 seq[6] = {0}; + struct key_params params; + struct ieee80211_key *key = NULL; + u32 iv32; + u16 iv16; + int err = -ENOENT; + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + rcu_read_lock(); + + if (mac_addr) { + sta = sta_info_get_bss(sdata, mac_addr); + if (!sta) + goto out; + + if (pairwise) + key = rcu_dereference(sta->ptk); + else if (key_idx < NUM_DEFAULT_KEYS) + key = rcu_dereference(sta->gtk[key_idx]); 
+ } else + key = rcu_dereference(sdata->keys[key_idx]); + + if (!key) + goto out; + + memset(¶ms, 0, sizeof(params)); + + params.cipher = key->conf.cipher; + + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_TKIP: + iv32 = key->u.tkip.tx.iv32; + iv16 = key->u.tkip.tx.iv16; + + if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) + drv_get_tkip_seq(sdata->local, + key->conf.hw_key_idx, + &iv32, &iv16); + + seq[0] = iv16 & 0xff; + seq[1] = (iv16 >> 8) & 0xff; + seq[2] = iv32 & 0xff; + seq[3] = (iv32 >> 8) & 0xff; + seq[4] = (iv32 >> 16) & 0xff; + seq[5] = (iv32 >> 24) & 0xff; + params.seq = seq; + params.seq_len = 6; + break; + case WLAN_CIPHER_SUITE_CCMP: + seq[0] = key->u.ccmp.tx_pn[5]; + seq[1] = key->u.ccmp.tx_pn[4]; + seq[2] = key->u.ccmp.tx_pn[3]; + seq[3] = key->u.ccmp.tx_pn[2]; + seq[4] = key->u.ccmp.tx_pn[1]; + seq[5] = key->u.ccmp.tx_pn[0]; + params.seq = seq; + params.seq_len = 6; + break; + case WLAN_CIPHER_SUITE_AES_CMAC: + seq[0] = key->u.aes_cmac.tx_pn[5]; + seq[1] = key->u.aes_cmac.tx_pn[4]; + seq[2] = key->u.aes_cmac.tx_pn[3]; + seq[3] = key->u.aes_cmac.tx_pn[2]; + seq[4] = key->u.aes_cmac.tx_pn[1]; + seq[5] = key->u.aes_cmac.tx_pn[0]; + params.seq = seq; + params.seq_len = 6; + break; + } + + params.key = key->conf.key; + params.key_len = key->conf.keylen; + + callback(cookie, ¶ms); + err = 0; + + out: + rcu_read_unlock(); + return err; +} + +static int ieee80211_config_default_key(struct wiphy *wiphy, + struct net_device *dev, + u8 key_idx, bool uni, + bool multi) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + ieee80211_set_default_key(sdata, key_idx, uni, multi); + + return 0; +} + +static int ieee80211_config_default_mgmt_key(struct wiphy *wiphy, + struct net_device *dev, + u8 key_idx) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + ieee80211_set_default_mgmt_key(sdata, key_idx); + + return 0; +} + +static void rate_idx_to_bitrate(struct rate_info *rate, struct sta_info *sta, int idx) +{ + if (!(rate->flags & RATE_INFO_FLAGS_MCS)) { + struct ieee80211_supported_band *sband; + sband = sta->local->hw.wiphy->bands[ + sta->local->hw.conf.channel->band]; + rate->legacy = sband->bitrates[idx].bitrate; + } else + rate->mcs = idx; +} + +static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct timespec uptime; + + sinfo->generation = sdata->local->sta_generation; + + sinfo->filled = STATION_INFO_INACTIVE_TIME | + STATION_INFO_RX_BYTES | + STATION_INFO_TX_BYTES | + STATION_INFO_RX_PACKETS | + STATION_INFO_TX_PACKETS | + STATION_INFO_TX_RETRIES | + STATION_INFO_TX_FAILED | + STATION_INFO_TX_BITRATE | + STATION_INFO_RX_BITRATE | + STATION_INFO_RX_DROP_MISC | + STATION_INFO_BSS_PARAM | + STATION_INFO_CONNECTED_TIME; + + do_posix_clock_monotonic_gettime(&uptime); + sinfo->connected_time = uptime.tv_sec - sta->last_connected; + + sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx); + sinfo->rx_bytes = sta->rx_bytes; + sinfo->tx_bytes = sta->tx_bytes; + sinfo->rx_packets = sta->rx_packets; + sinfo->tx_packets = sta->tx_packets; + sinfo->tx_retries = sta->tx_retry_count; + sinfo->tx_failed = sta->tx_retry_failed; + sinfo->rx_dropped_misc = sta->rx_dropped; + + if ((sta->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) || + (sta->local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)) { + sinfo->filled |= STATION_INFO_SIGNAL | STATION_INFO_SIGNAL_AVG; + sinfo->signal = (s8)sta->last_signal; + sinfo->signal_avg = (s8) -ewma_read(&sta->avg_signal); + } + 
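+	/* report the last TX/RX rates; HT rates are reported as MCS indices */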
+ sinfo->txrate.flags = 0; + if (sta->last_tx_rate.flags & IEEE80211_TX_RC_MCS) + sinfo->txrate.flags |= RATE_INFO_FLAGS_MCS; + if (sta->last_tx_rate.flags & IEEE80211_TX_RC_40_MHZ_WIDTH) + sinfo->txrate.flags |= RATE_INFO_FLAGS_40_MHZ_WIDTH; + if (sta->last_tx_rate.flags & IEEE80211_TX_RC_SHORT_GI) + sinfo->txrate.flags |= RATE_INFO_FLAGS_SHORT_GI; + rate_idx_to_bitrate(&sinfo->txrate, sta, sta->last_tx_rate.idx); + + sinfo->rxrate.flags = 0; + if (sta->last_rx_rate_flag & RX_FLAG_HT) + sinfo->rxrate.flags |= RATE_INFO_FLAGS_MCS; + if (sta->last_rx_rate_flag & RX_FLAG_40MHZ) + sinfo->rxrate.flags |= RATE_INFO_FLAGS_40_MHZ_WIDTH; + if (sta->last_rx_rate_flag & RX_FLAG_SHORT_GI) + sinfo->rxrate.flags |= RATE_INFO_FLAGS_SHORT_GI; + rate_idx_to_bitrate(&sinfo->rxrate, sta, sta->last_rx_rate_idx); + + if (ieee80211_vif_is_mesh(&sdata->vif)) { +#ifdef CONFIG_MAC80211_MESH + sinfo->filled |= STATION_INFO_LLID | + STATION_INFO_PLID | + STATION_INFO_PLINK_STATE; + + sinfo->llid = le16_to_cpu(sta->llid); + sinfo->plid = le16_to_cpu(sta->plid); + sinfo->plink_state = sta->plink_state; +#endif + } + + sinfo->bss_param.flags = 0; + if (sdata->vif.bss_conf.use_cts_prot) + sinfo->bss_param.flags |= BSS_PARAM_FLAGS_CTS_PROT; + if (sdata->vif.bss_conf.use_short_preamble) + sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_PREAMBLE; + if (sdata->vif.bss_conf.use_short_slot) + sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_SLOT_TIME; + sinfo->bss_param.dtim_period = sdata->local->hw.conf.ps_dtim_period; + sinfo->bss_param.beacon_interval = sdata->vif.bss_conf.beacon_int; +} + + +static int ieee80211_dump_station(struct wiphy *wiphy, struct net_device *dev, + int idx, u8 *mac, struct station_info *sinfo) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct sta_info *sta; + int ret = -ENOENT; + + rcu_read_lock(); + + sta = sta_info_get_by_idx(sdata, idx); + if (sta) { + ret = 0; + memcpy(mac, sta->sta.addr, ETH_ALEN); + sta_set_sinfo(sta, sinfo); + } + + rcu_read_unlock(); + + return ret; +} + +static int ieee80211_dump_survey(struct wiphy *wiphy, struct net_device *dev, + int idx, struct survey_info *survey) +{ + struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); + + return drv_get_survey(local, idx, survey); +} + +static int ieee80211_get_station(struct wiphy *wiphy, struct net_device *dev, + u8 *mac, struct station_info *sinfo) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct sta_info *sta; + int ret = -ENOENT; + + rcu_read_lock(); + + sta = sta_info_get_bss(sdata, mac); + if (sta) { + ret = 0; + sta_set_sinfo(sta, sinfo); + } + + rcu_read_unlock(); + + return ret; +} + +/* + * This handles both adding a beacon and setting new beacon info + */ +static int ieee80211_config_beacon(struct ieee80211_sub_if_data *sdata, + struct beacon_parameters *params) +{ + struct beacon_data *new, *old; + int new_head_len, new_tail_len; + int size; + int err = -EINVAL; + + old = rtnl_dereference(sdata->u.ap.beacon); + + /* head must not be zero-length */ + if (params->head && !params->head_len) + return -EINVAL; + + /* + * This is a kludge. beacon interval should really be part + * of the beacon information. 
+ */ + if (params->interval && + (sdata->vif.bss_conf.beacon_int != params->interval)) { + sdata->vif.bss_conf.beacon_int = params->interval; + ieee80211_bss_info_change_notify(sdata, + BSS_CHANGED_BEACON_INT); + } + + /* Need to have a beacon head if we don't have one yet */ + if (!params->head && !old) + return err; + + /* sorry, no way to start beaconing without dtim period */ + if (!params->dtim_period && !old) + return err; + + /* new or old head? */ + if (params->head) + new_head_len = params->head_len; + else + new_head_len = old->head_len; + + /* new or old tail? */ + if (params->tail || !old) + /* params->tail_len will be zero for !params->tail */ + new_tail_len = params->tail_len; + else + new_tail_len = old->tail_len; + + size = sizeof(*new) + new_head_len + new_tail_len; + + new = kzalloc(size, GFP_KERNEL); + if (!new) + return -ENOMEM; + + /* start filling the new info now */ + + /* new or old dtim period? */ + if (params->dtim_period) + new->dtim_period = params->dtim_period; + else + new->dtim_period = old->dtim_period; + + /* + * pointers go into the block we allocated, + * memory is | beacon_data | head | tail | + */ + new->head = ((u8 *) new) + sizeof(*new); + new->tail = new->head + new_head_len; + new->head_len = new_head_len; + new->tail_len = new_tail_len; + + /* copy in head */ + if (params->head) + memcpy(new->head, params->head, new_head_len); + else + memcpy(new->head, old->head, new_head_len); + + /* copy in optional tail */ + if (params->tail) + memcpy(new->tail, params->tail, new_tail_len); + else + if (old) + memcpy(new->tail, old->tail, new_tail_len); + + sdata->vif.bss_conf.dtim_period = new->dtim_period; + + rcu_assign_pointer(sdata->u.ap.beacon, new); + + synchronize_rcu(); + + kfree(old); + + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED | + BSS_CHANGED_BEACON); + return 0; +} + +static int ieee80211_add_beacon(struct wiphy *wiphy, struct net_device *dev, + struct beacon_parameters *params) +{ + struct ieee80211_sub_if_data *sdata; + struct beacon_data *old; + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + old = rtnl_dereference(sdata->u.ap.beacon); + if (old) + return -EALREADY; + + return ieee80211_config_beacon(sdata, params); +} + +static int ieee80211_set_beacon(struct wiphy *wiphy, struct net_device *dev, + struct beacon_parameters *params) +{ + struct ieee80211_sub_if_data *sdata; + struct beacon_data *old; + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + old = rtnl_dereference(sdata->u.ap.beacon); + if (!old) + return -ENOENT; + + return ieee80211_config_beacon(sdata, params); +} + +static int ieee80211_del_beacon(struct wiphy *wiphy, struct net_device *dev) +{ + struct ieee80211_sub_if_data *sdata; + struct beacon_data *old; + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + old = rtnl_dereference(sdata->u.ap.beacon); + if (!old) + return -ENOENT; + + rcu_assign_pointer(sdata->u.ap.beacon, NULL); + synchronize_rcu(); + kfree(old); + + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED); + return 0; +} + +/* Layer 2 Update frame (802.2 Type 1 LLC XID Update response) */ +struct iapp_layer2_update { + u8 da[ETH_ALEN]; /* broadcast */ + u8 sa[ETH_ALEN]; /* STA addr */ + __be16 len; /* 6 */ + u8 dsap; /* 0 */ + u8 ssap; /* 0 */ + u8 control; + u8 xid_info[3]; +} __packed; + +static void ieee80211_send_layer2_update(struct sta_info *sta) +{ + struct iapp_layer2_update *msg; + struct sk_buff *skb; + + /* Send Level 2 Update Frame to update forwarding tables in layer 2 + * bridge devices */ + + skb = 
dev_alloc_skb(sizeof(*msg)); + if (!skb) + return; + msg = (struct iapp_layer2_update *)skb_put(skb, sizeof(*msg)); + + /* 802.2 Type 1 Logical Link Control (LLC) Exchange Identifier (XID) + * Update response frame; IEEE Std 802.2-1998, 5.4.1.2.1 */ + + memset(msg->da, 0xff, ETH_ALEN); + memcpy(msg->sa, sta->sta.addr, ETH_ALEN); + msg->len = htons(6); + msg->dsap = 0; + msg->ssap = 0x01; /* NULL LSAP, CR Bit: Response */ + msg->control = 0xaf; /* XID response lsb.1111F101. + * F=0 (no poll command; unsolicited frame) */ + msg->xid_info[0] = 0x81; /* XID format identifier */ + msg->xid_info[1] = 1; /* LLC types/classes: Type 1 LLC */ + msg->xid_info[2] = 0; /* XID sender's receive window size (RW) */ + + skb->dev = sta->sdata->dev; + skb->protocol = eth_type_trans(skb, sta->sdata->dev); + memset(skb->cb, 0, sizeof(skb->cb)); + netif_rx_ni(skb); +} + +static void sta_apply_parameters(struct ieee80211_local *local, + struct sta_info *sta, + struct station_parameters *params) +{ + unsigned long flags; + u32 rates; + int i, j; + struct ieee80211_supported_band *sband; + struct ieee80211_sub_if_data *sdata = sta->sdata; + u32 mask, set; + + sband = local->hw.wiphy->bands[local->oper_channel->band]; + + spin_lock_irqsave(&sta->flaglock, flags); + mask = params->sta_flags_mask; + set = params->sta_flags_set; + + if (mask & BIT(NL80211_STA_FLAG_AUTHORIZED)) { + sta->flags &= ~WLAN_STA_AUTHORIZED; + if (set & BIT(NL80211_STA_FLAG_AUTHORIZED)) + sta->flags |= WLAN_STA_AUTHORIZED; + } + + if (mask & BIT(NL80211_STA_FLAG_SHORT_PREAMBLE)) { + sta->flags &= ~WLAN_STA_SHORT_PREAMBLE; + if (set & BIT(NL80211_STA_FLAG_SHORT_PREAMBLE)) + sta->flags |= WLAN_STA_SHORT_PREAMBLE; + } + + if (mask & BIT(NL80211_STA_FLAG_WME)) { + sta->flags &= ~WLAN_STA_WME; + if (set & BIT(NL80211_STA_FLAG_WME)) + sta->flags |= WLAN_STA_WME; + } + + if (mask & BIT(NL80211_STA_FLAG_MFP)) { + sta->flags &= ~WLAN_STA_MFP; + if (set & BIT(NL80211_STA_FLAG_MFP)) + sta->flags |= WLAN_STA_MFP; + } + + if (mask & BIT(NL80211_STA_FLAG_AUTHENTICATED)) { + sta->flags &= ~WLAN_STA_AUTH; + if (set & BIT(NL80211_STA_FLAG_AUTHENTICATED)) + sta->flags |= WLAN_STA_AUTH; + } + spin_unlock_irqrestore(&sta->flaglock, flags); + + /* + * cfg80211 validates this (1-2007) and allows setting the AID + * only when creating a new station entry + */ + if (params->aid) + sta->sta.aid = params->aid; + + /* + * FIXME: updating the following information is racy when this + * function is called from ieee80211_change_station(). + * However, all this information should be static so + * maybe we should just reject attemps to change it. 
+ */ + + if (params->listen_interval >= 0) + sta->listen_interval = params->listen_interval; + + if (params->supported_rates) { + rates = 0; + + for (i = 0; i < params->supported_rates_len; i++) { + int rate = (params->supported_rates[i] & 0x7f) * 5; + for (j = 0; j < sband->n_bitrates; j++) { + if (sband->bitrates[j].bitrate == rate) + rates |= BIT(j); + } + } + sta->sta.supp_rates[local->oper_channel->band] = rates; + } + + if (params->ht_capa) + ieee80211_ht_cap_ie_to_sta_ht_cap(sband, + params->ht_capa, + &sta->sta.ht_cap); + + if (ieee80211_vif_is_mesh(&sdata->vif)) { +#ifdef CONFIG_MAC80211_MESH + if (sdata->u.mesh.security & IEEE80211_MESH_SEC_SECURED) + switch (params->plink_state) { + case NL80211_PLINK_LISTEN: + case NL80211_PLINK_ESTAB: + case NL80211_PLINK_BLOCKED: + sta->plink_state = params->plink_state; + break; + default: + /* nothing */ + break; + } + else + switch (params->plink_action) { + case PLINK_ACTION_OPEN: + mesh_plink_open(sta); + break; + case PLINK_ACTION_BLOCK: + mesh_plink_block(sta); + break; + } +#endif + } +} + +static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, + u8 *mac, struct station_parameters *params) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + struct sta_info *sta; + struct ieee80211_sub_if_data *sdata; + int err; + int layer2_update; + + if (params->vlan) { + sdata = IEEE80211_DEV_TO_SUB_IF(params->vlan); + + if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN && + sdata->vif.type != NL80211_IFTYPE_AP) + return -EINVAL; + } else + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + if (compare_ether_addr(mac, sdata->vif.addr) == 0) + return -EINVAL; + + if (is_multicast_ether_addr(mac)) + return -EINVAL; + + sta = sta_info_alloc(sdata, mac, GFP_KERNEL); + if (!sta) + return -ENOMEM; + + sta->flags = WLAN_STA_AUTH | WLAN_STA_ASSOC; + + sta_apply_parameters(local, sta, params); + + rate_control_rate_init(sta); + + layer2_update = sdata->vif.type == NL80211_IFTYPE_AP_VLAN || + sdata->vif.type == NL80211_IFTYPE_AP; + + err = sta_info_insert_rcu(sta); + if (err) { + rcu_read_unlock(); + return err; + } + + if (layer2_update) + ieee80211_send_layer2_update(sta); + + rcu_read_unlock(); + + return 0; +} + +static int ieee80211_del_station(struct wiphy *wiphy, struct net_device *dev, + u8 *mac) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + struct ieee80211_sub_if_data *sdata; + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + if (mac) + return sta_info_destroy_addr_bss(sdata, mac); + + sta_info_flush(local, sdata); + return 0; +} + +static int ieee80211_change_station(struct wiphy *wiphy, + struct net_device *dev, + u8 *mac, + struct station_parameters *params) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = wiphy_priv(wiphy); + struct sta_info *sta; + struct ieee80211_sub_if_data *vlansdata; + + rcu_read_lock(); + + sta = sta_info_get_bss(sdata, mac); + if (!sta) { + rcu_read_unlock(); + return -ENOENT; + } + + if (params->vlan && params->vlan != sta->sdata->dev) { + vlansdata = IEEE80211_DEV_TO_SUB_IF(params->vlan); + + if (vlansdata->vif.type != NL80211_IFTYPE_AP_VLAN && + vlansdata->vif.type != NL80211_IFTYPE_AP) { + rcu_read_unlock(); + return -EINVAL; + } + + if (params->vlan->ieee80211_ptr->use_4addr) { + if (vlansdata->u.vlan.sta) { + rcu_read_unlock(); + return -EBUSY; + } + + rcu_assign_pointer(vlansdata->u.vlan.sta, sta); + } + + sta->sdata = vlansdata; + ieee80211_send_layer2_update(sta); + } + + sta_apply_parameters(local, sta, params); + + 
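+	/* sta was looked up under RCU; drop the read lock before recalculating PS */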
rcu_read_unlock(); + + if (sdata->vif.type == NL80211_IFTYPE_STATION && + params->sta_flags_mask & BIT(NL80211_STA_FLAG_AUTHORIZED)) + ieee80211_recalc_ps(local, -1); + + return 0; +} + +#ifdef CONFIG_MAC80211_MESH +static int ieee80211_add_mpath(struct wiphy *wiphy, struct net_device *dev, + u8 *dst, u8 *next_hop) +{ + struct ieee80211_sub_if_data *sdata; + struct mesh_path *mpath; + struct sta_info *sta; + int err; + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + rcu_read_lock(); + sta = sta_info_get(sdata, next_hop); + if (!sta) { + rcu_read_unlock(); + return -ENOENT; + } + + err = mesh_path_add(dst, sdata); + if (err) { + rcu_read_unlock(); + return err; + } + + mpath = mesh_path_lookup(dst, sdata); + if (!mpath) { + rcu_read_unlock(); + return -ENXIO; + } + mesh_path_fix_nexthop(mpath, sta); + + rcu_read_unlock(); + return 0; +} + +static int ieee80211_del_mpath(struct wiphy *wiphy, struct net_device *dev, + u8 *dst) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + if (dst) + return mesh_path_del(dst, sdata); + + mesh_path_flush(sdata); + return 0; +} + +static int ieee80211_change_mpath(struct wiphy *wiphy, + struct net_device *dev, + u8 *dst, u8 *next_hop) +{ + struct ieee80211_sub_if_data *sdata; + struct mesh_path *mpath; + struct sta_info *sta; + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + rcu_read_lock(); + + sta = sta_info_get(sdata, next_hop); + if (!sta) { + rcu_read_unlock(); + return -ENOENT; + } + + mpath = mesh_path_lookup(dst, sdata); + if (!mpath) { + rcu_read_unlock(); + return -ENOENT; + } + + mesh_path_fix_nexthop(mpath, sta); + + rcu_read_unlock(); + return 0; +} + +static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop, + struct mpath_info *pinfo) +{ + struct sta_info *next_hop_sta = rcu_dereference(mpath->next_hop); + + if (next_hop_sta) + memcpy(next_hop, next_hop_sta->sta.addr, ETH_ALEN); + else + memset(next_hop, 0, ETH_ALEN); + + pinfo->generation = mesh_paths_generation; + + pinfo->filled = MPATH_INFO_FRAME_QLEN | + MPATH_INFO_SN | + MPATH_INFO_METRIC | + MPATH_INFO_EXPTIME | + MPATH_INFO_DISCOVERY_TIMEOUT | + MPATH_INFO_DISCOVERY_RETRIES | + MPATH_INFO_FLAGS; + + pinfo->frame_qlen = mpath->frame_queue.qlen; + pinfo->sn = mpath->sn; + pinfo->metric = mpath->metric; + if (time_before(jiffies, mpath->exp_time)) + pinfo->exptime = jiffies_to_msecs(mpath->exp_time - jiffies); + pinfo->discovery_timeout = + jiffies_to_msecs(mpath->discovery_timeout); + pinfo->discovery_retries = mpath->discovery_retries; + pinfo->flags = 0; + if (mpath->flags & MESH_PATH_ACTIVE) + pinfo->flags |= NL80211_MPATH_FLAG_ACTIVE; + if (mpath->flags & MESH_PATH_RESOLVING) + pinfo->flags |= NL80211_MPATH_FLAG_RESOLVING; + if (mpath->flags & MESH_PATH_SN_VALID) + pinfo->flags |= NL80211_MPATH_FLAG_SN_VALID; + if (mpath->flags & MESH_PATH_FIXED) + pinfo->flags |= NL80211_MPATH_FLAG_FIXED; + if (mpath->flags & MESH_PATH_RESOLVING) + pinfo->flags |= NL80211_MPATH_FLAG_RESOLVING; + + pinfo->flags = mpath->flags; +} + +static int ieee80211_get_mpath(struct wiphy *wiphy, struct net_device *dev, + u8 *dst, u8 *next_hop, struct mpath_info *pinfo) + +{ + struct ieee80211_sub_if_data *sdata; + struct mesh_path *mpath; + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + rcu_read_lock(); + mpath = mesh_path_lookup(dst, sdata); + if (!mpath) { + rcu_read_unlock(); + return -ENOENT; + } + memcpy(dst, mpath->dst, ETH_ALEN); + mpath_set_pinfo(mpath, next_hop, pinfo); + rcu_read_unlock(); + return 0; +} + +static int ieee80211_dump_mpath(struct wiphy *wiphy, struct net_device 
*dev, + int idx, u8 *dst, u8 *next_hop, + struct mpath_info *pinfo) +{ + struct ieee80211_sub_if_data *sdata; + struct mesh_path *mpath; + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + rcu_read_lock(); + mpath = mesh_path_lookup_by_idx(idx, sdata); + if (!mpath) { + rcu_read_unlock(); + return -ENOENT; + } + memcpy(dst, mpath->dst, ETH_ALEN); + mpath_set_pinfo(mpath, next_hop, pinfo); + rcu_read_unlock(); + return 0; +} + +static int ieee80211_get_mesh_config(struct wiphy *wiphy, + struct net_device *dev, + struct mesh_config *conf) +{ + struct ieee80211_sub_if_data *sdata; + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + memcpy(conf, &(sdata->u.mesh.mshcfg), sizeof(struct mesh_config)); + return 0; +} + +static inline bool _chg_mesh_attr(enum nl80211_meshconf_params parm, u32 mask) +{ + return (mask >> (parm-1)) & 0x1; +} + +static int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh, + const struct mesh_setup *setup) +{ + u8 *new_ie; + const u8 *old_ie; + + /* allocate information elements */ + new_ie = NULL; + old_ie = ifmsh->ie; + + if (setup->ie_len) { + new_ie = kmemdup(setup->ie, setup->ie_len, + GFP_KERNEL); + if (!new_ie) + return -ENOMEM; + } + ifmsh->ie_len = setup->ie_len; + ifmsh->ie = new_ie; + kfree(old_ie); + + /* now copy the rest of the setup parameters */ + ifmsh->mesh_id_len = setup->mesh_id_len; + memcpy(ifmsh->mesh_id, setup->mesh_id, ifmsh->mesh_id_len); + ifmsh->mesh_pp_id = setup->path_sel_proto; + ifmsh->mesh_pm_id = setup->path_metric; + ifmsh->security = IEEE80211_MESH_SEC_NONE; + if (setup->is_authenticated) + ifmsh->security |= IEEE80211_MESH_SEC_AUTHED; + if (setup->is_secure) + ifmsh->security |= IEEE80211_MESH_SEC_SECURED; + + return 0; +} + +static int ieee80211_update_mesh_config(struct wiphy *wiphy, + struct net_device *dev, u32 mask, + const struct mesh_config *nconf) +{ + struct mesh_config *conf; + struct ieee80211_sub_if_data *sdata; + struct ieee80211_if_mesh *ifmsh; + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + ifmsh = &sdata->u.mesh; + + /* Set the config options which we are interested in setting */ + conf = &(sdata->u.mesh.mshcfg); + if (_chg_mesh_attr(NL80211_MESHCONF_RETRY_TIMEOUT, mask)) + conf->dot11MeshRetryTimeout = nconf->dot11MeshRetryTimeout; + if (_chg_mesh_attr(NL80211_MESHCONF_CONFIRM_TIMEOUT, mask)) + conf->dot11MeshConfirmTimeout = nconf->dot11MeshConfirmTimeout; + if (_chg_mesh_attr(NL80211_MESHCONF_HOLDING_TIMEOUT, mask)) + conf->dot11MeshHoldingTimeout = nconf->dot11MeshHoldingTimeout; + if (_chg_mesh_attr(NL80211_MESHCONF_MAX_PEER_LINKS, mask)) + conf->dot11MeshMaxPeerLinks = nconf->dot11MeshMaxPeerLinks; + if (_chg_mesh_attr(NL80211_MESHCONF_MAX_RETRIES, mask)) + conf->dot11MeshMaxRetries = nconf->dot11MeshMaxRetries; + if (_chg_mesh_attr(NL80211_MESHCONF_TTL, mask)) + conf->dot11MeshTTL = nconf->dot11MeshTTL; + if (_chg_mesh_attr(NL80211_MESHCONF_ELEMENT_TTL, mask)) + conf->dot11MeshTTL = nconf->element_ttl; + if (_chg_mesh_attr(NL80211_MESHCONF_AUTO_OPEN_PLINKS, mask)) + conf->auto_open_plinks = nconf->auto_open_plinks; + if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES, mask)) + conf->dot11MeshHWMPmaxPREQretries = + nconf->dot11MeshHWMPmaxPREQretries; + if (_chg_mesh_attr(NL80211_MESHCONF_PATH_REFRESH_TIME, mask)) + conf->path_refresh_time = nconf->path_refresh_time; + if (_chg_mesh_attr(NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT, mask)) + conf->min_discovery_timeout = nconf->min_discovery_timeout; + if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT, mask)) + conf->dot11MeshHWMPactivePathTimeout = + 
nconf->dot11MeshHWMPactivePathTimeout; + if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL, mask)) + conf->dot11MeshHWMPpreqMinInterval = + nconf->dot11MeshHWMPpreqMinInterval; + if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME, + mask)) + conf->dot11MeshHWMPnetDiameterTraversalTime = + nconf->dot11MeshHWMPnetDiameterTraversalTime; + if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_ROOTMODE, mask)) { + conf->dot11MeshHWMPRootMode = nconf->dot11MeshHWMPRootMode; + ieee80211_mesh_root_setup(ifmsh); + } + return 0; +} + +static int ieee80211_join_mesh(struct wiphy *wiphy, struct net_device *dev, + const struct mesh_config *conf, + const struct mesh_setup *setup) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + int err; + + memcpy(&ifmsh->mshcfg, conf, sizeof(struct mesh_config)); + err = copy_mesh_setup(ifmsh, setup); + if (err) + return err; + ieee80211_start_mesh(sdata); + + return 0; +} + +static int ieee80211_leave_mesh(struct wiphy *wiphy, struct net_device *dev) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + ieee80211_stop_mesh(sdata); + + return 0; +} +#endif + +static int ieee80211_change_bss(struct wiphy *wiphy, + struct net_device *dev, + struct bss_parameters *params) +{ + struct ieee80211_sub_if_data *sdata; + u32 changed = 0; + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + if (params->use_cts_prot >= 0) { + sdata->vif.bss_conf.use_cts_prot = params->use_cts_prot; + changed |= BSS_CHANGED_ERP_CTS_PROT; + } + if (params->use_short_preamble >= 0) { + sdata->vif.bss_conf.use_short_preamble = + params->use_short_preamble; + changed |= BSS_CHANGED_ERP_PREAMBLE; + } + + if (!sdata->vif.bss_conf.use_short_slot && + sdata->local->hw.conf.channel->band == IEEE80211_BAND_5GHZ) { + sdata->vif.bss_conf.use_short_slot = true; + changed |= BSS_CHANGED_ERP_SLOT; + } + + if (params->use_short_slot_time >= 0) { + sdata->vif.bss_conf.use_short_slot = + params->use_short_slot_time; + changed |= BSS_CHANGED_ERP_SLOT; + } + + if (params->basic_rates) { + int i, j; + u32 rates = 0; + struct ieee80211_local *local = wiphy_priv(wiphy); + struct ieee80211_supported_band *sband = + wiphy->bands[local->oper_channel->band]; + + for (i = 0; i < params->basic_rates_len; i++) { + int rate = (params->basic_rates[i] & 0x7f) * 5; + for (j = 0; j < sband->n_bitrates; j++) { + if (sband->bitrates[j].bitrate == rate) + rates |= BIT(j); + } + } + sdata->vif.bss_conf.basic_rates = rates; + changed |= BSS_CHANGED_BASIC_RATES; + } + + if (params->ap_isolate >= 0) { + if (params->ap_isolate) + sdata->flags |= IEEE80211_SDATA_DONT_BRIDGE_PACKETS; + else + sdata->flags &= ~IEEE80211_SDATA_DONT_BRIDGE_PACKETS; + } + + if (params->ht_opmode >= 0) { + sdata->vif.bss_conf.ht_operation_mode = + (u16) params->ht_opmode; + changed |= BSS_CHANGED_HT; + } + + ieee80211_bss_info_change_notify(sdata, changed); + + return 0; +} + +static int ieee80211_set_txq_params(struct wiphy *wiphy, + struct ieee80211_txq_params *params) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + struct ieee80211_tx_queue_params p; + + if (!local->ops->conf_tx) + return -EOPNOTSUPP; + + memset(&p, 0, sizeof(p)); + p.aifs = params->aifs; + p.cw_max = params->cwmax; + p.cw_min = params->cwmin; + p.txop = params->txop; + + /* + * Setting tx queue params disables u-apsd because it's only + * called in master mode. 
+ */ + p.uapsd = false; + + if (drv_conf_tx(local, params->queue, &p)) { + wiphy_debug(local->hw.wiphy, + "failed to set TX queue parameters for queue %d\n", + params->queue); + return -EINVAL; + } + + return 0; +} + +static int ieee80211_set_channel(struct wiphy *wiphy, + struct net_device *netdev, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + struct ieee80211_sub_if_data *sdata = NULL; + struct ieee80211_channel *old_oper; + enum nl80211_channel_type old_oper_type; + enum nl80211_channel_type old_vif_oper_type= NL80211_CHAN_NO_HT; + + if (netdev) + sdata = IEEE80211_DEV_TO_SUB_IF(netdev); + + switch (ieee80211_get_channel_mode(local, NULL)) { + case CHAN_MODE_HOPPING: + return -EBUSY; + case CHAN_MODE_FIXED: + if (local->oper_channel != chan) + return -EBUSY; + if (!sdata && local->_oper_channel_type == channel_type) + return 0; + break; + case CHAN_MODE_UNDEFINED: + break; + } + + if (sdata) + old_vif_oper_type = sdata->vif.bss_conf.channel_type; + old_oper_type = local->_oper_channel_type; + + if (!ieee80211_set_channel_type(local, sdata, channel_type)) + return -EBUSY; + + old_oper = local->oper_channel; + local->oper_channel = chan; + + /* Update driver if changes were actually made. */ + if ((old_oper != local->oper_channel) || + (old_oper_type != local->_oper_channel_type)) + ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); + + if ((sdata && sdata->vif.type != NL80211_IFTYPE_MONITOR) && + old_vif_oper_type != sdata->vif.bss_conf.channel_type) + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_HT); + + return 0; +} + +#ifdef CONFIG_PM +static int ieee80211_suspend(struct wiphy *wiphy, + struct cfg80211_wowlan *wowlan) +{ + return __ieee80211_suspend(wiphy_priv(wiphy), wowlan); +} + +static int ieee80211_resume(struct wiphy *wiphy) +{ + return __ieee80211_resume(wiphy_priv(wiphy)); +} +#else +#define ieee80211_suspend NULL +#define ieee80211_resume NULL +#endif + +static int ieee80211_scan(struct wiphy *wiphy, + struct net_device *dev, + struct cfg80211_scan_request *req) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + switch (ieee80211_vif_type_p2p(&sdata->vif)) { + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_MESH_POINT: + case NL80211_IFTYPE_P2P_CLIENT: + break; + case NL80211_IFTYPE_P2P_GO: + if (sdata->local->ops->hw_scan) + break; + /* + * FIXME: implement NoA while scanning in software, + * for now fall through to allow scanning only when + * beaconing hasn't been configured yet + */ + case NL80211_IFTYPE_AP: + if (sdata->u.ap.beacon) + return -EOPNOTSUPP; + break; + default: + return -EOPNOTSUPP; + } + + return ieee80211_request_scan(sdata, req); +} + +static int +ieee80211_sched_scan_start(struct wiphy *wiphy, + struct net_device *dev, + struct cfg80211_sched_scan_request *req) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + if (!sdata->local->ops->sched_scan_start) + return -EOPNOTSUPP; + + return ieee80211_request_sched_scan_start(sdata, req); +} + +static int +ieee80211_sched_scan_stop(struct wiphy *wiphy, struct net_device *dev) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + if (!sdata->local->ops->sched_scan_stop) + return -EOPNOTSUPP; + + return ieee80211_request_sched_scan_stop(sdata); +} + +static int ieee80211_auth(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_auth_request *req) +{ + return 
ieee80211_mgd_auth(IEEE80211_DEV_TO_SUB_IF(dev), req); +} + +static int ieee80211_assoc(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_assoc_request *req) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + switch (ieee80211_get_channel_mode(local, sdata)) { + case CHAN_MODE_HOPPING: + return -EBUSY; + case CHAN_MODE_FIXED: + if (local->oper_channel == req->bss->channel) + break; + return -EBUSY; + case CHAN_MODE_UNDEFINED: + break; + } + + return ieee80211_mgd_assoc(IEEE80211_DEV_TO_SUB_IF(dev), req); +} + +static int ieee80211_deauth(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_deauth_request *req, + void *cookie) +{ + return ieee80211_mgd_deauth(IEEE80211_DEV_TO_SUB_IF(dev), + req, cookie); +} + +static int ieee80211_disassoc(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_disassoc_request *req, + void *cookie) +{ + return ieee80211_mgd_disassoc(IEEE80211_DEV_TO_SUB_IF(dev), + req, cookie); +} + +static int ieee80211_join_ibss(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_ibss_params *params) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + switch (ieee80211_get_channel_mode(local, sdata)) { + case CHAN_MODE_HOPPING: + return -EBUSY; + case CHAN_MODE_FIXED: + if (!params->channel_fixed) + return -EBUSY; + if (local->oper_channel == params->channel) + break; + return -EBUSY; + case CHAN_MODE_UNDEFINED: + break; + } + + return ieee80211_ibss_join(sdata, params); +} + +static int ieee80211_leave_ibss(struct wiphy *wiphy, struct net_device *dev) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + return ieee80211_ibss_leave(sdata); +} + +static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + int err; + + if (changed & WIPHY_PARAM_FRAG_THRESHOLD) { + err = drv_set_frag_threshold(local, wiphy->frag_threshold); + + if (err) + return err; + } + + if (changed & WIPHY_PARAM_COVERAGE_CLASS) { + err = drv_set_coverage_class(local, wiphy->coverage_class); + + if (err) + return err; + } + + if (changed & WIPHY_PARAM_RTS_THRESHOLD) { + err = drv_set_rts_threshold(local, wiphy->rts_threshold); + + if (err) + return err; + } + + if (changed & WIPHY_PARAM_RETRY_SHORT) + local->hw.conf.short_frame_max_tx_count = wiphy->retry_short; + if (changed & WIPHY_PARAM_RETRY_LONG) + local->hw.conf.long_frame_max_tx_count = wiphy->retry_long; + if (changed & + (WIPHY_PARAM_RETRY_SHORT | WIPHY_PARAM_RETRY_LONG)) + ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_RETRY_LIMITS); + + return 0; +} + +static int ieee80211_set_tx_power(struct wiphy *wiphy, + enum nl80211_tx_power_setting type, int mbm) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + struct ieee80211_channel *chan = local->hw.conf.channel; + u32 changes = 0; + + switch (type) { + case NL80211_TX_POWER_AUTOMATIC: + local->user_power_level = -1; + break; + case NL80211_TX_POWER_LIMITED: + if (mbm < 0 || (mbm % 100)) + return -EOPNOTSUPP; + local->user_power_level = MBM_TO_DBM(mbm); + break; + case NL80211_TX_POWER_FIXED: + if (mbm < 0 || (mbm % 100)) + return -EOPNOTSUPP; + /* TODO: move to cfg80211 when it knows the channel */ + if (MBM_TO_DBM(mbm) > chan->max_power) + return -EINVAL; + local->user_power_level = MBM_TO_DBM(mbm); + break; + } + + ieee80211_hw_config(local, changes); + + return 0; +} + +static int 
ieee80211_get_tx_power(struct wiphy *wiphy, int *dbm) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + + *dbm = local->hw.conf.power_level; + + return 0; +} + +static int ieee80211_set_wds_peer(struct wiphy *wiphy, struct net_device *dev, + const u8 *addr) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + memcpy(&sdata->u.wds.remote_addr, addr, ETH_ALEN); + + return 0; +} + +static void ieee80211_rfkill_poll(struct wiphy *wiphy) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + + drv_rfkill_poll(local); +} + +#ifdef CONFIG_NL80211_TESTMODE +static int ieee80211_testmode_cmd(struct wiphy *wiphy, void *data, int len) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + + if (!local->ops->testmode_cmd) + return -EOPNOTSUPP; + + return local->ops->testmode_cmd(&local->hw, data, len); +} +#endif + +int __ieee80211_request_smps(struct ieee80211_sub_if_data *sdata, + enum ieee80211_smps_mode smps_mode) +{ + const u8 *ap; + enum ieee80211_smps_mode old_req; + int err; + + lockdep_assert_held(&sdata->u.mgd.mtx); + + old_req = sdata->u.mgd.req_smps; + sdata->u.mgd.req_smps = smps_mode; + + if (old_req == smps_mode && + smps_mode != IEEE80211_SMPS_AUTOMATIC) + return 0; + + /* + * If not associated, or current association is not an HT + * association, there's no need to send an action frame. + */ + if (!sdata->u.mgd.associated || + sdata->vif.bss_conf.channel_type == NL80211_CHAN_NO_HT) { + mutex_lock(&sdata->local->iflist_mtx); + ieee80211_recalc_smps(sdata->local); + mutex_unlock(&sdata->local->iflist_mtx); + return 0; + } + + ap = sdata->u.mgd.associated->bssid; + + if (smps_mode == IEEE80211_SMPS_AUTOMATIC) { + if (sdata->u.mgd.powersave) + smps_mode = IEEE80211_SMPS_DYNAMIC; + else + smps_mode = IEEE80211_SMPS_OFF; + } + + /* send SM PS frame to AP */ + err = ieee80211_send_smps_action(sdata, smps_mode, + ap, ap); + if (err) + sdata->u.mgd.req_smps = old_req; + + return err; +} + +static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, + bool enabled, int timeout) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); + + if (sdata->vif.type != NL80211_IFTYPE_STATION) + return -EOPNOTSUPP; + + if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS)) + return -EOPNOTSUPP; + + if (enabled == sdata->u.mgd.powersave && + timeout == local->dynamic_ps_forced_timeout) + return 0; + + sdata->u.mgd.powersave = enabled; + local->dynamic_ps_forced_timeout = timeout; + + /* no change, but if automatic follow powersave */ + mutex_lock(&sdata->u.mgd.mtx); + __ieee80211_request_smps(sdata, sdata->u.mgd.req_smps); + mutex_unlock(&sdata->u.mgd.mtx); + + if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS) + ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + + ieee80211_recalc_ps(local, -1); + + return 0; +} + +static int ieee80211_set_cqm_rssi_config(struct wiphy *wiphy, + struct net_device *dev, + s32 rssi_thold, u32 rssi_hyst) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); + struct ieee80211_vif *vif = &sdata->vif; + struct ieee80211_bss_conf *bss_conf = &vif->bss_conf; + + if (rssi_thold == bss_conf->cqm_rssi_thold && + rssi_hyst == bss_conf->cqm_rssi_hyst) + return 0; + + bss_conf->cqm_rssi_thold = rssi_thold; + bss_conf->cqm_rssi_hyst = rssi_hyst; + + if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_CQM_RSSI)) { + if (sdata->vif.type != NL80211_IFTYPE_STATION) + 
return -EOPNOTSUPP; + return 0; + } + + /* tell the driver upon association, unless already associated */ + if (sdata->u.mgd.associated) + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_CQM); + + return 0; +} + +static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, + struct net_device *dev, + const u8 *addr, + const struct cfg80211_bitrate_mask *mask) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); + int i, ret; + + if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) { + ret = drv_set_bitrate_mask(local, sdata, mask); + if (ret) + return ret; + } + + for (i = 0; i < IEEE80211_NUM_BANDS; i++) + sdata->rc_rateidx_mask[i] = mask->control[i].legacy; + + return 0; +} + +static int ieee80211_remain_on_channel_hw(struct ieee80211_local *local, + struct net_device *dev, + struct ieee80211_channel *chan, + enum nl80211_channel_type chantype, + unsigned int duration, u64 *cookie) +{ + int ret; + u32 random_cookie; + + lockdep_assert_held(&local->mtx); + + if (local->hw_roc_cookie) + return -EBUSY; + /* must be nonzero */ + random_cookie = random32() | 1; + + *cookie = random_cookie; + local->hw_roc_dev = dev; + local->hw_roc_cookie = random_cookie; + local->hw_roc_channel = chan; + local->hw_roc_channel_type = chantype; + local->hw_roc_duration = duration; + ret = drv_remain_on_channel(local, chan, chantype, duration); + if (ret) { + local->hw_roc_channel = NULL; + local->hw_roc_cookie = 0; + } + + return ret; +} + +static int ieee80211_remain_on_channel(struct wiphy *wiphy, + struct net_device *dev, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + unsigned int duration, + u64 *cookie) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + + if (local->ops->remain_on_channel) { + int ret; + + mutex_lock(&local->mtx); + ret = ieee80211_remain_on_channel_hw(local, dev, + chan, channel_type, + duration, cookie); + local->hw_roc_for_tx = false; + mutex_unlock(&local->mtx); + + return ret; + } + + return ieee80211_wk_remain_on_channel(sdata, chan, channel_type, + duration, cookie); +} + +static int ieee80211_cancel_remain_on_channel_hw(struct ieee80211_local *local, + u64 cookie) +{ + int ret; + + lockdep_assert_held(&local->mtx); + + if (local->hw_roc_cookie != cookie) + return -ENOENT; + + ret = drv_cancel_remain_on_channel(local); + if (ret) + return ret; + + local->hw_roc_cookie = 0; + local->hw_roc_channel = NULL; + + ieee80211_recalc_idle(local); + + return 0; +} + +static int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy, + struct net_device *dev, + u64 cookie) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + + if (local->ops->cancel_remain_on_channel) { + int ret; + + mutex_lock(&local->mtx); + ret = ieee80211_cancel_remain_on_channel_hw(local, cookie); + mutex_unlock(&local->mtx); + + return ret; + } + + return ieee80211_wk_cancel_remain_on_channel(sdata, cookie); +} + +static enum work_done_result +ieee80211_offchan_tx_done(struct ieee80211_work *wk, struct sk_buff *skb) +{ + /* + * Use the data embedded in the work struct for reporting + * here so if the driver mangled the SKB before dropping + * it (which is the only way we really should get here) + * then we don't report mangled data. 
+ * + * If there was no wait time, then by the time we get here + * the driver will likely not have reported the status yet, + * so in that case userspace will have to deal with it. + */ + + if (wk->offchan_tx.wait && !wk->offchan_tx.status) + cfg80211_mgmt_tx_status(wk->sdata->dev, + (unsigned long) wk->offchan_tx.frame, + wk->ie, wk->ie_len, false, GFP_KERNEL); + + return WORK_DONE_DESTROY; +} + +static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct net_device *dev, + struct ieee80211_channel *chan, bool offchan, + enum nl80211_channel_type channel_type, + bool channel_type_valid, unsigned int wait, + const u8 *buf, size_t len, u64 *cookie) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct sta_info *sta; + struct ieee80211_work *wk; + const struct ieee80211_mgmt *mgmt = (void *)buf; + u32 flags = IEEE80211_TX_INTFL_NL80211_FRAME_TX | + IEEE80211_TX_CTL_REQ_TX_STATUS; + bool is_offchan = false; + + /* Check that we are on the requested channel for transmission */ + if (chan != local->tmp_channel && + chan != local->oper_channel) + is_offchan = true; + if (channel_type_valid && + (channel_type != local->tmp_channel_type && + channel_type != local->_oper_channel_type)) + is_offchan = true; + + if (chan == local->hw_roc_channel) { + /* TODO: check channel type? */ + is_offchan = false; + flags |= IEEE80211_TX_CTL_TX_OFFCHAN; + } + + if (is_offchan && !offchan) + return -EBUSY; + + switch (sdata->vif.type) { + case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_MESH_POINT: + if (!ieee80211_is_action(mgmt->frame_control) || + mgmt->u.action.category == WLAN_CATEGORY_PUBLIC) + break; + rcu_read_lock(); + sta = sta_info_get(sdata, mgmt->da); + rcu_read_unlock(); + if (!sta) + return -ENOLINK; + break; + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + break; + default: + return -EOPNOTSUPP; + } + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + len); + if (!skb) + return -ENOMEM; + skb_reserve(skb, local->hw.extra_tx_headroom); + + memcpy(skb_put(skb, len), buf, len); + + IEEE80211_SKB_CB(skb)->flags = flags; + + skb->dev = sdata->dev; + + *cookie = (unsigned long) skb; + + if (is_offchan && local->ops->offchannel_tx) { + int ret; + + IEEE80211_SKB_CB(skb)->band = chan->band; + + mutex_lock(&local->mtx); + + if (local->hw_offchan_tx_cookie) { + mutex_unlock(&local->mtx); + return -EBUSY; + } + + /* TODO: bitrate control, TX processing? */ + ret = drv_offchannel_tx(local, skb, chan, channel_type, wait); + + if (ret == 0) + local->hw_offchan_tx_cookie = *cookie; + mutex_unlock(&local->mtx); + + /* + * Allow driver to return 1 to indicate it wants to have the + * frame transmitted with a remain_on_channel + regular TX. + */ + if (ret != 1) + return ret; + } + + if (is_offchan && local->ops->remain_on_channel) { + unsigned int duration; + int ret; + + mutex_lock(&local->mtx); + /* + * If the duration is zero, then the driver + * wouldn't actually do anything. Set it to + * 100 for now. + * + * TODO: cancel the off-channel operation + * when we get the SKB's TX status and + * the wait time was zero before. 
+ */ + duration = 100; + if (wait) + duration = wait; + ret = ieee80211_remain_on_channel_hw(local, dev, chan, + channel_type, + duration, cookie); + if (ret) { + kfree_skb(skb); + mutex_unlock(&local->mtx); + return ret; + } + + local->hw_roc_for_tx = true; + local->hw_roc_duration = wait; + + /* + * queue up frame for transmission after + * ieee80211_ready_on_channel call + */ + + /* modify cookie to prevent API mismatches */ + *cookie ^= 2; + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_TX_OFFCHAN; + local->hw_roc_skb = skb; + local->hw_roc_skb_for_status = skb; + mutex_unlock(&local->mtx); + + return 0; + } + + /* + * Can transmit right away if the channel was the + * right one and there's no wait involved... If a + * wait is involved, we might otherwise not be on + * the right channel for long enough! + */ + if (!is_offchan && !wait && !sdata->vif.bss_conf.idle) { + ieee80211_tx_skb(sdata, skb); + return 0; + } + + wk = kzalloc(sizeof(*wk) + len, GFP_KERNEL); + if (!wk) { + kfree_skb(skb); + return -ENOMEM; + } + + wk->type = IEEE80211_WORK_OFFCHANNEL_TX; + wk->chan = chan; + wk->chan_type = channel_type; + wk->sdata = sdata; + wk->done = ieee80211_offchan_tx_done; + wk->offchan_tx.frame = skb; + wk->offchan_tx.wait = wait; + wk->ie_len = len; + memcpy(wk->ie, buf, len); + + ieee80211_add_work(wk); + return 0; +} + +static int ieee80211_mgmt_tx_cancel_wait(struct wiphy *wiphy, + struct net_device *dev, + u64 cookie) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + struct ieee80211_work *wk; + int ret = -ENOENT; + + mutex_lock(&local->mtx); + + if (local->ops->offchannel_tx_cancel_wait && + local->hw_offchan_tx_cookie == cookie) { + ret = drv_offchannel_tx_cancel_wait(local); + + if (!ret) + local->hw_offchan_tx_cookie = 0; + + mutex_unlock(&local->mtx); + + return ret; + } + + if (local->ops->cancel_remain_on_channel) { + cookie ^= 2; + ret = ieee80211_cancel_remain_on_channel_hw(local, cookie); + + if (ret == 0) { + kfree_skb(local->hw_roc_skb); + local->hw_roc_skb = NULL; + local->hw_roc_skb_for_status = NULL; + } + + mutex_unlock(&local->mtx); + + return ret; + } + + list_for_each_entry(wk, &local->work_list, list) { + if (wk->sdata != sdata) + continue; + + if (wk->type != IEEE80211_WORK_OFFCHANNEL_TX) + continue; + + if (cookie != (unsigned long) wk->offchan_tx.frame) + continue; + + wk->timeout = jiffies; + + ieee80211_queue_work(&local->hw, &local->work_work); + ret = 0; + break; + } + mutex_unlock(&local->mtx); + + return ret; +} + +static void ieee80211_mgmt_frame_register(struct wiphy *wiphy, + struct net_device *dev, + u16 frame_type, bool reg) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + + if (frame_type != (IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ)) + return; + + if (reg) + local->probe_req_reg++; + else + local->probe_req_reg--; + + ieee80211_queue_work(&local->hw, &local->reconfig_filter); +} + +static int ieee80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + + if (local->started) + return -EOPNOTSUPP; + + return drv_set_antenna(local, tx_ant, rx_ant); +} + +static int ieee80211_get_antenna(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + + return drv_get_antenna(local, tx_ant, rx_ant); +} + +static int ieee80211_set_ringparam(struct wiphy *wiphy, u32 tx, u32 rx) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + + return 
drv_set_ringparam(local, tx, rx); +} + +static void ieee80211_get_ringparam(struct wiphy *wiphy, + u32 *tx, u32 *tx_max, u32 *rx, u32 *rx_max) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + + drv_get_ringparam(local, tx, tx_max, rx, rx_max); +} + +struct cfg80211_ops mac80211_config_ops = { + .add_virtual_intf = ieee80211_add_iface, + .del_virtual_intf = ieee80211_del_iface, + .change_virtual_intf = ieee80211_change_iface, + .add_key = ieee80211_add_key, + .del_key = ieee80211_del_key, + .get_key = ieee80211_get_key, + .set_default_key = ieee80211_config_default_key, + .set_default_mgmt_key = ieee80211_config_default_mgmt_key, + .add_beacon = ieee80211_add_beacon, + .set_beacon = ieee80211_set_beacon, + .del_beacon = ieee80211_del_beacon, + .add_station = ieee80211_add_station, + .del_station = ieee80211_del_station, + .change_station = ieee80211_change_station, + .get_station = ieee80211_get_station, + .dump_station = ieee80211_dump_station, + .dump_survey = ieee80211_dump_survey, +#ifdef CONFIG_MAC80211_MESH + .add_mpath = ieee80211_add_mpath, + .del_mpath = ieee80211_del_mpath, + .change_mpath = ieee80211_change_mpath, + .get_mpath = ieee80211_get_mpath, + .dump_mpath = ieee80211_dump_mpath, + .update_mesh_config = ieee80211_update_mesh_config, + .get_mesh_config = ieee80211_get_mesh_config, + .join_mesh = ieee80211_join_mesh, + .leave_mesh = ieee80211_leave_mesh, +#endif + .change_bss = ieee80211_change_bss, + .set_txq_params = ieee80211_set_txq_params, + .set_channel = ieee80211_set_channel, + .suspend = ieee80211_suspend, + .resume = ieee80211_resume, + .scan = ieee80211_scan, + .sched_scan_start = ieee80211_sched_scan_start, + .sched_scan_stop = ieee80211_sched_scan_stop, + .auth = ieee80211_auth, + .assoc = ieee80211_assoc, + .deauth = ieee80211_deauth, + .disassoc = ieee80211_disassoc, + .join_ibss = ieee80211_join_ibss, + .leave_ibss = ieee80211_leave_ibss, + .set_wiphy_params = ieee80211_set_wiphy_params, + .set_tx_power = ieee80211_set_tx_power, + .get_tx_power = ieee80211_get_tx_power, + .set_wds_peer = ieee80211_set_wds_peer, + .rfkill_poll = ieee80211_rfkill_poll, + CFG80211_TESTMODE_CMD(ieee80211_testmode_cmd) + .set_power_mgmt = ieee80211_set_power_mgmt, + .set_bitrate_mask = ieee80211_set_bitrate_mask, + .remain_on_channel = ieee80211_remain_on_channel, + .cancel_remain_on_channel = ieee80211_cancel_remain_on_channel, + .mgmt_tx = ieee80211_mgmt_tx, + .mgmt_tx_cancel_wait = ieee80211_mgmt_tx_cancel_wait, + .set_cqm_rssi_config = ieee80211_set_cqm_rssi_config, + .mgmt_frame_register = ieee80211_mgmt_frame_register, + .set_antenna = ieee80211_set_antenna, + .get_antenna = ieee80211_get_antenna, + .set_ringparam = ieee80211_set_ringparam, + .get_ringparam = ieee80211_get_ringparam, +}; diff --git a/net/mac80211/cfg.h b/net/mac80211/cfg.h new file mode 100644 index 00000000..7d7879f5 --- /dev/null +++ b/net/mac80211/cfg.h @@ -0,0 +1,9 @@ +/* + * mac80211 configuration hooks for cfg80211 + */ +#ifndef __CFG_H +#define __CFG_H + +extern struct cfg80211_ops mac80211_config_ops; + +#endif /* __CFG_H */ diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c new file mode 100644 index 00000000..889c3e93 --- /dev/null +++ b/net/mac80211/chan.c @@ -0,0 +1,130 @@ +/* + * mac80211 - channel management + */ + +#include +#include "ieee80211_i.h" + +static enum ieee80211_chan_mode +__ieee80211_get_channel_mode(struct ieee80211_local *local, + struct ieee80211_sub_if_data *ignore) +{ + struct ieee80211_sub_if_data *sdata; + + lockdep_assert_held(&local->iflist_mtx); + + 
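+ /* scan all running interfaces (except the one to ignore): an active IBSS without a fixed channel implies hopping, any other active interface fixes the channel, otherwise the mode is still undefined */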
list_for_each_entry(sdata, &local->interfaces, list) { + if (sdata == ignore) + continue; + + if (!ieee80211_sdata_running(sdata)) + continue; + + if (sdata->vif.type == NL80211_IFTYPE_MONITOR) + continue; + + if (sdata->vif.type == NL80211_IFTYPE_STATION && + !sdata->u.mgd.associated) + continue; + + if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { + if (!sdata->u.ibss.ssid_len) + continue; + if (!sdata->u.ibss.fixed_channel) + return CHAN_MODE_HOPPING; + } + + if (sdata->vif.type == NL80211_IFTYPE_AP && + !sdata->u.ap.beacon) + continue; + + return CHAN_MODE_FIXED; + } + + return CHAN_MODE_UNDEFINED; +} + +enum ieee80211_chan_mode +ieee80211_get_channel_mode(struct ieee80211_local *local, + struct ieee80211_sub_if_data *ignore) +{ + enum ieee80211_chan_mode mode; + + mutex_lock(&local->iflist_mtx); + mode = __ieee80211_get_channel_mode(local, ignore); + mutex_unlock(&local->iflist_mtx); + + return mode; +} + +bool ieee80211_set_channel_type(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum nl80211_channel_type chantype) +{ + struct ieee80211_sub_if_data *tmp; + enum nl80211_channel_type superchan = NL80211_CHAN_NO_HT; + bool result; + + mutex_lock(&local->iflist_mtx); + + list_for_each_entry(tmp, &local->interfaces, list) { + if (tmp == sdata) + continue; + + if (!ieee80211_sdata_running(tmp)) + continue; + + switch (tmp->vif.bss_conf.channel_type) { + case NL80211_CHAN_NO_HT: + case NL80211_CHAN_HT20: + if (superchan > tmp->vif.bss_conf.channel_type) + break; + + superchan = tmp->vif.bss_conf.channel_type; + break; + case NL80211_CHAN_HT40PLUS: + WARN_ON(superchan == NL80211_CHAN_HT40MINUS); + superchan = NL80211_CHAN_HT40PLUS; + break; + case NL80211_CHAN_HT40MINUS: + WARN_ON(superchan == NL80211_CHAN_HT40PLUS); + superchan = NL80211_CHAN_HT40MINUS; + break; + } + } + + switch (superchan) { + case NL80211_CHAN_NO_HT: + case NL80211_CHAN_HT20: + /* + * allow any change that doesn't go to no-HT + * (if it already is no-HT no change is needed) + */ + if (chantype == NL80211_CHAN_NO_HT) + break; + superchan = chantype; + break; + case NL80211_CHAN_HT40PLUS: + case NL80211_CHAN_HT40MINUS: + /* allow smaller bandwidth and same */ + if (chantype == NL80211_CHAN_NO_HT) + break; + if (chantype == NL80211_CHAN_HT20) + break; + if (superchan == chantype) + break; + result = false; + goto out; + } + + local->_oper_channel_type = superchan; + + if (sdata) + sdata->vif.bss_conf.channel_type = chantype; + + result = true; + out: + mutex_unlock(&local->iflist_mtx); + + return result; +} diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c new file mode 100644 index 00000000..186e02f7 --- /dev/null +++ b/net/mac80211/debugfs.c @@ -0,0 +1,526 @@ + +/* + * mac80211 debugfs for wireless PHYs + * + * Copyright 2007 Johannes Berg + * + * GPLv2 + * + */ + +#include +#include +#include "ieee80211_i.h" +#include "driver-ops.h" +#include "rate.h" +#include "debugfs.h" + +int mac80211_open_file_generic(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return 0; +} + +#define DEBUGFS_FORMAT_BUFFER_SIZE 100 + +int mac80211_format_buffer(char __user *userbuf, size_t count, + loff_t *ppos, char *fmt, ...) +{ + va_list args; + char buf[DEBUGFS_FORMAT_BUFFER_SIZE]; + int res; + + va_start(args, fmt); + res = vscnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + return simple_read_from_buffer(userbuf, count, ppos, buf, res); +} + +#define DEBUGFS_READONLY_FILE_FN(name, fmt, value...) 
\ +static ssize_t name## _read(struct file *file, char __user *userbuf, \ + size_t count, loff_t *ppos) \ +{ \ + struct ieee80211_local *local = file->private_data; \ + \ + return mac80211_format_buffer(userbuf, count, ppos, \ + fmt "\n", ##value); \ +} + +#define DEBUGFS_READONLY_FILE_OPS(name) \ +static const struct file_operations name## _ops = { \ + .read = name## _read, \ + .open = mac80211_open_file_generic, \ + .llseek = generic_file_llseek, \ +}; + +#define DEBUGFS_READONLY_FILE(name, fmt, value...) \ + DEBUGFS_READONLY_FILE_FN(name, fmt, value) \ + DEBUGFS_READONLY_FILE_OPS(name) + +#define DEBUGFS_ADD(name) \ + debugfs_create_file(#name, 0400, phyd, local, &name## _ops); + +#define DEBUGFS_ADD_MODE(name, mode) \ + debugfs_create_file(#name, mode, phyd, local, &name## _ops); + + +DEBUGFS_READONLY_FILE(user_power, "%d", + local->user_power_level); +DEBUGFS_READONLY_FILE(power, "%d", + local->hw.conf.power_level); +DEBUGFS_READONLY_FILE(frequency, "%d", + local->hw.conf.channel->center_freq); +DEBUGFS_READONLY_FILE(total_ps_buffered, "%d", + local->total_ps_buffered); +DEBUGFS_READONLY_FILE(wep_iv, "%#08x", + local->wep_iv & 0xffffff); +DEBUGFS_READONLY_FILE(rate_ctrl_alg, "%s", + local->rate_ctrl ? local->rate_ctrl->ops->name : "hw/driver"); + +static ssize_t tsf_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + u64 tsf; + + tsf = drv_get_tsf(local); + + return mac80211_format_buffer(user_buf, count, ppos, "0x%016llx\n", + (unsigned long long) tsf); +} + +static ssize_t tsf_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + unsigned long long tsf; + char buf[100]; + size_t len; + + len = min(count, sizeof(buf) - 1); + if (copy_from_user(buf, user_buf, len)) + return -EFAULT; + buf[len] = '\0'; + + if (strncmp(buf, "reset", 5) == 0) { + if (local->ops->reset_tsf) { + drv_reset_tsf(local); + wiphy_info(local->hw.wiphy, "debugfs reset TSF\n"); + } + } else { + tsf = simple_strtoul(buf, NULL, 0); + if (local->ops->set_tsf) { + drv_set_tsf(local, tsf); + wiphy_info(local->hw.wiphy, + "debugfs set TSF to %#018llx\n", tsf); + + } + } + + return count; +} + +static const struct file_operations tsf_ops = { + .read = tsf_read, + .write = tsf_write, + .open = mac80211_open_file_generic, + .llseek = default_llseek, +}; + +static ssize_t reset_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + + rtnl_lock(); + __ieee80211_suspend(&local->hw, NULL); + __ieee80211_resume(&local->hw); + rtnl_unlock(); + + return count; +} + +static const struct file_operations reset_ops = { + .write = reset_write, + .open = mac80211_open_file_generic, + .llseek = noop_llseek, +}; + +static ssize_t noack_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + + return mac80211_format_buffer(user_buf, count, ppos, "%d\n", + local->wifi_wme_noack_test); +} + +static ssize_t noack_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + char buf[10]; + size_t len; + + len = min(count, sizeof(buf) - 1); + if (copy_from_user(buf, user_buf, len)) + return -EFAULT; + buf[len] = '\0'; + + local->wifi_wme_noack_test = !!simple_strtoul(buf, NULL, 0); + + return count; +} + +static const struct 
file_operations noack_ops = { + .read = noack_read, + .write = noack_write, + .open = mac80211_open_file_generic, + .llseek = default_llseek, +}; + +static ssize_t uapsd_queues_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + return mac80211_format_buffer(user_buf, count, ppos, "0x%x\n", + local->uapsd_queues); +} + +static ssize_t uapsd_queues_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + unsigned long val; + char buf[10]; + size_t len; + int ret; + + len = min(count, sizeof(buf) - 1); + if (copy_from_user(buf, user_buf, len)) + return -EFAULT; + buf[len] = '\0'; + + ret = strict_strtoul(buf, 0, &val); + + if (ret) + return -EINVAL; + + if (val & ~IEEE80211_WMM_IE_STA_QOSINFO_AC_MASK) + return -ERANGE; + + local->uapsd_queues = val; + + return count; +} + +static const struct file_operations uapsd_queues_ops = { + .read = uapsd_queues_read, + .write = uapsd_queues_write, + .open = mac80211_open_file_generic, + .llseek = default_llseek, +}; + +static ssize_t uapsd_max_sp_len_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + + return mac80211_format_buffer(user_buf, count, ppos, "0x%x\n", + local->uapsd_max_sp_len); +} + +static ssize_t uapsd_max_sp_len_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + unsigned long val; + char buf[10]; + size_t len; + int ret; + + len = min(count, sizeof(buf) - 1); + if (copy_from_user(buf, user_buf, len)) + return -EFAULT; + buf[len] = '\0'; + + ret = strict_strtoul(buf, 0, &val); + + if (ret) + return -EINVAL; + + if (val & ~IEEE80211_WMM_IE_STA_QOSINFO_SP_MASK) + return -ERANGE; + + local->uapsd_max_sp_len = val; + + return count; +} + +static const struct file_operations uapsd_max_sp_len_ops = { + .read = uapsd_max_sp_len_read, + .write = uapsd_max_sp_len_write, + .open = mac80211_open_file_generic, + .llseek = default_llseek, +}; + +static ssize_t channel_type_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + const char *buf; + + switch (local->hw.conf.channel_type) { + case NL80211_CHAN_NO_HT: + buf = "no ht\n"; + break; + case NL80211_CHAN_HT20: + buf = "ht20\n"; + break; + case NL80211_CHAN_HT40MINUS: + buf = "ht40-\n"; + break; + case NL80211_CHAN_HT40PLUS: + buf = "ht40+\n"; + break; + default: + buf = "???"; + break; + } + + return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); +} + +static ssize_t hwflags_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + int mxln = 500; + ssize_t rv; + char *buf = kzalloc(mxln, GFP_KERNEL); + int sf = 0; /* how many written so far */ + + sf += snprintf(buf, mxln - sf, "0x%x\n", local->hw.flags); + if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) + sf += snprintf(buf + sf, mxln - sf, "HAS_RATE_CONTROL\n"); + if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS) + sf += snprintf(buf + sf, mxln - sf, "RX_INCLUDES_FCS\n"); + if (local->hw.flags & IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING) + sf += snprintf(buf + sf, mxln - sf, + "HOST_BCAST_PS_BUFFERING\n"); + if (local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE) + sf += snprintf(buf + sf, mxln - sf, + 
"2GHZ_SHORT_SLOT_INCAPABLE\n"); + if (local->hw.flags & IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE) + sf += snprintf(buf + sf, mxln - sf, + "2GHZ_SHORT_PREAMBLE_INCAPABLE\n"); + if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) + sf += snprintf(buf + sf, mxln - sf, "SIGNAL_UNSPEC\n"); + if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) + sf += snprintf(buf + sf, mxln - sf, "SIGNAL_DBM\n"); + if (local->hw.flags & IEEE80211_HW_NEED_DTIM_PERIOD) + sf += snprintf(buf + sf, mxln - sf, "NEED_DTIM_PERIOD\n"); + if (local->hw.flags & IEEE80211_HW_SPECTRUM_MGMT) + sf += snprintf(buf + sf, mxln - sf, "SPECTRUM_MGMT\n"); + if (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION) + sf += snprintf(buf + sf, mxln - sf, "AMPDU_AGGREGATION\n"); + if (local->hw.flags & IEEE80211_HW_SUPPORTS_PS) + sf += snprintf(buf + sf, mxln - sf, "SUPPORTS_PS\n"); + if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) + sf += snprintf(buf + sf, mxln - sf, "PS_NULLFUNC_STACK\n"); + if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS) + sf += snprintf(buf + sf, mxln - sf, "SUPPORTS_DYNAMIC_PS\n"); + if (local->hw.flags & IEEE80211_HW_MFP_CAPABLE) + sf += snprintf(buf + sf, mxln - sf, "MFP_CAPABLE\n"); + if (local->hw.flags & IEEE80211_HW_BEACON_FILTER) + sf += snprintf(buf + sf, mxln - sf, "BEACON_FILTER\n"); + if (local->hw.flags & IEEE80211_HW_SUPPORTS_STATIC_SMPS) + sf += snprintf(buf + sf, mxln - sf, "SUPPORTS_STATIC_SMPS\n"); + if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS) + sf += snprintf(buf + sf, mxln - sf, "SUPPORTS_DYNAMIC_SMPS\n"); + if (local->hw.flags & IEEE80211_HW_SUPPORTS_UAPSD) + sf += snprintf(buf + sf, mxln - sf, "SUPPORTS_UAPSD\n"); + if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) + sf += snprintf(buf + sf, mxln - sf, "REPORTS_TX_ACK_STATUS\n"); + if (local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR) + sf += snprintf(buf + sf, mxln - sf, "CONNECTION_MONITOR\n"); + if (local->hw.flags & IEEE80211_HW_SUPPORTS_CQM_RSSI) + sf += snprintf(buf + sf, mxln - sf, "SUPPORTS_CQM_RSSI\n"); + if (local->hw.flags & IEEE80211_HW_SUPPORTS_PER_STA_GTK) + sf += snprintf(buf + sf, mxln - sf, "SUPPORTS_PER_STA_GTK\n"); + if (local->hw.flags & IEEE80211_HW_AP_LINK_PS) + sf += snprintf(buf + sf, mxln - sf, "AP_LINK_PS\n"); + + rv = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); + kfree(buf); + return rv; +} + +static ssize_t queues_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + unsigned long flags; + char buf[IEEE80211_MAX_QUEUES * 20]; + int q, res = 0; + + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + for (q = 0; q < local->hw.queues; q++) + res += sprintf(buf + res, "%02d: %#.8lx/%d\n", q, + local->queue_stop_reasons[q], + skb_queue_len(&local->pending[q])); + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); + + return simple_read_from_buffer(user_buf, count, ppos, buf, res); +} + +DEBUGFS_READONLY_FILE_OPS(hwflags); +DEBUGFS_READONLY_FILE_OPS(channel_type); +DEBUGFS_READONLY_FILE_OPS(queues); + +/* statistics stuff */ + +static ssize_t format_devstat_counter(struct ieee80211_local *local, + char __user *userbuf, + size_t count, loff_t *ppos, + int (*printvalue)(struct ieee80211_low_level_stats *stats, char *buf, + int buflen)) +{ + struct ieee80211_low_level_stats stats; + char buf[20]; + int res; + + rtnl_lock(); + res = drv_get_stats(local, &stats); + rtnl_unlock(); + if (res) + return res; + res = printvalue(&stats, buf, sizeof(buf)); + return 
simple_read_from_buffer(userbuf, count, ppos, buf, res); +} + +#define DEBUGFS_DEVSTATS_FILE(name) \ +static int print_devstats_##name(struct ieee80211_low_level_stats *stats,\ + char *buf, int buflen) \ +{ \ + return scnprintf(buf, buflen, "%u\n", stats->name); \ +} \ +static ssize_t stats_ ##name## _read(struct file *file, \ + char __user *userbuf, \ + size_t count, loff_t *ppos) \ +{ \ + return format_devstat_counter(file->private_data, \ + userbuf, \ + count, \ + ppos, \ + print_devstats_##name); \ +} \ + \ +static const struct file_operations stats_ ##name## _ops = { \ + .read = stats_ ##name## _read, \ + .open = mac80211_open_file_generic, \ + .llseek = generic_file_llseek, \ +}; + +#define DEBUGFS_STATS_ADD(name, field) \ + debugfs_create_u32(#name, 0400, statsd, (u32 *) &field); +#define DEBUGFS_DEVSTATS_ADD(name) \ + debugfs_create_file(#name, 0400, statsd, local, &stats_ ##name## _ops); + +DEBUGFS_DEVSTATS_FILE(dot11ACKFailureCount); +DEBUGFS_DEVSTATS_FILE(dot11RTSFailureCount); +DEBUGFS_DEVSTATS_FILE(dot11FCSErrorCount); +DEBUGFS_DEVSTATS_FILE(dot11RTSSuccessCount); + +void debugfs_hw_add(struct ieee80211_local *local) +{ + struct dentry *phyd = local->hw.wiphy->debugfsdir; + struct dentry *statsd; + + if (!phyd) + return; + + local->debugfs.keys = debugfs_create_dir("keys", phyd); + + DEBUGFS_ADD(frequency); + DEBUGFS_ADD(total_ps_buffered); + DEBUGFS_ADD(wep_iv); + DEBUGFS_ADD(tsf); + DEBUGFS_ADD(queues); + DEBUGFS_ADD_MODE(reset, 0200); + DEBUGFS_ADD(noack); + DEBUGFS_ADD(uapsd_queues); + DEBUGFS_ADD(uapsd_max_sp_len); + DEBUGFS_ADD(channel_type); + DEBUGFS_ADD(hwflags); + DEBUGFS_ADD(user_power); + DEBUGFS_ADD(power); + + statsd = debugfs_create_dir("statistics", phyd); + + /* if the dir failed, don't put all the other things into the root! 
*/ + if (!statsd) + return; + + DEBUGFS_STATS_ADD(transmitted_fragment_count, + local->dot11TransmittedFragmentCount); + DEBUGFS_STATS_ADD(multicast_transmitted_frame_count, + local->dot11MulticastTransmittedFrameCount); + DEBUGFS_STATS_ADD(failed_count, local->dot11FailedCount); + DEBUGFS_STATS_ADD(retry_count, local->dot11RetryCount); + DEBUGFS_STATS_ADD(multiple_retry_count, + local->dot11MultipleRetryCount); + DEBUGFS_STATS_ADD(frame_duplicate_count, + local->dot11FrameDuplicateCount); + DEBUGFS_STATS_ADD(received_fragment_count, + local->dot11ReceivedFragmentCount); + DEBUGFS_STATS_ADD(multicast_received_frame_count, + local->dot11MulticastReceivedFrameCount); + DEBUGFS_STATS_ADD(transmitted_frame_count, + local->dot11TransmittedFrameCount); +#ifdef CONFIG_MAC80211_DEBUG_COUNTERS + DEBUGFS_STATS_ADD(tx_handlers_drop, local->tx_handlers_drop); + DEBUGFS_STATS_ADD(tx_handlers_queued, local->tx_handlers_queued); + DEBUGFS_STATS_ADD(tx_handlers_drop_unencrypted, + local->tx_handlers_drop_unencrypted); + DEBUGFS_STATS_ADD(tx_handlers_drop_fragment, + local->tx_handlers_drop_fragment); + DEBUGFS_STATS_ADD(tx_handlers_drop_wep, + local->tx_handlers_drop_wep); + DEBUGFS_STATS_ADD(tx_handlers_drop_not_assoc, + local->tx_handlers_drop_not_assoc); + DEBUGFS_STATS_ADD(tx_handlers_drop_unauth_port, + local->tx_handlers_drop_unauth_port); + DEBUGFS_STATS_ADD(rx_handlers_drop, local->rx_handlers_drop); + DEBUGFS_STATS_ADD(rx_handlers_queued, local->rx_handlers_queued); + DEBUGFS_STATS_ADD(rx_handlers_drop_nullfunc, + local->rx_handlers_drop_nullfunc); + DEBUGFS_STATS_ADD(rx_handlers_drop_defrag, + local->rx_handlers_drop_defrag); + DEBUGFS_STATS_ADD(rx_handlers_drop_short, + local->rx_handlers_drop_short); + DEBUGFS_STATS_ADD(rx_handlers_drop_passive_scan, + local->rx_handlers_drop_passive_scan); + DEBUGFS_STATS_ADD(tx_expand_skb_head, + local->tx_expand_skb_head); + DEBUGFS_STATS_ADD(tx_expand_skb_head_cloned, + local->tx_expand_skb_head_cloned); + DEBUGFS_STATS_ADD(rx_expand_skb_head, + local->rx_expand_skb_head); + DEBUGFS_STATS_ADD(rx_expand_skb_head2, + local->rx_expand_skb_head2); + DEBUGFS_STATS_ADD(rx_handlers_fragments, + local->rx_handlers_fragments); + DEBUGFS_STATS_ADD(tx_status_drop, + local->tx_status_drop); +#endif + DEBUGFS_DEVSTATS_ADD(dot11ACKFailureCount); + DEBUGFS_DEVSTATS_ADD(dot11RTSFailureCount); + DEBUGFS_DEVSTATS_ADD(dot11FCSErrorCount); + DEBUGFS_DEVSTATS_ADD(dot11RTSSuccessCount); +} diff --git a/net/mac80211/debugfs.h b/net/mac80211/debugfs.h new file mode 100644 index 00000000..7c875296 --- /dev/null +++ b/net/mac80211/debugfs.h @@ -0,0 +1,15 @@ +#ifndef __MAC80211_DEBUGFS_H +#define __MAC80211_DEBUGFS_H + +#ifdef CONFIG_MAC80211_DEBUGFS +extern void debugfs_hw_add(struct ieee80211_local *local); +extern int mac80211_open_file_generic(struct inode *inode, struct file *file); +extern int mac80211_format_buffer(char __user *userbuf, size_t count, + loff_t *ppos, char *fmt, ...); +#else +static inline void debugfs_hw_add(struct ieee80211_local *local) +{ +} +#endif + +#endif /* __MAC80211_DEBUGFS_H */ diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c new file mode 100644 index 00000000..33c58b85 --- /dev/null +++ b/net/mac80211/debugfs_key.c @@ -0,0 +1,342 @@ +/* + * Copyright 2003-2005 Devicescape Software, Inc. 
+ * Copyright (c) 2006 Jiri Benc + * Copyright 2007 Johannes Berg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include "ieee80211_i.h" +#include "key.h" +#include "debugfs.h" +#include "debugfs_key.h" + +#define KEY_READ(name, prop, format_string) \ +static ssize_t key_##name##_read(struct file *file, \ + char __user *userbuf, \ + size_t count, loff_t *ppos) \ +{ \ + struct ieee80211_key *key = file->private_data; \ + return mac80211_format_buffer(userbuf, count, ppos, \ + format_string, key->prop); \ +} +#define KEY_READ_D(name) KEY_READ(name, name, "%d\n") +#define KEY_READ_X(name) KEY_READ(name, name, "0x%x\n") + +#define KEY_OPS(name) \ +static const struct file_operations key_ ##name## _ops = { \ + .read = key_##name##_read, \ + .open = mac80211_open_file_generic, \ + .llseek = generic_file_llseek, \ +} + +#define KEY_FILE(name, format) \ + KEY_READ_##format(name) \ + KEY_OPS(name) + +#define KEY_CONF_READ(name, format_string) \ + KEY_READ(conf_##name, conf.name, format_string) +#define KEY_CONF_READ_D(name) KEY_CONF_READ(name, "%d\n") + +#define KEY_CONF_OPS(name) \ +static const struct file_operations key_ ##name## _ops = { \ + .read = key_conf_##name##_read, \ + .open = mac80211_open_file_generic, \ + .llseek = generic_file_llseek, \ +} + +#define KEY_CONF_FILE(name, format) \ + KEY_CONF_READ_##format(name) \ + KEY_CONF_OPS(name) + +KEY_CONF_FILE(keylen, D); +KEY_CONF_FILE(keyidx, D); +KEY_CONF_FILE(hw_key_idx, D); +KEY_FILE(flags, X); +KEY_FILE(tx_rx_count, D); +KEY_READ(ifindex, sdata->name, "%s\n"); +KEY_OPS(ifindex); + +static ssize_t key_algorithm_read(struct file *file, + char __user *userbuf, + size_t count, loff_t *ppos) +{ + char buf[15]; + struct ieee80211_key *key = file->private_data; + u32 c = key->conf.cipher; + + sprintf(buf, "%.2x-%.2x-%.2x:%d\n", + c >> 24, (c >> 16) & 0xff, (c >> 8) & 0xff, c & 0xff); + return simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf)); +} +KEY_OPS(algorithm); + +static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + const u8 *tpn; + char buf[20]; + int len; + struct ieee80211_key *key = file->private_data; + + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: + len = scnprintf(buf, sizeof(buf), "\n"); + break; + case WLAN_CIPHER_SUITE_TKIP: + len = scnprintf(buf, sizeof(buf), "%08x %04x\n", + key->u.tkip.tx.iv32, + key->u.tkip.tx.iv16); + break; + case WLAN_CIPHER_SUITE_CCMP: + tpn = key->u.ccmp.tx_pn; + len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", + tpn[0], tpn[1], tpn[2], tpn[3], tpn[4], tpn[5]); + break; + case WLAN_CIPHER_SUITE_AES_CMAC: + tpn = key->u.aes_cmac.tx_pn; + len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", + tpn[0], tpn[1], tpn[2], tpn[3], tpn[4], + tpn[5]); + break; + default: + return 0; + } + return simple_read_from_buffer(userbuf, count, ppos, buf, len); +} +KEY_OPS(tx_spec); + +static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct ieee80211_key *key = file->private_data; + char buf[14*NUM_RX_DATA_QUEUES+1], *p = buf; + int i, len; + const u8 *rpn; + + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: + len = scnprintf(buf, sizeof(buf), "\n"); + break; + case WLAN_CIPHER_SUITE_TKIP: + for (i = 0; i < 
NUM_RX_DATA_QUEUES; i++) + p += scnprintf(p, sizeof(buf)+buf-p, + "%08x %04x\n", + key->u.tkip.rx[i].iv32, + key->u.tkip.rx[i].iv16); + len = p - buf; + break; + case WLAN_CIPHER_SUITE_CCMP: + for (i = 0; i < NUM_RX_DATA_QUEUES + 1; i++) { + rpn = key->u.ccmp.rx_pn[i]; + p += scnprintf(p, sizeof(buf)+buf-p, + "%02x%02x%02x%02x%02x%02x\n", + rpn[0], rpn[1], rpn[2], + rpn[3], rpn[4], rpn[5]); + } + len = p - buf; + break; + case WLAN_CIPHER_SUITE_AES_CMAC: + rpn = key->u.aes_cmac.rx_pn; + p += scnprintf(p, sizeof(buf)+buf-p, + "%02x%02x%02x%02x%02x%02x\n", + rpn[0], rpn[1], rpn[2], + rpn[3], rpn[4], rpn[5]); + len = p - buf; + break; + default: + return 0; + } + return simple_read_from_buffer(userbuf, count, ppos, buf, len); +} +KEY_OPS(rx_spec); + +static ssize_t key_replays_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct ieee80211_key *key = file->private_data; + char buf[20]; + int len; + + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_CCMP: + len = scnprintf(buf, sizeof(buf), "%u\n", key->u.ccmp.replays); + break; + case WLAN_CIPHER_SUITE_AES_CMAC: + len = scnprintf(buf, sizeof(buf), "%u\n", + key->u.aes_cmac.replays); + break; + default: + return 0; + } + return simple_read_from_buffer(userbuf, count, ppos, buf, len); +} +KEY_OPS(replays); + +static ssize_t key_icverrors_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct ieee80211_key *key = file->private_data; + char buf[20]; + int len; + + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_AES_CMAC: + len = scnprintf(buf, sizeof(buf), "%u\n", + key->u.aes_cmac.icverrors); + break; + default: + return 0; + } + return simple_read_from_buffer(userbuf, count, ppos, buf, len); +} +KEY_OPS(icverrors); + +static ssize_t key_key_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct ieee80211_key *key = file->private_data; + int i, bufsize = 2 * key->conf.keylen + 2; + char *buf = kmalloc(bufsize, GFP_KERNEL); + char *p = buf; + ssize_t res; + + if (!buf) + return -ENOMEM; + + for (i = 0; i < key->conf.keylen; i++) + p += scnprintf(p, bufsize + buf - p, "%02x", key->conf.key[i]); + p += scnprintf(p, bufsize+buf-p, "\n"); + res = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); + kfree(buf); + return res; +} +KEY_OPS(key); + +#define DEBUGFS_ADD(name) \ + debugfs_create_file(#name, 0400, key->debugfs.dir, \ + key, &key_##name##_ops); + +void ieee80211_debugfs_key_add(struct ieee80211_key *key) + { + static int keycount; + char buf[50]; + struct sta_info *sta; + + if (!key->local->debugfs.keys) + return; + + sprintf(buf, "%d", keycount); + key->debugfs.cnt = keycount; + keycount++; + key->debugfs.dir = debugfs_create_dir(buf, + key->local->debugfs.keys); + + if (!key->debugfs.dir) + return; + + sta = key->sta; + if (sta) { + sprintf(buf, "../../stations/%pM", sta->sta.addr); + key->debugfs.stalink = + debugfs_create_symlink("station", key->debugfs.dir, buf); + } + + DEBUGFS_ADD(keylen); + DEBUGFS_ADD(flags); + DEBUGFS_ADD(keyidx); + DEBUGFS_ADD(hw_key_idx); + DEBUGFS_ADD(tx_rx_count); + DEBUGFS_ADD(algorithm); + DEBUGFS_ADD(tx_spec); + DEBUGFS_ADD(rx_spec); + DEBUGFS_ADD(replays); + DEBUGFS_ADD(icverrors); + DEBUGFS_ADD(key); + DEBUGFS_ADD(ifindex); +}; + +void ieee80211_debugfs_key_remove(struct ieee80211_key *key) +{ + if (!key) + return; + + debugfs_remove_recursive(key->debugfs.dir); + key->debugfs.dir = NULL; +} + +void ieee80211_debugfs_key_update_default(struct ieee80211_sub_if_data *sdata) +{ + char buf[50]; 
+ struct ieee80211_key *key; + + if (!sdata->debugfs.dir) + return; + + lockdep_assert_held(&sdata->local->key_mtx); + + if (sdata->default_unicast_key) { + key = key_mtx_dereference(sdata->local, + sdata->default_unicast_key); + sprintf(buf, "../keys/%d", key->debugfs.cnt); + sdata->debugfs.default_unicast_key = + debugfs_create_symlink("default_unicast_key", + sdata->debugfs.dir, buf); + } else { + debugfs_remove(sdata->debugfs.default_unicast_key); + sdata->debugfs.default_unicast_key = NULL; + } + + if (sdata->default_multicast_key) { + key = key_mtx_dereference(sdata->local, + sdata->default_multicast_key); + sprintf(buf, "../keys/%d", key->debugfs.cnt); + sdata->debugfs.default_multicast_key = + debugfs_create_symlink("default_multicast_key", + sdata->debugfs.dir, buf); + } else { + debugfs_remove(sdata->debugfs.default_multicast_key); + sdata->debugfs.default_multicast_key = NULL; + } +} + +void ieee80211_debugfs_key_add_mgmt_default(struct ieee80211_sub_if_data *sdata) +{ + char buf[50]; + struct ieee80211_key *key; + + if (!sdata->debugfs.dir) + return; + + key = key_mtx_dereference(sdata->local, + sdata->default_mgmt_key); + if (key) { + sprintf(buf, "../keys/%d", key->debugfs.cnt); + sdata->debugfs.default_mgmt_key = + debugfs_create_symlink("default_mgmt_key", + sdata->debugfs.dir, buf); + } else + ieee80211_debugfs_key_remove_mgmt_default(sdata); +} + +void ieee80211_debugfs_key_remove_mgmt_default(struct ieee80211_sub_if_data *sdata) +{ + if (!sdata) + return; + + debugfs_remove(sdata->debugfs.default_mgmt_key); + sdata->debugfs.default_mgmt_key = NULL; +} + +void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key, + struct sta_info *sta) +{ + debugfs_remove(key->debugfs.stalink); + key->debugfs.stalink = NULL; +} diff --git a/net/mac80211/debugfs_key.h b/net/mac80211/debugfs_key.h new file mode 100644 index 00000000..32adc77e --- /dev/null +++ b/net/mac80211/debugfs_key.h @@ -0,0 +1,33 @@ +#ifndef __MAC80211_DEBUGFS_KEY_H +#define __MAC80211_DEBUGFS_KEY_H + +#ifdef CONFIG_MAC80211_DEBUGFS +void ieee80211_debugfs_key_add(struct ieee80211_key *key); +void ieee80211_debugfs_key_remove(struct ieee80211_key *key); +void ieee80211_debugfs_key_update_default(struct ieee80211_sub_if_data *sdata); +void ieee80211_debugfs_key_add_mgmt_default( + struct ieee80211_sub_if_data *sdata); +void ieee80211_debugfs_key_remove_mgmt_default( + struct ieee80211_sub_if_data *sdata); +void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key, + struct sta_info *sta); +#else +static inline void ieee80211_debugfs_key_add(struct ieee80211_key *key) +{} +static inline void ieee80211_debugfs_key_remove(struct ieee80211_key *key) +{} +static inline void ieee80211_debugfs_key_update_default( + struct ieee80211_sub_if_data *sdata) +{} +static inline void ieee80211_debugfs_key_add_mgmt_default( + struct ieee80211_sub_if_data *sdata) +{} +static inline void ieee80211_debugfs_key_remove_mgmt_default( + struct ieee80211_sub_if_data *sdata) +{} +static inline void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key, + struct sta_info *sta) +{} +#endif + +#endif /* __MAC80211_DEBUGFS_KEY_H */ diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c new file mode 100644 index 00000000..9ea7c0d0 --- /dev/null +++ b/net/mac80211/debugfs_netdev.c @@ -0,0 +1,564 @@ +/* + * Copyright (c) 2006 Jiri Benc + * Copyright 2007 Johannes Berg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * 
published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ieee80211_i.h" +#include "rate.h" +#include "debugfs.h" +#include "debugfs_netdev.h" + +static ssize_t ieee80211_if_read( + struct ieee80211_sub_if_data *sdata, + char __user *userbuf, + size_t count, loff_t *ppos, + ssize_t (*format)(const struct ieee80211_sub_if_data *, char *, int)) +{ + char buf[70]; + ssize_t ret = -EINVAL; + + read_lock(&dev_base_lock); + if (sdata->dev->reg_state == NETREG_REGISTERED) + ret = (*format)(sdata, buf, sizeof(buf)); + read_unlock(&dev_base_lock); + + if (ret >= 0) + ret = simple_read_from_buffer(userbuf, count, ppos, buf, ret); + + return ret; +} + +static ssize_t ieee80211_if_write( + struct ieee80211_sub_if_data *sdata, + const char __user *userbuf, + size_t count, loff_t *ppos, + ssize_t (*write)(struct ieee80211_sub_if_data *, const char *, int)) +{ + u8 *buf; + ssize_t ret; + + buf = kmalloc(count, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(buf, userbuf, count)) + goto freebuf; + + ret = -ENODEV; + rtnl_lock(); + if (sdata->dev->reg_state == NETREG_REGISTERED) + ret = (*write)(sdata, buf, count); + rtnl_unlock(); + +freebuf: + kfree(buf); + return ret; +} + +#define IEEE80211_IF_FMT(name, field, format_string) \ +static ssize_t ieee80211_if_fmt_##name( \ + const struct ieee80211_sub_if_data *sdata, char *buf, \ + int buflen) \ +{ \ + return scnprintf(buf, buflen, format_string, sdata->field); \ +} +#define IEEE80211_IF_FMT_DEC(name, field) \ + IEEE80211_IF_FMT(name, field, "%d\n") +#define IEEE80211_IF_FMT_HEX(name, field) \ + IEEE80211_IF_FMT(name, field, "%#x\n") +#define IEEE80211_IF_FMT_LHEX(name, field) \ + IEEE80211_IF_FMT(name, field, "%#lx\n") +#define IEEE80211_IF_FMT_SIZE(name, field) \ + IEEE80211_IF_FMT(name, field, "%zd\n") + +#define IEEE80211_IF_FMT_ATOMIC(name, field) \ +static ssize_t ieee80211_if_fmt_##name( \ + const struct ieee80211_sub_if_data *sdata, \ + char *buf, int buflen) \ +{ \ + return scnprintf(buf, buflen, "%d\n", atomic_read(&sdata->field));\ +} + +#define IEEE80211_IF_FMT_MAC(name, field) \ +static ssize_t ieee80211_if_fmt_##name( \ + const struct ieee80211_sub_if_data *sdata, char *buf, \ + int buflen) \ +{ \ + return scnprintf(buf, buflen, "%pM\n", sdata->field); \ +} + +#define IEEE80211_IF_FMT_DEC_DIV_16(name, field) \ +static ssize_t ieee80211_if_fmt_##name( \ + const struct ieee80211_sub_if_data *sdata, \ + char *buf, int buflen) \ +{ \ + return scnprintf(buf, buflen, "%d\n", sdata->field / 16); \ +} + +#define __IEEE80211_IF_FILE(name, _write) \ +static ssize_t ieee80211_if_read_##name(struct file *file, \ + char __user *userbuf, \ + size_t count, loff_t *ppos) \ +{ \ + return ieee80211_if_read(file->private_data, \ + userbuf, count, ppos, \ + ieee80211_if_fmt_##name); \ +} \ +static const struct file_operations name##_ops = { \ + .read = ieee80211_if_read_##name, \ + .write = (_write), \ + .open = mac80211_open_file_generic, \ + .llseek = generic_file_llseek, \ +} + +#define __IEEE80211_IF_FILE_W(name) \ +static ssize_t ieee80211_if_write_##name(struct file *file, \ + const char __user *userbuf, \ + size_t count, loff_t *ppos) \ +{ \ + return ieee80211_if_write(file->private_data, userbuf, count, \ + ppos, ieee80211_if_parse_##name); \ +} \ +__IEEE80211_IF_FILE(name, ieee80211_if_write_##name) + + +#define IEEE80211_IF_FILE(name, field, format) \ + IEEE80211_IF_FMT_##format(name, field) \ + 
__IEEE80211_IF_FILE(name, NULL) + +/* common attributes */ +IEEE80211_IF_FILE(drop_unencrypted, drop_unencrypted, DEC); +IEEE80211_IF_FILE(rc_rateidx_mask_2ghz, rc_rateidx_mask[IEEE80211_BAND_2GHZ], + HEX); +IEEE80211_IF_FILE(rc_rateidx_mask_5ghz, rc_rateidx_mask[IEEE80211_BAND_5GHZ], + HEX); +IEEE80211_IF_FILE(flags, flags, HEX); +IEEE80211_IF_FILE(state, state, LHEX); +IEEE80211_IF_FILE(channel_type, vif.bss_conf.channel_type, DEC); + +/* STA attributes */ +IEEE80211_IF_FILE(bssid, u.mgd.bssid, MAC); +IEEE80211_IF_FILE(aid, u.mgd.aid, DEC); +IEEE80211_IF_FILE(last_beacon, u.mgd.last_beacon_signal, DEC); +IEEE80211_IF_FILE(ave_beacon, u.mgd.ave_beacon_signal, DEC_DIV_16); + +static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata, + enum ieee80211_smps_mode smps_mode) +{ + struct ieee80211_local *local = sdata->local; + int err; + + if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_STATIC_SMPS) && + smps_mode == IEEE80211_SMPS_STATIC) + return -EINVAL; + + /* auto should be dynamic if in PS mode */ + if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS) && + (smps_mode == IEEE80211_SMPS_DYNAMIC || + smps_mode == IEEE80211_SMPS_AUTOMATIC)) + return -EINVAL; + + /* supported only on managed interfaces for now */ + if (sdata->vif.type != NL80211_IFTYPE_STATION) + return -EOPNOTSUPP; + + mutex_lock(&sdata->u.mgd.mtx); + err = __ieee80211_request_smps(sdata, smps_mode); + mutex_unlock(&sdata->u.mgd.mtx); + + return err; +} + +static const char *smps_modes[IEEE80211_SMPS_NUM_MODES] = { + [IEEE80211_SMPS_AUTOMATIC] = "auto", + [IEEE80211_SMPS_OFF] = "off", + [IEEE80211_SMPS_STATIC] = "static", + [IEEE80211_SMPS_DYNAMIC] = "dynamic", +}; + +static ssize_t ieee80211_if_fmt_smps(const struct ieee80211_sub_if_data *sdata, + char *buf, int buflen) +{ + if (sdata->vif.type != NL80211_IFTYPE_STATION) + return -EOPNOTSUPP; + + return snprintf(buf, buflen, "request: %s\nused: %s\n", + smps_modes[sdata->u.mgd.req_smps], + smps_modes[sdata->u.mgd.ap_smps]); +} + +static ssize_t ieee80211_if_parse_smps(struct ieee80211_sub_if_data *sdata, + const char *buf, int buflen) +{ + enum ieee80211_smps_mode mode; + + for (mode = 0; mode < IEEE80211_SMPS_NUM_MODES; mode++) { + if (strncmp(buf, smps_modes[mode], buflen) == 0) { + int err = ieee80211_set_smps(sdata, mode); + if (!err) + return buflen; + return err; + } + } + + return -EINVAL; +} + +__IEEE80211_IF_FILE_W(smps); + +static ssize_t ieee80211_if_fmt_tkip_mic_test( + const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) +{ + return -EOPNOTSUPP; +} + +static int hwaddr_aton(const char *txt, u8 *addr) +{ + int i; + + for (i = 0; i < ETH_ALEN; i++) { + int a, b; + + a = hex_to_bin(*txt++); + if (a < 0) + return -1; + b = hex_to_bin(*txt++); + if (b < 0) + return -1; + *addr++ = (a << 4) | b; + if (i < 5 && *txt++ != ':') + return -1; + } + + return 0; +} + +static ssize_t ieee80211_if_parse_tkip_mic_test( + struct ieee80211_sub_if_data *sdata, const char *buf, int buflen) +{ + struct ieee80211_local *local = sdata->local; + u8 addr[ETH_ALEN]; + struct sk_buff *skb; + struct ieee80211_hdr *hdr; + __le16 fc; + + /* + * Assume colon-delimited MAC address with possible white space + * following. 
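+ * The address is parsed with hwaddr_aton() above, i.e. six
+ * ':'-separated hex octets ("xx:xx:xx:xx:xx:xx").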
+ */ + if (buflen < 3 * ETH_ALEN - 1) + return -EINVAL; + if (hwaddr_aton(buf, addr) < 0) + return -EINVAL; + + if (!ieee80211_sdata_running(sdata)) + return -ENOTCONN; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + 24 + 100); + if (!skb) + return -ENOMEM; + skb_reserve(skb, local->hw.extra_tx_headroom); + + hdr = (struct ieee80211_hdr *) skb_put(skb, 24); + memset(hdr, 0, 24); + fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA); + + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP: + fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS); + /* DA BSSID SA */ + memcpy(hdr->addr1, addr, ETH_ALEN); + memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); + memcpy(hdr->addr3, sdata->vif.addr, ETH_ALEN); + break; + case NL80211_IFTYPE_STATION: + fc |= cpu_to_le16(IEEE80211_FCTL_TODS); + /* BSSID SA DA */ + if (sdata->vif.bss_conf.bssid == NULL) { + dev_kfree_skb(skb); + return -ENOTCONN; + } + memcpy(hdr->addr1, sdata->vif.bss_conf.bssid, ETH_ALEN); + memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); + memcpy(hdr->addr3, addr, ETH_ALEN); + break; + default: + dev_kfree_skb(skb); + return -EOPNOTSUPP; + } + hdr->frame_control = fc; + + /* + * Add some length to the test frame to make it look bit more valid. + * The exact contents does not matter since the recipient is required + * to drop this because of the Michael MIC failure. + */ + memset(skb_put(skb, 50), 0, 50); + + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_TKIP_MIC_FAILURE; + + ieee80211_tx_skb(sdata, skb); + + return buflen; +} + +__IEEE80211_IF_FILE_W(tkip_mic_test); + +/* AP attributes */ +IEEE80211_IF_FILE(num_sta_ps, u.ap.num_sta_ps, ATOMIC); +IEEE80211_IF_FILE(dtim_count, u.ap.dtim_count, DEC); + +static ssize_t ieee80211_if_fmt_num_buffered_multicast( + const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) +{ + return scnprintf(buf, buflen, "%u\n", + skb_queue_len(&sdata->u.ap.ps_bc_buf)); +} +__IEEE80211_IF_FILE(num_buffered_multicast, NULL); + +/* WDS attributes */ +IEEE80211_IF_FILE(peer, u.wds.remote_addr, MAC); + +#ifdef CONFIG_MAC80211_MESH +/* Mesh stats attributes */ +IEEE80211_IF_FILE(fwded_mcast, u.mesh.mshstats.fwded_mcast, DEC); +IEEE80211_IF_FILE(fwded_unicast, u.mesh.mshstats.fwded_unicast, DEC); +IEEE80211_IF_FILE(fwded_frames, u.mesh.mshstats.fwded_frames, DEC); +IEEE80211_IF_FILE(dropped_frames_ttl, u.mesh.mshstats.dropped_frames_ttl, DEC); +IEEE80211_IF_FILE(dropped_frames_no_route, + u.mesh.mshstats.dropped_frames_no_route, DEC); +IEEE80211_IF_FILE(estab_plinks, u.mesh.mshstats.estab_plinks, ATOMIC); + +/* Mesh parameters */ +IEEE80211_IF_FILE(dot11MeshMaxRetries, + u.mesh.mshcfg.dot11MeshMaxRetries, DEC); +IEEE80211_IF_FILE(dot11MeshRetryTimeout, + u.mesh.mshcfg.dot11MeshRetryTimeout, DEC); +IEEE80211_IF_FILE(dot11MeshConfirmTimeout, + u.mesh.mshcfg.dot11MeshConfirmTimeout, DEC); +IEEE80211_IF_FILE(dot11MeshHoldingTimeout, + u.mesh.mshcfg.dot11MeshHoldingTimeout, DEC); +IEEE80211_IF_FILE(dot11MeshTTL, u.mesh.mshcfg.dot11MeshTTL, DEC); +IEEE80211_IF_FILE(element_ttl, u.mesh.mshcfg.element_ttl, DEC); +IEEE80211_IF_FILE(auto_open_plinks, u.mesh.mshcfg.auto_open_plinks, DEC); +IEEE80211_IF_FILE(dot11MeshMaxPeerLinks, + u.mesh.mshcfg.dot11MeshMaxPeerLinks, DEC); +IEEE80211_IF_FILE(dot11MeshHWMPactivePathTimeout, + u.mesh.mshcfg.dot11MeshHWMPactivePathTimeout, DEC); +IEEE80211_IF_FILE(dot11MeshHWMPpreqMinInterval, + u.mesh.mshcfg.dot11MeshHWMPpreqMinInterval, DEC); +IEEE80211_IF_FILE(dot11MeshHWMPnetDiameterTraversalTime, + u.mesh.mshcfg.dot11MeshHWMPnetDiameterTraversalTime, DEC); 
+IEEE80211_IF_FILE(dot11MeshHWMPmaxPREQretries, + u.mesh.mshcfg.dot11MeshHWMPmaxPREQretries, DEC); +IEEE80211_IF_FILE(path_refresh_time, + u.mesh.mshcfg.path_refresh_time, DEC); +IEEE80211_IF_FILE(min_discovery_timeout, + u.mesh.mshcfg.min_discovery_timeout, DEC); +IEEE80211_IF_FILE(dot11MeshHWMPRootMode, + u.mesh.mshcfg.dot11MeshHWMPRootMode, DEC); +#endif + + +#define DEBUGFS_ADD(name) \ + debugfs_create_file(#name, 0400, sdata->debugfs.dir, \ + sdata, &name##_ops); + +#define DEBUGFS_ADD_MODE(name, mode) \ + debugfs_create_file(#name, mode, sdata->debugfs.dir, \ + sdata, &name##_ops); + +static void add_sta_files(struct ieee80211_sub_if_data *sdata) +{ + DEBUGFS_ADD(drop_unencrypted); + DEBUGFS_ADD(flags); + DEBUGFS_ADD(state); + DEBUGFS_ADD(channel_type); + DEBUGFS_ADD(rc_rateidx_mask_2ghz); + DEBUGFS_ADD(rc_rateidx_mask_5ghz); + + DEBUGFS_ADD(bssid); + DEBUGFS_ADD(aid); + DEBUGFS_ADD(last_beacon); + DEBUGFS_ADD(ave_beacon); + DEBUGFS_ADD_MODE(smps, 0600); + DEBUGFS_ADD_MODE(tkip_mic_test, 0200); +} + +static void add_ap_files(struct ieee80211_sub_if_data *sdata) +{ + DEBUGFS_ADD(drop_unencrypted); + DEBUGFS_ADD(flags); + DEBUGFS_ADD(state); + DEBUGFS_ADD(channel_type); + DEBUGFS_ADD(rc_rateidx_mask_2ghz); + DEBUGFS_ADD(rc_rateidx_mask_5ghz); + + DEBUGFS_ADD(num_sta_ps); + DEBUGFS_ADD(dtim_count); + DEBUGFS_ADD(num_buffered_multicast); + DEBUGFS_ADD_MODE(tkip_mic_test, 0200); +} + +static void add_wds_files(struct ieee80211_sub_if_data *sdata) +{ + DEBUGFS_ADD(drop_unencrypted); + DEBUGFS_ADD(flags); + DEBUGFS_ADD(state); + DEBUGFS_ADD(channel_type); + DEBUGFS_ADD(rc_rateidx_mask_2ghz); + DEBUGFS_ADD(rc_rateidx_mask_5ghz); + + DEBUGFS_ADD(peer); +} + +static void add_vlan_files(struct ieee80211_sub_if_data *sdata) +{ + DEBUGFS_ADD(drop_unencrypted); + DEBUGFS_ADD(flags); + DEBUGFS_ADD(state); + DEBUGFS_ADD(channel_type); + DEBUGFS_ADD(rc_rateidx_mask_2ghz); + DEBUGFS_ADD(rc_rateidx_mask_5ghz); +} + +static void add_monitor_files(struct ieee80211_sub_if_data *sdata) +{ + DEBUGFS_ADD(flags); + DEBUGFS_ADD(state); + DEBUGFS_ADD(channel_type); +} + +#ifdef CONFIG_MAC80211_MESH + +static void add_mesh_stats(struct ieee80211_sub_if_data *sdata) +{ + struct dentry *dir = debugfs_create_dir("mesh_stats", + sdata->debugfs.dir); + +#define MESHSTATS_ADD(name)\ + debugfs_create_file(#name, 0400, dir, sdata, &name##_ops); + + MESHSTATS_ADD(fwded_mcast); + MESHSTATS_ADD(fwded_unicast); + MESHSTATS_ADD(fwded_frames); + MESHSTATS_ADD(dropped_frames_ttl); + MESHSTATS_ADD(dropped_frames_no_route); + MESHSTATS_ADD(estab_plinks); +#undef MESHSTATS_ADD +} + +static void add_mesh_config(struct ieee80211_sub_if_data *sdata) +{ + struct dentry *dir = debugfs_create_dir("mesh_config", + sdata->debugfs.dir); + +#define MESHPARAMS_ADD(name) \ + debugfs_create_file(#name, 0600, dir, sdata, &name##_ops); + + MESHPARAMS_ADD(dot11MeshMaxRetries); + MESHPARAMS_ADD(dot11MeshRetryTimeout); + MESHPARAMS_ADD(dot11MeshConfirmTimeout); + MESHPARAMS_ADD(dot11MeshHoldingTimeout); + MESHPARAMS_ADD(dot11MeshTTL); + MESHPARAMS_ADD(element_ttl); + MESHPARAMS_ADD(auto_open_plinks); + MESHPARAMS_ADD(dot11MeshMaxPeerLinks); + MESHPARAMS_ADD(dot11MeshHWMPactivePathTimeout); + MESHPARAMS_ADD(dot11MeshHWMPpreqMinInterval); + MESHPARAMS_ADD(dot11MeshHWMPnetDiameterTraversalTime); + MESHPARAMS_ADD(dot11MeshHWMPmaxPREQretries); + MESHPARAMS_ADD(path_refresh_time); + MESHPARAMS_ADD(min_discovery_timeout); + +#undef MESHPARAMS_ADD +} +#endif + +static void add_files(struct ieee80211_sub_if_data *sdata) +{ + if (!sdata->debugfs.dir) + 
return; + + switch (sdata->vif.type) { + case NL80211_IFTYPE_MESH_POINT: +#ifdef CONFIG_MAC80211_MESH + add_mesh_stats(sdata); + add_mesh_config(sdata); +#endif + break; + case NL80211_IFTYPE_STATION: + add_sta_files(sdata); + break; + case NL80211_IFTYPE_ADHOC: + /* XXX */ + break; + case NL80211_IFTYPE_AP: + add_ap_files(sdata); + break; + case NL80211_IFTYPE_WDS: + add_wds_files(sdata); + break; + case NL80211_IFTYPE_MONITOR: + add_monitor_files(sdata); + break; + case NL80211_IFTYPE_AP_VLAN: + add_vlan_files(sdata); + break; + default: + break; + } +} + +void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata) +{ + char buf[10+IFNAMSIZ]; + + sprintf(buf, "netdev:%s", sdata->name); + sdata->debugfs.dir = debugfs_create_dir(buf, + sdata->local->hw.wiphy->debugfsdir); + if (sdata->debugfs.dir) + sdata->debugfs.subdir_stations = debugfs_create_dir("stations", + sdata->debugfs.dir); + add_files(sdata); +} + +void ieee80211_debugfs_remove_netdev(struct ieee80211_sub_if_data *sdata) +{ + if (!sdata->debugfs.dir) + return; + + debugfs_remove_recursive(sdata->debugfs.dir); + sdata->debugfs.dir = NULL; +} + +void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata) +{ + struct dentry *dir; + char buf[10 + IFNAMSIZ]; + + dir = sdata->debugfs.dir; + + if (!dir) + return; + + sprintf(buf, "netdev:%s", sdata->name); + if (!debugfs_rename(dir->d_parent, dir, dir->d_parent, buf)) + printk(KERN_ERR "mac80211: debugfs: failed to rename debugfs " + "dir to %s\n", buf); +} diff --git a/net/mac80211/debugfs_netdev.h b/net/mac80211/debugfs_netdev.h new file mode 100644 index 00000000..79025e79 --- /dev/null +++ b/net/mac80211/debugfs_netdev.h @@ -0,0 +1,22 @@ +/* routines exported for debugfs handling */ + +#ifndef __IEEE80211_DEBUGFS_NETDEV_H +#define __IEEE80211_DEBUGFS_NETDEV_H + +#ifdef CONFIG_MAC80211_DEBUGFS +void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata); +void ieee80211_debugfs_remove_netdev(struct ieee80211_sub_if_data *sdata); +void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata); +#else +static inline void ieee80211_debugfs_add_netdev( + struct ieee80211_sub_if_data *sdata) +{} +static inline void ieee80211_debugfs_remove_netdev( + struct ieee80211_sub_if_data *sdata) +{} +static inline void ieee80211_debugfs_rename_netdev( + struct ieee80211_sub_if_data *sdata) +{} +#endif + +#endif /* __IEEE80211_DEBUGFS_NETDEV_H */ diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c new file mode 100644 index 00000000..a01d2137 --- /dev/null +++ b/net/mac80211/debugfs_sta.c @@ -0,0 +1,377 @@ +/* + * Copyright 2003-2005 Devicescape Software, Inc. + * Copyright (c) 2006 Jiri Benc + * Copyright 2007 Johannes Berg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include "ieee80211_i.h" +#include "debugfs.h" +#include "debugfs_sta.h" +#include "sta_info.h" + +/* sta attributes */ + +#define STA_READ(name, field, format_string) \ +static ssize_t sta_ ##name## _read(struct file *file, \ + char __user *userbuf, \ + size_t count, loff_t *ppos) \ +{ \ + struct sta_info *sta = file->private_data; \ + return mac80211_format_buffer(userbuf, count, ppos, \ + format_string, sta->field); \ +} +#define STA_READ_D(name, field) STA_READ(name, field, "%d\n") +#define STA_READ_U(name, field) STA_READ(name, field, "%u\n") +#define STA_READ_S(name, field) STA_READ(name, field, "%s\n") + +#define STA_OPS(name) \ +static const struct file_operations sta_ ##name## _ops = { \ + .read = sta_##name##_read, \ + .open = mac80211_open_file_generic, \ + .llseek = generic_file_llseek, \ +} + +#define STA_OPS_RW(name) \ +static const struct file_operations sta_ ##name## _ops = { \ + .read = sta_##name##_read, \ + .write = sta_##name##_write, \ + .open = mac80211_open_file_generic, \ + .llseek = generic_file_llseek, \ +} + +#define STA_FILE(name, field, format) \ + STA_READ_##format(name, field) \ + STA_OPS(name) + +STA_FILE(aid, sta.aid, D); +STA_FILE(dev, sdata->name, S); +STA_FILE(last_signal, last_signal, D); + +static ssize_t sta_flags_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + char buf[100]; + struct sta_info *sta = file->private_data; + u32 staflags = get_sta_flags(sta); + int res = scnprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s", + staflags & WLAN_STA_AUTH ? "AUTH\n" : "", + staflags & WLAN_STA_ASSOC ? "ASSOC\n" : "", + staflags & WLAN_STA_PS_STA ? "PS (sta)\n" : "", + staflags & WLAN_STA_PS_DRIVER ? "PS (driver)\n" : "", + staflags & WLAN_STA_AUTHORIZED ? "AUTHORIZED\n" : "", + staflags & WLAN_STA_SHORT_PREAMBLE ? "SHORT PREAMBLE\n" : "", + staflags & WLAN_STA_WME ? "WME\n" : "", + staflags & WLAN_STA_WDS ? "WDS\n" : "", + staflags & WLAN_STA_MFP ?
"MFP\n" : ""); + return simple_read_from_buffer(userbuf, count, ppos, buf, res); +} +STA_OPS(flags); + +static ssize_t sta_num_ps_buf_frames_read(struct file *file, + char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct sta_info *sta = file->private_data; + return mac80211_format_buffer(userbuf, count, ppos, "%u\n", + skb_queue_len(&sta->ps_tx_buf)); +} +STA_OPS(num_ps_buf_frames); + +static ssize_t sta_inactive_ms_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct sta_info *sta = file->private_data; + return mac80211_format_buffer(userbuf, count, ppos, "%d\n", + jiffies_to_msecs(jiffies - sta->last_rx)); +} +STA_OPS(inactive_ms); + + +static ssize_t sta_connected_time_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct sta_info *sta = file->private_data; + struct timespec uptime; + struct tm result; + long connected_time_secs; + char buf[100]; + int res; + do_posix_clock_monotonic_gettime(&uptime); + connected_time_secs = uptime.tv_sec - sta->last_connected; + time_to_tm(connected_time_secs, 0, &result); + result.tm_year -= 70; + result.tm_mday -= 1; + res = scnprintf(buf, sizeof(buf), + "years - %ld\nmonths - %d\ndays - %d\nclock - %d:%d:%d\n\n", + result.tm_year, result.tm_mon, result.tm_mday, + result.tm_hour, result.tm_min, result.tm_sec); + return simple_read_from_buffer(userbuf, count, ppos, buf, res); +} +STA_OPS(connected_time); + + + +static ssize_t sta_last_seq_ctrl_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + char buf[15*NUM_RX_DATA_QUEUES], *p = buf; + int i; + struct sta_info *sta = file->private_data; + for (i = 0; i < NUM_RX_DATA_QUEUES; i++) + p += scnprintf(p, sizeof(buf)+buf-p, "%x ", + le16_to_cpu(sta->last_seq_ctrl[i])); + p += scnprintf(p, sizeof(buf)+buf-p, "\n"); + return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); +} +STA_OPS(last_seq_ctrl); + +static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + char buf[71 + STA_TID_NUM * 40], *p = buf; + int i; + struct sta_info *sta = file->private_data; + struct tid_ampdu_rx *tid_rx; + struct tid_ampdu_tx *tid_tx; + + rcu_read_lock(); + + p += scnprintf(p, sizeof(buf) + buf - p, "next dialog_token: %#02x\n", + sta->ampdu_mlme.dialog_token_allocator + 1); + p += scnprintf(p, sizeof(buf) + buf - p, + "TID\t\tRX active\tDTKN\tSSN\t\tTX\tDTKN\tpending\n"); + + for (i = 0; i < STA_TID_NUM; i++) { + tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[i]); + tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[i]); + + p += scnprintf(p, sizeof(buf) + buf - p, "%02d", i); + p += scnprintf(p, sizeof(buf) + buf - p, "\t\t%x", !!tid_rx); + p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.2x", + tid_rx ? tid_rx->dialog_token : 0); + p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.3x", + tid_rx ? tid_rx->ssn : 0); + + p += scnprintf(p, sizeof(buf) + buf - p, "\t\t%x", !!tid_tx); + p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.2x", + tid_tx ? tid_tx->dialog_token : 0); + p += scnprintf(p, sizeof(buf) + buf - p, "\t%03d", + tid_tx ? 
skb_queue_len(&tid_tx->pending) : 0); + p += scnprintf(p, sizeof(buf) + buf - p, "\n"); + } + rcu_read_unlock(); + + return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); +} + +static ssize_t sta_agg_status_write(struct file *file, const char __user *userbuf, + size_t count, loff_t *ppos) +{ + char _buf[12], *buf = _buf; + struct sta_info *sta = file->private_data; + bool start, tx; + unsigned long tid; + int ret; + + if (count > sizeof(_buf)) + return -EINVAL; + + if (copy_from_user(buf, userbuf, count)) + return -EFAULT; + + buf[sizeof(_buf) - 1] = '\0'; + + if (strncmp(buf, "tx ", 3) == 0) { + buf += 3; + tx = true; + } else if (strncmp(buf, "rx ", 3) == 0) { + buf += 3; + tx = false; + } else + return -EINVAL; + + if (strncmp(buf, "start ", 6) == 0) { + buf += 6; + start = true; + if (!tx) + return -EINVAL; + } else if (strncmp(buf, "stop ", 5) == 0) { + buf += 5; + start = false; + } else + return -EINVAL; + + tid = simple_strtoul(buf, NULL, 0); + + if (tid >= STA_TID_NUM) + return -EINVAL; + + if (tx) { + if (start) + ret = ieee80211_start_tx_ba_session(&sta->sta, tid, 5000); + else + ret = ieee80211_stop_tx_ba_session(&sta->sta, tid); + } else { + __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_RECIPIENT, + 3, true); + ret = 0; + } + + return ret ?: count; +} +STA_OPS_RW(agg_status); + +static ssize_t sta_ht_capa_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ +#define PRINT_HT_CAP(_cond, _str) \ + do { \ + if (_cond) \ + p += scnprintf(p, sizeof(buf)+buf-p, "\t" _str "\n"); \ + } while (0) + char buf[512], *p = buf; + int i; + struct sta_info *sta = file->private_data; + struct ieee80211_sta_ht_cap *htc = &sta->sta.ht_cap; + + p += scnprintf(p, sizeof(buf) + buf - p, "ht %ssupported\n", + htc->ht_supported ? "" : "not "); + if (htc->ht_supported) { + p += scnprintf(p, sizeof(buf)+buf-p, "cap: %#.4x\n", htc->cap); + + PRINT_HT_CAP((htc->cap & BIT(0)), "RX LDPC"); + PRINT_HT_CAP((htc->cap & BIT(1)), "HT20/HT40"); + PRINT_HT_CAP(!(htc->cap & BIT(1)), "HT20"); + + PRINT_HT_CAP(((htc->cap >> 2) & 0x3) == 0, "Static SM Power Save"); + PRINT_HT_CAP(((htc->cap >> 2) & 0x3) == 1, "Dynamic SM Power Save"); + PRINT_HT_CAP(((htc->cap >> 2) & 0x3) == 3, "SM Power Save disabled"); + + PRINT_HT_CAP((htc->cap & BIT(4)), "RX Greenfield"); + PRINT_HT_CAP((htc->cap & BIT(5)), "RX HT20 SGI"); + PRINT_HT_CAP((htc->cap & BIT(6)), "RX HT40 SGI"); + PRINT_HT_CAP((htc->cap & BIT(7)), "TX STBC"); + + PRINT_HT_CAP(((htc->cap >> 8) & 0x3) == 0, "No RX STBC"); + PRINT_HT_CAP(((htc->cap >> 8) & 0x3) == 1, "RX STBC 1-stream"); + PRINT_HT_CAP(((htc->cap >> 8) & 0x3) == 2, "RX STBC 2-streams"); + PRINT_HT_CAP(((htc->cap >> 8) & 0x3) == 3, "RX STBC 3-streams"); + + PRINT_HT_CAP((htc->cap & BIT(10)), "HT Delayed Block Ack"); + + PRINT_HT_CAP((htc->cap & BIT(11)), "Max AMSDU length: " + "3839 bytes"); + PRINT_HT_CAP(!(htc->cap & BIT(11)), "Max AMSDU length: " + "7935 bytes"); + + /* + * For beacons and probe response this would mean the BSS + * does or does not allow the usage of DSSS/CCK HT40. + * Otherwise it means the STA does or does not use + * DSSS/CCK HT40. 
+ */ + PRINT_HT_CAP((htc->cap & BIT(12)), "DSSS/CCK HT40"); + PRINT_HT_CAP(!(htc->cap & BIT(12)), "No DSSS/CCK HT40"); + + /* BIT(13) is reserved */ + + PRINT_HT_CAP((htc->cap & BIT(14)), "40 MHz Intolerant"); + + PRINT_HT_CAP((htc->cap & BIT(15)), "L-SIG TXOP protection"); + + p += scnprintf(p, sizeof(buf)+buf-p, "ampdu factor/density: %d/%d\n", + htc->ampdu_factor, htc->ampdu_density); + p += scnprintf(p, sizeof(buf)+buf-p, "MCS mask:"); + + for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) + p += scnprintf(p, sizeof(buf)+buf-p, " %.2x", + htc->mcs.rx_mask[i]); + p += scnprintf(p, sizeof(buf)+buf-p, "\n"); + + /* If not set this is meaningless */ + if (le16_to_cpu(htc->mcs.rx_highest)) { + p += scnprintf(p, sizeof(buf)+buf-p, + "MCS rx highest: %d Mbps\n", + le16_to_cpu(htc->mcs.rx_highest)); + } + + p += scnprintf(p, sizeof(buf)+buf-p, "MCS tx params: %x\n", + htc->mcs.tx_params); + } + + return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); +} +STA_OPS(ht_capa); + +#define DEBUGFS_ADD(name) \ + debugfs_create_file(#name, 0400, \ + sta->debugfs.dir, sta, &sta_ ##name## _ops); + +#define DEBUGFS_ADD_COUNTER(name, field) \ + if (sizeof(sta->field) == sizeof(u32)) \ + debugfs_create_u32(#name, 0400, sta->debugfs.dir, \ + (u32 *) &sta->field); \ + else \ + debugfs_create_u64(#name, 0400, sta->debugfs.dir, \ + (u64 *) &sta->field); + +void ieee80211_sta_debugfs_add(struct sta_info *sta) +{ + struct dentry *stations_dir = sta->sdata->debugfs.subdir_stations; + u8 mac[3*ETH_ALEN]; + + sta->debugfs.add_has_run = true; + + if (!stations_dir) + return; + + snprintf(mac, sizeof(mac), "%pM", sta->sta.addr); + + /* + * This might fail due to a race condition: + * When mac80211 unlinks a station, the debugfs entries + * remain, but it is already possible to link a new + * station with the same address which triggers adding + * it to debugfs; therefore, if the old station isn't + * destroyed quickly enough the old station's debugfs + * dir might still be around. 
+ */ + sta->debugfs.dir = debugfs_create_dir(mac, stations_dir); + if (!sta->debugfs.dir) + return; + + DEBUGFS_ADD(flags); + DEBUGFS_ADD(num_ps_buf_frames); + DEBUGFS_ADD(inactive_ms); + DEBUGFS_ADD(connected_time); + DEBUGFS_ADD(last_seq_ctrl); + DEBUGFS_ADD(agg_status); + DEBUGFS_ADD(dev); + DEBUGFS_ADD(last_signal); + DEBUGFS_ADD(ht_capa); + + DEBUGFS_ADD_COUNTER(rx_packets, rx_packets); + DEBUGFS_ADD_COUNTER(tx_packets, tx_packets); + DEBUGFS_ADD_COUNTER(rx_bytes, rx_bytes); + DEBUGFS_ADD_COUNTER(tx_bytes, tx_bytes); + DEBUGFS_ADD_COUNTER(rx_duplicates, num_duplicates); + DEBUGFS_ADD_COUNTER(rx_fragments, rx_fragments); + DEBUGFS_ADD_COUNTER(rx_dropped, rx_dropped); + DEBUGFS_ADD_COUNTER(tx_fragments, tx_fragments); + DEBUGFS_ADD_COUNTER(tx_filtered, tx_filtered_count); + DEBUGFS_ADD_COUNTER(tx_retry_failed, tx_retry_failed); + DEBUGFS_ADD_COUNTER(tx_retry_count, tx_retry_count); + DEBUGFS_ADD_COUNTER(wep_weak_iv_count, wep_weak_iv_count); +} + +void ieee80211_sta_debugfs_remove(struct sta_info *sta) +{ + debugfs_remove_recursive(sta->debugfs.dir); + sta->debugfs.dir = NULL; +} diff --git a/net/mac80211/debugfs_sta.h b/net/mac80211/debugfs_sta.h new file mode 100644 index 00000000..8b608903 --- /dev/null +++ b/net/mac80211/debugfs_sta.h @@ -0,0 +1,14 @@ +#ifndef __MAC80211_DEBUGFS_STA_H +#define __MAC80211_DEBUGFS_STA_H + +#include "sta_info.h" + +#ifdef CONFIG_MAC80211_DEBUGFS +void ieee80211_sta_debugfs_add(struct sta_info *sta); +void ieee80211_sta_debugfs_remove(struct sta_info *sta); +#else +static inline void ieee80211_sta_debugfs_add(struct sta_info *sta) {} +static inline void ieee80211_sta_debugfs_remove(struct sta_info *sta) {} +#endif + +#endif /* __MAC80211_DEBUGFS_STA_H */ diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h new file mode 100644 index 00000000..eebf7a67 --- /dev/null +++ b/net/mac80211/driver-ops.h @@ -0,0 +1,640 @@ +#ifndef __MAC80211_DRIVER_OPS +#define __MAC80211_DRIVER_OPS + +#include +#include "ieee80211_i.h" +#include "driver-trace.h" + +static inline void drv_tx(struct ieee80211_local *local, struct sk_buff *skb) +{ + local->ops->tx(&local->hw, skb); +} + +static inline int drv_start(struct ieee80211_local *local) +{ + int ret; + + might_sleep(); + + trace_drv_start(local); + local->started = true; + smp_mb(); + ret = local->ops->start(&local->hw); + trace_drv_return_int(local, ret); + return ret; +} + +static inline void drv_stop(struct ieee80211_local *local) +{ + might_sleep(); + + trace_drv_stop(local); + local->ops->stop(&local->hw); + trace_drv_return_void(local); + + /* sync away all work on the tasklet before clearing started */ + tasklet_disable(&local->tasklet); + tasklet_enable(&local->tasklet); + + barrier(); + + local->started = false; +} + +#ifdef CONFIG_PM +static inline int drv_suspend(struct ieee80211_local *local, + struct cfg80211_wowlan *wowlan) +{ + int ret; + + might_sleep(); + + trace_drv_suspend(local); + ret = local->ops->suspend(&local->hw, wowlan); + trace_drv_return_int(local, ret); + return ret; +} + +static inline int drv_resume(struct ieee80211_local *local) +{ + int ret; + + might_sleep(); + + trace_drv_resume(local); + ret = local->ops->resume(&local->hw); + trace_drv_return_int(local, ret); + return ret; +} +#endif + +static inline int drv_add_interface(struct ieee80211_local *local, + struct ieee80211_vif *vif) +{ + int ret; + + might_sleep(); + + trace_drv_add_interface(local, vif_to_sdata(vif)); + ret = local->ops->add_interface(&local->hw, vif); + trace_drv_return_int(local, ret); + return ret; +} 
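+/*
+ * The remaining drv_*() wrappers below follow the same pattern as the
+ * ones above: trace the call, invoke the driver op (checking for NULL
+ * where the op is optional) and trace the return value.
+ */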
+ +static inline int drv_change_interface(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum nl80211_iftype type, bool p2p) +{ + int ret; + + might_sleep(); + + trace_drv_change_interface(local, sdata, type, p2p); + ret = local->ops->change_interface(&local->hw, &sdata->vif, type, p2p); + trace_drv_return_int(local, ret); + return ret; +} + +static inline void drv_remove_interface(struct ieee80211_local *local, + struct ieee80211_vif *vif) +{ + might_sleep(); + + trace_drv_remove_interface(local, vif_to_sdata(vif)); + local->ops->remove_interface(&local->hw, vif); + trace_drv_return_void(local); +} + +static inline int drv_config(struct ieee80211_local *local, u32 changed) +{ + int ret; + + might_sleep(); + + trace_drv_config(local, changed); + ret = local->ops->config(&local->hw, changed); + trace_drv_return_int(local, ret); + return ret; +} + +static inline void drv_bss_info_changed(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_bss_conf *info, + u32 changed) +{ + might_sleep(); + + trace_drv_bss_info_changed(local, sdata, info, changed); + if (local->ops->bss_info_changed) + local->ops->bss_info_changed(&local->hw, &sdata->vif, info, changed); + trace_drv_return_void(local); +} + +static inline u64 drv_prepare_multicast(struct ieee80211_local *local, + struct netdev_hw_addr_list *mc_list) +{ + u64 ret = 0; + + trace_drv_prepare_multicast(local, mc_list->count); + + if (local->ops->prepare_multicast) + ret = local->ops->prepare_multicast(&local->hw, mc_list); + + trace_drv_return_u64(local, ret); + + return ret; +} + +static inline void drv_configure_filter(struct ieee80211_local *local, + unsigned int changed_flags, + unsigned int *total_flags, + u64 multicast) +{ + might_sleep(); + + trace_drv_configure_filter(local, changed_flags, total_flags, + multicast); + local->ops->configure_filter(&local->hw, changed_flags, total_flags, + multicast); + trace_drv_return_void(local); +} + +static inline int drv_set_tim(struct ieee80211_local *local, + struct ieee80211_sta *sta, bool set) +{ + int ret = 0; + trace_drv_set_tim(local, sta, set); + if (local->ops->set_tim) + ret = local->ops->set_tim(&local->hw, sta, set); + trace_drv_return_int(local, ret); + return ret; +} + +static inline int drv_set_key(struct ieee80211_local *local, + enum set_key_cmd cmd, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta, + struct ieee80211_key_conf *key) +{ + int ret; + + might_sleep(); + + trace_drv_set_key(local, cmd, sdata, sta, key); + ret = local->ops->set_key(&local->hw, cmd, &sdata->vif, sta, key); + trace_drv_return_int(local, ret); + return ret; +} + +static inline void drv_update_tkip_key(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_key_conf *conf, + struct sta_info *sta, u32 iv32, + u16 *phase1key) +{ + struct ieee80211_sta *ista = NULL; + + if (sta) + ista = &sta->sta; + + trace_drv_update_tkip_key(local, sdata, conf, ista, iv32); + if (local->ops->update_tkip_key) + local->ops->update_tkip_key(&local->hw, &sdata->vif, conf, + ista, iv32, phase1key); + trace_drv_return_void(local); +} + +static inline int drv_hw_scan(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct cfg80211_scan_request *req) +{ + int ret; + + might_sleep(); + + trace_drv_hw_scan(local, sdata); + ret = local->ops->hw_scan(&local->hw, &sdata->vif, req); + trace_drv_return_int(local, ret); + return ret; +} + +static inline int +drv_sched_scan_start(struct ieee80211_local 
*local, + struct ieee80211_sub_if_data *sdata, + struct cfg80211_sched_scan_request *req, + struct ieee80211_sched_scan_ies *ies) +{ + int ret; + + might_sleep(); + + trace_drv_sched_scan_start(local, sdata); + ret = local->ops->sched_scan_start(&local->hw, &sdata->vif, + req, ies); + trace_drv_return_int(local, ret); + return ret; +} + +static inline void drv_sched_scan_stop(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + might_sleep(); + + trace_drv_sched_scan_stop(local, sdata); + local->ops->sched_scan_stop(&local->hw, &sdata->vif); + trace_drv_return_void(local); +} + +static inline void drv_sw_scan_start(struct ieee80211_local *local) +{ + might_sleep(); + + trace_drv_sw_scan_start(local); + if (local->ops->sw_scan_start) + local->ops->sw_scan_start(&local->hw); + trace_drv_return_void(local); +} + +static inline void drv_sw_scan_complete(struct ieee80211_local *local) +{ + might_sleep(); + + trace_drv_sw_scan_complete(local); + if (local->ops->sw_scan_complete) + local->ops->sw_scan_complete(&local->hw); + trace_drv_return_void(local); +} + +static inline int drv_get_stats(struct ieee80211_local *local, + struct ieee80211_low_level_stats *stats) +{ + int ret = -EOPNOTSUPP; + + might_sleep(); + + if (local->ops->get_stats) + ret = local->ops->get_stats(&local->hw, stats); + trace_drv_get_stats(local, stats, ret); + + return ret; +} + +static inline void drv_get_tkip_seq(struct ieee80211_local *local, + u8 hw_key_idx, u32 *iv32, u16 *iv16) +{ + if (local->ops->get_tkip_seq) + local->ops->get_tkip_seq(&local->hw, hw_key_idx, iv32, iv16); + trace_drv_get_tkip_seq(local, hw_key_idx, iv32, iv16); +} + +static inline int drv_set_frag_threshold(struct ieee80211_local *local, + u32 value) +{ + int ret = 0; + + might_sleep(); + + trace_drv_set_frag_threshold(local, value); + if (local->ops->set_frag_threshold) + ret = local->ops->set_frag_threshold(&local->hw, value); + trace_drv_return_int(local, ret); + return ret; +} + +static inline int drv_set_rts_threshold(struct ieee80211_local *local, + u32 value) +{ + int ret = 0; + + might_sleep(); + + trace_drv_set_rts_threshold(local, value); + if (local->ops->set_rts_threshold) + ret = local->ops->set_rts_threshold(&local->hw, value); + trace_drv_return_int(local, ret); + return ret; +} + +static inline int drv_set_coverage_class(struct ieee80211_local *local, + u8 value) +{ + int ret = 0; + might_sleep(); + + trace_drv_set_coverage_class(local, value); + if (local->ops->set_coverage_class) + local->ops->set_coverage_class(&local->hw, value); + else + ret = -EOPNOTSUPP; + + trace_drv_return_int(local, ret); + return ret; +} + +static inline void drv_sta_notify(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum sta_notify_cmd cmd, + struct ieee80211_sta *sta) +{ + trace_drv_sta_notify(local, sdata, cmd, sta); + if (local->ops->sta_notify) + local->ops->sta_notify(&local->hw, &sdata->vif, cmd, sta); + trace_drv_return_void(local); +} + +static inline int drv_sta_add(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta) +{ + int ret = 0; + + might_sleep(); + + trace_drv_sta_add(local, sdata, sta); + if (local->ops->sta_add) + ret = local->ops->sta_add(&local->hw, &sdata->vif, sta); + + trace_drv_return_int(local, ret); + + return ret; +} + +static inline void drv_sta_remove(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta) +{ + might_sleep(); + + trace_drv_sta_remove(local, sdata, sta); + if 
(local->ops->sta_remove) + local->ops->sta_remove(&local->hw, &sdata->vif, sta); + + trace_drv_return_void(local); +} + +static inline int drv_conf_tx(struct ieee80211_local *local, u16 queue, + const struct ieee80211_tx_queue_params *params) +{ + int ret = -EOPNOTSUPP; + + might_sleep(); + + trace_drv_conf_tx(local, queue, params); + if (local->ops->conf_tx) + ret = local->ops->conf_tx(&local->hw, queue, params); + trace_drv_return_int(local, ret); + return ret; +} + +static inline u64 drv_get_tsf(struct ieee80211_local *local) +{ + u64 ret = -1ULL; + + might_sleep(); + + trace_drv_get_tsf(local); + if (local->ops->get_tsf) + ret = local->ops->get_tsf(&local->hw); + trace_drv_return_u64(local, ret); + return ret; +} + +static inline void drv_set_tsf(struct ieee80211_local *local, u64 tsf) +{ + might_sleep(); + + trace_drv_set_tsf(local, tsf); + if (local->ops->set_tsf) + local->ops->set_tsf(&local->hw, tsf); + trace_drv_return_void(local); +} + +static inline void drv_reset_tsf(struct ieee80211_local *local) +{ + might_sleep(); + + trace_drv_reset_tsf(local); + if (local->ops->reset_tsf) + local->ops->reset_tsf(&local->hw); + trace_drv_return_void(local); +} + +static inline int drv_tx_last_beacon(struct ieee80211_local *local) +{ + int ret = 0; /* default unsuported op for less congestion */ + + might_sleep(); + + trace_drv_tx_last_beacon(local); + if (local->ops->tx_last_beacon) + ret = local->ops->tx_last_beacon(&local->hw); + trace_drv_return_int(local, ret); + return ret; +} + +static inline int drv_ampdu_action(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum ieee80211_ampdu_mlme_action action, + struct ieee80211_sta *sta, u16 tid, + u16 *ssn, u8 buf_size) +{ + int ret = -EOPNOTSUPP; + + might_sleep(); + + trace_drv_ampdu_action(local, sdata, action, sta, tid, ssn, buf_size); + + if (local->ops->ampdu_action) + ret = local->ops->ampdu_action(&local->hw, &sdata->vif, action, + sta, tid, ssn, buf_size); + + trace_drv_return_int(local, ret); + + return ret; +} + +static inline int drv_get_survey(struct ieee80211_local *local, int idx, + struct survey_info *survey) +{ + int ret = -EOPNOTSUPP; + + trace_drv_get_survey(local, idx, survey); + + if (local->ops->get_survey) + ret = local->ops->get_survey(&local->hw, idx, survey); + + trace_drv_return_int(local, ret); + + return ret; +} + +static inline void drv_rfkill_poll(struct ieee80211_local *local) +{ + might_sleep(); + + if (local->ops->rfkill_poll) + local->ops->rfkill_poll(&local->hw); +} + +static inline void drv_flush(struct ieee80211_local *local, bool drop) +{ + might_sleep(); + + trace_drv_flush(local, drop); + if (local->ops->flush) + local->ops->flush(&local->hw, drop); + trace_drv_return_void(local); +} + +static inline void drv_channel_switch(struct ieee80211_local *local, + struct ieee80211_channel_switch *ch_switch) +{ + might_sleep(); + + trace_drv_channel_switch(local, ch_switch); + local->ops->channel_switch(&local->hw, ch_switch); + trace_drv_return_void(local); +} + + +static inline int drv_set_antenna(struct ieee80211_local *local, + u32 tx_ant, u32 rx_ant) +{ + int ret = -EOPNOTSUPP; + might_sleep(); + if (local->ops->set_antenna) + ret = local->ops->set_antenna(&local->hw, tx_ant, rx_ant); + trace_drv_set_antenna(local, tx_ant, rx_ant, ret); + return ret; +} + +static inline int drv_get_antenna(struct ieee80211_local *local, + u32 *tx_ant, u32 *rx_ant) +{ + int ret = -EOPNOTSUPP; + might_sleep(); + if (local->ops->get_antenna) + ret = local->ops->get_antenna(&local->hw, tx_ant, rx_ant); 
+ trace_drv_get_antenna(local, *tx_ant, *rx_ant, ret); + return ret; +} + +static inline int drv_remain_on_channel(struct ieee80211_local *local, + struct ieee80211_channel *chan, + enum nl80211_channel_type chantype, + unsigned int duration) +{ + int ret; + + might_sleep(); + + trace_drv_remain_on_channel(local, chan, chantype, duration); + ret = local->ops->remain_on_channel(&local->hw, chan, chantype, + duration); + trace_drv_return_int(local, ret); + + return ret; +} + +static inline int drv_cancel_remain_on_channel(struct ieee80211_local *local) +{ + int ret; + + might_sleep(); + + trace_drv_cancel_remain_on_channel(local); + ret = local->ops->cancel_remain_on_channel(&local->hw); + trace_drv_return_int(local, ret); + + return ret; +} + +static inline int drv_offchannel_tx(struct ieee80211_local *local, + struct sk_buff *skb, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + unsigned int wait) +{ + int ret; + + might_sleep(); + + trace_drv_offchannel_tx(local, skb, chan, channel_type, wait); + ret = local->ops->offchannel_tx(&local->hw, skb, chan, + channel_type, wait); + trace_drv_return_int(local, ret); + + return ret; +} + +static inline int drv_offchannel_tx_cancel_wait(struct ieee80211_local *local) +{ + int ret; + + might_sleep(); + + trace_drv_offchannel_tx_cancel_wait(local); + ret = local->ops->offchannel_tx_cancel_wait(&local->hw); + trace_drv_return_int(local, ret); + + return ret; +} + +static inline int drv_set_ringparam(struct ieee80211_local *local, + u32 tx, u32 rx) +{ + int ret = -ENOTSUPP; + + might_sleep(); + + trace_drv_set_ringparam(local, tx, rx); + if (local->ops->set_ringparam) + ret = local->ops->set_ringparam(&local->hw, tx, rx); + trace_drv_return_int(local, ret); + + return ret; +} + +static inline void drv_get_ringparam(struct ieee80211_local *local, + u32 *tx, u32 *tx_max, u32 *rx, u32 *rx_max) +{ + might_sleep(); + + trace_drv_get_ringparam(local, tx, tx_max, rx, rx_max); + if (local->ops->get_ringparam) + local->ops->get_ringparam(&local->hw, tx, tx_max, rx, rx_max); + trace_drv_return_void(local); +} + +static inline bool drv_tx_frames_pending(struct ieee80211_local *local) +{ + bool ret = false; + + might_sleep(); + + trace_drv_tx_frames_pending(local); + if (local->ops->tx_frames_pending) + ret = local->ops->tx_frames_pending(&local->hw); + trace_drv_return_bool(local, ret); + + return ret; +} + +static inline int drv_set_bitrate_mask(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + const struct cfg80211_bitrate_mask *mask) +{ + int ret = -EOPNOTSUPP; + + might_sleep(); + + trace_drv_set_bitrate_mask(local, sdata, mask); + if (local->ops->set_bitrate_mask) + ret = local->ops->set_bitrate_mask(&local->hw, + &sdata->vif, mask); + trace_drv_return_int(local, ret); + + return ret; +} + +#endif /* __MAC80211_DRIVER_OPS */ diff --git a/net/mac80211/driver-trace.c b/net/mac80211/driver-trace.c new file mode 100644 index 00000000..8ed8711b --- /dev/null +++ b/net/mac80211/driver-trace.c @@ -0,0 +1,9 @@ +/* bug in tracepoint.h, it should include this */ +#include + +/* sparse isn't too happy with all macros... 
*/ +#ifndef __CHECKER__ +#include "driver-ops.h" +#define CREATE_TRACE_POINTS +#include "driver-trace.h" +#endif diff --git a/net/mac80211/driver-trace.h b/net/mac80211/driver-trace.h new file mode 100644 index 00000000..ed9edcbd --- /dev/null +++ b/net/mac80211/driver-trace.h @@ -0,0 +1,1348 @@ +#if !defined(__MAC80211_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ) +#define __MAC80211_DRIVER_TRACE + +#include +#include +#include "ieee80211_i.h" + +#if !defined(CONFIG_MAC80211_DRIVER_API_TRACER) || defined(__CHECKER__) +#undef TRACE_EVENT +#define TRACE_EVENT(name, proto, ...) \ +static inline void trace_ ## name(proto) {} +#undef DECLARE_EVENT_CLASS +#define DECLARE_EVENT_CLASS(...) +#undef DEFINE_EVENT +#define DEFINE_EVENT(evt_class, name, proto, ...) \ +static inline void trace_ ## name(proto) {} +#endif + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mac80211 + +#define MAXNAME 32 +#define LOCAL_ENTRY __array(char, wiphy_name, 32) +#define LOCAL_ASSIGN strlcpy(__entry->wiphy_name, wiphy_name(local->hw.wiphy), MAXNAME) +#define LOCAL_PR_FMT "%s" +#define LOCAL_PR_ARG __entry->wiphy_name + +#define STA_ENTRY __array(char, sta_addr, ETH_ALEN) +#define STA_ASSIGN (sta ? memcpy(__entry->sta_addr, sta->addr, ETH_ALEN) : memset(__entry->sta_addr, 0, ETH_ALEN)) +#define STA_PR_FMT " sta:%pM" +#define STA_PR_ARG __entry->sta_addr + +#define VIF_ENTRY __field(enum nl80211_iftype, vif_type) __field(void *, sdata) \ + __field(bool, p2p) \ + __string(vif_name, sdata->dev ? sdata->dev->name : "") +#define VIF_ASSIGN __entry->vif_type = sdata->vif.type; __entry->sdata = sdata; \ + __entry->p2p = sdata->vif.p2p; \ + __assign_str(vif_name, sdata->dev ? sdata->dev->name : "") +#define VIF_PR_FMT " vif:%s(%d%s)" +#define VIF_PR_ARG __get_str(vif_name), __entry->vif_type, __entry->p2p ? "/p2p" : "" + +/* + * Tracing for driver callbacks. 
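+ *
+ * When CONFIG_MAC80211_DRIVER_API_TRACER is not enabled (or when
+ * running sparse), the TRACE_EVENT/DECLARE_EVENT_CLASS/DEFINE_EVENT
+ * macros are redefined above so that the trace_drv_*() calls become
+ * empty inline stubs.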
+ */ + +DECLARE_EVENT_CLASS(local_only_evt, + TP_PROTO(struct ieee80211_local *local), + TP_ARGS(local), + TP_STRUCT__entry( + LOCAL_ENTRY + ), + TP_fast_assign( + LOCAL_ASSIGN; + ), + TP_printk(LOCAL_PR_FMT, LOCAL_PR_ARG) +); + +DECLARE_EVENT_CLASS(local_sdata_addr_evt, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata), + TP_ARGS(local, sdata), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + __array(char, addr, 6) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + memcpy(__entry->addr, sdata->vif.addr, 6); + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT " addr:%pM", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->addr + ) +); + +DECLARE_EVENT_CLASS(local_u32_evt, + TP_PROTO(struct ieee80211_local *local, u32 value), + TP_ARGS(local, value), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(u32, value) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->value = value; + ), + + TP_printk( + LOCAL_PR_FMT " value:%d", + LOCAL_PR_ARG, __entry->value + ) +); + +DECLARE_EVENT_CLASS(local_sdata_evt, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata), + TP_ARGS(local, sdata), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT, + LOCAL_PR_ARG, VIF_PR_ARG + ) +); + +DEFINE_EVENT(local_only_evt, drv_return_void, + TP_PROTO(struct ieee80211_local *local), + TP_ARGS(local) +); + +TRACE_EVENT(drv_return_int, + TP_PROTO(struct ieee80211_local *local, int ret), + TP_ARGS(local, ret), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(int, ret) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->ret = ret; + ), + TP_printk(LOCAL_PR_FMT " - %d", LOCAL_PR_ARG, __entry->ret) +); + +TRACE_EVENT(drv_return_bool, + TP_PROTO(struct ieee80211_local *local, bool ret), + TP_ARGS(local, ret), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(bool, ret) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->ret = ret; + ), + TP_printk(LOCAL_PR_FMT " - %s", LOCAL_PR_ARG, (__entry->ret) ? + "true" : "false") +); + +TRACE_EVENT(drv_return_u64, + TP_PROTO(struct ieee80211_local *local, u64 ret), + TP_ARGS(local, ret), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(u64, ret) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->ret = ret; + ), + TP_printk(LOCAL_PR_FMT " - %llu", LOCAL_PR_ARG, __entry->ret) +); + +DEFINE_EVENT(local_only_evt, drv_start, + TP_PROTO(struct ieee80211_local *local), + TP_ARGS(local) +); + +DEFINE_EVENT(local_only_evt, drv_suspend, + TP_PROTO(struct ieee80211_local *local), + TP_ARGS(local) +); + +DEFINE_EVENT(local_only_evt, drv_resume, + TP_PROTO(struct ieee80211_local *local), + TP_ARGS(local) +); + +DEFINE_EVENT(local_only_evt, drv_stop, + TP_PROTO(struct ieee80211_local *local), + TP_ARGS(local) +); + +DEFINE_EVENT(local_sdata_addr_evt, drv_add_interface, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata), + TP_ARGS(local, sdata) +); + +TRACE_EVENT(drv_change_interface, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum nl80211_iftype type, bool p2p), + + TP_ARGS(local, sdata, type, p2p), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + __field(u32, new_type) + __field(bool, new_p2p) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + __entry->new_type = type; + __entry->new_p2p = p2p; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT " new type:%d%s", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->new_type, + __entry->new_p2p ? 
"/p2p" : "" + ) +); + +DEFINE_EVENT(local_sdata_addr_evt, drv_remove_interface, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata), + TP_ARGS(local, sdata) +); + +TRACE_EVENT(drv_config, + TP_PROTO(struct ieee80211_local *local, + u32 changed), + + TP_ARGS(local, changed), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(u32, changed) + __field(u32, flags) + __field(int, power_level) + __field(int, dynamic_ps_timeout) + __field(int, max_sleep_period) + __field(u16, listen_interval) + __field(u8, long_frame_max_tx_count) + __field(u8, short_frame_max_tx_count) + __field(int, center_freq) + __field(int, channel_type) + __field(int, smps) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->changed = changed; + __entry->flags = local->hw.conf.flags; + __entry->power_level = local->hw.conf.power_level; + __entry->dynamic_ps_timeout = local->hw.conf.dynamic_ps_timeout; + __entry->max_sleep_period = local->hw.conf.max_sleep_period; + __entry->listen_interval = local->hw.conf.listen_interval; + __entry->long_frame_max_tx_count = local->hw.conf.long_frame_max_tx_count; + __entry->short_frame_max_tx_count = local->hw.conf.short_frame_max_tx_count; + __entry->center_freq = local->hw.conf.channel->center_freq; + __entry->channel_type = local->hw.conf.channel_type; + __entry->smps = local->hw.conf.smps_mode; + ), + + TP_printk( + LOCAL_PR_FMT " ch:%#x freq:%d", + LOCAL_PR_ARG, __entry->changed, __entry->center_freq + ) +); + +TRACE_EVENT(drv_bss_info_changed, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_bss_conf *info, + u32 changed), + + TP_ARGS(local, sdata, info, changed), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + __field(bool, assoc) + __field(u16, aid) + __field(bool, cts) + __field(bool, shortpre) + __field(bool, shortslot) + __field(u8, dtimper) + __field(u16, bcnint) + __field(u16, assoc_cap) + __field(u64, timestamp) + __field(u32, basic_rates) + __field(u32, changed) + __field(bool, enable_beacon) + __field(u16, ht_operation_mode) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + __entry->changed = changed; + __entry->aid = info->aid; + __entry->assoc = info->assoc; + __entry->shortpre = info->use_short_preamble; + __entry->cts = info->use_cts_prot; + __entry->shortslot = info->use_short_slot; + __entry->dtimper = info->dtim_period; + __entry->bcnint = info->beacon_int; + __entry->assoc_cap = info->assoc_capability; + __entry->timestamp = info->timestamp; + __entry->basic_rates = info->basic_rates; + __entry->enable_beacon = info->enable_beacon; + __entry->ht_operation_mode = info->ht_operation_mode; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT " changed:%#x", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->changed + ) +); + +TRACE_EVENT(drv_prepare_multicast, + TP_PROTO(struct ieee80211_local *local, int mc_count), + + TP_ARGS(local, mc_count), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(int, mc_count) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->mc_count = mc_count; + ), + + TP_printk( + LOCAL_PR_FMT " prepare mc (%d)", + LOCAL_PR_ARG, __entry->mc_count + ) +); + +TRACE_EVENT(drv_configure_filter, + TP_PROTO(struct ieee80211_local *local, + unsigned int changed_flags, + unsigned int *total_flags, + u64 multicast), + + TP_ARGS(local, changed_flags, total_flags, multicast), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(unsigned int, changed) + __field(unsigned int, total) + __field(u64, multicast) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->changed = changed_flags; + __entry->total = 
*total_flags; + __entry->multicast = multicast; + ), + + TP_printk( + LOCAL_PR_FMT " changed:%#x total:%#x", + LOCAL_PR_ARG, __entry->changed, __entry->total + ) +); + +TRACE_EVENT(drv_set_tim, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sta *sta, bool set), + + TP_ARGS(local, sta, set), + + TP_STRUCT__entry( + LOCAL_ENTRY + STA_ENTRY + __field(bool, set) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + STA_ASSIGN; + __entry->set = set; + ), + + TP_printk( + LOCAL_PR_FMT STA_PR_FMT " set:%d", + LOCAL_PR_ARG, STA_PR_FMT, __entry->set + ) +); + +TRACE_EVENT(drv_set_key, + TP_PROTO(struct ieee80211_local *local, + enum set_key_cmd cmd, struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta, + struct ieee80211_key_conf *key), + + TP_ARGS(local, cmd, sdata, sta, key), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + STA_ENTRY + __field(u32, cipher) + __field(u8, hw_key_idx) + __field(u8, flags) + __field(s8, keyidx) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + STA_ASSIGN; + __entry->cipher = key->cipher; + __entry->flags = key->flags; + __entry->keyidx = key->keyidx; + __entry->hw_key_idx = key->hw_key_idx; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT, + LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG + ) +); + +TRACE_EVENT(drv_update_tkip_key, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_key_conf *conf, + struct ieee80211_sta *sta, u32 iv32), + + TP_ARGS(local, sdata, conf, sta, iv32), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + STA_ENTRY + __field(u32, iv32) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + STA_ASSIGN; + __entry->iv32 = iv32; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " iv32:%#x", + LOCAL_PR_ARG,VIF_PR_ARG,STA_PR_ARG, __entry->iv32 + ) +); + +DEFINE_EVENT(local_sdata_evt, drv_hw_scan, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata), + TP_ARGS(local, sdata) +); + +DEFINE_EVENT(local_sdata_evt, drv_sched_scan_start, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata), + TP_ARGS(local, sdata) +); + +DEFINE_EVENT(local_sdata_evt, drv_sched_scan_stop, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata), + TP_ARGS(local, sdata) +); + +DEFINE_EVENT(local_only_evt, drv_sw_scan_start, + TP_PROTO(struct ieee80211_local *local), + TP_ARGS(local) +); + +DEFINE_EVENT(local_only_evt, drv_sw_scan_complete, + TP_PROTO(struct ieee80211_local *local), + TP_ARGS(local) +); + +TRACE_EVENT(drv_get_stats, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_low_level_stats *stats, + int ret), + + TP_ARGS(local, stats, ret), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(int, ret) + __field(unsigned int, ackfail) + __field(unsigned int, rtsfail) + __field(unsigned int, fcserr) + __field(unsigned int, rtssucc) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->ret = ret; + __entry->ackfail = stats->dot11ACKFailureCount; + __entry->rtsfail = stats->dot11RTSFailureCount; + __entry->fcserr = stats->dot11FCSErrorCount; + __entry->rtssucc = stats->dot11RTSSuccessCount; + ), + + TP_printk( + LOCAL_PR_FMT " ret:%d", + LOCAL_PR_ARG, __entry->ret + ) +); + +TRACE_EVENT(drv_get_tkip_seq, + TP_PROTO(struct ieee80211_local *local, + u8 hw_key_idx, u32 *iv32, u16 *iv16), + + TP_ARGS(local, hw_key_idx, iv32, iv16), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(u8, hw_key_idx) + __field(u32, iv32) + __field(u16, iv16) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->hw_key_idx = hw_key_idx; + 
__entry->iv32 = *iv32; + __entry->iv16 = *iv16; + ), + + TP_printk( + LOCAL_PR_FMT, LOCAL_PR_ARG + ) +); + +DEFINE_EVENT(local_u32_evt, drv_set_frag_threshold, + TP_PROTO(struct ieee80211_local *local, u32 value), + TP_ARGS(local, value) +); + +DEFINE_EVENT(local_u32_evt, drv_set_rts_threshold, + TP_PROTO(struct ieee80211_local *local, u32 value), + TP_ARGS(local, value) +); + +TRACE_EVENT(drv_set_coverage_class, + TP_PROTO(struct ieee80211_local *local, u8 value), + + TP_ARGS(local, value), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(u8, value) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->value = value; + ), + + TP_printk( + LOCAL_PR_FMT " value:%d", + LOCAL_PR_ARG, __entry->value + ) +); + +TRACE_EVENT(drv_sta_notify, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum sta_notify_cmd cmd, + struct ieee80211_sta *sta), + + TP_ARGS(local, sdata, cmd, sta), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + STA_ENTRY + __field(u32, cmd) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + STA_ASSIGN; + __entry->cmd = cmd; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " cmd:%d", + LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->cmd + ) +); + +TRACE_EVENT(drv_sta_add, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta), + + TP_ARGS(local, sdata, sta), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + STA_ENTRY + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + STA_ASSIGN; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT, + LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG + ) +); + +TRACE_EVENT(drv_sta_remove, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta), + + TP_ARGS(local, sdata, sta), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + STA_ENTRY + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + STA_ASSIGN; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT, + LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG + ) +); + +TRACE_EVENT(drv_conf_tx, + TP_PROTO(struct ieee80211_local *local, u16 queue, + const struct ieee80211_tx_queue_params *params), + + TP_ARGS(local, queue, params), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(u16, queue) + __field(u16, txop) + __field(u16, cw_min) + __field(u16, cw_max) + __field(u8, aifs) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->queue = queue; + __entry->txop = params->txop; + __entry->cw_max = params->cw_max; + __entry->cw_min = params->cw_min; + __entry->aifs = params->aifs; + ), + + TP_printk( + LOCAL_PR_FMT " queue:%d", + LOCAL_PR_ARG, __entry->queue + ) +); + +DEFINE_EVENT(local_only_evt, drv_get_tsf, + TP_PROTO(struct ieee80211_local *local), + TP_ARGS(local) +); + +TRACE_EVENT(drv_set_tsf, + TP_PROTO(struct ieee80211_local *local, u64 tsf), + + TP_ARGS(local, tsf), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(u64, tsf) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->tsf = tsf; + ), + + TP_printk( + LOCAL_PR_FMT " tsf:%llu", + LOCAL_PR_ARG, (unsigned long long)__entry->tsf + ) +); + +DEFINE_EVENT(local_only_evt, drv_reset_tsf, + TP_PROTO(struct ieee80211_local *local), + TP_ARGS(local) +); + +DEFINE_EVENT(local_only_evt, drv_tx_last_beacon, + TP_PROTO(struct ieee80211_local *local), + TP_ARGS(local) +); + +TRACE_EVENT(drv_ampdu_action, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum ieee80211_ampdu_mlme_action action, + struct ieee80211_sta *sta, u16 tid, + u16 *ssn, u8 buf_size), + + TP_ARGS(local, sdata, action, sta, 
tid, ssn, buf_size), + + TP_STRUCT__entry( + LOCAL_ENTRY + STA_ENTRY + __field(u32, action) + __field(u16, tid) + __field(u16, ssn) + __field(u8, buf_size) + VIF_ENTRY + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + STA_ASSIGN; + __entry->action = action; + __entry->tid = tid; + __entry->ssn = ssn ? *ssn : 0; + __entry->buf_size = buf_size; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " action:%d tid:%d buf:%d", + LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->action, + __entry->tid, __entry->buf_size + ) +); + +TRACE_EVENT(drv_get_survey, + TP_PROTO(struct ieee80211_local *local, int idx, + struct survey_info *survey), + + TP_ARGS(local, idx, survey), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(int, idx) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->idx = idx; + ), + + TP_printk( + LOCAL_PR_FMT " idx:%d", + LOCAL_PR_ARG, __entry->idx + ) +); + +TRACE_EVENT(drv_flush, + TP_PROTO(struct ieee80211_local *local, bool drop), + + TP_ARGS(local, drop), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(bool, drop) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->drop = drop; + ), + + TP_printk( + LOCAL_PR_FMT " drop:%d", + LOCAL_PR_ARG, __entry->drop + ) +); + +TRACE_EVENT(drv_channel_switch, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_channel_switch *ch_switch), + + TP_ARGS(local, ch_switch), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(u64, timestamp) + __field(bool, block_tx) + __field(u16, freq) + __field(u8, count) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->timestamp = ch_switch->timestamp; + __entry->block_tx = ch_switch->block_tx; + __entry->freq = ch_switch->channel->center_freq; + __entry->count = ch_switch->count; + ), + + TP_printk( + LOCAL_PR_FMT " new freq:%u count:%d", + LOCAL_PR_ARG, __entry->freq, __entry->count + ) +); + +TRACE_EVENT(drv_set_antenna, + TP_PROTO(struct ieee80211_local *local, u32 tx_ant, u32 rx_ant, int ret), + + TP_ARGS(local, tx_ant, rx_ant, ret), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(u32, tx_ant) + __field(u32, rx_ant) + __field(int, ret) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->tx_ant = tx_ant; + __entry->rx_ant = rx_ant; + __entry->ret = ret; + ), + + TP_printk( + LOCAL_PR_FMT " tx_ant:%d rx_ant:%d ret:%d", + LOCAL_PR_ARG, __entry->tx_ant, __entry->rx_ant, __entry->ret + ) +); + +TRACE_EVENT(drv_get_antenna, + TP_PROTO(struct ieee80211_local *local, u32 tx_ant, u32 rx_ant, int ret), + + TP_ARGS(local, tx_ant, rx_ant, ret), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(u32, tx_ant) + __field(u32, rx_ant) + __field(int, ret) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->tx_ant = tx_ant; + __entry->rx_ant = rx_ant; + __entry->ret = ret; + ), + + TP_printk( + LOCAL_PR_FMT " tx_ant:%d rx_ant:%d ret:%d", + LOCAL_PR_ARG, __entry->tx_ant, __entry->rx_ant, __entry->ret + ) +); + +TRACE_EVENT(drv_remain_on_channel, + TP_PROTO(struct ieee80211_local *local, struct ieee80211_channel *chan, + enum nl80211_channel_type chantype, unsigned int duration), + + TP_ARGS(local, chan, chantype, duration), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(int, center_freq) + __field(int, channel_type) + __field(unsigned int, duration) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->center_freq = chan->center_freq; + __entry->channel_type = chantype; + __entry->duration = duration; + ), + + TP_printk( + LOCAL_PR_FMT " freq:%dMHz duration:%dms", + LOCAL_PR_ARG, __entry->center_freq, __entry->duration + ) +); + +DEFINE_EVENT(local_only_evt, drv_cancel_remain_on_channel, + TP_PROTO(struct ieee80211_local 
*local), + TP_ARGS(local) +); + +TRACE_EVENT(drv_offchannel_tx, + TP_PROTO(struct ieee80211_local *local, struct sk_buff *skb, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + unsigned int wait), + + TP_ARGS(local, skb, chan, channel_type, wait), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(int, center_freq) + __field(int, channel_type) + __field(unsigned int, wait) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->center_freq = chan->center_freq; + __entry->channel_type = channel_type; + __entry->wait = wait; + ), + + TP_printk( + LOCAL_PR_FMT " freq:%dMHz, wait:%dms", + LOCAL_PR_ARG, __entry->center_freq, __entry->wait + ) +); + +TRACE_EVENT(drv_set_ringparam, + TP_PROTO(struct ieee80211_local *local, u32 tx, u32 rx), + + TP_ARGS(local, tx, rx), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(u32, tx) + __field(u32, rx) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->tx = tx; + __entry->rx = rx; + ), + + TP_printk( + LOCAL_PR_FMT " tx:%d rx %d", + LOCAL_PR_ARG, __entry->tx, __entry->rx + ) +); + +TRACE_EVENT(drv_get_ringparam, + TP_PROTO(struct ieee80211_local *local, u32 *tx, u32 *tx_max, + u32 *rx, u32 *rx_max), + + TP_ARGS(local, tx, tx_max, rx, rx_max), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(u32, tx) + __field(u32, tx_max) + __field(u32, rx) + __field(u32, rx_max) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->tx = *tx; + __entry->tx_max = *tx_max; + __entry->rx = *rx; + __entry->rx_max = *rx_max; + ), + + TP_printk( + LOCAL_PR_FMT " tx:%d tx_max %d rx %d rx_max %d", + LOCAL_PR_ARG, + __entry->tx, __entry->tx_max, __entry->rx, __entry->rx_max + ) +); + +DEFINE_EVENT(local_only_evt, drv_tx_frames_pending, + TP_PROTO(struct ieee80211_local *local), + TP_ARGS(local) +); + +DEFINE_EVENT(local_only_evt, drv_offchannel_tx_cancel_wait, + TP_PROTO(struct ieee80211_local *local), + TP_ARGS(local) +); + +TRACE_EVENT(drv_set_bitrate_mask, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + const struct cfg80211_bitrate_mask *mask), + + TP_ARGS(local, sdata, mask), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + __field(u32, legacy_2g) + __field(u32, legacy_5g) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + __entry->legacy_2g = mask->control[IEEE80211_BAND_2GHZ].legacy; + __entry->legacy_5g = mask->control[IEEE80211_BAND_5GHZ].legacy; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT " 2G Mask:0x%x 5G Mask:0x%x", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->legacy_2g, __entry->legacy_5g + ) +); + +/* + * Tracing for API calls that drivers call. 
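+ *
+ * (The drv_* events above trace calls made by mac80211 into the driver
+ * through the driver-ops wrappers; the api_* events below trace the
+ * notifications drivers hand back to mac80211, such as BA session
+ * callbacks, scan completion and beacon/connection loss.)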
+ */ + +TRACE_EVENT(api_start_tx_ba_session, + TP_PROTO(struct ieee80211_sta *sta, u16 tid), + + TP_ARGS(sta, tid), + + TP_STRUCT__entry( + STA_ENTRY + __field(u16, tid) + ), + + TP_fast_assign( + STA_ASSIGN; + __entry->tid = tid; + ), + + TP_printk( + STA_PR_FMT " tid:%d", + STA_PR_ARG, __entry->tid + ) +); + +TRACE_EVENT(api_start_tx_ba_cb, + TP_PROTO(struct ieee80211_sub_if_data *sdata, const u8 *ra, u16 tid), + + TP_ARGS(sdata, ra, tid), + + TP_STRUCT__entry( + VIF_ENTRY + __array(u8, ra, ETH_ALEN) + __field(u16, tid) + ), + + TP_fast_assign( + VIF_ASSIGN; + memcpy(__entry->ra, ra, ETH_ALEN); + __entry->tid = tid; + ), + + TP_printk( + VIF_PR_FMT " ra:%pM tid:%d", + VIF_PR_ARG, __entry->ra, __entry->tid + ) +); + +TRACE_EVENT(api_stop_tx_ba_session, + TP_PROTO(struct ieee80211_sta *sta, u16 tid), + + TP_ARGS(sta, tid), + + TP_STRUCT__entry( + STA_ENTRY + __field(u16, tid) + ), + + TP_fast_assign( + STA_ASSIGN; + __entry->tid = tid; + ), + + TP_printk( + STA_PR_FMT " tid:%d", + STA_PR_ARG, __entry->tid + ) +); + +TRACE_EVENT(api_stop_tx_ba_cb, + TP_PROTO(struct ieee80211_sub_if_data *sdata, const u8 *ra, u16 tid), + + TP_ARGS(sdata, ra, tid), + + TP_STRUCT__entry( + VIF_ENTRY + __array(u8, ra, ETH_ALEN) + __field(u16, tid) + ), + + TP_fast_assign( + VIF_ASSIGN; + memcpy(__entry->ra, ra, ETH_ALEN); + __entry->tid = tid; + ), + + TP_printk( + VIF_PR_FMT " ra:%pM tid:%d", + VIF_PR_ARG, __entry->ra, __entry->tid + ) +); + +DEFINE_EVENT(local_only_evt, api_restart_hw, + TP_PROTO(struct ieee80211_local *local), + TP_ARGS(local) +); + +TRACE_EVENT(api_beacon_loss, + TP_PROTO(struct ieee80211_sub_if_data *sdata), + + TP_ARGS(sdata), + + TP_STRUCT__entry( + VIF_ENTRY + ), + + TP_fast_assign( + VIF_ASSIGN; + ), + + TP_printk( + VIF_PR_FMT, + VIF_PR_ARG + ) +); + +TRACE_EVENT(api_connection_loss, + TP_PROTO(struct ieee80211_sub_if_data *sdata), + + TP_ARGS(sdata), + + TP_STRUCT__entry( + VIF_ENTRY + ), + + TP_fast_assign( + VIF_ASSIGN; + ), + + TP_printk( + VIF_PR_FMT, + VIF_PR_ARG + ) +); + +TRACE_EVENT(api_cqm_rssi_notify, + TP_PROTO(struct ieee80211_sub_if_data *sdata, + enum nl80211_cqm_rssi_threshold_event rssi_event), + + TP_ARGS(sdata, rssi_event), + + TP_STRUCT__entry( + VIF_ENTRY + __field(u32, rssi_event) + ), + + TP_fast_assign( + VIF_ASSIGN; + __entry->rssi_event = rssi_event; + ), + + TP_printk( + VIF_PR_FMT " event:%d", + VIF_PR_ARG, __entry->rssi_event + ) +); + +TRACE_EVENT(api_scan_completed, + TP_PROTO(struct ieee80211_local *local, bool aborted), + + TP_ARGS(local, aborted), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(bool, aborted) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->aborted = aborted; + ), + + TP_printk( + LOCAL_PR_FMT " aborted:%d", + LOCAL_PR_ARG, __entry->aborted + ) +); + +TRACE_EVENT(api_sched_scan_results, + TP_PROTO(struct ieee80211_local *local), + + TP_ARGS(local), + + TP_STRUCT__entry( + LOCAL_ENTRY + ), + + TP_fast_assign( + LOCAL_ASSIGN; + ), + + TP_printk( + LOCAL_PR_FMT, LOCAL_PR_ARG + ) +); + +TRACE_EVENT(api_sched_scan_stopped, + TP_PROTO(struct ieee80211_local *local), + + TP_ARGS(local), + + TP_STRUCT__entry( + LOCAL_ENTRY + ), + + TP_fast_assign( + LOCAL_ASSIGN; + ), + + TP_printk( + LOCAL_PR_FMT, LOCAL_PR_ARG + ) +); + +TRACE_EVENT(api_sta_block_awake, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sta *sta, bool block), + + TP_ARGS(local, sta, block), + + TP_STRUCT__entry( + LOCAL_ENTRY + STA_ENTRY + __field(bool, block) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + STA_ASSIGN; + __entry->block = block; + ), + + TP_printk( + 
LOCAL_PR_FMT STA_PR_FMT " block:%d", + LOCAL_PR_ARG, STA_PR_FMT, __entry->block + ) +); + +TRACE_EVENT(api_chswitch_done, + TP_PROTO(struct ieee80211_sub_if_data *sdata, bool success), + + TP_ARGS(sdata, success), + + TP_STRUCT__entry( + VIF_ENTRY + __field(bool, success) + ), + + TP_fast_assign( + VIF_ASSIGN; + __entry->success = success; + ), + + TP_printk( + VIF_PR_FMT " success=%d", + VIF_PR_ARG, __entry->success + ) +); + +DEFINE_EVENT(local_only_evt, api_ready_on_channel, + TP_PROTO(struct ieee80211_local *local), + TP_ARGS(local) +); + +DEFINE_EVENT(local_only_evt, api_remain_on_channel_expired, + TP_PROTO(struct ieee80211_local *local), + TP_ARGS(local) +); + +/* + * Tracing for internal functions + * (which may also be called in response to driver calls) + */ + +TRACE_EVENT(wake_queue, + TP_PROTO(struct ieee80211_local *local, u16 queue, + enum queue_stop_reason reason), + + TP_ARGS(local, queue, reason), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(u16, queue) + __field(u32, reason) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->queue = queue; + __entry->reason = reason; + ), + + TP_printk( + LOCAL_PR_FMT " queue:%d, reason:%d", + LOCAL_PR_ARG, __entry->queue, __entry->reason + ) +); + +TRACE_EVENT(stop_queue, + TP_PROTO(struct ieee80211_local *local, u16 queue, + enum queue_stop_reason reason), + + TP_ARGS(local, queue, reason), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(u16, queue) + __field(u32, reason) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->queue = queue; + __entry->reason = reason; + ), + + TP_printk( + LOCAL_PR_FMT " queue:%d, reason:%d", + LOCAL_PR_ARG, __entry->queue, __entry->reason + ) +); +#endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE driver-trace +#include diff --git a/net/mac80211/event.c b/net/mac80211/event.c new file mode 100644 index 00000000..01ae7595 --- /dev/null +++ b/net/mac80211/event.c @@ -0,0 +1,27 @@ +/* + * Copyright 2007 Johannes Berg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * mac80211 - events + */ +#include +#include "ieee80211_i.h" + +/* + * Indicate a failed Michael MIC to userspace. If the caller knows the TSC of + * the frame that generated the MIC failure (i.e., if it was provided by the + * driver or is still in the frame), it should provide that information. + */ +void mac80211_ev_michael_mic_failure(struct ieee80211_sub_if_data *sdata, int keyidx, + struct ieee80211_hdr *hdr, const u8 *tsc, + gfp_t gfp) +{ + cfg80211_michael_mic_failure(sdata->dev, hdr->addr2, + (hdr->addr1[0] & 0x01) ? + NL80211_KEYTYPE_GROUP : + NL80211_KEYTYPE_PAIRWISE, + keyidx, tsc, gfp); +} diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c new file mode 100644 index 00000000..591add22 --- /dev/null +++ b/net/mac80211/ht.c @@ -0,0 +1,318 @@ +/* + * HT handling + * + * Copyright 2003, Jouni Malinen + * Copyright 2002-2005, Instant802 Networks, Inc. + * Copyright 2005-2006, Devicescape Software, Inc. + * Copyright 2006-2007 Jiri Benc + * Copyright 2007, Michael Wu + * Copyright 2007-2010, Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include "ieee80211_i.h" +#include "rate.h" + +void ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_supported_band *sband, + struct ieee80211_ht_cap *ht_cap_ie, + struct ieee80211_sta_ht_cap *ht_cap) +{ + u8 ampdu_info, tx_mcs_set_cap; + int i, max_tx_streams; + + BUG_ON(!ht_cap); + + memset(ht_cap, 0, sizeof(*ht_cap)); + + if (!ht_cap_ie || !sband->ht_cap.ht_supported) + return; + + ht_cap->ht_supported = true; + + /* + * The bits listed in this expression should be + * the same for the peer and us, if the station + * advertises more then we can't use those thus + * we mask them out. + */ + ht_cap->cap = le16_to_cpu(ht_cap_ie->cap_info) & + (sband->ht_cap.cap | + ~(IEEE80211_HT_CAP_LDPC_CODING | + IEEE80211_HT_CAP_SUP_WIDTH_20_40 | + IEEE80211_HT_CAP_GRN_FLD | + IEEE80211_HT_CAP_SGI_20 | + IEEE80211_HT_CAP_SGI_40 | + IEEE80211_HT_CAP_DSSSCCK40)); + /* + * The STBC bits are asymmetric -- if we don't have + * TX then mask out the peer's RX and vice versa. + */ + if (!(sband->ht_cap.cap & IEEE80211_HT_CAP_TX_STBC)) + ht_cap->cap &= ~IEEE80211_HT_CAP_RX_STBC; + if (!(sband->ht_cap.cap & IEEE80211_HT_CAP_RX_STBC)) + ht_cap->cap &= ~IEEE80211_HT_CAP_TX_STBC; + + ampdu_info = ht_cap_ie->ampdu_params_info; + ht_cap->ampdu_factor = + ampdu_info & IEEE80211_HT_AMPDU_PARM_FACTOR; + ht_cap->ampdu_density = + (ampdu_info & IEEE80211_HT_AMPDU_PARM_DENSITY) >> 2; + + /* own MCS TX capabilities */ + tx_mcs_set_cap = sband->ht_cap.mcs.tx_params; + + /* Copy peer MCS TX capabilities, the driver might need them. */ + ht_cap->mcs.tx_params = ht_cap_ie->mcs.tx_params; + + /* can we TX with MCS rates? */ + if (!(tx_mcs_set_cap & IEEE80211_HT_MCS_TX_DEFINED)) + return; + + /* Counting from 0, therefore +1 */ + if (tx_mcs_set_cap & IEEE80211_HT_MCS_TX_RX_DIFF) + max_tx_streams = + ((tx_mcs_set_cap & IEEE80211_HT_MCS_TX_MAX_STREAMS_MASK) + >> IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT) + 1; + else + max_tx_streams = IEEE80211_HT_MCS_TX_MAX_STREAMS; + + /* + * 802.11n-2009 20.3.5 / 20.6 says: + * - indices 0 to 7 and 32 are single spatial stream + * - 8 to 31 are multiple spatial streams using equal modulation + * [8..15 for two streams, 16..23 for three and 24..31 for four] + * - remainder are multiple spatial streams using unequal modulation + */ + for (i = 0; i < max_tx_streams; i++) + ht_cap->mcs.rx_mask[i] = + sband->ht_cap.mcs.rx_mask[i] & ht_cap_ie->mcs.rx_mask[i]; + + if (tx_mcs_set_cap & IEEE80211_HT_MCS_TX_UNEQUAL_MODULATION) + for (i = IEEE80211_HT_MCS_UNEQUAL_MODULATION_START_BYTE; + i < IEEE80211_HT_MCS_MASK_LEN; i++) + ht_cap->mcs.rx_mask[i] = + sband->ht_cap.mcs.rx_mask[i] & + ht_cap_ie->mcs.rx_mask[i]; + + /* handle MCS rate 32 too */ + if (sband->ht_cap.mcs.rx_mask[32/8] & ht_cap_ie->mcs.rx_mask[32/8] & 1) + ht_cap->mcs.rx_mask[32/8] |= 1; +} + +void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, bool tx) +{ + int i; + + cancel_work_sync(&sta->ampdu_mlme.work); + + for (i = 0; i < STA_TID_NUM; i++) { + __ieee80211_stop_tx_ba_session(sta, i, WLAN_BACK_INITIATOR, tx); + __ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT, + WLAN_REASON_QSTA_LEAVE_QBSS, tx); + } +} + +void ieee80211_ba_session_work(struct work_struct *work) +{ + struct sta_info *sta = + container_of(work, struct sta_info, ampdu_mlme.work); + struct tid_ampdu_tx *tid_tx; + int tid; + + /* + * When this flag is set, new sessions should be + * blocked, and existing sessions will be torn + * down by the code that set the flag, so this + * need not run. 
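+ *
+ * (WLAN_STA_BLOCK_BA is presumably set on the suspend/hardware-restart
+ * and station-teardown paths, where ieee80211_sta_tear_down_BA_sessions()
+ * above performs the actual cleanup, so bailing out here is safe.)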
+ */ + if (test_sta_flags(sta, WLAN_STA_BLOCK_BA)) + return; + + mutex_lock(&sta->ampdu_mlme.mtx); + for (tid = 0; tid < STA_TID_NUM; tid++) { + if (test_and_clear_bit(tid, sta->ampdu_mlme.tid_rx_timer_expired)) + ___ieee80211_stop_rx_ba_session( + sta, tid, WLAN_BACK_RECIPIENT, + WLAN_REASON_QSTA_TIMEOUT, true); + + tid_tx = sta->ampdu_mlme.tid_start_tx[tid]; + if (tid_tx) { + /* + * Assign it over to the normal tid_tx array + * where it "goes live". + */ + spin_lock_bh(&sta->lock); + + sta->ampdu_mlme.tid_start_tx[tid] = NULL; + /* could there be a race? */ + if (sta->ampdu_mlme.tid_tx[tid]) + kfree(tid_tx); + else + ieee80211_assign_tid_tx(sta, tid, tid_tx); + spin_unlock_bh(&sta->lock); + + ieee80211_tx_ba_session_handle_start(sta, tid); + continue; + } + + tid_tx = rcu_dereference_protected_tid_tx(sta, tid); + if (tid_tx && test_and_clear_bit(HT_AGG_STATE_WANT_STOP, + &tid_tx->state)) + ___ieee80211_stop_tx_ba_session(sta, tid, + WLAN_BACK_INITIATOR, + true); + } + mutex_unlock(&sta->ampdu_mlme.mtx); +} + +void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, + const u8 *da, u16 tid, + u16 initiator, u16 reason_code) +{ + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct ieee80211_mgmt *mgmt; + u16 params; + + skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom); + + if (!skb) { + printk(KERN_ERR "%s: failed to allocate buffer " + "for delba frame\n", sdata->name); + return; + } + + skb_reserve(skb, local->hw.extra_tx_headroom); + mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24); + memset(mgmt, 0, 24); + memcpy(mgmt->da, da, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + if (sdata->vif.type == NL80211_IFTYPE_AP || + sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); + else if (sdata->vif.type == NL80211_IFTYPE_STATION) + memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN); + + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + + skb_put(skb, 1 + sizeof(mgmt->u.action.u.delba)); + + mgmt->u.action.category = WLAN_CATEGORY_BACK; + mgmt->u.action.u.delba.action_code = WLAN_ACTION_DELBA; + params = (u16)(initiator << 11); /* bit 11 initiator */ + params |= (u16)(tid << 12); /* bit 15:12 TID number */ + + mgmt->u.action.u.delba.params = cpu_to_le16(params); + mgmt->u.action.u.delba.reason_code = cpu_to_le16(reason_code); + + ieee80211_tx_skb(sdata, skb); +} + +void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct ieee80211_mgmt *mgmt, size_t len) +{ + u16 tid, params; + u16 initiator; + + params = le16_to_cpu(mgmt->u.action.u.delba.params); + tid = (params & IEEE80211_DELBA_PARAM_TID_MASK) >> 12; + initiator = (params & IEEE80211_DELBA_PARAM_INITIATOR_MASK) >> 11; + +#ifdef CONFIG_MAC80211_HT_DEBUG + if (net_ratelimit()) + printk(KERN_DEBUG "delba from %pM (%s) tid %d reason code %d\n", + mgmt->sa, initiator ? 
"initiator" : "recipient", tid, + le16_to_cpu(mgmt->u.action.u.delba.reason_code)); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + + if (initiator == WLAN_BACK_INITIATOR) + __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_INITIATOR, 0, + true); + else + __ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_RECIPIENT, + true); +} + +int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, + enum ieee80211_smps_mode smps, const u8 *da, + const u8 *bssid) +{ + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct ieee80211_mgmt *action_frame; + + /* 27 = header + category + action + smps mode */ + skb = dev_alloc_skb(27 + local->hw.extra_tx_headroom); + if (!skb) + return -ENOMEM; + + skb_reserve(skb, local->hw.extra_tx_headroom); + action_frame = (void *)skb_put(skb, 27); + memcpy(action_frame->da, da, ETH_ALEN); + memcpy(action_frame->sa, sdata->dev->dev_addr, ETH_ALEN); + memcpy(action_frame->bssid, bssid, ETH_ALEN); + action_frame->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + action_frame->u.action.category = WLAN_CATEGORY_HT; + action_frame->u.action.u.ht_smps.action = WLAN_HT_ACTION_SMPS; + switch (smps) { + case IEEE80211_SMPS_AUTOMATIC: + case IEEE80211_SMPS_NUM_MODES: + WARN_ON(1); + case IEEE80211_SMPS_OFF: + action_frame->u.action.u.ht_smps.smps_control = + WLAN_HT_SMPS_CONTROL_DISABLED; + break; + case IEEE80211_SMPS_STATIC: + action_frame->u.action.u.ht_smps.smps_control = + WLAN_HT_SMPS_CONTROL_STATIC; + break; + case IEEE80211_SMPS_DYNAMIC: + action_frame->u.action.u.ht_smps.smps_control = + WLAN_HT_SMPS_CONTROL_DYNAMIC; + break; + } + + /* we'll do more on status of this frame */ + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; + ieee80211_tx_skb(sdata, skb); + + return 0; +} + +void ieee80211_request_smps_work(struct work_struct *work) +{ + struct ieee80211_sub_if_data *sdata = + container_of(work, struct ieee80211_sub_if_data, + u.mgd.request_smps_work); + + mutex_lock(&sdata->u.mgd.mtx); + __ieee80211_request_smps(sdata, sdata->u.mgd.driver_smps_mode); + mutex_unlock(&sdata->u.mgd.mtx); +} + +void ieee80211_request_smps(struct ieee80211_vif *vif, + enum ieee80211_smps_mode smps_mode) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + + if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) + return; + + if (WARN_ON(smps_mode == IEEE80211_SMPS_OFF)) + smps_mode = IEEE80211_SMPS_AUTOMATIC; + + sdata->u.mgd.driver_smps_mode = smps_mode; + + ieee80211_queue_work(&sdata->local->hw, + &sdata->u.mgd.request_smps_work); +} +/* this might change ... don't want non-open drivers using it */ +EXPORT_SYMBOL_GPL(ieee80211_request_smps); diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c new file mode 100644 index 00000000..56c24cab --- /dev/null +++ b/net/mac80211/ibss.c @@ -0,0 +1,1016 @@ +/* + * IBSS mode implementation + * Copyright 2003-2008, Jouni Malinen + * Copyright 2004, Instant802 Networks, Inc. + * Copyright 2005, Devicescape Software, Inc. + * Copyright 2006-2007 Jiri Benc + * Copyright 2007, Michael Wu + * Copyright 2009, Johannes Berg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ieee80211_i.h" +#include "driver-ops.h" +#include "rate.h" + +#define IEEE80211_SCAN_INTERVAL (2 * HZ) +#define IEEE80211_SCAN_INTERVAL_SLOW (15 * HZ) +#define IEEE80211_IBSS_JOIN_TIMEOUT (7 * HZ) + +#define IEEE80211_IBSS_MERGE_INTERVAL (30 * HZ) +#define IEEE80211_IBSS_INACTIVITY_LIMIT (60 * HZ) + +#define IEEE80211_IBSS_MAX_STA_ENTRIES 128 + + +static void ieee80211_rx_mgmt_auth_ibss(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + size_t len) +{ + u16 auth_alg, auth_transaction; + + lockdep_assert_held(&sdata->u.ibss.mtx); + + if (len < 24 + 6) + return; + + auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg); + auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction); + + /* + * IEEE 802.11 standard does not require authentication in IBSS + * networks and most implementations do not seem to use it. + * However, try to reply to authentication attempts if someone + * has actually implemented this. + */ + if (auth_alg == WLAN_AUTH_OPEN && auth_transaction == 1) + ieee80211_send_auth(sdata, 2, WLAN_AUTH_OPEN, NULL, 0, + sdata->u.ibss.bssid, NULL, 0, 0); +} + +static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, + const u8 *bssid, const int beacon_int, + struct ieee80211_channel *chan, + const u32 basic_rates, + const u16 capability, u64 tsf) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + struct ieee80211_local *local = sdata->local; + int rates, i; + struct sk_buff *skb; + struct ieee80211_mgmt *mgmt; + u8 *pos; + struct ieee80211_supported_band *sband; + struct cfg80211_bss *bss; + u32 bss_change; + u8 supp_rates[IEEE80211_MAX_SUPP_RATES]; + + lockdep_assert_held(&ifibss->mtx); + + /* Reset own TSF to allow time synchronization work. */ + drv_reset_tsf(local); + + skb = ifibss->skb; + rcu_assign_pointer(ifibss->presp, NULL); + synchronize_rcu(); + skb->data = skb->head; + skb->len = 0; + skb_reset_tail_pointer(skb); + skb_reserve(skb, sdata->local->hw.extra_tx_headroom); + + if (memcmp(ifibss->bssid, bssid, ETH_ALEN)) + sta_info_flush(sdata->local, sdata); + + /* if merging, indicate to driver that we leave the old IBSS */ + if (sdata->vif.bss_conf.ibss_joined) { + sdata->vif.bss_conf.ibss_joined = false; + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_IBSS); + } + + memcpy(ifibss->bssid, bssid, ETH_ALEN); + + sdata->drop_unencrypted = capability & WLAN_CAPABILITY_PRIVACY ? 
1 : 0; + + local->oper_channel = chan; + WARN_ON(!ieee80211_set_channel_type(local, sdata, NL80211_CHAN_NO_HT)); + ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); + + sband = local->hw.wiphy->bands[chan->band]; + + /* build supported rates array */ + pos = supp_rates; + for (i = 0; i < sband->n_bitrates; i++) { + int rate = sband->bitrates[i].bitrate; + u8 basic = 0; + if (basic_rates & BIT(i)) + basic = 0x80; + *pos++ = basic | (u8) (rate / 5); + } + + /* Build IBSS probe response */ + mgmt = (void *) skb_put(skb, 24 + sizeof(mgmt->u.beacon)); + memset(mgmt, 0, 24 + sizeof(mgmt->u.beacon)); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_PROBE_RESP); + memset(mgmt->da, 0xff, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + memcpy(mgmt->bssid, ifibss->bssid, ETH_ALEN); + mgmt->u.beacon.beacon_int = cpu_to_le16(beacon_int); + mgmt->u.beacon.timestamp = cpu_to_le64(tsf); + mgmt->u.beacon.capab_info = cpu_to_le16(capability); + + pos = skb_put(skb, 2 + ifibss->ssid_len); + *pos++ = WLAN_EID_SSID; + *pos++ = ifibss->ssid_len; + memcpy(pos, ifibss->ssid, ifibss->ssid_len); + + rates = sband->n_bitrates; + if (rates > 8) + rates = 8; + pos = skb_put(skb, 2 + rates); + *pos++ = WLAN_EID_SUPP_RATES; + *pos++ = rates; + memcpy(pos, supp_rates, rates); + + if (sband->band == IEEE80211_BAND_2GHZ) { + pos = skb_put(skb, 2 + 1); + *pos++ = WLAN_EID_DS_PARAMS; + *pos++ = 1; + *pos++ = ieee80211_frequency_to_channel(chan->center_freq); + } + + pos = skb_put(skb, 2 + 2); + *pos++ = WLAN_EID_IBSS_PARAMS; + *pos++ = 2; + /* FIX: set ATIM window based on scan results */ + *pos++ = 0; + *pos++ = 0; + + if (sband->n_bitrates > 8) { + rates = sband->n_bitrates - 8; + pos = skb_put(skb, 2 + rates); + *pos++ = WLAN_EID_EXT_SUPP_RATES; + *pos++ = rates; + memcpy(pos, &supp_rates[8], rates); + } + + if (ifibss->ie_len) + memcpy(skb_put(skb, ifibss->ie_len), + ifibss->ie, ifibss->ie_len); + + if (local->hw.queues >= 4) { + pos = skb_put(skb, 9); + *pos++ = WLAN_EID_VENDOR_SPECIFIC; + *pos++ = 7; /* len */ + *pos++ = 0x00; /* Microsoft OUI 00:50:F2 */ + *pos++ = 0x50; + *pos++ = 0xf2; + *pos++ = 2; /* WME */ + *pos++ = 0; /* WME info */ + *pos++ = 1; /* WME ver */ + *pos++ = 0; /* U-APSD no in use */ + } + + rcu_assign_pointer(ifibss->presp, skb); + + sdata->vif.bss_conf.beacon_int = beacon_int; + sdata->vif.bss_conf.basic_rates = basic_rates; + bss_change = BSS_CHANGED_BEACON_INT; + bss_change |= ieee80211_reset_erp_info(sdata); + bss_change |= BSS_CHANGED_BSSID; + bss_change |= BSS_CHANGED_BEACON; + bss_change |= BSS_CHANGED_BEACON_ENABLED; + bss_change |= BSS_CHANGED_BASIC_RATES; + bss_change |= BSS_CHANGED_IBSS; + sdata->vif.bss_conf.ibss_joined = true; + ieee80211_bss_info_change_notify(sdata, bss_change); + + ieee80211_sta_def_wmm_params(sdata, sband->n_bitrates, supp_rates); + + ifibss->state = IEEE80211_IBSS_MLME_JOINED; + mod_timer(&ifibss->timer, + round_jiffies(jiffies + IEEE80211_IBSS_MERGE_INTERVAL)); + + bss = cfg80211_inform_bss_frame(local->hw.wiphy, local->hw.conf.channel, + mgmt, skb->len, 0, GFP_KERNEL); + cfg80211_put_bss(bss); + cfg80211_ibss_joined(sdata->dev, ifibss->bssid, GFP_KERNEL); +} + +static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, + struct ieee80211_bss *bss) +{ + struct cfg80211_bss *cbss = + container_of((void *)bss, struct cfg80211_bss, priv); + struct ieee80211_supported_band *sband; + u32 basic_rates; + int i, j; + u16 beacon_int = cbss->beacon_interval; + + lockdep_assert_held(&sdata->u.ibss.mtx); + + if 
(beacon_int < 10) + beacon_int = 10; + + sband = sdata->local->hw.wiphy->bands[cbss->channel->band]; + + basic_rates = 0; + + for (i = 0; i < bss->supp_rates_len; i++) { + int rate = (bss->supp_rates[i] & 0x7f) * 5; + bool is_basic = !!(bss->supp_rates[i] & 0x80); + + for (j = 0; j < sband->n_bitrates; j++) { + if (sband->bitrates[j].bitrate == rate) { + if (is_basic) + basic_rates |= BIT(j); + break; + } + } + } + + __ieee80211_sta_join_ibss(sdata, cbss->bssid, + beacon_int, + cbss->channel, + basic_rates, + cbss->capability, + cbss->tsf); +} + +static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + size_t len, + struct ieee80211_rx_status *rx_status, + struct ieee802_11_elems *elems, + bool beacon) +{ + struct ieee80211_local *local = sdata->local; + int freq; + struct cfg80211_bss *cbss; + struct ieee80211_bss *bss; + struct sta_info *sta; + struct ieee80211_channel *channel; + u64 beacon_timestamp, rx_timestamp; + u32 supp_rates = 0; + enum ieee80211_band band = rx_status->band; + + if (elems->ds_params && elems->ds_params_len == 1) + freq = ieee80211_channel_to_frequency(elems->ds_params[0], + band); + else + freq = rx_status->freq; + + channel = ieee80211_get_channel(local->hw.wiphy, freq); + + if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) + return; + + if (sdata->vif.type == NL80211_IFTYPE_ADHOC && + memcmp(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN) == 0) { + + rcu_read_lock(); + sta = sta_info_get(sdata, mgmt->sa); + + if (elems->supp_rates) { + supp_rates = ieee80211_sta_get_rates(local, elems, + band); + if (sta) { + u32 prev_rates; + + prev_rates = sta->sta.supp_rates[band]; + /* make sure mandatory rates are always added */ + sta->sta.supp_rates[band] = supp_rates | + ieee80211_mandatory_rates(local, band); + + if (sta->sta.supp_rates[band] != prev_rates) { +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG + "%s: updated supp_rates set " + "for %pM based on beacon" + "/probe_resp (0x%x -> 0x%x)\n", + sdata->name, sta->sta.addr, + prev_rates, + sta->sta.supp_rates[band]); +#endif + rate_control_rate_init(sta); + } + } else + sta = ieee80211_ibss_add_sta(sdata, mgmt->bssid, + mgmt->sa, supp_rates, + GFP_ATOMIC); + } + + if (sta && elems->wmm_info) + set_sta_flags(sta, WLAN_STA_WME); + + rcu_read_unlock(); + } + + bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems, + channel, beacon); + if (!bss) + return; + + cbss = container_of((void *)bss, struct cfg80211_bss, priv); + + /* was just updated in ieee80211_bss_info_update */ + beacon_timestamp = cbss->tsf; + + /* check if we need to merge IBSS */ + + /* we use a fixed BSSID */ + if (sdata->u.ibss.fixed_bssid) + goto put_bss; + + /* not an IBSS */ + if (!(cbss->capability & WLAN_CAPABILITY_IBSS)) + goto put_bss; + + /* different channel */ + if (cbss->channel != local->oper_channel) + goto put_bss; + + /* different SSID */ + if (elems->ssid_len != sdata->u.ibss.ssid_len || + memcmp(elems->ssid, sdata->u.ibss.ssid, + sdata->u.ibss.ssid_len)) + goto put_bss; + + /* same BSSID */ + if (memcmp(cbss->bssid, sdata->u.ibss.bssid, ETH_ALEN) == 0) + goto put_bss; + + if (rx_status->flag & RX_FLAG_MACTIME_MPDU) { + /* + * For correct IBSS merging we need mactime; since mactime is + * defined as the time the first data symbol of the frame hits + * the PHY, and the timestamp of the beacon is defined as "the + * time that the data symbol containing the first bit of the + * timestamp is transmitted to the PHY plus the transmitting + * STA's delays through its 
local PHY from the MAC-PHY + * interface to its interface with the WM" (802.11 11.1.2) + * - equals the time this bit arrives at the receiver - we have + * to take into account the offset between the two. + * + * E.g. at 1 MBit that means mactime is 192 usec earlier + * (=24 bytes * 8 usecs/byte) than the beacon timestamp. + */ + int rate; + + if (rx_status->flag & RX_FLAG_HT) + rate = 65; /* TODO: HT rates */ + else + rate = local->hw.wiphy->bands[band]-> + bitrates[rx_status->rate_idx].bitrate; + + rx_timestamp = rx_status->mactime + (24 * 8 * 10 / rate); + } else { + /* + * second best option: get current TSF + * (will return -1 if not supported) + */ + rx_timestamp = drv_get_tsf(local); + } + +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG "RX beacon SA=%pM BSSID=" + "%pM TSF=0x%llx BCN=0x%llx diff=%lld @%lu\n", + mgmt->sa, mgmt->bssid, + (unsigned long long)rx_timestamp, + (unsigned long long)beacon_timestamp, + (unsigned long long)(rx_timestamp - beacon_timestamp), + jiffies); +#endif + + if (beacon_timestamp > rx_timestamp) { +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG "%s: beacon TSF higher than " + "local TSF - IBSS merge with BSSID %pM\n", + sdata->name, mgmt->bssid); +#endif + ieee80211_sta_join_ibss(sdata, bss); + supp_rates = ieee80211_sta_get_rates(local, elems, band); + ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, + supp_rates, GFP_KERNEL); + } + + put_bss: + ieee80211_rx_bss_put(local, bss); +} + +/* + * Add a new IBSS station, will also be called by the RX code when, + * in IBSS mode, receiving a frame from a yet-unknown station, hence + * must be callable in atomic context. + */ +struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, + u8 *bssid,u8 *addr, u32 supp_rates, + gfp_t gfp) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + int band = local->hw.conf.channel->band; + + /* + * XXX: Consider removing the least recently used entry and + * allow new one to be added. + */ + if (local->num_sta >= IEEE80211_IBSS_MAX_STA_ENTRIES) { + if (net_ratelimit()) + printk(KERN_DEBUG "%s: No room for a new IBSS STA entry %pM\n", + sdata->name, addr); + return NULL; + } + + if (ifibss->state == IEEE80211_IBSS_MLME_SEARCH) + return NULL; + + if (compare_ether_addr(bssid, sdata->u.ibss.bssid)) + return NULL; + +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + wiphy_debug(local->hw.wiphy, "Adding new IBSS station %pM (dev=%s)\n", + addr, sdata->name); +#endif + + sta = sta_info_alloc(sdata, addr, gfp); + if (!sta) + return NULL; + + sta->last_rx = jiffies; + set_sta_flags(sta, WLAN_STA_AUTHORIZED); + + /* make sure mandatory rates are always added */ + sta->sta.supp_rates[band] = supp_rates | + ieee80211_mandatory_rates(local, band); + + rate_control_rate_init(sta); + + /* If it fails, maybe we raced another insertion? 
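+ * sta_info_insert() is expected to free the entry it was given when it
+ * fails, so on failure fall back to looking up whichever STA entry won
+ * the race rather than retrying the insert.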
*/ + if (sta_info_insert(sta)) + return sta_info_get(sdata, addr); + return sta; +} + +static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + int active = 0; + struct sta_info *sta; + + lockdep_assert_held(&sdata->u.ibss.mtx); + + rcu_read_lock(); + + list_for_each_entry_rcu(sta, &local->sta_list, list) { + if (sta->sdata == sdata && + time_after(sta->last_rx + IEEE80211_IBSS_MERGE_INTERVAL, + jiffies)) { + active++; + break; + } + } + + rcu_read_unlock(); + + return active; +} + +/* + * This function is called with state == IEEE80211_IBSS_MLME_JOINED + */ + +static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + + lockdep_assert_held(&ifibss->mtx); + + mod_timer(&ifibss->timer, + round_jiffies(jiffies + IEEE80211_IBSS_MERGE_INTERVAL)); + + ieee80211_sta_expire(sdata, IEEE80211_IBSS_INACTIVITY_LIMIT); + + if (time_before(jiffies, ifibss->last_scan_completed + + IEEE80211_IBSS_MERGE_INTERVAL)) + return; + + if (ieee80211_sta_active_ibss(sdata)) + return; + + if (ifibss->fixed_channel) + return; + + printk(KERN_DEBUG "%s: No active IBSS STAs - trying to scan for other " + "IBSS networks with same SSID (merge)\n", sdata->name); + + ieee80211_request_internal_scan(sdata, + ifibss->ssid, ifibss->ssid_len, + ifibss->fixed_channel ? ifibss->channel : NULL); +} + +static void ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + u8 bssid[ETH_ALEN]; + u16 capability; + int i; + + lockdep_assert_held(&ifibss->mtx); + + if (ifibss->fixed_bssid) { + memcpy(bssid, ifibss->bssid, ETH_ALEN); + } else { + /* Generate random, not broadcast, locally administered BSSID. Mix in + * own MAC address to make sure that devices that do not have proper + * random number generator get different BSSID. 
*/ + get_random_bytes(bssid, ETH_ALEN); + for (i = 0; i < ETH_ALEN; i++) + bssid[i] ^= sdata->vif.addr[i]; + bssid[0] &= ~0x01; + bssid[0] |= 0x02; + } + + printk(KERN_DEBUG "%s: Creating new IBSS network, BSSID %pM\n", + sdata->name, bssid); + + capability = WLAN_CAPABILITY_IBSS; + + if (ifibss->privacy) + capability |= WLAN_CAPABILITY_PRIVACY; + else + sdata->drop_unencrypted = 0; + + __ieee80211_sta_join_ibss(sdata, bssid, sdata->vif.bss_conf.beacon_int, + ifibss->channel, ifibss->basic_rates, + capability, 0); +} + +/* + * This function is called with state == IEEE80211_IBSS_MLME_SEARCH + */ + +static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + struct ieee80211_local *local = sdata->local; + struct cfg80211_bss *cbss; + struct ieee80211_channel *chan = NULL; + const u8 *bssid = NULL; + int active_ibss; + u16 capability; + + lockdep_assert_held(&ifibss->mtx); + + active_ibss = ieee80211_sta_active_ibss(sdata); +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG "%s: sta_find_ibss (active_ibss=%d)\n", + sdata->name, active_ibss); +#endif /* CONFIG_MAC80211_IBSS_DEBUG */ + + if (active_ibss) + return; + + capability = WLAN_CAPABILITY_IBSS; + if (ifibss->privacy) + capability |= WLAN_CAPABILITY_PRIVACY; + if (ifibss->fixed_bssid) + bssid = ifibss->bssid; + if (ifibss->fixed_channel) + chan = ifibss->channel; + if (!is_zero_ether_addr(ifibss->bssid)) + bssid = ifibss->bssid; + cbss = cfg80211_get_bss(local->hw.wiphy, chan, bssid, + ifibss->ssid, ifibss->ssid_len, + WLAN_CAPABILITY_IBSS | WLAN_CAPABILITY_PRIVACY, + capability); + + if (cbss) { + struct ieee80211_bss *bss; + + bss = (void *)cbss->priv; +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG " sta_find_ibss: selected %pM current " + "%pM\n", cbss->bssid, ifibss->bssid); +#endif /* CONFIG_MAC80211_IBSS_DEBUG */ + + printk(KERN_DEBUG "%s: Selected IBSS BSSID %pM" + " based on configured SSID\n", + sdata->name, cbss->bssid); + + ieee80211_sta_join_ibss(sdata, bss); + ieee80211_rx_bss_put(local, bss); + return; + } + +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG " did not try to join ibss\n"); +#endif /* CONFIG_MAC80211_IBSS_DEBUG */ + + /* Selected IBSS not found in current scan results - try to scan */ + if (time_after(jiffies, ifibss->last_scan_completed + + IEEE80211_SCAN_INTERVAL)) { + printk(KERN_DEBUG "%s: Trigger new scan to find an IBSS to " + "join\n", sdata->name); + + ieee80211_request_internal_scan(sdata, + ifibss->ssid, ifibss->ssid_len, + ifibss->fixed_channel ? ifibss->channel : NULL); + } else { + int interval = IEEE80211_SCAN_INTERVAL; + + if (time_after(jiffies, ifibss->ibss_join_req + + IEEE80211_IBSS_JOIN_TIMEOUT)) { + if (!(local->oper_channel->flags & IEEE80211_CHAN_NO_IBSS)) { + ieee80211_sta_create_ibss(sdata); + return; + } + printk(KERN_DEBUG "%s: IBSS not allowed on" + " %d MHz\n", sdata->name, + local->hw.conf.channel->center_freq); + + /* No IBSS found - decrease scan interval and continue + * scanning. 
*/ + interval = IEEE80211_SCAN_INTERVAL_SLOW; + } + + mod_timer(&ifibss->timer, + round_jiffies(jiffies + interval)); + } +} + +static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata, + struct sk_buff *req) +{ + struct ieee80211_mgmt *mgmt = (void *)req->data; + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + struct ieee80211_local *local = sdata->local; + int tx_last_beacon, len = req->len; + struct sk_buff *skb; + struct ieee80211_mgmt *resp; + struct sk_buff *presp; + u8 *pos, *end; + + lockdep_assert_held(&ifibss->mtx); + + presp = rcu_dereference_protected(ifibss->presp, + lockdep_is_held(&ifibss->mtx)); + + if (ifibss->state != IEEE80211_IBSS_MLME_JOINED || + len < 24 + 2 || !presp) + return; + + tx_last_beacon = drv_tx_last_beacon(local); + +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG "%s: RX ProbeReq SA=%pM DA=%pM BSSID=%pM" + " (tx_last_beacon=%d)\n", + sdata->name, mgmt->sa, mgmt->da, + mgmt->bssid, tx_last_beacon); +#endif /* CONFIG_MAC80211_IBSS_DEBUG */ + + if (!tx_last_beacon && is_multicast_ether_addr(mgmt->da)) + return; + + if (memcmp(mgmt->bssid, ifibss->bssid, ETH_ALEN) != 0 && + memcmp(mgmt->bssid, "\xff\xff\xff\xff\xff\xff", ETH_ALEN) != 0) + return; + + end = ((u8 *) mgmt) + len; + pos = mgmt->u.probe_req.variable; + if (pos[0] != WLAN_EID_SSID || + pos + 2 + pos[1] > end) { +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG "%s: Invalid SSID IE in ProbeReq " + "from %pM\n", + sdata->name, mgmt->sa); +#endif + return; + } + if (pos[1] != 0 && + (pos[1] != ifibss->ssid_len || + memcmp(pos + 2, ifibss->ssid, ifibss->ssid_len))) { + /* Ignore ProbeReq for foreign SSID */ + return; + } + + /* Reply with ProbeResp */ + skb = skb_copy(presp, GFP_KERNEL); + if (!skb) + return; + + resp = (struct ieee80211_mgmt *) skb->data; + memcpy(resp->da, mgmt->sa, ETH_ALEN); +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG "%s: Sending ProbeResp to %pM\n", + sdata->name, resp->da); +#endif /* CONFIG_MAC80211_IBSS_DEBUG */ + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + ieee80211_tx_skb(sdata, skb); +} + +static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + size_t len, + struct ieee80211_rx_status *rx_status) +{ + size_t baselen; + struct ieee802_11_elems elems; + + if (memcmp(mgmt->da, sdata->vif.addr, ETH_ALEN)) + return; /* ignore ProbeResp to foreign address */ + + baselen = (u8 *) mgmt->u.probe_resp.variable - (u8 *) mgmt; + if (baselen > len) + return; + + ieee802_11_parse_elems(mgmt->u.probe_resp.variable, len - baselen, + &elems); + + ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, false); +} + +static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + size_t len, + struct ieee80211_rx_status *rx_status) +{ + size_t baselen; + struct ieee802_11_elems elems; + + /* Process beacon from the current BSS */ + baselen = (u8 *) mgmt->u.beacon.variable - (u8 *) mgmt; + if (baselen > len) + return; + + ieee802_11_parse_elems(mgmt->u.beacon.variable, len - baselen, &elems); + + ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, true); +} + +void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_rx_status *rx_status; + struct ieee80211_mgmt *mgmt; + u16 fc; + + rx_status = IEEE80211_SKB_RXCB(skb); + mgmt = (struct ieee80211_mgmt *) skb->data; + fc = le16_to_cpu(mgmt->frame_control); + + mutex_lock(&sdata->u.ibss.mtx); + + if 
(!sdata->u.ibss.ssid_len) + goto mgmt_out; /* not ready to merge yet */ + + switch (fc & IEEE80211_FCTL_STYPE) { + case IEEE80211_STYPE_PROBE_REQ: + ieee80211_rx_mgmt_probe_req(sdata, skb); + break; + case IEEE80211_STYPE_PROBE_RESP: + ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len, + rx_status); + break; + case IEEE80211_STYPE_BEACON: + ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len, + rx_status); + break; + case IEEE80211_STYPE_AUTH: + ieee80211_rx_mgmt_auth_ibss(sdata, mgmt, skb->len); + break; + } + + mgmt_out: + mutex_unlock(&sdata->u.ibss.mtx); +} + +void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + + mutex_lock(&ifibss->mtx); + + /* + * Work could be scheduled after scan or similar + * when we aren't even joined (or trying) with a + * network. + */ + if (!ifibss->ssid_len) + goto out; + + switch (ifibss->state) { + case IEEE80211_IBSS_MLME_SEARCH: + ieee80211_sta_find_ibss(sdata); + break; + case IEEE80211_IBSS_MLME_JOINED: + ieee80211_sta_merge_ibss(sdata); + break; + default: + WARN_ON(1); + break; + } + + out: + mutex_unlock(&ifibss->mtx); +} + +static void ieee80211_ibss_timer(unsigned long data) +{ + struct ieee80211_sub_if_data *sdata = + (struct ieee80211_sub_if_data *) data; + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + struct ieee80211_local *local = sdata->local; + + if (local->quiescing) { + ifibss->timer_running = true; + return; + } + + ieee80211_queue_work(&local->hw, &sdata->work); +} + +#ifdef CONFIG_PM +void ieee80211_ibss_quiesce(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + + if (del_timer_sync(&ifibss->timer)) + ifibss->timer_running = true; +} + +void ieee80211_ibss_restart(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + + if (ifibss->timer_running) { + add_timer(&ifibss->timer); + ifibss->timer_running = false; + } +} +#endif + +void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + + setup_timer(&ifibss->timer, ieee80211_ibss_timer, + (unsigned long) sdata); + mutex_init(&ifibss->mtx); +} + +/* scan finished notification */ +void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local) +{ + struct ieee80211_sub_if_data *sdata; + + mutex_lock(&local->iflist_mtx); + list_for_each_entry(sdata, &local->interfaces, list) { + if (!ieee80211_sdata_running(sdata)) + continue; + if (sdata->vif.type != NL80211_IFTYPE_ADHOC) + continue; + sdata->u.ibss.last_scan_completed = jiffies; + ieee80211_queue_work(&local->hw, &sdata->work); + } + mutex_unlock(&local->iflist_mtx); +} + +int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, + struct cfg80211_ibss_params *params) +{ + struct sk_buff *skb; + + skb = dev_alloc_skb(sdata->local->hw.extra_tx_headroom + + 36 /* bitrates */ + + 34 /* SSID */ + + 3 /* DS params */ + + 4 /* IBSS params */ + + params->ie_len); + if (!skb) + return -ENOMEM; + + mutex_lock(&sdata->u.ibss.mtx); + + if (params->bssid) { + memcpy(sdata->u.ibss.bssid, params->bssid, ETH_ALEN); + sdata->u.ibss.fixed_bssid = true; + } else + sdata->u.ibss.fixed_bssid = false; + + sdata->u.ibss.privacy = params->privacy; + sdata->u.ibss.basic_rates = params->basic_rates; + memcpy(sdata->vif.bss_conf.mcast_rate, params->mcast_rate, + sizeof(params->mcast_rate)); + + sdata->vif.bss_conf.beacon_int = params->beacon_interval; + + sdata->u.ibss.channel = params->channel; + sdata->u.ibss.fixed_channel = 
params->channel_fixed; + + /* fix ourselves to that channel now already */ + if (params->channel_fixed) { + sdata->local->oper_channel = params->channel; + WARN_ON(!ieee80211_set_channel_type(sdata->local, sdata, + NL80211_CHAN_NO_HT)); + } + + if (params->ie) { + sdata->u.ibss.ie = kmemdup(params->ie, params->ie_len, + GFP_KERNEL); + if (sdata->u.ibss.ie) + sdata->u.ibss.ie_len = params->ie_len; + } + + sdata->u.ibss.skb = skb; + sdata->u.ibss.state = IEEE80211_IBSS_MLME_SEARCH; + sdata->u.ibss.ibss_join_req = jiffies; + + memcpy(sdata->u.ibss.ssid, params->ssid, IEEE80211_MAX_SSID_LEN); + sdata->u.ibss.ssid_len = params->ssid_len; + + mutex_unlock(&sdata->u.ibss.mtx); + + mutex_lock(&sdata->local->mtx); + ieee80211_recalc_idle(sdata->local); + mutex_unlock(&sdata->local->mtx); + + ieee80211_queue_work(&sdata->local->hw, &sdata->work); + + return 0; +} + +int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata) +{ + struct sk_buff *skb; + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + struct ieee80211_local *local = sdata->local; + struct cfg80211_bss *cbss; + u16 capability; + int active_ibss; + + mutex_lock(&sdata->u.ibss.mtx); + + sdata->u.ibss.state = IEEE80211_IBSS_MLME_SEARCH; + memset(sdata->u.ibss.bssid, 0, ETH_ALEN); + sdata->u.ibss.ssid_len = 0; + + active_ibss = ieee80211_sta_active_ibss(sdata); + + if (!active_ibss && !is_zero_ether_addr(ifibss->bssid)) { + capability = WLAN_CAPABILITY_IBSS; + + if (ifibss->privacy) + capability |= WLAN_CAPABILITY_PRIVACY; + + cbss = cfg80211_get_bss(local->hw.wiphy, ifibss->channel, + ifibss->bssid, ifibss->ssid, + ifibss->ssid_len, WLAN_CAPABILITY_IBSS | + WLAN_CAPABILITY_PRIVACY, + capability); + + if (cbss) { + cfg80211_unlink_bss(local->hw.wiphy, cbss); + cfg80211_put_bss(cbss); + } + } + + sta_info_flush(sdata->local, sdata); + + /* remove beacon */ + kfree(sdata->u.ibss.ie); + skb = rcu_dereference_protected(sdata->u.ibss.presp, + lockdep_is_held(&sdata->u.ibss.mtx)); + rcu_assign_pointer(sdata->u.ibss.presp, NULL); + sdata->vif.bss_conf.ibss_joined = false; + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED | + BSS_CHANGED_IBSS); + synchronize_rcu(); + kfree_skb(skb); + + skb_queue_purge(&sdata->skb_queue); + + del_timer_sync(&sdata->u.ibss.timer); + + mutex_unlock(&sdata->u.ibss.mtx); + + mutex_lock(&local->mtx); + ieee80211_recalc_idle(sdata->local); + mutex_unlock(&local->mtx); + + return 0; +} diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h new file mode 100644 index 00000000..3fdac77b --- /dev/null +++ b/net/mac80211/ieee80211_i.h @@ -0,0 +1,1408 @@ +/* + * Copyright 2002-2005, Instant802 Networks, Inc. + * Copyright 2005, Devicescape Software, Inc. + * Copyright 2006-2007 Jiri Benc + * Copyright 2007-2010 Johannes Berg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef IEEE80211_I_H +#define IEEE80211_I_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "key.h" +#include "sta_info.h" + +struct ieee80211_local; + +/* Maximum number of broadcast/multicast frames to buffer when some of the + * associated stations are using power saving. */ +#define AP_MAX_BC_BUFFER 128 + +/* Maximum number of frames buffered to all STAs, including multicast frames. 
+ * Note: increasing this limit increases the potential memory requirement. Each + * frame can be up to about 2 kB long. */ +#define TOTAL_MAX_TX_BUFFER 512 + +/* Required encryption head and tailroom */ +#define IEEE80211_ENCRYPT_HEADROOM 8 +#define IEEE80211_ENCRYPT_TAILROOM 18 + +/* IEEE 802.11 (Ch. 9.5 Defragmentation) requires support for concurrent + * reception of at least three fragmented frames. This limit can be increased + * by changing this define, at the cost of slower frame reassembly and + * increased memory use (about 2 kB of RAM per entry). */ +#define IEEE80211_FRAGMENT_MAX 4 + +#define TU_TO_EXP_TIME(x) (jiffies + usecs_to_jiffies((x) * 1024)) + +#define IEEE80211_DEFAULT_UAPSD_QUEUES \ + (IEEE80211_WMM_IE_STA_QOSINFO_AC_BK | \ + IEEE80211_WMM_IE_STA_QOSINFO_AC_BE | \ + IEEE80211_WMM_IE_STA_QOSINFO_AC_VI | \ + IEEE80211_WMM_IE_STA_QOSINFO_AC_VO) + +#define IEEE80211_DEFAULT_MAX_SP_LEN \ + IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL + +struct ieee80211_fragment_entry { + unsigned long first_frag_time; + unsigned int seq; + unsigned int rx_queue; + unsigned int last_frag; + unsigned int extra_len; + struct sk_buff_head skb_list; + int ccmp; /* Whether fragments were encrypted with CCMP */ + u8 last_pn[6]; /* PN of the last fragment if CCMP was used */ +}; + + +struct ieee80211_bss { + /* don't want to look up all the time */ + size_t ssid_len; + u8 ssid[IEEE80211_MAX_SSID_LEN]; + + u8 dtim_period; + + bool wmm_used; + bool uapsd_supported; + + unsigned long last_probe_resp; + +#ifdef CONFIG_MAC80211_MESH + u8 *mesh_id; + size_t mesh_id_len; + u8 *mesh_cfg; +#endif + +#define IEEE80211_MAX_SUPP_RATES 32 + u8 supp_rates[IEEE80211_MAX_SUPP_RATES]; + size_t supp_rates_len; + + /* + * During association, we save an ERP value from a probe response so + * that we can feed ERP info to the driver when handling the + * association completes. these fields probably won't be up-to-date + * otherwise, you probably don't want to use them. + */ + bool has_erp_value; + u8 erp_value; +}; + +static inline u8 *bss_mesh_cfg(struct ieee80211_bss *bss) +{ +#ifdef CONFIG_MAC80211_MESH + return bss->mesh_cfg; +#endif + return NULL; +} + +static inline u8 *bss_mesh_id(struct ieee80211_bss *bss) +{ +#ifdef CONFIG_MAC80211_MESH + return bss->mesh_id; +#endif + return NULL; +} + +static inline u8 bss_mesh_id_len(struct ieee80211_bss *bss) +{ +#ifdef CONFIG_MAC80211_MESH + return bss->mesh_id_len; +#endif + return 0; +} + + +typedef unsigned __bitwise__ ieee80211_tx_result; +#define TX_CONTINUE ((__force ieee80211_tx_result) 0u) +#define TX_DROP ((__force ieee80211_tx_result) 1u) +#define TX_QUEUED ((__force ieee80211_tx_result) 2u) + +#define IEEE80211_TX_FRAGMENTED BIT(0) +#define IEEE80211_TX_UNICAST BIT(1) +#define IEEE80211_TX_PS_BUFFERED BIT(2) + +struct ieee80211_tx_data { + struct sk_buff *skb; + struct ieee80211_local *local; + struct ieee80211_sub_if_data *sdata; + struct sta_info *sta; + struct ieee80211_key *key; + + struct ieee80211_channel *channel; + + u16 ethertype; + unsigned int flags; +}; + + +typedef unsigned __bitwise__ ieee80211_rx_result; +#define RX_CONTINUE ((__force ieee80211_rx_result) 0u) +#define RX_DROP_UNUSABLE ((__force ieee80211_rx_result) 1u) +#define RX_DROP_MONITOR ((__force ieee80211_rx_result) 2u) +#define RX_QUEUED ((__force ieee80211_rx_result) 3u) + +/** + * enum ieee80211_packet_rx_flags - packet RX flags + * @IEEE80211_RX_RA_MATCH: frame is destined to interface currently processed + * (incl. 
multicast frames) + * @IEEE80211_RX_IN_SCAN: received while scanning + * @IEEE80211_RX_FRAGMENTED: fragmented frame + * @IEEE80211_RX_AMSDU: a-MSDU packet + * @IEEE80211_RX_MALFORMED_ACTION_FRM: action frame is malformed + * @IEEE80211_RX_DEFERRED_RELEASE: frame was subjected to receive reordering + * + * These are per-frame flags that are attached to a frame in the + * @rx_flags field of &struct ieee80211_rx_status. + */ +enum ieee80211_packet_rx_flags { + IEEE80211_RX_IN_SCAN = BIT(0), + IEEE80211_RX_RA_MATCH = BIT(1), + IEEE80211_RX_FRAGMENTED = BIT(2), + IEEE80211_RX_AMSDU = BIT(3), + IEEE80211_RX_MALFORMED_ACTION_FRM = BIT(4), + IEEE80211_RX_DEFERRED_RELEASE = BIT(5), +}; + +/** + * enum ieee80211_rx_flags - RX data flags + * + * @IEEE80211_RX_CMNTR: received on cooked monitor already + * + * These flags are used across handling multiple interfaces + * for a single frame. + */ +enum ieee80211_rx_flags { + IEEE80211_RX_CMNTR = BIT(0), +}; + +struct ieee80211_rx_data { + struct sk_buff *skb; + struct ieee80211_local *local; + struct ieee80211_sub_if_data *sdata; + struct sta_info *sta; + struct ieee80211_key *key; + + unsigned int flags; + int queue; + u32 tkip_iv32; + u16 tkip_iv16; +}; + +struct beacon_data { + u8 *head, *tail; + int head_len, tail_len; + int dtim_period; +}; + +struct ieee80211_if_ap { + struct beacon_data __rcu *beacon; + + struct list_head vlans; + + /* yes, this looks ugly, but guarantees that we can later use + * bitmap_empty :) + * NB: don't touch this bitmap, use sta_info_{set,clear}_tim_bit */ + u8 tim[sizeof(unsigned long) * BITS_TO_LONGS(IEEE80211_MAX_AID + 1)]; + struct sk_buff_head ps_bc_buf; + atomic_t num_sta_ps; /* number of stations in PS mode */ + int dtim_count; + bool dtim_bc_mc; +}; + +struct ieee80211_if_wds { + struct sta_info *sta; + u8 remote_addr[ETH_ALEN]; +}; + +struct ieee80211_if_vlan { + struct list_head list; + + /* used for all tx if the VLAN is configured to 4-addr mode */ + struct sta_info __rcu *sta; +}; + +struct mesh_stats { + __u32 fwded_mcast; /* Mesh forwarded multicast frames */ + __u32 fwded_unicast; /* Mesh forwarded unicast frames */ + __u32 fwded_frames; /* Mesh total forwarded frames */ + __u32 dropped_frames_ttl; /* Not transmitted since mesh_ttl == 0*/ + __u32 dropped_frames_no_route; /* Not transmitted, no route found */ + atomic_t estab_plinks; +}; + +#define PREQ_Q_F_START 0x1 +#define PREQ_Q_F_REFRESH 0x2 +struct mesh_preq_queue { + struct list_head list; + u8 dst[ETH_ALEN]; + u8 flags; +}; + +enum ieee80211_work_type { + IEEE80211_WORK_ABORT, + IEEE80211_WORK_DIRECT_PROBE, + IEEE80211_WORK_AUTH, + IEEE80211_WORK_ASSOC_BEACON_WAIT, + IEEE80211_WORK_ASSOC, + IEEE80211_WORK_REMAIN_ON_CHANNEL, + IEEE80211_WORK_OFFCHANNEL_TX, +}; + +/** + * enum work_done_result - indicates what to do after work was done + * + * @WORK_DONE_DESTROY: This work item is no longer needed, destroy. + * @WORK_DONE_REQUEUE: This work item was reset to be reused, and + * should be requeued. 
+ */ +enum work_done_result { + WORK_DONE_DESTROY, + WORK_DONE_REQUEUE, +}; + +struct ieee80211_work { + struct list_head list; + + struct rcu_head rcu_head; + + struct ieee80211_sub_if_data *sdata; + + enum work_done_result (*done)(struct ieee80211_work *wk, + struct sk_buff *skb); + + struct ieee80211_channel *chan; + enum nl80211_channel_type chan_type; + + unsigned long timeout; + enum ieee80211_work_type type; + + u8 filter_ta[ETH_ALEN]; + + bool started; + + union { + struct { + int tries; + u16 algorithm, transaction; + u8 ssid[IEEE80211_MAX_SSID_LEN]; + u8 ssid_len; + u8 key[WLAN_KEY_LEN_WEP104]; + u8 key_len, key_idx; + bool privacy; + } probe_auth; + struct { + struct cfg80211_bss *bss; + const u8 *supp_rates; + const u8 *ht_information_ie; + enum ieee80211_smps_mode smps; + int tries; + u16 capability; + u8 prev_bssid[ETH_ALEN]; + u8 ssid[IEEE80211_MAX_SSID_LEN]; + u8 ssid_len; + u8 supp_rates_len; + bool wmm_used, use_11n, uapsd_used; + } assoc; + struct { + u32 duration; + } remain; + struct { + struct sk_buff *frame; + u32 wait; + bool status; + } offchan_tx; + }; + + int ie_len; + /* must be last */ + u8 ie[0]; +}; + +/* flags used in struct ieee80211_if_managed.flags */ +enum ieee80211_sta_flags { + IEEE80211_STA_BEACON_POLL = BIT(0), + IEEE80211_STA_CONNECTION_POLL = BIT(1), + IEEE80211_STA_CONTROL_PORT = BIT(2), + IEEE80211_STA_DISABLE_11N = BIT(4), + IEEE80211_STA_CSA_RECEIVED = BIT(5), + IEEE80211_STA_MFP_ENABLED = BIT(6), + IEEE80211_STA_UAPSD_ENABLED = BIT(7), + IEEE80211_STA_NULLFUNC_ACKED = BIT(8), + IEEE80211_STA_RESET_SIGNAL_AVE = BIT(9), +}; + +struct ieee80211_if_managed { + struct timer_list timer; + struct timer_list conn_mon_timer; + struct timer_list bcn_mon_timer; + struct timer_list chswitch_timer; + struct work_struct monitor_work; + struct work_struct chswitch_work; + struct work_struct beacon_connection_loss_work; + + unsigned long beacon_timeout; + unsigned long probe_timeout; + int probe_send_count; + bool nullfunc_failed; + + struct mutex mtx; + struct cfg80211_bss *associated; + + u8 bssid[ETH_ALEN]; + + u16 aid; + + unsigned long timers_running; /* used for quiesce/restart */ + bool powersave; /* powersave requested for this iface */ + bool broken_ap; /* AP is broken -- turn off powersave */ + enum ieee80211_smps_mode req_smps, /* requested smps mode */ + ap_smps, /* smps mode AP thinks we're in */ + driver_smps_mode; /* smps mode request */ + + struct work_struct request_smps_work; + + unsigned int flags; + + bool beacon_crc_valid; + u32 beacon_crc; + + enum { + IEEE80211_MFP_DISABLED, + IEEE80211_MFP_OPTIONAL, + IEEE80211_MFP_REQUIRED + } mfp; /* management frame protection */ + + int wmm_last_param_set; + + u8 use_4addr; + + /* Signal strength from the last Beacon frame in the current BSS. */ + int last_beacon_signal; + + /* + * Weighted average of the signal strength from Beacon frames in the + * current BSS. This is in units of 1/16 of the signal unit to maintain + * accuracy and to speed up calculations, i.e., the value need to be + * divided by 16 to get the actual value. + */ + int ave_beacon_signal; + + /* + * Number of Beacon frames used in ave_beacon_signal. This can be used + * to avoid generating less reliable cqm events that would be based + * only on couple of received frames. + */ + unsigned int count_beacon_signal; + + /* + * Last Beacon frame signal strength average (ave_beacon_signal / 16) + * that triggered a cqm event. 0 indicates that no event has been + * generated for the current association. 
+ */ + int last_cqm_event_signal; +}; + +struct ieee80211_if_ibss { + struct timer_list timer; + + struct mutex mtx; + + unsigned long last_scan_completed; + + u32 basic_rates; + + bool timer_running; + + bool fixed_bssid; + bool fixed_channel; + bool privacy; + + u8 bssid[ETH_ALEN]; + u8 ssid[IEEE80211_MAX_SSID_LEN]; + u8 ssid_len, ie_len; + u8 *ie; + struct ieee80211_channel *channel; + + unsigned long ibss_join_req; + /* probe response/beacon for IBSS */ + struct sk_buff __rcu *presp; + struct sk_buff *skb; + + enum { + IEEE80211_IBSS_MLME_SEARCH, + IEEE80211_IBSS_MLME_JOINED, + } state; +}; + +struct ieee80211_if_mesh { + struct timer_list housekeeping_timer; + struct timer_list mesh_path_timer; + struct timer_list mesh_path_root_timer; + + unsigned long timers_running; + + unsigned long wrkq_flags; + + u8 mesh_id[IEEE80211_MAX_MESH_ID_LEN]; + size_t mesh_id_len; + /* Active Path Selection Protocol Identifier */ + u8 mesh_pp_id; + /* Active Path Selection Metric Identifier */ + u8 mesh_pm_id; + /* Congestion Control Mode Identifier */ + u8 mesh_cc_id; + /* Synchronization Protocol Identifier */ + u8 mesh_sp_id; + /* Authentication Protocol Identifier */ + u8 mesh_auth_id; + /* Local mesh Sequence Number */ + u32 sn; + /* Last used PREQ ID */ + u32 preq_id; + atomic_t mpaths; + /* Timestamp of last SN update */ + unsigned long last_sn_update; + /* Timestamp of last SN sent */ + unsigned long last_preq; + struct mesh_rmc *rmc; + spinlock_t mesh_preq_queue_lock; + struct mesh_preq_queue preq_queue; + int preq_queue_len; + struct mesh_stats mshstats; + struct mesh_config mshcfg; + u32 mesh_seqnum; + bool accepting_plinks; + const u8 *ie; + u8 ie_len; + enum { + IEEE80211_MESH_SEC_NONE = 0x0, + IEEE80211_MESH_SEC_AUTHED = 0x1, + IEEE80211_MESH_SEC_SECURED = 0x2, + } security; +}; + +#ifdef CONFIG_MAC80211_MESH +#define IEEE80211_IFSTA_MESH_CTR_INC(msh, name) \ + do { (msh)->mshstats.name++; } while (0) +#else +#define IEEE80211_IFSTA_MESH_CTR_INC(msh, name) \ + do { } while (0) +#endif + +/** + * enum ieee80211_sub_if_data_flags - virtual interface flags + * + * @IEEE80211_SDATA_ALLMULTI: interface wants all multicast packets + * @IEEE80211_SDATA_PROMISC: interface is promisc + * @IEEE80211_SDATA_OPERATING_GMODE: operating in G-only mode + * @IEEE80211_SDATA_DONT_BRIDGE_PACKETS: bridge packets between + * associated stations and deliver multicast frames both + * back to wireless media and to the local net stack. + */ +enum ieee80211_sub_if_data_flags { + IEEE80211_SDATA_ALLMULTI = BIT(0), + IEEE80211_SDATA_PROMISC = BIT(1), + IEEE80211_SDATA_OPERATING_GMODE = BIT(2), + IEEE80211_SDATA_DONT_BRIDGE_PACKETS = BIT(3), +}; + +/** + * enum ieee80211_sdata_state_bits - virtual interface state bits + * @SDATA_STATE_RUNNING: virtual interface is up & running; this + * mirrors netif_running() but is separate for interface type + * change handling while the interface is up + * @SDATA_STATE_OFFCHANNEL: This interface is currently in offchannel + * mode, so queues are stopped + */ +enum ieee80211_sdata_state_bits { + SDATA_STATE_RUNNING, + SDATA_STATE_OFFCHANNEL, +}; + +struct ieee80211_sub_if_data { + struct list_head list; + + struct wireless_dev wdev; + + /* keys */ + struct list_head key_list; + + struct net_device *dev; + struct ieee80211_local *local; + + unsigned int flags; + + unsigned long state; + + int drop_unencrypted; + + char name[IFNAMSIZ]; + + /* + * keep track of whether the HT opmode (stored in + * vif.bss_info.ht_operation_mode) is valid. 
+ */ + bool ht_opmode_valid; + + /* to detect idle changes */ + bool old_idle; + + /* Fragment table for host-based reassembly */ + struct ieee80211_fragment_entry fragments[IEEE80211_FRAGMENT_MAX]; + unsigned int fragment_next; + + struct ieee80211_key __rcu *keys[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS]; + struct ieee80211_key __rcu *default_unicast_key; + struct ieee80211_key __rcu *default_multicast_key; + struct ieee80211_key __rcu *default_mgmt_key; + + u16 sequence_number; + __be16 control_port_protocol; + bool control_port_no_encrypt; + + struct work_struct work; + struct sk_buff_head skb_queue; + + bool arp_filter_state; + + /* + * AP this belongs to: self in AP mode and + * corresponding AP in VLAN mode, NULL for + * all others (might be needed later in IBSS) + */ + struct ieee80211_if_ap *bss; + + /* bitmap of allowed (non-MCS) rate indexes for rate control */ + u32 rc_rateidx_mask[IEEE80211_NUM_BANDS]; + + union { + struct ieee80211_if_ap ap; + struct ieee80211_if_wds wds; + struct ieee80211_if_vlan vlan; + struct ieee80211_if_managed mgd; + struct ieee80211_if_ibss ibss; + struct ieee80211_if_mesh mesh; + u32 mntr_flags; + } u; + +#ifdef CONFIG_MAC80211_DEBUGFS + struct { + struct dentry *dir; + struct dentry *subdir_stations; + struct dentry *default_unicast_key; + struct dentry *default_multicast_key; + struct dentry *default_mgmt_key; + } debugfs; +#endif + /* must be last, dynamically sized area in this! */ + struct ieee80211_vif vif; +}; + +static inline +struct ieee80211_sub_if_data *vif_to_sdata(struct ieee80211_vif *p) +{ + return container_of(p, struct ieee80211_sub_if_data, vif); +} + +enum sdata_queue_type { + IEEE80211_SDATA_QUEUE_TYPE_FRAME = 0, + IEEE80211_SDATA_QUEUE_AGG_START = 1, + IEEE80211_SDATA_QUEUE_AGG_STOP = 2, +}; + +enum { + IEEE80211_RX_MSG = 1, + IEEE80211_TX_STATUS_MSG = 2, +}; + +enum queue_stop_reason { + IEEE80211_QUEUE_STOP_REASON_DRIVER, + IEEE80211_QUEUE_STOP_REASON_PS, + IEEE80211_QUEUE_STOP_REASON_CSA, + IEEE80211_QUEUE_STOP_REASON_AGGREGATION, + IEEE80211_QUEUE_STOP_REASON_SUSPEND, + IEEE80211_QUEUE_STOP_REASON_SKB_ADD, +}; + +#ifdef CONFIG_MAC80211_LEDS +struct tpt_led_trigger { + struct led_trigger trig; + char name[32]; + const struct ieee80211_tpt_blink *blink_table; + unsigned int blink_table_len; + struct timer_list timer; + unsigned long prev_traffic; + unsigned long tx_bytes, rx_bytes; + unsigned int active, want; + bool running; +}; +#endif + +/** + * mac80211 scan flags - currently active scan mode + * + * @SCAN_SW_SCANNING: We're currently in the process of scanning but may as + * well be on the operating channel + * @SCAN_HW_SCANNING: The hardware is scanning for us, we have no way to + * determine if we are on the operating channel or not + * @SCAN_COMPLETED: Set for our scan work function when the driver reported + * that the scan completed. + * @SCAN_ABORTED: Set for our scan work function when the driver reported + * a scan complete for an aborted scan. 
+ */ +enum { + SCAN_SW_SCANNING, + SCAN_HW_SCANNING, + SCAN_COMPLETED, + SCAN_ABORTED, +}; + +/** + * enum mac80211_scan_state - scan state machine states + * + * @SCAN_DECISION: Main entry point to the scan state machine, this state + * determines if we should keep on scanning or switch back to the + * operating channel + * @SCAN_SET_CHANNEL: Set the next channel to be scanned + * @SCAN_SEND_PROBE: Send probe requests and wait for probe responses + * @SCAN_LEAVE_OPER_CHANNEL: Leave the operating channel, notify the AP + * about us leaving the channel and stop all associated STA interfaces + * @SCAN_ENTER_OPER_CHANNEL: Enter the operating channel again, notify the + * AP about us being back and restart all associated STA interfaces + */ +enum mac80211_scan_state { + SCAN_DECISION, + SCAN_SET_CHANNEL, + SCAN_SEND_PROBE, + SCAN_LEAVE_OPER_CHANNEL, + SCAN_ENTER_OPER_CHANNEL, +}; + +struct ieee80211_local { + /* embed the driver visible part. + * don't cast (use the static inlines below), but we keep + * it first anyway so they become a no-op */ + struct ieee80211_hw hw; + + const struct ieee80211_ops *ops; + + /* + * work stuff, potentially off-channel (in the future) + */ + struct list_head work_list; + struct timer_list work_timer; + struct work_struct work_work; + struct sk_buff_head work_skb_queue; + + /* + * private workqueue to mac80211. mac80211 makes this accessible + * via ieee80211_queue_work() + */ + struct workqueue_struct *workqueue; + + unsigned long queue_stop_reasons[IEEE80211_MAX_QUEUES]; + /* also used to protect ampdu_ac_queue and amdpu_ac_stop_refcnt */ + spinlock_t queue_stop_reason_lock; + + int open_count; + int monitors, cooked_mntrs; + /* number of interfaces with corresponding FIF_ flags */ + int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll, + fif_probe_req; + int probe_req_reg; + unsigned int filter_flags; /* FIF_* */ + + bool wiphy_ciphers_allocated; + + /* protects the aggregated multicast list and filter calls */ + spinlock_t filter_lock; + + /* used for uploading changed mc list */ + struct work_struct reconfig_filter; + + /* used to reconfigure hardware SM PS */ + struct work_struct recalc_smps; + + /* aggregated multicast list */ + struct netdev_hw_addr_list mc_list; + + bool tim_in_locked_section; /* see ieee80211_beacon_get() */ + + /* + * suspended is true if we finished all the suspend _and_ we have + * not yet come up from resume. This is to be used by mac80211 + * to ensure driver sanity during suspend and mac80211's own + * sanity. It can eventually be used for WoW as well. + */ + bool suspended; + + /* + * Resuming is true while suspended, but when we're reprogramming the + * hardware -- at that time it's allowed to use ieee80211_queue_work() + * again even though some other parts of the stack are still suspended + * and we still drop received frames to avoid waking the stack. + */ + bool resuming; + + /* + * quiescing is true during the suspend process _only_ to + * ease timer cancelling etc. + */ + bool quiescing; + + /* device is started */ + bool started; + + /* wowlan is enabled -- don't reconfig on resume */ + bool wowlan; + + int tx_headroom; /* required headroom for hardware/radiotap */ + + /* Tasklet and skb queue to process calls from IRQ mode. All frames + * added to skb_queue will be processed, but frames in + * skb_queue_unreliable may be dropped if the total length of these + * queues increases over the limit. 
*/ +#define IEEE80211_IRQSAFE_QUEUE_LIMIT 128 + struct tasklet_struct tasklet; + struct sk_buff_head skb_queue; + struct sk_buff_head skb_queue_unreliable; + + /* + * Internal FIFO queue which is shared between multiple rx path + * stages. Its main task is to provide a serialization mechanism, + * so all rx handlers can enjoy having exclusive access to their + * private data structures. + */ + struct sk_buff_head rx_skb_queue; + bool running_rx_handler; /* protected by rx_skb_queue.lock */ + + /* Station data */ + /* + * The mutex only protects the list and counter, + * reads are done in RCU. + * Additionally, the lock protects the hash table, + * the pending list and each BSS's TIM bitmap. + */ + struct mutex sta_mtx; + spinlock_t sta_lock; + unsigned long num_sta; + struct list_head sta_list, sta_pending_list; + struct sta_info __rcu *sta_hash[STA_HASH_SIZE]; + struct timer_list sta_cleanup; + struct work_struct sta_finish_work; + int sta_generation; + + struct sk_buff_head pending[IEEE80211_MAX_QUEUES]; + struct tasklet_struct tx_pending_tasklet; + + atomic_t agg_queue_stop[IEEE80211_MAX_QUEUES]; + + /* number of interfaces with corresponding IFF_ flags */ + atomic_t iff_allmultis, iff_promiscs; + + struct rate_control_ref *rate_ctrl; + + struct crypto_cipher *wep_tx_tfm; + struct crypto_cipher *wep_rx_tfm; + u32 wep_iv; + + /* see iface.c */ + struct list_head interfaces; + struct mutex iflist_mtx; + + /* + * Key mutex, protects sdata's key_list and sta_info's + * key pointers (write access, they're RCU.) + */ + struct mutex key_mtx; + + /* mutex for scan and work locking */ + struct mutex mtx; + + /* Scanning and BSS list */ + unsigned long scanning; + struct cfg80211_ssid scan_ssid; + struct cfg80211_scan_request *int_scan_req; + struct cfg80211_scan_request *scan_req, *hw_scan_req; + struct ieee80211_channel *scan_channel; + enum ieee80211_band hw_scan_band; + int scan_channel_idx; + int scan_ies_len; + + bool sched_scanning; + struct ieee80211_sched_scan_ies sched_scan_ies; + struct work_struct sched_scan_stopped_work; + + unsigned long leave_oper_channel_time; + enum mac80211_scan_state next_scan_state; + struct delayed_work scan_work; + struct ieee80211_sub_if_data *scan_sdata; + enum nl80211_channel_type _oper_channel_type; + struct ieee80211_channel *oper_channel, *csa_channel; + + /* Temporary remain-on-channel for off-channel operations */ + struct ieee80211_channel *tmp_channel; + enum nl80211_channel_type tmp_channel_type; + + /* SNMP counters */ + /* dot11CountersTable */ + u32 dot11TransmittedFragmentCount; + u32 dot11MulticastTransmittedFrameCount; + u32 dot11FailedCount; + u32 dot11RetryCount; + u32 dot11MultipleRetryCount; + u32 dot11FrameDuplicateCount; + u32 dot11ReceivedFragmentCount; + u32 dot11MulticastReceivedFrameCount; + u32 dot11TransmittedFrameCount; + +#ifdef CONFIG_MAC80211_LEDS + int tx_led_counter, rx_led_counter; + struct led_trigger *tx_led, *rx_led, *assoc_led, *radio_led; + struct tpt_led_trigger *tpt_led_trigger; + char tx_led_name[32], rx_led_name[32], + assoc_led_name[32], radio_led_name[32]; +#endif + +#ifdef CONFIG_MAC80211_DEBUG_COUNTERS + /* TX/RX handler statistics */ + unsigned int tx_handlers_drop; + unsigned int tx_handlers_queued; + unsigned int tx_handlers_drop_unencrypted; + unsigned int tx_handlers_drop_fragment; + unsigned int tx_handlers_drop_wep; + unsigned int tx_handlers_drop_not_assoc; + unsigned int tx_handlers_drop_unauth_port; + unsigned int rx_handlers_drop; + unsigned int rx_handlers_queued; + unsigned int 
rx_handlers_drop_nullfunc; + unsigned int rx_handlers_drop_defrag; + unsigned int rx_handlers_drop_short; + unsigned int rx_handlers_drop_passive_scan; + unsigned int tx_expand_skb_head; + unsigned int tx_expand_skb_head_cloned; + unsigned int rx_expand_skb_head; + unsigned int rx_expand_skb_head2; + unsigned int rx_handlers_fragments; + unsigned int tx_status_drop; +#define I802_DEBUG_INC(c) (c)++ +#else /* CONFIG_MAC80211_DEBUG_COUNTERS */ +#define I802_DEBUG_INC(c) do { } while (0) +#endif /* CONFIG_MAC80211_DEBUG_COUNTERS */ + + + int total_ps_buffered; /* total number of all buffered unicast and + * multicast packets for power saving stations + */ + int wifi_wme_noack_test; + unsigned int wmm_acm; /* bit field of ACM bits (BIT(802.1D tag)) */ + + /* + * Bitmask of enabled u-apsd queues, + * IEEE80211_WMM_IE_STA_QOSINFO_AC_BE & co. Needs a new association + * to take effect. + */ + unsigned int uapsd_queues; + + /* + * Maximum number of buffered frames AP can deliver during a + * service period, IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL or similar. + * Needs a new association to take effect. + */ + unsigned int uapsd_max_sp_len; + + bool pspolling; + bool offchannel_ps_enabled; + /* + * PS can only be enabled when we have exactly one managed + * interface (and monitors) in PS, this then points there. + */ + struct ieee80211_sub_if_data *ps_sdata; + struct work_struct dynamic_ps_enable_work; + struct work_struct dynamic_ps_disable_work; + struct timer_list dynamic_ps_timer; + struct notifier_block network_latency_notifier; + struct notifier_block ifa_notifier; + + /* + * The dynamic ps timeout configured from user space via WEXT - + * this will override whatever chosen by mac80211 internally. + */ + int dynamic_ps_forced_timeout; + int dynamic_ps_user_timeout; + bool disable_dynamic_ps; + + int user_power_level; /* in dBm */ + int power_constr_level; /* in dBm */ + + enum ieee80211_smps_mode smps_mode; + + struct work_struct restart_work; + +#ifdef CONFIG_MAC80211_DEBUGFS + struct local_debugfsdentries { + struct dentry *rcdir; + struct dentry *keys; + } debugfs; +#endif + + struct ieee80211_channel *hw_roc_channel; + struct net_device *hw_roc_dev; + struct sk_buff *hw_roc_skb, *hw_roc_skb_for_status; + struct work_struct hw_roc_start, hw_roc_done; + enum nl80211_channel_type hw_roc_channel_type; + unsigned int hw_roc_duration; + u32 hw_roc_cookie; + bool hw_roc_for_tx; + unsigned long hw_offchan_tx_cookie; + + /* dummy netdev for use w/ NAPI */ + struct net_device napi_dev; + + struct napi_struct napi; +}; + +static inline struct ieee80211_sub_if_data * +IEEE80211_DEV_TO_SUB_IF(struct net_device *dev) +{ + return netdev_priv(dev); +} + +/* this struct represents 802.11n's RA/TID combination */ +struct ieee80211_ra_tid { + u8 ra[ETH_ALEN]; + u16 tid; +}; + +/* Parsed Information Elements */ +struct ieee802_11_elems { + u8 *ie_start; + size_t total_len; + + /* pointers to IEs */ + u8 *ssid; + u8 *supp_rates; + u8 *fh_params; + u8 *ds_params; + u8 *cf_params; + struct ieee80211_tim_ie *tim; + u8 *ibss_params; + u8 *challenge; + u8 *wpa; + u8 *rsn; + u8 *erp_info; + u8 *ext_supp_rates; + u8 *wmm_info; + u8 *wmm_param; + struct ieee80211_ht_cap *ht_cap_elem; + struct ieee80211_ht_info *ht_info_elem; + struct ieee80211_meshconf_ie *mesh_config; + u8 *mesh_id; + u8 *peer_link; + u8 *preq; + u8 *prep; + u8 *perr; + struct ieee80211_rann_ie *rann; + u8 *ch_switch_elem; + u8 *country_elem; + u8 *pwr_constr_elem; + u8 *quiet_elem; /* first quite element */ + u8 *timeout_int; + + /* length of them, 
respectively */ + u8 ssid_len; + u8 supp_rates_len; + u8 fh_params_len; + u8 ds_params_len; + u8 cf_params_len; + u8 tim_len; + u8 ibss_params_len; + u8 challenge_len; + u8 wpa_len; + u8 rsn_len; + u8 erp_info_len; + u8 ext_supp_rates_len; + u8 wmm_info_len; + u8 wmm_param_len; + u8 mesh_id_len; + u8 peer_link_len; + u8 preq_len; + u8 prep_len; + u8 perr_len; + u8 ch_switch_elem_len; + u8 country_elem_len; + u8 pwr_constr_elem_len; + u8 quiet_elem_len; + u8 num_of_quiet_elem; /* can be more the one */ + u8 timeout_int_len; +}; + +static inline struct ieee80211_local *hw_to_local( + struct ieee80211_hw *hw) +{ + return container_of(hw, struct ieee80211_local, hw); +} + +static inline struct ieee80211_hw *local_to_hw( + struct ieee80211_local *local) +{ + return &local->hw; +} + + +static inline int ieee80211_bssid_match(const u8 *raddr, const u8 *addr) +{ + return compare_ether_addr(raddr, addr) == 0 || + is_broadcast_ether_addr(raddr); +} + + +int ieee80211_hw_config(struct ieee80211_local *local, u32 changed); +void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx); +void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata, + u32 changed); +void ieee80211_configure_filter(struct ieee80211_local *local); +u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata); + +/* STA code */ +void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata); +int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, + struct cfg80211_auth_request *req); +int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, + struct cfg80211_assoc_request *req); +int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, + struct cfg80211_deauth_request *req, + void *cookie); +int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata, + struct cfg80211_disassoc_request *req, + void *cookie); +void ieee80211_send_pspoll(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata); +void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency); +int ieee80211_max_network_latency(struct notifier_block *nb, + unsigned long data, void *dummy); +int ieee80211_set_arp_filter(struct ieee80211_sub_if_data *sdata); +void ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, + struct ieee80211_channel_sw_ie *sw_elem, + struct ieee80211_bss *bss, + u64 timestamp); +void ieee80211_sta_quiesce(struct ieee80211_sub_if_data *sdata); +void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata); +void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata); +void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); +void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata); +void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata); + +/* IBSS code */ +void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local); +void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata); +struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, + u8 *bssid, u8 *addr, u32 supp_rates, + gfp_t gfp); +int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, + struct cfg80211_ibss_params *params); +int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata); +void ieee80211_ibss_quiesce(struct ieee80211_sub_if_data *sdata); +void ieee80211_ibss_restart(struct ieee80211_sub_if_data *sdata); +void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata); +void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); + 
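
The static inlines above (vif_to_sdata(), hw_to_local(), local_to_hw()) all recover the private container structure from an embedded public one via container_of(); because struct ieee80211_hw is deliberately kept as the first member of struct ieee80211_local, that particular conversion degenerates to a zero-offset cast. A minimal, self-contained user-space sketch of the idiom follows, using illustrative stand-in names (hw_pub, local_priv, pub_to_priv) rather than the mac80211 types, and a simplified container_of defined locally:

#include <stddef.h>
#include <stdio.h>

/* simplified container_of for this sketch (the kernel version adds type checks) */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct hw_pub {			/* stands in for the driver-visible struct */
	int channels;
};

struct local_priv {		/* stands in for the private container struct */
	struct hw_pub hw;	/* kept first, so the conversion is a no-op cast */
	int open_count;
};

static struct local_priv *pub_to_priv(struct hw_pub *hw)
{
	/* recover the containing private structure from the public pointer */
	return container_of(hw, struct local_priv, hw);
}

int main(void)
{
	struct local_priv local = { .hw = { .channels = 14 }, .open_count = 1 };
	struct hw_pub *hw = &local.hw;	/* what a caller holding only the public part sees */

	printf("open_count=%d\n", pub_to_priv(hw)->open_count);
	return 0;
}

The same container_of() technique works regardless of member position (vif is the last member of struct ieee80211_sub_if_data); keeping the public part first is only an optimization that makes the common hw-to-local conversion free.
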
+/* mesh code */ +void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata); +void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); + +/* scan/BSS handling */ +void ieee80211_scan_work(struct work_struct *work); +int ieee80211_request_internal_scan(struct ieee80211_sub_if_data *sdata, + const u8 *ssid, u8 ssid_len, + struct ieee80211_channel *chan); +int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, + struct cfg80211_scan_request *req); +void ieee80211_scan_cancel(struct ieee80211_local *local); +ieee80211_rx_result +ieee80211_scan_rx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); + +void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local); +struct ieee80211_bss * +ieee80211_bss_info_update(struct ieee80211_local *local, + struct ieee80211_rx_status *rx_status, + struct ieee80211_mgmt *mgmt, + size_t len, + struct ieee802_11_elems *elems, + struct ieee80211_channel *channel, + bool beacon); +struct ieee80211_bss * +ieee80211_rx_bss_get(struct ieee80211_local *local, u8 *bssid, int freq, + u8 *ssid, u8 ssid_len); +void ieee80211_rx_bss_put(struct ieee80211_local *local, + struct ieee80211_bss *bss); + +/* scheduled scan handling */ +int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, + struct cfg80211_sched_scan_request *req); +int ieee80211_request_sched_scan_stop(struct ieee80211_sub_if_data *sdata); +void ieee80211_sched_scan_stopped_work(struct work_struct *work); + +/* off-channel helpers */ +bool ieee80211_cfg_on_oper_channel(struct ieee80211_local *local); +void ieee80211_offchannel_enable_all_ps(struct ieee80211_local *local, + bool tell_ap); +void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local, + bool offchannel_ps_enable); +void ieee80211_offchannel_return(struct ieee80211_local *local, + bool enable_beaconing, + bool offchannel_ps_disable); +void ieee80211_hw_roc_setup(struct ieee80211_local *local); + +/* interface handling */ +int ieee80211_iface_init(void); +void ieee80211_iface_exit(void); +int ieee80211_if_add(struct ieee80211_local *local, const char *name, + struct net_device **new_dev, enum nl80211_iftype type, + struct vif_params *params); +int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata, + enum nl80211_iftype type); +void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata); +void ieee80211_remove_interfaces(struct ieee80211_local *local); +u32 __ieee80211_recalc_idle(struct ieee80211_local *local); +void ieee80211_recalc_idle(struct ieee80211_local *local); +void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata, + const int offset); + +static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata) +{ + return test_bit(SDATA_STATE_RUNNING, &sdata->state); +} + +/* tx handling */ +void ieee80211_clear_tx_pending(struct ieee80211_local *local); +void ieee80211_tx_pending(unsigned long data); +netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, + struct net_device *dev); +netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, + struct net_device *dev); + +/* + * radiotap header for status frames + */ +struct ieee80211_tx_status_rtap_hdr { + struct ieee80211_radiotap_header hdr; + u8 rate; + u8 padding_for_rate; + __le16 tx_flags; + u8 data_retries; +} __packed; + + +/* HT */ +void ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_supported_band *sband, + struct ieee80211_ht_cap *ht_cap_ie, + struct ieee80211_sta_ht_cap *ht_cap); +void ieee80211_send_bar(struct 
ieee80211_sub_if_data *sdata, u8 *ra, u16 tid, u16 ssn); +void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, + const u8 *da, u16 tid, + u16 initiator, u16 reason_code); +int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, + enum ieee80211_smps_mode smps, const u8 *da, + const u8 *bssid); +void ieee80211_request_smps_work(struct work_struct *work); + +void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, + u16 initiator, u16 reason, bool stop); +void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, + u16 initiator, u16 reason, bool stop); +void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, bool tx); +void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct ieee80211_mgmt *mgmt, size_t len); +void ieee80211_process_addba_resp(struct ieee80211_local *local, + struct sta_info *sta, + struct ieee80211_mgmt *mgmt, + size_t len); +void ieee80211_process_addba_request(struct ieee80211_local *local, + struct sta_info *sta, + struct ieee80211_mgmt *mgmt, + size_t len); + +int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, + enum ieee80211_back_parties initiator, + bool tx); +int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, + enum ieee80211_back_parties initiator, + bool tx); +void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid); +void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid); +void ieee80211_ba_session_work(struct work_struct *work); +void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid); +void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid); + +/* Spectrum management */ +void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + size_t len); + +/* Suspend/resume and hw reconfiguration */ +int ieee80211_reconfig(struct ieee80211_local *local); +void ieee80211_stop_device(struct ieee80211_local *local); + +#ifdef CONFIG_PM +int __ieee80211_suspend(struct ieee80211_hw *hw, + struct cfg80211_wowlan *wowlan); + +static inline int __ieee80211_resume(struct ieee80211_hw *hw) +{ + struct ieee80211_local *local = hw_to_local(hw); + + WARN(test_bit(SCAN_HW_SCANNING, &local->scanning), + "%s: resume with hardware scan still in progress\n", + wiphy_name(hw->wiphy)); + + return ieee80211_reconfig(hw_to_local(hw)); +} +#else +static inline int __ieee80211_suspend(struct ieee80211_hw *hw, + struct cfg80211_wowlan *wowlan) +{ + return 0; +} + +static inline int __ieee80211_resume(struct ieee80211_hw *hw) +{ + return 0; +} +#endif + +/* utility functions/constants */ +extern void *mac80211_wiphy_privid; /* for wiphy privid */ +u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, + enum nl80211_iftype type); +int ieee80211_frame_duration(struct ieee80211_local *local, size_t len, + int rate, int erp, int short_preamble); +void mac80211_ev_michael_mic_failure(struct ieee80211_sub_if_data *sdata, int keyidx, + struct ieee80211_hdr *hdr, const u8 *tsc, + gfp_t gfp); +void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata); +void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); +void ieee802_11_parse_elems(u8 *start, size_t len, + struct ieee802_11_elems *elems); +u32 ieee802_11_parse_elems_crc(u8 *start, size_t len, + struct ieee802_11_elems *elems, + u64 filter, u32 crc); +u32 ieee80211_mandatory_rates(struct ieee80211_local *local, + enum ieee80211_band band); + +void 
ieee80211_dynamic_ps_enable_work(struct work_struct *work); +void ieee80211_dynamic_ps_disable_work(struct work_struct *work); +void ieee80211_dynamic_ps_timer(unsigned long data); +void ieee80211_send_nullfunc(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + int powersave); +void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata, + struct ieee80211_hdr *hdr); +void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata, + struct ieee80211_hdr *hdr, bool ack); +void ieee80211_beacon_connection_loss_work(struct work_struct *work); + +void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw, + enum queue_stop_reason reason); +void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, + enum queue_stop_reason reason); +void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, + enum queue_stop_reason reason); +void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue, + enum queue_stop_reason reason); +void ieee80211_add_pending_skb(struct ieee80211_local *local, + struct sk_buff *skb); +int ieee80211_add_pending_skbs(struct ieee80211_local *local, + struct sk_buff_head *skbs); +int ieee80211_add_pending_skbs_fn(struct ieee80211_local *local, + struct sk_buff_head *skbs, + void (*fn)(void *data), void *data); + +void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, + u16 transaction, u16 auth_alg, + u8 *extra, size_t extra_len, const u8 *bssid, + const u8 *key, u8 key_len, u8 key_idx); +int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, + const u8 *ie, size_t ie_len, + enum ieee80211_band band, u32 rate_mask, + u8 channel); +struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, + u8 *dst, + const u8 *ssid, size_t ssid_len, + const u8 *ie, size_t ie_len); +void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, + const u8 *ssid, size_t ssid_len, + const u8 *ie, size_t ie_len); + +void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata, + const size_t supp_rates_len, + const u8 *supp_rates); +u32 ieee80211_sta_get_rates(struct ieee80211_local *local, + struct ieee802_11_elems *elems, + enum ieee80211_band band); +int __ieee80211_request_smps(struct ieee80211_sub_if_data *sdata, + enum ieee80211_smps_mode smps_mode); +void ieee80211_recalc_smps(struct ieee80211_local *local); + +size_t ieee80211_ie_split(const u8 *ies, size_t ielen, + const u8 *ids, int n_ids, size_t offset); +size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset); + +/* internal work items */ +void ieee80211_work_init(struct ieee80211_local *local); +void ieee80211_add_work(struct ieee80211_work *wk); +void free_work(struct ieee80211_work *wk); +void ieee80211_work_purge(struct ieee80211_sub_if_data *sdata); +ieee80211_rx_result ieee80211_work_rx_mgmt(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); +int ieee80211_wk_remain_on_channel(struct ieee80211_sub_if_data *sdata, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + unsigned int duration, u64 *cookie); +int ieee80211_wk_cancel_remain_on_channel( + struct ieee80211_sub_if_data *sdata, u64 cookie); + +/* channel management */ +enum ieee80211_chan_mode { + CHAN_MODE_UNDEFINED, + CHAN_MODE_HOPPING, + CHAN_MODE_FIXED, +}; + +enum ieee80211_chan_mode +ieee80211_get_channel_mode(struct ieee80211_local *local, + struct ieee80211_sub_if_data *ignore); +bool ieee80211_set_channel_type(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum 
nl80211_channel_type chantype); + +#ifdef CONFIG_MAC80211_NOINLINE +#define debug_noinline noinline +#else +#define debug_noinline +#endif + +#endif /* IEEE80211_I_H */ diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c new file mode 100644 index 00000000..65f3764c --- /dev/null +++ b/net/mac80211/iface.c @@ -0,0 +1,1417 @@ +/* + * Interface handling (except master interface) + * + * Copyright 2002-2005, Instant802 Networks, Inc. + * Copyright 2005-2006, Devicescape Software, Inc. + * Copyright (c) 2006 Jiri Benc + * Copyright 2008, Johannes Berg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include "ieee80211_i.h" +#include "sta_info.h" +#include "debugfs_netdev.h" +#include "mesh.h" +#include "led.h" +#include "driver-ops.h" +#include "wme.h" +#include "rate.h" + +/** + * DOC: Interface list locking + * + * The interface list in each struct ieee80211_local is protected + * three-fold: + * + * (1) modifications may only be done under the RTNL + * (2) modifications and readers are protected against each other by + * the iflist_mtx. + * (3) modifications are done in an RCU manner so atomic readers + * can traverse the list in RCU-safe blocks. + * + * As a consequence, reads (traversals) of the list can be protected + * by either the RTNL, the iflist_mtx or RCU. + */ + + +static int ieee80211_change_mtu(struct net_device *dev, int new_mtu) +{ + int meshhdrlen; + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + meshhdrlen = (sdata->vif.type == NL80211_IFTYPE_MESH_POINT) ? 5 : 0; + + /* FIX: what would be proper limits for MTU? + * This interface uses 802.3 frames. */ + if (new_mtu < 256 || + new_mtu > IEEE80211_MAX_DATA_LEN - 24 - 6 - meshhdrlen) { + return -EINVAL; + } + +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + printk(KERN_DEBUG "%s: setting MTU %d\n", dev->name, new_mtu); +#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ + dev->mtu = new_mtu; + return 0; +} + +static int ieee80211_change_mac(struct net_device *dev, void *addr) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct sockaddr *sa = addr; + int ret; + + if (ieee80211_sdata_running(sdata)) + return -EBUSY; + + ret = eth_mac_addr(dev, sa); + + if (ret == 0) + memcpy(sdata->vif.addr, sa->sa_data, ETH_ALEN); + + return ret; +} + +static inline int identical_mac_addr_allowed(int type1, int type2) +{ + return type1 == NL80211_IFTYPE_MONITOR || + type2 == NL80211_IFTYPE_MONITOR || + (type1 == NL80211_IFTYPE_AP && type2 == NL80211_IFTYPE_WDS) || + (type1 == NL80211_IFTYPE_WDS && + (type2 == NL80211_IFTYPE_WDS || + type2 == NL80211_IFTYPE_AP)) || + (type1 == NL80211_IFTYPE_AP && type2 == NL80211_IFTYPE_AP_VLAN) || + (type1 == NL80211_IFTYPE_AP_VLAN && + (type2 == NL80211_IFTYPE_AP || + type2 == NL80211_IFTYPE_AP_VLAN)); +} + +static int ieee80211_check_concurrent_iface(struct ieee80211_sub_if_data *sdata, + enum nl80211_iftype iftype) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_sub_if_data *nsdata; + struct net_device *dev = sdata->dev; + + ASSERT_RTNL(); + + /* we hold the RTNL here so can safely walk the list */ + list_for_each_entry(nsdata, &local->interfaces, list) { + struct net_device *ndev = nsdata->dev; + + if (ndev != dev && ieee80211_sdata_running(nsdata)) { + /* + * Allow only a single IBSS interface to be up at any + * time. 
This is restricted because beacon distribution + * cannot work properly if both are in the same IBSS. + * + * To remove this restriction we'd have to disallow them + * from setting the same SSID on different IBSS interfaces + * belonging to the same hardware. Then, however, we're + * faced with having to adopt two different TSF timers... + */ + if (iftype == NL80211_IFTYPE_ADHOC && + nsdata->vif.type == NL80211_IFTYPE_ADHOC) + return -EBUSY; + + /* + * The remaining checks are only performed for interfaces + * with the same MAC address. + */ + if (compare_ether_addr(dev->dev_addr, ndev->dev_addr)) + continue; + + /* + * check whether it may have the same address + */ + if (!identical_mac_addr_allowed(iftype, + nsdata->vif.type)) + return -ENOTUNIQ; + + /* + * can only add VLANs to enabled APs + */ + if (iftype == NL80211_IFTYPE_AP_VLAN && + nsdata->vif.type == NL80211_IFTYPE_AP) + sdata->bss = &nsdata->u.ap; + } + } + + return 0; +} + +void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata, + const int offset) +{ + struct ieee80211_local *local = sdata->local; + u32 flags = sdata->u.mntr_flags; + +#define ADJUST(_f, _s) do { \ + if (flags & MONITOR_FLAG_##_f) \ + local->fif_##_s += offset; \ + } while (0) + + ADJUST(FCSFAIL, fcsfail); + ADJUST(PLCPFAIL, plcpfail); + ADJUST(CONTROL, control); + ADJUST(CONTROL, pspoll); + ADJUST(OTHER_BSS, other_bss); + +#undef ADJUST +} + +/* + * NOTE: Be very careful when changing this function, it must NOT return + * an error on interface type changes that have been pre-checked, so most + * checks should be in ieee80211_check_concurrent_iface. + */ +static int ieee80211_do_open(struct net_device *dev, bool coming_up) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + u32 changed = 0; + int res; + u32 hw_reconf_flags = 0; + + switch (sdata->vif.type) { + case NL80211_IFTYPE_WDS: + if (!is_valid_ether_addr(sdata->u.wds.remote_addr)) + return -ENOLINK; + break; + case NL80211_IFTYPE_AP_VLAN: + if (!sdata->bss) + return -ENOLINK; + list_add(&sdata->u.vlan.list, &sdata->bss->vlans); + break; + case NL80211_IFTYPE_AP: + sdata->bss = &sdata->u.ap; + break; + case NL80211_IFTYPE_MESH_POINT: + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_MONITOR: + case NL80211_IFTYPE_ADHOC: + /* no special treatment */ + break; + case NL80211_IFTYPE_UNSPECIFIED: + case NUM_NL80211_IFTYPES: + case NL80211_IFTYPE_P2P_CLIENT: + case NL80211_IFTYPE_P2P_GO: + /* cannot happen */ + WARN_ON(1); + break; + } + + if (local->open_count == 0) { + res = drv_start(local); + if (res) + goto err_del_bss; + if (local->ops->napi_poll) + napi_enable(&local->napi); + /* we're brought up, everything changes */ + hw_reconf_flags = ~0; + ieee80211_led_radio(local, true); + ieee80211_mod_tpt_led_trig(local, + IEEE80211_TPT_LEDTRIG_FL_RADIO, 0); + } + + /* + * Copy the hopefully now-present MAC address to + * this interface, if it has the special null one. 
+ */ + if (is_zero_ether_addr(dev->dev_addr)) { + memcpy(dev->dev_addr, + local->hw.wiphy->perm_addr, + ETH_ALEN); + memcpy(dev->perm_addr, dev->dev_addr, ETH_ALEN); + + if (!is_valid_ether_addr(dev->dev_addr)) { + if (!local->open_count) + drv_stop(local); + return -EADDRNOTAVAIL; + } + } + + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP_VLAN: + /* no need to tell driver */ + break; + case NL80211_IFTYPE_MONITOR: + if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) { + local->cooked_mntrs++; + break; + } + + /* must be before the call to ieee80211_configure_filter */ + local->monitors++; + if (local->monitors == 1) { + local->hw.conf.flags |= IEEE80211_CONF_MONITOR; + hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR; + } + + ieee80211_adjust_monitor_flags(sdata, 1); + ieee80211_configure_filter(local); + + netif_carrier_on(dev); + break; + default: + if (coming_up) { + res = drv_add_interface(local, &sdata->vif); + if (res) + goto err_stop; + } + + if (sdata->vif.type == NL80211_IFTYPE_AP) { + local->fif_pspoll++; + local->fif_probe_req++; + + ieee80211_configure_filter(local); + } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { + local->fif_probe_req++; + } + + changed |= ieee80211_reset_erp_info(sdata); + ieee80211_bss_info_change_notify(sdata, changed); + + if (sdata->vif.type == NL80211_IFTYPE_STATION) + netif_carrier_off(dev); + else + netif_carrier_on(dev); + } + + set_bit(SDATA_STATE_RUNNING, &sdata->state); + + if (sdata->vif.type == NL80211_IFTYPE_WDS) { + /* Create STA entry for the WDS peer */ + sta = sta_info_alloc(sdata, sdata->u.wds.remote_addr, + GFP_KERNEL); + if (!sta) { + res = -ENOMEM; + goto err_del_interface; + } + + /* no locking required since STA is not live yet */ + sta->flags |= WLAN_STA_AUTHORIZED; + + res = sta_info_insert(sta); + if (res) { + /* STA has been freed */ + goto err_del_interface; + } + + rate_control_rate_init(sta); + } + + /* + * set_multicast_list will be invoked by the networking core + * which will check whether any increments here were done in + * error and sync them down to the hardware as filter flags. 
+ */ + if (sdata->flags & IEEE80211_SDATA_ALLMULTI) + atomic_inc(&local->iff_allmultis); + + if (sdata->flags & IEEE80211_SDATA_PROMISC) + atomic_inc(&local->iff_promiscs); + + mutex_lock(&local->mtx); + hw_reconf_flags |= __ieee80211_recalc_idle(local); + mutex_unlock(&local->mtx); + + if (coming_up) + local->open_count++; + + if (hw_reconf_flags) { + ieee80211_hw_config(local, hw_reconf_flags); + /* + * set default queue parameters so drivers don't + * need to initialise the hardware if the hardware + * doesn't start up with sane defaults + */ + ieee80211_set_wmm_default(sdata); + } + + ieee80211_recalc_ps(local, -1); + + netif_tx_start_all_queues(dev); + + return 0; + err_del_interface: + drv_remove_interface(local, &sdata->vif); + err_stop: + if (!local->open_count) + drv_stop(local); + err_del_bss: + sdata->bss = NULL; + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + list_del(&sdata->u.vlan.list); + clear_bit(SDATA_STATE_RUNNING, &sdata->state); + return res; +} + +static int ieee80211_open(struct net_device *dev) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + int err; + + /* fail early if user set an invalid address */ + if (!is_zero_ether_addr(dev->dev_addr) && + !is_valid_ether_addr(dev->dev_addr)) + return -EADDRNOTAVAIL; + + err = ieee80211_check_concurrent_iface(sdata, sdata->vif.type); + if (err) + return err; + + return ieee80211_do_open(dev, true); +} + +static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, + bool going_down) +{ + struct ieee80211_local *local = sdata->local; + unsigned long flags; + struct sk_buff *skb, *tmp; + u32 hw_reconf_flags = 0; + int i; + enum nl80211_channel_type orig_ct; + + clear_bit(SDATA_STATE_RUNNING, &sdata->state); + + if (local->scan_sdata == sdata) + ieee80211_scan_cancel(local); + + /* + * Stop TX on this interface first. + */ + netif_tx_stop_all_queues(sdata->dev); + + /* + * Purge work for this interface. + */ + ieee80211_work_purge(sdata); + + /* + * Remove all stations associated with this interface. + * + * This must be done before calling ops->remove_interface() + * because otherwise we can later invoke ops->sta_notify() + * whenever the STAs are removed, and that invalidates driver + * assumptions about always getting a vif pointer that is valid + * (because if we remove a STA after ops->remove_interface() + * the driver will have removed the vif info already!) + * + * This is relevant only in AP, WDS and mesh modes, since in + * all other modes we've already removed all stations when + * disconnecting etc. + */ + sta_info_flush(local, sdata); + + /* + * Don't count this interface for promisc/allmulti while it + * is down. dev_mc_unsync() will invoke set_multicast_list + * on the master interface which will sync these down to the + * hardware as filter flags. 
+ */ + if (sdata->flags & IEEE80211_SDATA_ALLMULTI) + atomic_dec(&local->iff_allmultis); + + if (sdata->flags & IEEE80211_SDATA_PROMISC) + atomic_dec(&local->iff_promiscs); + + if (sdata->vif.type == NL80211_IFTYPE_AP) { + local->fif_pspoll--; + local->fif_probe_req--; + } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { + local->fif_probe_req--; + } + + netif_addr_lock_bh(sdata->dev); + spin_lock_bh(&local->filter_lock); + __hw_addr_unsync(&local->mc_list, &sdata->dev->mc, + sdata->dev->addr_len); + spin_unlock_bh(&local->filter_lock); + netif_addr_unlock_bh(sdata->dev); + + ieee80211_configure_filter(local); + + del_timer_sync(&local->dynamic_ps_timer); + cancel_work_sync(&local->dynamic_ps_enable_work); + + /* APs need special treatment */ + if (sdata->vif.type == NL80211_IFTYPE_AP) { + struct ieee80211_sub_if_data *vlan, *tmpsdata; + struct beacon_data *old_beacon = + rtnl_dereference(sdata->u.ap.beacon); + + /* sdata_running will return false, so this will disable */ + ieee80211_bss_info_change_notify(sdata, + BSS_CHANGED_BEACON_ENABLED); + + /* remove beacon */ + rcu_assign_pointer(sdata->u.ap.beacon, NULL); + synchronize_rcu(); + kfree(old_beacon); + + /* free all potentially still buffered bcast frames */ + while ((skb = skb_dequeue(&sdata->u.ap.ps_bc_buf))) { + local->total_ps_buffered--; + dev_kfree_skb(skb); + } + + /* down all dependent devices, that is VLANs */ + list_for_each_entry_safe(vlan, tmpsdata, &sdata->u.ap.vlans, + u.vlan.list) + dev_close(vlan->dev); + WARN_ON(!list_empty(&sdata->u.ap.vlans)); + } + + if (going_down) + local->open_count--; + + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP_VLAN: + list_del(&sdata->u.vlan.list); + /* no need to tell driver */ + break; + case NL80211_IFTYPE_MONITOR: + if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) { + local->cooked_mntrs--; + break; + } + + local->monitors--; + if (local->monitors == 0) { + local->hw.conf.flags &= ~IEEE80211_CONF_MONITOR; + hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR; + } + + ieee80211_adjust_monitor_flags(sdata, -1); + ieee80211_configure_filter(local); + break; + default: + mutex_lock(&local->mtx); + if (local->hw_roc_dev == sdata->dev && + local->hw_roc_channel) { + /* ignore return value since this is racy */ + drv_cancel_remain_on_channel(local); + ieee80211_queue_work(&local->hw, &local->hw_roc_done); + } + mutex_unlock(&local->mtx); + + flush_work(&local->hw_roc_start); + flush_work(&local->hw_roc_done); + + flush_work(&sdata->work); + /* + * When we get here, the interface is marked down. + * Call synchronize_rcu() to wait for the RX path + * should it be using the interface and enqueuing + * frames at this very time on another CPU. + */ + synchronize_rcu(); + skb_queue_purge(&sdata->skb_queue); + + /* + * Disable beaconing here for mesh only, AP and IBSS + * are already taken care of. + */ + if (sdata->vif.type == NL80211_IFTYPE_MESH_POINT) + ieee80211_bss_info_change_notify(sdata, + BSS_CHANGED_BEACON_ENABLED); + + /* + * Free all remaining keys, there shouldn't be any, + * except maybe group keys in AP more or WDS? 
+ */ + ieee80211_free_keys(sdata); + + if (going_down) + drv_remove_interface(local, &sdata->vif); + } + + sdata->bss = NULL; + + mutex_lock(&local->mtx); + hw_reconf_flags |= __ieee80211_recalc_idle(local); + mutex_unlock(&local->mtx); + + ieee80211_recalc_ps(local, -1); + + if (local->open_count == 0) { + if (local->ops->napi_poll) + napi_disable(&local->napi); + ieee80211_clear_tx_pending(local); + ieee80211_stop_device(local); + + /* no reconfiguring after stop! */ + hw_reconf_flags = 0; + } + + /* Re-calculate channel-type, in case there are multiple vifs + * on different channel types. + */ + orig_ct = local->_oper_channel_type; + ieee80211_set_channel_type(local, NULL, NL80211_CHAN_NO_HT); + + /* do after stop to avoid reconfiguring when we stop anyway */ + if (hw_reconf_flags || (orig_ct != local->_oper_channel_type)) + ieee80211_hw_config(local, hw_reconf_flags); + + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + for (i = 0; i < IEEE80211_MAX_QUEUES; i++) { + skb_queue_walk_safe(&local->pending[i], skb, tmp) { + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + if (info->control.vif == &sdata->vif) { + __skb_unlink(skb, &local->pending[i]); + dev_kfree_skb_irq(skb); + } + } + } + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); +} + +static int ieee80211_stop(struct net_device *dev) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + ieee80211_do_stop(sdata, true); + + return 0; +} + +static void ieee80211_set_multicast_list(struct net_device *dev) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + int allmulti, promisc, sdata_allmulti, sdata_promisc; + + allmulti = !!(dev->flags & IFF_ALLMULTI); + promisc = !!(dev->flags & IFF_PROMISC); + sdata_allmulti = !!(sdata->flags & IEEE80211_SDATA_ALLMULTI); + sdata_promisc = !!(sdata->flags & IEEE80211_SDATA_PROMISC); + + if (allmulti != sdata_allmulti) { + if (dev->flags & IFF_ALLMULTI) + atomic_inc(&local->iff_allmultis); + else + atomic_dec(&local->iff_allmultis); + sdata->flags ^= IEEE80211_SDATA_ALLMULTI; + } + + if (promisc != sdata_promisc) { + if (dev->flags & IFF_PROMISC) + atomic_inc(&local->iff_promiscs); + else + atomic_dec(&local->iff_promiscs); + sdata->flags ^= IEEE80211_SDATA_PROMISC; + } + spin_lock_bh(&local->filter_lock); + __hw_addr_sync(&local->mc_list, &dev->mc, dev->addr_len); + spin_unlock_bh(&local->filter_lock); + ieee80211_queue_work(&local->hw, &local->reconfig_filter); +} + +/* + * Called when the netdev is removed or, by the code below, before + * the interface type changes. 
+ */ +static void ieee80211_teardown_sdata(struct net_device *dev) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + int flushed; + int i; + + /* free extra data */ + ieee80211_free_keys(sdata); + + ieee80211_debugfs_remove_netdev(sdata); + + for (i = 0; i < IEEE80211_FRAGMENT_MAX; i++) + __skb_queue_purge(&sdata->fragments[i].skb_list); + sdata->fragment_next = 0; + + if (ieee80211_vif_is_mesh(&sdata->vif)) + mesh_rmc_free(sdata); + + flushed = sta_info_flush(local, sdata); + WARN_ON(flushed); +} + +static u16 ieee80211_netdev_select_queue(struct net_device *dev, + struct sk_buff *skb) +{ + return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb); +} + +static const struct net_device_ops ieee80211_dataif_ops = { + .ndo_open = ieee80211_open, + .ndo_stop = ieee80211_stop, + .ndo_uninit = ieee80211_teardown_sdata, + .ndo_start_xmit = ieee80211_subif_start_xmit, + .ndo_set_multicast_list = ieee80211_set_multicast_list, + .ndo_change_mtu = ieee80211_change_mtu, + .ndo_set_mac_address = ieee80211_change_mac, + .ndo_select_queue = ieee80211_netdev_select_queue, +}; + +static u16 ieee80211_monitor_select_queue(struct net_device *dev, + struct sk_buff *skb) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + struct ieee80211_hdr *hdr; + struct ieee80211_radiotap_header *rtap = (void *)skb->data; + u8 *p; + + if (local->hw.queues < 4) + return 0; + + if (skb->len < 4 || + skb->len < le16_to_cpu(rtap->it_len) + 2 /* frame control */) + return 0; /* doesn't matter, frame will be dropped */ + + hdr = (void *)((u8 *)skb->data + le16_to_cpu(rtap->it_len)); + + if (!ieee80211_is_data(hdr->frame_control)) { + skb->priority = 7; + return ieee802_1d_to_ac[skb->priority]; + } + if (!ieee80211_is_data_qos(hdr->frame_control)) { + skb->priority = 0; + return ieee802_1d_to_ac[skb->priority]; + } + + p = ieee80211_get_qos_ctl(hdr); + skb->priority = *p & IEEE80211_QOS_CTL_TAG1D_MASK; + + return ieee80211_downgrade_queue(local, skb); +} + +static const struct net_device_ops ieee80211_monitorif_ops = { + .ndo_open = ieee80211_open, + .ndo_stop = ieee80211_stop, + .ndo_uninit = ieee80211_teardown_sdata, + .ndo_start_xmit = ieee80211_monitor_start_xmit, + .ndo_set_multicast_list = ieee80211_set_multicast_list, + .ndo_change_mtu = ieee80211_change_mtu, + .ndo_set_mac_address = eth_mac_addr, + .ndo_select_queue = ieee80211_monitor_select_queue, +}; + +static void ieee80211_if_setup(struct net_device *dev) +{ + ether_setup(dev); + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->netdev_ops = &ieee80211_dataif_ops; + dev->destructor = free_netdev; +} + +static void ieee80211_iface_work(struct work_struct *work) +{ + struct ieee80211_sub_if_data *sdata = + container_of(work, struct ieee80211_sub_if_data, work); + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct sta_info *sta; + struct ieee80211_ra_tid *ra_tid; + + if (!ieee80211_sdata_running(sdata)) + return; + + if (local->scanning) + return; + + /* + * ieee80211_queue_work() should have picked up most cases, + * here we'll pick the rest. 
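ieee80211_monitor_select_queue() above maps an 802.1D user priority (0-7) onto one of four access categories via ieee802_1d_to_ac[], forcing management frames to priority 7 and non-QoS data to 0. A stand-alone sketch of that lookup; the AC numbering and table values below follow the conventional 802.1D user-priority-to-AC mapping and are written here as assumptions, not copied from this patch:

#include <stdio.h>

/* queue numbering assumed here: 0 = voice ... 3 = background */
enum ac { AC_VO = 0, AC_VI = 1, AC_BE = 2, AC_BK = 3 };

/* conventional 802.1D user-priority -> access-category table */
static const enum ac up_to_ac[8] = {
        AC_BE,  /* UP 0: best effort */
        AC_BK,  /* UP 1: background  */
        AC_BK,  /* UP 2: background  */
        AC_BE,  /* UP 3: best effort */
        AC_VI,  /* UP 4: video       */
        AC_VI,  /* UP 5: video       */
        AC_VO,  /* UP 6: voice       */
        AC_VO,  /* UP 7: voice       */
};

static enum ac select_queue(int is_mgmt, int is_qos_data, int tid)
{
        if (is_mgmt)
                return up_to_ac[7];     /* management: highest priority */
        if (!is_qos_data)
                return up_to_ac[0];     /* non-QoS data: best effort */
        return up_to_ac[tid & 0x7];     /* QoS data: low bits of the TID */
}

int main(void)
{
        printf("mgmt -> AC %d, TID 5 -> AC %d\n",
               select_queue(1, 0, 0), select_queue(0, 1, 5));
        return 0;
}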
+ */ + if (WARN(local->suspended, + "interface work scheduled while going to suspend\n")) + return; + + /* first process frames */ + while ((skb = skb_dequeue(&sdata->skb_queue))) { + struct ieee80211_mgmt *mgmt = (void *)skb->data; + + if (skb->pkt_type == IEEE80211_SDATA_QUEUE_AGG_START) { + ra_tid = (void *)&skb->cb; + ieee80211_start_tx_ba_cb(&sdata->vif, ra_tid->ra, + ra_tid->tid); + } else if (skb->pkt_type == IEEE80211_SDATA_QUEUE_AGG_STOP) { + ra_tid = (void *)&skb->cb; + ieee80211_stop_tx_ba_cb(&sdata->vif, ra_tid->ra, + ra_tid->tid); + } else if (ieee80211_is_action(mgmt->frame_control) && + mgmt->u.action.category == WLAN_CATEGORY_BACK) { + int len = skb->len; + + mutex_lock(&local->sta_mtx); + sta = sta_info_get_bss(sdata, mgmt->sa); + if (sta) { + switch (mgmt->u.action.u.addba_req.action_code) { + case WLAN_ACTION_ADDBA_REQ: + ieee80211_process_addba_request( + local, sta, mgmt, len); + break; + case WLAN_ACTION_ADDBA_RESP: + ieee80211_process_addba_resp(local, sta, + mgmt, len); + break; + case WLAN_ACTION_DELBA: + ieee80211_process_delba(sdata, sta, + mgmt, len); + break; + default: + WARN_ON(1); + break; + } + } + mutex_unlock(&local->sta_mtx); + } else if (ieee80211_is_data_qos(mgmt->frame_control)) { + struct ieee80211_hdr *hdr = (void *)mgmt; + /* + * So the frame isn't mgmt, but frame_control + * is at the right place anyway, of course, so + * the if statement is correct. + * + * Warn if we have other data frame types here, + * they must not get here. + */ + WARN_ON(hdr->frame_control & + cpu_to_le16(IEEE80211_STYPE_NULLFUNC)); + WARN_ON(!(hdr->seq_ctrl & + cpu_to_le16(IEEE80211_SCTL_FRAG))); + /* + * This was a fragment of a frame, received while + * a block-ack session was active. That cannot be + * right, so terminate the session. + */ + mutex_lock(&local->sta_mtx); + sta = sta_info_get_bss(sdata, mgmt->sa); + if (sta) { + u16 tid = *ieee80211_get_qos_ctl(hdr) & + IEEE80211_QOS_CTL_TID_MASK; + + __ieee80211_stop_rx_ba_session( + sta, tid, WLAN_BACK_RECIPIENT, + WLAN_REASON_QSTA_REQUIRE_SETUP, + true); + } + mutex_unlock(&local->sta_mtx); + } else switch (sdata->vif.type) { + case NL80211_IFTYPE_STATION: + ieee80211_sta_rx_queued_mgmt(sdata, skb); + break; + case NL80211_IFTYPE_ADHOC: + ieee80211_ibss_rx_queued_mgmt(sdata, skb); + break; + case NL80211_IFTYPE_MESH_POINT: + if (!ieee80211_vif_is_mesh(&sdata->vif)) + break; + ieee80211_mesh_rx_queued_mgmt(sdata, skb); + break; + default: + WARN(1, "frame for unexpected interface type"); + break; + } + + kfree_skb(skb); + } + + /* then other type-dependent work */ + switch (sdata->vif.type) { + case NL80211_IFTYPE_STATION: + ieee80211_sta_work(sdata); + break; + case NL80211_IFTYPE_ADHOC: + ieee80211_ibss_work(sdata); + break; + case NL80211_IFTYPE_MESH_POINT: + if (!ieee80211_vif_is_mesh(&sdata->vif)) + break; + ieee80211_mesh_work(sdata); + break; + default: + break; + } +} + + +/* + * Helper function to initialise an interface to a specific type. 
+ */ +static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, + enum nl80211_iftype type) +{ + /* clear type-dependent union */ + memset(&sdata->u, 0, sizeof(sdata->u)); + + /* and set some type-dependent values */ + sdata->vif.type = type; + sdata->vif.p2p = false; + sdata->dev->netdev_ops = &ieee80211_dataif_ops; + sdata->wdev.iftype = type; + + sdata->control_port_protocol = cpu_to_be16(ETH_P_PAE); + sdata->control_port_no_encrypt = false; + + /* only monitor differs */ + sdata->dev->type = ARPHRD_ETHER; + + skb_queue_head_init(&sdata->skb_queue); + INIT_WORK(&sdata->work, ieee80211_iface_work); + + switch (type) { + case NL80211_IFTYPE_P2P_GO: + type = NL80211_IFTYPE_AP; + sdata->vif.type = type; + sdata->vif.p2p = true; + /* fall through */ + case NL80211_IFTYPE_AP: + skb_queue_head_init(&sdata->u.ap.ps_bc_buf); + INIT_LIST_HEAD(&sdata->u.ap.vlans); + break; + case NL80211_IFTYPE_P2P_CLIENT: + type = NL80211_IFTYPE_STATION; + sdata->vif.type = type; + sdata->vif.p2p = true; + /* fall through */ + case NL80211_IFTYPE_STATION: + ieee80211_sta_setup_sdata(sdata); + break; + case NL80211_IFTYPE_ADHOC: + ieee80211_ibss_setup_sdata(sdata); + break; + case NL80211_IFTYPE_MESH_POINT: + if (ieee80211_vif_is_mesh(&sdata->vif)) + ieee80211_mesh_init_sdata(sdata); + break; + case NL80211_IFTYPE_MONITOR: + sdata->dev->type = ARPHRD_IEEE80211_RADIOTAP; + sdata->dev->netdev_ops = &ieee80211_monitorif_ops; + sdata->u.mntr_flags = MONITOR_FLAG_CONTROL | + MONITOR_FLAG_OTHER_BSS; + break; + case NL80211_IFTYPE_WDS: + case NL80211_IFTYPE_AP_VLAN: + break; + case NL80211_IFTYPE_UNSPECIFIED: + case NUM_NL80211_IFTYPES: + BUG(); + break; + } + + ieee80211_debugfs_add_netdev(sdata); +} + +static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata, + enum nl80211_iftype type) +{ + struct ieee80211_local *local = sdata->local; + int ret, err; + enum nl80211_iftype internal_type = type; + bool p2p = false; + + ASSERT_RTNL(); + + if (!local->ops->change_interface) + return -EBUSY; + + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_ADHOC: + /* + * Could maybe also all others here? + * Just not sure how that interacts + * with the RX/config path e.g. for + * mesh. + */ + break; + default: + return -EBUSY; + } + + switch (type) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_ADHOC: + /* + * Could probably support everything + * but WDS here (WDS do_open can fail + * under memory pressure, which this + * code isn't prepared to handle). + */ + break; + case NL80211_IFTYPE_P2P_CLIENT: + p2p = true; + internal_type = NL80211_IFTYPE_STATION; + break; + case NL80211_IFTYPE_P2P_GO: + p2p = true; + internal_type = NL80211_IFTYPE_AP; + break; + default: + return -EBUSY; + } + + ret = ieee80211_check_concurrent_iface(sdata, internal_type); + if (ret) + return ret; + + ieee80211_do_stop(sdata, false); + + ieee80211_teardown_sdata(sdata->dev); + + ret = drv_change_interface(local, sdata, internal_type, p2p); + if (ret) + type = sdata->vif.type; + + ieee80211_setup_sdata(sdata, type); + + err = ieee80211_do_open(sdata->dev, false); + WARN(err, "type change: do_open returned %d", err); + + return ret; +} + +int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata, + enum nl80211_iftype type) +{ + int ret; + + ASSERT_RTNL(); + + if (type == ieee80211_vif_type_p2p(&sdata->vif)) + return 0; + + /* Setting ad-hoc mode on non-IBSS channel is not supported. 
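Both ieee80211_setup_sdata() and ieee80211_runtime_change_iftype() above fold the two P2P interface types onto existing internals: a P2P-GO runs as an AP and a P2P-client as a station, with only vif.p2p recording the difference. A small stand-alone sketch of that mapping (the enum and struct names are local to the example):

#include <stdbool.h>
#include <stdio.h>

enum iftype { IF_STATION, IF_AP, IF_ADHOC, IF_P2P_CLIENT, IF_P2P_GO };

struct vif_cfg {
        enum iftype type;       /* internal type the rest of the code sees */
        bool p2p;               /* remembers the externally requested flavour */
};

static struct vif_cfg map_iftype(enum iftype requested)
{
        struct vif_cfg cfg = { requested, false };

        switch (requested) {
        case IF_P2P_GO:                 /* behaves like an AP internally */
                cfg.type = IF_AP;
                cfg.p2p = true;
                break;
        case IF_P2P_CLIENT:             /* behaves like a station internally */
                cfg.type = IF_STATION;
                cfg.p2p = true;
                break;
        default:
                break;                  /* all other types map to themselves */
        }
        return cfg;
}

int main(void)
{
        struct vif_cfg go = map_iftype(IF_P2P_GO);

        printf("P2P-GO -> internal type %d, p2p=%d\n", go.type, go.p2p);
        return 0;
}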
*/ + if (sdata->local->oper_channel->flags & IEEE80211_CHAN_NO_IBSS && + type == NL80211_IFTYPE_ADHOC) + return -EOPNOTSUPP; + + if (ieee80211_sdata_running(sdata)) { + ret = ieee80211_runtime_change_iftype(sdata, type); + if (ret) + return ret; + } else { + /* Purge and reset type-dependent state. */ + ieee80211_teardown_sdata(sdata->dev); + ieee80211_setup_sdata(sdata, type); + } + + /* reset some values that shouldn't be kept across type changes */ + sdata->vif.bss_conf.basic_rates = + ieee80211_mandatory_rates(sdata->local, + sdata->local->hw.conf.channel->band); + sdata->drop_unencrypted = 0; + if (type == NL80211_IFTYPE_STATION) + sdata->u.mgd.use_4addr = false; + + return 0; +} + +static void ieee80211_assign_perm_addr(struct ieee80211_local *local, + struct net_device *dev, + enum nl80211_iftype type) +{ + struct ieee80211_sub_if_data *sdata; + u64 mask, start, addr, val, inc; + u8 *m; + u8 tmp_addr[ETH_ALEN]; + int i; + + /* default ... something at least */ + memcpy(dev->perm_addr, local->hw.wiphy->perm_addr, ETH_ALEN); + + if (is_zero_ether_addr(local->hw.wiphy->addr_mask) && + local->hw.wiphy->n_addresses <= 1) + return; + + + mutex_lock(&local->iflist_mtx); + + switch (type) { + case NL80211_IFTYPE_MONITOR: + /* doesn't matter */ + break; + case NL80211_IFTYPE_WDS: + case NL80211_IFTYPE_AP_VLAN: + /* match up with an AP interface */ + list_for_each_entry(sdata, &local->interfaces, list) { + if (sdata->vif.type != NL80211_IFTYPE_AP) + continue; + memcpy(dev->perm_addr, sdata->vif.addr, ETH_ALEN); + break; + } + /* keep default if no AP interface present */ + break; + default: + /* assign a new address if possible -- try n_addresses first */ + for (i = 0; i < local->hw.wiphy->n_addresses; i++) { + bool used = false; + + list_for_each_entry(sdata, &local->interfaces, list) { + if (memcmp(local->hw.wiphy->addresses[i].addr, + sdata->vif.addr, ETH_ALEN) == 0) { + used = true; + break; + } + } + + if (!used) { + memcpy(dev->perm_addr, + local->hw.wiphy->addresses[i].addr, + ETH_ALEN); + break; + } + } + + /* try mask if available */ + if (is_zero_ether_addr(local->hw.wiphy->addr_mask)) + break; + + m = local->hw.wiphy->addr_mask; + mask = ((u64)m[0] << 5*8) | ((u64)m[1] << 4*8) | + ((u64)m[2] << 3*8) | ((u64)m[3] << 2*8) | + ((u64)m[4] << 1*8) | ((u64)m[5] << 0*8); + + if (__ffs64(mask) + hweight64(mask) != fls64(mask)) { + /* not a contiguous mask ... not handled now! 
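ieee80211_assign_perm_addr() above packs the 6-byte address mask into a u64 and accepts it only when the writable bits are contiguous: __ffs64(mask) + hweight64(mask) == fls64(mask) holds exactly when all set bits form a single run. The loop that follows then steps through candidate addresses by repeatedly adding the lowest writable bit. A stand-alone illustration of both steps, using compiler builtins in place of the kernel helpers:

#include <inttypes.h>
#include <stdbool.h>
#include <stdio.h>

/* pack a 6-byte MAC address into the low 48 bits of a u64, addr[0] highest */
static uint64_t mac_to_u64(const uint8_t a[6])
{
        uint64_t v = 0;

        for (int i = 0; i < 6; i++)
                v = (v << 8) | a[i];
        return v;
}

/* all set bits form one contiguous run, mirroring the kernel test */
static bool mask_is_contiguous(uint64_t mask)
{
        if (!mask)
                return false;
        return __builtin_ctzll(mask) + __builtin_popcountll(mask) ==
               64 - __builtin_clzll(mask);
}

int main(void)
{
        const uint8_t perm[6]  = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00 };
        const uint8_t maskb[6] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f };
        uint64_t start = mac_to_u64(perm), mask = mac_to_u64(maskb);
        uint64_t inc = 1ULL << __builtin_ctzll(mask);

        printf("contiguous: %d\n", mask_is_contiguous(mask));

        /* walk the candidates the same way the loop above does */
        for (uint64_t val = start & mask, n = 0; n < 4; n++) {
                uint64_t addr = (start & ~mask) | (val & mask);

                printf("candidate %" PRIu64 ": %012" PRIx64 "\n", n, addr);
                val += inc;
        }
        return 0;
}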
*/ + printk(KERN_DEBUG "not contiguous\n"); + break; + } + + m = local->hw.wiphy->perm_addr; + start = ((u64)m[0] << 5*8) | ((u64)m[1] << 4*8) | + ((u64)m[2] << 3*8) | ((u64)m[3] << 2*8) | + ((u64)m[4] << 1*8) | ((u64)m[5] << 0*8); + + inc = 1ULL<<__ffs64(mask); + val = (start & mask); + addr = (start & ~mask) | (val & mask); + do { + bool used = false; + + tmp_addr[5] = addr >> 0*8; + tmp_addr[4] = addr >> 1*8; + tmp_addr[3] = addr >> 2*8; + tmp_addr[2] = addr >> 3*8; + tmp_addr[1] = addr >> 4*8; + tmp_addr[0] = addr >> 5*8; + + val += inc; + + list_for_each_entry(sdata, &local->interfaces, list) { + if (memcmp(tmp_addr, sdata->vif.addr, + ETH_ALEN) == 0) { + used = true; + break; + } + } + + if (!used) { + memcpy(dev->perm_addr, tmp_addr, ETH_ALEN); + break; + } + addr = (start & ~mask) | (val & mask); + } while (addr != start); + + break; + } + + mutex_unlock(&local->iflist_mtx); +} + +int ieee80211_if_add(struct ieee80211_local *local, const char *name, + struct net_device **new_dev, enum nl80211_iftype type, + struct vif_params *params) +{ + struct net_device *ndev; + struct ieee80211_sub_if_data *sdata = NULL; + int ret, i; + + ASSERT_RTNL(); + + ndev = alloc_netdev_mq(sizeof(*sdata) + local->hw.vif_data_size, + name, ieee80211_if_setup, local->hw.queues); + if (!ndev) + return -ENOMEM; + dev_net_set(ndev, wiphy_net(local->hw.wiphy)); + + ndev->needed_headroom = local->tx_headroom + + 4*6 /* four MAC addresses */ + + 2 + 2 + 2 + 2 /* ctl, dur, seq, qos */ + + 6 /* mesh */ + + 8 /* rfc1042/bridge tunnel */ + - ETH_HLEN /* ethernet hard_header_len */ + + IEEE80211_ENCRYPT_HEADROOM; + ndev->needed_tailroom = IEEE80211_ENCRYPT_TAILROOM; + + ret = dev_alloc_name(ndev, ndev->name); + if (ret < 0) + goto fail; + + ieee80211_assign_perm_addr(local, ndev, type); + memcpy(ndev->dev_addr, ndev->perm_addr, ETH_ALEN); + SET_NETDEV_DEV(ndev, wiphy_dev(local->hw.wiphy)); + + /* don't use IEEE80211_DEV_TO_SUB_IF because it checks too much */ + sdata = netdev_priv(ndev); + ndev->ieee80211_ptr = &sdata->wdev; + memcpy(sdata->vif.addr, ndev->dev_addr, ETH_ALEN); + memcpy(sdata->name, ndev->name, IFNAMSIZ); + + /* initialise type-independent data */ + sdata->wdev.wiphy = local->hw.wiphy; + sdata->local = local; + sdata->dev = ndev; +#ifdef CONFIG_INET + sdata->arp_filter_state = true; +#endif + + for (i = 0; i < IEEE80211_FRAGMENT_MAX; i++) + skb_queue_head_init(&sdata->fragments[i].skb_list); + + INIT_LIST_HEAD(&sdata->key_list); + + for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + struct ieee80211_supported_band *sband; + sband = local->hw.wiphy->bands[i]; + sdata->rc_rateidx_mask[i] = + sband ? 
(1 << sband->n_bitrates) - 1 : 0; + } + + /* setup type-dependent data */ + ieee80211_setup_sdata(sdata, type); + + if (params) { + ndev->ieee80211_ptr->use_4addr = params->use_4addr; + if (type == NL80211_IFTYPE_STATION) + sdata->u.mgd.use_4addr = params->use_4addr; + } + + ret = register_netdevice(ndev); + if (ret) + goto fail; + + mutex_lock(&local->iflist_mtx); + list_add_tail_rcu(&sdata->list, &local->interfaces); + mutex_unlock(&local->iflist_mtx); + + if (new_dev) + *new_dev = ndev; + + return 0; + + fail: + free_netdev(ndev); + return ret; +} + +void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata) +{ + ASSERT_RTNL(); + + mutex_lock(&sdata->local->iflist_mtx); + list_del_rcu(&sdata->list); + mutex_unlock(&sdata->local->iflist_mtx); + + synchronize_rcu(); + unregister_netdevice(sdata->dev); +} + +/* + * Remove all interfaces, may only be called at hardware unregistration + * time because it doesn't do RCU-safe list removals. + */ +void ieee80211_remove_interfaces(struct ieee80211_local *local) +{ + struct ieee80211_sub_if_data *sdata, *tmp; + LIST_HEAD(unreg_list); + + ASSERT_RTNL(); + + mutex_lock(&local->iflist_mtx); + list_for_each_entry_safe(sdata, tmp, &local->interfaces, list) { + list_del(&sdata->list); + + unregister_netdevice_queue(sdata->dev, &unreg_list); + } + mutex_unlock(&local->iflist_mtx); + unregister_netdevice_many(&unreg_list); + list_del(&unreg_list); +} + +static u32 ieee80211_idle_off(struct ieee80211_local *local, + const char *reason) +{ + if (!(local->hw.conf.flags & IEEE80211_CONF_IDLE)) + return 0; + +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + wiphy_debug(local->hw.wiphy, "device no longer idle - %s\n", reason); +#endif + + local->hw.conf.flags &= ~IEEE80211_CONF_IDLE; + return IEEE80211_CONF_CHANGE_IDLE; +} + +static u32 ieee80211_idle_on(struct ieee80211_local *local) +{ + if (local->hw.conf.flags & IEEE80211_CONF_IDLE) + return 0; + +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + wiphy_debug(local->hw.wiphy, "device now idle\n"); +#endif + + drv_flush(local, false); + + local->hw.conf.flags |= IEEE80211_CONF_IDLE; + return IEEE80211_CONF_CHANGE_IDLE; +} + +u32 __ieee80211_recalc_idle(struct ieee80211_local *local) +{ + struct ieee80211_sub_if_data *sdata; + int count = 0; + bool working = false, scanning = false, hw_roc = false; + struct ieee80211_work *wk; + unsigned int led_trig_start = 0, led_trig_stop = 0; + +#ifdef CONFIG_PROVE_LOCKING + WARN_ON(debug_locks && !lockdep_rtnl_is_held() && + !lockdep_is_held(&local->iflist_mtx)); +#endif + lockdep_assert_held(&local->mtx); + + list_for_each_entry(sdata, &local->interfaces, list) { + if (!ieee80211_sdata_running(sdata)) { + sdata->vif.bss_conf.idle = true; + continue; + } + + sdata->old_idle = sdata->vif.bss_conf.idle; + + /* do not count disabled managed interfaces */ + if (sdata->vif.type == NL80211_IFTYPE_STATION && + !sdata->u.mgd.associated) { + sdata->vif.bss_conf.idle = true; + continue; + } + /* do not count unused IBSS interfaces */ + if (sdata->vif.type == NL80211_IFTYPE_ADHOC && + !sdata->u.ibss.ssid_len) { + sdata->vif.bss_conf.idle = true; + continue; + } + /* count everything else */ + count++; + } + + list_for_each_entry(wk, &local->work_list, list) { + working = true; + wk->sdata->vif.bss_conf.idle = false; + } + + if (local->scan_sdata) { + scanning = true; + local->scan_sdata->vif.bss_conf.idle = false; + } + + if (local->hw_roc_channel) + hw_roc = true; + + list_for_each_entry(sdata, &local->interfaces, list) { + if (sdata->old_idle == sdata->vif.bss_conf.idle) + continue; + if 
(!ieee80211_sdata_running(sdata)) + continue; + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_IDLE); + } + + if (working || scanning || hw_roc) + led_trig_start |= IEEE80211_TPT_LEDTRIG_FL_WORK; + else + led_trig_stop |= IEEE80211_TPT_LEDTRIG_FL_WORK; + + if (count) + led_trig_start |= IEEE80211_TPT_LEDTRIG_FL_CONNECTED; + else + led_trig_stop |= IEEE80211_TPT_LEDTRIG_FL_CONNECTED; + + ieee80211_mod_tpt_led_trig(local, led_trig_start, led_trig_stop); + + if (hw_roc) + return ieee80211_idle_off(local, "hw remain-on-channel"); + if (working) + return ieee80211_idle_off(local, "working"); + if (scanning) + return ieee80211_idle_off(local, "scanning"); + if (!count) + return ieee80211_idle_on(local); + else + return ieee80211_idle_off(local, "in use"); + + return 0; +} + +void ieee80211_recalc_idle(struct ieee80211_local *local) +{ + u32 chg; + + mutex_lock(&local->iflist_mtx); + chg = __ieee80211_recalc_idle(local); + mutex_unlock(&local->iflist_mtx); + if (chg) + ieee80211_hw_config(local, chg); +} + +static int netdev_notify(struct notifier_block *nb, + unsigned long state, + void *ndev) +{ + struct net_device *dev = ndev; + struct ieee80211_sub_if_data *sdata; + + if (state != NETDEV_CHANGENAME) + return 0; + + if (!dev->ieee80211_ptr || !dev->ieee80211_ptr->wiphy) + return 0; + + if (dev->ieee80211_ptr->wiphy->privid != mac80211_wiphy_privid) + return 0; + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + memcpy(sdata->name, dev->name, IFNAMSIZ); + + ieee80211_debugfs_rename_netdev(sdata); + return 0; +} + +static struct notifier_block mac80211_netdev_notifier = { + .notifier_call = netdev_notify, +}; + +int ieee80211_iface_init(void) +{ + return register_netdevice_notifier(&mac80211_netdev_notifier); +} + +void ieee80211_iface_exit(void) +{ + unregister_netdevice_notifier(&mac80211_netdev_notifier); +} diff --git a/net/mac80211/key.c b/net/mac80211/key.c new file mode 100644 index 00000000..f825e2f0 --- /dev/null +++ b/net/mac80211/key.c @@ -0,0 +1,535 @@ +/* + * Copyright 2002-2005, Instant802 Networks, Inc. + * Copyright 2005-2006, Devicescape Software, Inc. + * Copyright 2006-2007 Jiri Benc + * Copyright 2007-2008 Johannes Berg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "ieee80211_i.h" +#include "driver-ops.h" +#include "debugfs_key.h" +#include "aes_ccm.h" +#include "aes_cmac.h" + + +/** + * DOC: Key handling basics + * + * Key handling in mac80211 is done based on per-interface (sub_if_data) + * keys and per-station keys. Since each station belongs to an interface, + * each station key also belongs to that interface. + * + * Hardware acceleration is done on a best-effort basis for algorithms + * that are implemented in software, for each key the hardware is asked + * to enable that key for offloading but if it cannot do that the key is + * simply kept for software encryption (unless it is for an algorithm + * that isn't implemented in software). + * There is currently no way of knowing whether a key is handled in SW + * or HW except by looking into debugfs. + * + * All key management is internally protected by a mutex. Within all + * other parts of mac80211, key references are, just as STA structure + * references, protected by RCU. 
Note, however, that some things are + * unprotected, namely the key->sta dereferences within the hardware + * acceleration functions. This means that sta_info_destroy() must + * remove the key which waits for an RCU grace period. + */ + +static const u8 bcast_addr[ETH_ALEN] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; + +static void assert_key_lock(struct ieee80211_local *local) +{ + lockdep_assert_held(&local->key_mtx); +} + +static struct ieee80211_sta *get_sta_for_key(struct ieee80211_key *key) +{ + if (key->sta) + return &key->sta->sta; + + return NULL; +} + +static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) +{ + struct ieee80211_sub_if_data *sdata; + struct ieee80211_sta *sta; + int ret; + + might_sleep(); + + if (!key->local->ops->set_key) + goto out_unsupported; + + assert_key_lock(key->local); + + sta = get_sta_for_key(key); + + /* + * If this is a per-STA GTK, check if it + * is supported; if not, return. + */ + if (sta && !(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE) && + !(key->local->hw.flags & IEEE80211_HW_SUPPORTS_PER_STA_GTK)) + goto out_unsupported; + + sdata = key->sdata; + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { + /* + * The driver doesn't know anything about VLAN interfaces. + * Hence, don't send GTKs for VLAN interfaces to the driver. + */ + if (!(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE)) + goto out_unsupported; + sdata = container_of(sdata->bss, + struct ieee80211_sub_if_data, + u.ap); + } + + ret = drv_set_key(key->local, SET_KEY, sdata, sta, &key->conf); + + if (!ret) { + key->flags |= KEY_FLAG_UPLOADED_TO_HARDWARE; + return 0; + } + + if (ret != -ENOSPC && ret != -EOPNOTSUPP) + wiphy_err(key->local->hw.wiphy, + "failed to set key (%d, %pM) to hardware (%d)\n", + key->conf.keyidx, sta ? sta->addr : bcast_addr, ret); + + out_unsupported: + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: + case WLAN_CIPHER_SUITE_TKIP: + case WLAN_CIPHER_SUITE_CCMP: + case WLAN_CIPHER_SUITE_AES_CMAC: + /* all of these we can do in software */ + return 0; + default: + return -EINVAL; + } +} + +static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key) +{ + struct ieee80211_sub_if_data *sdata; + struct ieee80211_sta *sta; + int ret; + + might_sleep(); + + if (!key || !key->local->ops->set_key) + return; + + assert_key_lock(key->local); + + if (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) + return; + + sta = get_sta_for_key(key); + sdata = key->sdata; + + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + sdata = container_of(sdata->bss, + struct ieee80211_sub_if_data, + u.ap); + + ret = drv_set_key(key->local, DISABLE_KEY, sdata, + sta, &key->conf); + + if (ret) + wiphy_err(key->local->hw.wiphy, + "failed to remove key (%d, %pM) from hardware (%d)\n", + key->conf.keyidx, sta ? sta->addr : bcast_addr, ret); + + key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; +} + +void ieee80211_key_removed(struct ieee80211_key_conf *key_conf) +{ + struct ieee80211_key *key; + + key = container_of(key_conf, struct ieee80211_key, conf); + + might_sleep(); + assert_key_lock(key->local); + + key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; + + /* + * Flush TX path to avoid attempts to use this key + * after this function returns. Until then, drivers + * must be prepared to handle the key. 
+ */ + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(ieee80211_key_removed); + +static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, + int idx, bool uni, bool multi) +{ + struct ieee80211_key *key = NULL; + + assert_key_lock(sdata->local); + + if (idx >= 0 && idx < NUM_DEFAULT_KEYS) + key = key_mtx_dereference(sdata->local, sdata->keys[idx]); + + if (uni) + rcu_assign_pointer(sdata->default_unicast_key, key); + if (multi) + rcu_assign_pointer(sdata->default_multicast_key, key); + + ieee80211_debugfs_key_update_default(sdata); +} + +void ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx, + bool uni, bool multi) +{ + mutex_lock(&sdata->local->key_mtx); + __ieee80211_set_default_key(sdata, idx, uni, multi); + mutex_unlock(&sdata->local->key_mtx); +} + +static void +__ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata, int idx) +{ + struct ieee80211_key *key = NULL; + + assert_key_lock(sdata->local); + + if (idx >= NUM_DEFAULT_KEYS && + idx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS) + key = key_mtx_dereference(sdata->local, sdata->keys[idx]); + + rcu_assign_pointer(sdata->default_mgmt_key, key); + + ieee80211_debugfs_key_update_default(sdata); +} + +void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata, + int idx) +{ + mutex_lock(&sdata->local->key_mtx); + __ieee80211_set_default_mgmt_key(sdata, idx); + mutex_unlock(&sdata->local->key_mtx); +} + + +static void __ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + bool pairwise, + struct ieee80211_key *old, + struct ieee80211_key *new) +{ + int idx; + bool defunikey, defmultikey, defmgmtkey; + + if (new) + list_add(&new->list, &sdata->key_list); + + if (sta && pairwise) { + rcu_assign_pointer(sta->ptk, new); + } else if (sta) { + if (old) + idx = old->conf.keyidx; + else + idx = new->conf.keyidx; + rcu_assign_pointer(sta->gtk[idx], new); + } else { + WARN_ON(new && old && new->conf.keyidx != old->conf.keyidx); + + if (old) + idx = old->conf.keyidx; + else + idx = new->conf.keyidx; + + defunikey = old && + old == key_mtx_dereference(sdata->local, + sdata->default_unicast_key); + defmultikey = old && + old == key_mtx_dereference(sdata->local, + sdata->default_multicast_key); + defmgmtkey = old && + old == key_mtx_dereference(sdata->local, + sdata->default_mgmt_key); + + if (defunikey && !new) + __ieee80211_set_default_key(sdata, -1, true, false); + if (defmultikey && !new) + __ieee80211_set_default_key(sdata, -1, false, true); + if (defmgmtkey && !new) + __ieee80211_set_default_mgmt_key(sdata, -1); + + rcu_assign_pointer(sdata->keys[idx], new); + if (defunikey && new) + __ieee80211_set_default_key(sdata, new->conf.keyidx, + true, false); + if (defmultikey && new) + __ieee80211_set_default_key(sdata, new->conf.keyidx, + false, true); + if (defmgmtkey && new) + __ieee80211_set_default_mgmt_key(sdata, + new->conf.keyidx); + } + + if (old) + list_del(&old->list); +} + +struct ieee80211_key *ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, + const u8 *key_data, + size_t seq_len, const u8 *seq) +{ + struct ieee80211_key *key; + int i, j, err; + + BUG_ON(idx < 0 || idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS); + + key = kzalloc(sizeof(struct ieee80211_key) + key_len, GFP_KERNEL); + if (!key) + return ERR_PTR(-ENOMEM); + + /* + * Default to software encryption; we'll later upload the + * key to the hardware if possible. 
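The two helpers above accept indices from disjoint ranges: default unicast/multicast keys live in slots 0..NUM_DEFAULT_KEYS-1 of sdata->keys[], and default management keys in the NUM_DEFAULT_MGMT_KEYS slots directly after them (the constants are defined as 4 and 2 in key.h further down). A tiny sketch of that index partition:

#include <stdio.h>

#define NUM_DEFAULT_KEYS        4       /* default unicast/multicast key slots */
#define NUM_DEFAULT_MGMT_KEYS   2       /* default management key slots after them */

enum key_slot { SLOT_INVALID, SLOT_DATA, SLOT_MGMT };

static enum key_slot classify_idx(int idx)
{
        if (idx >= 0 && idx < NUM_DEFAULT_KEYS)
                return SLOT_DATA;       /* usable as default unicast/multicast key */
        if (idx >= NUM_DEFAULT_KEYS &&
            idx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS)
                return SLOT_MGMT;       /* usable as default management key */
        return SLOT_INVALID;
}

int main(void)
{
        for (int idx = -1; idx <= 6; idx++)
                printf("idx %2d -> %d\n", idx, classify_idx(idx));
        return 0;
}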
+ */ + key->conf.flags = 0; + key->flags = 0; + + key->conf.cipher = cipher; + key->conf.keyidx = idx; + key->conf.keylen = key_len; + switch (cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: + key->conf.iv_len = WEP_IV_LEN; + key->conf.icv_len = WEP_ICV_LEN; + break; + case WLAN_CIPHER_SUITE_TKIP: + key->conf.iv_len = TKIP_IV_LEN; + key->conf.icv_len = TKIP_ICV_LEN; + if (seq) { + for (i = 0; i < NUM_RX_DATA_QUEUES; i++) { + key->u.tkip.rx[i].iv32 = + get_unaligned_le32(&seq[2]); + key->u.tkip.rx[i].iv16 = + get_unaligned_le16(seq); + } + } + break; + case WLAN_CIPHER_SUITE_CCMP: + key->conf.iv_len = CCMP_HDR_LEN; + key->conf.icv_len = CCMP_MIC_LEN; + if (seq) { + for (i = 0; i < NUM_RX_DATA_QUEUES + 1; i++) + for (j = 0; j < CCMP_PN_LEN; j++) + key->u.ccmp.rx_pn[i][j] = + seq[CCMP_PN_LEN - j - 1]; + } + /* + * Initialize AES key state here as an optimization so that + * it does not need to be initialized for every packet. + */ + key->u.ccmp.tfm = ieee80211_aes_key_setup_encrypt(key_data); + if (IS_ERR(key->u.ccmp.tfm)) { + err = PTR_ERR(key->u.ccmp.tfm); + kfree(key); + return ERR_PTR(err); + } + break; + case WLAN_CIPHER_SUITE_AES_CMAC: + key->conf.iv_len = 0; + key->conf.icv_len = sizeof(struct ieee80211_mmie); + if (seq) + for (j = 0; j < 6; j++) + key->u.aes_cmac.rx_pn[j] = seq[6 - j - 1]; + /* + * Initialize AES key state here as an optimization so that + * it does not need to be initialized for every packet. + */ + key->u.aes_cmac.tfm = + ieee80211_aes_cmac_key_setup(key_data); + if (IS_ERR(key->u.aes_cmac.tfm)) { + err = PTR_ERR(key->u.aes_cmac.tfm); + kfree(key); + return ERR_PTR(err); + } + break; + } + memcpy(key->conf.key, key_data, key_len); + INIT_LIST_HEAD(&key->list); + + return key; +} + +static void __ieee80211_key_destroy(struct ieee80211_key *key) +{ + if (!key) + return; + + /* + * Synchronize so the TX path can no longer be using + * this key before we free/remove it. + */ + synchronize_rcu(); + + if (key->local) + ieee80211_key_disable_hw_accel(key); + + if (key->conf.cipher == WLAN_CIPHER_SUITE_CCMP) + ieee80211_aes_key_free(key->u.ccmp.tfm); + if (key->conf.cipher == WLAN_CIPHER_SUITE_AES_CMAC) + ieee80211_aes_cmac_key_free(key->u.aes_cmac.tfm); + if (key->local) + ieee80211_debugfs_key_remove(key); + + kfree(key); +} + +int ieee80211_key_link(struct ieee80211_key *key, + struct ieee80211_sub_if_data *sdata, + struct sta_info *sta) +{ + struct ieee80211_key *old_key; + int idx, ret; + bool pairwise; + + BUG_ON(!sdata); + BUG_ON(!key); + + pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE; + idx = key->conf.keyidx; + key->local = sdata->local; + key->sdata = sdata; + key->sta = sta; + + if (sta) { + /* + * some hardware cannot handle TKIP with QoS, so + * we indicate whether QoS could be in use. + */ + if (test_sta_flags(sta, WLAN_STA_WME)) + key->conf.flags |= IEEE80211_KEY_FLAG_WMM_STA; + } else { + if (sdata->vif.type == NL80211_IFTYPE_STATION) { + struct sta_info *ap; + + /* + * We're getting a sta pointer in, so must be under + * appropriate locking for sta_info_get(). 
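In ieee80211_key_alloc() above the caller-supplied receive sequence counter arrives little-endian: TKIP takes iv16 from the first two bytes and iv32 from the next four, while CCMP stores the 6-byte packet number reversed so that rx_pn[] ends up most-significant byte first. A stand-alone illustration of those two conversions (the get_le helpers are local stand-ins for the kernel's get_unaligned_le16/le32):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define CCMP_PN_LEN 6

static uint16_t get_le16(const uint8_t *p)
{
        return (uint16_t)(p[0] | (p[1] << 8));
}

static uint32_t get_le32(const uint8_t *p)
{
        return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
               ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

int main(void)
{
        /* little-endian sequence counter as passed in via 'seq' */
        const uint8_t seq[CCMP_PN_LEN] = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06 };
        uint8_t rx_pn[CCMP_PN_LEN];

        /* TKIP: 16-bit IV first, 32-bit IV after it */
        uint16_t iv16 = get_le16(seq);
        uint32_t iv32 = get_le32(&seq[2]);

        /* CCMP: store the PN reversed, most significant byte first */
        for (int j = 0; j < CCMP_PN_LEN; j++)
                rx_pn[j] = seq[CCMP_PN_LEN - j - 1];

        printf("iv16=0x%04" PRIx16 " iv32=0x%08" PRIx32 " pn=", iv16, iv32);
        for (int j = 0; j < CCMP_PN_LEN; j++)
                printf("%02x", rx_pn[j]);
        printf("\n");
        return 0;
}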
+ */ + + /* same here, the AP could be using QoS */ + ap = sta_info_get(key->sdata, key->sdata->u.mgd.bssid); + if (ap) { + if (test_sta_flags(ap, WLAN_STA_WME)) + key->conf.flags |= + IEEE80211_KEY_FLAG_WMM_STA; + } + } + } + + mutex_lock(&sdata->local->key_mtx); + + if (sta && pairwise) + old_key = key_mtx_dereference(sdata->local, sta->ptk); + else if (sta) + old_key = key_mtx_dereference(sdata->local, sta->gtk[idx]); + else + old_key = key_mtx_dereference(sdata->local, sdata->keys[idx]); + + __ieee80211_key_replace(sdata, sta, pairwise, old_key, key); + __ieee80211_key_destroy(old_key); + + ieee80211_debugfs_key_add(key); + + ret = ieee80211_key_enable_hw_accel(key); + + mutex_unlock(&sdata->local->key_mtx); + + return ret; +} + +void __ieee80211_key_free(struct ieee80211_key *key) +{ + if (!key) + return; + + /* + * Replace key with nothingness if it was ever used. + */ + if (key->sdata) + __ieee80211_key_replace(key->sdata, key->sta, + key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, + key, NULL); + __ieee80211_key_destroy(key); +} + +void ieee80211_key_free(struct ieee80211_local *local, + struct ieee80211_key *key) +{ + mutex_lock(&local->key_mtx); + __ieee80211_key_free(key); + mutex_unlock(&local->key_mtx); +} + +void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_key *key; + + ASSERT_RTNL(); + + if (WARN_ON(!ieee80211_sdata_running(sdata))) + return; + + mutex_lock(&sdata->local->key_mtx); + + list_for_each_entry(key, &sdata->key_list, list) + ieee80211_key_enable_hw_accel(key); + + mutex_unlock(&sdata->local->key_mtx); +} + +void ieee80211_disable_keys(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_key *key; + + ASSERT_RTNL(); + + mutex_lock(&sdata->local->key_mtx); + + list_for_each_entry(key, &sdata->key_list, list) + ieee80211_key_disable_hw_accel(key); + + mutex_unlock(&sdata->local->key_mtx); +} + +void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_key *key, *tmp; + + mutex_lock(&sdata->local->key_mtx); + + ieee80211_debugfs_key_remove_mgmt_default(sdata); + + list_for_each_entry_safe(key, tmp, &sdata->key_list, list) + __ieee80211_key_free(key); + + ieee80211_debugfs_key_update_default(sdata); + + mutex_unlock(&sdata->local->key_mtx); +} diff --git a/net/mac80211/key.h b/net/mac80211/key.h new file mode 100644 index 00000000..d801d535 --- /dev/null +++ b/net/mac80211/key.h @@ -0,0 +1,152 @@ +/* + * Copyright 2002-2004, Instant802 Networks, Inc. + * Copyright 2005, Devicescape Software, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef IEEE80211_KEY_H +#define IEEE80211_KEY_H + +#include +#include +#include +#include +#include + +#define NUM_DEFAULT_KEYS 4 +#define NUM_DEFAULT_MGMT_KEYS 2 + +#define WEP_IV_LEN 4 +#define WEP_ICV_LEN 4 +#define ALG_CCMP_KEY_LEN 16 +#define CCMP_HDR_LEN 8 +#define CCMP_MIC_LEN 8 +#define CCMP_TK_LEN 16 +#define CCMP_PN_LEN 6 +#define TKIP_IV_LEN 8 +#define TKIP_ICV_LEN 4 + +#define NUM_RX_DATA_QUEUES 17 + +struct ieee80211_local; +struct ieee80211_sub_if_data; +struct sta_info; + +/** + * enum ieee80211_internal_key_flags - internal key flags + * + * @KEY_FLAG_UPLOADED_TO_HARDWARE: Indicates that this key is present + * in the hardware for TX crypto hardware acceleration. 
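The length constants defined in key.h above determine how much per-frame space each cipher consumes: the IV/header goes in front of the payload and the ICV/MIC behind it, which is what the iv_len/icv_len fields set in ieee80211_key_alloc() feed into. A small sketch adding them up (values copied from the defines above):

#include <stdio.h>

/* per-frame crypto overhead, bytes in front of / behind the payload */
struct cipher_overhead {
        const char *name;
        int hdr;        /* IV / CCMP header */
        int trailer;    /* ICV / MIC */
};

static const struct cipher_overhead ciphers[] = {
        { "WEP",  4, 4 },       /* WEP_IV_LEN,   WEP_ICV_LEN  */
        { "TKIP", 8, 4 },       /* TKIP_IV_LEN,  TKIP_ICV_LEN */
        { "CCMP", 8, 8 },       /* CCMP_HDR_LEN, CCMP_MIC_LEN */
};

int main(void)
{
        for (unsigned i = 0; i < sizeof(ciphers) / sizeof(ciphers[0]); i++)
                printf("%-4s adds %d + %d = %d bytes per frame\n",
                       ciphers[i].name, ciphers[i].hdr, ciphers[i].trailer,
                       ciphers[i].hdr + ciphers[i].trailer);
        return 0;
}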
+ */ +enum ieee80211_internal_key_flags { + KEY_FLAG_UPLOADED_TO_HARDWARE = BIT(0), +}; + +enum ieee80211_internal_tkip_state { + TKIP_STATE_NOT_INIT, + TKIP_STATE_PHASE1_DONE, + TKIP_STATE_PHASE1_HW_UPLOADED, +}; + +struct tkip_ctx { + u32 iv32; + u16 iv16; + u16 p1k[5]; + enum ieee80211_internal_tkip_state state; +}; + +struct ieee80211_key { + struct ieee80211_local *local; + struct ieee80211_sub_if_data *sdata; + struct sta_info *sta; + + /* for sdata list */ + struct list_head list; + + /* protected by key mutex */ + unsigned int flags; + + union { + struct { + /* last used TSC */ + struct tkip_ctx tx; + + /* last received RSC */ + struct tkip_ctx rx[NUM_RX_DATA_QUEUES]; + } tkip; + struct { + u8 tx_pn[6]; + /* + * Last received packet number. The first + * NUM_RX_DATA_QUEUES counters are used with Data + * frames and the last counter is used with Robust + * Management frames. + */ + u8 rx_pn[NUM_RX_DATA_QUEUES + 1][6]; + struct crypto_cipher *tfm; + u32 replays; /* dot11RSNAStatsCCMPReplays */ + /* scratch buffers for virt_to_page() (crypto API) */ +#ifndef AES_BLOCK_LEN +#define AES_BLOCK_LEN 16 +#endif + u8 tx_crypto_buf[6 * AES_BLOCK_LEN]; + u8 rx_crypto_buf[6 * AES_BLOCK_LEN]; + } ccmp; + struct { + u8 tx_pn[6]; + u8 rx_pn[6]; + struct crypto_cipher *tfm; + u32 replays; /* dot11RSNAStatsCMACReplays */ + u32 icverrors; /* dot11RSNAStatsCMACICVErrors */ + /* scratch buffers for virt_to_page() (crypto API) */ + u8 tx_crypto_buf[2 * AES_BLOCK_LEN]; + u8 rx_crypto_buf[2 * AES_BLOCK_LEN]; + } aes_cmac; + } u; + + /* number of times this key has been used */ + int tx_rx_count; + +#ifdef CONFIG_MAC80211_DEBUGFS + struct { + struct dentry *stalink; + struct dentry *dir; + int cnt; + } debugfs; +#endif + + /* + * key config, must be last because it contains key + * material as variable length member + */ + struct ieee80211_key_conf conf; +}; + +struct ieee80211_key *ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, + const u8 *key_data, + size_t seq_len, const u8 *seq); +/* + * Insert a key into data structures (sdata, sta if necessary) + * to make it used, free old key. + */ +int __must_check ieee80211_key_link(struct ieee80211_key *key, + struct ieee80211_sub_if_data *sdata, + struct sta_info *sta); +void __ieee80211_key_free(struct ieee80211_key *key); +void ieee80211_key_free(struct ieee80211_local *local, + struct ieee80211_key *key); +void ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx, + bool uni, bool multi); +void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata, + int idx); +void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata); +void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata); +void ieee80211_disable_keys(struct ieee80211_sub_if_data *sdata); + +#define key_mtx_dereference(local, ref) \ + rcu_dereference_protected(ref, lockdep_is_held(&((local)->key_mtx))) + +#endif /* IEEE80211_KEY_H */ diff --git a/net/mac80211/led.c b/net/mac80211/led.c new file mode 100644 index 00000000..14590332 --- /dev/null +++ b/net/mac80211/led.c @@ -0,0 +1,308 @@ +/* + * Copyright 2006, Johannes Berg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* just for IFNAMSIZ */ +#include +#include +#include "led.h" + +void ieee80211_led_rx(struct ieee80211_local *local) +{ + if (unlikely(!local->rx_led)) + return; + if (local->rx_led_counter++ % 2 == 0) + led_trigger_event(local->rx_led, LED_OFF); + else + led_trigger_event(local->rx_led, LED_FULL); +} + +/* q is 1 if a packet was enqueued, 0 if it has been transmitted */ +void ieee80211_led_tx(struct ieee80211_local *local, int q) +{ + if (unlikely(!local->tx_led)) + return; + /* not sure how this is supposed to work ... */ + local->tx_led_counter += 2*q-1; + if (local->tx_led_counter % 2 == 0) + led_trigger_event(local->tx_led, LED_OFF); + else + led_trigger_event(local->tx_led, LED_FULL); +} + +void ieee80211_led_assoc(struct ieee80211_local *local, bool associated) +{ + if (unlikely(!local->assoc_led)) + return; + if (associated) + led_trigger_event(local->assoc_led, LED_FULL); + else + led_trigger_event(local->assoc_led, LED_OFF); +} + +void ieee80211_led_radio(struct ieee80211_local *local, bool enabled) +{ + if (unlikely(!local->radio_led)) + return; + if (enabled) + led_trigger_event(local->radio_led, LED_FULL); + else + led_trigger_event(local->radio_led, LED_OFF); +} + +void ieee80211_led_names(struct ieee80211_local *local) +{ + snprintf(local->rx_led_name, sizeof(local->rx_led_name), + "%srx", wiphy_name(local->hw.wiphy)); + snprintf(local->tx_led_name, sizeof(local->tx_led_name), + "%stx", wiphy_name(local->hw.wiphy)); + snprintf(local->assoc_led_name, sizeof(local->assoc_led_name), + "%sassoc", wiphy_name(local->hw.wiphy)); + snprintf(local->radio_led_name, sizeof(local->radio_led_name), + "%sradio", wiphy_name(local->hw.wiphy)); +} + +void ieee80211_led_init(struct ieee80211_local *local) +{ + local->rx_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL); + if (local->rx_led) { + local->rx_led->name = local->rx_led_name; + if (led_trigger_register(local->rx_led)) { + kfree(local->rx_led); + local->rx_led = NULL; + } + } + + local->tx_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL); + if (local->tx_led) { + local->tx_led->name = local->tx_led_name; + if (led_trigger_register(local->tx_led)) { + kfree(local->tx_led); + local->tx_led = NULL; + } + } + + local->assoc_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL); + if (local->assoc_led) { + local->assoc_led->name = local->assoc_led_name; + if (led_trigger_register(local->assoc_led)) { + kfree(local->assoc_led); + local->assoc_led = NULL; + } + } + + local->radio_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL); + if (local->radio_led) { + local->radio_led->name = local->radio_led_name; + if (led_trigger_register(local->radio_led)) { + kfree(local->radio_led); + local->radio_led = NULL; + } + } + + if (local->tpt_led_trigger) { + if (led_trigger_register(&local->tpt_led_trigger->trig)) { + kfree(local->tpt_led_trigger); + local->tpt_led_trigger = NULL; + } + } +} + +void ieee80211_led_exit(struct ieee80211_local *local) +{ + if (local->radio_led) { + led_trigger_unregister(local->radio_led); + kfree(local->radio_led); + } + if (local->assoc_led) { + led_trigger_unregister(local->assoc_led); + kfree(local->assoc_led); + } + if (local->tx_led) { + led_trigger_unregister(local->tx_led); + kfree(local->tx_led); + } + if (local->rx_led) { + led_trigger_unregister(local->rx_led); + kfree(local->rx_led); + } + + if (local->tpt_led_trigger) { + led_trigger_unregister(&local->tpt_led_trigger->trig); + kfree(local->tpt_led_trigger); + } +} + +char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw) +{ + 
struct ieee80211_local *local = hw_to_local(hw); + + return local->radio_led_name; +} +EXPORT_SYMBOL(__ieee80211_get_radio_led_name); + +char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw) +{ + struct ieee80211_local *local = hw_to_local(hw); + + return local->assoc_led_name; +} +EXPORT_SYMBOL(__ieee80211_get_assoc_led_name); + +char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw) +{ + struct ieee80211_local *local = hw_to_local(hw); + + return local->tx_led_name; +} +EXPORT_SYMBOL(__ieee80211_get_tx_led_name); + +char *__ieee80211_get_rx_led_name(struct ieee80211_hw *hw) +{ + struct ieee80211_local *local = hw_to_local(hw); + + return local->rx_led_name; +} +EXPORT_SYMBOL(__ieee80211_get_rx_led_name); + +static unsigned long tpt_trig_traffic(struct ieee80211_local *local, + struct tpt_led_trigger *tpt_trig) +{ + unsigned long traffic, delta; + + traffic = tpt_trig->tx_bytes + tpt_trig->rx_bytes; + + delta = traffic - tpt_trig->prev_traffic; + tpt_trig->prev_traffic = traffic; + return DIV_ROUND_UP(delta, 1024 / 8); +} + +static void tpt_trig_timer(unsigned long data) +{ + struct ieee80211_local *local = (void *)data; + struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger; + struct led_classdev *led_cdev; + unsigned long on, off, tpt; + int i; + + if (!tpt_trig->running) + return; + + mod_timer(&tpt_trig->timer, round_jiffies(jiffies + HZ)); + + tpt = tpt_trig_traffic(local, tpt_trig); + + /* default to just solid on */ + on = 1; + off = 0; + + for (i = tpt_trig->blink_table_len - 1; i >= 0; i--) { + if (tpt_trig->blink_table[i].throughput < 0 || + tpt > tpt_trig->blink_table[i].throughput) { + off = tpt_trig->blink_table[i].blink_time / 2; + on = tpt_trig->blink_table[i].blink_time - off; + break; + } + } + + read_lock(&tpt_trig->trig.leddev_list_lock); + list_for_each_entry(led_cdev, &tpt_trig->trig.led_cdevs, trig_list) + led_blink_set(led_cdev, &on, &off); + read_unlock(&tpt_trig->trig.leddev_list_lock); +} + +char *__ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, + unsigned int flags, + const struct ieee80211_tpt_blink *blink_table, + unsigned int blink_table_len) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct tpt_led_trigger *tpt_trig; + + if (WARN_ON(local->tpt_led_trigger)) + return NULL; + + tpt_trig = kzalloc(sizeof(struct tpt_led_trigger), GFP_KERNEL); + if (!tpt_trig) + return NULL; + + snprintf(tpt_trig->name, sizeof(tpt_trig->name), + "%stpt", wiphy_name(local->hw.wiphy)); + + tpt_trig->trig.name = tpt_trig->name; + + tpt_trig->blink_table = blink_table; + tpt_trig->blink_table_len = blink_table_len; + tpt_trig->want = flags; + + setup_timer(&tpt_trig->timer, tpt_trig_timer, (unsigned long)local); + + local->tpt_led_trigger = tpt_trig; + + return tpt_trig->name; +} +EXPORT_SYMBOL(__ieee80211_create_tpt_led_trigger); + +static void ieee80211_start_tpt_led_trig(struct ieee80211_local *local) +{ + struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger; + + if (tpt_trig->running) + return; + + /* reset traffic */ + tpt_trig_traffic(local, tpt_trig); + tpt_trig->running = true; + + tpt_trig_timer((unsigned long)local); + mod_timer(&tpt_trig->timer, round_jiffies(jiffies + HZ)); +} + +static void ieee80211_stop_tpt_led_trig(struct ieee80211_local *local) +{ + struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger; + struct led_classdev *led_cdev; + + if (!tpt_trig->running) + return; + + tpt_trig->running = false; + del_timer_sync(&tpt_trig->timer); + + read_lock(&tpt_trig->trig.leddev_list_lock); + list_for_each_entry(led_cdev, 
&tpt_trig->trig.led_cdevs, trig_list) + led_brightness_set(led_cdev, LED_OFF); + read_unlock(&tpt_trig->trig.leddev_list_lock); +} + +void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, + unsigned int types_on, unsigned int types_off) +{ + struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger; + bool allowed; + + WARN_ON(types_on & types_off); + + if (!tpt_trig) + return; + + tpt_trig->active &= ~types_off; + tpt_trig->active |= types_on; + + /* + * Regardless of wanted state, we shouldn't blink when + * the radio is disabled -- this can happen due to some + * code ordering issues with __ieee80211_recalc_idle() + * being called before the radio is started. + */ + allowed = tpt_trig->active & IEEE80211_TPT_LEDTRIG_FL_RADIO; + + if (!allowed || !(tpt_trig->active & tpt_trig->want)) + ieee80211_stop_tpt_led_trig(local); + else + ieee80211_start_tpt_led_trig(local); +} diff --git a/net/mac80211/led.h b/net/mac80211/led.h new file mode 100644 index 00000000..e0275d9b --- /dev/null +++ b/net/mac80211/led.h @@ -0,0 +1,73 @@ +/* + * Copyright 2006, Johannes Berg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include "ieee80211_i.h" + +#ifdef CONFIG_MAC80211_LEDS +void ieee80211_led_rx(struct ieee80211_local *local); +void ieee80211_led_tx(struct ieee80211_local *local, int q); +void ieee80211_led_assoc(struct ieee80211_local *local, + bool associated); +void ieee80211_led_radio(struct ieee80211_local *local, + bool enabled); +void ieee80211_led_names(struct ieee80211_local *local); +void ieee80211_led_init(struct ieee80211_local *local); +void ieee80211_led_exit(struct ieee80211_local *local); +void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, + unsigned int types_on, unsigned int types_off); +#else +static inline void ieee80211_led_rx(struct ieee80211_local *local) +{ +} +static inline void ieee80211_led_tx(struct ieee80211_local *local, int q) +{ +} +static inline void ieee80211_led_assoc(struct ieee80211_local *local, + bool associated) +{ +} +static inline void ieee80211_led_radio(struct ieee80211_local *local, + bool enabled) +{ +} +static inline void ieee80211_led_names(struct ieee80211_local *local) +{ +} +static inline void ieee80211_led_init(struct ieee80211_local *local) +{ +} +static inline void ieee80211_led_exit(struct ieee80211_local *local) +{ +} +static inline void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, + unsigned int types_on, + unsigned int types_off) +{ +} +#endif + +static inline void +ieee80211_tpt_led_trig_tx(struct ieee80211_local *local, __le16 fc, int bytes) +{ +#ifdef CONFIG_MAC80211_LEDS + if (local->tpt_led_trigger && ieee80211_is_data(fc)) + local->tpt_led_trigger->tx_bytes += bytes; +#endif +} + +static inline void +ieee80211_tpt_led_trig_rx(struct ieee80211_local *local, __le16 fc, int bytes) +{ +#ifdef CONFIG_MAC80211_LEDS + if (local->tpt_led_trigger && ieee80211_is_data(fc)) + local->tpt_led_trigger->rx_bytes += bytes; +#endif +} diff --git a/net/mac80211/main.c b/net/mac80211/main.c new file mode 100644 index 00000000..1e36fb33 --- /dev/null +++ b/net/mac80211/main.c @@ -0,0 +1,1102 @@ +/* + * Copyright 2002-2005, Instant802 Networks, Inc. + * Copyright 2005-2006, Devicescape Software, Inc. 
+ * Copyright 2006-2007 Jiri Benc + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ieee80211_i.h" +#include "driver-ops.h" +#include "rate.h" +#include "mesh.h" +#include "wep.h" +#include "led.h" +#include "cfg.h" +#include "debugfs.h" + +static struct lock_class_key ieee80211_rx_skb_queue_class; + +void ieee80211_configure_filter(struct ieee80211_local *local) +{ + u64 mc; + unsigned int changed_flags; + unsigned int new_flags = 0; + + if (atomic_read(&local->iff_promiscs)) + new_flags |= FIF_PROMISC_IN_BSS; + + if (atomic_read(&local->iff_allmultis)) + new_flags |= FIF_ALLMULTI; + + if (local->monitors || local->scanning) + new_flags |= FIF_BCN_PRBRESP_PROMISC; + + if (local->fif_probe_req || local->probe_req_reg) + new_flags |= FIF_PROBE_REQ; + + if (local->fif_fcsfail) + new_flags |= FIF_FCSFAIL; + + if (local->fif_plcpfail) + new_flags |= FIF_PLCPFAIL; + + if (local->fif_control) + new_flags |= FIF_CONTROL; + + if (local->fif_other_bss) + new_flags |= FIF_OTHER_BSS; + + if (local->fif_pspoll) + new_flags |= FIF_PSPOLL; + + spin_lock_bh(&local->filter_lock); + changed_flags = local->filter_flags ^ new_flags; + + mc = drv_prepare_multicast(local, &local->mc_list); + spin_unlock_bh(&local->filter_lock); + + /* be a bit nasty */ + new_flags |= (1<<31); + + drv_configure_filter(local, changed_flags, &new_flags, mc); + + WARN_ON(new_flags & (1<<31)); + + local->filter_flags = new_flags & ~(1<<31); +} + +static void ieee80211_reconfig_filter(struct work_struct *work) +{ + struct ieee80211_local *local = + container_of(work, struct ieee80211_local, reconfig_filter); + + ieee80211_configure_filter(local); +} + +/* + * Returns true if we are logically configured to be on + * the operating channel AND the hardware-conf is currently + * configured on the operating channel. Compares channel-type + * as well. + */ +bool ieee80211_cfg_on_oper_channel(struct ieee80211_local *local) +{ + struct ieee80211_channel *chan, *scan_chan; + enum nl80211_channel_type channel_type; + + /* This logic needs to match logic in ieee80211_hw_config */ + if (local->scan_channel) { + chan = local->scan_channel; + /* If scanning on oper channel, use whatever channel-type + * is currently in use. + */ + if (chan == local->oper_channel) + channel_type = local->_oper_channel_type; + else + channel_type = NL80211_CHAN_NO_HT; + } else if (local->tmp_channel) { + chan = scan_chan = local->tmp_channel; + channel_type = local->tmp_channel_type; + } else { + chan = local->oper_channel; + channel_type = local->_oper_channel_type; + } + + if (chan != local->oper_channel || + channel_type != local->_oper_channel_type) + return false; + + /* Check current hardware-config against oper_channel. */ + if ((local->oper_channel != local->hw.conf.channel) || + (local->_oper_channel_type != local->hw.conf.channel_type)) + return false; + + return true; +} + +int ieee80211_hw_config(struct ieee80211_local *local, u32 changed) +{ + struct ieee80211_channel *chan, *scan_chan; + int ret = 0; + int power; + enum nl80211_channel_type channel_type; + u32 offchannel_flag; + + might_sleep(); + + scan_chan = local->scan_channel; + + /* If this off-channel logic ever changes, ieee80211_on_oper_channel + * may need to change as well. 
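ieee80211_configure_filter() above hands the driver both the new flag set and its XOR with the previous one, so a driver can react only to what actually changed; the temporary 1<<31 bit is a sanity check that drivers rewrite the flag word instead of leaving unknown bits set. A stand-alone sketch of that contract (the fake driver and the FIF_PROMISC name are illustrative, not mac80211 definitions):

#include <stdio.h>

#define FIF_PROMISC     (1u << 0)
#define FIF_ALLMULTI    (1u << 1)
#define FIF_SENTINEL    (1u << 31)      /* must never survive the driver */

/* a well-behaved "driver": keeps only the bits it understands */
static void drv_configure_filter(unsigned changed, unsigned *total)
{
        if (changed & FIF_ALLMULTI)
                printf("driver: allmulti now %s\n",
                       (*total & FIF_ALLMULTI) ? "on" : "off");
        *total &= FIF_ALLMULTI | FIF_PROMISC;   /* drops the sentinel */
}

static unsigned filter_flags;   /* last flags accepted by the driver */

static void configure_filter(unsigned new_flags)
{
        unsigned changed = filter_flags ^ new_flags;    /* what differs */

        new_flags |= FIF_SENTINEL;
        drv_configure_filter(changed, &new_flags);
        if (new_flags & FIF_SENTINEL)
                printf("warning: driver kept an unknown bit\n");
        filter_flags = new_flags & ~FIF_SENTINEL;
}

int main(void)
{
        configure_filter(FIF_ALLMULTI);
        configure_filter(FIF_ALLMULTI | FIF_PROMISC);   /* only promisc changed */
        return 0;
}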
+ */ + offchannel_flag = local->hw.conf.flags & IEEE80211_CONF_OFFCHANNEL; + if (scan_chan) { + chan = scan_chan; + /* If scanning on oper channel, use whatever channel-type + * is currently in use. + */ + if (chan == local->oper_channel) + channel_type = local->_oper_channel_type; + else + channel_type = NL80211_CHAN_NO_HT; + } else if (local->tmp_channel) { + chan = scan_chan = local->tmp_channel; + channel_type = local->tmp_channel_type; + } else { + chan = local->oper_channel; + channel_type = local->_oper_channel_type; + } + + if (chan != local->oper_channel || + channel_type != local->_oper_channel_type) + local->hw.conf.flags |= IEEE80211_CONF_OFFCHANNEL; + else + local->hw.conf.flags &= ~IEEE80211_CONF_OFFCHANNEL; + + offchannel_flag ^= local->hw.conf.flags & IEEE80211_CONF_OFFCHANNEL; + + if (offchannel_flag || chan != local->hw.conf.channel || + channel_type != local->hw.conf.channel_type) { + local->hw.conf.channel = chan; + local->hw.conf.channel_type = channel_type; + changed |= IEEE80211_CONF_CHANGE_CHANNEL; + } + + if (!conf_is_ht(&local->hw.conf)) { + /* + * mac80211.h documents that this is only valid + * when the channel is set to an HT type, and + * that otherwise STATIC is used. + */ + local->hw.conf.smps_mode = IEEE80211_SMPS_STATIC; + } else if (local->hw.conf.smps_mode != local->smps_mode) { + local->hw.conf.smps_mode = local->smps_mode; + changed |= IEEE80211_CONF_CHANGE_SMPS; + } + + if ((local->scanning & SCAN_SW_SCANNING) || + (local->scanning & SCAN_HW_SCANNING)) + power = chan->max_power; + else + power = local->power_constr_level ? + (chan->max_power - local->power_constr_level) : + chan->max_power; + + if (local->user_power_level >= 0) + power = min(power, local->user_power_level); + + if (local->hw.conf.power_level != power) { + changed |= IEEE80211_CONF_CHANGE_POWER; + local->hw.conf.power_level = power; + } + + if (changed && local->open_count) { + ret = drv_config(local, changed); + /* + * Goal: + * HW reconfiguration should never fail, the driver has told + * us what it can support so it should live up to that promise. + * + * Current status: + * rfkill is not integrated with mac80211 and a + * configuration command can thus fail if hardware rfkill + * is enabled + * + * FIXME: integrate rfkill with mac80211 and then add this + * WARN_ON() back + * + */ + /* WARN_ON(ret); */ + } + + return ret; +} + +void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata, + u32 changed) +{ + struct ieee80211_local *local = sdata->local; + static const u8 zero[ETH_ALEN] = { 0 }; + + if (!changed) + return; + + if (sdata->vif.type == NL80211_IFTYPE_STATION) { + /* + * While not associated, claim a BSSID of all-zeroes + * so that drivers don't do any weird things with the + * BSSID at that time. 
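The power computation in ieee80211_hw_config() above uses the regulatory maximum during scans, otherwise subtracts any 802.11h power constraint from the channel maximum, and finally clamps to the user-requested level when one is set. The same decision written out stand-alone (variable names are illustrative):

#include <stdio.h>

static int min_int(int a, int b) { return a < b ? a : b; }

/*
 * max_power:    regulatory limit for the channel (dBm)
 * constr_level: 802.11h local power constraint (dB), 0 if none
 * user_level:   user-requested limit (dBm), negative means "unset"
 * scanning:     scans always use the channel maximum
 */
static int select_tx_power(int max_power, int constr_level,
                           int user_level, int scanning)
{
        int power;

        if (scanning)
                power = max_power;
        else
                power = constr_level ? max_power - constr_level : max_power;

        if (user_level >= 0)
                power = min_int(power, user_level);

        return power;
}

int main(void)
{
        printf("%d dBm\n", select_tx_power(20, 3, -1, 0));      /* 17 */
        printf("%d dBm\n", select_tx_power(20, 3, 15, 0));      /* 15 */
        printf("%d dBm\n", select_tx_power(20, 3, -1, 1));      /* 20 while scanning */
        return 0;
}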
+ */ + if (sdata->vif.bss_conf.assoc) + sdata->vif.bss_conf.bssid = sdata->u.mgd.bssid; + else + sdata->vif.bss_conf.bssid = zero; + } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) + sdata->vif.bss_conf.bssid = sdata->u.ibss.bssid; + else if (sdata->vif.type == NL80211_IFTYPE_AP) + sdata->vif.bss_conf.bssid = sdata->vif.addr; + else if (sdata->vif.type == NL80211_IFTYPE_WDS) + sdata->vif.bss_conf.bssid = NULL; + else if (ieee80211_vif_is_mesh(&sdata->vif)) { + sdata->vif.bss_conf.bssid = zero; + } else { + WARN_ON(1); + return; + } + + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_WDS: + case NL80211_IFTYPE_MESH_POINT: + break; + default: + /* do not warn to simplify caller in scan.c */ + changed &= ~BSS_CHANGED_BEACON_ENABLED; + if (WARN_ON(changed & BSS_CHANGED_BEACON)) + return; + break; + } + + if (changed & BSS_CHANGED_BEACON_ENABLED) { + if (local->quiescing || !ieee80211_sdata_running(sdata) || + test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state)) { + sdata->vif.bss_conf.enable_beacon = false; + } else { + /* + * Beacon should be enabled, but AP mode must + * check whether there is a beacon configured. + */ + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP: + sdata->vif.bss_conf.enable_beacon = + !!sdata->u.ap.beacon; + break; + case NL80211_IFTYPE_ADHOC: + sdata->vif.bss_conf.enable_beacon = + !!sdata->u.ibss.presp; + break; +#ifdef CONFIG_MAC80211_MESH + case NL80211_IFTYPE_MESH_POINT: + sdata->vif.bss_conf.enable_beacon = + !!sdata->u.mesh.mesh_id_len; + break; +#endif + default: + /* not reached */ + WARN_ON(1); + break; + } + } + } + + drv_bss_info_changed(local, sdata, &sdata->vif.bss_conf, changed); +} + +u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata) +{ + sdata->vif.bss_conf.use_cts_prot = false; + sdata->vif.bss_conf.use_short_preamble = false; + sdata->vif.bss_conf.use_short_slot = false; + return BSS_CHANGED_ERP_CTS_PROT | + BSS_CHANGED_ERP_PREAMBLE | + BSS_CHANGED_ERP_SLOT; +} + +static void ieee80211_tasklet_handler(unsigned long data) +{ + struct ieee80211_local *local = (struct ieee80211_local *) data; + struct sk_buff *skb; + + while ((skb = skb_dequeue(&local->skb_queue)) || + (skb = skb_dequeue(&local->skb_queue_unreliable))) { + switch (skb->pkt_type) { + case IEEE80211_RX_MSG: + /* Clear skb->pkt_type in order to not confuse kernel + * netstack. 
*/ + skb->pkt_type = 0; + ieee80211_rx(local_to_hw(local), skb); + break; + case IEEE80211_TX_STATUS_MSG: + skb->pkt_type = 0; + ieee80211_tx_status(local_to_hw(local), skb); + break; + default: + WARN(1, "mac80211: Packet is of unknown type %d\n", + skb->pkt_type); + dev_kfree_skb(skb); + break; + } + } +} + +static void ieee80211_restart_work(struct work_struct *work) +{ + struct ieee80211_local *local = + container_of(work, struct ieee80211_local, restart_work); + + /* wait for scan work complete */ + flush_workqueue(local->workqueue); + + mutex_lock(&local->mtx); + WARN(test_bit(SCAN_HW_SCANNING, &local->scanning) || + local->sched_scanning, + "%s called with hardware scan in progress\n", __func__); + mutex_unlock(&local->mtx); + + rtnl_lock(); + ieee80211_scan_cancel(local); + ieee80211_reconfig(local); + rtnl_unlock(); +} + +void ieee80211_restart_hw(struct ieee80211_hw *hw) +{ + struct ieee80211_local *local = hw_to_local(hw); + + trace_api_restart_hw(local); + + wiphy_info(hw->wiphy, + "Hardware restart was requested\n"); + + /* use this reason, ieee80211_reconfig will unblock it */ + ieee80211_stop_queues_by_reason(hw, + IEEE80211_QUEUE_STOP_REASON_SUSPEND); + + schedule_work(&local->restart_work); +} +EXPORT_SYMBOL(ieee80211_restart_hw); + +static void ieee80211_recalc_smps_work(struct work_struct *work) +{ + struct ieee80211_local *local = + container_of(work, struct ieee80211_local, recalc_smps); + + mutex_lock(&local->iflist_mtx); + ieee80211_recalc_smps(local); + mutex_unlock(&local->iflist_mtx); +} + +#ifdef CONFIG_INET +static int ieee80211_ifa_changed(struct notifier_block *nb, + unsigned long data, void *arg) +{ + struct in_ifaddr *ifa = arg; + struct ieee80211_local *local = + container_of(nb, struct ieee80211_local, + ifa_notifier); + struct net_device *ndev = ifa->ifa_dev->dev; + struct wireless_dev *wdev = ndev->ieee80211_ptr; + struct in_device *idev; + struct ieee80211_sub_if_data *sdata; + struct ieee80211_bss_conf *bss_conf; + struct ieee80211_if_managed *ifmgd; + int c = 0; + + /* Make sure it's our interface that got changed */ + if (!wdev) + return NOTIFY_DONE; + + if (wdev->wiphy != local->hw.wiphy) + return NOTIFY_DONE; + + sdata = IEEE80211_DEV_TO_SUB_IF(ndev); + bss_conf = &sdata->vif.bss_conf; + + if (!ieee80211_sdata_running(sdata)) + return NOTIFY_DONE; + + /* ARP filtering is only supported in managed mode */ + if (sdata->vif.type != NL80211_IFTYPE_STATION) + return NOTIFY_DONE; + + idev = __in_dev_get_rtnl(sdata->dev); + if (!idev) + return NOTIFY_DONE; + + ifmgd = &sdata->u.mgd; + mutex_lock(&ifmgd->mtx); + + /* Copy the addresses to the bss_conf list */ + ifa = idev->ifa_list; + while (c < IEEE80211_BSS_ARP_ADDR_LIST_LEN && ifa) { + bss_conf->arp_addr_list[c] = ifa->ifa_address; + ifa = ifa->ifa_next; + c++; + } + + /* If not all addresses fit the list, disable filtering */ + if (ifa) { + sdata->arp_filter_state = false; + c = 0; + } else { + sdata->arp_filter_state = true; + } + bss_conf->arp_addr_cnt = c; + + /* Configure driver only if associated */ + if (ifmgd->associated) { + bss_conf->arp_filter_enabled = sdata->arp_filter_state; + ieee80211_bss_info_change_notify(sdata, + BSS_CHANGED_ARP_FILTER); + } + + mutex_unlock(&ifmgd->mtx); + + return NOTIFY_DONE; +} +#endif + +static int ieee80211_napi_poll(struct napi_struct *napi, int budget) +{ + struct ieee80211_local *local = + container_of(napi, struct ieee80211_local, napi); + + return local->ops->napi_poll(&local->hw, budget); +} + +void ieee80211_napi_schedule(struct ieee80211_hw *hw) +{ + 
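As an aside on the NAPI glue here: ieee80211_napi_poll() simply forwards to the driver's ops->napi_poll(hw, budget), while ieee80211_napi_schedule() and ieee80211_napi_complete() wrap napi_schedule()/napi_complete() on local->napi. The following is an illustrative, hypothetical driver-side sketch of how those helpers are expected to pair under the usual NAPI contract; the my_hw_* helpers are assumed stand-ins for real device code and are not part of this patch.

/* Illustrative sketch only: pairing the mac80211 NAPI helpers. */
static irqreturn_t my_wifi_isr(int irq, void *dev_id)
{
	struct ieee80211_hw *hw = dev_id;

	my_hw_mask_rx_irq(hw->priv);	/* hypothetical device helper */
	ieee80211_napi_schedule(hw);	/* defer RX processing to NAPI context */
	return IRQ_HANDLED;
}

/* Installed as ops->napi_poll; invoked via ieee80211_napi_poll(). */
static int my_wifi_napi_poll(struct ieee80211_hw *hw, int budget)
{
	int done = my_hw_rx_frames(hw->priv, budget);	/* hypothetical */

	if (done < budget) {
		ieee80211_napi_complete(hw);	/* all pending work consumed */
		my_hw_unmask_rx_irq(hw->priv);	/* hypothetical */
	}
	return done;
}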
struct ieee80211_local *local = hw_to_local(hw); + + napi_schedule(&local->napi); +} +EXPORT_SYMBOL(ieee80211_napi_schedule); + +void ieee80211_napi_complete(struct ieee80211_hw *hw) +{ + struct ieee80211_local *local = hw_to_local(hw); + + napi_complete(&local->napi); +} +EXPORT_SYMBOL(ieee80211_napi_complete); + +/* There isn't a lot of sense in it, but you can transmit anything you like */ +static const struct ieee80211_txrx_stypes +ieee80211_default_mgmt_stypes[NUM_NL80211_IFTYPES] = { + [NL80211_IFTYPE_ADHOC] = { + .tx = 0xffff, + .rx = BIT(IEEE80211_STYPE_ACTION >> 4), + }, + [NL80211_IFTYPE_STATION] = { + .tx = 0xffff, + .rx = BIT(IEEE80211_STYPE_ACTION >> 4) | + BIT(IEEE80211_STYPE_PROBE_REQ >> 4), + }, + [NL80211_IFTYPE_AP] = { + .tx = 0xffff, + .rx = BIT(IEEE80211_STYPE_ASSOC_REQ >> 4) | + BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) | + BIT(IEEE80211_STYPE_PROBE_REQ >> 4) | + BIT(IEEE80211_STYPE_DISASSOC >> 4) | + BIT(IEEE80211_STYPE_AUTH >> 4) | + BIT(IEEE80211_STYPE_DEAUTH >> 4) | + BIT(IEEE80211_STYPE_ACTION >> 4), + }, + [NL80211_IFTYPE_AP_VLAN] = { + /* copy AP */ + .tx = 0xffff, + .rx = BIT(IEEE80211_STYPE_ASSOC_REQ >> 4) | + BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) | + BIT(IEEE80211_STYPE_PROBE_REQ >> 4) | + BIT(IEEE80211_STYPE_DISASSOC >> 4) | + BIT(IEEE80211_STYPE_AUTH >> 4) | + BIT(IEEE80211_STYPE_DEAUTH >> 4) | + BIT(IEEE80211_STYPE_ACTION >> 4), + }, + [NL80211_IFTYPE_P2P_CLIENT] = { + .tx = 0xffff, + .rx = BIT(IEEE80211_STYPE_ACTION >> 4) | + BIT(IEEE80211_STYPE_PROBE_REQ >> 4), + }, + [NL80211_IFTYPE_P2P_GO] = { + .tx = 0xffff, + .rx = BIT(IEEE80211_STYPE_ASSOC_REQ >> 4) | + BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) | + BIT(IEEE80211_STYPE_PROBE_REQ >> 4) | + BIT(IEEE80211_STYPE_DISASSOC >> 4) | + BIT(IEEE80211_STYPE_AUTH >> 4) | + BIT(IEEE80211_STYPE_DEAUTH >> 4) | + BIT(IEEE80211_STYPE_ACTION >> 4), + }, + [NL80211_IFTYPE_MESH_POINT] = { + .tx = 0xffff, + .rx = BIT(IEEE80211_STYPE_ACTION >> 4) | + BIT(IEEE80211_STYPE_AUTH >> 4) | + BIT(IEEE80211_STYPE_DEAUTH >> 4), + }, +}; + +struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, + const struct ieee80211_ops *ops) +{ + struct ieee80211_local *local; + int priv_size, i; + struct wiphy *wiphy; + + /* Ensure 32-byte alignment of our private data and hw private data. 
+ * We use the wiphy priv data for both our ieee80211_local and for + * the driver's private data + * + * In memory it'll be like this: + * + * +-------------------------+ + * | struct wiphy | + * +-------------------------+ + * | struct ieee80211_local | + * +-------------------------+ + * | driver's private data | + * +-------------------------+ + * + */ + priv_size = ALIGN(sizeof(*local), NETDEV_ALIGN) + priv_data_len; + + wiphy = wiphy_new(&mac80211_config_ops, priv_size); + + if (!wiphy) + return NULL; + + wiphy->mgmt_stypes = ieee80211_default_mgmt_stypes; + + wiphy->privid = mac80211_wiphy_privid; + + wiphy->flags |= WIPHY_FLAG_NETNS_OK | + WIPHY_FLAG_4ADDR_AP | + WIPHY_FLAG_4ADDR_STATION; + + if (!ops->set_key) + wiphy->flags |= WIPHY_FLAG_IBSS_RSN; + + wiphy->bss_priv_size = sizeof(struct ieee80211_bss); + + local = wiphy_priv(wiphy); + + local->hw.wiphy = wiphy; + + local->hw.priv = (char *)local + ALIGN(sizeof(*local), NETDEV_ALIGN); + + BUG_ON(!ops->tx); + BUG_ON(!ops->start); + BUG_ON(!ops->stop); + BUG_ON(!ops->config); + BUG_ON(!ops->add_interface); + BUG_ON(!ops->remove_interface); + BUG_ON(!ops->configure_filter); + local->ops = ops; + + /* set up some defaults */ + local->hw.queues = 1; + local->hw.max_rates = 1; + local->hw.max_report_rates = 0; + local->hw.max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF; + local->hw.conf.long_frame_max_tx_count = wiphy->retry_long; + local->hw.conf.short_frame_max_tx_count = wiphy->retry_short; + local->user_power_level = -1; + local->uapsd_queues = IEEE80211_DEFAULT_UAPSD_QUEUES; + local->uapsd_max_sp_len = IEEE80211_DEFAULT_MAX_SP_LEN; + + INIT_LIST_HEAD(&local->interfaces); + + __hw_addr_init(&local->mc_list); + + mutex_init(&local->iflist_mtx); + mutex_init(&local->mtx); + + mutex_init(&local->key_mtx); + spin_lock_init(&local->filter_lock); + spin_lock_init(&local->queue_stop_reason_lock); + + /* + * The rx_skb_queue is only accessed from tasklets, + * but other SKB queues are used from within IRQ + * context. Therefore, this one needs a different + * locking class so our direct, non-irq-safe use of + * the queue's lock doesn't throw lockdep warnings. 
+ */ + skb_queue_head_init_class(&local->rx_skb_queue, + &ieee80211_rx_skb_queue_class); + + INIT_DELAYED_WORK(&local->scan_work, ieee80211_scan_work); + + ieee80211_work_init(local); + + INIT_WORK(&local->restart_work, ieee80211_restart_work); + + INIT_WORK(&local->reconfig_filter, ieee80211_reconfig_filter); + INIT_WORK(&local->recalc_smps, ieee80211_recalc_smps_work); + local->smps_mode = IEEE80211_SMPS_OFF; + + INIT_WORK(&local->dynamic_ps_enable_work, + ieee80211_dynamic_ps_enable_work); + INIT_WORK(&local->dynamic_ps_disable_work, + ieee80211_dynamic_ps_disable_work); + setup_timer(&local->dynamic_ps_timer, + ieee80211_dynamic_ps_timer, (unsigned long) local); + + INIT_WORK(&local->sched_scan_stopped_work, + ieee80211_sched_scan_stopped_work); + + sta_info_init(local); + + for (i = 0; i < IEEE80211_MAX_QUEUES; i++) { + skb_queue_head_init(&local->pending[i]); + atomic_set(&local->agg_queue_stop[i], 0); + } + tasklet_init(&local->tx_pending_tasklet, ieee80211_tx_pending, + (unsigned long)local); + + tasklet_init(&local->tasklet, + ieee80211_tasklet_handler, + (unsigned long) local); + + skb_queue_head_init(&local->skb_queue); + skb_queue_head_init(&local->skb_queue_unreliable); + + /* init dummy netdev for use w/ NAPI */ + init_dummy_netdev(&local->napi_dev); + + ieee80211_led_names(local); + + ieee80211_hw_roc_setup(local); + + return local_to_hw(local); +} +EXPORT_SYMBOL(ieee80211_alloc_hw); + +int ieee80211_register_hw(struct ieee80211_hw *hw) +{ + struct ieee80211_local *local = hw_to_local(hw); + int result, i; + enum ieee80211_band band; + int channels, max_bitrates; + bool supp_ht; + static const u32 cipher_suites[] = { + /* keep WEP first, it may be removed below */ + WLAN_CIPHER_SUITE_WEP40, + WLAN_CIPHER_SUITE_WEP104, + WLAN_CIPHER_SUITE_TKIP, + WLAN_CIPHER_SUITE_CCMP, + + /* keep last -- depends on hw flags! 
*/ + WLAN_CIPHER_SUITE_AES_CMAC + }; + + if ((hw->wiphy->wowlan.flags || hw->wiphy->wowlan.n_patterns) +#ifdef CONFIG_PM + && (!local->ops->suspend || !local->ops->resume) +#endif + ) + return -EINVAL; + + if (hw->max_report_rates == 0) + hw->max_report_rates = hw->max_rates; + + /* + * generic code guarantees at least one band, + * set this very early because much code assumes + * that hw.conf.channel is assigned + */ + channels = 0; + max_bitrates = 0; + supp_ht = false; + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + struct ieee80211_supported_band *sband; + + sband = local->hw.wiphy->bands[band]; + if (!sband) + continue; + if (!local->oper_channel) { + /* init channel we're on */ + local->hw.conf.channel = + local->oper_channel = &sband->channels[0]; + local->hw.conf.channel_type = NL80211_CHAN_NO_HT; + } + channels += sband->n_channels; + + if (max_bitrates < sband->n_bitrates) + max_bitrates = sband->n_bitrates; + supp_ht = supp_ht || sband->ht_cap.ht_supported; + } + + local->int_scan_req = kzalloc(sizeof(*local->int_scan_req) + + sizeof(void *) * channels, GFP_KERNEL); + if (!local->int_scan_req) + return -ENOMEM; + + /* if low-level driver supports AP, we also support VLAN */ + if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_AP)) { + hw->wiphy->interface_modes |= BIT(NL80211_IFTYPE_AP_VLAN); + hw->wiphy->software_iftypes |= BIT(NL80211_IFTYPE_AP_VLAN); + } + + /* mac80211 always supports monitor */ + hw->wiphy->interface_modes |= BIT(NL80211_IFTYPE_MONITOR); + hw->wiphy->software_iftypes |= BIT(NL80211_IFTYPE_MONITOR); + + /* + * mac80211 doesn't support more than 1 channel, and also not more + * than one IBSS interface + */ + for (i = 0; i < hw->wiphy->n_iface_combinations; i++) { + const struct ieee80211_iface_combination *c; + int j; + + c = &hw->wiphy->iface_combinations[i]; + + if (c->num_different_channels > 1) + return -EINVAL; + + for (j = 0; j < c->n_limits; j++) + if ((c->limits[j].types & BIT(NL80211_IFTYPE_ADHOC)) && + c->limits[j].max > 1) + return -EINVAL; + } + +#ifndef CONFIG_MAC80211_MESH + /* mesh depends on Kconfig, but drivers should set it if they want */ + local->hw.wiphy->interface_modes &= ~BIT(NL80211_IFTYPE_MESH_POINT); +#endif + + /* if the underlying driver supports mesh, mac80211 will (at least) + * provide routing of mesh authentication frames to userspace */ + if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_MESH_POINT)) + local->hw.wiphy->flags |= WIPHY_FLAG_MESH_AUTH; + + /* mac80211 supports control port protocol changing */ + local->hw.wiphy->flags |= WIPHY_FLAG_CONTROL_PORT_PROTOCOL; + + if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) + local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM; + else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) + local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_UNSPEC; + + WARN((local->hw.flags & IEEE80211_HW_SUPPORTS_UAPSD) + && (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK), + "U-APSD not supported with HW_PS_NULLFUNC_STACK\n"); + + /* + * Calculate scan IE length -- we need this to alloc + * memory and to subtract from the driver limit. It + * includes the DS Params, (extended) supported rates, and HT + * information -- SSID is the driver's responsibility. + */ + local->scan_ies_len = 4 + max_bitrates /* (ext) supp rates */ + + 3 /* DS Params */; + if (supp_ht) + local->scan_ies_len += 2 + sizeof(struct ieee80211_ht_cap); + + if (!local->ops->hw_scan) { + /* For hw_scan, driver needs to set these up. 
*/ + local->hw.wiphy->max_scan_ssids = 4; + local->hw.wiphy->max_scan_ie_len = IEEE80211_MAX_DATA_LEN; + } + + /* + * If the driver supports any scan IEs, then assume the + * limit includes the IEs mac80211 will add, otherwise + * leave it at zero and let the driver sort it out; we + * still pass our IEs to the driver but userspace will + * not be allowed to in that case. + */ + if (local->hw.wiphy->max_scan_ie_len) + local->hw.wiphy->max_scan_ie_len -= local->scan_ies_len; + + /* Set up cipher suites unless driver already did */ + if (!local->hw.wiphy->cipher_suites) { + local->hw.wiphy->cipher_suites = cipher_suites; + local->hw.wiphy->n_cipher_suites = ARRAY_SIZE(cipher_suites); + if (!(local->hw.flags & IEEE80211_HW_MFP_CAPABLE)) + local->hw.wiphy->n_cipher_suites--; + } + if (IS_ERR(local->wep_tx_tfm) || IS_ERR(local->wep_rx_tfm)) { + if (local->hw.wiphy->cipher_suites == cipher_suites) { + local->hw.wiphy->cipher_suites += 2; + local->hw.wiphy->n_cipher_suites -= 2; + } else { + u32 *suites; + int r, w = 0; + + /* Filter out WEP */ + + suites = kmemdup( + local->hw.wiphy->cipher_suites, + sizeof(u32) * local->hw.wiphy->n_cipher_suites, + GFP_KERNEL); + if (!suites) + return -ENOMEM; + for (r = 0; r < local->hw.wiphy->n_cipher_suites; r++) { + u32 suite = local->hw.wiphy->cipher_suites[r]; + if (suite == WLAN_CIPHER_SUITE_WEP40 || + suite == WLAN_CIPHER_SUITE_WEP104) + continue; + suites[w++] = suite; + } + local->hw.wiphy->cipher_suites = suites; + local->hw.wiphy->n_cipher_suites = w; + local->wiphy_ciphers_allocated = true; + } + } + + if (!local->ops->remain_on_channel) + local->hw.wiphy->max_remain_on_channel_duration = 5000; + + if (local->ops->sched_scan_start) + local->hw.wiphy->flags |= WIPHY_FLAG_SUPPORTS_SCHED_SCAN; + + result = wiphy_register(local->hw.wiphy); + if (result < 0) + goto fail_wiphy_register; + + /* + * We use the number of queues for feature tests (QoS, HT) internally + * so restrict them appropriately. + */ + if (hw->queues > IEEE80211_MAX_QUEUES) + hw->queues = IEEE80211_MAX_QUEUES; + + local->workqueue = + alloc_ordered_workqueue(wiphy_name(local->hw.wiphy), 0); + if (!local->workqueue) { + result = -ENOMEM; + goto fail_workqueue; + } + + /* + * The hardware needs headroom for sending the frame, + * and we need some headroom for passing the frame to monitor + * interfaces, but never both at the same time. 
+ */ +#ifndef __CHECKER__ + BUILD_BUG_ON(IEEE80211_TX_STATUS_HEADROOM != + sizeof(struct ieee80211_tx_status_rtap_hdr)); +#endif + local->tx_headroom = max_t(unsigned int , local->hw.extra_tx_headroom, + sizeof(struct ieee80211_tx_status_rtap_hdr)); + + debugfs_hw_add(local); + + /* + * if the driver doesn't specify a max listen interval we + * use 5 which should be a safe default + */ + if (local->hw.max_listen_interval == 0) + local->hw.max_listen_interval = 5; + + local->hw.conf.listen_interval = local->hw.max_listen_interval; + + local->dynamic_ps_forced_timeout = -1; + + result = ieee80211_wep_init(local); + if (result < 0) + wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n", + result); + + ieee80211_led_init(local); + + rtnl_lock(); + + result = ieee80211_init_rate_ctrl_alg(local, + hw->rate_control_algorithm); + if (result < 0) { + wiphy_debug(local->hw.wiphy, + "Failed to initialize rate control algorithm\n"); + goto fail_rate; + } + + /* add one default STA interface if supported */ + if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_STATION)) { + result = ieee80211_if_add(local, "wlan%d", NULL, + NL80211_IFTYPE_STATION, NULL); + if (result) + wiphy_warn(local->hw.wiphy, + "Failed to add default virtual iface\n"); + } + + rtnl_unlock(); + + local->network_latency_notifier.notifier_call = + ieee80211_max_network_latency; + result = pm_qos_add_notifier(PM_QOS_NETWORK_LATENCY, + &local->network_latency_notifier); + if (result) { + rtnl_lock(); + goto fail_pm_qos; + } + +#ifdef CONFIG_INET + local->ifa_notifier.notifier_call = ieee80211_ifa_changed; + result = register_inetaddr_notifier(&local->ifa_notifier); + if (result) + goto fail_ifa; +#endif + + netif_napi_add(&local->napi_dev, &local->napi, ieee80211_napi_poll, + local->hw.napi_weight); + + return 0; + +#ifdef CONFIG_INET + fail_ifa: + pm_qos_remove_notifier(PM_QOS_NETWORK_LATENCY, + &local->network_latency_notifier); + rtnl_lock(); +#endif + fail_pm_qos: + ieee80211_led_exit(local); + ieee80211_remove_interfaces(local); + fail_rate: + rtnl_unlock(); + ieee80211_wep_free(local); + sta_info_stop(local); + destroy_workqueue(local->workqueue); + fail_workqueue: + wiphy_unregister(local->hw.wiphy); + fail_wiphy_register: + if (local->wiphy_ciphers_allocated) + kfree(local->hw.wiphy->cipher_suites); + kfree(local->int_scan_req); + return result; +} +EXPORT_SYMBOL(ieee80211_register_hw); + +void ieee80211_unregister_hw(struct ieee80211_hw *hw) +{ + struct ieee80211_local *local = hw_to_local(hw); + + tasklet_kill(&local->tx_pending_tasklet); + tasklet_kill(&local->tasklet); + + pm_qos_remove_notifier(PM_QOS_NETWORK_LATENCY, + &local->network_latency_notifier); +#ifdef CONFIG_INET + unregister_inetaddr_notifier(&local->ifa_notifier); +#endif + + rtnl_lock(); + + /* + * At this point, interface list manipulations are fine + * because the driver cannot be handing us frames any + * more and the tasklet is killed. 
+ */ + ieee80211_remove_interfaces(local); + + rtnl_unlock(); + + /* + * Now all work items will be gone, but the + * timer might still be armed, so delete it + */ + del_timer_sync(&local->work_timer); + + cancel_work_sync(&local->restart_work); + cancel_work_sync(&local->reconfig_filter); + + ieee80211_clear_tx_pending(local); + sta_info_stop(local); + rate_control_deinitialize(local); + + if (skb_queue_len(&local->skb_queue) || + skb_queue_len(&local->skb_queue_unreliable)) + wiphy_warn(local->hw.wiphy, "skb_queue not empty\n"); + skb_queue_purge(&local->skb_queue); + skb_queue_purge(&local->skb_queue_unreliable); + skb_queue_purge(&local->rx_skb_queue); + + destroy_workqueue(local->workqueue); + wiphy_unregister(local->hw.wiphy); + ieee80211_wep_free(local); + ieee80211_led_exit(local); + kfree(local->int_scan_req); +} +EXPORT_SYMBOL(ieee80211_unregister_hw); + +void ieee80211_free_hw(struct ieee80211_hw *hw) +{ + struct ieee80211_local *local = hw_to_local(hw); + + mutex_destroy(&local->iflist_mtx); + mutex_destroy(&local->mtx); + + if (local->wiphy_ciphers_allocated) + kfree(local->hw.wiphy->cipher_suites); + + wiphy_free(local->hw.wiphy); +} +EXPORT_SYMBOL(ieee80211_free_hw); + +static int __init ieee80211_init(void) +{ + struct sk_buff *skb; + int ret; + + BUILD_BUG_ON(sizeof(struct ieee80211_tx_info) > sizeof(skb->cb)); + BUILD_BUG_ON(offsetof(struct ieee80211_tx_info, driver_data) + + IEEE80211_TX_INFO_DRIVER_DATA_SIZE > sizeof(skb->cb)); + + ret = rc80211_minstrel_init(); + if (ret) + return ret; + + ret = rc80211_minstrel_ht_init(); + if (ret) + goto err_minstrel; + + ret = rc80211_pid_init(); + if (ret) + goto err_pid; + + ret = ieee80211_iface_init(); + if (ret) + goto err_netdev; + + return 0; + err_netdev: + rc80211_pid_exit(); + err_pid: + rc80211_minstrel_ht_exit(); + err_minstrel: + rc80211_minstrel_exit(); + + return ret; +} + +static void __exit ieee80211_exit(void) +{ + rc80211_pid_exit(); + rc80211_minstrel_ht_exit(); + rc80211_minstrel_exit(); + + if (mesh_allocated) + ieee80211s_stop(); + + ieee80211_iface_exit(); + + rcu_barrier(); +} + + +subsys_initcall(ieee80211_init); +module_exit(ieee80211_exit); + +MODULE_DESCRIPTION("IEEE 802.11 subsystem"); +MODULE_LICENSE("GPL"); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c new file mode 100644 index 00000000..29e9980c --- /dev/null +++ b/net/mac80211/mesh.c @@ -0,0 +1,650 @@ +/* + * Copyright (c) 2008, 2009 open80211s Ltd. + * Authors: Luis Carlos Cobo + * Javier Cardona + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */
+
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+#include "ieee80211_i.h"
+#include "mesh.h"
+
+#define IEEE80211_MESH_PEER_INACTIVITY_LIMIT (1800 * HZ)
+#define IEEE80211_MESH_HOUSEKEEPING_INTERVAL (60 * HZ)
+#define IEEE80211_MESH_RANN_INTERVAL (1 * HZ)
+
+#define MESHCONF_CAPAB_ACCEPT_PLINKS 0x01
+#define MESHCONF_CAPAB_FORWARDING 0x08
+
+#define TMR_RUNNING_HK 0
+#define TMR_RUNNING_MP 1
+#define TMR_RUNNING_MPR 2
+
+int mesh_allocated;
+static struct kmem_cache *rm_cache;
+
+void ieee80211s_init(void)
+{
+ mesh_pathtbl_init();
+ mesh_allocated = 1;
+ rm_cache = kmem_cache_create("mesh_rmc", sizeof(struct rmc_entry),
+ 0, 0, NULL);
+}
+
+void ieee80211s_stop(void)
+{
+ mesh_pathtbl_unregister();
+ kmem_cache_destroy(rm_cache);
+}
+
+static void ieee80211_mesh_housekeeping_timer(unsigned long data)
+{
+ struct ieee80211_sub_if_data *sdata = (void *) data;
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+
+ set_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags);
+
+ if (local->quiescing) {
+ set_bit(TMR_RUNNING_HK, &ifmsh->timers_running);
+ return;
+ }
+
+ ieee80211_queue_work(&local->hw, &sdata->work);
+}
+
+/**
+ * mesh_matches_local - check if the config of a mesh point matches ours
+ *
+ * @ie: information elements of a management frame from the mesh peer
+ * @sdata: local mesh subif
+ *
+ * This function checks if the mesh configuration of a mesh point matches the
+ * local mesh configuration, i.e. if both nodes belong to the same mesh network.
+ */
+bool mesh_matches_local(struct ieee802_11_elems *ie, struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+
+ /*
+ * As support for each feature is added, check for matching
+ * - On mesh config capabilities
+ * - Power Save Support Enabled
+ * - Sync support enabled
+ * - Sync support active
+ * - Sync support required from peer
+ * - MDA enabled
+ * - Power management control on fc
+ */
+ if (ifmsh->mesh_id_len == ie->mesh_id_len &&
+ memcmp(ifmsh->mesh_id, ie->mesh_id, ie->mesh_id_len) == 0 &&
+ (ifmsh->mesh_pp_id == ie->mesh_config->meshconf_psel) &&
+ (ifmsh->mesh_pm_id == ie->mesh_config->meshconf_pmetric) &&
+ (ifmsh->mesh_cc_id == ie->mesh_config->meshconf_congest) &&
+ (ifmsh->mesh_sp_id == ie->mesh_config->meshconf_synch) &&
+ (ifmsh->mesh_auth_id == ie->mesh_config->meshconf_auth))
+ return true;
+
+ return false;
+}
+
+/**
+ * mesh_peer_accepts_plinks - check if an mp is willing to establish peer links
+ *
+ * @ie: information elements of a management frame from the mesh peer
+ */
+bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie)
+{
+ return (ie->mesh_config->meshconf_cap &
+ MESHCONF_CAPAB_ACCEPT_PLINKS) != 0;
+}
+
+/**
+ * mesh_accept_plinks_update: update accepting_plink in local mesh beacons
+ *
+ * @sdata: mesh interface in which mesh beacons are going to be updated
+ */
+void mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata)
+{
+ bool free_plinks;
+
+ /* In case mesh_plink_free_count > 0 and mesh_plinktbl_capacity == 0,
+ * the mesh interface might be able to establish plinks with peers that
+ * are already on the table but are not on PLINK_ESTAB state.
However, + * in general the mesh interface is not accepting peer link requests + * from new peers, and that must be reflected in the beacon + */ + free_plinks = mesh_plink_availables(sdata); + + if (free_plinks != sdata->u.mesh.accepting_plinks) + ieee80211_mesh_housekeeping_timer((unsigned long) sdata); +} + +int mesh_rmc_init(struct ieee80211_sub_if_data *sdata) +{ + int i; + + sdata->u.mesh.rmc = kmalloc(sizeof(struct mesh_rmc), GFP_KERNEL); + if (!sdata->u.mesh.rmc) + return -ENOMEM; + sdata->u.mesh.rmc->idx_mask = RMC_BUCKETS - 1; + for (i = 0; i < RMC_BUCKETS; i++) + INIT_LIST_HEAD(&sdata->u.mesh.rmc->bucket[i].list); + return 0; +} + +void mesh_rmc_free(struct ieee80211_sub_if_data *sdata) +{ + struct mesh_rmc *rmc = sdata->u.mesh.rmc; + struct rmc_entry *p, *n; + int i; + + if (!sdata->u.mesh.rmc) + return; + + for (i = 0; i < RMC_BUCKETS; i++) + list_for_each_entry_safe(p, n, &rmc->bucket[i].list, list) { + list_del(&p->list); + kmem_cache_free(rm_cache, p); + } + + kfree(rmc); + sdata->u.mesh.rmc = NULL; +} + +/** + * mesh_rmc_check - Check frame in recent multicast cache and add if absent. + * + * @sa: source address + * @mesh_hdr: mesh_header + * + * Returns: 0 if the frame is not in the cache, nonzero otherwise. + * + * Checks using the source address and the mesh sequence number if we have + * received this frame lately. If the frame is not in the cache, it is added to + * it. + */ +int mesh_rmc_check(u8 *sa, struct ieee80211s_hdr *mesh_hdr, + struct ieee80211_sub_if_data *sdata) +{ + struct mesh_rmc *rmc = sdata->u.mesh.rmc; + u32 seqnum = 0; + int entries = 0; + u8 idx; + struct rmc_entry *p, *n; + + /* Don't care about endianness since only match matters */ + memcpy(&seqnum, &mesh_hdr->seqnum, sizeof(mesh_hdr->seqnum)); + idx = le32_to_cpu(mesh_hdr->seqnum) & rmc->idx_mask; + list_for_each_entry_safe(p, n, &rmc->bucket[idx].list, list) { + ++entries; + if (time_after(jiffies, p->exp_time) || + (entries == RMC_QUEUE_MAX_LEN)) { + list_del(&p->list); + kmem_cache_free(rm_cache, p); + --entries; + } else if ((seqnum == p->seqnum) && + (memcmp(sa, p->sa, ETH_ALEN) == 0)) + return -1; + } + + p = kmem_cache_alloc(rm_cache, GFP_ATOMIC); + if (!p) { + printk(KERN_DEBUG "o11s: could not allocate RMC entry\n"); + return 0; + } + p->seqnum = seqnum; + p->exp_time = jiffies + RMC_TIMEOUT; + memcpy(p->sa, sa, ETH_ALEN); + list_add(&p->list, &rmc->bucket[idx].list); + return 0; +} + +void mesh_mgmt_ies_add(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_supported_band *sband; + u8 *pos; + int len, i, rate; + u8 neighbors; + + sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; + len = sband->n_bitrates; + if (len > 8) + len = 8; + pos = skb_put(skb, len + 2); + *pos++ = WLAN_EID_SUPP_RATES; + *pos++ = len; + for (i = 0; i < len; i++) { + rate = sband->bitrates[i].bitrate; + *pos++ = (u8) (rate / 5); + } + + if (sband->n_bitrates > len) { + pos = skb_put(skb, sband->n_bitrates - len + 2); + *pos++ = WLAN_EID_EXT_SUPP_RATES; + *pos++ = sband->n_bitrates - len; + for (i = len; i < sband->n_bitrates; i++) { + rate = sband->bitrates[i].bitrate; + *pos++ = (u8) (rate / 5); + } + } + + if (sband->band == IEEE80211_BAND_2GHZ) { + pos = skb_put(skb, 2 + 1); + *pos++ = WLAN_EID_DS_PARAMS; + *pos++ = 1; + *pos++ = ieee80211_frequency_to_channel(local->hw.conf.channel->center_freq); + } + + pos = skb_put(skb, 2 + sdata->u.mesh.mesh_id_len); + *pos++ = WLAN_EID_MESH_ID; + *pos++ = sdata->u.mesh.mesh_id_len; + 
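+ /*
+ * Note on the rate elements built above: mac80211 stores bitrates in
+ * units of 100 kbps while the (Extended) Supported Rates elements use
+ * 500 kbps units, hence the division by five (e.g. 54 Mbit/s has
+ * bitrate 540 and is encoded as 540 / 5 = 108). Only the first eight
+ * rates fit in WLAN_EID_SUPP_RATES; any remainder goes into
+ * WLAN_EID_EXT_SUPP_RATES.
+ */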
if (sdata->u.mesh.mesh_id_len) + memcpy(pos, sdata->u.mesh.mesh_id, sdata->u.mesh.mesh_id_len); + + pos = skb_put(skb, 2 + sizeof(struct ieee80211_meshconf_ie)); + *pos++ = WLAN_EID_MESH_CONFIG; + *pos++ = sizeof(struct ieee80211_meshconf_ie); + + /* Active path selection protocol ID */ + *pos++ = sdata->u.mesh.mesh_pp_id; + + /* Active path selection metric ID */ + *pos++ = sdata->u.mesh.mesh_pm_id; + + /* Congestion control mode identifier */ + *pos++ = sdata->u.mesh.mesh_cc_id; + + /* Synchronization protocol identifier */ + *pos++ = sdata->u.mesh.mesh_sp_id; + + /* Authentication Protocol identifier */ + *pos++ = sdata->u.mesh.mesh_auth_id; + + /* Mesh Formation Info - number of neighbors */ + neighbors = atomic_read(&sdata->u.mesh.mshstats.estab_plinks); + /* Number of neighbor mesh STAs or 15 whichever is smaller */ + neighbors = (neighbors > 15) ? 15 : neighbors; + *pos++ = neighbors << 1; + + /* Mesh capability */ + sdata->u.mesh.accepting_plinks = mesh_plink_availables(sdata); + *pos = MESHCONF_CAPAB_FORWARDING; + *pos++ |= sdata->u.mesh.accepting_plinks ? + MESHCONF_CAPAB_ACCEPT_PLINKS : 0x00; + *pos++ = 0x00; + + if (sdata->u.mesh.ie) { + int len = sdata->u.mesh.ie_len; + const u8 *data = sdata->u.mesh.ie; + if (skb_tailroom(skb) > len) + memcpy(skb_put(skb, len), data, len); + } +} + + +static void ieee80211_mesh_path_timer(unsigned long data) +{ + struct ieee80211_sub_if_data *sdata = + (struct ieee80211_sub_if_data *) data; + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + struct ieee80211_local *local = sdata->local; + + if (local->quiescing) { + set_bit(TMR_RUNNING_MP, &ifmsh->timers_running); + return; + } + + ieee80211_queue_work(&local->hw, &sdata->work); +} + +static void ieee80211_mesh_path_root_timer(unsigned long data) +{ + struct ieee80211_sub_if_data *sdata = + (struct ieee80211_sub_if_data *) data; + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + struct ieee80211_local *local = sdata->local; + + set_bit(MESH_WORK_ROOT, &ifmsh->wrkq_flags); + + if (local->quiescing) { + set_bit(TMR_RUNNING_MPR, &ifmsh->timers_running); + return; + } + + ieee80211_queue_work(&local->hw, &sdata->work); +} + +void ieee80211_mesh_root_setup(struct ieee80211_if_mesh *ifmsh) +{ + if (ifmsh->mshcfg.dot11MeshHWMPRootMode) + set_bit(MESH_WORK_ROOT, &ifmsh->wrkq_flags); + else { + clear_bit(MESH_WORK_ROOT, &ifmsh->wrkq_flags); + /* stop running timer */ + del_timer_sync(&ifmsh->mesh_path_root_timer); + } +} + +/** + * ieee80211_fill_mesh_addresses - fill addresses of a locally originated mesh frame + * @hdr: 802.11 frame header + * @fc: frame control field + * @meshda: destination address in the mesh + * @meshsa: source address address in the mesh. Same as TA, as frame is + * locally originated. 
+ * + * Return the length of the 802.11 (does not include a mesh control header) + */ +int ieee80211_fill_mesh_addresses(struct ieee80211_hdr *hdr, __le16 *fc, + const u8 *meshda, const u8 *meshsa) +{ + if (is_multicast_ether_addr(meshda)) { + *fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS); + /* DA TA SA */ + memcpy(hdr->addr1, meshda, ETH_ALEN); + memcpy(hdr->addr2, meshsa, ETH_ALEN); + memcpy(hdr->addr3, meshsa, ETH_ALEN); + return 24; + } else { + *fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | + IEEE80211_FCTL_TODS); + /* RA TA DA SA */ + memset(hdr->addr1, 0, ETH_ALEN); /* RA is resolved later */ + memcpy(hdr->addr2, meshsa, ETH_ALEN); + memcpy(hdr->addr3, meshda, ETH_ALEN); + memcpy(hdr->addr4, meshsa, ETH_ALEN); + return 30; + } +} + +/** + * ieee80211_new_mesh_header - create a new mesh header + * @meshhdr: uninitialized mesh header + * @sdata: mesh interface to be used + * @addr4or5: 1st address in the ae header, which may correspond to address 4 + * (if addr6 is NULL) or address 5 (if addr6 is present). It may + * be NULL. + * @addr6: 2nd address in the ae header, which corresponds to addr6 of the + * mesh frame + * + * Return the header length. + */ +int ieee80211_new_mesh_header(struct ieee80211s_hdr *meshhdr, + struct ieee80211_sub_if_data *sdata, char *addr4or5, + char *addr6) +{ + int aelen = 0; + BUG_ON(!addr4or5 && addr6); + memset(meshhdr, 0, sizeof(*meshhdr)); + meshhdr->ttl = sdata->u.mesh.mshcfg.dot11MeshTTL; + put_unaligned(cpu_to_le32(sdata->u.mesh.mesh_seqnum), &meshhdr->seqnum); + sdata->u.mesh.mesh_seqnum++; + if (addr4or5 && !addr6) { + meshhdr->flags |= MESH_FLAGS_AE_A4; + aelen += ETH_ALEN; + memcpy(meshhdr->eaddr1, addr4or5, ETH_ALEN); + } else if (addr4or5 && addr6) { + meshhdr->flags |= MESH_FLAGS_AE_A5_A6; + aelen += 2 * ETH_ALEN; + memcpy(meshhdr->eaddr1, addr4or5, ETH_ALEN); + memcpy(meshhdr->eaddr2, addr6, ETH_ALEN); + } + return 6 + aelen; +} + +static void ieee80211_mesh_housekeeping(struct ieee80211_sub_if_data *sdata, + struct ieee80211_if_mesh *ifmsh) +{ + bool free_plinks; + +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + printk(KERN_DEBUG "%s: running mesh housekeeping\n", + sdata->name); +#endif + + ieee80211_sta_expire(sdata, IEEE80211_MESH_PEER_INACTIVITY_LIMIT); + mesh_path_expire(sdata); + + free_plinks = mesh_plink_availables(sdata); + if (free_plinks != sdata->u.mesh.accepting_plinks) + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON); + + mod_timer(&ifmsh->housekeeping_timer, + round_jiffies(jiffies + IEEE80211_MESH_HOUSEKEEPING_INTERVAL)); +} + +static void ieee80211_mesh_rootpath(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + + mesh_path_tx_root_frame(sdata); + mod_timer(&ifmsh->mesh_path_root_timer, + round_jiffies(jiffies + IEEE80211_MESH_RANN_INTERVAL)); +} + +#ifdef CONFIG_PM +void ieee80211_mesh_quiesce(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + + /* use atomic bitops in case both timers fire at the same time */ + + if (del_timer_sync(&ifmsh->housekeeping_timer)) + set_bit(TMR_RUNNING_HK, &ifmsh->timers_running); + if (del_timer_sync(&ifmsh->mesh_path_timer)) + set_bit(TMR_RUNNING_MP, &ifmsh->timers_running); + if (del_timer_sync(&ifmsh->mesh_path_root_timer)) + set_bit(TMR_RUNNING_MPR, &ifmsh->timers_running); +} + +void ieee80211_mesh_restart(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + + if (test_and_clear_bit(TMR_RUNNING_HK, &ifmsh->timers_running)) + add_timer(&ifmsh->housekeeping_timer); + 
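+ /*
+ * Note: this mirrors ieee80211_mesh_quiesce() above. del_timer_sync()
+ * there returns true only if the timer was still pending, and exactly
+ * those timers are flagged in timers_running via the TMR_RUNNING_*
+ * bits, so only the timers that were armed at suspend time are
+ * re-armed here.
+ */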
if (test_and_clear_bit(TMR_RUNNING_MP, &ifmsh->timers_running)) + add_timer(&ifmsh->mesh_path_timer); + if (test_and_clear_bit(TMR_RUNNING_MPR, &ifmsh->timers_running)) + add_timer(&ifmsh->mesh_path_root_timer); + ieee80211_mesh_root_setup(ifmsh); +} +#endif + +void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + struct ieee80211_local *local = sdata->local; + + local->fif_other_bss++; + /* mesh ifaces must set allmulti to forward mcast traffic */ + atomic_inc(&local->iff_allmultis); + ieee80211_configure_filter(local); + + ifmsh->mesh_cc_id = 0; /* Disabled */ + ifmsh->mesh_sp_id = 0; /* Neighbor Offset */ + ifmsh->mesh_auth_id = 0; /* Disabled */ + set_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags); + ieee80211_mesh_root_setup(ifmsh); + ieee80211_queue_work(&local->hw, &sdata->work); + sdata->vif.bss_conf.beacon_int = MESH_DEFAULT_BEACON_INTERVAL; + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON | + BSS_CHANGED_BEACON_ENABLED | + BSS_CHANGED_BEACON_INT); +} + +void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + + ifmsh->mesh_id_len = 0; + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED); + sta_info_flush(local, NULL); + + del_timer_sync(&sdata->u.mesh.housekeeping_timer); + del_timer_sync(&sdata->u.mesh.mesh_path_root_timer); + /* + * If the timer fired while we waited for it, it will have + * requeued the work. Now the work will be running again + * but will not rearm the timer again because it checks + * whether the interface is running, which, at this point, + * it no longer is. + */ + cancel_work_sync(&sdata->work); + + local->fif_other_bss--; + atomic_dec(&local->iff_allmultis); + ieee80211_configure_filter(local); +} + +static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, + u16 stype, + struct ieee80211_mgmt *mgmt, + size_t len, + struct ieee80211_rx_status *rx_status) +{ + struct ieee80211_local *local = sdata->local; + struct ieee802_11_elems elems; + struct ieee80211_channel *channel; + u32 supp_rates = 0; + size_t baselen; + int freq; + enum ieee80211_band band = rx_status->band; + + /* ignore ProbeResp to foreign address */ + if (stype == IEEE80211_STYPE_PROBE_RESP && + compare_ether_addr(mgmt->da, sdata->vif.addr)) + return; + + baselen = (u8 *) mgmt->u.probe_resp.variable - (u8 *) mgmt; + if (baselen > len) + return; + + ieee802_11_parse_elems(mgmt->u.probe_resp.variable, len - baselen, + &elems); + + /* ignore beacons from secure mesh peers if our security is off */ + if (elems.rsn_len && sdata->u.mesh.security == IEEE80211_MESH_SEC_NONE) + return; + + if (elems.ds_params && elems.ds_params_len == 1) + freq = ieee80211_channel_to_frequency(elems.ds_params[0], band); + else + freq = rx_status->freq; + + channel = ieee80211_get_channel(local->hw.wiphy, freq); + + if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) + return; + + if (elems.mesh_id && elems.mesh_config && + mesh_matches_local(&elems, sdata)) { + supp_rates = ieee80211_sta_get_rates(local, &elems, band); + mesh_neighbour_update(mgmt->sa, supp_rates, sdata, &elems); + } +} + +static void ieee80211_mesh_rx_mgmt_action(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + size_t len, + struct ieee80211_rx_status *rx_status) +{ + switch (mgmt->u.action.category) { + case WLAN_CATEGORY_MESH_ACTION: + mesh_rx_plink_frame(sdata, mgmt, len, rx_status); + break; + 
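+ /*
+ * Peer link (plink) management frames are handled by the peering code
+ * via mesh_rx_plink_frame() above; path selection frames are handed
+ * to the HWMP implementation (mesh_hwmp.c) via
+ * mesh_rx_path_sel_frame() below.
+ */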
case WLAN_CATEGORY_MESH_PATH_SEL: + mesh_rx_path_sel_frame(sdata, mgmt, len); + break; + } +} + +void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_rx_status *rx_status; + struct ieee80211_mgmt *mgmt; + u16 stype; + + rx_status = IEEE80211_SKB_RXCB(skb); + mgmt = (struct ieee80211_mgmt *) skb->data; + stype = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE; + + switch (stype) { + case IEEE80211_STYPE_PROBE_RESP: + case IEEE80211_STYPE_BEACON: + ieee80211_mesh_rx_bcn_presp(sdata, stype, mgmt, skb->len, + rx_status); + break; + case IEEE80211_STYPE_ACTION: + ieee80211_mesh_rx_mgmt_action(sdata, mgmt, skb->len, rx_status); + break; + } +} + +void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + + if (ifmsh->preq_queue_len && + time_after(jiffies, + ifmsh->last_preq + msecs_to_jiffies(ifmsh->mshcfg.dot11MeshHWMPpreqMinInterval))) + mesh_path_start_discovery(sdata); + + if (test_and_clear_bit(MESH_WORK_GROW_MPATH_TABLE, &ifmsh->wrkq_flags)) + mesh_mpath_table_grow(); + + if (test_and_clear_bit(MESH_WORK_GROW_MPP_TABLE, &ifmsh->wrkq_flags)) + mesh_mpp_table_grow(); + + if (test_and_clear_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags)) + ieee80211_mesh_housekeeping(sdata, ifmsh); + + if (test_and_clear_bit(MESH_WORK_ROOT, &ifmsh->wrkq_flags)) + ieee80211_mesh_rootpath(sdata); +} + +void ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local) +{ + struct ieee80211_sub_if_data *sdata; + + rcu_read_lock(); + list_for_each_entry_rcu(sdata, &local->interfaces, list) + if (ieee80211_vif_is_mesh(&sdata->vif)) + ieee80211_queue_work(&local->hw, &sdata->work); + rcu_read_unlock(); +} + +void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + + setup_timer(&ifmsh->housekeeping_timer, + ieee80211_mesh_housekeeping_timer, + (unsigned long) sdata); + + ifmsh->accepting_plinks = true; + ifmsh->preq_id = 0; + ifmsh->sn = 0; + atomic_set(&ifmsh->mpaths, 0); + mesh_rmc_init(sdata); + ifmsh->last_preq = jiffies; + /* Allocate all mesh structures when creating the first mesh interface. */ + if (!mesh_allocated) + ieee80211s_init(); + setup_timer(&ifmsh->mesh_path_timer, + ieee80211_mesh_path_timer, + (unsigned long) sdata); + setup_timer(&ifmsh->mesh_path_root_timer, + ieee80211_mesh_path_root_timer, + (unsigned long) sdata); + INIT_LIST_HEAD(&ifmsh->preq_queue.list); + spin_lock_init(&ifmsh->mesh_preq_queue_lock); +} diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h new file mode 100644 index 00000000..249e7333 --- /dev/null +++ b/net/mac80211/mesh.h @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2008, 2009 open80211s Ltd. + * Authors: Luis Carlos Cobo + * Javier Cardona + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */
+
+#ifndef IEEE80211S_H
+#define IEEE80211S_H
+
+#include <linux/types.h>
+#include <linux/jhash.h>
+#include <asm/unaligned.h>
+#include "ieee80211_i.h"
+
+
+/* Data structures */
+
+/**
+ * enum mesh_path_flags - mac80211 mesh path flags
+ *
+ *
+ *
+ * @MESH_PATH_ACTIVE: the mesh path can be used for forwarding
+ * @MESH_PATH_RESOLVING: the discovery process is running for this mesh path
+ * @MESH_PATH_SN_VALID: the mesh path contains a valid destination sequence
+ * number
+ * @MESH_PATH_FIXED: the mesh path has been manually set and should not be
+ * modified
+ * @MESH_PATH_RESOLVED: the mesh path has been resolved
+ *
+ * MESH_PATH_RESOLVED is used by the mesh path timer to
+ * decide when to stop or cancel the mesh path discovery.
+ */
+enum mesh_path_flags {
+ MESH_PATH_ACTIVE = BIT(0),
+ MESH_PATH_RESOLVING = BIT(1),
+ MESH_PATH_SN_VALID = BIT(2),
+ MESH_PATH_FIXED = BIT(3),
+ MESH_PATH_RESOLVED = BIT(4),
+};
+
+/**
+ * enum mesh_deferred_task_flags - mac80211 mesh deferred tasks
+ *
+ *
+ *
+ * @MESH_WORK_HOUSEKEEPING: run the periodic mesh housekeeping tasks
+ * @MESH_WORK_GROW_MPATH_TABLE: the mesh path table is full and needs
+ * to grow.
+ * @MESH_WORK_GROW_MPP_TABLE: the mesh portals table is full and needs to
+ * grow
+ * @MESH_WORK_ROOT: the mesh root station needs to send a frame
+ */
+enum mesh_deferred_task_flags {
+ MESH_WORK_HOUSEKEEPING,
+ MESH_WORK_GROW_MPATH_TABLE,
+ MESH_WORK_GROW_MPP_TABLE,
+ MESH_WORK_ROOT,
+};
+
+/**
+ * struct mesh_path - mac80211 mesh path structure
+ *
+ * @dst: mesh path destination mac address
+ * @sdata: mesh subif
+ * @next_hop: mesh neighbor to which frames for this destination will be
+ * forwarded
+ * @timer: mesh path discovery timer
+ * @frame_queue: pending queue for frames sent to this destination while the
+ * path is unresolved
+ * @sn: target sequence number
+ * @metric: current metric to this destination
+ * @hop_count: hops to destination
+ * @exp_time: in jiffies, when the path will expire or when it expired
+ * @discovery_timeout: timeout (lapse in jiffies) used for the last discovery
+ * retry
+ * @discovery_retries: number of discovery retries
+ * @flags: mesh path flags, as specified on &enum mesh_path_flags
+ * @state_lock: mesh path state lock
+ *
+ *
+ * The combination of dst and sdata is unique in the mesh path table. Since the
+ * next_hop STA is only protected by RCU as well, deleting the STA must also
+ * remove/substitute the mesh_path structure and wait until that is no longer
+ * reachable before destroying the STA completely.
+ */ +struct mesh_path { + u8 dst[ETH_ALEN]; + u8 mpp[ETH_ALEN]; /* used for MPP or MAP */ + struct ieee80211_sub_if_data *sdata; + struct sta_info __rcu *next_hop; + struct timer_list timer; + struct sk_buff_head frame_queue; + struct rcu_head rcu; + u32 sn; + u32 metric; + u8 hop_count; + unsigned long exp_time; + u32 discovery_timeout; + u8 discovery_retries; + enum mesh_path_flags flags; + spinlock_t state_lock; +}; + +/** + * struct mesh_table + * + * @hash_buckets: array of hash buckets of the table + * @hashwlock: array of locks to protect write operations, one per bucket + * @hash_mask: 2^size_order - 1, used to compute hash idx + * @hash_rnd: random value used for hash computations + * @entries: number of entries in the table + * @free_node: function to free nodes of the table + * @copy_node: function to copy nodes of the table + * @size_order: determines size of the table, there will be 2^size_order hash + * buckets + * @mean_chain_len: maximum average length for the hash buckets' list, if it is + * reached, the table will grow + * rcu_head: RCU head to free the table + */ +struct mesh_table { + /* Number of buckets will be 2^N */ + struct hlist_head *hash_buckets; + spinlock_t *hashwlock; /* One per bucket, for add/del */ + unsigned int hash_mask; /* (2^size_order) - 1 */ + __u32 hash_rnd; /* Used for hash generation */ + atomic_t entries; /* Up to MAX_MESH_NEIGHBOURS */ + void (*free_node) (struct hlist_node *p, bool free_leafs); + int (*copy_node) (struct hlist_node *p, struct mesh_table *newtbl); + int size_order; + int mean_chain_len; + + struct rcu_head rcu_head; +}; + +/* Recent multicast cache */ +/* RMC_BUCKETS must be a power of 2, maximum 256 */ +#define RMC_BUCKETS 256 +#define RMC_QUEUE_MAX_LEN 4 +#define RMC_TIMEOUT (3 * HZ) + +/** + * struct rmc_entry - entry in the Recent Multicast Cache + * + * @seqnum: mesh sequence number of the frame + * @exp_time: expiration time of the entry, in jiffies + * @sa: source address of the frame + * + * The Recent Multicast Cache keeps track of the latest multicast frames that + * have been received by a mesh interface and discards received multicast frames + * that are found in the cache. 
+ */ +struct rmc_entry { + struct list_head list; + u32 seqnum; + unsigned long exp_time; + u8 sa[ETH_ALEN]; +}; + +struct mesh_rmc { + struct rmc_entry bucket[RMC_BUCKETS]; + u32 idx_mask; +}; + + +#define MESH_DEFAULT_BEACON_INTERVAL 1000 /* in 1024 us units */ + +#define MESH_PATH_EXPIRE (600 * HZ) + +/* Default maximum number of plinks per interface */ +#define MESH_MAX_PLINKS 256 + +/* Maximum number of paths per interface */ +#define MESH_MAX_MPATHS 1024 + +/* Pending ANA approval */ +#define MESH_PATH_SEL_ACTION 0 + +/* PERR reason codes */ +#define PEER_RCODE_UNSPECIFIED 11 +#define PERR_RCODE_NO_ROUTE 12 +#define PERR_RCODE_DEST_UNREACH 13 + +/* Public interfaces */ +/* Various */ +int ieee80211_fill_mesh_addresses(struct ieee80211_hdr *hdr, __le16 *fc, + const u8 *da, const u8 *sa); +int ieee80211_new_mesh_header(struct ieee80211s_hdr *meshhdr, + struct ieee80211_sub_if_data *sdata, char *addr4or5, + char *addr6); +int mesh_rmc_check(u8 *addr, struct ieee80211s_hdr *mesh_hdr, + struct ieee80211_sub_if_data *sdata); +bool mesh_matches_local(struct ieee802_11_elems *ie, + struct ieee80211_sub_if_data *sdata); +void mesh_ids_set_default(struct ieee80211_if_mesh *mesh); +void mesh_mgmt_ies_add(struct sk_buff *skb, + struct ieee80211_sub_if_data *sdata); +void mesh_rmc_free(struct ieee80211_sub_if_data *sdata); +int mesh_rmc_init(struct ieee80211_sub_if_data *sdata); +void ieee80211s_init(void); +void ieee80211s_update_metric(struct ieee80211_local *local, + struct sta_info *stainfo, struct sk_buff *skb); +void ieee80211s_stop(void); +void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata); +void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata); +void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata); +void ieee80211_mesh_root_setup(struct ieee80211_if_mesh *ifmsh); + +/* Mesh paths */ +int mesh_nexthop_lookup(struct sk_buff *skb, + struct ieee80211_sub_if_data *sdata); +void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata); +struct mesh_path *mesh_path_lookup(u8 *dst, + struct ieee80211_sub_if_data *sdata); +struct mesh_path *mpp_path_lookup(u8 *dst, + struct ieee80211_sub_if_data *sdata); +int mpp_path_add(u8 *dst, u8 *mpp, struct ieee80211_sub_if_data *sdata); +struct mesh_path *mesh_path_lookup_by_idx(int idx, + struct ieee80211_sub_if_data *sdata); +void mesh_path_fix_nexthop(struct mesh_path *mpath, struct sta_info *next_hop); +void mesh_path_expire(struct ieee80211_sub_if_data *sdata); +void mesh_path_flush(struct ieee80211_sub_if_data *sdata); +void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len); +int mesh_path_add(u8 *dst, struct ieee80211_sub_if_data *sdata); +/* Mesh plinks */ +void mesh_neighbour_update(u8 *hw_addr, u32 rates, + struct ieee80211_sub_if_data *sdata, + struct ieee802_11_elems *ie); +bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie); +void mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata); +void mesh_plink_broken(struct sta_info *sta); +void mesh_plink_deactivate(struct sta_info *sta); +int mesh_plink_open(struct sta_info *sta); +void mesh_plink_block(struct sta_info *sta); +void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len, + struct ieee80211_rx_status *rx_status); + +/* Private interfaces */ +/* Mesh tables */ +void mesh_mpath_table_grow(void); +void mesh_mpp_table_grow(void); +/* Mesh paths */ +int mesh_path_error_tx(u8 ttl, u8 *target, __le32 target_sn, __le16 target_rcode, + const u8 
*ra, struct ieee80211_sub_if_data *sdata);
+void mesh_path_assign_nexthop(struct mesh_path *mpath, struct sta_info *sta);
+void mesh_path_flush_pending(struct mesh_path *mpath);
+void mesh_path_tx_pending(struct mesh_path *mpath);
+int mesh_pathtbl_init(void);
+void mesh_pathtbl_unregister(void);
+int mesh_path_del(u8 *addr, struct ieee80211_sub_if_data *sdata);
+void mesh_path_timer(unsigned long data);
+void mesh_path_flush_by_nexthop(struct sta_info *sta);
+void mesh_path_discard_frame(struct sk_buff *skb,
+ struct ieee80211_sub_if_data *sdata);
+void mesh_path_quiesce(struct ieee80211_sub_if_data *sdata);
+void mesh_path_restart(struct ieee80211_sub_if_data *sdata);
+void mesh_path_tx_root_frame(struct ieee80211_sub_if_data *sdata);
+
+extern int mesh_paths_generation;
+
+#ifdef CONFIG_MAC80211_MESH
+extern int mesh_allocated;
+
+static inline int mesh_plink_free_count(struct ieee80211_sub_if_data *sdata)
+{
+ return sdata->u.mesh.mshcfg.dot11MeshMaxPeerLinks -
+ atomic_read(&sdata->u.mesh.mshstats.estab_plinks);
+}
+
+static inline bool mesh_plink_availables(struct ieee80211_sub_if_data *sdata)
+{
+ return (min_t(long, mesh_plink_free_count(sdata),
+ MESH_MAX_PLINKS - sdata->local->num_sta)) > 0;
+}
+
+static inline void mesh_path_activate(struct mesh_path *mpath)
+{
+ mpath->flags |= MESH_PATH_ACTIVE | MESH_PATH_RESOLVED;
+}
+
+static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata)
+{
+ return sdata->u.mesh.mesh_pp_id == IEEE80211_PATH_PROTOCOL_HWMP;
+}
+
+void ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local);
+
+void ieee80211_mesh_quiesce(struct ieee80211_sub_if_data *sdata);
+void ieee80211_mesh_restart(struct ieee80211_sub_if_data *sdata);
+void mesh_plink_quiesce(struct sta_info *sta);
+void mesh_plink_restart(struct sta_info *sta);
+#else
+#define mesh_allocated 0
+static inline void
+ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local) {}
+static inline void ieee80211_mesh_quiesce(struct ieee80211_sub_if_data *sdata)
+{}
+static inline void ieee80211_mesh_restart(struct ieee80211_sub_if_data *sdata)
+{}
+static inline void mesh_plink_quiesce(struct sta_info *sta) {}
+static inline void mesh_plink_restart(struct sta_info *sta) {}
+static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata)
+{ return false; }
+#endif
+
+#endif /* IEEE80211S_H */
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
new file mode 100644
index 00000000..2b180530
--- /dev/null
+++ b/net/mac80211/mesh_hwmp.c
@@ -0,0 +1,1016 @@
+/*
+ * Copyright (c) 2008, 2009 open80211s Ltd.
+ * Author: Luis Carlos Cobo
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/slab.h>
+#include "mesh.h"
+
+#ifdef CONFIG_MAC80211_VERBOSE_MHWMP_DEBUG
+#define mhwmp_dbg(fmt, args...) printk(KERN_DEBUG "Mesh HWMP: " fmt, ##args)
+#else
+#define mhwmp_dbg(fmt, args...)
do { (void)(0); } while (0) +#endif + +#define TEST_FRAME_LEN 8192 +#define MAX_METRIC 0xffffffff +#define ARITH_SHIFT 8 + +/* Number of frames buffered per destination for unresolved destinations */ +#define MESH_FRAME_QUEUE_LEN 10 +#define MAX_PREQ_QUEUE_LEN 64 + +/* Destination only */ +#define MP_F_DO 0x1 +/* Reply and forward */ +#define MP_F_RF 0x2 +/* Unknown Sequence Number */ +#define MP_F_USN 0x01 +/* Reason code Present */ +#define MP_F_RCODE 0x02 + +static void mesh_queue_preq(struct mesh_path *, u8); + +static inline u32 u32_field_get(u8 *preq_elem, int offset, bool ae) +{ + if (ae) + offset += 6; + return get_unaligned_le32(preq_elem + offset); +} + +static inline u32 u16_field_get(u8 *preq_elem, int offset, bool ae) +{ + if (ae) + offset += 6; + return get_unaligned_le16(preq_elem + offset); +} + +/* HWMP IE processing macros */ +#define AE_F (1<<6) +#define AE_F_SET(x) (*x & AE_F) +#define PREQ_IE_FLAGS(x) (*(x)) +#define PREQ_IE_HOPCOUNT(x) (*(x + 1)) +#define PREQ_IE_TTL(x) (*(x + 2)) +#define PREQ_IE_PREQ_ID(x) u32_field_get(x, 3, 0) +#define PREQ_IE_ORIG_ADDR(x) (x + 7) +#define PREQ_IE_ORIG_SN(x) u32_field_get(x, 13, 0); +#define PREQ_IE_LIFETIME(x) u32_field_get(x, 17, AE_F_SET(x)); +#define PREQ_IE_METRIC(x) u32_field_get(x, 21, AE_F_SET(x)); +#define PREQ_IE_TARGET_F(x) (*(AE_F_SET(x) ? x + 32 : x + 26)) +#define PREQ_IE_TARGET_ADDR(x) (AE_F_SET(x) ? x + 33 : x + 27) +#define PREQ_IE_TARGET_SN(x) u32_field_get(x, 33, AE_F_SET(x)); + + +#define PREP_IE_FLAGS(x) PREQ_IE_FLAGS(x) +#define PREP_IE_HOPCOUNT(x) PREQ_IE_HOPCOUNT(x) +#define PREP_IE_TTL(x) PREQ_IE_TTL(x) +#define PREP_IE_ORIG_ADDR(x) (x + 3) +#define PREP_IE_ORIG_SN(x) u32_field_get(x, 9, 0); +#define PREP_IE_LIFETIME(x) u32_field_get(x, 13, AE_F_SET(x)); +#define PREP_IE_METRIC(x) u32_field_get(x, 17, AE_F_SET(x)); +#define PREP_IE_TARGET_ADDR(x) (AE_F_SET(x) ? 
x + 27 : x + 21) +#define PREP_IE_TARGET_SN(x) u32_field_get(x, 27, AE_F_SET(x)); + +#define PERR_IE_TTL(x) (*(x)) +#define PERR_IE_TARGET_FLAGS(x) (*(x + 2)) +#define PERR_IE_TARGET_ADDR(x) (x + 3) +#define PERR_IE_TARGET_SN(x) u32_field_get(x, 9, 0); +#define PERR_IE_TARGET_RCODE(x) u16_field_get(x, 13, 0); + +#define MSEC_TO_TU(x) (x*1000/1024) +#define SN_GT(x, y) ((long) (y) - (long) (x) < 0) +#define SN_LT(x, y) ((long) (x) - (long) (y) < 0) + +#define net_traversal_jiffies(s) \ + msecs_to_jiffies(s->u.mesh.mshcfg.dot11MeshHWMPnetDiameterTraversalTime) +#define default_lifetime(s) \ + MSEC_TO_TU(s->u.mesh.mshcfg.dot11MeshHWMPactivePathTimeout) +#define min_preq_int_jiff(s) \ + (msecs_to_jiffies(s->u.mesh.mshcfg.dot11MeshHWMPpreqMinInterval)) +#define max_preq_retries(s) (s->u.mesh.mshcfg.dot11MeshHWMPmaxPREQretries) +#define disc_timeout_jiff(s) \ + msecs_to_jiffies(sdata->u.mesh.mshcfg.min_discovery_timeout) + +enum mpath_frame_type { + MPATH_PREQ = 0, + MPATH_PREP, + MPATH_PERR, + MPATH_RANN +}; + +static const u8 broadcast_addr[ETH_ALEN] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + +static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags, + u8 *orig_addr, __le32 orig_sn, u8 target_flags, u8 *target, + __le32 target_sn, const u8 *da, u8 hop_count, u8 ttl, + __le32 lifetime, __le32 metric, __le32 preq_id, + struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400); + struct ieee80211_mgmt *mgmt; + u8 *pos; + int ie_len; + + if (!skb) + return -1; + skb_reserve(skb, local->hw.extra_tx_headroom); + /* 25 is the size of the common mgmt part (24) plus the size of the + * common action part (1) + */ + mgmt = (struct ieee80211_mgmt *) + skb_put(skb, 25 + sizeof(mgmt->u.action.u.mesh_action)); + memset(mgmt, 0, 25 + sizeof(mgmt->u.action.u.mesh_action)); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + + memcpy(mgmt->da, da, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + /* BSSID == SA */ + memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); + mgmt->u.action.category = WLAN_CATEGORY_MESH_PATH_SEL; + mgmt->u.action.u.mesh_action.action_code = MESH_PATH_SEL_ACTION; + + switch (action) { + case MPATH_PREQ: + mhwmp_dbg("sending PREQ to %pM\n", target); + ie_len = 37; + pos = skb_put(skb, 2 + ie_len); + *pos++ = WLAN_EID_PREQ; + break; + case MPATH_PREP: + mhwmp_dbg("sending PREP to %pM\n", target); + ie_len = 31; + pos = skb_put(skb, 2 + ie_len); + *pos++ = WLAN_EID_PREP; + break; + case MPATH_RANN: + mhwmp_dbg("sending RANN from %pM\n", orig_addr); + ie_len = sizeof(struct ieee80211_rann_ie); + pos = skb_put(skb, 2 + ie_len); + *pos++ = WLAN_EID_RANN; + break; + default: + kfree_skb(skb); + return -ENOTSUPP; + break; + } + *pos++ = ie_len; + *pos++ = flags; + *pos++ = hop_count; + *pos++ = ttl; + if (action == MPATH_PREQ) { + memcpy(pos, &preq_id, 4); + pos += 4; + } + memcpy(pos, orig_addr, ETH_ALEN); + pos += ETH_ALEN; + memcpy(pos, &orig_sn, 4); + pos += 4; + if (action != MPATH_RANN) { + memcpy(pos, &lifetime, 4); + pos += 4; + } + memcpy(pos, &metric, 4); + pos += 4; + if (action == MPATH_PREQ) { + /* destination count */ + *pos++ = 1; + *pos++ = target_flags; + } + if (action != MPATH_RANN) { + memcpy(pos, target, ETH_ALEN); + pos += ETH_ALEN; + memcpy(pos, &target_sn, 4); + } + + ieee80211_tx_skb(sdata, skb); + return 0; +} + +/** + * mesh_send_path error - Sends a PERR mesh management frame + * + * @target: broken 
destination + * @target_sn: SN of the broken destination + * @target_rcode: reason code for this PERR + * @ra: node this frame is addressed to + */ +int mesh_path_error_tx(u8 ttl, u8 *target, __le32 target_sn, + __le16 target_rcode, const u8 *ra, + struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400); + struct ieee80211_mgmt *mgmt; + u8 *pos; + int ie_len; + + if (!skb) + return -1; + skb_reserve(skb, local->hw.extra_tx_headroom); + /* 25 is the size of the common mgmt part (24) plus the size of the + * common action part (1) + */ + mgmt = (struct ieee80211_mgmt *) + skb_put(skb, 25 + sizeof(mgmt->u.action.u.mesh_action)); + memset(mgmt, 0, 25 + sizeof(mgmt->u.action.u.mesh_action)); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + + memcpy(mgmt->da, ra, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + /* BSSID is left zeroed, wildcard value */ + mgmt->u.action.category = WLAN_CATEGORY_MESH_PATH_SEL; + mgmt->u.action.u.mesh_action.action_code = MESH_PATH_SEL_ACTION; + ie_len = 15; + pos = skb_put(skb, 2 + ie_len); + *pos++ = WLAN_EID_PERR; + *pos++ = ie_len; + /* ttl */ + *pos++ = ttl; + /* number of destinations */ + *pos++ = 1; + /* + * flags bit, bit 1 is unset if we know the sequence number and + * bit 2 is set if we have a reason code + */ + *pos = 0; + if (!target_sn) + *pos |= MP_F_USN; + if (target_rcode) + *pos |= MP_F_RCODE; + pos++; + memcpy(pos, target, ETH_ALEN); + pos += ETH_ALEN; + memcpy(pos, &target_sn, 4); + pos += 4; + memcpy(pos, &target_rcode, 2); + + ieee80211_tx_skb(sdata, skb); + return 0; +} + +void ieee80211s_update_metric(struct ieee80211_local *local, + struct sta_info *stainfo, struct sk_buff *skb) +{ + struct ieee80211_tx_info *txinfo = IEEE80211_SKB_CB(skb); + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + int failed; + + if (!ieee80211_is_data(hdr->frame_control)) + return; + + failed = !(txinfo->flags & IEEE80211_TX_STAT_ACK); + + /* moving average, scaled to 100 */ + stainfo->fail_avg = ((80 * stainfo->fail_avg + 5) / 100 + 20 * failed); + if (stainfo->fail_avg > 95) + mesh_plink_broken(stainfo); +} + +static u32 airtime_link_metric_get(struct ieee80211_local *local, + struct sta_info *sta) +{ + struct ieee80211_supported_band *sband; + /* This should be adjusted for each device */ + int device_constant = 1 << ARITH_SHIFT; + int test_frame_len = TEST_FRAME_LEN << ARITH_SHIFT; + int s_unit = 1 << ARITH_SHIFT; + int rate, err; + u32 tx_time, estimated_retx; + u64 result; + + sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; + + if (sta->fail_avg >= 100) + return MAX_METRIC; + + if (sta->last_tx_rate.flags & IEEE80211_TX_RC_MCS) + return MAX_METRIC; + + err = (sta->fail_avg << ARITH_SHIFT) / 100; + + /* bitrate is in units of 100 Kbps, while we need rate in units of + * 1Mbps. This will be corrected on tx_time computation. 
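+ *
+ * Illustrative numbers: with ARITH_SHIFT == 8 the code below computes,
+ * in fixed point, tx_time = O + Bt/r and scales it by the expected
+ * retransmission count 1 / (1 - fail_avg/100). For a clean link
+ * (fail_avg == 0) at 54 Mbps (bitrate == 540), tx_time is roughly
+ * 256 + 10 * (8192 << 8) / 540 ~= 39000 and the resulting metric is
+ * about 152; slower or lossier links yield proportionally larger
+ * metrics.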
+ */ + rate = sband->bitrates[sta->last_tx_rate.idx].bitrate; + tx_time = (device_constant + 10 * test_frame_len / rate); + estimated_retx = ((1 << (2 * ARITH_SHIFT)) / (s_unit - err)); + result = (tx_time * estimated_retx) >> (2 * ARITH_SHIFT) ; + return (u32)result; +} + +/** + * hwmp_route_info_get - Update routing info to originator and transmitter + * + * @sdata: local mesh subif + * @mgmt: mesh management frame + * @hwmp_ie: hwmp information element (PREP or PREQ) + * + * This function updates the path routing information to the originator and the + * transmitter of a HWMP PREQ or PREP frame. + * + * Returns: metric to frame originator or 0 if the frame should not be further + * processed + * + * Notes: this function is the only place (besides user-provided info) where + * path routing information is updated. + */ +static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + u8 *hwmp_ie, enum mpath_frame_type action) +{ + struct ieee80211_local *local = sdata->local; + struct mesh_path *mpath; + struct sta_info *sta; + bool fresh_info; + u8 *orig_addr, *ta; + u32 orig_sn, orig_metric; + unsigned long orig_lifetime, exp_time; + u32 last_hop_metric, new_metric; + bool process = true; + + rcu_read_lock(); + sta = sta_info_get(sdata, mgmt->sa); + if (!sta) { + rcu_read_unlock(); + return 0; + } + + last_hop_metric = airtime_link_metric_get(local, sta); + /* Update and check originator routing info */ + fresh_info = true; + + switch (action) { + case MPATH_PREQ: + orig_addr = PREQ_IE_ORIG_ADDR(hwmp_ie); + orig_sn = PREQ_IE_ORIG_SN(hwmp_ie); + orig_lifetime = PREQ_IE_LIFETIME(hwmp_ie); + orig_metric = PREQ_IE_METRIC(hwmp_ie); + break; + case MPATH_PREP: + /* Originator here refers to the MP that was the destination in + * the Path Request. The draft refers to that MP as the + * destination address, even though usually it is the origin of + * the PREP frame. We divert from the nomenclature in the draft + * so that we can easily use a single function to gather path + * information from both PREQ and PREP frames. + */ + orig_addr = PREP_IE_ORIG_ADDR(hwmp_ie); + orig_sn = PREP_IE_ORIG_SN(hwmp_ie); + orig_lifetime = PREP_IE_LIFETIME(hwmp_ie); + orig_metric = PREP_IE_METRIC(hwmp_ie); + break; + default: + rcu_read_unlock(); + return 0; + } + new_metric = orig_metric + last_hop_metric; + if (new_metric < orig_metric) + new_metric = MAX_METRIC; + exp_time = TU_TO_EXP_TIME(orig_lifetime); + + if (memcmp(orig_addr, sdata->vif.addr, ETH_ALEN) == 0) { + /* This MP is the originator, we are not interested in this + * frame, except for updating transmitter's path info. + */ + process = false; + fresh_info = false; + } else { + mpath = mesh_path_lookup(orig_addr, sdata); + if (mpath) { + spin_lock_bh(&mpath->state_lock); + if (mpath->flags & MESH_PATH_FIXED) + fresh_info = false; + else if ((mpath->flags & MESH_PATH_ACTIVE) && + (mpath->flags & MESH_PATH_SN_VALID)) { + if (SN_GT(mpath->sn, orig_sn) || + (mpath->sn == orig_sn && + new_metric >= mpath->metric)) { + process = false; + fresh_info = false; + } + } + } else { + mesh_path_add(orig_addr, sdata); + mpath = mesh_path_lookup(orig_addr, sdata); + if (!mpath) { + rcu_read_unlock(); + return 0; + } + spin_lock_bh(&mpath->state_lock); + } + + if (fresh_info) { + mesh_path_assign_nexthop(mpath, sta); + mpath->flags |= MESH_PATH_SN_VALID; + mpath->metric = new_metric; + mpath->sn = orig_sn; + mpath->exp_time = time_after(mpath->exp_time, exp_time) + ? 
mpath->exp_time : exp_time; + mesh_path_activate(mpath); + spin_unlock_bh(&mpath->state_lock); + mesh_path_tx_pending(mpath); + /* draft says preq_id should be saved to, but there does + * not seem to be any use for it, skipping by now + */ + } else + spin_unlock_bh(&mpath->state_lock); + } + + /* Update and check transmitter routing info */ + ta = mgmt->sa; + if (memcmp(orig_addr, ta, ETH_ALEN) == 0) + fresh_info = false; + else { + fresh_info = true; + + mpath = mesh_path_lookup(ta, sdata); + if (mpath) { + spin_lock_bh(&mpath->state_lock); + if ((mpath->flags & MESH_PATH_FIXED) || + ((mpath->flags & MESH_PATH_ACTIVE) && + (last_hop_metric > mpath->metric))) + fresh_info = false; + } else { + mesh_path_add(ta, sdata); + mpath = mesh_path_lookup(ta, sdata); + if (!mpath) { + rcu_read_unlock(); + return 0; + } + spin_lock_bh(&mpath->state_lock); + } + + if (fresh_info) { + mesh_path_assign_nexthop(mpath, sta); + mpath->flags &= ~MESH_PATH_SN_VALID; + mpath->metric = last_hop_metric; + mpath->exp_time = time_after(mpath->exp_time, exp_time) + ? mpath->exp_time : exp_time; + mesh_path_activate(mpath); + spin_unlock_bh(&mpath->state_lock); + mesh_path_tx_pending(mpath); + } else + spin_unlock_bh(&mpath->state_lock); + } + + rcu_read_unlock(); + + return process ? new_metric : 0; +} + +static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + u8 *preq_elem, u32 metric) +{ + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + struct mesh_path *mpath; + u8 *target_addr, *orig_addr; + u8 target_flags, ttl; + u32 orig_sn, target_sn, lifetime; + bool reply = false; + bool forward = true; + + /* Update target SN, if present */ + target_addr = PREQ_IE_TARGET_ADDR(preq_elem); + orig_addr = PREQ_IE_ORIG_ADDR(preq_elem); + target_sn = PREQ_IE_TARGET_SN(preq_elem); + orig_sn = PREQ_IE_ORIG_SN(preq_elem); + target_flags = PREQ_IE_TARGET_F(preq_elem); + + mhwmp_dbg("received PREQ from %pM\n", orig_addr); + + if (memcmp(target_addr, sdata->vif.addr, ETH_ALEN) == 0) { + mhwmp_dbg("PREQ is for us\n"); + forward = false; + reply = true; + metric = 0; + if (time_after(jiffies, ifmsh->last_sn_update + + net_traversal_jiffies(sdata)) || + time_before(jiffies, ifmsh->last_sn_update)) { + target_sn = ++ifmsh->sn; + ifmsh->last_sn_update = jiffies; + } + } else { + rcu_read_lock(); + mpath = mesh_path_lookup(target_addr, sdata); + if (mpath) { + if ((!(mpath->flags & MESH_PATH_SN_VALID)) || + SN_LT(mpath->sn, target_sn)) { + mpath->sn = target_sn; + mpath->flags |= MESH_PATH_SN_VALID; + } else if ((!(target_flags & MP_F_DO)) && + (mpath->flags & MESH_PATH_ACTIVE)) { + reply = true; + metric = mpath->metric; + target_sn = mpath->sn; + if (target_flags & MP_F_RF) + target_flags |= MP_F_DO; + else + forward = false; + } + } + rcu_read_unlock(); + } + + if (reply) { + lifetime = PREQ_IE_LIFETIME(preq_elem); + ttl = ifmsh->mshcfg.element_ttl; + if (ttl != 0) { + mhwmp_dbg("replying to the PREQ\n"); + mesh_path_sel_frame_tx(MPATH_PREP, 0, target_addr, + cpu_to_le32(target_sn), 0, orig_addr, + cpu_to_le32(orig_sn), mgmt->sa, 0, ttl, + cpu_to_le32(lifetime), cpu_to_le32(metric), + 0, sdata); + } else + ifmsh->mshstats.dropped_frames_ttl++; + } + + if (forward) { + u32 preq_id; + u8 hopcount, flags; + + ttl = PREQ_IE_TTL(preq_elem); + lifetime = PREQ_IE_LIFETIME(preq_elem); + if (ttl <= 1) { + ifmsh->mshstats.dropped_frames_ttl++; + return; + } + mhwmp_dbg("forwarding the PREQ from %pM\n", orig_addr); + --ttl; + flags = PREQ_IE_FLAGS(preq_elem); + preq_id = 
PREQ_IE_PREQ_ID(preq_elem); + hopcount = PREQ_IE_HOPCOUNT(preq_elem) + 1; + mesh_path_sel_frame_tx(MPATH_PREQ, flags, orig_addr, + cpu_to_le32(orig_sn), target_flags, target_addr, + cpu_to_le32(target_sn), broadcast_addr, + hopcount, ttl, cpu_to_le32(lifetime), + cpu_to_le32(metric), cpu_to_le32(preq_id), + sdata); + ifmsh->mshstats.fwded_mcast++; + ifmsh->mshstats.fwded_frames++; + } +} + + +static inline struct sta_info * +next_hop_deref_protected(struct mesh_path *mpath) +{ + return rcu_dereference_protected(mpath->next_hop, + lockdep_is_held(&mpath->state_lock)); +} + + +static void hwmp_prep_frame_process(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + u8 *prep_elem, u32 metric) +{ + struct mesh_path *mpath; + u8 *target_addr, *orig_addr; + u8 ttl, hopcount, flags; + u8 next_hop[ETH_ALEN]; + u32 target_sn, orig_sn, lifetime; + + mhwmp_dbg("received PREP from %pM\n", PREP_IE_ORIG_ADDR(prep_elem)); + + /* Note that we divert from the draft nomenclature and denominate + * destination to what the draft refers to as origininator. So in this + * function destnation refers to the final destination of the PREP, + * which corresponds with the originator of the PREQ which this PREP + * replies + */ + target_addr = PREP_IE_TARGET_ADDR(prep_elem); + if (memcmp(target_addr, sdata->vif.addr, ETH_ALEN) == 0) + /* destination, no forwarding required */ + return; + + ttl = PREP_IE_TTL(prep_elem); + if (ttl <= 1) { + sdata->u.mesh.mshstats.dropped_frames_ttl++; + return; + } + + rcu_read_lock(); + mpath = mesh_path_lookup(target_addr, sdata); + if (mpath) + spin_lock_bh(&mpath->state_lock); + else + goto fail; + if (!(mpath->flags & MESH_PATH_ACTIVE)) { + spin_unlock_bh(&mpath->state_lock); + goto fail; + } + memcpy(next_hop, next_hop_deref_protected(mpath)->sta.addr, ETH_ALEN); + spin_unlock_bh(&mpath->state_lock); + --ttl; + flags = PREP_IE_FLAGS(prep_elem); + lifetime = PREP_IE_LIFETIME(prep_elem); + hopcount = PREP_IE_HOPCOUNT(prep_elem) + 1; + orig_addr = PREP_IE_ORIG_ADDR(prep_elem); + target_sn = PREP_IE_TARGET_SN(prep_elem); + orig_sn = PREP_IE_ORIG_SN(prep_elem); + + mesh_path_sel_frame_tx(MPATH_PREP, flags, orig_addr, + cpu_to_le32(orig_sn), 0, target_addr, + cpu_to_le32(target_sn), next_hop, hopcount, + ttl, cpu_to_le32(lifetime), cpu_to_le32(metric), + 0, sdata); + rcu_read_unlock(); + + sdata->u.mesh.mshstats.fwded_unicast++; + sdata->u.mesh.mshstats.fwded_frames++; + return; + +fail: + rcu_read_unlock(); + sdata->u.mesh.mshstats.dropped_frames_no_route++; +} + +static void hwmp_perr_frame_process(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, u8 *perr_elem) +{ + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + struct mesh_path *mpath; + u8 ttl; + u8 *ta, *target_addr; + u32 target_sn; + u16 target_rcode; + + ta = mgmt->sa; + ttl = PERR_IE_TTL(perr_elem); + if (ttl <= 1) { + ifmsh->mshstats.dropped_frames_ttl++; + return; + } + ttl--; + target_addr = PERR_IE_TARGET_ADDR(perr_elem); + target_sn = PERR_IE_TARGET_SN(perr_elem); + target_rcode = PERR_IE_TARGET_RCODE(perr_elem); + + rcu_read_lock(); + mpath = mesh_path_lookup(target_addr, sdata); + if (mpath) { + spin_lock_bh(&mpath->state_lock); + if (mpath->flags & MESH_PATH_ACTIVE && + memcmp(ta, next_hop_deref_protected(mpath)->sta.addr, + ETH_ALEN) == 0 && + (!(mpath->flags & MESH_PATH_SN_VALID) || + SN_GT(target_sn, mpath->sn))) { + mpath->flags &= ~MESH_PATH_ACTIVE; + mpath->sn = target_sn; + spin_unlock_bh(&mpath->state_lock); + mesh_path_error_tx(ttl, target_addr, cpu_to_le32(target_sn), + 
cpu_to_le16(target_rcode), + broadcast_addr, sdata); + } else + spin_unlock_bh(&mpath->state_lock); + } + rcu_read_unlock(); +} + +static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + struct ieee80211_rann_ie *rann) +{ + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + struct mesh_path *mpath; + u8 ttl, flags, hopcount; + u8 *orig_addr; + u32 orig_sn, metric; + + ttl = rann->rann_ttl; + if (ttl <= 1) { + ifmsh->mshstats.dropped_frames_ttl++; + return; + } + ttl--; + flags = rann->rann_flags; + orig_addr = rann->rann_addr; + orig_sn = rann->rann_seq; + hopcount = rann->rann_hopcount; + hopcount++; + metric = rann->rann_metric; + mhwmp_dbg("received RANN from %pM\n", orig_addr); + + rcu_read_lock(); + mpath = mesh_path_lookup(orig_addr, sdata); + if (!mpath) { + mesh_path_add(orig_addr, sdata); + mpath = mesh_path_lookup(orig_addr, sdata); + if (!mpath) { + rcu_read_unlock(); + sdata->u.mesh.mshstats.dropped_frames_no_route++; + return; + } + mesh_queue_preq(mpath, + PREQ_Q_F_START | PREQ_Q_F_REFRESH); + } + if (mpath->sn < orig_sn) { + mesh_path_sel_frame_tx(MPATH_RANN, flags, orig_addr, + cpu_to_le32(orig_sn), + 0, NULL, 0, broadcast_addr, + hopcount, ttl, 0, + cpu_to_le32(metric + mpath->metric), + 0, sdata); + mpath->sn = orig_sn; + } + rcu_read_unlock(); +} + + +void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + size_t len) +{ + struct ieee802_11_elems elems; + size_t baselen; + u32 last_hop_metric; + + /* need action_code */ + if (len < IEEE80211_MIN_ACTION_SIZE + 1) + return; + + baselen = (u8 *) mgmt->u.action.u.mesh_action.variable - (u8 *) mgmt; + ieee802_11_parse_elems(mgmt->u.action.u.mesh_action.variable, + len - baselen, &elems); + + if (elems.preq) { + if (elems.preq_len != 37) + /* Right now we support just 1 destination and no AE */ + return; + last_hop_metric = hwmp_route_info_get(sdata, mgmt, elems.preq, + MPATH_PREQ); + if (last_hop_metric) + hwmp_preq_frame_process(sdata, mgmt, elems.preq, + last_hop_metric); + } + if (elems.prep) { + if (elems.prep_len != 31) + /* Right now we support no AE */ + return; + last_hop_metric = hwmp_route_info_get(sdata, mgmt, elems.prep, + MPATH_PREP); + if (last_hop_metric) + hwmp_prep_frame_process(sdata, mgmt, elems.prep, + last_hop_metric); + } + if (elems.perr) { + if (elems.perr_len != 15) + /* Right now we support only one destination per PERR */ + return; + hwmp_perr_frame_process(sdata, mgmt, elems.perr); + } + if (elems.rann) + hwmp_rann_frame_process(sdata, mgmt, elems.rann); +} + +/** + * mesh_queue_preq - queue a PREQ to a given destination + * + * @mpath: mesh path to discover + * @flags: special attributes of the PREQ to be sent + * + * Locking: the function must be called from within a rcu read lock block. 
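+ * The queue is bounded by MAX_PREQ_QUEUE_LEN; when it is full the new
+ * request is dropped (apart from a rate-limited debug message). If the
+ * minimum PREQ interval has already elapsed the PREQ work is scheduled
+ * immediately, otherwise the mesh path timer is armed so the PREQ goes
+ * out once the interval expires.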
+ * + */ +static void mesh_queue_preq(struct mesh_path *mpath, u8 flags) +{ + struct ieee80211_sub_if_data *sdata = mpath->sdata; + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + struct mesh_preq_queue *preq_node; + + preq_node = kmalloc(sizeof(struct mesh_preq_queue), GFP_ATOMIC); + if (!preq_node) { + mhwmp_dbg("could not allocate PREQ node\n"); + return; + } + + spin_lock(&ifmsh->mesh_preq_queue_lock); + if (ifmsh->preq_queue_len == MAX_PREQ_QUEUE_LEN) { + spin_unlock(&ifmsh->mesh_preq_queue_lock); + kfree(preq_node); + if (printk_ratelimit()) + mhwmp_dbg("PREQ node queue full\n"); + return; + } + + memcpy(preq_node->dst, mpath->dst, ETH_ALEN); + preq_node->flags = flags; + + list_add_tail(&preq_node->list, &ifmsh->preq_queue.list); + ++ifmsh->preq_queue_len; + spin_unlock(&ifmsh->mesh_preq_queue_lock); + + if (time_after(jiffies, ifmsh->last_preq + min_preq_int_jiff(sdata))) + ieee80211_queue_work(&sdata->local->hw, &sdata->work); + + else if (time_before(jiffies, ifmsh->last_preq)) { + /* avoid long wait if did not send preqs for a long time + * and jiffies wrapped around + */ + ifmsh->last_preq = jiffies - min_preq_int_jiff(sdata) - 1; + ieee80211_queue_work(&sdata->local->hw, &sdata->work); + } else + mod_timer(&ifmsh->mesh_path_timer, ifmsh->last_preq + + min_preq_int_jiff(sdata)); +} + +/** + * mesh_path_start_discovery - launch a path discovery from the PREQ queue + * + * @sdata: local mesh subif + */ +void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + struct mesh_preq_queue *preq_node; + struct mesh_path *mpath; + u8 ttl, target_flags; + u32 lifetime; + + spin_lock_bh(&ifmsh->mesh_preq_queue_lock); + if (!ifmsh->preq_queue_len || + time_before(jiffies, ifmsh->last_preq + + min_preq_int_jiff(sdata))) { + spin_unlock_bh(&ifmsh->mesh_preq_queue_lock); + return; + } + + preq_node = list_first_entry(&ifmsh->preq_queue.list, + struct mesh_preq_queue, list); + list_del(&preq_node->list); + --ifmsh->preq_queue_len; + spin_unlock_bh(&ifmsh->mesh_preq_queue_lock); + + rcu_read_lock(); + mpath = mesh_path_lookup(preq_node->dst, sdata); + if (!mpath) + goto enddiscovery; + + spin_lock_bh(&mpath->state_lock); + if (preq_node->flags & PREQ_Q_F_START) { + if (mpath->flags & MESH_PATH_RESOLVING) { + spin_unlock_bh(&mpath->state_lock); + goto enddiscovery; + } else { + mpath->flags &= ~MESH_PATH_RESOLVED; + mpath->flags |= MESH_PATH_RESOLVING; + mpath->discovery_retries = 0; + mpath->discovery_timeout = disc_timeout_jiff(sdata); + } + } else if (!(mpath->flags & MESH_PATH_RESOLVING) || + mpath->flags & MESH_PATH_RESOLVED) { + mpath->flags &= ~MESH_PATH_RESOLVING; + spin_unlock_bh(&mpath->state_lock); + goto enddiscovery; + } + + ifmsh->last_preq = jiffies; + + if (time_after(jiffies, ifmsh->last_sn_update + + net_traversal_jiffies(sdata)) || + time_before(jiffies, ifmsh->last_sn_update)) { + ++ifmsh->sn; + sdata->u.mesh.last_sn_update = jiffies; + } + lifetime = default_lifetime(sdata); + ttl = sdata->u.mesh.mshcfg.element_ttl; + if (ttl == 0) { + sdata->u.mesh.mshstats.dropped_frames_ttl++; + spin_unlock_bh(&mpath->state_lock); + goto enddiscovery; + } + + if (preq_node->flags & PREQ_Q_F_REFRESH) + target_flags = MP_F_DO; + else + target_flags = MP_F_RF; + + spin_unlock_bh(&mpath->state_lock); + mesh_path_sel_frame_tx(MPATH_PREQ, 0, sdata->vif.addr, + cpu_to_le32(ifmsh->sn), target_flags, mpath->dst, + cpu_to_le32(mpath->sn), broadcast_addr, 0, + ttl, cpu_to_le32(lifetime), 0, + cpu_to_le32(ifmsh->preq_id++), sdata); 
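+ /*
+ * Arm the discovery timer: if no PREP arrives, mesh_path_timer()
+ * doubles discovery_timeout and requeues the PREQ until
+ * dot11MeshHWMPmaxPREQretries is reached, after which the path is
+ * marked unresolved and its pending frames are discarded.
+ */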
+ mod_timer(&mpath->timer, jiffies + mpath->discovery_timeout); + +enddiscovery: + rcu_read_unlock(); + kfree(preq_node); +} + +/** + * mesh_nexthop_lookup - put the appropriate next hop on a mesh frame + * + * @skb: 802.11 frame to be sent + * @sdata: network subif the frame will be sent through + * + * Returns: 0 if the next hop was found. Nonzero otherwise. If no next hop is + * found, the function will start a path discovery and queue the frame so it is + * sent when the path is resolved. This means the caller must not free the skb + * in this case. + */ +int mesh_nexthop_lookup(struct sk_buff *skb, + struct ieee80211_sub_if_data *sdata) +{ + struct sk_buff *skb_to_free = NULL; + struct mesh_path *mpath; + struct sta_info *next_hop; + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + u8 *target_addr = hdr->addr3; + int err = 0; + + rcu_read_lock(); + mpath = mesh_path_lookup(target_addr, sdata); + + if (!mpath) { + mesh_path_add(target_addr, sdata); + mpath = mesh_path_lookup(target_addr, sdata); + if (!mpath) { + sdata->u.mesh.mshstats.dropped_frames_no_route++; + err = -ENOSPC; + goto endlookup; + } + } + + if (mpath->flags & MESH_PATH_ACTIVE) { + if (time_after(jiffies, + mpath->exp_time - + msecs_to_jiffies(sdata->u.mesh.mshcfg.path_refresh_time)) && + !memcmp(sdata->vif.addr, hdr->addr4, ETH_ALEN) && + !(mpath->flags & MESH_PATH_RESOLVING) && + !(mpath->flags & MESH_PATH_FIXED)) { + mesh_queue_preq(mpath, + PREQ_Q_F_START | PREQ_Q_F_REFRESH); + } + next_hop = rcu_dereference(mpath->next_hop); + if (next_hop) + memcpy(hdr->addr1, next_hop->sta.addr, ETH_ALEN); + else + err = -ENOENT; + } else { + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + if (!(mpath->flags & MESH_PATH_RESOLVING)) { + /* Start discovery only if it is not running yet */ + mesh_queue_preq(mpath, PREQ_Q_F_START); + } + + if (skb_queue_len(&mpath->frame_queue) >= + MESH_FRAME_QUEUE_LEN) + skb_to_free = skb_dequeue(&mpath->frame_queue); + + info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING; + skb_queue_tail(&mpath->frame_queue, skb); + if (skb_to_free) + mesh_path_discard_frame(skb_to_free, sdata); + err = -ENOENT; + } + +endlookup: + rcu_read_unlock(); + return err; +} + +void mesh_path_timer(unsigned long data) +{ + struct mesh_path *mpath = (void *) data; + struct ieee80211_sub_if_data *sdata = mpath->sdata; + + if (sdata->local->quiescing) + return; + + spin_lock_bh(&mpath->state_lock); + if (mpath->flags & MESH_PATH_RESOLVED || + (!(mpath->flags & MESH_PATH_RESOLVING))) + mpath->flags &= ~(MESH_PATH_RESOLVING | MESH_PATH_RESOLVED); + else if (mpath->discovery_retries < max_preq_retries(sdata)) { + ++mpath->discovery_retries; + mpath->discovery_timeout *= 2; + mesh_queue_preq(mpath, 0); + } else { + mpath->flags = 0; + mpath->exp_time = jiffies; + mesh_path_flush_pending(mpath); + } + + spin_unlock_bh(&mpath->state_lock); +} + +void +mesh_path_tx_root_frame(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + + mesh_path_sel_frame_tx(MPATH_RANN, 0, sdata->vif.addr, + cpu_to_le32(++ifmsh->sn), + 0, NULL, 0, broadcast_addr, + 0, sdata->u.mesh.mshcfg.element_ttl, + 0, 0, 0, sdata); +} diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c new file mode 100644 index 00000000..0d2faacc --- /dev/null +++ b/net/mac80211/mesh_pathtbl.c @@ -0,0 +1,848 @@ +/* + * Copyright (c) 2008, 2009 open80211s Ltd. 
+ * Author: Luis Carlos Cobo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "ieee80211_i.h" +#include "mesh.h" + +/* There will be initially 2^INIT_PATHS_SIZE_ORDER buckets */ +#define INIT_PATHS_SIZE_ORDER 2 + +/* Keep the mean chain length below this constant */ +#define MEAN_CHAIN_LEN 2 + +#define MPATH_EXPIRED(mpath) ((mpath->flags & MESH_PATH_ACTIVE) && \ + time_after(jiffies, mpath->exp_time) && \ + !(mpath->flags & MESH_PATH_FIXED)) + +struct mpath_node { + struct hlist_node list; + struct rcu_head rcu; + /* This indirection allows two different tables to point to the same + * mesh_path structure, useful when resizing + */ + struct mesh_path *mpath; +}; + +static struct mesh_table __rcu *mesh_paths; +static struct mesh_table __rcu *mpp_paths; /* Store paths for MPP&MAP */ + +int mesh_paths_generation; + +/* This lock will have the grow table function as writer and add / delete nodes + * as readers. When reading the table (i.e. doing lookups) we are well protected + * by RCU + */ +static DEFINE_RWLOCK(pathtbl_resize_lock); + + +static inline struct mesh_table *resize_dereference_mesh_paths(void) +{ + return rcu_dereference_protected(mesh_paths, + lockdep_is_held(&pathtbl_resize_lock)); +} + +static inline struct mesh_table *resize_dereference_mpp_paths(void) +{ + return rcu_dereference_protected(mpp_paths, + lockdep_is_held(&pathtbl_resize_lock)); +} + +/* + * CAREFUL -- "tbl" must not be an expression, + * in particular not an rcu_dereference(), since + * it's used twice. So it is illegal to do + * for_each_mesh_entry(rcu_dereference(...), ...) 
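+ *
+ * The intended pattern is to dereference once into a local variable,
+ * as the users of this macro below do:
+ *
+ *   tbl = rcu_dereference(mesh_paths);
+ *   for_each_mesh_entry(tbl, p, node, i) {
+ *           ...
+ *   }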
+ */ +#define for_each_mesh_entry(tbl, p, node, i) \ + for (i = 0; i <= tbl->hash_mask; i++) \ + hlist_for_each_entry_rcu(node, p, &tbl->hash_buckets[i], list) + + +static struct mesh_table *mesh_table_alloc(int size_order) +{ + int i; + struct mesh_table *newtbl; + + newtbl = kmalloc(sizeof(struct mesh_table), GFP_ATOMIC); + if (!newtbl) + return NULL; + + newtbl->hash_buckets = kzalloc(sizeof(struct hlist_head) * + (1 << size_order), GFP_ATOMIC); + + if (!newtbl->hash_buckets) { + kfree(newtbl); + return NULL; + } + + newtbl->hashwlock = kmalloc(sizeof(spinlock_t) * + (1 << size_order), GFP_ATOMIC); + if (!newtbl->hashwlock) { + kfree(newtbl->hash_buckets); + kfree(newtbl); + return NULL; + } + + newtbl->size_order = size_order; + newtbl->hash_mask = (1 << size_order) - 1; + atomic_set(&newtbl->entries, 0); + get_random_bytes(&newtbl->hash_rnd, + sizeof(newtbl->hash_rnd)); + for (i = 0; i <= newtbl->hash_mask; i++) + spin_lock_init(&newtbl->hashwlock[i]); + + return newtbl; +} + +static void __mesh_table_free(struct mesh_table *tbl) +{ + kfree(tbl->hash_buckets); + kfree(tbl->hashwlock); + kfree(tbl); +} + +static void mesh_table_free(struct mesh_table *tbl, bool free_leafs) +{ + struct hlist_head *mesh_hash; + struct hlist_node *p, *q; + int i; + + mesh_hash = tbl->hash_buckets; + for (i = 0; i <= tbl->hash_mask; i++) { + spin_lock_bh(&tbl->hashwlock[i]); + hlist_for_each_safe(p, q, &mesh_hash[i]) { + tbl->free_node(p, free_leafs); + atomic_dec(&tbl->entries); + } + spin_unlock_bh(&tbl->hashwlock[i]); + } + __mesh_table_free(tbl); +} + +static int mesh_table_grow(struct mesh_table *oldtbl, + struct mesh_table *newtbl) +{ + struct hlist_head *oldhash; + struct hlist_node *p, *q; + int i; + + if (atomic_read(&oldtbl->entries) + < oldtbl->mean_chain_len * (oldtbl->hash_mask + 1)) + return -EAGAIN; + + newtbl->free_node = oldtbl->free_node; + newtbl->mean_chain_len = oldtbl->mean_chain_len; + newtbl->copy_node = oldtbl->copy_node; + atomic_set(&newtbl->entries, atomic_read(&oldtbl->entries)); + + oldhash = oldtbl->hash_buckets; + for (i = 0; i <= oldtbl->hash_mask; i++) + hlist_for_each(p, &oldhash[i]) + if (oldtbl->copy_node(p, newtbl) < 0) + goto errcopy; + + return 0; + +errcopy: + for (i = 0; i <= newtbl->hash_mask; i++) { + hlist_for_each_safe(p, q, &newtbl->hash_buckets[i]) + oldtbl->free_node(p, 0); + } + return -ENOMEM; +} + +static u32 mesh_table_hash(u8 *addr, struct ieee80211_sub_if_data *sdata, + struct mesh_table *tbl) +{ + /* Use last four bytes of hw addr and interface index as hash index */ + return jhash_2words(*(u32 *)(addr+2), sdata->dev->ifindex, tbl->hash_rnd) + & tbl->hash_mask; +} + + +/** + * + * mesh_path_assign_nexthop - update mesh path next hop + * + * @mpath: mesh path to update + * @sta: next hop to assign + * + * Locking: mpath->state_lock must be held when calling this function + */ +void mesh_path_assign_nexthop(struct mesh_path *mpath, struct sta_info *sta) +{ + struct sk_buff *skb; + struct ieee80211_hdr *hdr; + struct sk_buff_head tmpq; + unsigned long flags; + + rcu_assign_pointer(mpath->next_hop, sta); + + __skb_queue_head_init(&tmpq); + + spin_lock_irqsave(&mpath->frame_queue.lock, flags); + + while ((skb = __skb_dequeue(&mpath->frame_queue)) != NULL) { + hdr = (struct ieee80211_hdr *) skb->data; + memcpy(hdr->addr1, sta->sta.addr, ETH_ALEN); + __skb_queue_tail(&tmpq, skb); + } + + skb_queue_splice(&tmpq, &mpath->frame_queue); + spin_unlock_irqrestore(&mpath->frame_queue.lock, flags); +} + + +/** + * mesh_path_lookup - look up a path in the mesh path 
table + * @dst: hardware address (ETH_ALEN length) of destination + * @sdata: local subif + * + * Returns: pointer to the mesh path structure, or NULL if not found + * + * Locking: must be called within a read rcu section. + */ +struct mesh_path *mesh_path_lookup(u8 *dst, struct ieee80211_sub_if_data *sdata) +{ + struct mesh_path *mpath; + struct hlist_node *n; + struct hlist_head *bucket; + struct mesh_table *tbl; + struct mpath_node *node; + + tbl = rcu_dereference(mesh_paths); + + bucket = &tbl->hash_buckets[mesh_table_hash(dst, sdata, tbl)]; + hlist_for_each_entry_rcu(node, n, bucket, list) { + mpath = node->mpath; + if (mpath->sdata == sdata && + memcmp(dst, mpath->dst, ETH_ALEN) == 0) { + if (MPATH_EXPIRED(mpath)) { + spin_lock_bh(&mpath->state_lock); + if (MPATH_EXPIRED(mpath)) + mpath->flags &= ~MESH_PATH_ACTIVE; + spin_unlock_bh(&mpath->state_lock); + } + return mpath; + } + } + return NULL; +} + +struct mesh_path *mpp_path_lookup(u8 *dst, struct ieee80211_sub_if_data *sdata) +{ + struct mesh_path *mpath; + struct hlist_node *n; + struct hlist_head *bucket; + struct mesh_table *tbl; + struct mpath_node *node; + + tbl = rcu_dereference(mpp_paths); + + bucket = &tbl->hash_buckets[mesh_table_hash(dst, sdata, tbl)]; + hlist_for_each_entry_rcu(node, n, bucket, list) { + mpath = node->mpath; + if (mpath->sdata == sdata && + memcmp(dst, mpath->dst, ETH_ALEN) == 0) { + if (MPATH_EXPIRED(mpath)) { + spin_lock_bh(&mpath->state_lock); + if (MPATH_EXPIRED(mpath)) + mpath->flags &= ~MESH_PATH_ACTIVE; + spin_unlock_bh(&mpath->state_lock); + } + return mpath; + } + } + return NULL; +} + + +/** + * mesh_path_lookup_by_idx - look up a path in the mesh path table by its index + * @idx: index + * @sdata: local subif, or NULL for all entries + * + * Returns: pointer to the mesh path structure, or NULL if not found. + * + * Locking: must be called within a read rcu section. 
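+ *
+ * The walk visits every hash bucket until the idx-th matching entry is
+ * reached, so the cost is linear in the table size; it is only really
+ * suitable for enumerating paths (e.g. table dumps), not for per-frame
+ * lookups.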
+ */ +struct mesh_path *mesh_path_lookup_by_idx(int idx, struct ieee80211_sub_if_data *sdata) +{ + struct mesh_table *tbl = rcu_dereference(mesh_paths); + struct mpath_node *node; + struct hlist_node *p; + int i; + int j = 0; + + for_each_mesh_entry(tbl, p, node, i) { + if (sdata && node->mpath->sdata != sdata) + continue; + if (j++ == idx) { + if (MPATH_EXPIRED(node->mpath)) { + spin_lock_bh(&node->mpath->state_lock); + if (MPATH_EXPIRED(node->mpath)) + node->mpath->flags &= ~MESH_PATH_ACTIVE; + spin_unlock_bh(&node->mpath->state_lock); + } + return node->mpath; + } + } + + return NULL; +} + +/** + * mesh_path_add - allocate and add a new path to the mesh path table + * @addr: destination address of the path (ETH_ALEN length) + * @sdata: local subif + * + * Returns: 0 on success + * + * State: the initial state of the new path is set to 0 + */ +int mesh_path_add(u8 *dst, struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + struct ieee80211_local *local = sdata->local; + struct mesh_table *tbl; + struct mesh_path *mpath, *new_mpath; + struct mpath_node *node, *new_node; + struct hlist_head *bucket; + struct hlist_node *n; + int grow = 0; + int err = 0; + u32 hash_idx; + + if (memcmp(dst, sdata->vif.addr, ETH_ALEN) == 0) + /* never add ourselves as neighbours */ + return -ENOTSUPP; + + if (is_multicast_ether_addr(dst)) + return -ENOTSUPP; + + if (atomic_add_unless(&sdata->u.mesh.mpaths, 1, MESH_MAX_MPATHS) == 0) + return -ENOSPC; + + err = -ENOMEM; + new_mpath = kzalloc(sizeof(struct mesh_path), GFP_ATOMIC); + if (!new_mpath) + goto err_path_alloc; + + new_node = kmalloc(sizeof(struct mpath_node), GFP_ATOMIC); + if (!new_node) + goto err_node_alloc; + + read_lock_bh(&pathtbl_resize_lock); + memcpy(new_mpath->dst, dst, ETH_ALEN); + new_mpath->sdata = sdata; + new_mpath->flags = 0; + skb_queue_head_init(&new_mpath->frame_queue); + new_node->mpath = new_mpath; + new_mpath->timer.data = (unsigned long) new_mpath; + new_mpath->timer.function = mesh_path_timer; + new_mpath->exp_time = jiffies; + spin_lock_init(&new_mpath->state_lock); + init_timer(&new_mpath->timer); + + tbl = resize_dereference_mesh_paths(); + + hash_idx = mesh_table_hash(dst, sdata, tbl); + bucket = &tbl->hash_buckets[hash_idx]; + + spin_lock_bh(&tbl->hashwlock[hash_idx]); + + err = -EEXIST; + hlist_for_each_entry(node, n, bucket, list) { + mpath = node->mpath; + if (mpath->sdata == sdata && memcmp(dst, mpath->dst, ETH_ALEN) == 0) + goto err_exists; + } + + hlist_add_head_rcu(&new_node->list, bucket); + if (atomic_inc_return(&tbl->entries) >= + tbl->mean_chain_len * (tbl->hash_mask + 1)) + grow = 1; + + mesh_paths_generation++; + + spin_unlock_bh(&tbl->hashwlock[hash_idx]); + read_unlock_bh(&pathtbl_resize_lock); + if (grow) { + set_bit(MESH_WORK_GROW_MPATH_TABLE, &ifmsh->wrkq_flags); + ieee80211_queue_work(&local->hw, &sdata->work); + } + return 0; + +err_exists: + spin_unlock_bh(&tbl->hashwlock[hash_idx]); + read_unlock_bh(&pathtbl_resize_lock); + kfree(new_node); +err_node_alloc: + kfree(new_mpath); +err_path_alloc: + atomic_dec(&sdata->u.mesh.mpaths); + return err; +} + +static void mesh_table_free_rcu(struct rcu_head *rcu) +{ + struct mesh_table *tbl = container_of(rcu, struct mesh_table, rcu_head); + + mesh_table_free(tbl, false); +} + +void mesh_mpath_table_grow(void) +{ + struct mesh_table *oldtbl, *newtbl; + + write_lock_bh(&pathtbl_resize_lock); + oldtbl = resize_dereference_mesh_paths(); + newtbl = mesh_table_alloc(oldtbl->size_order + 1); + if (!newtbl) + goto out; + if 
(mesh_table_grow(oldtbl, newtbl) < 0) { + __mesh_table_free(newtbl); + goto out; + } + rcu_assign_pointer(mesh_paths, newtbl); + + call_rcu(&oldtbl->rcu_head, mesh_table_free_rcu); + + out: + write_unlock_bh(&pathtbl_resize_lock); +} + +void mesh_mpp_table_grow(void) +{ + struct mesh_table *oldtbl, *newtbl; + + write_lock_bh(&pathtbl_resize_lock); + oldtbl = resize_dereference_mpp_paths(); + newtbl = mesh_table_alloc(oldtbl->size_order + 1); + if (!newtbl) + goto out; + if (mesh_table_grow(oldtbl, newtbl) < 0) { + __mesh_table_free(newtbl); + goto out; + } + rcu_assign_pointer(mpp_paths, newtbl); + call_rcu(&oldtbl->rcu_head, mesh_table_free_rcu); + + out: + write_unlock_bh(&pathtbl_resize_lock); +} + +int mpp_path_add(u8 *dst, u8 *mpp, struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + struct ieee80211_local *local = sdata->local; + struct mesh_table *tbl; + struct mesh_path *mpath, *new_mpath; + struct mpath_node *node, *new_node; + struct hlist_head *bucket; + struct hlist_node *n; + int grow = 0; + int err = 0; + u32 hash_idx; + + if (memcmp(dst, sdata->vif.addr, ETH_ALEN) == 0) + /* never add ourselves as neighbours */ + return -ENOTSUPP; + + if (is_multicast_ether_addr(dst)) + return -ENOTSUPP; + + err = -ENOMEM; + new_mpath = kzalloc(sizeof(struct mesh_path), GFP_ATOMIC); + if (!new_mpath) + goto err_path_alloc; + + new_node = kmalloc(sizeof(struct mpath_node), GFP_ATOMIC); + if (!new_node) + goto err_node_alloc; + + read_lock_bh(&pathtbl_resize_lock); + memcpy(new_mpath->dst, dst, ETH_ALEN); + memcpy(new_mpath->mpp, mpp, ETH_ALEN); + new_mpath->sdata = sdata; + new_mpath->flags = 0; + skb_queue_head_init(&new_mpath->frame_queue); + new_node->mpath = new_mpath; + new_mpath->exp_time = jiffies; + spin_lock_init(&new_mpath->state_lock); + + tbl = resize_dereference_mpp_paths(); + + hash_idx = mesh_table_hash(dst, sdata, tbl); + bucket = &tbl->hash_buckets[hash_idx]; + + spin_lock_bh(&tbl->hashwlock[hash_idx]); + + err = -EEXIST; + hlist_for_each_entry(node, n, bucket, list) { + mpath = node->mpath; + if (mpath->sdata == sdata && memcmp(dst, mpath->dst, ETH_ALEN) == 0) + goto err_exists; + } + + hlist_add_head_rcu(&new_node->list, bucket); + if (atomic_inc_return(&tbl->entries) >= + tbl->mean_chain_len * (tbl->hash_mask + 1)) + grow = 1; + + spin_unlock_bh(&tbl->hashwlock[hash_idx]); + read_unlock_bh(&pathtbl_resize_lock); + if (grow) { + set_bit(MESH_WORK_GROW_MPP_TABLE, &ifmsh->wrkq_flags); + ieee80211_queue_work(&local->hw, &sdata->work); + } + return 0; + +err_exists: + spin_unlock_bh(&tbl->hashwlock[hash_idx]); + read_unlock_bh(&pathtbl_resize_lock); + kfree(new_node); +err_node_alloc: + kfree(new_mpath); +err_path_alloc: + return err; +} + + +/** + * mesh_plink_broken - deactivates paths and sends perr when a link breaks + * + * @sta: broken peer link + * + * This function must be called from the rate control algorithm if enough + * delivery errors suggest that a peer link is no longer usable. 
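+ *
+ * Every active, non-fixed path whose next hop is @sta is deactivated,
+ * its sequence number is bumped, and a PERR with reason
+ * PERR_RCODE_DEST_UNREACH is broadcast so that neighbouring nodes stop
+ * using this node as a forwarder towards those destinations.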
+ */ +void mesh_plink_broken(struct sta_info *sta) +{ + struct mesh_table *tbl; + static const u8 bcast[ETH_ALEN] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + struct mesh_path *mpath; + struct mpath_node *node; + struct hlist_node *p; + struct ieee80211_sub_if_data *sdata = sta->sdata; + int i; + + rcu_read_lock(); + tbl = rcu_dereference(mesh_paths); + for_each_mesh_entry(tbl, p, node, i) { + mpath = node->mpath; + spin_lock_bh(&mpath->state_lock); + if (rcu_dereference(mpath->next_hop) == sta && + mpath->flags & MESH_PATH_ACTIVE && + !(mpath->flags & MESH_PATH_FIXED)) { + mpath->flags &= ~MESH_PATH_ACTIVE; + ++mpath->sn; + spin_unlock_bh(&mpath->state_lock); + mesh_path_error_tx(sdata->u.mesh.mshcfg.element_ttl, + mpath->dst, cpu_to_le32(mpath->sn), + cpu_to_le16(PERR_RCODE_DEST_UNREACH), + bcast, sdata); + } else + spin_unlock_bh(&mpath->state_lock); + } + rcu_read_unlock(); +} + +/** + * mesh_path_flush_by_nexthop - Deletes mesh paths if their next hop matches + * + * @sta - mesh peer to match + * + * RCU notes: this function is called when a mesh plink transitions from + * PLINK_ESTAB to any other state, since PLINK_ESTAB state is the only one that + * allows path creation. This will happen before the sta can be freed (because + * sta_info_destroy() calls this) so any reader in a rcu read block will be + * protected against the plink disappearing. + */ +void mesh_path_flush_by_nexthop(struct sta_info *sta) +{ + struct mesh_table *tbl; + struct mesh_path *mpath; + struct mpath_node *node; + struct hlist_node *p; + int i; + + rcu_read_lock(); + tbl = rcu_dereference(mesh_paths); + for_each_mesh_entry(tbl, p, node, i) { + mpath = node->mpath; + if (rcu_dereference(mpath->next_hop) == sta) + mesh_path_del(mpath->dst, mpath->sdata); + } + rcu_read_unlock(); +} + +void mesh_path_flush(struct ieee80211_sub_if_data *sdata) +{ + struct mesh_table *tbl; + struct mesh_path *mpath; + struct mpath_node *node; + struct hlist_node *p; + int i; + + rcu_read_lock(); + tbl = rcu_dereference(mesh_paths); + for_each_mesh_entry(tbl, p, node, i) { + mpath = node->mpath; + if (mpath->sdata == sdata) + mesh_path_del(mpath->dst, mpath->sdata); + } + rcu_read_unlock(); +} + +static void mesh_path_node_reclaim(struct rcu_head *rp) +{ + struct mpath_node *node = container_of(rp, struct mpath_node, rcu); + struct ieee80211_sub_if_data *sdata = node->mpath->sdata; + + del_timer_sync(&node->mpath->timer); + atomic_dec(&sdata->u.mesh.mpaths); + kfree(node->mpath); + kfree(node); +} + +/** + * mesh_path_del - delete a mesh path from the table + * + * @addr: dst address (ETH_ALEN length) + * @sdata: local subif + * + * Returns: 0 if successful + */ +int mesh_path_del(u8 *addr, struct ieee80211_sub_if_data *sdata) +{ + struct mesh_table *tbl; + struct mesh_path *mpath; + struct mpath_node *node; + struct hlist_head *bucket; + struct hlist_node *n; + int hash_idx; + int err = 0; + + read_lock_bh(&pathtbl_resize_lock); + tbl = resize_dereference_mesh_paths(); + hash_idx = mesh_table_hash(addr, sdata, tbl); + bucket = &tbl->hash_buckets[hash_idx]; + + spin_lock_bh(&tbl->hashwlock[hash_idx]); + hlist_for_each_entry(node, n, bucket, list) { + mpath = node->mpath; + if (mpath->sdata == sdata && + memcmp(addr, mpath->dst, ETH_ALEN) == 0) { + spin_lock_bh(&mpath->state_lock); + mpath->flags |= MESH_PATH_RESOLVING; + hlist_del_rcu(&node->list); + call_rcu(&node->rcu, mesh_path_node_reclaim); + atomic_dec(&tbl->entries); + spin_unlock_bh(&mpath->state_lock); + goto enddel; + } + } + + err = -ENXIO; +enddel: + 
mesh_paths_generation++; + spin_unlock_bh(&tbl->hashwlock[hash_idx]); + read_unlock_bh(&pathtbl_resize_lock); + return err; +} + +/** + * mesh_path_tx_pending - sends pending frames in a mesh path queue + * + * @mpath: mesh path to activate + * + * Locking: the state_lock of the mpath structure must NOT be held when calling + * this function. + */ +void mesh_path_tx_pending(struct mesh_path *mpath) +{ + if (mpath->flags & MESH_PATH_ACTIVE) + ieee80211_add_pending_skbs(mpath->sdata->local, + &mpath->frame_queue); +} + +/** + * mesh_path_discard_frame - discard a frame whose path could not be resolved + * + * @skb: frame to discard + * @sdata: network subif the frame was to be sent through + * + * If the frame was being forwarded from another MP, a PERR frame will be sent + * to the precursor. The precursor's address (i.e. the previous hop) was saved + * in addr1 of the frame-to-be-forwarded, and would only be overwritten once + * the destination is successfully resolved. + * + * Locking: the function must me called within a rcu_read_lock region + */ +void mesh_path_discard_frame(struct sk_buff *skb, + struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct mesh_path *mpath; + u32 sn = 0; + + if (memcmp(hdr->addr4, sdata->vif.addr, ETH_ALEN) != 0) { + u8 *ra, *da; + + da = hdr->addr3; + ra = hdr->addr1; + mpath = mesh_path_lookup(da, sdata); + if (mpath) + sn = ++mpath->sn; + mesh_path_error_tx(sdata->u.mesh.mshcfg.element_ttl, skb->data, + cpu_to_le32(sn), + cpu_to_le16(PERR_RCODE_NO_ROUTE), ra, sdata); + } + + kfree_skb(skb); + sdata->u.mesh.mshstats.dropped_frames_no_route++; +} + +/** + * mesh_path_flush_pending - free the pending queue of a mesh path + * + * @mpath: mesh path whose queue has to be freed + * + * Locking: the function must me called within a rcu_read_lock region + */ +void mesh_path_flush_pending(struct mesh_path *mpath) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(&mpath->frame_queue)) && + (mpath->flags & MESH_PATH_ACTIVE)) + mesh_path_discard_frame(skb, mpath->sdata); +} + +/** + * mesh_path_fix_nexthop - force a specific next hop for a mesh path + * + * @mpath: the mesh path to modify + * @next_hop: the next hop to force + * + * Locking: this function must be called holding mpath->state_lock + */ +void mesh_path_fix_nexthop(struct mesh_path *mpath, struct sta_info *next_hop) +{ + spin_lock_bh(&mpath->state_lock); + mesh_path_assign_nexthop(mpath, next_hop); + mpath->sn = 0xffff; + mpath->metric = 0; + mpath->hop_count = 0; + mpath->exp_time = 0; + mpath->flags |= MESH_PATH_FIXED; + mesh_path_activate(mpath); + spin_unlock_bh(&mpath->state_lock); + mesh_path_tx_pending(mpath); +} + +static void mesh_path_node_free(struct hlist_node *p, bool free_leafs) +{ + struct mesh_path *mpath; + struct mpath_node *node = hlist_entry(p, struct mpath_node, list); + mpath = node->mpath; + hlist_del_rcu(p); + if (free_leafs) { + del_timer_sync(&mpath->timer); + kfree(mpath); + } + kfree(node); +} + +static int mesh_path_node_copy(struct hlist_node *p, struct mesh_table *newtbl) +{ + struct mesh_path *mpath; + struct mpath_node *node, *new_node; + u32 hash_idx; + + new_node = kmalloc(sizeof(struct mpath_node), GFP_ATOMIC); + if (new_node == NULL) + return -ENOMEM; + + node = hlist_entry(p, struct mpath_node, list); + mpath = node->mpath; + new_node->mpath = mpath; + hash_idx = mesh_table_hash(mpath->dst, mpath->sdata, newtbl); + hlist_add_head(&new_node->list, + &newtbl->hash_buckets[hash_idx]); + return 0; +} + +int 
mesh_pathtbl_init(void) +{ + struct mesh_table *tbl_path, *tbl_mpp; + + tbl_path = mesh_table_alloc(INIT_PATHS_SIZE_ORDER); + if (!tbl_path) + return -ENOMEM; + tbl_path->free_node = &mesh_path_node_free; + tbl_path->copy_node = &mesh_path_node_copy; + tbl_path->mean_chain_len = MEAN_CHAIN_LEN; + + tbl_mpp = mesh_table_alloc(INIT_PATHS_SIZE_ORDER); + if (!tbl_mpp) { + mesh_table_free(tbl_path, true); + return -ENOMEM; + } + tbl_mpp->free_node = &mesh_path_node_free; + tbl_mpp->copy_node = &mesh_path_node_copy; + tbl_mpp->mean_chain_len = MEAN_CHAIN_LEN; + + /* Need no locking since this is during init */ + RCU_INIT_POINTER(mesh_paths, tbl_path); + RCU_INIT_POINTER(mpp_paths, tbl_mpp); + + return 0; +} + +void mesh_path_expire(struct ieee80211_sub_if_data *sdata) +{ + struct mesh_table *tbl; + struct mesh_path *mpath; + struct mpath_node *node; + struct hlist_node *p; + int i; + + rcu_read_lock(); + tbl = rcu_dereference(mesh_paths); + for_each_mesh_entry(tbl, p, node, i) { + if (node->mpath->sdata != sdata) + continue; + mpath = node->mpath; + spin_lock_bh(&mpath->state_lock); + if ((!(mpath->flags & MESH_PATH_RESOLVING)) && + (!(mpath->flags & MESH_PATH_FIXED)) && + time_after(jiffies, mpath->exp_time + MESH_PATH_EXPIRE)) { + spin_unlock_bh(&mpath->state_lock); + mesh_path_del(mpath->dst, mpath->sdata); + } else + spin_unlock_bh(&mpath->state_lock); + } + rcu_read_unlock(); +} + +void mesh_pathtbl_unregister(void) +{ + /* no need for locking during exit path */ + mesh_table_free(rcu_dereference_raw(mesh_paths), true); + mesh_table_free(rcu_dereference_raw(mpp_paths), true); +} diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c new file mode 100644 index 00000000..f4adc091 --- /dev/null +++ b/net/mac80211/mesh_plink.c @@ -0,0 +1,809 @@ +/* + * Copyright (c) 2008, 2009 open80211s Ltd. + * Author: Luis Carlos Cobo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include "ieee80211_i.h" +#include "rate.h" +#include "mesh.h" + +#ifdef CONFIG_MAC80211_VERBOSE_MPL_DEBUG +#define mpl_dbg(fmt, args...) printk(KERN_DEBUG fmt, ##args) +#else +#define mpl_dbg(fmt, args...) 
do { (void)(0); } while (0) +#endif + +#define PLINK_GET_LLID(p) (p + 4) +#define PLINK_GET_PLID(p) (p + 6) + +#define mod_plink_timer(s, t) (mod_timer(&s->plink_timer, \ + jiffies + HZ * t / 1000)) + +/* Peer link cancel reasons, all subject to ANA approval */ +#define MESH_LINK_CANCELLED 2 +#define MESH_MAX_NEIGHBORS 3 +#define MESH_CAPABILITY_POLICY_VIOLATION 4 +#define MESH_CLOSE_RCVD 5 +#define MESH_MAX_RETRIES 6 +#define MESH_CONFIRM_TIMEOUT 7 +#define MESH_SECURITY_ROLE_NEGOTIATION_DIFFERS 8 +#define MESH_SECURITY_AUTHENTICATION_IMPOSSIBLE 9 +#define MESH_SECURITY_FAILED_VERIFICATION 10 + +#define dot11MeshMaxRetries(s) (s->u.mesh.mshcfg.dot11MeshMaxRetries) +#define dot11MeshRetryTimeout(s) (s->u.mesh.mshcfg.dot11MeshRetryTimeout) +#define dot11MeshConfirmTimeout(s) (s->u.mesh.mshcfg.dot11MeshConfirmTimeout) +#define dot11MeshHoldingTimeout(s) (s->u.mesh.mshcfg.dot11MeshHoldingTimeout) +#define dot11MeshMaxPeerLinks(s) (s->u.mesh.mshcfg.dot11MeshMaxPeerLinks) + +enum plink_frame_type { + PLINK_OPEN = 1, + PLINK_CONFIRM, + PLINK_CLOSE +}; + +enum plink_event { + PLINK_UNDEFINED, + OPN_ACPT, + OPN_RJCT, + OPN_IGNR, + CNF_ACPT, + CNF_RJCT, + CNF_IGNR, + CLS_ACPT, + CLS_IGNR +}; + +static inline +void mesh_plink_inc_estab_count(struct ieee80211_sub_if_data *sdata) +{ + atomic_inc(&sdata->u.mesh.mshstats.estab_plinks); + mesh_accept_plinks_update(sdata); +} + +static inline +void mesh_plink_dec_estab_count(struct ieee80211_sub_if_data *sdata) +{ + atomic_dec(&sdata->u.mesh.mshstats.estab_plinks); + mesh_accept_plinks_update(sdata); +} + +/** + * mesh_plink_fsm_restart - restart a mesh peer link finite state machine + * + * @sta: mesh peer link to restart + * + * Locking: this function must be called holding sta->lock + */ +static inline void mesh_plink_fsm_restart(struct sta_info *sta) +{ + sta->plink_state = NL80211_PLINK_LISTEN; + sta->llid = sta->plid = sta->reason = 0; + sta->plink_retries = 0; +} + +/* + * NOTE: This is just an alias for sta_info_alloc(), see notes + * on it in the lifecycle management section! 
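+ *
+ * Besides allocating the station entry it also marks the peer as
+ * authenticated/authorized, installs the supported rates for the
+ * current band and initialises rate control, so the entry is ready to
+ * be inserted by the caller.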
+ */ +static struct sta_info *mesh_plink_alloc(struct ieee80211_sub_if_data *sdata, + u8 *hw_addr, u32 rates) +{ + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + + if (local->num_sta >= MESH_MAX_PLINKS) + return NULL; + + sta = sta_info_alloc(sdata, hw_addr, GFP_KERNEL); + if (!sta) + return NULL; + + sta->flags = WLAN_STA_AUTHORIZED | WLAN_STA_AUTH; + sta->sta.supp_rates[local->hw.conf.channel->band] = rates; + rate_control_rate_init(sta); + + return sta; +} + +/** + * __mesh_plink_deactivate - deactivate mesh peer link + * + * @sta: mesh peer link to deactivate + * + * All mesh paths with this peer as next hop will be flushed + * + * Locking: the caller must hold sta->lock + */ +static bool __mesh_plink_deactivate(struct sta_info *sta) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + bool deactivated = false; + + if (sta->plink_state == NL80211_PLINK_ESTAB) { + mesh_plink_dec_estab_count(sdata); + deactivated = true; + } + sta->plink_state = NL80211_PLINK_BLOCKED; + mesh_path_flush_by_nexthop(sta); + + return deactivated; +} + +/** + * mesh_plink_deactivate - deactivate mesh peer link + * + * @sta: mesh peer link to deactivate + * + * All mesh paths with this peer as next hop will be flushed + */ +void mesh_plink_deactivate(struct sta_info *sta) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + bool deactivated; + + spin_lock_bh(&sta->lock); + deactivated = __mesh_plink_deactivate(sta); + spin_unlock_bh(&sta->lock); + + if (deactivated) + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON); +} + +static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, + enum plink_frame_type action, u8 *da, __le16 llid, __le16 plid, + __le16 reason) { + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400 + + sdata->u.mesh.ie_len); + struct ieee80211_mgmt *mgmt; + bool include_plid = false; + static const u8 meshpeeringproto[] = { 0x00, 0x0F, 0xAC, 0x2A }; + u8 *pos; + int ie_len; + + if (!skb) + return -1; + skb_reserve(skb, local->hw.extra_tx_headroom); + /* 25 is the size of the common mgmt part (24) plus the size of the + * common action part (1) + */ + mgmt = (struct ieee80211_mgmt *) + skb_put(skb, 25 + sizeof(mgmt->u.action.u.plink_action)); + memset(mgmt, 0, 25 + sizeof(mgmt->u.action.u.plink_action)); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + memcpy(mgmt->da, da, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); + mgmt->u.action.category = WLAN_CATEGORY_MESH_ACTION; + mgmt->u.action.u.plink_action.action_code = action; + + if (action == PLINK_CLOSE) + mgmt->u.action.u.plink_action.aux = reason; + else { + mgmt->u.action.u.plink_action.aux = cpu_to_le16(0x0); + if (action == PLINK_CONFIRM) { + pos = skb_put(skb, 4); + /* two-byte status code followed by two-byte AID */ + memset(pos, 0, 2); + memcpy(pos + 2, &plid, 2); + } + mesh_mgmt_ies_add(skb, sdata); + } + + /* Add Peer Link Management element */ + switch (action) { + case PLINK_OPEN: + ie_len = 6; + break; + case PLINK_CONFIRM: + ie_len = 8; + include_plid = true; + break; + case PLINK_CLOSE: + default: + if (!plid) + ie_len = 8; + else { + ie_len = 10; + include_plid = true; + } + break; + } + + pos = skb_put(skb, 2 + ie_len); + *pos++ = WLAN_EID_PEER_LINK; + *pos++ = ie_len; + memcpy(pos, meshpeeringproto, sizeof(meshpeeringproto)); + pos += 4; + memcpy(pos, &llid, 2); + if (include_plid) { + pos += 2; + 
memcpy(pos, &plid, 2); + } + if (action == PLINK_CLOSE) { + pos += 2; + memcpy(pos, &reason, 2); + } + + ieee80211_tx_skb(sdata, skb); + return 0; +} + +void mesh_neighbour_update(u8 *hw_addr, u32 rates, + struct ieee80211_sub_if_data *sdata, + struct ieee802_11_elems *elems) +{ + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + + rcu_read_lock(); + + sta = sta_info_get(sdata, hw_addr); + if (!sta) { + rcu_read_unlock(); + /* Userspace handles peer allocation when security is enabled + * */ + if (sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED) + cfg80211_notify_new_peer_candidate(sdata->dev, hw_addr, + elems->ie_start, elems->total_len, + GFP_KERNEL); + else + sta = mesh_plink_alloc(sdata, hw_addr, rates); + if (!sta) + return; + if (sta_info_insert_rcu(sta)) { + rcu_read_unlock(); + return; + } + } + + sta->last_rx = jiffies; + sta->sta.supp_rates[local->hw.conf.channel->band] = rates; + if (mesh_peer_accepts_plinks(elems) && + sta->plink_state == NL80211_PLINK_LISTEN && + sdata->u.mesh.accepting_plinks && + sdata->u.mesh.mshcfg.auto_open_plinks) + mesh_plink_open(sta); + + rcu_read_unlock(); +} + +static void mesh_plink_timer(unsigned long data) +{ + struct sta_info *sta; + __le16 llid, plid, reason; + struct ieee80211_sub_if_data *sdata; + + /* + * This STA is valid because sta_info_destroy() will + * del_timer_sync() this timer after having made sure + * it cannot be readded (by deleting the plink.) + */ + sta = (struct sta_info *) data; + + if (sta->sdata->local->quiescing) { + sta->plink_timer_was_running = true; + return; + } + + spin_lock_bh(&sta->lock); + if (sta->ignore_plink_timer) { + sta->ignore_plink_timer = false; + spin_unlock_bh(&sta->lock); + return; + } + mpl_dbg("Mesh plink timer for %pM fired on state %d\n", + sta->sta.addr, sta->plink_state); + reason = 0; + llid = sta->llid; + plid = sta->plid; + sdata = sta->sdata; + + switch (sta->plink_state) { + case NL80211_PLINK_OPN_RCVD: + case NL80211_PLINK_OPN_SNT: + /* retry timer */ + if (sta->plink_retries < dot11MeshMaxRetries(sdata)) { + u32 rand; + mpl_dbg("Mesh plink for %pM (retry, timeout): %d %d\n", + sta->sta.addr, sta->plink_retries, + sta->plink_timeout); + get_random_bytes(&rand, sizeof(u32)); + sta->plink_timeout = sta->plink_timeout + + rand % sta->plink_timeout; + ++sta->plink_retries; + mod_plink_timer(sta, sta->plink_timeout); + spin_unlock_bh(&sta->lock); + mesh_plink_frame_tx(sdata, PLINK_OPEN, sta->sta.addr, llid, + 0, 0); + break; + } + reason = cpu_to_le16(MESH_MAX_RETRIES); + /* fall through on else */ + case NL80211_PLINK_CNF_RCVD: + /* confirm timer */ + if (!reason) + reason = cpu_to_le16(MESH_CONFIRM_TIMEOUT); + sta->plink_state = NL80211_PLINK_HOLDING; + mod_plink_timer(sta, dot11MeshHoldingTimeout(sdata)); + spin_unlock_bh(&sta->lock); + mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->sta.addr, llid, plid, + reason); + break; + case NL80211_PLINK_HOLDING: + /* holding timer */ + del_timer(&sta->plink_timer); + mesh_plink_fsm_restart(sta); + spin_unlock_bh(&sta->lock); + break; + default: + spin_unlock_bh(&sta->lock); + break; + } +} + +#ifdef CONFIG_PM +void mesh_plink_quiesce(struct sta_info *sta) +{ + if (del_timer_sync(&sta->plink_timer)) + sta->plink_timer_was_running = true; +} + +void mesh_plink_restart(struct sta_info *sta) +{ + if (sta->plink_timer_was_running) { + add_timer(&sta->plink_timer); + sta->plink_timer_was_running = false; + } +} +#endif + +static inline void mesh_plink_timer_set(struct sta_info *sta, int timeout) +{ + sta->plink_timer.expires = 
jiffies + (HZ * timeout / 1000); + sta->plink_timer.data = (unsigned long) sta; + sta->plink_timer.function = mesh_plink_timer; + sta->plink_timeout = timeout; + add_timer(&sta->plink_timer); +} + +int mesh_plink_open(struct sta_info *sta) +{ + __le16 llid; + struct ieee80211_sub_if_data *sdata = sta->sdata; + + if (!test_sta_flags(sta, WLAN_STA_AUTH)) + return -EPERM; + + spin_lock_bh(&sta->lock); + get_random_bytes(&llid, 2); + sta->llid = llid; + if (sta->plink_state != NL80211_PLINK_LISTEN) { + spin_unlock_bh(&sta->lock); + return -EBUSY; + } + sta->plink_state = NL80211_PLINK_OPN_SNT; + mesh_plink_timer_set(sta, dot11MeshRetryTimeout(sdata)); + spin_unlock_bh(&sta->lock); + mpl_dbg("Mesh plink: starting establishment with %pM\n", + sta->sta.addr); + + return mesh_plink_frame_tx(sdata, PLINK_OPEN, + sta->sta.addr, llid, 0, 0); +} + +void mesh_plink_block(struct sta_info *sta) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + bool deactivated; + + spin_lock_bh(&sta->lock); + deactivated = __mesh_plink_deactivate(sta); + sta->plink_state = NL80211_PLINK_BLOCKED; + spin_unlock_bh(&sta->lock); + + if (deactivated) + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON); +} + + +void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, + size_t len, struct ieee80211_rx_status *rx_status) +{ + struct ieee80211_local *local = sdata->local; + struct ieee802_11_elems elems; + struct sta_info *sta; + enum plink_event event; + enum plink_frame_type ftype; + size_t baselen; + bool deactivated, matches_local = true; + u8 ie_len; + u8 *baseaddr; + __le16 plid, llid, reason; +#ifdef CONFIG_MAC80211_VERBOSE_MPL_DEBUG + static const char *mplstates[] = { + [NL80211_PLINK_LISTEN] = "LISTEN", + [NL80211_PLINK_OPN_SNT] = "OPN-SNT", + [NL80211_PLINK_OPN_RCVD] = "OPN-RCVD", + [NL80211_PLINK_CNF_RCVD] = "CNF_RCVD", + [NL80211_PLINK_ESTAB] = "ESTAB", + [NL80211_PLINK_HOLDING] = "HOLDING", + [NL80211_PLINK_BLOCKED] = "BLOCKED" + }; +#endif + + /* need action_code, aux */ + if (len < IEEE80211_MIN_ACTION_SIZE + 3) + return; + + if (is_multicast_ether_addr(mgmt->da)) { + mpl_dbg("Mesh plink: ignore frame from multicast address"); + return; + } + + baseaddr = mgmt->u.action.u.plink_action.variable; + baselen = (u8 *) mgmt->u.action.u.plink_action.variable - (u8 *) mgmt; + if (mgmt->u.action.u.plink_action.action_code == PLINK_CONFIRM) { + baseaddr += 4; + baselen += 4; + } + ieee802_11_parse_elems(baseaddr, len - baselen, &elems); + if (!elems.peer_link) { + mpl_dbg("Mesh plink: missing necessary peer link ie\n"); + return; + } + if (elems.rsn_len && + sdata->u.mesh.security == IEEE80211_MESH_SEC_NONE) { + mpl_dbg("Mesh plink: can't establish link with secure peer\n"); + return; + } + + ftype = mgmt->u.action.u.plink_action.action_code; + ie_len = elems.peer_link_len; + if ((ftype == PLINK_OPEN && ie_len != 6) || + (ftype == PLINK_CONFIRM && ie_len != 8) || + (ftype == PLINK_CLOSE && ie_len != 8 && ie_len != 10)) { + mpl_dbg("Mesh plink: incorrect plink ie length %d %d\n", + ftype, ie_len); + return; + } + + if (ftype != PLINK_CLOSE && (!elems.mesh_id || !elems.mesh_config)) { + mpl_dbg("Mesh plink: missing necessary ie\n"); + return; + } + /* Note the lines below are correct, the llid in the frame is the plid + * from the point of view of this host. 
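+ *
+ * In other words, the peer's locally generated link ID arrives in the
+ * frame's LLID field and is stored here as our plid, while our own
+ * llid is the random value generated in mesh_plink_open() or in the
+ * OPN_ACPT handling below; the PLID field of a confirm/close frame
+ * therefore echoes our llid back to us.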
+ */ + memcpy(&plid, PLINK_GET_LLID(elems.peer_link), 2); + if (ftype == PLINK_CONFIRM || (ftype == PLINK_CLOSE && ie_len == 10)) + memcpy(&llid, PLINK_GET_PLID(elems.peer_link), 2); + + rcu_read_lock(); + + sta = sta_info_get(sdata, mgmt->sa); + if (!sta && ftype != PLINK_OPEN) { + mpl_dbg("Mesh plink: cls or cnf from unknown peer\n"); + rcu_read_unlock(); + return; + } + + if (sta && !test_sta_flags(sta, WLAN_STA_AUTH)) { + mpl_dbg("Mesh plink: Action frame from non-authed peer\n"); + rcu_read_unlock(); + return; + } + + if (sta && sta->plink_state == NL80211_PLINK_BLOCKED) { + rcu_read_unlock(); + return; + } + + /* Now we will figure out the appropriate event... */ + event = PLINK_UNDEFINED; + if (ftype != PLINK_CLOSE && (!mesh_matches_local(&elems, sdata))) { + matches_local = false; + switch (ftype) { + case PLINK_OPEN: + event = OPN_RJCT; + break; + case PLINK_CONFIRM: + event = CNF_RJCT; + break; + case PLINK_CLOSE: + /* avoid warning */ + break; + } + } + + if (!sta && !matches_local) { + rcu_read_unlock(); + reason = cpu_to_le16(MESH_CAPABILITY_POLICY_VIOLATION); + llid = 0; + mesh_plink_frame_tx(sdata, PLINK_CLOSE, mgmt->sa, llid, + plid, reason); + return; + } else if (!sta) { + /* ftype == PLINK_OPEN */ + u32 rates; + + rcu_read_unlock(); + + if (!mesh_plink_free_count(sdata)) { + mpl_dbg("Mesh plink error: no more free plinks\n"); + return; + } + + rates = ieee80211_sta_get_rates(local, &elems, rx_status->band); + sta = mesh_plink_alloc(sdata, mgmt->sa, rates); + if (!sta) { + mpl_dbg("Mesh plink error: plink table full\n"); + return; + } + if (sta_info_insert_rcu(sta)) { + rcu_read_unlock(); + return; + } + event = OPN_ACPT; + spin_lock_bh(&sta->lock); + } else if (matches_local) { + spin_lock_bh(&sta->lock); + switch (ftype) { + case PLINK_OPEN: + if (!mesh_plink_free_count(sdata) || + (sta->plid && sta->plid != plid)) + event = OPN_IGNR; + else + event = OPN_ACPT; + break; + case PLINK_CONFIRM: + if (!mesh_plink_free_count(sdata) || + (sta->llid != llid || sta->plid != plid)) + event = CNF_IGNR; + else + event = CNF_ACPT; + break; + case PLINK_CLOSE: + if (sta->plink_state == NL80211_PLINK_ESTAB) + /* Do not check for llid or plid. This does not + * follow the standard but since multiple plinks + * per sta are not supported, it is necessary in + * order to avoid a livelock when MP A sees an + * establish peer link to MP B but MP B does not + * see it. This can be caused by a timeout in + * B's peer link establishment or B beign + * restarted. 
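/*
 * Sketch of the perspective swap noted in the comment above: the link ID
 * the peer advertises as "local" (its llid) is stored here as our plid,
 * and the peer's copy of our ID, when present, becomes our llid.  The
 * 4-byte buffer below is a simplified stand-in for the real peer-link IE
 * accessed through PLINK_GET_LLID()/PLINK_GET_PLID().
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t get_le16(const uint8_t *p)
{
	return (uint16_t)(p[0] | (p[1] << 8));
}

int main(void)
{
	/* hypothetical IE payload: peer's llid, then peer's plid (LE) */
	const uint8_t ie[4] = { 0x34, 0x12, 0x78, 0x56 };
	uint16_t plid = get_le16(ie);       /* peer's llid -> our plid */
	uint16_t llid = get_le16(ie + 2);   /* peer's plid -> our llid */

	printf("our plid=0x%04x, our llid=0x%04x\n", plid, llid);
	return 0;
}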
+ */ + event = CLS_ACPT; + else if (sta->plid != plid) + event = CLS_IGNR; + else if (ie_len == 7 && sta->llid != llid) + event = CLS_IGNR; + else + event = CLS_ACPT; + break; + default: + mpl_dbg("Mesh plink: unknown frame subtype\n"); + spin_unlock_bh(&sta->lock); + rcu_read_unlock(); + return; + } + } else { + spin_lock_bh(&sta->lock); + } + + mpl_dbg("Mesh plink (peer, state, llid, plid, event): %pM %s %d %d %d\n", + mgmt->sa, mplstates[sta->plink_state], + le16_to_cpu(sta->llid), le16_to_cpu(sta->plid), + event); + reason = 0; + switch (sta->plink_state) { + /* spin_unlock as soon as state is updated at each case */ + case NL80211_PLINK_LISTEN: + switch (event) { + case CLS_ACPT: + mesh_plink_fsm_restart(sta); + spin_unlock_bh(&sta->lock); + break; + case OPN_ACPT: + sta->plink_state = NL80211_PLINK_OPN_RCVD; + sta->plid = plid; + get_random_bytes(&llid, 2); + sta->llid = llid; + mesh_plink_timer_set(sta, dot11MeshRetryTimeout(sdata)); + spin_unlock_bh(&sta->lock); + mesh_plink_frame_tx(sdata, PLINK_OPEN, sta->sta.addr, llid, + 0, 0); + mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->sta.addr, + llid, plid, 0); + break; + default: + spin_unlock_bh(&sta->lock); + break; + } + break; + + case NL80211_PLINK_OPN_SNT: + switch (event) { + case OPN_RJCT: + case CNF_RJCT: + reason = cpu_to_le16(MESH_CAPABILITY_POLICY_VIOLATION); + case CLS_ACPT: + if (!reason) + reason = cpu_to_le16(MESH_CLOSE_RCVD); + sta->reason = reason; + sta->plink_state = NL80211_PLINK_HOLDING; + if (!mod_plink_timer(sta, + dot11MeshHoldingTimeout(sdata))) + sta->ignore_plink_timer = true; + + llid = sta->llid; + spin_unlock_bh(&sta->lock); + mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->sta.addr, llid, + plid, reason); + break; + case OPN_ACPT: + /* retry timer is left untouched */ + sta->plink_state = NL80211_PLINK_OPN_RCVD; + sta->plid = plid; + llid = sta->llid; + spin_unlock_bh(&sta->lock); + mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->sta.addr, llid, + plid, 0); + break; + case CNF_ACPT: + sta->plink_state = NL80211_PLINK_CNF_RCVD; + if (!mod_plink_timer(sta, + dot11MeshConfirmTimeout(sdata))) + sta->ignore_plink_timer = true; + + spin_unlock_bh(&sta->lock); + break; + default: + spin_unlock_bh(&sta->lock); + break; + } + break; + + case NL80211_PLINK_OPN_RCVD: + switch (event) { + case OPN_RJCT: + case CNF_RJCT: + reason = cpu_to_le16(MESH_CAPABILITY_POLICY_VIOLATION); + case CLS_ACPT: + if (!reason) + reason = cpu_to_le16(MESH_CLOSE_RCVD); + sta->reason = reason; + sta->plink_state = NL80211_PLINK_HOLDING; + if (!mod_plink_timer(sta, + dot11MeshHoldingTimeout(sdata))) + sta->ignore_plink_timer = true; + + llid = sta->llid; + spin_unlock_bh(&sta->lock); + mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->sta.addr, llid, + plid, reason); + break; + case OPN_ACPT: + llid = sta->llid; + spin_unlock_bh(&sta->lock); + mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->sta.addr, llid, + plid, 0); + break; + case CNF_ACPT: + del_timer(&sta->plink_timer); + sta->plink_state = NL80211_PLINK_ESTAB; + spin_unlock_bh(&sta->lock); + mesh_plink_inc_estab_count(sdata); + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON); + mpl_dbg("Mesh plink with %pM ESTABLISHED\n", + sta->sta.addr); + break; + default: + spin_unlock_bh(&sta->lock); + break; + } + break; + + case NL80211_PLINK_CNF_RCVD: + switch (event) { + case OPN_RJCT: + case CNF_RJCT: + reason = cpu_to_le16(MESH_CAPABILITY_POLICY_VIOLATION); + case CLS_ACPT: + if (!reason) + reason = cpu_to_le16(MESH_CLOSE_RCVD); + sta->reason = reason; + sta->plink_state = 
NL80211_PLINK_HOLDING; + if (!mod_plink_timer(sta, + dot11MeshHoldingTimeout(sdata))) + sta->ignore_plink_timer = true; + + llid = sta->llid; + spin_unlock_bh(&sta->lock); + mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->sta.addr, llid, + plid, reason); + break; + case OPN_ACPT: + del_timer(&sta->plink_timer); + sta->plink_state = NL80211_PLINK_ESTAB; + spin_unlock_bh(&sta->lock); + mesh_plink_inc_estab_count(sdata); + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON); + mpl_dbg("Mesh plink with %pM ESTABLISHED\n", + sta->sta.addr); + mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->sta.addr, llid, + plid, 0); + break; + default: + spin_unlock_bh(&sta->lock); + break; + } + break; + + case NL80211_PLINK_ESTAB: + switch (event) { + case CLS_ACPT: + reason = cpu_to_le16(MESH_CLOSE_RCVD); + sta->reason = reason; + deactivated = __mesh_plink_deactivate(sta); + sta->plink_state = NL80211_PLINK_HOLDING; + llid = sta->llid; + mod_plink_timer(sta, dot11MeshHoldingTimeout(sdata)); + spin_unlock_bh(&sta->lock); + if (deactivated) + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON); + mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->sta.addr, llid, + plid, reason); + break; + case OPN_ACPT: + llid = sta->llid; + spin_unlock_bh(&sta->lock); + mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->sta.addr, llid, + plid, 0); + break; + default: + spin_unlock_bh(&sta->lock); + break; + } + break; + case NL80211_PLINK_HOLDING: + switch (event) { + case CLS_ACPT: + if (del_timer(&sta->plink_timer)) + sta->ignore_plink_timer = 1; + mesh_plink_fsm_restart(sta); + spin_unlock_bh(&sta->lock); + break; + case OPN_ACPT: + case CNF_ACPT: + case OPN_RJCT: + case CNF_RJCT: + llid = sta->llid; + reason = sta->reason; + spin_unlock_bh(&sta->lock); + mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->sta.addr, + llid, plid, reason); + break; + default: + spin_unlock_bh(&sta->lock); + } + break; + default: + /* should not get here, PLINK_BLOCKED is dealt with at the + * beginning of the function + */ + spin_unlock_bh(&sta->lock); + break; + } + + rcu_read_unlock(); +} diff --git a/net/mac80211/michael.c b/net/mac80211/michael.c new file mode 100644 index 00000000..408649bd --- /dev/null +++ b/net/mac80211/michael.c @@ -0,0 +1,86 @@ +/* + * Michael MIC implementation - optimized for TKIP MIC operations + * Copyright 2002-2003, Instant802 Networks, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
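/*
 * Reduced model of the happy path through the peer-link FSM implemented
 * above.  Only the transitions that lead to ESTAB are shown; retries,
 * rejections, CLOSE handling and HOLDING are omitted, so this is an
 * illustration of the handshake, not a reimplementation.
 */
#include <stdio.h>

enum pl_state { ST_LISTEN, ST_OPN_SNT, ST_OPN_RCVD, ST_CNF_RCVD, ST_ESTAB };
enum pl_event { EV_OPN_ACPT, EV_CNF_ACPT };

static enum pl_state next_state(enum pl_state s, enum pl_event e)
{
	switch (s) {
	case ST_LISTEN:
		return e == EV_OPN_ACPT ? ST_OPN_RCVD : s;
	case ST_OPN_SNT:
		if (e == EV_OPN_ACPT)
			return ST_OPN_RCVD;
		return e == EV_CNF_ACPT ? ST_CNF_RCVD : s;
	case ST_OPN_RCVD:
		return e == EV_CNF_ACPT ? ST_ESTAB : s;
	case ST_CNF_RCVD:
		return e == EV_OPN_ACPT ? ST_ESTAB : s;
	default:
		return s;
	}
}

int main(void)
{
	enum pl_state s = ST_LISTEN;

	s = next_state(s, EV_OPN_ACPT);   /* peer's OPEN accepted */
	s = next_state(s, EV_CNF_ACPT);   /* peer's CONFIRM accepted */
	printf("established: %s\n", s == ST_ESTAB ? "yes" : "no");
	return 0;
}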
+ */ +#include +#include +#include +#include + +#include "michael.h" + +static void michael_block(struct michael_mic_ctx *mctx, u32 val) +{ + mctx->l ^= val; + mctx->r ^= rol32(mctx->l, 17); + mctx->l += mctx->r; + mctx->r ^= ((mctx->l & 0xff00ff00) >> 8) | + ((mctx->l & 0x00ff00ff) << 8); + mctx->l += mctx->r; + mctx->r ^= rol32(mctx->l, 3); + mctx->l += mctx->r; + mctx->r ^= ror32(mctx->l, 2); + mctx->l += mctx->r; +} + +static void michael_mic_hdr(struct michael_mic_ctx *mctx, const u8 *key, + struct ieee80211_hdr *hdr) +{ + u8 *da, *sa, tid; + + da = ieee80211_get_DA(hdr); + sa = ieee80211_get_SA(hdr); + if (ieee80211_is_data_qos(hdr->frame_control)) + tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK; + else + tid = 0; + + mctx->l = get_unaligned_le32(key); + mctx->r = get_unaligned_le32(key + 4); + + /* + * A pseudo header (DA, SA, Priority, 0, 0, 0) is used in Michael MIC + * calculation, but it is _not_ transmitted + */ + michael_block(mctx, get_unaligned_le32(da)); + michael_block(mctx, get_unaligned_le16(&da[4]) | + (get_unaligned_le16(sa) << 16)); + michael_block(mctx, get_unaligned_le32(&sa[2])); + michael_block(mctx, tid); +} + +void michael_mic(const u8 *key, struct ieee80211_hdr *hdr, + const u8 *data, size_t data_len, u8 *mic) +{ + u32 val; + size_t block, blocks, left; + struct michael_mic_ctx mctx; + + michael_mic_hdr(&mctx, key, hdr); + + /* Real data */ + blocks = data_len / 4; + left = data_len % 4; + + for (block = 0; block < blocks; block++) + michael_block(&mctx, get_unaligned_le32(&data[block * 4])); + + /* Partial block of 0..3 bytes and padding: 0x5a + 4..7 zeros to make + * total length a multiple of 4. */ + val = 0x5a; + while (left > 0) { + val <<= 8; + left--; + val |= data[blocks * 4 + left]; + } + + michael_block(&mctx, val); + michael_block(&mctx, 0); + + put_unaligned_le32(mctx.l, mic); + put_unaligned_le32(mctx.r, mic + 4); +} diff --git a/net/mac80211/michael.h b/net/mac80211/michael.h new file mode 100644 index 00000000..3b848dad --- /dev/null +++ b/net/mac80211/michael.h @@ -0,0 +1,24 @@ +/* + * Michael MIC implementation - optimized for TKIP MIC operations + * Copyright 2002-2003, Instant802 Networks, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef MICHAEL_H +#define MICHAEL_H + +#include + +#define MICHAEL_MIC_LEN 8 + +struct michael_mic_ctx { + u32 l, r; +}; + +void michael_mic(const u8 *key, struct ieee80211_hdr *hdr, + const u8 *data, size_t data_len, u8 *mic); + +#endif /* MICHAEL_H */ diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c new file mode 100644 index 00000000..1563250a --- /dev/null +++ b/net/mac80211/mlme.c @@ -0,0 +1,2671 @@ +/* + * BSS client mode implementation + * Copyright 2003-2008, Jouni Malinen + * Copyright 2004, Instant802 Networks, Inc. + * Copyright 2005, Devicescape Software, Inc. + * Copyright 2006-2007 Jiri Benc + * Copyright 2007, Michael Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
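/*
 * Standalone sketch of the Michael mixing function and the 0x5a terminator
 * used in michael.c above.  It deliberately skips the DA/SA/priority
 * pseudo-header, so the printed value is not a valid TKIP MIC; it only
 * demonstrates the block function and the padding.  A little-endian host
 * is assumed for the memcpy()-based loads.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t rol32(uint32_t v, int n) { return (v << n) | (v >> (32 - n)); }
static uint32_t ror32(uint32_t v, int n) { return (v >> n) | (v << (32 - n)); }

static void michael_block(uint32_t *l, uint32_t *r, uint32_t val)
{
	*l ^= val;
	*r ^= rol32(*l, 17);
	*l += *r;
	*r ^= ((*l & 0xff00ff00) >> 8) | ((*l & 0x00ff00ff) << 8);
	*l += *r;
	*r ^= rol32(*l, 3);
	*l += *r;
	*r ^= ror32(*l, 2);
	*l += *r;
}

int main(void)
{
	const uint8_t key[8] = { 0 };          /* hypothetical 64-bit MIC key */
	const uint8_t data[] = "Michael";      /* arbitrary payload */
	size_t len = sizeof(data) - 1;
	size_t blocks = len / 4, left = len % 4, i;
	uint32_t l, r, val;

	memcpy(&l, key, 4);
	memcpy(&r, key + 4, 4);

	for (i = 0; i < blocks; i++) {
		memcpy(&val, data + i * 4, 4);
		michael_block(&l, &r, val);
	}

	/* partial block: 0x5a marker, then the 0..3 trailing bytes */
	val = 0x5a;
	while (left > 0) {
		val <<= 8;
		left--;
		val |= data[blocks * 4 + left];
	}
	michael_block(&l, &r, val);
	michael_block(&l, &r, 0);

	printf("MIC words: %08x %08x\n", (unsigned)l, (unsigned)r);
	return 0;
}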
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ieee80211_i.h" +#include "driver-ops.h" +#include "rate.h" +#include "led.h" + +static int max_nullfunc_tries = 2; +module_param(max_nullfunc_tries, int, 0644); +MODULE_PARM_DESC(max_nullfunc_tries, + "Maximum nullfunc tx tries before disconnecting (reason 4)."); + +static int max_probe_tries = 5; +module_param(max_probe_tries, int, 0644); +MODULE_PARM_DESC(max_probe_tries, + "Maximum probe tries before disconnecting (reason 4)."); + +/* + * Beacon loss timeout is calculated as N frames times the + * advertised beacon interval. This may need to be somewhat + * higher than what hardware might detect to account for + * delays in the host processing frames. But since we also + * probe on beacon miss before declaring the connection lost + * default to what we want. + */ +#define IEEE80211_BEACON_LOSS_COUNT 7 + +/* + * Time the connection can be idle before we probe + * it to see if we can still talk to the AP. + */ +#define IEEE80211_CONNECTION_IDLE_TIME (30 * HZ) +/* + * Time we wait for a probe response after sending + * a probe request because of beacon loss or for + * checking the connection still works. + */ +static int probe_wait_ms = 500; +module_param(probe_wait_ms, int, 0644); +MODULE_PARM_DESC(probe_wait_ms, + "Maximum time(ms) to wait for probe response" + " before disconnecting (reason 4)."); + +/* + * Weight given to the latest Beacon frame when calculating average signal + * strength for Beacon frames received in the current BSS. This must be + * between 1 and 15. + */ +#define IEEE80211_SIGNAL_AVE_WEIGHT 3 + +/* + * How many Beacon frames need to have been used in average signal strength + * before starting to indicate signal change events. + */ +#define IEEE80211_SIGNAL_AVE_MIN_COUNT 4 + +#define TMR_RUNNING_TIMER 0 +#define TMR_RUNNING_CHANSW 1 + +/* + * All cfg80211 functions have to be called outside a locked + * section so that they can acquire a lock themselves... This + * is much simpler than queuing up things in cfg80211, but we + * do need some indirection for that here. + */ +enum rx_mgmt_action { + /* no action required */ + RX_MGMT_NONE, + + /* caller must call cfg80211_send_deauth() */ + RX_MGMT_CFG80211_DEAUTH, + + /* caller must call cfg80211_send_disassoc() */ + RX_MGMT_CFG80211_DISASSOC, +}; + +/* utils */ +static inline void ASSERT_MGD_MTX(struct ieee80211_if_managed *ifmgd) +{ + lockdep_assert_held(&ifmgd->mtx); +} + +/* + * We can have multiple work items (and connection probing) + * scheduling this timer, but we need to take care to only + * reschedule it when it should fire _earlier_ than it was + * asked for before, or if it's not pending right now. This + * function ensures that. Note that it then is required to + * run this function for all timeouts after the first one + * has happened -- the work that runs from this timer will + * do that. 
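/*
 * Back-of-the-envelope sketch of how long a dead AP takes to be noticed
 * with the defaults defined above: IEEE80211_BEACON_LOSS_COUNT (7) missed
 * beacons arm the probe machinery, then up to max_probe_tries (5) probes
 * of probe_wait_ms (500 ms) each may fail before the connection is torn
 * down.  A 100 TU beacon interval is assumed; 1 TU = 1024 us.  This is a
 * rough upper bound, not an exact reproduction of the timer interplay.
 */
#include <stdio.h>

int main(void)
{
	int beacon_int_tu = 100;                  /* hypothetical AP setting */
	int loss_count = 7, probe_tries = 5, probe_wait_ms = 500;
	double beacon_ms = beacon_int_tu * 1024.0 / 1000.0;
	double monitor_ms = loss_count * beacon_ms;
	double worst_case_ms = monitor_ms + probe_tries * probe_wait_ms;

	printf("beacon monitor fires after ~%.0f ms\n", monitor_ms);
	printf("worst-case disconnect after ~%.0f ms\n", worst_case_ms);
	return 0;
}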
+ */ +static void run_again(struct ieee80211_if_managed *ifmgd, + unsigned long timeout) +{ + ASSERT_MGD_MTX(ifmgd); + + if (!timer_pending(&ifmgd->timer) || + time_before(timeout, ifmgd->timer.expires)) + mod_timer(&ifmgd->timer, timeout); +} + +void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata) +{ + if (sdata->local->hw.flags & IEEE80211_HW_BEACON_FILTER) + return; + + mod_timer(&sdata->u.mgd.bcn_mon_timer, + round_jiffies_up(jiffies + sdata->u.mgd.beacon_timeout)); +} + +void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + + if (unlikely(!sdata->u.mgd.associated)) + return; + + if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR) + return; + + mod_timer(&sdata->u.mgd.conn_mon_timer, + round_jiffies_up(jiffies + IEEE80211_CONNECTION_IDLE_TIME)); + + ifmgd->probe_send_count = 0; +} + +static int ecw2cw(int ecw) +{ + return (1 << ecw) - 1; +} + +/* + * ieee80211_enable_ht should be called only after the operating band + * has been determined as ht configuration depends on the hw's + * HT abilities for a specific band. + */ +static u32 ieee80211_enable_ht(struct ieee80211_sub_if_data *sdata, + struct ieee80211_ht_info *hti, + const u8 *bssid, u16 ap_ht_cap_flags) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_supported_band *sband; + struct sta_info *sta; + u32 changed = 0; + int hti_cfreq; + u16 ht_opmode; + bool enable_ht = true; + enum nl80211_channel_type prev_chantype; + enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT; + + sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; + + prev_chantype = sdata->vif.bss_conf.channel_type; + + /* HT is not supported */ + if (!sband->ht_cap.ht_supported) + enable_ht = false; + + if (enable_ht) { + hti_cfreq = ieee80211_channel_to_frequency(hti->control_chan, + sband->band); + /* check that channel matches the right operating channel */ + if (local->hw.conf.channel->center_freq != hti_cfreq) { + /* Some APs mess this up, evidently. + * Netgear WNDR3700 sometimes reports 4 higher than + * the actual channel, for instance. + */ + printk(KERN_DEBUG + "%s: Wrong control channel in association" + " response: configured center-freq: %d" + " hti-cfreq: %d hti->control_chan: %d" + " band: %d. 
Disabling HT.\n", + sdata->name, + local->hw.conf.channel->center_freq, + hti_cfreq, hti->control_chan, + sband->band); + enable_ht = false; + } + } + + if (enable_ht) { + channel_type = NL80211_CHAN_HT20; + + if (!(ap_ht_cap_flags & IEEE80211_HT_CAP_40MHZ_INTOLERANT) && + (sband->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) && + (hti->ht_param & IEEE80211_HT_PARAM_CHAN_WIDTH_ANY)) { + switch(hti->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) { + case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: + if (!(local->hw.conf.channel->flags & + IEEE80211_CHAN_NO_HT40PLUS)) + channel_type = NL80211_CHAN_HT40PLUS; + break; + case IEEE80211_HT_PARAM_CHA_SEC_BELOW: + if (!(local->hw.conf.channel->flags & + IEEE80211_CHAN_NO_HT40MINUS)) + channel_type = NL80211_CHAN_HT40MINUS; + break; + } + } + } + + if (local->tmp_channel) + local->tmp_channel_type = channel_type; + + if (!ieee80211_set_channel_type(local, sdata, channel_type)) { + /* can only fail due to HT40+/- mismatch */ + channel_type = NL80211_CHAN_HT20; + WARN_ON(!ieee80211_set_channel_type(local, sdata, channel_type)); + } + + /* channel_type change automatically detected */ + ieee80211_hw_config(local, 0); + + if (prev_chantype != channel_type) { + rcu_read_lock(); + sta = sta_info_get(sdata, bssid); + if (sta) + rate_control_rate_update(local, sband, sta, + IEEE80211_RC_HT_CHANGED, + channel_type); + rcu_read_unlock(); + } + + ht_opmode = le16_to_cpu(hti->operation_mode); + + /* if bss configuration changed store the new one */ + if (sdata->ht_opmode_valid != enable_ht || + sdata->vif.bss_conf.ht_operation_mode != ht_opmode || + prev_chantype != channel_type) { + changed |= BSS_CHANGED_HT; + sdata->vif.bss_conf.ht_operation_mode = ht_opmode; + sdata->ht_opmode_valid = enable_ht; + } + + return changed; +} + +/* frame sending functions */ + +static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, + const u8 *bssid, u16 stype, u16 reason, + void *cookie, bool send_frame) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + struct sk_buff *skb; + struct ieee80211_mgmt *mgmt; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt)); + if (!skb) { + printk(KERN_DEBUG "%s: failed to allocate buffer for " + "deauth/disassoc frame\n", sdata->name); + return; + } + skb_reserve(skb, local->hw.extra_tx_headroom); + + mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24); + memset(mgmt, 0, 24); + memcpy(mgmt->da, bssid, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + memcpy(mgmt->bssid, bssid, ETH_ALEN); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | stype); + skb_put(skb, 2); + /* u.deauth.reason_code == u.disassoc.reason_code */ + mgmt->u.deauth.reason_code = cpu_to_le16(reason); + + if (stype == IEEE80211_STYPE_DEAUTH) + if (cookie) + __cfg80211_send_deauth(sdata->dev, (u8 *)mgmt, skb->len); + else + cfg80211_send_deauth(sdata->dev, (u8 *)mgmt, skb->len); + else + if (cookie) + __cfg80211_send_disassoc(sdata->dev, (u8 *)mgmt, skb->len); + else + cfg80211_send_disassoc(sdata->dev, (u8 *)mgmt, skb->len); + if (!(ifmgd->flags & IEEE80211_STA_MFP_ENABLED)) + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + + if (send_frame) + ieee80211_tx_skb(sdata, skb); + else + kfree_skb(skb); +} + +void ieee80211_send_pspoll(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_pspoll *pspoll; + struct sk_buff *skb; + + skb = ieee80211_pspoll_get(&local->hw, &sdata->vif); + if (!skb) + return; + + pspoll = 
(struct ieee80211_pspoll *) skb->data; + pspoll->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM); + + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + ieee80211_tx_skb(sdata, skb); +} + +void ieee80211_send_nullfunc(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + int powersave) +{ + struct sk_buff *skb; + struct ieee80211_hdr_3addr *nullfunc; + + skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif); + if (!skb) + return; + + nullfunc = (struct ieee80211_hdr_3addr *) skb->data; + if (powersave) + nullfunc->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM); + + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + ieee80211_tx_skb(sdata, skb); +} + +static void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + struct sk_buff *skb; + struct ieee80211_hdr *nullfunc; + __le16 fc; + + if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) + return; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + 30); + if (!skb) { + printk(KERN_DEBUG "%s: failed to allocate buffer for 4addr " + "nullfunc frame\n", sdata->name); + return; + } + skb_reserve(skb, local->hw.extra_tx_headroom); + + nullfunc = (struct ieee80211_hdr *) skb_put(skb, 30); + memset(nullfunc, 0, 30); + fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | + IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); + nullfunc->frame_control = fc; + memcpy(nullfunc->addr1, sdata->u.mgd.bssid, ETH_ALEN); + memcpy(nullfunc->addr2, sdata->vif.addr, ETH_ALEN); + memcpy(nullfunc->addr3, sdata->u.mgd.bssid, ETH_ALEN); + memcpy(nullfunc->addr4, sdata->vif.addr, ETH_ALEN); + + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + ieee80211_tx_skb(sdata, skb); +} + +/* spectrum management related things */ +static void ieee80211_chswitch_work(struct work_struct *work) +{ + struct ieee80211_sub_if_data *sdata = + container_of(work, struct ieee80211_sub_if_data, u.mgd.chswitch_work); + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + + if (!ieee80211_sdata_running(sdata)) + return; + + mutex_lock(&ifmgd->mtx); + if (!ifmgd->associated) + goto out; + + sdata->local->oper_channel = sdata->local->csa_channel; + if (!sdata->local->ops->channel_switch) { + /* call "hw_config" only if doing sw channel switch */ + ieee80211_hw_config(sdata->local, + IEEE80211_CONF_CHANGE_CHANNEL); + } + + /* XXX: shouldn't really modify cfg80211-owned data! */ + ifmgd->associated->channel = sdata->local->oper_channel; + + ieee80211_wake_queues_by_reason(&sdata->local->hw, + IEEE80211_QUEUE_STOP_REASON_CSA); + out: + ifmgd->flags &= ~IEEE80211_STA_CSA_RECEIVED; + mutex_unlock(&ifmgd->mtx); +} + +void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success) +{ + struct ieee80211_sub_if_data *sdata; + struct ieee80211_if_managed *ifmgd; + + sdata = vif_to_sdata(vif); + ifmgd = &sdata->u.mgd; + + trace_api_chswitch_done(sdata, success); + if (!success) { + /* + * If the channel switch was not successful, stay + * around on the old channel. We currently lack + * good handling of this situation, possibly we + * should just drop the association. 
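/*
 * Sketch of the frame_control value assembled for the 4-address nullfunc
 * in ieee80211_send_4addr_nullfunc() above, and of the power-management
 * bit that the pspoll/nullfunc helpers OR in.  The numeric defines are
 * local copies of the standard 802.11 frame-control bit values.
 */
#include <stdint.h>
#include <stdio.h>

#define FTYPE_DATA      0x0008
#define STYPE_NULLFUNC  0x0040
#define FCTL_TODS       0x0100
#define FCTL_FROMDS     0x0200
#define FCTL_PM         0x1000

int main(void)
{
	uint16_t fc = FTYPE_DATA | STYPE_NULLFUNC | FCTL_FROMDS | FCTL_TODS;

	printf("4-addr nullfunc frame_control: 0x%04x\n", fc);
	printf("with power-management bit set: 0x%04x\n", fc | FCTL_PM);
	return 0;
}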
+ */ + sdata->local->csa_channel = sdata->local->oper_channel; + } + + ieee80211_queue_work(&sdata->local->hw, &ifmgd->chswitch_work); +} +EXPORT_SYMBOL(ieee80211_chswitch_done); + +static void ieee80211_chswitch_timer(unsigned long data) +{ + struct ieee80211_sub_if_data *sdata = + (struct ieee80211_sub_if_data *) data; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + + if (sdata->local->quiescing) { + set_bit(TMR_RUNNING_CHANSW, &ifmgd->timers_running); + return; + } + + ieee80211_queue_work(&sdata->local->hw, &ifmgd->chswitch_work); +} + +void ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, + struct ieee80211_channel_sw_ie *sw_elem, + struct ieee80211_bss *bss, + u64 timestamp) +{ + struct cfg80211_bss *cbss = + container_of((void *)bss, struct cfg80211_bss, priv); + struct ieee80211_channel *new_ch; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + int new_freq = ieee80211_channel_to_frequency(sw_elem->new_ch_num, + cbss->channel->band); + + ASSERT_MGD_MTX(ifmgd); + + if (!ifmgd->associated) + return; + + if (sdata->local->scanning) + return; + + /* Disregard subsequent beacons if we are already running a timer + processing a CSA */ + + if (ifmgd->flags & IEEE80211_STA_CSA_RECEIVED) + return; + + new_ch = ieee80211_get_channel(sdata->local->hw.wiphy, new_freq); + if (!new_ch || new_ch->flags & IEEE80211_CHAN_DISABLED) + return; + + sdata->local->csa_channel = new_ch; + + if (sdata->local->ops->channel_switch) { + /* use driver's channel switch callback */ + struct ieee80211_channel_switch ch_switch; + memset(&ch_switch, 0, sizeof(ch_switch)); + ch_switch.timestamp = timestamp; + if (sw_elem->mode) { + ch_switch.block_tx = true; + ieee80211_stop_queues_by_reason(&sdata->local->hw, + IEEE80211_QUEUE_STOP_REASON_CSA); + } + ch_switch.channel = new_ch; + ch_switch.count = sw_elem->count; + ifmgd->flags |= IEEE80211_STA_CSA_RECEIVED; + drv_channel_switch(sdata->local, &ch_switch); + return; + } + + /* channel switch handled in software */ + if (sw_elem->count <= 1) { + ieee80211_queue_work(&sdata->local->hw, &ifmgd->chswitch_work); + } else { + if (sw_elem->mode) + ieee80211_stop_queues_by_reason(&sdata->local->hw, + IEEE80211_QUEUE_STOP_REASON_CSA); + ifmgd->flags |= IEEE80211_STA_CSA_RECEIVED; + mod_timer(&ifmgd->chswitch_timer, + jiffies + + msecs_to_jiffies(sw_elem->count * + cbss->beacon_interval)); + } +} + +static void ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata, + u16 capab_info, u8 *pwr_constr_elem, + u8 pwr_constr_elem_len) +{ + struct ieee80211_conf *conf = &sdata->local->hw.conf; + + if (!(capab_info & WLAN_CAPABILITY_SPECTRUM_MGMT)) + return; + + /* Power constraint IE length should be 1 octet */ + if (pwr_constr_elem_len != 1) + return; + + if ((*pwr_constr_elem <= conf->channel->max_power) && + (*pwr_constr_elem != sdata->local->power_constr_level)) { + sdata->local->power_constr_level = *pwr_constr_elem; + ieee80211_hw_config(sdata->local, 0); + } +} + +void ieee80211_enable_dyn_ps(struct ieee80211_vif *vif) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_local *local = sdata->local; + struct ieee80211_conf *conf = &local->hw.conf; + + WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION || + !(local->hw.flags & IEEE80211_HW_SUPPORTS_PS) || + (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS)); + + local->disable_dynamic_ps = false; + conf->dynamic_ps_timeout = local->dynamic_ps_user_timeout; +} +EXPORT_SYMBOL(ieee80211_enable_dyn_ps); + +void ieee80211_disable_dyn_ps(struct ieee80211_vif 
*vif) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_local *local = sdata->local; + struct ieee80211_conf *conf = &local->hw.conf; + + WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION || + !(local->hw.flags & IEEE80211_HW_SUPPORTS_PS) || + (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS)); + + local->disable_dynamic_ps = true; + conf->dynamic_ps_timeout = 0; + del_timer_sync(&local->dynamic_ps_timer); + ieee80211_queue_work(&local->hw, + &local->dynamic_ps_enable_work); +} +EXPORT_SYMBOL(ieee80211_disable_dyn_ps); + +/* powersave */ +static void ieee80211_enable_ps(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_conf *conf = &local->hw.conf; + + /* + * If we are scanning right now then the parameters will + * take effect when scan finishes. + */ + if (local->scanning) + return; + + if (conf->dynamic_ps_timeout > 0 && + !(local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS)) { + mod_timer(&local->dynamic_ps_timer, jiffies + + msecs_to_jiffies(conf->dynamic_ps_timeout)); + } else { + if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) + ieee80211_send_nullfunc(local, sdata, 1); + + if ((local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) && + (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)) + return; + + conf->flags |= IEEE80211_CONF_PS; + ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + } +} + +static void ieee80211_change_ps(struct ieee80211_local *local) +{ + struct ieee80211_conf *conf = &local->hw.conf; + + if (local->ps_sdata) { + ieee80211_enable_ps(local, local->ps_sdata); + } else if (conf->flags & IEEE80211_CONF_PS) { + conf->flags &= ~IEEE80211_CONF_PS; + ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + del_timer_sync(&local->dynamic_ps_timer); + cancel_work_sync(&local->dynamic_ps_enable_work); + } +} + +static bool ieee80211_powersave_allowed(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_managed *mgd = &sdata->u.mgd; + struct sta_info *sta = NULL; + u32 sta_flags = 0; + + if (!mgd->powersave) + return false; + + if (mgd->broken_ap) + return false; + + if (!mgd->associated) + return false; + + if (!mgd->associated->beacon_ies) + return false; + + if (mgd->flags & (IEEE80211_STA_BEACON_POLL | + IEEE80211_STA_CONNECTION_POLL)) + return false; + + rcu_read_lock(); + sta = sta_info_get(sdata, mgd->bssid); + if (sta) + sta_flags = get_sta_flags(sta); + rcu_read_unlock(); + + if (!(sta_flags & WLAN_STA_AUTHORIZED)) + return false; + + return true; +} + +/* need to hold RTNL or interface lock */ +void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency) +{ + struct ieee80211_sub_if_data *sdata, *found = NULL; + int count = 0; + int timeout; + + if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS)) { + local->ps_sdata = NULL; + return; + } + + if (!list_empty(&local->work_list)) { + local->ps_sdata = NULL; + goto change; + } + + list_for_each_entry(sdata, &local->interfaces, list) { + if (!ieee80211_sdata_running(sdata)) + continue; + if (sdata->vif.type == NL80211_IFTYPE_AP) { + /* If an AP vif is found, then disable PS + * by setting the count to zero thereby setting + * ps_sdata to NULL. 
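/*
 * Sketch of the checklist in ieee80211_powersave_allowed() above: every
 * condition has to hold before mac80211 will consider powersave for a
 * managed interface.  The struct below is a flattened stand-in for the
 * various sdata/sta fields the real function inspects.
 */
#include <stdbool.h>
#include <stdio.h>

struct mgd_state {
	bool powersave;      /* user enabled PS */
	bool broken_ap;      /* AP handed out a bogus AID */
	bool associated;
	bool have_beacon;    /* beacon IEs (and DTIM period) known */
	bool polling;        /* beacon/connection poll in progress */
	bool authorized;     /* 802.1X port open */
};

static bool powersave_allowed(const struct mgd_state *s)
{
	return s->powersave && !s->broken_ap && s->associated &&
	       s->have_beacon && !s->polling && s->authorized;
}

int main(void)
{
	struct mgd_state s = { true, false, true, true, false, true };

	printf("powersave allowed: %s\n", powersave_allowed(&s) ? "yes" : "no");
	s.polling = true;
	printf("while polling the AP: %s\n",
	       powersave_allowed(&s) ? "yes" : "no");
	return 0;
}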
+ */ + count = 0; + break; + } + if (sdata->vif.type != NL80211_IFTYPE_STATION) + continue; + found = sdata; + count++; + } + + if (count == 1 && ieee80211_powersave_allowed(found)) { + struct ieee80211_conf *conf = &local->hw.conf; + s32 beaconint_us; + + if (latency < 0) + latency = pm_qos_request(PM_QOS_NETWORK_LATENCY); + + beaconint_us = ieee80211_tu_to_usec( + found->vif.bss_conf.beacon_int); + + timeout = local->dynamic_ps_forced_timeout; + if (timeout < 0) { + /* + * Go to full PSM if the user configures a very low + * latency requirement. + * The 2000 second value is there for compatibility + * until the PM_QOS_NETWORK_LATENCY is configured + * with real values. + */ + if (latency > (1900 * USEC_PER_MSEC) && + latency != (2000 * USEC_PER_SEC)) + timeout = 0; + else + timeout = 100; + } + local->dynamic_ps_user_timeout = timeout; + if (!local->disable_dynamic_ps) + conf->dynamic_ps_timeout = + local->dynamic_ps_user_timeout; + + if (beaconint_us > latency) { + local->ps_sdata = NULL; + } else { + struct ieee80211_bss *bss; + int maxslp = 1; + u8 dtimper; + + bss = (void *)found->u.mgd.associated->priv; + dtimper = bss->dtim_period; + + /* If the TIM IE is invalid, pretend the value is 1 */ + if (!dtimper) + dtimper = 1; + else if (dtimper > 1) + maxslp = min_t(int, dtimper, + latency / beaconint_us); + + local->hw.conf.max_sleep_period = maxslp; + local->hw.conf.ps_dtim_period = dtimper; + local->ps_sdata = found; + } + } else { + local->ps_sdata = NULL; + } + + change: + ieee80211_change_ps(local); +} + +void ieee80211_dynamic_ps_disable_work(struct work_struct *work) +{ + struct ieee80211_local *local = + container_of(work, struct ieee80211_local, + dynamic_ps_disable_work); + + if (local->hw.conf.flags & IEEE80211_CONF_PS) { + local->hw.conf.flags &= ~IEEE80211_CONF_PS; + ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + } + + ieee80211_wake_queues_by_reason(&local->hw, + IEEE80211_QUEUE_STOP_REASON_PS); +} + +void ieee80211_dynamic_ps_enable_work(struct work_struct *work) +{ + struct ieee80211_local *local = + container_of(work, struct ieee80211_local, + dynamic_ps_enable_work); + struct ieee80211_sub_if_data *sdata = local->ps_sdata; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + unsigned long flags; + int q; + + /* can only happen when PS was just disabled anyway */ + if (!sdata) + return; + + if (local->hw.conf.flags & IEEE80211_CONF_PS) + return; + + /* + * transmission can be stopped by others which leads to + * dynamic_ps_timer expiry. Postpond the ps timer if it + * is not the actual idle state. 
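/*
 * Sketch of the sleep-period arithmetic in ieee80211_recalc_ps() above:
 * powersave is only used when one beacon interval fits inside the PM QoS
 * latency budget, and the device may then sleep across several beacons,
 * bounded by the DTIM period and by latency/beacon_interval.
 */
#include <stdio.h>

static int min_int(int a, int b) { return a < b ? a : b; }

int main(void)
{
	int beacon_int_tu = 100;
	int beaconint_us = beacon_int_tu * 1024;   /* 1 TU = 1024 us */
	int latency_us = 1000 * 1000;              /* hypothetical PM QoS value */
	int dtim_period = 3;
	int maxslp = 1;

	if (beaconint_us > latency_us) {
		printf("latency budget too small, powersave disabled\n");
		return 0;
	}

	if (dtim_period > 1)
		maxslp = min_int(dtim_period, latency_us / beaconint_us);

	printf("max_sleep_period=%d ps_dtim_period=%d\n", maxslp, dtim_period);
	return 0;
}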
+ */ + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + for (q = 0; q < local->hw.queues; q++) { + if (local->queue_stop_reasons[q]) { + spin_unlock_irqrestore(&local->queue_stop_reason_lock, + flags); + mod_timer(&local->dynamic_ps_timer, jiffies + + msecs_to_jiffies( + local->hw.conf.dynamic_ps_timeout)); + return; + } + } + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); + + if ((local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) && + (!(ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED))) { + netif_tx_stop_all_queues(sdata->dev); + + if (drv_tx_frames_pending(local)) + mod_timer(&local->dynamic_ps_timer, jiffies + + msecs_to_jiffies( + local->hw.conf.dynamic_ps_timeout)); + else { + ieee80211_send_nullfunc(local, sdata, 1); + /* Flush to get the tx status of nullfunc frame */ + drv_flush(local, false); + } + } + + if (!((local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) && + (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK)) || + (ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED)) { + ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED; + local->hw.conf.flags |= IEEE80211_CONF_PS; + ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + } + + netif_tx_wake_all_queues(sdata->dev); +} + +void ieee80211_dynamic_ps_timer(unsigned long data) +{ + struct ieee80211_local *local = (void *) data; + + if (local->quiescing || local->suspended) + return; + + ieee80211_queue_work(&local->hw, &local->dynamic_ps_enable_work); +} + +/* MLME */ +static void ieee80211_sta_wmm_params(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + u8 *wmm_param, size_t wmm_param_len) +{ + struct ieee80211_tx_queue_params params; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + size_t left; + int count; + u8 *pos, uapsd_queues = 0; + + if (!local->ops->conf_tx) + return; + + if (local->hw.queues < 4) + return; + + if (!wmm_param) + return; + + if (wmm_param_len < 8 || wmm_param[5] /* version */ != 1) + return; + + if (ifmgd->flags & IEEE80211_STA_UAPSD_ENABLED) + uapsd_queues = local->uapsd_queues; + + count = wmm_param[6] & 0x0f; + if (count == ifmgd->wmm_last_param_set) + return; + ifmgd->wmm_last_param_set = count; + + pos = wmm_param + 8; + left = wmm_param_len - 8; + + memset(¶ms, 0, sizeof(params)); + + local->wmm_acm = 0; + for (; left >= 4; left -= 4, pos += 4) { + int aci = (pos[0] >> 5) & 0x03; + int acm = (pos[0] >> 4) & 0x01; + bool uapsd = false; + int queue; + + switch (aci) { + case 1: /* AC_BK */ + queue = 3; + if (acm) + local->wmm_acm |= BIT(1) | BIT(2); /* BK/- */ + if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BK) + uapsd = true; + break; + case 2: /* AC_VI */ + queue = 1; + if (acm) + local->wmm_acm |= BIT(4) | BIT(5); /* CL/VI */ + if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VI) + uapsd = true; + break; + case 3: /* AC_VO */ + queue = 0; + if (acm) + local->wmm_acm |= BIT(6) | BIT(7); /* VO/NC */ + if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO) + uapsd = true; + break; + case 0: /* AC_BE */ + default: + queue = 2; + if (acm) + local->wmm_acm |= BIT(0) | BIT(3); /* BE/EE */ + if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BE) + uapsd = true; + break; + } + + params.aifs = pos[0] & 0x0f; + params.cw_max = ecw2cw((pos[1] & 0xf0) >> 4); + params.cw_min = ecw2cw(pos[1] & 0x0f); + params.txop = get_unaligned_le16(pos + 2); + params.uapsd = uapsd; + +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + wiphy_debug(local->hw.wiphy, + "WMM queue=%d aci=%d acm=%d aifs=%d " + "cWmin=%d cWmax=%d txop=%d uapsd=%d\n", + queue, aci, acm, + params.aifs, 
params.cw_min, params.cw_max, + params.txop, params.uapsd); +#endif + if (drv_conf_tx(local, queue, ¶ms)) + wiphy_debug(local->hw.wiphy, + "failed to set TX queue parameters for queue %d\n", + queue); + } + + /* enable WMM or activate new settings */ + sdata->vif.bss_conf.qos = true; + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_QOS); +} + +static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata, + u16 capab, bool erp_valid, u8 erp) +{ + struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; + u32 changed = 0; + bool use_protection; + bool use_short_preamble; + bool use_short_slot; + + if (erp_valid) { + use_protection = (erp & WLAN_ERP_USE_PROTECTION) != 0; + use_short_preamble = (erp & WLAN_ERP_BARKER_PREAMBLE) == 0; + } else { + use_protection = false; + use_short_preamble = !!(capab & WLAN_CAPABILITY_SHORT_PREAMBLE); + } + + use_short_slot = !!(capab & WLAN_CAPABILITY_SHORT_SLOT_TIME); + if (sdata->local->hw.conf.channel->band == IEEE80211_BAND_5GHZ) + use_short_slot = true; + + if (use_protection != bss_conf->use_cts_prot) { + bss_conf->use_cts_prot = use_protection; + changed |= BSS_CHANGED_ERP_CTS_PROT; + } + + if (use_short_preamble != bss_conf->use_short_preamble) { + bss_conf->use_short_preamble = use_short_preamble; + changed |= BSS_CHANGED_ERP_PREAMBLE; + } + + if (use_short_slot != bss_conf->use_short_slot) { + bss_conf->use_short_slot = use_short_slot; + changed |= BSS_CHANGED_ERP_SLOT; + } + + return changed; +} + +static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, + struct cfg80211_bss *cbss, + u32 bss_info_changed) +{ + struct ieee80211_bss *bss = (void *)cbss->priv; + struct ieee80211_local *local = sdata->local; + struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; + + bss_info_changed |= BSS_CHANGED_ASSOC; + /* set timing information */ + bss_conf->beacon_int = cbss->beacon_interval; + bss_conf->timestamp = cbss->tsf; + + bss_info_changed |= BSS_CHANGED_BEACON_INT; + bss_info_changed |= ieee80211_handle_bss_capability(sdata, + cbss->capability, bss->has_erp_value, bss->erp_value); + + sdata->u.mgd.beacon_timeout = usecs_to_jiffies(ieee80211_tu_to_usec( + IEEE80211_BEACON_LOSS_COUNT * bss_conf->beacon_int)); + + sdata->u.mgd.associated = cbss; + memcpy(sdata->u.mgd.bssid, cbss->bssid, ETH_ALEN); + + sdata->u.mgd.flags |= IEEE80211_STA_RESET_SIGNAL_AVE; + + /* just to be sure */ + sdata->u.mgd.flags &= ~(IEEE80211_STA_CONNECTION_POLL | + IEEE80211_STA_BEACON_POLL); + + ieee80211_led_assoc(local, 1); + + if (local->hw.flags & IEEE80211_HW_NEED_DTIM_PERIOD) + bss_conf->dtim_period = bss->dtim_period; + else + bss_conf->dtim_period = 0; + + bss_conf->assoc = 1; + /* + * For now just always ask the driver to update the basic rateset + * when we have associated, we aren't checking whether it actually + * changed or not. 
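/*
 * Sketch of how ieee80211_sta_wmm_params() above unpacks one 4-byte WMM
 * AC parameter record: ACI/ACM/AIFSN in the first octet, ECWmin/ECWmax
 * exponents in the second (expanded via ecw2cw()), and a little-endian
 * TXOP limit.  The sample record bytes are invented.
 */
#include <stdint.h>
#include <stdio.h>

static int ecw2cw(int ecw)
{
	return (1 << ecw) - 1;
}

int main(void)
{
	/* hypothetical AC_BE record: AIFSN 3, ECWmin 4, ECWmax 10, TXOP 0 */
	const uint8_t rec[4] = { 0x03, 0xa4, 0x00, 0x00 };

	int aci    = (rec[0] >> 5) & 0x03;
	int acm    = (rec[0] >> 4) & 0x01;
	int aifs   =  rec[0] & 0x0f;
	int cw_min = ecw2cw(rec[1] & 0x0f);
	int cw_max = ecw2cw((rec[1] & 0xf0) >> 4);
	int txop   = rec[2] | (rec[3] << 8);

	printf("aci=%d acm=%d aifs=%d cw_min=%d cw_max=%d txop=%d\n",
	       aci, acm, aifs, cw_min, cw_max, txop);
	return 0;
}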
+ */ + bss_info_changed |= BSS_CHANGED_BASIC_RATES; + + /* And the BSSID changed - we're associated now */ + bss_info_changed |= BSS_CHANGED_BSSID; + + /* Tell the driver to monitor connection quality (if supported) */ + if ((local->hw.flags & IEEE80211_HW_SUPPORTS_CQM_RSSI) && + bss_conf->cqm_rssi_thold) + bss_info_changed |= BSS_CHANGED_CQM; + + /* Enable ARP filtering */ + if (bss_conf->arp_filter_enabled != sdata->arp_filter_state) { + bss_conf->arp_filter_enabled = sdata->arp_filter_state; + bss_info_changed |= BSS_CHANGED_ARP_FILTER; + } + + ieee80211_bss_info_change_notify(sdata, bss_info_changed); + + mutex_lock(&local->iflist_mtx); + ieee80211_recalc_ps(local, -1); + ieee80211_recalc_smps(local); + mutex_unlock(&local->iflist_mtx); + + netif_tx_start_all_queues(sdata->dev); + netif_carrier_on(sdata->dev); +} + +static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, + bool remove_sta, bool tx) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + u32 changed = 0, config_changed = 0; + u8 bssid[ETH_ALEN]; + + ASSERT_MGD_MTX(ifmgd); + + if (WARN_ON(!ifmgd->associated)) + return; + + memcpy(bssid, ifmgd->associated->bssid, ETH_ALEN); + + ifmgd->associated = NULL; + memset(ifmgd->bssid, 0, ETH_ALEN); + + /* + * we need to commit the associated = NULL change because the + * scan code uses that to determine whether this iface should + * go to/wake up from powersave or not -- and could otherwise + * wake the queues erroneously. + */ + smp_mb(); + + /* + * Thus, we can only afterwards stop the queues -- to account + * for the case where another CPU is finishing a scan at this + * time -- we don't want the scan code to enable queues. + */ + + netif_tx_stop_all_queues(sdata->dev); + netif_carrier_off(sdata->dev); + + mutex_lock(&local->sta_mtx); + sta = sta_info_get(sdata, bssid); + if (sta) { + set_sta_flags(sta, WLAN_STA_BLOCK_BA); + ieee80211_sta_tear_down_BA_sessions(sta, tx); + } + mutex_unlock(&local->sta_mtx); + + changed |= ieee80211_reset_erp_info(sdata); + + ieee80211_led_assoc(local, 0); + changed |= BSS_CHANGED_ASSOC; + sdata->vif.bss_conf.assoc = false; + + ieee80211_set_wmm_default(sdata); + + /* channel(_type) changes are handled by ieee80211_hw_config */ + WARN_ON(!ieee80211_set_channel_type(local, sdata, NL80211_CHAN_NO_HT)); + + /* on the next assoc, re-program HT parameters */ + sdata->ht_opmode_valid = false; + + local->power_constr_level = 0; + + del_timer_sync(&local->dynamic_ps_timer); + cancel_work_sync(&local->dynamic_ps_enable_work); + + if (local->hw.conf.flags & IEEE80211_CONF_PS) { + local->hw.conf.flags &= ~IEEE80211_CONF_PS; + config_changed |= IEEE80211_CONF_CHANGE_PS; + } + local->ps_sdata = NULL; + + ieee80211_hw_config(local, config_changed); + + /* Disable ARP filtering */ + if (sdata->vif.bss_conf.arp_filter_enabled) { + sdata->vif.bss_conf.arp_filter_enabled = false; + changed |= BSS_CHANGED_ARP_FILTER; + } + + /* The BSSID (not really interesting) and HT changed */ + changed |= BSS_CHANGED_BSSID | BSS_CHANGED_HT; + ieee80211_bss_info_change_notify(sdata, changed); + + if (remove_sta) + sta_info_destroy_addr(sdata, bssid); + + del_timer_sync(&sdata->u.mgd.conn_mon_timer); + del_timer_sync(&sdata->u.mgd.bcn_mon_timer); + del_timer_sync(&sdata->u.mgd.timer); + del_timer_sync(&sdata->u.mgd.chswitch_timer); +} + +void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata, + struct ieee80211_hdr *hdr) +{ + /* + * We can postpone the mgd.timer whenever receiving 
unicast frames + * from AP because we know that the connection is working both ways + * at that time. But multicast frames (and hence also beacons) must + * be ignored here, because we need to trigger the timer during + * data idle periods for sending the periodic probe request to the + * AP we're connected to. + */ + if (is_multicast_ether_addr(hdr->addr1)) + return; + + ieee80211_sta_reset_conn_monitor(sdata); +} + +static void ieee80211_reset_ap_probe(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + + if (!(ifmgd->flags & (IEEE80211_STA_BEACON_POLL | + IEEE80211_STA_CONNECTION_POLL))) + return; + + ifmgd->flags &= ~(IEEE80211_STA_CONNECTION_POLL | + IEEE80211_STA_BEACON_POLL); + mutex_lock(&sdata->local->iflist_mtx); + ieee80211_recalc_ps(sdata->local, -1); + mutex_unlock(&sdata->local->iflist_mtx); + + if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR) + return; + + /* + * We've received a probe response, but are not sure whether + * we have or will be receiving any beacons or data, so let's + * schedule the timers again, just in case. + */ + ieee80211_sta_reset_beacon_monitor(sdata); + + mod_timer(&ifmgd->conn_mon_timer, + round_jiffies_up(jiffies + + IEEE80211_CONNECTION_IDLE_TIME)); +} + +void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata, + struct ieee80211_hdr *hdr, bool ack) +{ + if (!ieee80211_is_data(hdr->frame_control)) + return; + + if (ack) + ieee80211_sta_reset_conn_monitor(sdata); + + if (ieee80211_is_nullfunc(hdr->frame_control) && + sdata->u.mgd.probe_send_count > 0) { + if (ack) + sdata->u.mgd.probe_send_count = 0; + else + sdata->u.mgd.nullfunc_failed = true; + ieee80211_queue_work(&sdata->local->hw, &sdata->work); + } +} + +static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + const u8 *ssid; + u8 *dst = ifmgd->associated->bssid; + u8 unicast_limit = max(1, max_probe_tries - 3); + + /* + * Try sending broadcast probe requests for the last three + * probe requests after the first ones failed since some + * buggy APs only support broadcast probe requests. + */ + if (ifmgd->probe_send_count >= unicast_limit) + dst = NULL; + + /* + * When the hardware reports an accurate Tx ACK status, it's + * better to send a nullfunc frame instead of a probe request, + * as it will kick us off the AP quickly if we aren't associated + * anymore. The timeout will be reset if the frame is ACKed by + * the AP. 
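/*
 * Sketch of the unicast/broadcast split in ieee80211_mgd_probe_ap_send()
 * above: the first max(1, max_probe_tries - 3) probe requests go to the
 * AP's address, the remaining ones are broadcast to cope with APs that
 * only answer broadcast probes.
 */
#include <stdio.h>

static int max_int(int a, int b) { return a > b ? a : b; }

int main(void)
{
	int max_probe_tries = 5;                 /* module parameter default */
	int unicast_limit = max_int(1, max_probe_tries - 3);
	int probe_send_count;

	for (probe_send_count = 0; probe_send_count < max_probe_tries;
	     probe_send_count++)
		printf("probe %d: %s\n", probe_send_count + 1,
		       probe_send_count >= unicast_limit ?
		       "broadcast" : "unicast to BSSID");
	return 0;
}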
+ */ + if (sdata->local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) { + ifmgd->nullfunc_failed = false; + ieee80211_send_nullfunc(sdata->local, sdata, 0); + } else { + ssid = ieee80211_bss_get_ie(ifmgd->associated, WLAN_EID_SSID); + ieee80211_send_probe_req(sdata, dst, ssid + 2, ssid[1], NULL, 0); + } + + ifmgd->probe_send_count++; + ifmgd->probe_timeout = jiffies + msecs_to_jiffies(probe_wait_ms); + run_again(ifmgd, ifmgd->probe_timeout); +} + +static void ieee80211_mgd_probe_ap(struct ieee80211_sub_if_data *sdata, + bool beacon) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + bool already = false; + + if (!ieee80211_sdata_running(sdata)) + return; + + if (sdata->local->scanning) + return; + + if (sdata->local->tmp_channel) + return; + + mutex_lock(&ifmgd->mtx); + + if (!ifmgd->associated) + goto out; + +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + if (beacon && net_ratelimit()) + printk(KERN_DEBUG "%s: detected beacon loss from AP " + "- sending probe request\n", sdata->name); +#endif + + /* + * The driver/our work has already reported this event or the + * connection monitoring has kicked in and we have already sent + * a probe request. Or maybe the AP died and the driver keeps + * reporting until we disassociate... + * + * In either case we have to ignore the current call to this + * function (except for setting the correct probe reason bit) + * because otherwise we would reset the timer every time and + * never check whether we received a probe response! + */ + if (ifmgd->flags & (IEEE80211_STA_BEACON_POLL | + IEEE80211_STA_CONNECTION_POLL)) + already = true; + + if (beacon) + ifmgd->flags |= IEEE80211_STA_BEACON_POLL; + else + ifmgd->flags |= IEEE80211_STA_CONNECTION_POLL; + + if (already) + goto out; + + mutex_lock(&sdata->local->iflist_mtx); + ieee80211_recalc_ps(sdata->local, -1); + mutex_unlock(&sdata->local->iflist_mtx); + + ifmgd->probe_send_count = 0; + ieee80211_mgd_probe_ap_send(sdata); + out: + mutex_unlock(&ifmgd->mtx); +} + +struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw, + struct ieee80211_vif *vif) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + struct sk_buff *skb; + const u8 *ssid; + + if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) + return NULL; + + ASSERT_MGD_MTX(ifmgd); + + if (!ifmgd->associated) + return NULL; + + ssid = ieee80211_bss_get_ie(ifmgd->associated, WLAN_EID_SSID); + skb = ieee80211_build_probe_req(sdata, ifmgd->associated->bssid, + ssid + 2, ssid[1], NULL, 0); + + return skb; +} +EXPORT_SYMBOL(ieee80211_ap_probereq_get); + +static void __ieee80211_connection_loss(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + struct ieee80211_local *local = sdata->local; + u8 bssid[ETH_ALEN]; + + mutex_lock(&ifmgd->mtx); + if (!ifmgd->associated) { + mutex_unlock(&ifmgd->mtx); + return; + } + + memcpy(bssid, ifmgd->associated->bssid, ETH_ALEN); + + printk(KERN_DEBUG "%s: Connection to AP %pM lost.\n", + sdata->name, bssid); + + ieee80211_set_disassoc(sdata, true, true); + mutex_unlock(&ifmgd->mtx); + + mutex_lock(&local->mtx); + ieee80211_recalc_idle(local); + mutex_unlock(&local->mtx); + /* + * must be outside lock due to cfg80211, + * but that's not a problem. 
+ */ + ieee80211_send_deauth_disassoc(sdata, bssid, + IEEE80211_STYPE_DEAUTH, + WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, + NULL, true); +} + +void ieee80211_beacon_connection_loss_work(struct work_struct *work) +{ + struct ieee80211_sub_if_data *sdata = + container_of(work, struct ieee80211_sub_if_data, + u.mgd.beacon_connection_loss_work); + + if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR) + __ieee80211_connection_loss(sdata); + else + ieee80211_mgd_probe_ap(sdata, true); +} + +void ieee80211_beacon_loss(struct ieee80211_vif *vif) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_hw *hw = &sdata->local->hw; + + trace_api_beacon_loss(sdata); + + WARN_ON(hw->flags & IEEE80211_HW_CONNECTION_MONITOR); + ieee80211_queue_work(hw, &sdata->u.mgd.beacon_connection_loss_work); +} +EXPORT_SYMBOL(ieee80211_beacon_loss); + +void ieee80211_connection_loss(struct ieee80211_vif *vif) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_hw *hw = &sdata->local->hw; + + trace_api_connection_loss(sdata); + + WARN_ON(!(hw->flags & IEEE80211_HW_CONNECTION_MONITOR)); + ieee80211_queue_work(hw, &sdata->u.mgd.beacon_connection_loss_work); +} +EXPORT_SYMBOL(ieee80211_connection_loss); + + +static enum rx_mgmt_action __must_check +ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + const u8 *bssid = NULL; + u16 reason_code; + + if (len < 24 + 2) + return RX_MGMT_NONE; + + ASSERT_MGD_MTX(ifmgd); + + bssid = ifmgd->associated->bssid; + + reason_code = le16_to_cpu(mgmt->u.deauth.reason_code); + + printk(KERN_DEBUG "%s: deauthenticated from %pM (Reason: %u)\n", + sdata->name, bssid, reason_code); + + ieee80211_set_disassoc(sdata, true, false); + mutex_lock(&sdata->local->mtx); + ieee80211_recalc_idle(sdata->local); + mutex_unlock(&sdata->local->mtx); + + return RX_MGMT_CFG80211_DEAUTH; +} + + +static enum rx_mgmt_action __must_check +ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + u16 reason_code; + + if (len < 24 + 2) + return RX_MGMT_NONE; + + ASSERT_MGD_MTX(ifmgd); + + if (WARN_ON(!ifmgd->associated)) + return RX_MGMT_NONE; + + if (WARN_ON(memcmp(ifmgd->associated->bssid, mgmt->sa, ETH_ALEN))) + return RX_MGMT_NONE; + + reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code); + + printk(KERN_DEBUG "%s: disassociated from %pM (Reason: %u)\n", + sdata->name, mgmt->sa, reason_code); + + ieee80211_set_disassoc(sdata, true, false); + mutex_lock(&sdata->local->mtx); + ieee80211_recalc_idle(sdata->local); + mutex_unlock(&sdata->local->mtx); + return RX_MGMT_CFG80211_DISASSOC; +} + + +static bool ieee80211_assoc_success(struct ieee80211_work *wk, + struct ieee80211_mgmt *mgmt, size_t len) +{ + struct ieee80211_sub_if_data *sdata = wk->sdata; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + struct ieee80211_local *local = sdata->local; + struct ieee80211_supported_band *sband; + struct sta_info *sta; + struct cfg80211_bss *cbss = wk->assoc.bss; + u8 *pos; + u32 rates, basic_rates; + u16 capab_info, aid; + struct ieee802_11_elems elems; + struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; + u32 changed = 0; + int i, j, err; + bool have_higher_than_11mbit = false; + u16 ap_ht_cap_flags; + + /* AssocResp and ReassocResp have identical structure */ + + aid = le16_to_cpu(mgmt->u.assoc_resp.aid); + capab_info = 
le16_to_cpu(mgmt->u.assoc_resp.capab_info); + + if ((aid & (BIT(15) | BIT(14))) != (BIT(15) | BIT(14))) + printk(KERN_DEBUG + "%s: invalid AID value 0x%x; bits 15:14 not set\n", + sdata->name, aid); + aid &= ~(BIT(15) | BIT(14)); + + ifmgd->broken_ap = false; + + if (aid == 0 || aid > IEEE80211_MAX_AID) { + printk(KERN_DEBUG + "%s: invalid AID value %d (out of range), turn off PS\n", + sdata->name, aid); + aid = 0; + ifmgd->broken_ap = true; + } + + pos = mgmt->u.assoc_resp.variable; + ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), &elems); + + if (!elems.supp_rates) { + printk(KERN_DEBUG "%s: no SuppRates element in AssocResp\n", + sdata->name); + return false; + } + + ifmgd->aid = aid; + + sta = sta_info_alloc(sdata, cbss->bssid, GFP_KERNEL); + if (!sta) { + printk(KERN_DEBUG "%s: failed to alloc STA entry for" + " the AP\n", sdata->name); + return false; + } + + set_sta_flags(sta, WLAN_STA_AUTH | WLAN_STA_ASSOC | + WLAN_STA_ASSOC_AP); + if (!(ifmgd->flags & IEEE80211_STA_CONTROL_PORT)) + set_sta_flags(sta, WLAN_STA_AUTHORIZED); + + rates = 0; + basic_rates = 0; + sband = local->hw.wiphy->bands[wk->chan->band]; + + for (i = 0; i < elems.supp_rates_len; i++) { + int rate = (elems.supp_rates[i] & 0x7f) * 5; + bool is_basic = !!(elems.supp_rates[i] & 0x80); + + if (rate > 110) + have_higher_than_11mbit = true; + + for (j = 0; j < sband->n_bitrates; j++) { + if (sband->bitrates[j].bitrate == rate) { + rates |= BIT(j); + if (is_basic) + basic_rates |= BIT(j); + break; + } + } + } + + for (i = 0; i < elems.ext_supp_rates_len; i++) { + int rate = (elems.ext_supp_rates[i] & 0x7f) * 5; + bool is_basic = !!(elems.ext_supp_rates[i] & 0x80); + + if (rate > 110) + have_higher_than_11mbit = true; + + for (j = 0; j < sband->n_bitrates; j++) { + if (sband->bitrates[j].bitrate == rate) { + rates |= BIT(j); + if (is_basic) + basic_rates |= BIT(j); + break; + } + } + } + + sta->sta.supp_rates[wk->chan->band] = rates; + sdata->vif.bss_conf.basic_rates = basic_rates; + + /* cf. IEEE 802.11 9.2.12 */ + if (wk->chan->band == IEEE80211_BAND_2GHZ && + have_higher_than_11mbit) + sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE; + else + sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE; + + if (elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_11N)) + ieee80211_ht_cap_ie_to_sta_ht_cap(sband, + elems.ht_cap_elem, &sta->sta.ht_cap); + + ap_ht_cap_flags = sta->sta.ht_cap.cap; + + rate_control_rate_init(sta); + + if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED) + set_sta_flags(sta, WLAN_STA_MFP); + + if (elems.wmm_param) + set_sta_flags(sta, WLAN_STA_WME); + + err = sta_info_insert(sta); + sta = NULL; + if (err) { + printk(KERN_DEBUG "%s: failed to insert STA entry for" + " the AP (error %d)\n", sdata->name, err); + return false; + } + + /* + * Always handle WMM once after association regardless + * of the first value the AP uses. Setting -1 here has + * that effect because the AP values is an unsigned + * 4-bit value. 
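/*
 * Sketch of two small decodings done in ieee80211_assoc_success() above:
 * the AID field has bits 15:14 set on the air and they are masked off
 * before use, and each Supported Rates octet encodes a rate in units of
 * 500 kbit/s with the top bit marking a basic rate.  Sample values are
 * invented.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t aid_field = 0xc001;             /* hypothetical AssocResp AID */
	uint16_t aid = aid_field & (uint16_t)~(3u << 14);

	const uint8_t supp_rates[] = { 0x82, 0x84, 0x0b, 0x16, 0x24, 0x30 };
	size_t i;

	printf("aid = %u\n", aid);
	for (i = 0; i < sizeof(supp_rates); i++) {
		int rate = (supp_rates[i] & 0x7f) * 5;   /* units of 100 kbit/s */
		int basic = !!(supp_rates[i] & 0x80);

		printf("rate %2d.%d Mbit/s%s\n", rate / 10, rate % 10,
		       basic ? " (basic)" : "");
	}
	return 0;
}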
+ */ + ifmgd->wmm_last_param_set = -1; + + if (elems.wmm_param) + ieee80211_sta_wmm_params(local, sdata, elems.wmm_param, + elems.wmm_param_len); + else + ieee80211_set_wmm_default(sdata); + + local->oper_channel = wk->chan; + + if (elems.ht_info_elem && elems.wmm_param && + (sdata->local->hw.queues >= 4) && + !(ifmgd->flags & IEEE80211_STA_DISABLE_11N)) + changed |= ieee80211_enable_ht(sdata, elems.ht_info_elem, + cbss->bssid, ap_ht_cap_flags); + + /* set AID and assoc capability, + * ieee80211_set_associated() will tell the driver */ + bss_conf->aid = aid; + bss_conf->assoc_capability = capab_info; + ieee80211_set_associated(sdata, cbss, changed); + + /* + * If we're using 4-addr mode, let the AP know that we're + * doing so, so that it can create the STA VLAN on its side + */ + if (ifmgd->use_4addr) + ieee80211_send_4addr_nullfunc(local, sdata); + + /* + * Start timer to probe the connection to the AP now. + * Also start the timer that will detect beacon loss. + */ + ieee80211_sta_rx_notify(sdata, (struct ieee80211_hdr *)mgmt); + ieee80211_sta_reset_beacon_monitor(sdata); + + return true; +} + + +static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + size_t len, + struct ieee80211_rx_status *rx_status, + struct ieee802_11_elems *elems, + bool beacon) +{ + struct ieee80211_local *local = sdata->local; + int freq; + struct ieee80211_bss *bss; + struct ieee80211_channel *channel; + bool need_ps = false; + + if (sdata->u.mgd.associated) { + bss = (void *)sdata->u.mgd.associated->priv; + /* not previously set so we may need to recalc */ + need_ps = !bss->dtim_period; + } + + if (elems->ds_params && elems->ds_params_len == 1) + freq = ieee80211_channel_to_frequency(elems->ds_params[0], + rx_status->band); + else + freq = rx_status->freq; + + channel = ieee80211_get_channel(local->hw.wiphy, freq); + + if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) + return; + + bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems, + channel, beacon); + if (bss) + ieee80211_rx_bss_put(local, bss); + + if (!sdata->u.mgd.associated) + return; + + if (need_ps) { + mutex_lock(&local->iflist_mtx); + ieee80211_recalc_ps(local, -1); + mutex_unlock(&local->iflist_mtx); + } + + if (elems->ch_switch_elem && (elems->ch_switch_elem_len == 3) && + (memcmp(mgmt->bssid, sdata->u.mgd.associated->bssid, + ETH_ALEN) == 0)) { + struct ieee80211_channel_sw_ie *sw_elem = + (struct ieee80211_channel_sw_ie *)elems->ch_switch_elem; + ieee80211_sta_process_chanswitch(sdata, sw_elem, + bss, rx_status->mactime); + } +} + + +static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt = (void *)skb->data; + struct ieee80211_if_managed *ifmgd; + struct ieee80211_rx_status *rx_status = (void *) skb->cb; + size_t baselen, len = skb->len; + struct ieee802_11_elems elems; + + ifmgd = &sdata->u.mgd; + + ASSERT_MGD_MTX(ifmgd); + + if (memcmp(mgmt->da, sdata->vif.addr, ETH_ALEN)) + return; /* ignore ProbeResp to foreign address */ + + baselen = (u8 *) mgmt->u.probe_resp.variable - (u8 *) mgmt; + if (baselen > len) + return; + + ieee802_11_parse_elems(mgmt->u.probe_resp.variable, len - baselen, + &elems); + + ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, false); + + if (ifmgd->associated && + memcmp(mgmt->bssid, ifmgd->associated->bssid, ETH_ALEN) == 0) + ieee80211_reset_ap_probe(sdata); +} + +/* + * This is the canonical list of information elements we care about, + * the filter code also 
gives us all changes to the Microsoft OUI + * (00:50:F2) vendor IE which is used for WMM which we need to track. + * + * We implement beacon filtering in software since that means we can + * avoid processing the frame here and in cfg80211, and userspace + * will not be able to tell whether the hardware supports it or not. + * + * XXX: This list needs to be dynamic -- userspace needs to be able to + * add items it requires. It also needs to be able to tell us to + * look out for other vendor IEs. + */ +static const u64 care_about_ies = + (1ULL << WLAN_EID_COUNTRY) | + (1ULL << WLAN_EID_ERP_INFO) | + (1ULL << WLAN_EID_CHANNEL_SWITCH) | + (1ULL << WLAN_EID_PWR_CONSTRAINT) | + (1ULL << WLAN_EID_HT_CAPABILITY) | + (1ULL << WLAN_EID_HT_INFORMATION); + +static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + size_t len, + struct ieee80211_rx_status *rx_status) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; + size_t baselen; + struct ieee802_11_elems elems; + struct ieee80211_local *local = sdata->local; + u32 changed = 0; + bool erp_valid, directed_tim = false; + u8 erp_value = 0; + u32 ncrc; + u8 *bssid; + + ASSERT_MGD_MTX(ifmgd); + + /* Process beacon from the current BSS */ + baselen = (u8 *) mgmt->u.beacon.variable - (u8 *) mgmt; + if (baselen > len) + return; + + if (rx_status->freq != local->hw.conf.channel->center_freq) + return; + + /* + * We might have received a number of frames, among them a + * disassoc frame and a beacon... + */ + if (!ifmgd->associated) + return; + + bssid = ifmgd->associated->bssid; + + /* + * And in theory even frames from a different AP we were just + * associated to a split-second ago! + */ + if (memcmp(bssid, mgmt->bssid, ETH_ALEN) != 0) + return; + + /* Track average RSSI from the Beacon frames of the current AP */ + ifmgd->last_beacon_signal = rx_status->signal; + if (ifmgd->flags & IEEE80211_STA_RESET_SIGNAL_AVE) { + ifmgd->flags &= ~IEEE80211_STA_RESET_SIGNAL_AVE; + ifmgd->ave_beacon_signal = rx_status->signal * 16; + ifmgd->last_cqm_event_signal = 0; + ifmgd->count_beacon_signal = 1; + } else { + ifmgd->ave_beacon_signal = + (IEEE80211_SIGNAL_AVE_WEIGHT * rx_status->signal * 16 + + (16 - IEEE80211_SIGNAL_AVE_WEIGHT) * + ifmgd->ave_beacon_signal) / 16; + ifmgd->count_beacon_signal++; + } + if (bss_conf->cqm_rssi_thold && + ifmgd->count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT && + !(local->hw.flags & IEEE80211_HW_SUPPORTS_CQM_RSSI)) { + int sig = ifmgd->ave_beacon_signal / 16; + int last_event = ifmgd->last_cqm_event_signal; + int thold = bss_conf->cqm_rssi_thold; + int hyst = bss_conf->cqm_rssi_hyst; + if (sig < thold && + (last_event == 0 || sig < last_event - hyst)) { + ifmgd->last_cqm_event_signal = sig; + ieee80211_cqm_rssi_notify( + &sdata->vif, + NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW, + GFP_KERNEL); + } else if (sig > thold && + (last_event == 0 || sig > last_event + hyst)) { + ifmgd->last_cqm_event_signal = sig; + ieee80211_cqm_rssi_notify( + &sdata->vif, + NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH, + GFP_KERNEL); + } + } + + if (ifmgd->flags & IEEE80211_STA_BEACON_POLL) { +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + if (net_ratelimit()) { + printk(KERN_DEBUG "%s: cancelling probereq poll due " + "to a received beacon\n", sdata->name); + } +#endif + ifmgd->flags &= ~IEEE80211_STA_BEACON_POLL; + mutex_lock(&local->iflist_mtx); + ieee80211_recalc_ps(local, -1); + mutex_unlock(&local->iflist_mtx); + } + + /* + * Push the beacon loss 
detection into the future since + * we are processing a beacon from the AP just now. + */ + ieee80211_sta_reset_beacon_monitor(sdata); + + ncrc = crc32_be(0, (void *)&mgmt->u.beacon.beacon_int, 4); + ncrc = ieee802_11_parse_elems_crc(mgmt->u.beacon.variable, + len - baselen, &elems, + care_about_ies, ncrc); + + if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) + directed_tim = ieee80211_check_tim(elems.tim, elems.tim_len, + ifmgd->aid); + + if (ncrc != ifmgd->beacon_crc || !ifmgd->beacon_crc_valid) { + ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, + true); + + ieee80211_sta_wmm_params(local, sdata, elems.wmm_param, + elems.wmm_param_len); + } + + if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) { + if (directed_tim) { + if (local->hw.conf.dynamic_ps_timeout > 0) { + local->hw.conf.flags &= ~IEEE80211_CONF_PS; + ieee80211_hw_config(local, + IEEE80211_CONF_CHANGE_PS); + ieee80211_send_nullfunc(local, sdata, 0); + } else { + local->pspolling = true; + + /* + * Here is assumed that the driver will be + * able to send ps-poll frame and receive a + * response even though power save mode is + * enabled, but some drivers might require + * to disable power save here. This needs + * to be investigated. + */ + ieee80211_send_pspoll(local, sdata); + } + } + } + + if (ncrc == ifmgd->beacon_crc && ifmgd->beacon_crc_valid) + return; + ifmgd->beacon_crc = ncrc; + ifmgd->beacon_crc_valid = true; + + if (elems.erp_info && elems.erp_info_len >= 1) { + erp_valid = true; + erp_value = elems.erp_info[0]; + } else { + erp_valid = false; + } + changed |= ieee80211_handle_bss_capability(sdata, + le16_to_cpu(mgmt->u.beacon.capab_info), + erp_valid, erp_value); + + + if (elems.ht_cap_elem && elems.ht_info_elem && elems.wmm_param && + !(ifmgd->flags & IEEE80211_STA_DISABLE_11N)) { + struct sta_info *sta; + struct ieee80211_supported_band *sband; + u16 ap_ht_cap_flags; + + rcu_read_lock(); + + sta = sta_info_get(sdata, bssid); + if (WARN_ON(!sta)) { + rcu_read_unlock(); + return; + } + + sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; + + ieee80211_ht_cap_ie_to_sta_ht_cap(sband, + elems.ht_cap_elem, &sta->sta.ht_cap); + + ap_ht_cap_flags = sta->sta.ht_cap.cap; + + rcu_read_unlock(); + + changed |= ieee80211_enable_ht(sdata, elems.ht_info_elem, + bssid, ap_ht_cap_flags); + } + + /* Note: country IE parsing is done for us by cfg80211 */ + if (elems.country_elem) { + /* TODO: IBSS also needs this */ + if (elems.pwr_constr_elem) + ieee80211_handle_pwr_constr(sdata, + le16_to_cpu(mgmt->u.probe_resp.capab_info), + elems.pwr_constr_elem, + elems.pwr_constr_elem_len); + } + + ieee80211_bss_info_change_notify(sdata, changed); +} + +void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + struct ieee80211_rx_status *rx_status; + struct ieee80211_mgmt *mgmt; + enum rx_mgmt_action rma = RX_MGMT_NONE; + u16 fc; + + rx_status = (struct ieee80211_rx_status *) skb->cb; + mgmt = (struct ieee80211_mgmt *) skb->data; + fc = le16_to_cpu(mgmt->frame_control); + + mutex_lock(&ifmgd->mtx); + + if (ifmgd->associated && + memcmp(ifmgd->associated->bssid, mgmt->bssid, ETH_ALEN) == 0) { + switch (fc & IEEE80211_FCTL_STYPE) { + case IEEE80211_STYPE_BEACON: + ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len, + rx_status); + break; + case IEEE80211_STYPE_PROBE_RESP: + ieee80211_rx_mgmt_probe_resp(sdata, skb); + break; + case IEEE80211_STYPE_DEAUTH: + rma = ieee80211_rx_mgmt_deauth(sdata, mgmt, skb->len); + break; + case 
IEEE80211_STYPE_DISASSOC: + rma = ieee80211_rx_mgmt_disassoc(sdata, mgmt, skb->len); + break; + case IEEE80211_STYPE_ACTION: + switch (mgmt->u.action.category) { + case WLAN_CATEGORY_SPECTRUM_MGMT: + ieee80211_sta_process_chanswitch(sdata, + &mgmt->u.action.u.chan_switch.sw_elem, + (void *)ifmgd->associated->priv, + rx_status->mactime); + break; + } + } + mutex_unlock(&ifmgd->mtx); + + switch (rma) { + case RX_MGMT_NONE: + /* no action */ + break; + case RX_MGMT_CFG80211_DEAUTH: + cfg80211_send_deauth(sdata->dev, (u8 *)mgmt, skb->len); + break; + case RX_MGMT_CFG80211_DISASSOC: + cfg80211_send_disassoc(sdata->dev, (u8 *)mgmt, skb->len); + break; + default: + WARN(1, "unexpected: %d", rma); + } + return; + } + + mutex_unlock(&ifmgd->mtx); + + if (skb->len >= 24 + 2 /* mgmt + deauth reason */ && + (fc & IEEE80211_FCTL_STYPE) == IEEE80211_STYPE_DEAUTH) { + struct ieee80211_local *local = sdata->local; + struct ieee80211_work *wk; + + mutex_lock(&local->mtx); + list_for_each_entry(wk, &local->work_list, list) { + if (wk->sdata != sdata) + continue; + + if (wk->type != IEEE80211_WORK_ASSOC && + wk->type != IEEE80211_WORK_ASSOC_BEACON_WAIT) + continue; + + if (memcmp(mgmt->bssid, wk->filter_ta, ETH_ALEN)) + continue; + if (memcmp(mgmt->sa, wk->filter_ta, ETH_ALEN)) + continue; + + /* + * Printing the message only here means we can't + * spuriously print it, but it also means that it + * won't be printed when the frame comes in before + * we even tried to associate or in similar cases. + * + * Ultimately, I suspect cfg80211 should print the + * messages instead. + */ + printk(KERN_DEBUG + "%s: deauthenticated from %pM (Reason: %u)\n", + sdata->name, mgmt->bssid, + le16_to_cpu(mgmt->u.deauth.reason_code)); + + list_del_rcu(&wk->list); + free_work(wk); + break; + } + mutex_unlock(&local->mtx); + + cfg80211_send_deauth(sdata->dev, (u8 *)mgmt, skb->len); + } +} + +static void ieee80211_sta_timer(unsigned long data) +{ + struct ieee80211_sub_if_data *sdata = + (struct ieee80211_sub_if_data *) data; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + struct ieee80211_local *local = sdata->local; + + if (local->quiescing) { + set_bit(TMR_RUNNING_TIMER, &ifmgd->timers_running); + return; + } + + ieee80211_queue_work(&local->hw, &sdata->work); +} + +static void ieee80211_sta_connection_lost(struct ieee80211_sub_if_data *sdata, + u8 *bssid) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + + ifmgd->flags &= ~(IEEE80211_STA_CONNECTION_POLL | + IEEE80211_STA_BEACON_POLL); + + ieee80211_set_disassoc(sdata, true, true); + mutex_unlock(&ifmgd->mtx); + mutex_lock(&local->mtx); + ieee80211_recalc_idle(local); + mutex_unlock(&local->mtx); + /* + * must be outside lock due to cfg80211, + * but that's not a problem. 
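/*
 * A minimal standalone sketch, not from this patch, of the frame-control
 * dispatch used in ieee80211_sta_rx_queued_mgmt() above: the 16-bit frame
 * control field is read little-endian and the subtype bits (4-7) select the
 * handler.  The constant values follow the IEEE 802.11 encoding; the names
 * are local to this sketch.
 */
#include <stdint.h>
#include <stdio.h>

#define FCTL_STYPE       0x00f0
#define STYPE_BEACON     0x0080 /* subtype 8  */
#define STYPE_PROBE_RESP 0x0050 /* subtype 5  */
#define STYPE_DISASSOC   0x00a0 /* subtype 10 */
#define STYPE_DEAUTH     0x00c0 /* subtype 12 */

static const char *classify_mgmt(const uint8_t *hdr)
{
    /* frame_control is the first two octets, little-endian on the air */
    uint16_t fc = hdr[0] | (hdr[1] << 8);

    switch (fc & FCTL_STYPE) {
    case STYPE_BEACON:     return "beacon";
    case STYPE_PROBE_RESP: return "probe response";
    case STYPE_DISASSOC:   return "disassoc";
    case STYPE_DEAUTH:     return "deauth";
    default:               return "other";
    }
}

int main(void)
{
    uint8_t beacon_hdr[2] = { 0x80, 0x00 }; /* mgmt frame, subtype 8 */
    printf("%s\n", classify_mgmt(beacon_hdr));
    return 0;
}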
+ */ + ieee80211_send_deauth_disassoc(sdata, bssid, + IEEE80211_STYPE_DEAUTH, + WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, + NULL, true); + mutex_lock(&ifmgd->mtx); +} + +void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + + /* then process the rest of the work */ + mutex_lock(&ifmgd->mtx); + + if (ifmgd->flags & (IEEE80211_STA_BEACON_POLL | + IEEE80211_STA_CONNECTION_POLL) && + ifmgd->associated) { + u8 bssid[ETH_ALEN]; + int max_tries; + + memcpy(bssid, ifmgd->associated->bssid, ETH_ALEN); + + if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) + max_tries = max_nullfunc_tries; + else + max_tries = max_probe_tries; + + /* ACK received for nullfunc probing frame */ + if (!ifmgd->probe_send_count) + ieee80211_reset_ap_probe(sdata); + else if (ifmgd->nullfunc_failed) { + if (ifmgd->probe_send_count < max_tries) { +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + wiphy_debug(local->hw.wiphy, + "%s: No ack for nullfunc frame to" + " AP %pM, try %d/%i\n", + sdata->name, bssid, + ifmgd->probe_send_count, max_tries); +#endif + ieee80211_mgd_probe_ap_send(sdata); + } else { +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + wiphy_debug(local->hw.wiphy, + "%s: No ack for nullfunc frame to" + " AP %pM, disconnecting.\n", + sdata->name, bssid); +#endif + ieee80211_sta_connection_lost(sdata, bssid); + } + } else if (time_is_after_jiffies(ifmgd->probe_timeout)) + run_again(ifmgd, ifmgd->probe_timeout); + else if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) { +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + wiphy_debug(local->hw.wiphy, + "%s: Failed to send nullfunc to AP %pM" + " after %dms, disconnecting.\n", + sdata->name, + bssid, probe_wait_ms); +#endif + ieee80211_sta_connection_lost(sdata, bssid); + } else if (ifmgd->probe_send_count < max_tries) { +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + wiphy_debug(local->hw.wiphy, + "%s: No probe response from AP %pM" + " after %dms, try %d/%i\n", + sdata->name, + bssid, probe_wait_ms, + ifmgd->probe_send_count, max_tries); +#endif + ieee80211_mgd_probe_ap_send(sdata); + } else { + /* + * We actually lost the connection ... or did we? + * Let's make sure! 
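/*
 * A small standalone model, not from this patch, of the probe/nullfunc retry
 * decision in ieee80211_sta_work() above: an ACK for the poll frame resets
 * the monitor, a failed attempt is retried up to a maximum number of tries,
 * and exhausting that budget is treated as connection loss.  The limits used
 * in main() are arbitrary placeholders, not the driver defaults.
 */
#include <stdbool.h>
#include <stdio.h>

enum probe_action { PROBE_RESET, PROBE_SEND_AGAIN, PROBE_DISCONNECT };

static enum probe_action probe_decision(bool acked, bool last_failed,
                                        int sent, int max_tries)
{
    if (acked)
        return PROBE_RESET;        /* connection confirmed alive */
    if (!last_failed)
        return PROBE_SEND_AGAIN;   /* still waiting on the current poll */
    if (sent < max_tries)
        return PROBE_SEND_AGAIN;   /* retry the poll frame */
    return PROBE_DISCONNECT;       /* give up on the AP */
}

int main(void)
{
    /* e.g. a third failed nullfunc with a budget of two tries */
    printf("%d\n", probe_decision(false, true, 3, 2) == PROBE_DISCONNECT);
    return 0;
}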
+ */ + wiphy_debug(local->hw.wiphy, + "%s: No probe response from AP %pM" + " after %dms, disconnecting.\n", + sdata->name, + bssid, probe_wait_ms); + + ieee80211_sta_connection_lost(sdata, bssid); + } + } + + mutex_unlock(&ifmgd->mtx); +} + +static void ieee80211_sta_bcn_mon_timer(unsigned long data) +{ + struct ieee80211_sub_if_data *sdata = + (struct ieee80211_sub_if_data *) data; + struct ieee80211_local *local = sdata->local; + + if (local->quiescing) + return; + + ieee80211_queue_work(&sdata->local->hw, + &sdata->u.mgd.beacon_connection_loss_work); +} + +static void ieee80211_sta_conn_mon_timer(unsigned long data) +{ + struct ieee80211_sub_if_data *sdata = + (struct ieee80211_sub_if_data *) data; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + struct ieee80211_local *local = sdata->local; + + if (local->quiescing) + return; + + ieee80211_queue_work(&local->hw, &ifmgd->monitor_work); +} + +static void ieee80211_sta_monitor_work(struct work_struct *work) +{ + struct ieee80211_sub_if_data *sdata = + container_of(work, struct ieee80211_sub_if_data, + u.mgd.monitor_work); + + ieee80211_mgd_probe_ap(sdata, false); +} + +static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata) +{ + if (sdata->vif.type == NL80211_IFTYPE_STATION) { + sdata->u.mgd.flags &= ~(IEEE80211_STA_BEACON_POLL | + IEEE80211_STA_CONNECTION_POLL); + + /* let's probe the connection once */ + ieee80211_queue_work(&sdata->local->hw, + &sdata->u.mgd.monitor_work); + /* and do all the other regular work too */ + ieee80211_queue_work(&sdata->local->hw, &sdata->work); + } +} + +#ifdef CONFIG_PM +void ieee80211_sta_quiesce(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + + /* + * we need to use atomic bitops for the running bits + * only because both timers might fire at the same + * time -- the code here is properly synchronised. 
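/*
 * A standalone sketch, not from this patch, of the quiesce/restart pattern
 * used below: if a timer was still pending when it was stopped, that fact is
 * recorded in a per-interface bitmask so that ieee80211_sta_restart() can
 * re-arm exactly the timers that were running.  C11 atomics stand in for the
 * kernel's set_bit()/test_and_clear_bit(); the bit names are local here.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define TMR_MAIN   (1u << 0)
#define TMR_CHANSW (1u << 1)

static atomic_uint timers_running;

static void quiesce_timer(bool was_pending, unsigned int bit)
{
    if (was_pending)                       /* like del_timer_sync() == 1 */
        atomic_fetch_or(&timers_running, bit);
}

static bool should_rearm(unsigned int bit)
{
    /* test-and-clear: re-arm only once per suspend cycle */
    return atomic_fetch_and(&timers_running, ~bit) & bit;
}

int main(void)
{
    quiesce_timer(true, TMR_MAIN);
    quiesce_timer(false, TMR_CHANSW);
    printf("rearm main=%d chansw=%d\n",
           should_rearm(TMR_MAIN), should_rearm(TMR_CHANSW));
    return 0;
}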
+ */ + + cancel_work_sync(&ifmgd->request_smps_work); + + cancel_work_sync(&ifmgd->beacon_connection_loss_work); + if (del_timer_sync(&ifmgd->timer)) + set_bit(TMR_RUNNING_TIMER, &ifmgd->timers_running); + + cancel_work_sync(&ifmgd->chswitch_work); + if (del_timer_sync(&ifmgd->chswitch_timer)) + set_bit(TMR_RUNNING_CHANSW, &ifmgd->timers_running); + + cancel_work_sync(&ifmgd->monitor_work); + /* these will just be re-established on connection */ + del_timer_sync(&ifmgd->conn_mon_timer); + del_timer_sync(&ifmgd->bcn_mon_timer); +} + +void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + + if (!ifmgd->associated) + return; + + if (test_and_clear_bit(TMR_RUNNING_TIMER, &ifmgd->timers_running)) + add_timer(&ifmgd->timer); + if (test_and_clear_bit(TMR_RUNNING_CHANSW, &ifmgd->timers_running)) + add_timer(&ifmgd->chswitch_timer); + ieee80211_sta_reset_beacon_monitor(sdata); + ieee80211_restart_sta_timer(sdata); +} +#endif + +/* interface setup */ +void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_managed *ifmgd; + + ifmgd = &sdata->u.mgd; + INIT_WORK(&ifmgd->monitor_work, ieee80211_sta_monitor_work); + INIT_WORK(&ifmgd->chswitch_work, ieee80211_chswitch_work); + INIT_WORK(&ifmgd->beacon_connection_loss_work, + ieee80211_beacon_connection_loss_work); + INIT_WORK(&ifmgd->request_smps_work, ieee80211_request_smps_work); + setup_timer(&ifmgd->timer, ieee80211_sta_timer, + (unsigned long) sdata); + setup_timer(&ifmgd->bcn_mon_timer, ieee80211_sta_bcn_mon_timer, + (unsigned long) sdata); + setup_timer(&ifmgd->conn_mon_timer, ieee80211_sta_conn_mon_timer, + (unsigned long) sdata); + setup_timer(&ifmgd->chswitch_timer, ieee80211_chswitch_timer, + (unsigned long) sdata); + + ifmgd->flags = 0; + + mutex_init(&ifmgd->mtx); + + if (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS) + ifmgd->req_smps = IEEE80211_SMPS_AUTOMATIC; + else + ifmgd->req_smps = IEEE80211_SMPS_OFF; +} + +/* scan finished notification */ +void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local) +{ + struct ieee80211_sub_if_data *sdata = local->scan_sdata; + + /* Restart STA timers */ + rcu_read_lock(); + list_for_each_entry_rcu(sdata, &local->interfaces, list) + ieee80211_restart_sta_timer(sdata); + rcu_read_unlock(); +} + +int ieee80211_max_network_latency(struct notifier_block *nb, + unsigned long data, void *dummy) +{ + s32 latency_usec = (s32) data; + struct ieee80211_local *local = + container_of(nb, struct ieee80211_local, + network_latency_notifier); + + mutex_lock(&local->iflist_mtx); + ieee80211_recalc_ps(local, latency_usec); + mutex_unlock(&local->iflist_mtx); + + return 0; +} + +/* config hooks */ +static enum work_done_result +ieee80211_probe_auth_done(struct ieee80211_work *wk, + struct sk_buff *skb) +{ + if (!skb) { + cfg80211_send_auth_timeout(wk->sdata->dev, wk->filter_ta); + return WORK_DONE_DESTROY; + } + + if (wk->type == IEEE80211_WORK_AUTH) { + cfg80211_send_rx_auth(wk->sdata->dev, skb->data, skb->len); + return WORK_DONE_DESTROY; + } + + mutex_lock(&wk->sdata->u.mgd.mtx); + ieee80211_rx_mgmt_probe_resp(wk->sdata, skb); + mutex_unlock(&wk->sdata->u.mgd.mtx); + + wk->type = IEEE80211_WORK_AUTH; + wk->probe_auth.tries = 0; + return WORK_DONE_REQUEUE; +} + +int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, + struct cfg80211_auth_request *req) +{ + const u8 *ssid; + struct ieee80211_work *wk; + u16 auth_alg; + + if (req->local_state_change) + return 0; /* no need to 
update mac80211 state */ + + switch (req->auth_type) { + case NL80211_AUTHTYPE_OPEN_SYSTEM: + auth_alg = WLAN_AUTH_OPEN; + break; + case NL80211_AUTHTYPE_SHARED_KEY: + if (IS_ERR(sdata->local->wep_tx_tfm)) + return -EOPNOTSUPP; + auth_alg = WLAN_AUTH_SHARED_KEY; + break; + case NL80211_AUTHTYPE_FT: + auth_alg = WLAN_AUTH_FT; + break; + case NL80211_AUTHTYPE_NETWORK_EAP: + auth_alg = WLAN_AUTH_LEAP; + break; + default: + return -EOPNOTSUPP; + } + + wk = kzalloc(sizeof(*wk) + req->ie_len, GFP_KERNEL); + if (!wk) + return -ENOMEM; + + memcpy(wk->filter_ta, req->bss->bssid, ETH_ALEN); + + if (req->ie && req->ie_len) { + memcpy(wk->ie, req->ie, req->ie_len); + wk->ie_len = req->ie_len; + } + + if (req->key && req->key_len) { + wk->probe_auth.key_len = req->key_len; + wk->probe_auth.key_idx = req->key_idx; + memcpy(wk->probe_auth.key, req->key, req->key_len); + } + + ssid = ieee80211_bss_get_ie(req->bss, WLAN_EID_SSID); + memcpy(wk->probe_auth.ssid, ssid + 2, ssid[1]); + wk->probe_auth.ssid_len = ssid[1]; + + wk->probe_auth.algorithm = auth_alg; + wk->probe_auth.privacy = req->bss->capability & WLAN_CAPABILITY_PRIVACY; + + /* if we already have a probe, don't probe again */ + if (req->bss->proberesp_ies) + wk->type = IEEE80211_WORK_AUTH; + else + wk->type = IEEE80211_WORK_DIRECT_PROBE; + wk->chan = req->bss->channel; + wk->chan_type = NL80211_CHAN_NO_HT; + wk->sdata = sdata; + wk->done = ieee80211_probe_auth_done; + + ieee80211_add_work(wk); + return 0; +} + +static enum work_done_result ieee80211_assoc_done(struct ieee80211_work *wk, + struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt; + struct ieee80211_rx_status *rx_status; + struct ieee802_11_elems elems; + u16 status; + + if (!skb) { + cfg80211_send_assoc_timeout(wk->sdata->dev, wk->filter_ta); + return WORK_DONE_DESTROY; + } + + if (wk->type == IEEE80211_WORK_ASSOC_BEACON_WAIT) { + mutex_lock(&wk->sdata->u.mgd.mtx); + rx_status = (void *) skb->cb; + ieee802_11_parse_elems(skb->data + 24 + 12, skb->len - 24 - 12, &elems); + ieee80211_rx_bss_info(wk->sdata, (void *)skb->data, skb->len, rx_status, + &elems, true); + mutex_unlock(&wk->sdata->u.mgd.mtx); + + wk->type = IEEE80211_WORK_ASSOC; + /* not really done yet */ + return WORK_DONE_REQUEUE; + } + + mgmt = (void *)skb->data; + status = le16_to_cpu(mgmt->u.assoc_resp.status_code); + + if (status == WLAN_STATUS_SUCCESS) { + mutex_lock(&wk->sdata->u.mgd.mtx); + if (!ieee80211_assoc_success(wk, mgmt, skb->len)) { + mutex_unlock(&wk->sdata->u.mgd.mtx); + /* oops -- internal error -- send timeout for now */ + cfg80211_send_assoc_timeout(wk->sdata->dev, + wk->filter_ta); + return WORK_DONE_DESTROY; + } + + mutex_unlock(&wk->sdata->u.mgd.mtx); + } + + cfg80211_send_rx_assoc(wk->sdata->dev, skb->data, skb->len); + return WORK_DONE_DESTROY; +} + +int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, + struct cfg80211_assoc_request *req) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + struct ieee80211_bss *bss = (void *)req->bss->priv; + struct ieee80211_work *wk; + const u8 *ssid; + int i; + + mutex_lock(&ifmgd->mtx); + if (ifmgd->associated) { + if (!req->prev_bssid || + memcmp(req->prev_bssid, ifmgd->associated->bssid, + ETH_ALEN)) { + /* + * We are already associated and the request was not a + * reassociation request from the current BSS, so + * reject it. 
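/*
 * An illustrative standalone mapping, not from this patch, mirroring the
 * switch in ieee80211_mgd_auth() above: the auth type requested by userspace
 * through cfg80211 is translated to the on-air authentication algorithm
 * number, and anything unknown is rejected.  The numeric values follow
 * IEEE 802.11 (0 = open, 1 = shared key, 2 = fast transition, 128 = LEAP);
 * the enum and helper name are local to this sketch.
 */
#include <stdio.h>

enum authtype { AUTH_OPEN, AUTH_SHARED, AUTH_FT, AUTH_LEAP };

static int authtype_to_alg(enum authtype t)
{
    switch (t) {
    case AUTH_OPEN:   return 0;    /* WLAN_AUTH_OPEN */
    case AUTH_SHARED: return 1;    /* WLAN_AUTH_SHARED_KEY */
    case AUTH_FT:     return 2;    /* WLAN_AUTH_FT */
    case AUTH_LEAP:   return 128;  /* WLAN_AUTH_LEAP */
    default:          return -1;   /* rejected (-EOPNOTSUPP in the code above) */
    }
}

int main(void)
{
    printf("shared key -> alg %d\n", authtype_to_alg(AUTH_SHARED));
    return 0;
}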
+ */ + mutex_unlock(&ifmgd->mtx); + return -EALREADY; + } + + /* Trying to reassociate - clear previous association state */ + ieee80211_set_disassoc(sdata, true, false); + } + mutex_unlock(&ifmgd->mtx); + + wk = kzalloc(sizeof(*wk) + req->ie_len, GFP_KERNEL); + if (!wk) + return -ENOMEM; + + ifmgd->flags &= ~IEEE80211_STA_DISABLE_11N; + ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED; + + ifmgd->beacon_crc_valid = false; + + for (i = 0; i < req->crypto.n_ciphers_pairwise; i++) + if (req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP40 || + req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_TKIP || + req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP104) + ifmgd->flags |= IEEE80211_STA_DISABLE_11N; + + + if (req->ie && req->ie_len) { + memcpy(wk->ie, req->ie, req->ie_len); + wk->ie_len = req->ie_len; + } else + wk->ie_len = 0; + + wk->assoc.bss = req->bss; + + memcpy(wk->filter_ta, req->bss->bssid, ETH_ALEN); + + /* new association always uses requested smps mode */ + if (ifmgd->req_smps == IEEE80211_SMPS_AUTOMATIC) { + if (ifmgd->powersave) + ifmgd->ap_smps = IEEE80211_SMPS_DYNAMIC; + else + ifmgd->ap_smps = IEEE80211_SMPS_OFF; + } else + ifmgd->ap_smps = ifmgd->req_smps; + + wk->assoc.smps = ifmgd->ap_smps; + /* + * IEEE802.11n does not allow TKIP/WEP as pairwise ciphers in HT mode. + * We still associate in non-HT mode (11a/b/g) if any one of these + * ciphers is configured as pairwise. + * We can set this to true for non-11n hardware, that'll be checked + * separately along with the peer capabilities. + */ + wk->assoc.use_11n = !(ifmgd->flags & IEEE80211_STA_DISABLE_11N); + wk->assoc.capability = req->bss->capability; + wk->assoc.wmm_used = bss->wmm_used; + wk->assoc.supp_rates = bss->supp_rates; + wk->assoc.supp_rates_len = bss->supp_rates_len; + wk->assoc.ht_information_ie = + ieee80211_bss_get_ie(req->bss, WLAN_EID_HT_INFORMATION); + + if (bss->wmm_used && bss->uapsd_supported && + (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_UAPSD)) { + wk->assoc.uapsd_used = true; + ifmgd->flags |= IEEE80211_STA_UAPSD_ENABLED; + } else { + wk->assoc.uapsd_used = false; + ifmgd->flags &= ~IEEE80211_STA_UAPSD_ENABLED; + } + + ssid = ieee80211_bss_get_ie(req->bss, WLAN_EID_SSID); + memcpy(wk->assoc.ssid, ssid + 2, ssid[1]); + wk->assoc.ssid_len = ssid[1]; + + if (req->prev_bssid) + memcpy(wk->assoc.prev_bssid, req->prev_bssid, ETH_ALEN); + + wk->chan = req->bss->channel; + wk->chan_type = NL80211_CHAN_NO_HT; + wk->sdata = sdata; + wk->done = ieee80211_assoc_done; + if (!bss->dtim_period && + sdata->local->hw.flags & IEEE80211_HW_NEED_DTIM_PERIOD) + wk->type = IEEE80211_WORK_ASSOC_BEACON_WAIT; + else + wk->type = IEEE80211_WORK_ASSOC; + + if (req->use_mfp) { + ifmgd->mfp = IEEE80211_MFP_REQUIRED; + ifmgd->flags |= IEEE80211_STA_MFP_ENABLED; + } else { + ifmgd->mfp = IEEE80211_MFP_DISABLED; + ifmgd->flags &= ~IEEE80211_STA_MFP_ENABLED; + } + + if (req->crypto.control_port) + ifmgd->flags |= IEEE80211_STA_CONTROL_PORT; + else + ifmgd->flags &= ~IEEE80211_STA_CONTROL_PORT; + + sdata->control_port_protocol = req->crypto.control_port_ethertype; + sdata->control_port_no_encrypt = req->crypto.control_port_no_encrypt; + + ieee80211_add_work(wk); + return 0; +} + +int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, + struct cfg80211_deauth_request *req, + void *cookie) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + struct ieee80211_work *wk; + u8 bssid[ETH_ALEN]; + bool assoc_bss = false; + + mutex_lock(&ifmgd->mtx); + + 
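/*
 * A standalone sketch, not from this patch, of the HT-disable rule applied in
 * ieee80211_mgd_assoc() above: 802.11n does not allow TKIP or WEP as the
 * pairwise cipher, so if any requested pairwise cipher is WEP40, WEP104 or
 * TKIP the association falls back to non-HT operation.  The cipher suite
 * selectors are the standard 00-0F-AC values; the helper name is local here.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define CIPHER_WEP40  0x000FAC01u
#define CIPHER_TKIP   0x000FAC02u
#define CIPHER_WEP104 0x000FAC05u

static bool ht_allowed(const uint32_t *pairwise, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++)
        if (pairwise[i] == CIPHER_WEP40 ||
            pairwise[i] == CIPHER_TKIP ||
            pairwise[i] == CIPHER_WEP104)
            return false;   /* would set IEEE80211_STA_DISABLE_11N */
    return true;
}

int main(void)
{
    uint32_t ciphers[] = { CIPHER_TKIP };
    printf("HT allowed: %d\n", ht_allowed(ciphers, 1));
    return 0;
}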
memcpy(bssid, req->bss->bssid, ETH_ALEN); + if (ifmgd->associated == req->bss) { + ieee80211_set_disassoc(sdata, false, true); + mutex_unlock(&ifmgd->mtx); + assoc_bss = true; + } else { + bool not_auth_yet = false; + + mutex_unlock(&ifmgd->mtx); + + mutex_lock(&local->mtx); + list_for_each_entry(wk, &local->work_list, list) { + if (wk->sdata != sdata) + continue; + + if (wk->type != IEEE80211_WORK_DIRECT_PROBE && + wk->type != IEEE80211_WORK_AUTH && + wk->type != IEEE80211_WORK_ASSOC && + wk->type != IEEE80211_WORK_ASSOC_BEACON_WAIT) + continue; + + if (memcmp(req->bss->bssid, wk->filter_ta, ETH_ALEN)) + continue; + + not_auth_yet = wk->type == IEEE80211_WORK_DIRECT_PROBE; + list_del_rcu(&wk->list); + free_work(wk); + break; + } + mutex_unlock(&local->mtx); + + /* + * If somebody requests authentication and we haven't + * sent out an auth frame yet there's no need to send + * out a deauth frame either. If the state was PROBE, + * then this is the case. If it's AUTH we have sent a + * frame, and if it's IDLE we have completed the auth + * process already. + */ + if (not_auth_yet) { + __cfg80211_auth_canceled(sdata->dev, bssid); + return 0; + } + } + + printk(KERN_DEBUG "%s: deauthenticating from %pM by local choice (reason=%d)\n", + sdata->name, bssid, req->reason_code); + + ieee80211_send_deauth_disassoc(sdata, bssid, IEEE80211_STYPE_DEAUTH, + req->reason_code, cookie, + !req->local_state_change); + if (assoc_bss) + sta_info_destroy_addr(sdata, bssid); + + mutex_lock(&sdata->local->mtx); + ieee80211_recalc_idle(sdata->local); + mutex_unlock(&sdata->local->mtx); + + return 0; +} + +int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata, + struct cfg80211_disassoc_request *req, + void *cookie) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + u8 bssid[ETH_ALEN]; + + mutex_lock(&ifmgd->mtx); + + /* + * cfg80211 should catch this ... but it's racy since + * we can receive a disassoc frame, process it, hand it + * to cfg80211 while that's in a locked section already + * trying to tell us that the user wants to disconnect. + */ + if (ifmgd->associated != req->bss) { + mutex_unlock(&ifmgd->mtx); + return -ENOLINK; + } + + printk(KERN_DEBUG "%s: disassociating from %pM by local choice (reason=%d)\n", + sdata->name, req->bss->bssid, req->reason_code); + + memcpy(bssid, req->bss->bssid, ETH_ALEN); + ieee80211_set_disassoc(sdata, false, true); + + mutex_unlock(&ifmgd->mtx); + + ieee80211_send_deauth_disassoc(sdata, req->bss->bssid, + IEEE80211_STYPE_DISASSOC, req->reason_code, + cookie, !req->local_state_change); + sta_info_destroy_addr(sdata, bssid); + + mutex_lock(&sdata->local->mtx); + ieee80211_recalc_idle(sdata->local); + mutex_unlock(&sdata->local->mtx); + + return 0; +} + +void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif, + enum nl80211_cqm_rssi_threshold_event rssi_event, + gfp_t gfp) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + + trace_api_cqm_rssi_notify(sdata, rssi_event); + + cfg80211_cqm_rssi_notify(sdata->dev, rssi_event, gfp); +} +EXPORT_SYMBOL(ieee80211_cqm_rssi_notify); diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c new file mode 100644 index 00000000..c55eb9d8 --- /dev/null +++ b/net/mac80211/offchannel.c @@ -0,0 +1,299 @@ +/* + * Off-channel operation helpers + * + * Copyright 2003, Jouni Malinen + * Copyright 2004, Instant802 Networks, Inc. + * Copyright 2005, Devicescape Software, Inc. 
+ * Copyright 2006-2007 Jiri Benc + * Copyright 2007, Michael Wu + * Copyright 2009 Johannes Berg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include "ieee80211_i.h" +#include "driver-trace.h" + +/* + * Tell our hardware to disable PS. + * Optionally inform AP that we will go to sleep so that it will buffer + * the frames while we are doing off-channel work. This is optional + * because we *may* be doing work on-operating channel, and want our + * hardware unconditionally awake, but still let the AP send us normal frames. + */ +static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata, + bool tell_ap) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + + local->offchannel_ps_enabled = false; + + /* FIXME: what to do when local->pspolling is true? */ + + del_timer_sync(&local->dynamic_ps_timer); + del_timer_sync(&ifmgd->bcn_mon_timer); + del_timer_sync(&ifmgd->conn_mon_timer); + + cancel_work_sync(&local->dynamic_ps_enable_work); + + if (local->hw.conf.flags & IEEE80211_CONF_PS) { + local->offchannel_ps_enabled = true; + local->hw.conf.flags &= ~IEEE80211_CONF_PS; + ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + } + + if (tell_ap && (!local->offchannel_ps_enabled || + !(local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK))) + /* + * If power save was enabled, no need to send a nullfunc + * frame because AP knows that we are sleeping. But if the + * hardware is creating the nullfunc frame for power save + * status (ie. IEEE80211_HW_PS_NULLFUNC_STACK is not + * enabled) and power save was enabled, the firmware just + * sent a null frame with power save disabled. So we need + * to send a new nullfunc frame to inform the AP that we + * are again sleeping. + */ + ieee80211_send_nullfunc(local, sdata, 1); +} + +/* inform AP that we are awake again, unless power save is enabled */ +static void ieee80211_offchannel_ps_disable(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + + if (!local->ps_sdata) + ieee80211_send_nullfunc(local, sdata, 0); + else if (local->offchannel_ps_enabled) { + /* + * In !IEEE80211_HW_PS_NULLFUNC_STACK case the hardware + * will send a nullfunc frame with the powersave bit set + * even though the AP already knows that we are sleeping. + * This could be avoided by sending a null frame with power + * save bit disabled before enabling the power save, but + * this doesn't gain anything. + * + * When IEEE80211_HW_PS_NULLFUNC_STACK is enabled, no need + * to send a nullfunc frame because AP already knows that + * we are sleeping, let's just enable power save mode in + * hardware. + */ + /* TODO: Only set hardware if CONF_PS changed? + * TODO: Should we set offchannel_ps_enabled to false? + */ + local->hw.conf.flags |= IEEE80211_CONF_PS; + ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + } else if (local->hw.conf.dynamic_ps_timeout > 0) { + /* + * If IEEE80211_CONF_PS was not set and the dynamic_ps_timer + * had been running before leaving the operating channel, + * restart the timer now and send a nullfunc frame to inform + * the AP that we are awake. 
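/*
 * A small standalone model, not from this patch, of the off-channel
 * powersave handshake in ieee80211_offchannel_ps_enable()/_disable() above:
 * before leaving the operating channel the STA may tell the AP it is asleep
 * (nullfunc with the PM bit set) so frames get buffered, and on return it
 * either tells the AP it is awake again (PM bit clear) or simply re-enables
 * powersave.  The booleans mirror the conditions used above; the helper
 * names are local to this sketch.
 */
#include <stdbool.h>
#include <stdio.h>

/* returns true if a nullfunc with powersave=1 should go out before leaving */
static bool tell_ap_sleeping(bool tell_ap, bool ps_was_enabled,
                             bool hw_builds_nullfunc)
{
    /*
     * If powersave was already on and the stack builds the nullfunc
     * frames, the AP has been told already; otherwise announce sleep now.
     */
    return tell_ap && (!ps_was_enabled || hw_builds_nullfunc);
}

/* returns true if a nullfunc with powersave=0 should go out after returning */
static bool tell_ap_awake(bool sw_ps_active)
{
    return !sw_ps_active;   /* only if powersave is not in use */
}

int main(void)
{
    printf("%d %d\n", tell_ap_sleeping(true, true, false),
           tell_ap_awake(false));
    return 0;
}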
+ */ + ieee80211_send_nullfunc(local, sdata, 0); + mod_timer(&local->dynamic_ps_timer, jiffies + + msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout)); + } + + ieee80211_sta_reset_beacon_monitor(sdata); + ieee80211_sta_reset_conn_monitor(sdata); +} + +void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local, + bool offchannel_ps_enable) +{ + struct ieee80211_sub_if_data *sdata; + + /* + * notify the AP about us leaving the channel and stop all + * STA interfaces. + */ + mutex_lock(&local->iflist_mtx); + list_for_each_entry(sdata, &local->interfaces, list) { + if (!ieee80211_sdata_running(sdata)) + continue; + + if (sdata->vif.type != NL80211_IFTYPE_MONITOR) + set_bit(SDATA_STATE_OFFCHANNEL, &sdata->state); + + /* Check to see if we should disable beaconing. */ + if (sdata->vif.type == NL80211_IFTYPE_AP || + sdata->vif.type == NL80211_IFTYPE_ADHOC || + sdata->vif.type == NL80211_IFTYPE_MESH_POINT) + ieee80211_bss_info_change_notify( + sdata, BSS_CHANGED_BEACON_ENABLED); + + if (sdata->vif.type != NL80211_IFTYPE_MONITOR) { + netif_tx_stop_all_queues(sdata->dev); + if (offchannel_ps_enable && + (sdata->vif.type == NL80211_IFTYPE_STATION) && + sdata->u.mgd.associated) + ieee80211_offchannel_ps_enable(sdata, true); + } + } + mutex_unlock(&local->iflist_mtx); +} + +void ieee80211_offchannel_enable_all_ps(struct ieee80211_local *local, + bool tell_ap) +{ + struct ieee80211_sub_if_data *sdata; + + mutex_lock(&local->iflist_mtx); + list_for_each_entry(sdata, &local->interfaces, list) { + if (!ieee80211_sdata_running(sdata)) + continue; + + if (sdata->vif.type == NL80211_IFTYPE_STATION && + sdata->u.mgd.associated) + ieee80211_offchannel_ps_enable(sdata, tell_ap); + } + mutex_unlock(&local->iflist_mtx); +} + +void ieee80211_offchannel_return(struct ieee80211_local *local, + bool enable_beaconing, + bool offchannel_ps_disable) +{ + struct ieee80211_sub_if_data *sdata; + + mutex_lock(&local->iflist_mtx); + list_for_each_entry(sdata, &local->interfaces, list) { + if (!ieee80211_sdata_running(sdata)) + continue; + + /* Tell AP we're back */ + if (offchannel_ps_disable && + sdata->vif.type == NL80211_IFTYPE_STATION) { + if (sdata->u.mgd.associated) + ieee80211_offchannel_ps_disable(sdata); + } + + if (sdata->vif.type != NL80211_IFTYPE_MONITOR) { + clear_bit(SDATA_STATE_OFFCHANNEL, &sdata->state); + /* + * This may wake up queues even though the driver + * currently has them stopped. This is not very + * likely, since the driver won't have gotten any + * (or hardly any) new packets while we weren't + * on the right channel, and even if it happens + * it will at most lead to queueing up one more + * packet per queue in mac80211 rather than on + * the interface qdisc. 
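/*
 * A standalone sketch, not from this patch, of the symmetric off-channel
 * bookkeeping in ieee80211_offchannel_stop_vifs()/_return() above: leaving
 * the channel marks every running non-monitor interface as off-channel and
 * stops its TX queues, and returning clears the flag and wakes them again.
 * The struct and helpers below are local to the sketch, not mac80211 types.
 */
#include <stdbool.h>
#include <stdio.h>

struct vif {
    const char *name;
    bool running;
    bool monitor;
    bool offchannel;       /* models SDATA_STATE_OFFCHANNEL */
    bool queues_stopped;
};

static void offchannel_stop(struct vif *v, int n)
{
    for (int i = 0; i < n; i++) {
        if (!v[i].running || v[i].monitor)
            continue;
        v[i].offchannel = true;
        v[i].queues_stopped = true;    /* netif_tx_stop_all_queues() */
    }
}

static void offchannel_return(struct vif *v, int n)
{
    for (int i = 0; i < n; i++) {
        if (!v[i].running || v[i].monitor)
            continue;
        v[i].offchannel = false;
        v[i].queues_stopped = false;   /* netif_tx_wake_all_queues() */
    }
}

int main(void)
{
    struct vif vifs[] = { { "wlan0", true, false, false, false } };

    offchannel_stop(vifs, 1);
    offchannel_return(vifs, 1);
    printf("%s offchannel=%d\n", vifs[0].name, vifs[0].offchannel);
    return 0;
}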
+ */ + netif_tx_wake_all_queues(sdata->dev); + } + + /* Check to see if we should re-enable beaconing */ + if (enable_beaconing && + (sdata->vif.type == NL80211_IFTYPE_AP || + sdata->vif.type == NL80211_IFTYPE_ADHOC || + sdata->vif.type == NL80211_IFTYPE_MESH_POINT)) + ieee80211_bss_info_change_notify( + sdata, BSS_CHANGED_BEACON_ENABLED); + } + mutex_unlock(&local->iflist_mtx); +} + +static void ieee80211_hw_roc_start(struct work_struct *work) +{ + struct ieee80211_local *local = + container_of(work, struct ieee80211_local, hw_roc_start); + struct ieee80211_sub_if_data *sdata; + + mutex_lock(&local->mtx); + + if (!local->hw_roc_channel) { + mutex_unlock(&local->mtx); + return; + } + + ieee80211_recalc_idle(local); + + if (local->hw_roc_skb) { + sdata = IEEE80211_DEV_TO_SUB_IF(local->hw_roc_dev); + ieee80211_tx_skb(sdata, local->hw_roc_skb); + local->hw_roc_skb = NULL; + } else { + cfg80211_ready_on_channel(local->hw_roc_dev, + local->hw_roc_cookie, + local->hw_roc_channel, + local->hw_roc_channel_type, + local->hw_roc_duration, + GFP_KERNEL); + } + + mutex_unlock(&local->mtx); +} + +void ieee80211_ready_on_channel(struct ieee80211_hw *hw) +{ + struct ieee80211_local *local = hw_to_local(hw); + + trace_api_ready_on_channel(local); + + ieee80211_queue_work(hw, &local->hw_roc_start); +} +EXPORT_SYMBOL_GPL(ieee80211_ready_on_channel); + +static void ieee80211_hw_roc_done(struct work_struct *work) +{ + struct ieee80211_local *local = + container_of(work, struct ieee80211_local, hw_roc_done); + + mutex_lock(&local->mtx); + + if (!local->hw_roc_channel) { + mutex_unlock(&local->mtx); + return; + } + + /* was never transmitted */ + if (local->hw_roc_skb) { + u64 cookie; + + cookie = local->hw_roc_cookie ^ 2; + + cfg80211_mgmt_tx_status(local->hw_roc_dev, cookie, + local->hw_roc_skb->data, + local->hw_roc_skb->len, false, + GFP_KERNEL); + + kfree_skb(local->hw_roc_skb); + local->hw_roc_skb = NULL; + local->hw_roc_skb_for_status = NULL; + } + + if (!local->hw_roc_for_tx) + cfg80211_remain_on_channel_expired(local->hw_roc_dev, + local->hw_roc_cookie, + local->hw_roc_channel, + local->hw_roc_channel_type, + GFP_KERNEL); + + local->hw_roc_channel = NULL; + local->hw_roc_cookie = 0; + + ieee80211_recalc_idle(local); + + mutex_unlock(&local->mtx); +} + +void ieee80211_remain_on_channel_expired(struct ieee80211_hw *hw) +{ + struct ieee80211_local *local = hw_to_local(hw); + + trace_api_remain_on_channel_expired(local); + + ieee80211_queue_work(hw, &local->hw_roc_done); +} +EXPORT_SYMBOL_GPL(ieee80211_remain_on_channel_expired); + +void ieee80211_hw_roc_setup(struct ieee80211_local *local) +{ + INIT_WORK(&local->hw_roc_start, ieee80211_hw_roc_start); + INIT_WORK(&local->hw_roc_done, ieee80211_hw_roc_done); +} diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c new file mode 100644 index 00000000..730778a2 --- /dev/null +++ b/net/mac80211/pm.c @@ -0,0 +1,130 @@ +#include +#include + +#include "ieee80211_i.h" +#include "mesh.h" +#include "driver-ops.h" +#include "led.h" + +int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_sub_if_data *sdata; + struct sta_info *sta; + + ieee80211_scan_cancel(local); + + if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) { + mutex_lock(&local->sta_mtx); + list_for_each_entry(sta, &local->sta_list, list) { + set_sta_flags(sta, WLAN_STA_BLOCK_BA); + ieee80211_sta_tear_down_BA_sessions(sta, true); + } + mutex_unlock(&local->sta_mtx); + } + + ieee80211_stop_queues_by_reason(hw, + 
IEEE80211_QUEUE_STOP_REASON_SUSPEND); + + /* flush out all packets */ + synchronize_net(); + + drv_flush(local, false); + + local->quiescing = true; + /* make quiescing visible to timers everywhere */ + mb(); + + flush_workqueue(local->workqueue); + + /* Don't try to run timers while suspended. */ + del_timer_sync(&local->sta_cleanup); + + /* + * Note that this particular timer doesn't need to be + * restarted at resume. + */ + cancel_work_sync(&local->dynamic_ps_enable_work); + del_timer_sync(&local->dynamic_ps_timer); + + local->wowlan = wowlan && local->open_count; + if (local->wowlan) { + int err = drv_suspend(local, wowlan); + if (err) { + local->quiescing = false; + return err; + } + goto suspend; + } + + /* disable keys */ + list_for_each_entry(sdata, &local->interfaces, list) + ieee80211_disable_keys(sdata); + + /* tear down aggregation sessions and remove STAs */ + mutex_lock(&local->sta_mtx); + list_for_each_entry(sta, &local->sta_list, list) { + if (sta->uploaded) { + sdata = sta->sdata; + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + sdata = container_of(sdata->bss, + struct ieee80211_sub_if_data, + u.ap); + + drv_sta_remove(local, sdata, &sta->sta); + } + + mesh_plink_quiesce(sta); + } + mutex_unlock(&local->sta_mtx); + + /* remove all interfaces */ + list_for_each_entry(sdata, &local->interfaces, list) { + cancel_work_sync(&sdata->work); + + switch(sdata->vif.type) { + case NL80211_IFTYPE_STATION: + ieee80211_sta_quiesce(sdata); + break; + case NL80211_IFTYPE_ADHOC: + ieee80211_ibss_quiesce(sdata); + break; + case NL80211_IFTYPE_MESH_POINT: + ieee80211_mesh_quiesce(sdata); + break; + case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_MONITOR: + /* don't tell driver about this */ + continue; + default: + break; + } + + if (!ieee80211_sdata_running(sdata)) + continue; + + /* disable beaconing */ + ieee80211_bss_info_change_notify(sdata, + BSS_CHANGED_BEACON_ENABLED); + + drv_remove_interface(local, &sdata->vif); + } + + /* stop hardware - this must stop RX */ + if (local->open_count) + ieee80211_stop_device(local); + + suspend: + local->suspended = true; + /* need suspended to be visible before quiescing is false */ + barrier(); + local->quiescing = false; + + return 0; +} + +/* + * __ieee80211_resume() is a static inline which just calls + * ieee80211_reconfig(), which is also needed for hardware + * hang/firmware failure/etc. recovery. + */ diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c new file mode 100644 index 00000000..816590b0 --- /dev/null +++ b/net/mac80211/rate.c @@ -0,0 +1,415 @@ +/* + * Copyright 2002-2005, Instant802 Networks, Inc. + * Copyright 2005-2006, Devicescape Software, Inc. + * Copyright (c) 2006 Jiri Benc + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
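/*
 * An illustrative standalone model, not from this patch, of the registration
 * scheme used by the rate-control code that follows: algorithms register an
 * ops structure by name on a mutex-protected list, duplicates are refused,
 * and lookups walk the list comparing names.  A fixed array and a pthread
 * mutex stand in for the kernel list and rate_ctrl_mutex.
 */
#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct rc_ops {
    const char *name;
    /* the real ops also carry hooks such as get_rate() and tx_status() */
};

static const struct rc_ops *registry[8];
static int nr_algs;
static pthread_mutex_t reg_lock = PTHREAD_MUTEX_INITIALIZER;

static int rc_register(const struct rc_ops *ops)
{
    int i, err = 0;

    pthread_mutex_lock(&reg_lock);
    for (i = 0; i < nr_algs; i++)
        if (!strcmp(registry[i]->name, ops->name))
            err = -1;                  /* refuse duplicates (-EALREADY) */
    if (!err && nr_algs < 8)
        registry[nr_algs++] = ops;
    pthread_mutex_unlock(&reg_lock);
    return err;
}

static const struct rc_ops *rc_lookup(const char *name)
{
    const struct rc_ops *found = NULL;
    int i;

    pthread_mutex_lock(&reg_lock);
    for (i = 0; i < nr_algs; i++)
        if (!strcmp(registry[i]->name, name))
            found = registry[i];
    pthread_mutex_unlock(&reg_lock);
    return found;
}

int main(void)
{
    static const struct rc_ops minstrel = { .name = "minstrel" };

    rc_register(&minstrel);
    printf("lookup: %s\n", rc_lookup("minstrel") ? "found" : "missing");
    return 0;
}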
+ */ + +#include +#include +#include +#include "rate.h" +#include "ieee80211_i.h" +#include "debugfs.h" + +struct rate_control_alg { + struct list_head list; + struct rate_control_ops *ops; +}; + +static LIST_HEAD(rate_ctrl_algs); +static DEFINE_MUTEX(rate_ctrl_mutex); + +static char *ieee80211_default_rc_algo = CONFIG_MAC80211_RC_DEFAULT; +module_param(ieee80211_default_rc_algo, charp, 0644); +MODULE_PARM_DESC(ieee80211_default_rc_algo, + "Default rate control algorithm for mac80211 to use"); + +int ieee80211_rate_control_register(struct rate_control_ops *ops) +{ + struct rate_control_alg *alg; + + if (!ops->name) + return -EINVAL; + + mutex_lock(&rate_ctrl_mutex); + list_for_each_entry(alg, &rate_ctrl_algs, list) { + if (!strcmp(alg->ops->name, ops->name)) { + /* don't register an algorithm twice */ + WARN_ON(1); + mutex_unlock(&rate_ctrl_mutex); + return -EALREADY; + } + } + + alg = kzalloc(sizeof(*alg), GFP_KERNEL); + if (alg == NULL) { + mutex_unlock(&rate_ctrl_mutex); + return -ENOMEM; + } + alg->ops = ops; + + list_add_tail(&alg->list, &rate_ctrl_algs); + mutex_unlock(&rate_ctrl_mutex); + + return 0; +} +EXPORT_SYMBOL(ieee80211_rate_control_register); + +void ieee80211_rate_control_unregister(struct rate_control_ops *ops) +{ + struct rate_control_alg *alg; + + mutex_lock(&rate_ctrl_mutex); + list_for_each_entry(alg, &rate_ctrl_algs, list) { + if (alg->ops == ops) { + list_del(&alg->list); + kfree(alg); + break; + } + } + mutex_unlock(&rate_ctrl_mutex); +} +EXPORT_SYMBOL(ieee80211_rate_control_unregister); + +static struct rate_control_ops * +ieee80211_try_rate_control_ops_get(const char *name) +{ + struct rate_control_alg *alg; + struct rate_control_ops *ops = NULL; + + if (!name) + return NULL; + + mutex_lock(&rate_ctrl_mutex); + list_for_each_entry(alg, &rate_ctrl_algs, list) { + if (!strcmp(alg->ops->name, name)) + if (try_module_get(alg->ops->module)) { + ops = alg->ops; + break; + } + } + mutex_unlock(&rate_ctrl_mutex); + return ops; +} + +/* Get the rate control algorithm. 
*/ +static struct rate_control_ops * +ieee80211_rate_control_ops_get(const char *name) +{ + struct rate_control_ops *ops; + const char *alg_name; + + kparam_block_sysfs_write(ieee80211_default_rc_algo); + if (!name) + alg_name = ieee80211_default_rc_algo; + else + alg_name = name; + + ops = ieee80211_try_rate_control_ops_get(alg_name); + if (!ops) { + request_module("rc80211_%s", alg_name); + ops = ieee80211_try_rate_control_ops_get(alg_name); + } + if (!ops && name) + /* try default if specific alg requested but not found */ + ops = ieee80211_try_rate_control_ops_get(ieee80211_default_rc_algo); + + /* try built-in one if specific alg requested but not found */ + if (!ops && strlen(CONFIG_MAC80211_RC_DEFAULT)) + ops = ieee80211_try_rate_control_ops_get(CONFIG_MAC80211_RC_DEFAULT); + kparam_unblock_sysfs_write(ieee80211_default_rc_algo); + + return ops; +} + +static void ieee80211_rate_control_ops_put(struct rate_control_ops *ops) +{ + module_put(ops->module); +} + +#ifdef CONFIG_MAC80211_DEBUGFS +static ssize_t rcname_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct rate_control_ref *ref = file->private_data; + int len = strlen(ref->ops->name); + + return simple_read_from_buffer(userbuf, count, ppos, + ref->ops->name, len); +} + +static const struct file_operations rcname_ops = { + .read = rcname_read, + .open = mac80211_open_file_generic, + .llseek = default_llseek, +}; +#endif + +static struct rate_control_ref *rate_control_alloc(const char *name, + struct ieee80211_local *local) +{ + struct dentry *debugfsdir = NULL; + struct rate_control_ref *ref; + + ref = kmalloc(sizeof(struct rate_control_ref), GFP_KERNEL); + if (!ref) + goto fail_ref; + kref_init(&ref->kref); + ref->local = local; + ref->ops = ieee80211_rate_control_ops_get(name); + if (!ref->ops) + goto fail_ops; + +#ifdef CONFIG_MAC80211_DEBUGFS + debugfsdir = debugfs_create_dir("rc", local->hw.wiphy->debugfsdir); + local->debugfs.rcdir = debugfsdir; + debugfs_create_file("name", 0400, debugfsdir, ref, &rcname_ops); +#endif + + ref->priv = ref->ops->alloc(&local->hw, debugfsdir); + if (!ref->priv) + goto fail_priv; + return ref; + +fail_priv: + ieee80211_rate_control_ops_put(ref->ops); +fail_ops: + kfree(ref); +fail_ref: + return NULL; +} + +static void rate_control_release(struct kref *kref) +{ + struct rate_control_ref *ctrl_ref; + + ctrl_ref = container_of(kref, struct rate_control_ref, kref); + ctrl_ref->ops->free(ctrl_ref->priv); + +#ifdef CONFIG_MAC80211_DEBUGFS + debugfs_remove_recursive(ctrl_ref->local->debugfs.rcdir); + ctrl_ref->local->debugfs.rcdir = NULL; +#endif + + ieee80211_rate_control_ops_put(ctrl_ref->ops); + kfree(ctrl_ref); +} + +static bool rc_no_data_or_no_ack(struct ieee80211_tx_rate_control *txrc) +{ + struct sk_buff *skb = txrc->skb; + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + __le16 fc; + + fc = hdr->frame_control; + + return (info->flags & IEEE80211_TX_CTL_NO_ACK) || !ieee80211_is_data(fc); +} + +static void rc_send_low_broadcast(s8 *idx, u32 basic_rates, + struct ieee80211_supported_band *sband) +{ + u8 i; + + if (basic_rates == 0) + return; /* assume basic rates unknown and accept rate */ + if (*idx < 0) + return; + if (basic_rates & (1 << *idx)) + return; /* selected rate is a basic rate */ + + for (i = *idx + 1; i <= sband->n_bitrates; i++) { + if (basic_rates & (1 << i)) { + *idx = i; + return; + } + } + + /* could not find a basic rate; use original selection */ +} + +bool 
rate_control_send_low(struct ieee80211_sta *sta, + void *priv_sta, + struct ieee80211_tx_rate_control *txrc) +{ + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb); + struct ieee80211_supported_band *sband = txrc->sband; + int mcast_rate; + + if (!sta || !priv_sta || rc_no_data_or_no_ack(txrc)) { + info->control.rates[0].idx = rate_lowest_index(txrc->sband, sta); + info->control.rates[0].count = + (info->flags & IEEE80211_TX_CTL_NO_ACK) ? + 1 : txrc->hw->max_rate_tries; + if (!sta && txrc->bss) { + mcast_rate = txrc->bss_conf->mcast_rate[sband->band]; + if (mcast_rate > 0) { + info->control.rates[0].idx = mcast_rate - 1; + return true; + } + + rc_send_low_broadcast(&info->control.rates[0].idx, + txrc->bss_conf->basic_rates, + sband); + } + return true; + } + return false; +} +EXPORT_SYMBOL(rate_control_send_low); + +static void rate_idx_match_mask(struct ieee80211_tx_rate *rate, + int n_bitrates, u32 mask) +{ + int j; + + /* See whether the selected rate or anything below it is allowed. */ + for (j = rate->idx; j >= 0; j--) { + if (mask & (1 << j)) { + /* Okay, found a suitable rate. Use it. */ + rate->idx = j; + return; + } + } + + /* Try to find a higher rate that would be allowed */ + for (j = rate->idx + 1; j < n_bitrates; j++) { + if (mask & (1 << j)) { + /* Okay, found a suitable rate. Use it. */ + rate->idx = j; + return; + } + } + + /* + * Uh.. No suitable rate exists. This should not really happen with + * sane TX rate mask configurations. However, should someone manage to + * configure supported rates and TX rate mask in incompatible way, + * allow the frame to be transmitted with whatever the rate control + * selected. + */ +} + +void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct ieee80211_tx_rate_control *txrc) +{ + struct rate_control_ref *ref = sdata->local->rate_ctrl; + void *priv_sta = NULL; + struct ieee80211_sta *ista = NULL; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb); + int i; + u32 mask; + + if (sta) { + ista = &sta->sta; + priv_sta = sta->rate_ctrl_priv; + } + + for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { + info->control.rates[i].idx = -1; + info->control.rates[i].flags = 0; + info->control.rates[i].count = 0; + } + + if (sdata->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) + return; + + ref->ops->get_rate(ref->priv, ista, priv_sta, txrc); + + /* + * Try to enforce the rateidx mask the user wanted. skip this if the + * default mask (allow all rates) is used to save some processing for + * the common case. + */ + mask = sdata->rc_rateidx_mask[info->band]; + if (mask != (1 << txrc->sband->n_bitrates) - 1) { + if (sta) { + /* Filter out rates that the STA does not support */ + mask &= sta->sta.supp_rates[info->band]; + } + /* + * Make sure the rate index selected for each TX rate is + * included in the configured mask and change the rate indexes + * if needed. 
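/*
 * A standalone sketch, not from this patch, of rate_idx_match_mask() above:
 * prefer the highest allowed rate at or below the selected index, otherwise
 * take the lowest allowed rate above it, and leave the index untouched if the
 * mask allows nothing (mirroring the "should not happen" fallback).  The
 * helper name is local to this sketch.
 */
#include <stdint.h>
#include <stdio.h>

static int match_rate_mask(int idx, int n_bitrates, uint32_t mask)
{
    int j;

    for (j = idx; j >= 0; j--)             /* same rate or next lower */
        if (mask & (1u << j))
            return j;
    for (j = idx + 1; j < n_bitrates; j++) /* else next higher */
        if (mask & (1u << j))
            return j;
    return idx;                            /* nothing allowed: keep as-is */
}

int main(void)
{
    /* rate 3 is masked out, rates 1 and 5 are allowed -> picks 1 */
    printf("%d\n", match_rate_mask(3, 8, (1u << 1) | (1u << 5)));
    return 0;
}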
+ */ + for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { + /* Skip invalid rates */ + if (info->control.rates[i].idx < 0) + break; + /* Rate masking supports only legacy rates for now */ + if (info->control.rates[i].flags & IEEE80211_TX_RC_MCS) + continue; + rate_idx_match_mask(&info->control.rates[i], + txrc->sband->n_bitrates, mask); + } + } + + BUG_ON(info->control.rates[0].idx < 0); +} + +struct rate_control_ref *rate_control_get(struct rate_control_ref *ref) +{ + kref_get(&ref->kref); + return ref; +} + +void rate_control_put(struct rate_control_ref *ref) +{ + kref_put(&ref->kref, rate_control_release); +} + +int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local, + const char *name) +{ + struct rate_control_ref *ref, *old; + + ASSERT_RTNL(); + + if (local->open_count) + return -EBUSY; + + if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) { + if (WARN_ON(!local->ops->set_rts_threshold)) + return -EINVAL; + return 0; + } + + ref = rate_control_alloc(name, local); + if (!ref) { + wiphy_warn(local->hw.wiphy, + "Failed to select rate control algorithm\n"); + return -ENOENT; + } + + old = local->rate_ctrl; + local->rate_ctrl = ref; + if (old) { + rate_control_put(old); + sta_info_flush(local, NULL); + } + + wiphy_debug(local->hw.wiphy, "Selected rate control algorithm '%s'\n", + ref->ops->name); + + return 0; +} + +void rate_control_deinitialize(struct ieee80211_local *local) +{ + struct rate_control_ref *ref; + + ref = local->rate_ctrl; + + if (!ref) + return; + + local->rate_ctrl = NULL; + rate_control_put(ref); +} + diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h new file mode 100644 index 00000000..168427b0 --- /dev/null +++ b/net/mac80211/rate.h @@ -0,0 +1,164 @@ +/* + * Copyright 2002-2005, Instant802 Networks, Inc. + * Copyright 2005, Devicescape Software, Inc. + * Copyright (c) 2006 Jiri Benc + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef IEEE80211_RATE_H +#define IEEE80211_RATE_H + +#include +#include +#include +#include +#include +#include "ieee80211_i.h" +#include "sta_info.h" + +struct rate_control_ref { + struct ieee80211_local *local; + struct rate_control_ops *ops; + void *priv; + struct kref kref; +}; + +void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct ieee80211_tx_rate_control *txrc); +struct rate_control_ref *rate_control_get(struct rate_control_ref *ref); +void rate_control_put(struct rate_control_ref *ref); + +static inline void rate_control_tx_status(struct ieee80211_local *local, + struct ieee80211_supported_band *sband, + struct sta_info *sta, + struct sk_buff *skb) +{ + struct rate_control_ref *ref = local->rate_ctrl; + struct ieee80211_sta *ista = &sta->sta; + void *priv_sta = sta->rate_ctrl_priv; + + if (!ref) + return; + + ref->ops->tx_status(ref->priv, sband, ista, priv_sta, skb); +} + + +static inline void rate_control_rate_init(struct sta_info *sta) +{ + struct ieee80211_local *local = sta->sdata->local; + struct rate_control_ref *ref = sta->rate_ctrl; + struct ieee80211_sta *ista = &sta->sta; + void *priv_sta = sta->rate_ctrl_priv; + struct ieee80211_supported_band *sband; + + if (!ref) + return; + + sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; + + ref->ops->rate_init(ref->priv, sband, ista, priv_sta); +} + +static inline void rate_control_rate_update(struct ieee80211_local *local, + struct ieee80211_supported_band *sband, + struct sta_info *sta, u32 changed, + enum nl80211_channel_type oper_chan_type) +{ + struct rate_control_ref *ref = local->rate_ctrl; + struct ieee80211_sta *ista = &sta->sta; + void *priv_sta = sta->rate_ctrl_priv; + + if (ref && ref->ops->rate_update) + ref->ops->rate_update(ref->priv, sband, ista, + priv_sta, changed, oper_chan_type); +} + +static inline void *rate_control_alloc_sta(struct rate_control_ref *ref, + struct ieee80211_sta *sta, + gfp_t gfp) +{ + return ref->ops->alloc_sta(ref->priv, sta, gfp); +} + +static inline void rate_control_free_sta(struct sta_info *sta) +{ + struct rate_control_ref *ref = sta->rate_ctrl; + struct ieee80211_sta *ista = &sta->sta; + void *priv_sta = sta->rate_ctrl_priv; + + ref->ops->free_sta(ref->priv, ista, priv_sta); +} + +static inline void rate_control_add_sta_debugfs(struct sta_info *sta) +{ +#ifdef CONFIG_MAC80211_DEBUGFS + struct rate_control_ref *ref = sta->rate_ctrl; + if (ref && sta->debugfs.dir && ref->ops->add_sta_debugfs) + ref->ops->add_sta_debugfs(ref->priv, sta->rate_ctrl_priv, + sta->debugfs.dir); +#endif +} + +static inline void rate_control_remove_sta_debugfs(struct sta_info *sta) +{ +#ifdef CONFIG_MAC80211_DEBUGFS + struct rate_control_ref *ref = sta->rate_ctrl; + if (ref && ref->ops->remove_sta_debugfs) + ref->ops->remove_sta_debugfs(ref->priv, sta->rate_ctrl_priv); +#endif +} + +/* Get a reference to the rate control algorithm. If `name' is NULL, get the + * first available algorithm. 
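/*
 * A small standalone sketch, not from this patch, of the pattern used
 * throughout this header: the rate-control backend is reached through a
 * reference-counted wrapper holding an ops table of function pointers, and
 * the inline helpers either forward to an op or bail out when it is absent.
 * The types and names below are local to the sketch, not mac80211's.
 */
#include <stdio.h>

struct rc_ops {
    const char *name;
    int (*get_rate)(void *priv);     /* mandatory hook */
    void (*rate_update)(void *priv); /* optional hook, may be NULL */
};

struct rc_ref {
    const struct rc_ops *ops;
    void *priv;
    int refcount;                    /* stands in for struct kref */
};

static int ref_get_rate(struct rc_ref *ref)
{
    if (!ref)
        return -1;                   /* no rate control attached */
    return ref->ops->get_rate(ref->priv);
}

static void ref_rate_update(struct rc_ref *ref)
{
    if (ref && ref->ops->rate_update) /* optional op: check before calling */
        ref->ops->rate_update(ref->priv);
}

static int demo_get_rate(void *priv) { (void)priv; return 54; }

int main(void)
{
    static const struct rc_ops ops = { "demo", demo_get_rate, NULL };
    struct rc_ref ref = { &ops, NULL, 1 };

    ref_rate_update(&ref);           /* safely skipped: op is NULL */
    printf("rate: %d\n", ref_get_rate(&ref));
    return 0;
}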
*/ +int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local, + const char *name); +void rate_control_deinitialize(struct ieee80211_local *local); + + +/* Rate control algorithms */ +#ifdef CONFIG_MAC80211_RC_PID +extern int rc80211_pid_init(void); +extern void rc80211_pid_exit(void); +#else +static inline int rc80211_pid_init(void) +{ + return 0; +} +static inline void rc80211_pid_exit(void) +{ +} +#endif + +#ifdef CONFIG_MAC80211_RC_MINSTREL +extern int rc80211_minstrel_init(void); +extern void rc80211_minstrel_exit(void); +#else +static inline int rc80211_minstrel_init(void) +{ + return 0; +} +static inline void rc80211_minstrel_exit(void) +{ +} +#endif + +#ifdef CONFIG_MAC80211_RC_MINSTREL_HT +extern int rc80211_minstrel_ht_init(void); +extern void rc80211_minstrel_ht_exit(void); +#else +static inline int rc80211_minstrel_ht_init(void) +{ + return 0; +} +static inline void rc80211_minstrel_ht_exit(void) +{ +} +#endif + + +#endif /* IEEE80211_RATE_H */ diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c new file mode 100644 index 00000000..8adac673 --- /dev/null +++ b/net/mac80211/rc80211_minstrel.c @@ -0,0 +1,570 @@ +/* + * Copyright (C) 2008 Felix Fietkau + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on minstrel.c: + * Copyright (C) 2005-2007 Derek Smithies + * Sponsored by Indranet Technologies Ltd + * + * Based on sample.c: + * Copyright (c) 2005 John Bicket + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any + * redistribution must be conditioned upon including a substantially + * similar Disclaimer requirement for further binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + * of any contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGES. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "rate.h" +#include "rc80211_minstrel.h" + +#define SAMPLE_COLUMNS 10 +#define SAMPLE_TBL(_mi, _idx, _col) \ + _mi->sample_table[(_idx * SAMPLE_COLUMNS) + _col] + +/* convert mac80211 rate index to local array index */ +static inline int +rix_to_ndx(struct minstrel_sta_info *mi, int rix) +{ + int i = rix; + for (i = rix; i >= 0; i--) + if (mi->r[i].rix == rix) + break; + return i; +} + +static void +minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi) +{ + u32 max_tp = 0, index_max_tp = 0, index_max_tp2 = 0; + u32 max_prob = 0, index_max_prob = 0; + u32 usecs; + u32 p; + int i; + + mi->stats_update = jiffies; + for (i = 0; i < mi->n_rates; i++) { + struct minstrel_rate *mr = &mi->r[i]; + + usecs = mr->perfect_tx_time; + if (!usecs) + usecs = 1000000; + + /* To avoid rounding issues, probabilities scale from 0 (0%) + * to 18000 (100%) */ + if (mr->attempts) { + p = (mr->success * 18000) / mr->attempts; + mr->succ_hist += mr->success; + mr->att_hist += mr->attempts; + mr->cur_prob = p; + p = ((p * (100 - mp->ewma_level)) + (mr->probability * + mp->ewma_level)) / 100; + mr->probability = p; + mr->cur_tp = p * (1000000 / usecs); + } + + mr->last_success = mr->success; + mr->last_attempts = mr->attempts; + mr->success = 0; + mr->attempts = 0; + + /* Sample less often below the 10% chance of success. + * Sample less often above the 95% chance of success. */ + if ((mr->probability > 17100) || (mr->probability < 1800)) { + mr->adjusted_retry_count = mr->retry_count >> 1; + if (mr->adjusted_retry_count > 2) + mr->adjusted_retry_count = 2; + mr->sample_limit = 4; + } else { + mr->sample_limit = -1; + mr->adjusted_retry_count = mr->retry_count; + } + if (!mr->adjusted_retry_count) + mr->adjusted_retry_count = 2; + } + + for (i = 0; i < mi->n_rates; i++) { + struct minstrel_rate *mr = &mi->r[i]; + if (max_tp < mr->cur_tp) { + index_max_tp = i; + max_tp = mr->cur_tp; + } + if (max_prob < mr->probability) { + index_max_prob = i; + max_prob = mr->probability; + } + } + + max_tp = 0; + for (i = 0; i < mi->n_rates; i++) { + struct minstrel_rate *mr = &mi->r[i]; + + if (i == index_max_tp) + continue; + + if (max_tp < mr->cur_tp) { + index_max_tp2 = i; + max_tp = mr->cur_tp; + } + } + mi->max_tp_rate = index_max_tp; + mi->max_tp_rate2 = index_max_tp2; + mi->max_prob_rate = index_max_prob; +} + +static void +minstrel_tx_status(void *priv, struct ieee80211_supported_band *sband, + struct ieee80211_sta *sta, void *priv_sta, + struct sk_buff *skb) +{ + struct minstrel_sta_info *mi = priv_sta; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_tx_rate *ar = info->status.rates; + int i, ndx; + int success; + + success = !!(info->flags & IEEE80211_TX_STAT_ACK); + + for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { + if (ar[i].idx < 0) + break; + + ndx = rix_to_ndx(mi, ar[i].idx); + if (ndx < 0) + continue; + + mi->r[ndx].attempts += ar[i].count; + + if ((i != IEEE80211_TX_MAX_RATES - 1) && (ar[i + 1].idx < 0)) + mi->r[ndx].success += success; + } + + if ((info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) && (i >= 0)) + mi->sample_count++; + + if (mi->sample_deferred > 0) + mi->sample_deferred--; +} + + +static inline unsigned int +minstrel_get_retry_count(struct minstrel_rate *mr, + struct ieee80211_tx_info *info) +{ + unsigned int retry = mr->adjusted_retry_count; + + if (info->control.rates[0].flags & IEEE80211_TX_RC_USE_RTS_CTS) + retry = max(2U, min(mr->retry_count_rtscts, 
retry)); + else if (info->control.rates[0].flags & IEEE80211_TX_RC_USE_CTS_PROTECT) + retry = max(2U, min(mr->retry_count_cts, retry)); + return retry; +} + + +static int +minstrel_get_next_sample(struct minstrel_sta_info *mi) +{ + unsigned int sample_ndx; + sample_ndx = SAMPLE_TBL(mi, mi->sample_idx, mi->sample_column); + mi->sample_idx++; + if ((int) mi->sample_idx > (mi->n_rates - 2)) { + mi->sample_idx = 0; + mi->sample_column++; + if (mi->sample_column >= SAMPLE_COLUMNS) + mi->sample_column = 0; + } + return sample_ndx; +} + +static void +minstrel_get_rate(void *priv, struct ieee80211_sta *sta, + void *priv_sta, struct ieee80211_tx_rate_control *txrc) +{ + struct sk_buff *skb = txrc->skb; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct minstrel_sta_info *mi = priv_sta; + struct minstrel_priv *mp = priv; + struct ieee80211_tx_rate *ar = info->control.rates; + unsigned int ndx, sample_ndx = 0; + bool mrr; + bool sample_slower = false; + bool sample = false; + int i, delta; + int mrr_ndx[3]; + int sample_rate; + + if (rate_control_send_low(sta, priv_sta, txrc)) + return; + + mrr = mp->has_mrr && !txrc->rts && !txrc->bss_conf->use_cts_prot; + + if (time_after(jiffies, mi->stats_update + (mp->update_interval * + HZ) / 1000)) + minstrel_update_stats(mp, mi); + + ndx = mi->max_tp_rate; + + if (mrr) + sample_rate = mp->lookaround_rate_mrr; + else + sample_rate = mp->lookaround_rate; + + mi->packet_count++; + delta = (mi->packet_count * sample_rate / 100) - + (mi->sample_count + mi->sample_deferred / 2); + + /* delta > 0: sampling required */ + if ((delta > 0) && (mrr || !mi->prev_sample)) { + struct minstrel_rate *msr; + if (mi->packet_count >= 10000) { + mi->sample_deferred = 0; + mi->sample_count = 0; + mi->packet_count = 0; + } else if (delta > mi->n_rates * 2) { + /* With multi-rate retry, not every planned sample + * attempt actually gets used, due to the way the retry + * chain is set up - [max_tp,sample,prob,lowest] for + * sample_rate < max_tp. + * + * If there's too much sampling backlog and the link + * starts getting worse, minstrel would start bursting + * out lots of sampling frames, which would result + * in a large throughput loss. */ + mi->sample_count += (delta - mi->n_rates * 2); + } + + sample_ndx = minstrel_get_next_sample(mi); + msr = &mi->r[sample_ndx]; + sample = true; + sample_slower = mrr && (msr->perfect_tx_time > + mi->r[ndx].perfect_tx_time); + + if (!sample_slower) { + if (msr->sample_limit != 0) { + ndx = sample_ndx; + mi->sample_count++; + if (msr->sample_limit > 0) + msr->sample_limit--; + } else { + sample = false; + } + } else { + /* Only use IEEE80211_TX_CTL_RATE_CTRL_PROBE to mark + * packets that have the sampling rate deferred to the + * second MRR stage. Increase the sample counter only + * if the deferred sample rate was actually used. 
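+ * (A deferred sample counts only half towards the sampling budget
+ * computed above, because it is not known in advance whether the second
+ * MRR stage will be reached at all.)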
+ * Use the sample_deferred counter to make sure that + * the sampling is not done in large bursts */ + info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE; + mi->sample_deferred++; + } + } + mi->prev_sample = sample; + + /* If we're not using MRR and the sampling rate already + * has a probability of >95%, we shouldn't be attempting + * to use it, as this only wastes precious airtime */ + if (!mrr && sample && (mi->r[ndx].probability > 17100)) + ndx = mi->max_tp_rate; + + ar[0].idx = mi->r[ndx].rix; + ar[0].count = minstrel_get_retry_count(&mi->r[ndx], info); + + if (!mrr) { + if (!sample) + ar[0].count = mp->max_retry; + ar[1].idx = mi->lowest_rix; + ar[1].count = mp->max_retry; + return; + } + + /* MRR setup */ + if (sample) { + if (sample_slower) + mrr_ndx[0] = sample_ndx; + else + mrr_ndx[0] = mi->max_tp_rate; + } else { + mrr_ndx[0] = mi->max_tp_rate2; + } + mrr_ndx[1] = mi->max_prob_rate; + mrr_ndx[2] = 0; + for (i = 1; i < 4; i++) { + ar[i].idx = mi->r[mrr_ndx[i - 1]].rix; + ar[i].count = mi->r[mrr_ndx[i - 1]].adjusted_retry_count; + } +} + + +static void +calc_rate_durations(struct minstrel_sta_info *mi, struct ieee80211_local *local, + struct minstrel_rate *d, struct ieee80211_rate *rate) +{ + int erp = !!(rate->flags & IEEE80211_RATE_ERP_G); + + d->perfect_tx_time = ieee80211_frame_duration(local, 1200, + rate->bitrate, erp, 1); + d->ack_time = ieee80211_frame_duration(local, 10, + rate->bitrate, erp, 1); +} + +static void +init_sample_table(struct minstrel_sta_info *mi) +{ + unsigned int i, col, new_idx; + unsigned int n_srates = mi->n_rates - 1; + u8 rnd[8]; + + mi->sample_column = 0; + mi->sample_idx = 0; + memset(mi->sample_table, 0, SAMPLE_COLUMNS * mi->n_rates); + + for (col = 0; col < SAMPLE_COLUMNS; col++) { + for (i = 0; i < n_srates; i++) { + get_random_bytes(rnd, sizeof(rnd)); + new_idx = (i + rnd[i & 7]) % n_srates; + + while (SAMPLE_TBL(mi, new_idx, col) != 0) + new_idx = (new_idx + 1) % n_srates; + + /* Don't sample the slowest rate (i.e. slowest base + * rate). 
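+ * (Each column of the table built here ends up holding a random
+ * permutation of the indexes 1..n_rates-1, so every non-base rate is
+ * sampled exactly once per column.)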
We must presume that the slowest rate works + * fine, or else other management frames will also be + * failing and the link will break */ + SAMPLE_TBL(mi, new_idx, col) = i + 1; + } + } +} + +static void +minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband, + struct ieee80211_sta *sta, void *priv_sta) +{ + struct minstrel_sta_info *mi = priv_sta; + struct minstrel_priv *mp = priv; + struct ieee80211_local *local = hw_to_local(mp->hw); + struct ieee80211_rate *ctl_rate; + unsigned int i, n = 0; + unsigned int t_slot = 9; /* FIXME: get real slot time */ + + mi->lowest_rix = rate_lowest_index(sband, sta); + ctl_rate = &sband->bitrates[mi->lowest_rix]; + mi->sp_ack_dur = ieee80211_frame_duration(local, 10, ctl_rate->bitrate, + !!(ctl_rate->flags & IEEE80211_RATE_ERP_G), 1); + + for (i = 0; i < sband->n_bitrates; i++) { + struct minstrel_rate *mr = &mi->r[n]; + unsigned int tx_time = 0, tx_time_cts = 0, tx_time_rtscts = 0; + unsigned int tx_time_single; + unsigned int cw = mp->cw_min; + + if (!rate_supported(sta, sband->band, i)) + continue; + n++; + memset(mr, 0, sizeof(*mr)); + + mr->rix = i; + mr->bitrate = sband->bitrates[i].bitrate / 5; + calc_rate_durations(mi, local, mr, + &sband->bitrates[i]); + + /* calculate maximum number of retransmissions before + * fallback (based on maximum segment size) */ + mr->sample_limit = -1; + mr->retry_count = 1; + mr->retry_count_cts = 1; + mr->retry_count_rtscts = 1; + tx_time = mr->perfect_tx_time + mi->sp_ack_dur; + do { + /* add one retransmission */ + tx_time_single = mr->ack_time + mr->perfect_tx_time; + + /* contention window */ + tx_time_single += (t_slot * cw) >> 1; + cw = min((cw << 1) | 1, mp->cw_max); + + tx_time += tx_time_single; + tx_time_cts += tx_time_single + mi->sp_ack_dur; + tx_time_rtscts += tx_time_single + 2 * mi->sp_ack_dur; + if ((tx_time_cts < mp->segment_size) && + (mr->retry_count_cts < mp->max_retry)) + mr->retry_count_cts++; + if ((tx_time_rtscts < mp->segment_size) && + (mr->retry_count_rtscts < mp->max_retry)) + mr->retry_count_rtscts++; + } while ((tx_time < mp->segment_size) && + (++mr->retry_count < mp->max_retry)); + mr->adjusted_retry_count = mr->retry_count; + } + + for (i = n; i < sband->n_bitrates; i++) { + struct minstrel_rate *mr = &mi->r[i]; + mr->rix = -1; + } + + mi->n_rates = n; + mi->stats_update = jiffies; + + init_sample_table(mi); +} + +static void * +minstrel_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp) +{ + struct ieee80211_supported_band *sband; + struct minstrel_sta_info *mi; + struct minstrel_priv *mp = priv; + struct ieee80211_hw *hw = mp->hw; + int max_rates = 0; + int i; + + mi = kzalloc(sizeof(struct minstrel_sta_info), gfp); + if (!mi) + return NULL; + + for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + sband = hw->wiphy->bands[i]; + if (sband && sband->n_bitrates > max_rates) + max_rates = sband->n_bitrates; + } + + mi->r = kzalloc(sizeof(struct minstrel_rate) * max_rates, gfp); + if (!mi->r) + goto error; + + mi->sample_table = kmalloc(SAMPLE_COLUMNS * max_rates, gfp); + if (!mi->sample_table) + goto error1; + + mi->stats_update = jiffies; + return mi; + +error1: + kfree(mi->r); +error: + kfree(mi); + return NULL; +} + +static void +minstrel_free_sta(void *priv, struct ieee80211_sta *sta, void *priv_sta) +{ + struct minstrel_sta_info *mi = priv_sta; + + kfree(mi->sample_table); + kfree(mi->r); + kfree(mi); +} + +static void * +minstrel_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir) +{ + struct minstrel_priv *mp; + + mp = kzalloc(sizeof(struct 
minstrel_priv), GFP_ATOMIC); + if (!mp) + return NULL; + + /* contention window settings + * Just an approximation. Using the per-queue values would complicate + * the calculations and is probably unnecessary */ + mp->cw_min = 15; + mp->cw_max = 1023; + + /* number of packets (in %) to use for sampling other rates + * sample less often for non-mrr packets, because the overhead + * is much higher than with mrr */ + mp->lookaround_rate = 5; + mp->lookaround_rate_mrr = 10; + + /* moving average weight for EWMA */ + mp->ewma_level = 75; + + /* maximum time that the hw is allowed to stay in one MRR segment */ + mp->segment_size = 6000; + + if (hw->max_rate_tries > 0) + mp->max_retry = hw->max_rate_tries; + else + /* safe default, does not necessarily have to match hw properties */ + mp->max_retry = 7; + + if (hw->max_rates >= 4) + mp->has_mrr = true; + + mp->hw = hw; + mp->update_interval = 100; + + return mp; +} + +static void +minstrel_free(void *priv) +{ + kfree(priv); +} + +struct rate_control_ops mac80211_minstrel = { + .name = "minstrel", + .tx_status = minstrel_tx_status, + .get_rate = minstrel_get_rate, + .rate_init = minstrel_rate_init, + .alloc = minstrel_alloc, + .free = minstrel_free, + .alloc_sta = minstrel_alloc_sta, + .free_sta = minstrel_free_sta, +#ifdef CONFIG_MAC80211_DEBUGFS + .add_sta_debugfs = minstrel_add_sta_debugfs, + .remove_sta_debugfs = minstrel_remove_sta_debugfs, +#endif +}; + +int __init +rc80211_minstrel_init(void) +{ + return ieee80211_rate_control_register(&mac80211_minstrel); +} + +void +rc80211_minstrel_exit(void) +{ + ieee80211_rate_control_unregister(&mac80211_minstrel); +} + diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h new file mode 100644 index 00000000..0f5a8337 --- /dev/null +++ b/net/mac80211/rc80211_minstrel.h @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2008 Felix Fietkau + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
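Another aside on the code above, not part of the patch: the do/while loop in minstrel_rate_init() sizes the per-rate retry count by repeatedly adding one retransmission (data frame, ACK and the average contention backoff, with the contention window doubling each round) until the 6 ms segment budget or the retry cap is hit. A standalone rendering of that loop with invented timing constants:

#include <stdio.h>

int main(void)
{
        unsigned int perfect_tx_time = 1400;  /* usecs for the data frame (invented) */
        unsigned int ack_time = 300;          /* usecs for the ACK at this rate      */
        unsigned int sp_ack_dur = 300;        /* usecs for an ACK at the base rate   */
        unsigned int t_slot = 9, cw = 15, cw_max = 1023;
        unsigned int segment_size = 6000, max_retry = 7;
        unsigned int tx_time = perfect_tx_time + sp_ack_dur;
        unsigned int retry_count = 1;

        do {
                /* one more retransmission: frame + ACK + average backoff */
                unsigned int tx_time_single = ack_time + perfect_tx_time;

                tx_time_single += (t_slot * cw) >> 1;
                cw = (cw << 1) | 1;
                if (cw > cw_max)
                        cw = cw_max;
                tx_time += tx_time_single;
        } while (tx_time < segment_size && ++retry_count < max_retry);

        printf("retry_count = %u (cumulative tx_time = %u us)\n",
               retry_count, tx_time);
        return 0;
}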
+ */ + +#ifndef __RC_MINSTREL_H +#define __RC_MINSTREL_H + +struct minstrel_rate { + int bitrate; + int rix; + + unsigned int perfect_tx_time; + unsigned int ack_time; + + int sample_limit; + unsigned int retry_count; + unsigned int retry_count_cts; + unsigned int retry_count_rtscts; + unsigned int adjusted_retry_count; + + u32 success; + u32 attempts; + u32 last_attempts; + u32 last_success; + + /* parts per thousand */ + u32 cur_prob; + u32 probability; + + /* per-rate throughput */ + u32 cur_tp; + + u64 succ_hist; + u64 att_hist; +}; + +struct minstrel_sta_info { + unsigned long stats_update; + unsigned int sp_ack_dur; + unsigned int rate_avg; + + unsigned int lowest_rix; + + unsigned int max_tp_rate; + unsigned int max_tp_rate2; + unsigned int max_prob_rate; + unsigned int packet_count; + unsigned int sample_count; + int sample_deferred; + + unsigned int sample_idx; + unsigned int sample_column; + + int n_rates; + struct minstrel_rate *r; + bool prev_sample; + + /* sampling table */ + u8 *sample_table; + +#ifdef CONFIG_MAC80211_DEBUGFS + struct dentry *dbg_stats; +#endif +}; + +struct minstrel_priv { + struct ieee80211_hw *hw; + bool has_mrr; + unsigned int cw_min; + unsigned int cw_max; + unsigned int max_retry; + unsigned int ewma_level; + unsigned int segment_size; + unsigned int update_interval; + unsigned int lookaround_rate; + unsigned int lookaround_rate_mrr; +}; + +struct minstrel_debugfs_info { + size_t len; + char buf[]; +}; + +extern struct rate_control_ops mac80211_minstrel; +void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir); +void minstrel_remove_sta_debugfs(void *priv, void *priv_sta); + +/* debugfs */ +int minstrel_stats_open(struct inode *inode, struct file *file); +ssize_t minstrel_stats_read(struct file *file, char __user *buf, size_t len, loff_t *ppos); +int minstrel_stats_release(struct inode *inode, struct file *file); + +#endif diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c new file mode 100644 index 00000000..a290ad23 --- /dev/null +++ b/net/mac80211/rc80211_minstrel_debugfs.c @@ -0,0 +1,143 @@ +/* + * Copyright (C) 2008 Felix Fietkau + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on minstrel.c: + * Copyright (C) 2005-2007 Derek Smithies + * Sponsored by Indranet Technologies Ltd + * + * Based on sample.c: + * Copyright (c) 2005 John Bicket + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any + * redistribution must be conditioned upon including a substantially + * similar Disclaimer requirement for further binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + * of any contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGES. + */ +#include +#include +#include +#include +#include +#include +#include +#include "rc80211_minstrel.h" + +int +minstrel_stats_open(struct inode *inode, struct file *file) +{ + struct minstrel_sta_info *mi = inode->i_private; + struct minstrel_debugfs_info *ms; + unsigned int i, tp, prob, eprob; + char *p; + + ms = kmalloc(sizeof(*ms) + 4096, GFP_KERNEL); + if (!ms) + return -ENOMEM; + + file->private_data = ms; + p = ms->buf; + p += sprintf(p, "rate throughput ewma prob this prob " + "this succ/attempt success attempts\n"); + for (i = 0; i < mi->n_rates; i++) { + struct minstrel_rate *mr = &mi->r[i]; + + *(p++) = (i == mi->max_tp_rate) ? 'T' : ' '; + *(p++) = (i == mi->max_tp_rate2) ? 't' : ' '; + *(p++) = (i == mi->max_prob_rate) ? 'P' : ' '; + p += sprintf(p, "%3u%s", mr->bitrate / 2, + (mr->bitrate & 1 ? 
".5" : " ")); + + tp = mr->cur_tp / ((18000 << 10) / 96); + prob = mr->cur_prob / 18; + eprob = mr->probability / 18; + + p += sprintf(p, " %6u.%1u %6u.%1u %6u.%1u " + "%3u(%3u) %8llu %8llu\n", + tp / 10, tp % 10, + eprob / 10, eprob % 10, + prob / 10, prob % 10, + mr->last_success, + mr->last_attempts, + (unsigned long long)mr->succ_hist, + (unsigned long long)mr->att_hist); + } + p += sprintf(p, "\nTotal packet count:: ideal %d " + "lookaround %d\n\n", + mi->packet_count - mi->sample_count, + mi->sample_count); + ms->len = p - ms->buf; + + return 0; +} + +ssize_t +minstrel_stats_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) +{ + struct minstrel_debugfs_info *ms; + + ms = file->private_data; + return simple_read_from_buffer(buf, len, ppos, ms->buf, ms->len); +} + +int +minstrel_stats_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static const struct file_operations minstrel_stat_fops = { + .owner = THIS_MODULE, + .open = minstrel_stats_open, + .read = minstrel_stats_read, + .release = minstrel_stats_release, + .llseek = default_llseek, +}; + +void +minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir) +{ + struct minstrel_sta_info *mi = priv_sta; + + mi->dbg_stats = debugfs_create_file("rc_stats", S_IRUGO, dir, mi, + &minstrel_stat_fops); +} + +void +minstrel_remove_sta_debugfs(void *priv, void *priv_sta) +{ + struct minstrel_sta_info *mi = priv_sta; + + debugfs_remove(mi->dbg_stats); +} diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c new file mode 100644 index 00000000..333b5118 --- /dev/null +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -0,0 +1,881 @@ +/* + * Copyright (C) 2010 Felix Fietkau + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include "rate.h" +#include "rc80211_minstrel.h" +#include "rc80211_minstrel_ht.h" + +#define AVG_PKT_SIZE 1200 +#define SAMPLE_COLUMNS 10 +#define EWMA_LEVEL 75 + +/* Number of bits for an average sized packet */ +#define MCS_NBITS (AVG_PKT_SIZE << 3) + +/* Number of symbols for a packet with (bps) bits per symbol */ +#define MCS_NSYMS(bps) ((MCS_NBITS + (bps) - 1) / (bps)) + +/* Transmission time for a packet containing (syms) symbols */ +#define MCS_SYMBOL_TIME(sgi, syms) \ + (sgi ? \ + ((syms) * 18 + 4) / 5 : /* syms * 3.6 us */ \ + (syms) << 2 /* syms * 4 us */ \ + ) + +/* Transmit duration for the raw data part of an average sized packet */ +#define MCS_DURATION(streams, sgi, bps) MCS_SYMBOL_TIME(sgi, MCS_NSYMS((streams) * (bps))) + +/* MCS rate information for an MCS group */ +#define MCS_GROUP(_streams, _sgi, _ht40) { \ + .streams = _streams, \ + .flags = \ + (_sgi ? IEEE80211_TX_RC_SHORT_GI : 0) | \ + (_ht40 ? IEEE80211_TX_RC_40_MHZ_WIDTH : 0), \ + .duration = { \ + MCS_DURATION(_streams, _sgi, _ht40 ? 54 : 26), \ + MCS_DURATION(_streams, _sgi, _ht40 ? 108 : 52), \ + MCS_DURATION(_streams, _sgi, _ht40 ? 162 : 78), \ + MCS_DURATION(_streams, _sgi, _ht40 ? 216 : 104), \ + MCS_DURATION(_streams, _sgi, _ht40 ? 324 : 156), \ + MCS_DURATION(_streams, _sgi, _ht40 ? 432 : 208), \ + MCS_DURATION(_streams, _sgi, _ht40 ? 486 : 234), \ + MCS_DURATION(_streams, _sgi, _ht40 ? 
540 : 260) \ + } \ +} + +/* + * To enable sufficiently targeted rate sampling, MCS rates are divided into + * groups, based on the number of streams and flags (HT40, SGI) that they + * use. + */ +const struct mcs_group minstrel_mcs_groups[] = { + MCS_GROUP(1, 0, 0), + MCS_GROUP(2, 0, 0), +#if MINSTREL_MAX_STREAMS >= 3 + MCS_GROUP(3, 0, 0), +#endif + + MCS_GROUP(1, 1, 0), + MCS_GROUP(2, 1, 0), +#if MINSTREL_MAX_STREAMS >= 3 + MCS_GROUP(3, 1, 0), +#endif + + MCS_GROUP(1, 0, 1), + MCS_GROUP(2, 0, 1), +#if MINSTREL_MAX_STREAMS >= 3 + MCS_GROUP(3, 0, 1), +#endif + + MCS_GROUP(1, 1, 1), + MCS_GROUP(2, 1, 1), +#if MINSTREL_MAX_STREAMS >= 3 + MCS_GROUP(3, 1, 1), +#endif +}; + +static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES]; + +/* + * Perform EWMA (Exponentially Weighted Moving Average) calculation + */ +static int +minstrel_ewma(int old, int new, int weight) +{ + return (new * (100 - weight) + old * weight) / 100; +} + +/* + * Look up an MCS group index based on mac80211 rate information + */ +static int +minstrel_ht_get_group_idx(struct ieee80211_tx_rate *rate) +{ + int streams = (rate->idx / MCS_GROUP_RATES) + 1; + u32 flags = IEEE80211_TX_RC_SHORT_GI | IEEE80211_TX_RC_40_MHZ_WIDTH; + int i; + + for (i = 0; i < ARRAY_SIZE(minstrel_mcs_groups); i++) { + if (minstrel_mcs_groups[i].streams != streams) + continue; + if (minstrel_mcs_groups[i].flags != (rate->flags & flags)) + continue; + + return i; + } + + WARN_ON(1); + return 0; +} + +static inline struct minstrel_rate_stats * +minstrel_get_ratestats(struct minstrel_ht_sta *mi, int index) +{ + return &mi->groups[index / MCS_GROUP_RATES].rates[index % MCS_GROUP_RATES]; +} + + +/* + * Recalculate success probabilities and counters for a rate using EWMA + */ +static void +minstrel_calc_rate_ewma(struct minstrel_priv *mp, struct minstrel_rate_stats *mr) +{ + if (unlikely(mr->attempts > 0)) { + mr->sample_skipped = 0; + mr->cur_prob = MINSTREL_FRAC(mr->success, mr->attempts); + if (!mr->att_hist) + mr->probability = mr->cur_prob; + else + mr->probability = minstrel_ewma(mr->probability, + mr->cur_prob, EWMA_LEVEL); + mr->att_hist += mr->attempts; + mr->succ_hist += mr->success; + } else { + mr->sample_skipped++; + } + mr->last_success = mr->success; + mr->last_attempts = mr->attempts; + mr->success = 0; + mr->attempts = 0; +} + +/* + * Calculate throughput based on the average A-MPDU length, taking into account + * the expected number of retransmissions and their expected length + */ +static void +minstrel_ht_calc_tp(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, + int group, int rate) +{ + struct minstrel_rate_stats *mr; + unsigned int usecs; + + mr = &mi->groups[group].rates[rate]; + + if (mr->probability < MINSTREL_FRAC(1, 10)) { + mr->cur_tp = 0; + return; + } + + usecs = mi->overhead / MINSTREL_TRUNC(mi->avg_ampdu_len); + usecs += minstrel_mcs_groups[group].duration[rate]; + mr->cur_tp = MINSTREL_TRUNC((1000000 / usecs) * mr->probability); +} + +/* + * Update rate statistics and select new primary rates + * + * Rules for rate selection: + * - max_prob_rate must use only one stream, as a tradeoff between delivery + * probability and throughput during strong fluctuations + * - as long as the max prob rate has a probability of more than 3/4, pick + * higher throughput rates, even if the probablity is a bit lower + */ +static void +minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) +{ + struct minstrel_mcs_group_data *mg; + struct minstrel_rate_stats *mr; + int cur_prob, cur_prob_tp, cur_tp, cur_tp2; + int 
group, i, index; + + if (mi->ampdu_packets > 0) { + mi->avg_ampdu_len = minstrel_ewma(mi->avg_ampdu_len, + MINSTREL_FRAC(mi->ampdu_len, mi->ampdu_packets), EWMA_LEVEL); + mi->ampdu_len = 0; + mi->ampdu_packets = 0; + } + + mi->sample_slow = 0; + mi->sample_count = 0; + mi->max_tp_rate = 0; + mi->max_tp_rate2 = 0; + mi->max_prob_rate = 0; + + for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { + cur_prob = 0; + cur_prob_tp = 0; + cur_tp = 0; + cur_tp2 = 0; + + mg = &mi->groups[group]; + if (!mg->supported) + continue; + + mg->max_tp_rate = 0; + mg->max_tp_rate2 = 0; + mg->max_prob_rate = 0; + mi->sample_count++; + + for (i = 0; i < MCS_GROUP_RATES; i++) { + if (!(mg->supported & BIT(i))) + continue; + + mr = &mg->rates[i]; + mr->retry_updated = false; + index = MCS_GROUP_RATES * group + i; + minstrel_calc_rate_ewma(mp, mr); + minstrel_ht_calc_tp(mp, mi, group, i); + + if (!mr->cur_tp) + continue; + + /* ignore the lowest rate of each single-stream group */ + if (!i && minstrel_mcs_groups[group].streams == 1) + continue; + + if ((mr->cur_tp > cur_prob_tp && mr->probability > + MINSTREL_FRAC(3, 4)) || mr->probability > cur_prob) { + mg->max_prob_rate = index; + cur_prob = mr->probability; + cur_prob_tp = mr->cur_tp; + } + + if (mr->cur_tp > cur_tp) { + swap(index, mg->max_tp_rate); + cur_tp = mr->cur_tp; + mr = minstrel_get_ratestats(mi, index); + } + + if (index >= mg->max_tp_rate) + continue; + + if (mr->cur_tp > cur_tp2) { + mg->max_tp_rate2 = index; + cur_tp2 = mr->cur_tp; + } + } + } + + /* try to sample up to half of the available rates during each interval */ + mi->sample_count *= 4; + + cur_prob = 0; + cur_prob_tp = 0; + cur_tp = 0; + cur_tp2 = 0; + for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { + mg = &mi->groups[group]; + if (!mg->supported) + continue; + + mr = minstrel_get_ratestats(mi, mg->max_prob_rate); + if (cur_prob_tp < mr->cur_tp && + minstrel_mcs_groups[group].streams == 1) { + mi->max_prob_rate = mg->max_prob_rate; + cur_prob = mr->cur_prob; + cur_prob_tp = mr->cur_tp; + } + + mr = minstrel_get_ratestats(mi, mg->max_tp_rate); + if (cur_tp < mr->cur_tp) { + mi->max_tp_rate = mg->max_tp_rate; + cur_tp = mr->cur_tp; + } + + mr = minstrel_get_ratestats(mi, mg->max_tp_rate2); + if (cur_tp2 < mr->cur_tp) { + mi->max_tp_rate2 = mg->max_tp_rate2; + cur_tp2 = mr->cur_tp; + } + } + + mi->stats_update = jiffies; +} + +static bool +minstrel_ht_txstat_valid(struct ieee80211_tx_rate *rate) +{ + if (!rate->count) + return false; + + if (rate->idx < 0) + return false; + + return !!(rate->flags & IEEE80211_TX_RC_MCS); +} + +static void +minstrel_next_sample_idx(struct minstrel_ht_sta *mi) +{ + struct minstrel_mcs_group_data *mg; + + for (;;) { + mi->sample_group++; + mi->sample_group %= ARRAY_SIZE(minstrel_mcs_groups); + mg = &mi->groups[mi->sample_group]; + + if (!mg->supported) + continue; + + if (++mg->index >= MCS_GROUP_RATES) { + mg->index = 0; + if (++mg->column >= ARRAY_SIZE(sample_table)) + mg->column = 0; + } + break; + } +} + +static void +minstrel_downgrade_rate(struct minstrel_ht_sta *mi, unsigned int *idx, + bool primary) +{ + int group, orig_group; + + orig_group = group = *idx / MCS_GROUP_RATES; + while (group > 0) { + group--; + + if (!mi->groups[group].supported) + continue; + + if (minstrel_mcs_groups[group].streams > + minstrel_mcs_groups[orig_group].streams) + continue; + + if (primary) + *idx = mi->groups[group].max_tp_rate; + else + *idx = mi->groups[group].max_tp_rate2; + break; + } +} + +static void +minstrel_aggr_check(struct 
minstrel_priv *mp, struct ieee80211_sta *pubsta, struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct sta_info *sta = container_of(pubsta, struct sta_info, sta); + u16 tid; + + if (unlikely(!ieee80211_is_data_qos(hdr->frame_control))) + return; + + if (unlikely(skb->protocol == cpu_to_be16(ETH_P_PAE))) + return; + + tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK; + if (likely(sta->ampdu_mlme.tid_tx[tid])) + return; + + if (skb_get_queue_mapping(skb) == IEEE80211_AC_VO) + return; + + ieee80211_start_tx_ba_session(pubsta, tid, 5000); +} + +static void +minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, + struct ieee80211_sta *sta, void *priv_sta, + struct sk_buff *skb) +{ + struct minstrel_ht_sta_priv *msp = priv_sta; + struct minstrel_ht_sta *mi = &msp->ht; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_tx_rate *ar = info->status.rates; + struct minstrel_rate_stats *rate, *rate2; + struct minstrel_priv *mp = priv; + bool last = false; + int group; + int i = 0; + + if (!msp->is_ht) + return mac80211_minstrel.tx_status(priv, sband, sta, &msp->legacy, skb); + + /* This packet was aggregated but doesn't carry status info */ + if ((info->flags & IEEE80211_TX_CTL_AMPDU) && + !(info->flags & IEEE80211_TX_STAT_AMPDU)) + return; + + if (!(info->flags & IEEE80211_TX_STAT_AMPDU)) { + info->status.ampdu_ack_len = + (info->flags & IEEE80211_TX_STAT_ACK ? 1 : 0); + info->status.ampdu_len = 1; + } + + mi->ampdu_packets++; + mi->ampdu_len += info->status.ampdu_len; + + if (!mi->sample_wait && !mi->sample_tries && mi->sample_count > 0) { + mi->sample_wait = 16 + 2 * MINSTREL_TRUNC(mi->avg_ampdu_len); + mi->sample_tries = 2; + mi->sample_count--; + } + + if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) + mi->sample_packets += info->status.ampdu_len; + + for (i = 0; !last; i++) { + last = (i == IEEE80211_TX_MAX_RATES - 1) || + !minstrel_ht_txstat_valid(&ar[i + 1]); + + if (!minstrel_ht_txstat_valid(&ar[i])) + break; + + group = minstrel_ht_get_group_idx(&ar[i]); + rate = &mi->groups[group].rates[ar[i].idx % 8]; + + if (last) + rate->success += info->status.ampdu_ack_len; + + rate->attempts += ar[i].count * info->status.ampdu_len; + } + + /* + * check for sudden death of spatial multiplexing, + * downgrade to a lower number of streams if necessary. 
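+ * (Concretely: once a rate has more than 30 attempts on record and its
+ * delivered fraction drops below 20%, fall back to the best rate of a
+ * lower group that does not use more spatial streams.)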
+ */ + rate = minstrel_get_ratestats(mi, mi->max_tp_rate); + if (rate->attempts > 30 && + MINSTREL_FRAC(rate->success, rate->attempts) < + MINSTREL_FRAC(20, 100)) + minstrel_downgrade_rate(mi, &mi->max_tp_rate, true); + + rate2 = minstrel_get_ratestats(mi, mi->max_tp_rate2); + if (rate2->attempts > 30 && + MINSTREL_FRAC(rate2->success, rate2->attempts) < + MINSTREL_FRAC(20, 100)) + minstrel_downgrade_rate(mi, &mi->max_tp_rate2, false); + + if (time_after(jiffies, mi->stats_update + (mp->update_interval / 2 * HZ) / 1000)) { + minstrel_ht_update_stats(mp, mi); + minstrel_aggr_check(mp, sta, skb); + } +} + +static void +minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, + int index) +{ + struct minstrel_rate_stats *mr; + const struct mcs_group *group; + unsigned int tx_time, tx_time_rtscts, tx_time_data; + unsigned int cw = mp->cw_min; + unsigned int ctime = 0; + unsigned int t_slot = 9; /* FIXME */ + unsigned int ampdu_len = MINSTREL_TRUNC(mi->avg_ampdu_len); + + mr = minstrel_get_ratestats(mi, index); + if (mr->probability < MINSTREL_FRAC(1, 10)) { + mr->retry_count = 1; + mr->retry_count_rtscts = 1; + return; + } + + mr->retry_count = 2; + mr->retry_count_rtscts = 2; + mr->retry_updated = true; + + group = &minstrel_mcs_groups[index / MCS_GROUP_RATES]; + tx_time_data = group->duration[index % MCS_GROUP_RATES] * ampdu_len; + + /* Contention time for first 2 tries */ + ctime = (t_slot * cw) >> 1; + cw = min((cw << 1) | 1, mp->cw_max); + ctime += (t_slot * cw) >> 1; + cw = min((cw << 1) | 1, mp->cw_max); + + /* Total TX time for data and Contention after first 2 tries */ + tx_time = ctime + 2 * (mi->overhead + tx_time_data); + tx_time_rtscts = ctime + 2 * (mi->overhead_rtscts + tx_time_data); + + /* See how many more tries we can fit inside segment size */ + do { + /* Contention time for this try */ + ctime = (t_slot * cw) >> 1; + cw = min((cw << 1) | 1, mp->cw_max); + + /* Total TX time after this try */ + tx_time += ctime + mi->overhead + tx_time_data; + tx_time_rtscts += ctime + mi->overhead_rtscts + tx_time_data; + + if (tx_time_rtscts < mp->segment_size) + mr->retry_count_rtscts++; + } while ((tx_time < mp->segment_size) && + (++mr->retry_count < mp->max_retry)); +} + + +static void +minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, + struct ieee80211_tx_rate *rate, int index, + struct ieee80211_tx_rate_control *txrc, + bool sample, bool rtscts) +{ + const struct mcs_group *group = &minstrel_mcs_groups[index / MCS_GROUP_RATES]; + struct minstrel_rate_stats *mr; + + mr = minstrel_get_ratestats(mi, index); + if (!mr->retry_updated) + minstrel_calc_retransmit(mp, mi, index); + + if (sample) + rate->count = 1; + else if (mr->probability < MINSTREL_FRAC(20, 100)) + rate->count = 2; + else if (rtscts) + rate->count = mr->retry_count_rtscts; + else + rate->count = mr->retry_count; + + rate->flags = IEEE80211_TX_RC_MCS | group->flags; + if (rtscts) + rate->flags |= IEEE80211_TX_RC_USE_RTS_CTS; + rate->idx = index % MCS_GROUP_RATES + (group->streams - 1) * MCS_GROUP_RATES; +} + +static inline int +minstrel_get_duration(int index) +{ + const struct mcs_group *group = &minstrel_mcs_groups[index / MCS_GROUP_RATES]; + return group->duration[index % MCS_GROUP_RATES]; +} + +static int +minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) +{ + struct minstrel_rate_stats *mr; + struct minstrel_mcs_group_data *mg; + int sample_idx = 0; + + if (mi->sample_wait > 0) { + mi->sample_wait--; + return -1; + } + + if 
(!mi->sample_tries) + return -1; + + mi->sample_tries--; + mg = &mi->groups[mi->sample_group]; + sample_idx = sample_table[mg->column][mg->index]; + mr = &mg->rates[sample_idx]; + sample_idx += mi->sample_group * MCS_GROUP_RATES; + minstrel_next_sample_idx(mi); + + /* + * When not using MRR, do not sample if the probability is already + * higher than 95% to avoid wasting airtime + */ + if (!mp->has_mrr && (mr->probability > MINSTREL_FRAC(95, 100))) + return -1; + + /* + * Make sure that lower rates get sampled only occasionally, + * if the link is working perfectly. + */ + if (minstrel_get_duration(sample_idx) > + minstrel_get_duration(mi->max_tp_rate)) { + if (mr->sample_skipped < 20) + return -1; + + if (mi->sample_slow++ > 2) + return -1; + } + + return sample_idx; +} + +static void +minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, + struct ieee80211_tx_rate_control *txrc) +{ + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb); + struct ieee80211_tx_rate *ar = info->status.rates; + struct minstrel_ht_sta_priv *msp = priv_sta; + struct minstrel_ht_sta *mi = &msp->ht; + struct minstrel_priv *mp = priv; + int sample_idx; + bool sample = false; + + if (rate_control_send_low(sta, priv_sta, txrc)) + return; + + if (!msp->is_ht) + return mac80211_minstrel.get_rate(priv, sta, &msp->legacy, txrc); + + info->flags |= mi->tx_flags; + sample_idx = minstrel_get_sample_rate(mp, mi); + if (sample_idx >= 0) { + sample = true; + minstrel_ht_set_rate(mp, mi, &ar[0], sample_idx, + txrc, true, false); + info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE; + } else { + minstrel_ht_set_rate(mp, mi, &ar[0], mi->max_tp_rate, + txrc, false, false); + } + + if (mp->hw->max_rates >= 3) { + /* + * At least 3 tx rates supported, use + * sample_rate -> max_tp_rate -> max_prob_rate for sampling and + * max_tp_rate -> max_tp_rate2 -> max_prob_rate by default. + */ + if (sample_idx >= 0) + minstrel_ht_set_rate(mp, mi, &ar[1], mi->max_tp_rate, + txrc, false, false); + else + minstrel_ht_set_rate(mp, mi, &ar[1], mi->max_tp_rate2, + txrc, false, true); + + minstrel_ht_set_rate(mp, mi, &ar[2], mi->max_prob_rate, + txrc, false, !sample); + + ar[3].count = 0; + ar[3].idx = -1; + } else if (mp->hw->max_rates == 2) { + /* + * Only 2 tx rates supported, use + * sample_rate -> max_prob_rate for sampling and + * max_tp_rate -> max_prob_rate by default. 
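+ * (Whatever the configuration, the entry after the last rate that gets
+ * filled in is terminated with idx = -1 and count = 0, so the driver
+ * knows where the retry chain ends.)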
+ */ + minstrel_ht_set_rate(mp, mi, &ar[1], mi->max_prob_rate, + txrc, false, !sample); + + ar[2].count = 0; + ar[2].idx = -1; + } else { + /* Not using MRR, only use the first rate */ + ar[1].count = 0; + ar[1].idx = -1; + } + + mi->total_packets++; + + /* wraparound */ + if (mi->total_packets == ~0) { + mi->total_packets = 0; + mi->sample_packets = 0; + } +} + +static void +minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, + struct ieee80211_sta *sta, void *priv_sta, + enum nl80211_channel_type oper_chan_type) +{ + struct minstrel_priv *mp = priv; + struct minstrel_ht_sta_priv *msp = priv_sta; + struct minstrel_ht_sta *mi = &msp->ht; + struct ieee80211_mcs_info *mcs = &sta->ht_cap.mcs; + struct ieee80211_local *local = hw_to_local(mp->hw); + u16 sta_cap = sta->ht_cap.cap; + int n_supported = 0; + int ack_dur; + int stbc; + int i; + + /* fall back to the old minstrel for legacy stations */ + if (!sta->ht_cap.ht_supported) + goto use_legacy; + + BUILD_BUG_ON(ARRAY_SIZE(minstrel_mcs_groups) != + MINSTREL_MAX_STREAMS * MINSTREL_STREAM_GROUPS); + + msp->is_ht = true; + memset(mi, 0, sizeof(*mi)); + mi->stats_update = jiffies; + + ack_dur = ieee80211_frame_duration(local, 10, 60, 1, 1); + mi->overhead = ieee80211_frame_duration(local, 0, 60, 1, 1) + ack_dur; + mi->overhead_rtscts = mi->overhead + 2 * ack_dur; + + mi->avg_ampdu_len = MINSTREL_FRAC(1, 1); + + /* When using MRR, sample more on the first attempt, without delay */ + if (mp->has_mrr) { + mi->sample_count = 16; + mi->sample_wait = 0; + } else { + mi->sample_count = 8; + mi->sample_wait = 8; + } + mi->sample_tries = 4; + + stbc = (sta_cap & IEEE80211_HT_CAP_RX_STBC) >> + IEEE80211_HT_CAP_RX_STBC_SHIFT; + mi->tx_flags |= stbc << IEEE80211_TX_CTL_STBC_SHIFT; + + if (sta_cap & IEEE80211_HT_CAP_LDPC_CODING) + mi->tx_flags |= IEEE80211_TX_CTL_LDPC; + + if (oper_chan_type != NL80211_CHAN_HT40MINUS && + oper_chan_type != NL80211_CHAN_HT40PLUS) + sta_cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; + + for (i = 0; i < ARRAY_SIZE(mi->groups); i++) { + u16 req = 0; + + mi->groups[i].supported = 0; + if (minstrel_mcs_groups[i].flags & IEEE80211_TX_RC_SHORT_GI) { + if (minstrel_mcs_groups[i].flags & IEEE80211_TX_RC_40_MHZ_WIDTH) + req |= IEEE80211_HT_CAP_SGI_40; + else + req |= IEEE80211_HT_CAP_SGI_20; + } + + if (minstrel_mcs_groups[i].flags & IEEE80211_TX_RC_40_MHZ_WIDTH) + req |= IEEE80211_HT_CAP_SUP_WIDTH_20_40; + + if ((sta_cap & req) != req) + continue; + + mi->groups[i].supported = + mcs->rx_mask[minstrel_mcs_groups[i].streams - 1]; + + if (mi->groups[i].supported) + n_supported++; + } + + if (!n_supported) + goto use_legacy; + + return; + +use_legacy: + msp->is_ht = false; + memset(&msp->legacy, 0, sizeof(msp->legacy)); + msp->legacy.r = msp->ratelist; + msp->legacy.sample_table = msp->sample_table; + return mac80211_minstrel.rate_init(priv, sband, sta, &msp->legacy); +} + +static void +minstrel_ht_rate_init(void *priv, struct ieee80211_supported_band *sband, + struct ieee80211_sta *sta, void *priv_sta) +{ + struct minstrel_priv *mp = priv; + + minstrel_ht_update_caps(priv, sband, sta, priv_sta, mp->hw->conf.channel_type); +} + +static void +minstrel_ht_rate_update(void *priv, struct ieee80211_supported_band *sband, + struct ieee80211_sta *sta, void *priv_sta, + u32 changed, enum nl80211_channel_type oper_chan_type) +{ + minstrel_ht_update_caps(priv, sband, sta, priv_sta, oper_chan_type); +} + +static void * +minstrel_ht_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp) +{ + struct ieee80211_supported_band 
*sband; + struct minstrel_ht_sta_priv *msp; + struct minstrel_priv *mp = priv; + struct ieee80211_hw *hw = mp->hw; + int max_rates = 0; + int i; + + for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + sband = hw->wiphy->bands[i]; + if (sband && sband->n_bitrates > max_rates) + max_rates = sband->n_bitrates; + } + + msp = kzalloc(sizeof(struct minstrel_ht_sta), gfp); + if (!msp) + return NULL; + + msp->ratelist = kzalloc(sizeof(struct minstrel_rate) * max_rates, gfp); + if (!msp->ratelist) + goto error; + + msp->sample_table = kmalloc(SAMPLE_COLUMNS * max_rates, gfp); + if (!msp->sample_table) + goto error1; + + return msp; + +error1: + kfree(msp->ratelist); +error: + kfree(msp); + return NULL; +} + +static void +minstrel_ht_free_sta(void *priv, struct ieee80211_sta *sta, void *priv_sta) +{ + struct minstrel_ht_sta_priv *msp = priv_sta; + + kfree(msp->sample_table); + kfree(msp->ratelist); + kfree(msp); +} + +static void * +minstrel_ht_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir) +{ + return mac80211_minstrel.alloc(hw, debugfsdir); +} + +static void +minstrel_ht_free(void *priv) +{ + mac80211_minstrel.free(priv); +} + +static struct rate_control_ops mac80211_minstrel_ht = { + .name = "minstrel_ht", + .tx_status = minstrel_ht_tx_status, + .get_rate = minstrel_ht_get_rate, + .rate_init = minstrel_ht_rate_init, + .rate_update = minstrel_ht_rate_update, + .alloc_sta = minstrel_ht_alloc_sta, + .free_sta = minstrel_ht_free_sta, + .alloc = minstrel_ht_alloc, + .free = minstrel_ht_free, +#ifdef CONFIG_MAC80211_DEBUGFS + .add_sta_debugfs = minstrel_ht_add_sta_debugfs, + .remove_sta_debugfs = minstrel_ht_remove_sta_debugfs, +#endif +}; + + +static void +init_sample_table(void) +{ + int col, i, new_idx; + u8 rnd[MCS_GROUP_RATES]; + + memset(sample_table, 0xff, sizeof(sample_table)); + for (col = 0; col < SAMPLE_COLUMNS; col++) { + for (i = 0; i < MCS_GROUP_RATES; i++) { + get_random_bytes(rnd, sizeof(rnd)); + new_idx = (i + rnd[i]) % MCS_GROUP_RATES; + + while (sample_table[col][new_idx] != 0xff) + new_idx = (new_idx + 1) % MCS_GROUP_RATES; + + sample_table[col][new_idx] = i; + } + } +} + +int __init +rc80211_minstrel_ht_init(void) +{ + init_sample_table(); + return ieee80211_rate_control_register(&mac80211_minstrel_ht); +} + +void +rc80211_minstrel_ht_exit(void) +{ + ieee80211_rate_control_unregister(&mac80211_minstrel_ht); +} diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h new file mode 100644 index 00000000..462d2b22 --- /dev/null +++ b/net/mac80211/rc80211_minstrel_ht.h @@ -0,0 +1,130 @@ +/* + * Copyright (C) 2010 Felix Fietkau + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __RC_MINSTREL_HT_H +#define __RC_MINSTREL_HT_H + +/* + * The number of streams can be changed to 2 to reduce code + * size and memory footprint. 
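An illustrative sketch, not code from the patch: the scaled-fraction helpers defined just below, together with the MCS_DURATION arithmetic from rc80211_minstrel_ht.c above, evaluated standalone for one concrete case (one spatial stream, 20 MHz, MCS7, i.e. 260 bits per 4 us symbol, and the 1200-byte average packet size the algorithm assumes).

#include <stdio.h>

#define MINSTREL_SCALE  16
#define MINSTREL_FRAC(val, div)  (((val) << MINSTREL_SCALE) / (div))
#define MINSTREL_TRUNC(val)      ((val) >> MINSTREL_SCALE)

#define AVG_PKT_SIZE    1200
#define MCS_NBITS       (AVG_PKT_SIZE << 3)
#define MCS_NSYMS(bps)  ((MCS_NBITS + (bps) - 1) / (bps))
#define MCS_SYMBOL_TIME(sgi, syms) \
        ((sgi) ? ((syms) * 18 + 4) / 5 : (syms) << 2)

int main(void)
{
        /* the 3/4 probability threshold used by minstrel_ht_update_stats() */
        printf("MINSTREL_FRAC(3, 4) = %u out of %u\n",
               MINSTREL_FRAC(3, 4), 1 << MINSTREL_SCALE);

        /* airtime per average packet at 260 bits/symbol (MCS7, 1x1, 20 MHz) */
        printf("duration: %u us (long GI), %u us (short GI)\n",
               MCS_SYMBOL_TIME(0, MCS_NSYMS(260)),
               MCS_SYMBOL_TIME(1, MCS_NSYMS(260)));
        return 0;
}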
+ */ +#define MINSTREL_MAX_STREAMS 3 +#define MINSTREL_STREAM_GROUPS 4 + +/* scaled fraction values */ +#define MINSTREL_SCALE 16 +#define MINSTREL_FRAC(val, div) (((val) << MINSTREL_SCALE) / div) +#define MINSTREL_TRUNC(val) ((val) >> MINSTREL_SCALE) + +#define MCS_GROUP_RATES 8 + +struct mcs_group { + u32 flags; + unsigned int streams; + unsigned int duration[MCS_GROUP_RATES]; +}; + +extern const struct mcs_group minstrel_mcs_groups[]; + +struct minstrel_rate_stats { + /* current / last sampling period attempts/success counters */ + unsigned int attempts, last_attempts; + unsigned int success, last_success; + + /* total attempts/success counters */ + u64 att_hist, succ_hist; + + /* current throughput */ + unsigned int cur_tp; + + /* packet delivery probabilities */ + unsigned int cur_prob, probability; + + /* maximum retry counts */ + unsigned int retry_count; + unsigned int retry_count_rtscts; + + bool retry_updated; + u8 sample_skipped; +}; + +struct minstrel_mcs_group_data { + u8 index; + u8 column; + + /* bitfield of supported MCS rates of this group */ + u8 supported; + + /* selected primary rates */ + unsigned int max_tp_rate; + unsigned int max_tp_rate2; + unsigned int max_prob_rate; + + /* MCS rate statistics */ + struct minstrel_rate_stats rates[MCS_GROUP_RATES]; +}; + +struct minstrel_ht_sta { + /* ampdu length (average, per sampling interval) */ + unsigned int ampdu_len; + unsigned int ampdu_packets; + + /* ampdu length (EWMA) */ + unsigned int avg_ampdu_len; + + /* best throughput rate */ + unsigned int max_tp_rate; + + /* second best throughput rate */ + unsigned int max_tp_rate2; + + /* best probability rate */ + unsigned int max_prob_rate; + + /* time of last status update */ + unsigned long stats_update; + + /* overhead time in usec for each frame */ + unsigned int overhead; + unsigned int overhead_rtscts; + + unsigned int total_packets; + unsigned int sample_packets; + + /* tx flags to add for frames for this sta */ + u32 tx_flags; + + u8 sample_wait; + u8 sample_tries; + u8 sample_count; + u8 sample_slow; + + /* current MCS group to be sampled */ + u8 sample_group; + + /* MCS rate group info and statistics */ + struct minstrel_mcs_group_data groups[MINSTREL_MAX_STREAMS * MINSTREL_STREAM_GROUPS]; +}; + +struct minstrel_ht_sta_priv { + union { + struct minstrel_ht_sta ht; + struct minstrel_sta_info legacy; + }; +#ifdef CONFIG_MAC80211_DEBUGFS + struct dentry *dbg_stats; +#endif + void *ratelist; + void *sample_table; + bool is_ht; +}; + +void minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir); +void minstrel_ht_remove_sta_debugfs(void *priv, void *priv_sta); + +#endif diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c new file mode 100644 index 00000000..cefcb5d2 --- /dev/null +++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2010 Felix Fietkau + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
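A small aside, not code from the patch: the rc_stats formatting in minstrel_ht_stats_open() below turns the scaled probabilities into tenths of a percent before printing them. Reduced to a standalone sketch with an invented input value:

#include <stdio.h>

#define MINSTREL_SCALE  16
#define MINSTREL_FRAC(val, div)  (((val) << MINSTREL_SCALE) / (div))
#define MINSTREL_TRUNC(val)      ((val) >> MINSTREL_SCALE)

int main(void)
{
        unsigned int probability = MINSTREL_FRAC(874, 1000); /* roughly 87.4% */
        unsigned int eprob = MINSTREL_TRUNC(probability * 1000);

        /* same per-mille, one-decimal style as the debugfs file */
        printf("ewma prob: %u.%u%%\n", eprob / 10, eprob % 10);
        return 0;
}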
+ */ +#include +#include +#include +#include +#include +#include +#include "rc80211_minstrel.h" +#include "rc80211_minstrel_ht.h" + +static int +minstrel_ht_stats_open(struct inode *inode, struct file *file) +{ + struct minstrel_ht_sta_priv *msp = inode->i_private; + struct minstrel_ht_sta *mi = &msp->ht; + struct minstrel_debugfs_info *ms; + unsigned int i, j, tp, prob, eprob; + char *p; + int ret; + + if (!msp->is_ht) { + inode->i_private = &msp->legacy; + ret = minstrel_stats_open(inode, file); + inode->i_private = msp; + return ret; + } + + ms = kmalloc(sizeof(*ms) + 8192, GFP_KERNEL); + if (!ms) + return -ENOMEM; + + file->private_data = ms; + p = ms->buf; + p += sprintf(p, "type rate throughput ewma prob this prob " + "this succ/attempt success attempts\n"); + for (i = 0; i < MINSTREL_MAX_STREAMS * MINSTREL_STREAM_GROUPS; i++) { + char htmode = '2'; + char gimode = 'L'; + + if (!mi->groups[i].supported) + continue; + + if (minstrel_mcs_groups[i].flags & IEEE80211_TX_RC_40_MHZ_WIDTH) + htmode = '4'; + if (minstrel_mcs_groups[i].flags & IEEE80211_TX_RC_SHORT_GI) + gimode = 'S'; + + for (j = 0; j < MCS_GROUP_RATES; j++) { + struct minstrel_rate_stats *mr = &mi->groups[i].rates[j]; + int idx = i * MCS_GROUP_RATES + j; + + if (!(mi->groups[i].supported & BIT(j))) + continue; + + p += sprintf(p, "HT%c0/%cGI ", htmode, gimode); + + *(p++) = (idx == mi->max_tp_rate) ? 'T' : ' '; + *(p++) = (idx == mi->max_tp_rate2) ? 't' : ' '; + *(p++) = (idx == mi->max_prob_rate) ? 'P' : ' '; + p += sprintf(p, "MCS%-2u", (minstrel_mcs_groups[i].streams - 1) * + MCS_GROUP_RATES + j); + + tp = mr->cur_tp / 10; + prob = MINSTREL_TRUNC(mr->cur_prob * 1000); + eprob = MINSTREL_TRUNC(mr->probability * 1000); + + p += sprintf(p, " %6u.%1u %6u.%1u %6u.%1u " + "%3u(%3u) %8llu %8llu\n", + tp / 10, tp % 10, + eprob / 10, eprob % 10, + prob / 10, prob % 10, + mr->last_success, + mr->last_attempts, + (unsigned long long)mr->succ_hist, + (unsigned long long)mr->att_hist); + } + } + p += sprintf(p, "\nTotal packet count:: ideal %d " + "lookaround %d\n", + max(0, (int) mi->total_packets - (int) mi->sample_packets), + mi->sample_packets); + p += sprintf(p, "Average A-MPDU length: %d.%d\n", + MINSTREL_TRUNC(mi->avg_ampdu_len), + MINSTREL_TRUNC(mi->avg_ampdu_len * 10) % 10); + ms->len = p - ms->buf; + + return nonseekable_open(inode, file); +} + +static const struct file_operations minstrel_ht_stat_fops = { + .owner = THIS_MODULE, + .open = minstrel_ht_stats_open, + .read = minstrel_stats_read, + .release = minstrel_stats_release, + .llseek = no_llseek, +}; + +void +minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir) +{ + struct minstrel_ht_sta_priv *msp = priv_sta; + + msp->dbg_stats = debugfs_create_file("rc_stats", S_IRUGO, dir, msp, + &minstrel_ht_stat_fops); +} + +void +minstrel_ht_remove_sta_debugfs(void *priv, void *priv_sta) +{ + struct minstrel_ht_sta_priv *msp = priv_sta; + + debugfs_remove(msp->dbg_stats); +} diff --git a/net/mac80211/rc80211_pid.h b/net/mac80211/rc80211_pid.h new file mode 100644 index 00000000..19111c7b --- /dev/null +++ b/net/mac80211/rc80211_pid.h @@ -0,0 +1,278 @@ +/* + * Copyright 2007, Mattias Nissler + * Copyright 2007, Stefano Brivio + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef RC80211_PID_H +#define RC80211_PID_H + +/* Sampling period for measuring percentage of failed frames in ms. 
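+ * (The default below is 125 ms, i.e. the controller re-evaluates the
+ * rate roughly eight times per second.)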
*/ +#define RC_PID_INTERVAL 125 + +/* Exponential averaging smoothness (used for I part of PID controller) */ +#define RC_PID_SMOOTHING_SHIFT 3 +#define RC_PID_SMOOTHING (1 << RC_PID_SMOOTHING_SHIFT) + +/* Sharpening factor (used for D part of PID controller) */ +#define RC_PID_SHARPENING_FACTOR 0 +#define RC_PID_SHARPENING_DURATION 0 + +/* Fixed point arithmetic shifting amount. */ +#define RC_PID_ARITH_SHIFT 8 + +/* Proportional PID component coefficient. */ +#define RC_PID_COEFF_P 15 +/* Integral PID component coefficient. */ +#define RC_PID_COEFF_I 9 +/* Derivative PID component coefficient. */ +#define RC_PID_COEFF_D 15 + +/* Target failed frames rate for the PID controller. NB: This effectively gives + * maximum failed frames percentage we're willing to accept. If the wireless + * link quality is good, the controller will fail to adjust failed frames + * percentage to the target. This is intentional. + */ +#define RC_PID_TARGET_PF 14 + +/* Rate behaviour normalization quantity over time. */ +#define RC_PID_NORM_OFFSET 3 + +/* Push high rates right after loading. */ +#define RC_PID_FAST_START 0 + +/* Arithmetic right shift for positive and negative values for ISO C. */ +#define RC_PID_DO_ARITH_RIGHT_SHIFT(x, y) \ + ((x) < 0 ? -((-(x)) >> (y)) : (x) >> (y)) + +enum rc_pid_event_type { + RC_PID_EVENT_TYPE_TX_STATUS, + RC_PID_EVENT_TYPE_RATE_CHANGE, + RC_PID_EVENT_TYPE_TX_RATE, + RC_PID_EVENT_TYPE_PF_SAMPLE, +}; + +union rc_pid_event_data { + /* RC_PID_EVENT_TX_STATUS */ + struct { + u32 flags; + struct ieee80211_tx_info tx_status; + }; + /* RC_PID_EVENT_TYPE_RATE_CHANGE */ + /* RC_PID_EVENT_TYPE_TX_RATE */ + struct { + int index; + int rate; + }; + /* RC_PID_EVENT_TYPE_PF_SAMPLE */ + struct { + s32 pf_sample; + s32 prop_err; + s32 int_err; + s32 der_err; + }; +}; + +struct rc_pid_event { + /* The time when the event occurred */ + unsigned long timestamp; + + /* Event ID number */ + unsigned int id; + + /* Type of event */ + enum rc_pid_event_type type; + + /* type specific data */ + union rc_pid_event_data data; +}; + +/* Size of the event ring buffer. */ +#define RC_PID_EVENT_RING_SIZE 32 + +struct rc_pid_event_buffer { + /* Counter that generates event IDs */ + unsigned int ev_count; + + /* Ring buffer of events */ + struct rc_pid_event ring[RC_PID_EVENT_RING_SIZE]; + + /* Index to the entry in events_buf to be reused */ + unsigned int next_entry; + + /* Lock that guards against concurrent access to this buffer struct */ + spinlock_t lock; + + /* Wait queue for poll/select and blocking I/O */ + wait_queue_head_t waitqueue; +}; + +struct rc_pid_events_file_info { + /* The event buffer we read */ + struct rc_pid_event_buffer *events; + + /* The entry we have should read next */ + unsigned int next_entry; +}; + +/** + * struct rc_pid_debugfs_entries - tunable parameters + * + * Algorithm parameters, tunable via debugfs. + * @target: target percentage for failed frames + * @sampling_period: error sampling interval in milliseconds + * @coeff_p: absolute value of the proportional coefficient + * @coeff_i: absolute value of the integral coefficient + * @coeff_d: absolute value of the derivative coefficient + * @smoothing_shift: absolute value of the integral smoothing factor (i.e. + * amount of smoothing introduced by the exponential moving average) + * @sharpen_factor: absolute value of the derivative sharpening factor (i.e. 
+ * amount of emphasis given to the derivative term after low activity + * events) + * @sharpen_duration: duration of the sharpening effect after the detected low + * activity event, relative to sampling_period + * @norm_offset: amount of normalization periodically performed on the learnt + * rate behaviour values (lower means we should trust more what we learnt + * about behaviour of rates, higher means we should trust more the natural + * ordering of rates) + */ +struct rc_pid_debugfs_entries { + struct dentry *target; + struct dentry *sampling_period; + struct dentry *coeff_p; + struct dentry *coeff_i; + struct dentry *coeff_d; + struct dentry *smoothing_shift; + struct dentry *sharpen_factor; + struct dentry *sharpen_duration; + struct dentry *norm_offset; +}; + +void rate_control_pid_event_tx_status(struct rc_pid_event_buffer *buf, + struct ieee80211_tx_info *stat); + +void rate_control_pid_event_rate_change(struct rc_pid_event_buffer *buf, + int index, int rate); + +void rate_control_pid_event_tx_rate(struct rc_pid_event_buffer *buf, + int index, int rate); + +void rate_control_pid_event_pf_sample(struct rc_pid_event_buffer *buf, + s32 pf_sample, s32 prop_err, + s32 int_err, s32 der_err); + +void rate_control_pid_add_sta_debugfs(void *priv, void *priv_sta, + struct dentry *dir); + +void rate_control_pid_remove_sta_debugfs(void *priv, void *priv_sta); + +struct rc_pid_sta_info { + unsigned long last_change; + unsigned long last_sample; + + u32 tx_num_failed; + u32 tx_num_xmit; + + int txrate_idx; + + /* Average failed frames percentage error (i.e. actual vs. target + * percentage), scaled by RC_PID_SMOOTHING. This value is computed + * using using an exponential weighted average technique: + * + * (RC_PID_SMOOTHING - 1) * err_avg_old + err + * err_avg = ------------------------------------------ + * RC_PID_SMOOTHING + * + * where err_avg is the new approximation, err_avg_old the previous one + * and err is the error w.r.t. to the current failed frames percentage + * sample. Note that the bigger RC_PID_SMOOTHING the more weight is + * given to the previous estimate, resulting in smoother behavior (i.e. + * corresponding to a longer integration window). + * + * For computation, we actually don't use the above formula, but this + * one: + * + * err_avg_scaled = err_avg_old_scaled - err_avg_old + err + * + * where: + * err_avg_scaled = err * RC_PID_SMOOTHING + * err_avg_old_scaled = err_avg_old * RC_PID_SMOOTHING + * + * This avoids floating point numbers and the per_failed_old value can + * easily be obtained by shifting per_failed_old_scaled right by + * RC_PID_SMOOTHING_SHIFT. + */ + s32 err_avg_sc; + + /* Last framed failes percentage sample. */ + u32 last_pf; + + /* Sharpening needed. */ + u8 sharp_cnt; + +#ifdef CONFIG_MAC80211_DEBUGFS + /* Event buffer */ + struct rc_pid_event_buffer events; + + /* Events debugfs file entry */ + struct dentry *events_entry; +#endif +}; + +/* Algorithm parameters. We keep them on a per-algorithm approach, so they can + * be tuned individually for each interface. + */ +struct rc_pid_rateinfo { + + /* Map sorted rates to rates in ieee80211_hw_mode. */ + int index; + + /* Map rates in ieee80211_hw_mode to sorted rates. */ + int rev_index; + + /* Did we do any measurement on this rate? */ + bool valid; + + /* Comparison with the lowest rate. */ + int diff; +}; + +struct rc_pid_info { + + /* The failed frames percentage target. */ + unsigned int target; + + /* Rate at which failed frames percentage is sampled in 0.001s. 
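+ * (i.e. the period is stored in milliseconds)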
*/ + unsigned int sampling_period; + + /* P, I and D coefficients. */ + int coeff_p; + int coeff_i; + int coeff_d; + + /* Exponential averaging shift. */ + unsigned int smoothing_shift; + + /* Sharpening factor and duration. */ + unsigned int sharpen_factor; + unsigned int sharpen_duration; + + /* Normalization offset. */ + unsigned int norm_offset; + + /* Rates information. */ + struct rc_pid_rateinfo *rinfo; + + /* Index of the last used rate. */ + int oldrate; + +#ifdef CONFIG_MAC80211_DEBUGFS + /* Debugfs entries created for the parameters above. */ + struct rc_pid_debugfs_entries dentries; +#endif +}; + +#endif /* RC80211_PID_H */ diff --git a/net/mac80211/rc80211_pid_algo.c b/net/mac80211/rc80211_pid_algo.c new file mode 100644 index 00000000..aeda6546 --- /dev/null +++ b/net/mac80211/rc80211_pid_algo.c @@ -0,0 +1,477 @@ +/* + * Copyright 2002-2005, Instant802 Networks, Inc. + * Copyright 2005, Devicescape Software, Inc. + * Copyright 2007, Mattias Nissler + * Copyright 2007-2008, Stefano Brivio + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include "rate.h" +#include "mesh.h" +#include "rc80211_pid.h" + + +/* This is an implementation of a TX rate control algorithm that uses a PID + * controller. Given a target failed frames rate, the controller decides about + * TX rate changes to meet the target failed frames rate. + * + * The controller basically computes the following: + * + * adj = CP * err + CI * err_avg + CD * (err - last_err) * (1 + sharpening) + * + * where + * adj adjustment value that is used to switch TX rate (see below) + * err current error: target vs. current failed frames percentage + * last_err last error + * err_avg average (i.e. poor man's integral) of recent errors + * sharpening non-zero when fast response is needed (i.e. right after + * association or no frames sent for a long time), heading + * to zero over time + * CP Proportional coefficient + * CI Integral coefficient + * CD Derivative coefficient + * + * CP, CI, CD are subject to careful tuning. + * + * The integral component uses a exponential moving average approach instead of + * an actual sliding window. The advantage is that we don't need to keep an + * array of the last N error values and computation is easier. + * + * Once we have the adj value, we map it to a rate by means of a learning + * algorithm. This algorithm keeps the state of the percentual failed frames + * difference between rates. The behaviour of the lowest available rate is kept + * as a reference value, and every time we switch between two rates, we compute + * the difference between the failed frames each rate exhibited. By doing so, + * we compare behaviours which different rates exhibited in adjacent timeslices, + * thus the comparison is minimally affected by external conditions. This + * difference gets propagated to the whole set of measurements, so that the + * reference is always the same. Periodically, we normalize this set so that + * recent events weigh the most. By comparing the adj value with this set, we + * avoid pejorative switches to lower rates and allow for switches to higher + * rates if they behaved well. + * + * Note that for the computations we use a fixed-point representation to avoid + * floating point arithmetic. Hence, all values are shifted left by + * RC_PID_ARITH_SHIFT. 
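+ *
+ * A rough worked example, assuming the default tuning and ignoring the
+ * derivative and sharpening terms: with CP = 15, CI = 9 and
+ * RC_PID_ARITH_SHIFT = 8, a sustained error of err percentage points gives
+ * err_prop = err << 8 and, once the moving average has converged,
+ * err_int == err_prop, so
+ *
+ *	adj = (err_prop * CP + err_int * CI) >> (2 * RC_PID_ARITH_SHIFT)
+ *	    = (err * 256 * 24) >> 16
+ *
+ * which is roughly err * 3/32. A perfect link (pf = 0, err = 14) thus
+ * yields adj = +1 per sampling interval, while a sustained error below
+ * about 11 percentage points is not enough to move the rate by itself.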
+ */ + + +/* Adjust the rate while ensuring that we won't switch to a lower rate if it + * exhibited a worse failed frames behaviour and we'll choose the highest rate + * whose failed frames behaviour is not worse than the one of the original rate + * target. While at it, check that the new rate is valid. */ +static void rate_control_pid_adjust_rate(struct ieee80211_supported_band *sband, + struct ieee80211_sta *sta, + struct rc_pid_sta_info *spinfo, int adj, + struct rc_pid_rateinfo *rinfo) +{ + int cur_sorted, new_sorted, probe, tmp, n_bitrates, band; + int cur = spinfo->txrate_idx; + + band = sband->band; + n_bitrates = sband->n_bitrates; + + /* Map passed arguments to sorted values. */ + cur_sorted = rinfo[cur].rev_index; + new_sorted = cur_sorted + adj; + + /* Check limits. */ + if (new_sorted < 0) + new_sorted = rinfo[0].rev_index; + else if (new_sorted >= n_bitrates) + new_sorted = rinfo[n_bitrates - 1].rev_index; + + tmp = new_sorted; + + if (adj < 0) { + /* Ensure that the rate decrease isn't disadvantageous. */ + for (probe = cur_sorted; probe >= new_sorted; probe--) + if (rinfo[probe].diff <= rinfo[cur_sorted].diff && + rate_supported(sta, band, rinfo[probe].index)) + tmp = probe; + } else { + /* Look for rate increase with zero (or below) cost. */ + for (probe = new_sorted + 1; probe < n_bitrates; probe++) + if (rinfo[probe].diff <= rinfo[new_sorted].diff && + rate_supported(sta, band, rinfo[probe].index)) + tmp = probe; + } + + /* Fit the rate found to the nearest supported rate. */ + do { + if (rate_supported(sta, band, rinfo[tmp].index)) { + spinfo->txrate_idx = rinfo[tmp].index; + break; + } + if (adj < 0) + tmp--; + else + tmp++; + } while (tmp < n_bitrates && tmp >= 0); + +#ifdef CONFIG_MAC80211_DEBUGFS + rate_control_pid_event_rate_change(&spinfo->events, + spinfo->txrate_idx, + sband->bitrates[spinfo->txrate_idx].bitrate); +#endif +} + +/* Normalize the failed frames per-rate differences. */ +static void rate_control_pid_normalize(struct rc_pid_info *pinfo, int l) +{ + int i, norm_offset = pinfo->norm_offset; + struct rc_pid_rateinfo *r = pinfo->rinfo; + + if (r[0].diff > norm_offset) + r[0].diff -= norm_offset; + else if (r[0].diff < -norm_offset) + r[0].diff += norm_offset; + for (i = 0; i < l - 1; i++) + if (r[i + 1].diff > r[i].diff + norm_offset) + r[i + 1].diff -= norm_offset; + else if (r[i + 1].diff <= r[i].diff) + r[i + 1].diff += norm_offset; +} + +static void rate_control_pid_sample(struct rc_pid_info *pinfo, + struct ieee80211_supported_band *sband, + struct ieee80211_sta *sta, + struct rc_pid_sta_info *spinfo) +{ + struct rc_pid_rateinfo *rinfo = pinfo->rinfo; + u32 pf; + s32 err_avg; + u32 err_prop; + u32 err_int; + u32 err_der; + int adj, i, j, tmp; + unsigned long period; + + /* In case nothing happened during the previous control interval, turn + * the sharpening factor on. */ + period = msecs_to_jiffies(pinfo->sampling_period); + if (jiffies - spinfo->last_sample > 2 * period) + spinfo->sharp_cnt = pinfo->sharpen_duration; + + spinfo->last_sample = jiffies; + + /* This should never happen, but in case, we assume the old sample is + * still a good measurement and copy it. */ + if (unlikely(spinfo->tx_num_xmit == 0)) + pf = spinfo->last_pf; + else + pf = spinfo->tx_num_failed * 100 / spinfo->tx_num_xmit; + + spinfo->tx_num_xmit = 0; + spinfo->tx_num_failed = 0; + + /* If we just switched rate, update the rate behaviour info. 
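+ * The learnt values are all kept relative to the lowest rate, so the new
+ * rate inherits the old rate's diff plus the (scaled-down) change in
+ * failed frames percentage observed across the switch.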
*/ + if (pinfo->oldrate != spinfo->txrate_idx) { + + i = rinfo[pinfo->oldrate].rev_index; + j = rinfo[spinfo->txrate_idx].rev_index; + + tmp = (pf - spinfo->last_pf); + tmp = RC_PID_DO_ARITH_RIGHT_SHIFT(tmp, RC_PID_ARITH_SHIFT); + + rinfo[j].diff = rinfo[i].diff + tmp; + pinfo->oldrate = spinfo->txrate_idx; + } + rate_control_pid_normalize(pinfo, sband->n_bitrates); + + /* Compute the proportional, integral and derivative errors. */ + err_prop = (pinfo->target - pf) << RC_PID_ARITH_SHIFT; + + err_avg = spinfo->err_avg_sc >> pinfo->smoothing_shift; + spinfo->err_avg_sc = spinfo->err_avg_sc - err_avg + err_prop; + err_int = spinfo->err_avg_sc >> pinfo->smoothing_shift; + + err_der = (pf - spinfo->last_pf) * + (1 + pinfo->sharpen_factor * spinfo->sharp_cnt); + spinfo->last_pf = pf; + if (spinfo->sharp_cnt) + spinfo->sharp_cnt--; + +#ifdef CONFIG_MAC80211_DEBUGFS + rate_control_pid_event_pf_sample(&spinfo->events, pf, err_prop, err_int, + err_der); +#endif + + /* Compute the controller output. */ + adj = (err_prop * pinfo->coeff_p + err_int * pinfo->coeff_i + + err_der * pinfo->coeff_d); + adj = RC_PID_DO_ARITH_RIGHT_SHIFT(adj, 2 * RC_PID_ARITH_SHIFT); + + /* Change rate. */ + if (adj) + rate_control_pid_adjust_rate(sband, sta, spinfo, adj, rinfo); +} + +static void rate_control_pid_tx_status(void *priv, struct ieee80211_supported_band *sband, + struct ieee80211_sta *sta, void *priv_sta, + struct sk_buff *skb) +{ + struct rc_pid_info *pinfo = priv; + struct rc_pid_sta_info *spinfo = priv_sta; + unsigned long period; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + + if (!spinfo) + return; + + /* Ignore all frames that were sent with a different rate than the rate + * we currently advise mac80211 to use. */ + if (info->status.rates[0].idx != spinfo->txrate_idx) + return; + + spinfo->tx_num_xmit++; + +#ifdef CONFIG_MAC80211_DEBUGFS + rate_control_pid_event_tx_status(&spinfo->events, info); +#endif + + /* We count frames that totally failed to be transmitted as two bad + * frames, those that made it out but had some retries as one good and + * one bad frame. */ + if (!(info->flags & IEEE80211_TX_STAT_ACK)) { + spinfo->tx_num_failed += 2; + spinfo->tx_num_xmit++; + } else if (info->status.rates[0].count > 1) { + spinfo->tx_num_failed++; + spinfo->tx_num_xmit++; + } + + /* Update PID controller state. */ + period = msecs_to_jiffies(pinfo->sampling_period); + if (time_after(jiffies, spinfo->last_sample + period)) + rate_control_pid_sample(pinfo, sband, sta, spinfo); +} + +static void +rate_control_pid_get_rate(void *priv, struct ieee80211_sta *sta, + void *priv_sta, + struct ieee80211_tx_rate_control *txrc) +{ + struct sk_buff *skb = txrc->skb; + struct ieee80211_supported_band *sband = txrc->sband; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct rc_pid_sta_info *spinfo = priv_sta; + int rateidx; + + if (txrc->rts) + info->control.rates[0].count = + txrc->hw->conf.long_frame_max_tx_count; + else + info->control.rates[0].count = + txrc->hw->conf.short_frame_max_tx_count; + + /* Send management frames and NO_ACK data using lowest rate. 
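+ * rate_control_send_low() picks that lowest rate itself and returns true
+ * once it has handled the frame, in which case we are done here.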
*/ + if (rate_control_send_low(sta, priv_sta, txrc)) + return; + + rateidx = spinfo->txrate_idx; + + if (rateidx >= sband->n_bitrates) + rateidx = sband->n_bitrates - 1; + + info->control.rates[0].idx = rateidx; + +#ifdef CONFIG_MAC80211_DEBUGFS + rate_control_pid_event_tx_rate(&spinfo->events, + rateidx, sband->bitrates[rateidx].bitrate); +#endif +} + +static void +rate_control_pid_rate_init(void *priv, struct ieee80211_supported_band *sband, + struct ieee80211_sta *sta, void *priv_sta) +{ + struct rc_pid_sta_info *spinfo = priv_sta; + struct rc_pid_info *pinfo = priv; + struct rc_pid_rateinfo *rinfo = pinfo->rinfo; + int i, j, tmp; + bool s; + + /* TODO: This routine should consider using RSSI from previous packets + * as we need to have IEEE 802.1X auth succeed immediately after assoc.. + * Until that method is implemented, we will use the lowest supported + * rate as a workaround. */ + + /* Sort the rates. This is optimized for the most common case (i.e. + * almost-sorted CCK+OFDM rates). Kind of bubble-sort with reversed + * mapping too. */ + for (i = 0; i < sband->n_bitrates; i++) { + rinfo[i].index = i; + rinfo[i].rev_index = i; + if (RC_PID_FAST_START) + rinfo[i].diff = 0; + else + rinfo[i].diff = i * pinfo->norm_offset; + } + for (i = 1; i < sband->n_bitrates; i++) { + s = 0; + for (j = 0; j < sband->n_bitrates - i; j++) + if (unlikely(sband->bitrates[rinfo[j].index].bitrate > + sband->bitrates[rinfo[j + 1].index].bitrate)) { + tmp = rinfo[j].index; + rinfo[j].index = rinfo[j + 1].index; + rinfo[j + 1].index = tmp; + rinfo[rinfo[j].index].rev_index = j; + rinfo[rinfo[j + 1].index].rev_index = j + 1; + s = 1; + } + if (!s) + break; + } + + spinfo->txrate_idx = rate_lowest_index(sband, sta); +} + +static void *rate_control_pid_alloc(struct ieee80211_hw *hw, + struct dentry *debugfsdir) +{ + struct rc_pid_info *pinfo; + struct rc_pid_rateinfo *rinfo; + struct ieee80211_supported_band *sband; + int i, max_rates = 0; +#ifdef CONFIG_MAC80211_DEBUGFS + struct rc_pid_debugfs_entries *de; +#endif + + pinfo = kmalloc(sizeof(*pinfo), GFP_ATOMIC); + if (!pinfo) + return NULL; + + for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + sband = hw->wiphy->bands[i]; + if (sband && sband->n_bitrates > max_rates) + max_rates = sband->n_bitrates; + } + + rinfo = kmalloc(sizeof(*rinfo) * max_rates, GFP_ATOMIC); + if (!rinfo) { + kfree(pinfo); + return NULL; + } + + pinfo->target = RC_PID_TARGET_PF; + pinfo->sampling_period = RC_PID_INTERVAL; + pinfo->coeff_p = RC_PID_COEFF_P; + pinfo->coeff_i = RC_PID_COEFF_I; + pinfo->coeff_d = RC_PID_COEFF_D; + pinfo->smoothing_shift = RC_PID_SMOOTHING_SHIFT; + pinfo->sharpen_factor = RC_PID_SHARPENING_FACTOR; + pinfo->sharpen_duration = RC_PID_SHARPENING_DURATION; + pinfo->norm_offset = RC_PID_NORM_OFFSET; + pinfo->rinfo = rinfo; + pinfo->oldrate = 0; + +#ifdef CONFIG_MAC80211_DEBUGFS + de = &pinfo->dentries; + de->target = debugfs_create_u32("target_pf", S_IRUSR | S_IWUSR, + debugfsdir, &pinfo->target); + de->sampling_period = debugfs_create_u32("sampling_period", + S_IRUSR | S_IWUSR, debugfsdir, + &pinfo->sampling_period); + de->coeff_p = debugfs_create_u32("coeff_p", S_IRUSR | S_IWUSR, + debugfsdir, (u32 *)&pinfo->coeff_p); + de->coeff_i = debugfs_create_u32("coeff_i", S_IRUSR | S_IWUSR, + debugfsdir, (u32 *)&pinfo->coeff_i); + de->coeff_d = debugfs_create_u32("coeff_d", S_IRUSR | S_IWUSR, + debugfsdir, (u32 *)&pinfo->coeff_d); + de->smoothing_shift = debugfs_create_u32("smoothing_shift", + S_IRUSR | S_IWUSR, debugfsdir, + &pinfo->smoothing_shift); + de->sharpen_factor 
= debugfs_create_u32("sharpen_factor", + S_IRUSR | S_IWUSR, debugfsdir, + &pinfo->sharpen_factor); + de->sharpen_duration = debugfs_create_u32("sharpen_duration", + S_IRUSR | S_IWUSR, debugfsdir, + &pinfo->sharpen_duration); + de->norm_offset = debugfs_create_u32("norm_offset", + S_IRUSR | S_IWUSR, debugfsdir, + &pinfo->norm_offset); +#endif + + return pinfo; +} + +static void rate_control_pid_free(void *priv) +{ + struct rc_pid_info *pinfo = priv; +#ifdef CONFIG_MAC80211_DEBUGFS + struct rc_pid_debugfs_entries *de = &pinfo->dentries; + + debugfs_remove(de->norm_offset); + debugfs_remove(de->sharpen_duration); + debugfs_remove(de->sharpen_factor); + debugfs_remove(de->smoothing_shift); + debugfs_remove(de->coeff_d); + debugfs_remove(de->coeff_i); + debugfs_remove(de->coeff_p); + debugfs_remove(de->sampling_period); + debugfs_remove(de->target); +#endif + + kfree(pinfo->rinfo); + kfree(pinfo); +} + +static void *rate_control_pid_alloc_sta(void *priv, struct ieee80211_sta *sta, + gfp_t gfp) +{ + struct rc_pid_sta_info *spinfo; + + spinfo = kzalloc(sizeof(*spinfo), gfp); + if (spinfo == NULL) + return NULL; + + spinfo->last_sample = jiffies; + +#ifdef CONFIG_MAC80211_DEBUGFS + spin_lock_init(&spinfo->events.lock); + init_waitqueue_head(&spinfo->events.waitqueue); +#endif + + return spinfo; +} + +static void rate_control_pid_free_sta(void *priv, struct ieee80211_sta *sta, + void *priv_sta) +{ + kfree(priv_sta); +} + +static struct rate_control_ops mac80211_rcpid = { + .name = "pid", + .tx_status = rate_control_pid_tx_status, + .get_rate = rate_control_pid_get_rate, + .rate_init = rate_control_pid_rate_init, + .alloc = rate_control_pid_alloc, + .free = rate_control_pid_free, + .alloc_sta = rate_control_pid_alloc_sta, + .free_sta = rate_control_pid_free_sta, +#ifdef CONFIG_MAC80211_DEBUGFS + .add_sta_debugfs = rate_control_pid_add_sta_debugfs, + .remove_sta_debugfs = rate_control_pid_remove_sta_debugfs, +#endif +}; + +int __init rc80211_pid_init(void) +{ + return ieee80211_rate_control_register(&mac80211_rcpid); +} + +void rc80211_pid_exit(void) +{ + ieee80211_rate_control_unregister(&mac80211_rcpid); +} diff --git a/net/mac80211/rc80211_pid_debugfs.c b/net/mac80211/rc80211_pid_debugfs.c new file mode 100644 index 00000000..4851e9e2 --- /dev/null +++ b/net/mac80211/rc80211_pid_debugfs.c @@ -0,0 +1,227 @@ +/* + * Copyright 2007, Mattias Nissler + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "rate.h" + +#include "rc80211_pid.h" + +static void rate_control_pid_event(struct rc_pid_event_buffer *buf, + enum rc_pid_event_type type, + union rc_pid_event_data *data) +{ + struct rc_pid_event *ev; + unsigned long status; + + spin_lock_irqsave(&buf->lock, status); + ev = &(buf->ring[buf->next_entry]); + buf->next_entry = (buf->next_entry + 1) % RC_PID_EVENT_RING_SIZE; + + ev->timestamp = jiffies; + ev->id = buf->ev_count++; + ev->type = type; + ev->data = *data; + + spin_unlock_irqrestore(&buf->lock, status); + + wake_up_all(&buf->waitqueue); +} + +void rate_control_pid_event_tx_status(struct rc_pid_event_buffer *buf, + struct ieee80211_tx_info *stat) +{ + union rc_pid_event_data evd; + + evd.flags = stat->flags; + memcpy(&evd.tx_status, stat, sizeof(struct ieee80211_tx_info)); + rate_control_pid_event(buf, RC_PID_EVENT_TYPE_TX_STATUS, &evd); +} + +void rate_control_pid_event_rate_change(struct rc_pid_event_buffer *buf, + int index, int rate) +{ + union rc_pid_event_data evd; + + evd.index = index; + evd.rate = rate; + rate_control_pid_event(buf, RC_PID_EVENT_TYPE_RATE_CHANGE, &evd); +} + +void rate_control_pid_event_tx_rate(struct rc_pid_event_buffer *buf, + int index, int rate) +{ + union rc_pid_event_data evd; + + evd.index = index; + evd.rate = rate; + rate_control_pid_event(buf, RC_PID_EVENT_TYPE_TX_RATE, &evd); +} + +void rate_control_pid_event_pf_sample(struct rc_pid_event_buffer *buf, + s32 pf_sample, s32 prop_err, + s32 int_err, s32 der_err) +{ + union rc_pid_event_data evd; + + evd.pf_sample = pf_sample; + evd.prop_err = prop_err; + evd.int_err = int_err; + evd.der_err = der_err; + rate_control_pid_event(buf, RC_PID_EVENT_TYPE_PF_SAMPLE, &evd); +} + +static int rate_control_pid_events_open(struct inode *inode, struct file *file) +{ + struct rc_pid_sta_info *sinfo = inode->i_private; + struct rc_pid_event_buffer *events = &sinfo->events; + struct rc_pid_events_file_info *file_info; + unsigned long status; + + /* Allocate a state struct */ + file_info = kmalloc(sizeof(*file_info), GFP_KERNEL); + if (file_info == NULL) + return -ENOMEM; + + spin_lock_irqsave(&events->lock, status); + + file_info->next_entry = events->next_entry; + file_info->events = events; + + spin_unlock_irqrestore(&events->lock, status); + + file->private_data = file_info; + + return 0; +} + +static int rate_control_pid_events_release(struct inode *inode, + struct file *file) +{ + struct rc_pid_events_file_info *file_info = file->private_data; + + kfree(file_info); + + return 0; +} + +static unsigned int rate_control_pid_events_poll(struct file *file, + poll_table *wait) +{ + struct rc_pid_events_file_info *file_info = file->private_data; + + poll_wait(file, &file_info->events->waitqueue, wait); + + return POLLIN | POLLRDNORM; +} + +#define RC_PID_PRINT_BUF_SIZE 64 + +static ssize_t rate_control_pid_events_read(struct file *file, char __user *buf, + size_t length, loff_t *offset) +{ + struct rc_pid_events_file_info *file_info = file->private_data; + struct rc_pid_event_buffer *events = file_info->events; + struct rc_pid_event *ev; + char pb[RC_PID_PRINT_BUF_SIZE]; + int ret; + int p; + unsigned long status; + + /* Check if there is something to read. 
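+ * The reader has caught up once its next_entry matches the writer's; a
+ * non-blocking reader then gets -EAGAIN while a blocking one sleeps on the
+ * waitqueue until rate_control_pid_event() logs something new. Each read()
+ * returns one formatted line such as "42 4294937461 tx_rate 2 110"
+ * (event id, jiffies timestamp, event type and its fields).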
*/ + if (events->next_entry == file_info->next_entry) { + if (file->f_flags & O_NONBLOCK) + return -EAGAIN; + + /* Wait */ + ret = wait_event_interruptible(events->waitqueue, + events->next_entry != file_info->next_entry); + + if (ret) + return ret; + } + + /* Write out one event per call. I don't care whether it's a little + * inefficient, this is debugging code anyway. */ + spin_lock_irqsave(&events->lock, status); + + /* Get an event */ + ev = &(events->ring[file_info->next_entry]); + file_info->next_entry = (file_info->next_entry + 1) % + RC_PID_EVENT_RING_SIZE; + + /* Print information about the event. Note that userspace needs to + * provide large enough buffers. */ + length = length < RC_PID_PRINT_BUF_SIZE ? + length : RC_PID_PRINT_BUF_SIZE; + p = snprintf(pb, length, "%u %lu ", ev->id, ev->timestamp); + switch (ev->type) { + case RC_PID_EVENT_TYPE_TX_STATUS: + p += snprintf(pb + p, length - p, "tx_status %u %u", + !(ev->data.flags & IEEE80211_TX_STAT_ACK), + ev->data.tx_status.status.rates[0].idx); + break; + case RC_PID_EVENT_TYPE_RATE_CHANGE: + p += snprintf(pb + p, length - p, "rate_change %d %d", + ev->data.index, ev->data.rate); + break; + case RC_PID_EVENT_TYPE_TX_RATE: + p += snprintf(pb + p, length - p, "tx_rate %d %d", + ev->data.index, ev->data.rate); + break; + case RC_PID_EVENT_TYPE_PF_SAMPLE: + p += snprintf(pb + p, length - p, + "pf_sample %d %d %d %d", + ev->data.pf_sample, ev->data.prop_err, + ev->data.int_err, ev->data.der_err); + break; + } + p += snprintf(pb + p, length - p, "\n"); + + spin_unlock_irqrestore(&events->lock, status); + + if (copy_to_user(buf, pb, p)) + return -EFAULT; + + return p; +} + +#undef RC_PID_PRINT_BUF_SIZE + +static const struct file_operations rc_pid_fop_events = { + .owner = THIS_MODULE, + .read = rate_control_pid_events_read, + .poll = rate_control_pid_events_poll, + .open = rate_control_pid_events_open, + .release = rate_control_pid_events_release, + .llseek = noop_llseek, +}; + +void rate_control_pid_add_sta_debugfs(void *priv, void *priv_sta, + struct dentry *dir) +{ + struct rc_pid_sta_info *spinfo = priv_sta; + + spinfo->events_entry = debugfs_create_file("rc_pid_events", S_IRUGO, + dir, spinfo, + &rc_pid_fop_events); +} + +void rate_control_pid_remove_sta_debugfs(void *priv, void *priv_sta) +{ + struct rc_pid_sta_info *spinfo = priv_sta; + + debugfs_remove(spinfo->events_entry); +} diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c new file mode 100644 index 00000000..41000650 --- /dev/null +++ b/net/mac80211/rx.c @@ -0,0 +1,2965 @@ +/* + * Copyright 2002-2005, Instant802 Networks, Inc. + * Copyright 2005-2006, Devicescape Software, Inc. + * Copyright 2006-2007 Jiri Benc + * Copyright 2007-2010 Johannes Berg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ieee80211_i.h" +#include "driver-ops.h" +#include "led.h" +#include "mesh.h" +#include "wep.h" +#include "wpa.h" +#include "tkip.h" +#include "wme.h" + +/* + * monitor mode reception + * + * This function cleans up the SKB, i.e. it removes all the stuff + * only useful for monitoring. 
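+ * As implemented below, that currently amounts to trimming the trailing
+ * FCS, and only when the hardware reports that it passes the FCS up to
+ * mac80211 (IEEE80211_HW_RX_INCLUDES_FCS).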
+ */ +static struct sk_buff *remove_monitor_info(struct ieee80211_local *local, + struct sk_buff *skb) +{ + if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS) { + if (likely(skb->len > FCS_LEN)) + __pskb_trim(skb, skb->len - FCS_LEN); + else { + /* driver bug */ + WARN_ON(1); + dev_kfree_skb(skb); + skb = NULL; + } + } + + return skb; +} + +static inline int should_drop_frame(struct sk_buff *skb, + int present_fcs_len) +{ + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + + if (status->flag & (RX_FLAG_FAILED_FCS_CRC | RX_FLAG_FAILED_PLCP_CRC)) + return 1; + if (unlikely(skb->len < 16 + present_fcs_len)) + return 1; + if (ieee80211_is_ctl(hdr->frame_control) && + !ieee80211_is_pspoll(hdr->frame_control) && + !ieee80211_is_back_req(hdr->frame_control)) + return 1; + return 0; +} + +static int +ieee80211_rx_radiotap_len(struct ieee80211_local *local, + struct ieee80211_rx_status *status) +{ + int len; + + /* always present fields */ + len = sizeof(struct ieee80211_radiotap_header) + 9; + + if (status->flag & RX_FLAG_MACTIME_MPDU) + len += 8; + if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) + len += 1; + + if (len & 1) /* padding for RX_FLAGS if necessary */ + len++; + + if (status->flag & RX_FLAG_HT) /* HT info */ + len += 3; + + return len; +} + +/* + * ieee80211_add_rx_radiotap_header - add radiotap header + * + * add a radiotap header containing all the fields which the hardware provided. + */ +static void +ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, + struct sk_buff *skb, + struct ieee80211_rate *rate, + int rtap_len) +{ + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + struct ieee80211_radiotap_header *rthdr; + unsigned char *pos; + u16 rx_flags = 0; + + rthdr = (struct ieee80211_radiotap_header *)skb_push(skb, rtap_len); + memset(rthdr, 0, rtap_len); + + /* radiotap header, set always present flags */ + rthdr->it_present = + cpu_to_le32((1 << IEEE80211_RADIOTAP_FLAGS) | + (1 << IEEE80211_RADIOTAP_CHANNEL) | + (1 << IEEE80211_RADIOTAP_ANTENNA) | + (1 << IEEE80211_RADIOTAP_RX_FLAGS)); + rthdr->it_len = cpu_to_le16(rtap_len); + + pos = (unsigned char *)(rthdr+1); + + /* the order of the following fields is important */ + + /* IEEE80211_RADIOTAP_TSFT */ + if (status->flag & RX_FLAG_MACTIME_MPDU) { + put_unaligned_le64(status->mactime, pos); + rthdr->it_present |= + cpu_to_le32(1 << IEEE80211_RADIOTAP_TSFT); + pos += 8; + } + + /* IEEE80211_RADIOTAP_FLAGS */ + if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS) + *pos |= IEEE80211_RADIOTAP_F_FCS; + if (status->flag & (RX_FLAG_FAILED_FCS_CRC | RX_FLAG_FAILED_PLCP_CRC)) + *pos |= IEEE80211_RADIOTAP_F_BADFCS; + if (status->flag & RX_FLAG_SHORTPRE) + *pos |= IEEE80211_RADIOTAP_F_SHORTPRE; + pos++; + + /* IEEE80211_RADIOTAP_RATE */ + if (!rate || status->flag & RX_FLAG_HT) { + /* + * Without rate information don't add it. If we have, + * MCS information is a separate field in radiotap, + * added below. The byte here is needed as padding + * for the channel though, so initialise it to 0. 
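+ * When a legacy rate is known, the byte is filled in below in the
+ * radiotap unit of 500 kbps: mac80211 keeps bitrates in units of
+ * 100 kbps, so e.g. 11 Mbps is stored as 110 and encoded as 110 / 5 = 22.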
+ */ + *pos = 0; + } else { + rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_RATE); + *pos = rate->bitrate / 5; + } + pos++; + + /* IEEE80211_RADIOTAP_CHANNEL */ + put_unaligned_le16(status->freq, pos); + pos += 2; + if (status->band == IEEE80211_BAND_5GHZ) + put_unaligned_le16(IEEE80211_CHAN_OFDM | IEEE80211_CHAN_5GHZ, + pos); + else if (status->flag & RX_FLAG_HT) + put_unaligned_le16(IEEE80211_CHAN_DYN | IEEE80211_CHAN_2GHZ, + pos); + else if (rate && rate->flags & IEEE80211_RATE_ERP_G) + put_unaligned_le16(IEEE80211_CHAN_OFDM | IEEE80211_CHAN_2GHZ, + pos); + else if (rate) + put_unaligned_le16(IEEE80211_CHAN_CCK | IEEE80211_CHAN_2GHZ, + pos); + else + put_unaligned_le16(IEEE80211_CHAN_2GHZ, pos); + pos += 2; + + /* IEEE80211_RADIOTAP_DBM_ANTSIGNAL */ + if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) { + *pos = status->signal; + rthdr->it_present |= + cpu_to_le32(1 << IEEE80211_RADIOTAP_DBM_ANTSIGNAL); + pos++; + } + + /* IEEE80211_RADIOTAP_LOCK_QUALITY is missing */ + + /* IEEE80211_RADIOTAP_ANTENNA */ + *pos = status->antenna; + pos++; + + /* IEEE80211_RADIOTAP_DB_ANTNOISE is not used */ + + /* IEEE80211_RADIOTAP_RX_FLAGS */ + /* ensure 2 byte alignment for the 2 byte field as required */ + if ((pos - (u8 *)rthdr) & 1) + pos++; + if (status->flag & RX_FLAG_FAILED_PLCP_CRC) + rx_flags |= IEEE80211_RADIOTAP_F_RX_BADPLCP; + put_unaligned_le16(rx_flags, pos); + pos += 2; + + if (status->flag & RX_FLAG_HT) { + rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_MCS); + *pos++ = IEEE80211_RADIOTAP_MCS_HAVE_MCS | + IEEE80211_RADIOTAP_MCS_HAVE_GI | + IEEE80211_RADIOTAP_MCS_HAVE_BW; + *pos = 0; + if (status->flag & RX_FLAG_SHORT_GI) + *pos |= IEEE80211_RADIOTAP_MCS_SGI; + if (status->flag & RX_FLAG_40MHZ) + *pos |= IEEE80211_RADIOTAP_MCS_BW_40; + pos++; + *pos++ = status->rate_idx; + } +} + +/* + * This function copies a received frame to all monitor interfaces and + * returns a cleaned-up SKB that no longer includes the FCS nor the + * radiotap header the driver might have added. + */ +static struct sk_buff * +ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, + struct ieee80211_rate *rate) +{ + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(origskb); + struct ieee80211_sub_if_data *sdata; + int needed_headroom = 0; + struct sk_buff *skb, *skb2; + struct net_device *prev_dev = NULL; + int present_fcs_len = 0; + + /* + * First, we may need to make a copy of the skb because + * (1) we need to modify it for radiotap (if not present), and + * (2) the other RX handlers will modify the skb we got. + * + * We don't need to, of course, if we aren't going to return + * the SKB because it has a bad FCS/PLCP checksum. + */ + + /* room for the radiotap header based on driver features */ + needed_headroom = ieee80211_rx_radiotap_len(local, status); + + if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS) + present_fcs_len = FCS_LEN; + + /* make sure hdr->frame_control is on the linear part */ + if (!pskb_may_pull(origskb, 2)) { + dev_kfree_skb(origskb); + return NULL; + } + + if (!local->monitors) { + if (should_drop_frame(origskb, present_fcs_len)) { + dev_kfree_skb(origskb); + return NULL; + } + + return remove_monitor_info(local, origskb); + } + + if (should_drop_frame(origskb, present_fcs_len)) { + /* only need to expand headroom if necessary */ + skb = origskb; + origskb = NULL; + + /* + * This shouldn't trigger often because most devices have an + * RX header they pull before we get here, and that should + * be big enough for our radiotap information. 
We should + * probably export the length to drivers so that we can have + * them allocate enough headroom to start with. + */ + if (skb_headroom(skb) < needed_headroom && + pskb_expand_head(skb, needed_headroom, 0, GFP_ATOMIC)) { + dev_kfree_skb(skb); + return NULL; + } + } else { + /* + * Need to make a copy and possibly remove radiotap header + * and FCS from the original. + */ + skb = skb_copy_expand(origskb, needed_headroom, 0, GFP_ATOMIC); + + origskb = remove_monitor_info(local, origskb); + + if (!skb) + return origskb; + } + + /* prepend radiotap information */ + ieee80211_add_rx_radiotap_header(local, skb, rate, needed_headroom); + + skb_reset_mac_header(skb); + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->pkt_type = PACKET_OTHERHOST; + skb->protocol = htons(ETH_P_802_2); + + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + if (sdata->vif.type != NL80211_IFTYPE_MONITOR) + continue; + + if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) + continue; + + if (!ieee80211_sdata_running(sdata)) + continue; + + if (prev_dev) { + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) { + skb2->dev = prev_dev; + netif_receive_skb(skb2); + } + } + + prev_dev = sdata->dev; + sdata->dev->stats.rx_packets++; + sdata->dev->stats.rx_bytes += skb->len; + } + + if (prev_dev) { + skb->dev = prev_dev; + netif_receive_skb(skb); + } else + dev_kfree_skb(skb); + + return origskb; +} + + +static void ieee80211_parse_qos(struct ieee80211_rx_data *rx) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); + int tid; + + /* does the frame have a qos control field? */ + if (ieee80211_is_data_qos(hdr->frame_control)) { + u8 *qc = ieee80211_get_qos_ctl(hdr); + /* frame has qos control */ + tid = *qc & IEEE80211_QOS_CTL_TID_MASK; + if (*qc & IEEE80211_QOS_CONTROL_A_MSDU_PRESENT) + status->rx_flags |= IEEE80211_RX_AMSDU; + } else { + /* + * IEEE 802.11-2007, 7.1.3.4.1 ("Sequence Number field"): + * + * Sequence numbers for management frames, QoS data + * frames with a broadcast/multicast address in the + * Address 1 field, and all non-QoS data frames sent + * by QoS STAs are assigned using an additional single + * modulo-4096 counter, [...] + * + * We also use that counter for non-QoS STAs. + */ + tid = NUM_RX_DATA_QUEUES - 1; + } + + rx->queue = tid; + /* Set skb->priority to 1d tag if highest order bit of TID is not set. + * For now, set skb->priority to 0 for other cases. */ + rx->skb->priority = (tid > 7) ? 0 : tid; +} + +/** + * DOC: Packet alignment + * + * Drivers always need to pass packets that are aligned to two-byte boundaries + * to the stack. + * + * Additionally, should, if possible, align the payload data in a way that + * guarantees that the contained IP header is aligned to a four-byte + * boundary. In the case of regular frames, this simply means aligning the + * payload to a four-byte boundary (because either the IP header is directly + * contained, or IV/RFC1042 headers that have a length divisible by four are + * in front of it). If the payload data is not properly aligned and the + * architecture doesn't support efficient unaligned operations, mac80211 + * will align the data. + * + * With A-MSDU frames, however, the payload data address must yield two modulo + * four because there are 14-byte 802.3 headers within the A-MSDU frames that + * push the IP header further back to a multiple of four again. 
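+ * For example, if the A-MSDU payload starts at an address that is 2 modulo
+ * 4, the IP header behind the 14-byte subframe header starts at
+ * 2 + 14 = 16, i.e. 0 modulo 4, and is therefore naturally aligned.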
Thankfully, the + * specs were sane enough this time around to require padding each A-MSDU + * subframe to a length that is a multiple of four. + * + * Padding like Atheros hardware adds which is between the 802.11 header and + * the payload is not supported, the driver is required to move the 802.11 + * header to be directly in front of the payload in that case. + */ +static void ieee80211_verify_alignment(struct ieee80211_rx_data *rx) +{ +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + WARN_ONCE((unsigned long)rx->skb->data & 1, + "unaligned packet at 0x%p\n", rx->skb->data); +#endif +} + + +/* rx handlers */ + +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_passive_scan(struct ieee80211_rx_data *rx) +{ + struct ieee80211_local *local = rx->local; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); + struct sk_buff *skb = rx->skb; + + if (likely(!(status->rx_flags & IEEE80211_RX_IN_SCAN) && + !local->sched_scanning)) + return RX_CONTINUE; + + if (test_bit(SCAN_HW_SCANNING, &local->scanning) || + test_bit(SCAN_SW_SCANNING, &local->scanning) || + local->sched_scanning) + return ieee80211_scan_rx(rx->sdata, skb); + + /* scanning finished during invoking of handlers */ + I802_DEBUG_INC(local->rx_handlers_drop_passive_scan); + return RX_DROP_UNUSABLE; +} + + +static int ieee80211_is_unicast_robust_mgmt_frame(struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + + if (skb->len < 24 || is_multicast_ether_addr(hdr->addr1)) + return 0; + + return ieee80211_is_robust_mgmt_frame(hdr); +} + + +static int ieee80211_is_multicast_robust_mgmt_frame(struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + + if (skb->len < 24 || !is_multicast_ether_addr(hdr->addr1)) + return 0; + + return ieee80211_is_robust_mgmt_frame(hdr); +} + + +/* Get the BIP key index from MMIE; return -1 if this is not a BIP frame */ +static int ieee80211_get_mmie_keyidx(struct sk_buff *skb) +{ + struct ieee80211_mgmt *hdr = (struct ieee80211_mgmt *) skb->data; + struct ieee80211_mmie *mmie; + + if (skb->len < 24 + sizeof(*mmie) || + !is_multicast_ether_addr(hdr->da)) + return -1; + + if (!ieee80211_is_robust_mgmt_frame((struct ieee80211_hdr *) hdr)) + return -1; /* not a robust management frame */ + + mmie = (struct ieee80211_mmie *) + (skb->data + skb->len - sizeof(*mmie)); + if (mmie->element_id != WLAN_EID_MMIE || + mmie->length != sizeof(*mmie) - 2) + return -1; + + return le16_to_cpu(mmie->key_id); +} + + +static ieee80211_rx_result +ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data; + unsigned int hdrlen = ieee80211_hdrlen(hdr->frame_control); + char *dev_addr = rx->sdata->vif.addr; + + if (ieee80211_is_data(hdr->frame_control)) { + if (is_multicast_ether_addr(hdr->addr1)) { + if (ieee80211_has_tods(hdr->frame_control) || + !ieee80211_has_fromds(hdr->frame_control)) + return RX_DROP_MONITOR; + if (memcmp(hdr->addr3, dev_addr, ETH_ALEN) == 0) + return RX_DROP_MONITOR; + } else { + if (!ieee80211_has_a4(hdr->frame_control)) + return RX_DROP_MONITOR; + if (memcmp(hdr->addr4, dev_addr, ETH_ALEN) == 0) + return RX_DROP_MONITOR; + } + } + + /* If there is not an established peer link and this is not a peer link + * establisment frame, beacon or probe, drop the frame. 
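+ * Before the peer link is established, only mesh and self-protected
+ * action frames, probe requests/responses, beacons and authentication
+ * frames are let through, as checked below.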
+ */ + + if (!rx->sta || sta_plink_state(rx->sta) != NL80211_PLINK_ESTAB) { + struct ieee80211_mgmt *mgmt; + + if (!ieee80211_is_mgmt(hdr->frame_control)) + return RX_DROP_MONITOR; + + if (ieee80211_is_action(hdr->frame_control)) { + u8 category; + mgmt = (struct ieee80211_mgmt *)hdr; + category = mgmt->u.action.category; + if (category != WLAN_CATEGORY_MESH_ACTION && + category != WLAN_CATEGORY_SELF_PROTECTED) + return RX_DROP_MONITOR; + return RX_CONTINUE; + } + + if (ieee80211_is_probe_req(hdr->frame_control) || + ieee80211_is_probe_resp(hdr->frame_control) || + ieee80211_is_beacon(hdr->frame_control) || + ieee80211_is_auth(hdr->frame_control)) + return RX_CONTINUE; + + return RX_DROP_MONITOR; + + } + +#define msh_h_get(h, l) ((struct ieee80211s_hdr *) ((u8 *)h + l)) + + if (ieee80211_is_data(hdr->frame_control) && + is_multicast_ether_addr(hdr->addr1) && + mesh_rmc_check(hdr->addr3, msh_h_get(hdr, hdrlen), rx->sdata)) + return RX_DROP_MONITOR; +#undef msh_h_get + + return RX_CONTINUE; +} + +#define SEQ_MODULO 0x1000 +#define SEQ_MASK 0xfff + +static inline int seq_less(u16 sq1, u16 sq2) +{ + return ((sq1 - sq2) & SEQ_MASK) > (SEQ_MODULO >> 1); +} + +static inline u16 seq_inc(u16 sq) +{ + return (sq + 1) & SEQ_MASK; +} + +static inline u16 seq_sub(u16 sq1, u16 sq2) +{ + return (sq1 - sq2) & SEQ_MASK; +} + + +static void ieee80211_release_reorder_frame(struct ieee80211_hw *hw, + struct tid_ampdu_rx *tid_agg_rx, + int index) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct sk_buff *skb = tid_agg_rx->reorder_buf[index]; + struct ieee80211_rx_status *status; + + lockdep_assert_held(&tid_agg_rx->reorder_lock); + + if (!skb) + goto no_frame; + + /* release the frame from the reorder ring buffer */ + tid_agg_rx->stored_mpdu_num--; + tid_agg_rx->reorder_buf[index] = NULL; + status = IEEE80211_SKB_RXCB(skb); + status->rx_flags |= IEEE80211_RX_DEFERRED_RELEASE; + skb_queue_tail(&local->rx_skb_queue, skb); + +no_frame: + tid_agg_rx->head_seq_num = seq_inc(tid_agg_rx->head_seq_num); +} + +static void ieee80211_release_reorder_frames(struct ieee80211_hw *hw, + struct tid_ampdu_rx *tid_agg_rx, + u16 head_seq_num) +{ + int index; + + lockdep_assert_held(&tid_agg_rx->reorder_lock); + + while (seq_less(tid_agg_rx->head_seq_num, head_seq_num)) { + index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % + tid_agg_rx->buf_size; + ieee80211_release_reorder_frame(hw, tid_agg_rx, index); + } +} + +/* + * Timeout (in jiffies) for skb's that are waiting in the RX reorder buffer. If + * the skb was added to the buffer longer than this time ago, the earlier + * frames that have not yet been received are assumed to be lost and the skb + * can be released for processing. This may also release other skb's from the + * reorder buffer if there are no additional gaps between the frames. + * + * Callers must hold tid_agg_rx->reorder_lock. + */ +#define HT_RX_REORDER_BUF_TIMEOUT (HZ / 10) + +static void ieee80211_sta_reorder_release(struct ieee80211_hw *hw, + struct tid_ampdu_rx *tid_agg_rx) +{ + int index, j; + + lockdep_assert_held(&tid_agg_rx->reorder_lock); + + /* release the buffer until next missing frame */ + index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % + tid_agg_rx->buf_size; + if (!tid_agg_rx->reorder_buf[index] && + tid_agg_rx->stored_mpdu_num) { + /* + * No buffers ready to be released, but check whether any + * frames in the reorder buffer have timed out. 
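+ * With HT_RX_REORDER_BUF_TIMEOUT defined as HZ / 10 above, a buffered
+ * frame is force-released after roughly 100 ms.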
+ */ + int skipped = 1; + for (j = (index + 1) % tid_agg_rx->buf_size; j != index; + j = (j + 1) % tid_agg_rx->buf_size) { + if (!tid_agg_rx->reorder_buf[j]) { + skipped++; + continue; + } + if (skipped && + !time_after(jiffies, tid_agg_rx->reorder_time[j] + + HT_RX_REORDER_BUF_TIMEOUT)) + goto set_release_timer; + +#ifdef CONFIG_MAC80211_HT_DEBUG + if (net_ratelimit()) + wiphy_debug(hw->wiphy, + "release an RX reorder frame due to timeout on earlier frames\n"); +#endif + ieee80211_release_reorder_frame(hw, tid_agg_rx, j); + + /* + * Increment the head seq# also for the skipped slots. + */ + tid_agg_rx->head_seq_num = + (tid_agg_rx->head_seq_num + skipped) & SEQ_MASK; + skipped = 0; + } + } else while (tid_agg_rx->reorder_buf[index]) { + ieee80211_release_reorder_frame(hw, tid_agg_rx, index); + index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % + tid_agg_rx->buf_size; + } + + if (tid_agg_rx->stored_mpdu_num) { + j = index = seq_sub(tid_agg_rx->head_seq_num, + tid_agg_rx->ssn) % tid_agg_rx->buf_size; + + for (; j != (index - 1) % tid_agg_rx->buf_size; + j = (j + 1) % tid_agg_rx->buf_size) { + if (tid_agg_rx->reorder_buf[j]) + break; + } + + set_release_timer: + + mod_timer(&tid_agg_rx->reorder_timer, + tid_agg_rx->reorder_time[j] + 1 + + HT_RX_REORDER_BUF_TIMEOUT); + } else { + del_timer(&tid_agg_rx->reorder_timer); + } +} + +/* + * As this function belongs to the RX path it must be under + * rcu_read_lock protection. It returns false if the frame + * can be processed immediately, true if it was consumed. + */ +static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_hw *hw, + struct tid_ampdu_rx *tid_agg_rx, + struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + u16 sc = le16_to_cpu(hdr->seq_ctrl); + u16 mpdu_seq_num = (sc & IEEE80211_SCTL_SEQ) >> 4; + u16 head_seq_num, buf_size; + int index; + bool ret = true; + + spin_lock(&tid_agg_rx->reorder_lock); + + buf_size = tid_agg_rx->buf_size; + head_seq_num = tid_agg_rx->head_seq_num; + + /* frame with out of date sequence number */ + if (seq_less(mpdu_seq_num, head_seq_num)) { + dev_kfree_skb(skb); + goto out; + } + + /* + * If frame the sequence number exceeds our buffering window + * size release some previous frames to make room for this one. + */ + if (!seq_less(mpdu_seq_num, head_seq_num + buf_size)) { + head_seq_num = seq_inc(seq_sub(mpdu_seq_num, buf_size)); + /* release stored frames up to new head to stack */ + ieee80211_release_reorder_frames(hw, tid_agg_rx, head_seq_num); + } + + /* Now the new frame is always in the range of the reordering buffer */ + + index = seq_sub(mpdu_seq_num, tid_agg_rx->ssn) % tid_agg_rx->buf_size; + + /* check if we already stored this frame */ + if (tid_agg_rx->reorder_buf[index]) { + dev_kfree_skb(skb); + goto out; + } + + /* + * If the current MPDU is in the right order and nothing else + * is stored we can process it directly, no need to buffer it. + * If it is first but there's something stored, we may be able + * to release frames after this one. 
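+ * For instance, if head_seq_num is 100 and MPDUs 101 and 102 are already
+ * buffered, storing MPDU 100 lets the release pass below hand 100, 101
+ * and 102 to the stack in one go.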
+ */ + if (mpdu_seq_num == tid_agg_rx->head_seq_num && + tid_agg_rx->stored_mpdu_num == 0) { + tid_agg_rx->head_seq_num = seq_inc(tid_agg_rx->head_seq_num); + ret = false; + goto out; + } + + /* put the frame in the reordering buffer */ + tid_agg_rx->reorder_buf[index] = skb; + tid_agg_rx->reorder_time[index] = jiffies; + tid_agg_rx->stored_mpdu_num++; + ieee80211_sta_reorder_release(hw, tid_agg_rx); + + out: + spin_unlock(&tid_agg_rx->reorder_lock); + return ret; +} + +/* + * Reorder MPDUs from A-MPDUs, keeping them on a buffer. Returns + * true if the MPDU was buffered, false if it should be processed. + */ +static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx) +{ + struct sk_buff *skb = rx->skb; + struct ieee80211_local *local = rx->local; + struct ieee80211_hw *hw = &local->hw; + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct sta_info *sta = rx->sta; + struct tid_ampdu_rx *tid_agg_rx; + u16 sc; + int tid; + + if (!ieee80211_is_data_qos(hdr->frame_control)) + goto dont_reorder; + + /* + * filter the QoS data rx stream according to + * STA/TID and check if this STA/TID is on aggregation + */ + + if (!sta) + goto dont_reorder; + + tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK; + + tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]); + if (!tid_agg_rx) + goto dont_reorder; + + /* qos null data frames are excluded */ + if (unlikely(hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_NULLFUNC))) + goto dont_reorder; + + /* new, potentially un-ordered, ampdu frame - process it */ + + /* reset session timer */ + if (tid_agg_rx->timeout) + mod_timer(&tid_agg_rx->session_timer, + TU_TO_EXP_TIME(tid_agg_rx->timeout)); + + /* if this mpdu is fragmented - terminate rx aggregation session */ + sc = le16_to_cpu(hdr->seq_ctrl); + if (sc & IEEE80211_SCTL_FRAG) { + skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME; + skb_queue_tail(&rx->sdata->skb_queue, skb); + ieee80211_queue_work(&local->hw, &rx->sdata->work); + return; + } + + /* + * No locking needed -- we will only ever process one + * RX packet at a time, and thus own tid_agg_rx. All + * other code manipulating it needs to (and does) make + * sure that we cannot get to it any more before doing + * anything with it. + */ + if (ieee80211_sta_manage_reorder_buf(hw, tid_agg_rx, skb)) + return; + + dont_reorder: + skb_queue_tail(&local->rx_skb_queue, skb); +} + +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_check(struct ieee80211_rx_data *rx) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); + + /* Drop duplicate 802.11 retransmissions (IEEE 802.11 Chap. 9.2.9) */ + if (rx->sta && !is_multicast_ether_addr(hdr->addr1)) { + if (unlikely(ieee80211_has_retry(hdr->frame_control) && + rx->sta->last_seq_ctrl[rx->queue] == + hdr->seq_ctrl)) { + if (status->rx_flags & IEEE80211_RX_RA_MATCH) { + rx->local->dot11FrameDuplicateCount++; + rx->sta->num_duplicates++; + } + return RX_DROP_UNUSABLE; + } else + rx->sta->last_seq_ctrl[rx->queue] = hdr->seq_ctrl; + } + + if (unlikely(rx->skb->len < 16)) { + I802_DEBUG_INC(rx->local->rx_handlers_drop_short); + return RX_DROP_MONITOR; + } + + /* Drop disallowed frame classes based on STA auth/assoc state; + * IEEE 802.11, Chap 5.5. + * + * mac80211 filters only based on association state, i.e. it drops + * Class 3 frames from not associated stations. hostapd sends + * deauth/disassoc frames when needed. 
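+ * Concretely, the check below drops data and PS-Poll frames from stations
+ * that are not (yet) associated, except on IBSS and WDS interfaces.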
In addition, hostapd is + * responsible for filtering on both auth and assoc states. + */ + + if (ieee80211_vif_is_mesh(&rx->sdata->vif)) + return ieee80211_rx_mesh_check(rx); + + if (unlikely((ieee80211_is_data(hdr->frame_control) || + ieee80211_is_pspoll(hdr->frame_control)) && + rx->sdata->vif.type != NL80211_IFTYPE_ADHOC && + rx->sdata->vif.type != NL80211_IFTYPE_WDS && + (!rx->sta || !test_sta_flags(rx->sta, WLAN_STA_ASSOC)))) + return RX_DROP_MONITOR; + + return RX_CONTINUE; +} + + +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) +{ + struct sk_buff *skb = rx->skb; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + int keyidx; + int hdrlen; + ieee80211_rx_result result = RX_DROP_UNUSABLE; + struct ieee80211_key *sta_ptk = NULL; + int mmie_keyidx = -1; + __le16 fc; + + /* + * Key selection 101 + * + * There are four types of keys: + * - GTK (group keys) + * - IGTK (group keys for management frames) + * - PTK (pairwise keys) + * - STK (station-to-station pairwise keys) + * + * When selecting a key, we have to distinguish between multicast + * (including broadcast) and unicast frames, the latter can only + * use PTKs and STKs while the former always use GTKs and IGTKs. + * Unless, of course, actual WEP keys ("pre-RSNA") are used, then + * unicast frames can also use key indices like GTKs. Hence, if we + * don't have a PTK/STK we check the key index for a WEP key. + * + * Note that in a regular BSS, multicast frames are sent by the + * AP only, associated stations unicast the frame to the AP first + * which then multicasts it on their behalf. + * + * There is also a slight problem in IBSS mode: GTKs are negotiated + * with each station, that is something we don't currently handle. + * The spec seems to expect that one negotiates the same key with + * every station but there's no such requirement; VLANs could be + * possible. + */ + + /* + * No point in finding a key and decrypting if the frame is neither + * addressed to us nor a multicast frame. + */ + if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) + return RX_CONTINUE; + + /* start without a key */ + rx->key = NULL; + + if (rx->sta) + sta_ptk = rcu_dereference(rx->sta->ptk); + + fc = hdr->frame_control; + + if (!ieee80211_has_protected(fc)) + mmie_keyidx = ieee80211_get_mmie_keyidx(rx->skb); + + if (!is_multicast_ether_addr(hdr->addr1) && sta_ptk) { + rx->key = sta_ptk; + if ((status->flag & RX_FLAG_DECRYPTED) && + (status->flag & RX_FLAG_IV_STRIPPED)) + return RX_CONTINUE; + /* Skip decryption if the frame is not protected. */ + if (!ieee80211_has_protected(fc)) + return RX_CONTINUE; + } else if (mmie_keyidx >= 0) { + /* Broadcast/multicast robust management frame / BIP */ + if ((status->flag & RX_FLAG_DECRYPTED) && + (status->flag & RX_FLAG_IV_STRIPPED)) + return RX_CONTINUE; + + if (mmie_keyidx < NUM_DEFAULT_KEYS || + mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS) + return RX_DROP_MONITOR; /* unexpected BIP keyidx */ + if (rx->sta) + rx->key = rcu_dereference(rx->sta->gtk[mmie_keyidx]); + if (!rx->key) + rx->key = rcu_dereference(rx->sdata->keys[mmie_keyidx]); + } else if (!ieee80211_has_protected(fc)) { + /* + * The frame was not protected, so skip decryption. However, we + * need to set rx->key if there is a key that could have been + * used so that the frame may be dropped if encryption would + * have been expected. 
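+ * The lookups below try a default management key for multicast management
+ * frames first, then any group key installed for the station, and finally
+ * the interface's default keys.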
+ */ + struct ieee80211_key *key = NULL; + struct ieee80211_sub_if_data *sdata = rx->sdata; + int i; + + if (ieee80211_is_mgmt(fc) && + is_multicast_ether_addr(hdr->addr1) && + (key = rcu_dereference(rx->sdata->default_mgmt_key))) + rx->key = key; + else { + if (rx->sta) { + for (i = 0; i < NUM_DEFAULT_KEYS; i++) { + key = rcu_dereference(rx->sta->gtk[i]); + if (key) + break; + } + } + if (!key) { + for (i = 0; i < NUM_DEFAULT_KEYS; i++) { + key = rcu_dereference(sdata->keys[i]); + if (key) + break; + } + } + if (key) + rx->key = key; + } + return RX_CONTINUE; + } else { + u8 keyid; + /* + * The device doesn't give us the IV so we won't be + * able to look up the key. That's ok though, we + * don't need to decrypt the frame, we just won't + * be able to keep statistics accurate. + * Except for key threshold notifications, should + * we somehow allow the driver to tell us which key + * the hardware used if this flag is set? + */ + if ((status->flag & RX_FLAG_DECRYPTED) && + (status->flag & RX_FLAG_IV_STRIPPED)) + return RX_CONTINUE; + + hdrlen = ieee80211_hdrlen(fc); + + if (rx->skb->len < 8 + hdrlen) + return RX_DROP_UNUSABLE; /* TODO: count this? */ + + /* + * no need to call ieee80211_wep_get_keyidx, + * it verifies a bunch of things we've done already + */ + skb_copy_bits(rx->skb, hdrlen + 3, &keyid, 1); + keyidx = keyid >> 6; + + /* check per-station GTK first, if multicast packet */ + if (is_multicast_ether_addr(hdr->addr1) && rx->sta) + rx->key = rcu_dereference(rx->sta->gtk[keyidx]); + + /* if not found, try default key */ + if (!rx->key) { + rx->key = rcu_dereference(rx->sdata->keys[keyidx]); + + /* + * RSNA-protected unicast frames should always be + * sent with pairwise or station-to-station keys, + * but for WEP we allow using a key index as well. + */ + if (rx->key && + rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP40 && + rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP104 && + !is_multicast_ether_addr(hdr->addr1)) + rx->key = NULL; + } + } + + if (rx->key) { + rx->key->tx_rx_count++; + /* TODO: add threshold stuff again */ + } else { + return RX_DROP_MONITOR; + } + + if (skb_linearize(rx->skb)) + return RX_DROP_UNUSABLE; + /* the hdr variable is invalid now! */ + + switch (rx->key->conf.cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: + /* Check for weak IVs if possible */ + if (rx->sta && ieee80211_is_data(fc) && + (!(status->flag & RX_FLAG_IV_STRIPPED) || + !(status->flag & RX_FLAG_DECRYPTED)) && + ieee80211_wep_is_weak_iv(rx->skb, rx->key)) + rx->sta->wep_weak_iv_count++; + + result = ieee80211_crypto_wep_decrypt(rx); + break; + case WLAN_CIPHER_SUITE_TKIP: + result = ieee80211_crypto_tkip_decrypt(rx); + break; + case WLAN_CIPHER_SUITE_CCMP: + result = ieee80211_crypto_ccmp_decrypt(rx); + break; + case WLAN_CIPHER_SUITE_AES_CMAC: + result = ieee80211_crypto_aes_cmac_decrypt(rx); + break; + default: + /* + * We can reach here only with HW-only algorithms + * but why didn't it decrypt the frame?! 
+ */ + return RX_DROP_UNUSABLE; + } + + /* either the frame has been decrypted or will be dropped */ + status->flag |= RX_FLAG_DECRYPTED; + + return result; +} + +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_check_more_data(struct ieee80211_rx_data *rx) +{ + struct ieee80211_local *local; + struct ieee80211_hdr *hdr; + struct sk_buff *skb; + + local = rx->local; + skb = rx->skb; + hdr = (struct ieee80211_hdr *) skb->data; + + if (!local->pspolling) + return RX_CONTINUE; + + if (!ieee80211_has_fromds(hdr->frame_control)) + /* this is not from AP */ + return RX_CONTINUE; + + if (!ieee80211_is_data(hdr->frame_control)) + return RX_CONTINUE; + + if (!ieee80211_has_moredata(hdr->frame_control)) { + /* AP has no more frames buffered for us */ + local->pspolling = false; + return RX_CONTINUE; + } + + /* more data bit is set, let's request a new frame from the AP */ + ieee80211_send_pspoll(local, rx->sdata); + + return RX_CONTINUE; +} + +static void ap_sta_ps_start(struct sta_info *sta) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_local *local = sdata->local; + + atomic_inc(&sdata->bss->num_sta_ps); + set_sta_flags(sta, WLAN_STA_PS_STA); + if (!(local->hw.flags & IEEE80211_HW_AP_LINK_PS)) + drv_sta_notify(local, sdata, STA_NOTIFY_SLEEP, &sta->sta); +#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG + printk(KERN_DEBUG "%s: STA %pM aid %d enters power save mode\n", + sdata->name, sta->sta.addr, sta->sta.aid); +#endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */ +} + +static void ap_sta_ps_end(struct sta_info *sta) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + + atomic_dec(&sdata->bss->num_sta_ps); + +#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG + printk(KERN_DEBUG "%s: STA %pM aid %d exits power save mode\n", + sdata->name, sta->sta.addr, sta->sta.aid); +#endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */ + + if (test_sta_flags(sta, WLAN_STA_PS_DRIVER)) { +#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG + printk(KERN_DEBUG "%s: STA %pM aid %d driver-ps-blocked\n", + sdata->name, sta->sta.addr, sta->sta.aid); +#endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */ + return; + } + + ieee80211_sta_ps_deliver_wakeup(sta); +} + +int ieee80211_sta_ps_transition(struct ieee80211_sta *sta, bool start) +{ + struct sta_info *sta_inf = container_of(sta, struct sta_info, sta); + bool in_ps; + + WARN_ON(!(sta_inf->local->hw.flags & IEEE80211_HW_AP_LINK_PS)); + + /* Don't let the same PS state be set twice */ + in_ps = test_sta_flags(sta_inf, WLAN_STA_PS_STA); + if ((start && in_ps) || (!start && !in_ps)) + return -EINVAL; + + if (start) + ap_sta_ps_start(sta_inf); + else + ap_sta_ps_end(sta_inf); + + return 0; +} +EXPORT_SYMBOL(ieee80211_sta_ps_transition); + +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) +{ + struct sta_info *sta = rx->sta; + struct sk_buff *skb = rx->skb; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + + if (!sta) + return RX_CONTINUE; + + /* + * Update last_rx only for IBSS packets which are for the current + * BSSID to avoid keeping the current IBSS network alive in cases + * where other STAs start using different BSSID. 
+ */ + if (rx->sdata->vif.type == NL80211_IFTYPE_ADHOC) { + u8 *bssid = ieee80211_get_bssid(hdr, rx->skb->len, + NL80211_IFTYPE_ADHOC); + if (compare_ether_addr(bssid, rx->sdata->u.ibss.bssid) == 0) { + sta->last_rx = jiffies; + if (ieee80211_is_data(hdr->frame_control)) { + sta->last_rx_rate_idx = status->rate_idx; + sta->last_rx_rate_flag = status->flag; + } + } + } else if (!is_multicast_ether_addr(hdr->addr1)) { + /* + * Mesh beacons will update last_rx when if they are found to + * match the current local configuration when processed. + */ + sta->last_rx = jiffies; + if (ieee80211_is_data(hdr->frame_control)) { + sta->last_rx_rate_idx = status->rate_idx; + sta->last_rx_rate_flag = status->flag; + } + } + + if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) + return RX_CONTINUE; + + if (rx->sdata->vif.type == NL80211_IFTYPE_STATION) + ieee80211_sta_rx_notify(rx->sdata, hdr); + + sta->rx_fragments++; + sta->rx_bytes += rx->skb->len; + sta->last_signal = status->signal; + ewma_add(&sta->avg_signal, -status->signal); + + /* + * Change STA power saving mode only at the end of a frame + * exchange sequence. + */ + if (!(sta->local->hw.flags & IEEE80211_HW_AP_LINK_PS) && + !ieee80211_has_morefrags(hdr->frame_control) && + !(status->rx_flags & IEEE80211_RX_DEFERRED_RELEASE) && + (rx->sdata->vif.type == NL80211_IFTYPE_AP || + rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN)) { + if (test_sta_flags(sta, WLAN_STA_PS_STA)) { + /* + * Ignore doze->wake transitions that are + * indicated by non-data frames, the standard + * is unclear here, but for example going to + * PS mode and then scanning would cause a + * doze->wake transition for the probe request, + * and that is clearly undesirable. + */ + if (ieee80211_is_data(hdr->frame_control) && + !ieee80211_has_pm(hdr->frame_control)) + ap_sta_ps_end(sta); + } else { + if (ieee80211_has_pm(hdr->frame_control)) + ap_sta_ps_start(sta); + } + } + + /* + * Drop (qos-)data::nullfunc frames silently, since they + * are used only to control station power saving mode. + */ + if (ieee80211_is_nullfunc(hdr->frame_control) || + ieee80211_is_qos_nullfunc(hdr->frame_control)) { + I802_DEBUG_INC(rx->local->rx_handlers_drop_nullfunc); + + /* + * If we receive a 4-addr nullfunc frame from a STA + * that was not moved to a 4-addr STA vlan yet, drop + * the frame to the monitor interface, to make sure + * that hostapd sees it + */ + if (ieee80211_has_a4(hdr->frame_control) && + (rx->sdata->vif.type == NL80211_IFTYPE_AP || + (rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN && + !rx->sdata->u.vlan.sta))) + return RX_DROP_MONITOR; + /* + * Update counter and free packet here to avoid + * counting this as a dropped packed. 
+ */ + sta->rx_packets++; + dev_kfree_skb(rx->skb); + return RX_QUEUED; + } + + return RX_CONTINUE; +} /* ieee80211_rx_h_sta_process */ + +static inline struct ieee80211_fragment_entry * +ieee80211_reassemble_add(struct ieee80211_sub_if_data *sdata, + unsigned int frag, unsigned int seq, int rx_queue, + struct sk_buff **skb) +{ + struct ieee80211_fragment_entry *entry; + int idx; + + idx = sdata->fragment_next; + entry = &sdata->fragments[sdata->fragment_next++]; + if (sdata->fragment_next >= IEEE80211_FRAGMENT_MAX) + sdata->fragment_next = 0; + + if (!skb_queue_empty(&entry->skb_list)) { +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + struct ieee80211_hdr *hdr = + (struct ieee80211_hdr *) entry->skb_list.next->data; + printk(KERN_DEBUG "%s: RX reassembly removed oldest " + "fragment entry (idx=%d age=%lu seq=%d last_frag=%d " + "addr1=%pM addr2=%pM\n", + sdata->name, idx, + jiffies - entry->first_frag_time, entry->seq, + entry->last_frag, hdr->addr1, hdr->addr2); +#endif + __skb_queue_purge(&entry->skb_list); + } + + __skb_queue_tail(&entry->skb_list, *skb); /* no need for locking */ + *skb = NULL; + entry->first_frag_time = jiffies; + entry->seq = seq; + entry->rx_queue = rx_queue; + entry->last_frag = frag; + entry->ccmp = 0; + entry->extra_len = 0; + + return entry; +} + +static inline struct ieee80211_fragment_entry * +ieee80211_reassemble_find(struct ieee80211_sub_if_data *sdata, + unsigned int frag, unsigned int seq, + int rx_queue, struct ieee80211_hdr *hdr) +{ + struct ieee80211_fragment_entry *entry; + int i, idx; + + idx = sdata->fragment_next; + for (i = 0; i < IEEE80211_FRAGMENT_MAX; i++) { + struct ieee80211_hdr *f_hdr; + + idx--; + if (idx < 0) + idx = IEEE80211_FRAGMENT_MAX - 1; + + entry = &sdata->fragments[idx]; + if (skb_queue_empty(&entry->skb_list) || entry->seq != seq || + entry->rx_queue != rx_queue || + entry->last_frag + 1 != frag) + continue; + + f_hdr = (struct ieee80211_hdr *)entry->skb_list.next->data; + + /* + * Check ftype and addresses are equal, else check next fragment + */ + if (((hdr->frame_control ^ f_hdr->frame_control) & + cpu_to_le16(IEEE80211_FCTL_FTYPE)) || + compare_ether_addr(hdr->addr1, f_hdr->addr1) != 0 || + compare_ether_addr(hdr->addr2, f_hdr->addr2) != 0) + continue; + + if (time_after(jiffies, entry->first_frag_time + 2 * HZ)) { + __skb_queue_purge(&entry->skb_list); + continue; + } + return entry; + } + + return NULL; +} + +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) +{ + struct ieee80211_hdr *hdr; + u16 sc; + __le16 fc; + unsigned int frag, seq; + struct ieee80211_fragment_entry *entry; + struct sk_buff *skb; + struct ieee80211_rx_status *status; + + hdr = (struct ieee80211_hdr *)rx->skb->data; + fc = hdr->frame_control; + sc = le16_to_cpu(hdr->seq_ctrl); + frag = sc & IEEE80211_SCTL_FRAG; + + if (likely((!ieee80211_has_morefrags(fc) && frag == 0) || + (rx->skb)->len < 24 || + is_multicast_ether_addr(hdr->addr1))) { + /* not fragmented */ + goto out; + } + I802_DEBUG_INC(rx->local->rx_handlers_fragments); + + if (skb_linearize(rx->skb)) + return RX_DROP_UNUSABLE; + + /* + * skb_linearize() might change the skb->data and + * previously cached variables (in this case, hdr) need to + * be refreshed with the new data. + */ + hdr = (struct ieee80211_hdr *)rx->skb->data; + seq = (sc & IEEE80211_SCTL_SEQ) >> 4; + + if (frag == 0) { + /* This is the first fragment of a new frame. 
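+ * Allocate a reassembly entry for it and, for CCMP-protected
+ * frames, record the PN so that later fragments can be checked
+ * for sequential PN values.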
*/ + entry = ieee80211_reassemble_add(rx->sdata, frag, seq, + rx->queue, &(rx->skb)); + if (rx->key && rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP && + ieee80211_has_protected(fc)) { + int queue = ieee80211_is_mgmt(fc) ? + NUM_RX_DATA_QUEUES : rx->queue; + /* Store CCMP PN so that we can verify that the next + * fragment has a sequential PN value. */ + entry->ccmp = 1; + memcpy(entry->last_pn, + rx->key->u.ccmp.rx_pn[queue], + CCMP_PN_LEN); + } + return RX_QUEUED; + } + + /* This is a fragment for a frame that should already be pending in + * fragment cache. Add this fragment to the end of the pending entry. + */ + entry = ieee80211_reassemble_find(rx->sdata, frag, seq, rx->queue, hdr); + if (!entry) { + I802_DEBUG_INC(rx->local->rx_handlers_drop_defrag); + return RX_DROP_MONITOR; + } + + /* Verify that MPDUs within one MSDU have sequential PN values. + * (IEEE 802.11i, 8.3.3.4.5) */ + if (entry->ccmp) { + int i; + u8 pn[CCMP_PN_LEN], *rpn; + int queue; + if (!rx->key || rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP) + return RX_DROP_UNUSABLE; + memcpy(pn, entry->last_pn, CCMP_PN_LEN); + for (i = CCMP_PN_LEN - 1; i >= 0; i--) { + pn[i]++; + if (pn[i]) + break; + } + queue = ieee80211_is_mgmt(fc) ? + NUM_RX_DATA_QUEUES : rx->queue; + rpn = rx->key->u.ccmp.rx_pn[queue]; + if (memcmp(pn, rpn, CCMP_PN_LEN)) + return RX_DROP_UNUSABLE; + memcpy(entry->last_pn, pn, CCMP_PN_LEN); + } + + skb_pull(rx->skb, ieee80211_hdrlen(fc)); + __skb_queue_tail(&entry->skb_list, rx->skb); + entry->last_frag = frag; + entry->extra_len += rx->skb->len; + if (ieee80211_has_morefrags(fc)) { + rx->skb = NULL; + return RX_QUEUED; + } + + rx->skb = __skb_dequeue(&entry->skb_list); + if (skb_tailroom(rx->skb) < entry->extra_len) { + I802_DEBUG_INC(rx->local->rx_expand_skb_head2); + if (unlikely(pskb_expand_head(rx->skb, 0, entry->extra_len, + GFP_ATOMIC))) { + I802_DEBUG_INC(rx->local->rx_handlers_drop_defrag); + __skb_queue_purge(&entry->skb_list); + return RX_DROP_UNUSABLE; + } + } + while ((skb = __skb_dequeue(&entry->skb_list))) { + memcpy(skb_put(rx->skb, skb->len), skb->data, skb->len); + dev_kfree_skb(skb); + } + + /* Complete frame has been reassembled - process it now */ + status = IEEE80211_SKB_RXCB(rx->skb); + status->rx_flags |= IEEE80211_RX_FRAGMENTED; + + out: + if (rx->sta) + rx->sta->rx_packets++; + if (is_multicast_ether_addr(hdr->addr1)) + rx->local->dot11MulticastReceivedFrameCount++; + else + ieee80211_led_rx(rx->local); + return RX_CONTINUE; +} + +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_ps_poll(struct ieee80211_rx_data *rx) +{ + struct ieee80211_sub_if_data *sdata = rx->sdata; + __le16 fc = ((struct ieee80211_hdr *)rx->skb->data)->frame_control; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); + + if (likely(!rx->sta || !ieee80211_is_pspoll(fc) || + !(status->rx_flags & IEEE80211_RX_RA_MATCH))) + return RX_CONTINUE; + + if ((sdata->vif.type != NL80211_IFTYPE_AP) && + (sdata->vif.type != NL80211_IFTYPE_AP_VLAN)) + return RX_DROP_UNUSABLE; + + if (!test_sta_flags(rx->sta, WLAN_STA_PS_DRIVER)) + ieee80211_sta_ps_deliver_poll_response(rx->sta); + else + set_sta_flags(rx->sta, WLAN_STA_PSPOLL); + + /* Free PS Poll skb here instead of returning RX_DROP that would + * count as an dropped frame. 
*/ + dev_kfree_skb(rx->skb); + + return RX_QUEUED; +} + +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_remove_qos_control(struct ieee80211_rx_data *rx) +{ + u8 *data = rx->skb->data; + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)data; + + if (!ieee80211_is_data_qos(hdr->frame_control)) + return RX_CONTINUE; + + /* remove the qos control field, update frame type and meta-data */ + memmove(data + IEEE80211_QOS_CTL_LEN, data, + ieee80211_hdrlen(hdr->frame_control) - IEEE80211_QOS_CTL_LEN); + hdr = (struct ieee80211_hdr *)skb_pull(rx->skb, IEEE80211_QOS_CTL_LEN); + /* change frame type to non QOS */ + hdr->frame_control &= ~cpu_to_le16(IEEE80211_STYPE_QOS_DATA); + + return RX_CONTINUE; +} + +static int +ieee80211_802_1x_port_control(struct ieee80211_rx_data *rx) +{ + if (unlikely(!rx->sta || + !test_sta_flags(rx->sta, WLAN_STA_AUTHORIZED))) + return -EACCES; + + return 0; +} + +static int +ieee80211_drop_unencrypted(struct ieee80211_rx_data *rx, __le16 fc) +{ + struct sk_buff *skb = rx->skb; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + + /* + * Pass through unencrypted frames if the hardware has + * decrypted them already. + */ + if (status->flag & RX_FLAG_DECRYPTED) + return 0; + + /* Drop unencrypted frames if key is set. */ + if (unlikely(!ieee80211_has_protected(fc) && + !ieee80211_is_nullfunc(fc) && + ieee80211_is_data(fc) && + (rx->key || rx->sdata->drop_unencrypted))) + return -EACCES; + + return 0; +} + +static int +ieee80211_drop_unencrypted_mgmt(struct ieee80211_rx_data *rx) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); + __le16 fc = hdr->frame_control; + + /* + * Pass through unencrypted frames if the hardware has + * decrypted them already. + */ + if (status->flag & RX_FLAG_DECRYPTED) + return 0; + + if (rx->sta && test_sta_flags(rx->sta, WLAN_STA_MFP)) { + if (unlikely(!ieee80211_has_protected(fc) && + ieee80211_is_unicast_robust_mgmt_frame(rx->skb) && + rx->key)) { + if (ieee80211_is_deauth(fc)) + cfg80211_send_unprot_deauth(rx->sdata->dev, + rx->skb->data, + rx->skb->len); + else if (ieee80211_is_disassoc(fc)) + cfg80211_send_unprot_disassoc(rx->sdata->dev, + rx->skb->data, + rx->skb->len); + return -EACCES; + } + /* BIP does not use Protected field, so need to check MMIE */ + if (unlikely(ieee80211_is_multicast_robust_mgmt_frame(rx->skb) && + ieee80211_get_mmie_keyidx(rx->skb) < 0)) { + if (ieee80211_is_deauth(fc)) + cfg80211_send_unprot_deauth(rx->sdata->dev, + rx->skb->data, + rx->skb->len); + else if (ieee80211_is_disassoc(fc)) + cfg80211_send_unprot_disassoc(rx->sdata->dev, + rx->skb->data, + rx->skb->len); + return -EACCES; + } + /* + * When using MFP, Action frames are not allowed prior to + * having configured keys. 
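+ * Drop robust Action frames that arrive before a key has been
+ * configured.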
+ */ + if (unlikely(ieee80211_is_action(fc) && !rx->key && + ieee80211_is_robust_mgmt_frame( + (struct ieee80211_hdr *) rx->skb->data))) + return -EACCES; + } + + return 0; +} + +static int +__ieee80211_data_to_8023(struct ieee80211_rx_data *rx, bool *port_control) +{ + struct ieee80211_sub_if_data *sdata = rx->sdata; + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data; + bool check_port_control = false; + struct ethhdr *ehdr; + int ret; + + *port_control = false; + if (ieee80211_has_a4(hdr->frame_control) && + sdata->vif.type == NL80211_IFTYPE_AP_VLAN && !sdata->u.vlan.sta) + return -1; + + if (sdata->vif.type == NL80211_IFTYPE_STATION && + !!sdata->u.mgd.use_4addr != !!ieee80211_has_a4(hdr->frame_control)) { + + if (!sdata->u.mgd.use_4addr) + return -1; + else + check_port_control = true; + } + + if (is_multicast_ether_addr(hdr->addr1) && + sdata->vif.type == NL80211_IFTYPE_AP_VLAN && sdata->u.vlan.sta) + return -1; + + ret = ieee80211_data_to_8023(rx->skb, sdata->vif.addr, sdata->vif.type); + if (ret < 0) + return ret; + + ehdr = (struct ethhdr *) rx->skb->data; + if (ehdr->h_proto == rx->sdata->control_port_protocol) + *port_control = true; + else if (check_port_control) + return -1; + + return 0; +} + +/* + * requires that rx->skb is a frame with ethernet header + */ +static bool ieee80211_frame_allowed(struct ieee80211_rx_data *rx, __le16 fc) +{ + static const u8 pae_group_addr[ETH_ALEN] __aligned(2) + = { 0x01, 0x80, 0xC2, 0x00, 0x00, 0x03 }; + struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data; + + /* + * Allow EAPOL frames to us/the PAE group address regardless + * of whether the frame was encrypted or not. + */ + if (ehdr->h_proto == rx->sdata->control_port_protocol && + (compare_ether_addr(ehdr->h_dest, rx->sdata->vif.addr) == 0 || + compare_ether_addr(ehdr->h_dest, pae_group_addr) == 0)) + return true; + + if (ieee80211_802_1x_port_control(rx) || + ieee80211_drop_unencrypted(rx, fc)) + return false; + + return true; +} + +/* + * requires that rx->skb is a frame with ethernet header + */ +static void +ieee80211_deliver_skb(struct ieee80211_rx_data *rx) +{ + struct ieee80211_sub_if_data *sdata = rx->sdata; + struct net_device *dev = sdata->dev; + struct sk_buff *skb, *xmit_skb; + struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data; + struct sta_info *dsta; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); + + skb = rx->skb; + xmit_skb = NULL; + + if ((sdata->vif.type == NL80211_IFTYPE_AP || + sdata->vif.type == NL80211_IFTYPE_AP_VLAN) && + !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) && + (status->rx_flags & IEEE80211_RX_RA_MATCH) && + (sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->u.vlan.sta)) { + if (is_multicast_ether_addr(ehdr->h_dest)) { + /* + * send multicast frames both to higher layers in + * local net stack and back to the wireless medium + */ + xmit_skb = skb_copy(skb, GFP_ATOMIC); + if (!xmit_skb && net_ratelimit()) + printk(KERN_DEBUG "%s: failed to clone " + "multicast frame\n", dev->name); + } else { + dsta = sta_info_get(sdata, skb->data); + if (dsta) { + /* + * The destination station is associated to + * this AP (in this VLAN), so send the frame + * directly to it and do not pass it to local + * net stack. + */ + xmit_skb = skb; + skb = NULL; + } + } + } + + if (skb) { + int align __maybe_unused; + +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + /* + * 'align' will only take the values 0 or 2 here + * since all frames are required to be aligned + * to 2-byte boundaries when being passed to + * mac80211. 
That also explains the __skb_push() + * below. + */ + align = ((unsigned long)(skb->data + sizeof(struct ethhdr))) & 3; + if (align) { + if (WARN_ON(skb_headroom(skb) < 3)) { + dev_kfree_skb(skb); + skb = NULL; + } else { + u8 *data = skb->data; + size_t len = skb_headlen(skb); + skb->data -= align; + memmove(skb->data, data, len); + skb_set_tail_pointer(skb, len); + } + } +#endif + + if (skb) { + /* deliver to local stack */ + skb->protocol = eth_type_trans(skb, dev); + memset(skb->cb, 0, sizeof(skb->cb)); + netif_receive_skb(skb); + } + } + + if (xmit_skb) { + /* send to wireless media */ + xmit_skb->protocol = htons(ETH_P_802_3); + skb_reset_network_header(xmit_skb); + skb_reset_mac_header(xmit_skb); + dev_queue_xmit(xmit_skb); + } +} + +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) +{ + struct net_device *dev = rx->sdata->dev; + struct sk_buff *skb = rx->skb; + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + __le16 fc = hdr->frame_control; + struct sk_buff_head frame_list; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); + + if (unlikely(!ieee80211_is_data(fc))) + return RX_CONTINUE; + + if (unlikely(!ieee80211_is_data_present(fc))) + return RX_DROP_MONITOR; + + if (!(status->rx_flags & IEEE80211_RX_AMSDU)) + return RX_CONTINUE; + + if (ieee80211_has_a4(hdr->frame_control) && + rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN && + !rx->sdata->u.vlan.sta) + return RX_DROP_UNUSABLE; + + if (is_multicast_ether_addr(hdr->addr1) && + ((rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN && + rx->sdata->u.vlan.sta) || + (rx->sdata->vif.type == NL80211_IFTYPE_STATION && + rx->sdata->u.mgd.use_4addr))) + return RX_DROP_UNUSABLE; + + skb->dev = dev; + __skb_queue_head_init(&frame_list); + + if (skb_linearize(skb)) + return RX_DROP_UNUSABLE; + + ieee80211_amsdu_to_8023s(skb, &frame_list, dev->dev_addr, + rx->sdata->vif.type, + rx->local->hw.extra_tx_headroom, true); + + while (!skb_queue_empty(&frame_list)) { + rx->skb = __skb_dequeue(&frame_list); + + if (!ieee80211_frame_allowed(rx, fc)) { + dev_kfree_skb(rx->skb); + continue; + } + dev->stats.rx_packets++; + dev->stats.rx_bytes += rx->skb->len; + + ieee80211_deliver_skb(rx); + } + + return RX_QUEUED; +} + +#ifdef CONFIG_MAC80211_MESH +static ieee80211_rx_result +ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) +{ + struct ieee80211_hdr *hdr; + struct ieee80211s_hdr *mesh_hdr; + unsigned int hdrlen; + struct sk_buff *skb = rx->skb, *fwd_skb; + struct ieee80211_local *local = rx->local; + struct ieee80211_sub_if_data *sdata = rx->sdata; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + + hdr = (struct ieee80211_hdr *) skb->data; + hdrlen = ieee80211_hdrlen(hdr->frame_control); + mesh_hdr = (struct ieee80211s_hdr *) (skb->data + hdrlen); + + if (!ieee80211_is_data(hdr->frame_control)) + return RX_CONTINUE; + + if (!mesh_hdr->ttl) + /* illegal frame */ + return RX_DROP_MONITOR; + + if (mesh_hdr->flags & MESH_FLAGS_AE) { + struct mesh_path *mppath; + char *proxied_addr; + char *mpp_addr; + + if (is_multicast_ether_addr(hdr->addr1)) { + mpp_addr = hdr->addr3; + proxied_addr = mesh_hdr->eaddr1; + } else { + mpp_addr = hdr->addr4; + proxied_addr = mesh_hdr->eaddr2; + } + + rcu_read_lock(); + mppath = mpp_path_lookup(proxied_addr, sdata); + if (!mppath) { + mpp_path_add(proxied_addr, mpp_addr, sdata); + } else { + spin_lock_bh(&mppath->state_lock); + if (compare_ether_addr(mppath->mpp, mpp_addr) != 0) + memcpy(mppath->mpp, mpp_addr, ETH_ALEN); + 
spin_unlock_bh(&mppath->state_lock); + } + rcu_read_unlock(); + } + + /* Frame has reached destination. Don't forward */ + if (!is_multicast_ether_addr(hdr->addr1) && + compare_ether_addr(sdata->vif.addr, hdr->addr3) == 0) + return RX_CONTINUE; + + mesh_hdr->ttl--; + + if (status->rx_flags & IEEE80211_RX_RA_MATCH) { + if (!mesh_hdr->ttl) + IEEE80211_IFSTA_MESH_CTR_INC(&rx->sdata->u.mesh, + dropped_frames_ttl); + else { + struct ieee80211_hdr *fwd_hdr; + struct ieee80211_tx_info *info; + + fwd_skb = skb_copy(skb, GFP_ATOMIC); + + if (!fwd_skb && net_ratelimit()) + printk(KERN_DEBUG "%s: failed to clone mesh frame\n", + sdata->name); + if (!fwd_skb) + goto out; + + fwd_hdr = (struct ieee80211_hdr *) fwd_skb->data; + memcpy(fwd_hdr->addr2, sdata->vif.addr, ETH_ALEN); + info = IEEE80211_SKB_CB(fwd_skb); + memset(info, 0, sizeof(*info)); + info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING; + info->control.vif = &rx->sdata->vif; + skb_set_queue_mapping(skb, + ieee80211_select_queue(rx->sdata, fwd_skb)); + ieee80211_set_qos_hdr(local, skb); + if (is_multicast_ether_addr(fwd_hdr->addr1)) + IEEE80211_IFSTA_MESH_CTR_INC(&sdata->u.mesh, + fwded_mcast); + else { + int err; + /* + * Save TA to addr1 to send TA a path error if a + * suitable next hop is not found + */ + memcpy(fwd_hdr->addr1, fwd_hdr->addr2, + ETH_ALEN); + err = mesh_nexthop_lookup(fwd_skb, sdata); + /* Failed to immediately resolve next hop: + * fwded frame was dropped or will be added + * later to the pending skb queue. */ + if (err) + return RX_DROP_MONITOR; + + IEEE80211_IFSTA_MESH_CTR_INC(&sdata->u.mesh, + fwded_unicast); + } + IEEE80211_IFSTA_MESH_CTR_INC(&sdata->u.mesh, + fwded_frames); + ieee80211_add_pending_skb(local, fwd_skb); + } + } + + out: + if (is_multicast_ether_addr(hdr->addr1) || + sdata->dev->flags & IFF_PROMISC) + return RX_CONTINUE; + else + return RX_DROP_MONITOR; +} +#endif + +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_data(struct ieee80211_rx_data *rx) +{ + struct ieee80211_sub_if_data *sdata = rx->sdata; + struct ieee80211_local *local = rx->local; + struct net_device *dev = sdata->dev; + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data; + __le16 fc = hdr->frame_control; + bool port_control; + int err; + + if (unlikely(!ieee80211_is_data(hdr->frame_control))) + return RX_CONTINUE; + + if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) + return RX_DROP_MONITOR; + + /* + * Allow the cooked monitor interface of an AP to see 4-addr frames so + * that a 4-addr station can be detected and moved into a separate VLAN + */ + if (ieee80211_has_a4(hdr->frame_control) && + sdata->vif.type == NL80211_IFTYPE_AP) + return RX_DROP_MONITOR; + + err = __ieee80211_data_to_8023(rx, &port_control); + if (unlikely(err)) + return RX_DROP_UNUSABLE; + + if (!ieee80211_frame_allowed(rx, fc)) + return RX_DROP_MONITOR; + + if (rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN && + unlikely(port_control) && sdata->bss) { + sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, + u.ap); + dev = sdata->dev; + rx->sdata = sdata; + } + + rx->skb->dev = dev; + + dev->stats.rx_packets++; + dev->stats.rx_bytes += rx->skb->len; + + if (local->ps_sdata && local->hw.conf.dynamic_ps_timeout > 0 && + !is_multicast_ether_addr( + ((struct ethhdr *)rx->skb->data)->h_dest) && + (!local->scanning && + !test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state))) { + mod_timer(&local->dynamic_ps_timer, jiffies + + msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout)); + } + + ieee80211_deliver_skb(rx); + + return 
RX_QUEUED; +} + +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx) +{ + struct ieee80211_local *local = rx->local; + struct ieee80211_hw *hw = &local->hw; + struct sk_buff *skb = rx->skb; + struct ieee80211_bar *bar = (struct ieee80211_bar *)skb->data; + struct tid_ampdu_rx *tid_agg_rx; + u16 start_seq_num; + u16 tid; + + if (likely(!ieee80211_is_ctl(bar->frame_control))) + return RX_CONTINUE; + + if (ieee80211_is_back_req(bar->frame_control)) { + struct { + __le16 control, start_seq_num; + } __packed bar_data; + + if (!rx->sta) + return RX_DROP_MONITOR; + + if (skb_copy_bits(skb, offsetof(struct ieee80211_bar, control), + &bar_data, sizeof(bar_data))) + return RX_DROP_MONITOR; + + tid = le16_to_cpu(bar_data.control) >> 12; + + tid_agg_rx = rcu_dereference(rx->sta->ampdu_mlme.tid_rx[tid]); + if (!tid_agg_rx) + return RX_DROP_MONITOR; + + start_seq_num = le16_to_cpu(bar_data.start_seq_num) >> 4; + + /* reset session timer */ + if (tid_agg_rx->timeout) + mod_timer(&tid_agg_rx->session_timer, + TU_TO_EXP_TIME(tid_agg_rx->timeout)); + + spin_lock(&tid_agg_rx->reorder_lock); + /* release stored frames up to start of BAR */ + ieee80211_release_reorder_frames(hw, tid_agg_rx, start_seq_num); + spin_unlock(&tid_agg_rx->reorder_lock); + + kfree_skb(skb); + return RX_QUEUED; + } + + /* + * After this point, we only want management frames, + * so we can drop all remaining control frames to + * cooked monitor interfaces. + */ + return RX_DROP_MONITOR; +} + +static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + size_t len) +{ + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct ieee80211_mgmt *resp; + + if (compare_ether_addr(mgmt->da, sdata->vif.addr) != 0) { + /* Not to own unicast address */ + return; + } + + if (compare_ether_addr(mgmt->sa, sdata->u.mgd.bssid) != 0 || + compare_ether_addr(mgmt->bssid, sdata->u.mgd.bssid) != 0) { + /* Not from the current AP or not associated yet. */ + return; + } + + if (len < 24 + 1 + sizeof(resp->u.action.u.sa_query)) { + /* Too short SA Query request frame */ + return; + } + + skb = dev_alloc_skb(sizeof(*resp) + local->hw.extra_tx_headroom); + if (skb == NULL) + return; + + skb_reserve(skb, local->hw.extra_tx_headroom); + resp = (struct ieee80211_mgmt *) skb_put(skb, 24); + memset(resp, 0, 24); + memcpy(resp->da, mgmt->sa, ETH_ALEN); + memcpy(resp->sa, sdata->vif.addr, ETH_ALEN); + memcpy(resp->bssid, sdata->u.mgd.bssid, ETH_ALEN); + resp->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + skb_put(skb, 1 + sizeof(resp->u.action.u.sa_query)); + resp->u.action.category = WLAN_CATEGORY_SA_QUERY; + resp->u.action.u.sa_query.action = WLAN_ACTION_SA_QUERY_RESPONSE; + memcpy(resp->u.action.u.sa_query.trans_id, + mgmt->u.action.u.sa_query.trans_id, + WLAN_SA_QUERY_TR_ID_LEN); + + ieee80211_tx_skb(sdata, skb); +} + +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) +{ + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); + + /* + * From here on, look only at management frames. + * Data and control frames are already handled, + * and unknown (reserved) frames are useless. 
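+ * Frames shorter than the 24-byte management header are
+ * dropped as well.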
+ */ + if (rx->skb->len < 24) + return RX_DROP_MONITOR; + + if (!ieee80211_is_mgmt(mgmt->frame_control)) + return RX_DROP_MONITOR; + + if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) + return RX_DROP_MONITOR; + + if (ieee80211_drop_unencrypted_mgmt(rx)) + return RX_DROP_UNUSABLE; + + return RX_CONTINUE; +} + +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_action(struct ieee80211_rx_data *rx) +{ + struct ieee80211_local *local = rx->local; + struct ieee80211_sub_if_data *sdata = rx->sdata; + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); + int len = rx->skb->len; + + if (!ieee80211_is_action(mgmt->frame_control)) + return RX_CONTINUE; + + /* drop too small frames */ + if (len < IEEE80211_MIN_ACTION_SIZE) + return RX_DROP_UNUSABLE; + + if (!rx->sta && mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) + return RX_DROP_UNUSABLE; + + if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) + return RX_DROP_UNUSABLE; + + switch (mgmt->u.action.category) { + case WLAN_CATEGORY_BACK: + /* + * The aggregation code is not prepared to handle + * anything but STA/AP due to the BSSID handling; + * IBSS could work in the code but isn't supported + * by drivers or the standard. + */ + if (sdata->vif.type != NL80211_IFTYPE_STATION && + sdata->vif.type != NL80211_IFTYPE_AP_VLAN && + sdata->vif.type != NL80211_IFTYPE_AP) + break; + + /* verify action_code is present */ + if (len < IEEE80211_MIN_ACTION_SIZE + 1) + break; + + switch (mgmt->u.action.u.addba_req.action_code) { + case WLAN_ACTION_ADDBA_REQ: + if (len < (IEEE80211_MIN_ACTION_SIZE + + sizeof(mgmt->u.action.u.addba_req))) + goto invalid; + break; + case WLAN_ACTION_ADDBA_RESP: + if (len < (IEEE80211_MIN_ACTION_SIZE + + sizeof(mgmt->u.action.u.addba_resp))) + goto invalid; + break; + case WLAN_ACTION_DELBA: + if (len < (IEEE80211_MIN_ACTION_SIZE + + sizeof(mgmt->u.action.u.delba))) + goto invalid; + break; + default: + goto invalid; + } + + goto queue; + case WLAN_CATEGORY_SPECTRUM_MGMT: + if (local->hw.conf.channel->band != IEEE80211_BAND_5GHZ) + break; + + if (sdata->vif.type != NL80211_IFTYPE_STATION) + break; + + /* verify action_code is present */ + if (len < IEEE80211_MIN_ACTION_SIZE + 1) + break; + + switch (mgmt->u.action.u.measurement.action_code) { + case WLAN_ACTION_SPCT_MSR_REQ: + if (len < (IEEE80211_MIN_ACTION_SIZE + + sizeof(mgmt->u.action.u.measurement))) + break; + ieee80211_process_measurement_req(sdata, mgmt, len); + goto handled; + case WLAN_ACTION_SPCT_CHL_SWITCH: + if (len < (IEEE80211_MIN_ACTION_SIZE + + sizeof(mgmt->u.action.u.chan_switch))) + break; + + if (sdata->vif.type != NL80211_IFTYPE_STATION) + break; + + if (memcmp(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN)) + break; + + goto queue; + } + break; + case WLAN_CATEGORY_SA_QUERY: + if (len < (IEEE80211_MIN_ACTION_SIZE + + sizeof(mgmt->u.action.u.sa_query))) + break; + + switch (mgmt->u.action.u.sa_query.action) { + case WLAN_ACTION_SA_QUERY_REQUEST: + if (sdata->vif.type != NL80211_IFTYPE_STATION) + break; + ieee80211_process_sa_query_req(sdata, mgmt, len); + goto handled; + } + break; + case WLAN_CATEGORY_MESH_ACTION: + if (!ieee80211_vif_is_mesh(&sdata->vif)) + break; + goto queue; + case WLAN_CATEGORY_MESH_PATH_SEL: + if (!mesh_path_sel_is_hwmp(sdata)) + break; + goto queue; + } + + return RX_CONTINUE; + + invalid: + status->rx_flags |= IEEE80211_RX_MALFORMED_ACTION_FRM; + /* will return in the next handlers */ + return RX_CONTINUE; + + handled: + if (rx->sta) + 
rx->sta->rx_packets++; + dev_kfree_skb(rx->skb); + return RX_QUEUED; + + queue: + rx->skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME; + skb_queue_tail(&sdata->skb_queue, rx->skb); + ieee80211_queue_work(&local->hw, &sdata->work); + if (rx->sta) + rx->sta->rx_packets++; + return RX_QUEUED; +} + +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) +{ + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); + + /* skip known-bad action frames and return them in the next handler */ + if (status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM) + return RX_CONTINUE; + + /* + * Getting here means the kernel doesn't know how to handle + * it, but maybe userspace does ... include returned frames + * so userspace can register for those to know whether ones + * it transmitted were processed or returned. + */ + + if (cfg80211_rx_mgmt(rx->sdata->dev, status->freq, + rx->skb->data, rx->skb->len, + GFP_ATOMIC)) { + if (rx->sta) + rx->sta->rx_packets++; + dev_kfree_skb(rx->skb); + return RX_QUEUED; + } + + + return RX_CONTINUE; +} + +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) +{ + struct ieee80211_local *local = rx->local; + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data; + struct sk_buff *nskb; + struct ieee80211_sub_if_data *sdata = rx->sdata; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); + + if (!ieee80211_is_action(mgmt->frame_control)) + return RX_CONTINUE; + + /* + * For AP mode, hostapd is responsible for handling any action + * frames that we didn't handle, including returning unknown + * ones. For all other modes we will return them to the sender, + * setting the 0x80 bit in the action category, as required by + * 802.11-2007 7.3.1.11. + * Newer versions of hostapd shall also use the management frame + * registration mechanisms, but older ones still use cooked + * monitor interfaces so push all frames there. 
+ */ + if (!(status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM) && + (sdata->vif.type == NL80211_IFTYPE_AP || + sdata->vif.type == NL80211_IFTYPE_AP_VLAN)) + return RX_DROP_MONITOR; + + /* do not return rejected action frames */ + if (mgmt->u.action.category & 0x80) + return RX_DROP_UNUSABLE; + + nskb = skb_copy_expand(rx->skb, local->hw.extra_tx_headroom, 0, + GFP_ATOMIC); + if (nskb) { + struct ieee80211_mgmt *nmgmt = (void *)nskb->data; + + nmgmt->u.action.category |= 0x80; + memcpy(nmgmt->da, nmgmt->sa, ETH_ALEN); + memcpy(nmgmt->sa, rx->sdata->vif.addr, ETH_ALEN); + + memset(nskb->cb, 0, sizeof(nskb->cb)); + + ieee80211_tx_skb(rx->sdata, nskb); + } + dev_kfree_skb(rx->skb); + return RX_QUEUED; +} + +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) +{ + struct ieee80211_sub_if_data *sdata = rx->sdata; + ieee80211_rx_result rxs; + struct ieee80211_mgmt *mgmt = (void *)rx->skb->data; + __le16 stype; + + rxs = ieee80211_work_rx_mgmt(rx->sdata, rx->skb); + if (rxs != RX_CONTINUE) + return rxs; + + stype = mgmt->frame_control & cpu_to_le16(IEEE80211_FCTL_STYPE); + + if (!ieee80211_vif_is_mesh(&sdata->vif) && + sdata->vif.type != NL80211_IFTYPE_ADHOC && + sdata->vif.type != NL80211_IFTYPE_STATION) + return RX_DROP_MONITOR; + + switch (stype) { + case cpu_to_le16(IEEE80211_STYPE_BEACON): + case cpu_to_le16(IEEE80211_STYPE_PROBE_RESP): + /* process for all: mesh, mlme, ibss */ + break; + case cpu_to_le16(IEEE80211_STYPE_DEAUTH): + case cpu_to_le16(IEEE80211_STYPE_DISASSOC): + if (is_multicast_ether_addr(mgmt->da) && + !is_broadcast_ether_addr(mgmt->da)) + return RX_DROP_MONITOR; + + /* process only for station */ + if (sdata->vif.type != NL80211_IFTYPE_STATION) + return RX_DROP_MONITOR; + break; + case cpu_to_le16(IEEE80211_STYPE_PROBE_REQ): + case cpu_to_le16(IEEE80211_STYPE_AUTH): + /* process only for ibss */ + if (sdata->vif.type != NL80211_IFTYPE_ADHOC) + return RX_DROP_MONITOR; + break; + default: + return RX_DROP_MONITOR; + } + + /* queue up frame and kick off work to process it */ + rx->skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME; + skb_queue_tail(&sdata->skb_queue, rx->skb); + ieee80211_queue_work(&rx->local->hw, &sdata->work); + if (rx->sta) + rx->sta->rx_packets++; + + return RX_QUEUED; +} + +/* TODO: use IEEE80211_RX_FRAGMENTED */ +static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx, + struct ieee80211_rate *rate) +{ + struct ieee80211_sub_if_data *sdata; + struct ieee80211_local *local = rx->local; + struct ieee80211_rtap_hdr { + struct ieee80211_radiotap_header hdr; + u8 flags; + u8 rate_or_pad; + __le16 chan_freq; + __le16 chan_flags; + } __packed *rthdr; + struct sk_buff *skb = rx->skb, *skb2; + struct net_device *prev_dev = NULL; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + + /* + * If cooked monitor has been processed already, then + * don't do it again. If not, set the flag. 
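+ * A minimal radiotap header (flags, optional rate, channel)
+ * is prepended before delivering to cooked monitor interfaces.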
+ */ + if (rx->flags & IEEE80211_RX_CMNTR) + goto out_free_skb; + rx->flags |= IEEE80211_RX_CMNTR; + + if (skb_headroom(skb) < sizeof(*rthdr) && + pskb_expand_head(skb, sizeof(*rthdr), 0, GFP_ATOMIC)) + goto out_free_skb; + + rthdr = (void *)skb_push(skb, sizeof(*rthdr)); + memset(rthdr, 0, sizeof(*rthdr)); + rthdr->hdr.it_len = cpu_to_le16(sizeof(*rthdr)); + rthdr->hdr.it_present = + cpu_to_le32((1 << IEEE80211_RADIOTAP_FLAGS) | + (1 << IEEE80211_RADIOTAP_CHANNEL)); + + if (rate) { + rthdr->rate_or_pad = rate->bitrate / 5; + rthdr->hdr.it_present |= + cpu_to_le32(1 << IEEE80211_RADIOTAP_RATE); + } + rthdr->chan_freq = cpu_to_le16(status->freq); + + if (status->band == IEEE80211_BAND_5GHZ) + rthdr->chan_flags = cpu_to_le16(IEEE80211_CHAN_OFDM | + IEEE80211_CHAN_5GHZ); + else + rthdr->chan_flags = cpu_to_le16(IEEE80211_CHAN_DYN | + IEEE80211_CHAN_2GHZ); + + skb_set_mac_header(skb, 0); + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->pkt_type = PACKET_OTHERHOST; + skb->protocol = htons(ETH_P_802_2); + + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + if (!ieee80211_sdata_running(sdata)) + continue; + + if (sdata->vif.type != NL80211_IFTYPE_MONITOR || + !(sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES)) + continue; + + if (prev_dev) { + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) { + skb2->dev = prev_dev; + netif_receive_skb(skb2); + } + } + + prev_dev = sdata->dev; + sdata->dev->stats.rx_packets++; + sdata->dev->stats.rx_bytes += skb->len; + } + + if (prev_dev) { + skb->dev = prev_dev; + netif_receive_skb(skb); + return; + } + + out_free_skb: + dev_kfree_skb(skb); +} + +static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx, + ieee80211_rx_result res) +{ + switch (res) { + case RX_DROP_MONITOR: + I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop); + if (rx->sta) + rx->sta->rx_dropped++; + /* fall through */ + case RX_CONTINUE: { + struct ieee80211_rate *rate = NULL; + struct ieee80211_supported_band *sband; + struct ieee80211_rx_status *status; + + status = IEEE80211_SKB_RXCB((rx->skb)); + + sband = rx->local->hw.wiphy->bands[status->band]; + if (!(status->flag & RX_FLAG_HT)) + rate = &sband->bitrates[status->rate_idx]; + + ieee80211_rx_cooked_monitor(rx, rate); + break; + } + case RX_DROP_UNUSABLE: + I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop); + if (rx->sta) + rx->sta->rx_dropped++; + dev_kfree_skb(rx->skb); + break; + case RX_QUEUED: + I802_DEBUG_INC(rx->sdata->local->rx_handlers_queued); + break; + } +} + +static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx) +{ + ieee80211_rx_result res = RX_DROP_MONITOR; + struct sk_buff *skb; + +#define CALL_RXH(rxh) \ + do { \ + res = rxh(rx); \ + if (res != RX_CONTINUE) \ + goto rxh_next; \ + } while (0); + + spin_lock(&rx->local->rx_skb_queue.lock); + if (rx->local->running_rx_handler) + goto unlock; + + rx->local->running_rx_handler = true; + + while ((skb = __skb_dequeue(&rx->local->rx_skb_queue))) { + spin_unlock(&rx->local->rx_skb_queue.lock); + + /* + * all the other fields are valid across frames + * that belong to an aMPDU since they are on the + * same TID from the same station + */ + rx->skb = skb; + + CALL_RXH(ieee80211_rx_h_decrypt) + CALL_RXH(ieee80211_rx_h_check_more_data) + CALL_RXH(ieee80211_rx_h_sta_process) + CALL_RXH(ieee80211_rx_h_defragment) + CALL_RXH(ieee80211_rx_h_ps_poll) + CALL_RXH(ieee80211_rx_h_michael_mic_verify) + /* must be after MMIC verify so header is counted in MPDU mic */ + CALL_RXH(ieee80211_rx_h_remove_qos_control) + CALL_RXH(ieee80211_rx_h_amsdu) +#ifdef 
CONFIG_MAC80211_MESH + if (ieee80211_vif_is_mesh(&rx->sdata->vif)) + CALL_RXH(ieee80211_rx_h_mesh_fwding); +#endif + CALL_RXH(ieee80211_rx_h_data) + CALL_RXH(ieee80211_rx_h_ctrl); + CALL_RXH(ieee80211_rx_h_mgmt_check) + CALL_RXH(ieee80211_rx_h_action) + CALL_RXH(ieee80211_rx_h_userspace_mgmt) + CALL_RXH(ieee80211_rx_h_action_return) + CALL_RXH(ieee80211_rx_h_mgmt) + + rxh_next: + ieee80211_rx_handlers_result(rx, res); + spin_lock(&rx->local->rx_skb_queue.lock); +#undef CALL_RXH + } + + rx->local->running_rx_handler = false; + + unlock: + spin_unlock(&rx->local->rx_skb_queue.lock); +} + +static void ieee80211_invoke_rx_handlers(struct ieee80211_rx_data *rx) +{ + ieee80211_rx_result res = RX_DROP_MONITOR; + +#define CALL_RXH(rxh) \ + do { \ + res = rxh(rx); \ + if (res != RX_CONTINUE) \ + goto rxh_next; \ + } while (0); + + CALL_RXH(ieee80211_rx_h_passive_scan) + CALL_RXH(ieee80211_rx_h_check) + + ieee80211_rx_reorder_ampdu(rx); + + ieee80211_rx_handlers(rx); + return; + + rxh_next: + ieee80211_rx_handlers_result(rx, res); + +#undef CALL_RXH +} + +/* + * This function makes calls into the RX path, therefore + * it has to be invoked under RCU read lock. + */ +void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) +{ + struct ieee80211_rx_data rx = { + .sta = sta, + .sdata = sta->sdata, + .local = sta->local, + .queue = tid, + .flags = 0, + }; + struct tid_ampdu_rx *tid_agg_rx; + + tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]); + if (!tid_agg_rx) + return; + + spin_lock(&tid_agg_rx->reorder_lock); + ieee80211_sta_reorder_release(&sta->local->hw, tid_agg_rx); + spin_unlock(&tid_agg_rx->reorder_lock); + + ieee80211_rx_handlers(&rx); +} + +/* main receive path */ + +static int prepare_for_handlers(struct ieee80211_rx_data *rx, + struct ieee80211_hdr *hdr) +{ + struct ieee80211_sub_if_data *sdata = rx->sdata; + struct sk_buff *skb = rx->skb; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + u8 *bssid = ieee80211_get_bssid(hdr, skb->len, sdata->vif.type); + int multicast = is_multicast_ether_addr(hdr->addr1); + + switch (sdata->vif.type) { + case NL80211_IFTYPE_STATION: + if (!bssid && !sdata->u.mgd.use_4addr) + return 0; + if (!multicast && + compare_ether_addr(sdata->vif.addr, hdr->addr1) != 0) { + if (!(sdata->dev->flags & IFF_PROMISC) || + sdata->u.mgd.use_4addr) + return 0; + status->rx_flags &= ~IEEE80211_RX_RA_MATCH; + } + break; + case NL80211_IFTYPE_ADHOC: + if (!bssid) + return 0; + if (ieee80211_is_beacon(hdr->frame_control)) { + return 1; + } + else if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid)) { + if (!(status->rx_flags & IEEE80211_RX_IN_SCAN)) + return 0; + status->rx_flags &= ~IEEE80211_RX_RA_MATCH; + } else if (!multicast && + compare_ether_addr(sdata->vif.addr, + hdr->addr1) != 0) { + if (!(sdata->dev->flags & IFF_PROMISC)) + return 0; + status->rx_flags &= ~IEEE80211_RX_RA_MATCH; + } else if (!rx->sta) { + int rate_idx; + if (status->flag & RX_FLAG_HT) + rate_idx = 0; /* TODO: HT rates */ + else + rate_idx = status->rate_idx; + rx->sta = ieee80211_ibss_add_sta(sdata, bssid, + hdr->addr2, BIT(rate_idx), GFP_ATOMIC); + } + break; + case NL80211_IFTYPE_MESH_POINT: + if (!multicast && + compare_ether_addr(sdata->vif.addr, + hdr->addr1) != 0) { + if (!(sdata->dev->flags & IFF_PROMISC)) + return 0; + + status->rx_flags &= ~IEEE80211_RX_RA_MATCH; + } + break; + case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_AP: + if (!bssid) { + if (compare_ether_addr(sdata->vif.addr, + hdr->addr1)) + return 0; + } else if 
(!ieee80211_bssid_match(bssid, + sdata->vif.addr)) { + if (!(status->rx_flags & IEEE80211_RX_IN_SCAN) && + !ieee80211_is_beacon(hdr->frame_control)) + return 0; + status->rx_flags &= ~IEEE80211_RX_RA_MATCH; + } + break; + case NL80211_IFTYPE_WDS: + if (bssid || !ieee80211_is_data(hdr->frame_control)) + return 0; + if (compare_ether_addr(sdata->u.wds.remote_addr, hdr->addr2)) + return 0; + break; + default: + /* should never get here */ + WARN_ON(1); + break; + } + + return 1; +} + +/* + * This function returns whether or not the SKB + * was destined for RX processing or not, which, + * if consume is true, is equivalent to whether + * or not the skb was consumed. + */ +static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx, + struct sk_buff *skb, bool consume) +{ + struct ieee80211_local *local = rx->local; + struct ieee80211_sub_if_data *sdata = rx->sdata; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + struct ieee80211_hdr *hdr = (void *)skb->data; + int prepares; + + rx->skb = skb; + status->rx_flags |= IEEE80211_RX_RA_MATCH; + prepares = prepare_for_handlers(rx, hdr); + + if (!prepares) + return false; + + if (!consume) { + skb = skb_copy(skb, GFP_ATOMIC); + if (!skb) { + if (net_ratelimit()) + wiphy_debug(local->hw.wiphy, + "failed to copy skb for %s\n", + sdata->name); + return true; + } + + rx->skb = skb; + } + + ieee80211_invoke_rx_handlers(rx); + return true; +} + +/* + * This is the actual Rx frames handler. as it blongs to Rx path it must + * be called with rcu_read_lock protection. + */ +static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, + struct sk_buff *skb) +{ + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_sub_if_data *sdata; + struct ieee80211_hdr *hdr; + __le16 fc; + struct ieee80211_rx_data rx; + struct ieee80211_sub_if_data *prev; + struct sta_info *sta, *tmp, *prev_sta; + int err = 0; + + fc = ((struct ieee80211_hdr *)skb->data)->frame_control; + memset(&rx, 0, sizeof(rx)); + rx.skb = skb; + rx.local = local; + + if (ieee80211_is_data(fc) || ieee80211_is_mgmt(fc)) + local->dot11ReceivedFragmentCount++; + + if (unlikely(test_bit(SCAN_HW_SCANNING, &local->scanning) || + test_bit(SCAN_SW_SCANNING, &local->scanning))) + status->rx_flags |= IEEE80211_RX_IN_SCAN; + + if (ieee80211_is_mgmt(fc)) + err = skb_linearize(skb); + else + err = !pskb_may_pull(skb, ieee80211_hdrlen(fc)); + + if (err) { + dev_kfree_skb(skb); + return; + } + + hdr = (struct ieee80211_hdr *)skb->data; + ieee80211_parse_qos(&rx); + ieee80211_verify_alignment(&rx); + + if (ieee80211_is_data(fc)) { + prev_sta = NULL; + + for_each_sta_info(local, hdr->addr2, sta, tmp) { + if (!prev_sta) { + prev_sta = sta; + continue; + } + + rx.sta = prev_sta; + rx.sdata = prev_sta->sdata; + ieee80211_prepare_and_rx_handle(&rx, skb, false); + + prev_sta = sta; + } + + if (prev_sta) { + rx.sta = prev_sta; + rx.sdata = prev_sta->sdata; + + if (ieee80211_prepare_and_rx_handle(&rx, skb, true)) + return; + goto out; + } + } + + prev = NULL; + + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + if (!ieee80211_sdata_running(sdata)) + continue; + + if (sdata->vif.type == NL80211_IFTYPE_MONITOR || + sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + continue; + + /* + * frame is destined for this interface, but if it's + * not also for the previous one we handle that after + * the loop to avoid copying the SKB once too much + */ + + if (!prev) { + prev = sdata; + continue; + } + + rx.sta = 
sta_info_get_bss(prev, hdr->addr2); + rx.sdata = prev; + ieee80211_prepare_and_rx_handle(&rx, skb, false); + + prev = sdata; + } + + if (prev) { + rx.sta = sta_info_get_bss(prev, hdr->addr2); + rx.sdata = prev; + + if (ieee80211_prepare_and_rx_handle(&rx, skb, true)) + return; + } + + out: + dev_kfree_skb(skb); +} + +/* + * This is the receive path handler. It is called by a low level driver when an + * 802.11 MPDU is received from the hardware. + */ +void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_rate *rate = NULL; + struct ieee80211_supported_band *sband; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + + WARN_ON_ONCE(softirq_count() == 0); + + if (WARN_ON(status->band < 0 || + status->band >= IEEE80211_NUM_BANDS)) + goto drop; + + sband = local->hw.wiphy->bands[status->band]; + if (WARN_ON(!sband)) + goto drop; + + /* + * If we're suspending, it is possible although not too likely + * that we'd be receiving frames after having already partially + * quiesced the stack. We can't process such frames then since + * that might, for example, cause stations to be added or other + * driver callbacks be invoked. + */ + if (unlikely(local->quiescing || local->suspended)) + goto drop; + + /* + * The same happens when we're not even started, + * but that's worth a warning. + */ + if (WARN_ON(!local->started)) + goto drop; + + if (likely(!(status->flag & RX_FLAG_FAILED_PLCP_CRC))) { + /* + * Validate the rate, unless a PLCP error means that + * we probably can't have a valid rate here anyway. + */ + + if (status->flag & RX_FLAG_HT) { + /* + * rate_idx is MCS index, which can be [0-76] + * as documented on: + * + * http://wireless.kernel.org/en/developers/Documentation/ieee80211/802.11n + * + * Anything else would be some sort of driver or + * hardware error. The driver should catch hardware + * errors. + */ + if (WARN((status->rate_idx < 0 || + status->rate_idx > 76), + "Rate marked as an HT rate but passed " + "status->rate_idx is not " + "an MCS index [0-76]: %d (0x%02x)\n", + status->rate_idx, + status->rate_idx)) + goto drop; + } else { + if (WARN_ON(status->rate_idx < 0 || + status->rate_idx >= sband->n_bitrates)) + goto drop; + rate = &sband->bitrates[status->rate_idx]; + } + } + + status->rx_flags = 0; + + /* + * key references and virtual interfaces are protected using RCU + * and this requires that we are in a read-side RCU section during + * receive processing + */ + rcu_read_lock(); + + /* + * Frames with failed FCS/PLCP checksum are not returned, + * all other frames are returned without radiotap header + * if it was previously present. + * Also, frames with less than 16 bytes are dropped. + */ + skb = ieee80211_rx_monitor(local, skb, rate); + if (!skb) { + rcu_read_unlock(); + return; + } + + ieee80211_tpt_led_trig_rx(local, + ((struct ieee80211_hdr *)skb->data)->frame_control, + skb->len); + __ieee80211_rx_handle_packet(hw, skb); + + rcu_read_unlock(); + + return; + drop: + kfree_skb(skb); +} +EXPORT_SYMBOL(ieee80211_rx); + +/* This is a version of the rx handler that can be called from hard irq + * context. 
Post the skb on the queue and schedule the tasklet */ +void ieee80211_rx_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb) +{ + struct ieee80211_local *local = hw_to_local(hw); + + BUILD_BUG_ON(sizeof(struct ieee80211_rx_status) > sizeof(skb->cb)); + + skb->pkt_type = IEEE80211_RX_MSG; + skb_queue_tail(&local->skb_queue, skb); + tasklet_schedule(&local->tasklet); +} +EXPORT_SYMBOL(ieee80211_rx_irqsafe); diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c new file mode 100644 index 00000000..669d2e32 --- /dev/null +++ b/net/mac80211/scan.c @@ -0,0 +1,977 @@ +/* + * Scanning implementation + * + * Copyright 2003, Jouni Malinen + * Copyright 2004, Instant802 Networks, Inc. + * Copyright 2005, Devicescape Software, Inc. + * Copyright 2006-2007 Jiri Benc + * Copyright 2007, Michael Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +#include "ieee80211_i.h" +#include "driver-ops.h" +#include "mesh.h" + +#define IEEE80211_PROBE_DELAY (HZ / 33) +#define IEEE80211_CHANNEL_TIME (HZ / 33) +#define IEEE80211_PASSIVE_CHANNEL_TIME (HZ / 8) + +struct ieee80211_bss * +ieee80211_rx_bss_get(struct ieee80211_local *local, u8 *bssid, int freq, + u8 *ssid, u8 ssid_len) +{ + struct cfg80211_bss *cbss; + + cbss = cfg80211_get_bss(local->hw.wiphy, + ieee80211_get_channel(local->hw.wiphy, freq), + bssid, ssid, ssid_len, 0, 0); + if (!cbss) + return NULL; + return (void *)cbss->priv; +} + +static void ieee80211_rx_bss_free(struct cfg80211_bss *cbss) +{ + struct ieee80211_bss *bss = (void *)cbss->priv; + + kfree(bss_mesh_id(bss)); + kfree(bss_mesh_cfg(bss)); +} + +void ieee80211_rx_bss_put(struct ieee80211_local *local, + struct ieee80211_bss *bss) +{ + if (!bss) + return; + cfg80211_put_bss(container_of((void *)bss, struct cfg80211_bss, priv)); +} + +static bool is_uapsd_supported(struct ieee802_11_elems *elems) +{ + u8 qos_info; + + if (elems->wmm_info && elems->wmm_info_len == 7 + && elems->wmm_info[5] == 1) + qos_info = elems->wmm_info[6]; + else if (elems->wmm_param && elems->wmm_param_len == 24 + && elems->wmm_param[5] == 1) + qos_info = elems->wmm_param[6]; + else + /* no valid wmm information or parameter element found */ + return false; + + return qos_info & IEEE80211_WMM_IE_AP_QOSINFO_UAPSD; +} + +struct ieee80211_bss * +ieee80211_bss_info_update(struct ieee80211_local *local, + struct ieee80211_rx_status *rx_status, + struct ieee80211_mgmt *mgmt, + size_t len, + struct ieee802_11_elems *elems, + struct ieee80211_channel *channel, + bool beacon) +{ + struct cfg80211_bss *cbss; + struct ieee80211_bss *bss; + int clen, srlen; + s32 signal = 0; + + if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) + signal = rx_status->signal * 100; + else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) + signal = (rx_status->signal * 100) / local->hw.max_signal; + + cbss = cfg80211_inform_bss_frame(local->hw.wiphy, channel, + mgmt, len, signal, GFP_ATOMIC); + + if (!cbss) + return NULL; + + cbss->free_priv = ieee80211_rx_bss_free; + bss = (void *)cbss->priv; + + /* save the ERP value so that it is available at association time */ + if (elems->erp_info && elems->erp_info_len >= 1) { + bss->erp_value = elems->erp_info[0]; + bss->has_erp_value = 1; + } + + if (elems->tim) { + struct ieee80211_tim_ie *tim_ie = + (struct ieee80211_tim_ie *)elems->tim; + bss->dtim_period = tim_ie->dtim_period; + } + + /* If the 
beacon had no TIM IE, or it was invalid, use 1 */ + if (beacon && !bss->dtim_period) + bss->dtim_period = 1; + + /* replace old supported rates if we get new values */ + srlen = 0; + if (elems->supp_rates) { + clen = IEEE80211_MAX_SUPP_RATES; + if (clen > elems->supp_rates_len) + clen = elems->supp_rates_len; + memcpy(bss->supp_rates, elems->supp_rates, clen); + srlen += clen; + } + if (elems->ext_supp_rates) { + clen = IEEE80211_MAX_SUPP_RATES - srlen; + if (clen > elems->ext_supp_rates_len) + clen = elems->ext_supp_rates_len; + memcpy(bss->supp_rates + srlen, elems->ext_supp_rates, clen); + srlen += clen; + } + if (srlen) + bss->supp_rates_len = srlen; + + bss->wmm_used = elems->wmm_param || elems->wmm_info; + bss->uapsd_supported = is_uapsd_supported(elems); + + if (!beacon) + bss->last_probe_resp = jiffies; + + return bss; +} + +ieee80211_rx_result +ieee80211_scan_rx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) +{ + struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb); + struct ieee80211_mgmt *mgmt; + struct ieee80211_bss *bss; + u8 *elements; + struct ieee80211_channel *channel; + size_t baselen; + int freq; + __le16 fc; + bool presp, beacon = false; + struct ieee802_11_elems elems; + + if (skb->len < 2) + return RX_DROP_UNUSABLE; + + mgmt = (struct ieee80211_mgmt *) skb->data; + fc = mgmt->frame_control; + + if (ieee80211_is_ctl(fc)) + return RX_CONTINUE; + + if (skb->len < 24) + return RX_CONTINUE; + + presp = ieee80211_is_probe_resp(fc); + if (presp) { + /* ignore ProbeResp to foreign address */ + if (memcmp(mgmt->da, sdata->vif.addr, ETH_ALEN)) + return RX_DROP_MONITOR; + + presp = true; + elements = mgmt->u.probe_resp.variable; + baselen = offsetof(struct ieee80211_mgmt, u.probe_resp.variable); + } else { + beacon = ieee80211_is_beacon(fc); + baselen = offsetof(struct ieee80211_mgmt, u.beacon.variable); + elements = mgmt->u.beacon.variable; + } + + if (!presp && !beacon) + return RX_CONTINUE; + + if (baselen > skb->len) + return RX_DROP_MONITOR; + + ieee802_11_parse_elems(elements, skb->len - baselen, &elems); + + if (elems.ds_params && elems.ds_params_len == 1) + freq = ieee80211_channel_to_frequency(elems.ds_params[0], + rx_status->band); + else + freq = rx_status->freq; + + channel = ieee80211_get_channel(sdata->local->hw.wiphy, freq); + + if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) + return RX_DROP_MONITOR; + + bss = ieee80211_bss_info_update(sdata->local, rx_status, + mgmt, skb->len, &elems, + channel, beacon); + if (bss) + ieee80211_rx_bss_put(sdata->local, bss); + + /* If we are on-operating-channel, and this packet is for the + * current channel, pass the pkt on up the stack so that + * the rest of the stack can make use of it. 
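+ * Otherwise the frame has already been used to update the scan
+ * results and is freed here.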
+ */ + if (ieee80211_cfg_on_oper_channel(sdata->local) + && (channel == sdata->local->oper_channel)) + return RX_CONTINUE; + + dev_kfree_skb(skb); + return RX_QUEUED; +} + +/* return false if no more work */ +static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) +{ + struct cfg80211_scan_request *req = local->scan_req; + enum ieee80211_band band; + int i, ielen, n_chans; + + do { + if (local->hw_scan_band == IEEE80211_NUM_BANDS) + return false; + + band = local->hw_scan_band; + n_chans = 0; + for (i = 0; i < req->n_channels; i++) { + if (req->channels[i]->band == band) { + local->hw_scan_req->channels[n_chans] = + req->channels[i]; + n_chans++; + } + } + + local->hw_scan_band++; + } while (!n_chans); + + local->hw_scan_req->n_channels = n_chans; + + ielen = ieee80211_build_preq_ies(local, (u8 *)local->hw_scan_req->ie, + req->ie, req->ie_len, band, (u32) -1, + 0); + local->hw_scan_req->ie_len = ielen; + + return true; +} + +static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted, + bool was_hw_scan) +{ + struct ieee80211_local *local = hw_to_local(hw); + bool on_oper_chan; + bool enable_beacons = false; + + lockdep_assert_held(&local->mtx); + + /* + * It's ok to abort a not-yet-running scan (that + * we have one at all will be verified by checking + * local->scan_req next), but not to complete it + * successfully. + */ + if (WARN_ON(!local->scanning && !aborted)) + aborted = true; + + if (WARN_ON(!local->scan_req)) + return; + + if (was_hw_scan && !aborted && ieee80211_prep_hw_scan(local)) { + int rc = drv_hw_scan(local, local->scan_sdata, local->hw_scan_req); + if (rc == 0) + return; + } + + kfree(local->hw_scan_req); + local->hw_scan_req = NULL; + + if (local->scan_req != local->int_scan_req) + cfg80211_scan_done(local->scan_req, aborted); + local->scan_req = NULL; + local->scan_sdata = NULL; + + local->scanning = 0; + local->scan_channel = NULL; + + on_oper_chan = ieee80211_cfg_on_oper_channel(local); + + if (was_hw_scan || !on_oper_chan) + ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); + else + /* Set power back to normal operating levels. */ + ieee80211_hw_config(local, 0); + + if (!was_hw_scan) { + bool on_oper_chan2; + ieee80211_configure_filter(local); + drv_sw_scan_complete(local); + on_oper_chan2 = ieee80211_cfg_on_oper_channel(local); + /* We should always be on-channel at this point. */ + WARN_ON(!on_oper_chan2); + if (on_oper_chan2 && (on_oper_chan != on_oper_chan2)) + enable_beacons = true; + + ieee80211_offchannel_return(local, enable_beacons, true); + } + + ieee80211_recalc_idle(local); + + ieee80211_mlme_notify_scan_completed(local); + ieee80211_ibss_notify_scan_completed(local); + ieee80211_mesh_notify_scan_completed(local); + ieee80211_queue_work(&local->hw, &local->work_work); +} + +void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) +{ + struct ieee80211_local *local = hw_to_local(hw); + + trace_api_scan_completed(local, aborted); + + set_bit(SCAN_COMPLETED, &local->scanning); + if (aborted) + set_bit(SCAN_ABORTED, &local->scanning); + ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0); +} +EXPORT_SYMBOL(ieee80211_scan_completed); + +static int ieee80211_start_sw_scan(struct ieee80211_local *local) +{ + /* + * Hardware/driver doesn't support hw_scan, so use software + * scanning instead. First send a nullfunc frame with power save + * bit on so that AP will buffer the frames for us while we are not + * listening, then send probe requests to each channel and wait for + * the responses. 
After all channels are scanned, tune back to the + * original channel and send a nullfunc frame with power save bit + * off to trigger the AP to send us all the buffered frames. + * + * Note that while local->sw_scanning is true everything else but + * nullfunc frames and probe requests will be dropped in + * ieee80211_tx_h_check_assoc(). + */ + drv_sw_scan_start(local); + + local->leave_oper_channel_time = 0; + local->next_scan_state = SCAN_DECISION; + local->scan_channel_idx = 0; + + /* We always want to use off-channel PS, even if we + * are not really leaving oper-channel. Don't + * tell the AP though, as long as we are on-channel. + */ + ieee80211_offchannel_enable_all_ps(local, false); + + ieee80211_configure_filter(local); + + /* We need to set power level at maximum rate for scanning. */ + ieee80211_hw_config(local, 0); + + ieee80211_queue_delayed_work(&local->hw, + &local->scan_work, + IEEE80211_CHANNEL_TIME); + + return 0; +} + + +static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, + struct cfg80211_scan_request *req) +{ + struct ieee80211_local *local = sdata->local; + int rc; + + lockdep_assert_held(&local->mtx); + + if (local->scan_req) + return -EBUSY; + + if (!list_empty(&local->work_list)) { + /* wait for the work to finish/time out */ + local->scan_req = req; + local->scan_sdata = sdata; + return 0; + } + + if (local->ops->hw_scan) { + u8 *ies; + + local->hw_scan_req = kmalloc( + sizeof(*local->hw_scan_req) + + req->n_channels * sizeof(req->channels[0]) + + 2 + IEEE80211_MAX_SSID_LEN + local->scan_ies_len + + req->ie_len, GFP_KERNEL); + if (!local->hw_scan_req) + return -ENOMEM; + + local->hw_scan_req->ssids = req->ssids; + local->hw_scan_req->n_ssids = req->n_ssids; + ies = (u8 *)local->hw_scan_req + + sizeof(*local->hw_scan_req) + + req->n_channels * sizeof(req->channels[0]); + local->hw_scan_req->ie = ies; + + local->hw_scan_band = 0; + + /* + * After allocating local->hw_scan_req, we must + * go through until ieee80211_prep_hw_scan(), so + * anything that might be changed here and leave + * this function early must not go after this + * allocation. 
+ */ + } + + local->scan_req = req; + local->scan_sdata = sdata; + + if (local->ops->hw_scan) + __set_bit(SCAN_HW_SCANNING, &local->scanning); + else + __set_bit(SCAN_SW_SCANNING, &local->scanning); + + ieee80211_recalc_idle(local); + + if (local->ops->hw_scan) { + WARN_ON(!ieee80211_prep_hw_scan(local)); + rc = drv_hw_scan(local, sdata, local->hw_scan_req); + } else + rc = ieee80211_start_sw_scan(local); + + if (rc) { + kfree(local->hw_scan_req); + local->hw_scan_req = NULL; + local->scanning = 0; + + ieee80211_recalc_idle(local); + + local->scan_req = NULL; + local->scan_sdata = NULL; + } + + return rc; +} + +static unsigned long +ieee80211_scan_get_channel_time(struct ieee80211_channel *chan) +{ + /* + * TODO: channel switching also consumes quite some time, + * add that delay as well to get a better estimation + */ + if (chan->flags & IEEE80211_CHAN_PASSIVE_SCAN) + return IEEE80211_PASSIVE_CHANNEL_TIME; + return IEEE80211_PROBE_DELAY + IEEE80211_CHANNEL_TIME; +} + +static void ieee80211_scan_state_decision(struct ieee80211_local *local, + unsigned long *next_delay) +{ + bool associated = false; + bool tx_empty = true; + bool bad_latency; + bool listen_int_exceeded; + unsigned long min_beacon_int = 0; + struct ieee80211_sub_if_data *sdata; + struct ieee80211_channel *next_chan; + + /* + * check if at least one STA interface is associated, + * check if at least one STA interface has pending tx frames + * and grab the lowest used beacon interval + */ + mutex_lock(&local->iflist_mtx); + list_for_each_entry(sdata, &local->interfaces, list) { + if (!ieee80211_sdata_running(sdata)) + continue; + + if (sdata->vif.type == NL80211_IFTYPE_STATION) { + if (sdata->u.mgd.associated) { + associated = true; + + if (sdata->vif.bss_conf.beacon_int < + min_beacon_int || min_beacon_int == 0) + min_beacon_int = + sdata->vif.bss_conf.beacon_int; + + if (!qdisc_all_tx_empty(sdata->dev)) { + tx_empty = false; + break; + } + } + } + } + mutex_unlock(&local->iflist_mtx); + + next_chan = local->scan_req->channels[local->scan_channel_idx]; + + if (ieee80211_cfg_on_oper_channel(local)) { + /* We're currently on operating channel. */ + if (next_chan == local->oper_channel) + /* We don't need to move off of operating channel. */ + local->next_scan_state = SCAN_SET_CHANNEL; + else + /* + * We do need to leave operating channel, as next + * scan is somewhere else. + */ + local->next_scan_state = SCAN_LEAVE_OPER_CHANNEL; + } else { + /* + * we're currently scanning a different channel, let's + * see if we can scan another channel without interfering + * with the current traffic situation. + * + * Since we don't know if the AP has pending frames for us + * we can only check for our tx queues and use the current + * pm_qos requirements for rx. Hence, if no tx traffic occurs + * at all we will scan as many channels in a row as the pm_qos + * latency allows us to. Additionally we also check for the + * currently negotiated listen interval to prevent losing + * frames unnecessarily. + * + * Otherwise switch back to the operating channel. 
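+ *
+ * Editorial illustration of the "bad latency" check below, with
+ * made-up numbers: assume HZ == 1000, a roughly 30 ms dwell time on
+ * the next channel and a PM QoS network latency requirement of
+ * 100 ms (100000 us). Staying away from the operating channel for
+ * one more channel is then acceptable as long as
+ *
+ *   jiffies + 30 <= leave_oper_channel_time + 100
+ *
+ * i.e. roughly as long as we have been off-channel for less than
+ * 70 ms; otherwise bad_latency triggers and we go back to the
+ * operating channel first.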
+ */ + + bad_latency = time_after(jiffies + + ieee80211_scan_get_channel_time(next_chan), + local->leave_oper_channel_time + + usecs_to_jiffies(pm_qos_request(PM_QOS_NETWORK_LATENCY))); + + listen_int_exceeded = time_after(jiffies + + ieee80211_scan_get_channel_time(next_chan), + local->leave_oper_channel_time + + usecs_to_jiffies(min_beacon_int * 1024) * + local->hw.conf.listen_interval); + + if (associated && ( !tx_empty || bad_latency || + listen_int_exceeded)) + local->next_scan_state = SCAN_ENTER_OPER_CHANNEL; + else + local->next_scan_state = SCAN_SET_CHANNEL; + } + + *next_delay = 0; +} + +static void ieee80211_scan_state_leave_oper_channel(struct ieee80211_local *local, + unsigned long *next_delay) +{ + /* PS will already be in off-channel mode, + * we do that once at the beginning of scanning. + */ + ieee80211_offchannel_stop_vifs(local, false); + + /* + * What if the nullfunc frames didn't arrive? + */ + drv_flush(local, false); + if (local->ops->flush) + *next_delay = 0; + else + *next_delay = HZ / 10; + + /* remember when we left the operating channel */ + local->leave_oper_channel_time = jiffies; + + /* advance to the next channel to be scanned */ + local->next_scan_state = SCAN_SET_CHANNEL; +} + +static void ieee80211_scan_state_enter_oper_channel(struct ieee80211_local *local, + unsigned long *next_delay) +{ + /* switch back to the operating channel */ + local->scan_channel = NULL; + if (!ieee80211_cfg_on_oper_channel(local)) + ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); + + /* + * Re-enable vifs and beaconing. Leave PS + * in off-channel state..will put that back + * on-channel at the end of scanning. + */ + ieee80211_offchannel_return(local, true, false); + + *next_delay = HZ / 5; + local->next_scan_state = SCAN_DECISION; +} + +static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, + unsigned long *next_delay) +{ + int skip; + struct ieee80211_channel *chan; + + skip = 0; + chan = local->scan_req->channels[local->scan_channel_idx]; + + local->scan_channel = chan; + + /* Only call hw-config if we really need to change channels. */ + if (chan != local->hw.conf.channel) + if (ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL)) + skip = 1; + + /* advance state machine to next channel/band */ + local->scan_channel_idx++; + + if (skip) { + /* if we skip this channel return to the decision state */ + local->next_scan_state = SCAN_DECISION; + return; + } + + /* + * Probe delay is used to update the NAV, cf. 11.1.3.2.2 + * (which unfortunately doesn't say _why_ step a) is done, + * but it waits for the probe delay or until a frame is + * received - and the received frame would update the NAV). + * For now, we do not support waiting until a frame is + * received. + * + * In any case, it is not necessary for a passive scan. 
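+ *
+ * Editorial summary of the per-channel behaviour implemented below
+ * (the constants are whatever this file defines them to be):
+ *
+ *   passive channel, or no SSIDs to probe:
+ *     wait IEEE80211_PASSIVE_CHANNEL_TIME, then back to SCAN_DECISION
+ *   active channel:
+ *     wait IEEE80211_PROBE_DELAY, send the probe request(s) in
+ *     SCAN_SEND_PROBE, then listen for IEEE80211_CHANNEL_TIME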
+ */ + if (chan->flags & IEEE80211_CHAN_PASSIVE_SCAN || + !local->scan_req->n_ssids) { + *next_delay = IEEE80211_PASSIVE_CHANNEL_TIME; + local->next_scan_state = SCAN_DECISION; + return; + } + + /* active scan, send probes */ + *next_delay = IEEE80211_PROBE_DELAY; + local->next_scan_state = SCAN_SEND_PROBE; +} + +static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, + unsigned long *next_delay) +{ + int i; + struct ieee80211_sub_if_data *sdata = local->scan_sdata; + + for (i = 0; i < local->scan_req->n_ssids; i++) + ieee80211_send_probe_req( + sdata, NULL, + local->scan_req->ssids[i].ssid, + local->scan_req->ssids[i].ssid_len, + local->scan_req->ie, local->scan_req->ie_len); + + /* + * After sending probe requests, wait for probe responses + * on the channel. + */ + *next_delay = IEEE80211_CHANNEL_TIME; + local->next_scan_state = SCAN_DECISION; +} + +void ieee80211_scan_work(struct work_struct *work) +{ + struct ieee80211_local *local = + container_of(work, struct ieee80211_local, scan_work.work); + struct ieee80211_sub_if_data *sdata; + unsigned long next_delay = 0; + bool aborted, hw_scan; + + mutex_lock(&local->mtx); + + sdata = local->scan_sdata; + + if (test_and_clear_bit(SCAN_COMPLETED, &local->scanning)) { + aborted = test_and_clear_bit(SCAN_ABORTED, &local->scanning); + goto out_complete; + } + + if (!sdata || !local->scan_req) + goto out; + + if (local->scan_req && !local->scanning) { + struct cfg80211_scan_request *req = local->scan_req; + int rc; + + local->scan_req = NULL; + local->scan_sdata = NULL; + + rc = __ieee80211_start_scan(sdata, req); + if (rc) { + /* need to complete scan in cfg80211 */ + local->scan_req = req; + aborted = true; + goto out_complete; + } else + goto out; + } + + /* + * Avoid re-scheduling when the sdata is going away. 
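+ *
+ * Editorial preview of the state machine driven by the loop further
+ * below, as implemented by the ieee80211_scan_state_*() handlers in
+ * this file:
+ *
+ *   SCAN_DECISION           -> SET_CHANNEL, LEAVE_OPER_CHANNEL or
+ *                              ENTER_OPER_CHANNEL
+ *   SCAN_LEAVE_OPER_CHANNEL -> SET_CHANNEL
+ *   SCAN_SET_CHANNEL        -> SEND_PROBE (active scan) or DECISION
+ *                              (passive channel / channel skipped)
+ *   SCAN_SEND_PROBE         -> DECISION
+ *   SCAN_ENTER_OPER_CHANNEL -> DECISION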
+ */ + if (!ieee80211_sdata_running(sdata)) { + aborted = true; + goto out_complete; + } + + /* + * as long as no delay is required advance immediately + * without scheduling a new work + */ + do { + if (!ieee80211_sdata_running(sdata)) { + aborted = true; + goto out_complete; + } + + switch (local->next_scan_state) { + case SCAN_DECISION: + /* if no more bands/channels left, complete scan */ + if (local->scan_channel_idx >= local->scan_req->n_channels) { + aborted = false; + goto out_complete; + } + ieee80211_scan_state_decision(local, &next_delay); + break; + case SCAN_SET_CHANNEL: + ieee80211_scan_state_set_channel(local, &next_delay); + break; + case SCAN_SEND_PROBE: + ieee80211_scan_state_send_probe(local, &next_delay); + break; + case SCAN_LEAVE_OPER_CHANNEL: + ieee80211_scan_state_leave_oper_channel(local, &next_delay); + break; + case SCAN_ENTER_OPER_CHANNEL: + ieee80211_scan_state_enter_oper_channel(local, &next_delay); + break; + } + } while (next_delay == 0); + + ieee80211_queue_delayed_work(&local->hw, &local->scan_work, next_delay); + goto out; + +out_complete: + hw_scan = test_bit(SCAN_HW_SCANNING, &local->scanning); + __ieee80211_scan_completed(&local->hw, aborted, hw_scan); +out: + mutex_unlock(&local->mtx); +} + +int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, + struct cfg80211_scan_request *req) +{ + int res; + + mutex_lock(&sdata->local->mtx); + res = __ieee80211_start_scan(sdata, req); + mutex_unlock(&sdata->local->mtx); + + return res; +} + +int ieee80211_request_internal_scan(struct ieee80211_sub_if_data *sdata, + const u8 *ssid, u8 ssid_len, + struct ieee80211_channel *chan) +{ + struct ieee80211_local *local = sdata->local; + int ret = -EBUSY; + enum ieee80211_band band; + + mutex_lock(&local->mtx); + + /* busy scanning */ + if (local->scan_req) + goto unlock; + + /* fill internal scan request */ + if (!chan) { + int i, nchan = 0; + + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + if (!local->hw.wiphy->bands[band]) + continue; + for (i = 0; + i < local->hw.wiphy->bands[band]->n_channels; + i++) { + local->int_scan_req->channels[nchan] = + &local->hw.wiphy->bands[band]->channels[i]; + nchan++; + } + } + + local->int_scan_req->n_channels = nchan; + } else { + local->int_scan_req->channels[0] = chan; + local->int_scan_req->n_channels = 1; + } + + local->int_scan_req->ssids = &local->scan_ssid; + local->int_scan_req->n_ssids = 1; + memcpy(local->int_scan_req->ssids[0].ssid, ssid, IEEE80211_MAX_SSID_LEN); + local->int_scan_req->ssids[0].ssid_len = ssid_len; + + ret = __ieee80211_start_scan(sdata, sdata->local->int_scan_req); + unlock: + mutex_unlock(&local->mtx); + return ret; +} + +/* + * Only call this function when a scan can't be queued -- under RTNL. + */ +void ieee80211_scan_cancel(struct ieee80211_local *local) +{ + bool abortscan; + + /* + * We are only canceling software scan, or deferred scan that was not + * yet really started (see __ieee80211_start_scan ). + * + * Regarding hardware scan: + * - we can not call __ieee80211_scan_completed() as when + * SCAN_HW_SCANNING bit is set this function change + * local->hw_scan_req to operate on 5G band, what race with + * driver which can use local->hw_scan_req + * + * - we can not cancel scan_work since driver can schedule it + * by ieee80211_scan_completed(..., true) to finish scan + * + * Hence low lever driver is responsible for canceling HW scan. 
+ */ + + mutex_lock(&local->mtx); + abortscan = local->scan_req && !test_bit(SCAN_HW_SCANNING, &local->scanning); + if (abortscan) { + /* + * The scan is canceled, but stop work from being pending. + * + * If the work is currently running, it must be blocked on + * the mutex, but we'll set scan_sdata = NULL and it'll + * simply exit once it acquires the mutex. + */ + cancel_delayed_work(&local->scan_work); + /* and clean up */ + __ieee80211_scan_completed(&local->hw, true, false); + } + mutex_unlock(&local->mtx); +} + +int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, + struct cfg80211_sched_scan_request *req) +{ + struct ieee80211_local *local = sdata->local; + int ret, i; + + mutex_lock(&sdata->local->mtx); + + if (local->sched_scanning) { + ret = -EBUSY; + goto out; + } + + if (!local->ops->sched_scan_start) { + ret = -ENOTSUPP; + goto out; + } + + for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + local->sched_scan_ies.ie[i] = kzalloc(2 + + IEEE80211_MAX_SSID_LEN + + local->scan_ies_len + + req->ie_len, + GFP_KERNEL); + if (!local->sched_scan_ies.ie[i]) { + ret = -ENOMEM; + goto out_free; + } + + local->sched_scan_ies.len[i] = + ieee80211_build_preq_ies(local, + local->sched_scan_ies.ie[i], + req->ie, req->ie_len, i, + (u32) -1, 0); + } + + ret = drv_sched_scan_start(local, sdata, req, + &local->sched_scan_ies); + if (ret == 0) { + local->sched_scanning = true; + goto out; + } + +out_free: + while (i > 0) + kfree(local->sched_scan_ies.ie[--i]); +out: + mutex_unlock(&sdata->local->mtx); + return ret; +} + +int ieee80211_request_sched_scan_stop(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + int ret = 0, i; + + mutex_lock(&sdata->local->mtx); + + if (!local->ops->sched_scan_stop) { + ret = -ENOTSUPP; + goto out; + } + + if (local->sched_scanning) { + for (i = 0; i < IEEE80211_NUM_BANDS; i++) + kfree(local->sched_scan_ies.ie[i]); + + drv_sched_scan_stop(local, sdata); + local->sched_scanning = false; + } +out: + mutex_unlock(&sdata->local->mtx); + + return ret; +} + +void ieee80211_sched_scan_results(struct ieee80211_hw *hw) +{ + struct ieee80211_local *local = hw_to_local(hw); + + trace_api_sched_scan_results(local); + + cfg80211_sched_scan_results(hw->wiphy); +} +EXPORT_SYMBOL(ieee80211_sched_scan_results); + +void ieee80211_sched_scan_stopped_work(struct work_struct *work) +{ + struct ieee80211_local *local = + container_of(work, struct ieee80211_local, + sched_scan_stopped_work); + int i; + + mutex_lock(&local->mtx); + + if (!local->sched_scanning) { + mutex_unlock(&local->mtx); + return; + } + + for (i = 0; i < IEEE80211_NUM_BANDS; i++) + kfree(local->sched_scan_ies.ie[i]); + + local->sched_scanning = false; + + mutex_unlock(&local->mtx); + + cfg80211_sched_scan_stopped(local->hw.wiphy); +} + +void ieee80211_sched_scan_stopped(struct ieee80211_hw *hw) +{ + struct ieee80211_local *local = hw_to_local(hw); + + trace_api_sched_scan_stopped(local); + + ieee80211_queue_work(&local->hw, &local->sched_scan_stopped_work); +} +EXPORT_SYMBOL(ieee80211_sched_scan_stopped); diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c new file mode 100644 index 00000000..7733f66e --- /dev/null +++ b/net/mac80211/spectmgmt.c @@ -0,0 +1,86 @@ +/* + * spectrum management + * + * Copyright 2003, Jouni Malinen + * Copyright 2002-2005, Instant802 Networks, Inc. + * Copyright 2005-2006, Devicescape Software, Inc. 
+ * Copyright 2006-2007 Jiri Benc + * Copyright 2007, Michael Wu + * Copyright 2007-2008, Intel Corporation + * Copyright 2008, Johannes Berg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include "ieee80211_i.h" +#include "sta_info.h" +#include "wme.h" + +static void ieee80211_send_refuse_measurement_request(struct ieee80211_sub_if_data *sdata, + struct ieee80211_msrment_ie *request_ie, + const u8 *da, const u8 *bssid, + u8 dialog_token) +{ + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct ieee80211_mgmt *msr_report; + + skb = dev_alloc_skb(sizeof(*msr_report) + local->hw.extra_tx_headroom + + sizeof(struct ieee80211_msrment_ie)); + + if (!skb) { + printk(KERN_ERR "%s: failed to allocate buffer for " + "measurement report frame\n", sdata->name); + return; + } + + skb_reserve(skb, local->hw.extra_tx_headroom); + msr_report = (struct ieee80211_mgmt *)skb_put(skb, 24); + memset(msr_report, 0, 24); + memcpy(msr_report->da, da, ETH_ALEN); + memcpy(msr_report->sa, sdata->vif.addr, ETH_ALEN); + memcpy(msr_report->bssid, bssid, ETH_ALEN); + msr_report->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + + skb_put(skb, 1 + sizeof(msr_report->u.action.u.measurement)); + msr_report->u.action.category = WLAN_CATEGORY_SPECTRUM_MGMT; + msr_report->u.action.u.measurement.action_code = + WLAN_ACTION_SPCT_MSR_RPRT; + msr_report->u.action.u.measurement.dialog_token = dialog_token; + + msr_report->u.action.u.measurement.element_id = WLAN_EID_MEASURE_REPORT; + msr_report->u.action.u.measurement.length = + sizeof(struct ieee80211_msrment_ie); + + memset(&msr_report->u.action.u.measurement.msr_elem, 0, + sizeof(struct ieee80211_msrment_ie)); + msr_report->u.action.u.measurement.msr_elem.token = request_ie->token; + msr_report->u.action.u.measurement.msr_elem.mode |= + IEEE80211_SPCT_MSR_RPRT_MODE_REFUSED; + msr_report->u.action.u.measurement.msr_elem.type = request_ie->type; + + ieee80211_tx_skb(sdata, skb); +} + +void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, + size_t len) +{ + /* + * Ignoring measurement request is spec violation. + * Mandatory measurements must be reported optional + * measurements might be refused or reported incapable + * For now just refuse + * TODO: Answer basic measurement as unmeasured + */ + ieee80211_send_refuse_measurement_request(sdata, + &mgmt->u.action.u.measurement.msr_elem, + mgmt->sa, mgmt->bssid, + mgmt->u.action.u.measurement.dialog_token); +} diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c new file mode 100644 index 00000000..3ff633e8 --- /dev/null +++ b/net/mac80211/sta_info.c @@ -0,0 +1,1005 @@ +/* + * Copyright 2002-2005, Instant802 Networks, Inc. + * Copyright 2006-2007 Jiri Benc + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "ieee80211_i.h" +#include "driver-ops.h" +#include "rate.h" +#include "sta_info.h" +#include "debugfs_sta.h" +#include "mesh.h" + +/** + * DOC: STA information lifetime rules + * + * STA info structures (&struct sta_info) are managed in a hash table + * for faster lookup and a list for iteration. They are managed using + * RCU, i.e. access to the list and hash table is protected by RCU. + * + * Upon allocating a STA info structure with sta_info_alloc(), the caller + * owns that structure. It must then insert it into the hash table using + * either sta_info_insert() or sta_info_insert_rcu(); only in the latter + * case (which acquires an rcu read section but must not be called from + * within one) will the pointer still be valid after the call. Note that + * the caller may not do much with the STA info before inserting it, in + * particular, it may not start any mesh peer link management or add + * encryption keys. + * + * When the insertion fails (sta_info_insert()) returns non-zero), the + * structure will have been freed by sta_info_insert()! + * + * Station entries are added by mac80211 when you establish a link with a + * peer. This means different things for the different type of interfaces + * we support. For a regular station this mean we add the AP sta when we + * receive an association response from the AP. For IBSS this occurs when + * get to know about a peer on the same IBSS. For WDS we add the sta for + * the peer immediately upon device open. When using AP mode we add stations + * for each respective station upon request from userspace through nl80211. + * + * In order to remove a STA info structure, various sta_info_destroy_*() + * calls are available. + * + * There is no concept of ownership on a STA entry, each structure is + * owned by the global hash table/list until it is removed. All users of + * the structure need to be RCU protected so that the structure won't be + * freed before they are done using it. 
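+ *
+ * A rough usage sketch (editorial illustration only, error handling
+ * abbreviated; the functions below define the real rules):
+ *
+ *   sta = sta_info_alloc(sdata, addr, GFP_KERNEL);
+ *   if (!sta)
+ *     return -ENOMEM;
+ *   ... set up rates/capabilities, but do not add keys yet ...
+ *   err = sta_info_insert(sta);   (frees sta itself on failure)
+ *   if (err)
+ *     return err;
+ *
+ * and, when the link goes away, something like
+ *
+ *   sta_info_destroy_addr(sdata, addr);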
+ */ + +/* Caller must hold local->sta_lock */ +static int sta_info_hash_del(struct ieee80211_local *local, + struct sta_info *sta) +{ + struct sta_info *s; + + s = rcu_dereference_protected(local->sta_hash[STA_HASH(sta->sta.addr)], + lockdep_is_held(&local->sta_lock)); + if (!s) + return -ENOENT; + if (s == sta) { + rcu_assign_pointer(local->sta_hash[STA_HASH(sta->sta.addr)], + s->hnext); + return 0; + } + + while (rcu_access_pointer(s->hnext) && + rcu_access_pointer(s->hnext) != sta) + s = rcu_dereference_protected(s->hnext, + lockdep_is_held(&local->sta_lock)); + if (rcu_access_pointer(s->hnext)) { + rcu_assign_pointer(s->hnext, sta->hnext); + return 0; + } + + return -ENOENT; +} + +/* protected by RCU */ +struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, + const u8 *addr) +{ + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + + sta = rcu_dereference_check(local->sta_hash[STA_HASH(addr)], + rcu_read_lock_held() || + lockdep_is_held(&local->sta_lock) || + lockdep_is_held(&local->sta_mtx)); + while (sta) { + if (sta->sdata == sdata && + memcmp(sta->sta.addr, addr, ETH_ALEN) == 0) + break; + sta = rcu_dereference_check(sta->hnext, + rcu_read_lock_held() || + lockdep_is_held(&local->sta_lock) || + lockdep_is_held(&local->sta_mtx)); + } + return sta; +} + +/* + * Get sta info either from the specified interface + * or from one of its vlans + */ +struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, + const u8 *addr) +{ + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + + sta = rcu_dereference_check(local->sta_hash[STA_HASH(addr)], + rcu_read_lock_held() || + lockdep_is_held(&local->sta_lock) || + lockdep_is_held(&local->sta_mtx)); + while (sta) { + if ((sta->sdata == sdata || + (sta->sdata->bss && sta->sdata->bss == sdata->bss)) && + memcmp(sta->sta.addr, addr, ETH_ALEN) == 0) + break; + sta = rcu_dereference_check(sta->hnext, + rcu_read_lock_held() || + lockdep_is_held(&local->sta_lock) || + lockdep_is_held(&local->sta_mtx)); + } + return sta; +} + +struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata, + int idx) +{ + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + int i = 0; + + list_for_each_entry_rcu(sta, &local->sta_list, list) { + if (sdata != sta->sdata) + continue; + if (i < idx) { + ++i; + continue; + } + return sta; + } + + return NULL; +} + +/** + * __sta_info_free - internal STA free helper + * + * @local: pointer to the global information + * @sta: STA info to free + * + * This function must undo everything done by sta_info_alloc() + * that may happen before sta_info_insert(). 
+ */ +static void __sta_info_free(struct ieee80211_local *local, + struct sta_info *sta) +{ + if (sta->rate_ctrl) { + rate_control_free_sta(sta); + rate_control_put(sta->rate_ctrl); + } + +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + wiphy_debug(local->hw.wiphy, "Destroyed STA %pM\n", sta->sta.addr); +#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ + + kfree(sta); +} + +/* Caller must hold local->sta_lock */ +static void sta_info_hash_add(struct ieee80211_local *local, + struct sta_info *sta) +{ + sta->hnext = local->sta_hash[STA_HASH(sta->sta.addr)]; + rcu_assign_pointer(local->sta_hash[STA_HASH(sta->sta.addr)], sta); +} + +static void sta_unblock(struct work_struct *wk) +{ + struct sta_info *sta; + + sta = container_of(wk, struct sta_info, drv_unblock_wk); + + if (sta->dead) + return; + + if (!test_sta_flags(sta, WLAN_STA_PS_STA)) + ieee80211_sta_ps_deliver_wakeup(sta); + else if (test_and_clear_sta_flags(sta, WLAN_STA_PSPOLL)) { + clear_sta_flags(sta, WLAN_STA_PS_DRIVER); + ieee80211_sta_ps_deliver_poll_response(sta); + } else + clear_sta_flags(sta, WLAN_STA_PS_DRIVER); +} + +static int sta_prepare_rate_control(struct ieee80211_local *local, + struct sta_info *sta, gfp_t gfp) +{ + if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) + return 0; + + sta->rate_ctrl = rate_control_get(local->rate_ctrl); + sta->rate_ctrl_priv = rate_control_alloc_sta(sta->rate_ctrl, + &sta->sta, gfp); + if (!sta->rate_ctrl_priv) { + rate_control_put(sta->rate_ctrl); + return -ENOMEM; + } + + return 0; +} + +struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, + u8 *addr, gfp_t gfp) +{ + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + struct timespec uptime; + int i; + + sta = kzalloc(sizeof(*sta) + local->hw.sta_data_size, gfp); + if (!sta) + return NULL; + + spin_lock_init(&sta->lock); + spin_lock_init(&sta->flaglock); + INIT_WORK(&sta->drv_unblock_wk, sta_unblock); + INIT_WORK(&sta->ampdu_mlme.work, ieee80211_ba_session_work); + mutex_init(&sta->ampdu_mlme.mtx); + + memcpy(sta->sta.addr, addr, ETH_ALEN); + sta->local = local; + sta->sdata = sdata; + sta->last_rx = jiffies; + + do_posix_clock_monotonic_gettime(&uptime); + sta->last_connected = uptime.tv_sec; + ewma_init(&sta->avg_signal, 1024, 8); + + if (sta_prepare_rate_control(local, sta, gfp)) { + kfree(sta); + return NULL; + } + + for (i = 0; i < STA_TID_NUM; i++) { + /* + * timer_to_tid must be initialized with identity mapping + * to enable session_timer's data differentiation. See + * sta_rx_agg_session_timer_expired for usage. 
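+ *
+ * (Editorial note, based on the aggregation code that is not shown
+ * here: the session timer is armed with
+ * (unsigned long)&sta->timer_to_tid[tid] as its data, so the expiry
+ * handler can read the TID back from the stored byte value and
+ * recover the sta_info via pointer arithmetic and container_of().)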
+ */ + sta->timer_to_tid[i] = i; + } + skb_queue_head_init(&sta->ps_tx_buf); + skb_queue_head_init(&sta->tx_filtered); + + for (i = 0; i < NUM_RX_DATA_QUEUES; i++) + sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX); + +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + wiphy_debug(local->hw.wiphy, "Allocated STA %pM\n", sta->sta.addr); +#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ + +#ifdef CONFIG_MAC80211_MESH + sta->plink_state = NL80211_PLINK_LISTEN; + init_timer(&sta->plink_timer); +#endif + + return sta; +} + +static int sta_info_finish_insert(struct sta_info *sta, bool async) +{ + struct ieee80211_local *local = sta->local; + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct station_info sinfo; + unsigned long flags; + int err = 0; + + lockdep_assert_held(&local->sta_mtx); + + /* notify driver */ + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + sdata = container_of(sdata->bss, + struct ieee80211_sub_if_data, + u.ap); + err = drv_sta_add(local, sdata, &sta->sta); + if (err) { + if (!async) + return err; + printk(KERN_DEBUG "%s: failed to add IBSS STA %pM to driver (%d)" + " - keeping it anyway.\n", + sdata->name, sta->sta.addr, err); + } else { + sta->uploaded = true; +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + if (async) + wiphy_debug(local->hw.wiphy, + "Finished adding IBSS STA %pM\n", + sta->sta.addr); +#endif + } + + sdata = sta->sdata; + + if (!async) { + local->num_sta++; + local->sta_generation++; + smp_mb(); + + /* make the station visible */ + spin_lock_irqsave(&local->sta_lock, flags); + sta_info_hash_add(local, sta); + spin_unlock_irqrestore(&local->sta_lock, flags); + } + + list_add(&sta->list, &local->sta_list); + + ieee80211_sta_debugfs_add(sta); + rate_control_add_sta_debugfs(sta); + + memset(&sinfo, 0, sizeof(sinfo)); + sinfo.filled = 0; + sinfo.generation = local->sta_generation; + cfg80211_new_sta(sdata->dev, sta->sta.addr, &sinfo, GFP_KERNEL); + + + return 0; +} + +static void sta_info_finish_pending(struct ieee80211_local *local) +{ + struct sta_info *sta; + unsigned long flags; + + spin_lock_irqsave(&local->sta_lock, flags); + while (!list_empty(&local->sta_pending_list)) { + sta = list_first_entry(&local->sta_pending_list, + struct sta_info, list); + list_del(&sta->list); + spin_unlock_irqrestore(&local->sta_lock, flags); + + sta_info_finish_insert(sta, true); + + spin_lock_irqsave(&local->sta_lock, flags); + } + spin_unlock_irqrestore(&local->sta_lock, flags); +} + +static void sta_info_finish_work(struct work_struct *work) +{ + struct ieee80211_local *local = + container_of(work, struct ieee80211_local, sta_finish_work); + + mutex_lock(&local->sta_mtx); + sta_info_finish_pending(local); + mutex_unlock(&local->sta_mtx); +} + +int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU) +{ + struct ieee80211_local *local = sta->local; + struct ieee80211_sub_if_data *sdata = sta->sdata; + unsigned long flags; + int err = 0; + + /* + * Can't be a WARN_ON because it can be triggered through a race: + * something inserts a STA (on one CPU) without holding the RTNL + * and another CPU turns off the net device. + */ + if (unlikely(!ieee80211_sdata_running(sdata))) { + err = -ENETDOWN; + rcu_read_lock(); + goto out_free; + } + + if (WARN_ON(compare_ether_addr(sta->sta.addr, sdata->vif.addr) == 0 || + is_multicast_ether_addr(sta->sta.addr))) { + err = -EINVAL; + rcu_read_lock(); + goto out_free; + } + + /* + * In ad-hoc mode, we sometimes need to insert stations + * from tasklet context from the RX path. To avoid races, + * always do so in that case -- see the comment below. 
+ */ + if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { + spin_lock_irqsave(&local->sta_lock, flags); + /* check if STA exists already */ + if (sta_info_get_bss(sdata, sta->sta.addr)) { + spin_unlock_irqrestore(&local->sta_lock, flags); + rcu_read_lock(); + err = -EEXIST; + goto out_free; + } + + local->num_sta++; + local->sta_generation++; + smp_mb(); + sta_info_hash_add(local, sta); + + list_add_tail(&sta->list, &local->sta_pending_list); + + rcu_read_lock(); + spin_unlock_irqrestore(&local->sta_lock, flags); + +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + wiphy_debug(local->hw.wiphy, "Added IBSS STA %pM\n", + sta->sta.addr); +#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ + + ieee80211_queue_work(&local->hw, &local->sta_finish_work); + + return 0; + } + + /* + * On first glance, this will look racy, because the code + * below this point, which inserts a station with sleeping, + * unlocks the sta_lock between checking existence in the + * hash table and inserting into it. + * + * However, it is not racy against itself because it keeps + * the mutex locked. It still seems to race against the + * above code that atomically inserts the station... That, + * however, is not true because the above code can only + * be invoked for IBSS interfaces, and the below code will + * not be -- and the two do not race against each other as + * the hash table also keys off the interface. + */ + + might_sleep(); + + mutex_lock(&local->sta_mtx); + + spin_lock_irqsave(&local->sta_lock, flags); + /* check if STA exists already */ + if (sta_info_get_bss(sdata, sta->sta.addr)) { + spin_unlock_irqrestore(&local->sta_lock, flags); + mutex_unlock(&local->sta_mtx); + rcu_read_lock(); + err = -EEXIST; + goto out_free; + } + + spin_unlock_irqrestore(&local->sta_lock, flags); + + err = sta_info_finish_insert(sta, false); + if (err) { + mutex_unlock(&local->sta_mtx); + rcu_read_lock(); + goto out_free; + } + +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + wiphy_debug(local->hw.wiphy, "Inserted STA %pM\n", sta->sta.addr); +#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ + + /* move reference to rcu-protected */ + rcu_read_lock(); + mutex_unlock(&local->sta_mtx); + + if (ieee80211_vif_is_mesh(&sdata->vif)) + mesh_accept_plinks_update(sdata); + + return 0; + out_free: + BUG_ON(!err); + __sta_info_free(local, sta); + return err; +} + +int sta_info_insert(struct sta_info *sta) +{ + int err = sta_info_insert_rcu(sta); + + rcu_read_unlock(); + + return err; +} + +static inline void __bss_tim_set(struct ieee80211_if_ap *bss, u16 aid) +{ + /* + * This format has been mandated by the IEEE specifications, + * so this line may not be changed to use the __set_bit() format. + */ + bss->tim[aid / 8] |= (1 << (aid % 8)); +} + +static inline void __bss_tim_clear(struct ieee80211_if_ap *bss, u16 aid) +{ + /* + * This format has been mandated by the IEEE specifications, + * so this line may not be changed to use the __clear_bit() format. 
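+ *
+ * Worked example (editorial): for aid == 10 the bit lives in
+ * tim[10 / 8] == tim[1] under mask (1 << (10 % 8)) == 0x04, so the
+ * lowest AID within each octet occupies that octet's least
+ * significant bit, as the TIM partial virtual bitmap requires.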
+ */ + bss->tim[aid / 8] &= ~(1 << (aid % 8)); +} + +static void __sta_info_set_tim_bit(struct ieee80211_if_ap *bss, + struct sta_info *sta) +{ + BUG_ON(!bss); + + __bss_tim_set(bss, sta->sta.aid); + + if (sta->local->ops->set_tim) { + sta->local->tim_in_locked_section = true; + drv_set_tim(sta->local, &sta->sta, true); + sta->local->tim_in_locked_section = false; + } +} + +void sta_info_set_tim_bit(struct sta_info *sta) +{ + unsigned long flags; + + BUG_ON(!sta->sdata->bss); + + spin_lock_irqsave(&sta->local->sta_lock, flags); + __sta_info_set_tim_bit(sta->sdata->bss, sta); + spin_unlock_irqrestore(&sta->local->sta_lock, flags); +} + +static void __sta_info_clear_tim_bit(struct ieee80211_if_ap *bss, + struct sta_info *sta) +{ + BUG_ON(!bss); + + __bss_tim_clear(bss, sta->sta.aid); + + if (sta->local->ops->set_tim) { + sta->local->tim_in_locked_section = true; + drv_set_tim(sta->local, &sta->sta, false); + sta->local->tim_in_locked_section = false; + } +} + +void sta_info_clear_tim_bit(struct sta_info *sta) +{ + unsigned long flags; + + BUG_ON(!sta->sdata->bss); + + spin_lock_irqsave(&sta->local->sta_lock, flags); + __sta_info_clear_tim_bit(sta->sdata->bss, sta); + spin_unlock_irqrestore(&sta->local->sta_lock, flags); +} + +static int sta_info_buffer_expired(struct sta_info *sta, + struct sk_buff *skb) +{ + struct ieee80211_tx_info *info; + int timeout; + + if (!skb) + return 0; + + info = IEEE80211_SKB_CB(skb); + + /* Timeout: (2 * listen_interval * beacon_int * 1024 / 1000000) sec */ + timeout = (sta->listen_interval * + sta->sdata->vif.bss_conf.beacon_int * + 32 / 15625) * HZ; + if (timeout < STA_TX_BUFFER_EXPIRE) + timeout = STA_TX_BUFFER_EXPIRE; + return time_after(jiffies, info->control.jiffies + timeout); +} + + +static bool sta_info_cleanup_expire_buffered(struct ieee80211_local *local, + struct sta_info *sta) +{ + unsigned long flags; + struct sk_buff *skb; + + if (skb_queue_empty(&sta->ps_tx_buf)) + return false; + + for (;;) { + spin_lock_irqsave(&sta->ps_tx_buf.lock, flags); + skb = skb_peek(&sta->ps_tx_buf); + if (sta_info_buffer_expired(sta, skb)) + skb = __skb_dequeue(&sta->ps_tx_buf); + else + skb = NULL; + spin_unlock_irqrestore(&sta->ps_tx_buf.lock, flags); + + if (!skb) + break; + + local->total_ps_buffered--; +#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG + printk(KERN_DEBUG "Buffered frame expired (STA %pM)\n", + sta->sta.addr); +#endif + dev_kfree_skb(skb); + + if (skb_queue_empty(&sta->ps_tx_buf) && + !test_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF)) + sta_info_clear_tim_bit(sta); + } + + return true; +} + +static int __must_check __sta_info_destroy(struct sta_info *sta) +{ + struct ieee80211_local *local; + struct ieee80211_sub_if_data *sdata; + struct sk_buff *skb; + unsigned long flags; + int ret, i; + + might_sleep(); + + if (!sta) + return -ENOENT; + + local = sta->local; + sdata = sta->sdata; + + /* + * Before removing the station from the driver and + * rate control, it might still start new aggregation + * sessions -- block that to make sure the tear-down + * will be sufficient. + */ + set_sta_flags(sta, WLAN_STA_BLOCK_BA); + ieee80211_sta_tear_down_BA_sessions(sta, true); + + spin_lock_irqsave(&local->sta_lock, flags); + ret = sta_info_hash_del(local, sta); + /* this might still be the pending list ... 
which is fine */ + if (!ret) + list_del(&sta->list); + spin_unlock_irqrestore(&local->sta_lock, flags); + if (ret) + return ret; + + mutex_lock(&local->key_mtx); + for (i = 0; i < NUM_DEFAULT_KEYS; i++) + __ieee80211_key_free(key_mtx_dereference(local, sta->gtk[i])); + if (sta->ptk) + __ieee80211_key_free(key_mtx_dereference(local, sta->ptk)); + mutex_unlock(&local->key_mtx); + + sta->dead = true; + + if (test_and_clear_sta_flags(sta, + WLAN_STA_PS_STA | WLAN_STA_PS_DRIVER)) { + BUG_ON(!sdata->bss); + + atomic_dec(&sdata->bss->num_sta_ps); + sta_info_clear_tim_bit(sta); + } + + local->num_sta--; + local->sta_generation++; + + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + rcu_assign_pointer(sdata->u.vlan.sta, NULL); + + if (sta->uploaded) { + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + sdata = container_of(sdata->bss, + struct ieee80211_sub_if_data, + u.ap); + drv_sta_remove(local, sdata, &sta->sta); + sdata = sta->sdata; + } + + /* + * At this point, after we wait for an RCU grace period, + * neither mac80211 nor the driver can reference this + * sta struct any more except by still existing timers + * associated with this station that we clean up below. + */ + synchronize_rcu(); + +#ifdef CONFIG_MAC80211_MESH + if (ieee80211_vif_is_mesh(&sdata->vif)) + mesh_accept_plinks_update(sdata); +#endif + +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + wiphy_debug(local->hw.wiphy, "Removed STA %pM\n", sta->sta.addr); +#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ + cancel_work_sync(&sta->drv_unblock_wk); + + cfg80211_del_sta(sdata->dev, sta->sta.addr, GFP_KERNEL); + + rate_control_remove_sta_debugfs(sta); + ieee80211_sta_debugfs_remove(sta); + +#ifdef CONFIG_MAC80211_MESH + if (ieee80211_vif_is_mesh(&sta->sdata->vif)) { + mesh_plink_deactivate(sta); + del_timer_sync(&sta->plink_timer); + } +#endif + + while ((skb = skb_dequeue(&sta->ps_tx_buf)) != NULL) { + local->total_ps_buffered--; + dev_kfree_skb_any(skb); + } + + while ((skb = skb_dequeue(&sta->tx_filtered)) != NULL) + dev_kfree_skb_any(skb); + + __sta_info_free(local, sta); + + return 0; +} + +int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr) +{ + struct sta_info *sta; + int ret; + + mutex_lock(&sdata->local->sta_mtx); + sta = sta_info_get(sdata, addr); + ret = __sta_info_destroy(sta); + mutex_unlock(&sdata->local->sta_mtx); + + return ret; +} + +int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata, + const u8 *addr) +{ + struct sta_info *sta; + int ret; + + mutex_lock(&sdata->local->sta_mtx); + sta = sta_info_get_bss(sdata, addr); + ret = __sta_info_destroy(sta); + mutex_unlock(&sdata->local->sta_mtx); + + return ret; +} + +static void sta_info_cleanup(unsigned long data) +{ + struct ieee80211_local *local = (struct ieee80211_local *) data; + struct sta_info *sta; + bool timer_needed = false; + + rcu_read_lock(); + list_for_each_entry_rcu(sta, &local->sta_list, list) + if (sta_info_cleanup_expire_buffered(local, sta)) + timer_needed = true; + rcu_read_unlock(); + + if (local->quiescing) + return; + + if (!timer_needed) + return; + + mod_timer(&local->sta_cleanup, + round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); +} + +void sta_info_init(struct ieee80211_local *local) +{ + spin_lock_init(&local->sta_lock); + mutex_init(&local->sta_mtx); + INIT_LIST_HEAD(&local->sta_list); + INIT_LIST_HEAD(&local->sta_pending_list); + INIT_WORK(&local->sta_finish_work, sta_info_finish_work); + + setup_timer(&local->sta_cleanup, sta_info_cleanup, + (unsigned long)local); +} + +void sta_info_stop(struct ieee80211_local 
*local) +{ + del_timer(&local->sta_cleanup); + sta_info_flush(local, NULL); +} + +/** + * sta_info_flush - flush matching STA entries from the STA table + * + * Returns the number of removed STA entries. + * + * @local: local interface data + * @sdata: matching rule for the net device (sta->dev) or %NULL to match all STAs + */ +int sta_info_flush(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + struct sta_info *sta, *tmp; + int ret = 0; + + might_sleep(); + + mutex_lock(&local->sta_mtx); + + sta_info_finish_pending(local); + + list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { + if (!sdata || sdata == sta->sdata) + WARN_ON(__sta_info_destroy(sta)); + } + mutex_unlock(&local->sta_mtx); + + return ret; +} + +void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, + unsigned long exp_time) +{ + struct ieee80211_local *local = sdata->local; + struct sta_info *sta, *tmp; + + mutex_lock(&local->sta_mtx); + list_for_each_entry_safe(sta, tmp, &local->sta_list, list) + if (time_after(jiffies, sta->last_rx + exp_time)) { +#ifdef CONFIG_MAC80211_IBSS_DEBUG + printk(KERN_DEBUG "%s: expiring inactive STA %pM\n", + sdata->name, sta->sta.addr); +#endif + WARN_ON(__sta_info_destroy(sta)); + } + mutex_unlock(&local->sta_mtx); +} + +struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw, + const u8 *addr, + const u8 *localaddr) +{ + struct sta_info *sta, *nxt; + + /* + * Just return a random station if localaddr is NULL + * ... first in list. + */ + for_each_sta_info(hw_to_local(hw), addr, sta, nxt) { + if (localaddr && + compare_ether_addr(sta->sdata->vif.addr, localaddr) != 0) + continue; + if (!sta->uploaded) + return NULL; + return &sta->sta; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_ifaddr); + +struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif, + const u8 *addr) +{ + struct sta_info *sta; + + if (!vif) + return NULL; + + sta = sta_info_get_bss(vif_to_sdata(vif), addr); + if (!sta) + return NULL; + + if (!sta->uploaded) + return NULL; + + return &sta->sta; +} +EXPORT_SYMBOL(ieee80211_find_sta); + +static void clear_sta_ps_flags(void *_sta) +{ + struct sta_info *sta = _sta; + + clear_sta_flags(sta, WLAN_STA_PS_DRIVER | WLAN_STA_PS_STA); +} + +/* powersave support code */ +void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_local *local = sdata->local; + int sent, buffered; + + clear_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF); + if (!(local->hw.flags & IEEE80211_HW_AP_LINK_PS)) + drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta); + + if (!skb_queue_empty(&sta->ps_tx_buf)) + sta_info_clear_tim_bit(sta); + + /* Send all buffered frames to the station */ + sent = ieee80211_add_pending_skbs(local, &sta->tx_filtered); + buffered = ieee80211_add_pending_skbs_fn(local, &sta->ps_tx_buf, + clear_sta_ps_flags, sta); + sent += buffered; + local->total_ps_buffered -= buffered; + +#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG + printk(KERN_DEBUG "%s: STA %pM aid %d sending %d filtered/%d PS frames " + "since STA not sleeping anymore\n", sdata->name, + sta->sta.addr, sta->sta.aid, sent - buffered, buffered); +#endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */ +} + +void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + int no_pending_pkts; + + skb = skb_dequeue(&sta->tx_filtered); + if (!skb) { + skb = 
skb_dequeue(&sta->ps_tx_buf); + if (skb) + local->total_ps_buffered--; + } + no_pending_pkts = skb_queue_empty(&sta->tx_filtered) && + skb_queue_empty(&sta->ps_tx_buf); + + if (skb) { + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_hdr *hdr = + (struct ieee80211_hdr *) skb->data; + + /* + * Tell TX path to send this frame even though the STA may + * still remain is PS mode after this frame exchange. + */ + info->flags |= IEEE80211_TX_CTL_PSPOLL_RESPONSE; + +#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG + printk(KERN_DEBUG "STA %pM aid %d: PS Poll (entries after %d)\n", + sta->sta.addr, sta->sta.aid, + skb_queue_len(&sta->ps_tx_buf)); +#endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */ + + /* Use MoreData flag to indicate whether there are more + * buffered frames for this STA */ + if (no_pending_pkts) + hdr->frame_control &= cpu_to_le16(~IEEE80211_FCTL_MOREDATA); + else + hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); + + ieee80211_add_pending_skb(local, skb); + + if (no_pending_pkts) + sta_info_clear_tim_bit(sta); +#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG + } else { + /* + * FIXME: This can be the result of a race condition between + * us expiring a frame and the station polling for it. + * Should we send it a null-func frame indicating we + * have nothing buffered for it? + */ + printk(KERN_DEBUG "%s: STA %pM sent PS Poll even " + "though there are no buffered frames for it\n", + sdata->name, sta->sta.addr); +#endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */ + } +} + +void ieee80211_sta_block_awake(struct ieee80211_hw *hw, + struct ieee80211_sta *pubsta, bool block) +{ + struct sta_info *sta = container_of(pubsta, struct sta_info, sta); + + trace_api_sta_block_awake(sta->local, pubsta, block); + + if (block) + set_sta_flags(sta, WLAN_STA_PS_DRIVER); + else if (test_sta_flags(sta, WLAN_STA_PS_DRIVER)) + ieee80211_queue_work(hw, &sta->drv_unblock_wk); +} +EXPORT_SYMBOL(ieee80211_sta_block_awake); + +void ieee80211_sta_set_tim(struct ieee80211_sta *pubsta) +{ + struct sta_info *sta = container_of(pubsta, struct sta_info, sta); + + set_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF); + sta_info_set_tim_bit(sta); +} +EXPORT_SYMBOL(ieee80211_sta_set_tim); diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h new file mode 100644 index 00000000..c6ae8718 --- /dev/null +++ b/net/mac80211/sta_info.h @@ -0,0 +1,502 @@ +/* + * Copyright 2002-2005, Devicescape Software, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef STA_INFO_H +#define STA_INFO_H + +#include +#include +#include +#include +#include +#include "key.h" + +/** + * enum ieee80211_sta_info_flags - Stations flags + * + * These flags are used with &struct sta_info's @flags member. + * + * @WLAN_STA_AUTH: Station is authenticated. + * @WLAN_STA_ASSOC: Station is associated. + * @WLAN_STA_PS_STA: Station is in power-save mode + * @WLAN_STA_AUTHORIZED: Station is authorized to send/receive traffic. + * This bit is always checked so needs to be enabled for all stations + * when virtual port control is not in use. + * @WLAN_STA_SHORT_PREAMBLE: Station is capable of receiving short-preamble + * frames. + * @WLAN_STA_ASSOC_AP: We're associated to that station, it is an AP. + * @WLAN_STA_WME: Station is a QoS-STA. + * @WLAN_STA_WDS: Station is one of our WDS peers. 
+ * @WLAN_STA_CLEAR_PS_FILT: Clear PS filter in hardware (using the + * IEEE80211_TX_CTL_CLEAR_PS_FILT control flag) when the next + * frame to this station is transmitted. + * @WLAN_STA_MFP: Management frame protection is used with this STA. + * @WLAN_STA_BLOCK_BA: Used to deny ADDBA requests (both TX and RX) + * during suspend/resume and station removal. + * @WLAN_STA_PS_DRIVER: driver requires keeping this station in + * power-save mode logically to flush frames that might still + * be in the queues + * @WLAN_STA_PSPOLL: Station sent PS-poll while driver was keeping + * station in power-save mode, reply when the driver unblocks. + * @WLAN_STA_PS_DRIVER_BUF: Station has frames pending in driver internal + * buffers. Automatically cleared on station wake-up. + */ +enum ieee80211_sta_info_flags { + WLAN_STA_AUTH = 1<<0, + WLAN_STA_ASSOC = 1<<1, + WLAN_STA_PS_STA = 1<<2, + WLAN_STA_AUTHORIZED = 1<<3, + WLAN_STA_SHORT_PREAMBLE = 1<<4, + WLAN_STA_ASSOC_AP = 1<<5, + WLAN_STA_WME = 1<<6, + WLAN_STA_WDS = 1<<7, + WLAN_STA_CLEAR_PS_FILT = 1<<9, + WLAN_STA_MFP = 1<<10, + WLAN_STA_BLOCK_BA = 1<<11, + WLAN_STA_PS_DRIVER = 1<<12, + WLAN_STA_PSPOLL = 1<<13, + WLAN_STA_PS_DRIVER_BUF = 1<<14, +}; + +#define STA_TID_NUM 16 +#define ADDBA_RESP_INTERVAL HZ +#define HT_AGG_MAX_RETRIES 0x3 + +#define HT_AGG_STATE_DRV_READY 0 +#define HT_AGG_STATE_RESPONSE_RECEIVED 1 +#define HT_AGG_STATE_OPERATIONAL 2 +#define HT_AGG_STATE_STOPPING 3 +#define HT_AGG_STATE_WANT_START 4 +#define HT_AGG_STATE_WANT_STOP 5 + +/** + * struct tid_ampdu_tx - TID aggregation information (Tx). + * + * @rcu_head: rcu head for freeing structure + * @addba_resp_timer: timer for peer's response to addba request + * @pending: pending frames queue -- use sta's spinlock to protect + * @dialog_token: dialog token for aggregation session + * @timeout: session timeout value to be filled in ADDBA requests + * @state: session state (see above) + * @stop_initiator: initiator of a session stop + * @tx_stop: TX DelBA frame when stopping + * @buf_size: reorder buffer size at receiver + * + * This structure's lifetime is managed by RCU, assignments to + * the array holding it must hold the aggregation mutex. + * + * The TX path can access it under RCU lock-free if, and + * only if, the state has the flag %HT_AGG_STATE_OPERATIONAL + * set. Otherwise, the TX path must also acquire the spinlock + * and re-check the state, see comments in the tx code + * touching it. + */ +struct tid_ampdu_tx { + struct rcu_head rcu_head; + struct timer_list addba_resp_timer; + struct sk_buff_head pending; + unsigned long state; + u16 timeout; + u8 dialog_token; + u8 stop_initiator; + bool tx_stop; + u8 buf_size; +}; + +/** + * struct tid_ampdu_rx - TID aggregation information (Rx). + * + * @reorder_buf: buffer to reorder incoming aggregated MPDUs + * @reorder_time: jiffies when skb was added + * @session_timer: check if peer keeps Tx-ing on the TID (by timeout value) + * @reorder_timer: releases expired frames from the reorder buffer. + * @head_seq_num: head sequence number in reordering buffer. + * @stored_mpdu_num: number of MPDUs in reordering buffer + * @ssn: Starting Sequence Number expected to be aggregated. + * @buf_size: buffer size for incoming A-MPDUs + * @timeout: reset timer value (in TUs). + * @dialog_token: dialog token for aggregation session + * @rcu_head: RCU head used for freeing this struct + * @reorder_lock: serializes access to reorder buffer, see below. 
+ * + * This structure's lifetime is managed by RCU, assignments to + * the array holding it must hold the aggregation mutex. + * + * The @reorder_lock is used to protect the members of this + * struct, except for @timeout, @buf_size and @dialog_token, + * which are constant across the lifetime of the struct (the + * dialog token being used only for debugging). + */ +struct tid_ampdu_rx { + struct rcu_head rcu_head; + spinlock_t reorder_lock; + struct sk_buff **reorder_buf; + unsigned long *reorder_time; + struct timer_list session_timer; + struct timer_list reorder_timer; + u16 head_seq_num; + u16 stored_mpdu_num; + u16 ssn; + u16 buf_size; + u16 timeout; + u8 dialog_token; +}; + +/** + * struct sta_ampdu_mlme - STA aggregation information. + * + * @tid_rx: aggregation info for Rx per TID -- RCU protected + * @tid_tx: aggregation info for Tx per TID + * @tid_start_tx: sessions where start was requested + * @addba_req_num: number of times addBA request has been sent. + * @dialog_token_allocator: dialog token enumerator for each new session; + * @work: work struct for starting/stopping aggregation + * @tid_rx_timer_expired: bitmap indicating on which TIDs the + * RX timer expired until the work for it runs + * @mtx: mutex to protect all TX data (except non-NULL assignments + * to tid_tx[idx], which are protected by the sta spinlock) + */ +struct sta_ampdu_mlme { + struct mutex mtx; + /* rx */ + struct tid_ampdu_rx __rcu *tid_rx[STA_TID_NUM]; + unsigned long tid_rx_timer_expired[BITS_TO_LONGS(STA_TID_NUM)]; + /* tx */ + struct work_struct work; + struct tid_ampdu_tx __rcu *tid_tx[STA_TID_NUM]; + struct tid_ampdu_tx *tid_start_tx[STA_TID_NUM]; + u8 addba_req_num[STA_TID_NUM]; + u8 dialog_token_allocator; +}; + + +/** + * struct sta_info - STA information + * + * This structure collects information about a station that + * mac80211 is communicating with. + * + * @list: global linked list entry + * @hnext: hash table linked list pointer + * @local: pointer to the global information + * @sdata: virtual interface this station belongs to + * @ptk: peer key negotiated with this station, if any + * @gtk: group keys negotiated with this station, if any + * @rate_ctrl: rate control algorithm reference + * @rate_ctrl_priv: rate control private per-STA pointer + * @last_tx_rate: rate used for last transmit, to report to userspace as + * "the" transmit rate + * @last_rx_rate_idx: rx status rate index of the last data packet + * @last_rx_rate_flag: rx status flag of the last data packet + * @lock: used for locking all fields that require locking, see comments + * in the header file. 
+ * @flaglock: spinlock for flags accesses + * @drv_unblock_wk: used for driver PS unblocking + * @listen_interval: listen interval of this station, when we're acting as AP + * @flags: STA flags, see &enum ieee80211_sta_info_flags + * @ps_tx_buf: buffer of frames to transmit to this station + * when it leaves power saving state + * @tx_filtered: buffer of frames we already tried to transmit + * but were filtered by hardware due to STA having entered + * power saving state + * @rx_packets: Number of MSDUs received from this STA + * @rx_bytes: Number of bytes received from this STA + * @wep_weak_iv_count: number of weak WEP IVs received from this station + * @last_rx: time (in jiffies) when last frame was received from this STA + * @last_connected: time (in seconds) when a station got connected + * @num_duplicates: number of duplicate frames received from this STA + * @rx_fragments: number of received MPDUs + * @rx_dropped: number of dropped MPDUs from this STA + * @last_signal: signal of last received frame from this STA + * @avg_signal: moving average of signal of received frames from this STA + * @last_seq_ctrl: last received seq/frag number from this STA (per RX queue) + * @tx_filtered_count: number of frames the hardware filtered for this STA + * @tx_retry_failed: number of frames that failed retry + * @tx_retry_count: total number of retries for frames to this STA + * @fail_avg: moving percentage of failed MSDUs + * @tx_packets: number of RX/TX MSDUs + * @tx_bytes: number of bytes transmitted to this STA + * @tx_fragments: number of transmitted MPDUs + * @tid_seq: per-TID sequence numbers for sending to this STA + * @ampdu_mlme: A-MPDU state machine state + * @timer_to_tid: identity mapping to ID timers + * @llid: Local link ID + * @plid: Peer link ID + * @reason: Cancel reason on PLINK_HOLDING state + * @plink_retries: Retries in establishment + * @ignore_plink_timer: ignore the peer-link timer (used internally) + * @plink_state: peer link state + * @plink_timeout: timeout of peer link + * @plink_timer: peer link watch timer + * @plink_timer_was_running: used by suspend/resume to restore timers + * @debugfs: debug filesystem info + * @sta: station information we share with the driver + * @dead: set to true when sta is unlinked + * @uploaded: set to true when sta is uploaded to the driver + * @lost_packets: number of consecutive lost packets + */ +struct sta_info { + /* General information, mostly static */ + struct list_head list; + struct sta_info __rcu *hnext; + struct ieee80211_local *local; + struct ieee80211_sub_if_data *sdata; + struct ieee80211_key __rcu *gtk[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS]; + struct ieee80211_key __rcu *ptk; + struct rate_control_ref *rate_ctrl; + void *rate_ctrl_priv; + spinlock_t lock; + spinlock_t flaglock; + + struct work_struct drv_unblock_wk; + + u16 listen_interval; + + bool dead; + + bool uploaded; + + /* + * frequently updated, locked with own spinlock (flaglock), + * use the accessors defined below + */ + u32 flags; + + /* + * STA powersave frame queues, no more than the internal + * locking required. 
+ */ + struct sk_buff_head ps_tx_buf; + struct sk_buff_head tx_filtered; + + /* Updated from RX path only, no locking requirements */ + unsigned long rx_packets, rx_bytes; + unsigned long wep_weak_iv_count; + unsigned long last_rx; + long last_connected; + unsigned long num_duplicates; + unsigned long rx_fragments; + unsigned long rx_dropped; + int last_signal; + struct ewma avg_signal; + __le16 last_seq_ctrl[NUM_RX_DATA_QUEUES]; + + /* Updated from TX status path only, no locking requirements */ + unsigned long tx_filtered_count; + unsigned long tx_retry_failed, tx_retry_count; + /* moving percentage of failed MSDUs */ + unsigned int fail_avg; + + /* Updated from TX path only, no locking requirements */ + unsigned long tx_packets; + unsigned long tx_bytes; + unsigned long tx_fragments; + struct ieee80211_tx_rate last_tx_rate; + int last_rx_rate_idx; + int last_rx_rate_flag; + u16 tid_seq[IEEE80211_QOS_CTL_TID_MASK + 1]; + + /* + * Aggregation information, locked with lock. + */ + struct sta_ampdu_mlme ampdu_mlme; + u8 timer_to_tid[STA_TID_NUM]; + +#ifdef CONFIG_MAC80211_MESH + /* + * Mesh peer link attributes + * TODO: move to a sub-structure that is referenced with pointer? + */ + __le16 llid; + __le16 plid; + __le16 reason; + u8 plink_retries; + bool ignore_plink_timer; + bool plink_timer_was_running; + enum nl80211_plink_state plink_state; + u32 plink_timeout; + struct timer_list plink_timer; +#endif + +#ifdef CONFIG_MAC80211_DEBUGFS + struct sta_info_debugfsdentries { + struct dentry *dir; + bool add_has_run; + } debugfs; +#endif + + unsigned int lost_packets; + + /* keep last! */ + struct ieee80211_sta sta; +}; + +static inline enum nl80211_plink_state sta_plink_state(struct sta_info *sta) +{ +#ifdef CONFIG_MAC80211_MESH + return sta->plink_state; +#endif + return NL80211_PLINK_LISTEN; +} + +static inline void set_sta_flags(struct sta_info *sta, const u32 flags) +{ + unsigned long irqfl; + + spin_lock_irqsave(&sta->flaglock, irqfl); + sta->flags |= flags; + spin_unlock_irqrestore(&sta->flaglock, irqfl); +} + +static inline void clear_sta_flags(struct sta_info *sta, const u32 flags) +{ + unsigned long irqfl; + + spin_lock_irqsave(&sta->flaglock, irqfl); + sta->flags &= ~flags; + spin_unlock_irqrestore(&sta->flaglock, irqfl); +} + +static inline u32 test_sta_flags(struct sta_info *sta, const u32 flags) +{ + u32 ret; + unsigned long irqfl; + + spin_lock_irqsave(&sta->flaglock, irqfl); + ret = sta->flags & flags; + spin_unlock_irqrestore(&sta->flaglock, irqfl); + + return ret; +} + +static inline u32 test_and_clear_sta_flags(struct sta_info *sta, + const u32 flags) +{ + u32 ret; + unsigned long irqfl; + + spin_lock_irqsave(&sta->flaglock, irqfl); + ret = sta->flags & flags; + sta->flags &= ~flags; + spin_unlock_irqrestore(&sta->flaglock, irqfl); + + return ret; +} + +static inline u32 get_sta_flags(struct sta_info *sta) +{ + u32 ret; + unsigned long irqfl; + + spin_lock_irqsave(&sta->flaglock, irqfl); + ret = sta->flags; + spin_unlock_irqrestore(&sta->flaglock, irqfl); + + return ret; +} + +void ieee80211_assign_tid_tx(struct sta_info *sta, int tid, + struct tid_ampdu_tx *tid_tx); + +static inline struct tid_ampdu_tx * +rcu_dereference_protected_tid_tx(struct sta_info *sta, int tid) +{ + return rcu_dereference_protected(sta->ampdu_mlme.tid_tx[tid], + lockdep_is_held(&sta->lock) || + lockdep_is_held(&sta->ampdu_mlme.mtx)); +} + +#define STA_HASH_SIZE 256 +#define STA_HASH(sta) (sta[5]) + + +/* Maximum number of frames to buffer per power saving station */ +#define STA_MAX_TX_BUFFER 128 + 
+/* Minimum buffered frame expiry time. If STA uses listen interval that is + * smaller than this value, the minimum value here is used instead. */ +#define STA_TX_BUFFER_EXPIRE (10 * HZ) + +/* How often station data is cleaned up (e.g., expiration of buffered frames) + */ +#define STA_INFO_CLEANUP_INTERVAL (10 * HZ) + +/* + * Get a STA info, must be under RCU read lock. + */ +struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, + const u8 *addr); + +struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, + const u8 *addr); + +static inline +void for_each_sta_info_type_check(struct ieee80211_local *local, + const u8 *addr, + struct sta_info *sta, + struct sta_info *nxt) +{ +} + +#define for_each_sta_info(local, _addr, _sta, nxt) \ + for ( /* initialise loop */ \ + _sta = rcu_dereference(local->sta_hash[STA_HASH(_addr)]),\ + nxt = _sta ? rcu_dereference(_sta->hnext) : NULL; \ + /* typecheck */ \ + for_each_sta_info_type_check(local, (_addr), _sta, nxt),\ + /* continue condition */ \ + _sta; \ + /* advance loop */ \ + _sta = nxt, \ + nxt = _sta ? rcu_dereference(_sta->hnext) : NULL \ + ) \ + /* compare address and run code only if it matches */ \ + if (memcmp(_sta->sta.addr, (_addr), ETH_ALEN) == 0) + +/* + * Get STA info by index, BROKEN! + */ +struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata, + int idx); +/* + * Create a new STA info, caller owns returned structure + * until sta_info_insert(). + */ +struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, + u8 *addr, gfp_t gfp); +/* + * Insert STA info into hash table/list, returns zero or a + * -EEXIST if (if the same MAC address is already present). + * + * Calling the non-rcu version makes the caller relinquish, + * the _rcu version calls read_lock_rcu() and must be called + * without it held. + */ +int sta_info_insert(struct sta_info *sta); +int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU); +int sta_info_insert_atomic(struct sta_info *sta); + +int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, + const u8 *addr); +int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata, + const u8 *addr); + +void sta_info_set_tim_bit(struct sta_info *sta); +void sta_info_clear_tim_bit(struct sta_info *sta); + +void sta_info_init(struct ieee80211_local *local); +void sta_info_stop(struct ieee80211_local *local); +int sta_info_flush(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata); +void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, + unsigned long exp_time); + +void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta); +void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta); + +#endif /* STA_INFO_H */ diff --git a/net/mac80211/status.c b/net/mac80211/status.c new file mode 100644 index 00000000..04cdbaf1 --- /dev/null +++ b/net/mac80211/status.c @@ -0,0 +1,456 @@ +/* + * Copyright 2002-2005, Instant802 Networks, Inc. + * Copyright 2005-2006, Devicescape Software, Inc. + * Copyright 2006-2007 Jiri Benc + * Copyright 2008-2010 Johannes Berg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include "ieee80211_i.h" +#include "rate.h" +#include "mesh.h" +#include "led.h" + + +void ieee80211_tx_status_irqsafe(struct ieee80211_hw *hw, + struct sk_buff *skb) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + int tmp; + + skb->pkt_type = IEEE80211_TX_STATUS_MSG; + skb_queue_tail(info->flags & IEEE80211_TX_CTL_REQ_TX_STATUS ? + &local->skb_queue : &local->skb_queue_unreliable, skb); + tmp = skb_queue_len(&local->skb_queue) + + skb_queue_len(&local->skb_queue_unreliable); + while (tmp > IEEE80211_IRQSAFE_QUEUE_LIMIT && + (skb = skb_dequeue(&local->skb_queue_unreliable))) { + dev_kfree_skb_irq(skb); + tmp--; + I802_DEBUG_INC(local->tx_status_drop); + } + tasklet_schedule(&local->tasklet); +} +EXPORT_SYMBOL(ieee80211_tx_status_irqsafe); + +static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, + struct sta_info *sta, + struct sk_buff *skb) +{ + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + + /* + * This skb 'survived' a round-trip through the driver, and + * hopefully the driver didn't mangle it too badly. However, + * we can definitely not rely on the control information + * being correct. Clear it so we don't get junk there, and + * indicate that it needs new processing, but must not be + * modified/encrypted again. + */ + memset(&info->control, 0, sizeof(info->control)); + + info->control.jiffies = jiffies; + info->control.vif = &sta->sdata->vif; + info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING | + IEEE80211_TX_INTFL_RETRANSMISSION; + info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS; + + sta->tx_filtered_count++; + + /* + * Clear the TX filter mask for this STA when sending the next + * packet. If the STA went to power save mode, this will happen + * when it wakes up for the next time. + */ + set_sta_flags(sta, WLAN_STA_CLEAR_PS_FILT); + + /* + * This code races in the following way: + * + * (1) STA sends frame indicating it will go to sleep and does so + * (2) hardware/firmware adds STA to filter list, passes frame up + * (3) hardware/firmware processes TX fifo and suppresses a frame + * (4) we get TX status before having processed the frame and + * knowing that the STA has gone to sleep. + * + * This is actually quite unlikely even when both those events are + * processed from interrupts coming in quickly after one another or + * even at the same time because we queue both TX status events and + * RX frames to be processed by a tasklet and process them in the + * same order that they were received or TX status last. Hence, there + * is no race as long as the frame RX is processed before the next TX + * status, which drivers can ensure, see below. + * + * Note that this can only happen if the hardware or firmware can + * actually add STAs to the filter list, if this is done by the + * driver in response to set_tim() (which will only reduce the race + * this whole filtering tries to solve, not completely solve it) + * this situation cannot happen. + * + * To completely solve this race drivers need to make sure that they + * (a) don't mix the irq-safe/not irq-safe TX status/RX processing + * functions and + * (b) always process RX events before TX status events if ordering + * can be unknown, for example with different interrupt status + * bits. + * (c) if PS mode transitions are manual (i.e. the flag + * %IEEE80211_HW_AP_LINK_PS is set), always process PS state + * changes before calling TX status events if ordering can be + * unknown. 
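What requirement (b) asks of a driver can be sketched with a hypothetical interrupt handler; example_wlan_isr, example_fetch_rx_frame and example_fetch_tx_done stand in for driver-specific dequeue logic, and only the two irq-safe mac80211 calls are real:

	#include <linux/interrupt.h>
	#include <net/mac80211.h>

	/* hypothetical driver helpers returning the next completed skb */
	static struct sk_buff *example_fetch_rx_frame(struct ieee80211_hw *hw);
	static struct sk_buff *example_fetch_tx_done(struct ieee80211_hw *hw);

	static irqreturn_t example_wlan_isr(int irq, void *data)
	{
		struct ieee80211_hw *hw = data;
		struct sk_buff *skb;

		/* (b): hand every pending RX frame to mac80211 first, so a
		 * power-save transition seen on RX is processed before ... */
		while ((skb = example_fetch_rx_frame(hw)))
			ieee80211_rx_irqsafe(hw, skb);

		/* ... the TX status of a frame the hardware filtered later */
		while ((skb = example_fetch_tx_done(hw)))
			ieee80211_tx_status_irqsafe(hw, skb);

		return IRQ_HANDLED;
	}

Using the two _irqsafe variants together also keeps requirement (a) satisfied, since irq-safe and non irq-safe reporting are never mixed.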
+ */ + if (test_sta_flags(sta, WLAN_STA_PS_STA) && + skb_queue_len(&sta->tx_filtered) < STA_MAX_TX_BUFFER) { + skb_queue_tail(&sta->tx_filtered, skb); + return; + } + + if (!test_sta_flags(sta, WLAN_STA_PS_STA) && + !(info->flags & IEEE80211_TX_INTFL_RETRIED)) { + /* Software retry the packet once */ + info->flags |= IEEE80211_TX_INTFL_RETRIED; + ieee80211_add_pending_skb(local, skb); + return; + } + +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + if (net_ratelimit()) + wiphy_debug(local->hw.wiphy, + "dropped TX filtered frame, queue_len=%d PS=%d @%lu\n", + skb_queue_len(&sta->tx_filtered), + !!test_sta_flags(sta, WLAN_STA_PS_STA), jiffies); +#endif + dev_kfree_skb(skb); +} + +static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt = (void *) skb->data; + struct ieee80211_local *local = sta->local; + struct ieee80211_sub_if_data *sdata = sta->sdata; + + if (ieee80211_is_action(mgmt->frame_control) && + sdata->vif.type == NL80211_IFTYPE_STATION && + mgmt->u.action.category == WLAN_CATEGORY_HT && + mgmt->u.action.u.ht_smps.action == WLAN_HT_ACTION_SMPS) { + /* + * This update looks racy, but isn't -- if we come + * here we've definitely got a station that we're + * talking to, and on a managed interface that can + * only be the AP. And the only other place updating + * this variable is before we're associated. + */ + switch (mgmt->u.action.u.ht_smps.smps_control) { + case WLAN_HT_SMPS_CONTROL_DYNAMIC: + sta->sdata->u.mgd.ap_smps = IEEE80211_SMPS_DYNAMIC; + break; + case WLAN_HT_SMPS_CONTROL_STATIC: + sta->sdata->u.mgd.ap_smps = IEEE80211_SMPS_STATIC; + break; + case WLAN_HT_SMPS_CONTROL_DISABLED: + default: /* shouldn't happen since we don't send that */ + sta->sdata->u.mgd.ap_smps = IEEE80211_SMPS_OFF; + break; + } + + ieee80211_queue_work(&local->hw, &local->recalc_smps); + } +} + +/* + * Use a static threshold for now, best value to be determined + * by testing ... + * Should it depend on: + * - on # of retransmissions + * - current throughput (higher value for higher tpt)? + */ +#define STA_LOST_PKT_THRESHOLD 50 + +void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) +{ + struct sk_buff *skb2; + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + u16 frag, type; + __le16 fc; + struct ieee80211_supported_band *sband; + struct ieee80211_tx_status_rtap_hdr *rthdr; + struct ieee80211_sub_if_data *sdata; + struct net_device *prev_dev = NULL; + struct sta_info *sta, *tmp; + int retry_count = -1, i; + int rates_idx = -1; + bool send_to_cooked; + bool acked; + + for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { + if (info->status.rates[i].idx < 0) { + break; + } else if (i >= hw->max_report_rates) { + /* the HW cannot have attempted that rate */ + info->status.rates[i].idx = -1; + info->status.rates[i].count = 0; + break; + } + + retry_count += info->status.rates[i].count; + } + rates_idx = i - 1; + + if (retry_count < 0) + retry_count = 0; + + rcu_read_lock(); + + sband = local->hw.wiphy->bands[info->band]; + fc = hdr->frame_control; + + for_each_sta_info(local, hdr->addr1, sta, tmp) { + /* skip wrong virtual interface */ + if (memcmp(hdr->addr2, sta->sdata->vif.addr, ETH_ALEN)) + continue; + + acked = !!(info->flags & IEEE80211_TX_STAT_ACK); + if (!acked && test_sta_flags(sta, WLAN_STA_PS_STA)) { + /* + * The STA is in power save mode, so assume + * that this TX packet failed because of that. 
+ */ + ieee80211_handle_filtered_frame(local, sta, skb); + rcu_read_unlock(); + return; + } + + if ((local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) && + (rates_idx != -1)) + sta->last_tx_rate = info->status.rates[rates_idx]; + + if ((info->flags & IEEE80211_TX_STAT_AMPDU_NO_BACK) && + (ieee80211_is_data_qos(fc))) { + u16 tid, ssn; + u8 *qc; + + qc = ieee80211_get_qos_ctl(hdr); + tid = qc[0] & 0xf; + ssn = ((le16_to_cpu(hdr->seq_ctrl) + 0x10) + & IEEE80211_SCTL_SEQ); + ieee80211_send_bar(sta->sdata, hdr->addr1, + tid, ssn); + } + + if (info->flags & IEEE80211_TX_STAT_TX_FILTERED) { + ieee80211_handle_filtered_frame(local, sta, skb); + rcu_read_unlock(); + return; + } else { + if (!acked) + sta->tx_retry_failed++; + sta->tx_retry_count += retry_count; + } + + rate_control_tx_status(local, sband, sta, skb); + if (ieee80211_vif_is_mesh(&sta->sdata->vif)) + ieee80211s_update_metric(local, sta, skb); + + if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && acked) + ieee80211_frame_acked(sta, skb); + + if ((sta->sdata->vif.type == NL80211_IFTYPE_STATION) && + (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)) + ieee80211_sta_tx_notify(sta->sdata, (void *) skb->data, acked); + + if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) { + if (info->flags & IEEE80211_TX_STAT_ACK) { + if (sta->lost_packets) + sta->lost_packets = 0; + } else if (++sta->lost_packets >= STA_LOST_PKT_THRESHOLD) { + cfg80211_cqm_pktloss_notify(sta->sdata->dev, + sta->sta.addr, + sta->lost_packets, + GFP_ATOMIC); + sta->lost_packets = 0; + } + } + } + + rcu_read_unlock(); + + ieee80211_led_tx(local, 0); + + /* SNMP counters + * Fragments are passed to low-level drivers as separate skbs, so these + * are actually fragments, not frames. Update frame counters only for + * the first fragment of the frame. */ + + frag = le16_to_cpu(hdr->seq_ctrl) & IEEE80211_SCTL_FRAG; + type = le16_to_cpu(hdr->frame_control) & IEEE80211_FCTL_FTYPE; + + if (info->flags & IEEE80211_TX_STAT_ACK) { + if (frag == 0) { + local->dot11TransmittedFrameCount++; + if (is_multicast_ether_addr(hdr->addr1)) + local->dot11MulticastTransmittedFrameCount++; + if (retry_count > 0) + local->dot11RetryCount++; + if (retry_count > 1) + local->dot11MultipleRetryCount++; + } + + /* This counter shall be incremented for an acknowledged MPDU + * with an individual address in the address 1 field or an MPDU + * with a multicast address in the address 1 field of type Data + * or Management. 
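To make the rate-table walk at the top of ieee80211_tx_status() concrete with example numbers: if a driver reports status.rates as {idx 7, count 3}, {idx 4, count 2}, {idx -1}, the loop ends with rates_idx = 1 (the last rate actually tried) and retry_count = 3 + 2 - 1 = 4, that is, five transmission attempts of which four were retries. That value drives the dot11RetryCount and dot11MultipleRetryCount updates above and is what ends up in the radiotap data_retries field further down.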
*/ + if (!is_multicast_ether_addr(hdr->addr1) || + type == IEEE80211_FTYPE_DATA || + type == IEEE80211_FTYPE_MGMT) + local->dot11TransmittedFragmentCount++; + } else { + if (frag == 0) + local->dot11FailedCount++; + } + + if (ieee80211_is_nullfunc(fc) && ieee80211_has_pm(fc) && + (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) && + !(info->flags & IEEE80211_TX_CTL_INJECTED) && + local->ps_sdata && !(local->scanning)) { + if (info->flags & IEEE80211_TX_STAT_ACK) { + local->ps_sdata->u.mgd.flags |= + IEEE80211_STA_NULLFUNC_ACKED; + } else + mod_timer(&local->dynamic_ps_timer, jiffies + + msecs_to_jiffies(10)); + } + + if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX) { + struct ieee80211_work *wk; + u64 cookie = (unsigned long)skb; + + rcu_read_lock(); + list_for_each_entry_rcu(wk, &local->work_list, list) { + if (wk->type != IEEE80211_WORK_OFFCHANNEL_TX) + continue; + if (wk->offchan_tx.frame != skb) + continue; + wk->offchan_tx.status = true; + break; + } + rcu_read_unlock(); + if (local->hw_roc_skb_for_status == skb) { + cookie = local->hw_roc_cookie ^ 2; + local->hw_roc_skb_for_status = NULL; + } + + if (cookie == local->hw_offchan_tx_cookie) + local->hw_offchan_tx_cookie = 0; + + cfg80211_mgmt_tx_status( + skb->dev, cookie, skb->data, skb->len, + !!(info->flags & IEEE80211_TX_STAT_ACK), GFP_ATOMIC); + } + + /* this was a transmitted frame, but now we want to reuse it */ + skb_orphan(skb); + + /* Need to make a copy before skb->cb gets cleared */ + send_to_cooked = !!(info->flags & IEEE80211_TX_CTL_INJECTED) || + (type != IEEE80211_FTYPE_DATA); + + /* + * This is a bit racy but we can avoid a lot of work + * with this test... + */ + if (!local->monitors && (!send_to_cooked || !local->cooked_mntrs)) { + dev_kfree_skb(skb); + return; + } + + /* send frame to monitor interfaces now */ + + if (skb_headroom(skb) < sizeof(*rthdr)) { + printk(KERN_ERR "ieee80211_tx_status: headroom too small\n"); + dev_kfree_skb(skb); + return; + } + + rthdr = (struct ieee80211_tx_status_rtap_hdr *) + skb_push(skb, sizeof(*rthdr)); + + memset(rthdr, 0, sizeof(*rthdr)); + rthdr->hdr.it_len = cpu_to_le16(sizeof(*rthdr)); + rthdr->hdr.it_present = + cpu_to_le32((1 << IEEE80211_RADIOTAP_TX_FLAGS) | + (1 << IEEE80211_RADIOTAP_DATA_RETRIES) | + (1 << IEEE80211_RADIOTAP_RATE)); + + if (!(info->flags & IEEE80211_TX_STAT_ACK) && + !is_multicast_ether_addr(hdr->addr1)) + rthdr->tx_flags |= cpu_to_le16(IEEE80211_RADIOTAP_F_TX_FAIL); + + /* + * XXX: Once radiotap gets the bitmap reset thing the vendor + * extensions proposal contains, we can actually report + * the whole set of tries we did. + */ + if ((info->status.rates[0].flags & IEEE80211_TX_RC_USE_RTS_CTS) || + (info->status.rates[0].flags & IEEE80211_TX_RC_USE_CTS_PROTECT)) + rthdr->tx_flags |= cpu_to_le16(IEEE80211_RADIOTAP_F_TX_CTS); + else if (info->status.rates[0].flags & IEEE80211_TX_RC_USE_RTS_CTS) + rthdr->tx_flags |= cpu_to_le16(IEEE80211_RADIOTAP_F_TX_RTS); + if (info->status.rates[0].idx >= 0 && + !(info->status.rates[0].flags & IEEE80211_TX_RC_MCS)) + rthdr->rate = sband->bitrates[ + info->status.rates[0].idx].bitrate / 5; + + /* for now report the total retry_count */ + rthdr->data_retries = retry_count; + + /* XXX: is this sufficient for BPF? 
*/ + skb_set_mac_header(skb, 0); + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->pkt_type = PACKET_OTHERHOST; + skb->protocol = htons(ETH_P_802_2); + memset(skb->cb, 0, sizeof(skb->cb)); + + rcu_read_lock(); + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { + if (!ieee80211_sdata_running(sdata)) + continue; + + if ((sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) && + !send_to_cooked) + continue; + + if (prev_dev) { + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) { + skb2->dev = prev_dev; + netif_rx(skb2); + } + } + + prev_dev = sdata->dev; + } + } + if (prev_dev) { + skb->dev = prev_dev; + netif_rx(skb); + skb = NULL; + } + rcu_read_unlock(); + dev_kfree_skb(skb); +} +EXPORT_SYMBOL(ieee80211_tx_status); + +void ieee80211_report_low_ack(struct ieee80211_sta *pubsta, u32 num_packets) +{ + struct sta_info *sta = container_of(pubsta, struct sta_info, sta); + cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr, + num_packets, GFP_ATOMIC); +} +EXPORT_SYMBOL(ieee80211_report_low_ack); diff --git a/net/mac80211/tkip.c b/net/mac80211/tkip.c new file mode 100644 index 00000000..757e4eb2 --- /dev/null +++ b/net/mac80211/tkip.c @@ -0,0 +1,342 @@ +/* + * Copyright 2002-2004, Instant802 Networks, Inc. + * Copyright 2005, Devicescape Software, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include + +#include +#include "driver-ops.h" +#include "key.h" +#include "tkip.h" +#include "wep.h" + +#define PHASE1_LOOP_COUNT 8 + +/* + * 2-byte by 2-byte subset of the full AES S-box table; second part of this + * table is identical to first part but byte-swapped + */ +static const u16 tkip_sbox[256] = +{ + 0xC6A5, 0xF884, 0xEE99, 0xF68D, 0xFF0D, 0xD6BD, 0xDEB1, 0x9154, + 0x6050, 0x0203, 0xCEA9, 0x567D, 0xE719, 0xB562, 0x4DE6, 0xEC9A, + 0x8F45, 0x1F9D, 0x8940, 0xFA87, 0xEF15, 0xB2EB, 0x8EC9, 0xFB0B, + 0x41EC, 0xB367, 0x5FFD, 0x45EA, 0x23BF, 0x53F7, 0xE496, 0x9B5B, + 0x75C2, 0xE11C, 0x3DAE, 0x4C6A, 0x6C5A, 0x7E41, 0xF502, 0x834F, + 0x685C, 0x51F4, 0xD134, 0xF908, 0xE293, 0xAB73, 0x6253, 0x2A3F, + 0x080C, 0x9552, 0x4665, 0x9D5E, 0x3028, 0x37A1, 0x0A0F, 0x2FB5, + 0x0E09, 0x2436, 0x1B9B, 0xDF3D, 0xCD26, 0x4E69, 0x7FCD, 0xEA9F, + 0x121B, 0x1D9E, 0x5874, 0x342E, 0x362D, 0xDCB2, 0xB4EE, 0x5BFB, + 0xA4F6, 0x764D, 0xB761, 0x7DCE, 0x527B, 0xDD3E, 0x5E71, 0x1397, + 0xA6F5, 0xB968, 0x0000, 0xC12C, 0x4060, 0xE31F, 0x79C8, 0xB6ED, + 0xD4BE, 0x8D46, 0x67D9, 0x724B, 0x94DE, 0x98D4, 0xB0E8, 0x854A, + 0xBB6B, 0xC52A, 0x4FE5, 0xED16, 0x86C5, 0x9AD7, 0x6655, 0x1194, + 0x8ACF, 0xE910, 0x0406, 0xFE81, 0xA0F0, 0x7844, 0x25BA, 0x4BE3, + 0xA2F3, 0x5DFE, 0x80C0, 0x058A, 0x3FAD, 0x21BC, 0x7048, 0xF104, + 0x63DF, 0x77C1, 0xAF75, 0x4263, 0x2030, 0xE51A, 0xFD0E, 0xBF6D, + 0x814C, 0x1814, 0x2635, 0xC32F, 0xBEE1, 0x35A2, 0x88CC, 0x2E39, + 0x9357, 0x55F2, 0xFC82, 0x7A47, 0xC8AC, 0xBAE7, 0x322B, 0xE695, + 0xC0A0, 0x1998, 0x9ED1, 0xA37F, 0x4466, 0x547E, 0x3BAB, 0x0B83, + 0x8CCA, 0xC729, 0x6BD3, 0x283C, 0xA779, 0xBCE2, 0x161D, 0xAD76, + 0xDB3B, 0x6456, 0x744E, 0x141E, 0x92DB, 0x0C0A, 0x486C, 0xB8E4, + 0x9F5D, 0xBD6E, 0x43EF, 0xC4A6, 0x39A8, 0x31A4, 0xD337, 0xF28B, + 0xD532, 0x8B43, 0x6E59, 0xDAB7, 0x018C, 0xB164, 0x9CD2, 0x49E0, + 0xD8B4, 0xACFA, 0xF307, 0xCF25, 0xCAAF, 0xF48E, 0x47E9, 0x1018, + 0x6FD5, 0xF088, 0x4A6F, 0x5C72, 0x3824, 0x57F1, 0x73C7, 0x9751, + 0xCB23, 0xA17C, 0xE89C, 
0x3E21, 0x96DD, 0x61DC, 0x0D86, 0x0F85, + 0xE090, 0x7C42, 0x71C4, 0xCCAA, 0x90D8, 0x0605, 0xF701, 0x1C12, + 0xC2A3, 0x6A5F, 0xAEF9, 0x69D0, 0x1791, 0x9958, 0x3A27, 0x27B9, + 0xD938, 0xEB13, 0x2BB3, 0x2233, 0xD2BB, 0xA970, 0x0789, 0x33A7, + 0x2DB6, 0x3C22, 0x1592, 0xC920, 0x8749, 0xAAFF, 0x5078, 0xA57A, + 0x038F, 0x59F8, 0x0980, 0x1A17, 0x65DA, 0xD731, 0x84C6, 0xD0B8, + 0x82C3, 0x29B0, 0x5A77, 0x1E11, 0x7BCB, 0xA8FC, 0x6DD6, 0x2C3A, +}; + +static u16 tkipS(u16 val) +{ + return tkip_sbox[val & 0xff] ^ swab16(tkip_sbox[val >> 8]); +} + +static u8 *write_tkip_iv(u8 *pos, u16 iv16) +{ + *pos++ = iv16 >> 8; + *pos++ = ((iv16 >> 8) | 0x20) & 0x7f; + *pos++ = iv16 & 0xFF; + return pos; +} + +/* + * P1K := Phase1(TA, TK, TSC) + * TA = transmitter address (48 bits) + * TK = dot11DefaultKeyValue or dot11KeyMappingValue (128 bits) + * TSC = TKIP sequence counter (48 bits, only 32 msb bits used) + * P1K: 80 bits + */ +static void tkip_mixing_phase1(const u8 *tk, struct tkip_ctx *ctx, + const u8 *ta, u32 tsc_IV32) +{ + int i, j; + u16 *p1k = ctx->p1k; + + p1k[0] = tsc_IV32 & 0xFFFF; + p1k[1] = tsc_IV32 >> 16; + p1k[2] = get_unaligned_le16(ta + 0); + p1k[3] = get_unaligned_le16(ta + 2); + p1k[4] = get_unaligned_le16(ta + 4); + + for (i = 0; i < PHASE1_LOOP_COUNT; i++) { + j = 2 * (i & 1); + p1k[0] += tkipS(p1k[4] ^ get_unaligned_le16(tk + 0 + j)); + p1k[1] += tkipS(p1k[0] ^ get_unaligned_le16(tk + 4 + j)); + p1k[2] += tkipS(p1k[1] ^ get_unaligned_le16(tk + 8 + j)); + p1k[3] += tkipS(p1k[2] ^ get_unaligned_le16(tk + 12 + j)); + p1k[4] += tkipS(p1k[3] ^ get_unaligned_le16(tk + 0 + j)) + i; + } + ctx->state = TKIP_STATE_PHASE1_DONE; +} + +static void tkip_mixing_phase2(const u8 *tk, struct tkip_ctx *ctx, + u16 tsc_IV16, u8 *rc4key) +{ + u16 ppk[6]; + const u16 *p1k = ctx->p1k; + int i; + + ppk[0] = p1k[0]; + ppk[1] = p1k[1]; + ppk[2] = p1k[2]; + ppk[3] = p1k[3]; + ppk[4] = p1k[4]; + ppk[5] = p1k[4] + tsc_IV16; + + ppk[0] += tkipS(ppk[5] ^ get_unaligned_le16(tk + 0)); + ppk[1] += tkipS(ppk[0] ^ get_unaligned_le16(tk + 2)); + ppk[2] += tkipS(ppk[1] ^ get_unaligned_le16(tk + 4)); + ppk[3] += tkipS(ppk[2] ^ get_unaligned_le16(tk + 6)); + ppk[4] += tkipS(ppk[3] ^ get_unaligned_le16(tk + 8)); + ppk[5] += tkipS(ppk[4] ^ get_unaligned_le16(tk + 10)); + ppk[0] += ror16(ppk[5] ^ get_unaligned_le16(tk + 12), 1); + ppk[1] += ror16(ppk[0] ^ get_unaligned_le16(tk + 14), 1); + ppk[2] += ror16(ppk[1], 1); + ppk[3] += ror16(ppk[2], 1); + ppk[4] += ror16(ppk[3], 1); + ppk[5] += ror16(ppk[4], 1); + + rc4key = write_tkip_iv(rc4key, tsc_IV16); + *rc4key++ = ((ppk[5] ^ get_unaligned_le16(tk)) >> 1) & 0xFF; + + for (i = 0; i < 6; i++) + put_unaligned_le16(ppk[i], rc4key + 2 * i); +} + +/* Add TKIP IV and Ext. IV at @pos. @iv0, @iv1, and @iv2 are the first octets + * of the IV. Returns pointer to the octet following IVs (i.e., beginning of + * the packet payload). 
*/ +u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key *key, u16 iv16) +{ + pos = write_tkip_iv(pos, iv16); + *pos++ = (key->conf.keyidx << 6) | (1 << 5) /* Ext IV */; + put_unaligned_le32(key->u.tkip.tx.iv32, pos); + return pos + 4; +} + +void ieee80211_get_tkip_key(struct ieee80211_key_conf *keyconf, + struct sk_buff *skb, enum ieee80211_tkip_key_type type, + u8 *outkey) +{ + struct ieee80211_key *key = (struct ieee80211_key *) + container_of(keyconf, struct ieee80211_key, conf); + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + u8 *data; + const u8 *tk; + struct tkip_ctx *ctx; + u16 iv16; + u32 iv32; + + data = (u8 *)hdr + ieee80211_hdrlen(hdr->frame_control); + iv16 = data[2] | (data[0] << 8); + iv32 = get_unaligned_le32(&data[4]); + + tk = &key->conf.key[NL80211_TKIP_DATA_OFFSET_ENCR_KEY]; + ctx = &key->u.tkip.tx; + +#ifdef CONFIG_MAC80211_TKIP_DEBUG + printk(KERN_DEBUG "TKIP encrypt: iv16 = 0x%04x, iv32 = 0x%08x\n", + iv16, iv32); + + if (iv32 != ctx->iv32) { + printk(KERN_DEBUG "skb: iv32 = 0x%08x key: iv32 = 0x%08x\n", + iv32, ctx->iv32); + printk(KERN_DEBUG "Wrap around of iv16 in the middle of a " + "fragmented packet\n"); + } +#endif + + /* Update the p1k only when the iv16 in the packet wraps around, this + * might occur after the wrap around of iv16 in the key in case of + * fragmented packets. */ + if (iv16 == 0 || ctx->state == TKIP_STATE_NOT_INIT) + tkip_mixing_phase1(tk, ctx, hdr->addr2, iv32); + + if (type == IEEE80211_TKIP_P1_KEY) { + memcpy(outkey, ctx->p1k, sizeof(u16) * 5); + return; + } + + tkip_mixing_phase2(tk, ctx, iv16, outkey); +} +EXPORT_SYMBOL(ieee80211_get_tkip_key); + +/* + * Encrypt packet payload with TKIP using @key. @pos is a pointer to the + * beginning of the buffer containing payload. This payload must include + * the IV/Ext.IV and space for (taildroom) four octets for ICV. + * @payload_len is the length of payload (_not_ including IV/ICV length). + * @ta is the transmitter addresses. + */ +int ieee80211_tkip_encrypt_data(struct crypto_cipher *tfm, + struct ieee80211_key *key, + u8 *pos, size_t payload_len, u8 *ta) +{ + u8 rc4key[16]; + struct tkip_ctx *ctx = &key->u.tkip.tx; + const u8 *tk = &key->conf.key[NL80211_TKIP_DATA_OFFSET_ENCR_KEY]; + + /* Calculate per-packet key */ + if (ctx->iv16 == 0 || ctx->state == TKIP_STATE_NOT_INIT) + tkip_mixing_phase1(tk, ctx, ta, ctx->iv32); + + tkip_mixing_phase2(tk, ctx, ctx->iv16, rc4key); + + return ieee80211_wep_encrypt_data(tfm, rc4key, 16, pos, payload_len); +} + +/* Decrypt packet payload with TKIP using @key. @pos is a pointer to the + * beginning of the buffer containing IEEE 802.11 header payload, i.e., + * including IV, Ext. IV, real data, Michael MIC, ICV. @payload_len is the + * length of payload, including IV, Ext. IV, MIC, ICV. 
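A byte-level example of the IV/Ext. IV that write_tkip_iv() and ieee80211_tkip_add_iv() emit, using illustrative values iv16 = 0x1234, iv32 = 0x00000001 and key index 1:

	/*
	 * iv[0] = 0x12            TSC1       (iv16 >> 8)
	 * iv[1] = 0x32            WEP seed   ((TSC1 | 0x20) & 0x7f)
	 * iv[2] = 0x34            TSC0       (iv16 & 0xff)
	 * iv[3] = 0x60            key ID 1 << 6, plus the Ext IV bit (1 << 5)
	 * iv[4..7] = 01 00 00 00  iv32, little-endian
	 */

This is the same layout ieee80211_tkip_decrypt_data() below parses: it rebuilds iv16 from bytes 0 and 2 (skipping the WEP seed byte), takes the key ID and Ext IV flag from byte 3, and reads iv32 from bytes 4-7.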
*/ +int ieee80211_tkip_decrypt_data(struct crypto_cipher *tfm, + struct ieee80211_key *key, + u8 *payload, size_t payload_len, u8 *ta, + u8 *ra, int only_iv, int queue, + u32 *out_iv32, u16 *out_iv16) +{ + u32 iv32; + u32 iv16; + u8 rc4key[16], keyid, *pos = payload; + int res; + const u8 *tk = &key->conf.key[NL80211_TKIP_DATA_OFFSET_ENCR_KEY]; + + if (payload_len < 12) + return -1; + + iv16 = (pos[0] << 8) | pos[2]; + keyid = pos[3]; + iv32 = get_unaligned_le32(pos + 4); + pos += 8; +#ifdef CONFIG_MAC80211_TKIP_DEBUG + { + int i; + printk(KERN_DEBUG "TKIP decrypt: data(len=%zd)", payload_len); + for (i = 0; i < payload_len; i++) + printk(" %02x", payload[i]); + printk("\n"); + printk(KERN_DEBUG "TKIP decrypt: iv16=%04x iv32=%08x\n", + iv16, iv32); + } +#endif + + if (!(keyid & (1 << 5))) + return TKIP_DECRYPT_NO_EXT_IV; + + if ((keyid >> 6) != key->conf.keyidx) + return TKIP_DECRYPT_INVALID_KEYIDX; + + if (key->u.tkip.rx[queue].state != TKIP_STATE_NOT_INIT && + (iv32 < key->u.tkip.rx[queue].iv32 || + (iv32 == key->u.tkip.rx[queue].iv32 && + iv16 <= key->u.tkip.rx[queue].iv16))) { +#ifdef CONFIG_MAC80211_TKIP_DEBUG + printk(KERN_DEBUG "TKIP replay detected for RX frame from " + "%pM (RX IV (%04x,%02x) <= prev. IV (%04x,%02x)\n", + ta, + iv32, iv16, key->u.tkip.rx[queue].iv32, + key->u.tkip.rx[queue].iv16); +#endif + return TKIP_DECRYPT_REPLAY; + } + + if (only_iv) { + res = TKIP_DECRYPT_OK; + key->u.tkip.rx[queue].state = TKIP_STATE_PHASE1_HW_UPLOADED; + goto done; + } + + if (key->u.tkip.rx[queue].state == TKIP_STATE_NOT_INIT || + key->u.tkip.rx[queue].iv32 != iv32) { + /* IV16 wrapped around - perform TKIP phase 1 */ + tkip_mixing_phase1(tk, &key->u.tkip.rx[queue], ta, iv32); +#ifdef CONFIG_MAC80211_TKIP_DEBUG + { + int i; + u8 key_offset = NL80211_TKIP_DATA_OFFSET_ENCR_KEY; + printk(KERN_DEBUG "TKIP decrypt: Phase1 TA=%pM" + " TK=", ta); + for (i = 0; i < 16; i++) + printk("%02x ", + key->conf.key[key_offset + i]); + printk("\n"); + printk(KERN_DEBUG "TKIP decrypt: P1K="); + for (i = 0; i < 5; i++) + printk("%04x ", key->u.tkip.rx[queue].p1k[i]); + printk("\n"); + } +#endif + } + if (key->local->ops->update_tkip_key && + key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && + key->u.tkip.rx[queue].state != TKIP_STATE_PHASE1_HW_UPLOADED) { + struct ieee80211_sub_if_data *sdata = key->sdata; + + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + sdata = container_of(key->sdata->bss, + struct ieee80211_sub_if_data, u.ap); + drv_update_tkip_key(key->local, sdata, &key->conf, key->sta, + iv32, key->u.tkip.rx[queue].p1k); + key->u.tkip.rx[queue].state = TKIP_STATE_PHASE1_HW_UPLOADED; + } + + tkip_mixing_phase2(tk, &key->u.tkip.rx[queue], iv16, rc4key); +#ifdef CONFIG_MAC80211_TKIP_DEBUG + { + int i; + printk(KERN_DEBUG "TKIP decrypt: Phase2 rc4key="); + for (i = 0; i < 16; i++) + printk("%02x ", rc4key[i]); + printk("\n"); + } +#endif + + res = ieee80211_wep_decrypt_data(tfm, rc4key, 16, pos, payload_len - 12); + done: + if (res == TKIP_DECRYPT_OK) { + /* + * Record previously received IV, will be copied into the + * key information after MIC verification. It is possible + * that we don't catch replays of fragments but that's ok + * because the Michael MIC verication will then fail. + */ + *out_iv32 = iv32; + *out_iv16 = iv16; + } + + return res; +} diff --git a/net/mac80211/tkip.h b/net/mac80211/tkip.h new file mode 100644 index 00000000..1cab9c86 --- /dev/null +++ b/net/mac80211/tkip.h @@ -0,0 +1,33 @@ +/* + * Copyright 2002-2004, Instant802 Networks, Inc. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef TKIP_H +#define TKIP_H + +#include +#include +#include "key.h" + +u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key *key, u16 iv16); + +int ieee80211_tkip_encrypt_data(struct crypto_cipher *tfm, + struct ieee80211_key *key, + u8 *pos, size_t payload_len, u8 *ta); +enum { + TKIP_DECRYPT_OK = 0, + TKIP_DECRYPT_NO_EXT_IV = -1, + TKIP_DECRYPT_INVALID_KEYIDX = -2, + TKIP_DECRYPT_REPLAY = -3, +}; +int ieee80211_tkip_decrypt_data(struct crypto_cipher *tfm, + struct ieee80211_key *key, + u8 *payload, size_t payload_len, u8 *ta, + u8 *ra, int only_iv, int queue, + u32 *out_iv32, u16 *out_iv16); + +#endif /* TKIP_H */ diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c new file mode 100644 index 00000000..da878c14 --- /dev/null +++ b/net/mac80211/tx.c @@ -0,0 +1,2560 @@ +/* + * Copyright 2002-2005, Instant802 Networks, Inc. + * Copyright 2005-2006, Devicescape Software, Inc. + * Copyright 2006-2007 Jiri Benc + * Copyright 2007 Johannes Berg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * + * Transmit and frame generation functions. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ieee80211_i.h" +#include "driver-ops.h" +#include "led.h" +#include "mesh.h" +#include "wep.h" +#include "wpa.h" +#include "wme.h" +#include "rate.h" + +/* misc utils */ + +static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, int group_addr, + int next_frag_len) +{ + int rate, mrate, erp, dur, i; + struct ieee80211_rate *txrate; + struct ieee80211_local *local = tx->local; + struct ieee80211_supported_band *sband; + struct ieee80211_hdr *hdr; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); + + /* assume HW handles this */ + if (info->control.rates[0].flags & IEEE80211_TX_RC_MCS) + return 0; + + /* uh huh? */ + if (WARN_ON_ONCE(info->control.rates[0].idx < 0)) + return 0; + + sband = local->hw.wiphy->bands[tx->channel->band]; + txrate = &sband->bitrates[info->control.rates[0].idx]; + + erp = txrate->flags & IEEE80211_RATE_ERP_G; + + /* + * data and mgmt (except PS Poll): + * - during CFP: 32768 + * - during contention period: + * if addr1 is group address: 0 + * if more fragments = 0 and addr1 is individual address: time to + * transmit one ACK plus SIFS + * if more fragments = 1 and addr1 is individual address: time to + * transmit next fragment plus 2 x ACK plus 3 x SIFS + * + * IEEE 802.11, 9.6: + * - control response frame (CTS or ACK) shall be transmitted using the + * same rate as the immediately previous frame in the frame exchange + * sequence, if this rate belongs to the PHY mandatory rates, or else + * at the highest possible rate belonging to the PHY rates in the + * BSSBasicRateSet + */ + hdr = (struct ieee80211_hdr *)tx->skb->data; + if (ieee80211_is_ctl(hdr->frame_control)) { + /* TODO: These control frames are not currently sent by + * mac80211, but should they be implemented, this function + * needs to be updated to support duration field calculation. 
+ * + * RTS: time needed to transmit pending data/mgmt frame plus + * one CTS frame plus one ACK frame plus 3 x SIFS + * CTS: duration of immediately previous RTS minus time + * required to transmit CTS and its SIFS + * ACK: 0 if immediately previous directed data/mgmt had + * more=0, with more=1 duration in ACK frame is duration + * from previous frame minus time needed to transmit ACK + * and its SIFS + * PS Poll: BIT(15) | BIT(14) | aid + */ + return 0; + } + + /* data/mgmt */ + if (0 /* FIX: data/mgmt during CFP */) + return cpu_to_le16(32768); + + if (group_addr) /* Group address as the destination - no ACK */ + return 0; + + /* Individual destination address: + * IEEE 802.11, Ch. 9.6 (after IEEE 802.11g changes) + * CTS and ACK frames shall be transmitted using the highest rate in + * basic rate set that is less than or equal to the rate of the + * immediately previous frame and that is using the same modulation + * (CCK or OFDM). If no basic rate set matches with these requirements, + * the highest mandatory rate of the PHY that is less than or equal to + * the rate of the previous frame is used. + * Mandatory rates for IEEE 802.11g PHY: 1, 2, 5.5, 11, 6, 12, 24 Mbps + */ + rate = -1; + /* use lowest available if everything fails */ + mrate = sband->bitrates[0].bitrate; + for (i = 0; i < sband->n_bitrates; i++) { + struct ieee80211_rate *r = &sband->bitrates[i]; + + if (r->bitrate > txrate->bitrate) + break; + + if (tx->sdata->vif.bss_conf.basic_rates & BIT(i)) + rate = r->bitrate; + + switch (sband->band) { + case IEEE80211_BAND_2GHZ: { + u32 flag; + if (tx->sdata->flags & IEEE80211_SDATA_OPERATING_GMODE) + flag = IEEE80211_RATE_MANDATORY_G; + else + flag = IEEE80211_RATE_MANDATORY_B; + if (r->flags & flag) + mrate = r->bitrate; + break; + } + case IEEE80211_BAND_5GHZ: + if (r->flags & IEEE80211_RATE_MANDATORY_A) + mrate = r->bitrate; + break; + case IEEE80211_NUM_BANDS: + WARN_ON(1); + break; + } + } + if (rate == -1) { + /* No matching basic rate found; use highest suitable mandatory + * PHY rate */ + rate = mrate; + } + + /* Time needed to transmit ACK + * (10 bytes + 4-byte FCS = 112 bits) plus SIFS; rounded up + * to closest integer */ + + dur = ieee80211_frame_duration(local, 10, rate, erp, + tx->sdata->vif.bss_conf.use_short_preamble); + + if (next_frag_len) { + /* Frame is fragmented: duration increases with time needed to + * transmit next fragment plus ACK and 2 x SIFS. 
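As a worked example of the rate selection above, take a 2.4 GHz BSS whose basic rate set is {1, 2, 5.5, 11} Mbit/s (values chosen only for illustration) and a data frame sent at 54 Mbit/s: the loop keeps the highest basic rate that is not faster than the data rate, so rate ends up at 11 Mbit/s and the ACK time is then computed for the 112-bit ACK at that rate, plus SIFS. Only when no basic rate qualifies does the code fall back to mrate, the highest mandatory PHY rate that does not exceed the data rate.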
*/ + dur *= 2; /* ACK + SIFS */ + /* next fragment */ + dur += ieee80211_frame_duration(local, next_frag_len, + txrate->bitrate, erp, + tx->sdata->vif.bss_conf.use_short_preamble); + } + + return cpu_to_le16(dur); +} + +static inline int is_ieee80211_device(struct ieee80211_local *local, + struct net_device *dev) +{ + return local == wdev_priv(dev->ieee80211_ptr); +} + +/* tx handlers */ +static ieee80211_tx_result debug_noinline +ieee80211_tx_h_dynamic_ps(struct ieee80211_tx_data *tx) +{ + struct ieee80211_local *local = tx->local; + struct ieee80211_if_managed *ifmgd; + + /* driver doesn't support power save */ + if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS)) + return TX_CONTINUE; + + /* hardware does dynamic power save */ + if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS) + return TX_CONTINUE; + + /* dynamic power save disabled */ + if (local->hw.conf.dynamic_ps_timeout <= 0) + return TX_CONTINUE; + + /* we are scanning, don't enable power save */ + if (local->scanning) + return TX_CONTINUE; + + if (!local->ps_sdata) + return TX_CONTINUE; + + /* No point if we're going to suspend */ + if (local->quiescing) + return TX_CONTINUE; + + /* dynamic ps is supported only in managed mode */ + if (tx->sdata->vif.type != NL80211_IFTYPE_STATION) + return TX_CONTINUE; + + ifmgd = &tx->sdata->u.mgd; + + /* + * Don't wakeup from power save if u-apsd is enabled, voip ac has + * u-apsd enabled and the frame is in voip class. This effectively + * means that even if all access categories have u-apsd enabled, in + * practise u-apsd is only used with the voip ac. This is a + * workaround for the case when received voip class packets do not + * have correct qos tag for some reason, due the network or the + * peer application. + * + * Note: local->uapsd_queues access is racy here. If the value is + * changed via debugfs, user needs to reassociate manually to have + * everything in sync. + */ + if ((ifmgd->flags & IEEE80211_STA_UAPSD_ENABLED) + && (local->uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO) + && skb_get_queue_mapping(tx->skb) == 0) + return TX_CONTINUE; + + if (local->hw.conf.flags & IEEE80211_CONF_PS) { + ieee80211_stop_queues_by_reason(&local->hw, + IEEE80211_QUEUE_STOP_REASON_PS); + ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED; + ieee80211_queue_work(&local->hw, + &local->dynamic_ps_disable_work); + } + + /* Don't restart the timer if we're not disassociated */ + if (!ifmgd->associated) + return TX_CONTINUE; + + mod_timer(&local->dynamic_ps_timer, jiffies + + msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout)); + + return TX_CONTINUE; +} + +static ieee80211_tx_result debug_noinline +ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx) +{ + + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); + u32 sta_flags; + + if (unlikely(info->flags & IEEE80211_TX_CTL_INJECTED)) + return TX_CONTINUE; + + if (unlikely(test_bit(SCAN_SW_SCANNING, &tx->local->scanning)) && + test_bit(SDATA_STATE_OFFCHANNEL, &tx->sdata->state) && + !ieee80211_is_probe_req(hdr->frame_control) && + !ieee80211_is_nullfunc(hdr->frame_control)) + /* + * When software scanning only nullfunc frames (to notify + * the sleep state to the AP) and probe requests (for the + * active scan) are allowed, all other frames should not be + * sent and we should not get here, but if we do + * nonetheless, drop them to avoid sending them + * off-channel. See the link below and + * ieee80211_start_scan() for more. 
+ * + * http://article.gmane.org/gmane.linux.kernel.wireless.general/30089 + */ + return TX_DROP; + + if (tx->sdata->vif.type == NL80211_IFTYPE_WDS) + return TX_CONTINUE; + + if (tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT) + return TX_CONTINUE; + + if (tx->flags & IEEE80211_TX_PS_BUFFERED) + return TX_CONTINUE; + + sta_flags = tx->sta ? get_sta_flags(tx->sta) : 0; + + if (likely(tx->flags & IEEE80211_TX_UNICAST)) { + if (unlikely(!(sta_flags & WLAN_STA_ASSOC) && + tx->sdata->vif.type != NL80211_IFTYPE_ADHOC && + ieee80211_is_data(hdr->frame_control))) { +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + printk(KERN_DEBUG "%s: dropped data frame to not " + "associated station %pM\n", + tx->sdata->name, hdr->addr1); +#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ + I802_DEBUG_INC(tx->local->tx_handlers_drop_not_assoc); + return TX_DROP; + } + } else { + if (unlikely(ieee80211_is_data(hdr->frame_control) && + tx->local->num_sta == 0 && + tx->sdata->vif.type != NL80211_IFTYPE_ADHOC)) { + /* + * No associated STAs - no need to send multicast + * frames. + */ + return TX_DROP; + } + return TX_CONTINUE; + } + + return TX_CONTINUE; +} + +/* This function is called whenever the AP is about to exceed the maximum limit + * of buffered frames for power saving STAs. This situation should not really + * happen often during normal operation, so dropping the oldest buffered packet + * from each queue should be OK to make some room for new frames. */ +static void purge_old_ps_buffers(struct ieee80211_local *local) +{ + int total = 0, purged = 0; + struct sk_buff *skb; + struct ieee80211_sub_if_data *sdata; + struct sta_info *sta; + + /* + * virtual interfaces are protected by RCU + */ + rcu_read_lock(); + + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + struct ieee80211_if_ap *ap; + if (sdata->vif.type != NL80211_IFTYPE_AP) + continue; + ap = &sdata->u.ap; + skb = skb_dequeue(&ap->ps_bc_buf); + if (skb) { + purged++; + dev_kfree_skb(skb); + } + total += skb_queue_len(&ap->ps_bc_buf); + } + + list_for_each_entry_rcu(sta, &local->sta_list, list) { + skb = skb_dequeue(&sta->ps_tx_buf); + if (skb) { + purged++; + dev_kfree_skb(skb); + } + total += skb_queue_len(&sta->ps_tx_buf); + } + + rcu_read_unlock(); + + local->total_ps_buffered = total; +#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG + wiphy_debug(local->hw.wiphy, "PS buffers full - purged %d frames\n", + purged); +#endif +} + +static ieee80211_tx_result +ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx) +{ + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data; + + /* + * broadcast/multicast frame + * + * If any of the associated stations is in power save mode, + * the frame is buffered to be sent after DTIM beacon frame. + * This is done either by the hardware or us. 
+ */ + + /* powersaving STAs only in AP/VLAN mode */ + if (!tx->sdata->bss) + return TX_CONTINUE; + + /* no buffering for ordered frames */ + if (ieee80211_has_order(hdr->frame_control)) + return TX_CONTINUE; + + /* no stations in PS mode */ + if (!atomic_read(&tx->sdata->bss->num_sta_ps)) + return TX_CONTINUE; + + info->flags |= IEEE80211_TX_CTL_SEND_AFTER_DTIM; + + /* device releases frame after DTIM beacon */ + if (!(tx->local->hw.flags & IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING)) + return TX_CONTINUE; + + /* buffered in mac80211 */ + if (tx->local->total_ps_buffered >= TOTAL_MAX_TX_BUFFER) + purge_old_ps_buffers(tx->local); + + if (skb_queue_len(&tx->sdata->bss->ps_bc_buf) >= AP_MAX_BC_BUFFER) { +#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG + if (net_ratelimit()) + printk(KERN_DEBUG "%s: BC TX buffer full - dropping the oldest frame\n", + tx->sdata->name); +#endif + dev_kfree_skb(skb_dequeue(&tx->sdata->bss->ps_bc_buf)); + } else + tx->local->total_ps_buffered++; + + skb_queue_tail(&tx->sdata->bss->ps_bc_buf, tx->skb); + + return TX_QUEUED; +} + +static int ieee80211_use_mfp(__le16 fc, struct sta_info *sta, + struct sk_buff *skb) +{ + if (!ieee80211_is_mgmt(fc)) + return 0; + + if (sta == NULL || !test_sta_flags(sta, WLAN_STA_MFP)) + return 0; + + if (!ieee80211_is_robust_mgmt_frame((struct ieee80211_hdr *) + skb->data)) + return 0; + + return 1; +} + +static ieee80211_tx_result +ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx) +{ + struct sta_info *sta = tx->sta; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data; + struct ieee80211_local *local = tx->local; + u32 staflags; + + if (unlikely(!sta || + ieee80211_is_probe_resp(hdr->frame_control) || + ieee80211_is_auth(hdr->frame_control) || + ieee80211_is_assoc_resp(hdr->frame_control) || + ieee80211_is_reassoc_resp(hdr->frame_control))) + return TX_CONTINUE; + + staflags = get_sta_flags(sta); + + if (unlikely((staflags & (WLAN_STA_PS_STA | WLAN_STA_PS_DRIVER)) && + !(info->flags & IEEE80211_TX_CTL_PSPOLL_RESPONSE))) { +#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG + printk(KERN_DEBUG "STA %pM aid %d: PS buffer (entries " + "before %d)\n", + sta->sta.addr, sta->sta.aid, + skb_queue_len(&sta->ps_tx_buf)); +#endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */ + if (tx->local->total_ps_buffered >= TOTAL_MAX_TX_BUFFER) + purge_old_ps_buffers(tx->local); + if (skb_queue_len(&sta->ps_tx_buf) >= STA_MAX_TX_BUFFER) { + struct sk_buff *old = skb_dequeue(&sta->ps_tx_buf); +#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG + if (net_ratelimit()) { + printk(KERN_DEBUG "%s: STA %pM TX " + "buffer full - dropping oldest frame\n", + tx->sdata->name, sta->sta.addr); + } +#endif + dev_kfree_skb(old); + } else + tx->local->total_ps_buffered++; + + /* + * Queue frame to be sent after STA wakes up/polls, + * but don't set the TIM bit if the driver is blocking + * wakeup or poll response transmissions anyway. 
+ */ + if (skb_queue_empty(&sta->ps_tx_buf) && + !(staflags & WLAN_STA_PS_DRIVER)) + sta_info_set_tim_bit(sta); + + info->control.jiffies = jiffies; + info->control.vif = &tx->sdata->vif; + info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING; + skb_queue_tail(&sta->ps_tx_buf, tx->skb); + + if (!timer_pending(&local->sta_cleanup)) + mod_timer(&local->sta_cleanup, + round_jiffies(jiffies + + STA_INFO_CLEANUP_INTERVAL)); + + return TX_QUEUED; + } +#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG + else if (unlikely(staflags & WLAN_STA_PS_STA)) { + printk(KERN_DEBUG "%s: STA %pM in PS mode, but pspoll " + "set -> send frame\n", tx->sdata->name, + sta->sta.addr); + } +#endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */ + + return TX_CONTINUE; +} + +static ieee80211_tx_result debug_noinline +ieee80211_tx_h_ps_buf(struct ieee80211_tx_data *tx) +{ + if (unlikely(tx->flags & IEEE80211_TX_PS_BUFFERED)) + return TX_CONTINUE; + + if (tx->flags & IEEE80211_TX_UNICAST) + return ieee80211_tx_h_unicast_ps_buf(tx); + else + return ieee80211_tx_h_multicast_ps_buf(tx); +} + +static ieee80211_tx_result debug_noinline +ieee80211_tx_h_check_control_port_protocol(struct ieee80211_tx_data *tx) +{ + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); + + if (unlikely(tx->sdata->control_port_protocol == tx->skb->protocol && + tx->sdata->control_port_no_encrypt)) + info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + + return TX_CONTINUE; +} + +static ieee80211_tx_result debug_noinline +ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) +{ + struct ieee80211_key *key = NULL; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data; + + if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT)) + tx->key = NULL; + else if (tx->sta && (key = rcu_dereference(tx->sta->ptk))) + tx->key = key; + else if (ieee80211_is_mgmt(hdr->frame_control) && + is_multicast_ether_addr(hdr->addr1) && + ieee80211_is_robust_mgmt_frame(hdr) && + (key = rcu_dereference(tx->sdata->default_mgmt_key))) + tx->key = key; + else if (is_multicast_ether_addr(hdr->addr1) && + (key = rcu_dereference(tx->sdata->default_multicast_key))) + tx->key = key; + else if (!is_multicast_ether_addr(hdr->addr1) && + (key = rcu_dereference(tx->sdata->default_unicast_key))) + tx->key = key; + else if (tx->sdata->drop_unencrypted && + (tx->skb->protocol != tx->sdata->control_port_protocol) && + !(info->flags & IEEE80211_TX_CTL_INJECTED) && + (!ieee80211_is_robust_mgmt_frame(hdr) || + (ieee80211_is_action(hdr->frame_control) && + tx->sta && test_sta_flags(tx->sta, WLAN_STA_MFP)))) { + I802_DEBUG_INC(tx->local->tx_handlers_drop_unencrypted); + return TX_DROP; + } else + tx->key = NULL; + + if (tx->key) { + bool skip_hw = false; + + tx->key->tx_rx_count++; + /* TODO: add threshold stuff again */ + + switch (tx->key->conf.cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: + if (ieee80211_is_auth(hdr->frame_control)) + break; + case WLAN_CIPHER_SUITE_TKIP: + if (!ieee80211_is_data_present(hdr->frame_control)) + tx->key = NULL; + break; + case WLAN_CIPHER_SUITE_CCMP: + if (!ieee80211_is_data_present(hdr->frame_control) && + !ieee80211_use_mfp(hdr->frame_control, tx->sta, + tx->skb)) + tx->key = NULL; + else + skip_hw = (tx->key->conf.flags & + IEEE80211_KEY_FLAG_SW_MGMT) && + ieee80211_is_mgmt(hdr->frame_control); + break; + case WLAN_CIPHER_SUITE_AES_CMAC: + if (!ieee80211_is_mgmt(hdr->frame_control)) + tx->key = NULL; + break; + } + + if (!skip_hw && tx->key && + tx->key->flags 
& KEY_FLAG_UPLOADED_TO_HARDWARE) + info->control.hw_key = &tx->key->conf; + } + + return TX_CONTINUE; +} + +static ieee80211_tx_result debug_noinline +ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) +{ + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); + struct ieee80211_hdr *hdr = (void *)tx->skb->data; + struct ieee80211_supported_band *sband; + struct ieee80211_rate *rate; + int i; + u32 len; + bool inval = false, rts = false, short_preamble = false; + struct ieee80211_tx_rate_control txrc; + u32 sta_flags; + + memset(&txrc, 0, sizeof(txrc)); + + sband = tx->local->hw.wiphy->bands[tx->channel->band]; + + len = min_t(u32, tx->skb->len + FCS_LEN, + tx->local->hw.wiphy->frag_threshold); + + /* set up the tx rate control struct we give the RC algo */ + txrc.hw = local_to_hw(tx->local); + txrc.sband = sband; + txrc.bss_conf = &tx->sdata->vif.bss_conf; + txrc.skb = tx->skb; + txrc.reported_rate.idx = -1; + txrc.rate_idx_mask = tx->sdata->rc_rateidx_mask[tx->channel->band]; + if (txrc.rate_idx_mask == (1 << sband->n_bitrates) - 1) + txrc.max_rate_idx = -1; + else + txrc.max_rate_idx = fls(txrc.rate_idx_mask) - 1; + txrc.bss = (tx->sdata->vif.type == NL80211_IFTYPE_AP || + tx->sdata->vif.type == NL80211_IFTYPE_ADHOC); + + /* set up RTS protection if desired */ + if (len > tx->local->hw.wiphy->rts_threshold) { + txrc.rts = rts = true; + } + + /* + * Use short preamble if the BSS can handle it, but not for + * management frames unless we know the receiver can handle + * that -- the management frame might be to a station that + * just wants a probe response. + */ + if (tx->sdata->vif.bss_conf.use_short_preamble && + (ieee80211_is_data(hdr->frame_control) || + (tx->sta && test_sta_flags(tx->sta, WLAN_STA_SHORT_PREAMBLE)))) + txrc.short_preamble = short_preamble = true; + + sta_flags = tx->sta ? get_sta_flags(tx->sta) : 0; + + /* + * Lets not bother rate control if we're associated and cannot + * talk to the sta. This should not happen. + */ + if (WARN(test_bit(SCAN_SW_SCANNING, &tx->local->scanning) && + (sta_flags & WLAN_STA_ASSOC) && + !rate_usable_index_exists(sband, &tx->sta->sta), + "%s: Dropped data frame as no usable bitrate found while " + "scanning and associated. Target station: " + "%pM on %d GHz band\n", + tx->sdata->name, hdr->addr1, + tx->channel->band ? 5 : 2)) + return TX_DROP; + + /* + * If we're associated with the sta at this point we know we can at + * least send the frame at the lowest bit rate. + */ + rate_control_get_rate(tx->sdata, tx->sta, &txrc); + + if (unlikely(info->control.rates[0].idx < 0)) + return TX_DROP; + + if (txrc.reported_rate.idx < 0) { + txrc.reported_rate = info->control.rates[0]; + if (tx->sta && ieee80211_is_data(hdr->frame_control)) + tx->sta->last_tx_rate = txrc.reported_rate; + } else if (tx->sta) + tx->sta->last_tx_rate = txrc.reported_rate; + + if (unlikely(!info->control.rates[0].count)) + info->control.rates[0].count = 1; + + if (WARN_ON_ONCE((info->control.rates[0].count > 1) && + (info->flags & IEEE80211_TX_CTL_NO_ACK))) + info->control.rates[0].count = 1; + + if (is_multicast_ether_addr(hdr->addr1)) { + /* + * XXX: verify the rate is in the basic rateset + */ + return TX_CONTINUE; + } + + /* + * set up the RTS/CTS rate as the fastest basic rate + * that is not faster than the data rate + * + * XXX: Should this check all retry rates? 
+ */ + if (!(info->control.rates[0].flags & IEEE80211_TX_RC_MCS)) { + s8 baserate = 0; + + rate = &sband->bitrates[info->control.rates[0].idx]; + + for (i = 0; i < sband->n_bitrates; i++) { + /* must be a basic rate */ + if (!(tx->sdata->vif.bss_conf.basic_rates & BIT(i))) + continue; + /* must not be faster than the data rate */ + if (sband->bitrates[i].bitrate > rate->bitrate) + continue; + /* maximum */ + if (sband->bitrates[baserate].bitrate < + sband->bitrates[i].bitrate) + baserate = i; + } + + info->control.rts_cts_rate_idx = baserate; + } + + for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { + /* + * make sure there's no valid rate following + * an invalid one, just in case drivers don't + * take the API seriously to stop at -1. + */ + if (inval) { + info->control.rates[i].idx = -1; + continue; + } + if (info->control.rates[i].idx < 0) { + inval = true; + continue; + } + + /* + * For now assume MCS is already set up correctly, this + * needs to be fixed. + */ + if (info->control.rates[i].flags & IEEE80211_TX_RC_MCS) { + WARN_ON(info->control.rates[i].idx > 76); + continue; + } + + /* set up RTS protection if desired */ + if (rts) + info->control.rates[i].flags |= + IEEE80211_TX_RC_USE_RTS_CTS; + + /* RC is busted */ + if (WARN_ON_ONCE(info->control.rates[i].idx >= + sband->n_bitrates)) { + info->control.rates[i].idx = -1; + continue; + } + + rate = &sband->bitrates[info->control.rates[i].idx]; + + /* set up short preamble */ + if (short_preamble && + rate->flags & IEEE80211_RATE_SHORT_PREAMBLE) + info->control.rates[i].flags |= + IEEE80211_TX_RC_USE_SHORT_PREAMBLE; + + /* set up G protection */ + if (!rts && tx->sdata->vif.bss_conf.use_cts_prot && + rate->flags & IEEE80211_RATE_ERP_G) + info->control.rates[i].flags |= + IEEE80211_TX_RC_USE_CTS_PROTECT; + } + + return TX_CONTINUE; +} + +static ieee80211_tx_result debug_noinline +ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) +{ + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data; + u16 *seq; + u8 *qc; + int tid; + + /* + * Packet injection may want to control the sequence + * number, if we have no matching interface then we + * neither assign one ourselves nor ask the driver to. + */ + if (unlikely(info->control.vif->type == NL80211_IFTYPE_MONITOR)) + return TX_CONTINUE; + + if (unlikely(ieee80211_is_ctl(hdr->frame_control))) + return TX_CONTINUE; + + if (ieee80211_hdrlen(hdr->frame_control) < 24) + return TX_CONTINUE; + + /* + * Anything but QoS data that has a sequence number field + * (is long enough) gets a sequence number from the global + * counter. + */ + if (!ieee80211_is_data_qos(hdr->frame_control)) { + /* driver should assign sequence number */ + info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; + /* for pure STA mode without beacons, we can do it */ + hdr->seq_ctrl = cpu_to_le16(tx->sdata->sequence_number); + tx->sdata->sequence_number += 0x10; + return TX_CONTINUE; + } + + /* + * This should be true for injected/management frames only, for + * management frames we have set the IEEE80211_TX_CTL_ASSIGN_SEQ + * above since they are not QoS-data frames. + */ + if (!tx->sta) + return TX_CONTINUE; + + /* include per-STA, per-TID sequence counter */ + + qc = ieee80211_get_qos_ctl(hdr); + tid = *qc & IEEE80211_QOS_CTL_TID_MASK; + seq = &tx->sta->tid_seq[tid]; + + hdr->seq_ctrl = cpu_to_le16(*seq); + + /* Increase the sequence number. 
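The "+ 0x10" step just below works because of how seq_ctrl is laid out: bits 0-3 carry the fragment number and bits 4-15 the sequence number, with the standard masks IEEE80211_SCTL_SEQ = 0xFFF0 and IEEE80211_SCTL_FRAG = 0x000F. A short illustration of the arithmetic:

	u16 seq;

	seq = (0x0FF0 + 0x10) & 0xFFF0;	/* 0x1000: sequence number 255 -> 256 */
	seq = (0xFFF0 + 0x10) & 0xFFF0;	/* 0x0000: sequence number 4095 wraps to 0 */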
*/ + *seq = (*seq + 0x10) & IEEE80211_SCTL_SEQ; + + return TX_CONTINUE; +} + +static int ieee80211_fragment(struct ieee80211_local *local, + struct sk_buff *skb, int hdrlen, + int frag_threshold) +{ + struct sk_buff *tail = skb, *tmp; + int per_fragm = frag_threshold - hdrlen - FCS_LEN; + int pos = hdrlen + per_fragm; + int rem = skb->len - hdrlen - per_fragm; + + if (WARN_ON(rem < 0)) + return -EINVAL; + + while (rem) { + int fraglen = per_fragm; + + if (fraglen > rem) + fraglen = rem; + rem -= fraglen; + tmp = dev_alloc_skb(local->tx_headroom + + frag_threshold + + IEEE80211_ENCRYPT_HEADROOM + + IEEE80211_ENCRYPT_TAILROOM); + if (!tmp) + return -ENOMEM; + tail->next = tmp; + tail = tmp; + skb_reserve(tmp, local->tx_headroom + + IEEE80211_ENCRYPT_HEADROOM); + /* copy control information */ + memcpy(tmp->cb, skb->cb, sizeof(tmp->cb)); + skb_copy_queue_mapping(tmp, skb); + tmp->priority = skb->priority; + tmp->dev = skb->dev; + + /* copy header and data */ + memcpy(skb_put(tmp, hdrlen), skb->data, hdrlen); + memcpy(skb_put(tmp, fraglen), skb->data + pos, fraglen); + + pos += fraglen; + } + + skb->len = hdrlen + per_fragm; + return 0; +} + +static ieee80211_tx_result debug_noinline +ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx) +{ + struct sk_buff *skb = tx->skb; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_hdr *hdr = (void *)skb->data; + int frag_threshold = tx->local->hw.wiphy->frag_threshold; + int hdrlen; + int fragnum; + + if (!(tx->flags & IEEE80211_TX_FRAGMENTED)) + return TX_CONTINUE; + + /* + * Warn when submitting a fragmented A-MPDU frame and drop it. + * This scenario is handled in ieee80211_tx_prepare but extra + * caution taken here as fragmented ampdu may cause Tx stop. + */ + if (WARN_ON(info->flags & IEEE80211_TX_CTL_AMPDU)) + return TX_DROP; + + hdrlen = ieee80211_hdrlen(hdr->frame_control); + + /* internal error, why is TX_FRAGMENTED set? */ + if (WARN_ON(skb->len + FCS_LEN <= frag_threshold)) + return TX_DROP; + + /* + * Now fragment the frame. This will allocate all the fragments and + * chain them (using skb as the first fragment) to skb->next. + * During transmission, we will remove the successfully transmitted + * fragments from this list. When the low-level driver rejects one + * of the fragments then we will simply pretend to accept the skb + * but store it away as pending. + */ + if (ieee80211_fragment(tx->local, skb, hdrlen, frag_threshold)) + return TX_DROP; + + /* update duration/seq/flags of fragments */ + fragnum = 0; + do { + int next_len; + const __le16 morefrags = cpu_to_le16(IEEE80211_FCTL_MOREFRAGS); + + hdr = (void *)skb->data; + info = IEEE80211_SKB_CB(skb); + + if (skb->next) { + hdr->frame_control |= morefrags; + next_len = skb->next->len; + /* + * No multi-rate retries for fragmented frames, that + * would completely throw off the NAV at other STAs. 
+ */ + info->control.rates[1].idx = -1; + info->control.rates[2].idx = -1; + info->control.rates[3].idx = -1; + info->control.rates[4].idx = -1; + BUILD_BUG_ON(IEEE80211_TX_MAX_RATES != 5); + info->flags &= ~IEEE80211_TX_CTL_RATE_CTRL_PROBE; + } else { + hdr->frame_control &= ~morefrags; + next_len = 0; + } + hdr->duration_id = ieee80211_duration(tx, 0, next_len); + hdr->seq_ctrl |= cpu_to_le16(fragnum & IEEE80211_SCTL_FRAG); + fragnum++; + } while ((skb = skb->next)); + + return TX_CONTINUE; +} + +static ieee80211_tx_result debug_noinline +ieee80211_tx_h_stats(struct ieee80211_tx_data *tx) +{ + struct sk_buff *skb = tx->skb; + + if (!tx->sta) + return TX_CONTINUE; + + tx->sta->tx_packets++; + do { + tx->sta->tx_fragments++; + tx->sta->tx_bytes += skb->len; + } while ((skb = skb->next)); + + return TX_CONTINUE; +} + +static ieee80211_tx_result debug_noinline +ieee80211_tx_h_encrypt(struct ieee80211_tx_data *tx) +{ + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); + + if (!tx->key) + return TX_CONTINUE; + + switch (tx->key->conf.cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: + return ieee80211_crypto_wep_encrypt(tx); + case WLAN_CIPHER_SUITE_TKIP: + return ieee80211_crypto_tkip_encrypt(tx); + case WLAN_CIPHER_SUITE_CCMP: + return ieee80211_crypto_ccmp_encrypt(tx); + case WLAN_CIPHER_SUITE_AES_CMAC: + return ieee80211_crypto_aes_cmac_encrypt(tx); + default: + /* handle hw-only algorithm */ + if (info->control.hw_key) { + ieee80211_tx_set_protected(tx); + return TX_CONTINUE; + } + break; + + } + + return TX_DROP; +} + +static ieee80211_tx_result debug_noinline +ieee80211_tx_h_calculate_duration(struct ieee80211_tx_data *tx) +{ + struct sk_buff *skb = tx->skb; + struct ieee80211_hdr *hdr; + int next_len; + bool group_addr; + + do { + hdr = (void *) skb->data; + if (unlikely(ieee80211_is_pspoll(hdr->frame_control))) + break; /* must not overwrite AID */ + next_len = skb->next ? 
skb->next->len : 0; + group_addr = is_multicast_ether_addr(hdr->addr1); + + hdr->duration_id = + ieee80211_duration(tx, group_addr, next_len); + } while ((skb = skb->next)); + + return TX_CONTINUE; +} + +/* actual transmit path */ + +/* + * deal with packet injection down monitor interface + * with Radiotap Header -- only called for monitor mode interface + */ +static bool __ieee80211_parse_tx_radiotap(struct ieee80211_tx_data *tx, + struct sk_buff *skb) +{ + /* + * this is the moment to interpret and discard the radiotap header that + * must be at the start of the packet injected in Monitor mode + * + * Need to take some care with endian-ness since radiotap + * args are little-endian + */ + + struct ieee80211_radiotap_iterator iterator; + struct ieee80211_radiotap_header *rthdr = + (struct ieee80211_radiotap_header *) skb->data; + bool hw_frag; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + int ret = ieee80211_radiotap_iterator_init(&iterator, rthdr, skb->len, + NULL); + + info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + tx->flags &= ~IEEE80211_TX_FRAGMENTED; + + /* packet is fragmented in HW if we have a non-NULL driver callback */ + hw_frag = (tx->local->ops->set_frag_threshold != NULL); + + /* + * for every radiotap entry that is present + * (ieee80211_radiotap_iterator_next returns -ENOENT when no more + * entries present, or -EINVAL on error) + */ + + while (!ret) { + ret = ieee80211_radiotap_iterator_next(&iterator); + + if (ret) + continue; + + /* see if this argument is something we can use */ + switch (iterator.this_arg_index) { + /* + * You must take care when dereferencing iterator.this_arg + * for multibyte types... the pointer is not aligned. Use + * get_unaligned((type *)iterator.this_arg) to dereference + * iterator.this_arg for type "type" safely on all arches. + */ + case IEEE80211_RADIOTAP_FLAGS: + if (*iterator.this_arg & IEEE80211_RADIOTAP_F_FCS) { + /* + * this indicates that the skb we have been + * handed has the 32-bit FCS CRC at the end... + * we should react to that by snipping it off + * because it will be recomputed and added + * on transmission + */ + if (skb->len < (iterator._max_length + FCS_LEN)) + return false; + + skb_trim(skb, skb->len - FCS_LEN); + } + if (*iterator.this_arg & IEEE80211_RADIOTAP_F_WEP) + info->flags &= ~IEEE80211_TX_INTFL_DONT_ENCRYPT; + if ((*iterator.this_arg & IEEE80211_RADIOTAP_F_FRAG) && + !hw_frag) + tx->flags |= IEEE80211_TX_FRAGMENTED; + break; + + /* + * Please update the file + * Documentation/networking/mac80211-injection.txt + * when parsing new fields here. 
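/*
 * A tiny illustration of the "unaligned little-endian argument" caveat
 * noted in the radiotap parser above: multibyte radiotap fields are
 * little-endian and need not be naturally aligned, so they should be
 * assembled byte by byte (the kernel's get_unaligned_le16()/le32()
 * helpers do the equivalent). Not tied to the iterator API; purely
 * illustrative.
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t read_le16(const uint8_t *p)
{
	return (uint16_t)(p[0] | (p[1] << 8));		/* LSB first */
}

int main(void)
{
	/* a u16 field living at an odd, unaligned offset */
	const uint8_t buf[] = { 0x00, 0x18, 0x00 };

	printf("0x%04x\n", (unsigned)read_le16(buf + 1));	/* 0x0018 */
	return 0;
}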
+ */ + + default: + break; + } + } + + if (ret != -ENOENT) /* ie, if we didn't simply run out of fields */ + return false; + + /* + * remove the radiotap header + * iterator->_max_length was sanity-checked against + * skb->len by iterator init + */ + skb_pull(skb, iterator._max_length); + + return true; +} + +static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx, + struct sk_buff *skb, + struct ieee80211_tx_info *info, + struct tid_ampdu_tx *tid_tx, + int tid) +{ + bool queued = false; + + if (test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) { + info->flags |= IEEE80211_TX_CTL_AMPDU; + } else if (test_bit(HT_AGG_STATE_WANT_START, &tid_tx->state)) { + /* + * nothing -- this aggregation session is being started + * but that might still fail with the driver + */ + } else { + spin_lock(&tx->sta->lock); + /* + * Need to re-check now, because we may get here + * + * 1) in the window during which the setup is actually + * already done, but not marked yet because not all + * packets are spliced over to the driver pending + * queue yet -- if this happened we acquire the lock + * either before or after the splice happens, but + * need to recheck which of these cases happened. + * + * 2) during session teardown, if the OPERATIONAL bit + * was cleared due to the teardown but the pointer + * hasn't been assigned NULL yet (or we loaded it + * before it was assigned) -- in this case it may + * now be NULL which means we should just let the + * packet pass through because splicing the frames + * back is already done. + */ + tid_tx = rcu_dereference_protected_tid_tx(tx->sta, tid); + + if (!tid_tx) { + /* do nothing, let packet pass through */ + } else if (test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) { + info->flags |= IEEE80211_TX_CTL_AMPDU; + } else { + queued = true; + info->control.vif = &tx->sdata->vif; + info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING; + __skb_queue_tail(&tid_tx->pending, skb); + } + spin_unlock(&tx->sta->lock); + } + + return queued; +} + +/* + * initialises @tx + */ +static ieee80211_tx_result +ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, + struct ieee80211_tx_data *tx, + struct sk_buff *skb) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_hdr *hdr; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + int hdrlen, tid; + u8 *qc; + + memset(tx, 0, sizeof(*tx)); + tx->skb = skb; + tx->local = local; + tx->sdata = sdata; + tx->channel = local->hw.conf.channel; + /* + * Set this flag (used below to indicate "automatic fragmentation"), + * it will be cleared/left by radiotap as desired. + * Only valid when fragmentation is done by the stack. + */ + if (!local->ops->set_frag_threshold) + tx->flags |= IEEE80211_TX_FRAGMENTED; + + /* process and remove the injection radiotap header */ + if (unlikely(info->flags & IEEE80211_TX_INTFL_HAS_RADIOTAP)) { + if (!__ieee80211_parse_tx_radiotap(tx, skb)) + return TX_DROP; + + /* + * __ieee80211_parse_tx_radiotap has now removed + * the radiotap header that was present and pre-filled + * 'tx' with tx control information. + */ + info->flags &= ~IEEE80211_TX_INTFL_HAS_RADIOTAP; + } + + /* + * If this flag is set to true anywhere, and we get here, + * we are doing the needed processing, so remove the flag + * now. 
+ */ + info->flags &= ~IEEE80211_TX_INTFL_NEED_TXPROCESSING; + + hdr = (struct ieee80211_hdr *) skb->data; + + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { + tx->sta = rcu_dereference(sdata->u.vlan.sta); + if (!tx->sta && sdata->dev->ieee80211_ptr->use_4addr) + return TX_DROP; + } else if (info->flags & IEEE80211_TX_CTL_INJECTED || + tx->sdata->control_port_protocol == tx->skb->protocol) { + tx->sta = sta_info_get_bss(sdata, hdr->addr1); + } + if (!tx->sta) + tx->sta = sta_info_get(sdata, hdr->addr1); + + if (tx->sta && ieee80211_is_data_qos(hdr->frame_control) && + (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION)) { + struct tid_ampdu_tx *tid_tx; + + qc = ieee80211_get_qos_ctl(hdr); + tid = *qc & IEEE80211_QOS_CTL_TID_MASK; + + tid_tx = rcu_dereference(tx->sta->ampdu_mlme.tid_tx[tid]); + if (tid_tx) { + bool queued; + + queued = ieee80211_tx_prep_agg(tx, skb, info, + tid_tx, tid); + + if (unlikely(queued)) + return TX_QUEUED; + } + } + + if (is_multicast_ether_addr(hdr->addr1)) { + tx->flags &= ~IEEE80211_TX_UNICAST; + info->flags |= IEEE80211_TX_CTL_NO_ACK; + } else { + tx->flags |= IEEE80211_TX_UNICAST; + if (unlikely(local->wifi_wme_noack_test)) + info->flags |= IEEE80211_TX_CTL_NO_ACK; + else + info->flags &= ~IEEE80211_TX_CTL_NO_ACK; + } + + if (tx->flags & IEEE80211_TX_FRAGMENTED) { + if ((tx->flags & IEEE80211_TX_UNICAST) && + skb->len + FCS_LEN > local->hw.wiphy->frag_threshold && + !(info->flags & IEEE80211_TX_CTL_AMPDU)) + tx->flags |= IEEE80211_TX_FRAGMENTED; + else + tx->flags &= ~IEEE80211_TX_FRAGMENTED; + } + + if (!tx->sta) + info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT; + else if (test_and_clear_sta_flags(tx->sta, WLAN_STA_CLEAR_PS_FILT)) + info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT; + + hdrlen = ieee80211_hdrlen(hdr->frame_control); + if (skb->len > hdrlen + sizeof(rfc1042_header) + 2) { + u8 *pos = &skb->data[hdrlen + sizeof(rfc1042_header)]; + tx->ethertype = (pos[0] << 8) | pos[1]; + } + info->flags |= IEEE80211_TX_CTL_FIRST_FRAGMENT; + + return TX_CONTINUE; +} + +/* + * Returns false if the frame couldn't be transmitted but was queued instead. + */ +static bool __ieee80211_tx(struct ieee80211_local *local, struct sk_buff **skbp, + struct sta_info *sta, bool txpending) +{ + struct sk_buff *skb = *skbp, *next; + struct ieee80211_tx_info *info; + struct ieee80211_sub_if_data *sdata; + unsigned long flags; + int len; + bool fragm = false; + + while (skb) { + int q = skb_get_queue_mapping(skb); + __le16 fc; + + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + if (local->queue_stop_reasons[q] || + (!txpending && !skb_queue_empty(&local->pending[q]))) { + /* + * Since queue is stopped, queue up frames for later + * transmission from the tx-pending tasklet when the + * queue is woken again. + */ + + do { + next = skb->next; + skb->next = NULL; + /* + * NB: If txpending is true, next must already + * be NULL since we must've gone through this + * loop before already; therefore we can just + * queue the frame to the head without worrying + * about reordering of fragments. 
+ */ + if (unlikely(txpending)) + __skb_queue_head(&local->pending[q], + skb); + else + __skb_queue_tail(&local->pending[q], + skb); + } while ((skb = next)); + + spin_unlock_irqrestore(&local->queue_stop_reason_lock, + flags); + return false; + } + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); + + info = IEEE80211_SKB_CB(skb); + + if (fragm) + info->flags &= ~(IEEE80211_TX_CTL_CLEAR_PS_FILT | + IEEE80211_TX_CTL_FIRST_FRAGMENT); + + next = skb->next; + len = skb->len; + + if (next) + info->flags |= IEEE80211_TX_CTL_MORE_FRAMES; + + sdata = vif_to_sdata(info->control.vif); + + switch (sdata->vif.type) { + case NL80211_IFTYPE_MONITOR: + info->control.vif = NULL; + break; + case NL80211_IFTYPE_AP_VLAN: + info->control.vif = &container_of(sdata->bss, + struct ieee80211_sub_if_data, u.ap)->vif; + break; + default: + /* keep */ + break; + } + + if (sta && sta->uploaded) + info->control.sta = &sta->sta; + else + info->control.sta = NULL; + + fc = ((struct ieee80211_hdr *)skb->data)->frame_control; + drv_tx(local, skb); + + ieee80211_tpt_led_trig_tx(local, fc, len); + *skbp = skb = next; + ieee80211_led_tx(local, 1); + fragm = true; + } + + return true; +} + +/* + * Invoke TX handlers, return 0 on success and non-zero if the + * frame was dropped or queued. + */ +static int invoke_tx_handlers(struct ieee80211_tx_data *tx) +{ + struct sk_buff *skb = tx->skb; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + ieee80211_tx_result res = TX_DROP; + +#define CALL_TXH(txh) \ + do { \ + res = txh(tx); \ + if (res != TX_CONTINUE) \ + goto txh_done; \ + } while (0) + + CALL_TXH(ieee80211_tx_h_dynamic_ps); + CALL_TXH(ieee80211_tx_h_check_assoc); + CALL_TXH(ieee80211_tx_h_ps_buf); + CALL_TXH(ieee80211_tx_h_check_control_port_protocol); + CALL_TXH(ieee80211_tx_h_select_key); + if (!(tx->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)) + CALL_TXH(ieee80211_tx_h_rate_ctrl); + + if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) + goto txh_done; + + CALL_TXH(ieee80211_tx_h_michael_mic_add); + CALL_TXH(ieee80211_tx_h_sequence); + CALL_TXH(ieee80211_tx_h_fragment); + /* handlers after fragment must be aware of tx info fragmentation! */ + CALL_TXH(ieee80211_tx_h_stats); + CALL_TXH(ieee80211_tx_h_encrypt); + if (!(tx->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)) + CALL_TXH(ieee80211_tx_h_calculate_duration); +#undef CALL_TXH + + txh_done: + if (unlikely(res == TX_DROP)) { + I802_DEBUG_INC(tx->local->tx_handlers_drop); + while (skb) { + struct sk_buff *next; + + next = skb->next; + dev_kfree_skb(skb); + skb = next; + } + return -1; + } else if (unlikely(res == TX_QUEUED)) { + I802_DEBUG_INC(tx->local->tx_handlers_queued); + return -1; + } + + return 0; +} + +/* + * Returns false if the frame couldn't be transmitted but was queued instead. 
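/*
 * A compact sketch of the handler-chain pattern that the CALL_TXH
 * macro above expands to: every handler returns CONTINUE, DROP or
 * QUEUED, and the first non-CONTINUE result short-circuits the rest
 * of the chain. The names and the two dummy handlers are invented for
 * illustration.
 */
#include <stdio.h>

enum tx_result { TXR_CONTINUE, TXR_DROP, TXR_QUEUED };

struct tx_ctx { int needs_drop; };

static enum tx_result h_check(struct tx_ctx *tx)
{
	return tx->needs_drop ? TXR_DROP : TXR_CONTINUE;
}

static enum tx_result h_sequence(struct tx_ctx *tx)
{
	(void)tx;			/* would stamp a sequence number here */
	return TXR_CONTINUE;
}

static enum tx_result run_chain(struct tx_ctx *tx)
{
	enum tx_result (*const handlers[])(struct tx_ctx *) = {
		h_check, h_sequence,
	};
	enum tx_result res = TXR_CONTINUE;
	size_t i;

	for (i = 0; i < sizeof(handlers) / sizeof(handlers[0]); i++) {
		res = handlers[i](tx);
		if (res != TXR_CONTINUE)
			break;		/* drop or queue ends processing */
	}
	return res;
}

int main(void)
{
	struct tx_ctx ok = { 0 }, bad = { 1 };

	printf("%d %d\n", run_chain(&ok), run_chain(&bad));	/* 0 1 */
	return 0;
}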
+ */ +static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, bool txpending) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_tx_data tx; + ieee80211_tx_result res_prepare; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + bool result = true; + + if (unlikely(skb->len < 10)) { + dev_kfree_skb(skb); + return true; + } + + rcu_read_lock(); + + /* initialises tx */ + res_prepare = ieee80211_tx_prepare(sdata, &tx, skb); + + if (unlikely(res_prepare == TX_DROP)) { + dev_kfree_skb(skb); + goto out; + } else if (unlikely(res_prepare == TX_QUEUED)) { + goto out; + } + + tx.channel = local->hw.conf.channel; + info->band = tx.channel->band; + + if (!invoke_tx_handlers(&tx)) + result = __ieee80211_tx(local, &tx.skb, tx.sta, txpending); + out: + rcu_read_unlock(); + return result; +} + +/* device xmit handlers */ + +static int ieee80211_skb_resize(struct ieee80211_local *local, + struct sk_buff *skb, + int head_need, bool may_encrypt) +{ + int tail_need = 0; + + /* + * This could be optimised, devices that do full hardware + * crypto (including TKIP MMIC) need no tailroom... But we + * have no drivers for such devices currently. + */ + if (may_encrypt) { + tail_need = IEEE80211_ENCRYPT_TAILROOM; + tail_need -= skb_tailroom(skb); + tail_need = max_t(int, tail_need, 0); + } + + if (head_need || tail_need) { + /* Sorry. Can't account for this any more */ + skb_orphan(skb); + } + + if (skb_cloned(skb)) + I802_DEBUG_INC(local->tx_expand_skb_head_cloned); + else if (head_need || tail_need) + I802_DEBUG_INC(local->tx_expand_skb_head); + else + return 0; + + if (pskb_expand_head(skb, head_need, tail_need, GFP_ATOMIC)) { + wiphy_debug(local->hw.wiphy, + "failed to reallocate TX buffer\n"); + return -ENOMEM; + } + + /* update truesize too */ + skb->truesize += head_need + tail_need; + + return 0; +} + +static void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_sub_if_data *tmp_sdata; + int headroom; + bool may_encrypt; + + rcu_read_lock(); + + if (unlikely(sdata->vif.type == NL80211_IFTYPE_MONITOR)) { + int hdrlen; + u16 len_rthdr; + + info->flags |= IEEE80211_TX_CTL_INJECTED | + IEEE80211_TX_INTFL_HAS_RADIOTAP; + + len_rthdr = ieee80211_get_radiotap_len(skb->data); + hdr = (struct ieee80211_hdr *)(skb->data + len_rthdr); + hdrlen = ieee80211_hdrlen(hdr->frame_control); + + /* check the header is complete in the frame */ + if (likely(skb->len >= len_rthdr + hdrlen)) { + /* + * We process outgoing injected frames that have a + * local address we handle as though they are our + * own frames. + * This code here isn't entirely correct, the local + * MAC address is not necessarily enough to find + * the interface to use; for that proper VLAN/WDS + * support we will need a different mechanism. 
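/*
 * A small sketch of the headroom/tailroom accounting performed by
 * ieee80211_skb_resize() above: work out what the driver and crypto
 * path need, subtract what the buffer already has, clamp at zero and
 * only reallocate (pskb_expand_head in the kernel) when something is
 * missing. The headroom/tailroom constants here are invented for the
 * example.
 */
#include <stdio.h>

#define DRIVER_TX_HEADROOM	64
#define CRYPTO_HEADROOM		8	/* e.g. IV / extended IV     */
#define CRYPTO_TAILROOM		18	/* e.g. MIC/ICV plus padding */

static int max_int(int a, int b) { return a > b ? a : b; }

static void resize_needs(int cur_head, int cur_tail, int may_encrypt,
			 int *head_need, int *tail_need)
{
	int want_head = DRIVER_TX_HEADROOM +
			(may_encrypt ? CRYPTO_HEADROOM : 0);
	int want_tail = may_encrypt ? CRYPTO_TAILROOM : 0;

	*head_need = max_int(0, want_head - cur_head);
	*tail_need = max_int(0, want_tail - cur_tail);
}

int main(void)
{
	int head, tail;

	resize_needs(16, 0, 1, &head, &tail);
	printf("head_need=%d tail_need=%d\n", head, tail);	/* 56 and 18 */
	return 0;
}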
+ */ + + list_for_each_entry_rcu(tmp_sdata, &local->interfaces, + list) { + if (!ieee80211_sdata_running(tmp_sdata)) + continue; + if (tmp_sdata->vif.type == + NL80211_IFTYPE_MONITOR || + tmp_sdata->vif.type == + NL80211_IFTYPE_AP_VLAN || + tmp_sdata->vif.type == + NL80211_IFTYPE_WDS) + continue; + if (compare_ether_addr(tmp_sdata->vif.addr, + hdr->addr2) == 0) { + sdata = tmp_sdata; + break; + } + } + } + } + + may_encrypt = !(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT); + + headroom = local->tx_headroom; + if (may_encrypt) + headroom += IEEE80211_ENCRYPT_HEADROOM; + headroom -= skb_headroom(skb); + headroom = max_t(int, 0, headroom); + + if (ieee80211_skb_resize(local, skb, headroom, may_encrypt)) { + dev_kfree_skb(skb); + rcu_read_unlock(); + return; + } + + hdr = (struct ieee80211_hdr *) skb->data; + info->control.vif = &sdata->vif; + + if (ieee80211_vif_is_mesh(&sdata->vif) && + ieee80211_is_data(hdr->frame_control) && + !is_multicast_ether_addr(hdr->addr1)) + if (mesh_nexthop_lookup(skb, sdata)) { + /* skb queued: don't free */ + rcu_read_unlock(); + return; + } + + ieee80211_set_qos_hdr(local, skb); + ieee80211_tx(sdata, skb, false); + rcu_read_unlock(); +} + +netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); + struct ieee80211_channel *chan = local->hw.conf.channel; + struct ieee80211_radiotap_header *prthdr = + (struct ieee80211_radiotap_header *)skb->data; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + u16 len_rthdr; + + /* + * Frame injection is not allowed if beaconing is not allowed + * or if we need radar detection. Beaconing is usually not allowed when + * the mode or operation (Adhoc, AP, Mesh) does not support DFS. + * Passive scan is also used in world regulatory domains where + * your country is not known and as such it should be treated as + * NO TX unless the channel is explicitly allowed in which case + * your current regulatory domain would not have the passive scan + * flag. + * + * Since AP mode uses monitor interfaces to inject/TX management + * frames we can make AP mode the exception to this rule once it + * supports radar detection as its implementation can deal with + * radar detection by itself. We can do that later by adding a + * monitor flag interfaces used for AP support. + */ + if ((chan->flags & (IEEE80211_CHAN_NO_IBSS | IEEE80211_CHAN_RADAR | + IEEE80211_CHAN_PASSIVE_SCAN))) + goto fail; + + /* check for not even having the fixed radiotap header part */ + if (unlikely(skb->len < sizeof(struct ieee80211_radiotap_header))) + goto fail; /* too short to be possibly valid */ + + /* is it a header version we can trust to find length from? */ + if (unlikely(prthdr->it_version)) + goto fail; /* only version 0 is supported */ + + /* then there must be a radiotap header with a length we can use */ + len_rthdr = ieee80211_get_radiotap_len(skb->data); + + /* does the skb contain enough to deliver on the alleged length? */ + if (unlikely(skb->len < len_rthdr)) + goto fail; /* skb too short for claimed rt header extent */ + + /* + * fix up the pointers accounting for the radiotap + * header still being in there. 
We are being given + * a precooked IEEE80211 header so no need for + * normal processing + */ + skb_set_mac_header(skb, len_rthdr); + /* + * these are just fixed to the end of the rt area since we + * don't have any better information and at this point, nobody cares + */ + skb_set_network_header(skb, len_rthdr); + skb_set_transport_header(skb, len_rthdr); + + memset(info, 0, sizeof(*info)); + + info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; + + /* pass the radiotap header up to xmit */ + ieee80211_xmit(IEEE80211_DEV_TO_SUB_IF(dev), skb); + return NETDEV_TX_OK; + +fail: + dev_kfree_skb(skb); + return NETDEV_TX_OK; /* meaning, we dealt with the skb */ +} + +/** + * ieee80211_subif_start_xmit - netif start_xmit function for Ethernet-type + * subinterfaces (wlan#, WDS, and VLAN interfaces) + * @skb: packet to be sent + * @dev: incoming interface + * + * Returns: 0 on success (and frees skb in this case) or 1 on failure (skb will + * not be freed, and caller is responsible for either retrying later or freeing + * skb). + * + * This function takes in an Ethernet header and encapsulates it with suitable + * IEEE 802.11 header based on which interface the packet is coming in. The + * encapsulated packet will then be passed to master interface, wlan#.11, for + * transmission (through low-level driver). + */ +netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + struct ieee80211_tx_info *info; + int ret = NETDEV_TX_BUSY, head_need; + u16 ethertype, hdrlen, meshhdrlen = 0; + __le16 fc; + struct ieee80211_hdr hdr; + struct ieee80211s_hdr mesh_hdr __maybe_unused; + struct mesh_path __maybe_unused *mppath = NULL; + const u8 *encaps_data; + int encaps_len, skip_header_bytes; + int nh_pos, h_pos; + struct sta_info *sta = NULL; + u32 sta_flags = 0; + struct sk_buff *tmp_skb; + + if (unlikely(skb->len < ETH_HLEN)) { + ret = NETDEV_TX_OK; + goto fail; + } + + /* convert Ethernet header to proper 802.11 header (based on + * operation mode) */ + ethertype = (skb->data[12] << 8) | skb->data[13]; + fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA); + + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP_VLAN: + rcu_read_lock(); + sta = rcu_dereference(sdata->u.vlan.sta); + if (sta) { + fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); + /* RA TA DA SA */ + memcpy(hdr.addr1, sta->sta.addr, ETH_ALEN); + memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN); + memcpy(hdr.addr3, skb->data, ETH_ALEN); + memcpy(hdr.addr4, skb->data + ETH_ALEN, ETH_ALEN); + hdrlen = 30; + sta_flags = get_sta_flags(sta); + } + rcu_read_unlock(); + if (sta) + break; + /* fall through */ + case NL80211_IFTYPE_AP: + fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS); + /* DA BSSID SA */ + memcpy(hdr.addr1, skb->data, ETH_ALEN); + memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN); + memcpy(hdr.addr3, skb->data + ETH_ALEN, ETH_ALEN); + hdrlen = 24; + break; + case NL80211_IFTYPE_WDS: + fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); + /* RA TA DA SA */ + memcpy(hdr.addr1, sdata->u.wds.remote_addr, ETH_ALEN); + memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN); + memcpy(hdr.addr3, skb->data, ETH_ALEN); + memcpy(hdr.addr4, skb->data + ETH_ALEN, ETH_ALEN); + hdrlen = 30; + break; +#ifdef CONFIG_MAC80211_MESH + case NL80211_IFTYPE_MESH_POINT: + if (!sdata->u.mesh.mshcfg.dot11MeshTTL) { + /* Do not send frames with mesh_ttl == 0 */ + sdata->u.mesh.mshstats.dropped_frames_ttl++; + 
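/*
 * The interface-type switch in ieee80211_subif_start_xmit() is the
 * classic ToDS/FromDS address table (DA = destination, SA = source,
 * RA/TA = receiver/transmitter of the current hop):
 *
 *   ToDS FromDS   addr1   addr2   addr3   addr4
 *    0     0       DA      SA     BSSID    -      ad-hoc (IBSS)
 *    1     0      BSSID    SA      DA      -      station -> AP
 *    0     1       DA     BSSID    SA      -      AP -> station
 *    1     1       RA      TA      DA      SA     WDS / 4-address
 *
 * A minimal sketch of the two infrastructure cases; the struct and
 * function names are illustrative, not the kernel's.
 */
#include <string.h>

#define ETH_ALEN 6

struct hdr3 { unsigned char a1[ETH_ALEN], a2[ETH_ALEN], a3[ETH_ALEN]; };

static void fill_to_ap(struct hdr3 *h, const unsigned char *da,
		       const unsigned char *sa, const unsigned char *bssid)
{
	memcpy(h->a1, bssid, ETH_ALEN);		/* receiver is the AP  */
	memcpy(h->a2, sa, ETH_ALEN);
	memcpy(h->a3, da, ETH_ALEN);		/* final destination   */
}

static void fill_from_ap(struct hdr3 *h, const unsigned char *da,
			 const unsigned char *sa, const unsigned char *bssid)
{
	memcpy(h->a1, da, ETH_ALEN);		/* receiver is the STA */
	memcpy(h->a2, bssid, ETH_ALEN);
	memcpy(h->a3, sa, ETH_ALEN);		/* original source     */
}

int main(void)
{
	unsigned char da[ETH_ALEN] = { 2 }, sa[ETH_ALEN] = { 4 },
		      bssid[ETH_ALEN] = { 6 };
	struct hdr3 h;

	fill_to_ap(&h, da, sa, bssid);
	fill_from_ap(&h, da, sa, bssid);
	return 0;
}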
ret = NETDEV_TX_OK; + goto fail; + } + rcu_read_lock(); + if (!is_multicast_ether_addr(skb->data)) + mppath = mpp_path_lookup(skb->data, sdata); + + /* + * Use address extension if it is a packet from + * another interface or if we know the destination + * is being proxied by a portal (i.e. portal address + * differs from proxied address) + */ + if (compare_ether_addr(sdata->vif.addr, + skb->data + ETH_ALEN) == 0 && + !(mppath && compare_ether_addr(mppath->mpp, skb->data))) { + hdrlen = ieee80211_fill_mesh_addresses(&hdr, &fc, + skb->data, skb->data + ETH_ALEN); + rcu_read_unlock(); + meshhdrlen = ieee80211_new_mesh_header(&mesh_hdr, + sdata, NULL, NULL); + } else { + int is_mesh_mcast = 1; + const u8 *mesh_da; + + if (is_multicast_ether_addr(skb->data)) + /* DA TA mSA AE:SA */ + mesh_da = skb->data; + else { + static const u8 bcast[ETH_ALEN] = + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + if (mppath) { + /* RA TA mDA mSA AE:DA SA */ + mesh_da = mppath->mpp; + is_mesh_mcast = 0; + } else { + /* DA TA mSA AE:SA */ + mesh_da = bcast; + } + } + hdrlen = ieee80211_fill_mesh_addresses(&hdr, &fc, + mesh_da, sdata->vif.addr); + rcu_read_unlock(); + if (is_mesh_mcast) + meshhdrlen = + ieee80211_new_mesh_header(&mesh_hdr, + sdata, + skb->data + ETH_ALEN, + NULL); + else + meshhdrlen = + ieee80211_new_mesh_header(&mesh_hdr, + sdata, + skb->data, + skb->data + ETH_ALEN); + + } + break; +#endif + case NL80211_IFTYPE_STATION: + memcpy(hdr.addr1, sdata->u.mgd.bssid, ETH_ALEN); + if (sdata->u.mgd.use_4addr && + cpu_to_be16(ethertype) != sdata->control_port_protocol) { + fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); + /* RA TA DA SA */ + memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN); + memcpy(hdr.addr3, skb->data, ETH_ALEN); + memcpy(hdr.addr4, skb->data + ETH_ALEN, ETH_ALEN); + hdrlen = 30; + } else { + fc |= cpu_to_le16(IEEE80211_FCTL_TODS); + /* BSSID SA DA */ + memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); + memcpy(hdr.addr3, skb->data, ETH_ALEN); + hdrlen = 24; + } + break; + case NL80211_IFTYPE_ADHOC: + /* DA SA BSSID */ + memcpy(hdr.addr1, skb->data, ETH_ALEN); + memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); + memcpy(hdr.addr3, sdata->u.ibss.bssid, ETH_ALEN); + hdrlen = 24; + break; + default: + ret = NETDEV_TX_OK; + goto fail; + } + + /* + * There's no need to try to look up the destination + * if it is a multicast address (which can only happen + * in AP mode) + */ + if (!is_multicast_ether_addr(hdr.addr1)) { + rcu_read_lock(); + sta = sta_info_get(sdata, hdr.addr1); + if (sta) + sta_flags = get_sta_flags(sta); + rcu_read_unlock(); + } + + /* receiver and we are QoS enabled, use a QoS type frame */ + if ((sta_flags & WLAN_STA_WME) && local->hw.queues >= 4) { + fc |= cpu_to_le16(IEEE80211_STYPE_QOS_DATA); + hdrlen += 2; + } + + /* + * Drop unicast frames to unauthorised stations unless they are + * EAPOL frames from the local station. + */ + if (!ieee80211_vif_is_mesh(&sdata->vif) && + unlikely(!is_multicast_ether_addr(hdr.addr1) && + !(sta_flags & WLAN_STA_AUTHORIZED) && + !(cpu_to_be16(ethertype) == sdata->control_port_protocol && + compare_ether_addr(sdata->vif.addr, + skb->data + ETH_ALEN) == 0))) { +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + if (net_ratelimit()) + printk(KERN_DEBUG "%s: dropped frame to %pM" + " (unauthorized port)\n", dev->name, + hdr.addr1); +#endif + + I802_DEBUG_INC(local->tx_handlers_drop_unauth_port); + + ret = NETDEV_TX_OK; + goto fail; + } + + /* + * If the skb is shared we need to obtain our own copy. 
+ */ + if (skb_shared(skb)) { + tmp_skb = skb; + skb = skb_clone(skb, GFP_ATOMIC); + kfree_skb(tmp_skb); + + if (!skb) { + ret = NETDEV_TX_OK; + goto fail; + } + } + + hdr.frame_control = fc; + hdr.duration_id = 0; + hdr.seq_ctrl = 0; + + skip_header_bytes = ETH_HLEN; + if (ethertype == ETH_P_AARP || ethertype == ETH_P_IPX) { + encaps_data = bridge_tunnel_header; + encaps_len = sizeof(bridge_tunnel_header); + skip_header_bytes -= 2; + } else if (ethertype >= 0x600) { + encaps_data = rfc1042_header; + encaps_len = sizeof(rfc1042_header); + skip_header_bytes -= 2; + } else { + encaps_data = NULL; + encaps_len = 0; + } + + nh_pos = skb_network_header(skb) - skb->data; + h_pos = skb_transport_header(skb) - skb->data; + + skb_pull(skb, skip_header_bytes); + nh_pos -= skip_header_bytes; + h_pos -= skip_header_bytes; + + head_need = hdrlen + encaps_len + meshhdrlen - skb_headroom(skb); + + /* + * So we need to modify the skb header and hence need a copy of + * that. The head_need variable above doesn't, so far, include + * the needed header space that we don't need right away. If we + * can, then we don't reallocate right now but only after the + * frame arrives at the master device (if it does...) + * + * If we cannot, however, then we will reallocate to include all + * the ever needed space. Also, if we need to reallocate it anyway, + * make it big enough for everything we may ever need. + */ + + if (head_need > 0 || skb_cloned(skb)) { + head_need += IEEE80211_ENCRYPT_HEADROOM; + head_need += local->tx_headroom; + head_need = max_t(int, 0, head_need); + if (ieee80211_skb_resize(local, skb, head_need, true)) + goto fail; + } + + if (encaps_data) { + memcpy(skb_push(skb, encaps_len), encaps_data, encaps_len); + nh_pos += encaps_len; + h_pos += encaps_len; + } + +#ifdef CONFIG_MAC80211_MESH + if (meshhdrlen > 0) { + memcpy(skb_push(skb, meshhdrlen), &mesh_hdr, meshhdrlen); + nh_pos += meshhdrlen; + h_pos += meshhdrlen; + } +#endif + + if (ieee80211_is_data_qos(fc)) { + __le16 *qos_control; + + qos_control = (__le16*) skb_push(skb, 2); + memcpy(skb_push(skb, hdrlen - 2), &hdr, hdrlen - 2); + /* + * Maybe we could actually set some fields here, for now just + * initialise to zero to indicate no special operation. + */ + *qos_control = 0; + } else + memcpy(skb_push(skb, hdrlen), &hdr, hdrlen); + + nh_pos += hdrlen; + h_pos += hdrlen; + + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; + + /* Update skb pointers to various headers since this modified frame + * is going to go through Linux networking code that may potentially + * need things like pointer to IP header. */ + skb_set_mac_header(skb, 0); + skb_set_network_header(skb, nh_pos); + skb_set_transport_header(skb, h_pos); + + info = IEEE80211_SKB_CB(skb); + memset(info, 0, sizeof(*info)); + + dev->trans_start = jiffies; + ieee80211_xmit(sdata, skb); + + return NETDEV_TX_OK; + + fail: + if (ret == NETDEV_TX_OK) + dev_kfree_skb(skb); + + return ret; +} + + +/* + * ieee80211_clear_tx_pending may not be called in a context where + * it is possible that it packets could come in again. + */ +void ieee80211_clear_tx_pending(struct ieee80211_local *local) +{ + int i; + + for (i = 0; i < local->hw.queues; i++) + skb_queue_purge(&local->pending[i]); +} + +/* + * Returns false if the frame couldn't be transmitted but was queued instead, + * which in this case means re-queued -- take as an indication to stop sending + * more pending frames. 
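/*
 * A short sketch of the encapsulation decision made above when the
 * Ethernet frame is turned into 802.11: EtherType-framed payloads
 * (type >= 0x600) receive an RFC 1042 LLC/SNAP header, except
 * AppleTalk AARP (0x80f3) and IPX (0x8137), which use the "bridge
 * tunnel" SNAP OUI instead. The byte values are the standard SNAP
 * encodings.
 */
#include <stdio.h>

static const unsigned char rfc1042_hdr[6] =
	{ 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00 };	/* LLC/SNAP, RFC 1042 OUI  */
static const unsigned char bridge_tunnel_hdr[6] =
	{ 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 };	/* LLC/SNAP, bridge tunnel */

static const unsigned char *pick_encaps(unsigned int ethertype)
{
	if (ethertype == 0x80f3 || ethertype == 0x8137)
		return bridge_tunnel_hdr;	/* AARP / IPX              */
	if (ethertype >= 0x600)
		return rfc1042_hdr;		/* a real EtherType        */
	return NULL;				/* 802.3 length: no SNAP   */
}

int main(void)
{
	printf("%02x\n", pick_encaps(0x0800)[5]);	/* IPv4 -> 00 (RFC 1042) */
	printf("%02x\n", pick_encaps(0x8137)[5]);	/* IPX  -> f8 (tunnel)   */
	return 0;
}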
+ */ +static bool ieee80211_tx_pending_skb(struct ieee80211_local *local, + struct sk_buff *skb) +{ + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_sub_if_data *sdata; + struct sta_info *sta; + struct ieee80211_hdr *hdr; + bool result; + + sdata = vif_to_sdata(info->control.vif); + + if (info->flags & IEEE80211_TX_INTFL_NEED_TXPROCESSING) { + result = ieee80211_tx(sdata, skb, true); + } else { + hdr = (struct ieee80211_hdr *)skb->data; + sta = sta_info_get(sdata, hdr->addr1); + + result = __ieee80211_tx(local, &skb, sta, true); + } + + return result; +} + +/* + * Transmit all pending packets. Called from tasklet. + */ +void ieee80211_tx_pending(unsigned long data) +{ + struct ieee80211_local *local = (struct ieee80211_local *)data; + struct ieee80211_sub_if_data *sdata; + unsigned long flags; + int i; + bool txok; + + rcu_read_lock(); + + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + for (i = 0; i < local->hw.queues; i++) { + /* + * If queue is stopped by something other than due to pending + * frames, or we have no pending frames, proceed to next queue. + */ + if (local->queue_stop_reasons[i] || + skb_queue_empty(&local->pending[i])) + continue; + + while (!skb_queue_empty(&local->pending[i])) { + struct sk_buff *skb = __skb_dequeue(&local->pending[i]); + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + + if (WARN_ON(!info->control.vif)) { + kfree_skb(skb); + continue; + } + + spin_unlock_irqrestore(&local->queue_stop_reason_lock, + flags); + + txok = ieee80211_tx_pending_skb(local, skb); + spin_lock_irqsave(&local->queue_stop_reason_lock, + flags); + if (!txok) + break; + } + + if (skb_queue_empty(&local->pending[i])) + list_for_each_entry_rcu(sdata, &local->interfaces, list) + netif_wake_subqueue(sdata->dev, i); + } + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); + + rcu_read_unlock(); +} + +/* functions for drivers to get certain frames */ + +static void ieee80211_beacon_add_tim(struct ieee80211_if_ap *bss, + struct sk_buff *skb, + struct beacon_data *beacon) +{ + u8 *pos, *tim; + int aid0 = 0; + int i, have_bits = 0, n1, n2; + + /* Generate bitmap for TIM only if there are any STAs in power save + * mode. */ + if (atomic_read(&bss->num_sta_ps) > 0) + /* in the hope that this is faster than + * checking byte-for-byte */ + have_bits = !bitmap_empty((unsigned long*)bss->tim, + IEEE80211_MAX_AID+1); + + if (bss->dtim_count == 0) + bss->dtim_count = beacon->dtim_period - 1; + else + bss->dtim_count--; + + tim = pos = (u8 *) skb_put(skb, 6); + *pos++ = WLAN_EID_TIM; + *pos++ = 4; + *pos++ = bss->dtim_count; + *pos++ = beacon->dtim_period; + + if (bss->dtim_count == 0 && !skb_queue_empty(&bss->ps_bc_buf)) + aid0 = 1; + + bss->dtim_bc_mc = aid0 == 1; + + if (have_bits) { + /* Find largest even number N1 so that bits numbered 1 through + * (N1 x 8) - 1 in the bitmap are 0 and number N2 so that bits + * (N2 + 1) x 8 through 2007 are 0. 
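/*
 * A condensed sketch of the partial-virtual-bitmap computation done in
 * ieee80211_beacon_add_tim(): N1 is the first non-zero octet of the
 * AID bitmap rounded down to an even value, N2 is the last non-zero
 * octet, only octets N1..N2 are carried in the TIM element, and the
 * Bitmap Control byte carries N1 with its low bit used for the
 * multicast/AID-0 indication. The 251-octet size mirrors
 * IEEE80211_MAX_TIM_LEN; the rest is illustrative.
 */
#include <stdio.h>

#define TIM_LEN 251		/* enough octets for AIDs up to 2007 */

static int tim_bounds(const unsigned char *bitmap, int *n1, int *n2)
{
	int i, first = -1, last = -1;

	for (i = 0; i < TIM_LEN; i++) {
		if (bitmap[i]) {
			if (first < 0)
				first = i;
			last = i;
		}
	}
	if (first < 0)
		return 0;		/* nobody buffered: minimal TIM */
	*n1 = first & ~1;		/* round down to even           */
	*n2 = last;
	return *n2 - *n1 + 1;		/* octets actually transmitted  */
}

int main(void)
{
	unsigned char bitmap[TIM_LEN] = { 0 };
	int n1, n2, len;

	bitmap[5] = 0x40;		/* AID 46 is asleep with data   */
	bitmap[9] = 0x01;		/* so is AID 72                 */
	len = tim_bounds(bitmap, &n1, &n2);
	printf("N1=%d N2=%d octets=%d\n", n1, n2, len);	/* 4, 9, 6 */
	return 0;
}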
*/ + n1 = 0; + for (i = 0; i < IEEE80211_MAX_TIM_LEN; i++) { + if (bss->tim[i]) { + n1 = i & 0xfe; + break; + } + } + n2 = n1; + for (i = IEEE80211_MAX_TIM_LEN - 1; i >= n1; i--) { + if (bss->tim[i]) { + n2 = i; + break; + } + } + + /* Bitmap control */ + *pos++ = n1 | aid0; + /* Part Virt Bitmap */ + memcpy(pos, bss->tim + n1, n2 - n1 + 1); + + tim[1] = n2 - n1 + 4; + skb_put(skb, n2 - n1); + } else { + *pos++ = aid0; /* Bitmap control */ + *pos++ = 0; /* Part Virt Bitmap */ + } +} + +struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + u16 *tim_offset, u16 *tim_length) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct sk_buff *skb = NULL; + struct ieee80211_tx_info *info; + struct ieee80211_sub_if_data *sdata = NULL; + struct ieee80211_if_ap *ap = NULL; + struct beacon_data *beacon; + struct ieee80211_supported_band *sband; + enum ieee80211_band band = local->hw.conf.channel->band; + struct ieee80211_tx_rate_control txrc; + + sband = local->hw.wiphy->bands[band]; + + rcu_read_lock(); + + sdata = vif_to_sdata(vif); + + if (!ieee80211_sdata_running(sdata)) + goto out; + + if (tim_offset) + *tim_offset = 0; + if (tim_length) + *tim_length = 0; + + if (sdata->vif.type == NL80211_IFTYPE_AP) { + ap = &sdata->u.ap; + beacon = rcu_dereference(ap->beacon); + if (beacon) { + /* + * headroom, head length, + * tail length and maximum TIM length + */ + skb = dev_alloc_skb(local->tx_headroom + + beacon->head_len + + beacon->tail_len + 256); + if (!skb) + goto out; + + skb_reserve(skb, local->tx_headroom); + memcpy(skb_put(skb, beacon->head_len), beacon->head, + beacon->head_len); + + /* + * Not very nice, but we want to allow the driver to call + * ieee80211_beacon_get() as a response to the set_tim() + * callback. That, however, is already invoked under the + * sta_lock to guarantee consistent and race-free update + * of the tim bitmap in mac80211 and the driver. 
+ */ + if (local->tim_in_locked_section) { + ieee80211_beacon_add_tim(ap, skb, beacon); + } else { + unsigned long flags; + + spin_lock_irqsave(&local->sta_lock, flags); + ieee80211_beacon_add_tim(ap, skb, beacon); + spin_unlock_irqrestore(&local->sta_lock, flags); + } + + if (tim_offset) + *tim_offset = beacon->head_len; + if (tim_length) + *tim_length = skb->len - beacon->head_len; + + if (beacon->tail) + memcpy(skb_put(skb, beacon->tail_len), + beacon->tail, beacon->tail_len); + } else + goto out; + } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { + struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; + struct ieee80211_hdr *hdr; + struct sk_buff *presp = rcu_dereference(ifibss->presp); + + if (!presp) + goto out; + + skb = skb_copy(presp, GFP_ATOMIC); + if (!skb) + goto out; + + hdr = (struct ieee80211_hdr *) skb->data; + hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_BEACON); + } else if (ieee80211_vif_is_mesh(&sdata->vif)) { + struct ieee80211_mgmt *mgmt; + u8 *pos; + +#ifdef CONFIG_MAC80211_MESH + if (!sdata->u.mesh.mesh_id_len) + goto out; +#endif + + /* headroom, head length, tail length and maximum TIM length */ + skb = dev_alloc_skb(local->tx_headroom + 400 + + sdata->u.mesh.ie_len); + if (!skb) + goto out; + + skb_reserve(skb, local->hw.extra_tx_headroom); + mgmt = (struct ieee80211_mgmt *) + skb_put(skb, 24 + sizeof(mgmt->u.beacon)); + memset(mgmt, 0, 24 + sizeof(mgmt->u.beacon)); + mgmt->frame_control = + cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON); + memset(mgmt->da, 0xff, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); + mgmt->u.beacon.beacon_int = + cpu_to_le16(sdata->vif.bss_conf.beacon_int); + mgmt->u.beacon.capab_info = 0x0; /* 0x0 for MPs */ + + pos = skb_put(skb, 2); + *pos++ = WLAN_EID_SSID; + *pos++ = 0x0; + + mesh_mgmt_ies_add(skb, sdata); + } else { + WARN_ON(1); + goto out; + } + + info = IEEE80211_SKB_CB(skb); + + info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + info->flags |= IEEE80211_TX_CTL_NO_ACK; + info->band = band; + + memset(&txrc, 0, sizeof(txrc)); + txrc.hw = hw; + txrc.sband = sband; + txrc.bss_conf = &sdata->vif.bss_conf; + txrc.skb = skb; + txrc.reported_rate.idx = -1; + txrc.rate_idx_mask = sdata->rc_rateidx_mask[band]; + if (txrc.rate_idx_mask == (1 << sband->n_bitrates) - 1) + txrc.max_rate_idx = -1; + else + txrc.max_rate_idx = fls(txrc.rate_idx_mask) - 1; + txrc.bss = true; + rate_control_get_rate(sdata, NULL, &txrc); + + info->control.vif = vif; + + info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT | + IEEE80211_TX_CTL_ASSIGN_SEQ | + IEEE80211_TX_CTL_FIRST_FRAGMENT; + out: + rcu_read_unlock(); + return skb; +} +EXPORT_SYMBOL(ieee80211_beacon_get_tim); + +struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, + struct ieee80211_vif *vif) +{ + struct ieee80211_sub_if_data *sdata; + struct ieee80211_if_managed *ifmgd; + struct ieee80211_pspoll *pspoll; + struct ieee80211_local *local; + struct sk_buff *skb; + + if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) + return NULL; + + sdata = vif_to_sdata(vif); + ifmgd = &sdata->u.mgd; + local = sdata->local; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*pspoll)); + if (!skb) { + printk(KERN_DEBUG "%s: failed to allocate buffer for " + "pspoll template\n", sdata->name); + return NULL; + } + skb_reserve(skb, local->hw.extra_tx_headroom); + + pspoll = (struct ieee80211_pspoll *) skb_put(skb, sizeof(*pspoll)); + memset(pspoll, 0, sizeof(*pspoll)); + pspoll->frame_control = 
cpu_to_le16(IEEE80211_FTYPE_CTL | + IEEE80211_STYPE_PSPOLL); + pspoll->aid = cpu_to_le16(ifmgd->aid); + + /* aid in PS-Poll has its two MSBs each set to 1 */ + pspoll->aid |= cpu_to_le16(1 << 15 | 1 << 14); + + memcpy(pspoll->bssid, ifmgd->bssid, ETH_ALEN); + memcpy(pspoll->ta, vif->addr, ETH_ALEN); + + return skb; +} +EXPORT_SYMBOL(ieee80211_pspoll_get); + +struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, + struct ieee80211_vif *vif) +{ + struct ieee80211_hdr_3addr *nullfunc; + struct ieee80211_sub_if_data *sdata; + struct ieee80211_if_managed *ifmgd; + struct ieee80211_local *local; + struct sk_buff *skb; + + if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) + return NULL; + + sdata = vif_to_sdata(vif); + ifmgd = &sdata->u.mgd; + local = sdata->local; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*nullfunc)); + if (!skb) { + printk(KERN_DEBUG "%s: failed to allocate buffer for nullfunc " + "template\n", sdata->name); + return NULL; + } + skb_reserve(skb, local->hw.extra_tx_headroom); + + nullfunc = (struct ieee80211_hdr_3addr *) skb_put(skb, + sizeof(*nullfunc)); + memset(nullfunc, 0, sizeof(*nullfunc)); + nullfunc->frame_control = cpu_to_le16(IEEE80211_FTYPE_DATA | + IEEE80211_STYPE_NULLFUNC | + IEEE80211_FCTL_TODS); + memcpy(nullfunc->addr1, ifmgd->bssid, ETH_ALEN); + memcpy(nullfunc->addr2, vif->addr, ETH_ALEN); + memcpy(nullfunc->addr3, ifmgd->bssid, ETH_ALEN); + + return skb; +} +EXPORT_SYMBOL(ieee80211_nullfunc_get); + +struct sk_buff *ieee80211_probereq_get(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + const u8 *ssid, size_t ssid_len, + const u8 *ie, size_t ie_len) +{ + struct ieee80211_sub_if_data *sdata; + struct ieee80211_local *local; + struct ieee80211_hdr_3addr *hdr; + struct sk_buff *skb; + size_t ie_ssid_len; + u8 *pos; + + sdata = vif_to_sdata(vif); + local = sdata->local; + ie_ssid_len = 2 + ssid_len; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*hdr) + + ie_ssid_len + ie_len); + if (!skb) { + printk(KERN_DEBUG "%s: failed to allocate buffer for probe " + "request template\n", sdata->name); + return NULL; + } + + skb_reserve(skb, local->hw.extra_tx_headroom); + + hdr = (struct ieee80211_hdr_3addr *) skb_put(skb, sizeof(*hdr)); + memset(hdr, 0, sizeof(*hdr)); + hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_PROBE_REQ); + memset(hdr->addr1, 0xff, ETH_ALEN); + memcpy(hdr->addr2, vif->addr, ETH_ALEN); + memset(hdr->addr3, 0xff, ETH_ALEN); + + pos = skb_put(skb, ie_ssid_len); + *pos++ = WLAN_EID_SSID; + *pos++ = ssid_len; + if (ssid) + memcpy(pos, ssid, ssid_len); + pos += ssid_len; + + if (ie) { + pos = skb_put(skb, ie_len); + memcpy(pos, ie, ie_len); + } + + return skb; +} +EXPORT_SYMBOL(ieee80211_probereq_get); + +void ieee80211_rts_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, + const void *frame, size_t frame_len, + const struct ieee80211_tx_info *frame_txctl, + struct ieee80211_rts *rts) +{ + const struct ieee80211_hdr *hdr = frame; + + rts->frame_control = + cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_RTS); + rts->duration = ieee80211_rts_duration(hw, vif, frame_len, + frame_txctl); + memcpy(rts->ra, hdr->addr1, sizeof(rts->ra)); + memcpy(rts->ta, hdr->addr2, sizeof(rts->ta)); +} +EXPORT_SYMBOL(ieee80211_rts_get); + +void ieee80211_ctstoself_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, + const void *frame, size_t frame_len, + const struct ieee80211_tx_info *frame_txctl, + struct ieee80211_cts *cts) +{ + const struct ieee80211_hdr *hdr = frame; + + 
cts->frame_control = + cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_CTS); + cts->duration = ieee80211_ctstoself_duration(hw, vif, + frame_len, frame_txctl); + memcpy(cts->ra, hdr->addr1, sizeof(cts->ra)); +} +EXPORT_SYMBOL(ieee80211_ctstoself_get); + +struct sk_buff * +ieee80211_get_buffered_bc(struct ieee80211_hw *hw, + struct ieee80211_vif *vif) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct sk_buff *skb = NULL; + struct ieee80211_tx_data tx; + struct ieee80211_sub_if_data *sdata; + struct ieee80211_if_ap *bss = NULL; + struct beacon_data *beacon; + struct ieee80211_tx_info *info; + + sdata = vif_to_sdata(vif); + bss = &sdata->u.ap; + + rcu_read_lock(); + beacon = rcu_dereference(bss->beacon); + + if (sdata->vif.type != NL80211_IFTYPE_AP || !beacon || !beacon->head) + goto out; + + if (bss->dtim_count != 0 || !bss->dtim_bc_mc) + goto out; /* send buffered bc/mc only after DTIM beacon */ + + while (1) { + skb = skb_dequeue(&bss->ps_bc_buf); + if (!skb) + goto out; + local->total_ps_buffered--; + + if (!skb_queue_empty(&bss->ps_bc_buf) && skb->len >= 2) { + struct ieee80211_hdr *hdr = + (struct ieee80211_hdr *) skb->data; + /* more buffered multicast/broadcast frames ==> set + * MoreData flag in IEEE 802.11 header to inform PS + * STAs */ + hdr->frame_control |= + cpu_to_le16(IEEE80211_FCTL_MOREDATA); + } + + if (!ieee80211_tx_prepare(sdata, &tx, skb)) + break; + dev_kfree_skb_any(skb); + } + + info = IEEE80211_SKB_CB(skb); + + tx.flags |= IEEE80211_TX_PS_BUFFERED; + tx.channel = local->hw.conf.channel; + info->band = tx.channel->band; + + if (invoke_tx_handlers(&tx)) + skb = NULL; + out: + rcu_read_unlock(); + + return skb; +} +EXPORT_SYMBOL(ieee80211_get_buffered_bc); + +void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) +{ + skb_set_mac_header(skb, 0); + skb_set_network_header(skb, 0); + skb_set_transport_header(skb, 0); + + /* Send all internal mgmt frames on VO. Accordingly set TID to 7. */ + skb_set_queue_mapping(skb, IEEE80211_AC_VO); + skb->priority = 7; + + /* + * The other path calling ieee80211_xmit is from the tasklet, + * and while we can handle concurrent transmissions locking + * requirements are that we do not come into tx with bhs on. + */ + local_bh_disable(); + ieee80211_xmit(sdata, skb); + local_bh_enable(); +} diff --git a/net/mac80211/util.c b/net/mac80211/util.c new file mode 100644 index 00000000..11d9d49f --- /dev/null +++ b/net/mac80211/util.c @@ -0,0 +1,1443 @@ +/* + * Copyright 2002-2005, Instant802 Networks, Inc. + * Copyright 2005-2006, Devicescape Software, Inc. + * Copyright 2006-2007 Jiri Benc + * Copyright 2007 Johannes Berg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * utilities for mac80211 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ieee80211_i.h" +#include "driver-ops.h" +#include "rate.h" +#include "mesh.h" +#include "wme.h" +#include "led.h" +#include "wep.h" + +/* privid for wiphys to determine whether they belong to us or not */ +void *mac80211_wiphy_privid = &mac80211_wiphy_privid; + +struct ieee80211_hw *wiphy_to_ieee80211_hw(struct wiphy *wiphy) +{ + struct ieee80211_local *local; + BUG_ON(!wiphy); + + local = wiphy_priv(wiphy); + return &local->hw; +} +EXPORT_SYMBOL(wiphy_to_ieee80211_hw); + +u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, + enum nl80211_iftype type) +{ + __le16 fc = hdr->frame_control; + + /* drop ACK/CTS frames and incorrect hdr len (ctrl) */ + if (len < 16) + return NULL; + + if (ieee80211_is_data(fc)) { + if (len < 24) /* drop incorrect hdr len (data) */ + return NULL; + + if (ieee80211_has_a4(fc)) + return NULL; + if (ieee80211_has_tods(fc)) + return hdr->addr1; + if (ieee80211_has_fromds(fc)) + return hdr->addr2; + + return hdr->addr3; + } + + if (ieee80211_is_mgmt(fc)) { + if (len < 24) /* drop incorrect hdr len (mgmt) */ + return NULL; + return hdr->addr3; + } + + if (ieee80211_is_ctl(fc)) { + if(ieee80211_is_pspoll(fc)) + return hdr->addr1; + + if (ieee80211_is_back_req(fc)) { + switch (type) { + case NL80211_IFTYPE_STATION: + return hdr->addr2; + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_AP_VLAN: + return hdr->addr1; + default: + break; /* fall through to the return */ + } + } + } + + return NULL; +} + +void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx) +{ + struct sk_buff *skb = tx->skb; + struct ieee80211_hdr *hdr; + + do { + hdr = (struct ieee80211_hdr *) skb->data; + hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); + } while ((skb = skb->next)); +} + +int ieee80211_frame_duration(struct ieee80211_local *local, size_t len, + int rate, int erp, int short_preamble) +{ + int dur; + + /* calculate duration (in microseconds, rounded up to next higher + * integer if it includes a fractional microsecond) to send frame of + * len bytes (does not include FCS) at the given rate. Duration will + * also include SIFS. + * + * rate is in 100 kbps, so divident is multiplied by 10 in the + * DIV_ROUND_UP() operations. + */ + + if (local->hw.conf.channel->band == IEEE80211_BAND_5GHZ || erp) { + /* + * OFDM: + * + * N_DBPS = DATARATE x 4 + * N_SYM = Ceiling((16+8xLENGTH+6) / N_DBPS) + * (16 = SIGNAL time, 6 = tail bits) + * TXTIME = T_PREAMBLE + T_SIGNAL + T_SYM x N_SYM + Signal Ext + * + * T_SYM = 4 usec + * 802.11a - 17.5.2: aSIFSTime = 16 usec + * 802.11g - 19.8.4: aSIFSTime = 10 usec + + * signal ext = 6 usec + */ + dur = 16; /* SIFS + signal ext */ + dur += 16; /* 17.3.2.3: T_PREAMBLE = 16 usec */ + dur += 4; /* 17.3.2.3: T_SIGNAL = 4 usec */ + dur += 4 * DIV_ROUND_UP((16 + 8 * (len + 4) + 6) * 10, + 4 * rate); /* T_SYM x N_SYM */ + } else { + /* + * 802.11b or 802.11g with 802.11b compatibility: + * 18.3.4: TXTIME = PreambleLength + PLCPHeaderTime + + * Ceiling(((LENGTH+PBCC)x8)/DATARATE). PBCC=0. + * + * 802.11 (DS): 15.3.3, 802.11b: 18.3.4 + * aSIFSTime = 10 usec + * aPreambleLength = 144 usec or 72 usec with short preamble + * aPLCPHeaderLength = 48 usec or 24 usec with short preamble + */ + dur = 10; /* aSIFSTime = 10 usec */ + dur += short_preamble ? 
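/*
 * A worked instance of the OFDM TXTIME formula quoted in the comment
 * above, for a 100-byte frame (FCS excluded) at 54 Mbit/s, i.e.
 * rate = 540 in 100 kbit/s units: bits = 16 + 8*(100+4) + 6 = 854,
 * symbols = ceil(854/216) = 4, so the returned duration is
 * 16 (SIFS + signal ext) + 16 (preamble) + 4 (SIGNAL) + 4*4 = 52 usec.
 * The sketch restates the same arithmetic outside the kernel.
 */
#include <stdio.h>

static int div_round_up(int a, int b) { return (a + b - 1) / b; }

/* rate in 100 kbit/s units, len excludes the FCS, result in usec */
static int ofdm_duration(int len, int rate)
{
	int dur = 16;				/* SIFS + signal extension */

	dur += 16;				/* T_PREAMBLE              */
	dur += 4;				/* T_SIGNAL                */
	dur += 4 * div_round_up((16 + 8 * (len + 4) + 6) * 10, 4 * rate);
	return dur;
}

int main(void)
{
	printf("%d\n", ofdm_duration(100, 540));	/* 52  */
	printf("%d\n", ofdm_duration(1500, 540));	/* 260 */
	return 0;
}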
(72 + 24) : (144 + 48); + + dur += DIV_ROUND_UP(8 * (len + 4) * 10, rate); + } + + return dur; +} + +/* Exported duration function for driver use */ +__le16 ieee80211_generic_frame_duration(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + size_t frame_len, + struct ieee80211_rate *rate) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_sub_if_data *sdata; + u16 dur; + int erp; + bool short_preamble = false; + + erp = 0; + if (vif) { + sdata = vif_to_sdata(vif); + short_preamble = sdata->vif.bss_conf.use_short_preamble; + if (sdata->flags & IEEE80211_SDATA_OPERATING_GMODE) + erp = rate->flags & IEEE80211_RATE_ERP_G; + } + + dur = ieee80211_frame_duration(local, frame_len, rate->bitrate, erp, + short_preamble); + + return cpu_to_le16(dur); +} +EXPORT_SYMBOL(ieee80211_generic_frame_duration); + +__le16 ieee80211_rts_duration(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, size_t frame_len, + const struct ieee80211_tx_info *frame_txctl) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_rate *rate; + struct ieee80211_sub_if_data *sdata; + bool short_preamble; + int erp; + u16 dur; + struct ieee80211_supported_band *sband; + + sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; + + short_preamble = false; + + rate = &sband->bitrates[frame_txctl->control.rts_cts_rate_idx]; + + erp = 0; + if (vif) { + sdata = vif_to_sdata(vif); + short_preamble = sdata->vif.bss_conf.use_short_preamble; + if (sdata->flags & IEEE80211_SDATA_OPERATING_GMODE) + erp = rate->flags & IEEE80211_RATE_ERP_G; + } + + /* CTS duration */ + dur = ieee80211_frame_duration(local, 10, rate->bitrate, + erp, short_preamble); + /* Data frame duration */ + dur += ieee80211_frame_duration(local, frame_len, rate->bitrate, + erp, short_preamble); + /* ACK duration */ + dur += ieee80211_frame_duration(local, 10, rate->bitrate, + erp, short_preamble); + + return cpu_to_le16(dur); +} +EXPORT_SYMBOL(ieee80211_rts_duration); + +__le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + size_t frame_len, + const struct ieee80211_tx_info *frame_txctl) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_rate *rate; + struct ieee80211_sub_if_data *sdata; + bool short_preamble; + int erp; + u16 dur; + struct ieee80211_supported_band *sband; + + sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; + + short_preamble = false; + + rate = &sband->bitrates[frame_txctl->control.rts_cts_rate_idx]; + erp = 0; + if (vif) { + sdata = vif_to_sdata(vif); + short_preamble = sdata->vif.bss_conf.use_short_preamble; + if (sdata->flags & IEEE80211_SDATA_OPERATING_GMODE) + erp = rate->flags & IEEE80211_RATE_ERP_G; + } + + /* Data frame duration */ + dur = ieee80211_frame_duration(local, frame_len, rate->bitrate, + erp, short_preamble); + if (!(frame_txctl->flags & IEEE80211_TX_CTL_NO_ACK)) { + /* ACK duration */ + dur += ieee80211_frame_duration(local, 10, rate->bitrate, + erp, short_preamble); + } + + return cpu_to_le16(dur); +} +EXPORT_SYMBOL(ieee80211_ctstoself_duration); + +static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, + enum queue_stop_reason reason) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_sub_if_data *sdata; + + trace_wake_queue(local, queue, reason); + + if (WARN_ON(queue >= hw->queues)) + return; + + __clear_bit(reason, &local->queue_stop_reasons[queue]); + + if (local->queue_stop_reasons[queue] != 0) + /* someone still has this queue stopped */ + return; + + if 
(skb_queue_empty(&local->pending[queue])) { + rcu_read_lock(); + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + if (test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state)) + continue; + netif_wake_subqueue(sdata->dev, queue); + } + rcu_read_unlock(); + } else + tasklet_schedule(&local->tx_pending_tasklet); +} + +void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, + enum queue_stop_reason reason) +{ + struct ieee80211_local *local = hw_to_local(hw); + unsigned long flags; + + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + __ieee80211_wake_queue(hw, queue, reason); + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); +} + +void ieee80211_wake_queue(struct ieee80211_hw *hw, int queue) +{ + ieee80211_wake_queue_by_reason(hw, queue, + IEEE80211_QUEUE_STOP_REASON_DRIVER); +} +EXPORT_SYMBOL(ieee80211_wake_queue); + +static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, + enum queue_stop_reason reason) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_sub_if_data *sdata; + + trace_stop_queue(local, queue, reason); + + if (WARN_ON(queue >= hw->queues)) + return; + + __set_bit(reason, &local->queue_stop_reasons[queue]); + + rcu_read_lock(); + list_for_each_entry_rcu(sdata, &local->interfaces, list) + netif_stop_subqueue(sdata->dev, queue); + rcu_read_unlock(); +} + +void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue, + enum queue_stop_reason reason) +{ + struct ieee80211_local *local = hw_to_local(hw); + unsigned long flags; + + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + __ieee80211_stop_queue(hw, queue, reason); + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); +} + +void ieee80211_stop_queue(struct ieee80211_hw *hw, int queue) +{ + ieee80211_stop_queue_by_reason(hw, queue, + IEEE80211_QUEUE_STOP_REASON_DRIVER); +} +EXPORT_SYMBOL(ieee80211_stop_queue); + +void ieee80211_add_pending_skb(struct ieee80211_local *local, + struct sk_buff *skb) +{ + struct ieee80211_hw *hw = &local->hw; + unsigned long flags; + int queue = skb_get_queue_mapping(skb); + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + + if (WARN_ON(!info->control.vif)) { + kfree_skb(skb); + return; + } + + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + __ieee80211_stop_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD); + __skb_queue_tail(&local->pending[queue], skb); + __ieee80211_wake_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD); + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); +} + +int ieee80211_add_pending_skbs_fn(struct ieee80211_local *local, + struct sk_buff_head *skbs, + void (*fn)(void *data), void *data) +{ + struct ieee80211_hw *hw = &local->hw; + struct sk_buff *skb; + unsigned long flags; + int queue, ret = 0, i; + + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + for (i = 0; i < hw->queues; i++) + __ieee80211_stop_queue(hw, i, + IEEE80211_QUEUE_STOP_REASON_SKB_ADD); + + while ((skb = skb_dequeue(skbs))) { + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + + if (WARN_ON(!info->control.vif)) { + kfree_skb(skb); + continue; + } + + ret++; + queue = skb_get_queue_mapping(skb); + __skb_queue_tail(&local->pending[queue], skb); + } + + if (fn) + fn(data); + + for (i = 0; i < hw->queues; i++) + __ieee80211_wake_queue(hw, i, + IEEE80211_QUEUE_STOP_REASON_SKB_ADD); + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); + + return ret; +} + +int ieee80211_add_pending_skbs(struct ieee80211_local *local, + struct 
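/*
 * A tiny sketch of the stop-reason bookkeeping used by the queue
 * helpers above: each stop reason sets one bit per queue, and the
 * underlying netdev queue is only woken again once every reason bit
 * has been cleared. The reason names are illustrative.
 */
#include <stdio.h>

enum { REASON_DRIVER, REASON_PS, REASON_SKB_ADD };

struct txq { unsigned long stop_reasons; };

static void stop_queue(struct txq *q, int reason)
{
	q->stop_reasons |= 1UL << reason;
}

static int wake_queue(struct txq *q, int reason)
{
	q->stop_reasons &= ~(1UL << reason);
	return q->stop_reasons == 0;	/* 1: really wake the netdev queue */
}

int main(void)
{
	struct txq q = { 0 };

	stop_queue(&q, REASON_DRIVER);
	stop_queue(&q, REASON_PS);
	printf("%d\n", wake_queue(&q, REASON_DRIVER));	/* 0: PS still holds it */
	printf("%d\n", wake_queue(&q, REASON_PS));	/* 1: now really woken  */
	return 0;
}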
sk_buff_head *skbs) +{ + return ieee80211_add_pending_skbs_fn(local, skbs, NULL, NULL); +} + +void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, + enum queue_stop_reason reason) +{ + struct ieee80211_local *local = hw_to_local(hw); + unsigned long flags; + int i; + + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + + for (i = 0; i < hw->queues; i++) + __ieee80211_stop_queue(hw, i, reason); + + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); +} + +void ieee80211_stop_queues(struct ieee80211_hw *hw) +{ + ieee80211_stop_queues_by_reason(hw, + IEEE80211_QUEUE_STOP_REASON_DRIVER); +} +EXPORT_SYMBOL(ieee80211_stop_queues); + +int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue) +{ + struct ieee80211_local *local = hw_to_local(hw); + unsigned long flags; + int ret; + + if (WARN_ON(queue >= hw->queues)) + return true; + + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + ret = !!local->queue_stop_reasons[queue]; + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); + return ret; +} +EXPORT_SYMBOL(ieee80211_queue_stopped); + +void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw, + enum queue_stop_reason reason) +{ + struct ieee80211_local *local = hw_to_local(hw); + unsigned long flags; + int i; + + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + + for (i = 0; i < hw->queues; i++) + __ieee80211_wake_queue(hw, i, reason); + + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); +} + +void ieee80211_wake_queues(struct ieee80211_hw *hw) +{ + ieee80211_wake_queues_by_reason(hw, IEEE80211_QUEUE_STOP_REASON_DRIVER); +} +EXPORT_SYMBOL(ieee80211_wake_queues); + +void ieee80211_iterate_active_interfaces( + struct ieee80211_hw *hw, + void (*iterator)(void *data, u8 *mac, + struct ieee80211_vif *vif), + void *data) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_sub_if_data *sdata; + + mutex_lock(&local->iflist_mtx); + + list_for_each_entry(sdata, &local->interfaces, list) { + switch (sdata->vif.type) { + case NL80211_IFTYPE_MONITOR: + case NL80211_IFTYPE_AP_VLAN: + continue; + default: + break; + } + if (ieee80211_sdata_running(sdata)) + iterator(data, sdata->vif.addr, + &sdata->vif); + } + + mutex_unlock(&local->iflist_mtx); +} +EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces); + +void ieee80211_iterate_active_interfaces_atomic( + struct ieee80211_hw *hw, + void (*iterator)(void *data, u8 *mac, + struct ieee80211_vif *vif), + void *data) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_sub_if_data *sdata; + + rcu_read_lock(); + + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + switch (sdata->vif.type) { + case NL80211_IFTYPE_MONITOR: + case NL80211_IFTYPE_AP_VLAN: + continue; + default: + break; + } + if (ieee80211_sdata_running(sdata)) + iterator(data, sdata->vif.addr, + &sdata->vif); + } + + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_atomic); + +/* + * Nothing should have been stuffed into the workqueue during + * the suspend->resume cycle. If this WARN is seen then there + * is a bug with either the driver suspend or something in + * mac80211 stuffing into the workqueue which we haven't yet + * cleared during mac80211's suspend cycle. 
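/*
 * A hypothetical driver-side use of the interface iteration helpers
 * exported above: count the running interfaces from a context that
 * must not sleep, hence the _atomic variant. The mydrv_* names are
 * invented for illustration; only the mac80211 call and the callback
 * signature come from the code above.
 */
#include <net/mac80211.h>

struct mydrv_count {
	int n_active;
};

static void mydrv_count_iter(void *data, u8 *mac, struct ieee80211_vif *vif)
{
	struct mydrv_count *count = data;

	count->n_active++;
}

int mydrv_active_vifs(struct ieee80211_hw *hw)
{
	struct mydrv_count count = { 0 };

	ieee80211_iterate_active_interfaces_atomic(hw, mydrv_count_iter,
						   &count);
	return count.n_active;
}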
+ */ +static bool ieee80211_can_queue_work(struct ieee80211_local *local) +{ + if (WARN(local->suspended && !local->resuming, + "queueing ieee80211 work while going to suspend\n")) + return false; + + return true; +} + +void ieee80211_queue_work(struct ieee80211_hw *hw, struct work_struct *work) +{ + struct ieee80211_local *local = hw_to_local(hw); + + if (!ieee80211_can_queue_work(local)) + return; + + queue_work(local->workqueue, work); +} +EXPORT_SYMBOL(ieee80211_queue_work); + +void ieee80211_queue_delayed_work(struct ieee80211_hw *hw, + struct delayed_work *dwork, + unsigned long delay) +{ + struct ieee80211_local *local = hw_to_local(hw); + + if (!ieee80211_can_queue_work(local)) + return; + + queue_delayed_work(local->workqueue, dwork, delay); +} +EXPORT_SYMBOL(ieee80211_queue_delayed_work); + +void ieee802_11_parse_elems(u8 *start, size_t len, + struct ieee802_11_elems *elems) +{ + ieee802_11_parse_elems_crc(start, len, elems, 0, 0); +} + +u32 ieee802_11_parse_elems_crc(u8 *start, size_t len, + struct ieee802_11_elems *elems, + u64 filter, u32 crc) +{ + size_t left = len; + u8 *pos = start; + bool calc_crc = filter != 0; + + memset(elems, 0, sizeof(*elems)); + elems->ie_start = start; + elems->total_len = len; + + while (left >= 2) { + u8 id, elen; + + id = *pos++; + elen = *pos++; + left -= 2; + + if (elen > left) + break; + + if (calc_crc && id < 64 && (filter & (1ULL << id))) + crc = crc32_be(crc, pos - 2, elen + 2); + + switch (id) { + case WLAN_EID_SSID: + elems->ssid = pos; + elems->ssid_len = elen; + break; + case WLAN_EID_SUPP_RATES: + elems->supp_rates = pos; + elems->supp_rates_len = elen; + break; + case WLAN_EID_FH_PARAMS: + elems->fh_params = pos; + elems->fh_params_len = elen; + break; + case WLAN_EID_DS_PARAMS: + elems->ds_params = pos; + elems->ds_params_len = elen; + break; + case WLAN_EID_CF_PARAMS: + elems->cf_params = pos; + elems->cf_params_len = elen; + break; + case WLAN_EID_TIM: + if (elen >= sizeof(struct ieee80211_tim_ie)) { + elems->tim = (void *)pos; + elems->tim_len = elen; + } + break; + case WLAN_EID_IBSS_PARAMS: + elems->ibss_params = pos; + elems->ibss_params_len = elen; + break; + case WLAN_EID_CHALLENGE: + elems->challenge = pos; + elems->challenge_len = elen; + break; + case WLAN_EID_VENDOR_SPECIFIC: + if (elen >= 4 && pos[0] == 0x00 && pos[1] == 0x50 && + pos[2] == 0xf2) { + /* Microsoft OUI (00:50:F2) */ + + if (calc_crc) + crc = crc32_be(crc, pos - 2, elen + 2); + + if (pos[3] == 1) { + /* OUI Type 1 - WPA IE */ + elems->wpa = pos; + elems->wpa_len = elen; + } else if (elen >= 5 && pos[3] == 2) { + /* OUI Type 2 - WMM IE */ + if (pos[4] == 0) { + elems->wmm_info = pos; + elems->wmm_info_len = elen; + } else if (pos[4] == 1) { + elems->wmm_param = pos; + elems->wmm_param_len = elen; + } + } + } + break; + case WLAN_EID_RSN: + elems->rsn = pos; + elems->rsn_len = elen; + break; + case WLAN_EID_ERP_INFO: + elems->erp_info = pos; + elems->erp_info_len = elen; + break; + case WLAN_EID_EXT_SUPP_RATES: + elems->ext_supp_rates = pos; + elems->ext_supp_rates_len = elen; + break; + case WLAN_EID_HT_CAPABILITY: + if (elen >= sizeof(struct ieee80211_ht_cap)) + elems->ht_cap_elem = (void *)pos; + break; + case WLAN_EID_HT_INFORMATION: + if (elen >= sizeof(struct ieee80211_ht_info)) + elems->ht_info_elem = (void *)pos; + break; + case WLAN_EID_MESH_ID: + elems->mesh_id = pos; + elems->mesh_id_len = elen; + break; + case WLAN_EID_MESH_CONFIG: + if (elen >= sizeof(struct ieee80211_meshconf_ie)) + elems->mesh_config = (void *)pos; + break; + case 
WLAN_EID_PEER_LINK: + elems->peer_link = pos; + elems->peer_link_len = elen; + break; + case WLAN_EID_PREQ: + elems->preq = pos; + elems->preq_len = elen; + break; + case WLAN_EID_PREP: + elems->prep = pos; + elems->prep_len = elen; + break; + case WLAN_EID_PERR: + elems->perr = pos; + elems->perr_len = elen; + break; + case WLAN_EID_RANN: + if (elen >= sizeof(struct ieee80211_rann_ie)) + elems->rann = (void *)pos; + break; + case WLAN_EID_CHANNEL_SWITCH: + elems->ch_switch_elem = pos; + elems->ch_switch_elem_len = elen; + break; + case WLAN_EID_QUIET: + if (!elems->quiet_elem) { + elems->quiet_elem = pos; + elems->quiet_elem_len = elen; + } + elems->num_of_quiet_elem++; + break; + case WLAN_EID_COUNTRY: + elems->country_elem = pos; + elems->country_elem_len = elen; + break; + case WLAN_EID_PWR_CONSTRAINT: + elems->pwr_constr_elem = pos; + elems->pwr_constr_elem_len = elen; + break; + case WLAN_EID_TIMEOUT_INTERVAL: + elems->timeout_int = pos; + elems->timeout_int_len = elen; + break; + default: + break; + } + + left -= elen; + pos += elen; + } + + return crc; +} + +void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_tx_queue_params qparam; + int queue; + bool use_11b; + int aCWmin, aCWmax; + + if (!local->ops->conf_tx) + return; + + memset(&qparam, 0, sizeof(qparam)); + + use_11b = (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ) && + !(sdata->flags & IEEE80211_SDATA_OPERATING_GMODE); + + for (queue = 0; queue < local_to_hw(local)->queues; queue++) { + /* Set defaults according to 802.11-2007 Table 7-37 */ + aCWmax = 1023; + if (use_11b) + aCWmin = 31; + else + aCWmin = 15; + + switch (queue) { + case 3: /* AC_BK */ + qparam.cw_max = aCWmax; + qparam.cw_min = aCWmin; + qparam.txop = 0; + qparam.aifs = 7; + break; + default: /* never happens but let's not leave undefined */ + case 2: /* AC_BE */ + qparam.cw_max = aCWmax; + qparam.cw_min = aCWmin; + qparam.txop = 0; + qparam.aifs = 3; + break; + case 1: /* AC_VI */ + qparam.cw_max = aCWmin; + qparam.cw_min = (aCWmin + 1) / 2 - 1; + if (use_11b) + qparam.txop = 6016/32; + else + qparam.txop = 3008/32; + qparam.aifs = 2; + break; + case 0: /* AC_VO */ + qparam.cw_max = (aCWmin + 1) / 2 - 1; + qparam.cw_min = (aCWmin + 1) / 4 - 1; + if (use_11b) + qparam.txop = 3264/32; + else + qparam.txop = 1504/32; + qparam.aifs = 2; + break; + } + + qparam.uapsd = false; + + drv_conf_tx(local, queue, &qparam); + } + + /* after reinitialize QoS TX queues setting to default, + * disable QoS at all */ + + if (sdata->vif.type != NL80211_IFTYPE_MONITOR) { + sdata->vif.bss_conf.qos = + sdata->vif.type != NL80211_IFTYPE_STATION; + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_QOS); + } +} + +void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata, + const size_t supp_rates_len, + const u8 *supp_rates) +{ + struct ieee80211_local *local = sdata->local; + int i, have_higher_than_11mbit = 0; + + /* cf. 
IEEE 802.11 9.2.12 */ + for (i = 0; i < supp_rates_len; i++) + if ((supp_rates[i] & 0x7f) * 5 > 110) + have_higher_than_11mbit = 1; + + if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ && + have_higher_than_11mbit) + sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE; + else + sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE; + + ieee80211_set_wmm_default(sdata); +} + +u32 ieee80211_mandatory_rates(struct ieee80211_local *local, + enum ieee80211_band band) +{ + struct ieee80211_supported_band *sband; + struct ieee80211_rate *bitrates; + u32 mandatory_rates; + enum ieee80211_rate_flags mandatory_flag; + int i; + + sband = local->hw.wiphy->bands[band]; + if (!sband) { + WARN_ON(1); + sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; + } + + if (band == IEEE80211_BAND_2GHZ) + mandatory_flag = IEEE80211_RATE_MANDATORY_B; + else + mandatory_flag = IEEE80211_RATE_MANDATORY_A; + + bitrates = sband->bitrates; + mandatory_rates = 0; + for (i = 0; i < sband->n_bitrates; i++) + if (bitrates[i].flags & mandatory_flag) + mandatory_rates |= BIT(i); + return mandatory_rates; +} + +void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, + u16 transaction, u16 auth_alg, + u8 *extra, size_t extra_len, const u8 *bssid, + const u8 *key, u8 key_len, u8 key_idx) +{ + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct ieee80211_mgmt *mgmt; + int err; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + + sizeof(*mgmt) + 6 + extra_len); + if (!skb) { + printk(KERN_DEBUG "%s: failed to allocate buffer for auth " + "frame\n", sdata->name); + return; + } + skb_reserve(skb, local->hw.extra_tx_headroom); + + mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24 + 6); + memset(mgmt, 0, 24 + 6); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_AUTH); + memcpy(mgmt->da, bssid, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + memcpy(mgmt->bssid, bssid, ETH_ALEN); + mgmt->u.auth.auth_alg = cpu_to_le16(auth_alg); + mgmt->u.auth.auth_transaction = cpu_to_le16(transaction); + mgmt->u.auth.status_code = cpu_to_le16(0); + if (extra) + memcpy(skb_put(skb, extra_len), extra, extra_len); + + if (auth_alg == WLAN_AUTH_SHARED_KEY && transaction == 3) { + mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); + err = ieee80211_wep_encrypt(local, skb, key, key_len, key_idx); + WARN_ON(err); + } + + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + ieee80211_tx_skb(sdata, skb); +} + +int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, + const u8 *ie, size_t ie_len, + enum ieee80211_band band, u32 rate_mask, + u8 channel) +{ + struct ieee80211_supported_band *sband; + u8 *pos; + size_t offset = 0, noffset; + int supp_rates_len, i; + u8 rates[32]; + int num_rates; + int ext_rates_len; + + sband = local->hw.wiphy->bands[band]; + + pos = buffer; + + num_rates = 0; + for (i = 0; i < sband->n_bitrates; i++) { + if ((BIT(i) & rate_mask) == 0) + continue; /* skip rate */ + rates[num_rates++] = (u8) (sband->bitrates[i].bitrate / 5); + } + + supp_rates_len = min_t(int, num_rates, 8); + + *pos++ = WLAN_EID_SUPP_RATES; + *pos++ = supp_rates_len; + memcpy(pos, rates, supp_rates_len); + pos += supp_rates_len; + + /* insert "request information" if in custom IEs */ + if (ie && ie_len) { + static const u8 before_extrates[] = { + WLAN_EID_SSID, + WLAN_EID_SUPP_RATES, + WLAN_EID_REQUEST, + }; + noffset = ieee80211_ie_split(ie, ie_len, + before_extrates, + ARRAY_SIZE(before_extrates), + offset); + memcpy(pos, ie + offset, 
noffset - offset); + pos += noffset - offset; + offset = noffset; + } + + ext_rates_len = num_rates - supp_rates_len; + if (ext_rates_len > 0) { + *pos++ = WLAN_EID_EXT_SUPP_RATES; + *pos++ = ext_rates_len; + memcpy(pos, rates + supp_rates_len, ext_rates_len); + pos += ext_rates_len; + } + + if (channel && sband->band == IEEE80211_BAND_2GHZ) { + *pos++ = WLAN_EID_DS_PARAMS; + *pos++ = 1; + *pos++ = channel; + } + + /* insert custom IEs that go before HT */ + if (ie && ie_len) { + static const u8 before_ht[] = { + WLAN_EID_SSID, + WLAN_EID_SUPP_RATES, + WLAN_EID_REQUEST, + WLAN_EID_EXT_SUPP_RATES, + WLAN_EID_DS_PARAMS, + WLAN_EID_SUPPORTED_REGULATORY_CLASSES, + }; + noffset = ieee80211_ie_split(ie, ie_len, + before_ht, ARRAY_SIZE(before_ht), + offset); + memcpy(pos, ie + offset, noffset - offset); + pos += noffset - offset; + offset = noffset; + } + + if (sband->ht_cap.ht_supported) { + u16 cap = sband->ht_cap.cap; + __le16 tmp; + + *pos++ = WLAN_EID_HT_CAPABILITY; + *pos++ = sizeof(struct ieee80211_ht_cap); + memset(pos, 0, sizeof(struct ieee80211_ht_cap)); + tmp = cpu_to_le16(cap); + memcpy(pos, &tmp, sizeof(u16)); + pos += sizeof(u16); + *pos++ = sband->ht_cap.ampdu_factor | + (sband->ht_cap.ampdu_density << + IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT); + memcpy(pos, &sband->ht_cap.mcs, sizeof(sband->ht_cap.mcs)); + pos += sizeof(sband->ht_cap.mcs); + pos += 2 + 4 + 1; /* ext info, BF cap, antsel */ + } + + /* + * If adding more here, adjust code in main.c + * that calculates local->scan_ies_len. + */ + + /* add any remaining custom IEs */ + if (ie && ie_len) { + noffset = ie_len; + memcpy(pos, ie + offset, noffset - offset); + pos += noffset - offset; + } + + return pos - buffer; +} + +struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, + u8 *dst, + const u8 *ssid, size_t ssid_len, + const u8 *ie, size_t ie_len) +{ + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct ieee80211_mgmt *mgmt; + size_t buf_len; + u8 *buf; + u8 chan; + + /* FIXME: come up with a proper value */ + buf = kmalloc(200 + ie_len, GFP_KERNEL); + if (!buf) { + printk(KERN_DEBUG "%s: failed to allocate temporary IE " + "buffer\n", sdata->name); + return NULL; + } + + chan = ieee80211_frequency_to_channel( + local->hw.conf.channel->center_freq); + + buf_len = ieee80211_build_preq_ies(local, buf, ie, ie_len, + local->hw.conf.channel->band, + sdata->rc_rateidx_mask + [local->hw.conf.channel->band], + chan); + + skb = ieee80211_probereq_get(&local->hw, &sdata->vif, + ssid, ssid_len, + buf, buf_len); + if (!skb) + goto out; + + if (dst) { + mgmt = (struct ieee80211_mgmt *) skb->data; + memcpy(mgmt->da, dst, ETH_ALEN); + memcpy(mgmt->bssid, dst, ETH_ALEN); + } + + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + + out: + kfree(buf); + + return skb; +} + +void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, + const u8 *ssid, size_t ssid_len, + const u8 *ie, size_t ie_len) +{ + struct sk_buff *skb; + + skb = ieee80211_build_probe_req(sdata, dst, ssid, ssid_len, ie, ie_len); + if (skb) + ieee80211_tx_skb(sdata, skb); +} + +u32 ieee80211_sta_get_rates(struct ieee80211_local *local, + struct ieee802_11_elems *elems, + enum ieee80211_band band) +{ + struct ieee80211_supported_band *sband; + struct ieee80211_rate *bitrates; + size_t num_rates; + u32 supp_rates; + int i, j; + sband = local->hw.wiphy->bands[band]; + + if (!sband) { + WARN_ON(1); + sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; + } + + bitrates = sband->bitrates; 
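/*
 * Standalone sketch (not kernel code; the rate table below is made up) of
 * what ieee80211_sta_get_rates() does with a (Extended) Supported Rates
 * element: each octet encodes a rate in units of 500 kbps with the top bit
 * marking it as a basic rate, so (octet & 0x7f) * 5 gives the rate in the
 * 100 kbps units used by the bitrate table, and matches are collected into
 * a bitmap over that table.
 */
#include <stdint.h>
#include <stdio.h>

/* hypothetical local bitrate table, in units of 100 kbps */
static const int table[] = { 10, 20, 55, 110, 60, 90, 120, 180 };
#define N_RATES (int)(sizeof(table) / sizeof(table[0]))

static uint32_t rates_ie_to_bitmap(const uint8_t *ie, int ie_len)
{
        uint32_t map = 0;
        int i, j;

        for (i = 0; i < ie_len; i++) {
                int rate = (ie[i] & 0x7f) * 5;  /* strip the basic-rate flag */

                for (j = 0; j < N_RATES; j++)
                        if (table[j] == rate)
                                map |= 1u << j;
        }
        return map;
}

int main(void)
{
        /* 1, 2, 5.5 and 11 Mbit/s, all flagged as basic rates */
        const uint8_t supp_rates[] = { 0x82, 0x84, 0x8b, 0x96 };

        printf("bitmap=0x%x\n",
               (unsigned)rates_ie_to_bitmap(supp_rates, 4));    /* 0xf */
        return 0;
}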
+ num_rates = sband->n_bitrates; + supp_rates = 0; + for (i = 0; i < elems->supp_rates_len + + elems->ext_supp_rates_len; i++) { + u8 rate = 0; + int own_rate; + if (i < elems->supp_rates_len) + rate = elems->supp_rates[i]; + else if (elems->ext_supp_rates) + rate = elems->ext_supp_rates + [i - elems->supp_rates_len]; + own_rate = 5 * (rate & 0x7f); + for (j = 0; j < num_rates; j++) + if (bitrates[j].bitrate == own_rate) + supp_rates |= BIT(j); + } + return supp_rates; +} + +void ieee80211_stop_device(struct ieee80211_local *local) +{ + ieee80211_led_radio(local, false); + ieee80211_mod_tpt_led_trig(local, 0, IEEE80211_TPT_LEDTRIG_FL_RADIO); + + cancel_work_sync(&local->reconfig_filter); + + flush_workqueue(local->workqueue); + drv_stop(local); +} + +int ieee80211_reconfig(struct ieee80211_local *local) +{ + struct ieee80211_hw *hw = &local->hw; + struct ieee80211_sub_if_data *sdata; + struct sta_info *sta; + int res; + +#ifdef CONFIG_PM + if (local->suspended) + local->resuming = true; + + if (local->wowlan) { + local->wowlan = false; + res = drv_resume(local); + if (res < 0) { + local->resuming = false; + return res; + } + if (res == 0) + goto wake_up; + WARN_ON(res > 1); + /* + * res is 1, which means the driver requested + * to go through a regular reset on wakeup. + */ + } +#endif + + /* restart hardware */ + if (local->open_count) { + /* + * Upon resume hardware can sometimes be goofy due to + * various platform / driver / bus issues, so restarting + * the device may at times not work immediately. Propagate + * the error. + */ + res = drv_start(local); + if (res) { + WARN(local->suspended, "Hardware became unavailable " + "upon resume. This could be a software issue " + "prior to suspend or a hardware issue.\n"); + return res; + } + + ieee80211_led_radio(local, true); + ieee80211_mod_tpt_led_trig(local, + IEEE80211_TPT_LEDTRIG_FL_RADIO, 0); + } + + /* add interfaces */ + list_for_each_entry(sdata, &local->interfaces, list) { + if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN && + sdata->vif.type != NL80211_IFTYPE_MONITOR && + ieee80211_sdata_running(sdata)) + res = drv_add_interface(local, &sdata->vif); + } + + /* add STAs back */ + mutex_lock(&local->sta_mtx); + list_for_each_entry(sta, &local->sta_list, list) { + if (sta->uploaded) { + sdata = sta->sdata; + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + sdata = container_of(sdata->bss, + struct ieee80211_sub_if_data, + u.ap); + + WARN_ON(drv_sta_add(local, sdata, &sta->sta)); + } + } + mutex_unlock(&local->sta_mtx); + + /* setup fragmentation threshold */ + drv_set_frag_threshold(local, hw->wiphy->frag_threshold); + + /* setup RTS threshold */ + drv_set_rts_threshold(local, hw->wiphy->rts_threshold); + + /* reconfigure hardware */ + ieee80211_hw_config(local, ~0); + + ieee80211_configure_filter(local); + + /* Finally also reconfigure all the BSS information */ + list_for_each_entry(sdata, &local->interfaces, list) { + u32 changed; + + if (!ieee80211_sdata_running(sdata)) + continue; + + /* common change flags for all interface types */ + changed = BSS_CHANGED_ERP_CTS_PROT | + BSS_CHANGED_ERP_PREAMBLE | + BSS_CHANGED_ERP_SLOT | + BSS_CHANGED_HT | + BSS_CHANGED_BASIC_RATES | + BSS_CHANGED_BEACON_INT | + BSS_CHANGED_BSSID | + BSS_CHANGED_CQM | + BSS_CHANGED_QOS; + + switch (sdata->vif.type) { + case NL80211_IFTYPE_STATION: + changed |= BSS_CHANGED_ASSOC; + mutex_lock(&sdata->u.mgd.mtx); + ieee80211_bss_info_change_notify(sdata, changed); + mutex_unlock(&sdata->u.mgd.mtx); + break; + case NL80211_IFTYPE_ADHOC: + changed |= 
BSS_CHANGED_IBSS; + /* fall through */ + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_MESH_POINT: + changed |= BSS_CHANGED_BEACON | + BSS_CHANGED_BEACON_ENABLED; + ieee80211_bss_info_change_notify(sdata, changed); + break; + case NL80211_IFTYPE_WDS: + break; + case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_MONITOR: + /* ignore virtual */ + break; + case NL80211_IFTYPE_UNSPECIFIED: + case NUM_NL80211_IFTYPES: + case NL80211_IFTYPE_P2P_CLIENT: + case NL80211_IFTYPE_P2P_GO: + WARN_ON(1); + break; + } + } + + /* add back keys */ + list_for_each_entry(sdata, &local->interfaces, list) + if (ieee80211_sdata_running(sdata)) + ieee80211_enable_keys(sdata); + + wake_up: + /* + * Clear the WLAN_STA_BLOCK_BA flag so new aggregation + * sessions can be established after a resume. + * + * Also tear down aggregation sessions since reconfiguring + * them in a hardware restart scenario is not easily done + * right now, and the hardware will have lost information + * about the sessions, but we and the AP still think they + * are active. This is really a workaround though. + */ + if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) { + mutex_lock(&local->sta_mtx); + + list_for_each_entry(sta, &local->sta_list, list) { + ieee80211_sta_tear_down_BA_sessions(sta, true); + clear_sta_flags(sta, WLAN_STA_BLOCK_BA); + } + + mutex_unlock(&local->sta_mtx); + } + + ieee80211_wake_queues_by_reason(hw, + IEEE80211_QUEUE_STOP_REASON_SUSPEND); + + /* + * If this is for hw restart things are still running. + * We may want to change that later, however. + */ + if (!local->suspended) + return 0; + +#ifdef CONFIG_PM + /* first set suspended false, then resuming */ + local->suspended = false; + mb(); + local->resuming = false; + + list_for_each_entry(sdata, &local->interfaces, list) { + switch(sdata->vif.type) { + case NL80211_IFTYPE_STATION: + ieee80211_sta_restart(sdata); + break; + case NL80211_IFTYPE_ADHOC: + ieee80211_ibss_restart(sdata); + break; + case NL80211_IFTYPE_MESH_POINT: + ieee80211_mesh_restart(sdata); + break; + default: + break; + } + } + + mod_timer(&local->sta_cleanup, jiffies + 1); + + mutex_lock(&local->sta_mtx); + list_for_each_entry(sta, &local->sta_list, list) + mesh_plink_restart(sta); + mutex_unlock(&local->sta_mtx); +#else + WARN_ON(1); +#endif + return 0; +} + +static int check_mgd_smps(struct ieee80211_if_managed *ifmgd, + enum ieee80211_smps_mode *smps_mode) +{ + if (ifmgd->associated) { + *smps_mode = ifmgd->ap_smps; + + if (*smps_mode == IEEE80211_SMPS_AUTOMATIC) { + if (ifmgd->powersave) + *smps_mode = IEEE80211_SMPS_DYNAMIC; + else + *smps_mode = IEEE80211_SMPS_OFF; + } + + return 1; + } + + return 0; +} + +/* must hold iflist_mtx */ +void ieee80211_recalc_smps(struct ieee80211_local *local) +{ + struct ieee80211_sub_if_data *sdata; + enum ieee80211_smps_mode smps_mode = IEEE80211_SMPS_OFF; + int count = 0; + + lockdep_assert_held(&local->iflist_mtx); + + /* + * This function could be improved to handle multiple + * interfaces better, but right now it makes any + * non-station interfaces force SM PS to be turned + * off. If there are multiple station interfaces it + * could also use the best possible mode, e.g. if + * one is in static and the other in dynamic then + * dynamic is ok. 
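/*
 * Self-contained sketch (hypothetical names, not the mac80211 API) of the
 * SM PS decision made by check_mgd_smps() and ieee80211_recalc_smps(): an
 * associated STA uses the mode requested for its AP, AUTOMATIC resolves to
 * DYNAMIC when powersave is on and OFF otherwise, and more than one managed
 * interface falls back to OFF.
 */
#include <stdio.h>

enum smps { SMPS_AUTOMATIC, SMPS_OFF, SMPS_STATIC, SMPS_DYNAMIC };

struct managed { int associated; int powersave; enum smps ap_smps; };

static enum smps resolve_one(const struct managed *m)
{
        enum smps mode = m->ap_smps;

        if (mode == SMPS_AUTOMATIC)
                mode = m->powersave ? SMPS_DYNAMIC : SMPS_OFF;
        return mode;
}

static enum smps recalc(const struct managed *ifaces, int n)
{
        enum smps mode = SMPS_OFF;
        int i, count = 0;

        for (i = 0; i < n; i++) {
                if (!ifaces[i].associated)
                        continue;
                mode = resolve_one(&ifaces[i]);
                if (++count > 1)        /* several STAs: play it safe */
                        return SMPS_OFF;
        }
        return mode;
}

int main(void)
{
        struct managed one[] = { { 1, 1, SMPS_AUTOMATIC } };

        printf("%d\n", recalc(one, 1)); /* 3 == SMPS_DYNAMIC */
        return 0;
}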
+ */ + + list_for_each_entry(sdata, &local->interfaces, list) { + if (!ieee80211_sdata_running(sdata)) + continue; + if (sdata->vif.type != NL80211_IFTYPE_STATION) + goto set; + + count += check_mgd_smps(&sdata->u.mgd, &smps_mode); + + if (count > 1) { + smps_mode = IEEE80211_SMPS_OFF; + break; + } + } + + if (smps_mode == local->smps_mode) + return; + + set: + local->smps_mode = smps_mode; + /* changed flag is auto-detected for this */ + ieee80211_hw_config(local, 0); +} + +static bool ieee80211_id_in_list(const u8 *ids, int n_ids, u8 id) +{ + int i; + + for (i = 0; i < n_ids; i++) + if (ids[i] == id) + return true; + return false; +} + +/** + * ieee80211_ie_split - split an IE buffer according to ordering + * + * @ies: the IE buffer + * @ielen: the length of the IE buffer + * @ids: an array with element IDs that are allowed before + * the split + * @n_ids: the size of the element ID array + * @offset: offset where to start splitting in the buffer + * + * This function splits an IE buffer by updating the @offset + * variable to point to the location where the buffer should be + * split. + * + * It assumes that the given IE buffer is well-formed, this + * has to be guaranteed by the caller! + * + * It also assumes that the IEs in the buffer are ordered + * correctly, if not the result of using this function will not + * be ordered correctly either, i.e. it does no reordering. + * + * The function returns the offset where the next part of the + * buffer starts, which may be @ielen if the entire (remainder) + * of the buffer should be used. + */ +size_t ieee80211_ie_split(const u8 *ies, size_t ielen, + const u8 *ids, int n_ids, size_t offset) +{ + size_t pos = offset; + + while (pos < ielen && ieee80211_id_in_list(ids, n_ids, ies[pos])) + pos += 2 + ies[pos + 1]; + + return pos; +} + +size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset) +{ + size_t pos = offset; + + while (pos < ielen && ies[pos] != WLAN_EID_VENDOR_SPECIFIC) + pos += 2 + ies[pos + 1]; + + return pos; +} diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c new file mode 100644 index 00000000..a1c6bfd5 --- /dev/null +++ b/net/mac80211/wep.c @@ -0,0 +1,342 @@ +/* + * Software WEP encryption implementation + * Copyright 2002, Jouni Malinen + * Copyright 2003, Instant802 Networks, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
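/*
 * Usage sketch for ieee80211_ie_split() above (a standalone copy, not the
 * exported symbol): information elements form a flat TLV stream of
 * (id, len, data) triples, and the split point is the first element whose
 * id is not in the allowed "before" list.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static bool id_in_list(const uint8_t *ids, int n_ids, uint8_t id)
{
        int i;

        for (i = 0; i < n_ids; i++)
                if (ids[i] == id)
                        return true;
        return false;
}

static size_t ie_split(const uint8_t *ies, size_t ielen,
                       const uint8_t *ids, int n_ids, size_t offset)
{
        size_t pos = offset;

        while (pos < ielen && id_in_list(ids, n_ids, ies[pos]))
                pos += 2 + ies[pos + 1];        /* skip id, len and payload */
        return pos;
}

int main(void)
{
        /* SSID "ab" (id 0), Supported Rates (id 1), DS Params (id 3) */
        const uint8_t ies[] = { 0, 2, 'a', 'b',  1, 1, 0x82,  3, 1, 6 };
        const uint8_t before[] = { 0, 1 };      /* allow SSID and rates */

        /* prints 7: the DS Parameter Set element starts the second part */
        printf("%zu\n", ie_split(ies, sizeof(ies), before, 2, 0));
        return 0;
}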
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "ieee80211_i.h" +#include "wep.h" + + +int ieee80211_wep_init(struct ieee80211_local *local) +{ + /* start WEP IV from a random value */ + get_random_bytes(&local->wep_iv, WEP_IV_LEN); + + local->wep_tx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(local->wep_tx_tfm)) { + local->wep_rx_tfm = ERR_PTR(-EINVAL); + return PTR_ERR(local->wep_tx_tfm); + } + + local->wep_rx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(local->wep_rx_tfm)) { + crypto_free_cipher(local->wep_tx_tfm); + local->wep_tx_tfm = ERR_PTR(-EINVAL); + return PTR_ERR(local->wep_rx_tfm); + } + + return 0; +} + +void ieee80211_wep_free(struct ieee80211_local *local) +{ + if (!IS_ERR(local->wep_tx_tfm)) + crypto_free_cipher(local->wep_tx_tfm); + if (!IS_ERR(local->wep_rx_tfm)) + crypto_free_cipher(local->wep_rx_tfm); +} + +static inline bool ieee80211_wep_weak_iv(u32 iv, int keylen) +{ + /* + * Fluhrer, Mantin, and Shamir have reported weaknesses in the + * key scheduling algorithm of RC4. At least IVs (KeyByte + 3, + * 0xff, N) can be used to speedup attacks, so avoid using them. + */ + if ((iv & 0xff00) == 0xff00) { + u8 B = (iv >> 16) & 0xff; + if (B >= 3 && B < 3 + keylen) + return true; + } + return false; +} + + +static void ieee80211_wep_get_iv(struct ieee80211_local *local, + int keylen, int keyidx, u8 *iv) +{ + local->wep_iv++; + if (ieee80211_wep_weak_iv(local->wep_iv, keylen)) + local->wep_iv += 0x0100; + + if (!iv) + return; + + *iv++ = (local->wep_iv >> 16) & 0xff; + *iv++ = (local->wep_iv >> 8) & 0xff; + *iv++ = local->wep_iv & 0xff; + *iv++ = keyidx << 6; +} + + +static u8 *ieee80211_wep_add_iv(struct ieee80211_local *local, + struct sk_buff *skb, + int keylen, int keyidx) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + unsigned int hdrlen; + u8 *newhdr; + + hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); + + if (WARN_ON(skb_tailroom(skb) < WEP_ICV_LEN || + skb_headroom(skb) < WEP_IV_LEN)) + return NULL; + + hdrlen = ieee80211_hdrlen(hdr->frame_control); + newhdr = skb_push(skb, WEP_IV_LEN); + memmove(newhdr, newhdr + WEP_IV_LEN, hdrlen); + ieee80211_wep_get_iv(local, keylen, keyidx, newhdr + hdrlen); + return newhdr + hdrlen; +} + + +static void ieee80211_wep_remove_iv(struct ieee80211_local *local, + struct sk_buff *skb, + struct ieee80211_key *key) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + unsigned int hdrlen; + + hdrlen = ieee80211_hdrlen(hdr->frame_control); + memmove(skb->data + WEP_IV_LEN, skb->data, hdrlen); + skb_pull(skb, WEP_IV_LEN); +} + + +/* Perform WEP encryption using given key. data buffer must have tailroom + * for 4-byte ICV. data_len must not include this ICV. Note: this function + * does _not_ add IV. data = RC4(data | CRC32(data)) */ +int ieee80211_wep_encrypt_data(struct crypto_cipher *tfm, u8 *rc4key, + size_t klen, u8 *data, size_t data_len) +{ + __le32 icv; + int i; + + if (IS_ERR(tfm)) + return -1; + + icv = cpu_to_le32(~crc32_le(~0, data, data_len)); + put_unaligned(icv, (__le32 *)(data + data_len)); + + crypto_cipher_setkey(tfm, rc4key, klen); + for (i = 0; i < data_len + WEP_ICV_LEN; i++) + crypto_cipher_encrypt_one(tfm, data + i, data + i); + + return 0; +} + + +/* Perform WEP encryption on given skb. 4 bytes of extra space (IV) in the + * beginning of the buffer 4 bytes of extra space (ICV) in the end of the + * buffer will be added. 
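/*
 * Standalone sketch of the WEP integrity and keying layout handled by
 * ieee80211_wep_encrypt() / ieee80211_wep_encrypt_data() above: the ICV is
 * the CRC-32 of the plaintext (computed as ~crc32_le(~0, ...)), appended
 * little-endian, and the per-packet RC4 key is the 3-byte IV followed by
 * the secret key.  The RC4 pass itself is omitted here.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc32_le(uint32_t crc, const uint8_t *p, size_t len)
{
        size_t i;
        int b;

        for (i = 0; i < len; i++) {
                crc ^= p[i];
                for (b = 0; b < 8; b++)
                        crc = (crc >> 1) ^ (0xedb88320u & -(crc & 1u));
        }
        return crc;
}

/* append the 4-byte ICV; buf must have room for len + 4 bytes */
static void wep_append_icv(uint8_t *buf, size_t len)
{
        uint32_t icv = ~crc32_le(~0u, buf, len);
        int i;

        for (i = 0; i < 4; i++)                 /* little-endian on the air */
                buf[len + i] = (icv >> (8 * i)) & 0xff;
}

int main(void)
{
        uint8_t frame[9 + 4] = "123456789";
        uint8_t iv[3] = { 0x01, 0x02, 0x03 };
        uint8_t secret[5] = { 'k', 'e', 'y', '4', '0' };
        uint8_t rc4key[3 + 5];

        wep_append_icv(frame, 9);
        printf("ICV bytes: %02x %02x %02x %02x\n",      /* 26 39 f4 cb */
               frame[9], frame[10], frame[11], frame[12]);

        /* per-packet key: 24-bit IV prepended to the secret key */
        memcpy(rc4key, iv, 3);
        memcpy(rc4key + 3, secret, 5);
        printf("per-packet key starts %02x %02x %02x %02x\n",   /* 01 02 03 6b */
               rc4key[0], rc4key[1], rc4key[2], rc4key[3]);
        return 0;
}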
Both IV and ICV will be transmitted, so the + * payload length increases with 8 bytes. + * + * WEP frame payload: IV + TX key idx, RC4(data), ICV = RC4(CRC32(data)) + */ +int ieee80211_wep_encrypt(struct ieee80211_local *local, + struct sk_buff *skb, + const u8 *key, int keylen, int keyidx) +{ + u8 *iv; + size_t len; + u8 rc4key[3 + WLAN_KEY_LEN_WEP104]; + + iv = ieee80211_wep_add_iv(local, skb, keylen, keyidx); + if (!iv) + return -1; + + len = skb->len - (iv + WEP_IV_LEN - skb->data); + + /* Prepend 24-bit IV to RC4 key */ + memcpy(rc4key, iv, 3); + + /* Copy rest of the WEP key (the secret part) */ + memcpy(rc4key + 3, key, keylen); + + /* Add room for ICV */ + skb_put(skb, WEP_ICV_LEN); + + return ieee80211_wep_encrypt_data(local->wep_tx_tfm, rc4key, keylen + 3, + iv + WEP_IV_LEN, len); +} + + +/* Perform WEP decryption using given key. data buffer includes encrypted + * payload, including 4-byte ICV, but _not_ IV. data_len must not include ICV. + * Return 0 on success and -1 on ICV mismatch. */ +int ieee80211_wep_decrypt_data(struct crypto_cipher *tfm, u8 *rc4key, + size_t klen, u8 *data, size_t data_len) +{ + __le32 crc; + int i; + + if (IS_ERR(tfm)) + return -1; + + crypto_cipher_setkey(tfm, rc4key, klen); + for (i = 0; i < data_len + WEP_ICV_LEN; i++) + crypto_cipher_decrypt_one(tfm, data + i, data + i); + + crc = cpu_to_le32(~crc32_le(~0, data, data_len)); + if (memcmp(&crc, data + data_len, WEP_ICV_LEN) != 0) + /* ICV mismatch */ + return -1; + + return 0; +} + + +/* Perform WEP decryption on given skb. Buffer includes whole WEP part of + * the frame: IV (4 bytes), encrypted payload (including SNAP header), + * ICV (4 bytes). skb->len includes both IV and ICV. + * + * Returns 0 if frame was decrypted successfully and ICV was correct and -1 on + * failure. If frame is OK, IV and ICV will be removed, i.e., decrypted payload + * is moved to the beginning of the skb and skb length will be reduced. 
+ */ +static int ieee80211_wep_decrypt(struct ieee80211_local *local, + struct sk_buff *skb, + struct ieee80211_key *key) +{ + u32 klen; + u8 rc4key[3 + WLAN_KEY_LEN_WEP104]; + u8 keyidx; + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + unsigned int hdrlen; + size_t len; + int ret = 0; + + if (!ieee80211_has_protected(hdr->frame_control)) + return -1; + + hdrlen = ieee80211_hdrlen(hdr->frame_control); + if (skb->len < hdrlen + WEP_IV_LEN + WEP_ICV_LEN) + return -1; + + len = skb->len - hdrlen - WEP_IV_LEN - WEP_ICV_LEN; + + keyidx = skb->data[hdrlen + 3] >> 6; + + if (!key || keyidx != key->conf.keyidx) + return -1; + + klen = 3 + key->conf.keylen; + + /* Prepend 24-bit IV to RC4 key */ + memcpy(rc4key, skb->data + hdrlen, 3); + + /* Copy rest of the WEP key (the secret part) */ + memcpy(rc4key + 3, key->conf.key, key->conf.keylen); + + if (ieee80211_wep_decrypt_data(local->wep_rx_tfm, rc4key, klen, + skb->data + hdrlen + WEP_IV_LEN, + len)) + ret = -1; + + /* Trim ICV */ + skb_trim(skb, skb->len - WEP_ICV_LEN); + + /* Remove IV */ + memmove(skb->data + WEP_IV_LEN, skb->data, hdrlen); + skb_pull(skb, WEP_IV_LEN); + + return ret; +} + + +bool ieee80211_wep_is_weak_iv(struct sk_buff *skb, struct ieee80211_key *key) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + unsigned int hdrlen; + u8 *ivpos; + u32 iv; + + if (!ieee80211_has_protected(hdr->frame_control)) + return false; + + hdrlen = ieee80211_hdrlen(hdr->frame_control); + ivpos = skb->data + hdrlen; + iv = (ivpos[0] << 16) | (ivpos[1] << 8) | ivpos[2]; + + return ieee80211_wep_weak_iv(iv, key->conf.keylen); +} + +ieee80211_rx_result +ieee80211_crypto_wep_decrypt(struct ieee80211_rx_data *rx) +{ + struct sk_buff *skb = rx->skb; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + + if (!ieee80211_is_data(hdr->frame_control) && + !ieee80211_is_auth(hdr->frame_control)) + return RX_CONTINUE; + + if (!(status->flag & RX_FLAG_DECRYPTED)) { + if (ieee80211_wep_decrypt(rx->local, rx->skb, rx->key)) + return RX_DROP_UNUSABLE; + } else if (!(status->flag & RX_FLAG_IV_STRIPPED)) { + ieee80211_wep_remove_iv(rx->local, rx->skb, rx->key); + /* remove ICV */ + skb_trim(rx->skb, rx->skb->len - WEP_ICV_LEN); + } + + return RX_CONTINUE; +} + +static int wep_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) +{ + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + + if (!info->control.hw_key) { + if (ieee80211_wep_encrypt(tx->local, skb, tx->key->conf.key, + tx->key->conf.keylen, + tx->key->conf.keyidx)) + return -1; + } else if (info->control.hw_key->flags & + IEEE80211_KEY_FLAG_GENERATE_IV) { + if (!ieee80211_wep_add_iv(tx->local, skb, + tx->key->conf.keylen, + tx->key->conf.keyidx)) + return -1; + } + + return 0; +} + +ieee80211_tx_result +ieee80211_crypto_wep_encrypt(struct ieee80211_tx_data *tx) +{ + struct sk_buff *skb; + + ieee80211_tx_set_protected(tx); + + skb = tx->skb; + do { + if (wep_encrypt_skb(tx, skb) < 0) { + I802_DEBUG_INC(tx->local->tx_handlers_drop_wep); + return TX_DROP; + } + } while ((skb = skb->next)); + + return TX_CONTINUE; +} diff --git a/net/mac80211/wep.h b/net/mac80211/wep.h new file mode 100644 index 00000000..01e54840 --- /dev/null +++ b/net/mac80211/wep.h @@ -0,0 +1,35 @@ +/* + * Software WEP encryption implementation + * Copyright 2002, Jouni Malinen + * Copyright 2003, Instant802 Networks, Inc. 
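/*
 * Standalone sketch of the FMS weak-IV test used by ieee80211_wep_weak_iv()
 * and ieee80211_wep_is_weak_iv() above: IVs of the form (B + 3, 0xff, N)
 * with B inside the key length leak RC4 key bytes, so any IV whose middle
 * octet is 0xff and whose high octet falls in [3, 3 + keylen) is flagged.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* iv holds the three IV octets as (iv[0] << 16) | (iv[1] << 8) | iv[2] */
static bool wep_weak_iv(uint32_t iv, int keylen)
{
        if ((iv & 0xff00) == 0xff00) {
                uint8_t b = (iv >> 16) & 0xff;

                if (b >= 3 && b < 3 + keylen)
                        return true;
        }
        return false;
}

int main(void)
{
        /* 5-byte (WEP-40) key: high octet 3..7 with 0xff in the middle */
        printf("%d %d\n", wep_weak_iv(0x03ff00, 5),     /* 1: weak */
               wep_weak_iv(0x08ff00, 5));               /* 0: outside key */
        return 0;
}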
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef WEP_H +#define WEP_H + +#include +#include +#include "ieee80211_i.h" +#include "key.h" + +int ieee80211_wep_init(struct ieee80211_local *local); +void ieee80211_wep_free(struct ieee80211_local *local); +int ieee80211_wep_encrypt_data(struct crypto_cipher *tfm, u8 *rc4key, + size_t klen, u8 *data, size_t data_len); +int ieee80211_wep_encrypt(struct ieee80211_local *local, + struct sk_buff *skb, + const u8 *key, int keylen, int keyidx); +int ieee80211_wep_decrypt_data(struct crypto_cipher *tfm, u8 *rc4key, + size_t klen, u8 *data, size_t data_len); +bool ieee80211_wep_is_weak_iv(struct sk_buff *skb, struct ieee80211_key *key); + +ieee80211_rx_result +ieee80211_crypto_wep_decrypt(struct ieee80211_rx_data *rx); +ieee80211_tx_result +ieee80211_crypto_wep_encrypt(struct ieee80211_tx_data *tx); + +#endif /* WEP_H */ diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c new file mode 100644 index 00000000..28bc084d --- /dev/null +++ b/net/mac80211/wme.c @@ -0,0 +1,160 @@ +/* + * Copyright 2004, Instant802 Networks, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "ieee80211_i.h" +#include "wme.h" + +/* Default mapping in classifier to work with default + * queue setup. + */ +const int ieee802_1d_to_ac[8] = { + IEEE80211_AC_BE, + IEEE80211_AC_BK, + IEEE80211_AC_BK, + IEEE80211_AC_BE, + IEEE80211_AC_VI, + IEEE80211_AC_VI, + IEEE80211_AC_VO, + IEEE80211_AC_VO +}; + +static int wme_downgrade_ac(struct sk_buff *skb) +{ + switch (skb->priority) { + case 6: + case 7: + skb->priority = 5; /* VO -> VI */ + return 0; + case 4: + case 5: + skb->priority = 3; /* VI -> BE */ + return 0; + case 0: + case 3: + skb->priority = 2; /* BE -> BK */ + return 0; + default: + return -1; + } +} + + +/* Indicate which queue to use. */ +u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_local *local = sdata->local; + struct sta_info *sta = NULL; + const u8 *ra = NULL; + bool qos = false; + + if (local->hw.queues < 4 || skb->len < 6) { + skb->priority = 0; /* required for correct WPA/11i MIC */ + return min_t(u16, local->hw.queues - 1, IEEE80211_AC_BE); + } + + rcu_read_lock(); + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP_VLAN: + sta = rcu_dereference(sdata->u.vlan.sta); + if (sta) { + qos = get_sta_flags(sta) & WLAN_STA_WME; + break; + } + case NL80211_IFTYPE_AP: + ra = skb->data; + break; + case NL80211_IFTYPE_WDS: + ra = sdata->u.wds.remote_addr; + break; +#ifdef CONFIG_MAC80211_MESH + case NL80211_IFTYPE_MESH_POINT: + /* + * XXX: This is clearly broken ... but already was before, + * because ieee80211_fill_mesh_addresses() would clear A1 + * except for multicast addresses. 
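/*
 * Self-contained sketch of the queue selection above: the 802.1d user
 * priority picks an access category through ieee802_1d_to_ac[], and if the
 * chosen AC requires admission control (ACM) the priority is walked down
 * VO -> VI -> BE -> BK, mirroring wme_downgrade_ac().  The AC numbering
 * here is illustrative.
 */
#include <stdio.h>

enum { AC_VO, AC_VI, AC_BE, AC_BK };

static const int prio_to_ac[8] = {
        AC_BE, AC_BK, AC_BK, AC_BE, AC_VI, AC_VI, AC_VO, AC_VO
};

static int downgrade(int prio)
{
        switch (prio) {
        case 6: case 7: return 5;       /* VO -> VI */
        case 4: case 5: return 3;       /* VI -> BE */
        case 0: case 3: return 2;       /* BE -> BK */
        default:        return -1;      /* already BK, nowhere to go */
        }
}

static int select_ac(int prio, unsigned int acm_prio_mask)
{
        while (acm_prio_mask & (1u << prio)) {
                int lower = downgrade(prio);

                if (lower < 0)
                        break;          /* fall back to sending anyway */
                prio = lower;
        }
        return prio_to_ac[prio];
}

int main(void)
{
        /* voice priority, but priorities 6 and 7 require admission */
        printf("%d\n", select_ac(6, (1u << 6) | (1u << 7)));    /* 1 == AC_VI */
        return 0;
}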
+ */ + break; +#endif + case NL80211_IFTYPE_STATION: + ra = sdata->u.mgd.bssid; + break; + case NL80211_IFTYPE_ADHOC: + ra = skb->data; + break; + default: + break; + } + + if (!sta && ra && !is_multicast_ether_addr(ra)) { + sta = sta_info_get(sdata, ra); + if (sta) + qos = get_sta_flags(sta) & WLAN_STA_WME; + } + rcu_read_unlock(); + + if (!qos) { + skb->priority = 0; /* required for correct WPA/11i MIC */ + return IEEE80211_AC_BE; + } + + /* use the data classifier to determine what 802.1d tag the + * data frame has */ + skb->priority = cfg80211_classify8021d(skb); + + return ieee80211_downgrade_queue(local, skb); +} + +u16 ieee80211_downgrade_queue(struct ieee80211_local *local, + struct sk_buff *skb) +{ + /* in case we are a client verify acm is not set for this ac */ + while (unlikely(local->wmm_acm & BIT(skb->priority))) { + if (wme_downgrade_ac(skb)) { + /* + * This should not really happen. The AP has marked all + * lower ACs to require admission control which is not + * a reasonable configuration. Allow the frame to be + * transmitted using AC_BK as a workaround. + */ + break; + } + } + + /* look up which queue to use for frames with this 1d tag */ + return ieee802_1d_to_ac[skb->priority]; +} + +void ieee80211_set_qos_hdr(struct ieee80211_local *local, struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (void *)skb->data; + + /* Fill in the QoS header if there is one. */ + if (ieee80211_is_data_qos(hdr->frame_control)) { + u8 *p = ieee80211_get_qos_ctl(hdr); + u8 ack_policy = 0, tid; + + tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; + + if (unlikely(local->wifi_wme_noack_test)) + ack_policy |= QOS_CONTROL_ACK_POLICY_NOACK << + QOS_CONTROL_ACK_POLICY_SHIFT; + /* qos header is 2 bytes, second reserved */ + *p++ = ack_policy | tid; + *p = 0; + } +} diff --git a/net/mac80211/wme.h b/net/mac80211/wme.h new file mode 100644 index 00000000..6053b1c9 --- /dev/null +++ b/net/mac80211/wme.h @@ -0,0 +1,30 @@ +/* + * Copyright 2004, Instant802 Networks, Inc. + * Copyright 2005, Devicescape Software, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _WME_H +#define _WME_H + +#include +#include "ieee80211_i.h" + +#define QOS_CONTROL_ACK_POLICY_NORMAL 0 +#define QOS_CONTROL_ACK_POLICY_NOACK 1 + +#define QOS_CONTROL_ACK_POLICY_SHIFT 5 + +extern const int ieee802_1d_to_ac[8]; + +u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); +void ieee80211_set_qos_hdr(struct ieee80211_local *local, struct sk_buff *skb); +u16 ieee80211_downgrade_queue(struct ieee80211_local *local, + struct sk_buff *skb); + + +#endif /* _WME_H */ diff --git a/net/mac80211/work.c b/net/mac80211/work.c new file mode 100644 index 00000000..52b758db --- /dev/null +++ b/net/mac80211/work.c @@ -0,0 +1,1276 @@ +/* + * mac80211 work implementation + * + * Copyright 2003-2008, Jouni Malinen + * Copyright 2004, Instant802 Networks, Inc. + * Copyright 2005, Devicescape Software, Inc. + * Copyright 2006-2007 Jiri Benc + * Copyright 2007, Michael Wu + * Copyright 2009, Johannes Berg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
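/*
 * Sketch of the 2-byte QoS Control field written by ieee80211_set_qos_hdr()
 * above: the TID taken from the 802.1d tag goes in the low bits of the
 * first octet, the ack policy occupies bits 5-6, and the second octet is
 * left zero.  The constants mirror the ones used in wme.c/wme.h but are
 * redefined locally for the example.
 */
#include <stdint.h>
#include <stdio.h>

#define QOS_CTL_TAG1D_MASK              0x07
#define QOS_CTL_ACK_POLICY_NOACK        1
#define QOS_CTL_ACK_POLICY_SHIFT        5

static void fill_qos_ctl(uint8_t qos[2], int priority, int noack)
{
        uint8_t tid = priority & QOS_CTL_TAG1D_MASK;
        uint8_t ack_policy = 0;

        if (noack)
                ack_policy |= QOS_CTL_ACK_POLICY_NOACK <<
                              QOS_CTL_ACK_POLICY_SHIFT;

        qos[0] = ack_policy | tid;      /* TID plus ack policy */
        qos[1] = 0;                     /* reserved here */
}

int main(void)
{
        uint8_t qos[2];

        fill_qos_ctl(qos, 6, 1);
        printf("%02x %02x\n", qos[0], qos[1]);  /* 26 00 */
        return 0;
}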
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ieee80211_i.h" +#include "rate.h" + +#define IEEE80211_AUTH_TIMEOUT (HZ / 5) +#define IEEE80211_AUTH_MAX_TRIES 3 +#define IEEE80211_ASSOC_TIMEOUT (HZ / 5) +#define IEEE80211_ASSOC_MAX_TRIES 3 + +enum work_action { + WORK_ACT_MISMATCH, + WORK_ACT_NONE, + WORK_ACT_TIMEOUT, + WORK_ACT_DONE, +}; + + +/* utils */ +static inline void ASSERT_WORK_MTX(struct ieee80211_local *local) +{ + lockdep_assert_held(&local->mtx); +} + +/* + * We can have multiple work items (and connection probing) + * scheduling this timer, but we need to take care to only + * reschedule it when it should fire _earlier_ than it was + * asked for before, or if it's not pending right now. This + * function ensures that. Note that it then is required to + * run this function for all timeouts after the first one + * has happened -- the work that runs from this timer will + * do that. + */ +static void run_again(struct ieee80211_local *local, + unsigned long timeout) +{ + ASSERT_WORK_MTX(local); + + if (!timer_pending(&local->work_timer) || + time_before(timeout, local->work_timer.expires)) + mod_timer(&local->work_timer, timeout); +} + +void free_work(struct ieee80211_work *wk) +{ + kfree_rcu(wk, rcu_head); +} + +static int ieee80211_compatible_rates(const u8 *supp_rates, int supp_rates_len, + struct ieee80211_supported_band *sband, + u32 *rates) +{ + int i, j, count; + *rates = 0; + count = 0; + for (i = 0; i < supp_rates_len; i++) { + int rate = (supp_rates[i] & 0x7F) * 5; + + for (j = 0; j < sband->n_bitrates; j++) + if (sband->bitrates[j].bitrate == rate) { + *rates |= BIT(j); + count++; + break; + } + } + + return count; +} + +/* frame sending functions */ + +static void ieee80211_add_ht_ie(struct sk_buff *skb, const u8 *ht_info_ie, + struct ieee80211_supported_band *sband, + struct ieee80211_channel *channel, + enum ieee80211_smps_mode smps) +{ + struct ieee80211_ht_info *ht_info; + u8 *pos; + u32 flags = channel->flags; + u16 cap = sband->ht_cap.cap; + __le16 tmp; + + if (!sband->ht_cap.ht_supported) + return; + + if (!ht_info_ie) + return; + + if (ht_info_ie[1] < sizeof(struct ieee80211_ht_info)) + return; + + ht_info = (struct ieee80211_ht_info *)(ht_info_ie + 2); + + /* determine capability flags */ + + switch (ht_info->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) { + case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: + if (flags & IEEE80211_CHAN_NO_HT40PLUS) { + cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; + cap &= ~IEEE80211_HT_CAP_SGI_40; + } + break; + case IEEE80211_HT_PARAM_CHA_SEC_BELOW: + if (flags & IEEE80211_CHAN_NO_HT40MINUS) { + cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; + cap &= ~IEEE80211_HT_CAP_SGI_40; + } + break; + } + + /* set SM PS mode properly */ + cap &= ~IEEE80211_HT_CAP_SM_PS; + switch (smps) { + case IEEE80211_SMPS_AUTOMATIC: + case IEEE80211_SMPS_NUM_MODES: + WARN_ON(1); + case IEEE80211_SMPS_OFF: + cap |= WLAN_HT_CAP_SM_PS_DISABLED << + IEEE80211_HT_CAP_SM_PS_SHIFT; + break; + case IEEE80211_SMPS_STATIC: + cap |= WLAN_HT_CAP_SM_PS_STATIC << + IEEE80211_HT_CAP_SM_PS_SHIFT; + break; + case IEEE80211_SMPS_DYNAMIC: + cap |= WLAN_HT_CAP_SM_PS_DYNAMIC << + IEEE80211_HT_CAP_SM_PS_SHIFT; + break; + } + + /* reserve and fill IE */ + + pos = skb_put(skb, sizeof(struct ieee80211_ht_cap) + 2); + *pos++ = WLAN_EID_HT_CAPABILITY; + *pos++ = sizeof(struct ieee80211_ht_cap); + memset(pos, 0, sizeof(struct ieee80211_ht_cap)); + + /* capability flags */ + tmp = cpu_to_le16(cap); + memcpy(pos, &tmp, sizeof(u16)); + 
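/*
 * Layout sketch for the HT Capabilities element being filled in above
 * (sizes only, no kernel types): the element body is 26 octets, which is
 * what the successive "pos +=" steps in ieee80211_add_ht_ie() and
 * ieee80211_build_preq_ies() walk across.
 */
#include <stdio.h>

struct ht_cap_layout {
        int cap_info;           /* 2: capability info (filled in) */
        int ampdu_params;       /* 1: A-MPDU factor/density (filled in) */
        int mcs_set;            /* 16: supported MCS set (filled in) */
        int ext_cap;            /* 2: extended capabilities (left zero) */
        int tx_bf;              /* 4: transmit beamforming (left zero) */
        int asel;               /* 1: antenna selection (left zero) */
};

int main(void)
{
        struct ht_cap_layout l = { 2, 1, 16, 2, 4, 1 };
        int body = l.cap_info + l.ampdu_params + l.mcs_set +
                   l.ext_cap + l.tx_bf + l.asel;

        /* 26-octet body, 28 with the 2-octet element header */
        printf("body=%d total=%d\n", body, body + 2);
        return 0;
}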
pos += sizeof(u16); + + /* AMPDU parameters */ + *pos++ = sband->ht_cap.ampdu_factor | + (sband->ht_cap.ampdu_density << + IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT); + + /* MCS set */ + memcpy(pos, &sband->ht_cap.mcs, sizeof(sband->ht_cap.mcs)); + pos += sizeof(sband->ht_cap.mcs); + + /* extended capabilities */ + pos += sizeof(__le16); + + /* BF capabilities */ + pos += sizeof(__le32); + + /* antenna selection */ + pos += sizeof(u8); +} + +static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata, + struct ieee80211_work *wk) +{ + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct ieee80211_mgmt *mgmt; + u8 *pos, qos_info; + size_t offset = 0, noffset; + int i, count, rates_len, supp_rates_len; + u16 capab; + struct ieee80211_supported_band *sband; + u32 rates = 0; + + sband = local->hw.wiphy->bands[wk->chan->band]; + + if (wk->assoc.supp_rates_len) { + /* + * Get all rates supported by the device and the AP as + * some APs don't like getting a superset of their rates + * in the association request (e.g. D-Link DAP 1353 in + * b-only mode)... + */ + rates_len = ieee80211_compatible_rates(wk->assoc.supp_rates, + wk->assoc.supp_rates_len, + sband, &rates); + } else { + /* + * In case AP not provide any supported rates information + * before association, we send information element(s) with + * all rates that we support. + */ + rates = ~0; + rates_len = sband->n_bitrates; + } + + skb = alloc_skb(local->hw.extra_tx_headroom + + sizeof(*mgmt) + /* bit too much but doesn't matter */ + 2 + wk->assoc.ssid_len + /* SSID */ + 4 + rates_len + /* (extended) rates */ + 4 + /* power capability */ + 2 + 2 * sband->n_channels + /* supported channels */ + 2 + sizeof(struct ieee80211_ht_cap) + /* HT */ + wk->ie_len + /* extra IEs */ + 9, /* WMM */ + GFP_KERNEL); + if (!skb) { + printk(KERN_DEBUG "%s: failed to allocate buffer for assoc " + "frame\n", sdata->name); + return; + } + skb_reserve(skb, local->hw.extra_tx_headroom); + + capab = WLAN_CAPABILITY_ESS; + + if (sband->band == IEEE80211_BAND_2GHZ) { + if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE)) + capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME; + if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE)) + capab |= WLAN_CAPABILITY_SHORT_PREAMBLE; + } + + if (wk->assoc.capability & WLAN_CAPABILITY_PRIVACY) + capab |= WLAN_CAPABILITY_PRIVACY; + + if ((wk->assoc.capability & WLAN_CAPABILITY_SPECTRUM_MGMT) && + (local->hw.flags & IEEE80211_HW_SPECTRUM_MGMT)) + capab |= WLAN_CAPABILITY_SPECTRUM_MGMT; + + mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24); + memset(mgmt, 0, 24); + memcpy(mgmt->da, wk->filter_ta, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + memcpy(mgmt->bssid, wk->filter_ta, ETH_ALEN); + + if (!is_zero_ether_addr(wk->assoc.prev_bssid)) { + skb_put(skb, 10); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_REASSOC_REQ); + mgmt->u.reassoc_req.capab_info = cpu_to_le16(capab); + mgmt->u.reassoc_req.listen_interval = + cpu_to_le16(local->hw.conf.listen_interval); + memcpy(mgmt->u.reassoc_req.current_ap, wk->assoc.prev_bssid, + ETH_ALEN); + } else { + skb_put(skb, 4); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ASSOC_REQ); + mgmt->u.assoc_req.capab_info = cpu_to_le16(capab); + mgmt->u.assoc_req.listen_interval = + cpu_to_le16(local->hw.conf.listen_interval); + } + + /* SSID */ + pos = skb_put(skb, 2 + wk->assoc.ssid_len); + *pos++ = WLAN_EID_SSID; + *pos++ = wk->assoc.ssid_len; + memcpy(pos, wk->assoc.ssid, 
wk->assoc.ssid_len); + + /* add all rates which were marked to be used above */ + supp_rates_len = rates_len; + if (supp_rates_len > 8) + supp_rates_len = 8; + + pos = skb_put(skb, supp_rates_len + 2); + *pos++ = WLAN_EID_SUPP_RATES; + *pos++ = supp_rates_len; + + count = 0; + for (i = 0; i < sband->n_bitrates; i++) { + if (BIT(i) & rates) { + int rate = sband->bitrates[i].bitrate; + *pos++ = (u8) (rate / 5); + if (++count == 8) + break; + } + } + + if (rates_len > count) { + pos = skb_put(skb, rates_len - count + 2); + *pos++ = WLAN_EID_EXT_SUPP_RATES; + *pos++ = rates_len - count; + + for (i++; i < sband->n_bitrates; i++) { + if (BIT(i) & rates) { + int rate = sband->bitrates[i].bitrate; + *pos++ = (u8) (rate / 5); + } + } + } + + if (capab & WLAN_CAPABILITY_SPECTRUM_MGMT) { + /* 1. power capabilities */ + pos = skb_put(skb, 4); + *pos++ = WLAN_EID_PWR_CAPABILITY; + *pos++ = 2; + *pos++ = 0; /* min tx power */ + *pos++ = wk->chan->max_power; /* max tx power */ + + /* 2. supported channels */ + /* TODO: get this in reg domain format */ + pos = skb_put(skb, 2 * sband->n_channels + 2); + *pos++ = WLAN_EID_SUPPORTED_CHANNELS; + *pos++ = 2 * sband->n_channels; + for (i = 0; i < sband->n_channels; i++) { + *pos++ = ieee80211_frequency_to_channel( + sband->channels[i].center_freq); + *pos++ = 1; /* one channel in the subband*/ + } + } + + /* if present, add any custom IEs that go before HT */ + if (wk->ie_len && wk->ie) { + static const u8 before_ht[] = { + WLAN_EID_SSID, + WLAN_EID_SUPP_RATES, + WLAN_EID_EXT_SUPP_RATES, + WLAN_EID_PWR_CAPABILITY, + WLAN_EID_SUPPORTED_CHANNELS, + WLAN_EID_RSN, + WLAN_EID_QOS_CAPA, + WLAN_EID_RRM_ENABLED_CAPABILITIES, + WLAN_EID_MOBILITY_DOMAIN, + WLAN_EID_SUPPORTED_REGULATORY_CLASSES, + }; + noffset = ieee80211_ie_split(wk->ie, wk->ie_len, + before_ht, ARRAY_SIZE(before_ht), + offset); + pos = skb_put(skb, noffset - offset); + memcpy(pos, wk->ie + offset, noffset - offset); + offset = noffset; + } + + if (wk->assoc.use_11n && wk->assoc.wmm_used && + local->hw.queues >= 4) + ieee80211_add_ht_ie(skb, wk->assoc.ht_information_ie, + sband, wk->chan, wk->assoc.smps); + + /* if present, add any custom non-vendor IEs that go after HT */ + if (wk->ie_len && wk->ie) { + noffset = ieee80211_ie_split_vendor(wk->ie, wk->ie_len, + offset); + pos = skb_put(skb, noffset - offset); + memcpy(pos, wk->ie + offset, noffset - offset); + offset = noffset; + } + + if (wk->assoc.wmm_used && local->hw.queues >= 4) { + if (wk->assoc.uapsd_used) { + qos_info = local->uapsd_queues; + qos_info |= (local->uapsd_max_sp_len << + IEEE80211_WMM_IE_STA_QOSINFO_SP_SHIFT); + } else { + qos_info = 0; + } + + pos = skb_put(skb, 9); + *pos++ = WLAN_EID_VENDOR_SPECIFIC; + *pos++ = 7; /* len */ + *pos++ = 0x00; /* Microsoft OUI 00:50:F2 */ + *pos++ = 0x50; + *pos++ = 0xf2; + *pos++ = 2; /* WME */ + *pos++ = 0; /* WME info */ + *pos++ = 1; /* WME ver */ + *pos++ = qos_info; + } + + /* add any remaining custom (i.e. 
vendor specific here) IEs */ + if (wk->ie_len && wk->ie) { + noffset = wk->ie_len; + pos = skb_put(skb, noffset - offset); + memcpy(pos, wk->ie + offset, noffset - offset); + } + + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + ieee80211_tx_skb(sdata, skb); +} + +static void ieee80211_remove_auth_bss(struct ieee80211_local *local, + struct ieee80211_work *wk) +{ + struct cfg80211_bss *cbss; + u16 capa_val = WLAN_CAPABILITY_ESS; + + if (wk->probe_auth.privacy) + capa_val |= WLAN_CAPABILITY_PRIVACY; + + cbss = cfg80211_get_bss(local->hw.wiphy, wk->chan, wk->filter_ta, + wk->probe_auth.ssid, wk->probe_auth.ssid_len, + WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_PRIVACY, + capa_val); + if (!cbss) + return; + + cfg80211_unlink_bss(local->hw.wiphy, cbss); + cfg80211_put_bss(cbss); +} + +static enum work_action __must_check +ieee80211_direct_probe(struct ieee80211_work *wk) +{ + struct ieee80211_sub_if_data *sdata = wk->sdata; + struct ieee80211_local *local = sdata->local; + + wk->probe_auth.tries++; + if (wk->probe_auth.tries > IEEE80211_AUTH_MAX_TRIES) { + printk(KERN_DEBUG "%s: direct probe to %pM timed out\n", + sdata->name, wk->filter_ta); + + /* + * Most likely AP is not in the range so remove the + * bss struct for that AP. + */ + ieee80211_remove_auth_bss(local, wk); + + return WORK_ACT_TIMEOUT; + } + + printk(KERN_DEBUG "%s: direct probe to %pM (try %d/%i)\n", + sdata->name, wk->filter_ta, wk->probe_auth.tries, + IEEE80211_AUTH_MAX_TRIES); + + /* + * Direct probe is sent to broadcast address as some APs + * will not answer to direct packet in unassociated state. + */ + ieee80211_send_probe_req(sdata, NULL, wk->probe_auth.ssid, + wk->probe_auth.ssid_len, NULL, 0); + + wk->timeout = jiffies + IEEE80211_AUTH_TIMEOUT; + run_again(local, wk->timeout); + + return WORK_ACT_NONE; +} + + +static enum work_action __must_check +ieee80211_authenticate(struct ieee80211_work *wk) +{ + struct ieee80211_sub_if_data *sdata = wk->sdata; + struct ieee80211_local *local = sdata->local; + + wk->probe_auth.tries++; + if (wk->probe_auth.tries > IEEE80211_AUTH_MAX_TRIES) { + printk(KERN_DEBUG "%s: authentication with %pM" + " timed out\n", sdata->name, wk->filter_ta); + + /* + * Most likely AP is not in the range so remove the + * bss struct for that AP. + */ + ieee80211_remove_auth_bss(local, wk); + + return WORK_ACT_TIMEOUT; + } + + printk(KERN_DEBUG "%s: authenticate with %pM (try %d)\n", + sdata->name, wk->filter_ta, wk->probe_auth.tries); + + ieee80211_send_auth(sdata, 1, wk->probe_auth.algorithm, wk->ie, + wk->ie_len, wk->filter_ta, NULL, 0, 0); + wk->probe_auth.transaction = 2; + + wk->timeout = jiffies + IEEE80211_AUTH_TIMEOUT; + run_again(local, wk->timeout); + + return WORK_ACT_NONE; +} + +static enum work_action __must_check +ieee80211_associate(struct ieee80211_work *wk) +{ + struct ieee80211_sub_if_data *sdata = wk->sdata; + struct ieee80211_local *local = sdata->local; + + wk->assoc.tries++; + if (wk->assoc.tries > IEEE80211_ASSOC_MAX_TRIES) { + printk(KERN_DEBUG "%s: association with %pM" + " timed out\n", + sdata->name, wk->filter_ta); + + /* + * Most likely AP is not in the range so remove the + * bss struct for that AP. 
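/*
 * Minimal sketch of the retry pattern shared by ieee80211_direct_probe(),
 * ieee80211_authenticate() and ieee80211_associate(): bump a try counter,
 * give up after a fixed number of attempts, otherwise (re)send and arm a
 * per-attempt timeout.  Time is simplified to an integer tick.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_TRIES       3
#define TRY_TIMEOUT     5       /* ticks per attempt */

struct attempt {
        int tries;
        int timeout;            /* absolute tick of the next deadline */
};

/* returns false once the operation should be abandoned */
static bool try_once(struct attempt *a, int now)
{
        if (++a->tries > MAX_TRIES)
                return false;   /* caller tears the state down */

        /* "send the frame" would go here */
        a->timeout = now + TRY_TIMEOUT;
        return true;
}

int main(void)
{
        struct attempt a = { 0, 0 };
        int now = 0;

        while (try_once(&a, now))
                now = a.timeout;        /* pretend each attempt timed out */

        printf("gave up at tick %d after %d tries\n", now, a.tries - 1);
        return 0;
}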
+ */ + if (wk->assoc.bss) + cfg80211_unlink_bss(local->hw.wiphy, wk->assoc.bss); + + return WORK_ACT_TIMEOUT; + } + + printk(KERN_DEBUG "%s: associate with %pM (try %d)\n", + sdata->name, wk->filter_ta, wk->assoc.tries); + ieee80211_send_assoc(sdata, wk); + + wk->timeout = jiffies + IEEE80211_ASSOC_TIMEOUT; + run_again(local, wk->timeout); + + return WORK_ACT_NONE; +} + +static enum work_action __must_check +ieee80211_remain_on_channel_timeout(struct ieee80211_work *wk) +{ + /* + * First time we run, do nothing -- the generic code will + * have switched to the right channel etc. + */ + if (!wk->started) { + wk->timeout = jiffies + msecs_to_jiffies(wk->remain.duration); + + cfg80211_ready_on_channel(wk->sdata->dev, (unsigned long) wk, + wk->chan, wk->chan_type, + wk->remain.duration, GFP_KERNEL); + + return WORK_ACT_NONE; + } + + return WORK_ACT_TIMEOUT; +} + +static enum work_action __must_check +ieee80211_offchannel_tx(struct ieee80211_work *wk) +{ + if (!wk->started) { + wk->timeout = jiffies + msecs_to_jiffies(wk->offchan_tx.wait); + + /* + * After this, offchan_tx.frame remains but now is no + * longer a valid pointer -- we still need it as the + * cookie for canceling this work/status matching. + */ + ieee80211_tx_skb(wk->sdata, wk->offchan_tx.frame); + + return WORK_ACT_NONE; + } + + return WORK_ACT_TIMEOUT; +} + +static enum work_action __must_check +ieee80211_assoc_beacon_wait(struct ieee80211_work *wk) +{ + if (wk->started) + return WORK_ACT_TIMEOUT; + + /* + * Wait up to one beacon interval ... + * should this be more if we miss one? + */ + printk(KERN_DEBUG "%s: waiting for beacon from %pM\n", + wk->sdata->name, wk->filter_ta); + wk->timeout = TU_TO_EXP_TIME(wk->assoc.bss->beacon_interval); + return WORK_ACT_NONE; +} + +static void ieee80211_auth_challenge(struct ieee80211_work *wk, + struct ieee80211_mgmt *mgmt, + size_t len) +{ + struct ieee80211_sub_if_data *sdata = wk->sdata; + u8 *pos; + struct ieee802_11_elems elems; + + pos = mgmt->u.auth.variable; + ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), &elems); + if (!elems.challenge) + return; + ieee80211_send_auth(sdata, 3, wk->probe_auth.algorithm, + elems.challenge - 2, elems.challenge_len + 2, + wk->filter_ta, wk->probe_auth.key, + wk->probe_auth.key_len, wk->probe_auth.key_idx); + wk->probe_auth.transaction = 4; +} + +static enum work_action __must_check +ieee80211_rx_mgmt_auth(struct ieee80211_work *wk, + struct ieee80211_mgmt *mgmt, size_t len) +{ + u16 auth_alg, auth_transaction, status_code; + + if (wk->type != IEEE80211_WORK_AUTH) + return WORK_ACT_MISMATCH; + + if (len < 24 + 6) + return WORK_ACT_NONE; + + auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg); + auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction); + status_code = le16_to_cpu(mgmt->u.auth.status_code); + + if (auth_alg != wk->probe_auth.algorithm || + auth_transaction != wk->probe_auth.transaction) + return WORK_ACT_NONE; + + if (status_code != WLAN_STATUS_SUCCESS) { + printk(KERN_DEBUG "%s: %pM denied authentication (status %d)\n", + wk->sdata->name, mgmt->sa, status_code); + return WORK_ACT_DONE; + } + + switch (wk->probe_auth.algorithm) { + case WLAN_AUTH_OPEN: + case WLAN_AUTH_LEAP: + case WLAN_AUTH_FT: + break; + case WLAN_AUTH_SHARED_KEY: + if (wk->probe_auth.transaction != 4) { + ieee80211_auth_challenge(wk, mgmt, len); + /* need another frame */ + return WORK_ACT_NONE; + } + break; + default: + WARN_ON(1); + return WORK_ACT_NONE; + } + + printk(KERN_DEBUG "%s: authenticated\n", wk->sdata->name); + return WORK_ACT_DONE; +} + +static 
enum work_action __must_check +ieee80211_rx_mgmt_assoc_resp(struct ieee80211_work *wk, + struct ieee80211_mgmt *mgmt, size_t len, + bool reassoc) +{ + struct ieee80211_sub_if_data *sdata = wk->sdata; + struct ieee80211_local *local = sdata->local; + u16 capab_info, status_code, aid; + struct ieee802_11_elems elems; + u8 *pos; + + if (wk->type != IEEE80211_WORK_ASSOC) + return WORK_ACT_MISMATCH; + + /* + * AssocResp and ReassocResp have identical structure, so process both + * of them in this function. + */ + + if (len < 24 + 6) + return WORK_ACT_NONE; + + capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); + status_code = le16_to_cpu(mgmt->u.assoc_resp.status_code); + aid = le16_to_cpu(mgmt->u.assoc_resp.aid); + + printk(KERN_DEBUG "%s: RX %sssocResp from %pM (capab=0x%x " + "status=%d aid=%d)\n", + sdata->name, reassoc ? "Rea" : "A", mgmt->sa, + capab_info, status_code, (u16)(aid & ~(BIT(15) | BIT(14)))); + + pos = mgmt->u.assoc_resp.variable; + ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), &elems); + + if (status_code == WLAN_STATUS_ASSOC_REJECTED_TEMPORARILY && + elems.timeout_int && elems.timeout_int_len == 5 && + elems.timeout_int[0] == WLAN_TIMEOUT_ASSOC_COMEBACK) { + u32 tu, ms; + tu = get_unaligned_le32(elems.timeout_int + 1); + ms = tu * 1024 / 1000; + printk(KERN_DEBUG "%s: %pM rejected association temporarily; " + "comeback duration %u TU (%u ms)\n", + sdata->name, mgmt->sa, tu, ms); + wk->timeout = jiffies + msecs_to_jiffies(ms); + if (ms > IEEE80211_ASSOC_TIMEOUT) + run_again(local, wk->timeout); + return WORK_ACT_NONE; + } + + if (status_code != WLAN_STATUS_SUCCESS) + printk(KERN_DEBUG "%s: %pM denied association (code=%d)\n", + sdata->name, mgmt->sa, status_code); + else + printk(KERN_DEBUG "%s: associated\n", sdata->name); + + return WORK_ACT_DONE; +} + +static enum work_action __must_check +ieee80211_rx_mgmt_probe_resp(struct ieee80211_work *wk, + struct ieee80211_mgmt *mgmt, size_t len, + struct ieee80211_rx_status *rx_status) +{ + struct ieee80211_sub_if_data *sdata = wk->sdata; + struct ieee80211_local *local = sdata->local; + size_t baselen; + + ASSERT_WORK_MTX(local); + + if (wk->type != IEEE80211_WORK_DIRECT_PROBE) + return WORK_ACT_MISMATCH; + + if (len < 24 + 12) + return WORK_ACT_NONE; + + baselen = (u8 *) mgmt->u.probe_resp.variable - (u8 *) mgmt; + if (baselen > len) + return WORK_ACT_NONE; + + printk(KERN_DEBUG "%s: direct probe responded\n", sdata->name); + return WORK_ACT_DONE; +} + +static enum work_action __must_check +ieee80211_rx_mgmt_beacon(struct ieee80211_work *wk, + struct ieee80211_mgmt *mgmt, size_t len) +{ + struct ieee80211_sub_if_data *sdata = wk->sdata; + struct ieee80211_local *local = sdata->local; + + ASSERT_WORK_MTX(local); + + if (wk->type != IEEE80211_WORK_ASSOC_BEACON_WAIT) + return WORK_ACT_MISMATCH; + + if (len < 24 + 12) + return WORK_ACT_NONE; + + printk(KERN_DEBUG "%s: beacon received\n", sdata->name); + return WORK_ACT_DONE; +} + +static void ieee80211_work_rx_queued_mgmt(struct ieee80211_local *local, + struct sk_buff *skb) +{ + struct ieee80211_rx_status *rx_status; + struct ieee80211_mgmt *mgmt; + struct ieee80211_work *wk; + enum work_action rma = WORK_ACT_NONE; + u16 fc; + + rx_status = (struct ieee80211_rx_status *) skb->cb; + mgmt = (struct ieee80211_mgmt *) skb->data; + fc = le16_to_cpu(mgmt->frame_control); + + mutex_lock(&local->mtx); + + list_for_each_entry(wk, &local->work_list, list) { + const u8 *bssid = NULL; + + switch (wk->type) { + case IEEE80211_WORK_DIRECT_PROBE: + case IEEE80211_WORK_AUTH: + case 
IEEE80211_WORK_ASSOC: + case IEEE80211_WORK_ASSOC_BEACON_WAIT: + bssid = wk->filter_ta; + break; + default: + continue; + } + + /* + * Before queuing, we already verified mgmt->sa, + * so this is needed just for matching. + */ + if (compare_ether_addr(bssid, mgmt->bssid)) + continue; + + switch (fc & IEEE80211_FCTL_STYPE) { + case IEEE80211_STYPE_BEACON: + rma = ieee80211_rx_mgmt_beacon(wk, mgmt, skb->len); + break; + case IEEE80211_STYPE_PROBE_RESP: + rma = ieee80211_rx_mgmt_probe_resp(wk, mgmt, skb->len, + rx_status); + break; + case IEEE80211_STYPE_AUTH: + rma = ieee80211_rx_mgmt_auth(wk, mgmt, skb->len); + break; + case IEEE80211_STYPE_ASSOC_RESP: + rma = ieee80211_rx_mgmt_assoc_resp(wk, mgmt, + skb->len, false); + break; + case IEEE80211_STYPE_REASSOC_RESP: + rma = ieee80211_rx_mgmt_assoc_resp(wk, mgmt, + skb->len, true); + break; + default: + WARN_ON(1); + rma = WORK_ACT_NONE; + } + + /* + * We've either received an unexpected frame, or we have + * multiple work items and need to match the frame to the + * right one. + */ + if (rma == WORK_ACT_MISMATCH) + continue; + + /* + * We've processed this frame for that work, so it can't + * belong to another work struct. + * NB: this is also required for correctness for 'rma'! + */ + break; + } + + switch (rma) { + case WORK_ACT_MISMATCH: + /* ignore this unmatched frame */ + break; + case WORK_ACT_NONE: + break; + case WORK_ACT_DONE: + list_del_rcu(&wk->list); + break; + default: + WARN(1, "unexpected: %d", rma); + } + + mutex_unlock(&local->mtx); + + if (rma != WORK_ACT_DONE) + goto out; + + switch (wk->done(wk, skb)) { + case WORK_DONE_DESTROY: + free_work(wk); + break; + case WORK_DONE_REQUEUE: + synchronize_rcu(); + wk->started = false; /* restart */ + mutex_lock(&local->mtx); + list_add_tail(&wk->list, &local->work_list); + mutex_unlock(&local->mtx); + } + + out: + kfree_skb(skb); +} + +static bool ieee80211_work_ct_coexists(enum nl80211_channel_type wk_ct, + enum nl80211_channel_type oper_ct) +{ + switch (wk_ct) { + case NL80211_CHAN_NO_HT: + return true; + case NL80211_CHAN_HT20: + if (oper_ct != NL80211_CHAN_NO_HT) + return true; + return false; + case NL80211_CHAN_HT40MINUS: + case NL80211_CHAN_HT40PLUS: + return (wk_ct == oper_ct); + } + WARN_ON(1); /* shouldn't get here */ + return false; +} + +static enum nl80211_channel_type +ieee80211_calc_ct(enum nl80211_channel_type wk_ct, + enum nl80211_channel_type oper_ct) +{ + switch (wk_ct) { + case NL80211_CHAN_NO_HT: + return oper_ct; + case NL80211_CHAN_HT20: + if (oper_ct != NL80211_CHAN_NO_HT) + return oper_ct; + return wk_ct; + case NL80211_CHAN_HT40MINUS: + case NL80211_CHAN_HT40PLUS: + return wk_ct; + } + WARN_ON(1); /* shouldn't get here */ + return wk_ct; +} + + +static void ieee80211_work_timer(unsigned long data) +{ + struct ieee80211_local *local = (void *) data; + + if (local->quiescing) + return; + + ieee80211_queue_work(&local->hw, &local->work_work); +} + +static void ieee80211_work_work(struct work_struct *work) +{ + struct ieee80211_local *local = + container_of(work, struct ieee80211_local, work_work); + struct sk_buff *skb; + struct ieee80211_work *wk, *tmp; + LIST_HEAD(free_work); + enum work_action rma; + bool remain_off_channel = false; + + if (local->scanning) + return; + + /* + * ieee80211_queue_work() should have picked up most cases, + * here we'll pick the rest. 
+ */ + if (WARN(local->suspended, "work scheduled while going to suspend\n")) + return; + + /* first process frames to avoid timing out while a frame is pending */ + while ((skb = skb_dequeue(&local->work_skb_queue))) + ieee80211_work_rx_queued_mgmt(local, skb); + + mutex_lock(&local->mtx); + + ieee80211_recalc_idle(local); + + list_for_each_entry_safe(wk, tmp, &local->work_list, list) { + bool started = wk->started; + + /* mark work as started if it's on the current off-channel */ + if (!started && local->tmp_channel && + wk->chan == local->tmp_channel && + wk->chan_type == local->tmp_channel_type) { + started = true; + wk->timeout = jiffies; + } + + if (!started && !local->tmp_channel) { + bool on_oper_chan; + bool tmp_chan_changed = false; + bool on_oper_chan2; + enum nl80211_channel_type wk_ct; + on_oper_chan = ieee80211_cfg_on_oper_channel(local); + + /* Work with existing channel type if possible. */ + wk_ct = wk->chan_type; + if (wk->chan == local->hw.conf.channel) + wk_ct = ieee80211_calc_ct(wk->chan_type, + local->hw.conf.channel_type); + + if (local->tmp_channel) + if ((local->tmp_channel != wk->chan) || + (local->tmp_channel_type != wk_ct)) + tmp_chan_changed = true; + + local->tmp_channel = wk->chan; + local->tmp_channel_type = wk_ct; + /* + * Leave the station vifs in awake mode if they + * happen to be on the same channel as + * the requested channel. + */ + on_oper_chan2 = ieee80211_cfg_on_oper_channel(local); + if (on_oper_chan != on_oper_chan2) { + if (on_oper_chan2) { + /* going off oper channel, PS too */ + ieee80211_offchannel_stop_vifs(local, + true); + ieee80211_hw_config(local, 0); + } else { + /* going on channel, but leave PS + * off-channel. */ + ieee80211_hw_config(local, 0); + ieee80211_offchannel_return(local, + true, + false); + } + } else if (tmp_chan_changed) + /* Still off-channel, but on some other + * channel, so update hardware. + * PS should already be off-channel. + */ + ieee80211_hw_config(local, 0); + + started = true; + wk->timeout = jiffies; + } + + /* don't try to work with items that aren't started */ + if (!started) + continue; + + if (time_is_after_jiffies(wk->timeout)) { + /* + * This work item isn't supposed to be worked on + * right now, but take care to adjust the timer + * properly. 
+ */ + run_again(local, wk->timeout); + continue; + } + + switch (wk->type) { + default: + WARN_ON(1); + /* nothing */ + rma = WORK_ACT_NONE; + break; + case IEEE80211_WORK_ABORT: + rma = WORK_ACT_TIMEOUT; + break; + case IEEE80211_WORK_DIRECT_PROBE: + rma = ieee80211_direct_probe(wk); + break; + case IEEE80211_WORK_AUTH: + rma = ieee80211_authenticate(wk); + break; + case IEEE80211_WORK_ASSOC: + rma = ieee80211_associate(wk); + break; + case IEEE80211_WORK_REMAIN_ON_CHANNEL: + rma = ieee80211_remain_on_channel_timeout(wk); + break; + case IEEE80211_WORK_OFFCHANNEL_TX: + rma = ieee80211_offchannel_tx(wk); + break; + case IEEE80211_WORK_ASSOC_BEACON_WAIT: + rma = ieee80211_assoc_beacon_wait(wk); + break; + } + + wk->started = started; + + switch (rma) { + case WORK_ACT_NONE: + /* might have changed the timeout */ + run_again(local, wk->timeout); + break; + case WORK_ACT_TIMEOUT: + list_del_rcu(&wk->list); + synchronize_rcu(); + list_add(&wk->list, &free_work); + break; + default: + WARN(1, "unexpected: %d", rma); + } + } + + list_for_each_entry(wk, &local->work_list, list) { + if (!wk->started) + continue; + if (wk->chan != local->tmp_channel) + continue; + if (!ieee80211_work_ct_coexists(wk->chan_type, + local->tmp_channel_type)) + continue; + remain_off_channel = true; + } + + if (!remain_off_channel && local->tmp_channel) { + local->tmp_channel = NULL; + /* If tmp_channel wasn't operating channel, then + * we need to go back on-channel. + * NOTE: If we can ever be here while scannning, + * or if the hw_config() channel config logic changes, + * then we may need to do a more thorough check to see if + * we still need to do a hardware config. Currently, + * we cannot be here while scanning, however. + */ + if (!ieee80211_cfg_on_oper_channel(local)) + ieee80211_hw_config(local, 0); + + /* At the least, we need to disable offchannel_ps, + * so just go ahead and run the entire offchannel + * return logic here. We *could* skip enabling + * beaconing if we were already on-oper-channel + * as a future optimization. 
+ */ + ieee80211_offchannel_return(local, true, true); + + /* give connection some time to breathe */ + run_again(local, jiffies + HZ/2); + } + + if (list_empty(&local->work_list) && local->scan_req && + !local->scanning) + ieee80211_queue_delayed_work(&local->hw, + &local->scan_work, + round_jiffies_relative(0)); + + ieee80211_recalc_idle(local); + + mutex_unlock(&local->mtx); + + list_for_each_entry_safe(wk, tmp, &free_work, list) { + wk->done(wk, NULL); + list_del(&wk->list); + kfree(wk); + } +} + +void ieee80211_add_work(struct ieee80211_work *wk) +{ + struct ieee80211_local *local; + + if (WARN_ON(!wk->chan)) + return; + + if (WARN_ON(!wk->sdata)) + return; + + if (WARN_ON(!wk->done)) + return; + + if (WARN_ON(!ieee80211_sdata_running(wk->sdata))) + return; + + wk->started = false; + + local = wk->sdata->local; + mutex_lock(&local->mtx); + list_add_tail(&wk->list, &local->work_list); + mutex_unlock(&local->mtx); + + ieee80211_queue_work(&local->hw, &local->work_work); +} + +void ieee80211_work_init(struct ieee80211_local *local) +{ + INIT_LIST_HEAD(&local->work_list); + setup_timer(&local->work_timer, ieee80211_work_timer, + (unsigned long)local); + INIT_WORK(&local->work_work, ieee80211_work_work); + skb_queue_head_init(&local->work_skb_queue); +} + +void ieee80211_work_purge(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_work *wk; + bool cleanup = false; + + mutex_lock(&local->mtx); + list_for_each_entry(wk, &local->work_list, list) { + if (wk->sdata != sdata) + continue; + cleanup = true; + wk->type = IEEE80211_WORK_ABORT; + wk->started = true; + wk->timeout = jiffies; + } + mutex_unlock(&local->mtx); + + /* run cleanups etc. */ + if (cleanup) + ieee80211_work_work(&local->work_work); + + mutex_lock(&local->mtx); + list_for_each_entry(wk, &local->work_list, list) { + if (wk->sdata != sdata) + continue; + WARN_ON(1); + break; + } + mutex_unlock(&local->mtx); +} + +ieee80211_rx_result ieee80211_work_rx_mgmt(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_mgmt *mgmt; + struct ieee80211_work *wk; + u16 fc; + + if (skb->len < 24) + return RX_DROP_MONITOR; + + mgmt = (struct ieee80211_mgmt *) skb->data; + fc = le16_to_cpu(mgmt->frame_control); + + list_for_each_entry_rcu(wk, &local->work_list, list) { + if (sdata != wk->sdata) + continue; + if (compare_ether_addr(wk->filter_ta, mgmt->sa)) + continue; + if (compare_ether_addr(wk->filter_ta, mgmt->bssid)) + continue; + + switch (fc & IEEE80211_FCTL_STYPE) { + case IEEE80211_STYPE_AUTH: + case IEEE80211_STYPE_PROBE_RESP: + case IEEE80211_STYPE_ASSOC_RESP: + case IEEE80211_STYPE_REASSOC_RESP: + case IEEE80211_STYPE_BEACON: + skb_queue_tail(&local->work_skb_queue, skb); + ieee80211_queue_work(&local->hw, &local->work_work); + return RX_QUEUED; + } + } + + return RX_CONTINUE; +} + +static enum work_done_result ieee80211_remain_done(struct ieee80211_work *wk, + struct sk_buff *skb) +{ + /* + * We are done serving the remain-on-channel command. 
+ */ + cfg80211_remain_on_channel_expired(wk->sdata->dev, (unsigned long) wk, + wk->chan, wk->chan_type, + GFP_KERNEL); + + return WORK_DONE_DESTROY; +} + +int ieee80211_wk_remain_on_channel(struct ieee80211_sub_if_data *sdata, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + unsigned int duration, u64 *cookie) +{ + struct ieee80211_work *wk; + + wk = kzalloc(sizeof(*wk), GFP_KERNEL); + if (!wk) + return -ENOMEM; + + wk->type = IEEE80211_WORK_REMAIN_ON_CHANNEL; + wk->chan = chan; + wk->chan_type = channel_type; + wk->sdata = sdata; + wk->done = ieee80211_remain_done; + + wk->remain.duration = duration; + + *cookie = (unsigned long) wk; + + ieee80211_add_work(wk); + + return 0; +} + +int ieee80211_wk_cancel_remain_on_channel(struct ieee80211_sub_if_data *sdata, + u64 cookie) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_work *wk, *tmp; + bool found = false; + + mutex_lock(&local->mtx); + list_for_each_entry_safe(wk, tmp, &local->work_list, list) { + if ((unsigned long) wk == cookie) { + wk->timeout = jiffies; + found = true; + break; + } + } + mutex_unlock(&local->mtx); + + if (!found) + return -ENOENT; + + ieee80211_queue_work(&local->hw, &local->work_work); + + return 0; +} diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c new file mode 100644 index 00000000..aa1c40ab --- /dev/null +++ b/net/mac80211/wpa.c @@ -0,0 +1,630 @@ +/* + * Copyright 2002-2004, Instant802 Networks, Inc. + * Copyright 2008, Jouni Malinen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ieee80211_i.h" +#include "michael.h" +#include "tkip.h" +#include "aes_ccm.h" +#include "aes_cmac.h" +#include "wpa.h" + +ieee80211_tx_result +ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx) +{ + u8 *data, *key, *mic; + size_t data_len; + unsigned int hdrlen; + struct ieee80211_hdr *hdr; + struct sk_buff *skb = tx->skb; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + int tail; + + hdr = (struct ieee80211_hdr *)skb->data; + if (!tx->key || tx->key->conf.cipher != WLAN_CIPHER_SUITE_TKIP || + skb->len < 24 || !ieee80211_is_data_present(hdr->frame_control)) + return TX_CONTINUE; + + hdrlen = ieee80211_hdrlen(hdr->frame_control); + if (skb->len < hdrlen) + return TX_DROP; + + data = skb->data + hdrlen; + data_len = skb->len - hdrlen; + + if (unlikely(info->flags & IEEE80211_TX_INTFL_TKIP_MIC_FAILURE)) { + /* Need to use software crypto for the test */ + info->control.hw_key = NULL; + } + + if (info->control.hw_key && + !(tx->flags & IEEE80211_TX_FRAGMENTED) && + !(tx->key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC)) { + /* hwaccel - with no need for SW-generated MMIC */ + return TX_CONTINUE; + } + + tail = MICHAEL_MIC_LEN; + if (!info->control.hw_key) + tail += TKIP_ICV_LEN; + + if (WARN_ON(skb_tailroom(skb) < tail || + skb_headroom(skb) < TKIP_IV_LEN)) + return TX_DROP; + + key = &tx->key->conf.key[NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY]; + mic = skb_put(skb, MICHAEL_MIC_LEN); + michael_mic(key, hdr, data, data_len, mic); + if (unlikely(info->flags & IEEE80211_TX_INTFL_TKIP_MIC_FAILURE)) + mic[0]++; + + return TX_CONTINUE; +} + + +ieee80211_rx_result +ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx) +{ + u8 *data, *key = NULL; + size_t data_len; + unsigned int hdrlen; + u8 mic[MICHAEL_MIC_LEN]; + 
struct sk_buff *skb = rx->skb; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + int queue = rx->queue; + + /* otherwise, TKIP is vulnerable to TID 0 vs. non-QoS replays */ + if (rx->queue == NUM_RX_DATA_QUEUES - 1) + queue = 0; + + /* + * it makes no sense to check for MIC errors on anything other + * than data frames. + */ + if (!ieee80211_is_data_present(hdr->frame_control)) + return RX_CONTINUE; + + /* + * No way to verify the MIC if the hardware stripped it or + * the IV with the key index. In this case we have solely rely + * on the driver to set RX_FLAG_MMIC_ERROR in the event of a + * MIC failure report. + */ + if (status->flag & (RX_FLAG_MMIC_STRIPPED | RX_FLAG_IV_STRIPPED)) { + if (status->flag & RX_FLAG_MMIC_ERROR) + goto mic_fail; + + if (!(status->flag & RX_FLAG_IV_STRIPPED) && rx->key) + goto update_iv; + + return RX_CONTINUE; + } + + /* + * Some hardware seems to generate Michael MIC failure reports; even + * though, the frame was not encrypted with TKIP and therefore has no + * MIC. Ignore the flag them to avoid triggering countermeasures. + */ + if (!rx->key || rx->key->conf.cipher != WLAN_CIPHER_SUITE_TKIP || + !(status->flag & RX_FLAG_DECRYPTED)) + return RX_CONTINUE; + + if (rx->sdata->vif.type == NL80211_IFTYPE_AP && rx->key->conf.keyidx) { + /* + * APs with pairwise keys should never receive Michael MIC + * errors for non-zero keyidx because these are reserved for + * group keys and only the AP is sending real multicast + * frames in the BSS. ( + */ + return RX_DROP_UNUSABLE; + } + + if (status->flag & RX_FLAG_MMIC_ERROR) + goto mic_fail; + + hdrlen = ieee80211_hdrlen(hdr->frame_control); + if (skb->len < hdrlen + MICHAEL_MIC_LEN) + return RX_DROP_UNUSABLE; + + data = skb->data + hdrlen; + data_len = skb->len - hdrlen - MICHAEL_MIC_LEN; + key = &rx->key->conf.key[NL80211_TKIP_DATA_OFFSET_RX_MIC_KEY]; + michael_mic(key, hdr, data, data_len, mic); + if (memcmp(mic, data + data_len, MICHAEL_MIC_LEN) != 0) + goto mic_fail; + + /* remove Michael MIC from payload */ + skb_trim(skb, skb->len - MICHAEL_MIC_LEN); + +update_iv: + /* update IV in key information to be able to detect replays */ + rx->key->u.tkip.rx[queue].iv32 = rx->tkip_iv32; + rx->key->u.tkip.rx[queue].iv16 = rx->tkip_iv16; + + return RX_CONTINUE; + +mic_fail: + /* + * In some cases the key can be unset - e.g. a multicast packet, in + * a driver that supports HW encryption. Send up the key idx only if + * the key is set. + */ + mac80211_ev_michael_mic_failure(rx->sdata, + rx->key ? 
rx->key->conf.keyidx : -1, + (void *) skb->data, NULL, GFP_ATOMIC); + return RX_DROP_UNUSABLE; +} + + +static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_key *key = tx->key; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + unsigned int hdrlen; + int len, tail; + u8 *pos; + + if (info->control.hw_key && + !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { + /* hwaccel - with no need for software-generated IV */ + return 0; + } + + hdrlen = ieee80211_hdrlen(hdr->frame_control); + len = skb->len - hdrlen; + + if (info->control.hw_key) + tail = 0; + else + tail = TKIP_ICV_LEN; + + if (WARN_ON(skb_tailroom(skb) < tail || + skb_headroom(skb) < TKIP_IV_LEN)) + return -1; + + pos = skb_push(skb, TKIP_IV_LEN); + memmove(pos, pos + TKIP_IV_LEN, hdrlen); + pos += hdrlen; + + /* Increase IV for the frame */ + key->u.tkip.tx.iv16++; + if (key->u.tkip.tx.iv16 == 0) + key->u.tkip.tx.iv32++; + + pos = ieee80211_tkip_add_iv(pos, key, key->u.tkip.tx.iv16); + + /* hwaccel - with software IV */ + if (info->control.hw_key) + return 0; + + /* Add room for ICV */ + skb_put(skb, TKIP_ICV_LEN); + + hdr = (struct ieee80211_hdr *) skb->data; + return ieee80211_tkip_encrypt_data(tx->local->wep_tx_tfm, + key, pos, len, hdr->addr2); +} + + +ieee80211_tx_result +ieee80211_crypto_tkip_encrypt(struct ieee80211_tx_data *tx) +{ + struct sk_buff *skb = tx->skb; + + ieee80211_tx_set_protected(tx); + + do { + if (tkip_encrypt_skb(tx, skb) < 0) + return TX_DROP; + } while ((skb = skb->next)); + + return TX_CONTINUE; +} + + +ieee80211_rx_result +ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) rx->skb->data; + int hdrlen, res, hwaccel = 0; + struct ieee80211_key *key = rx->key; + struct sk_buff *skb = rx->skb; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + int queue = rx->queue; + + /* otherwise, TKIP is vulnerable to TID 0 vs. non-QoS replays */ + if (rx->queue == NUM_RX_DATA_QUEUES - 1) + queue = 0; + + hdrlen = ieee80211_hdrlen(hdr->frame_control); + + if (!ieee80211_is_data(hdr->frame_control)) + return RX_CONTINUE; + + if (!rx->sta || skb->len - hdrlen < 12) + return RX_DROP_UNUSABLE; + + /* + * Let TKIP code verify IV, but skip decryption. 
+ * In the case where hardware checks the IV as well, + * we don't even get here, see ieee80211_rx_h_decrypt() + */ + if (status->flag & RX_FLAG_DECRYPTED) + hwaccel = 1; + + res = ieee80211_tkip_decrypt_data(rx->local->wep_rx_tfm, + key, skb->data + hdrlen, + skb->len - hdrlen, rx->sta->sta.addr, + hdr->addr1, hwaccel, queue, + &rx->tkip_iv32, + &rx->tkip_iv16); + if (res != TKIP_DECRYPT_OK) + return RX_DROP_UNUSABLE; + + /* Trim ICV */ + skb_trim(skb, skb->len - TKIP_ICV_LEN); + + /* Remove IV */ + memmove(skb->data + TKIP_IV_LEN, skb->data, hdrlen); + skb_pull(skb, TKIP_IV_LEN); + + return RX_CONTINUE; +} + + +static void ccmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *scratch, + int encrypted) +{ + __le16 mask_fc; + int a4_included, mgmt; + u8 qos_tid; + u8 *b_0, *aad; + u16 data_len, len_a; + unsigned int hdrlen; + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + + b_0 = scratch + 3 * AES_BLOCK_LEN; + aad = scratch + 4 * AES_BLOCK_LEN; + + /* + * Mask FC: zero subtype b4 b5 b6 (if not mgmt) + * Retry, PwrMgt, MoreData; set Protected + */ + mgmt = ieee80211_is_mgmt(hdr->frame_control); + mask_fc = hdr->frame_control; + mask_fc &= ~cpu_to_le16(IEEE80211_FCTL_RETRY | + IEEE80211_FCTL_PM | IEEE80211_FCTL_MOREDATA); + if (!mgmt) + mask_fc &= ~cpu_to_le16(0x0070); + mask_fc |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); + + hdrlen = ieee80211_hdrlen(hdr->frame_control); + len_a = hdrlen - 2; + a4_included = ieee80211_has_a4(hdr->frame_control); + + if (ieee80211_is_data_qos(hdr->frame_control)) + qos_tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK; + else + qos_tid = 0; + + data_len = skb->len - hdrlen - CCMP_HDR_LEN; + if (encrypted) + data_len -= CCMP_MIC_LEN; + + /* First block, b_0 */ + b_0[0] = 0x59; /* flags: Adata: 1, M: 011, L: 001 */ + /* Nonce: Nonce Flags | A2 | PN + * Nonce Flags: Priority (b0..b3) | Management (b4) | Reserved (b5..b7) + */ + b_0[1] = qos_tid | (mgmt << 4); + memcpy(&b_0[2], hdr->addr2, ETH_ALEN); + memcpy(&b_0[8], pn, CCMP_PN_LEN); + /* l(m) */ + put_unaligned_be16(data_len, &b_0[14]); + + /* AAD (extra authenticate-only data) / masked 802.11 header + * FC | A1 | A2 | A3 | SC | [A4] | [QC] */ + put_unaligned_be16(len_a, &aad[0]); + put_unaligned(mask_fc, (__le16 *)&aad[2]); + memcpy(&aad[4], &hdr->addr1, 3 * ETH_ALEN); + + /* Mask Seq#, leave Frag# */ + aad[22] = *((u8 *) &hdr->seq_ctrl) & 0x0f; + aad[23] = 0; + + if (a4_included) { + memcpy(&aad[24], hdr->addr4, ETH_ALEN); + aad[30] = qos_tid; + aad[31] = 0; + } else { + memset(&aad[24], 0, ETH_ALEN + IEEE80211_QOS_CTL_LEN); + aad[24] = qos_tid; + } +} + + +static inline void ccmp_pn2hdr(u8 *hdr, u8 *pn, int key_id) +{ + hdr[0] = pn[5]; + hdr[1] = pn[4]; + hdr[2] = 0; + hdr[3] = 0x20 | (key_id << 6); + hdr[4] = pn[3]; + hdr[5] = pn[2]; + hdr[6] = pn[1]; + hdr[7] = pn[0]; +} + + +static inline void ccmp_hdr2pn(u8 *pn, u8 *hdr) +{ + pn[0] = hdr[7]; + pn[1] = hdr[6]; + pn[2] = hdr[5]; + pn[3] = hdr[4]; + pn[4] = hdr[1]; + pn[5] = hdr[0]; +} + + +static int ccmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_key *key = tx->key; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + int hdrlen, len, tail; + u8 *pos, *pn; + int i; + + if (info->control.hw_key && + !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { + /* + * hwaccel has no need for preallocated room for CCMP + * header or MIC fields + */ + return 0; + } + + hdrlen = 
ieee80211_hdrlen(hdr->frame_control); + len = skb->len - hdrlen; + + if (info->control.hw_key) + tail = 0; + else + tail = CCMP_MIC_LEN; + + if (WARN_ON(skb_tailroom(skb) < tail || + skb_headroom(skb) < CCMP_HDR_LEN)) + return -1; + + pos = skb_push(skb, CCMP_HDR_LEN); + memmove(pos, pos + CCMP_HDR_LEN, hdrlen); + hdr = (struct ieee80211_hdr *) pos; + pos += hdrlen; + + /* PN = PN + 1 */ + pn = key->u.ccmp.tx_pn; + + for (i = CCMP_PN_LEN - 1; i >= 0; i--) { + pn[i]++; + if (pn[i]) + break; + } + + ccmp_pn2hdr(pos, pn, key->conf.keyidx); + + /* hwaccel - with software CCMP header */ + if (info->control.hw_key) + return 0; + + pos += CCMP_HDR_LEN; + ccmp_special_blocks(skb, pn, key->u.ccmp.tx_crypto_buf, 0); + ieee80211_aes_ccm_encrypt(key->u.ccmp.tfm, key->u.ccmp.tx_crypto_buf, pos, len, + pos, skb_put(skb, CCMP_MIC_LEN)); + + return 0; +} + + +ieee80211_tx_result +ieee80211_crypto_ccmp_encrypt(struct ieee80211_tx_data *tx) +{ + struct sk_buff *skb = tx->skb; + + ieee80211_tx_set_protected(tx); + + do { + if (ccmp_encrypt_skb(tx, skb) < 0) + return TX_DROP; + } while ((skb = skb->next)); + + return TX_CONTINUE; +} + + +ieee80211_rx_result +ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data; + int hdrlen; + struct ieee80211_key *key = rx->key; + struct sk_buff *skb = rx->skb; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + u8 pn[CCMP_PN_LEN]; + int data_len; + int queue; + + hdrlen = ieee80211_hdrlen(hdr->frame_control); + + if (!ieee80211_is_data(hdr->frame_control) && + !ieee80211_is_robust_mgmt_frame(hdr)) + return RX_CONTINUE; + + data_len = skb->len - hdrlen - CCMP_HDR_LEN - CCMP_MIC_LEN; + if (!rx->sta || data_len < 0) + return RX_DROP_UNUSABLE; + + ccmp_hdr2pn(pn, skb->data + hdrlen); + + queue = ieee80211_is_mgmt(hdr->frame_control) ? 
+ NUM_RX_DATA_QUEUES : rx->queue; + + if (memcmp(pn, key->u.ccmp.rx_pn[queue], CCMP_PN_LEN) <= 0) { + key->u.ccmp.replays++; + return RX_DROP_UNUSABLE; + } + + if (!(status->flag & RX_FLAG_DECRYPTED)) { + /* hardware didn't decrypt/verify MIC */ + ccmp_special_blocks(skb, pn, key->u.ccmp.rx_crypto_buf, 1); + + if (ieee80211_aes_ccm_decrypt( + key->u.ccmp.tfm, key->u.ccmp.rx_crypto_buf, + skb->data + hdrlen + CCMP_HDR_LEN, data_len, + skb->data + skb->len - CCMP_MIC_LEN, + skb->data + hdrlen + CCMP_HDR_LEN)) + return RX_DROP_UNUSABLE; + } + + memcpy(key->u.ccmp.rx_pn[queue], pn, CCMP_PN_LEN); + + /* Remove CCMP header and MIC */ + skb_trim(skb, skb->len - CCMP_MIC_LEN); + memmove(skb->data + CCMP_HDR_LEN, skb->data, hdrlen); + skb_pull(skb, CCMP_HDR_LEN); + + return RX_CONTINUE; +} + + +static void bip_aad(struct sk_buff *skb, u8 *aad) +{ + /* BIP AAD: FC(masked) || A1 || A2 || A3 */ + + /* FC type/subtype */ + aad[0] = skb->data[0]; + /* Mask FC Retry, PwrMgt, MoreData flags to zero */ + aad[1] = skb->data[1] & ~(BIT(4) | BIT(5) | BIT(6)); + /* A1 || A2 || A3 */ + memcpy(aad + 2, skb->data + 4, 3 * ETH_ALEN); +} + + +static inline void bip_ipn_swap(u8 *d, const u8 *s) +{ + *d++ = s[5]; + *d++ = s[4]; + *d++ = s[3]; + *d++ = s[2]; + *d++ = s[1]; + *d = s[0]; +} + + +ieee80211_tx_result +ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx) +{ + struct sk_buff *skb = tx->skb; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_key *key = tx->key; + struct ieee80211_mmie *mmie; + u8 *pn, aad[20]; + int i; + + if (info->control.hw_key) + return 0; + + if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie))) + return TX_DROP; + + mmie = (struct ieee80211_mmie *) skb_put(skb, sizeof(*mmie)); + mmie->element_id = WLAN_EID_MMIE; + mmie->length = sizeof(*mmie) - 2; + mmie->key_id = cpu_to_le16(key->conf.keyidx); + + /* PN = PN + 1 */ + pn = key->u.aes_cmac.tx_pn; + + for (i = sizeof(key->u.aes_cmac.tx_pn) - 1; i >= 0; i--) { + pn[i]++; + if (pn[i]) + break; + } + bip_ipn_swap(mmie->sequence_number, pn); + + bip_aad(skb, aad); + + /* + * MIC = AES-128-CMAC(IGTK, AAD || Management Frame Body || MMIE, 64) + */ + ieee80211_aes_cmac(key->u.aes_cmac.tfm, key->u.aes_cmac.tx_crypto_buf, + aad, skb->data + 24, skb->len - 24, mmie->mic); + + return TX_CONTINUE; +} + + +ieee80211_rx_result +ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx) +{ + struct sk_buff *skb = rx->skb; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + struct ieee80211_key *key = rx->key; + struct ieee80211_mmie *mmie; + u8 aad[20], mic[8], ipn[6]; + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + + if (!ieee80211_is_mgmt(hdr->frame_control)) + return RX_CONTINUE; + + if (skb->len < 24 + sizeof(*mmie)) + return RX_DROP_UNUSABLE; + + mmie = (struct ieee80211_mmie *) + (skb->data + skb->len - sizeof(*mmie)); + if (mmie->element_id != WLAN_EID_MMIE || + mmie->length != sizeof(*mmie) - 2) + return RX_DROP_UNUSABLE; /* Invalid MMIE */ + + bip_ipn_swap(ipn, mmie->sequence_number); + + if (memcmp(ipn, key->u.aes_cmac.rx_pn, 6) <= 0) { + key->u.aes_cmac.replays++; + return RX_DROP_UNUSABLE; + } + + if (!(status->flag & RX_FLAG_DECRYPTED)) { + /* hardware didn't decrypt/verify MIC */ + bip_aad(skb, aad); + ieee80211_aes_cmac(key->u.aes_cmac.tfm, + key->u.aes_cmac.rx_crypto_buf, aad, + skb->data + 24, skb->len - 24, mic); + if (memcmp(mic, mmie->mic, sizeof(mmie->mic)) != 0) { + key->u.aes_cmac.icverrors++; + return RX_DROP_UNUSABLE; + } + } + + memcpy(key->u.aes_cmac.rx_pn, 
ipn, 6); + + /* Remove MMIE */ + skb_trim(skb, skb->len - sizeof(*mmie)); + + return RX_CONTINUE; +} diff --git a/net/mac80211/wpa.h b/net/mac80211/wpa.h new file mode 100644 index 00000000..baba0608 --- /dev/null +++ b/net/mac80211/wpa.h @@ -0,0 +1,36 @@ +/* + * Copyright 2002-2004, Instant802 Networks, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef WPA_H +#define WPA_H + +#include +#include +#include "ieee80211_i.h" + +ieee80211_tx_result +ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx); +ieee80211_rx_result +ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx); + +ieee80211_tx_result +ieee80211_crypto_tkip_encrypt(struct ieee80211_tx_data *tx); +ieee80211_rx_result +ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx); + +ieee80211_tx_result +ieee80211_crypto_ccmp_encrypt(struct ieee80211_tx_data *tx); +ieee80211_rx_result +ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx); + +ieee80211_tx_result +ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx); +ieee80211_rx_result +ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx); + +#endif /* WPA_H */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig new file mode 100644 index 00000000..5bd5c612 --- /dev/null +++ b/net/netfilter/Kconfig @@ -0,0 +1,1130 @@ +menu "Core Netfilter Configuration" + depends on NET && INET && NETFILTER + +config NETFILTER_NETLINK + tristate + +config NETFILTER_NETLINK_QUEUE + tristate "Netfilter NFQUEUE over NFNETLINK interface" + depends on NETFILTER_ADVANCED + select NETFILTER_NETLINK + help + If this option is enabled, the kernel will include support + for queueing packets via NFNETLINK. + +config NETFILTER_NETLINK_LOG + tristate "Netfilter LOG over NFNETLINK interface" + default m if NETFILTER_ADVANCED=n + select NETFILTER_NETLINK + help + If this option is enabled, the kernel will include support + for logging packets via NFNETLINK. + + This obsoletes the existing ipt_ULOG and ebg_ulog mechanisms, + and is also scheduled to replace the old syslog-based ipt_LOG + and ip6t_LOG modules. + +config NF_CONNTRACK + tristate "Netfilter connection tracking support" + default m if NETFILTER_ADVANCED=n + help + Connection tracking keeps a record of what packets have passed + through your machine, in order to figure out how they are related + into connections. + + This is required to do Masquerading or other kinds of Network + Address Translation. It can also be used to enhance packet + filtering (see `Connection state match support' below). + + To compile it as a module, choose M here. If unsure, say N. + +if NF_CONNTRACK + +config NF_CONNTRACK_MARK + bool 'Connection mark tracking support' + depends on NETFILTER_ADVANCED + help + This option enables support for connection marks, used by the + `CONNMARK' target and `connmark' match. Similar to the mark value + of packets, but this mark value is kept in the conntrack session + instead of the individual packets. + +config NF_CONNTRACK_SECMARK + bool 'Connection tracking security mark support' + depends on NETWORK_SECMARK + default m if NETFILTER_ADVANCED=n + help + This option enables security markings to be applied to + connections. 
Typically they are copied to connections from + packets using the CONNSECMARK target and copied back from + connections to packets with the same target, with the packets + being originally labeled via SECMARK. + + If unsure, say 'N'. + +config NF_CONNTRACK_ZONES + bool 'Connection tracking zones' + depends on NETFILTER_ADVANCED + depends on NETFILTER_XT_TARGET_CT + help + This option enables support for connection tracking zones. + Normally, each connection needs to have a unique system wide + identity. Connection tracking zones allow to have multiple + connections using the same identity, as long as they are + contained in different zones. + + If unsure, say `N'. + +config NF_CONNTRACK_EVENTS + bool "Connection tracking events" + depends on NETFILTER_ADVANCED + help + If this option is enabled, the connection tracking code will + provide a notifier chain that can be used by other kernel code + to get notified about changes in the connection tracking state. + + If unsure, say `N'. + +config NF_CONNTRACK_TIMESTAMP + bool 'Connection tracking timestamping' + depends on NETFILTER_ADVANCED + help + This option enables support for connection tracking timestamping. + This allows you to store the flow start-time and to obtain + the flow-stop time (once it has been destroyed) via Connection + tracking events. + + If unsure, say `N'. + +config NF_CT_PROTO_DCCP + tristate 'DCCP protocol connection tracking support (EXPERIMENTAL)' + depends on EXPERIMENTAL + depends on NETFILTER_ADVANCED + default IP_DCCP + help + With this option enabled, the layer 3 independent connection + tracking code will be able to do state tracking on DCCP connections. + + If unsure, say 'N'. + +config NF_CT_PROTO_GRE + tristate + +config NF_CT_PROTO_SCTP + tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' + depends on EXPERIMENTAL + depends on NETFILTER_ADVANCED + default IP_SCTP + help + With this option enabled, the layer 3 independent connection + tracking code will be able to do state tracking on SCTP connections. + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + +config NF_CT_PROTO_UDPLITE + tristate 'UDP-Lite protocol connection tracking support' + depends on NETFILTER_ADVANCED + help + With this option enabled, the layer 3 independent connection + tracking code will be able to do state tracking on UDP-Lite + connections. + + To compile it as a module, choose M here. If unsure, say N. + +config NF_CONNTRACK_AMANDA + tristate "Amanda backup protocol support" + depends on NETFILTER_ADVANCED + select TEXTSEARCH + select TEXTSEARCH_KMP + help + If you are running the Amanda backup package + on this machine or machines that will be MASQUERADED through this + machine, then you may want to enable this feature. This allows the + connection tracking and natting code to allow the sub-channels that + Amanda requires for communication of the backup data, messages and + index. + + To compile it as a module, choose M here. If unsure, say N. + +config NF_CONNTRACK_FTP + tristate "FTP protocol support" + default m if NETFILTER_ADVANCED=n + help + Tracking FTP connections is problematic: special helpers are + required for tracking them, and doing masquerading and other forms + of Network Address Translation on them. + + This is FTP support on Layer 3 independent connection tracking. + Layer 3 independent connection tracking is experimental scheme + which generalize ip_conntrack to support other layer 3 protocols. + + To compile it as a module, choose M here. If unsure, say N. 
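A minimal usage sketch for the FTP helper above, assuming the module name nf_conntrack_ftp and a FORWARD-chain firewall (the chain layout is illustrative only): the helper marks the data connections it expects as RELATED, so a conntrack state rule is what actually admits them.

    modprobe nf_conntrack_ftp
    iptables -A FORWARD -p tcp --dport 21 -m conntrack --ctstate NEW -j ACCEPT
    iptables -A FORWARD -m conntrack --ctstate ESTABLISHED,RELATED -j ACCEPT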
+ +config NF_CONNTRACK_H323 + tristate "H.323 protocol support" + depends on (IPV6 || IPV6=n) + depends on NETFILTER_ADVANCED + help + H.323 is a VoIP signalling protocol from ITU-T. As one of the most + important VoIP protocols, it is widely used by voice hardware and + software including voice gateways, IP phones, Netmeeting, OpenPhone, + Gnomemeeting, etc. + + With this module you can support H.323 on a connection tracking/NAT + firewall. + + This module supports RAS, Fast Start, H.245 Tunnelling, Call + Forwarding, RTP/RTCP and T.120 based audio, video, fax, chat, + whiteboard, file transfer, etc. For more information, please + visit http://nath323.sourceforge.net/. + + To compile it as a module, choose M here. If unsure, say N. + +config NF_CONNTRACK_IRC + tristate "IRC protocol support" + default m if NETFILTER_ADVANCED=n + help + There is a commonly-used extension to IRC called + Direct Client-to-Client Protocol (DCC). This enables users to send + files to each other, and also chat to each other without the need + of a server. DCC Sending is used anywhere you send files over IRC, + and DCC Chat is most commonly used by Eggdrop bots. If you are + using NAT, this extension will enable you to send files and initiate + chats. Note that you do NOT need this extension to get files or + have others initiate chats, or everything else in IRC. + + To compile it as a module, choose M here. If unsure, say N. + +config NF_CONNTRACK_BROADCAST + tristate + +config NF_CONNTRACK_NETBIOS_NS + tristate "NetBIOS name service protocol support" + depends on NETFILTER_ADVANCED + select NF_CONNTRACK_BROADCAST + help + NetBIOS name service requests are sent as broadcast messages from an + unprivileged port and responded to with unicast messages to the + same port. This make them hard to firewall properly because connection + tracking doesn't deal with broadcasts. This helper tracks locally + originating NetBIOS name service requests and the corresponding + responses. It relies on correct IP address configuration, specifically + netmask and broadcast address. When properly configured, the output + of "ip address show" should look similar to this: + + $ ip -4 address show eth0 + 4: eth0: mtu 1500 qdisc pfifo_fast qlen 1000 + inet 172.16.2.252/24 brd 172.16.2.255 scope global eth0 + + To compile it as a module, choose M here. If unsure, say N. + +config NF_CONNTRACK_SNMP + tristate "SNMP service protocol support" + depends on NETFILTER_ADVANCED + select NF_CONNTRACK_BROADCAST + help + SNMP service requests are sent as broadcast messages from an + unprivileged port and responded to with unicast messages to the + same port. This make them hard to firewall properly because connection + tracking doesn't deal with broadcasts. This helper tracks locally + originating SNMP service requests and the corresponding + responses. It relies on correct IP address configuration, specifically + netmask and broadcast address. + + To compile it as a module, choose M here. If unsure, say N. + +config NF_CONNTRACK_PPTP + tristate "PPtP protocol support" + depends on NETFILTER_ADVANCED + select NF_CT_PROTO_GRE + help + This module adds support for PPTP (Point to Point Tunnelling + Protocol, RFC2637) connection tracking and NAT. + + If you are running PPTP sessions over a stateful firewall or NAT + box, you may want to enable this feature. + + Please note that not all PPTP modes of operation are supported yet. 
+ Specifically these limitations exist: + - Blindly assumes that control connections are always established + in PNS->PAC direction. This is a violation of RFC2637. + - Only supports a single call within each session + + To compile it as a module, choose M here. If unsure, say N. + +config NF_CONNTRACK_SANE + tristate "SANE protocol support (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on NETFILTER_ADVANCED + help + SANE is a protocol for remote access to scanners as implemented + by the 'saned' daemon. Like FTP, it uses separate control and + data connections. + + With this module you can support SANE on a connection tracking + firewall. + + To compile it as a module, choose M here. If unsure, say N. + +config NF_CONNTRACK_SIP + tristate "SIP protocol support" + default m if NETFILTER_ADVANCED=n + help + SIP is an application-layer control protocol that can establish, + modify, and terminate multimedia sessions (conferences) such as + Internet telephony calls. With the ip_conntrack_sip and + the nf_nat_sip modules you can support the protocol on a connection + tracking/NATing firewall. + + To compile it as a module, choose M here. If unsure, say N. + +config NF_CONNTRACK_TFTP + tristate "TFTP protocol support" + depends on NETFILTER_ADVANCED + help + TFTP connection tracking helper, this is required depending + on how restrictive your ruleset is. + If you are using a tftp client behind -j SNAT or -j MASQUERADING + you will need this. + + To compile it as a module, choose M here. If unsure, say N. + +config NF_CT_NETLINK + tristate 'Connection tracking netlink interface' + select NETFILTER_NETLINK + default m if NETFILTER_ADVANCED=n + help + This option enables support for a netlink-based userspace interface + +endif # NF_CONNTRACK + +# transparent proxy support +config NETFILTER_TPROXY + tristate "Transparent proxying support (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on IP_NF_MANGLE + depends on NETFILTER_ADVANCED + help + This option enables transparent proxying support, that is, + support for handling non-locally bound IPv4 TCP and UDP sockets. + For it to work you will have to configure certain iptables rules + and use policy routing. For more information on how to set it up + see Documentation/networking/tproxy.txt. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XTABLES + tristate "Netfilter Xtables support (required for ip_tables)" + default m if NETFILTER_ADVANCED=n + help + This is required if you intend to use any of ip_tables, + ip6_tables or arp_tables. + +if NETFILTER_XTABLES + +comment "Xtables combined modules" + +config NETFILTER_XT_MARK + tristate 'nfmark target and match support' + default m if NETFILTER_ADVANCED=n + ---help--- + This option adds the "MARK" target and "mark" match. + + Netfilter mark matching allows you to match packets based on the + "nfmark" value in the packet. + The target allows you to create rules in the "mangle" table which alter + the netfilter mark (nfmark) field associated with the packet. + + Prior to routing, the nfmark can influence the routing method (see + "Use netfilter MARK value as routing key") and can also be used by + other subsystems to change their behavior. + +config NETFILTER_XT_CONNMARK + tristate 'ctmark target and match support' + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED + select NF_CONNTRACK_MARK + ---help--- + This option adds the "CONNMARK" target and "connmark" match. + + Netfilter allows you to store a mark value per connection (a.k.a. 
+ ctmark), similarly to the packet mark (nfmark). Using this + target and match, you can set and match on this mark. + +config NETFILTER_XT_SET + tristate 'set target and match support' + depends on IP_SET + depends on NETFILTER_ADVANCED + help + This option adds the "SET" target and "set" match. + + Using this target and match, you can add/delete and match + elements in the sets created by ipset(8). + + To compile it as a module, choose M here. If unsure, say N. + +# alphabetically ordered list of targets + +comment "Xtables targets" + +config NETFILTER_XT_TARGET_AUDIT + tristate "AUDIT target support" + depends on AUDIT + depends on NETFILTER_ADVANCED + ---help--- + This option adds a 'AUDIT' target, which can be used to create + audit records for packets dropped/accepted. + + To compileit as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_CHECKSUM + tristate "CHECKSUM target support" + depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on NETFILTER_ADVANCED + ---help--- + This option adds a `CHECKSUM' target, which can be used in the iptables mangle + table. + + You can use this target to compute and fill in the checksum in + a packet that lacks a checksum. This is particularly useful, + if you need to work around old applications such as dhcp clients, + that do not work well with checksum offloads, but don't want to disable + checksum offload in your device. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_CLASSIFY + tristate '"CLASSIFY" target support' + depends on NETFILTER_ADVANCED + help + This option adds a `CLASSIFY' target, which enables the user to set + the priority of a packet. Some qdiscs can use this value for + classification, among these are: + + atm, cbq, dsmark, pfifo_fast, htb, prio + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_CONNMARK + tristate '"CONNMARK" target support' + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED + select NETFILTER_XT_CONNMARK + ---help--- + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_CONNMARK (combined connmark/CONNMARK module). + +config NETFILTER_XT_TARGET_CONNSECMARK + tristate '"CONNSECMARK" target support' + depends on NF_CONNTRACK && NF_CONNTRACK_SECMARK + default m if NETFILTER_ADVANCED=n + help + The CONNSECMARK target copies security markings from packets + to connections, and restores security markings from connections + to packets (if the packets are not already marked). This would + normally be used in conjunction with the SECMARK target. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_CT + tristate '"CT" target support' + depends on NF_CONNTRACK + depends on IP_NF_RAW || IP6_NF_RAW + depends on NETFILTER_ADVANCED + help + This options adds a `CT' target, which allows to specify initial + connection tracking parameters like events to be delivered and + the helper to be used. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_DSCP + tristate '"DSCP" and "TOS" target support' + depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on NETFILTER_ADVANCED + help + This option adds a `DSCP' target, which allows you to manipulate + the IPv4/IPv6 header DSCP field (differentiated services codepoint). + + The DSCP field can have any value between 0x0 and 0x3f inclusive. 
+ + It also adds the "TOS" target, which allows you to create rules in + the "mangle" table which alter the Type Of Service field of an IPv4 + or the Priority field of an IPv6 packet, prior to routing. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_HL + tristate '"HL" hoplimit target support' + depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on NETFILTER_ADVANCED + ---help--- + This option adds the "HL" (for IPv6) and "TTL" (for IPv4) + targets, which enable the user to change the + hoplimit/time-to-live value of the IP header. + + While it is safe to decrement the hoplimit/TTL value, the + modules also allow to increment and set the hoplimit value of + the header to arbitrary values. This is EXTREMELY DANGEROUS + since you can easily create immortal packets that loop + forever on the network. + +config NETFILTER_XT_TARGET_IDLETIMER + tristate "IDLETIMER target support" + depends on NETFILTER_ADVANCED + help + + This option adds the `IDLETIMER' target. Each matching packet + resets the timer associated with label specified when the rule is + added. When the timer expires, it triggers a sysfs notification. + The remaining time for expiration can be read via sysfs. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_LED + tristate '"LED" target support' + depends on LEDS_CLASS && LEDS_TRIGGERS + depends on NETFILTER_ADVANCED + help + This option adds a `LED' target, which allows you to blink LEDs in + response to particular packets passing through your machine. + + This can be used to turn a spare LED into a network activity LED, + which only flashes in response to FTP transfers, for example. Or + you could have an LED which lights up for a minute or two every time + somebody connects to your machine via SSH. + + You will need support for the "led" class to make this work. + + To create an LED trigger for incoming SSH traffic: + iptables -A INPUT -p tcp --dport 22 -j LED --led-trigger-id ssh --led-delay 1000 + + Then attach the new trigger to an LED on your system: + echo netfilter-ssh > /sys/class/leds//trigger + + For more information on the LEDs available on your system, see + Documentation/leds-class.txt + +config NETFILTER_XT_TARGET_MARK + tristate '"MARK" target support' + depends on NETFILTER_ADVANCED + select NETFILTER_XT_MARK + ---help--- + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_MARK (combined mark/MARK module). + +config NETFILTER_XT_TARGET_NFLOG + tristate '"NFLOG" target support' + default m if NETFILTER_ADVANCED=n + select NETFILTER_NETLINK_LOG + help + This option enables the NFLOG target, which allows to LOG + messages through nfnetlink_log. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_NFQUEUE + tristate '"NFQUEUE" target Support' + depends on NETFILTER_ADVANCED + select NETFILTER_NETLINK_QUEUE + help + This target replaced the old obsolete QUEUE target. + + As opposed to QUEUE, it supports 65535 different queues, + not just one. + + To compile it as a module, choose M here. If unsure, say N. 
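A brief sketch of the NFQUEUE target above; the queue number is arbitrary, and a userspace program (for example one built on libnetfilter_queue) must be bound to that queue and return a verdict for every packet, otherwise the queued traffic stalls.

    iptables -A FORWARD -j NFQUEUE --queue-num 0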
+ +config NETFILTER_XT_TARGET_NOTRACK + tristate '"NOTRACK" target support' + depends on IP_NF_RAW || IP6_NF_RAW + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED + help + The NOTRACK target allows a select rule to specify + which packets *not* to enter the conntrack/NAT + subsystem with all the consequences (no ICMP error tracking, + no protocol helpers for the selected packets). + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + +config NETFILTER_XT_TARGET_RATEEST + tristate '"RATEEST" target support' + depends on NETFILTER_ADVANCED + help + This option adds a `RATEEST' target, which allows to measure + rates similar to TC estimators. The `rateest' match can be + used to match on the measured rates. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_TEE + tristate '"TEE" - packet cloning to alternate destination' + depends on NETFILTER_ADVANCED + depends on (IPV6 || IPV6=n) + depends on !NF_CONNTRACK || NF_CONNTRACK + ---help--- + This option adds a "TEE" target with which a packet can be cloned and + this clone be rerouted to another nexthop. + +config NETFILTER_XT_TARGET_TPROXY + tristate '"TPROXY" target support (EXPERIMENTAL)' + depends on EXPERIMENTAL + depends on NETFILTER_TPROXY + depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED + select NF_DEFRAG_IPV4 + select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES + help + This option adds a `TPROXY' target, which is somewhat similar to + REDIRECT. It can only be used in the mangle table and is useful + to redirect traffic to a transparent proxy. It does _not_ depend + on Netfilter connection tracking and NAT, unlike REDIRECT. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_TRACE + tristate '"TRACE" target support' + depends on IP_NF_RAW || IP6_NF_RAW + depends on NETFILTER_ADVANCED + help + The TRACE target allows you to mark packets so that the kernel + will log every rule which match the packets as those traverse + the tables, chains, rules. + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + +config NETFILTER_XT_TARGET_SECMARK + tristate '"SECMARK" target support' + depends on NETWORK_SECMARK + default m if NETFILTER_ADVANCED=n + help + The SECMARK target allows security marking of network + packets, for use with security subsystems. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_TCPMSS + tristate '"TCPMSS" target support' + depends on (IPV6 || IPV6=n) + default m if NETFILTER_ADVANCED=n + ---help--- + This option adds a `TCPMSS' target, which allows you to alter the + MSS value of TCP SYN packets, to control the maximum size for that + connection (usually limiting it to your outgoing interface's MTU + minus 40). + + This is used to overcome criminally braindead ISPs or servers which + block ICMP Fragmentation Needed packets. The symptoms of this + problem are that everything works fine from your Linux + firewall/router, but machines behind it can never exchange large + packets: + 1) Web browsers connect, then hang with no data received. + 2) Small mail works fine, but large emails hang. + 3) ssh works fine, but scp hangs after initial handshaking. + + Workaround: activate this option and add a rule to your firewall + configuration like: + + iptables -A FORWARD -p tcp --tcp-flags SYN,RST SYN \ + -j TCPMSS --clamp-mss-to-pmtu + + To compile it as a module, choose M here. If unsure, say N. 
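As a hedged illustration of the NOTRACK and TRACE targets above: both are used from the raw table, which is traversed before connection tracking; the DNS and HTTP ports below are arbitrary examples.

    iptables -t raw -A PREROUTING -p udp --dport 53 -j NOTRACK
    iptables -t raw -A PREROUTING -p tcp --dport 80 -j TRACE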
+ +config NETFILTER_XT_TARGET_TCPOPTSTRIP + tristate '"TCPOPTSTRIP" target support (EXPERIMENTAL)' + depends on EXPERIMENTAL + depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on NETFILTER_ADVANCED + help + This option adds a "TCPOPTSTRIP" target, which allows you to strip + TCP options from TCP packets. + +# alphabetically ordered list of matches + +comment "Xtables matches" + +config NETFILTER_XT_MATCH_ADDRTYPE + tristate '"addrtype" address type match support' + depends on NETFILTER_ADVANCED + ---help--- + This option allows you to match what routing thinks of an address, + eg. UNICAST, LOCAL, BROADCAST, ... + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + +config NETFILTER_XT_MATCH_CLUSTER + tristate '"cluster" match support' + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED + ---help--- + This option allows you to build work-load-sharing clusters of + network servers/stateful firewalls without having a dedicated + load-balancing router/server/switch. Basically, this match returns + true when the packet must be handled by this cluster node. Thus, + all nodes see all packets and this match decides which node handles + what packets. The work-load sharing algorithm is based on source + address hashing. + + If you say Y or M here, try `iptables -m cluster --help` for + more information. + +config NETFILTER_XT_MATCH_COMMENT + tristate '"comment" match support' + depends on NETFILTER_ADVANCED + help + This option adds a `comment' dummy-match, which allows you to put + comments in your iptables ruleset. + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + +config NETFILTER_XT_MATCH_CONNBYTES + tristate '"connbytes" per-connection counter match support' + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED + help + This option adds a `connbytes' match, which allows you to match the + number of bytes and/or packets for each direction within a connection. + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + +config NETFILTER_XT_MATCH_CONNLIMIT + tristate '"connlimit" match support"' + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED + ---help--- + This match allows you to match against the number of parallel + connections to a server per client IP address (or address block). + +config NETFILTER_XT_MATCH_CONNMARK + tristate '"connmark" connection mark match support' + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED + select NETFILTER_XT_CONNMARK + ---help--- + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_CONNMARK (combined connmark/CONNMARK module). + +config NETFILTER_XT_MATCH_CONNTRACK + tristate '"conntrack" connection tracking match support' + depends on NF_CONNTRACK + default m if NETFILTER_ADVANCED=n + help + This is a general conntrack match module, a superset of the state match. + + It allows matching on additional conntrack information, which is + useful in complex configurations, such as NAT gateways with multiple + internet links or tunnels. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_CPU + tristate '"cpu" match support' + depends on NETFILTER_ADVANCED + help + CPU matching allows you to match packets based on the CPU + currently handling the packet. + + To compile it as a module, choose M here. If unsure, say N. 
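A usage sketch for the cpu match above, following the common pattern of steering flows to per-CPU proxy instances (the surrounding RPS setup and the ports 8080/8081 are assumptions, not requirements):

    iptables -t nat -A PREROUTING -p tcp --dport 80 -m cpu --cpu 0 -j REDIRECT --to-ports 8080
    iptables -t nat -A PREROUTING -p tcp --dport 80 -m cpu --cpu 1 -j REDIRECT --to-ports 8081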
+ +config NETFILTER_XT_MATCH_DCCP + tristate '"dccp" protocol match support' + depends on NETFILTER_ADVANCED + default IP_DCCP + help + With this option enabled, you will be able to use the iptables + `dccp' match in order to match on DCCP source/destination ports + and DCCP flags. + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + +config NETFILTER_XT_MATCH_DEVGROUP + tristate '"devgroup" match support' + depends on NETFILTER_ADVANCED + help + This options adds a `devgroup' match, which allows to match on the + device group a network device is assigned to. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_DSCP + tristate '"dscp" and "tos" match support' + depends on NETFILTER_ADVANCED + help + This option adds a `DSCP' match, which allows you to match against + the IPv4/IPv6 header DSCP field (differentiated services codepoint). + + The DSCP field can have any value between 0x0 and 0x3f inclusive. + + It will also add a "tos" match, which allows you to match packets + based on the Type Of Service fields of the IPv4 packet (which share + the same bits as DSCP). + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_ESP + tristate '"esp" match support' + depends on NETFILTER_ADVANCED + help + This match extension allows you to match a range of SPIs + inside ESP header of IPSec packets. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_HASHLIMIT + tristate '"hashlimit" match support' + depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on NETFILTER_ADVANCED + help + This option adds a `hashlimit' match. + + As opposed to `limit', this match dynamically creates a hash table + of limit buckets, based on your selection of source/destination + addresses and/or ports. + + It enables you to express policies like `10kpps for any given + destination address' or `500pps from any given source address' + with a single rule. + +config NETFILTER_XT_MATCH_HELPER + tristate '"helper" match support' + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED + help + Helper matching allows you to match packets in dynamic connections + tracked by a conntrack-helper, ie. ip_conntrack_ftp + + To compile it as a module, choose M here. If unsure, say Y. + +config NETFILTER_XT_MATCH_HL + tristate '"hl" hoplimit/TTL match support' + depends on NETFILTER_ADVANCED + ---help--- + HL matching allows you to match packets based on the hoplimit + in the IPv6 header, or the time-to-live field in the IPv4 + header of the packet. + +config NETFILTER_XT_MATCH_IPRANGE + tristate '"iprange" address range match support' + depends on NETFILTER_ADVANCED + ---help--- + This option adds a "iprange" match, which allows you to match based on + an IP address range. (Normal iptables only matches on single addresses + with an optional mask.) + + If unsure, say M. + +config NETFILTER_XT_MATCH_IPVS + tristate '"ipvs" match support' + depends on IP_VS + depends on NETFILTER_ADVANCED + depends on NF_CONNTRACK + help + This option allows you to match against IPVS properties of a packet. + + If unsure, say N. + +config NETFILTER_XT_MATCH_LENGTH + tristate '"length" match support' + depends on NETFILTER_ADVANCED + help + This option allows you to match the length of a packet against a + specific value or range of values. + + To compile it as a module, choose M here. If unsure, say N. 
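For the hashlimit match above, a sketch of the "per source address" style of policy its help text describes; the rate, port and hash name are arbitrary:

    iptables -A INPUT -p udp --dport 53 -m hashlimit \
        --hashlimit-mode srcip --hashlimit-above 100/second \
        --hashlimit-name dns -j DROP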
+ +config NETFILTER_XT_MATCH_LIMIT + tristate '"limit" match support' + depends on NETFILTER_ADVANCED + help + limit matching allows you to control the rate at which a rule can be + matched: mainly useful in combination with the LOG target ("LOG + target support", below) and to avoid some Denial of Service attacks. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_MAC + tristate '"mac" address match support' + depends on NETFILTER_ADVANCED + help + MAC matching allows you to match packets based on the source + Ethernet address of the packet. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_MARK + tristate '"mark" match support' + depends on NETFILTER_ADVANCED + select NETFILTER_XT_MARK + ---help--- + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_MARK (combined mark/MARK module). + +config NETFILTER_XT_MATCH_MULTIPORT + tristate '"multiport" Multiple port match support' + depends on NETFILTER_ADVANCED + help + Multiport matching allows you to match TCP or UDP packets based on + a series of source or destination ports: normally a rule can only + match a single range of ports. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_OSF + tristate '"osf" Passive OS fingerprint match' + depends on NETFILTER_ADVANCED && NETFILTER_NETLINK + help + This option selects the Passive OS Fingerprinting match module + that allows to passively match the remote operating system by + analyzing incoming TCP SYN packets. + + Rules and loading software can be downloaded from + http://www.ioremap.net/projects/osf + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_OWNER + tristate '"owner" match support' + depends on NETFILTER_ADVANCED + ---help--- + Socket owner matching allows you to match locally-generated packets + based on who created the socket: the user or group. It is also + possible to check whether a socket actually exists. + + Conflicts with '"quota, tag, uid" match' + +config NETFILTER_XT_MATCH_POLICY + tristate 'IPsec "policy" match support' + depends on XFRM + default m if NETFILTER_ADVANCED=n + help + Policy matching allows you to match packets based on the + IPsec policy that was used during decapsulation/will + be used during encapsulation. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_PHYSDEV + tristate '"physdev" match support' + depends on BRIDGE && BRIDGE_NETFILTER + depends on NETFILTER_ADVANCED + help + Physdev packet matching matches against the physical bridge ports + the IP packet arrived on or will leave by. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_PKTTYPE + tristate '"pkttype" packet type match support' + depends on NETFILTER_ADVANCED + help + Packet type matching allows you to match a packet by + its "class", eg. BROADCAST, MULTICAST, ... + + Typical usage: + iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_QTAGUID + bool '"quota, tag, owner" match and stats support' + depends on NETFILTER_XT_MATCH_SOCKET + depends on NETFILTER_XT_MATCH_OWNER=n + help + This option replaces the `owner' match. In addition to matching + on uid, it keeps stats based on a tag assigned to a socket. + The full tag is comprised of a UID and an accounting tag. 
+ The tags are assignable to sockets from user space (e.g. a download + manager can assign the socket to another UID for accounting). + Stats and control are done via /proc/net/xt_qtaguid/. + It replaces owner as it takes the same arguments, but should + really be recognized by the iptables tool. + + If unsure, say `N'. + +config NETFILTER_XT_MATCH_QUOTA + tristate '"quota" match support' + depends on NETFILTER_ADVANCED + help + This option adds a `quota' match, which allows to match on a + byte counter. + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + +config NETFILTER_XT_MATCH_QUOTA2 + tristate '"quota2" match support' + depends on NETFILTER_ADVANCED + help + This option adds a `quota2' match, which allows to match on a + byte counter correctly and not per CPU. + It allows naming the quotas. + This is based on http://xtables-addons.git.sourceforge.net + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + +config NETFILTER_XT_MATCH_QUOTA2_LOG + bool '"quota2" Netfilter LOG support' + depends on NETFILTER_XT_MATCH_QUOTA2 + depends on IP_NF_TARGET_ULOG=n # not yes, not module, just no + default n + help + This option allows `quota2' to log ONCE when a quota limit + is passed. It logs via NETLINK using the NETLINK_NFLOG family. + It logs similarly to how ipt_ULOG would without data. + + If unsure, say `N'. + +config NETFILTER_XT_MATCH_RATEEST + tristate '"rateest" match support' + depends on NETFILTER_ADVANCED + select NETFILTER_XT_TARGET_RATEEST + help + This option adds a `rateest' match, which allows to match on the + rate estimated by the RATEEST target. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_REALM + tristate '"realm" match support' + depends on NETFILTER_ADVANCED + select IP_ROUTE_CLASSID + help + This option adds a `realm' match, which allows you to use the realm + key from the routing subsystem inside iptables. + + This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option + in tc world. + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + +config NETFILTER_XT_MATCH_RECENT + tristate '"recent" match support' + depends on NETFILTER_ADVANCED + ---help--- + This match is used for creating one or many lists of recently + used addresses and then matching against that/those list(s). + + Short options are available by using 'iptables -m recent -h' + Official Website: + +config NETFILTER_XT_MATCH_SCTP + tristate '"sctp" protocol match support (EXPERIMENTAL)' + depends on EXPERIMENTAL + depends on NETFILTER_ADVANCED + default IP_SCTP + help + With this option enabled, you will be able to use the + `sctp' match in order to match on SCTP source/destination ports + and SCTP chunk types. + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + +config NETFILTER_XT_MATCH_SOCKET + tristate '"socket" match support (EXPERIMENTAL)' + depends on EXPERIMENTAL + depends on NETFILTER_TPROXY + depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED + depends on !NF_CONNTRACK || NF_CONNTRACK + select NF_DEFRAG_IPV4 + select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES + help + This option adds a `socket' match, which can be used to match + packets for which a TCP or UDP socket lookup finds a valid socket. + It can be used in combination with the MARK target and policy + routing to implement full featured non-locally bound sockets. + + To compile it as a module, choose M here. If unsure, say N. 
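
The "recent" entry a few blocks above boils down to a table of source addresses with a last-seen timestamp that later rules can query. A small self-contained sketch of that bookkeeping follows, with hypothetical names and none of xt_recent's /proc interface or hit counting.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define SLOTS 64

struct seen {
        uint32_t addr;
        time_t   when;
        int      used;
};

static struct seen list[SLOTS];

/* Record addr as recently seen, reusing its slot or the oldest one. */
static void recent_set(uint32_t addr)
{
        int victim = 0;

        for (int i = 0; i < SLOTS; i++) {
                if (!list[i].used || list[i].addr == addr) {
                        victim = i;
                        break;
                }
                if (list[i].when < list[victim].when)
                        victim = i;
        }
        list[victim].addr = addr;
        list[victim].when = time(NULL);
        list[victim].used = 1;
}

/* Return 1 if addr was recorded within the last "seconds" seconds. */
static int recent_check(uint32_t addr, int seconds)
{
        time_t now = time(NULL);

        for (int i = 0; i < SLOTS; i++)
                if (list[i].used && list[i].addr == addr &&
                    now - list[i].when <= seconds)
                        return 1;
        return 0;
}

int main(void)
{
        recent_set(0xc0a80001);                         /* 192.168.0.1 */
        printf("seen recently: %d\n", recent_check(0xc0a80001, 60));
        printf("unknown addr : %d\n", recent_check(0x08080808, 60));
        return 0;
}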
+ +config NETFILTER_XT_MATCH_STATE + tristate '"state" match support' + depends on NF_CONNTRACK + default m if NETFILTER_ADVANCED=n + help + Connection state matching allows you to match packets based on their + relationship to a tracked connection (ie. previous packets). This + is a powerful tool for packet classification. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_STATISTIC + tristate '"statistic" match support' + depends on NETFILTER_ADVANCED + help + This option adds a `statistic' match, which allows you to match + on packets periodically or randomly with a given percentage. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_STRING + tristate '"string" match support' + depends on NETFILTER_ADVANCED + select TEXTSEARCH + select TEXTSEARCH_KMP + select TEXTSEARCH_BM + select TEXTSEARCH_FSM + help + This option adds a `string' match, which allows you to look for + pattern matchings in packets. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_TCPMSS + tristate '"tcpmss" match support' + depends on NETFILTER_ADVANCED + help + This option adds a `tcpmss' match, which allows you to examine the + MSS value of TCP SYN packets, which control the maximum packet size + for that connection. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_TIME + tristate '"time" match support' + depends on NETFILTER_ADVANCED + ---help--- + This option adds a "time" match, which allows you to match based on + the packet arrival time (at the machine which netfilter is running) + on) or departure time/date (for locally generated packets). + + If you say Y here, try `iptables -m time --help` for + more information. + + If you want to compile it as a module, say M here. + If unsure, say N. + +config NETFILTER_XT_MATCH_U32 + tristate '"u32" match support' + depends on NETFILTER_ADVANCED + ---help--- + u32 allows you to extract quantities of up to 4 bytes from a packet, + AND them with specified masks, shift them by specified amounts and + test whether the results are in any of a set of specified ranges. + The specification of what to extract is general enough to skip over + headers with lengths stored in the packet, as in IP or TCP header + lengths. + + Details and examples are in the kernel module source. 
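
The "u32" help text above describes a concrete evaluation step: load four bytes at an offset, AND them with a mask, shift, and test the result against a set of ranges. The sketch below implements just that step in userspace C; the names and the sample IPv4-header test are illustrative, not the xt_u32 module itself.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct u32_range { uint32_t min, max; };

static int u32_test(const uint8_t *pkt, size_t len,
                    size_t at, uint32_t mask, unsigned int rshift,
                    const struct u32_range *ranges, size_t nranges)
{
        uint32_t val;

        if (at + 4 > len)               /* the word must lie inside the packet */
                return 0;

        /* Network byte order: most significant byte first. */
        val = ((uint32_t)pkt[at] << 24) | ((uint32_t)pkt[at + 1] << 16) |
              ((uint32_t)pkt[at + 2] << 8) | pkt[at + 3];
        val = (val & mask) >> rshift;

        for (size_t i = 0; i < nranges; i++)
                if (val >= ranges[i].min && val <= ranges[i].max)
                        return 1;
        return 0;
}

int main(void)
{
        /* First 4 bytes of an IPv4 header: version/IHL, TOS, total length. */
        uint8_t hdr[20] = { 0x45, 0x00, 0x00, 0x54 };
        struct u32_range r = { 0, 0x5dc };      /* total length <= 1500 */

        /* Mask off the low 16 bits (total length) and test the range. */
        printf("matches: %d\n",
               u32_test(hdr, sizeof(hdr), 0, 0x0000ffff, 0, &r, 1));
        return 0;
}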
+ +endif # NETFILTER_XTABLES + +endmenu + +source "net/netfilter/ipset/Kconfig" + +source "net/netfilter/ipvs/Kconfig" diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile new file mode 100644 index 00000000..6d917176 --- /dev/null +++ b/net/netfilter/Makefile @@ -0,0 +1,117 @@ +netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o + +nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o +nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o +nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o + +obj-$(CONFIG_NETFILTER) = netfilter.o + +obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o +obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o +obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o + +# connection tracking +obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o + +# SCTP protocol connection tracking +obj-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o +obj-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o +obj-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o +obj-$(CONFIG_NF_CT_PROTO_UDPLITE) += nf_conntrack_proto_udplite.o + +# netlink interface for nf_conntrack +obj-$(CONFIG_NF_CT_NETLINK) += nf_conntrack_netlink.o + +# connection tracking helpers +nf_conntrack_h323-objs := nf_conntrack_h323_main.o nf_conntrack_h323_asn1.o + +obj-$(CONFIG_NF_CONNTRACK_AMANDA) += nf_conntrack_amanda.o +obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o +obj-$(CONFIG_NF_CONNTRACK_H323) += nf_conntrack_h323.o +obj-$(CONFIG_NF_CONNTRACK_IRC) += nf_conntrack_irc.o +obj-$(CONFIG_NF_CONNTRACK_BROADCAST) += nf_conntrack_broadcast.o +obj-$(CONFIG_NF_CONNTRACK_NETBIOS_NS) += nf_conntrack_netbios_ns.o +obj-$(CONFIG_NF_CONNTRACK_SNMP) += nf_conntrack_snmp.o +obj-$(CONFIG_NF_CONNTRACK_PPTP) += nf_conntrack_pptp.o +obj-$(CONFIG_NF_CONNTRACK_SANE) += nf_conntrack_sane.o +obj-$(CONFIG_NF_CONNTRACK_SIP) += nf_conntrack_sip.o +obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o + +# transparent proxy support +obj-$(CONFIG_NETFILTER_TPROXY) += nf_tproxy_core.o + +# generic X tables +obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o + +# combos +obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o +obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o +obj-$(CONFIG_NETFILTER_XT_SET) += xt_set.o + +# targets +obj-$(CONFIG_NETFILTER_XT_TARGET_AUDIT) += xt_AUDIT.o +obj-$(CONFIG_NETFILTER_XT_TARGET_CHECKSUM) += xt_CHECKSUM.o +obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o +obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o +obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o +obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o +obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o +obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o +obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o +obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_RATEEST) += xt_RATEEST.o +obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_TPROXY) += xt_TPROXY.o +obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o +obj-$(CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP) += xt_TCPOPTSTRIP.o +obj-$(CONFIG_NETFILTER_XT_TARGET_TEE) += xt_TEE.o +obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o 
+obj-$(CONFIG_NETFILTER_XT_TARGET_IDLETIMER) += xt_IDLETIMER.o + +# matches +obj-$(CONFIG_NETFILTER_XT_MATCH_ADDRTYPE) += xt_addrtype.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CLUSTER) += xt_cluster.o +obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLIMIT) += xt_connlimit.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) += xt_conntrack.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CPU) += xt_cpu.o +obj-$(CONFIG_NETFILTER_XT_MATCH_DCCP) += xt_dccp.o +obj-$(CONFIG_NETFILTER_XT_MATCH_DEVGROUP) += xt_devgroup.o +obj-$(CONFIG_NETFILTER_XT_MATCH_DSCP) += xt_dscp.o +obj-$(CONFIG_NETFILTER_XT_MATCH_ESP) += xt_esp.o +obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o +obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o +obj-$(CONFIG_NETFILTER_XT_MATCH_HL) += xt_hl.o +obj-$(CONFIG_NETFILTER_XT_MATCH_IPRANGE) += xt_iprange.o +obj-$(CONFIG_NETFILTER_XT_MATCH_IPVS) += xt_ipvs.o +obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o +obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o +obj-$(CONFIG_NETFILTER_XT_MATCH_MAC) += xt_mac.o +obj-$(CONFIG_NETFILTER_XT_MATCH_MULTIPORT) += xt_multiport.o +obj-$(CONFIG_NETFILTER_XT_MATCH_OSF) += xt_osf.o +obj-$(CONFIG_NETFILTER_XT_MATCH_OWNER) += xt_owner.o +obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o +obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o +obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o +obj-$(CONFIG_NETFILTER_XT_MATCH_QTAGUID) += xt_qtaguid_print.o xt_qtaguid.o +obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o +obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA2) += xt_quota2.o +obj-$(CONFIG_NETFILTER_XT_MATCH_RATEEST) += xt_rateest.o +obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o +obj-$(CONFIG_NETFILTER_XT_MATCH_RECENT) += xt_recent.o +obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o +obj-$(CONFIG_NETFILTER_XT_MATCH_SOCKET) += xt_socket.o +obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o +obj-$(CONFIG_NETFILTER_XT_MATCH_STATISTIC) += xt_statistic.o +obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o +obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o +obj-$(CONFIG_NETFILTER_XT_MATCH_TIME) += xt_time.o +obj-$(CONFIG_NETFILTER_XT_MATCH_U32) += xt_u32.o + +# ipset +obj-$(CONFIG_IP_SET) += ipset/ + +# IPVS +obj-$(CONFIG_IP_VS) += ipvs/ diff --git a/net/netfilter/core.c b/net/netfilter/core.c new file mode 100644 index 00000000..899b71c0 --- /dev/null +++ b/net/netfilter/core.c @@ -0,0 +1,291 @@ +/* netfilter.c: look after the filters for various protocols. + * Heavily influenced by the old firewall.c by David Bonn and Alan Cox. + * + * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any + * way. + * + * Rusty Russell (C)2000 -- This code is GPL. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nf_internals.h" + +static DEFINE_MUTEX(afinfo_mutex); + +const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly; +EXPORT_SYMBOL(nf_afinfo); + +int nf_register_afinfo(const struct nf_afinfo *afinfo) +{ + int err; + + err = mutex_lock_interruptible(&afinfo_mutex); + if (err < 0) + return err; + rcu_assign_pointer(nf_afinfo[afinfo->family], afinfo); + mutex_unlock(&afinfo_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(nf_register_afinfo); + +void nf_unregister_afinfo(const struct nf_afinfo *afinfo) +{ + mutex_lock(&afinfo_mutex); + rcu_assign_pointer(nf_afinfo[afinfo->family], NULL); + mutex_unlock(&afinfo_mutex); + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(nf_unregister_afinfo); + +struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly; +EXPORT_SYMBOL(nf_hooks); +static DEFINE_MUTEX(nf_hook_mutex); + +int nf_register_hook(struct nf_hook_ops *reg) +{ + struct nf_hook_ops *elem; + int err; + + err = mutex_lock_interruptible(&nf_hook_mutex); + if (err < 0) + return err; + list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) { + if (reg->priority < elem->priority) + break; + } + list_add_rcu(®->list, elem->list.prev); + mutex_unlock(&nf_hook_mutex); + return 0; +} +EXPORT_SYMBOL(nf_register_hook); + +void nf_unregister_hook(struct nf_hook_ops *reg) +{ + mutex_lock(&nf_hook_mutex); + list_del_rcu(®->list); + mutex_unlock(&nf_hook_mutex); + + synchronize_net(); +} +EXPORT_SYMBOL(nf_unregister_hook); + +int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n) +{ + unsigned int i; + int err = 0; + + for (i = 0; i < n; i++) { + err = nf_register_hook(®[i]); + if (err) + goto err; + } + return err; + +err: + if (i > 0) + nf_unregister_hooks(reg, i); + return err; +} +EXPORT_SYMBOL(nf_register_hooks); + +void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) +{ + while (n-- > 0) + nf_unregister_hook(®[n]); +} +EXPORT_SYMBOL(nf_unregister_hooks); + +unsigned int nf_iterate(struct list_head *head, + struct sk_buff *skb, + unsigned int hook, + const struct net_device *indev, + const struct net_device *outdev, + struct list_head **i, + int (*okfn)(struct sk_buff *), + int hook_thresh) +{ + unsigned int verdict; + + /* + * The caller must not block between calls to this + * function because of risk of continuing from deleted element. + */ + list_for_each_continue_rcu(*i, head) { + struct nf_hook_ops *elem = (struct nf_hook_ops *)*i; + + if (hook_thresh > elem->priority) + continue; + + /* Optimization: we don't need to hold module + reference here, since function can't sleep. --RR */ +repeat: + verdict = elem->hook(hook, skb, indev, outdev, okfn); + if (verdict != NF_ACCEPT) { +#ifdef CONFIG_NETFILTER_DEBUG + if (unlikely((verdict & NF_VERDICT_MASK) + > NF_MAX_VERDICT)) { + NFDEBUG("Evil return from %p(%u).\n", + elem->hook, hook); + continue; + } +#endif + if (verdict != NF_REPEAT) + return verdict; + goto repeat; + } + } + return NF_ACCEPT; +} + + +/* Returns 1 if okfn() needs to be executed by the caller, + * -EPERM for NF_DROP, 0 otherwise. 
*/ +int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), + int hook_thresh) +{ + struct list_head *elem; + unsigned int verdict; + int ret = 0; + + /* We may already have this, but read-locks nest anyway */ + rcu_read_lock(); + + elem = &nf_hooks[pf][hook]; +next_hook: + verdict = nf_iterate(&nf_hooks[pf][hook], skb, hook, indev, + outdev, &elem, okfn, hook_thresh); + if (verdict == NF_ACCEPT || verdict == NF_STOP) { + ret = 1; + } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) { + kfree_skb(skb); + ret = NF_DROP_GETERR(verdict); + if (ret == 0) + ret = -EPERM; + } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { + ret = nf_queue(skb, elem, pf, hook, indev, outdev, okfn, + verdict >> NF_VERDICT_QBITS); + if (ret < 0) { + if (ret == -ECANCELED) + goto next_hook; + if (ret == -ESRCH && + (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) + goto next_hook; + kfree_skb(skb); + } + ret = 0; + } + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(nf_hook_slow); + + +int skb_make_writable(struct sk_buff *skb, unsigned int writable_len) +{ + if (writable_len > skb->len) + return 0; + + /* Not exclusive use of packet? Must copy. */ + if (!skb_cloned(skb)) { + if (writable_len <= skb_headlen(skb)) + return 1; + } else if (skb_clone_writable(skb, writable_len)) + return 1; + + if (writable_len <= skb_headlen(skb)) + writable_len = 0; + else + writable_len -= skb_headlen(skb); + + return !!__pskb_pull_tail(skb, writable_len); +} +EXPORT_SYMBOL(skb_make_writable); + +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +/* This does not belong here, but locally generated errors need it if connection + tracking in use: without this, connection may not be in hash table, and hence + manufactured ICMP or RST packets will not be associated with it. 
*/ +void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *) __rcu __read_mostly; +EXPORT_SYMBOL(ip_ct_attach); + +void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) +{ + void (*attach)(struct sk_buff *, struct sk_buff *); + + if (skb->nfct) { + rcu_read_lock(); + attach = rcu_dereference(ip_ct_attach); + if (attach) + attach(new, skb); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(nf_ct_attach); + +void (*nf_ct_destroy)(struct nf_conntrack *) __rcu __read_mostly; +EXPORT_SYMBOL(nf_ct_destroy); + +void nf_conntrack_destroy(struct nf_conntrack *nfct) +{ + void (*destroy)(struct nf_conntrack *); + + rcu_read_lock(); + destroy = rcu_dereference(nf_ct_destroy); + BUG_ON(destroy == NULL); + destroy(nfct); + rcu_read_unlock(); +} +EXPORT_SYMBOL(nf_conntrack_destroy); +#endif /* CONFIG_NF_CONNTRACK */ + +#ifdef CONFIG_PROC_FS +struct proc_dir_entry *proc_net_netfilter; +EXPORT_SYMBOL(proc_net_netfilter); +#endif + +void __init netfilter_init(void) +{ + int i, h; + for (i = 0; i < ARRAY_SIZE(nf_hooks); i++) { + for (h = 0; h < NF_MAX_HOOKS; h++) + INIT_LIST_HEAD(&nf_hooks[i][h]); + } + +#ifdef CONFIG_PROC_FS + proc_net_netfilter = proc_mkdir("netfilter", init_net.proc_net); + if (!proc_net_netfilter) + panic("cannot create netfilter proc entry"); +#endif + + if (netfilter_queue_init() < 0) + panic("cannot initialize nf_queue"); + if (netfilter_log_init() < 0) + panic("cannot initialize nf_log"); +} + +#ifdef CONFIG_SYSCTL +struct ctl_path nf_net_netfilter_sysctl_path[] = { + { .procname = "net", }, + { .procname = "netfilter", }, + { } +}; +EXPORT_SYMBOL_GPL(nf_net_netfilter_sysctl_path); +#endif /* CONFIG_SYSCTL */ diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig new file mode 100644 index 00000000..2c5b348e --- /dev/null +++ b/net/netfilter/ipset/Kconfig @@ -0,0 +1,122 @@ +menuconfig IP_SET + tristate "IP set support" + depends on INET && NETFILTER + depends on NETFILTER_NETLINK + help + This option adds IP set support to the kernel. + In order to define and use the sets, you need the userspace utility + ipset(8). You can use the sets in netfilter via the "set" match + and "SET" target. + + To compile it as a module, choose M here. If unsure, say N. + +if IP_SET + +config IP_SET_MAX + int "Maximum number of IP sets" + default 256 + range 2 65534 + depends on IP_SET + help + You can define here default value of the maximum number + of IP sets for the kernel. + + The value can be overriden by the 'max_sets' module + parameter of the 'ip_set' module. + +config IP_SET_BITMAP_IP + tristate "bitmap:ip set support" + depends on IP_SET + help + This option adds the bitmap:ip set type support, by which one + can store IPv4 addresses (or network addresse) from a range. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_BITMAP_IPMAC + tristate "bitmap:ip,mac set support" + depends on IP_SET + help + This option adds the bitmap:ip,mac set type support, by which one + can store IPv4 address and (source) MAC address pairs from a range. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_BITMAP_PORT + tristate "bitmap:port set support" + depends on IP_SET + help + This option adds the bitmap:port set type support, by which one + can store TCP/UDP port numbers from a range. + + To compile it as a module, choose M here. If unsure, say N. 
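
The bitmap:* set types configured above (and implemented in ip_set_bitmap_ip.c later in this patch) all rest on one mapping: an address inside a fixed [first, last] range becomes one bit of a bitmap, so add, test and delete are single bit operations. A userspace sketch of that mapping follows, with invented names and none of the timeout, netmask or netlink handling.

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

struct bitmap_set {
        uint32_t first, last;   /* host byte order, both inclusive */
        uint8_t *bits;
};

static struct bitmap_set *bitmap_create(uint32_t first, uint32_t last)
{
        struct bitmap_set *s = malloc(sizeof(*s));

        s->first = first;
        s->last = last;
        s->bits = calloc((last - first) / 8 + 1, 1);
        return s;
}

static int in_range(const struct bitmap_set *s, uint32_t ip)
{
        return ip >= s->first && ip <= s->last;
}

static void bitmap_add(struct bitmap_set *s, uint32_t ip)
{
        if (!in_range(s, ip))
                return;
        uint32_t id = ip - s->first;            /* address -> bit index */
        s->bits[id / 8] |= 1u << (id % 8);
}

static int bitmap_test(const struct bitmap_set *s, uint32_t ip)
{
        if (!in_range(s, ip))
                return 0;
        uint32_t id = ip - s->first;
        return (s->bits[id / 8] >> (id % 8)) & 1u;
}

int main(void)
{
        /* 192.168.0.0 - 192.168.0.255 */
        struct bitmap_set *s = bitmap_create(0xc0a80000, 0xc0a800ff);

        bitmap_add(s, 0xc0a80001);
        printf("192.168.0.1 in set: %d\n", bitmap_test(s, 0xc0a80001));
        printf("192.168.0.2 in set: %d\n", bitmap_test(s, 0xc0a80002));
        free(s->bits);
        free(s);
        return 0;
}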
+ +config IP_SET_HASH_IP + tristate "hash:ip set support" + depends on IP_SET + help + This option adds the hash:ip set type support, by which one + can store arbitrary IPv4 or IPv6 addresses (or network addresses) + in a set. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_HASH_IPPORT + tristate "hash:ip,port set support" + depends on IP_SET + help + This option adds the hash:ip,port set type support, by which one + can store IPv4/IPv6 address and protocol/port pairs. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_HASH_IPPORTIP + tristate "hash:ip,port,ip set support" + depends on IP_SET + help + This option adds the hash:ip,port,ip set type support, by which + one can store IPv4/IPv6 address, protocol/port, and IPv4/IPv6 + address triples in a set. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_HASH_IPPORTNET + tristate "hash:ip,port,net set support" + depends on IP_SET + help + This option adds the hash:ip,port,net set type support, by which + one can store IPv4/IPv6 address, protocol/port, and IPv4/IPv6 + network address/prefix triples in a set. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_HASH_NET + tristate "hash:net set support" + depends on IP_SET + help + This option adds the hash:net set type support, by which + one can store IPv4/IPv6 network address/prefix elements in a set. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_HASH_NETPORT + tristate "hash:net,port set support" + depends on IP_SET + help + This option adds the hash:net,port set type support, by which + one can store IPv4/IPv6 network address/prefix and + protocol/port pairs as elements in a set. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_LIST_SET + tristate "list:set set support" + depends on IP_SET + help + This option adds the list:set set type support. In this + kind of set one can store the name of other sets and it forms + an ordered union of the member sets. + + To compile it as a module, choose M here. If unsure, say N. 
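
The hash:* set types above trade the bitmaps' fixed range for a hash table, so arbitrary addresses or address/port tuples can be stored. The sketch below only shows how an (ip, proto, port) element could be folded into a bucket index; the kernel sets use a Jenkins hash (jhash) and resizable tables, which are not reproduced here, and all names are invented.

#include <stdint.h>
#include <stdio.h>

#define HBITS    8
#define HBUCKETS (1u << HBITS)

struct ipport { uint32_t ip; uint8_t proto; uint16_t port; };

/* Multiplicative hash of the tuple, reduced to HBITS bucket bits. */
static uint32_t hash_ipport(const struct ipport *e)
{
        uint32_t h = e->ip;

        h = h * 2654435761u + (((uint32_t)e->proto << 16) | e->port);
        h *= 2654435761u;
        return h >> (32 - HBITS);
}

int main(void)
{
        struct ipport a = { 0xc0a80001, 6, 80 };    /* 192.168.0.1 tcp/80 */
        struct ipport b = { 0xc0a80001, 6, 443 };   /* same ip, other port */

        printf("bucket(a) = %u of %u\n", hash_ipport(&a), HBUCKETS);
        printf("bucket(b) = %u of %u\n", hash_ipport(&b), HBUCKETS);
        return 0;
}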
+ +endif # IP_SET diff --git a/net/netfilter/ipset/Makefile b/net/netfilter/ipset/Makefile new file mode 100644 index 00000000..5adbdab6 --- /dev/null +++ b/net/netfilter/ipset/Makefile @@ -0,0 +1,24 @@ +# +# Makefile for the ipset modules +# + +ip_set-y := ip_set_core.o ip_set_getport.o pfxlen.o + +# ipset core +obj-$(CONFIG_IP_SET) += ip_set.o + +# bitmap types +obj-$(CONFIG_IP_SET_BITMAP_IP) += ip_set_bitmap_ip.o +obj-$(CONFIG_IP_SET_BITMAP_IPMAC) += ip_set_bitmap_ipmac.o +obj-$(CONFIG_IP_SET_BITMAP_PORT) += ip_set_bitmap_port.o + +# hash types +obj-$(CONFIG_IP_SET_HASH_IP) += ip_set_hash_ip.o +obj-$(CONFIG_IP_SET_HASH_IPPORT) += ip_set_hash_ipport.o +obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o +obj-$(CONFIG_IP_SET_HASH_IPPORTNET) += ip_set_hash_ipportnet.o +obj-$(CONFIG_IP_SET_HASH_NET) += ip_set_hash_net.o +obj-$(CONFIG_IP_SET_HASH_NETPORT) += ip_set_hash_netport.o + +# list types +obj-$(CONFIG_IP_SET_LIST_SET) += ip_set_list_set.o diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c new file mode 100644 index 00000000..ba2d1660 --- /dev/null +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -0,0 +1,586 @@ +/* Copyright (C) 2000-2002 Joakim Axelsson + * Patrick Schaaf + * Copyright (C) 2003-2011 Jozsef Kadlecsik + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module implementing an IP set type: the bitmap:ip type */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#define IP_SET_BITMAP_TIMEOUT +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_DESCRIPTION("bitmap:ip type of IP sets"); +MODULE_ALIAS("ip_set_bitmap:ip"); + +/* Type structure */ +struct bitmap_ip { + void *members; /* the set members */ + u32 first_ip; /* host byte order, included in range */ + u32 last_ip; /* host byte order, included in range */ + u32 elements; /* number of max elements in the set */ + u32 hosts; /* number of hosts in a subnet */ + size_t memsize; /* members size */ + u8 netmask; /* subnet netmask */ + u32 timeout; /* timeout parameter */ + struct timer_list gc; /* garbage collection */ +}; + +/* Base variant */ + +static inline u32 +ip_to_id(const struct bitmap_ip *m, u32 ip) +{ + return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip)/m->hosts; +} + +static int +bitmap_ip_test(struct ip_set *set, void *value, u32 timeout) +{ + const struct bitmap_ip *map = set->data; + u16 id = *(u16 *)value; + + return !!test_bit(id, map->members); +} + +static int +bitmap_ip_add(struct ip_set *set, void *value, u32 timeout) +{ + struct bitmap_ip *map = set->data; + u16 id = *(u16 *)value; + + if (test_and_set_bit(id, map->members)) + return -IPSET_ERR_EXIST; + + return 0; +} + +static int +bitmap_ip_del(struct ip_set *set, void *value, u32 timeout) +{ + struct bitmap_ip *map = set->data; + u16 id = *(u16 *)value; + + if (!test_and_clear_bit(id, map->members)) + return -IPSET_ERR_EXIST; + + return 0; +} + +static int +bitmap_ip_list(const struct ip_set *set, + struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct bitmap_ip *map = set->data; + struct nlattr *atd, *nested; + u32 id, first = cb->args[2]; + + atd = ipset_nest_start(skb, IPSET_ATTR_ADT); + if (!atd) + return -EMSGSIZE; + for (; cb->args[2] < map->elements; cb->args[2]++) { + id = cb->args[2]; + if 
(!test_bit(id, map->members)) + continue; + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) { + if (id == first) { + nla_nest_cancel(skb, atd); + return -EMSGSIZE; + } else + goto nla_put_failure; + } + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, + htonl(map->first_ip + id * map->hosts)); + ipset_nest_end(skb, nested); + } + ipset_nest_end(skb, atd); + /* Set listing finished */ + cb->args[2] = 0; + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nested); + ipset_nest_end(skb, atd); + if (unlikely(id == first)) { + cb->args[2] = 0; + return -EMSGSIZE; + } + return 0; +} + +/* Timeout variant */ + +static int +bitmap_ip_ttest(struct ip_set *set, void *value, u32 timeout) +{ + const struct bitmap_ip *map = set->data; + const unsigned long *members = map->members; + u16 id = *(u16 *)value; + + return ip_set_timeout_test(members[id]); +} + +static int +bitmap_ip_tadd(struct ip_set *set, void *value, u32 timeout) +{ + struct bitmap_ip *map = set->data; + unsigned long *members = map->members; + u16 id = *(u16 *)value; + + if (ip_set_timeout_test(members[id])) + return -IPSET_ERR_EXIST; + + members[id] = ip_set_timeout_set(timeout); + + return 0; +} + +static int +bitmap_ip_tdel(struct ip_set *set, void *value, u32 timeout) +{ + struct bitmap_ip *map = set->data; + unsigned long *members = map->members; + u16 id = *(u16 *)value; + int ret = -IPSET_ERR_EXIST; + + if (ip_set_timeout_test(members[id])) + ret = 0; + + members[id] = IPSET_ELEM_UNSET; + return ret; +} + +static int +bitmap_ip_tlist(const struct ip_set *set, + struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct bitmap_ip *map = set->data; + struct nlattr *adt, *nested; + u32 id, first = cb->args[2]; + const unsigned long *members = map->members; + + adt = ipset_nest_start(skb, IPSET_ATTR_ADT); + if (!adt) + return -EMSGSIZE; + for (; cb->args[2] < map->elements; cb->args[2]++) { + id = cb->args[2]; + if (!ip_set_timeout_test(members[id])) + continue; + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) { + if (id == first) { + nla_nest_cancel(skb, adt); + return -EMSGSIZE; + } else + goto nla_put_failure; + } + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, + htonl(map->first_ip + id * map->hosts)); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(members[id]))); + ipset_nest_end(skb, nested); + } + ipset_nest_end(skb, adt); + + /* Set listing finished */ + cb->args[2] = 0; + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nested); + ipset_nest_end(skb, adt); + if (unlikely(id == first)) { + cb->args[2] = 0; + return -EMSGSIZE; + } + return 0; +} + +static int +bitmap_ip_kadt(struct ip_set *set, const struct sk_buff *skb, + enum ipset_adt adt, u8 pf, u8 dim, u8 flags) +{ + struct bitmap_ip *map = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + u32 ip; + + ip = ntohl(ip4addr(skb, flags & IPSET_DIM_ONE_SRC)); + if (ip < map->first_ip || ip > map->last_ip) + return -IPSET_ERR_BITMAP_RANGE; + + ip = ip_to_id(map, ip); + + return adtfn(set, &ip, map->timeout); +} + +static int +bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags) +{ + struct bitmap_ip *map = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + u32 timeout = map->timeout; + u32 ip, ip_to, id; + int ret = 0; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = 
ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + if (ip < map->first_ip || ip > map->last_ip) + return -IPSET_ERR_BITMAP_RANGE; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(map->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + if (adt == IPSET_TEST) { + id = ip_to_id(map, ip); + return adtfn(set, &id, timeout); + } + + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip > ip_to) { + swap(ip, ip_to); + if (ip < map->first_ip) + return -IPSET_ERR_BITMAP_RANGE; + } + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr > 32) + return -IPSET_ERR_INVALID_CIDR; + ip &= ip_set_hostmask(cidr); + ip_to = ip | ~ip_set_hostmask(cidr); + } else + ip_to = ip; + + if (ip_to > map->last_ip) + return -IPSET_ERR_BITMAP_RANGE; + + for (; !before(ip_to, ip); ip += map->hosts) { + id = ip_to_id(map, ip); + ret = adtfn(set, &id, timeout); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +static void +bitmap_ip_destroy(struct ip_set *set) +{ + struct bitmap_ip *map = set->data; + + if (with_timeout(map->timeout)) + del_timer_sync(&map->gc); + + ip_set_free(map->members); + kfree(map); + + set->data = NULL; +} + +static void +bitmap_ip_flush(struct ip_set *set) +{ + struct bitmap_ip *map = set->data; + + memset(map->members, 0, map->memsize); +} + +static int +bitmap_ip_head(struct ip_set *set, struct sk_buff *skb) +{ + const struct bitmap_ip *map = set->data; + struct nlattr *nested; + + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) + goto nla_put_failure; + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, htonl(map->first_ip)); + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)); + if (map->netmask != 32) + NLA_PUT_U8(skb, IPSET_ATTR_NETMASK, map->netmask); + NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)); + NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, + htonl(sizeof(*map) + map->memsize)); + if (with_timeout(map->timeout)) + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout)); + ipset_nest_end(skb, nested); + + return 0; +nla_put_failure: + return -EMSGSIZE; +} + +static bool +bitmap_ip_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct bitmap_ip *x = a->data; + const struct bitmap_ip *y = b->data; + + return x->first_ip == y->first_ip && + x->last_ip == y->last_ip && + x->netmask == y->netmask && + x->timeout == y->timeout; +} + +static const struct ip_set_type_variant bitmap_ip = { + .kadt = bitmap_ip_kadt, + .uadt = bitmap_ip_uadt, + .adt = { + [IPSET_ADD] = bitmap_ip_add, + [IPSET_DEL] = bitmap_ip_del, + [IPSET_TEST] = bitmap_ip_test, + }, + .destroy = bitmap_ip_destroy, + .flush = bitmap_ip_flush, + .head = bitmap_ip_head, + .list = bitmap_ip_list, + .same_set = bitmap_ip_same_set, +}; + +static const struct ip_set_type_variant bitmap_tip = { + .kadt = bitmap_ip_kadt, + .uadt = bitmap_ip_uadt, + .adt = { + [IPSET_ADD] = bitmap_ip_tadd, + [IPSET_DEL] = bitmap_ip_tdel, + [IPSET_TEST] = bitmap_ip_ttest, + }, + .destroy = bitmap_ip_destroy, + .flush = bitmap_ip_flush, + .head = bitmap_ip_head, + .list = bitmap_ip_tlist, + .same_set = bitmap_ip_same_set, +}; + +static void +bitmap_ip_gc(unsigned long ul_set) +{ + struct ip_set *set = (struct ip_set *) ul_set; + struct bitmap_ip *map = set->data; + unsigned long *table = map->members; + u32 id; + + /* We run parallel with other readers (test element) + * 
but adding/deleting new entries is locked out */ + read_lock_bh(&set->lock); + for (id = 0; id < map->elements; id++) + if (ip_set_timeout_expired(table[id])) + table[id] = IPSET_ELEM_UNSET; + read_unlock_bh(&set->lock); + + map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; + add_timer(&map->gc); +} + +static void +bitmap_ip_gc_init(struct ip_set *set) +{ + struct bitmap_ip *map = set->data; + + init_timer(&map->gc); + map->gc.data = (unsigned long) set; + map->gc.function = bitmap_ip_gc; + map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; + add_timer(&map->gc); +} + +/* Create bitmap:ip type of sets */ + +static bool +init_map_ip(struct ip_set *set, struct bitmap_ip *map, + u32 first_ip, u32 last_ip, + u32 elements, u32 hosts, u8 netmask) +{ + map->members = ip_set_alloc(map->memsize); + if (!map->members) + return false; + map->first_ip = first_ip; + map->last_ip = last_ip; + map->elements = elements; + map->hosts = hosts; + map->netmask = netmask; + map->timeout = IPSET_NO_TIMEOUT; + + set->data = map; + set->family = AF_INET; + + return true; +} + +static int +bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags) +{ + struct bitmap_ip *map; + u32 first_ip, last_ip, hosts, elements; + u8 netmask = 32; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip); + if (ret) + return ret; + + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip); + if (ret) + return ret; + if (first_ip > last_ip) { + u32 tmp = first_ip; + + first_ip = last_ip; + last_ip = tmp; + } + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr >= 32) + return -IPSET_ERR_INVALID_CIDR; + last_ip = first_ip | ~ip_set_hostmask(cidr); + } else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_NETMASK]) { + netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); + + if (netmask > 32) + return -IPSET_ERR_INVALID_NETMASK; + + first_ip &= ip_set_hostmask(netmask); + last_ip |= ~ip_set_hostmask(netmask); + } + + if (netmask == 32) { + hosts = 1; + elements = last_ip - first_ip + 1; + } else { + u8 mask_bits; + u32 mask; + + mask = range_to_mask(first_ip, last_ip, &mask_bits); + + if ((!mask && (first_ip || last_ip != 0xFFFFFFFF)) || + netmask <= mask_bits) + return -IPSET_ERR_BITMAP_RANGE; + + pr_debug("mask_bits %u, netmask %u\n", mask_bits, netmask); + hosts = 2 << (32 - netmask - 1); + elements = 2 << (netmask - mask_bits - 1); + } + if (elements > IPSET_BITMAP_MAX_RANGE + 1) + return -IPSET_ERR_BITMAP_RANGE_SIZE; + + pr_debug("hosts %u, elements %u\n", hosts, elements); + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (!map) + return -ENOMEM; + + if (tb[IPSET_ATTR_TIMEOUT]) { + map->memsize = elements * sizeof(unsigned long); + + if (!init_map_ip(set, map, first_ip, last_ip, + elements, hosts, netmask)) { + kfree(map); + return -ENOMEM; + } + + map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + set->variant = &bitmap_tip; + + bitmap_ip_gc_init(set); + } else { + map->memsize = bitmap_bytes(0, elements - 1); + + if (!init_map_ip(set, map, first_ip, last_ip, + elements, hosts, netmask)) { + kfree(map); + return -ENOMEM; + } + + set->variant = &bitmap_ip; + } + return 0; +} + +static struct ip_set_type bitmap_ip_type __read_mostly = { + .name = "bitmap:ip", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP, + .dimension = IPSET_DIM_ONE, + .family = AF_INET, + 
.revision = 0, + .create = bitmap_ip_create, + .create_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + }, + .me = THIS_MODULE, +}; + +static int __init +bitmap_ip_init(void) +{ + return ip_set_type_register(&bitmap_ip_type); +} + +static void __exit +bitmap_ip_fini(void) +{ + ip_set_type_unregister(&bitmap_ip_type); +} + +module_init(bitmap_ip_init); +module_exit(bitmap_ip_fini); diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c new file mode 100644 index 00000000..a274300b --- /dev/null +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -0,0 +1,655 @@ +/* Copyright (C) 2000-2002 Joakim Axelsson + * Patrick Schaaf + * Martin Josefsson + * Copyright (C) 2003-2011 Jozsef Kadlecsik + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module implementing an IP set type: the bitmap:ip,mac type */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_DESCRIPTION("bitmap:ip,mac type of IP sets"); +MODULE_ALIAS("ip_set_bitmap:ip,mac"); + +enum { + MAC_EMPTY, /* element is not set */ + MAC_FILLED, /* element is set with MAC */ + MAC_UNSET, /* element is set, without MAC */ +}; + +/* Type structure */ +struct bitmap_ipmac { + void *members; /* the set members */ + u32 first_ip; /* host byte order, included in range */ + u32 last_ip; /* host byte order, included in range */ + u32 timeout; /* timeout value */ + struct timer_list gc; /* garbage collector */ + size_t dsize; /* size of element */ +}; + +/* ADT structure for generic function args */ +struct ipmac { + u32 id; /* id in array */ + unsigned char *ether; /* ethernet address */ +}; + +/* Member element without and with timeout */ + +struct ipmac_elem { + unsigned char ether[ETH_ALEN]; + unsigned char match; +} __attribute__ ((aligned)); + +struct ipmac_telem { + unsigned char ether[ETH_ALEN]; + unsigned char match; + unsigned long timeout; +} __attribute__ ((aligned)); + +static inline void * +bitmap_ipmac_elem(const struct bitmap_ipmac *map, u32 id) +{ + return (void *)((char *)map->members + id * map->dsize); +} + +static inline bool +bitmap_timeout(const struct bitmap_ipmac *map, u32 id) +{ + const struct ipmac_telem *elem = bitmap_ipmac_elem(map, id); + + return ip_set_timeout_test(elem->timeout); +} + +static inline bool +bitmap_expired(const struct bitmap_ipmac *map, u32 id) +{ + const struct ipmac_telem *elem = bitmap_ipmac_elem(map, id); + + return ip_set_timeout_expired(elem->timeout); +} + +static inline int +bitmap_ipmac_exist(const struct ipmac_telem *elem) +{ + return elem->match == MAC_UNSET || + (elem->match == MAC_FILLED && + !ip_set_timeout_expired(elem->timeout)); +} + +/* Base variant */ + +static int +bitmap_ipmac_test(struct ip_set *set, void *value, u32 timeout) +{ + const struct bitmap_ipmac *map = set->data; + const struct ipmac 
*data = value; + const struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id); + + switch (elem->match) { + case MAC_UNSET: + /* Trigger kernel to fill out the ethernet address */ + return -EAGAIN; + case MAC_FILLED: + return data->ether == NULL || + compare_ether_addr(data->ether, elem->ether) == 0; + } + return 0; +} + +static int +bitmap_ipmac_add(struct ip_set *set, void *value, u32 timeout) +{ + struct bitmap_ipmac *map = set->data; + const struct ipmac *data = value; + struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id); + + switch (elem->match) { + case MAC_UNSET: + if (!data->ether) + /* Already added without ethernet address */ + return -IPSET_ERR_EXIST; + /* Fill the MAC address */ + memcpy(elem->ether, data->ether, ETH_ALEN); + elem->match = MAC_FILLED; + break; + case MAC_FILLED: + return -IPSET_ERR_EXIST; + case MAC_EMPTY: + if (data->ether) { + memcpy(elem->ether, data->ether, ETH_ALEN); + elem->match = MAC_FILLED; + } else + elem->match = MAC_UNSET; + } + + return 0; +} + +static int +bitmap_ipmac_del(struct ip_set *set, void *value, u32 timeout) +{ + struct bitmap_ipmac *map = set->data; + const struct ipmac *data = value; + struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id); + + if (elem->match == MAC_EMPTY) + return -IPSET_ERR_EXIST; + + elem->match = MAC_EMPTY; + + return 0; +} + +static int +bitmap_ipmac_list(const struct ip_set *set, + struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct bitmap_ipmac *map = set->data; + const struct ipmac_elem *elem; + struct nlattr *atd, *nested; + u32 id, first = cb->args[2]; + u32 last = map->last_ip - map->first_ip; + + atd = ipset_nest_start(skb, IPSET_ATTR_ADT); + if (!atd) + return -EMSGSIZE; + for (; cb->args[2] <= last; cb->args[2]++) { + id = cb->args[2]; + elem = bitmap_ipmac_elem(map, id); + if (elem->match == MAC_EMPTY) + continue; + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) { + if (id == first) { + nla_nest_cancel(skb, atd); + return -EMSGSIZE; + } else + goto nla_put_failure; + } + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, + htonl(map->first_ip + id)); + if (elem->match == MAC_FILLED) + NLA_PUT(skb, IPSET_ATTR_ETHER, ETH_ALEN, + elem->ether); + ipset_nest_end(skb, nested); + } + ipset_nest_end(skb, atd); + /* Set listing finished */ + cb->args[2] = 0; + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nested); + ipset_nest_end(skb, atd); + if (unlikely(id == first)) { + cb->args[2] = 0; + return -EMSGSIZE; + } + return 0; +} + +/* Timeout variant */ + +static int +bitmap_ipmac_ttest(struct ip_set *set, void *value, u32 timeout) +{ + const struct bitmap_ipmac *map = set->data; + const struct ipmac *data = value; + const struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id); + + switch (elem->match) { + case MAC_UNSET: + /* Trigger kernel to fill out the ethernet address */ + return -EAGAIN; + case MAC_FILLED: + return (data->ether == NULL || + compare_ether_addr(data->ether, elem->ether) == 0) && + !bitmap_expired(map, data->id); + } + return 0; +} + +static int +bitmap_ipmac_tadd(struct ip_set *set, void *value, u32 timeout) +{ + struct bitmap_ipmac *map = set->data; + const struct ipmac *data = value; + struct ipmac_telem *elem = bitmap_ipmac_elem(map, data->id); + + switch (elem->match) { + case MAC_UNSET: + if (!data->ether) + /* Already added without ethernet address */ + return -IPSET_ERR_EXIST; + /* Fill the MAC address and activate the timer */ + memcpy(elem->ether, data->ether, ETH_ALEN); + elem->match = MAC_FILLED; + if (timeout == map->timeout) + /* 
Timeout was not specified, get stored one */ + timeout = elem->timeout; + elem->timeout = ip_set_timeout_set(timeout); + break; + case MAC_FILLED: + if (!bitmap_expired(map, data->id)) + return -IPSET_ERR_EXIST; + /* Fall through */ + case MAC_EMPTY: + if (data->ether) { + memcpy(elem->ether, data->ether, ETH_ALEN); + elem->match = MAC_FILLED; + } else + elem->match = MAC_UNSET; + /* If MAC is unset yet, we store plain timeout value + * because the timer is not activated yet + * and we can reuse it later when MAC is filled out, + * possibly by the kernel */ + elem->timeout = data->ether ? ip_set_timeout_set(timeout) + : timeout; + break; + } + + return 0; +} + +static int +bitmap_ipmac_tdel(struct ip_set *set, void *value, u32 timeout) +{ + struct bitmap_ipmac *map = set->data; + const struct ipmac *data = value; + struct ipmac_telem *elem = bitmap_ipmac_elem(map, data->id); + + if (elem->match == MAC_EMPTY || bitmap_expired(map, data->id)) + return -IPSET_ERR_EXIST; + + elem->match = MAC_EMPTY; + + return 0; +} + +static int +bitmap_ipmac_tlist(const struct ip_set *set, + struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct bitmap_ipmac *map = set->data; + const struct ipmac_telem *elem; + struct nlattr *atd, *nested; + u32 id, first = cb->args[2]; + u32 timeout, last = map->last_ip - map->first_ip; + + atd = ipset_nest_start(skb, IPSET_ATTR_ADT); + if (!atd) + return -EMSGSIZE; + for (; cb->args[2] <= last; cb->args[2]++) { + id = cb->args[2]; + elem = bitmap_ipmac_elem(map, id); + if (!bitmap_ipmac_exist(elem)) + continue; + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) { + if (id == first) { + nla_nest_cancel(skb, atd); + return -EMSGSIZE; + } else + goto nla_put_failure; + } + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, + htonl(map->first_ip + id)); + if (elem->match == MAC_FILLED) + NLA_PUT(skb, IPSET_ATTR_ETHER, ETH_ALEN, + elem->ether); + timeout = elem->match == MAC_UNSET ? 
elem->timeout + : ip_set_timeout_get(elem->timeout); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(timeout)); + ipset_nest_end(skb, nested); + } + ipset_nest_end(skb, atd); + /* Set listing finished */ + cb->args[2] = 0; + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nested); + ipset_nest_end(skb, atd); + return -EMSGSIZE; +} + +static int +bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, + enum ipset_adt adt, u8 pf, u8 dim, u8 flags) +{ + struct bitmap_ipmac *map = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct ipmac data; + + /* MAC can be src only */ + if (!(flags & IPSET_DIM_TWO_SRC)) + return 0; + + data.id = ntohl(ip4addr(skb, flags & IPSET_DIM_ONE_SRC)); + if (data.id < map->first_ip || data.id > map->last_ip) + return -IPSET_ERR_BITMAP_RANGE; + + /* Backward compatibility: we don't check the second flag */ + if (skb_mac_header(skb) < skb->head || + (skb_mac_header(skb) + ETH_HLEN) > skb->data) + return -EINVAL; + + data.id -= map->first_ip; + data.ether = eth_hdr(skb)->h_source; + + return adtfn(set, &data, map->timeout); +} + +static int +bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags) +{ + const struct bitmap_ipmac *map = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct ipmac data; + u32 timeout = map->timeout; + int ret = 0; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &data.id); + if (ret) + return ret; + + if (data.id < map->first_ip || data.id > map->last_ip) + return -IPSET_ERR_BITMAP_RANGE; + + if (tb[IPSET_ATTR_ETHER]) + data.ether = nla_data(tb[IPSET_ATTR_ETHER]); + else + data.ether = NULL; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(map->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + data.id -= map->first_ip; + + ret = adtfn(set, &data, timeout); + + return ip_set_eexist(ret, flags) ? 
0 : ret; +} + +static void +bitmap_ipmac_destroy(struct ip_set *set) +{ + struct bitmap_ipmac *map = set->data; + + if (with_timeout(map->timeout)) + del_timer_sync(&map->gc); + + ip_set_free(map->members); + kfree(map); + + set->data = NULL; +} + +static void +bitmap_ipmac_flush(struct ip_set *set) +{ + struct bitmap_ipmac *map = set->data; + + memset(map->members, 0, + (map->last_ip - map->first_ip + 1) * map->dsize); +} + +static int +bitmap_ipmac_head(struct ip_set *set, struct sk_buff *skb) +{ + const struct bitmap_ipmac *map = set->data; + struct nlattr *nested; + + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) + goto nla_put_failure; + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, htonl(map->first_ip)); + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)); + NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)); + NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, + htonl(sizeof(*map) + + (map->last_ip - map->first_ip + 1) * map->dsize)); + if (with_timeout(map->timeout)) + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout)); + ipset_nest_end(skb, nested); + + return 0; +nla_put_failure: + return -EMSGSIZE; +} + +static bool +bitmap_ipmac_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct bitmap_ipmac *x = a->data; + const struct bitmap_ipmac *y = b->data; + + return x->first_ip == y->first_ip && + x->last_ip == y->last_ip && + x->timeout == y->timeout; +} + +static const struct ip_set_type_variant bitmap_ipmac = { + .kadt = bitmap_ipmac_kadt, + .uadt = bitmap_ipmac_uadt, + .adt = { + [IPSET_ADD] = bitmap_ipmac_add, + [IPSET_DEL] = bitmap_ipmac_del, + [IPSET_TEST] = bitmap_ipmac_test, + }, + .destroy = bitmap_ipmac_destroy, + .flush = bitmap_ipmac_flush, + .head = bitmap_ipmac_head, + .list = bitmap_ipmac_list, + .same_set = bitmap_ipmac_same_set, +}; + +static const struct ip_set_type_variant bitmap_tipmac = { + .kadt = bitmap_ipmac_kadt, + .uadt = bitmap_ipmac_uadt, + .adt = { + [IPSET_ADD] = bitmap_ipmac_tadd, + [IPSET_DEL] = bitmap_ipmac_tdel, + [IPSET_TEST] = bitmap_ipmac_ttest, + }, + .destroy = bitmap_ipmac_destroy, + .flush = bitmap_ipmac_flush, + .head = bitmap_ipmac_head, + .list = bitmap_ipmac_tlist, + .same_set = bitmap_ipmac_same_set, +}; + +static void +bitmap_ipmac_gc(unsigned long ul_set) +{ + struct ip_set *set = (struct ip_set *) ul_set; + struct bitmap_ipmac *map = set->data; + struct ipmac_telem *elem; + u32 id, last = map->last_ip - map->first_ip; + + /* We run parallel with other readers (test element) + * but adding/deleting new entries is locked out */ + read_lock_bh(&set->lock); + for (id = 0; id <= last; id++) { + elem = bitmap_ipmac_elem(map, id); + if (elem->match == MAC_FILLED && + ip_set_timeout_expired(elem->timeout)) + elem->match = MAC_EMPTY; + } + read_unlock_bh(&set->lock); + + map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; + add_timer(&map->gc); +} + +static void +bitmap_ipmac_gc_init(struct ip_set *set) +{ + struct bitmap_ipmac *map = set->data; + + init_timer(&map->gc); + map->gc.data = (unsigned long) set; + map->gc.function = bitmap_ipmac_gc; + map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; + add_timer(&map->gc); +} + +/* Create bitmap:ip,mac type of sets */ + +static bool +init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map, + u32 first_ip, u32 last_ip) +{ + map->members = ip_set_alloc((last_ip - first_ip + 1) * map->dsize); + if (!map->members) + return false; + map->first_ip = first_ip; + map->last_ip = last_ip; + map->timeout = IPSET_NO_TIMEOUT; + + 
set->data = map; + set->family = AF_INET; + + return true; +} + +static int +bitmap_ipmac_create(struct ip_set *set, struct nlattr *tb[], + u32 flags) +{ + u32 first_ip, last_ip, elements; + struct bitmap_ipmac *map; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip); + if (ret) + return ret; + + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip); + if (ret) + return ret; + if (first_ip > last_ip) { + u32 tmp = first_ip; + + first_ip = last_ip; + last_ip = tmp; + } + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr >= 32) + return -IPSET_ERR_INVALID_CIDR; + last_ip = first_ip | ~ip_set_hostmask(cidr); + } else + return -IPSET_ERR_PROTOCOL; + + elements = last_ip - first_ip + 1; + + if (elements > IPSET_BITMAP_MAX_RANGE + 1) + return -IPSET_ERR_BITMAP_RANGE_SIZE; + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (!map) + return -ENOMEM; + + if (tb[IPSET_ATTR_TIMEOUT]) { + map->dsize = sizeof(struct ipmac_telem); + + if (!init_map_ipmac(set, map, first_ip, last_ip)) { + kfree(map); + return -ENOMEM; + } + + map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + + set->variant = &bitmap_tipmac; + + bitmap_ipmac_gc_init(set); + } else { + map->dsize = sizeof(struct ipmac_elem); + + if (!init_map_ipmac(set, map, first_ip, last_ip)) { + kfree(map); + return -ENOMEM; + } + set->variant = &bitmap_ipmac; + + } + return 0; +} + +static struct ip_set_type bitmap_ipmac_type = { + .name = "bitmap:ip,mac", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_MAC, + .dimension = IPSET_DIM_TWO, + .family = AF_INET, + .revision = 0, + .create = bitmap_ipmac_create, + .create_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_ETHER] = { .type = NLA_BINARY, .len = ETH_ALEN }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + }, + .me = THIS_MODULE, +}; + +static int __init +bitmap_ipmac_init(void) +{ + return ip_set_type_register(&bitmap_ipmac_type); +} + +static void __exit +bitmap_ipmac_fini(void) +{ + ip_set_type_unregister(&bitmap_ipmac_type); +} + +module_init(bitmap_ipmac_init); +module_exit(bitmap_ipmac_fini); diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c new file mode 100644 index 00000000..6b38eb8f --- /dev/null +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -0,0 +1,514 @@ +/* Copyright (C) 2003-2011 Jozsef Kadlecsik + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the bitmap:port type */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#define IP_SET_BITMAP_TIMEOUT +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_DESCRIPTION("bitmap:port type of IP sets"); +MODULE_ALIAS("ip_set_bitmap:port"); + +/* Type structure */ +struct bitmap_port { + void *members; /* the set members */ + u16 first_port; /* host byte order, included in range */ + u16 last_port; /* host byte order, included in range */ + size_t memsize; /* members size */ + u32 timeout; /* timeout parameter */ + struct timer_list gc; /* garbage collection */ +}; + +/* Base variant */ + +static int +bitmap_port_test(struct ip_set *set, void *value, u32 timeout) +{ + const struct bitmap_port *map = set->data; + u16 id = *(u16 *)value; + + return !!test_bit(id, map->members); +} + +static int +bitmap_port_add(struct ip_set *set, void *value, u32 timeout) +{ + struct bitmap_port *map = set->data; + u16 id = *(u16 *)value; + + if (test_and_set_bit(id, map->members)) + return -IPSET_ERR_EXIST; + + return 0; +} + +static int +bitmap_port_del(struct ip_set *set, void *value, u32 timeout) +{ + struct bitmap_port *map = set->data; + u16 id = *(u16 *)value; + + if (!test_and_clear_bit(id, map->members)) + return -IPSET_ERR_EXIST; + + return 0; +} + +static int +bitmap_port_list(const struct ip_set *set, + struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct bitmap_port *map = set->data; + struct nlattr *atd, *nested; + u16 id, first = cb->args[2]; + u16 last = map->last_port - map->first_port; + + atd = ipset_nest_start(skb, IPSET_ATTR_ADT); + if (!atd) + return -EMSGSIZE; + for (; cb->args[2] <= last; cb->args[2]++) { + id = cb->args[2]; + if (!test_bit(id, map->members)) + continue; + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) { + if (id == first) { + nla_nest_cancel(skb, atd); + return -EMSGSIZE; + } else + goto nla_put_failure; + } + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, + htons(map->first_port + id)); + ipset_nest_end(skb, nested); + } + ipset_nest_end(skb, atd); + /* Set listing finished */ + cb->args[2] = 0; + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nested); + ipset_nest_end(skb, atd); + if (unlikely(id == first)) { + cb->args[2] = 0; + return -EMSGSIZE; + } + return 0; +} + +/* Timeout variant */ + +static int +bitmap_port_ttest(struct ip_set *set, void *value, u32 timeout) +{ + const struct bitmap_port *map = set->data; + const unsigned long *members = map->members; + u16 id = *(u16 *)value; + + return ip_set_timeout_test(members[id]); +} + +static int +bitmap_port_tadd(struct ip_set *set, void *value, u32 timeout) +{ + struct bitmap_port *map = set->data; + unsigned long *members = map->members; + u16 id = *(u16 *)value; + + if (ip_set_timeout_test(members[id])) + return -IPSET_ERR_EXIST; + + members[id] = ip_set_timeout_set(timeout); + + return 0; +} + +static int +bitmap_port_tdel(struct ip_set *set, void *value, u32 timeout) +{ + struct bitmap_port *map = set->data; + unsigned long *members = map->members; + u16 id = *(u16 *)value; + int ret = -IPSET_ERR_EXIST; + + if (ip_set_timeout_test(members[id])) + ret = 0; + + members[id] = IPSET_ELEM_UNSET; + return ret; +} + +static int +bitmap_port_tlist(const struct ip_set *set, + struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct bitmap_port *map = set->data; + struct nlattr *adt, *nested; + u16 id, first = cb->args[2]; + u16 last = 
map->last_port - map->first_port; + const unsigned long *members = map->members; + + adt = ipset_nest_start(skb, IPSET_ATTR_ADT); + if (!adt) + return -EMSGSIZE; + for (; cb->args[2] <= last; cb->args[2]++) { + id = cb->args[2]; + if (!ip_set_timeout_test(members[id])) + continue; + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) { + if (id == first) { + nla_nest_cancel(skb, adt); + return -EMSGSIZE; + } else + goto nla_put_failure; + } + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, + htons(map->first_port + id)); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(members[id]))); + ipset_nest_end(skb, nested); + } + ipset_nest_end(skb, adt); + + /* Set listing finished */ + cb->args[2] = 0; + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nested); + ipset_nest_end(skb, adt); + if (unlikely(id == first)) { + cb->args[2] = 0; + return -EMSGSIZE; + } + return 0; +} + +static int +bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb, + enum ipset_adt adt, u8 pf, u8 dim, u8 flags) +{ + struct bitmap_port *map = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + __be16 __port; + u16 port = 0; + + if (!ip_set_get_ip_port(skb, pf, flags & IPSET_DIM_ONE_SRC, &__port)) + return -EINVAL; + + port = ntohs(__port); + + if (port < map->first_port || port > map->last_port) + return -IPSET_ERR_BITMAP_RANGE; + + port -= map->first_port; + + return adtfn(set, &port, map->timeout); +} + +static int +bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags) +{ + struct bitmap_port *map = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + u32 timeout = map->timeout; + u32 port; /* wraparound */ + u16 id, port_to; + int ret = 0; + + if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + port = ip_set_get_h16(tb[IPSET_ATTR_PORT]); + if (port < map->first_port || port > map->last_port) + return -IPSET_ERR_BITMAP_RANGE; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(map->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + if (adt == IPSET_TEST) { + id = port - map->first_port; + return adtfn(set, &id, timeout); + } + + if (tb[IPSET_ATTR_PORT_TO]) { + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) { + swap(port, port_to); + if (port < map->first_port) + return -IPSET_ERR_BITMAP_RANGE; + } + } else + port_to = port; + + if (port_to > map->last_port) + return -IPSET_ERR_BITMAP_RANGE; + + for (; port <= port_to; port++) { + id = port - map->first_port; + ret = adtfn(set, &id, timeout); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +static void +bitmap_port_destroy(struct ip_set *set) +{ + struct bitmap_port *map = set->data; + + if (with_timeout(map->timeout)) + del_timer_sync(&map->gc); + + ip_set_free(map->members); + kfree(map); + + set->data = NULL; +} + +static void +bitmap_port_flush(struct ip_set *set) +{ + struct bitmap_port *map = set->data; + + memset(map->members, 0, map->memsize); +} + +static int +bitmap_port_head(struct ip_set *set, struct sk_buff *skb) +{ + const struct bitmap_port *map = set->data; + struct nlattr *nested; + + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) + goto nla_put_failure; + NLA_PUT_NET16(skb, 
IPSET_ATTR_PORT, htons(map->first_port)); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port)); + NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)); + NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, + htonl(sizeof(*map) + map->memsize)); + if (with_timeout(map->timeout)) + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout)); + ipset_nest_end(skb, nested); + + return 0; +nla_put_failure: + return -EMSGSIZE; +} + +static bool +bitmap_port_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct bitmap_port *x = a->data; + const struct bitmap_port *y = b->data; + + return x->first_port == y->first_port && + x->last_port == y->last_port && + x->timeout == y->timeout; +} + +static const struct ip_set_type_variant bitmap_port = { + .kadt = bitmap_port_kadt, + .uadt = bitmap_port_uadt, + .adt = { + [IPSET_ADD] = bitmap_port_add, + [IPSET_DEL] = bitmap_port_del, + [IPSET_TEST] = bitmap_port_test, + }, + .destroy = bitmap_port_destroy, + .flush = bitmap_port_flush, + .head = bitmap_port_head, + .list = bitmap_port_list, + .same_set = bitmap_port_same_set, +}; + +static const struct ip_set_type_variant bitmap_tport = { + .kadt = bitmap_port_kadt, + .uadt = bitmap_port_uadt, + .adt = { + [IPSET_ADD] = bitmap_port_tadd, + [IPSET_DEL] = bitmap_port_tdel, + [IPSET_TEST] = bitmap_port_ttest, + }, + .destroy = bitmap_port_destroy, + .flush = bitmap_port_flush, + .head = bitmap_port_head, + .list = bitmap_port_tlist, + .same_set = bitmap_port_same_set, +}; + +static void +bitmap_port_gc(unsigned long ul_set) +{ + struct ip_set *set = (struct ip_set *) ul_set; + struct bitmap_port *map = set->data; + unsigned long *table = map->members; + u32 id; /* wraparound */ + u16 last = map->last_port - map->first_port; + + /* We run parallel with other readers (test element) + * but adding/deleting new entries is locked out */ + read_lock_bh(&set->lock); + for (id = 0; id <= last; id++) + if (ip_set_timeout_expired(table[id])) + table[id] = IPSET_ELEM_UNSET; + read_unlock_bh(&set->lock); + + map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; + add_timer(&map->gc); +} + +static void +bitmap_port_gc_init(struct ip_set *set) +{ + struct bitmap_port *map = set->data; + + init_timer(&map->gc); + map->gc.data = (unsigned long) set; + map->gc.function = bitmap_port_gc; + map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; + add_timer(&map->gc); +} + +/* Create bitmap:ip type of sets */ + +static bool +init_map_port(struct ip_set *set, struct bitmap_port *map, + u16 first_port, u16 last_port) +{ + map->members = ip_set_alloc(map->memsize); + if (!map->members) + return false; + map->first_port = first_port; + map->last_port = last_port; + map->timeout = IPSET_NO_TIMEOUT; + + set->data = map; + set->family = AF_UNSPEC; + + return true; +} + +static int +bitmap_port_create(struct ip_set *set, struct nlattr *tb[], + u32 flags) +{ + struct bitmap_port *map; + u16 first_port, last_port; + + if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + first_port = ip_set_get_h16(tb[IPSET_ATTR_PORT]); + last_port = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (first_port > last_port) { + u16 tmp = first_port; + + first_port = last_port; + last_port = tmp; + } + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (!map) + return -ENOMEM; + + if (tb[IPSET_ATTR_TIMEOUT]) { + map->memsize = (last_port - first_port + 1) + * sizeof(unsigned 
long); + + if (!init_map_port(set, map, first_port, last_port)) { + kfree(map); + return -ENOMEM; + } + + map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + set->variant = &bitmap_tport; + + bitmap_port_gc_init(set); + } else { + map->memsize = bitmap_bytes(0, last_port - first_port); + pr_debug("memsize: %zu\n", map->memsize); + if (!init_map_port(set, map, first_port, last_port)) { + kfree(map); + return -ENOMEM; + } + + set->variant = &bitmap_port; + } + return 0; +} + +static struct ip_set_type bitmap_port_type = { + .name = "bitmap:port", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_PORT, + .dimension = IPSET_DIM_ONE, + .family = AF_UNSPEC, + .revision = 0, + .create = bitmap_port_create, + .create_policy = { + [IPSET_ATTR_PORT] = { .type = NLA_U16 }, + [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_PORT] = { .type = NLA_U16 }, + [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + }, + .me = THIS_MODULE, +}; + +static int __init +bitmap_port_init(void) +{ + return ip_set_type_register(&bitmap_port_type); +} + +static void __exit +bitmap_port_fini(void) +{ + ip_set_type_unregister(&bitmap_port_type); +} + +module_init(bitmap_port_init); +module_exit(bitmap_port_fini); diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c new file mode 100644 index 00000000..42aa64b6 --- /dev/null +++ b/net/netfilter/ipset/ip_set_core.c @@ -0,0 +1,1708 @@ +/* Copyright (C) 2000-2002 Joakim Axelsson + * Patrick Schaaf + * Copyright (C) 2003-2011 Jozsef Kadlecsik + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module for IP set management */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static LIST_HEAD(ip_set_type_list); /* all registered set types */ +static DEFINE_MUTEX(ip_set_type_mutex); /* protects ip_set_type_list */ +static DEFINE_RWLOCK(ip_set_ref_lock); /* protects the set refs */ + +static struct ip_set **ip_set_list; /* all individual sets */ +static ip_set_id_t ip_set_max = CONFIG_IP_SET_MAX; /* max number of sets */ + +#define STREQ(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0) + +static unsigned int max_sets; + +module_param(max_sets, int, 0600); +MODULE_PARM_DESC(max_sets, "maximal number of sets"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_DESCRIPTION("core IP set support"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); + +/* + * The set types are implemented in modules and registered set types + * can be found in ip_set_type_list. Adding/deleting types is + * serialized by ip_set_type_mutex. 
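+ * Lookups traverse the list under RCU protection, see find_set_type() below.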
+ */ + +static inline void +ip_set_type_lock(void) +{ + mutex_lock(&ip_set_type_mutex); +} + +static inline void +ip_set_type_unlock(void) +{ + mutex_unlock(&ip_set_type_mutex); +} + +/* Register and deregister settype */ + +static struct ip_set_type * +find_set_type(const char *name, u8 family, u8 revision) +{ + struct ip_set_type *type; + + list_for_each_entry_rcu(type, &ip_set_type_list, list) + if (STREQ(type->name, name) && + (type->family == family || type->family == AF_UNSPEC) && + type->revision == revision) + return type; + return NULL; +} + +/* Unlock, try to load a set type module and lock again */ +static int +try_to_load_type(const char *name) +{ + nfnl_unlock(); + pr_debug("try to load ip_set_%s\n", name); + if (request_module("ip_set_%s", name) < 0) { + pr_warning("Can't find ip_set type %s\n", name); + nfnl_lock(); + return -IPSET_ERR_FIND_TYPE; + } + nfnl_lock(); + return -EAGAIN; +} + +/* Find a set type and reference it */ +static int +find_set_type_get(const char *name, u8 family, u8 revision, + struct ip_set_type **found) +{ + struct ip_set_type *type; + int err; + + rcu_read_lock(); + *found = find_set_type(name, family, revision); + if (*found) { + err = !try_module_get((*found)->me) ? -EFAULT : 0; + goto unlock; + } + /* Make sure the type is loaded but we don't support the revision */ + list_for_each_entry_rcu(type, &ip_set_type_list, list) + if (STREQ(type->name, name)) { + err = -IPSET_ERR_FIND_TYPE; + goto unlock; + } + rcu_read_unlock(); + + return try_to_load_type(name); + +unlock: + rcu_read_unlock(); + return err; +} + +/* Find a given set type by name and family. + * If we succeeded, the supported minimal and maximum revisions are + * filled out. + */ +static int +find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max) +{ + struct ip_set_type *type; + bool found = false; + + *min = 255; *max = 0; + rcu_read_lock(); + list_for_each_entry_rcu(type, &ip_set_type_list, list) + if (STREQ(type->name, name) && + (type->family == family || type->family == AF_UNSPEC)) { + found = true; + if (type->revision < *min) + *min = type->revision; + if (type->revision > *max) + *max = type->revision; + } + rcu_read_unlock(); + if (found) + return 0; + + return try_to_load_type(name); +} + +#define family_name(f) ((f) == AF_INET ? "inet" : \ + (f) == AF_INET6 ? "inet6" : "any") + +/* Register a set type structure. The type is identified by + * the unique triple of name, family and revision. + */ +int +ip_set_type_register(struct ip_set_type *type) +{ + int ret = 0; + + if (type->protocol != IPSET_PROTOCOL) { + pr_warning("ip_set type %s, family %s, revision %u uses " + "wrong protocol version %u (want %u)\n", + type->name, family_name(type->family), + type->revision, type->protocol, IPSET_PROTOCOL); + return -EINVAL; + } + + ip_set_type_lock(); + if (find_set_type(type->name, type->family, type->revision)) { + /* Duplicate! */ + pr_warning("ip_set type %s, family %s, revision %u " + "already registered!\n", type->name, + family_name(type->family), type->revision); + ret = -EINVAL; + goto unlock; + } + list_add_rcu(&type->list, &ip_set_type_list); + pr_debug("type %s, family %s, revision %u registered.\n", + type->name, family_name(type->family), type->revision); +unlock: + ip_set_type_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(ip_set_type_register); + +/* Unregister a set type. 
There's a small race with ip_set_create */ +void +ip_set_type_unregister(struct ip_set_type *type) +{ + ip_set_type_lock(); + if (!find_set_type(type->name, type->family, type->revision)) { + pr_warning("ip_set type %s, family %s, revision %u " + "not registered\n", type->name, + family_name(type->family), type->revision); + goto unlock; + } + list_del_rcu(&type->list); + pr_debug("type %s, family %s, revision %u unregistered.\n", + type->name, family_name(type->family), type->revision); +unlock: + ip_set_type_unlock(); + + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(ip_set_type_unregister); + +/* Utility functions */ +void * +ip_set_alloc(size_t size) +{ + void *members = NULL; + + if (size < KMALLOC_MAX_SIZE) + members = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); + + if (members) { + pr_debug("%p: allocated with kmalloc\n", members); + return members; + } + + members = vzalloc(size); + if (!members) + return NULL; + pr_debug("%p: allocated with vmalloc\n", members); + + return members; +} +EXPORT_SYMBOL_GPL(ip_set_alloc); + +void +ip_set_free(void *members) +{ + pr_debug("%p: free with %s\n", members, + is_vmalloc_addr(members) ? "vfree" : "kfree"); + if (is_vmalloc_addr(members)) + vfree(members); + else + kfree(members); +} +EXPORT_SYMBOL_GPL(ip_set_free); + +static inline bool +flag_nested(const struct nlattr *nla) +{ + return nla->nla_type & NLA_F_NESTED; +} + +static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = { + [IPSET_ATTR_IPADDR_IPV4] = { .type = NLA_U32 }, + [IPSET_ATTR_IPADDR_IPV6] = { .type = NLA_BINARY, + .len = sizeof(struct in6_addr) }, +}; + +int +ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr) +{ + struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1]; + + if (unlikely(!flag_nested(nla))) + return -IPSET_ERR_PROTOCOL; + if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy)) + return -IPSET_ERR_PROTOCOL; + if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV4))) + return -IPSET_ERR_PROTOCOL; + + *ipaddr = nla_get_be32(tb[IPSET_ATTR_IPADDR_IPV4]); + return 0; +} +EXPORT_SYMBOL_GPL(ip_set_get_ipaddr4); + +int +ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) +{ + struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1]; + + if (unlikely(!flag_nested(nla))) + return -IPSET_ERR_PROTOCOL; + + if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy)) + return -IPSET_ERR_PROTOCOL; + if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV6))) + return -IPSET_ERR_PROTOCOL; + + memcpy(ipaddr, nla_data(tb[IPSET_ATTR_IPADDR_IPV6]), + sizeof(struct in6_addr)); + return 0; +} +EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); + +/* + * Creating/destroying/renaming/swapping affect the existence and + * the properties of a set. All of these can be executed from userspace + * only and serialized by the nfnl mutex indirectly from nfnetlink. + * + * Sets are identified by their index in ip_set_list and the index + * is used by the external references (set/SET netfilter modules). + * + * The set behind an index may change by swapping only, from userspace. + */ + +static inline void +__ip_set_get(ip_set_id_t index) +{ + write_lock_bh(&ip_set_ref_lock); + ip_set_list[index]->ref++; + write_unlock_bh(&ip_set_ref_lock); +} + +static inline void +__ip_set_put(ip_set_id_t index) +{ + write_lock_bh(&ip_set_ref_lock); + BUG_ON(ip_set_list[index]->ref == 0); + ip_set_list[index]->ref--; + write_unlock_bh(&ip_set_ref_lock); +} + +/* + * Add, del and test set entries from kernel. 
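+ * These are the hooks used by the external set/SET netfilter modules via the exported symbols below.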
+ * + * The set behind the index must exist and must be referenced + * so it can't be destroyed (or changed) under our foot. + */ + +int +ip_set_test(ip_set_id_t index, const struct sk_buff *skb, + u8 family, u8 dim, u8 flags) +{ + struct ip_set *set = ip_set_list[index]; + int ret = 0; + + BUG_ON(set == NULL); + pr_debug("set %s, index %u\n", set->name, index); + + if (dim < set->type->dimension || + !(family == set->family || set->family == AF_UNSPEC)) + return 0; + + read_lock_bh(&set->lock); + ret = set->variant->kadt(set, skb, IPSET_TEST, family, dim, flags); + read_unlock_bh(&set->lock); + + if (ret == -EAGAIN) { + /* Type requests element to be completed */ + pr_debug("element must be competed, ADD is triggered\n"); + write_lock_bh(&set->lock); + set->variant->kadt(set, skb, IPSET_ADD, family, dim, flags); + write_unlock_bh(&set->lock); + ret = 1; + } + + /* Convert error codes to nomatch */ + return (ret < 0 ? 0 : ret); +} +EXPORT_SYMBOL_GPL(ip_set_test); + +int +ip_set_add(ip_set_id_t index, const struct sk_buff *skb, + u8 family, u8 dim, u8 flags) +{ + struct ip_set *set = ip_set_list[index]; + int ret; + + BUG_ON(set == NULL); + pr_debug("set %s, index %u\n", set->name, index); + + if (dim < set->type->dimension || + !(family == set->family || set->family == AF_UNSPEC)) + return 0; + + write_lock_bh(&set->lock); + ret = set->variant->kadt(set, skb, IPSET_ADD, family, dim, flags); + write_unlock_bh(&set->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(ip_set_add); + +int +ip_set_del(ip_set_id_t index, const struct sk_buff *skb, + u8 family, u8 dim, u8 flags) +{ + struct ip_set *set = ip_set_list[index]; + int ret = 0; + + BUG_ON(set == NULL); + pr_debug("set %s, index %u\n", set->name, index); + + if (dim < set->type->dimension || + !(family == set->family || set->family == AF_UNSPEC)) + return 0; + + write_lock_bh(&set->lock); + ret = set->variant->kadt(set, skb, IPSET_DEL, family, dim, flags); + write_unlock_bh(&set->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(ip_set_del); + +/* + * Find set by name, reference it once. The reference makes sure the + * thing pointed to, does not go away under our feet. + * + */ +ip_set_id_t +ip_set_get_byname(const char *name, struct ip_set **set) +{ + ip_set_id_t i, index = IPSET_INVALID_ID; + struct ip_set *s; + + for (i = 0; i < ip_set_max; i++) { + s = ip_set_list[i]; + if (s != NULL && STREQ(s->name, name)) { + __ip_set_get(i); + index = i; + *set = s; + } + } + + return index; +} +EXPORT_SYMBOL_GPL(ip_set_get_byname); + +/* + * If the given set pointer points to a valid set, decrement + * reference count by 1. The caller shall not assume the index + * to be valid, after calling this function. + * + */ +void +ip_set_put_byindex(ip_set_id_t index) +{ + if (ip_set_list[index] != NULL) + __ip_set_put(index); +} +EXPORT_SYMBOL_GPL(ip_set_put_byindex); + +/* + * Get the name of a set behind a set index. + * We assume the set is referenced, so it does exist and + * can't be destroyed. The set cannot be renamed due to + * the referencing either. + * + */ +const char * +ip_set_name_byindex(ip_set_id_t index) +{ + const struct ip_set *set = ip_set_list[index]; + + BUG_ON(set == NULL); + BUG_ON(set->ref == 0); + + /* Referenced, so it's safe */ + return set->name; +} +EXPORT_SYMBOL_GPL(ip_set_name_byindex); + +/* + * Routines to call by external subsystems, which do not + * call nfnl_lock for us. + */ + +/* + * Find set by name, reference it once. The reference makes sure the + * thing pointed to, does not go away under our feet. 
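+ * The reference taken here is dropped again with ip_set_nfnl_put().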
+ * + * The nfnl mutex is used in the function. + */ +ip_set_id_t +ip_set_nfnl_get(const char *name) +{ + struct ip_set *s; + ip_set_id_t index; + + nfnl_lock(); + index = ip_set_get_byname(name, &s); + nfnl_unlock(); + + return index; +} +EXPORT_SYMBOL_GPL(ip_set_nfnl_get); + +/* + * Find set by index, reference it once. The reference makes sure the + * thing pointed to, does not go away under our feet. + * + * The nfnl mutex is used in the function. + */ +ip_set_id_t +ip_set_nfnl_get_byindex(ip_set_id_t index) +{ + if (index > ip_set_max) + return IPSET_INVALID_ID; + + nfnl_lock(); + if (ip_set_list[index]) + __ip_set_get(index); + else + index = IPSET_INVALID_ID; + nfnl_unlock(); + + return index; +} +EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex); + +/* + * If the given set pointer points to a valid set, decrement + * reference count by 1. The caller shall not assume the index + * to be valid, after calling this function. + * + * The nfnl mutex is used in the function. + */ +void +ip_set_nfnl_put(ip_set_id_t index) +{ + nfnl_lock(); + ip_set_put_byindex(index); + nfnl_unlock(); +} +EXPORT_SYMBOL_GPL(ip_set_nfnl_put); + +/* + * Communication protocol with userspace over netlink. + * + * The commands are serialized by the nfnl mutex. + */ + +static inline bool +protocol_failed(const struct nlattr * const tb[]) +{ + return !tb[IPSET_ATTR_PROTOCOL] || + nla_get_u8(tb[IPSET_ATTR_PROTOCOL]) != IPSET_PROTOCOL; +} + +static inline u32 +flag_exist(const struct nlmsghdr *nlh) +{ + return nlh->nlmsg_flags & NLM_F_EXCL ? 0 : IPSET_FLAG_EXIST; +} + +static struct nlmsghdr * +start_msg(struct sk_buff *skb, u32 pid, u32 seq, unsigned int flags, + enum ipset_cmd cmd) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + + nlh = nlmsg_put(skb, pid, seq, cmd | (NFNL_SUBSYS_IPSET << 8), + sizeof(*nfmsg), flags); + if (nlh == NULL) + return NULL; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = AF_INET; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + return nlh; +} + +/* Create a set */ + +static const struct nla_policy ip_set_create_policy[IPSET_ATTR_CMD_MAX + 1] = { + [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, + [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1 }, + [IPSET_ATTR_TYPENAME] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1}, + [IPSET_ATTR_REVISION] = { .type = NLA_U8 }, + [IPSET_ATTR_FAMILY] = { .type = NLA_U8 }, + [IPSET_ATTR_DATA] = { .type = NLA_NESTED }, +}; + +static ip_set_id_t +find_set_id(const char *name) +{ + ip_set_id_t i, index = IPSET_INVALID_ID; + const struct ip_set *set; + + for (i = 0; index == IPSET_INVALID_ID && i < ip_set_max; i++) { + set = ip_set_list[i]; + if (set != NULL && STREQ(set->name, name)) + index = i; + } + return index; +} + +static inline struct ip_set * +find_set(const char *name) +{ + ip_set_id_t index = find_set_id(name); + + return index == IPSET_INVALID_ID ? 
NULL : ip_set_list[index]; +} + +static int +find_free_id(const char *name, ip_set_id_t *index, struct ip_set **set) +{ + ip_set_id_t i; + + *index = IPSET_INVALID_ID; + for (i = 0; i < ip_set_max; i++) { + if (ip_set_list[i] == NULL) { + if (*index == IPSET_INVALID_ID) + *index = i; + } else if (STREQ(name, ip_set_list[i]->name)) { + /* Name clash */ + *set = ip_set_list[i]; + return -EEXIST; + } + } + if (*index == IPSET_INVALID_ID) + /* No free slot remained */ + return -IPSET_ERR_MAX_SETS; + return 0; +} + +static int +ip_set_create(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + struct ip_set *set, *clash = NULL; + ip_set_id_t index = IPSET_INVALID_ID; + struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {}; + const char *name, *typename; + u8 family, revision; + u32 flags = flag_exist(nlh); + int ret = 0; + + if (unlikely(protocol_failed(attr) || + attr[IPSET_ATTR_SETNAME] == NULL || + attr[IPSET_ATTR_TYPENAME] == NULL || + attr[IPSET_ATTR_REVISION] == NULL || + attr[IPSET_ATTR_FAMILY] == NULL || + (attr[IPSET_ATTR_DATA] != NULL && + !flag_nested(attr[IPSET_ATTR_DATA])))) + return -IPSET_ERR_PROTOCOL; + + name = nla_data(attr[IPSET_ATTR_SETNAME]); + typename = nla_data(attr[IPSET_ATTR_TYPENAME]); + family = nla_get_u8(attr[IPSET_ATTR_FAMILY]); + revision = nla_get_u8(attr[IPSET_ATTR_REVISION]); + pr_debug("setname: %s, typename: %s, family: %s, revision: %u\n", + name, typename, family_name(family), revision); + + /* + * First, and without any locks, allocate and initialize + * a normal base set structure. + */ + set = kzalloc(sizeof(struct ip_set), GFP_KERNEL); + if (!set) + return -ENOMEM; + rwlock_init(&set->lock); + strlcpy(set->name, name, IPSET_MAXNAMELEN); + set->family = family; + + /* + * Next, check that we know the type, and take + * a reference on the type, to make sure it stays available + * while constructing our new set. + * + * After referencing the type, we try to create the type + * specific part of the set without holding any locks. + */ + ret = find_set_type_get(typename, family, revision, &(set->type)); + if (ret) + goto out; + + /* + * Without holding any locks, create private part. + */ + if (attr[IPSET_ATTR_DATA] && + nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA], + set->type->create_policy)) { + ret = -IPSET_ERR_PROTOCOL; + goto put_out; + } + + ret = set->type->create(set, tb, flags); + if (ret != 0) + goto put_out; + + /* BTW, ret==0 here. */ + + /* + * Here, we have a valid, constructed set and we are protected + * by the nfnl mutex. Find the first free index in ip_set_list + * and check clashing. + */ + if ((ret = find_free_id(set->name, &index, &clash)) != 0) { + /* If this is the same set and requested, ignore error */ + if (ret == -EEXIST && + (flags & IPSET_FLAG_EXIST) && + STREQ(set->type->name, clash->type->name) && + set->type->family == clash->type->family && + set->type->revision == clash->type->revision && + set->variant->same_set(set, clash)) + ret = 0; + goto cleanup; + } + + /* + * Finally! Add our shiny new set to the list, and be done. 
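+ * The plain pointer store is sufficient to publish it: ip_set_list slots only ever hold NULL or a fully constructed set.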
+ */ + pr_debug("create: '%s' created with index %u!\n", set->name, index); + ip_set_list[index] = set; + + return ret; + +cleanup: + set->variant->destroy(set); +put_out: + module_put(set->type->me); +out: + kfree(set); + return ret; +} + +/* Destroy sets */ + +static const struct nla_policy +ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = { + [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, + [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1 }, +}; + +static void +ip_set_destroy_set(ip_set_id_t index) +{ + struct ip_set *set = ip_set_list[index]; + + pr_debug("set: %s\n", set->name); + ip_set_list[index] = NULL; + + /* Must call it without holding any lock */ + set->variant->destroy(set); + module_put(set->type->me); + kfree(set); +} + +static int +ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + ip_set_id_t i; + int ret = 0; + + if (unlikely(protocol_failed(attr))) + return -IPSET_ERR_PROTOCOL; + + /* Commands are serialized and references are + * protected by the ip_set_ref_lock. + * External systems (i.e. xt_set) must call + * ip_set_put|get_nfnl_* functions, that way we + * can safely check references here. + * + * list:set timer can only decrement the reference + * counter, so if it's already zero, we can proceed + * without holding the lock. + */ + read_lock_bh(&ip_set_ref_lock); + if (!attr[IPSET_ATTR_SETNAME]) { + for (i = 0; i < ip_set_max; i++) { + if (ip_set_list[i] != NULL && ip_set_list[i]->ref) { + ret = -IPSET_ERR_BUSY; + goto out; + } + } + read_unlock_bh(&ip_set_ref_lock); + for (i = 0; i < ip_set_max; i++) { + if (ip_set_list[i] != NULL) + ip_set_destroy_set(i); + } + } else { + i = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME])); + if (i == IPSET_INVALID_ID) { + ret = -ENOENT; + goto out; + } else if (ip_set_list[i]->ref) { + ret = -IPSET_ERR_BUSY; + goto out; + } + read_unlock_bh(&ip_set_ref_lock); + + ip_set_destroy_set(i); + } + return 0; +out: + read_unlock_bh(&ip_set_ref_lock); + return ret; +} + +/* Flush sets */ + +static void +ip_set_flush_set(struct ip_set *set) +{ + pr_debug("set: %s\n", set->name); + + write_lock_bh(&set->lock); + set->variant->flush(set); + write_unlock_bh(&set->lock); +} + +static int +ip_set_flush(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + ip_set_id_t i; + + if (unlikely(protocol_failed(attr))) + return -IPSET_ERR_PROTOCOL; + + if (!attr[IPSET_ATTR_SETNAME]) { + for (i = 0; i < ip_set_max; i++) + if (ip_set_list[i] != NULL) + ip_set_flush_set(ip_set_list[i]); + } else { + i = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME])); + if (i == IPSET_INVALID_ID) + return -ENOENT; + + ip_set_flush_set(ip_set_list[i]); + } + + return 0; +} + +/* Rename a set */ + +static const struct nla_policy +ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = { + [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, + [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1 }, + [IPSET_ATTR_SETNAME2] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1 }, +}; + +static int +ip_set_rename(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + struct ip_set *set; + const char *name2; + ip_set_id_t i; + int ret = 0; + + if (unlikely(protocol_failed(attr) || + attr[IPSET_ATTR_SETNAME] == NULL || + attr[IPSET_ATTR_SETNAME2] == NULL)) + return -IPSET_ERR_PROTOCOL; + + set = find_set(nla_data(attr[IPSET_ATTR_SETNAME])); + if (set 
== NULL) + return -ENOENT; + + read_lock_bh(&ip_set_ref_lock); + if (set->ref != 0) { + ret = -IPSET_ERR_REFERENCED; + goto out; + } + + name2 = nla_data(attr[IPSET_ATTR_SETNAME2]); + for (i = 0; i < ip_set_max; i++) { + if (ip_set_list[i] != NULL && + STREQ(ip_set_list[i]->name, name2)) { + ret = -IPSET_ERR_EXIST_SETNAME2; + goto out; + } + } + strncpy(set->name, name2, IPSET_MAXNAMELEN); + +out: + read_unlock_bh(&ip_set_ref_lock); + return ret; +} + +/* Swap two sets so that name/index points to the other. + * References and set names are also swapped. + * + * The commands are serialized by the nfnl mutex and references are + * protected by the ip_set_ref_lock. The kernel interfaces + * do not hold the mutex but the pointer settings are atomic + * so the ip_set_list always contains valid pointers to the sets. + */ + +static int +ip_set_swap(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + struct ip_set *from, *to; + ip_set_id_t from_id, to_id; + char from_name[IPSET_MAXNAMELEN]; + + if (unlikely(protocol_failed(attr) || + attr[IPSET_ATTR_SETNAME] == NULL || + attr[IPSET_ATTR_SETNAME2] == NULL)) + return -IPSET_ERR_PROTOCOL; + + from_id = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME])); + if (from_id == IPSET_INVALID_ID) + return -ENOENT; + + to_id = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME2])); + if (to_id == IPSET_INVALID_ID) + return -IPSET_ERR_EXIST_SETNAME2; + + from = ip_set_list[from_id]; + to = ip_set_list[to_id]; + + /* Features must not change. + * Not an artificial restriction anymore, as we must prevent + * possible loops created by swapping in setlist type of sets. */ + if (!(from->type->features == to->type->features && + from->type->family == to->type->family)) + return -IPSET_ERR_TYPE_MISMATCH; + + strncpy(from_name, from->name, IPSET_MAXNAMELEN); + strncpy(from->name, to->name, IPSET_MAXNAMELEN); + strncpy(to->name, from_name, IPSET_MAXNAMELEN); + + write_lock_bh(&ip_set_ref_lock); + swap(from->ref, to->ref); + ip_set_list[from_id] = to; + ip_set_list[to_id] = from; + write_unlock_bh(&ip_set_ref_lock); + + return 0; +} + +/* List/save set data */ + +#define DUMP_INIT 0L +#define DUMP_ALL 1L +#define DUMP_ONE 2L +#define DUMP_LAST 3L + +static int +ip_set_dump_done(struct netlink_callback *cb) +{ + if (cb->args[2]) { + pr_debug("release set %s\n", ip_set_list[cb->args[1]]->name); + ip_set_put_byindex((ip_set_id_t) cb->args[1]); + } + return 0; +} + +static inline void +dump_attrs(struct nlmsghdr *nlh) +{ + const struct nlattr *attr; + int rem; + + pr_debug("dump nlmsg\n"); + nlmsg_for_each_attr(attr, nlh, sizeof(struct nfgenmsg), rem) { + pr_debug("type: %u, len %u\n", nla_type(attr), attr->nla_len); + } +} + +static int +dump_init(struct netlink_callback *cb) +{ + struct nlmsghdr *nlh = nlmsg_hdr(cb->skb); + int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg)); + struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; + struct nlattr *attr = (void *)nlh + min_len; + ip_set_id_t index; + + /* Second pass, so parser can't fail */ + nla_parse(cda, IPSET_ATTR_CMD_MAX, + attr, nlh->nlmsg_len - min_len, ip_set_setname_policy); + + /* cb->args[0] : dump single set/all sets + * [1] : set index + * [..]: type specific + */ + + if (!cda[IPSET_ATTR_SETNAME]) { + cb->args[0] = DUMP_ALL; + return 0; + } + + index = find_set_id(nla_data(cda[IPSET_ATTR_SETNAME])); + if (index == IPSET_INVALID_ID) + return -ENOENT; + + cb->args[0] = DUMP_ONE; + cb->args[1] = index; + return 0; +} + +static int +ip_set_dump_start(struct sk_buff *skb, 
struct netlink_callback *cb) +{ + ip_set_id_t index = IPSET_INVALID_ID, max; + struct ip_set *set = NULL; + struct nlmsghdr *nlh = NULL; + unsigned int flags = NETLINK_CB(cb->skb).pid ? NLM_F_MULTI : 0; + int ret = 0; + + if (cb->args[0] == DUMP_INIT) { + ret = dump_init(cb); + if (ret < 0) { + nlh = nlmsg_hdr(cb->skb); + /* We have to create and send the error message + * manually :-( */ + if (nlh->nlmsg_flags & NLM_F_ACK) + netlink_ack(cb->skb, nlh, ret); + return ret; + } + } + + if (cb->args[1] >= ip_set_max) + goto out; + + max = cb->args[0] == DUMP_ONE ? cb->args[1] + 1 : ip_set_max; +dump_last: + pr_debug("args[0]: %ld args[1]: %ld\n", cb->args[0], cb->args[1]); + for (; cb->args[1] < max; cb->args[1]++) { + index = (ip_set_id_t) cb->args[1]; + set = ip_set_list[index]; + if (set == NULL) { + if (cb->args[0] == DUMP_ONE) { + ret = -ENOENT; + goto out; + } + continue; + } + /* When dumping all sets, we must dump "sorted" + * so that lists (unions of sets) are dumped last. + */ + if (cb->args[0] != DUMP_ONE && + ((cb->args[0] == DUMP_ALL) == + !!(set->type->features & IPSET_DUMP_LAST))) + continue; + pr_debug("List set: %s\n", set->name); + if (!cb->args[2]) { + /* Start listing: make sure set won't be destroyed */ + pr_debug("reference set\n"); + __ip_set_get(index); + } + nlh = start_msg(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, flags, + IPSET_CMD_LIST); + if (!nlh) { + ret = -EMSGSIZE; + goto release_refcount; + } + NLA_PUT_U8(skb, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL); + NLA_PUT_STRING(skb, IPSET_ATTR_SETNAME, set->name); + switch (cb->args[2]) { + case 0: + /* Core header data */ + NLA_PUT_STRING(skb, IPSET_ATTR_TYPENAME, + set->type->name); + NLA_PUT_U8(skb, IPSET_ATTR_FAMILY, + set->family); + NLA_PUT_U8(skb, IPSET_ATTR_REVISION, + set->type->revision); + ret = set->variant->head(set, skb); + if (ret < 0) + goto release_refcount; + /* Fall through and add elements */ + default: + read_lock_bh(&set->lock); + ret = set->variant->list(set, skb, cb); + read_unlock_bh(&set->lock); + if (!cb->args[2]) { + /* Set is done, proceed with next one */ + if (cb->args[0] == DUMP_ONE) + cb->args[1] = IPSET_INVALID_ID; + else + cb->args[1]++; + } + goto release_refcount; + } + } + /* If we dump all sets, continue with dumping last ones */ + if (cb->args[0] == DUMP_ALL) { + cb->args[0] = DUMP_LAST; + cb->args[1] = 0; + goto dump_last; + } + goto out; + +nla_put_failure: + ret = -EFAULT; +release_refcount: + /* If there was an error or set is done, release set */ + if (ret || !cb->args[2]) { + pr_debug("release set %s\n", ip_set_list[index]->name); + ip_set_put_byindex(index); + } +out: + if (nlh) { + nlmsg_end(skb, nlh); + pr_debug("nlmsg_len: %u\n", nlh->nlmsg_len); + dump_attrs(nlh); + } + + return ret < 0 ? 
ret : skb->len; +} + +static int +ip_set_dump(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + if (unlikely(protocol_failed(attr))) + return -IPSET_ERR_PROTOCOL; + + return netlink_dump_start(ctnl, skb, nlh, + ip_set_dump_start, + ip_set_dump_done); +} + +/* Add, del and test */ + +static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = { + [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, + [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_DATA] = { .type = NLA_NESTED }, + [IPSET_ATTR_ADT] = { .type = NLA_NESTED }, +}; + +static int +call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, + struct nlattr *tb[], enum ipset_adt adt, + u32 flags, bool use_lineno) +{ + int ret, retried = 0; + u32 lineno = 0; + bool eexist = flags & IPSET_FLAG_EXIST; + + do { + write_lock_bh(&set->lock); + ret = set->variant->uadt(set, tb, adt, &lineno, flags); + write_unlock_bh(&set->lock); + } while (ret == -EAGAIN && + set->variant->resize && + (ret = set->variant->resize(set, retried++)) == 0); + + if (!ret || (ret == -IPSET_ERR_EXIST && eexist)) + return 0; + if (lineno && use_lineno) { + /* Error in restore/batch mode: send back lineno */ + struct nlmsghdr *rep, *nlh = nlmsg_hdr(skb); + struct sk_buff *skb2; + struct nlmsgerr *errmsg; + size_t payload = sizeof(*errmsg) + nlmsg_len(nlh); + int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg)); + struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; + struct nlattr *cmdattr; + u32 *errline; + + skb2 = nlmsg_new(payload, GFP_KERNEL); + if (skb2 == NULL) + return -ENOMEM; + rep = __nlmsg_put(skb2, NETLINK_CB(skb).pid, + nlh->nlmsg_seq, NLMSG_ERROR, payload, 0); + errmsg = nlmsg_data(rep); + errmsg->error = ret; + memcpy(&errmsg->msg, nlh, nlh->nlmsg_len); + cmdattr = (void *)&errmsg->msg + min_len; + + nla_parse(cda, IPSET_ATTR_CMD_MAX, + cmdattr, nlh->nlmsg_len - min_len, + ip_set_adt_policy); + + errline = nla_data(cda[IPSET_ATTR_LINENO]); + + *errline = lineno; + + netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + /* Signal netlink not to send its ACK/errmsg. 
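+ * The error reply carrying the failing line number was already delivered by the netlink_unicast() call above.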
*/ + return -EINTR; + } + + return ret; +} + +static int +ip_set_uadd(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + struct ip_set *set; + struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; + const struct nlattr *nla; + u32 flags = flag_exist(nlh); + bool use_lineno; + int ret = 0; + + if (unlikely(protocol_failed(attr) || + attr[IPSET_ATTR_SETNAME] == NULL || + !((attr[IPSET_ATTR_DATA] != NULL) ^ + (attr[IPSET_ATTR_ADT] != NULL)) || + (attr[IPSET_ATTR_DATA] != NULL && + !flag_nested(attr[IPSET_ATTR_DATA])) || + (attr[IPSET_ATTR_ADT] != NULL && + (!flag_nested(attr[IPSET_ATTR_ADT]) || + attr[IPSET_ATTR_LINENO] == NULL)))) + return -IPSET_ERR_PROTOCOL; + + set = find_set(nla_data(attr[IPSET_ATTR_SETNAME])); + if (set == NULL) + return -ENOENT; + + use_lineno = !!attr[IPSET_ATTR_LINENO]; + if (attr[IPSET_ATTR_DATA]) { + if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, + attr[IPSET_ATTR_DATA], + set->type->adt_policy)) + return -IPSET_ERR_PROTOCOL; + ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags, + use_lineno); + } else { + int nla_rem; + + nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) { + memset(tb, 0, sizeof(tb)); + if (nla_type(nla) != IPSET_ATTR_DATA || + !flag_nested(nla) || + nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla, + set->type->adt_policy)) + return -IPSET_ERR_PROTOCOL; + ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, + flags, use_lineno); + if (ret < 0) + return ret; + } + } + return ret; +} + +static int +ip_set_udel(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + struct ip_set *set; + struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; + const struct nlattr *nla; + u32 flags = flag_exist(nlh); + bool use_lineno; + int ret = 0; + + if (unlikely(protocol_failed(attr) || + attr[IPSET_ATTR_SETNAME] == NULL || + !((attr[IPSET_ATTR_DATA] != NULL) ^ + (attr[IPSET_ATTR_ADT] != NULL)) || + (attr[IPSET_ATTR_DATA] != NULL && + !flag_nested(attr[IPSET_ATTR_DATA])) || + (attr[IPSET_ATTR_ADT] != NULL && + (!flag_nested(attr[IPSET_ATTR_ADT]) || + attr[IPSET_ATTR_LINENO] == NULL)))) + return -IPSET_ERR_PROTOCOL; + + set = find_set(nla_data(attr[IPSET_ATTR_SETNAME])); + if (set == NULL) + return -ENOENT; + + use_lineno = !!attr[IPSET_ATTR_LINENO]; + if (attr[IPSET_ATTR_DATA]) { + if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, + attr[IPSET_ATTR_DATA], + set->type->adt_policy)) + return -IPSET_ERR_PROTOCOL; + ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, flags, + use_lineno); + } else { + int nla_rem; + + nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) { + memset(tb, 0, sizeof(*tb)); + if (nla_type(nla) != IPSET_ATTR_DATA || + !flag_nested(nla) || + nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla, + set->type->adt_policy)) + return -IPSET_ERR_PROTOCOL; + ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, + flags, use_lineno); + if (ret < 0) + return ret; + } + } + return ret; +} + +static int +ip_set_utest(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + struct ip_set *set; + struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; + int ret = 0; + + if (unlikely(protocol_failed(attr) || + attr[IPSET_ATTR_SETNAME] == NULL || + attr[IPSET_ATTR_DATA] == NULL || + !flag_nested(attr[IPSET_ATTR_DATA]))) + return -IPSET_ERR_PROTOCOL; + + set = find_set(nla_data(attr[IPSET_ATTR_SETNAME])); + if (set == NULL) + return -ENOENT; + + if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], + set->type->adt_policy)) + 
return -IPSET_ERR_PROTOCOL; + + read_lock_bh(&set->lock); + ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0); + read_unlock_bh(&set->lock); + /* Userspace can't trigger element to be re-added */ + if (ret == -EAGAIN) + ret = 1; + + return ret < 0 ? ret : ret > 0 ? 0 : -IPSET_ERR_EXIST; +} + +/* Get headed data of a set */ + +static int +ip_set_header(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + const struct ip_set *set; + struct sk_buff *skb2; + struct nlmsghdr *nlh2; + ip_set_id_t index; + int ret = 0; + + if (unlikely(protocol_failed(attr) || + attr[IPSET_ATTR_SETNAME] == NULL)) + return -IPSET_ERR_PROTOCOL; + + index = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME])); + if (index == IPSET_INVALID_ID) + return -ENOENT; + set = ip_set_list[index]; + + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) + return -ENOMEM; + + nlh2 = start_msg(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 0, + IPSET_CMD_HEADER); + if (!nlh2) + goto nlmsg_failure; + NLA_PUT_U8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL); + NLA_PUT_STRING(skb2, IPSET_ATTR_SETNAME, set->name); + NLA_PUT_STRING(skb2, IPSET_ATTR_TYPENAME, set->type->name); + NLA_PUT_U8(skb2, IPSET_ATTR_FAMILY, set->family); + NLA_PUT_U8(skb2, IPSET_ATTR_REVISION, set->type->revision); + nlmsg_end(skb2, nlh2); + + ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + if (ret < 0) + return ret; + + return 0; + +nla_put_failure: + nlmsg_cancel(skb2, nlh2); +nlmsg_failure: + kfree_skb(skb2); + return -EMSGSIZE; +} + +/* Get type data */ + +static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = { + [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, + [IPSET_ATTR_TYPENAME] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1 }, + [IPSET_ATTR_FAMILY] = { .type = NLA_U8 }, +}; + +static int +ip_set_type(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + struct sk_buff *skb2; + struct nlmsghdr *nlh2; + u8 family, min, max; + const char *typename; + int ret = 0; + + if (unlikely(protocol_failed(attr) || + attr[IPSET_ATTR_TYPENAME] == NULL || + attr[IPSET_ATTR_FAMILY] == NULL)) + return -IPSET_ERR_PROTOCOL; + + family = nla_get_u8(attr[IPSET_ATTR_FAMILY]); + typename = nla_data(attr[IPSET_ATTR_TYPENAME]); + ret = find_set_type_minmax(typename, family, &min, &max); + if (ret) + return ret; + + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) + return -ENOMEM; + + nlh2 = start_msg(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 0, + IPSET_CMD_TYPE); + if (!nlh2) + goto nlmsg_failure; + NLA_PUT_U8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL); + NLA_PUT_STRING(skb2, IPSET_ATTR_TYPENAME, typename); + NLA_PUT_U8(skb2, IPSET_ATTR_FAMILY, family); + NLA_PUT_U8(skb2, IPSET_ATTR_REVISION, max); + NLA_PUT_U8(skb2, IPSET_ATTR_REVISION_MIN, min); + nlmsg_end(skb2, nlh2); + + pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len); + ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + if (ret < 0) + return ret; + + return 0; + +nla_put_failure: + nlmsg_cancel(skb2, nlh2); +nlmsg_failure: + kfree_skb(skb2); + return -EMSGSIZE; +} + +/* Get protocol version */ + +static const struct nla_policy +ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = { + [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, +}; + +static int +ip_set_protocol(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + struct sk_buff *skb2; 
+ struct nlmsghdr *nlh2; + int ret = 0; + + if (unlikely(attr[IPSET_ATTR_PROTOCOL] == NULL)) + return -IPSET_ERR_PROTOCOL; + + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) + return -ENOMEM; + + nlh2 = start_msg(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 0, + IPSET_CMD_PROTOCOL); + if (!nlh2) + goto nlmsg_failure; + NLA_PUT_U8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL); + nlmsg_end(skb2, nlh2); + + ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + if (ret < 0) + return ret; + + return 0; + +nla_put_failure: + nlmsg_cancel(skb2, nlh2); +nlmsg_failure: + kfree_skb(skb2); + return -EMSGSIZE; +} + +static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = { + [IPSET_CMD_CREATE] = { + .call = ip_set_create, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_create_policy, + }, + [IPSET_CMD_DESTROY] = { + .call = ip_set_destroy, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_setname_policy, + }, + [IPSET_CMD_FLUSH] = { + .call = ip_set_flush, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_setname_policy, + }, + [IPSET_CMD_RENAME] = { + .call = ip_set_rename, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_setname2_policy, + }, + [IPSET_CMD_SWAP] = { + .call = ip_set_swap, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_setname2_policy, + }, + [IPSET_CMD_LIST] = { + .call = ip_set_dump, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_setname_policy, + }, + [IPSET_CMD_SAVE] = { + .call = ip_set_dump, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_setname_policy, + }, + [IPSET_CMD_ADD] = { + .call = ip_set_uadd, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_adt_policy, + }, + [IPSET_CMD_DEL] = { + .call = ip_set_udel, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_adt_policy, + }, + [IPSET_CMD_TEST] = { + .call = ip_set_utest, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_adt_policy, + }, + [IPSET_CMD_HEADER] = { + .call = ip_set_header, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_setname_policy, + }, + [IPSET_CMD_TYPE] = { + .call = ip_set_type, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_type_policy, + }, + [IPSET_CMD_PROTOCOL] = { + .call = ip_set_protocol, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_protocol_policy, + }, +}; + +static struct nfnetlink_subsystem ip_set_netlink_subsys __read_mostly = { + .name = "ip_set", + .subsys_id = NFNL_SUBSYS_IPSET, + .cb_count = IPSET_MSG_MAX, + .cb = ip_set_netlink_subsys_cb, +}; + +/* Interface to iptables/ip6tables */ + +static int +ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) +{ + unsigned *op; + void *data; + int copylen = *len, ret = 0; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (optval != SO_IP_SET) + return -EBADF; + if (*len < sizeof(unsigned)) + return -EINVAL; + + data = vmalloc(*len); + if (!data) + return -ENOMEM; + if (copy_from_user(data, user, *len) != 0) { + ret = -EFAULT; + goto done; + } + op = (unsigned *) data; + + if (*op < IP_SET_OP_VERSION) { + /* Check the version at the beginning of operations */ + struct ip_set_req_version *req_version = data; + if (req_version->version != IPSET_PROTOCOL) { + ret = -EPROTO; + goto done; + } + } + + switch (*op) { + case IP_SET_OP_VERSION: { + struct ip_set_req_version *req_version = data; + + if (*len != sizeof(struct ip_set_req_version)) { + ret = -EINVAL; + goto done; + } + + req_version->version = IPSET_PROTOCOL; + ret = copy_to_user(user, req_version, + sizeof(struct 
ip_set_req_version)); + goto done; + } + case IP_SET_OP_GET_BYNAME: { + struct ip_set_req_get_set *req_get = data; + + if (*len != sizeof(struct ip_set_req_get_set)) { + ret = -EINVAL; + goto done; + } + req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0'; + nfnl_lock(); + req_get->set.index = find_set_id(req_get->set.name); + nfnl_unlock(); + goto copy; + } + case IP_SET_OP_GET_BYINDEX: { + struct ip_set_req_get_set *req_get = data; + + if (*len != sizeof(struct ip_set_req_get_set) || + req_get->set.index >= ip_set_max) { + ret = -EINVAL; + goto done; + } + nfnl_lock(); + strncpy(req_get->set.name, + ip_set_list[req_get->set.index] + ? ip_set_list[req_get->set.index]->name : "", + IPSET_MAXNAMELEN); + nfnl_unlock(); + goto copy; + } + default: + ret = -EBADMSG; + goto done; + } /* end of switch(op) */ + +copy: + ret = copy_to_user(user, data, copylen); + +done: + vfree(data); + if (ret > 0) + ret = 0; + return ret; +} + +static struct nf_sockopt_ops so_set __read_mostly = { + .pf = PF_INET, + .get_optmin = SO_IP_SET, + .get_optmax = SO_IP_SET + 1, + .get = &ip_set_sockfn_get, + .owner = THIS_MODULE, +}; + +static int __init +ip_set_init(void) +{ + int ret; + + if (max_sets) + ip_set_max = max_sets; + if (ip_set_max >= IPSET_INVALID_ID) + ip_set_max = IPSET_INVALID_ID - 1; + + ip_set_list = kzalloc(sizeof(struct ip_set *) * ip_set_max, + GFP_KERNEL); + if (!ip_set_list) { + pr_err("ip_set: Unable to create ip_set_list\n"); + return -ENOMEM; + } + + ret = nfnetlink_subsys_register(&ip_set_netlink_subsys); + if (ret != 0) { + pr_err("ip_set: cannot register with nfnetlink.\n"); + kfree(ip_set_list); + return ret; + } + ret = nf_register_sockopt(&so_set); + if (ret != 0) { + pr_err("SO_SET registry failed: %d\n", ret); + nfnetlink_subsys_unregister(&ip_set_netlink_subsys); + kfree(ip_set_list); + return ret; + } + + pr_notice("ip_set: protocol %u\n", IPSET_PROTOCOL); + return 0; +} + +static void __exit +ip_set_fini(void) +{ + /* There can't be any existing set */ + nf_unregister_sockopt(&so_set); + nfnetlink_subsys_unregister(&ip_set_netlink_subsys); + kfree(ip_set_list); + pr_debug("these are the famous last words\n"); +} + +module_init(ip_set_init); +module_exit(ip_set_fini); diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c new file mode 100644 index 00000000..757143b2 --- /dev/null +++ b/net/netfilter/ipset/ip_set_getport.c @@ -0,0 +1,155 @@ +/* Copyright (C) 2003-2011 Jozsef Kadlecsik + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Get Layer-4 data from the packets */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* We must handle non-linear skbs */ +static bool +get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, + bool src, __be16 *port, u8 *proto) +{ + switch (protocol) { + case IPPROTO_TCP: { + struct tcphdr _tcph; + const struct tcphdr *th; + + th = skb_header_pointer(skb, protooff, sizeof(_tcph), &_tcph); + if (th == NULL) + /* No choice either */ + return false; + + *port = src ? th->source : th->dest; + break; + } + case IPPROTO_SCTP: { + sctp_sctphdr_t _sh; + const sctp_sctphdr_t *sh; + + sh = skb_header_pointer(skb, protooff, sizeof(_sh), &_sh); + if (sh == NULL) + /* No choice either */ + return false; + + *port = src ? 
sh->source : sh->dest; + break; + } + case IPPROTO_UDP: + case IPPROTO_UDPLITE: { + struct udphdr _udph; + const struct udphdr *uh; + + uh = skb_header_pointer(skb, protooff, sizeof(_udph), &_udph); + if (uh == NULL) + /* No choice either */ + return false; + + *port = src ? uh->source : uh->dest; + break; + } + case IPPROTO_ICMP: { + struct icmphdr _ich; + const struct icmphdr *ic; + + ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich); + if (ic == NULL) + return false; + + *port = (__force __be16)htons((ic->type << 8) | ic->code); + break; + } + case IPPROTO_ICMPV6: { + struct icmp6hdr _ich; + const struct icmp6hdr *ic; + + ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich); + if (ic == NULL) + return false; + + *port = (__force __be16) + htons((ic->icmp6_type << 8) | ic->icmp6_code); + break; + } + default: + break; + } + *proto = protocol; + + return true; +} + +bool +ip_set_get_ip4_port(const struct sk_buff *skb, bool src, + __be16 *port, u8 *proto) +{ + const struct iphdr *iph = ip_hdr(skb); + unsigned int protooff = ip_hdrlen(skb); + int protocol = iph->protocol; + + /* See comments at tcp_match in ip_tables.c */ + if (protocol <= 0 || (ntohs(iph->frag_off) & IP_OFFSET)) + return false; + + return get_port(skb, protocol, protooff, src, port, proto); +} +EXPORT_SYMBOL_GPL(ip_set_get_ip4_port); + +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +bool +ip_set_get_ip6_port(const struct sk_buff *skb, bool src, + __be16 *port, u8 *proto) +{ + int protoff; + u8 nexthdr; + + nexthdr = ipv6_hdr(skb)->nexthdr; + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr); + if (protoff < 0) + return false; + + return get_port(skb, nexthdr, protoff, src, port, proto); +} +EXPORT_SYMBOL_GPL(ip_set_get_ip6_port); +#endif + +bool +ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port) +{ + bool ret; + u8 proto; + + switch (pf) { + case AF_INET: + ret = ip_set_get_ip4_port(skb, src, port, &proto); + break; + case AF_INET6: + ret = ip_set_get_ip6_port(skb, src, port, &proto); + break; + default: + return false; + } + if (!ret) + return ret; + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + return true; + default: + return false; + } +} +EXPORT_SYMBOL_GPL(ip_set_get_ip_port); diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c new file mode 100644 index 00000000..43bcce20 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -0,0 +1,464 @@ +/* Copyright (C) 2003-2011 Jozsef Kadlecsik + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:ip type */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_DESCRIPTION("hash:ip type of IP sets"); +MODULE_ALIAS("ip_set_hash:ip"); + +/* Type specific function prefix */ +#define TYPE hash_ip + +static bool +hash_ip_same_set(const struct ip_set *a, const struct ip_set *b); + +#define hash_ip4_same_set hash_ip_same_set +#define hash_ip6_same_set hash_ip_same_set + +/* The type variant functions: IPv4 */ + +/* Member elements without timeout */ +struct hash_ip4_elem { + __be32 ip; +}; + +/* Member elements with timeout support */ +struct hash_ip4_telem { + __be32 ip; + unsigned long timeout; +}; + +static inline bool +hash_ip4_data_equal(const struct hash_ip4_elem *ip1, + const struct hash_ip4_elem *ip2) +{ + return ip1->ip == ip2->ip; +} + +static inline bool +hash_ip4_data_isnull(const struct hash_ip4_elem *elem) +{ + return elem->ip == 0; +} + +static inline void +hash_ip4_data_copy(struct hash_ip4_elem *dst, const struct hash_ip4_elem *src) +{ + dst->ip = src->ip; +} + +/* Zero valued IP addresses cannot be stored */ +static inline void +hash_ip4_data_zero_out(struct hash_ip4_elem *elem) +{ + elem->ip = 0; +} + +static inline bool +hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *data) +{ + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_ip4_data_tlist(struct sk_buff *skb, const struct hash_ip4_elem *data) +{ + const struct hash_ip4_telem *tdata = + (const struct hash_ip4_telem *)data; + + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(tdata->timeout))); + + return 0; + +nla_put_failure: + return 1; +} + +#define IP_SET_HASH_WITH_NETMASK +#define PF 4 +#define HOST_MASK 32 +#include + +static int +hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb, + enum ipset_adt adt, u8 pf, u8 dim, u8 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + __be32 ip; + + ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &ip); + ip &= ip_set_netmask(h->netmask); + if (ip == 0) + return -EINVAL; + + return adtfn(set, &ip, h->timeout); +} + +static int +hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + u32 ip, ip_to, hosts, timeout = h->timeout; + __be32 nip; + int ret = 0; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ip &= ip_set_hostmask(h->netmask); + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + if (adt == IPSET_TEST) { + nip = htonl(ip); + if (nip == 0) + return -IPSET_ERR_HASH_ELEM; + return adtfn(set, &nip, timeout); + } + + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip > ip_to) + swap(ip, ip_to); + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr > 32) 
+ return -IPSET_ERR_INVALID_CIDR; + ip &= ip_set_hostmask(cidr); + ip_to = ip | ~ip_set_hostmask(cidr); + } else + ip_to = ip; + + hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1); + + for (; !before(ip_to, ip); ip += hosts) { + nip = htonl(ip); + if (nip == 0) + return -IPSET_ERR_HASH_ELEM; + ret = adtfn(set, &nip, timeout); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +static bool +hash_ip_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct ip_set_hash *x = a->data; + const struct ip_set_hash *y = b->data; + + /* Resizing changes htable_bits, so we ignore it */ + return x->maxelem == y->maxelem && + x->timeout == y->timeout && + x->netmask == y->netmask; +} + +/* The type variant functions: IPv6 */ + +struct hash_ip6_elem { + union nf_inet_addr ip; +}; + +struct hash_ip6_telem { + union nf_inet_addr ip; + unsigned long timeout; +}; + +static inline bool +hash_ip6_data_equal(const struct hash_ip6_elem *ip1, + const struct hash_ip6_elem *ip2) +{ + return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0; +} + +static inline bool +hash_ip6_data_isnull(const struct hash_ip6_elem *elem) +{ + return ipv6_addr_any(&elem->ip.in6); +} + +static inline void +hash_ip6_data_copy(struct hash_ip6_elem *dst, const struct hash_ip6_elem *src) +{ + ipv6_addr_copy(&dst->ip.in6, &src->ip.in6); +} + +static inline void +hash_ip6_data_zero_out(struct hash_ip6_elem *elem) +{ + ipv6_addr_set(&elem->ip.in6, 0, 0, 0, 0); +} + +static inline void +ip6_netmask(union nf_inet_addr *ip, u8 prefix) +{ + ip->ip6[0] &= ip_set_netmask6(prefix)[0]; + ip->ip6[1] &= ip_set_netmask6(prefix)[1]; + ip->ip6[2] &= ip_set_netmask6(prefix)[2]; + ip->ip6[3] &= ip_set_netmask6(prefix)[3]; +} + +static bool +hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *data) +{ + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_ip6_data_tlist(struct sk_buff *skb, const struct hash_ip6_elem *data) +{ + const struct hash_ip6_telem *e = + (const struct hash_ip6_telem *)data; + + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(e->timeout))); + return 0; + +nla_put_failure: + return 1; +} + +#undef PF +#undef HOST_MASK + +#define PF 6 +#define HOST_MASK 128 +#include + +static int +hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb, + enum ipset_adt adt, u8 pf, u8 dim, u8 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + union nf_inet_addr ip; + + ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &ip.in6); + ip6_netmask(&ip, h->netmask); + if (ipv6_addr_any(&ip.in6)) + return -EINVAL; + + return adtfn(set, &ip, h->timeout); +} + +static const struct nla_policy hash_ip6_adt_policy[IPSET_ATTR_ADT_MAX + 1] = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, +}; + +static int +hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + union nf_inet_addr ip; + u32 timeout = h->timeout; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + tb[IPSET_ATTR_IP_TO] || + tb[IPSET_ATTR_CIDR])) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = 
ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ip6_netmask(&ip, h->netmask); + if (ipv6_addr_any(&ip.in6)) + return -IPSET_ERR_HASH_ELEM; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + ret = adtfn(set, &ip, timeout); + + return ip_set_eexist(ret, flags) ? 0 : ret; +} + +/* Create hash:ip type of sets */ + +static int +hash_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags) +{ + u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; + u8 netmask, hbits; + struct ip_set_hash *h; + + if (!(set->family == AF_INET || set->family == AF_INET6)) + return -IPSET_ERR_INVALID_FAMILY; + netmask = set->family == AF_INET ? 32 : 128; + pr_debug("Create set %s with family %s\n", + set->name, set->family == AF_INET ? "inet" : "inet6"); + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_HASHSIZE]) { + hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); + if (hashsize < IPSET_MIMINAL_HASHSIZE) + hashsize = IPSET_MIMINAL_HASHSIZE; + } + + if (tb[IPSET_ATTR_MAXELEM]) + maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); + + if (tb[IPSET_ATTR_NETMASK]) { + netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); + + if ((set->family == AF_INET && netmask > 32) || + (set->family == AF_INET6 && netmask > 128) || + netmask == 0) + return -IPSET_ERR_INVALID_NETMASK; + } + + h = kzalloc(sizeof(*h), GFP_KERNEL); + if (!h) + return -ENOMEM; + + h->maxelem = maxelem; + h->netmask = netmask; + get_random_bytes(&h->initval, sizeof(h->initval)); + h->timeout = IPSET_NO_TIMEOUT; + + hbits = htable_bits(hashsize); + h->table = ip_set_alloc( + sizeof(struct htable) + + jhash_size(hbits) * sizeof(struct hbucket)); + if (!h->table) { + kfree(h); + return -ENOMEM; + } + h->table->htable_bits = hbits; + + set->data = h; + + if (tb[IPSET_ATTR_TIMEOUT]) { + h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + + set->variant = set->family == AF_INET + ? &hash_ip4_tvariant : &hash_ip6_tvariant; + + if (set->family == AF_INET) + hash_ip4_gc_init(set); + else + hash_ip6_gc_init(set); + } else { + set->variant = set->family == AF_INET + ? 
&hash_ip4_variant : &hash_ip6_variant; + } + + pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", + set->name, jhash_size(h->table->htable_bits), + h->table->htable_bits, h->maxelem, set->data, h->table); + + return 0; +} + +static struct ip_set_type hash_ip_type __read_mostly = { + .name = "hash:ip", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP, + .dimension = IPSET_DIM_ONE, + .family = AF_UNSPEC, + .revision = 0, + .create = hash_ip_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_ip_init(void) +{ + return ip_set_type_register(&hash_ip_type); +} + +static void __exit +hash_ip_fini(void) +{ + ip_set_type_unregister(&hash_ip_type); +} + +module_init(hash_ip_init); +module_exit(hash_ip_fini); diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c new file mode 100644 index 00000000..14281b6b --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -0,0 +1,530 @@ +/* Copyright (C) 2003-2011 Jozsef Kadlecsik + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module implementing an IP set type: the hash:ip,port type */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_DESCRIPTION("hash:ip,port type of IP sets"); +MODULE_ALIAS("ip_set_hash:ip,port"); + +/* Type specific function prefix */ +#define TYPE hash_ipport + +static bool +hash_ipport_same_set(const struct ip_set *a, const struct ip_set *b); + +#define hash_ipport4_same_set hash_ipport_same_set +#define hash_ipport6_same_set hash_ipport_same_set + +/* The type variant functions: IPv4 */ + +/* Member elements without timeout */ +struct hash_ipport4_elem { + __be32 ip; + __be16 port; + u8 proto; + u8 padding; +}; + +/* Member elements with timeout support */ +struct hash_ipport4_telem { + __be32 ip; + __be16 port; + u8 proto; + u8 padding; + unsigned long timeout; +}; + +static inline bool +hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1, + const struct hash_ipport4_elem *ip2) +{ + return ip1->ip == ip2->ip && + ip1->port == ip2->port && + ip1->proto == ip2->proto; +} + +static inline bool +hash_ipport4_data_isnull(const struct hash_ipport4_elem *elem) +{ + return elem->proto == 0; +} + +static inline void +hash_ipport4_data_copy(struct hash_ipport4_elem *dst, + const struct hash_ipport4_elem *src) +{ + dst->ip = src->ip; + dst->port = src->port; + dst->proto = src->proto; +} + +static inline void +hash_ipport4_data_zero_out(struct hash_ipport4_elem *elem) +{ + elem->proto = 0; +} + +static bool +hash_ipport4_data_list(struct sk_buff *skb, + const struct hash_ipport4_elem *data) +{ + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, 
data->port); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_ipport4_data_tlist(struct sk_buff *skb, + const struct hash_ipport4_elem *data) +{ + const struct hash_ipport4_telem *tdata = + (const struct hash_ipport4_telem *)data; + + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, tdata->port); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(tdata->timeout))); + + return 0; + +nla_put_failure: + return 1; +} + +#define PF 4 +#define HOST_MASK 32 +#include + +static int +hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb, + enum ipset_adt adt, u8 pf, u8 dim, u8 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipport4_elem data = { }; + + if (!ip_set_get_ip4_port(skb, flags & IPSET_DIM_TWO_SRC, + &data.port, &data.proto)) + return -EINVAL; + + ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip); + + return adtfn(set, &data, h->timeout); +} + +static int +hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipport4_elem data = { }; + u32 ip, ip_to, p, port, port_to; + u32 timeout = h->timeout; + bool with_ports = false; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip); + if (ret) + return ret; + + if (tb[IPSET_ATTR_PORT]) + data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); + + if (data.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || data.proto == IPPROTO_ICMP)) + data.port = 0; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + if (adt == IPSET_TEST || + !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || + tb[IPSET_ATTR_PORT_TO])) { + ret = adtfn(set, &data, timeout); + return ip_set_eexist(ret, flags) ? 
0 : ret; + } + + ip = ntohl(data.ip); + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip > ip_to) + swap(ip, ip_to); + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr > 32) + return -IPSET_ERR_INVALID_CIDR; + ip &= ip_set_hostmask(cidr); + ip_to = ip | ~ip_set_hostmask(cidr); + } else + ip_to = ip; + + port_to = port = ntohs(data.port); + if (with_ports && tb[IPSET_ATTR_PORT_TO]) { + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + } + + for (; !before(ip_to, ip); ip++) + for (p = port; p <= port_to; p++) { + data.ip = htonl(ip); + data.port = htons(p); + ret = adtfn(set, &data, timeout); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +static bool +hash_ipport_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct ip_set_hash *x = a->data; + const struct ip_set_hash *y = b->data; + + /* Resizing changes htable_bits, so we ignore it */ + return x->maxelem == y->maxelem && + x->timeout == y->timeout; +} + +/* The type variant functions: IPv6 */ + +struct hash_ipport6_elem { + union nf_inet_addr ip; + __be16 port; + u8 proto; + u8 padding; +}; + +struct hash_ipport6_telem { + union nf_inet_addr ip; + __be16 port; + u8 proto; + u8 padding; + unsigned long timeout; +}; + +static inline bool +hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1, + const struct hash_ipport6_elem *ip2) +{ + return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 && + ip1->port == ip2->port && + ip1->proto == ip2->proto; +} + +static inline bool +hash_ipport6_data_isnull(const struct hash_ipport6_elem *elem) +{ + return elem->proto == 0; +} + +static inline void +hash_ipport6_data_copy(struct hash_ipport6_elem *dst, + const struct hash_ipport6_elem *src) +{ + memcpy(dst, src, sizeof(*dst)); +} + +static inline void +hash_ipport6_data_zero_out(struct hash_ipport6_elem *elem) +{ + elem->proto = 0; +} + +static bool +hash_ipport6_data_list(struct sk_buff *skb, + const struct hash_ipport6_elem *data) +{ + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_ipport6_data_tlist(struct sk_buff *skb, + const struct hash_ipport6_elem *data) +{ + const struct hash_ipport6_telem *e = + (const struct hash_ipport6_telem *)data; + + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(e->timeout))); + return 0; + +nla_put_failure: + return 1; +} + +#undef PF +#undef HOST_MASK + +#define PF 6 +#define HOST_MASK 128 +#include + +static int +hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb, + enum ipset_adt adt, u8 pf, u8 dim, u8 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipport6_elem data = { }; + + if (!ip_set_get_ip6_port(skb, flags & IPSET_DIM_TWO_SRC, + &data.port, &data.proto)) + return -EINVAL; + + ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6); + + return adtfn(set, &data, h->timeout); +} + +static int +hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = 
set->variant->adt[adt]; + struct hash_ipport6_elem data = { }; + u32 port, port_to; + u32 timeout = h->timeout; + bool with_ports = false; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + tb[IPSET_ATTR_IP_TO] || + tb[IPSET_ATTR_CIDR])) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + if (ret) + return ret; + + if (tb[IPSET_ATTR_PORT]) + data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); + + if (data.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || data.proto == IPPROTO_ICMPV6)) + data.port = 0; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { + ret = adtfn(set, &data, timeout); + return ip_set_eexist(ret, flags) ? 0 : ret; + } + + port = ntohs(data.port); + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + + for (; port <= port_to; port++) { + data.port = htons(port); + ret = adtfn(set, &data, timeout); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +/* Create hash:ip type of sets */ + +static int +hash_ipport_create(struct ip_set *set, struct nlattr *tb[], u32 flags) +{ + struct ip_set_hash *h; + u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; + u8 hbits; + + if (!(set->family == AF_INET || set->family == AF_INET6)) + return -IPSET_ERR_INVALID_FAMILY; + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_HASHSIZE]) { + hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); + if (hashsize < IPSET_MIMINAL_HASHSIZE) + hashsize = IPSET_MIMINAL_HASHSIZE; + } + + if (tb[IPSET_ATTR_MAXELEM]) + maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); + + h = kzalloc(sizeof(*h), GFP_KERNEL); + if (!h) + return -ENOMEM; + + h->maxelem = maxelem; + get_random_bytes(&h->initval, sizeof(h->initval)); + h->timeout = IPSET_NO_TIMEOUT; + + hbits = htable_bits(hashsize); + h->table = ip_set_alloc( + sizeof(struct htable) + + jhash_size(hbits) * sizeof(struct hbucket)); + if (!h->table) { + kfree(h); + return -ENOMEM; + } + h->table->htable_bits = hbits; + + set->data = h; + + if (tb[IPSET_ATTR_TIMEOUT]) { + h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + + set->variant = set->family == AF_INET + ? &hash_ipport4_tvariant : &hash_ipport6_tvariant; + + if (set->family == AF_INET) + hash_ipport4_gc_init(set); + else + hash_ipport6_gc_init(set); + } else { + set->variant = set->family == AF_INET + ? 
&hash_ipport4_variant : &hash_ipport6_variant; + } + + pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", + set->name, jhash_size(h->table->htable_bits), + h->table->htable_bits, h->maxelem, set->data, h->table); + + return 0; +} + +static struct ip_set_type hash_ipport_type __read_mostly = { + .name = "hash:ip,port", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_PORT, + .dimension = IPSET_DIM_TWO, + .family = AF_UNSPEC, + .revision = 1, + .create = hash_ipport_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_PORT] = { .type = NLA_U16 }, + [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_ipport_init(void) +{ + return ip_set_type_register(&hash_ipport_type); +} + +static void __exit +hash_ipport_fini(void) +{ + ip_set_type_unregister(&hash_ipport_type); +} + +module_init(hash_ipport_init); +module_exit(hash_ipport_fini); diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c new file mode 100644 index 00000000..401c8a25 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -0,0 +1,548 @@ +/* Copyright (C) 2003-2011 Jozsef Kadlecsik + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:ip,port,ip type */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_DESCRIPTION("hash:ip,port,ip type of IP sets"); +MODULE_ALIAS("ip_set_hash:ip,port,ip"); + +/* Type specific function prefix */ +#define TYPE hash_ipportip + +static bool +hash_ipportip_same_set(const struct ip_set *a, const struct ip_set *b); + +#define hash_ipportip4_same_set hash_ipportip_same_set +#define hash_ipportip6_same_set hash_ipportip_same_set + +/* The type variant functions: IPv4 */ + +/* Member elements without timeout */ +struct hash_ipportip4_elem { + __be32 ip; + __be32 ip2; + __be16 port; + u8 proto; + u8 padding; +}; + +/* Member elements with timeout support */ +struct hash_ipportip4_telem { + __be32 ip; + __be32 ip2; + __be16 port; + u8 proto; + u8 padding; + unsigned long timeout; +}; + +static inline bool +hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1, + const struct hash_ipportip4_elem *ip2) +{ + return ip1->ip == ip2->ip && + ip1->ip2 == ip2->ip2 && + ip1->port == ip2->port && + ip1->proto == ip2->proto; +} + +static inline bool +hash_ipportip4_data_isnull(const struct hash_ipportip4_elem *elem) +{ + return elem->proto == 0; +} + +static inline void +hash_ipportip4_data_copy(struct hash_ipportip4_elem *dst, + const struct hash_ipportip4_elem *src) +{ + memcpy(dst, src, sizeof(*dst)); +} + +static inline void +hash_ipportip4_data_zero_out(struct hash_ipportip4_elem *elem) +{ + elem->proto = 0; +} + +static bool +hash_ipportip4_data_list(struct sk_buff *skb, + const struct hash_ipportip4_elem *data) +{ + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip); + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, data->ip2); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_ipportip4_data_tlist(struct sk_buff *skb, + const struct hash_ipportip4_elem *data) +{ + const struct hash_ipportip4_telem *tdata = + (const struct hash_ipportip4_telem *)data; + + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip); + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, tdata->ip2); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, tdata->port); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(tdata->timeout))); + + return 0; + +nla_put_failure: + return 1; +} + +#define PF 4 +#define HOST_MASK 32 +#include + +static int +hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb, + enum ipset_adt adt, u8 pf, u8 dim, u8 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipportip4_elem data = { }; + + if (!ip_set_get_ip4_port(skb, flags & IPSET_DIM_TWO_SRC, + &data.port, &data.proto)) + return -EINVAL; + + ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip); + ip4addrptr(skb, flags & IPSET_DIM_THREE_SRC, &data.ip2); + + return adtfn(set, &data, h->timeout); +} + +static int +hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipportip4_elem data = { }; + u32 ip, ip_to, p, port, port_to; + u32 timeout = h->timeout; + bool with_ports = false; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || 
!tb[IPSET_ATTR_IP2] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip); + if (ret) + return ret; + + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &data.ip2); + if (ret) + return ret; + + if (tb[IPSET_ATTR_PORT]) + data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); + + if (data.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || data.proto == IPPROTO_ICMP)) + data.port = 0; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + if (adt == IPSET_TEST || + !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || + tb[IPSET_ATTR_PORT_TO])) { + ret = adtfn(set, &data, timeout); + return ip_set_eexist(ret, flags) ? 0 : ret; + } + + ip = ntohl(data.ip); + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip > ip_to) + swap(ip, ip_to); + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr > 32) + return -IPSET_ERR_INVALID_CIDR; + ip &= ip_set_hostmask(cidr); + ip_to = ip | ~ip_set_hostmask(cidr); + } else + ip_to = ip; + + port_to = port = ntohs(data.port); + if (with_ports && tb[IPSET_ATTR_PORT_TO]) { + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + } + + for (; !before(ip_to, ip); ip++) + for (p = port; p <= port_to; p++) { + data.ip = htonl(ip); + data.port = htons(p); + ret = adtfn(set, &data, timeout); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +static bool +hash_ipportip_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct ip_set_hash *x = a->data; + const struct ip_set_hash *y = b->data; + + /* Resizing changes htable_bits, so we ignore it */ + return x->maxelem == y->maxelem && + x->timeout == y->timeout; +} + +/* The type variant functions: IPv6 */ + +struct hash_ipportip6_elem { + union nf_inet_addr ip; + union nf_inet_addr ip2; + __be16 port; + u8 proto; + u8 padding; +}; + +struct hash_ipportip6_telem { + union nf_inet_addr ip; + union nf_inet_addr ip2; + __be16 port; + u8 proto; + u8 padding; + unsigned long timeout; +}; + +static inline bool +hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1, + const struct hash_ipportip6_elem *ip2) +{ + return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 && + ipv6_addr_cmp(&ip1->ip2.in6, &ip2->ip2.in6) == 0 && + ip1->port == ip2->port && + ip1->proto == ip2->proto; +} + +static inline bool +hash_ipportip6_data_isnull(const struct hash_ipportip6_elem *elem) +{ + return elem->proto == 0; +} + +static inline void +hash_ipportip6_data_copy(struct hash_ipportip6_elem *dst, + const struct hash_ipportip6_elem *src) +{ + memcpy(dst, src, sizeof(*dst)); +} + +static inline void +hash_ipportip6_data_zero_out(struct hash_ipportip6_elem *elem) +{ + elem->proto = 0; +} + +static bool +hash_ipportip6_data_list(struct sk_buff *skb, + const struct hash_ipportip6_elem *data) +{ + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip); 
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_ipportip6_data_tlist(struct sk_buff *skb, + const struct hash_ipportip6_elem *data) +{ + const struct hash_ipportip6_telem *e = + (const struct hash_ipportip6_telem *)data; + + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip); + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(e->timeout))); + return 0; + +nla_put_failure: + return 1; +} + +#undef PF +#undef HOST_MASK + +#define PF 6 +#define HOST_MASK 128 +#include + +static int +hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb, + enum ipset_adt adt, u8 pf, u8 dim, u8 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipportip6_elem data = { }; + + if (!ip_set_get_ip6_port(skb, flags & IPSET_DIM_TWO_SRC, + &data.port, &data.proto)) + return -EINVAL; + + ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6); + ip6addrptr(skb, flags & IPSET_DIM_THREE_SRC, &data.ip2.in6); + + return adtfn(set, &data, h->timeout); +} + +static int +hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipportip6_elem data = { }; + u32 port, port_to; + u32 timeout = h->timeout; + bool with_ports = false; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + tb[IPSET_ATTR_IP_TO] || + tb[IPSET_ATTR_CIDR])) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + if (ret) + return ret; + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &data.ip2); + if (ret) + return ret; + + if (tb[IPSET_ATTR_PORT]) + data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); + + if (data.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || data.proto == IPPROTO_ICMPV6)) + data.port = 0; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { + ret = adtfn(set, &data, timeout); + return ip_set_eexist(ret, flags) ? 
0 : ret; + } + + port = ntohs(data.port); + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + + for (; port <= port_to; port++) { + data.port = htons(port); + ret = adtfn(set, &data, timeout); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +/* Create hash:ip type of sets */ + +static int +hash_ipportip_create(struct ip_set *set, struct nlattr *tb[], u32 flags) +{ + struct ip_set_hash *h; + u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; + u8 hbits; + + if (!(set->family == AF_INET || set->family == AF_INET6)) + return -IPSET_ERR_INVALID_FAMILY; + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_HASHSIZE]) { + hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); + if (hashsize < IPSET_MIMINAL_HASHSIZE) + hashsize = IPSET_MIMINAL_HASHSIZE; + } + + if (tb[IPSET_ATTR_MAXELEM]) + maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); + + h = kzalloc(sizeof(*h), GFP_KERNEL); + if (!h) + return -ENOMEM; + + h->maxelem = maxelem; + get_random_bytes(&h->initval, sizeof(h->initval)); + h->timeout = IPSET_NO_TIMEOUT; + + hbits = htable_bits(hashsize); + h->table = ip_set_alloc( + sizeof(struct htable) + + jhash_size(hbits) * sizeof(struct hbucket)); + if (!h->table) { + kfree(h); + return -ENOMEM; + } + h->table->htable_bits = hbits; + + set->data = h; + + if (tb[IPSET_ATTR_TIMEOUT]) { + h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + + set->variant = set->family == AF_INET + ? &hash_ipportip4_tvariant : &hash_ipportip6_tvariant; + + if (set->family == AF_INET) + hash_ipportip4_gc_init(set); + else + hash_ipportip6_gc_init(set); + } else { + set->variant = set->family == AF_INET + ? 
&hash_ipportip4_variant : &hash_ipportip6_variant; + } + + pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", + set->name, jhash_size(h->table->htable_bits), + h->table->htable_bits, h->maxelem, set->data, h->table); + + return 0; +} + +static struct ip_set_type hash_ipportip_type __read_mostly = { + .name = "hash:ip,port,ip", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2, + .dimension = IPSET_DIM_THREE, + .family = AF_UNSPEC, + .revision = 1, + .create = hash_ipportip_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP2] = { .type = NLA_NESTED }, + [IPSET_ATTR_PORT] = { .type = NLA_U16 }, + [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_ipportip_init(void) +{ + return ip_set_type_register(&hash_ipportip_type); +} + +static void __exit +hash_ipportip_fini(void) +{ + ip_set_type_unregister(&hash_ipportip_type); +} + +module_init(hash_ipportip_init); +module_exit(hash_ipportip_fini); diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c new file mode 100644 index 00000000..565a7c5b --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -0,0 +1,616 @@ +/* Copyright (C) 2003-2011 Jozsef Kadlecsik + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:ip,port,net type */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_DESCRIPTION("hash:ip,port,net type of IP sets"); +MODULE_ALIAS("ip_set_hash:ip,port,net"); + +/* Type specific function prefix */ +#define TYPE hash_ipportnet + +static bool +hash_ipportnet_same_set(const struct ip_set *a, const struct ip_set *b); + +#define hash_ipportnet4_same_set hash_ipportnet_same_set +#define hash_ipportnet6_same_set hash_ipportnet_same_set + +/* The type variant functions: IPv4 */ + +/* Member elements without timeout */ +struct hash_ipportnet4_elem { + __be32 ip; + __be32 ip2; + __be16 port; + u8 cidr; + u8 proto; +}; + +/* Member elements with timeout support */ +struct hash_ipportnet4_telem { + __be32 ip; + __be32 ip2; + __be16 port; + u8 cidr; + u8 proto; + unsigned long timeout; +}; + +static inline bool +hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, + const struct hash_ipportnet4_elem *ip2) +{ + return ip1->ip == ip2->ip && + ip1->ip2 == ip2->ip2 && + ip1->cidr == ip2->cidr && + ip1->port == ip2->port && + ip1->proto == ip2->proto; +} + +static inline bool +hash_ipportnet4_data_isnull(const struct hash_ipportnet4_elem *elem) +{ + return elem->proto == 0; +} + +static inline void +hash_ipportnet4_data_copy(struct hash_ipportnet4_elem *dst, + const struct hash_ipportnet4_elem *src) +{ + memcpy(dst, src, sizeof(*dst)); +} + +static inline void +hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr) +{ + elem->ip2 &= ip_set_netmask(cidr); + elem->cidr = cidr; +} + +static inline void +hash_ipportnet4_data_zero_out(struct hash_ipportnet4_elem *elem) +{ + elem->proto = 0; +} + +static bool +hash_ipportnet4_data_list(struct sk_buff *skb, + const struct hash_ipportnet4_elem *data) +{ + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip); + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, data->ip2); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_ipportnet4_data_tlist(struct sk_buff *skb, + const struct hash_ipportnet4_elem *data) +{ + const struct hash_ipportnet4_telem *tdata = + (const struct hash_ipportnet4_telem *)data; + + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip); + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, tdata->ip2); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, tdata->port); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(tdata->timeout))); + + return 0; + +nla_put_failure: + return 1; +} + +#define IP_SET_HASH_WITH_PROTO +#define IP_SET_HASH_WITH_NETS + +#define PF 4 +#define HOST_MASK 32 +#include + +static int +hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, + enum ipset_adt adt, u8 pf, u8 dim, u8 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipportnet4_elem data = { + .cidr = h->nets[0].cidr ? 
h->nets[0].cidr : HOST_MASK + }; + + if (data.cidr == 0) + return -EINVAL; + if (adt == IPSET_TEST) + data.cidr = HOST_MASK; + + if (!ip_set_get_ip4_port(skb, flags & IPSET_DIM_TWO_SRC, + &data.port, &data.proto)) + return -EINVAL; + + ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip); + ip4addrptr(skb, flags & IPSET_DIM_THREE_SRC, &data.ip2); + data.ip2 &= ip_set_netmask(data.cidr); + + return adtfn(set, &data, h->timeout); +} + +static int +hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipportnet4_elem data = { .cidr = HOST_MASK }; + u32 ip, ip_to, p, port, port_to; + u32 timeout = h->timeout; + bool with_ports = false; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip); + if (ret) + return ret; + + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &data.ip2); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR2]) + data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + + if (!data.cidr) + return -IPSET_ERR_INVALID_CIDR; + + data.ip2 &= ip_set_netmask(data.cidr); + + if (tb[IPSET_ATTR_PORT]) + data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); + + if (data.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || data.proto == IPPROTO_ICMP)) + data.port = 0; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + if (adt == IPSET_TEST || + !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || + tb[IPSET_ATTR_PORT_TO])) { + ret = adtfn(set, &data, timeout); + return ip_set_eexist(ret, flags) ? 
0 : ret; + } + + ip = ntohl(data.ip); + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip > ip_to) + swap(ip, ip_to); + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr > 32) + return -IPSET_ERR_INVALID_CIDR; + ip &= ip_set_hostmask(cidr); + ip_to = ip | ~ip_set_hostmask(cidr); + } else + ip_to = ip; + + port_to = port = ntohs(data.port); + if (with_ports && tb[IPSET_ATTR_PORT_TO]) { + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + } + + for (; !before(ip_to, ip); ip++) + for (p = port; p <= port_to; p++) { + data.ip = htonl(ip); + data.port = htons(p); + ret = adtfn(set, &data, timeout); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +static bool +hash_ipportnet_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct ip_set_hash *x = a->data; + const struct ip_set_hash *y = b->data; + + /* Resizing changes htable_bits, so we ignore it */ + return x->maxelem == y->maxelem && + x->timeout == y->timeout; +} + +/* The type variant functions: IPv6 */ + +struct hash_ipportnet6_elem { + union nf_inet_addr ip; + union nf_inet_addr ip2; + __be16 port; + u8 cidr; + u8 proto; +}; + +struct hash_ipportnet6_telem { + union nf_inet_addr ip; + union nf_inet_addr ip2; + __be16 port; + u8 cidr; + u8 proto; + unsigned long timeout; +}; + +static inline bool +hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1, + const struct hash_ipportnet6_elem *ip2) +{ + return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 && + ipv6_addr_cmp(&ip1->ip2.in6, &ip2->ip2.in6) == 0 && + ip1->cidr == ip2->cidr && + ip1->port == ip2->port && + ip1->proto == ip2->proto; +} + +static inline bool +hash_ipportnet6_data_isnull(const struct hash_ipportnet6_elem *elem) +{ + return elem->proto == 0; +} + +static inline void +hash_ipportnet6_data_copy(struct hash_ipportnet6_elem *dst, + const struct hash_ipportnet6_elem *src) +{ + memcpy(dst, src, sizeof(*dst)); +} + +static inline void +hash_ipportnet6_data_zero_out(struct hash_ipportnet6_elem *elem) +{ + elem->proto = 0; +} + +static inline void +ip6_netmask(union nf_inet_addr *ip, u8 prefix) +{ + ip->ip6[0] &= ip_set_netmask6(prefix)[0]; + ip->ip6[1] &= ip_set_netmask6(prefix)[1]; + ip->ip6[2] &= ip_set_netmask6(prefix)[2]; + ip->ip6[3] &= ip_set_netmask6(prefix)[3]; +} + +static inline void +hash_ipportnet6_data_netmask(struct hash_ipportnet6_elem *elem, u8 cidr) +{ + ip6_netmask(&elem->ip2, cidr); + elem->cidr = cidr; +} + +static bool +hash_ipportnet6_data_list(struct sk_buff *skb, + const struct hash_ipportnet6_elem *data) +{ + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip); + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_ipportnet6_data_tlist(struct sk_buff *skb, + const struct hash_ipportnet6_elem *data) +{ + const struct hash_ipportnet6_telem *e = + (const struct hash_ipportnet6_telem *)data; + + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip); + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + 
htonl(ip_set_timeout_get(e->timeout))); + return 0; + +nla_put_failure: + return 1; +} + +#undef PF +#undef HOST_MASK + +#define PF 6 +#define HOST_MASK 128 +#include + +static int +hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, + enum ipset_adt adt, u8 pf, u8 dim, u8 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipportnet6_elem data = { + .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK + }; + + if (data.cidr == 0) + return -EINVAL; + if (adt == IPSET_TEST) + data.cidr = HOST_MASK; + + if (!ip_set_get_ip6_port(skb, flags & IPSET_DIM_TWO_SRC, + &data.port, &data.proto)) + return -EINVAL; + + ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6); + ip6addrptr(skb, flags & IPSET_DIM_THREE_SRC, &data.ip2.in6); + ip6_netmask(&data.ip2, data.cidr); + + return adtfn(set, &data, h->timeout); +} + +static int +hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipportnet6_elem data = { .cidr = HOST_MASK }; + u32 port, port_to; + u32 timeout = h->timeout; + bool with_ports = false; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + tb[IPSET_ATTR_IP_TO] || + tb[IPSET_ATTR_CIDR])) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + if (ret) + return ret; + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &data.ip2); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR2]) + data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + + if (!data.cidr) + return -IPSET_ERR_INVALID_CIDR; + + ip6_netmask(&data.ip2, data.cidr); + + if (tb[IPSET_ATTR_PORT]) + data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); + + if (data.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || data.proto == IPPROTO_ICMPV6)) + data.port = 0; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { + ret = adtfn(set, &data, timeout); + return ip_set_eexist(ret, flags) ? 
0 : ret; + } + + port = ntohs(data.port); + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + + for (; port <= port_to; port++) { + data.port = htons(port); + ret = adtfn(set, &data, timeout); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +/* Create hash:ip type of sets */ + +static int +hash_ipportnet_create(struct ip_set *set, struct nlattr *tb[], u32 flags) +{ + struct ip_set_hash *h; + u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; + u8 hbits; + + if (!(set->family == AF_INET || set->family == AF_INET6)) + return -IPSET_ERR_INVALID_FAMILY; + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_HASHSIZE]) { + hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); + if (hashsize < IPSET_MIMINAL_HASHSIZE) + hashsize = IPSET_MIMINAL_HASHSIZE; + } + + if (tb[IPSET_ATTR_MAXELEM]) + maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); + + h = kzalloc(sizeof(*h) + + sizeof(struct ip_set_hash_nets) + * (set->family == AF_INET ? 32 : 128), GFP_KERNEL); + if (!h) + return -ENOMEM; + + h->maxelem = maxelem; + get_random_bytes(&h->initval, sizeof(h->initval)); + h->timeout = IPSET_NO_TIMEOUT; + + hbits = htable_bits(hashsize); + h->table = ip_set_alloc( + sizeof(struct htable) + + jhash_size(hbits) * sizeof(struct hbucket)); + if (!h->table) { + kfree(h); + return -ENOMEM; + } + h->table->htable_bits = hbits; + + set->data = h; + + if (tb[IPSET_ATTR_TIMEOUT]) { + h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + + set->variant = set->family == AF_INET + ? &hash_ipportnet4_tvariant + : &hash_ipportnet6_tvariant; + + if (set->family == AF_INET) + hash_ipportnet4_gc_init(set); + else + hash_ipportnet6_gc_init(set); + } else { + set->variant = set->family == AF_INET + ? 
&hash_ipportnet4_variant : &hash_ipportnet6_variant; + } + + pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", + set->name, jhash_size(h->table->htable_bits), + h->table->htable_bits, h->maxelem, set->data, h->table); + + return 0; +} + +static struct ip_set_type hash_ipportnet_type __read_mostly = { + .name = "hash:ip,port,net", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2, + .dimension = IPSET_DIM_THREE, + .family = AF_UNSPEC, + .revision = 1, + .create = hash_ipportnet_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP2] = { .type = NLA_NESTED }, + [IPSET_ATTR_PORT] = { .type = NLA_U16 }, + [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_CIDR2] = { .type = NLA_U8 }, + [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_ipportnet_init(void) +{ + return ip_set_type_register(&hash_ipportnet_type); +} + +static void __exit +hash_ipportnet_fini(void) +{ + ip_set_type_unregister(&hash_ipportnet_type); +} + +module_init(hash_ipportnet_init); +module_exit(hash_ipportnet_fini); diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c new file mode 100644 index 00000000..2aeeabcd --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -0,0 +1,462 @@ +/* Copyright (C) 2003-2011 Jozsef Kadlecsik + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:net type */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_DESCRIPTION("hash:net type of IP sets"); +MODULE_ALIAS("ip_set_hash:net"); + +/* Type specific function prefix */ +#define TYPE hash_net + +static bool +hash_net_same_set(const struct ip_set *a, const struct ip_set *b); + +#define hash_net4_same_set hash_net_same_set +#define hash_net6_same_set hash_net_same_set + +/* The type variant functions: IPv4 */ + +/* Member elements without timeout */ +struct hash_net4_elem { + __be32 ip; + u16 padding0; + u8 padding1; + u8 cidr; +}; + +/* Member elements with timeout support */ +struct hash_net4_telem { + __be32 ip; + u16 padding0; + u8 padding1; + u8 cidr; + unsigned long timeout; +}; + +static inline bool +hash_net4_data_equal(const struct hash_net4_elem *ip1, + const struct hash_net4_elem *ip2) +{ + return ip1->ip == ip2->ip && ip1->cidr == ip2->cidr; +} + +static inline bool +hash_net4_data_isnull(const struct hash_net4_elem *elem) +{ + return elem->cidr == 0; +} + +static inline void +hash_net4_data_copy(struct hash_net4_elem *dst, + const struct hash_net4_elem *src) +{ + dst->ip = src->ip; + dst->cidr = src->cidr; +} + +static inline void +hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr) +{ + elem->ip &= ip_set_netmask(cidr); + elem->cidr = cidr; +} + +/* Zero CIDR values cannot be stored */ +static inline void +hash_net4_data_zero_out(struct hash_net4_elem *elem) +{ + elem->cidr = 0; +} + +static bool +hash_net4_data_list(struct sk_buff *skb, const struct hash_net4_elem *data) +{ + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_net4_data_tlist(struct sk_buff *skb, const struct hash_net4_elem *data) +{ + const struct hash_net4_telem *tdata = + (const struct hash_net4_telem *)data; + + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR, tdata->cidr); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(tdata->timeout))); + + return 0; + +nla_put_failure: + return 1; +} + +#define IP_SET_HASH_WITH_NETS + +#define PF 4 +#define HOST_MASK 32 +#include + +static int +hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb, + enum ipset_adt adt, u8 pf, u8 dim, u8 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_net4_elem data = { + .cidr = h->nets[0].cidr ? 
h->nets[0].cidr : HOST_MASK + }; + + if (data.cidr == 0) + return -EINVAL; + if (adt == IPSET_TEST) + data.cidr = HOST_MASK; + + ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip); + data.ip &= ip_set_netmask(data.cidr); + + return adtfn(set, &data, h->timeout); +} + +static int +hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_net4_elem data = { .cidr = HOST_MASK }; + u32 timeout = h->timeout; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) + data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (!data.cidr) + return -IPSET_ERR_INVALID_CIDR; + + data.ip &= ip_set_netmask(data.cidr); + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + ret = adtfn(set, &data, timeout); + + return ip_set_eexist(ret, flags) ? 0 : ret; +} + +static bool +hash_net_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct ip_set_hash *x = a->data; + const struct ip_set_hash *y = b->data; + + /* Resizing changes htable_bits, so we ignore it */ + return x->maxelem == y->maxelem && + x->timeout == y->timeout; +} + +/* The type variant functions: IPv6 */ + +struct hash_net6_elem { + union nf_inet_addr ip; + u16 padding0; + u8 padding1; + u8 cidr; +}; + +struct hash_net6_telem { + union nf_inet_addr ip; + u16 padding0; + u8 padding1; + u8 cidr; + unsigned long timeout; +}; + +static inline bool +hash_net6_data_equal(const struct hash_net6_elem *ip1, + const struct hash_net6_elem *ip2) +{ + return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 && + ip1->cidr == ip2->cidr; +} + +static inline bool +hash_net6_data_isnull(const struct hash_net6_elem *elem) +{ + return elem->cidr == 0; +} + +static inline void +hash_net6_data_copy(struct hash_net6_elem *dst, + const struct hash_net6_elem *src) +{ + ipv6_addr_copy(&dst->ip.in6, &src->ip.in6); + dst->cidr = src->cidr; +} + +static inline void +hash_net6_data_zero_out(struct hash_net6_elem *elem) +{ + elem->cidr = 0; +} + +static inline void +ip6_netmask(union nf_inet_addr *ip, u8 prefix) +{ + ip->ip6[0] &= ip_set_netmask6(prefix)[0]; + ip->ip6[1] &= ip_set_netmask6(prefix)[1]; + ip->ip6[2] &= ip_set_netmask6(prefix)[2]; + ip->ip6[3] &= ip_set_netmask6(prefix)[3]; +} + +static inline void +hash_net6_data_netmask(struct hash_net6_elem *elem, u8 cidr) +{ + ip6_netmask(&elem->ip, cidr); + elem->cidr = cidr; +} + +static bool +hash_net6_data_list(struct sk_buff *skb, const struct hash_net6_elem *data) +{ + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_net6_data_tlist(struct sk_buff *skb, const struct hash_net6_elem *data) +{ + const struct hash_net6_telem *e = + (const struct hash_net6_telem *)data; + + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR, e->cidr); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(e->timeout))); + return 0; + +nla_put_failure: + return 1; +} + +#undef PF +#undef HOST_MASK + +#define PF 6 +#define HOST_MASK 128 +#include + +static int 
+hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb, + enum ipset_adt adt, u8 pf, u8 dim, u8 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_net6_elem data = { + .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK + }; + + if (data.cidr == 0) + return -EINVAL; + if (adt == IPSET_TEST) + data.cidr = HOST_MASK; + + ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6); + ip6_netmask(&data.ip, data.cidr); + + return adtfn(set, &data, h->timeout); +} + +static int +hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_net6_elem data = { .cidr = HOST_MASK }; + u32 timeout = h->timeout; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) + data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (!data.cidr) + return -IPSET_ERR_INVALID_CIDR; + + ip6_netmask(&data.ip, data.cidr); + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + ret = adtfn(set, &data, timeout); + + return ip_set_eexist(ret, flags) ? 0 : ret; +} + +/* Create hash:ip type of sets */ + +static int +hash_net_create(struct ip_set *set, struct nlattr *tb[], u32 flags) +{ + u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; + struct ip_set_hash *h; + u8 hbits; + + if (!(set->family == AF_INET || set->family == AF_INET6)) + return -IPSET_ERR_INVALID_FAMILY; + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_HASHSIZE]) { + hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); + if (hashsize < IPSET_MIMINAL_HASHSIZE) + hashsize = IPSET_MIMINAL_HASHSIZE; + } + + if (tb[IPSET_ATTR_MAXELEM]) + maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); + + h = kzalloc(sizeof(*h) + + sizeof(struct ip_set_hash_nets) + * (set->family == AF_INET ? 32 : 128), GFP_KERNEL); + if (!h) + return -ENOMEM; + + h->maxelem = maxelem; + get_random_bytes(&h->initval, sizeof(h->initval)); + h->timeout = IPSET_NO_TIMEOUT; + + hbits = htable_bits(hashsize); + h->table = ip_set_alloc( + sizeof(struct htable) + + jhash_size(hbits) * sizeof(struct hbucket)); + if (!h->table) { + kfree(h); + return -ENOMEM; + } + h->table->htable_bits = hbits; + + set->data = h; + + if (tb[IPSET_ATTR_TIMEOUT]) { + h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + + set->variant = set->family == AF_INET + ? &hash_net4_tvariant : &hash_net6_tvariant; + + if (set->family == AF_INET) + hash_net4_gc_init(set); + else + hash_net6_gc_init(set); + } else { + set->variant = set->family == AF_INET + ? 
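+			  /* no default timeout: non-GC variants */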
&hash_net4_variant : &hash_net6_variant; + } + + pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", + set->name, jhash_size(h->table->htable_bits), + h->table->htable_bits, h->maxelem, set->data, h->table); + + return 0; +} + +static struct ip_set_type hash_net_type __read_mostly = { + .name = "hash:net", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP, + .dimension = IPSET_DIM_ONE, + .family = AF_UNSPEC, + .revision = 0, + .create = hash_net_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_net_init(void) +{ + return ip_set_type_register(&hash_net_type); +} + +static void __exit +hash_net_fini(void) +{ + ip_set_type_unregister(&hash_net_type); +} + +module_init(hash_net_init); +module_exit(hash_net_fini); diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c new file mode 100644 index 00000000..e50d9bb8 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -0,0 +1,566 @@ +/* Copyright (C) 2003-2011 Jozsef Kadlecsik + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module implementing an IP set type: the hash:net,port type */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_DESCRIPTION("hash:net,port type of IP sets"); +MODULE_ALIAS("ip_set_hash:net,port"); + +/* Type specific function prefix */ +#define TYPE hash_netport + +static bool +hash_netport_same_set(const struct ip_set *a, const struct ip_set *b); + +#define hash_netport4_same_set hash_netport_same_set +#define hash_netport6_same_set hash_netport_same_set + +/* The type variant functions: IPv4 */ + +/* Member elements without timeout */ +struct hash_netport4_elem { + __be32 ip; + __be16 port; + u8 proto; + u8 cidr; +}; + +/* Member elements with timeout support */ +struct hash_netport4_telem { + __be32 ip; + __be16 port; + u8 proto; + u8 cidr; + unsigned long timeout; +}; + +static inline bool +hash_netport4_data_equal(const struct hash_netport4_elem *ip1, + const struct hash_netport4_elem *ip2) +{ + return ip1->ip == ip2->ip && + ip1->port == ip2->port && + ip1->proto == ip2->proto && + ip1->cidr == ip2->cidr; +} + +static inline bool +hash_netport4_data_isnull(const struct hash_netport4_elem *elem) +{ + return elem->proto == 0; +} + +static inline void +hash_netport4_data_copy(struct hash_netport4_elem *dst, + const struct hash_netport4_elem *src) +{ + dst->ip = src->ip; + dst->port = src->port; + dst->proto = src->proto; + dst->cidr = src->cidr; +} + +static inline void +hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr) +{ + elem->ip &= ip_set_netmask(cidr); + elem->cidr = cidr; +} + +static inline void +hash_netport4_data_zero_out(struct hash_netport4_elem *elem) +{ + elem->proto = 0; +} + +static bool +hash_netport4_data_list(struct sk_buff *skb, + const struct hash_netport4_elem *data) +{ + 
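+	/* Dump one element as netlink attributes; the NLA_PUT_* macros
+	 * jump to nla_put_failure when the message runs out of space and
+	 * the non-zero return value reports that failure to the caller. */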
NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_netport4_data_tlist(struct sk_buff *skb, + const struct hash_netport4_elem *data) +{ + const struct hash_netport4_telem *tdata = + (const struct hash_netport4_telem *)data; + + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, tdata->port); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(tdata->timeout))); + + return 0; + +nla_put_failure: + return 1; +} + +#define IP_SET_HASH_WITH_PROTO +#define IP_SET_HASH_WITH_NETS + +#define PF 4 +#define HOST_MASK 32 +#include + +static int +hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb, + enum ipset_adt adt, u8 pf, u8 dim, u8 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netport4_elem data = { + .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK + }; + + if (data.cidr == 0) + return -EINVAL; + if (adt == IPSET_TEST) + data.cidr = HOST_MASK; + + if (!ip_set_get_ip4_port(skb, flags & IPSET_DIM_TWO_SRC, + &data.port, &data.proto)) + return -EINVAL; + + ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip); + data.ip &= ip_set_netmask(data.cidr); + + return adtfn(set, &data, h->timeout); +} + +static int +hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netport4_elem data = { .cidr = HOST_MASK }; + u32 port, port_to; + u32 timeout = h->timeout; + bool with_ports = false; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) + data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!data.cidr) + return -IPSET_ERR_INVALID_CIDR; + data.ip &= ip_set_netmask(data.cidr); + + if (tb[IPSET_ATTR_PORT]) + data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); + + if (data.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || data.proto == IPPROTO_ICMP)) + data.port = 0; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { + ret = adtfn(set, &data, timeout); + return ip_set_eexist(ret, flags) ? 
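+		/* an already existing element is not an error when
+		 * userspace passed the "exist" flag: */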
0 : ret; + } + + port = ntohs(data.port); + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + + for (; port <= port_to; port++) { + data.port = htons(port); + ret = adtfn(set, &data, timeout); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +static bool +hash_netport_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct ip_set_hash *x = a->data; + const struct ip_set_hash *y = b->data; + + /* Resizing changes htable_bits, so we ignore it */ + return x->maxelem == y->maxelem && + x->timeout == y->timeout; +} + +/* The type variant functions: IPv6 */ + +struct hash_netport6_elem { + union nf_inet_addr ip; + __be16 port; + u8 proto; + u8 cidr; +}; + +struct hash_netport6_telem { + union nf_inet_addr ip; + __be16 port; + u8 proto; + u8 cidr; + unsigned long timeout; +}; + +static inline bool +hash_netport6_data_equal(const struct hash_netport6_elem *ip1, + const struct hash_netport6_elem *ip2) +{ + return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 && + ip1->port == ip2->port && + ip1->proto == ip2->proto && + ip1->cidr == ip2->cidr; +} + +static inline bool +hash_netport6_data_isnull(const struct hash_netport6_elem *elem) +{ + return elem->proto == 0; +} + +static inline void +hash_netport6_data_copy(struct hash_netport6_elem *dst, + const struct hash_netport6_elem *src) +{ + memcpy(dst, src, sizeof(*dst)); +} + +static inline void +hash_netport6_data_zero_out(struct hash_netport6_elem *elem) +{ + elem->proto = 0; +} + +static inline void +ip6_netmask(union nf_inet_addr *ip, u8 prefix) +{ + ip->ip6[0] &= ip_set_netmask6(prefix)[0]; + ip->ip6[1] &= ip_set_netmask6(prefix)[1]; + ip->ip6[2] &= ip_set_netmask6(prefix)[2]; + ip->ip6[3] &= ip_set_netmask6(prefix)[3]; +} + +static inline void +hash_netport6_data_netmask(struct hash_netport6_elem *elem, u8 cidr) +{ + ip6_netmask(&elem->ip, cidr); + elem->cidr = cidr; +} + +static bool +hash_netport6_data_list(struct sk_buff *skb, + const struct hash_netport6_elem *data) +{ + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_netport6_data_tlist(struct sk_buff *skb, + const struct hash_netport6_elem *data) +{ + const struct hash_netport6_telem *e = + (const struct hash_netport6_telem *)data; + + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(e->timeout))); + return 0; + +nla_put_failure: + return 1; +} + +#undef PF +#undef HOST_MASK + +#define PF 6 +#define HOST_MASK 128 +#include + +static int +hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb, + enum ipset_adt adt, u8 pf, u8 dim, u8 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netport6_elem data = { + .cidr = h->nets[0].cidr ? 
h->nets[0].cidr : HOST_MASK + }; + + if (data.cidr == 0) + return -EINVAL; + if (adt == IPSET_TEST) + data.cidr = HOST_MASK; + + if (!ip_set_get_ip6_port(skb, flags & IPSET_DIM_TWO_SRC, + &data.port, &data.proto)) + return -EINVAL; + + ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6); + ip6_netmask(&data.ip, data.cidr); + + return adtfn(set, &data, h->timeout); +} + +static int +hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netport6_elem data = { .cidr = HOST_MASK }; + u32 port, port_to; + u32 timeout = h->timeout; + bool with_ports = false; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) + data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!data.cidr) + return -IPSET_ERR_INVALID_CIDR; + ip6_netmask(&data.ip, data.cidr); + + if (tb[IPSET_ATTR_PORT]) + data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); + + if (data.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || data.proto == IPPROTO_ICMPV6)) + data.port = 0; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { + ret = adtfn(set, &data, timeout); + return ip_set_eexist(ret, flags) ? 0 : ret; + } + + port = ntohs(data.port); + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + + for (; port <= port_to; port++) { + data.port = htons(port); + ret = adtfn(set, &data, timeout); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +/* Create hash:ip type of sets */ + +static int +hash_netport_create(struct ip_set *set, struct nlattr *tb[], u32 flags) +{ + struct ip_set_hash *h; + u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; + u8 hbits; + + if (!(set->family == AF_INET || set->family == AF_INET6)) + return -IPSET_ERR_INVALID_FAMILY; + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_HASHSIZE]) { + hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); + if (hashsize < IPSET_MIMINAL_HASHSIZE) + hashsize = IPSET_MIMINAL_HASHSIZE; + } + + if (tb[IPSET_ATTR_MAXELEM]) + maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); + + h = kzalloc(sizeof(*h) + + sizeof(struct ip_set_hash_nets) + * (set->family == AF_INET ? 
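+		    /* one ip_set_hash_nets slot per possible prefix length */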
32 : 128), GFP_KERNEL); + if (!h) + return -ENOMEM; + + h->maxelem = maxelem; + get_random_bytes(&h->initval, sizeof(h->initval)); + h->timeout = IPSET_NO_TIMEOUT; + + hbits = htable_bits(hashsize); + h->table = ip_set_alloc( + sizeof(struct htable) + + jhash_size(hbits) * sizeof(struct hbucket)); + if (!h->table) { + kfree(h); + return -ENOMEM; + } + h->table->htable_bits = hbits; + + set->data = h; + + if (tb[IPSET_ATTR_TIMEOUT]) { + h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + + set->variant = set->family == AF_INET + ? &hash_netport4_tvariant : &hash_netport6_tvariant; + + if (set->family == AF_INET) + hash_netport4_gc_init(set); + else + hash_netport6_gc_init(set); + } else { + set->variant = set->family == AF_INET + ? &hash_netport4_variant : &hash_netport6_variant; + } + + pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", + set->name, jhash_size(h->table->htable_bits), + h->table->htable_bits, h->maxelem, set->data, h->table); + + return 0; +} + +static struct ip_set_type hash_netport_type __read_mostly = { + .name = "hash:net,port", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_PORT, + .dimension = IPSET_DIM_TWO, + .family = AF_UNSPEC, + .revision = 1, + .create = hash_netport_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_PORT] = { .type = NLA_U16 }, + [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, + [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_netport_init(void) +{ + return ip_set_type_register(&hash_netport_type); +} + +static void __exit +hash_netport_fini(void) +{ + ip_set_type_unregister(&hash_netport_type); +} + +module_init(hash_netport_init); +module_exit(hash_netport_fini); diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c new file mode 100644 index 00000000..e9159e99 --- /dev/null +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -0,0 +1,577 @@ +/* Copyright (C) 2008-2011 Jozsef Kadlecsik + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the list:set type */ + +#include +#include +#include +#include + +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_DESCRIPTION("list:set type of IP sets"); +MODULE_ALIAS("ip_set_list:set"); + +/* Member elements without and with timeout */ +struct set_elem { + ip_set_id_t id; +}; + +struct set_telem { + ip_set_id_t id; + unsigned long timeout; +}; + +/* Type structure */ +struct list_set { + size_t dsize; /* element size */ + u32 size; /* size of set list array */ + u32 timeout; /* timeout value */ + struct timer_list gc; /* garbage collection */ + struct set_elem members[0]; /* the set members */ +}; + +static inline struct set_elem * +list_set_elem(const struct list_set *map, u32 id) +{ + return (struct set_elem *)((void *)map->members + id * map->dsize); +} + +static inline struct set_telem * +list_set_telem(const struct list_set *map, u32 id) +{ + return (struct set_telem *)((void *)map->members + id * map->dsize); +} + +static inline bool +list_set_timeout(const struct list_set *map, u32 id) +{ + const struct set_telem *elem = list_set_telem(map, id); + + return ip_set_timeout_test(elem->timeout); +} + +static inline bool +list_set_expired(const struct list_set *map, u32 id) +{ + const struct set_telem *elem = list_set_telem(map, id); + + return ip_set_timeout_expired(elem->timeout); +} + +/* Set list without and with timeout */ + +static int +list_set_kadt(struct ip_set *set, const struct sk_buff *skb, + enum ipset_adt adt, u8 pf, u8 dim, u8 flags) +{ + struct list_set *map = set->data; + struct set_elem *elem; + u32 i; + int ret; + + for (i = 0; i < map->size; i++) { + elem = list_set_elem(map, i); + if (elem->id == IPSET_INVALID_ID) + return 0; + if (with_timeout(map->timeout) && list_set_expired(map, i)) + continue; + switch (adt) { + case IPSET_TEST: + ret = ip_set_test(elem->id, skb, pf, dim, flags); + if (ret > 0) + return ret; + break; + case IPSET_ADD: + ret = ip_set_add(elem->id, skb, pf, dim, flags); + if (ret == 0) + return ret; + break; + case IPSET_DEL: + ret = ip_set_del(elem->id, skb, pf, dim, flags); + if (ret == 0) + return ret; + break; + default: + break; + } + } + return -EINVAL; +} + +static bool +next_id_eq(const struct list_set *map, u32 i, ip_set_id_t id) +{ + const struct set_elem *elem; + + if (i + 1 < map->size) { + elem = list_set_elem(map, i + 1); + return !!(elem->id == id && + !(with_timeout(map->timeout) && + list_set_expired(map, i + 1))); + } + + return 0; +} + +static void +list_elem_add(struct list_set *map, u32 i, ip_set_id_t id) +{ + struct set_elem *e; + + for (; i < map->size; i++) { + e = list_set_elem(map, i); + swap(e->id, id); + if (e->id == IPSET_INVALID_ID) + break; + } +} + +static void +list_elem_tadd(struct list_set *map, u32 i, ip_set_id_t id, + unsigned long timeout) +{ + struct set_telem *e; + + for (; i < map->size; i++) { + e = list_set_telem(map, i); + swap(e->id, id); + swap(e->timeout, timeout); + if (e->id == IPSET_INVALID_ID) + break; + } +} + +static int +list_set_add(struct list_set *map, u32 i, ip_set_id_t id, + unsigned long timeout) +{ + const struct set_elem *e = list_set_elem(map, i); + + if (i == map->size - 1 && e->id != IPSET_INVALID_ID) + /* Last element replaced: e.g. 
add new,before,last */ + ip_set_put_byindex(e->id); + if (with_timeout(map->timeout)) + list_elem_tadd(map, i, id, ip_set_timeout_set(timeout)); + else + list_elem_add(map, i, id); + + return 0; +} + +static int +list_set_del(struct list_set *map, u32 i) +{ + struct set_elem *a = list_set_elem(map, i), *b; + + ip_set_put_byindex(a->id); + + for (; i < map->size - 1; i++) { + b = list_set_elem(map, i + 1); + a->id = b->id; + if (with_timeout(map->timeout)) + ((struct set_telem *)a)->timeout = + ((struct set_telem *)b)->timeout; + a = b; + if (a->id == IPSET_INVALID_ID) + break; + } + /* Last element */ + a->id = IPSET_INVALID_ID; + return 0; +} + +static int +list_set_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags) +{ + struct list_set *map = set->data; + bool with_timeout = with_timeout(map->timeout); + int before = 0; + u32 timeout = map->timeout; + ip_set_id_t id, refid = IPSET_INVALID_ID; + const struct set_elem *elem; + struct ip_set *s; + u32 i; + int ret = 0; + + if (unlikely(!tb[IPSET_ATTR_NAME] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + id = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAME]), &s); + if (id == IPSET_INVALID_ID) + return -IPSET_ERR_NAME; + /* "Loop detection" */ + if (s->type->features & IPSET_TYPE_NAME) { + ret = -IPSET_ERR_LOOP; + goto finish; + } + + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 f = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + before = f & IPSET_FLAG_BEFORE; + } + + if (before && !tb[IPSET_ATTR_NAMEREF]) { + ret = -IPSET_ERR_BEFORE; + goto finish; + } + + if (tb[IPSET_ATTR_NAMEREF]) { + refid = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAMEREF]), + &s); + if (refid == IPSET_INVALID_ID) { + ret = -IPSET_ERR_NAMEREF; + goto finish; + } + if (!before) + before = -1; + } + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout) { + ret = -IPSET_ERR_TIMEOUT; + goto finish; + } + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + switch (adt) { + case IPSET_TEST: + for (i = 0; i < map->size && !ret; i++) { + elem = list_set_elem(map, i); + if (elem->id == IPSET_INVALID_ID || + (before != 0 && i + 1 >= map->size)) + break; + else if (with_timeout && list_set_expired(map, i)) + continue; + else if (before > 0 && elem->id == id) + ret = next_id_eq(map, i, refid); + else if (before < 0 && elem->id == refid) + ret = next_id_eq(map, i, id); + else if (before == 0 && elem->id == id) + ret = 1; + } + break; + case IPSET_ADD: + for (i = 0; i < map->size && !ret; i++) { + elem = list_set_elem(map, i); + if (elem->id == id && + !(with_timeout && list_set_expired(map, i))) + ret = -IPSET_ERR_EXIST; + } + if (ret == -IPSET_ERR_EXIST) + break; + ret = -IPSET_ERR_LIST_FULL; + for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) { + elem = list_set_elem(map, i); + if (elem->id == IPSET_INVALID_ID) + ret = before != 0 ? 
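+			      /* empty slot reached without finding the
+			       * requested before/after reference entry: */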
-IPSET_ERR_REF_EXIST + : list_set_add(map, i, id, timeout); + else if (elem->id != refid) + continue; + else if (with_timeout && list_set_expired(map, i)) + ret = -IPSET_ERR_REF_EXIST; + else if (before) + ret = list_set_add(map, i, id, timeout); + else if (i + 1 < map->size) + ret = list_set_add(map, i + 1, id, timeout); + } + break; + case IPSET_DEL: + ret = -IPSET_ERR_EXIST; + for (i = 0; i < map->size && ret == -IPSET_ERR_EXIST; i++) { + elem = list_set_elem(map, i); + if (elem->id == IPSET_INVALID_ID) { + ret = before != 0 ? -IPSET_ERR_REF_EXIST + : -IPSET_ERR_EXIST; + break; + } else if (with_timeout && list_set_expired(map, i)) + continue; + else if (elem->id == id && + (before == 0 || + (before > 0 && + next_id_eq(map, i, refid)))) + ret = list_set_del(map, i); + else if (before < 0 && + elem->id == refid && + next_id_eq(map, i, id)) + ret = list_set_del(map, i + 1); + } + break; + default: + break; + } + +finish: + if (refid != IPSET_INVALID_ID) + ip_set_put_byindex(refid); + if (adt != IPSET_ADD || ret) + ip_set_put_byindex(id); + + return ip_set_eexist(ret, flags) ? 0 : ret; +} + +static void +list_set_flush(struct ip_set *set) +{ + struct list_set *map = set->data; + struct set_elem *elem; + u32 i; + + for (i = 0; i < map->size; i++) { + elem = list_set_elem(map, i); + if (elem->id != IPSET_INVALID_ID) { + ip_set_put_byindex(elem->id); + elem->id = IPSET_INVALID_ID; + } + } +} + +static void +list_set_destroy(struct ip_set *set) +{ + struct list_set *map = set->data; + + if (with_timeout(map->timeout)) + del_timer_sync(&map->gc); + list_set_flush(set); + kfree(map); + + set->data = NULL; +} + +static int +list_set_head(struct ip_set *set, struct sk_buff *skb) +{ + const struct list_set *map = set->data; + struct nlattr *nested; + + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) + goto nla_put_failure; + NLA_PUT_NET32(skb, IPSET_ATTR_SIZE, htonl(map->size)); + if (with_timeout(map->timeout)) + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout)); + NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)); + NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, + htonl(sizeof(*map) + map->size * map->dsize)); + ipset_nest_end(skb, nested); + + return 0; +nla_put_failure: + return -EMSGSIZE; +} + +static int +list_set_list(const struct ip_set *set, + struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct list_set *map = set->data; + struct nlattr *atd, *nested; + u32 i, first = cb->args[2]; + const struct set_elem *e; + + atd = ipset_nest_start(skb, IPSET_ATTR_ADT); + if (!atd) + return -EMSGSIZE; + for (; cb->args[2] < map->size; cb->args[2]++) { + i = cb->args[2]; + e = list_set_elem(map, i); + if (e->id == IPSET_INVALID_ID) + goto finish; + if (with_timeout(map->timeout) && list_set_expired(map, i)) + continue; + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) { + if (i == first) { + nla_nest_cancel(skb, atd); + return -EMSGSIZE; + } else + goto nla_put_failure; + } + NLA_PUT_STRING(skb, IPSET_ATTR_NAME, + ip_set_name_byindex(e->id)); + if (with_timeout(map->timeout)) { + const struct set_telem *te = + (const struct set_telem *) e; + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(te->timeout))); + } + ipset_nest_end(skb, nested); + } +finish: + ipset_nest_end(skb, atd); + /* Set listing finished */ + cb->args[2] = 0; + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nested); + ipset_nest_end(skb, atd); + if (unlikely(i == first)) { + cb->args[2] = 0; + return -EMSGSIZE; + } + return 0; +} + +static bool 
+list_set_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct list_set *x = a->data; + const struct list_set *y = b->data; + + return x->size == y->size && + x->timeout == y->timeout; +} + +static const struct ip_set_type_variant list_set = { + .kadt = list_set_kadt, + .uadt = list_set_uadt, + .destroy = list_set_destroy, + .flush = list_set_flush, + .head = list_set_head, + .list = list_set_list, + .same_set = list_set_same_set, +}; + +static void +list_set_gc(unsigned long ul_set) +{ + struct ip_set *set = (struct ip_set *) ul_set; + struct list_set *map = set->data; + struct set_telem *e; + u32 i; + + write_lock_bh(&set->lock); + for (i = 0; i < map->size; i++) { + e = list_set_telem(map, i); + if (e->id != IPSET_INVALID_ID && list_set_expired(map, i)) + list_set_del(map, i); + } + write_unlock_bh(&set->lock); + + map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; + add_timer(&map->gc); +} + +static void +list_set_gc_init(struct ip_set *set) +{ + struct list_set *map = set->data; + + init_timer(&map->gc); + map->gc.data = (unsigned long) set; + map->gc.function = list_set_gc; + map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; + add_timer(&map->gc); +} + +/* Create list:set type of sets */ + +static bool +init_list_set(struct ip_set *set, u32 size, size_t dsize, + unsigned long timeout) +{ + struct list_set *map; + struct set_elem *e; + u32 i; + + map = kzalloc(sizeof(*map) + size * dsize, GFP_KERNEL); + if (!map) + return false; + + map->size = size; + map->dsize = dsize; + map->timeout = timeout; + set->data = map; + + for (i = 0; i < size; i++) { + e = list_set_elem(map, i); + e->id = IPSET_INVALID_ID; + } + + return true; +} + +static int +list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags) +{ + u32 size = IP_SET_LIST_DEFAULT_SIZE; + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_SIZE) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_SIZE]) + size = ip_set_get_h32(tb[IPSET_ATTR_SIZE]); + if (size < IP_SET_LIST_MIN_SIZE) + size = IP_SET_LIST_MIN_SIZE; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!init_list_set(set, size, sizeof(struct set_telem), + ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]))) + return -ENOMEM; + + list_set_gc_init(set); + } else { + if (!init_list_set(set, size, sizeof(struct set_elem), + IPSET_NO_TIMEOUT)) + return -ENOMEM; + } + set->variant = &list_set; + return 0; +} + +static struct ip_set_type list_set_type __read_mostly = { + .name = "list:set", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_NAME | IPSET_DUMP_LAST, + .dimension = IPSET_DIM_ONE, + .family = AF_UNSPEC, + .revision = 0, + .create = list_set_create, + .create_policy = { + [IPSET_ATTR_SIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_NAME] = { .type = NLA_STRING, + .len = IPSET_MAXNAMELEN }, + [IPSET_ATTR_NAMEREF] = { .type = NLA_STRING, + .len = IPSET_MAXNAMELEN }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .me = THIS_MODULE, +}; + +static int __init +list_set_init(void) +{ + return ip_set_type_register(&list_set_type); +} + +static void __exit +list_set_fini(void) +{ + ip_set_type_unregister(&list_set_type); +} + +module_init(list_set_init); +module_exit(list_set_fini); diff --git a/net/netfilter/ipset/pfxlen.c b/net/netfilter/ipset/pfxlen.c new file mode 100644 index 00000000..23f8c816 --- 
/dev/null +++ b/net/netfilter/ipset/pfxlen.c @@ -0,0 +1,291 @@ +#include + +/* + * Prefixlen maps for fast conversions, by Jan Engelhardt. + */ + +#define E(a, b, c, d) \ + {.ip6 = { \ + __constant_htonl(a), __constant_htonl(b), \ + __constant_htonl(c), __constant_htonl(d), \ + } } + +/* + * This table works for both IPv4 and IPv6; + * just use prefixlen_netmask_map[prefixlength].ip. + */ +const union nf_inet_addr ip_set_netmask_map[] = { + E(0x00000000, 0x00000000, 0x00000000, 0x00000000), + E(0x80000000, 0x00000000, 0x00000000, 0x00000000), + E(0xC0000000, 0x00000000, 0x00000000, 0x00000000), + E(0xE0000000, 0x00000000, 0x00000000, 0x00000000), + E(0xF0000000, 0x00000000, 0x00000000, 0x00000000), + E(0xF8000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFC000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFE000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFF000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFF800000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFC0, 
0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0), + E(0xFFFFFFFF, 0xFFFFFFFF, 
0xFFFFFFFF, 0xFFFFFFF8), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF), +}; +EXPORT_SYMBOL_GPL(ip_set_netmask_map); + +#undef E +#define E(a, b, c, d) \ + {.ip6 = { (__force __be32) a, (__force __be32) b, \ + (__force __be32) c, (__force __be32) d, \ + } } + +/* + * This table works for both IPv4 and IPv6; + * just use prefixlen_hostmask_map[prefixlength].ip. + */ +const union nf_inet_addr ip_set_hostmask_map[] = { + E(0x00000000, 0x00000000, 0x00000000, 0x00000000), + E(0x80000000, 0x00000000, 0x00000000, 0x00000000), + E(0xC0000000, 0x00000000, 0x00000000, 0x00000000), + E(0xE0000000, 0x00000000, 0x00000000, 0x00000000), + E(0xF0000000, 0x00000000, 0x00000000, 0x00000000), + E(0xF8000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFC000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFE000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFF000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFF800000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000), + 
E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0), + 
E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF), +}; +EXPORT_SYMBOL_GPL(ip_set_hostmask_map); diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig new file mode 100644 index 00000000..70bd1d07 --- /dev/null +++ b/net/netfilter/ipvs/Kconfig @@ -0,0 +1,267 @@ +# +# IP Virtual Server configuration +# +menuconfig IP_VS + tristate "IP virtual server support" + depends on NET && INET && NETFILTER + depends on (NF_CONNTRACK || NF_CONNTRACK=n) + ---help--- + IP Virtual Server support will let you build a high-performance + virtual server based on cluster of two or more real servers. This + option must be enabled for at least one of the clustered computers + that will take care of intercepting incoming connections to a + single IP address and scheduling them to real servers. + + Three request dispatching techniques are implemented, they are + virtual server via NAT, virtual server via tunneling and virtual + server via direct routing. The several scheduling algorithms can + be used to choose which server the connection is directed to, + thus load balancing can be achieved among the servers. For more + information and its administration program, please visit the + following URL: . + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +if IP_VS + +config IP_VS_IPV6 + bool "IPv6 support for IPVS" + depends on IPV6 = y || IP_VS = IPV6 + ---help--- + Add IPv6 support to IPVS. This is incomplete and might be dangerous. + + See http://www.mindbasket.com/ipvs for more information. + + Say N if unsure. + +config IP_VS_DEBUG + bool "IP virtual server debugging" + ---help--- + Say Y here if you want to get additional messages useful in + debugging the IP virtual server code. You can change the debug + level in /proc/sys/net/ipv4/vs/debug_level + +config IP_VS_TAB_BITS + int "IPVS connection table size (the Nth power of 2)" + range 8 20 + default 12 + ---help--- + The IPVS connection hash table uses the chaining scheme to handle + hash collisions. Using a big IPVS connection hash table will greatly + reduce conflicts when there are hundreds of thousands of connections + in the hash table. + + Note the table size must be power of 2. The table size will be the + value of 2 to the your input number power. The number to choose is + from 8 to 20, the default number is 12, which means the table size + is 4096. Don't input the number too small, otherwise you will lose + performance on it. You can adapt the table size yourself, according + to your virtual server application. It is good to set the table size + not far less than the number of connections per second multiplying + average lasting time of connection in the table. For example, your + virtual server gets 200 connections per second, the connection lasts + for 200 seconds in average in the connection table, the table size + should be not far less than 200x200, it is good to set the table + size 32768 (2**15). + + Another note that each connection occupies 128 bytes effectively and + each hash entry uses 8 bytes, so you can estimate how much memory is + needed for your box. + + You can overwrite this number setting conn_tab_bits module parameter + or by appending ip_vs.conn_tab_bits=? to the kernel command line + if IP VS was compiled built-in. 
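The sizing advice in the IP_VS_TAB_BITS help text above is a small power-of-two calculation. As a rough illustration only — the helper below is invented for this note and is not part of the patch — a user-space sketch in C:

#include <stdio.h>

/* Suggest an ip_vs conn_tab_bits value: the smallest n in the 8..20
 * range accepted above such that 2^n is at least the expected number
 * of concurrent connections (rate * average connection lifetime). */
static int suggest_conn_tab_bits(unsigned long conns_per_sec,
				 unsigned long avg_lifetime_sec)
{
	unsigned long concurrent = conns_per_sec * avg_lifetime_sec;
	int bits = 8;

	while (bits < 20 && (1UL << bits) < concurrent)
		bits++;
	return bits;
}

int main(void)
{
	/* 200 conn/s lasting ~200 s each -> ~40000 concurrent -> 2^16 */
	printf("conn_tab_bits = %d\n", suggest_conn_tab_bits(200, 200));
	return 0;
}

The help text itself is slightly more lenient and accepts a table somewhat below the expected connection count (it suggests 2**15 for its 200 x 200 example); the sketch simply rounds up to the next power of two.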
+ +comment "IPVS transport protocol load balancing support" + +config IP_VS_PROTO_TCP + bool "TCP load balancing support" + ---help--- + This option enables support for load balancing TCP transport + protocol. Say Y if unsure. + +config IP_VS_PROTO_UDP + bool "UDP load balancing support" + ---help--- + This option enables support for load balancing UDP transport + protocol. Say Y if unsure. + +config IP_VS_PROTO_AH_ESP + def_bool IP_VS_PROTO_ESP || IP_VS_PROTO_AH + +config IP_VS_PROTO_ESP + bool "ESP load balancing support" + ---help--- + This option enables support for load balancing ESP (Encapsulation + Security Payload) transport protocol. Say Y if unsure. + +config IP_VS_PROTO_AH + bool "AH load balancing support" + ---help--- + This option enables support for load balancing AH (Authentication + Header) transport protocol. Say Y if unsure. + +config IP_VS_PROTO_SCTP + bool "SCTP load balancing support" + select LIBCRC32C + ---help--- + This option enables support for load balancing SCTP transport + protocol. Say Y if unsure. + +comment "IPVS scheduler" + +config IP_VS_RR + tristate "round-robin scheduling" + ---help--- + The robin-robin scheduling algorithm simply directs network + connections to different real servers in a round-robin manner. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_WRR + tristate "weighted round-robin scheduling" + select GCD + ---help--- + The weighted robin-robin scheduling algorithm directs network + connections to different real servers based on server weights + in a round-robin manner. Servers with higher weights receive + new connections first than those with less weights, and servers + with higher weights get more connections than those with less + weights and servers with equal weights get equal connections. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_LC + tristate "least-connection scheduling" + ---help--- + The least-connection scheduling algorithm directs network + connections to the server with the least number of active + connections. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_WLC + tristate "weighted least-connection scheduling" + ---help--- + The weighted least-connection scheduling algorithm directs network + connections to the server with the least active connections + normalized by the server weight. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_LBLC + tristate "locality-based least-connection scheduling" + ---help--- + The locality-based least-connection scheduling algorithm is for + destination IP load balancing. It is usually used in cache cluster. + This algorithm usually directs packet destined for an IP address to + its server if the server is alive and under load. If the server is + overloaded (its active connection numbers is larger than its weight) + and there is a server in its half load, then allocate the weighted + least-connection server to this IP address. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_LBLCR + tristate "locality-based least-connection with replication scheduling" + ---help--- + The locality-based least-connection with replication scheduling + algorithm is also for destination IP load balancing. 
It is + usually used in cache cluster. It differs from the LBLC scheduling + as follows: the load balancer maintains mappings from a target + to a set of server nodes that can serve the target. Requests for + a target are assigned to the least-connection node in the target's + server set. If all the node in the server set are over loaded, + it picks up a least-connection node in the cluster and adds it + in the sever set for the target. If the server set has not been + modified for the specified time, the most loaded node is removed + from the server set, in order to avoid high degree of replication. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_DH + tristate "destination hashing scheduling" + ---help--- + The destination hashing scheduling algorithm assigns network + connections to the servers through looking up a statically assigned + hash table by their destination IP addresses. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_SH + tristate "source hashing scheduling" + ---help--- + The source hashing scheduling algorithm assigns network + connections to the servers through looking up a statically assigned + hash table by their source IP addresses. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_SED + tristate "shortest expected delay scheduling" + ---help--- + The shortest expected delay scheduling algorithm assigns network + connections to the server with the shortest expected delay. The + expected delay that the job will experience is (Ci + 1) / Ui if + sent to the ith server, in which Ci is the number of connections + on the ith server and Ui is the fixed service rate (weight) + of the ith server. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_NQ + tristate "never queue scheduling" + ---help--- + The never queue scheduling algorithm adopts a two-speed model. + When there is an idle server available, the job will be sent to + the idle server, instead of waiting for a fast one. When there + is no idle server available, the job will be sent to the server + that minimize its expected delay (The Shortest Expected Delay + scheduling algorithm). + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +comment 'IPVS application helper' + +config IP_VS_FTP + tristate "FTP protocol helper" + depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT + select IP_VS_NFCT + ---help--- + FTP is a protocol that transfers IP address and/or port number in + the payload. In the virtual server via Network Address Translation, + the IP address and port number of real servers cannot be sent to + clients in ftp connections directly, so FTP protocol helper is + required for tracking the connection and mangling it back to that of + virtual service. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_NFCT + bool "Netfilter connection tracking" + depends on NF_CONNTRACK + ---help--- + The Netfilter connection tracking support allows the IPVS + connection state to be exported to the Netfilter framework + for filtering purposes. 
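The shortest-expected-delay rule from the IP_VS_SED entry above, (Ci + 1) / Ui, can be evaluated without floating point by comparing cross products. The sketch below only illustrates that comparison — the struct, field and function names are invented here and this is not the ip_vs_sed.c implementation:

#include <stdio.h>

/* Toy model of the SED decision: pick the server minimising
 * (active_conns + 1) / weight, compared via cross products so that no
 * floating point is needed. Weights are assumed to be positive. */
struct toy_server {
	unsigned int active_conns;	/* Ci */
	unsigned int weight;		/* Ui */
};

static int sed_pick(const struct toy_server *srv, int n)
{
	int best = 0, i;

	for (i = 1; i < n; i++) {
		/* (Ci+1)/Ui < (Cb+1)/Ub  <=>  (Ci+1)*Ub < (Cb+1)*Ui */
		unsigned long long lhs = (unsigned long long)
			(srv[i].active_conns + 1) * srv[best].weight;
		unsigned long long rhs = (unsigned long long)
			(srv[best].active_conns + 1) * srv[i].weight;
		if (lhs < rhs)
			best = i;
	}
	return best;
}

int main(void)
{
	struct toy_server pool[] = {
		{ .active_conns = 10, .weight = 1 },	/* delay 11.00 */
		{ .active_conns = 30, .weight = 4 },	/* delay  7.75 */
		{ .active_conns =  5, .weight = 1 },	/* delay  6.00 */
	};

	printf("picked server %d\n", sed_pick(pool, 3));	/* -> 2 */
	return 0;
}

The never-queue scheduler (IP_VS_NQ) layers one extra check on top of the same delay metric: if any server is idle it is chosen immediately, otherwise the minimum-delay server is used as above.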
+ +config IP_VS_PE_SIP + tristate "SIP persistence engine" + depends on IP_VS_PROTO_UDP + depends on NF_CONNTRACK_SIP + ---help--- + Allow persistence based on the SIP Call-ID + +endif # IP_VS diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile new file mode 100644 index 00000000..34ee602d --- /dev/null +++ b/net/netfilter/ipvs/Makefile @@ -0,0 +1,40 @@ +# +# Makefile for the IPVS modules on top of IPv4. +# + +# IPVS transport protocol load balancing support +ip_vs_proto-objs-y := +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_SCTP) += ip_vs_proto_sctp.o + +ip_vs-extra_objs-y := +ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o + +ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ + ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ + ip_vs_est.o ip_vs_proto.o ip_vs_pe.o \ + $(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y) + + +# IPVS core +obj-$(CONFIG_IP_VS) += ip_vs.o + +# IPVS schedulers +obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o +obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o +obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o +obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o +obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o +obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o +obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o +obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o +obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o +obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o + +# IPVS application helpers +obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o + +# IPVS connection template retrievers +obj-$(CONFIG_IP_VS_PE_SIP) += ip_vs_pe_sip.o diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c new file mode 100644 index 00000000..059af312 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -0,0 +1,601 @@ +/* + * ip_vs_app.c: Application module support for IPVS + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference + * is that ip_vs_app module handles the reverse direction (incoming requests + * and outgoing responses). + * + * IP_MASQ_APP application masquerading module + * + * Author: Juan Jose Ciarlante, + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +EXPORT_SYMBOL(register_ip_vs_app); +EXPORT_SYMBOL(unregister_ip_vs_app); +EXPORT_SYMBOL(register_ip_vs_app_inc); + +static DEFINE_MUTEX(__ip_vs_app_mutex); + +/* + * Get an ip_vs_app object + */ +static inline int ip_vs_app_get(struct ip_vs_app *app) +{ + return try_module_get(app->module); +} + + +static inline void ip_vs_app_put(struct ip_vs_app *app) +{ + module_put(app->module); +} + + +/* + * Allocate/initialize app incarnation and register it in proto apps. 
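+ * (An "incarnation" is a per-port copy of the registered ip_vs_app: the struct is duplicated with kmemdup() below and linked on the parent's incs_list.)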
+ */ +static int +ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto, + __u16 port) +{ + struct ip_vs_protocol *pp; + struct ip_vs_app *inc; + int ret; + + if (!(pp = ip_vs_proto_get(proto))) + return -EPROTONOSUPPORT; + + if (!pp->unregister_app) + return -EOPNOTSUPP; + + inc = kmemdup(app, sizeof(*inc), GFP_KERNEL); + if (!inc) + return -ENOMEM; + INIT_LIST_HEAD(&inc->p_list); + INIT_LIST_HEAD(&inc->incs_list); + inc->app = app; + inc->port = htons(port); + atomic_set(&inc->usecnt, 0); + + if (app->timeouts) { + inc->timeout_table = + ip_vs_create_timeout_table(app->timeouts, + app->timeouts_size); + if (!inc->timeout_table) { + ret = -ENOMEM; + goto out; + } + } + + ret = pp->register_app(net, inc); + if (ret) + goto out; + + list_add(&inc->a_list, &app->incs_list); + IP_VS_DBG(9, "%s App %s:%u registered\n", + pp->name, inc->name, ntohs(inc->port)); + + return 0; + + out: + kfree(inc->timeout_table); + kfree(inc); + return ret; +} + + +/* + * Release app incarnation + */ +static void +ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc) +{ + struct ip_vs_protocol *pp; + + if (!(pp = ip_vs_proto_get(inc->protocol))) + return; + + if (pp->unregister_app) + pp->unregister_app(net, inc); + + IP_VS_DBG(9, "%s App %s:%u unregistered\n", + pp->name, inc->name, ntohs(inc->port)); + + list_del(&inc->a_list); + + kfree(inc->timeout_table); + kfree(inc); +} + + +/* + * Get reference to app inc (only called from softirq) + * + */ +int ip_vs_app_inc_get(struct ip_vs_app *inc) +{ + int result; + + atomic_inc(&inc->usecnt); + if (unlikely((result = ip_vs_app_get(inc->app)) != 1)) + atomic_dec(&inc->usecnt); + return result; +} + + +/* + * Put the app inc (only called from timer or net softirq) + */ +void ip_vs_app_inc_put(struct ip_vs_app *inc) +{ + ip_vs_app_put(inc->app); + atomic_dec(&inc->usecnt); +} + + +/* + * Register an application incarnation in protocol applications + */ +int +register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto, + __u16 port) +{ + int result; + + mutex_lock(&__ip_vs_app_mutex); + + result = ip_vs_app_inc_new(net, app, proto, port); + + mutex_unlock(&__ip_vs_app_mutex); + + return result; +} + + +/* + * ip_vs_app registration routine + */ +int register_ip_vs_app(struct net *net, struct ip_vs_app *app) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + /* increase the module use count */ + ip_vs_use_count_inc(); + + mutex_lock(&__ip_vs_app_mutex); + + list_add(&app->a_list, &ipvs->app_list); + + mutex_unlock(&__ip_vs_app_mutex); + + return 0; +} + + +/* + * ip_vs_app unregistration routine + * We are sure there are no app incarnations attached to services + */ +void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app) +{ + struct ip_vs_app *inc, *nxt; + + mutex_lock(&__ip_vs_app_mutex); + + list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { + ip_vs_app_inc_release(net, inc); + } + + list_del(&app->a_list); + + mutex_unlock(&__ip_vs_app_mutex); + + /* decrease the module use count */ + ip_vs_use_count_dec(); +} + + +/* + * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) + */ +int ip_vs_bind_app(struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + return pp->app_conn_bind(cp); +} + + +/* + * Unbind cp from application incarnation (called by cp destructor) + */ +void ip_vs_unbind_app(struct ip_vs_conn *cp) +{ + struct ip_vs_app *inc = cp->app; + + if (!inc) + return; + + if (inc->unbind_conn) + inc->unbind_conn(inc, cp); + if (inc->done_conn) + inc->done_conn(inc, cp); + ip_vs_app_inc_put(inc); + 
cp->app = NULL; +} + + +/* + * Fixes th->seq based on ip_vs_seq info. + */ +static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) +{ + __u32 seq = ntohl(th->seq); + + /* + * Adjust seq with delta-offset for all packets after + * the most recent resized pkt seq and with previous_delta offset + * for all packets before most recent resized pkt seq. + */ + if (vseq->delta || vseq->previous_delta) { + if(after(seq, vseq->init_seq)) { + th->seq = htonl(seq + vseq->delta); + IP_VS_DBG(9, "%s(): added delta (%d) to seq\n", + __func__, vseq->delta); + } else { + th->seq = htonl(seq + vseq->previous_delta); + IP_VS_DBG(9, "%s(): added previous_delta (%d) to seq\n", + __func__, vseq->previous_delta); + } + } +} + + +/* + * Fixes th->ack_seq based on ip_vs_seq info. + */ +static inline void +vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) +{ + __u32 ack_seq = ntohl(th->ack_seq); + + /* + * Adjust ack_seq with delta-offset for + * the packets AFTER most recent resized pkt has caused a shift + * for packets before most recent resized pkt, use previous_delta + */ + if (vseq->delta || vseq->previous_delta) { + /* since ack_seq is the number of octet that is expected + to receive next, so compare it with init_seq+delta */ + if(after(ack_seq, vseq->init_seq+vseq->delta)) { + th->ack_seq = htonl(ack_seq - vseq->delta); + IP_VS_DBG(9, "%s(): subtracted delta " + "(%d) from ack_seq\n", __func__, vseq->delta); + + } else { + th->ack_seq = htonl(ack_seq - vseq->previous_delta); + IP_VS_DBG(9, "%s(): subtracted " + "previous_delta (%d) from ack_seq\n", + __func__, vseq->previous_delta); + } + } +} + + +/* + * Updates ip_vs_seq if pkt has been resized + * Assumes already checked proto==IPPROTO_TCP and diff!=0. + */ +static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, + unsigned flag, __u32 seq, int diff) +{ + /* spinlock is to keep updating cp->flags atomic */ + spin_lock(&cp->lock); + if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { + vseq->previous_delta = vseq->delta; + vseq->delta += diff; + vseq->init_seq = seq; + cp->flags |= flag; + } + spin_unlock(&cp->lock); +} + +static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, + struct ip_vs_app *app) +{ + int diff; + const unsigned int tcp_offset = ip_hdrlen(skb); + struct tcphdr *th; + __u32 seq; + + if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) + return 0; + + th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); + + /* + * Remember seq number in case this pkt gets resized + */ + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + if (cp->flags & IP_VS_CONN_F_OUT_SEQ) + vs_fix_seq(&cp->out_seq, th); + if (cp->flags & IP_VS_CONN_F_IN_SEQ) + vs_fix_ack_seq(&cp->in_seq, th); + + /* + * Call private output hook function + */ + if (app->pkt_out == NULL) + return 1; + + if (!app->pkt_out(app, cp, skb, &diff)) + return 0; + + /* + * Update ip_vs seq stuff if len has changed. + */ + if (diff != 0) + vs_seq_update(cp, &cp->out_seq, + IP_VS_CONN_F_OUT_SEQ, seq, diff); + + return 1; +} + +/* + * Output pkt hook. Will call bound ip_vs_app specific function + * called by ipvs packet handler, assumes previously checked cp!=NULL + * returns false if it can't handle packet (oom) + */ +int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_app *app; + + /* + * check if application module is bound to + * this ip_vs_conn. 
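+ * (cp->app is set at connection creation by ip_vs_bind_app() when the protocol has an application helper registered, e.g. the FTP helper.)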
+ */ + if ((app = cp->app) == NULL) + return 1; + + /* TCP is complicated */ + if (cp->protocol == IPPROTO_TCP) + return app_tcp_pkt_out(cp, skb, app); + + /* + * Call private output hook function + */ + if (app->pkt_out == NULL) + return 1; + + return app->pkt_out(app, cp, skb, NULL); +} + + +static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, + struct ip_vs_app *app) +{ + int diff; + const unsigned int tcp_offset = ip_hdrlen(skb); + struct tcphdr *th; + __u32 seq; + + if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) + return 0; + + th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); + + /* + * Remember seq number in case this pkt gets resized + */ + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + if (cp->flags & IP_VS_CONN_F_IN_SEQ) + vs_fix_seq(&cp->in_seq, th); + if (cp->flags & IP_VS_CONN_F_OUT_SEQ) + vs_fix_ack_seq(&cp->out_seq, th); + + /* + * Call private input hook function + */ + if (app->pkt_in == NULL) + return 1; + + if (!app->pkt_in(app, cp, skb, &diff)) + return 0; + + /* + * Update ip_vs seq stuff if len has changed. + */ + if (diff != 0) + vs_seq_update(cp, &cp->in_seq, + IP_VS_CONN_F_IN_SEQ, seq, diff); + + return 1; +} + +/* + * Input pkt hook. Will call bound ip_vs_app specific function + * called by ipvs packet handler, assumes previously checked cp!=NULL. + * returns false if can't handle packet (oom). + */ +int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_app *app; + + /* + * check if application module is bound to + * this ip_vs_conn. + */ + if ((app = cp->app) == NULL) + return 1; + + /* TCP is complicated */ + if (cp->protocol == IPPROTO_TCP) + return app_tcp_pkt_in(cp, skb, app); + + /* + * Call private input hook function + */ + if (app->pkt_in == NULL) + return 1; + + return app->pkt_in(app, cp, skb, NULL); +} + + +#ifdef CONFIG_PROC_FS +/* + * /proc/net/ip_vs_app entry function + */ + +static struct ip_vs_app *ip_vs_app_idx(struct netns_ipvs *ipvs, loff_t pos) +{ + struct ip_vs_app *app, *inc; + + list_for_each_entry(app, &ipvs->app_list, a_list) { + list_for_each_entry(inc, &app->incs_list, a_list) { + if (pos-- == 0) + return inc; + } + } + return NULL; + +} + +static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + + mutex_lock(&__ip_vs_app_mutex); + + return *pos ? 
ip_vs_app_idx(ipvs, *pos - 1) : SEQ_START_TOKEN; +} + +static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip_vs_app *inc, *app; + struct list_head *e; + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_app_idx(ipvs, 0); + + inc = v; + app = inc->app; + + if ((e = inc->a_list.next) != &app->incs_list) + return list_entry(e, struct ip_vs_app, a_list); + + /* go on to next application */ + for (e = app->a_list.next; e != &ipvs->app_list; e = e->next) { + app = list_entry(e, struct ip_vs_app, a_list); + list_for_each_entry(inc, &app->incs_list, a_list) { + return inc; + } + } + return NULL; +} + +static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) +{ + mutex_unlock(&__ip_vs_app_mutex); +} + +static int ip_vs_app_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "prot port usecnt name\n"); + else { + const struct ip_vs_app *inc = v; + + seq_printf(seq, "%-3s %-7u %-6d %-17s\n", + ip_vs_proto_name(inc->protocol), + ntohs(inc->port), + atomic_read(&inc->usecnt), + inc->name); + } + return 0; +} + +static const struct seq_operations ip_vs_app_seq_ops = { + .start = ip_vs_app_seq_start, + .next = ip_vs_app_seq_next, + .stop = ip_vs_app_seq_stop, + .show = ip_vs_app_seq_show, +}; + +static int ip_vs_app_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ip_vs_app_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations ip_vs_app_fops = { + .owner = THIS_MODULE, + .open = ip_vs_app_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; +#endif + +int __net_init __ip_vs_app_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + INIT_LIST_HEAD(&ipvs->app_list); + proc_net_fops_create(net, "ip_vs_app", 0, &ip_vs_app_fops); + return 0; +} + +void __net_exit __ip_vs_app_cleanup(struct net *net) +{ + proc_net_remove(net, "ip_vs_app"); +} + +int __init ip_vs_app_init(void) +{ + return 0; +} + + +void ip_vs_app_cleanup(void) +{ +} diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c new file mode 100644 index 00000000..782db275 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -0,0 +1,1326 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang + * Peter Kese + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, + * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms + * and others. Many code here is taken from IP MASQ code of kernel 2.2. + * + * Changes: + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include /* for proc_net_* */ +#include +#include +#include +#include + +#include +#include + + +#ifndef CONFIG_IP_VS_TAB_BITS +#define CONFIG_IP_VS_TAB_BITS 12 +#endif + +/* + * Connection hash size. 
Default is what was selected at compile time. +*/ +static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; +module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444); +MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size"); + +/* size and mask values */ +int ip_vs_conn_tab_size __read_mostly; +static int ip_vs_conn_tab_mask __read_mostly; + +/* + * Connection hash table: for input and output packets lookups of IPVS + */ +static struct hlist_head *ip_vs_conn_tab __read_mostly; + +/* SLAB cache for IPVS connections */ +static struct kmem_cache *ip_vs_conn_cachep __read_mostly; + +/* counter for no client port connections */ +static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); + +/* random value for IPVS connection hash */ +static unsigned int ip_vs_conn_rnd __read_mostly; + +/* + * Fine locking granularity for big connection hash table + */ +#define CT_LOCKARRAY_BITS 5 +#define CT_LOCKARRAY_SIZE (1<>8)) & ip_vs_conn_tab_mask; +#endif + return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto, + ip_vs_conn_rnd) ^ + ((size_t)net>>8)) & ip_vs_conn_tab_mask; +} + +static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, + bool inverse) +{ + const union nf_inet_addr *addr; + __be16 port; + + if (p->pe_data && p->pe->hashkey_raw) + return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) & + ip_vs_conn_tab_mask; + + if (likely(!inverse)) { + addr = p->caddr; + port = p->cport; + } else { + addr = p->vaddr; + port = p->vport; + } + + return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port); +} + +static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) +{ + struct ip_vs_conn_param p; + + ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol, + &cp->caddr, cp->cport, NULL, 0, &p); + + if (cp->pe) { + p.pe = cp->pe; + p.pe_data = cp->pe_data; + p.pe_data_len = cp->pe_data_len; + } + + return ip_vs_conn_hashkey_param(&p, false); +} + +/* + * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port. + * returns bool success. + */ +static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) +{ + unsigned hash; + int ret; + + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + return 0; + + /* Hash by protocol, client address and port */ + hash = ip_vs_conn_hashkey_conn(cp); + + ct_write_lock(hash); + spin_lock(&cp->lock); + + if (!(cp->flags & IP_VS_CONN_F_HASHED)) { + hlist_add_head(&cp->c_list, &ip_vs_conn_tab[hash]); + cp->flags |= IP_VS_CONN_F_HASHED; + atomic_inc(&cp->refcnt); + ret = 1; + } else { + pr_err("%s(): request for already hashed, called from %pF\n", + __func__, __builtin_return_address(0)); + ret = 0; + } + + spin_unlock(&cp->lock); + ct_write_unlock(hash); + + return ret; +} + + +/* + * UNhashes ip_vs_conn from ip_vs_conn_tab. + * returns bool success. + */ +static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) +{ + unsigned hash; + int ret; + + /* unhash it and decrease its reference counter */ + hash = ip_vs_conn_hashkey_conn(cp); + + ct_write_lock(hash); + spin_lock(&cp->lock); + + if (cp->flags & IP_VS_CONN_F_HASHED) { + hlist_del(&cp->c_list); + cp->flags &= ~IP_VS_CONN_F_HASHED; + atomic_dec(&cp->refcnt); + ret = 1; + } else + ret = 0; + + spin_unlock(&cp->lock); + ct_write_unlock(hash); + + return ret; +} + + +/* + * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from OUTside-to-INside. 
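+ * The lookup key is the client side of the tuple (protocol, caddr, cport), hashed by ip_vs_conn_hashkey_param() with inverse == false.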
+ * p->caddr, p->cport: pkt source address (foreign host) + * p->vaddr, p->vport: pkt dest address (load balancer) + */ +static inline struct ip_vs_conn * +__ip_vs_conn_in_get(const struct ip_vs_conn_param *p) +{ + unsigned hash; + struct ip_vs_conn *cp; + struct hlist_node *n; + + hash = ip_vs_conn_hashkey_param(p, false); + + ct_read_lock(hash); + + hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { + if (cp->af == p->af && + p->cport == cp->cport && p->vport == cp->vport && + ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && + ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && + ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && + p->protocol == cp->protocol && + ip_vs_conn_net_eq(cp, p->net)) { + /* HIT */ + atomic_inc(&cp->refcnt); + ct_read_unlock(hash); + return cp; + } + } + + ct_read_unlock(hash); + + return NULL; +} + +struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p) +{ + struct ip_vs_conn *cp; + + cp = __ip_vs_conn_in_get(p); + if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) { + struct ip_vs_conn_param cport_zero_p = *p; + cport_zero_p.cport = 0; + cp = __ip_vs_conn_in_get(&cport_zero_p); + } + + IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(p->protocol), + IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), + cp ? "hit" : "not hit"); + + return cp; +} + +static int +ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + unsigned int proto_off, int inverse, + struct ip_vs_conn_param *p) +{ + __be16 _ports[2], *pptr; + struct net *net = skb_net(skb); + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return 1; + + if (likely(!inverse)) + ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr, + pptr[0], &iph->daddr, pptr[1], p); + else + ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr, + pptr[1], &iph->saddr, pptr[0], p); + return 0; +} + +struct ip_vs_conn * +ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + unsigned int proto_off, int inverse) +{ + struct ip_vs_conn_param p; + + if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p)) + return NULL; + + return ip_vs_conn_in_get(&p); +} +EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto); + +/* Get reference to connection template */ +struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) +{ + unsigned hash; + struct ip_vs_conn *cp; + struct hlist_node *n; + + hash = ip_vs_conn_hashkey_param(p, false); + + ct_read_lock(hash); + + hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { + if (!ip_vs_conn_net_eq(cp, p->net)) + continue; + if (p->pe_data && p->pe->ct_match) { + if (p->pe == cp->pe && p->pe->ct_match(p, cp)) + goto out; + continue; + } + + if (cp->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && + /* protocol should only be IPPROTO_IP if + * p->vaddr is a fwmark */ + ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : + p->af, p->vaddr, &cp->vaddr) && + p->cport == cp->cport && p->vport == cp->vport && + cp->flags & IP_VS_CONN_F_TEMPLATE && + p->protocol == cp->protocol) + goto out; + } + cp = NULL; + + out: + if (cp) + atomic_inc(&cp->refcnt); + ct_read_unlock(hash); + + IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(p->protocol), + IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), + cp ? 
"hit" : "not hit"); + + return cp; +} + +/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from inside-to-OUTside. + * p->caddr, p->cport: pkt source address (inside host) + * p->vaddr, p->vport: pkt dest address (foreign host) */ +struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) +{ + unsigned hash; + struct ip_vs_conn *cp, *ret=NULL; + struct hlist_node *n; + + /* + * Check for "full" addressed entries + */ + hash = ip_vs_conn_hashkey_param(p, true); + + ct_read_lock(hash); + + hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { + if (cp->af == p->af && + p->vport == cp->cport && p->cport == cp->dport && + ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && + ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) && + p->protocol == cp->protocol && + ip_vs_conn_net_eq(cp, p->net)) { + /* HIT */ + atomic_inc(&cp->refcnt); + ret = cp; + break; + } + } + + ct_read_unlock(hash); + + IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(p->protocol), + IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), + ret ? "hit" : "not hit"); + + return ret; +} + +struct ip_vs_conn * +ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + unsigned int proto_off, int inverse) +{ + struct ip_vs_conn_param p; + + if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p)) + return NULL; + + return ip_vs_conn_out_get(&p); +} +EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); + +/* + * Put back the conn and restart its timer with its timeout + */ +void ip_vs_conn_put(struct ip_vs_conn *cp) +{ + unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ? + 0 : cp->timeout; + mod_timer(&cp->timer, jiffies+t); + + __ip_vs_conn_put(cp); +} + + +/* + * Fill a no_client_port connection with a client port number + */ +void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) +{ + if (ip_vs_conn_unhash(cp)) { + spin_lock(&cp->lock); + if (cp->flags & IP_VS_CONN_F_NO_CPORT) { + atomic_dec(&ip_vs_conn_no_cport_cnt); + cp->flags &= ~IP_VS_CONN_F_NO_CPORT; + cp->cport = cport; + } + spin_unlock(&cp->lock); + + /* hash on new dport */ + ip_vs_conn_hash(cp); + } +} + + +/* + * Bind a connection entry with the corresponding packet_xmit. + * Called by ip_vs_conn_new. 
+ */ +static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) +{ + switch (IP_VS_FWD_METHOD(cp)) { + case IP_VS_CONN_F_MASQ: + cp->packet_xmit = ip_vs_nat_xmit; + break; + + case IP_VS_CONN_F_TUNNEL: + cp->packet_xmit = ip_vs_tunnel_xmit; + break; + + case IP_VS_CONN_F_DROUTE: + cp->packet_xmit = ip_vs_dr_xmit; + break; + + case IP_VS_CONN_F_LOCALNODE: + cp->packet_xmit = ip_vs_null_xmit; + break; + + case IP_VS_CONN_F_BYPASS: + cp->packet_xmit = ip_vs_bypass_xmit; + break; + } +} + +#ifdef CONFIG_IP_VS_IPV6 +static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp) +{ + switch (IP_VS_FWD_METHOD(cp)) { + case IP_VS_CONN_F_MASQ: + cp->packet_xmit = ip_vs_nat_xmit_v6; + break; + + case IP_VS_CONN_F_TUNNEL: + cp->packet_xmit = ip_vs_tunnel_xmit_v6; + break; + + case IP_VS_CONN_F_DROUTE: + cp->packet_xmit = ip_vs_dr_xmit_v6; + break; + + case IP_VS_CONN_F_LOCALNODE: + cp->packet_xmit = ip_vs_null_xmit; + break; + + case IP_VS_CONN_F_BYPASS: + cp->packet_xmit = ip_vs_bypass_xmit_v6; + break; + } +} +#endif + + +static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) +{ + return atomic_read(&dest->activeconns) + + atomic_read(&dest->inactconns); +} + +/* + * Bind a connection entry with a virtual service destination + * Called just after a new connection entry is created. + */ +static inline void +ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) +{ + unsigned int conn_flags; + + /* if dest is NULL, then return directly */ + if (!dest) + return; + + /* Increase the refcnt counter of the dest */ + atomic_inc(&dest->refcnt); + + conn_flags = atomic_read(&dest->conn_flags); + if (cp->protocol != IPPROTO_UDP) + conn_flags &= ~IP_VS_CONN_F_ONE_PACKET; + /* Bind with the destination and its corresponding transmitter */ + if (cp->flags & IP_VS_CONN_F_SYNC) { + /* if the connection is not template and is created + * by sync, preserve the activity flag. + */ + if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) + conn_flags &= ~IP_VS_CONN_F_INACTIVE; + /* connections inherit forwarding method from dest */ + cp->flags &= ~IP_VS_CONN_F_FWD_MASK; + } + cp->flags |= conn_flags; + cp->dest = dest; + + IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " + "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", + ip_vs_proto_name(cp->protocol), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); + + /* Update the connection counters */ + if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { + /* It is a normal connection, so increase the inactive + connection counter because it is in TCP SYNRECV + state (inactive) or other protocol inacive state */ + if ((cp->flags & IP_VS_CONN_F_SYNC) && + (!(cp->flags & IP_VS_CONN_F_INACTIVE))) + atomic_inc(&dest->activeconns); + else + atomic_inc(&dest->inactconns); + } else { + /* It is a persistent connection/template, so increase + the persistent connection counter */ + atomic_inc(&dest->persistconns); + } + + if (dest->u_threshold != 0 && + ip_vs_dest_totalconns(dest) >= dest->u_threshold) + dest->flags |= IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Check if there is a destination for the connection, if so + * bind the connection to the destination. 
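+ * Returns the destination that was bound, or NULL when the connection already has one or no suitable destination is found.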
+ */ +struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) +{ + struct ip_vs_dest *dest; + + if ((cp) && (!cp->dest)) { + dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, + cp->dport, &cp->vaddr, cp->vport, + cp->protocol, cp->fwmark); + ip_vs_bind_dest(cp, dest); + return dest; + } else + return NULL; +} + + +/* + * Unbind a connection entry with its VS destination + * Called by the ip_vs_conn_expire function. + */ +static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) +{ + struct ip_vs_dest *dest = cp->dest; + + if (!dest) + return; + + IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d " + "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", + ip_vs_proto_name(cp->protocol), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); + + /* Update the connection counters */ + if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { + /* It is a normal connection, so decrease the inactconns + or activeconns counter */ + if (cp->flags & IP_VS_CONN_F_INACTIVE) { + atomic_dec(&dest->inactconns); + } else { + atomic_dec(&dest->activeconns); + } + } else { + /* It is a persistent connection/template, so decrease + the persistent connection counter */ + atomic_dec(&dest->persistconns); + } + + if (dest->l_threshold != 0) { + if (ip_vs_dest_totalconns(dest) < dest->l_threshold) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } else if (dest->u_threshold != 0) { + if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } else { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } + + /* + * Simply decrease the refcnt of the dest, because the + * dest will be either in service's destination list + * or in the trash. + */ + atomic_dec(&dest->refcnt); +} + +static int expire_quiescent_template(struct netns_ipvs *ipvs, + struct ip_vs_dest *dest) +{ +#ifdef CONFIG_SYSCTL + return ipvs->sysctl_expire_quiescent_template && + (atomic_read(&dest->weight) == 0); +#else + return 0; +#endif +} + +/* + * Checking if the destination of a connection template is available. + * If available, return 1, otherwise invalidate this connection + * template and return 0. + */ +int ip_vs_check_template(struct ip_vs_conn *ct) +{ + struct ip_vs_dest *dest = ct->dest; + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct)); + + /* + * Checking the dest server status. + */ + if ((dest == NULL) || + !(dest->flags & IP_VS_DEST_F_AVAILABLE) || + expire_quiescent_template(ipvs, dest)) { + IP_VS_DBG_BUF(9, "check_template: dest not available for " + "protocol %s s:%s:%d v:%s:%d " + "-> d:%s:%d\n", + ip_vs_proto_name(ct->protocol), + IP_VS_DBG_ADDR(ct->af, &ct->caddr), + ntohs(ct->cport), + IP_VS_DBG_ADDR(ct->af, &ct->vaddr), + ntohs(ct->vport), + IP_VS_DBG_ADDR(ct->af, &ct->daddr), + ntohs(ct->dport)); + + /* + * Invalidate the connection template + */ + if (ct->vport != htons(0xffff)) { + if (ip_vs_conn_unhash(ct)) { + ct->dport = htons(0xffff); + ct->vport = htons(0xffff); + ct->cport = 0; + ip_vs_conn_hash(ct); + } + } + + /* + * Simply decrease the refcnt of the template, + * don't restart its timer. 
+ */ + atomic_dec(&ct->refcnt); + return 0; + } + return 1; +} + +static void ip_vs_conn_expire(unsigned long data) +{ + struct ip_vs_conn *cp = (struct ip_vs_conn *)data; + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + + cp->timeout = 60*HZ; + + /* + * hey, I'm using it + */ + atomic_inc(&cp->refcnt); + + /* + * do I control anybody? + */ + if (atomic_read(&cp->n_control)) + goto expire_later; + + /* + * unhash it if it is hashed in the conn table + */ + if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) + goto expire_later; + + /* + * refcnt==1 implies I'm the only one referrer + */ + if (likely(atomic_read(&cp->refcnt) == 1)) { + /* delete the timer if it is activated by other users */ + if (timer_pending(&cp->timer)) + del_timer(&cp->timer); + + /* does anybody control me? */ + if (cp->control) + ip_vs_control_del(cp); + + if (cp->flags & IP_VS_CONN_F_NFCT) { + ip_vs_conn_drop_conntrack(cp); + /* Do not access conntracks during subsys cleanup + * because nf_conntrack_find_get can not be used after + * conntrack cleanup for the net. + */ + smp_rmb(); + if (ipvs->enable) + ip_vs_conn_drop_conntrack(cp); + } + + ip_vs_pe_put(cp->pe); + kfree(cp->pe_data); + if (unlikely(cp->app != NULL)) + ip_vs_unbind_app(cp); + ip_vs_unbind_dest(cp); + if (cp->flags & IP_VS_CONN_F_NO_CPORT) + atomic_dec(&ip_vs_conn_no_cport_cnt); + atomic_dec(&ipvs->conn_count); + + kmem_cache_free(ip_vs_conn_cachep, cp); + return; + } + + /* hash it back to the table */ + ip_vs_conn_hash(cp); + + expire_later: + IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", + atomic_read(&cp->refcnt)-1, + atomic_read(&cp->n_control)); + + ip_vs_conn_put(cp); +} + + +void ip_vs_conn_expire_now(struct ip_vs_conn *cp) +{ + if (del_timer(&cp->timer)) + mod_timer(&cp->timer, jiffies); +} + + +/* + * Create a new connection entry and hash it into the ip_vs_conn_tab + */ +struct ip_vs_conn * +ip_vs_conn_new(const struct ip_vs_conn_param *p, + const union nf_inet_addr *daddr, __be16 dport, unsigned flags, + struct ip_vs_dest *dest, __u32 fwmark) +{ + struct ip_vs_conn *cp; + struct netns_ipvs *ipvs = net_ipvs(p->net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net, + p->protocol); + + cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); + if (cp == NULL) { + IP_VS_ERR_RL("%s(): no memory\n", __func__); + return NULL; + } + + INIT_HLIST_NODE(&cp->c_list); + setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); + ip_vs_conn_net_set(cp, p->net); + cp->af = p->af; + cp->protocol = p->protocol; + ip_vs_addr_copy(p->af, &cp->caddr, p->caddr); + cp->cport = p->cport; + ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr); + cp->vport = p->vport; + /* proto should only be IPPROTO_IP if d_addr is a fwmark */ + ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, + &cp->daddr, daddr); + cp->dport = dport; + cp->flags = flags; + cp->fwmark = fwmark; + if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) { + ip_vs_pe_get(p->pe); + cp->pe = p->pe; + cp->pe_data = p->pe_data; + cp->pe_data_len = p->pe_data_len; + } + spin_lock_init(&cp->lock); + + /* + * Set the entry is referenced by the current thread before hashing + * it in the table, so that other thread run ip_vs_random_dropentry + * but cannot drop this entry. 
+ */ + atomic_set(&cp->refcnt, 1); + + atomic_set(&cp->n_control, 0); + atomic_set(&cp->in_pkts, 0); + + atomic_inc(&ipvs->conn_count); + if (flags & IP_VS_CONN_F_NO_CPORT) + atomic_inc(&ip_vs_conn_no_cport_cnt); + + /* Bind the connection with a destination server */ + ip_vs_bind_dest(cp, dest); + + /* Set its state and timeout */ + cp->state = 0; + cp->timeout = 3*HZ; + + /* Bind its packet transmitter */ +#ifdef CONFIG_IP_VS_IPV6 + if (p->af == AF_INET6) + ip_vs_bind_xmit_v6(cp); + else +#endif + ip_vs_bind_xmit(cp); + + if (unlikely(pd && atomic_read(&pd->appcnt))) + ip_vs_bind_app(cp, pd->pp); + + /* + * Allow conntrack to be preserved. By default, conntrack + * is created and destroyed for every packet. + * Sometimes keeping conntrack can be useful for + * IP_VS_CONN_F_ONE_PACKET too. + */ + + if (ip_vs_conntrack_enabled(ipvs)) + cp->flags |= IP_VS_CONN_F_NFCT; + + /* Hash it in the ip_vs_conn_tab finally */ + ip_vs_conn_hash(cp); + + return cp; +} + +/* + * /proc/net/ip_vs_conn entries + */ +#ifdef CONFIG_PROC_FS +struct ip_vs_iter_state { + struct seq_net_private p; + struct hlist_head *l; +}; + +static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) +{ + int idx; + struct ip_vs_conn *cp; + struct ip_vs_iter_state *iter = seq->private; + struct hlist_node *n; + + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { + ct_read_lock_bh(idx); + hlist_for_each_entry(cp, n, &ip_vs_conn_tab[idx], c_list) { + if (pos-- == 0) { + iter->l = &ip_vs_conn_tab[idx]; + return cp; + } + } + ct_read_unlock_bh(idx); + } + + return NULL; +} + +static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct ip_vs_iter_state *iter = seq->private; + + iter->l = NULL; + return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; +} + +static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip_vs_conn *cp = v; + struct ip_vs_iter_state *iter = seq->private; + struct hlist_node *e; + struct hlist_head *l = iter->l; + int idx; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_conn_array(seq, 0); + + /* more on same hash chain? 
*/ + if ((e = cp->c_list.next)) + return hlist_entry(e, struct ip_vs_conn, c_list); + + idx = l - ip_vs_conn_tab; + ct_read_unlock_bh(idx); + + while (++idx < ip_vs_conn_tab_size) { + ct_read_lock_bh(idx); + hlist_for_each_entry(cp, e, &ip_vs_conn_tab[idx], c_list) { + iter->l = &ip_vs_conn_tab[idx]; + return cp; + } + ct_read_unlock_bh(idx); + } + iter->l = NULL; + return NULL; +} + +static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) +{ + struct ip_vs_iter_state *iter = seq->private; + struct hlist_head *l = iter->l; + + if (l) + ct_read_unlock_bh(l - ip_vs_conn_tab); +} + +static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) +{ + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n"); + else { + const struct ip_vs_conn *cp = v; + struct net *net = seq_file_net(seq); + char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; + size_t len = 0; + + if (!ip_vs_conn_net_eq(cp, net)) + return 0; + if (cp->pe_data) { + pe_data[0] = ' '; + len = strlen(cp->pe->name); + memcpy(pe_data + 1, cp->pe->name, len); + pe_data[len + 1] = ' '; + len += 2; + len += cp->pe->show_pe_data(cp, pe_data + len); + } + pe_data[len] = '\0'; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " + "%pI6 %04X %-11s %7lu%s\n", + ip_vs_proto_name(cp->protocol), + &cp->caddr.in6, ntohs(cp->cport), + &cp->vaddr.in6, ntohs(cp->vport), + &cp->daddr.in6, ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + (cp->timer.expires-jiffies)/HZ, pe_data); + else +#endif + seq_printf(seq, + "%-3s %08X %04X %08X %04X" + " %08X %04X %-11s %7lu%s\n", + ip_vs_proto_name(cp->protocol), + ntohl(cp->caddr.ip), ntohs(cp->cport), + ntohl(cp->vaddr.ip), ntohs(cp->vport), + ntohl(cp->daddr.ip), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + (cp->timer.expires-jiffies)/HZ, pe_data); + } + return 0; +} + +static const struct seq_operations ip_vs_conn_seq_ops = { + .start = ip_vs_conn_seq_start, + .next = ip_vs_conn_seq_next, + .stop = ip_vs_conn_seq_stop, + .show = ip_vs_conn_seq_show, +}; + +static int ip_vs_conn_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ip_vs_conn_seq_ops, + sizeof(struct ip_vs_iter_state)); +} + +static const struct file_operations ip_vs_conn_fops = { + .owner = THIS_MODULE, + .open = ip_vs_conn_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static const char *ip_vs_origin_name(unsigned flags) +{ + if (flags & IP_VS_CONN_F_SYNC) + return "SYNC"; + else + return "LOCAL"; +} + +static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) +{ + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); + else { + const struct ip_vs_conn *cp = v; + struct net *net = seq_file_net(seq); + + if (!ip_vs_conn_net_eq(cp, net)) + return 0; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %-6s %7lu\n", + ip_vs_proto_name(cp->protocol), + &cp->caddr.in6, ntohs(cp->cport), + &cp->vaddr.in6, ntohs(cp->vport), + &cp->daddr.in6, ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + ip_vs_origin_name(cp->flags), + (cp->timer.expires-jiffies)/HZ); + else +#endif + seq_printf(seq, + "%-3s %08X %04X %08X %04X " + "%08X %04X %-11s %-6s %7lu\n", + ip_vs_proto_name(cp->protocol), + ntohl(cp->caddr.ip), ntohs(cp->cport), + ntohl(cp->vaddr.ip), ntohs(cp->vport), + 
ntohl(cp->daddr.ip), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + ip_vs_origin_name(cp->flags), + (cp->timer.expires-jiffies)/HZ); + } + return 0; +} + +static const struct seq_operations ip_vs_conn_sync_seq_ops = { + .start = ip_vs_conn_seq_start, + .next = ip_vs_conn_seq_next, + .stop = ip_vs_conn_seq_stop, + .show = ip_vs_conn_sync_seq_show, +}; + +static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops, + sizeof(struct ip_vs_iter_state)); +} + +static const struct file_operations ip_vs_conn_sync_fops = { + .owner = THIS_MODULE, + .open = ip_vs_conn_sync_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +#endif + + +/* + * Randomly drop connection entries before running out of memory + */ +static inline int todrop_entry(struct ip_vs_conn *cp) +{ + /* + * The drop rate array needs tuning for real environments. + * Called from timer bh only => no locking + */ + static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + static char todrop_counter[9] = {0}; + int i; + + /* if the conn entry hasn't lasted for 60 seconds, don't drop it. + This will leave enough time for normal connection to get + through. */ + if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) + return 0; + + /* Don't drop the entry if its number of incoming packets is not + located in [0, 8] */ + i = atomic_read(&cp->in_pkts); + if (i > 8 || i < 0) return 0; + + if (!todrop_rate[i]) return 0; + if (--todrop_counter[i] > 0) return 0; + + todrop_counter[i] = todrop_rate[i]; + return 1; +} + +/* Called from keventd and must protect itself from softirqs */ +void ip_vs_random_dropentry(struct net *net) +{ + int idx; + struct ip_vs_conn *cp; + + /* + * Randomly scan 1/32 of the whole table every second + */ + for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { + unsigned hash = net_random() & ip_vs_conn_tab_mask; + struct hlist_node *n; + + /* + * Lock is actually needed in this loop. + */ + ct_write_lock_bh(hash); + + hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + /* connection template */ + continue; + if (!ip_vs_conn_net_eq(cp, net)) + continue; + if (cp->protocol == IPPROTO_TCP) { + switch(cp->state) { + case IP_VS_TCP_S_SYN_RECV: + case IP_VS_TCP_S_SYNACK: + break; + + case IP_VS_TCP_S_ESTABLISHED: + if (todrop_entry(cp)) + break; + continue; + + default: + continue; + } + } else { + if (!todrop_entry(cp)) + continue; + } + + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_expire_now(cp); + if (cp->control) { + IP_VS_DBG(4, "del conn template\n"); + ip_vs_conn_expire_now(cp->control); + } + } + ct_write_unlock_bh(hash); + } +} + + +/* + * Flush all the connection entries in the ip_vs_conn_tab + */ +static void ip_vs_conn_flush(struct net *net) +{ + int idx; + struct ip_vs_conn *cp; + struct netns_ipvs *ipvs = net_ipvs(net); + +flush_again: + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { + struct hlist_node *n; + + /* + * Lock is actually needed in this loop. 
+ */ + ct_write_lock_bh(idx); + + hlist_for_each_entry(cp, n, &ip_vs_conn_tab[idx], c_list) { + if (!ip_vs_conn_net_eq(cp, net)) + continue; + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_expire_now(cp); + if (cp->control) { + IP_VS_DBG(4, "del conn template\n"); + ip_vs_conn_expire_now(cp->control); + } + } + ct_write_unlock_bh(idx); + } + + /* the counter may be not NULL, because maybe some conn entries + are run by slow timer handler or unhashed but still referred */ + if (atomic_read(&ipvs->conn_count) != 0) { + schedule(); + goto flush_again; + } +} +/* + * per netns init and exit + */ +int __net_init __ip_vs_conn_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + atomic_set(&ipvs->conn_count, 0); + + proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops); + proc_net_fops_create(net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); + return 0; +} + +void __net_exit __ip_vs_conn_cleanup(struct net *net) +{ + /* flush all the connection entries first */ + ip_vs_conn_flush(net); + proc_net_remove(net, "ip_vs_conn"); + proc_net_remove(net, "ip_vs_conn_sync"); +} + +int __init ip_vs_conn_init(void) +{ + int idx; + + /* Compute size and mask */ + ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; + ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; + + /* + * Allocate the connection hash table and initialize its list heads + */ + ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab)); + if (!ip_vs_conn_tab) + return -ENOMEM; + + /* Allocate ip_vs_conn slab cache */ + ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", + sizeof(struct ip_vs_conn), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!ip_vs_conn_cachep) { + vfree(ip_vs_conn_tab); + return -ENOMEM; + } + + pr_info("Connection hash table configured " + "(size=%d, memory=%ldKbytes)\n", + ip_vs_conn_tab_size, + (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024); + IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", + sizeof(struct ip_vs_conn)); + + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) + INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]); + + for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { + rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); + } + + /* calculate the random value for connection hash */ + get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); + + return 0; +} + +void ip_vs_conn_cleanup(void) +{ + /* Release the empty cache */ + kmem_cache_destroy(ip_vs_conn_cachep); + vfree(ip_vs_conn_tab); +} diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c new file mode 100644 index 00000000..0787bed0 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -0,0 +1,2050 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang + * Peter Kese + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, + * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms + * and others. 
+ * + * Changes: + * Paul `Rusty' Russell properly handle non-linear skbs + * Harald Welte don't use nfcache + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include /* for icmp_send */ +#include +#include +#include /* net_generic() */ + +#include +#include + +#ifdef CONFIG_IP_VS_IPV6 +#include +#include +#include +#endif + +#include + + +EXPORT_SYMBOL(register_ip_vs_scheduler); +EXPORT_SYMBOL(unregister_ip_vs_scheduler); +EXPORT_SYMBOL(ip_vs_proto_name); +EXPORT_SYMBOL(ip_vs_conn_new); +EXPORT_SYMBOL(ip_vs_conn_in_get); +EXPORT_SYMBOL(ip_vs_conn_out_get); +#ifdef CONFIG_IP_VS_PROTO_TCP +EXPORT_SYMBOL(ip_vs_tcp_conn_listen); +#endif +EXPORT_SYMBOL(ip_vs_conn_put); +#ifdef CONFIG_IP_VS_DEBUG +EXPORT_SYMBOL(ip_vs_get_debug_level); +#endif + +int ip_vs_net_id __read_mostly; +#ifdef IP_VS_GENERIC_NETNS +EXPORT_SYMBOL(ip_vs_net_id); +#endif +/* netns cnt used for uniqueness */ +static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); + +/* ID used in ICMP lookups */ +#define icmp_id(icmph) (((icmph)->un).echo.id) +#define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) + +const char *ip_vs_proto_name(unsigned proto) +{ + static char buf[20]; + + switch (proto) { + case IPPROTO_IP: + return "IP"; + case IPPROTO_UDP: + return "UDP"; + case IPPROTO_TCP: + return "TCP"; + case IPPROTO_SCTP: + return "SCTP"; + case IPPROTO_ICMP: + return "ICMP"; +#ifdef CONFIG_IP_VS_IPV6 + case IPPROTO_ICMPV6: + return "ICMPv6"; +#endif + default: + sprintf(buf, "IP_%d", proto); + return buf; + } +} + +void ip_vs_init_hash_table(struct list_head *table, int rows) +{ + while (--rows >= 0) + INIT_LIST_HEAD(&table[rows]); +} + +static inline void +ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + struct ip_vs_cpu_stats *s; + + s = this_cpu_ptr(dest->stats.cpustats); + s->ustats.inpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.inbytes += skb->len; + u64_stats_update_end(&s->syncp); + + s = this_cpu_ptr(dest->svc->stats.cpustats); + s->ustats.inpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.inbytes += skb->len; + u64_stats_update_end(&s->syncp); + + s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s->ustats.inpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.inbytes += skb->len; + u64_stats_update_end(&s->syncp); + } +} + + +static inline void +ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + struct ip_vs_cpu_stats *s; + + s = this_cpu_ptr(dest->stats.cpustats); + s->ustats.outpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.outbytes += skb->len; + u64_stats_update_end(&s->syncp); + + s = this_cpu_ptr(dest->svc->stats.cpustats); + s->ustats.outpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.outbytes += skb->len; + u64_stats_update_end(&s->syncp); + + s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s->ustats.outpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.outbytes += skb->len; + u64_stats_update_end(&s->syncp); + } +} + + +static inline void +ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) +{ + struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct ip_vs_cpu_stats *s; + + s = 
this_cpu_ptr(cp->dest->stats.cpustats); + s->ustats.conns++; + + s = this_cpu_ptr(svc->stats.cpustats); + s->ustats.conns++; + + s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s->ustats.conns++; +} + + +static inline int +ip_vs_set_state(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_proto_data *pd) +{ + if (unlikely(!pd->pp->state_transition)) + return 0; + return pd->pp->state_transition(cp, direction, skb, pd); +} + +static inline int +ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, + struct sk_buff *skb, int protocol, + const union nf_inet_addr *caddr, __be16 cport, + const union nf_inet_addr *vaddr, __be16 vport, + struct ip_vs_conn_param *p) +{ + ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr, + vport, p); + p->pe = svc->pe; + if (p->pe && p->pe->fill_param) + return p->pe->fill_param(p, skb); + + return 0; +} + +/* + * IPVS persistent scheduling function + * It creates a connection entry according to its template if exists, + * or selects a server and creates a connection entry plus a template. + * Locking: we are svc user (svc->refcnt), so we hold all dests too + * Protocols supported: TCP, UDP + */ +static struct ip_vs_conn * +ip_vs_sched_persist(struct ip_vs_service *svc, + struct sk_buff *skb, + __be16 src_port, __be16 dst_port, int *ignored) +{ + struct ip_vs_conn *cp = NULL; + struct ip_vs_iphdr iph; + struct ip_vs_dest *dest; + struct ip_vs_conn *ct; + __be16 dport = 0; /* destination port to forward */ + unsigned int flags; + struct ip_vs_conn_param param; + const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; + union nf_inet_addr snet; /* source network of the client, + after masking */ + + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + + /* Mask saddr with the netmask to adjust template granularity */ +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask); + else +#endif + snet.ip = iph.saddr.ip & svc->netmask; + + IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " + "mnet %s\n", + IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(src_port), + IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(dst_port), + IP_VS_DBG_ADDR(svc->af, &snet)); + + /* + * As far as we know, FTP is a very complicated network protocol, and + * it uses control connection and data connections. For active FTP, + * FTP server initialize data connection to the client, its source port + * is often 20. For passive FTP, FTP server tells the clients the port + * that it passively listens to, and the client issues the data + * connection. In the tunneling or direct routing mode, the load + * balancer is on the client-to-server half of connection, the port + * number is unknown to the load balancer. So, a conn template like + * is created for persistent FTP + * service, and a template like + * is created for other persistent services. + */ + { + int protocol = iph.protocol; + const union nf_inet_addr *vaddr = &iph.daddr; + __be16 vport = 0; + + if (dst_port == svc->port) { + /* non-FTP template: + * + * FTP template: + * + */ + if (svc->port != FTPPORT) + vport = dst_port; + } else { + /* Note: persistent fwmark-based services and + * persistent port zero service are handled here. 
+ * fwmark template: + * + * port zero template: + * + */ + if (svc->fwmark) { + protocol = IPPROTO_IP; + vaddr = &fwmark; + } + } + /* return *ignored = -1 so NF_DROP can be used */ + if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, + vaddr, vport, ¶m) < 0) { + *ignored = -1; + return NULL; + } + } + + /* Check if a template already exists */ + ct = ip_vs_ct_in_get(¶m); + if (!ct || !ip_vs_check_template(ct)) { + /* + * No template found or the dest of the connection + * template is not available. + * return *ignored=0 i.e. ICMP and NF_DROP + */ + dest = svc->scheduler->schedule(svc, skb); + if (!dest) { + IP_VS_DBG(1, "p-schedule: no dest found.\n"); + kfree(param.pe_data); + *ignored = 0; + return NULL; + } + + if (dst_port == svc->port && svc->port != FTPPORT) + dport = dest->port; + + /* Create a template + * This adds param.pe_data to the template, + * and thus param.pe_data will be destroyed + * when the template expires */ + ct = ip_vs_conn_new(¶m, &dest->addr, dport, + IP_VS_CONN_F_TEMPLATE, dest, skb->mark); + if (ct == NULL) { + kfree(param.pe_data); + *ignored = -1; + return NULL; + } + + ct->timeout = svc->timeout; + } else { + /* set destination with the found template */ + dest = ct->dest; + kfree(param.pe_data); + } + + dport = dst_port; + if (dport == svc->port && dest->port) + dport = dest->port; + + flags = (svc->flags & IP_VS_SVC_F_ONEPACKET + && iph.protocol == IPPROTO_UDP)? + IP_VS_CONN_F_ONE_PACKET : 0; + + /* + * Create a new connection according to the template + */ + ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, &iph.saddr, + src_port, &iph.daddr, dst_port, ¶m); + + cp = ip_vs_conn_new(¶m, &dest->addr, dport, flags, dest, skb->mark); + if (cp == NULL) { + ip_vs_conn_put(ct); + *ignored = -1; + return NULL; + } + + /* + * Add its control + */ + ip_vs_control_add(cp, ct); + ip_vs_conn_put(ct); + + ip_vs_conn_stats(cp, svc); + return cp; +} + + +/* + * IPVS main scheduling function + * It selects a server according to the virtual service, and + * creates a connection entry. + * Protocols supported: TCP, UDP + * + * Usage of *ignored + * + * 1 : protocol tried to schedule (eg. on SYN), found svc but the + * svc/scheduler decides that this packet should be accepted with + * NF_ACCEPT because it must not be scheduled. + * + * 0 : scheduler can not find destination, so try bypass or + * return ICMP and then NF_DROP (ip_vs_leave). + * + * -1 : scheduler tried to schedule but fatal error occurred, eg. + * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param + * failure such as missing Call-ID, ENOMEM on skb_linearize + * or pe_data. In this case we should return NF_DROP without + * any attempts to send ICMP with ip_vs_leave. + */ +struct ip_vs_conn * +ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, + struct ip_vs_proto_data *pd, int *ignored) +{ + struct ip_vs_protocol *pp = pd->pp; + struct ip_vs_conn *cp = NULL; + struct ip_vs_iphdr iph; + struct ip_vs_dest *dest; + __be16 _ports[2], *pptr; + unsigned int flags; + + *ignored = 1; + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + /* + * FTPDATA needs this check when using local real server. + * Never schedule Active FTPDATA connections from real server. + * For LVS-NAT they must be already created. For other methods + * with persistence the connection is created on SYN+ACK. 
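+ * (FTPDATA is the FTP data port, 20; returning NULL here means the packet is not scheduled as a new connection.)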
+ */ + if (pptr[0] == FTPDATA) { + IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, + "Not scheduling FTPDATA"); + return NULL; + } + + /* + * Do not schedule replies from local real server. + */ + if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && + (cp = pp->conn_in_get(svc->af, skb, &iph, iph.len, 1))) { + IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, + "Not scheduling reply for existing connection"); + __ip_vs_conn_put(cp); + return NULL; + } + + /* + * Persistent service + */ + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored); + + *ignored = 0; + + /* + * Non-persistent service + */ + if (!svc->fwmark && pptr[1] != svc->port) { + if (!svc->port) + pr_err("Schedule: port zero only supported " + "in persistent services, " + "check your ipvs configuration\n"); + return NULL; + } + + dest = svc->scheduler->schedule(svc, skb); + if (dest == NULL) { + IP_VS_DBG(1, "Schedule: no dest found.\n"); + return NULL; + } + + flags = (svc->flags & IP_VS_SVC_F_ONEPACKET + && iph.protocol == IPPROTO_UDP)? + IP_VS_CONN_F_ONE_PACKET : 0; + + /* + * Create a connection entry. + */ + { + struct ip_vs_conn_param p; + + ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, + &iph.saddr, pptr[0], &iph.daddr, pptr[1], + &p); + cp = ip_vs_conn_new(&p, &dest->addr, + dest->port ? dest->port : pptr[1], + flags, dest, skb->mark); + if (!cp) { + *ignored = -1; + return NULL; + } + } + + IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " + "d:%s:%u conn->flags:%X conn->refcnt:%d\n", + ip_vs_fwd_tag(cp), + IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport), + cp->flags, atomic_read(&cp->refcnt)); + + ip_vs_conn_stats(cp, svc); + return cp; +} + + +/* + * Pass or drop the packet. + * Called by ip_vs_in, when the virtual service is available but + * no destination is available for a new connection. + */ +int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, + struct ip_vs_proto_data *pd) +{ + __be16 _ports[2], *pptr; + struct ip_vs_iphdr iph; +#ifdef CONFIG_SYSCTL + struct net *net; + struct netns_ipvs *ipvs; + int unicast; +#endif + + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + + pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); + if (pptr == NULL) { + ip_vs_service_put(svc); + return NF_DROP; + } + +#ifdef CONFIG_SYSCTL + net = skb_net(skb); + +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST; + else +#endif + unicast = (inet_addr_type(net, iph.daddr.ip) == RTN_UNICAST); + + /* if it is fwmark-based service, the cache_bypass sysctl is up + and the destination is a non-local unicast, then create + a cache_bypass connection entry */ + ipvs = net_ipvs(net); + if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) { + int ret, cs; + struct ip_vs_conn *cp; + unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && + iph.protocol == IPPROTO_UDP)? 
+ IP_VS_CONN_F_ONE_PACKET : 0; + union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; + + ip_vs_service_put(svc); + + /* create a new connection entry */ + IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); + { + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, + &iph.saddr, pptr[0], + &iph.daddr, pptr[1], &p); + cp = ip_vs_conn_new(&p, &daddr, 0, + IP_VS_CONN_F_BYPASS | flags, + NULL, skb->mark); + if (!cp) + return NF_DROP; + } + + /* statistics */ + ip_vs_in_stats(cp, skb); + + /* set state */ + cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); + + /* transmit the first SYN packet */ + ret = cp->packet_xmit(skb, cp, pd->pp); + /* do not touch skb anymore */ + + atomic_inc(&cp->in_pkts); + ip_vs_conn_put(cp); + return ret; + } +#endif + + /* + * When the virtual ftp service is presented, packets destined + * for other services on the VIP may get here (except services + * listed in the ipvs table), pass the packets, because it is + * not ipvs job to decide to drop the packets. + */ + if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) { + ip_vs_service_put(svc); + return NF_ACCEPT; + } + + ip_vs_service_put(svc); + + /* + * Notify the client that the destination is unreachable, and + * release the socket buffer. + * Since it is in IP layer, the TCP socket is not actually + * created, the TCP RST packet cannot be sent, instead that + * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ + */ +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) { + if (!skb->dev) { + struct net *net = dev_net(skb_dst(skb)->dev); + + skb->dev = net->loopback_dev; + } + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + } else +#endif + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + return NF_DROP; +} + +#ifdef CONFIG_SYSCTL + +static int sysctl_snat_reroute(struct sk_buff *skb) +{ + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + return ipvs->sysctl_snat_reroute; +} + +static int sysctl_nat_icmp_send(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + return ipvs->sysctl_nat_icmp_send; +} + +static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) +{ + return ipvs->sysctl_expire_nodest_conn; +} + +#else + +static int sysctl_snat_reroute(struct sk_buff *skb) { return 0; } +static int sysctl_nat_icmp_send(struct net *net) { return 0; } +static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; } + +#endif + +__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) +{ + return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); +} + +static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum) +{ + if (NF_INET_LOCAL_IN == hooknum) + return IP_DEFRAG_VS_IN; + if (NF_INET_FORWARD == hooknum) + return IP_DEFRAG_VS_FWD; + return IP_DEFRAG_VS_OUT; +} + +static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) +{ + int err = ip_defrag(skb, user); + + if (!err) + ip_send_check(ip_hdr(skb)); + + return err; +} + +#ifdef CONFIG_IP_VS_IPV6 +static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user) +{ + /* TODO IPv6: Find out what to do here for IPv6 */ + return 0; +} +#endif + +static int ip_vs_route_me_harder(int af, struct sk_buff *skb) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (sysctl_snat_reroute(skb) && ip6_route_me_harder(skb) != 0) + return 1; + } else +#endif + if ((sysctl_snat_reroute(skb) || + skb_rtable(skb)->rt_flags & RTCF_LOCAL) && + ip_route_me_harder(skb, RTN_LOCAL) != 0) + return 1; + + return 0; +} + 
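/*
 * Illustration only -- not part of this patch.  A minimal user-space
 * sketch of the value that ip_vs_checksum_complete() above computes via
 * skb_checksum() and csum_fold(): the RFC 1071 16-bit one's-complement
 * checksum of everything from 'offset' to the end of the packet.
 * Verifying a region that already contains a correct checksum field
 * yields 0, which is how the ICMP handlers below detect corruption.
 * The function name is illustrative, not a kernel symbol.
 */
#include <stddef.h>
#include <stdint.h>

static uint16_t csum_fold_sketch(const uint8_t *data, size_t len)
{
	uint32_t sum = 0;

	/* sum the buffer as 16-bit big-endian words */
	while (len > 1) {
		sum += ((uint32_t)data[0] << 8) | data[1];
		data += 2;
		len -= 2;
	}

	/* an odd trailing byte is padded with a zero byte */
	if (len)
		sum += (uint32_t)data[0] << 8;

	/* fold the carries back into the low 16 bits */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);

	/* one's complement; 0 means "checksum verified" */
	return (uint16_t)~sum;
}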
+/* + * Packet has been made sufficiently writable in caller + * - inout: 1=in->out, 0=out->in + */ +void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int inout) +{ + struct iphdr *iph = ip_hdr(skb); + unsigned int icmp_offset = iph->ihl*4; + struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + + icmp_offset); + struct iphdr *ciph = (struct iphdr *)(icmph + 1); + + if (inout) { + iph->saddr = cp->vaddr.ip; + ip_send_check(iph); + ciph->daddr = cp->vaddr.ip; + ip_send_check(ciph); + } else { + iph->daddr = cp->daddr.ip; + ip_send_check(iph); + ciph->saddr = cp->daddr.ip; + ip_send_check(ciph); + } + + /* the TCP/UDP/SCTP port */ + if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol || + IPPROTO_SCTP == ciph->protocol) { + __be16 *ports = (void *)ciph + ciph->ihl*4; + + if (inout) + ports[1] = cp->vport; + else + ports[0] = cp->dport; + } + + /* And finally the ICMP checksum */ + icmph->checksum = 0; + icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (inout) + IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered outgoing ICMP"); + else + IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered incoming ICMP"); +} + +#ifdef CONFIG_IP_VS_IPV6 +void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int inout) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + unsigned int icmp_offset = sizeof(struct ipv6hdr); + struct icmp6hdr *icmph = (struct icmp6hdr *)(skb_network_header(skb) + + icmp_offset); + struct ipv6hdr *ciph = (struct ipv6hdr *)(icmph + 1); + + if (inout) { + iph->saddr = cp->vaddr.in6; + ciph->daddr = cp->vaddr.in6; + } else { + iph->daddr = cp->daddr.in6; + ciph->saddr = cp->daddr.in6; + } + + /* the TCP/UDP/SCTP port */ + if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr || + IPPROTO_SCTP == ciph->nexthdr) { + __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr); + + if (inout) + ports[1] = cp->vport; + else + ports[0] = cp->dport; + } + + /* And finally the ICMP checksum */ + icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, + skb->len - icmp_offset, + IPPROTO_ICMPV6, 0); + skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset; + skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum); + skb->ip_summed = CHECKSUM_PARTIAL; + + if (inout) + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, + (void *)ciph - (void *)iph, + "Forwarding altered outgoing ICMPv6"); + else + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, + (void *)ciph - (void *)iph, + "Forwarding altered incoming ICMPv6"); +} +#endif + +/* Handle relevant response ICMP messages - forward to the right + * destination host. + */ +static int handle_response_icmp(int af, struct sk_buff *skb, + union nf_inet_addr *snet, + __u8 protocol, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, + unsigned int offset, unsigned int ihl) +{ + unsigned int verdict = NF_DROP; + + if (IP_VS_FWD_METHOD(cp) != 0) { + pr_err("shouldn't reach here, because the box is on the " + "half connection in the tun/dr module.\n"); + } + + /* Ensure the checksum is correct */ + if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { + /* Failed checksum! 
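+		 * The embedded packet is corrupted: keep the NF_DROP verdict
+		 * and release the connection reference at 'out'.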
*/ + IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n", + IP_VS_DBG_ADDR(af, snet)); + goto out; + } + + if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || + IPPROTO_SCTP == protocol) + offset += 2 * sizeof(__u16); + if (!skb_make_writable(skb, offset)) + goto out; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_nat_icmp_v6(skb, pp, cp, 1); + else +#endif + ip_vs_nat_icmp(skb, pp, cp, 1); + + if (ip_vs_route_me_harder(af, skb)) + goto out; + + /* do the statistics and put it back */ + ip_vs_out_stats(cp, skb); + + skb->ipvs_property = 1; + if (!(cp->flags & IP_VS_CONN_F_NFCT)) + ip_vs_notrack(skb); + else + ip_vs_update_conntrack(skb, cp, 0); + verdict = NF_ACCEPT; + +out: + __ip_vs_conn_put(cp); + + return verdict; +} + +/* + * Handle ICMP messages in the inside-to-outside direction (outgoing). + * Find any that might be relevant, check against existing connections. + * Currently handles error types - unreachable, quench, ttl exceeded. + */ +static int ip_vs_out_icmp(struct sk_buff *skb, int *related, + unsigned int hooknum) +{ + struct iphdr *iph; + struct icmphdr _icmph, *ic; + struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset, ihl; + union nf_inet_addr snet; + + *related = 1; + + /* reassemble IP fragments */ + if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { + if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) + return NF_STOLEN; + } + + iph = ip_hdr(skb); + offset = ihl = iph->ihl * 4; + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n", + ic->type, ntohs(icmp_id(ic)), + &iph->saddr, &iph->daddr); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->type != ICMP_DEST_UNREACH) && + (ic->type != ICMP_SOURCE_QUENCH) && + (ic->type != ICMP_TIME_EXCEEDED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->protocol); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? 
*/ + if (unlikely(cih->frag_off & htons(IP_OFFSET) && + pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, + "Checking outgoing ICMP for"); + + offset += cih->ihl * 4; + + ip_vs_fill_iphdr(AF_INET, cih, &ciph); + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1); + if (!cp) + return NF_ACCEPT; + + snet.ip = iph->saddr; + return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, + pp, offset, ihl); +} + +#ifdef CONFIG_IP_VS_IPV6 +static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, + unsigned int hooknum) +{ + struct ipv6hdr *iph; + struct icmp6hdr _icmph, *ic; + struct ipv6hdr _ciph, *cih; /* The ip header contained + within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset; + union nf_inet_addr snet; + + *related = 1; + + /* reassemble IP fragments */ + if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { + if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum))) + return NF_STOLEN; + } + + iph = ipv6_hdr(skb); + offset = sizeof(struct ipv6hdr); + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n", + ic->icmp6_type, ntohs(icmpv6_id(ic)), + &iph->saddr, &iph->daddr); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && + (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && + (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->nexthdr); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + /* TODO: we don't support fragmentation at the moment anyways */ + if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, + "Checking outgoing ICMPv6 for"); + + offset += sizeof(struct ipv6hdr); + + ip_vs_fill_iphdr(AF_INET6, cih, &ciph); + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_out_get(AF_INET6, skb, &ciph, offset, 1); + if (!cp) + return NF_ACCEPT; + + ipv6_addr_copy(&snet.in6, &iph->saddr); + return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp, + pp, offset, sizeof(struct ipv6hdr)); +} +#endif + +/* + * Check if sctp chunc is ABORT chunk + */ +static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len) +{ + sctp_chunkhdr_t *sch, schunk; + sch = skb_header_pointer(skb, nh_len + sizeof(sctp_sctphdr_t), + sizeof(schunk), &schunk); + if (sch == NULL) + return 0; + if (sch->type == SCTP_CID_ABORT) + return 1; + return 0; +} + +static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) +{ + struct tcphdr _tcph, *th; + + th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph); + if (th == NULL) + return 0; + return th->rst; +} + +/* Handle response packets: rewrite addresses and send away... 
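+ * Used for VS/NAT: the transport header is mangled by the protocol's
+ * snat_handler, the source address is rewritten to the VIP, the packet
+ * is re-routed when policy routing requires it, and stats/state are
+ * updated before returning NF_ACCEPT (NF_STOLEN after freeing the skb
+ * on any failure).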
+ */ +static unsigned int +handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, + struct ip_vs_conn *cp, int ihl) +{ + struct ip_vs_protocol *pp = pd->pp; + + IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet"); + + if (!skb_make_writable(skb, ihl)) + goto drop; + + /* mangle the packet */ + if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) + goto drop; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ipv6_hdr(skb)->saddr = cp->vaddr.in6; + else +#endif + { + ip_hdr(skb)->saddr = cp->vaddr.ip; + ip_send_check(ip_hdr(skb)); + } + + /* + * nf_iterate does not expect change in the skb->dst->dev. + * It looks like it is not fatal to enable this code for hooks + * where our handlers are at the end of the chain list and + * when all next handlers use skb->dst->dev and not outdev. + * It will definitely route properly the inout NAT traffic + * when multiple paths are used. + */ + + /* For policy routing, packets originating from this + * machine itself may be routed differently to packets + * passing through. We want this packet to be routed as + * if it came from this machine itself. So re-compute + * the routing information. + */ + if (ip_vs_route_me_harder(af, skb)) + goto drop; + + IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT"); + + ip_vs_out_stats(cp, skb); + ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd); + skb->ipvs_property = 1; + if (!(cp->flags & IP_VS_CONN_F_NFCT)) + ip_vs_notrack(skb); + else + ip_vs_update_conntrack(skb, cp, 0); + ip_vs_conn_put(cp); + + LeaveFunction(11); + return NF_ACCEPT; + +drop: + ip_vs_conn_put(cp); + kfree_skb(skb); + LeaveFunction(11); + return NF_STOLEN; +} + +/* + * Check if outgoing packet belongs to the established ip_vs_conn. + */ +static unsigned int +ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) +{ + struct net *net = NULL; + struct ip_vs_iphdr iph; + struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; + struct ip_vs_conn *cp; + + EnterFunction(11); + + /* Already marked as IPVS request or reply? */ + if (skb->ipvs_property) + return NF_ACCEPT; + + /* Bad... 
Do not break raw sockets */ + if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && + af == AF_INET)) { + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(skb->sk); + + if (inet && sk->sk_family == PF_INET && inet->nodefrag) + return NF_ACCEPT; + } + + if (unlikely(!skb_dst(skb))) + return NF_ACCEPT; + + net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { + int related; + int verdict = ip_vs_out_icmp_v6(skb, &related, + hooknum); + + if (related) + return verdict; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + } else +#endif + if (unlikely(iph.protocol == IPPROTO_ICMP)) { + int related; + int verdict = ip_vs_out_icmp(skb, &related, hooknum); + + if (related) + return verdict; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + + pd = ip_vs_proto_data_get(net, iph.protocol); + if (unlikely(!pd)) + return NF_ACCEPT; + pp = pd->pp; + + /* reassemble IP fragments */ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { + if (ip_vs_gather_frags_v6(skb, + ip_vs_defrag_user(hooknum))) + return NF_STOLEN; + } + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } else +#endif + if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && + !pp->dont_defrag)) { + if (ip_vs_gather_frags(skb, + ip_vs_defrag_user(hooknum))) + return NF_STOLEN; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + + /* + * Check if the packet belongs to an existing entry + */ + cp = pp->conn_out_get(af, skb, &iph, iph.len, 0); + + if (likely(cp)) + return handle_response(af, skb, pd, cp, iph.len); + if (sysctl_nat_icmp_send(net) && + (pp->protocol == IPPROTO_TCP || + pp->protocol == IPPROTO_UDP || + pp->protocol == IPPROTO_SCTP)) { + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, iph.len, + sizeof(_ports), _ports); + if (pptr == NULL) + return NF_ACCEPT; /* Not for me */ + if (ip_vs_lookup_real_service(net, af, iph.protocol, + &iph.saddr, + pptr[0])) { + /* + * Notify the real server: there is no + * existing entry if it is not RST + * packet or not TCP packet. + */ + if ((iph.protocol != IPPROTO_TCP && + iph.protocol != IPPROTO_SCTP) + || ((iph.protocol == IPPROTO_TCP + && !is_tcp_reset(skb, iph.len)) + || (iph.protocol == IPPROTO_SCTP + && !is_sctp_abort(skb, + iph.len)))) { +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + struct net *net = + dev_net(skb_dst(skb)->dev); + + if (!skb->dev) + skb->dev = net->loopback_dev; + icmpv6_send(skb, + ICMPV6_DEST_UNREACH, + ICMPV6_PORT_UNREACH, + 0); + } else +#endif + icmp_send(skb, + ICMP_DEST_UNREACH, + ICMP_PORT_UNREACH, 0); + return NF_DROP; + } + } + } + IP_VS_DBG_PKT(12, af, pp, skb, 0, + "ip_vs_out: packet continues traversal as normal"); + return NF_ACCEPT; +} + +/* + * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, + * used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_out(hooknum, skb, AF_INET); +} + +/* + * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. 
+ */ +static unsigned int +ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int verdict; + + /* Disable BH in LOCAL_OUT until all places are fixed */ + local_bh_disable(); + verdict = ip_vs_out(hooknum, skb, AF_INET); + local_bh_enable(); + return verdict; +} + +#ifdef CONFIG_IP_VS_IPV6 + +/* + * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, + * used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_out(hooknum, skb, AF_INET6); +} + +/* + * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int verdict; + + /* Disable BH in LOCAL_OUT until all places are fixed */ + local_bh_disable(); + verdict = ip_vs_out(hooknum, skb, AF_INET6); + local_bh_enable(); + return verdict; +} + +#endif + +/* + * Handle ICMP messages in the outside-to-inside direction (incoming). + * Find any that might be relevant, check against existing connections, + * forward to the right destination host if relevant. + * Currently handles error types - unreachable, quench, ttl exceeded. + */ +static int +ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) +{ + struct net *net = NULL; + struct iphdr *iph; + struct icmphdr _icmph, *ic; + struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; + unsigned int offset, ihl, verdict; + + *related = 1; + + /* reassemble IP fragments */ + if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { + if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) + return NF_STOLEN; + } + + iph = ip_hdr(skb); + offset = ihl = iph->ihl * 4; + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n", + ic->type, ntohs(icmp_id(ic)), + &iph->saddr, &iph->daddr); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->type != ICMP_DEST_UNREACH) && + (ic->type != ICMP_SOURCE_QUENCH) && + (ic->type != ICMP_TIME_EXCEEDED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + net = skb_net(skb); + + pd = ip_vs_proto_data_get(net, cih->protocol); + if (!pd) + return NF_ACCEPT; + pp = pd->pp; + + /* Is the embedded protocol header present? 
*/ + if (unlikely(cih->frag_off & htons(IP_OFFSET) && + pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, + "Checking incoming ICMP for"); + + offset += cih->ihl * 4; + + ip_vs_fill_iphdr(AF_INET, cih, &ciph); + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, 1); + if (!cp) + return NF_ACCEPT; + + verdict = NF_DROP; + + /* Ensure the checksum is correct */ + if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { + /* Failed checksum! */ + IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n", + &iph->saddr); + goto out; + } + + /* do the statistics and put it back */ + ip_vs_in_stats(cp, skb); + if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) + offset += 2 * sizeof(__u16); + verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum); + + out: + __ip_vs_conn_put(cp); + + return verdict; +} + +#ifdef CONFIG_IP_VS_IPV6 +static int +ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) +{ + struct net *net = NULL; + struct ipv6hdr *iph; + struct icmp6hdr _icmph, *ic; + struct ipv6hdr _ciph, *cih; /* The ip header contained + within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; + unsigned int offset, verdict; + + *related = 1; + + /* reassemble IP fragments */ + if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { + if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum))) + return NF_STOLEN; + } + + iph = ipv6_hdr(skb); + offset = sizeof(struct ipv6hdr); + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n", + ic->icmp6_type, ntohs(icmpv6_id(ic)), + &iph->saddr, &iph->daddr); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && + (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && + (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + net = skb_net(skb); + pd = ip_vs_proto_data_get(net, cih->nexthdr); + if (!pd) + return NF_ACCEPT; + pp = pd->pp; + + /* Is the embedded protocol header present? 
*/ + /* TODO: we don't support fragmentation at the moment anyways */ + if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, + "Checking incoming ICMPv6 for"); + + offset += sizeof(struct ipv6hdr); + + ip_vs_fill_iphdr(AF_INET6, cih, &ciph); + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_in_get(AF_INET6, skb, &ciph, offset, 1); + if (!cp) + return NF_ACCEPT; + + /* do the statistics and put it back */ + ip_vs_in_stats(cp, skb); + if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr || + IPPROTO_SCTP == cih->nexthdr) + offset += 2 * sizeof(__u16); + verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum); + + __ip_vs_conn_put(cp); + + return verdict; +} +#endif + + +/* + * Check if it's for virtual services, look it up, + * and send it on its way... + */ +static unsigned int +ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) +{ + struct net *net; + struct ip_vs_iphdr iph; + struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; + struct ip_vs_conn *cp; + int ret, restart, pkts; + struct netns_ipvs *ipvs; + + /* Already marked as IPVS request or reply? */ + if (skb->ipvs_property) + return NF_ACCEPT; + + /* + * Big tappo: + * - remote client: only PACKET_HOST + * - route: used for struct net when skb->dev is unset + */ + if (unlikely((skb->pkt_type != PACKET_HOST && + hooknum != NF_INET_LOCAL_OUT) || + !skb_dst(skb))) { + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s" + " ignored in hook %u\n", + skb->pkt_type, iph.protocol, + IP_VS_DBG_ADDR(af, &iph.daddr), hooknum); + return NF_ACCEPT; + } + /* ipvs enabled in this netns ? */ + net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + + /* Bad... Do not break raw sockets */ + if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && + af == AF_INET)) { + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(skb->sk); + + if (inet && sk->sk_family == PF_INET && inet->nodefrag) + return NF_ACCEPT; + } + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { + int related; + int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum); + + if (related) + return verdict; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + } else +#endif + if (unlikely(iph.protocol == IPPROTO_ICMP)) { + int related; + int verdict = ip_vs_in_icmp(skb, &related, hooknum); + + if (related) + return verdict; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + + /* Protocol supported? 
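+	 * Look up the per-netns protocol data; packets of protocols that
+	 * IPVS does not handle are accepted untouched.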
*/ + pd = ip_vs_proto_data_get(net, iph.protocol); + if (unlikely(!pd)) + return NF_ACCEPT; + pp = pd->pp; + /* + * Check if the packet belongs to an existing connection entry + */ + cp = pp->conn_in_get(af, skb, &iph, iph.len, 0); + + if (unlikely(!cp)) { + int v; + + if (!pp->conn_schedule(af, skb, pd, &v, &cp)) + return v; + } + + if (unlikely(!cp)) { + /* sorry, all this trouble for a no-hit :) */ + IP_VS_DBG_PKT(12, af, pp, skb, 0, + "ip_vs_in: packet continues traversal as normal"); + return NF_ACCEPT; + } + + IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); + ipvs = net_ipvs(net); + /* Check the server status */ + if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { + /* the destination server is not available */ + + if (sysctl_expire_nodest_conn(ipvs)) { + /* try to expire the connection immediately */ + ip_vs_conn_expire_now(cp); + } + /* don't restart its timer, and silently + drop the packet. */ + __ip_vs_conn_put(cp); + return NF_DROP; + } + + ip_vs_in_stats(cp, skb); + restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); + if (cp->packet_xmit) + ret = cp->packet_xmit(skb, cp, pp); + /* do not touch skb anymore */ + else { + IP_VS_DBG_RL("warning: packet_xmit is null"); + ret = NF_ACCEPT; + } + + /* Increase its packet counter and check if it is needed + * to be synchronized + * + * Sync connection if it is about to close to + * encorage the standby servers to update the connections timeout + * + * For ONE_PKT let ip_vs_sync_conn() do the filter work. + */ + + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + pkts = sysctl_sync_threshold(ipvs); + else + pkts = atomic_add_return(1, &cp->in_pkts); + + if ((ipvs->sync_state & IP_VS_STATE_MASTER) && + cp->protocol == IPPROTO_SCTP) { + if ((cp->state == IP_VS_SCTP_S_ESTABLISHED && + (pkts % sysctl_sync_period(ipvs) + == sysctl_sync_threshold(ipvs))) || + (cp->old_state != cp->state && + ((cp->state == IP_VS_SCTP_S_CLOSED) || + (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) || + (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) { + ip_vs_sync_conn(net, cp); + goto out; + } + } + + /* Keep this block last: TCP and others with pp->num_states <= 1 */ + else if ((ipvs->sync_state & IP_VS_STATE_MASTER) && + (((cp->protocol != IPPROTO_TCP || + cp->state == IP_VS_TCP_S_ESTABLISHED) && + (pkts % sysctl_sync_period(ipvs) + == sysctl_sync_threshold(ipvs))) || + ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && + ((cp->state == IP_VS_TCP_S_FIN_WAIT) || + (cp->state == IP_VS_TCP_S_CLOSE) || + (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || + (cp->state == IP_VS_TCP_S_TIME_WAIT))))) + ip_vs_sync_conn(net, cp); +out: + cp->old_state = cp->state; + + ip_vs_conn_put(cp); + return ret; +} + +/* + * AF_INET handler in NF_INET_LOCAL_IN chain + * Schedule and forward packets from remote clients + */ +static unsigned int +ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_in(hooknum, skb, AF_INET); +} + +/* + * AF_INET handler in NF_INET_LOCAL_OUT chain + * Schedule and forward packets from local clients + */ +static unsigned int +ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int verdict; + + /* Disable BH in LOCAL_OUT until all places are fixed */ + local_bh_disable(); + verdict = ip_vs_in(hooknum, skb, AF_INET); + local_bh_enable(); + return verdict; +} + +#ifdef CONFIG_IP_VS_IPV6 + +/* + * 
AF_INET6 handler in NF_INET_LOCAL_IN chain + * Schedule and forward packets from remote clients + */ +static unsigned int +ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_in(hooknum, skb, AF_INET6); +} + +/* + * AF_INET6 handler in NF_INET_LOCAL_OUT chain + * Schedule and forward packets from local clients + */ +static unsigned int +ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int verdict; + + /* Disable BH in LOCAL_OUT until all places are fixed */ + local_bh_disable(); + verdict = ip_vs_in(hooknum, skb, AF_INET6); + local_bh_enable(); + return verdict; +} + +#endif + + +/* + * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP + * related packets destined for 0.0.0.0/0. + * When fwmark-based virtual service is used, such as transparent + * cache cluster, TCP packets can be marked and routed to ip_vs_in, + * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and + * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain + * and send them to ip_vs_in_icmp. + */ +static unsigned int +ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + int r; + struct net *net; + + if (ip_hdr(skb)->protocol != IPPROTO_ICMP) + return NF_ACCEPT; + + /* ipvs enabled in this netns ? */ + net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + + return ip_vs_in_icmp(skb, &r, hooknum); +} + +#ifdef CONFIG_IP_VS_IPV6 +static unsigned int +ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + int r; + struct net *net; + + if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) + return NF_ACCEPT; + + /* ipvs enabled in this netns ? */ + net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + + return ip_vs_in_icmp_v6(skb, &r, hooknum); +} +#endif + + +static struct nf_hook_ops ip_vs_ops[] __read_mostly = { + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_reply4, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_NAT_SRC - 2, + }, + /* After packet filtering, forward packet through VS/DR, VS/TUN, + * or VS/NAT(change destination), so that filtering rules can be + * applied to IPVS. 
*/ + { + .hook = ip_vs_remote_request4, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_NAT_SRC - 1, + }, + /* Before ip_vs_in, change source only for VS/NAT */ + { + .hook = ip_vs_local_reply4, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP_PRI_NAT_DST + 1, + }, + /* After mangle, schedule and forward local requests */ + { + .hook = ip_vs_local_request4, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP_PRI_NAT_DST + 2, + }, + /* After packet filtering (but before ip_vs_out_icmp), catch icmp + * destined for 0.0.0.0/0, which is for incoming IPVS connections */ + { + .hook = ip_vs_forward_icmp, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_FORWARD, + .priority = 99, + }, + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_reply4, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_FORWARD, + .priority = 100, + }, +#ifdef CONFIG_IP_VS_IPV6 + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_reply6, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP6_PRI_NAT_SRC - 2, + }, + /* After packet filtering, forward packet through VS/DR, VS/TUN, + * or VS/NAT(change destination), so that filtering rules can be + * applied to IPVS. */ + { + .hook = ip_vs_remote_request6, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP6_PRI_NAT_SRC - 1, + }, + /* Before ip_vs_in, change source only for VS/NAT */ + { + .hook = ip_vs_local_reply6, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP6_PRI_NAT_DST + 1, + }, + /* After mangle, schedule and forward local requests */ + { + .hook = ip_vs_local_request6, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP6_PRI_NAT_DST + 2, + }, + /* After packet filtering (but before ip_vs_out_icmp), catch icmp + * destined for 0.0.0.0/0, which is for incoming IPVS connections */ + { + .hook = ip_vs_forward_icmp_v6, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_FORWARD, + .priority = 99, + }, + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_reply6, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_FORWARD, + .priority = 100, + }, +#endif +}; +/* + * Initialize IP Virtual Server netns mem. 
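+ * The per-netns ipvs structure is obtained via net_generic() and the
+ * subsystems are brought up in order (estimator, control, protocol,
+ * app, conn, sync); on failure the ones already initialized are torn
+ * down again in reverse order.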
+ */ +static int __net_init __ip_vs_init(struct net *net) +{ + struct netns_ipvs *ipvs; + + ipvs = net_generic(net, ip_vs_net_id); + if (ipvs == NULL) { + pr_err("%s(): no memory.\n", __func__); + return -ENOMEM; + } + /* Hold the beast until a service is registerd */ + ipvs->enable = 0; + ipvs->net = net; + /* Counters used for creating unique names */ + ipvs->gen = atomic_read(&ipvs_netns_cnt); + atomic_inc(&ipvs_netns_cnt); + net->ipvs = ipvs; + + if (__ip_vs_estimator_init(net) < 0) + goto estimator_fail; + + if (__ip_vs_control_init(net) < 0) + goto control_fail; + + if (__ip_vs_protocol_init(net) < 0) + goto protocol_fail; + + if (__ip_vs_app_init(net) < 0) + goto app_fail; + + if (__ip_vs_conn_init(net) < 0) + goto conn_fail; + + if (__ip_vs_sync_init(net) < 0) + goto sync_fail; + + printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n", + sizeof(struct netns_ipvs), ipvs->gen); + return 0; +/* + * Error handling + */ + +sync_fail: + __ip_vs_conn_cleanup(net); +conn_fail: + __ip_vs_app_cleanup(net); +app_fail: + __ip_vs_protocol_cleanup(net); +protocol_fail: + __ip_vs_control_cleanup(net); +control_fail: + __ip_vs_estimator_cleanup(net); +estimator_fail: + return -ENOMEM; +} + +static void __net_exit __ip_vs_cleanup(struct net *net) +{ + __ip_vs_service_cleanup(net); /* ip_vs_flush() with locks */ + __ip_vs_conn_cleanup(net); + __ip_vs_app_cleanup(net); + __ip_vs_protocol_cleanup(net); + __ip_vs_control_cleanup(net); + __ip_vs_estimator_cleanup(net); + IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen); +} + +static void __net_exit __ip_vs_dev_cleanup(struct net *net) +{ + EnterFunction(2); + net_ipvs(net)->enable = 0; /* Disable packet reception */ + smp_wmb(); + __ip_vs_sync_cleanup(net); + LeaveFunction(2); +} + +static struct pernet_operations ipvs_core_ops = { + .init = __ip_vs_init, + .exit = __ip_vs_cleanup, + .id = &ip_vs_net_id, + .size = sizeof(struct netns_ipvs), +}; + +static struct pernet_operations ipvs_core_dev_ops = { + .exit = __ip_vs_dev_cleanup, +}; + +/* + * Initialize IP Virtual Server + */ +static int __init ip_vs_init(void) +{ + int ret; + + ip_vs_estimator_init(); + ret = ip_vs_control_init(); + if (ret < 0) { + pr_err("can't setup control.\n"); + goto cleanup_estimator; + } + + ip_vs_protocol_init(); + + ret = ip_vs_app_init(); + if (ret < 0) { + pr_err("can't setup application helper.\n"); + goto cleanup_protocol; + } + + ret = ip_vs_conn_init(); + if (ret < 0) { + pr_err("can't setup connection table.\n"); + goto cleanup_app; + } + + ret = ip_vs_sync_init(); + if (ret < 0) { + pr_err("can't setup sync data.\n"); + goto cleanup_conn; + } + + ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ + if (ret < 0) + goto cleanup_sync; + + ret = register_pernet_device(&ipvs_core_dev_ops); + if (ret < 0) + goto cleanup_sub; + + ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + if (ret < 0) { + pr_err("can't register hooks.\n"); + goto cleanup_dev; + } + + pr_info("ipvs loaded.\n"); + + return ret; + +cleanup_dev: + unregister_pernet_device(&ipvs_core_dev_ops); +cleanup_sub: + unregister_pernet_subsys(&ipvs_core_ops); +cleanup_sync: + ip_vs_sync_cleanup(); + cleanup_conn: + ip_vs_conn_cleanup(); + cleanup_app: + ip_vs_app_cleanup(); + cleanup_protocol: + ip_vs_protocol_cleanup(); + ip_vs_control_cleanup(); + cleanup_estimator: + ip_vs_estimator_cleanup(); + return ret; +} + +static void __exit ip_vs_cleanup(void) +{ + nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + unregister_pernet_device(&ipvs_core_dev_ops); + 
unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ + ip_vs_sync_cleanup(); + ip_vs_conn_cleanup(); + ip_vs_app_cleanup(); + ip_vs_protocol_cleanup(); + ip_vs_control_cleanup(); + ip_vs_estimator_cleanup(); + pr_info("ipvs unloaded.\n"); +} + +module_init(ip_vs_init); +module_exit(ip_vs_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c new file mode 100644 index 00000000..a178cb34 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -0,0 +1,3778 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the NetFilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang + * Peter Kese + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#ifdef CONFIG_IP_VS_IPV6 +#include +#include +#endif +#include +#include +#include + +#include + +#include + +/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ +static DEFINE_MUTEX(__ip_vs_mutex); + +/* lock for service table */ +static DEFINE_RWLOCK(__ip_vs_svc_lock); + +/* sysctl variables */ + +#ifdef CONFIG_IP_VS_DEBUG +static int sysctl_ip_vs_debug_level = 0; + +int ip_vs_get_debug_level(void) +{ + return sysctl_ip_vs_debug_level; +} +#endif + + +/* Protos */ +static void __ip_vs_del_service(struct ip_vs_service *svc); + + +#ifdef CONFIG_IP_VS_IPV6 +/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? 
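+ * The address is treated as local when a route lookup for it resolves
+ * to a device with the IFF_LOOPBACK flag set.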
*/ +static int __ip_vs_addr_is_local_v6(struct net *net, + const struct in6_addr *addr) +{ + struct rt6_info *rt; + struct flowi6 fl6 = { + .daddr = *addr, + }; + + rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6); + if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK)) + return 1; + + return 0; +} +#endif + +#ifdef CONFIG_SYSCTL +/* + * update_defense_level is called from keventd and from sysctl, + * so it needs to protect itself from softirqs + */ +static void update_defense_level(struct netns_ipvs *ipvs) +{ + struct sysinfo i; + static int old_secure_tcp = 0; + int availmem; + int nomem; + int to_change = -1; + + /* we only count free and buffered memory (in pages) */ + si_meminfo(&i); + availmem = i.freeram + i.bufferram; + /* however in linux 2.5 the i.bufferram is total page cache size, + we need adjust it */ + /* si_swapinfo(&i); */ + /* availmem = availmem - (i.totalswap - i.freeswap); */ + + nomem = (availmem < ipvs->sysctl_amemthresh); + + local_bh_disable(); + + /* drop_entry */ + spin_lock(&ipvs->dropentry_lock); + switch (ipvs->sysctl_drop_entry) { + case 0: + atomic_set(&ipvs->dropentry, 0); + break; + case 1: + if (nomem) { + atomic_set(&ipvs->dropentry, 1); + ipvs->sysctl_drop_entry = 2; + } else { + atomic_set(&ipvs->dropentry, 0); + } + break; + case 2: + if (nomem) { + atomic_set(&ipvs->dropentry, 1); + } else { + atomic_set(&ipvs->dropentry, 0); + ipvs->sysctl_drop_entry = 1; + }; + break; + case 3: + atomic_set(&ipvs->dropentry, 1); + break; + } + spin_unlock(&ipvs->dropentry_lock); + + /* drop_packet */ + spin_lock(&ipvs->droppacket_lock); + switch (ipvs->sysctl_drop_packet) { + case 0: + ipvs->drop_rate = 0; + break; + case 1: + if (nomem) { + ipvs->drop_rate = ipvs->drop_counter + = ipvs->sysctl_amemthresh / + (ipvs->sysctl_amemthresh-availmem); + ipvs->sysctl_drop_packet = 2; + } else { + ipvs->drop_rate = 0; + } + break; + case 2: + if (nomem) { + ipvs->drop_rate = ipvs->drop_counter + = ipvs->sysctl_amemthresh / + (ipvs->sysctl_amemthresh-availmem); + } else { + ipvs->drop_rate = 0; + ipvs->sysctl_drop_packet = 1; + } + break; + case 3: + ipvs->drop_rate = ipvs->sysctl_am_droprate; + break; + } + spin_unlock(&ipvs->droppacket_lock); + + /* secure_tcp */ + spin_lock(&ipvs->securetcp_lock); + switch (ipvs->sysctl_secure_tcp) { + case 0: + if (old_secure_tcp >= 2) + to_change = 0; + break; + case 1: + if (nomem) { + if (old_secure_tcp < 2) + to_change = 1; + ipvs->sysctl_secure_tcp = 2; + } else { + if (old_secure_tcp >= 2) + to_change = 0; + } + break; + case 2: + if (nomem) { + if (old_secure_tcp < 2) + to_change = 1; + } else { + if (old_secure_tcp >= 2) + to_change = 0; + ipvs->sysctl_secure_tcp = 1; + } + break; + case 3: + if (old_secure_tcp < 2) + to_change = 1; + break; + } + old_secure_tcp = ipvs->sysctl_secure_tcp; + if (to_change >= 0) + ip_vs_protocol_timeout_change(ipvs, + ipvs->sysctl_secure_tcp > 1); + spin_unlock(&ipvs->securetcp_lock); + + local_bh_enable(); +} + + +/* + * Timer for checking the defense + */ +#define DEFENSE_TIMER_PERIOD 1*HZ + +static void defense_work_handler(struct work_struct *work) +{ + struct netns_ipvs *ipvs = + container_of(work, struct netns_ipvs, defense_work.work); + + update_defense_level(ipvs); + if (atomic_read(&ipvs->dropentry)) + ip_vs_random_dropentry(ipvs->net); + schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); +} +#endif + +int +ip_vs_use_count_inc(void) +{ + return try_module_get(THIS_MODULE); +} + +void +ip_vs_use_count_dec(void) +{ + module_put(THIS_MODULE); +} + + +/* + * 
Hash table: for virtual service lookups + */ +#define IP_VS_SVC_TAB_BITS 8 +#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) +#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) + +/* the service table hashed by */ +static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; +/* the service table hashed by fwmark */ +static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; + + +/* + * Returns hash value for virtual service + */ +static inline unsigned +ip_vs_svc_hashkey(struct net *net, int af, unsigned proto, + const union nf_inet_addr *addr, __be16 port) +{ + register unsigned porth = ntohs(port); + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + addr_fold ^= ((size_t)net>>8); + + return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth) + & IP_VS_SVC_TAB_MASK; +} + +/* + * Returns hash value of fwmark for virtual service lookup + */ +static inline unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark) +{ + return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; +} + +/* + * Hashes a service in the ip_vs_svc_table by + * or in the ip_vs_svc_fwm_table by fwmark. + * Should be called with locked tables. + */ +static int ip_vs_svc_hash(struct ip_vs_service *svc) +{ + unsigned hash; + + if (svc->flags & IP_VS_SVC_F_HASHED) { + pr_err("%s(): request for already hashed, called from %pF\n", + __func__, __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* + * Hash it by in ip_vs_svc_table + */ + hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol, + &svc->addr, svc->port); + list_add(&svc->s_list, &ip_vs_svc_table[hash]); + } else { + /* + * Hash it by fwmark in svc_fwm_table + */ + hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark); + list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); + } + + svc->flags |= IP_VS_SVC_F_HASHED; + /* increase its refcnt because it is referenced by the svc table */ + atomic_inc(&svc->refcnt); + return 1; +} + + +/* + * Unhashes a service from svc_table / svc_fwm_table. + * Should be called with locked tables. + */ +static int ip_vs_svc_unhash(struct ip_vs_service *svc) +{ + if (!(svc->flags & IP_VS_SVC_F_HASHED)) { + pr_err("%s(): request for unhash flagged, called from %pF\n", + __func__, __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* Remove it from the svc_table table */ + list_del(&svc->s_list); + } else { + /* Remove it from the svc_fwm_table table */ + list_del(&svc->f_list); + } + + svc->flags &= ~IP_VS_SVC_F_HASHED; + atomic_dec(&svc->refcnt); + return 1; +} + + +/* + * Get service by {netns, proto,addr,port} in the service table. + */ +static inline struct ip_vs_service * +__ip_vs_service_find(struct net *net, int af, __u16 protocol, + const union nf_inet_addr *vaddr, __be16 vport) +{ + unsigned hash; + struct ip_vs_service *svc; + + /* Check for "full" addressed entries */ + hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport); + + list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ + if ((svc->af == af) + && ip_vs_addr_equal(af, &svc->addr, vaddr) + && (svc->port == vport) + && (svc->protocol == protocol) + && net_eq(svc->net, net)) { + /* HIT */ + return svc; + } + } + + return NULL; +} + + +/* + * Get service by {fwmark} in the service table. 
+ */ +static inline struct ip_vs_service * +__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark) +{ + unsigned hash; + struct ip_vs_service *svc; + + /* Check for fwmark addressed entries */ + hash = ip_vs_svc_fwm_hashkey(net, fwmark); + + list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { + if (svc->fwmark == fwmark && svc->af == af + && net_eq(svc->net, net)) { + /* HIT */ + return svc; + } + } + + return NULL; +} + +struct ip_vs_service * +ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol, + const union nf_inet_addr *vaddr, __be16 vport) +{ + struct ip_vs_service *svc; + struct netns_ipvs *ipvs = net_ipvs(net); + + read_lock(&__ip_vs_svc_lock); + + /* + * Check the table hashed by fwmark first + */ + if (fwmark) { + svc = __ip_vs_svc_fwm_find(net, af, fwmark); + if (svc) + goto out; + } + + /* + * Check the table hashed by + * for "full" addressed entries + */ + svc = __ip_vs_service_find(net, af, protocol, vaddr, vport); + + if (svc == NULL + && protocol == IPPROTO_TCP + && atomic_read(&ipvs->ftpsvc_counter) + && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) { + /* + * Check if ftp service entry exists, the packet + * might belong to FTP data connections. + */ + svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT); + } + + if (svc == NULL + && atomic_read(&ipvs->nullsvc_counter)) { + /* + * Check if the catch-all port (port zero) exists + */ + svc = __ip_vs_service_find(net, af, protocol, vaddr, 0); + } + + out: + if (svc) + atomic_inc(&svc->usecnt); + read_unlock(&__ip_vs_svc_lock); + + IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", + fwmark, ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), + svc ? "hit" : "not hit"); + + return svc; +} + + +static inline void +__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + atomic_inc(&svc->refcnt); + dest->svc = svc; +} + +static void +__ip_vs_unbind_svc(struct ip_vs_dest *dest) +{ + struct ip_vs_service *svc = dest->svc; + + dest->svc = NULL; + if (atomic_dec_and_test(&svc->refcnt)) { + IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n", + svc->fwmark, + IP_VS_DBG_ADDR(svc->af, &svc->addr), + ntohs(svc->port), atomic_read(&svc->usecnt)); + free_percpu(svc->stats.cpustats); + kfree(svc); + } +} + + +/* + * Returns hash value for real service + */ +static inline unsigned ip_vs_rs_hashkey(int af, + const union nf_inet_addr *addr, + __be16 port) +{ + register unsigned porth = ntohs(port); + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + + return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth) + & IP_VS_RTAB_MASK; +} + +/* + * Hashes ip_vs_dest in rs_table by . + * should be called with locked tables. + */ +static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) +{ + unsigned hash; + + if (!list_empty(&dest->d_list)) { + return 0; + } + + /* + * Hash by proto,addr,port, + * which are the parameters of the real service. + */ + hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); + + list_add(&dest->d_list, &ipvs->rs_table[hash]); + + return 1; +} + +/* + * UNhashes ip_vs_dest from rs_table. + * should be called with locked tables. + */ +static int ip_vs_rs_unhash(struct ip_vs_dest *dest) +{ + /* + * Remove it from the rs_table table. 
+ */ + if (!list_empty(&dest->d_list)) { + list_del(&dest->d_list); + INIT_LIST_HEAD(&dest->d_list); + } + + return 1; +} + +/* + * Lookup real service by in the real service table. + */ +struct ip_vs_dest * +ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol, + const union nf_inet_addr *daddr, + __be16 dport) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + unsigned hash; + struct ip_vs_dest *dest; + + /* + * Check for "full" addressed entries + * Return the first found entry + */ + hash = ip_vs_rs_hashkey(af, daddr, dport); + + read_lock(&ipvs->rs_lock); + list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) { + if ((dest->af == af) + && ip_vs_addr_equal(af, &dest->addr, daddr) + && (dest->port == dport) + && ((dest->protocol == protocol) || + dest->vfwmark)) { + /* HIT */ + read_unlock(&ipvs->rs_lock); + return dest; + } + } + read_unlock(&ipvs->rs_lock); + + return NULL; +} + +/* + * Lookup destination by {addr,port} in the given service + */ +static struct ip_vs_dest * +ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, + __be16 dport) +{ + struct ip_vs_dest *dest; + + /* + * Find the destination for the given service + */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if ((dest->af == svc->af) + && ip_vs_addr_equal(svc->af, &dest->addr, daddr) + && (dest->port == dport)) { + /* HIT */ + return dest; + } + } + + return NULL; +} + +/* + * Find destination by {daddr,dport,vaddr,protocol} + * Cretaed to be used in ip_vs_process_message() in + * the backup synchronization daemon. It finds the + * destination to be bound to the received connection + * on the backup. + * + * ip_vs_lookup_real_service() looked promissing, but + * seems not working as expected. + */ +struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, + const union nf_inet_addr *daddr, + __be16 dport, + const union nf_inet_addr *vaddr, + __be16 vport, __u16 protocol, __u32 fwmark) +{ + struct ip_vs_dest *dest; + struct ip_vs_service *svc; + + svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport); + if (!svc) + return NULL; + dest = ip_vs_lookup_dest(svc, daddr, dport); + if (dest) + atomic_inc(&dest->refcnt); + ip_vs_service_put(svc); + return dest; +} + +/* + * Lookup dest by {svc,addr,port} in the destination trash. + * The destination trash is used to hold the destinations that are removed + * from the service table but are still referenced by some conn entries. + * The reason to add the destination trash is when the dest is temporary + * down (either by administrator or by monitor program), the dest can be + * picked back from the trash, the remaining connections to the dest can + * continue, and the counting information of the dest is also useful for + * scheduling. 
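+ * While the trash is being searched, entries that are no longer
+ * referenced (refcnt == 1) are purged and freed on the spot.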
+ */ +static struct ip_vs_dest * +ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, + __be16 dport) +{ + struct ip_vs_dest *dest, *nxt; + struct netns_ipvs *ipvs = net_ipvs(svc->net); + + /* + * Find the destination in trash + */ + list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) { + IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " + "dest->refcnt=%d\n", + dest->vfwmark, + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port), + atomic_read(&dest->refcnt)); + if (dest->af == svc->af && + ip_vs_addr_equal(svc->af, &dest->addr, daddr) && + dest->port == dport && + dest->vfwmark == svc->fwmark && + dest->protocol == svc->protocol && + (svc->fwmark || + (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && + dest->vport == svc->port))) { + /* HIT */ + return dest; + } + + /* + * Try to purge the destination from trash if not referenced + */ + if (atomic_read(&dest->refcnt) == 1) { + IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u " + "from trash\n", + dest->vfwmark, + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port)); + list_del(&dest->n_list); + ip_vs_dst_reset(dest); + __ip_vs_unbind_svc(dest); + free_percpu(dest->stats.cpustats); + kfree(dest); + } + } + + return NULL; +} + + +/* + * Clean up all the destinations in the trash + * Called by the ip_vs_control_cleanup() + * + * When the ip_vs_control_clearup is activated by ipvs module exit, + * the service tables must have been flushed and all the connections + * are expired, and the refcnt of each destination in the trash must + * be 1, so we simply release them here. + */ +static void ip_vs_trash_cleanup(struct net *net) +{ + struct ip_vs_dest *dest, *nxt; + struct netns_ipvs *ipvs = net_ipvs(net); + + list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) { + list_del(&dest->n_list); + ip_vs_dst_reset(dest); + __ip_vs_unbind_svc(dest); + free_percpu(dest->stats.cpustats); + kfree(dest); + } +} + +static void +ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) +{ +#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c + + spin_lock_bh(&src->lock); + + IP_VS_SHOW_STATS_COUNTER(conns); + IP_VS_SHOW_STATS_COUNTER(inpkts); + IP_VS_SHOW_STATS_COUNTER(outpkts); + IP_VS_SHOW_STATS_COUNTER(inbytes); + IP_VS_SHOW_STATS_COUNTER(outbytes); + + ip_vs_read_estimator(dst, src); + + spin_unlock_bh(&src->lock); +} + +static void +ip_vs_zero_stats(struct ip_vs_stats *stats) +{ + spin_lock_bh(&stats->lock); + + /* get current counters as zero point, rates are zeroed */ + +#define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c + + IP_VS_ZERO_STATS_COUNTER(conns); + IP_VS_ZERO_STATS_COUNTER(inpkts); + IP_VS_ZERO_STATS_COUNTER(outpkts); + IP_VS_ZERO_STATS_COUNTER(inbytes); + IP_VS_ZERO_STATS_COUNTER(outbytes); + + ip_vs_zero_estimator(stats); + + spin_unlock_bh(&stats->lock); +} + +/* + * Update a destination in the given service + */ +static void +__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, + struct ip_vs_dest_user_kern *udest, int add) +{ + struct netns_ipvs *ipvs = net_ipvs(svc->net); + int conn_flags; + + /* set the weight and the flags */ + atomic_set(&dest->weight, udest->weight); + conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; + conn_flags |= IP_VS_CONN_F_INACTIVE; + + /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ + if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { + conn_flags |= IP_VS_CONN_F_NOOUTPUT; + } else { + /* + * Put the real service in 
rs_table if not present. + * For now only for NAT! + */ + write_lock_bh(&ipvs->rs_lock); + ip_vs_rs_hash(ipvs, dest); + write_unlock_bh(&ipvs->rs_lock); + } + atomic_set(&dest->conn_flags, conn_flags); + + /* bind the service */ + if (!dest->svc) { + __ip_vs_bind_svc(dest, svc); + } else { + if (dest->svc != svc) { + __ip_vs_unbind_svc(dest); + ip_vs_zero_stats(&dest->stats); + __ip_vs_bind_svc(dest, svc); + } + } + + /* set the dest status flags */ + dest->flags |= IP_VS_DEST_F_AVAILABLE; + + if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + dest->u_threshold = udest->u_threshold; + dest->l_threshold = udest->l_threshold; + + spin_lock_bh(&dest->dst_lock); + ip_vs_dst_reset(dest); + spin_unlock_bh(&dest->dst_lock); + + if (add) + ip_vs_start_estimator(svc->net, &dest->stats); + + write_lock_bh(&__ip_vs_svc_lock); + + /* Wait until all other svc users go away */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + + if (add) { + list_add(&dest->n_list, &svc->destinations); + svc->num_dests++; + } + + /* call the update_service, because server weight may be changed */ + if (svc->scheduler->update_service) + svc->scheduler->update_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); +} + + +/* + * Create a destination for the given service + */ +static int +ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, + struct ip_vs_dest **dest_p) +{ + struct ip_vs_dest *dest; + unsigned atype; + + EnterFunction(2); + +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) { + atype = ipv6_addr_type(&udest->addr.in6); + if ((!(atype & IPV6_ADDR_UNICAST) || + atype & IPV6_ADDR_LINKLOCAL) && + !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6)) + return -EINVAL; + } else +#endif + { + atype = inet_addr_type(svc->net, udest->addr.ip); + if (atype != RTN_LOCAL && atype != RTN_UNICAST) + return -EINVAL; + } + + dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL); + if (dest == NULL) { + pr_err("%s(): no memory.\n", __func__); + return -ENOMEM; + } + dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!dest->stats.cpustats) { + pr_err("%s() alloc_percpu failed\n", __func__); + goto err_alloc; + } + + dest->af = svc->af; + dest->protocol = svc->protocol; + dest->vaddr = svc->addr; + dest->vport = svc->port; + dest->vfwmark = svc->fwmark; + ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr); + dest->port = udest->port; + + atomic_set(&dest->activeconns, 0); + atomic_set(&dest->inactconns, 0); + atomic_set(&dest->persistconns, 0); + atomic_set(&dest->refcnt, 1); + + INIT_LIST_HEAD(&dest->d_list); + spin_lock_init(&dest->dst_lock); + spin_lock_init(&dest->stats.lock); + __ip_vs_update_dest(svc, dest, udest, 1); + + *dest_p = dest; + + LeaveFunction(2); + return 0; + +err_alloc: + kfree(dest); + return -ENOMEM; +} + + +/* + * Add a destination into an existing service + */ +static int +ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) +{ + struct ip_vs_dest *dest; + union nf_inet_addr daddr; + __be16 dport = udest->port; + int ret; + + EnterFunction(2); + + if (udest->weight < 0) { + pr_err("%s(): server weight less than zero\n", __func__); + return -ERANGE; + } + + if (udest->l_threshold > udest->u_threshold) { + pr_err("%s(): lower threshold is higher than upper threshold\n", + __func__); + return -ERANGE; + } + + ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + + /* + * Check if the dest already exists in the list + */ + dest = ip_vs_lookup_dest(svc, &daddr, dport); + + if 
(dest != NULL) { + IP_VS_DBG(1, "%s(): dest already exists\n", __func__); + return -EEXIST; + } + + /* + * Check if the dest already exists in the trash and + * is from the same service + */ + dest = ip_vs_trash_get_dest(svc, &daddr, dport); + + if (dest != NULL) { + IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " + "dest->refcnt=%d, service %u/%s:%u\n", + IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport), + atomic_read(&dest->refcnt), + dest->vfwmark, + IP_VS_DBG_ADDR(svc->af, &dest->vaddr), + ntohs(dest->vport)); + + /* + * Get the destination from the trash + */ + list_del(&dest->n_list); + + __ip_vs_update_dest(svc, dest, udest, 1); + ret = 0; + } else { + /* + * Allocate and initialize the dest structure + */ + ret = ip_vs_new_dest(svc, udest, &dest); + } + LeaveFunction(2); + + return ret; +} + + +/* + * Edit a destination in the given service + */ +static int +ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) +{ + struct ip_vs_dest *dest; + union nf_inet_addr daddr; + __be16 dport = udest->port; + + EnterFunction(2); + + if (udest->weight < 0) { + pr_err("%s(): server weight less than zero\n", __func__); + return -ERANGE; + } + + if (udest->l_threshold > udest->u_threshold) { + pr_err("%s(): lower threshold is higher than upper threshold\n", + __func__); + return -ERANGE; + } + + ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + + /* + * Lookup the destination list + */ + dest = ip_vs_lookup_dest(svc, &daddr, dport); + + if (dest == NULL) { + IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__); + return -ENOENT; + } + + __ip_vs_update_dest(svc, dest, udest, 0); + LeaveFunction(2); + + return 0; +} + + +/* + * Delete a destination (must be already unlinked from the service) + */ +static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_stop_estimator(net, &dest->stats); + + /* + * Remove it from the d-linked list with the real services. + */ + write_lock_bh(&ipvs->rs_lock); + ip_vs_rs_unhash(dest); + write_unlock_bh(&ipvs->rs_lock); + + /* + * Decrease the refcnt of the dest, and free the dest + * if nobody refers to it (refcnt=0). Otherwise, throw + * the destination into the trash. + */ + if (atomic_dec_and_test(&dest->refcnt)) { + IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n", + dest->vfwmark, + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port)); + ip_vs_dst_reset(dest); + /* simply decrease svc->refcnt here, let the caller check + and release the service if nobody refers to it. + Only user context can release destination and service, + and only one user context can update virtual service at a + time, so the operation here is OK */ + atomic_dec(&dest->svc->refcnt); + free_percpu(dest->stats.cpustats); + kfree(dest); + } else { + IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, " + "dest->refcnt=%d\n", + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port), + atomic_read(&dest->refcnt)); + list_add(&dest->n_list, &ipvs->dest_trash); + atomic_inc(&dest->refcnt); + } +} + + +/* + * Unlink a destination from the given service + */ +static void __ip_vs_unlink_dest(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + int svcupd) +{ + dest->flags &= ~IP_VS_DEST_F_AVAILABLE; + + /* + * Remove it from the d-linked destination list. 
+ */ + list_del(&dest->n_list); + svc->num_dests--; + + /* + * Call the update_service function of its scheduler + */ + if (svcupd && svc->scheduler->update_service) + svc->scheduler->update_service(svc); +} + + +/* + * Delete a destination server in the given service + */ +static int +ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) +{ + struct ip_vs_dest *dest; + __be16 dport = udest->port; + + EnterFunction(2); + + dest = ip_vs_lookup_dest(svc, &udest->addr, dport); + + if (dest == NULL) { + IP_VS_DBG(1, "%s(): destination not found!\n", __func__); + return -ENOENT; + } + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + + /* + * Unlink dest from the service + */ + __ip_vs_unlink_dest(svc, dest, 1); + + write_unlock_bh(&__ip_vs_svc_lock); + + /* + * Delete the destination + */ + __ip_vs_del_dest(svc->net, dest); + + LeaveFunction(2); + + return 0; +} + + +/* + * Add a service into the service hash table + */ +static int +ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, + struct ip_vs_service **svc_p) +{ + int ret = 0; + struct ip_vs_scheduler *sched = NULL; + struct ip_vs_pe *pe = NULL; + struct ip_vs_service *svc = NULL; + struct netns_ipvs *ipvs = net_ipvs(net); + + /* increase the module use count */ + ip_vs_use_count_inc(); + + /* Lookup the scheduler by 'u->sched_name' */ + sched = ip_vs_scheduler_get(u->sched_name); + if (sched == NULL) { + pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); + ret = -ENOENT; + goto out_err; + } + + if (u->pe_name && *u->pe_name) { + pe = ip_vs_pe_getbyname(u->pe_name); + if (pe == NULL) { + pr_info("persistence engine module ip_vs_pe_%s " + "not found\n", u->pe_name); + ret = -ENOENT; + goto out_err; + } + } + +#ifdef CONFIG_IP_VS_IPV6 + if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) { + ret = -EINVAL; + goto out_err; + } +#endif + + svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL); + if (svc == NULL) { + IP_VS_DBG(1, "%s(): no memory\n", __func__); + ret = -ENOMEM; + goto out_err; + } + svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!svc->stats.cpustats) { + pr_err("%s() alloc_percpu failed\n", __func__); + goto out_err; + } + + /* I'm the first user of the service */ + atomic_set(&svc->usecnt, 0); + atomic_set(&svc->refcnt, 0); + + svc->af = u->af; + svc->protocol = u->protocol; + ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); + svc->port = u->port; + svc->fwmark = u->fwmark; + svc->flags = u->flags; + svc->timeout = u->timeout * HZ; + svc->netmask = u->netmask; + svc->net = net; + + INIT_LIST_HEAD(&svc->destinations); + rwlock_init(&svc->sched_lock); + spin_lock_init(&svc->stats.lock); + + /* Bind the scheduler */ + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) + goto out_err; + sched = NULL; + + /* Bind the ct retriever */ + ip_vs_bind_pe(svc, pe); + pe = NULL; + + /* Update the virtual service counters */ + if (svc->port == FTPPORT) + atomic_inc(&ipvs->ftpsvc_counter); + else if (svc->port == 0) + atomic_inc(&ipvs->nullsvc_counter); + + ip_vs_start_estimator(net, &svc->stats); + + /* Count only IPv4 services for old get/setsockopt interface */ + if (svc->af == AF_INET) + ipvs->num_services++; + + /* Hash the service into the service table */ + write_lock_bh(&__ip_vs_svc_lock); + ip_vs_svc_hash(svc); + write_unlock_bh(&__ip_vs_svc_lock); + + *svc_p = svc; + /* Now there is a service - full throttle */ + ipvs->enable = 1; + return 0; + + + 
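/* Illustrative sketch, not part of the patch above: roughly how a caller
 * might fill the ip_vs_service_user_kern argument that ip_vs_add_service()
 * consumes.  Field names follow ip_vs_copy_usvc_compat() and
 * ip_vs_genl_parse_service() in this file; the concrete values (address,
 * port, scheduler) are invented for the example, and ret/net/svc are
 * assumed to be declared as in do_ip_vs_set_ctl().
 *
 *	struct ip_vs_service_user_kern usvc;
 *
 *	memset(&usvc, 0, sizeof(usvc));
 *	usvc.af         = AF_INET;
 *	usvc.protocol   = IPPROTO_TCP;
 *	usvc.addr.ip    = htonl(0xc0a80001);	(hypothetical VIP 192.168.0.1)
 *	usvc.port       = htons(80);
 *	usvc.sched_name = "rr";		(must name a loaded ip_vs_<sched> module)
 *	usvc.timeout    = 300;		(seconds; stored internally as timeout * HZ)
 *	usvc.netmask    = htonl(0xffffffff);
 *	ret = ip_vs_add_service(net, &usvc, &svc);	(under __ip_vs_mutex)
 */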
out_err: + if (svc != NULL) { + ip_vs_unbind_scheduler(svc); + if (svc->inc) { + local_bh_disable(); + ip_vs_app_inc_put(svc->inc); + local_bh_enable(); + } + if (svc->stats.cpustats) + free_percpu(svc->stats.cpustats); + kfree(svc); + } + ip_vs_scheduler_put(sched); + ip_vs_pe_put(pe); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return ret; +} + + +/* + * Edit a service and bind it with a new scheduler + */ +static int +ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) +{ + struct ip_vs_scheduler *sched, *old_sched; + struct ip_vs_pe *pe = NULL, *old_pe = NULL; + int ret = 0; + + /* + * Lookup the scheduler, by 'u->sched_name' + */ + sched = ip_vs_scheduler_get(u->sched_name); + if (sched == NULL) { + pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); + return -ENOENT; + } + old_sched = sched; + + if (u->pe_name && *u->pe_name) { + pe = ip_vs_pe_getbyname(u->pe_name); + if (pe == NULL) { + pr_info("persistence engine module ip_vs_pe_%s " + "not found\n", u->pe_name); + ret = -ENOENT; + goto out; + } + old_pe = pe; + } + +#ifdef CONFIG_IP_VS_IPV6 + if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) { + ret = -EINVAL; + goto out; + } +#endif + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + + /* + * Set the flags and timeout value + */ + svc->flags = u->flags | IP_VS_SVC_F_HASHED; + svc->timeout = u->timeout * HZ; + svc->netmask = u->netmask; + + old_sched = svc->scheduler; + if (sched != old_sched) { + /* + * Unbind the old scheduler + */ + if ((ret = ip_vs_unbind_scheduler(svc))) { + old_sched = sched; + goto out_unlock; + } + + /* + * Bind the new scheduler + */ + if ((ret = ip_vs_bind_scheduler(svc, sched))) { + /* + * If ip_vs_bind_scheduler fails, restore the old + * scheduler. + * The main reason of failure is out of memory. + * + * The question is if the old scheduler can be + * restored all the time. TODO: if it cannot be + * restored some time, we must delete the service, + * otherwise the system may crash. + */ + ip_vs_bind_scheduler(svc, old_sched); + old_sched = sched; + goto out_unlock; + } + } + + old_pe = svc->pe; + if (pe != old_pe) { + ip_vs_unbind_pe(svc); + ip_vs_bind_pe(svc, pe); + } + + out_unlock: + write_unlock_bh(&__ip_vs_svc_lock); + out: + ip_vs_scheduler_put(old_sched); + ip_vs_pe_put(old_pe); + return ret; +} + + +/* + * Delete a service from the service list + * - The service must be unlinked, unlocked and not referenced! 
+ * - We are called under _bh lock + */ +static void __ip_vs_del_service(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest, *nxt; + struct ip_vs_scheduler *old_sched; + struct ip_vs_pe *old_pe; + struct netns_ipvs *ipvs = net_ipvs(svc->net); + + pr_info("%s: enter\n", __func__); + + /* Count only IPv4 services for old get/setsockopt interface */ + if (svc->af == AF_INET) + ipvs->num_services--; + + ip_vs_stop_estimator(svc->net, &svc->stats); + + /* Unbind scheduler */ + old_sched = svc->scheduler; + ip_vs_unbind_scheduler(svc); + ip_vs_scheduler_put(old_sched); + + /* Unbind persistence engine */ + old_pe = svc->pe; + ip_vs_unbind_pe(svc); + ip_vs_pe_put(old_pe); + + /* Unbind app inc */ + if (svc->inc) { + ip_vs_app_inc_put(svc->inc); + svc->inc = NULL; + } + + /* + * Unlink the whole destination list + */ + list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { + __ip_vs_unlink_dest(svc, dest, 0); + __ip_vs_del_dest(svc->net, dest); + } + + /* + * Update the virtual service counters + */ + if (svc->port == FTPPORT) + atomic_dec(&ipvs->ftpsvc_counter); + else if (svc->port == 0) + atomic_dec(&ipvs->nullsvc_counter); + + /* + * Free the service if nobody refers to it + */ + if (atomic_read(&svc->refcnt) == 0) { + IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n", + svc->fwmark, + IP_VS_DBG_ADDR(svc->af, &svc->addr), + ntohs(svc->port), atomic_read(&svc->usecnt)); + free_percpu(svc->stats.cpustats); + kfree(svc); + } + + /* decrease the module use count */ + ip_vs_use_count_dec(); +} + +/* + * Unlink a service from list and try to delete it if its refcnt reached 0 + */ +static void ip_vs_unlink_service(struct ip_vs_service *svc) +{ + /* + * Unhash it from the service table + */ + write_lock_bh(&__ip_vs_svc_lock); + + ip_vs_svc_unhash(svc); + + /* + * Wait until all the svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + + __ip_vs_del_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); +} + +/* + * Delete a service from the service list + */ +static int ip_vs_del_service(struct ip_vs_service *svc) +{ + if (svc == NULL) + return -EEXIST; + ip_vs_unlink_service(svc); + + return 0; +} + + +/* + * Flush all the virtual services + */ +static int ip_vs_flush(struct net *net) +{ + int idx; + struct ip_vs_service *svc, *nxt; + + /* + * Flush the service table hashed by + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], + s_list) { + if (net_eq(svc->net, net)) + ip_vs_unlink_service(svc); + } + } + + /* + * Flush the service table hashed by fwmark + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry_safe(svc, nxt, + &ip_vs_svc_fwm_table[idx], f_list) { + if (net_eq(svc->net, net)) + ip_vs_unlink_service(svc); + } + } + + return 0; +} + +/* + * Delete service by {netns} in the service table. 
+ * Called by __ip_vs_cleanup() + */ +void __ip_vs_service_cleanup(struct net *net) +{ + EnterFunction(2); + /* Check for "full" addressed entries */ + mutex_lock(&__ip_vs_mutex); + ip_vs_flush(net); + mutex_unlock(&__ip_vs_mutex); + LeaveFunction(2); +} +/* + * Release dst hold by dst_cache + */ +static inline void +__ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev) +{ + spin_lock_bh(&dest->dst_lock); + if (dest->dst_cache && dest->dst_cache->dev == dev) { + IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n", + dev->name, + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port), + atomic_read(&dest->refcnt)); + ip_vs_dst_reset(dest); + } + spin_unlock_bh(&dest->dst_lock); + +} +/* + * Netdev event receiver + * Currently only NETDEV_UNREGISTER is handled, i.e. if we hold a reference to + * a device that is "unregister" it must be released. + */ +static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + struct net *net = dev_net(dev); + struct ip_vs_service *svc; + struct ip_vs_dest *dest; + unsigned int idx; + + if (event != NETDEV_UNREGISTER) + return NOTIFY_DONE; + IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); + EnterFunction(2); + mutex_lock(&__ip_vs_mutex); + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + if (net_eq(svc->net, net)) { + list_for_each_entry(dest, &svc->destinations, + n_list) { + __ip_vs_dev_reset(dest, dev); + } + } + } + + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + if (net_eq(svc->net, net)) { + list_for_each_entry(dest, &svc->destinations, + n_list) { + __ip_vs_dev_reset(dest, dev); + } + } + + } + } + + list_for_each_entry(dest, &net_ipvs(net)->dest_trash, n_list) { + __ip_vs_dev_reset(dest, dev); + } + mutex_unlock(&__ip_vs_mutex); + LeaveFunction(2); + return NOTIFY_DONE; +} + +/* + * Zero counters in a service or all services + */ +static int ip_vs_zero_service(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + + write_lock_bh(&__ip_vs_svc_lock); + list_for_each_entry(dest, &svc->destinations, n_list) { + ip_vs_zero_stats(&dest->stats); + } + ip_vs_zero_stats(&svc->stats); + write_unlock_bh(&__ip_vs_svc_lock); + return 0; +} + +static int ip_vs_zero_all(struct net *net) +{ + int idx; + struct ip_vs_service *svc; + + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + if (net_eq(svc->net, net)) + ip_vs_zero_service(svc); + } + } + + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + if (net_eq(svc->net, net)) + ip_vs_zero_service(svc); + } + } + + ip_vs_zero_stats(&net_ipvs(net)->tot_stats); + return 0; +} + +#ifdef CONFIG_SYSCTL +static int +proc_do_defense_mode(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = current->nsproxy->net_ns; + int *valp = table->data; + int val = *valp; + int rc; + + rc = proc_dointvec(table, write, buffer, lenp, ppos); + if (write && (*valp != val)) { + if ((*valp < 0) || (*valp > 3)) { + /* Restore the correct value */ + *valp = val; + } else { + update_defense_level(net_ipvs(net)); + } + } + return rc; +} + +static int +proc_do_sync_threshold(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val[2]; + int rc; + + /* backup the value first */ + memcpy(val, valp, sizeof(val)); + + rc = 
proc_dointvec(table, write, buffer, lenp, ppos); + if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { + /* Restore the correct value */ + memcpy(valp, val, sizeof(val)); + } + return rc; +} + +static int +proc_do_sync_mode(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val = *valp; + int rc; + + rc = proc_dointvec(table, write, buffer, lenp, ppos); + if (write && (*valp != val)) { + if ((*valp < 0) || (*valp > 1)) { + /* Restore the correct value */ + *valp = val; + } else { + struct net *net = current->nsproxy->net_ns; + ip_vs_sync_switch_mode(net, val); + } + } + return rc; +} + +/* + * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) + * Do not change order or insert new entries without + * align with netns init in __ip_vs_control_init() + */ + +static struct ctl_table vs_vars[] = { + { + .procname = "amemthresh", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "am_droprate", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "drop_entry", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_defense_mode, + }, + { + .procname = "drop_packet", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_defense_mode, + }, +#ifdef CONFIG_IP_VS_NFCT + { + .procname = "conntrack", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .procname = "secure_tcp", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_defense_mode, + }, + { + .procname = "snat_reroute", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "sync_version", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_sync_mode, + }, + { + .procname = "cache_bypass", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "expire_nodest_conn", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "expire_quiescent_template", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sync_threshold", + .maxlen = + sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold), + .mode = 0644, + .proc_handler = proc_do_sync_threshold, + }, + { + .procname = "nat_icmp_send", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_IP_VS_DEBUG + { + .procname = "debug_level", + .data = &sysctl_ip_vs_debug_level, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if 0 + { + .procname = "timeout_established", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_synsent", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_synrecv", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_finwait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_timewait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT], + .maxlen = sizeof(int), + .mode = 0644, + 
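/* Illustrative note, not part of the patch: the vs_vars table above is meant
 * to appear as /proc/sys/net/ipv4/vs/<procname> (cf. the comment above the
 * table and net_vs_ctl_path below).  A minimal userspace sketch for the
 * two-integer sync_threshold entry handled by proc_do_sync_threshold(),
 * which only accepts writes with 0 <= valp[0] < valp[1]:
 *
 *	int fd = open("/proc/sys/net/ipv4/vs/sync_threshold", O_WRONLY);
 *	if (fd >= 0) {
 *		write(fd, "3 50", 4);	(sync after 3 packets, period 50)
 *		close(fd);
 *	}
 *
 * The example values are plausible defaults only, not mandated by this code.
 */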
.proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_close", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_closewait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_lastack", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_listen", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_synack", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_udp", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_icmp", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, +#endif + { } +}; + +const struct ctl_path net_vs_ctl_path[] = { + { .procname = "net", }, + { .procname = "ipv4", }, + { .procname = "vs", }, + { } +}; +EXPORT_SYMBOL_GPL(net_vs_ctl_path); +#endif + +#ifdef CONFIG_PROC_FS + +struct ip_vs_iter { + struct seq_net_private p; /* Do not move this, netns depends upon it*/ + struct list_head *table; + int bucket; +}; + +/* + * Write the contents of the VS rule table to a PROCfs file. + * (It is kept just for backward compatibility) + */ +static inline const char *ip_vs_fwd_name(unsigned flags) +{ + switch (flags & IP_VS_CONN_F_FWD_MASK) { + case IP_VS_CONN_F_LOCALNODE: + return "Local"; + case IP_VS_CONN_F_TUNNEL: + return "Tunnel"; + case IP_VS_CONN_F_DROUTE: + return "Route"; + default: + return "Masq"; + } +} + + +/* Get the Nth entry in the two lists */ +static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) +{ + struct net *net = seq_file_net(seq); + struct ip_vs_iter *iter = seq->private; + int idx; + struct ip_vs_service *svc; + + /* look in hash by protocol */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + if (net_eq(svc->net, net) && pos-- == 0) { + iter->table = ip_vs_svc_table; + iter->bucket = idx; + return svc; + } + } + } + + /* keep looking in fwmark */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + if (net_eq(svc->net, net) && pos-- == 0) { + iter->table = ip_vs_svc_fwm_table; + iter->bucket = idx; + return svc; + } + } + } + + return NULL; +} + +static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) +__acquires(__ip_vs_svc_lock) +{ + + read_lock_bh(&__ip_vs_svc_lock); + return *pos ? 
ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; +} + + +static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct list_head *e; + struct ip_vs_iter *iter; + struct ip_vs_service *svc; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_info_array(seq,0); + + svc = v; + iter = seq->private; + + if (iter->table == ip_vs_svc_table) { + /* next service in table hashed by protocol */ + if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket]) + return list_entry(e, struct ip_vs_service, s_list); + + + while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { + list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket], + s_list) { + return svc; + } + } + + iter->table = ip_vs_svc_fwm_table; + iter->bucket = -1; + goto scan_fwmark; + } + + /* next service in hashed by fwmark */ + if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket]) + return list_entry(e, struct ip_vs_service, f_list); + + scan_fwmark: + while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket], + f_list) + return svc; + } + + return NULL; +} + +static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) +__releases(__ip_vs_svc_lock) +{ + read_unlock_bh(&__ip_vs_svc_lock); +} + + +static int ip_vs_info_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) { + seq_printf(seq, + "IP Virtual Server version %d.%d.%d (size=%d)\n", + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); + seq_puts(seq, + "Prot LocalAddress:Port Scheduler Flags\n"); + seq_puts(seq, + " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); + } else { + const struct ip_vs_service *svc = v; + const struct ip_vs_iter *iter = seq->private; + const struct ip_vs_dest *dest; + + if (iter->table == ip_vs_svc_table) { +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + seq_printf(seq, "%s [%pI6]:%04X %s ", + ip_vs_proto_name(svc->protocol), + &svc->addr.in6, + ntohs(svc->port), + svc->scheduler->name); + else +#endif + seq_printf(seq, "%s %08X:%04X %s %s ", + ip_vs_proto_name(svc->protocol), + ntohl(svc->addr.ip), + ntohs(svc->port), + svc->scheduler->name, + (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); + } else { + seq_printf(seq, "FWM %08X %s %s", + svc->fwmark, svc->scheduler->name, + (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); + } + + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + seq_printf(seq, "persistent %d %08X\n", + svc->timeout, + ntohl(svc->netmask)); + else + seq_putc(seq, '\n'); + + list_for_each_entry(dest, &svc->destinations, n_list) { +#ifdef CONFIG_IP_VS_IPV6 + if (dest->af == AF_INET6) + seq_printf(seq, + " -> [%pI6]:%04X" + " %-7s %-6d %-10d %-10d\n", + &dest->addr.in6, + ntohs(dest->port), + ip_vs_fwd_name(atomic_read(&dest->conn_flags)), + atomic_read(&dest->weight), + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + else +#endif + seq_printf(seq, + " -> %08X:%04X " + "%-7s %-6d %-10d %-10d\n", + ntohl(dest->addr.ip), + ntohs(dest->port), + ip_vs_fwd_name(atomic_read(&dest->conn_flags)), + atomic_read(&dest->weight), + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + + } + } + return 0; +} + +static const struct seq_operations ip_vs_info_seq_ops = { + .start = ip_vs_info_seq_start, + .next = ip_vs_info_seq_next, + .stop = ip_vs_info_seq_stop, + .show = ip_vs_info_seq_show, +}; + +static int ip_vs_info_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ip_vs_info_seq_ops, + sizeof(struct ip_vs_iter)); +} + +static const struct file_operations 
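/* Illustrative note, not part of the patch: ip_vs_info_seq_show() above
 * renders the legacy /proc/net/ip_vs listing.  With the default connection
 * table size the output looks roughly like this (addresses and ports are
 * hexadecimal, network order; the service and destination shown are
 * invented):
 *
 *	IP Virtual Server version 1.2.1 (size=4096)
 *	Prot LocalAddress:Port Scheduler Flags
 *	  -> RemoteAddress:Port Forward Weight ActiveConn InActConn
 *	TCP C0A80001:0050 rr
 *	  -> C0A80102:0050      Masq    1      0          0
 */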
ip_vs_info_fops = { + .owner = THIS_MODULE, + .open = ip_vs_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int ip_vs_stats_show(struct seq_file *seq, void *v) +{ + struct net *net = seq_file_single_net(seq); + struct ip_vs_stats_user show; + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Total Incoming Outgoing Incoming Outgoing\n"); + seq_printf(seq, + " Conns Packets Packets Bytes Bytes\n"); + + ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats); + seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", show.conns, + show.inpkts, show.outpkts, + (unsigned long long) show.inbytes, + (unsigned long long) show.outbytes); + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); + seq_printf(seq, "%8X %8X %8X %16X %16X\n", + show.cps, show.inpps, show.outpps, + show.inbps, show.outbps); + + return 0; +} + +static int ip_vs_stats_seq_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, ip_vs_stats_show); +} + +static const struct file_operations ip_vs_stats_fops = { + .owner = THIS_MODULE, + .open = ip_vs_stats_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; + +static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) +{ + struct net *net = seq_file_single_net(seq); + struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats; + struct ip_vs_cpu_stats *cpustats = tot_stats->cpustats; + struct ip_vs_stats_user rates; + int i; + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Total Incoming Outgoing Incoming Outgoing\n"); + seq_printf(seq, + "CPU Conns Packets Packets Bytes Bytes\n"); + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i); + unsigned int start; + __u64 inbytes, outbytes; + + do { + start = u64_stats_fetch_begin_bh(&u->syncp); + inbytes = u->ustats.inbytes; + outbytes = u->ustats.outbytes; + } while (u64_stats_fetch_retry_bh(&u->syncp, start)); + + seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n", + i, u->ustats.conns, u->ustats.inpkts, + u->ustats.outpkts, (__u64)inbytes, + (__u64)outbytes); + } + + spin_lock_bh(&tot_stats->lock); + + seq_printf(seq, " ~ %8X %8X %8X %16LX %16LX\n\n", + tot_stats->ustats.conns, tot_stats->ustats.inpkts, + tot_stats->ustats.outpkts, + (unsigned long long) tot_stats->ustats.inbytes, + (unsigned long long) tot_stats->ustats.outbytes); + + ip_vs_read_estimator(&rates, tot_stats); + + spin_unlock_bh(&tot_stats->lock); + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); + seq_printf(seq, " %8X %8X %8X %16X %16X\n", + rates.cps, + rates.inpps, + rates.outpps, + rates.inbps, + rates.outbps); + + return 0; +} + +static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, ip_vs_stats_percpu_show); +} + +static const struct file_operations ip_vs_stats_percpu_fops = { + .owner = THIS_MODULE, + .open = ip_vs_stats_percpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; +#endif + +/* + * Set timeout values for tcp tcpfin udp in the timeout_table. 
+ */ +static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u) +{ +#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) + struct ip_vs_proto_data *pd; +#endif + + IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", + u->tcp_timeout, + u->tcp_fin_timeout, + u->udp_timeout); + +#ifdef CONFIG_IP_VS_PROTO_TCP + if (u->tcp_timeout) { + pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] + = u->tcp_timeout * HZ; + } + + if (u->tcp_fin_timeout) { + pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] + = u->tcp_fin_timeout * HZ; + } +#endif + +#ifdef CONFIG_IP_VS_PROTO_UDP + if (u->udp_timeout) { + pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + pd->timeout_table[IP_VS_UDP_S_NORMAL] + = u->udp_timeout * HZ; + } +#endif + return 0; +} + + +#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) +#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user)) +#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \ + sizeof(struct ip_vs_dest_user)) +#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) +#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user)) +#define MAX_ARG_LEN SVCDEST_ARG_LEN + +static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { + [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0, + [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN, +}; + +static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, + struct ip_vs_service_user *usvc_compat) +{ + memset(usvc, 0, sizeof(*usvc)); + + usvc->af = AF_INET; + usvc->protocol = usvc_compat->protocol; + usvc->addr.ip = usvc_compat->addr; + usvc->port = usvc_compat->port; + usvc->fwmark = usvc_compat->fwmark; + + /* Deep copy of sched_name is not needed here */ + usvc->sched_name = usvc_compat->sched_name; + + usvc->flags = usvc_compat->flags; + usvc->timeout = usvc_compat->timeout; + usvc->netmask = usvc_compat->netmask; +} + +static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, + struct ip_vs_dest_user *udest_compat) +{ + memset(udest, 0, sizeof(*udest)); + + udest->addr.ip = udest_compat->addr; + udest->port = udest_compat->port; + udest->conn_flags = udest_compat->conn_flags; + udest->weight = udest_compat->weight; + udest->u_threshold = udest_compat->u_threshold; + udest->l_threshold = udest_compat->l_threshold; +} + +static int +do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + struct net *net = sock_net(sk); + int ret; + unsigned char arg[MAX_ARG_LEN]; + struct ip_vs_service_user *usvc_compat; + struct ip_vs_service_user_kern usvc; + struct ip_vs_service *svc; + struct ip_vs_dest_user *udest_compat; + struct ip_vs_dest_user_kern udest; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX) + return -EINVAL; + if (len < 0 || len > MAX_ARG_LEN) + return -EINVAL; + if (len != set_arglen[SET_CMDID(cmd)]) { + pr_err("set_ctl: len %u != %u\n", + len, set_arglen[SET_CMDID(cmd)]); + return -EINVAL; + } + + if 
(copy_from_user(arg, user, len) != 0) + return -EFAULT; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + if (mutex_lock_interruptible(&__ip_vs_mutex)) { + ret = -ERESTARTSYS; + goto out_dec; + } + + if (cmd == IP_VS_SO_SET_FLUSH) { + /* Flush the virtual service */ + ret = ip_vs_flush(net); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_TIMEOUT) { + /* Set timeout values for (tcp tcpfin udp) */ + ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_STARTDAEMON) { + struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; + ret = start_sync_thread(net, dm->state, dm->mcast_ifn, + dm->syncid); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_STOPDAEMON) { + struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; + ret = stop_sync_thread(net, dm->state); + goto out_unlock; + } + + usvc_compat = (struct ip_vs_service_user *)arg; + udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1); + + /* We only use the new structs internally, so copy userspace compat + * structs to extended internal versions */ + ip_vs_copy_usvc_compat(&usvc, usvc_compat); + ip_vs_copy_udest_compat(&udest, udest_compat); + + if (cmd == IP_VS_SO_SET_ZERO) { + /* if no service address is set, zero counters in all */ + if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { + ret = ip_vs_zero_all(net); + goto out_unlock; + } + } + + /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */ + if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP && + usvc.protocol != IPPROTO_SCTP) { + pr_err("set_ctl: invalid protocol: %d %pI4:%d %s\n", + usvc.protocol, &usvc.addr.ip, + ntohs(usvc.port), usvc.sched_name); + ret = -EFAULT; + goto out_unlock; + } + + /* Lookup the exact service by or fwmark */ + if (usvc.fwmark == 0) + svc = __ip_vs_service_find(net, usvc.af, usvc.protocol, + &usvc.addr, usvc.port); + else + svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark); + + if (cmd != IP_VS_SO_SET_ADD + && (svc == NULL || svc->protocol != usvc.protocol)) { + ret = -ESRCH; + goto out_unlock; + } + + switch (cmd) { + case IP_VS_SO_SET_ADD: + if (svc != NULL) + ret = -EEXIST; + else + ret = ip_vs_add_service(net, &usvc, &svc); + break; + case IP_VS_SO_SET_EDIT: + ret = ip_vs_edit_service(svc, &usvc); + break; + case IP_VS_SO_SET_DEL: + ret = ip_vs_del_service(svc); + if (!ret) + goto out_unlock; + break; + case IP_VS_SO_SET_ZERO: + ret = ip_vs_zero_service(svc); + break; + case IP_VS_SO_SET_ADDDEST: + ret = ip_vs_add_dest(svc, &udest); + break; + case IP_VS_SO_SET_EDITDEST: + ret = ip_vs_edit_dest(svc, &udest); + break; + case IP_VS_SO_SET_DELDEST: + ret = ip_vs_del_dest(svc, &udest); + break; + default: + ret = -EINVAL; + } + + out_unlock: + mutex_unlock(&__ip_vs_mutex); + out_dec: + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return ret; +} + + +static void +ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) +{ + dst->protocol = src->protocol; + dst->addr = src->addr.ip; + dst->port = src->port; + dst->fwmark = src->fwmark; + strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name)); + dst->flags = src->flags; + dst->timeout = src->timeout / HZ; + dst->netmask = src->netmask; + dst->num_dests = src->num_dests; + ip_vs_copy_stats(&dst->stats, &src->stats); +} + +static inline int +__ip_vs_get_service_entries(struct net *net, + const struct ip_vs_get_services *get, + struct ip_vs_get_services __user *uptr) +{ + int idx, count=0; + struct 
ip_vs_service *svc; + struct ip_vs_service_entry entry; + int ret = 0; + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + /* Only expose IPv4 entries to old interface */ + if (svc->af != AF_INET || !net_eq(svc->net, net)) + continue; + + if (count >= get->num_services) + goto out; + memset(&entry, 0, sizeof(entry)); + ip_vs_copy_service(&entry, svc); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + goto out; + } + count++; + } + } + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + /* Only expose IPv4 entries to old interface */ + if (svc->af != AF_INET || !net_eq(svc->net, net)) + continue; + + if (count >= get->num_services) + goto out; + memset(&entry, 0, sizeof(entry)); + ip_vs_copy_service(&entry, svc); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + goto out; + } + count++; + } + } + out: + return ret; +} + +static inline int +__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, + struct ip_vs_get_dests __user *uptr) +{ + struct ip_vs_service *svc; + union nf_inet_addr addr = { .ip = get->addr }; + int ret = 0; + + if (get->fwmark) + svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark); + else + svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr, + get->port); + + if (svc) { + int count = 0; + struct ip_vs_dest *dest; + struct ip_vs_dest_entry entry; + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (count >= get->num_dests) + break; + + entry.addr = dest->addr.ip; + entry.port = dest->port; + entry.conn_flags = atomic_read(&dest->conn_flags); + entry.weight = atomic_read(&dest->weight); + entry.u_threshold = dest->u_threshold; + entry.l_threshold = dest->l_threshold; + entry.activeconns = atomic_read(&dest->activeconns); + entry.inactconns = atomic_read(&dest->inactconns); + entry.persistconns = atomic_read(&dest->persistconns); + ip_vs_copy_stats(&entry.stats, &dest->stats); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + break; + } + count++; + } + } else + ret = -ESRCH; + return ret; +} + +static inline void +__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u) +{ +#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) + struct ip_vs_proto_data *pd; +#endif + +#ifdef CONFIG_IP_VS_PROTO_TCP + pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; + u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + u->udp_timeout = + pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ; +#endif +} + + +#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) +#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo)) +#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services)) +#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry)) +#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests)) +#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) +#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2) + +static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { + [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64, + [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN, + 
[GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN, +}; + +static int +do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + unsigned char arg[128]; + int ret = 0; + unsigned int copylen; + struct net *net = sock_net(sk); + struct netns_ipvs *ipvs = net_ipvs(net); + + BUG_ON(!net); + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX) + return -EINVAL; + + if (*len < get_arglen[GET_CMDID(cmd)]) { + pr_err("get_ctl: len %u < %u\n", + *len, get_arglen[GET_CMDID(cmd)]); + return -EINVAL; + } + + copylen = get_arglen[GET_CMDID(cmd)]; + if (copylen > 128) + return -EINVAL; + + if (copy_from_user(arg, user, copylen) != 0) + return -EFAULT; + + if (mutex_lock_interruptible(&__ip_vs_mutex)) + return -ERESTARTSYS; + + switch (cmd) { + case IP_VS_SO_GET_VERSION: + { + char buf[64]; + + sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); + if (copy_to_user(user, buf, strlen(buf)+1) != 0) { + ret = -EFAULT; + goto out; + } + *len = strlen(buf)+1; + } + break; + + case IP_VS_SO_GET_INFO: + { + struct ip_vs_getinfo info; + info.version = IP_VS_VERSION_CODE; + info.size = ip_vs_conn_tab_size; + info.num_services = ipvs->num_services; + if (copy_to_user(user, &info, sizeof(info)) != 0) + ret = -EFAULT; + } + break; + + case IP_VS_SO_GET_SERVICES: + { + struct ip_vs_get_services *get; + int size; + + get = (struct ip_vs_get_services *)arg; + size = sizeof(*get) + + sizeof(struct ip_vs_service_entry) * get->num_services; + if (*len != size) { + pr_err("length: %u != %u\n", *len, size); + ret = -EINVAL; + goto out; + } + ret = __ip_vs_get_service_entries(net, get, user); + } + break; + + case IP_VS_SO_GET_SERVICE: + { + struct ip_vs_service_entry *entry; + struct ip_vs_service *svc; + union nf_inet_addr addr; + + entry = (struct ip_vs_service_entry *)arg; + addr.ip = entry->addr; + if (entry->fwmark) + svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark); + else + svc = __ip_vs_service_find(net, AF_INET, + entry->protocol, &addr, + entry->port); + if (svc) { + ip_vs_copy_service(entry, svc); + if (copy_to_user(user, entry, sizeof(*entry)) != 0) + ret = -EFAULT; + } else + ret = -ESRCH; + } + break; + + case IP_VS_SO_GET_DESTS: + { + struct ip_vs_get_dests *get; + int size; + + get = (struct ip_vs_get_dests *)arg; + size = sizeof(*get) + + sizeof(struct ip_vs_dest_entry) * get->num_dests; + if (*len != size) { + pr_err("length: %u != %u\n", *len, size); + ret = -EINVAL; + goto out; + } + ret = __ip_vs_get_dest_entries(net, get, user); + } + break; + + case IP_VS_SO_GET_TIMEOUT: + { + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(net, &t); + if (copy_to_user(user, &t, sizeof(t)) != 0) + ret = -EFAULT; + } + break; + + case IP_VS_SO_GET_DAEMON: + { + struct ip_vs_daemon_user d[2]; + + memset(&d, 0, sizeof(d)); + if (ipvs->sync_state & IP_VS_STATE_MASTER) { + d[0].state = IP_VS_STATE_MASTER; + strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn, + sizeof(d[0].mcast_ifn)); + d[0].syncid = ipvs->master_syncid; + } + if (ipvs->sync_state & IP_VS_STATE_BACKUP) { + d[1].state = IP_VS_STATE_BACKUP; + strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn, + sizeof(d[1].mcast_ifn)); + d[1].syncid = ipvs->backup_syncid; + } + if (copy_to_user(user, &d, sizeof(d)) != 0) + ret = -EFAULT; + } + break; + + default: + ret = -EINVAL; + } + + out: + mutex_unlock(&__ip_vs_mutex); + 
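/* Illustrative sketch, not part of the patch: the legacy get/set interface
 * above is driven from userspace through [gs]etsockopt() on a raw IPv4
 * socket, which is roughly how the old ipvsadm binaries talk to IPVS.
 * A minimal example for IP_VS_SO_GET_INFO (CAP_NET_ADMIN is required, as
 * checked in do_ip_vs_get_ctl()):
 *
 *	struct ip_vs_getinfo info;
 *	socklen_t len = sizeof(info);
 *	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
 *
 *	if (fd >= 0 &&
 *	    getsockopt(fd, IPPROTO_IP, IP_VS_SO_GET_INFO, &info, &len) == 0)
 *		printf("IPVS %x, conn table size %u, %u services\n",
 *		       info.version, info.size, info.num_services);
 *
 * The socket family and level follow the nf_sockopt_ops registration below
 * (.pf = PF_INET); the exact ipvsadm invocation is an assumption here.
 */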
return ret; +} + + +static struct nf_sockopt_ops ip_vs_sockopts = { + .pf = PF_INET, + .set_optmin = IP_VS_BASE_CTL, + .set_optmax = IP_VS_SO_SET_MAX+1, + .set = do_ip_vs_set_ctl, + .get_optmin = IP_VS_BASE_CTL, + .get_optmax = IP_VS_SO_GET_MAX+1, + .get = do_ip_vs_get_ctl, + .owner = THIS_MODULE, +}; + +/* + * Generic Netlink interface + */ + +/* IPVS genetlink family */ +static struct genl_family ip_vs_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = IPVS_GENL_NAME, + .version = IPVS_GENL_VERSION, + .maxattr = IPVS_CMD_MAX, + .netnsok = true, /* Make ipvsadm to work on netns */ +}; + +/* Policy used for first-level command attributes */ +static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { + [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 }, + [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 }, + [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */ +static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { + [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 }, + [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, + .len = IP_VS_IFNAME_MAXLEN }, + [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ +static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { + [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY, + .len = sizeof(union nf_inet_addr) }, + [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, + .len = IP_VS_SCHEDNAME_MAXLEN }, + [IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING, + .len = IP_VS_PENAME_MAXLEN }, + [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, + .len = sizeof(struct ip_vs_flags) }, + [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */ +static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { + [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY, + .len = sizeof(union nf_inet_addr) }, + [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 }, + [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, +}; + +static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, + struct ip_vs_stats *stats) +{ + struct ip_vs_stats_user ustats; + struct nlattr *nl_stats = nla_nest_start(skb, container_type); + if (!nl_stats) + return -EMSGSIZE; + + ip_vs_copy_stats(&ustats, stats); + + NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, 
ustats.outbytes); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, ustats.cps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps); + + nla_nest_end(skb, nl_stats); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_stats); + return -EMSGSIZE; +} + +static int ip_vs_genl_fill_service(struct sk_buff *skb, + struct ip_vs_service *svc) +{ + struct nlattr *nl_service; + struct ip_vs_flags flags = { .flags = svc->flags, + .mask = ~0 }; + + nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE); + if (!nl_service) + return -EMSGSIZE; + + NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af); + + if (svc->fwmark) { + NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark); + } else { + NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol); + NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr); + NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port); + } + + NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name); + if (svc->pe) + NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name); + NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags); + NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ); + NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask); + + if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats)) + goto nla_put_failure; + + nla_nest_end(skb, nl_service); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_service); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_service(struct sk_buff *skb, + struct ip_vs_service *svc, + struct netlink_callback *cb) +{ + void *hdr; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_SERVICE); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_service(skb, svc) < 0) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_services(struct sk_buff *skb, + struct netlink_callback *cb) +{ + int idx = 0, i; + int start = cb->args[0]; + struct ip_vs_service *svc; + struct net *net = skb_sknet(skb); + + mutex_lock(&__ip_vs_mutex); + for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { + list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { + if (++idx <= start || !net_eq(svc->net, net)) + continue; + if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + } + + for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { + if (++idx <= start || !net_eq(svc->net, net)) + continue; + if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + } + +nla_put_failure: + mutex_unlock(&__ip_vs_mutex); + cb->args[0] = idx; + + return skb->len; +} + +static int ip_vs_genl_parse_service(struct net *net, + struct ip_vs_service_user_kern *usvc, + struct nlattr *nla, int full_entry, + struct ip_vs_service **ret_svc) +{ + struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; + struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr; + struct ip_vs_service *svc; + + /* Parse mandatory identifying service fields first */ + if (nla == NULL || + nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy)) + return -EINVAL; + + nla_af = attrs[IPVS_SVC_ATTR_AF]; + nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL]; + nla_addr = attrs[IPVS_SVC_ATTR_ADDR]; + nla_port = 
attrs[IPVS_SVC_ATTR_PORT]; + nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK]; + + if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr)))) + return -EINVAL; + + memset(usvc, 0, sizeof(*usvc)); + + usvc->af = nla_get_u16(nla_af); +#ifdef CONFIG_IP_VS_IPV6 + if (usvc->af != AF_INET && usvc->af != AF_INET6) +#else + if (usvc->af != AF_INET) +#endif + return -EAFNOSUPPORT; + + if (nla_fwmark) { + usvc->protocol = IPPROTO_TCP; + usvc->fwmark = nla_get_u32(nla_fwmark); + } else { + usvc->protocol = nla_get_u16(nla_protocol); + nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); + usvc->port = nla_get_u16(nla_port); + usvc->fwmark = 0; + } + + if (usvc->fwmark) + svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark); + else + svc = __ip_vs_service_find(net, usvc->af, usvc->protocol, + &usvc->addr, usvc->port); + *ret_svc = svc; + + /* If a full entry was requested, check for the additional fields */ + if (full_entry) { + struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout, + *nla_netmask; + struct ip_vs_flags flags; + + nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; + nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME]; + nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; + nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; + nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; + + if (!(nla_sched && nla_flags && nla_timeout && nla_netmask)) + return -EINVAL; + + nla_memcpy(&flags, nla_flags, sizeof(flags)); + + /* prefill flags from service if it already exists */ + if (svc) + usvc->flags = svc->flags; + + /* set new flags from userland */ + usvc->flags = (usvc->flags & ~flags.mask) | + (flags.flags & flags.mask); + usvc->sched_name = nla_data(nla_sched); + usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL; + usvc->timeout = nla_get_u32(nla_timeout); + usvc->netmask = nla_get_u32(nla_netmask); + } + + return 0; +} + +static struct ip_vs_service *ip_vs_genl_find_service(struct net *net, + struct nlattr *nla) +{ + struct ip_vs_service_user_kern usvc; + struct ip_vs_service *svc; + int ret; + + ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc); + return ret ? 
ERR_PTR(ret) : svc; +} + +static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) +{ + struct nlattr *nl_dest; + + nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST); + if (!nl_dest) + return -EMSGSIZE; + + NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr); + NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port); + + NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD, + atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, + atomic_read(&dest->activeconns)); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS, + atomic_read(&dest->inactconns)); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, + atomic_read(&dest->persistconns)); + + if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) + goto nla_put_failure; + + nla_nest_end(skb, nl_dest); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_dest); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, + struct netlink_callback *cb) +{ + void *hdr; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_DEST); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_dest(skb, dest) < 0) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_dests(struct sk_buff *skb, + struct netlink_callback *cb) +{ + int idx = 0; + int start = cb->args[0]; + struct ip_vs_service *svc; + struct ip_vs_dest *dest; + struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; + struct net *net = skb_sknet(skb); + + mutex_lock(&__ip_vs_mutex); + + /* Try to find the service for which to dump destinations */ + if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, + IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy)) + goto out_err; + + + svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]); + if (IS_ERR(svc) || svc == NULL) + goto out_err; + + /* Dump the destinations */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if (++idx <= start) + continue; + if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + +nla_put_failure: + cb->args[0] = idx; + +out_err: + mutex_unlock(&__ip_vs_mutex); + + return skb->len; +} + +static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, + struct nlattr *nla, int full_entry) +{ + struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; + struct nlattr *nla_addr, *nla_port; + + /* Parse mandatory identifying destination fields first */ + if (nla == NULL || + nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy)) + return -EINVAL; + + nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; + nla_port = attrs[IPVS_DEST_ATTR_PORT]; + + if (!(nla_addr && nla_port)) + return -EINVAL; + + memset(udest, 0, sizeof(*udest)); + + nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); + udest->port = nla_get_u16(nla_port); + + /* If a full entry was requested, check for the additional fields */ + if (full_entry) { + struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, + *nla_l_thresh; + + nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; + nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; + nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH]; + nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; + + if (!(nla_fwd && nla_weight && nla_u_thresh 
&& nla_l_thresh)) + return -EINVAL; + + udest->conn_flags = nla_get_u32(nla_fwd) + & IP_VS_CONN_F_FWD_MASK; + udest->weight = nla_get_u32(nla_weight); + udest->u_threshold = nla_get_u32(nla_u_thresh); + udest->l_threshold = nla_get_u32(nla_l_thresh); + } + + return 0; +} + +static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state, + const char *mcast_ifn, __be32 syncid) +{ + struct nlattr *nl_daemon; + + nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON); + if (!nl_daemon) + return -EMSGSIZE; + + NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state); + NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn); + NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid); + + nla_nest_end(skb, nl_daemon); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_daemon); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state, + const char *mcast_ifn, __be32 syncid, + struct netlink_callback *cb) +{ + void *hdr; + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_DAEMON); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid)) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_daemons(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct net *net = skb_sknet(skb); + struct netns_ipvs *ipvs = net_ipvs(net); + + mutex_lock(&__ip_vs_mutex); + if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { + if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, + ipvs->master_mcast_ifn, + ipvs->master_syncid, cb) < 0) + goto nla_put_failure; + + cb->args[0] = 1; + } + + if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { + if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, + ipvs->backup_mcast_ifn, + ipvs->backup_syncid, cb) < 0) + goto nla_put_failure; + + cb->args[1] = 1; + } + +nla_put_failure: + mutex_unlock(&__ip_vs_mutex); + + return skb->len; +} + +static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs) +{ + if (!(attrs[IPVS_DAEMON_ATTR_STATE] && + attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && + attrs[IPVS_DAEMON_ATTR_SYNC_ID])) + return -EINVAL; + + return start_sync_thread(net, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), + nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), + nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); +} + +static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs) +{ + if (!attrs[IPVS_DAEMON_ATTR_STATE]) + return -EINVAL; + + return stop_sync_thread(net, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); +} + +static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs) +{ + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(net, &t); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) + t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]) + t.tcp_fin_timeout = + nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) + t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); + + return ip_vs_set_timeout(net, &t); +} + +static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) +{ + struct ip_vs_service *svc = NULL; + struct ip_vs_service_user_kern usvc; + struct ip_vs_dest_user_kern udest; + int ret = 0, cmd; + int need_full_svc = 0, need_full_dest = 0; + struct net *net; + struct netns_ipvs *ipvs; + + net = skb_sknet(skb); + ipvs = net_ipvs(net); + cmd = 
info->genlhdr->cmd; + + mutex_lock(&__ip_vs_mutex); + + if (cmd == IPVS_CMD_FLUSH) { + ret = ip_vs_flush(net); + goto out; + } else if (cmd == IPVS_CMD_SET_CONFIG) { + ret = ip_vs_genl_set_config(net, info->attrs); + goto out; + } else if (cmd == IPVS_CMD_NEW_DAEMON || + cmd == IPVS_CMD_DEL_DAEMON) { + + struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; + + if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || + nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX, + info->attrs[IPVS_CMD_ATTR_DAEMON], + ip_vs_daemon_policy)) { + ret = -EINVAL; + goto out; + } + + if (cmd == IPVS_CMD_NEW_DAEMON) + ret = ip_vs_genl_new_daemon(net, daemon_attrs); + else + ret = ip_vs_genl_del_daemon(net, daemon_attrs); + goto out; + } else if (cmd == IPVS_CMD_ZERO && + !info->attrs[IPVS_CMD_ATTR_SERVICE]) { + ret = ip_vs_zero_all(net); + goto out; + } + + /* All following commands require a service argument, so check if we + * received a valid one. We need a full service specification when + * adding / editing a service. Only identifying members otherwise. */ + if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) + need_full_svc = 1; + + ret = ip_vs_genl_parse_service(net, &usvc, + info->attrs[IPVS_CMD_ATTR_SERVICE], + need_full_svc, &svc); + if (ret) + goto out; + + /* Unless we're adding a new service, the service must already exist */ + if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { + ret = -ESRCH; + goto out; + } + + /* Destination commands require a valid destination argument. For + * adding / editing a destination, we need a full destination + * specification. */ + if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST || + cmd == IPVS_CMD_DEL_DEST) { + if (cmd != IPVS_CMD_DEL_DEST) + need_full_dest = 1; + + ret = ip_vs_genl_parse_dest(&udest, + info->attrs[IPVS_CMD_ATTR_DEST], + need_full_dest); + if (ret) + goto out; + } + + switch (cmd) { + case IPVS_CMD_NEW_SERVICE: + if (svc == NULL) + ret = ip_vs_add_service(net, &usvc, &svc); + else + ret = -EEXIST; + break; + case IPVS_CMD_SET_SERVICE: + ret = ip_vs_edit_service(svc, &usvc); + break; + case IPVS_CMD_DEL_SERVICE: + ret = ip_vs_del_service(svc); + /* do not use svc, it can be freed */ + break; + case IPVS_CMD_NEW_DEST: + ret = ip_vs_add_dest(svc, &udest); + break; + case IPVS_CMD_SET_DEST: + ret = ip_vs_edit_dest(svc, &udest); + break; + case IPVS_CMD_DEL_DEST: + ret = ip_vs_del_dest(svc, &udest); + break; + case IPVS_CMD_ZERO: + ret = ip_vs_zero_service(svc); + break; + default: + ret = -EINVAL; + } + +out: + mutex_unlock(&__ip_vs_mutex); + + return ret; +} + +static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + void *reply; + int ret, cmd, reply_cmd; + struct net *net; + struct netns_ipvs *ipvs; + + net = skb_sknet(skb); + ipvs = net_ipvs(net); + cmd = info->genlhdr->cmd; + + if (cmd == IPVS_CMD_GET_SERVICE) + reply_cmd = IPVS_CMD_NEW_SERVICE; + else if (cmd == IPVS_CMD_GET_INFO) + reply_cmd = IPVS_CMD_SET_INFO; + else if (cmd == IPVS_CMD_GET_CONFIG) + reply_cmd = IPVS_CMD_SET_CONFIG; + else { + pr_err("unknown Generic Netlink command\n"); + return -EINVAL; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + mutex_lock(&__ip_vs_mutex); + + reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd); + if (reply == NULL) + goto nla_put_failure; + + switch (cmd) { + case IPVS_CMD_GET_SERVICE: + { + struct ip_vs_service *svc; + + svc = ip_vs_genl_find_service(net, + info->attrs[IPVS_CMD_ATTR_SERVICE]); + if (IS_ERR(svc)) { + ret = 
PTR_ERR(svc); + goto out_err; + } else if (svc) { + ret = ip_vs_genl_fill_service(msg, svc); + if (ret) + goto nla_put_failure; + } else { + ret = -ESRCH; + goto out_err; + } + + break; + } + + case IPVS_CMD_GET_CONFIG: + { + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(net, &t); +#ifdef CONFIG_IP_VS_PROTO_TCP + NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout); + NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, + t.tcp_fin_timeout); +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout); +#endif + + break; + } + + case IPVS_CMD_GET_INFO: + NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE); + NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, + ip_vs_conn_tab_size); + break; + } + + genlmsg_end(msg, reply); + ret = genlmsg_reply(msg, info); + goto out; + +nla_put_failure: + pr_err("not enough space in Netlink message\n"); + ret = -EMSGSIZE; + +out_err: + nlmsg_free(msg); +out: + mutex_unlock(&__ip_vs_mutex); + + return ret; +} + + +static struct genl_ops ip_vs_genl_ops[] __read_mostly = { + { + .cmd = IPVS_CMD_NEW_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_SET_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_DEL_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_SERVICE, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + .dumpit = ip_vs_genl_dump_services, + .policy = ip_vs_cmd_policy, + }, + { + .cmd = IPVS_CMD_NEW_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_SET_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_DEL_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .dumpit = ip_vs_genl_dump_dests, + }, + { + .cmd = IPVS_CMD_NEW_DAEMON, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_DEL_DAEMON, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_DAEMON, + .flags = GENL_ADMIN_PERM, + .dumpit = ip_vs_genl_dump_daemons, + }, + { + .cmd = IPVS_CMD_SET_CONFIG, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_CONFIG, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + }, + { + .cmd = IPVS_CMD_GET_INFO, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + }, + { + .cmd = IPVS_CMD_ZERO, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_FLUSH, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_set_cmd, + }, +}; + +static int __init ip_vs_genl_register(void) +{ + return genl_register_family_with_ops(&ip_vs_genl_family, + ip_vs_genl_ops, ARRAY_SIZE(ip_vs_genl_ops)); +} + +static void ip_vs_genl_unregister(void) +{ + genl_unregister_family(&ip_vs_genl_family); +} + +/* End of Generic Netlink interface definitions */ + +/* + * per netns intit/exit func. 
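
Before the per-netns init code picks up below, one note on the Generic Netlink plumbing above: every command payload is a set of nested attribute containers (nla_nest_start()/nla_nest_end()) holding NLA_PUT_*() records. The fragment below is only a rough user-space illustration of the type-length-value layout those helpers emit; the attribute numbers and values are invented, and only the uapi definitions from <linux/netlink.h> are assumed.

#include <linux/netlink.h>	/* struct nlattr, NLA_HDRLEN, NLA_ALIGN, NLA_F_NESTED */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Append one u32 attribute at offset 'off', return the new aligned offset. */
static int put_u32(char *buf, int off, uint16_t type, uint32_t value)
{
	struct nlattr *nla = (struct nlattr *)(buf + off);

	nla->nla_type = type;
	nla->nla_len  = NLA_HDRLEN + sizeof(value);
	memcpy((char *)nla + NLA_HDRLEN, &value, sizeof(value));
	return off + NLA_ALIGN(nla->nla_len);
}

int main(void)
{
	uint32_t storage[32] = { 0 };
	char *buf = (char *)storage;
	struct nlattr *nest = (struct nlattr *)buf;	/* like nla_nest_start() */
	int off = NLA_HDRLEN;				/* room for the container header */

	off = put_u32(buf, off, 1, 2048);		/* say, a timeout attribute */
	off = put_u32(buf, off, 2, 0xffffff00);		/* say, a netmask attribute */

	nest->nla_type = 1 | NLA_F_NESTED;		/* the nested container type */
	nest->nla_len  = (uint16_t)off;			/* like nla_nest_end() */

	printf("nested container occupies %d bytes\n", off);
	return 0;
}
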
+ */ +#ifdef CONFIG_SYSCTL +int __net_init __ip_vs_control_init_sysctl(struct net *net) +{ + int idx; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ctl_table *tbl; + + atomic_set(&ipvs->dropentry, 0); + spin_lock_init(&ipvs->dropentry_lock); + spin_lock_init(&ipvs->droppacket_lock); + spin_lock_init(&ipvs->securetcp_lock); + + if (!net_eq(net, &init_net)) { + tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); + if (tbl == NULL) + return -ENOMEM; + } else + tbl = vs_vars; + /* Initialize sysctl defaults */ + idx = 0; + ipvs->sysctl_amemthresh = 1024; + tbl[idx++].data = &ipvs->sysctl_amemthresh; + ipvs->sysctl_am_droprate = 10; + tbl[idx++].data = &ipvs->sysctl_am_droprate; + tbl[idx++].data = &ipvs->sysctl_drop_entry; + tbl[idx++].data = &ipvs->sysctl_drop_packet; +#ifdef CONFIG_IP_VS_NFCT + tbl[idx++].data = &ipvs->sysctl_conntrack; +#endif + tbl[idx++].data = &ipvs->sysctl_secure_tcp; + ipvs->sysctl_snat_reroute = 1; + tbl[idx++].data = &ipvs->sysctl_snat_reroute; + ipvs->sysctl_sync_ver = 1; + tbl[idx++].data = &ipvs->sysctl_sync_ver; + tbl[idx++].data = &ipvs->sysctl_cache_bypass; + tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; + tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template; + ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD; + ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD; + tbl[idx].data = &ipvs->sysctl_sync_threshold; + tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); + tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; + + + ipvs->sysctl_hdr = register_net_sysctl_table(net, net_vs_ctl_path, + tbl); + if (ipvs->sysctl_hdr == NULL) { + if (!net_eq(net, &init_net)) + kfree(tbl); + return -ENOMEM; + } + ip_vs_start_estimator(net, &ipvs->tot_stats); + ipvs->sysctl_tbl = tbl; + /* Schedule defense work */ + INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); + schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); + + return 0; +} + +void __net_init __ip_vs_control_cleanup_sysctl(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + cancel_delayed_work_sync(&ipvs->defense_work); + cancel_work_sync(&ipvs->defense_work.work); + unregister_net_sysctl_table(ipvs->sysctl_hdr); +} + +#else + +int __net_init __ip_vs_control_init_sysctl(struct net *net) { return 0; } +void __net_init __ip_vs_control_cleanup_sysctl(struct net *net) { } + +#endif + +static struct notifier_block ip_vs_dst_notifier = { + .notifier_call = ip_vs_dst_event, +}; + +int __net_init __ip_vs_control_init(struct net *net) +{ + int idx; + struct netns_ipvs *ipvs = net_ipvs(net); + + ipvs->rs_lock = __RW_LOCK_UNLOCKED(ipvs->rs_lock); + + /* Initialize rs_table */ + for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) + INIT_LIST_HEAD(&ipvs->rs_table[idx]); + + INIT_LIST_HEAD(&ipvs->dest_trash); + atomic_set(&ipvs->ftpsvc_counter, 0); + atomic_set(&ipvs->nullsvc_counter, 0); + + /* procfs stats */ + ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!ipvs->tot_stats.cpustats) { + pr_err("%s(): alloc_percpu.\n", __func__); + return -ENOMEM; + } + spin_lock_init(&ipvs->tot_stats.lock); + + proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops); + proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops); + proc_net_fops_create(net, "ip_vs_stats_percpu", 0, + &ip_vs_stats_percpu_fops); + + if (__ip_vs_control_init_sysctl(net)) + goto err; + + return 0; + +err: + free_percpu(ipvs->tot_stats.cpustats); + return -ENOMEM; +} + +void __net_exit __ip_vs_control_cleanup(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + 
ip_vs_trash_cleanup(net); + ip_vs_stop_estimator(net, &ipvs->tot_stats); + __ip_vs_control_cleanup_sysctl(net); + proc_net_remove(net, "ip_vs_stats_percpu"); + proc_net_remove(net, "ip_vs_stats"); + proc_net_remove(net, "ip_vs"); + free_percpu(ipvs->tot_stats.cpustats); +} + +int __init ip_vs_control_init(void) +{ + int idx; + int ret; + + EnterFunction(2); + + /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_svc_table[idx]); + INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); + } + + smp_wmb(); /* Do we really need it now ? */ + + ret = nf_register_sockopt(&ip_vs_sockopts); + if (ret) { + pr_err("cannot register sockopt.\n"); + goto err_sock; + } + + ret = ip_vs_genl_register(); + if (ret) { + pr_err("cannot register Generic Netlink interface.\n"); + goto err_genl; + } + + ret = register_netdevice_notifier(&ip_vs_dst_notifier); + if (ret < 0) + goto err_notf; + + LeaveFunction(2); + return 0; + +err_notf: + ip_vs_genl_unregister(); +err_genl: + nf_unregister_sockopt(&ip_vs_sockopts); +err_sock: + return ret; +} + + +void ip_vs_control_cleanup(void) +{ + EnterFunction(2); + unregister_netdevice_notifier(&ip_vs_dst_notifier); + ip_vs_genl_unregister(); + nf_unregister_sockopt(&ip_vs_sockopts); + LeaveFunction(2); +} diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c new file mode 100644 index 00000000..95fd0d14 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -0,0 +1,271 @@ +/* + * IPVS: Destination Hashing scheduling module + * + * Authors: Wensong Zhang + * + * Inspired by the consistent hashing scheduler patch from + * Thomas Proell + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The dh algorithm is to select server by the hash key of destination IP + * address. The pseudo code is as follows: + * + * n <- servernode[dest_ip]; + * if (n is dead) OR + * (n is overloaded) OR (n.weight <= 0) then + * return NULL; + * + * return n; + * + * Notes that servernode is a 256-bucket hash table that maps the hash + * index derived from packet destination IP address to the current server + * array. If the dh scheduler is used in cache cluster, it is good to + * combine it with cache_bypass feature. When the statically assigned + * server is dead or overloaded, the load balancer can bypass the cache + * server and send requests to the original server directly. 
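
The pseudo code above maps one-to-one onto a small amount of arithmetic. Below is a stand-alone user-space sketch, with a simplified server record invented for the example; the 2654435761 multiplier is the same golden-ratio hash used by the scheduler further down.

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>		/* ntohl(), inet_addr() */

#define DH_TAB_SIZE 256
#define DH_TAB_MASK (DH_TAB_SIZE - 1)

struct server {
	int alive;		/* stands in for IP_VS_DEST_F_AVAILABLE */
	int overloaded;		/* stands in for IP_VS_DEST_F_OVERLOAD  */
	int weight;
	const char *name;
};

/* servernode[] from the pseudo code: one slot per hash bucket. */
static struct server *servernode[DH_TAB_SIZE];

static unsigned dh_hashkey(uint32_t daddr_be)
{
	return (ntohl(daddr_be) * 2654435761UL) & DH_TAB_MASK;
}

static struct server *dh_schedule(uint32_t daddr_be)
{
	struct server *n = servernode[dh_hashkey(daddr_be)];

	/* "if (n is dead) OR (n is overloaded) OR (n.weight <= 0) then return NULL" */
	if (!n || !n->alive || n->overloaded || n->weight <= 0)
		return NULL;
	return n;
}

int main(void)
{
	static struct server cache1 = { 1, 0, 1, "cache1" };
	uint32_t dip = inet_addr("192.0.2.10");
	int i;

	for (i = 0; i < DH_TAB_SIZE; i++)	/* toy assignment: one server everywhere */
		servernode[i] = &cache1;

	printf("192.0.2.10 -> %s\n", dh_schedule(dip)->name);
	return 0;
}
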
+ * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include + +#include + + +/* + * IPVS DH bucket + */ +struct ip_vs_dh_bucket { + struct ip_vs_dest *dest; /* real server (cache) */ +}; + +/* + * for IPVS DH entry hash table + */ +#ifndef CONFIG_IP_VS_DH_TAB_BITS +#define CONFIG_IP_VS_DH_TAB_BITS 8 +#endif +#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS +#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS) +#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1) + + +/* + * Returns hash value for IPVS DH entry + */ +static inline unsigned ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr) +{ + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + return (ntohl(addr_fold)*2654435761UL) & IP_VS_DH_TAB_MASK; +} + + +/* + * Get ip_vs_dest associated with supplied parameters. + */ +static inline struct ip_vs_dest * +ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl, + const union nf_inet_addr *addr) +{ + return (tbl[ip_vs_dh_hashkey(af, addr)]).dest; +} + + +/* + * Assign all the hash buckets of the specified table with the service. + */ +static int +ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc) +{ + int i; + struct ip_vs_dh_bucket *b; + struct list_head *p; + struct ip_vs_dest *dest; + + b = tbl; + p = &svc->destinations; + for (i=0; idest = NULL; + } else { + if (p == &svc->destinations) + p = p->next; + + dest = list_entry(p, struct ip_vs_dest, n_list); + atomic_inc(&dest->refcnt); + b->dest = dest; + + p = p->next; + } + b++; + } + return 0; +} + + +/* + * Flush all the hash buckets of the specified table. + */ +static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) +{ + int i; + struct ip_vs_dh_bucket *b; + + b = tbl; + for (i=0; idest) { + atomic_dec(&b->dest->refcnt); + b->dest = NULL; + } + b++; + } +} + + +static int ip_vs_dh_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl; + + /* allocate the DH table for this service */ + tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE, + GFP_ATOMIC); + if (tbl == NULL) { + pr_err("%s(): no memory\n", __func__); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); + + /* assign the hash buckets with the updated service */ + ip_vs_dh_assign(tbl, svc); + + return 0; +} + + +static int ip_vs_dh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_dh_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n", + sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); + + return 0; +} + + +static int ip_vs_dh_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_dh_flush(tbl); + + /* assign the hash buckets with the updated service */ + ip_vs_dh_assign(tbl, svc); + + return 0; +} + + +/* + * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, + * consider that the server is overloaded here. 
+ */ +static inline int is_overloaded(struct ip_vs_dest *dest) +{ + return dest->flags & IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Destination hashing scheduling + */ +static struct ip_vs_dest * +ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_dh_bucket *tbl; + struct ip_vs_iphdr iph; + + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + tbl = (struct ip_vs_dh_bucket *)svc->sched_data; + dest = ip_vs_dh_get(svc->af, tbl, &iph.daddr); + if (!dest + || !(dest->flags & IP_VS_DEST_F_AVAILABLE) + || atomic_read(&dest->weight) <= 0 + || is_overloaded(dest)) { + return NULL; + } + + IP_VS_DBG_BUF(6, "DH: destination IP address %s --> server %s:%d\n", + IP_VS_DBG_ADDR(svc->af, &iph.daddr), + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS DH Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_dh_scheduler = +{ + .name = "dh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list), + .init_service = ip_vs_dh_init_svc, + .done_service = ip_vs_dh_done_svc, + .update_service = ip_vs_dh_update_svc, + .schedule = ip_vs_dh_schedule, +}; + + +static int __init ip_vs_dh_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_dh_scheduler); +} + + +static void __exit ip_vs_dh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); +} + + +module_init(ip_vs_dh_init); +module_exit(ip_vs_dh_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c new file mode 100644 index 00000000..508cce98 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -0,0 +1,218 @@ +/* + * ip_vs_est.c: simple rate estimator for IPVS + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: Hans Schillstrom + * Network name space (netns) aware. + * Global data moved to netns i.e struct netns_ipvs + * Affected data: est_list and est_lock. + * estimation_timer() runs with timer per netns. + * get_stats()) do the per cpu summing. + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include + +#include + +/* + This code is to estimate rate in a shorter interval (such as 8 + seconds) for virtual services and real servers. For measure rate in a + long interval, it is easy to implement a user level daemon which + periodically reads those statistical counters and measure rate. + + Currently, the measurement is activated by slow timer handler. Hope + this measurement will not introduce too much load. + + We measure rate during the last 8 seconds every 2 seconds: + + avgrate = avgrate*(1-W) + rate*W + + where W = 2^(-2) + + NOTES. + + * The stored value for average bps is scaled by 2^5, so that maximal + rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10. 
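
The smoothing rule above, avgrate = avgrate*(1-W) + rate*W with W = 2^-2, plus the power-of-two scaling, is easy to exercise on its own. The sketch below is a stand-alone rendition of that fixed-point arithmetic (names and sample numbers invented), mirroring the shift-based updates in estimation_timer() further down.

#include <stdio.h>
#include <stdint.h>

/* One estimator slot: the moving average is kept scaled by 2^10 (packets,
 * connections), so integer shifts replace the floating-point weight. */
struct rate_est {
	uint32_t last_pkts;	/* counter value at the previous tick */
	long avg_pps_x1024;	/* average packets/s, scaled by 2^10  */
};

/* Called every 2 seconds with the current packet counter. */
static void est_tick(struct rate_est *e, uint32_t pkts)
{
	/* packets in the last 2s, scaled by 2^10 and divided by 2: "<< 9" */
	long rate = (long)(pkts - e->last_pkts) << 9;

	e->last_pkts = pkts;
	/* avg += (rate - avg) * W  with W = 2^-2, i.e. ">> 2" */
	e->avg_pps_x1024 += (rate - e->avg_pps_x1024) >> 2;
}

int main(void)
{
	struct rate_est e = { 0, 0 };
	uint32_t pkts = 0;
	int tick;

	for (tick = 1; tick <= 8; tick++) {	/* 16 seconds at 1000 pkt/s */
		pkts += 2000;
		est_tick(&e, pkts);
		printf("tick %d: ~%ld pkt/s\n", tick, e.avg_pps_x1024 >> 10);
	}
	return 0;
}
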
+ + * A lot code is taken from net/sched/estimator.c + */ + + +/* + * Make a summary from each cpu + */ +static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum, + struct ip_vs_cpu_stats *stats) +{ + int i; + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i); + unsigned int start; + __u64 inbytes, outbytes; + if (i) { + sum->conns += s->ustats.conns; + sum->inpkts += s->ustats.inpkts; + sum->outpkts += s->ustats.outpkts; + do { + start = u64_stats_fetch_begin(&s->syncp); + inbytes = s->ustats.inbytes; + outbytes = s->ustats.outbytes; + } while (u64_stats_fetch_retry(&s->syncp, start)); + sum->inbytes += inbytes; + sum->outbytes += outbytes; + } else { + sum->conns = s->ustats.conns; + sum->inpkts = s->ustats.inpkts; + sum->outpkts = s->ustats.outpkts; + do { + start = u64_stats_fetch_begin(&s->syncp); + sum->inbytes = s->ustats.inbytes; + sum->outbytes = s->ustats.outbytes; + } while (u64_stats_fetch_retry(&s->syncp, start)); + } + } +} + + +static void estimation_timer(unsigned long arg) +{ + struct ip_vs_estimator *e; + struct ip_vs_stats *s; + u32 n_conns; + u32 n_inpkts, n_outpkts; + u64 n_inbytes, n_outbytes; + u32 rate; + struct net *net = (struct net *)arg; + struct netns_ipvs *ipvs; + + ipvs = net_ipvs(net); + spin_lock(&ipvs->est_lock); + list_for_each_entry(e, &ipvs->est_list, list) { + s = container_of(e, struct ip_vs_stats, est); + + spin_lock(&s->lock); + ip_vs_read_cpu_stats(&s->ustats, s->cpustats); + n_conns = s->ustats.conns; + n_inpkts = s->ustats.inpkts; + n_outpkts = s->ustats.outpkts; + n_inbytes = s->ustats.inbytes; + n_outbytes = s->ustats.outbytes; + + /* scaled by 2^10, but divided 2 seconds */ + rate = (n_conns - e->last_conns) << 9; + e->last_conns = n_conns; + e->cps += ((long)rate - (long)e->cps) >> 2; + + rate = (n_inpkts - e->last_inpkts) << 9; + e->last_inpkts = n_inpkts; + e->inpps += ((long)rate - (long)e->inpps) >> 2; + + rate = (n_outpkts - e->last_outpkts) << 9; + e->last_outpkts = n_outpkts; + e->outpps += ((long)rate - (long)e->outpps) >> 2; + + rate = (n_inbytes - e->last_inbytes) << 4; + e->last_inbytes = n_inbytes; + e->inbps += ((long)rate - (long)e->inbps) >> 2; + + rate = (n_outbytes - e->last_outbytes) << 4; + e->last_outbytes = n_outbytes; + e->outbps += ((long)rate - (long)e->outbps) >> 2; + spin_unlock(&s->lock); + } + spin_unlock(&ipvs->est_lock); + mod_timer(&ipvs->est_timer, jiffies + 2*HZ); +} + +void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_estimator *est = &stats->est; + + INIT_LIST_HEAD(&est->list); + + spin_lock_bh(&ipvs->est_lock); + list_add(&est->list, &ipvs->est_list); + spin_unlock_bh(&ipvs->est_lock); +} + +void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_estimator *est = &stats->est; + + spin_lock_bh(&ipvs->est_lock); + list_del(&est->list); + spin_unlock_bh(&ipvs->est_lock); +} + +void ip_vs_zero_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est = &stats->est; + struct ip_vs_stats_user *u = &stats->ustats; + + /* reset counters, caller must hold the stats->lock lock */ + est->last_inbytes = u->inbytes; + est->last_outbytes = u->outbytes; + est->last_conns = u->conns; + est->last_inpkts = u->inpkts; + est->last_outpkts = u->outpkts; + est->cps = 0; + est->inpps = 0; + est->outpps = 0; + est->inbps = 0; + est->outbps = 0; +} + +/* Get decoded rates */ +void ip_vs_read_estimator(struct ip_vs_stats_user *dst, 
+ struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *e = &stats->est; + + dst->cps = (e->cps + 0x1FF) >> 10; + dst->inpps = (e->inpps + 0x1FF) >> 10; + dst->outpps = (e->outpps + 0x1FF) >> 10; + dst->inbps = (e->inbps + 0xF) >> 5; + dst->outbps = (e->outbps + 0xF) >> 5; +} + +int __net_init __ip_vs_estimator_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + INIT_LIST_HEAD(&ipvs->est_list); + spin_lock_init(&ipvs->est_lock); + setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)net); + mod_timer(&ipvs->est_timer, jiffies + 2 * HZ); + return 0; +} + +void __net_exit __ip_vs_estimator_cleanup(struct net *net) +{ + del_timer_sync(&net_ipvs(net)->est_timer); +} + +int __init ip_vs_estimator_init(void) +{ + return 0; +} + +void ip_vs_estimator_cleanup(void) +{ +} diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c new file mode 100644 index 00000000..af63553f --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -0,0 +1,480 @@ +/* + * ip_vs_ftp.c: IPVS ftp application module + * + * Authors: Wensong Zhang + * + * Changes: + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference + * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp. + * + * IP_MASQ_FTP ftp masquerading module + * + * Version: @(#)ip_masq_ftp.c 0.04 02/05/96 + * + * Author: Wouter Gadeyne + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +#define SERVER_STRING "227 Entering Passive Mode (" +#define CLIENT_STRING "PORT " + + +/* + * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper + * First port is set to the default port. + */ +static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0}; +module_param_array(ports, ushort, NULL, 0); +MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands"); + + +/* Dummy variable */ +static int ip_vs_ftp_pasv; + + +static int +ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) +{ + /* We use connection tracking for the command connection */ + cp->flags |= IP_VS_CONN_F_NFCT; + return 0; +} + + +static int +ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) +{ + return 0; +} + + +/* + * Get from the string "xxx.xxx.xxx.xxx,ppp,ppp", started + * with the "pattern" and terminated with the "term" character. + * is in network order. 
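
The helper documented above parses the classic FTP encoding: six comma-separated decimal octets, the last two being the high and low byte of the port. A stand-alone user-space sketch of the same conversion (function name invented, host byte order kept for brevity; the kernel helper keeps network order):

#include <stdio.h>
#include <stdint.h>

/* Parse "h1,h2,h3,h4,p1,p2" into an IPv4 address and a port.
 * Returns 1 on success, 0 on malformed input. */
static int parse_ftp_addrport(const char *s, uint32_t *addr, uint16_t *port)
{
	unsigned int b[6];

	if (sscanf(s, "%u,%u,%u,%u,%u,%u",
		   &b[0], &b[1], &b[2], &b[3], &b[4], &b[5]) != 6)
		return 0;
	for (int i = 0; i < 6; i++)
		if (b[i] > 255)
			return 0;

	*addr = (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
	*port = (uint16_t)(b[4] * 256 + b[5]);	/* high byte, low byte */
	return 1;
}

int main(void)
{
	uint32_t addr;
	uint16_t port;

	/* e.g. the payload of a "227 Entering Passive Mode (...)" reply */
	if (parse_ftp_addrport("192,0,2,1,19,137", &addr, &port))
		printf("addr=0x%08x port=%u\n", addr, port);	/* port 5001 */
	return 0;
}
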
+ */ +static int ip_vs_ftp_get_addrport(char *data, char *data_limit, + const char *pattern, size_t plen, char term, + __be32 *addr, __be16 *port, + char **start, char **end) +{ + unsigned char p[6]; + int i = 0; + + if (data_limit - data < plen) { + /* check if there is partial match */ + if (strnicmp(data, pattern, data_limit - data) == 0) + return -1; + else + return 0; + } + + if (strnicmp(data, pattern, plen) != 0) { + return 0; + } + *start = data + plen; + + for (data = *start; *data != term; data++) { + if (data == data_limit) + return -1; + } + *end = data; + + memset(p, 0, sizeof(p)); + for (data = *start; data != *end; data++) { + if (*data >= '0' && *data <= '9') { + p[i] = p[i]*10 + *data - '0'; + } else if (*data == ',' && i < 5) { + i++; + } else { + /* unexpected character */ + return -1; + } + } + + if (i != 5) + return -1; + + *addr = get_unaligned((__be32 *)p); + *port = get_unaligned((__be16 *)(p + 4)); + return 1; +} + +/* + * Look at outgoing ftp packets to catch the response to a PASV command + * from the server (inside-to-outside). + * When we see one, we build a connection entry with the client address, + * client port 0 (unknown at the moment), the server address and the + * server port. Mark the current connection entry as a control channel + * of the new entry. All this work is just to make the data connection + * can be scheduled to the right server later. + * + * The outgoing packet should be something like + * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". + * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. + */ +static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, + struct sk_buff *skb, int *diff) +{ + struct iphdr *iph; + struct tcphdr *th; + char *data, *data_limit; + char *start, *end; + union nf_inet_addr from; + __be16 port; + struct ip_vs_conn *n_cp; + char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ + unsigned buf_len; + int ret = 0; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + struct net *net; + +#ifdef CONFIG_IP_VS_IPV6 + /* This application helper doesn't work with IPv6 yet, + * so turn this into a no-op for IPv6 packets + */ + if (cp->af == AF_INET6) + return 1; +#endif + + *diff = 0; + + /* Only useful for established sessions */ + if (cp->state != IP_VS_TCP_S_ESTABLISHED) + return 1; + + /* Linear packets are much easier to deal with. 
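
For the opposite direction, when the PASV reply described above is rewritten, an IPv4 address and port have to be re-encoded into the same comma-separated form, as the snprintf() a little further down does. A minimal stand-alone version of just that formatting step (host-order input assumed for the sketch):

#include <stdio.h>
#include <stdint.h>

/* Encode an IPv4 address (host order) and port as "h1,h2,h3,h4,p1,p2",
 * the form substituted into the rewritten passive-mode reply. */
static void format_ftp_addrport(char *buf, size_t len,
				uint32_t addr, uint16_t port)
{
	snprintf(buf, len, "%u,%u,%u,%u,%u,%u",
		 (unsigned)((addr >> 24) & 0xFF), (unsigned)((addr >> 16) & 0xFF),
		 (unsigned)((addr >> 8) & 0xFF), (unsigned)(addr & 0xFF),
		 (unsigned)(port >> 8), (unsigned)(port & 0xFF));
}

int main(void)
{
	char buf[24];	/* "xxx,xxx,xxx,xxx,ppp,ppp" plus NUL, as in the helper */

	format_ftp_addrport(buf, sizeof(buf), 0xC0000201, 5001);
	printf("%s\n", buf);	/* 192,0,2,1,19,137 */
	return 0;
}
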
*/ + if (!skb_make_writable(skb, skb->len)) + return 0; + + if (cp->app_data == &ip_vs_ftp_pasv) { + iph = ip_hdr(skb); + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + data = (char *)th + (th->doff << 2); + data_limit = skb_tail_pointer(skb); + + if (ip_vs_ftp_get_addrport(data, data_limit, + SERVER_STRING, + sizeof(SERVER_STRING)-1, ')', + &from.ip, &port, + &start, &end) != 1) + return 1; + + IP_VS_DBG(7, "PASV response (%pI4:%d) -> %pI4:%d detected\n", + &from.ip, ntohs(port), &cp->caddr.ip, 0); + + /* + * Now update or create an connection entry for it + */ + { + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET, + iph->protocol, &from, port, + &cp->caddr, 0, &p); + n_cp = ip_vs_conn_out_get(&p); + } + if (!n_cp) { + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(ip_vs_conn_net(cp), + AF_INET, IPPROTO_TCP, &cp->caddr, + 0, &cp->vaddr, port, &p); + n_cp = ip_vs_conn_new(&p, &from, port, + IP_VS_CONN_F_NO_CPORT | + IP_VS_CONN_F_NFCT, + cp->dest, skb->mark); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } + + /* + * Replace the old passive address with the new one + */ + from.ip = n_cp->vaddr.ip; + port = n_cp->vport; + snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u", + ((unsigned char *)&from.ip)[0], + ((unsigned char *)&from.ip)[1], + ((unsigned char *)&from.ip)[2], + ((unsigned char *)&from.ip)[3], + ntohs(port) >> 8, + ntohs(port) & 0xFF); + + buf_len = strlen(buf); + + ct = nf_ct_get(skb, &ctinfo); + if (ct && !nf_ct_is_untracked(ct) && nfct_nat(ct)) { + /* If mangling fails this function will return 0 + * which will cause the packet to be dropped. + * Mangling can only fail under memory pressure, + * hopefully it will succeed on the retransmitted + * packet. + */ + ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, + start-data, end-start, + buf, buf_len); + if (ret) { + ip_vs_nfct_expect_related(skb, ct, n_cp, + IPPROTO_TCP, 0, 0); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_UNNECESSARY; + /* csum is updated */ + ret = 1; + } + } + + /* + * Not setting 'diff' is intentional, otherwise the sequence + * would be adjusted twice. + */ + + net = skb_net(skb); + cp->app_data = NULL; + ip_vs_tcp_conn_listen(net, n_cp); + ip_vs_conn_put(n_cp); + return ret; + } + return 1; +} + + +/* + * Look at incoming ftp packets to catch the PASV/PORT command + * (outside-to-inside). + * + * The incoming packet having the PORT command should be something like + * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n". + * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number. + * In this case, we create a connection entry using the client address and + * port, so that the active ftp data connection from the server can reach + * the client. + */ +static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, + struct sk_buff *skb, int *diff) +{ + struct iphdr *iph; + struct tcphdr *th; + char *data, *data_start, *data_limit; + char *start, *end; + union nf_inet_addr to; + __be16 port; + struct ip_vs_conn *n_cp; + struct net *net; + +#ifdef CONFIG_IP_VS_IPV6 + /* This application helper doesn't work with IPv6 yet, + * so turn this into a no-op for IPv6 packets + */ + if (cp->af == AF_INET6) + return 1; +#endif + + /* no diff required for incoming packets */ + *diff = 0; + + /* Only useful for established sessions */ + if (cp->state != IP_VS_TCP_S_ESTABLISHED) + return 1; + + /* Linear packets are much easier to deal with. 
*/ + if (!skb_make_writable(skb, skb->len)) + return 0; + + /* + * Detecting whether it is passive + */ + iph = ip_hdr(skb); + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + + /* Since there may be OPTIONS in the TCP packet and the HLEN is + the length of the header in 32-bit multiples, it is accurate + to calculate data address by th+HLEN*4 */ + data = data_start = (char *)th + (th->doff << 2); + data_limit = skb_tail_pointer(skb); + + while (data <= data_limit - 6) { + if (strnicmp(data, "PASV\r\n", 6) == 0) { + /* Passive mode on */ + IP_VS_DBG(7, "got PASV at %td of %td\n", + data - data_start, + data_limit - data_start); + cp->app_data = &ip_vs_ftp_pasv; + return 1; + } + data++; + } + + /* + * To support virtual FTP server, the scenerio is as follows: + * FTP client ----> Load Balancer ----> FTP server + * First detect the port number in the application data, + * then create a new connection entry for the coming data + * connection. + */ + if (ip_vs_ftp_get_addrport(data_start, data_limit, + CLIENT_STRING, sizeof(CLIENT_STRING)-1, + '\r', &to.ip, &port, + &start, &end) != 1) + return 1; + + IP_VS_DBG(7, "PORT %pI4:%d detected\n", &to.ip, ntohs(port)); + + /* Passive mode off */ + cp->app_data = NULL; + + /* + * Now update or create a connection entry for it + */ + IP_VS_DBG(7, "protocol %s %pI4:%d %pI4:%d\n", + ip_vs_proto_name(iph->protocol), + &to.ip, ntohs(port), &cp->vaddr.ip, 0); + + { + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET, + iph->protocol, &to, port, &cp->vaddr, + htons(ntohs(cp->vport)-1), &p); + n_cp = ip_vs_conn_in_get(&p); + if (!n_cp) { + n_cp = ip_vs_conn_new(&p, &cp->daddr, + htons(ntohs(cp->dport)-1), + IP_VS_CONN_F_NFCT, cp->dest, + skb->mark); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } + } + + /* + * Move tunnel to listen state + */ + net = skb_net(skb); + ip_vs_tcp_conn_listen(net, n_cp); + ip_vs_conn_put(n_cp); + + return 1; +} + + +static struct ip_vs_app ip_vs_ftp = { + .name = "ftp", + .type = IP_VS_APP_TYPE_FTP, + .protocol = IPPROTO_TCP, + .module = THIS_MODULE, + .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list), + .init_conn = ip_vs_ftp_init_conn, + .done_conn = ip_vs_ftp_done_conn, + .bind_conn = NULL, + .unbind_conn = NULL, + .pkt_out = ip_vs_ftp_out, + .pkt_in = ip_vs_ftp_in, +}; + +/* + * per netns ip_vs_ftp initialization + */ +static int __net_init __ip_vs_ftp_init(struct net *net) +{ + int i, ret; + struct ip_vs_app *app; + struct netns_ipvs *ipvs = net_ipvs(net); + + app = kmemdup(&ip_vs_ftp, sizeof(struct ip_vs_app), GFP_KERNEL); + if (!app) + return -ENOMEM; + INIT_LIST_HEAD(&app->a_list); + INIT_LIST_HEAD(&app->incs_list); + ipvs->ftp_app = app; + + ret = register_ip_vs_app(net, app); + if (ret) + goto err_exit; + + for (i=0; iprotocol, ports[i]); + if (ret) + goto err_unreg; + pr_info("%s: loaded support on port[%d] = %d\n", + app->name, i, ports[i]); + } + return 0; + +err_unreg: + unregister_ip_vs_app(net, app); +err_exit: + kfree(ipvs->ftp_app); + return ret; +} +/* + * netns exit + */ +static void __ip_vs_ftp_exit(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + unregister_ip_vs_app(net, ipvs->ftp_app); + kfree(ipvs->ftp_app); +} + +static struct pernet_operations ip_vs_ftp_ops = { + .init = __ip_vs_ftp_init, + .exit = __ip_vs_ftp_exit, +}; + +int __init ip_vs_ftp_init(void) +{ + int rv; + + rv = register_pernet_subsys(&ip_vs_ftp_ops); + return rv; +} + +/* + * ip_vs_ftp finish. 
+ */ +static void __exit ip_vs_ftp_exit(void) +{ + unregister_pernet_subsys(&ip_vs_ftp_ops); +} + + +module_init(ip_vs_ftp_init); +module_exit(ip_vs_ftp_exit); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c new file mode 100644 index 00000000..87e40ea7 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -0,0 +1,625 @@ +/* + * IPVS: Locality-Based Least-Connection scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Martin Hamilton : fixed the terrible locking bugs + * *lock(tbl->lock) ==> *lock(&tbl->lock) + * Wensong Zhang : fixed the uninitialized tbl->lock bug + * Wensong Zhang : added doing full expiration check to + * collect stale entries of 24+ hours when + * no partial expire check in a half hour + * Julian Anastasov : replaced del_timer call with del_timer_sync + * to avoid the possible race between timer + * handler and del_timer thread in SMP + * + */ + +/* + * The lblc algorithm is as follows (pseudo code): + * + * if cachenode[dest_ip] is null then + * n, cachenode[dest_ip] <- {weighted least-conn node}; + * else + * n <- cachenode[dest_ip]; + * if (n is dead) OR + * (n.conns>n.weight AND + * there is a node m with m.conns +#include +#include +#include +#include +#include + +/* for sysctl */ +#include +#include + +#include + + +/* + * It is for garbage collection of stale IPVS lblc entries, + * when the table is full. + */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (6*60*HZ) + +#define DEFAULT_EXPIRATION (24*60*60*HZ) + +/* + * It is for full expiration check. + * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day. 
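
Leaving the expiration machinery aside, the scheduling policy itself (see the lblc pseudo code earlier in this file) is: keep reusing the node cached for a destination while it stays usable, otherwise fall back to a least-connection pick and refresh the cache. Below is a compressed user-space sketch with invented types; the overload escape hatch from the pseudo code is left out, and the weighted comparison is shown separately further down.

#include <stdio.h>

struct node {
	const char *name;
	int alive;
	int conns;
	int weight;
};

/* Weighting is omitted in this toy fallback; the real scheduler uses the
 * integer cross-multiplication comparison instead of conns/weight. */
static struct node *pick_fallback(struct node *nodes, int n)
{
	struct node *best = NULL;

	for (int i = 0; i < n; i++)
		if (nodes[i].alive && nodes[i].weight > 0 &&
		    (!best || nodes[i].conns < best->conns))
			best = &nodes[i];
	return best;
}

/* cachenode[dest_key]: one remembered server per destination, as in the
 * lblc pseudo code quoted earlier in this file. */
static struct node *lblc_pick(struct node **cachenode, int dest_key,
			      struct node *nodes, int n)
{
	struct node *cached = cachenode[dest_key];

	if (cached && cached->alive && cached->weight > 0)
		return cached;				/* sticky hit */

	cachenode[dest_key] = pick_fallback(nodes, n);	/* rebalance and cache */
	return cachenode[dest_key];
}

int main(void)
{
	struct node rs[2] = { { "rs1", 1, 10, 1 }, { "rs2", 1, 2, 1 } };
	struct node *cache[1] = { NULL };

	printf("first pick : %s\n", lblc_pick(cache, 0, rs, 2)->name);
	rs[1].conns = 50;	/* load changes, but the mapping stays sticky */
	printf("second pick: %s\n", lblc_pick(cache, 0, rs, 2)->name);
	return 0;
}
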
+ */ +#define COUNT_FOR_FULL_EXPIRATION 30 + + +/* + * for IPVS lblc entry hash table + */ +#ifndef CONFIG_IP_VS_LBLC_TAB_BITS +#define CONFIG_IP_VS_LBLC_TAB_BITS 10 +#endif +#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS +#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) +#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) + + +/* + * IPVS lblc entry represents an association between destination + * IP address and its destination server + */ +struct ip_vs_lblc_entry { + struct list_head list; + int af; /* address family */ + union nf_inet_addr addr; /* destination IP address */ + struct ip_vs_dest *dest; /* real server (cache) */ + unsigned long lastuse; /* last used time */ +}; + + +/* + * IPVS lblc hash table + */ +struct ip_vs_lblc_table { + struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + struct timer_list periodic_timer; /* collect stale entries */ + int rover; /* rover for expire check */ + int counter; /* counter for no expire */ +}; + + +/* + * IPVS LBLC sysctl table + */ +#ifdef CONFIG_SYSCTL +static ctl_table vs_vars_table[] = { + { + .procname = "lblc_expiration", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif + +static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) +{ + list_del(&en->list); + /* + * We don't kfree dest because it is referred either by its service + * or the trash dest list. + */ + atomic_dec(&en->dest->refcnt); + kfree(en); +} + + +/* + * Returns hash value for IPVS LBLC entry + */ +static inline unsigned +ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr) +{ + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK; +} + + +/* + * Hash an entry in the ip_vs_lblc_table. + * returns bool success. + */ +static void +ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) +{ + unsigned hash = ip_vs_lblc_hashkey(en->af, &en->addr); + + list_add(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); +} + + +/* + * Get ip_vs_lblc_entry associated with supplied parameters. Called under read + * lock + */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, + const union nf_inet_addr *addr) +{ + unsigned hash = ip_vs_lblc_hashkey(af, addr); + struct ip_vs_lblc_entry *en; + + list_for_each_entry(en, &tbl->bucket[hash], list) + if (ip_vs_addr_equal(af, &en->addr, addr)) + return en; + + return NULL; +} + + +/* + * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP + * address to a server. Called under write lock. 
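
The update rule described above (create the mapping if it is missing, or move the reference from the old server to the new one if it changed) can be shown in miniature with a manual reference count; all names below are invented for the sketch.

#include <stdio.h>
#include <stdlib.h>

/* Toy destination with a manual reference count, standing in for
 * ip_vs_dest and its atomic refcnt. */
struct dest {
	const char *name;
	int refcnt;
};

struct cache_entry {
	unsigned key;		/* destination IP in the real code */
	struct dest *dest;
	struct cache_entry *next;
};

/* Look the key up in a single bucket list; create the entry if missing,
 * or re-point it (moving the reference) if the mapping changed. */
static struct cache_entry *get_or_update(struct cache_entry **bucket,
					 unsigned key, struct dest *dest)
{
	struct cache_entry *e;

	for (e = *bucket; e; e = e->next)
		if (e->key == key)
			break;

	if (!e) {
		e = malloc(sizeof(*e));
		if (!e)
			return NULL;
		e->key = key;
		e->dest = dest;
		dest->refcnt++;		/* the entry now holds a reference */
		e->next = *bucket;
		*bucket = e;
	} else if (e->dest != dest) {
		e->dest->refcnt--;	/* drop the old server ... */
		dest->refcnt++;		/* ... and pin the new one */
		e->dest = dest;
	}
	return e;
}

int main(void)
{
	struct dest rs1 = { "rs1", 0 }, rs2 = { "rs2", 0 };
	struct cache_entry *bucket = NULL;

	get_or_update(&bucket, 42, &rs1);
	get_or_update(&bucket, 42, &rs2);	/* remap the same key */
	printf("rs1.refcnt=%d rs2.refcnt=%d\n", rs1.refcnt, rs2.refcnt);	/* 0 1 */
	return 0;
}
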
+ */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, + struct ip_vs_dest *dest) +{ + struct ip_vs_lblc_entry *en; + + en = ip_vs_lblc_get(dest->af, tbl, daddr); + if (!en) { + en = kmalloc(sizeof(*en), GFP_ATOMIC); + if (!en) { + pr_err("%s(): no memory\n", __func__); + return NULL; + } + + en->af = dest->af; + ip_vs_addr_copy(dest->af, &en->addr, daddr); + en->lastuse = jiffies; + + atomic_inc(&dest->refcnt); + en->dest = dest; + + ip_vs_lblc_hash(tbl, en); + } else if (en->dest != dest) { + atomic_dec(&en->dest->refcnt); + atomic_inc(&dest->refcnt); + en->dest = dest; + } + + return en; +} + + +/* + * Flush all the entries of the specified table. + */ +static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) +{ + struct ip_vs_lblc_entry *en, *nxt; + int i; + + for (i=0; ibucket[i], list) { + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + } + } +} + +static int sysctl_lblc_expiration(struct ip_vs_service *svc) +{ +#ifdef CONFIG_SYSCTL + struct netns_ipvs *ipvs = net_ipvs(svc->net); + return ipvs->sysctl_lblc_expiration; +#else + return DEFAULT_EXPIRATION; +#endif +} + +static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + struct ip_vs_lblc_entry *en, *nxt; + unsigned long now = jiffies; + int i, j; + + for (i=0, j=tbl->rover; isched_lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_before(now, + en->lastuse + + sysctl_lblc_expiration(svc))) + continue; + + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&svc->sched_lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblc table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. + * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. 
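
The collector described above walks only a slice of the hash table per timer tick and remembers where it stopped, so each run stays cheap. A stripped-down model of that resumable sweep (toy staleness flags instead of real entries, a fixed budget instead of the computed goal):

#include <stdio.h>
#include <stdbool.h>

#define TAB_SIZE 16
#define TAB_MASK (TAB_SIZE - 1)

static bool bucket_stale[TAB_SIZE];	/* toy stand-in for per-bucket entries */
static int rover;			/* where the previous sweep stopped */

/* Expire at most 'budget' stale buckets, resuming after the last position. */
static void partial_sweep(int budget)
{
	int j = rover;

	for (int i = 0; i < TAB_SIZE && budget > 0; i++) {
		j = (j + 1) & TAB_MASK;		/* same wrap-around as the kernel loop */
		if (bucket_stale[j]) {
			bucket_stale[j] = false;
			budget--;
			printf("expired bucket %d\n", j);
		}
	}
	rover = j;				/* the next tick continues from here */
}

int main(void)
{
	bucket_stale[3] = bucket_stale[9] = bucket_stale[12] = true;
	partial_sweep(2);	/* first tick: buckets 3 and 9 */
	partial_sweep(2);	/* next tick: bucket 12 */
	return 0;
}
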
+ */ +static void ip_vs_lblc_check_expire(unsigned long data) +{ + struct ip_vs_service *svc = (struct ip_vs_service *) data; + struct ip_vs_lblc_table *tbl = svc->sched_data; + unsigned long now = jiffies; + int goal; + int i, j; + struct ip_vs_lblc_entry *en, *nxt; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblc_full_check(svc); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) <= tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i=0, j=tbl->rover; isched_lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) + continue; + + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + goal--; + } + write_unlock(&svc->sched_lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +} + + +static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) +{ + int i; + struct ip_vs_lblc_table *tbl; + + /* + * Allocate the ip_vs_lblc_table for this service + */ + tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); + if (tbl == NULL) { + pr_err("%s(): no memory\n", __func__); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " + "current service\n", sizeof(*tbl)); + + /* + * Initialize the hash buckets + */ + for (i=0; ibucket[i]); + } + tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + + /* + * Hook periodic timer for garbage collection + */ + setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, + (unsigned long)svc); + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); + + return 0; +} + + +static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + + /* remove periodic timer */ + del_timer_sync(&tbl->periodic_timer); + + /* got to clean up table entries here */ + ip_vs_lblc_flush(tbl); + + /* release the table itself */ + kfree(tbl); + IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", + sizeof(*tbl)); + + return 0; +} + + +static inline struct ip_vs_dest * +__ip_vs_lblc_schedule(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* + * We use the following formula to estimate the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. + */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + if (atomic_read(&dest->weight) > 0) { + least = dest; + loh = ip_vs_dest_conn_overhead(least); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. 
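
The remark above about avoiding floats is the central trick in these schedulers: rather than comparing overhead/weight ratios, compare overhead1*weight2 against overhead2*weight1. A tiny stand-alone check of that equivalence, using a wide intermediate type so the products cannot overflow:

#include <stdio.h>
#include <stdint.h>

/* Nonzero when server a is more loaded than server b, i.e.
 * oh_a / w_a > oh_b / w_b, evaluated as oh_a * w_b > oh_b * w_a.
 * Both weights must be > 0 (weight 0 means "quiesced" and is skipped
 * before this comparison is reached). */
static int more_loaded(uint32_t oh_a, uint32_t w_a,
		       uint32_t oh_b, uint32_t w_b)
{
	return (uint64_t)oh_a * w_b > (uint64_t)oh_b * w_a;
}

int main(void)
{
	/* 300 overhead at weight 3 (100 per weight unit) versus
	 * 250 overhead at weight 2 (125 per weight unit). */
	printf("a more loaded than b? %s\n",
	       more_loaded(300, 3, 250, 2) ? "yes" : "no");	/* no  */
	printf("b more loaded than a? %s\n",
	       more_loaded(250, 2, 300, 3) ? "yes" : "no");	/* yes */
	return 0;
}
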
+ */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = ip_vs_dest_conn_overhead(dest); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG_BUF(6, "LBLC: server %s:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. + */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { + struct ip_vs_dest *d; + + list_for_each_entry(d, &svc->destinations, n_list) { + if (atomic_read(&d->activeconns)*2 + < atomic_read(&d->weight)) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + struct ip_vs_iphdr iph; + struct ip_vs_dest *dest = NULL; + struct ip_vs_lblc_entry *en; + + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + /* First look in our cache */ + read_lock(&svc->sched_lock); + en = ip_vs_lblc_get(svc->af, tbl, &iph.daddr); + if (en) { + /* We only hold a read lock, but this is atomic */ + en->lastuse = jiffies; + + /* + * If the destination is not available, i.e. it's in the trash, + * we must ignore it, as it may be removed from under our feet, + * if someone drops our reference count. Our caller only makes + * sure that destinations, that are not in the trash, are not + * moved to the trash, while we are scheduling. But anyone can + * free up entries from the trash at any time. + */ + + if (en->dest->flags & IP_VS_DEST_F_AVAILABLE) + dest = en->dest; + } + read_unlock(&svc->sched_lock); + + /* If the destination has a weight and is not overloaded, use it */ + if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) + goto out; + + /* No cache entry or it is invalid, time to schedule */ + dest = __ip_vs_lblc_schedule(svc); + if (!dest) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + /* If we fail to create a cache entry, we'll just use the valid dest */ + write_lock(&svc->sched_lock); + ip_vs_lblc_new(tbl, &iph.daddr, dest); + write_unlock(&svc->sched_lock); + +out: + IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n", + IP_VS_DBG_ADDR(svc->af, &iph.daddr), + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS LBLC Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_lblc_scheduler = +{ + .name = "lblc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list), + .init_service = ip_vs_lblc_init_svc, + .done_service = ip_vs_lblc_done_svc, + .schedule = ip_vs_lblc_schedule, +}; + +/* + * per netns init. 
+ */ +#ifdef CONFIG_SYSCTL +static int __net_init __ip_vs_lblc_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!net_eq(net, &init_net)) { + ipvs->lblc_ctl_table = kmemdup(vs_vars_table, + sizeof(vs_vars_table), + GFP_KERNEL); + if (ipvs->lblc_ctl_table == NULL) + return -ENOMEM; + } else + ipvs->lblc_ctl_table = vs_vars_table; + ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION; + ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration; + + ipvs->lblc_ctl_header = + register_net_sysctl_table(net, net_vs_ctl_path, + ipvs->lblc_ctl_table); + if (!ipvs->lblc_ctl_header) { + if (!net_eq(net, &init_net)) + kfree(ipvs->lblc_ctl_table); + return -ENOMEM; + } + + return 0; +} + +static void __net_exit __ip_vs_lblc_exit(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + unregister_net_sysctl_table(ipvs->lblc_ctl_header); + + if (!net_eq(net, &init_net)) + kfree(ipvs->lblc_ctl_table); +} + +#else + +static int __net_init __ip_vs_lblc_init(struct net *net) { return 0; } +static void __net_exit __ip_vs_lblc_exit(struct net *net) { } + +#endif + +static struct pernet_operations ip_vs_lblc_ops = { + .init = __ip_vs_lblc_init, + .exit = __ip_vs_lblc_exit, +}; + +static int __init ip_vs_lblc_init(void) +{ + int ret; + + ret = register_pernet_subsys(&ip_vs_lblc_ops); + if (ret) + return ret; + + ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); + if (ret) + unregister_pernet_subsys(&ip_vs_lblc_ops); + return ret; +} + +static void __exit ip_vs_lblc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); + unregister_pernet_subsys(&ip_vs_lblc_ops); +} + + +module_init(ip_vs_lblc_init); +module_exit(ip_vs_lblc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c new file mode 100644 index 00000000..90f618ab --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -0,0 +1,821 @@ +/* + * IPVS: Locality-Based Least-Connection with Replication scheduler + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Julian Anastasov : Added the missing (dest->weight>0) + * condition in the ip_vs_dest_set_max. + * + */ + +/* + * The lblc/r algorithm is as follows (pseudo code): + * + * if serverSet[dest_ip] is null then + * n, serverSet[dest_ip] <- {weighted least-conn node}; + * else + * n <- {least-conn (alive) node in serverSet[dest_ip]}; + * if (n is null) OR + * (n.conns>n.weight AND + * there is a node m with m.conns 1 AND + * now - serverSet[dest_ip].lastMod > T then + * m <- {most conn node in serverSet[dest_ip]}; + * remove m from serverSet[dest_ip]; + * if serverSet[dest_ip] changed then + * serverSet[dest_ip].lastMod <- now; + * + * return n; + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +/* for sysctl */ +#include +#include +#include + +#include + + +/* + * It is for garbage collection of stale IPVS lblcr entries, + * when the table is full. + */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (6*60*HZ) + +#define DEFAULT_EXPIRATION (24*60*60*HZ) + +/* + * It is for full expiration check. 
+ * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day. + */ +#define COUNT_FOR_FULL_EXPIRATION 30 + +/* + * for IPVS lblcr entry hash table + */ +#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS +#define CONFIG_IP_VS_LBLCR_TAB_BITS 10 +#endif +#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS +#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) +#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) + + +/* + * IPVS destination set structure and operations + */ +struct ip_vs_dest_set_elem { + struct list_head list; /* list link */ + struct ip_vs_dest *dest; /* destination server */ +}; + +struct ip_vs_dest_set { + atomic_t size; /* set size */ + unsigned long lastmod; /* last modified time */ + struct list_head list; /* destination list */ + rwlock_t lock; /* lock for this list */ +}; + + +static struct ip_vs_dest_set_elem * +ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +{ + struct ip_vs_dest_set_elem *e; + + list_for_each_entry(e, &set->list, list) { + if (e->dest == dest) + /* already existed */ + return NULL; + } + + e = kmalloc(sizeof(*e), GFP_ATOMIC); + if (e == NULL) { + pr_err("%s(): no memory\n", __func__); + return NULL; + } + + atomic_inc(&dest->refcnt); + e->dest = dest; + + list_add(&e->list, &set->list); + atomic_inc(&set->size); + + set->lastmod = jiffies; + return e; +} + +static void +ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +{ + struct ip_vs_dest_set_elem *e; + + list_for_each_entry(e, &set->list, list) { + if (e->dest == dest) { + /* HIT */ + atomic_dec(&set->size); + set->lastmod = jiffies; + atomic_dec(&e->dest->refcnt); + list_del(&e->list); + kfree(e); + break; + } + } +} + +static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) +{ + struct ip_vs_dest_set_elem *e, *ep; + + write_lock(&set->lock); + list_for_each_entry_safe(e, ep, &set->list, list) { + /* + * We don't kfree dest because it is referred either + * by its service or by the trash dest list. 
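The pseudo code in the lblc/r header comment above is the heart of this scheduler: each destination IP gets a server set that is normally served by its least-connection member and is grown with a cluster-wide weighted pick when its members are too busy. A compressed stand-alone sketch of that decision, assuming simplified types; the half-load expansion test and the timed shrink of the set are deliberately left out:

    /* Sketch of the lblc/r server-set decision from the pseudo code above.
     * Plain arrays stand in for ip_vs_dest_set; the expansion condition is
     * reduced to "least member busier than its weight". */
    #include <stdio.h>

    struct server { const char *name; int conns, weight; };

    #define POOL_SIZE 3
    #define SET_MAX   3

    static struct server pool[POOL_SIZE] = {
        { "rs1", 10, 2 }, { "rs2", 3, 2 }, { "rs3", 1, 1 },
    };

    struct sset { struct server *m[SET_MAX]; int size; };

    static struct server *weighted_least(struct server *s, int n)
    {
        struct server *best = NULL;

        for (int i = 0; i < n; i++)
            if (s[i].weight > 0 &&
                (!best || s[i].conns * best->weight < best->conns * s[i].weight))
                best = &s[i];
        return best;
    }

    static struct server *set_least(struct sset *s)
    {
        struct server *best = NULL;

        for (int i = 0; i < s->size; i++)
            if (!best || s->m[i]->conns < best->conns)
                best = s->m[i];
        return best;
    }

    static void set_add(struct sset *s, struct server *n)
    {
        for (int i = 0; i < s->size; i++)
            if (s->m[i] == n)
                return;                          /* already in the set */
        if (s->size < SET_MAX)
            s->m[s->size++] = n;
    }

    static struct server *lblcr_pick(struct sset *set)
    {
        struct server *n;

        if (set->size == 0) {                    /* first hit: seed the set */
            n = weighted_least(pool, POOL_SIZE);
            set_add(set, n);
            return n;
        }
        n = set_least(set);                      /* normally serve from the set */
        if (!n || n->conns > n->weight) {        /* set too busy: grow it */
            n = weighted_least(pool, POOL_SIZE);
            set_add(set, n);
        }
        return n;
    }

    int main(void)
    {
        struct sset set = { .size = 0 };

        printf("first pick: %s\n", lblcr_pick(&set)->name);
        printf("second pick: %s\n", lblcr_pick(&set)->name);
        return 0;
    }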
+ */ + atomic_dec(&e->dest->refcnt); + list_del(&e->list); + kfree(e); + } + write_unlock(&set->lock); +} + +/* get weighted least-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) +{ + register struct ip_vs_dest_set_elem *e; + struct ip_vs_dest *dest, *least; + int loh, doh; + + if (set == NULL) + return NULL; + + /* select the first destination server, whose weight > 0 */ + list_for_each_entry(e, &set->list, list) { + least = e->dest; + if (least->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + if ((atomic_read(&least->weight) > 0) + && (least->flags & IP_VS_DEST_F_AVAILABLE)) { + loh = ip_vs_dest_conn_overhead(least); + goto nextstage; + } + } + return NULL; + + /* find the destination with the weighted least load */ + nextstage: + list_for_each_entry(e, &set->list, list) { + dest = e->dest; + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = ip_vs_dest_conn_overhead(dest); + if ((loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) + && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG_BUF(6, "%s(): server %s:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + __func__, + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + return least; +} + + +/* get weighted most-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) +{ + register struct ip_vs_dest_set_elem *e; + struct ip_vs_dest *dest, *most; + int moh, doh; + + if (set == NULL) + return NULL; + + /* select the first destination server, whose weight > 0 */ + list_for_each_entry(e, &set->list, list) { + most = e->dest; + if (atomic_read(&most->weight) > 0) { + moh = ip_vs_dest_conn_overhead(most); + goto nextstage; + } + } + return NULL; + + /* find the destination with the weighted most load */ + nextstage: + list_for_each_entry(e, &set->list, list) { + dest = e->dest; + doh = ip_vs_dest_conn_overhead(dest); + /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ + if ((moh * atomic_read(&dest->weight) < + doh * atomic_read(&most->weight)) + && (atomic_read(&dest->weight) > 0)) { + most = dest; + moh = doh; + } + } + + IP_VS_DBG_BUF(6, "%s(): server %s:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + __func__, + IP_VS_DBG_ADDR(most->af, &most->addr), ntohs(most->port), + atomic_read(&most->activeconns), + atomic_read(&most->refcnt), + atomic_read(&most->weight), moh); + return most; +} + + +/* + * IPVS lblcr entry represents an association between destination + * IP address and its destination server set + */ +struct ip_vs_lblcr_entry { + struct list_head list; + int af; /* address family */ + union nf_inet_addr addr; /* destination IP address */ + struct ip_vs_dest_set set; /* destination server set */ + unsigned long lastuse; /* last used time */ +}; + + +/* + * IPVS lblcr hash table + */ +struct ip_vs_lblcr_table { + struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + struct timer_list periodic_timer; /* collect stale entries */ + int rover; /* rover for expire check */ + int counter; /* counter for no expire */ +}; + + +#ifdef CONFIG_SYSCTL +/* + * IPVS LBLCR sysctl table + */ + +static ctl_table vs_vars_table[] = { + { + .procname = "lblcr_expiration", + .data = NULL, + 
.maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif + +static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) +{ + list_del(&en->list); + ip_vs_dest_set_eraseall(&en->set); + kfree(en); +} + + +/* + * Returns hash value for IPVS LBLCR entry + */ +static inline unsigned +ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr) +{ + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; +} + + +/* + * Hash an entry in the ip_vs_lblcr_table. + * returns bool success. + */ +static void +ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) +{ + unsigned hash = ip_vs_lblcr_hashkey(en->af, &en->addr); + + list_add(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); +} + + +/* + * Get ip_vs_lblcr_entry associated with supplied parameters. Called under + * read lock. + */ +static inline struct ip_vs_lblcr_entry * +ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl, + const union nf_inet_addr *addr) +{ + unsigned hash = ip_vs_lblcr_hashkey(af, addr); + struct ip_vs_lblcr_entry *en; + + list_for_each_entry(en, &tbl->bucket[hash], list) + if (ip_vs_addr_equal(af, &en->addr, addr)) + return en; + + return NULL; +} + + +/* + * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination + * IP address to a server. Called under write lock. + */ +static inline struct ip_vs_lblcr_entry * +ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, + struct ip_vs_dest *dest) +{ + struct ip_vs_lblcr_entry *en; + + en = ip_vs_lblcr_get(dest->af, tbl, daddr); + if (!en) { + en = kmalloc(sizeof(*en), GFP_ATOMIC); + if (!en) { + pr_err("%s(): no memory\n", __func__); + return NULL; + } + + en->af = dest->af; + ip_vs_addr_copy(dest->af, &en->addr, daddr); + en->lastuse = jiffies; + + /* initialize its dest set */ + atomic_set(&(en->set.size), 0); + INIT_LIST_HEAD(&en->set.list); + rwlock_init(&en->set.lock); + + ip_vs_lblcr_hash(tbl, en); + } + + write_lock(&en->set.lock); + ip_vs_dest_set_insert(&en->set, dest); + write_unlock(&en->set.lock); + + return en; +} + + +/* + * Flush all the entries of the specified table. + */ +static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) +{ + int i; + struct ip_vs_lblcr_entry *en, *nxt; + + /* No locking required, only called during cleanup. */ + for (i=0; ibucket[i], list) { + ip_vs_lblcr_free(en); + } + } +} + +static int sysctl_lblcr_expiration(struct ip_vs_service *svc) +{ +#ifdef CONFIG_SYSCTL + struct netns_ipvs *ipvs = net_ipvs(svc->net); + return ipvs->sysctl_lblcr_expiration; +#else + return DEFAULT_EXPIRATION; +#endif +} + +static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + unsigned long now = jiffies; + int i, j; + struct ip_vs_lblcr_entry *en, *nxt; + + for (i=0, j=tbl->rover; isched_lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_after(en->lastuse + + sysctl_lblcr_expiration(svc), now)) + continue; + + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&svc->sched_lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblcr table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. 
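The hash key above folds the address down to 32 bits (XOR of the four IPv6 words when needed) and multiplies by 2654435761, the 32-bit golden-ratio constant of multiplicative hashing, before masking to the power-of-two table size. A tiny stand-alone sketch, with the table size as an assumed constant:

    /* Multiplicative hashing as used for the lblc/lblcr tables: golden-ratio
     * constant times the folded address, masked to a power-of-two table. */
    #include <stdio.h>
    #include <stdint.h>

    #define TAB_BITS 10
    #define TAB_SIZE (1u << TAB_BITS)
    #define TAB_MASK (TAB_SIZE - 1)

    static unsigned int hashkey(uint32_t addr_host_order)
    {
        return (addr_host_order * 2654435761u) & TAB_MASK;
    }

    int main(void)
    {
        /* 192.0.2.1 in host byte order */
        printf("bucket %u of %u\n", hashkey(0xC0000201u), TAB_SIZE);
        return 0;
    }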
+ * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. + */ +static void ip_vs_lblcr_check_expire(unsigned long data) +{ + struct ip_vs_service *svc = (struct ip_vs_service *) data; + struct ip_vs_lblcr_table *tbl = svc->sched_data; + unsigned long now = jiffies; + int goal; + int i, j; + struct ip_vs_lblcr_entry *en, *nxt; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblcr_full_check(svc); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) <= tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i=0, j=tbl->rover; isched_lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) + continue; + + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + goal--; + } + write_unlock(&svc->sched_lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +} + +static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) +{ + int i; + struct ip_vs_lblcr_table *tbl; + + /* + * Allocate the ip_vs_lblcr_table for this service + */ + tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); + if (tbl == NULL) { + pr_err("%s(): no memory\n", __func__); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " + "current service\n", sizeof(*tbl)); + + /* + * Initialize the hash buckets + */ + for (i=0; ibucket[i]); + } + tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + + /* + * Hook periodic timer for garbage collection + */ + setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire, + (unsigned long)svc); + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); + + return 0; +} + + +static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + + /* remove periodic timer */ + del_timer_sync(&tbl->periodic_timer); + + /* got to clean up table entries here */ + ip_vs_lblcr_flush(tbl); + + /* release the table itself */ + kfree(tbl); + IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", + sizeof(*tbl)); + + return 0; +} + + +static inline struct ip_vs_dest * +__ip_vs_lblcr_schedule(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* + * We use the following formula to estimate the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. + */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + if (atomic_read(&dest->weight) > 0) { + least = dest; + loh = ip_vs_dest_conn_overhead(least); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. 
+ */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = ip_vs_dest_conn_overhead(dest); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG_BUF(6, "LBLCR: server %s:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. + */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { + struct ip_vs_dest *d; + + list_for_each_entry(d, &svc->destinations, n_list) { + if (atomic_read(&d->activeconns)*2 + < atomic_read(&d->weight)) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + struct ip_vs_iphdr iph; + struct ip_vs_dest *dest = NULL; + struct ip_vs_lblcr_entry *en; + + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + /* First look in our cache */ + read_lock(&svc->sched_lock); + en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr); + if (en) { + /* We only hold a read lock, but this is atomic */ + en->lastuse = jiffies; + + /* Get the least loaded destination */ + read_lock(&en->set.lock); + dest = ip_vs_dest_set_min(&en->set); + read_unlock(&en->set.lock); + + /* More than one destination + enough time passed by, cleanup */ + if (atomic_read(&en->set.size) > 1 && + time_after(jiffies, en->set.lastmod + + sysctl_lblcr_expiration(svc))) { + struct ip_vs_dest *m; + + write_lock(&en->set.lock); + m = ip_vs_dest_set_max(&en->set); + if (m) + ip_vs_dest_set_erase(&en->set, m); + write_unlock(&en->set.lock); + } + + /* If the destination is not overloaded, use it */ + if (dest && !is_overloaded(dest, svc)) { + read_unlock(&svc->sched_lock); + goto out; + } + + /* The cache entry is invalid, time to schedule */ + dest = __ip_vs_lblcr_schedule(svc); + if (!dest) { + ip_vs_scheduler_err(svc, "no destination available"); + read_unlock(&svc->sched_lock); + return NULL; + } + + /* Update our cache entry */ + write_lock(&en->set.lock); + ip_vs_dest_set_insert(&en->set, dest); + write_unlock(&en->set.lock); + } + read_unlock(&svc->sched_lock); + + if (dest) + goto out; + + /* No cache entry, time to schedule */ + dest = __ip_vs_lblcr_schedule(svc); + if (!dest) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + + /* If we fail to create a cache entry, we'll just use the valid dest */ + write_lock(&svc->sched_lock); + ip_vs_lblcr_new(tbl, &iph.daddr, dest); + write_unlock(&svc->sched_lock); + +out: + IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n", + IP_VS_DBG_ADDR(svc->af, &iph.daddr), + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS LBLCR Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_lblcr_scheduler = +{ + .name = "lblcr", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list), + .init_service = 
ip_vs_lblcr_init_svc, + .done_service = ip_vs_lblcr_done_svc, + .schedule = ip_vs_lblcr_schedule, +}; + +/* + * per netns init. + */ +#ifdef CONFIG_SYSCTL +static int __net_init __ip_vs_lblcr_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!net_eq(net, &init_net)) { + ipvs->lblcr_ctl_table = kmemdup(vs_vars_table, + sizeof(vs_vars_table), + GFP_KERNEL); + if (ipvs->lblcr_ctl_table == NULL) + return -ENOMEM; + } else + ipvs->lblcr_ctl_table = vs_vars_table; + ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION; + ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration; + + ipvs->lblcr_ctl_header = + register_net_sysctl_table(net, net_vs_ctl_path, + ipvs->lblcr_ctl_table); + if (!ipvs->lblcr_ctl_header) { + if (!net_eq(net, &init_net)) + kfree(ipvs->lblcr_ctl_table); + return -ENOMEM; + } + + return 0; +} + +static void __net_exit __ip_vs_lblcr_exit(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + unregister_net_sysctl_table(ipvs->lblcr_ctl_header); + + if (!net_eq(net, &init_net)) + kfree(ipvs->lblcr_ctl_table); +} + +#else + +static int __net_init __ip_vs_lblcr_init(struct net *net) { return 0; } +static void __net_exit __ip_vs_lblcr_exit(struct net *net) { } + +#endif + +static struct pernet_operations ip_vs_lblcr_ops = { + .init = __ip_vs_lblcr_init, + .exit = __ip_vs_lblcr_exit, +}; + +static int __init ip_vs_lblcr_init(void) +{ + int ret; + + ret = register_pernet_subsys(&ip_vs_lblcr_ops); + if (ret) + return ret; + + ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); + if (ret) + unregister_pernet_subsys(&ip_vs_lblcr_ops); + return ret; +} + +static void __exit ip_vs_lblcr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); + unregister_pernet_subsys(&ip_vs_lblcr_ops); +} + + +module_init(ip_vs_lblcr_init); +module_exit(ip_vs_lblcr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c new file mode 100644 index 00000000..f391819c --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -0,0 +1,91 @@ +/* + * IPVS: Least-Connection Scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Wensong Zhang : added the ip_vs_lc_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include + +#include + +/* + * Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least = NULL; + unsigned int loh = 0, doh; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + /* + * Simply select the server with the least number of + * (activeconns<<5) + inactconns + * Except whose weight is equal to zero. + * If the weight is equal to zero, it means that the server is + * quiesced, the existing connections to the server still get + * served, but no new connection is assigned to the server. 
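The comment above describes the LC pick; the loop that follows reduces each real server to one integer overhead value that weights active connections far more heavily than inactive ones, then takes the smallest while skipping quiesced (weight 0) and overloaded servers. A stand-alone sketch of the same idea; the shift value follows the comment above and is illustrative only:

    /* Least-connection pick over a single integer metric.  ACTIVE_SHIFT
     * mirrors the weighting mentioned in the comment above; the kernel's
     * metric lives in ip_vs_dest_conn_overhead(). */
    #include <stdio.h>

    #define ACTIVE_SHIFT 5

    struct dest { const char *name; unsigned int active, inactive; int weight; };

    static unsigned int overhead(const struct dest *d)
    {
        return (d->active << ACTIVE_SHIFT) + d->inactive;
    }

    int main(void)
    {
        struct dest d[] = {
            { "rs1", 4, 100, 1 }, { "rs2", 2, 300, 1 }, { "rs3", 1, 10, 0 },
        };
        const struct dest *least = NULL;

        for (int i = 0; i < 3; i++) {
            if (d[i].weight == 0)               /* quiesced, skip */
                continue;
            if (!least || overhead(&d[i]) < overhead(least))
                least = &d[i];
        }
        if (least)
            printf("LC picks %s (overhead %u)\n", least->name, overhead(least));
        return 0;
    }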
+ */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || + atomic_read(&dest->weight) == 0) + continue; + doh = ip_vs_dest_conn_overhead(dest); + if (!least || doh < loh) { + least = dest; + loh = doh; + } + } + + if (!least) + ip_vs_scheduler_err(svc, "no destination available"); + else + IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d " + "inactconns %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), + ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->inactconns)); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_lc_scheduler = { + .name = "lc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list), + .schedule = ip_vs_lc_schedule, +}; + + +static int __init ip_vs_lc_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ; +} + +static void __exit ip_vs_lc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); +} + +module_init(ip_vs_lc_init); +module_exit(ip_vs_lc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c new file mode 100644 index 00000000..f454c80d --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -0,0 +1,294 @@ +/* + * ip_vs_nfct.c: Netfilter connection tracking support for IPVS + * + * Portions Copyright (C) 2001-2002 + * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland. + * + * Portions Copyright (C) 2003-2010 + * Julian Anastasov + * + * + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + * Authors: + * Ben North + * Julian Anastasov Reorganize and sync with latest kernels + * Hannes Eder Extend NFCT support for FTP, ipvs match + * + * + * Current status: + * + * - provide conntrack confirmation for new and related connections, by + * this way we can see their proper conntrack state in all hooks + * - support for all forwarding methods, not only NAT + * - FTP support (NAT), ability to support other NAT apps with expectations + * - to correctly create expectations for related NAT connections the proper + * NF conntrack support must be already installed, eg. ip_vs_ftp requires + * nf_conntrack_ftp ... 
iptables_nat for the same ports (but no iptables + * NAT rules are needed) + * - alter reply for NAT when forwarding packet in original direction: + * conntrack from client in NEW or RELATED (Passive FTP DATA) state or + * when RELATED conntrack is created from real server (Active FTP DATA) + * - if iptables_nat is not loaded the Passive FTP will not work (the + * PASV response can not be NAT-ed) but Active FTP should work + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u" +#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \ + &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \ + (T)->dst.protonum + +#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u" +#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \ + &((C)->vaddr.ip), ntohs((C)->vport), \ + &((C)->daddr.ip), ntohs((C)->dport), \ + (C)->protocol, (C)->state + +void +ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + struct nf_conntrack_tuple new_tuple; + + if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) || + nf_ct_is_dying(ct)) + return; + + /* Never alter conntrack for non-NAT conns */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return; + + /* Alter reply only in original direction */ + if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + return; + + /* + * The connection is not yet in the hashtable, so we update it. + * CIP->VIP will remain the same, so leave the tuple in + * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the + * real-server we will see RIP->DIP. + */ + new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + /* + * This will also take care of UDP and other protocols. + */ + if (outin) { + new_tuple.src.u3 = cp->daddr; + if (new_tuple.dst.protonum != IPPROTO_ICMP && + new_tuple.dst.protonum != IPPROTO_ICMPV6) + new_tuple.src.u.tcp.port = cp->dport; + } else { + new_tuple.dst.u3 = cp->vaddr; + if (new_tuple.dst.protonum != IPPROTO_ICMP && + new_tuple.dst.protonum != IPPROTO_ICMPV6) + new_tuple.dst.u.tcp.port = cp->vport; + } + IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, " + "ctinfo=%d, old reply=" FMT_TUPLE + ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n", + __func__, ct, ct->status, ctinfo, + ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple), + ARG_TUPLE(&new_tuple), ARG_CONN(cp)); + nf_conntrack_alter_reply(ct, &new_tuple); +} + +int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp) +{ + return nf_conntrack_confirm(skb); +} + +/* + * Called from init_conntrack() as expectfn handler. + */ +static void ip_vs_nfct_expect_callback(struct nf_conn *ct, + struct nf_conntrack_expect *exp) +{ + struct nf_conntrack_tuple *orig, new_reply; + struct ip_vs_conn *cp; + struct ip_vs_conn_param p; + struct net *net = nf_ct_net(ct); + + if (exp->tuple.src.l3num != PF_INET) + return; + + /* + * We assume that no NF locks are held before this callback. + * ip_vs_conn_out_get and ip_vs_conn_in_get should match their + * expectations even if they use wildcard values, now we provide the + * actual values from the newly created original conntrack direction. + * The conntrack is confirmed when packet reaches IPVS hooks. 
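For masqueraded (NAT) connections the reply tuple is rewritten once per direction in the function above: either the reply source becomes the real server's address/port, or the reply destination becomes the virtual address/port. A simplified stand-alone sketch of that branch with plain structs; the field names are stand-ins for the nf_conntrack_tuple and ip_vs_conn members, and the ICMP special case is omitted:

    /* Direction-dependent reply rewrite, simplified.  Addresses are bare
     * 32-bit values here instead of nf_inet_addr unions. */
    #include <stdio.h>
    #include <stdint.h>

    struct endpoint { uint32_t addr; uint16_t port; };
    struct tuple { struct endpoint src, dst; };
    struct conn { struct endpoint caddr, vaddr, daddr; }; /* client, virtual, real */

    static void update_reply(struct tuple *reply, const struct conn *cp, int outin)
    {
        if (outin) {
            /* reply should appear to come from the real server */
            reply->src = cp->daddr;
        } else {
            /* reply should be addressed to the virtual service */
            reply->dst = cp->vaddr;
        }
    }

    int main(void)
    {
        struct conn cp = {
            { 0x0A000001, 40000 },  /* client  */
            { 0xC0000201, 80 },     /* virtual */
            { 0x0A000102, 8080 },   /* real    */
        };
        struct tuple reply = { cp.daddr, cp.caddr };

        update_reply(&reply, &cp, 0);
        printf("reply dst now %#x:%u\n", (unsigned int)reply.dst.addr,
               (unsigned int)reply.dst.port);
        return 0;
    }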
+ */ + + /* RS->CLIENT */ + orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + ip_vs_conn_fill_param(net, exp->tuple.src.l3num, orig->dst.protonum, + &orig->src.u3, orig->src.u.tcp.port, + &orig->dst.u3, orig->dst.u.tcp.port, &p); + cp = ip_vs_conn_out_get(&p); + if (cp) { + /* Change reply CLIENT->RS to CLIENT->VS */ + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " + FMT_TUPLE ", found inout cp=" FMT_CONN "\n", + __func__, ct, ct->status, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + new_reply.dst.u3 = cp->vaddr; + new_reply.dst.u.tcp.port = cp->vport; + IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE + ", inout cp=" FMT_CONN "\n", + __func__, ct, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + goto alter; + } + + /* CLIENT->VS */ + cp = ip_vs_conn_in_get(&p); + if (cp) { + /* Change reply VS->CLIENT to RS->CLIENT */ + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " + FMT_TUPLE ", found outin cp=" FMT_CONN "\n", + __func__, ct, ct->status, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + new_reply.src.u3 = cp->daddr; + new_reply.src.u.tcp.port = cp->dport; + IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " + FMT_TUPLE ", outin cp=" FMT_CONN "\n", + __func__, ct, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + goto alter; + } + + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE + " - unknown expect\n", + __func__, ct, ct->status, ARG_TUPLE(orig)); + return; + +alter: + /* Never alter conntrack for non-NAT conns */ + if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ) + nf_conntrack_alter_reply(ct, &new_reply); + ip_vs_conn_put(cp); + return; +} + +/* + * Create NF conntrack expectation with wildcard (optional) source port. + * Then the default callback function will alter the reply and will confirm + * the conntrack entry when the first packet comes. + * Use port 0 to expect connection from any port. + */ +void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct, + struct ip_vs_conn *cp, u_int8_t proto, + const __be16 port, int from_rs) +{ + struct nf_conntrack_expect *exp; + + if (ct == NULL || nf_ct_is_untracked(ct)) + return; + + exp = nf_ct_expect_alloc(ct); + if (!exp) + return; + + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + from_rs ? &cp->daddr : &cp->caddr, + from_rs ? &cp->caddr : &cp->vaddr, + proto, port ? &port : NULL, + from_rs ? 
&cp->cport : &cp->vport); + + exp->expectfn = ip_vs_nfct_expect_callback; + + IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&exp->tuple)); + nf_ct_expect_related(exp); + nf_ct_expect_put(exp); +} +EXPORT_SYMBOL(ip_vs_nfct_expect_related); + +/* + * Our connection was terminated, try to drop the conntrack immediately + */ +void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + struct nf_conntrack_tuple tuple; + + if (!cp->cport) + return; + + tuple = (struct nf_conntrack_tuple) { + .dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } }; + tuple.src.u3 = cp->caddr; + tuple.src.u.all = cp->cport; + tuple.src.l3num = cp->af; + tuple.dst.u3 = cp->vaddr; + tuple.dst.u.all = cp->vport; + + IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE + " for conn " FMT_CONN "\n", + __func__, ARG_TUPLE(&tuple), ARG_CONN(cp)); + + h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE, + &tuple); + if (h) { + ct = nf_ct_tuplehash_to_ctrack(h); + /* Show what happens instead of calling nf_ct_kill() */ + if (del_timer(&ct->timeout)) { + IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&tuple)); + if (ct->timeout.function) + ct->timeout.function(ct->timeout.data); + } else { + IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&tuple)); + } + nf_ct_put(ct); + } else { + IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n", + __func__, ARG_TUPLE(&tuple)); + } +} + diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c new file mode 100644 index 00000000..984d9c13 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -0,0 +1,140 @@ +/* + * IPVS: Never Queue scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The NQ algorithm adopts a two-speed model. When there is an idle server + * available, the job will be sent to the idle server, instead of waiting + * for a fast one. When there is no idle server available, the job will be + * sent to the server that minimize its expected delay (The Shortest + * Expected Delay scheduling algorithm). + * + * See the following paper for more information: + * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing + * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, + * pages 986-994, 1988. + * + * Thanks must go to Marko Buuri for talking NQ to me. + * + * The difference between NQ and SED is that NQ can improve overall + * system utilization. + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include + +#include + + +static inline unsigned int +ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We only use the active connection number in the cost + * calculation here. 
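NQ, as described above, is SED with one shortcut: a completely idle server is taken immediately, otherwise the server minimizing (activeconns + 1) / weight wins, with the division again replaced by cross-multiplication. A small stand-alone sketch of the decision the scheduler below makes, using simplified types:

    /* Never Queue decision: idle server wins outright, otherwise fall back
     * to the Shortest Expected Delay comparison.  Simplified types. */
    #include <stdio.h>

    struct dest { const char *name; int active; int weight; };

    static const struct dest *nq_pick(const struct dest *d, int n)
    {
        const struct dest *least = NULL;
        long loh = 0, doh;

        for (int i = 0; i < n; i++) {
            if (d[i].weight == 0)
                continue;
            if (d[i].active == 0)
                return &d[i];               /* idle server: never queue */
            doh = d[i].active + 1;          /* expected-delay numerator */
            if (!least || doh * least->weight < loh * d[i].weight) {
                least = &d[i];
                loh = doh;
            }
        }
        return least;
    }

    int main(void)
    {
        const struct dest d[] = { { "rs1", 3, 1 }, { "rs2", 5, 4 }, { "rs3", 2, 1 } };

        printf("NQ picks %s\n", nq_pick(d, 3)->name);
        return 0;
    }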
+ */ + return atomic_read(&dest->activeconns) + 1; +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least = NULL; + unsigned int loh = 0, doh; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + /* + * We calculate the load of each dest server as follows: + * (server expected overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + + if (dest->flags & IP_VS_DEST_F_OVERLOAD || + !atomic_read(&dest->weight)) + continue; + + doh = ip_vs_nq_dest_overhead(dest); + + /* return the server directly if it is idle */ + if (atomic_read(&dest->activeconns) == 0) { + least = dest; + loh = doh; + goto out; + } + + if (!least || + (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight))) { + least = dest; + loh = doh; + } + } + + if (!least) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + out: + IP_VS_DBG_BUF(6, "NQ: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_nq_scheduler = +{ + .name = "nq", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list), + .schedule = ip_vs_nq_schedule, +}; + + +static int __init ip_vs_nq_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_nq_scheduler); +} + +static void __exit ip_vs_nq_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); +} + +module_init(ip_vs_nq_init); +module_exit(ip_vs_nq_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c new file mode 100644 index 00000000..5cf859cc --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -0,0 +1,140 @@ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include + +#include + +/* IPVS pe list */ +static LIST_HEAD(ip_vs_pe); + +/* lock for service table */ +static DEFINE_SPINLOCK(ip_vs_pe_lock); + +/* Bind a service with a pe */ +void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe) +{ + svc->pe = pe; +} + +/* Unbind a service from its pe */ +void ip_vs_unbind_pe(struct ip_vs_service *svc) +{ + svc->pe = NULL; +} + +/* Get pe in the pe list by name */ +struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name) +{ + struct ip_vs_pe *pe; + + IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__, + pe_name); + + spin_lock_bh(&ip_vs_pe_lock); + + list_for_each_entry(pe, &ip_vs_pe, n_list) { + /* Test and get the modules atomically */ + if (pe->module && + !try_module_get(pe->module)) { + /* This pe is just deleted */ + continue; + } + if (strcmp(pe_name, pe->name)==0) { + /* HIT */ + spin_unlock_bh(&ip_vs_pe_lock); + return pe; + } + if (pe->module) + module_put(pe->module); + } + + spin_unlock_bh(&ip_vs_pe_lock); + return NULL; +} + +/* Lookup pe and try to load it if it doesn't exist */ +struct ip_vs_pe *ip_vs_pe_getbyname(const char *name) +{ + struct ip_vs_pe *pe; + + /* Search for the pe 
by name */ + pe = __ip_vs_pe_getbyname(name); + + /* If pe not found, load the module and search again */ + if (!pe) { + request_module("ip_vs_pe_%s", name); + pe = __ip_vs_pe_getbyname(name); + } + + return pe; +} + +/* Register a pe in the pe list */ +int register_ip_vs_pe(struct ip_vs_pe *pe) +{ + struct ip_vs_pe *tmp; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + spin_lock_bh(&ip_vs_pe_lock); + + if (!list_empty(&pe->n_list)) { + spin_unlock_bh(&ip_vs_pe_lock); + ip_vs_use_count_dec(); + pr_err("%s(): [%s] pe already linked\n", + __func__, pe->name); + return -EINVAL; + } + + /* Make sure that the pe with this name doesn't exist + * in the pe list. + */ + list_for_each_entry(tmp, &ip_vs_pe, n_list) { + if (strcmp(tmp->name, pe->name) == 0) { + spin_unlock_bh(&ip_vs_pe_lock); + ip_vs_use_count_dec(); + pr_err("%s(): [%s] pe already existed " + "in the system\n", __func__, pe->name); + return -EINVAL; + } + } + /* Add it into the d-linked pe list */ + list_add(&pe->n_list, &ip_vs_pe); + spin_unlock_bh(&ip_vs_pe_lock); + + pr_info("[%s] pe registered.\n", pe->name); + + return 0; +} +EXPORT_SYMBOL_GPL(register_ip_vs_pe); + +/* Unregister a pe from the pe list */ +int unregister_ip_vs_pe(struct ip_vs_pe *pe) +{ + spin_lock_bh(&ip_vs_pe_lock); + if (list_empty(&pe->n_list)) { + spin_unlock_bh(&ip_vs_pe_lock); + pr_err("%s(): [%s] pe is not in the list. failed\n", + __func__, pe->name); + return -EINVAL; + } + + /* Remove it from the d-linked pe list */ + list_del(&pe->n_list); + spin_unlock_bh(&ip_vs_pe_lock); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + pr_info("[%s] pe unregistered.\n", pe->name); + + return 0; +} +EXPORT_SYMBOL_GPL(unregister_ip_vs_pe); diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c new file mode 100644 index 00000000..13d607ae --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -0,0 +1,171 @@ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include + +#include +#include +#include + +#ifdef CONFIG_IP_VS_DEBUG +static const char *ip_vs_dbg_callid(char *buf, size_t buf_len, + const char *callid, size_t callid_len, + int *idx) +{ + size_t len = min(min(callid_len, (size_t)64), buf_len - *idx - 1); + memcpy(buf + *idx, callid, len); + buf[*idx+len] = '\0'; + *idx += len + 1; + return buf + *idx - len; +} + +#define IP_VS_DEBUG_CALLID(callid, len) \ + ip_vs_dbg_callid(ip_vs_dbg_buf, sizeof(ip_vs_dbg_buf), \ + callid, len, &ip_vs_dbg_idx) +#endif + +static int get_callid(const char *dptr, unsigned int dataoff, + unsigned int datalen, + unsigned int *matchoff, unsigned int *matchlen) +{ + /* Find callid */ + while (1) { + int ret = ct_sip_get_header(NULL, dptr, dataoff, datalen, + SIP_HDR_CALL_ID, matchoff, + matchlen); + if (ret > 0) + break; + if (!ret) + return 0; + dataoff += *matchoff; + } + + /* Empty callid is useless */ + if (!*matchlen) + return -EINVAL; + + /* Too large is useless */ + if (*matchlen > IP_VS_PEDATA_MAXLEN) + return -EINVAL; + + /* SIP headers are always followed by a line terminator */ + if (*matchoff + *matchlen == datalen) + return -EINVAL; + + /* RFC 2543 allows lines to be terminated with CR, LF or CRLF, + * RFC 3261 allows only CRLF, we support both. 
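get_callid() above, together with the terminator test that follows, accepts a Call-ID only if it is non-empty, not oversized, and followed by CR or LF. A naive user-space sketch of the same checks over a raw SIP payload; the strstr-based header search and the length bound are stand-ins for ct_sip_get_header() and IP_VS_PEDATA_MAXLEN:

    /* Call-ID validation: non-empty, bounded, followed by CR or LF.
     * Naive header search on a NUL-terminated buffer, for illustration. */
    #include <stdio.h>
    #include <string.h>

    #define CALLID_MAXLEN 255                   /* illustrative bound */

    static int get_callid(const char *msg, size_t len, size_t *off, size_t *mlen)
    {
        const char *h = strstr(msg, "Call-ID: ");
        const char *start, *end;

        if (!h)
            return -1;
        start = h + strlen("Call-ID: ");
        end = start;
        while ((size_t)(end - msg) < len && *end != '\r' && *end != '\n')
            end++;
        if (end == start)                          /* empty callid is useless */
            return -1;
        if ((size_t)(end - start) > CALLID_MAXLEN) /* too large is useless */
            return -1;
        if ((size_t)(end - msg) == len)            /* must be followed by CR/LF */
            return -1;
        *off = (size_t)(start - msg);
        *mlen = (size_t)(end - start);
        return 0;
    }

    int main(void)
    {
        const char msg[] = "INVITE sip:bob SIP/2.0\r\nCall-ID: abc123@host\r\n\r\n";
        size_t off, mlen;

        if (!get_callid(msg, sizeof(msg) - 1, &off, &mlen))
            printf("callid: %.*s\n", (int)mlen, msg + off);
        return 0;
    }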
*/ + if (*(dptr + *matchoff + *matchlen) != '\r' && + *(dptr + *matchoff + *matchlen) != '\n') + return -EINVAL; + + IP_VS_DBG_BUF(9, "SIP callid %s (%d bytes)\n", + IP_VS_DEBUG_CALLID(dptr + *matchoff, *matchlen), + *matchlen); + return 0; +} + +static int +ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb) +{ + struct ip_vs_iphdr iph; + unsigned int dataoff, datalen, matchoff, matchlen; + const char *dptr; + int retc; + + ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph); + + /* Only useful with UDP */ + if (iph.protocol != IPPROTO_UDP) + return -EINVAL; + + /* No Data ? */ + dataoff = iph.len + sizeof(struct udphdr); + if (dataoff >= skb->len) + return -EINVAL; + + if ((retc=skb_linearize(skb)) < 0) + return retc; + dptr = skb->data + dataoff; + datalen = skb->len - dataoff; + + if (get_callid(dptr, dataoff, datalen, &matchoff, &matchlen)) + return -EINVAL; + + /* N.B: pe_data is only set on success, + * this allows fallback to the default persistence logic on failure + */ + p->pe_data = kmemdup(dptr + matchoff, matchlen, GFP_ATOMIC); + if (!p->pe_data) + return -ENOMEM; + + p->pe_data_len = matchlen; + + return 0; +} + +static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p, + struct ip_vs_conn *ct) + +{ + bool ret = 0; + + if (ct->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, &ct->caddr) && + /* protocol should only be IPPROTO_IP if + * d_addr is a fwmark */ + ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, + p->vaddr, &ct->vaddr) && + ct->vport == p->vport && + ct->flags & IP_VS_CONN_F_TEMPLATE && + ct->protocol == p->protocol && + ct->pe_data && ct->pe_data_len == p->pe_data_len && + !memcmp(ct->pe_data, p->pe_data, p->pe_data_len)) + ret = 1; + + IP_VS_DBG_BUF(9, "SIP template match %s %s->%s:%d %s\n", + ip_vs_proto_name(p->protocol), + IP_VS_DEBUG_CALLID(p->pe_data, p->pe_data_len), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), + ret ? "hit" : "not hit"); + + return ret; +} + +static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p, + u32 initval, bool inverse) +{ + return jhash(p->pe_data, p->pe_data_len, initval); +} + +static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf) +{ + memcpy(buf, cp->pe_data, cp->pe_data_len); + return cp->pe_data_len; +} + +static struct ip_vs_pe ip_vs_sip_pe = +{ + .name = "sip", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_sip_pe.n_list), + .fill_param = ip_vs_sip_fill_param, + .ct_match = ip_vs_sip_ct_match, + .hashkey_raw = ip_vs_sip_hashkey_raw, + .show_pe_data = ip_vs_sip_show_pe_data, +}; + +static int __init ip_vs_sip_init(void) +{ + return register_ip_vs_pe(&ip_vs_sip_pe); +} + +static void __exit ip_vs_sip_cleanup(void) +{ + unregister_ip_vs_pe(&ip_vs_sip_pe); +} + +module_init(ip_vs_sip_init); +module_exit(ip_vs_sip_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c new file mode 100644 index 00000000..eb860285 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -0,0 +1,395 @@ +/* + * ip_vs_proto.c: transport protocol load balancing support for IPVS + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Changes: + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +/* + * IPVS protocols can only be registered/unregistered when the ipvs + * module is loaded/unloaded, so no lock is needed in accessing the + * ipvs protocol table. + */ + +#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ +#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1)) + +static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; + + +/* + * register an ipvs protocol + */ +static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) +{ + unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + + pp->next = ip_vs_proto_table[hash]; + ip_vs_proto_table[hash] = pp; + + if (pp->init != NULL) + pp->init(pp); + + return 0; +} + +#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) || \ + defined(CONFIG_IP_VS_PROTO_SCTP) || defined(CONFIG_IP_VS_PROTO_AH) || \ + defined(CONFIG_IP_VS_PROTO_ESP) +/* + * register an ipvs protocols netns related data + */ +static int +register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + struct ip_vs_proto_data *pd = + kzalloc(sizeof(struct ip_vs_proto_data), GFP_ATOMIC); + + if (!pd) { + pr_err("%s(): no memory.\n", __func__); + return -ENOMEM; + } + pd->pp = pp; /* For speed issues */ + pd->next = ipvs->proto_data_table[hash]; + ipvs->proto_data_table[hash] = pd; + atomic_set(&pd->appcnt, 0); /* Init app counter */ + + if (pp->init_netns != NULL) + pp->init_netns(net, pd); + + return 0; +} +#endif + +/* + * unregister an ipvs protocol + */ +static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) +{ + struct ip_vs_protocol **pp_p; + unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + + pp_p = &ip_vs_proto_table[hash]; + for (; *pp_p; pp_p = &(*pp_p)->next) { + if (*pp_p == pp) { + *pp_p = pp->next; + if (pp->exit != NULL) + pp->exit(pp); + return 0; + } + } + + return -ESRCH; +} + +/* + * unregister an ipvs protocols netns data + */ +static int +unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data **pd_p; + unsigned hash = IP_VS_PROTO_HASH(pd->pp->protocol); + + pd_p = &ipvs->proto_data_table[hash]; + for (; *pd_p; pd_p = &(*pd_p)->next) { + if (*pd_p == pd) { + *pd_p = pd->next; + if (pd->pp->exit_netns != NULL) + pd->pp->exit_netns(net, pd); + kfree(pd); + return 0; + } + } + + return -ESRCH; +} + +/* + * get ip_vs_protocol object by its proto. 
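The protocol table above is a small chained hash table keyed by the IP protocol number masked to a power-of-two size; registration is a head insert and the lookup below walks one chain. A stand-alone sketch of the same structure with simplified entries:

    /* Chained protocol table: hash = proto & (size - 1), head insert,
     * chain walk on lookup. */
    #include <stdio.h>

    #define PROTO_TAB_SIZE 32                   /* must be a power of 2 */
    #define PROTO_HASH(p)  ((p) & (PROTO_TAB_SIZE - 1))

    struct proto {
        const char *name;
        unsigned short protocol;
        struct proto *next;
    };

    static struct proto *proto_table[PROTO_TAB_SIZE];

    static void register_proto(struct proto *pp)
    {
        unsigned int hash = PROTO_HASH(pp->protocol);

        pp->next = proto_table[hash];
        proto_table[hash] = pp;
    }

    static struct proto *proto_get(unsigned short protocol)
    {
        for (struct proto *pp = proto_table[PROTO_HASH(protocol)]; pp; pp = pp->next)
            if (pp->protocol == protocol)
                return pp;
        return NULL;
    }

    int main(void)
    {
        static struct proto tcp = { "TCP", 6 }, udp = { "UDP", 17 };
        struct proto *pp;

        register_proto(&tcp);
        register_proto(&udp);
        pp = proto_get(17);
        printf("proto 17 -> %s\n", pp ? pp->name : "unknown");
        return 0;
    }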
+ */ +struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) +{ + struct ip_vs_protocol *pp; + unsigned hash = IP_VS_PROTO_HASH(proto); + + for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { + if (pp->protocol == proto) + return pp; + } + + return NULL; +} +EXPORT_SYMBOL(ip_vs_proto_get); + +/* + * get ip_vs_protocol object data by netns and proto + */ +struct ip_vs_proto_data * +__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) +{ + struct ip_vs_proto_data *pd; + unsigned hash = IP_VS_PROTO_HASH(proto); + + for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) { + if (pd->pp->protocol == proto) + return pd; + } + + return NULL; +} + +struct ip_vs_proto_data * +ip_vs_proto_data_get(struct net *net, unsigned short proto) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + return __ipvs_proto_data_get(ipvs, proto); +} +EXPORT_SYMBOL(ip_vs_proto_data_get); + +/* + * Propagate event for state change to all protocols + */ +void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags) +{ + struct ip_vs_proto_data *pd; + int i; + + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + for (pd = ipvs->proto_data_table[i]; pd; pd = pd->next) { + if (pd->pp->timeout_change) + pd->pp->timeout_change(pd, flags); + } + } +} + + +int * +ip_vs_create_timeout_table(int *table, int size) +{ + return kmemdup(table, size, GFP_ATOMIC); +} + + +/* + * Set timeout value for state specified by name + */ +int +ip_vs_set_state_timeout(int *table, int num, const char *const *names, + const char *name, int to) +{ + int i; + + if (!table || !name || !to) + return -EINVAL; + + for (i = 0; i < num; i++) { + if (strcmp(names[i], name)) + continue; + table[i] = to * HZ; + return 0; + } + return -ENOENT; +} + + +const char * ip_vs_state_name(__u16 proto, int state) +{ + struct ip_vs_protocol *pp = ip_vs_proto_get(proto); + + if (pp == NULL || pp->state_name == NULL) + return (IPPROTO_IP == proto) ? 
"NONE" : "ERR!"; + return pp->state_name(state); +} + + +static void +ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ + char buf[128]; + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "TRUNCATED"); + else if (ih->frag_off & htons(IP_OFFSET)) + sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr); + else { + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, offset + ih->ihl*4, + sizeof(_ports), _ports); + if (pptr == NULL) + sprintf(buf, "TRUNCATED %pI4->%pI4", + &ih->saddr, &ih->daddr); + else + sprintf(buf, "%pI4:%u->%pI4:%u", + &ih->saddr, ntohs(pptr[0]), + &ih->daddr, ntohs(pptr[1])); + } + + pr_debug("%s: %s %s\n", msg, pp->name, buf); +} + +#ifdef CONFIG_IP_VS_IPV6 +static void +ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ + char buf[192]; + struct ipv6hdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "TRUNCATED"); + else if (ih->nexthdr == IPPROTO_FRAGMENT) + sprintf(buf, "%pI6->%pI6 frag", &ih->saddr, &ih->daddr); + else { + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr), + sizeof(_ports), _ports); + if (pptr == NULL) + sprintf(buf, "TRUNCATED %pI6->%pI6", + &ih->saddr, &ih->daddr); + else + sprintf(buf, "%pI6:%u->%pI6:%u", + &ih->saddr, ntohs(pptr[0]), + &ih->daddr, ntohs(pptr[1])); + } + + pr_debug("%s: %s %s\n", msg, pp->name, buf); +} +#endif + + +void +ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); + else +#endif + ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg); +} + +/* + * per network name-space init + */ +int __net_init __ip_vs_protocol_init(struct net *net) +{ +#ifdef CONFIG_IP_VS_PROTO_TCP + register_ip_vs_proto_netns(net, &ip_vs_protocol_tcp); +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + register_ip_vs_proto_netns(net, &ip_vs_protocol_udp); +#endif +#ifdef CONFIG_IP_VS_PROTO_SCTP + register_ip_vs_proto_netns(net, &ip_vs_protocol_sctp); +#endif +#ifdef CONFIG_IP_VS_PROTO_AH + register_ip_vs_proto_netns(net, &ip_vs_protocol_ah); +#endif +#ifdef CONFIG_IP_VS_PROTO_ESP + register_ip_vs_proto_netns(net, &ip_vs_protocol_esp); +#endif + return 0; +} + +void __net_exit __ip_vs_protocol_cleanup(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd; + int i; + + /* unregister all the ipvs proto data for this netns */ + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + while ((pd = ipvs->proto_data_table[i]) != NULL) + unregister_ip_vs_proto_netns(net, pd); + } +} + +int __init ip_vs_protocol_init(void) +{ + char protocols[64]; +#define REGISTER_PROTOCOL(p) \ + do { \ + register_ip_vs_protocol(p); \ + strcat(protocols, ", "); \ + strcat(protocols, (p)->name); \ + } while (0) + + protocols[0] = '\0'; + protocols[2] = '\0'; +#ifdef CONFIG_IP_VS_PROTO_TCP + REGISTER_PROTOCOL(&ip_vs_protocol_tcp); +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + REGISTER_PROTOCOL(&ip_vs_protocol_udp); +#endif +#ifdef CONFIG_IP_VS_PROTO_SCTP + REGISTER_PROTOCOL(&ip_vs_protocol_sctp); +#endif +#ifdef CONFIG_IP_VS_PROTO_AH + REGISTER_PROTOCOL(&ip_vs_protocol_ah); +#endif +#ifdef CONFIG_IP_VS_PROTO_ESP + REGISTER_PROTOCOL(&ip_vs_protocol_esp); +#endif + 
pr_info("Registered protocols (%s)\n", &protocols[2]); + + return 0; +} + + +void ip_vs_protocol_cleanup(void) +{ + struct ip_vs_protocol *pp; + int i; + + /* unregister all the ipvs protocols */ + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + while ((pp = ip_vs_proto_table[i]) != NULL) + unregister_ip_vs_protocol(pp); + } +} diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c new file mode 100644 index 00000000..5b8eb8b1 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -0,0 +1,166 @@ +/* + * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS + * + * Authors: Julian Anastasov , February 2002 + * Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation; + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include + +#include + + +/* TODO: + +struct isakmp_hdr { + __u8 icookie[8]; + __u8 rcookie[8]; + __u8 np; + __u8 version; + __u8 xchgtype; + __u8 flags; + __u32 msgid; + __u32 length; +}; + +*/ + +#define PORT_ISAKMP 500 + +static void +ah_esp_conn_fill_param_proto(struct net *net, int af, + const struct ip_vs_iphdr *iph, int inverse, + struct ip_vs_conn_param *p) +{ + if (likely(!inverse)) + ip_vs_conn_fill_param(net, af, IPPROTO_UDP, + &iph->saddr, htons(PORT_ISAKMP), + &iph->daddr, htons(PORT_ISAKMP), p); + else + ip_vs_conn_fill_param(net, af, IPPROTO_UDP, + &iph->daddr, htons(PORT_ISAKMP), + &iph->saddr, htons(PORT_ISAKMP), p); +} + +static struct ip_vs_conn * +ah_esp_conn_in_get(int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) +{ + struct ip_vs_conn *cp; + struct ip_vs_conn_param p; + struct net *net = skb_net(skb); + + ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p); + cp = ip_vs_conn_in_get(&p); + if (!cp) { + /* + * We are not sure if the packet is from our + * service, so our conn_schedule hook should return NF_ACCEPT + */ + IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet " + "%s%s %s->%s\n", + inverse ? "ICMP+" : "", + ip_vs_proto_get(iph->protocol)->name, + IP_VS_DBG_ADDR(af, &iph->saddr), + IP_VS_DBG_ADDR(af, &iph->daddr)); + } + + return cp; +} + + +static struct ip_vs_conn * +ah_esp_conn_out_get(int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + unsigned int proto_off, + int inverse) +{ + struct ip_vs_conn *cp; + struct ip_vs_conn_param p; + struct net *net = skb_net(skb); + + ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p); + cp = ip_vs_conn_out_get(&p); + if (!cp) { + IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " + "%s%s %s->%s\n", + inverse ? "ICMP+" : "", + ip_vs_proto_get(iph->protocol)->name, + IP_VS_DBG_ADDR(af, &iph->saddr), + IP_VS_DBG_ADDR(af, &iph->daddr)); + } + + return cp; +} + + +static int +ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp) +{ + /* + * AH/ESP is only related traffic. Pass the packet to IP stack. 
+ */ + *verdict = NF_ACCEPT; + return 0; +} + +#ifdef CONFIG_IP_VS_PROTO_AH +struct ip_vs_protocol ip_vs_protocol_ah = { + .name = "AH", + .protocol = IPPROTO_AH, + .num_states = 1, + .dont_defrag = 1, + .init = NULL, + .exit = NULL, + .conn_schedule = ah_esp_conn_schedule, + .conn_in_get = ah_esp_conn_in_get, + .conn_out_get = ah_esp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = NULL, /* ISAKMP */ +}; +#endif + +#ifdef CONFIG_IP_VS_PROTO_ESP +struct ip_vs_protocol ip_vs_protocol_esp = { + .name = "ESP", + .protocol = IPPROTO_ESP, + .num_states = 1, + .dont_defrag = 1, + .init = NULL, + .exit = NULL, + .conn_schedule = ah_esp_conn_schedule, + .conn_in_get = ah_esp_conn_in_get, + .conn_out_get = ah_esp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = NULL, /* ISAKMP */ +}; +#endif diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c new file mode 100644 index 00000000..d12ed53e --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -0,0 +1,1136 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int +sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp) +{ + struct net *net; + struct ip_vs_service *svc; + sctp_chunkhdr_t _schunkh, *sch; + sctp_sctphdr_t *sh, _sctph; + struct ip_vs_iphdr iph; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + + sh = skb_header_pointer(skb, iph.len, sizeof(_sctph), &_sctph); + if (sh == NULL) + return 0; + + sch = skb_header_pointer(skb, iph.len + sizeof(sctp_sctphdr_t), + sizeof(_schunkh), &_schunkh); + if (sch == NULL) + return 0; + net = skb_net(skb); + if ((sch->type == SCTP_CID_INIT) && + (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol, + &iph.daddr, sh->dest))) { + int ignored; + + if (ip_vs_todrop(net_ipvs(net))) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + ip_vs_service_put(svc); + *verdict = NF_DROP; + return 0; + } + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. 
+ */ + *cpp = ip_vs_schedule(svc, skb, pd, &ignored); + if (!*cpp && ignored <= 0) { + if (!ignored) + *verdict = ip_vs_leave(svc, skb, pd); + else { + ip_vs_service_put(svc); + *verdict = NF_DROP; + } + return 0; + } + ip_vs_service_put(svc); + } + /* NF_ACCEPT */ + return 1; +} + +static int +sctp_snat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + sctp_sctphdr_t *sctph; + unsigned int sctphoff; + struct sk_buff *iter; + __be32 crc32; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + sctphoff = sizeof(struct ipv6hdr); + else +#endif + sctphoff = ip_hdrlen(skb); + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, sctphoff + sizeof(*sctph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* Call application helper if needed */ + if (!ip_vs_app_pkt_out(cp, skb)) + return 0; + } + + sctph = (void *) skb_network_header(skb) + sctphoff; + sctph->source = cp->vport; + + /* Calculate the checksum */ + crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff); + skb_walk_frags(skb, iter) + crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter), + crc32); + crc32 = sctp_end_cksum(crc32); + sctph->checksum = crc32; + + return 1; +} + +static int +sctp_dnat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + sctp_sctphdr_t *sctph; + unsigned int sctphoff; + struct sk_buff *iter; + __be32 crc32; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + sctphoff = sizeof(struct ipv6hdr); + else +#endif + sctphoff = ip_hdrlen(skb); + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, sctphoff + sizeof(*sctph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* Call application helper if needed */ + if (!ip_vs_app_pkt_in(cp, skb)) + return 0; + } + + sctph = (void *) skb_network_header(skb) + sctphoff; + sctph->dest = cp->dport; + + /* Calculate the checksum */ + crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff); + skb_walk_frags(skb, iter) + crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter), + crc32); + crc32 = sctp_end_cksum(crc32); + sctph->checksum = crc32; + + return 1; +} + +static int +sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + unsigned int sctphoff; + struct sctphdr *sh, _sctph; + struct sk_buff *iter; + __le32 cmp; + __le32 val; + __u32 tmp; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + sctphoff = sizeof(struct ipv6hdr); + else +#endif + sctphoff = ip_hdrlen(skb); + + sh = skb_header_pointer(skb, sctphoff, sizeof(_sctph), &_sctph); + if (sh == NULL) + return 0; + + cmp = sh->checksum; + + tmp = sctp_start_cksum((__u8 *) sh, skb_headlen(skb)); + skb_walk_frags(skb, iter) + tmp = sctp_update_cksum((__u8 *) iter->data, + skb_headlen(iter), tmp); + + val = sctp_end_cksum(tmp); + + if (val != cmp) { + /* CRC failure, dump it. 
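+ * The CRC32c above was computed over the linear data plus every
+ * paged fragment (skb_walk_frags), mirroring the snat/dnat handlers.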
*/ + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, + "Failed checksum for"); + return 0; + } + return 1; +} + +struct ipvs_sctp_nextstate { + int next_state; +}; +enum ipvs_sctp_event_t { + IP_VS_SCTP_EVE_DATA_CLI, + IP_VS_SCTP_EVE_DATA_SER, + IP_VS_SCTP_EVE_INIT_CLI, + IP_VS_SCTP_EVE_INIT_SER, + IP_VS_SCTP_EVE_INIT_ACK_CLI, + IP_VS_SCTP_EVE_INIT_ACK_SER, + IP_VS_SCTP_EVE_COOKIE_ECHO_CLI, + IP_VS_SCTP_EVE_COOKIE_ECHO_SER, + IP_VS_SCTP_EVE_COOKIE_ACK_CLI, + IP_VS_SCTP_EVE_COOKIE_ACK_SER, + IP_VS_SCTP_EVE_ABORT_CLI, + IP_VS_SCTP_EVE__ABORT_SER, + IP_VS_SCTP_EVE_SHUT_CLI, + IP_VS_SCTP_EVE_SHUT_SER, + IP_VS_SCTP_EVE_SHUT_ACK_CLI, + IP_VS_SCTP_EVE_SHUT_ACK_SER, + IP_VS_SCTP_EVE_SHUT_COM_CLI, + IP_VS_SCTP_EVE_SHUT_COM_SER, + IP_VS_SCTP_EVE_LAST +}; + +static enum ipvs_sctp_event_t sctp_events[255] = { + IP_VS_SCTP_EVE_DATA_CLI, + IP_VS_SCTP_EVE_INIT_CLI, + IP_VS_SCTP_EVE_INIT_ACK_CLI, + IP_VS_SCTP_EVE_DATA_CLI, + IP_VS_SCTP_EVE_DATA_CLI, + IP_VS_SCTP_EVE_DATA_CLI, + IP_VS_SCTP_EVE_ABORT_CLI, + IP_VS_SCTP_EVE_SHUT_CLI, + IP_VS_SCTP_EVE_SHUT_ACK_CLI, + IP_VS_SCTP_EVE_DATA_CLI, + IP_VS_SCTP_EVE_COOKIE_ECHO_CLI, + IP_VS_SCTP_EVE_COOKIE_ACK_CLI, + IP_VS_SCTP_EVE_DATA_CLI, + IP_VS_SCTP_EVE_DATA_CLI, + IP_VS_SCTP_EVE_SHUT_COM_CLI, +}; + +static struct ipvs_sctp_nextstate + sctp_states_table[IP_VS_SCTP_S_LAST][IP_VS_SCTP_EVE_LAST] = { + /* + * STATE : IP_VS_SCTP_S_NONE + */ + /*next state *//*event */ + {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }, + }, + /* + * STATE : IP_VS_SCTP_S_INIT_CLI + * Cient sent INIT and is waiting for reply from server(In ECHO_WAIT) + */ + {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ECHO_CLI */ }, + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_ECHO_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* 
IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + }, + /* + * State : IP_VS_SCTP_S_INIT_SER + * Server sent INIT and waiting for INIT ACK from the client + */ + {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + }, + /* + * State : IP_VS_SCTP_S_INIT_ACK_CLI + * Client sent INIT ACK and waiting for ECHO from the server + */ + {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, + /* + * We have got an INIT from client. From the spec.“Upon receipt of + * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with + * an INIT ACK using the same parameters it sent in its original + * INIT chunk (including its Initiate Tag, unchanged”). + */ + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + /* + * INIT_ACK has been resent by the client, let us stay is in + * the same state + */ + {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + /* + * INIT_ACK sent by the server, close the connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + /* + * ECHO by client, it should not happen, close the connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + /* + * ECHO by server, this is what we are expecting, move to ECHO_SER + */ + {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + /* + * COOKIE ACK from client, it should not happen, close the connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + /* + * Unexpected COOKIE ACK from server, staty in the same state + */ + {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + }, + /* + * State : IP_VS_SCTP_S_INIT_ACK_SER + * Server sent INIT ACK and waiting for ECHO from the client + */ + {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, + /* + * We have got an INIT from client. 
From the spec.“Upon receipt of + * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with + * an INIT ACK using the same parameters it sent in its original + * INIT chunk (including its Initiate Tag, unchanged”). + */ + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + /* + * Unexpected INIT_ACK by the client, let us close the connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + /* + * INIT_ACK resent by the server, let us move to same state + */ + {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + /* + * Client send the ECHO, this is what we are expecting, + * move to ECHO_CLI + */ + {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + /* + * ECHO received from the server, Not sure what to do, + * let us close it + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + /* + * COOKIE ACK from client, let us stay in the same state + */ + {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + /* + * COOKIE ACK from server, hmm... this should not happen, lets close + * the connection. + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + }, + /* + * State : IP_VS_SCTP_S_ECHO_CLI + * Cient sent ECHO and waiting COOKEI ACK from the Server + */ + {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, + /* + * We have got an INIT from client. From the spec.“Upon receipt of + * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with + * an INIT ACK using the same parameters it sent in its original + * INIT chunk (including its Initiate Tag, unchanged”). + */ + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + /* + * INIT_ACK has been by the client, let us close the connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + /* + * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, + * “If an INIT ACK is received by an endpoint in any state other + * than the COOKIE-WAIT state, the endpoint should discard the + * INIT ACK chunk”. Stay in the same state + */ + {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + /* + * Client resent the ECHO, let us stay in the same state + */ + {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + /* + * ECHO received from the server, Not sure what to do, + * let us close it + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + /* + * COOKIE ACK from client, this shoud not happen, let's close the + * connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + /* + * COOKIE ACK from server, this is what we are awaiting,lets move to + * ESTABLISHED. 
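+ * (the mirror transition, COOKIE ACK from the client while in
+ * ECHO_SER, is handled in the next state block)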
+ */ + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + }, + /* + * State : IP_VS_SCTP_S_ECHO_SER + * Server sent ECHO and waiting COOKEI ACK from the client + */ + {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, + /* + * We have got an INIT from client. From the spec.“Upon receipt of + * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with + * an INIT ACK using the same parameters it sent in its original + * INIT chunk (including its Initiate Tag, unchanged”). + */ + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + /* + * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, + * “If an INIT ACK is received by an endpoint in any state other + * than the COOKIE-WAIT state, the endpoint should discard the + * INIT ACK chunk”. Stay in the same state + */ + {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + /* + * INIT_ACK has been by the server, let us close the connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + /* + * Client sent the ECHO, not sure what to do, let's close the + * connection. + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + /* + * ECHO resent by the server, stay in the same state + */ + {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + /* + * COOKIE ACK from client, this is what we are expecting, let's move + * to ESTABLISHED. + */ + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + /* + * COOKIE ACK from server, this should not happen, lets close the + * connection. + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + }, + /* + * State : IP_VS_SCTP_S_ESTABLISHED + * Association established + */ + {{IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_DATA_SER */ }, + /* + * We have got an INIT from client. From the spec.“Upon receipt of + * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with + * an INIT ACK using the same parameters it sent in its original + * INIT chunk (including its Initiate Tag, unchanged”). + */ + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + /* + * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, + * “If an INIT ACK is received by an endpoint in any state other + * than the COOKIE-WAIT state, the endpoint should discard the + * INIT ACK chunk”. 
Stay in the same state + */ + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + /* + * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the + * peer and peer shall move to the ESTABISHED. if it doesn't handle + * it will send ERROR chunk. So, stay in the same state + */ + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + /* + * COOKIE ACK from client, not sure what to do stay in the same state + */ + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + /* + * SHUTDOWN from the client, move to SHUDDOWN_CLI + */ + {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + /* + * SHUTDOWN from the server, move to SHUTDOWN_SER + */ + {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, + /* + * client sent SHUDTDOWN_ACK, this should not happen, let's close + * the connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + }, + /* + * State : IP_VS_SCTP_S_SHUT_CLI + * SHUTDOWN sent from the client, waitinf for SHUT ACK from the server + */ + /* + * We received the data chuck, keep the state unchanged. I assume + * that still data chuncks can be received by both the peers in + * SHUDOWN state + */ + + {{IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_DATA_SER */ }, + /* + * We have got an INIT from client. From the spec.“Upon receipt of + * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with + * an INIT ACK using the same parameters it sent in its original + * INIT chunk (including its Initiate Tag, unchanged”). + */ + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + /* + * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, + * “If an INIT ACK is received by an endpoint in any state other + * than the COOKIE-WAIT state, the endpoint should discard the + * INIT ACK chunk”. Stay in the same state + */ + {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + /* + * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the + * peer and peer shall move to the ESTABISHED. if it doesn't handle + * it will send ERROR chunk. 
So, stay in the same state + */ + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + /* + * COOKIE ACK from client, not sure what to do stay in the same state + */ + {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + /* + * SHUTDOWN resent from the client, move to SHUDDOWN_CLI + */ + {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + /* + * SHUTDOWN from the server, move to SHUTDOWN_SER + */ + {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, + /* + * client sent SHUDTDOWN_ACK, this should not happen, let's close + * the connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + /* + * Server sent SHUTDOWN ACK, this is what we are expecting, let's move + * to SHUDOWN_ACK_SER + */ + {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + /* + * SHUTDOWN COM from client, this should not happen, let's close the + * connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + }, + /* + * State : IP_VS_SCTP_S_SHUT_SER + * SHUTDOWN sent from the server, waitinf for SHUTDOWN ACK from client + */ + /* + * We received the data chuck, keep the state unchanged. I assume + * that still data chuncks can be received by both the peers in + * SHUDOWN state + */ + + {{IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_DATA_SER */ }, + /* + * We have got an INIT from client. From the spec.“Upon receipt of + * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with + * an INIT ACK using the same parameters it sent in its original + * INIT chunk (including its Initiate Tag, unchanged”). + */ + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + /* + * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, + * “If an INIT ACK is received by an endpoint in any state other + * than the COOKIE-WAIT state, the endpoint should discard the + * INIT ACK chunk”. Stay in the same state + */ + {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + /* + * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the + * peer and peer shall move to the ESTABISHED. if it doesn't handle + * it will send ERROR chunk. 
So, stay in the same state + */ + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + /* + * COOKIE ACK from client, not sure what to do stay in the same state + */ + {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + /* + * SHUTDOWN resent from the client, move to SHUDDOWN_CLI + */ + {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + /* + * SHUTDOWN resent from the server, move to SHUTDOWN_SER + */ + {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, + /* + * client sent SHUDTDOWN_ACK, this is what we are expecting, let's + * move to SHUT_ACK_CLI + */ + {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + /* + * Server sent SHUTDOWN ACK, this should not happen, let's close the + * connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + /* + * SHUTDOWN COM from client, this should not happen, let's close the + * connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + }, + + /* + * State : IP_VS_SCTP_S_SHUT_ACK_CLI + * SHUTDOWN ACK from the client, awaiting for SHUTDOWN COM from server + */ + /* + * We received the data chuck, keep the state unchanged. I assume + * that still data chuncks can be received by both the peers in + * SHUDOWN state + */ + + {{IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_DATA_SER */ }, + /* + * We have got an INIT from client. From the spec.“Upon receipt of + * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with + * an INIT ACK using the same parameters it sent in its original + * INIT chunk (including its Initiate Tag, unchanged”). + */ + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + /* + * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, + * “If an INIT ACK is received by an endpoint in any state other + * than the COOKIE-WAIT state, the endpoint should discard the + * INIT ACK chunk”. Stay in the same state + */ + {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + /* + * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the + * peer and peer shall move to the ESTABISHED. if it doesn't handle + * it will send ERROR chunk. 
So, stay in the same state + */ + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + /* + * COOKIE ACK from client, not sure what to do stay in the same state + */ + {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + /* + * SHUTDOWN sent from the client, move to SHUDDOWN_CLI + */ + {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + /* + * SHUTDOWN sent from the server, move to SHUTDOWN_SER + */ + {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, + /* + * client resent SHUDTDOWN_ACK, let's stay in the same state + */ + {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + /* + * Server sent SHUTDOWN ACK, this should not happen, let's close the + * connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + /* + * SHUTDOWN COM from client, this should not happen, let's close the + * connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + /* + * SHUTDOWN COMPLETE from server this is what we are expecting. + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + }, + + /* + * State : IP_VS_SCTP_S_SHUT_ACK_SER + * SHUTDOWN ACK from the server, awaiting for SHUTDOWN COM from client + */ + /* + * We received the data chuck, keep the state unchanged. I assume + * that still data chuncks can be received by both the peers in + * SHUDOWN state + */ + + {{IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_DATA_SER */ }, + /* + * We have got an INIT from client. From the spec.“Upon receipt of + * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with + * an INIT ACK using the same parameters it sent in its original + * INIT chunk (including its Initiate Tag, unchanged”). + */ + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + /* + * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, + * “If an INIT ACK is received by an endpoint in any state other + * than the COOKIE-WAIT state, the endpoint should discard the + * INIT ACK chunk”. Stay in the same state + */ + {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + /* + * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the + * peer and peer shall move to the ESTABISHED. if it doesn't handle + * it will send ERROR chunk. So, stay in the same state + */ + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + /* + * COOKIE ACK from client, not sure what to do stay in the same state + */ + {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + /* + * SHUTDOWN sent from the client, move to SHUDDOWN_CLI + */ + {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + /* + * SHUTDOWN sent from the server, move to SHUTDOWN_SER + */ + {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, + /* + * client sent SHUDTDOWN_ACK, this should not happen let's close + * the connection. 
+ */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + /* + * Server resent SHUTDOWN ACK, stay in the same state + */ + {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + /* + * SHUTDOWN COM from client, this what we are expecting, let's close + * the connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + /* + * SHUTDOWN COMPLETE from server this should not happen. + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + }, + /* + * State : IP_VS_SCTP_S_CLOSED + */ + {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + } +}; + +/* + * Timeout table[state] + */ +static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = { + [IP_VS_SCTP_S_NONE] = 2 * HZ, + [IP_VS_SCTP_S_INIT_CLI] = 1 * 60 * HZ, + [IP_VS_SCTP_S_INIT_SER] = 1 * 60 * HZ, + [IP_VS_SCTP_S_INIT_ACK_CLI] = 1 * 60 * HZ, + [IP_VS_SCTP_S_INIT_ACK_SER] = 1 * 60 * HZ, + [IP_VS_SCTP_S_ECHO_CLI] = 1 * 60 * HZ, + [IP_VS_SCTP_S_ECHO_SER] = 1 * 60 * HZ, + [IP_VS_SCTP_S_ESTABLISHED] = 15 * 60 * HZ, + [IP_VS_SCTP_S_SHUT_CLI] = 1 * 60 * HZ, + [IP_VS_SCTP_S_SHUT_SER] = 1 * 60 * HZ, + [IP_VS_SCTP_S_SHUT_ACK_CLI] = 1 * 60 * HZ, + [IP_VS_SCTP_S_SHUT_ACK_SER] = 1 * 60 * HZ, + [IP_VS_SCTP_S_CLOSED] = 10 * HZ, + [IP_VS_SCTP_S_LAST] = 2 * HZ, +}; + +static const char *sctp_state_name_table[IP_VS_SCTP_S_LAST + 1] = { + [IP_VS_SCTP_S_NONE] = "NONE", + [IP_VS_SCTP_S_INIT_CLI] = "INIT_CLI", + [IP_VS_SCTP_S_INIT_SER] = "INIT_SER", + [IP_VS_SCTP_S_INIT_ACK_CLI] = "INIT_ACK_CLI", + [IP_VS_SCTP_S_INIT_ACK_SER] = "INIT_ACK_SER", + [IP_VS_SCTP_S_ECHO_CLI] = "COOKIE_ECHO_CLI", + [IP_VS_SCTP_S_ECHO_SER] = "COOKIE_ECHO_SER", + [IP_VS_SCTP_S_ESTABLISHED] = "ESTABISHED", + [IP_VS_SCTP_S_SHUT_CLI] = "SHUTDOWN_CLI", + [IP_VS_SCTP_S_SHUT_SER] = "SHUTDOWN_SER", + [IP_VS_SCTP_S_SHUT_ACK_CLI] = "SHUTDOWN_ACK_CLI", + [IP_VS_SCTP_S_SHUT_ACK_SER] = "SHUTDOWN_ACK_SER", + [IP_VS_SCTP_S_CLOSED] = "CLOSED", + [IP_VS_SCTP_S_LAST] = "BUG!" +}; + + +static const char *sctp_state_name(int state) +{ + if (state >= IP_VS_SCTP_S_LAST) + return "ERR!"; + if (sctp_state_name_table[state]) + return sctp_state_name_table[state]; + return "?"; +} + +static inline int +set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, + int direction, const struct sk_buff *skb) +{ + sctp_chunkhdr_t _sctpch, *sch; + unsigned char chunk_type; + int event, next_state; + int ihl; + +#ifdef CONFIG_IP_VS_IPV6 + ihl = cp->af == AF_INET ? 
ip_hdrlen(skb) : sizeof(struct ipv6hdr); +#else + ihl = ip_hdrlen(skb); +#endif + + sch = skb_header_pointer(skb, ihl + sizeof(sctp_sctphdr_t), + sizeof(_sctpch), &_sctpch); + if (sch == NULL) + return 0; + + chunk_type = sch->type; + /* + * Section 3: Multiple chunks can be bundled into one SCTP packet + * up to the MTU size, except for the INIT, INIT ACK, and + * SHUTDOWN COMPLETE chunks. These chunks MUST NOT be bundled with + * any other chunk in a packet. + * + * Section 3.3.7: DATA chunks MUST NOT be bundled with ABORT. Control + * chunks (except for INIT, INIT ACK, and SHUTDOWN COMPLETE) MAY be + * bundled with an ABORT, but they MUST be placed before the ABORT + * in the SCTP packet or they will be ignored by the receiver. + */ + if ((sch->type == SCTP_CID_COOKIE_ECHO) || + (sch->type == SCTP_CID_COOKIE_ACK)) { + sch = skb_header_pointer(skb, (ihl + sizeof(sctp_sctphdr_t) + + sch->length), sizeof(_sctpch), &_sctpch); + if (sch) { + if (sch->type == SCTP_CID_ABORT) + chunk_type = sch->type; + } + } + + event = sctp_events[chunk_type]; + + /* + * If the direction is IP_VS_DIR_OUTPUT, this event is from server + */ + if (direction == IP_VS_DIR_OUTPUT) + event++; + /* + * get next state + */ + next_state = sctp_states_table[cp->state][event].next_state; + + if (next_state != cp->state) { + struct ip_vs_dest *dest = cp->dest; + + IP_VS_DBG_BUF(8, "%s %s %s:%d->" + "%s:%d state: %s->%s conn->refcnt:%d\n", + pd->pp->name, + ((direction == IP_VS_DIR_OUTPUT) ? + "output " : "input "), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), + ntohs(cp->dport), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + sctp_state_name(cp->state), + sctp_state_name(next_state), + atomic_read(&cp->refcnt)); + if (dest) { + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (next_state != IP_VS_SCTP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (next_state == IP_VS_SCTP_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } + } + if (likely(pd)) + cp->timeout = pd->timeout_table[cp->state = next_state]; + else /* What to do ? 
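+ Fall back to the compiled-in sctp_timeouts defaults when no
+ per-netns proto data is available.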
*/ + cp->timeout = sctp_timeouts[cp->state = next_state]; + + return 1; +} + +static int +sctp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, struct ip_vs_proto_data *pd) +{ + int ret = 0; + + spin_lock(&cp->lock); + ret = set_sctp_state(pd, cp, direction, skb); + spin_unlock(&cp->lock); + + return ret; +} + +static inline __u16 sctp_app_hashkey(__be16 port) +{ + return (((__force u16)port >> SCTP_APP_TAB_BITS) ^ (__force u16)port) + & SCTP_APP_TAB_MASK; +} + +static int sctp_register_app(struct net *net, struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + __u16 hash; + __be16 port = inc->port; + int ret = 0; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP); + + hash = sctp_app_hashkey(port); + + spin_lock_bh(&ipvs->sctp_app_lock); + list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) { + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add(&inc->p_list, &ipvs->sctp_apps[hash]); + atomic_inc(&pd->appcnt); +out: + spin_unlock_bh(&ipvs->sctp_app_lock); + + return ret; +} + +static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP); + + spin_lock_bh(&ipvs->sctp_app_lock); + atomic_dec(&pd->appcnt); + list_del(&inc->p_list); + spin_unlock_bh(&ipvs->sctp_app_lock); +} + +static int sctp_app_conn_bind(struct ip_vs_conn *cp) +{ + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + /* Lookup application incarnations and bind the right one */ + hash = sctp_app_hashkey(cp->vport); + + spin_lock(&ipvs->sctp_app_lock); + list_for_each_entry(inc, &ipvs->sctp_apps[hash], p_list) { + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + spin_unlock(&ipvs->sctp_app_lock); + + IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" + "%s:%u to app %s on port %u\n", + __func__, + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + inc->name, ntohs(inc->port)); + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + goto out; + } + } + spin_unlock(&ipvs->sctp_app_lock); +out: + return result; +} + +/* --------------------------------------------- + * timeouts is netns related now. 
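+ * Each netns gets its own copy of sctp_timeouts via
+ * ip_vs_create_timeout_table() in __ip_vs_sctp_init() below.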
+ * --------------------------------------------- + */ +static void __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE); + spin_lock_init(&ipvs->sctp_app_lock); + pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts, + sizeof(sctp_timeouts)); +} + +static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd) +{ + kfree(pd->timeout_table); +} + +struct ip_vs_protocol ip_vs_protocol_sctp = { + .name = "SCTP", + .protocol = IPPROTO_SCTP, + .num_states = IP_VS_SCTP_S_LAST, + .dont_defrag = 0, + .init = NULL, + .exit = NULL, + .init_netns = __ip_vs_sctp_init, + .exit_netns = __ip_vs_sctp_exit, + .register_app = sctp_register_app, + .unregister_app = sctp_unregister_app, + .conn_schedule = sctp_conn_schedule, + .conn_in_get = ip_vs_conn_in_get_proto, + .conn_out_get = ip_vs_conn_out_get_proto, + .snat_handler = sctp_snat_handler, + .dnat_handler = sctp_dnat_handler, + .csum_check = sctp_csum_check, + .state_name = sctp_state_name, + .state_transition = sctp_state_transition, + .app_conn_bind = sctp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = NULL, +}; diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c new file mode 100644 index 00000000..c0cc341b --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -0,0 +1,721 @@ +/* + * ip_vs_proto_tcp.c: TCP load balancing support for IPVS + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: Hans Schillstrom + * + * Network name space (netns) aware. + * Global data moved to netns i.e struct netns_ipvs + * tcp_timeouts table has copy per netns in a hash table per + * protocol ip_vs_proto_data and is handled by netns + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include /* for tcphdr */ +#include +#include /* for csum_tcpudp_magic */ +#include +#include +#include + +#include + +static int +tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp) +{ + struct net *net; + struct ip_vs_service *svc; + struct tcphdr _tcph, *th; + struct ip_vs_iphdr iph; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + + th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph); + if (th == NULL) { + *verdict = NF_DROP; + return 0; + } + net = skb_net(skb); + /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ + if (th->syn && + (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol, + &iph.daddr, th->dest))) { + int ignored; + + if (ip_vs_todrop(net_ipvs(net))) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + ip_vs_service_put(svc); + *verdict = NF_DROP; + return 0; + } + + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. 
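+ * When no connection results: ignored == 0 falls back to
+ * ip_vs_leave(), a negative value drops the packet, and a
+ * positive one lets the packet pass untouched.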
+ */ + *cpp = ip_vs_schedule(svc, skb, pd, &ignored); + if (!*cpp && ignored <= 0) { + if (!ignored) + *verdict = ip_vs_leave(svc, skb, pd); + else { + ip_vs_service_put(svc); + *verdict = NF_DROP; + } + return 0; + } + ip_vs_service_put(svc); + } + /* NF_ACCEPT */ + return 1; +} + + +static inline void +tcp_fast_csum_update(int af, struct tcphdr *tcph, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldport, __be16 newport) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcph->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(tcph->check)))); + else +#endif + tcph->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(tcph->check)))); +} + + +static inline void +tcp_partial_csum_update(int af, struct tcphdr *tcph, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldlen, __be16 newlen) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcph->check = + ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldlen, newlen, + csum_unfold(tcph->check)))); + else +#endif + tcph->check = + ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldlen, newlen, + csum_unfold(tcph->check)))); +} + + +static int +tcp_snat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct tcphdr *tcph; + unsigned int tcphoff; + int oldlen; + int payload_csum = 0; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcphoff = sizeof(struct ipv6hdr); + else +#endif + tcphoff = ip_hdrlen(skb); + oldlen = skb->len - tcphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) + return 0; + + if (unlikely(cp->app != NULL)) { + int ret; + + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* Call application helper if needed */ + if (!(ret = ip_vs_app_pkt_out(cp, skb))) + return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - tcphoff; + else + payload_csum = 1; + } + + tcph = (void *)skb_network_header(skb) + tcphoff; + tcph->source = cp->vport; + + /* Adjust TCP checksums */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + htons(oldlen), + htons(skb->len - tcphoff)); + } else if (!payload_csum) { + /* Only port and addr are changed, do fast csum update */ + tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + cp->dport, cp->vport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = (cp->app && pp->csum_check) ? 
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + } else { + /* full checksum calculation */ + tcph->check = 0; + skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcph->check = csum_ipv6_magic(&cp->vaddr.in6, + &cp->caddr.in6, + skb->len - tcphoff, + cp->protocol, skb->csum); + else +#endif + tcph->check = csum_tcpudp_magic(cp->vaddr.ip, + cp->caddr.ip, + skb->len - tcphoff, + cp->protocol, + skb->csum); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", + pp->name, tcph->check, + (char*)&(tcph->check) - (char*)tcph); + } + return 1; +} + + +static int +tcp_dnat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct tcphdr *tcph; + unsigned int tcphoff; + int oldlen; + int payload_csum = 0; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcphoff = sizeof(struct ipv6hdr); + else +#endif + tcphoff = ip_hdrlen(skb); + oldlen = skb->len - tcphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) + return 0; + + if (unlikely(cp->app != NULL)) { + int ret; + + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* + * Attempt ip_vs_app call. + * It will fix ip_vs_conn and iph ack_seq stuff + */ + if (!(ret = ip_vs_app_pkt_in(cp, skb))) + return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - tcphoff; + else + payload_csum = 1; + } + + tcph = (void *)skb_network_header(skb) + tcphoff; + tcph->dest = cp->dport; + + /* + * Adjust TCP checksums + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, + htons(oldlen), + htons(skb->len - tcphoff)); + } else if (!payload_csum) { + /* Only port and addr are changed, do fast csum update */ + tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, + cp->vport, cp->dport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = (cp->app && pp->csum_check) ? + CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + } else { + /* full checksum calculation */ + tcph->check = 0; + skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcph->check = csum_ipv6_magic(&cp->caddr.in6, + &cp->daddr.in6, + skb->len - tcphoff, + cp->protocol, skb->csum); + else +#endif + tcph->check = csum_tcpudp_magic(cp->caddr.ip, + cp->daddr.ip, + skb->len - tcphoff, + cp->protocol, + skb->csum); + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return 1; +} + + +static int +tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + unsigned int tcphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcphoff = sizeof(struct ipv6hdr); + else +#endif + tcphoff = ip_hdrlen(skb); + + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); + case CHECKSUM_COMPLETE: +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - tcphoff, + ipv6_hdr(skb)->nexthdr, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, + "Failed checksum for"); + return 0; + } + } else +#endif + if (csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - tcphoff, + ip_hdr(skb)->protocol, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, + "Failed checksum for"); + return 0; + } + break; + default: + /* No need to checksum. 
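+ CHECKSUM_UNNECESSARY normally means the checksum was already
+ verified (e.g. by hardware) and CHECKSUM_PARTIAL is filled in
+ later, so there is nothing to verify for those.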
*/ + break; + } + + return 1; +} + + +#define TCP_DIR_INPUT 0 +#define TCP_DIR_OUTPUT 4 +#define TCP_DIR_INPUT_ONLY 8 + +static const int tcp_state_off[IP_VS_DIR_LAST] = { + [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, + [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, + [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, +}; + +/* + * Timeout table[state] + */ +static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { + [IP_VS_TCP_S_NONE] = 2*HZ, + [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, + [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, + [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, + [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, + [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, + [IP_VS_TCP_S_CLOSE] = 10*HZ, + [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, + [IP_VS_TCP_S_LAST_ACK] = 30*HZ, + [IP_VS_TCP_S_LISTEN] = 2*60*HZ, + [IP_VS_TCP_S_SYNACK] = 120*HZ, + [IP_VS_TCP_S_LAST] = 2*HZ, +}; + +static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { + [IP_VS_TCP_S_NONE] = "NONE", + [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", + [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", + [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV", + [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT", + [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT", + [IP_VS_TCP_S_CLOSE] = "CLOSE", + [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT", + [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK", + [IP_VS_TCP_S_LISTEN] = "LISTEN", + [IP_VS_TCP_S_SYNACK] = "SYNACK", + [IP_VS_TCP_S_LAST] = "BUG!", +}; + +#define sNO IP_VS_TCP_S_NONE +#define sES IP_VS_TCP_S_ESTABLISHED +#define sSS IP_VS_TCP_S_SYN_SENT +#define sSR IP_VS_TCP_S_SYN_RECV +#define sFW IP_VS_TCP_S_FIN_WAIT +#define sTW IP_VS_TCP_S_TIME_WAIT +#define sCL IP_VS_TCP_S_CLOSE +#define sCW IP_VS_TCP_S_CLOSE_WAIT +#define sLA IP_VS_TCP_S_LAST_ACK +#define sLI IP_VS_TCP_S_LISTEN +#define sSA IP_VS_TCP_S_SYNACK + +struct tcp_states_t { + int next_state[IP_VS_TCP_S_LAST]; +}; + +static const char * tcp_state_name(int state) +{ + if (state >= IP_VS_TCP_S_LAST) + return "ERR!"; + return tcp_state_name_table[state] ? 
tcp_state_name_table[state] : "?"; +} + +static struct tcp_states_t tcp_states [] = { +/* INPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, +/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, + +/* OUTPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, +/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, +/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, + +/* INPUT-ONLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, +/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, +}; + +static struct tcp_states_t tcp_states_dos [] = { +/* INPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, +/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, +/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, + +/* OUTPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, +/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, +/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, + +/* INPUT-ONLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, +/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, +}; + +static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags) +{ + int on = (flags & 1); /* secure_tcp */ + + /* + ** FIXME: change secure_tcp to independent sysctl var + ** or make it per-service or per-app because it is valid + ** for most if not for all of the applications. Something + ** like "capabilities" (flags) for each object. + */ + pd->tcp_state_table = (on ? 
tcp_states_dos : tcp_states); +} + +static inline int tcp_state_idx(struct tcphdr *th) +{ + if (th->rst) + return 3; + if (th->syn) + return 0; + if (th->fin) + return 1; + if (th->ack) + return 2; + return -1; +} + +static inline void +set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, + int direction, struct tcphdr *th) +{ + int state_idx; + int new_state = IP_VS_TCP_S_CLOSE; + int state_off = tcp_state_off[direction]; + + /* + * Update state offset to INPUT_ONLY if necessary + * or delete NO_OUTPUT flag if output packet detected + */ + if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { + if (state_off == TCP_DIR_OUTPUT) + cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; + else + state_off = TCP_DIR_INPUT_ONLY; + } + + if ((state_idx = tcp_state_idx(th)) < 0) { + IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx); + goto tcp_state_out; + } + + new_state = + pd->tcp_state_table[state_off+state_idx].next_state[cp->state]; + + tcp_state_out: + if (new_state != cp->state) { + struct ip_vs_dest *dest = cp->dest; + + IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->" + "%s:%d state: %s->%s conn->refcnt:%d\n", + pd->pp->name, + ((state_off == TCP_DIR_OUTPUT) ? + "output " : "input "), + th->syn ? 'S' : '.', + th->fin ? 'F' : '.', + th->ack ? 'A' : '.', + th->rst ? 'R' : '.', + IP_VS_DBG_ADDR(cp->af, &cp->daddr), + ntohs(cp->dport), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + tcp_state_name(cp->state), + tcp_state_name(new_state), + atomic_read(&cp->refcnt)); + + if (dest) { + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (new_state != IP_VS_TCP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (new_state == IP_VS_TCP_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } + } + + if (likely(pd)) + cp->timeout = pd->timeout_table[cp->state = new_state]; + else /* What to do ? */ + cp->timeout = tcp_timeouts[cp->state = new_state]; +} + +/* + * Handle state transitions + */ +static int +tcp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_proto_data *pd) +{ + struct tcphdr _tcph, *th; + +#ifdef CONFIG_IP_VS_IPV6 + int ihl = cp->af == AF_INET ? 
ip_hdrlen(skb) : sizeof(struct ipv6hdr); +#else + int ihl = ip_hdrlen(skb); +#endif + + th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph); + if (th == NULL) + return 0; + + spin_lock(&cp->lock); + set_tcp_state(pd, cp, direction, th); + spin_unlock(&cp->lock); + + return 1; +} + +static inline __u16 tcp_app_hashkey(__be16 port) +{ + return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) + & TCP_APP_TAB_MASK; +} + + +static int tcp_register_app(struct net *net, struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + __u16 hash; + __be16 port = inc->port; + int ret = 0; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + + hash = tcp_app_hashkey(port); + + spin_lock_bh(&ipvs->tcp_app_lock); + list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) { + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add(&inc->p_list, &ipvs->tcp_apps[hash]); + atomic_inc(&pd->appcnt); + + out: + spin_unlock_bh(&ipvs->tcp_app_lock); + return ret; +} + + +static void +tcp_unregister_app(struct net *net, struct ip_vs_app *inc) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + + spin_lock_bh(&ipvs->tcp_app_lock); + atomic_dec(&pd->appcnt); + list_del(&inc->p_list); + spin_unlock_bh(&ipvs->tcp_app_lock); +} + + +static int +tcp_app_conn_bind(struct ip_vs_conn *cp) +{ + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + + /* Lookup application incarnations and bind the right one */ + hash = tcp_app_hashkey(cp->vport); + + spin_lock(&ipvs->tcp_app_lock); + list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) { + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + spin_unlock(&ipvs->tcp_app_lock); + + IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" + "%s:%u to app %s on port %u\n", + __func__, + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + inc->name, ntohs(inc->port)); + + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + goto out; + } + } + spin_unlock(&ipvs->tcp_app_lock); + + out: + return result; +} + + +/* + * Set LISTEN timeout. (ip_vs_conn_put will setup timer) + */ +void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp) +{ + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + + spin_lock(&cp->lock); + cp->state = IP_VS_TCP_S_LISTEN; + cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN] + : tcp_timeouts[IP_VS_TCP_S_LISTEN]); + spin_unlock(&cp->lock); +} + +/* --------------------------------------------- + * timeouts is netns related now. 
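+ * Besides giving each netns its own copy of tcp_timeouts,
+ * __ip_vs_tcp_init() below also starts every netns on the default
+ * (non-DoS) tcp_states table; tcp_timeout_change() switches it.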
+ * --------------------------------------------- + */ +static void __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE); + spin_lock_init(&ipvs->tcp_app_lock); + pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, + sizeof(tcp_timeouts)); + pd->tcp_state_table = tcp_states; +} + +static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd) +{ + kfree(pd->timeout_table); +} + + +struct ip_vs_protocol ip_vs_protocol_tcp = { + .name = "TCP", + .protocol = IPPROTO_TCP, + .num_states = IP_VS_TCP_S_LAST, + .dont_defrag = 0, + .init = NULL, + .exit = NULL, + .init_netns = __ip_vs_tcp_init, + .exit_netns = __ip_vs_tcp_exit, + .register_app = tcp_register_app, + .unregister_app = tcp_unregister_app, + .conn_schedule = tcp_conn_schedule, + .conn_in_get = ip_vs_conn_in_get_proto, + .conn_out_get = ip_vs_conn_out_get_proto, + .snat_handler = tcp_snat_handler, + .dnat_handler = tcp_dnat_handler, + .csum_check = tcp_csum_check, + .state_name = tcp_state_name, + .state_transition = tcp_state_transition, + .app_conn_bind = tcp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = tcp_timeout_change, +}; diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c new file mode 100644 index 00000000..f1282cbe --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -0,0 +1,509 @@ +/* + * ip_vs_proto_udp.c: UDP load balancing support for IPVS + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: Hans Schillstrom + * Network name space (netns) aware. + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static int +udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp) +{ + struct net *net; + struct ip_vs_service *svc; + struct udphdr _udph, *uh; + struct ip_vs_iphdr iph; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + + uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph); + if (uh == NULL) { + *verdict = NF_DROP; + return 0; + } + net = skb_net(skb); + svc = ip_vs_service_get(net, af, skb->mark, iph.protocol, + &iph.daddr, uh->dest); + if (svc) { + int ignored; + + if (ip_vs_todrop(net_ipvs(net))) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + ip_vs_service_put(svc); + *verdict = NF_DROP; + return 0; + } + + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. 
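+ * Unlike TCP (SYN only) and SCTP (INIT only), there is no flag or
+ * chunk check for UDP: any datagram hitting a configured service
+ * can create a new connection.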
+ */ + *cpp = ip_vs_schedule(svc, skb, pd, &ignored); + if (!*cpp && ignored <= 0) { + if (!ignored) + *verdict = ip_vs_leave(svc, skb, pd); + else { + ip_vs_service_put(svc); + *verdict = NF_DROP; + } + return 0; + } + ip_vs_service_put(svc); + } + /* NF_ACCEPT */ + return 1; +} + + +static inline void +udp_fast_csum_update(int af, struct udphdr *uhdr, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldport, __be16 newport) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + uhdr->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(uhdr->check)))); + else +#endif + uhdr->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(uhdr->check)))); + if (!uhdr->check) + uhdr->check = CSUM_MANGLED_0; +} + +static inline void +udp_partial_csum_update(int af, struct udphdr *uhdr, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldlen, __be16 newlen) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + uhdr->check = + ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldlen, newlen, + csum_unfold(uhdr->check)))); + else +#endif + uhdr->check = + ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldlen, newlen, + csum_unfold(uhdr->check)))); +} + + +static int +udp_snat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct udphdr *udph; + unsigned int udphoff; + int oldlen; + int payload_csum = 0; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udphoff = sizeof(struct ipv6hdr); + else +#endif + udphoff = ip_hdrlen(skb); + oldlen = skb->len - udphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, udphoff+sizeof(*udph))) + return 0; + + if (unlikely(cp->app != NULL)) { + int ret; + + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* + * Call application helper if needed + */ + if (!(ret = ip_vs_app_pkt_out(cp, skb))) + return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - udphoff; + else + payload_csum = 1; + } + + udph = (void *)skb_network_header(skb) + udphoff; + udph->source = cp->vport; + + /* + * Adjust UDP checksums + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + htons(oldlen), + htons(skb->len - udphoff)); + } else if (!payload_csum && (udph->check != 0)) { + /* Only port and addr are changed, do fast csum update */ + udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + cp->dport, cp->vport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = (cp->app && pp->csum_check) ? 
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + } else { + /* full checksum calculation */ + udph->check = 0; + skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udph->check = csum_ipv6_magic(&cp->vaddr.in6, + &cp->caddr.in6, + skb->len - udphoff, + cp->protocol, skb->csum); + else +#endif + udph->check = csum_tcpudp_magic(cp->vaddr.ip, + cp->caddr.ip, + skb->len - udphoff, + cp->protocol, + skb->csum); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + skb->ip_summed = CHECKSUM_UNNECESSARY; + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", + pp->name, udph->check, + (char*)&(udph->check) - (char*)udph); + } + return 1; +} + + +static int +udp_dnat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct udphdr *udph; + unsigned int udphoff; + int oldlen; + int payload_csum = 0; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udphoff = sizeof(struct ipv6hdr); + else +#endif + udphoff = ip_hdrlen(skb); + oldlen = skb->len - udphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, udphoff+sizeof(*udph))) + return 0; + + if (unlikely(cp->app != NULL)) { + int ret; + + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* + * Attempt ip_vs_app call. + * It will fix ip_vs_conn + */ + if (!(ret = ip_vs_app_pkt_in(cp, skb))) + return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - udphoff; + else + payload_csum = 1; + } + + udph = (void *)skb_network_header(skb) + udphoff; + udph->dest = cp->dport; + + /* + * Adjust UDP checksums + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, + htons(oldlen), + htons(skb->len - udphoff)); + } else if (!payload_csum && (udph->check != 0)) { + /* Only port and addr are changed, do fast csum update */ + udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, + cp->vport, cp->dport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = (cp->app && pp->csum_check) ? 
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + } else { + /* full checksum calculation */ + udph->check = 0; + skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udph->check = csum_ipv6_magic(&cp->caddr.in6, + &cp->daddr.in6, + skb->len - udphoff, + cp->protocol, skb->csum); + else +#endif + udph->check = csum_tcpudp_magic(cp->caddr.ip, + cp->daddr.ip, + skb->len - udphoff, + cp->protocol, + skb->csum); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return 1; +} + + +static int +udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + struct udphdr _udph, *uh; + unsigned int udphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + udphoff = sizeof(struct ipv6hdr); + else +#endif + udphoff = ip_hdrlen(skb); + + uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); + if (uh == NULL) + return 0; + + if (uh->check != 0) { + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = skb_checksum(skb, udphoff, + skb->len - udphoff, 0); + case CHECKSUM_COMPLETE: +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - udphoff, + ipv6_hdr(skb)->nexthdr, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, + "Failed checksum for"); + return 0; + } + } else +#endif + if (csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - udphoff, + ip_hdr(skb)->protocol, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, + "Failed checksum for"); + return 0; + } + break; + default: + /* No need to checksum. */ + break; + } + } + return 1; +} + +static inline __u16 udp_app_hashkey(__be16 port) +{ + return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port) + & UDP_APP_TAB_MASK; +} + + +static int udp_register_app(struct net *net, struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + __u16 hash; + __be16 port = inc->port; + int ret = 0; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + + hash = udp_app_hashkey(port); + + + spin_lock_bh(&ipvs->udp_app_lock); + list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) { + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add(&inc->p_list, &ipvs->udp_apps[hash]); + atomic_inc(&pd->appcnt); + + out: + spin_unlock_bh(&ipvs->udp_app_lock); + return ret; +} + + +static void +udp_unregister_app(struct net *net, struct ip_vs_app *inc) +{ + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + struct netns_ipvs *ipvs = net_ipvs(net); + + spin_lock_bh(&ipvs->udp_app_lock); + atomic_dec(&pd->appcnt); + list_del(&inc->p_list); + spin_unlock_bh(&ipvs->udp_app_lock); +} + + +static int udp_app_conn_bind(struct ip_vs_conn *cp) +{ + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + + /* Lookup application incarnations and bind the right one */ + hash = udp_app_hashkey(cp->vport); + + spin_lock(&ipvs->udp_app_lock); + list_for_each_entry(inc, &ipvs->udp_apps[hash], p_list) { + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + spin_unlock(&ipvs->udp_app_lock); + + IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" + "%s:%u to app %s on port %u\n", + __func__, + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + 
IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + inc->name, ntohs(inc->port)); + + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + goto out; + } + } + spin_unlock(&ipvs->udp_app_lock); + + out: + return result; +} + + +static const int udp_timeouts[IP_VS_UDP_S_LAST+1] = { + [IP_VS_UDP_S_NORMAL] = 5*60*HZ, + [IP_VS_UDP_S_LAST] = 2*HZ, +}; + +static const char *const udp_state_name_table[IP_VS_UDP_S_LAST+1] = { + [IP_VS_UDP_S_NORMAL] = "UDP", + [IP_VS_UDP_S_LAST] = "BUG!", +}; + +static const char * udp_state_name(int state) +{ + if (state >= IP_VS_UDP_S_LAST) + return "ERR!"; + return udp_state_name_table[state] ? udp_state_name_table[state] : "?"; +} + +static int +udp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_proto_data *pd) +{ + if (unlikely(!pd)) { + pr_err("UDP no ns data\n"); + return 0; + } + + cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL]; + return 1; +} + +static void __udp_init(struct net *net, struct ip_vs_proto_data *pd) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE); + spin_lock_init(&ipvs->udp_app_lock); + pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts, + sizeof(udp_timeouts)); +} + +static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd) +{ + kfree(pd->timeout_table); +} + + +struct ip_vs_protocol ip_vs_protocol_udp = { + .name = "UDP", + .protocol = IPPROTO_UDP, + .num_states = IP_VS_UDP_S_LAST, + .dont_defrag = 0, + .init = NULL, + .exit = NULL, + .init_netns = __udp_init, + .exit_netns = __udp_exit, + .conn_schedule = udp_conn_schedule, + .conn_in_get = ip_vs_conn_in_get_proto, + .conn_out_get = ip_vs_conn_out_get_proto, + .snat_handler = udp_snat_handler, + .dnat_handler = udp_dnat_handler, + .csum_check = udp_csum_check, + .state_transition = udp_state_transition, + .state_name = udp_state_name, + .register_app = udp_register_app, + .unregister_app = udp_unregister_app, + .app_conn_bind = udp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = NULL, +}; diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c new file mode 100644 index 00000000..c49b388d --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_rr.c @@ -0,0 +1,113 @@ +/* + * IPVS: Round-Robin Scheduling module + * + * Authors: Wensong Zhang + * Peter Kese + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
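The UDP timeout table above is indexed directly by connection state, and because timeouts are now per-netns, __udp_init() hands each namespace its own writable copy of the const defaults. A small sketch of that copy-then-tune arrangement, assuming HZ is 250 purely for illustration:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define HZ 250                              /* assumed tick rate, for illustration */

enum { UDP_S_NORMAL, UDP_S_LAST };          /* stands in for IP_VS_UDP_S_* */

static const int udp_timeouts[UDP_S_LAST + 1] = {
        [UDP_S_NORMAL] = 5 * 60 * HZ,       /* 5 minutes, in jiffies */
        [UDP_S_LAST]   = 2 * HZ,
};

/* Per-namespace copy of the const defaults, as ip_vs_create_timeout_table()
 * does, so one netns can tune its timeouts without touching the others. */
static int *create_timeout_table(const int *table, size_t len)
{
        int *copy = malloc(len);

        if (copy)
                memcpy(copy, table, len);
        return copy;
}

int main(void)
{
        int *t = create_timeout_table(udp_timeouts, sizeof(udp_timeouts));

        if (!t)
                return 1;
        t[UDP_S_NORMAL] = 2 * 60 * HZ;      /* local override, as a sysctl would do */
        printf("default %d, tuned %d (jiffies)\n",
               udp_timeouts[UDP_S_NORMAL], t[UDP_S_NORMAL]);
        free(t);
        return 0;
}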
+ * + * Fixes/Changes: + * Wensong Zhang : changed the ip_vs_rr_schedule to return dest + * Julian Anastasov : fixed the NULL pointer access bug in debugging + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_rr_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include + +#include + + +static int ip_vs_rr_init_svc(struct ip_vs_service *svc) +{ + svc->sched_data = &svc->destinations; + return 0; +} + + +static int ip_vs_rr_update_svc(struct ip_vs_service *svc) +{ + svc->sched_data = &svc->destinations; + return 0; +} + + +/* + * Round-Robin Scheduling + */ +static struct ip_vs_dest * +ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct list_head *p, *q; + struct ip_vs_dest *dest; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + write_lock(&svc->sched_lock); + p = (struct list_head *)svc->sched_data; + p = p->next; + q = p; + do { + /* skip list head */ + if (q == &svc->destinations) { + q = q->next; + continue; + } + + dest = list_entry(q, struct ip_vs_dest, n_list); + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) + /* HIT */ + goto out; + q = q->next; + } while (q != p); + write_unlock(&svc->sched_lock); + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + + out: + svc->sched_data = q; + write_unlock(&svc->sched_lock); + IP_VS_DBG_BUF(6, "RR: server %s:%u " + "activeconns %d refcnt %d weight %d\n", + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), atomic_read(&dest->weight)); + + return dest; +} + + +static struct ip_vs_scheduler ip_vs_rr_scheduler = { + .name = "rr", /* name */ + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), + .init_service = ip_vs_rr_init_svc, + .update_service = ip_vs_rr_update_svc, + .schedule = ip_vs_rr_schedule, +}; + +static int __init ip_vs_rr_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_rr_scheduler); +} + +static void __exit ip_vs_rr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); +} + +module_init(ip_vs_rr_init); +module_exit(ip_vs_rr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c new file mode 100644 index 00000000..08dbdd5b --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -0,0 +1,260 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang + * Peter Kese + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
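ip_vs_rr_schedule() above keeps its cursor (svc->sched_data) pointing into the circular destination list and resumes from there on each call, skipping quiesced (weight 0) and overloaded servers. A userspace sketch of the same walk over a toy ring:

#include <stdio.h>

/* Toy circular ring of destinations (the kernel uses struct list_head). */
struct dest {
        const char *name;
        int weight;
        struct dest *next;
};

/* Pick the next usable destination after the saved cursor, skipping
 * quiesced (weight == 0) servers, and remember where the walk stopped. */
static struct dest *rr_schedule(struct dest **cursor)
{
        struct dest *start = (*cursor)->next, *d = start;

        do {
                if (d->weight > 0) {
                        *cursor = d;    /* the next call continues after this one */
                        return d;
                }
                d = d->next;
        } while (d != start);
        return NULL;                    /* no destination available */
}

int main(void)
{
        struct dest a = { "10.0.0.1", 1 }, b = { "10.0.0.2", 0 }, c = { "10.0.0.3", 1 };
        struct dest *cursor = &c;       /* plays the role of svc->sched_data */
        int i;

        a.next = &b; b.next = &c; c.next = &a;
        for (i = 0; i < 4; i++) {
                struct dest *d = rr_schedule(&cursor);

                printf("%s\n", d ? d->name : "none");   /* 10.0.0.2 is skipped */
        }
        return 0;
}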
+ * + * Changes: + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include + +#include + +EXPORT_SYMBOL(ip_vs_scheduler_err); +/* + * IPVS scheduler list + */ +static LIST_HEAD(ip_vs_schedulers); + +/* lock for service table */ +static DEFINE_SPINLOCK(ip_vs_sched_lock); + + +/* + * Bind a service with a scheduler + */ +int ip_vs_bind_scheduler(struct ip_vs_service *svc, + struct ip_vs_scheduler *scheduler) +{ + int ret; + + svc->scheduler = scheduler; + + if (scheduler->init_service) { + ret = scheduler->init_service(svc); + if (ret) { + pr_err("%s(): init error\n", __func__); + return ret; + } + } + + return 0; +} + + +/* + * Unbind a service with its scheduler + */ +int ip_vs_unbind_scheduler(struct ip_vs_service *svc) +{ + struct ip_vs_scheduler *sched = svc->scheduler; + + if (!sched) + return 0; + + if (sched->done_service) { + if (sched->done_service(svc) != 0) { + pr_err("%s(): done error\n", __func__); + return -EINVAL; + } + } + + svc->scheduler = NULL; + return 0; +} + + +/* + * Get scheduler in the scheduler list by name + */ +static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + + IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name); + + spin_lock_bh(&ip_vs_sched_lock); + + list_for_each_entry(sched, &ip_vs_schedulers, n_list) { + /* + * Test and get the modules atomically + */ + if (sched->module && !try_module_get(sched->module)) { + /* + * This scheduler is just deleted + */ + continue; + } + if (strcmp(sched_name, sched->name)==0) { + /* HIT */ + spin_unlock_bh(&ip_vs_sched_lock); + return sched; + } + if (sched->module) + module_put(sched->module); + } + + spin_unlock_bh(&ip_vs_sched_lock); + return NULL; +} + + +/* + * Lookup scheduler and try to load it if it doesn't exist + */ +struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + + /* + * Search for the scheduler by sched_name + */ + sched = ip_vs_sched_getbyname(sched_name); + + /* + * If scheduler not found, load the module and search again + */ + if (sched == NULL) { + request_module("ip_vs_%s", sched_name); + sched = ip_vs_sched_getbyname(sched_name); + } + + return sched; +} + +void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) +{ + if (scheduler && scheduler->module) + module_put(scheduler->module); +} + +/* + * Common error output helper for schedulers + */ + +void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg) +{ + if (svc->fwmark) { + IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n", + svc->scheduler->name, svc->fwmark, + svc->fwmark, msg); +#ifdef CONFIG_IP_VS_IPV6 + } else if (svc->af == AF_INET6) { + IP_VS_ERR_RL("%s: %s [%pI6]:%d - %s\n", + svc->scheduler->name, + ip_vs_proto_name(svc->protocol), + &svc->addr.in6, ntohs(svc->port), msg); +#endif + } else { + IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n", + svc->scheduler->name, + ip_vs_proto_name(svc->protocol), + &svc->addr.ip, ntohs(svc->port), msg); + } +} + +/* + * Register a scheduler in the scheduler list + */ +int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) +{ + struct ip_vs_scheduler *sched; + + if (!scheduler) { + pr_err("%s(): NULL arg\n", __func__); + return -EINVAL; + } + + if (!scheduler->name) { + pr_err("%s(): NULL scheduler_name\n", __func__); + return -EINVAL; + } + + /* increase the module use count */ + ip_vs_use_count_inc(); + + spin_lock_bh(&ip_vs_sched_lock); + + if (!list_empty(&scheduler->n_list)) 
{ + spin_unlock_bh(&ip_vs_sched_lock); + ip_vs_use_count_dec(); + pr_err("%s(): [%s] scheduler already linked\n", + __func__, scheduler->name); + return -EINVAL; + } + + /* + * Make sure that the scheduler with this name doesn't exist + * in the scheduler list. + */ + list_for_each_entry(sched, &ip_vs_schedulers, n_list) { + if (strcmp(scheduler->name, sched->name) == 0) { + spin_unlock_bh(&ip_vs_sched_lock); + ip_vs_use_count_dec(); + pr_err("%s(): [%s] scheduler already existed " + "in the system\n", __func__, scheduler->name); + return -EINVAL; + } + } + /* + * Add it into the d-linked scheduler list + */ + list_add(&scheduler->n_list, &ip_vs_schedulers); + spin_unlock_bh(&ip_vs_sched_lock); + + pr_info("[%s] scheduler registered.\n", scheduler->name); + + return 0; +} + + +/* + * Unregister a scheduler from the scheduler list + */ +int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) +{ + if (!scheduler) { + pr_err("%s(): NULL arg\n", __func__); + return -EINVAL; + } + + spin_lock_bh(&ip_vs_sched_lock); + if (list_empty(&scheduler->n_list)) { + spin_unlock_bh(&ip_vs_sched_lock); + pr_err("%s(): [%s] scheduler is not in the list. failed\n", + __func__, scheduler->name); + return -EINVAL; + } + + /* + * Remove it from the d-linked scheduler list + */ + list_del(&scheduler->n_list); + spin_unlock_bh(&ip_vs_sched_lock); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + pr_info("[%s] scheduler unregistered.\n", scheduler->name); + + return 0; +} diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c new file mode 100644 index 00000000..89ead246 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -0,0 +1,141 @@ +/* + * IPVS: Shortest Expected Delay scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The SED algorithm attempts to minimize each job's expected delay until + * completion. The expected delay that the job will experience is + * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of + * jobs on the ith server and Ui is the fixed service rate (weight) of + * the ith server. The SED algorithm adopts a greedy policy that each does + * what is in its own best interest, i.e. to join the queue which would + * minimize its expected delay of completion. + * + * See the following paper for more information: + * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing + * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, + * pages 986-994, 1988. + * + * Thanks must go to Marko Buuri for talking SED to me. + * + * The difference between SED and WLC is that SED includes the incoming + * job in the cost function (the increment of 1). SED may outperform + * WLC, while scheduling big jobs under larger heterogeneous systems + * (the server weight varies a lot). + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include + +#include + + +static inline unsigned int +ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We only use the active connection number in the cost + * calculation here. 
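Since the kernel cannot use floating point, the (Ci + 1) / Ui comparison described above is done by cross-multiplying: a/Wa < b/Wb is tested as a*Wb < b*Wa, which holds as long as all weights are positive. A worked example:

#include <stdio.h>

struct dest {
        const char *name;
        int activeconns;
        int weight;
};

/* SED cost numerator: the incoming job is counted along with the active ones. */
static int sed_overhead(const struct dest *d)
{
        return d->activeconns + 1;
}

/* True if a's expected delay (Ca+1)/Wa is strictly lower than b's. */
static int sed_prefer(const struct dest *a, const struct dest *b)
{
        return sed_overhead(a) * b->weight < sed_overhead(b) * a->weight;
}

int main(void)
{
        struct dest fast = { "fast", 9, 4 };    /* (9+1)/4 = 2.5 */
        struct dest slow = { "slow", 2, 1 };    /* (2+1)/1 = 3.0 */

        printf("prefer %s\n", sed_prefer(&fast, &slow) ? fast.name : slow.name);
        return 0;
}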
+ */ + return atomic_read(&dest->activeconns) + 1; +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least; + unsigned int loh, doh; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + /* + * We calculate the load of each dest server as follows: + * (server expected overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) { + least = dest; + loh = ip_vs_sed_dest_overhead(least); + goto nextstage; + } + } + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + doh = ip_vs_sed_dest_overhead(dest); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG_BUF(6, "SED: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_sed_scheduler = +{ + .name = "sed", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list), + .schedule = ip_vs_sed_schedule, +}; + + +static int __init ip_vs_sed_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_sed_scheduler); +} + +static void __exit ip_vs_sed_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); +} + +module_init(ip_vs_sed_init); +module_exit(ip_vs_sed_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c new file mode 100644 index 00000000..b5e2556c --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -0,0 +1,269 @@ +/* + * IPVS: Source Hashing scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The sh algorithm is to select server by the hash key of source IP + * address. The pseudo code is as follows: + * + * n <- servernode[src_ip]; + * if (n is dead) OR + * (n is overloaded) or (n.weight <= 0) then + * return NULL; + * + * return n; + * + * Notes that servernode is a 256-bucket hash table that maps the hash + * index derived from packet source IP address to the current server + * array. If the sh scheduler is used in cache cluster, it is good to + * combine it with cache_bypass feature. When the statically assigned + * server is dead or overloaded, the load balancer can bypass the cache + * server and send requests to the original server directly. 
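The pseudo code above translates almost line for line into C; the only extra ingredient is the source-address hash that picks the bucket (a trivial stand-in is used here, the real multiplicative hash appears further down):

#include <stdint.h>

/* One slot of the 256-bucket "servernode" table from the pseudo code. */
struct server {
        int dead;               /* roughly !IP_VS_DEST_F_AVAILABLE */
        int overloaded;         /* IP_VS_DEST_F_OVERLOAD */
        int weight;
};

#define SH_TAB_SIZE 256
static struct server *servernode[SH_TAB_SIZE];

/* Trivial stand-in for the real source-address hash shown below. */
static unsigned sh_hash(uint32_t src_ip)
{
        return src_ip % SH_TAB_SIZE;
}

/* Literal translation of the pseudo code: the statically mapped server is
 * returned only if it is alive, not overloaded and has weight > 0. */
static struct server *sh_schedule(uint32_t src_ip)
{
        struct server *n = servernode[sh_hash(src_ip)];

        if (!n || n->dead || n->overloaded || n->weight <= 0)
                return NULL;
        return n;
}

int main(void)
{
        static struct server web = { 0, 0, 1 };

        servernode[sh_hash(0x0a000001)] = &web; /* 10.0.0.1 always lands here */
        return sh_schedule(0x0a000001) == &web ? 0 : 1;
}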
+ * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include + +#include + + +/* + * IPVS SH bucket + */ +struct ip_vs_sh_bucket { + struct ip_vs_dest *dest; /* real server (cache) */ +}; + +/* + * for IPVS SH entry hash table + */ +#ifndef CONFIG_IP_VS_SH_TAB_BITS +#define CONFIG_IP_VS_SH_TAB_BITS 8 +#endif +#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS +#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) +#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) + + +/* + * Returns hash value for IPVS SH entry + */ +static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr) +{ + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK; +} + + +/* + * Get ip_vs_dest associated with supplied parameters. + */ +static inline struct ip_vs_dest * +ip_vs_sh_get(int af, struct ip_vs_sh_bucket *tbl, + const union nf_inet_addr *addr) +{ + return (tbl[ip_vs_sh_hashkey(af, addr)]).dest; +} + + +/* + * Assign all the hash buckets of the specified table with the service. + */ +static int +ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) +{ + int i; + struct ip_vs_sh_bucket *b; + struct list_head *p; + struct ip_vs_dest *dest; + + b = tbl; + p = &svc->destinations; + for (i=0; idest = NULL; + } else { + if (p == &svc->destinations) + p = p->next; + + dest = list_entry(p, struct ip_vs_dest, n_list); + atomic_inc(&dest->refcnt); + b->dest = dest; + + p = p->next; + } + b++; + } + return 0; +} + + +/* + * Flush all the hash buckets of the specified table. + */ +static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) +{ + int i; + struct ip_vs_sh_bucket *b; + + b = tbl; + for (i=0; idest) { + atomic_dec(&b->dest->refcnt); + b->dest = NULL; + } + b++; + } +} + + +static int ip_vs_sh_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl; + + /* allocate the SH table for this service */ + tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, + GFP_ATOMIC); + if (tbl == NULL) { + pr_err("%s(): no memory\n", __func__); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); + + /* assign the hash buckets with the updated service */ + ip_vs_sh_assign(tbl, svc); + + return 0; +} + + +static int ip_vs_sh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_sh_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n", + sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); + + return 0; +} + + +static int ip_vs_sh_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_sh_flush(tbl); + + /* assign the hash buckets with the updated service */ + ip_vs_sh_assign(tbl, svc); + + return 0; +} + + +/* + * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, + * consider that the server is overloaded here. 
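ip_vs_sh_hashkey() above multiplies the folded address by 2654435761, the golden-ratio constant familiar from multiplicative hashing, so that runs of nearby client addresses still scatter across the 256 buckets. A standalone check of how two adjacent addresses land:

#include <stdio.h>
#include <stdint.h>
#include <sys/socket.h>
#include <arpa/inet.h>

#define SH_TAB_BITS 8
#define SH_TAB_SIZE (1 << SH_TAB_BITS)
#define SH_TAB_MASK (SH_TAB_SIZE - 1)

/* Same recipe as ip_vs_sh_hashkey(): multiply the host-order address by the
 * golden-ratio constant and keep the low SH_TAB_BITS bits of the product. */
static unsigned sh_hashkey(uint32_t addr)
{
        return (unsigned)((addr * 2654435761UL) & SH_TAB_MASK);
}

int main(void)
{
        struct in_addr a, b;

        inet_pton(AF_INET, "192.168.1.10", &a);
        inet_pton(AF_INET, "192.168.1.11", &b);
        printf("192.168.1.10 -> bucket %u, 192.168.1.11 -> bucket %u (of %d)\n",
               sh_hashkey(ntohl(a.s_addr)), sh_hashkey(ntohl(b.s_addr)),
               SH_TAB_SIZE);
        return 0;
}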
+ */ +static inline int is_overloaded(struct ip_vs_dest *dest) +{ + return dest->flags & IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Source Hashing scheduling + */ +static struct ip_vs_dest * +ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_sh_bucket *tbl; + struct ip_vs_iphdr iph; + + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + + IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_sh_bucket *)svc->sched_data; + dest = ip_vs_sh_get(svc->af, tbl, &iph.saddr); + if (!dest + || !(dest->flags & IP_VS_DEST_F_AVAILABLE) + || atomic_read(&dest->weight) <= 0 + || is_overloaded(dest)) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n", + IP_VS_DBG_ADDR(svc->af, &iph.saddr), + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS SH Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_sh_scheduler = +{ + .name = "sh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list), + .init_service = ip_vs_sh_init_svc, + .done_service = ip_vs_sh_done_svc, + .update_service = ip_vs_sh_update_svc, + .schedule = ip_vs_sh_schedule, +}; + + +static int __init ip_vs_sh_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_sh_scheduler); +} + + +static void __exit ip_vs_sh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); +} + + +module_init(ip_vs_sh_init); +module_exit(ip_vs_sh_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c new file mode 100644 index 00000000..e292e5bd --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -0,0 +1,1700 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the NetFilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Version 1, is capable of handling both version 0 and 1 messages. + * Version 0 is the plain old format. + * Note Version 0 receivers will just drop Ver 1 messages. + * Version 1 is capable of handle IPv6, Persistence data, + * time-outs, and firewall marks. + * In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order. + * Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0 + * + * Definitions Message: is a complete datagram + * Sync_conn: is a part of a Message + * Param Data is an option to a Sync_conn. + * + * Authors: Wensong Zhang + * + * ip_vs_sync: sync connection info from master load balancer to backups + * through multicast + * + * Changes: + * Alexandre Cassen : Added master & backup support at a time. + * Alexandre Cassen : Added SyncID support for incoming sync + * messages filtering. + * Justin Ossevoort : Fix endian problem on sync message size. + * Hans Schillstrom : Added Version 1: i.e. IPv6, + * Persistence support, fwmark and time-out. 
+ */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for ip_mc_join_group */ +#include +#include +#include +#include +#include + +#include /* Used for ntoh_seq and hton_seq */ + +#include +#include + +#include + +#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ +#define IP_VS_SYNC_PORT 8848 /* multicast port */ + +#define SYNC_PROTO_VER 1 /* Protocol version in header */ + +/* + * IPVS sync connection entry + * Version 0, i.e. original version. + */ +struct ip_vs_sync_conn_v0 { + __u8 reserved; + + /* Protocol, addresses and port numbers */ + __u8 protocol; /* Which protocol (TCP/UDP) */ + __be16 cport; + __be16 vport; + __be16 dport; + __be32 caddr; /* client address */ + __be32 vaddr; /* virtual address */ + __be32 daddr; /* destination address */ + + /* Flags and state transition */ + __be16 flags; /* status flags */ + __be16 state; /* state info */ + + /* The sequence options start here */ +}; + +struct ip_vs_sync_conn_options { + struct ip_vs_seq in_seq; /* incoming seq. struct */ + struct ip_vs_seq out_seq; /* outgoing seq. struct */ +}; + +/* + Sync Connection format (sync_conn) + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Type | Protocol | Ver. | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Flags | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | State | cport | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | vport | dport | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | fwmark | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | timeout (in sec.) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | ... | + | IP-Addresses (v4 or v6) | + | ... | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + Optional Parameters. + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Param. Type | Param. Length | Param. data | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | + | ... | + | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | Param Type | Param. Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Param data | + | Last Param data should be padded for 32 bit alignment | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +*/ + +/* + * Type 0, IPv4 sync connection format + */ +struct ip_vs_sync_v4 { + __u8 type; + __u8 protocol; /* Which protocol (TCP/UDP) */ + __be16 ver_size; /* Version msb 4 bits */ + /* Flags and state transition */ + __be32 flags; /* status flags */ + __be16 state; /* state info */ + /* Protocol, addresses and port numbers */ + __be16 cport; + __be16 vport; + __be16 dport; + __be32 fwmark; /* Firewall mark from skb */ + __be32 timeout; /* cp timeout */ + __be32 caddr; /* client address */ + __be32 vaddr; /* virtual address */ + __be32 daddr; /* destination address */ + /* The sequence options start here */ + /* PE data padded to 32bit alignment after seq. 
options */ +}; +/* + * Type 2 messages IPv6 + */ +struct ip_vs_sync_v6 { + __u8 type; + __u8 protocol; /* Which protocol (TCP/UDP) */ + __be16 ver_size; /* Version msb 4 bits */ + /* Flags and state transition */ + __be32 flags; /* status flags */ + __be16 state; /* state info */ + /* Protocol, addresses and port numbers */ + __be16 cport; + __be16 vport; + __be16 dport; + __be32 fwmark; /* Firewall mark from skb */ + __be32 timeout; /* cp timeout */ + struct in6_addr caddr; /* client address */ + struct in6_addr vaddr; /* virtual address */ + struct in6_addr daddr; /* destination address */ + /* The sequence options start here */ + /* PE data padded to 32bit alignment after seq. options */ +}; + +union ip_vs_sync_conn { + struct ip_vs_sync_v4 v4; + struct ip_vs_sync_v6 v6; +}; + +/* Bits in Type field in above */ +#define STYPE_INET6 0 +#define STYPE_F_INET6 (1 << STYPE_INET6) + +#define SVER_SHIFT 12 /* Shift to get version */ +#define SVER_MASK 0x0fff /* Mask to strip version */ + +#define IPVS_OPT_SEQ_DATA 1 +#define IPVS_OPT_PE_DATA 2 +#define IPVS_OPT_PE_NAME 3 +#define IPVS_OPT_PARAM 7 + +#define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1)) +#define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1)) +#define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1)) +#define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1)) + +struct ip_vs_sync_thread_data { + struct net *net; + struct socket *sock; + char *buf; +}; + +/* Version 0 definition of packet sizes */ +#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0)) +#define FULL_CONN_SIZE \ +(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options)) + + +/* + The master mulitcasts messages (Datagrams) to the backup load balancers + in the following format. + + Version 1: + Note, first byte should be Zero, so ver 0 receivers will drop the packet. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | 0 | SyncID | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Count Conns | Version | Reserved, set to Zero | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + | IPVS Sync Connection (1) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | . | + ~ . ~ + | . 
| + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + | IPVS Sync Connection (n) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Version 0 Header + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Count Conns | SyncID | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | IPVS Sync Connection (1) | +*/ + +#define SYNC_MESG_HEADER_LEN 4 +#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */ + +/* Version 0 header */ +struct ip_vs_sync_mesg_v0 { + __u8 nr_conns; + __u8 syncid; + __u16 size; + + /* ip_vs_sync_conn entries start here */ +}; + +/* Version 1 header */ +struct ip_vs_sync_mesg { + __u8 reserved; /* must be zero */ + __u8 syncid; + __u16 size; + __u8 nr_conns; + __s8 version; /* SYNC_PROTO_VER */ + __u16 spare; + /* ip_vs_sync_conn entries start here */ +}; + +struct ip_vs_sync_buff { + struct list_head list; + unsigned long firstuse; + + /* pointers for the message data */ + struct ip_vs_sync_mesg *mesg; + unsigned char *head; + unsigned char *end; +}; + +/* multicast addr */ +static struct sockaddr_in mcast_addr = { + .sin_family = AF_INET, + .sin_port = cpu_to_be16(IP_VS_SYNC_PORT), + .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), +}; + +/* + * Copy of struct ip_vs_seq + * From unaligned network order to aligned host order + */ +static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho) +{ + ho->init_seq = get_unaligned_be32(&no->init_seq); + ho->delta = get_unaligned_be32(&no->delta); + ho->previous_delta = get_unaligned_be32(&no->previous_delta); +} + +/* + * Copy of struct ip_vs_seq + * From Aligned host order to unaligned network order + */ +static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no) +{ + put_unaligned_be32(ho->init_seq, &no->init_seq); + put_unaligned_be32(ho->delta, &no->delta); + put_unaligned_be32(ho->previous_delta, &no->previous_delta); +} + +static inline struct ip_vs_sync_buff *sb_dequeue(struct netns_ipvs *ipvs) +{ + struct ip_vs_sync_buff *sb; + + spin_lock_bh(&ipvs->sync_lock); + if (list_empty(&ipvs->sync_queue)) { + sb = NULL; + } else { + sb = list_entry(ipvs->sync_queue.next, + struct ip_vs_sync_buff, + list); + list_del(&sb->list); + } + spin_unlock_bh(&ipvs->sync_lock); + + return sb; +} + +/* + * Create a new sync buffer for Version 1 proto. + */ +static inline struct ip_vs_sync_buff * +ip_vs_sync_buff_create(struct netns_ipvs *ipvs) +{ + struct ip_vs_sync_buff *sb; + + if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) + return NULL; + + sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); + if (!sb->mesg) { + kfree(sb); + return NULL; + } + sb->mesg->reserved = 0; /* old nr_conns i.e. 
must be zeo now */ + sb->mesg->version = SYNC_PROTO_VER; + sb->mesg->syncid = ipvs->master_syncid; + sb->mesg->size = sizeof(struct ip_vs_sync_mesg); + sb->mesg->nr_conns = 0; + sb->mesg->spare = 0; + sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); + sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen; + + sb->firstuse = jiffies; + return sb; +} + +static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) +{ + kfree(sb->mesg); + kfree(sb); +} + +static inline void sb_queue_tail(struct netns_ipvs *ipvs) +{ + struct ip_vs_sync_buff *sb = ipvs->sync_buff; + + spin_lock(&ipvs->sync_lock); + if (ipvs->sync_state & IP_VS_STATE_MASTER) + list_add_tail(&sb->list, &ipvs->sync_queue); + else + ip_vs_sync_buff_release(sb); + spin_unlock(&ipvs->sync_lock); +} + +/* + * Get the current sync buffer if it has been created for more + * than the specified time or the specified time is zero. + */ +static inline struct ip_vs_sync_buff * +get_curr_sync_buff(struct netns_ipvs *ipvs, unsigned long time) +{ + struct ip_vs_sync_buff *sb; + + spin_lock_bh(&ipvs->sync_buff_lock); + if (ipvs->sync_buff && + time_after_eq(jiffies - ipvs->sync_buff->firstuse, time)) { + sb = ipvs->sync_buff; + ipvs->sync_buff = NULL; + } else + sb = NULL; + spin_unlock_bh(&ipvs->sync_buff_lock); + return sb; +} + +/* + * Switch mode from sending version 0 or 1 + * - must handle sync_buf + */ +void ip_vs_sync_switch_mode(struct net *net, int mode) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) + return; + if (mode == sysctl_sync_ver(ipvs) || !ipvs->sync_buff) + return; + + spin_lock_bh(&ipvs->sync_buff_lock); + /* Buffer empty ? then let buf_create do the job */ + if (ipvs->sync_buff->mesg->size <= sizeof(struct ip_vs_sync_mesg)) { + kfree(ipvs->sync_buff); + ipvs->sync_buff = NULL; + } else { + spin_lock_bh(&ipvs->sync_lock); + if (ipvs->sync_state & IP_VS_STATE_MASTER) + list_add_tail(&ipvs->sync_buff->list, + &ipvs->sync_queue); + else + ip_vs_sync_buff_release(ipvs->sync_buff); + spin_unlock_bh(&ipvs->sync_lock); + } + spin_unlock_bh(&ipvs->sync_buff_lock); +} + +/* + * Create a new sync buffer for Version 0 proto. + */ +static inline struct ip_vs_sync_buff * +ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) +{ + struct ip_vs_sync_buff *sb; + struct ip_vs_sync_mesg_v0 *mesg; + + if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) + return NULL; + + sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); + if (!sb->mesg) { + kfree(sb); + return NULL; + } + mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; + mesg->nr_conns = 0; + mesg->syncid = ipvs->master_syncid; + mesg->size = sizeof(struct ip_vs_sync_mesg_v0); + sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); + sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen; + sb->firstuse = jiffies; + return sb; +} + +/* + * Version 0 , could be switched in by sys_ctl. + * Add an ip_vs_conn information into the current sync_buff. 
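The sync buffers created above are plain byte arrays with head/end bookkeeping: records are appended at head until the next fixed-size entry would no longer fit, at which point the buffer is queued for the master thread and a fresh one is allocated. A compressed sketch, with the record size fixed at 24 bytes for illustration (the natural layout of the version 0 entry):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define RECORD_SIZE 24          /* one fixed-size v0-style entry, for the sketch */

struct sync_buff {
        unsigned char *mesg;    /* start of the datagram being built */
        unsigned char *head;    /* next free byte */
        unsigned char *end;     /* one past the usable area */
};

static struct sync_buff *buff_create(size_t maxlen)
{
        struct sync_buff *sb = malloc(sizeof(*sb));

        if (!sb || !(sb->mesg = malloc(maxlen))) {
                free(sb);
                return NULL;
        }
        sb->head = sb->mesg;
        sb->end = sb->mesg + maxlen;
        return sb;
}

/* Append one record; returns 1 when the buffer should be queued/flushed
 * because the next record would no longer fit. */
static int buff_append(struct sync_buff *sb, const void *rec)
{
        memcpy(sb->head, rec, RECORD_SIZE);
        sb->head += RECORD_SIZE;
        return sb->head + RECORD_SIZE > sb->end;
}

int main(void)
{
        unsigned char rec[RECORD_SIZE] = { 0 };
        struct sync_buff *sb = buff_create(4 * RECORD_SIZE);
        int n = 1;

        if (!sb)
                return 1;
        while (!buff_append(sb, rec))
                n++;
        printf("buffer full after %d records\n", n);
        free(sb->mesg);
        free(sb);
        return 0;
}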
+ */ +void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_sync_mesg_v0 *m; + struct ip_vs_sync_conn_v0 *s; + int len; + + if (unlikely(cp->af != AF_INET)) + return; + /* Do not sync ONE PACKET */ + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + return; + + spin_lock(&ipvs->sync_buff_lock); + if (!ipvs->sync_buff) { + ipvs->sync_buff = + ip_vs_sync_buff_create_v0(ipvs); + if (!ipvs->sync_buff) { + spin_unlock(&ipvs->sync_buff_lock); + pr_err("ip_vs_sync_buff_create failed.\n"); + return; + } + } + + len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : + SIMPLE_CONN_SIZE; + m = (struct ip_vs_sync_mesg_v0 *)ipvs->sync_buff->mesg; + s = (struct ip_vs_sync_conn_v0 *)ipvs->sync_buff->head; + + /* copy members */ + s->reserved = 0; + s->protocol = cp->protocol; + s->cport = cp->cport; + s->vport = cp->vport; + s->dport = cp->dport; + s->caddr = cp->caddr.ip; + s->vaddr = cp->vaddr.ip; + s->daddr = cp->daddr.ip; + s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); + s->state = htons(cp->state); + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { + struct ip_vs_sync_conn_options *opt = + (struct ip_vs_sync_conn_options *)&s[1]; + memcpy(opt, &cp->in_seq, sizeof(*opt)); + } + + m->nr_conns++; + m->size += len; + ipvs->sync_buff->head += len; + + /* check if there is a space for next one */ + if (ipvs->sync_buff->head + FULL_CONN_SIZE > ipvs->sync_buff->end) { + sb_queue_tail(ipvs); + ipvs->sync_buff = NULL; + } + spin_unlock(&ipvs->sync_buff_lock); + + /* synchronize its controller if it has */ + if (cp->control) + ip_vs_sync_conn(net, cp->control); +} + +/* + * Add an ip_vs_conn information into the current sync_buff. + * Called by ip_vs_in. + * Sending Version 1 messages + */ +void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_sync_mesg *m; + union ip_vs_sync_conn *s; + __u8 *p; + unsigned int len, pe_name_len, pad; + + /* Handle old version of the protocol */ + if (sysctl_sync_ver(ipvs) == 0) { + ip_vs_sync_conn_v0(net, cp); + return; + } + /* Do not sync ONE PACKET */ + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + goto control; +sloop: + /* Sanity checks */ + pe_name_len = 0; + if (cp->pe_data_len) { + if (!cp->pe_data || !cp->dest) { + IP_VS_ERR_RL("SYNC, connection pe_data invalid\n"); + return; + } + pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN); + } + + spin_lock(&ipvs->sync_buff_lock); + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + len = sizeof(struct ip_vs_sync_v6); + else +#endif + len = sizeof(struct ip_vs_sync_v4); + + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) + len += sizeof(struct ip_vs_sync_conn_options) + 2; + + if (cp->pe_data_len) + len += cp->pe_data_len + 2; /* + Param hdr field */ + if (pe_name_len) + len += pe_name_len + 2; + + /* check if there is a space for this one */ + pad = 0; + if (ipvs->sync_buff) { + pad = (4 - (size_t)ipvs->sync_buff->head) & 3; + if (ipvs->sync_buff->head + len + pad > ipvs->sync_buff->end) { + sb_queue_tail(ipvs); + ipvs->sync_buff = NULL; + pad = 0; + } + } + + if (!ipvs->sync_buff) { + ipvs->sync_buff = ip_vs_sync_buff_create(ipvs); + if (!ipvs->sync_buff) { + spin_unlock(&ipvs->sync_buff_lock); + pr_err("ip_vs_sync_buff_create failed.\n"); + return; + } + } + + m = ipvs->sync_buff->mesg; + p = ipvs->sync_buff->head; + ipvs->sync_buff->head += pad + len; + m->size += pad + len; + /* Add ev. padding from prev. 
sync_conn */ + while (pad--) + *(p++) = 0; + + s = (union ip_vs_sync_conn *)p; + + /* Set message type & copy members */ + s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0); + s->v4.ver_size = htons(len & SVER_MASK); /* Version 0 */ + s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED); + s->v4.state = htons(cp->state); + s->v4.protocol = cp->protocol; + s->v4.cport = cp->cport; + s->v4.vport = cp->vport; + s->v4.dport = cp->dport; + s->v4.fwmark = htonl(cp->fwmark); + s->v4.timeout = htonl(cp->timeout / HZ); + m->nr_conns++; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) { + p += sizeof(struct ip_vs_sync_v6); + ipv6_addr_copy(&s->v6.caddr, &cp->caddr.in6); + ipv6_addr_copy(&s->v6.vaddr, &cp->vaddr.in6); + ipv6_addr_copy(&s->v6.daddr, &cp->daddr.in6); + } else +#endif + { + p += sizeof(struct ip_vs_sync_v4); /* options ptr */ + s->v4.caddr = cp->caddr.ip; + s->v4.vaddr = cp->vaddr.ip; + s->v4.daddr = cp->daddr.ip; + } + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { + *(p++) = IPVS_OPT_SEQ_DATA; + *(p++) = sizeof(struct ip_vs_sync_conn_options); + hton_seq((struct ip_vs_seq *)p, &cp->in_seq); + p += sizeof(struct ip_vs_seq); + hton_seq((struct ip_vs_seq *)p, &cp->out_seq); + p += sizeof(struct ip_vs_seq); + } + /* Handle pe data */ + if (cp->pe_data_len && cp->pe_data) { + *(p++) = IPVS_OPT_PE_DATA; + *(p++) = cp->pe_data_len; + memcpy(p, cp->pe_data, cp->pe_data_len); + p += cp->pe_data_len; + if (pe_name_len) { + /* Add PE_NAME */ + *(p++) = IPVS_OPT_PE_NAME; + *(p++) = pe_name_len; + memcpy(p, cp->pe->name, pe_name_len); + p += pe_name_len; + } + } + + spin_unlock(&ipvs->sync_buff_lock); + +control: + /* synchronize its controller if it has */ + cp = cp->control; + if (!cp) + return; + /* + * Reduce sync rate for templates + * i.e only increment in_pkts for Templates. + */ + if (cp->flags & IP_VS_CONN_F_TEMPLATE) { + int pkts = atomic_add_return(1, &cp->in_pkts); + + if (pkts % sysctl_sync_period(ipvs) != 1) + return; + } + goto sloop; +} + +/* + * fill_param used by version 1 + */ +static inline int +ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc, + struct ip_vs_conn_param *p, + __u8 *pe_data, unsigned int pe_data_len, + __u8 *pe_name, unsigned int pe_name_len) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_conn_fill_param(net, af, sc->v6.protocol, + (const union nf_inet_addr *)&sc->v6.caddr, + sc->v6.cport, + (const union nf_inet_addr *)&sc->v6.vaddr, + sc->v6.vport, p); + else +#endif + ip_vs_conn_fill_param(net, af, sc->v4.protocol, + (const union nf_inet_addr *)&sc->v4.caddr, + sc->v4.cport, + (const union nf_inet_addr *)&sc->v4.vaddr, + sc->v4.vport, p); + /* Handle pe data */ + if (pe_data_len) { + if (pe_name_len) { + char buff[IP_VS_PENAME_MAXLEN+1]; + + memcpy(buff, pe_name, pe_name_len); + buff[pe_name_len]=0; + p->pe = __ip_vs_pe_getbyname(buff); + if (!p->pe) { + IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", + buff); + return 1; + } + } else { + IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n"); + return 1; + } + + p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC); + if (!p->pe_data) { + if (p->pe->module) + module_put(p->pe->module); + return -ENOMEM; + } + p->pe_data_len = pe_data_len; + } + return 0; +} + +/* + * Connection Add / Update. + * Common for version 0 and 1 reception of backup sync_conns. + * Param: ... + * timeout is in sec. 
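The optional parameters appended to a Version 1 entry above are one-byte-type / one-byte-length TLVs (IPVS_OPT_SEQ_DATA, IPVS_OPT_PE_DATA, IPVS_OPT_PE_NAME), and the whole sync_conn is then padded up to 32-bit alignment. A minimal encoder sketch, using a hypothetical PE name:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define IPVS_OPT_PE_NAME 3      /* from the option types defined above */

/* Append one type/length/value option and return the new write position. */
static uint8_t *tlv_put(uint8_t *p, uint8_t type, const void *data, uint8_t len)
{
        *p++ = type;
        *p++ = len;
        memcpy(p, data, len);
        return p + len;
}

int main(void)
{
        uint8_t buf[64], *p = buf;
        const char pe_name[] = "sip";           /* hypothetical PE name */
        size_t used, padded;

        p = tlv_put(p, IPVS_OPT_PE_NAME, pe_name, sizeof(pe_name) - 1);
        used = (size_t)(p - buf);
        padded = (used + 3) & ~(size_t)3;       /* pad up to 32-bit alignment */
        printf("%zu option bytes, %zu after padding\n", used, padded);
        return 0;
}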
+ */ +static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, + unsigned int flags, unsigned int state, + unsigned int protocol, unsigned int type, + const union nf_inet_addr *daddr, __be16 dport, + unsigned long timeout, __u32 fwmark, + struct ip_vs_sync_conn_options *opt) +{ + struct ip_vs_dest *dest; + struct ip_vs_conn *cp; + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!(flags & IP_VS_CONN_F_TEMPLATE)) + cp = ip_vs_conn_in_get(param); + else + cp = ip_vs_ct_in_get(param); + + if (cp && param->pe_data) /* Free pe_data */ + kfree(param->pe_data); + if (!cp) { + /* + * Find the appropriate destination for the connection. + * If it is not found the connection will remain unbound + * but still handled. + */ + dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr, + param->vport, protocol, fwmark); + + /* Set the approprite ativity flag */ + if (protocol == IPPROTO_TCP) { + if (state != IP_VS_TCP_S_ESTABLISHED) + flags |= IP_VS_CONN_F_INACTIVE; + else + flags &= ~IP_VS_CONN_F_INACTIVE; + } else if (protocol == IPPROTO_SCTP) { + if (state != IP_VS_SCTP_S_ESTABLISHED) + flags |= IP_VS_CONN_F_INACTIVE; + else + flags &= ~IP_VS_CONN_F_INACTIVE; + } + cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark); + if (dest) + atomic_dec(&dest->refcnt); + if (!cp) { + if (param->pe_data) + kfree(param->pe_data); + IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); + return; + } + } else if (!cp->dest) { + dest = ip_vs_try_bind_dest(cp); + if (dest) + atomic_dec(&dest->refcnt); + } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && + (cp->state != state)) { + /* update active/inactive flag for the connection */ + dest = cp->dest; + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (state != IP_VS_TCP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (state == IP_VS_TCP_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) && + (cp->state != state)) { + dest = cp->dest; + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (state != IP_VS_SCTP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } + + if (opt) + memcpy(&cp->in_seq, opt, sizeof(*opt)); + atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs)); + cp->state = state; + cp->old_state = cp->state; + /* + * For Ver 0 messages style + * - Not possible to recover the right timeout for templates + * - can not find the right fwmark + * virtual service. If needed, we can do it for + * non-fwmark persistent services. + * Ver 1 messages style. + * - No problem. 
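A compressed rendering of the timeout recovery that follows: Version 1 entries carry the remaining timeout in seconds, which is clamped and converted to jiffies, while Version 0 entries send zero and fall back to the per-protocol timeout table or a three-minute default. The template check is omitted here, and HZ and the clamp bound are assumed values, not the kernel's.

#include <stdio.h>

#define HZ 250                                  /* assumed tick rate */
#define MAX_TIMEOUT_SEC (0x7fffffffL / HZ)      /* stand-in for MAX_SCHEDULE_TIMEOUT/HZ */

/* Version 1 sends the remaining timeout in seconds; Version 0 sends zero
 * and the per-protocol timeout table (or a 3 minute default) is used. */
static long recover_timeout(unsigned long sec, const int *timeout_table, int state)
{
        if (sec) {
                if (sec > MAX_TIMEOUT_SEC)
                        sec = MAX_TIMEOUT_SEC;
                return (long)(sec * HZ);
        }
        return timeout_table ? timeout_table[state] : 3 * 60 * HZ;
}

int main(void)
{
        static const int tcp_timeouts[] = { 15 * 60 * HZ };    /* one toy state */

        printf("v1, 120s remaining -> %ld jiffies\n", recover_timeout(120, NULL, 0));
        printf("v0, from the table -> %ld jiffies\n", recover_timeout(0, tcp_timeouts, 0));
        return 0;
}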
+ */ + if (timeout) { + if (timeout > MAX_SCHEDULE_TIMEOUT / HZ) + timeout = MAX_SCHEDULE_TIMEOUT / HZ; + cp->timeout = timeout*HZ; + } else { + struct ip_vs_proto_data *pd; + + pd = ip_vs_proto_data_get(net, protocol); + if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table) + cp->timeout = pd->timeout_table[state]; + else + cp->timeout = (3*60*HZ); + } + ip_vs_conn_put(cp); +} + +/* + * Process received multicast message for Version 0 + */ +static void ip_vs_process_message_v0(struct net *net, const char *buffer, + const size_t buflen) +{ + struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer; + struct ip_vs_sync_conn_v0 *s; + struct ip_vs_sync_conn_options *opt; + struct ip_vs_protocol *pp; + struct ip_vs_conn_param param; + char *p; + int i; + + p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0); + for (i=0; inr_conns; i++) { + unsigned flags, state; + + if (p + SIMPLE_CONN_SIZE > buffer+buflen) { + IP_VS_ERR_RL("BACKUP v0, bogus conn\n"); + return; + } + s = (struct ip_vs_sync_conn_v0 *) p; + flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; + flags &= ~IP_VS_CONN_F_HASHED; + if (flags & IP_VS_CONN_F_SEQ_MASK) { + opt = (struct ip_vs_sync_conn_options *)&s[1]; + p += FULL_CONN_SIZE; + if (p > buffer+buflen) { + IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n"); + return; + } + } else { + opt = NULL; + p += SIMPLE_CONN_SIZE; + } + + state = ntohs(s->state); + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { + pp = ip_vs_proto_get(s->protocol); + if (!pp) { + IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n", + s->protocol); + continue; + } + if (state >= pp->num_states) { + IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n", + pp->name, state); + continue; + } + } else { + /* protocol in templates is not used for state/timeout */ + if (state > 0) { + IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n", + state); + state = 0; + } + } + + ip_vs_conn_fill_param(net, AF_INET, s->protocol, + (const union nf_inet_addr *)&s->caddr, + s->cport, + (const union nf_inet_addr *)&s->vaddr, + s->vport, ¶m); + + /* Send timeout as Zero */ + ip_vs_proc_conn(net, ¶m, flags, state, s->protocol, AF_INET, + (union nf_inet_addr *)&s->daddr, s->dport, + 0, 0, opt); + } +} + +/* + * Handle options + */ +static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen, + __u32 *opt_flags, + struct ip_vs_sync_conn_options *opt) +{ + struct ip_vs_sync_conn_options *topt; + + topt = (struct ip_vs_sync_conn_options *)p; + + if (plen != sizeof(struct ip_vs_sync_conn_options)) { + IP_VS_DBG(2, "BACKUP, bogus conn options length\n"); + return -EINVAL; + } + if (*opt_flags & IPVS_OPT_F_SEQ_DATA) { + IP_VS_DBG(2, "BACKUP, conn options found twice\n"); + return -EINVAL; + } + ntoh_seq(&topt->in_seq, &opt->in_seq); + ntoh_seq(&topt->out_seq, &opt->out_seq); + *opt_flags |= IPVS_OPT_F_SEQ_DATA; + return 0; +} + +static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len, + __u8 **data, unsigned int maxlen, + __u32 *opt_flags, __u32 flag) +{ + if (plen > maxlen) { + IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen); + return -EINVAL; + } + if (*opt_flags & flag) { + IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag); + return -EINVAL; + } + *data_len = plen; + *data = p; + *opt_flags |= flag; + return 0; +} +/* + * Process a Version 1 sync. 
connection + */ +static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end) +{ + struct ip_vs_sync_conn_options opt; + union ip_vs_sync_conn *s; + struct ip_vs_protocol *pp; + struct ip_vs_conn_param param; + __u32 flags; + unsigned int af, state, pe_data_len=0, pe_name_len=0; + __u8 *pe_data=NULL, *pe_name=NULL; + __u32 opt_flags=0; + int retc=0; + + s = (union ip_vs_sync_conn *) p; + + if (s->v6.type & STYPE_F_INET6) { +#ifdef CONFIG_IP_VS_IPV6 + af = AF_INET6; + p += sizeof(struct ip_vs_sync_v6); +#else + IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n"); + retc = 10; + goto out; +#endif + } else if (!s->v4.type) { + af = AF_INET; + p += sizeof(struct ip_vs_sync_v4); + } else { + return -10; + } + if (p > msg_end) + return -20; + + /* Process optional params check Type & Len. */ + while (p < msg_end) { + int ptype; + int plen; + + if (p+2 > msg_end) + return -30; + ptype = *(p++); + plen = *(p++); + + if (!plen || ((p + plen) > msg_end)) + return -40; + /* Handle seq option p = param data */ + switch (ptype & ~IPVS_OPT_F_PARAM) { + case IPVS_OPT_SEQ_DATA: + if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt)) + return -50; + break; + + case IPVS_OPT_PE_DATA: + if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data, + IP_VS_PEDATA_MAXLEN, &opt_flags, + IPVS_OPT_F_PE_DATA)) + return -60; + break; + + case IPVS_OPT_PE_NAME: + if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name, + IP_VS_PENAME_MAXLEN, &opt_flags, + IPVS_OPT_F_PE_NAME)) + return -70; + break; + + default: + /* Param data mandatory ? */ + if (!(ptype & IPVS_OPT_F_PARAM)) { + IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n", + ptype & ~IPVS_OPT_F_PARAM); + retc = 20; + goto out; + } + } + p += plen; /* Next option */ + } + + /* Get flags and Mask off unsupported */ + flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK; + flags |= IP_VS_CONN_F_SYNC; + state = ntohs(s->v4.state); + + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { + pp = ip_vs_proto_get(s->v4.protocol); + if (!pp) { + IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n", + s->v4.protocol); + retc = 30; + goto out; + } + if (state >= pp->num_states) { + IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n", + pp->name, state); + retc = 40; + goto out; + } + } else { + /* protocol in templates is not used for state/timeout */ + if (state > 0) { + IP_VS_DBG(3, "BACKUP, Invalid template state %u\n", + state); + state = 0; + } + } + if (ip_vs_conn_fill_param_sync(net, af, s, ¶m, pe_data, + pe_data_len, pe_name, pe_name_len)) { + retc = 50; + goto out; + } + /* If only IPv4, just silent skip IPv6 */ + if (af == AF_INET) + ip_vs_proc_conn(net, ¶m, flags, state, s->v4.protocol, af, + (union nf_inet_addr *)&s->v4.daddr, s->v4.dport, + ntohl(s->v4.timeout), ntohl(s->v4.fwmark), + (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) + ); +#ifdef CONFIG_IP_VS_IPV6 + else + ip_vs_proc_conn(net, ¶m, flags, state, s->v6.protocol, af, + (union nf_inet_addr *)&s->v6.daddr, s->v6.dport, + ntohl(s->v6.timeout), ntohl(s->v6.fwmark), + (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) + ); +#endif + return 0; + /* Error exit */ +out: + IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc); + return retc; + +} +/* + * Process received multicast message and create the corresponding + * ip_vs_conn entries. 
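On the receive side, ip_vs_proc_sync_conn() above trusts nothing in the datagram: every type/length pair is checked against the end of the message before its payload is read. A small standalone version of that defensive TLV walk:

#include <stdio.h>
#include <stdint.h>

/* Walk a [type][length][payload] option area, refusing to read anything
 * that is not fully inside the message. Returns -1 on a malformed option. */
static int walk_options(const uint8_t *p, const uint8_t *msg_end)
{
        while (p < msg_end) {
                uint8_t type, len;

                if (msg_end - p < 2)            /* need room for type + length */
                        return -1;
                type = *p++;
                len = *p++;
                if (!len || msg_end - p < len)  /* payload must fit entirely */
                        return -1;
                printf("option %u, %u bytes\n", (unsigned)type, (unsigned)len);
                p += len;
        }
        return 0;
}

int main(void)
{
        const uint8_t ok[]  = { 3, 3, 's', 'i', 'p' };
        const uint8_t bad[] = { 3, 200, 's' };  /* claims more data than is there */

        printf("ok:  %d\n", walk_options(ok, ok + sizeof(ok)));
        printf("bad: %d\n", walk_options(bad, bad + sizeof(bad)));
        return 0;
}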
+ * Handles Version 0 & 1 + */ +static void ip_vs_process_message(struct net *net, __u8 *buffer, + const size_t buflen) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer; + __u8 *p, *msg_end; + int i, nr_conns; + + if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) { + IP_VS_DBG(2, "BACKUP, message header too short\n"); + return; + } + /* Convert size back to host byte order */ + m2->size = ntohs(m2->size); + + if (buflen != m2->size) { + IP_VS_DBG(2, "BACKUP, bogus message size\n"); + return; + } + /* SyncID sanity check */ + if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) { + IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid); + return; + } + /* Handle version 1 message */ + if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0) + && (m2->spare == 0)) { + + msg_end = buffer + sizeof(struct ip_vs_sync_mesg); + nr_conns = m2->nr_conns; + + for (i=0; iv4) > buffer+buflen) { + IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n"); + return; + } + s = (union ip_vs_sync_conn *)p; + size = ntohs(s->v4.ver_size) & SVER_MASK; + msg_end = p + size; + /* Basic sanity checks */ + if (msg_end > buffer+buflen) { + IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n"); + return; + } + if (ntohs(s->v4.ver_size) >> SVER_SHIFT) { + IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n", + ntohs(s->v4.ver_size) >> SVER_SHIFT); + return; + } + /* Process a single sync_conn */ + retc = ip_vs_proc_sync_conn(net, p, msg_end); + if (retc < 0) { + IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n", + retc); + return; + } + /* Make sure we have 32 bit alignment */ + msg_end = p + ((size + 3) & ~3); + } + } else { + /* Old type of message */ + ip_vs_process_message_v0(net, buffer, buflen); + return; + } +} + + +/* + * Setup loopback of outgoing multicasts on a sending socket + */ +static void set_mcast_loop(struct sock *sk, u_char loop) +{ + struct inet_sock *inet = inet_sk(sk); + + /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ + lock_sock(sk); + inet->mc_loop = loop ? 1 : 0; + release_sock(sk); +} + +/* + * Specify TTL for outgoing multicasts on a sending socket + */ +static void set_mcast_ttl(struct sock *sk, u_char ttl) +{ + struct inet_sock *inet = inet_sk(sk); + + /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ + lock_sock(sk); + inet->mc_ttl = ttl; + release_sock(sk); +} + +/* + * Specifiy default interface for outgoing multicasts + */ +static int set_mcast_if(struct sock *sk, char *ifname) +{ + struct net_device *dev; + struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); + + dev = __dev_get_by_name(net, ifname); + if (!dev) + return -ENODEV; + + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; + + lock_sock(sk); + inet->mc_index = dev->ifindex; + /* inet->mc_addr = 0; */ + release_sock(sk); + + return 0; +} + + +/* + * Set the maximum length of sync message according to the + * specified interface's MTU. 
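/*
 * Illustrative userspace sketch (not part of this patch): the helper that
 * follows sizes one sync datagram so a whole number of v0-sized records
 * fits under the interface MTU, after subtracting the IP, UDP and
 * sync-message headers plus some slack, and capping the record count.
 * The constants here are stand-ins, not the kernel's SYNC_MESG_HEADER_LEN,
 * SIMPLE_CONN_SIZE or MAX_CONNS_PER_SYNCBUFF.
 */
#include <stdio.h>

#define IPH_LEN		20
#define UDPH_LEN	8
#define SYNC_HDR_LEN	4	/* assumed sync message header size    */
#define CONN_REC_LEN	24	/* assumed per-connection record size  */
#define MAX_CONNS	64	/* assumed cap per sync buffer         */
#define SLACK		20	/* headroom kept free                  */

static int send_maxlen_for_mtu(int mtu)
{
	int num = (mtu - IPH_LEN - UDPH_LEN - SYNC_HDR_LEN - SLACK)
		  / CONN_REC_LEN;

	if (num > MAX_CONNS)
		num = MAX_CONNS;
	return SYNC_HDR_LEN + CONN_REC_LEN * num;
}

int main(void)
{
	printf("max sync payload for MTU 1500: %d bytes\n",
	       send_maxlen_for_mtu(1500));
	return 0;
}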
+ */ +static int set_sync_mesg_maxlen(struct net *net, int sync_state) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct net_device *dev; + int num; + + if (sync_state == IP_VS_STATE_MASTER) { + dev = __dev_get_by_name(net, ipvs->master_mcast_ifn); + if (!dev) + return -ENODEV; + + num = (dev->mtu - sizeof(struct iphdr) - + sizeof(struct udphdr) - + SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; + ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN + + SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF); + IP_VS_DBG(7, "setting the maximum length of sync sending " + "message %d.\n", ipvs->send_mesg_maxlen); + } else if (sync_state == IP_VS_STATE_BACKUP) { + dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn); + if (!dev) + return -ENODEV; + + ipvs->recv_mesg_maxlen = dev->mtu - + sizeof(struct iphdr) - sizeof(struct udphdr); + IP_VS_DBG(7, "setting the maximum length of sync receiving " + "message %d.\n", ipvs->recv_mesg_maxlen); + } + + return 0; +} + + +/* + * Join a multicast group. + * the group is specified by a class D multicast address 224.0.0.0/8 + * in the in_addr structure passed in as a parameter. + */ +static int +join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) +{ + struct net *net = sock_net(sk); + struct ip_mreqn mreq; + struct net_device *dev; + int ret; + + memset(&mreq, 0, sizeof(mreq)); + memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); + + dev = __dev_get_by_name(net, ifname); + if (!dev) + return -ENODEV; + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; + + mreq.imr_ifindex = dev->ifindex; + + lock_sock(sk); + ret = ip_mc_join_group(sk, &mreq); + release_sock(sk); + + return ret; +} + + +static int bind_mcastif_addr(struct socket *sock, char *ifname) +{ + struct net *net = sock_net(sock->sk); + struct net_device *dev; + __be32 addr; + struct sockaddr_in sin; + + dev = __dev_get_by_name(net, ifname); + if (!dev) + return -ENODEV; + + addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + if (!addr) + pr_err("You probably need to specify IP address on " + "multicast interface.\n"); + + IP_VS_DBG(7, "binding socket with (%s) %pI4\n", + ifname, &addr); + + /* Now bind the socket with the address of multicast interface */ + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr; + sin.sin_port = 0; + + return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); +} + +/* + * Set up sending multicast socket over UDP + */ +static struct socket *make_send_sock(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct socket *sock; + int result; + + /* First create a socket move it to right name space later */ + result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + if (result < 0) { + pr_err("Error during creation of socket; terminating\n"); + return ERR_PTR(result); + } + /* + * Kernel sockets that are a part of a namespace, should not + * hold a reference to a namespace in order to allow to stop it. + * After sk_change_net should be released using sk_release_kernel. 
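/*
 * Illustrative userspace sketch (not part of this patch): the in-kernel
 * multicast helpers above carry comments pointing at their setsockopt()
 * equivalents, shown here for a plain UDP sender.  The group address and
 * port follow the usual IPVS sync defaults (224.0.0.81, UDP 8848); the
 * interface address is a placeholder.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
	unsigned char loop = 0, ttl = 1;
	struct in_addr ifaddr;			/* address of the mcast ifn */
	struct sockaddr_in grp;

	if (fd < 0)
		return 1;
	inet_pton(AF_INET, "192.0.2.1", &ifaddr);	/* placeholder */

	/* mirrors set_mcast_loop(), set_mcast_ttl(), set_mcast_if() */
	setsockopt(fd, IPPROTO_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop));
	setsockopt(fd, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl));
	setsockopt(fd, IPPROTO_IP, IP_MULTICAST_IF, &ifaddr, sizeof(ifaddr));

	memset(&grp, 0, sizeof(grp));
	grp.sin_family = AF_INET;
	grp.sin_port = htons(8848);
	inet_pton(AF_INET, "224.0.0.81", &grp.sin_addr);

	/* like make_send_sock(): connect so later sends need no address */
	if (connect(fd, (struct sockaddr *)&grp, sizeof(grp)) < 0)
		perror("connect");
	close(fd);
	return 0;
}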
+ */ + sk_change_net(sock->sk, net); + result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn); + if (result < 0) { + pr_err("Error setting outbound mcast interface\n"); + goto error; + } + + set_mcast_loop(sock->sk, 0); + set_mcast_ttl(sock->sk, 1); + + result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn); + if (result < 0) { + pr_err("Error binding address of the mcast interface\n"); + goto error; + } + + result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, + sizeof(struct sockaddr), 0); + if (result < 0) { + pr_err("Error connecting to the multicast addr\n"); + goto error; + } + + return sock; + +error: + sk_release_kernel(sock->sk); + return ERR_PTR(result); +} + + +/* + * Set up receiving multicast socket over UDP + */ +static struct socket *make_receive_sock(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct socket *sock; + int result; + + /* First create a socket */ + result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + if (result < 0) { + pr_err("Error during creation of socket; terminating\n"); + return ERR_PTR(result); + } + /* + * Kernel sockets that are a part of a namespace, should not + * hold a reference to a namespace in order to allow to stop it. + * After sk_change_net should be released using sk_release_kernel. + */ + sk_change_net(sock->sk, net); + /* it is equivalent to the REUSEADDR option in user-space */ + sock->sk->sk_reuse = 1; + + result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, + sizeof(struct sockaddr)); + if (result < 0) { + pr_err("Error binding to the multicast addr\n"); + goto error; + } + + /* join the multicast group */ + result = join_mcast_group(sock->sk, + (struct in_addr *) &mcast_addr.sin_addr, + ipvs->backup_mcast_ifn); + if (result < 0) { + pr_err("Error joining to the multicast group\n"); + goto error; + } + + return sock; + +error: + sk_release_kernel(sock->sk); + return ERR_PTR(result); +} + + +static int +ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) +{ + struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; + struct kvec iov; + int len; + + EnterFunction(7); + iov.iov_base = (void *)buffer; + iov.iov_len = length; + + len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); + + LeaveFunction(7); + return len; +} + +static void +ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) +{ + int msize; + + msize = msg->size; + + /* Put size in network byte order */ + msg->size = htons(msg->size); + + if (ip_vs_send_async(sock, (char *)msg, msize) != msize) + pr_err("ip_vs_send_async error\n"); +} + +static int +ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) +{ + struct msghdr msg = {NULL,}; + struct kvec iov; + int len; + + EnterFunction(7); + + /* Receive a packet */ + iov.iov_base = buffer; + iov.iov_len = (size_t)buflen; + + len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); + + if (len < 0) + return -1; + + LeaveFunction(7); + return len; +} + + +static int sync_thread_master(void *data) +{ + struct ip_vs_sync_thread_data *tinfo = data; + struct netns_ipvs *ipvs = net_ipvs(tinfo->net); + struct ip_vs_sync_buff *sb; + + pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " + "syncid = %d\n", + ipvs->master_mcast_ifn, ipvs->master_syncid); + + while (!kthread_should_stop()) { + while ((sb = sb_dequeue(ipvs))) { + ip_vs_send_sync_msg(tinfo->sock, sb->mesg); + ip_vs_sync_buff_release(sb); + } + + /* check if entries stay in ipvs->sync_buff for 2 seconds */ + sb = 
get_curr_sync_buff(ipvs, 2 * HZ); + if (sb) { + ip_vs_send_sync_msg(tinfo->sock, sb->mesg); + ip_vs_sync_buff_release(sb); + } + + schedule_timeout_interruptible(HZ); + } + + /* clean up the sync_buff queue */ + while ((sb = sb_dequeue(ipvs))) + ip_vs_sync_buff_release(sb); + + /* clean up the current sync_buff */ + sb = get_curr_sync_buff(ipvs, 0); + if (sb) + ip_vs_sync_buff_release(sb); + + /* release the sending multicast socket */ + sk_release_kernel(tinfo->sock->sk); + kfree(tinfo); + + return 0; +} + + +static int sync_thread_backup(void *data) +{ + struct ip_vs_sync_thread_data *tinfo = data; + struct netns_ipvs *ipvs = net_ipvs(tinfo->net); + int len; + + pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " + "syncid = %d\n", + ipvs->backup_mcast_ifn, ipvs->backup_syncid); + + while (!kthread_should_stop()) { + wait_event_interruptible(*sk_sleep(tinfo->sock->sk), + !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue) + || kthread_should_stop()); + + /* do we have data now? */ + while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { + len = ip_vs_receive(tinfo->sock, tinfo->buf, + ipvs->recv_mesg_maxlen); + if (len <= 0) { + pr_err("receiving message error\n"); + break; + } + + /* disable bottom half, because it accesses the data + shared by softirq while getting/creating conns */ + local_bh_disable(); + ip_vs_process_message(tinfo->net, tinfo->buf, len); + local_bh_enable(); + } + } + + /* release the sending multicast socket */ + sk_release_kernel(tinfo->sock->sk); + kfree(tinfo->buf); + kfree(tinfo); + + return 0; +} + + +int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) +{ + struct ip_vs_sync_thread_data *tinfo; + struct task_struct **realtask, *task; + struct socket *sock; + struct netns_ipvs *ipvs = net_ipvs(net); + char *name, *buf = NULL; + int (*threadfn)(void *data); + int result = -ENOMEM; + + IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); + IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", + sizeof(struct ip_vs_sync_conn_v0)); + + if (state == IP_VS_STATE_MASTER) { + if (ipvs->master_thread) + return -EEXIST; + + strlcpy(ipvs->master_mcast_ifn, mcast_ifn, + sizeof(ipvs->master_mcast_ifn)); + ipvs->master_syncid = syncid; + realtask = &ipvs->master_thread; + name = "ipvs_master:%d"; + threadfn = sync_thread_master; + sock = make_send_sock(net); + } else if (state == IP_VS_STATE_BACKUP) { + if (ipvs->backup_thread) + return -EEXIST; + + strlcpy(ipvs->backup_mcast_ifn, mcast_ifn, + sizeof(ipvs->backup_mcast_ifn)); + ipvs->backup_syncid = syncid; + realtask = &ipvs->backup_thread; + name = "ipvs_backup:%d"; + threadfn = sync_thread_backup; + sock = make_receive_sock(net); + } else { + return -EINVAL; + } + + if (IS_ERR(sock)) { + result = PTR_ERR(sock); + goto out; + } + + set_sync_mesg_maxlen(net, state); + if (state == IP_VS_STATE_BACKUP) { + buf = kmalloc(ipvs->recv_mesg_maxlen, GFP_KERNEL); + if (!buf) + goto outsocket; + } + + tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + if (!tinfo) + goto outbuf; + + tinfo->net = net; + tinfo->sock = sock; + tinfo->buf = buf; + + task = kthread_run(threadfn, tinfo, name, ipvs->gen); + if (IS_ERR(task)) { + result = PTR_ERR(task); + goto outtinfo; + } + + /* mark as active */ + *realtask = task; + ipvs->sync_state |= state; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + return 0; + +outtinfo: + kfree(tinfo); +outbuf: + kfree(buf); +outsocket: + sk_release_kernel(sock->sk); +out: + return result; +} + + +int stop_sync_thread(struct net 
*net, int state) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + int retc = -EINVAL; + + IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); + + if (state == IP_VS_STATE_MASTER) { + if (!ipvs->master_thread) + return -ESRCH; + + pr_info("stopping master sync thread %d ...\n", + task_pid_nr(ipvs->master_thread)); + + /* + * The lock synchronizes with sb_queue_tail(), so that we don't + * add sync buffers to the queue, when we are already in + * progress of stopping the master sync daemon. + */ + + spin_lock_bh(&ipvs->sync_lock); + ipvs->sync_state &= ~IP_VS_STATE_MASTER; + spin_unlock_bh(&ipvs->sync_lock); + retc = kthread_stop(ipvs->master_thread); + ipvs->master_thread = NULL; + } else if (state == IP_VS_STATE_BACKUP) { + if (!ipvs->backup_thread) + return -ESRCH; + + pr_info("stopping backup sync thread %d ...\n", + task_pid_nr(ipvs->backup_thread)); + + ipvs->sync_state &= ~IP_VS_STATE_BACKUP; + retc = kthread_stop(ipvs->backup_thread); + ipvs->backup_thread = NULL; + } + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return retc; +} + +/* + * Initialize data struct for each netns + */ +int __net_init __ip_vs_sync_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + INIT_LIST_HEAD(&ipvs->sync_queue); + spin_lock_init(&ipvs->sync_lock); + spin_lock_init(&ipvs->sync_buff_lock); + + ipvs->sync_mcast_addr.sin_family = AF_INET; + ipvs->sync_mcast_addr.sin_port = cpu_to_be16(IP_VS_SYNC_PORT); + ipvs->sync_mcast_addr.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP); + return 0; +} + +void __ip_vs_sync_cleanup(struct net *net) +{ + int retc; + + retc = stop_sync_thread(net, IP_VS_STATE_MASTER); + if (retc && retc != -ESRCH) + pr_err("Failed to stop Master Daemon\n"); + + retc = stop_sync_thread(net, IP_VS_STATE_BACKUP); + if (retc && retc != -ESRCH) + pr_err("Failed to stop Backup Daemon\n"); +} + +int __init ip_vs_sync_init(void) +{ + return 0; +} + +void ip_vs_sync_cleanup(void) +{ +} diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c new file mode 100644 index 00000000..bc1bfc48 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -0,0 +1,113 @@ +/* + * IPVS: Weighted Least-Connection Scheduling module + * + * Authors: Wensong Zhang + * Peter Kese + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest + * Wensong Zhang : changed to use the inactconns in scheduling + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_wlc_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include + +#include + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least; + unsigned int loh, doh; + + IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n"); + + /* + * We calculate the load of each dest server as follows: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! 
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) { + least = dest; + loh = ip_vs_dest_conn_overhead(least); + goto nextstage; + } + } + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + doh = ip_vs_dest_conn_overhead(dest); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG_BUF(6, "WLC: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_wlc_scheduler = +{ + .name = "wlc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list), + .schedule = ip_vs_wlc_schedule, +}; + + +static int __init ip_vs_wlc_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_wlc_scheduler); +} + +static void __exit ip_vs_wlc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler); +} + +module_init(ip_vs_wlc_init); +module_exit(ip_vs_wlc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c new file mode 100644 index 00000000..1ef41f50 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -0,0 +1,232 @@ +/* + * IPVS: Weighted Round-Robin Scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_wrr_update_svc + * Julian Anastasov : fixed the bug of returning destination + * with weight 0 when all weights are zero + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include + +#include + +/* + * current destination pointer for weighted round-robin scheduling + */ +struct ip_vs_wrr_mark { + struct list_head *cl; /* current list head */ + int cw; /* current weight */ + int mw; /* maximum weight */ + int di; /* decreasing interval */ +}; + + +static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + int weight; + int g = 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + weight = atomic_read(&dest->weight); + if (weight > 0) { + if (g > 0) + g = gcd(weight, g); + else + g = weight; + } + } + return g ? g : 1; +} + + +/* + * Get the maximum weight of the service destinations. 
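/*
 * Illustrative userspace sketch (not part of this patch): the WLC pick
 * above avoids floating point by comparing overhead/weight ratios via
 * cross-multiplication -- a candidate beats the current least-loaded
 * server when loh * weight(candidate) > doh * weight(least).  Overheads
 * and weights here are made-up numbers.
 */
#include <stdio.h>

struct server {
	const char *name;
	long overhead;		/* e.g. weighted active+inactive conns */
	long weight;		/* 0 means quiesced: never selected    */
};

static const struct server *wlc_pick(const struct server *s, int n)
{
	const struct server *least = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if (s[i].weight <= 0)
			continue;				/* quiesced */
		if (!least ||
		    least->overhead * s[i].weight >
		    s[i].overhead * least->weight)
			least = &s[i];
	}
	return least;
}

int main(void)
{
	const struct server pool[] = {
		{ "rs1", 120, 1 },	/* 120 per unit of weight */
		{ "rs2", 300, 3 },	/* 100 per unit of weight */
		{ "rs3",  90, 0 },	/* quiesced               */
	};
	const struct server *least = wlc_pick(pool, 3);

	printf("picked %s\n", least ? least->name : "(none)");	/* rs2 */
	return 0;
}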
+ */ +static int ip_vs_wrr_max_weight(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + int new_weight, weight = 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + new_weight = atomic_read(&dest->weight); + if (new_weight > weight) + weight = new_weight; + } + + return weight; +} + + +static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_wrr_mark *mark; + + /* + * Allocate the mark variable for WRR scheduling + */ + mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); + if (mark == NULL) { + pr_err("%s(): no memory\n", __func__); + return -ENOMEM; + } + mark->cl = &svc->destinations; + mark->cw = 0; + mark->mw = ip_vs_wrr_max_weight(svc); + mark->di = ip_vs_wrr_gcd_weight(svc); + svc->sched_data = mark; + + return 0; +} + + +static int ip_vs_wrr_done_svc(struct ip_vs_service *svc) +{ + /* + * Release the mark variable + */ + kfree(svc->sched_data); + + return 0; +} + + +static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_wrr_mark *mark = svc->sched_data; + + mark->cl = &svc->destinations; + mark->mw = ip_vs_wrr_max_weight(svc); + mark->di = ip_vs_wrr_gcd_weight(svc); + if (mark->cw > mark->mw) + mark->cw = 0; + return 0; +} + + +/* + * Weighted Round-Robin Scheduling + */ +static struct ip_vs_dest * +ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_wrr_mark *mark = svc->sched_data; + struct list_head *p; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + /* + * This loop will always terminate, because mark->cw in (0, max_weight] + * and at least one server has its weight equal to max_weight. + */ + write_lock(&svc->sched_lock); + p = mark->cl; + while (1) { + if (mark->cl == &svc->destinations) { + /* it is at the head of the destination list */ + + if (mark->cl == mark->cl->next) { + /* no dest entry */ + ip_vs_scheduler_err(svc, + "no destination available: " + "no destinations present"); + dest = NULL; + goto out; + } + + mark->cl = svc->destinations.next; + mark->cw -= mark->di; + if (mark->cw <= 0) { + mark->cw = mark->mw; + /* + * Still zero, which means no available servers. + */ + if (mark->cw == 0) { + mark->cl = &svc->destinations; + ip_vs_scheduler_err(svc, + "no destination available"); + dest = NULL; + goto out; + } + } + } else + mark->cl = mark->cl->next; + + if (mark->cl != &svc->destinations) { + /* not at the head of the list */ + dest = list_entry(mark->cl, struct ip_vs_dest, n_list); + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) >= mark->cw) { + /* got it */ + break; + } + } + + if (mark->cl == p && mark->cw == mark->di) { + /* back to the start, and no dest is found. 
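/*
 * Illustrative userspace sketch (not part of this patch): the WRR walk in
 * this file keeps a "current weight" that drops by gcd(weights) each time
 * the scan wraps around the destination ring and resets to the maximum
 * weight when it reaches zero; the first server whose weight is >= the
 * current weight is chosen.  With weights 4/3/2 and gcd 1 the servers are
 * interleaved in proportion to their weights.
 */
#include <stdio.h>

static const int weights[] = { 4, 3, 2 };	/* made-up weights        */
static const int n = 3, maxw = 4, di = 1;	/* di = gcd of weights    */
static int cur = -1, cw;			/* scheduler state (mark) */

static int wrr_pick(void)
{
	for (;;) {
		cur = (cur + 1) % n;
		if (cur == 0) {			/* wrapped to the head    */
			cw -= di;
			if (cw <= 0) {
				cw = maxw;
				if (cw == 0)	/* all weights are zero   */
					return -1;
			}
		}
		if (weights[cur] >= cw)
			return cur;
	}
}

int main(void)
{
	int i;

	for (i = 0; i < 9; i++)			/* one full weight cycle  */
		printf("%d ", wrr_pick());
	printf("\n");				/* prints: 0 0 1 0 1 2 0 1 2 */
	return 0;
}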
+ It is only possible when all dests are OVERLOADED */ + dest = NULL; + ip_vs_scheduler_err(svc, + "no destination available: " + "all destinations are overloaded"); + goto out; + } + } + + IP_VS_DBG_BUF(6, "WRR: server %s:%u " + "activeconns %d refcnt %d weight %d\n", + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), + atomic_read(&dest->weight)); + + out: + write_unlock(&svc->sched_lock); + return dest; +} + + +static struct ip_vs_scheduler ip_vs_wrr_scheduler = { + .name = "wrr", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), + .init_service = ip_vs_wrr_init_svc, + .done_service = ip_vs_wrr_done_svc, + .update_service = ip_vs_wrr_update_svc, + .schedule = ip_vs_wrr_schedule, +}; + +static int __init ip_vs_wrr_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ; +} + +static void __exit ip_vs_wrr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); +} + +module_init(ip_vs_wrr_init); +module_exit(ip_vs_wrr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c new file mode 100644 index 00000000..ee319a43 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -0,0 +1,1370 @@ +/* + * ip_vs_xmit.c: various packet transmitters for IPVS + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + * Description of forwarding methods: + * - all transmitters are called from LOCAL_IN (remote clients) and + * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD + * - not all connections have destination server, for example, + * connections in backup server when fwmark is used + * - bypass connections use daddr from packet + * LOCAL_OUT rules: + * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING) + * - skb->pkt_type is not set yet + * - the only place where we can see skb->sk != NULL + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include /* for tcphdr */ +#include +#include /* for csum_tcpudp_magic */ +#include +#include /* for icmp_send */ +#include /* for ip_route_output */ +#include +#include +#include +#include +#include +#include + +#include + +enum { + IP_VS_RT_MODE_LOCAL = 1, /* Allow local dest */ + IP_VS_RT_MODE_NON_LOCAL = 2, /* Allow non-local dest */ + IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to + * local + */ +}; + +/* + * Destination cache to speed up outgoing route lookup + */ +static inline void +__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst, + u32 dst_cookie) +{ + struct dst_entry *old_dst; + + old_dst = dest->dst_cache; + dest->dst_cache = dst; + dest->dst_rtos = rtos; + dest->dst_cookie = dst_cookie; + dst_release(old_dst); +} + +static inline struct dst_entry * +__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos) +{ + struct dst_entry *dst = dest->dst_cache; + + if (!dst) + return NULL; + if ((dst->obsolete || rtos != dest->dst_rtos) && + dst->ops->check(dst, dest->dst_cookie) == NULL) { + dest->dst_cache = NULL; + dst_release(dst); + return NULL; + } + dst_hold(dst); + return dst; +} + +/* Get route to destination or remote server */ +static 
struct rtable * +__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, + __be32 daddr, u32 rtos, int rt_mode, __be32 *ret_saddr) +{ + struct net *net = dev_net(skb_dst(skb)->dev); + struct rtable *rt; /* Route to the other host */ + struct rtable *ort; /* Original route */ + int local; + + if (dest) { + spin_lock(&dest->dst_lock); + if (!(rt = (struct rtable *) + __ip_vs_dst_check(dest, rtos))) { + struct flowi4 fl4; + + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = dest->addr.ip; + fl4.flowi4_tos = rtos; + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) { + spin_unlock(&dest->dst_lock); + IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", + &dest->addr.ip); + return NULL; + } + __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0); + dest->dst_saddr.ip = fl4.saddr; + IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d, " + "rtos=%X\n", + &dest->addr.ip, &dest->dst_saddr.ip, + atomic_read(&rt->dst.__refcnt), rtos); + } + daddr = dest->addr.ip; + if (ret_saddr) + *ret_saddr = dest->dst_saddr.ip; + spin_unlock(&dest->dst_lock); + } else { + struct flowi4 fl4; + + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = daddr; + fl4.flowi4_tos = rtos; + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) { + IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", + &daddr); + return NULL; + } + if (ret_saddr) + *ret_saddr = fl4.saddr; + } + + local = rt->rt_flags & RTCF_LOCAL; + if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & + rt_mode)) { + IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n", + (rt->rt_flags & RTCF_LOCAL) ? + "local":"non-local", &daddr); + ip_rt_put(rt); + return NULL; + } + if (local && !(rt_mode & IP_VS_RT_MODE_RDR) && + !((ort = skb_rtable(skb)) && ort->rt_flags & RTCF_LOCAL)) { + IP_VS_DBG_RL("Redirect from non-local address %pI4 to local " + "requires NAT method, dest: %pI4\n", + &ip_hdr(skb)->daddr, &daddr); + ip_rt_put(rt); + return NULL; + } + if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) { + IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 " + "to non-local address, dest: %pI4\n", + &ip_hdr(skb)->saddr, &daddr); + ip_rt_put(rt); + return NULL; + } + + return rt; +} + +/* Reroute packet to local IPv4 stack after DNAT */ +static int +__ip_vs_reroute_locally(struct sk_buff *skb) +{ + struct rtable *rt = skb_rtable(skb); + struct net_device *dev = rt->dst.dev; + struct net *net = dev_net(dev); + struct iphdr *iph = ip_hdr(skb); + + if (rt_is_input_route(rt)) { + unsigned long orefdst = skb->_skb_refdst; + + if (ip_route_input(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev)) + return 0; + refdst_drop(orefdst); + } else { + struct flowi4 fl4 = { + .daddr = iph->daddr, + .saddr = iph->saddr, + .flowi4_tos = RT_TOS(iph->tos), + .flowi4_mark = skb->mark, + }; + + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + return 0; + if (!(rt->rt_flags & RTCF_LOCAL)) { + ip_rt_put(rt); + return 0; + } + /* Drop old route. 
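/*
 * Illustrative userspace sketch (not part of this patch): the route
 * helpers above accept or reject a looked-up route purely from a bitmask
 * of allowed destination kinds -- local, non-local, and "redirect of
 * remote traffic to a local address allowed".  Roughly the same policy
 * check, lifted out of the kernel types and simplified:
 */
#include <stdbool.h>
#include <stdio.h>

enum {
	RT_MODE_LOCAL     = 1,	/* allow local destinations              */
	RT_MODE_NON_LOCAL = 2,	/* allow non-local destinations          */
	RT_MODE_RDR       = 4,	/* allow redirect from remote to local   */
};

static bool route_allowed(bool dest_is_local, bool pkt_from_remote, int mode)
{
	if (!((dest_is_local ? RT_MODE_LOCAL : RT_MODE_NON_LOCAL) & mode))
		return false;	/* this kind of destination is not wanted */
	if (dest_is_local && pkt_from_remote && !(mode & RT_MODE_RDR))
		return false;	/* redirecting remote traffic needs NAT   */
	return true;
}

int main(void)
{
	printf("%d %d %d\n",
	       route_allowed(false, true, RT_MODE_NON_LOCAL),		/* 1 */
	       route_allowed(true, true, RT_MODE_NON_LOCAL),		/* 0 */
	       route_allowed(true, true, RT_MODE_LOCAL | RT_MODE_RDR));	/* 1 */
	return 0;
}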
*/ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + } + return 1; +} + +#ifdef CONFIG_IP_VS_IPV6 + +static inline int __ip_vs_is_local_route6(struct rt6_info *rt) +{ + return rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK; +} + +static struct dst_entry * +__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, + struct in6_addr *ret_saddr, int do_xfrm) +{ + struct dst_entry *dst; + struct flowi6 fl6 = { + .daddr = *daddr, + }; + + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) + goto out_err; + if (!ret_saddr) + return dst; + if (ipv6_addr_any(&fl6.saddr) && + ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, + &fl6.daddr, 0, &fl6.saddr) < 0) + goto out_err; + if (do_xfrm) { + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) { + dst = NULL; + goto out_err; + } + } + ipv6_addr_copy(ret_saddr, &fl6.saddr); + return dst; + +out_err: + dst_release(dst); + IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr); + return NULL; +} + +/* + * Get route to destination or remote server + */ +static struct rt6_info * +__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest, + struct in6_addr *daddr, struct in6_addr *ret_saddr, + int do_xfrm, int rt_mode) +{ + struct net *net = dev_net(skb_dst(skb)->dev); + struct rt6_info *rt; /* Route to the other host */ + struct rt6_info *ort; /* Original route */ + struct dst_entry *dst; + int local; + + if (dest) { + spin_lock(&dest->dst_lock); + rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0); + if (!rt) { + u32 cookie; + + dst = __ip_vs_route_output_v6(net, &dest->addr.in6, + &dest->dst_saddr.in6, + do_xfrm); + if (!dst) { + spin_unlock(&dest->dst_lock); + return NULL; + } + rt = (struct rt6_info *) dst; + cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; + __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie); + IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", + &dest->addr.in6, &dest->dst_saddr.in6, + atomic_read(&rt->dst.__refcnt)); + } + if (ret_saddr) + ipv6_addr_copy(ret_saddr, &dest->dst_saddr.in6); + spin_unlock(&dest->dst_lock); + } else { + dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm); + if (!dst) + return NULL; + rt = (struct rt6_info *) dst; + } + + local = __ip_vs_is_local_route6(rt); + if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & + rt_mode)) { + IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6\n", + local ? 
"local":"non-local", daddr); + dst_release(&rt->dst); + return NULL; + } + if (local && !(rt_mode & IP_VS_RT_MODE_RDR) && + !((ort = (struct rt6_info *) skb_dst(skb)) && + __ip_vs_is_local_route6(ort))) { + IP_VS_DBG_RL("Redirect from non-local address %pI6 to local " + "requires NAT method, dest: %pI6\n", + &ipv6_hdr(skb)->daddr, daddr); + dst_release(&rt->dst); + return NULL; + } + if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&ipv6_hdr(skb)->saddr) & + IPV6_ADDR_LOOPBACK)) { + IP_VS_DBG_RL("Stopping traffic from loopback address %pI6 " + "to non-local address, dest: %pI6\n", + &ipv6_hdr(skb)->saddr, daddr); + dst_release(&rt->dst); + return NULL; + } + + return rt; +} +#endif + + +/* + * Release dest->dst_cache before a dest is removed + */ +void +ip_vs_dst_reset(struct ip_vs_dest *dest) +{ + struct dst_entry *old_dst; + + old_dst = dest->dst_cache; + dest->dst_cache = NULL; + dst_release(old_dst); +} + +#define IP_VS_XMIT_TUNNEL(skb, cp) \ +({ \ + int __ret = NF_ACCEPT; \ + \ + (skb)->ipvs_property = 1; \ + if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \ + __ret = ip_vs_confirm_conntrack(skb, cp); \ + if (__ret == NF_ACCEPT) { \ + nf_reset(skb); \ + skb_forward_csum(skb); \ + } \ + __ret; \ +}) + +#define IP_VS_XMIT_NAT(pf, skb, cp, local) \ +do { \ + (skb)->ipvs_property = 1; \ + if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ + ip_vs_notrack(skb); \ + else \ + ip_vs_update_conntrack(skb, cp, 1); \ + if (local) \ + return NF_ACCEPT; \ + skb_forward_csum(skb); \ + NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ + skb_dst(skb)->dev, dst_output); \ +} while (0) + +#define IP_VS_XMIT(pf, skb, cp, local) \ +do { \ + (skb)->ipvs_property = 1; \ + if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ + ip_vs_notrack(skb); \ + if (local) \ + return NF_ACCEPT; \ + skb_forward_csum(skb); \ + NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ + skb_dst(skb)->dev, dst_output); \ +} while (0) + + +/* + * NULL transmitter (do nothing except return NF_ACCEPT) + */ +int +ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + /* we do not touch skb and do not need pskb ptr */ + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); +} + + +/* + * Bypass transmitter + * Let packets bypass the destination when the destination is not + * available, it may be only used in transparent cache cluster. + */ +int +ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct iphdr *iph = ip_hdr(skb); + int mtu; + + EnterFunction(10); + + if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, RT_TOS(iph->tos), + IP_VS_RT_MODE_NON_LOCAL, NULL))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->dst); + if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) && + !skb_is_gso(skb)) { + ip_rt_put(rt); + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL("%s(): frag needed\n", __func__); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? 
+ */ + if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { + ip_rt_put(rt); + return NF_STOLEN; + } + ip_send_check(ip_hdr(skb)); + + /* drop old route */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + struct ipv6hdr *iph = ipv6_hdr(skb); + int mtu; + + EnterFunction(10); + + if (!(rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr, NULL, 0, + IP_VS_RT_MODE_NON_LOCAL))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->dst); + if (skb->len > mtu && !skb_is_gso(skb)) { + if (!skb->dev) { + struct net *net = dev_net(skb_dst(skb)->dev); + + skb->dev = net->loopback_dev; + } + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + dst_release(&rt->dst); + IP_VS_DBG_RL("%s(): frag needed\n", __func__); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(skb == NULL)) { + dst_release(&rt->dst); + return NF_STOLEN; + } + + /* drop old route */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} +#endif + +/* + * NAT transmitter (only for outside-to-inside nat forwarding) + * Not used for related ICMP + */ +int +ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + int mtu; + struct iphdr *iph = ip_hdr(skb); + int local; + + EnterFunction(10); + + /* check if it is a connection of no-client-port */ + if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { + __be16 _pt, *p; + p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); + if (p == NULL) + goto tx_error; + ip_vs_conn_fill_cport(cp, *p); + IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); + } + + if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + RT_TOS(iph->tos), + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR, NULL))) + goto tx_error_icmp; + local = rt->rt_flags & RTCF_LOCAL; + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0, + "ip_vs_nat_xmit(): " + "stopping DNAT to local address"); + goto tx_error_put; + } + } +#endif + + /* From world but DNAT to loopback address? 
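/*
 * Illustrative userspace sketch (not part of this patch): the IPv4
 * transmitters in this file all apply the same path-MTU test before
 * forwarding -- a packet larger than the route MTU that carries the
 * Don't-Fragment bit and is not GSO gets an ICMP "fragmentation needed"
 * reply instead of being sent on.
 */
#include <stdbool.h>
#include <stdio.h>

static bool must_send_frag_needed(unsigned int pkt_len, unsigned int mtu,
				  bool df_set, bool is_gso)
{
	return pkt_len > mtu && df_set && !is_gso;
}

int main(void)
{
	printf("%d %d\n",
	       must_send_frag_needed(1600, 1500, true, false),	/* 1        */
	       must_send_frag_needed(1600, 1500, true, true));	/* 0: GSO   */
	return 0;
}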
*/ + if (local && ipv4_is_loopback(cp->daddr.ip) && + rt_is_input_route(skb_rtable(skb))) { + IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): " + "stopping DNAT to loopback address"); + goto tx_error_put; + } + + /* MTU checking */ + mtu = dst_mtu(&rt->dst); + if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) && + !skb_is_gso(skb)) { + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0, + "ip_vs_nat_xmit(): frag needed for"); + goto tx_error_put; + } + + /* copy-on-write the packet before mangling it */ + if (!skb_make_writable(skb, sizeof(struct iphdr))) + goto tx_error_put; + + if (skb_cow(skb, rt->dst.dev->hard_header_len)) + goto tx_error_put; + + /* mangle the packet */ + if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) + goto tx_error_put; + ip_hdr(skb)->daddr = cp->daddr.ip; + ip_send_check(ip_hdr(skb)); + + if (!local) { + /* drop old route */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + } else { + ip_rt_put(rt); + /* + * Some IPv4 replies get local address from routes, + * not from iph, so while we DNAT after routing + * we need this second input/output route. + */ + if (!__ip_vs_reroute_locally(skb)) + goto tx_error; + } + + IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT"); + + /* FIXME: when application helper enlarges the packet and the length + is larger than the MTU of outgoing device, there will be still + MTU problem. */ + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; + tx_error_put: + ip_rt_put(rt); + goto tx_error; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + int mtu; + int local; + + EnterFunction(10); + + /* check if it is a connection of no-client-port */ + if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { + __be16 _pt, *p; + p = skb_header_pointer(skb, sizeof(struct ipv6hdr), + sizeof(_pt), &_pt); + if (p == NULL) + goto tx_error; + ip_vs_conn_fill_cport(cp, *p); + IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); + } + + if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + 0, (IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR)))) + goto tx_error_icmp; + local = __ip_vs_is_local_route6(rt); + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0, + "ip_vs_nat_xmit_v6(): " + "stopping DNAT to local address"); + goto tx_error_put; + } + } +#endif + + /* From world but DNAT to loopback address? 
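/*
 * Illustrative userspace sketch (not part of this patch): after the NAT
 * transmitter above rewrites the destination address it calls
 * ip_send_check() to refresh the IPv4 header checksum.  That checksum is
 * the 16-bit one's-complement sum over the header with the checksum field
 * taken as zero, as computed here for a minimal 20-byte header.
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t ip_hdr_csum(const uint8_t *hdr, size_t len /* even bytes */)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)hdr[i] << 8 | hdr[i + 1];
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);	/* fold carries */
	return (uint16_t)~sum;
}

int main(void)
{
	/* sample header, checksum field (bytes 10-11) zeroed before summing */
	uint8_t h[20] = { 0x45, 0, 0, 20, 0, 0, 0x40, 0, 64, 6,
			  0, 0, 192, 0, 2, 1, 198, 51, 100, 7 };
	uint16_t c = ip_hdr_csum(h, sizeof(h));

	h[10] = c >> 8;
	h[11] = c & 0xff;
	printf("header checksum: 0x%04x\n", c);
	return 0;
}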
*/ + if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { + IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0, + "ip_vs_nat_xmit_v6(): " + "stopping DNAT to loopback address"); + goto tx_error_put; + } + + /* MTU checking */ + mtu = dst_mtu(&rt->dst); + if (skb->len > mtu && !skb_is_gso(skb)) { + if (!skb->dev) { + struct net *net = dev_net(skb_dst(skb)->dev); + + skb->dev = net->loopback_dev; + } + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0, + "ip_vs_nat_xmit_v6(): frag needed for"); + goto tx_error_put; + } + + /* copy-on-write the packet before mangling it */ + if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) + goto tx_error_put; + + if (skb_cow(skb, rt->dst.dev->hard_header_len)) + goto tx_error_put; + + /* mangle the packet */ + if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) + goto tx_error; + ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &cp->daddr.in6); + + if (!local || !skb->dev) { + /* drop the old route when skb is not shared */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + } else { + /* destined to loopback, do we need to change route? */ + dst_release(&rt->dst); + } + + IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT"); + + /* FIXME: when application helper enlarges the packet and the length + is larger than the MTU of outgoing device, there will be still + MTU problem. */ + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local); + + LeaveFunction(10); + return NF_STOLEN; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + LeaveFunction(10); + kfree_skb(skb); + return NF_STOLEN; +tx_error_put: + dst_release(&rt->dst); + goto tx_error; +} +#endif + + +/* + * IP Tunneling transmitter + * + * This function encapsulates the packet in a new IP packet, its + * destination will be set to cp->daddr. Most code of this function + * is taken from ipip.c. + * + * It is used in VS/TUN cluster. The load balancer selects a real + * server from a cluster based on a scheduling algorithm, + * encapsulates the request packet and forwards it to the selected + * server. For example, all real servers are configured with + * "ifconfig tunl0 up". When the server receives + * the encapsulated packet, it will decapsulate the packet, processe + * the request and return the response packets directly to the client + * without passing the load balancer. This can greatly increase the + * scalability of virtual server. 
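/*
 * Illustrative userspace sketch (not part of this patch): for VS/TUN the
 * transmitter below prepends a fresh outer IPv4 header whose protocol is
 * IPPROTO_IPIP, whose destination is the chosen real server and whose
 * TOS/TTL come from the inner packet.  A minimal stand-alone version of
 * that header construction, using a local struct rather than the kernel's
 * iphdr, with placeholder addresses:
 */
#include <arpa/inet.h>
#include <netinet/in.h>		/* IPPROTO_IPIP */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct outer_iph {		/* simplified, no IP options          */
	uint8_t  ver_ihl;	/* 0x45: version 4, 5 32-bit words    */
	uint8_t  tos;
	uint16_t tot_len;
	uint16_t id;
	uint16_t frag_off;
	uint8_t  ttl;
	uint8_t  protocol;
	uint16_t check;
	uint32_t saddr;
	uint32_t daddr;
};

static void build_ipip_outer(struct outer_iph *o, uint16_t inner_len,
			     uint8_t inner_tos, uint8_t inner_ttl,
			     uint32_t tun_saddr, uint32_t rs_daddr)
{
	memset(o, 0, sizeof(*o));
	o->ver_ihl  = 0x45;
	o->tos      = inner_tos;
	o->tot_len  = htons((uint16_t)(sizeof(*o) + inner_len));
	o->ttl      = inner_ttl;
	o->protocol = IPPROTO_IPIP;
	o->saddr    = tun_saddr;	/* tunnel source (director side) */
	o->daddr    = rs_daddr;		/* real server                   */
	/* checksum left at 0 here; the kernel fills it before sending */
}

int main(void)
{
	struct outer_iph o;
	uint32_t src, dst;

	inet_pton(AF_INET, "192.0.2.1", &src);		/* placeholders */
	inet_pton(AF_INET, "203.0.113.9", &dst);
	build_ipip_outer(&o, 1480, 0, 64, src, dst);
	printf("outer proto %u, total length %u\n",
	       o.protocol, ntohs(o.tot_len));
	return 0;
}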
+ * + * Used for ANY protocol + */ +int +ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + __be32 saddr; /* Source for tunnel */ + struct net_device *tdev; /* Device to other host */ + struct iphdr *old_iph = ip_hdr(skb); + u8 tos = old_iph->tos; + __be16 df = old_iph->frag_off; + struct iphdr *iph; /* Our new IP header */ + unsigned int max_headroom; /* The extra header space needed */ + int mtu; + int ret; + + EnterFunction(10); + + if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + RT_TOS(tos), IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL, + &saddr))) + goto tx_error_icmp; + if (rt->rt_flags & RTCF_LOCAL) { + ip_rt_put(rt); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); + } + + tdev = rt->dst.dev; + + mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); + if (mtu < 68) { + IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); + goto tx_error_put; + } + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + + df |= (old_iph->frag_off & htons(IP_DF)); + + if ((old_iph->frag_off & htons(IP_DF) && + mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb))) { + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL("%s(): frag needed\n", __func__); + goto tx_error_put; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); + + if (skb_headroom(skb) < max_headroom + || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = + skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + ip_rt_put(rt); + kfree_skb(skb); + IP_VS_ERR_RL("%s(): no memory\n", __func__); + return NF_STOLEN; + } + kfree_skb(skb); + skb = new_skb; + old_iph = ip_hdr(skb); + } + + skb->transport_header = skb->network_header; + + /* fix old IP header checksum */ + ip_send_check(old_iph); + + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + + /* drop old route */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + + /* + * Push down and install the IPIP header. 
+ */ + iph = ip_hdr(skb); + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; + iph->frag_off = df; + iph->protocol = IPPROTO_IPIP; + iph->tos = tos; + iph->daddr = cp->daddr.ip; + iph->saddr = saddr; + iph->ttl = old_iph->ttl; + ip_select_ident(iph, &rt->dst, NULL); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + ret = IP_VS_XMIT_TUNNEL(skb, cp); + if (ret == NF_ACCEPT) + ip_local_out(skb); + else if (ret == NF_DROP) + kfree_skb(skb); + + LeaveFunction(10); + + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +tx_error_put: + ip_rt_put(rt); + goto tx_error; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + struct in6_addr saddr; /* Source for tunnel */ + struct net_device *tdev; /* Device to other host */ + struct ipv6hdr *old_iph = ipv6_hdr(skb); + struct ipv6hdr *iph; /* Our new IP header */ + unsigned int max_headroom; /* The extra header space needed */ + int mtu; + int ret; + + EnterFunction(10); + + if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, + &saddr, 1, (IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL)))) + goto tx_error_icmp; + if (__ip_vs_is_local_route6(rt)) { + dst_release(&rt->dst); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1); + } + + tdev = rt->dst.dev; + + mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); + if (mtu < IPV6_MIN_MTU) { + IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, + IPV6_MIN_MTU); + goto tx_error_put; + } + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + + if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr) && + !skb_is_gso(skb)) { + if (!skb->dev) { + struct net *net = dev_net(skb_dst(skb)->dev); + + skb->dev = net->loopback_dev; + } + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP_VS_DBG_RL("%s(): frag needed\n", __func__); + goto tx_error_put; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); + + if (skb_headroom(skb) < max_headroom + || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = + skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + dst_release(&rt->dst); + kfree_skb(skb); + IP_VS_ERR_RL("%s(): no memory\n", __func__); + return NF_STOLEN; + } + kfree_skb(skb); + skb = new_skb; + old_iph = ipv6_hdr(skb); + } + + skb->transport_header = skb->network_header; + + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + + /* drop old route */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + + /* + * Push down and install the IPIP header. 
+ */ + iph = ipv6_hdr(skb); + iph->version = 6; + iph->nexthdr = IPPROTO_IPV6; + iph->payload_len = old_iph->payload_len; + be16_add_cpu(&iph->payload_len, sizeof(*old_iph)); + iph->priority = old_iph->priority; + memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); + ipv6_addr_copy(&iph->daddr, &cp->daddr.in6); + ipv6_addr_copy(&iph->saddr, &saddr); + iph->hop_limit = old_iph->hop_limit; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + ret = IP_VS_XMIT_TUNNEL(skb, cp); + if (ret == NF_ACCEPT) + ip6_local_out(skb); + else if (ret == NF_DROP) + kfree_skb(skb); + + LeaveFunction(10); + + return NF_STOLEN; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +tx_error_put: + dst_release(&rt->dst); + goto tx_error; +} +#endif + + +/* + * Direct Routing transmitter + * Used for ANY protocol + */ +int +ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct iphdr *iph = ip_hdr(skb); + int mtu; + + EnterFunction(10); + + if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + RT_TOS(iph->tos), + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL, NULL))) + goto tx_error_icmp; + if (rt->rt_flags & RTCF_LOCAL) { + ip_rt_put(rt); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); + } + + /* MTU checking */ + mtu = dst_mtu(&rt->dst); + if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu && + !skb_is_gso(skb)) { + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + IP_VS_DBG_RL("%s(): frag needed\n", __func__); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { + ip_rt_put(rt); + return NF_STOLEN; + } + ip_send_check(ip_hdr(skb)); + + /* drop old route */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + int mtu; + + EnterFunction(10); + + if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + 0, (IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL)))) + goto tx_error_icmp; + if (__ip_vs_is_local_route6(rt)) { + dst_release(&rt->dst); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1); + } + + /* MTU checking */ + mtu = dst_mtu(&rt->dst); + if (skb->len > mtu) { + if (!skb->dev) { + struct net *net = dev_net(skb_dst(skb)->dev); + + skb->dev = net->loopback_dev; + } + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + dst_release(&rt->dst); + IP_VS_DBG_RL("%s(): frag needed\n", __func__); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? 
+ */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(skb == NULL)) { + dst_release(&rt->dst); + return NF_STOLEN; + } + + /* drop old route */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0); + + LeaveFunction(10); + return NF_STOLEN; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} +#endif + + +/* + * ICMP packet transmitter + * called by the ip_vs_in_icmp + */ +int +ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, int offset, unsigned int hooknum) +{ + struct rtable *rt; /* Route to the other host */ + int mtu; + int rc; + int local; + int rt_mode; + + EnterFunction(10); + + /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be + forwarded directly here, because there is no need to + translate address/port back */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + if (cp->packet_xmit) + rc = cp->packet_xmit(skb, cp, pp); + else + rc = NF_ACCEPT; + /* do not touch skb anymore */ + atomic_inc(&cp->in_pkts); + goto out; + } + + /* + * mangle and send the packet here (only for VS/NAT) + */ + + /* LOCALNODE from FORWARD hook is not supported */ + rt_mode = (hooknum != NF_INET_FORWARD) ? + IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; + if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + RT_TOS(ip_hdr(skb)->tos), + rt_mode, NULL))) + goto tx_error_icmp; + local = rt->rt_flags & RTCF_LOCAL; + + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG(10, "%s(): " + "stopping DNAT to local address %pI4\n", + __func__, &cp->daddr.ip); + goto tx_error_put; + } + } +#endif + + /* From world but DNAT to loopback address? */ + if (local && ipv4_is_loopback(cp->daddr.ip) && + rt_is_input_route(skb_rtable(skb))) { + IP_VS_DBG(1, "%s(): " + "stopping DNAT to loopback %pI4\n", + __func__, &cp->daddr.ip); + goto tx_error_put; + } + + /* MTU checking */ + mtu = dst_mtu(&rt->dst); + if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF)) && + !skb_is_gso(skb)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL("%s(): frag needed\n", __func__); + goto tx_error_put; + } + + /* copy-on-write the packet before mangling it */ + if (!skb_make_writable(skb, offset)) + goto tx_error_put; + + if (skb_cow(skb, rt->dst.dev->hard_header_len)) + goto tx_error_put; + + ip_vs_nat_icmp(skb, pp, cp, 0); + + if (!local) { + /* drop the old route when skb is not shared */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + } else { + ip_rt_put(rt); + /* + * Some IPv4 replies get local address from routes, + * not from iph, so while we DNAT after routing + * we need this second input/output route. 
+ */ + if (!__ip_vs_reroute_locally(skb)) + goto tx_error; + } + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local); + + rc = NF_STOLEN; + goto out; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + dev_kfree_skb(skb); + rc = NF_STOLEN; + out: + LeaveFunction(10); + return rc; + tx_error_put: + ip_rt_put(rt); + goto tx_error; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, int offset, unsigned int hooknum) +{ + struct rt6_info *rt; /* Route to the other host */ + int mtu; + int rc; + int local; + int rt_mode; + + EnterFunction(10); + + /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be + forwarded directly here, because there is no need to + translate address/port back */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + if (cp->packet_xmit) + rc = cp->packet_xmit(skb, cp, pp); + else + rc = NF_ACCEPT; + /* do not touch skb anymore */ + atomic_inc(&cp->in_pkts); + goto out; + } + + /* + * mangle and send the packet here (only for VS/NAT) + */ + + /* LOCALNODE from FORWARD hook is not supported */ + rt_mode = (hooknum != NF_INET_FORWARD) ? + IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; + if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + 0, rt_mode))) + goto tx_error_icmp; + + local = __ip_vs_is_local_route6(rt); + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG(10, "%s(): " + "stopping DNAT to local address %pI6\n", + __func__, &cp->daddr.in6); + goto tx_error_put; + } + } +#endif + + /* From world but DNAT to loopback address? */ + if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { + IP_VS_DBG(1, "%s(): " + "stopping DNAT to loopback %pI6\n", + __func__, &cp->daddr.in6); + goto tx_error_put; + } + + /* MTU checking */ + mtu = dst_mtu(&rt->dst); + if (skb->len > mtu && !skb_is_gso(skb)) { + if (!skb->dev) { + struct net *net = dev_net(skb_dst(skb)->dev); + + skb->dev = net->loopback_dev; + } + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP_VS_DBG_RL("%s(): frag needed\n", __func__); + goto tx_error_put; + } + + /* copy-on-write the packet before mangling it */ + if (!skb_make_writable(skb, offset)) + goto tx_error_put; + + if (skb_cow(skb, rt->dst.dev->hard_header_len)) + goto tx_error_put; + + ip_vs_nat_icmp_v6(skb, pp, cp, 0); + + if (!local || !skb->dev) { + /* drop the old route when skb is not shared */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + } else { + /* destined to loopback, do we need to change route? 
*/ + dst_release(&rt->dst); + } + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local); + + rc = NF_STOLEN; + goto out; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + dev_kfree_skb(skb); + rc = NF_STOLEN; +out: + LeaveFunction(10); + return rc; +tx_error_put: + dst_release(&rt->dst); + goto tx_error; +} +#endif diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c new file mode 100644 index 00000000..5178c691 --- /dev/null +++ b/net/netfilter/nf_conntrack_acct.c @@ -0,0 +1,136 @@ +/* Accouting handling for netfilter. */ + +/* + * (C) 2008 Krzysztof Piotr Oledzki + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#include +#include +#include + +static int nf_ct_acct __read_mostly; + +module_param_named(acct, nf_ct_acct, bool, 0644); +MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting."); + +#ifdef CONFIG_SYSCTL +static struct ctl_table acct_sysctl_table[] = { + { + .procname = "nf_conntrack_acct", + .data = &init_net.ct.sysctl_acct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; +#endif /* CONFIG_SYSCTL */ + +unsigned int +seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir) +{ + struct nf_conn_counter *acct; + + acct = nf_conn_acct_find(ct); + if (!acct) + return 0; + + return seq_printf(s, "packets=%llu bytes=%llu ", + (unsigned long long)acct[dir].packets, + (unsigned long long)acct[dir].bytes); +}; +EXPORT_SYMBOL_GPL(seq_print_acct); + +static struct nf_ct_ext_type acct_extend __read_mostly = { + .len = sizeof(struct nf_conn_counter[IP_CT_DIR_MAX]), + .align = __alignof__(struct nf_conn_counter[IP_CT_DIR_MAX]), + .id = NF_CT_EXT_ACCT, +}; + +#ifdef CONFIG_SYSCTL +static int nf_conntrack_acct_init_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = kmemdup(acct_sysctl_table, sizeof(acct_sysctl_table), + GFP_KERNEL); + if (!table) + goto out; + + table[0].data = &net->ct.sysctl_acct; + + net->ct.acct_sysctl_header = register_net_sysctl_table(net, + nf_net_netfilter_sysctl_path, table); + if (!net->ct.acct_sysctl_header) { + printk(KERN_ERR "nf_conntrack_acct: can't register to sysctl.\n"); + goto out_register; + } + return 0; + +out_register: + kfree(table); +out: + return -ENOMEM; +} + +static void nf_conntrack_acct_fini_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = net->ct.acct_sysctl_header->ctl_table_arg; + unregister_net_sysctl_table(net->ct.acct_sysctl_header); + kfree(table); +} +#else +static int nf_conntrack_acct_init_sysctl(struct net *net) +{ + return 0; +} + +static void nf_conntrack_acct_fini_sysctl(struct net *net) +{ +} +#endif + +int nf_conntrack_acct_init(struct net *net) +{ + int ret; + + net->ct.sysctl_acct = nf_ct_acct; + + if (net_eq(net, &init_net)) { + ret = nf_ct_extend_register(&acct_extend); + if (ret < 0) { + printk(KERN_ERR "nf_conntrack_acct: Unable to register extension\n"); + goto out_extend_register; + } + } + + ret = nf_conntrack_acct_init_sysctl(net); + if (ret < 0) + goto out_sysctl; + + return 0; + +out_sysctl: + if (net_eq(net, &init_net)) + nf_ct_extend_unregister(&acct_extend); +out_extend_register: + return ret; +} + +void nf_conntrack_acct_fini(struct net *net) +{ + nf_conntrack_acct_fini_sysctl(net); + if (net_eq(net, &init_net)) + 
nf_ct_extend_unregister(&acct_extend); +} diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c new file mode 100644 index 00000000..13fd2c55 --- /dev/null +++ b/net/netfilter/nf_conntrack_amanda.c @@ -0,0 +1,237 @@ +/* Amanda extension for IP connection tracking + * + * (C) 2002 by Brian J. Murrell + * based on HW's ip_conntrack_irc.c as well as other modules + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static unsigned int master_timeout __read_mostly = 300; +static char *ts_algo = "kmp"; + +MODULE_AUTHOR("Brian J. Murrell "); +MODULE_DESCRIPTION("Amanda connection tracking module"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ip_conntrack_amanda"); +MODULE_ALIAS_NFCT_HELPER("amanda"); + +module_param(master_timeout, uint, 0600); +MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); +module_param(ts_algo, charp, 0400); +MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)"); + +unsigned int (*nf_nat_amanda_hook)(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp) + __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_amanda_hook); + +enum amanda_strings { + SEARCH_CONNECT, + SEARCH_NEWLINE, + SEARCH_DATA, + SEARCH_MESG, + SEARCH_INDEX, +}; + +static struct { + const char *string; + size_t len; + struct ts_config *ts; +} search[] __read_mostly = { + [SEARCH_CONNECT] = { + .string = "CONNECT ", + .len = 8, + }, + [SEARCH_NEWLINE] = { + .string = "\n", + .len = 1, + }, + [SEARCH_DATA] = { + .string = "DATA ", + .len = 5, + }, + [SEARCH_MESG] = { + .string = "MESG ", + .len = 5, + }, + [SEARCH_INDEX] = { + .string = "INDEX ", + .len = 6, + }, +}; + +static int amanda_help(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + struct ts_state ts; + struct nf_conntrack_expect *exp; + struct nf_conntrack_tuple *tuple; + unsigned int dataoff, start, stop, off, i; + char pbuf[sizeof("65535")], *tmp; + u_int16_t len; + __be16 port; + int ret = NF_ACCEPT; + typeof(nf_nat_amanda_hook) nf_nat_amanda; + + /* Only look at packets from the Amanda server */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) + return NF_ACCEPT; + + /* increase the UDP timeout of the master connection as replies from + * Amanda clients to the server can be quite delayed */ + nf_ct_refresh(ct, skb, master_timeout * HZ); + + /* No data? 
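[Illustration, not part of the patch] The search[] table above lists the markers amanda_help() scans for; the parsing loop that follows locates "CONNECT ", bounds the request at the next newline, and pulls the decimal ports announced after DATA/MESG/INDEX so expectations can be set up for them. A rough userspace sketch of that parsing, using strstr()/strtoul() on an invented payload instead of the kernel textsearch API:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
        /* sample payload is invented */
        const char *payload =
                "... CONNECT DATA 50123 MESG 50124 INDEX 50125\nrest ignored";
        const char *markers[] = { "DATA ", "MESG ", "INDEX " };
        const char *connect = strstr(payload, "CONNECT ");
        if (!connect)
                return 1;
        connect += strlen("CONNECT ");

        /* like the helper: only scan up to the end of the CONNECT line */
        const char *eol = strchr(connect, '\n');
        size_t span = eol ? (size_t)(eol - connect) : strlen(connect);

        for (size_t i = 0; i < sizeof(markers) / sizeof(markers[0]); i++) {
                const char *p = strstr(connect, markers[i]);
                if (!p || (size_t)(p - connect) >= span)
                        continue;
                p += strlen(markers[i]);

                char *end;
                unsigned long port = strtoul(p, &end, 10);
                /* valid port: non-zero and at most five digits ("65535") */
                if (port == 0 || end - p > 5)
                        continue;
                printf("%s-> expect TCP port %lu\n", markers[i], port);
        }
        return 0;
}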
*/ + dataoff = protoff + sizeof(struct udphdr); + if (dataoff >= skb->len) { + if (net_ratelimit()) + printk(KERN_ERR "amanda_help: skblen = %u\n", skb->len); + return NF_ACCEPT; + } + + memset(&ts, 0, sizeof(ts)); + start = skb_find_text(skb, dataoff, skb->len, + search[SEARCH_CONNECT].ts, &ts); + if (start == UINT_MAX) + goto out; + start += dataoff + search[SEARCH_CONNECT].len; + + memset(&ts, 0, sizeof(ts)); + stop = skb_find_text(skb, start, skb->len, + search[SEARCH_NEWLINE].ts, &ts); + if (stop == UINT_MAX) + goto out; + stop += start; + + for (i = SEARCH_DATA; i <= SEARCH_INDEX; i++) { + memset(&ts, 0, sizeof(ts)); + off = skb_find_text(skb, start, stop, search[i].ts, &ts); + if (off == UINT_MAX) + continue; + off += start + search[i].len; + + len = min_t(unsigned int, sizeof(pbuf) - 1, stop - off); + if (skb_copy_bits(skb, off, pbuf, len)) + break; + pbuf[len] = '\0'; + + port = htons(simple_strtoul(pbuf, &tmp, 10)); + len = tmp - pbuf; + if (port == 0 || len > 5) + break; + + exp = nf_ct_expect_alloc(ct); + if (exp == NULL) { + ret = NF_DROP; + goto out; + } + tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, + nf_ct_l3num(ct), + &tuple->src.u3, &tuple->dst.u3, + IPPROTO_TCP, NULL, &port); + + nf_nat_amanda = rcu_dereference(nf_nat_amanda_hook); + if (nf_nat_amanda && ct->status & IPS_NAT_MASK) + ret = nf_nat_amanda(skb, ctinfo, off - dataoff, + len, exp); + else if (nf_ct_expect_related(exp) != 0) + ret = NF_DROP; + nf_ct_expect_put(exp); + } + +out: + return ret; +} + +static const struct nf_conntrack_expect_policy amanda_exp_policy = { + .max_expected = 3, + .timeout = 180, +}; + +static struct nf_conntrack_helper amanda_helper[2] __read_mostly = { + { + .name = "amanda", + .me = THIS_MODULE, + .help = amanda_help, + .tuple.src.l3num = AF_INET, + .tuple.src.u.udp.port = cpu_to_be16(10080), + .tuple.dst.protonum = IPPROTO_UDP, + .expect_policy = &amanda_exp_policy, + }, + { + .name = "amanda", + .me = THIS_MODULE, + .help = amanda_help, + .tuple.src.l3num = AF_INET6, + .tuple.src.u.udp.port = cpu_to_be16(10080), + .tuple.dst.protonum = IPPROTO_UDP, + .expect_policy = &amanda_exp_policy, + }, +}; + +static void __exit nf_conntrack_amanda_fini(void) +{ + int i; + + nf_conntrack_helper_unregister(&amanda_helper[0]); + nf_conntrack_helper_unregister(&amanda_helper[1]); + for (i = 0; i < ARRAY_SIZE(search); i++) + textsearch_destroy(search[i].ts); +} + +static int __init nf_conntrack_amanda_init(void) +{ + int ret, i; + + for (i = 0; i < ARRAY_SIZE(search); i++) { + search[i].ts = textsearch_prepare(ts_algo, search[i].string, + search[i].len, + GFP_KERNEL, TS_AUTOLOAD); + if (IS_ERR(search[i].ts)) { + ret = PTR_ERR(search[i].ts); + goto err1; + } + } + ret = nf_conntrack_helper_register(&amanda_helper[0]); + if (ret < 0) + goto err1; + ret = nf_conntrack_helper_register(&amanda_helper[1]); + if (ret < 0) + goto err2; + return 0; + +err2: + nf_conntrack_helper_unregister(&amanda_helper[0]); +err1: + while (--i >= 0) + textsearch_destroy(search[i].ts); + + return ret; +} + +module_init(nf_conntrack_amanda_init); +module_exit(nf_conntrack_amanda_fini); diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c new file mode 100644 index 00000000..4e99cca6 --- /dev/null +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -0,0 +1,82 @@ +/* + * broadcast connection tracking helper + * + * (c) 2005 Patrick McHardy + * + * This program is free software; you can redistribute it and/or + * modify it under the 
terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +int nf_conntrack_broadcast_help(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int timeout) +{ + struct nf_conntrack_expect *exp; + struct iphdr *iph = ip_hdr(skb); + struct rtable *rt = skb_rtable(skb); + struct in_device *in_dev; + struct nf_conn_help *help = nfct_help(ct); + __be32 mask = 0; + + /* we're only interested in locally generated packets */ + if (skb->sk == NULL) + goto out; + if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST)) + goto out; + if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + goto out; + + rcu_read_lock(); + in_dev = __in_dev_get_rcu(rt->dst.dev); + if (in_dev != NULL) { + for_primary_ifa(in_dev) { + if (ifa->ifa_broadcast == iph->daddr) { + mask = ifa->ifa_mask; + break; + } + } endfor_ifa(in_dev); + } + rcu_read_unlock(); + + if (mask == 0) + goto out; + + exp = nf_ct_expect_alloc(ct); + if (exp == NULL) + goto out; + + exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + exp->tuple.src.u.udp.port = help->helper->tuple.src.u.udp.port; + + exp->mask.src.u3.ip = mask; + exp->mask.src.u.udp.port = htons(0xFFFF); + + exp->expectfn = NULL; + exp->flags = NF_CT_EXPECT_PERMANENT; + exp->class = NF_CT_EXPECT_CLASS_DEFAULT; + exp->helper = NULL; + + nf_ct_expect_related(exp); + nf_ct_expect_put(exp); + + nf_ct_refresh(ct, skb, timeout * HZ); +out: + return NF_ACCEPT; +} +EXPORT_SYMBOL_GPL(nf_conntrack_broadcast_help); + +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c new file mode 100644 index 00000000..f7af8b86 --- /dev/null +++ b/net/netfilter/nf_conntrack_core.c @@ -0,0 +1,1592 @@ +/* Connection state tracking for netfilter. This is separated from, + but required by, the NAT layer; it can also be used by an iptables + extension. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team + * (C) 2003,2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
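[Illustration, not part of the patch] The broadcast helper above copies the reply tuple into the expectation but masks the source address with the interface netmask (the port mask stays 0xFFFF), so a reply from any host on the local subnet to the helper's port is picked up as RELATED. A small userspace illustration of that masked match; the addresses are arbitrary examples:

#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool expect_matches(uint32_t expected_src, uint32_t mask, uint32_t pkt_src)
{
        /* compare only the network part, as exp->mask.src.u3.ip does */
        return (pkt_src & mask) == (expected_src & mask);
}

int main(void)
{
        uint32_t mask = inet_addr("255.255.255.0");     /* ifa_mask */
        uint32_t bcast = inet_addr("192.168.1.255");    /* ifa_broadcast */
        uint32_t peer_ok = inet_addr("192.168.1.42");
        uint32_t peer_other = inet_addr("10.0.0.7");

        printf("192.168.1.42 matches: %d\n", expect_matches(bcast, mask, peer_ok));
        printf("10.0.0.7     matches: %d\n", expect_matches(bcast, mask, peer_other));
        return 0;
}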
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NF_CONNTRACK_VERSION "0.5.0" + +int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct, + enum nf_nat_manip_type manip, + const struct nlattr *attr) __read_mostly; +EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook); + +DEFINE_SPINLOCK(nf_conntrack_lock); +EXPORT_SYMBOL_GPL(nf_conntrack_lock); + +unsigned int nf_conntrack_htable_size __read_mostly; +EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); + +unsigned int nf_conntrack_max __read_mostly; +EXPORT_SYMBOL_GPL(nf_conntrack_max); + +DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked); +EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); + +unsigned int nf_conntrack_hash_rnd __read_mostly; + +static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone) +{ + unsigned int n; + + /* The direction must be ignored, so we hash everything up to the + * destination ports (which is a multiple of 4) and treat the last + * three bytes manually. + */ + n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); + return jhash2((u32 *)tuple, n, zone ^ nf_conntrack_hash_rnd ^ + (((__force __u16)tuple->dst.u.all << 16) | + tuple->dst.protonum)); +} + +static u32 __hash_bucket(u32 hash, unsigned int size) +{ + return ((u64)hash * size) >> 32; +} + +static u32 hash_bucket(u32 hash, const struct net *net) +{ + return __hash_bucket(hash, net->ct.htable_size); +} + +static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, + u16 zone, unsigned int size) +{ + return __hash_bucket(hash_conntrack_raw(tuple, zone), size); +} + +static inline u_int32_t hash_conntrack(const struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple) +{ + return __hash_conntrack(tuple, zone, net->ct.htable_size); +} + +bool +nf_ct_get_tuple(const struct sk_buff *skb, + unsigned int nhoff, + unsigned int dataoff, + u_int16_t l3num, + u_int8_t protonum, + struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_l3proto *l3proto, + const struct nf_conntrack_l4proto *l4proto) +{ + memset(tuple, 0, sizeof(*tuple)); + + tuple->src.l3num = l3num; + if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0) + return false; + + tuple->dst.protonum = protonum; + tuple->dst.dir = IP_CT_DIR_ORIGINAL; + + return l4proto->pkt_to_tuple(skb, dataoff, tuple); +} +EXPORT_SYMBOL_GPL(nf_ct_get_tuple); + +bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, + u_int16_t l3num, struct nf_conntrack_tuple *tuple) +{ + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_l4proto *l4proto; + unsigned int protoff; + u_int8_t protonum; + int ret; + + rcu_read_lock(); + + l3proto = __nf_ct_l3proto_find(l3num); + ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum); + if (ret != NF_ACCEPT) { + rcu_read_unlock(); + return false; + } + + l4proto = __nf_ct_l4proto_find(l3num, protonum); + + ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, tuple, + l3proto, l4proto); + + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr); + +bool +nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, + const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_l3proto *l3proto, + const struct nf_conntrack_l4proto *l4proto) +{ + memset(inverse, 0, sizeof(*inverse)); + 
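[Illustration, not part of the patch] The bucket mapping used by __hash_bucket() above avoids a modulo: the full 32-bit hash is scaled into [0, size) with a 64-bit multiply and shift. A standalone sketch of that mapping; the hash here is a simple FNV-1a stand-in, whereas the kernel uses jhash2() over the tuple mixed with the zone and a random seed:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t toy_hash(const void *data, size_t len)
{
        const unsigned char *p = data;
        uint32_t h = 2166136261u;

        while (len--) {
                h ^= *p++;
                h *= 16777619u;
        }
        return h;
}

static uint32_t bucket_of(uint32_t hash, uint32_t size)
{
        /* same arithmetic as __hash_bucket(): always lands in [0, size) */
        return (uint32_t)(((uint64_t)hash * size) >> 32);
}

int main(void)
{
        const char *tuples[] = { "10.0.0.1:1234->10.0.0.2:80",
                                 "10.0.0.1:1235->10.0.0.2:80",
                                 "192.168.1.9:53->10.0.0.2:4000" };
        uint32_t size = 16384;          /* e.g. htable_size */

        for (int i = 0; i < 3; i++) {
                uint32_t h = toy_hash(tuples[i], strlen(tuples[i]));
                printf("%-30s hash=%08x bucket=%u\n",
                       tuples[i], h, bucket_of(h, size));
        }
        return 0;
}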
+ inverse->src.l3num = orig->src.l3num; + if (l3proto->invert_tuple(inverse, orig) == 0) + return false; + + inverse->dst.dir = !orig->dst.dir; + + inverse->dst.protonum = orig->dst.protonum; + return l4proto->invert_tuple(inverse, orig); +} +EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); + +static void +clean_from_lists(struct nf_conn *ct) +{ + pr_debug("clean_from_lists(%p)\n", ct); + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); + + /* Destroy all pending expectations */ + nf_ct_remove_expectations(ct); +} + +static void +destroy_conntrack(struct nf_conntrack *nfct) +{ + struct nf_conn *ct = (struct nf_conn *)nfct; + struct net *net = nf_ct_net(ct); + struct nf_conntrack_l4proto *l4proto; + + pr_debug("destroy_conntrack(%p)\n", ct); + NF_CT_ASSERT(atomic_read(&nfct->use) == 0); + NF_CT_ASSERT(!timer_pending(&ct->timeout)); + + /* To make sure we don't get any weird locking issues here: + * destroy_conntrack() MUST NOT be called with a write lock + * to nf_conntrack_lock!!! -HW */ + rcu_read_lock(); + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + if (l4proto && l4proto->destroy) + l4proto->destroy(ct); + + rcu_read_unlock(); + + spin_lock_bh(&nf_conntrack_lock); + /* Expectations will have been removed in clean_from_lists, + * except TFTP can create an expectation on the first packet, + * before connection is in the list, so we need to clean here, + * too. */ + nf_ct_remove_expectations(ct); + + /* We overload first tuple to link into unconfirmed list. */ + if (!nf_ct_is_confirmed(ct)) { + BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode)); + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + } + + NF_CT_STAT_INC(net, delete); + spin_unlock_bh(&nf_conntrack_lock); + + if (ct->master) + nf_ct_put(ct->master); + + pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct); + nf_conntrack_free(ct); +} + +void nf_ct_delete_from_lists(struct nf_conn *ct) +{ + struct net *net = nf_ct_net(ct); + + nf_ct_helper_destroy(ct); + spin_lock_bh(&nf_conntrack_lock); + /* Inside lock so preempt is disabled on module removal path. + * Otherwise we can get spurious warnings. 
*/ + NF_CT_STAT_INC(net, delete_list); + clean_from_lists(ct); + spin_unlock_bh(&nf_conntrack_lock); +} +EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists); + +static void death_by_event(unsigned long ul_conntrack) +{ + struct nf_conn *ct = (void *)ul_conntrack; + struct net *net = nf_ct_net(ct); + + if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) { + /* bad luck, let's retry again */ + ct->timeout.expires = jiffies + + (random32() % net->ct.sysctl_events_retry_timeout); + add_timer(&ct->timeout); + return; + } + /* we've got the event delivered, now it's dying */ + set_bit(IPS_DYING_BIT, &ct->status); + spin_lock(&nf_conntrack_lock); + hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + spin_unlock(&nf_conntrack_lock); + nf_ct_put(ct); +} + +void nf_ct_insert_dying_list(struct nf_conn *ct) +{ + struct net *net = nf_ct_net(ct); + + /* add this conntrack to the dying list */ + spin_lock_bh(&nf_conntrack_lock); + hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &net->ct.dying); + spin_unlock_bh(&nf_conntrack_lock); + /* set a new timer to retry event delivery */ + setup_timer(&ct->timeout, death_by_event, (unsigned long)ct); + ct->timeout.expires = jiffies + + (random32() % net->ct.sysctl_events_retry_timeout); + add_timer(&ct->timeout); +} +EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list); + +static void death_by_timeout(unsigned long ul_conntrack) +{ + struct nf_conn *ct = (void *)ul_conntrack; + struct nf_conn_tstamp *tstamp; + + tstamp = nf_conn_tstamp_find(ct); + if (tstamp && tstamp->stop == 0) + tstamp->stop = ktime_to_ns(ktime_get_real()); + + if (!test_bit(IPS_DYING_BIT, &ct->status) && + unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) { + /* destroy event was not delivered */ + nf_ct_delete_from_lists(ct); + nf_ct_insert_dying_list(ct); + return; + } + set_bit(IPS_DYING_BIT, &ct->status); + nf_ct_delete_from_lists(ct); + nf_ct_put(ct); +} + +/* + * Warning : + * - Caller must take a reference on returned object + * and recheck nf_ct_tuple_equal(tuple, &h->tuple) + * OR + * - Caller must lock nf_conntrack_lock before calling this function + */ +static struct nf_conntrack_tuple_hash * +____nf_conntrack_find(struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple, u32 hash) +{ + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + unsigned int bucket = hash_bucket(hash, net); + + /* Disable BHs the entire time since we normally need to disable them + * at least once for the stats anyway. + */ + local_bh_disable(); +begin: + hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) { + if (nf_ct_tuple_equal(tuple, &h->tuple) && + nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) { + NF_CT_STAT_INC(net, found); + local_bh_enable(); + return h; + } + NF_CT_STAT_INC(net, searched); + } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(n) != bucket) { + NF_CT_STAT_INC(net, search_restart); + goto begin; + } + local_bh_enable(); + + return NULL; +} + +struct nf_conntrack_tuple_hash * +__nf_conntrack_find(struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple) +{ + return ____nf_conntrack_find(net, zone, tuple, + hash_conntrack_raw(tuple, zone)); +} +EXPORT_SYMBOL_GPL(__nf_conntrack_find); + +/* Find a connection corresponding to a tuple. 
*/ +static struct nf_conntrack_tuple_hash * +__nf_conntrack_find_get(struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple, u32 hash) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + + rcu_read_lock(); +begin: + h = ____nf_conntrack_find(net, zone, tuple, hash); + if (h) { + ct = nf_ct_tuplehash_to_ctrack(h); + if (unlikely(nf_ct_is_dying(ct) || + !atomic_inc_not_zero(&ct->ct_general.use))) + h = NULL; + else { + if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) || + nf_ct_zone(ct) != zone)) { + nf_ct_put(ct); + goto begin; + } + } + } + rcu_read_unlock(); + + return h; +} + +struct nf_conntrack_tuple_hash * +nf_conntrack_find_get(struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple) +{ + return __nf_conntrack_find_get(net, zone, tuple, + hash_conntrack_raw(tuple, zone)); +} +EXPORT_SYMBOL_GPL(nf_conntrack_find_get); + +static void __nf_conntrack_hash_insert(struct nf_conn *ct, + unsigned int hash, + unsigned int repl_hash) +{ + struct net *net = nf_ct_net(ct); + + hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &net->ct.hash[hash]); + hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, + &net->ct.hash[repl_hash]); +} + +void nf_conntrack_hash_insert(struct nf_conn *ct) +{ + struct net *net = nf_ct_net(ct); + unsigned int hash, repl_hash; + u16 zone; + + zone = nf_ct_zone(ct); + hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + __nf_conntrack_hash_insert(ct, hash, repl_hash); +} +EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert); + +/* Confirm a connection given skb; places it in hash table */ +int +__nf_conntrack_confirm(struct sk_buff *skb) +{ + unsigned int hash, repl_hash; + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + struct nf_conn_help *help; + struct nf_conn_tstamp *tstamp; + struct hlist_nulls_node *n; + enum ip_conntrack_info ctinfo; + struct net *net; + u16 zone; + + ct = nf_ct_get(skb, &ctinfo); + net = nf_ct_net(ct); + + /* ipt_REJECT uses nf_conntrack_attach to attach related + ICMP/TCP RST packets in other direction. Actual packet + which created connection will be IP_CT_NEW or for an + expected connection, IP_CT_RELATED. */ + if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + return NF_ACCEPT; + + zone = nf_ct_zone(ct); + /* reuse the hash saved before */ + hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; + hash = hash_bucket(hash, net); + repl_hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + /* We're not in hash table, and we refuse to set up related + connections for unconfirmed conns. But packet copies and + REJECT will give spurious warnings here. */ + /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */ + + /* No external references means no one else could have + confirmed us. */ + NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); + pr_debug("Confirming conntrack %p\n", ct); + + spin_lock_bh(&nf_conntrack_lock); + + /* We have to check the DYING flag inside the lock to prevent + a race against nf_ct_get_next_corpse() possibly called from + user context, else we insert an already 'dead' hash, blocking + further use of that particular connection -JM */ + + if (unlikely(nf_ct_is_dying(ct))) { + spin_unlock_bh(&nf_conntrack_lock); + return NF_ACCEPT; + } + + /* See if there's one in the list already, including reverse: + NAT could have grabbed it without realizing, since we're + not in the hash. If there is, we lost race. 
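[Illustration, not part of the patch] Because the conntrack cache uses SLAB_DESTROY_BY_RCU, __nf_conntrack_find_get() above has to re-check the tuple and zone after taking a reference: the entry found by the lockless walk may have been freed and recycled for a different connection in the meantime. A userspace sketch of that "take a reference, then revalidate" step, with invented types and a single candidate object instead of a hash chain:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct entry {
        char key[32];
        atomic_uint use;                /* 0 means "being destroyed" */
};

static bool get_unless_zero(struct entry *e)
{
        unsigned int old = atomic_load(&e->use);

        while (old != 0) {
                if (atomic_compare_exchange_weak(&e->use, &old, old + 1))
                        return true;    /* reference taken */
        }
        return false;                   /* dying object, treat as not found */
}

static struct entry *lookup(struct entry *candidate, const char *key)
{
        if (!get_unless_zero(candidate))
                return NULL;
        if (strcmp(candidate->key, key) != 0) {
                /* recycled under us: drop the ref; the kernel restarts */
                atomic_fetch_sub(&candidate->use, 1);
                return NULL;
        }
        return candidate;
}

int main(void)
{
        struct entry e = { .key = "tuple-A", .use = 1 };

        printf("lookup tuple-A: %s\n",
               lookup(&e, "tuple-A") ? "hit" : "miss");
        printf("lookup tuple-B: %s\n",
               lookup(&e, "tuple-B") ? "hit" : "miss (stale entry, kernel would retry)");
        return 0;
}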
*/ + hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + &h->tuple) && + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + goto out; + hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, + &h->tuple) && + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + goto out; + + /* Remove from unconfirmed list */ + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + + /* Timer relative to confirmation time, not original + setting time, otherwise we'd get timer wrap in + weird delay cases. */ + ct->timeout.expires += jiffies; + add_timer(&ct->timeout); + atomic_inc(&ct->ct_general.use); + ct->status |= IPS_CONFIRMED; + + /* set conntrack timestamp, if enabled. */ + tstamp = nf_conn_tstamp_find(ct); + if (tstamp) { + if (skb->tstamp.tv64 == 0) + __net_timestamp((struct sk_buff *)skb); + + tstamp->start = ktime_to_ns(skb->tstamp); + } + /* Since the lookup is lockless, hash insertion must be done after + * starting the timer and setting the CONFIRMED bit. The RCU barriers + * guarantee that no other CPU can find the conntrack before the above + * stores are visible. + */ + __nf_conntrack_hash_insert(ct, hash, repl_hash); + NF_CT_STAT_INC(net, insert); + spin_unlock_bh(&nf_conntrack_lock); + + help = nfct_help(ct); + if (help && help->helper) + nf_conntrack_event_cache(IPCT_HELPER, ct); + + nf_conntrack_event_cache(master_ct(ct) ? + IPCT_RELATED : IPCT_NEW, ct); + return NF_ACCEPT; + +out: + NF_CT_STAT_INC(net, insert_failed); + spin_unlock_bh(&nf_conntrack_lock); + return NF_DROP; +} +EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); + +/* Returns true if a connection correspondings to the tuple (required + for NAT). */ +int +nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack) +{ + struct net *net = nf_ct_net(ignored_conntrack); + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + struct nf_conn *ct; + u16 zone = nf_ct_zone(ignored_conntrack); + unsigned int hash = hash_conntrack(net, zone, tuple); + + /* Disable BHs the entire time since we need to disable them at + * least once for the stats anyway. + */ + rcu_read_lock_bh(); + hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) { + ct = nf_ct_tuplehash_to_ctrack(h); + if (ct != ignored_conntrack && + nf_ct_tuple_equal(tuple, &h->tuple) && + nf_ct_zone(ct) == zone) { + NF_CT_STAT_INC(net, found); + rcu_read_unlock_bh(); + return 1; + } + NF_CT_STAT_INC(net, searched); + } + rcu_read_unlock_bh(); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken); + +#define NF_CT_EVICTION_RANGE 8 + +/* There's a small race here where we may free a just-assured + connection. Too bad: we're in trouble anyway. 
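[Illustration, not part of the patch] The eviction policy implemented by early_drop() just below scans a bounded range of buckets when the table is full and sacrifices an entry that has not reached ASSURED state (i.e. has not seen two-way traffic). A toy standalone version of that selection, with no RCU, refcounting, or chains, and made-up data:

#include <stdbool.h>
#include <stdio.h>

#define EVICTION_RANGE 8        /* mirrors NF_CT_EVICTION_RANGE */

struct toy_ct {
        const char *name;
        bool assured;
};

/* one entry per bucket for simplicity; empty buckets stay NULL */
static struct toy_ct *table[16];

static struct toy_ct *early_drop_candidate(unsigned int start)
{
        struct toy_ct *victim = NULL;
        unsigned int cnt = 0;

        for (unsigned int i = 0; i < 16 && !victim && cnt < EVICTION_RANGE; i++) {
                struct toy_ct *ct = table[(start + i) % 16];

                if (!ct)
                        continue;
                cnt++;                  /* bound the amount of work */
                if (!ct->assured)
                        victim = ct;    /* prefer dropping unassured flows */
        }
        return victim;
}

int main(void)
{
        struct toy_ct a = { "assured-1", true };
        struct toy_ct b = { "assured-2", true };
        struct toy_ct c = { "new-udp",   false };

        table[3] = &a;
        table[4] = &b;
        table[5] = &c;

        struct toy_ct *v = early_drop_candidate(3);
        printf("evict: %s\n", v ? v->name : "nothing (all assured)");
        return 0;
}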
*/ +static noinline int early_drop(struct net *net, unsigned int hash) +{ + /* Use oldest entry, which is roughly LRU */ + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct = NULL, *tmp; + struct hlist_nulls_node *n; + unsigned int i, cnt = 0; + int dropped = 0; + + rcu_read_lock(); + for (i = 0; i < net->ct.htable_size; i++) { + hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], + hnnode) { + tmp = nf_ct_tuplehash_to_ctrack(h); + if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) + ct = tmp; + cnt++; + } + + if (ct != NULL) { + if (likely(!nf_ct_is_dying(ct) && + atomic_inc_not_zero(&ct->ct_general.use))) + break; + else + ct = NULL; + } + + if (cnt >= NF_CT_EVICTION_RANGE) + break; + + hash = (hash + 1) % net->ct.htable_size; + } + rcu_read_unlock(); + + if (!ct) + return dropped; + + if (del_timer(&ct->timeout)) { + death_by_timeout((unsigned long)ct); + dropped = 1; + NF_CT_STAT_INC_ATOMIC(net, early_drop); + } + nf_ct_put(ct); + return dropped; +} + +void init_nf_conntrack_hash_rnd(void) +{ + unsigned int rand; + + /* + * Why not initialize nf_conntrack_rnd in a "init()" function ? + * Because there isn't enough entropy when system initializing, + * and we initialize it as late as possible. + */ + do { + get_random_bytes(&rand, sizeof(rand)); + } while (!rand); + cmpxchg(&nf_conntrack_hash_rnd, 0, rand); +} + +static struct nf_conn * +__nf_conntrack_alloc(struct net *net, u16 zone, + const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_tuple *repl, + gfp_t gfp, u32 hash) +{ + struct nf_conn *ct; + + if (unlikely(!nf_conntrack_hash_rnd)) { + init_nf_conntrack_hash_rnd(); + /* recompute the hash as nf_conntrack_hash_rnd is initialized */ + hash = hash_conntrack_raw(orig, zone); + } + + /* We don't want any race condition at early drop stage */ + atomic_inc(&net->ct.count); + + if (nf_conntrack_max && + unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) { + if (!early_drop(net, hash_bucket(hash, net))) { + atomic_dec(&net->ct.count); + if (net_ratelimit()) + printk(KERN_WARNING + "nf_conntrack: table full, dropping" + " packet.\n"); + return ERR_PTR(-ENOMEM); + } + } + + /* + * Do not use kmem_cache_zalloc(), as this cache uses + * SLAB_DESTROY_BY_RCU. + */ + ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp); + if (ct == NULL) { + pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n"); + atomic_dec(&net->ct.count); + return ERR_PTR(-ENOMEM); + } + /* + * Let ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.next + * and ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged. 
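[Illustration, not part of the patch] init_nf_conntrack_hash_rnd() above defers seeding until first use, because there is little entropy at boot, and publishes the value with cmpxchg() so concurrent initialisers all end up agreeing on one seed. A userspace analogue using C11 atomics; rand()/time() stand in for get_random_bytes():

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static atomic_uint hash_rnd;            /* 0 means "not initialised yet" */

static unsigned int get_hash_rnd(void)
{
        unsigned int val = atomic_load(&hash_rnd);

        if (val == 0) {
                unsigned int mine;

                do {
                        mine = (unsigned int)rand();
                } while (mine == 0);

                unsigned int expected = 0;
                /* like cmpxchg(&nf_conntrack_hash_rnd, 0, rand):
                 * only the first writer wins, everyone reloads the winner */
                atomic_compare_exchange_strong(&hash_rnd, &expected, mine);
                val = atomic_load(&hash_rnd);
        }
        return val;
}

int main(void)
{
        srand((unsigned int)time(NULL));
        printf("seed = %u\n", get_hash_rnd());
        printf("seed = %u (same value on every later call)\n", get_hash_rnd());
        return 0;
}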
+ */ + memset(&ct->tuplehash[IP_CT_DIR_MAX], 0, + offsetof(struct nf_conn, proto) - + offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX])); + spin_lock_init(&ct->lock); + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; + ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; + ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; + /* save hash for reusing when confirming */ + *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; + /* Don't set timer yet: wait for confirmation */ + setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct); + write_pnet(&ct->ct_net, net); +#ifdef CONFIG_NF_CONNTRACK_ZONES + if (zone) { + struct nf_conntrack_zone *nf_ct_zone; + + nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, GFP_ATOMIC); + if (!nf_ct_zone) + goto out_free; + nf_ct_zone->id = zone; + } +#endif + /* + * changes to lookup keys must be done before setting refcnt to 1 + */ + smp_wmb(); + atomic_set(&ct->ct_general.use, 1); + return ct; + +#ifdef CONFIG_NF_CONNTRACK_ZONES +out_free: + kmem_cache_free(net->ct.nf_conntrack_cachep, ct); + return ERR_PTR(-ENOMEM); +#endif +} + +struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone, + const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_tuple *repl, + gfp_t gfp) +{ + return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0); +} +EXPORT_SYMBOL_GPL(nf_conntrack_alloc); + +void nf_conntrack_free(struct nf_conn *ct) +{ + struct net *net = nf_ct_net(ct); + + nf_ct_ext_destroy(ct); + atomic_dec(&net->ct.count); + nf_ct_ext_free(ct); + kmem_cache_free(net->ct.nf_conntrack_cachep, ct); +} +EXPORT_SYMBOL_GPL(nf_conntrack_free); + +/* Allocate a new conntrack: we return -ENOMEM if classification + failed due to stress. Otherwise it really is unclassifiable. */ +static struct nf_conntrack_tuple_hash * +init_conntrack(struct net *net, struct nf_conn *tmpl, + const struct nf_conntrack_tuple *tuple, + struct nf_conntrack_l3proto *l3proto, + struct nf_conntrack_l4proto *l4proto, + struct sk_buff *skb, + unsigned int dataoff, u32 hash) +{ + struct nf_conn *ct; + struct nf_conn_help *help; + struct nf_conntrack_tuple repl_tuple; + struct nf_conntrack_ecache *ecache; + struct nf_conntrack_expect *exp; + u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; + + if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) { + pr_debug("Can't invert tuple.\n"); + return NULL; + } + + ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, + hash); + if (IS_ERR(ct)) { + pr_debug("Can't allocate conntrack.\n"); + return (struct nf_conntrack_tuple_hash *)ct; + } + + if (!l4proto->new(ct, skb, dataoff)) { + nf_conntrack_free(ct); + pr_debug("init conntrack: can't track with proto module\n"); + return NULL; + } + + nf_ct_acct_ext_add(ct, GFP_ATOMIC); + nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); + + ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL; + nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0, + ecache ? ecache->expmask : 0, + GFP_ATOMIC); + + spin_lock_bh(&nf_conntrack_lock); + exp = nf_ct_find_expectation(net, zone, tuple); + if (exp) { + pr_debug("conntrack: expectation arrives ct=%p exp=%p\n", + ct, exp); + /* Welcome, Mr. Bond. We've been expecting you... 
*/ + __set_bit(IPS_EXPECTED_BIT, &ct->status); + ct->master = exp->master; + if (exp->helper) { + help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); + if (help) + rcu_assign_pointer(help->helper, exp->helper); + } + +#ifdef CONFIG_NF_CONNTRACK_MARK + ct->mark = exp->master->mark; +#endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + ct->secmark = exp->master->secmark; +#endif + nf_conntrack_get(&ct->master->ct_general); + NF_CT_STAT_INC(net, expect_new); + } else { + __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); + NF_CT_STAT_INC(net, new); + } + + /* Overload tuple linked list to put us in unconfirmed list. */ + hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &net->ct.unconfirmed); + + spin_unlock_bh(&nf_conntrack_lock); + + if (exp) { + if (exp->expectfn) + exp->expectfn(ct, exp); + nf_ct_expect_put(exp); + } + + return &ct->tuplehash[IP_CT_DIR_ORIGINAL]; +} + +/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ +static inline struct nf_conn * +resolve_normal_ct(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, + unsigned int dataoff, + u_int16_t l3num, + u_int8_t protonum, + struct nf_conntrack_l3proto *l3proto, + struct nf_conntrack_l4proto *l4proto, + int *set_reply, + enum ip_conntrack_info *ctinfo) +{ + struct nf_conntrack_tuple tuple; + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; + u32 hash; + + if (!nf_ct_get_tuple(skb, skb_network_offset(skb), + dataoff, l3num, protonum, &tuple, l3proto, + l4proto)) { + pr_debug("resolve_normal_ct: Can't get tuple\n"); + return NULL; + } + + /* look for tuple match */ + hash = hash_conntrack_raw(&tuple, zone); + h = __nf_conntrack_find_get(net, zone, &tuple, hash); + if (!h) { + h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, + skb, dataoff, hash); + if (!h) + return NULL; + if (IS_ERR(h)) + return (void *)h; + } + ct = nf_ct_tuplehash_to_ctrack(h); + + /* It exists; we have (non-exclusive) reference. */ + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { + *ctinfo = IP_CT_ESTABLISHED_REPLY; + /* Please set reply bit if this packet OK */ + *set_reply = 1; + } else { + /* Once we've had two way comms, always ESTABLISHED. */ + if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + pr_debug("nf_conntrack_in: normal packet for %p\n", ct); + *ctinfo = IP_CT_ESTABLISHED; + } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { + pr_debug("nf_conntrack_in: related packet for %p\n", + ct); + *ctinfo = IP_CT_RELATED; + } else { + pr_debug("nf_conntrack_in: new packet for %p\n", ct); + *ctinfo = IP_CT_NEW; + } + *set_reply = 0; + } + skb->nfct = &ct->ct_general; + skb->nfctinfo = *ctinfo; + return ct; +} + +unsigned int +nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, + struct sk_buff *skb) +{ + struct nf_conn *ct, *tmpl = NULL; + enum ip_conntrack_info ctinfo; + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_l4proto *l4proto; + unsigned int dataoff; + u_int8_t protonum; + int set_reply = 0; + int ret; + + if (skb->nfct) { + /* Previously seen (loopback or untracked)? Ignore. 
*/ + tmpl = (struct nf_conn *)skb->nfct; + if (!nf_ct_is_template(tmpl)) { + NF_CT_STAT_INC_ATOMIC(net, ignore); + return NF_ACCEPT; + } + skb->nfct = NULL; + } + + /* rcu_read_lock()ed by nf_hook_slow */ + l3proto = __nf_ct_l3proto_find(pf); + ret = l3proto->get_l4proto(skb, skb_network_offset(skb), + &dataoff, &protonum); + if (ret <= 0) { + pr_debug("not prepared to track yet or error occurred\n"); + NF_CT_STAT_INC_ATOMIC(net, error); + NF_CT_STAT_INC_ATOMIC(net, invalid); + ret = -ret; + goto out; + } + + l4proto = __nf_ct_l4proto_find(pf, protonum); + + /* It may be an special packet, error, unclean... + * inverse of the return code tells to the netfilter + * core what to do with the packet. */ + if (l4proto->error != NULL) { + ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo, + pf, hooknum); + if (ret <= 0) { + NF_CT_STAT_INC_ATOMIC(net, error); + NF_CT_STAT_INC_ATOMIC(net, invalid); + ret = -ret; + goto out; + } + /* ICMP[v6] protocol trackers may assign one conntrack. */ + if (skb->nfct) + goto out; + } + + ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, + l3proto, l4proto, &set_reply, &ctinfo); + if (!ct) { + /* Not valid part of a connection */ + NF_CT_STAT_INC_ATOMIC(net, invalid); + ret = NF_ACCEPT; + goto out; + } + + if (IS_ERR(ct)) { + /* Too stressed to deal. */ + NF_CT_STAT_INC_ATOMIC(net, drop); + ret = NF_DROP; + goto out; + } + + NF_CT_ASSERT(skb->nfct); + + ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum); + if (ret <= 0) { + /* Invalid: inverse of the return code tells + * the netfilter core what to do */ + pr_debug("nf_conntrack_in: Can't track with proto module\n"); + nf_conntrack_put(skb->nfct); + skb->nfct = NULL; + NF_CT_STAT_INC_ATOMIC(net, invalid); + if (ret == -NF_DROP) + NF_CT_STAT_INC_ATOMIC(net, drop); + ret = -ret; + goto out; + } + + if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) + nf_conntrack_event_cache(IPCT_REPLY, ct); +out: + if (tmpl) { + /* Special case: we have to repeat this hook, assign the + * template again to this packet. We assume that this packet + * has no conntrack assigned. This is used by nf_ct_tcp. */ + if (ret == NF_REPEAT) + skb->nfct = (struct nf_conntrack *)tmpl; + else + nf_ct_put(tmpl); + } + + return ret; +} +EXPORT_SYMBOL_GPL(nf_conntrack_in); + +bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse, + const struct nf_conntrack_tuple *orig) +{ + bool ret; + + rcu_read_lock(); + ret = nf_ct_invert_tuple(inverse, orig, + __nf_ct_l3proto_find(orig->src.l3num), + __nf_ct_l4proto_find(orig->src.l3num, + orig->dst.protonum)); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr); + +/* Alter reply tuple (maybe alter helper). 
This is for NAT, and is + implicitly racy: see __nf_conntrack_confirm */ +void nf_conntrack_alter_reply(struct nf_conn *ct, + const struct nf_conntrack_tuple *newreply) +{ + struct nf_conn_help *help = nfct_help(ct); + + /* Should be unconfirmed, so not in hash table yet */ + NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); + + pr_debug("Altering reply tuple of %p to ", ct); + nf_ct_dump_tuple(newreply); + + ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; + if (ct->master || (help && !hlist_empty(&help->expectations))) + return; + + rcu_read_lock(); + __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply); + +/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ +void __nf_ct_refresh_acct(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb, + unsigned long extra_jiffies, + int do_acct) +{ + NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); + NF_CT_ASSERT(skb); + + /* Only update if this is not a fixed timeout */ + if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) + goto acct; + + /* If not in hash table, timer will not be active yet */ + if (!nf_ct_is_confirmed(ct)) { + ct->timeout.expires = extra_jiffies; + } else { + unsigned long newtime = jiffies + extra_jiffies; + + /* Only update the timeout if the new timeout is at least + HZ jiffies from the old timeout. Need del_timer for race + avoidance (may already be dying). */ + if (newtime - ct->timeout.expires >= HZ) + mod_timer_pending(&ct->timeout, newtime); + } + +acct: + if (do_acct) { + struct nf_conn_counter *acct; + + acct = nf_conn_acct_find(ct); + if (acct) { + spin_lock_bh(&ct->lock); + acct[CTINFO2DIR(ctinfo)].packets++; + acct[CTINFO2DIR(ctinfo)].bytes += skb->len; + spin_unlock_bh(&ct->lock); + } + } +} +EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); + +bool __nf_ct_kill_acct(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb, + int do_acct) +{ + if (do_acct) { + struct nf_conn_counter *acct; + + acct = nf_conn_acct_find(ct); + if (acct) { + spin_lock_bh(&ct->lock); + acct[CTINFO2DIR(ctinfo)].packets++; + acct[CTINFO2DIR(ctinfo)].bytes += + skb->len - skb_network_offset(skb); + spin_unlock_bh(&ct->lock); + } + } + + if (del_timer(&ct->timeout)) { + ct->timeout.function((unsigned long)ct); + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(__nf_ct_kill_acct); + +#ifdef CONFIG_NF_CONNTRACK_ZONES +static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = { + .len = sizeof(struct nf_conntrack_zone), + .align = __alignof__(struct nf_conntrack_zone), + .id = NF_CT_EXT_ZONE, +}; +#endif + +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include +#include +#include + +/* Generic function for tcp/udp/sctp/dccp and alike. 
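[Illustration, not part of the patch] __nf_ct_refresh_acct() above only rearms the timer when the new expiry is at least HZ jiffies beyond the current one, so a busy flow does not touch the timer on every packet. A tiny standalone model of that hysteresis; HZ and the jiffies values are illustrative:

#include <stdio.h>

#define HZ 100

static unsigned long timeout_expires;

static int maybe_extend(unsigned long jiffies, unsigned long extra)
{
        unsigned long newtime = jiffies + extra;

        if (newtime - timeout_expires >= HZ) {
                timeout_expires = newtime;
                return 1;       /* mod_timer_pending() would run */
        }
        return 0;               /* close enough, skip the timer update */
}

int main(void)
{
        timeout_expires = 1000;
        printf("at 990,  +20:  updated=%d (expires=%lu)\n",
               maybe_extend(990, 20), timeout_expires);
        printf("at 995,  +200: updated=%d (expires=%lu)\n",
               maybe_extend(995, 200), timeout_expires);
        return 0;
}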
This needs to be + * in ip_conntrack_core, since we don't want the protocols to autoload + * or depend on ctnetlink */ +int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) +{ + NLA_PUT_BE16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port); + NLA_PUT_BE16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port); + return 0; + +nla_put_failure: + return -1; +} +EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr); + +const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = { + [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 }, + [CTA_PROTO_DST_PORT] = { .type = NLA_U16 }, +}; +EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy); + +int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], + struct nf_conntrack_tuple *t) +{ + if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT]) + return -EINVAL; + + t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); + t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple); + +int nf_ct_port_nlattr_tuple_size(void) +{ + return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); +} +EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); +#endif + +/* Used by ipt_REJECT and ip6t_REJECT. */ +static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + /* This ICMP is in reverse direction to the packet which caused it */ + ct = nf_ct_get(skb, &ctinfo); + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) + ctinfo = IP_CT_RELATED_REPLY; + else + ctinfo = IP_CT_RELATED; + + /* Attach to new skbuff, and increment count */ + nskb->nfct = &ct->ct_general; + nskb->nfctinfo = ctinfo; + nf_conntrack_get(nskb->nfct); +} + +/* Bring out ya dead! */ +static struct nf_conn * +get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), + void *data, unsigned int *bucket) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + struct hlist_nulls_node *n; + + spin_lock_bh(&nf_conntrack_lock); + for (; *bucket < net->ct.htable_size; (*bucket)++) { + hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { + ct = nf_ct_tuplehash_to_ctrack(h); + if (iter(ct, data)) + goto found; + } + } + hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) { + ct = nf_ct_tuplehash_to_ctrack(h); + if (iter(ct, data)) + set_bit(IPS_DYING_BIT, &ct->status); + } + spin_unlock_bh(&nf_conntrack_lock); + return NULL; +found: + atomic_inc(&ct->ct_general.use); + spin_unlock_bh(&nf_conntrack_lock); + return ct; +} + +void nf_ct_iterate_cleanup(struct net *net, + int (*iter)(struct nf_conn *i, void *data), + void *data) +{ + struct nf_conn *ct; + unsigned int bucket = 0; + + while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { + /* Time to push up daises... */ + if (del_timer(&ct->timeout)) + death_by_timeout((unsigned long)ct); + /* ... else the timer will get him soon. 
*/ + + nf_ct_put(ct); + } +} +EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup); + +struct __nf_ct_flush_report { + u32 pid; + int report; +}; + +static int kill_report(struct nf_conn *i, void *data) +{ + struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data; + struct nf_conn_tstamp *tstamp; + + tstamp = nf_conn_tstamp_find(i); + if (tstamp && tstamp->stop == 0) + tstamp->stop = ktime_to_ns(ktime_get_real()); + + /* If we fail to deliver the event, death_by_timeout() will retry */ + if (nf_conntrack_event_report(IPCT_DESTROY, i, + fr->pid, fr->report) < 0) + return 1; + + /* Avoid the delivery of the destroy event in death_by_timeout(). */ + set_bit(IPS_DYING_BIT, &i->status); + return 1; +} + +static int kill_all(struct nf_conn *i, void *data) +{ + return 1; +} + +void nf_ct_free_hashtable(void *hash, unsigned int size) +{ + if (is_vmalloc_addr(hash)) + vfree(hash); + else + free_pages((unsigned long)hash, + get_order(sizeof(struct hlist_head) * size)); +} +EXPORT_SYMBOL_GPL(nf_ct_free_hashtable); + +void nf_conntrack_flush_report(struct net *net, u32 pid, int report) +{ + struct __nf_ct_flush_report fr = { + .pid = pid, + .report = report, + }; + nf_ct_iterate_cleanup(net, kill_report, &fr); +} +EXPORT_SYMBOL_GPL(nf_conntrack_flush_report); + +static void nf_ct_release_dying_list(struct net *net) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + struct hlist_nulls_node *n; + + spin_lock_bh(&nf_conntrack_lock); + hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) { + ct = nf_ct_tuplehash_to_ctrack(h); + /* never fails to remove them, no listeners at this point */ + nf_ct_kill(ct); + } + spin_unlock_bh(&nf_conntrack_lock); +} + +static int untrack_refs(void) +{ + int cnt = 0, cpu; + + for_each_possible_cpu(cpu) { + struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu); + + cnt += atomic_read(&ct->ct_general.use) - 1; + } + return cnt; +} + +static void nf_conntrack_cleanup_init_net(void) +{ + while (untrack_refs() > 0) + schedule(); + + nf_conntrack_helper_fini(); + nf_conntrack_proto_fini(); +#ifdef CONFIG_NF_CONNTRACK_ZONES + nf_ct_extend_unregister(&nf_ct_zone_extend); +#endif +} + +static void nf_conntrack_cleanup_net(struct net *net) +{ + i_see_dead_people: + nf_ct_iterate_cleanup(net, kill_all, NULL); + nf_ct_release_dying_list(net); + if (atomic_read(&net->ct.count) != 0) { + schedule(); + goto i_see_dead_people; + } + + nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); + nf_conntrack_ecache_fini(net); + nf_conntrack_tstamp_fini(net); + nf_conntrack_acct_fini(net); + nf_conntrack_expect_fini(net); + kmem_cache_destroy(net->ct.nf_conntrack_cachep); + kfree(net->ct.slabname); + free_percpu(net->ct.stat); +} + +/* Mishearing the voices in his head, our hero wonders how he's + supposed to kill the mall. */ +void nf_conntrack_cleanup(struct net *net) +{ + if (net_eq(net, &init_net)) + rcu_assign_pointer(ip_ct_attach, NULL); + + /* This makes sure all current packets have passed through + netfilter framework. Roll on, two-stage module + delete... 
*/ + synchronize_net(); + + nf_conntrack_cleanup_net(net); + + if (net_eq(net, &init_net)) { + rcu_assign_pointer(nf_ct_destroy, NULL); + nf_conntrack_cleanup_init_net(); + } +} + +void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) +{ + struct hlist_nulls_head *hash; + unsigned int nr_slots, i; + size_t sz; + + BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); + nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); + sz = nr_slots * sizeof(struct hlist_nulls_head); + hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, + get_order(sz)); + if (!hash) { + printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n"); + hash = __vmalloc(sz, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); + } + + if (hash && nulls) + for (i = 0; i < nr_slots; i++) + INIT_HLIST_NULLS_HEAD(&hash[i], i); + + return hash; +} +EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); + +int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) +{ + int i, bucket; + unsigned int hashsize, old_size; + struct hlist_nulls_head *hash, *old_hash; + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + + if (current->nsproxy->net_ns != &init_net) + return -EOPNOTSUPP; + + /* On boot, we can set this without any fancy locking. */ + if (!nf_conntrack_htable_size) + return param_set_uint(val, kp); + + hashsize = simple_strtoul(val, NULL, 0); + if (!hashsize) + return -EINVAL; + + hash = nf_ct_alloc_hashtable(&hashsize, 1); + if (!hash) + return -ENOMEM; + + /* Lookups in the old hash might happen in parallel, which means we + * might get false negatives during connection lookup. New connections + * created because of a false negative won't make it into the hash + * though since that required taking the lock. + */ + spin_lock_bh(&nf_conntrack_lock); + for (i = 0; i < init_net.ct.htable_size; i++) { + while (!hlist_nulls_empty(&init_net.ct.hash[i])) { + h = hlist_nulls_entry(init_net.ct.hash[i].first, + struct nf_conntrack_tuple_hash, hnnode); + ct = nf_ct_tuplehash_to_ctrack(h); + hlist_nulls_del_rcu(&h->hnnode); + bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct), + hashsize); + hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); + } + } + old_size = init_net.ct.htable_size; + old_hash = init_net.ct.hash; + + init_net.ct.htable_size = nf_conntrack_htable_size = hashsize; + init_net.ct.hash = hash; + spin_unlock_bh(&nf_conntrack_lock); + + nf_ct_free_hashtable(old_hash, old_size); + return 0; +} +EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); + +module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, + &nf_conntrack_htable_size, 0600); + +void nf_ct_untracked_status_or(unsigned long bits) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(nf_conntrack_untracked, cpu).status |= bits; +} +EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or); + +static int nf_conntrack_init_init_net(void) +{ + int max_factor = 8; + int ret, cpu; + + /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB + * machine has 512 buckets. >= 1GB machines have 16384 buckets. */ + if (!nf_conntrack_htable_size) { + nf_conntrack_htable_size + = (((totalram_pages << PAGE_SHIFT) / 16384) + / sizeof(struct hlist_head)); + if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) + nf_conntrack_htable_size = 16384; + if (nf_conntrack_htable_size < 32) + nf_conntrack_htable_size = 32; + + /* Use a max. factor of four by default to get the same max as + * with the old struct list_heads. 
When a table size is given + * we use the old value of 8 to avoid reducing the max. + * entries. */ + max_factor = 4; + } + nf_conntrack_max = max_factor * nf_conntrack_htable_size; + + printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n", + NF_CONNTRACK_VERSION, nf_conntrack_htable_size, + nf_conntrack_max); + + ret = nf_conntrack_proto_init(); + if (ret < 0) + goto err_proto; + + ret = nf_conntrack_helper_init(); + if (ret < 0) + goto err_helper; + +#ifdef CONFIG_NF_CONNTRACK_ZONES + ret = nf_ct_extend_register(&nf_ct_zone_extend); + if (ret < 0) + goto err_extend; +#endif + /* Set up fake conntrack: to never be deleted, not in any hashes */ + for_each_possible_cpu(cpu) { + struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu); + write_pnet(&ct->ct_net, &init_net); + atomic_set(&ct->ct_general.use, 1); + } + /* - and look it like as a confirmed connection */ + nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED); + return 0; + +#ifdef CONFIG_NF_CONNTRACK_ZONES +err_extend: + nf_conntrack_helper_fini(); +#endif +err_helper: + nf_conntrack_proto_fini(); +err_proto: + return ret; +} + +/* + * We need to use special "null" values, not used in hash table + */ +#define UNCONFIRMED_NULLS_VAL ((1<<30)+0) +#define DYING_NULLS_VAL ((1<<30)+1) + +static int nf_conntrack_init_net(struct net *net) +{ + int ret; + + atomic_set(&net->ct.count, 0); + INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL); + INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL); + net->ct.stat = alloc_percpu(struct ip_conntrack_stat); + if (!net->ct.stat) { + ret = -ENOMEM; + goto err_stat; + } + + net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net); + if (!net->ct.slabname) { + ret = -ENOMEM; + goto err_slabname; + } + + net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname, + sizeof(struct nf_conn), 0, + SLAB_DESTROY_BY_RCU, NULL); + if (!net->ct.nf_conntrack_cachep) { + printk(KERN_ERR "Unable to create nf_conn slab cache\n"); + ret = -ENOMEM; + goto err_cache; + } + + net->ct.htable_size = nf_conntrack_htable_size; + net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1); + if (!net->ct.hash) { + ret = -ENOMEM; + printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); + goto err_hash; + } + ret = nf_conntrack_expect_init(net); + if (ret < 0) + goto err_expect; + ret = nf_conntrack_acct_init(net); + if (ret < 0) + goto err_acct; + ret = nf_conntrack_tstamp_init(net); + if (ret < 0) + goto err_tstamp; + ret = nf_conntrack_ecache_init(net); + if (ret < 0) + goto err_ecache; + + return 0; + +err_ecache: + nf_conntrack_tstamp_fini(net); +err_tstamp: + nf_conntrack_acct_fini(net); +err_acct: + nf_conntrack_expect_fini(net); +err_expect: + nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); +err_hash: + kmem_cache_destroy(net->ct.nf_conntrack_cachep); +err_cache: + kfree(net->ct.slabname); +err_slabname: + free_percpu(net->ct.stat); +err_stat: + return ret; +} + +s16 (*nf_ct_nat_offset)(const struct nf_conn *ct, + enum ip_conntrack_dir dir, + u32 seq); +EXPORT_SYMBOL_GPL(nf_ct_nat_offset); + +int nf_conntrack_init(struct net *net) +{ + int ret; + + if (net_eq(net, &init_net)) { + ret = nf_conntrack_init_init_net(); + if (ret < 0) + goto out_init_net; + } + ret = nf_conntrack_init_net(net); + if (ret < 0) + goto out_net; + + if (net_eq(net, &init_net)) { + /* For use by REJECT target */ + rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach); + rcu_assign_pointer(nf_ct_destroy, destroy_conntrack); + + /* Howto get NAT offsets */ + 
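[Illustration, not part of the patch] As a worked example of the table-sizing heuristic in nf_conntrack_init_init_net() above, assuming 8-byte hash-list heads on a 64-bit machine; the RAM sizes are examples only, and the auto-sized case uses max_factor = 4:

#include <stdio.h>

static unsigned int htable_size_for(unsigned long long ram_bytes,
                                    unsigned int ptr_size)
{
        /* 1/16384 of memory, expressed in hlist_head slots */
        unsigned long long size = ram_bytes / 16384 / ptr_size;

        if (ram_bytes > 1024ULL * 1024 * 1024)  /* above 1 GB: clamp */
                size = 16384;
        if (size < 32)
                size = 32;
        return (unsigned int)size;
}

int main(void)
{
        unsigned long long sizes[] = { 128ULL << 20, 512ULL << 20, 4ULL << 30 };

        for (int i = 0; i < 3; i++) {
                unsigned int buckets = htable_size_for(sizes[i], 8);

                printf("%5llu MB RAM -> %5u buckets, nf_conntrack_max = %u\n",
                       sizes[i] >> 20, buckets, 4 * buckets);
        }
        return 0;
}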
rcu_assign_pointer(nf_ct_nat_offset, NULL); + } + return 0; + +out_net: + if (net_eq(net, &init_net)) + nf_conntrack_cleanup_init_net(); +out_init_net: + return ret; +} diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c new file mode 100644 index 00000000..63a1b915 --- /dev/null +++ b/net/netfilter/nf_conntrack_ecache.c @@ -0,0 +1,266 @@ +/* Event cache for netfilter. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team + * (C) 2003,2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static DEFINE_MUTEX(nf_ct_ecache_mutex); + +struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb __read_mostly; +EXPORT_SYMBOL_GPL(nf_conntrack_event_cb); + +struct nf_exp_event_notifier __rcu *nf_expect_event_cb __read_mostly; +EXPORT_SYMBOL_GPL(nf_expect_event_cb); + +/* deliver cached events and clear cache entry - must be called with locally + * disabled softirqs */ +void nf_ct_deliver_cached_events(struct nf_conn *ct) +{ + unsigned long events; + struct nf_ct_event_notifier *notify; + struct nf_conntrack_ecache *e; + + rcu_read_lock(); + notify = rcu_dereference(nf_conntrack_event_cb); + if (notify == NULL) + goto out_unlock; + + e = nf_ct_ecache_find(ct); + if (e == NULL) + goto out_unlock; + + events = xchg(&e->cache, 0); + + if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct) && events) { + struct nf_ct_event item = { + .ct = ct, + .pid = 0, + .report = 0 + }; + int ret; + /* We make a copy of the missed event cache without taking + * the lock, thus we may send missed events twice. However, + * this does not harm and it happens very rarely. 
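[Illustration, not part of the patch] nf_ct_deliver_cached_events() above (its body continues just below) collects and clears the pending event bits atomically, merges them with bits whose earlier delivery failed, and stores anything that fails again back into the "missed" mask so no event is lost. A compact userspace sketch of that collect/retry bookkeeping; deliver() is a stub standing in for the registered notifier:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct ecache {
        atomic_ulong cache;     /* bits queued since the last delivery */
        unsigned long missed;   /* bits a listener failed to receive */
};

static bool deliver(unsigned long events)
{
        printf("delivering events 0x%lx\n", events);
        return false;           /* pretend the listener's queue was full */
}

static void deliver_cached_events(struct ecache *e)
{
        unsigned long events = atomic_exchange(&e->cache, 0);
        unsigned long missed = e->missed;

        if (!(events | missed))
                return;

        if (!deliver(events | missed))
                e->missed |= events;    /* keep them for a later retry */
        else
                e->missed &= ~missed;   /* old backlog finally went out */
}

int main(void)
{
        struct ecache e = { .cache = 0x5, .missed = 0 };

        deliver_cached_events(&e);
        printf("missed after failed delivery: 0x%lx\n", e.missed);
        return 0;
}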
*/ + unsigned long missed = e->missed; + + if (!((events | missed) & e->ctmask)) + goto out_unlock; + + ret = notify->fcn(events | missed, &item); + if (unlikely(ret < 0 || missed)) { + spin_lock_bh(&ct->lock); + if (ret < 0) + e->missed |= events; + else + e->missed &= ~missed; + spin_unlock_bh(&ct->lock); + } + } + +out_unlock: + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); + +int nf_conntrack_register_notifier(struct nf_ct_event_notifier *new) +{ + int ret = 0; + struct nf_ct_event_notifier *notify; + + mutex_lock(&nf_ct_ecache_mutex); + notify = rcu_dereference_protected(nf_conntrack_event_cb, + lockdep_is_held(&nf_ct_ecache_mutex)); + if (notify != NULL) { + ret = -EBUSY; + goto out_unlock; + } + rcu_assign_pointer(nf_conntrack_event_cb, new); + mutex_unlock(&nf_ct_ecache_mutex); + return ret; + +out_unlock: + mutex_unlock(&nf_ct_ecache_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier); + +void nf_conntrack_unregister_notifier(struct nf_ct_event_notifier *new) +{ + struct nf_ct_event_notifier *notify; + + mutex_lock(&nf_ct_ecache_mutex); + notify = rcu_dereference_protected(nf_conntrack_event_cb, + lockdep_is_held(&nf_ct_ecache_mutex)); + BUG_ON(notify != new); + rcu_assign_pointer(nf_conntrack_event_cb, NULL); + mutex_unlock(&nf_ct_ecache_mutex); +} +EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); + +int nf_ct_expect_register_notifier(struct nf_exp_event_notifier *new) +{ + int ret = 0; + struct nf_exp_event_notifier *notify; + + mutex_lock(&nf_ct_ecache_mutex); + notify = rcu_dereference_protected(nf_expect_event_cb, + lockdep_is_held(&nf_ct_ecache_mutex)); + if (notify != NULL) { + ret = -EBUSY; + goto out_unlock; + } + rcu_assign_pointer(nf_expect_event_cb, new); + mutex_unlock(&nf_ct_ecache_mutex); + return ret; + +out_unlock: + mutex_unlock(&nf_ct_ecache_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier); + +void nf_ct_expect_unregister_notifier(struct nf_exp_event_notifier *new) +{ + struct nf_exp_event_notifier *notify; + + mutex_lock(&nf_ct_ecache_mutex); + notify = rcu_dereference_protected(nf_expect_event_cb, + lockdep_is_held(&nf_ct_ecache_mutex)); + BUG_ON(notify != new); + rcu_assign_pointer(nf_expect_event_cb, NULL); + mutex_unlock(&nf_ct_ecache_mutex); +} +EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier); + +#define NF_CT_EVENTS_DEFAULT 1 +static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT; +static int nf_ct_events_retry_timeout __read_mostly = 15*HZ; + +#ifdef CONFIG_SYSCTL +static struct ctl_table event_sysctl_table[] = { + { + .procname = "nf_conntrack_events", + .data = &init_net.ct.sysctl_events, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "nf_conntrack_events_retry_timeout", + .data = &init_net.ct.sysctl_events_retry_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + {} +}; +#endif /* CONFIG_SYSCTL */ + +static struct nf_ct_ext_type event_extend __read_mostly = { + .len = sizeof(struct nf_conntrack_ecache), + .align = __alignof__(struct nf_conntrack_ecache), + .id = NF_CT_EXT_ECACHE, +}; + +#ifdef CONFIG_SYSCTL +static int nf_conntrack_event_init_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = kmemdup(event_sysctl_table, sizeof(event_sysctl_table), + GFP_KERNEL); + if (!table) + goto out; + + table[0].data = &net->ct.sysctl_events; + table[1].data = &net->ct.sysctl_events_retry_timeout; + + net->ct.event_sysctl_header = + 
register_net_sysctl_table(net, + nf_net_netfilter_sysctl_path, table); + if (!net->ct.event_sysctl_header) { + printk(KERN_ERR "nf_ct_event: can't register to sysctl.\n"); + goto out_register; + } + return 0; + +out_register: + kfree(table); +out: + return -ENOMEM; +} + +static void nf_conntrack_event_fini_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = net->ct.event_sysctl_header->ctl_table_arg; + unregister_net_sysctl_table(net->ct.event_sysctl_header); + kfree(table); +} +#else +static int nf_conntrack_event_init_sysctl(struct net *net) +{ + return 0; +} + +static void nf_conntrack_event_fini_sysctl(struct net *net) +{ +} +#endif /* CONFIG_SYSCTL */ + +int nf_conntrack_ecache_init(struct net *net) +{ + int ret; + + net->ct.sysctl_events = nf_ct_events; + net->ct.sysctl_events_retry_timeout = nf_ct_events_retry_timeout; + + if (net_eq(net, &init_net)) { + ret = nf_ct_extend_register(&event_extend); + if (ret < 0) { + printk(KERN_ERR "nf_ct_event: Unable to register " + "event extension.\n"); + goto out_extend_register; + } + } + + ret = nf_conntrack_event_init_sysctl(net); + if (ret < 0) + goto out_sysctl; + + return 0; + +out_sysctl: + if (net_eq(net, &init_net)) + nf_ct_extend_unregister(&event_extend); +out_extend_register: + return ret; +} + +void nf_conntrack_ecache_fini(struct net *net) +{ + nf_conntrack_event_fini_sysctl(net); + if (net_eq(net, &init_net)) + nf_ct_extend_unregister(&event_extend); +} diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c new file mode 100644 index 00000000..cd1e8e09 --- /dev/null +++ b/net/netfilter/nf_conntrack_expect.c @@ -0,0 +1,677 @@ +/* Expectation handling for nf_conntrack. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team + * (C) 2003,2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +unsigned int nf_ct_expect_hsize __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_expect_hsize); + +unsigned int nf_ct_expect_max __read_mostly; + +static struct kmem_cache *nf_ct_expect_cachep __read_mostly; + +static HLIST_HEAD(nf_ct_userspace_expect_list); + +/* nf_conntrack_expect helper functions */ +void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, + u32 pid, int report) +{ + struct nf_conn_help *master_help = nfct_help(exp->master); + struct net *net = nf_ct_exp_net(exp); + + NF_CT_ASSERT(!timer_pending(&exp->timeout)); + + hlist_del_rcu(&exp->hnode); + net->ct.expect_count--; + + hlist_del(&exp->lnode); + if (!(exp->flags & NF_CT_EXPECT_USERSPACE)) + master_help->expecting[exp->class]--; + + nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report); + nf_ct_expect_put(exp); + + NF_CT_STAT_INC(net, expect_delete); +} +EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report); + +static void nf_ct_expectation_timed_out(unsigned long ul_expect) +{ + struct nf_conntrack_expect *exp = (void *)ul_expect; + + spin_lock_bh(&nf_conntrack_lock); + nf_ct_unlink_expect(exp); + spin_unlock_bh(&nf_conntrack_lock); + nf_ct_expect_put(exp); +} + +static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple) +{ + unsigned int hash; + + if (unlikely(!nf_conntrack_hash_rnd)) { + init_nf_conntrack_hash_rnd(); + } + + hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all), + (((tuple->dst.protonum ^ tuple->src.l3num) << 16) | + (__force __u16)tuple->dst.u.all) ^ nf_conntrack_hash_rnd); + return ((u64)hash * nf_ct_expect_hsize) >> 32; +} + +struct nf_conntrack_expect * +__nf_ct_expect_find(struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple) +{ + struct nf_conntrack_expect *i; + struct hlist_node *n; + unsigned int h; + + if (!net->ct.expect_count) + return NULL; + + h = nf_ct_expect_dst_hash(tuple); + hlist_for_each_entry_rcu(i, n, &net->ct.expect_hash[h], hnode) { + if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && + nf_ct_zone(i->master) == zone) + return i; + } + return NULL; +} +EXPORT_SYMBOL_GPL(__nf_ct_expect_find); + +/* Just find a expectation corresponding to a tuple. */ +struct nf_conntrack_expect * +nf_ct_expect_find_get(struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple) +{ + struct nf_conntrack_expect *i; + + rcu_read_lock(); + i = __nf_ct_expect_find(net, zone, tuple); + if (i && !atomic_inc_not_zero(&i->use)) + i = NULL; + rcu_read_unlock(); + + return i; +} +EXPORT_SYMBOL_GPL(nf_ct_expect_find_get); + +/* If an expectation for this connection is found, it gets delete from + * global list then returned. */ +struct nf_conntrack_expect * +nf_ct_find_expectation(struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple) +{ + struct nf_conntrack_expect *i, *exp = NULL; + struct hlist_node *n; + unsigned int h; + + if (!net->ct.expect_count) + return NULL; + + h = nf_ct_expect_dst_hash(tuple); + hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) { + if (!(i->flags & NF_CT_EXPECT_INACTIVE) && + nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && + nf_ct_zone(i->master) == zone) { + exp = i; + break; + } + } + if (!exp) + return NULL; + + /* If master is not in hash table yet (ie. packet hasn't left + this machine yet), how can other end know about expected? 
+ Hence these are not the droids you are looking for (if + master ct never got confirmed, we'd hold a reference to it + and weird things would happen to future packets). */ + if (!nf_ct_is_confirmed(exp->master)) + return NULL; + + if (exp->flags & NF_CT_EXPECT_PERMANENT) { + atomic_inc(&exp->use); + return exp; + } else if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + return exp; + } + + return NULL; +} + +/* delete all expectations for this conntrack */ +void nf_ct_remove_expectations(struct nf_conn *ct) +{ + struct nf_conn_help *help = nfct_help(ct); + struct nf_conntrack_expect *exp; + struct hlist_node *n, *next; + + /* Optimization: most connection never expect any others. */ + if (!help) + return; + + hlist_for_each_entry_safe(exp, n, next, &help->expectations, lnode) { + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + nf_ct_expect_put(exp); + } + } +} +EXPORT_SYMBOL_GPL(nf_ct_remove_expectations); + +/* Would two expected things clash? */ +static inline int expect_clash(const struct nf_conntrack_expect *a, + const struct nf_conntrack_expect *b) +{ + /* Part covered by intersection of masks must be unequal, + otherwise they clash */ + struct nf_conntrack_tuple_mask intersect_mask; + int count; + + intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all; + + for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){ + intersect_mask.src.u3.all[count] = + a->mask.src.u3.all[count] & b->mask.src.u3.all[count]; + } + + return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask); +} + +static inline int expect_matches(const struct nf_conntrack_expect *a, + const struct nf_conntrack_expect *b) +{ + return a->master == b->master && a->class == b->class && + nf_ct_tuple_equal(&a->tuple, &b->tuple) && + nf_ct_tuple_mask_equal(&a->mask, &b->mask) && + nf_ct_zone(a->master) == nf_ct_zone(b->master); +} + +/* Generally a bad idea to call this: could have matched already. */ +void nf_ct_unexpect_related(struct nf_conntrack_expect *exp) +{ + spin_lock_bh(&nf_conntrack_lock); + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + nf_ct_expect_put(exp); + } + spin_unlock_bh(&nf_conntrack_lock); +} +EXPORT_SYMBOL_GPL(nf_ct_unexpect_related); + +/* We don't increase the master conntrack refcount for non-fulfilled + * conntracks. 
During the conntrack destruction, the expectations are + * always killed before the conntrack itself */ +struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me) +{ + struct nf_conntrack_expect *new; + + new = kmem_cache_alloc(nf_ct_expect_cachep, GFP_ATOMIC); + if (!new) + return NULL; + + new->master = me; + atomic_set(&new->use, 1); + return new; +} +EXPORT_SYMBOL_GPL(nf_ct_expect_alloc); + +void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class, + u_int8_t family, + const union nf_inet_addr *saddr, + const union nf_inet_addr *daddr, + u_int8_t proto, const __be16 *src, const __be16 *dst) +{ + int len; + + if (family == AF_INET) + len = 4; + else + len = 16; + + exp->flags = 0; + exp->class = class; + exp->expectfn = NULL; + exp->helper = NULL; + exp->tuple.src.l3num = family; + exp->tuple.dst.protonum = proto; + + if (saddr) { + memcpy(&exp->tuple.src.u3, saddr, len); + if (sizeof(exp->tuple.src.u3) > len) + /* address needs to be cleared for nf_ct_tuple_equal */ + memset((void *)&exp->tuple.src.u3 + len, 0x00, + sizeof(exp->tuple.src.u3) - len); + memset(&exp->mask.src.u3, 0xFF, len); + if (sizeof(exp->mask.src.u3) > len) + memset((void *)&exp->mask.src.u3 + len, 0x00, + sizeof(exp->mask.src.u3) - len); + } else { + memset(&exp->tuple.src.u3, 0x00, sizeof(exp->tuple.src.u3)); + memset(&exp->mask.src.u3, 0x00, sizeof(exp->mask.src.u3)); + } + + if (src) { + exp->tuple.src.u.all = *src; + exp->mask.src.u.all = htons(0xFFFF); + } else { + exp->tuple.src.u.all = 0; + exp->mask.src.u.all = 0; + } + + memcpy(&exp->tuple.dst.u3, daddr, len); + if (sizeof(exp->tuple.dst.u3) > len) + /* address needs to be cleared for nf_ct_tuple_equal */ + memset((void *)&exp->tuple.dst.u3 + len, 0x00, + sizeof(exp->tuple.dst.u3) - len); + + exp->tuple.dst.u.all = *dst; +} +EXPORT_SYMBOL_GPL(nf_ct_expect_init); + +static void nf_ct_expect_free_rcu(struct rcu_head *head) +{ + struct nf_conntrack_expect *exp; + + exp = container_of(head, struct nf_conntrack_expect, rcu); + kmem_cache_free(nf_ct_expect_cachep, exp); +} + +void nf_ct_expect_put(struct nf_conntrack_expect *exp) +{ + if (atomic_dec_and_test(&exp->use)) + call_rcu(&exp->rcu, nf_ct_expect_free_rcu); +} +EXPORT_SYMBOL_GPL(nf_ct_expect_put); + +static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) +{ + struct nf_conn_help *master_help = nfct_help(exp->master); + struct net *net = nf_ct_exp_net(exp); + const struct nf_conntrack_expect_policy *p; + unsigned int h = nf_ct_expect_dst_hash(&exp->tuple); + + /* two references : one for hash insert, one for the timer */ + atomic_add(2, &exp->use); + + if (master_help) { + hlist_add_head(&exp->lnode, &master_help->expectations); + master_help->expecting[exp->class]++; + } else if (exp->flags & NF_CT_EXPECT_USERSPACE) + hlist_add_head(&exp->lnode, &nf_ct_userspace_expect_list); + + hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]); + net->ct.expect_count++; + + setup_timer(&exp->timeout, nf_ct_expectation_timed_out, + (unsigned long)exp); + if (master_help) { + p = &rcu_dereference_protected( + master_help->helper, + lockdep_is_held(&nf_conntrack_lock) + )->expect_policy[exp->class]; + exp->timeout.expires = jiffies + p->timeout * HZ; + } + add_timer(&exp->timeout); + + NF_CT_STAT_INC(net, expect_create); +} + +/* Race with expectations being used means we could have none to find; OK. 
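Expectations are added with hlist_add_head(), so the list runs newest to oldest and the last entry of the matching class is therefore the oldest one.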
*/ +static void evict_oldest_expect(struct nf_conn *master, + struct nf_conntrack_expect *new) +{ + struct nf_conn_help *master_help = nfct_help(master); + struct nf_conntrack_expect *exp, *last = NULL; + struct hlist_node *n; + + hlist_for_each_entry(exp, n, &master_help->expectations, lnode) { + if (exp->class == new->class) + last = exp; + } + + if (last && del_timer(&last->timeout)) { + nf_ct_unlink_expect(last); + nf_ct_expect_put(last); + } +} + +static inline int refresh_timer(struct nf_conntrack_expect *i) +{ + struct nf_conn_help *master_help = nfct_help(i->master); + const struct nf_conntrack_expect_policy *p; + + if (!del_timer(&i->timeout)) + return 0; + + p = &rcu_dereference_protected( + master_help->helper, + lockdep_is_held(&nf_conntrack_lock) + )->expect_policy[i->class]; + i->timeout.expires = jiffies + p->timeout * HZ; + add_timer(&i->timeout); + return 1; +} + +static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) +{ + const struct nf_conntrack_expect_policy *p; + struct nf_conntrack_expect *i; + struct nf_conn *master = expect->master; + struct nf_conn_help *master_help = nfct_help(master); + struct net *net = nf_ct_exp_net(expect); + struct hlist_node *n; + unsigned int h; + int ret = 1; + + /* Don't allow expectations created from kernel-space with no helper */ + if (!(expect->flags & NF_CT_EXPECT_USERSPACE) && + (!master_help || (master_help && !master_help->helper))) { + ret = -ESHUTDOWN; + goto out; + } + h = nf_ct_expect_dst_hash(&expect->tuple); + hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) { + if (expect_matches(i, expect)) { + /* Refresh timer: if it's dying, ignore.. */ + if (refresh_timer(i)) { + ret = 0; + goto out; + } + } else if (expect_clash(i, expect)) { + ret = -EBUSY; + goto out; + } + } + /* Will be over limit? 
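If the helper's max_expected for this class is already reached, evict the oldest expectation first and only fail with -EMFILE if the class is still full.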
*/ + if (master_help) { + p = &rcu_dereference_protected( + master_help->helper, + lockdep_is_held(&nf_conntrack_lock) + )->expect_policy[expect->class]; + if (p->max_expected && + master_help->expecting[expect->class] >= p->max_expected) { + evict_oldest_expect(master, expect); + if (master_help->expecting[expect->class] + >= p->max_expected) { + ret = -EMFILE; + goto out; + } + } + } + + if (net->ct.expect_count >= nf_ct_expect_max) { + if (net_ratelimit()) + printk(KERN_WARNING + "nf_conntrack: expectation table full\n"); + ret = -EMFILE; + } +out: + return ret; +} + +int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, + u32 pid, int report) +{ + int ret; + + spin_lock_bh(&nf_conntrack_lock); + ret = __nf_ct_expect_check(expect); + if (ret <= 0) + goto out; + + ret = 0; + nf_ct_expect_insert(expect); + spin_unlock_bh(&nf_conntrack_lock); + nf_ct_expect_event_report(IPEXP_NEW, expect, pid, report); + return ret; +out: + spin_unlock_bh(&nf_conntrack_lock); + return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_expect_related_report); + +void nf_ct_remove_userspace_expectations(void) +{ + struct nf_conntrack_expect *exp; + struct hlist_node *n, *next; + + hlist_for_each_entry_safe(exp, n, next, + &nf_ct_userspace_expect_list, lnode) { + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + nf_ct_expect_put(exp); + } + } +} +EXPORT_SYMBOL_GPL(nf_ct_remove_userspace_expectations); + +#ifdef CONFIG_PROC_FS +struct ct_expect_iter_state { + struct seq_net_private p; + unsigned int bucket; +}; + +static struct hlist_node *ct_expect_get_first(struct seq_file *seq) +{ + struct net *net = seq_file_net(seq); + struct ct_expect_iter_state *st = seq->private; + struct hlist_node *n; + + for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { + n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + if (n) + return n; + } + return NULL; +} + +static struct hlist_node *ct_expect_get_next(struct seq_file *seq, + struct hlist_node *head) +{ + struct net *net = seq_file_net(seq); + struct ct_expect_iter_state *st = seq->private; + + head = rcu_dereference(hlist_next_rcu(head)); + while (head == NULL) { + if (++st->bucket >= nf_ct_expect_hsize) + return NULL; + head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + } + return head; +} + +static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos) +{ + struct hlist_node *head = ct_expect_get_first(seq); + + if (head) + while (pos && (head = ct_expect_get_next(seq, head))) + pos--; + return pos ? NULL : head; +} + +static void *exp_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + rcu_read_lock(); + return ct_expect_get_idx(seq, *pos); +} + +static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + (*pos)++; + return ct_expect_get_next(seq, v); +} + +static void exp_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + +static int exp_seq_show(struct seq_file *s, void *v) +{ + struct nf_conntrack_expect *expect; + struct nf_conntrack_helper *helper; + struct hlist_node *n = v; + char *delim = ""; + + expect = hlist_entry(n, struct nf_conntrack_expect, hnode); + + if (expect->timeout.function) + seq_printf(s, "%ld ", timer_pending(&expect->timeout) + ? 
(long)(expect->timeout.expires - jiffies)/HZ : 0); + else + seq_printf(s, "- "); + seq_printf(s, "l3proto = %u proto=%u ", + expect->tuple.src.l3num, + expect->tuple.dst.protonum); + print_tuple(s, &expect->tuple, + __nf_ct_l3proto_find(expect->tuple.src.l3num), + __nf_ct_l4proto_find(expect->tuple.src.l3num, + expect->tuple.dst.protonum)); + + if (expect->flags & NF_CT_EXPECT_PERMANENT) { + seq_printf(s, "PERMANENT"); + delim = ","; + } + if (expect->flags & NF_CT_EXPECT_INACTIVE) { + seq_printf(s, "%sINACTIVE", delim); + delim = ","; + } + if (expect->flags & NF_CT_EXPECT_USERSPACE) + seq_printf(s, "%sUSERSPACE", delim); + + helper = rcu_dereference(nfct_help(expect->master)->helper); + if (helper) { + seq_printf(s, "%s%s", expect->flags ? " " : "", helper->name); + if (helper->expect_policy[expect->class].name) + seq_printf(s, "/%s", + helper->expect_policy[expect->class].name); + } + + return seq_putc(s, '\n'); +} + +static const struct seq_operations exp_seq_ops = { + .start = exp_seq_start, + .next = exp_seq_next, + .stop = exp_seq_stop, + .show = exp_seq_show +}; + +static int exp_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &exp_seq_ops, + sizeof(struct ct_expect_iter_state)); +} + +static const struct file_operations exp_file_ops = { + .owner = THIS_MODULE, + .open = exp_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; +#endif /* CONFIG_PROC_FS */ + +static int exp_proc_init(struct net *net) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc; + + proc = proc_net_fops_create(net, "nf_conntrack_expect", 0440, &exp_file_ops); + if (!proc) + return -ENOMEM; +#endif /* CONFIG_PROC_FS */ + return 0; +} + +static void exp_proc_remove(struct net *net) +{ +#ifdef CONFIG_PROC_FS + proc_net_remove(net, "nf_conntrack_expect"); +#endif /* CONFIG_PROC_FS */ +} + +module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400); + +int nf_conntrack_expect_init(struct net *net) +{ + int err = -ENOMEM; + + if (net_eq(net, &init_net)) { + if (!nf_ct_expect_hsize) { + nf_ct_expect_hsize = net->ct.htable_size / 256; + if (!nf_ct_expect_hsize) + nf_ct_expect_hsize = 1; + } + nf_ct_expect_max = nf_ct_expect_hsize * 4; + } + + net->ct.expect_count = 0; + net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0); + if (net->ct.expect_hash == NULL) + goto err1; + + if (net_eq(net, &init_net)) { + nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", + sizeof(struct nf_conntrack_expect), + 0, 0, NULL); + if (!nf_ct_expect_cachep) + goto err2; + } + + err = exp_proc_init(net); + if (err < 0) + goto err3; + + return 0; + +err3: + if (net_eq(net, &init_net)) + kmem_cache_destroy(nf_ct_expect_cachep); +err2: + nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); +err1: + return err; +} + +void nf_conntrack_expect_fini(struct net *net) +{ + exp_proc_remove(net); + if (net_eq(net, &init_net)) { + rcu_barrier(); /* Wait for call_rcu() before destroy */ + kmem_cache_destroy(nf_ct_expect_cachep); + } + nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); +} diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c new file mode 100644 index 00000000..05ecdc28 --- /dev/null +++ b/net/netfilter/nf_conntrack_extend.c @@ -0,0 +1,189 @@ +/* Structure dynamic extension infrastructure + * Copyright (C) 2004 Rusty Russell IBM Corporation + * Copyright (C) 2007 Netfilter Core Team + * Copyright (C) 2007 USAGI/WIDE Project + * + * This program is free software; you can 
redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include + +static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM]; +static DEFINE_MUTEX(nf_ct_ext_type_mutex); + +void __nf_ct_ext_destroy(struct nf_conn *ct) +{ + unsigned int i; + struct nf_ct_ext_type *t; + struct nf_ct_ext *ext = ct->ext; + + for (i = 0; i < NF_CT_EXT_NUM; i++) { + if (!__nf_ct_ext_exist(ext, i)) + continue; + + rcu_read_lock(); + t = rcu_dereference(nf_ct_ext_types[i]); + + /* Here the nf_ct_ext_type might have been unregisterd. + * I.e., it has responsible to cleanup private + * area in all conntracks when it is unregisterd. + */ + if (t && t->destroy) + t->destroy(ct); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(__nf_ct_ext_destroy); + +static void * +nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, gfp_t gfp) +{ + unsigned int off, len; + struct nf_ct_ext_type *t; + size_t alloc_size; + + rcu_read_lock(); + t = rcu_dereference(nf_ct_ext_types[id]); + BUG_ON(t == NULL); + off = ALIGN(sizeof(struct nf_ct_ext), t->align); + len = off + t->len; + alloc_size = t->alloc_size; + rcu_read_unlock(); + + *ext = kzalloc(alloc_size, gfp); + if (!*ext) + return NULL; + + (*ext)->offset[id] = off; + (*ext)->len = len; + + return (void *)(*ext) + off; +} + +void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) +{ + struct nf_ct_ext *old, *new; + int i, newlen, newoff; + struct nf_ct_ext_type *t; + + /* Conntrack must not be confirmed to avoid races on reallocation. */ + NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); + + old = ct->ext; + if (!old) + return nf_ct_ext_create(&ct->ext, id, gfp); + + if (__nf_ct_ext_exist(old, id)) + return NULL; + + rcu_read_lock(); + t = rcu_dereference(nf_ct_ext_types[id]); + BUG_ON(t == NULL); + + newoff = ALIGN(old->len, t->align); + newlen = newoff + t->len; + rcu_read_unlock(); + + new = __krealloc(old, newlen, gfp); + if (!new) + return NULL; + + if (new != old) { + for (i = 0; i < NF_CT_EXT_NUM; i++) { + if (!__nf_ct_ext_exist(old, i)) + continue; + + rcu_read_lock(); + t = rcu_dereference(nf_ct_ext_types[i]); + if (t && t->move) + t->move((void *)new + new->offset[i], + (void *)old + old->offset[i]); + rcu_read_unlock(); + } + kfree_rcu(old, rcu); + ct->ext = new; + } + + new->offset[id] = newoff; + new->len = newlen; + memset((void *)new + newoff, 0, newlen - newoff); + return (void *)new + newoff; +} +EXPORT_SYMBOL(__nf_ct_ext_add); + +static void update_alloc_size(struct nf_ct_ext_type *type) +{ + int i, j; + struct nf_ct_ext_type *t1, *t2; + enum nf_ct_ext_id min = 0, max = NF_CT_EXT_NUM - 1; + + /* unnecessary to update all types */ + if ((type->flags & NF_CT_EXT_F_PREALLOC) == 0) { + min = type->id; + max = type->id; + } + + /* This assumes that extended areas in conntrack for the types + whose NF_CT_EXT_F_PREALLOC bit set are allocated in order */ + for (i = min; i <= max; i++) { + t1 = rcu_dereference_protected(nf_ct_ext_types[i], + lockdep_is_held(&nf_ct_ext_type_mutex)); + if (!t1) + continue; + + t1->alloc_size = ALIGN(sizeof(struct nf_ct_ext), t1->align) + + t1->len; + for (j = 0; j < NF_CT_EXT_NUM; j++) { + t2 = rcu_dereference_protected(nf_ct_ext_types[j], + lockdep_is_held(&nf_ct_ext_type_mutex)); + if (t2 == NULL || t2 == t1 || + (t2->flags & NF_CT_EXT_F_PREALLOC) == 0) + continue; + + t1->alloc_size = 
ALIGN(t1->alloc_size, t2->align) + + t2->len; + } + } +} + +/* This MUST be called in process context. */ +int nf_ct_extend_register(struct nf_ct_ext_type *type) +{ + int ret = 0; + + mutex_lock(&nf_ct_ext_type_mutex); + if (nf_ct_ext_types[type->id]) { + ret = -EBUSY; + goto out; + } + + /* This ensures that nf_ct_ext_create() can allocate enough area + before updating alloc_size */ + type->alloc_size = ALIGN(sizeof(struct nf_ct_ext), type->align) + + type->len; + rcu_assign_pointer(nf_ct_ext_types[type->id], type); + update_alloc_size(type); +out: + mutex_unlock(&nf_ct_ext_type_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_extend_register); + +/* This MUST be called in process context. */ +void nf_ct_extend_unregister(struct nf_ct_ext_type *type) +{ + mutex_lock(&nf_ct_ext_type_mutex); + rcu_assign_pointer(nf_ct_ext_types[type->id], NULL); + update_alloc_size(type); + mutex_unlock(&nf_ct_ext_type_mutex); + rcu_barrier(); /* Wait for completion of call_rcu()'s */ +} +EXPORT_SYMBOL_GPL(nf_ct_extend_unregister); diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c new file mode 100644 index 00000000..6f5801ea --- /dev/null +++ b/net/netfilter/nf_conntrack_ftp.c @@ -0,0 +1,589 @@ +/* FTP extension for connection tracking. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * (C) 2003,2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rusty Russell "); +MODULE_DESCRIPTION("ftp connection tracking helper"); +MODULE_ALIAS("ip_conntrack_ftp"); +MODULE_ALIAS_NFCT_HELPER("ftp"); + +/* This is slow, but it's simple. 
--RR */ +static char *ftp_buffer; + +static DEFINE_SPINLOCK(nf_ftp_lock); + +#define MAX_PORTS 8 +static u_int16_t ports[MAX_PORTS]; +static unsigned int ports_c; +module_param_array(ports, ushort, &ports_c, 0400); + +static int loose; +module_param(loose, bool, 0600); + +unsigned int (*nf_nat_ftp_hook)(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + enum nf_ct_ftp_type type, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp); +EXPORT_SYMBOL_GPL(nf_nat_ftp_hook); + +static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, char); +static int try_eprt(const char *, size_t, struct nf_conntrack_man *, char); +static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *, + char); + +static struct ftp_search { + const char *pattern; + size_t plen; + char skip; + char term; + enum nf_ct_ftp_type ftptype; + int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char); +} search[IP_CT_DIR_MAX][2] = { + [IP_CT_DIR_ORIGINAL] = { + { + .pattern = "PORT", + .plen = sizeof("PORT") - 1, + .skip = ' ', + .term = '\r', + .ftptype = NF_CT_FTP_PORT, + .getnum = try_rfc959, + }, + { + .pattern = "EPRT", + .plen = sizeof("EPRT") - 1, + .skip = ' ', + .term = '\r', + .ftptype = NF_CT_FTP_EPRT, + .getnum = try_eprt, + }, + }, + [IP_CT_DIR_REPLY] = { + { + .pattern = "227 ", + .plen = sizeof("227 ") - 1, + .skip = '(', + .term = ')', + .ftptype = NF_CT_FTP_PASV, + .getnum = try_rfc959, + }, + { + .pattern = "229 ", + .plen = sizeof("229 ") - 1, + .skip = '(', + .term = ')', + .ftptype = NF_CT_FTP_EPSV, + .getnum = try_epsv_response, + }, + }, +}; + +static int +get_ipv6_addr(const char *src, size_t dlen, struct in6_addr *dst, u_int8_t term) +{ + const char *end; + int ret = in6_pton(src, min_t(size_t, dlen, 0xffff), (u8 *)dst, term, &end); + if (ret > 0) + return (int)(end - src); + return 0; +} + +static int try_number(const char *data, size_t dlen, u_int32_t array[], + int array_size, char sep, char term) +{ + u_int32_t i, len; + + memset(array, 0, sizeof(array[0])*array_size); + + /* Keep data pointing at next char. */ + for (i = 0, len = 0; len < dlen && i < array_size; len++, data++) { + if (*data >= '0' && *data <= '9') { + array[i] = array[i]*10 + *data - '0'; + } + else if (*data == sep) + i++; + else { + /* Unexpected character; true if it's the + terminator and we're finished. */ + if (*data == term && i == array_size - 1) + return len; + + pr_debug("Char %u (got %u nums) `%u' unexpected\n", + len, i, *data); + return 0; + } + } + pr_debug("Failed to fill %u numbers separated by %c\n", + array_size, sep); + return 0; +} + +/* Returns 0, or length of numbers: 192,168,1,1,5,6 */ +static int try_rfc959(const char *data, size_t dlen, + struct nf_conntrack_man *cmd, char term) +{ + int length; + u_int32_t array[6]; + + length = try_number(data, dlen, array, 6, ',', term); + if (length == 0) + return 0; + + cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) | + (array[2] << 8) | array[3]); + cmd->u.tcp.port = htons((array[4] << 8) | array[5]); + return length; +} + +/* Grab port: number up to delimiter */ +static int get_port(const char *data, int start, size_t dlen, char delim, + __be16 *port) +{ + u_int16_t tmp_port = 0; + int i; + + for (i = start; i < dlen; i++) { + /* Finished? 
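The delimiter ends the number; a port of zero at that point is treated as invalid and 0 is returned.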
*/ + if (data[i] == delim) { + if (tmp_port == 0) + break; + *port = htons(tmp_port); + pr_debug("get_port: return %d\n", tmp_port); + return i + 1; + } + else if (data[i] >= '0' && data[i] <= '9') + tmp_port = tmp_port*10 + data[i] - '0'; + else { /* Some other crap */ + pr_debug("get_port: invalid char.\n"); + break; + } + } + return 0; +} + +/* Returns 0, or length of numbers: |1|132.235.1.2|6275| or |2|3ffe::1|6275| */ +static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd, + char term) +{ + char delim; + int length; + + /* First character is delimiter, then "1" for IPv4 or "2" for IPv6, + then delimiter again. */ + if (dlen <= 3) { + pr_debug("EPRT: too short\n"); + return 0; + } + delim = data[0]; + if (isdigit(delim) || delim < 33 || delim > 126 || data[2] != delim) { + pr_debug("try_eprt: invalid delimitter.\n"); + return 0; + } + + if ((cmd->l3num == PF_INET && data[1] != '1') || + (cmd->l3num == PF_INET6 && data[1] != '2')) { + pr_debug("EPRT: invalid protocol number.\n"); + return 0; + } + + pr_debug("EPRT: Got %c%c%c\n", delim, data[1], delim); + + if (data[1] == '1') { + u_int32_t array[4]; + + /* Now we have IP address. */ + length = try_number(data + 3, dlen - 3, array, 4, '.', delim); + if (length != 0) + cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) + | (array[2] << 8) | array[3]); + } else { + /* Now we have IPv6 address. */ + length = get_ipv6_addr(data + 3, dlen - 3, + (struct in6_addr *)cmd->u3.ip6, delim); + } + + if (length == 0) + return 0; + pr_debug("EPRT: Got IP address!\n"); + /* Start offset includes initial "|1|", and trailing delimiter */ + return get_port(data, 3 + length + 1, dlen, delim, &cmd->u.tcp.port); +} + +/* Returns 0, or length of numbers: |||6446| */ +static int try_epsv_response(const char *data, size_t dlen, + struct nf_conntrack_man *cmd, char term) +{ + char delim; + + /* Three delimiters. */ + if (dlen <= 3) return 0; + delim = data[0]; + if (isdigit(delim) || delim < 33 || delim > 126 || + data[1] != delim || data[2] != delim) + return 0; + + return get_port(data, 3, dlen, delim, &cmd->u.tcp.port); +} + +/* Return 1 for match, 0 for accept, -1 for partial. */ +static int find_pattern(const char *data, size_t dlen, + const char *pattern, size_t plen, + char skip, char term, + unsigned int *numoff, + unsigned int *numlen, + struct nf_conntrack_man *cmd, + int (*getnum)(const char *, size_t, + struct nf_conntrack_man *, char)) +{ + size_t i; + + pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen); + if (dlen == 0) + return 0; + + if (dlen <= plen) { + /* Short packet: try for partial? */ + if (strnicmp(data, pattern, dlen) == 0) + return -1; + else return 0; + } + + if (strnicmp(data, pattern, plen) != 0) { +#if 0 + size_t i; + + pr_debug("ftp: string mismatch\n"); + for (i = 0; i < plen; i++) { + pr_debug("ftp:char %u `%c'(%u) vs `%c'(%u)\n", + i, data[i], data[i], + pattern[i], pattern[i]); + } +#endif + return 0; + } + + pr_debug("Pattern matches!\n"); + /* Now we've found the constant string, try to skip + to the 'skip' character */ + for (i = plen; data[i] != skip; i++) + if (i == dlen - 1) return -1; + + /* Skip over the last character */ + i++; + + pr_debug("Skipped up to `%c'!\n", skip); + + *numoff = i; + *numlen = getnum(data + i, dlen - i, cmd, term); + if (!*numlen) + return -1; + + pr_debug("Match succeeded!\n"); + return 1; +} + +/* Look up to see if we're just after a \n. 
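FTP commands are only parsed when the data starts right after a previously recorded newline, i.e. at the beginning of a line; help() skips the packet otherwise.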
*/ +static int find_nl_seq(u32 seq, const struct nf_ct_ftp_master *info, int dir) +{ + unsigned int i; + + for (i = 0; i < info->seq_aft_nl_num[dir]; i++) + if (info->seq_aft_nl[dir][i] == seq) + return 1; + return 0; +} + +/* We don't update if it's older than what we have. */ +static void update_nl_seq(struct nf_conn *ct, u32 nl_seq, + struct nf_ct_ftp_master *info, int dir, + struct sk_buff *skb) +{ + unsigned int i, oldest; + + /* Look for oldest: if we find exact match, we're done. */ + for (i = 0; i < info->seq_aft_nl_num[dir]; i++) { + if (info->seq_aft_nl[dir][i] == nl_seq) + return; + } + + if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) { + info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; + } else { + if (before(info->seq_aft_nl[dir][0], info->seq_aft_nl[dir][1])) + oldest = 0; + else + oldest = 1; + + if (after(nl_seq, info->seq_aft_nl[dir][oldest])) + info->seq_aft_nl[dir][oldest] = nl_seq; + } +} + +static int help(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + unsigned int dataoff, datalen; + const struct tcphdr *th; + struct tcphdr _tcph; + const char *fb_ptr; + int ret; + u32 seq; + int dir = CTINFO2DIR(ctinfo); + unsigned int uninitialized_var(matchlen), uninitialized_var(matchoff); + struct nf_ct_ftp_master *ct_ftp_info = &nfct_help(ct)->help.ct_ftp_info; + struct nf_conntrack_expect *exp; + union nf_inet_addr *daddr; + struct nf_conntrack_man cmd = {}; + unsigned int i; + int found = 0, ends_in_nl; + typeof(nf_nat_ftp_hook) nf_nat_ftp; + + /* Until there's been traffic both ways, don't look in packets. */ + if (ctinfo != IP_CT_ESTABLISHED && + ctinfo != IP_CT_ESTABLISHED_REPLY) { + pr_debug("ftp: Conntrackinfo = %u\n", ctinfo); + return NF_ACCEPT; + } + + th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); + if (th == NULL) + return NF_ACCEPT; + + dataoff = protoff + th->doff * 4; + /* No data? */ + if (dataoff >= skb->len) { + pr_debug("ftp: dataoff(%u) >= skblen(%u)\n", dataoff, + skb->len); + return NF_ACCEPT; + } + datalen = skb->len - dataoff; + + spin_lock_bh(&nf_ftp_lock); + fb_ptr = skb_header_pointer(skb, dataoff, datalen, ftp_buffer); + BUG_ON(fb_ptr == NULL); + + ends_in_nl = (fb_ptr[datalen - 1] == '\n'); + seq = ntohl(th->seq) + datalen; + + /* Look up to see if we're just after a \n. */ + if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) { + /* Now if this ends in \n, update ftp info. */ + pr_debug("nf_conntrack_ftp: wrong seq pos %s(%u) or %s(%u)\n", + ct_ftp_info->seq_aft_nl_num[dir] > 0 ? "" : "(UNSET)", + ct_ftp_info->seq_aft_nl[dir][0], + ct_ftp_info->seq_aft_nl_num[dir] > 1 ? "" : "(UNSET)", + ct_ftp_info->seq_aft_nl[dir][1]); + ret = NF_ACCEPT; + goto out_update_nl; + } + + /* Initialize IP/IPv6 addr to expected address (it's not mentioned + in EPSV responses) */ + cmd.l3num = nf_ct_l3num(ct); + memcpy(cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all, + sizeof(cmd.u3.all)); + + for (i = 0; i < ARRAY_SIZE(search[dir]); i++) { + found = find_pattern(fb_ptr, datalen, + search[dir][i].pattern, + search[dir][i].plen, + search[dir][i].skip, + search[dir][i].term, + &matchoff, &matchlen, + &cmd, + search[dir][i].getnum); + if (found) break; + } + if (found == -1) { + /* We don't usually drop packets. After all, this is + connection tracking, not packet filtering. + However, it is necessary for accurate tracking in + this case. 
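A partial match means the command is split across segments; dropping the packet makes the peer retransmit it, hopefully in one piece.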
*/ + pr_debug("conntrack_ftp: partial %s %u+%u\n", + search[dir][i].pattern, ntohl(th->seq), datalen); + ret = NF_DROP; + goto out; + } else if (found == 0) { /* No match */ + ret = NF_ACCEPT; + goto out_update_nl; + } + + pr_debug("conntrack_ftp: match `%.*s' (%u bytes at %u)\n", + matchlen, fb_ptr + matchoff, + matchlen, ntohl(th->seq) + matchoff); + + exp = nf_ct_expect_alloc(ct); + if (exp == NULL) { + ret = NF_DROP; + goto out; + } + + /* We refer to the reverse direction ("!dir") tuples here, + * because we're expecting something in the other direction. + * Doesn't matter unless NAT is happening. */ + daddr = &ct->tuplehash[!dir].tuple.dst.u3; + + /* Update the ftp info */ + if ((cmd.l3num == nf_ct_l3num(ct)) && + memcmp(&cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all, + sizeof(cmd.u3.all))) { + /* Enrico Scholz's passive FTP to partially RNAT'd ftp + server: it really wants us to connect to a + different IP address. Simply don't record it for + NAT. */ + if (cmd.l3num == PF_INET) { + pr_debug("conntrack_ftp: NOT RECORDING: %pI4 != %pI4\n", + &cmd.u3.ip, + &ct->tuplehash[dir].tuple.src.u3.ip); + } else { + pr_debug("conntrack_ftp: NOT RECORDING: %pI6 != %pI6\n", + cmd.u3.ip6, + ct->tuplehash[dir].tuple.src.u3.ip6); + } + + /* Thanks to Cristiano Lincoln Mattos + for reporting this potential + problem (DMZ machines opening holes to internal + networks, or the packet filter itself). */ + if (!loose) { + ret = NF_ACCEPT; + goto out_put_expect; + } + daddr = &cmd.u3; + } + + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, cmd.l3num, + &ct->tuplehash[!dir].tuple.src.u3, daddr, + IPPROTO_TCP, NULL, &cmd.u.tcp.port); + + /* Now, NAT might want to mangle the packet, and register the + * (possibly changed) expectation itself. */ + nf_nat_ftp = rcu_dereference(nf_nat_ftp_hook); + if (nf_nat_ftp && ct->status & IPS_NAT_MASK) + ret = nf_nat_ftp(skb, ctinfo, search[dir][i].ftptype, + matchoff, matchlen, exp); + else { + /* Can't expect this? Best to drop packet now. */ + if (nf_ct_expect_related(exp) != 0) + ret = NF_DROP; + else + ret = NF_ACCEPT; + } + +out_put_expect: + nf_ct_expect_put(exp); + +out_update_nl: + /* Now if this ends in \n, update ftp info. Seq may have been + * adjusted by NAT code. */ + if (ends_in_nl) + update_nl_seq(ct, seq, ct_ftp_info, dir, skb); + out: + spin_unlock_bh(&nf_ftp_lock); + return ret; +} + +static struct nf_conntrack_helper ftp[MAX_PORTS][2] __read_mostly; +static char ftp_names[MAX_PORTS][2][sizeof("ftp-65535")] __read_mostly; + +static const struct nf_conntrack_expect_policy ftp_exp_policy = { + .max_expected = 1, + .timeout = 5 * 60, +}; + +/* don't make this __exit, since it's called from __init ! 
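(nf_conntrack_ftp_init() calls it on its error path)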
*/ +static void nf_conntrack_ftp_fini(void) +{ + int i, j; + for (i = 0; i < ports_c; i++) { + for (j = 0; j < 2; j++) { + if (ftp[i][j].me == NULL) + continue; + + pr_debug("nf_ct_ftp: unregistering helper for pf: %d " + "port: %d\n", + ftp[i][j].tuple.src.l3num, ports[i]); + nf_conntrack_helper_unregister(&ftp[i][j]); + } + } + + kfree(ftp_buffer); +} + +static int __init nf_conntrack_ftp_init(void) +{ + int i, j = -1, ret = 0; + char *tmpname; + + ftp_buffer = kmalloc(65536, GFP_KERNEL); + if (!ftp_buffer) + return -ENOMEM; + + if (ports_c == 0) + ports[ports_c++] = FTP_PORT; + + /* FIXME should be configurable whether IPv4 and IPv6 FTP connections + are tracked or not - YK */ + for (i = 0; i < ports_c; i++) { + ftp[i][0].tuple.src.l3num = PF_INET; + ftp[i][1].tuple.src.l3num = PF_INET6; + for (j = 0; j < 2; j++) { + ftp[i][j].tuple.src.u.tcp.port = htons(ports[i]); + ftp[i][j].tuple.dst.protonum = IPPROTO_TCP; + ftp[i][j].expect_policy = &ftp_exp_policy; + ftp[i][j].me = THIS_MODULE; + ftp[i][j].help = help; + tmpname = &ftp_names[i][j][0]; + if (ports[i] == FTP_PORT) + sprintf(tmpname, "ftp"); + else + sprintf(tmpname, "ftp-%d", ports[i]); + ftp[i][j].name = tmpname; + + pr_debug("nf_ct_ftp: registering helper for pf: %d " + "port: %d\n", + ftp[i][j].tuple.src.l3num, ports[i]); + ret = nf_conntrack_helper_register(&ftp[i][j]); + if (ret) { + printk(KERN_ERR "nf_ct_ftp: failed to register" + " helper for pf: %d port: %d\n", + ftp[i][j].tuple.src.l3num, ports[i]); + nf_conntrack_ftp_fini(); + return ret; + } + } + } + + return 0; +} + +module_init(nf_conntrack_ftp_init); +module_exit(nf_conntrack_ftp_fini); diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c new file mode 100644 index 00000000..bcd5ed6b --- /dev/null +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -0,0 +1,888 @@ +/**************************************************************************** + * ip_conntrack_helper_h323_asn1.c - BER and PER decoding library for H.323 + * conntrack/NAT module. + * + * Copyright (c) 2006 by Jing Min Zhao + * + * This source code is licensed under General Public License version 2. + * + * See ip_conntrack_helper_h323_asn1.h for details. + * + ****************************************************************************/ + +#ifdef __KERNEL__ +#include +#else +#include +#endif +#include + +/* Trace Flag */ +#ifndef H323_TRACE +#define H323_TRACE 0 +#endif + +#if H323_TRACE +#define TAB_SIZE 4 +#define IFTHEN(cond, act) if(cond){act;} +#ifdef __KERNEL__ +#define PRINT printk +#else +#define PRINT printf +#endif +#define FNAME(name) name, +#else +#define IFTHEN(cond, act) +#define PRINT(fmt, args...) 
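+/* with H323_TRACE disabled the trace helpers expand to nothing */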
+#define FNAME(name) +#endif + +/* ASN.1 Types */ +#define NUL 0 +#define BOOL 1 +#define OID 2 +#define INT 3 +#define ENUM 4 +#define BITSTR 5 +#define NUMSTR 6 +#define NUMDGT 6 +#define TBCDSTR 6 +#define OCTSTR 7 +#define PRTSTR 7 +#define IA5STR 7 +#define GENSTR 7 +#define BMPSTR 8 +#define SEQ 9 +#define SET 9 +#define SEQOF 10 +#define SETOF 10 +#define CHOICE 11 + +/* Constraint Types */ +#define FIXD 0 +/* #define BITS 1-8 */ +#define BYTE 9 +#define WORD 10 +#define CONS 11 +#define SEMI 12 +#define UNCO 13 + +/* ASN.1 Type Attributes */ +#define SKIP 0 +#define STOP 1 +#define DECODE 2 +#define EXT 4 +#define OPEN 8 +#define OPT 16 + + +/* ASN.1 Field Structure */ +typedef struct field_t { +#if H323_TRACE + char *name; +#endif + unsigned char type; + unsigned char sz; + unsigned char lb; + unsigned char ub; + unsigned short attr; + unsigned short offset; + const struct field_t *fields; +} field_t; + +/* Bit Stream */ +typedef struct { + unsigned char *buf; + unsigned char *beg; + unsigned char *end; + unsigned char *cur; + unsigned int bit; +} bitstr_t; + +/* Tool Functions */ +#define INC_BIT(bs) if((++(bs)->bit)>7){(bs)->cur++;(bs)->bit=0;} +#define INC_BITS(bs,b) if(((bs)->bit+=(b))>7){(bs)->cur+=(bs)->bit>>3;(bs)->bit&=7;} +#define BYTE_ALIGN(bs) if((bs)->bit){(bs)->cur++;(bs)->bit=0;} +#define CHECK_BOUND(bs,n) if((bs)->cur+(n)>(bs)->end)return(H323_ERROR_BOUND) +static unsigned int get_len(bitstr_t *bs); +static unsigned int get_bit(bitstr_t *bs); +static unsigned int get_bits(bitstr_t *bs, unsigned int b); +static unsigned int get_bitmap(bitstr_t *bs, unsigned int b); +static unsigned int get_uint(bitstr_t *bs, int b); + +/* Decoder Functions */ +static int decode_nul(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_bool(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_oid(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_int(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_enum(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_bitstr(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_numstr(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_octstr(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_bmpstr(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_seq(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_seqof(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_choice(bitstr_t *bs, const struct field_t *f, char *base, int level); + +/* Decoder Functions Vector */ +typedef int (*decoder_t)(bitstr_t *, const struct field_t *, char *, int); +static const decoder_t Decoders[] = { + decode_nul, + decode_bool, + decode_oid, + decode_int, + decode_enum, + decode_bitstr, + decode_numstr, + decode_octstr, + decode_bmpstr, + decode_seq, + decode_seqof, + decode_choice, +}; + +/**************************************************************************** + * H.323 Types + ****************************************************************************/ +#include "nf_conntrack_h323_types.c" + +/**************************************************************************** + * Functions + ****************************************************************************/ +/* Assume bs is aligned && v < 16384 */ +static unsigned int get_len(bitstr_t 
*bs) +{ + unsigned int v; + + v = *bs->cur++; + + if (v & 0x80) { + v &= 0x3f; + v <<= 8; + v += *bs->cur++; + } + + return v; +} + +/****************************************************************************/ +static unsigned int get_bit(bitstr_t *bs) +{ + unsigned int b = (*bs->cur) & (0x80 >> bs->bit); + + INC_BIT(bs); + + return b; +} + +/****************************************************************************/ +/* Assume b <= 8 */ +static unsigned int get_bits(bitstr_t *bs, unsigned int b) +{ + unsigned int v, l; + + v = (*bs->cur) & (0xffU >> bs->bit); + l = b + bs->bit; + + if (l < 8) { + v >>= 8 - l; + bs->bit = l; + } else if (l == 8) { + bs->cur++; + bs->bit = 0; + } else { /* l > 8 */ + + v <<= 8; + v += *(++bs->cur); + v >>= 16 - l; + bs->bit = l - 8; + } + + return v; +} + +/****************************************************************************/ +/* Assume b <= 32 */ +static unsigned int get_bitmap(bitstr_t *bs, unsigned int b) +{ + unsigned int v, l, shift, bytes; + + if (!b) + return 0; + + l = bs->bit + b; + + if (l < 8) { + v = (unsigned int)(*bs->cur) << (bs->bit + 24); + bs->bit = l; + } else if (l == 8) { + v = (unsigned int)(*bs->cur++) << (bs->bit + 24); + bs->bit = 0; + } else { + for (bytes = l >> 3, shift = 24, v = 0; bytes; + bytes--, shift -= 8) + v |= (unsigned int)(*bs->cur++) << shift; + + if (l < 32) { + v |= (unsigned int)(*bs->cur) << shift; + v <<= bs->bit; + } else if (l > 32) { + v <<= bs->bit; + v |= (*bs->cur) >> (8 - bs->bit); + } + + bs->bit = l & 0x7; + } + + v &= 0xffffffff << (32 - b); + + return v; +} + +/**************************************************************************** + * Assume bs is aligned and sizeof(unsigned int) == 4 + ****************************************************************************/ +static unsigned int get_uint(bitstr_t *bs, int b) +{ + unsigned int v = 0; + + switch (b) { + case 4: + v |= *bs->cur++; + v <<= 8; + case 3: + v |= *bs->cur++; + v <<= 8; + case 2: + v |= *bs->cur++; + v <<= 8; + case 1: + v |= *bs->cur++; + break; + } + return v; +} + +/****************************************************************************/ +static int decode_nul(bitstr_t *bs, const struct field_t *f, + char *base, int level) +{ + PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); + + return H323_ERROR_NONE; +} + +/****************************************************************************/ +static int decode_bool(bitstr_t *bs, const struct field_t *f, + char *base, int level) +{ + PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); + + INC_BIT(bs); + + CHECK_BOUND(bs, 0); + return H323_ERROR_NONE; +} + +/****************************************************************************/ +static int decode_oid(bitstr_t *bs, const struct field_t *f, + char *base, int level) +{ + int len; + + PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); + + BYTE_ALIGN(bs); + CHECK_BOUND(bs, 1); + len = *bs->cur++; + bs->cur += len; + + CHECK_BOUND(bs, 0); + return H323_ERROR_NONE; +} + +/****************************************************************************/ +static int decode_int(bitstr_t *bs, const struct field_t *f, + char *base, int level) +{ + unsigned int len; + + PRINT("%*.s%s", level * TAB_SIZE, " ", f->name); + + switch (f->sz) { + case BYTE: /* Range == 256 */ + BYTE_ALIGN(bs); + bs->cur++; + break; + case WORD: /* 257 <= Range <= 64K */ + BYTE_ALIGN(bs); + bs->cur += 2; + break; + case CONS: /* 64K < Range < 4G */ + len = get_bits(bs, 2) + 1; + BYTE_ALIGN(bs); + if (base && (f->attr & DECODE)) { /* timeToLive */ + 
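/* PER transmits a constrained integer as (value - lower bound), so + * f->lb is added back after reading the raw bytes; e.g. if f->lb were 1, + * bytes 00 00 01 2b would decode to 299 + 1 = 300. */ +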
unsigned int v = get_uint(bs, len) + f->lb; + PRINT(" = %u", v); + *((unsigned int *)(base + f->offset)) = v; + } + bs->cur += len; + break; + case UNCO: + BYTE_ALIGN(bs); + CHECK_BOUND(bs, 2); + len = get_len(bs); + bs->cur += len; + break; + default: /* 2 <= Range <= 255 */ + INC_BITS(bs, f->sz); + break; + } + + PRINT("\n"); + + CHECK_BOUND(bs, 0); + return H323_ERROR_NONE; +} + +/****************************************************************************/ +static int decode_enum(bitstr_t *bs, const struct field_t *f, + char *base, int level) +{ + PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); + + if ((f->attr & EXT) && get_bit(bs)) { + INC_BITS(bs, 7); + } else { + INC_BITS(bs, f->sz); + } + + CHECK_BOUND(bs, 0); + return H323_ERROR_NONE; +} + +/****************************************************************************/ +static int decode_bitstr(bitstr_t *bs, const struct field_t *f, + char *base, int level) +{ + unsigned int len; + + PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); + + BYTE_ALIGN(bs); + switch (f->sz) { + case FIXD: /* fixed length > 16 */ + len = f->lb; + break; + case WORD: /* 2-byte length */ + CHECK_BOUND(bs, 2); + len = (*bs->cur++) << 8; + len += (*bs->cur++) + f->lb; + break; + case SEMI: + CHECK_BOUND(bs, 2); + len = get_len(bs); + break; + default: + len = 0; + break; + } + + bs->cur += len >> 3; + bs->bit = len & 7; + + CHECK_BOUND(bs, 0); + return H323_ERROR_NONE; +} + +/****************************************************************************/ +static int decode_numstr(bitstr_t *bs, const struct field_t *f, + char *base, int level) +{ + unsigned int len; + + PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); + + /* 2 <= Range <= 255 */ + len = get_bits(bs, f->sz) + f->lb; + + BYTE_ALIGN(bs); + INC_BITS(bs, (len << 2)); + + CHECK_BOUND(bs, 0); + return H323_ERROR_NONE; +} + +/****************************************************************************/ +static int decode_octstr(bitstr_t *bs, const struct field_t *f, + char *base, int level) +{ + unsigned int len; + + PRINT("%*.s%s", level * TAB_SIZE, " ", f->name); + + switch (f->sz) { + case FIXD: /* Range == 1 */ + if (f->lb > 2) { + BYTE_ALIGN(bs); + if (base && (f->attr & DECODE)) { + /* The IP Address */ + IFTHEN(f->lb == 4, + PRINT(" = %d.%d.%d.%d:%d", + bs->cur[0], bs->cur[1], + bs->cur[2], bs->cur[3], + bs->cur[4] * 256 + bs->cur[5])); + *((unsigned int *)(base + f->offset)) = + bs->cur - bs->buf; + } + } + len = f->lb; + break; + case BYTE: /* Range == 256 */ + BYTE_ALIGN(bs); + CHECK_BOUND(bs, 1); + len = (*bs->cur++) + f->lb; + break; + case SEMI: + BYTE_ALIGN(bs); + CHECK_BOUND(bs, 2); + len = get_len(bs) + f->lb; + break; + default: /* 2 <= Range <= 255 */ + len = get_bits(bs, f->sz) + f->lb; + BYTE_ALIGN(bs); + break; + } + + bs->cur += len; + + PRINT("\n"); + + CHECK_BOUND(bs, 0); + return H323_ERROR_NONE; +} + +/****************************************************************************/ +static int decode_bmpstr(bitstr_t *bs, const struct field_t *f, + char *base, int level) +{ + unsigned int len; + + PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); + + switch (f->sz) { + case BYTE: /* Range == 256 */ + BYTE_ALIGN(bs); + CHECK_BOUND(bs, 1); + len = (*bs->cur++) + f->lb; + break; + default: /* 2 <= Range <= 255 */ + len = get_bits(bs, f->sz) + f->lb; + BYTE_ALIGN(bs); + break; + } + + bs->cur += len << 1; + + CHECK_BOUND(bs, 0); + return H323_ERROR_NONE; +} + +/****************************************************************************/ +static int decode_seq(bitstr_t *bs, 
const struct field_t *f, + char *base, int level) +{ + unsigned int ext, bmp, i, opt, len = 0, bmp2, bmp2_len; + int err; + const struct field_t *son; + unsigned char *beg = NULL; + + PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); + + /* Decode? */ + base = (base && (f->attr & DECODE)) ? base + f->offset : NULL; + + /* Extensible? */ + ext = (f->attr & EXT) ? get_bit(bs) : 0; + + /* Get fields bitmap */ + bmp = get_bitmap(bs, f->sz); + if (base) + *(unsigned int *)base = bmp; + + /* Decode the root components */ + for (i = opt = 0, son = f->fields; i < f->lb; i++, son++) { + if (son->attr & STOP) { + PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", + son->name); + return H323_ERROR_STOP; + } + + if (son->attr & OPT) { /* Optional component */ + if (!((0x80000000U >> (opt++)) & bmp)) /* Not exist */ + continue; + } + + /* Decode */ + if (son->attr & OPEN) { /* Open field */ + CHECK_BOUND(bs, 2); + len = get_len(bs); + CHECK_BOUND(bs, len); + if (!base || !(son->attr & DECODE)) { + PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, + " ", son->name); + bs->cur += len; + continue; + } + beg = bs->cur; + + /* Decode */ + if ((err = (Decoders[son->type]) (bs, son, base, + level + 1)) < + H323_ERROR_NONE) + return err; + + bs->cur = beg + len; + bs->bit = 0; + } else if ((err = (Decoders[son->type]) (bs, son, base, + level + 1)) < + H323_ERROR_NONE) + return err; + } + + /* No extension? */ + if (!ext) + return H323_ERROR_NONE; + + /* Get the extension bitmap */ + bmp2_len = get_bits(bs, 7) + 1; + CHECK_BOUND(bs, (bmp2_len + 7) >> 3); + bmp2 = get_bitmap(bs, bmp2_len); + bmp |= bmp2 >> f->sz; + if (base) + *(unsigned int *)base = bmp; + BYTE_ALIGN(bs); + + /* Decode the extension components */ + for (opt = 0; opt < bmp2_len; opt++, i++, son++) { + /* Check Range */ + if (i >= f->ub) { /* Newer Version? */ + CHECK_BOUND(bs, 2); + len = get_len(bs); + CHECK_BOUND(bs, len); + bs->cur += len; + continue; + } + + if (son->attr & STOP) { + PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", + son->name); + return H323_ERROR_STOP; + } + + if (!((0x80000000 >> opt) & bmp2)) /* Not present */ + continue; + + CHECK_BOUND(bs, 2); + len = get_len(bs); + CHECK_BOUND(bs, len); + if (!base || !(son->attr & DECODE)) { + PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", + son->name); + bs->cur += len; + continue; + } + beg = bs->cur; + + if ((err = (Decoders[son->type]) (bs, son, base, + level + 1)) < + H323_ERROR_NONE) + return err; + + bs->cur = beg + len; + bs->bit = 0; + } + return H323_ERROR_NONE; +} + +/****************************************************************************/ +static int decode_seqof(bitstr_t *bs, const struct field_t *f, + char *base, int level) +{ + unsigned int count, effective_count = 0, i, len = 0; + int err; + const struct field_t *son; + unsigned char *beg = NULL; + + PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); + + /* Decode? */ + base = (base && (f->attr & DECODE)) ? base + f->offset : NULL; + + /* Decode item count */ + switch (f->sz) { + case BYTE: + BYTE_ALIGN(bs); + CHECK_BOUND(bs, 1); + count = *bs->cur++; + break; + case WORD: + BYTE_ALIGN(bs); + CHECK_BOUND(bs, 2); + count = *bs->cur++; + count <<= 8; + count += *bs->cur++; + break; + case SEMI: + BYTE_ALIGN(bs); + CHECK_BOUND(bs, 2); + count = get_len(bs); + break; + default: + count = get_bits(bs, f->sz); + break; + } + count += f->lb; + + /* Write Count */ + if (base) { + effective_count = count > f->ub ? 
f->ub : count; + *(unsigned int *)base = effective_count; + base += sizeof(unsigned int); + } + + /* Decode nested field */ + son = f->fields; + if (base) + base -= son->offset; + for (i = 0; i < count; i++) { + if (son->attr & OPEN) { + BYTE_ALIGN(bs); + len = get_len(bs); + CHECK_BOUND(bs, len); + if (!base || !(son->attr & DECODE)) { + PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, + " ", son->name); + bs->cur += len; + continue; + } + beg = bs->cur; + + if ((err = (Decoders[son->type]) (bs, son, + i < + effective_count ? + base : NULL, + level + 1)) < + H323_ERROR_NONE) + return err; + + bs->cur = beg + len; + bs->bit = 0; + } else + if ((err = (Decoders[son->type]) (bs, son, + i < + effective_count ? + base : NULL, + level + 1)) < + H323_ERROR_NONE) + return err; + + if (base) + base += son->offset; + } + + return H323_ERROR_NONE; +} + + +/****************************************************************************/ +static int decode_choice(bitstr_t *bs, const struct field_t *f, + char *base, int level) +{ + unsigned int type, ext, len = 0; + int err; + const struct field_t *son; + unsigned char *beg = NULL; + + PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); + + /* Decode? */ + base = (base && (f->attr & DECODE)) ? base + f->offset : NULL; + + /* Decode the choice index number */ + if ((f->attr & EXT) && get_bit(bs)) { + ext = 1; + type = get_bits(bs, 7) + f->lb; + } else { + ext = 0; + type = get_bits(bs, f->sz); + if (type >= f->lb) + return H323_ERROR_RANGE; + } + + /* Write Type */ + if (base) + *(unsigned int *)base = type; + + /* Check Range */ + if (type >= f->ub) { /* Newer version? */ + BYTE_ALIGN(bs); + len = get_len(bs); + CHECK_BOUND(bs, len); + bs->cur += len; + return H323_ERROR_NONE; + } + + /* Transfer to son level */ + son = &f->fields[type]; + if (son->attr & STOP) { + PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", son->name); + return H323_ERROR_STOP; + } + + if (ext || (son->attr & OPEN)) { + BYTE_ALIGN(bs); + len = get_len(bs); + CHECK_BOUND(bs, len); + if (!base || !(son->attr & DECODE)) { + PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", + son->name); + bs->cur += len; + return H323_ERROR_NONE; + } + beg = bs->cur; + + if ((err = (Decoders[son->type]) (bs, son, base, level + 1)) < + H323_ERROR_NONE) + return err; + + bs->cur = beg + len; + bs->bit = 0; + } else if ((err = (Decoders[son->type]) (bs, son, base, level + 1)) < + H323_ERROR_NONE) + return err; + + return H323_ERROR_NONE; +} + +/****************************************************************************/ +int DecodeRasMessage(unsigned char *buf, size_t sz, RasMessage *ras) +{ + static const struct field_t ras_message = { + FNAME("RasMessage") CHOICE, 5, 24, 32, DECODE | EXT, + 0, _RasMessage + }; + bitstr_t bs; + + bs.buf = bs.beg = bs.cur = buf; + bs.end = buf + sz; + bs.bit = 0; + + return decode_choice(&bs, &ras_message, (char *) ras, 0); +} + +/****************************************************************************/ +static int DecodeH323_UserInformation(unsigned char *buf, unsigned char *beg, + size_t sz, H323_UserInformation *uuie) +{ + static const struct field_t h323_userinformation = { + FNAME("H323-UserInformation") SEQ, 1, 2, 2, DECODE | EXT, + 0, _H323_UserInformation + }; + bitstr_t bs; + + bs.buf = buf; + bs.beg = bs.cur = beg; + bs.end = beg + sz; + bs.bit = 0; + + return decode_seq(&bs, &h323_userinformation, (char *) uuie, 0); +} + +/****************************************************************************/ +int DecodeMultimediaSystemControlMessage(unsigned char *buf, size_t 
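/****************************************************************************/
/* For an extensible CHOICE, decode_choice() above reads one extension bit
 * and, within the root, a constrained index of f->sz bits; extension
 * alternatives instead use a 7-bit index followed by a length-prefixed
 * open type.  A user-space sketch of just the root-index part, with a toy
 * MSB-first bit reader (bounds checks omitted) standing in for the
 * kernel's bitstr_t helpers: */
#include <stddef.h>

struct bits {
	const unsigned char *buf;
	size_t bitpos;			/* next bit, counted from the MSB of buf[0] */
};

static unsigned int get_bits(struct bits *b, unsigned int n)
{
	unsigned int v = 0;

	while (n--) {
		size_t byte = b->bitpos >> 3;
		unsigned int bit = 7 - (unsigned int)(b->bitpos & 7);

		v = (v << 1) | ((b->buf[byte] >> bit) & 1);
		b->bitpos++;
	}
	return v;
}

/* Returns the selected root alternative, or -1 for an extension
 * alternative that would have to be handled as an open type. */
int choice_index(struct bits *b, unsigned int index_bits)
{
	if (get_bits(b, 1))		/* extension bit set */
		return -1;
	return (int)get_bits(b, index_bits);
}
/****************************************************************************/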
sz, + MultimediaSystemControlMessage * + mscm) +{ + static const struct field_t multimediasystemcontrolmessage = { + FNAME("MultimediaSystemControlMessage") CHOICE, 2, 4, 4, + DECODE | EXT, 0, _MultimediaSystemControlMessage + }; + bitstr_t bs; + + bs.buf = bs.beg = bs.cur = buf; + bs.end = buf + sz; + bs.bit = 0; + + return decode_choice(&bs, &multimediasystemcontrolmessage, + (char *) mscm, 0); +} + +/****************************************************************************/ +int DecodeQ931(unsigned char *buf, size_t sz, Q931 *q931) +{ + unsigned char *p = buf; + int len; + + if (!p || sz < 1) + return H323_ERROR_BOUND; + + /* Protocol Discriminator */ + if (*p != 0x08) { + PRINT("Unknown Protocol Discriminator\n"); + return H323_ERROR_RANGE; + } + p++; + sz--; + + /* CallReferenceValue */ + if (sz < 1) + return H323_ERROR_BOUND; + len = *p++; + sz--; + if (sz < len) + return H323_ERROR_BOUND; + p += len; + sz -= len; + + /* Message Type */ + if (sz < 1) + return H323_ERROR_BOUND; + q931->MessageType = *p++; + PRINT("MessageType = %02X\n", q931->MessageType); + if (*p & 0x80) { + p++; + sz--; + } + + /* Decode Information Elements */ + while (sz > 0) { + if (*p == 0x7e) { /* UserUserIE */ + if (sz < 3) + break; + p++; + len = *p++ << 8; + len |= *p++; + sz -= 3; + if (sz < len) + break; + p++; + len--; + return DecodeH323_UserInformation(buf, p, len, + &q931->UUIE); + } + p++; + sz--; + if (sz < 1) + break; + len = *p++; + if (sz < len) + break; + p += len; + sz -= len; + } + + PRINT("Q.931 UUIE not found\n"); + + return H323_ERROR_BOUND; +} diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c new file mode 100644 index 00000000..f03c2d45 --- /dev/null +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -0,0 +1,1837 @@ +/* + * H.323 connection tracking helper + * + * Copyright (c) 2006 Jing Min Zhao + * + * This source code is licensed under General Public License version 2. 
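/****************************************************************************/
/* A user-space sketch of the information-element walk performed by
 * DecodeQ931() above: check the protocol discriminator (0x08), skip the
 * call reference and the message type, then scan single-octet IEs until
 * the UserUser IE (0x7e), whose two-byte length introduces the H.323 UUIE
 * payload.  Codeset handling and the other corner cases of real Q.931 are
 * left out; this only shows the shape of the scan. */
#include <stddef.h>

/* Returns the UUIE payload length and sets *uuie, or -1 if not found. */
int find_q931_uuie(const unsigned char *p, size_t sz,
		   const unsigned char **uuie)
{
	size_t len;

	if (sz < 3 || p[0] != 0x08)		/* protocol discriminator */
		return -1;
	len = p[1];				/* call reference length */
	if (sz < 3 + len)
		return -1;
	p += 2 + len;				/* now at the message type */
	sz -= 2 + len;
	p++;					/* skip message type */
	sz--;

	while (sz >= 2) {
		if (p[0] == 0x7e) {		/* UserUser IE */
			if (sz < 3)
				return -1;
			len = ((size_t)p[1] << 8) | p[2];
			if (sz - 3 < len)
				return -1;
			*uuie = p + 3;
			return (int)len;
		}
		len = p[1];			/* other IEs: one-byte length */
		if (sz < 2 + len)
			return -1;
		p += 2 + len;
		sz -= 2 + len;
	}
	return -1;
}
/****************************************************************************/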
+ * + * Based on the 'brute force' H.323 connection tracking module by + * Jozsef Kadlecsik + * + * For more information, please see http://nath323.sourceforge.net/ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Parameters */ +static unsigned int default_rrq_ttl __read_mostly = 300; +module_param(default_rrq_ttl, uint, 0600); +MODULE_PARM_DESC(default_rrq_ttl, "use this TTL if it's missing in RRQ"); + +static int gkrouted_only __read_mostly = 1; +module_param(gkrouted_only, int, 0600); +MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper"); + +static int callforward_filter __read_mostly = 1; +module_param(callforward_filter, bool, 0600); +MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations " + "if both endpoints are on different sides " + "(determined by routing information)"); + +/* Hooks for NAT */ +int (*set_h245_addr_hook) (struct sk_buff *skb, + unsigned char **data, int dataoff, + H245_TransportAddress *taddr, + union nf_inet_addr *addr, __be16 port) + __read_mostly; +int (*set_h225_addr_hook) (struct sk_buff *skb, + unsigned char **data, int dataoff, + TransportAddress *taddr, + union nf_inet_addr *addr, __be16 port) + __read_mostly; +int (*set_sig_addr_hook) (struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, + TransportAddress *taddr, int count) __read_mostly; +int (*set_ras_addr_hook) (struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, + TransportAddress *taddr, int count) __read_mostly; +int (*nat_rtp_rtcp_hook) (struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + H245_TransportAddress *taddr, + __be16 port, __be16 rtp_port, + struct nf_conntrack_expect *rtp_exp, + struct nf_conntrack_expect *rtcp_exp) __read_mostly; +int (*nat_t120_hook) (struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + H245_TransportAddress *taddr, __be16 port, + struct nf_conntrack_expect *exp) __read_mostly; +int (*nat_h245_hook) (struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress *taddr, __be16 port, + struct nf_conntrack_expect *exp) __read_mostly; +int (*nat_callforwarding_hook) (struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress *taddr, __be16 port, + struct nf_conntrack_expect *exp) __read_mostly; +int (*nat_q931_hook) (struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, TransportAddress *taddr, int idx, + __be16 port, struct nf_conntrack_expect *exp) + __read_mostly; + +static DEFINE_SPINLOCK(nf_h323_lock); +static char *h323_buffer; + +static struct nf_conntrack_helper nf_conntrack_helper_h245; +static struct nf_conntrack_helper nf_conntrack_helper_q931[]; +static struct nf_conntrack_helper nf_conntrack_helper_ras[]; + +/****************************************************************************/ +static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned char **data, int *datalen, int *dataoff) +{ + struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + int dir = 
CTINFO2DIR(ctinfo); + const struct tcphdr *th; + struct tcphdr _tcph; + int tcpdatalen; + int tcpdataoff; + unsigned char *tpkt; + int tpktlen; + int tpktoff; + + /* Get TCP header */ + th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); + if (th == NULL) + return 0; + + /* Get TCP data offset */ + tcpdataoff = protoff + th->doff * 4; + + /* Get TCP data length */ + tcpdatalen = skb->len - tcpdataoff; + if (tcpdatalen <= 0) /* No TCP data */ + goto clear_out; + + if (*data == NULL) { /* first TPKT */ + /* Get first TPKT pointer */ + tpkt = skb_header_pointer(skb, tcpdataoff, tcpdatalen, + h323_buffer); + BUG_ON(tpkt == NULL); + + /* Validate TPKT identifier */ + if (tcpdatalen < 4 || tpkt[0] != 0x03 || tpkt[1] != 0) { + /* Netmeeting sends TPKT header and data separately */ + if (info->tpkt_len[dir] > 0) { + pr_debug("nf_ct_h323: previous packet " + "indicated separate TPKT data of %hu " + "bytes\n", info->tpkt_len[dir]); + if (info->tpkt_len[dir] <= tcpdatalen) { + /* Yes, there was a TPKT header + * received */ + *data = tpkt; + *datalen = info->tpkt_len[dir]; + *dataoff = 0; + goto out; + } + + /* Fragmented TPKT */ + pr_debug("nf_ct_h323: fragmented TPKT\n"); + goto clear_out; + } + + /* It is not even a TPKT */ + return 0; + } + tpktoff = 0; + } else { /* Next TPKT */ + tpktoff = *dataoff + *datalen; + tcpdatalen -= tpktoff; + if (tcpdatalen <= 4) /* No more TPKT */ + goto clear_out; + tpkt = *data + *datalen; + + /* Validate TPKT identifier */ + if (tpkt[0] != 0x03 || tpkt[1] != 0) + goto clear_out; + } + + /* Validate TPKT length */ + tpktlen = tpkt[2] * 256 + tpkt[3]; + if (tpktlen < 4) + goto clear_out; + if (tpktlen > tcpdatalen) { + if (tcpdatalen == 4) { /* Separate TPKT header */ + /* Netmeeting sends TPKT header and data separately */ + pr_debug("nf_ct_h323: separate TPKT header indicates " + "there will be TPKT data of %hu bytes\n", + tpktlen - 4); + info->tpkt_len[dir] = tpktlen - 4; + return 0; + } + + pr_debug("nf_ct_h323: incomplete TPKT (fragmented?)\n"); + goto clear_out; + } + + /* This is the encapsulated data */ + *data = tpkt + 4; + *datalen = tpktlen - 4; + *dataoff = tpktoff + 4; + + out: + /* Clear TPKT length */ + info->tpkt_len[dir] = 0; + return 1; + + clear_out: + info->tpkt_len[dir] = 0; + return 0; +} + +/****************************************************************************/ +static int get_h245_addr(struct nf_conn *ct, const unsigned char *data, + H245_TransportAddress *taddr, + union nf_inet_addr *addr, __be16 *port) +{ + const unsigned char *p; + int len; + + if (taddr->choice != eH245_TransportAddress_unicastAddress) + return 0; + + switch (taddr->unicastAddress.choice) { + case eUnicastAddress_iPAddress: + if (nf_ct_l3num(ct) != AF_INET) + return 0; + p = data + taddr->unicastAddress.iPAddress.network; + len = 4; + break; + case eUnicastAddress_iP6Address: + if (nf_ct_l3num(ct) != AF_INET6) + return 0; + p = data + taddr->unicastAddress.iP6Address.network; + len = 16; + break; + default: + return 0; + } + + memcpy(addr, p, len); + memset((void *)addr + len, 0, sizeof(*addr) - len); + memcpy(port, p + len, sizeof(__be16)); + + return 1; +} + +/****************************************************************************/ +static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + H245_TransportAddress *taddr) +{ + int dir = CTINFO2DIR(ctinfo); + int ret = 0; + __be16 port; + __be16 rtp_port, rtcp_port; + union nf_inet_addr addr; + struct nf_conntrack_expect 
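/****************************************************************************/
/* A small user-space sketch of the TPKT framing (RFC 1006) that
 * get_tpkt_data() above pulls out of the TCP stream: a 4-octet header of
 * version 0x03, a reserved 0x00 and a 16-bit big-endian length that
 * includes the header itself.  Fragmented TPKTs and the NetMeeting
 * header/data split handled above are ignored here. */
#include <stddef.h>

/* Returns the payload length and sets *payload, or -1 on a bad header. */
int tpkt_payload(const unsigned char *buf, size_t avail,
		 const unsigned char **payload)
{
	size_t tpktlen;

	if (avail < 4 || buf[0] != 0x03 || buf[1] != 0x00)
		return -1;
	tpktlen = ((size_t)buf[2] << 8) | buf[3];
	if (tpktlen < 4 || tpktlen > avail)	/* truncated or bogus */
		return -1;
	*payload = buf + 4;
	return (int)(tpktlen - 4);		/* the encapsulated Q.931/H.245 */
}
/****************************************************************************/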
*rtp_exp; + struct nf_conntrack_expect *rtcp_exp; + typeof(nat_rtp_rtcp_hook) nat_rtp_rtcp; + + /* Read RTP or RTCP address */ + if (!get_h245_addr(ct, *data, taddr, &addr, &port) || + memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) || + port == 0) + return 0; + + /* RTP port is even */ + port &= htons(~1); + rtp_port = port; + rtcp_port = htons(ntohs(port) + 1); + + /* Create expect for RTP */ + if ((rtp_exp = nf_ct_expect_alloc(ct)) == NULL) + return -1; + nf_ct_expect_init(rtp_exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + IPPROTO_UDP, NULL, &rtp_port); + + /* Create expect for RTCP */ + if ((rtcp_exp = nf_ct_expect_alloc(ct)) == NULL) { + nf_ct_expect_put(rtp_exp); + return -1; + } + nf_ct_expect_init(rtcp_exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + IPPROTO_UDP, NULL, &rtcp_port); + + if (memcmp(&ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + sizeof(ct->tuplehash[dir].tuple.src.u3)) && + (nat_rtp_rtcp = rcu_dereference(nat_rtp_rtcp_hook)) && + ct->status & IPS_NAT_MASK) { + /* NAT needed */ + ret = nat_rtp_rtcp(skb, ct, ctinfo, data, dataoff, + taddr, port, rtp_port, rtp_exp, rtcp_exp); + } else { /* Conntrack only */ + if (nf_ct_expect_related(rtp_exp) == 0) { + if (nf_ct_expect_related(rtcp_exp) == 0) { + pr_debug("nf_ct_h323: expect RTP "); + nf_ct_dump_tuple(&rtp_exp->tuple); + pr_debug("nf_ct_h323: expect RTCP "); + nf_ct_dump_tuple(&rtcp_exp->tuple); + } else { + nf_ct_unexpect_related(rtp_exp); + ret = -1; + } + } else + ret = -1; + } + + nf_ct_expect_put(rtp_exp); + nf_ct_expect_put(rtcp_exp); + + return ret; +} + +/****************************************************************************/ +static int expect_t120(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + H245_TransportAddress *taddr) +{ + int dir = CTINFO2DIR(ctinfo); + int ret = 0; + __be16 port; + union nf_inet_addr addr; + struct nf_conntrack_expect *exp; + typeof(nat_t120_hook) nat_t120; + + /* Read T.120 address */ + if (!get_h245_addr(ct, *data, taddr, &addr, &port) || + memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) || + port == 0) + return 0; + + /* Create expect for T.120 connections */ + if ((exp = nf_ct_expect_alloc(ct)) == NULL) + return -1; + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + IPPROTO_TCP, NULL, &port); + exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple channels */ + + if (memcmp(&ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + sizeof(ct->tuplehash[dir].tuple.src.u3)) && + (nat_t120 = rcu_dereference(nat_t120_hook)) && + ct->status & IPS_NAT_MASK) { + /* NAT needed */ + ret = nat_t120(skb, ct, ctinfo, data, dataoff, taddr, + port, exp); + } else { /* Conntrack only */ + if (nf_ct_expect_related(exp) == 0) { + pr_debug("nf_ct_h323: expect T.120 "); + nf_ct_dump_tuple(&exp->tuple); + } else + ret = -1; + } + + nf_ct_expect_put(exp); + + return ret; +} + +/****************************************************************************/ +static int process_h245_channel(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + H2250LogicalChannelParameters *channel) +{ + int ret; + + if (channel->options & 
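/****************************************************************************/
/* expect_rtp_rtcp() above relies on the RTP convention that media uses an
 * even UDP port and RTCP the next odd one.  A user-space illustration of
 * the same byte-order-safe arithmetic (the kernel works on __be16 values;
 * here plain uint16_t in network order via htons/ntohs): */
#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
	uint16_t port = htons(5007);		/* as announced in the H.245 TSAP */
	uint16_t rtp_port, rtcp_port;

	port &= htons(~1);			/* round down to the even port */
	rtp_port = port;
	rtcp_port = htons(ntohs(port) + 1);	/* RTCP = RTP + 1 */

	printf("rtp=%u rtcp=%u\n", ntohs(rtp_port), ntohs(rtcp_port));
	return 0;				/* prints rtp=5006 rtcp=5007 */
}
/****************************************************************************/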
eH2250LogicalChannelParameters_mediaChannel) { + /* RTP */ + ret = expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff, + &channel->mediaChannel); + if (ret < 0) + return -1; + } + + if (channel-> + options & eH2250LogicalChannelParameters_mediaControlChannel) { + /* RTCP */ + ret = expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff, + &channel->mediaControlChannel); + if (ret < 0) + return -1; + } + + return 0; +} + +/****************************************************************************/ +static int process_olc(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + OpenLogicalChannel *olc) +{ + int ret; + + pr_debug("nf_ct_h323: OpenLogicalChannel\n"); + + if (olc->forwardLogicalChannelParameters.multiplexParameters.choice == + eOpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters) + { + ret = process_h245_channel(skb, ct, ctinfo, data, dataoff, + &olc-> + forwardLogicalChannelParameters. + multiplexParameters. + h2250LogicalChannelParameters); + if (ret < 0) + return -1; + } + + if ((olc->options & + eOpenLogicalChannel_reverseLogicalChannelParameters) && + (olc->reverseLogicalChannelParameters.options & + eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters) + && (olc->reverseLogicalChannelParameters.multiplexParameters. + choice == + eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters)) + { + ret = + process_h245_channel(skb, ct, ctinfo, data, dataoff, + &olc-> + reverseLogicalChannelParameters. + multiplexParameters. + h2250LogicalChannelParameters); + if (ret < 0) + return -1; + } + + if ((olc->options & eOpenLogicalChannel_separateStack) && + olc->forwardLogicalChannelParameters.dataType.choice == + eDataType_data && + olc->forwardLogicalChannelParameters.dataType.data.application. + choice == eDataApplicationCapability_application_t120 && + olc->forwardLogicalChannelParameters.dataType.data.application. + t120.choice == eDataProtocolCapability_separateLANStack && + olc->separateStack.networkAddress.choice == + eNetworkAccessParameters_networkAddress_localAreaAddress) { + ret = expect_t120(skb, ct, ctinfo, data, dataoff, + &olc->separateStack.networkAddress. + localAreaAddress); + if (ret < 0) + return -1; + } + + return 0; +} + +/****************************************************************************/ +static int process_olca(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + OpenLogicalChannelAck *olca) +{ + H2250LogicalChannelAckParameters *ack; + int ret; + + pr_debug("nf_ct_h323: OpenLogicalChannelAck\n"); + + if ((olca->options & + eOpenLogicalChannelAck_reverseLogicalChannelParameters) && + (olca->reverseLogicalChannelParameters.options & + eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters) + && (olca->reverseLogicalChannelParameters.multiplexParameters. + choice == + eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters)) + { + ret = process_h245_channel(skb, ct, ctinfo, data, dataoff, + &olca-> + reverseLogicalChannelParameters. + multiplexParameters. 
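/****************************************************************************/
/* decode_seq() stores the PER presence bitmap MSB-first into the options
 * word of the decoded struct, and handlers such as process_h245_channel()
 * and process_olc() above then test individual eXxx flags against it.  A
 * compact user-space rendering of that pattern (the struct, flag values
 * and field names are illustrative stand-ins, not the generated H.323
 * types): */
#include <stdio.h>

struct decoded_seq {
	unsigned int options;		/* presence bitmap, MSB = first OPT field */
	unsigned int media_port;	/* meaningful only if its bit is set */
};

#define OPT_MEDIA_CHANNEL	(0x80000000U >> 0)	/* first optional field */
#define OPT_MEDIA_CONTROL	(0x80000000U >> 1)	/* second optional field */

static void handle(const struct decoded_seq *s)
{
	if (s->options & OPT_MEDIA_CHANNEL)
		printf("mediaChannel present, port %u\n", s->media_port);
	if (s->options & OPT_MEDIA_CONTROL)
		printf("mediaControlChannel present\n");
}

int main(void)
{
	/* as if decode_seq() had read the presence bits "10" from the wire */
	struct decoded_seq s = { .options = OPT_MEDIA_CHANNEL, .media_port = 5006 };

	handle(&s);
	return 0;
}
/****************************************************************************/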
+ h2250LogicalChannelParameters); + if (ret < 0) + return -1; + } + + if ((olca->options & + eOpenLogicalChannelAck_forwardMultiplexAckParameters) && + (olca->forwardMultiplexAckParameters.choice == + eOpenLogicalChannelAck_forwardMultiplexAckParameters_h2250LogicalChannelAckParameters)) + { + ack = &olca->forwardMultiplexAckParameters. + h2250LogicalChannelAckParameters; + if (ack->options & + eH2250LogicalChannelAckParameters_mediaChannel) { + /* RTP */ + ret = expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff, + &ack->mediaChannel); + if (ret < 0) + return -1; + } + + if (ack->options & + eH2250LogicalChannelAckParameters_mediaControlChannel) { + /* RTCP */ + ret = expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff, + &ack->mediaControlChannel); + if (ret < 0) + return -1; + } + } + + if ((olca->options & eOpenLogicalChannelAck_separateStack) && + olca->separateStack.networkAddress.choice == + eNetworkAccessParameters_networkAddress_localAreaAddress) { + ret = expect_t120(skb, ct, ctinfo, data, dataoff, + &olca->separateStack.networkAddress. + localAreaAddress); + if (ret < 0) + return -1; + } + + return 0; +} + +/****************************************************************************/ +static int process_h245(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + MultimediaSystemControlMessage *mscm) +{ + switch (mscm->choice) { + case eMultimediaSystemControlMessage_request: + if (mscm->request.choice == + eRequestMessage_openLogicalChannel) { + return process_olc(skb, ct, ctinfo, data, dataoff, + &mscm->request.openLogicalChannel); + } + pr_debug("nf_ct_h323: H.245 Request %d\n", + mscm->request.choice); + break; + case eMultimediaSystemControlMessage_response: + if (mscm->response.choice == + eResponseMessage_openLogicalChannelAck) { + return process_olca(skb, ct, ctinfo, data, dataoff, + &mscm->response. + openLogicalChannelAck); + } + pr_debug("nf_ct_h323: H.245 Response %d\n", + mscm->response.choice); + break; + default: + pr_debug("nf_ct_h323: H.245 signal %d\n", mscm->choice); + break; + } + + return 0; +} + +/****************************************************************************/ +static int h245_help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + static MultimediaSystemControlMessage mscm; + unsigned char *data = NULL; + int datalen; + int dataoff; + int ret; + + /* Until there's been traffic both ways, don't look in packets. */ + if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) + return NF_ACCEPT; + + pr_debug("nf_ct_h245: skblen = %u\n", skb->len); + + spin_lock_bh(&nf_h323_lock); + + /* Process each TPKT */ + while (get_tpkt_data(skb, protoff, ct, ctinfo, + &data, &datalen, &dataoff)) { + pr_debug("nf_ct_h245: TPKT len=%d ", datalen); + nf_ct_dump_tuple(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple); + + /* Decode H.245 signal */ + ret = DecodeMultimediaSystemControlMessage(data, datalen, + &mscm); + if (ret < 0) { + pr_debug("nf_ct_h245: decoding error: %s\n", + ret == H323_ERROR_BOUND ? 
+ "out of bound" : "out of range"); + /* We don't drop when decoding error */ + break; + } + + /* Process H.245 signal */ + if (process_h245(skb, ct, ctinfo, &data, dataoff, &mscm) < 0) + goto drop; + } + + spin_unlock_bh(&nf_h323_lock); + return NF_ACCEPT; + + drop: + spin_unlock_bh(&nf_h323_lock); + if (net_ratelimit()) + pr_info("nf_ct_h245: packet dropped\n"); + return NF_DROP; +} + +/****************************************************************************/ +static const struct nf_conntrack_expect_policy h245_exp_policy = { + .max_expected = H323_RTP_CHANNEL_MAX * 4 + 2 /* T.120 */, + .timeout = 240, +}; + +static struct nf_conntrack_helper nf_conntrack_helper_h245 __read_mostly = { + .name = "H.245", + .me = THIS_MODULE, + .tuple.src.l3num = AF_UNSPEC, + .tuple.dst.protonum = IPPROTO_UDP, + .help = h245_help, + .expect_policy = &h245_exp_policy, +}; + +/****************************************************************************/ +int get_h225_addr(struct nf_conn *ct, unsigned char *data, + TransportAddress *taddr, + union nf_inet_addr *addr, __be16 *port) +{ + const unsigned char *p; + int len; + + switch (taddr->choice) { + case eTransportAddress_ipAddress: + if (nf_ct_l3num(ct) != AF_INET) + return 0; + p = data + taddr->ipAddress.ip; + len = 4; + break; + case eTransportAddress_ip6Address: + if (nf_ct_l3num(ct) != AF_INET6) + return 0; + p = data + taddr->ip6Address.ip; + len = 16; + break; + default: + return 0; + } + + memcpy(addr, p, len); + memset((void *)addr + len, 0, sizeof(*addr) - len); + memcpy(port, p + len, sizeof(__be16)); + + return 1; +} + +/****************************************************************************/ +static int expect_h245(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress *taddr) +{ + int dir = CTINFO2DIR(ctinfo); + int ret = 0; + __be16 port; + union nf_inet_addr addr; + struct nf_conntrack_expect *exp; + typeof(nat_h245_hook) nat_h245; + + /* Read h245Address */ + if (!get_h225_addr(ct, *data, taddr, &addr, &port) || + memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) || + port == 0) + return 0; + + /* Create expect for h245 connection */ + if ((exp = nf_ct_expect_alloc(ct)) == NULL) + return -1; + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + IPPROTO_TCP, NULL, &port); + exp->helper = &nf_conntrack_helper_h245; + + if (memcmp(&ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + sizeof(ct->tuplehash[dir].tuple.src.u3)) && + (nat_h245 = rcu_dereference(nat_h245_hook)) && + ct->status & IPS_NAT_MASK) { + /* NAT needed */ + ret = nat_h245(skb, ct, ctinfo, data, dataoff, taddr, + port, exp); + } else { /* Conntrack only */ + if (nf_ct_expect_related(exp) == 0) { + pr_debug("nf_ct_q931: expect H.245 "); + nf_ct_dump_tuple(&exp->tuple); + } else + ret = -1; + } + + nf_ct_expect_put(exp); + + return ret; +} + +/* If the calling party is on the same side of the forward-to party, + * we don't need to track the second call */ +static int callforward_do_filter(const union nf_inet_addr *src, + const union nf_inet_addr *dst, + u_int8_t family) +{ + const struct nf_afinfo *afinfo; + int ret = 0; + + /* rcu_read_lock()ed by nf_hook_slow() */ + afinfo = nf_get_afinfo(family); + if (!afinfo) + return 0; + + switch (family) { + case AF_INET: { + struct flowi4 fl1, fl2; + struct rtable *rt1, *rt2; + + memset(&fl1, 0, sizeof(fl1)); + fl1.daddr = src->ip; 
+ + memset(&fl2, 0, sizeof(fl2)); + fl2.daddr = dst->ip; + if (!afinfo->route(&init_net, (struct dst_entry **)&rt1, + flowi4_to_flowi(&fl1), false)) { + if (!afinfo->route(&init_net, (struct dst_entry **)&rt2, + flowi4_to_flowi(&fl2), false)) { + if (rt1->rt_gateway == rt2->rt_gateway && + rt1->dst.dev == rt2->dst.dev) + ret = 1; + dst_release(&rt2->dst); + } + dst_release(&rt1->dst); + } + break; + } +#if defined(CONFIG_NF_CONNTRACK_IPV6) || \ + defined(CONFIG_NF_CONNTRACK_IPV6_MODULE) + case AF_INET6: { + struct flowi6 fl1, fl2; + struct rt6_info *rt1, *rt2; + + memset(&fl1, 0, sizeof(fl1)); + ipv6_addr_copy(&fl1.daddr, &src->in6); + + memset(&fl2, 0, sizeof(fl2)); + ipv6_addr_copy(&fl2.daddr, &dst->in6); + if (!afinfo->route(&init_net, (struct dst_entry **)&rt1, + flowi6_to_flowi(&fl1), false)) { + if (!afinfo->route(&init_net, (struct dst_entry **)&rt2, + flowi6_to_flowi(&fl2), false)) { + if (!memcmp(&rt1->rt6i_gateway, &rt2->rt6i_gateway, + sizeof(rt1->rt6i_gateway)) && + rt1->dst.dev == rt2->dst.dev) + ret = 1; + dst_release(&rt2->dst); + } + dst_release(&rt1->dst); + } + break; + } +#endif + } + return ret; + +} + +/****************************************************************************/ +static int expect_callforwarding(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress *taddr) +{ + int dir = CTINFO2DIR(ctinfo); + int ret = 0; + __be16 port; + union nf_inet_addr addr; + struct nf_conntrack_expect *exp; + typeof(nat_callforwarding_hook) nat_callforwarding; + + /* Read alternativeAddress */ + if (!get_h225_addr(ct, *data, taddr, &addr, &port) || port == 0) + return 0; + + /* If the calling party is on the same side of the forward-to party, + * we don't need to track the second call */ + if (callforward_filter && + callforward_do_filter(&addr, &ct->tuplehash[!dir].tuple.src.u3, + nf_ct_l3num(ct))) { + pr_debug("nf_ct_q931: Call Forwarding not tracked\n"); + return 0; + } + + /* Create expect for the second call leg */ + if ((exp = nf_ct_expect_alloc(ct)) == NULL) + return -1; + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, &addr, + IPPROTO_TCP, NULL, &port); + exp->helper = nf_conntrack_helper_q931; + + if (memcmp(&ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + sizeof(ct->tuplehash[dir].tuple.src.u3)) && + (nat_callforwarding = rcu_dereference(nat_callforwarding_hook)) && + ct->status & IPS_NAT_MASK) { + /* Need NAT */ + ret = nat_callforwarding(skb, ct, ctinfo, data, dataoff, + taddr, port, exp); + } else { /* Conntrack only */ + if (nf_ct_expect_related(exp) == 0) { + pr_debug("nf_ct_q931: expect Call Forwarding "); + nf_ct_dump_tuple(&exp->tuple); + } else + ret = -1; + } + + nf_ct_expect_put(exp); + + return ret; +} + +/****************************************************************************/ +static int process_setup(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + Setup_UUIE *setup) +{ + int dir = CTINFO2DIR(ctinfo); + int ret; + int i; + __be16 port; + union nf_inet_addr addr; + typeof(set_h225_addr_hook) set_h225_addr; + + pr_debug("nf_ct_q931: Setup\n"); + + if (setup->options & eSetup_UUIE_h245Address) { + ret = expect_h245(skb, ct, ctinfo, data, dataoff, + &setup->h245Address); + if (ret < 0) + return -1; + } + + set_h225_addr = rcu_dereference(set_h225_addr_hook); + if ((setup->options & eSetup_UUIE_destCallSignalAddress) && + 
(set_h225_addr) && ct->status & IPS_NAT_MASK && + get_h225_addr(ct, *data, &setup->destCallSignalAddress, + &addr, &port) && + memcmp(&addr, &ct->tuplehash[!dir].tuple.src.u3, sizeof(addr))) { + pr_debug("nf_ct_q931: set destCallSignalAddress %pI6:%hu->%pI6:%hu\n", + &addr, ntohs(port), &ct->tuplehash[!dir].tuple.src.u3, + ntohs(ct->tuplehash[!dir].tuple.src.u.tcp.port)); + ret = set_h225_addr(skb, data, dataoff, + &setup->destCallSignalAddress, + &ct->tuplehash[!dir].tuple.src.u3, + ct->tuplehash[!dir].tuple.src.u.tcp.port); + if (ret < 0) + return -1; + } + + if ((setup->options & eSetup_UUIE_sourceCallSignalAddress) && + (set_h225_addr) && ct->status & IPS_NAT_MASK && + get_h225_addr(ct, *data, &setup->sourceCallSignalAddress, + &addr, &port) && + memcmp(&addr, &ct->tuplehash[!dir].tuple.dst.u3, sizeof(addr))) { + pr_debug("nf_ct_q931: set sourceCallSignalAddress %pI6:%hu->%pI6:%hu\n", + &addr, ntohs(port), &ct->tuplehash[!dir].tuple.dst.u3, + ntohs(ct->tuplehash[!dir].tuple.dst.u.tcp.port)); + ret = set_h225_addr(skb, data, dataoff, + &setup->sourceCallSignalAddress, + &ct->tuplehash[!dir].tuple.dst.u3, + ct->tuplehash[!dir].tuple.dst.u.tcp.port); + if (ret < 0) + return -1; + } + + if (setup->options & eSetup_UUIE_fastStart) { + for (i = 0; i < setup->fastStart.count; i++) { + ret = process_olc(skb, ct, ctinfo, data, dataoff, + &setup->fastStart.item[i]); + if (ret < 0) + return -1; + } + } + + return 0; +} + +/****************************************************************************/ +static int process_callproceeding(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + CallProceeding_UUIE *callproc) +{ + int ret; + int i; + + pr_debug("nf_ct_q931: CallProceeding\n"); + + if (callproc->options & eCallProceeding_UUIE_h245Address) { + ret = expect_h245(skb, ct, ctinfo, data, dataoff, + &callproc->h245Address); + if (ret < 0) + return -1; + } + + if (callproc->options & eCallProceeding_UUIE_fastStart) { + for (i = 0; i < callproc->fastStart.count; i++) { + ret = process_olc(skb, ct, ctinfo, data, dataoff, + &callproc->fastStart.item[i]); + if (ret < 0) + return -1; + } + } + + return 0; +} + +/****************************************************************************/ +static int process_connect(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + Connect_UUIE *connect) +{ + int ret; + int i; + + pr_debug("nf_ct_q931: Connect\n"); + + if (connect->options & eConnect_UUIE_h245Address) { + ret = expect_h245(skb, ct, ctinfo, data, dataoff, + &connect->h245Address); + if (ret < 0) + return -1; + } + + if (connect->options & eConnect_UUIE_fastStart) { + for (i = 0; i < connect->fastStart.count; i++) { + ret = process_olc(skb, ct, ctinfo, data, dataoff, + &connect->fastStart.item[i]); + if (ret < 0) + return -1; + } + } + + return 0; +} + +/****************************************************************************/ +static int process_alerting(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + Alerting_UUIE *alert) +{ + int ret; + int i; + + pr_debug("nf_ct_q931: Alerting\n"); + + if (alert->options & eAlerting_UUIE_h245Address) { + ret = expect_h245(skb, ct, ctinfo, data, dataoff, + &alert->h245Address); + if (ret < 0) + return -1; + } + + if (alert->options & eAlerting_UUIE_fastStart) { + for (i = 0; i < alert->fastStart.count; i++) { + ret = process_olc(skb, ct, ctinfo, data, dataoff, + 
&alert->fastStart.item[i]); + if (ret < 0) + return -1; + } + } + + return 0; +} + +/****************************************************************************/ +static int process_facility(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + Facility_UUIE *facility) +{ + int ret; + int i; + + pr_debug("nf_ct_q931: Facility\n"); + + if (facility->reason.choice == eFacilityReason_callForwarded) { + if (facility->options & eFacility_UUIE_alternativeAddress) + return expect_callforwarding(skb, ct, ctinfo, data, + dataoff, + &facility-> + alternativeAddress); + return 0; + } + + if (facility->options & eFacility_UUIE_h245Address) { + ret = expect_h245(skb, ct, ctinfo, data, dataoff, + &facility->h245Address); + if (ret < 0) + return -1; + } + + if (facility->options & eFacility_UUIE_fastStart) { + for (i = 0; i < facility->fastStart.count; i++) { + ret = process_olc(skb, ct, ctinfo, data, dataoff, + &facility->fastStart.item[i]); + if (ret < 0) + return -1; + } + } + + return 0; +} + +/****************************************************************************/ +static int process_progress(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + Progress_UUIE *progress) +{ + int ret; + int i; + + pr_debug("nf_ct_q931: Progress\n"); + + if (progress->options & eProgress_UUIE_h245Address) { + ret = expect_h245(skb, ct, ctinfo, data, dataoff, + &progress->h245Address); + if (ret < 0) + return -1; + } + + if (progress->options & eProgress_UUIE_fastStart) { + for (i = 0; i < progress->fastStart.count; i++) { + ret = process_olc(skb, ct, ctinfo, data, dataoff, + &progress->fastStart.item[i]); + if (ret < 0) + return -1; + } + } + + return 0; +} + +/****************************************************************************/ +static int process_q931(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, Q931 *q931) +{ + H323_UU_PDU *pdu = &q931->UUIE.h323_uu_pdu; + int i; + int ret = 0; + + switch (pdu->h323_message_body.choice) { + case eH323_UU_PDU_h323_message_body_setup: + ret = process_setup(skb, ct, ctinfo, data, dataoff, + &pdu->h323_message_body.setup); + break; + case eH323_UU_PDU_h323_message_body_callProceeding: + ret = process_callproceeding(skb, ct, ctinfo, data, dataoff, + &pdu->h323_message_body. 
+ callProceeding); + break; + case eH323_UU_PDU_h323_message_body_connect: + ret = process_connect(skb, ct, ctinfo, data, dataoff, + &pdu->h323_message_body.connect); + break; + case eH323_UU_PDU_h323_message_body_alerting: + ret = process_alerting(skb, ct, ctinfo, data, dataoff, + &pdu->h323_message_body.alerting); + break; + case eH323_UU_PDU_h323_message_body_facility: + ret = process_facility(skb, ct, ctinfo, data, dataoff, + &pdu->h323_message_body.facility); + break; + case eH323_UU_PDU_h323_message_body_progress: + ret = process_progress(skb, ct, ctinfo, data, dataoff, + &pdu->h323_message_body.progress); + break; + default: + pr_debug("nf_ct_q931: Q.931 signal %d\n", + pdu->h323_message_body.choice); + break; + } + + if (ret < 0) + return -1; + + if (pdu->options & eH323_UU_PDU_h245Control) { + for (i = 0; i < pdu->h245Control.count; i++) { + ret = process_h245(skb, ct, ctinfo, data, dataoff, + &pdu->h245Control.item[i]); + if (ret < 0) + return -1; + } + } + + return 0; +} + +/****************************************************************************/ +static int q931_help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + static Q931 q931; + unsigned char *data = NULL; + int datalen; + int dataoff; + int ret; + + /* Until there's been traffic both ways, don't look in packets. */ + if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) + return NF_ACCEPT; + + pr_debug("nf_ct_q931: skblen = %u\n", skb->len); + + spin_lock_bh(&nf_h323_lock); + + /* Process each TPKT */ + while (get_tpkt_data(skb, protoff, ct, ctinfo, + &data, &datalen, &dataoff)) { + pr_debug("nf_ct_q931: TPKT len=%d ", datalen); + nf_ct_dump_tuple(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple); + + /* Decode Q.931 signal */ + ret = DecodeQ931(data, datalen, &q931); + if (ret < 0) { + pr_debug("nf_ct_q931: decoding error: %s\n", + ret == H323_ERROR_BOUND ? 
+ "out of bound" : "out of range"); + /* We don't drop when decoding error */ + break; + } + + /* Process Q.931 signal */ + if (process_q931(skb, ct, ctinfo, &data, dataoff, &q931) < 0) + goto drop; + } + + spin_unlock_bh(&nf_h323_lock); + return NF_ACCEPT; + + drop: + spin_unlock_bh(&nf_h323_lock); + if (net_ratelimit()) + pr_info("nf_ct_q931: packet dropped\n"); + return NF_DROP; +} + +/****************************************************************************/ +static const struct nf_conntrack_expect_policy q931_exp_policy = { + /* T.120 and H.245 */ + .max_expected = H323_RTP_CHANNEL_MAX * 4 + 4, + .timeout = 240, +}; + +static struct nf_conntrack_helper nf_conntrack_helper_q931[] __read_mostly = { + { + .name = "Q.931", + .me = THIS_MODULE, + .tuple.src.l3num = AF_INET, + .tuple.src.u.tcp.port = cpu_to_be16(Q931_PORT), + .tuple.dst.protonum = IPPROTO_TCP, + .help = q931_help, + .expect_policy = &q931_exp_policy, + }, + { + .name = "Q.931", + .me = THIS_MODULE, + .tuple.src.l3num = AF_INET6, + .tuple.src.u.tcp.port = cpu_to_be16(Q931_PORT), + .tuple.dst.protonum = IPPROTO_TCP, + .help = q931_help, + .expect_policy = &q931_exp_policy, + }, +}; + +/****************************************************************************/ +static unsigned char *get_udp_data(struct sk_buff *skb, unsigned int protoff, + int *datalen) +{ + const struct udphdr *uh; + struct udphdr _uh; + int dataoff; + + uh = skb_header_pointer(skb, protoff, sizeof(_uh), &_uh); + if (uh == NULL) + return NULL; + dataoff = protoff + sizeof(_uh); + if (dataoff >= skb->len) + return NULL; + *datalen = skb->len - dataoff; + return skb_header_pointer(skb, dataoff, *datalen, h323_buffer); +} + +/****************************************************************************/ +static struct nf_conntrack_expect *find_expect(struct nf_conn *ct, + union nf_inet_addr *addr, + __be16 port) +{ + struct net *net = nf_ct_net(ct); + struct nf_conntrack_expect *exp; + struct nf_conntrack_tuple tuple; + + memset(&tuple.src.u3, 0, sizeof(tuple.src.u3)); + tuple.src.u.tcp.port = 0; + memcpy(&tuple.dst.u3, addr, sizeof(tuple.dst.u3)); + tuple.dst.u.tcp.port = port; + tuple.dst.protonum = IPPROTO_TCP; + + exp = __nf_ct_expect_find(net, nf_ct_zone(ct), &tuple); + if (exp && exp->master == ct) + return exp; + return NULL; +} + +/****************************************************************************/ +static int set_expect_timeout(struct nf_conntrack_expect *exp, + unsigned timeout) +{ + if (!exp || !del_timer(&exp->timeout)) + return 0; + + exp->timeout.expires = jiffies + timeout * HZ; + add_timer(&exp->timeout); + + return 1; +} + +/****************************************************************************/ +static int expect_q931(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, + TransportAddress *taddr, int count) +{ + struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + int dir = CTINFO2DIR(ctinfo); + int ret = 0; + int i; + __be16 port; + union nf_inet_addr addr; + struct nf_conntrack_expect *exp; + typeof(nat_q931_hook) nat_q931; + + /* Look for the first related address */ + for (i = 0; i < count; i++) { + if (get_h225_addr(ct, *data, &taddr[i], &addr, &port) && + memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, + sizeof(addr)) == 0 && port != 0) + break; + } + + if (i >= count) /* Not found */ + return 0; + + /* Create expect for Q.931 */ + if ((exp = nf_ct_expect_alloc(ct)) == NULL) + return -1; + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), 
+ gkrouted_only ? /* only accept calls from GK? */ + &ct->tuplehash[!dir].tuple.src.u3 : NULL, + &ct->tuplehash[!dir].tuple.dst.u3, + IPPROTO_TCP, NULL, &port); + exp->helper = nf_conntrack_helper_q931; + exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple calls */ + + nat_q931 = rcu_dereference(nat_q931_hook); + if (nat_q931 && ct->status & IPS_NAT_MASK) { /* Need NAT */ + ret = nat_q931(skb, ct, ctinfo, data, taddr, i, port, exp); + } else { /* Conntrack only */ + if (nf_ct_expect_related(exp) == 0) { + pr_debug("nf_ct_ras: expect Q.931 "); + nf_ct_dump_tuple(&exp->tuple); + + /* Save port for looking up expect in processing RCF */ + info->sig_port[dir] = port; + } else + ret = -1; + } + + nf_ct_expect_put(exp); + + return ret; +} + +/****************************************************************************/ +static int process_grq(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, GatekeeperRequest *grq) +{ + typeof(set_ras_addr_hook) set_ras_addr; + + pr_debug("nf_ct_ras: GRQ\n"); + + set_ras_addr = rcu_dereference(set_ras_addr_hook); + if (set_ras_addr && ct->status & IPS_NAT_MASK) /* NATed */ + return set_ras_addr(skb, ct, ctinfo, data, + &grq->rasAddress, 1); + return 0; +} + +/****************************************************************************/ +static int process_gcf(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, GatekeeperConfirm *gcf) +{ + int dir = CTINFO2DIR(ctinfo); + int ret = 0; + __be16 port; + union nf_inet_addr addr; + struct nf_conntrack_expect *exp; + + pr_debug("nf_ct_ras: GCF\n"); + + if (!get_h225_addr(ct, *data, &gcf->rasAddress, &addr, &port)) + return 0; + + /* Registration port is the same as discovery port */ + if (!memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) && + port == ct->tuplehash[dir].tuple.src.u.udp.port) + return 0; + + /* Avoid RAS expectation loops. A GCF is never expected. 
*/ + if (test_bit(IPS_EXPECTED_BIT, &ct->status)) + return 0; + + /* Need new expect */ + if ((exp = nf_ct_expect_alloc(ct)) == NULL) + return -1; + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, &addr, + IPPROTO_UDP, NULL, &port); + exp->helper = nf_conntrack_helper_ras; + + if (nf_ct_expect_related(exp) == 0) { + pr_debug("nf_ct_ras: expect RAS "); + nf_ct_dump_tuple(&exp->tuple); + } else + ret = -1; + + nf_ct_expect_put(exp); + + return ret; +} + +/****************************************************************************/ +static int process_rrq(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, RegistrationRequest *rrq) +{ + struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + int ret; + typeof(set_ras_addr_hook) set_ras_addr; + + pr_debug("nf_ct_ras: RRQ\n"); + + ret = expect_q931(skb, ct, ctinfo, data, + rrq->callSignalAddress.item, + rrq->callSignalAddress.count); + if (ret < 0) + return -1; + + set_ras_addr = rcu_dereference(set_ras_addr_hook); + if (set_ras_addr && ct->status & IPS_NAT_MASK) { + ret = set_ras_addr(skb, ct, ctinfo, data, + rrq->rasAddress.item, + rrq->rasAddress.count); + if (ret < 0) + return -1; + } + + if (rrq->options & eRegistrationRequest_timeToLive) { + pr_debug("nf_ct_ras: RRQ TTL = %u seconds\n", rrq->timeToLive); + info->timeout = rrq->timeToLive; + } else + info->timeout = default_rrq_ttl; + + return 0; +} + +/****************************************************************************/ +static int process_rcf(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, RegistrationConfirm *rcf) +{ + struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + int dir = CTINFO2DIR(ctinfo); + int ret; + struct nf_conntrack_expect *exp; + typeof(set_sig_addr_hook) set_sig_addr; + + pr_debug("nf_ct_ras: RCF\n"); + + set_sig_addr = rcu_dereference(set_sig_addr_hook); + if (set_sig_addr && ct->status & IPS_NAT_MASK) { + ret = set_sig_addr(skb, ct, ctinfo, data, + rcf->callSignalAddress.item, + rcf->callSignalAddress.count); + if (ret < 0) + return -1; + } + + if (rcf->options & eRegistrationConfirm_timeToLive) { + pr_debug("nf_ct_ras: RCF TTL = %u seconds\n", rcf->timeToLive); + info->timeout = rcf->timeToLive; + } + + if (info->timeout > 0) { + pr_debug("nf_ct_ras: set RAS connection timeout to " + "%u seconds\n", info->timeout); + nf_ct_refresh(ct, skb, info->timeout * HZ); + + /* Set expect timeout */ + spin_lock_bh(&nf_conntrack_lock); + exp = find_expect(ct, &ct->tuplehash[dir].tuple.dst.u3, + info->sig_port[!dir]); + if (exp) { + pr_debug("nf_ct_ras: set Q.931 expect " + "timeout to %u seconds for", + info->timeout); + nf_ct_dump_tuple(&exp->tuple); + set_expect_timeout(exp, info->timeout); + } + spin_unlock_bh(&nf_conntrack_lock); + } + + return 0; +} + +/****************************************************************************/ +static int process_urq(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, UnregistrationRequest *urq) +{ + struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + int dir = CTINFO2DIR(ctinfo); + int ret; + typeof(set_sig_addr_hook) set_sig_addr; + + pr_debug("nf_ct_ras: URQ\n"); + + set_sig_addr = rcu_dereference(set_sig_addr_hook); + if (set_sig_addr && ct->status & IPS_NAT_MASK) { + ret = set_sig_addr(skb, ct, ctinfo, data, + urq->callSignalAddress.item, + urq->callSignalAddress.count); + if 
(ret < 0) + return -1; + } + + /* Clear old expect */ + nf_ct_remove_expectations(ct); + info->sig_port[dir] = 0; + info->sig_port[!dir] = 0; + + /* Give it 30 seconds for UCF or URJ */ + nf_ct_refresh(ct, skb, 30 * HZ); + + return 0; +} + +/****************************************************************************/ +static int process_arq(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, AdmissionRequest *arq) +{ + const struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + int dir = CTINFO2DIR(ctinfo); + __be16 port; + union nf_inet_addr addr; + typeof(set_h225_addr_hook) set_h225_addr; + + pr_debug("nf_ct_ras: ARQ\n"); + + set_h225_addr = rcu_dereference(set_h225_addr_hook); + if ((arq->options & eAdmissionRequest_destCallSignalAddress) && + get_h225_addr(ct, *data, &arq->destCallSignalAddress, + &addr, &port) && + !memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) && + port == info->sig_port[dir] && + set_h225_addr && ct->status & IPS_NAT_MASK) { + /* Answering ARQ */ + return set_h225_addr(skb, data, 0, + &arq->destCallSignalAddress, + &ct->tuplehash[!dir].tuple.dst.u3, + info->sig_port[!dir]); + } + + if ((arq->options & eAdmissionRequest_srcCallSignalAddress) && + get_h225_addr(ct, *data, &arq->srcCallSignalAddress, + &addr, &port) && + !memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) && + set_h225_addr && ct->status & IPS_NAT_MASK) { + /* Calling ARQ */ + return set_h225_addr(skb, data, 0, + &arq->srcCallSignalAddress, + &ct->tuplehash[!dir].tuple.dst.u3, + port); + } + + return 0; +} + +/****************************************************************************/ +static int process_acf(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, AdmissionConfirm *acf) +{ + int dir = CTINFO2DIR(ctinfo); + int ret = 0; + __be16 port; + union nf_inet_addr addr; + struct nf_conntrack_expect *exp; + typeof(set_sig_addr_hook) set_sig_addr; + + pr_debug("nf_ct_ras: ACF\n"); + + if (!get_h225_addr(ct, *data, &acf->destCallSignalAddress, + &addr, &port)) + return 0; + + if (!memcmp(&addr, &ct->tuplehash[dir].tuple.dst.u3, sizeof(addr))) { + /* Answering ACF */ + set_sig_addr = rcu_dereference(set_sig_addr_hook); + if (set_sig_addr && ct->status & IPS_NAT_MASK) + return set_sig_addr(skb, ct, ctinfo, data, + &acf->destCallSignalAddress, 1); + return 0; + } + + /* Need new expect */ + if ((exp = nf_ct_expect_alloc(ct)) == NULL) + return -1; + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, &addr, + IPPROTO_TCP, NULL, &port); + exp->flags = NF_CT_EXPECT_PERMANENT; + exp->helper = nf_conntrack_helper_q931; + + if (nf_ct_expect_related(exp) == 0) { + pr_debug("nf_ct_ras: expect Q.931 "); + nf_ct_dump_tuple(&exp->tuple); + } else + ret = -1; + + nf_ct_expect_put(exp); + + return ret; +} + +/****************************************************************************/ +static int process_lrq(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, LocationRequest *lrq) +{ + typeof(set_ras_addr_hook) set_ras_addr; + + pr_debug("nf_ct_ras: LRQ\n"); + + set_ras_addr = rcu_dereference(set_ras_addr_hook); + if (set_ras_addr && ct->status & IPS_NAT_MASK) + return set_ras_addr(skb, ct, ctinfo, data, + &lrq->replyAddress, 1); + return 0; +} + +/****************************************************************************/ +static int process_lcf(struct sk_buff *skb, struct nf_conn 
*ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, LocationConfirm *lcf) +{ + int dir = CTINFO2DIR(ctinfo); + int ret = 0; + __be16 port; + union nf_inet_addr addr; + struct nf_conntrack_expect *exp; + + pr_debug("nf_ct_ras: LCF\n"); + + if (!get_h225_addr(ct, *data, &lcf->callSignalAddress, + &addr, &port)) + return 0; + + /* Need new expect for call signal */ + if ((exp = nf_ct_expect_alloc(ct)) == NULL) + return -1; + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, &addr, + IPPROTO_TCP, NULL, &port); + exp->flags = NF_CT_EXPECT_PERMANENT; + exp->helper = nf_conntrack_helper_q931; + + if (nf_ct_expect_related(exp) == 0) { + pr_debug("nf_ct_ras: expect Q.931 "); + nf_ct_dump_tuple(&exp->tuple); + } else + ret = -1; + + nf_ct_expect_put(exp); + + /* Ignore rasAddress */ + + return ret; +} + +/****************************************************************************/ +static int process_irr(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, InfoRequestResponse *irr) +{ + int ret; + typeof(set_ras_addr_hook) set_ras_addr; + typeof(set_sig_addr_hook) set_sig_addr; + + pr_debug("nf_ct_ras: IRR\n"); + + set_ras_addr = rcu_dereference(set_ras_addr_hook); + if (set_ras_addr && ct->status & IPS_NAT_MASK) { + ret = set_ras_addr(skb, ct, ctinfo, data, + &irr->rasAddress, 1); + if (ret < 0) + return -1; + } + + set_sig_addr = rcu_dereference(set_sig_addr_hook); + if (set_sig_addr && ct->status & IPS_NAT_MASK) { + ret = set_sig_addr(skb, ct, ctinfo, data, + irr->callSignalAddress.item, + irr->callSignalAddress.count); + if (ret < 0) + return -1; + } + + return 0; +} + +/****************************************************************************/ +static int process_ras(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, RasMessage *ras) +{ + switch (ras->choice) { + case eRasMessage_gatekeeperRequest: + return process_grq(skb, ct, ctinfo, data, + &ras->gatekeeperRequest); + case eRasMessage_gatekeeperConfirm: + return process_gcf(skb, ct, ctinfo, data, + &ras->gatekeeperConfirm); + case eRasMessage_registrationRequest: + return process_rrq(skb, ct, ctinfo, data, + &ras->registrationRequest); + case eRasMessage_registrationConfirm: + return process_rcf(skb, ct, ctinfo, data, + &ras->registrationConfirm); + case eRasMessage_unregistrationRequest: + return process_urq(skb, ct, ctinfo, data, + &ras->unregistrationRequest); + case eRasMessage_admissionRequest: + return process_arq(skb, ct, ctinfo, data, + &ras->admissionRequest); + case eRasMessage_admissionConfirm: + return process_acf(skb, ct, ctinfo, data, + &ras->admissionConfirm); + case eRasMessage_locationRequest: + return process_lrq(skb, ct, ctinfo, data, + &ras->locationRequest); + case eRasMessage_locationConfirm: + return process_lcf(skb, ct, ctinfo, data, + &ras->locationConfirm); + case eRasMessage_infoRequestResponse: + return process_irr(skb, ct, ctinfo, data, + &ras->infoRequestResponse); + default: + pr_debug("nf_ct_ras: RAS message %d\n", ras->choice); + break; + } + + return 0; +} + +/****************************************************************************/ +static int ras_help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + static RasMessage ras; + unsigned char *data; + int datalen = 0; + int ret; + + pr_debug("nf_ct_ras: skblen = %u\n", skb->len); + + spin_lock_bh(&nf_h323_lock); + + /* Get UDP data */ + data = 
get_udp_data(skb, protoff, &datalen); + if (data == NULL) + goto accept; + pr_debug("nf_ct_ras: RAS message len=%d ", datalen); + nf_ct_dump_tuple(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple); + + /* Decode RAS message */ + ret = DecodeRasMessage(data, datalen, &ras); + if (ret < 0) { + pr_debug("nf_ct_ras: decoding error: %s\n", + ret == H323_ERROR_BOUND ? + "out of bound" : "out of range"); + goto accept; + } + + /* Process RAS message */ + if (process_ras(skb, ct, ctinfo, &data, &ras) < 0) + goto drop; + + accept: + spin_unlock_bh(&nf_h323_lock); + return NF_ACCEPT; + + drop: + spin_unlock_bh(&nf_h323_lock); + if (net_ratelimit()) + pr_info("nf_ct_ras: packet dropped\n"); + return NF_DROP; +} + +/****************************************************************************/ +static const struct nf_conntrack_expect_policy ras_exp_policy = { + .max_expected = 32, + .timeout = 240, +}; + +static struct nf_conntrack_helper nf_conntrack_helper_ras[] __read_mostly = { + { + .name = "RAS", + .me = THIS_MODULE, + .tuple.src.l3num = AF_INET, + .tuple.src.u.udp.port = cpu_to_be16(RAS_PORT), + .tuple.dst.protonum = IPPROTO_UDP, + .help = ras_help, + .expect_policy = &ras_exp_policy, + }, + { + .name = "RAS", + .me = THIS_MODULE, + .tuple.src.l3num = AF_INET6, + .tuple.src.u.udp.port = cpu_to_be16(RAS_PORT), + .tuple.dst.protonum = IPPROTO_UDP, + .help = ras_help, + .expect_policy = &ras_exp_policy, + }, +}; + +/****************************************************************************/ +static void __exit nf_conntrack_h323_fini(void) +{ + nf_conntrack_helper_unregister(&nf_conntrack_helper_ras[1]); + nf_conntrack_helper_unregister(&nf_conntrack_helper_ras[0]); + nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[1]); + nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[0]); + nf_conntrack_helper_unregister(&nf_conntrack_helper_h245); + kfree(h323_buffer); + pr_debug("nf_ct_h323: fini\n"); +} + +/****************************************************************************/ +static int __init nf_conntrack_h323_init(void) +{ + int ret; + + h323_buffer = kmalloc(65536, GFP_KERNEL); + if (!h323_buffer) + return -ENOMEM; + ret = nf_conntrack_helper_register(&nf_conntrack_helper_h245); + if (ret < 0) + goto err1; + ret = nf_conntrack_helper_register(&nf_conntrack_helper_q931[0]); + if (ret < 0) + goto err2; + ret = nf_conntrack_helper_register(&nf_conntrack_helper_q931[1]); + if (ret < 0) + goto err3; + ret = nf_conntrack_helper_register(&nf_conntrack_helper_ras[0]); + if (ret < 0) + goto err4; + ret = nf_conntrack_helper_register(&nf_conntrack_helper_ras[1]); + if (ret < 0) + goto err5; + pr_debug("nf_ct_h323: init success\n"); + return 0; + +err5: + nf_conntrack_helper_unregister(&nf_conntrack_helper_ras[0]); +err4: + nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[1]); +err3: + nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[0]); +err2: + nf_conntrack_helper_unregister(&nf_conntrack_helper_h245); +err1: + kfree(h323_buffer); + return ret; +} + +/****************************************************************************/ +module_init(nf_conntrack_h323_init); +module_exit(nf_conntrack_h323_fini); + +EXPORT_SYMBOL_GPL(get_h225_addr); +EXPORT_SYMBOL_GPL(set_h245_addr_hook); +EXPORT_SYMBOL_GPL(set_h225_addr_hook); +EXPORT_SYMBOL_GPL(set_sig_addr_hook); +EXPORT_SYMBOL_GPL(set_ras_addr_hook); +EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook); +EXPORT_SYMBOL_GPL(nat_t120_hook); +EXPORT_SYMBOL_GPL(nat_h245_hook); +EXPORT_SYMBOL_GPL(nat_callforwarding_hook); +EXPORT_SYMBOL_GPL(nat_q931_hook); 
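/****************************************************************************/
/* nf_conntrack_h323_init() and nf_conntrack_h323_fini() above use the
 * usual kernel idiom of unwinding partially completed registrations
 * through a chain of error labels.  The same shape in a plain user-space
 * program, with malloc-backed acquire_a()/acquire_b() standing in for the
 * helper registrations: */
#include <stdio.h>
#include <stdlib.h>

static void *acquire_a(void) { return malloc(16); }	/* stand-in resource */
static void *acquire_b(void) { return malloc(32); }	/* stand-in resource */

static void *res_a, *res_b, *buffer;

static int setup(void)
{
	buffer = malloc(65536);		/* mirrors the h323_buffer allocation */
	if (!buffer)
		return -1;
	res_a = acquire_a();
	if (!res_a)
		goto err1;
	res_b = acquire_b();
	if (!res_b)
		goto err2;
	return 0;			/* everything acquired */

err2:
	free(res_a);			/* undo in reverse order of setup */
err1:
	free(buffer);
	return -1;
}

static void teardown(void)
{
	free(res_b);
	free(res_a);
	free(buffer);
}

int main(void)
{
	if (setup() != 0) {
		fprintf(stderr, "setup failed\n");
		return 1;
	}
	teardown();
	return 0;
}
/****************************************************************************/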
+ +MODULE_AUTHOR("Jing Min Zhao "); +MODULE_DESCRIPTION("H.323 connection tracking helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ip_conntrack_h323"); +MODULE_ALIAS_NFCT_HELPER("h323"); diff --git a/net/netfilter/nf_conntrack_h323_types.c b/net/netfilter/nf_conntrack_h323_types.c new file mode 100644 index 00000000..d880f352 --- /dev/null +++ b/net/netfilter/nf_conntrack_h323_types.c @@ -0,0 +1,1922 @@ +/* Generated by Jing Min Zhao's ASN.1 parser, May 16 2007 + * + * Copyright (c) 2006 Jing Min Zhao + * + * This source code is licensed under General Public License version 2. + */ + +static const struct field_t _TransportAddress_ipAddress[] = { /* SEQUENCE */ + {FNAME("ip") OCTSTR, FIXD, 4, 0, DECODE, + offsetof(TransportAddress_ipAddress, ip), NULL}, + {FNAME("port") INT, WORD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _TransportAddress_ipSourceRoute_route[] = { /* SEQUENCE OF */ + {FNAME("item") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _TransportAddress_ipSourceRoute_routing[] = { /* CHOICE */ + {FNAME("strict") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("loose") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _TransportAddress_ipSourceRoute[] = { /* SEQUENCE */ + {FNAME("ip") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL}, + {FNAME("port") INT, WORD, 0, 0, SKIP, 0, NULL}, + {FNAME("route") SEQOF, SEMI, 0, 0, SKIP, 0, + _TransportAddress_ipSourceRoute_route}, + {FNAME("routing") CHOICE, 1, 2, 2, SKIP | EXT, 0, + _TransportAddress_ipSourceRoute_routing}, +}; + +static const struct field_t _TransportAddress_ipxAddress[] = { /* SEQUENCE */ + {FNAME("node") OCTSTR, FIXD, 6, 0, SKIP, 0, NULL}, + {FNAME("netnum") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL}, + {FNAME("port") OCTSTR, FIXD, 2, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _TransportAddress_ip6Address[] = { /* SEQUENCE */ + {FNAME("ip") OCTSTR, FIXD, 16, 0, DECODE, + offsetof(TransportAddress_ip6Address, ip), NULL}, + {FNAME("port") INT, WORD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H221NonStandard[] = { /* SEQUENCE */ + {FNAME("t35CountryCode") INT, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("t35Extension") INT, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("manufacturerCode") INT, WORD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _NonStandardIdentifier[] = { /* CHOICE */ + {FNAME("object") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("h221NonStandard") SEQ, 0, 3, 3, SKIP | EXT, 0, + _H221NonStandard}, +}; + +static const struct field_t _NonStandardParameter[] = { /* SEQUENCE */ + {FNAME("nonStandardIdentifier") CHOICE, 1, 2, 2, SKIP | EXT, 0, + _NonStandardIdentifier}, + {FNAME("data") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _TransportAddress[] = { /* CHOICE */ + {FNAME("ipAddress") SEQ, 0, 2, 2, DECODE, + offsetof(TransportAddress, ipAddress), _TransportAddress_ipAddress}, + {FNAME("ipSourceRoute") SEQ, 0, 4, 4, SKIP | EXT, 0, + _TransportAddress_ipSourceRoute}, + {FNAME("ipxAddress") SEQ, 0, 3, 3, SKIP, 0, + _TransportAddress_ipxAddress}, + {FNAME("ip6Address") SEQ, 0, 2, 2, DECODE | EXT, + offsetof(TransportAddress, ip6Address), + _TransportAddress_ip6Address}, + {FNAME("netBios") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL}, + {FNAME("nsap") OCTSTR, 5, 1, 0, SKIP, 0, NULL}, + {FNAME("nonStandardAddress") SEQ, 0, 2, 2, SKIP, 0, + _NonStandardParameter}, +}; + +static const struct field_t _AliasAddress[] = { /* CHOICE */ + {FNAME("dialedDigits") NUMDGT, 7, 1, 0, SKIP, 0, NULL}, + {FNAME("h323-ID") BMPSTR, BYTE, 1, 0, SKIP, 0, NULL}, + 
{FNAME("url-ID") IA5STR, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("transportID") CHOICE, 3, 7, 7, SKIP | EXT, 0, NULL}, + {FNAME("email-ID") IA5STR, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("partyNumber") CHOICE, 3, 5, 5, SKIP | EXT, 0, NULL}, + {FNAME("mobileUIM") CHOICE, 1, 2, 2, SKIP | EXT, 0, NULL}, +}; + +static const struct field_t _Setup_UUIE_sourceAddress[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, +}; + +static const struct field_t _VendorIdentifier[] = { /* SEQUENCE */ + {FNAME("vendor") SEQ, 0, 3, 3, SKIP | EXT, 0, _H221NonStandard}, + {FNAME("productId") OCTSTR, BYTE, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("versionId") OCTSTR, BYTE, 1, 0, SKIP | OPT, 0, NULL}, +}; + +static const struct field_t _GatekeeperInfo[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, +}; + +static const struct field_t _H310Caps[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H320Caps[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H321Caps[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H322Caps[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H323Caps[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H324Caps[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _VoiceCaps[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _T120OnlyCaps[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _SupportedProtocols[] = { /* CHOICE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP, 0, + _NonStandardParameter}, + {FNAME("h310") SEQ, 1, 1, 3, SKIP | EXT, 0, _H310Caps}, + {FNAME("h320") SEQ, 1, 1, 3, SKIP | EXT, 0, _H320Caps}, + {FNAME("h321") SEQ, 1, 1, 3, SKIP | EXT, 0, _H321Caps}, + {FNAME("h322") SEQ, 1, 1, 3, SKIP | EXT, 0, _H322Caps}, + 
{FNAME("h323") SEQ, 1, 1, 3, SKIP | EXT, 0, _H323Caps}, + {FNAME("h324") SEQ, 1, 1, 3, SKIP | EXT, 0, _H324Caps}, + {FNAME("voice") SEQ, 1, 1, 3, SKIP | EXT, 0, _VoiceCaps}, + {FNAME("t120-only") SEQ, 1, 1, 3, SKIP | EXT, 0, _T120OnlyCaps}, + {FNAME("nonStandardProtocol") SEQ, 2, 3, 3, SKIP | EXT, 0, NULL}, + {FNAME("t38FaxAnnexbOnly") SEQ, 2, 5, 5, SKIP | EXT, 0, NULL}, +}; + +static const struct field_t _GatewayInfo_protocol[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 4, 9, 11, SKIP | EXT, 0, _SupportedProtocols}, +}; + +static const struct field_t _GatewayInfo[] = { /* SEQUENCE */ + {FNAME("protocol") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _GatewayInfo_protocol}, + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, +}; + +static const struct field_t _McuInfo[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("protocol") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, +}; + +static const struct field_t _TerminalInfo[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, +}; + +static const struct field_t _EndpointType[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("vendor") SEQ, 2, 3, 3, SKIP | EXT | OPT, 0, + _VendorIdentifier}, + {FNAME("gatekeeper") SEQ, 1, 1, 1, SKIP | EXT | OPT, 0, + _GatekeeperInfo}, + {FNAME("gateway") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, _GatewayInfo}, + {FNAME("mcu") SEQ, 1, 1, 2, SKIP | EXT | OPT, 0, _McuInfo}, + {FNAME("terminal") SEQ, 1, 1, 1, SKIP | EXT | OPT, 0, _TerminalInfo}, + {FNAME("mc") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("undefinedNode") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("set") BITSTR, FIXD, 32, 0, SKIP | OPT, 0, NULL}, + {FNAME("supportedTunnelledProtocols") SEQOF, SEMI, 0, 0, SKIP | OPT, + 0, NULL}, +}; + +static const struct field_t _Setup_UUIE_destinationAddress[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, +}; + +static const struct field_t _Setup_UUIE_destExtraCallInfo[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, +}; + +static const struct field_t _Setup_UUIE_destExtraCRV[] = { /* SEQUENCE OF */ + {FNAME("item") INT, WORD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _Setup_UUIE_conferenceGoal[] = { /* CHOICE */ + {FNAME("create") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("join") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("invite") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("capability-negotiation") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("callIndependentSupplementaryService") NUL, FIXD, 0, 0, SKIP, + 0, NULL}, +}; + +static const struct field_t _Q954Details[] = { /* SEQUENCE */ + {FNAME("conferenceCalling") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("threePartyService") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _QseriesOptions[] = { /* SEQUENCE */ + {FNAME("q932Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("q951Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("q952Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("q953Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("q955Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("q956Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("q957Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("q954Info") SEQ, 0, 2, 2, SKIP | EXT, 0, _Q954Details}, +}; + +static const struct field_t _CallType[] = { /* CHOICE */ + {FNAME("pointToPoint") NUL, FIXD, 0, 0, SKIP, 0, NULL}, 
+ {FNAME("oneToN") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("nToOne") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("nToN") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H245_NonStandardIdentifier_h221NonStandard[] = { /* SEQUENCE */ + {FNAME("t35CountryCode") INT, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("t35Extension") INT, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("manufacturerCode") INT, WORD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H245_NonStandardIdentifier[] = { /* CHOICE */ + {FNAME("object") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("h221NonStandard") SEQ, 0, 3, 3, SKIP, 0, + _H245_NonStandardIdentifier_h221NonStandard}, +}; + +static const struct field_t _H245_NonStandardParameter[] = { /* SEQUENCE */ + {FNAME("nonStandardIdentifier") CHOICE, 1, 2, 2, SKIP, 0, + _H245_NonStandardIdentifier}, + {FNAME("data") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H261VideoCapability[] = { /* SEQUENCE */ + {FNAME("qcifMPI") INT, 2, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("cifMPI") INT, 2, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("temporalSpatialTradeOffCapability") BOOL, FIXD, 0, 0, SKIP, 0, + NULL}, + {FNAME("maxBitRate") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("stillImageTransmission") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("videoBadMBsCap") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H262VideoCapability[] = { /* SEQUENCE */ + {FNAME("profileAndLevel-SPatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("profileAndLevel-MPatLL") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("profileAndLevel-MPatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("profileAndLevel-MPatH-14") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("profileAndLevel-MPatHL") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("profileAndLevel-SNRatLL") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("profileAndLevel-SNRatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("profileAndLevel-SpatialatH-14") BOOL, FIXD, 0, 0, SKIP, 0, + NULL}, + {FNAME("profileAndLevel-HPatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("profileAndLevel-HPatH-14") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("profileAndLevel-HPatHL") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("videoBitRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("vbvBufferSize") INT, CONS, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("samplesPerLine") INT, WORD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("linesPerFrame") INT, WORD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("framesPerSecond") INT, 4, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("luminanceSampleRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("videoBadMBsCap") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H263VideoCapability[] = { /* SEQUENCE */ + {FNAME("sqcifMPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("qcifMPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("cifMPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("cif4MPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("cif16MPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("maxBitRate") INT, CONS, 1, 0, SKIP, 0, NULL}, + {FNAME("unrestrictedVector") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("arithmeticCoding") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("advancedPrediction") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("pbFrames") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("temporalSpatialTradeOffCapability") BOOL, FIXD, 0, 0, SKIP, 0, + NULL}, + {FNAME("hrd-B") INT, CONS, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("bppMaxKb") INT, WORD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("slowSqcifMPI") INT, WORD, 
1, 0, SKIP | OPT, 0, NULL}, + {FNAME("slowQcifMPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("slowCifMPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("slowCif4MPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("slowCif16MPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("errorCompensation") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("enhancementLayerInfo") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("h263Options") SEQ, 5, 29, 31, SKIP | EXT | OPT, 0, NULL}, +}; + +static const struct field_t _IS11172VideoCapability[] = { /* SEQUENCE */ + {FNAME("constrainedBitstream") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("videoBitRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("vbvBufferSize") INT, CONS, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("samplesPerLine") INT, WORD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("linesPerFrame") INT, WORD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("pictureRate") INT, 4, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("luminanceSampleRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("videoBadMBsCap") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _VideoCapability[] = { /* CHOICE */ + {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, + _H245_NonStandardParameter}, + {FNAME("h261VideoCapability") SEQ, 2, 5, 6, SKIP | EXT, 0, + _H261VideoCapability}, + {FNAME("h262VideoCapability") SEQ, 6, 17, 18, SKIP | EXT, 0, + _H262VideoCapability}, + {FNAME("h263VideoCapability") SEQ, 7, 13, 21, SKIP | EXT, 0, + _H263VideoCapability}, + {FNAME("is11172VideoCapability") SEQ, 6, 7, 8, SKIP | EXT, 0, + _IS11172VideoCapability}, + {FNAME("genericVideoCapability") SEQ, 5, 6, 6, SKIP | EXT, 0, NULL}, +}; + +static const struct field_t _AudioCapability_g7231[] = { /* SEQUENCE */ + {FNAME("maxAl-sduAudioFrames") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("silenceSuppression") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _IS11172AudioCapability[] = { /* SEQUENCE */ + {FNAME("audioLayer1") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioLayer2") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioLayer3") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioSampling32k") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioSampling44k1") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioSampling48k") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("singleChannel") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("twoChannels") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("bitRate") INT, WORD, 1, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _IS13818AudioCapability[] = { /* SEQUENCE */ + {FNAME("audioLayer1") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioLayer2") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioLayer3") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioSampling16k") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioSampling22k05") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioSampling24k") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioSampling32k") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioSampling44k1") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioSampling48k") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("singleChannel") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("twoChannels") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("threeChannels2-1") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("threeChannels3-0") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("fourChannels2-0-2-0") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("fourChannels2-2") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("fourChannels3-1") BOOL, FIXD, 0, 0, SKIP, 0, 
NULL}, + {FNAME("fiveChannels3-0-2-0") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("fiveChannels3-2") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("lowFrequencyEnhancement") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("multilingual") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("bitRate") INT, WORD, 1, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _AudioCapability[] = { /* CHOICE */ + {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, + _H245_NonStandardParameter}, + {FNAME("g711Alaw64k") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("g711Alaw56k") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("g711Ulaw64k") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("g711Ulaw56k") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("g722-64k") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("g722-56k") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("g722-48k") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("g7231") SEQ, 0, 2, 2, SKIP, 0, _AudioCapability_g7231}, + {FNAME("g728") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("g729") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("g729AnnexA") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("is11172AudioCapability") SEQ, 0, 9, 9, SKIP | EXT, 0, + _IS11172AudioCapability}, + {FNAME("is13818AudioCapability") SEQ, 0, 21, 21, SKIP | EXT, 0, + _IS13818AudioCapability}, + {FNAME("g729wAnnexB") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("g729AnnexAwAnnexB") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("g7231AnnexCCapability") SEQ, 1, 3, 3, SKIP | EXT, 0, NULL}, + {FNAME("gsmFullRate") SEQ, 0, 3, 3, SKIP | EXT, 0, NULL}, + {FNAME("gsmHalfRate") SEQ, 0, 3, 3, SKIP | EXT, 0, NULL}, + {FNAME("gsmEnhancedFullRate") SEQ, 0, 3, 3, SKIP | EXT, 0, NULL}, + {FNAME("genericAudioCapability") SEQ, 5, 6, 6, SKIP | EXT, 0, NULL}, + {FNAME("g729Extensions") SEQ, 1, 8, 8, SKIP | EXT, 0, NULL}, +}; + +static const struct field_t _DataProtocolCapability[] = { /* CHOICE */ + {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, + _H245_NonStandardParameter}, + {FNAME("v14buffered") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("v42lapm") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("hdlcFrameTunnelling") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("h310SeparateVCStack") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("h310SingleVCStack") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("transparent") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("segmentationAndReassembly") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("hdlcFrameTunnelingwSAR") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("v120") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("separateLANStack") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("v76wCompression") CHOICE, 2, 3, 3, SKIP | EXT, 0, NULL}, + {FNAME("tcp") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("udp") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _T84Profile_t84Restricted[] = { /* SEQUENCE */ + {FNAME("qcif") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("cif") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("ccir601Seq") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("ccir601Prog") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("hdtvSeq") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("hdtvProg") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("g3FacsMH200x100") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("g3FacsMH200x200") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("g4FacsMMR200x100") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("g4FacsMMR200x200") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("jbig200x200Seq") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("jbig200x200Prog") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("jbig300x300Seq") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + 
{FNAME("jbig300x300Prog") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("digPhotoLow") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("digPhotoMedSeq") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("digPhotoMedProg") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("digPhotoHighSeq") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("digPhotoHighProg") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _T84Profile[] = { /* CHOICE */ + {FNAME("t84Unrestricted") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("t84Restricted") SEQ, 0, 19, 19, SKIP | EXT, 0, + _T84Profile_t84Restricted}, +}; + +static const struct field_t _DataApplicationCapability_application_t84[] = { /* SEQUENCE */ + {FNAME("t84Protocol") CHOICE, 3, 7, 14, SKIP | EXT, 0, + _DataProtocolCapability}, + {FNAME("t84Profile") CHOICE, 1, 2, 2, SKIP, 0, _T84Profile}, +}; + +static const struct field_t _DataApplicationCapability_application_nlpid[] = { /* SEQUENCE */ + {FNAME("nlpidProtocol") CHOICE, 3, 7, 14, SKIP | EXT, 0, + _DataProtocolCapability}, + {FNAME("nlpidData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _DataApplicationCapability_application[] = { /* CHOICE */ + {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, + _H245_NonStandardParameter}, + {FNAME("t120") CHOICE, 3, 7, 14, DECODE | EXT, + offsetof(DataApplicationCapability_application, t120), + _DataProtocolCapability}, + {FNAME("dsm-cc") CHOICE, 3, 7, 14, SKIP | EXT, 0, + _DataProtocolCapability}, + {FNAME("userData") CHOICE, 3, 7, 14, SKIP | EXT, 0, + _DataProtocolCapability}, + {FNAME("t84") SEQ, 0, 2, 2, SKIP, 0, + _DataApplicationCapability_application_t84}, + {FNAME("t434") CHOICE, 3, 7, 14, SKIP | EXT, 0, + _DataProtocolCapability}, + {FNAME("h224") CHOICE, 3, 7, 14, SKIP | EXT, 0, + _DataProtocolCapability}, + {FNAME("nlpid") SEQ, 0, 2, 2, SKIP, 0, + _DataApplicationCapability_application_nlpid}, + {FNAME("dsvdControl") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("h222DataPartitioning") CHOICE, 3, 7, 14, SKIP | EXT, 0, + _DataProtocolCapability}, + {FNAME("t30fax") CHOICE, 3, 7, 14, SKIP | EXT, 0, NULL}, + {FNAME("t140") CHOICE, 3, 7, 14, SKIP | EXT, 0, NULL}, + {FNAME("t38fax") SEQ, 0, 2, 2, SKIP, 0, NULL}, + {FNAME("genericDataCapability") SEQ, 5, 6, 6, SKIP | EXT, 0, NULL}, +}; + +static const struct field_t _DataApplicationCapability[] = { /* SEQUENCE */ + {FNAME("application") CHOICE, 4, 10, 14, DECODE | EXT, + offsetof(DataApplicationCapability, application), + _DataApplicationCapability_application}, + {FNAME("maxBitRate") INT, CONS, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _EncryptionMode[] = { /* CHOICE */ + {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, + _H245_NonStandardParameter}, + {FNAME("h233Encryption") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _DataType[] = { /* CHOICE */ + {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, + _H245_NonStandardParameter}, + {FNAME("nullData") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("videoData") CHOICE, 3, 5, 6, SKIP | EXT, 0, _VideoCapability}, + {FNAME("audioData") CHOICE, 4, 14, 22, SKIP | EXT, 0, + _AudioCapability}, + {FNAME("data") SEQ, 0, 2, 2, DECODE | EXT, offsetof(DataType, data), + _DataApplicationCapability}, + {FNAME("encryptionData") CHOICE, 1, 2, 2, SKIP | EXT, 0, + _EncryptionMode}, + {FNAME("h235Control") SEQ, 0, 2, 2, SKIP, 0, NULL}, + {FNAME("h235Media") SEQ, 0, 2, 2, SKIP | EXT, 0, NULL}, + {FNAME("multiplexedStream") SEQ, 0, 2, 2, SKIP | EXT, 0, NULL}, +}; + +static const struct field_t _H222LogicalChannelParameters[] = { /* SEQUENCE */ + 
{FNAME("resourceID") INT, WORD, 0, 0, SKIP, 0, NULL}, + {FNAME("subChannelID") INT, WORD, 0, 0, SKIP, 0, NULL}, + {FNAME("pcr-pid") INT, WORD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("programDescriptors") OCTSTR, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("streamDescriptors") OCTSTR, SEMI, 0, 0, SKIP | OPT, 0, NULL}, +}; + +static const struct field_t _H223LogicalChannelParameters_adaptationLayerType_al3[] = { /* SEQUENCE */ + {FNAME("controlFieldOctets") INT, 2, 0, 0, SKIP, 0, NULL}, + {FNAME("sendBufferSize") INT, CONS, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H223LogicalChannelParameters_adaptationLayerType[] = { /* CHOICE */ + {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, + _H245_NonStandardParameter}, + {FNAME("al1Framed") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("al1NotFramed") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("al2WithoutSequenceNumbers") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("al2WithSequenceNumbers") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("al3") SEQ, 0, 2, 2, SKIP, 0, + _H223LogicalChannelParameters_adaptationLayerType_al3}, + {FNAME("al1M") SEQ, 0, 7, 8, SKIP | EXT, 0, NULL}, + {FNAME("al2M") SEQ, 0, 2, 2, SKIP | EXT, 0, NULL}, + {FNAME("al3M") SEQ, 0, 5, 6, SKIP | EXT, 0, NULL}, +}; + +static const struct field_t _H223LogicalChannelParameters[] = { /* SEQUENCE */ + {FNAME("adaptationLayerType") CHOICE, 3, 6, 9, SKIP | EXT, 0, + _H223LogicalChannelParameters_adaptationLayerType}, + {FNAME("segmentableFlag") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CRCLength[] = { /* CHOICE */ + {FNAME("crc8bit") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("crc16bit") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("crc32bit") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _V76HDLCParameters[] = { /* SEQUENCE */ + {FNAME("crcLength") CHOICE, 2, 3, 3, SKIP | EXT, 0, _CRCLength}, + {FNAME("n401") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("loopbackTestProcedure") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _V76LogicalChannelParameters_suspendResume[] = { /* CHOICE */ + {FNAME("noSuspendResume") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("suspendResumewAddress") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("suspendResumewoAddress") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _V76LogicalChannelParameters_mode_eRM_recovery[] = { /* CHOICE */ + {FNAME("rej") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("sREJ") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("mSREJ") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _V76LogicalChannelParameters_mode_eRM[] = { /* SEQUENCE */ + {FNAME("windowSize") INT, 7, 1, 0, SKIP, 0, NULL}, + {FNAME("recovery") CHOICE, 2, 3, 3, SKIP | EXT, 0, + _V76LogicalChannelParameters_mode_eRM_recovery}, +}; + +static const struct field_t _V76LogicalChannelParameters_mode[] = { /* CHOICE */ + {FNAME("eRM") SEQ, 0, 2, 2, SKIP | EXT, 0, + _V76LogicalChannelParameters_mode_eRM}, + {FNAME("uNERM") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _V75Parameters[] = { /* SEQUENCE */ + {FNAME("audioHeaderPresent") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _V76LogicalChannelParameters[] = { /* SEQUENCE */ + {FNAME("hdlcParameters") SEQ, 0, 3, 3, SKIP | EXT, 0, + _V76HDLCParameters}, + {FNAME("suspendResume") CHOICE, 2, 3, 3, SKIP | EXT, 0, + _V76LogicalChannelParameters_suspendResume}, + {FNAME("uIH") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("mode") CHOICE, 1, 2, 2, SKIP | EXT, 0, + _V76LogicalChannelParameters_mode}, + 
{FNAME("v75Parameters") SEQ, 0, 1, 1, SKIP | EXT, 0, _V75Parameters}, +}; + +static const struct field_t _H2250LogicalChannelParameters_nonStandard[] = { /* SEQUENCE OF */ + {FNAME("item") SEQ, 0, 2, 2, SKIP, 0, _H245_NonStandardParameter}, +}; + +static const struct field_t _UnicastAddress_iPAddress[] = { /* SEQUENCE */ + {FNAME("network") OCTSTR, FIXD, 4, 0, DECODE, + offsetof(UnicastAddress_iPAddress, network), NULL}, + {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _UnicastAddress_iPXAddress[] = { /* SEQUENCE */ + {FNAME("node") OCTSTR, FIXD, 6, 0, SKIP, 0, NULL}, + {FNAME("netnum") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL}, + {FNAME("tsapIdentifier") OCTSTR, FIXD, 2, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _UnicastAddress_iP6Address[] = { /* SEQUENCE */ + {FNAME("network") OCTSTR, FIXD, 16, 0, DECODE, + offsetof(UnicastAddress_iP6Address, network), NULL}, + {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _UnicastAddress_iPSourceRouteAddress_routing[] = { /* CHOICE */ + {FNAME("strict") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("loose") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _UnicastAddress_iPSourceRouteAddress_route[] = { /* SEQUENCE OF */ + {FNAME("item") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _UnicastAddress_iPSourceRouteAddress[] = { /* SEQUENCE */ + {FNAME("routing") CHOICE, 1, 2, 2, SKIP, 0, + _UnicastAddress_iPSourceRouteAddress_routing}, + {FNAME("network") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL}, + {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL}, + {FNAME("route") SEQOF, SEMI, 0, 0, SKIP, 0, + _UnicastAddress_iPSourceRouteAddress_route}, +}; + +static const struct field_t _UnicastAddress[] = { /* CHOICE */ + {FNAME("iPAddress") SEQ, 0, 2, 2, DECODE | EXT, + offsetof(UnicastAddress, iPAddress), _UnicastAddress_iPAddress}, + {FNAME("iPXAddress") SEQ, 0, 3, 3, SKIP | EXT, 0, + _UnicastAddress_iPXAddress}, + {FNAME("iP6Address") SEQ, 0, 2, 2, DECODE | EXT, + offsetof(UnicastAddress, iP6Address), _UnicastAddress_iP6Address}, + {FNAME("netBios") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL}, + {FNAME("iPSourceRouteAddress") SEQ, 0, 4, 4, SKIP | EXT, 0, + _UnicastAddress_iPSourceRouteAddress}, + {FNAME("nsap") OCTSTR, 5, 1, 0, SKIP, 0, NULL}, + {FNAME("nonStandardAddress") SEQ, 0, 2, 2, SKIP, 0, NULL}, +}; + +static const struct field_t _MulticastAddress_iPAddress[] = { /* SEQUENCE */ + {FNAME("network") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL}, + {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _MulticastAddress_iP6Address[] = { /* SEQUENCE */ + {FNAME("network") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL}, + {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _MulticastAddress[] = { /* CHOICE */ + {FNAME("iPAddress") SEQ, 0, 2, 2, SKIP | EXT, 0, + _MulticastAddress_iPAddress}, + {FNAME("iP6Address") SEQ, 0, 2, 2, SKIP | EXT, 0, + _MulticastAddress_iP6Address}, + {FNAME("nsap") OCTSTR, 5, 1, 0, SKIP, 0, NULL}, + {FNAME("nonStandardAddress") SEQ, 0, 2, 2, SKIP, 0, NULL}, +}; + +static const struct field_t _H245_TransportAddress[] = { /* CHOICE */ + {FNAME("unicastAddress") CHOICE, 3, 5, 7, DECODE | EXT, + offsetof(H245_TransportAddress, unicastAddress), _UnicastAddress}, + {FNAME("multicastAddress") CHOICE, 1, 2, 4, SKIP | EXT, 0, + _MulticastAddress}, +}; + +static const struct field_t _H2250LogicalChannelParameters[] = { /* SEQUENCE */ + {FNAME("nonStandard") 
SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _H2250LogicalChannelParameters_nonStandard}, + {FNAME("sessionID") INT, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("associatedSessionID") INT, 8, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("mediaChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT, + offsetof(H2250LogicalChannelParameters, mediaChannel), + _H245_TransportAddress}, + {FNAME("mediaGuaranteedDelivery") BOOL, FIXD, 0, 0, SKIP | OPT, 0, + NULL}, + {FNAME("mediaControlChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT, + offsetof(H2250LogicalChannelParameters, mediaControlChannel), + _H245_TransportAddress}, + {FNAME("mediaControlGuaranteedDelivery") BOOL, FIXD, 0, 0, STOP | OPT, + 0, NULL}, + {FNAME("silenceSuppression") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("destination") SEQ, 0, 2, 2, STOP | EXT | OPT, 0, NULL}, + {FNAME("dynamicRTPPayloadType") INT, 5, 96, 0, STOP | OPT, 0, NULL}, + {FNAME("mediaPacketization") CHOICE, 0, 1, 2, STOP | EXT | OPT, 0, + NULL}, + {FNAME("transportCapability") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, + NULL}, + {FNAME("redundancyEncoding") SEQ, 1, 2, 2, STOP | EXT | OPT, 0, NULL}, + {FNAME("source") SEQ, 0, 2, 2, SKIP | EXT | OPT, 0, NULL}, +}; + +static const struct field_t _OpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters[] = { /* CHOICE */ + {FNAME("h222LogicalChannelParameters") SEQ, 3, 5, 5, SKIP | EXT, 0, + _H222LogicalChannelParameters}, + {FNAME("h223LogicalChannelParameters") SEQ, 0, 2, 2, SKIP | EXT, 0, + _H223LogicalChannelParameters}, + {FNAME("v76LogicalChannelParameters") SEQ, 0, 5, 5, SKIP | EXT, 0, + _V76LogicalChannelParameters}, + {FNAME("h2250LogicalChannelParameters") SEQ, 10, 11, 14, DECODE | EXT, + offsetof + (OpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters, + h2250LogicalChannelParameters), _H2250LogicalChannelParameters}, + {FNAME("none") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _OpenLogicalChannel_forwardLogicalChannelParameters[] = { /* SEQUENCE */ + {FNAME("portNumber") INT, WORD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("dataType") CHOICE, 3, 6, 9, DECODE | EXT, + offsetof(OpenLogicalChannel_forwardLogicalChannelParameters, + dataType), _DataType}, + {FNAME("multiplexParameters") CHOICE, 2, 3, 5, DECODE | EXT, + offsetof(OpenLogicalChannel_forwardLogicalChannelParameters, + multiplexParameters), + _OpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters}, + {FNAME("forwardLogicalChannelDependency") INT, WORD, 1, 0, SKIP | OPT, + 0, NULL}, + {FNAME("replacementFor") INT, WORD, 1, 0, SKIP | OPT, 0, NULL}, +}; + +static const struct field_t _OpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters[] = { /* CHOICE */ + {FNAME("h223LogicalChannelParameters") SEQ, 0, 2, 2, SKIP | EXT, 0, + _H223LogicalChannelParameters}, + {FNAME("v76LogicalChannelParameters") SEQ, 0, 5, 5, SKIP | EXT, 0, + _V76LogicalChannelParameters}, + {FNAME("h2250LogicalChannelParameters") SEQ, 10, 11, 14, DECODE | EXT, + offsetof + (OpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters, + h2250LogicalChannelParameters), _H2250LogicalChannelParameters}, +}; + +static const struct field_t _OpenLogicalChannel_reverseLogicalChannelParameters[] = { /* SEQUENCE */ + {FNAME("dataType") CHOICE, 3, 6, 9, SKIP | EXT, 0, _DataType}, + {FNAME("multiplexParameters") CHOICE, 1, 2, 3, DECODE | EXT | OPT, + offsetof(OpenLogicalChannel_reverseLogicalChannelParameters, + multiplexParameters), + _OpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters}, + 
{FNAME("reverseLogicalChannelDependency") INT, WORD, 1, 0, SKIP | OPT, + 0, NULL}, + {FNAME("replacementFor") INT, WORD, 1, 0, SKIP | OPT, 0, NULL}, +}; + +static const struct field_t _NetworkAccessParameters_distribution[] = { /* CHOICE */ + {FNAME("unicast") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("multicast") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _Q2931Address_address[] = { /* CHOICE */ + {FNAME("internationalNumber") NUMSTR, 4, 1, 0, SKIP, 0, NULL}, + {FNAME("nsapAddress") OCTSTR, 5, 1, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _Q2931Address[] = { /* SEQUENCE */ + {FNAME("address") CHOICE, 1, 2, 2, SKIP | EXT, 0, + _Q2931Address_address}, + {FNAME("subaddress") OCTSTR, 5, 1, 0, SKIP | OPT, 0, NULL}, +}; + +static const struct field_t _NetworkAccessParameters_networkAddress[] = { /* CHOICE */ + {FNAME("q2931Address") SEQ, 1, 2, 2, SKIP | EXT, 0, _Q2931Address}, + {FNAME("e164Address") NUMDGT, 7, 1, 0, SKIP, 0, NULL}, + {FNAME("localAreaAddress") CHOICE, 1, 2, 2, DECODE | EXT, + offsetof(NetworkAccessParameters_networkAddress, localAreaAddress), + _H245_TransportAddress}, +}; + +static const struct field_t _NetworkAccessParameters[] = { /* SEQUENCE */ + {FNAME("distribution") CHOICE, 1, 2, 2, SKIP | EXT | OPT, 0, + _NetworkAccessParameters_distribution}, + {FNAME("networkAddress") CHOICE, 2, 3, 3, DECODE | EXT, + offsetof(NetworkAccessParameters, networkAddress), + _NetworkAccessParameters_networkAddress}, + {FNAME("associateConference") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("externalReference") OCTSTR, 8, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("t120SetupProcedure") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0, + NULL}, +}; + +static const struct field_t _OpenLogicalChannel[] = { /* SEQUENCE */ + {FNAME("forwardLogicalChannelNumber") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("forwardLogicalChannelParameters") SEQ, 1, 3, 5, DECODE | EXT, + offsetof(OpenLogicalChannel, forwardLogicalChannelParameters), + _OpenLogicalChannel_forwardLogicalChannelParameters}, + {FNAME("reverseLogicalChannelParameters") SEQ, 1, 2, 4, + DECODE | EXT | OPT, offsetof(OpenLogicalChannel, + reverseLogicalChannelParameters), + _OpenLogicalChannel_reverseLogicalChannelParameters}, + {FNAME("separateStack") SEQ, 2, 4, 5, DECODE | EXT | OPT, + offsetof(OpenLogicalChannel, separateStack), + _NetworkAccessParameters}, + {FNAME("encryptionSync") SEQ, 2, 4, 4, STOP | EXT | OPT, 0, NULL}, +}; + +static const struct field_t _Setup_UUIE_fastStart[] = { /* SEQUENCE OF */ + {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT, + sizeof(OpenLogicalChannel), _OpenLogicalChannel} + , +}; + +static const struct field_t _Setup_UUIE[] = { /* SEQUENCE */ + {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(Setup_UUIE, h245Address), _TransportAddress}, + {FNAME("sourceAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _Setup_UUIE_sourceAddress}, + {FNAME("sourceInfo") SEQ, 6, 8, 10, SKIP | EXT, 0, _EndpointType}, + {FNAME("destinationAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _Setup_UUIE_destinationAddress}, + {FNAME("destCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(Setup_UUIE, destCallSignalAddress), _TransportAddress}, + {FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _Setup_UUIE_destExtraCallInfo}, + {FNAME("destExtraCRV") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _Setup_UUIE_destExtraCRV}, + {FNAME("activeMC") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("conferenceID") OCTSTR, FIXD, 
16, 0, SKIP, 0, NULL}, + {FNAME("conferenceGoal") CHOICE, 2, 3, 5, SKIP | EXT, 0, + _Setup_UUIE_conferenceGoal}, + {FNAME("callServices") SEQ, 0, 8, 8, SKIP | EXT | OPT, 0, + _QseriesOptions}, + {FNAME("callType") CHOICE, 2, 4, 4, SKIP | EXT, 0, _CallType}, + {FNAME("sourceCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(Setup_UUIE, sourceCallSignalAddress), _TransportAddress}, + {FNAME("remoteExtensionAddress") CHOICE, 1, 2, 7, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL}, + {FNAME("h245SecurityCapability") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT, + offsetof(Setup_UUIE, fastStart), _Setup_UUIE_fastStart}, + {FNAME("mediaWaitForConnect") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("canOverlapSend") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL}, + {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("connectionParameters") SEQ, 0, 3, 3, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("language") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("symmetricOperationRequired") NUL, FIXD, 0, 0, SKIP | OPT, 0, + NULL}, + {FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL}, + {FNAME("circuitInfo") SEQ, 3, 3, 3, SKIP | EXT | OPT, 0, NULL}, + {FNAME("desiredProtocols") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("neededFeatures") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("desiredFeatures") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("supportedFeatures") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("parallelH245Control") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("additionalSourceAddresses") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + NULL}, +}; + +static const struct field_t _CallProceeding_UUIE_fastStart[] = { /* SEQUENCE OF */ + {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT, + sizeof(OpenLogicalChannel), _OpenLogicalChannel} + , +}; + +static const struct field_t _CallProceeding_UUIE[] = { /* SEQUENCE */ + {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0, + _EndpointType}, + {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(CallProceeding_UUIE, h245Address), _TransportAddress}, + {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL}, + {FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT, + offsetof(CallProceeding_UUIE, fastStart), + _CallProceeding_UUIE_fastStart}, + {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL}, +}; + +static const struct field_t _Connect_UUIE_fastStart[] = { /* SEQUENCE OF */ + {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT, + 
sizeof(OpenLogicalChannel), _OpenLogicalChannel} + , +}; + +static const struct field_t _Connect_UUIE[] = { /* SEQUENCE */ + {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(Connect_UUIE, h245Address), _TransportAddress}, + {FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0, + _EndpointType}, + {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL}, + {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL}, + {FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT, + offsetof(Connect_UUIE, fastStart), _Connect_UUIE_fastStart}, + {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("language") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("connectedAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL}, +}; + +static const struct field_t _Alerting_UUIE_fastStart[] = { /* SEQUENCE OF */ + {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT, + sizeof(OpenLogicalChannel), _OpenLogicalChannel} + , +}; + +static const struct field_t _Alerting_UUIE[] = { /* SEQUENCE */ + {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0, + _EndpointType}, + {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(Alerting_UUIE, h245Address), _TransportAddress}, + {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL}, + {FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT, + offsetof(Alerting_UUIE, fastStart), _Alerting_UUIE_fastStart}, + {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("alertingAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL}, +}; + +static const struct field_t _Information_UUIE[] = { /* SEQUENCE */ + {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("fastStart") SEQOF, SEMI, 0, 30, SKIP | OPT, 0, NULL}, + {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("circuitInfo") SEQ, 3, 3, 3, SKIP | EXT | OPT, 0, 
NULL}, +}; + +static const struct field_t _ReleaseCompleteReason[] = { /* CHOICE */ + {FNAME("noBandwidth") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("gatekeeperResources") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("unreachableDestination") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("destinationRejection") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("invalidRevision") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("noPermission") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("unreachableGatekeeper") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("gatewayResources") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("badFormatAddress") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("adaptiveBusy") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("inConf") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("undefinedReason") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("facilityCallDeflection") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("securityDenied") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("calledPartyNotRegistered") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("callerNotRegistered") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("newConnectionNeeded") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("nonStandardReason") SEQ, 0, 2, 2, SKIP, 0, NULL}, + {FNAME("replaceWithConferenceInvite") OCTSTR, FIXD, 16, 0, SKIP, 0, + NULL}, + {FNAME("genericDataReason") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("neededFeatureNotSupported") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("tunnelledSignallingRejected") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _ReleaseComplete_UUIE[] = { /* SEQUENCE */ + {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("reason") CHOICE, 4, 12, 22, SKIP | EXT | OPT, 0, + _ReleaseCompleteReason}, + {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("busyAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL}, + {FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL}, +}; + +static const struct field_t _Facility_UUIE_alternativeAliasAddress[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, +}; + +static const struct field_t _FacilityReason[] = { /* CHOICE */ + {FNAME("routeCallToGatekeeper") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("callForwarded") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("routeCallToMC") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("undefinedReason") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("conferenceListChoice") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("startH245") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("noH245") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("newTokens") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("featureSetUpdate") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("forwardedElements") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("transportedInformation") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _Facility_UUIE_fastStart[] = { /* SEQUENCE OF */ + {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT, + sizeof(OpenLogicalChannel), _OpenLogicalChannel} + , +}; + +static const struct field_t _Facility_UUIE[] = { /* SEQUENCE */ + {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, 
NULL}, + {FNAME("alternativeAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(Facility_UUIE, alternativeAddress), _TransportAddress}, + {FNAME("alternativeAliasAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _Facility_UUIE_alternativeAliasAddress}, + {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL}, + {FNAME("reason") CHOICE, 2, 4, 11, DECODE | EXT, + offsetof(Facility_UUIE, reason), _FacilityReason}, + {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL}, + {FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("remoteExtensionAddress") CHOICE, 1, 2, 7, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("conferences") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(Facility_UUIE, h245Address), _TransportAddress}, + {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT, + offsetof(Facility_UUIE, fastStart), _Facility_UUIE_fastStart}, + {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("circuitInfo") SEQ, 3, 3, 3, SKIP | EXT | OPT, 0, NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL}, + {FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT | OPT, 0, NULL}, + {FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0, + NULL}, +}; + +static const struct field_t _CallIdentifier[] = { /* SEQUENCE */ + {FNAME("guid") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _SecurityServiceMode[] = { /* CHOICE */ + {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, _NonStandardParameter}, + {FNAME("none") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("default") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _SecurityCapabilities[] = { /* SEQUENCE */ + {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("encryption") CHOICE, 2, 3, 3, SKIP | EXT, 0, + _SecurityServiceMode}, + {FNAME("authenticaton") CHOICE, 2, 3, 3, SKIP | EXT, 0, + _SecurityServiceMode}, + {FNAME("integrity") CHOICE, 2, 3, 3, SKIP | EXT, 0, + _SecurityServiceMode}, +}; + +static const struct field_t _H245Security[] = { /* CHOICE */ + {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, _NonStandardParameter}, + {FNAME("noSecurity") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("tls") SEQ, 1, 4, 4, SKIP | EXT, 0, _SecurityCapabilities}, + {FNAME("ipsec") SEQ, 1, 4, 4, SKIP | EXT, 0, _SecurityCapabilities}, +}; + +static const struct field_t _DHset[] = { /* SEQUENCE */ + {FNAME("halfkey") BITSTR, WORD, 0, 0, SKIP, 0, NULL}, + {FNAME("modSize") BITSTR, WORD, 0, 0, SKIP, 0, NULL}, + {FNAME("generator") BITSTR, WORD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _TypedCertificate[] = { /* SEQUENCE */ + {FNAME("type") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("certificate") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H235_NonStandardParameter[] = { /* SEQUENCE */ + {FNAME("nonStandardIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("data") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _ClearToken[] = { /* SEQUENCE */ + {FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("timeStamp") INT, CONS, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("password") 
BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("dhkey") SEQ, 0, 3, 3, SKIP | EXT | OPT, 0, _DHset}, + {FNAME("challenge") OCTSTR, 7, 8, 0, SKIP | OPT, 0, NULL}, + {FNAME("random") INT, UNCO, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("certificate") SEQ, 0, 2, 2, SKIP | EXT | OPT, 0, + _TypedCertificate}, + {FNAME("generalID") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP | OPT, 0, + _H235_NonStandardParameter}, + {FNAME("eckasdhkey") CHOICE, 1, 2, 2, SKIP | EXT | OPT, 0, NULL}, + {FNAME("sendersID") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL}, +}; + +static const struct field_t _Progress_UUIE_tokens[] = { /* SEQUENCE OF */ + {FNAME("item") SEQ, 8, 9, 11, SKIP | EXT, 0, _ClearToken}, +}; + +static const struct field_t _Params[] = { /* SEQUENCE */ + {FNAME("ranInt") INT, UNCO, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("iv8") OCTSTR, FIXD, 8, 0, SKIP | OPT, 0, NULL}, + {FNAME("iv16") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL}, +}; + +static const struct field_t _CryptoH323Token_cryptoEPPwdHash_token[] = { /* SEQUENCE */ + {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, + {FNAME("hash") BITSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CryptoH323Token_cryptoEPPwdHash[] = { /* SEQUENCE */ + {FNAME("alias") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, + {FNAME("timeStamp") INT, CONS, 1, 0, SKIP, 0, NULL}, + {FNAME("token") SEQ, 0, 3, 3, SKIP, 0, + _CryptoH323Token_cryptoEPPwdHash_token}, +}; + +static const struct field_t _CryptoH323Token_cryptoGKPwdHash_token[] = { /* SEQUENCE */ + {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, + {FNAME("hash") BITSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CryptoH323Token_cryptoGKPwdHash[] = { /* SEQUENCE */ + {FNAME("gatekeeperId") BMPSTR, 7, 1, 0, SKIP, 0, NULL}, + {FNAME("timeStamp") INT, CONS, 1, 0, SKIP, 0, NULL}, + {FNAME("token") SEQ, 0, 3, 3, SKIP, 0, + _CryptoH323Token_cryptoGKPwdHash_token}, +}; + +static const struct field_t _CryptoH323Token_cryptoEPPwdEncr[] = { /* SEQUENCE */ + {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, + {FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CryptoH323Token_cryptoGKPwdEncr[] = { /* SEQUENCE */ + {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, + {FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CryptoH323Token_cryptoEPCert[] = { /* SEQUENCE */ + {FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL}, + {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, + {FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CryptoH323Token_cryptoGKCert[] = { /* SEQUENCE */ + {FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL}, + {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, + {FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CryptoH323Token_cryptoFastStart[] = { /* SEQUENCE */ + {FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL}, + {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, + {FNAME("signature") BITSTR, 
SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CryptoToken_cryptoEncryptedToken_token[] = { /* SEQUENCE */ + {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, + {FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CryptoToken_cryptoEncryptedToken[] = { /* SEQUENCE */ + {FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("token") SEQ, 0, 3, 3, SKIP, 0, + _CryptoToken_cryptoEncryptedToken_token}, +}; + +static const struct field_t _CryptoToken_cryptoSignedToken_token[] = { /* SEQUENCE */ + {FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL}, + {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, + {FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CryptoToken_cryptoSignedToken[] = { /* SEQUENCE */ + {FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("token") SEQ, 0, 4, 4, SKIP, 0, + _CryptoToken_cryptoSignedToken_token}, +}; + +static const struct field_t _CryptoToken_cryptoHashedToken_token[] = { /* SEQUENCE */ + {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, + {FNAME("hash") BITSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CryptoToken_cryptoHashedToken[] = { /* SEQUENCE */ + {FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("hashedVals") SEQ, 8, 9, 11, SKIP | EXT, 0, _ClearToken}, + {FNAME("token") SEQ, 0, 3, 3, SKIP, 0, + _CryptoToken_cryptoHashedToken_token}, +}; + +static const struct field_t _CryptoToken_cryptoPwdEncr[] = { /* SEQUENCE */ + {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, + {FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CryptoToken[] = { /* CHOICE */ + {FNAME("cryptoEncryptedToken") SEQ, 0, 2, 2, SKIP, 0, + _CryptoToken_cryptoEncryptedToken}, + {FNAME("cryptoSignedToken") SEQ, 0, 2, 2, SKIP, 0, + _CryptoToken_cryptoSignedToken}, + {FNAME("cryptoHashedToken") SEQ, 0, 3, 3, SKIP, 0, + _CryptoToken_cryptoHashedToken}, + {FNAME("cryptoPwdEncr") SEQ, 0, 3, 3, SKIP, 0, + _CryptoToken_cryptoPwdEncr}, +}; + +static const struct field_t _CryptoH323Token[] = { /* CHOICE */ + {FNAME("cryptoEPPwdHash") SEQ, 0, 3, 3, SKIP, 0, + _CryptoH323Token_cryptoEPPwdHash}, + {FNAME("cryptoGKPwdHash") SEQ, 0, 3, 3, SKIP, 0, + _CryptoH323Token_cryptoGKPwdHash}, + {FNAME("cryptoEPPwdEncr") SEQ, 0, 3, 3, SKIP, 0, + _CryptoH323Token_cryptoEPPwdEncr}, + {FNAME("cryptoGKPwdEncr") SEQ, 0, 3, 3, SKIP, 0, + _CryptoH323Token_cryptoGKPwdEncr}, + {FNAME("cryptoEPCert") SEQ, 0, 4, 4, SKIP, 0, + _CryptoH323Token_cryptoEPCert}, + {FNAME("cryptoGKCert") SEQ, 0, 4, 4, SKIP, 0, + _CryptoH323Token_cryptoGKCert}, + {FNAME("cryptoFastStart") SEQ, 0, 4, 4, SKIP, 0, + _CryptoH323Token_cryptoFastStart}, + {FNAME("nestedcryptoToken") CHOICE, 2, 4, 4, SKIP | EXT, 0, + _CryptoToken}, +}; + +static const struct field_t _Progress_UUIE_cryptoTokens[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 3, 8, 8, SKIP | EXT, 0, _CryptoH323Token}, +}; + +static const struct field_t _Progress_UUIE_fastStart[] = { /* SEQUENCE OF */ + {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT, + sizeof(OpenLogicalChannel), _OpenLogicalChannel} + , +}; + +static const struct field_t _Progress_UUIE[] = { /* SEQUENCE */ + {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + 
{FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0, + _EndpointType}, + {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(Progress_UUIE, h245Address), _TransportAddress}, + {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, + _CallIdentifier}, + {FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0, + _H245Security}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _Progress_UUIE_tokens}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _Progress_UUIE_cryptoTokens}, + {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT, + offsetof(Progress_UUIE, fastStart), _Progress_UUIE_fastStart}, + {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL}, +}; + +static const struct field_t _H323_UU_PDU_h323_message_body[] = { /* CHOICE */ + {FNAME("setup") SEQ, 7, 13, 39, DECODE | EXT, + offsetof(H323_UU_PDU_h323_message_body, setup), _Setup_UUIE}, + {FNAME("callProceeding") SEQ, 1, 3, 12, DECODE | EXT, + offsetof(H323_UU_PDU_h323_message_body, callProceeding), + _CallProceeding_UUIE}, + {FNAME("connect") SEQ, 1, 4, 19, DECODE | EXT, + offsetof(H323_UU_PDU_h323_message_body, connect), _Connect_UUIE}, + {FNAME("alerting") SEQ, 1, 3, 17, DECODE | EXT, + offsetof(H323_UU_PDU_h323_message_body, alerting), _Alerting_UUIE}, + {FNAME("information") SEQ, 0, 1, 7, SKIP | EXT, 0, _Information_UUIE}, + {FNAME("releaseComplete") SEQ, 1, 2, 11, SKIP | EXT, 0, + _ReleaseComplete_UUIE}, + {FNAME("facility") SEQ, 3, 5, 21, DECODE | EXT, + offsetof(H323_UU_PDU_h323_message_body, facility), _Facility_UUIE}, + {FNAME("progress") SEQ, 5, 8, 11, DECODE | EXT, + offsetof(H323_UU_PDU_h323_message_body, progress), _Progress_UUIE}, + {FNAME("empty") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("status") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL}, + {FNAME("statusInquiry") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL}, + {FNAME("setupAcknowledge") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL}, + {FNAME("notify") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL}, +}; + +static const struct field_t _RequestMessage[] = { /* CHOICE */ + {FNAME("nonStandard") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, + {FNAME("masterSlaveDetermination") SEQ, 0, 2, 2, STOP | EXT, 0, NULL}, + {FNAME("terminalCapabilitySet") SEQ, 3, 5, 5, STOP | EXT, 0, NULL}, + {FNAME("openLogicalChannel") SEQ, 1, 3, 5, DECODE | EXT, + offsetof(RequestMessage, openLogicalChannel), _OpenLogicalChannel}, + {FNAME("closeLogicalChannel") SEQ, 0, 2, 3, STOP | EXT, 0, NULL}, + {FNAME("requestChannelClose") SEQ, 0, 1, 3, STOP | EXT, 0, NULL}, + {FNAME("multiplexEntrySend") SEQ, 0, 2, 2, STOP | EXT, 0, NULL}, + {FNAME("requestMultiplexEntry") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, + {FNAME("requestMode") SEQ, 0, 2, 2, STOP | EXT, 0, NULL}, + {FNAME("roundTripDelayRequest") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, + {FNAME("maintenanceLoopRequest") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, + {FNAME("communicationModeRequest") SEQ, 0, 0, 0, STOP | EXT, 0, NULL}, + {FNAME("conferenceRequest") CHOICE, 3, 8, 16, STOP | EXT, 0, NULL}, + {FNAME("multilinkRequest") CHOICE, 3, 5, 5, STOP | EXT, 0, NULL}, + {FNAME("logicalChannelRateRequest") SEQ, 0, 3, 3, STOP | EXT, 0, + NULL}, +}; + +static const struct field_t _OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters[] = { /* CHOICE */ + {FNAME("h222LogicalChannelParameters") SEQ, 3, 5, 5, SKIP | EXT, 0, + _H222LogicalChannelParameters}, + {FNAME("h2250LogicalChannelParameters") SEQ, 10, 11, 
14, DECODE | EXT, + offsetof + (OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters, + h2250LogicalChannelParameters), _H2250LogicalChannelParameters}, +}; + +static const struct field_t _OpenLogicalChannelAck_reverseLogicalChannelParameters[] = { /* SEQUENCE */ + {FNAME("reverseLogicalChannelNumber") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("portNumber") INT, WORD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("multiplexParameters") CHOICE, 0, 1, 2, DECODE | EXT | OPT, + offsetof(OpenLogicalChannelAck_reverseLogicalChannelParameters, + multiplexParameters), + _OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters}, + {FNAME("replacementFor") INT, WORD, 1, 0, SKIP | OPT, 0, NULL}, +}; + +static const struct field_t _H2250LogicalChannelAckParameters_nonStandard[] = { /* SEQUENCE OF */ + {FNAME("item") SEQ, 0, 2, 2, SKIP, 0, _H245_NonStandardParameter}, +}; + +static const struct field_t _H2250LogicalChannelAckParameters[] = { /* SEQUENCE */ + {FNAME("nonStandard") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _H2250LogicalChannelAckParameters_nonStandard}, + {FNAME("sessionID") INT, 8, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("mediaChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT, + offsetof(H2250LogicalChannelAckParameters, mediaChannel), + _H245_TransportAddress}, + {FNAME("mediaControlChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT, + offsetof(H2250LogicalChannelAckParameters, mediaControlChannel), + _H245_TransportAddress}, + {FNAME("dynamicRTPPayloadType") INT, 5, 96, 0, SKIP | OPT, 0, NULL}, + {FNAME("flowControlToZero") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("portNumber") INT, WORD, 0, 0, SKIP | OPT, 0, NULL}, +}; + +static const struct field_t _OpenLogicalChannelAck_forwardMultiplexAckParameters[] = { /* CHOICE */ + {FNAME("h2250LogicalChannelAckParameters") SEQ, 5, 5, 7, DECODE | EXT, + offsetof(OpenLogicalChannelAck_forwardMultiplexAckParameters, + h2250LogicalChannelAckParameters), + _H2250LogicalChannelAckParameters}, +}; + +static const struct field_t _OpenLogicalChannelAck[] = { /* SEQUENCE */ + {FNAME("forwardLogicalChannelNumber") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("reverseLogicalChannelParameters") SEQ, 2, 3, 4, + DECODE | EXT | OPT, offsetof(OpenLogicalChannelAck, + reverseLogicalChannelParameters), + _OpenLogicalChannelAck_reverseLogicalChannelParameters}, + {FNAME("separateStack") SEQ, 2, 4, 5, DECODE | EXT | OPT, + offsetof(OpenLogicalChannelAck, separateStack), + _NetworkAccessParameters}, + {FNAME("forwardMultiplexAckParameters") CHOICE, 0, 1, 1, + DECODE | EXT | OPT, offsetof(OpenLogicalChannelAck, + forwardMultiplexAckParameters), + _OpenLogicalChannelAck_forwardMultiplexAckParameters}, + {FNAME("encryptionSync") SEQ, 2, 4, 4, STOP | EXT | OPT, 0, NULL}, +}; + +static const struct field_t _ResponseMessage[] = { /* CHOICE */ + {FNAME("nonStandard") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, + {FNAME("masterSlaveDeterminationAck") SEQ, 0, 1, 1, STOP | EXT, 0, + NULL}, + {FNAME("masterSlaveDeterminationReject") SEQ, 0, 1, 1, STOP | EXT, 0, + NULL}, + {FNAME("terminalCapabilitySetAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, + {FNAME("terminalCapabilitySetReject") SEQ, 0, 2, 2, STOP | EXT, 0, + NULL}, + {FNAME("openLogicalChannelAck") SEQ, 1, 2, 5, DECODE | EXT, + offsetof(ResponseMessage, openLogicalChannelAck), + _OpenLogicalChannelAck}, + {FNAME("openLogicalChannelReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL}, + {FNAME("closeLogicalChannelAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, + {FNAME("requestChannelCloseAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, + 
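CHOICE tables such as _H323_UU_PDU_h323_message_body above store each DECODE branch through offsetof() into one result object. The generated result types are not shown in this hunk (they live in nf_conntrack_h323_types.h), but the natural shape for them is a discriminant plus a union of branch payloads, as in the assumed sketch below; Setup_UUIE and Connect_UUIE here are trimmed stand-ins for the real generated structs.

/* Assumed shape of a decoded CHOICE result; the real typedefs are
 * generated into nf_conntrack_h323_types.h and may differ.  Only the
 * selected branch is written, so the union members can share storage. */
struct Setup_UUIE   { unsigned int options; /* ...decoded fields... */ };
struct Connect_UUIE { unsigned int options; /* ...decoded fields... */ };

struct H323_UU_PDU_h323_message_body {
	enum {
		e_setup,
		e_connect,
		/* one enumerator per CHOICE row */
	} choice;                       /* records which branch was present */
	union {
		struct Setup_UUIE   setup;
		struct Connect_UUIE connect;
	};                              /* payload of the selected branch   */
};

(_ResponseMessage continues below.)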
{FNAME("requestChannelCloseReject") SEQ, 0, 2, 2, STOP | EXT, 0, + NULL}, + {FNAME("multiplexEntrySendAck") SEQ, 0, 2, 2, STOP | EXT, 0, NULL}, + {FNAME("multiplexEntrySendReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL}, + {FNAME("requestMultiplexEntryAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, + {FNAME("requestMultiplexEntryReject") SEQ, 0, 2, 2, STOP | EXT, 0, + NULL}, + {FNAME("requestModeAck") SEQ, 0, 2, 2, STOP | EXT, 0, NULL}, + {FNAME("requestModeReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL}, + {FNAME("roundTripDelayResponse") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, + {FNAME("maintenanceLoopAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, + {FNAME("maintenanceLoopReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL}, + {FNAME("communicationModeResponse") CHOICE, 0, 1, 1, STOP | EXT, 0, + NULL}, + {FNAME("conferenceResponse") CHOICE, 3, 8, 16, STOP | EXT, 0, NULL}, + {FNAME("multilinkResponse") CHOICE, 3, 5, 5, STOP | EXT, 0, NULL}, + {FNAME("logicalChannelRateAcknowledge") SEQ, 0, 3, 3, STOP | EXT, 0, + NULL}, + {FNAME("logicalChannelRateReject") SEQ, 1, 4, 4, STOP | EXT, 0, NULL}, +}; + +static const struct field_t _MultimediaSystemControlMessage[] = { /* CHOICE */ + {FNAME("request") CHOICE, 4, 11, 15, DECODE | EXT, + offsetof(MultimediaSystemControlMessage, request), _RequestMessage}, + {FNAME("response") CHOICE, 5, 19, 24, DECODE | EXT, + offsetof(MultimediaSystemControlMessage, response), + _ResponseMessage}, + {FNAME("command") CHOICE, 3, 7, 12, STOP | EXT, 0, NULL}, + {FNAME("indication") CHOICE, 4, 14, 23, STOP | EXT, 0, NULL}, +}; + +static const struct field_t _H323_UU_PDU_h245Control[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 2, 4, 4, DECODE | OPEN | EXT, + sizeof(MultimediaSystemControlMessage), + _MultimediaSystemControlMessage} + , +}; + +static const struct field_t _H323_UU_PDU[] = { /* SEQUENCE */ + {FNAME("h323-message-body") CHOICE, 3, 7, 13, DECODE | EXT, + offsetof(H323_UU_PDU, h323_message_body), + _H323_UU_PDU_h323_message_body}, + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("h4501SupplementaryService") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + NULL}, + {FNAME("h245Tunneling") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("h245Control") SEQOF, SEMI, 0, 4, DECODE | OPT, + offsetof(H323_UU_PDU, h245Control), _H323_UU_PDU_h245Control}, + {FNAME("nonStandardControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("callLinkage") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL}, + {FNAME("tunnelledSignallingMessage") SEQ, 2, 4, 4, STOP | EXT | OPT, + 0, NULL}, + {FNAME("provisionalRespToH245Tunneling") NUL, FIXD, 0, 0, STOP | OPT, + 0, NULL}, + {FNAME("stimulusControl") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL}, + {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, +}; + +static const struct field_t _H323_UserInformation[] = { /* SEQUENCE */ + {FNAME("h323-uu-pdu") SEQ, 1, 2, 11, DECODE | EXT, + offsetof(H323_UserInformation, h323_uu_pdu), _H323_UU_PDU}, + {FNAME("user-data") SEQ, 0, 2, 2, STOP | EXT | OPT, 0, NULL}, +}; + +static const struct field_t _GatekeeperRequest[] = { /* SEQUENCE */ + {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT, + offsetof(GatekeeperRequest, rasAddress), _TransportAddress}, + {FNAME("endpointType") SEQ, 6, 8, 10, STOP | EXT, 0, NULL}, + {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL}, + 
{FNAME("callServices") SEQ, 0, 8, 8, STOP | EXT | OPT, 0, NULL}, + {FNAME("endpointAlias") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("authenticationCapability") SEQOF, SEMI, 0, 0, STOP | OPT, 0, + NULL}, + {FNAME("algorithmOIDs") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrity") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("supportsAltGK") NUL, FIXD, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL}, + {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, +}; + +static const struct field_t _GatekeeperConfirm[] = { /* SEQUENCE */ + {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT, + offsetof(GatekeeperConfirm, rasAddress), _TransportAddress}, + {FNAME("alternateGatekeeper") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("authenticationMode") CHOICE, 3, 7, 8, STOP | EXT | OPT, 0, + NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("algorithmOID") OID, BYTE, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrity") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL}, + {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, +}; + +static const struct field_t _RegistrationRequest_callSignalAddress[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT, + sizeof(TransportAddress), _TransportAddress} + , +}; + +static const struct field_t _RegistrationRequest_rasAddress[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT, + sizeof(TransportAddress), _TransportAddress} + , +}; + +static const struct field_t _RegistrationRequest_terminalAlias[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, +}; + +static const struct field_t _RegistrationRequest[] = { /* SEQUENCE */ + {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("discoveryComplete") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE, + offsetof(RegistrationRequest, callSignalAddress), + _RegistrationRequest_callSignalAddress}, + {FNAME("rasAddress") SEQOF, SEMI, 0, 10, DECODE, + offsetof(RegistrationRequest, rasAddress), + _RegistrationRequest_rasAddress}, + {FNAME("terminalType") SEQ, 6, 8, 10, SKIP | EXT, 0, _EndpointType}, + {FNAME("terminalAlias") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _RegistrationRequest_terminalAlias}, + {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("endpointVendor") SEQ, 2, 3, 3, SKIP | EXT, 0, + _VendorIdentifier}, + {FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("timeToLive") INT, CONS, 1, 0, DECODE | OPT, + offsetof(RegistrationRequest, timeToLive), NULL}, + 
{FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("keepAlive") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL}, + {FNAME("willSupplyUUIEs") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("maintainConnection") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("alternateTransportAddresses") SEQ, 1, 1, 1, STOP | EXT | OPT, + 0, NULL}, + {FNAME("additiveRegistration") NUL, FIXD, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("terminalAliasPattern") SEQOF, SEMI, 0, 0, STOP | OPT, 0, + NULL}, + {FNAME("supportsAltGK") NUL, FIXD, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("usageReportingCapability") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, + NULL}, + {FNAME("multipleCalls") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("supportedH248Packages") SEQOF, SEMI, 0, 0, STOP | OPT, 0, + NULL}, + {FNAME("callCreditCapability") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, + NULL}, + {FNAME("capacityReportingCapability") SEQ, 0, 1, 1, STOP | EXT | OPT, + 0, NULL}, + {FNAME("capacity") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL}, + {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, +}; + +static const struct field_t _RegistrationConfirm_callSignalAddress[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT, + sizeof(TransportAddress), _TransportAddress} + , +}; + +static const struct field_t _RegistrationConfirm_terminalAlias[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, +}; + +static const struct field_t _RegistrationConfirm[] = { /* SEQUENCE */ + {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE, + offsetof(RegistrationConfirm, callSignalAddress), + _RegistrationConfirm_callSignalAddress}, + {FNAME("terminalAlias") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _RegistrationConfirm_terminalAlias}, + {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP, 0, NULL}, + {FNAME("alternateGatekeeper") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("timeToLive") INT, CONS, 1, 0, DECODE | OPT, + offsetof(RegistrationConfirm, timeToLive), NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("willRespondToIRR") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("preGrantedARQ") SEQ, 0, 4, 8, STOP | EXT | OPT, 0, NULL}, + {FNAME("maintainConnection") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("serviceControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("supportsAdditiveRegistration") NUL, FIXD, 0, 0, STOP | OPT, 0, + NULL}, + {FNAME("terminalAliasPattern") SEQOF, SEMI, 0, 0, STOP | OPT, 0, + NULL}, + {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("usageSpec") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("featureServerAlias") CHOICE, 1, 2, 7, STOP | EXT | OPT, 0, + NULL}, + {FNAME("capacityReportingSpec") SEQ, 0, 1, 1, STOP | EXT | OPT, 0, + NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL}, + {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, 
NULL}, +}; + +static const struct field_t _UnregistrationRequest_callSignalAddress[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT, + sizeof(TransportAddress), _TransportAddress} + , +}; + +static const struct field_t _UnregistrationRequest[] = { /* SEQUENCE */ + {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE, + offsetof(UnregistrationRequest, callSignalAddress), + _UnregistrationRequest_callSignalAddress}, + {FNAME("endpointAlias") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL}, + {FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("reason") CHOICE, 2, 4, 5, STOP | EXT | OPT, 0, NULL}, + {FNAME("endpointAliasPattern") SEQOF, SEMI, 0, 0, STOP | OPT, 0, + NULL}, + {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("alternateGatekeeper") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, +}; + +static const struct field_t _CallModel[] = { /* CHOICE */ + {FNAME("direct") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("gatekeeperRouted") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _AdmissionRequest_destinationInfo[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, +}; + +static const struct field_t _AdmissionRequest_destExtraCallInfo[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, +}; + +static const struct field_t _AdmissionRequest_srcInfo[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, +}; + +static const struct field_t _AdmissionRequest[] = { /* SEQUENCE */ + {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("callType") CHOICE, 2, 4, 4, SKIP | EXT, 0, _CallType}, + {FNAME("callModel") CHOICE, 1, 2, 2, SKIP | EXT | OPT, 0, _CallModel}, + {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP, 0, NULL}, + {FNAME("destinationInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _AdmissionRequest_destinationInfo}, + {FNAME("destCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(AdmissionRequest, destCallSignalAddress), + _TransportAddress}, + {FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _AdmissionRequest_destExtraCallInfo}, + {FNAME("srcInfo") SEQOF, SEMI, 0, 0, SKIP, 0, + _AdmissionRequest_srcInfo}, + {FNAME("srcCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(AdmissionRequest, srcCallSignalAddress), _TransportAddress}, + {FNAME("bandWidth") INT, CONS, 0, 0, STOP, 0, NULL}, + {FNAME("callReferenceValue") INT, WORD, 0, 0, STOP, 0, NULL}, + {FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("callServices") SEQ, 0, 8, 8, STOP | EXT | OPT, 0, NULL}, + {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, STOP, 0, NULL}, + {FNAME("activeMC") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("answerCall") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("canMapAlias") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("callIdentifier") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, + {FNAME("srcAlternatives") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, 
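One more convention, visible in SEQUENCE OF tables such as _UnregistrationRequest_callSignalAddress above: the item row's offset slot holds sizeof(TransportAddress) rather than an offsetof(), while the parent row supplies the array's base offset and an upper bound (the 10 in SEMI, 0, 10, DECODE). The most plausible reading, and it is only an inference from these tables, is that decoded elements are written consecutively into a fixed-size array in the result struct, with sizeof() as the stride and the bound capping how many items are kept. The sketch below shows that indexing; struct transport_address and struct unregistration_request are hypothetical stand-ins for the generated types.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-ins for the generated result types. */
struct transport_address { unsigned int ip; unsigned short port; };
struct unregistration_request {
	unsigned int count;
	struct transport_address callSignalAddress[10];
};

/* Store element 'i' of a SEQUENCE OF: the parent row supplies the base
 * offset, the item row supplies the per-element stride. */
static void store_item(void *msg, size_t base, size_t stride, unsigned int i,
		       const void *elem, size_t elem_sz)
{
	memcpy((char *)msg + base + (size_t)i * stride, elem, elem_sz);
}

int main(void)
{
	struct unregistration_request req = { 0 };
	struct transport_address a = { 0x0a000001u, 1720 };

	store_item(&req,
		   offsetof(struct unregistration_request, callSignalAddress),
		   sizeof(struct transport_address), 0, &a, sizeof(a));
	req.count = 1;
	printf("first callSignalAddress port: %u\n",
	       (unsigned int)req.callSignalAddress[0].port);
	return 0;
}

(_AdmissionRequest continues below.)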
+ {FNAME("destAlternatives") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("transportQOS") CHOICE, 2, 3, 3, STOP | EXT | OPT, 0, NULL}, + {FNAME("willSupplyUUIEs") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("callLinkage") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL}, + {FNAME("gatewayDataRate") SEQ, 2, 3, 3, STOP | EXT | OPT, 0, NULL}, + {FNAME("capacity") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL}, + {FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL}, + {FNAME("desiredProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("desiredTunnelledProtocol") SEQ, 1, 2, 2, STOP | EXT | OPT, 0, + NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL}, + {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, +}; + +static const struct field_t _AdmissionConfirm[] = { /* SEQUENCE */ + {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("bandWidth") INT, CONS, 0, 0, SKIP, 0, NULL}, + {FNAME("callModel") CHOICE, 1, 2, 2, SKIP | EXT, 0, _CallModel}, + {FNAME("destCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT, + offsetof(AdmissionConfirm, destCallSignalAddress), + _TransportAddress}, + {FNAME("irrFrequency") INT, WORD, 1, 0, STOP | OPT, 0, NULL}, + {FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("destinationInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("destinationType") SEQ, 6, 8, 10, STOP | EXT | OPT, 0, NULL}, + {FNAME("remoteExtensionAddress") SEQOF, SEMI, 0, 0, STOP | OPT, 0, + NULL}, + {FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("transportQOS") CHOICE, 2, 3, 3, STOP | EXT | OPT, 0, NULL}, + {FNAME("willRespondToIRR") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("uuiesRequested") SEQ, 0, 9, 13, STOP | EXT, 0, NULL}, + {FNAME("language") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("alternateTransportAddresses") SEQ, 1, 1, 1, STOP | EXT | OPT, + 0, NULL}, + {FNAME("useSpecifiedTransport") CHOICE, 1, 2, 2, STOP | EXT | OPT, 0, + NULL}, + {FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL}, + {FNAME("usageSpec") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("supportedProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("serviceControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("multipleCalls") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL}, + {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, +}; + +static const struct field_t _LocationRequest_destinationInfo[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, +}; + +static const struct field_t _LocationRequest[] = { /* SEQUENCE */ + {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("destinationInfo") SEQOF, SEMI, 0, 0, SKIP, 0, + _LocationRequest_destinationInfo}, + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("replyAddress") CHOICE, 3, 7, 
7, DECODE | EXT, + offsetof(LocationRequest, replyAddress), _TransportAddress}, + {FNAME("sourceInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("canMapAlias") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("desiredProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("desiredTunnelledProtocol") SEQ, 1, 2, 2, STOP | EXT | OPT, 0, + NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL}, + {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("hopCount") INT, 8, 1, 0, STOP | OPT, 0, NULL}, + {FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL}, +}; + +static const struct field_t _LocationConfirm[] = { /* SEQUENCE */ + {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("callSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT, + offsetof(LocationConfirm, callSignalAddress), _TransportAddress}, + {FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT, + offsetof(LocationConfirm, rasAddress), _TransportAddress}, + {FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("destinationInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("destinationType") SEQ, 6, 8, 10, STOP | EXT | OPT, 0, NULL}, + {FNAME("remoteExtensionAddress") SEQOF, SEMI, 0, 0, STOP | OPT, 0, + NULL}, + {FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("alternateTransportAddresses") SEQ, 1, 1, 1, STOP | EXT | OPT, + 0, NULL}, + {FNAME("supportedProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("multipleCalls") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL}, + {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL}, + {FNAME("serviceControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, +}; + +static const struct field_t _InfoRequestResponse_callSignalAddress[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT, + sizeof(TransportAddress), _TransportAddress} + , +}; + +static const struct field_t _InfoRequestResponse[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("endpointType") SEQ, 6, 8, 10, SKIP | EXT, 0, _EndpointType}, + {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP, 0, NULL}, + {FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT, + offsetof(InfoRequestResponse, rasAddress), _TransportAddress}, + {FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE, + offsetof(InfoRequestResponse, callSignalAddress), + _InfoRequestResponse_callSignalAddress}, + {FNAME("endpointAlias") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("perCallInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("needResponse") BOOL, FIXD, 0, 0, STOP, 0, 
NULL}, + {FNAME("capacity") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL}, + {FNAME("irrStatus") CHOICE, 2, 4, 4, STOP | EXT | OPT, 0, NULL}, + {FNAME("unsolicited") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, +}; + +static const struct field_t _RasMessage[] = { /* CHOICE */ + {FNAME("gatekeeperRequest") SEQ, 4, 8, 18, DECODE | EXT, + offsetof(RasMessage, gatekeeperRequest), _GatekeeperRequest}, + {FNAME("gatekeeperConfirm") SEQ, 2, 5, 14, DECODE | EXT, + offsetof(RasMessage, gatekeeperConfirm), _GatekeeperConfirm}, + {FNAME("gatekeeperReject") SEQ, 2, 5, 11, STOP | EXT, 0, NULL}, + {FNAME("registrationRequest") SEQ, 3, 10, 31, DECODE | EXT, + offsetof(RasMessage, registrationRequest), _RegistrationRequest}, + {FNAME("registrationConfirm") SEQ, 3, 7, 24, DECODE | EXT, + offsetof(RasMessage, registrationConfirm), _RegistrationConfirm}, + {FNAME("registrationReject") SEQ, 2, 5, 11, STOP | EXT, 0, NULL}, + {FNAME("unregistrationRequest") SEQ, 3, 5, 15, DECODE | EXT, + offsetof(RasMessage, unregistrationRequest), _UnregistrationRequest}, + {FNAME("unregistrationConfirm") SEQ, 1, 2, 6, STOP | EXT, 0, NULL}, + {FNAME("unregistrationReject") SEQ, 1, 3, 8, STOP | EXT, 0, NULL}, + {FNAME("admissionRequest") SEQ, 7, 16, 34, DECODE | EXT, + offsetof(RasMessage, admissionRequest), _AdmissionRequest}, + {FNAME("admissionConfirm") SEQ, 2, 6, 27, DECODE | EXT, + offsetof(RasMessage, admissionConfirm), _AdmissionConfirm}, + {FNAME("admissionReject") SEQ, 1, 3, 11, STOP | EXT, 0, NULL}, + {FNAME("bandwidthRequest") SEQ, 2, 7, 18, STOP | EXT, 0, NULL}, + {FNAME("bandwidthConfirm") SEQ, 1, 3, 8, STOP | EXT, 0, NULL}, + {FNAME("bandwidthReject") SEQ, 1, 4, 9, STOP | EXT, 0, NULL}, + {FNAME("disengageRequest") SEQ, 1, 6, 19, STOP | EXT, 0, NULL}, + {FNAME("disengageConfirm") SEQ, 1, 2, 9, STOP | EXT, 0, NULL}, + {FNAME("disengageReject") SEQ, 1, 3, 8, STOP | EXT, 0, NULL}, + {FNAME("locationRequest") SEQ, 2, 5, 17, DECODE | EXT, + offsetof(RasMessage, locationRequest), _LocationRequest}, + {FNAME("locationConfirm") SEQ, 1, 4, 19, DECODE | EXT, + offsetof(RasMessage, locationConfirm), _LocationConfirm}, + {FNAME("locationReject") SEQ, 1, 3, 10, STOP | EXT, 0, NULL}, + {FNAME("infoRequest") SEQ, 2, 4, 15, STOP | EXT, 0, NULL}, + {FNAME("infoRequestResponse") SEQ, 3, 8, 16, DECODE | EXT, + offsetof(RasMessage, infoRequestResponse), _InfoRequestResponse}, + {FNAME("nonStandardMessage") SEQ, 0, 2, 7, STOP | EXT, 0, NULL}, + {FNAME("unknownMessageResponse") SEQ, 0, 1, 5, STOP | EXT, 0, NULL}, + {FNAME("requestInProgress") SEQ, 4, 6, 6, STOP | EXT, 0, NULL}, + {FNAME("resourcesAvailableIndicate") SEQ, 4, 9, 11, STOP | EXT, 0, + NULL}, + {FNAME("resourcesAvailableConfirm") SEQ, 4, 6, 7, STOP | EXT, 0, + NULL}, + {FNAME("infoRequestAck") SEQ, 4, 5, 5, STOP | EXT, 0, NULL}, + {FNAME("infoRequestNak") SEQ, 5, 7, 7, STOP | EXT, 0, NULL}, + {FNAME("serviceControlIndication") SEQ, 8, 10, 10, STOP | EXT, 0, + NULL}, + {FNAME("serviceControlResponse") SEQ, 7, 8, 8, STOP | EXT, 0, NULL}, +}; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c new file mode 100644 index 00000000..1bdfea35 --- /dev/null +++ b/net/netfilter/nf_conntrack_helper.c @@ -0,0 +1,288 @@ +/* Helper handling for netfilter. 
*/ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team + * (C) 2003,2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static DEFINE_MUTEX(nf_ct_helper_mutex); +static struct hlist_head *nf_ct_helper_hash __read_mostly; +static unsigned int nf_ct_helper_hsize __read_mostly; +static unsigned int nf_ct_helper_count __read_mostly; + + +/* Stupid hash, but collision free for the default registrations of the + * helpers currently in the kernel. */ +static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple) +{ + return (((tuple->src.l3num << 8) | tuple->dst.protonum) ^ + (__force __u16)tuple->src.u.all) % nf_ct_helper_hsize; +} + +static struct nf_conntrack_helper * +__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple) +{ + struct nf_conntrack_helper *helper; + struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) }; + struct hlist_node *n; + unsigned int h; + + if (!nf_ct_helper_count) + return NULL; + + h = helper_hash(tuple); + hlist_for_each_entry_rcu(helper, n, &nf_ct_helper_hash[h], hnode) { + if (nf_ct_tuple_src_mask_cmp(tuple, &helper->tuple, &mask)) + return helper; + } + return NULL; +} + +struct nf_conntrack_helper * +__nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum) +{ + struct nf_conntrack_helper *h; + struct hlist_node *n; + unsigned int i; + + for (i = 0; i < nf_ct_helper_hsize; i++) { + hlist_for_each_entry_rcu(h, n, &nf_ct_helper_hash[i], hnode) { + if (!strcmp(h->name, name) && + h->tuple.src.l3num == l3num && + h->tuple.dst.protonum == protonum) + return h; + } + } + return NULL; +} +EXPORT_SYMBOL_GPL(__nf_conntrack_helper_find); + +struct nf_conntrack_helper * +nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum) +{ + struct nf_conntrack_helper *h; + + h = __nf_conntrack_helper_find(name, l3num, protonum); +#ifdef CONFIG_MODULES + if (h == NULL) { + if (request_module("nfct-helper-%s", name) == 0) + h = __nf_conntrack_helper_find(name, l3num, protonum); + } +#endif + if (h != NULL && !try_module_get(h->me)) + h = NULL; + + return h; +} +EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get); + +struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp) +{ + struct nf_conn_help *help; + + help = nf_ct_ext_add(ct, NF_CT_EXT_HELPER, gfp); + if (help) + INIT_HLIST_HEAD(&help->expectations); + else + pr_debug("failed to add helper extension area"); + return help; +} +EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add); + +int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, + gfp_t flags) +{ + struct nf_conntrack_helper *helper = NULL; + struct nf_conn_help *help; + int ret = 0; + + if (tmpl != NULL) { + help = nfct_help(tmpl); + if (help != NULL) + helper = help->helper; + } + + help = nfct_help(ct); + if (helper == NULL) + helper = __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + if (helper == NULL) { + if (help) + rcu_assign_pointer(help->helper, NULL); + goto out; + } + + if (help == NULL) { + help = nf_ct_helper_ext_add(ct, flags); + if (help == NULL) { + ret = -ENOMEM; + goto out; + } + } else { + memset(&help->help, 0, sizeof(help->help)); + } + + rcu_assign_pointer(help->helper, helper); 
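helper_hash() above keys the helper table with a deliberately cheap function: ((l3num << 8) | protonum) XORed with the tuple's source port, reduced modulo the table size, and __nf_ct_helper_find() then compares candidates under a mask that covers only the source port. The userspace snippet below simply replays that arithmetic for one example tuple; the inputs (AF_INET = 2, IPPROTO_TCP = 6, the IRC port) are illustrative, and byte order only matters insofar as it is applied consistently to registration and lookup.

#include <stdint.h>
#include <stdio.h>

/* Userspace replay of helper_hash() shown above. */
static unsigned int helper_hash(uint16_t l3num, uint8_t protonum,
				uint16_t src_port, unsigned int hsize)
{
	return ((((unsigned int)l3num << 8) | protonum) ^ src_port) % hsize;
}

int main(void)
{
	/* AF_INET = 2, IPPROTO_TCP = 6, port 6667 as example inputs; the
	 * kernel passes the tuple's src.u.all in network byte order. */
	printf("bucket: %u\n", helper_hash(2, 6, 6667, 64));
	return 0;
}

(__nf_ct_try_assign_helper resumes below at its out: label.)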
+out: + return ret; +} +EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper); + +static inline int unhelp(struct nf_conntrack_tuple_hash *i, + const struct nf_conntrack_helper *me) +{ + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i); + struct nf_conn_help *help = nfct_help(ct); + + if (help && rcu_dereference_protected( + help->helper, + lockdep_is_held(&nf_conntrack_lock) + ) == me) { + nf_conntrack_event(IPCT_HELPER, ct); + rcu_assign_pointer(help->helper, NULL); + } + return 0; +} + +void nf_ct_helper_destroy(struct nf_conn *ct) +{ + struct nf_conn_help *help = nfct_help(ct); + struct nf_conntrack_helper *helper; + + if (help) { + rcu_read_lock(); + helper = rcu_dereference(help->helper); + if (helper && helper->destroy) + helper->destroy(ct); + rcu_read_unlock(); + } +} + +int nf_conntrack_helper_register(struct nf_conntrack_helper *me) +{ + unsigned int h = helper_hash(&me->tuple); + + BUG_ON(me->expect_policy == NULL); + BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES); + BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1); + + mutex_lock(&nf_ct_helper_mutex); + hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]); + nf_ct_helper_count++; + mutex_unlock(&nf_ct_helper_mutex); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_conntrack_helper_register); + +static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, + struct net *net) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_expect *exp; + const struct hlist_node *n, *next; + const struct hlist_nulls_node *nn; + unsigned int i; + + /* Get rid of expectations */ + for (i = 0; i < nf_ct_expect_hsize; i++) { + hlist_for_each_entry_safe(exp, n, next, + &net->ct.expect_hash[i], hnode) { + struct nf_conn_help *help = nfct_help(exp->master); + if ((rcu_dereference_protected( + help->helper, + lockdep_is_held(&nf_conntrack_lock) + ) == me || exp->helper == me) && + del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + nf_ct_expect_put(exp); + } + } + } + + /* Get rid of expecteds, set helpers to NULL. */ + hlist_nulls_for_each_entry(h, nn, &net->ct.unconfirmed, hnnode) + unhelp(h, me); + for (i = 0; i < net->ct.htable_size; i++) { + hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) + unhelp(h, me); + } +} + +void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) +{ + struct net *net; + + mutex_lock(&nf_ct_helper_mutex); + hlist_del_rcu(&me->hnode); + nf_ct_helper_count--; + mutex_unlock(&nf_ct_helper_mutex); + + /* Make sure every nothing is still using the helper unless its a + * connection in the hash. 
+ */ + synchronize_rcu(); + + rtnl_lock(); + spin_lock_bh(&nf_conntrack_lock); + for_each_net(net) + __nf_conntrack_helper_unregister(me, net); + spin_unlock_bh(&nf_conntrack_lock); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); + +static struct nf_ct_ext_type helper_extend __read_mostly = { + .len = sizeof(struct nf_conn_help), + .align = __alignof__(struct nf_conn_help), + .id = NF_CT_EXT_HELPER, +}; + +int nf_conntrack_helper_init(void) +{ + int err; + + nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ + nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0); + if (!nf_ct_helper_hash) + return -ENOMEM; + + err = nf_ct_extend_register(&helper_extend); + if (err < 0) + goto err1; + + return 0; + +err1: + nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); + return err; +} + +void nf_conntrack_helper_fini(void) +{ + nf_ct_extend_unregister(&helper_extend); + nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); +} diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c new file mode 100644 index 00000000..4f9390b9 --- /dev/null +++ b/net/netfilter/nf_conntrack_irc.c @@ -0,0 +1,291 @@ +/* IRC extension for IP connection tracking, Version 1.21 + * (C) 2000-2002 by Harald Welte + * based on RR's ip_conntrack_ftp.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define MAX_PORTS 8 +static unsigned short ports[MAX_PORTS]; +static unsigned int ports_c; +static unsigned int max_dcc_channels = 8; +static unsigned int dcc_timeout __read_mostly = 300; +/* This is slow, but it's simple. 
--RR */ +static char *irc_buffer; +static DEFINE_SPINLOCK(irc_buffer_lock); + +unsigned int (*nf_nat_irc_hook)(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp) __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_irc_hook); + +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("IRC (DCC) connection tracking helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ip_conntrack_irc"); +MODULE_ALIAS_NFCT_HELPER("irc"); + +module_param_array(ports, ushort, &ports_c, 0400); +MODULE_PARM_DESC(ports, "port numbers of IRC servers"); +module_param(max_dcc_channels, uint, 0400); +MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per " + "IRC session"); +module_param(dcc_timeout, uint, 0400); +MODULE_PARM_DESC(dcc_timeout, "timeout on for unestablished DCC channels"); + +static const char *const dccprotos[] = { + "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " +}; + +#define MINMATCHLEN 5 + +/* tries to get the ip_addr and port out of a dcc command + * return value: -1 on failure, 0 on success + * data pointer to first byte of DCC command data + * data_end pointer to last byte of dcc command data + * ip returns parsed ip of dcc command + * port returns parsed port of dcc command + * ad_beg_p returns pointer to first byte of addr data + * ad_end_p returns pointer to last byte of addr data + */ +static int parse_dcc(char *data, const char *data_end, __be32 *ip, + u_int16_t *port, char **ad_beg_p, char **ad_end_p) +{ + char *tmp; + + /* at least 12: "AAAAAAAA P\1\n" */ + while (*data++ != ' ') + if (data > data_end - 12) + return -1; + + /* Make sure we have a newline character within the packet boundaries + * because simple_strtoul parses until the first invalid character. */ + for (tmp = data; tmp <= data_end; tmp++) + if (*tmp == '\n') + break; + if (tmp > data_end || *tmp != '\n') + return -1; + + *ad_beg_p = data; + *ip = cpu_to_be32(simple_strtoul(data, &data, 10)); + + /* skip blanks between ip and port */ + while (*data == ' ') { + if (data >= data_end) + return -1; + data++; + } + + *port = simple_strtoul(data, &data, 10); + *ad_end_p = data; + + return 0; +} + +static int help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + unsigned int dataoff; + const struct iphdr *iph; + const struct tcphdr *th; + struct tcphdr _tcph; + const char *data_limit; + char *data, *ib_ptr; + int dir = CTINFO2DIR(ctinfo); + struct nf_conntrack_expect *exp; + struct nf_conntrack_tuple *tuple; + __be32 dcc_ip; + u_int16_t dcc_port; + __be16 port; + int i, ret = NF_ACCEPT; + char *addr_beg_p, *addr_end_p; + typeof(nf_nat_irc_hook) nf_nat_irc; + + /* If packet is coming from IRC server */ + if (dir == IP_CT_DIR_REPLY) + return NF_ACCEPT; + + /* Until there's been traffic both ways, don't look in packets. */ + if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) + return NF_ACCEPT; + + /* Not a full tcp header? */ + th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); + if (th == NULL) + return NF_ACCEPT; + + /* No data? 
*/ + dataoff = protoff + th->doff*4; + if (dataoff >= skb->len) + return NF_ACCEPT; + + spin_lock_bh(&irc_buffer_lock); + ib_ptr = skb_header_pointer(skb, dataoff, skb->len - dataoff, + irc_buffer); + BUG_ON(ib_ptr == NULL); + + data = ib_ptr; + data_limit = ib_ptr + skb->len - dataoff; + + /* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24 + * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */ + while (data < data_limit - (19 + MINMATCHLEN)) { + if (memcmp(data, "\1DCC ", 5)) { + data++; + continue; + } + data += 5; + /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */ + + iph = ip_hdr(skb); + pr_debug("DCC found in master %pI4:%u %pI4:%u\n", + &iph->saddr, ntohs(th->source), + &iph->daddr, ntohs(th->dest)); + + for (i = 0; i < ARRAY_SIZE(dccprotos); i++) { + if (memcmp(data, dccprotos[i], strlen(dccprotos[i]))) { + /* no match */ + continue; + } + data += strlen(dccprotos[i]); + pr_debug("DCC %s detected\n", dccprotos[i]); + + /* we have at least + * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid + * data left (== 14/13 bytes) */ + if (parse_dcc(data, data_limit, &dcc_ip, + &dcc_port, &addr_beg_p, &addr_end_p)) { + pr_debug("unable to parse dcc command\n"); + continue; + } + + pr_debug("DCC bound ip/port: %pI4:%u\n", + &dcc_ip, dcc_port); + + /* dcc_ip can be the internal OR external (NAT'ed) IP */ + tuple = &ct->tuplehash[dir].tuple; + if (tuple->src.u3.ip != dcc_ip && + tuple->dst.u3.ip != dcc_ip) { + if (net_ratelimit()) + printk(KERN_WARNING + "Forged DCC command from %pI4: %pI4:%u\n", + &tuple->src.u3.ip, + &dcc_ip, dcc_port); + continue; + } + + exp = nf_ct_expect_alloc(ct); + if (exp == NULL) { + ret = NF_DROP; + goto out; + } + tuple = &ct->tuplehash[!dir].tuple; + port = htons(dcc_port); + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, + tuple->src.l3num, + NULL, &tuple->dst.u3, + IPPROTO_TCP, NULL, &port); + + nf_nat_irc = rcu_dereference(nf_nat_irc_hook); + if (nf_nat_irc && ct->status & IPS_NAT_MASK) + ret = nf_nat_irc(skb, ctinfo, + addr_beg_p - ib_ptr, + addr_end_p - addr_beg_p, + exp); + else if (nf_ct_expect_related(exp) != 0) + ret = NF_DROP; + nf_ct_expect_put(exp); + goto out; + } + } + out: + spin_unlock_bh(&irc_buffer_lock); + return ret; +} + +static struct nf_conntrack_helper irc[MAX_PORTS] __read_mostly; +static char irc_names[MAX_PORTS][sizeof("irc-65535")] __read_mostly; +static struct nf_conntrack_expect_policy irc_exp_policy; + +static void nf_conntrack_irc_fini(void); + +static int __init nf_conntrack_irc_init(void) +{ + int i, ret; + char *tmpname; + + if (max_dcc_channels < 1) { + printk(KERN_ERR "nf_ct_irc: max_dcc_channels must not be zero\n"); + return -EINVAL; + } + + irc_exp_policy.max_expected = max_dcc_channels; + irc_exp_policy.timeout = dcc_timeout; + + irc_buffer = kmalloc(65536, GFP_KERNEL); + if (!irc_buffer) + return -ENOMEM; + + /* If no port given, default to standard irc port */ + if (ports_c == 0) + ports[ports_c++] = IRC_PORT; + + for (i = 0; i < ports_c; i++) { + irc[i].tuple.src.l3num = AF_INET; + irc[i].tuple.src.u.tcp.port = htons(ports[i]); + irc[i].tuple.dst.protonum = IPPROTO_TCP; + irc[i].expect_policy = &irc_exp_policy; + irc[i].me = THIS_MODULE; + irc[i].help = help; + + tmpname = &irc_names[i][0]; + if (ports[i] == IRC_PORT) + sprintf(tmpname, "irc"); + else + sprintf(tmpname, "irc-%u", i); + irc[i].name = tmpname; + + ret = nf_conntrack_helper_register(&irc[i]); + if (ret) { + printk(KERN_ERR "nf_ct_irc: failed to register helper " + "for pf: %u port: %u\n", + irc[i].tuple.src.l3num, ports[i]); + 
nf_conntrack_irc_fini(); + return ret; + } + } + return 0; +} + +/* This function is intentionally _NOT_ defined as __exit, because + * it is needed by the init function */ +static void nf_conntrack_irc_fini(void) +{ + int i; + + for (i = 0; i < ports_c; i++) + nf_conntrack_helper_unregister(&irc[i]); + kfree(irc_buffer); +} + +module_init(nf_conntrack_irc_init); +module_exit(nf_conntrack_irc_fini); diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c new file mode 100644 index 00000000..e7eb807f --- /dev/null +++ b/net/netfilter/nf_conntrack_l3proto_generic.c @@ -0,0 +1,74 @@ +/* + * (C) 2003,2004 USAGI/WIDE Project + * + * Based largely upon the original ip_conntrack code which + * had the following copyright information: + * + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Author: + * Yasuyuki Kozakai @USAGI + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static bool generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, + struct nf_conntrack_tuple *tuple) +{ + memset(&tuple->src.u3, 0, sizeof(tuple->src.u3)); + memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3)); + + return true; +} + +static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + memset(&tuple->src.u3, 0, sizeof(tuple->src.u3)); + memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3)); + + return true; +} + +static int generic_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return 0; +} + +static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, + unsigned int *dataoff, u_int8_t *protonum) +{ + /* Never track !!! */ + return -NF_ACCEPT; +} + + +struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = { + .l3proto = PF_UNSPEC, + .name = "unknown", + .pkt_to_tuple = generic_pkt_to_tuple, + .invert_tuple = generic_invert_tuple, + .print_tuple = generic_print_tuple, + .get_l4proto = generic_get_l4proto, +}; +EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic); diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c new file mode 100644 index 00000000..4c8f30a3 --- /dev/null +++ b/net/netfilter/nf_conntrack_netbios_ns.c @@ -0,0 +1,71 @@ +/* + * NetBIOS name service broadcast connection tracking helper + * + * (c) 2005 Patrick McHardy + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +/* + * This helper tracks locally originating NetBIOS name service + * requests by issuing permanent expectations (valid until + * timing out) matching all reply connections from the + * destination network. The only NetBIOS specific thing is + * actually the port number. 
+ */ +#include +#include +#include +#include + +#include +#include +#include + +#define NMBD_PORT 137 + +MODULE_AUTHOR("Patrick McHardy "); +MODULE_DESCRIPTION("NetBIOS name service broadcast connection tracking helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ip_conntrack_netbios_ns"); +MODULE_ALIAS_NFCT_HELPER("netbios_ns"); + +static unsigned int timeout __read_mostly = 3; +module_param(timeout, uint, S_IRUSR); +MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); + +static struct nf_conntrack_expect_policy exp_policy = { + .max_expected = 1, +}; + +static int netbios_ns_help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + return nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout); +} + +static struct nf_conntrack_helper helper __read_mostly = { + .name = "netbios-ns", + .tuple.src.l3num = NFPROTO_IPV4, + .tuple.src.u.udp.port = cpu_to_be16(NMBD_PORT), + .tuple.dst.protonum = IPPROTO_UDP, + .me = THIS_MODULE, + .help = netbios_ns_help, + .expect_policy = &exp_policy, +}; + +static int __init nf_conntrack_netbios_ns_init(void) +{ + exp_policy.timeout = timeout; + return nf_conntrack_helper_register(&helper); +} + +static void __exit nf_conntrack_netbios_ns_fini(void) +{ + nf_conntrack_helper_unregister(&helper); +} + +module_init(nf_conntrack_netbios_ns_init); +module_exit(nf_conntrack_netbios_ns_fini); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c new file mode 100644 index 00000000..482e90c6 --- /dev/null +++ b/net/netfilter/nf_conntrack_netlink.c @@ -0,0 +1,2226 @@ +/* Connection tracking via netlink socket. Allows for user space + * protocol helpers and general trouble making from userspace. + * + * (C) 2001 by Jay Schulist + * (C) 2002-2006 by Harald Welte + * (C) 2003 by Patrick Mchardy + * (C) 2005-2008 by Pablo Neira Ayuso + * + * Initial connection tracking via netlink development funded and + * generally made possible by Network Robots, Inc. (www.networkrobots.com) + * + * Further development of this code funded by Astaro AG (http://www.astaro.com) + * + * This software may be used and distributed according to the terms + * of the GNU General Public License, incorporated herein by reference. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_NF_NAT_NEEDED +#include +#include +#endif + +#include +#include + +MODULE_LICENSE("GPL"); + +static char __initdata version[] = "0.93"; + +static inline int +ctnetlink_dump_tuples_proto(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple, + struct nf_conntrack_l4proto *l4proto) +{ + int ret = 0; + struct nlattr *nest_parms; + + nest_parms = nla_nest_start(skb, CTA_TUPLE_PROTO | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + NLA_PUT_U8(skb, CTA_PROTO_NUM, tuple->dst.protonum); + + if (likely(l4proto->tuple_to_nlattr)) + ret = l4proto->tuple_to_nlattr(skb, tuple); + + nla_nest_end(skb, nest_parms); + + return ret; + +nla_put_failure: + return -1; +} + +static inline int +ctnetlink_dump_tuples_ip(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple, + struct nf_conntrack_l3proto *l3proto) +{ + int ret = 0; + struct nlattr *nest_parms; + + nest_parms = nla_nest_start(skb, CTA_TUPLE_IP | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + if (likely(l3proto->tuple_to_nlattr)) + ret = l3proto->tuple_to_nlattr(skb, tuple); + + nla_nest_end(skb, nest_parms); + + return ret; + +nla_put_failure: + return -1; +} + +static int +ctnetlink_dump_tuples(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) +{ + int ret; + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_l4proto *l4proto; + + l3proto = __nf_ct_l3proto_find(tuple->src.l3num); + ret = ctnetlink_dump_tuples_ip(skb, tuple, l3proto); + + if (unlikely(ret < 0)) + return ret; + + l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum); + ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto); + + return ret; +} + +static inline int +ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct) +{ + NLA_PUT_BE32(skb, CTA_STATUS, htonl(ct->status)); + return 0; + +nla_put_failure: + return -1; +} + +static inline int +ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct) +{ + long timeout = (ct->timeout.expires - jiffies) / HZ; + + if (timeout < 0) + timeout = 0; + + NLA_PUT_BE32(skb, CTA_TIMEOUT, htonl(timeout)); + return 0; + +nla_put_failure: + return -1; +} + +static inline int +ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct) +{ + struct nf_conntrack_l4proto *l4proto; + struct nlattr *nest_proto; + int ret; + + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + if (!l4proto->to_nlattr) + return 0; + + nest_proto = nla_nest_start(skb, CTA_PROTOINFO | NLA_F_NESTED); + if (!nest_proto) + goto nla_put_failure; + + ret = l4proto->to_nlattr(skb, nest_proto, ct); + + nla_nest_end(skb, nest_proto); + + return ret; + +nla_put_failure: + return -1; +} + +static inline int +ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct) +{ + struct nlattr *nest_helper; + const struct nf_conn_help *help = nfct_help(ct); + struct nf_conntrack_helper *helper; + + if (!help) + return 0; + + helper = rcu_dereference(help->helper); + if (!helper) + goto out; + + nest_helper = nla_nest_start(skb, CTA_HELP | NLA_F_NESTED); + if (!nest_helper) + goto nla_put_failure; + NLA_PUT_STRING(skb, CTA_HELP_NAME, helper->name); + + if (helper->to_nlattr) + helper->to_nlattr(skb, ct); + + nla_nest_end(skb, nest_helper); +out: + return 0; + 
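Each ctnetlink_dump_*() helper above follows the same shape: open a nested attribute with nla_nest_start(), emit attributes through the NLA_PUT_*() macros, which hide a goto to the local nla_put_failure label when the skb runs out of room, close the nest, and let the top-level caller cancel the half-built message. The snippet below is a self-contained userspace analogue of that append-or-roll-back control flow, not the netlink API itself; msg, put_attr() and dump_status() are invented for the illustration.

#include <stdio.h>
#include <string.h>

/* Toy bounded message buffer standing in for an sk_buff's tailroom. */
struct msg { unsigned char buf[64]; size_t len; };

/* Append one type/length/value attribute, or fail if it does not fit. */
static int put_attr(struct msg *m, unsigned short type,
		    const void *data, unsigned short len)
{
	if (m->len + 4u + len > sizeof(m->buf))
		return -1;                          /* no tailroom left */
	memcpy(m->buf + m->len, &type, 2);
	memcpy(m->buf + m->len + 2, &len, 2);
	memcpy(m->buf + m->len + 4, data, len);
	m->len += 4u + len;
	return 0;
}

/* Mirror of the dump pattern: bail to a failure label on the first
 * attribute that does not fit, then roll back (like nlmsg_cancel()). */
static int dump_status(struct msg *m, unsigned int status, unsigned int timeout)
{
	size_t rollback = m->len;

	if (put_attr(m, 1 /* CTA_STATUS-like */, &status, sizeof(status)) < 0)
		goto put_failure;
	if (put_attr(m, 2 /* CTA_TIMEOUT-like */, &timeout, sizeof(timeout)) < 0)
		goto put_failure;
	return 0;

put_failure:
	m->len = rollback;
	return -1;
}

int main(void)
{
	struct msg m = { .len = 0 };

	printf("dump: %d, bytes used: %zu\n", dump_status(&m, 0x1, 300), m.len);
	return 0;
}

(ctnetlink_dump_helpinfo's nla_put_failure path continues below.)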
+nla_put_failure: + return -1; +} + +static int +ctnetlink_dump_counters(struct sk_buff *skb, const struct nf_conn *ct, + enum ip_conntrack_dir dir) +{ + enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; + struct nlattr *nest_count; + const struct nf_conn_counter *acct; + + acct = nf_conn_acct_find(ct); + if (!acct) + return 0; + + nest_count = nla_nest_start(skb, type | NLA_F_NESTED); + if (!nest_count) + goto nla_put_failure; + + NLA_PUT_BE64(skb, CTA_COUNTERS_PACKETS, + cpu_to_be64(acct[dir].packets)); + NLA_PUT_BE64(skb, CTA_COUNTERS_BYTES, + cpu_to_be64(acct[dir].bytes)); + + nla_nest_end(skb, nest_count); + + return 0; + +nla_put_failure: + return -1; +} + +static int +ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct) +{ + struct nlattr *nest_count; + const struct nf_conn_tstamp *tstamp; + + tstamp = nf_conn_tstamp_find(ct); + if (!tstamp) + return 0; + + nest_count = nla_nest_start(skb, CTA_TIMESTAMP | NLA_F_NESTED); + if (!nest_count) + goto nla_put_failure; + + NLA_PUT_BE64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start)); + if (tstamp->stop != 0) { + NLA_PUT_BE64(skb, CTA_TIMESTAMP_STOP, + cpu_to_be64(tstamp->stop)); + } + nla_nest_end(skb, nest_count); + + return 0; + +nla_put_failure: + return -1; +} + +#ifdef CONFIG_NF_CONNTRACK_MARK +static inline int +ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct) +{ + NLA_PUT_BE32(skb, CTA_MARK, htonl(ct->mark)); + return 0; + +nla_put_failure: + return -1; +} +#else +#define ctnetlink_dump_mark(a, b) (0) +#endif + +#ifdef CONFIG_NF_CONNTRACK_SECMARK +static inline int +ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) +{ + struct nlattr *nest_secctx; + int len, ret; + char *secctx; + + ret = security_secid_to_secctx(ct->secmark, &secctx, &len); + if (ret) + return 0; + + ret = -1; + nest_secctx = nla_nest_start(skb, CTA_SECCTX | NLA_F_NESTED); + if (!nest_secctx) + goto nla_put_failure; + + NLA_PUT_STRING(skb, CTA_SECCTX_NAME, secctx); + nla_nest_end(skb, nest_secctx); + + ret = 0; +nla_put_failure: + security_release_secctx(secctx, len); + return ret; +} +#else +#define ctnetlink_dump_secctx(a, b) (0) +#endif + +#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple) + +static inline int +ctnetlink_dump_master(struct sk_buff *skb, const struct nf_conn *ct) +{ + struct nlattr *nest_parms; + + if (!(ct->status & IPS_EXPECTED)) + return 0; + + nest_parms = nla_nest_start(skb, CTA_TUPLE_MASTER | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (ctnetlink_dump_tuples(skb, master_tuple(ct)) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); + + return 0; + +nla_put_failure: + return -1; +} + +#ifdef CONFIG_NF_NAT_NEEDED +static int +dump_nat_seq_adj(struct sk_buff *skb, const struct nf_nat_seq *natseq, int type) +{ + struct nlattr *nest_parms; + + nest_parms = nla_nest_start(skb, type | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + NLA_PUT_BE32(skb, CTA_NAT_SEQ_CORRECTION_POS, + htonl(natseq->correction_pos)); + NLA_PUT_BE32(skb, CTA_NAT_SEQ_OFFSET_BEFORE, + htonl(natseq->offset_before)); + NLA_PUT_BE32(skb, CTA_NAT_SEQ_OFFSET_AFTER, + htonl(natseq->offset_after)); + + nla_nest_end(skb, nest_parms); + + return 0; + +nla_put_failure: + return -1; +} + +static inline int +ctnetlink_dump_nat_seq_adj(struct sk_buff *skb, const struct nf_conn *ct) +{ + struct nf_nat_seq *natseq; + struct nf_conn_nat *nat = nfct_nat(ct); + + if (!(ct->status & IPS_SEQ_ADJUST) || !nat) + return 0; + + natseq = 
&nat->seq[IP_CT_DIR_ORIGINAL]; + if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_ORIG) == -1) + return -1; + + natseq = &nat->seq[IP_CT_DIR_REPLY]; + if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_REPLY) == -1) + return -1; + + return 0; +} +#else +#define ctnetlink_dump_nat_seq_adj(a, b) (0) +#endif + +static inline int +ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) +{ + NLA_PUT_BE32(skb, CTA_ID, htonl((unsigned long)ct)); + return 0; + +nla_put_failure: + return -1; +} + +static inline int +ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct) +{ + NLA_PUT_BE32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use))); + return 0; + +nla_put_failure: + return -1; +} + +static int +ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, + int event, struct nf_conn *ct) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct nlattr *nest_parms; + unsigned int flags = pid ? NLM_F_MULTI : 0; + + event |= NFNL_SUBSYS_CTNETLINK << 8; + nlh = nlmsg_put(skb, pid, seq, event, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = nf_ct_l3num(ct); + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); + + nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); + + if (nf_ct_zone(ct)) + NLA_PUT_BE16(skb, CTA_ZONE, htons(nf_ct_zone(ct))); + + if (ctnetlink_dump_status(skb, ct) < 0 || + ctnetlink_dump_timeout(skb, ct) < 0 || + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 || + ctnetlink_dump_timestamp(skb, ct) < 0 || + ctnetlink_dump_protoinfo(skb, ct) < 0 || + ctnetlink_dump_helpinfo(skb, ct) < 0 || + ctnetlink_dump_mark(skb, ct) < 0 || + ctnetlink_dump_secctx(skb, ct) < 0 || + ctnetlink_dump_id(skb, ct) < 0 || + ctnetlink_dump_use(skb, ct) < 0 || + ctnetlink_dump_master(skb, ct) < 0 || + ctnetlink_dump_nat_seq_adj(skb, ct) < 0) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return skb->len; + +nlmsg_failure: +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -1; +} + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +static inline size_t +ctnetlink_proto_size(const struct nf_conn *ct) +{ + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_l4proto *l4proto; + size_t len = 0; + + rcu_read_lock(); + l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); + len += l3proto->nla_size; + + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + len += l4proto->nla_size; + rcu_read_unlock(); + + return len; +} + +static inline size_t +ctnetlink_counters_size(const struct nf_conn *ct) +{ + if (!nf_ct_ext_exist(ct, NF_CT_EXT_ACCT)) + return 0; + return 2 * nla_total_size(0) /* CTA_COUNTERS_ORIG|REPL */ + + 2 * nla_total_size(sizeof(uint64_t)) /* CTA_COUNTERS_PACKETS */ + + 2 * nla_total_size(sizeof(uint64_t)) /* CTA_COUNTERS_BYTES */ + ; +} + +static inline int +ctnetlink_secctx_size(const struct nf_conn *ct) +{ +#ifdef CONFIG_NF_CONNTRACK_SECMARK + int len, ret; + + ret = security_secid_to_secctx(ct->secmark, NULL, &len); + if (ret) + return 0; + + return nla_total_size(0) /* CTA_SECCTX */ + + 
nla_total_size(sizeof(char) * len); /* CTA_SECCTX_NAME */ +#else + return 0; +#endif +} + +static inline size_t +ctnetlink_timestamp_size(const struct nf_conn *ct) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + if (!nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP)) + return 0; + return nla_total_size(0) + 2 * nla_total_size(sizeof(uint64_t)); +#else + return 0; +#endif +} + +static inline size_t +ctnetlink_nlmsg_size(const struct nf_conn *ct) +{ + return NLMSG_ALIGN(sizeof(struct nfgenmsg)) + + 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */ + + 3 * nla_total_size(0) /* CTA_TUPLE_IP */ + + 3 * nla_total_size(0) /* CTA_TUPLE_PROTO */ + + 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */ + + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */ + + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */ + + ctnetlink_counters_size(ct) + + ctnetlink_timestamp_size(ct) + + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */ + + nla_total_size(0) /* CTA_PROTOINFO */ + + nla_total_size(0) /* CTA_HELP */ + + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */ + + ctnetlink_secctx_size(ct) +#ifdef CONFIG_NF_NAT_NEEDED + + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */ + + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */ +#endif +#ifdef CONFIG_NF_CONNTRACK_MARK + + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ +#endif + + ctnetlink_proto_size(ct) + ; +} + +static int +ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) +{ + struct net *net; + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct nlattr *nest_parms; + struct nf_conn *ct = item->ct; + struct sk_buff *skb; + unsigned int type; + unsigned int flags = 0, group; + int err; + + /* ignore our fake conntrack entry */ + if (nf_ct_is_untracked(ct)) + return 0; + + if (events & (1 << IPCT_DESTROY)) { + type = IPCTNL_MSG_CT_DELETE; + group = NFNLGRP_CONNTRACK_DESTROY; + } else if (events & ((1 << IPCT_NEW) | (1 << IPCT_RELATED))) { + type = IPCTNL_MSG_CT_NEW; + flags = NLM_F_CREATE|NLM_F_EXCL; + group = NFNLGRP_CONNTRACK_NEW; + } else if (events) { + type = IPCTNL_MSG_CT_NEW; + group = NFNLGRP_CONNTRACK_UPDATE; + } else + return 0; + + net = nf_ct_net(ct); + if (!item->report && !nfnetlink_has_listeners(net, group)) + return 0; + + skb = nlmsg_new(ctnetlink_nlmsg_size(ct), GFP_ATOMIC); + if (skb == NULL) + goto errout; + + type |= NFNL_SUBSYS_CTNETLINK << 8; + nlh = nlmsg_put(skb, item->pid, 0, type, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = nf_ct_l3num(ct); + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + rcu_read_lock(); + nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); + + nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); + + if (nf_ct_zone(ct)) + NLA_PUT_BE16(skb, CTA_ZONE, htons(nf_ct_zone(ct))); + + if (ctnetlink_dump_id(skb, ct) < 0) + goto nla_put_failure; + + if (ctnetlink_dump_status(skb, ct) < 0) + goto nla_put_failure; + + if (events & (1 << IPCT_DESTROY)) { + if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 || + ctnetlink_dump_timestamp(skb, ct) < 0) + goto 
nla_put_failure; + } else { + if (ctnetlink_dump_timeout(skb, ct) < 0) + goto nla_put_failure; + + if (events & (1 << IPCT_PROTOINFO) + && ctnetlink_dump_protoinfo(skb, ct) < 0) + goto nla_put_failure; + + if ((events & (1 << IPCT_HELPER) || nfct_help(ct)) + && ctnetlink_dump_helpinfo(skb, ct) < 0) + goto nla_put_failure; + +#ifdef CONFIG_NF_CONNTRACK_SECMARK + if ((events & (1 << IPCT_SECMARK) || ct->secmark) + && ctnetlink_dump_secctx(skb, ct) < 0) + goto nla_put_failure; +#endif + + if (events & (1 << IPCT_RELATED) && + ctnetlink_dump_master(skb, ct) < 0) + goto nla_put_failure; + + if (events & (1 << IPCT_NATSEQADJ) && + ctnetlink_dump_nat_seq_adj(skb, ct) < 0) + goto nla_put_failure; + } + +#ifdef CONFIG_NF_CONNTRACK_MARK + if ((events & (1 << IPCT_MARK) || ct->mark) + && ctnetlink_dump_mark(skb, ct) < 0) + goto nla_put_failure; +#endif + rcu_read_unlock(); + + nlmsg_end(skb, nlh); + err = nfnetlink_send(skb, net, item->pid, group, item->report, + GFP_ATOMIC); + if (err == -ENOBUFS || err == -EAGAIN) + return -ENOBUFS; + + return 0; + +nla_put_failure: + rcu_read_unlock(); + nlmsg_cancel(skb, nlh); +nlmsg_failure: + kfree_skb(skb); +errout: + if (nfnetlink_set_err(net, 0, group, -ENOBUFS) > 0) + return -ENOBUFS; + + return 0; +} +#endif /* CONFIG_NF_CONNTRACK_EVENTS */ + +static int ctnetlink_done(struct netlink_callback *cb) +{ + if (cb->args[1]) + nf_ct_put((struct nf_conn *)cb->args[1]); + return 0; +} + +static int +ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct nf_conn *ct, *last; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + u_int8_t l3proto = nfmsg->nfgen_family; + + spin_lock_bh(&nf_conntrack_lock); + last = (struct nf_conn *)cb->args[1]; + for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) { +restart: + hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]], + hnnode) { + if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; + ct = nf_ct_tuplehash_to_ctrack(h); + /* Dump entries of a given L3 protocol number. + * If it is not specified, ie. l3proto == 0, + * then dump everything. 
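+ * When an entry no longer fits in the skb, it is pinned with
+ * nf_conntrack_get() and stashed in cb->args[1] so that the next
+ * dump call can resume from it (see the restart and out labels).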
*/ + if (l3proto && nf_ct_l3num(ct) != l3proto) + continue; + if (cb->args[1]) { + if (ct != last) + continue; + cb->args[1] = 0; + } + if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_CT_NEW, ct) < 0) { + nf_conntrack_get(&ct->ct_general); + cb->args[1] = (unsigned long)ct; + goto out; + } + + if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == + IPCTNL_MSG_CT_GET_CTRZERO) { + struct nf_conn_counter *acct; + + acct = nf_conn_acct_find(ct); + if (acct) + memset(acct, 0, sizeof(struct nf_conn_counter[IP_CT_DIR_MAX])); + } + } + if (cb->args[1]) { + cb->args[1] = 0; + goto restart; + } + } +out: + spin_unlock_bh(&nf_conntrack_lock); + if (last) + nf_ct_put(last); + + return skb->len; +} + +static inline int +ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nf_conntrack_tuple *tuple) +{ + struct nlattr *tb[CTA_IP_MAX+1]; + struct nf_conntrack_l3proto *l3proto; + int ret = 0; + + nla_parse_nested(tb, CTA_IP_MAX, attr, NULL); + + rcu_read_lock(); + l3proto = __nf_ct_l3proto_find(tuple->src.l3num); + + if (likely(l3proto->nlattr_to_tuple)) { + ret = nla_validate_nested(attr, CTA_IP_MAX, + l3proto->nla_policy); + if (ret == 0) + ret = l3proto->nlattr_to_tuple(tb, tuple); + } + + rcu_read_unlock(); + + return ret; +} + +static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = { + [CTA_PROTO_NUM] = { .type = NLA_U8 }, +}; + +static inline int +ctnetlink_parse_tuple_proto(struct nlattr *attr, + struct nf_conntrack_tuple *tuple) +{ + struct nlattr *tb[CTA_PROTO_MAX+1]; + struct nf_conntrack_l4proto *l4proto; + int ret = 0; + + ret = nla_parse_nested(tb, CTA_PROTO_MAX, attr, proto_nla_policy); + if (ret < 0) + return ret; + + if (!tb[CTA_PROTO_NUM]) + return -EINVAL; + tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]); + + rcu_read_lock(); + l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum); + + if (likely(l4proto->nlattr_to_tuple)) { + ret = nla_validate_nested(attr, CTA_PROTO_MAX, + l4proto->nla_policy); + if (ret == 0) + ret = l4proto->nlattr_to_tuple(tb, tuple); + } + + rcu_read_unlock(); + + return ret; +} + +static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = { + [CTA_TUPLE_IP] = { .type = NLA_NESTED }, + [CTA_TUPLE_PROTO] = { .type = NLA_NESTED }, +}; + +static int +ctnetlink_parse_tuple(const struct nlattr * const cda[], + struct nf_conntrack_tuple *tuple, + enum ctattr_type type, u_int8_t l3num) +{ + struct nlattr *tb[CTA_TUPLE_MAX+1]; + int err; + + memset(tuple, 0, sizeof(*tuple)); + + nla_parse_nested(tb, CTA_TUPLE_MAX, cda[type], tuple_nla_policy); + + if (!tb[CTA_TUPLE_IP]) + return -EINVAL; + + tuple->src.l3num = l3num; + + err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple); + if (err < 0) + return err; + + if (!tb[CTA_TUPLE_PROTO]) + return -EINVAL; + + err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple); + if (err < 0) + return err; + + /* orig and expect tuples get DIR_ORIGINAL */ + if (type == CTA_TUPLE_REPLY) + tuple->dst.dir = IP_CT_DIR_REPLY; + else + tuple->dst.dir = IP_CT_DIR_ORIGINAL; + + return 0; +} + +static int +ctnetlink_parse_zone(const struct nlattr *attr, u16 *zone) +{ + if (attr) +#ifdef CONFIG_NF_CONNTRACK_ZONES + *zone = ntohs(nla_get_be16(attr)); +#else + return -EOPNOTSUPP; +#endif + else + *zone = 0; + + return 0; +} + +static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = { + [CTA_HELP_NAME] = { .type = NLA_NUL_STRING }, +}; + +static inline int +ctnetlink_parse_help(const struct nlattr *attr, char **helper_name) +{ + struct nlattr *tb[CTA_HELP_MAX+1]; + + 
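/* CTA_HELP is a nested attribute: split it into tb[] according to
+ * help_nla_policy; the returned helper name points straight into the
+ * received netlink message. */ +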
nla_parse_nested(tb, CTA_HELP_MAX, attr, help_nla_policy); + + if (!tb[CTA_HELP_NAME]) + return -EINVAL; + + *helper_name = nla_data(tb[CTA_HELP_NAME]); + + return 0; +} + +static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { + [CTA_TUPLE_ORIG] = { .type = NLA_NESTED }, + [CTA_TUPLE_REPLY] = { .type = NLA_NESTED }, + [CTA_STATUS] = { .type = NLA_U32 }, + [CTA_PROTOINFO] = { .type = NLA_NESTED }, + [CTA_HELP] = { .type = NLA_NESTED }, + [CTA_NAT_SRC] = { .type = NLA_NESTED }, + [CTA_TIMEOUT] = { .type = NLA_U32 }, + [CTA_MARK] = { .type = NLA_U32 }, + [CTA_ID] = { .type = NLA_U32 }, + [CTA_NAT_DST] = { .type = NLA_NESTED }, + [CTA_TUPLE_MASTER] = { .type = NLA_NESTED }, + [CTA_ZONE] = { .type = NLA_U16 }, +}; + +static int +ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + struct net *net = sock_net(ctnl); + struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + struct nf_conn *ct; + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + u16 zone; + int err; + + err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); + if (err < 0) + return err; + + if (cda[CTA_TUPLE_ORIG]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3); + else if (cda[CTA_TUPLE_REPLY]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3); + else { + /* Flush the whole table */ + nf_conntrack_flush_report(net, + NETLINK_CB(skb).pid, + nlmsg_report(nlh)); + return 0; + } + + if (err < 0) + return err; + + h = nf_conntrack_find_get(net, zone, &tuple); + if (!h) + return -ENOENT; + + ct = nf_ct_tuplehash_to_ctrack(h); + + if (cda[CTA_ID]) { + u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID])); + if (id != (u32)(unsigned long)ct) { + nf_ct_put(ct); + return -ENOENT; + } + } + + if (nf_conntrack_event_report(IPCT_DESTROY, ct, + NETLINK_CB(skb).pid, + nlmsg_report(nlh)) < 0) { + nf_ct_delete_from_lists(ct); + /* we failed to report the event, try later */ + nf_ct_insert_dying_list(ct); + nf_ct_put(ct); + return 0; + } + + /* death_by_timeout would report the event again */ + set_bit(IPS_DYING_BIT, &ct->status); + + nf_ct_kill(ct); + nf_ct_put(ct); + + return 0; +} + +static int +ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + struct net *net = sock_net(ctnl); + struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + struct nf_conn *ct; + struct sk_buff *skb2 = NULL; + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + u16 zone; + int err; + + if (nlh->nlmsg_flags & NLM_F_DUMP) + return netlink_dump_start(ctnl, skb, nlh, ctnetlink_dump_table, + ctnetlink_done); + + err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); + if (err < 0) + return err; + + if (cda[CTA_TUPLE_ORIG]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3); + else if (cda[CTA_TUPLE_REPLY]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3); + else + return -EINVAL; + + if (err < 0) + return err; + + h = nf_conntrack_find_get(net, zone, &tuple); + if (!h) + return -ENOENT; + + ct = nf_ct_tuplehash_to_ctrack(h); + + err = -ENOMEM; + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) { + nf_ct_put(ct); + return -ENOMEM; + } + + rcu_read_lock(); + err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, + IPCTNL_MSG_CT_NEW, ct); + rcu_read_unlock(); + nf_ct_put(ct); + if (err <= 0) + goto free; + + err = netlink_unicast(ctnl, skb2, 
NETLINK_CB(skb).pid, MSG_DONTWAIT); + if (err < 0) + goto out; + + return 0; + +free: + kfree_skb(skb2); +out: + /* this avoids a loop in nfnetlink. */ + return err == -EAGAIN ? -ENOBUFS : err; +} + +#ifdef CONFIG_NF_NAT_NEEDED +static int +ctnetlink_parse_nat_setup(struct nf_conn *ct, + enum nf_nat_manip_type manip, + const struct nlattr *attr) +{ + typeof(nfnetlink_parse_nat_setup_hook) parse_nat_setup; + + parse_nat_setup = rcu_dereference(nfnetlink_parse_nat_setup_hook); + if (!parse_nat_setup) { +#ifdef CONFIG_MODULES + rcu_read_unlock(); + spin_unlock_bh(&nf_conntrack_lock); + nfnl_unlock(); + if (request_module("nf-nat-ipv4") < 0) { + nfnl_lock(); + spin_lock_bh(&nf_conntrack_lock); + rcu_read_lock(); + return -EOPNOTSUPP; + } + nfnl_lock(); + spin_lock_bh(&nf_conntrack_lock); + rcu_read_lock(); + if (nfnetlink_parse_nat_setup_hook) + return -EAGAIN; +#endif + return -EOPNOTSUPP; + } + + return parse_nat_setup(ct, manip, attr); +} +#endif + +static int +ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[]) +{ + unsigned long d; + unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS])); + d = ct->status ^ status; + + if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING)) + /* unchangeable */ + return -EBUSY; + + if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) + /* SEEN_REPLY bit can only be set */ + return -EBUSY; + + if (d & IPS_ASSURED && !(status & IPS_ASSURED)) + /* ASSURED bit can only be set */ + return -EBUSY; + + /* Be careful here, modifying NAT bits can screw up things, + * so don't let users modify them directly if they don't pass + * nf_nat_range. */ + ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK); + return 0; +} + +static int +ctnetlink_change_nat(struct nf_conn *ct, const struct nlattr * const cda[]) +{ +#ifdef CONFIG_NF_NAT_NEEDED + int ret; + + if (cda[CTA_NAT_DST]) { + ret = ctnetlink_parse_nat_setup(ct, + IP_NAT_MANIP_DST, + cda[CTA_NAT_DST]); + if (ret < 0) + return ret; + } + if (cda[CTA_NAT_SRC]) { + ret = ctnetlink_parse_nat_setup(ct, + IP_NAT_MANIP_SRC, + cda[CTA_NAT_SRC]); + if (ret < 0) + return ret; + } + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static inline int +ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[]) +{ + struct nf_conntrack_helper *helper; + struct nf_conn_help *help = nfct_help(ct); + char *helpname = NULL; + int err; + + /* don't change helper of sibling connections */ + if (ct->master) + return -EBUSY; + + err = ctnetlink_parse_help(cda[CTA_HELP], &helpname); + if (err < 0) + return err; + + if (!strcmp(helpname, "")) { + if (help && help->helper) { + /* we had a helper before ... 
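so unhook it and forget any expectations it registered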
*/ + nf_ct_remove_expectations(ct); + rcu_assign_pointer(help->helper, NULL); + } + + return 0; + } + + helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), + nf_ct_protonum(ct)); + if (helper == NULL) { +#ifdef CONFIG_MODULES + spin_unlock_bh(&nf_conntrack_lock); + + if (request_module("nfct-helper-%s", helpname) < 0) { + spin_lock_bh(&nf_conntrack_lock); + return -EOPNOTSUPP; + } + + spin_lock_bh(&nf_conntrack_lock); + helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), + nf_ct_protonum(ct)); + if (helper) + return -EAGAIN; +#endif + return -EOPNOTSUPP; + } + + if (help) { + if (help->helper == helper) + return 0; + if (help->helper) + return -EBUSY; + /* need to zero data of old helper */ + memset(&help->help, 0, sizeof(help->help)); + } else { + /* we cannot set a helper for an existing conntrack */ + return -EOPNOTSUPP; + } + + rcu_assign_pointer(help->helper, helper); + + return 0; +} + +static inline int +ctnetlink_change_timeout(struct nf_conn *ct, const struct nlattr * const cda[]) +{ + u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT])); + + if (!del_timer(&ct->timeout)) + return -ETIME; + + ct->timeout.expires = jiffies + timeout * HZ; + add_timer(&ct->timeout); + + return 0; +} + +static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = { + [CTA_PROTOINFO_TCP] = { .type = NLA_NESTED }, + [CTA_PROTOINFO_DCCP] = { .type = NLA_NESTED }, + [CTA_PROTOINFO_SCTP] = { .type = NLA_NESTED }, +}; + +static inline int +ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[]) +{ + const struct nlattr *attr = cda[CTA_PROTOINFO]; + struct nlattr *tb[CTA_PROTOINFO_MAX+1]; + struct nf_conntrack_l4proto *l4proto; + int err = 0; + + nla_parse_nested(tb, CTA_PROTOINFO_MAX, attr, protoinfo_policy); + + rcu_read_lock(); + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + if (l4proto->from_nlattr) + err = l4proto->from_nlattr(tb, ct); + rcu_read_unlock(); + + return err; +} + +#ifdef CONFIG_NF_NAT_NEEDED +static const struct nla_policy nat_seq_policy[CTA_NAT_SEQ_MAX+1] = { + [CTA_NAT_SEQ_CORRECTION_POS] = { .type = NLA_U32 }, + [CTA_NAT_SEQ_OFFSET_BEFORE] = { .type = NLA_U32 }, + [CTA_NAT_SEQ_OFFSET_AFTER] = { .type = NLA_U32 }, +}; + +static inline int +change_nat_seq_adj(struct nf_nat_seq *natseq, const struct nlattr * const attr) +{ + struct nlattr *cda[CTA_NAT_SEQ_MAX+1]; + + nla_parse_nested(cda, CTA_NAT_SEQ_MAX, attr, nat_seq_policy); + + if (!cda[CTA_NAT_SEQ_CORRECTION_POS]) + return -EINVAL; + + natseq->correction_pos = + ntohl(nla_get_be32(cda[CTA_NAT_SEQ_CORRECTION_POS])); + + if (!cda[CTA_NAT_SEQ_OFFSET_BEFORE]) + return -EINVAL; + + natseq->offset_before = + ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_BEFORE])); + + if (!cda[CTA_NAT_SEQ_OFFSET_AFTER]) + return -EINVAL; + + natseq->offset_after = + ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_AFTER])); + + return 0; +} + +static int +ctnetlink_change_nat_seq_adj(struct nf_conn *ct, + const struct nlattr * const cda[]) +{ + int ret = 0; + struct nf_conn_nat *nat = nfct_nat(ct); + + if (!nat) + return 0; + + if (cda[CTA_NAT_SEQ_ADJ_ORIG]) { + ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_ORIGINAL], + cda[CTA_NAT_SEQ_ADJ_ORIG]); + if (ret < 0) + return ret; + + ct->status |= IPS_SEQ_ADJUST; + } + + if (cda[CTA_NAT_SEQ_ADJ_REPLY]) { + ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_REPLY], + cda[CTA_NAT_SEQ_ADJ_REPLY]); + if (ret < 0) + return ret; + + ct->status |= IPS_SEQ_ADJUST; + } + + return 0; +} +#endif + +static int +ctnetlink_change_conntrack(struct nf_conn 
*ct, + const struct nlattr * const cda[]) +{ + int err; + + /* only allow NAT changes and master assignation for new conntracks */ + if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST] || cda[CTA_TUPLE_MASTER]) + return -EOPNOTSUPP; + + if (cda[CTA_HELP]) { + err = ctnetlink_change_helper(ct, cda); + if (err < 0) + return err; + } + + if (cda[CTA_TIMEOUT]) { + err = ctnetlink_change_timeout(ct, cda); + if (err < 0) + return err; + } + + if (cda[CTA_STATUS]) { + err = ctnetlink_change_status(ct, cda); + if (err < 0) + return err; + } + + if (cda[CTA_PROTOINFO]) { + err = ctnetlink_change_protoinfo(ct, cda); + if (err < 0) + return err; + } + +#if defined(CONFIG_NF_CONNTRACK_MARK) + if (cda[CTA_MARK]) + ct->mark = ntohl(nla_get_be32(cda[CTA_MARK])); +#endif + +#ifdef CONFIG_NF_NAT_NEEDED + if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) { + err = ctnetlink_change_nat_seq_adj(ct, cda); + if (err < 0) + return err; + } +#endif + + return 0; +} + +static struct nf_conn * +ctnetlink_create_conntrack(struct net *net, u16 zone, + const struct nlattr * const cda[], + struct nf_conntrack_tuple *otuple, + struct nf_conntrack_tuple *rtuple, + u8 u3) +{ + struct nf_conn *ct; + int err = -EINVAL; + struct nf_conntrack_helper *helper; + struct nf_conn_tstamp *tstamp; + + ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC); + if (IS_ERR(ct)) + return ERR_PTR(-ENOMEM); + + if (!cda[CTA_TIMEOUT]) + goto err1; + ct->timeout.expires = ntohl(nla_get_be32(cda[CTA_TIMEOUT])); + + ct->timeout.expires = jiffies + ct->timeout.expires * HZ; + + rcu_read_lock(); + if (cda[CTA_HELP]) { + char *helpname = NULL; + + err = ctnetlink_parse_help(cda[CTA_HELP], &helpname); + if (err < 0) + goto err2; + + helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), + nf_ct_protonum(ct)); + if (helper == NULL) { + rcu_read_unlock(); +#ifdef CONFIG_MODULES + if (request_module("nfct-helper-%s", helpname) < 0) { + err = -EOPNOTSUPP; + goto err1; + } + + rcu_read_lock(); + helper = __nf_conntrack_helper_find(helpname, + nf_ct_l3num(ct), + nf_ct_protonum(ct)); + if (helper) { + err = -EAGAIN; + goto err2; + } + rcu_read_unlock(); +#endif + err = -EOPNOTSUPP; + goto err1; + } else { + struct nf_conn_help *help; + + help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); + if (help == NULL) { + err = -ENOMEM; + goto err2; + } + + /* not in hash table yet so not strictly necessary */ + rcu_assign_pointer(help->helper, helper); + } + } else { + /* try an implicit helper assignation */ + err = __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); + if (err < 0) + goto err2; + } + + if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST]) { + err = ctnetlink_change_nat(ct, cda); + if (err < 0) + goto err2; + } + + nf_ct_acct_ext_add(ct, GFP_ATOMIC); + nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); + nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC); + /* we must add conntrack extensions before confirmation. 
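Extensions cannot be added once the entry is confirmed.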
*/ + ct->status |= IPS_CONFIRMED; + + if (cda[CTA_STATUS]) { + err = ctnetlink_change_status(ct, cda); + if (err < 0) + goto err2; + } + +#ifdef CONFIG_NF_NAT_NEEDED + if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) { + err = ctnetlink_change_nat_seq_adj(ct, cda); + if (err < 0) + goto err2; + } +#endif + + memset(&ct->proto, 0, sizeof(ct->proto)); + if (cda[CTA_PROTOINFO]) { + err = ctnetlink_change_protoinfo(ct, cda); + if (err < 0) + goto err2; + } + +#if defined(CONFIG_NF_CONNTRACK_MARK) + if (cda[CTA_MARK]) + ct->mark = ntohl(nla_get_be32(cda[CTA_MARK])); +#endif + + /* setup master conntrack: this is a confirmed expectation */ + if (cda[CTA_TUPLE_MASTER]) { + struct nf_conntrack_tuple master; + struct nf_conntrack_tuple_hash *master_h; + struct nf_conn *master_ct; + + err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER, u3); + if (err < 0) + goto err2; + + master_h = nf_conntrack_find_get(net, zone, &master); + if (master_h == NULL) { + err = -ENOENT; + goto err2; + } + master_ct = nf_ct_tuplehash_to_ctrack(master_h); + __set_bit(IPS_EXPECTED_BIT, &ct->status); + ct->master = master_ct; + } + tstamp = nf_conn_tstamp_find(ct); + if (tstamp) + tstamp->start = ktime_to_ns(ktime_get_real()); + + add_timer(&ct->timeout); + nf_conntrack_hash_insert(ct); + rcu_read_unlock(); + + return ct; + +err2: + rcu_read_unlock(); +err1: + nf_conntrack_free(ct); + return ERR_PTR(err); +} + +static int +ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + struct net *net = sock_net(ctnl); + struct nf_conntrack_tuple otuple, rtuple; + struct nf_conntrack_tuple_hash *h = NULL; + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + u16 zone; + int err; + + err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); + if (err < 0) + return err; + + if (cda[CTA_TUPLE_ORIG]) { + err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, u3); + if (err < 0) + return err; + } + + if (cda[CTA_TUPLE_REPLY]) { + err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, u3); + if (err < 0) + return err; + } + + spin_lock_bh(&nf_conntrack_lock); + if (cda[CTA_TUPLE_ORIG]) + h = __nf_conntrack_find(net, zone, &otuple); + else if (cda[CTA_TUPLE_REPLY]) + h = __nf_conntrack_find(net, zone, &rtuple); + + if (h == NULL) { + err = -ENOENT; + if (nlh->nlmsg_flags & NLM_F_CREATE) { + struct nf_conn *ct; + enum ip_conntrack_events events; + + ct = ctnetlink_create_conntrack(net, zone, cda, &otuple, + &rtuple, u3); + if (IS_ERR(ct)) { + err = PTR_ERR(ct); + goto out_unlock; + } + err = 0; + nf_conntrack_get(&ct->ct_general); + spin_unlock_bh(&nf_conntrack_lock); + if (test_bit(IPS_EXPECTED_BIT, &ct->status)) + events = IPCT_RELATED; + else + events = IPCT_NEW; + + nf_conntrack_eventmask_report((1 << IPCT_REPLY) | + (1 << IPCT_ASSURED) | + (1 << IPCT_HELPER) | + (1 << IPCT_PROTOINFO) | + (1 << IPCT_NATSEQADJ) | + (1 << IPCT_MARK) | events, + ct, NETLINK_CB(skb).pid, + nlmsg_report(nlh)); + nf_ct_put(ct); + } else + spin_unlock_bh(&nf_conntrack_lock); + + return err; + } + /* implicit 'else' */ + + /* We manipulate the conntrack inside the global conntrack table lock, + * so there's no need to increase the refcount */ + err = -EEXIST; + if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + err = ctnetlink_change_conntrack(ct, cda); + if (err == 0) { + nf_conntrack_get(&ct->ct_general); + spin_unlock_bh(&nf_conntrack_lock); + nf_conntrack_eventmask_report((1 << 
IPCT_REPLY) | + (1 << IPCT_ASSURED) | + (1 << IPCT_HELPER) | + (1 << IPCT_PROTOINFO) | + (1 << IPCT_NATSEQADJ) | + (1 << IPCT_MARK), + ct, NETLINK_CB(skb).pid, + nlmsg_report(nlh)); + nf_ct_put(ct); + } else + spin_unlock_bh(&nf_conntrack_lock); + + return err; + } + +out_unlock: + spin_unlock_bh(&nf_conntrack_lock); + return err; +} + +/*********************************************************************** + * EXPECT + ***********************************************************************/ + +static inline int +ctnetlink_exp_dump_tuple(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple, + enum ctattr_expect type) +{ + struct nlattr *nest_parms; + + nest_parms = nla_nest_start(skb, type | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (ctnetlink_dump_tuples(skb, tuple) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); + + return 0; + +nla_put_failure: + return -1; +} + +static inline int +ctnetlink_exp_dump_mask(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple_mask *mask) +{ + int ret; + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_l4proto *l4proto; + struct nf_conntrack_tuple m; + struct nlattr *nest_parms; + + memset(&m, 0xFF, sizeof(m)); + memcpy(&m.src.u3, &mask->src.u3, sizeof(m.src.u3)); + m.src.u.all = mask->src.u.all; + m.dst.protonum = tuple->dst.protonum; + + nest_parms = nla_nest_start(skb, CTA_EXPECT_MASK | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + l3proto = __nf_ct_l3proto_find(tuple->src.l3num); + ret = ctnetlink_dump_tuples_ip(skb, &m, l3proto); + + if (unlikely(ret < 0)) + goto nla_put_failure; + + l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum); + ret = ctnetlink_dump_tuples_proto(skb, &m, l4proto); + if (unlikely(ret < 0)) + goto nla_put_failure; + + nla_nest_end(skb, nest_parms); + + return 0; + +nla_put_failure: + return -1; +} + +static int +ctnetlink_exp_dump_expect(struct sk_buff *skb, + const struct nf_conntrack_expect *exp) +{ + struct nf_conn *master = exp->master; + long timeout = (exp->timeout.expires - jiffies) / HZ; + struct nf_conn_help *help; + + if (timeout < 0) + timeout = 0; + + if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0) + goto nla_put_failure; + if (ctnetlink_exp_dump_mask(skb, &exp->tuple, &exp->mask) < 0) + goto nla_put_failure; + if (ctnetlink_exp_dump_tuple(skb, + &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + CTA_EXPECT_MASTER) < 0) + goto nla_put_failure; + + NLA_PUT_BE32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)); + NLA_PUT_BE32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)); + NLA_PUT_BE32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)); + help = nfct_help(master); + if (help) { + struct nf_conntrack_helper *helper; + + helper = rcu_dereference(help->helper); + if (helper) + NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name); + } + + return 0; + +nla_put_failure: + return -1; +} + +static int +ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq, + int event, const struct nf_conntrack_expect *exp) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned int flags = pid ? 
NLM_F_MULTI : 0; + + event |= NFNL_SUBSYS_CTNETLINK_EXP << 8; + nlh = nlmsg_put(skb, pid, seq, event, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = exp->tuple.src.l3num; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (ctnetlink_exp_dump_expect(skb, exp) < 0) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return skb->len; + +nlmsg_failure: +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -1; +} + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +static int +ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) +{ + struct nf_conntrack_expect *exp = item->exp; + struct net *net = nf_ct_exp_net(exp); + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct sk_buff *skb; + unsigned int type, group; + int flags = 0; + + if (events & (1 << IPEXP_DESTROY)) { + type = IPCTNL_MSG_EXP_DELETE; + group = NFNLGRP_CONNTRACK_EXP_DESTROY; + } else if (events & (1 << IPEXP_NEW)) { + type = IPCTNL_MSG_EXP_NEW; + flags = NLM_F_CREATE|NLM_F_EXCL; + group = NFNLGRP_CONNTRACK_EXP_NEW; + } else + return 0; + + if (!item->report && !nfnetlink_has_listeners(net, group)) + return 0; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (skb == NULL) + goto errout; + + type |= NFNL_SUBSYS_CTNETLINK_EXP << 8; + nlh = nlmsg_put(skb, item->pid, 0, type, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = exp->tuple.src.l3num; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + rcu_read_lock(); + if (ctnetlink_exp_dump_expect(skb, exp) < 0) + goto nla_put_failure; + rcu_read_unlock(); + + nlmsg_end(skb, nlh); + nfnetlink_send(skb, net, item->pid, group, item->report, GFP_ATOMIC); + return 0; + +nla_put_failure: + rcu_read_unlock(); + nlmsg_cancel(skb, nlh); +nlmsg_failure: + kfree_skb(skb); +errout: + nfnetlink_set_err(net, 0, 0, -ENOBUFS); + return 0; +} +#endif +static int ctnetlink_exp_done(struct netlink_callback *cb) +{ + if (cb->args[1]) + nf_ct_expect_put((struct nf_conntrack_expect *)cb->args[1]); + return 0; +} + +static int +ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct nf_conntrack_expect *exp, *last; + struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + struct hlist_node *n; + u_int8_t l3proto = nfmsg->nfgen_family; + + rcu_read_lock(); + last = (struct nf_conntrack_expect *)cb->args[1]; + for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { +restart: + hlist_for_each_entry(exp, n, &net->ct.expect_hash[cb->args[0]], + hnode) { + if (l3proto && exp->tuple.src.l3num != l3proto) + continue; + if (cb->args[1]) { + if (exp != last) + continue; + cb->args[1] = 0; + } + if (ctnetlink_exp_fill_info(skb, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_EXP_NEW, + exp) < 0) { + if (!atomic_inc_not_zero(&exp->use)) + continue; + cb->args[1] = (unsigned long)exp; + goto out; + } + } + if (cb->args[1]) { + cb->args[1] = 0; + goto restart; + } + } +out: + rcu_read_unlock(); + if (last) + nf_ct_expect_put(last); + + return skb->len; +} + +static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { + [CTA_EXPECT_MASTER] = { .type = NLA_NESTED }, + [CTA_EXPECT_TUPLE] = { .type = NLA_NESTED }, + [CTA_EXPECT_MASK] = { .type = NLA_NESTED }, + [CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 }, + [CTA_EXPECT_ID] = { .type = NLA_U32 }, + [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING }, + [CTA_EXPECT_ZONE] = { .type = NLA_U16 }, + [CTA_EXPECT_FLAGS] = { .type = 
NLA_U32 }, +}; + +static int +ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + struct net *net = sock_net(ctnl); + struct nf_conntrack_tuple tuple; + struct nf_conntrack_expect *exp; + struct sk_buff *skb2; + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + u16 zone; + int err; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + return netlink_dump_start(ctnl, skb, nlh, + ctnetlink_exp_dump_table, + ctnetlink_exp_done); + } + + err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); + if (err < 0) + return err; + + if (cda[CTA_EXPECT_MASTER]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3); + else + return -EINVAL; + + if (err < 0) + return err; + + exp = nf_ct_expect_find_get(net, zone, &tuple); + if (!exp) + return -ENOENT; + + if (cda[CTA_EXPECT_ID]) { + __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); + if (ntohl(id) != (u32)(unsigned long)exp) { + nf_ct_expect_put(exp); + return -ENOENT; + } + } + + err = -ENOMEM; + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) + goto out; + + rcu_read_lock(); + err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid, + nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp); + rcu_read_unlock(); + if (err <= 0) + goto free; + + nf_ct_expect_put(exp); + + return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + +free: + kfree_skb(skb2); +out: + nf_ct_expect_put(exp); + return err; +} + +static int +ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + struct net *net = sock_net(ctnl); + struct nf_conntrack_expect *exp; + struct nf_conntrack_tuple tuple; + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct hlist_node *n, *next; + u_int8_t u3 = nfmsg->nfgen_family; + unsigned int i; + u16 zone; + int err; + + if (cda[CTA_EXPECT_TUPLE]) { + /* delete a single expect by tuple */ + err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); + if (err < 0) + return err; + + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + if (err < 0) + return err; + + /* bump usage count to 2 */ + exp = nf_ct_expect_find_get(net, zone, &tuple); + if (!exp) + return -ENOENT; + + if (cda[CTA_EXPECT_ID]) { + __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); + if (ntohl(id) != (u32)(unsigned long)exp) { + nf_ct_expect_put(exp); + return -ENOENT; + } + } + + /* after list removal, usage count == 1 */ + spin_lock_bh(&nf_conntrack_lock); + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).pid, + nlmsg_report(nlh)); + nf_ct_expect_put(exp); + } + spin_unlock_bh(&nf_conntrack_lock); + /* have to put what we 'get' above. 
+ * after this line usage count == 0 */ + nf_ct_expect_put(exp); + } else if (cda[CTA_EXPECT_HELP_NAME]) { + char *name = nla_data(cda[CTA_EXPECT_HELP_NAME]); + struct nf_conn_help *m_help; + + /* delete all expectations for this helper */ + spin_lock_bh(&nf_conntrack_lock); + for (i = 0; i < nf_ct_expect_hsize; i++) { + hlist_for_each_entry_safe(exp, n, next, + &net->ct.expect_hash[i], + hnode) { + m_help = nfct_help(exp->master); + if (!strcmp(m_help->helper->name, name) && + del_timer(&exp->timeout)) { + nf_ct_unlink_expect_report(exp, + NETLINK_CB(skb).pid, + nlmsg_report(nlh)); + nf_ct_expect_put(exp); + } + } + } + spin_unlock_bh(&nf_conntrack_lock); + } else { + /* This basically means we have to flush everything*/ + spin_lock_bh(&nf_conntrack_lock); + for (i = 0; i < nf_ct_expect_hsize; i++) { + hlist_for_each_entry_safe(exp, n, next, + &net->ct.expect_hash[i], + hnode) { + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect_report(exp, + NETLINK_CB(skb).pid, + nlmsg_report(nlh)); + nf_ct_expect_put(exp); + } + } + } + spin_unlock_bh(&nf_conntrack_lock); + } + + return 0; +} +static int +ctnetlink_change_expect(struct nf_conntrack_expect *x, + const struct nlattr * const cda[]) +{ + return -EOPNOTSUPP; +} + +static int +ctnetlink_create_expect(struct net *net, u16 zone, + const struct nlattr * const cda[], + u_int8_t u3, + u32 pid, int report) +{ + struct nf_conntrack_tuple tuple, mask, master_tuple; + struct nf_conntrack_tuple_hash *h = NULL; + struct nf_conntrack_expect *exp; + struct nf_conn *ct; + struct nf_conn_help *help; + int err = 0; + + /* caller guarantees that those three CTA_EXPECT_* exist */ + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + if (err < 0) + return err; + err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3); + if (err < 0) + return err; + err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3); + if (err < 0) + return err; + + /* Look for master conntrack of this expectation */ + h = nf_conntrack_find_get(net, zone, &master_tuple); + if (!h) + return -ENOENT; + ct = nf_ct_tuplehash_to_ctrack(h); + exp = nf_ct_expect_alloc(ct); + if (!exp) { + err = -ENOMEM; + goto out; + } + help = nfct_help(ct); + if (!help) { + if (!cda[CTA_EXPECT_TIMEOUT]) { + err = -EINVAL; + goto out; + } + exp->timeout.expires = + jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ; + + exp->flags = NF_CT_EXPECT_USERSPACE; + if (cda[CTA_EXPECT_FLAGS]) { + exp->flags |= + ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS])); + } + } else { + if (cda[CTA_EXPECT_FLAGS]) { + exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS])); + exp->flags &= ~NF_CT_EXPECT_USERSPACE; + } else + exp->flags = 0; + } + + exp->class = 0; + exp->expectfn = NULL; + exp->master = ct; + exp->helper = NULL; + memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple)); + memcpy(&exp->mask.src.u3, &mask.src.u3, sizeof(exp->mask.src.u3)); + exp->mask.src.u.all = mask.src.u.all; + + err = nf_ct_expect_related_report(exp, pid, report); + nf_ct_expect_put(exp); + +out: + nf_ct_put(nf_ct_tuplehash_to_ctrack(h)); + return err; +} + +static int +ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + struct net *net = sock_net(ctnl); + struct nf_conntrack_tuple tuple; + struct nf_conntrack_expect *exp; + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + u16 zone; + int err; + + if (!cda[CTA_EXPECT_TUPLE] + || !cda[CTA_EXPECT_MASK] + || !cda[CTA_EXPECT_MASTER]) + 
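/* tuple, mask and master are all mandatory for a new expectation */ +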
return -EINVAL; + + err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); + if (err < 0) + return err; + + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + if (err < 0) + return err; + + spin_lock_bh(&nf_conntrack_lock); + exp = __nf_ct_expect_find(net, zone, &tuple); + + if (!exp) { + spin_unlock_bh(&nf_conntrack_lock); + err = -ENOENT; + if (nlh->nlmsg_flags & NLM_F_CREATE) { + err = ctnetlink_create_expect(net, zone, cda, + u3, + NETLINK_CB(skb).pid, + nlmsg_report(nlh)); + } + return err; + } + + err = -EEXIST; + if (!(nlh->nlmsg_flags & NLM_F_EXCL)) + err = ctnetlink_change_expect(exp, cda); + spin_unlock_bh(&nf_conntrack_lock); + + return err; +} + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +static struct nf_ct_event_notifier ctnl_notifier = { + .fcn = ctnetlink_conntrack_event, +}; + +static struct nf_exp_event_notifier ctnl_notifier_exp = { + .fcn = ctnetlink_expect_event, +}; +#endif + +static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { + [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, + .attr_count = CTA_MAX, + .policy = ct_nla_policy }, + [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, + .attr_count = CTA_MAX, + .policy = ct_nla_policy }, + [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, + .attr_count = CTA_MAX, + .policy = ct_nla_policy }, + [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, + .attr_count = CTA_MAX, + .policy = ct_nla_policy }, +}; + +static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { + [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, + .attr_count = CTA_EXPECT_MAX, + .policy = exp_nla_policy }, + [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, + .attr_count = CTA_EXPECT_MAX, + .policy = exp_nla_policy }, + [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, + .attr_count = CTA_EXPECT_MAX, + .policy = exp_nla_policy }, +}; + +static const struct nfnetlink_subsystem ctnl_subsys = { + .name = "conntrack", + .subsys_id = NFNL_SUBSYS_CTNETLINK, + .cb_count = IPCTNL_MSG_MAX, + .cb = ctnl_cb, +}; + +static const struct nfnetlink_subsystem ctnl_exp_subsys = { + .name = "conntrack_expect", + .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP, + .cb_count = IPCTNL_MSG_EXP_MAX, + .cb = ctnl_exp_cb, +}; + +MODULE_ALIAS("ip_conntrack_netlink"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP); + +static int __init ctnetlink_init(void) +{ + int ret; + + pr_info("ctnetlink v%s: registering with nfnetlink.\n", version); + ret = nfnetlink_subsys_register(&ctnl_subsys); + if (ret < 0) { + pr_err("ctnetlink_init: cannot register with nfnetlink.\n"); + goto err_out; + } + + ret = nfnetlink_subsys_register(&ctnl_exp_subsys); + if (ret < 0) { + pr_err("ctnetlink_init: cannot register exp with nfnetlink.\n"); + goto err_unreg_subsys; + } + +#ifdef CONFIG_NF_CONNTRACK_EVENTS + ret = nf_conntrack_register_notifier(&ctnl_notifier); + if (ret < 0) { + pr_err("ctnetlink_init: cannot register notifier.\n"); + goto err_unreg_exp_subsys; + } + + ret = nf_ct_expect_register_notifier(&ctnl_notifier_exp); + if (ret < 0) { + pr_err("ctnetlink_init: cannot expect register notifier.\n"); + goto err_unreg_notifier; + } +#endif + + return 0; + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +err_unreg_notifier: + nf_conntrack_unregister_notifier(&ctnl_notifier); +err_unreg_exp_subsys: + nfnetlink_subsys_unregister(&ctnl_exp_subsys); +#endif +err_unreg_subsys: + nfnetlink_subsys_unregister(&ctnl_subsys); +err_out: + return ret; +} + +static void __exit 
ctnetlink_exit(void) +{ + pr_info("ctnetlink: unregistering from nfnetlink.\n"); + + nf_ct_remove_userspace_expectations(); +#ifdef CONFIG_NF_CONNTRACK_EVENTS + nf_ct_expect_unregister_notifier(&ctnl_notifier_exp); + nf_conntrack_unregister_notifier(&ctnl_notifier); +#endif + + nfnetlink_subsys_unregister(&ctnl_exp_subsys); + nfnetlink_subsys_unregister(&ctnl_subsys); +} + +module_init(ctnetlink_init); +module_exit(ctnetlink_exit); diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c new file mode 100644 index 00000000..2fd45651 --- /dev/null +++ b/net/netfilter/nf_conntrack_pptp.c @@ -0,0 +1,631 @@ +/* + * Connection tracking support for PPTP (Point to Point Tunneling Protocol). + * PPTP is a a protocol for creating virtual private networks. + * It is a specification defined by Microsoft and some vendors + * working with Microsoft. PPTP is built on top of a modified + * version of the Internet Generic Routing Encapsulation Protocol. + * GRE is defined in RFC 1701 and RFC 1702. Documentation of + * PPTP can be found in RFC 2637 + * + * (C) 2000-2005 by Harald Welte + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + * + * Limitations: + * - We blindly assume that control connections are always + * established in PNS->PAC direction. This is a violation + * of RFFC2673 + * - We can only support one single call within each session + * TODO: + * - testing of incoming PPTP calls + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define NF_CT_PPTP_VERSION "3.1" + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("Netfilter connection tracking helper module for PPTP"); +MODULE_ALIAS("ip_conntrack_pptp"); +MODULE_ALIAS_NFCT_HELPER("pptp"); + +static DEFINE_SPINLOCK(nf_pptp_lock); + +int +(*nf_nat_pptp_hook_outbound)(struct sk_buff *skb, + struct nf_conn *ct, enum ip_conntrack_info ctinfo, + struct PptpControlHeader *ctlh, + union pptp_ctrl_union *pptpReq) __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_outbound); + +int +(*nf_nat_pptp_hook_inbound)(struct sk_buff *skb, + struct nf_conn *ct, enum ip_conntrack_info ctinfo, + struct PptpControlHeader *ctlh, + union pptp_ctrl_union *pptpReq) __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_inbound); + +void +(*nf_nat_pptp_hook_exp_gre)(struct nf_conntrack_expect *expect_orig, + struct nf_conntrack_expect *expect_reply) + __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_exp_gre); + +void +(*nf_nat_pptp_hook_expectfn)(struct nf_conn *ct, + struct nf_conntrack_expect *exp) __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn); + +#if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) +/* PptpControlMessageType names */ +const char *const pptp_msg_name[] = { + "UNKNOWN_MESSAGE", + "START_SESSION_REQUEST", + "START_SESSION_REPLY", + "STOP_SESSION_REQUEST", + "STOP_SESSION_REPLY", + "ECHO_REQUEST", + "ECHO_REPLY", + "OUT_CALL_REQUEST", + "OUT_CALL_REPLY", + "IN_CALL_REQUEST", + "IN_CALL_REPLY", + "IN_CALL_CONNECT", + "CALL_CLEAR_REQUEST", + "CALL_DISCONNECT_NOTIFY", + "WAN_ERROR_NOTIFY", + "SET_LINK_INFO" +}; +EXPORT_SYMBOL(pptp_msg_name); +#endif + +#define SECS *HZ +#define MINS * 60 SECS +#define HOURS * 60 MINS + +#define PPTP_GRE_TIMEOUT (10 MINS) +#define PPTP_GRE_STREAM_TIMEOUT (5 HOURS) + +static void pptp_expectfn(struct nf_conn *ct, + struct nf_conntrack_expect *exp) +{ + struct net *net = nf_ct_net(ct); + typeof(nf_nat_pptp_hook_expectfn) nf_nat_pptp_expectfn; + pr_debug("increasing 
timeouts\n"); + + /* increase timeout of GRE data channel conntrack entry */ + ct->proto.gre.timeout = PPTP_GRE_TIMEOUT; + ct->proto.gre.stream_timeout = PPTP_GRE_STREAM_TIMEOUT; + + /* Can you see how rusty this code is, compared with the pre-2.6.11 + * one? That's what happened to my shiny newnat of 2002 ;( -HW */ + + rcu_read_lock(); + nf_nat_pptp_expectfn = rcu_dereference(nf_nat_pptp_hook_expectfn); + if (nf_nat_pptp_expectfn && ct->master->status & IPS_NAT_MASK) + nf_nat_pptp_expectfn(ct, exp); + else { + struct nf_conntrack_tuple inv_t; + struct nf_conntrack_expect *exp_other; + + /* obviously this tuple inversion only works until you do NAT */ + nf_ct_invert_tuplepr(&inv_t, &exp->tuple); + pr_debug("trying to unexpect other dir: "); + nf_ct_dump_tuple(&inv_t); + + exp_other = nf_ct_expect_find_get(net, nf_ct_zone(ct), &inv_t); + if (exp_other) { + /* delete other expectation. */ + pr_debug("found\n"); + nf_ct_unexpect_related(exp_other); + nf_ct_expect_put(exp_other); + } else { + pr_debug("not found\n"); + } + } + rcu_read_unlock(); +} + +static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct, + const struct nf_conntrack_tuple *t) +{ + const struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_expect *exp; + struct nf_conn *sibling; + u16 zone = nf_ct_zone(ct); + + pr_debug("trying to timeout ct or exp for tuple "); + nf_ct_dump_tuple(t); + + h = nf_conntrack_find_get(net, zone, t); + if (h) { + sibling = nf_ct_tuplehash_to_ctrack(h); + pr_debug("setting timeout of conntrack %p to 0\n", sibling); + sibling->proto.gre.timeout = 0; + sibling->proto.gre.stream_timeout = 0; + if (del_timer(&sibling->timeout)) + sibling->timeout.function((unsigned long)sibling); + nf_ct_put(sibling); + return 1; + } else { + exp = nf_ct_expect_find_get(net, zone, t); + if (exp) { + pr_debug("unexpect_related of expect %p\n", exp); + nf_ct_unexpect_related(exp); + nf_ct_expect_put(exp); + return 1; + } + } + return 0; +} + +/* timeout GRE data connections */ +static void pptp_destroy_siblings(struct nf_conn *ct) +{ + struct net *net = nf_ct_net(ct); + const struct nf_conn_help *help = nfct_help(ct); + struct nf_conntrack_tuple t; + + nf_ct_gre_keymap_destroy(ct); + + /* try original (pns->pac) tuple */ + memcpy(&t, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, sizeof(t)); + t.dst.protonum = IPPROTO_GRE; + t.src.u.gre.key = help->help.ct_pptp_info.pns_call_id; + t.dst.u.gre.key = help->help.ct_pptp_info.pac_call_id; + if (!destroy_sibling_or_exp(net, ct, &t)) + pr_debug("failed to timeout original pns->pac ct/exp\n"); + + /* try reply (pac->pns) tuple */ + memcpy(&t, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, sizeof(t)); + t.dst.protonum = IPPROTO_GRE; + t.src.u.gre.key = help->help.ct_pptp_info.pac_call_id; + t.dst.u.gre.key = help->help.ct_pptp_info.pns_call_id; + if (!destroy_sibling_or_exp(net, ct, &t)) + pr_debug("failed to timeout reply pac->pns ct/exp\n"); +} + +/* expect GRE connections (PNS->PAC and PAC->PNS direction) */ +static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid) +{ + struct nf_conntrack_expect *exp_orig, *exp_reply; + enum ip_conntrack_dir dir; + int ret = 1; + typeof(nf_nat_pptp_hook_exp_gre) nf_nat_pptp_exp_gre; + + exp_orig = nf_ct_expect_alloc(ct); + if (exp_orig == NULL) + goto out; + + exp_reply = nf_ct_expect_alloc(ct); + if (exp_reply == NULL) + goto out_put_orig; + + /* original direction, PNS->PAC */ + dir = IP_CT_DIR_ORIGINAL; + nf_ct_expect_init(exp_orig, NF_CT_EXPECT_CLASS_DEFAULT, + nf_ct_l3num(ct), + &ct->tuplehash[dir].tuple.src.u3, + 
&ct->tuplehash[dir].tuple.dst.u3, + IPPROTO_GRE, &peer_callid, &callid); + exp_orig->expectfn = pptp_expectfn; + + /* reply direction, PAC->PNS */ + dir = IP_CT_DIR_REPLY; + nf_ct_expect_init(exp_reply, NF_CT_EXPECT_CLASS_DEFAULT, + nf_ct_l3num(ct), + &ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[dir].tuple.dst.u3, + IPPROTO_GRE, &callid, &peer_callid); + exp_reply->expectfn = pptp_expectfn; + + nf_nat_pptp_exp_gre = rcu_dereference(nf_nat_pptp_hook_exp_gre); + if (nf_nat_pptp_exp_gre && ct->status & IPS_NAT_MASK) + nf_nat_pptp_exp_gre(exp_orig, exp_reply); + if (nf_ct_expect_related(exp_orig) != 0) + goto out_put_both; + if (nf_ct_expect_related(exp_reply) != 0) + goto out_unexpect_orig; + + /* Add GRE keymap entries */ + if (nf_ct_gre_keymap_add(ct, IP_CT_DIR_ORIGINAL, &exp_orig->tuple) != 0) + goto out_unexpect_both; + if (nf_ct_gre_keymap_add(ct, IP_CT_DIR_REPLY, &exp_reply->tuple) != 0) { + nf_ct_gre_keymap_destroy(ct); + goto out_unexpect_both; + } + ret = 0; + +out_put_both: + nf_ct_expect_put(exp_reply); +out_put_orig: + nf_ct_expect_put(exp_orig); +out: + return ret; + +out_unexpect_both: + nf_ct_unexpect_related(exp_reply); +out_unexpect_orig: + nf_ct_unexpect_related(exp_orig); + goto out_put_both; +} + +static inline int +pptp_inbound_pkt(struct sk_buff *skb, + struct PptpControlHeader *ctlh, + union pptp_ctrl_union *pptpReq, + unsigned int reqlen, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info; + u_int16_t msg; + __be16 cid = 0, pcid = 0; + typeof(nf_nat_pptp_hook_inbound) nf_nat_pptp_inbound; + + msg = ntohs(ctlh->messageType); + pr_debug("inbound control message %s\n", pptp_msg_name[msg]); + + switch (msg) { + case PPTP_START_SESSION_REPLY: + /* server confirms new control session */ + if (info->sstate < PPTP_SESSION_REQUESTED) + goto invalid; + if (pptpReq->srep.resultCode == PPTP_START_OK) + info->sstate = PPTP_SESSION_CONFIRMED; + else + info->sstate = PPTP_SESSION_ERROR; + break; + + case PPTP_STOP_SESSION_REPLY: + /* server confirms end of control session */ + if (info->sstate > PPTP_SESSION_STOPREQ) + goto invalid; + if (pptpReq->strep.resultCode == PPTP_STOP_OK) + info->sstate = PPTP_SESSION_NONE; + else + info->sstate = PPTP_SESSION_ERROR; + break; + + case PPTP_OUT_CALL_REPLY: + /* server accepted call, we now expect GRE frames */ + if (info->sstate != PPTP_SESSION_CONFIRMED) + goto invalid; + if (info->cstate != PPTP_CALL_OUT_REQ && + info->cstate != PPTP_CALL_OUT_CONF) + goto invalid; + + cid = pptpReq->ocack.callID; + pcid = pptpReq->ocack.peersCallID; + if (info->pns_call_id != pcid) + goto invalid; + pr_debug("%s, CID=%X, PCID=%X\n", pptp_msg_name[msg], + ntohs(cid), ntohs(pcid)); + + if (pptpReq->ocack.resultCode == PPTP_OUTCALL_CONNECT) { + info->cstate = PPTP_CALL_OUT_CONF; + info->pac_call_id = cid; + exp_gre(ct, cid, pcid); + } else + info->cstate = PPTP_CALL_NONE; + break; + + case PPTP_IN_CALL_REQUEST: + /* server tells us about incoming call request */ + if (info->sstate != PPTP_SESSION_CONFIRMED) + goto invalid; + + cid = pptpReq->icreq.callID; + pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); + info->cstate = PPTP_CALL_IN_REQ; + info->pac_call_id = cid; + break; + + case PPTP_IN_CALL_CONNECT: + /* server tells us about incoming call established */ + if (info->sstate != PPTP_SESSION_CONFIRMED) + goto invalid; + if (info->cstate != PPTP_CALL_IN_REP && + info->cstate != PPTP_CALL_IN_CONF) + goto invalid; + + pcid = pptpReq->iccon.peersCallID; + cid = 
info->pac_call_id; + + if (info->pns_call_id != pcid) + goto invalid; + + pr_debug("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(pcid)); + info->cstate = PPTP_CALL_IN_CONF; + + /* we expect a GRE connection from PAC to PNS */ + exp_gre(ct, cid, pcid); + break; + + case PPTP_CALL_DISCONNECT_NOTIFY: + /* server confirms disconnect */ + cid = pptpReq->disc.callID; + pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); + info->cstate = PPTP_CALL_NONE; + + /* untrack this call id, unexpect GRE packets */ + pptp_destroy_siblings(ct); + break; + + case PPTP_WAN_ERROR_NOTIFY: + case PPTP_ECHO_REQUEST: + case PPTP_ECHO_REPLY: + /* I don't have to explain these ;) */ + break; + + default: + goto invalid; + } + + nf_nat_pptp_inbound = rcu_dereference(nf_nat_pptp_hook_inbound); + if (nf_nat_pptp_inbound && ct->status & IPS_NAT_MASK) + return nf_nat_pptp_inbound(skb, ct, ctinfo, ctlh, pptpReq); + return NF_ACCEPT; + +invalid: + pr_debug("invalid %s: type=%d cid=%u pcid=%u " + "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n", + msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0], + msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate, + ntohs(info->pns_call_id), ntohs(info->pac_call_id)); + return NF_ACCEPT; +} + +static inline int +pptp_outbound_pkt(struct sk_buff *skb, + struct PptpControlHeader *ctlh, + union pptp_ctrl_union *pptpReq, + unsigned int reqlen, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info; + u_int16_t msg; + __be16 cid = 0, pcid = 0; + typeof(nf_nat_pptp_hook_outbound) nf_nat_pptp_outbound; + + msg = ntohs(ctlh->messageType); + pr_debug("outbound control message %s\n", pptp_msg_name[msg]); + + switch (msg) { + case PPTP_START_SESSION_REQUEST: + /* client requests for new control session */ + if (info->sstate != PPTP_SESSION_NONE) + goto invalid; + info->sstate = PPTP_SESSION_REQUESTED; + break; + + case PPTP_STOP_SESSION_REQUEST: + /* client requests end of control session */ + info->sstate = PPTP_SESSION_STOPREQ; + break; + + case PPTP_OUT_CALL_REQUEST: + /* client initiating connection to server */ + if (info->sstate != PPTP_SESSION_CONFIRMED) + goto invalid; + info->cstate = PPTP_CALL_OUT_REQ; + /* track PNS call id */ + cid = pptpReq->ocreq.callID; + pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); + info->pns_call_id = cid; + break; + + case PPTP_IN_CALL_REPLY: + /* client answers incoming call */ + if (info->cstate != PPTP_CALL_IN_REQ && + info->cstate != PPTP_CALL_IN_REP) + goto invalid; + + cid = pptpReq->icack.callID; + pcid = pptpReq->icack.peersCallID; + if (info->pac_call_id != pcid) + goto invalid; + pr_debug("%s, CID=%X PCID=%X\n", pptp_msg_name[msg], + ntohs(cid), ntohs(pcid)); + + if (pptpReq->icack.resultCode == PPTP_INCALL_ACCEPT) { + /* part two of the three-way handshake */ + info->cstate = PPTP_CALL_IN_REP; + info->pns_call_id = cid; + } else + info->cstate = PPTP_CALL_NONE; + break; + + case PPTP_CALL_CLEAR_REQUEST: + /* client requests hangup of call */ + if (info->sstate != PPTP_SESSION_CONFIRMED) + goto invalid; + /* FUTURE: iterate over all calls and check if + * call ID is valid. 
We don't do this without newnat, + * because we only know about last call */ + info->cstate = PPTP_CALL_CLEAR_REQ; + break; + + case PPTP_SET_LINK_INFO: + case PPTP_ECHO_REQUEST: + case PPTP_ECHO_REPLY: + /* I don't have to explain these ;) */ + break; + + default: + goto invalid; + } + + nf_nat_pptp_outbound = rcu_dereference(nf_nat_pptp_hook_outbound); + if (nf_nat_pptp_outbound && ct->status & IPS_NAT_MASK) + return nf_nat_pptp_outbound(skb, ct, ctinfo, ctlh, pptpReq); + return NF_ACCEPT; + +invalid: + pr_debug("invalid %s: type=%d cid=%u pcid=%u " + "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n", + msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0], + msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate, + ntohs(info->pns_call_id), ntohs(info->pac_call_id)); + return NF_ACCEPT; +} + +static const unsigned int pptp_msg_size[] = { + [PPTP_START_SESSION_REQUEST] = sizeof(struct PptpStartSessionRequest), + [PPTP_START_SESSION_REPLY] = sizeof(struct PptpStartSessionReply), + [PPTP_STOP_SESSION_REQUEST] = sizeof(struct PptpStopSessionRequest), + [PPTP_STOP_SESSION_REPLY] = sizeof(struct PptpStopSessionReply), + [PPTP_OUT_CALL_REQUEST] = sizeof(struct PptpOutCallRequest), + [PPTP_OUT_CALL_REPLY] = sizeof(struct PptpOutCallReply), + [PPTP_IN_CALL_REQUEST] = sizeof(struct PptpInCallRequest), + [PPTP_IN_CALL_REPLY] = sizeof(struct PptpInCallReply), + [PPTP_IN_CALL_CONNECT] = sizeof(struct PptpInCallConnected), + [PPTP_CALL_CLEAR_REQUEST] = sizeof(struct PptpClearCallRequest), + [PPTP_CALL_DISCONNECT_NOTIFY] = sizeof(struct PptpCallDisconnectNotify), + [PPTP_WAN_ERROR_NOTIFY] = sizeof(struct PptpWanErrorNotify), + [PPTP_SET_LINK_INFO] = sizeof(struct PptpSetLinkInfo), +}; + +/* track caller id inside control connection, call expect_related */ +static int +conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) + +{ + int dir = CTINFO2DIR(ctinfo); + const struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info; + const struct tcphdr *tcph; + struct tcphdr _tcph; + const struct pptp_pkt_hdr *pptph; + struct pptp_pkt_hdr _pptph; + struct PptpControlHeader _ctlh, *ctlh; + union pptp_ctrl_union _pptpReq, *pptpReq; + unsigned int tcplen = skb->len - protoff; + unsigned int datalen, reqlen, nexthdr_off; + int oldsstate, oldcstate; + int ret; + u_int16_t msg; + + /* don't do any tracking before tcp handshake complete */ + if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) + return NF_ACCEPT; + + nexthdr_off = protoff; + tcph = skb_header_pointer(skb, nexthdr_off, sizeof(_tcph), &_tcph); + BUG_ON(!tcph); + nexthdr_off += tcph->doff * 4; + datalen = tcplen - tcph->doff * 4; + + pptph = skb_header_pointer(skb, nexthdr_off, sizeof(_pptph), &_pptph); + if (!pptph) { + pr_debug("no full PPTP header, can't track\n"); + return NF_ACCEPT; + } + nexthdr_off += sizeof(_pptph); + datalen -= sizeof(_pptph); + + /* if it's not a control message we can't do anything with it */ + if (ntohs(pptph->packetType) != PPTP_PACKET_CONTROL || + ntohl(pptph->magicCookie) != PPTP_MAGIC_COOKIE) { + pr_debug("not a control packet\n"); + return NF_ACCEPT; + } + + ctlh = skb_header_pointer(skb, nexthdr_off, sizeof(_ctlh), &_ctlh); + if (!ctlh) + return NF_ACCEPT; + nexthdr_off += sizeof(_ctlh); + datalen -= sizeof(_ctlh); + + reqlen = datalen; + msg = ntohs(ctlh->messageType); + if (msg > 0 && msg <= PPTP_MSG_MAX && reqlen < pptp_msg_size[msg]) + return NF_ACCEPT; + if (reqlen > sizeof(*pptpReq)) + reqlen = sizeof(*pptpReq); + + 
pptpReq = skb_header_pointer(skb, nexthdr_off, reqlen, &_pptpReq); + if (!pptpReq) + return NF_ACCEPT; + + oldsstate = info->sstate; + oldcstate = info->cstate; + + spin_lock_bh(&nf_pptp_lock); + + /* FIXME: We just blindly assume that the control connection is always + * established from PNS->PAC. However, RFC makes no guarantee */ + if (dir == IP_CT_DIR_ORIGINAL) + /* client -> server (PNS -> PAC) */ + ret = pptp_outbound_pkt(skb, ctlh, pptpReq, reqlen, ct, + ctinfo); + else + /* server -> client (PAC -> PNS) */ + ret = pptp_inbound_pkt(skb, ctlh, pptpReq, reqlen, ct, + ctinfo); + pr_debug("sstate: %d->%d, cstate: %d->%d\n", + oldsstate, info->sstate, oldcstate, info->cstate); + spin_unlock_bh(&nf_pptp_lock); + + return ret; +} + +static const struct nf_conntrack_expect_policy pptp_exp_policy = { + .max_expected = 2, + .timeout = 5 * 60, +}; + +/* control protocol helper */ +static struct nf_conntrack_helper pptp __read_mostly = { + .name = "pptp", + .me = THIS_MODULE, + .tuple.src.l3num = AF_INET, + .tuple.src.u.tcp.port = cpu_to_be16(PPTP_CONTROL_PORT), + .tuple.dst.protonum = IPPROTO_TCP, + .help = conntrack_pptp_help, + .destroy = pptp_destroy_siblings, + .expect_policy = &pptp_exp_policy, +}; + +static void nf_conntrack_pptp_net_exit(struct net *net) +{ + nf_ct_gre_keymap_flush(net); +} + +static struct pernet_operations nf_conntrack_pptp_net_ops = { + .exit = nf_conntrack_pptp_net_exit, +}; + +static int __init nf_conntrack_pptp_init(void) +{ + int rv; + + rv = nf_conntrack_helper_register(&pptp); + if (rv < 0) + return rv; + rv = register_pernet_subsys(&nf_conntrack_pptp_net_ops); + if (rv < 0) + nf_conntrack_helper_unregister(&pptp); + return rv; +} + +static void __exit nf_conntrack_pptp_fini(void) +{ + nf_conntrack_helper_unregister(&pptp); + unregister_pernet_subsys(&nf_conntrack_pptp_net_ops); +} + +module_init(nf_conntrack_pptp_init); +module_exit(nf_conntrack_pptp_fini); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c new file mode 100644 index 00000000..5701c8dd --- /dev/null +++ b/net/netfilter/nf_conntrack_proto.c @@ -0,0 +1,384 @@ +/* L3/L4 protocol support for nf_conntrack. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team + * (C) 2003,2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
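Each header in conntrack_pptp_help() above is pulled with skb_header_pointer(), which copies into the caller's stack buffer only when the requested bytes are not already linear in the skb. A minimal sketch of that pattern, with the skb and offset names standing in for whatever the caller has:

	struct tcphdr _tcph;
	const struct tcphdr *th;

	/* Returns a pointer into the skb when the header is linear; otherwise
	 * copies sizeof(_tcph) bytes into _tcph and returns &_tcph. NULL means
	 * the packet is too short. Always pass the address of the local
	 * buffer, never the address of the result pointer. */
	th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
	if (th == NULL)
		return NF_ACCEPT;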
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static struct nf_conntrack_l4proto __rcu **nf_ct_protos[PF_MAX] __read_mostly; +struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX] __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_l3protos); + +static DEFINE_MUTEX(nf_ct_proto_mutex); + +#ifdef CONFIG_SYSCTL +static int +nf_ct_register_sysctl(struct ctl_table_header **header, struct ctl_path *path, + struct ctl_table *table, unsigned int *users) +{ + if (*header == NULL) { + *header = register_sysctl_paths(path, table); + if (*header == NULL) + return -ENOMEM; + } + if (users != NULL) + (*users)++; + return 0; +} + +static void +nf_ct_unregister_sysctl(struct ctl_table_header **header, + struct ctl_table *table, unsigned int *users) +{ + if (users != NULL && --*users > 0) + return; + + unregister_sysctl_table(*header); + *header = NULL; +} +#endif + +struct nf_conntrack_l4proto * +__nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto) +{ + if (unlikely(l3proto >= AF_MAX || nf_ct_protos[l3proto] == NULL)) + return &nf_conntrack_l4proto_generic; + + return rcu_dereference(nf_ct_protos[l3proto][l4proto]); +} +EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find); + +/* this is guaranteed to always return a valid protocol helper, since + * it falls back to generic_protocol */ +struct nf_conntrack_l3proto * +nf_ct_l3proto_find_get(u_int16_t l3proto) +{ + struct nf_conntrack_l3proto *p; + + rcu_read_lock(); + p = __nf_ct_l3proto_find(l3proto); + if (!try_module_get(p->me)) + p = &nf_conntrack_l3proto_generic; + rcu_read_unlock(); + + return p; +} +EXPORT_SYMBOL_GPL(nf_ct_l3proto_find_get); + +void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p) +{ + module_put(p->me); +} +EXPORT_SYMBOL_GPL(nf_ct_l3proto_put); + +int +nf_ct_l3proto_try_module_get(unsigned short l3proto) +{ + int ret; + struct nf_conntrack_l3proto *p; + +retry: p = nf_ct_l3proto_find_get(l3proto); + if (p == &nf_conntrack_l3proto_generic) { + ret = request_module("nf_conntrack-%d", l3proto); + if (!ret) + goto retry; + + return -EPROTOTYPE; + } + + return 0; +} +EXPORT_SYMBOL_GPL(nf_ct_l3proto_try_module_get); + +void nf_ct_l3proto_module_put(unsigned short l3proto) +{ + struct nf_conntrack_l3proto *p; + + /* rcu_read_lock not necessary since the caller holds a reference, but + * taken anyways to avoid lockdep warnings in __nf_ct_l3proto_find() + */ + rcu_read_lock(); + p = __nf_ct_l3proto_find(l3proto); + module_put(p->me); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put); + +static int kill_l3proto(struct nf_conn *i, void *data) +{ + return nf_ct_l3num(i) == ((struct nf_conntrack_l3proto *)data)->l3proto; +} + +static int kill_l4proto(struct nf_conn *i, void *data) +{ + struct nf_conntrack_l4proto *l4proto; + l4proto = (struct nf_conntrack_l4proto *)data; + return nf_ct_protonum(i) == l4proto->l4proto && + nf_ct_l3num(i) == l4proto->l3proto; +} + +static int nf_ct_l3proto_register_sysctl(struct nf_conntrack_l3proto *l3proto) +{ + int err = 0; + +#ifdef CONFIG_SYSCTL + if (l3proto->ctl_table != NULL) { + err = nf_ct_register_sysctl(&l3proto->ctl_table_header, + l3proto->ctl_table_path, + l3proto->ctl_table, NULL); + } +#endif + return err; +} + +static void nf_ct_l3proto_unregister_sysctl(struct nf_conntrack_l3proto *l3proto) +{ +#ifdef CONFIG_SYSCTL + if (l3proto->ctl_table_header != NULL) + nf_ct_unregister_sysctl(&l3proto->ctl_table_header, + l3proto->ctl_table, NULL); +#endif +} + +int 
nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto) +{ + int ret = 0; + struct nf_conntrack_l3proto *old; + + if (proto->l3proto >= AF_MAX) + return -EBUSY; + + if (proto->tuple_to_nlattr && !proto->nlattr_tuple_size) + return -EINVAL; + + mutex_lock(&nf_ct_proto_mutex); + old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto], + lockdep_is_held(&nf_ct_proto_mutex)); + if (old != &nf_conntrack_l3proto_generic) { + ret = -EBUSY; + goto out_unlock; + } + + ret = nf_ct_l3proto_register_sysctl(proto); + if (ret < 0) + goto out_unlock; + + if (proto->nlattr_tuple_size) + proto->nla_size = 3 * proto->nlattr_tuple_size(); + + rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto); + +out_unlock: + mutex_unlock(&nf_ct_proto_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_register); + +void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto) +{ + struct net *net; + + BUG_ON(proto->l3proto >= AF_MAX); + + mutex_lock(&nf_ct_proto_mutex); + BUG_ON(rcu_dereference_protected(nf_ct_l3protos[proto->l3proto], + lockdep_is_held(&nf_ct_proto_mutex) + ) != proto); + rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], + &nf_conntrack_l3proto_generic); + nf_ct_l3proto_unregister_sysctl(proto); + mutex_unlock(&nf_ct_proto_mutex); + + synchronize_rcu(); + + /* Remove all contrack entries for this protocol */ + rtnl_lock(); + for_each_net(net) + nf_ct_iterate_cleanup(net, kill_l3proto, proto); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_unregister); + +static int nf_ct_l4proto_register_sysctl(struct nf_conntrack_l4proto *l4proto) +{ + int err = 0; + +#ifdef CONFIG_SYSCTL + if (l4proto->ctl_table != NULL) { + err = nf_ct_register_sysctl(l4proto->ctl_table_header, + nf_net_netfilter_sysctl_path, + l4proto->ctl_table, + l4proto->ctl_table_users); + if (err < 0) + goto out; + } +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + if (l4proto->ctl_compat_table != NULL) { + err = nf_ct_register_sysctl(&l4proto->ctl_compat_table_header, + nf_net_ipv4_netfilter_sysctl_path, + l4proto->ctl_compat_table, NULL); + if (err == 0) + goto out; + nf_ct_unregister_sysctl(l4proto->ctl_table_header, + l4proto->ctl_table, + l4proto->ctl_table_users); + } +#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ +out: +#endif /* CONFIG_SYSCTL */ + return err; +} + +static void nf_ct_l4proto_unregister_sysctl(struct nf_conntrack_l4proto *l4proto) +{ +#ifdef CONFIG_SYSCTL + if (l4proto->ctl_table_header != NULL && + *l4proto->ctl_table_header != NULL) + nf_ct_unregister_sysctl(l4proto->ctl_table_header, + l4proto->ctl_table, + l4proto->ctl_table_users); +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + if (l4proto->ctl_compat_table_header != NULL) + nf_ct_unregister_sysctl(&l4proto->ctl_compat_table_header, + l4proto->ctl_compat_table, NULL); +#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ +#endif /* CONFIG_SYSCTL */ +} + +/* FIXME: Allow NULL functions and sub in pointers to generic for + them. --RR */ +int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto) +{ + int ret = 0; + + if (l4proto->l3proto >= PF_MAX) + return -EBUSY; + + if ((l4proto->to_nlattr && !l4proto->nlattr_size) + || (l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size)) + return -EINVAL; + + mutex_lock(&nf_ct_proto_mutex); + if (!nf_ct_protos[l4proto->l3proto]) { + /* l3proto may be loaded latter. 
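The registration calls above are what the per-protocol trackers in this directory use at module init; the lookup side never fails, it falls back to the generic tracker. An illustrative lookup, with the protocol number chosen arbitrarily:

	struct nf_conntrack_l4proto *l4p;

	rcu_read_lock();
	l4p = __nf_ct_l4proto_find(AF_INET, IPPROTO_UDPLITE);
	/* Until a UDP-Lite tracker calls nf_conntrack_l4proto_register(),
	 * this yields &nf_conntrack_l4proto_generic, never NULL. */
	rcu_read_unlock();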
*/ + struct nf_conntrack_l4proto __rcu **proto_array; + int i; + + proto_array = kmalloc(MAX_NF_CT_PROTO * + sizeof(struct nf_conntrack_l4proto *), + GFP_KERNEL); + if (proto_array == NULL) { + ret = -ENOMEM; + goto out_unlock; + } + + for (i = 0; i < MAX_NF_CT_PROTO; i++) + RCU_INIT_POINTER(proto_array[i], &nf_conntrack_l4proto_generic); + + /* Before making proto_array visible to lockless readers, + * we must make sure its content is committed to memory. + */ + smp_wmb(); + + nf_ct_protos[l4proto->l3proto] = proto_array; + } else if (rcu_dereference_protected( + nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + lockdep_is_held(&nf_ct_proto_mutex) + ) != &nf_conntrack_l4proto_generic) { + ret = -EBUSY; + goto out_unlock; + } + + ret = nf_ct_l4proto_register_sysctl(l4proto); + if (ret < 0) + goto out_unlock; + + l4proto->nla_size = 0; + if (l4proto->nlattr_size) + l4proto->nla_size += l4proto->nlattr_size(); + if (l4proto->nlattr_tuple_size) + l4proto->nla_size += 3 * l4proto->nlattr_tuple_size(); + + rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + l4proto); + +out_unlock: + mutex_unlock(&nf_ct_proto_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_register); + +void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto) +{ + struct net *net; + + BUG_ON(l4proto->l3proto >= PF_MAX); + + mutex_lock(&nf_ct_proto_mutex); + BUG_ON(rcu_dereference_protected( + nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + lockdep_is_held(&nf_ct_proto_mutex) + ) != l4proto); + rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + &nf_conntrack_l4proto_generic); + nf_ct_l4proto_unregister_sysctl(l4proto); + mutex_unlock(&nf_ct_proto_mutex); + + synchronize_rcu(); + + /* Remove all contrack entries for this protocol */ + rtnl_lock(); + for_each_net(net) + nf_ct_iterate_cleanup(net, kill_l4proto, l4proto); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_unregister); + +int nf_conntrack_proto_init(void) +{ + unsigned int i; + int err; + + err = nf_ct_l4proto_register_sysctl(&nf_conntrack_l4proto_generic); + if (err < 0) + return err; + + for (i = 0; i < AF_MAX; i++) + rcu_assign_pointer(nf_ct_l3protos[i], + &nf_conntrack_l3proto_generic); + return 0; +} + +void nf_conntrack_proto_fini(void) +{ + unsigned int i; + + nf_ct_l4proto_unregister_sysctl(&nf_conntrack_l4proto_generic); + + /* free l3proto protocol tables */ + for (i = 0; i < PF_MAX; i++) + kfree(nf_ct_protos[i]); +} diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c new file mode 100644 index 00000000..2e664a69 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -0,0 +1,899 @@ +/* + * DCCP connection tracking protocol helper + * + * Copyright (c) 2005, 2006, 2008 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +/* Timeouts are based on values from RFC4340: + * + * - REQUEST: + * + * 8.1.2. Client Request + * + * A client MAY give up on its DCCP-Requests after some time + * (3 minutes, for example). + * + * - RESPOND: + * + * 8.1.3. Server Response + * + * It MAY also leave the RESPOND state for CLOSED after a timeout of + * not less than 4MSL (8 minutes); + * + * - PARTOPEN: + * + * 8.1.5. 
Handshake Completion + * + * If the client remains in PARTOPEN for more than 4MSL (8 minutes), + * it SHOULD reset the connection with Reset Code 2, "Aborted". + * + * - OPEN: + * + * The DCCP timestamp overflows after 11.9 hours. If the connection + * stays idle this long the sequence number won't be recognized + * as valid anymore. + * + * - CLOSEREQ/CLOSING: + * + * 8.3. Termination + * + * The retransmission timer should initially be set to go off in two + * round-trip times and should back off to not less than once every + * 64 seconds ... + * + * - TIMEWAIT: + * + * 4.3. States + * + * A server or client socket remains in this state for 2MSL (4 minutes) + * after the connection has been town down, ... + */ + +#define DCCP_MSL (2 * 60 * HZ) + +static const char * const dccp_state_names[] = { + [CT_DCCP_NONE] = "NONE", + [CT_DCCP_REQUEST] = "REQUEST", + [CT_DCCP_RESPOND] = "RESPOND", + [CT_DCCP_PARTOPEN] = "PARTOPEN", + [CT_DCCP_OPEN] = "OPEN", + [CT_DCCP_CLOSEREQ] = "CLOSEREQ", + [CT_DCCP_CLOSING] = "CLOSING", + [CT_DCCP_TIMEWAIT] = "TIMEWAIT", + [CT_DCCP_IGNORE] = "IGNORE", + [CT_DCCP_INVALID] = "INVALID", +}; + +#define sNO CT_DCCP_NONE +#define sRQ CT_DCCP_REQUEST +#define sRS CT_DCCP_RESPOND +#define sPO CT_DCCP_PARTOPEN +#define sOP CT_DCCP_OPEN +#define sCR CT_DCCP_CLOSEREQ +#define sCG CT_DCCP_CLOSING +#define sTW CT_DCCP_TIMEWAIT +#define sIG CT_DCCP_IGNORE +#define sIV CT_DCCP_INVALID + +/* + * DCCP state transition table + * + * The assumption is the same as for TCP tracking: + * + * We are the man in the middle. All the packets go through us but might + * get lost in transit to the destination. It is assumed that the destination + * can't receive segments we haven't seen. + * + * The following states exist: + * + * NONE: Initial state, expecting Request + * REQUEST: Request seen, waiting for Response from server + * RESPOND: Response from server seen, waiting for Ack from client + * PARTOPEN: Ack after Response seen, waiting for packet other than Response, + * Reset or Sync from server + * OPEN: Packet other than Response, Reset or Sync seen + * CLOSEREQ: CloseReq from server seen, expecting Close from client + * CLOSING: Close seen, expecting Reset + * TIMEWAIT: Reset seen + * IGNORE: Not determinable whether packet is valid + * + * Some states exist only on one side of the connection: REQUEST, RESPOND, + * PARTOPEN, CLOSEREQ. For the other side these states are equivalent to + * the one it was in before. + * + * Packets are marked as ignored (sIG) if we don't know if they're valid + * (for example a reincarnation of a connection we didn't notice is dead + * already) and the server may send back a connection closing Reset or a + * Response. They're also used for Sync/SyncAck packets, which we don't + * care about. + */ +static const u_int8_t +dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = { + [CT_DCCP_ROLE_CLIENT] = { + [DCCP_PKT_REQUEST] = { + /* + * sNO -> sRQ Regular Request + * sRQ -> sRQ Retransmitted Request or reincarnation + * sRS -> sRS Retransmitted Request (apparently Response + * got lost after we saw it) or reincarnation + * sPO -> sIG Ignore, conntrack might be out of sync + * sOP -> sIG Ignore, conntrack might be out of sync + * sCR -> sIG Ignore, conntrack might be out of sync + * sCG -> sIG Ignore, conntrack might be out of sync + * sTW -> sRQ Reincarnation + * + * sNO, sRQ, sRS, sPO. 
sOP, sCR, sCG, sTW, */ + sRQ, sRQ, sRS, sIG, sIG, sIG, sIG, sRQ, + }, + [DCCP_PKT_RESPONSE] = { + /* + * sNO -> sIV Invalid + * sRQ -> sIG Ignore, might be response to ignored Request + * sRS -> sIG Ignore, might be response to ignored Request + * sPO -> sIG Ignore, might be response to ignored Request + * sOP -> sIG Ignore, might be response to ignored Request + * sCR -> sIG Ignore, might be response to ignored Request + * sCG -> sIG Ignore, might be response to ignored Request + * sTW -> sIV Invalid, reincarnation in reverse direction + * goes through sRQ + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIV, + }, + [DCCP_PKT_ACK] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.) + * sPO -> sPO Retransmitted Ack for Response, remain in PARTOPEN + * sOP -> sOP Regular ACK, remain in OPEN + * sCR -> sCR Ack in CLOSEREQ MAY be processed (8.3.) + * sCG -> sCG Ack in CLOSING MAY be processed (8.3.) + * sTW -> sIV + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV + }, + [DCCP_PKT_DATA] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sIV No connection + * sPO -> sIV MUST use DataAck in PARTOPEN state (8.1.5.) + * sOP -> sOP Regular Data packet + * sCR -> sCR Data in CLOSEREQ MAY be processed (8.3.) + * sCG -> sCG Data in CLOSING MAY be processed (8.3.) + * sTW -> sIV + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sIV, sOP, sCR, sCG, sIV, + }, + [DCCP_PKT_DATAACK] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.) + * sPO -> sPO Remain in PARTOPEN state + * sOP -> sOP Regular DataAck packet in OPEN state + * sCR -> sCR DataAck in CLOSEREQ MAY be processed (8.3.) + * sCG -> sCG DataAck in CLOSING MAY be processed (8.3.) + * sTW -> sIV + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV + }, + [DCCP_PKT_CLOSEREQ] = { + /* + * CLOSEREQ may only be sent by the server. + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV + }, + [DCCP_PKT_CLOSE] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sIV No connection + * sPO -> sCG Client-initiated close + * sOP -> sCG Client-initiated close + * sCR -> sCG Close in response to CloseReq (8.3.) + * sCG -> sCG Retransmit + * sTW -> sIV Late retransmit, already in TIME_WAIT + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sCG, sCG, sCG, sIV, sIV + }, + [DCCP_PKT_RESET] = { + /* + * sNO -> sIV No connection + * sRQ -> sTW Sync received or timeout, SHOULD send Reset (8.1.1.) + * sRS -> sTW Response received without Request + * sPO -> sTW Timeout, SHOULD send Reset (8.1.5.) 
+ * sOP -> sTW Connection reset + * sCR -> sTW Connection reset + * sCG -> sTW Connection reset + * sTW -> sIG Ignore (don't refresh timer) + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sTW, sTW, sTW, sTW, sTW, sTW, sIG + }, + [DCCP_PKT_SYNC] = { + /* + * We currently ignore Sync packets + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG, + }, + [DCCP_PKT_SYNCACK] = { + /* + * We currently ignore SyncAck packets + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG, + }, + }, + [CT_DCCP_ROLE_SERVER] = { + [DCCP_PKT_REQUEST] = { + /* + * sNO -> sIV Invalid + * sRQ -> sIG Ignore, conntrack might be out of sync + * sRS -> sIG Ignore, conntrack might be out of sync + * sPO -> sIG Ignore, conntrack might be out of sync + * sOP -> sIG Ignore, conntrack might be out of sync + * sCR -> sIG Ignore, conntrack might be out of sync + * sCG -> sIG Ignore, conntrack might be out of sync + * sTW -> sRQ Reincarnation, must reverse roles + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIG, sIG, sIG, sIG, sIG, sIG, sRQ + }, + [DCCP_PKT_RESPONSE] = { + /* + * sNO -> sIV Response without Request + * sRQ -> sRS Response to clients Request + * sRS -> sRS Retransmitted Response (8.1.3. SHOULD NOT) + * sPO -> sIG Response to an ignored Request or late retransmit + * sOP -> sIG Ignore, might be response to ignored Request + * sCR -> sIG Ignore, might be response to ignored Request + * sCG -> sIG Ignore, might be response to ignored Request + * sTW -> sIV Invalid, Request from client in sTW moves to sRQ + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sRS, sRS, sIG, sIG, sIG, sIG, sIV + }, + [DCCP_PKT_ACK] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sIV No connection + * sPO -> sOP Enter OPEN state (8.1.5.) + * sOP -> sOP Regular Ack in OPEN state + * sCR -> sIV Waiting for Close from client + * sCG -> sCG Ack in CLOSING MAY be processed (8.3.) + * sTW -> sIV + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV + }, + [DCCP_PKT_DATA] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sIV No connection + * sPO -> sOP Enter OPEN state (8.1.5.) + * sOP -> sOP Regular Data packet in OPEN state + * sCR -> sIV Waiting for Close from client + * sCG -> sCG Data in CLOSING MAY be processed (8.3.) + * sTW -> sIV + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV + }, + [DCCP_PKT_DATAACK] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sIV No connection + * sPO -> sOP Enter OPEN state (8.1.5.) + * sOP -> sOP Regular DataAck in OPEN state + * sCR -> sIV Waiting for Close from client + * sCG -> sCG Data in CLOSING MAY be processed (8.3.) + * sTW -> sIV + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV + }, + [DCCP_PKT_CLOSEREQ] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sIV No connection + * sPO -> sOP -> sCR Move directly to CLOSEREQ (8.1.5.) 
+ * sOP -> sCR CloseReq in OPEN state + * sCR -> sCR Retransmit + * sCG -> sCR Simultaneous close, client sends another Close + * sTW -> sIV Already closed + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sCR, sCR, sCR, sCR, sIV + }, + [DCCP_PKT_CLOSE] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sIV No connection + * sPO -> sOP -> sCG Move direcly to CLOSING + * sOP -> sCG Move to CLOSING + * sCR -> sIV Close after CloseReq is invalid + * sCG -> sCG Retransmit + * sTW -> sIV Already closed + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sCG, sCG, sIV, sCG, sIV + }, + [DCCP_PKT_RESET] = { + /* + * sNO -> sIV No connection + * sRQ -> sTW Reset in response to Request + * sRS -> sTW Timeout, SHOULD send Reset (8.1.3.) + * sPO -> sTW Timeout, SHOULD send Reset (8.1.3.) + * sOP -> sTW + * sCR -> sTW + * sCG -> sTW + * sTW -> sIG Ignore (don't refresh timer) + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW, sTW */ + sIV, sTW, sTW, sTW, sTW, sTW, sTW, sTW, sIG + }, + [DCCP_PKT_SYNC] = { + /* + * We currently ignore Sync packets + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG, + }, + [DCCP_PKT_SYNCACK] = { + /* + * We currently ignore SyncAck packets + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG, + }, + }, +}; + +/* this module per-net specifics */ +static int dccp_net_id __read_mostly; +struct dccp_net { + int dccp_loose; + unsigned int dccp_timeout[CT_DCCP_MAX + 1]; +#ifdef CONFIG_SYSCTL + struct ctl_table_header *sysctl_header; + struct ctl_table *sysctl_table; +#endif +}; + +static inline struct dccp_net *dccp_pernet(struct net *net) +{ + return net_generic(net, dccp_net_id); +} + +static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + struct dccp_hdr _hdr, *dh; + + dh = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (dh == NULL) + return false; + + tuple->src.u.dccp.port = dh->dccph_sport; + tuple->dst.u.dccp.port = dh->dccph_dport; + return true; +} + +static bool dccp_invert_tuple(struct nf_conntrack_tuple *inv, + const struct nf_conntrack_tuple *tuple) +{ + inv->src.u.dccp.port = tuple->dst.u.dccp.port; + inv->dst.u.dccp.port = tuple->src.u.dccp.port; + return true; +} + +static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff) +{ + struct net *net = nf_ct_net(ct); + struct dccp_net *dn; + struct dccp_hdr _dh, *dh; + const char *msg; + u_int8_t state; + + dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &dh); + BUG_ON(dh == NULL); + + state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE]; + switch (state) { + default: + dn = dccp_pernet(net); + if (dn->dccp_loose == 0) { + msg = "nf_ct_dccp: not picking up existing connection "; + goto out_invalid; + } + case CT_DCCP_REQUEST: + break; + case CT_DCCP_INVALID: + msg = "nf_ct_dccp: invalid state transition "; + goto out_invalid; + } + + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; + ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; + ct->proto.dccp.state = CT_DCCP_NONE; + ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST; + ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL; + ct->proto.dccp.handshake_seq = 0; + return true; + +out_invalid: + if (LOG_INVALID(net, IPPROTO_DCCP)) + nf_log_packet(nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, msg); + return false; +} + +static u64 dccp_ack_seq(const struct dccp_hdr *dh) +{ + const struct 
dccp_hdr_ack_bits *dhack;
+
+	dhack = (void *)dh + __dccp_basic_hdr_len(dh);
+	return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) +
+		     ntohl(dhack->dccph_ack_nr_low);
+}
+
+static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
+		       unsigned int dataoff, enum ip_conntrack_info ctinfo,
+		       u_int8_t pf, unsigned int hooknum)
+{
+	struct net *net = nf_ct_net(ct);
+	struct dccp_net *dn;
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	struct dccp_hdr _dh, *dh;
+	u_int8_t type, old_state, new_state;
+	enum ct_dccp_roles role;
+
+	dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
+	BUG_ON(dh == NULL);
+	type = dh->dccph_type;
+
+	if (type == DCCP_PKT_RESET &&
+	    !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+		/* Tear down connection immediately if only reply is a RESET */
+		nf_ct_kill_acct(ct, ctinfo, skb);
+		return NF_ACCEPT;
+	}
+
+	spin_lock_bh(&ct->lock);
+
+	role = ct->proto.dccp.role[dir];
+	old_state = ct->proto.dccp.state;
+	new_state = dccp_state_table[role][type][old_state];
+
+	switch (new_state) {
+	case CT_DCCP_REQUEST:
+		if (old_state == CT_DCCP_TIMEWAIT &&
+		    role == CT_DCCP_ROLE_SERVER) {
+			/* Reincarnation in the reverse direction: reopen and
+			 * reverse client/server roles. */
+			ct->proto.dccp.role[dir] = CT_DCCP_ROLE_CLIENT;
+			ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_SERVER;
+		}
+		break;
+	case CT_DCCP_RESPOND:
+		if (old_state == CT_DCCP_REQUEST)
+			ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh);
+		break;
+	case CT_DCCP_PARTOPEN:
+		if (old_state == CT_DCCP_RESPOND &&
+		    type == DCCP_PKT_ACK &&
+		    dccp_ack_seq(dh) == ct->proto.dccp.handshake_seq)
+			set_bit(IPS_ASSURED_BIT, &ct->status);
+		break;
+	case CT_DCCP_IGNORE:
+		/*
+		 * Connection tracking might be out of sync, so we ignore
+		 * packets that might establish a new connection and resync
+		 * if the server responds with a valid Response.
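To make the table lookup above concrete, here is one illustrative resolution of dccp_state_table, using the constants the table is built from: a fresh Request from the client while the tracker still believes the connection is OPEN lands in this ignore case rather than being flagged invalid.

	u_int8_t example_state;

	/* client role, Request packet, tracker currently in OPEN */
	example_state = dccp_state_table[CT_DCCP_ROLE_CLIENT]
					[DCCP_PKT_REQUEST]
					[CT_DCCP_OPEN];
	/* example_state == CT_DCCP_IGNORE, sIG in the table comments */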
+ */ + if (ct->proto.dccp.last_dir == !dir && + ct->proto.dccp.last_pkt == DCCP_PKT_REQUEST && + type == DCCP_PKT_RESPONSE) { + ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_CLIENT; + ct->proto.dccp.role[dir] = CT_DCCP_ROLE_SERVER; + ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh); + new_state = CT_DCCP_RESPOND; + break; + } + ct->proto.dccp.last_dir = dir; + ct->proto.dccp.last_pkt = type; + + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_DCCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_dccp: invalid packet ignored "); + return NF_ACCEPT; + case CT_DCCP_INVALID: + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_DCCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_dccp: invalid state transition "); + return -NF_ACCEPT; + } + + ct->proto.dccp.last_dir = dir; + ct->proto.dccp.last_pkt = type; + ct->proto.dccp.state = new_state; + spin_unlock_bh(&ct->lock); + + if (new_state != old_state) + nf_conntrack_event_cache(IPCT_PROTOINFO, ct); + + dn = dccp_pernet(net); + nf_ct_refresh_acct(ct, ctinfo, skb, dn->dccp_timeout[new_state]); + + return NF_ACCEPT; +} + +static int dccp_error(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + u_int8_t pf, unsigned int hooknum) +{ + struct dccp_hdr _dh, *dh; + unsigned int dccp_len = skb->len - dataoff; + unsigned int cscov; + const char *msg; + + dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &dh); + if (dh == NULL) { + msg = "nf_ct_dccp: short packet "; + goto out_invalid; + } + + if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) || + dh->dccph_doff * 4 > dccp_len) { + msg = "nf_ct_dccp: truncated/malformed packet "; + goto out_invalid; + } + + cscov = dccp_len; + if (dh->dccph_cscov) { + cscov = (dh->dccph_cscov - 1) * 4; + if (cscov > dccp_len) { + msg = "nf_ct_dccp: bad checksum coverage "; + goto out_invalid; + } + } + + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && + nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_DCCP, + pf)) { + msg = "nf_ct_dccp: bad checksum "; + goto out_invalid; + } + + if (dh->dccph_type >= DCCP_PKT_INVALID) { + msg = "nf_ct_dccp: reserved packet type "; + goto out_invalid; + } + + return NF_ACCEPT; + +out_invalid: + if (LOG_INVALID(net, IPPROTO_DCCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, msg); + return -NF_ACCEPT; +} + +static int dccp_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.dccp.port), + ntohs(tuple->dst.u.dccp.port)); +} + +static int dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct) +{ + return seq_printf(s, "%s ", dccp_state_names[ct->proto.dccp.state]); +} + +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, + struct nf_conn *ct) +{ + struct nlattr *nest_parms; + + spin_lock_bh(&ct->lock); + nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state); + NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_ROLE, + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]); + NLA_PUT_BE64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ, + cpu_to_be64(ct->proto.dccp.handshake_seq)); + nla_nest_end(skb, nest_parms); + spin_unlock_bh(&ct->lock); + return 0; + +nla_put_failure: + spin_unlock_bh(&ct->lock); + return -1; +} + +static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = { + 
[CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 }, + [CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 }, + [CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ] = { .type = NLA_U64 }, +}; + +static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) +{ + struct nlattr *attr = cda[CTA_PROTOINFO_DCCP]; + struct nlattr *tb[CTA_PROTOINFO_DCCP_MAX + 1]; + int err; + + if (!attr) + return 0; + + err = nla_parse_nested(tb, CTA_PROTOINFO_DCCP_MAX, attr, + dccp_nla_policy); + if (err < 0) + return err; + + if (!tb[CTA_PROTOINFO_DCCP_STATE] || + !tb[CTA_PROTOINFO_DCCP_ROLE] || + nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) > CT_DCCP_ROLE_MAX || + nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) { + return -EINVAL; + } + + spin_lock_bh(&ct->lock); + ct->proto.dccp.state = nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]); + if (nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) == CT_DCCP_ROLE_CLIENT) { + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; + ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; + } else { + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_SERVER; + ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_CLIENT; + } + if (tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]) { + ct->proto.dccp.handshake_seq = + be64_to_cpu(nla_get_be64(tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ])); + } + spin_unlock_bh(&ct->lock); + return 0; +} + +static int dccp_nlattr_size(void) +{ + return nla_total_size(0) /* CTA_PROTOINFO_DCCP */ + + nla_policy_len(dccp_nla_policy, CTA_PROTOINFO_DCCP_MAX + 1); +} +#endif + +#ifdef CONFIG_SYSCTL +/* template, data assigned later */ +static struct ctl_table dccp_sysctl_table[] = { + { + .procname = "nf_conntrack_dccp_timeout_request", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_dccp_timeout_respond", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_dccp_timeout_partopen", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_dccp_timeout_open", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_dccp_timeout_closereq", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_dccp_timeout_closing", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_dccp_timeout_timewait", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_dccp_loose", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; +#endif /* CONFIG_SYSCTL */ + +static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = { + .l3proto = AF_INET, + .l4proto = IPPROTO_DCCP, + .name = "dccp", + .pkt_to_tuple = dccp_pkt_to_tuple, + .invert_tuple = dccp_invert_tuple, + .new = dccp_new, + .packet = dccp_packet, + .error = dccp_error, + .print_tuple = dccp_print_tuple, + .print_conntrack = dccp_print_conntrack, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .to_nlattr = dccp_to_nlattr, + .nlattr_size = dccp_nlattr_size, + .from_nlattr = nlattr_to_dccp, + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nla_policy = nf_ct_port_nla_policy, 
+#endif +}; + +static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = { + .l3proto = AF_INET6, + .l4proto = IPPROTO_DCCP, + .name = "dccp", + .pkt_to_tuple = dccp_pkt_to_tuple, + .invert_tuple = dccp_invert_tuple, + .new = dccp_new, + .packet = dccp_packet, + .error = dccp_error, + .print_tuple = dccp_print_tuple, + .print_conntrack = dccp_print_conntrack, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .to_nlattr = dccp_to_nlattr, + .nlattr_size = dccp_nlattr_size, + .from_nlattr = nlattr_to_dccp, + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nla_policy = nf_ct_port_nla_policy, +#endif +}; + +static __net_init int dccp_net_init(struct net *net) +{ + struct dccp_net *dn = dccp_pernet(net); + + /* default values */ + dn->dccp_loose = 1; + dn->dccp_timeout[CT_DCCP_REQUEST] = 2 * DCCP_MSL; + dn->dccp_timeout[CT_DCCP_RESPOND] = 4 * DCCP_MSL; + dn->dccp_timeout[CT_DCCP_PARTOPEN] = 4 * DCCP_MSL; + dn->dccp_timeout[CT_DCCP_OPEN] = 12 * 3600 * HZ; + dn->dccp_timeout[CT_DCCP_CLOSEREQ] = 64 * HZ; + dn->dccp_timeout[CT_DCCP_CLOSING] = 64 * HZ; + dn->dccp_timeout[CT_DCCP_TIMEWAIT] = 2 * DCCP_MSL; + +#ifdef CONFIG_SYSCTL + dn->sysctl_table = kmemdup(dccp_sysctl_table, + sizeof(dccp_sysctl_table), GFP_KERNEL); + if (!dn->sysctl_table) + return -ENOMEM; + + dn->sysctl_table[0].data = &dn->dccp_timeout[CT_DCCP_REQUEST]; + dn->sysctl_table[1].data = &dn->dccp_timeout[CT_DCCP_RESPOND]; + dn->sysctl_table[2].data = &dn->dccp_timeout[CT_DCCP_PARTOPEN]; + dn->sysctl_table[3].data = &dn->dccp_timeout[CT_DCCP_OPEN]; + dn->sysctl_table[4].data = &dn->dccp_timeout[CT_DCCP_CLOSEREQ]; + dn->sysctl_table[5].data = &dn->dccp_timeout[CT_DCCP_CLOSING]; + dn->sysctl_table[6].data = &dn->dccp_timeout[CT_DCCP_TIMEWAIT]; + dn->sysctl_table[7].data = &dn->dccp_loose; + + dn->sysctl_header = register_net_sysctl_table(net, + nf_net_netfilter_sysctl_path, dn->sysctl_table); + if (!dn->sysctl_header) { + kfree(dn->sysctl_table); + return -ENOMEM; + } +#endif + + return 0; +} + +static __net_exit void dccp_net_exit(struct net *net) +{ + struct dccp_net *dn = dccp_pernet(net); +#ifdef CONFIG_SYSCTL + unregister_net_sysctl_table(dn->sysctl_header); + kfree(dn->sysctl_table); +#endif +} + +static struct pernet_operations dccp_net_ops = { + .init = dccp_net_init, + .exit = dccp_net_exit, + .id = &dccp_net_id, + .size = sizeof(struct dccp_net), +}; + +static int __init nf_conntrack_proto_dccp_init(void) +{ + int err; + + err = register_pernet_subsys(&dccp_net_ops); + if (err < 0) + goto err1; + + err = nf_conntrack_l4proto_register(&dccp_proto4); + if (err < 0) + goto err2; + + err = nf_conntrack_l4proto_register(&dccp_proto6); + if (err < 0) + goto err3; + return 0; + +err3: + nf_conntrack_l4proto_unregister(&dccp_proto4); +err2: + unregister_pernet_subsys(&dccp_net_ops); +err1: + return err; +} + +static void __exit nf_conntrack_proto_dccp_fini(void) +{ + unregister_pernet_subsys(&dccp_net_ops); + nf_conntrack_l4proto_unregister(&dccp_proto6); + nf_conntrack_l4proto_unregister(&dccp_proto4); +} + +module_init(nf_conntrack_proto_dccp_init); +module_exit(nf_conntrack_proto_dccp_fini); + +MODULE_AUTHOR("Patrick McHardy "); +MODULE_DESCRIPTION("DCCP connection tracking protocol helper"); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c new file mode 100644 index 00000000..e2091d0c --- /dev/null +++ 
b/net/netfilter/nf_conntrack_proto_generic.c @@ -0,0 +1,105 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include + +static unsigned int nf_ct_generic_timeout __read_mostly = 600*HZ; + +static bool generic_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + tuple->src.u.all = 0; + tuple->dst.u.all = 0; + + return true; +} + +static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u.all = 0; + tuple->dst.u.all = 0; + + return true; +} + +/* Print out the per-protocol part of the tuple. */ +static int generic_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return 0; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + u_int8_t pf, + unsigned int hooknum) +{ + nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_generic_timeout); + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static bool new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff) +{ + return true; +} + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *generic_sysctl_header; +static struct ctl_table generic_sysctl_table[] = { + { + .procname = "nf_conntrack_generic_timeout", + .data = &nf_ct_generic_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT +static struct ctl_table generic_compat_sysctl_table[] = { + { + .procname = "ip_conntrack_generic_timeout", + .data = &nf_ct_generic_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ +#endif /* CONFIG_SYSCTL */ + +struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly = +{ + .l3proto = PF_UNSPEC, + .l4proto = 255, + .name = "unknown", + .pkt_to_tuple = generic_pkt_to_tuple, + .invert_tuple = generic_invert_tuple, + .print_tuple = generic_print_tuple, + .packet = packet, + .new = new, +#ifdef CONFIG_SYSCTL + .ctl_table_header = &generic_sysctl_header, + .ctl_table = generic_sysctl_table, +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + .ctl_compat_table = generic_compat_sysctl_table, +#endif +#endif +}; diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c new file mode 100644 index 00000000..cf616e55 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -0,0 +1,346 @@ +/* + * ip_conntrack_proto_gre.c - Version 3.0 + * + * Connection tracking protocol helper module for GRE. + * + * GRE is a generic encapsulation protocol, which is generally not very + * suited for NAT, as it has no protocol-specific part as port numbers. + * + * It has an optional key field, which may help us distinguishing two + * connections between the same two hosts. + * + * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784 + * + * PPTP is built on top of a modified version of GRE, and has a mandatory + * field called "CallID", which serves us for the same purpose as the key + * field in plain GRE. 
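For reference, the enhanced GRE header that PPTP uses (RFC 2637) carries the call ID directly, and the helper below only ever needs its first eight octets, up to that field. An illustrative layout (the struct name here is made up; the code itself relies on the kernel's gre_hdr_pptp definition):

	struct pptp_gre_hdr_example {
		__u8	flags;		/* C, R, K, S, s, recur bits */
		__u8	version;	/* 1 for enhanced GRE */
		__be16	protocol;	/* 0x880B, PPP payload */
		__be16	payload_len;	/* PPP payload length */
		__be16	call_id;	/* becomes tuple->dst.u.gre.key */
	};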
+ * + * Documentation about PPTP can be found in RFC 2637 + * + * (C) 2000-2005 by Harald Welte + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define GRE_TIMEOUT (30 * HZ) +#define GRE_STREAM_TIMEOUT (180 * HZ) + +static int proto_gre_net_id __read_mostly; +struct netns_proto_gre { + rwlock_t keymap_lock; + struct list_head keymap_list; +}; + +void nf_ct_gre_keymap_flush(struct net *net) +{ + struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); + struct nf_ct_gre_keymap *km, *tmp; + + write_lock_bh(&net_gre->keymap_lock); + list_for_each_entry_safe(km, tmp, &net_gre->keymap_list, list) { + list_del(&km->list); + kfree(km); + } + write_unlock_bh(&net_gre->keymap_lock); +} +EXPORT_SYMBOL(nf_ct_gre_keymap_flush); + +static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km, + const struct nf_conntrack_tuple *t) +{ + return km->tuple.src.l3num == t->src.l3num && + !memcmp(&km->tuple.src.u3, &t->src.u3, sizeof(t->src.u3)) && + !memcmp(&km->tuple.dst.u3, &t->dst.u3, sizeof(t->dst.u3)) && + km->tuple.dst.protonum == t->dst.protonum && + km->tuple.dst.u.all == t->dst.u.all; +} + +/* look up the source key for a given tuple */ +static __be16 gre_keymap_lookup(struct net *net, struct nf_conntrack_tuple *t) +{ + struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); + struct nf_ct_gre_keymap *km; + __be16 key = 0; + + read_lock_bh(&net_gre->keymap_lock); + list_for_each_entry(km, &net_gre->keymap_list, list) { + if (gre_key_cmpfn(km, t)) { + key = km->tuple.src.u.gre.key; + break; + } + } + read_unlock_bh(&net_gre->keymap_lock); + + pr_debug("lookup src key 0x%x for ", key); + nf_ct_dump_tuple(t); + + return key; +} + +/* add a single keymap entry, associate with specified master ct */ +int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, + struct nf_conntrack_tuple *t) +{ + struct net *net = nf_ct_net(ct); + struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); + struct nf_conn_help *help = nfct_help(ct); + struct nf_ct_gre_keymap **kmp, *km; + + kmp = &help->help.ct_pptp_info.keymap[dir]; + if (*kmp) { + /* check whether it's a retransmission */ + read_lock_bh(&net_gre->keymap_lock); + list_for_each_entry(km, &net_gre->keymap_list, list) { + if (gre_key_cmpfn(km, t) && km == *kmp) { + read_unlock_bh(&net_gre->keymap_lock); + return 0; + } + } + read_unlock_bh(&net_gre->keymap_lock); + pr_debug("trying to override keymap_%s for ct %p\n", + dir == IP_CT_DIR_REPLY ? 
"reply" : "orig", ct); + return -EEXIST; + } + + km = kmalloc(sizeof(*km), GFP_ATOMIC); + if (!km) + return -ENOMEM; + memcpy(&km->tuple, t, sizeof(*t)); + *kmp = km; + + pr_debug("adding new entry %p: ", km); + nf_ct_dump_tuple(&km->tuple); + + write_lock_bh(&net_gre->keymap_lock); + list_add_tail(&km->list, &net_gre->keymap_list); + write_unlock_bh(&net_gre->keymap_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_add); + +/* destroy the keymap entries associated with specified master ct */ +void nf_ct_gre_keymap_destroy(struct nf_conn *ct) +{ + struct net *net = nf_ct_net(ct); + struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); + struct nf_conn_help *help = nfct_help(ct); + enum ip_conntrack_dir dir; + + pr_debug("entering for ct %p\n", ct); + + write_lock_bh(&net_gre->keymap_lock); + for (dir = IP_CT_DIR_ORIGINAL; dir < IP_CT_DIR_MAX; dir++) { + if (help->help.ct_pptp_info.keymap[dir]) { + pr_debug("removing %p from list\n", + help->help.ct_pptp_info.keymap[dir]); + list_del(&help->help.ct_pptp_info.keymap[dir]->list); + kfree(help->help.ct_pptp_info.keymap[dir]); + help->help.ct_pptp_info.keymap[dir] = NULL; + } + } + write_unlock_bh(&net_gre->keymap_lock); +} +EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_destroy); + +/* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */ + +/* invert gre part of tuple */ +static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->dst.u.gre.key = orig->src.u.gre.key; + tuple->src.u.gre.key = orig->dst.u.gre.key; + return true; +} + +/* gre hdr info to tuple */ +static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + struct net *net = dev_net(skb->dev ? skb->dev : skb_dst(skb)->dev); + const struct gre_hdr_pptp *pgrehdr; + struct gre_hdr_pptp _pgrehdr; + __be16 srckey; + const struct gre_hdr *grehdr; + struct gre_hdr _grehdr; + + /* first only delinearize old RFC1701 GRE header */ + grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr); + if (!grehdr || grehdr->version != GRE_VERSION_PPTP) { + /* try to behave like "nf_conntrack_proto_generic" */ + tuple->src.u.all = 0; + tuple->dst.u.all = 0; + return true; + } + + /* PPTP header is variable length, only need up to the call_id field */ + pgrehdr = skb_header_pointer(skb, dataoff, 8, &_pgrehdr); + if (!pgrehdr) + return true; + + if (ntohs(grehdr->protocol) != GRE_PROTOCOL_PPTP) { + pr_debug("GRE_VERSION_PPTP but unknown proto\n"); + return false; + } + + tuple->dst.u.gre.key = pgrehdr->call_id; + srckey = gre_keymap_lookup(net, tuple); + tuple->src.u.gre.key = srckey; + + return true; +} + +/* print gre part of tuple */ +static int gre_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "srckey=0x%x dstkey=0x%x ", + ntohs(tuple->src.u.gre.key), + ntohs(tuple->dst.u.gre.key)); +} + +/* print private data for conntrack */ +static int gre_print_conntrack(struct seq_file *s, struct nf_conn *ct) +{ + return seq_printf(s, "timeout=%u, stream_timeout=%u ", + (ct->proto.gre.timeout / HZ), + (ct->proto.gre.stream_timeout / HZ)); +} + +/* Returns verdict for packet, and may modify conntrack */ +static int gre_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + u_int8_t pf, + unsigned int hooknum) +{ + /* If we've seen traffic both ways, this is a GRE connection. + * Extend timeout. 
*/ + if (ct->status & IPS_SEEN_REPLY) { + nf_ct_refresh_acct(ct, ctinfo, skb, + ct->proto.gre.stream_timeout); + /* Also, more likely to be important, and not a probe. */ + set_bit(IPS_ASSURED_BIT, &ct->status); + nf_conntrack_event_cache(IPCT_ASSURED, ct); + } else + nf_ct_refresh_acct(ct, ctinfo, skb, + ct->proto.gre.timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static bool gre_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff) +{ + pr_debug(": "); + nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + + /* initialize to sane value. Ideally a conntrack helper + * (e.g. in case of pptp) is increasing them */ + ct->proto.gre.stream_timeout = GRE_STREAM_TIMEOUT; + ct->proto.gre.timeout = GRE_TIMEOUT; + + return true; +} + +/* Called when a conntrack entry has already been removed from the hashes + * and is about to be deleted from memory */ +static void gre_destroy(struct nf_conn *ct) +{ + struct nf_conn *master = ct->master; + pr_debug(" entering\n"); + + if (!master) + pr_debug("no master !?!\n"); + else + nf_ct_gre_keymap_destroy(master); +} + +/* protocol helper struct */ +static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = { + .l3proto = AF_INET, + .l4proto = IPPROTO_GRE, + .name = "gre", + .pkt_to_tuple = gre_pkt_to_tuple, + .invert_tuple = gre_invert_tuple, + .print_tuple = gre_print_tuple, + .print_conntrack = gre_print_conntrack, + .packet = gre_packet, + .new = gre_new, + .destroy = gre_destroy, + .me = THIS_MODULE, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nla_policy = nf_ct_port_nla_policy, +#endif +}; + +static int proto_gre_net_init(struct net *net) +{ + struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); + + rwlock_init(&net_gre->keymap_lock); + INIT_LIST_HEAD(&net_gre->keymap_list); + + return 0; +} + +static void proto_gre_net_exit(struct net *net) +{ + nf_ct_gre_keymap_flush(net); +} + +static struct pernet_operations proto_gre_net_ops = { + .init = proto_gre_net_init, + .exit = proto_gre_net_exit, + .id = &proto_gre_net_id, + .size = sizeof(struct netns_proto_gre), +}; + +static int __init nf_ct_proto_gre_init(void) +{ + int rv; + + rv = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_gre4); + if (rv < 0) + return rv; + rv = register_pernet_subsys(&proto_gre_net_ops); + if (rv < 0) + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_gre4); + return rv; +} + +static void __exit nf_ct_proto_gre_fini(void) +{ + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_gre4); + unregister_pernet_subsys(&proto_gre_net_ops); +} + +module_init(nf_ct_proto_gre_init); +module_exit(nf_ct_proto_gre_fini); + +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c new file mode 100644 index 00000000..6772b115 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -0,0 +1,750 @@ +/* + * Connection tracking protocol helper module for SCTP. + * + * SCTP is defined in RFC 2960. References to various sections in this code + * are to this RFC. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* FIXME: Examine ipfilter's timeouts and conntrack transitions more + closely. They're more complex. --RR + + And so for me for SCTP :D -Kiran */ + +static const char *const sctp_conntrack_names[] = { + "NONE", + "CLOSED", + "COOKIE_WAIT", + "COOKIE_ECHOED", + "ESTABLISHED", + "SHUTDOWN_SENT", + "SHUTDOWN_RECD", + "SHUTDOWN_ACK_SENT", +}; + +#define SECS * HZ +#define MINS * 60 SECS +#define HOURS * 60 MINS +#define DAYS * 24 HOURS + +static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = { + [SCTP_CONNTRACK_CLOSED] = 10 SECS, + [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS, + [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS, + [SCTP_CONNTRACK_ESTABLISHED] = 5 DAYS, + [SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000, + [SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000, + [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, +}; + +#define sNO SCTP_CONNTRACK_NONE +#define sCL SCTP_CONNTRACK_CLOSED +#define sCW SCTP_CONNTRACK_COOKIE_WAIT +#define sCE SCTP_CONNTRACK_COOKIE_ECHOED +#define sES SCTP_CONNTRACK_ESTABLISHED +#define sSS SCTP_CONNTRACK_SHUTDOWN_SENT +#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD +#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT +#define sIV SCTP_CONNTRACK_MAX + +/* + These are the descriptions of the states: + +NOTE: These state names are tantalizingly similar to the states of an +SCTP endpoint. But the interpretation of the states is a little different, +considering that these are the states of the connection and not of an end +point. Please note the subtleties. -Kiran + +NONE - Nothing so far. +COOKIE WAIT - We have seen an INIT chunk in the original direction, or also + an INIT_ACK chunk in the reply direction. +COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. +ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. +SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. +SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin. +SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite + to that of the SHUTDOWN chunk. +CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of + the SHUTDOWN chunk. Connection is closed. +*/ + +/* TODO + - I have assumed that the first INIT is in the original direction. + This messes things when an INIT comes in the reply direction in CLOSED + state. + - Check the error type in the reply dir before transitioning from +cookie echoed to closed. + - Sec 5.2.4 of RFC 2960 + - Multi Homing support. 
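The unit macros above expand to a trailing multiplier chain, so entries in the timeout table read naturally in seconds and days. A worked expansion, with the variable names purely illustrative:

	/* 5 DAYS expands to 5 * 24 * 60 * 60 * HZ, five days in jiffies */
	unsigned int established = 5 DAYS;

	/* 300 SECS / 1000 expands to 300 * HZ / 1000, i.e. 0.3 seconds */
	unsigned int shutdown_sent = 300 SECS / 1000;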
+*/ + +/* SCTP conntrack state transitions */ +static const u8 sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { + { +/* ORIGINAL */ +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ +/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA}, +/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA}, +/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA}, +/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA}, +/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't have Stale cookie*/ +/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */ +/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in orig dir */ +/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL} + }, + { +/* REPLY */ +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ +/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */ +/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA}, +/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA}, +/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA}, +/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA}, +/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in reply dir */ +/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA}, +/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL} + } +}; + +static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + const struct sctphdr *hp; + struct sctphdr _hdr; + + /* Actually only need first 8 bytes. */ + hp = skb_header_pointer(skb, dataoff, 8, &_hdr); + if (hp == NULL) + return false; + + tuple->src.u.sctp.port = hp->source; + tuple->dst.u.sctp.port = hp->dest; + return true; +} + +static bool sctp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u.sctp.port = orig->dst.u.sctp.port; + tuple->dst.u.sctp.port = orig->src.u.sctp.port; + return true; +} + +/* Print out the per-protocol part of the tuple. */ +static int sctp_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.sctp.port), + ntohs(tuple->dst.u.sctp.port)); +} + +/* Print out the private part of the conntrack. 
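+   This contributes only the SCTP state name plus a trailing space (e.g.
+   "ESTABLISHED ") to the conntrack listing in /proc; the state is read
+   under ct->lock so a concurrent transition is never observed half-way.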
*/ +static int sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) +{ + enum sctp_conntrack state; + + spin_lock_bh(&ct->lock); + state = ct->proto.sctp.state; + spin_unlock_bh(&ct->lock); + + return seq_printf(s, "%s ", sctp_conntrack_names[state]); +} + +#define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ +for ((offset) = (dataoff) + sizeof(sctp_sctphdr_t), (count) = 0; \ + (offset) < (skb)->len && \ + ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch))); \ + (offset) += (ntohs((sch)->length) + 3) & ~3, (count)++) + +/* Some validity checks to make sure the chunks are fine */ +static int do_basic_checks(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + unsigned long *map) +{ + u_int32_t offset, count; + sctp_chunkhdr_t _sch, *sch; + int flag; + + flag = 0; + + for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { + pr_debug("Chunk Num: %d Type: %d\n", count, sch->type); + + if (sch->type == SCTP_CID_INIT || + sch->type == SCTP_CID_INIT_ACK || + sch->type == SCTP_CID_SHUTDOWN_COMPLETE) + flag = 1; + + /* + * Cookie Ack/Echo chunks not the first OR + * Init / Init Ack / Shutdown compl chunks not the only chunks + * OR zero-length. + */ + if (((sch->type == SCTP_CID_COOKIE_ACK || + sch->type == SCTP_CID_COOKIE_ECHO || + flag) && + count != 0) || !sch->length) { + pr_debug("Basic checks failed\n"); + return 1; + } + + if (map) + set_bit(sch->type, map); + } + + pr_debug("Basic checks passed\n"); + return count == 0; +} + +static int sctp_new_state(enum ip_conntrack_dir dir, + enum sctp_conntrack cur_state, + int chunk_type) +{ + int i; + + pr_debug("Chunk type: %d\n", chunk_type); + + switch (chunk_type) { + case SCTP_CID_INIT: + pr_debug("SCTP_CID_INIT\n"); + i = 0; + break; + case SCTP_CID_INIT_ACK: + pr_debug("SCTP_CID_INIT_ACK\n"); + i = 1; + break; + case SCTP_CID_ABORT: + pr_debug("SCTP_CID_ABORT\n"); + i = 2; + break; + case SCTP_CID_SHUTDOWN: + pr_debug("SCTP_CID_SHUTDOWN\n"); + i = 3; + break; + case SCTP_CID_SHUTDOWN_ACK: + pr_debug("SCTP_CID_SHUTDOWN_ACK\n"); + i = 4; + break; + case SCTP_CID_ERROR: + pr_debug("SCTP_CID_ERROR\n"); + i = 5; + break; + case SCTP_CID_COOKIE_ECHO: + pr_debug("SCTP_CID_COOKIE_ECHO\n"); + i = 6; + break; + case SCTP_CID_COOKIE_ACK: + pr_debug("SCTP_CID_COOKIE_ACK\n"); + i = 7; + break; + case SCTP_CID_SHUTDOWN_COMPLETE: + pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n"); + i = 8; + break; + default: + /* Other chunks like DATA, SACK, HEARTBEAT and + its ACK do not cause a change in state */ + pr_debug("Unknown chunk type, Will stay in %s\n", + sctp_conntrack_names[cur_state]); + return cur_state; + } + + pr_debug("dir: %d cur_state: %s chunk_type: %d new_state: %s\n", + dir, sctp_conntrack_names[cur_state], chunk_type, + sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]); + + return sctp_conntracks[dir][i][cur_state]; +} + +/* Returns verdict for packet, or -NF_ACCEPT for invalid. 
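+   sctp_packet() walks every chunk in the packet via for_each_sctp_chunk();
+   note that the macro advances the offset by (ntohs(sch->length) + 3) & ~3,
+   i.e. each chunk is rounded up to the 4-byte padding SCTP requires (a
+   chunk of length 17 advances the offset by 20 bytes).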
*/ +static int sctp_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + u_int8_t pf, + unsigned int hooknum) +{ + enum sctp_conntrack new_state, old_state; + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + const struct sctphdr *sh; + struct sctphdr _sctph; + const struct sctp_chunkhdr *sch; + struct sctp_chunkhdr _sch; + u_int32_t offset, count; + unsigned long map[256 / sizeof(unsigned long)] = { 0 }; + + sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); + if (sh == NULL) + goto out; + + if (do_basic_checks(ct, skb, dataoff, map) != 0) + goto out; + + /* Check the verification tag (Sec 8.5) */ + if (!test_bit(SCTP_CID_INIT, map) && + !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) && + !test_bit(SCTP_CID_COOKIE_ECHO, map) && + !test_bit(SCTP_CID_ABORT, map) && + !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && + sh->vtag != ct->proto.sctp.vtag[dir]) { + pr_debug("Verification tag check failed\n"); + goto out; + } + + old_state = new_state = SCTP_CONNTRACK_NONE; + spin_lock_bh(&ct->lock); + for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { + /* Special cases of Verification tag check (Sec 8.5.1) */ + if (sch->type == SCTP_CID_INIT) { + /* Sec 8.5.1 (A) */ + if (sh->vtag != 0) + goto out_unlock; + } else if (sch->type == SCTP_CID_ABORT) { + /* Sec 8.5.1 (B) */ + if (sh->vtag != ct->proto.sctp.vtag[dir] && + sh->vtag != ct->proto.sctp.vtag[!dir]) + goto out_unlock; + } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { + /* Sec 8.5.1 (C) */ + if (sh->vtag != ct->proto.sctp.vtag[dir] && + sh->vtag != ct->proto.sctp.vtag[!dir] && + sch->flags & SCTP_CHUNK_FLAG_T) + goto out_unlock; + } else if (sch->type == SCTP_CID_COOKIE_ECHO) { + /* Sec 8.5.1 (D) */ + if (sh->vtag != ct->proto.sctp.vtag[dir]) + goto out_unlock; + } + + old_state = ct->proto.sctp.state; + new_state = sctp_new_state(dir, old_state, sch->type); + + /* Invalid */ + if (new_state == SCTP_CONNTRACK_MAX) { + pr_debug("nf_conntrack_sctp: Invalid dir=%i ctype=%u " + "conntrack=%u\n", + dir, sch->type, old_state); + goto out_unlock; + } + + /* If it is an INIT or an INIT ACK note down the vtag */ + if (sch->type == SCTP_CID_INIT || + sch->type == SCTP_CID_INIT_ACK) { + sctp_inithdr_t _inithdr, *ih; + + ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), + sizeof(_inithdr), &_inithdr); + if (ih == NULL) + goto out_unlock; + pr_debug("Setting vtag %x for dir %d\n", + ih->init_tag, !dir); + ct->proto.sctp.vtag[!dir] = ih->init_tag; + } + + ct->proto.sctp.state = new_state; + if (old_state != new_state) + nf_conntrack_event_cache(IPCT_PROTOINFO, ct); + } + spin_unlock_bh(&ct->lock); + + nf_ct_refresh_acct(ct, ctinfo, skb, sctp_timeouts[new_state]); + + if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED && + dir == IP_CT_DIR_REPLY && + new_state == SCTP_CONNTRACK_ESTABLISHED) { + pr_debug("Setting assured bit\n"); + set_bit(IPS_ASSURED_BIT, &ct->status); + nf_conntrack_event_cache(IPCT_ASSURED, ct); + } + + return NF_ACCEPT; + +out_unlock: + spin_unlock_bh(&ct->lock); +out: + return -NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. 
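+   For an entry created from an INIT chunk the vtag in the common header
+   must be zero (Sec 8.5.1 (A)) and the peer's verification tag is taken
+   from the chunk's init_tag, so later replies can be checked against
+   ct->proto.sctp.vtag[IP_CT_DIR_REPLY].  For other chunks (the OOTB case,
+   e.g. a lone SHUTDOWN_ACK) the header vtag itself is recorded instead.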
*/ +static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff) +{ + enum sctp_conntrack new_state; + const struct sctphdr *sh; + struct sctphdr _sctph; + const struct sctp_chunkhdr *sch; + struct sctp_chunkhdr _sch; + u_int32_t offset, count; + unsigned long map[256 / sizeof(unsigned long)] = { 0 }; + + sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); + if (sh == NULL) + return false; + + if (do_basic_checks(ct, skb, dataoff, map) != 0) + return false; + + /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ + if (test_bit(SCTP_CID_ABORT, map) || + test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) || + test_bit(SCTP_CID_COOKIE_ACK, map)) + return false; + + memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp)); + new_state = SCTP_CONNTRACK_MAX; + for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { + /* Don't need lock here: this conntrack not in circulation yet */ + new_state = sctp_new_state(IP_CT_DIR_ORIGINAL, + SCTP_CONNTRACK_NONE, sch->type); + + /* Invalid: delete conntrack */ + if (new_state == SCTP_CONNTRACK_NONE || + new_state == SCTP_CONNTRACK_MAX) { + pr_debug("nf_conntrack_sctp: invalid new deleting.\n"); + return false; + } + + /* Copy the vtag into the state info */ + if (sch->type == SCTP_CID_INIT) { + if (sh->vtag == 0) { + sctp_inithdr_t _inithdr, *ih; + + ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), + sizeof(_inithdr), &_inithdr); + if (ih == NULL) + return false; + + pr_debug("Setting vtag %x for new conn\n", + ih->init_tag); + + ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = + ih->init_tag; + } else { + /* Sec 8.5.1 (A) */ + return false; + } + } + /* If it is a shutdown ack OOTB packet, we expect a return + shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ + else { + pr_debug("Setting vtag %x for new conn OOTB\n", + sh->vtag); + ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; + } + + ct->proto.sctp.state = new_state; + } + + return true; +} + +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include +#include + +static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, + struct nf_conn *ct) +{ + struct nlattr *nest_parms; + + spin_lock_bh(&ct->lock); + nest_parms = nla_nest_start(skb, CTA_PROTOINFO_SCTP | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + NLA_PUT_U8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state); + + NLA_PUT_BE32(skb, + CTA_PROTOINFO_SCTP_VTAG_ORIGINAL, + ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]); + + NLA_PUT_BE32(skb, + CTA_PROTOINFO_SCTP_VTAG_REPLY, + ct->proto.sctp.vtag[IP_CT_DIR_REPLY]); + + spin_unlock_bh(&ct->lock); + + nla_nest_end(skb, nest_parms); + + return 0; + +nla_put_failure: + spin_unlock_bh(&ct->lock); + return -1; +} + +static const struct nla_policy sctp_nla_policy[CTA_PROTOINFO_SCTP_MAX+1] = { + [CTA_PROTOINFO_SCTP_STATE] = { .type = NLA_U8 }, + [CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] = { .type = NLA_U32 }, + [CTA_PROTOINFO_SCTP_VTAG_REPLY] = { .type = NLA_U32 }, +}; + +static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct) +{ + struct nlattr *attr = cda[CTA_PROTOINFO_SCTP]; + struct nlattr *tb[CTA_PROTOINFO_SCTP_MAX+1]; + int err; + + /* updates may not contain the internal protocol info, skip parsing */ + if (!attr) + return 0; + + err = nla_parse_nested(tb, + CTA_PROTOINFO_SCTP_MAX, + attr, + sctp_nla_policy); + if (err < 0) + return err; + + if (!tb[CTA_PROTOINFO_SCTP_STATE] || + !tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] || + !tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]) + return -EINVAL; + + 
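+	/* All three attributes are present: copy the state and both
+	 * verification tags back into the conntrack under ct->lock.  This
+	 * is the inverse of sctp_to_nlattr() above and is what allows
+	 * ctnetlink users (e.g. conntrackd state replication) to restore
+	 * SCTP entries.
+	 */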
spin_lock_bh(&ct->lock); + ct->proto.sctp.state = nla_get_u8(tb[CTA_PROTOINFO_SCTP_STATE]); + ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = + nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL]); + ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = + nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]); + spin_unlock_bh(&ct->lock); + + return 0; +} + +static int sctp_nlattr_size(void) +{ + return nla_total_size(0) /* CTA_PROTOINFO_SCTP */ + + nla_policy_len(sctp_nla_policy, CTA_PROTOINFO_SCTP_MAX + 1); +} +#endif + +#ifdef CONFIG_SYSCTL +static unsigned int sctp_sysctl_table_users; +static struct ctl_table_header *sctp_sysctl_header; +static struct ctl_table sctp_sysctl_table[] = { + { + .procname = "nf_conntrack_sctp_timeout_closed", + .data = &sctp_timeouts[SCTP_CONNTRACK_CLOSED], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_sctp_timeout_cookie_wait", + .data = &sctp_timeouts[SCTP_CONNTRACK_COOKIE_WAIT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_sctp_timeout_cookie_echoed", + .data = &sctp_timeouts[SCTP_CONNTRACK_COOKIE_ECHOED], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_sctp_timeout_established", + .data = &sctp_timeouts[SCTP_CONNTRACK_ESTABLISHED], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_sctp_timeout_shutdown_sent", + .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_sctp_timeout_shutdown_recd", + .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_sctp_timeout_shutdown_ack_sent", + .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; + +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT +static struct ctl_table sctp_compat_sysctl_table[] = { + { + .procname = "ip_conntrack_sctp_timeout_closed", + .data = &sctp_timeouts[SCTP_CONNTRACK_CLOSED], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_sctp_timeout_cookie_wait", + .data = &sctp_timeouts[SCTP_CONNTRACK_COOKIE_WAIT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_sctp_timeout_cookie_echoed", + .data = &sctp_timeouts[SCTP_CONNTRACK_COOKIE_ECHOED], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_sctp_timeout_established", + .data = &sctp_timeouts[SCTP_CONNTRACK_ESTABLISHED], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_sctp_timeout_shutdown_sent", + .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_sctp_timeout_shutdown_recd", + .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = 
"ip_conntrack_sctp_timeout_shutdown_ack_sent", + .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ +#endif + +static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { + .l3proto = PF_INET, + .l4proto = IPPROTO_SCTP, + .name = "sctp", + .pkt_to_tuple = sctp_pkt_to_tuple, + .invert_tuple = sctp_invert_tuple, + .print_tuple = sctp_print_tuple, + .print_conntrack = sctp_print_conntrack, + .packet = sctp_packet, + .new = sctp_new, + .me = THIS_MODULE, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .to_nlattr = sctp_to_nlattr, + .nlattr_size = sctp_nlattr_size, + .from_nlattr = nlattr_to_sctp, + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nla_policy = nf_ct_port_nla_policy, +#endif +#ifdef CONFIG_SYSCTL + .ctl_table_users = &sctp_sysctl_table_users, + .ctl_table_header = &sctp_sysctl_header, + .ctl_table = sctp_sysctl_table, +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + .ctl_compat_table = sctp_compat_sysctl_table, +#endif +#endif +}; + +static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = { + .l3proto = PF_INET6, + .l4proto = IPPROTO_SCTP, + .name = "sctp", + .pkt_to_tuple = sctp_pkt_to_tuple, + .invert_tuple = sctp_invert_tuple, + .print_tuple = sctp_print_tuple, + .print_conntrack = sctp_print_conntrack, + .packet = sctp_packet, + .new = sctp_new, + .me = THIS_MODULE, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .to_nlattr = sctp_to_nlattr, + .nlattr_size = sctp_nlattr_size, + .from_nlattr = nlattr_to_sctp, + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nla_policy = nf_ct_port_nla_policy, +#endif +#ifdef CONFIG_SYSCTL + .ctl_table_users = &sctp_sysctl_table_users, + .ctl_table_header = &sctp_sysctl_header, + .ctl_table = sctp_sysctl_table, +#endif +}; + +static int __init nf_conntrack_proto_sctp_init(void) +{ + int ret; + + ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_sctp4); + if (ret) { + pr_err("nf_conntrack_l4proto_sctp4: protocol register failed\n"); + goto out; + } + ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_sctp6); + if (ret) { + pr_err("nf_conntrack_l4proto_sctp6: protocol register failed\n"); + goto cleanup_sctp4; + } + + return ret; + + cleanup_sctp4: + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp4); + out: + return ret; +} + +static void __exit nf_conntrack_proto_sctp_fini(void) +{ + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp6); + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp4); +} + +module_init(nf_conntrack_proto_sctp_init); +module_exit(nf_conntrack_proto_sctp_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Kiran Kumar Immidi"); +MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP"); +MODULE_ALIAS("ip_conntrack_proto_sctp"); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c new file mode 100644 index 00000000..37bf9439 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -0,0 +1,1497 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms 
of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* "Be conservative in what you do, + be liberal in what you accept from others." + If it's non-zero, we mark only out of window RST segments as INVALID. */ +static int nf_ct_tcp_be_liberal __read_mostly = 0; + +/* If it is set to zero, we disable picking up already established + connections. */ +static int nf_ct_tcp_loose __read_mostly = 1; + +/* Max number of the retransmitted packets without receiving an (acceptable) + ACK from the destination. If this number is reached, a shorter timer + will be started. */ +static int nf_ct_tcp_max_retrans __read_mostly = 3; + + /* FIXME: Examine ipfilter's timeouts and conntrack transitions more + closely. They're more complex. --RR */ + +static const char *const tcp_conntrack_names[] = { + "NONE", + "SYN_SENT", + "SYN_RECV", + "ESTABLISHED", + "FIN_WAIT", + "CLOSE_WAIT", + "LAST_ACK", + "TIME_WAIT", + "CLOSE", + "SYN_SENT2", +}; + +#define SECS * HZ +#define MINS * 60 SECS +#define HOURS * 60 MINS +#define DAYS * 24 HOURS + +/* RFC1122 says the R2 limit should be at least 100 seconds. + Linux uses 15 packets as limit, which corresponds + to ~13-30min depending on RTO. */ +static unsigned int nf_ct_tcp_timeout_max_retrans __read_mostly = 5 MINS; +static unsigned int nf_ct_tcp_timeout_unacknowledged __read_mostly = 5 MINS; + +static unsigned int tcp_timeouts[TCP_CONNTRACK_MAX] __read_mostly = { + [TCP_CONNTRACK_SYN_SENT] = 2 MINS, + [TCP_CONNTRACK_SYN_RECV] = 60 SECS, + [TCP_CONNTRACK_ESTABLISHED] = 5 DAYS, + [TCP_CONNTRACK_FIN_WAIT] = 2 MINS, + [TCP_CONNTRACK_CLOSE_WAIT] = 60 SECS, + [TCP_CONNTRACK_LAST_ACK] = 30 SECS, + [TCP_CONNTRACK_TIME_WAIT] = 2 MINS, + [TCP_CONNTRACK_CLOSE] = 10 SECS, + [TCP_CONNTRACK_SYN_SENT2] = 2 MINS, +}; + +#define sNO TCP_CONNTRACK_NONE +#define sSS TCP_CONNTRACK_SYN_SENT +#define sSR TCP_CONNTRACK_SYN_RECV +#define sES TCP_CONNTRACK_ESTABLISHED +#define sFW TCP_CONNTRACK_FIN_WAIT +#define sCW TCP_CONNTRACK_CLOSE_WAIT +#define sLA TCP_CONNTRACK_LAST_ACK +#define sTW TCP_CONNTRACK_TIME_WAIT +#define sCL TCP_CONNTRACK_CLOSE +#define sS2 TCP_CONNTRACK_SYN_SENT2 +#define sIV TCP_CONNTRACK_MAX +#define sIG TCP_CONNTRACK_IGNORE + +/* What TCP flags are set from RST/SYN/FIN/ACK. */ +enum tcp_bit_set { + TCP_SYN_SET, + TCP_SYNACK_SET, + TCP_FIN_SET, + TCP_ACK_SET, + TCP_RST_SET, + TCP_NONE_SET, +}; + +/* + * The TCP state transition table needs a few words... + * + * We are the man in the middle. All the packets go through us + * but might get lost in transit to the destination. + * It is assumed that the destinations can't receive segments + * we haven't seen. + * + * The checked segment is in window, but our windows are *not* + * equivalent with the ones of the sender/receiver. We always + * try to guess the state of the current sender. 
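+ *
+ * (Concretely, one struct ip_ct_tcp_state is kept per direction,
+ * ct->proto.tcp.seen[IP_CT_DIR_ORIGINAL] and seen[IP_CT_DIR_REPLY], and
+ * tcp_in_window() checks every segment against both the sender's and the
+ * receiver's recorded window state.)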
+ * + * The meaning of the states are: + * + * NONE: initial state + * SYN_SENT: SYN-only packet seen + * SYN_SENT2: SYN-only packet seen from reply dir, simultaneous open + * SYN_RECV: SYN-ACK packet seen + * ESTABLISHED: ACK packet seen + * FIN_WAIT: FIN packet seen + * CLOSE_WAIT: ACK seen (after FIN) + * LAST_ACK: FIN seen (after FIN) + * TIME_WAIT: last ACK seen + * CLOSE: closed connection (RST) + * + * Packets marked as IGNORED (sIG): + * if they may be either invalid or valid + * and the receiver may send back a connection + * closing RST or a SYN/ACK. + * + * Packets marked as INVALID (sIV): + * if we regard them as truly invalid packets + */ +static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { + { +/* ORIGINAL */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 }, +/* + * sNO -> sSS Initialize a new connection + * sSS -> sSS Retransmitted SYN + * sS2 -> sS2 Late retransmitted SYN + * sSR -> sIG + * sES -> sIG Error: SYNs in window outside the SYN_SENT state + * are errors. Receiver will reply with RST + * and close the connection. + * Or we are not in sync and hold a dead connection. + * sFW -> sIG + * sCW -> sIG + * sLA -> sIG + * sTW -> sSS Reopened connection (RFC 1122). + * sCL -> sSS + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*synack*/ { sIV, sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, +/* + * sNO -> sIV Too late and no reason to do anything + * sSS -> sIV Client can't send SYN and then SYN/ACK + * sS2 -> sSR SYN/ACK sent to SYN2 in simultaneous open + * sSR -> sIG + * sES -> sIG Error: SYNs in window outside the SYN_SENT state + * are errors. Receiver will reply with RST + * and close the connection. + * Or we are not in sync and hold a dead connection. + * sFW -> sIG + * sCW -> sIG + * sLA -> sIG + * sTW -> sIG + * sCL -> sIG + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, +/* + * sNO -> sIV Too late and no reason to do anything... + * sSS -> sIV Client migth not send FIN in this state: + * we enforce waiting for a SYN/ACK reply first. + * sS2 -> sIV + * sSR -> sFW Close started. + * sES -> sFW + * sFW -> sLA FIN seen in both directions, waiting for + * the last ACK. + * Migth be a retransmitted FIN as well... + * sCW -> sLA + * sLA -> sLA Retransmitted FIN. Remain in the same state. + * sTW -> sTW + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/* + * sNO -> sES Assumed. + * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. + * sS2 -> sIV + * sSR -> sES Established state is reached. + * sES -> sES :-) + * sFW -> sCW Normal close request answered by ACK. + * sCW -> sCW + * sLA -> sTW Last ACK detected. + * sTW -> sTW Retransmitted last ACK. Remain in the same state. + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, +/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } + }, + { +/* REPLY */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*syn*/ { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sS2 }, +/* + * sNO -> sIV Never reached. + * sSS -> sS2 Simultaneous open + * sS2 -> sS2 Retransmitted simultaneous SYN + * sSR -> sIV Invalid SYN packets sent by the server + * sES -> sIV + * sFW -> sIV + * sCW -> sIV + * sLA -> sIV + * sTW -> sIV Reopened connection, but server may not do it. 
+ * sCL -> sIV + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, +/* + * sSS -> sSR Standard open. + * sS2 -> sSR Simultaneous open + * sSR -> sIG Retransmitted SYN/ACK, ignore it. + * sES -> sIG Late retransmitted SYN/ACK? + * sFW -> sIG Might be SYN/ACK answering ignored SYN + * sCW -> sIG + * sLA -> sIG + * sTW -> sIG + * sCL -> sIG + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, +/* + * sSS -> sIV Server might not send FIN in this state. + * sS2 -> sIV + * sSR -> sFW Close started. + * sES -> sFW + * sFW -> sLA FIN seen in both directions. + * sCW -> sLA + * sLA -> sLA Retransmitted FIN. + * sTW -> sTW + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG }, +/* + * sSS -> sIG Might be a half-open connection. + * sS2 -> sIG + * sSR -> sSR Might answer late resent SYN. + * sES -> sES :-) + * sFW -> sCW Normal close request answered by ACK. + * sCW -> sCW + * sLA -> sTW Last ACK detected. + * sTW -> sTW Retransmitted last ACK. + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, +/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } + } +}; + +static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + const struct tcphdr *hp; + struct tcphdr _hdr; + + /* Actually only need first 8 bytes. */ + hp = skb_header_pointer(skb, dataoff, 8, &_hdr); + if (hp == NULL) + return false; + + tuple->src.u.tcp.port = hp->source; + tuple->dst.u.tcp.port = hp->dest; + + return true; +} + +static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u.tcp.port = orig->dst.u.tcp.port; + tuple->dst.u.tcp.port = orig->src.u.tcp.port; + return true; +} + +/* Print out the per-protocol part of the tuple. */ +static int tcp_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.tcp.port), + ntohs(tuple->dst.u.tcp.port)); +} + +/* Print out the private part of the conntrack. */ +static int tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) +{ + enum tcp_conntrack state; + + spin_lock_bh(&ct->lock); + state = ct->proto.tcp.state; + spin_unlock_bh(&ct->lock); + + return seq_printf(s, "%s ", tcp_conntrack_names[state]); +} + +static unsigned int get_conntrack_index(const struct tcphdr *tcph) +{ + if (tcph->rst) return TCP_RST_SET; + else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET); + else if (tcph->fin) return TCP_FIN_SET; + else if (tcph->ack) return TCP_ACK_SET; + else return TCP_NONE_SET; +} + +/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering + in IP Filter' by Guido van Rooij. + + http://www.sane.nl/events/sane2000/papers.html + http://www.darkart.com/mirrors/www.obfuscation.org/ipf/ + + The boundaries and the conditions are changed according to RFC793: + the packet must intersect the window (i.e. segments may be + after the right or before the left edge) and thus receivers may ACK + segments after the right edge of the window. 
+ + td_maxend = max(sack + max(win,1)) seen in reply packets + td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets + td_maxwin += seq + len - sender.td_maxend + if seq + len > sender.td_maxend + td_end = max(seq + len) seen in sent packets + + I. Upper bound for valid data: seq <= sender.td_maxend + II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin + III. Upper bound for valid (s)ack: sack <= receiver.td_end + IV. Lower bound for valid (s)ack: sack >= receiver.td_end - MAXACKWINDOW + + where sack is the highest right edge of sack block found in the packet + or ack in the case of packet without SACK option. + + The upper bound limit for a valid (s)ack is not ignored - + we doesn't have to deal with fragments. +*/ + +static inline __u32 segment_seq_plus_len(__u32 seq, + size_t len, + unsigned int dataoff, + const struct tcphdr *tcph) +{ + /* XXX Should I use payload length field in IP/IPv6 header ? + * - YK */ + return (seq + len - dataoff - tcph->doff*4 + + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0)); +} + +/* Fixme: what about big packets? */ +#define MAXACKWINCONST 66000 +#define MAXACKWINDOW(sender) \ + ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \ + : MAXACKWINCONST) + +/* + * Simplified tcp_parse_options routine from tcp_input.c + */ +static void tcp_options(const struct sk_buff *skb, + unsigned int dataoff, + const struct tcphdr *tcph, + struct ip_ct_tcp_state *state) +{ + unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; + const unsigned char *ptr; + int length = (tcph->doff*4) - sizeof(struct tcphdr); + + if (!length) + return; + + ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), + length, buff); + BUG_ON(ptr == NULL); + + state->td_scale = + state->flags = 0; + + while (length > 0) { + int opcode=*ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize=*ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + break; /* don't parse partial options */ + + if (opcode == TCPOPT_SACK_PERM + && opsize == TCPOLEN_SACK_PERM) + state->flags |= IP_CT_TCP_FLAG_SACK_PERM; + else if (opcode == TCPOPT_WINDOW + && opsize == TCPOLEN_WINDOW) { + state->td_scale = *(u_int8_t *)ptr; + + if (state->td_scale > 14) { + /* See RFC1323 */ + state->td_scale = 14; + } + state->flags |= + IP_CT_TCP_FLAG_WINDOW_SCALE; + } + ptr += opsize - 2; + length -= opsize; + } + } +} + +static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, + const struct tcphdr *tcph, __u32 *sack) +{ + unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; + const unsigned char *ptr; + int length = (tcph->doff*4) - sizeof(struct tcphdr); + __u32 tmp; + + if (!length) + return; + + ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), + length, buff); + BUG_ON(ptr == NULL); + + /* Fast path for timestamp-only option */ + if (length == TCPOLEN_TSTAMP_ALIGNED*4 + && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24) + | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) + | TCPOLEN_TIMESTAMP)) + return; + + while (length > 0) { + int opcode = *ptr++; + int opsize, i; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + break; /* don't parse partial options */ + + if (opcode == TCPOPT_SACK + && opsize >= (TCPOLEN_SACK_BASE + + 
TCPOLEN_SACK_PERBLOCK) + && !((opsize - TCPOLEN_SACK_BASE) + % TCPOLEN_SACK_PERBLOCK)) { + for (i = 0; + i < (opsize - TCPOLEN_SACK_BASE); + i += TCPOLEN_SACK_PERBLOCK) { + tmp = get_unaligned_be32((__be32 *)(ptr+i)+1); + + if (after(tmp, *sack)) + *sack = tmp; + } + return; + } + ptr += opsize - 2; + length -= opsize; + } + } +} + +#ifdef CONFIG_NF_NAT_NEEDED +static inline s16 nat_offset(const struct nf_conn *ct, + enum ip_conntrack_dir dir, + u32 seq) +{ + typeof(nf_ct_nat_offset) get_offset = rcu_dereference(nf_ct_nat_offset); + + return get_offset != NULL ? get_offset(ct, dir, seq) : 0; +} +#define NAT_OFFSET(pf, ct, dir, seq) \ + (pf == NFPROTO_IPV4 ? nat_offset(ct, dir, seq) : 0) +#else +#define NAT_OFFSET(pf, ct, dir, seq) 0 +#endif + +static bool tcp_in_window(const struct nf_conn *ct, + struct ip_ct_tcp *state, + enum ip_conntrack_dir dir, + unsigned int index, + const struct sk_buff *skb, + unsigned int dataoff, + const struct tcphdr *tcph, + u_int8_t pf) +{ + struct net *net = nf_ct_net(ct); + struct ip_ct_tcp_state *sender = &state->seen[dir]; + struct ip_ct_tcp_state *receiver = &state->seen[!dir]; + const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; + __u32 seq, ack, sack, end, win, swin; + s16 receiver_offset; + bool res; + + /* + * Get the required data from the packet. + */ + seq = ntohl(tcph->seq); + ack = sack = ntohl(tcph->ack_seq); + win = ntohs(tcph->window); + end = segment_seq_plus_len(seq, skb->len, dataoff, tcph); + + if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM) + tcp_sack(skb, dataoff, tcph, &sack); + + /* Take into account NAT sequence number mangling */ + receiver_offset = NAT_OFFSET(pf, ct, !dir, ack - 1); + ack -= receiver_offset; + sack -= receiver_offset; + + pr_debug("tcp_in_window: START\n"); + pr_debug("tcp_in_window: "); + nf_ct_dump_tuple(tuple); + pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n", + seq, ack, receiver_offset, sack, receiver_offset, win, end); + pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + + if (sender->td_maxwin == 0) { + /* + * Initialize sender data. + */ + if (tcph->syn) { + /* + * SYN-ACK in reply to a SYN + * or SYN from reply direction in simultaneous open. + */ + sender->td_end = + sender->td_maxend = end; + sender->td_maxwin = (win == 0 ? 1 : win); + + tcp_options(skb, dataoff, tcph, sender); + /* + * RFC 1323: + * Both sides must send the Window Scale option + * to enable window scaling in either direction. + */ + if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE + && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) + sender->td_scale = + receiver->td_scale = 0; + if (!tcph->ack) + /* Simultaneous open */ + return true; + } else { + /* + * We are in the middle of a connection, + * its history is lost for us. + * Let's try to use the data from the packet. + */ + sender->td_end = end; + win <<= sender->td_scale; + sender->td_maxwin = (win == 0 ? 1 : win); + sender->td_maxend = end + sender->td_maxwin; + /* + * We haven't seen traffic in the other direction yet + * but we have to tweak window tracking to pass III + * and IV until that happens. 
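+			 * ("III" and "IV" are the (s)ack bounds listed in the
+			 * big comment above segment_seq_plus_len(); seeding
+			 * receiver->td_end and td_maxend with the incoming
+			 * (s)ack value makes both bounds hold trivially for
+			 * this first packet.)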
+ */ + if (receiver->td_maxwin == 0) + receiver->td_end = receiver->td_maxend = sack; + } + } else if (((state->state == TCP_CONNTRACK_SYN_SENT + && dir == IP_CT_DIR_ORIGINAL) + || (state->state == TCP_CONNTRACK_SYN_RECV + && dir == IP_CT_DIR_REPLY)) + && after(end, sender->td_end)) { + /* + * RFC 793: "if a TCP is reinitialized ... then it need + * not wait at all; it must only be sure to use sequence + * numbers larger than those recently used." + */ + sender->td_end = + sender->td_maxend = end; + sender->td_maxwin = (win == 0 ? 1 : win); + + tcp_options(skb, dataoff, tcph, sender); + } + + if (!(tcph->ack)) { + /* + * If there is no ACK, just pretend it was set and OK. + */ + ack = sack = receiver->td_end; + } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) == + (TCP_FLAG_ACK|TCP_FLAG_RST)) + && (ack == 0)) { + /* + * Broken TCP stacks, that set ACK in RST packets as well + * with zero ack value. + */ + ack = sack = receiver->td_end; + } + + if (seq == end + && (!tcph->rst + || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT))) + /* + * Packets contains no data: we assume it is valid + * and check the ack value only. + * However RST segments are always validated by their + * SEQ number, except when seq == 0 (reset sent answering + * SYN. + */ + seq = end = sender->td_end; + + pr_debug("tcp_in_window: "); + nf_ct_dump_tuple(tuple); + pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n", + seq, ack, receiver_offset, sack, receiver_offset, win, end); + pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + + pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n", + before(seq, sender->td_maxend + 1), + after(end, sender->td_end - receiver->td_maxwin - 1), + before(sack, receiver->td_end + 1), + after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)); + + if (before(seq, sender->td_maxend + 1) && + after(end, sender->td_end - receiver->td_maxwin - 1) && + before(sack, receiver->td_end + 1) && + after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) { + /* + * Take into account window scaling (RFC 1323). + */ + if (!tcph->syn) + win <<= sender->td_scale; + + /* + * Update sender data. + */ + swin = win + (sack - ack); + if (sender->td_maxwin < swin) + sender->td_maxwin = swin; + if (after(end, sender->td_end)) { + sender->td_end = end; + sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; + } + if (tcph->ack) { + if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) { + sender->td_maxack = ack; + sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET; + } else if (after(ack, sender->td_maxack)) + sender->td_maxack = ack; + } + + /* + * Update receiver data. + */ + if (receiver->td_maxwin != 0 && after(end, sender->td_maxend)) + receiver->td_maxwin += end - sender->td_maxend; + if (after(sack + win, receiver->td_maxend - 1)) { + receiver->td_maxend = sack + win; + if (win == 0) + receiver->td_maxend++; + } + if (ack == receiver->td_end) + receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; + + /* + * Check retransmissions. 
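+		 * (An ACK-classified segment that repeats the previous
+		 * seq/ack/end/win from the same direction bumps
+		 * state->retrans; once tcp_packet() sees retrans reach
+		 * nf_ct_tcp_max_retrans (3 by default) it caps the timeout
+		 * at nf_ct_tcp_timeout_max_retrans, 5 minutes by default.)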
+ */ + if (index == TCP_ACK_SET) { + if (state->last_dir == dir + && state->last_seq == seq + && state->last_ack == ack + && state->last_end == end + && state->last_win == win) + state->retrans++; + else { + state->last_dir = dir; + state->last_seq = seq; + state->last_ack = ack; + state->last_end = end; + state->last_win = win; + state->retrans = 0; + } + } + res = true; + } else { + res = false; + if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || + nf_ct_tcp_be_liberal) + res = true; + if (!res && LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: %s ", + before(seq, sender->td_maxend + 1) ? + after(end, sender->td_end - receiver->td_maxwin - 1) ? + before(sack, receiver->td_end + 1) ? + after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG" + : "ACK is under the lower bound (possible overly delayed ACK)" + : "ACK is over the upper bound (ACKed data not seen yet)" + : "SEQ is under the lower bound (already ACKed data retransmitted)" + : "SEQ is over the upper bound (over the window of the receiver)"); + } + + pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u " + "receiver end=%u maxend=%u maxwin=%u\n", + res, sender->td_end, sender->td_maxend, sender->td_maxwin, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin); + + return res; +} + +/* table of valid flag combinations - PUSH, ECE and CWR are always valid */ +static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK| + TCPHDR_URG) + 1] = +{ + [TCPHDR_SYN] = 1, + [TCPHDR_SYN|TCPHDR_URG] = 1, + [TCPHDR_SYN|TCPHDR_ACK] = 1, + [TCPHDR_RST] = 1, + [TCPHDR_RST|TCPHDR_ACK] = 1, + [TCPHDR_FIN|TCPHDR_ACK] = 1, + [TCPHDR_FIN|TCPHDR_ACK|TCPHDR_URG] = 1, + [TCPHDR_ACK] = 1, + [TCPHDR_ACK|TCPHDR_URG] = 1, +}; + +/* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ +static int tcp_error(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + u_int8_t pf, + unsigned int hooknum) +{ + const struct tcphdr *th; + struct tcphdr _tcph; + unsigned int tcplen = skb->len - dataoff; + u_int8_t tcpflags; + + /* Smaller that minimal TCP header? */ + th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); + if (th == NULL) { + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: short packet "); + return -NF_ACCEPT; + } + + /* Not whole TCP header or malformed packet */ + if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: truncated/malformed packet "); + return -NF_ACCEPT; + } + + /* Checksum invalid? Ignore. + * We skip checking packets on the outgoing path + * because the checksum is assumed to be correct. + */ + /* FIXME: Source route IP option packets --RR */ + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && + nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: bad TCP checksum "); + return -NF_ACCEPT; + } + + /* Check TCP flags. */ + tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH)); + if (!tcp_valid_flags[tcpflags]) { + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: invalid TCP flag combination "); + return -NF_ACCEPT; + } + + return NF_ACCEPT; +} + +/* Returns verdict for packet, or -1 for invalid. 
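+   ("-1" is -NF_ACCEPT.  tcp_packet() may additionally return NF_DROP, or
+   -NF_REPEAT when a new SYN tries to reopen a closed/aborted connection
+   and the stale entry could be killed, in which case the caller repeats
+   the lookup so the SYN starts a fresh conntrack.)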
*/ +static int tcp_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + u_int8_t pf, + unsigned int hooknum) +{ + struct net *net = nf_ct_net(ct); + struct nf_conntrack_tuple *tuple; + enum tcp_conntrack new_state, old_state; + enum ip_conntrack_dir dir; + const struct tcphdr *th; + struct tcphdr _tcph; + unsigned long timeout; + unsigned int index; + + th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); + BUG_ON(th == NULL); + + spin_lock_bh(&ct->lock); + old_state = ct->proto.tcp.state; + dir = CTINFO2DIR(ctinfo); + index = get_conntrack_index(th); + new_state = tcp_conntracks[dir][index][old_state]; + tuple = &ct->tuplehash[dir].tuple; + + switch (new_state) { + case TCP_CONNTRACK_SYN_SENT: + if (old_state < TCP_CONNTRACK_TIME_WAIT) + break; + /* RFC 1122: "When a connection is closed actively, + * it MUST linger in TIME-WAIT state for a time 2xMSL + * (Maximum Segment Lifetime). However, it MAY accept + * a new SYN from the remote TCP to reopen the connection + * directly from TIME-WAIT state, if..." + * We ignore the conditions because we are in the + * TIME-WAIT state anyway. + * + * Handle aborted connections: we and the server + * think there is an existing connection but the client + * aborts it and starts a new one. + */ + if (((ct->proto.tcp.seen[dir].flags + | ct->proto.tcp.seen[!dir].flags) + & IP_CT_TCP_FLAG_CLOSE_INIT) + || (ct->proto.tcp.last_dir == dir + && ct->proto.tcp.last_index == TCP_RST_SET)) { + /* Attempt to reopen a closed/aborted connection. + * Delete this connection and look up again. */ + spin_unlock_bh(&ct->lock); + + /* Only repeat if we can actually remove the timer. + * Destruction may already be in progress in process + * context and we must give it a chance to terminate. + */ + if (nf_ct_kill(ct)) + return -NF_REPEAT; + return NF_DROP; + } + /* Fall through */ + case TCP_CONNTRACK_IGNORE: + /* Ignored packets: + * + * Our connection entry may be out of sync, so ignore + * packets which may signal the real connection between + * the client and the server. + * + * a) SYN in ORIGINAL + * b) SYN/ACK in REPLY + * c) ACK in reply direction after initial SYN in original. + * + * If the ignored packet is invalid, the receiver will send + * a RST we'll catch below. + */ + if (index == TCP_SYNACK_SET + && ct->proto.tcp.last_index == TCP_SYN_SET + && ct->proto.tcp.last_dir != dir + && ntohl(th->ack_seq) == ct->proto.tcp.last_end) { + /* b) This SYN/ACK acknowledges a SYN that we earlier + * ignored as invalid. This means that the client and + * the server are both in sync, while the firewall is + * not. We get in sync from the previously annotated + * values. + */ + old_state = TCP_CONNTRACK_SYN_SENT; + new_state = TCP_CONNTRACK_SYN_RECV; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end = + ct->proto.tcp.last_end; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend = + ct->proto.tcp.last_end; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin = + ct->proto.tcp.last_win == 0 ? 
+ 1 : ct->proto.tcp.last_win; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale = + ct->proto.tcp.last_wscale; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags = + ct->proto.tcp.last_flags; + memset(&ct->proto.tcp.seen[dir], 0, + sizeof(struct ip_ct_tcp_state)); + break; + } + ct->proto.tcp.last_index = index; + ct->proto.tcp.last_dir = dir; + ct->proto.tcp.last_seq = ntohl(th->seq); + ct->proto.tcp.last_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); + ct->proto.tcp.last_win = ntohs(th->window); + + /* a) This is a SYN in ORIGINAL. The client and the server + * may be in sync but we are not. In that case, we annotate + * the TCP options and let the packet go through. If it is a + * valid SYN packet, the server will reply with a SYN/ACK, and + * then we'll get in sync. Otherwise, the server ignores it. */ + if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) { + struct ip_ct_tcp_state seen = {}; + + ct->proto.tcp.last_flags = + ct->proto.tcp.last_wscale = 0; + tcp_options(skb, dataoff, th, &seen); + if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) { + ct->proto.tcp.last_flags |= + IP_CT_TCP_FLAG_WINDOW_SCALE; + ct->proto.tcp.last_wscale = seen.td_scale; + } + if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) { + ct->proto.tcp.last_flags |= + IP_CT_TCP_FLAG_SACK_PERM; + } + } + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: invalid packet ignored "); + return NF_ACCEPT; + case TCP_CONNTRACK_MAX: + /* Invalid packet */ + pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", + dir, get_conntrack_index(th), old_state); + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: invalid state "); + return -NF_ACCEPT; + case TCP_CONNTRACK_CLOSE: + if (index == TCP_RST_SET + && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) + && before(ntohl(th->seq), ct->proto.tcp.seen[!dir].td_maxack)) { + /* Invalid RST */ + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: invalid RST "); + return -NF_ACCEPT; + } + if (index == TCP_RST_SET + && ((test_bit(IPS_SEEN_REPLY_BIT, &ct->status) + && ct->proto.tcp.last_index == TCP_SYN_SET) + || (!test_bit(IPS_ASSURED_BIT, &ct->status) + && ct->proto.tcp.last_index == TCP_ACK_SET)) + && ntohl(th->ack_seq) == ct->proto.tcp.last_end) { + /* RST sent to invalid SYN or ACK we had let through + * at a) and c) above: + * + * a) SYN was in window then + * c) we hold a half-open connection. + * + * Delete our connection entry. + * We skip window checking, because packet might ACK + * segments we ignored. */ + goto in_window; + } + /* Just fall through */ + default: + /* Keep compilers happy. */ + break; + } + + if (!tcp_in_window(ct, &ct->proto.tcp, dir, index, + skb, dataoff, th, pf)) { + spin_unlock_bh(&ct->lock); + return -NF_ACCEPT; + } + in_window: + /* From now on we have got in-window packets */ + ct->proto.tcp.last_index = index; + ct->proto.tcp.last_dir = dir; + + pr_debug("tcp_conntracks: "); + nf_ct_dump_tuple(tuple); + pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", + (th->syn ? 1 : 0), (th->ack ? 1 : 0), + (th->fin ? 1 : 0), (th->rst ? 
1 : 0), + old_state, new_state); + + ct->proto.tcp.state = new_state; + if (old_state != new_state + && new_state == TCP_CONNTRACK_FIN_WAIT) + ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; + + if (ct->proto.tcp.retrans >= nf_ct_tcp_max_retrans && + tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans) + timeout = nf_ct_tcp_timeout_max_retrans; + else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) & + IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED && + tcp_timeouts[new_state] > nf_ct_tcp_timeout_unacknowledged) + timeout = nf_ct_tcp_timeout_unacknowledged; + else + timeout = tcp_timeouts[new_state]; + spin_unlock_bh(&ct->lock); + + if (new_state != old_state) + nf_conntrack_event_cache(IPCT_PROTOINFO, ct); + + if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + /* If only reply is a RST, we can consider ourselves not to + have an established connection: this is a fairly common + problem case, so we can delete the conntrack + immediately. --RR */ + if (th->rst) { + nf_ct_kill_acct(ct, ctinfo, skb); + return NF_ACCEPT; + } + } else if (!test_bit(IPS_ASSURED_BIT, &ct->status) + && (old_state == TCP_CONNTRACK_SYN_RECV + || old_state == TCP_CONNTRACK_ESTABLISHED) + && new_state == TCP_CONNTRACK_ESTABLISHED) { + /* Set ASSURED if we see see valid ack in ESTABLISHED + after SYN_RECV or a valid answer for a picked up + connection. */ + set_bit(IPS_ASSURED_BIT, &ct->status); + nf_conntrack_event_cache(IPCT_ASSURED, ct); + } + nf_ct_refresh_acct(ct, ctinfo, skb, timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff) +{ + enum tcp_conntrack new_state; + const struct tcphdr *th; + struct tcphdr _tcph; + const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0]; + const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1]; + + th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); + BUG_ON(th == NULL); + + /* Don't need lock here: this conntrack not in circulation yet */ + new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE]; + + /* Invalid: delete conntrack */ + if (new_state >= TCP_CONNTRACK_MAX) { + pr_debug("nf_ct_tcp: invalid new deleting.\n"); + return false; + } + + if (new_state == TCP_CONNTRACK_SYN_SENT) { + memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); + /* SYN packet */ + ct->proto.tcp.seen[0].td_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, + dataoff, th); + ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (ct->proto.tcp.seen[0].td_maxwin == 0) + ct->proto.tcp.seen[0].td_maxwin = 1; + ct->proto.tcp.seen[0].td_maxend = + ct->proto.tcp.seen[0].td_end; + + tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]); + } else if (nf_ct_tcp_loose == 0) { + /* Don't try to pick up connections. */ + return false; + } else { + memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); + /* + * We are in the middle of a connection, + * its history is lost for us. + * Let's try to use the data from the packet. 
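+		 *
+		 * (A picked-up connection like this never saw the SYN, so the
+		 * window scale factor is unknown; that is why both directions
+		 * are marked IP_CT_TCP_FLAG_SACK_PERM |
+		 * IP_CT_TCP_FLAG_BE_LIBERAL below, making tcp_in_window()
+		 * accept segments even when the strict window checks fail.)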
+ */ + ct->proto.tcp.seen[0].td_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, + dataoff, th); + ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (ct->proto.tcp.seen[0].td_maxwin == 0) + ct->proto.tcp.seen[0].td_maxwin = 1; + ct->proto.tcp.seen[0].td_maxend = + ct->proto.tcp.seen[0].td_end + + ct->proto.tcp.seen[0].td_maxwin; + + /* We assume SACK and liberal window checking to handle + * window scaling */ + ct->proto.tcp.seen[0].flags = + ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM | + IP_CT_TCP_FLAG_BE_LIBERAL; + } + + /* tcp_packet will set them */ + ct->proto.tcp.last_index = TCP_NONE_SET; + + pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + return true; +} + +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include +#include + +static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, + struct nf_conn *ct) +{ + struct nlattr *nest_parms; + struct nf_ct_tcp_flags tmp = {}; + + spin_lock_bh(&ct->lock); + nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state); + + NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, + ct->proto.tcp.seen[0].td_scale); + + NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY, + ct->proto.tcp.seen[1].td_scale); + + tmp.flags = ct->proto.tcp.seen[0].flags; + NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL, + sizeof(struct nf_ct_tcp_flags), &tmp); + + tmp.flags = ct->proto.tcp.seen[1].flags; + NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY, + sizeof(struct nf_ct_tcp_flags), &tmp); + spin_unlock_bh(&ct->lock); + + nla_nest_end(skb, nest_parms); + + return 0; + +nla_put_failure: + spin_unlock_bh(&ct->lock); + return -1; +} + +static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = { + [CTA_PROTOINFO_TCP_STATE] = { .type = NLA_U8 }, + [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 }, + [CTA_PROTOINFO_TCP_WSCALE_REPLY] = { .type = NLA_U8 }, + [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL] = { .len = sizeof(struct nf_ct_tcp_flags) }, + [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .len = sizeof(struct nf_ct_tcp_flags) }, +}; + +static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) +{ + struct nlattr *pattr = cda[CTA_PROTOINFO_TCP]; + struct nlattr *tb[CTA_PROTOINFO_TCP_MAX+1]; + int err; + + /* updates could not contain anything about the private + * protocol info, in that case skip the parsing */ + if (!pattr) + return 0; + + err = nla_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, pattr, tcp_nla_policy); + if (err < 0) + return err; + + if (tb[CTA_PROTOINFO_TCP_STATE] && + nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX) + return -EINVAL; + + spin_lock_bh(&ct->lock); + if (tb[CTA_PROTOINFO_TCP_STATE]) + ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]); + + if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) { + struct nf_ct_tcp_flags *attr = + nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]); + ct->proto.tcp.seen[0].flags &= ~attr->mask; + ct->proto.tcp.seen[0].flags |= attr->flags & attr->mask; + } + + if (tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]) { + struct nf_ct_tcp_flags *attr = + nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]); + ct->proto.tcp.seen[1].flags &= ~attr->mask; + ct->proto.tcp.seen[1].flags |= attr->flags & attr->mask; + } + + if 
(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] && + tb[CTA_PROTOINFO_TCP_WSCALE_REPLY] && + ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE && + ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) { + ct->proto.tcp.seen[0].td_scale = + nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]); + ct->proto.tcp.seen[1].td_scale = + nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]); + } + spin_unlock_bh(&ct->lock); + + return 0; +} + +static int tcp_nlattr_size(void) +{ + return nla_total_size(0) /* CTA_PROTOINFO_TCP */ + + nla_policy_len(tcp_nla_policy, CTA_PROTOINFO_TCP_MAX + 1); +} + +static int tcp_nlattr_tuple_size(void) +{ + return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); +} +#endif + +#ifdef CONFIG_SYSCTL +static unsigned int tcp_sysctl_table_users; +static struct ctl_table_header *tcp_sysctl_header; +static struct ctl_table tcp_sysctl_table[] = { + { + .procname = "nf_conntrack_tcp_timeout_syn_sent", + .data = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_tcp_timeout_syn_recv", + .data = &tcp_timeouts[TCP_CONNTRACK_SYN_RECV], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_tcp_timeout_established", + .data = &tcp_timeouts[TCP_CONNTRACK_ESTABLISHED], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_tcp_timeout_fin_wait", + .data = &tcp_timeouts[TCP_CONNTRACK_FIN_WAIT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_tcp_timeout_close_wait", + .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_tcp_timeout_last_ack", + .data = &tcp_timeouts[TCP_CONNTRACK_LAST_ACK], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_tcp_timeout_time_wait", + .data = &tcp_timeouts[TCP_CONNTRACK_TIME_WAIT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_tcp_timeout_close", + .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_tcp_timeout_max_retrans", + .data = &nf_ct_tcp_timeout_max_retrans, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_tcp_timeout_unacknowledged", + .data = &nf_ct_tcp_timeout_unacknowledged, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_tcp_loose", + .data = &nf_ct_tcp_loose, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "nf_conntrack_tcp_be_liberal", + .data = &nf_ct_tcp_be_liberal, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "nf_conntrack_tcp_max_retrans", + .data = &nf_ct_tcp_max_retrans, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT +static struct ctl_table tcp_compat_sysctl_table[] = { + { + .procname = "ip_conntrack_tcp_timeout_syn_sent", + .data 
= &tcp_timeouts[TCP_CONNTRACK_SYN_SENT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_tcp_timeout_syn_sent2", + .data = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT2], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_tcp_timeout_syn_recv", + .data = &tcp_timeouts[TCP_CONNTRACK_SYN_RECV], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_tcp_timeout_established", + .data = &tcp_timeouts[TCP_CONNTRACK_ESTABLISHED], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_tcp_timeout_fin_wait", + .data = &tcp_timeouts[TCP_CONNTRACK_FIN_WAIT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_tcp_timeout_close_wait", + .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_tcp_timeout_last_ack", + .data = &tcp_timeouts[TCP_CONNTRACK_LAST_ACK], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_tcp_timeout_time_wait", + .data = &tcp_timeouts[TCP_CONNTRACK_TIME_WAIT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_tcp_timeout_close", + .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_tcp_timeout_max_retrans", + .data = &nf_ct_tcp_timeout_max_retrans, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_tcp_loose", + .data = &nf_ct_tcp_loose, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "ip_conntrack_tcp_be_liberal", + .data = &nf_ct_tcp_be_liberal, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "ip_conntrack_tcp_max_retrans", + .data = &nf_ct_tcp_max_retrans, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; +#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ +#endif /* CONFIG_SYSCTL */ + +struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly = +{ + .l3proto = PF_INET, + .l4proto = IPPROTO_TCP, + .name = "tcp", + .pkt_to_tuple = tcp_pkt_to_tuple, + .invert_tuple = tcp_invert_tuple, + .print_tuple = tcp_print_tuple, + .print_conntrack = tcp_print_conntrack, + .packet = tcp_packet, + .new = tcp_new, + .error = tcp_error, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .to_nlattr = tcp_to_nlattr, + .nlattr_size = tcp_nlattr_size, + .from_nlattr = nlattr_to_tcp, + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nlattr_tuple_size = tcp_nlattr_tuple_size, + .nla_policy = nf_ct_port_nla_policy, +#endif +#ifdef CONFIG_SYSCTL + .ctl_table_users = &tcp_sysctl_table_users, + .ctl_table_header = &tcp_sysctl_header, + .ctl_table = tcp_sysctl_table, +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + .ctl_compat_table = tcp_compat_sysctl_table, +#endif +#endif +}; +EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp4); + +struct 
nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly = +{ + .l3proto = PF_INET6, + .l4proto = IPPROTO_TCP, + .name = "tcp", + .pkt_to_tuple = tcp_pkt_to_tuple, + .invert_tuple = tcp_invert_tuple, + .print_tuple = tcp_print_tuple, + .print_conntrack = tcp_print_conntrack, + .packet = tcp_packet, + .new = tcp_new, + .error = tcp_error, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .to_nlattr = tcp_to_nlattr, + .nlattr_size = tcp_nlattr_size, + .from_nlattr = nlattr_to_tcp, + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nlattr_tuple_size = tcp_nlattr_tuple_size, + .nla_policy = nf_ct_port_nla_policy, +#endif +#ifdef CONFIG_SYSCTL + .ctl_table_users = &tcp_sysctl_table_users, + .ctl_table_header = &tcp_sysctl_header, + .ctl_table = tcp_sysctl_table, +#endif +}; +EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6); diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c new file mode 100644 index 00000000..8289088b --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -0,0 +1,231 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned int nf_ct_udp_timeout __read_mostly = 30*HZ; +static unsigned int nf_ct_udp_timeout_stream __read_mostly = 180*HZ; + +static bool udp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + const struct udphdr *hp; + struct udphdr _hdr; + + /* Actually only need first 8 bytes. */ + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return false; + + tuple->src.u.udp.port = hp->source; + tuple->dst.u.udp.port = hp->dest; + + return true; +} + +static bool udp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u.udp.port = orig->dst.u.udp.port; + tuple->dst.u.udp.port = orig->src.u.udp.port; + return true; +} + +/* Print out the per-protocol part of the tuple. */ +static int udp_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.udp.port), + ntohs(tuple->dst.u.udp.port)); +} + +/* Returns verdict for packet, and may modify conntracktype */ +static int udp_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + u_int8_t pf, + unsigned int hooknum) +{ + /* If we've seen traffic both ways, this is some kind of UDP + stream. Extend timeout. */ + if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udp_timeout_stream); + /* Also, more likely to be important, and not a probe */ + if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) + nf_conntrack_event_cache(IPCT_ASSURED, ct); + } else + nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udp_timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. 
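/*
 * Illustration (user-space sketch, not part of this patch): the timeout
 * policy that udp_packet() above applies -- a short 30 s timeout until
 * traffic has been seen in both directions, then the longer 180 s "stream"
 * timeout plus the ASSURED flag.  The constants mirror the
 * nf_ct_udp_timeout / nf_ct_udp_timeout_stream defaults declared above.
 */
#include <stdbool.h>
#include <stdio.h>

#define UDP_TIMEOUT_SECS        30
#define UDP_TIMEOUT_STREAM_SECS 180

/* hypothetical stand-in for the IPS_SEEN_REPLY status bit checked above */
static unsigned int udp_timeout_secs(bool seen_reply)
{
	return seen_reply ? UDP_TIMEOUT_STREAM_SECS : UDP_TIMEOUT_SECS;
}

int main(void)
{
	printf("unreplied UDP flow : %u s\n", udp_timeout_secs(false));
	printf("replied UDP flow   : %u s\n", udp_timeout_secs(true));
	return 0;
}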
*/ +static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff) +{ + return true; +} + +static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, + unsigned int dataoff, enum ip_conntrack_info *ctinfo, + u_int8_t pf, + unsigned int hooknum) +{ + unsigned int udplen = skb->len - dataoff; + const struct udphdr *hdr; + struct udphdr _hdr; + + /* Header is too small? */ + hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hdr == NULL) { + if (LOG_INVALID(net, IPPROTO_UDP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udp: short packet "); + return -NF_ACCEPT; + } + + /* Truncated/malformed packets */ + if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { + if (LOG_INVALID(net, IPPROTO_UDP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udp: truncated/malformed packet "); + return -NF_ACCEPT; + } + + /* Packet with no checksum */ + if (!hdr->check) + return NF_ACCEPT; + + /* Checksum invalid? Ignore. + * We skip checking packets on the outgoing path + * because the checksum is assumed to be correct. + * FIXME: Source route IP option packets --RR */ + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && + nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { + if (LOG_INVALID(net, IPPROTO_UDP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udp: bad UDP checksum "); + return -NF_ACCEPT; + } + + return NF_ACCEPT; +} + +#ifdef CONFIG_SYSCTL +static unsigned int udp_sysctl_table_users; +static struct ctl_table_header *udp_sysctl_header; +static struct ctl_table udp_sysctl_table[] = { + { + .procname = "nf_conntrack_udp_timeout", + .data = &nf_ct_udp_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_udp_timeout_stream", + .data = &nf_ct_udp_timeout_stream, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT +static struct ctl_table udp_compat_sysctl_table[] = { + { + .procname = "ip_conntrack_udp_timeout", + .data = &nf_ct_udp_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_udp_timeout_stream", + .data = &nf_ct_udp_timeout_stream, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ +#endif /* CONFIG_SYSCTL */ + +struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly = +{ + .l3proto = PF_INET, + .l4proto = IPPROTO_UDP, + .name = "udp", + .pkt_to_tuple = udp_pkt_to_tuple, + .invert_tuple = udp_invert_tuple, + .print_tuple = udp_print_tuple, + .packet = udp_packet, + .new = udp_new, + .error = udp_error, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nla_policy = nf_ct_port_nla_policy, +#endif +#ifdef CONFIG_SYSCTL + .ctl_table_users = &udp_sysctl_table_users, + .ctl_table_header = &udp_sysctl_header, + .ctl_table = udp_sysctl_table, +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + .ctl_compat_table = udp_compat_sysctl_table, +#endif +#endif +}; +EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4); + +struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly = +{ + .l3proto = PF_INET6, + .l4proto = IPPROTO_UDP, + .name = "udp", + 
.pkt_to_tuple = udp_pkt_to_tuple, + .invert_tuple = udp_invert_tuple, + .print_tuple = udp_print_tuple, + .packet = udp_packet, + .new = udp_new, + .error = udp_error, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nla_policy = nf_ct_port_nla_policy, +#endif +#ifdef CONFIG_SYSCTL + .ctl_table_users = &udp_sysctl_table_users, + .ctl_table_header = &udp_sysctl_header, + .ctl_table = udp_sysctl_table, +#endif +}; +EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6); diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c new file mode 100644 index 00000000..263b5a72 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -0,0 +1,240 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * (C) 2007 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static unsigned int nf_ct_udplite_timeout __read_mostly = 30*HZ; +static unsigned int nf_ct_udplite_timeout_stream __read_mostly = 180*HZ; + +static bool udplite_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + const struct udphdr *hp; + struct udphdr _hdr; + + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return false; + + tuple->src.u.udp.port = hp->source; + tuple->dst.u.udp.port = hp->dest; + return true; +} + +static bool udplite_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u.udp.port = orig->dst.u.udp.port; + tuple->dst.u.udp.port = orig->src.u.udp.port; + return true; +} + +/* Print out the per-protocol part of the tuple. */ +static int udplite_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.udp.port), + ntohs(tuple->dst.u.udp.port)); +} + +/* Returns verdict for packet, and may modify conntracktype */ +static int udplite_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + u_int8_t pf, + unsigned int hooknum) +{ + /* If we've seen traffic both ways, this is some kind of UDP + stream. Extend timeout. */ + if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_refresh_acct(ct, ctinfo, skb, + nf_ct_udplite_timeout_stream); + /* Also, more likely to be important, and not a probe */ + if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) + nf_conntrack_event_cache(IPCT_ASSURED, ct); + } else + nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udplite_timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. 
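/*
 * Illustration (user-space sketch, not kernel code): the UDP-Lite sanity
 * checks that udplite_error() below performs on the checksum coverage field
 * (RFC 3828) -- a coverage of 0 means "cover the whole datagram", any other
 * value must be at least the 8-byte header and no larger than the datagram,
 * and a missing checksum is always invalid for UDP-Lite.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define UDPLITE_HDR_LEN 8u

static bool udplite_coverage_ok(uint16_t cscov, uint16_t udplen, uint16_t check)
{
	if (check == 0)		/* UDP-Lite mandates checksums */
		return false;
	if (cscov == 0)		/* 0 == checksum covers the whole datagram */
		return true;
	return cscov >= UDPLITE_HDR_LEN && cscov <= udplen;
}

int main(void)
{
	printf("%d\n", udplite_coverage_ok(0, 100, 0xbeef));   /* 1: full coverage */
	printf("%d\n", udplite_coverage_ok(4, 100, 0xbeef));   /* 0: shorter than header */
	printf("%d\n", udplite_coverage_ok(200, 100, 0xbeef)); /* 0: beyond datagram */
	printf("%d\n", udplite_coverage_ok(64, 100, 0));       /* 0: checksum missing */
	return 0;
}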
*/ +static bool udplite_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff) +{ + return true; +} + +static int udplite_error(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + u_int8_t pf, + unsigned int hooknum) +{ + unsigned int udplen = skb->len - dataoff; + const struct udphdr *hdr; + struct udphdr _hdr; + unsigned int cscov; + + /* Header is too small? */ + hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hdr == NULL) { + if (LOG_INVALID(net, IPPROTO_UDPLITE)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udplite: short packet "); + return -NF_ACCEPT; + } + + cscov = ntohs(hdr->len); + if (cscov == 0) + cscov = udplen; + else if (cscov < sizeof(*hdr) || cscov > udplen) { + if (LOG_INVALID(net, IPPROTO_UDPLITE)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udplite: invalid checksum coverage "); + return -NF_ACCEPT; + } + + /* UDPLITE mandates checksums */ + if (!hdr->check) { + if (LOG_INVALID(net, IPPROTO_UDPLITE)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udplite: checksum missing "); + return -NF_ACCEPT; + } + + /* Checksum invalid? Ignore. */ + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && + nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP, + pf)) { + if (LOG_INVALID(net, IPPROTO_UDPLITE)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udplite: bad UDPLite checksum "); + return -NF_ACCEPT; + } + + return NF_ACCEPT; +} + +#ifdef CONFIG_SYSCTL +static unsigned int udplite_sysctl_table_users; +static struct ctl_table_header *udplite_sysctl_header; +static struct ctl_table udplite_sysctl_table[] = { + { + .procname = "nf_conntrack_udplite_timeout", + .data = &nf_ct_udplite_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_udplite_timeout_stream", + .data = &nf_ct_udplite_timeout_stream, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif /* CONFIG_SYSCTL */ + +static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly = +{ + .l3proto = PF_INET, + .l4proto = IPPROTO_UDPLITE, + .name = "udplite", + .pkt_to_tuple = udplite_pkt_to_tuple, + .invert_tuple = udplite_invert_tuple, + .print_tuple = udplite_print_tuple, + .packet = udplite_packet, + .new = udplite_new, + .error = udplite_error, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nla_policy = nf_ct_port_nla_policy, +#endif +#ifdef CONFIG_SYSCTL + .ctl_table_users = &udplite_sysctl_table_users, + .ctl_table_header = &udplite_sysctl_header, + .ctl_table = udplite_sysctl_table, +#endif +}; + +static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = +{ + .l3proto = PF_INET6, + .l4proto = IPPROTO_UDPLITE, + .name = "udplite", + .pkt_to_tuple = udplite_pkt_to_tuple, + .invert_tuple = udplite_invert_tuple, + .print_tuple = udplite_print_tuple, + .packet = udplite_packet, + .new = udplite_new, + .error = udplite_error, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nla_policy = nf_ct_port_nla_policy, +#endif +#ifdef 
CONFIG_SYSCTL + .ctl_table_users = &udplite_sysctl_table_users, + .ctl_table_header = &udplite_sysctl_header, + .ctl_table = udplite_sysctl_table, +#endif +}; + +static int __init nf_conntrack_proto_udplite_init(void) +{ + int err; + + err = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udplite4); + if (err < 0) + goto err1; + err = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udplite6); + if (err < 0) + goto err2; + return 0; +err2: + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udplite4); +err1: + return err; +} + +static void __exit nf_conntrack_proto_udplite_exit(void) +{ + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udplite6); + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udplite4); +} + +module_init(nf_conntrack_proto_udplite_init); +module_exit(nf_conntrack_proto_udplite_exit); + +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c new file mode 100644 index 00000000..8501823b --- /dev/null +++ b/net/netfilter/nf_conntrack_sane.c @@ -0,0 +1,238 @@ +/* SANE connection tracking helper + * (SANE = Scanner Access Now Easy) + * For documentation about the SANE network protocol see + * http://www.sane-project.org/html/doc015.html + */ + +/* Copyright (C) 2007 Red Hat, Inc. + * Author: Michal Schmidt + * Based on the FTP conntrack helper (net/netfilter/nf_conntrack_ftp.c): + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * (C) 2003,2004 USAGI/WIDE Project + * (C) 2003 Yasuyuki Kozakai @USAGI + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Michal Schmidt "); +MODULE_DESCRIPTION("SANE connection tracking helper"); +MODULE_ALIAS_NFCT_HELPER("sane"); + +static char *sane_buffer; + +static DEFINE_SPINLOCK(nf_sane_lock); + +#define MAX_PORTS 8 +static u_int16_t ports[MAX_PORTS]; +static unsigned int ports_c; +module_param_array(ports, ushort, &ports_c, 0400); + +struct sane_request { + __be32 RPC_code; +#define SANE_NET_START 7 /* RPC code */ + + __be32 handle; +}; + +struct sane_reply_net_start { + __be32 status; +#define SANE_STATUS_SUCCESS 0 + + __be16 zero; + __be16 port; + /* other fields aren't interesting for conntrack */ +}; + +static int help(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + unsigned int dataoff, datalen; + const struct tcphdr *th; + struct tcphdr _tcph; + void *sb_ptr; + int ret = NF_ACCEPT; + int dir = CTINFO2DIR(ctinfo); + struct nf_ct_sane_master *ct_sane_info; + struct nf_conntrack_expect *exp; + struct nf_conntrack_tuple *tuple; + struct sane_request *req; + struct sane_reply_net_start *reply; + + ct_sane_info = &nfct_help(ct)->help.ct_sane_info; + /* Until there's been traffic both ways, don't look in packets. */ + if (ctinfo != IP_CT_ESTABLISHED && + ctinfo != IP_CT_ESTABLISHED_REPLY) + return NF_ACCEPT; + + /* Not a full tcp header? */ + th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); + if (th == NULL) + return NF_ACCEPT; + + /* No data? 
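/*
 * Illustration (user-space sketch, not kernel code): the fixed-size
 * SANE_NET_START request defined by struct sane_request above, which the
 * helper watches for in the ORIGINAL direction -- two big-endian 32-bit
 * words, an RPC code (7) and a handle.  The sample bytes and the be32
 * reader are local to this sketch.
 */
#include <stdint.h>
#include <stdio.h>

#define SANE_NET_START 7

static uint32_t read_be32(const unsigned char *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

int main(void)
{
	/* hypothetical on-the-wire request: RPC code 7, handle 3 */
	const unsigned char pkt[8] = { 0, 0, 0, 7, 0, 0, 0, 3 };

	if (read_be32(pkt) == SANE_NET_START)
		printf("SANE_NET_START, handle %u -> watch the next reply\n",
		       read_be32(pkt + 4));
	return 0;
}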
*/ + dataoff = protoff + th->doff * 4; + if (dataoff >= skb->len) + return NF_ACCEPT; + + datalen = skb->len - dataoff; + + spin_lock_bh(&nf_sane_lock); + sb_ptr = skb_header_pointer(skb, dataoff, datalen, sane_buffer); + BUG_ON(sb_ptr == NULL); + + if (dir == IP_CT_DIR_ORIGINAL) { + if (datalen != sizeof(struct sane_request)) + goto out; + + req = sb_ptr; + if (req->RPC_code != htonl(SANE_NET_START)) { + /* Not an interesting command */ + ct_sane_info->state = SANE_STATE_NORMAL; + goto out; + } + + /* We're interested in the next reply */ + ct_sane_info->state = SANE_STATE_START_REQUESTED; + goto out; + } + + /* Is it a reply to an uninteresting command? */ + if (ct_sane_info->state != SANE_STATE_START_REQUESTED) + goto out; + + /* It's a reply to SANE_NET_START. */ + ct_sane_info->state = SANE_STATE_NORMAL; + + if (datalen < sizeof(struct sane_reply_net_start)) { + pr_debug("nf_ct_sane: NET_START reply too short\n"); + goto out; + } + + reply = sb_ptr; + if (reply->status != htonl(SANE_STATUS_SUCCESS)) { + /* saned refused the command */ + pr_debug("nf_ct_sane: unsuccessful SANE_STATUS = %u\n", + ntohl(reply->status)); + goto out; + } + + /* Invalid saned reply? Ignore it. */ + if (reply->zero != 0) + goto out; + + exp = nf_ct_expect_alloc(ct); + if (exp == NULL) { + ret = NF_DROP; + goto out; + } + + tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &tuple->src.u3, &tuple->dst.u3, + IPPROTO_TCP, NULL, &reply->port); + + pr_debug("nf_ct_sane: expect: "); + nf_ct_dump_tuple(&exp->tuple); + + /* Can't expect this? Best to drop packet now. */ + if (nf_ct_expect_related(exp) != 0) + ret = NF_DROP; + + nf_ct_expect_put(exp); + +out: + spin_unlock_bh(&nf_sane_lock); + return ret; +} + +static struct nf_conntrack_helper sane[MAX_PORTS][2] __read_mostly; +static char sane_names[MAX_PORTS][2][sizeof("sane-65535")] __read_mostly; + +static const struct nf_conntrack_expect_policy sane_exp_policy = { + .max_expected = 1, + .timeout = 5 * 60, +}; + +/* don't make this __exit, since it's called from __init ! 
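/*
 * Illustration (user-space sketch, not kernel code): the matching reply
 * direction handled by help() above.  A successful SANE_NET_START reply
 * carries status 0, a zero pad and the big-endian data port; that port is
 * what gets fed into nf_ct_expect_init() so the follow-up data connection
 * is expected.  The sample bytes are made up for illustration.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t rd_be32(const unsigned char *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

static uint16_t rd_be16(const unsigned char *p)
{
	return (uint16_t)((p[0] << 8) | p[1]);
}

int main(void)
{
	/* status = SANE_STATUS_SUCCESS, zero pad, data port 6566 (0x19a6) */
	const unsigned char reply[8] = { 0, 0, 0, 0, 0, 0, 0x19, 0xa6 };

	if (rd_be32(reply) == 0 && rd_be16(reply + 4) == 0)
		printf("expect TCP data connection to port %u\n",
		       rd_be16(reply + 6));
	return 0;
}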
*/ +static void nf_conntrack_sane_fini(void) +{ + int i, j; + + for (i = 0; i < ports_c; i++) { + for (j = 0; j < 2; j++) { + pr_debug("nf_ct_sane: unregistering helper for pf: %d " + "port: %d\n", + sane[i][j].tuple.src.l3num, ports[i]); + nf_conntrack_helper_unregister(&sane[i][j]); + } + } + + kfree(sane_buffer); +} + +static int __init nf_conntrack_sane_init(void) +{ + int i, j = -1, ret = 0; + char *tmpname; + + sane_buffer = kmalloc(65536, GFP_KERNEL); + if (!sane_buffer) + return -ENOMEM; + + if (ports_c == 0) + ports[ports_c++] = SANE_PORT; + + /* FIXME should be configurable whether IPv4 and IPv6 connections + are tracked or not - YK */ + for (i = 0; i < ports_c; i++) { + sane[i][0].tuple.src.l3num = PF_INET; + sane[i][1].tuple.src.l3num = PF_INET6; + for (j = 0; j < 2; j++) { + sane[i][j].tuple.src.u.tcp.port = htons(ports[i]); + sane[i][j].tuple.dst.protonum = IPPROTO_TCP; + sane[i][j].expect_policy = &sane_exp_policy; + sane[i][j].me = THIS_MODULE; + sane[i][j].help = help; + tmpname = &sane_names[i][j][0]; + if (ports[i] == SANE_PORT) + sprintf(tmpname, "sane"); + else + sprintf(tmpname, "sane-%d", ports[i]); + sane[i][j].name = tmpname; + + pr_debug("nf_ct_sane: registering helper for pf: %d " + "port: %d\n", + sane[i][j].tuple.src.l3num, ports[i]); + ret = nf_conntrack_helper_register(&sane[i][j]); + if (ret) { + printk(KERN_ERR "nf_ct_sane: failed to " + "register helper for pf: %d port: %d\n", + sane[i][j].tuple.src.l3num, ports[i]); + nf_conntrack_sane_fini(); + return ret; + } + } + } + + return 0; +} + +module_init(nf_conntrack_sane_init); +module_exit(nf_conntrack_sane_fini); diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c new file mode 100644 index 00000000..93faf6a3 --- /dev/null +++ b/net/netfilter/nf_conntrack_sip.c @@ -0,0 +1,1610 @@ +/* SIP extension for IP connection tracking. + * + * (C) 2005 by Christian Hentschel + * based on RR's ip_conntrack_ftp.c and other modules. + * (C) 2007 United Security Providers + * (C) 2007, 2008 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Christian Hentschel "); +MODULE_DESCRIPTION("SIP connection tracking helper"); +MODULE_ALIAS("ip_conntrack_sip"); +MODULE_ALIAS_NFCT_HELPER("sip"); + +#define MAX_PORTS 8 +static unsigned short ports[MAX_PORTS]; +static unsigned int ports_c; +module_param_array(ports, ushort, &ports_c, 0400); +MODULE_PARM_DESC(ports, "port numbers of SIP servers"); + +static unsigned int sip_timeout __read_mostly = SIP_TIMEOUT; +module_param(sip_timeout, uint, 0600); +MODULE_PARM_DESC(sip_timeout, "timeout for the master SIP session"); + +static int sip_direct_signalling __read_mostly = 1; +module_param(sip_direct_signalling, int, 0600); +MODULE_PARM_DESC(sip_direct_signalling, "expect incoming calls from registrar " + "only (default 1)"); + +static int sip_direct_media __read_mostly = 1; +module_param(sip_direct_media, int, 0600); +MODULE_PARM_DESC(sip_direct_media, "Expect Media streams between signalling " + "endpoints only (default 1)"); + +unsigned int (*nf_nat_sip_hook)(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, + unsigned int *datalen) __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_sip_hook); + +void (*nf_nat_sip_seq_adjust_hook)(struct sk_buff *skb, s16 off) __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_sip_seq_adjust_hook); + +unsigned int (*nf_nat_sip_expect_hook)(struct sk_buff *skb, + unsigned int dataoff, + const char **dptr, + unsigned int *datalen, + struct nf_conntrack_expect *exp, + unsigned int matchoff, + unsigned int matchlen) __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_sip_expect_hook); + +unsigned int (*nf_nat_sdp_addr_hook)(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, + unsigned int *datalen, + unsigned int sdpoff, + enum sdp_header_types type, + enum sdp_header_types term, + const union nf_inet_addr *addr) + __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_sdp_addr_hook); + +unsigned int (*nf_nat_sdp_port_hook)(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, + unsigned int *datalen, + unsigned int matchoff, + unsigned int matchlen, + u_int16_t port) __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_sdp_port_hook); + +unsigned int (*nf_nat_sdp_session_hook)(struct sk_buff *skb, + unsigned int dataoff, + const char **dptr, + unsigned int *datalen, + unsigned int sdpoff, + const union nf_inet_addr *addr) + __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_sdp_session_hook); + +unsigned int (*nf_nat_sdp_media_hook)(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, + unsigned int *datalen, + struct nf_conntrack_expect *rtp_exp, + struct nf_conntrack_expect *rtcp_exp, + unsigned int mediaoff, + unsigned int medialen, + union nf_inet_addr *rtp_addr) + __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_sdp_media_hook); + +static int string_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) +{ + int len = 0; + + while (dptr < limit && isalpha(*dptr)) { + dptr++; + len++; + } + return len; +} + +static int digits_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) +{ + int len = 0; + while (dptr < limit && isdigit(*dptr)) { + dptr++; + len++; + } + return len; +} + +static int iswordc(const char c) +{ + if (isalnum(c) || c == '!' || c == '"' || c == '%' || + (c >= '(' && c <= '/') || c == ':' || c == '<' || c == '>' || + c == '?' 
|| (c >= '[' && c <= ']') || c == '_' || c == '`' || + c == '{' || c == '}' || c == '~') + return 1; + return 0; +} + +static int word_len(const char *dptr, const char *limit) +{ + int len = 0; + while (dptr < limit && iswordc(*dptr)) { + dptr++; + len++; + } + return len; +} + +static int callid_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) +{ + int len, domain_len; + + len = word_len(dptr, limit); + dptr += len; + if (!len || dptr == limit || *dptr != '@') + return len; + dptr++; + len++; + + domain_len = word_len(dptr, limit); + if (!domain_len) + return 0; + return len + domain_len; +} + +/* get media type + port length */ +static int media_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) +{ + int len = string_len(ct, dptr, limit, shift); + + dptr += len; + if (dptr >= limit || *dptr != ' ') + return 0; + len++; + dptr++; + + return len + digits_len(ct, dptr, limit, shift); +} + +static int parse_addr(const struct nf_conn *ct, const char *cp, + const char **endp, union nf_inet_addr *addr, + const char *limit) +{ + const char *end; + int ret = 0; + + if (!ct) + return 0; + + memset(addr, 0, sizeof(*addr)); + switch (nf_ct_l3num(ct)) { + case AF_INET: + ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end); + break; + case AF_INET6: + ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end); + break; + default: + BUG(); + } + + if (ret == 0 || end == cp) + return 0; + if (endp) + *endp = end; + return 1; +} + +/* skip ip address. returns its length. */ +static int epaddr_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) +{ + union nf_inet_addr addr; + const char *aux = dptr; + + if (!parse_addr(ct, dptr, &dptr, &addr, limit)) { + pr_debug("ip: %s parse failed.!\n", dptr); + return 0; + } + + /* Port number */ + if (*dptr == ':') { + dptr++; + dptr += digits_len(ct, dptr, limit, shift); + } + return dptr - aux; +} + +/* get address length, skiping user info. */ +static int skp_epaddr_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) +{ + const char *start = dptr; + int s = *shift; + + /* Search for @, but stop at the end of the line. + * We are inside a sip: URI, so we don't need to worry about + * continuation lines. */ + while (dptr < limit && + *dptr != '@' && *dptr != '\r' && *dptr != '\n') { + (*shift)++; + dptr++; + } + + if (dptr < limit && *dptr == '@') { + dptr++; + (*shift)++; + } else { + dptr = start; + *shift = s; + } + + return epaddr_len(ct, dptr, limit, shift); +} + +/* Parse a SIP request line of the form: + * + * Request-Line = Method SP Request-URI SP SIP-Version CRLF + * + * and return the offset and length of the address contained in the Request-URI. 
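/*
 * Illustration (user-space sketch, not kernel code): what the Request-Line
 * parser declared below extracts -- the address and optional port inside the
 * "sip:" Request-URI, with the well-known SIP port 5060 as the default.
 * This only handles a plain IPv4 "sip:host:port" form and is no replacement
 * for the real parser.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	const char *line = "INVITE sip:192.168.0.10:5062 SIP/2.0";
	const char *uri = strstr(line, "sip:");
	char host[64];
	unsigned long port = 5060;	/* default SIP port */

	if (!uri)
		return 1;
	uri += strlen("sip:");

	size_t hlen = strcspn(uri, ": ");
	snprintf(host, sizeof(host), "%.*s", (int)hlen, uri);
	if (uri[hlen] == ':')
		port = strtoul(uri + hlen + 1, NULL, 10);

	printf("signalling endpoint %s, port %lu\n", host, port);
	return 0;
}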
+ */ +int ct_sip_parse_request(const struct nf_conn *ct, + const char *dptr, unsigned int datalen, + unsigned int *matchoff, unsigned int *matchlen, + union nf_inet_addr *addr, __be16 *port) +{ + const char *start = dptr, *limit = dptr + datalen, *end; + unsigned int mlen; + unsigned int p; + int shift = 0; + + /* Skip method and following whitespace */ + mlen = string_len(ct, dptr, limit, NULL); + if (!mlen) + return 0; + dptr += mlen; + if (++dptr >= limit) + return 0; + + /* Find SIP URI */ + for (; dptr < limit - strlen("sip:"); dptr++) { + if (*dptr == '\r' || *dptr == '\n') + return -1; + if (strnicmp(dptr, "sip:", strlen("sip:")) == 0) { + dptr += strlen("sip:"); + break; + } + } + if (!skp_epaddr_len(ct, dptr, limit, &shift)) + return 0; + dptr += shift; + + if (!parse_addr(ct, dptr, &end, addr, limit)) + return -1; + if (end < limit && *end == ':') { + end++; + p = simple_strtoul(end, (char **)&end, 10); + if (p < 1024 || p > 65535) + return -1; + *port = htons(p); + } else + *port = htons(SIP_PORT); + + if (end == dptr) + return 0; + *matchoff = dptr - start; + *matchlen = end - dptr; + return 1; +} +EXPORT_SYMBOL_GPL(ct_sip_parse_request); + +/* SIP header parsing: SIP headers are located at the beginning of a line, but + * may span several lines, in which case the continuation lines begin with a + * whitespace character. RFC 2543 allows lines to be terminated with CR, LF or + * CRLF, RFC 3261 allows only CRLF, we support both. + * + * Headers are followed by (optionally) whitespace, a colon, again (optionally) + * whitespace and the values. Whitespace in this context means any amount of + * tabs, spaces and continuation lines, which are treated as a single whitespace + * character. + * + * Some headers may appear multiple times. A comma separated list of values is + * equivalent to multiple headers. + */ +static const struct sip_header ct_sip_hdrs[] = { + [SIP_HDR_CSEQ] = SIP_HDR("CSeq", NULL, NULL, digits_len), + [SIP_HDR_FROM] = SIP_HDR("From", "f", "sip:", skp_epaddr_len), + [SIP_HDR_TO] = SIP_HDR("To", "t", "sip:", skp_epaddr_len), + [SIP_HDR_CONTACT] = SIP_HDR("Contact", "m", "sip:", skp_epaddr_len), + [SIP_HDR_VIA_UDP] = SIP_HDR("Via", "v", "UDP ", epaddr_len), + [SIP_HDR_VIA_TCP] = SIP_HDR("Via", "v", "TCP ", epaddr_len), + [SIP_HDR_EXPIRES] = SIP_HDR("Expires", NULL, NULL, digits_len), + [SIP_HDR_CONTENT_LENGTH] = SIP_HDR("Content-Length", "l", NULL, digits_len), + [SIP_HDR_CALL_ID] = SIP_HDR("Call-Id", "i", NULL, callid_len), +}; + +static const char *sip_follow_continuation(const char *dptr, const char *limit) +{ + /* Walk past newline */ + if (++dptr >= limit) + return NULL; + + /* Skip '\n' in CR LF */ + if (*(dptr - 1) == '\r' && *dptr == '\n') { + if (++dptr >= limit) + return NULL; + } + + /* Continuation line? 
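/*
 * Illustration (user-space sketch, not kernel code): the ct_sip_hdrs[] table
 * above pairs each full SIP header name with its compact form ("From"/"f",
 * "To"/"t", "Contact"/"m", "Via"/"v", "Content-Length"/"l", "Call-Id"/"i").
 * The same mapping, outside the kernel parser:
 */
#include <stdio.h>
#include <string.h>
#include <strings.h>

static const struct { const char *name, *compact; } sip_hdr_names[] = {
	{ "From",           "f" },
	{ "To",             "t" },
	{ "Contact",        "m" },
	{ "Via",            "v" },
	{ "Content-Length", "l" },
	{ "Call-Id",        "i" },
};

static const char *expand_compact(const char *c)
{
	for (size_t i = 0; i < sizeof(sip_hdr_names) / sizeof(sip_hdr_names[0]); i++)
		if (!strcasecmp(c, sip_hdr_names[i].compact))
			return sip_hdr_names[i].name;
	return NULL;
}

int main(void)
{
	printf("'v' is the compact form of '%s'\n", expand_compact("v"));
	return 0;
}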
*/ + if (*dptr != ' ' && *dptr != '\t') + return NULL; + + /* skip leading whitespace */ + for (; dptr < limit; dptr++) { + if (*dptr != ' ' && *dptr != '\t') + break; + } + return dptr; +} + +static const char *sip_skip_whitespace(const char *dptr, const char *limit) +{ + for (; dptr < limit; dptr++) { + if (*dptr == ' ') + continue; + if (*dptr != '\r' && *dptr != '\n') + break; + dptr = sip_follow_continuation(dptr, limit); + if (dptr == NULL) + return NULL; + } + return dptr; +} + +/* Search within a SIP header value, dealing with continuation lines */ +static const char *ct_sip_header_search(const char *dptr, const char *limit, + const char *needle, unsigned int len) +{ + for (limit -= len; dptr < limit; dptr++) { + if (*dptr == '\r' || *dptr == '\n') { + dptr = sip_follow_continuation(dptr, limit); + if (dptr == NULL) + break; + continue; + } + + if (strnicmp(dptr, needle, len) == 0) + return dptr; + } + return NULL; +} + +int ct_sip_get_header(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + enum sip_header_types type, + unsigned int *matchoff, unsigned int *matchlen) +{ + const struct sip_header *hdr = &ct_sip_hdrs[type]; + const char *start = dptr, *limit = dptr + datalen; + int shift = 0; + + for (dptr += dataoff; dptr < limit; dptr++) { + /* Find beginning of line */ + if (*dptr != '\r' && *dptr != '\n') + continue; + if (++dptr >= limit) + break; + if (*(dptr - 1) == '\r' && *dptr == '\n') { + if (++dptr >= limit) + break; + } + + /* Skip continuation lines */ + if (*dptr == ' ' || *dptr == '\t') + continue; + + /* Find header. Compact headers must be followed by a + * non-alphabetic character to avoid mismatches. */ + if (limit - dptr >= hdr->len && + strnicmp(dptr, hdr->name, hdr->len) == 0) + dptr += hdr->len; + else if (hdr->cname && limit - dptr >= hdr->clen + 1 && + strnicmp(dptr, hdr->cname, hdr->clen) == 0 && + !isalpha(*(dptr + hdr->clen))) + dptr += hdr->clen; + else + continue; + + /* Find and skip colon */ + dptr = sip_skip_whitespace(dptr, limit); + if (dptr == NULL) + break; + if (*dptr != ':' || ++dptr >= limit) + break; + + /* Skip whitespace after colon */ + dptr = sip_skip_whitespace(dptr, limit); + if (dptr == NULL) + break; + + *matchoff = dptr - start; + if (hdr->search) { + dptr = ct_sip_header_search(dptr, limit, hdr->search, + hdr->slen); + if (!dptr) + return -1; + dptr += hdr->slen; + } + + *matchlen = hdr->match_len(ct, dptr, limit, &shift); + if (!*matchlen) + return -1; + *matchoff = dptr - start + shift; + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(ct_sip_get_header); + +/* Get next header field in a list of comma separated values */ +static int ct_sip_next_header(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + enum sip_header_types type, + unsigned int *matchoff, unsigned int *matchlen) +{ + const struct sip_header *hdr = &ct_sip_hdrs[type]; + const char *start = dptr, *limit = dptr + datalen; + int shift = 0; + + dptr += dataoff; + + dptr = ct_sip_header_search(dptr, limit, ",", strlen(",")); + if (!dptr) + return 0; + + dptr = ct_sip_header_search(dptr, limit, hdr->search, hdr->slen); + if (!dptr) + return 0; + dptr += hdr->slen; + + *matchoff = dptr - start; + *matchlen = hdr->match_len(ct, dptr, limit, &shift); + if (!*matchlen) + return -1; + *matchoff += shift; + return 1; +} + +/* Walk through headers until a parsable one is found or no header of the + * given type is left. 
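/*
 * Illustration (user-space sketch, not kernel code): a simplified version of
 * the lookup that ct_sip_get_header() above performs -- find a header at the
 * beginning of a line by its full or compact name, skip optional whitespace
 * and the colon, and return a pointer to the value.  Folded continuation
 * lines and the per-header value parsers are deliberately left out.
 */
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>

static const char *find_hdr_value(const char *msg, const char *name,
				  const char *compact)
{
	for (const char *line = msg; line; ) {
		const char *p = NULL;

		if (!strncasecmp(line, name, strlen(name)))
			p = line + strlen(name);
		else if (compact && !strncasecmp(line, compact, strlen(compact)) &&
			 !isalpha((unsigned char)line[strlen(compact)]))
			p = line + strlen(compact);

		if (p) {
			while (*p == ' ' || *p == '\t')
				p++;
			if (*p == ':') {
				p++;
				while (*p == ' ' || *p == '\t')
					p++;
				return p;
			}
		}
		line = strstr(line, "\r\n");
		if (line)
			line += 2;
	}
	return NULL;
}

int main(void)
{
	const char *msg =
		"REGISTER sip:example.invalid SIP/2.0\r\n"
		"v: SIP/2.0/UDP 10.0.0.1:5060\r\n"
		"CSeq: 1 REGISTER\r\n\r\n";

	printf("Via value: %.25s\n", find_hdr_value(msg, "Via", "v"));
	return 0;
}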
*/ +static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + enum sip_header_types type, int *in_header, + unsigned int *matchoff, unsigned int *matchlen) +{ + int ret; + + if (in_header && *in_header) { + while (1) { + ret = ct_sip_next_header(ct, dptr, dataoff, datalen, + type, matchoff, matchlen); + if (ret > 0) + return ret; + if (ret == 0) + break; + dataoff += *matchoff; + } + *in_header = 0; + } + + while (1) { + ret = ct_sip_get_header(ct, dptr, dataoff, datalen, + type, matchoff, matchlen); + if (ret > 0) + break; + if (ret == 0) + return ret; + dataoff += *matchoff; + } + + if (in_header) + *in_header = 1; + return 1; +} + +/* Locate a SIP header, parse the URI and return the offset and length of + * the address as well as the address and port themselves. A stream of + * headers can be parsed by handing in a non-NULL datalen and in_header + * pointer. + */ +int ct_sip_parse_header_uri(const struct nf_conn *ct, const char *dptr, + unsigned int *dataoff, unsigned int datalen, + enum sip_header_types type, int *in_header, + unsigned int *matchoff, unsigned int *matchlen, + union nf_inet_addr *addr, __be16 *port) +{ + const char *c, *limit = dptr + datalen; + unsigned int p; + int ret; + + ret = ct_sip_walk_headers(ct, dptr, dataoff ? *dataoff : 0, datalen, + type, in_header, matchoff, matchlen); + WARN_ON(ret < 0); + if (ret == 0) + return ret; + + if (!parse_addr(ct, dptr + *matchoff, &c, addr, limit)) + return -1; + if (*c == ':') { + c++; + p = simple_strtoul(c, (char **)&c, 10); + if (p < 1024 || p > 65535) + return -1; + *port = htons(p); + } else + *port = htons(SIP_PORT); + + if (dataoff) + *dataoff = c - dptr; + return 1; +} +EXPORT_SYMBOL_GPL(ct_sip_parse_header_uri); + +static int ct_sip_parse_param(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + const char *name, + unsigned int *matchoff, unsigned int *matchlen) +{ + const char *limit = dptr + datalen; + const char *start; + const char *end; + + limit = ct_sip_header_search(dptr + dataoff, limit, ",", strlen(",")); + if (!limit) + limit = dptr + datalen; + + start = ct_sip_header_search(dptr + dataoff, limit, name, strlen(name)); + if (!start) + return 0; + start += strlen(name); + + end = ct_sip_header_search(start, limit, ";", strlen(";")); + if (!end) + end = limit; + + *matchoff = start - dptr; + *matchlen = end - start; + return 1; +} + +/* Parse address from header parameter and return address, offset and length */ +int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + const char *name, + unsigned int *matchoff, unsigned int *matchlen, + union nf_inet_addr *addr) +{ + const char *limit = dptr + datalen; + const char *start, *end; + + limit = ct_sip_header_search(dptr + dataoff, limit, ",", strlen(",")); + if (!limit) + limit = dptr + datalen; + + start = ct_sip_header_search(dptr + dataoff, limit, name, strlen(name)); + if (!start) + return 0; + + start += strlen(name); + if (!parse_addr(ct, start, &end, addr, limit)) + return 0; + *matchoff = start - dptr; + *matchlen = end - start; + return 1; +} +EXPORT_SYMBOL_GPL(ct_sip_parse_address_param); + +/* Parse numerical header parameter and return value, offset and length */ +int ct_sip_parse_numerical_param(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + const char *name, + unsigned int *matchoff, unsigned int *matchlen, + unsigned int *val) +{ + 
const char *limit = dptr + datalen; + const char *start; + char *end; + + limit = ct_sip_header_search(dptr + dataoff, limit, ",", strlen(",")); + if (!limit) + limit = dptr + datalen; + + start = ct_sip_header_search(dptr + dataoff, limit, name, strlen(name)); + if (!start) + return 0; + + start += strlen(name); + *val = simple_strtoul(start, &end, 0); + if (start == end) + return 0; + if (matchoff && matchlen) { + *matchoff = start - dptr; + *matchlen = end - start; + } + return 1; +} +EXPORT_SYMBOL_GPL(ct_sip_parse_numerical_param); + +static int ct_sip_parse_transport(struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + u8 *proto) +{ + unsigned int matchoff, matchlen; + + if (ct_sip_parse_param(ct, dptr, dataoff, datalen, "transport=", + &matchoff, &matchlen)) { + if (!strnicmp(dptr + matchoff, "TCP", strlen("TCP"))) + *proto = IPPROTO_TCP; + else if (!strnicmp(dptr + matchoff, "UDP", strlen("UDP"))) + *proto = IPPROTO_UDP; + else + return 0; + + if (*proto != nf_ct_protonum(ct)) + return 0; + } else + *proto = nf_ct_protonum(ct); + + return 1; +} + +/* SDP header parsing: a SDP session description contains an ordered set of + * headers, starting with a section containing general session parameters, + * optionally followed by multiple media descriptions. + * + * SDP headers always start at the beginning of a line. According to RFC 2327: + * "The sequence CRLF (0x0d0a) is used to end a record, although parsers should + * be tolerant and also accept records terminated with a single newline + * character". We handle both cases. + */ +static const struct sip_header ct_sdp_hdrs[] = { + [SDP_HDR_VERSION] = SDP_HDR("v=", NULL, digits_len), + [SDP_HDR_OWNER_IP4] = SDP_HDR("o=", "IN IP4 ", epaddr_len), + [SDP_HDR_CONNECTION_IP4] = SDP_HDR("c=", "IN IP4 ", epaddr_len), + [SDP_HDR_OWNER_IP6] = SDP_HDR("o=", "IN IP6 ", epaddr_len), + [SDP_HDR_CONNECTION_IP6] = SDP_HDR("c=", "IN IP6 ", epaddr_len), + [SDP_HDR_MEDIA] = SDP_HDR("m=", NULL, media_len), +}; + +/* Linear string search within SDP header values */ +static const char *ct_sdp_header_search(const char *dptr, const char *limit, + const char *needle, unsigned int len) +{ + for (limit -= len; dptr < limit; dptr++) { + if (*dptr == '\r' || *dptr == '\n') + break; + if (strncmp(dptr, needle, len) == 0) + return dptr; + } + return NULL; +} + +/* Locate a SDP header (optionally a substring within the header value), + * optionally stopping at the first occurrence of the term header, parse + * it and return the offset and length of the data we're interested in. 
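/*
 * Illustration (user-space sketch, not kernel code): the two SDP fields the
 * ct_sdp_hdrs[] table above is built around -- the connection address
 * ("c=IN IP4 ...") and the media port ("m=audio <port> ...").  The sample
 * session description is invented; the real parsing is done by
 * ct_sip_get_sdp_header() and process_sdp() below.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	const char *sdp =
		"v=0\r\n"
		"o=user 1 1 IN IP4 10.0.0.2\r\n"
		"c=IN IP4 10.0.0.2\r\n"
		"m=audio 30000 RTP/AVP 0\r\n";
	char addr[64] = "";
	unsigned long port = 0;

	const char *c = strstr(sdp, "c=IN IP4 ");
	if (c)
		sscanf(c + strlen("c=IN IP4 "), "%63[0-9.]", addr);

	const char *m = strstr(sdp, "m=audio ");
	if (m)
		port = strtoul(m + strlen("m=audio "), NULL, 10);

	printf("media stream expected at %s, RTP port %lu\n", addr, port);
	return 0;
}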
+ */ +int ct_sip_get_sdp_header(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + enum sdp_header_types type, + enum sdp_header_types term, + unsigned int *matchoff, unsigned int *matchlen) +{ + const struct sip_header *hdr = &ct_sdp_hdrs[type]; + const struct sip_header *thdr = &ct_sdp_hdrs[term]; + const char *start = dptr, *limit = dptr + datalen; + int shift = 0; + + for (dptr += dataoff; dptr < limit; dptr++) { + /* Find beginning of line */ + if (*dptr != '\r' && *dptr != '\n') + continue; + if (++dptr >= limit) + break; + if (*(dptr - 1) == '\r' && *dptr == '\n') { + if (++dptr >= limit) + break; + } + + if (term != SDP_HDR_UNSPEC && + limit - dptr >= thdr->len && + strnicmp(dptr, thdr->name, thdr->len) == 0) + break; + else if (limit - dptr >= hdr->len && + strnicmp(dptr, hdr->name, hdr->len) == 0) + dptr += hdr->len; + else + continue; + + *matchoff = dptr - start; + if (hdr->search) { + dptr = ct_sdp_header_search(dptr, limit, hdr->search, + hdr->slen); + if (!dptr) + return -1; + dptr += hdr->slen; + } + + *matchlen = hdr->match_len(ct, dptr, limit, &shift); + if (!*matchlen) + return -1; + *matchoff = dptr - start + shift; + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(ct_sip_get_sdp_header); + +static int ct_sip_parse_sdp_addr(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + enum sdp_header_types type, + enum sdp_header_types term, + unsigned int *matchoff, unsigned int *matchlen, + union nf_inet_addr *addr) +{ + int ret; + + ret = ct_sip_get_sdp_header(ct, dptr, dataoff, datalen, type, term, + matchoff, matchlen); + if (ret <= 0) + return ret; + + if (!parse_addr(ct, dptr + *matchoff, NULL, addr, + dptr + *matchoff + *matchlen)) + return -1; + return 1; +} + +static int refresh_signalling_expectation(struct nf_conn *ct, + union nf_inet_addr *addr, + u8 proto, __be16 port, + unsigned int expires) +{ + struct nf_conn_help *help = nfct_help(ct); + struct nf_conntrack_expect *exp; + struct hlist_node *n, *next; + int found = 0; + + spin_lock_bh(&nf_conntrack_lock); + hlist_for_each_entry_safe(exp, n, next, &help->expectations, lnode) { + if (exp->class != SIP_EXPECT_SIGNALLING || + !nf_inet_addr_cmp(&exp->tuple.dst.u3, addr) || + exp->tuple.dst.protonum != proto || + exp->tuple.dst.u.udp.port != port) + continue; + if (!del_timer(&exp->timeout)) + continue; + exp->flags &= ~NF_CT_EXPECT_INACTIVE; + exp->timeout.expires = jiffies + expires * HZ; + add_timer(&exp->timeout); + found = 1; + break; + } + spin_unlock_bh(&nf_conntrack_lock); + return found; +} + +static void flush_expectations(struct nf_conn *ct, bool media) +{ + struct nf_conn_help *help = nfct_help(ct); + struct nf_conntrack_expect *exp; + struct hlist_node *n, *next; + + spin_lock_bh(&nf_conntrack_lock); + hlist_for_each_entry_safe(exp, n, next, &help->expectations, lnode) { + if ((exp->class != SIP_EXPECT_SIGNALLING) ^ media) + continue; + if (!del_timer(&exp->timeout)) + continue; + nf_ct_unlink_expect(exp); + nf_ct_expect_put(exp); + if (!media) + break; + } + spin_unlock_bh(&nf_conntrack_lock); +} + +static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + union nf_inet_addr *daddr, __be16 port, + enum sip_expectation_classes class, + unsigned int mediaoff, unsigned int medialen) +{ + struct nf_conntrack_expect *exp, *rtp_exp, *rtcp_exp; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct net *net = nf_ct_net(ct); + enum 
ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + union nf_inet_addr *saddr; + struct nf_conntrack_tuple tuple; + int direct_rtp = 0, skip_expect = 0, ret = NF_DROP; + u_int16_t base_port; + __be16 rtp_port, rtcp_port; + typeof(nf_nat_sdp_port_hook) nf_nat_sdp_port; + typeof(nf_nat_sdp_media_hook) nf_nat_sdp_media; + + saddr = NULL; + if (sip_direct_media) { + if (!nf_inet_addr_cmp(daddr, &ct->tuplehash[dir].tuple.src.u3)) + return NF_ACCEPT; + saddr = &ct->tuplehash[!dir].tuple.src.u3; + } + + /* We need to check whether the registration exists before attempting + * to register it since we can see the same media description multiple + * times on different connections in case multiple endpoints receive + * the same call. + * + * RTP optimization: if we find a matching media channel expectation + * and both the expectation and this connection are SNATed, we assume + * both sides can reach each other directly and use the final + * destination address from the expectation. We still need to keep + * the NATed expectations for media that might arrive from the + * outside, and additionally need to expect the direct RTP stream + * in case it passes through us even without NAT. + */ + memset(&tuple, 0, sizeof(tuple)); + if (saddr) + tuple.src.u3 = *saddr; + tuple.src.l3num = nf_ct_l3num(ct); + tuple.dst.protonum = IPPROTO_UDP; + tuple.dst.u3 = *daddr; + tuple.dst.u.udp.port = port; + + rcu_read_lock(); + do { + exp = __nf_ct_expect_find(net, nf_ct_zone(ct), &tuple); + + if (!exp || exp->master == ct || + nfct_help(exp->master)->helper != nfct_help(ct)->helper || + exp->class != class) + break; +#ifdef CONFIG_NF_NAT_NEEDED + if (exp->tuple.src.l3num == AF_INET && !direct_rtp && + (exp->saved_ip != exp->tuple.dst.u3.ip || + exp->saved_proto.udp.port != exp->tuple.dst.u.udp.port) && + ct->status & IPS_NAT_MASK) { + daddr->ip = exp->saved_ip; + tuple.dst.u3.ip = exp->saved_ip; + tuple.dst.u.udp.port = exp->saved_proto.udp.port; + direct_rtp = 1; + } else +#endif + skip_expect = 1; + } while (!skip_expect); + rcu_read_unlock(); + + base_port = ntohs(tuple.dst.u.udp.port) & ~1; + rtp_port = htons(base_port); + rtcp_port = htons(base_port + 1); + + if (direct_rtp) { + nf_nat_sdp_port = rcu_dereference(nf_nat_sdp_port_hook); + if (nf_nat_sdp_port && + !nf_nat_sdp_port(skb, dataoff, dptr, datalen, + mediaoff, medialen, ntohs(rtp_port))) + goto err1; + } + + if (skip_expect) + return NF_ACCEPT; + + rtp_exp = nf_ct_expect_alloc(ct); + if (rtp_exp == NULL) + goto err1; + nf_ct_expect_init(rtp_exp, class, nf_ct_l3num(ct), saddr, daddr, + IPPROTO_UDP, NULL, &rtp_port); + + rtcp_exp = nf_ct_expect_alloc(ct); + if (rtcp_exp == NULL) + goto err2; + nf_ct_expect_init(rtcp_exp, class, nf_ct_l3num(ct), saddr, daddr, + IPPROTO_UDP, NULL, &rtcp_port); + + nf_nat_sdp_media = rcu_dereference(nf_nat_sdp_media_hook); + if (nf_nat_sdp_media && ct->status & IPS_NAT_MASK && !direct_rtp) + ret = nf_nat_sdp_media(skb, dataoff, dptr, datalen, + rtp_exp, rtcp_exp, + mediaoff, medialen, daddr); + else { + if (nf_ct_expect_related(rtp_exp) == 0) { + if (nf_ct_expect_related(rtcp_exp) != 0) + nf_ct_unexpect_related(rtp_exp); + else + ret = NF_ACCEPT; + } + } + nf_ct_expect_put(rtcp_exp); +err2: + nf_ct_expect_put(rtp_exp); +err1: + return ret; +} + +static const struct sdp_media_type sdp_media_types[] = { + SDP_MEDIA_TYPE("audio ", SIP_EXPECT_AUDIO), + SDP_MEDIA_TYPE("video ", SIP_EXPECT_VIDEO), + SDP_MEDIA_TYPE("image ", SIP_EXPECT_IMAGE), +}; + +static const struct sdp_media_type *sdp_media_type(const char *dptr, + unsigned int 
matchoff, + unsigned int matchlen) +{ + const struct sdp_media_type *t; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(sdp_media_types); i++) { + t = &sdp_media_types[i]; + if (matchlen < t->len || + strncmp(dptr + matchoff, t->name, t->len)) + continue; + return t; + } + return NULL; +} + +static int process_sdp(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int cseq) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + unsigned int matchoff, matchlen; + unsigned int mediaoff, medialen; + unsigned int sdpoff; + unsigned int caddr_len, maddr_len; + unsigned int i; + union nf_inet_addr caddr, maddr, rtp_addr; + unsigned int port; + enum sdp_header_types c_hdr; + const struct sdp_media_type *t; + int ret = NF_ACCEPT; + typeof(nf_nat_sdp_addr_hook) nf_nat_sdp_addr; + typeof(nf_nat_sdp_session_hook) nf_nat_sdp_session; + + nf_nat_sdp_addr = rcu_dereference(nf_nat_sdp_addr_hook); + c_hdr = nf_ct_l3num(ct) == AF_INET ? SDP_HDR_CONNECTION_IP4 : + SDP_HDR_CONNECTION_IP6; + + /* Find beginning of session description */ + if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen, + SDP_HDR_VERSION, SDP_HDR_UNSPEC, + &matchoff, &matchlen) <= 0) + return NF_ACCEPT; + sdpoff = matchoff; + + /* The connection information is contained in the session description + * and/or once per media description. The first media description marks + * the end of the session description. */ + caddr_len = 0; + if (ct_sip_parse_sdp_addr(ct, *dptr, sdpoff, *datalen, + c_hdr, SDP_HDR_MEDIA, + &matchoff, &matchlen, &caddr) > 0) + caddr_len = matchlen; + + mediaoff = sdpoff; + for (i = 0; i < ARRAY_SIZE(sdp_media_types); ) { + if (ct_sip_get_sdp_header(ct, *dptr, mediaoff, *datalen, + SDP_HDR_MEDIA, SDP_HDR_UNSPEC, + &mediaoff, &medialen) <= 0) + break; + + /* Get media type and port number. A media port value of zero + * indicates an inactive stream. */ + t = sdp_media_type(*dptr, mediaoff, medialen); + if (!t) { + mediaoff += medialen; + continue; + } + mediaoff += t->len; + medialen -= t->len; + + port = simple_strtoul(*dptr + mediaoff, NULL, 10); + if (port == 0) + continue; + if (port < 1024 || port > 65535) + return NF_DROP; + + /* The media description overrides the session description. 
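/*
 * Illustration (user-space sketch, not kernel code): set_expected_rtp_rtcp()
 * above derives the RTP/RTCP pair from the media port by clearing the low
 * bit -- RTP on the even port, RTCP on the next odd one.  The same
 * arithmetic stand-alone:
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t media_port = 30001;		/* sample port from an m= line */
	uint16_t rtp  = media_port & ~1;	/* 30000 */
	uint16_t rtcp = rtp + 1;		/* 30001 */

	printf("m=... %u -> expect RTP %u, RTCP %u\n", media_port, rtp, rtcp);
	return 0;
}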
*/ + maddr_len = 0; + if (ct_sip_parse_sdp_addr(ct, *dptr, mediaoff, *datalen, + c_hdr, SDP_HDR_MEDIA, + &matchoff, &matchlen, &maddr) > 0) { + maddr_len = matchlen; + memcpy(&rtp_addr, &maddr, sizeof(rtp_addr)); + } else if (caddr_len) + memcpy(&rtp_addr, &caddr, sizeof(rtp_addr)); + else + return NF_DROP; + + ret = set_expected_rtp_rtcp(skb, dataoff, dptr, datalen, + &rtp_addr, htons(port), t->class, + mediaoff, medialen); + if (ret != NF_ACCEPT) + return ret; + + /* Update media connection address if present */ + if (maddr_len && nf_nat_sdp_addr && ct->status & IPS_NAT_MASK) { + ret = nf_nat_sdp_addr(skb, dataoff, dptr, datalen, + mediaoff, c_hdr, SDP_HDR_MEDIA, + &rtp_addr); + if (ret != NF_ACCEPT) + return ret; + } + i++; + } + + /* Update session connection and owner addresses */ + nf_nat_sdp_session = rcu_dereference(nf_nat_sdp_session_hook); + if (nf_nat_sdp_session && ct->status & IPS_NAT_MASK) + ret = nf_nat_sdp_session(skb, dataoff, dptr, datalen, sdpoff, + &rtp_addr); + + return ret; +} +static int process_invite_response(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int cseq, unsigned int code) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conn_help *help = nfct_help(ct); + + if ((code >= 100 && code <= 199) || + (code >= 200 && code <= 299)) + return process_sdp(skb, dataoff, dptr, datalen, cseq); + else if (help->help.ct_sip_info.invite_cseq == cseq) + flush_expectations(ct, true); + return NF_ACCEPT; +} + +static int process_update_response(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int cseq, unsigned int code) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conn_help *help = nfct_help(ct); + + if ((code >= 100 && code <= 199) || + (code >= 200 && code <= 299)) + return process_sdp(skb, dataoff, dptr, datalen, cseq); + else if (help->help.ct_sip_info.invite_cseq == cseq) + flush_expectations(ct, true); + return NF_ACCEPT; +} + +static int process_prack_response(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int cseq, unsigned int code) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conn_help *help = nfct_help(ct); + + if ((code >= 100 && code <= 199) || + (code >= 200 && code <= 299)) + return process_sdp(skb, dataoff, dptr, datalen, cseq); + else if (help->help.ct_sip_info.invite_cseq == cseq) + flush_expectations(ct, true); + return NF_ACCEPT; +} + +static int process_invite_request(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int cseq) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conn_help *help = nfct_help(ct); + unsigned int ret; + + flush_expectations(ct, true); + ret = process_sdp(skb, dataoff, dptr, datalen, cseq); + if (ret == NF_ACCEPT) + help->help.ct_sip_info.invite_cseq = cseq; + return ret; +} + +static int process_bye_request(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int cseq) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + flush_expectations(ct, true); + return NF_ACCEPT; +} + +/* Parse a REGISTER request and create a permanent expectation for incoming + * signalling connections. 
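/*
 * Illustration (user-space sketch, not kernel code): the INVITE/UPDATE/PRACK
 * response handlers above share one rule -- provisional (1xx) and success
 * (2xx) responses may carry SDP and keep the media expectations, while any
 * other final response to the pending request (matching CSeq) tears them
 * down.
 */
#include <stdio.h>

enum sip_action { PROCESS_SDP, FLUSH_MEDIA, IGNORE };

static enum sip_action invite_response_action(unsigned int code,
					      unsigned int cseq,
					      unsigned int invite_cseq)
{
	if (code >= 100 && code <= 299)
		return PROCESS_SDP;
	if (cseq == invite_cseq)
		return FLUSH_MEDIA;
	return IGNORE;
}

int main(void)
{
	printf("183 -> %d (process SDP)\n", invite_response_action(183, 1, 1));
	printf("486 -> %d (flush media expectations)\n",
	       invite_response_action(486, 1, 1));
	return 0;
}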
The expectation is marked inactive and is activated + * when receiving a response indicating success from the registrar. + */ +static int process_register_request(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int cseq) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conn_help *help = nfct_help(ct); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + unsigned int matchoff, matchlen; + struct nf_conntrack_expect *exp; + union nf_inet_addr *saddr, daddr; + __be16 port; + u8 proto; + unsigned int expires = 0; + int ret; + typeof(nf_nat_sip_expect_hook) nf_nat_sip_expect; + + /* Expected connections can not register again. */ + if (ct->status & IPS_EXPECTED) + return NF_ACCEPT; + + /* We must check the expiration time: a value of zero signals the + * registrar to release the binding. We'll remove our expectation + * when receiving the new bindings in the response, but we don't + * want to create new ones. + * + * The expiration time may be contained in Expires: header, the + * Contact: header parameters or the URI parameters. + */ + if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_EXPIRES, + &matchoff, &matchlen) > 0) + expires = simple_strtoul(*dptr + matchoff, NULL, 10); + + ret = ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, + SIP_HDR_CONTACT, NULL, + &matchoff, &matchlen, &daddr, &port); + if (ret < 0) + return NF_DROP; + else if (ret == 0) + return NF_ACCEPT; + + /* We don't support third-party registrations */ + if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, &daddr)) + return NF_ACCEPT; + + if (ct_sip_parse_transport(ct, *dptr, matchoff + matchlen, *datalen, + &proto) == 0) + return NF_ACCEPT; + + if (ct_sip_parse_numerical_param(ct, *dptr, + matchoff + matchlen, *datalen, + "expires=", NULL, NULL, &expires) < 0) + return NF_DROP; + + if (expires == 0) { + ret = NF_ACCEPT; + goto store_cseq; + } + + exp = nf_ct_expect_alloc(ct); + if (!exp) + return NF_DROP; + + saddr = NULL; + if (sip_direct_signalling) + saddr = &ct->tuplehash[!dir].tuple.src.u3; + + nf_ct_expect_init(exp, SIP_EXPECT_SIGNALLING, nf_ct_l3num(ct), + saddr, &daddr, proto, NULL, &port); + exp->timeout.expires = sip_timeout * HZ; + exp->helper = nfct_help(ct)->helper; + exp->flags = NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE; + + nf_nat_sip_expect = rcu_dereference(nf_nat_sip_expect_hook); + if (nf_nat_sip_expect && ct->status & IPS_NAT_MASK) + ret = nf_nat_sip_expect(skb, dataoff, dptr, datalen, exp, + matchoff, matchlen); + else { + if (nf_ct_expect_related(exp) != 0) + ret = NF_DROP; + else + ret = NF_ACCEPT; + } + nf_ct_expect_put(exp); + +store_cseq: + if (ret == NF_ACCEPT) + help->help.ct_sip_info.register_cseq = cseq; + return ret; +} + +static int process_register_response(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int cseq, unsigned int code) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conn_help *help = nfct_help(ct); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + union nf_inet_addr addr; + __be16 port; + u8 proto; + unsigned int matchoff, matchlen, coff = 0; + unsigned int expires = 0; + int in_contact = 0, ret; + + /* According to RFC 3261, "UAs MUST NOT send a new registration until + * they have received a final response from the registrar for the + * previous one or the previous REGISTER request has timed out". 
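/*
 * Illustration (user-space sketch, not kernel code): how the REGISTER
 * handlers above resolve the binding lifetime -- an "expires=" parameter on
 * the Contact URI overrides the Expires: header, and a lifetime of 0 is an
 * un-registration, so no new signalling expectation is created.  Values in
 * main() are made up.
 */
#include <stdbool.h>
#include <stdio.h>

static unsigned int binding_lifetime(bool have_hdr, unsigned int hdr_expires,
				     bool have_param, unsigned int param_expires)
{
	unsigned int expires = have_hdr ? hdr_expires : 0;

	if (have_param)			/* Contact parameter wins */
		expires = param_expires;
	return expires;
}

int main(void)
{
	printf("Expires: 3600, no param  -> %u s\n",
	       binding_lifetime(true, 3600, false, 0));
	printf("Expires: 3600, expires=0 -> %u s (unregister)\n",
	       binding_lifetime(true, 3600, true, 0));
	return 0;
}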
+ * + * However, some servers fail to detect retransmissions and send late + * responses, so we store the sequence number of the last valid + * request and compare it here. + */ + if (help->help.ct_sip_info.register_cseq != cseq) + return NF_ACCEPT; + + if (code >= 100 && code <= 199) + return NF_ACCEPT; + if (code < 200 || code > 299) + goto flush; + + if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_EXPIRES, + &matchoff, &matchlen) > 0) + expires = simple_strtoul(*dptr + matchoff, NULL, 10); + + while (1) { + unsigned int c_expires = expires; + + ret = ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen, + SIP_HDR_CONTACT, &in_contact, + &matchoff, &matchlen, + &addr, &port); + if (ret < 0) + return NF_DROP; + else if (ret == 0) + break; + + /* We don't support third-party registrations */ + if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, &addr)) + continue; + + if (ct_sip_parse_transport(ct, *dptr, matchoff + matchlen, + *datalen, &proto) == 0) + continue; + + ret = ct_sip_parse_numerical_param(ct, *dptr, + matchoff + matchlen, + *datalen, "expires=", + NULL, NULL, &c_expires); + if (ret < 0) + return NF_DROP; + if (c_expires == 0) + break; + if (refresh_signalling_expectation(ct, &addr, proto, port, + c_expires)) + return NF_ACCEPT; + } + +flush: + flush_expectations(ct, false); + return NF_ACCEPT; +} + +static const struct sip_handler sip_handlers[] = { + SIP_HANDLER("INVITE", process_invite_request, process_invite_response), + SIP_HANDLER("UPDATE", process_sdp, process_update_response), + SIP_HANDLER("ACK", process_sdp, NULL), + SIP_HANDLER("PRACK", process_sdp, process_prack_response), + SIP_HANDLER("BYE", process_bye_request, NULL), + SIP_HANDLER("REGISTER", process_register_request, process_register_response), +}; + +static int process_sip_response(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + unsigned int matchoff, matchlen, matchend; + unsigned int code, cseq, i; + + if (*datalen < strlen("SIP/2.0 200")) + return NF_ACCEPT; + code = simple_strtoul(*dptr + strlen("SIP/2.0 "), NULL, 10); + if (!code) + return NF_DROP; + + if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ, + &matchoff, &matchlen) <= 0) + return NF_DROP; + cseq = simple_strtoul(*dptr + matchoff, NULL, 10); + if (!cseq) + return NF_DROP; + matchend = matchoff + matchlen + 1; + + for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) { + const struct sip_handler *handler; + + handler = &sip_handlers[i]; + if (handler->response == NULL) + continue; + if (*datalen < matchend + handler->len || + strnicmp(*dptr + matchend, handler->method, handler->len)) + continue; + return handler->response(skb, dataoff, dptr, datalen, + cseq, code); + } + return NF_ACCEPT; +} + +static int process_sip_request(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + unsigned int matchoff, matchlen; + unsigned int cseq, i; + + for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) { + const struct sip_handler *handler; + + handler = &sip_handlers[i]; + if (handler->request == NULL) + continue; + if (*datalen < handler->len || + strnicmp(*dptr, handler->method, handler->len)) + continue; + + if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ, + &matchoff, &matchlen) <= 0) + return NF_DROP; + cseq = simple_strtoul(*dptr + matchoff, NULL, 10); + if (!cseq) + return NF_DROP; + + return 
handler->request(skb, dataoff, dptr, datalen, cseq); + } + return NF_ACCEPT; +} + +static int process_sip_msg(struct sk_buff *skb, struct nf_conn *ct, + unsigned int dataoff, const char **dptr, + unsigned int *datalen) +{ + typeof(nf_nat_sip_hook) nf_nat_sip; + int ret; + + if (strnicmp(*dptr, "SIP/2.0 ", strlen("SIP/2.0 ")) != 0) + ret = process_sip_request(skb, dataoff, dptr, datalen); + else + ret = process_sip_response(skb, dataoff, dptr, datalen); + + if (ret == NF_ACCEPT && ct->status & IPS_NAT_MASK) { + nf_nat_sip = rcu_dereference(nf_nat_sip_hook); + if (nf_nat_sip && !nf_nat_sip(skb, dataoff, dptr, datalen)) + ret = NF_DROP; + } + + return ret; +} + +static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + struct tcphdr *th, _tcph; + unsigned int dataoff, datalen; + unsigned int matchoff, matchlen, clen; + unsigned int msglen, origlen; + const char *dptr, *end; + s16 diff, tdiff = 0; + int ret = NF_ACCEPT; + bool term; + typeof(nf_nat_sip_seq_adjust_hook) nf_nat_sip_seq_adjust; + + if (ctinfo != IP_CT_ESTABLISHED && + ctinfo != IP_CT_ESTABLISHED_REPLY) + return NF_ACCEPT; + + /* No Data ? */ + th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); + if (th == NULL) + return NF_ACCEPT; + dataoff = protoff + th->doff * 4; + if (dataoff >= skb->len) + return NF_ACCEPT; + + nf_ct_refresh(ct, skb, sip_timeout * HZ); + + if (unlikely(skb_linearize(skb))) + return NF_DROP; + + dptr = skb->data + dataoff; + datalen = skb->len - dataoff; + if (datalen < strlen("SIP/2.0 200")) + return NF_ACCEPT; + + while (1) { + if (ct_sip_get_header(ct, dptr, 0, datalen, + SIP_HDR_CONTENT_LENGTH, + &matchoff, &matchlen) <= 0) + break; + + clen = simple_strtoul(dptr + matchoff, (char **)&end, 10); + if (dptr + matchoff == end) + break; + + term = false; + for (; end + strlen("\r\n\r\n") <= dptr + datalen; end++) { + if (end[0] == '\r' && end[1] == '\n' && + end[2] == '\r' && end[3] == '\n') { + term = true; + break; + } + } + if (!term) + break; + end += strlen("\r\n\r\n") + clen; + + msglen = origlen = end - dptr; + if (msglen > datalen) + return NF_DROP; + + ret = process_sip_msg(skb, ct, dataoff, &dptr, &msglen); + if (ret != NF_ACCEPT) + break; + diff = msglen - origlen; + tdiff += diff; + + dataoff += msglen; + dptr += msglen; + datalen = datalen + diff - msglen; + } + + if (ret == NF_ACCEPT && ct->status & IPS_NAT_MASK) { + nf_nat_sip_seq_adjust = rcu_dereference(nf_nat_sip_seq_adjust_hook); + if (nf_nat_sip_seq_adjust) + nf_nat_sip_seq_adjust(skb, tdiff); + } + + return ret; +} + +static int sip_help_udp(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + unsigned int dataoff, datalen; + const char *dptr; + + /* No Data ? 
*/ + dataoff = protoff + sizeof(struct udphdr); + if (dataoff >= skb->len) + return NF_ACCEPT; + + nf_ct_refresh(ct, skb, sip_timeout * HZ); + + if (unlikely(skb_linearize(skb))) + return NF_DROP; + + dptr = skb->data + dataoff; + datalen = skb->len - dataoff; + if (datalen < strlen("SIP/2.0 200")) + return NF_ACCEPT; + + return process_sip_msg(skb, ct, dataoff, &dptr, &datalen); +} + +static struct nf_conntrack_helper sip[MAX_PORTS][4] __read_mostly; +static char sip_names[MAX_PORTS][4][sizeof("sip-65535")] __read_mostly; + +static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1] = { + [SIP_EXPECT_SIGNALLING] = { + .name = "signalling", + .max_expected = 1, + .timeout = 3 * 60, + }, + [SIP_EXPECT_AUDIO] = { + .name = "audio", + .max_expected = 2 * IP_CT_DIR_MAX, + .timeout = 3 * 60, + }, + [SIP_EXPECT_VIDEO] = { + .name = "video", + .max_expected = 2 * IP_CT_DIR_MAX, + .timeout = 3 * 60, + }, + [SIP_EXPECT_IMAGE] = { + .name = "image", + .max_expected = IP_CT_DIR_MAX, + .timeout = 3 * 60, + }, +}; + +static void nf_conntrack_sip_fini(void) +{ + int i, j; + + for (i = 0; i < ports_c; i++) { + for (j = 0; j < ARRAY_SIZE(sip[i]); j++) { + if (sip[i][j].me == NULL) + continue; + nf_conntrack_helper_unregister(&sip[i][j]); + } + } +} + +static int __init nf_conntrack_sip_init(void) +{ + int i, j, ret; + char *tmpname; + + if (ports_c == 0) + ports[ports_c++] = SIP_PORT; + + for (i = 0; i < ports_c; i++) { + memset(&sip[i], 0, sizeof(sip[i])); + + sip[i][0].tuple.src.l3num = AF_INET; + sip[i][0].tuple.dst.protonum = IPPROTO_UDP; + sip[i][0].help = sip_help_udp; + sip[i][1].tuple.src.l3num = AF_INET; + sip[i][1].tuple.dst.protonum = IPPROTO_TCP; + sip[i][1].help = sip_help_tcp; + + sip[i][2].tuple.src.l3num = AF_INET6; + sip[i][2].tuple.dst.protonum = IPPROTO_UDP; + sip[i][2].help = sip_help_udp; + sip[i][3].tuple.src.l3num = AF_INET6; + sip[i][3].tuple.dst.protonum = IPPROTO_TCP; + sip[i][3].help = sip_help_tcp; + + for (j = 0; j < ARRAY_SIZE(sip[i]); j++) { + sip[i][j].tuple.src.u.udp.port = htons(ports[i]); + sip[i][j].expect_policy = sip_exp_policy; + sip[i][j].expect_class_max = SIP_EXPECT_MAX; + sip[i][j].me = THIS_MODULE; + + tmpname = &sip_names[i][j][0]; + if (ports[i] == SIP_PORT) + sprintf(tmpname, "sip"); + else + sprintf(tmpname, "sip-%u", i); + sip[i][j].name = tmpname; + + pr_debug("port #%u: %u\n", i, ports[i]); + + ret = nf_conntrack_helper_register(&sip[i][j]); + if (ret) { + printk(KERN_ERR "nf_ct_sip: failed to register" + " helper for pf: %u port: %u\n", + sip[i][j].tuple.src.l3num, ports[i]); + nf_conntrack_sip_fini(); + return ret; + } + } + } + return 0; +} + +module_init(nf_conntrack_sip_init); +module_exit(nf_conntrack_sip_fini); diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c new file mode 100644 index 00000000..6e545e26 --- /dev/null +++ b/net/netfilter/nf_conntrack_snmp.c @@ -0,0 +1,77 @@ +/* + * SNMP service broadcast connection tracking helper + * + * (c) 2011 Jiri Olsa + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include +#include +#include +#include + +#include +#include +#include + +#define SNMP_PORT 161 + +MODULE_AUTHOR("Jiri Olsa "); +MODULE_DESCRIPTION("SNMP service broadcast connection tracking helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFCT_HELPER("snmp"); + +static unsigned int timeout __read_mostly = 30; +module_param(timeout, uint, S_IRUSR); +MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); + +int (*nf_nat_snmp_hook)(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo); +EXPORT_SYMBOL_GPL(nf_nat_snmp_hook); + +static int snmp_conntrack_help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + typeof(nf_nat_snmp_hook) nf_nat_snmp; + + nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout); + + nf_nat_snmp = rcu_dereference(nf_nat_snmp_hook); + if (nf_nat_snmp && ct->status & IPS_NAT_MASK) + return nf_nat_snmp(skb, protoff, ct, ctinfo); + + return NF_ACCEPT; +} + +static struct nf_conntrack_expect_policy exp_policy = { + .max_expected = 1, +}; + +static struct nf_conntrack_helper helper __read_mostly = { + .name = "snmp", + .tuple.src.l3num = NFPROTO_IPV4, + .tuple.src.u.udp.port = cpu_to_be16(SNMP_PORT), + .tuple.dst.protonum = IPPROTO_UDP, + .me = THIS_MODULE, + .help = snmp_conntrack_help, + .expect_policy = &exp_policy, +}; + +static int __init nf_conntrack_snmp_init(void) +{ + exp_policy.timeout = timeout; + return nf_conntrack_helper_register(&helper); +} + +static void __exit nf_conntrack_snmp_fini(void) +{ + nf_conntrack_helper_unregister(&helper); +} + +module_init(nf_conntrack_snmp_init); +module_exit(nf_conntrack_snmp_fini); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c new file mode 100644 index 00000000..05e9feb1 --- /dev/null +++ b/net/netfilter/nf_conntrack_standalone.c @@ -0,0 +1,590 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_SYSCTL +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); + +#ifdef CONFIG_PROC_FS +int +print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_l3proto *l3proto, + const struct nf_conntrack_l4proto *l4proto) +{ + return l3proto->print_tuple(s, tuple) || l4proto->print_tuple(s, tuple); +} +EXPORT_SYMBOL_GPL(print_tuple); + +struct ct_iter_state { + struct seq_net_private p; + unsigned int bucket; + u_int64_t time_now; +}; + +static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) +{ + struct net *net = seq_file_net(seq); + struct ct_iter_state *st = seq->private; + struct hlist_nulls_node *n; + + for (st->bucket = 0; + st->bucket < net->ct.htable_size; + st->bucket++) { + n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); + if (!is_a_nulls(n)) + return n; + } + return NULL; +} + +static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, + struct hlist_nulls_node *head) +{ + struct net *net = seq_file_net(seq); + struct ct_iter_state *st = seq->private; + + head = rcu_dereference(hlist_nulls_next_rcu(head)); + while (is_a_nulls(head)) { + if (likely(get_nulls_value(head) == st->bucket)) { + if (++st->bucket >= net->ct.htable_size) + return NULL; + } + head = rcu_dereference( + hlist_nulls_first_rcu( + &net->ct.hash[st->bucket])); + } + return head; +} + +static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos) +{ + struct hlist_nulls_node *head = ct_get_first(seq); + + if (head) + while (pos && (head = ct_get_next(seq, head))) + pos--; + return pos ? 
NULL : head; +} + +static void *ct_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + struct ct_iter_state *st = seq->private; + + st->time_now = ktime_to_ns(ktime_get_real()); + rcu_read_lock(); + return ct_get_idx(seq, *pos); +} + +static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + return ct_get_next(s, v); +} + +static void ct_seq_stop(struct seq_file *s, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + +#ifdef CONFIG_NF_CONNTRACK_SECMARK +static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) +{ + int ret; + u32 len; + char *secctx; + + ret = security_secid_to_secctx(ct->secmark, &secctx, &len); + if (ret) + return 0; + + ret = seq_printf(s, "secctx=%s ", secctx); + + security_release_secctx(secctx, len); + return ret; +} +#else +static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) +{ + return 0; +} +#endif + +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP +static int ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) +{ + struct ct_iter_state *st = s->private; + struct nf_conn_tstamp *tstamp; + s64 delta_time; + + tstamp = nf_conn_tstamp_find(ct); + if (tstamp) { + delta_time = st->time_now - tstamp->start; + if (delta_time > 0) + delta_time = div_s64(delta_time, NSEC_PER_SEC); + else + delta_time = 0; + + return seq_printf(s, "delta-time=%llu ", + (unsigned long long)delta_time); + } + return 0; +} +#else +static inline int +ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) +{ + return 0; +} +#endif + +/* return 0 on success, 1 in case of error */ +static int ct_seq_show(struct seq_file *s, void *v) +{ + struct nf_conntrack_tuple_hash *hash = v; + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash); + const struct nf_conntrack_l3proto *l3proto; + const struct nf_conntrack_l4proto *l4proto; + int ret = 0; + + NF_CT_ASSERT(ct); + if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) + return 0; + + /* we only want to print DIR_ORIGINAL */ + if (NF_CT_DIRECTION(hash)) + goto release; + + l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); + NF_CT_ASSERT(l3proto); + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + NF_CT_ASSERT(l4proto); + + ret = -ENOSPC; + if (seq_printf(s, "%-8s %u %-8s %u %ld ", + l3proto->name, nf_ct_l3num(ct), + l4proto->name, nf_ct_protonum(ct), + timer_pending(&ct->timeout) + ? 
(long)(ct->timeout.expires - jiffies)/HZ : 0) != 0) + goto release; + + if (l4proto->print_conntrack && l4proto->print_conntrack(s, ct)) + goto release; + + if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + l3proto, l4proto)) + goto release; + + if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL)) + goto release; + + if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) + if (seq_printf(s, "[UNREPLIED] ")) + goto release; + + if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + l3proto, l4proto)) + goto release; + + if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) + goto release; + + if (test_bit(IPS_ASSURED_BIT, &ct->status)) + if (seq_printf(s, "[ASSURED] ")) + goto release; + +#if defined(CONFIG_NF_CONNTRACK_MARK) + if (seq_printf(s, "mark=%u ", ct->mark)) + goto release; +#endif + + if (ct_show_secctx(s, ct)) + goto release; + +#ifdef CONFIG_NF_CONNTRACK_ZONES + if (seq_printf(s, "zone=%u ", nf_ct_zone(ct))) + goto release; +#endif + + if (ct_show_delta_time(s, ct)) + goto release; + + if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) + goto release; + + ret = 0; +release: + nf_ct_put(ct); + return ret; +} + +static const struct seq_operations ct_seq_ops = { + .start = ct_seq_start, + .next = ct_seq_next, + .stop = ct_seq_stop, + .show = ct_seq_show +}; + +static int ct_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ct_seq_ops, + sizeof(struct ct_iter_state)); +} + +static const struct file_operations ct_file_ops = { + .owner = THIS_MODULE, + .open = ct_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct net *net = seq_file_net(seq); + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu + 1; + return per_cpu_ptr(net->ct.stat, cpu); + } + + return NULL; +} + +static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct net *net = seq_file_net(seq); + int cpu; + + for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu + 1; + return per_cpu_ptr(net->ct.stat, cpu); + } + + return NULL; +} + +static void ct_cpu_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int ct_cpu_seq_show(struct seq_file *seq, void *v) +{ + struct net *net = seq_file_net(seq); + unsigned int nr_conntracks = atomic_read(&net->ct.count); + const struct ip_conntrack_stat *st = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); + return 0; + } + + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " + "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", + nr_conntracks, + st->searched, + st->found, + st->new, + st->invalid, + st->ignore, + st->delete, + st->delete_list, + st->insert, + st->insert_failed, + st->drop, + st->early_drop, + st->error, + + st->expect_new, + st->expect_create, + st->expect_delete, + st->search_restart + ); + return 0; +} + +static const struct seq_operations ct_cpu_seq_ops = { + .start = ct_cpu_seq_start, + .next = ct_cpu_seq_next, + .stop = ct_cpu_seq_stop, + .show = ct_cpu_seq_show, +}; + +static int ct_cpu_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ct_cpu_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations 
ct_cpu_seq_fops = { + .owner = THIS_MODULE, + .open = ct_cpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int nf_conntrack_standalone_init_proc(struct net *net) +{ + struct proc_dir_entry *pde; + + pde = proc_net_fops_create(net, "nf_conntrack", 0440, &ct_file_ops); + if (!pde) + goto out_nf_conntrack; + + pde = proc_create("nf_conntrack", S_IRUGO, net->proc_net_stat, + &ct_cpu_seq_fops); + if (!pde) + goto out_stat_nf_conntrack; + return 0; + +out_stat_nf_conntrack: + proc_net_remove(net, "nf_conntrack"); +out_nf_conntrack: + return -ENOMEM; +} + +static void nf_conntrack_standalone_fini_proc(struct net *net) +{ + remove_proc_entry("nf_conntrack", net->proc_net_stat); + proc_net_remove(net, "nf_conntrack"); +} +#else +static int nf_conntrack_standalone_init_proc(struct net *net) +{ + return 0; +} + +static void nf_conntrack_standalone_fini_proc(struct net *net) +{ +} +#endif /* CONFIG_PROC_FS */ + +/* Sysctl support */ + +#ifdef CONFIG_SYSCTL +/* Log invalid packets of a given protocol */ +static int log_invalid_proto_min = 0; +static int log_invalid_proto_max = 255; + +static struct ctl_table_header *nf_ct_netfilter_header; + +static ctl_table nf_ct_sysctl_table[] = { + { + .procname = "nf_conntrack_max", + .data = &nf_conntrack_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "nf_conntrack_count", + .data = &init_net.ct.count, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "nf_conntrack_buckets", + .data = &init_net.ct.htable_size, + .maxlen = sizeof(unsigned int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "nf_conntrack_checksum", + .data = &init_net.ct.sysctl_checksum, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "nf_conntrack_log_invalid", + .data = &init_net.ct.sysctl_log_invalid, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &log_invalid_proto_min, + .extra2 = &log_invalid_proto_max, + }, + { + .procname = "nf_conntrack_expect_max", + .data = &nf_ct_expect_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +#define NET_NF_CONNTRACK_MAX 2089 + +static ctl_table nf_ct_netfilter_table[] = { + { + .procname = "nf_conntrack_max", + .data = &nf_conntrack_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static struct ctl_path nf_ct_path[] = { + { .procname = "net", }, + { } +}; + +static int nf_conntrack_standalone_init_sysctl(struct net *net) +{ + struct ctl_table *table; + + if (net_eq(net, &init_net)) { + nf_ct_netfilter_header = + register_sysctl_paths(nf_ct_path, nf_ct_netfilter_table); + if (!nf_ct_netfilter_header) + goto out; + } + + table = kmemdup(nf_ct_sysctl_table, sizeof(nf_ct_sysctl_table), + GFP_KERNEL); + if (!table) + goto out_kmemdup; + + table[1].data = &net->ct.count; + table[2].data = &net->ct.htable_size; + table[3].data = &net->ct.sysctl_checksum; + table[4].data = &net->ct.sysctl_log_invalid; + + net->ct.sysctl_header = register_net_sysctl_table(net, + nf_net_netfilter_sysctl_path, table); + if (!net->ct.sysctl_header) + goto out_unregister_netfilter; + + return 0; + +out_unregister_netfilter: + kfree(table); +out_kmemdup: + if (net_eq(net, &init_net)) + unregister_sysctl_table(nf_ct_netfilter_header); +out: + printk(KERN_ERR "nf_conntrack: can't register to sysctl.\n"); + 
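+	/* Every error path in this function reports -ENOMEM. */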
return -ENOMEM; +} + +static void nf_conntrack_standalone_fini_sysctl(struct net *net) +{ + struct ctl_table *table; + + if (net_eq(net, &init_net)) + unregister_sysctl_table(nf_ct_netfilter_header); + table = net->ct.sysctl_header->ctl_table_arg; + unregister_net_sysctl_table(net->ct.sysctl_header); + kfree(table); +} +#else +static int nf_conntrack_standalone_init_sysctl(struct net *net) +{ + return 0; +} + +static void nf_conntrack_standalone_fini_sysctl(struct net *net) +{ +} +#endif /* CONFIG_SYSCTL */ + +static int nf_conntrack_net_init(struct net *net) +{ + int ret; + + ret = nf_conntrack_init(net); + if (ret < 0) + goto out_init; + ret = nf_conntrack_standalone_init_proc(net); + if (ret < 0) + goto out_proc; + net->ct.sysctl_checksum = 1; + net->ct.sysctl_log_invalid = 0; + ret = nf_conntrack_standalone_init_sysctl(net); + if (ret < 0) + goto out_sysctl; + return 0; + +out_sysctl: + nf_conntrack_standalone_fini_proc(net); +out_proc: + nf_conntrack_cleanup(net); +out_init: + return ret; +} + +static void nf_conntrack_net_exit(struct net *net) +{ + nf_conntrack_standalone_fini_sysctl(net); + nf_conntrack_standalone_fini_proc(net); + nf_conntrack_cleanup(net); +} + +static struct pernet_operations nf_conntrack_net_ops = { + .init = nf_conntrack_net_init, + .exit = nf_conntrack_net_exit, +}; + +static int __init nf_conntrack_standalone_init(void) +{ + return register_pernet_subsys(&nf_conntrack_net_ops); +} + +static void __exit nf_conntrack_standalone_fini(void) +{ + unregister_pernet_subsys(&nf_conntrack_net_ops); +} + +module_init(nf_conntrack_standalone_init); +module_exit(nf_conntrack_standalone_fini); + +/* Some modules need us, but don't depend directly on any symbol. + They should call this. */ +void need_conntrack(void) +{ +} +EXPORT_SYMBOL_GPL(need_conntrack); diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c new file mode 100644 index 00000000..75466fd7 --- /dev/null +++ b/net/netfilter/nf_conntrack_tftp.c @@ -0,0 +1,153 @@ +/* (C) 2001-2002 Magnus Boden + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Magnus Boden "); +MODULE_DESCRIPTION("TFTP connection tracking helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ip_conntrack_tftp"); +MODULE_ALIAS_NFCT_HELPER("tftp"); + +#define MAX_PORTS 8 +static unsigned short ports[MAX_PORTS]; +static unsigned int ports_c; +module_param_array(ports, ushort, &ports_c, 0400); +MODULE_PARM_DESC(ports, "Port numbers of TFTP servers"); + +unsigned int (*nf_nat_tftp_hook)(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + struct nf_conntrack_expect *exp) __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_tftp_hook); + +static int tftp_help(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + const struct tftphdr *tfh; + struct tftphdr _tftph; + struct nf_conntrack_expect *exp; + struct nf_conntrack_tuple *tuple; + unsigned int ret = NF_ACCEPT; + typeof(nf_nat_tftp_hook) nf_nat_tftp; + + tfh = skb_header_pointer(skb, protoff + sizeof(struct udphdr), + sizeof(_tftph), &_tftph); + if (tfh == NULL) + return NF_ACCEPT; + + switch (ntohs(tfh->opcode)) { + case TFTP_OPCODE_READ: + case TFTP_OPCODE_WRITE: + /* RRQ and WRQ works the same way */ + nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + exp = nf_ct_expect_alloc(ct); + if (exp == NULL) + return NF_DROP; + tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, + nf_ct_l3num(ct), + &tuple->src.u3, &tuple->dst.u3, + IPPROTO_UDP, NULL, &tuple->dst.u.udp.port); + + pr_debug("expect: "); + nf_ct_dump_tuple(&exp->tuple); + + nf_nat_tftp = rcu_dereference(nf_nat_tftp_hook); + if (nf_nat_tftp && ct->status & IPS_NAT_MASK) + ret = nf_nat_tftp(skb, ctinfo, exp); + else if (nf_ct_expect_related(exp) != 0) + ret = NF_DROP; + nf_ct_expect_put(exp); + break; + case TFTP_OPCODE_DATA: + case TFTP_OPCODE_ACK: + pr_debug("Data/ACK opcode\n"); + break; + case TFTP_OPCODE_ERROR: + pr_debug("Error opcode\n"); + break; + default: + pr_debug("Unknown opcode\n"); + } + return ret; +} + +static struct nf_conntrack_helper tftp[MAX_PORTS][2] __read_mostly; +static char tftp_names[MAX_PORTS][2][sizeof("tftp-65535")] __read_mostly; + +static const struct nf_conntrack_expect_policy tftp_exp_policy = { + .max_expected = 1, + .timeout = 5 * 60, +}; + +static void nf_conntrack_tftp_fini(void) +{ + int i, j; + + for (i = 0; i < ports_c; i++) { + for (j = 0; j < 2; j++) + nf_conntrack_helper_unregister(&tftp[i][j]); + } +} + +static int __init nf_conntrack_tftp_init(void) +{ + int i, j, ret; + char *tmpname; + + if (ports_c == 0) + ports[ports_c++] = TFTP_PORT; + + for (i = 0; i < ports_c; i++) { + memset(&tftp[i], 0, sizeof(tftp[i])); + + tftp[i][0].tuple.src.l3num = AF_INET; + tftp[i][1].tuple.src.l3num = AF_INET6; + for (j = 0; j < 2; j++) { + tftp[i][j].tuple.dst.protonum = IPPROTO_UDP; + tftp[i][j].tuple.src.u.udp.port = htons(ports[i]); + tftp[i][j].expect_policy = &tftp_exp_policy; + tftp[i][j].me = THIS_MODULE; + tftp[i][j].help = tftp_help; + + tmpname = &tftp_names[i][j][0]; + if (ports[i] == TFTP_PORT) + sprintf(tmpname, "tftp"); + else + sprintf(tmpname, "tftp-%u", i); + tftp[i][j].name = tmpname; + + ret = nf_conntrack_helper_register(&tftp[i][j]); + if (ret) { + printk(KERN_ERR "nf_ct_tftp: failed to register" + " helper for pf: %u port: %u\n", + tftp[i][j].tuple.src.l3num, ports[i]); + nf_conntrack_tftp_fini(); + return ret; + } + } 
+ } + return 0; +} + +module_init(nf_conntrack_tftp_init); +module_exit(nf_conntrack_tftp_fini); diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c new file mode 100644 index 00000000..af7dd31a --- /dev/null +++ b/net/netfilter/nf_conntrack_timestamp.c @@ -0,0 +1,120 @@ +/* + * (C) 2010 Pablo Neira Ayuso + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation (or any later at your option). + */ + +#include +#include +#include +#include + +#include +#include +#include + +static int nf_ct_tstamp __read_mostly; + +module_param_named(tstamp, nf_ct_tstamp, bool, 0644); +MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping."); + +#ifdef CONFIG_SYSCTL +static struct ctl_table tstamp_sysctl_table[] = { + { + .procname = "nf_conntrack_timestamp", + .data = &init_net.ct.sysctl_tstamp, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; +#endif /* CONFIG_SYSCTL */ + +static struct nf_ct_ext_type tstamp_extend __read_mostly = { + .len = sizeof(struct nf_conn_tstamp), + .align = __alignof__(struct nf_conn_tstamp), + .id = NF_CT_EXT_TSTAMP, +}; + +#ifdef CONFIG_SYSCTL +static int nf_conntrack_tstamp_init_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = kmemdup(tstamp_sysctl_table, sizeof(tstamp_sysctl_table), + GFP_KERNEL); + if (!table) + goto out; + + table[0].data = &net->ct.sysctl_tstamp; + + net->ct.tstamp_sysctl_header = register_net_sysctl_table(net, + nf_net_netfilter_sysctl_path, table); + if (!net->ct.tstamp_sysctl_header) { + printk(KERN_ERR "nf_ct_tstamp: can't register to sysctl.\n"); + goto out_register; + } + return 0; + +out_register: + kfree(table); +out: + return -ENOMEM; +} + +static void nf_conntrack_tstamp_fini_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = net->ct.tstamp_sysctl_header->ctl_table_arg; + unregister_net_sysctl_table(net->ct.tstamp_sysctl_header); + kfree(table); +} +#else +static int nf_conntrack_tstamp_init_sysctl(struct net *net) +{ + return 0; +} + +static void nf_conntrack_tstamp_fini_sysctl(struct net *net) +{ +} +#endif + +int nf_conntrack_tstamp_init(struct net *net) +{ + int ret; + + net->ct.sysctl_tstamp = nf_ct_tstamp; + + if (net_eq(net, &init_net)) { + ret = nf_ct_extend_register(&tstamp_extend); + if (ret < 0) { + printk(KERN_ERR "nf_ct_tstamp: Unable to register " + "extension\n"); + goto out_extend_register; + } + } + + ret = nf_conntrack_tstamp_init_sysctl(net); + if (ret < 0) + goto out_sysctl; + + return 0; + +out_sysctl: + if (net_eq(net, &init_net)) + nf_ct_extend_unregister(&tstamp_extend); +out_extend_register: + return ret; +} + +void nf_conntrack_tstamp_fini(struct net *net) +{ + nf_conntrack_tstamp_fini_sysctl(net); + if (net_eq(net, &init_net)) + nf_ct_extend_unregister(&tstamp_extend); +} diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h new file mode 100644 index 00000000..770f7643 --- /dev/null +++ b/net/netfilter/nf_internals.h @@ -0,0 +1,38 @@ +#ifndef _NF_INTERNALS_H +#define _NF_INTERNALS_H + +#include +#include +#include + +#ifdef CONFIG_NETFILTER_DEBUG +#define NFDEBUG(format, args...) printk(KERN_DEBUG format , ## args) +#else +#define NFDEBUG(format, args...) 
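+/* NFDEBUG() compiles away to nothing when CONFIG_NETFILTER_DEBUG is not set. */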
+#endif + + +/* core.c */ +extern unsigned int nf_iterate(struct list_head *head, + struct sk_buff *skb, + unsigned int hook, + const struct net_device *indev, + const struct net_device *outdev, + struct list_head **i, + int (*okfn)(struct sk_buff *), + int hook_thresh); + +/* nf_queue.c */ +extern int nf_queue(struct sk_buff *skb, + struct list_head *elem, + u_int8_t pf, unsigned int hook, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), + unsigned int queuenum); +extern int __init netfilter_queue_init(void); + +/* nf_log.c */ +extern int __init netfilter_log_init(void); + +#endif diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c new file mode 100644 index 00000000..20714edf --- /dev/null +++ b/net/netfilter/nf_log.c @@ -0,0 +1,318 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nf_internals.h" + +/* Internal logging interface, which relies on the real + LOG target modules */ + +#define NF_LOG_PREFIXLEN 128 +#define NFLOGGER_NAME_LEN 64 + +static const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO] __read_mostly; +static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly; +static DEFINE_MUTEX(nf_log_mutex); + +static struct nf_logger *__find_logger(int pf, const char *str_logger) +{ + struct nf_logger *t; + + list_for_each_entry(t, &nf_loggers_l[pf], list[pf]) { + if (!strnicmp(str_logger, t->name, strlen(t->name))) + return t; + } + + return NULL; +} + +/* return EEXIST if the same logger is registred, 0 on success. */ +int nf_log_register(u_int8_t pf, struct nf_logger *logger) +{ + const struct nf_logger *llog; + int i; + + if (pf >= ARRAY_SIZE(nf_loggers)) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(logger->list); i++) + INIT_LIST_HEAD(&logger->list[i]); + + mutex_lock(&nf_log_mutex); + + if (pf == NFPROTO_UNSPEC) { + for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) + list_add_tail(&(logger->list[i]), &(nf_loggers_l[i])); + } else { + /* register at end of list to honor first register win */ + list_add_tail(&logger->list[pf], &nf_loggers_l[pf]); + llog = rcu_dereference_protected(nf_loggers[pf], + lockdep_is_held(&nf_log_mutex)); + if (llog == NULL) + rcu_assign_pointer(nf_loggers[pf], logger); + } + + mutex_unlock(&nf_log_mutex); + + return 0; +} +EXPORT_SYMBOL(nf_log_register); + +void nf_log_unregister(struct nf_logger *logger) +{ + const struct nf_logger *c_logger; + int i; + + mutex_lock(&nf_log_mutex); + for (i = 0; i < ARRAY_SIZE(nf_loggers); i++) { + c_logger = rcu_dereference_protected(nf_loggers[i], + lockdep_is_held(&nf_log_mutex)); + if (c_logger == logger) + rcu_assign_pointer(nf_loggers[i], NULL); + list_del(&logger->list[i]); + } + mutex_unlock(&nf_log_mutex); + + synchronize_rcu(); +} +EXPORT_SYMBOL(nf_log_unregister); + +int nf_log_bind_pf(u_int8_t pf, const struct nf_logger *logger) +{ + if (pf >= ARRAY_SIZE(nf_loggers)) + return -EINVAL; + mutex_lock(&nf_log_mutex); + if (__find_logger(pf, logger->name) == NULL) { + mutex_unlock(&nf_log_mutex); + return -ENOENT; + } + rcu_assign_pointer(nf_loggers[pf], logger); + mutex_unlock(&nf_log_mutex); + return 0; +} +EXPORT_SYMBOL(nf_log_bind_pf); + +void nf_log_unbind_pf(u_int8_t pf) +{ + if (pf >= ARRAY_SIZE(nf_loggers)) + return; + mutex_lock(&nf_log_mutex); + rcu_assign_pointer(nf_loggers[pf], NULL); + mutex_unlock(&nf_log_mutex); +} +EXPORT_SYMBOL(nf_log_unbind_pf); + +void nf_log_packet(u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + 
const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *fmt, ...) +{ + va_list args; + char prefix[NF_LOG_PREFIXLEN]; + const struct nf_logger *logger; + + rcu_read_lock(); + logger = rcu_dereference(nf_loggers[pf]); + if (logger) { + va_start(args, fmt); + vsnprintf(prefix, sizeof(prefix), fmt, args); + va_end(args); + logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(nf_log_packet); + +#ifdef CONFIG_PROC_FS +static void *seq_start(struct seq_file *seq, loff_t *pos) +{ + mutex_lock(&nf_log_mutex); + + if (*pos >= ARRAY_SIZE(nf_loggers)) + return NULL; + + return pos; +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + + if (*pos >= ARRAY_SIZE(nf_loggers)) + return NULL; + + return pos; +} + +static void seq_stop(struct seq_file *s, void *v) +{ + mutex_unlock(&nf_log_mutex); +} + +static int seq_show(struct seq_file *s, void *v) +{ + loff_t *pos = v; + const struct nf_logger *logger; + struct nf_logger *t; + int ret; + + logger = rcu_dereference_protected(nf_loggers[*pos], + lockdep_is_held(&nf_log_mutex)); + + if (!logger) + ret = seq_printf(s, "%2lld NONE (", *pos); + else + ret = seq_printf(s, "%2lld %s (", *pos, logger->name); + + if (ret < 0) + return ret; + + list_for_each_entry(t, &nf_loggers_l[*pos], list[*pos]) { + ret = seq_printf(s, "%s", t->name); + if (ret < 0) + return ret; + if (&t->list[*pos] != nf_loggers_l[*pos].prev) { + ret = seq_printf(s, ","); + if (ret < 0) + return ret; + } + } + + return seq_printf(s, ")\n"); +} + +static const struct seq_operations nflog_seq_ops = { + .start = seq_start, + .next = seq_next, + .stop = seq_stop, + .show = seq_show, +}; + +static int nflog_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &nflog_seq_ops); +} + +static const struct file_operations nflog_file_ops = { + .owner = THIS_MODULE, + .open = nflog_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + + +#endif /* PROC_FS */ + +#ifdef CONFIG_SYSCTL +static struct ctl_path nf_log_sysctl_path[] = { + { .procname = "net", }, + { .procname = "netfilter", }, + { .procname = "nf_log", }, + { } +}; + +static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3]; +static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1]; +static struct ctl_table_header *nf_log_dir_header; + +static int nf_log_proc_dostring(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + const struct nf_logger *logger; + char buf[NFLOGGER_NAME_LEN]; + size_t size = *lenp; + int r = 0; + int tindex = (unsigned long)table->extra1; + + if (write) { + if (size > sizeof(buf)) + size = sizeof(buf); + if (copy_from_user(buf, buffer, size)) + return -EFAULT; + + if (!strcmp(buf, "NONE")) { + nf_log_unbind_pf(tindex); + return 0; + } + mutex_lock(&nf_log_mutex); + logger = __find_logger(tindex, buf); + if (logger == NULL) { + mutex_unlock(&nf_log_mutex); + return -ENOENT; + } + rcu_assign_pointer(nf_loggers[tindex], logger); + mutex_unlock(&nf_log_mutex); + } else { + mutex_lock(&nf_log_mutex); + logger = rcu_dereference_protected(nf_loggers[tindex], + lockdep_is_held(&nf_log_mutex)); + if (!logger) + table->data = "NONE"; + else + table->data = logger->name; + r = proc_dostring(table, write, buffer, lenp, ppos); + mutex_unlock(&nf_log_mutex); + } + + return r; +} + +static __init int netfilter_log_sysctl_init(void) +{ + int i; + + for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) { + 
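+		/* Create one sysctl entry per protocol family, named after its numeric value. */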
snprintf(nf_log_sysctl_fnames[i-NFPROTO_UNSPEC], 3, "%d", i); + nf_log_sysctl_table[i].procname = + nf_log_sysctl_fnames[i-NFPROTO_UNSPEC]; + nf_log_sysctl_table[i].data = NULL; + nf_log_sysctl_table[i].maxlen = + NFLOGGER_NAME_LEN * sizeof(char); + nf_log_sysctl_table[i].mode = 0644; + nf_log_sysctl_table[i].proc_handler = nf_log_proc_dostring; + nf_log_sysctl_table[i].extra1 = (void *)(unsigned long) i; + } + + nf_log_dir_header = register_sysctl_paths(nf_log_sysctl_path, + nf_log_sysctl_table); + if (!nf_log_dir_header) + return -ENOMEM; + + return 0; +} +#else +static __init int netfilter_log_sysctl_init(void) +{ + return 0; +} +#endif /* CONFIG_SYSCTL */ + +int __init netfilter_log_init(void) +{ + int i, r; +#ifdef CONFIG_PROC_FS + if (!proc_create("nf_log", S_IRUGO, + proc_net_netfilter, &nflog_file_ops)) + return -1; +#endif + + /* Errors will trigger panic, unroll on error is unnecessary. */ + r = netfilter_log_sysctl_init(); + if (r < 0) + return r; + + for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) + INIT_LIST_HEAD(&(nf_loggers_l[i])); + + return 0; +} diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c new file mode 100644 index 00000000..5b466cd1 --- /dev/null +++ b/net/netfilter/nf_queue.c @@ -0,0 +1,395 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nf_internals.h" + +/* + * A queue handler may be registered for each protocol. Each is protected by + * long term mutex. The handler must provide an an outfn() to accept packets + * for queueing and must reinject all packets it receives, no matter what. + */ +static const struct nf_queue_handler __rcu *queue_handler[NFPROTO_NUMPROTO] __read_mostly; + +static DEFINE_MUTEX(queue_handler_mutex); + +/* return EBUSY when somebody else is registered, return EEXIST if the + * same handler is registered, return 0 in case of success. 
*/ +int nf_register_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) +{ + int ret; + const struct nf_queue_handler *old; + + if (pf >= ARRAY_SIZE(queue_handler)) + return -EINVAL; + + mutex_lock(&queue_handler_mutex); + old = rcu_dereference_protected(queue_handler[pf], + lockdep_is_held(&queue_handler_mutex)); + if (old == qh) + ret = -EEXIST; + else if (old) + ret = -EBUSY; + else { + rcu_assign_pointer(queue_handler[pf], qh); + ret = 0; + } + mutex_unlock(&queue_handler_mutex); + + return ret; +} +EXPORT_SYMBOL(nf_register_queue_handler); + +/* The caller must flush their queue before this */ +int nf_unregister_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) +{ + const struct nf_queue_handler *old; + + if (pf >= ARRAY_SIZE(queue_handler)) + return -EINVAL; + + mutex_lock(&queue_handler_mutex); + old = rcu_dereference_protected(queue_handler[pf], + lockdep_is_held(&queue_handler_mutex)); + if (old && old != qh) { + mutex_unlock(&queue_handler_mutex); + return -EINVAL; + } + + rcu_assign_pointer(queue_handler[pf], NULL); + mutex_unlock(&queue_handler_mutex); + + synchronize_rcu(); + + return 0; +} +EXPORT_SYMBOL(nf_unregister_queue_handler); + +void nf_unregister_queue_handlers(const struct nf_queue_handler *qh) +{ + u_int8_t pf; + + mutex_lock(&queue_handler_mutex); + for (pf = 0; pf < ARRAY_SIZE(queue_handler); pf++) { + if (rcu_dereference_protected( + queue_handler[pf], + lockdep_is_held(&queue_handler_mutex) + ) == qh) + rcu_assign_pointer(queue_handler[pf], NULL); + } + mutex_unlock(&queue_handler_mutex); + + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(nf_unregister_queue_handlers); + +static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) +{ + /* Release those devices we held, or Alexey will kill me. */ + if (entry->indev) + dev_put(entry->indev); + if (entry->outdev) + dev_put(entry->outdev); +#ifdef CONFIG_BRIDGE_NETFILTER + if (entry->skb->nf_bridge) { + struct nf_bridge_info *nf_bridge = entry->skb->nf_bridge; + + if (nf_bridge->physindev) + dev_put(nf_bridge->physindev); + if (nf_bridge->physoutdev) + dev_put(nf_bridge->physoutdev); + } +#endif + /* Drop reference to owner of hook which queued us. */ + module_put(entry->elem->owner); +} + +/* + * Any packet that leaves via this function must come back + * through nf_reinject(). + */ +static int __nf_queue(struct sk_buff *skb, + struct list_head *elem, + u_int8_t pf, unsigned int hook, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), + unsigned int queuenum) +{ + int status = -ENOENT; + struct nf_queue_entry *entry = NULL; +#ifdef CONFIG_BRIDGE_NETFILTER + struct net_device *physindev; + struct net_device *physoutdev; +#endif + const struct nf_afinfo *afinfo; + const struct nf_queue_handler *qh; + + /* QUEUE == DROP if no one is waiting, to be safe. */ + rcu_read_lock(); + + qh = rcu_dereference(queue_handler[pf]); + if (!qh) { + status = -ESRCH; + goto err_unlock; + } + + afinfo = nf_get_afinfo(pf); + if (!afinfo) + goto err_unlock; + + entry = kmalloc(sizeof(*entry) + afinfo->route_key_size, GFP_ATOMIC); + if (!entry) { + status = -ENOMEM; + goto err_unlock; + } + + *entry = (struct nf_queue_entry) { + .skb = skb, + .elem = list_entry(elem, struct nf_hook_ops, list), + .pf = pf, + .hook = hook, + .indev = indev, + .outdev = outdev, + .okfn = okfn, + }; + + /* If it's going away, ignore hook. 
*/ + if (!try_module_get(entry->elem->owner)) { + status = -ECANCELED; + goto err_unlock; + } + /* Bump dev refs so they don't vanish while packet is out */ + if (indev) + dev_hold(indev); + if (outdev) + dev_hold(outdev); +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge) { + physindev = skb->nf_bridge->physindev; + if (physindev) + dev_hold(physindev); + physoutdev = skb->nf_bridge->physoutdev; + if (physoutdev) + dev_hold(physoutdev); + } +#endif + skb_dst_force(skb); + afinfo->saveroute(skb, entry); + status = qh->outfn(entry, queuenum); + + rcu_read_unlock(); + + if (status < 0) { + nf_queue_entry_release_refs(entry); + goto err; + } + + return 0; + +err_unlock: + rcu_read_unlock(); +err: + kfree(entry); + return status; +} + +int nf_queue(struct sk_buff *skb, + struct list_head *elem, + u_int8_t pf, unsigned int hook, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), + unsigned int queuenum) +{ + struct sk_buff *segs; + int err; + unsigned int queued; + + if (!skb_is_gso(skb)) + return __nf_queue(skb, elem, pf, hook, indev, outdev, okfn, + queuenum); + + switch (pf) { + case NFPROTO_IPV4: + skb->protocol = htons(ETH_P_IP); + break; + case NFPROTO_IPV6: + skb->protocol = htons(ETH_P_IPV6); + break; + } + + segs = skb_gso_segment(skb, 0); + /* Does not use PTR_ERR to limit the number of error codes that can be + * returned by nf_queue. For instance, callers rely on -ECANCELED to mean + * 'ignore this hook'. + */ + if (IS_ERR(segs)) + return -EINVAL; + + queued = 0; + err = 0; + do { + struct sk_buff *nskb = segs->next; + + segs->next = NULL; + if (err == 0) + err = __nf_queue(segs, elem, pf, hook, indev, + outdev, okfn, queuenum); + if (err == 0) + queued++; + else + kfree_skb(segs); + segs = nskb; + } while (segs); + + /* also free orig skb if only some segments were queued */ + if (unlikely(err && queued)) + err = 0; + if (err == 0) + kfree_skb(skb); + return err; +} + +void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) +{ + struct sk_buff *skb = entry->skb; + struct list_head *elem = &entry->elem->list; + const struct nf_afinfo *afinfo; + int err; + + rcu_read_lock(); + + nf_queue_entry_release_refs(entry); + + /* Continue traversal iff userspace said ok... 
*/ + if (verdict == NF_REPEAT) { + elem = elem->prev; + verdict = NF_ACCEPT; + } + + if (verdict == NF_ACCEPT) { + afinfo = nf_get_afinfo(entry->pf); + if (!afinfo || afinfo->reroute(skb, entry) < 0) + verdict = NF_DROP; + } + + if (verdict == NF_ACCEPT) { + next_hook: + verdict = nf_iterate(&nf_hooks[entry->pf][entry->hook], + skb, entry->hook, + entry->indev, entry->outdev, &elem, + entry->okfn, INT_MIN); + } + + switch (verdict & NF_VERDICT_MASK) { + case NF_ACCEPT: + case NF_STOP: + local_bh_disable(); + entry->okfn(skb); + local_bh_enable(); + break; + case NF_QUEUE: + err = __nf_queue(skb, elem, entry->pf, entry->hook, + entry->indev, entry->outdev, entry->okfn, + verdict >> NF_VERDICT_QBITS); + if (err < 0) { + if (err == -ECANCELED) + goto next_hook; + if (err == -ESRCH && + (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) + goto next_hook; + kfree_skb(skb); + } + break; + case NF_STOLEN: + default: + kfree_skb(skb); + } + rcu_read_unlock(); + kfree(entry); +} +EXPORT_SYMBOL(nf_reinject); + +#ifdef CONFIG_PROC_FS +static void *seq_start(struct seq_file *seq, loff_t *pos) +{ + if (*pos >= ARRAY_SIZE(queue_handler)) + return NULL; + + return pos; +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + + if (*pos >= ARRAY_SIZE(queue_handler)) + return NULL; + + return pos; +} + +static void seq_stop(struct seq_file *s, void *v) +{ + +} + +static int seq_show(struct seq_file *s, void *v) +{ + int ret; + loff_t *pos = v; + const struct nf_queue_handler *qh; + + rcu_read_lock(); + qh = rcu_dereference(queue_handler[*pos]); + if (!qh) + ret = seq_printf(s, "%2lld NONE\n", *pos); + else + ret = seq_printf(s, "%2lld %s\n", *pos, qh->name); + rcu_read_unlock(); + + return ret; +} + +static const struct seq_operations nfqueue_seq_ops = { + .start = seq_start, + .next = seq_next, + .stop = seq_stop, + .show = seq_show, +}; + +static int nfqueue_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &nfqueue_seq_ops); +} + +static const struct file_operations nfqueue_file_ops = { + .owner = THIS_MODULE, + .open = nfqueue_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif /* PROC_FS */ + + +int __init netfilter_queue_init(void) +{ +#ifdef CONFIG_PROC_FS + if (!proc_create("nf_queue", S_IRUGO, + proc_net_netfilter, &nfqueue_file_ops)) + return -1; +#endif + return 0; +} + diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c new file mode 100644 index 00000000..f042ae52 --- /dev/null +++ b/net/netfilter/nf_sockopt.c @@ -0,0 +1,169 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "nf_internals.h" + +/* Sockopts only registered and called from user context, so + net locking would be overkill. Also, [gs]etsockopt calls may + sleep. */ +static DEFINE_MUTEX(nf_sockopt_mutex); +static LIST_HEAD(nf_sockopts); + +/* Do exclusive ranges overlap? */ +static inline int overlap(int min1, int max1, int min2, int max2) +{ + return max1 > min2 && min1 < max2; +} + +/* Functions to register sockopt ranges (exclusive). 
*/ +int nf_register_sockopt(struct nf_sockopt_ops *reg) +{ + struct nf_sockopt_ops *ops; + int ret = 0; + + if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0) + return -EINTR; + + list_for_each_entry(ops, &nf_sockopts, list) { + if (ops->pf == reg->pf + && (overlap(ops->set_optmin, ops->set_optmax, + reg->set_optmin, reg->set_optmax) + || overlap(ops->get_optmin, ops->get_optmax, + reg->get_optmin, reg->get_optmax))) { + NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n", + ops->set_optmin, ops->set_optmax, + ops->get_optmin, ops->get_optmax, + reg->set_optmin, reg->set_optmax, + reg->get_optmin, reg->get_optmax); + ret = -EBUSY; + goto out; + } + } + + list_add(®->list, &nf_sockopts); +out: + mutex_unlock(&nf_sockopt_mutex); + return ret; +} +EXPORT_SYMBOL(nf_register_sockopt); + +void nf_unregister_sockopt(struct nf_sockopt_ops *reg) +{ + mutex_lock(&nf_sockopt_mutex); + list_del(®->list); + mutex_unlock(&nf_sockopt_mutex); +} +EXPORT_SYMBOL(nf_unregister_sockopt); + +static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, u_int8_t pf, + int val, int get) +{ + struct nf_sockopt_ops *ops; + + if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0) + return ERR_PTR(-EINTR); + + list_for_each_entry(ops, &nf_sockopts, list) { + if (ops->pf == pf) { + if (!try_module_get(ops->owner)) + goto out_nosup; + + if (get) { + if (val >= ops->get_optmin && + val < ops->get_optmax) + goto out; + } else { + if (val >= ops->set_optmin && + val < ops->set_optmax) + goto out; + } + module_put(ops->owner); + } + } +out_nosup: + ops = ERR_PTR(-ENOPROTOOPT); +out: + mutex_unlock(&nf_sockopt_mutex); + return ops; +} + +/* Call get/setsockopt() */ +static int nf_sockopt(struct sock *sk, u_int8_t pf, int val, + char __user *opt, int *len, int get) +{ + struct nf_sockopt_ops *ops; + int ret; + + ops = nf_sockopt_find(sk, pf, val, get); + if (IS_ERR(ops)) + return PTR_ERR(ops); + + if (get) + ret = ops->get(sk, val, opt, len); + else + ret = ops->set(sk, val, opt, *len); + + module_put(ops->owner); + return ret; +} + +int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, + unsigned int len) +{ + return nf_sockopt(sk, pf, val, opt, &len, 0); +} +EXPORT_SYMBOL(nf_setsockopt); + +int nf_getsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, + int *len) +{ + return nf_sockopt(sk, pf, val, opt, len, 1); +} +EXPORT_SYMBOL(nf_getsockopt); + +#ifdef CONFIG_COMPAT +static int compat_nf_sockopt(struct sock *sk, u_int8_t pf, int val, + char __user *opt, int *len, int get) +{ + struct nf_sockopt_ops *ops; + int ret; + + ops = nf_sockopt_find(sk, pf, val, get); + if (IS_ERR(ops)) + return PTR_ERR(ops); + + if (get) { + if (ops->compat_get) + ret = ops->compat_get(sk, val, opt, len); + else + ret = ops->get(sk, val, opt, len); + } else { + if (ops->compat_set) + ret = ops->compat_set(sk, val, opt, *len); + else + ret = ops->set(sk, val, opt, *len); + } + + module_put(ops->owner); + return ret; +} + +int compat_nf_setsockopt(struct sock *sk, u_int8_t pf, + int val, char __user *opt, unsigned int len) +{ + return compat_nf_sockopt(sk, pf, val, opt, &len, 0); +} +EXPORT_SYMBOL(compat_nf_setsockopt); + +int compat_nf_getsockopt(struct sock *sk, u_int8_t pf, + int val, char __user *opt, int *len) +{ + return compat_nf_sockopt(sk, pf, val, opt, len, 1); +} +EXPORT_SYMBOL(compat_nf_getsockopt); +#endif diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c new file mode 100644 index 00000000..474d621c --- /dev/null +++ b/net/netfilter/nf_tproxy_core.c @@ -0,0 +1,62 @@ 
+/* + * Transparent proxy support for Linux/iptables + * + * Copyright (c) 2006-2007 BalaBit IT Ltd. + * Author: Balazs Scheidler, Krisztian Kovacs + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include + +#include +#include +#include +#include +#include + + +static void +nf_tproxy_destructor(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + skb->sk = NULL; + skb->destructor = NULL; + + if (sk) + sock_put(sk); +} + +/* consumes sk */ +void +nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) +{ + /* assigning tw sockets complicates things; most + * skb->sk->X checks would have to test sk->sk_state first */ + if (sk->sk_state == TCP_TIME_WAIT) { + inet_twsk_put(inet_twsk(sk)); + return; + } + + skb_orphan(skb); + skb->sk = sk; + skb->destructor = nf_tproxy_destructor; +} +EXPORT_SYMBOL_GPL(nf_tproxy_assign_sock); + +static int __init nf_tproxy_init(void) +{ + pr_info("NF_TPROXY: Transparent proxy support initialized, version 4.1.0\n"); + pr_info("NF_TPROXY: Copyright (c) 2006-2007 BalaBit IT Ltd.\n"); + return 0; +} + +module_init(nf_tproxy_init); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Krisztian Kovacs"); +MODULE_DESCRIPTION("Transparent proxy support core routines"); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c new file mode 100644 index 00000000..b4a45328 --- /dev/null +++ b/net/netfilter/nfnetlink.c @@ -0,0 +1,225 @@ +/* Netfilter messages via netlink socket. Allows for user space + * protocol helpers and general trouble making from userspace. + * + * (C) 2001 by Jay Schulist , + * (C) 2002-2005 by Harald Welte + * (C) 2005,2007 by Pablo Neira Ayuso + * + * Initial netfilter messages via netlink development funded and + * generally made possible by Network Robots, Inc. (www.networkrobots.com) + * + * Further development of this code funded by Astaro AG (http://www.astaro.com) + * + * This software may be used and distributed according to the terms + * of the GNU General Public License, incorporated herein by reference. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte "); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER); + +static char __initdata nfversion[] = "0.30"; + +static const struct nfnetlink_subsystem *subsys_table[NFNL_SUBSYS_COUNT]; +static DEFINE_MUTEX(nfnl_mutex); + +void nfnl_lock(void) +{ + mutex_lock(&nfnl_mutex); +} +EXPORT_SYMBOL_GPL(nfnl_lock); + +void nfnl_unlock(void) +{ + mutex_unlock(&nfnl_mutex); +} +EXPORT_SYMBOL_GPL(nfnl_unlock); + +int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n) +{ + nfnl_lock(); + if (subsys_table[n->subsys_id]) { + nfnl_unlock(); + return -EBUSY; + } + subsys_table[n->subsys_id] = n; + nfnl_unlock(); + + return 0; +} +EXPORT_SYMBOL_GPL(nfnetlink_subsys_register); + +int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n) +{ + nfnl_lock(); + subsys_table[n->subsys_id] = NULL; + nfnl_unlock(); + + return 0; +} +EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister); + +static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type) +{ + u_int8_t subsys_id = NFNL_SUBSYS_ID(type); + + if (subsys_id >= NFNL_SUBSYS_COUNT) + return NULL; + + return subsys_table[subsys_id]; +} + +static inline const struct nfnl_callback * +nfnetlink_find_client(u_int16_t type, const struct nfnetlink_subsystem *ss) +{ + u_int8_t cb_id = NFNL_MSG_TYPE(type); + + if (cb_id >= ss->cb_count) + return NULL; + + return &ss->cb[cb_id]; +} + +int nfnetlink_has_listeners(struct net *net, unsigned int group) +{ + return netlink_has_listeners(net->nfnl, group); +} +EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); + +int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, + unsigned group, int echo, gfp_t flags) +{ + return nlmsg_notify(net->nfnl, skb, pid, group, echo, flags); +} +EXPORT_SYMBOL_GPL(nfnetlink_send); + +int nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error) +{ + return netlink_set_err(net->nfnl, pid, group, error); +} +EXPORT_SYMBOL_GPL(nfnetlink_set_err); + +int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags) +{ + return netlink_unicast(net->nfnl, skb, pid, flags); +} +EXPORT_SYMBOL_GPL(nfnetlink_unicast); + +/* Process one complete nfnetlink message. 
*/ +static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct net *net = sock_net(skb->sk); + const struct nfnl_callback *nc; + const struct nfnetlink_subsystem *ss; + int type, err; + + if (security_netlink_recv(skb, CAP_NET_ADMIN)) + return -EPERM; + + /* All the messages must at least contain nfgenmsg */ + if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct nfgenmsg))) + return 0; + + type = nlh->nlmsg_type; +replay: + ss = nfnetlink_get_subsys(type); + if (!ss) { +#ifdef CONFIG_MODULES + nfnl_unlock(); + request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type)); + nfnl_lock(); + ss = nfnetlink_get_subsys(type); + if (!ss) +#endif + return -EINVAL; + } + + nc = nfnetlink_find_client(type, ss); + if (!nc) + return -EINVAL; + + { + int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg)); + u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); + struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; + struct nlattr *attr = (void *)nlh + min_len; + int attrlen = nlh->nlmsg_len - min_len; + + err = nla_parse(cda, ss->cb[cb_id].attr_count, + attr, attrlen, ss->cb[cb_id].policy); + if (err < 0) + return err; + + err = nc->call(net->nfnl, skb, nlh, (const struct nlattr **)cda); + if (err == -EAGAIN) + goto replay; + return err; + } +} + +static void nfnetlink_rcv(struct sk_buff *skb) +{ + nfnl_lock(); + netlink_rcv_skb(skb, &nfnetlink_rcv_msg); + nfnl_unlock(); +} + +static int __net_init nfnetlink_net_init(struct net *net) +{ + struct sock *nfnl; + + nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, NFNLGRP_MAX, + nfnetlink_rcv, NULL, THIS_MODULE); + if (!nfnl) + return -ENOMEM; + net->nfnl_stash = nfnl; + rcu_assign_pointer(net->nfnl, nfnl); + return 0; +} + +static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list) +{ + struct net *net; + + list_for_each_entry(net, net_exit_list, exit_list) + rcu_assign_pointer(net->nfnl, NULL); + synchronize_net(); + list_for_each_entry(net, net_exit_list, exit_list) + netlink_kernel_release(net->nfnl_stash); +} + +static struct pernet_operations nfnetlink_net_ops = { + .init = nfnetlink_net_init, + .exit_batch = nfnetlink_net_exit_batch, +}; + +static int __init nfnetlink_init(void) +{ + pr_info("Netfilter messages via NETLINK v%s.\n", nfversion); + return register_pernet_subsys(&nfnetlink_net_ops); +} + +static void __exit nfnetlink_exit(void) +{ + pr_info("Removing netfilter NETLINK layer.\n"); + unregister_pernet_subsys(&nfnetlink_net_ops); +} +module_init(nfnetlink_init); +module_exit(nfnetlink_exit); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c new file mode 100644 index 00000000..2e7ccbb4 --- /dev/null +++ b/net/netfilter/nfnetlink_log.c @@ -0,0 +1,1015 @@ +/* + * This is a module which is used for logging packets to userspace via + * nfetlink. + * + * (C) 2005 by Harald Welte + * + * Based on the old ipv4-only ipt_ULOG.c: + * (C) 2000-2004 by Harald Welte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_BRIDGE_NETFILTER +#include "../bridge/br_private.h" +#endif + +#define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE +#define NFULNL_TIMEOUT_DEFAULT 100 /* every second */ +#define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */ +#define NFULNL_COPY_RANGE_MAX 0xFFFF /* max packet size is limited by 16-bit struct nfattr nfa_len field */ + +#define PRINTR(x, args...) do { if (net_ratelimit()) \ + printk(x, ## args); } while (0); + +struct nfulnl_instance { + struct hlist_node hlist; /* global list of instances */ + spinlock_t lock; + atomic_t use; /* use count */ + + unsigned int qlen; /* number of nlmsgs in skb */ + struct sk_buff *skb; /* pre-allocatd skb */ + struct timer_list timer; + int peer_pid; /* PID of the peer process */ + + /* configurable parameters */ + unsigned int flushtimeout; /* timeout until queue flush */ + unsigned int nlbufsiz; /* netlink buffer allocation size */ + unsigned int qthreshold; /* threshold of the queue */ + u_int32_t copy_range; + u_int32_t seq; /* instance-local sequential counter */ + u_int16_t group_num; /* number of this queue */ + u_int16_t flags; + u_int8_t copy_mode; + struct rcu_head rcu; +}; + +static DEFINE_SPINLOCK(instances_lock); +static atomic_t global_seq; + +#define INSTANCE_BUCKETS 16 +static struct hlist_head instance_table[INSTANCE_BUCKETS]; +static unsigned int hash_init; + +static inline u_int8_t instance_hashfn(u_int16_t group_num) +{ + return ((group_num & 0xff) % INSTANCE_BUCKETS); +} + +static struct nfulnl_instance * +__instance_lookup(u_int16_t group_num) +{ + struct hlist_head *head; + struct hlist_node *pos; + struct nfulnl_instance *inst; + + head = &instance_table[instance_hashfn(group_num)]; + hlist_for_each_entry_rcu(inst, pos, head, hlist) { + if (inst->group_num == group_num) + return inst; + } + return NULL; +} + +static inline void +instance_get(struct nfulnl_instance *inst) +{ + atomic_inc(&inst->use); +} + +static struct nfulnl_instance * +instance_lookup_get(u_int16_t group_num) +{ + struct nfulnl_instance *inst; + + rcu_read_lock_bh(); + inst = __instance_lookup(group_num); + if (inst && !atomic_inc_not_zero(&inst->use)) + inst = NULL; + rcu_read_unlock_bh(); + + return inst; +} + +static void nfulnl_instance_free_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct nfulnl_instance, rcu)); + module_put(THIS_MODULE); +} + +static void +instance_put(struct nfulnl_instance *inst) +{ + if (inst && atomic_dec_and_test(&inst->use)) + call_rcu_bh(&inst->rcu, nfulnl_instance_free_rcu); +} + +static void nfulnl_timer(unsigned long data); + +static struct nfulnl_instance * +instance_create(u_int16_t group_num, int pid) +{ + struct nfulnl_instance *inst; + int err; + + spin_lock_bh(&instances_lock); + if (__instance_lookup(group_num)) { + err = -EEXIST; + goto out_unlock; + } + + inst = kzalloc(sizeof(*inst), GFP_ATOMIC); + if (!inst) { + err = -ENOMEM; + goto out_unlock; + } + + if (!try_module_get(THIS_MODULE)) { + kfree(inst); + err = -EAGAIN; + goto out_unlock; + } + + INIT_HLIST_NODE(&inst->hlist); + spin_lock_init(&inst->lock); + /* needs to be two, since we _put() after creation */ + atomic_set(&inst->use, 2); + + setup_timer(&inst->timer, nfulnl_timer, (unsigned long)inst); + + inst->peer_pid = pid; + inst->group_num = group_num; + + inst->qthreshold = NFULNL_QTHRESH_DEFAULT; + 
inst->flushtimeout = NFULNL_TIMEOUT_DEFAULT; + inst->nlbufsiz = NFULNL_NLBUFSIZ_DEFAULT; + inst->copy_mode = NFULNL_COPY_PACKET; + inst->copy_range = NFULNL_COPY_RANGE_MAX; + + hlist_add_head_rcu(&inst->hlist, + &instance_table[instance_hashfn(group_num)]); + + spin_unlock_bh(&instances_lock); + + return inst; + +out_unlock: + spin_unlock_bh(&instances_lock); + return ERR_PTR(err); +} + +static void __nfulnl_flush(struct nfulnl_instance *inst); + +/* called with BH disabled */ +static void +__instance_destroy(struct nfulnl_instance *inst) +{ + /* first pull it out of the global list */ + hlist_del_rcu(&inst->hlist); + + /* then flush all pending packets from skb */ + + spin_lock(&inst->lock); + + /* lockless readers wont be able to use us */ + inst->copy_mode = NFULNL_COPY_DISABLED; + + if (inst->skb) + __nfulnl_flush(inst); + spin_unlock(&inst->lock); + + /* and finally put the refcount */ + instance_put(inst); +} + +static inline void +instance_destroy(struct nfulnl_instance *inst) +{ + spin_lock_bh(&instances_lock); + __instance_destroy(inst); + spin_unlock_bh(&instances_lock); +} + +static int +nfulnl_set_mode(struct nfulnl_instance *inst, u_int8_t mode, + unsigned int range) +{ + int status = 0; + + spin_lock_bh(&inst->lock); + + switch (mode) { + case NFULNL_COPY_NONE: + case NFULNL_COPY_META: + inst->copy_mode = mode; + inst->copy_range = 0; + break; + + case NFULNL_COPY_PACKET: + inst->copy_mode = mode; + inst->copy_range = min_t(unsigned int, + range, NFULNL_COPY_RANGE_MAX); + break; + + default: + status = -EINVAL; + break; + } + + spin_unlock_bh(&inst->lock); + + return status; +} + +static int +nfulnl_set_nlbufsiz(struct nfulnl_instance *inst, u_int32_t nlbufsiz) +{ + int status; + + spin_lock_bh(&inst->lock); + if (nlbufsiz < NFULNL_NLBUFSIZ_DEFAULT) + status = -ERANGE; + else if (nlbufsiz > 131072) + status = -ERANGE; + else { + inst->nlbufsiz = nlbufsiz; + status = 0; + } + spin_unlock_bh(&inst->lock); + + return status; +} + +static int +nfulnl_set_timeout(struct nfulnl_instance *inst, u_int32_t timeout) +{ + spin_lock_bh(&inst->lock); + inst->flushtimeout = timeout; + spin_unlock_bh(&inst->lock); + + return 0; +} + +static int +nfulnl_set_qthresh(struct nfulnl_instance *inst, u_int32_t qthresh) +{ + spin_lock_bh(&inst->lock); + inst->qthreshold = qthresh; + spin_unlock_bh(&inst->lock); + + return 0; +} + +static int +nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags) +{ + spin_lock_bh(&inst->lock); + inst->flags = flags; + spin_unlock_bh(&inst->lock); + + return 0; +} + +static struct sk_buff * +nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size) +{ + struct sk_buff *skb; + unsigned int n; + + /* alloc skb which should be big enough for a whole multipart + * message. 
WARNING: has to be <= 128k due to slab restrictions */ + + n = max(inst_size, pkt_size); + skb = alloc_skb(n, GFP_ATOMIC); + if (!skb) { + pr_notice("nfnetlink_log: can't alloc whole buffer (%u bytes)\n", + inst_size); + + if (n > pkt_size) { + /* try to allocate only as much as we need for current + * packet */ + + skb = alloc_skb(pkt_size, GFP_ATOMIC); + if (!skb) + pr_err("nfnetlink_log: can't even alloc %u " + "bytes\n", pkt_size); + } + } + + return skb; +} + +static int +__nfulnl_send(struct nfulnl_instance *inst) +{ + int status = -1; + + if (inst->qlen > 1) + NLMSG_PUT(inst->skb, 0, 0, + NLMSG_DONE, + sizeof(struct nfgenmsg)); + + status = nfnetlink_unicast(inst->skb, &init_net, inst->peer_pid, + MSG_DONTWAIT); + + inst->qlen = 0; + inst->skb = NULL; + +nlmsg_failure: + return status; +} + +static void +__nfulnl_flush(struct nfulnl_instance *inst) +{ + /* timer holds a reference */ + if (del_timer(&inst->timer)) + instance_put(inst); + if (inst->skb) + __nfulnl_send(inst); +} + +static void +nfulnl_timer(unsigned long data) +{ + struct nfulnl_instance *inst = (struct nfulnl_instance *)data; + + spin_lock_bh(&inst->lock); + if (inst->skb) + __nfulnl_send(inst); + spin_unlock_bh(&inst->lock); + instance_put(inst); +} + +/* This is an inline function, we don't really care about a long + * list of arguments */ +static inline int +__build_packet_message(struct nfulnl_instance *inst, + const struct sk_buff *skb, + unsigned int data_len, + u_int8_t pf, + unsigned int hooknum, + const struct net_device *indev, + const struct net_device *outdev, + const char *prefix, unsigned int plen) +{ + struct nfulnl_msg_packet_hdr pmsg; + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + sk_buff_data_t old_tail = inst->skb->tail; + + nlh = NLMSG_PUT(inst->skb, 0, 0, + NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET, + sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + nfmsg->nfgen_family = pf; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(inst->group_num); + + pmsg.hw_protocol = skb->protocol; + pmsg.hook = hooknum; + + NLA_PUT(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg); + + if (prefix) + NLA_PUT(inst->skb, NFULA_PREFIX, plen, prefix); + + if (indev) { +#ifndef CONFIG_BRIDGE_NETFILTER + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV, + htonl(indev->ifindex)); +#else + if (pf == PF_BRIDGE) { + /* Case 1: outdev is physical input device, we need to + * look for bridge group (when called from + * netfilter_bridge) */ + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSINDEV, + htonl(indev->ifindex)); + /* this is the bridge group "brX" */ + /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */ + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV, + htonl(br_port_get_rcu(indev)->br->dev->ifindex)); + } else { + /* Case 2: indev is bridge group, we need to look for + * physical device (when called from ipv4) */ + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV, + htonl(indev->ifindex)); + if (skb->nf_bridge && skb->nf_bridge->physindev) + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSINDEV, + htonl(skb->nf_bridge->physindev->ifindex)); + } +#endif + } + + if (outdev) { +#ifndef CONFIG_BRIDGE_NETFILTER + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV, + htonl(outdev->ifindex)); +#else + if (pf == PF_BRIDGE) { + /* Case 1: outdev is physical output device, we need to + * look for bridge group (when called from + * netfilter_bridge) */ + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, + htonl(outdev->ifindex)); + /* this is the bridge group "brX" */ + /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */ + 
NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV, + htonl(br_port_get_rcu(outdev)->br->dev->ifindex)); + } else { + /* Case 2: indev is a bridge group, we need to look + * for physical device (when called from ipv4) */ + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV, + htonl(outdev->ifindex)); + if (skb->nf_bridge && skb->nf_bridge->physoutdev) + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, + htonl(skb->nf_bridge->physoutdev->ifindex)); + } +#endif + } + + if (skb->mark) + NLA_PUT_BE32(inst->skb, NFULA_MARK, htonl(skb->mark)); + + if (indev && skb->dev && + skb->mac_header != skb->network_header) { + struct nfulnl_msg_packet_hw phw; + int len = dev_parse_header(skb, phw.hw_addr); + if (len > 0) { + phw.hw_addrlen = htons(len); + NLA_PUT(inst->skb, NFULA_HWADDR, sizeof(phw), &phw); + } + } + + if (indev && skb_mac_header_was_set(skb)) { + NLA_PUT_BE16(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)); + NLA_PUT_BE16(inst->skb, NFULA_HWLEN, + htons(skb->dev->hard_header_len)); + NLA_PUT(inst->skb, NFULA_HWHEADER, skb->dev->hard_header_len, + skb_mac_header(skb)); + } + + if (skb->tstamp.tv64) { + struct nfulnl_msg_packet_timestamp ts; + struct timeval tv = ktime_to_timeval(skb->tstamp); + ts.sec = cpu_to_be64(tv.tv_sec); + ts.usec = cpu_to_be64(tv.tv_usec); + + NLA_PUT(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts); + } + + /* UID */ + if (skb->sk) { + read_lock_bh(&skb->sk->sk_callback_lock); + if (skb->sk->sk_socket && skb->sk->sk_socket->file) { + struct file *file = skb->sk->sk_socket->file; + __be32 uid = htonl(file->f_cred->fsuid); + __be32 gid = htonl(file->f_cred->fsgid); + /* need to unlock here since NLA_PUT may goto */ + read_unlock_bh(&skb->sk->sk_callback_lock); + NLA_PUT_BE32(inst->skb, NFULA_UID, uid); + NLA_PUT_BE32(inst->skb, NFULA_GID, gid); + } else + read_unlock_bh(&skb->sk->sk_callback_lock); + } + + /* local sequence number */ + if (inst->flags & NFULNL_CFG_F_SEQ) + NLA_PUT_BE32(inst->skb, NFULA_SEQ, htonl(inst->seq++)); + + /* global sequence number */ + if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) + NLA_PUT_BE32(inst->skb, NFULA_SEQ_GLOBAL, + htonl(atomic_inc_return(&global_seq))); + + if (data_len) { + struct nlattr *nla; + int size = nla_attr_size(data_len); + + if (skb_tailroom(inst->skb) < nla_total_size(data_len)) { + printk(KERN_WARNING "nfnetlink_log: no tailroom!\n"); + goto nlmsg_failure; + } + + nla = (struct nlattr *)skb_put(inst->skb, nla_total_size(data_len)); + nla->nla_type = NFULA_PAYLOAD; + nla->nla_len = size; + + if (skb_copy_bits(skb, 0, nla_data(nla), data_len)) + BUG(); + } + + nlh->nlmsg_len = inst->skb->tail - old_tail; + return 0; + +nlmsg_failure: +nla_put_failure: + PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n"); + return -1; +} + +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) + +static struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_ULOG, + .u = { + .ulog = { + .copy_len = 0xffff, + .group = 0, + .qthreshold = 1, + }, + }, +}; + +/* log handler for internal netfilter logging api */ +void +nfulnl_log_packet(u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *li_user, + const char *prefix) +{ + unsigned int size, data_len; + struct nfulnl_instance *inst; + const struct nf_loginfo *li; + unsigned int qthreshold; + unsigned int plen; + + if (li_user && li_user->type == NF_LOG_TYPE_ULOG) + li = li_user; + else + li = &default_loginfo; + + inst = instance_lookup_get(li->u.ulog.group); + if 
(!inst) + return; + + plen = 0; + if (prefix) + plen = strlen(prefix) + 1; + + /* FIXME: do we want to make the size calculation conditional based on + * what is actually present? way more branches and checks, but more + * memory efficient... */ + size = NLMSG_SPACE(sizeof(struct nfgenmsg)) + + nla_total_size(sizeof(struct nfulnl_msg_packet_hdr)) + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ +#ifdef CONFIG_BRIDGE_NETFILTER + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ +#endif + + nla_total_size(sizeof(u_int32_t)) /* mark */ + + nla_total_size(sizeof(u_int32_t)) /* uid */ + + nla_total_size(sizeof(u_int32_t)) /* gid */ + + nla_total_size(plen) /* prefix */ + + nla_total_size(sizeof(struct nfulnl_msg_packet_hw)) + + nla_total_size(sizeof(struct nfulnl_msg_packet_timestamp)); + + if (in && skb_mac_header_was_set(skb)) { + size += nla_total_size(skb->dev->hard_header_len) + + nla_total_size(sizeof(u_int16_t)) /* hwtype */ + + nla_total_size(sizeof(u_int16_t)); /* hwlen */ + } + + spin_lock_bh(&inst->lock); + + if (inst->flags & NFULNL_CFG_F_SEQ) + size += nla_total_size(sizeof(u_int32_t)); + if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) + size += nla_total_size(sizeof(u_int32_t)); + + qthreshold = inst->qthreshold; + /* per-rule qthreshold overrides per-instance */ + if (li->u.ulog.qthreshold) + if (qthreshold > li->u.ulog.qthreshold) + qthreshold = li->u.ulog.qthreshold; + + + switch (inst->copy_mode) { + case NFULNL_COPY_META: + case NFULNL_COPY_NONE: + data_len = 0; + break; + + case NFULNL_COPY_PACKET: + if (inst->copy_range == 0 + || inst->copy_range > skb->len) + data_len = skb->len; + else + data_len = inst->copy_range; + + size += nla_total_size(data_len); + break; + + case NFULNL_COPY_DISABLED: + default: + goto unlock_and_release; + } + + if (inst->skb && + size > skb_tailroom(inst->skb) - sizeof(struct nfgenmsg)) { + /* either the queue len is too high or we don't have + * enough room in the skb left. flush to userspace. 
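+ * Otherwise packets keep accumulating in inst->skb: the batch is only
+ * pushed to the peer once the queue threshold is reached or the flush
+ * timer (flushtimeout, in hundredths of a second) fires.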
*/ + __nfulnl_flush(inst); + } + + if (!inst->skb) { + inst->skb = nfulnl_alloc_skb(inst->nlbufsiz, size); + if (!inst->skb) + goto alloc_failure; + } + + inst->qlen++; + + __build_packet_message(inst, skb, data_len, pf, + hooknum, in, out, prefix, plen); + + if (inst->qlen >= qthreshold) + __nfulnl_flush(inst); + /* timer_pending always called within inst->lock, so there + * is no chance of a race here */ + else if (!timer_pending(&inst->timer)) { + instance_get(inst); + inst->timer.expires = jiffies + (inst->flushtimeout*HZ/100); + add_timer(&inst->timer); + } + +unlock_and_release: + spin_unlock_bh(&inst->lock); + instance_put(inst); + return; + +alloc_failure: + /* FIXME: statistics */ + goto unlock_and_release; +} +EXPORT_SYMBOL_GPL(nfulnl_log_packet); + +static int +nfulnl_rcv_nl_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netlink_notify *n = ptr; + + if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { + int i; + + /* destroy all instances for this pid */ + spin_lock_bh(&instances_lock); + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct hlist_node *tmp, *t2; + struct nfulnl_instance *inst; + struct hlist_head *head = &instance_table[i]; + + hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { + if ((net_eq(n->net, &init_net)) && + (n->pid == inst->peer_pid)) + __instance_destroy(inst); + } + } + spin_unlock_bh(&instances_lock); + } + return NOTIFY_DONE; +} + +static struct notifier_block nfulnl_rtnl_notifier = { + .notifier_call = nfulnl_rcv_nl_event, +}; + +static int +nfulnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) +{ + return -ENOTSUPP; +} + +static struct nf_logger nfulnl_logger __read_mostly = { + .name = "nfnetlink_log", + .logfn = &nfulnl_log_packet, + .me = THIS_MODULE, +}; + +static const struct nla_policy nfula_cfg_policy[NFULA_CFG_MAX+1] = { + [NFULA_CFG_CMD] = { .len = sizeof(struct nfulnl_msg_config_cmd) }, + [NFULA_CFG_MODE] = { .len = sizeof(struct nfulnl_msg_config_mode) }, + [NFULA_CFG_TIMEOUT] = { .type = NLA_U32 }, + [NFULA_CFG_QTHRESH] = { .type = NLA_U32 }, + [NFULA_CFG_NLBUFSIZ] = { .type = NLA_U32 }, + [NFULA_CFG_FLAGS] = { .type = NLA_U16 }, +}; + +static int +nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nfula[]) +{ + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int16_t group_num = ntohs(nfmsg->res_id); + struct nfulnl_instance *inst; + struct nfulnl_msg_config_cmd *cmd = NULL; + int ret = 0; + + if (nfula[NFULA_CFG_CMD]) { + u_int8_t pf = nfmsg->nfgen_family; + cmd = nla_data(nfula[NFULA_CFG_CMD]); + + /* Commands without queue context */ + switch (cmd->command) { + case NFULNL_CFG_CMD_PF_BIND: + return nf_log_bind_pf(pf, &nfulnl_logger); + case NFULNL_CFG_CMD_PF_UNBIND: + nf_log_unbind_pf(pf); + return 0; + } + } + + inst = instance_lookup_get(group_num); + if (inst && inst->peer_pid != NETLINK_CB(skb).pid) { + ret = -EPERM; + goto out_put; + } + + if (cmd != NULL) { + switch (cmd->command) { + case NFULNL_CFG_CMD_BIND: + if (inst) { + ret = -EBUSY; + goto out_put; + } + + inst = instance_create(group_num, + NETLINK_CB(skb).pid); + if (IS_ERR(inst)) { + ret = PTR_ERR(inst); + goto out; + } + break; + case NFULNL_CFG_CMD_UNBIND: + if (!inst) { + ret = -ENODEV; + goto out; + } + + instance_destroy(inst); + goto out_put; + default: + ret = -ENOTSUPP; + break; + } + } + + if (nfula[NFULA_CFG_MODE]) { + struct nfulnl_msg_config_mode *params; + params = 
nla_data(nfula[NFULA_CFG_MODE]); + + if (!inst) { + ret = -ENODEV; + goto out; + } + nfulnl_set_mode(inst, params->copy_mode, + ntohl(params->copy_range)); + } + + if (nfula[NFULA_CFG_TIMEOUT]) { + __be32 timeout = nla_get_be32(nfula[NFULA_CFG_TIMEOUT]); + + if (!inst) { + ret = -ENODEV; + goto out; + } + nfulnl_set_timeout(inst, ntohl(timeout)); + } + + if (nfula[NFULA_CFG_NLBUFSIZ]) { + __be32 nlbufsiz = nla_get_be32(nfula[NFULA_CFG_NLBUFSIZ]); + + if (!inst) { + ret = -ENODEV; + goto out; + } + nfulnl_set_nlbufsiz(inst, ntohl(nlbufsiz)); + } + + if (nfula[NFULA_CFG_QTHRESH]) { + __be32 qthresh = nla_get_be32(nfula[NFULA_CFG_QTHRESH]); + + if (!inst) { + ret = -ENODEV; + goto out; + } + nfulnl_set_qthresh(inst, ntohl(qthresh)); + } + + if (nfula[NFULA_CFG_FLAGS]) { + __be16 flags = nla_get_be16(nfula[NFULA_CFG_FLAGS]); + + if (!inst) { + ret = -ENODEV; + goto out; + } + nfulnl_set_flags(inst, ntohs(flags)); + } + +out_put: + instance_put(inst); +out: + return ret; +} + +static const struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = { + [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp, + .attr_count = NFULA_MAX, }, + [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config, + .attr_count = NFULA_CFG_MAX, + .policy = nfula_cfg_policy }, +}; + +static const struct nfnetlink_subsystem nfulnl_subsys = { + .name = "log", + .subsys_id = NFNL_SUBSYS_ULOG, + .cb_count = NFULNL_MSG_MAX, + .cb = nfulnl_cb, +}; + +#ifdef CONFIG_PROC_FS +struct iter_state { + unsigned int bucket; +}; + +static struct hlist_node *get_first(struct iter_state *st) +{ + if (!st) + return NULL; + + for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { + if (!hlist_empty(&instance_table[st->bucket])) + return rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket])); + } + return NULL; +} + +static struct hlist_node *get_next(struct iter_state *st, struct hlist_node *h) +{ + h = rcu_dereference_bh(hlist_next_rcu(h)); + while (!h) { + if (++st->bucket >= INSTANCE_BUCKETS) + return NULL; + + h = rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket])); + } + return h; +} + +static struct hlist_node *get_idx(struct iter_state *st, loff_t pos) +{ + struct hlist_node *head; + head = get_first(st); + + if (head) + while (pos && (head = get_next(st, head))) + pos--; + return pos ? 
NULL : head; +} + +static void *seq_start(struct seq_file *seq, loff_t *pos) + __acquires(rcu_bh) +{ + rcu_read_lock_bh(); + return get_idx(seq->private, *pos); +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + return get_next(s->private, v); +} + +static void seq_stop(struct seq_file *s, void *v) + __releases(rcu_bh) +{ + rcu_read_unlock_bh(); +} + +static int seq_show(struct seq_file *s, void *v) +{ + const struct nfulnl_instance *inst = v; + + return seq_printf(s, "%5d %6d %5d %1d %5d %6d %2d\n", + inst->group_num, + inst->peer_pid, inst->qlen, + inst->copy_mode, inst->copy_range, + inst->flushtimeout, atomic_read(&inst->use)); +} + +static const struct seq_operations nful_seq_ops = { + .start = seq_start, + .next = seq_next, + .stop = seq_stop, + .show = seq_show, +}; + +static int nful_open(struct inode *inode, struct file *file) +{ + return seq_open_private(file, &nful_seq_ops, + sizeof(struct iter_state)); +} + +static const struct file_operations nful_file_ops = { + .owner = THIS_MODULE, + .open = nful_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif /* PROC_FS */ + +static int __init nfnetlink_log_init(void) +{ + int i, status = -ENOMEM; + + for (i = 0; i < INSTANCE_BUCKETS; i++) + INIT_HLIST_HEAD(&instance_table[i]); + + /* it's not really all that important to have a random value, so + * we can do this from the init function, even if there hasn't + * been that much entropy yet */ + get_random_bytes(&hash_init, sizeof(hash_init)); + + netlink_register_notifier(&nfulnl_rtnl_notifier); + status = nfnetlink_subsys_register(&nfulnl_subsys); + if (status < 0) { + printk(KERN_ERR "log: failed to create netlink socket\n"); + goto cleanup_netlink_notifier; + } + + status = nf_log_register(NFPROTO_UNSPEC, &nfulnl_logger); + if (status < 0) { + printk(KERN_ERR "log: failed to register logger\n"); + goto cleanup_subsys; + } + +#ifdef CONFIG_PROC_FS + if (!proc_create("nfnetlink_log", 0440, + proc_net_netfilter, &nful_file_ops)) + goto cleanup_logger; +#endif + return status; + +#ifdef CONFIG_PROC_FS +cleanup_logger: + nf_log_unregister(&nfulnl_logger); +#endif +cleanup_subsys: + nfnetlink_subsys_unregister(&nfulnl_subsys); +cleanup_netlink_notifier: + netlink_unregister_notifier(&nfulnl_rtnl_notifier); + return status; +} + +static void __exit nfnetlink_log_fini(void) +{ + nf_log_unregister(&nfulnl_logger); +#ifdef CONFIG_PROC_FS + remove_proc_entry("nfnetlink_log", proc_net_netfilter); +#endif + nfnetlink_subsys_unregister(&nfulnl_subsys); + netlink_unregister_notifier(&nfulnl_rtnl_notifier); +} + +MODULE_DESCRIPTION("netfilter userspace logging"); +MODULE_AUTHOR("Harald Welte "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ULOG); + +module_init(nfnetlink_log_init); +module_exit(nfnetlink_log_fini); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c new file mode 100644 index 00000000..fdd2fafe --- /dev/null +++ b/net/netfilter/nfnetlink_queue.c @@ -0,0 +1,950 @@ +/* + * This is a module which is used for queueing packets and communicating with + * userspace via nfnetlink. + * + * (C) 2005 by Harald Welte + * (C) 2007 by Patrick McHardy + * + * Based on the old ipv4-only ip_queue.c: + * (C) 2000-2002 James Morris + * (C) 2003-2005 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_BRIDGE_NETFILTER +#include "../bridge/br_private.h" +#endif + +#define NFQNL_QMAX_DEFAULT 1024 + +struct nfqnl_instance { + struct hlist_node hlist; /* global list of queues */ + struct rcu_head rcu; + + int peer_pid; + unsigned int queue_maxlen; + unsigned int copy_range; + unsigned int queue_dropped; + unsigned int queue_user_dropped; + + + u_int16_t queue_num; /* number of this queue */ + u_int8_t copy_mode; +/* + * Following fields are dirtied for each queued packet, + * keep them in same cache line if possible. + */ + spinlock_t lock; + unsigned int queue_total; + atomic_t id_sequence; /* 'sequence' of pkt ids */ + struct list_head queue_list; /* packets in queue */ +}; + +typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); + +static DEFINE_SPINLOCK(instances_lock); + +#define INSTANCE_BUCKETS 16 +static struct hlist_head instance_table[INSTANCE_BUCKETS] __read_mostly; + +static inline u_int8_t instance_hashfn(u_int16_t queue_num) +{ + return ((queue_num >> 8) | queue_num) % INSTANCE_BUCKETS; +} + +static struct nfqnl_instance * +instance_lookup(u_int16_t queue_num) +{ + struct hlist_head *head; + struct hlist_node *pos; + struct nfqnl_instance *inst; + + head = &instance_table[instance_hashfn(queue_num)]; + hlist_for_each_entry_rcu(inst, pos, head, hlist) { + if (inst->queue_num == queue_num) + return inst; + } + return NULL; +} + +static struct nfqnl_instance * +instance_create(u_int16_t queue_num, int pid) +{ + struct nfqnl_instance *inst; + unsigned int h; + int err; + + spin_lock(&instances_lock); + if (instance_lookup(queue_num)) { + err = -EEXIST; + goto out_unlock; + } + + inst = kzalloc(sizeof(*inst), GFP_ATOMIC); + if (!inst) { + err = -ENOMEM; + goto out_unlock; + } + + inst->queue_num = queue_num; + inst->peer_pid = pid; + inst->queue_maxlen = NFQNL_QMAX_DEFAULT; + inst->copy_range = 0xfffff; + inst->copy_mode = NFQNL_COPY_NONE; + spin_lock_init(&inst->lock); + INIT_LIST_HEAD(&inst->queue_list); + + if (!try_module_get(THIS_MODULE)) { + err = -EAGAIN; + goto out_free; + } + + h = instance_hashfn(queue_num); + hlist_add_head_rcu(&inst->hlist, &instance_table[h]); + + spin_unlock(&instances_lock); + + return inst; + +out_free: + kfree(inst); +out_unlock: + spin_unlock(&instances_lock); + return ERR_PTR(err); +} + +static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, + unsigned long data); + +static void +instance_destroy_rcu(struct rcu_head *head) +{ + struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance, + rcu); + + nfqnl_flush(inst, NULL, 0); + kfree(inst); + module_put(THIS_MODULE); +} + +static void +__instance_destroy(struct nfqnl_instance *inst) +{ + hlist_del_rcu(&inst->hlist); + call_rcu(&inst->rcu, instance_destroy_rcu); +} + +static void +instance_destroy(struct nfqnl_instance *inst) +{ + spin_lock(&instances_lock); + __instance_destroy(inst); + spin_unlock(&instances_lock); +} + +static inline void +__enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) +{ + list_add_tail(&entry->list, &queue->queue_list); + queue->queue_total++; +} + +static struct nf_queue_entry * +find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) +{ + struct nf_queue_entry *entry = NULL, *i; + + spin_lock_bh(&queue->lock); + + list_for_each_entry(i, &queue->queue_list, list) { + if (i->id == id) { + entry = i; 
+ break; + } + } + + if (entry) { + list_del(&entry->list); + queue->queue_total--; + } + + spin_unlock_bh(&queue->lock); + + return entry; +} + +static void +nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) +{ + struct nf_queue_entry *entry, *next; + + spin_lock_bh(&queue->lock); + list_for_each_entry_safe(entry, next, &queue->queue_list, list) { + if (!cmpfn || cmpfn(entry, data)) { + list_del(&entry->list); + queue->queue_total--; + nf_reinject(entry, NF_DROP); + } + } + spin_unlock_bh(&queue->lock); +} + +static struct sk_buff * +nfqnl_build_packet_message(struct nfqnl_instance *queue, + struct nf_queue_entry *entry) +{ + sk_buff_data_t old_tail; + size_t size; + size_t data_len = 0; + struct sk_buff *skb; + struct nfqnl_msg_packet_hdr pmsg; + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct sk_buff *entskb = entry->skb; + struct net_device *indev; + struct net_device *outdev; + + size = NLMSG_SPACE(sizeof(struct nfgenmsg)) + + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ +#ifdef CONFIG_BRIDGE_NETFILTER + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ +#endif + + nla_total_size(sizeof(u_int32_t)) /* mark */ + + nla_total_size(sizeof(struct nfqnl_msg_packet_hw)) + + nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); + + outdev = entry->outdev; + + switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) { + case NFQNL_COPY_META: + case NFQNL_COPY_NONE: + break; + + case NFQNL_COPY_PACKET: + if (entskb->ip_summed == CHECKSUM_PARTIAL && + skb_checksum_help(entskb)) + return NULL; + + data_len = ACCESS_ONCE(queue->copy_range); + if (data_len == 0 || data_len > entskb->len) + data_len = entskb->len; + + size += nla_total_size(data_len); + break; + } + + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + goto nlmsg_failure; + + old_tail = skb->tail; + nlh = NLMSG_PUT(skb, 0, 0, + NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, + sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + nfmsg->nfgen_family = entry->pf; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(queue->queue_num); + + entry->id = atomic_inc_return(&queue->id_sequence); + pmsg.packet_id = htonl(entry->id); + pmsg.hw_protocol = entskb->protocol; + pmsg.hook = entry->hook; + + NLA_PUT(skb, NFQA_PACKET_HDR, sizeof(pmsg), &pmsg); + + indev = entry->indev; + if (indev) { +#ifndef CONFIG_BRIDGE_NETFILTER + NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex)); +#else + if (entry->pf == PF_BRIDGE) { + /* Case 1: indev is physical input device, we need to + * look for bridge group (when called from + * netfilter_bridge) */ + NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSINDEV, + htonl(indev->ifindex)); + /* this is the bridge group "brX" */ + /* rcu_read_lock()ed by __nf_queue */ + NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV, + htonl(br_port_get_rcu(indev)->br->dev->ifindex)); + } else { + /* Case 2: indev is bridge group, we need to look for + * physical device (when called from ipv4) */ + NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV, + htonl(indev->ifindex)); + if (entskb->nf_bridge && entskb->nf_bridge->physindev) + NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSINDEV, + htonl(entskb->nf_bridge->physindev->ifindex)); + } +#endif + } + + if (outdev) { +#ifndef CONFIG_BRIDGE_NETFILTER + NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex)); +#else + if (entry->pf == PF_BRIDGE) { + /* Case 1: outdev is physical output device, we need 
to + * look for bridge group (when called from + * netfilter_bridge) */ + NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSOUTDEV, + htonl(outdev->ifindex)); + /* this is the bridge group "brX" */ + /* rcu_read_lock()ed by __nf_queue */ + NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV, + htonl(br_port_get_rcu(outdev)->br->dev->ifindex)); + } else { + /* Case 2: outdev is bridge group, we need to look for + * physical output device (when called from ipv4) */ + NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV, + htonl(outdev->ifindex)); + if (entskb->nf_bridge && entskb->nf_bridge->physoutdev) + NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSOUTDEV, + htonl(entskb->nf_bridge->physoutdev->ifindex)); + } +#endif + } + + if (entskb->mark) + NLA_PUT_BE32(skb, NFQA_MARK, htonl(entskb->mark)); + + if (indev && entskb->dev && + entskb->mac_header != entskb->network_header) { + struct nfqnl_msg_packet_hw phw; + int len = dev_parse_header(entskb, phw.hw_addr); + if (len) { + phw.hw_addrlen = htons(len); + NLA_PUT(skb, NFQA_HWADDR, sizeof(phw), &phw); + } + } + + if (entskb->tstamp.tv64) { + struct nfqnl_msg_packet_timestamp ts; + struct timeval tv = ktime_to_timeval(entskb->tstamp); + ts.sec = cpu_to_be64(tv.tv_sec); + ts.usec = cpu_to_be64(tv.tv_usec); + + NLA_PUT(skb, NFQA_TIMESTAMP, sizeof(ts), &ts); + } + + if (data_len) { + struct nlattr *nla; + int sz = nla_attr_size(data_len); + + if (skb_tailroom(skb) < nla_total_size(data_len)) { + printk(KERN_WARNING "nf_queue: no tailroom!\n"); + goto nlmsg_failure; + } + + nla = (struct nlattr *)skb_put(skb, nla_total_size(data_len)); + nla->nla_type = NFQA_PAYLOAD; + nla->nla_len = sz; + + if (skb_copy_bits(entskb, 0, nla_data(nla), data_len)) + BUG(); + } + + nlh->nlmsg_len = skb->tail - old_tail; + return skb; + +nlmsg_failure: +nla_put_failure: + if (skb) + kfree_skb(skb); + if (net_ratelimit()) + printk(KERN_ERR "nf_queue: error creating packet message\n"); + return NULL; +} + +static int +nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) +{ + struct sk_buff *nskb; + struct nfqnl_instance *queue; + int err = -ENOBUFS; + + /* rcu_read_lock()ed by nf_hook_slow() */ + queue = instance_lookup(queuenum); + if (!queue) { + err = -ESRCH; + goto err_out; + } + + if (queue->copy_mode == NFQNL_COPY_NONE) { + err = -EINVAL; + goto err_out; + } + + nskb = nfqnl_build_packet_message(queue, entry); + if (nskb == NULL) { + err = -ENOMEM; + goto err_out; + } + spin_lock_bh(&queue->lock); + + if (!queue->peer_pid) { + err = -EINVAL; + goto err_out_free_nskb; + } + if (queue->queue_total >= queue->queue_maxlen) { + queue->queue_dropped++; + if (net_ratelimit()) + printk(KERN_WARNING "nf_queue: full at %d entries, " + "dropping packets(s).\n", + queue->queue_total); + goto err_out_free_nskb; + } + + /* nfnetlink_unicast will either free the nskb or add it to a socket */ + err = nfnetlink_unicast(nskb, &init_net, queue->peer_pid, MSG_DONTWAIT); + if (err < 0) { + queue->queue_user_dropped++; + goto err_out_unlock; + } + + __enqueue_entry(queue, entry); + + spin_unlock_bh(&queue->lock); + return 0; + +err_out_free_nskb: + kfree_skb(nskb); +err_out_unlock: + spin_unlock_bh(&queue->lock); +err_out: + return err; +} + +static int +nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e) +{ + struct sk_buff *nskb; + int diff; + + diff = data_len - e->skb->len; + if (diff < 0) { + if (pskb_trim(e->skb, data_len)) + return -ENOMEM; + } else if (diff > 0) { + if (data_len > 0xFFFF) + return -EINVAL; + if (diff > skb_tailroom(e->skb)) { + nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), + 
diff, GFP_ATOMIC); + if (!nskb) { + printk(KERN_WARNING "nf_queue: OOM " + "in mangle, dropping packet\n"); + return -ENOMEM; + } + kfree_skb(e->skb); + e->skb = nskb; + } + skb_put(e->skb, diff); + } + if (!skb_make_writable(e->skb, data_len)) + return -ENOMEM; + skb_copy_to_linear_data(e->skb, data, data_len); + e->skb->ip_summed = CHECKSUM_NONE; + return 0; +} + +static int +nfqnl_set_mode(struct nfqnl_instance *queue, + unsigned char mode, unsigned int range) +{ + int status = 0; + + spin_lock_bh(&queue->lock); + switch (mode) { + case NFQNL_COPY_NONE: + case NFQNL_COPY_META: + queue->copy_mode = mode; + queue->copy_range = 0; + break; + + case NFQNL_COPY_PACKET: + queue->copy_mode = mode; + /* we're using struct nlattr which has 16bit nla_len */ + if (range > 0xffff) + queue->copy_range = 0xffff; + else + queue->copy_range = range; + break; + + default: + status = -EINVAL; + + } + spin_unlock_bh(&queue->lock); + + return status; +} + +static int +dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) +{ + if (entry->indev) + if (entry->indev->ifindex == ifindex) + return 1; + if (entry->outdev) + if (entry->outdev->ifindex == ifindex) + return 1; +#ifdef CONFIG_BRIDGE_NETFILTER + if (entry->skb->nf_bridge) { + if (entry->skb->nf_bridge->physindev && + entry->skb->nf_bridge->physindev->ifindex == ifindex) + return 1; + if (entry->skb->nf_bridge->physoutdev && + entry->skb->nf_bridge->physoutdev->ifindex == ifindex) + return 1; + } +#endif + return 0; +} + +/* drop all packets with either indev or outdev == ifindex from all queue + * instances */ +static void +nfqnl_dev_drop(int ifindex) +{ + int i; + + rcu_read_lock(); + + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct hlist_node *tmp; + struct nfqnl_instance *inst; + struct hlist_head *head = &instance_table[i]; + + hlist_for_each_entry_rcu(inst, tmp, head, hlist) + nfqnl_flush(inst, dev_cmp, ifindex); + } + + rcu_read_unlock(); +} + +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) + +static int +nfqnl_rcv_dev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + /* Drop any packets associated with the downed device */ + if (event == NETDEV_DOWN) + nfqnl_dev_drop(dev->ifindex); + return NOTIFY_DONE; +} + +static struct notifier_block nfqnl_dev_notifier = { + .notifier_call = nfqnl_rcv_dev_event, +}; + +static int +nfqnl_rcv_nl_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netlink_notify *n = ptr; + + if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { + int i; + + /* destroy all instances for this pid */ + spin_lock(&instances_lock); + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct hlist_node *tmp, *t2; + struct nfqnl_instance *inst; + struct hlist_head *head = &instance_table[i]; + + hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { + if ((n->net == &init_net) && + (n->pid == inst->peer_pid)) + __instance_destroy(inst); + } + } + spin_unlock(&instances_lock); + } + return NOTIFY_DONE; +} + +static struct notifier_block nfqnl_rtnl_notifier = { + .notifier_call = nfqnl_rcv_nl_event, +}; + +static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = { + [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, + [NFQA_MARK] = { .type = NLA_U32 }, + [NFQA_PAYLOAD] = { .type = NLA_UNSPEC }, +}; + +static int +nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const 
struct nlattr * const nfqa[]) +{ + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int16_t queue_num = ntohs(nfmsg->res_id); + + struct nfqnl_msg_verdict_hdr *vhdr; + struct nfqnl_instance *queue; + unsigned int verdict; + struct nf_queue_entry *entry; + int err; + + rcu_read_lock(); + queue = instance_lookup(queue_num); + if (!queue) { + err = -ENODEV; + goto err_out_unlock; + } + + if (queue->peer_pid != NETLINK_CB(skb).pid) { + err = -EPERM; + goto err_out_unlock; + } + + if (!nfqa[NFQA_VERDICT_HDR]) { + err = -EINVAL; + goto err_out_unlock; + } + + vhdr = nla_data(nfqa[NFQA_VERDICT_HDR]); + verdict = ntohl(vhdr->verdict); + + if ((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT) { + err = -EINVAL; + goto err_out_unlock; + } + + entry = find_dequeue_entry(queue, ntohl(vhdr->id)); + if (entry == NULL) { + err = -ENOENT; + goto err_out_unlock; + } + rcu_read_unlock(); + + if (nfqa[NFQA_PAYLOAD]) { + if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]), + nla_len(nfqa[NFQA_PAYLOAD]), entry) < 0) + verdict = NF_DROP; + } + + if (nfqa[NFQA_MARK]) + entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); + + nf_reinject(entry, verdict); + return 0; + +err_out_unlock: + rcu_read_unlock(); + return err; +} + +static int +nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) +{ + return -ENOTSUPP; +} + +static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = { + [NFQA_CFG_CMD] = { .len = sizeof(struct nfqnl_msg_config_cmd) }, + [NFQA_CFG_PARAMS] = { .len = sizeof(struct nfqnl_msg_config_params) }, +}; + +static const struct nf_queue_handler nfqh = { + .name = "nf_queue", + .outfn = &nfqnl_enqueue_packet, +}; + +static int +nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) +{ + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int16_t queue_num = ntohs(nfmsg->res_id); + struct nfqnl_instance *queue; + struct nfqnl_msg_config_cmd *cmd = NULL; + int ret = 0; + + if (nfqa[NFQA_CFG_CMD]) { + cmd = nla_data(nfqa[NFQA_CFG_CMD]); + + /* Commands without queue context - might sleep */ + switch (cmd->command) { + case NFQNL_CFG_CMD_PF_BIND: + return nf_register_queue_handler(ntohs(cmd->pf), + &nfqh); + case NFQNL_CFG_CMD_PF_UNBIND: + return nf_unregister_queue_handler(ntohs(cmd->pf), + &nfqh); + } + } + + rcu_read_lock(); + queue = instance_lookup(queue_num); + if (queue && queue->peer_pid != NETLINK_CB(skb).pid) { + ret = -EPERM; + goto err_out_unlock; + } + + if (cmd != NULL) { + switch (cmd->command) { + case NFQNL_CFG_CMD_BIND: + if (queue) { + ret = -EBUSY; + goto err_out_unlock; + } + queue = instance_create(queue_num, NETLINK_CB(skb).pid); + if (IS_ERR(queue)) { + ret = PTR_ERR(queue); + goto err_out_unlock; + } + break; + case NFQNL_CFG_CMD_UNBIND: + if (!queue) { + ret = -ENODEV; + goto err_out_unlock; + } + instance_destroy(queue); + break; + case NFQNL_CFG_CMD_PF_BIND: + case NFQNL_CFG_CMD_PF_UNBIND: + break; + default: + ret = -ENOTSUPP; + break; + } + } + + if (nfqa[NFQA_CFG_PARAMS]) { + struct nfqnl_msg_config_params *params; + + if (!queue) { + ret = -ENODEV; + goto err_out_unlock; + } + params = nla_data(nfqa[NFQA_CFG_PARAMS]); + nfqnl_set_mode(queue, params->copy_mode, + ntohl(params->copy_range)); + } + + if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) { + __be32 *queue_maxlen; + + if (!queue) { + ret = -ENODEV; + goto err_out_unlock; + } + queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]); + spin_lock_bh(&queue->lock); + queue->queue_maxlen = 
ntohl(*queue_maxlen); + spin_unlock_bh(&queue->lock); + } + +err_out_unlock: + rcu_read_unlock(); + return ret; +} + +static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { + [NFQNL_MSG_PACKET] = { .call = nfqnl_recv_unsupp, + .attr_count = NFQA_MAX, }, + [NFQNL_MSG_VERDICT] = { .call = nfqnl_recv_verdict, + .attr_count = NFQA_MAX, + .policy = nfqa_verdict_policy }, + [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, + .attr_count = NFQA_CFG_MAX, + .policy = nfqa_cfg_policy }, +}; + +static const struct nfnetlink_subsystem nfqnl_subsys = { + .name = "nf_queue", + .subsys_id = NFNL_SUBSYS_QUEUE, + .cb_count = NFQNL_MSG_MAX, + .cb = nfqnl_cb, +}; + +#ifdef CONFIG_PROC_FS +struct iter_state { + unsigned int bucket; +}; + +static struct hlist_node *get_first(struct seq_file *seq) +{ + struct iter_state *st = seq->private; + + if (!st) + return NULL; + + for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { + if (!hlist_empty(&instance_table[st->bucket])) + return instance_table[st->bucket].first; + } + return NULL; +} + +static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) +{ + struct iter_state *st = seq->private; + + h = h->next; + while (!h) { + if (++st->bucket >= INSTANCE_BUCKETS) + return NULL; + + h = instance_table[st->bucket].first; + } + return h; +} + +static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) +{ + struct hlist_node *head; + head = get_first(seq); + + if (head) + while (pos && (head = get_next(seq, head))) + pos--; + return pos ? NULL : head; +} + +static void *seq_start(struct seq_file *seq, loff_t *pos) + __acquires(instances_lock) +{ + spin_lock(&instances_lock); + return get_idx(seq, *pos); +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + return get_next(s, v); +} + +static void seq_stop(struct seq_file *s, void *v) + __releases(instances_lock) +{ + spin_unlock(&instances_lock); +} + +static int seq_show(struct seq_file *s, void *v) +{ + const struct nfqnl_instance *inst = v; + + return seq_printf(s, "%5d %6d %5d %1d %5d %5d %5d %8d %2d\n", + inst->queue_num, + inst->peer_pid, inst->queue_total, + inst->copy_mode, inst->copy_range, + inst->queue_dropped, inst->queue_user_dropped, + atomic_read(&inst->id_sequence), 1); +} + +static const struct seq_operations nfqnl_seq_ops = { + .start = seq_start, + .next = seq_next, + .stop = seq_stop, + .show = seq_show, +}; + +static int nfqnl_open(struct inode *inode, struct file *file) +{ + return seq_open_private(file, &nfqnl_seq_ops, + sizeof(struct iter_state)); +} + +static const struct file_operations nfqnl_file_ops = { + .owner = THIS_MODULE, + .open = nfqnl_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif /* PROC_FS */ + +static int __init nfnetlink_queue_init(void) +{ + int i, status = -ENOMEM; + + for (i = 0; i < INSTANCE_BUCKETS; i++) + INIT_HLIST_HEAD(&instance_table[i]); + + netlink_register_notifier(&nfqnl_rtnl_notifier); + status = nfnetlink_subsys_register(&nfqnl_subsys); + if (status < 0) { + printk(KERN_ERR "nf_queue: failed to create netlink socket\n"); + goto cleanup_netlink_notifier; + } + +#ifdef CONFIG_PROC_FS + if (!proc_create("nfnetlink_queue", 0440, + proc_net_netfilter, &nfqnl_file_ops)) + goto cleanup_subsys; +#endif + + register_netdevice_notifier(&nfqnl_dev_notifier); + return status; + +#ifdef CONFIG_PROC_FS +cleanup_subsys: + nfnetlink_subsys_unregister(&nfqnl_subsys); +#endif +cleanup_netlink_notifier: + 
netlink_unregister_notifier(&nfqnl_rtnl_notifier); + return status; +} + +static void __exit nfnetlink_queue_fini(void) +{ + nf_unregister_queue_handlers(&nfqh); + unregister_netdevice_notifier(&nfqnl_dev_notifier); +#ifdef CONFIG_PROC_FS + remove_proc_entry("nfnetlink_queue", proc_net_netfilter); +#endif + nfnetlink_subsys_unregister(&nfqnl_subsys); + netlink_unregister_notifier(&nfqnl_rtnl_notifier); + + rcu_barrier(); /* Wait for completion of call_rcu()'s */ +} + +MODULE_DESCRIPTION("netfilter packet queue handler"); +MODULE_AUTHOR("Harald Welte "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE); + +module_init(nfnetlink_queue_init); +module_exit(nfnetlink_queue_fini); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c new file mode 100644 index 00000000..b0869fe3 --- /dev/null +++ b/net/netfilter/x_tables.c @@ -0,0 +1,1395 @@ +/* + * x_tables core - Backend for {ip,ip6,arp}_tables + * + * Copyright (C) 2006-2006 Harald Welte + * + * Based on existing ip_tables code which is + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2005 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module"); + +#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) + +struct compat_delta { + unsigned int offset; /* offset in kernel */ + int delta; /* delta in 32bit user land */ +}; + +struct xt_af { + struct mutex mutex; + struct list_head match; + struct list_head target; +#ifdef CONFIG_COMPAT + struct mutex compat_mutex; + struct compat_delta *compat_tab; + unsigned int number; /* number of slots in compat_tab[] */ + unsigned int cur; /* number of used slots in compat_tab[] */ +#endif +}; + +static struct xt_af *xt; + +static const char *const xt_prefix[NFPROTO_NUMPROTO] = { + [NFPROTO_UNSPEC] = "x", + [NFPROTO_IPV4] = "ip", + [NFPROTO_ARP] = "arp", + [NFPROTO_BRIDGE] = "eb", + [NFPROTO_IPV6] = "ip6", +}; + +/* Allow this many total (re)entries. */ +static const unsigned int xt_jumpstack_multiplier = 2; + +/* Registration hooks for targets. 
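+ *
+ * Targets and matches are simply kept on per-family lists (xt[af].target,
+ * xt[af].match) under xt[af].mutex; lookups later walk those lists by
+ * name and revision.  A minimal, purely illustrative extension would
+ * register itself once from its module init, e.g.:
+ *
+ *	static struct xt_target example_tg_reg __read_mostly = {
+ *		.name     = "EXAMPLE",
+ *		.revision = 0,
+ *		.family   = NFPROTO_UNSPEC,
+ *		.me       = THIS_MODULE,
+ *	};
+ *
+ *	static int __init example_tg_init(void)
+ *	{
+ *		return xt_register_target(&example_tg_reg);
+ *	}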
*/ +int +xt_register_target(struct xt_target *target) +{ + u_int8_t af = target->family; + int ret; + + ret = mutex_lock_interruptible(&xt[af].mutex); + if (ret != 0) + return ret; + list_add(&target->list, &xt[af].target); + mutex_unlock(&xt[af].mutex); + return ret; +} +EXPORT_SYMBOL(xt_register_target); + +void +xt_unregister_target(struct xt_target *target) +{ + u_int8_t af = target->family; + + mutex_lock(&xt[af].mutex); + list_del(&target->list); + mutex_unlock(&xt[af].mutex); +} +EXPORT_SYMBOL(xt_unregister_target); + +int +xt_register_targets(struct xt_target *target, unsigned int n) +{ + unsigned int i; + int err = 0; + + for (i = 0; i < n; i++) { + err = xt_register_target(&target[i]); + if (err) + goto err; + } + return err; + +err: + if (i > 0) + xt_unregister_targets(target, i); + return err; +} +EXPORT_SYMBOL(xt_register_targets); + +void +xt_unregister_targets(struct xt_target *target, unsigned int n) +{ + while (n-- > 0) + xt_unregister_target(&target[n]); +} +EXPORT_SYMBOL(xt_unregister_targets); + +int +xt_register_match(struct xt_match *match) +{ + u_int8_t af = match->family; + int ret; + + ret = mutex_lock_interruptible(&xt[af].mutex); + if (ret != 0) + return ret; + + list_add(&match->list, &xt[af].match); + mutex_unlock(&xt[af].mutex); + + return ret; +} +EXPORT_SYMBOL(xt_register_match); + +void +xt_unregister_match(struct xt_match *match) +{ + u_int8_t af = match->family; + + mutex_lock(&xt[af].mutex); + list_del(&match->list); + mutex_unlock(&xt[af].mutex); +} +EXPORT_SYMBOL(xt_unregister_match); + +int +xt_register_matches(struct xt_match *match, unsigned int n) +{ + unsigned int i; + int err = 0; + + for (i = 0; i < n; i++) { + err = xt_register_match(&match[i]); + if (err) + goto err; + } + return err; + +err: + if (i > 0) + xt_unregister_matches(match, i); + return err; +} +EXPORT_SYMBOL(xt_register_matches); + +void +xt_unregister_matches(struct xt_match *match, unsigned int n) +{ + while (n-- > 0) + xt_unregister_match(&match[n]); +} +EXPORT_SYMBOL(xt_unregister_matches); + + +/* + * These are weird, but module loading must not be done with mutex + * held (since they will register), and we have to have a single + * function to use. + */ + +/* Find match, grabs ref. Returns ERR_PTR() on error. */ +struct xt_match *xt_find_match(u8 af, const char *name, u8 revision) +{ + struct xt_match *m; + int err = -ENOENT; + + if (mutex_lock_interruptible(&xt[af].mutex) != 0) + return ERR_PTR(-EINTR); + + list_for_each_entry(m, &xt[af].match, list) { + if (strcmp(m->name, name) == 0) { + if (m->revision == revision) { + if (try_module_get(m->me)) { + mutex_unlock(&xt[af].mutex); + return m; + } + } else + err = -EPROTOTYPE; /* Found something. */ + } + } + mutex_unlock(&xt[af].mutex); + + if (af != NFPROTO_UNSPEC) + /* Try searching again in the family-independent list */ + return xt_find_match(NFPROTO_UNSPEC, name, revision); + + return ERR_PTR(err); +} +EXPORT_SYMBOL(xt_find_match); + +struct xt_match * +xt_request_find_match(uint8_t nfproto, const char *name, uint8_t revision) +{ + struct xt_match *match; + + match = xt_find_match(nfproto, name, revision); + if (IS_ERR(match)) { + request_module("%st_%s", xt_prefix[nfproto], name); + match = xt_find_match(nfproto, name, revision); + } + + return match; +} +EXPORT_SYMBOL_GPL(xt_request_find_match); + +/* Find target, grabs ref. Returns ERR_PTR() on error. 
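+ *
+ * As with matches above, a miss on the per-family list falls back to the
+ * NFPROTO_UNSPEC list, and a name hit with the wrong revision turns the
+ * error into -EPROTOTYPE.  xt_request_find_target() below additionally
+ * tries a "<prefix>t_<name>" module autoload (e.g. "ipt_REJECT" for an
+ * IPv4 target named "REJECT") before retrying the lookup.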
*/ +struct xt_target *xt_find_target(u8 af, const char *name, u8 revision) +{ + struct xt_target *t; + int err = -ENOENT; + + if (mutex_lock_interruptible(&xt[af].mutex) != 0) + return ERR_PTR(-EINTR); + + list_for_each_entry(t, &xt[af].target, list) { + if (strcmp(t->name, name) == 0) { + if (t->revision == revision) { + if (try_module_get(t->me)) { + mutex_unlock(&xt[af].mutex); + return t; + } + } else + err = -EPROTOTYPE; /* Found something. */ + } + } + mutex_unlock(&xt[af].mutex); + + if (af != NFPROTO_UNSPEC) + /* Try searching again in the family-independent list */ + return xt_find_target(NFPROTO_UNSPEC, name, revision); + + return ERR_PTR(err); +} +EXPORT_SYMBOL(xt_find_target); + +struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision) +{ + struct xt_target *target; + + target = xt_find_target(af, name, revision); + if (IS_ERR(target)) { + request_module("%st_%s", xt_prefix[af], name); + target = xt_find_target(af, name, revision); + } + + return target; +} +EXPORT_SYMBOL_GPL(xt_request_find_target); + +static int match_revfn(u8 af, const char *name, u8 revision, int *bestp) +{ + const struct xt_match *m; + int have_rev = 0; + + list_for_each_entry(m, &xt[af].match, list) { + if (strcmp(m->name, name) == 0) { + if (m->revision > *bestp) + *bestp = m->revision; + if (m->revision == revision) + have_rev = 1; + } + } + + if (af != NFPROTO_UNSPEC && !have_rev) + return match_revfn(NFPROTO_UNSPEC, name, revision, bestp); + + return have_rev; +} + +static int target_revfn(u8 af, const char *name, u8 revision, int *bestp) +{ + const struct xt_target *t; + int have_rev = 0; + + list_for_each_entry(t, &xt[af].target, list) { + if (strcmp(t->name, name) == 0) { + if (t->revision > *bestp) + *bestp = t->revision; + if (t->revision == revision) + have_rev = 1; + } + } + + if (af != NFPROTO_UNSPEC && !have_rev) + return target_revfn(NFPROTO_UNSPEC, name, revision, bestp); + + return have_rev; +} + +/* Returns true or false (if no such extension at all) */ +int xt_find_revision(u8 af, const char *name, u8 revision, int target, + int *err) +{ + int have_rev, best = -1; + + if (mutex_lock_interruptible(&xt[af].mutex) != 0) { + *err = -EINTR; + return 1; + } + if (target == 1) + have_rev = target_revfn(af, name, revision, &best); + else + have_rev = match_revfn(af, name, revision, &best); + mutex_unlock(&xt[af].mutex); + + /* Nothing at all? Return 0 to try loading module. */ + if (best == -1) { + *err = -ENOENT; + return 0; + } + + *err = best; + if (!have_rev) + *err = -EPROTONOSUPPORT; + return 1; +} +EXPORT_SYMBOL_GPL(xt_find_revision); + +static char *textify_hooks(char *buf, size_t size, unsigned int mask) +{ + static const char *const names[] = { + "PREROUTING", "INPUT", "FORWARD", + "OUTPUT", "POSTROUTING", "BROUTING", + }; + unsigned int i; + char *p = buf; + bool np = false; + int res; + + *p = '\0'; + for (i = 0; i < ARRAY_SIZE(names); ++i) { + if (!(mask & (1 << i))) + continue; + res = snprintf(p, size, "%s%s", np ? "/" : "", names[i]); + if (res > 0) { + size -= res; + p += res; + } + np = true; + } + + return buf; +} + +int xt_check_match(struct xt_mtchk_param *par, + unsigned int size, u_int8_t proto, bool inv_proto) +{ + int ret; + + if (XT_ALIGN(par->match->matchsize) != size && + par->match->matchsize != -1) { + /* + * ebt_among is exempt from centralized matchsize checking + * because it uses a dynamic-size data set. 
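+ * Such matches announce that by setting .matchsize to -1, so the size
+ * comparison above lets them through.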
+ */ + pr_err("%s_tables: %s.%u match: invalid size " + "%u (kernel) != (user) %u\n", + xt_prefix[par->family], par->match->name, + par->match->revision, + XT_ALIGN(par->match->matchsize), size); + return -EINVAL; + } + if (par->match->table != NULL && + strcmp(par->match->table, par->table) != 0) { + pr_err("%s_tables: %s match: only valid in %s table, not %s\n", + xt_prefix[par->family], par->match->name, + par->match->table, par->table); + return -EINVAL; + } + if (par->match->hooks && (par->hook_mask & ~par->match->hooks) != 0) { + char used[64], allow[64]; + + pr_err("%s_tables: %s match: used from hooks %s, but only " + "valid from %s\n", + xt_prefix[par->family], par->match->name, + textify_hooks(used, sizeof(used), par->hook_mask), + textify_hooks(allow, sizeof(allow), par->match->hooks)); + return -EINVAL; + } + if (par->match->proto && (par->match->proto != proto || inv_proto)) { + pr_err("%s_tables: %s match: only valid for protocol %u\n", + xt_prefix[par->family], par->match->name, + par->match->proto); + return -EINVAL; + } + if (par->match->checkentry != NULL) { + ret = par->match->checkentry(par); + if (ret < 0) + return ret; + else if (ret > 0) + /* Flag up potential errors. */ + return -EIO; + } + return 0; +} +EXPORT_SYMBOL_GPL(xt_check_match); + +#ifdef CONFIG_COMPAT +int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta) +{ + struct xt_af *xp = &xt[af]; + + if (!xp->compat_tab) { + if (!xp->number) + return -EINVAL; + xp->compat_tab = vmalloc(sizeof(struct compat_delta) * xp->number); + if (!xp->compat_tab) + return -ENOMEM; + xp->cur = 0; + } + + if (xp->cur >= xp->number) + return -EINVAL; + + if (xp->cur) + delta += xp->compat_tab[xp->cur - 1].delta; + xp->compat_tab[xp->cur].offset = offset; + xp->compat_tab[xp->cur].delta = delta; + xp->cur++; + return 0; +} +EXPORT_SYMBOL_GPL(xt_compat_add_offset); + +void xt_compat_flush_offsets(u_int8_t af) +{ + if (xt[af].compat_tab) { + vfree(xt[af].compat_tab); + xt[af].compat_tab = NULL; + xt[af].number = 0; + xt[af].cur = 0; + } +} +EXPORT_SYMBOL_GPL(xt_compat_flush_offsets); + +int xt_compat_calc_jump(u_int8_t af, unsigned int offset) +{ + struct compat_delta *tmp = xt[af].compat_tab; + int mid, left = 0, right = xt[af].cur - 1; + + while (left <= right) { + mid = (left + right) >> 1; + if (offset > tmp[mid].offset) + left = mid + 1; + else if (offset < tmp[mid].offset) + right = mid - 1; + else + return mid ? tmp[mid - 1].delta : 0; + } + return left ? tmp[left - 1].delta : 0; +} +EXPORT_SYMBOL_GPL(xt_compat_calc_jump); + +void xt_compat_init_offsets(u_int8_t af, unsigned int number) +{ + xt[af].number = number; + xt[af].cur = 0; +} +EXPORT_SYMBOL(xt_compat_init_offsets); + +int xt_compat_match_offset(const struct xt_match *match) +{ + u_int16_t csize = match->compatsize ? 
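+			/*
+			 * Hedged illustration: for a hypothetical match with
+			 * a 16-byte native layout and a 12-byte compat
+			 * (32-bit) layout, the offset computed below is
+			 * XT_ALIGN(16) - COMPAT_XT_ALIGN(12).
+			 */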
: match->matchsize; + return XT_ALIGN(match->matchsize) - COMPAT_XT_ALIGN(csize); +} +EXPORT_SYMBOL_GPL(xt_compat_match_offset); + +int xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, + unsigned int *size) +{ + const struct xt_match *match = m->u.kernel.match; + struct compat_xt_entry_match *cm = (struct compat_xt_entry_match *)m; + int pad, off = xt_compat_match_offset(match); + u_int16_t msize = cm->u.user.match_size; + + m = *dstptr; + memcpy(m, cm, sizeof(*cm)); + if (match->compat_from_user) + match->compat_from_user(m->data, cm->data); + else + memcpy(m->data, cm->data, msize - sizeof(*cm)); + pad = XT_ALIGN(match->matchsize) - match->matchsize; + if (pad > 0) + memset(m->data + match->matchsize, 0, pad); + + msize += off; + m->u.user.match_size = msize; + + *size += off; + *dstptr += msize; + return 0; +} +EXPORT_SYMBOL_GPL(xt_compat_match_from_user); + +int xt_compat_match_to_user(const struct xt_entry_match *m, + void __user **dstptr, unsigned int *size) +{ + const struct xt_match *match = m->u.kernel.match; + struct compat_xt_entry_match __user *cm = *dstptr; + int off = xt_compat_match_offset(match); + u_int16_t msize = m->u.user.match_size - off; + + if (copy_to_user(cm, m, sizeof(*cm)) || + put_user(msize, &cm->u.user.match_size) || + copy_to_user(cm->u.user.name, m->u.kernel.match->name, + strlen(m->u.kernel.match->name) + 1)) + return -EFAULT; + + if (match->compat_to_user) { + if (match->compat_to_user((void __user *)cm->data, m->data)) + return -EFAULT; + } else { + if (copy_to_user(cm->data, m->data, msize - sizeof(*cm))) + return -EFAULT; + } + + *size -= off; + *dstptr += msize; + return 0; +} +EXPORT_SYMBOL_GPL(xt_compat_match_to_user); +#endif /* CONFIG_COMPAT */ + +int xt_check_target(struct xt_tgchk_param *par, + unsigned int size, u_int8_t proto, bool inv_proto) +{ + int ret; + + if (XT_ALIGN(par->target->targetsize) != size) { + pr_err("%s_tables: %s.%u target: invalid size " + "%u (kernel) != (user) %u\n", + xt_prefix[par->family], par->target->name, + par->target->revision, + XT_ALIGN(par->target->targetsize), size); + return -EINVAL; + } + if (par->target->table != NULL && + strcmp(par->target->table, par->table) != 0) { + pr_err("%s_tables: %s target: only valid in %s table, not %s\n", + xt_prefix[par->family], par->target->name, + par->target->table, par->table); + return -EINVAL; + } + if (par->target->hooks && (par->hook_mask & ~par->target->hooks) != 0) { + char used[64], allow[64]; + + pr_err("%s_tables: %s target: used from hooks %s, but only " + "usable from %s\n", + xt_prefix[par->family], par->target->name, + textify_hooks(used, sizeof(used), par->hook_mask), + textify_hooks(allow, sizeof(allow), par->target->hooks)); + return -EINVAL; + } + if (par->target->proto && (par->target->proto != proto || inv_proto)) { + pr_err("%s_tables: %s target: only valid for protocol %u\n", + xt_prefix[par->family], par->target->name, + par->target->proto); + return -EINVAL; + } + if (par->target->checkentry != NULL) { + ret = par->target->checkentry(par); + if (ret < 0) + return ret; + else if (ret > 0) + /* Flag up potential errors. */ + return -EIO; + } + return 0; +} +EXPORT_SYMBOL_GPL(xt_check_target); + +#ifdef CONFIG_COMPAT +int xt_compat_target_offset(const struct xt_target *target) +{ + u_int16_t csize = target->compatsize ? 
: target->targetsize; + return XT_ALIGN(target->targetsize) - COMPAT_XT_ALIGN(csize); +} +EXPORT_SYMBOL_GPL(xt_compat_target_offset); + +void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr, + unsigned int *size) +{ + const struct xt_target *target = t->u.kernel.target; + struct compat_xt_entry_target *ct = (struct compat_xt_entry_target *)t; + int pad, off = xt_compat_target_offset(target); + u_int16_t tsize = ct->u.user.target_size; + + t = *dstptr; + memcpy(t, ct, sizeof(*ct)); + if (target->compat_from_user) + target->compat_from_user(t->data, ct->data); + else + memcpy(t->data, ct->data, tsize - sizeof(*ct)); + pad = XT_ALIGN(target->targetsize) - target->targetsize; + if (pad > 0) + memset(t->data + target->targetsize, 0, pad); + + tsize += off; + t->u.user.target_size = tsize; + + *size += off; + *dstptr += tsize; +} +EXPORT_SYMBOL_GPL(xt_compat_target_from_user); + +int xt_compat_target_to_user(const struct xt_entry_target *t, + void __user **dstptr, unsigned int *size) +{ + const struct xt_target *target = t->u.kernel.target; + struct compat_xt_entry_target __user *ct = *dstptr; + int off = xt_compat_target_offset(target); + u_int16_t tsize = t->u.user.target_size - off; + + if (copy_to_user(ct, t, sizeof(*ct)) || + put_user(tsize, &ct->u.user.target_size) || + copy_to_user(ct->u.user.name, t->u.kernel.target->name, + strlen(t->u.kernel.target->name) + 1)) + return -EFAULT; + + if (target->compat_to_user) { + if (target->compat_to_user((void __user *)ct->data, t->data)) + return -EFAULT; + } else { + if (copy_to_user(ct->data, t->data, tsize - sizeof(*ct))) + return -EFAULT; + } + + *size -= off; + *dstptr += tsize; + return 0; +} +EXPORT_SYMBOL_GPL(xt_compat_target_to_user); +#endif + +struct xt_table_info *xt_alloc_table_info(unsigned int size) +{ + struct xt_table_info *newinfo; + int cpu; + + /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ + if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages) + return NULL; + + newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL); + if (!newinfo) + return NULL; + + newinfo->size = size; + + for_each_possible_cpu(cpu) { + if (size <= PAGE_SIZE) + newinfo->entries[cpu] = kmalloc_node(size, + GFP_KERNEL, + cpu_to_node(cpu)); + else + newinfo->entries[cpu] = vmalloc_node(size, + cpu_to_node(cpu)); + + if (newinfo->entries[cpu] == NULL) { + xt_free_table_info(newinfo); + return NULL; + } + } + + return newinfo; +} +EXPORT_SYMBOL(xt_alloc_table_info); + +void xt_free_table_info(struct xt_table_info *info) +{ + int cpu; + + for_each_possible_cpu(cpu) { + if (info->size <= PAGE_SIZE) + kfree(info->entries[cpu]); + else + vfree(info->entries[cpu]); + } + + if (info->jumpstack != NULL) { + if (sizeof(void *) * info->stacksize > PAGE_SIZE) { + for_each_possible_cpu(cpu) + vfree(info->jumpstack[cpu]); + } else { + for_each_possible_cpu(cpu) + kfree(info->jumpstack[cpu]); + } + } + + if (sizeof(void **) * nr_cpu_ids > PAGE_SIZE) + vfree(info->jumpstack); + else + kfree(info->jumpstack); + + free_percpu(info->stackptr); + + kfree(info); +} +EXPORT_SYMBOL(xt_free_table_info); + +/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. 
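+ *
+ * Typical calling pattern (an illustrative sketch, not a caller from this
+ * patch): on success the per-family mutex is held and a module reference
+ * has been taken, so both must be released separately:
+ *
+ *      struct xt_table *t;
+ *
+ *      t = xt_find_table_lock(net, NFPROTO_IPV4, "filter");
+ *      if (t == NULL || IS_ERR(t))
+ *              return t ? PTR_ERR(t) : -ENOENT;
+ *      ...
+ *      xt_table_unlock(t);
+ *      module_put(t->me);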
*/ +struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, + const char *name) +{ + struct xt_table *t; + + if (mutex_lock_interruptible(&xt[af].mutex) != 0) + return ERR_PTR(-EINTR); + + list_for_each_entry(t, &net->xt.tables[af], list) + if (strcmp(t->name, name) == 0 && try_module_get(t->me)) + return t; + mutex_unlock(&xt[af].mutex); + return NULL; +} +EXPORT_SYMBOL_GPL(xt_find_table_lock); + +void xt_table_unlock(struct xt_table *table) +{ + mutex_unlock(&xt[table->af].mutex); +} +EXPORT_SYMBOL_GPL(xt_table_unlock); + +#ifdef CONFIG_COMPAT +void xt_compat_lock(u_int8_t af) +{ + mutex_lock(&xt[af].compat_mutex); +} +EXPORT_SYMBOL_GPL(xt_compat_lock); + +void xt_compat_unlock(u_int8_t af) +{ + mutex_unlock(&xt[af].compat_mutex); +} +EXPORT_SYMBOL_GPL(xt_compat_unlock); +#endif + +DEFINE_PER_CPU(seqcount_t, xt_recseq); +EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq); + +static int xt_jumpstack_alloc(struct xt_table_info *i) +{ + unsigned int size; + int cpu; + + i->stackptr = alloc_percpu(unsigned int); + if (i->stackptr == NULL) + return -ENOMEM; + + size = sizeof(void **) * nr_cpu_ids; + if (size > PAGE_SIZE) + i->jumpstack = vmalloc(size); + else + i->jumpstack = kmalloc(size, GFP_KERNEL); + if (i->jumpstack == NULL) + return -ENOMEM; + memset(i->jumpstack, 0, size); + + i->stacksize *= xt_jumpstack_multiplier; + size = sizeof(void *) * i->stacksize; + for_each_possible_cpu(cpu) { + if (size > PAGE_SIZE) + i->jumpstack[cpu] = vmalloc_node(size, + cpu_to_node(cpu)); + else + i->jumpstack[cpu] = kmalloc_node(size, + GFP_KERNEL, cpu_to_node(cpu)); + if (i->jumpstack[cpu] == NULL) + /* + * Freeing will be done later on by the callers. The + * chain is: xt_replace_table -> __do_replace -> + * do_replace -> xt_free_table_info. + */ + return -ENOMEM; + } + + return 0; +} + +struct xt_table_info * +xt_replace_table(struct xt_table *table, + unsigned int num_counters, + struct xt_table_info *newinfo, + int *error) +{ + struct xt_table_info *private; + int ret; + + ret = xt_jumpstack_alloc(newinfo); + if (ret < 0) { + *error = ret; + return NULL; + } + + /* Do the substitution. */ + local_bh_disable(); + private = table->private; + + /* Check inside lock: is the old number correct? */ + if (num_counters != private->number) { + pr_debug("num_counters != table->private->number (%u/%u)\n", + num_counters, private->number); + local_bh_enable(); + *error = -EAGAIN; + return NULL; + } + + table->private = newinfo; + newinfo->initial_entries = private->initial_entries; + + /* + * Even though table entries have now been swapped, other CPU's + * may still be using the old entries. This is okay, because + * resynchronization happens because of the locking done + * during the get_counters() routine. + */ + local_bh_enable(); + +#ifdef CONFIG_AUDIT + if (audit_enabled) { + struct audit_buffer *ab; + + ab = audit_log_start(current->audit_context, GFP_KERNEL, + AUDIT_NETFILTER_CFG); + if (ab) { + audit_log_format(ab, "table=%s family=%u entries=%u", + table->name, table->af, + private->number); + audit_log_end(ab); + } + } +#endif + + return private; +} +EXPORT_SYMBOL_GPL(xt_replace_table); + +struct xt_table *xt_register_table(struct net *net, + const struct xt_table *input_table, + struct xt_table_info *bootstrap, + struct xt_table_info *newinfo) +{ + int ret; + struct xt_table_info *private; + struct xt_table *t, *table; + + /* Don't add one object to multiple lists. 
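+	 * input_table is therefore duplicated with kmemdup() below, so the
+	 * same const template can be registered once per namespace.  A
+	 * hedged sketch of a typical caller (names are placeholders):
+	 *
+	 *      newinfo = xt_alloc_table_info(repl->size);
+	 *      ...     (fill in the initial ruleset)
+	 *      table = xt_register_table(net, &template, &bootstrap, newinfo);
+	 *      if (IS_ERR(table))
+	 *              return PTR_ERR(table);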
*/ + table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL); + if (!table) { + ret = -ENOMEM; + goto out; + } + + ret = mutex_lock_interruptible(&xt[table->af].mutex); + if (ret != 0) + goto out_free; + + /* Don't autoload: we'd eat our tail... */ + list_for_each_entry(t, &net->xt.tables[table->af], list) { + if (strcmp(t->name, table->name) == 0) { + ret = -EEXIST; + goto unlock; + } + } + + /* Simplifies replace_table code. */ + table->private = bootstrap; + + if (!xt_replace_table(table, 0, newinfo, &ret)) + goto unlock; + + private = table->private; + pr_debug("table->private->number = %u\n", private->number); + + /* save number of initial entries */ + private->initial_entries = private->number; + + list_add(&table->list, &net->xt.tables[table->af]); + mutex_unlock(&xt[table->af].mutex); + return table; + + unlock: + mutex_unlock(&xt[table->af].mutex); +out_free: + kfree(table); +out: + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(xt_register_table); + +void *xt_unregister_table(struct xt_table *table) +{ + struct xt_table_info *private; + + mutex_lock(&xt[table->af].mutex); + private = table->private; + list_del(&table->list); + mutex_unlock(&xt[table->af].mutex); + kfree(table); + + return private; +} +EXPORT_SYMBOL_GPL(xt_unregister_table); + +#ifdef CONFIG_PROC_FS +struct xt_names_priv { + struct seq_net_private p; + u_int8_t af; +}; +static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct xt_names_priv *priv = seq->private; + struct net *net = seq_file_net(seq); + u_int8_t af = priv->af; + + mutex_lock(&xt[af].mutex); + return seq_list_start(&net->xt.tables[af], *pos); +} + +static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct xt_names_priv *priv = seq->private; + struct net *net = seq_file_net(seq); + u_int8_t af = priv->af; + + return seq_list_next(v, &net->xt.tables[af], pos); +} + +static void xt_table_seq_stop(struct seq_file *seq, void *v) +{ + struct xt_names_priv *priv = seq->private; + u_int8_t af = priv->af; + + mutex_unlock(&xt[af].mutex); +} + +static int xt_table_seq_show(struct seq_file *seq, void *v) +{ + struct xt_table *table = list_entry(v, struct xt_table, list); + + if (strlen(table->name)) + return seq_printf(seq, "%s\n", table->name); + else + return 0; +} + +static const struct seq_operations xt_table_seq_ops = { + .start = xt_table_seq_start, + .next = xt_table_seq_next, + .stop = xt_table_seq_stop, + .show = xt_table_seq_show, +}; + +static int xt_table_open(struct inode *inode, struct file *file) +{ + int ret; + struct xt_names_priv *priv; + + ret = seq_open_net(inode, file, &xt_table_seq_ops, + sizeof(struct xt_names_priv)); + if (!ret) { + priv = ((struct seq_file *)file->private_data)->private; + priv->af = (unsigned long)PDE(inode)->data; + } + return ret; +} + +static const struct file_operations xt_table_ops = { + .owner = THIS_MODULE, + .open = xt_table_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +/* + * Traverse state for ip{,6}_{tables,matches} for helping crossing + * the multi-AF mutexes. 
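+ *
+ * The proc files walk two lists protected by two different mutexes, so the
+ * iterator below is a small state machine:
+ *
+ *      MTTG_TRAV_INIT       -> lock NFPROTO_UNSPEC, walk its list
+ *      MTTG_TRAV_NFP_UNSPEC -> list exhausted: unlock it, lock the
+ *                              specific family and keep walking
+ *      MTTG_TRAV_NFP_SPEC   -> family list exhausted: done (the ->stop
+ *                              callback drops whichever mutex is held)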
+ */ +struct nf_mttg_trav { + struct list_head *head, *curr; + uint8_t class, nfproto; +}; + +enum { + MTTG_TRAV_INIT, + MTTG_TRAV_NFP_UNSPEC, + MTTG_TRAV_NFP_SPEC, + MTTG_TRAV_DONE, +}; + +static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos, + bool is_target) +{ + static const uint8_t next_class[] = { + [MTTG_TRAV_NFP_UNSPEC] = MTTG_TRAV_NFP_SPEC, + [MTTG_TRAV_NFP_SPEC] = MTTG_TRAV_DONE, + }; + struct nf_mttg_trav *trav = seq->private; + + switch (trav->class) { + case MTTG_TRAV_INIT: + trav->class = MTTG_TRAV_NFP_UNSPEC; + mutex_lock(&xt[NFPROTO_UNSPEC].mutex); + trav->head = trav->curr = is_target ? + &xt[NFPROTO_UNSPEC].target : &xt[NFPROTO_UNSPEC].match; + break; + case MTTG_TRAV_NFP_UNSPEC: + trav->curr = trav->curr->next; + if (trav->curr != trav->head) + break; + mutex_unlock(&xt[NFPROTO_UNSPEC].mutex); + mutex_lock(&xt[trav->nfproto].mutex); + trav->head = trav->curr = is_target ? + &xt[trav->nfproto].target : &xt[trav->nfproto].match; + trav->class = next_class[trav->class]; + break; + case MTTG_TRAV_NFP_SPEC: + trav->curr = trav->curr->next; + if (trav->curr != trav->head) + break; + /* fallthru, _stop will unlock */ + default: + return NULL; + } + + if (ppos != NULL) + ++*ppos; + return trav; +} + +static void *xt_mttg_seq_start(struct seq_file *seq, loff_t *pos, + bool is_target) +{ + struct nf_mttg_trav *trav = seq->private; + unsigned int j; + + trav->class = MTTG_TRAV_INIT; + for (j = 0; j < *pos; ++j) + if (xt_mttg_seq_next(seq, NULL, NULL, is_target) == NULL) + return NULL; + return trav; +} + +static void xt_mttg_seq_stop(struct seq_file *seq, void *v) +{ + struct nf_mttg_trav *trav = seq->private; + + switch (trav->class) { + case MTTG_TRAV_NFP_UNSPEC: + mutex_unlock(&xt[NFPROTO_UNSPEC].mutex); + break; + case MTTG_TRAV_NFP_SPEC: + mutex_unlock(&xt[trav->nfproto].mutex); + break; + } +} + +static void *xt_match_seq_start(struct seq_file *seq, loff_t *pos) +{ + return xt_mttg_seq_start(seq, pos, false); +} + +static void *xt_match_seq_next(struct seq_file *seq, void *v, loff_t *ppos) +{ + return xt_mttg_seq_next(seq, v, ppos, false); +} + +static int xt_match_seq_show(struct seq_file *seq, void *v) +{ + const struct nf_mttg_trav *trav = seq->private; + const struct xt_match *match; + + switch (trav->class) { + case MTTG_TRAV_NFP_UNSPEC: + case MTTG_TRAV_NFP_SPEC: + if (trav->curr == trav->head) + return 0; + match = list_entry(trav->curr, struct xt_match, list); + return (*match->name == '\0') ? 
0 : + seq_printf(seq, "%s\n", match->name); + } + return 0; +} + +static const struct seq_operations xt_match_seq_ops = { + .start = xt_match_seq_start, + .next = xt_match_seq_next, + .stop = xt_mttg_seq_stop, + .show = xt_match_seq_show, +}; + +static int xt_match_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct nf_mttg_trav *trav; + int ret; + + trav = kmalloc(sizeof(*trav), GFP_KERNEL); + if (trav == NULL) + return -ENOMEM; + + ret = seq_open(file, &xt_match_seq_ops); + if (ret < 0) { + kfree(trav); + return ret; + } + + seq = file->private_data; + seq->private = trav; + trav->nfproto = (unsigned long)PDE(inode)->data; + return 0; +} + +static const struct file_operations xt_match_ops = { + .owner = THIS_MODULE, + .open = xt_match_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static void *xt_target_seq_start(struct seq_file *seq, loff_t *pos) +{ + return xt_mttg_seq_start(seq, pos, true); +} + +static void *xt_target_seq_next(struct seq_file *seq, void *v, loff_t *ppos) +{ + return xt_mttg_seq_next(seq, v, ppos, true); +} + +static int xt_target_seq_show(struct seq_file *seq, void *v) +{ + const struct nf_mttg_trav *trav = seq->private; + const struct xt_target *target; + + switch (trav->class) { + case MTTG_TRAV_NFP_UNSPEC: + case MTTG_TRAV_NFP_SPEC: + if (trav->curr == trav->head) + return 0; + target = list_entry(trav->curr, struct xt_target, list); + return (*target->name == '\0') ? 0 : + seq_printf(seq, "%s\n", target->name); + } + return 0; +} + +static const struct seq_operations xt_target_seq_ops = { + .start = xt_target_seq_start, + .next = xt_target_seq_next, + .stop = xt_mttg_seq_stop, + .show = xt_target_seq_show, +}; + +static int xt_target_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct nf_mttg_trav *trav; + int ret; + + trav = kmalloc(sizeof(*trav), GFP_KERNEL); + if (trav == NULL) + return -ENOMEM; + + ret = seq_open(file, &xt_target_seq_ops); + if (ret < 0) { + kfree(trav); + return ret; + } + + seq = file->private_data; + seq->private = trav; + trav->nfproto = (unsigned long)PDE(inode)->data; + return 0; +} + +static const struct file_operations xt_target_ops = { + .owner = THIS_MODULE, + .open = xt_target_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#define FORMAT_TABLES "_tables_names" +#define FORMAT_MATCHES "_tables_matches" +#define FORMAT_TARGETS "_tables_targets" + +#endif /* CONFIG_PROC_FS */ + +/** + * xt_hook_link - set up hooks for a new table + * @table: table with metadata needed to set up hooks + * @fn: Hook function + * + * This function will take care of creating and registering the necessary + * Netfilter hooks for XT tables. 
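+ *
+ * Illustrative call from a table module (a hedged sketch; the names below
+ * are placeholders, not symbols from this patch):
+ *
+ *      static struct nf_hook_ops *filter_ops __read_mostly;
+ *
+ *      filter_ops = xt_hook_link(&packet_filter, my_table_hook);
+ *      if (IS_ERR(filter_ops))
+ *              return PTR_ERR(filter_ops);
+ *
+ * One nf_hook_ops entry is filled in per bit set in table->valid_hooks,
+ * all sharing the same hook function, family and priority.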
+ */ +struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn) +{ + unsigned int hook_mask = table->valid_hooks; + uint8_t i, num_hooks = hweight32(hook_mask); + uint8_t hooknum; + struct nf_hook_ops *ops; + int ret; + + ops = kmalloc(sizeof(*ops) * num_hooks, GFP_KERNEL); + if (ops == NULL) + return ERR_PTR(-ENOMEM); + + for (i = 0, hooknum = 0; i < num_hooks && hook_mask != 0; + hook_mask >>= 1, ++hooknum) { + if (!(hook_mask & 1)) + continue; + ops[i].hook = fn; + ops[i].owner = table->me; + ops[i].pf = table->af; + ops[i].hooknum = hooknum; + ops[i].priority = table->priority; + ++i; + } + + ret = nf_register_hooks(ops, num_hooks); + if (ret < 0) { + kfree(ops); + return ERR_PTR(ret); + } + + return ops; +} +EXPORT_SYMBOL_GPL(xt_hook_link); + +/** + * xt_hook_unlink - remove hooks for a table + * @ops: nf_hook_ops array as returned by nf_hook_link + * @hook_mask: the very same mask that was passed to nf_hook_link + */ +void xt_hook_unlink(const struct xt_table *table, struct nf_hook_ops *ops) +{ + nf_unregister_hooks(ops, hweight32(table->valid_hooks)); + kfree(ops); +} +EXPORT_SYMBOL_GPL(xt_hook_unlink); + +int xt_proto_init(struct net *net, u_int8_t af) +{ +#ifdef CONFIG_PROC_FS + char buf[XT_FUNCTION_MAXNAMELEN]; + struct proc_dir_entry *proc; +#endif + + if (af >= ARRAY_SIZE(xt_prefix)) + return -EINVAL; + + +#ifdef CONFIG_PROC_FS + strlcpy(buf, xt_prefix[af], sizeof(buf)); + strlcat(buf, FORMAT_TABLES, sizeof(buf)); + proc = proc_create_data(buf, 0440, net->proc_net, &xt_table_ops, + (void *)(unsigned long)af); + if (!proc) + goto out; + + strlcpy(buf, xt_prefix[af], sizeof(buf)); + strlcat(buf, FORMAT_MATCHES, sizeof(buf)); + proc = proc_create_data(buf, 0440, net->proc_net, &xt_match_ops, + (void *)(unsigned long)af); + if (!proc) + goto out_remove_tables; + + strlcpy(buf, xt_prefix[af], sizeof(buf)); + strlcat(buf, FORMAT_TARGETS, sizeof(buf)); + proc = proc_create_data(buf, 0440, net->proc_net, &xt_target_ops, + (void *)(unsigned long)af); + if (!proc) + goto out_remove_matches; +#endif + + return 0; + +#ifdef CONFIG_PROC_FS +out_remove_matches: + strlcpy(buf, xt_prefix[af], sizeof(buf)); + strlcat(buf, FORMAT_MATCHES, sizeof(buf)); + proc_net_remove(net, buf); + +out_remove_tables: + strlcpy(buf, xt_prefix[af], sizeof(buf)); + strlcat(buf, FORMAT_TABLES, sizeof(buf)); + proc_net_remove(net, buf); +out: + return -1; +#endif +} +EXPORT_SYMBOL_GPL(xt_proto_init); + +void xt_proto_fini(struct net *net, u_int8_t af) +{ +#ifdef CONFIG_PROC_FS + char buf[XT_FUNCTION_MAXNAMELEN]; + + strlcpy(buf, xt_prefix[af], sizeof(buf)); + strlcat(buf, FORMAT_TABLES, sizeof(buf)); + proc_net_remove(net, buf); + + strlcpy(buf, xt_prefix[af], sizeof(buf)); + strlcat(buf, FORMAT_TARGETS, sizeof(buf)); + proc_net_remove(net, buf); + + strlcpy(buf, xt_prefix[af], sizeof(buf)); + strlcat(buf, FORMAT_MATCHES, sizeof(buf)); + proc_net_remove(net, buf); +#endif /*CONFIG_PROC_FS*/ +} +EXPORT_SYMBOL_GPL(xt_proto_fini); + +static int __net_init xt_net_init(struct net *net) +{ + int i; + + for (i = 0; i < NFPROTO_NUMPROTO; i++) + INIT_LIST_HEAD(&net->xt.tables[i]); + return 0; +} + +static struct pernet_operations xt_net_ops = { + .init = xt_net_init, +}; + +static int __init xt_init(void) +{ + unsigned int i; + int rv; + + for_each_possible_cpu(i) { + seqcount_init(&per_cpu(xt_recseq, i)); + } + + xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL); + if (!xt) + return -ENOMEM; + + for (i = 0; i < NFPROTO_NUMPROTO; i++) { + mutex_init(&xt[i].mutex); +#ifdef CONFIG_COMPAT + 
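+		/*
+		 * Per-family 32-bit compat state: its own mutex plus the
+		 * lazily allocated offset table used by
+		 * xt_compat_add_offset()/xt_compat_calc_jump().
+		 */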
mutex_init(&xt[i].compat_mutex); + xt[i].compat_tab = NULL; +#endif + INIT_LIST_HEAD(&xt[i].target); + INIT_LIST_HEAD(&xt[i].match); + } + rv = register_pernet_subsys(&xt_net_ops); + if (rv < 0) + kfree(xt); + return rv; +} + +static void __exit xt_fini(void) +{ + unregister_pernet_subsys(&xt_net_ops); + kfree(xt); +} + +module_init(xt_init); +module_exit(xt_fini); + diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c new file mode 100644 index 00000000..363a99ec --- /dev/null +++ b/net/netfilter/xt_AUDIT.c @@ -0,0 +1,222 @@ +/* + * Creates audit record for dropped/accepted packets + * + * (C) 2010-2011 Thomas Graf + * (C) 2010-2011 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. +*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Thomas Graf "); +MODULE_DESCRIPTION("Xtables: creates audit records for dropped/accepted packets"); +MODULE_ALIAS("ipt_AUDIT"); +MODULE_ALIAS("ip6t_AUDIT"); +MODULE_ALIAS("ebt_AUDIT"); +MODULE_ALIAS("arpt_AUDIT"); + +static void audit_proto(struct audit_buffer *ab, struct sk_buff *skb, + unsigned int proto, unsigned int offset) +{ + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_UDPLITE: { + const __be16 *pptr; + __be16 _ports[2]; + + pptr = skb_header_pointer(skb, offset, sizeof(_ports), _ports); + if (pptr == NULL) { + audit_log_format(ab, " truncated=1"); + return; + } + + audit_log_format(ab, " sport=%hu dport=%hu", + ntohs(pptr[0]), ntohs(pptr[1])); + } + break; + + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: { + const u8 *iptr; + u8 _ih[2]; + + iptr = skb_header_pointer(skb, offset, sizeof(_ih), &_ih); + if (iptr == NULL) { + audit_log_format(ab, " truncated=1"); + return; + } + + audit_log_format(ab, " icmptype=%hhu icmpcode=%hhu", + iptr[0], iptr[1]); + + } + break; + } +} + +static void audit_ip4(struct audit_buffer *ab, struct sk_buff *skb) +{ + struct iphdr _iph; + const struct iphdr *ih; + + ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); + if (!ih) { + audit_log_format(ab, " truncated=1"); + return; + } + + audit_log_format(ab, " saddr=%pI4 daddr=%pI4 ipid=%hu proto=%hhu", + &ih->saddr, &ih->daddr, ntohs(ih->id), ih->protocol); + + if (ntohs(ih->frag_off) & IP_OFFSET) { + audit_log_format(ab, " frag=1"); + return; + } + + audit_proto(ab, skb, ih->protocol, ih->ihl * 4); +} + +static void audit_ip6(struct audit_buffer *ab, struct sk_buff *skb) +{ + struct ipv6hdr _ip6h; + const struct ipv6hdr *ih; + u8 nexthdr; + int offset; + + ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h); + if (!ih) { + audit_log_format(ab, " truncated=1"); + return; + } + + nexthdr = ih->nexthdr; + offset = ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h), + &nexthdr); + + audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu", + &ih->saddr, &ih->daddr, nexthdr); + + if (offset) + audit_proto(ab, skb, nexthdr, offset); +} + +static unsigned int +audit_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_audit_info *info = par->targinfo; + struct audit_buffer *ab; + + ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT); + if (ab == NULL) + goto errout; + + audit_log_format(ab, "action=%hhu hook=%u len=%u inif=%s outif=%s", + info->type, par->hooknum, 
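+			  /*
+			   * Illustrative record prefix: something like
+			   * "action=0 hook=1 len=84 inif=eth0 outif=?",
+			   * extended below with mark=, smac=/dmac= and the
+			   * per-family address/port fields.
+			   */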
skb->len, + par->in ? par->in->name : "?", + par->out ? par->out->name : "?"); + + if (skb->mark) + audit_log_format(ab, " mark=%#x", skb->mark); + + if (skb->dev && skb->dev->type == ARPHRD_ETHER) { + audit_log_format(ab, " smac=%pM dmac=%pM macproto=0x%04x", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + ntohs(eth_hdr(skb)->h_proto)); + + if (par->family == NFPROTO_BRIDGE) { + switch (eth_hdr(skb)->h_proto) { + case __constant_htons(ETH_P_IP): + audit_ip4(ab, skb); + break; + + case __constant_htons(ETH_P_IPV6): + audit_ip6(ab, skb); + break; + } + } + } + + switch (par->family) { + case NFPROTO_IPV4: + audit_ip4(ab, skb); + break; + + case NFPROTO_IPV6: + audit_ip6(ab, skb); + break; + } + + audit_log_end(ab); + +errout: + return XT_CONTINUE; +} + +static unsigned int +audit_tg_ebt(struct sk_buff *skb, const struct xt_action_param *par) +{ + audit_tg(skb, par); + return EBT_CONTINUE; +} + +static int audit_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_audit_info *info = par->targinfo; + + if (info->type > XT_AUDIT_TYPE_MAX) { + pr_info("Audit type out of range (valid range: 0..%hhu)\n", + XT_AUDIT_TYPE_MAX); + return -ERANGE; + } + + return 0; +} + +static struct xt_target audit_tg_reg[] __read_mostly = { + { + .name = "AUDIT", + .family = NFPROTO_UNSPEC, + .target = audit_tg, + .targetsize = sizeof(struct xt_audit_info), + .checkentry = audit_tg_check, + .me = THIS_MODULE, + }, + { + .name = "AUDIT", + .family = NFPROTO_BRIDGE, + .target = audit_tg_ebt, + .targetsize = sizeof(struct xt_audit_info), + .checkentry = audit_tg_check, + .me = THIS_MODULE, + }, +}; + +static int __init audit_tg_init(void) +{ + return xt_register_targets(audit_tg_reg, ARRAY_SIZE(audit_tg_reg)); +} + +static void __exit audit_tg_exit(void) +{ + xt_unregister_targets(audit_tg_reg, ARRAY_SIZE(audit_tg_reg)); +} + +module_init(audit_tg_init); +module_exit(audit_tg_exit); diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c new file mode 100644 index 00000000..0f642ef8 --- /dev/null +++ b/net/netfilter/xt_CHECKSUM.c @@ -0,0 +1,70 @@ +/* iptables module for the packet checksum mangling + * + * (C) 2002 by Harald Welte + * (C) 2010 Red Hat, Inc. + * + * Author: Michael S. Tsirkin + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. +*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Michael S. 
Tsirkin "); +MODULE_DESCRIPTION("Xtables: checksum modification"); +MODULE_ALIAS("ipt_CHECKSUM"); +MODULE_ALIAS("ip6t_CHECKSUM"); + +static unsigned int +checksum_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + if (skb->ip_summed == CHECKSUM_PARTIAL) + skb_checksum_help(skb); + + return XT_CONTINUE; +} + +static int checksum_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_CHECKSUM_info *einfo = par->targinfo; + + if (einfo->operation & ~XT_CHECKSUM_OP_FILL) { + pr_info("unsupported CHECKSUM operation %x\n", einfo->operation); + return -EINVAL; + } + if (!einfo->operation) { + pr_info("no CHECKSUM operation enabled\n"); + return -EINVAL; + } + return 0; +} + +static struct xt_target checksum_tg_reg __read_mostly = { + .name = "CHECKSUM", + .family = NFPROTO_UNSPEC, + .target = checksum_tg, + .targetsize = sizeof(struct xt_CHECKSUM_info), + .table = "mangle", + .checkentry = checksum_tg_check, + .me = THIS_MODULE, +}; + +static int __init checksum_tg_init(void) +{ + return xt_register_target(&checksum_tg_reg); +} + +static void __exit checksum_tg_exit(void) +{ + xt_unregister_target(&checksum_tg_reg); +} + +module_init(checksum_tg_init); +module_exit(checksum_tg_exit); diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c new file mode 100644 index 00000000..af9c4dad --- /dev/null +++ b/net/netfilter/xt_CLASSIFY.c @@ -0,0 +1,73 @@ +/* + * This is a module which is used for setting the skb->priority field + * of an skb for qdisc classification. + */ + +/* (C) 2001-2002 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Patrick McHardy "); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: Qdisc classification"); +MODULE_ALIAS("ipt_CLASSIFY"); +MODULE_ALIAS("ip6t_CLASSIFY"); +MODULE_ALIAS("arpt_CLASSIFY"); + +static unsigned int +classify_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_classify_target_info *clinfo = par->targinfo; + + skb->priority = clinfo->priority; + return XT_CONTINUE; +} + +static struct xt_target classify_tg_reg[] __read_mostly = { + { + .name = "CLASSIFY", + .revision = 0, + .family = NFPROTO_UNSPEC, + .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | + (1 << NF_INET_POST_ROUTING), + .target = classify_tg, + .targetsize = sizeof(struct xt_classify_target_info), + .me = THIS_MODULE, + }, + { + .name = "CLASSIFY", + .revision = 0, + .family = NFPROTO_ARP, + .hooks = (1 << NF_ARP_OUT) | (1 << NF_ARP_FORWARD), + .target = classify_tg, + .targetsize = sizeof(struct xt_classify_target_info), + .me = THIS_MODULE, + }, +}; + +static int __init classify_tg_init(void) +{ + return xt_register_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg)); +} + +static void __exit classify_tg_exit(void) +{ + xt_unregister_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg)); +} + +module_init(classify_tg_init); +module_exit(classify_tg_exit); diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c new file mode 100644 index 00000000..e04dc282 --- /dev/null +++ b/net/netfilter/xt_CONNSECMARK.c @@ -0,0 +1,143 @@ +/* + * This module is used to copy security markings from packets + * to connections, and restore security markings from connections + * back to packets. 
This would normally be performed in conjunction + * with the SECMARK target and state match. + * + * Based somewhat on CONNMARK: + * Copyright (C) 2002,2004 MARA Systems AB + * by Henrik Nordstrom + * + * (C) 2006,2008 Red Hat, Inc., James Morris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris "); +MODULE_DESCRIPTION("Xtables: target for copying between connection and security mark"); +MODULE_ALIAS("ipt_CONNSECMARK"); +MODULE_ALIAS("ip6t_CONNSECMARK"); + +/* + * If the packet has a security mark and the connection does not, copy + * the security mark from the packet to the connection. + */ +static void secmark_save(const struct sk_buff *skb) +{ + if (skb->secmark) { + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + ct = nf_ct_get(skb, &ctinfo); + if (ct && !ct->secmark) { + ct->secmark = skb->secmark; + nf_conntrack_event_cache(IPCT_SECMARK, ct); + } + } +} + +/* + * If packet has no security mark, and the connection does, restore the + * security mark from the connection to the packet. + */ +static void secmark_restore(struct sk_buff *skb) +{ + if (!skb->secmark) { + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + ct = nf_ct_get(skb, &ctinfo); + if (ct && ct->secmark) + skb->secmark = ct->secmark; + } +} + +static unsigned int +connsecmark_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_connsecmark_target_info *info = par->targinfo; + + switch (info->mode) { + case CONNSECMARK_SAVE: + secmark_save(skb); + break; + + case CONNSECMARK_RESTORE: + secmark_restore(skb); + break; + + default: + BUG(); + } + + return XT_CONTINUE; +} + +static int connsecmark_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_connsecmark_target_info *info = par->targinfo; + int ret; + + if (strcmp(par->table, "mangle") != 0 && + strcmp(par->table, "security") != 0) { + pr_info("target only valid in the \'mangle\' " + "or \'security\' tables, not \'%s\'.\n", par->table); + return -EINVAL; + } + + switch (info->mode) { + case CONNSECMARK_SAVE: + case CONNSECMARK_RESTORE: + break; + + default: + pr_info("invalid mode: %hu\n", info->mode); + return -EINVAL; + } + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + return ret; +} + +static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par) +{ + nf_ct_l3proto_module_put(par->family); +} + +static struct xt_target connsecmark_tg_reg __read_mostly = { + .name = "CONNSECMARK", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = connsecmark_tg_check, + .destroy = connsecmark_tg_destroy, + .target = connsecmark_tg, + .targetsize = sizeof(struct xt_connsecmark_target_info), + .me = THIS_MODULE, +}; + +static int __init connsecmark_tg_init(void) +{ + return xt_register_target(&connsecmark_tg_reg); +} + +static void __exit connsecmark_tg_exit(void) +{ + xt_unregister_target(&connsecmark_tg_reg); +} + +module_init(connsecmark_tg_init); +module_exit(connsecmark_tg_exit); diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c new file mode 100644 index 00000000..782e5198 --- /dev/null +++ b/net/netfilter/xt_CT.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2010 Patrick McHardy + * + * This 
program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned int xt_ct_target(struct sk_buff *skb, + const struct xt_action_param *par) +{ + const struct xt_ct_target_info *info = par->targinfo; + struct nf_conn *ct = info->ct; + + /* Previously seen (loopback)? Ignore. */ + if (skb->nfct != NULL) + return XT_CONTINUE; + + atomic_inc(&ct->ct_general.use); + skb->nfct = &ct->ct_general; + skb->nfctinfo = IP_CT_NEW; + + return XT_CONTINUE; +} + +static u8 xt_ct_find_proto(const struct xt_tgchk_param *par) +{ + if (par->family == NFPROTO_IPV4) { + const struct ipt_entry *e = par->entryinfo; + + if (e->ip.invflags & IPT_INV_PROTO) + return 0; + return e->ip.proto; + } else if (par->family == NFPROTO_IPV6) { + const struct ip6t_entry *e = par->entryinfo; + + if (e->ipv6.invflags & IP6T_INV_PROTO) + return 0; + return e->ipv6.proto; + } else + return 0; +} + +static int xt_ct_tg_check(const struct xt_tgchk_param *par) +{ + struct xt_ct_target_info *info = par->targinfo; + struct nf_conntrack_tuple t; + struct nf_conn_help *help; + struct nf_conn *ct; + int ret = 0; + u8 proto; + + if (info->flags & ~XT_CT_NOTRACK) + return -EINVAL; + + if (info->flags & XT_CT_NOTRACK) { + ct = nf_ct_untracked_get(); + atomic_inc(&ct->ct_general.use); + goto out; + } + +#ifndef CONFIG_NF_CONNTRACK_ZONES + if (info->zone) + goto err1; +#endif + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) + goto err1; + + memset(&t, 0, sizeof(t)); + ct = nf_conntrack_alloc(par->net, info->zone, &t, &t, GFP_KERNEL); + ret = PTR_ERR(ct); + if (IS_ERR(ct)) + goto err2; + + ret = 0; + if ((info->ct_events || info->exp_events) && + !nf_ct_ecache_ext_add(ct, info->ct_events, info->exp_events, + GFP_KERNEL)) + goto err3; + + if (info->helper[0]) { + ret = -ENOENT; + proto = xt_ct_find_proto(par); + if (!proto) + goto err3; + + ret = -ENOMEM; + help = nf_ct_helper_ext_add(ct, GFP_KERNEL); + if (help == NULL) + goto err3; + + ret = -ENOENT; + help->helper = nf_conntrack_helper_try_module_get(info->helper, + par->family, + proto); + if (help->helper == NULL) + goto err3; + } + + __set_bit(IPS_TEMPLATE_BIT, &ct->status); + __set_bit(IPS_CONFIRMED_BIT, &ct->status); +out: + info->ct = ct; + return 0; + +err3: + nf_conntrack_free(ct); +err2: + nf_ct_l3proto_module_put(par->family); +err1: + return ret; +} + +static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par) +{ + struct xt_ct_target_info *info = par->targinfo; + struct nf_conn *ct = info->ct; + struct nf_conn_help *help; + + if (!nf_ct_is_untracked(ct)) { + help = nfct_help(ct); + if (help) + module_put(help->helper->me); + + nf_ct_l3proto_module_put(par->family); + } + nf_ct_put(info->ct); +} + +static struct xt_target xt_ct_tg __read_mostly = { + .name = "CT", + .family = NFPROTO_UNSPEC, + .targetsize = sizeof(struct xt_ct_target_info), + .checkentry = xt_ct_tg_check, + .destroy = xt_ct_tg_destroy, + .target = xt_ct_target, + .table = "raw", + .me = THIS_MODULE, +}; + +static int __init xt_ct_tg_init(void) +{ + return xt_register_target(&xt_ct_tg); +} + +static void __exit xt_ct_tg_exit(void) +{ + xt_unregister_target(&xt_ct_tg); +} + +module_init(xt_ct_tg_init); +module_exit(xt_ct_tg_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: connection tracking target"); +MODULE_ALIAS("ipt_CT"); 
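+
+/*
+ * Illustrative userspace use of this target (a hedged example, not part of
+ * the module): attach a conntrack template from the raw table, e.g.
+ *
+ *      iptables -t raw -A PREROUTING -p tcp --dport 21 -j CT --helper ftp
+ *      iptables -t raw -A PREROUTING -j CT --notrack
+ *
+ * The template built in xt_ct_tg_check() is then assigned to matching skbs
+ * by xt_ct_target().
+ */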
+MODULE_ALIAS("ip6t_CT"); diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c new file mode 100644 index 00000000..a192f55e --- /dev/null +++ b/net/netfilter/xt_IDLETIMER.c @@ -0,0 +1,408 @@ +/* + * linux/net/netfilter/xt_IDLETIMER.c + * + * Netfilter module to trigger a timer when packet matches. + * After timer expires a kevent will be sent. + * + * Copyright (C) 2004, 2010 Nokia Corporation + * + * Written by Timo Teras + * + * Converted to x_tables and reworked for upstream inclusion + * by Luciano Coelho + * + * Contact: Luciano Coelho + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct sock *nl_sk; + +struct idletimer_tg_attr { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, + struct attribute *attr, char *buf); +}; + +struct idletimer_tg { + struct list_head entry; + struct timer_list timer; + struct work_struct work; + + struct kobject *kobj; + struct idletimer_tg_attr attr; + + unsigned int refcnt; + bool send_nl_msg; + bool active; +}; + +static LIST_HEAD(idletimer_tg_list); +static DEFINE_MUTEX(list_mutex); + +static struct kobject *idletimer_tg_kobj; + +static void notify_netlink(const char *iface, struct idletimer_tg *timer) +{ + struct sk_buff *log_skb; + size_t size; + struct nlmsghdr *nlh; + char str[NLMSG_MAX_SIZE]; + int event_type, res; + + size = NLMSG_SPACE(NLMSG_MAX_SIZE); + size = max(size, (size_t)NLMSG_GOODSIZE); + log_skb = alloc_skb(size, GFP_ATOMIC); + if (!log_skb) { + pr_err("xt_cannot alloc skb for logging\n"); + return; + } + + event_type = timer->active ? NL_EVENT_TYPE_ACTIVE + : NL_EVENT_TYPE_INACTIVE; + res = snprintf(str, NLMSG_MAX_SIZE, "%s %s\n", iface, + timer->active ? 
"ACTIVE" : "INACTIVE"); + if (NLMSG_MAX_SIZE <= res) + goto nlmsg_failure; + + /* NLMSG_PUT() uses "goto nlmsg_failure" */ + nlh = NLMSG_PUT(log_skb, /*pid*/0, /*seq*/0, event_type, + /* Size of message */NLMSG_MAX_SIZE); + + strncpy(NLMSG_DATA(nlh), str, MAX_IDLETIMER_LABEL_SIZE); + + NETLINK_CB(log_skb).dst_group = 1; + netlink_broadcast(nl_sk, log_skb, 0, 1, GFP_ATOMIC); + + pr_debug("putting nlmsg: %s", str); + return; + +nlmsg_failure: /* Used within NLMSG_PUT() */ + consume_skb(log_skb); + pr_debug("Failed nlmsg_put\n"); +} + +static +struct idletimer_tg *__idletimer_tg_find_by_label(const char *label) +{ + struct idletimer_tg *entry; + + BUG_ON(!label); + + list_for_each_entry(entry, &idletimer_tg_list, entry) { + if (!strcmp(label, entry->attr.attr.name)) + return entry; + } + + return NULL; +} + +static ssize_t idletimer_tg_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct idletimer_tg *timer; + unsigned long expires = 0; + unsigned long now = jiffies; + + mutex_lock(&list_mutex); + + timer = __idletimer_tg_find_by_label(attr->name); + if (timer) + expires = timer->timer.expires; + + mutex_unlock(&list_mutex); + + if (time_after(expires, now)) + return sprintf(buf, "%u\n", + jiffies_to_msecs(expires - now) / 1000); + + if (timer->send_nl_msg) + return sprintf(buf, "0 %d\n", + jiffies_to_msecs(now - expires) / 1000); + else + return sprintf(buf, "0\n"); +} + +static void idletimer_tg_work(struct work_struct *work) +{ + struct idletimer_tg *timer = container_of(work, struct idletimer_tg, + work); + + sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name); + + if (timer->send_nl_msg) + notify_netlink(timer->attr.attr.name, timer); +} + +static void idletimer_tg_expired(unsigned long data) +{ + struct idletimer_tg *timer = (struct idletimer_tg *) data; + + pr_debug("timer %s expired\n", timer->attr.attr.name); + + timer->active = false; + schedule_work(&timer->work); +} + +static int idletimer_tg_create(struct idletimer_tg_info *info) +{ + int ret; + + info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL); + if (!info->timer) { + pr_debug("couldn't alloc timer\n"); + ret = -ENOMEM; + goto out; + } + + info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); + if (!info->timer->attr.attr.name) { + pr_debug("couldn't alloc attribute name\n"); + ret = -ENOMEM; + goto out_free_timer; + } + info->timer->attr.attr.mode = S_IRUGO; + info->timer->attr.show = idletimer_tg_show; + + ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr); + if (ret < 0) { + pr_debug("couldn't add file to sysfs"); + goto out_free_attr; + } + + list_add(&info->timer->entry, &idletimer_tg_list); + + setup_timer(&info->timer->timer, idletimer_tg_expired, + (unsigned long) info->timer); + info->timer->refcnt = 1; + info->timer->send_nl_msg = (info->send_nl_msg == 0) ? false : true; + info->timer->active = true; + + mod_timer(&info->timer->timer, + msecs_to_jiffies(info->timeout * 1000) + jiffies); + + INIT_WORK(&info->timer->work, idletimer_tg_work); + + return 0; + +out_free_attr: + kfree(info->timer->attr.attr.name); +out_free_timer: + kfree(info->timer); +out: + return ret; +} + +/* + * The actual xt_tables plugin. 
+ */ +static unsigned int idletimer_tg_target(struct sk_buff *skb, + const struct xt_action_param *par) +{ + const struct idletimer_tg_info *info = par->targinfo; + unsigned long now = jiffies; + + pr_debug("resetting timer %s, timeout period %u\n", + info->label, info->timeout); + + BUG_ON(!info->timer); + + info->timer->active = true; + + if (time_before(info->timer->timer.expires, now)) { + schedule_work(&info->timer->work); + pr_debug("Starting timer %s (Expired, Jiffies): %lu, %lu\n", + info->label, info->timer->timer.expires, now); + } + + /* TODO: Avoid modifying timers on each packet */ + mod_timer(&info->timer->timer, + msecs_to_jiffies(info->timeout * 1000) + now); + + return XT_CONTINUE; +} + +static int idletimer_tg_checkentry(const struct xt_tgchk_param *par) +{ + struct idletimer_tg_info *info = par->targinfo; + int ret; + unsigned long now = jiffies; + + pr_debug("checkentry targinfo %s\n", info->label); + + if (info->timeout == 0) { + pr_debug("timeout value is zero\n"); + return -EINVAL; + } + + if (info->label[0] == '\0' || + strnlen(info->label, + MAX_IDLETIMER_LABEL_SIZE) == MAX_IDLETIMER_LABEL_SIZE) { + pr_debug("label is empty or not nul-terminated\n"); + return -EINVAL; + } + + mutex_lock(&list_mutex); + + info->timer = __idletimer_tg_find_by_label(info->label); + if (info->timer) { + info->timer->refcnt++; + info->timer->active = true; + + if (time_before(info->timer->timer.expires, now)) { + schedule_work(&info->timer->work); + pr_debug("Starting Checkentry timer" + "(Expired, Jiffies): %lu, %lu\n", + info->timer->timer.expires, now); + } + + mod_timer(&info->timer->timer, + msecs_to_jiffies(info->timeout * 1000) + now); + + pr_debug("increased refcnt of timer %s to %u\n", + info->label, info->timer->refcnt); + } else { + ret = idletimer_tg_create(info); + if (ret < 0) { + pr_debug("failed to create timer\n"); + mutex_unlock(&list_mutex); + return ret; + } + } + + mutex_unlock(&list_mutex); + + return 0; +} + +static void idletimer_tg_destroy(const struct xt_tgdtor_param *par) +{ + const struct idletimer_tg_info *info = par->targinfo; + + pr_debug("destroy targinfo %s\n", info->label); + + mutex_lock(&list_mutex); + + if (--info->timer->refcnt == 0) { + pr_debug("deleting timer %s\n", info->label); + + list_del(&info->timer->entry); + del_timer_sync(&info->timer->timer); + sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); + kfree(info->timer->attr.attr.name); + kfree(info->timer); + } else { + pr_debug("decreased refcnt of timer %s to %u\n", + info->label, info->timer->refcnt); + } + + mutex_unlock(&list_mutex); +} + +static struct xt_target idletimer_tg __read_mostly = { + .name = "IDLETIMER", + .revision = 1, + .family = NFPROTO_UNSPEC, + .target = idletimer_tg_target, + .targetsize = sizeof(struct idletimer_tg_info), + .checkentry = idletimer_tg_checkentry, + .destroy = idletimer_tg_destroy, + .me = THIS_MODULE, +}; + +static struct class *idletimer_tg_class; + +static struct device *idletimer_tg_device; + +static int __init idletimer_tg_init(void) +{ + int err; + + idletimer_tg_class = class_create(THIS_MODULE, "xt_idletimer"); + err = PTR_ERR(idletimer_tg_class); + if (IS_ERR(idletimer_tg_class)) { + pr_debug("couldn't register device class\n"); + goto out; + } + + idletimer_tg_device = device_create(idletimer_tg_class, NULL, + MKDEV(0, 0), NULL, "timers"); + err = PTR_ERR(idletimer_tg_device); + if (IS_ERR(idletimer_tg_device)) { + pr_debug("couldn't register system device\n"); + goto out_class; + } + + idletimer_tg_kobj = 
&idletimer_tg_device->kobj; + + err = xt_register_target(&idletimer_tg); + if (err < 0) { + pr_debug("couldn't register xt target\n"); + goto out_dev; + } + + nl_sk = netlink_kernel_create(&init_net, + NETLINK_IDLETIMER, 1, NULL, + NULL, THIS_MODULE); + + if (!nl_sk) { + pr_err("Failed to create netlink socket\n"); + return -ENOMEM; + } + + return 0; +out_dev: + device_destroy(idletimer_tg_class, MKDEV(0, 0)); +out_class: + class_destroy(idletimer_tg_class); +out: + return err; +} + +static void __exit idletimer_tg_exit(void) +{ + xt_unregister_target(&idletimer_tg); + + device_destroy(idletimer_tg_class, MKDEV(0, 0)); + class_destroy(idletimer_tg_class); +} + +module_init(idletimer_tg_init); +module_exit(idletimer_tg_exit); + +MODULE_AUTHOR("Timo Teras "); +MODULE_AUTHOR("Luciano Coelho "); +MODULE_DESCRIPTION("Xtables: idle time monitor"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("ipt_IDLETIMER"); +MODULE_ALIAS("ip6t_IDLETIMER"); +MODULE_ALIAS("arpt_IDLETIMER"); diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c new file mode 100644 index 00000000..993de2ba --- /dev/null +++ b/net/netfilter/xt_LED.c @@ -0,0 +1,215 @@ +/* + * xt_LED.c - netfilter target to make LEDs blink upon packet matches + * + * Copyright (C) 2008 Adam Nielsen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301 USA. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include + +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Adam Nielsen "); +MODULE_DESCRIPTION("Xtables: trigger LED devices on packet match"); +MODULE_ALIAS("ipt_LED"); +MODULE_ALIAS("ip6t_LED"); + +static LIST_HEAD(xt_led_triggers); +static DEFINE_MUTEX(xt_led_mutex); + +/* + * This is declared in here (the kernel module) only, to avoid having these + * dependencies in userspace code. This is what xt_led_info.internal_data + * points to. + */ +struct xt_led_info_internal { + struct list_head list; + int refcnt; + char *trigger_id; + struct led_trigger netfilter_led_trigger; + struct timer_list timer; +}; + +static unsigned int +led_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_led_info *ledinfo = par->targinfo; + struct xt_led_info_internal *ledinternal = ledinfo->internal_data; + + /* + * If "always blink" is enabled, and there's still some time until the + * LED will switch off, briefly switch it off now. 
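+	 * A hedged illustration of the effect: with a 1000 ms delay and
+	 * packets arriving more often than once a second, the LED would
+	 * otherwise stay solidly on; the extra OFF/FULL pair below yields a
+	 * visible flicker per matching packet instead.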
+ */ + if ((ledinfo->delay > 0) && ledinfo->always_blink && + timer_pending(&ledinternal->timer)) + led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF); + + led_trigger_event(&ledinternal->netfilter_led_trigger, LED_FULL); + + /* If there's a positive delay, start/update the timer */ + if (ledinfo->delay > 0) { + mod_timer(&ledinternal->timer, + jiffies + msecs_to_jiffies(ledinfo->delay)); + + /* Otherwise if there was no delay given, blink as fast as possible */ + } else if (ledinfo->delay == 0) { + led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF); + } + + /* else the delay is negative, which means switch on and stay on */ + + return XT_CONTINUE; +} + +static void led_timeout_callback(unsigned long data) +{ + struct xt_led_info_internal *ledinternal = (struct xt_led_info_internal *)data; + + led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF); +} + +static struct xt_led_info_internal *led_trigger_lookup(const char *name) +{ + struct xt_led_info_internal *ledinternal; + + list_for_each_entry(ledinternal, &xt_led_triggers, list) { + if (!strcmp(name, ledinternal->netfilter_led_trigger.name)) { + return ledinternal; + } + } + return NULL; +} + +static int led_tg_check(const struct xt_tgchk_param *par) +{ + struct xt_led_info *ledinfo = par->targinfo; + struct xt_led_info_internal *ledinternal; + int err; + + if (ledinfo->id[0] == '\0') { + pr_info("No 'id' parameter given.\n"); + return -EINVAL; + } + + mutex_lock(&xt_led_mutex); + + ledinternal = led_trigger_lookup(ledinfo->id); + if (ledinternal) { + ledinternal->refcnt++; + goto out; + } + + err = -ENOMEM; + ledinternal = kzalloc(sizeof(struct xt_led_info_internal), GFP_KERNEL); + if (!ledinternal) + goto exit_mutex_only; + + ledinternal->trigger_id = kstrdup(ledinfo->id, GFP_KERNEL); + if (!ledinternal->trigger_id) + goto exit_internal_alloc; + + ledinternal->refcnt = 1; + ledinternal->netfilter_led_trigger.name = ledinternal->trigger_id; + + err = led_trigger_register(&ledinternal->netfilter_led_trigger); + if (err) { + pr_warning("led_trigger_register() failed\n"); + if (err == -EEXIST) + pr_warning("Trigger name is already in use.\n"); + goto exit_alloc; + } + + /* See if we need to set up a timer */ + if (ledinfo->delay > 0) + setup_timer(&ledinternal->timer, led_timeout_callback, + (unsigned long)ledinternal); + + list_add_tail(&ledinternal->list, &xt_led_triggers); + +out: + mutex_unlock(&xt_led_mutex); + + ledinfo->internal_data = ledinternal; + + return 0; + +exit_alloc: + kfree(ledinternal->trigger_id); + +exit_internal_alloc: + kfree(ledinternal); + +exit_mutex_only: + mutex_unlock(&xt_led_mutex); + + return err; +} + +static void led_tg_destroy(const struct xt_tgdtor_param *par) +{ + const struct xt_led_info *ledinfo = par->targinfo; + struct xt_led_info_internal *ledinternal = ledinfo->internal_data; + + mutex_lock(&xt_led_mutex); + + if (--ledinternal->refcnt) { + mutex_unlock(&xt_led_mutex); + return; + } + + list_del(&ledinternal->list); + + if (ledinfo->delay > 0) + del_timer_sync(&ledinternal->timer); + + led_trigger_unregister(&ledinternal->netfilter_led_trigger); + + mutex_unlock(&xt_led_mutex); + + kfree(ledinternal->trigger_id); + kfree(ledinternal); +} + +static struct xt_target led_tg_reg __read_mostly = { + .name = "LED", + .revision = 0, + .family = NFPROTO_UNSPEC, + .target = led_tg, + .targetsize = sizeof(struct xt_led_info), + .checkentry = led_tg_check, + .destroy = led_tg_destroy, + .me = THIS_MODULE, +}; + +static int __init led_tg_init(void) +{ + return 
xt_register_target(&led_tg_reg); +} + +static void __exit led_tg_exit(void) +{ + xt_unregister_target(&led_tg_reg); +} + +module_init(led_tg_init); +module_exit(led_tg_exit); diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c new file mode 100644 index 00000000..a17dd0f5 --- /dev/null +++ b/net/netfilter/xt_NFLOG.c @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2006 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +#include +#include +#include +#include + +MODULE_AUTHOR("Patrick McHardy "); +MODULE_DESCRIPTION("Xtables: packet logging to netlink using NFLOG"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_NFLOG"); +MODULE_ALIAS("ip6t_NFLOG"); + +static unsigned int +nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_nflog_info *info = par->targinfo; + struct nf_loginfo li; + + li.type = NF_LOG_TYPE_ULOG; + li.u.ulog.copy_len = info->len; + li.u.ulog.group = info->group; + li.u.ulog.qthreshold = info->threshold; + + nfulnl_log_packet(par->family, par->hooknum, skb, par->in, + par->out, &li, info->prefix); + return XT_CONTINUE; +} + +static int nflog_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_nflog_info *info = par->targinfo; + + if (info->flags & ~XT_NFLOG_MASK) + return -EINVAL; + if (info->prefix[sizeof(info->prefix) - 1] != '\0') + return -EINVAL; + return 0; +} + +static struct xt_target nflog_tg_reg __read_mostly = { + .name = "NFLOG", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = nflog_tg_check, + .target = nflog_tg, + .targetsize = sizeof(struct xt_nflog_info), + .me = THIS_MODULE, +}; + +static int __init nflog_tg_init(void) +{ + return xt_register_target(&nflog_tg_reg); +} + +static void __exit nflog_tg_exit(void) +{ + xt_unregister_target(&nflog_tg_reg); +} + +module_init(nflog_tg_init); +module_exit(nflog_tg_exit); diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c new file mode 100644 index 00000000..d4f4b5d6 --- /dev/null +++ b/net/netfilter/xt_NFQUEUE.c @@ -0,0 +1,160 @@ +/* iptables module for using new netfilter netlink queue + * + * (C) 2005 by Harald Welte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
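+ * + * Revision 1 of this target can balance flows across a block of queues; as + * a rough sketch of hash_v4()/nfqueue_tg_v1() below, the verdict queue is + * + * queue = queuenum + (((u64)jhash(saddr ^ daddr, proto) * queues_total) >> 32) + * + * so both directions of a flow map to the same queue and the 32-bit hash + * is scaled into [queuenum, queuenum + queues_total).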
+ * + */ + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("Xtables: packet forwarding to netlink"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_NFQUEUE"); +MODULE_ALIAS("ip6t_NFQUEUE"); +MODULE_ALIAS("arpt_NFQUEUE"); + +static u32 jhash_initval __read_mostly; +static bool rnd_inited __read_mostly; + +static unsigned int +nfqueue_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_NFQ_info *tinfo = par->targinfo; + + return NF_QUEUE_NR(tinfo->queuenum); +} + +static u32 hash_v4(const struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + __be32 ipaddr; + + /* packets in either direction go into same queue */ + ipaddr = iph->saddr ^ iph->daddr; + + return jhash_2words((__force u32)ipaddr, iph->protocol, jhash_initval); +} + +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +static u32 hash_v6(const struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + __be32 addr[4]; + + addr[0] = ip6h->saddr.s6_addr32[0] ^ ip6h->daddr.s6_addr32[0]; + addr[1] = ip6h->saddr.s6_addr32[1] ^ ip6h->daddr.s6_addr32[1]; + addr[2] = ip6h->saddr.s6_addr32[2] ^ ip6h->daddr.s6_addr32[2]; + addr[3] = ip6h->saddr.s6_addr32[3] ^ ip6h->daddr.s6_addr32[3]; + + return jhash2((__force u32 *)addr, ARRAY_SIZE(addr), jhash_initval); +} +#endif + +static unsigned int +nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_NFQ_info_v1 *info = par->targinfo; + u32 queue = info->queuenum; + + if (info->queues_total > 1) { + if (par->family == NFPROTO_IPV4) + queue = (((u64) hash_v4(skb) * info->queues_total) >> + 32) + queue; +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) + else if (par->family == NFPROTO_IPV6) + queue = (((u64) hash_v6(skb) * info->queues_total) >> + 32) + queue; +#endif + } + return NF_QUEUE_NR(queue); +} + +static unsigned int +nfqueue_tg_v2(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_NFQ_info_v2 *info = par->targinfo; + unsigned int ret = nfqueue_tg_v1(skb, par); + + if (info->bypass) + ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; + return ret; +} + +static int nfqueue_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_NFQ_info_v2 *info = par->targinfo; + u32 maxid; + + if (unlikely(!rnd_inited)) { + get_random_bytes(&jhash_initval, sizeof(jhash_initval)); + rnd_inited = true; + } + if (info->queues_total == 0) { + pr_err("NFQUEUE: number of total queues is 0\n"); + return -EINVAL; + } + maxid = info->queues_total - 1 + info->queuenum; + if (maxid > 0xffff) { + pr_err("NFQUEUE: number of queues (%u) out of range (got %u)\n", + info->queues_total, maxid); + return -ERANGE; + } + if (par->target->revision == 2 && info->bypass > 1) + return -EINVAL; + return 0; +} + +static struct xt_target nfqueue_tg_reg[] __read_mostly = { + { + .name = "NFQUEUE", + .family = NFPROTO_UNSPEC, + .target = nfqueue_tg, + .targetsize = sizeof(struct xt_NFQ_info), + .me = THIS_MODULE, + }, + { + .name = "NFQUEUE", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = nfqueue_tg_check, + .target = nfqueue_tg_v1, + .targetsize = sizeof(struct xt_NFQ_info_v1), + .me = THIS_MODULE, + }, + { + .name = "NFQUEUE", + .revision = 2, + .family = NFPROTO_UNSPEC, + .checkentry = nfqueue_tg_check, + .target = nfqueue_tg_v2, + .targetsize = sizeof(struct xt_NFQ_info_v2), + .me = THIS_MODULE, + }, +}; + +static int __init nfqueue_tg_init(void) +{ + return 
xt_register_targets(nfqueue_tg_reg, ARRAY_SIZE(nfqueue_tg_reg)); +} + +static void __exit nfqueue_tg_exit(void) +{ + xt_unregister_targets(nfqueue_tg_reg, ARRAY_SIZE(nfqueue_tg_reg)); +} + +module_init(nfqueue_tg_init); +module_exit(nfqueue_tg_exit); diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c new file mode 100644 index 00000000..9d782181 --- /dev/null +++ b/net/netfilter/xt_NOTRACK.c @@ -0,0 +1,53 @@ +/* This is a module which is used for setting up fake conntracks + * on packets so that they are not seen by the conntrack/NAT code. + */ +#include +#include + +#include +#include + +MODULE_DESCRIPTION("Xtables: Disabling connection tracking for packets"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_NOTRACK"); +MODULE_ALIAS("ip6t_NOTRACK"); + +static unsigned int +notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + /* Previously seen (loopback)? Ignore. */ + if (skb->nfct != NULL) + return XT_CONTINUE; + + /* Attach fake conntrack entry. + If there is a real ct entry correspondig to this packet, + it'll hang aroun till timing out. We don't deal with it + for performance reasons. JK */ + skb->nfct = &nf_ct_untracked_get()->ct_general; + skb->nfctinfo = IP_CT_NEW; + nf_conntrack_get(skb->nfct); + + return XT_CONTINUE; +} + +static struct xt_target notrack_tg_reg __read_mostly = { + .name = "NOTRACK", + .revision = 0, + .family = NFPROTO_UNSPEC, + .target = notrack_tg, + .table = "raw", + .me = THIS_MODULE, +}; + +static int __init notrack_tg_init(void) +{ + return xt_register_target(¬rack_tg_reg); +} + +static void __exit notrack_tg_exit(void) +{ + xt_unregister_target(¬rack_tg_reg); +} + +module_init(notrack_tg_init); +module_exit(notrack_tg_exit); diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c new file mode 100644 index 00000000..9faf5e05 --- /dev/null +++ b/net/netfilter/xt_SECMARK.c @@ -0,0 +1,147 @@ +/* + * Module for modifying the secmark field of the skb, for use by + * security subsystems. + * + * Based on the nfmark match by: + * (C) 1999-2001 Marc Boucher + * + * (C) 2006,2008 Red Hat, Inc., James Morris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
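+ * + * Rules using this target are added from userspace; assuming the stock + * iptables SECMARK extension and an LSM policy that defines the context, + * a rule could look like: + * + * iptables -t security -A INPUT -p tcp --dport 22 \ + * -j SECMARK --selctx system_u:object_r:ssh_server_packet_t:s0 + * + * The context string is resolved to a secid at rule insertion time in + * checkentry_lsm() below.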
+ * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris "); +MODULE_DESCRIPTION("Xtables: packet security mark modification"); +MODULE_ALIAS("ipt_SECMARK"); +MODULE_ALIAS("ip6t_SECMARK"); + +#define PFX "SECMARK: " + +static u8 mode; + +static unsigned int +secmark_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + u32 secmark = 0; + const struct xt_secmark_target_info *info = par->targinfo; + + BUG_ON(info->mode != mode); + + switch (mode) { + case SECMARK_MODE_SEL: + secmark = info->secid; + break; + default: + BUG(); + } + + skb->secmark = secmark; + return XT_CONTINUE; +} + +static int checkentry_lsm(struct xt_secmark_target_info *info) +{ + int err; + + info->secctx[SECMARK_SECCTX_MAX - 1] = '\0'; + info->secid = 0; + + err = security_secctx_to_secid(info->secctx, strlen(info->secctx), + &info->secid); + if (err) { + if (err == -EINVAL) + pr_info("invalid security context \'%s\'\n", info->secctx); + return err; + } + + if (!info->secid) { + pr_info("unable to map security context \'%s\'\n", info->secctx); + return -ENOENT; + } + + err = security_secmark_relabel_packet(info->secid); + if (err) { + pr_info("unable to obtain relabeling permission\n"); + return err; + } + + security_secmark_refcount_inc(); + return 0; +} + +static int secmark_tg_check(const struct xt_tgchk_param *par) +{ + struct xt_secmark_target_info *info = par->targinfo; + int err; + + if (strcmp(par->table, "mangle") != 0 && + strcmp(par->table, "security") != 0) { + pr_info("target only valid in the \'mangle\' " + "or \'security\' tables, not \'%s\'.\n", par->table); + return -EINVAL; + } + + if (mode && mode != info->mode) { + pr_info("mode already set to %hu cannot mix with " + "rules for mode %hu\n", mode, info->mode); + return -EINVAL; + } + + switch (info->mode) { + case SECMARK_MODE_SEL: + break; + default: + pr_info("invalid mode: %hu\n", info->mode); + return -EINVAL; + } + + err = checkentry_lsm(info); + if (err) + return err; + + if (!mode) + mode = info->mode; + return 0; +} + +static void secmark_tg_destroy(const struct xt_tgdtor_param *par) +{ + switch (mode) { + case SECMARK_MODE_SEL: + security_secmark_refcount_dec(); + } +} + +static struct xt_target secmark_tg_reg __read_mostly = { + .name = "SECMARK", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = secmark_tg_check, + .destroy = secmark_tg_destroy, + .target = secmark_tg, + .targetsize = sizeof(struct xt_secmark_target_info), + .me = THIS_MODULE, +}; + +static int __init secmark_tg_init(void) +{ + return xt_register_target(&secmark_tg_reg); +} + +static void __exit secmark_tg_exit(void) +{ + xt_unregister_target(&secmark_tg_reg); +} + +module_init(secmark_tg_init); +module_exit(secmark_tg_exit); diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c new file mode 100644 index 00000000..9dc9ecfd --- /dev/null +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -0,0 +1,142 @@ +/* + * A module for stripping a specific TCP option from TCP packets. + * + * Copyright (C) 2007 Sven Schnelle + * Copyright © CC Computer Consultants GmbH, 2007 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
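+ * + * Matched options are overwritten in place with TCPOPT_NOP and the TCP + * checksum is adjusted, so the segment length never changes. Assuming the + * stock iptables TCPOPTSTRIP extension (option names are the extension's, + * not this module's), a rule could look like: + * + * iptables -t mangle -A POSTROUTING -p tcp \ + * -j TCPOPTSTRIP --strip-options timestamp,sack-permitted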
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset) +{ + /* Beware zero-length options: make finite progress */ + if (opt[offset] <= TCPOPT_NOP || opt[offset+1] == 0) + return 1; + else + return opt[offset+1]; +} + +static unsigned int +tcpoptstrip_mangle_packet(struct sk_buff *skb, + const struct xt_tcpoptstrip_target_info *info, + unsigned int tcphoff, unsigned int minlen) +{ + unsigned int optl, i, j; + struct tcphdr *tcph; + u_int16_t n, o; + u_int8_t *opt; + + if (!skb_make_writable(skb, skb->len)) + return NF_DROP; + + tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); + opt = (u_int8_t *)tcph; + + /* + * Walk through all TCP options - if we find some option to remove, + * set all octets to %TCPOPT_NOP and adjust checksum. + */ + for (i = sizeof(struct tcphdr); i < tcp_hdrlen(skb); i += optl) { + optl = optlen(opt, i); + + if (i + optl > tcp_hdrlen(skb)) + break; + + if (!tcpoptstrip_test_bit(info->strip_bmap, opt[i])) + continue; + + for (j = 0; j < optl; ++j) { + o = opt[i+j]; + n = TCPOPT_NOP; + if ((i + j) % 2 == 0) { + o <<= 8; + n <<= 8; + } + inet_proto_csum_replace2(&tcph->check, skb, htons(o), + htons(n), 0); + } + memset(opt + i, TCPOPT_NOP, optl); + } + + return XT_CONTINUE; +} + +static unsigned int +tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par) +{ + return tcpoptstrip_mangle_packet(skb, par->targinfo, ip_hdrlen(skb), + sizeof(struct iphdr) + sizeof(struct tcphdr)); +} + +#if defined(CONFIG_IP6_NF_MANGLE) || defined(CONFIG_IP6_NF_MANGLE_MODULE) +static unsigned int +tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + int tcphoff; + u_int8_t nexthdr; + + nexthdr = ipv6h->nexthdr; + tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr); + if (tcphoff < 0) + return NF_DROP; + + return tcpoptstrip_mangle_packet(skb, par->targinfo, tcphoff, + sizeof(*ipv6h) + sizeof(struct tcphdr)); +} +#endif + +static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = { + { + .name = "TCPOPTSTRIP", + .family = NFPROTO_IPV4, + .table = "mangle", + .proto = IPPROTO_TCP, + .target = tcpoptstrip_tg4, + .targetsize = sizeof(struct xt_tcpoptstrip_target_info), + .me = THIS_MODULE, + }, +#if defined(CONFIG_IP6_NF_MANGLE) || defined(CONFIG_IP6_NF_MANGLE_MODULE) + { + .name = "TCPOPTSTRIP", + .family = NFPROTO_IPV6, + .table = "mangle", + .proto = IPPROTO_TCP, + .target = tcpoptstrip_tg6, + .targetsize = sizeof(struct xt_tcpoptstrip_target_info), + .me = THIS_MODULE, + }, +#endif +}; + +static int __init tcpoptstrip_tg_init(void) +{ + return xt_register_targets(tcpoptstrip_tg_reg, + ARRAY_SIZE(tcpoptstrip_tg_reg)); +} + +static void __exit tcpoptstrip_tg_exit(void) +{ + xt_unregister_targets(tcpoptstrip_tg_reg, + ARRAY_SIZE(tcpoptstrip_tg_reg)); +} + +module_init(tcpoptstrip_tg_init); +module_exit(tcpoptstrip_tg_exit); +MODULE_AUTHOR("Sven Schnelle , Jan Engelhardt "); +MODULE_DESCRIPTION("Xtables: TCP option stripping"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_TCPOPTSTRIP"); +MODULE_ALIAS("ip6t_TCPOPTSTRIP"); diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c new file mode 100644 index 00000000..5f054a0d --- /dev/null +++ b/net/netfilter/xt_TEE.c @@ -0,0 +1,310 @@ +/* + * "TEE" target extension for Xtables + * Copyright © Sebastian Claßen, 2007 + * Jan Engelhardt, 2007-2010 + * + * based on ipt_ROUTE.c from Cédric de Launois + * + * + * This program is free 
software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 or later, as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +# define WITH_CONNTRACK 1 +# include +#endif +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +# define WITH_IPV6 1 +#endif + +struct xt_tee_priv { + struct notifier_block notifier; + struct xt_tee_tginfo *tginfo; + int oif; +}; + +static const union nf_inet_addr tee_zero_address; +static DEFINE_PER_CPU(bool, tee_active); + +static struct net *pick_net(struct sk_buff *skb) +{ +#ifdef CONFIG_NET_NS + const struct dst_entry *dst; + + if (skb->dev != NULL) + return dev_net(skb->dev); + dst = skb_dst(skb); + if (dst != NULL && dst->dev != NULL) + return dev_net(dst->dev); +#endif + return &init_net; +} + +static bool +tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info) +{ + const struct iphdr *iph = ip_hdr(skb); + struct net *net = pick_net(skb); + struct rtable *rt; + struct flowi4 fl4; + + memset(&fl4, 0, sizeof(fl4)); + if (info->priv) { + if (info->priv->oif == -1) + return false; + fl4.flowi4_oif = info->priv->oif; + } + fl4.daddr = info->gw.ip; + fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + return false; + + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + skb->dev = rt->dst.dev; + skb->protocol = htons(ETH_P_IP); + return true; +} + +static unsigned int +tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_tee_tginfo *info = par->targinfo; + struct iphdr *iph; + + if (percpu_read(tee_active)) + return XT_CONTINUE; + /* + * Copy the skb, and route the copy. Will later return %XT_CONTINUE for + * the original skb, which should continue on its way as if nothing has + * happened. The copy should be independently delivered to the TEE + * --gateway. + */ + skb = pskb_copy(skb, GFP_ATOMIC); + if (skb == NULL) + return XT_CONTINUE; + +#ifdef WITH_CONNTRACK + /* Avoid counting cloned packets towards the original connection. */ + nf_conntrack_put(skb->nfct); + skb->nfct = &nf_ct_untracked_get()->ct_general; + skb->nfctinfo = IP_CT_NEW; + nf_conntrack_get(skb->nfct); +#endif + /* + * If we are in PREROUTING/INPUT, the checksum must be recalculated + * since the length could have changed as a result of defragmentation. + * + * We also decrease the TTL to mitigate potential TEE loops + * between two hosts. + * + * Set %IP_DF so that the original source is notified of a potentially + * decreased MTU on the clone route. IPv6 does this too. 
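+ * + * The per-cpu tee_active flag, set around ip_local_out() below, makes the + * clone skip this target when it traverses the hooks again, so a cloned + * packet is never cloned a second time.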
+ */ + iph = ip_hdr(skb); + iph->frag_off |= htons(IP_DF); + if (par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_LOCAL_IN) + --iph->ttl; + ip_send_check(iph); + + if (tee_tg_route4(skb, info)) { + percpu_write(tee_active, true); + ip_local_out(skb); + percpu_write(tee_active, false); + } else { + kfree_skb(skb); + } + return XT_CONTINUE; +} + +#ifdef WITH_IPV6 +static bool +tee_tg_route6(struct sk_buff *skb, const struct xt_tee_tginfo *info) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct net *net = pick_net(skb); + struct dst_entry *dst; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + if (info->priv) { + if (info->priv->oif == -1) + return false; + fl6.flowi6_oif = info->priv->oif; + } + fl6.daddr = info->gw.in6; + fl6.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) | + (iph->flow_lbl[1] << 8) | iph->flow_lbl[2]; + dst = ip6_route_output(net, NULL, &fl6); + if (dst == NULL) + return false; + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + skb->dev = dst->dev; + skb->protocol = htons(ETH_P_IPV6); + return true; +} + +static unsigned int +tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_tee_tginfo *info = par->targinfo; + + if (percpu_read(tee_active)) + return XT_CONTINUE; + skb = pskb_copy(skb, GFP_ATOMIC); + if (skb == NULL) + return XT_CONTINUE; + +#ifdef WITH_CONNTRACK + nf_conntrack_put(skb->nfct); + skb->nfct = &nf_ct_untracked_get()->ct_general; + skb->nfctinfo = IP_CT_NEW; + nf_conntrack_get(skb->nfct); +#endif + if (par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_LOCAL_IN) { + struct ipv6hdr *iph = ipv6_hdr(skb); + --iph->hop_limit; + } + if (tee_tg_route6(skb, info)) { + percpu_write(tee_active, true); + ip6_local_out(skb); + percpu_write(tee_active, false); + } else { + kfree_skb(skb); + } + return XT_CONTINUE; +} +#endif /* WITH_IPV6 */ + +static int tee_netdev_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + struct xt_tee_priv *priv; + + priv = container_of(this, struct xt_tee_priv, notifier); + switch (event) { + case NETDEV_REGISTER: + if (!strcmp(dev->name, priv->tginfo->oif)) + priv->oif = dev->ifindex; + break; + case NETDEV_UNREGISTER: + if (dev->ifindex == priv->oif) + priv->oif = -1; + break; + case NETDEV_CHANGENAME: + if (!strcmp(dev->name, priv->tginfo->oif)) + priv->oif = dev->ifindex; + else if (dev->ifindex == priv->oif) + priv->oif = -1; + break; + } + + return NOTIFY_DONE; +} + +static int tee_tg_check(const struct xt_tgchk_param *par) +{ + struct xt_tee_tginfo *info = par->targinfo; + struct xt_tee_priv *priv; + + /* 0.0.0.0 and :: not allowed */ + if (memcmp(&info->gw, &tee_zero_address, + sizeof(tee_zero_address)) == 0) + return -EINVAL; + + if (info->oif[0]) { + if (info->oif[sizeof(info->oif)-1] != '\0') + return -EINVAL; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv == NULL) + return -ENOMEM; + + priv->tginfo = info; + priv->oif = -1; + priv->notifier.notifier_call = tee_netdev_event; + info->priv = priv; + + register_netdevice_notifier(&priv->notifier); + } else + info->priv = NULL; + + return 0; +} + +static void tee_tg_destroy(const struct xt_tgdtor_param *par) +{ + struct xt_tee_tginfo *info = par->targinfo; + + if (info->priv) { + unregister_netdevice_notifier(&info->priv->notifier); + kfree(info->priv); + } +} + +static struct xt_target tee_tg_reg[] __read_mostly = { + { + .name = "TEE", + .revision = 1, + .family = NFPROTO_IPV4, + .target = tee_tg4, + .targetsize = sizeof(struct xt_tee_tginfo), + 
.checkentry = tee_tg_check, + .destroy = tee_tg_destroy, + .me = THIS_MODULE, + }, +#ifdef WITH_IPV6 + { + .name = "TEE", + .revision = 1, + .family = NFPROTO_IPV6, + .target = tee_tg6, + .targetsize = sizeof(struct xt_tee_tginfo), + .checkentry = tee_tg_check, + .destroy = tee_tg_destroy, + .me = THIS_MODULE, + }, +#endif +}; + +static int __init tee_tg_init(void) +{ + return xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); +} + +static void __exit tee_tg_exit(void) +{ + xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); +} + +module_init(tee_tg_init); +module_exit(tee_tg_exit); +MODULE_AUTHOR("Sebastian Claßen "); +MODULE_AUTHOR("Jan Engelhardt "); +MODULE_DESCRIPTION("Xtables: Reroute packet copy"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_TEE"); +MODULE_ALIAS("ip6t_TEE"); diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c new file mode 100644 index 00000000..dcfd57eb --- /dev/null +++ b/net/netfilter/xt_TPROXY.c @@ -0,0 +1,432 @@ +/* + * Transparent proxy support for Linux/iptables + * + * Copyright (c) 2006-2010 BalaBit IT Ltd. + * Author: Balazs Scheidler, Krisztian Kovacs + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +#define XT_TPROXY_HAVE_IPV6 1 +#include +#include +#include +#include +#endif + +#include +#include + +static bool tproxy_sk_is_transparent(struct sock *sk) +{ + if (sk->sk_state != TCP_TIME_WAIT) { + if (inet_sk(sk)->transparent) + return true; + sock_put(sk); + } else { + if (inet_twsk(sk)->tw_transparent) + return true; + inet_twsk_put(inet_twsk(sk)); + } + return false; +} + +static inline __be32 +tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) +{ + struct in_device *indev; + __be32 laddr; + + if (user_laddr) + return user_laddr; + + laddr = 0; + rcu_read_lock(); + indev = __in_dev_get_rcu(skb->dev); + for_primary_ifa(indev) { + laddr = ifa->ifa_local; + break; + } endfor_ifa(indev); + rcu_read_unlock(); + + return laddr ? laddr : daddr; +} + +/** + * tproxy_handle_time_wait4() - handle IPv4 TCP TIME_WAIT reopen redirections + * @skb: The skb being processed. + * @laddr: IPv4 address to redirect to or zero. + * @lport: TCP port to redirect to or zero. + * @sk: The TIME_WAIT TCP socket found by the lookup. + * + * We have to handle SYN packets arriving to TIME_WAIT sockets + * differently: instead of reopening the connection we should rather + * redirect the new connection to the proxy if there's a listener + * socket present. + * + * tproxy_handle_time_wait4() consumes the socket reference passed in. + * + * Returns the listener socket if there's one, the TIME_WAIT socket if + * no such listener is found, or NULL if the TCP header is incomplete. 
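+ * + * For context, redirection only becomes effective once userspace delivers + * marked packets locally; assuming the standard iptables and iproute2 + * tools, a typical setup is: + * + * iptables -t mangle -A PREROUTING -p tcp --dport 80 \ + * -j TPROXY --on-port 3128 --tproxy-mark 0x1/0x1 + * ip rule add fwmark 0x1 lookup 100 + * ip route add local 0.0.0.0/0 dev lo table 100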
+ */ +static struct sock * +tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport, + struct sock *sk) +{ + const struct iphdr *iph = ip_hdr(skb); + struct tcphdr _hdr, *hp; + + hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr); + if (hp == NULL) { + inet_twsk_put(inet_twsk(sk)); + return NULL; + } + + if (hp->syn && !hp->rst && !hp->ack && !hp->fin) { + /* SYN to a TIME_WAIT socket, we'd rather redirect it + * to a listener socket if there's one */ + struct sock *sk2; + + sk2 = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, + iph->saddr, laddr ? laddr : iph->daddr, + hp->source, lport ? lport : hp->dest, + skb->dev, NFT_LOOKUP_LISTENER); + if (sk2) { + inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); + inet_twsk_put(inet_twsk(sk)); + sk = sk2; + } + } + + return sk; +} + +static unsigned int +tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport, + u_int32_t mark_mask, u_int32_t mark_value) +{ + const struct iphdr *iph = ip_hdr(skb); + struct udphdr _hdr, *hp; + struct sock *sk; + + hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr); + if (hp == NULL) + return NF_DROP; + + /* check if there's an ongoing connection on the packet + * addresses, this happens if the redirect already happened + * and the current packet belongs to an already established + * connection */ + sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, + iph->saddr, iph->daddr, + hp->source, hp->dest, + skb->dev, NFT_LOOKUP_ESTABLISHED); + + laddr = tproxy_laddr4(skb, laddr, iph->daddr); + if (!lport) + lport = hp->dest; + + /* UDP has no TCP_TIME_WAIT state, so we never enter here */ + if (sk && sk->sk_state == TCP_TIME_WAIT) + /* reopening a TIME_WAIT connection needs special handling */ + sk = tproxy_handle_time_wait4(skb, laddr, lport, sk); + else if (!sk) + /* no, there's no established connection, check if + * there's a listener on the redirected addr/port */ + sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, + iph->saddr, laddr, + hp->source, lport, + skb->dev, NFT_LOOKUP_LISTENER); + + /* NOTE: assign_sock consumes our sk reference */ + if (sk && tproxy_sk_is_transparent(sk)) { + /* This should be in a separate target, but we don't do multiple + targets on the same rule yet */ + skb->mark = (skb->mark & ~mark_mask) ^ mark_value; + + pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n", + iph->protocol, &iph->daddr, ntohs(hp->dest), + &laddr, ntohs(lport), skb->mark); + + nf_tproxy_assign_sock(skb, sk); + return NF_ACCEPT; + } + + pr_debug("no socket, dropping: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n", + iph->protocol, &iph->saddr, ntohs(hp->source), + &iph->daddr, ntohs(hp->dest), skb->mark); + return NF_DROP; +} + +static unsigned int +tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_tproxy_target_info *tgi = par->targinfo; + + return tproxy_tg4(skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value); +} + +static unsigned int +tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; + + return tproxy_tg4(skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value); +} + +#ifdef XT_TPROXY_HAVE_IPV6 + +static inline const struct in6_addr * +tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr, + const struct in6_addr *daddr) +{ + struct inet6_dev *indev; + struct inet6_ifaddr *ifa; + struct in6_addr *laddr; + + if (!ipv6_addr_any(user_laddr)) + return user_laddr; + 
laddr = NULL; + + rcu_read_lock(); + indev = __in6_dev_get(skb->dev); + if (indev) + list_for_each_entry(ifa, &indev->addr_list, if_list) { + if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED)) + continue; + + laddr = &ifa->addr; + break; + } + rcu_read_unlock(); + + return laddr ? laddr : daddr; +} + +/** + * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections + * @skb: The skb being processed. + * @tproto: Transport protocol. + * @thoff: Transport protocol header offset. + * @par: Iptables target parameters. + * @sk: The TIME_WAIT TCP socket found by the lookup. + * + * We have to handle SYN packets arriving to TIME_WAIT sockets + * differently: instead of reopening the connection we should rather + * redirect the new connection to the proxy if there's a listener + * socket present. + * + * tproxy_handle_time_wait6() consumes the socket reference passed in. + * + * Returns the listener socket if there's one, the TIME_WAIT socket if + * no such listener is found, or NULL if the TCP header is incomplete. + */ +static struct sock * +tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, + const struct xt_action_param *par, + struct sock *sk) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct tcphdr _hdr, *hp; + const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; + + hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr); + if (hp == NULL) { + inet_twsk_put(inet_twsk(sk)); + return NULL; + } + + if (hp->syn && !hp->rst && !hp->ack && !hp->fin) { + /* SYN to a TIME_WAIT socket, we'd rather redirect it + * to a listener socket if there's one */ + struct sock *sk2; + + sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + &iph->saddr, + tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr), + hp->source, + tgi->lport ? tgi->lport : hp->dest, + skb->dev, NFT_LOOKUP_LISTENER); + if (sk2) { + inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); + inet_twsk_put(inet_twsk(sk)); + sk = sk2; + } + } + + return sk; +} + +static unsigned int +tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; + struct udphdr _hdr, *hp; + struct sock *sk; + const struct in6_addr *laddr; + __be16 lport; + int thoff; + int tproto; + + tproto = ipv6_find_hdr(skb, &thoff, -1, NULL); + if (tproto < 0) { + pr_debug("unable to find transport header in IPv6 packet, dropping\n"); + return NF_DROP; + } + + hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr); + if (hp == NULL) { + pr_debug("unable to grab transport header contents in IPv6 packet, dropping\n"); + return NF_DROP; + } + + /* check if there's an ongoing connection on the packet + * addresses, this happens if the redirect already happened + * and the current packet belongs to an already established + * connection */ + sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + &iph->saddr, &iph->daddr, + hp->source, hp->dest, + par->in, NFT_LOOKUP_ESTABLISHED); + + laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr); + lport = tgi->lport ? 
tgi->lport : hp->dest; + + /* UDP has no TCP_TIME_WAIT state, so we never enter here */ + if (sk && sk->sk_state == TCP_TIME_WAIT) + /* reopening a TIME_WAIT connection needs special handling */ + sk = tproxy_handle_time_wait6(skb, tproto, thoff, par, sk); + else if (!sk) + /* no there's no established connection, check if + * there's a listener on the redirected addr/port */ + sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + &iph->saddr, laddr, + hp->source, lport, + par->in, NFT_LOOKUP_LISTENER); + + /* NOTE: assign_sock consumes our sk reference */ + if (sk && tproxy_sk_is_transparent(sk)) { + /* This should be in a separate target, but we don't do multiple + targets on the same rule yet */ + skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value; + + pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n", + tproto, &iph->saddr, ntohs(hp->source), + laddr, ntohs(lport), skb->mark); + + nf_tproxy_assign_sock(skb, sk); + return NF_ACCEPT; + } + + pr_debug("no socket, dropping: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n", + tproto, &iph->saddr, ntohs(hp->source), + &iph->daddr, ntohs(hp->dest), skb->mark); + + return NF_DROP; +} + +static int tproxy_tg6_check(const struct xt_tgchk_param *par) +{ + const struct ip6t_ip6 *i = par->entryinfo; + + if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) + && !(i->flags & IP6T_INV_PROTO)) + return 0; + + pr_info("Can be used only in combination with " + "either -p tcp or -p udp\n"); + return -EINVAL; +} +#endif + +static int tproxy_tg4_check(const struct xt_tgchk_param *par) +{ + const struct ipt_ip *i = par->entryinfo; + + if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) + && !(i->invflags & IPT_INV_PROTO)) + return 0; + + pr_info("Can be used only in combination with " + "either -p tcp or -p udp\n"); + return -EINVAL; +} + +static struct xt_target tproxy_tg_reg[] __read_mostly = { + { + .name = "TPROXY", + .family = NFPROTO_IPV4, + .table = "mangle", + .target = tproxy_tg4_v0, + .revision = 0, + .targetsize = sizeof(struct xt_tproxy_target_info), + .checkentry = tproxy_tg4_check, + .hooks = 1 << NF_INET_PRE_ROUTING, + .me = THIS_MODULE, + }, + { + .name = "TPROXY", + .family = NFPROTO_IPV4, + .table = "mangle", + .target = tproxy_tg4_v1, + .revision = 1, + .targetsize = sizeof(struct xt_tproxy_target_info_v1), + .checkentry = tproxy_tg4_check, + .hooks = 1 << NF_INET_PRE_ROUTING, + .me = THIS_MODULE, + }, +#ifdef XT_TPROXY_HAVE_IPV6 + { + .name = "TPROXY", + .family = NFPROTO_IPV6, + .table = "mangle", + .target = tproxy_tg6_v1, + .revision = 1, + .targetsize = sizeof(struct xt_tproxy_target_info_v1), + .checkentry = tproxy_tg6_check, + .hooks = 1 << NF_INET_PRE_ROUTING, + .me = THIS_MODULE, + }, +#endif + +}; + +static int __init tproxy_tg_init(void) +{ + nf_defrag_ipv4_enable(); +#ifdef XT_TPROXY_HAVE_IPV6 + nf_defrag_ipv6_enable(); +#endif + + return xt_register_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg)); +} + +static void __exit tproxy_tg_exit(void) +{ + xt_unregister_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg)); +} + +module_init(tproxy_tg_init); +module_exit(tproxy_tg_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs"); +MODULE_DESCRIPTION("Netfilter transparent proxy (TPROXY) target module."); +MODULE_ALIAS("ipt_TPROXY"); +MODULE_ALIAS("ip6t_TPROXY"); diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c new file mode 100644 index 00000000..df48967a --- /dev/null +++ b/net/netfilter/xt_TRACE.c @@ -0,0 +1,40 @@ +/* This is a module which is used to 
mark packets for tracing. + */ +#include +#include + +#include + +MODULE_DESCRIPTION("Xtables: packet flow tracing"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_TRACE"); +MODULE_ALIAS("ip6t_TRACE"); + +static unsigned int +trace_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + skb->nf_trace = 1; + return XT_CONTINUE; +} + +static struct xt_target trace_tg_reg __read_mostly = { + .name = "TRACE", + .revision = 0, + .family = NFPROTO_UNSPEC, + .table = "raw", + .target = trace_tg, + .me = THIS_MODULE, +}; + +static int __init trace_tg_init(void) +{ + return xt_register_target(&trace_tg_reg); +} + +static void __exit trace_tg_exit(void) +{ + xt_unregister_target(&trace_tg_reg); +} + +module_init(trace_tg_init); +module_exit(trace_tg_exit); diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c new file mode 100644 index 00000000..b77d383c --- /dev/null +++ b/net/netfilter/xt_addrtype.c @@ -0,0 +1,243 @@ +/* + * iptables module to match inet_addr_type() of an ip. + * + * Copyright (c) 2004 Patrick McHardy + * (C) 2007 Laszlo Attila Toth + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +#include +#include +#include +#endif + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy "); +MODULE_DESCRIPTION("Xtables: address type match"); +MODULE_ALIAS("ipt_addrtype"); +MODULE_ALIAS("ip6t_addrtype"); + +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, + const struct in6_addr *addr) +{ + const struct nf_afinfo *afinfo; + struct flowi6 flow; + struct rt6_info *rt; + u32 ret; + int route_err; + + memset(&flow, 0, sizeof(flow)); + ipv6_addr_copy(&flow.daddr, addr); + if (dev) + flow.flowi6_oif = dev->ifindex; + + rcu_read_lock(); + + afinfo = nf_get_afinfo(NFPROTO_IPV6); + if (afinfo != NULL) + route_err = afinfo->route(net, (struct dst_entry **)&rt, + flowi6_to_flowi(&flow), !!dev); + else + route_err = 1; + + rcu_read_unlock(); + + if (route_err) + return XT_ADDRTYPE_UNREACHABLE; + + if (rt->rt6i_flags & RTF_REJECT) + ret = XT_ADDRTYPE_UNREACHABLE; + else + ret = 0; + + if (rt->rt6i_flags & RTF_LOCAL) + ret |= XT_ADDRTYPE_LOCAL; + if (rt->rt6i_flags & RTF_ANYCAST) + ret |= XT_ADDRTYPE_ANYCAST; + + + dst_release(&rt->dst); + return ret; +} + +static bool match_type6(struct net *net, const struct net_device *dev, + const struct in6_addr *addr, u16 mask) +{ + int addr_type = ipv6_addr_type(addr); + + if ((mask & XT_ADDRTYPE_MULTICAST) && + !(addr_type & IPV6_ADDR_MULTICAST)) + return false; + if ((mask & XT_ADDRTYPE_UNICAST) && !(addr_type & IPV6_ADDR_UNICAST)) + return false; + if ((mask & XT_ADDRTYPE_UNSPEC) && addr_type != IPV6_ADDR_ANY) + return false; + + if ((XT_ADDRTYPE_LOCAL | XT_ADDRTYPE_ANYCAST | + XT_ADDRTYPE_UNREACHABLE) & mask) + return !!(mask & match_lookup_rt6(net, dev, addr)); + return true; +} + +static bool +addrtype_mt6(struct net *net, const struct net_device *dev, + const struct sk_buff *skb, const struct xt_addrtype_info_v1 *info) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + bool ret = true; + + if (info->source) + ret &= match_type6(net, dev, &iph->saddr, info->source) ^ + (info->flags & 
XT_ADDRTYPE_INVERT_SOURCE); + if (ret && info->dest) + ret &= match_type6(net, dev, &iph->daddr, info->dest) ^ + !!(info->flags & XT_ADDRTYPE_INVERT_DEST); + return ret; +} +#endif + +static inline bool match_type(struct net *net, const struct net_device *dev, + __be32 addr, u_int16_t mask) +{ + return !!(mask & (1 << inet_dev_addr_type(net, dev, addr))); +} + +static bool +addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct net *net = dev_net(par->in ? par->in : par->out); + const struct xt_addrtype_info *info = par->matchinfo; + const struct iphdr *iph = ip_hdr(skb); + bool ret = true; + + if (info->source) + ret &= match_type(net, NULL, iph->saddr, info->source) ^ + info->invert_source; + if (info->dest) + ret &= match_type(net, NULL, iph->daddr, info->dest) ^ + info->invert_dest; + + return ret; +} + +static bool +addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct net *net = dev_net(par->in ? par->in : par->out); + const struct xt_addrtype_info_v1 *info = par->matchinfo; + const struct iphdr *iph; + const struct net_device *dev = NULL; + bool ret = true; + + if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN) + dev = par->in; + else if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) + dev = par->out; + +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) + if (par->family == NFPROTO_IPV6) + return addrtype_mt6(net, dev, skb, info); +#endif + iph = ip_hdr(skb); + if (info->source) + ret &= match_type(net, dev, iph->saddr, info->source) ^ + (info->flags & XT_ADDRTYPE_INVERT_SOURCE); + if (ret && info->dest) + ret &= match_type(net, dev, iph->daddr, info->dest) ^ + !!(info->flags & XT_ADDRTYPE_INVERT_DEST); + return ret; +} + +static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) +{ + struct xt_addrtype_info_v1 *info = par->matchinfo; + + if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN && + info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) { + pr_info("both incoming and outgoing " + "interface limitation cannot be selected\n"); + return -EINVAL; + } + + if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN)) && + info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) { + pr_info("output interface limitation " + "not valid in PREROUTING and INPUT\n"); + return -EINVAL; + } + + if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT)) && + info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN) { + pr_info("input interface limitation " + "not valid in POSTROUTING and OUTPUT\n"); + return -EINVAL; + } + +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) + if (par->family == NFPROTO_IPV6) { + if ((info->source | info->dest) & XT_ADDRTYPE_BLACKHOLE) { + pr_err("ipv6 BLACKHOLE matching not supported\n"); + return -EINVAL; + } + if ((info->source | info->dest) >= XT_ADDRTYPE_PROHIBIT) { + pr_err("ipv6 PROHIBT (THROW, NAT ..) 
matching not supported\n"); + return -EINVAL; + } + if ((info->source | info->dest) & XT_ADDRTYPE_BROADCAST) { + pr_err("ipv6 does not support BROADCAST matching\n"); + return -EINVAL; + } + } +#endif + return 0; +} + +static struct xt_match addrtype_mt_reg[] __read_mostly = { + { + .name = "addrtype", + .family = NFPROTO_IPV4, + .match = addrtype_mt_v0, + .matchsize = sizeof(struct xt_addrtype_info), + .me = THIS_MODULE + }, + { + .name = "addrtype", + .family = NFPROTO_UNSPEC, + .revision = 1, + .match = addrtype_mt_v1, + .checkentry = addrtype_mt_checkentry_v1, + .matchsize = sizeof(struct xt_addrtype_info_v1), + .me = THIS_MODULE + } +}; + +static int __init addrtype_mt_init(void) +{ + return xt_register_matches(addrtype_mt_reg, + ARRAY_SIZE(addrtype_mt_reg)); +} + +static void __exit addrtype_mt_exit(void) +{ + xt_unregister_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg)); +} + +module_init(addrtype_mt_init); +module_exit(addrtype_mt_exit); diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c new file mode 100644 index 00000000..f4af1bfa --- /dev/null +++ b/net/netfilter/xt_cluster.c @@ -0,0 +1,178 @@ +/* + * (C) 2008-2009 Pablo Neira Ayuso + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include + +#include +#include +#include + +static inline u32 nf_ct_orig_ipv4_src(const struct nf_conn *ct) +{ + return (__force u32)ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip; +} + +static inline const u32 *nf_ct_orig_ipv6_src(const struct nf_conn *ct) +{ + return (__force u32 *)ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip6; +} + +static inline u_int32_t +xt_cluster_hash_ipv4(u_int32_t ip, const struct xt_cluster_match_info *info) +{ + return jhash_1word(ip, info->hash_seed); +} + +static inline u_int32_t +xt_cluster_hash_ipv6(const void *ip, const struct xt_cluster_match_info *info) +{ + return jhash2(ip, NF_CT_TUPLE_L3SIZE / sizeof(__u32), info->hash_seed); +} + +static inline u_int32_t +xt_cluster_hash(const struct nf_conn *ct, + const struct xt_cluster_match_info *info) +{ + u_int32_t hash = 0; + + switch(nf_ct_l3num(ct)) { + case AF_INET: + hash = xt_cluster_hash_ipv4(nf_ct_orig_ipv4_src(ct), info); + break; + case AF_INET6: + hash = xt_cluster_hash_ipv6(nf_ct_orig_ipv6_src(ct), info); + break; + default: + WARN_ON(1); + break; + } + return (((u64)hash * info->total_nodes) >> 32); +} + +static inline bool +xt_cluster_ipv6_is_multicast(const struct in6_addr *addr) +{ + __be32 st = addr->s6_addr32[0]; + return ((st & htonl(0xFF000000)) == htonl(0xFF000000)); +} + +static inline bool +xt_cluster_is_multicast_addr(const struct sk_buff *skb, u_int8_t family) +{ + bool is_multicast = false; + + switch(family) { + case NFPROTO_IPV4: + is_multicast = ipv4_is_multicast(ip_hdr(skb)->daddr); + break; + case NFPROTO_IPV6: + is_multicast = + xt_cluster_ipv6_is_multicast(&ipv6_hdr(skb)->daddr); + break; + default: + WARN_ON(1); + break; + } + return is_multicast; +} + +static bool +xt_cluster_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct sk_buff *pskb = (struct sk_buff *)skb; + const struct xt_cluster_match_info *info = par->matchinfo; + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned long hash; + + /* This match assumes that all nodes see the same packets. 
This can be + * achieved if the switch that connects the cluster nodes support some + * sort of 'port mirroring'. However, if your switch does not support + * this, your cluster nodes can reply ARP request using a multicast MAC + * address. Thus, your switch will flood the same packets to the + * cluster nodes with the same multicast MAC address. Using a multicast + * link address is a RFC 1812 (section 3.3.2) violation, but this works + * fine in practise. + * + * Unfortunately, if you use the multicast MAC address, the link layer + * sets skbuff's pkt_type to PACKET_MULTICAST, which is not accepted + * by TCP and others for packets coming to this node. For that reason, + * this match mangles skbuff's pkt_type if it detects a packet + * addressed to a unicast address but using PACKET_MULTICAST. Yes, I + * know, matches should not alter packets, but we are doing this here + * because we would need to add a PKTTYPE target for this sole purpose. + */ + if (!xt_cluster_is_multicast_addr(skb, par->family) && + skb->pkt_type == PACKET_MULTICAST) { + pskb->pkt_type = PACKET_HOST; + } + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return false; + + if (nf_ct_is_untracked(ct)) + return false; + + if (ct->master) + hash = xt_cluster_hash(ct->master, info); + else + hash = xt_cluster_hash(ct, info); + + return !!((1 << hash) & info->node_mask) ^ + !!(info->flags & XT_CLUSTER_F_INV); +} + +static int xt_cluster_mt_checkentry(const struct xt_mtchk_param *par) +{ + struct xt_cluster_match_info *info = par->matchinfo; + + if (info->total_nodes > XT_CLUSTER_NODES_MAX) { + pr_info("you have exceeded the maximum " + "number of cluster nodes (%u > %u)\n", + info->total_nodes, XT_CLUSTER_NODES_MAX); + return -EINVAL; + } + if (info->node_mask >= (1ULL << info->total_nodes)) { + pr_info("this node mask cannot be " + "higher than the total number of nodes\n"); + return -EDOM; + } + return 0; +} + +static struct xt_match xt_cluster_match __read_mostly = { + .name = "cluster", + .family = NFPROTO_UNSPEC, + .match = xt_cluster_mt, + .checkentry = xt_cluster_mt_checkentry, + .matchsize = sizeof(struct xt_cluster_match_info), + .me = THIS_MODULE, +}; + +static int __init xt_cluster_mt_init(void) +{ + return xt_register_match(&xt_cluster_match); +} + +static void __exit xt_cluster_mt_fini(void) +{ + xt_unregister_match(&xt_cluster_match); +} + +MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: hash-based cluster match"); +MODULE_ALIAS("ipt_cluster"); +MODULE_ALIAS("ip6t_cluster"); +module_init(xt_cluster_mt_init); +module_exit(xt_cluster_mt_fini); diff --git a/net/netfilter/xt_comment.c b/net/netfilter/xt_comment.c new file mode 100644 index 00000000..5c861d2f --- /dev/null +++ b/net/netfilter/xt_comment.c @@ -0,0 +1,45 @@ +/* + * Implements a dummy match to allow attaching comments to rules + * + * 2003-05-13 Brad Fisher (brad@info-link.net) + */ + +#include +#include +#include +#include + +MODULE_AUTHOR("Brad Fisher "); +MODULE_DESCRIPTION("Xtables: No-op match which can be tagged with a comment"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_comment"); +MODULE_ALIAS("ip6t_comment"); + +static bool +comment_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + /* We always match */ + return true; +} + +static struct xt_match comment_mt_reg __read_mostly = { + .name = "comment", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = comment_mt, + .matchsize = sizeof(struct xt_comment_info), + .me = THIS_MODULE, +}; + +static int __init comment_mt_init(void) 
+{ + return xt_register_match(&comment_mt_reg); +} + +static void __exit comment_mt_exit(void) +{ + xt_unregister_match(&comment_mt_reg); +} + +module_init(comment_mt_init); +module_exit(comment_mt_exit); diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c new file mode 100644 index 00000000..5b138506 --- /dev/null +++ b/net/netfilter/xt_connbytes.c @@ -0,0 +1,155 @@ +/* Kernel module to match connection tracking byte counter. + * GPL (C) 2002 Martin Devera (devik@cdi.cz). + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("Xtables: Number of packets/bytes per connection matching"); +MODULE_ALIAS("ipt_connbytes"); +MODULE_ALIAS("ip6t_connbytes"); + +static bool +connbytes_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_connbytes_info *sinfo = par->matchinfo; + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + u_int64_t what = 0; /* initialize to make gcc happy */ + u_int64_t bytes = 0; + u_int64_t pkts = 0; + const struct nf_conn_counter *counters; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return false; + + counters = nf_conn_acct_find(ct); + if (!counters) + return false; + + switch (sinfo->what) { + case XT_CONNBYTES_PKTS: + switch (sinfo->direction) { + case XT_CONNBYTES_DIR_ORIGINAL: + what = counters[IP_CT_DIR_ORIGINAL].packets; + break; + case XT_CONNBYTES_DIR_REPLY: + what = counters[IP_CT_DIR_REPLY].packets; + break; + case XT_CONNBYTES_DIR_BOTH: + what = counters[IP_CT_DIR_ORIGINAL].packets; + what += counters[IP_CT_DIR_REPLY].packets; + break; + } + break; + case XT_CONNBYTES_BYTES: + switch (sinfo->direction) { + case XT_CONNBYTES_DIR_ORIGINAL: + what = counters[IP_CT_DIR_ORIGINAL].bytes; + break; + case XT_CONNBYTES_DIR_REPLY: + what = counters[IP_CT_DIR_REPLY].bytes; + break; + case XT_CONNBYTES_DIR_BOTH: + what = counters[IP_CT_DIR_ORIGINAL].bytes; + what += counters[IP_CT_DIR_REPLY].bytes; + break; + } + break; + case XT_CONNBYTES_AVGPKT: + switch (sinfo->direction) { + case XT_CONNBYTES_DIR_ORIGINAL: + bytes = counters[IP_CT_DIR_ORIGINAL].bytes; + pkts = counters[IP_CT_DIR_ORIGINAL].packets; + break; + case XT_CONNBYTES_DIR_REPLY: + bytes = counters[IP_CT_DIR_REPLY].bytes; + pkts = counters[IP_CT_DIR_REPLY].packets; + break; + case XT_CONNBYTES_DIR_BOTH: + bytes = counters[IP_CT_DIR_ORIGINAL].bytes + + counters[IP_CT_DIR_REPLY].bytes; + pkts = counters[IP_CT_DIR_ORIGINAL].packets + + counters[IP_CT_DIR_REPLY].packets; + break; + } + if (pkts != 0) + what = div64_u64(bytes, pkts); + break; + } + + if (sinfo->count.to) + return what <= sinfo->count.to && what >= sinfo->count.from; + else + return what >= sinfo->count.from; +} + +static int connbytes_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_connbytes_info *sinfo = par->matchinfo; + int ret; + + if (sinfo->what != XT_CONNBYTES_PKTS && + sinfo->what != XT_CONNBYTES_BYTES && + sinfo->what != XT_CONNBYTES_AVGPKT) + return -EINVAL; + + if (sinfo->direction != XT_CONNBYTES_DIR_ORIGINAL && + sinfo->direction != XT_CONNBYTES_DIR_REPLY && + sinfo->direction != XT_CONNBYTES_DIR_BOTH) + return -EINVAL; + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + + /* + * This filter cannot function correctly unless connection tracking + * accounting is enabled, so complain in the hope that someone notices. 
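+ * The usual knobs for that are the nf_conntrack "acct" module parameter + * and the net.netfilter.nf_conntrack_acct sysctl; the check below goes one + * step further and simply switches accounting on.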
+ */ + if (!nf_ct_acct_enabled(par->net)) { + pr_warning("Forcing CT accounting to be enabled\n"); + nf_ct_set_acct(par->net, true); + } + + return ret; +} + +static void connbytes_mt_destroy(const struct xt_mtdtor_param *par) +{ + nf_ct_l3proto_module_put(par->family); +} + +static struct xt_match connbytes_mt_reg __read_mostly = { + .name = "connbytes", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = connbytes_mt_check, + .match = connbytes_mt, + .destroy = connbytes_mt_destroy, + .matchsize = sizeof(struct xt_connbytes_info), + .me = THIS_MODULE, +}; + +static int __init connbytes_mt_init(void) +{ + return xt_register_match(&connbytes_mt_reg); +} + +static void __exit connbytes_mt_exit(void) +{ + xt_unregister_match(&connbytes_mt_reg); +} + +module_init(connbytes_mt_init); +module_exit(connbytes_mt_exit); diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c new file mode 100644 index 00000000..c6d5a834 --- /dev/null +++ b/net/netfilter/xt_connlimit.c @@ -0,0 +1,317 @@ +/* + * netfilter module to limit the number of parallel tcp + * connections per IP address. + * (c) 2000 Gerd Knorr + * Nov 2002: Martin Bene : + * only ignore TIME_WAIT or gone connections + * (C) CC Computer Consultants GmbH, 2007 + * + * based on ... + * + * Kernel module to match connection tracking information. + * GPL (C) 1999 Rusty Russell (rusty@rustcorp.com.au). + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* we will save the tuples of all connections we care about */ +struct xt_connlimit_conn { + struct hlist_node node; + struct nf_conntrack_tuple tuple; + union nf_inet_addr addr; +}; + +struct xt_connlimit_data { + struct hlist_head iphash[256]; + spinlock_t lock; +}; + +static u_int32_t connlimit_rnd __read_mostly; + +static inline unsigned int connlimit_iphash(__be32 addr) +{ + return jhash_1word((__force __u32)addr, connlimit_rnd) & 0xFF; +} + +static inline unsigned int +connlimit_iphash6(const union nf_inet_addr *addr, + const union nf_inet_addr *mask) +{ + union nf_inet_addr res; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i) + res.ip6[i] = addr->ip6[i] & mask->ip6[i]; + + return jhash2((u32 *)res.ip6, ARRAY_SIZE(res.ip6), connlimit_rnd) & 0xFF; +} + +static inline bool already_closed(const struct nf_conn *conn) +{ + if (nf_ct_protonum(conn) == IPPROTO_TCP) + return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT || + conn->proto.tcp.state == TCP_CONNTRACK_CLOSE; + else + return 0; +} + +static inline unsigned int +same_source_net(const union nf_inet_addr *addr, + const union nf_inet_addr *mask, + const union nf_inet_addr *u3, u_int8_t family) +{ + if (family == NFPROTO_IPV4) { + return (addr->ip & mask->ip) == (u3->ip & mask->ip); + } else { + union nf_inet_addr lh, rh; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i) { + lh.ip6[i] = addr->ip6[i] & mask->ip6[i]; + rh.ip6[i] = u3->ip6[i] & mask->ip6[i]; + } + + return memcmp(&lh.ip6, &rh.ip6, sizeof(lh.ip6)) == 0; + } +} + +static int count_them(struct net *net, + struct xt_connlimit_data *data, + const struct nf_conntrack_tuple *tuple, + const union nf_inet_addr *addr, + const union nf_inet_addr *mask, + u_int8_t family) +{ + const struct nf_conntrack_tuple_hash *found; + struct xt_connlimit_conn *conn; + struct hlist_node *pos, *n; + struct nf_conn *found_ct; + struct hlist_head *hash; + bool addit = 
true; + int matches = 0; + + if (family == NFPROTO_IPV6) + hash = &data->iphash[connlimit_iphash6(addr, mask)]; + else + hash = &data->iphash[connlimit_iphash(addr->ip & mask->ip)]; + + rcu_read_lock(); + + /* check the saved connections */ + hlist_for_each_entry_safe(conn, pos, n, hash, node) { + found = nf_conntrack_find_get(net, NF_CT_DEFAULT_ZONE, + &conn->tuple); + found_ct = NULL; + + if (found != NULL) + found_ct = nf_ct_tuplehash_to_ctrack(found); + + if (found_ct != NULL && + nf_ct_tuple_equal(&conn->tuple, tuple) && + !already_closed(found_ct)) + /* + * Just to be sure we have it only once in the list. + * We should not see tuples twice unless someone hooks + * this into a table without "-p tcp --syn". + */ + addit = false; + + if (found == NULL) { + /* this one is gone */ + hlist_del(&conn->node); + kfree(conn); + continue; + } + + if (already_closed(found_ct)) { + /* + * we do not care about connections which are + * closed already -> ditch it + */ + nf_ct_put(found_ct); + hlist_del(&conn->node); + kfree(conn); + continue; + } + + if (same_source_net(addr, mask, &conn->addr, family)) + /* same source network -> be counted! */ + ++matches; + nf_ct_put(found_ct); + } + + rcu_read_unlock(); + + if (addit) { + /* save the new connection in our list */ + conn = kmalloc(sizeof(*conn), GFP_ATOMIC); + if (conn == NULL) + return -ENOMEM; + conn->tuple = *tuple; + conn->addr = *addr; + hlist_add_head(&conn->node, hash); + ++matches; + } + + return matches; +} + +static bool +connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct net *net = dev_net(par->in ? par->in : par->out); + const struct xt_connlimit_info *info = par->matchinfo; + union nf_inet_addr addr; + struct nf_conntrack_tuple tuple; + const struct nf_conntrack_tuple *tuple_ptr = &tuple; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + int connections; + + ct = nf_ct_get(skb, &ctinfo); + if (ct != NULL) + tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), + par->family, &tuple)) + goto hotdrop; + + if (par->family == NFPROTO_IPV6) { + const struct ipv6hdr *iph = ipv6_hdr(skb); + memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ? + &iph->daddr : &iph->saddr, sizeof(addr.ip6)); + } else { + const struct iphdr *iph = ip_hdr(skb); + addr.ip = (info->flags & XT_CONNLIMIT_DADDR) ? 
+ iph->daddr : iph->saddr; + } + + spin_lock_bh(&info->data->lock); + connections = count_them(net, info->data, tuple_ptr, &addr, + &info->mask, par->family); + spin_unlock_bh(&info->data->lock); + + if (connections < 0) + /* kmalloc failed, drop it entirely */ + goto hotdrop; + + return (connections > info->limit) ^ + !!(info->flags & XT_CONNLIMIT_INVERT); + + hotdrop: + par->hotdrop = true; + return false; +} + +static int connlimit_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_connlimit_info *info = par->matchinfo; + unsigned int i; + int ret; + + if (unlikely(!connlimit_rnd)) { + u_int32_t rand; + + do { + get_random_bytes(&rand, sizeof(rand)); + } while (!rand); + cmpxchg(&connlimit_rnd, 0, rand); + } + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) { + pr_info("cannot load conntrack support for " + "address family %u\n", par->family); + return ret; + } + + /* init private data */ + info->data = kmalloc(sizeof(struct xt_connlimit_data), GFP_KERNEL); + if (info->data == NULL) { + nf_ct_l3proto_module_put(par->family); + return -ENOMEM; + } + + spin_lock_init(&info->data->lock); + for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) + INIT_HLIST_HEAD(&info->data->iphash[i]); + + return 0; +} + +static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_connlimit_info *info = par->matchinfo; + struct xt_connlimit_conn *conn; + struct hlist_node *pos, *n; + struct hlist_head *hash = info->data->iphash; + unsigned int i; + + nf_ct_l3proto_module_put(par->family); + + for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) { + hlist_for_each_entry_safe(conn, pos, n, &hash[i], node) { + hlist_del(&conn->node); + kfree(conn); + } + } + + kfree(info->data); +} + +static struct xt_match connlimit_mt_reg[] __read_mostly = { + { + .name = "connlimit", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = connlimit_mt_check, + .match = connlimit_mt, + .matchsize = sizeof(struct xt_connlimit_info), + .destroy = connlimit_mt_destroy, + .me = THIS_MODULE, + }, + { + .name = "connlimit", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = connlimit_mt_check, + .match = connlimit_mt, + .matchsize = sizeof(struct xt_connlimit_info), + .destroy = connlimit_mt_destroy, + .me = THIS_MODULE, + }, +}; + +static int __init connlimit_mt_init(void) +{ + return xt_register_matches(connlimit_mt_reg, + ARRAY_SIZE(connlimit_mt_reg)); +} + +static void __exit connlimit_mt_exit(void) +{ + xt_unregister_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg)); +} + +module_init(connlimit_mt_init); +module_exit(connlimit_mt_exit); +MODULE_AUTHOR("Jan Engelhardt "); +MODULE_DESCRIPTION("Xtables: Number of connections matching"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_connlimit"); +MODULE_ALIAS("ip6t_connlimit"); diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c new file mode 100644 index 00000000..7278145e --- /dev/null +++ b/net/netfilter/xt_connmark.c @@ -0,0 +1,167 @@ +/* + * xt_connmark - Netfilter module to operate on connection marks + * + * Copyright (C) 2002,2004 MARA Systems AB + * by Henrik Nordstrom + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 + * Jan Engelhardt + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Henrik Nordstrom "); +MODULE_DESCRIPTION("Xtables: connection mark operations"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_CONNMARK"); +MODULE_ALIAS("ip6t_CONNMARK"); +MODULE_ALIAS("ipt_connmark"); +MODULE_ALIAS("ip6t_connmark"); + +static unsigned int +connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_connmark_tginfo1 *info = par->targinfo; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + u_int32_t newmark; + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return XT_CONTINUE; + + switch (info->mode) { + case XT_CONNMARK_SET: + newmark = (ct->mark & ~info->ctmask) ^ info->ctmark; + if (ct->mark != newmark) { + ct->mark = newmark; + nf_conntrack_event_cache(IPCT_MARK, ct); + } + break; + case XT_CONNMARK_SAVE: + newmark = (ct->mark & ~info->ctmask) ^ + (skb->mark & info->nfmask); + if (ct->mark != newmark) { + ct->mark = newmark; + nf_conntrack_event_cache(IPCT_MARK, ct); + } + break; + case XT_CONNMARK_RESTORE: + newmark = (skb->mark & ~info->nfmask) ^ + (ct->mark & info->ctmask); + skb->mark = newmark; + break; + } + + return XT_CONTINUE; +} + +static int connmark_tg_check(const struct xt_tgchk_param *par) +{ + int ret; + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + return ret; +} + +static void connmark_tg_destroy(const struct xt_tgdtor_param *par) +{ + nf_ct_l3proto_module_put(par->family); +} + +static bool +connmark_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_connmark_mtinfo1 *info = par->matchinfo; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return false; + + return ((ct->mark & info->mask) == info->mark) ^ info->invert; +} + +static int connmark_mt_check(const struct xt_mtchk_param *par) +{ + int ret; + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + return ret; +} + +static void connmark_mt_destroy(const struct xt_mtdtor_param *par) +{ + nf_ct_l3proto_module_put(par->family); +} + +static struct xt_target connmark_tg_reg __read_mostly = { + .name = "CONNMARK", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = connmark_tg_check, + .target = connmark_tg, + .targetsize = sizeof(struct xt_connmark_tginfo1), + .destroy = connmark_tg_destroy, + .me = THIS_MODULE, +}; + +static struct xt_match connmark_mt_reg __read_mostly = { + .name = "connmark", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = connmark_mt_check, + .match = connmark_mt, + .matchsize = sizeof(struct xt_connmark_mtinfo1), + .destroy = connmark_mt_destroy, + .me = THIS_MODULE, +}; + +static int __init connmark_mt_init(void) +{ + int ret; + + ret = xt_register_target(&connmark_tg_reg); + if (ret < 0) + return ret; + ret = xt_register_match(&connmark_mt_reg); + if (ret < 0) { + xt_unregister_target(&connmark_tg_reg); 
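+		/* Match registration failed: the CONNMARK target registered
+		 * above has just been unregistered again, so the module does
+		 * not stay half-initialized. */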
+ return ret; + } + return 0; +} + +static void __exit connmark_mt_exit(void) +{ + xt_unregister_match(&connmark_mt_reg); + xt_unregister_target(&connmark_tg_reg); +} + +module_init(connmark_mt_init); +module_exit(connmark_mt_exit); diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c new file mode 100644 index 00000000..61805d7b --- /dev/null +++ b/net/netfilter/xt_conntrack.c @@ -0,0 +1,332 @@ +/* + * xt_conntrack - Netfilter module to match connection tracking + * information. (Superset of Rusty's minimalistic state match.) + * + * (C) 2001 Marc Boucher (marc@mbsi.ca). + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marc Boucher "); +MODULE_AUTHOR("Jan Engelhardt "); +MODULE_DESCRIPTION("Xtables: connection tracking state match"); +MODULE_ALIAS("ipt_conntrack"); +MODULE_ALIAS("ip6t_conntrack"); + +static bool +conntrack_addrcmp(const union nf_inet_addr *kaddr, + const union nf_inet_addr *uaddr, + const union nf_inet_addr *umask, unsigned int l3proto) +{ + if (l3proto == NFPROTO_IPV4) + return ((kaddr->ip ^ uaddr->ip) & umask->ip) == 0; + else if (l3proto == NFPROTO_IPV6) + return ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6, + &uaddr->in6) == 0; + else + return false; +} + +static inline bool +conntrack_mt_origsrc(const struct nf_conn *ct, + const struct xt_conntrack_mtinfo2 *info, + u_int8_t family) +{ + return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3, + &info->origsrc_addr, &info->origsrc_mask, family); +} + +static inline bool +conntrack_mt_origdst(const struct nf_conn *ct, + const struct xt_conntrack_mtinfo2 *info, + u_int8_t family) +{ + return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3, + &info->origdst_addr, &info->origdst_mask, family); +} + +static inline bool +conntrack_mt_replsrc(const struct nf_conn *ct, + const struct xt_conntrack_mtinfo2 *info, + u_int8_t family) +{ + return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3, + &info->replsrc_addr, &info->replsrc_mask, family); +} + +static inline bool +conntrack_mt_repldst(const struct nf_conn *ct, + const struct xt_conntrack_mtinfo2 *info, + u_int8_t family) +{ + return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3, + &info->repldst_addr, &info->repldst_mask, family); +} + +static inline bool +ct_proto_port_check(const struct xt_conntrack_mtinfo2 *info, + const struct nf_conn *ct) +{ + const struct nf_conntrack_tuple *tuple; + + tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + if ((info->match_flags & XT_CONNTRACK_PROTO) && + (nf_ct_protonum(ct) == info->l4proto) ^ + !(info->invert_flags & XT_CONNTRACK_PROTO)) + return false; + + /* Shortcut to match all recognized protocols by using ->src.all. 
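+	 * The layer-4 fields sit in a union, so ->src.u.all (and
+	 * ->dst.u.all below) alias the TCP/UDP/SCTP/DCCP port without
+	 * needing a per-protocol switch.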
*/ + if ((info->match_flags & XT_CONNTRACK_ORIGSRC_PORT) && + (tuple->src.u.all == info->origsrc_port) ^ + !(info->invert_flags & XT_CONNTRACK_ORIGSRC_PORT)) + return false; + + if ((info->match_flags & XT_CONNTRACK_ORIGDST_PORT) && + (tuple->dst.u.all == info->origdst_port) ^ + !(info->invert_flags & XT_CONNTRACK_ORIGDST_PORT)) + return false; + + tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; + + if ((info->match_flags & XT_CONNTRACK_REPLSRC_PORT) && + (tuple->src.u.all == info->replsrc_port) ^ + !(info->invert_flags & XT_CONNTRACK_REPLSRC_PORT)) + return false; + + if ((info->match_flags & XT_CONNTRACK_REPLDST_PORT) && + (tuple->dst.u.all == info->repldst_port) ^ + !(info->invert_flags & XT_CONNTRACK_REPLDST_PORT)) + return false; + + return true; +} + +static inline bool +port_match(u16 min, u16 max, u16 port, bool invert) +{ + return (port >= min && port <= max) ^ invert; +} + +static inline bool +ct_proto_port_check_v3(const struct xt_conntrack_mtinfo3 *info, + const struct nf_conn *ct) +{ + const struct nf_conntrack_tuple *tuple; + + tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + if ((info->match_flags & XT_CONNTRACK_PROTO) && + (nf_ct_protonum(ct) == info->l4proto) ^ + !(info->invert_flags & XT_CONNTRACK_PROTO)) + return false; + + /* Shortcut to match all recognized protocols by using ->src.all. */ + if ((info->match_flags & XT_CONNTRACK_ORIGSRC_PORT) && + !port_match(info->origsrc_port, info->origsrc_port_high, + ntohs(tuple->src.u.all), + info->invert_flags & XT_CONNTRACK_ORIGSRC_PORT)) + return false; + + if ((info->match_flags & XT_CONNTRACK_ORIGDST_PORT) && + !port_match(info->origdst_port, info->origdst_port_high, + ntohs(tuple->dst.u.all), + info->invert_flags & XT_CONNTRACK_ORIGDST_PORT)) + return false; + + tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; + + if ((info->match_flags & XT_CONNTRACK_REPLSRC_PORT) && + !port_match(info->replsrc_port, info->replsrc_port_high, + ntohs(tuple->src.u.all), + info->invert_flags & XT_CONNTRACK_REPLSRC_PORT)) + return false; + + if ((info->match_flags & XT_CONNTRACK_REPLDST_PORT) && + !port_match(info->repldst_port, info->repldst_port_high, + ntohs(tuple->dst.u.all), + info->invert_flags & XT_CONNTRACK_REPLDST_PORT)) + return false; + + return true; +} + +static bool +conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par, + u16 state_mask, u16 status_mask) +{ + const struct xt_conntrack_mtinfo2 *info = par->matchinfo; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + unsigned int statebit; + + ct = nf_ct_get(skb, &ctinfo); + + if (ct) { + if (nf_ct_is_untracked(ct)) + statebit = XT_CONNTRACK_STATE_UNTRACKED; + else + statebit = XT_CONNTRACK_STATE_BIT(ctinfo); + } else + statebit = XT_CONNTRACK_STATE_INVALID; + + if (info->match_flags & XT_CONNTRACK_STATE) { + if (ct != NULL) { + if (test_bit(IPS_SRC_NAT_BIT, &ct->status)) + statebit |= XT_CONNTRACK_STATE_SNAT; + if (test_bit(IPS_DST_NAT_BIT, &ct->status)) + statebit |= XT_CONNTRACK_STATE_DNAT; + } + if (!!(state_mask & statebit) ^ + !(info->invert_flags & XT_CONNTRACK_STATE)) + return false; + } + + if (ct == NULL) + return info->match_flags & XT_CONNTRACK_STATE; + if ((info->match_flags & XT_CONNTRACK_DIRECTION) && + (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) ^ + !(info->invert_flags & XT_CONNTRACK_DIRECTION)) + return false; + + if (info->match_flags & XT_CONNTRACK_ORIGSRC) + if (conntrack_mt_origsrc(ct, info, par->family) ^ + !(info->invert_flags & XT_CONNTRACK_ORIGSRC)) + return false; + + if (info->match_flags & XT_CONNTRACK_ORIGDST) + if 
(conntrack_mt_origdst(ct, info, par->family) ^ + !(info->invert_flags & XT_CONNTRACK_ORIGDST)) + return false; + + if (info->match_flags & XT_CONNTRACK_REPLSRC) + if (conntrack_mt_replsrc(ct, info, par->family) ^ + !(info->invert_flags & XT_CONNTRACK_REPLSRC)) + return false; + + if (info->match_flags & XT_CONNTRACK_REPLDST) + if (conntrack_mt_repldst(ct, info, par->family) ^ + !(info->invert_flags & XT_CONNTRACK_REPLDST)) + return false; + + if (par->match->revision != 3) { + if (!ct_proto_port_check(info, ct)) + return false; + } else { + if (!ct_proto_port_check_v3(par->matchinfo, ct)) + return false; + } + + if ((info->match_flags & XT_CONNTRACK_STATUS) && + (!!(status_mask & ct->status) ^ + !(info->invert_flags & XT_CONNTRACK_STATUS))) + return false; + + if (info->match_flags & XT_CONNTRACK_EXPIRES) { + unsigned long expires = 0; + + if (timer_pending(&ct->timeout)) + expires = (ct->timeout.expires - jiffies) / HZ; + if ((expires >= info->expires_min && + expires <= info->expires_max) ^ + !(info->invert_flags & XT_CONNTRACK_EXPIRES)) + return false; + } + return true; +} + +static bool +conntrack_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_conntrack_mtinfo1 *info = par->matchinfo; + + return conntrack_mt(skb, par, info->state_mask, info->status_mask); +} + +static bool +conntrack_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_conntrack_mtinfo2 *info = par->matchinfo; + + return conntrack_mt(skb, par, info->state_mask, info->status_mask); +} + +static bool +conntrack_mt_v3(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_conntrack_mtinfo3 *info = par->matchinfo; + + return conntrack_mt(skb, par, info->state_mask, info->status_mask); +} + +static int conntrack_mt_check(const struct xt_mtchk_param *par) +{ + int ret; + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + return ret; +} + +static void conntrack_mt_destroy(const struct xt_mtdtor_param *par) +{ + nf_ct_l3proto_module_put(par->family); +} + +static struct xt_match conntrack_mt_reg[] __read_mostly = { + { + .name = "conntrack", + .revision = 1, + .family = NFPROTO_UNSPEC, + .matchsize = sizeof(struct xt_conntrack_mtinfo1), + .match = conntrack_mt_v1, + .checkentry = conntrack_mt_check, + .destroy = conntrack_mt_destroy, + .me = THIS_MODULE, + }, + { + .name = "conntrack", + .revision = 2, + .family = NFPROTO_UNSPEC, + .matchsize = sizeof(struct xt_conntrack_mtinfo2), + .match = conntrack_mt_v2, + .checkentry = conntrack_mt_check, + .destroy = conntrack_mt_destroy, + .me = THIS_MODULE, + }, + { + .name = "conntrack", + .revision = 3, + .family = NFPROTO_UNSPEC, + .matchsize = sizeof(struct xt_conntrack_mtinfo3), + .match = conntrack_mt_v3, + .checkentry = conntrack_mt_check, + .destroy = conntrack_mt_destroy, + .me = THIS_MODULE, + }, +}; + +static int __init conntrack_mt_init(void) +{ + return xt_register_matches(conntrack_mt_reg, + ARRAY_SIZE(conntrack_mt_reg)); +} + +static void __exit conntrack_mt_exit(void) +{ + xt_unregister_matches(conntrack_mt_reg, ARRAY_SIZE(conntrack_mt_reg)); +} + +module_init(conntrack_mt_init); +module_exit(conntrack_mt_exit); diff --git a/net/netfilter/xt_cpu.c b/net/netfilter/xt_cpu.c new file mode 100644 index 00000000..c7a2e546 --- /dev/null +++ b/net/netfilter/xt_cpu.c @@ -0,0 +1,65 @@ +/* Kernel module to match running CPU */ + +/* + * Might be used to distribute connections on several daemons, if + * 
RPS (Remote Packet Steering) is enabled or NIC is multiqueue capable, + * each RX queue IRQ affined to one CPU (1:1 mapping) + * + */ + +/* (C) 2010 Eric Dumazet + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Eric Dumazet "); +MODULE_DESCRIPTION("Xtables: CPU match"); +MODULE_ALIAS("ipt_cpu"); +MODULE_ALIAS("ip6t_cpu"); + +static int cpu_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_cpu_info *info = par->matchinfo; + + if (info->invert & ~1) + return -EINVAL; + return 0; +} + +static bool cpu_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_cpu_info *info = par->matchinfo; + + return (info->cpu == smp_processor_id()) ^ info->invert; +} + +static struct xt_match cpu_mt_reg __read_mostly = { + .name = "cpu", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = cpu_mt_check, + .match = cpu_mt, + .matchsize = sizeof(struct xt_cpu_info), + .me = THIS_MODULE, +}; + +static int __init cpu_mt_init(void) +{ + return xt_register_match(&cpu_mt_reg); +} + +static void __exit cpu_mt_exit(void) +{ + xt_unregister_match(&cpu_mt_reg); +} + +module_init(cpu_mt_init); +module_exit(cpu_mt_exit); diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c new file mode 100644 index 00000000..b63d2a3d --- /dev/null +++ b/net/netfilter/xt_dccp.c @@ -0,0 +1,188 @@ +/* + * iptables module for DCCP protocol header matching + * + * (C) 2005 by Harald Welte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("Xtables: DCCP protocol packet match"); +MODULE_ALIAS("ipt_dccp"); +MODULE_ALIAS("ip6t_dccp"); + +#define DCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \ + || (!!((invflag) & (option)) ^ (cond))) + +static unsigned char *dccp_optbuf; +static DEFINE_SPINLOCK(dccp_buflock); + +static inline bool +dccp_find_option(u_int8_t option, + const struct sk_buff *skb, + unsigned int protoff, + const struct dccp_hdr *dh, + bool *hotdrop) +{ + /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ + const unsigned char *op; + unsigned int optoff = __dccp_hdr_len(dh); + unsigned int optlen = dh->dccph_doff*4 - __dccp_hdr_len(dh); + unsigned int i; + + if (dh->dccph_doff * 4 < __dccp_hdr_len(dh)) + goto invalid; + + if (!optlen) + return false; + + spin_lock_bh(&dccp_buflock); + op = skb_header_pointer(skb, protoff + optoff, optlen, dccp_optbuf); + if (op == NULL) { + /* If we don't have the whole header, drop packet. 
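+		 * skb_header_pointer() could not give us the complete option
+		 * area, so the options cannot be parsed safely.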
*/ + goto partial; + } + + for (i = 0; i < optlen; ) { + if (op[i] == option) { + spin_unlock_bh(&dccp_buflock); + return true; + } + + if (op[i] < 2) + i++; + else + i += op[i+1]?:1; + } + + spin_unlock_bh(&dccp_buflock); + return false; + +partial: + spin_unlock_bh(&dccp_buflock); +invalid: + *hotdrop = true; + return false; +} + + +static inline bool +match_types(const struct dccp_hdr *dh, u_int16_t typemask) +{ + return typemask & (1 << dh->dccph_type); +} + +static inline bool +match_option(u_int8_t option, const struct sk_buff *skb, unsigned int protoff, + const struct dccp_hdr *dh, bool *hotdrop) +{ + return dccp_find_option(option, skb, protoff, dh, hotdrop); +} + +static bool +dccp_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_dccp_info *info = par->matchinfo; + const struct dccp_hdr *dh; + struct dccp_hdr _dh; + + if (par->fragoff != 0) + return false; + + dh = skb_header_pointer(skb, par->thoff, sizeof(_dh), &_dh); + if (dh == NULL) { + par->hotdrop = true; + return false; + } + + return DCCHECK(ntohs(dh->dccph_sport) >= info->spts[0] + && ntohs(dh->dccph_sport) <= info->spts[1], + XT_DCCP_SRC_PORTS, info->flags, info->invflags) + && DCCHECK(ntohs(dh->dccph_dport) >= info->dpts[0] + && ntohs(dh->dccph_dport) <= info->dpts[1], + XT_DCCP_DEST_PORTS, info->flags, info->invflags) + && DCCHECK(match_types(dh, info->typemask), + XT_DCCP_TYPE, info->flags, info->invflags) + && DCCHECK(match_option(info->option, skb, par->thoff, dh, + &par->hotdrop), + XT_DCCP_OPTION, info->flags, info->invflags); +} + +static int dccp_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_dccp_info *info = par->matchinfo; + + if (info->flags & ~XT_DCCP_VALID_FLAGS) + return -EINVAL; + if (info->invflags & ~XT_DCCP_VALID_FLAGS) + return -EINVAL; + if (info->invflags & ~info->flags) + return -EINVAL; + return 0; +} + +static struct xt_match dccp_mt_reg[] __read_mostly = { + { + .name = "dccp", + .family = NFPROTO_IPV4, + .checkentry = dccp_mt_check, + .match = dccp_mt, + .matchsize = sizeof(struct xt_dccp_info), + .proto = IPPROTO_DCCP, + .me = THIS_MODULE, + }, + { + .name = "dccp", + .family = NFPROTO_IPV6, + .checkentry = dccp_mt_check, + .match = dccp_mt, + .matchsize = sizeof(struct xt_dccp_info), + .proto = IPPROTO_DCCP, + .me = THIS_MODULE, + }, +}; + +static int __init dccp_mt_init(void) +{ + int ret; + + /* doff is 8 bits, so the maximum option size is (4*256). Don't put + * this in BSS since DaveM is worried about locked TLB's for kernel + * BSS. */ + dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL); + if (!dccp_optbuf) + return -ENOMEM; + ret = xt_register_matches(dccp_mt_reg, ARRAY_SIZE(dccp_mt_reg)); + if (ret) + goto out_kfree; + return ret; + +out_kfree: + kfree(dccp_optbuf); + return ret; +} + +static void __exit dccp_mt_exit(void) +{ + xt_unregister_matches(dccp_mt_reg, ARRAY_SIZE(dccp_mt_reg)); + kfree(dccp_optbuf); +} + +module_init(dccp_mt_init); +module_exit(dccp_mt_exit); diff --git a/net/netfilter/xt_devgroup.c b/net/netfilter/xt_devgroup.c new file mode 100644 index 00000000..d9202cdd --- /dev/null +++ b/net/netfilter/xt_devgroup.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2011 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Patrick McHardy "); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: Device group match"); +MODULE_ALIAS("ipt_devgroup"); +MODULE_ALIAS("ip6t_devgroup"); + +static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_devgroup_info *info = par->matchinfo; + + if (info->flags & XT_DEVGROUP_MATCH_SRC && + (((info->src_group ^ par->in->group) & info->src_mask ? 1 : 0) ^ + ((info->flags & XT_DEVGROUP_INVERT_SRC) ? 1 : 0))) + return false; + + if (info->flags & XT_DEVGROUP_MATCH_DST && + (((info->dst_group ^ par->out->group) & info->dst_mask ? 1 : 0) ^ + ((info->flags & XT_DEVGROUP_INVERT_DST) ? 1 : 0))) + return false; + + return true; +} + +static int devgroup_mt_checkentry(const struct xt_mtchk_param *par) +{ + const struct xt_devgroup_info *info = par->matchinfo; + + if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC | + XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST)) + return -EINVAL; + + if (info->flags & XT_DEVGROUP_MATCH_SRC && + par->hook_mask & ~((1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD))) + return -EINVAL; + + if (info->flags & XT_DEVGROUP_MATCH_DST && + par->hook_mask & ~((1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING))) + return -EINVAL; + + return 0; +} + +static struct xt_match devgroup_mt_reg __read_mostly = { + .name = "devgroup", + .match = devgroup_mt, + .checkentry = devgroup_mt_checkentry, + .matchsize = sizeof(struct xt_devgroup_info), + .family = NFPROTO_UNSPEC, + .me = THIS_MODULE +}; + +static int __init devgroup_mt_init(void) +{ + return xt_register_match(&devgroup_mt_reg); +} + +static void __exit devgroup_mt_exit(void) +{ + xt_unregister_match(&devgroup_mt_reg); +} + +module_init(devgroup_mt_init); +module_exit(devgroup_mt_exit); diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c new file mode 100644 index 00000000..64670fc5 --- /dev/null +++ b/net/netfilter/xt_dscp.c @@ -0,0 +1,115 @@ +/* IP tables module for matching the value of the IPv4/IPv6 DSCP field + * + * (C) 2002 by Harald Welte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("Xtables: DSCP/TOS field match"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_dscp"); +MODULE_ALIAS("ip6t_dscp"); +MODULE_ALIAS("ipt_tos"); +MODULE_ALIAS("ip6t_tos"); + +static bool +dscp_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_dscp_info *info = par->matchinfo; + u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; + + return (dscp == info->dscp) ^ !!info->invert; +} + +static bool +dscp_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_dscp_info *info = par->matchinfo; + u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; + + return (dscp == info->dscp) ^ !!info->invert; +} + +static int dscp_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_dscp_info *info = par->matchinfo; + + if (info->dscp > XT_DSCP_MAX) { + pr_info("dscp %x out of range\n", info->dscp); + return -EDOM; + } + + return 0; +} + +static bool tos_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_tos_match_info *info = par->matchinfo; + + if (par->family == NFPROTO_IPV4) + return ((ip_hdr(skb)->tos & info->tos_mask) == + info->tos_value) ^ !!info->invert; + else + return ((ipv6_get_dsfield(ipv6_hdr(skb)) & info->tos_mask) == + info->tos_value) ^ !!info->invert; +} + +static struct xt_match dscp_mt_reg[] __read_mostly = { + { + .name = "dscp", + .family = NFPROTO_IPV4, + .checkentry = dscp_mt_check, + .match = dscp_mt, + .matchsize = sizeof(struct xt_dscp_info), + .me = THIS_MODULE, + }, + { + .name = "dscp", + .family = NFPROTO_IPV6, + .checkentry = dscp_mt_check, + .match = dscp_mt6, + .matchsize = sizeof(struct xt_dscp_info), + .me = THIS_MODULE, + }, + { + .name = "tos", + .revision = 1, + .family = NFPROTO_IPV4, + .match = tos_mt, + .matchsize = sizeof(struct xt_tos_match_info), + .me = THIS_MODULE, + }, + { + .name = "tos", + .revision = 1, + .family = NFPROTO_IPV6, + .match = tos_mt, + .matchsize = sizeof(struct xt_tos_match_info), + .me = THIS_MODULE, + }, +}; + +static int __init dscp_mt_init(void) +{ + return xt_register_matches(dscp_mt_reg, ARRAY_SIZE(dscp_mt_reg)); +} + +static void __exit dscp_mt_exit(void) +{ + xt_unregister_matches(dscp_mt_reg, ARRAY_SIZE(dscp_mt_reg)); +} + +module_init(dscp_mt_init); +module_exit(dscp_mt_exit); diff --git a/net/netfilter/xt_esp.c b/net/netfilter/xt_esp.c new file mode 100644 index 00000000..171ba82b --- /dev/null +++ b/net/netfilter/xt_esp.c @@ -0,0 +1,107 @@ +/* Kernel module to match ESP parameters. */ + +/* (C) 1999-2000 Yon Uriarte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include + +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yon Uriarte "); +MODULE_DESCRIPTION("Xtables: IPsec-ESP packet match"); +MODULE_ALIAS("ipt_esp"); +MODULE_ALIAS("ip6t_esp"); + +/* Returns 1 if the spi is matched by the range, 0 otherwise */ +static inline bool +spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) +{ + bool r; + pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n", + invert ? '!' : ' ', min, spi, max); + r = (spi >= min && spi <= max) ^ invert; + pr_debug(" result %s\n", r ? 
"PASS" : "FAILED"); + return r; +} + +static bool esp_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ip_esp_hdr *eh; + struct ip_esp_hdr _esp; + const struct xt_esp *espinfo = par->matchinfo; + + /* Must not be a fragment. */ + if (par->fragoff != 0) + return false; + + eh = skb_header_pointer(skb, par->thoff, sizeof(_esp), &_esp); + if (eh == NULL) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. + */ + pr_debug("Dropping evil ESP tinygram.\n"); + par->hotdrop = true; + return false; + } + + return spi_match(espinfo->spis[0], espinfo->spis[1], ntohl(eh->spi), + !!(espinfo->invflags & XT_ESP_INV_SPI)); +} + +static int esp_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_esp *espinfo = par->matchinfo; + + if (espinfo->invflags & ~XT_ESP_INV_MASK) { + pr_debug("unknown flags %X\n", espinfo->invflags); + return -EINVAL; + } + + return 0; +} + +static struct xt_match esp_mt_reg[] __read_mostly = { + { + .name = "esp", + .family = NFPROTO_IPV4, + .checkentry = esp_mt_check, + .match = esp_mt, + .matchsize = sizeof(struct xt_esp), + .proto = IPPROTO_ESP, + .me = THIS_MODULE, + }, + { + .name = "esp", + .family = NFPROTO_IPV6, + .checkentry = esp_mt_check, + .match = esp_mt, + .matchsize = sizeof(struct xt_esp), + .proto = IPPROTO_ESP, + .me = THIS_MODULE, + }, +}; + +static int __init esp_mt_init(void) +{ + return xt_register_matches(esp_mt_reg, ARRAY_SIZE(esp_mt_reg)); +} + +static void __exit esp_mt_exit(void) +{ + xt_unregister_matches(esp_mt_reg, ARRAY_SIZE(esp_mt_reg)); +} + +module_init(esp_mt_init); +module_exit(esp_mt_exit); diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c new file mode 100644 index 00000000..9228ee0d --- /dev/null +++ b/net/netfilter/xt_hashlimit.c @@ -0,0 +1,847 @@ +/* + * xt_hashlimit - Netfilter module to limit the number of packets per time + * separately for each hashbucket (sourceip/sourceport/dstip/dstport) + * + * (C) 2003-2004 by Harald Welte + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 + * + * Development of this code was funded by Astaro AG, http://www.astaro.com/ + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +#include +#include +#endif + +#include +#include + +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte "); +MODULE_AUTHOR("Jan Engelhardt "); +MODULE_DESCRIPTION("Xtables: per hash-bucket rate-limit match"); +MODULE_ALIAS("ipt_hashlimit"); +MODULE_ALIAS("ip6t_hashlimit"); + +struct hashlimit_net { + struct hlist_head htables; + struct proc_dir_entry *ipt_hashlimit; + struct proc_dir_entry *ip6t_hashlimit; +}; + +static int hashlimit_net_id; +static inline struct hashlimit_net *hashlimit_pernet(struct net *net) +{ + return net_generic(net, hashlimit_net_id); +} + +/* need to declare this at the top */ +static const struct file_operations dl_file_ops; + +/* hash table crap */ +struct dsthash_dst { + union { + struct { + __be32 src; + __be32 dst; + } ip; +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) + struct { + __be32 src[4]; + __be32 dst[4]; + } ip6; +#endif + }; + __be16 src_port; + __be16 dst_port; +}; + +struct dsthash_ent { + /* static / read-only parts in the beginning */ + struct hlist_node node; + struct dsthash_dst dst; + + 
/* modified structure members in the end */ + spinlock_t lock; + unsigned long expires; /* precalculated expiry time */ + struct { + unsigned long prev; /* last modification */ + u_int32_t credit; + u_int32_t credit_cap, cost; + } rateinfo; + struct rcu_head rcu; +}; + +struct xt_hashlimit_htable { + struct hlist_node node; /* global list of all htables */ + int use; + u_int8_t family; + bool rnd_initialized; + + struct hashlimit_cfg1 cfg; /* config */ + + /* used internally */ + spinlock_t lock; /* lock for list_head */ + u_int32_t rnd; /* random seed for hash */ + unsigned int count; /* number entries in table */ + struct timer_list timer; /* timer for gc */ + + /* seq_file stuff */ + struct proc_dir_entry *pde; + struct net *net; + + struct hlist_head hash[0]; /* hashtable itself */ +}; + +static DEFINE_MUTEX(hashlimit_mutex); /* protects htables list */ +static struct kmem_cache *hashlimit_cachep __read_mostly; + +static inline bool dst_cmp(const struct dsthash_ent *ent, + const struct dsthash_dst *b) +{ + return !memcmp(&ent->dst, b, sizeof(ent->dst)); +} + +static u_int32_t +hash_dst(const struct xt_hashlimit_htable *ht, const struct dsthash_dst *dst) +{ + u_int32_t hash = jhash2((const u32 *)dst, + sizeof(*dst)/sizeof(u32), + ht->rnd); + /* + * Instead of returning hash % ht->cfg.size (implying a divide) + * we return the high 32 bits of the (hash * ht->cfg.size) that will + * give results between [0 and cfg.size-1] and same hash distribution, + * but using a multiply, less expensive than a divide + */ + return ((u64)hash * ht->cfg.size) >> 32; +} + +static struct dsthash_ent * +dsthash_find(const struct xt_hashlimit_htable *ht, + const struct dsthash_dst *dst) +{ + struct dsthash_ent *ent; + struct hlist_node *pos; + u_int32_t hash = hash_dst(ht, dst); + + if (!hlist_empty(&ht->hash[hash])) { + hlist_for_each_entry_rcu(ent, pos, &ht->hash[hash], node) + if (dst_cmp(ent, dst)) { + spin_lock(&ent->lock); + return ent; + } + } + return NULL; +} + +/* allocate dsthash_ent, initialize dst, put in htable and lock it */ +static struct dsthash_ent * +dsthash_alloc_init(struct xt_hashlimit_htable *ht, + const struct dsthash_dst *dst) +{ + struct dsthash_ent *ent; + + spin_lock(&ht->lock); + /* initialize hash with random val at the time we allocate + * the first hashtable entry */ + if (unlikely(!ht->rnd_initialized)) { + get_random_bytes(&ht->rnd, sizeof(ht->rnd)); + ht->rnd_initialized = true; + } + + if (ht->cfg.max && ht->count >= ht->cfg.max) { + /* FIXME: do something. question is what.. 
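+		 * For now no new entry is created once the limit is hit,
+		 * which makes hashlimit_mt() hotdrop the packet.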
*/ + if (net_ratelimit()) + pr_err("max count of %u reached\n", ht->cfg.max); + ent = NULL; + } else + ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC); + if (!ent) { + if (net_ratelimit()) + pr_err("cannot allocate dsthash_ent\n"); + } else { + memcpy(&ent->dst, dst, sizeof(ent->dst)); + spin_lock_init(&ent->lock); + + spin_lock(&ent->lock); + hlist_add_head_rcu(&ent->node, &ht->hash[hash_dst(ht, dst)]); + ht->count++; + } + spin_unlock(&ht->lock); + return ent; +} + +static void dsthash_free_rcu(struct rcu_head *head) +{ + struct dsthash_ent *ent = container_of(head, struct dsthash_ent, rcu); + + kmem_cache_free(hashlimit_cachep, ent); +} + +static inline void +dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent) +{ + hlist_del_rcu(&ent->node); + call_rcu_bh(&ent->rcu, dsthash_free_rcu); + ht->count--; +} +static void htable_gc(unsigned long htlong); + +static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, + u_int8_t family) +{ + struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); + struct xt_hashlimit_htable *hinfo; + unsigned int size; + unsigned int i; + + if (minfo->cfg.size) { + size = minfo->cfg.size; + } else { + size = (totalram_pages << PAGE_SHIFT) / 16384 / + sizeof(struct list_head); + if (totalram_pages > 1024 * 1024 * 1024 / PAGE_SIZE) + size = 8192; + if (size < 16) + size = 16; + } + /* FIXME: don't use vmalloc() here or anywhere else -HW */ + hinfo = vmalloc(sizeof(struct xt_hashlimit_htable) + + sizeof(struct list_head) * size); + if (hinfo == NULL) + return -ENOMEM; + minfo->hinfo = hinfo; + + /* copy match config into hashtable config */ + memcpy(&hinfo->cfg, &minfo->cfg, sizeof(hinfo->cfg)); + hinfo->cfg.size = size; + if (hinfo->cfg.max == 0) + hinfo->cfg.max = 8 * hinfo->cfg.size; + else if (hinfo->cfg.max < hinfo->cfg.size) + hinfo->cfg.max = hinfo->cfg.size; + + for (i = 0; i < hinfo->cfg.size; i++) + INIT_HLIST_HEAD(&hinfo->hash[i]); + + hinfo->use = 1; + hinfo->count = 0; + hinfo->family = family; + hinfo->rnd_initialized = false; + spin_lock_init(&hinfo->lock); + + hinfo->pde = proc_create_data(minfo->name, 0, + (family == NFPROTO_IPV4) ? 
+ hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit, + &dl_file_ops, hinfo); + if (hinfo->pde == NULL) { + vfree(hinfo); + return -ENOMEM; + } + hinfo->net = net; + + setup_timer(&hinfo->timer, htable_gc, (unsigned long)hinfo); + hinfo->timer.expires = jiffies + msecs_to_jiffies(hinfo->cfg.gc_interval); + add_timer(&hinfo->timer); + + hlist_add_head(&hinfo->node, &hashlimit_net->htables); + + return 0; +} + +static bool select_all(const struct xt_hashlimit_htable *ht, + const struct dsthash_ent *he) +{ + return 1; +} + +static bool select_gc(const struct xt_hashlimit_htable *ht, + const struct dsthash_ent *he) +{ + return time_after_eq(jiffies, he->expires); +} + +static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, + bool (*select)(const struct xt_hashlimit_htable *ht, + const struct dsthash_ent *he)) +{ + unsigned int i; + + /* lock hash table and iterate over it */ + spin_lock_bh(&ht->lock); + for (i = 0; i < ht->cfg.size; i++) { + struct dsthash_ent *dh; + struct hlist_node *pos, *n; + hlist_for_each_entry_safe(dh, pos, n, &ht->hash[i], node) { + if ((*select)(ht, dh)) + dsthash_free(ht, dh); + } + } + spin_unlock_bh(&ht->lock); +} + +/* hash table garbage collector, run by timer */ +static void htable_gc(unsigned long htlong) +{ + struct xt_hashlimit_htable *ht = (struct xt_hashlimit_htable *)htlong; + + htable_selective_cleanup(ht, select_gc); + + /* re-add the timer accordingly */ + ht->timer.expires = jiffies + msecs_to_jiffies(ht->cfg.gc_interval); + add_timer(&ht->timer); +} + +static void htable_destroy(struct xt_hashlimit_htable *hinfo) +{ + struct hashlimit_net *hashlimit_net = hashlimit_pernet(hinfo->net); + struct proc_dir_entry *parent; + + del_timer_sync(&hinfo->timer); + + if (hinfo->family == NFPROTO_IPV4) + parent = hashlimit_net->ipt_hashlimit; + else + parent = hashlimit_net->ip6t_hashlimit; + remove_proc_entry(hinfo->pde->name, parent); + htable_selective_cleanup(hinfo, select_all); + vfree(hinfo); +} + +static struct xt_hashlimit_htable *htable_find_get(struct net *net, + const char *name, + u_int8_t family) +{ + struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); + struct xt_hashlimit_htable *hinfo; + struct hlist_node *pos; + + hlist_for_each_entry(hinfo, pos, &hashlimit_net->htables, node) { + if (!strcmp(name, hinfo->pde->name) && + hinfo->family == family) { + hinfo->use++; + return hinfo; + } + } + return NULL; +} + +static void htable_put(struct xt_hashlimit_htable *hinfo) +{ + mutex_lock(&hashlimit_mutex); + if (--hinfo->use == 0) { + hlist_del(&hinfo->node); + htable_destroy(hinfo); + } + mutex_unlock(&hashlimit_mutex); +} + +/* The algorithm used is the Simple Token Bucket Filter (TBF) + * see net/sched/sch_tbf.c in the linux source tree + */ + +/* Rusty: This is my (non-mathematically-inclined) understanding of + this algorithm. The `average rate' in jiffies becomes your initial + amount of credit `credit' and the most credit you can ever have + `credit_cap'. The `peak rate' becomes the cost of passing the + test, `cost'. + + `prev' tracks the last packet hit: you gain one credit per jiffy. + If you get credit balance more than this, the extra credit is + discarded. Every time the match passes, you lose `cost' credits; + if you don't have that many, the test fails. + + See Alexey's formal explanation in net/sched/sch_tbf.c. + + To get the maximum range, we multiply by this factor (ie. you get N + credits per jiffy). 
We want to allow a rate as low as 1 per day + (slowest userspace tool allows), which means + CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie. +*/ +#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24)) + +/* Repeated shift and or gives us all 1s, final shift and add 1 gives + * us the power of 2 below the theoretical max, so GCC simply does a + * shift. */ +#define _POW2_BELOW2(x) ((x)|((x)>>1)) +#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2)) +#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4)) +#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8)) +#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16)) +#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1) + +#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) + +/* Precision saver. */ +static inline u_int32_t +user2credits(u_int32_t user) +{ + /* If multiplying would overflow... */ + if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) + /* Divide first. */ + return (user / XT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY; + + return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE; +} + +static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now) +{ + dh->rateinfo.credit += (now - dh->rateinfo.prev) * CREDITS_PER_JIFFY; + if (dh->rateinfo.credit > dh->rateinfo.credit_cap) + dh->rateinfo.credit = dh->rateinfo.credit_cap; + dh->rateinfo.prev = now; +} + +static inline __be32 maskl(__be32 a, unsigned int l) +{ + return l ? htonl(ntohl(a) & ~0 << (32 - l)) : 0; +} + +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +static void hashlimit_ipv6_mask(__be32 *i, unsigned int p) +{ + switch (p) { + case 0 ... 31: + i[0] = maskl(i[0], p); + i[1] = i[2] = i[3] = 0; + break; + case 32 ... 63: + i[1] = maskl(i[1], p - 32); + i[2] = i[3] = 0; + break; + case 64 ... 95: + i[2] = maskl(i[2], p - 64); + i[3] = 0; + break; + case 96 ... 
127: + i[3] = maskl(i[3], p - 96); + break; + case 128: + break; + } +} +#endif + +static int +hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, + struct dsthash_dst *dst, + const struct sk_buff *skb, unsigned int protoff) +{ + __be16 _ports[2], *ports; + u8 nexthdr; + int poff; + + memset(dst, 0, sizeof(*dst)); + + switch (hinfo->family) { + case NFPROTO_IPV4: + if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP) + dst->ip.dst = maskl(ip_hdr(skb)->daddr, + hinfo->cfg.dstmask); + if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SIP) + dst->ip.src = maskl(ip_hdr(skb)->saddr, + hinfo->cfg.srcmask); + + if (!(hinfo->cfg.mode & + (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT))) + return 0; + nexthdr = ip_hdr(skb)->protocol; + break; +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) + case NFPROTO_IPV6: + if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP) { + memcpy(&dst->ip6.dst, &ipv6_hdr(skb)->daddr, + sizeof(dst->ip6.dst)); + hashlimit_ipv6_mask(dst->ip6.dst, hinfo->cfg.dstmask); + } + if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SIP) { + memcpy(&dst->ip6.src, &ipv6_hdr(skb)->saddr, + sizeof(dst->ip6.src)); + hashlimit_ipv6_mask(dst->ip6.src, hinfo->cfg.srcmask); + } + + if (!(hinfo->cfg.mode & + (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT))) + return 0; + nexthdr = ipv6_hdr(skb)->nexthdr; + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr); + if ((int)protoff < 0) + return -1; + break; +#endif + default: + BUG(); + return 0; + } + + poff = proto_ports_offset(nexthdr); + if (poff >= 0) { + ports = skb_header_pointer(skb, protoff + poff, sizeof(_ports), + &_ports); + } else { + _ports[0] = _ports[1] = 0; + ports = _ports; + } + if (!ports) + return -1; + if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SPT) + dst->src_port = ports[0]; + if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DPT) + dst->dst_port = ports[1]; + return 0; +} + +static bool +hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; + struct xt_hashlimit_htable *hinfo = info->hinfo; + unsigned long now = jiffies; + struct dsthash_ent *dh; + struct dsthash_dst dst; + + if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0) + goto hotdrop; + + rcu_read_lock_bh(); + dh = dsthash_find(hinfo, &dst); + if (dh == NULL) { + dh = dsthash_alloc_init(hinfo, &dst); + if (dh == NULL) { + rcu_read_unlock_bh(); + goto hotdrop; + } + dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire); + dh->rateinfo.prev = jiffies; + dh->rateinfo.credit = user2credits(hinfo->cfg.avg * + hinfo->cfg.burst); + dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg * + hinfo->cfg.burst); + dh->rateinfo.cost = user2credits(hinfo->cfg.avg); + } else { + /* update expiration timeout */ + dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire); + rateinfo_recalc(dh, now); + } + + if (dh->rateinfo.credit >= dh->rateinfo.cost) { + /* below the limit */ + dh->rateinfo.credit -= dh->rateinfo.cost; + spin_unlock(&dh->lock); + rcu_read_unlock_bh(); + return !(info->cfg.mode & XT_HASHLIMIT_INVERT); + } + + spin_unlock(&dh->lock); + rcu_read_unlock_bh(); + /* default match is underlimit - so over the limit, we need to invert */ + return info->cfg.mode & XT_HASHLIMIT_INVERT; + + hotdrop: + par->hotdrop = true; + return false; +} + +static int hashlimit_mt_check(const struct xt_mtchk_param *par) +{ + struct net *net = par->net; + struct xt_hashlimit_mtinfo1 *info = par->matchinfo; + int ret; + + /* Check for overflow. 
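+	 * cfg.avg * cfg.burst is evaluated in 32-bit arithmetic; if the
+	 * product wraps around, the credited burst ends up smaller than
+	 * the cost of a single packet, which the comparison below catches.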
*/ + if (info->cfg.burst == 0 || + user2credits(info->cfg.avg * info->cfg.burst) < + user2credits(info->cfg.avg)) { + pr_info("overflow, try lower: %u/%u\n", + info->cfg.avg, info->cfg.burst); + return -ERANGE; + } + if (info->cfg.gc_interval == 0 || info->cfg.expire == 0) + return -EINVAL; + if (info->name[sizeof(info->name)-1] != '\0') + return -EINVAL; + if (par->family == NFPROTO_IPV4) { + if (info->cfg.srcmask > 32 || info->cfg.dstmask > 32) + return -EINVAL; + } else { + if (info->cfg.srcmask > 128 || info->cfg.dstmask > 128) + return -EINVAL; + } + + mutex_lock(&hashlimit_mutex); + info->hinfo = htable_find_get(net, info->name, par->family); + if (info->hinfo == NULL) { + ret = htable_create(net, info, par->family); + if (ret < 0) { + mutex_unlock(&hashlimit_mutex); + return ret; + } + } + mutex_unlock(&hashlimit_mutex); + return 0; +} + +static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; + + htable_put(info->hinfo); +} + +static struct xt_match hashlimit_mt_reg[] __read_mostly = { + { + .name = "hashlimit", + .revision = 1, + .family = NFPROTO_IPV4, + .match = hashlimit_mt, + .matchsize = sizeof(struct xt_hashlimit_mtinfo1), + .checkentry = hashlimit_mt_check, + .destroy = hashlimit_mt_destroy, + .me = THIS_MODULE, + }, +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) + { + .name = "hashlimit", + .revision = 1, + .family = NFPROTO_IPV6, + .match = hashlimit_mt, + .matchsize = sizeof(struct xt_hashlimit_mtinfo1), + .checkentry = hashlimit_mt_check, + .destroy = hashlimit_mt_destroy, + .me = THIS_MODULE, + }, +#endif +}; + +/* PROC stuff */ +static void *dl_seq_start(struct seq_file *s, loff_t *pos) + __acquires(htable->lock) +{ + struct xt_hashlimit_htable *htable = s->private; + unsigned int *bucket; + + spin_lock_bh(&htable->lock); + if (*pos >= htable->cfg.size) + return NULL; + + bucket = kmalloc(sizeof(unsigned int), GFP_ATOMIC); + if (!bucket) + return ERR_PTR(-ENOMEM); + + *bucket = *pos; + return bucket; +} + +static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct xt_hashlimit_htable *htable = s->private; + unsigned int *bucket = (unsigned int *)v; + + *pos = ++(*bucket); + if (*pos >= htable->cfg.size) { + kfree(v); + return NULL; + } + return bucket; +} + +static void dl_seq_stop(struct seq_file *s, void *v) + __releases(htable->lock) +{ + struct xt_hashlimit_htable *htable = s->private; + unsigned int *bucket = (unsigned int *)v; + + if (!IS_ERR(bucket)) + kfree(bucket); + spin_unlock_bh(&htable->lock); +} + +static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, + struct seq_file *s) +{ + int res; + + spin_lock(&ent->lock); + /* recalculate to show accurate numbers */ + rateinfo_recalc(ent, jiffies); + + switch (family) { + case NFPROTO_IPV4: + res = seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n", + (long)(ent->expires - jiffies)/HZ, + &ent->dst.ip.src, + ntohs(ent->dst.src_port), + &ent->dst.ip.dst, + ntohs(ent->dst.dst_port), + ent->rateinfo.credit, ent->rateinfo.credit_cap, + ent->rateinfo.cost); + break; +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) + case NFPROTO_IPV6: + res = seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n", + (long)(ent->expires - jiffies)/HZ, + &ent->dst.ip6.src, + ntohs(ent->dst.src_port), + &ent->dst.ip6.dst, + ntohs(ent->dst.dst_port), + ent->rateinfo.credit, ent->rateinfo.credit_cap, + ent->rateinfo.cost); + break; +#endif + default: + BUG(); + res = 0; + } + 
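+	/* drop the per-entry lock taken before rateinfo_recalc() */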
spin_unlock(&ent->lock); + return res; +} + +static int dl_seq_show(struct seq_file *s, void *v) +{ + struct xt_hashlimit_htable *htable = s->private; + unsigned int *bucket = (unsigned int *)v; + struct dsthash_ent *ent; + struct hlist_node *pos; + + if (!hlist_empty(&htable->hash[*bucket])) { + hlist_for_each_entry(ent, pos, &htable->hash[*bucket], node) + if (dl_seq_real_show(ent, htable->family, s)) + return -1; + } + return 0; +} + +static const struct seq_operations dl_seq_ops = { + .start = dl_seq_start, + .next = dl_seq_next, + .stop = dl_seq_stop, + .show = dl_seq_show +}; + +static int dl_proc_open(struct inode *inode, struct file *file) +{ + int ret = seq_open(file, &dl_seq_ops); + + if (!ret) { + struct seq_file *sf = file->private_data; + sf->private = PDE(inode)->data; + } + return ret; +} + +static const struct file_operations dl_file_ops = { + .owner = THIS_MODULE, + .open = dl_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static int __net_init hashlimit_proc_net_init(struct net *net) +{ + struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); + + hashlimit_net->ipt_hashlimit = proc_mkdir("ipt_hashlimit", net->proc_net); + if (!hashlimit_net->ipt_hashlimit) + return -ENOMEM; +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) + hashlimit_net->ip6t_hashlimit = proc_mkdir("ip6t_hashlimit", net->proc_net); + if (!hashlimit_net->ip6t_hashlimit) { + proc_net_remove(net, "ipt_hashlimit"); + return -ENOMEM; + } +#endif + return 0; +} + +static void __net_exit hashlimit_proc_net_exit(struct net *net) +{ + proc_net_remove(net, "ipt_hashlimit"); +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) + proc_net_remove(net, "ip6t_hashlimit"); +#endif +} + +static int __net_init hashlimit_net_init(struct net *net) +{ + struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); + + INIT_HLIST_HEAD(&hashlimit_net->htables); + return hashlimit_proc_net_init(net); +} + +static void __net_exit hashlimit_net_exit(struct net *net) +{ + struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); + + BUG_ON(!hlist_empty(&hashlimit_net->htables)); + hashlimit_proc_net_exit(net); +} + +static struct pernet_operations hashlimit_net_ops = { + .init = hashlimit_net_init, + .exit = hashlimit_net_exit, + .id = &hashlimit_net_id, + .size = sizeof(struct hashlimit_net), +}; + +static int __init hashlimit_mt_init(void) +{ + int err; + + err = register_pernet_subsys(&hashlimit_net_ops); + if (err < 0) + return err; + err = xt_register_matches(hashlimit_mt_reg, + ARRAY_SIZE(hashlimit_mt_reg)); + if (err < 0) + goto err1; + + err = -ENOMEM; + hashlimit_cachep = kmem_cache_create("xt_hashlimit", + sizeof(struct dsthash_ent), 0, 0, + NULL); + if (!hashlimit_cachep) { + pr_warning("unable to create slab cache\n"); + goto err2; + } + return 0; + +err2: + xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg)); +err1: + unregister_pernet_subsys(&hashlimit_net_ops); + return err; + +} + +static void __exit hashlimit_mt_exit(void) +{ + xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg)); + unregister_pernet_subsys(&hashlimit_net_ops); + + rcu_barrier_bh(); + kmem_cache_destroy(hashlimit_cachep); +} + +module_init(hashlimit_mt_init); +module_exit(hashlimit_mt_exit); diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c new file mode 100644 index 00000000..9f4ab00c --- /dev/null +++ b/net/netfilter/xt_helper.c @@ -0,0 +1,99 @@ +/* iptables module to match on related 
connections */ +/* + * (C) 2001 Martin Josefsson + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Josefsson "); +MODULE_DESCRIPTION("Xtables: Related connection matching"); +MODULE_ALIAS("ipt_helper"); +MODULE_ALIAS("ip6t_helper"); + + +static bool +helper_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_helper_info *info = par->matchinfo; + const struct nf_conn *ct; + const struct nf_conn_help *master_help; + const struct nf_conntrack_helper *helper; + enum ip_conntrack_info ctinfo; + bool ret = info->invert; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || !ct->master) + return ret; + + master_help = nfct_help(ct->master); + if (!master_help) + return ret; + + /* rcu_read_lock()ed by nf_hook_slow */ + helper = rcu_dereference(master_help->helper); + if (!helper) + return ret; + + if (info->name[0] == '\0') + ret = !ret; + else + ret ^= !strncmp(helper->name, info->name, + strlen(helper->name)); + return ret; +} + +static int helper_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_helper_info *info = par->matchinfo; + int ret; + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) { + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + return ret; + } + info->name[29] = '\0'; + return 0; +} + +static void helper_mt_destroy(const struct xt_mtdtor_param *par) +{ + nf_ct_l3proto_module_put(par->family); +} + +static struct xt_match helper_mt_reg __read_mostly = { + .name = "helper", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = helper_mt_check, + .match = helper_mt, + .destroy = helper_mt_destroy, + .matchsize = sizeof(struct xt_helper_info), + .me = THIS_MODULE, +}; + +static int __init helper_mt_init(void) +{ + return xt_register_match(&helper_mt_reg); +} + +static void __exit helper_mt_exit(void) +{ + xt_unregister_match(&helper_mt_reg); +} + +module_init(helper_mt_init); +module_exit(helper_mt_exit); diff --git a/net/netfilter/xt_hl.c b/net/netfilter/xt_hl.c new file mode 100644 index 00000000..7d12221e --- /dev/null +++ b/net/netfilter/xt_hl.c @@ -0,0 +1,96 @@ +/* + * IP tables module for matching the value of the TTL + * (C) 2000,2001 by Harald Welte + * + * Hop Limit matching module + * (C) 2001-2002 Maciej Soltysiak + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include + +MODULE_AUTHOR("Maciej Soltysiak "); +MODULE_DESCRIPTION("Xtables: Hoplimit/TTL field match"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_ttl"); +MODULE_ALIAS("ip6t_hl"); + +static bool ttl_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ipt_ttl_info *info = par->matchinfo; + const u8 ttl = ip_hdr(skb)->ttl; + + switch (info->mode) { + case IPT_TTL_EQ: + return ttl == info->ttl; + case IPT_TTL_NE: + return ttl != info->ttl; + case IPT_TTL_LT: + return ttl < info->ttl; + case IPT_TTL_GT: + return ttl > info->ttl; + } + + return false; +} + +static bool hl_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ip6t_hl_info *info = par->matchinfo; + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + + switch (info->mode) { + case IP6T_HL_EQ: + return ip6h->hop_limit == info->hop_limit; + case IP6T_HL_NE: + return ip6h->hop_limit != info->hop_limit; + case IP6T_HL_LT: + return ip6h->hop_limit < info->hop_limit; + case IP6T_HL_GT: + return ip6h->hop_limit > info->hop_limit; + } + + return false; +} + +static struct xt_match hl_mt_reg[] __read_mostly = { + { + .name = "ttl", + .revision = 0, + .family = NFPROTO_IPV4, + .match = ttl_mt, + .matchsize = sizeof(struct ipt_ttl_info), + .me = THIS_MODULE, + }, + { + .name = "hl", + .revision = 0, + .family = NFPROTO_IPV6, + .match = hl_mt6, + .matchsize = sizeof(struct ip6t_hl_info), + .me = THIS_MODULE, + }, +}; + +static int __init hl_mt_init(void) +{ + return xt_register_matches(hl_mt_reg, ARRAY_SIZE(hl_mt_reg)); +} + +static void __exit hl_mt_exit(void) +{ + xt_unregister_matches(hl_mt_reg, ARRAY_SIZE(hl_mt_reg)); +} + +module_init(hl_mt_init); +module_exit(hl_mt_exit); diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c new file mode 100644 index 00000000..b46626cd --- /dev/null +++ b/net/netfilter/xt_iprange.c @@ -0,0 +1,140 @@ +/* + * xt_iprange - Netfilter module to match IP address ranges + * + * (C) 2003 Jozsef Kadlecsik + * (C) CC Computer Consultants GmbH, 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include + +static bool +iprange_mt4(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_iprange_mtinfo *info = par->matchinfo; + const struct iphdr *iph = ip_hdr(skb); + bool m; + + if (info->flags & IPRANGE_SRC) { + m = ntohl(iph->saddr) < ntohl(info->src_min.ip); + m |= ntohl(iph->saddr) > ntohl(info->src_max.ip); + m ^= !!(info->flags & IPRANGE_SRC_INV); + if (m) { + pr_debug("src IP %pI4 NOT in range %s%pI4-%pI4\n", + &iph->saddr, + (info->flags & IPRANGE_SRC_INV) ? "(INV) " : "", + &info->src_min.ip, + &info->src_max.ip); + return false; + } + } + if (info->flags & IPRANGE_DST) { + m = ntohl(iph->daddr) < ntohl(info->dst_min.ip); + m |= ntohl(iph->daddr) > ntohl(info->dst_max.ip); + m ^= !!(info->flags & IPRANGE_DST_INV); + if (m) { + pr_debug("dst IP %pI4 NOT in range %s%pI4-%pI4\n", + &iph->daddr, + (info->flags & IPRANGE_DST_INV) ? 
"(INV) " : "", + &info->dst_min.ip, + &info->dst_max.ip); + return false; + } + } + return true; +} + +static inline int +iprange_ipv6_lt(const struct in6_addr *a, const struct in6_addr *b) +{ + unsigned int i; + + for (i = 0; i < 4; ++i) { + if (a->s6_addr32[i] != b->s6_addr32[i]) + return ntohl(a->s6_addr32[i]) < ntohl(b->s6_addr32[i]); + } + + return 0; +} + +static bool +iprange_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_iprange_mtinfo *info = par->matchinfo; + const struct ipv6hdr *iph = ipv6_hdr(skb); + bool m; + + if (info->flags & IPRANGE_SRC) { + m = iprange_ipv6_lt(&iph->saddr, &info->src_min.in6); + m |= iprange_ipv6_lt(&info->src_max.in6, &iph->saddr); + m ^= !!(info->flags & IPRANGE_SRC_INV); + if (m) { + pr_debug("src IP %pI6 NOT in range %s%pI6-%pI6\n", + &iph->saddr, + (info->flags & IPRANGE_SRC_INV) ? "(INV) " : "", + &info->src_min.in6, + &info->src_max.in6); + return false; + } + } + if (info->flags & IPRANGE_DST) { + m = iprange_ipv6_lt(&iph->daddr, &info->dst_min.in6); + m |= iprange_ipv6_lt(&info->dst_max.in6, &iph->daddr); + m ^= !!(info->flags & IPRANGE_DST_INV); + if (m) { + pr_debug("dst IP %pI6 NOT in range %s%pI6-%pI6\n", + &iph->daddr, + (info->flags & IPRANGE_DST_INV) ? "(INV) " : "", + &info->dst_min.in6, + &info->dst_max.in6); + return false; + } + } + return true; +} + +static struct xt_match iprange_mt_reg[] __read_mostly = { + { + .name = "iprange", + .revision = 1, + .family = NFPROTO_IPV4, + .match = iprange_mt4, + .matchsize = sizeof(struct xt_iprange_mtinfo), + .me = THIS_MODULE, + }, + { + .name = "iprange", + .revision = 1, + .family = NFPROTO_IPV6, + .match = iprange_mt6, + .matchsize = sizeof(struct xt_iprange_mtinfo), + .me = THIS_MODULE, + }, +}; + +static int __init iprange_mt_init(void) +{ + return xt_register_matches(iprange_mt_reg, ARRAY_SIZE(iprange_mt_reg)); +} + +static void __exit iprange_mt_exit(void) +{ + xt_unregister_matches(iprange_mt_reg, ARRAY_SIZE(iprange_mt_reg)); +} + +module_init(iprange_mt_init); +module_exit(iprange_mt_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jan Engelhardt "); +MODULE_DESCRIPTION("Xtables: arbitrary IPv4 range matching"); +MODULE_ALIAS("ipt_iprange"); +MODULE_ALIAS("ip6t_iprange"); diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c new file mode 100644 index 00000000..bb10b071 --- /dev/null +++ b/net/netfilter/xt_ipvs.c @@ -0,0 +1,188 @@ +/* + * xt_ipvs - kernel module to match IPVS connection properties + * + * Author: Hannes Eder + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#ifdef CONFIG_IP_VS_IPV6 +#include +#endif +#include +#include +#include +#include +#include + +#include + +MODULE_AUTHOR("Hannes Eder "); +MODULE_DESCRIPTION("Xtables: match IPVS connection properties"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_ipvs"); +MODULE_ALIAS("ip6t_ipvs"); + +/* borrowed from xt_conntrack */ +static bool ipvs_mt_addrcmp(const union nf_inet_addr *kaddr, + const union nf_inet_addr *uaddr, + const union nf_inet_addr *umask, + unsigned int l3proto) +{ + if (l3proto == NFPROTO_IPV4) + return ((kaddr->ip ^ uaddr->ip) & umask->ip) == 0; +#ifdef CONFIG_IP_VS_IPV6 + else if (l3proto == NFPROTO_IPV6) + return ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6, + &uaddr->in6) == 0; +#endif + else + return false; +} + +static bool +ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_ipvs_mtinfo *data = par->matchinfo; + /* ipvs_mt_check ensures that 
family is only NFPROTO_IPV[46]. */ + const u_int8_t family = par->family; + struct ip_vs_iphdr iph; + struct ip_vs_protocol *pp; + struct ip_vs_conn *cp; + bool match = true; + + if (data->bitmask == XT_IPVS_IPVS_PROPERTY) { + match = skb->ipvs_property ^ + !!(data->invert & XT_IPVS_IPVS_PROPERTY); + goto out; + } + + /* other flags than XT_IPVS_IPVS_PROPERTY are set */ + if (!skb->ipvs_property) { + match = false; + goto out; + } + + ip_vs_fill_iphdr(family, skb_network_header(skb), &iph); + + if (data->bitmask & XT_IPVS_PROTO) + if ((iph.protocol == data->l4proto) ^ + !(data->invert & XT_IPVS_PROTO)) { + match = false; + goto out; + } + + pp = ip_vs_proto_get(iph.protocol); + if (unlikely(!pp)) { + match = false; + goto out; + } + + /* + * Check if the packet belongs to an existing entry + */ + cp = pp->conn_out_get(family, skb, &iph, iph.len, 1 /* inverse */); + if (unlikely(cp == NULL)) { + match = false; + goto out; + } + + /* + * We found a connection, i.e. ct != 0, make sure to call + * __ip_vs_conn_put before returning. In our case jump to out_put_con. + */ + + if (data->bitmask & XT_IPVS_VPORT) + if ((cp->vport == data->vport) ^ + !(data->invert & XT_IPVS_VPORT)) { + match = false; + goto out_put_cp; + } + + if (data->bitmask & XT_IPVS_VPORTCTL) + if ((cp->control != NULL && + cp->control->vport == data->vportctl) ^ + !(data->invert & XT_IPVS_VPORTCTL)) { + match = false; + goto out_put_cp; + } + + if (data->bitmask & XT_IPVS_DIR) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + if (ct == NULL || nf_ct_is_untracked(ct)) { + match = false; + goto out_put_cp; + } + + if ((ctinfo >= IP_CT_IS_REPLY) ^ + !!(data->invert & XT_IPVS_DIR)) { + match = false; + goto out_put_cp; + } + } + + if (data->bitmask & XT_IPVS_METHOD) + if (((cp->flags & IP_VS_CONN_F_FWD_MASK) == data->fwd_method) ^ + !(data->invert & XT_IPVS_METHOD)) { + match = false; + goto out_put_cp; + } + + if (data->bitmask & XT_IPVS_VADDR) { + if (ipvs_mt_addrcmp(&cp->vaddr, &data->vaddr, + &data->vmask, family) ^ + !(data->invert & XT_IPVS_VADDR)) { + match = false; + goto out_put_cp; + } + } + +out_put_cp: + __ip_vs_conn_put(cp); +out: + pr_debug("match=%d\n", match); + return match; +} + +static int ipvs_mt_check(const struct xt_mtchk_param *par) +{ + if (par->family != NFPROTO_IPV4 +#ifdef CONFIG_IP_VS_IPV6 + && par->family != NFPROTO_IPV6 +#endif + ) { + pr_info("protocol family %u not supported\n", par->family); + return -EINVAL; + } + + return 0; +} + +static struct xt_match xt_ipvs_mt_reg __read_mostly = { + .name = "ipvs", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = ipvs_mt, + .checkentry = ipvs_mt_check, + .matchsize = XT_ALIGN(sizeof(struct xt_ipvs_mtinfo)), + .me = THIS_MODULE, +}; + +static int __init ipvs_mt_init(void) +{ + return xt_register_match(&xt_ipvs_mt_reg); +} + +static void __exit ipvs_mt_exit(void) +{ + xt_unregister_match(&xt_ipvs_mt_reg); +} + +module_init(ipvs_mt_init); +module_exit(ipvs_mt_exit); diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c new file mode 100644 index 00000000..176e5570 --- /dev/null +++ b/net/netfilter/xt_length.c @@ -0,0 +1,70 @@ +/* Kernel module to match packet length. */ +/* (C) 1999-2001 James Morris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
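ipvs_mt_addrcmp() above treats two IPv4 addresses as equal when they agree on every bit covered by the user-supplied mask. A small standalone sketch of that test (addresses and mask chosen only for illustration):

#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Same check as the IPv4 branch of ipvs_mt_addrcmp(): differing bits that
 * fall outside the mask are ignored. */
static bool addr_matches(uint32_t kaddr, uint32_t uaddr, uint32_t umask)
{
	return ((kaddr ^ uaddr) & umask) == 0;
}

int main(void)
{
	uint32_t pkt  = inet_addr("192.168.1.5");
	uint32_t rule = inet_addr("192.168.1.0");
	uint32_t mask = inet_addr("255.255.255.0");

	printf("%d\n", addr_matches(pkt, rule, mask));	/* prints 1 */
	return 0;
}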
+ */ + +#include +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("James Morris "); +MODULE_DESCRIPTION("Xtables: Packet length (Layer3,4,5) match"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_length"); +MODULE_ALIAS("ip6t_length"); + +static bool +length_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_length_info *info = par->matchinfo; + u_int16_t pktlen = ntohs(ip_hdr(skb)->tot_len); + + return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; +} + +static bool +length_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_length_info *info = par->matchinfo; + const u_int16_t pktlen = ntohs(ipv6_hdr(skb)->payload_len) + + sizeof(struct ipv6hdr); + + return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; +} + +static struct xt_match length_mt_reg[] __read_mostly = { + { + .name = "length", + .family = NFPROTO_IPV4, + .match = length_mt, + .matchsize = sizeof(struct xt_length_info), + .me = THIS_MODULE, + }, + { + .name = "length", + .family = NFPROTO_IPV6, + .match = length_mt6, + .matchsize = sizeof(struct xt_length_info), + .me = THIS_MODULE, + }, +}; + +static int __init length_mt_init(void) +{ + return xt_register_matches(length_mt_reg, ARRAY_SIZE(length_mt_reg)); +} + +static void __exit length_mt_exit(void) +{ + xt_unregister_matches(length_mt_reg, ARRAY_SIZE(length_mt_reg)); +} + +module_init(length_mt_init); +module_exit(length_mt_exit); diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c new file mode 100644 index 00000000..32b7a579 --- /dev/null +++ b/net/netfilter/xt_limit.c @@ -0,0 +1,210 @@ +/* (C) 1999 Jérôme de Vivie + * (C) 1999 Hervé Eychenne + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include + +#include +#include + +struct xt_limit_priv { + unsigned long prev; + uint32_t credit; +}; + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Herve Eychenne "); +MODULE_DESCRIPTION("Xtables: rate-limit match"); +MODULE_ALIAS("ipt_limit"); +MODULE_ALIAS("ip6t_limit"); + +/* The algorithm used is the Simple Token Bucket Filter (TBF) + * see net/sched/sch_tbf.c in the linux source tree + */ + +static DEFINE_SPINLOCK(limit_lock); + +/* Rusty: This is my (non-mathematically-inclined) understanding of + this algorithm. The `average rate' in jiffies becomes your initial + amount of credit `credit' and the most credit you can ever have + `credit_cap'. The `peak rate' becomes the cost of passing the + test, `cost'. + + `prev' tracks the last packet hit: you gain one credit per jiffy. + If you get credit balance more than this, the extra credit is + discarded. Every time the match passes, you lose `cost' credits; + if you don't have that many, the test fails. + + See Alexey's formal explanation in net/sched/sch_tbf.c. + + To get the maxmum range, we multiply by this factor (ie. you get N + credits per jiffy). We want to allow a rate as low as 1 per day + (slowest userspace tool allows), which means + CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32. ie. */ +#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24)) + +/* Repeated shift and or gives us all 1s, final shift and add 1 gives + * us the power of 2 below the theoretical max, so GCC simply does a + * shift. 
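A small userspace sketch of the power-of-two rounding performed by the _POW2_BELOW* macros defined just below, assuming HZ=100 (HZ is a kernel configuration choice, so the concrete number varies); keeping CREDITS_PER_JIFFY a power of two lets the multiplications by it in user2credits() compile down to shifts:

#include <stdint.h>
#include <stdio.h>

/* Largest power of two at or below x (x > 0): smear the top set bit
 * downward, then shift and add one -- the same trick as _POW2_BELOW32(). */
static uint32_t pow2_at_or_below(uint32_t x)
{
	x |= x >> 1;
	x |= x >> 2;
	x |= x >> 4;
	x |= x >> 8;
	x |= x >> 16;		/* all bits below the MSB are now set */
	return (x >> 1) + 1;
}

int main(void)
{
	const uint32_t hz = 100;	/* assumed; depends on kernel config */
	uint32_t max_cpj = 0xFFFFFFFFu / (hz * 60 * 60 * 24);	/* 497 */

	/* prints 256: the value CREDITS_PER_JIFFY would take at HZ=100 */
	printf("%u\n", pow2_at_or_below(max_cpj));
	return 0;
}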
*/ +#define _POW2_BELOW2(x) ((x)|((x)>>1)) +#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2)) +#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4)) +#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8)) +#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16)) +#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1) + +#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) + +static bool +limit_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_rateinfo *r = par->matchinfo; + struct xt_limit_priv *priv = r->master; + unsigned long now = jiffies; + + spin_lock_bh(&limit_lock); + priv->credit += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY; + if (priv->credit > r->credit_cap) + priv->credit = r->credit_cap; + + if (priv->credit >= r->cost) { + /* We're not limited. */ + priv->credit -= r->cost; + spin_unlock_bh(&limit_lock); + return true; + } + + spin_unlock_bh(&limit_lock); + return false; +} + +/* Precision saver. */ +static u_int32_t +user2credits(u_int32_t user) +{ + /* If multiplying would overflow... */ + if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) + /* Divide first. */ + return (user / XT_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY; + + return (user * HZ * CREDITS_PER_JIFFY) / XT_LIMIT_SCALE; +} + +static int limit_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_rateinfo *r = par->matchinfo; + struct xt_limit_priv *priv; + + /* Check for overflow. */ + if (r->burst == 0 + || user2credits(r->avg * r->burst) < user2credits(r->avg)) { + pr_info("Overflow, try lower: %u/%u\n", + r->avg, r->burst); + return -ERANGE; + } + + priv = kmalloc(sizeof(*priv), GFP_KERNEL); + if (priv == NULL) + return -ENOMEM; + + /* For SMP, we only want to use one set of state. */ + r->master = priv; + if (r->cost == 0) { + /* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies * + 128. */ + priv->prev = jiffies; + priv->credit = user2credits(r->avg * r->burst); /* Credits full. */ + r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */ + r->cost = user2credits(r->avg); + } + return 0; +} + +static void limit_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_rateinfo *info = par->matchinfo; + + kfree(info->master); +} + +#ifdef CONFIG_COMPAT +struct compat_xt_rateinfo { + u_int32_t avg; + u_int32_t burst; + + compat_ulong_t prev; + u_int32_t credit; + u_int32_t credit_cap, cost; + + u_int32_t master; +}; + +/* To keep the full "prev" timestamp, the upper 32 bits are stored in the + * master pointer, which does not need to be preserved. */ +static void limit_mt_compat_from_user(void *dst, const void *src) +{ + const struct compat_xt_rateinfo *cm = src; + struct xt_rateinfo m = { + .avg = cm->avg, + .burst = cm->burst, + .prev = cm->prev | (unsigned long)cm->master << 32, + .credit = cm->credit, + .credit_cap = cm->credit_cap, + .cost = cm->cost, + }; + memcpy(dst, &m, sizeof(m)); +} + +static int limit_mt_compat_to_user(void __user *dst, const void *src) +{ + const struct xt_rateinfo *m = src; + struct compat_xt_rateinfo cm = { + .avg = m->avg, + .burst = m->burst, + .prev = m->prev, + .credit = m->credit, + .credit_cap = m->credit_cap, + .cost = m->cost, + .master = m->prev >> 32, + }; + return copy_to_user(dst, &cm, sizeof(cm)) ? 
-EFAULT : 0; +} +#endif /* CONFIG_COMPAT */ + +static struct xt_match limit_mt_reg __read_mostly = { + .name = "limit", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = limit_mt, + .checkentry = limit_mt_check, + .destroy = limit_mt_destroy, + .matchsize = sizeof(struct xt_rateinfo), +#ifdef CONFIG_COMPAT + .compatsize = sizeof(struct compat_xt_rateinfo), + .compat_from_user = limit_mt_compat_from_user, + .compat_to_user = limit_mt_compat_to_user, +#endif + .me = THIS_MODULE, +}; + +static int __init limit_mt_init(void) +{ + return xt_register_match(&limit_mt_reg); +} + +static void __exit limit_mt_exit(void) +{ + xt_unregister_match(&limit_mt_reg); +} + +module_init(limit_mt_init); +module_exit(limit_mt_exit); diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c new file mode 100644 index 00000000..8160f6b1 --- /dev/null +++ b/net/netfilter/xt_mac.c @@ -0,0 +1,66 @@ +/* Kernel module to match MAC address parameters. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("Xtables: MAC address match"); +MODULE_ALIAS("ipt_mac"); +MODULE_ALIAS("ip6t_mac"); + +static bool mac_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_mac_info *info = par->matchinfo; + bool ret; + + if (skb->dev == NULL || skb->dev->type != ARPHRD_ETHER) + return false; + if (skb_mac_header(skb) < skb->head) + return false; + if (skb_mac_header(skb) + ETH_HLEN > skb->data) + return false; + ret = compare_ether_addr(eth_hdr(skb)->h_source, info->srcaddr) == 0; + ret ^= info->invert; + return ret; +} + +static struct xt_match mac_mt_reg __read_mostly = { + .name = "mac", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = mac_mt, + .matchsize = sizeof(struct xt_mac_info), + .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD), + .me = THIS_MODULE, +}; + +static int __init mac_mt_init(void) +{ + return xt_register_match(&mac_mt_reg); +} + +static void __exit mac_mt_exit(void) +{ + xt_unregister_match(&mac_mt_reg); +} + +module_init(mac_mt_init); +module_exit(mac_mt_exit); diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c new file mode 100644 index 00000000..23345238 --- /dev/null +++ b/net/netfilter/xt_mark.c @@ -0,0 +1,84 @@ +/* + * xt_mark - Netfilter module to match NFMARK value + * + * (C) 1999-2001 Marc Boucher + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 + * Jan Engelhardt + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marc Boucher "); +MODULE_DESCRIPTION("Xtables: packet mark operations"); +MODULE_ALIAS("ipt_mark"); +MODULE_ALIAS("ip6t_mark"); +MODULE_ALIAS("ipt_MARK"); +MODULE_ALIAS("ip6t_MARK"); + +static unsigned int +mark_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_mark_tginfo2 *info = par->targinfo; + + skb->mark = (skb->mark & ~info->mask) ^ info->mark; + return XT_CONTINUE; +} + +static bool +mark_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_mark_mtinfo1 *info = par->matchinfo; + + return ((skb->mark & info->mask) == info->mark) ^ info->invert; +} + +static struct xt_target mark_tg_reg __read_mostly = { + .name = "MARK", + .revision = 2, + .family = NFPROTO_UNSPEC, + .target = mark_tg, + .targetsize = sizeof(struct xt_mark_tginfo2), + .me = THIS_MODULE, +}; + +static struct xt_match mark_mt_reg __read_mostly = { + .name = "mark", + .revision = 1, + .family = NFPROTO_UNSPEC, + .match = mark_mt, + .matchsize = sizeof(struct xt_mark_mtinfo1), + .me = THIS_MODULE, +}; + +static int __init mark_mt_init(void) +{ + int ret; + + ret = xt_register_target(&mark_tg_reg); + if (ret < 0) + return ret; + ret = xt_register_match(&mark_mt_reg); + if (ret < 0) { + xt_unregister_target(&mark_tg_reg); + return ret; + } + return 0; +} + +static void __exit mark_mt_exit(void) +{ + xt_unregister_match(&mark_mt_reg); + xt_unregister_target(&mark_tg_reg); +} + +module_init(mark_mt_init); +module_exit(mark_mt_exit); diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c new file mode 100644 index 00000000..ac1d3c3d --- /dev/null +++ b/net/netfilter/xt_multiport.c @@ -0,0 +1,165 @@ +/* Kernel module to match one of a list of TCP/UDP(-Lite)/SCTP/DCCP ports: + ports are in the same place so we can treat them as equal. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("Xtables: multiple port matching for TCP, UDP, UDP-Lite, SCTP and DCCP"); +MODULE_ALIAS("ipt_multiport"); +MODULE_ALIAS("ip6t_multiport"); + +/* Returns 1 if the port is matched by the test, 0 otherwise. 
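For the MARK target above, mark_tg() first clears the bits selected by the mask and then XORs in the configured value, so bits outside the mask stay untouched as long as the value lies within the mask. A quick standalone illustration (the values are made up):

#include <stdint.h>
#include <stdio.h>

/* Same update rule as mark_tg(): (old & ~mask) ^ value. */
static uint32_t set_xmark(uint32_t old, uint32_t value, uint32_t mask)
{
	return (old & ~mask) ^ value;
}

int main(void)
{
	/* replace the low byte, keep the rest: prints 0xab11 */
	printf("0x%x\n", set_xmark(0xabcd, 0x0011, 0x00ff));
	return 0;
}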
*/ +static inline bool +ports_match_v1(const struct xt_multiport_v1 *minfo, + u_int16_t src, u_int16_t dst) +{ + unsigned int i; + u_int16_t s, e; + + for (i = 0; i < minfo->count; i++) { + s = minfo->ports[i]; + + if (minfo->pflags[i]) { + /* range port matching */ + e = minfo->ports[++i]; + pr_debug("src or dst matches with %d-%d?\n", s, e); + + if (minfo->flags == XT_MULTIPORT_SOURCE + && src >= s && src <= e) + return true ^ minfo->invert; + if (minfo->flags == XT_MULTIPORT_DESTINATION + && dst >= s && dst <= e) + return true ^ minfo->invert; + if (minfo->flags == XT_MULTIPORT_EITHER + && ((dst >= s && dst <= e) + || (src >= s && src <= e))) + return true ^ minfo->invert; + } else { + /* exact port matching */ + pr_debug("src or dst matches with %d?\n", s); + + if (minfo->flags == XT_MULTIPORT_SOURCE + && src == s) + return true ^ minfo->invert; + if (minfo->flags == XT_MULTIPORT_DESTINATION + && dst == s) + return true ^ minfo->invert; + if (minfo->flags == XT_MULTIPORT_EITHER + && (src == s || dst == s)) + return true ^ minfo->invert; + } + } + + return minfo->invert; +} + +static bool +multiport_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const __be16 *pptr; + __be16 _ports[2]; + const struct xt_multiport_v1 *multiinfo = par->matchinfo; + + if (par->fragoff != 0) + return false; + + pptr = skb_header_pointer(skb, par->thoff, sizeof(_ports), _ports); + if (pptr == NULL) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. + */ + pr_debug("Dropping evil offset=0 tinygram.\n"); + par->hotdrop = true; + return false; + } + + return ports_match_v1(multiinfo, ntohs(pptr[0]), ntohs(pptr[1])); +} + +static inline bool +check(u_int16_t proto, + u_int8_t ip_invflags, + u_int8_t match_flags, + u_int8_t count) +{ + /* Must specify supported protocol, no unknown flags or bad count */ + return (proto == IPPROTO_TCP || proto == IPPROTO_UDP + || proto == IPPROTO_UDPLITE + || proto == IPPROTO_SCTP || proto == IPPROTO_DCCP) + && !(ip_invflags & XT_INV_PROTO) + && (match_flags == XT_MULTIPORT_SOURCE + || match_flags == XT_MULTIPORT_DESTINATION + || match_flags == XT_MULTIPORT_EITHER) + && count <= XT_MULTI_PORTS; +} + +static int multiport_mt_check(const struct xt_mtchk_param *par) +{ + const struct ipt_ip *ip = par->entryinfo; + const struct xt_multiport_v1 *multiinfo = par->matchinfo; + + return check(ip->proto, ip->invflags, multiinfo->flags, + multiinfo->count) ? 0 : -EINVAL; +} + +static int multiport_mt6_check(const struct xt_mtchk_param *par) +{ + const struct ip6t_ip6 *ip = par->entryinfo; + const struct xt_multiport_v1 *multiinfo = par->matchinfo; + + return check(ip->proto, ip->invflags, multiinfo->flags, + multiinfo->count) ? 
0 : -EINVAL; +} + +static struct xt_match multiport_mt_reg[] __read_mostly = { + { + .name = "multiport", + .family = NFPROTO_IPV4, + .revision = 1, + .checkentry = multiport_mt_check, + .match = multiport_mt, + .matchsize = sizeof(struct xt_multiport_v1), + .me = THIS_MODULE, + }, + { + .name = "multiport", + .family = NFPROTO_IPV6, + .revision = 1, + .checkentry = multiport_mt6_check, + .match = multiport_mt, + .matchsize = sizeof(struct xt_multiport_v1), + .me = THIS_MODULE, + }, +}; + +static int __init multiport_mt_init(void) +{ + return xt_register_matches(multiport_mt_reg, + ARRAY_SIZE(multiport_mt_reg)); +} + +static void __exit multiport_mt_exit(void) +{ + xt_unregister_matches(multiport_mt_reg, ARRAY_SIZE(multiport_mt_reg)); +} + +module_init(multiport_mt_init); +module_exit(multiport_mt_exit); diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c new file mode 100644 index 00000000..846f895c --- /dev/null +++ b/net/netfilter/xt_osf.c @@ -0,0 +1,424 @@ +/* + * Copyright (c) 2003+ Evgeniy Polyakov + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +struct xt_osf_finger { + struct rcu_head rcu_head; + struct list_head finger_entry; + struct xt_osf_user_finger finger; +}; + +enum osf_fmatch_states { + /* Packet does not match the fingerprint */ + FMATCH_WRONG = 0, + /* Packet matches the fingerprint */ + FMATCH_OK, + /* Options do not match the fingerprint, but header does */ + FMATCH_OPT_WRONG, +}; + +/* + * Indexed by dont-fragment bit. + * It is the only constant value in the fingerprint. + */ +static struct list_head xt_osf_fingers[2]; + +static const struct nla_policy xt_osf_policy[OSF_ATTR_MAX + 1] = { + [OSF_ATTR_FINGER] = { .len = sizeof(struct xt_osf_user_finger) }, +}; + +static int xt_osf_add_callback(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const osf_attrs[]) +{ + struct xt_osf_user_finger *f; + struct xt_osf_finger *kf = NULL, *sf; + int err = 0; + + if (!osf_attrs[OSF_ATTR_FINGER]) + return -EINVAL; + + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) + return -EINVAL; + + f = nla_data(osf_attrs[OSF_ATTR_FINGER]); + + kf = kmalloc(sizeof(struct xt_osf_finger), GFP_KERNEL); + if (!kf) + return -ENOMEM; + + memcpy(&kf->finger, f, sizeof(struct xt_osf_user_finger)); + + list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) { + if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger))) + continue; + + kfree(kf); + kf = NULL; + + if (nlh->nlmsg_flags & NLM_F_EXCL) + err = -EEXIST; + break; + } + + /* + * We are protected by nfnl mutex. 
+ */ + if (kf) + list_add_tail_rcu(&kf->finger_entry, &xt_osf_fingers[!!f->df]); + + return err; +} + +static int xt_osf_remove_callback(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const osf_attrs[]) +{ + struct xt_osf_user_finger *f; + struct xt_osf_finger *sf; + int err = -ENOENT; + + if (!osf_attrs[OSF_ATTR_FINGER]) + return -EINVAL; + + f = nla_data(osf_attrs[OSF_ATTR_FINGER]); + + list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) { + if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger))) + continue; + + /* + * We are protected by nfnl mutex. + */ + list_del_rcu(&sf->finger_entry); + kfree_rcu(sf, rcu_head); + + err = 0; + break; + } + + return err; +} + +static const struct nfnl_callback xt_osf_nfnetlink_callbacks[OSF_MSG_MAX] = { + [OSF_MSG_ADD] = { + .call = xt_osf_add_callback, + .attr_count = OSF_ATTR_MAX, + .policy = xt_osf_policy, + }, + [OSF_MSG_REMOVE] = { + .call = xt_osf_remove_callback, + .attr_count = OSF_ATTR_MAX, + .policy = xt_osf_policy, + }, +}; + +static const struct nfnetlink_subsystem xt_osf_nfnetlink = { + .name = "osf", + .subsys_id = NFNL_SUBSYS_OSF, + .cb_count = OSF_MSG_MAX, + .cb = xt_osf_nfnetlink_callbacks, +}; + +static inline int xt_osf_ttl(const struct sk_buff *skb, const struct xt_osf_info *info, + unsigned char f_ttl) +{ + const struct iphdr *ip = ip_hdr(skb); + + if (info->flags & XT_OSF_TTL) { + if (info->ttl == XT_OSF_TTL_TRUE) + return ip->ttl == f_ttl; + if (info->ttl == XT_OSF_TTL_NOCHECK) + return 1; + else if (ip->ttl <= f_ttl) + return 1; + else { + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + int ret = 0; + + for_ifa(in_dev) { + if (inet_ifa_match(ip->saddr, ifa)) { + ret = (ip->ttl == f_ttl); + break; + } + } + endfor_ifa(in_dev); + + return ret; + } + } + + return ip->ttl == f_ttl; +} + +static bool +xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) +{ + const struct xt_osf_info *info = p->matchinfo; + const struct iphdr *ip = ip_hdr(skb); + const struct tcphdr *tcp; + struct tcphdr _tcph; + int fmatch = FMATCH_WRONG, fcount = 0; + unsigned int optsize = 0, check_WSS = 0; + u16 window, totlen, mss = 0; + bool df; + const unsigned char *optp = NULL, *_optp = NULL; + unsigned char opts[MAX_IPOPTLEN]; + const struct xt_osf_finger *kf; + const struct xt_osf_user_finger *f; + + if (!info) + return false; + + tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); + if (!tcp) + return false; + + if (!tcp->syn) + return false; + + totlen = ntohs(ip->tot_len); + df = ntohs(ip->frag_off) & IP_DF; + window = ntohs(tcp->window); + + if (tcp->doff * 4 > sizeof(struct tcphdr)) { + optsize = tcp->doff * 4 - sizeof(struct tcphdr); + + _optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) + + sizeof(struct tcphdr), optsize, opts); + } + + rcu_read_lock(); + list_for_each_entry_rcu(kf, &xt_osf_fingers[df], finger_entry) { + f = &kf->finger; + + if (!(info->flags & XT_OSF_LOG) && strcmp(info->genre, f->genre)) + continue; + + optp = _optp; + fmatch = FMATCH_WRONG; + + if (totlen == f->ss && xt_osf_ttl(skb, info, f->ttl)) { + int foptsize, optnum; + + /* + * Should not happen if userspace parser was written correctly. 
+ */ + if (f->wss.wc >= OSF_WSS_MAX) + continue; + + /* Check options */ + + foptsize = 0; + for (optnum = 0; optnum < f->opt_num; ++optnum) + foptsize += f->opt[optnum].length; + + if (foptsize > MAX_IPOPTLEN || + optsize > MAX_IPOPTLEN || + optsize != foptsize) + continue; + + check_WSS = f->wss.wc; + + for (optnum = 0; optnum < f->opt_num; ++optnum) { + if (f->opt[optnum].kind == (*optp)) { + __u32 len = f->opt[optnum].length; + const __u8 *optend = optp + len; + int loop_cont = 0; + + fmatch = FMATCH_OK; + + switch (*optp) { + case OSFOPT_MSS: + mss = optp[3]; + mss <<= 8; + mss |= optp[2]; + + mss = ntohs(mss); + break; + case OSFOPT_TS: + loop_cont = 1; + break; + } + + optp = optend; + } else + fmatch = FMATCH_OPT_WRONG; + + if (fmatch != FMATCH_OK) + break; + } + + if (fmatch != FMATCH_OPT_WRONG) { + fmatch = FMATCH_WRONG; + + switch (check_WSS) { + case OSF_WSS_PLAIN: + if (f->wss.val == 0 || window == f->wss.val) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MSS: + /* + * Some smart modems decrease mangle MSS to + * SMART_MSS_2, so we check standard, decreased + * and the one provided in the fingerprint MSS + * values. + */ +#define SMART_MSS_1 1460 +#define SMART_MSS_2 1448 + if (window == f->wss.val * mss || + window == f->wss.val * SMART_MSS_1 || + window == f->wss.val * SMART_MSS_2) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MTU: + if (window == f->wss.val * (mss + 40) || + window == f->wss.val * (SMART_MSS_1 + 40) || + window == f->wss.val * (SMART_MSS_2 + 40)) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MODULO: + if ((window % f->wss.val) == 0) + fmatch = FMATCH_OK; + break; + } + } + + if (fmatch != FMATCH_OK) + continue; + + fcount++; + + if (info->flags & XT_OSF_LOG) + nf_log_packet(p->family, p->hooknum, skb, + p->in, p->out, NULL, + "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n", + f->genre, f->version, f->subtype, + &ip->saddr, ntohs(tcp->source), + &ip->daddr, ntohs(tcp->dest), + f->ttl - ip->ttl); + + if ((info->flags & XT_OSF_LOG) && + info->loglevel == XT_OSF_LOGLEVEL_FIRST) + break; + } + } + rcu_read_unlock(); + + if (!fcount && (info->flags & XT_OSF_LOG)) + nf_log_packet(p->family, p->hooknum, skb, p->in, p->out, NULL, + "Remote OS is not known: %pI4:%u -> %pI4:%u\n", + &ip->saddr, ntohs(tcp->source), + &ip->daddr, ntohs(tcp->dest)); + + if (fcount) + fmatch = FMATCH_OK; + + return fmatch == FMATCH_OK; +} + +static struct xt_match xt_osf_match = { + .name = "osf", + .revision = 0, + .family = NFPROTO_IPV4, + .proto = IPPROTO_TCP, + .hooks = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_FORWARD), + .match = xt_osf_match_packet, + .matchsize = sizeof(struct xt_osf_info), + .me = THIS_MODULE, +}; + +static int __init xt_osf_init(void) +{ + int err = -EINVAL; + int i; + + for (i=0; ifinger_entry); + kfree_rcu(f, rcu_head); + } + } + rcu_read_unlock(); + + rcu_barrier(); +} + +module_init(xt_osf_init); +module_exit(xt_osf_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Evgeniy Polyakov "); +MODULE_DESCRIPTION("Passive OS fingerprint matching."); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c new file mode 100644 index 00000000..772d7389 --- /dev/null +++ b/net/netfilter/xt_owner.c @@ -0,0 +1,82 @@ +/* + * Kernel module to match various things tied to sockets associated with + * locally generated outgoing packets. 
+ * + * (C) 2000 Marc Boucher + * + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include + +static bool +owner_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_owner_match_info *info = par->matchinfo; + const struct file *filp; + + if (skb->sk == NULL || skb->sk->sk_socket == NULL) + return (info->match ^ info->invert) == 0; + else if (info->match & info->invert & XT_OWNER_SOCKET) + /* + * Socket exists but user wanted ! --socket-exists. + * (Single ampersands intended.) + */ + return false; + + filp = skb->sk->sk_socket->file; + if (filp == NULL) + return ((info->match ^ info->invert) & + (XT_OWNER_UID | XT_OWNER_GID)) == 0; + + if (info->match & XT_OWNER_UID) + if ((filp->f_cred->fsuid >= info->uid_min && + filp->f_cred->fsuid <= info->uid_max) ^ + !(info->invert & XT_OWNER_UID)) + return false; + + if (info->match & XT_OWNER_GID) + if ((filp->f_cred->fsgid >= info->gid_min && + filp->f_cred->fsgid <= info->gid_max) ^ + !(info->invert & XT_OWNER_GID)) + return false; + + return true; +} + +static struct xt_match owner_mt_reg __read_mostly = { + .name = "owner", + .revision = 1, + .family = NFPROTO_UNSPEC, + .match = owner_mt, + .matchsize = sizeof(struct xt_owner_match_info), + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING), + .me = THIS_MODULE, +}; + +static int __init owner_mt_init(void) +{ + return xt_register_match(&owner_mt_reg); +} + +static void __exit owner_mt_exit(void) +{ + xt_unregister_match(&owner_mt_reg); +} + +module_init(owner_mt_init); +module_exit(owner_mt_exit); +MODULE_AUTHOR("Jan Engelhardt "); +MODULE_DESCRIPTION("Xtables: socket owner matching"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_owner"); +MODULE_ALIAS("ip6t_owner"); diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c new file mode 100644 index 00000000..d7ca16b8 --- /dev/null +++ b/net/netfilter/xt_physdev.c @@ -0,0 +1,128 @@ +/* Kernel module to match the bridge port in and + * out device for IP packets coming into contact with a bridge. */ + +/* (C) 2001-2003 Bart De Schuymer + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Bart De Schuymer "); +MODULE_DESCRIPTION("Xtables: Bridge physical device match"); +MODULE_ALIAS("ipt_physdev"); +MODULE_ALIAS("ip6t_physdev"); + + +static bool +physdev_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); + const struct xt_physdev_info *info = par->matchinfo; + unsigned long ret; + const char *indev, *outdev; + const struct nf_bridge_info *nf_bridge; + + /* Not a bridged IP packet or no info available yet: + * LOCAL_OUT/mangle and LOCAL_OUT/nat don't know if + * the destination device will be a bridge. 
*/ + if (!(nf_bridge = skb->nf_bridge)) { + /* Return MATCH if the invert flags of the used options are on */ + if ((info->bitmask & XT_PHYSDEV_OP_BRIDGED) && + !(info->invert & XT_PHYSDEV_OP_BRIDGED)) + return false; + if ((info->bitmask & XT_PHYSDEV_OP_ISIN) && + !(info->invert & XT_PHYSDEV_OP_ISIN)) + return false; + if ((info->bitmask & XT_PHYSDEV_OP_ISOUT) && + !(info->invert & XT_PHYSDEV_OP_ISOUT)) + return false; + if ((info->bitmask & XT_PHYSDEV_OP_IN) && + !(info->invert & XT_PHYSDEV_OP_IN)) + return false; + if ((info->bitmask & XT_PHYSDEV_OP_OUT) && + !(info->invert & XT_PHYSDEV_OP_OUT)) + return false; + return true; + } + + /* This only makes sense in the FORWARD and POSTROUTING chains */ + if ((info->bitmask & XT_PHYSDEV_OP_BRIDGED) && + (!!(nf_bridge->mask & BRNF_BRIDGED) ^ + !(info->invert & XT_PHYSDEV_OP_BRIDGED))) + return false; + + if ((info->bitmask & XT_PHYSDEV_OP_ISIN && + (!nf_bridge->physindev ^ !!(info->invert & XT_PHYSDEV_OP_ISIN))) || + (info->bitmask & XT_PHYSDEV_OP_ISOUT && + (!nf_bridge->physoutdev ^ !!(info->invert & XT_PHYSDEV_OP_ISOUT)))) + return false; + + if (!(info->bitmask & XT_PHYSDEV_OP_IN)) + goto match_outdev; + indev = nf_bridge->physindev ? nf_bridge->physindev->name : nulldevname; + ret = ifname_compare_aligned(indev, info->physindev, info->in_mask); + + if (!ret ^ !(info->invert & XT_PHYSDEV_OP_IN)) + return false; + +match_outdev: + if (!(info->bitmask & XT_PHYSDEV_OP_OUT)) + return true; + outdev = nf_bridge->physoutdev ? + nf_bridge->physoutdev->name : nulldevname; + ret = ifname_compare_aligned(outdev, info->physoutdev, info->out_mask); + + return (!!ret ^ !(info->invert & XT_PHYSDEV_OP_OUT)); +} + +static int physdev_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_physdev_info *info = par->matchinfo; + + if (!(info->bitmask & XT_PHYSDEV_OP_MASK) || + info->bitmask & ~XT_PHYSDEV_OP_MASK) + return -EINVAL; + if (info->bitmask & XT_PHYSDEV_OP_OUT && + (!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) || + info->invert & XT_PHYSDEV_OP_BRIDGED) && + par->hook_mask & ((1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD) | (1 << NF_INET_POST_ROUTING))) { + pr_info("using --physdev-out in the OUTPUT, FORWARD and " + "POSTROUTING chains for non-bridged traffic is not " + "supported anymore.\n"); + if (par->hook_mask & (1 << NF_INET_LOCAL_OUT)) + return -EINVAL; + } + return 0; +} + +static struct xt_match physdev_mt_reg __read_mostly = { + .name = "physdev", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = physdev_mt_check, + .match = physdev_mt, + .matchsize = sizeof(struct xt_physdev_info), + .me = THIS_MODULE, +}; + +static int __init physdev_mt_init(void) +{ + return xt_register_match(&physdev_mt_reg); +} + +static void __exit physdev_mt_exit(void) +{ + xt_unregister_match(&physdev_mt_reg); +} + +module_init(physdev_mt_init); +module_exit(physdev_mt_exit); diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c new file mode 100644 index 00000000..5b645cb5 --- /dev/null +++ b/net/netfilter/xt_pkttype.c @@ -0,0 +1,65 @@ +/* (C) 1999-2001 Michal Ludvig + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
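The pkttype match below reclassifies looped-back packets by destination address: IPv4 multicast via ipv4_is_multicast(), IPv6 multicast via a leading 0xFF byte, and broadcast otherwise. A standalone sketch of the two multicast tests it relies on (addresses chosen only for illustration):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* IPv4 multicast: 224.0.0.0/4, i.e. the top nibble is 0xe. */
static bool ipv4_multicast(const char *addr)
{
	return (ntohl(inet_addr(addr)) >> 28) == 0xe;
}

/* IPv6 multicast: the first byte of the address is 0xff. */
static bool ipv6_multicast(const char *addr)
{
	struct in6_addr a;

	inet_pton(AF_INET6, addr, &a);
	return a.s6_addr[0] == 0xff;
}

int main(void)
{
	/* prints "1 0 1" */
	printf("%d %d %d\n", ipv4_multicast("239.1.2.3"),
	       ipv4_multicast("10.0.0.1"), ipv6_multicast("ff02::1"));
	return 0;
}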
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Michal Ludvig "); +MODULE_DESCRIPTION("Xtables: link layer packet type match"); +MODULE_ALIAS("ipt_pkttype"); +MODULE_ALIAS("ip6t_pkttype"); + +static bool +pkttype_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_pkttype_info *info = par->matchinfo; + u_int8_t type; + + if (skb->pkt_type != PACKET_LOOPBACK) + type = skb->pkt_type; + else if (par->family == NFPROTO_IPV4 && + ipv4_is_multicast(ip_hdr(skb)->daddr)) + type = PACKET_MULTICAST; + else if (par->family == NFPROTO_IPV6 && + ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF) + type = PACKET_MULTICAST; + else + type = PACKET_BROADCAST; + + return (type == info->pkttype) ^ info->invert; +} + +static struct xt_match pkttype_mt_reg __read_mostly = { + .name = "pkttype", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = pkttype_mt, + .matchsize = sizeof(struct xt_pkttype_info), + .me = THIS_MODULE, +}; + +static int __init pkttype_mt_init(void) +{ + return xt_register_match(&pkttype_mt_reg); +} + +static void __exit pkttype_mt_exit(void) +{ + xt_unregister_match(&pkttype_mt_reg); +} + +module_init(pkttype_mt_init); +module_exit(pkttype_mt_exit); diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c new file mode 100644 index 00000000..f23e97bb --- /dev/null +++ b/net/netfilter/xt_policy.c @@ -0,0 +1,188 @@ +/* IP tables module for matching IPsec policy + * + * Copyright (c) 2004,2005 Patrick McHardy, + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include + +#include +#include +#include + +MODULE_AUTHOR("Patrick McHardy "); +MODULE_DESCRIPTION("Xtables: IPsec policy match"); +MODULE_LICENSE("GPL"); + +static inline bool +xt_addr_cmp(const union nf_inet_addr *a1, const union nf_inet_addr *m, + const union nf_inet_addr *a2, unsigned short family) +{ + switch (family) { + case NFPROTO_IPV4: + return ((a1->ip ^ a2->ip) & m->ip) == 0; + case NFPROTO_IPV6: + return ipv6_masked_addr_cmp(&a1->in6, &m->in6, &a2->in6) == 0; + } + return false; +} + +static bool +match_xfrm_state(const struct xfrm_state *x, const struct xt_policy_elem *e, + unsigned short family) +{ +#define MATCH_ADDR(x,y,z) (!e->match.x || \ + (xt_addr_cmp(&e->x, &e->y, (const union nf_inet_addr *)(z), family) \ + ^ e->invert.x)) +#define MATCH(x,y) (!e->match.x || ((e->x == (y)) ^ e->invert.x)) + + return MATCH_ADDR(saddr, smask, &x->props.saddr) && + MATCH_ADDR(daddr, dmask, &x->id.daddr) && + MATCH(proto, x->id.proto) && + MATCH(mode, x->props.mode) && + MATCH(spi, x->id.spi) && + MATCH(reqid, x->props.reqid); +} + +static int +match_policy_in(const struct sk_buff *skb, const struct xt_policy_info *info, + unsigned short family) +{ + const struct xt_policy_elem *e; + const struct sec_path *sp = skb->sp; + int strict = info->flags & XT_POLICY_MATCH_STRICT; + int i, pos; + + if (sp == NULL) + return -1; + if (strict && info->len != sp->len) + return 0; + + for (i = sp->len - 1; i >= 0; i--) { + pos = strict ? i - sp->len + 1 : 0; + if (pos >= info->len) + return 0; + e = &info->pol[pos]; + + if (match_xfrm_state(sp->xvec[i], e, family)) { + if (!strict) + return 1; + } else if (strict) + return 0; + } + + return strict ? 
1 : 0; +} + +static int +match_policy_out(const struct sk_buff *skb, const struct xt_policy_info *info, + unsigned short family) +{ + const struct xt_policy_elem *e; + const struct dst_entry *dst = skb_dst(skb); + int strict = info->flags & XT_POLICY_MATCH_STRICT; + int i, pos; + + if (dst->xfrm == NULL) + return -1; + + for (i = 0; dst && dst->xfrm; dst = dst->child, i++) { + pos = strict ? i : 0; + if (pos >= info->len) + return 0; + e = &info->pol[pos]; + + if (match_xfrm_state(dst->xfrm, e, family)) { + if (!strict) + return 1; + } else if (strict) + return 0; + } + + return strict ? i == info->len : 0; +} + +static bool +policy_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_policy_info *info = par->matchinfo; + int ret; + + if (info->flags & XT_POLICY_MATCH_IN) + ret = match_policy_in(skb, info, par->family); + else + ret = match_policy_out(skb, info, par->family); + + if (ret < 0) + ret = info->flags & XT_POLICY_MATCH_NONE ? true : false; + else if (info->flags & XT_POLICY_MATCH_NONE) + ret = false; + + return ret; +} + +static int policy_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_policy_info *info = par->matchinfo; + + if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT))) { + pr_info("neither incoming nor outgoing policy selected\n"); + return -EINVAL; + } + if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN)) && info->flags & XT_POLICY_MATCH_OUT) { + pr_info("output policy not valid in PREROUTING and INPUT\n"); + return -EINVAL; + } + if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT)) && info->flags & XT_POLICY_MATCH_IN) { + pr_info("input policy not valid in POSTROUTING and OUTPUT\n"); + return -EINVAL; + } + if (info->len > XT_POLICY_MAX_ELEM) { + pr_info("too many policy elements\n"); + return -EINVAL; + } + return 0; +} + +static struct xt_match policy_mt_reg[] __read_mostly = { + { + .name = "policy", + .family = NFPROTO_IPV4, + .checkentry = policy_mt_check, + .match = policy_mt, + .matchsize = sizeof(struct xt_policy_info), + .me = THIS_MODULE, + }, + { + .name = "policy", + .family = NFPROTO_IPV6, + .checkentry = policy_mt_check, + .match = policy_mt, + .matchsize = sizeof(struct xt_policy_info), + .me = THIS_MODULE, + }, +}; + +static int __init policy_mt_init(void) +{ + return xt_register_matches(policy_mt_reg, ARRAY_SIZE(policy_mt_reg)); +} + +static void __exit policy_mt_exit(void) +{ + xt_unregister_matches(policy_mt_reg, ARRAY_SIZE(policy_mt_reg)); +} + +module_init(policy_mt_init); +module_exit(policy_mt_exit); +MODULE_ALIAS("ipt_policy"); +MODULE_ALIAS("ip6t_policy"); diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c new file mode 100644 index 00000000..08086d68 --- /dev/null +++ b/net/netfilter/xt_qtaguid.c @@ -0,0 +1,2785 @@ +/* + * Kernel iptables module to track stats for packets based on user tags. + * + * (C) 2011 Google, Inc + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * There are run-time debug flags enabled via the debug_mask module param, or + * via the DEFAULT_DEBUG_MASK. See xt_qtaguid_internal.h. 
+ */ +#define DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "xt_qtaguid_internal.h" +#include "xt_qtaguid_print.h" + +/* + * We only use the xt_socket funcs within a similar context to avoid unexpected + * return values. + */ +#define XT_SOCKET_SUPPORTED_HOOKS \ + ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN)) + + +static const char *module_procdirname = "xt_qtaguid"; +static struct proc_dir_entry *xt_qtaguid_procdir; + +static unsigned int proc_iface_perms = S_IRUGO; +module_param_named(iface_perms, proc_iface_perms, uint, S_IRUGO | S_IWUSR); + +static struct proc_dir_entry *xt_qtaguid_stats_file; +static unsigned int proc_stats_perms = S_IRUGO; +module_param_named(stats_perms, proc_stats_perms, uint, S_IRUGO | S_IWUSR); + +static struct proc_dir_entry *xt_qtaguid_ctrl_file; +#ifdef CONFIG_ANDROID_PARANOID_NETWORK +static unsigned int proc_ctrl_perms = S_IRUGO | S_IWUGO; +#else +static unsigned int proc_ctrl_perms = S_IRUGO | S_IWUSR; +#endif +module_param_named(ctrl_perms, proc_ctrl_perms, uint, S_IRUGO | S_IWUSR); + +#ifdef CONFIG_ANDROID_PARANOID_NETWORK +#include +static gid_t proc_stats_readall_gid = AID_NET_BW_STATS; +static gid_t proc_ctrl_write_gid = AID_NET_BW_ACCT; +#else +/* 0 means, don't limit anybody */ +static gid_t proc_stats_readall_gid; +static gid_t proc_ctrl_write_gid; +#endif +module_param_named(stats_readall_gid, proc_stats_readall_gid, uint, + S_IRUGO | S_IWUSR); +module_param_named(ctrl_write_gid, proc_ctrl_write_gid, uint, + S_IRUGO | S_IWUSR); + +/* + * Limit the number of active tags (via socket tags) for a given UID. + * Multiple processes could share the UID. + */ +static int max_sock_tags = DEFAULT_MAX_SOCK_TAGS; +module_param(max_sock_tags, int, S_IRUGO | S_IWUSR); + +/* + * After the kernel has initiallized this module, it is still possible + * to make it passive. + * Setting passive to Y: + * - the iface stats handling will not act on notifications. + * - iptables matches will never match. + * - ctrl commands silently succeed. + * - stats are always empty. + * This is mostly usefull when a bug is suspected. + */ +static bool module_passive; +module_param_named(passive, module_passive, bool, S_IRUGO | S_IWUSR); + +/* + * Control how qtaguid data is tracked per proc/uid. + * Setting tag_tracking_passive to Y: + * - don't create proc specific structs to track tags + * - don't check that active tag stats exceed some limits. + * - don't clean up socket tags on process exits. + * This is mostly usefull when a bug is suspected. + */ +static bool qtu_proc_handling_passive; +module_param_named(tag_tracking_passive, qtu_proc_handling_passive, bool, + S_IRUGO | S_IWUSR); + +#define QTU_DEV_NAME "xt_qtaguid" + +uint qtaguid_debug_mask = DEFAULT_DEBUG_MASK; +module_param_named(debug_mask, qtaguid_debug_mask, uint, S_IRUGO | S_IWUSR); + +/*---------------------------------------------------------------------------*/ +static const char *iface_stat_procdirname = "iface_stat"; +static struct proc_dir_entry *iface_stat_procdir; +static const char *iface_stat_all_procfilename = "iface_stat_all"; +static struct proc_dir_entry *iface_stat_all_procfile; + +/* + * Ordering of locks: + * outer locks: + * iface_stat_list_lock + * sock_tag_list_lock + * inner locks: + * uid_tag_data_tree_lock + * tag_counter_set_list_lock + * Notice how sock_tag_list_lock is held sometimes when uid_tag_data_tree_lock + * is acquired. 
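A minimal sketch of the nesting this ordering implies, using two of the locks named above (a hypothetical helper, not a function from the module): an outer list lock is always taken before an inner tree lock, with bottom halves disabled as the rest of this file does:

/* Hypothetical example, not part of the module: documented lock nesting. */
static void example_lock_nesting(void)
{
	spin_lock_bh(&sock_tag_list_lock);		/* outer */
	spin_lock_bh(&uid_tag_data_tree_lock);		/* inner */
	/* ... walk sock_tag_tree and uid_tag_data_tree here ... */
	spin_unlock_bh(&uid_tag_data_tree_lock);
	spin_unlock_bh(&sock_tag_list_lock);
}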
+ * + * Call tree with all lock holders as of 2011-09-25: + * + * iface_stat_all_proc_read() + * iface_stat_list_lock + * (struct iface_stat) + * + * qtaguid_ctrl_proc_read() + * sock_tag_list_lock + * (sock_tag_tree) + * (struct proc_qtu_data->sock_tag_list) + * prdebug_full_state() + * sock_tag_list_lock + * (sock_tag_tree) + * uid_tag_data_tree_lock + * (uid_tag_data_tree) + * (proc_qtu_data_tree) + * iface_stat_list_lock + * + * qtaguid_stats_proc_read() + * iface_stat_list_lock + * struct iface_stat->tag_stat_list_lock + * + * qtudev_open() + * uid_tag_data_tree_lock + * + * qtudev_release() + * sock_tag_data_list_lock + * uid_tag_data_tree_lock + * prdebug_full_state() + * sock_tag_list_lock + * uid_tag_data_tree_lock + * iface_stat_list_lock + * + * iface_netdev_event_handler() + * iface_stat_create() + * iface_stat_list_lock + * iface_stat_update() + * iface_stat_list_lock + * + * iface_inetaddr_event_handler() + * iface_stat_create() + * iface_stat_list_lock + * iface_stat_update() + * iface_stat_list_lock + * + * iface_inet6addr_event_handler() + * iface_stat_create_ipv6() + * iface_stat_list_lock + * iface_stat_update() + * iface_stat_list_lock + * + * qtaguid_mt() + * account_for_uid() + * if_tag_stat_update() + * get_sock_stat() + * sock_tag_list_lock + * struct iface_stat->tag_stat_list_lock + * tag_stat_update() + * get_active_counter_set() + * tag_counter_set_list_lock + * tag_stat_update() + * get_active_counter_set() + * tag_counter_set_list_lock + * + * + * qtaguid_ctrl_parse() + * ctrl_cmd_delete() + * sock_tag_list_lock + * tag_counter_set_list_lock + * iface_stat_list_lock + * struct iface_stat->tag_stat_list_lock + * uid_tag_data_tree_lock + * ctrl_cmd_counter_set() + * tag_counter_set_list_lock + * ctrl_cmd_tag() + * sock_tag_list_lock + * (sock_tag_tree) + * get_tag_ref() + * uid_tag_data_tree_lock + * (uid_tag_data_tree) + * uid_tag_data_tree_lock + * (proc_qtu_data_tree) + * ctrl_cmd_untag() + * sock_tag_list_lock + * uid_tag_data_tree_lock + * + */ +static LIST_HEAD(iface_stat_list); +static DEFINE_SPINLOCK(iface_stat_list_lock); + +static struct rb_root sock_tag_tree = RB_ROOT; +static DEFINE_SPINLOCK(sock_tag_list_lock); + +static struct rb_root tag_counter_set_tree = RB_ROOT; +static DEFINE_SPINLOCK(tag_counter_set_list_lock); + +static struct rb_root uid_tag_data_tree = RB_ROOT; +static DEFINE_SPINLOCK(uid_tag_data_tree_lock); + +static struct rb_root proc_qtu_data_tree = RB_ROOT; +/* No proc_qtu_data_tree_lock; use uid_tag_data_tree_lock */ + +static struct qtaguid_event_counts qtu_events; +/*----------------------------------------------*/ +static bool can_manipulate_uids(void) +{ + /* root pwnd */ + return unlikely(!current_fsuid()) || unlikely(!proc_ctrl_write_gid) + || in_egroup_p(proc_ctrl_write_gid); +} + +static bool can_impersonate_uid(uid_t uid) +{ + return uid == current_fsuid() || can_manipulate_uids(); +} + +static bool can_read_other_uid_stats(uid_t uid) +{ + /* root pwnd */ + return unlikely(!current_fsuid()) || uid == current_fsuid() + || unlikely(!proc_stats_readall_gid) + || in_egroup_p(proc_stats_readall_gid); +} + +static inline void dc_add_byte_packets(struct data_counters *counters, int set, + enum ifs_tx_rx direction, + enum ifs_proto ifs_proto, + int bytes, + int packets) +{ + counters->bpc[set][direction][ifs_proto].bytes += bytes; + counters->bpc[set][direction][ifs_proto].packets += packets; +} + +static inline uint64_t dc_sum_bytes(struct data_counters *counters, + int set, + enum ifs_tx_rx direction) +{ + return 
counters->bpc[set][direction][IFS_TCP].bytes + + counters->bpc[set][direction][IFS_UDP].bytes + + counters->bpc[set][direction][IFS_PROTO_OTHER].bytes; +} + +static inline uint64_t dc_sum_packets(struct data_counters *counters, + int set, + enum ifs_tx_rx direction) +{ + return counters->bpc[set][direction][IFS_TCP].packets + + counters->bpc[set][direction][IFS_UDP].packets + + counters->bpc[set][direction][IFS_PROTO_OTHER].packets; +} + +static struct tag_node *tag_node_tree_search(struct rb_root *root, tag_t tag) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct tag_node *data = rb_entry(node, struct tag_node, node); + int result; + RB_DEBUG("qtaguid: tag_node_tree_search(0x%llx): " + " node=%p data=%p\n", tag, node, data); + result = tag_compare(tag, data->tag); + RB_DEBUG("qtaguid: tag_node_tree_search(0x%llx): " + " data.tag=0x%llx (uid=%u) res=%d\n", + tag, data->tag, get_uid_from_tag(data->tag), result); + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return data; + } + return NULL; +} + +static void tag_node_tree_insert(struct tag_node *data, struct rb_root *root) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct tag_node *this = rb_entry(*new, struct tag_node, + node); + int result = tag_compare(data->tag, this->tag); + RB_DEBUG("qtaguid: %s(): tag=0x%llx" + " (uid=%u)\n", __func__, + this->tag, + get_uid_from_tag(this->tag)); + parent = *new; + if (result < 0) + new = &((*new)->rb_left); + else if (result > 0) + new = &((*new)->rb_right); + else + BUG(); + } + + /* Add new node and rebalance tree. */ + rb_link_node(&data->node, parent, new); + rb_insert_color(&data->node, root); +} + +static void tag_stat_tree_insert(struct tag_stat *data, struct rb_root *root) +{ + tag_node_tree_insert(&data->tn, root); +} + +static struct tag_stat *tag_stat_tree_search(struct rb_root *root, tag_t tag) +{ + struct tag_node *node = tag_node_tree_search(root, tag); + if (!node) + return NULL; + return rb_entry(&node->node, struct tag_stat, tn.node); +} + +static void tag_counter_set_tree_insert(struct tag_counter_set *data, + struct rb_root *root) +{ + tag_node_tree_insert(&data->tn, root); +} + +static struct tag_counter_set *tag_counter_set_tree_search(struct rb_root *root, + tag_t tag) +{ + struct tag_node *node = tag_node_tree_search(root, tag); + if (!node) + return NULL; + return rb_entry(&node->node, struct tag_counter_set, tn.node); + +} + +static void tag_ref_tree_insert(struct tag_ref *data, struct rb_root *root) +{ + tag_node_tree_insert(&data->tn, root); +} + +static struct tag_ref *tag_ref_tree_search(struct rb_root *root, tag_t tag) +{ + struct tag_node *node = tag_node_tree_search(root, tag); + if (!node) + return NULL; + return rb_entry(&node->node, struct tag_ref, tn.node); +} + +static struct sock_tag *sock_tag_tree_search(struct rb_root *root, + const struct sock *sk) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct sock_tag *data = rb_entry(node, struct sock_tag, + sock_node); + if (sk < data->sk) + node = node->rb_left; + else if (sk > data->sk) + node = node->rb_right; + else + return data; + } + return NULL; +} + +static void sock_tag_tree_insert(struct sock_tag *data, struct rb_root *root) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct sock_tag *this = rb_entry(*new, struct sock_tag, + sock_node); + parent = *new; + if 
(data->sk < this->sk) + new = &((*new)->rb_left); + else if (data->sk > this->sk) + new = &((*new)->rb_right); + else + BUG(); + } + + /* Add new node and rebalance tree. */ + rb_link_node(&data->sock_node, parent, new); + rb_insert_color(&data->sock_node, root); +} + +static void sock_tag_tree_erase(struct rb_root *st_to_free_tree) +{ + struct rb_node *node; + struct sock_tag *st_entry; + + node = rb_first(st_to_free_tree); + while (node) { + st_entry = rb_entry(node, struct sock_tag, sock_node); + node = rb_next(node); + CT_DEBUG("qtaguid: %s(): " + "erase st: sk=%p tag=0x%llx (uid=%u)\n", __func__, + st_entry->sk, + st_entry->tag, + get_uid_from_tag(st_entry->tag)); + rb_erase(&st_entry->sock_node, st_to_free_tree); + sockfd_put(st_entry->socket); + kfree(st_entry); + } +} + +static struct proc_qtu_data *proc_qtu_data_tree_search(struct rb_root *root, + const pid_t pid) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct proc_qtu_data *data = rb_entry(node, + struct proc_qtu_data, + node); + if (pid < data->pid) + node = node->rb_left; + else if (pid > data->pid) + node = node->rb_right; + else + return data; + } + return NULL; +} + +static void proc_qtu_data_tree_insert(struct proc_qtu_data *data, + struct rb_root *root) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct proc_qtu_data *this = rb_entry(*new, + struct proc_qtu_data, + node); + parent = *new; + if (data->pid < this->pid) + new = &((*new)->rb_left); + else if (data->pid > this->pid) + new = &((*new)->rb_right); + else + BUG(); + } + + /* Add new node and rebalance tree. */ + rb_link_node(&data->node, parent, new); + rb_insert_color(&data->node, root); +} + +static void uid_tag_data_tree_insert(struct uid_tag_data *data, + struct rb_root *root) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct uid_tag_data *this = rb_entry(*new, + struct uid_tag_data, + node); + parent = *new; + if (data->uid < this->uid) + new = &((*new)->rb_left); + else if (data->uid > this->uid) + new = &((*new)->rb_right); + else + BUG(); + } + + /* Add new node and rebalance tree. */ + rb_link_node(&data->node, parent, new); + rb_insert_color(&data->node, root); +} + +static struct uid_tag_data *uid_tag_data_tree_search(struct rb_root *root, + uid_t uid) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct uid_tag_data *data = rb_entry(node, + struct uid_tag_data, + node); + if (uid < data->uid) + node = node->rb_left; + else if (uid > data->uid) + node = node->rb_right; + else + return data; + } + return NULL; +} + +/* + * Allocates a new uid_tag_data struct if needed. + * Returns a pointer to the found or allocated uid_tag_data. + * Returns a PTR_ERR on failures, and lock is not held. + * If found is not NULL: + * sets *found to true if not allocated. + * sets *found to false if allocated. 
+ */ +struct uid_tag_data *get_uid_data(uid_t uid, bool *found_res) +{ + struct uid_tag_data *utd_entry; + + /* Look for top level uid_tag_data for the UID */ + utd_entry = uid_tag_data_tree_search(&uid_tag_data_tree, uid); + DR_DEBUG("qtaguid: get_uid_data(%u) utd=%p\n", uid, utd_entry); + + if (found_res) + *found_res = utd_entry; + if (utd_entry) + return utd_entry; + + utd_entry = kzalloc(sizeof(*utd_entry), GFP_ATOMIC); + if (!utd_entry) { + pr_err("qtaguid: get_uid_data(%u): " + "tag data alloc failed\n", uid); + return ERR_PTR(-ENOMEM); + } + + utd_entry->uid = uid; + utd_entry->tag_ref_tree = RB_ROOT; + uid_tag_data_tree_insert(utd_entry, &uid_tag_data_tree); + DR_DEBUG("qtaguid: get_uid_data(%u) new utd=%p\n", uid, utd_entry); + return utd_entry; +} + +/* Never returns NULL. Either PTR_ERR or a valid ptr. */ +static struct tag_ref *new_tag_ref(tag_t new_tag, + struct uid_tag_data *utd_entry) +{ + struct tag_ref *tr_entry; + int res; + + if (utd_entry->num_active_tags + 1 > max_sock_tags) { + pr_info("qtaguid: new_tag_ref(0x%llx): " + "tag ref alloc quota exceeded. max=%d\n", + new_tag, max_sock_tags); + res = -EMFILE; + goto err_res; + + } + + tr_entry = kzalloc(sizeof(*tr_entry), GFP_ATOMIC); + if (!tr_entry) { + pr_err("qtaguid: new_tag_ref(0x%llx): " + "tag ref alloc failed\n", + new_tag); + res = -ENOMEM; + goto err_res; + } + tr_entry->tn.tag = new_tag; + /* tr_entry->num_sock_tags handled by caller */ + utd_entry->num_active_tags++; + tag_ref_tree_insert(tr_entry, &utd_entry->tag_ref_tree); + DR_DEBUG("qtaguid: new_tag_ref(0x%llx): " + " inserted new tag ref %p\n", + new_tag, tr_entry); + return tr_entry; + +err_res: + return ERR_PTR(res); +} + +static struct tag_ref *lookup_tag_ref(tag_t full_tag, + struct uid_tag_data **utd_res) +{ + struct uid_tag_data *utd_entry; + struct tag_ref *tr_entry; + bool found_utd; + uid_t uid = get_uid_from_tag(full_tag); + + DR_DEBUG("qtaguid: lookup_tag_ref(tag=0x%llx (uid=%u))\n", + full_tag, uid); + + utd_entry = get_uid_data(uid, &found_utd); + if (IS_ERR_OR_NULL(utd_entry)) { + if (utd_res) + *utd_res = utd_entry; + return NULL; + } + + tr_entry = tag_ref_tree_search(&utd_entry->tag_ref_tree, full_tag); + if (utd_res) + *utd_res = utd_entry; + DR_DEBUG("qtaguid: lookup_tag_ref(0x%llx) utd_entry=%p tr_entry=%p\n", + full_tag, utd_entry, tr_entry); + return tr_entry; +} + +/* Never returns NULL. Either PTR_ERR or a valid ptr. */ +static struct tag_ref *get_tag_ref(tag_t full_tag, + struct uid_tag_data **utd_res) +{ + struct uid_tag_data *utd_entry; + struct tag_ref *tr_entry; + + DR_DEBUG("qtaguid: get_tag_ref(0x%llx)\n", + full_tag); + spin_lock_bh(&uid_tag_data_tree_lock); + tr_entry = lookup_tag_ref(full_tag, &utd_entry); + BUG_ON(IS_ERR_OR_NULL(utd_entry)); + if (!tr_entry) + tr_entry = new_tag_ref(full_tag, utd_entry); + + spin_unlock_bh(&uid_tag_data_tree_lock); + if (utd_res) + *utd_res = utd_entry; + DR_DEBUG("qtaguid: get_tag_ref(0x%llx) utd=%p tr=%p\n", + full_tag, utd_entry, tr_entry); + return tr_entry; +} + +/* Checks and maybe frees the UID Tag Data entry */ +static void put_utd_entry(struct uid_tag_data *utd_entry) +{ + /* Are we done with the UID tag data entry? 
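+ * It can be freed only once its tag_ref_tree is empty and num_pqd is 0,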
*/ + if (RB_EMPTY_ROOT(&utd_entry->tag_ref_tree) && + !utd_entry->num_pqd) { + DR_DEBUG("qtaguid: %s(): " + "erase utd_entry=%p uid=%u " + "by pid=%u tgid=%u uid=%u\n", __func__, + utd_entry, utd_entry->uid, + current->pid, current->tgid, current_fsuid()); + BUG_ON(utd_entry->num_active_tags); + rb_erase(&utd_entry->node, &uid_tag_data_tree); + kfree(utd_entry); + } else { + DR_DEBUG("qtaguid: %s(): " + "utd_entry=%p still has %d tags %d proc_qtu_data\n", + __func__, utd_entry, utd_entry->num_active_tags, + utd_entry->num_pqd); + BUG_ON(!(utd_entry->num_active_tags || + utd_entry->num_pqd)); + } +} + +/* + * If no sock_tags are using this tag_ref, + * decrements refcount of utd_entry, removes tr_entry + * from utd_entry->tag_ref_tree and frees. + */ +static void free_tag_ref_from_utd_entry(struct tag_ref *tr_entry, + struct uid_tag_data *utd_entry) +{ + DR_DEBUG("qtaguid: %s(): %p tag=0x%llx (uid=%u)\n", __func__, + tr_entry, tr_entry->tn.tag, + get_uid_from_tag(tr_entry->tn.tag)); + if (!tr_entry->num_sock_tags) { + BUG_ON(!utd_entry->num_active_tags); + utd_entry->num_active_tags--; + rb_erase(&tr_entry->tn.node, &utd_entry->tag_ref_tree); + DR_DEBUG("qtaguid: %s(): erased %p\n", __func__, tr_entry); + kfree(tr_entry); + } +} + +static void put_tag_ref_tree(tag_t full_tag, struct uid_tag_data *utd_entry) +{ + struct rb_node *node; + struct tag_ref *tr_entry; + tag_t acct_tag; + + DR_DEBUG("qtaguid: %s(tag=0x%llx (uid=%u))\n", __func__, + full_tag, get_uid_from_tag(full_tag)); + acct_tag = get_atag_from_tag(full_tag); + node = rb_first(&utd_entry->tag_ref_tree); + while (node) { + tr_entry = rb_entry(node, struct tag_ref, tn.node); + node = rb_next(node); + if (!acct_tag || tr_entry->tn.tag == full_tag) + free_tag_ref_from_utd_entry(tr_entry, utd_entry); + } +} + +static int read_proc_u64(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len; + uint64_t value; + char *p = page; + uint64_t *iface_entry = data; + + if (!data) + return 0; + + value = *iface_entry; + p += sprintf(p, "%llu\n", value); + len = (p - page) - off; + *eof = (len <= count) ? 1 : 0; + *start = page + off; + return len; +} + +static int read_proc_bool(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len; + bool value; + char *p = page; + bool *bool_entry = data; + + if (!data) + return 0; + + value = *bool_entry; + p += sprintf(p, "%u\n", value); + len = (p - page) - off; + *eof = (len <= count) ? 1 : 0; + *start = page + off; + return len; +} + +static int get_active_counter_set(tag_t tag) +{ + int active_set = 0; + struct tag_counter_set *tcs; + + MT_DEBUG("qtaguid: get_active_counter_set(tag=0x%llx)" + " (uid=%u)\n", + tag, get_uid_from_tag(tag)); + /* For now we only handle UID tags for active sets */ + tag = get_utag_from_tag(tag); + spin_lock_bh(&tag_counter_set_list_lock); + tcs = tag_counter_set_tree_search(&tag_counter_set_tree, tag); + if (tcs) + active_set = tcs->active_set; + spin_unlock_bh(&tag_counter_set_list_lock); + return active_set; +} + +/* + * Find the entry for tracking the specified interface. 
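+ * Returns NULL if the interface is not being tracked.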
+ * Caller must hold iface_stat_list_lock + */ +static struct iface_stat *get_iface_entry(const char *ifname) +{ + struct iface_stat *iface_entry; + + /* Find the entry for tracking the specified tag within the interface */ + if (ifname == NULL) { + pr_info("qtaguid: iface_stat: get() NULL device name\n"); + return NULL; + } + + /* Iterate over interfaces */ + list_for_each_entry(iface_entry, &iface_stat_list, list) { + if (!strcmp(ifname, iface_entry->ifname)) + goto done; + } + iface_entry = NULL; +done: + return iface_entry; +} + +static int iface_stat_all_proc_read(char *page, char **num_items_returned, + off_t items_to_skip, int char_count, + int *eof, void *data) +{ + char *outp = page; + int item_index = 0; + int len; + struct iface_stat *iface_entry; + struct rtnl_link_stats64 dev_stats, *stats; + struct rtnl_link_stats64 no_dev_stats = {0}; + + if (unlikely(module_passive)) { + *eof = 1; + return 0; + } + + CT_DEBUG("qtaguid:proc iface_stat_all " + "page=%p *num_items_returned=%p off=%ld " + "char_count=%d *eof=%d\n", page, *num_items_returned, + items_to_skip, char_count, *eof); + + if (*eof) + return 0; + + /* + * This lock will prevent iface_stat_update() from changing active, + * and in turn prevent an interface from unregistering itself. + */ + spin_lock_bh(&iface_stat_list_lock); + list_for_each_entry(iface_entry, &iface_stat_list, list) { + if (item_index++ < items_to_skip) + continue; + + if (iface_entry->active) { + stats = dev_get_stats(iface_entry->net_dev, + &dev_stats); + } else { + stats = &no_dev_stats; + } + len = snprintf(outp, char_count, + "%s %d " + "%llu %llu %llu %llu " + "%llu %llu %llu %llu\n", + iface_entry->ifname, + iface_entry->active, + iface_entry->totals[IFS_RX].bytes, + iface_entry->totals[IFS_RX].packets, + iface_entry->totals[IFS_TX].bytes, + iface_entry->totals[IFS_TX].packets, + stats->rx_bytes, stats->rx_packets, + stats->tx_bytes, stats->tx_packets); + if (len >= char_count) { + spin_unlock_bh(&iface_stat_list_lock); + *outp = '\0'; + return outp - page; + } + outp += len; + char_count -= len; + (*num_items_returned)++; + } + spin_unlock_bh(&iface_stat_list_lock); + + *eof = 1; + return outp - page; +} + +static void iface_create_proc_worker(struct work_struct *work) +{ + struct proc_dir_entry *proc_entry; + struct iface_stat_work *isw = container_of(work, struct iface_stat_work, + iface_work); + struct iface_stat *new_iface = isw->iface_entry; + + /* iface_entries are not deleted, so safe to manipulate. */ + proc_entry = proc_mkdir(new_iface->ifname, iface_stat_procdir); + if (IS_ERR_OR_NULL(proc_entry)) { + pr_err("qtaguid: iface_stat: create_proc(): alloc failed.\n"); + kfree(isw); + return; + } + + new_iface->proc_ptr = proc_entry; + + create_proc_read_entry("tx_bytes", proc_iface_perms, proc_entry, + read_proc_u64, &new_iface->totals[IFS_TX].bytes); + create_proc_read_entry("rx_bytes", proc_iface_perms, proc_entry, + read_proc_u64, &new_iface->totals[IFS_RX].bytes); + create_proc_read_entry("tx_packets", proc_iface_perms, proc_entry, + read_proc_u64, &new_iface->totals[IFS_TX].packets); + create_proc_read_entry("rx_packets", proc_iface_perms, proc_entry, + read_proc_u64, &new_iface->totals[IFS_RX].packets); + create_proc_read_entry("active", proc_iface_perms, proc_entry, + read_proc_bool, &new_iface->active); + + IF_DEBUG("qtaguid: iface_stat: create_proc(): done " + "entry=%p dev=%s\n", new_iface, new_iface->ifname); + kfree(isw); +} + +/* + * Will set the entry's active state, and + * update the net_dev accordingly also. 
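+ * Deactivating also clears entry->net_dev.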
+ */ +static void _iface_stat_set_active(struct iface_stat *entry, + struct net_device *net_dev, + bool activate) +{ + if (activate) { + entry->net_dev = net_dev; + entry->active = true; + IF_DEBUG("qtaguid: %s(%s): " + "enable tracking. rfcnt=%d\n", __func__, + entry->ifname, + percpu_read(*net_dev->pcpu_refcnt)); + } else { + entry->active = false; + entry->net_dev = NULL; + IF_DEBUG("qtaguid: %s(%s): " + "disable tracking. rfcnt=%d\n", __func__, + entry->ifname, + percpu_read(*net_dev->pcpu_refcnt)); + + } +} + +/* Caller must hold iface_stat_list_lock */ +static struct iface_stat *iface_alloc(struct net_device *net_dev) +{ + struct iface_stat *new_iface; + struct iface_stat_work *isw; + + new_iface = kzalloc(sizeof(*new_iface), GFP_ATOMIC); + if (new_iface == NULL) { + pr_err("qtaguid: iface_stat: create(%s): " + "iface_stat alloc failed\n", net_dev->name); + return NULL; + } + new_iface->ifname = kstrdup(net_dev->name, GFP_ATOMIC); + if (new_iface->ifname == NULL) { + pr_err("qtaguid: iface_stat: create(%s): " + "ifname alloc failed\n", net_dev->name); + kfree(new_iface); + return NULL; + } + spin_lock_init(&new_iface->tag_stat_list_lock); + new_iface->tag_stat_tree = RB_ROOT; + _iface_stat_set_active(new_iface, net_dev, true); + + /* + * ipv6 notifier chains are atomic :( + * No create_proc_read_entry() for you! + */ + isw = kmalloc(sizeof(*isw), GFP_ATOMIC); + if (!isw) { + pr_err("qtaguid: iface_stat: create(%s): " + "work alloc failed\n", new_iface->ifname); + _iface_stat_set_active(new_iface, net_dev, false); + kfree(new_iface->ifname); + kfree(new_iface); + return NULL; + } + isw->iface_entry = new_iface; + INIT_WORK(&isw->iface_work, iface_create_proc_worker); + schedule_work(&isw->iface_work); + list_add(&new_iface->list, &iface_stat_list); + return new_iface; +} + +static void iface_check_stats_reset_and_adjust(struct net_device *net_dev, + struct iface_stat *iface) +{ + struct rtnl_link_stats64 dev_stats, *stats; + bool stats_rewound; + + stats = dev_get_stats(net_dev, &dev_stats); + /* No empty packets */ + stats_rewound = + (stats->rx_bytes < iface->last_known[IFS_RX].bytes) + || (stats->tx_bytes < iface->last_known[IFS_TX].bytes); + + IF_DEBUG("qtaguid: %s(%s): iface=%p netdev=%p " + "bytes rx/tx=%llu/%llu " + "active=%d last_known=%d " + "stats_rewound=%d\n", __func__, + net_dev ? net_dev->name : "?", + iface, net_dev, + stats->rx_bytes, stats->tx_bytes, + iface->active, iface->last_known_valid, stats_rewound); + + if (iface->active && iface->last_known_valid && stats_rewound) { + pr_warn_once("qtaguid: iface_stat: %s(%s): " + "iface reset its stats unexpectedly\n", __func__, + net_dev->name); + + iface->totals[IFS_TX].bytes += iface->last_known[IFS_TX].bytes; + iface->totals[IFS_TX].packets += + iface->last_known[IFS_TX].packets; + iface->totals[IFS_RX].bytes += iface->last_known[IFS_RX].bytes; + iface->totals[IFS_RX].packets += + iface->last_known[IFS_RX].packets; + iface->last_known_valid = false; + IF_DEBUG("qtaguid: %s(%s): iface=%p " + "used last known bytes rx/tx=%llu/%llu\n", __func__, + iface->ifname, iface, iface->last_known[IFS_RX].bytes, + iface->last_known[IFS_TX].bytes); + } +} + +/* + * Create a new entry for tracking the specified interface. + * Do nothing if the entry already exists. + * Called when an interface is configured with a valid IP address. 
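+ * When ifa is NULL (netdev notifier path), the device's address list is
+ * scanned for an address whose label matches the interface name.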
+ */ +static void iface_stat_create(struct net_device *net_dev, + struct in_ifaddr *ifa) +{ + struct in_device *in_dev = NULL; + const char *ifname; + struct iface_stat *entry; + __be32 ipaddr = 0; + struct iface_stat *new_iface; + + IF_DEBUG("qtaguid: iface_stat: create(%s): ifa=%p netdev=%p\n", + net_dev ? net_dev->name : "?", + ifa, net_dev); + if (!net_dev) { + pr_err("qtaguid: iface_stat: create(): no net dev\n"); + return; + } + + ifname = net_dev->name; + if (!ifa) { + in_dev = in_dev_get(net_dev); + if (!in_dev) { + pr_err("qtaguid: iface_stat: create(%s): no inet dev\n", + ifname); + return; + } + IF_DEBUG("qtaguid: iface_stat: create(%s): in_dev=%p\n", + ifname, in_dev); + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + IF_DEBUG("qtaguid: iface_stat: create(%s): " + "ifa=%p ifa_label=%s\n", + ifname, ifa, + ifa->ifa_label ? ifa->ifa_label : "(null)"); + if (ifa->ifa_label && !strcmp(ifname, ifa->ifa_label)) + break; + } + } + + if (!ifa) { + IF_DEBUG("qtaguid: iface_stat: create(%s): no matching IP\n", + ifname); + goto done_put; + } + ipaddr = ifa->ifa_local; + + spin_lock_bh(&iface_stat_list_lock); + entry = get_iface_entry(ifname); + if (entry != NULL) { + bool activate = !ipv4_is_loopback(ipaddr); + IF_DEBUG("qtaguid: iface_stat: create(%s): entry=%p\n", + ifname, entry); + iface_check_stats_reset_and_adjust(net_dev, entry); + _iface_stat_set_active(entry, net_dev, activate); + IF_DEBUG("qtaguid: %s(%s): " + "tracking now %d on ip=%pI4\n", __func__, + entry->ifname, activate, &ipaddr); + goto done_unlock_put; + } else if (ipv4_is_loopback(ipaddr)) { + IF_DEBUG("qtaguid: iface_stat: create(%s): " + "ignore loopback dev. ip=%pI4\n", ifname, &ipaddr); + goto done_unlock_put; + } + + new_iface = iface_alloc(net_dev); + IF_DEBUG("qtaguid: iface_stat: create(%s): done " + "entry=%p ip=%pI4\n", ifname, new_iface, &ipaddr); +done_unlock_put: + spin_unlock_bh(&iface_stat_list_lock); +done_put: + if (in_dev) + in_dev_put(in_dev); +} + +static void iface_stat_create_ipv6(struct net_device *net_dev, + struct inet6_ifaddr *ifa) +{ + struct in_device *in_dev; + const char *ifname; + struct iface_stat *entry; + struct iface_stat *new_iface; + int addr_type; + + IF_DEBUG("qtaguid: iface_stat: create6(): ifa=%p netdev=%p->name=%s\n", + ifa, net_dev, net_dev ? net_dev->name : ""); + if (!net_dev) { + pr_err("qtaguid: iface_stat: create6(): no net dev!\n"); + return; + } + ifname = net_dev->name; + + in_dev = in_dev_get(net_dev); + if (!in_dev) { + pr_err("qtaguid: iface_stat: create6(%s): no inet dev\n", + ifname); + return; + } + + IF_DEBUG("qtaguid: iface_stat: create6(%s): in_dev=%p\n", + ifname, in_dev); + + if (!ifa) { + IF_DEBUG("qtaguid: iface_stat: create6(%s): no matching IP\n", + ifname); + goto done_put; + } + addr_type = ipv6_addr_type(&ifa->addr); + + spin_lock_bh(&iface_stat_list_lock); + entry = get_iface_entry(ifname); + if (entry != NULL) { + bool activate = !(addr_type & IPV6_ADDR_LOOPBACK); + IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__, + ifname, entry); + iface_check_stats_reset_and_adjust(net_dev, entry); + _iface_stat_set_active(entry, net_dev, activate); + IF_DEBUG("qtaguid: %s(%s): " + "tracking now %d on ip=%pI6c\n", __func__, + entry->ifname, activate, &ifa->addr); + goto done_unlock_put; + } else if (addr_type & IPV6_ADDR_LOOPBACK) { + IF_DEBUG("qtaguid: %s(%s): " + "ignore loopback dev. 
ip=%pI6c\n", __func__, + ifname, &ifa->addr); + goto done_unlock_put; + } + + new_iface = iface_alloc(net_dev); + IF_DEBUG("qtaguid: iface_stat: create6(%s): done " + "entry=%p ip=%pI6c\n", ifname, new_iface, &ifa->addr); + +done_unlock_put: + spin_unlock_bh(&iface_stat_list_lock); +done_put: + in_dev_put(in_dev); +} + +static struct sock_tag *get_sock_stat_nl(const struct sock *sk) +{ + MT_DEBUG("qtaguid: get_sock_stat_nl(sk=%p)\n", sk); + return sock_tag_tree_search(&sock_tag_tree, sk); +} + +static struct sock_tag *get_sock_stat(const struct sock *sk) +{ + struct sock_tag *sock_tag_entry; + MT_DEBUG("qtaguid: get_sock_stat(sk=%p)\n", sk); + if (!sk) + return NULL; + spin_lock_bh(&sock_tag_list_lock); + sock_tag_entry = get_sock_stat_nl(sk); + spin_unlock_bh(&sock_tag_list_lock); + return sock_tag_entry; +} + +static void +data_counters_update(struct data_counters *dc, int set, + enum ifs_tx_rx direction, int proto, int bytes) +{ + switch (proto) { + case IPPROTO_TCP: + dc_add_byte_packets(dc, set, direction, IFS_TCP, bytes, 1); + break; + case IPPROTO_UDP: + dc_add_byte_packets(dc, set, direction, IFS_UDP, bytes, 1); + break; + case IPPROTO_IP: + default: + dc_add_byte_packets(dc, set, direction, IFS_PROTO_OTHER, bytes, + 1); + break; + } +} + +/* + * Update stats for the specified interface. Do nothing if the entry + * does not exist (when a device was never configured with an IP address). + * Called when an device is being unregistered. + */ +static void iface_stat_update(struct net_device *net_dev, bool stash_only) +{ + struct rtnl_link_stats64 dev_stats, *stats; + struct iface_stat *entry; + + stats = dev_get_stats(net_dev, &dev_stats); + spin_lock_bh(&iface_stat_list_lock); + entry = get_iface_entry(net_dev->name); + if (entry == NULL) { + IF_DEBUG("qtaguid: iface_stat: update(%s): not tracked\n", + net_dev->name); + spin_unlock_bh(&iface_stat_list_lock); + return; + } + + IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__, + net_dev->name, entry); + if (!entry->active) { + IF_DEBUG("qtaguid: %s(%s): already disabled\n", __func__, + net_dev->name); + spin_unlock_bh(&iface_stat_list_lock); + return; + } + + if (stash_only) { + entry->last_known[IFS_TX].bytes = stats->tx_bytes; + entry->last_known[IFS_TX].packets = stats->tx_packets; + entry->last_known[IFS_RX].bytes = stats->rx_bytes; + entry->last_known[IFS_RX].packets = stats->rx_packets; + entry->last_known_valid = true; + IF_DEBUG("qtaguid: %s(%s): " + "dev stats stashed rx/tx=%llu/%llu\n", __func__, + net_dev->name, stats->rx_bytes, stats->tx_bytes); + spin_unlock_bh(&iface_stat_list_lock); + return; + } + entry->totals[IFS_TX].bytes += stats->tx_bytes; + entry->totals[IFS_TX].packets += stats->tx_packets; + entry->totals[IFS_RX].bytes += stats->rx_bytes; + entry->totals[IFS_RX].packets += stats->rx_packets; + /* We don't need the last_known[] anymore */ + entry->last_known_valid = false; + _iface_stat_set_active(entry, net_dev, false); + IF_DEBUG("qtaguid: %s(%s): " + "disable tracking. 
rx/tx=%llu/%llu\n", __func__, + net_dev->name, stats->rx_bytes, stats->tx_bytes); + spin_unlock_bh(&iface_stat_list_lock); +} + +static void tag_stat_update(struct tag_stat *tag_entry, + enum ifs_tx_rx direction, int proto, int bytes) +{ + int active_set; + active_set = get_active_counter_set(tag_entry->tn.tag); + MT_DEBUG("qtaguid: tag_stat_update(tag=0x%llx (uid=%u) set=%d " + "dir=%d proto=%d bytes=%d)\n", + tag_entry->tn.tag, get_uid_from_tag(tag_entry->tn.tag), + active_set, direction, proto, bytes); + data_counters_update(&tag_entry->counters, active_set, direction, + proto, bytes); + if (tag_entry->parent_counters) + data_counters_update(tag_entry->parent_counters, active_set, + direction, proto, bytes); +} + +/* + * Create a new entry for tracking the specified {acct_tag,uid_tag} within + * the interface. + * iface_entry->tag_stat_list_lock should be held. + */ +static struct tag_stat *create_if_tag_stat(struct iface_stat *iface_entry, + tag_t tag) +{ + struct tag_stat *new_tag_stat_entry = NULL; + IF_DEBUG("qtaguid: iface_stat: %s(): ife=%p tag=0x%llx" + " (uid=%u)\n", __func__, + iface_entry, tag, get_uid_from_tag(tag)); + new_tag_stat_entry = kzalloc(sizeof(*new_tag_stat_entry), GFP_ATOMIC); + if (!new_tag_stat_entry) { + pr_err("qtaguid: iface_stat: tag stat alloc failed\n"); + goto done; + } + new_tag_stat_entry->tn.tag = tag; + tag_stat_tree_insert(new_tag_stat_entry, &iface_entry->tag_stat_tree); +done: + return new_tag_stat_entry; +} + +static void if_tag_stat_update(const char *ifname, uid_t uid, + const struct sock *sk, enum ifs_tx_rx direction, + int proto, int bytes) +{ + struct tag_stat *tag_stat_entry; + tag_t tag, acct_tag; + tag_t uid_tag; + struct data_counters *uid_tag_counters; + struct sock_tag *sock_tag_entry; + struct iface_stat *iface_entry; + struct tag_stat *new_tag_stat; + MT_DEBUG("qtaguid: if_tag_stat_update(ifname=%s " + "uid=%u sk=%p dir=%d proto=%d bytes=%d)\n", + ifname, uid, sk, direction, proto, bytes); + + + iface_entry = get_iface_entry(ifname); + if (!iface_entry) { + pr_err("qtaguid: iface_stat: stat_update() %s not found\n", + ifname); + return; + } + /* It is ok to process data when an iface_entry is inactive */ + + MT_DEBUG("qtaguid: iface_stat: stat_update() dev=%s entry=%p\n", + ifname, iface_entry); + + /* + * Look for a tagged sock. + * It will have an acct_uid. + */ + sock_tag_entry = get_sock_stat(sk); + if (sock_tag_entry) { + tag = sock_tag_entry->tag; + acct_tag = get_atag_from_tag(tag); + uid_tag = get_utag_from_tag(tag); + } else { + acct_tag = make_atag_from_value(0); + tag = combine_atag_with_uid(acct_tag, uid); + uid_tag = make_tag_from_uid(uid); + } + MT_DEBUG("qtaguid: iface_stat: stat_update(): " + " looking for tag=0x%llx (uid=%u) in ife=%p\n", + tag, get_uid_from_tag(tag), iface_entry); + /* Loop over tag list under this interface for {acct_tag,uid_tag} */ + spin_lock_bh(&iface_entry->tag_stat_list_lock); + + tag_stat_entry = tag_stat_tree_search(&iface_entry->tag_stat_tree, + tag); + if (tag_stat_entry) { + /* + * Updating the {acct_tag, uid_tag} entry handles both stats: + * {0, uid_tag} will also get updated. + */ + tag_stat_update(tag_stat_entry, direction, proto, bytes); + spin_unlock_bh(&iface_entry->tag_stat_list_lock); + return; + } + + /* Loop over tag list under this interface for {0,uid_tag} */ + tag_stat_entry = tag_stat_tree_search(&iface_entry->tag_stat_tree, + uid_tag); + if (!tag_stat_entry) { + /* Here: the base uid_tag did not exist */ + /* + * No parent counters. 
So + * - No {0, uid_tag} stats and no {acc_tag, uid_tag} stats. + */ + new_tag_stat = create_if_tag_stat(iface_entry, uid_tag); + uid_tag_counters = &new_tag_stat->counters; + } else { + uid_tag_counters = &tag_stat_entry->counters; + } + + if (acct_tag) { + new_tag_stat = create_if_tag_stat(iface_entry, tag); + new_tag_stat->parent_counters = uid_tag_counters; + } + tag_stat_update(new_tag_stat, direction, proto, bytes); + spin_unlock_bh(&iface_entry->tag_stat_list_lock); +} + +static int iface_netdev_event_handler(struct notifier_block *nb, + unsigned long event, void *ptr) { + struct net_device *dev = ptr; + + if (unlikely(module_passive)) + return NOTIFY_DONE; + + IF_DEBUG("qtaguid: iface_stat: netdev_event(): " + "ev=0x%lx/%s netdev=%p->name=%s\n", + event, netdev_evt_str(event), dev, dev ? dev->name : ""); + + switch (event) { + case NETDEV_UP: + iface_stat_create(dev, NULL); + atomic64_inc(&qtu_events.iface_events); + break; + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + iface_stat_update(dev, event == NETDEV_DOWN); + atomic64_inc(&qtu_events.iface_events); + break; + } + return NOTIFY_DONE; +} + +static int iface_inet6addr_event_handler(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct inet6_ifaddr *ifa = ptr; + struct net_device *dev; + + if (unlikely(module_passive)) + return NOTIFY_DONE; + + IF_DEBUG("qtaguid: iface_stat: inet6addr_event(): " + "ev=0x%lx/%s ifa=%p\n", + event, netdev_evt_str(event), ifa); + + switch (event) { + case NETDEV_UP: + BUG_ON(!ifa || !ifa->idev); + dev = (struct net_device *)ifa->idev->dev; + iface_stat_create_ipv6(dev, ifa); + atomic64_inc(&qtu_events.iface_events); + break; + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + BUG_ON(!ifa || !ifa->idev); + dev = (struct net_device *)ifa->idev->dev; + iface_stat_update(dev, event == NETDEV_DOWN); + atomic64_inc(&qtu_events.iface_events); + break; + } + return NOTIFY_DONE; +} + +static int iface_inetaddr_event_handler(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = ptr; + struct net_device *dev; + + if (unlikely(module_passive)) + return NOTIFY_DONE; + + IF_DEBUG("qtaguid: iface_stat: inetaddr_event(): " + "ev=0x%lx/%s ifa=%p\n", + event, netdev_evt_str(event), ifa); + + switch (event) { + case NETDEV_UP: + BUG_ON(!ifa || !ifa->ifa_dev); + dev = ifa->ifa_dev->dev; + iface_stat_create(dev, ifa); + atomic64_inc(&qtu_events.iface_events); + break; + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + BUG_ON(!ifa || !ifa->ifa_dev); + dev = ifa->ifa_dev->dev; + iface_stat_update(dev, event == NETDEV_DOWN); + atomic64_inc(&qtu_events.iface_events); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block iface_netdev_notifier_blk = { + .notifier_call = iface_netdev_event_handler, +}; + +static struct notifier_block iface_inetaddr_notifier_blk = { + .notifier_call = iface_inetaddr_event_handler, +}; + +static struct notifier_block iface_inet6addr_notifier_blk = { + .notifier_call = iface_inet6addr_event_handler, +}; + +static int __init iface_stat_init(struct proc_dir_entry *parent_procdir) +{ + int err; + + iface_stat_procdir = proc_mkdir(iface_stat_procdirname, parent_procdir); + if (!iface_stat_procdir) { + pr_err("qtaguid: iface_stat: init failed to create proc entry\n"); + err = -1; + goto err; + } + + iface_stat_all_procfile = create_proc_entry(iface_stat_all_procfilename, + proc_iface_perms, + parent_procdir); + if (!iface_stat_all_procfile) { + pr_err("qtaguid: iface_stat: init " + " failed to create stat_all proc entry\n"); + err = 
-1; + goto err_zap_entry; + } + iface_stat_all_procfile->read_proc = iface_stat_all_proc_read; + + + err = register_netdevice_notifier(&iface_netdev_notifier_blk); + if (err) { + pr_err("qtaguid: iface_stat: init " + "failed to register dev event handler\n"); + goto err_zap_all_stats_entry; + } + err = register_inetaddr_notifier(&iface_inetaddr_notifier_blk); + if (err) { + pr_err("qtaguid: iface_stat: init " + "failed to register ipv4 dev event handler\n"); + goto err_unreg_nd; + } + + err = register_inet6addr_notifier(&iface_inet6addr_notifier_blk); + if (err) { + pr_err("qtaguid: iface_stat: init " + "failed to register ipv6 dev event handler\n"); + goto err_unreg_ip4_addr; + } + return 0; + +err_unreg_ip4_addr: + unregister_inetaddr_notifier(&iface_inetaddr_notifier_blk); +err_unreg_nd: + unregister_netdevice_notifier(&iface_netdev_notifier_blk); +err_zap_all_stats_entry: + remove_proc_entry(iface_stat_all_procfilename, parent_procdir); +err_zap_entry: + remove_proc_entry(iface_stat_procdirname, parent_procdir); +err: + return err; +} + +static struct sock *qtaguid_find_sk(const struct sk_buff *skb, + struct xt_action_param *par) +{ + struct sock *sk; + unsigned int hook_mask = (1 << par->hooknum); + + MT_DEBUG("qtaguid: find_sk(skb=%p) hooknum=%d family=%d\n", skb, + par->hooknum, par->family); + + /* + * Let's not abuse the the xt_socket_get*_sk(), or else it will + * return garbage SKs. + */ + if (!(hook_mask & XT_SOCKET_SUPPORTED_HOOKS)) + return NULL; + + switch (par->family) { + case NFPROTO_IPV6: + sk = xt_socket_get6_sk(skb, par); + break; + case NFPROTO_IPV4: + sk = xt_socket_get4_sk(skb, par); + break; + default: + return NULL; + } + + /* + * Seems to be issues on the file ptr for TCP_TIME_WAIT SKs. + * http://kerneltrap.org/mailarchive/linux-netdev/2010/10/21/6287959 + * Not fixed in 3.0-r3 :( + */ + if (sk) { + MT_DEBUG("qtaguid: %p->sk_proto=%u " + "->sk_state=%d\n", sk, sk->sk_protocol, sk->sk_state); + if (sk->sk_state == TCP_TIME_WAIT) { + xt_socket_put_sk(sk); + sk = NULL; + } + } + return sk; +} + +static void account_for_uid(const struct sk_buff *skb, + const struct sock *alternate_sk, uid_t uid, + struct xt_action_param *par) +{ + const struct net_device *el_dev; + + if (!skb->dev) { + MT_DEBUG("qtaguid[%d]: no skb->dev\n", par->hooknum); + el_dev = par->in ? : par->out; + } else { + const struct net_device *other_dev; + el_dev = skb->dev; + other_dev = par->in ? : par->out; + if (el_dev != other_dev) { + MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs " + "par->(in/out)=%p %s\n", + par->hooknum, el_dev, el_dev->name, other_dev, + other_dev->name); + } + } + + if (unlikely(!el_dev)) { + pr_info("qtaguid[%d]: no par->in/out?!!\n", par->hooknum); + } else if (unlikely(!el_dev->name)) { + pr_info("qtaguid[%d]: no dev->name?!!\n", par->hooknum); + } else { + MT_DEBUG("qtaguid[%d]: dev name=%s type=%d\n", + par->hooknum, + el_dev->name, + el_dev->type); + + if_tag_stat_update(el_dev->name, uid, + skb->sk ? skb->sk : alternate_sk, + par->in ? 
IFS_RX : IFS_TX, + ip_hdr(skb)->protocol, skb->len); + } +} + +static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_qtaguid_match_info *info = par->matchinfo; + const struct file *filp; + bool got_sock = false; + struct sock *sk; + uid_t sock_uid; + bool res; + + if (unlikely(module_passive)) + return (info->match ^ info->invert) == 0; + + MT_DEBUG("qtaguid[%d]: entered skb=%p par->in=%p/out=%p fam=%d\n", + par->hooknum, skb, par->in, par->out, par->family); + + atomic64_inc(&qtu_events.match_calls); + if (skb == NULL) { + res = (info->match ^ info->invert) == 0; + goto ret_res; + } + + sk = skb->sk; + + if (sk == NULL) { + /* + * A missing sk->sk_socket happens when packets are in-flight + * and the matching socket is already closed and gone. + */ + sk = qtaguid_find_sk(skb, par); + /* + * If we got the socket from the find_sk(), we will need to put + * it back, as nf_tproxy_get_sock_v4() got it. + */ + got_sock = sk; + if (sk) + atomic64_inc(&qtu_events.match_found_sk_in_ct); + else + atomic64_inc(&qtu_events.match_found_no_sk_in_ct); + } else { + atomic64_inc(&qtu_events.match_found_sk); + } + MT_DEBUG("qtaguid[%d]: sk=%p got_sock=%d proto=%d\n", + par->hooknum, sk, got_sock, ip_hdr(skb)->protocol); + if (sk != NULL) { + MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n", + par->hooknum, sk, sk->sk_socket, + sk->sk_socket ? sk->sk_socket->file : (void *)-1LL); + filp = sk->sk_socket ? sk->sk_socket->file : NULL; + MT_DEBUG("qtaguid[%d]: filp...uid=%u\n", + par->hooknum, filp ? filp->f_cred->fsuid : -1); + } + + if (sk == NULL || sk->sk_socket == NULL) { + /* + * Here, the qtaguid_find_sk() using connection tracking + * couldn't find the owner, so for now we just count them + * against the system. + */ + /* + * TODO: unhack how to force just accounting. + * For now we only do iface stats when the uid-owner is not + * requested. + */ + if (!(info->match & XT_QTAGUID_UID)) + account_for_uid(skb, sk, 0, par); + MT_DEBUG("qtaguid[%d]: leaving (sk?sk->sk_socket)=%p\n", + par->hooknum, + sk ? sk->sk_socket : NULL); + res = (info->match ^ info->invert) == 0; + atomic64_inc(&qtu_events.match_no_sk); + goto put_sock_ret_res; + } else if (info->match & info->invert & XT_QTAGUID_SOCKET) { + res = false; + goto put_sock_ret_res; + } + filp = sk->sk_socket->file; + if (filp == NULL) { + MT_DEBUG("qtaguid[%d]: leaving filp=NULL\n", par->hooknum); + account_for_uid(skb, sk, 0, par); + res = ((info->match ^ info->invert) & + (XT_QTAGUID_UID | XT_QTAGUID_GID)) == 0; + atomic64_inc(&qtu_events.match_no_sk_file); + goto put_sock_ret_res; + } + sock_uid = filp->f_cred->fsuid; + /* + * TODO: unhack how to force just accounting. 
+ * For now we only do iface stats when the uid-owner is not requested + */ + if (!(info->match & XT_QTAGUID_UID)) + account_for_uid(skb, sk, sock_uid, par); + + /* + * The following two tests fail the match when: + * id not in range AND no inverted condition requested + * or id in range AND inverted condition requested + * Thus (!a && b) || (a && !b) == a ^ b + */ + if (info->match & XT_QTAGUID_UID) + if ((filp->f_cred->fsuid >= info->uid_min && + filp->f_cred->fsuid <= info->uid_max) ^ + !(info->invert & XT_QTAGUID_UID)) { + MT_DEBUG("qtaguid[%d]: leaving uid not matching\n", + par->hooknum); + res = false; + goto put_sock_ret_res; + } + if (info->match & XT_QTAGUID_GID) + if ((filp->f_cred->fsgid >= info->gid_min && + filp->f_cred->fsgid <= info->gid_max) ^ + !(info->invert & XT_QTAGUID_GID)) { + MT_DEBUG("qtaguid[%d]: leaving gid not matching\n", + par->hooknum); + res = false; + goto put_sock_ret_res; + } + + MT_DEBUG("qtaguid[%d]: leaving matched\n", par->hooknum); + res = true; + +put_sock_ret_res: + if (got_sock) + xt_socket_put_sk(sk); +ret_res: + MT_DEBUG("qtaguid[%d]: left %d\n", par->hooknum, res); + return res; +} + +#ifdef DDEBUG +/* This function is not in xt_qtaguid_print.c because of locks visibility */ +static void prdebug_full_state(int indent_level, const char *fmt, ...) +{ + va_list args; + char *fmt_buff; + char *buff; + + if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) + return; + + fmt_buff = kasprintf(GFP_ATOMIC, + "qtaguid: %s(): %s {\n", __func__, fmt); + BUG_ON(!fmt_buff); + va_start(args, fmt); + buff = kvasprintf(GFP_ATOMIC, + fmt_buff, args); + BUG_ON(!buff); + pr_debug("%s", buff); + kfree(fmt_buff); + kfree(buff); + va_end(args); + + spin_lock_bh(&sock_tag_list_lock); + prdebug_sock_tag_tree(indent_level, &sock_tag_tree); + spin_unlock_bh(&sock_tag_list_lock); + + spin_lock_bh(&sock_tag_list_lock); + spin_lock_bh(&uid_tag_data_tree_lock); + prdebug_uid_tag_data_tree(indent_level, &uid_tag_data_tree); + prdebug_proc_qtu_data_tree(indent_level, &proc_qtu_data_tree); + spin_unlock_bh(&uid_tag_data_tree_lock); + spin_unlock_bh(&sock_tag_list_lock); + + spin_lock_bh(&iface_stat_list_lock); + prdebug_iface_stat_list(indent_level, &iface_stat_list); + spin_unlock_bh(&iface_stat_list_lock); + + pr_debug("qtaguid: %s(): }\n", __func__); +} +#else +static void prdebug_full_state(int indent_level, const char *fmt, ...) 
{} +#endif + +/* + * Procfs reader to get all active socket tags using style "1)" as described in + * fs/proc/generic.c + */ +static int qtaguid_ctrl_proc_read(char *page, char **num_items_returned, + off_t items_to_skip, int char_count, int *eof, + void *data) +{ + char *outp = page; + int len; + uid_t uid; + struct rb_node *node; + struct sock_tag *sock_tag_entry; + int item_index = 0; + int indent_level = 0; + long f_count; + + if (unlikely(module_passive)) { + *eof = 1; + return 0; + } + + if (*eof) + return 0; + + CT_DEBUG("qtaguid: proc ctrl page=%p off=%ld char_count=%d *eof=%d\n", + page, items_to_skip, char_count, *eof); + + spin_lock_bh(&sock_tag_list_lock); + for (node = rb_first(&sock_tag_tree); + node; + node = rb_next(node)) { + if (item_index++ < items_to_skip) + continue; + sock_tag_entry = rb_entry(node, struct sock_tag, sock_node); + uid = get_uid_from_tag(sock_tag_entry->tag); + CT_DEBUG("qtaguid: proc_read(): sk=%p tag=0x%llx (uid=%u) " + "pid=%u\n", + sock_tag_entry->sk, + sock_tag_entry->tag, + uid, + sock_tag_entry->pid + ); + f_count = atomic_long_read( + &sock_tag_entry->socket->file->f_count); + len = snprintf(outp, char_count, + "sock=%p tag=0x%llx (uid=%u) pid=%u " + "f_count=%lu\n", + sock_tag_entry->sk, + sock_tag_entry->tag, uid, + sock_tag_entry->pid, f_count); + if (len >= char_count) { + spin_unlock_bh(&sock_tag_list_lock); + *outp = '\0'; + return outp - page; + } + outp += len; + char_count -= len; + (*num_items_returned)++; + } + spin_unlock_bh(&sock_tag_list_lock); + + if (item_index++ >= items_to_skip) { + len = snprintf(outp, char_count, + "events: sockets_tagged=%llu " + "sockets_untagged=%llu " + "counter_set_changes=%llu " + "delete_cmds=%llu " + "iface_events=%llu " + "match_calls=%llu " + "match_found_sk=%llu " + "match_found_sk_in_ct=%llu " + "match_found_no_sk_in_ct=%llu " + "match_no_sk=%llu " + "match_no_sk_file=%llu\n", + atomic64_read(&qtu_events.sockets_tagged), + atomic64_read(&qtu_events.sockets_untagged), + atomic64_read(&qtu_events.counter_set_changes), + atomic64_read(&qtu_events.delete_cmds), + atomic64_read(&qtu_events.iface_events), + atomic64_read(&qtu_events.match_calls), + atomic64_read(&qtu_events.match_found_sk), + atomic64_read(&qtu_events.match_found_sk_in_ct), + atomic64_read( + &qtu_events.match_found_no_sk_in_ct), + atomic64_read(&qtu_events.match_no_sk), + atomic64_read(&qtu_events.match_no_sk_file)); + if (len >= char_count) { + *outp = '\0'; + return outp - page; + } + outp += len; + char_count -= len; + (*num_items_returned)++; + } + + /* Count the following as part of the last item_index */ + if (item_index > items_to_skip) { + prdebug_full_state(indent_level, "proc ctrl"); + } + + *eof = 1; + return outp - page; +} + +/* + * Delete socket tags, and stat tags associated with a given + * accouting tag and uid. 
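+ * Expects "d <acct_tag> [<uid>]"; if uid is omitted, the caller's fsuid is used.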
+ */ +static int ctrl_cmd_delete(const char *input) +{ + char cmd; + uid_t uid; + uid_t entry_uid; + tag_t acct_tag; + tag_t tag; + int res, argc; + struct iface_stat *iface_entry; + struct rb_node *node; + struct sock_tag *st_entry; + struct rb_root st_to_free_tree = RB_ROOT; + struct tag_stat *ts_entry; + struct tag_counter_set *tcs_entry; + struct tag_ref *tr_entry; + struct uid_tag_data *utd_entry; + + argc = sscanf(input, "%c %llu %u", &cmd, &acct_tag, &uid); + CT_DEBUG("qtaguid: ctrl_delete(%s): argc=%d cmd=%c " + "user_tag=0x%llx uid=%u\n", input, argc, cmd, + acct_tag, uid); + if (argc < 2) { + res = -EINVAL; + goto err; + } + if (!valid_atag(acct_tag)) { + pr_info("qtaguid: ctrl_delete(%s): invalid tag\n", input); + res = -EINVAL; + goto err; + } + if (argc < 3) { + uid = current_fsuid(); + } else if (!can_impersonate_uid(uid)) { + pr_info("qtaguid: ctrl_delete(%s): " + "insufficient priv from pid=%u tgid=%u uid=%u\n", + input, current->pid, current->tgid, current_fsuid()); + res = -EPERM; + goto err; + } + + tag = combine_atag_with_uid(acct_tag, uid); + CT_DEBUG("qtaguid: ctrl_delete(%s): " + "looking for tag=0x%llx (uid=%u)\n", + input, tag, uid); + + /* Delete socket tags */ + spin_lock_bh(&sock_tag_list_lock); + node = rb_first(&sock_tag_tree); + while (node) { + st_entry = rb_entry(node, struct sock_tag, sock_node); + entry_uid = get_uid_from_tag(st_entry->tag); + node = rb_next(node); + if (entry_uid != uid) + continue; + + CT_DEBUG("qtaguid: ctrl_delete(%s): st tag=0x%llx (uid=%u)\n", + input, st_entry->tag, entry_uid); + + if (!acct_tag || st_entry->tag == tag) { + rb_erase(&st_entry->sock_node, &sock_tag_tree); + /* Can't sockfd_put() within spinlock, do it later. */ + sock_tag_tree_insert(st_entry, &st_to_free_tree); + tr_entry = lookup_tag_ref(st_entry->tag, NULL); + BUG_ON(tr_entry->num_sock_tags <= 0); + tr_entry->num_sock_tags--; + /* + * TODO: remove if, and start failing. + * This is a hack to work around the fact that in some + * places we have "if (IS_ERR_OR_NULL(pqd_entry))" + * and are trying to work around apps + * that didn't open the /dev/xt_qtaguid. + */ + if (st_entry->list.next && st_entry->list.prev) + list_del(&st_entry->list); + } + } + spin_unlock_bh(&sock_tag_list_lock); + + sock_tag_tree_erase(&st_to_free_tree); + + /* Delete tag counter-sets */ + spin_lock_bh(&tag_counter_set_list_lock); + /* Counter sets are only on the uid tag, not full tag */ + tcs_entry = tag_counter_set_tree_search(&tag_counter_set_tree, tag); + if (tcs_entry) { + CT_DEBUG("qtaguid: ctrl_delete(%s): " + "erase tcs: tag=0x%llx (uid=%u) set=%d\n", + input, + tcs_entry->tn.tag, + get_uid_from_tag(tcs_entry->tn.tag), + tcs_entry->active_set); + rb_erase(&tcs_entry->tn.node, &tag_counter_set_tree); + kfree(tcs_entry); + } + spin_unlock_bh(&tag_counter_set_list_lock); + + /* + * If acct_tag is 0, then all entries belonging to uid are + * erased. 
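+ * Otherwise only entries whose full tag matches {acct_tag, uid} are removed.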
+ */ + spin_lock_bh(&iface_stat_list_lock); + list_for_each_entry(iface_entry, &iface_stat_list, list) { + spin_lock_bh(&iface_entry->tag_stat_list_lock); + node = rb_first(&iface_entry->tag_stat_tree); + while (node) { + ts_entry = rb_entry(node, struct tag_stat, tn.node); + entry_uid = get_uid_from_tag(ts_entry->tn.tag); + node = rb_next(node); + + CT_DEBUG("qtaguid: ctrl_delete(%s): " + "ts tag=0x%llx (uid=%u)\n", + input, ts_entry->tn.tag, entry_uid); + + if (entry_uid != uid) + continue; + if (!acct_tag || ts_entry->tn.tag == tag) { + CT_DEBUG("qtaguid: ctrl_delete(%s): " + "erase ts: %s 0x%llx %u\n", + input, iface_entry->ifname, + get_atag_from_tag(ts_entry->tn.tag), + entry_uid); + rb_erase(&ts_entry->tn.node, + &iface_entry->tag_stat_tree); + kfree(ts_entry); + } + } + spin_unlock_bh(&iface_entry->tag_stat_list_lock); + } + spin_unlock_bh(&iface_stat_list_lock); + + /* Cleanup the uid_tag_data */ + spin_lock_bh(&uid_tag_data_tree_lock); + node = rb_first(&uid_tag_data_tree); + while (node) { + utd_entry = rb_entry(node, struct uid_tag_data, node); + entry_uid = utd_entry->uid; + node = rb_next(node); + + CT_DEBUG("qtaguid: ctrl_delete(%s): " + "utd uid=%u\n", + input, entry_uid); + + if (entry_uid != uid) + continue; + /* + * Go over the tag_refs, and those that don't have + * sock_tags using them are freed. + */ + put_tag_ref_tree(tag, utd_entry); + put_utd_entry(utd_entry); + } + spin_unlock_bh(&uid_tag_data_tree_lock); + + atomic64_inc(&qtu_events.delete_cmds); + res = 0; + +err: + return res; +} + +static int ctrl_cmd_counter_set(const char *input) +{ + char cmd; + uid_t uid = 0; + tag_t tag; + int res, argc; + struct tag_counter_set *tcs; + int counter_set; + + argc = sscanf(input, "%c %d %u", &cmd, &counter_set, &uid); + CT_DEBUG("qtaguid: ctrl_counterset(%s): argc=%d cmd=%c " + "set=%d uid=%u\n", input, argc, cmd, + counter_set, uid); + if (argc != 3) { + res = -EINVAL; + goto err; + } + if (counter_set < 0 || counter_set >= IFS_MAX_COUNTER_SETS) { + pr_info("qtaguid: ctrl_counterset(%s): invalid counter_set range\n", + input); + res = -EINVAL; + goto err; + } + if (!can_manipulate_uids()) { + pr_info("qtaguid: ctrl_counterset(%s): " + "insufficient priv from pid=%u tgid=%u uid=%u\n", + input, current->pid, current->tgid, current_fsuid()); + res = -EPERM; + goto err; + } + + tag = make_tag_from_uid(uid); + spin_lock_bh(&tag_counter_set_list_lock); + tcs = tag_counter_set_tree_search(&tag_counter_set_tree, tag); + if (!tcs) { + tcs = kzalloc(sizeof(*tcs), GFP_ATOMIC); + if (!tcs) { + spin_unlock_bh(&tag_counter_set_list_lock); + pr_err("qtaguid: ctrl_counterset(%s): " + "failed to alloc counter set\n", + input); + res = -ENOMEM; + goto err; + } + tcs->tn.tag = tag; + tag_counter_set_tree_insert(tcs, &tag_counter_set_tree); + CT_DEBUG("qtaguid: ctrl_counterset(%s): added tcs tag=0x%llx " + "(uid=%u) set=%d\n", + input, tag, get_uid_from_tag(tag), counter_set); + } + tcs->active_set = counter_set; + spin_unlock_bh(&tag_counter_set_list_lock); + atomic64_inc(&qtu_events.counter_set_changes); + res = 0; + +err: + return res; +} + +static int ctrl_cmd_tag(const char *input) +{ + char cmd; + int sock_fd = 0; + uid_t uid = 0; + tag_t acct_tag = make_atag_from_value(0); + tag_t full_tag; + struct socket *el_socket; + int res, argc; + struct sock_tag *sock_tag_entry; + struct tag_ref *tag_ref_entry; + struct uid_tag_data *uid_tag_data_entry; + struct proc_qtu_data *pqd_entry; + + /* Unassigned args will get defaulted later. 
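+ * Expects "t <sock_fd> [<acct_tag> [<uid>]]"; acct_tag defaults to 0 and
+ * uid to the caller's fsuid.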
*/ + argc = sscanf(input, "%c %d %llu %u", &cmd, &sock_fd, &acct_tag, &uid); + CT_DEBUG("qtaguid: ctrl_tag(%s): argc=%d cmd=%c sock_fd=%d " + "acct_tag=0x%llx uid=%u\n", input, argc, cmd, sock_fd, + acct_tag, uid); + if (argc < 2) { + res = -EINVAL; + goto err; + } + el_socket = sockfd_lookup(sock_fd, &res); /* This locks the file */ + if (!el_socket) { + pr_info("qtaguid: ctrl_tag(%s): failed to lookup" + " sock_fd=%d err=%d\n", input, sock_fd, res); + goto err; + } + CT_DEBUG("qtaguid: ctrl_tag(%s): socket->...->f_count=%ld ->sk=%p\n", + input, atomic_long_read(&el_socket->file->f_count), + el_socket->sk); + if (argc < 3) { + acct_tag = make_atag_from_value(0); + } else if (!valid_atag(acct_tag)) { + pr_info("qtaguid: ctrl_tag(%s): invalid tag\n", input); + res = -EINVAL; + goto err_put; + } + CT_DEBUG("qtaguid: ctrl_tag(%s): " + "pid=%u tgid=%u uid=%u euid=%u fsuid=%u " + "in_group=%d in_egroup=%d\n", + input, current->pid, current->tgid, current_uid(), + current_euid(), current_fsuid(), + in_group_p(proc_ctrl_write_gid), + in_egroup_p(proc_ctrl_write_gid)); + if (argc < 4) { + uid = current_fsuid(); + } else if (!can_impersonate_uid(uid)) { + pr_info("qtaguid: ctrl_tag(%s): " + "insufficient priv from pid=%u tgid=%u uid=%u\n", + input, current->pid, current->tgid, current_fsuid()); + res = -EPERM; + goto err_put; + } + full_tag = combine_atag_with_uid(acct_tag, uid); + + spin_lock_bh(&sock_tag_list_lock); + sock_tag_entry = get_sock_stat_nl(el_socket->sk); + tag_ref_entry = get_tag_ref(full_tag, &uid_tag_data_entry); + if (IS_ERR(tag_ref_entry)) { + res = PTR_ERR(tag_ref_entry); + spin_unlock_bh(&sock_tag_list_lock); + goto err_put; + } + tag_ref_entry->num_sock_tags++; + if (sock_tag_entry) { + struct tag_ref *prev_tag_ref_entry; + + CT_DEBUG("qtaguid: ctrl_tag(%s): retag for sk=%p " + "st@%p ...->f_count=%ld\n", + input, el_socket->sk, sock_tag_entry, + atomic_long_read(&el_socket->file->f_count)); + /* + * This is a re-tagging, so release the sock_fd that was + * locked at the time of the 1st tagging. + * There is still the ref from this call's sockfd_lookup() so + * it can be done within the spinlock. + */ + sockfd_put(sock_tag_entry->socket); + prev_tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag, + &uid_tag_data_entry); + BUG_ON(IS_ERR_OR_NULL(prev_tag_ref_entry)); + BUG_ON(prev_tag_ref_entry->num_sock_tags <= 0); + prev_tag_ref_entry->num_sock_tags--; + sock_tag_entry->tag = full_tag; + } else { + CT_DEBUG("qtaguid: ctrl_tag(%s): newtag for sk=%p\n", + input, el_socket->sk); + sock_tag_entry = kzalloc(sizeof(*sock_tag_entry), + GFP_ATOMIC); + if (!sock_tag_entry) { + pr_err("qtaguid: ctrl_tag(%s): " + "socket tag alloc failed\n", + input); + spin_unlock_bh(&sock_tag_list_lock); + res = -ENOMEM; + goto err_tag_unref_put; + } + sock_tag_entry->sk = el_socket->sk; + sock_tag_entry->socket = el_socket; + sock_tag_entry->pid = current->tgid; + sock_tag_entry->tag = combine_atag_with_uid(acct_tag, + uid); + spin_lock_bh(&uid_tag_data_tree_lock); + pqd_entry = proc_qtu_data_tree_search( + &proc_qtu_data_tree, current->tgid); + /* + * TODO: remove if, and start failing. + * At first, we want to catch user-space code that is not + * opening the /dev/xt_qtaguid. + */ + if (IS_ERR_OR_NULL(pqd_entry)) + pr_warn_once( + "qtaguid: %s(): " + "User space forgot to open /dev/xt_qtaguid? 
" + "pid=%u tgid=%u uid=%u\n", __func__, + current->pid, current->tgid, + current_fsuid()); + else + list_add(&sock_tag_entry->list, + &pqd_entry->sock_tag_list); + spin_unlock_bh(&uid_tag_data_tree_lock); + + sock_tag_tree_insert(sock_tag_entry, &sock_tag_tree); + atomic64_inc(&qtu_events.sockets_tagged); + } + spin_unlock_bh(&sock_tag_list_lock); + /* We keep the ref to the socket (file) until it is untagged */ + CT_DEBUG("qtaguid: ctrl_tag(%s): done st@%p ...->f_count=%ld\n", + input, sock_tag_entry, + atomic_long_read(&el_socket->file->f_count)); + return 0; + +err_tag_unref_put: + BUG_ON(tag_ref_entry->num_sock_tags <= 0); + tag_ref_entry->num_sock_tags--; + free_tag_ref_from_utd_entry(tag_ref_entry, uid_tag_data_entry); +err_put: + CT_DEBUG("qtaguid: ctrl_tag(%s): done. ...->f_count=%ld\n", + input, atomic_long_read(&el_socket->file->f_count) - 1); + /* Release the sock_fd that was grabbed by sockfd_lookup(). */ + sockfd_put(el_socket); + return res; + +err: + CT_DEBUG("qtaguid: ctrl_tag(%s): done.\n", input); + return res; +} + +static int ctrl_cmd_untag(const char *input) +{ + char cmd; + int sock_fd = 0; + struct socket *el_socket; + int res, argc; + struct sock_tag *sock_tag_entry; + struct tag_ref *tag_ref_entry; + struct uid_tag_data *utd_entry; + struct proc_qtu_data *pqd_entry; + + argc = sscanf(input, "%c %d", &cmd, &sock_fd); + CT_DEBUG("qtaguid: ctrl_untag(%s): argc=%d cmd=%c sock_fd=%d\n", + input, argc, cmd, sock_fd); + if (argc < 2) { + res = -EINVAL; + goto err; + } + el_socket = sockfd_lookup(sock_fd, &res); /* This locks the file */ + if (!el_socket) { + pr_info("qtaguid: ctrl_untag(%s): failed to lookup" + " sock_fd=%d err=%d\n", input, sock_fd, res); + goto err; + } + CT_DEBUG("qtaguid: ctrl_untag(%s): socket->...->f_count=%ld ->sk=%p\n", + input, atomic_long_read(&el_socket->file->f_count), + el_socket->sk); + spin_lock_bh(&sock_tag_list_lock); + sock_tag_entry = get_sock_stat_nl(el_socket->sk); + if (!sock_tag_entry) { + spin_unlock_bh(&sock_tag_list_lock); + res = -EINVAL; + goto err_put; + } + /* + * The socket already belongs to the current process + * so it can do whatever it wants to it. + */ + rb_erase(&sock_tag_entry->sock_node, &sock_tag_tree); + + tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag, &utd_entry); + BUG_ON(!tag_ref_entry); + BUG_ON(tag_ref_entry->num_sock_tags <= 0); + spin_lock_bh(&uid_tag_data_tree_lock); + pqd_entry = proc_qtu_data_tree_search( + &proc_qtu_data_tree, current->tgid); + /* + * TODO: remove if, and start failing. + * At first, we want to catch user-space code that is not + * opening the /dev/xt_qtaguid. + */ + if (IS_ERR_OR_NULL(pqd_entry)) + pr_warn_once("qtaguid: %s(): " + "User space forgot to open /dev/xt_qtaguid? " + "pid=%u tgid=%u uid=%u\n", __func__, + current->pid, current->tgid, current_fsuid()); + else + list_del(&sock_tag_entry->list); + spin_unlock_bh(&uid_tag_data_tree_lock); + /* + * We don't free tag_ref from the utd_entry here, + * only during a cmd_delete(). + */ + tag_ref_entry->num_sock_tags--; + spin_unlock_bh(&sock_tag_list_lock); + /* + * Release the sock_fd that was grabbed at tag time, + * and once more for the sockfd_lookup() here. + */ + sockfd_put(sock_tag_entry->socket); + CT_DEBUG("qtaguid: ctrl_untag(%s): done. st@%p ...->f_count=%ld\n", + input, sock_tag_entry, + atomic_long_read(&el_socket->file->f_count) - 1); + sockfd_put(el_socket); + + kfree(sock_tag_entry); + atomic64_inc(&qtu_events.sockets_untagged); + + return 0; + +err_put: + CT_DEBUG("qtaguid: ctrl_untag(%s): done. 
socket->...->f_count=%ld\n", + input, atomic_long_read(&el_socket->file->f_count) - 1); + /* Release the sock_fd that was grabbed by sockfd_lookup(). */ + sockfd_put(el_socket); + return res; + +err: + CT_DEBUG("qtaguid: ctrl_untag(%s): done.\n", input); + return res; +} + +static int qtaguid_ctrl_parse(const char *input, int count) +{ + char cmd; + int res; + + cmd = input[0]; + /* Collect params for commands */ + switch (cmd) { + case 'd': + res = ctrl_cmd_delete(input); + break; + + case 's': + res = ctrl_cmd_counter_set(input); + break; + + case 't': + res = ctrl_cmd_tag(input); + break; + + case 'u': + res = ctrl_cmd_untag(input); + break; + + default: + res = -EINVAL; + goto err; + } + if (!res) + res = count; +err: + CT_DEBUG("qtaguid: ctrl(%s): res=%d\n", input, res); + return res; +} + +#define MAX_QTAGUID_CTRL_INPUT_LEN 255 +static int qtaguid_ctrl_proc_write(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + char input_buf[MAX_QTAGUID_CTRL_INPUT_LEN]; + + if (unlikely(module_passive)) + return count; + + if (count >= MAX_QTAGUID_CTRL_INPUT_LEN) + return -EINVAL; + + if (copy_from_user(input_buf, buffer, count)) + return -EFAULT; + + input_buf[count] = '\0'; + return qtaguid_ctrl_parse(input_buf, count); +} + +struct proc_print_info { + char *outp; + char **num_items_returned; + struct iface_stat *iface_entry; + struct tag_stat *ts_entry; + int item_index; + int items_to_skip; + int char_count; +}; + +static int pp_stats_line(struct proc_print_info *ppi, int cnt_set) +{ + int len; + struct data_counters *cnts; + + if (!ppi->item_index) { + if (ppi->item_index++ < ppi->items_to_skip) + return 0; + len = snprintf(ppi->outp, ppi->char_count, + "idx iface acct_tag_hex uid_tag_int cnt_set " + "rx_bytes rx_packets " + "tx_bytes tx_packets " + "rx_tcp_bytes rx_tcp_packets " + "rx_udp_bytes rx_udp_packets " + "rx_other_bytes rx_other_packets " + "tx_tcp_bytes tx_tcp_packets " + "tx_udp_bytes tx_udp_packets " + "tx_other_bytes tx_other_packets\n"); + } else { + tag_t tag = ppi->ts_entry->tn.tag; + uid_t stat_uid = get_uid_from_tag(tag); + + if (!can_read_other_uid_stats(stat_uid)) { + CT_DEBUG("qtaguid: stats line: " + "%s 0x%llx %u: insufficient priv " + "from pid=%u tgid=%u uid=%u\n", + ppi->iface_entry->ifname, + get_atag_from_tag(tag), stat_uid, + current->pid, current->tgid, current_fsuid()); + return 0; + } + if (ppi->item_index++ < ppi->items_to_skip) + return 0; + cnts = &ppi->ts_entry->counters; + len = snprintf( + ppi->outp, ppi->char_count, + "%d %s 0x%llx %u %u " + "%llu %llu " + "%llu %llu " + "%llu %llu " + "%llu %llu " + "%llu %llu " + "%llu %llu " + "%llu %llu " + "%llu %llu\n", + ppi->item_index, + ppi->iface_entry->ifname, + get_atag_from_tag(tag), + stat_uid, + cnt_set, + dc_sum_bytes(cnts, cnt_set, IFS_RX), + dc_sum_packets(cnts, cnt_set, IFS_RX), + dc_sum_bytes(cnts, cnt_set, IFS_TX), + dc_sum_packets(cnts, cnt_set, IFS_TX), + cnts->bpc[cnt_set][IFS_RX][IFS_TCP].bytes, + cnts->bpc[cnt_set][IFS_RX][IFS_TCP].packets, + cnts->bpc[cnt_set][IFS_RX][IFS_UDP].bytes, + cnts->bpc[cnt_set][IFS_RX][IFS_UDP].packets, + cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].bytes, + cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].packets, + cnts->bpc[cnt_set][IFS_TX][IFS_TCP].bytes, + cnts->bpc[cnt_set][IFS_TX][IFS_TCP].packets, + cnts->bpc[cnt_set][IFS_TX][IFS_UDP].bytes, + cnts->bpc[cnt_set][IFS_TX][IFS_UDP].packets, + cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].bytes, + cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].packets); + } + return len; +} + +static bool 
pp_sets(struct proc_print_info *ppi) +{ + int len; + int counter_set; + for (counter_set = 0; counter_set < IFS_MAX_COUNTER_SETS; + counter_set++) { + len = pp_stats_line(ppi, counter_set); + if (len >= ppi->char_count) { + *ppi->outp = '\0'; + return false; + } + if (len) { + ppi->outp += len; + ppi->char_count -= len; + (*ppi->num_items_returned)++; + } + } + return true; +} + +/* + * Procfs reader to get all tag stats using style "1)" as described in + * fs/proc/generic.c + * Groups all protocols tx/rx bytes. + */ +static int qtaguid_stats_proc_read(char *page, char **num_items_returned, + off_t items_to_skip, int char_count, int *eof, + void *data) +{ + struct proc_print_info ppi; + int len; + + ppi.outp = page; + ppi.item_index = 0; + ppi.char_count = char_count; + ppi.num_items_returned = num_items_returned; + ppi.items_to_skip = items_to_skip; + + if (unlikely(module_passive)) { + len = pp_stats_line(&ppi, 0); + /* The header should always be shorter than the buffer. */ + BUG_ON(len >= ppi.char_count); + (*num_items_returned)++; + *eof = 1; + return len; + } + + CT_DEBUG("qtaguid:proc stats page=%p *num_items_returned=%p off=%ld " + "char_count=%d *eof=%d\n", page, *num_items_returned, + items_to_skip, char_count, *eof); + + if (*eof) + return 0; + + /* The idx is there to help debug when things go belly up. */ + len = pp_stats_line(&ppi, 0); + /* Don't advance the outp unless the whole line was printed */ + if (len >= ppi.char_count) { + *ppi.outp = '\0'; + return ppi.outp - page; + } + if (len) { + ppi.outp += len; + ppi.char_count -= len; + (*num_items_returned)++; + } + + spin_lock_bh(&iface_stat_list_lock); + list_for_each_entry(ppi.iface_entry, &iface_stat_list, list) { + struct rb_node *node; + spin_lock_bh(&ppi.iface_entry->tag_stat_list_lock); + for (node = rb_first(&ppi.iface_entry->tag_stat_tree); + node; + node = rb_next(node)) { + ppi.ts_entry = rb_entry(node, struct tag_stat, tn.node); + if (!pp_sets(&ppi)) { + spin_unlock_bh( + &ppi.iface_entry->tag_stat_list_lock); + spin_unlock_bh(&iface_stat_list_lock); + return ppi.outp - page; + } + } + spin_unlock_bh(&ppi.iface_entry->tag_stat_list_lock); + } + spin_unlock_bh(&iface_stat_list_lock); + + *eof = 1; + return ppi.outp - page; +} + +/*------------------------------------------*/ +static int qtudev_open(struct inode *inode, struct file *file) +{ + struct uid_tag_data *utd_entry; + struct proc_qtu_data *pqd_entry; + struct proc_qtu_data *new_pqd_entry; + int res; + bool utd_entry_found; + + if (unlikely(qtu_proc_handling_passive)) + return 0; + + DR_DEBUG("qtaguid: qtudev_open(): pid=%u tgid=%u uid=%u\n", + current->pid, current->tgid, current_fsuid()); + + spin_lock_bh(&uid_tag_data_tree_lock); + + /* Look for existing uid data, or alloc one. 
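+ * utd_entry_found is used below to undo the allocation on error.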
*/ + utd_entry = get_uid_data(current_fsuid(), &utd_entry_found); + if (IS_ERR_OR_NULL(utd_entry)) { + res = PTR_ERR(utd_entry); + goto err; + } + + /* Look for existing PID based proc_data */ + pqd_entry = proc_qtu_data_tree_search(&proc_qtu_data_tree, + current->tgid); + if (pqd_entry) { + pr_err("qtaguid: qtudev_open(): %u/%u %u " + "%s already opened\n", + current->pid, current->tgid, current_fsuid(), + QTU_DEV_NAME); + res = -EBUSY; + goto err_unlock_free_utd; + } + + new_pqd_entry = kzalloc(sizeof(*new_pqd_entry), GFP_ATOMIC); + if (!new_pqd_entry) { + pr_err("qtaguid: qtudev_open(): %u/%u %u: " + "proc data alloc failed\n", + current->pid, current->tgid, current_fsuid()); + res = -ENOMEM; + goto err_unlock_free_utd; + } + new_pqd_entry->pid = current->tgid; + INIT_LIST_HEAD(&new_pqd_entry->sock_tag_list); + new_pqd_entry->parent_tag_data = utd_entry; + utd_entry->num_pqd++; + + proc_qtu_data_tree_insert(new_pqd_entry, + &proc_qtu_data_tree); + + spin_unlock_bh(&uid_tag_data_tree_lock); + DR_DEBUG("qtaguid: tracking data for uid=%u in pqd=%p\n", + current_fsuid(), new_pqd_entry); + file->private_data = new_pqd_entry; + return 0; + +err_unlock_free_utd: + if (!utd_entry_found) { + rb_erase(&utd_entry->node, &uid_tag_data_tree); + kfree(utd_entry); + } + spin_unlock_bh(&uid_tag_data_tree_lock); +err: + return res; +} + +static int qtudev_release(struct inode *inode, struct file *file) +{ + struct proc_qtu_data *pqd_entry = file->private_data; + struct uid_tag_data *utd_entry = pqd_entry->parent_tag_data; + struct sock_tag *st_entry; + struct rb_root st_to_free_tree = RB_ROOT; + struct list_head *entry, *next; + struct tag_ref *tr; + + if (unlikely(qtu_proc_handling_passive)) + return 0; + + /* + * Do not trust the current->pid, it might just be a kworker cleaning + * up after a dead proc. + */ + DR_DEBUG("qtaguid: qtudev_release(): " + "pid=%u tgid=%u uid=%u " + "pqd_entry=%p->pid=%u utd_entry=%p->active_tags=%d\n", + current->pid, current->tgid, pqd_entry->parent_tag_data->uid, + pqd_entry, pqd_entry->pid, utd_entry, + utd_entry->num_active_tags); + + spin_lock_bh(&sock_tag_list_lock); + spin_lock_bh(&uid_tag_data_tree_lock); + + list_for_each_safe(entry, next, &pqd_entry->sock_tag_list) { + st_entry = list_entry(entry, struct sock_tag, list); + DR_DEBUG("qtaguid: %s(): " + "erase sock_tag=%p->sk=%p pid=%u tgid=%u uid=%u\n", + __func__, + st_entry, st_entry->sk, + current->pid, current->tgid, + pqd_entry->parent_tag_data->uid); + + utd_entry = uid_tag_data_tree_search( + &uid_tag_data_tree, + get_uid_from_tag(st_entry->tag)); + BUG_ON(IS_ERR_OR_NULL(utd_entry)); + DR_DEBUG("qtaguid: %s(): " + "looking for tag=0x%llx in utd_entry=%p\n", __func__, + st_entry->tag, utd_entry); + tr = tag_ref_tree_search(&utd_entry->tag_ref_tree, + st_entry->tag); + BUG_ON(!tr); + BUG_ON(tr->num_sock_tags <= 0); + tr->num_sock_tags--; + free_tag_ref_from_utd_entry(tr, utd_entry); + + rb_erase(&st_entry->sock_node, &sock_tag_tree); + list_del(&st_entry->list); + /* Can't sockfd_put() within spinlock, do it later. */ + sock_tag_tree_insert(st_entry, &st_to_free_tree); + + /* + * Try to free the utd_entry if no other proc_qtu_data is + * using it (num_pqd is 0) and it doesn't have active tags + * (num_active_tags is 0). 
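+ *
+ * Put differently: put_utd_entry() (defined earlier in this file) is only
+ * expected to free the node once both counts are zero; a rough sketch of
+ * that rule, not the actual helper:
+ *
+ *	if (!utd_entry->num_active_tags && !utd_entry->num_pqd) {
+ *		rb_erase(&utd_entry->node, &uid_tag_data_tree);
+ *		kfree(utd_entry);
+ *	}
+ *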
+ */ + put_utd_entry(utd_entry); + } + + rb_erase(&pqd_entry->node, &proc_qtu_data_tree); + BUG_ON(pqd_entry->parent_tag_data->num_pqd < 1); + pqd_entry->parent_tag_data->num_pqd--; + put_utd_entry(pqd_entry->parent_tag_data); + kfree(pqd_entry); + file->private_data = NULL; + + spin_unlock_bh(&uid_tag_data_tree_lock); + spin_unlock_bh(&sock_tag_list_lock); + + + sock_tag_tree_erase(&st_to_free_tree); + + prdebug_full_state(0, "%s(): pid=%u tgid=%u", __func__, + current->pid, current->tgid); + return 0; +} + +/*------------------------------------------*/ +static const struct file_operations qtudev_fops = { + .owner = THIS_MODULE, + .open = qtudev_open, + .release = qtudev_release, +}; + +static struct miscdevice qtu_device = { + .minor = MISC_DYNAMIC_MINOR, + .name = QTU_DEV_NAME, + .fops = &qtudev_fops, + /* How sad it doesn't allow for defaults: .mode = S_IRUGO | S_IWUSR */ +}; + +/*------------------------------------------*/ +static int __init qtaguid_proc_register(struct proc_dir_entry **res_procdir) +{ + int ret; + *res_procdir = proc_mkdir(module_procdirname, init_net.proc_net); + if (!*res_procdir) { + pr_err("qtaguid: failed to create proc/.../xt_qtaguid\n"); + ret = -ENOMEM; + goto no_dir; + } + + xt_qtaguid_ctrl_file = create_proc_entry("ctrl", proc_ctrl_perms, + *res_procdir); + if (!xt_qtaguid_ctrl_file) { + pr_err("qtaguid: failed to create xt_qtaguid/ctrl " + " file\n"); + ret = -ENOMEM; + goto no_ctrl_entry; + } + xt_qtaguid_ctrl_file->read_proc = qtaguid_ctrl_proc_read; + xt_qtaguid_ctrl_file->write_proc = qtaguid_ctrl_proc_write; + + xt_qtaguid_stats_file = create_proc_entry("stats", proc_stats_perms, + *res_procdir); + if (!xt_qtaguid_stats_file) { + pr_err("qtaguid: failed to create xt_qtaguid/stats " + "file\n"); + ret = -ENOMEM; + goto no_stats_entry; + } + xt_qtaguid_stats_file->read_proc = qtaguid_stats_proc_read; + /* + * TODO: add support counter hacking + * xt_qtaguid_stats_file->write_proc = qtaguid_stats_proc_write; + */ + return 0; + +no_stats_entry: + remove_proc_entry("ctrl", *res_procdir); +no_ctrl_entry: + remove_proc_entry("xt_qtaguid", NULL); +no_dir: + return ret; +} + +static struct xt_match qtaguid_mt_reg __read_mostly = { + /* + * This module masquerades as the "owner" module so that iptables + * tools can deal with it. + */ + .name = "owner", + .revision = 1, + .family = NFPROTO_UNSPEC, + .match = qtaguid_mt, + .matchsize = sizeof(struct xt_qtaguid_match_info), + .me = THIS_MODULE, +}; + +static int __init qtaguid_mt_init(void) +{ + if (qtaguid_proc_register(&xt_qtaguid_procdir) + || iface_stat_init(xt_qtaguid_procdir) + || xt_register_match(&qtaguid_mt_reg) + || misc_register(&qtu_device)) + return -1; + return 0; +} + +/* + * TODO: allow unloading of the module. + * For now stats are permanent. + * Kconfig forces'y/n' and never an 'm'. + */ + +module_init(qtaguid_mt_init); +MODULE_AUTHOR("jpa "); +MODULE_DESCRIPTION("Xtables: socket owner+tag matching and associated stats"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_owner"); +MODULE_ALIAS("ip6t_owner"); +MODULE_ALIAS("ipt_qtaguid"); +MODULE_ALIAS("ip6t_qtaguid"); diff --git a/net/netfilter/xt_qtaguid_internal.h b/net/netfilter/xt_qtaguid_internal.h new file mode 100644 index 00000000..02479d6d --- /dev/null +++ b/net/netfilter/xt_qtaguid_internal.h @@ -0,0 +1,330 @@ +/* + * Kernel iptables module to track stats for packets based on user tags. 
+ * + * (C) 2011 Google, Inc + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef __XT_QTAGUID_INTERNAL_H__ +#define __XT_QTAGUID_INTERNAL_H__ + +#include +#include +#include +#include + +/* Iface handling */ +#define IDEBUG_MASK (1<<0) +/* Iptable Matching. Per packet. */ +#define MDEBUG_MASK (1<<1) +/* Red-black tree handling. Per packet. */ +#define RDEBUG_MASK (1<<2) +/* procfs ctrl/stats handling */ +#define CDEBUG_MASK (1<<3) +/* dev and resource tracking */ +#define DDEBUG_MASK (1<<4) + +/* E.g (IDEBUG_MASK | CDEBUG_MASK | DDEBUG_MASK) */ +#define DEFAULT_DEBUG_MASK 0 + +/* + * (Un)Define these *DEBUG to compile out/in the pr_debug calls. + * All undef: text size ~ 0x3030; all def: ~ 0x4404. + */ +#define IDEBUG +#define MDEBUG +#define RDEBUG +#define CDEBUG +#define DDEBUG + +#define MSK_DEBUG(mask, ...) do { \ + if (unlikely(qtaguid_debug_mask & (mask))) \ + pr_debug(__VA_ARGS__); \ + } while (0) +#ifdef IDEBUG +#define IF_DEBUG(...) MSK_DEBUG(IDEBUG_MASK, __VA_ARGS__) +#else +#define IF_DEBUG(...) no_printk(__VA_ARGS__) +#endif +#ifdef MDEBUG +#define MT_DEBUG(...) MSK_DEBUG(MDEBUG_MASK, __VA_ARGS__) +#else +#define MT_DEBUG(...) no_printk(__VA_ARGS__) +#endif +#ifdef RDEBUG +#define RB_DEBUG(...) MSK_DEBUG(RDEBUG_MASK, __VA_ARGS__) +#else +#define RB_DEBUG(...) no_printk(__VA_ARGS__) +#endif +#ifdef CDEBUG +#define CT_DEBUG(...) MSK_DEBUG(CDEBUG_MASK, __VA_ARGS__) +#else +#define CT_DEBUG(...) no_printk(__VA_ARGS__) +#endif +#ifdef DDEBUG +#define DR_DEBUG(...) MSK_DEBUG(DDEBUG_MASK, __VA_ARGS__) +#else +#define DR_DEBUG(...) no_printk(__VA_ARGS__) +#endif + +extern uint qtaguid_debug_mask; + +/*---------------------------------------------------------------------------*/ +/* + * Tags: + * + * They represent what the data usage counters will be tracked against. + * By default a tag is just based on the UID. + * The UID is used as the base for policing, and can not be ignored. + * So a tag will always at least represent a UID (uid_tag). + * + * A tag can be augmented with an "accounting tag" which is associated + * with a UID. + * User space can set the acct_tag portion of the tag which is then used + * with sockets: all data belonging to that socket will be counted against the + * tag. The policing is then based on the tag's uid_tag portion, + * and stats are collected for the acct_tag portion separately. + * + * There could be + * a: {acct_tag=1, uid_tag=10003} + * b: {acct_tag=2, uid_tag=10003} + * c: {acct_tag=3, uid_tag=10003} + * d: {acct_tag=0, uid_tag=10003} + * a, b, and c represent tags associated with specific sockets. + * d is for the totals for that uid, including all untagged traffic. + * Typically d is used with policing/quota rules. + * + * We want tag_t big enough to distinguish uid_t and acct_tag. + * It might become a struct if needed. + * Nothing should be using it as an int. + */ +typedef uint64_t tag_t; /* Only used via accessors */ + +#define TAG_UID_MASK 0xFFFFFFFFULL +#define TAG_ACCT_MASK (~0xFFFFFFFFULL) + +static inline int tag_compare(tag_t t1, tag_t t2) +{ + return t1 < t2 ? -1 : t1 == t2 ? 
0 : 1; +} + +static inline tag_t combine_atag_with_uid(tag_t acct_tag, uid_t uid) +{ + return acct_tag | uid; +} +static inline tag_t make_tag_from_uid(uid_t uid) +{ + return uid; +} +static inline uid_t get_uid_from_tag(tag_t tag) +{ + return tag & TAG_UID_MASK; +} +static inline tag_t get_utag_from_tag(tag_t tag) +{ + return tag & TAG_UID_MASK; +} +static inline tag_t get_atag_from_tag(tag_t tag) +{ + return tag & TAG_ACCT_MASK; +} + +static inline bool valid_atag(tag_t tag) +{ + return !(tag & TAG_UID_MASK); +} +static inline tag_t make_atag_from_value(uint32_t value) +{ + return (uint64_t)value << 32; +} +/*---------------------------------------------------------------------------*/ + +/* + * Maximum number of socket tags that a UID is allowed to have active. + * Multiple processes belonging to the same UID contribute towards this limit. + * Special UIDs that can impersonate a UID also contribute (e.g. download + * manager, ...) + */ +#define DEFAULT_MAX_SOCK_TAGS 1024 + +/* + * For now we only track 2 sets of counters. + * The default set is 0. + * Userspace can activate another set for a given uid being tracked. + */ +#define IFS_MAX_COUNTER_SETS 2 + +enum ifs_tx_rx { + IFS_TX, + IFS_RX, + IFS_MAX_DIRECTIONS +}; + +/* For now, TCP, UDP, the rest */ +enum ifs_proto { + IFS_TCP, + IFS_UDP, + IFS_PROTO_OTHER, + IFS_MAX_PROTOS +}; + +struct byte_packet_counters { + uint64_t bytes; + uint64_t packets; +}; + +struct data_counters { + struct byte_packet_counters bpc[IFS_MAX_COUNTER_SETS][IFS_MAX_DIRECTIONS][IFS_MAX_PROTOS]; +}; + +/* Generic X based nodes used as a base for rb_tree ops */ +struct tag_node { + struct rb_node node; + tag_t tag; +}; + +struct tag_stat { + struct tag_node tn; + struct data_counters counters; + /* + * If this tag is acct_tag based, we need to count against the + * matching parent uid_tag. + */ + struct data_counters *parent_counters; +}; + +struct iface_stat { + struct list_head list; /* in iface_stat_list */ + char *ifname; + bool active; + /* net_dev is only valid for active iface_stat */ + struct net_device *net_dev; + + struct byte_packet_counters totals[IFS_MAX_DIRECTIONS]; + /* + * We keep the last_known, because some devices reset their counters + * just before NETDEV_UP, while some will reset just before + * NETDEV_REGISTER (which is more normal). + * So now, if the device didn't do a NETDEV_UNREGISTER and we see + * its current dev stats smaller that what was previously known, we + * assume an UNREGISTER and just use the last_known. + */ + struct byte_packet_counters last_known[IFS_MAX_DIRECTIONS]; + /* last_known is usable when last_known_valid is true */ + bool last_known_valid; + + struct proc_dir_entry *proc_ptr; + + struct rb_root tag_stat_tree; + spinlock_t tag_stat_list_lock; +}; + +/* This is needed to create proc_dir_entries from atomic context. */ +struct iface_stat_work { + struct work_struct iface_work; + struct iface_stat *iface_entry; +}; + +/* + * Track tag that this socket is transferring data for, and not necessarily + * the uid that owns the socket. + * This is the tag against which tag_stat.counters will be billed. + * These structs need to be looked up by sock and pid. 
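+ *
+ * For reference, the accessors above pack the accounting value into the
+ * upper 32 bits and the uid into the lower 32 bits; with illustrative
+ * values:
+ *
+ *	tag_t tag = combine_atag_with_uid(make_atag_from_value(0x2a), 10003);
+ *	// tag == 0x0000002a00002713
+ *	// get_uid_from_tag(tag)  == 10003
+ *	// get_atag_from_tag(tag) == 0x0000002a00000000
+ *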
+ */ +struct sock_tag { + struct rb_node sock_node; + struct sock *sk; /* Only used as a number, never dereferenced */ + /* The socket is needed for sockfd_put() */ + struct socket *socket; + /* Used to associate with a given pid */ + struct list_head list; /* in proc_qtu_data.sock_tag_list */ + pid_t pid; + + tag_t tag; +}; + +struct qtaguid_event_counts { + /* Various successful events */ + atomic64_t sockets_tagged; + atomic64_t sockets_untagged; + atomic64_t counter_set_changes; + atomic64_t delete_cmds; + atomic64_t iface_events; /* Number of NETDEV_* events handled */ + + atomic64_t match_calls; /* Number of times iptables called mt */ + /* + * match_found_sk_*: numbers related to the netfilter matching + * function finding a sock for the sk_buff. + * Total skbs processed is sum(match_found*). + */ + atomic64_t match_found_sk; /* An sk was already in the sk_buff. */ + /* The connection tracker had or didn't have the sk. */ + atomic64_t match_found_sk_in_ct; + atomic64_t match_found_no_sk_in_ct; + /* + * No sk could be found. No apparent owner. Could happen with + * unsolicited traffic. + */ + atomic64_t match_no_sk; + /* + * The file ptr in the sk_socket wasn't there. + * This might happen for traffic while the socket is being closed. + */ + atomic64_t match_no_sk_file; +}; + +/* Track the set active_set for the given tag. */ +struct tag_counter_set { + struct tag_node tn; + int active_set; +}; + +/*----------------------------------------------*/ +/* + * The qtu uid data is used to track resources that are created directly or + * indirectly by processes (uid tracked). + * It is shared by the processes with the same uid. + * Some of the resource will be counted to prevent further rogue allocations, + * some will need freeing once the owner process (uid) exits. + */ +struct uid_tag_data { + struct rb_node node; + uid_t uid; + + /* + * For the uid, how many accounting tags have been set. + */ + int num_active_tags; + /* Track the number of proc_qtu_data that reference it */ + int num_pqd; + struct rb_root tag_ref_tree; + /* No tag_node_tree_lock; use uid_tag_data_tree_lock */ +}; + +struct tag_ref { + struct tag_node tn; + + /* + * This tracks the number of active sockets that have a tag on them + * which matches this tag_ref.tn.tag. + * A tag ref can live on after the sockets are untagged. + * A tag ref can only be removed during a tag delete command. + */ + int num_sock_tags; +}; + +struct proc_qtu_data { + struct rb_node node; + pid_t pid; + + struct uid_tag_data *parent_tag_data; + + /* Tracks the sock_tags that need freeing upon this proc's death */ + struct list_head sock_tag_list; + /* No spinlock_t sock_tag_list_lock; use the global one. */ +}; + +/*----------------------------------------------*/ +#endif /* ifndef __XT_QTAGUID_INTERNAL_H__ */ diff --git a/net/netfilter/xt_qtaguid_print.c b/net/netfilter/xt_qtaguid_print.c new file mode 100644 index 00000000..39176785 --- /dev/null +++ b/net/netfilter/xt_qtaguid_print.c @@ -0,0 +1,556 @@ +/* + * Pretty printing Support for iptables xt_qtaguid module. + * + * (C) 2011 Google, Inc + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * Most of the functions in this file just waste time if DEBUG is not defined. + * The matching xt_qtaguid_print.h will static inline empty funcs if the needed + * debug flags ore not defined. 
+ * Those funcs that fail to allocate memory will panic as there is no need to + * hobble allong just pretending to do the requested work. + */ + +#define DEBUG + +#include +#include +#include +#include +#include +#include + + +#include "xt_qtaguid_internal.h" +#include "xt_qtaguid_print.h" + +#ifdef DDEBUG + +static void _bug_on_err_or_null(void *ptr) +{ + if (IS_ERR_OR_NULL(ptr)) { + pr_err("qtaguid: kmalloc failed\n"); + BUG(); + } +} + +char *pp_tag_t(tag_t *tag) +{ + char *res; + + if (!tag) + res = kasprintf(GFP_ATOMIC, "tag_t@null{}"); + else + res = kasprintf(GFP_ATOMIC, + "tag_t@%p{tag=0x%llx, uid=%u}", + tag, *tag, get_uid_from_tag(*tag)); + _bug_on_err_or_null(res); + return res; +} + +char *pp_data_counters(struct data_counters *dc, bool showValues) +{ + char *res; + + if (!dc) + res = kasprintf(GFP_ATOMIC, "data_counters@null{}"); + else if (showValues) + res = kasprintf( + GFP_ATOMIC, "data_counters@%p{" + "set0{" + "rx{" + "tcp{b=%llu, p=%llu}, " + "udp{b=%llu, p=%llu}," + "other{b=%llu, p=%llu}}, " + "tx{" + "tcp{b=%llu, p=%llu}, " + "udp{b=%llu, p=%llu}," + "other{b=%llu, p=%llu}}}, " + "set1{" + "rx{" + "tcp{b=%llu, p=%llu}, " + "udp{b=%llu, p=%llu}," + "other{b=%llu, p=%llu}}, " + "tx{" + "tcp{b=%llu, p=%llu}, " + "udp{b=%llu, p=%llu}," + "other{b=%llu, p=%llu}}}}", + dc, + dc->bpc[0][IFS_RX][IFS_TCP].bytes, + dc->bpc[0][IFS_RX][IFS_TCP].packets, + dc->bpc[0][IFS_RX][IFS_UDP].bytes, + dc->bpc[0][IFS_RX][IFS_UDP].packets, + dc->bpc[0][IFS_RX][IFS_PROTO_OTHER].bytes, + dc->bpc[0][IFS_RX][IFS_PROTO_OTHER].packets, + dc->bpc[0][IFS_TX][IFS_TCP].bytes, + dc->bpc[0][IFS_TX][IFS_TCP].packets, + dc->bpc[0][IFS_TX][IFS_UDP].bytes, + dc->bpc[0][IFS_TX][IFS_UDP].packets, + dc->bpc[0][IFS_TX][IFS_PROTO_OTHER].bytes, + dc->bpc[0][IFS_TX][IFS_PROTO_OTHER].packets, + dc->bpc[1][IFS_RX][IFS_TCP].bytes, + dc->bpc[1][IFS_RX][IFS_TCP].packets, + dc->bpc[1][IFS_RX][IFS_UDP].bytes, + dc->bpc[1][IFS_RX][IFS_UDP].packets, + dc->bpc[1][IFS_RX][IFS_PROTO_OTHER].bytes, + dc->bpc[1][IFS_RX][IFS_PROTO_OTHER].packets, + dc->bpc[1][IFS_TX][IFS_TCP].bytes, + dc->bpc[1][IFS_TX][IFS_TCP].packets, + dc->bpc[1][IFS_TX][IFS_UDP].bytes, + dc->bpc[1][IFS_TX][IFS_UDP].packets, + dc->bpc[1][IFS_TX][IFS_PROTO_OTHER].bytes, + dc->bpc[1][IFS_TX][IFS_PROTO_OTHER].packets); + else + res = kasprintf(GFP_ATOMIC, "data_counters@%p{...}", dc); + _bug_on_err_or_null(res); + return res; +} + +char *pp_tag_node(struct tag_node *tn) +{ + char *tag_str; + char *res; + + if (!tn) { + res = kasprintf(GFP_ATOMIC, "tag_node@null{}"); + _bug_on_err_or_null(res); + return res; + } + tag_str = pp_tag_t(&tn->tag); + res = kasprintf(GFP_ATOMIC, + "tag_node@%p{tag=%s}", + tn, tag_str); + _bug_on_err_or_null(res); + kfree(tag_str); + return res; +} + +char *pp_tag_ref(struct tag_ref *tr) +{ + char *tn_str; + char *res; + + if (!tr) { + res = kasprintf(GFP_ATOMIC, "tag_ref@null{}"); + _bug_on_err_or_null(res); + return res; + } + tn_str = pp_tag_node(&tr->tn); + res = kasprintf(GFP_ATOMIC, + "tag_ref@%p{%s, num_sock_tags=%d}", + tr, tn_str, tr->num_sock_tags); + _bug_on_err_or_null(res); + kfree(tn_str); + return res; +} + +char *pp_tag_stat(struct tag_stat *ts) +{ + char *tn_str; + char *counters_str; + char *parent_counters_str; + char *res; + + if (!ts) { + res = kasprintf(GFP_ATOMIC, "tag_stat@null{}"); + _bug_on_err_or_null(res); + return res; + } + tn_str = pp_tag_node(&ts->tn); + counters_str = pp_data_counters(&ts->counters, true); + parent_counters_str = pp_data_counters(ts->parent_counters, false); + res = 
kasprintf(GFP_ATOMIC, + "tag_stat@%p{%s, counters=%s, parent_counters=%s}", + ts, tn_str, counters_str, parent_counters_str); + _bug_on_err_or_null(res); + kfree(tn_str); + kfree(counters_str); + kfree(parent_counters_str); + return res; +} + +char *pp_iface_stat(struct iface_stat *is) +{ + char *res; + if (!is) + res = kasprintf(GFP_ATOMIC, "iface_stat@null{}"); + else + res = kasprintf(GFP_ATOMIC, "iface_stat@%p{" + "list=list_head{...}, " + "ifname=%s, " + "total={rx={bytes=%llu, " + "packets=%llu}, " + "tx={bytes=%llu, " + "packets=%llu}}, " + "last_known_valid=%d, " + "last_known={rx={bytes=%llu, " + "packets=%llu}, " + "tx={bytes=%llu, " + "packets=%llu}}, " + "active=%d, " + "net_dev=%p, " + "proc_ptr=%p, " + "tag_stat_tree=rb_root{...}}", + is, + is->ifname, + is->totals[IFS_RX].bytes, + is->totals[IFS_RX].packets, + is->totals[IFS_TX].bytes, + is->totals[IFS_TX].packets, + is->last_known_valid, + is->last_known[IFS_RX].bytes, + is->last_known[IFS_RX].packets, + is->last_known[IFS_TX].bytes, + is->last_known[IFS_TX].packets, + is->active, + is->net_dev, + is->proc_ptr); + _bug_on_err_or_null(res); + return res; +} + +char *pp_sock_tag(struct sock_tag *st) +{ + char *tag_str; + char *res; + + if (!st) { + res = kasprintf(GFP_ATOMIC, "sock_tag@null{}"); + _bug_on_err_or_null(res); + return res; + } + tag_str = pp_tag_t(&st->tag); + res = kasprintf(GFP_ATOMIC, "sock_tag@%p{" + "sock_node=rb_node{...}, " + "sk=%p socket=%p (f_count=%lu), list=list_head{...}, " + "pid=%u, tag=%s}", + st, st->sk, st->socket, atomic_long_read( + &st->socket->file->f_count), + st->pid, tag_str); + _bug_on_err_or_null(res); + kfree(tag_str); + return res; +} + +char *pp_uid_tag_data(struct uid_tag_data *utd) +{ + char *res; + + if (!utd) + res = kasprintf(GFP_ATOMIC, "uid_tag_data@null{}"); + else + res = kasprintf(GFP_ATOMIC, "uid_tag_data@%p{" + "uid=%u, num_active_acct_tags=%d, " + "num_pqd=%d, " + "tag_node_tree=rb_root{...}, " + "proc_qtu_data_tree=rb_root{...}}", + utd, utd->uid, + utd->num_active_tags, utd->num_pqd); + _bug_on_err_or_null(res); + return res; +} + +char *pp_proc_qtu_data(struct proc_qtu_data *pqd) +{ + char *parent_tag_data_str; + char *res; + + if (!pqd) { + res = kasprintf(GFP_ATOMIC, "proc_qtu_data@null{}"); + _bug_on_err_or_null(res); + return res; + } + parent_tag_data_str = pp_uid_tag_data(pqd->parent_tag_data); + res = kasprintf(GFP_ATOMIC, "proc_qtu_data@%p{" + "node=rb_node{...}, pid=%u, " + "parent_tag_data=%s, " + "sock_tag_list=list_head{...}}", + pqd, pqd->pid, parent_tag_data_str + ); + _bug_on_err_or_null(res); + kfree(parent_tag_data_str); + return res; +} + +/*------------------------------------------*/ +void prdebug_sock_tag_tree(int indent_level, + struct rb_root *sock_tag_tree) +{ + struct rb_node *node; + struct sock_tag *sock_tag_entry; + char *str; + + if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) + return; + + if (RB_EMPTY_ROOT(sock_tag_tree)) { + str = "sock_tag_tree=rb_root{}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + return; + } + + str = "sock_tag_tree=rb_root{"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + indent_level++; + for (node = rb_first(sock_tag_tree); + node; + node = rb_next(node)) { + sock_tag_entry = rb_entry(node, struct sock_tag, sock_node); + str = pp_sock_tag(sock_tag_entry); + pr_debug("%*d: %s,\n", indent_level*2, indent_level, str); + kfree(str); + } + indent_level--; + str = "}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); +} + +void prdebug_sock_tag_list(int indent_level, + 
struct list_head *sock_tag_list) +{ + struct sock_tag *sock_tag_entry; + char *str; + + if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) + return; + + if (list_empty(sock_tag_list)) { + str = "sock_tag_list=list_head{}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + return; + } + + str = "sock_tag_list=list_head{"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + indent_level++; + list_for_each_entry(sock_tag_entry, sock_tag_list, list) { + str = pp_sock_tag(sock_tag_entry); + pr_debug("%*d: %s,\n", indent_level*2, indent_level, str); + kfree(str); + } + indent_level--; + str = "}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); +} + +void prdebug_proc_qtu_data_tree(int indent_level, + struct rb_root *proc_qtu_data_tree) +{ + char *str; + struct rb_node *node; + struct proc_qtu_data *proc_qtu_data_entry; + + if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) + return; + + if (RB_EMPTY_ROOT(proc_qtu_data_tree)) { + str = "proc_qtu_data_tree=rb_root{}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + return; + } + + str = "proc_qtu_data_tree=rb_root{"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + indent_level++; + for (node = rb_first(proc_qtu_data_tree); + node; + node = rb_next(node)) { + proc_qtu_data_entry = rb_entry(node, + struct proc_qtu_data, + node); + str = pp_proc_qtu_data(proc_qtu_data_entry); + pr_debug("%*d: %s,\n", indent_level*2, indent_level, + str); + kfree(str); + indent_level++; + prdebug_sock_tag_list(indent_level, + &proc_qtu_data_entry->sock_tag_list); + indent_level--; + + } + indent_level--; + str = "}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); +} + +void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree) +{ + char *str; + struct rb_node *node; + struct tag_ref *tag_ref_entry; + + if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) + return; + + if (RB_EMPTY_ROOT(tag_ref_tree)) { + str = "tag_ref_tree{}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + return; + } + + str = "tag_ref_tree{"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + indent_level++; + for (node = rb_first(tag_ref_tree); + node; + node = rb_next(node)) { + tag_ref_entry = rb_entry(node, + struct tag_ref, + tn.node); + str = pp_tag_ref(tag_ref_entry); + pr_debug("%*d: %s,\n", indent_level*2, indent_level, + str); + kfree(str); + } + indent_level--; + str = "}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); +} + +void prdebug_uid_tag_data_tree(int indent_level, + struct rb_root *uid_tag_data_tree) +{ + char *str; + struct rb_node *node; + struct uid_tag_data *uid_tag_data_entry; + + if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) + return; + + if (RB_EMPTY_ROOT(uid_tag_data_tree)) { + str = "uid_tag_data_tree=rb_root{}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + return; + } + + str = "uid_tag_data_tree=rb_root{"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + indent_level++; + for (node = rb_first(uid_tag_data_tree); + node; + node = rb_next(node)) { + uid_tag_data_entry = rb_entry(node, struct uid_tag_data, + node); + str = pp_uid_tag_data(uid_tag_data_entry); + pr_debug("%*d: %s,\n", indent_level*2, indent_level, str); + kfree(str); + if (!RB_EMPTY_ROOT(&uid_tag_data_entry->tag_ref_tree)) { + indent_level++; + prdebug_tag_ref_tree(indent_level, + &uid_tag_data_entry->tag_ref_tree); + indent_level--; + } + } + indent_level--; + str = "}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); +} + +void 
prdebug_tag_stat_tree(int indent_level, + struct rb_root *tag_stat_tree) +{ + char *str; + struct rb_node *node; + struct tag_stat *ts_entry; + + if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) + return; + + if (RB_EMPTY_ROOT(tag_stat_tree)) { + str = "tag_stat_tree{}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + return; + } + + str = "tag_stat_tree{"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + indent_level++; + for (node = rb_first(tag_stat_tree); + node; + node = rb_next(node)) { + ts_entry = rb_entry(node, struct tag_stat, tn.node); + str = pp_tag_stat(ts_entry); + pr_debug("%*d: %s\n", indent_level*2, indent_level, + str); + kfree(str); + } + indent_level--; + str = "}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); +} + +void prdebug_iface_stat_list(int indent_level, + struct list_head *iface_stat_list) +{ + char *str; + struct iface_stat *iface_entry; + + if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) + return; + + if (list_empty(iface_stat_list)) { + str = "iface_stat_list=list_head{}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + return; + } + + str = "iface_stat_list=list_head{"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + indent_level++; + list_for_each_entry(iface_entry, iface_stat_list, list) { + str = pp_iface_stat(iface_entry); + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + kfree(str); + + spin_lock_bh(&iface_entry->tag_stat_list_lock); + if (!RB_EMPTY_ROOT(&iface_entry->tag_stat_tree)) { + indent_level++; + prdebug_tag_stat_tree(indent_level, + &iface_entry->tag_stat_tree); + indent_level--; + } + spin_unlock_bh(&iface_entry->tag_stat_list_lock); + } + indent_level--; + str = "}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); +} + +#endif /* ifdef DDEBUG */ +/*------------------------------------------*/ +static const char * const netdev_event_strings[] = { + "netdev_unknown", + "NETDEV_UP", + "NETDEV_DOWN", + "NETDEV_REBOOT", + "NETDEV_CHANGE", + "NETDEV_REGISTER", + "NETDEV_UNREGISTER", + "NETDEV_CHANGEMTU", + "NETDEV_CHANGEADDR", + "NETDEV_GOING_DOWN", + "NETDEV_CHANGENAME", + "NETDEV_FEAT_CHANGE", + "NETDEV_BONDING_FAILOVER", + "NETDEV_PRE_UP", + "NETDEV_PRE_TYPE_CHANGE", + "NETDEV_POST_TYPE_CHANGE", + "NETDEV_POST_INIT", + "NETDEV_UNREGISTER_BATCH", + "NETDEV_RELEASE", + "NETDEV_NOTIFY_PEERS", + "NETDEV_JOIN", +}; + +const char *netdev_evt_str(int netdev_event) +{ + if (netdev_event < 0 + || netdev_event >= ARRAY_SIZE(netdev_event_strings)) + return "bad event num"; + return netdev_event_strings[netdev_event]; +} diff --git a/net/netfilter/xt_qtaguid_print.h b/net/netfilter/xt_qtaguid_print.h new file mode 100644 index 00000000..b63871a0 --- /dev/null +++ b/net/netfilter/xt_qtaguid_print.h @@ -0,0 +1,120 @@ +/* + * Pretty printing Support for iptables xt_qtaguid module. + * + * (C) 2011 Google, Inc + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#ifndef __XT_QTAGUID_PRINT_H__ +#define __XT_QTAGUID_PRINT_H__ + +#include "xt_qtaguid_internal.h" + +#ifdef DDEBUG + +char *pp_tag_t(tag_t *tag); +char *pp_data_counters(struct data_counters *dc, bool showValues); +char *pp_tag_node(struct tag_node *tn); +char *pp_tag_ref(struct tag_ref *tr); +char *pp_tag_stat(struct tag_stat *ts); +char *pp_iface_stat(struct iface_stat *is); +char *pp_sock_tag(struct sock_tag *st); +char *pp_uid_tag_data(struct uid_tag_data *qtd); +char *pp_proc_qtu_data(struct proc_qtu_data *pqd); + +/*------------------------------------------*/ +void prdebug_sock_tag_list(int indent_level, + struct list_head *sock_tag_list); +void prdebug_sock_tag_tree(int indent_level, + struct rb_root *sock_tag_tree); +void prdebug_proc_qtu_data_tree(int indent_level, + struct rb_root *proc_qtu_data_tree); +void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree); +void prdebug_uid_tag_data_tree(int indent_level, + struct rb_root *uid_tag_data_tree); +void prdebug_tag_stat_tree(int indent_level, + struct rb_root *tag_stat_tree); +void prdebug_iface_stat_list(int indent_level, + struct list_head *iface_stat_list); + +#else + +/*------------------------------------------*/ +static inline char *pp_tag_t(tag_t *tag) +{ + return NULL; +} +static inline char *pp_data_counters(struct data_counters *dc, bool showValues) +{ + return NULL; +} +static inline char *pp_tag_node(struct tag_node *tn) +{ + return NULL; +} +static inline char *pp_tag_ref(struct tag_ref *tr) +{ + return NULL; +} +static inline char *pp_tag_stat(struct tag_stat *ts) +{ + return NULL; +} +static inline char *pp_iface_stat(struct iface_stat *is) +{ + return NULL; +} +static inline char *pp_sock_tag(struct sock_tag *st) +{ + return NULL; +} +static inline char *pp_uid_tag_data(struct uid_tag_data *qtd) +{ + return NULL; +} +static inline char *pp_proc_qtu_data(struct proc_qtu_data *pqd) +{ + return NULL; +} + +/*------------------------------------------*/ +static inline +void prdebug_sock_tag_list(int indent_level, + struct list_head *sock_tag_list) +{ +} +static inline +void prdebug_sock_tag_tree(int indent_level, + struct rb_root *sock_tag_tree) +{ +} +static inline +void prdebug_proc_qtu_data_tree(int indent_level, + struct rb_root *proc_qtu_data_tree) +{ +} +static inline +void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree) +{ +} +static inline +void prdebug_uid_tag_data_tree(int indent_level, + struct rb_root *uid_tag_data_tree) +{ +} +static inline +void prdebug_tag_stat_tree(int indent_level, + struct rb_root *tag_stat_tree) +{ +} +static inline +void prdebug_iface_stat_list(int indent_level, + struct list_head *iface_stat_list) +{ +} +#endif +/*------------------------------------------*/ +const char *netdev_evt_str(int netdev_event); +#endif /* ifndef __XT_QTAGUID_PRINT_H__ */ diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c new file mode 100644 index 00000000..70eb2b49 --- /dev/null +++ b/net/netfilter/xt_quota.c @@ -0,0 +1,89 @@ +/* + * netfilter module to enforce network quotas + * + * Sam Johnston + */ +#include +#include +#include + +#include +#include + +struct xt_quota_priv { + spinlock_t lock; + uint64_t quota; +}; + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Sam Johnston "); +MODULE_DESCRIPTION("Xtables: countdown quota match"); +MODULE_ALIAS("ipt_quota"); +MODULE_ALIAS("ip6t_quota"); + +static bool +quota_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct xt_quota_info *q = (void *)par->matchinfo; + struct xt_quota_priv 
*priv = q->master; + bool ret = q->flags & XT_QUOTA_INVERT; + + spin_lock_bh(&priv->lock); + if (priv->quota >= skb->len) { + priv->quota -= skb->len; + ret = !ret; + } else { + /* we do not allow even small packets from now on */ + priv->quota = 0; + } + spin_unlock_bh(&priv->lock); + + return ret; +} + +static int quota_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_quota_info *q = par->matchinfo; + + if (q->flags & ~XT_QUOTA_MASK) + return -EINVAL; + + q->master = kmalloc(sizeof(*q->master), GFP_KERNEL); + if (q->master == NULL) + return -ENOMEM; + + spin_lock_init(&q->master->lock); + q->master->quota = q->quota; + return 0; +} + +static void quota_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_quota_info *q = par->matchinfo; + + kfree(q->master); +} + +static struct xt_match quota_mt_reg __read_mostly = { + .name = "quota", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = quota_mt, + .checkentry = quota_mt_check, + .destroy = quota_mt_destroy, + .matchsize = sizeof(struct xt_quota_info), + .me = THIS_MODULE, +}; + +static int __init quota_mt_init(void) +{ + return xt_register_match("a_mt_reg); +} + +static void __exit quota_mt_exit(void) +{ + xt_unregister_match("a_mt_reg); +} + +module_init(quota_mt_init); +module_exit(quota_mt_exit); diff --git a/net/netfilter/xt_quota2.c b/net/netfilter/xt_quota2.c new file mode 100644 index 00000000..3c72bea2 --- /dev/null +++ b/net/netfilter/xt_quota2.c @@ -0,0 +1,381 @@ +/* + * xt_quota2 - enhanced xt_quota that can count upwards and in packets + * as a minimal accounting match. + * by Jan Engelhardt , 2008 + * + * Originally based on xt_quota.c: + * netfilter module to enforce network quotas + * Sam Johnston + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License; either + * version 2 of the License, as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include + +#include +#include +#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG +#include +#endif + +/** + * @lock: lock to protect quota writers from each other + */ +struct xt_quota_counter { + u_int64_t quota; + spinlock_t lock; + struct list_head list; + atomic_t ref; + char name[sizeof(((struct xt_quota_mtinfo2 *)NULL)->name)]; + struct proc_dir_entry *procfs_entry; +}; + +#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG +/* Harald's favorite number +1 :D From ipt_ULOG.C */ +static int qlog_nl_event = 112; +module_param_named(event_num, qlog_nl_event, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(event_num, + "Event number for NETLINK_NFLOG message. 0 disables log." 
+ "111 is what ipt_ULOG uses."); +static struct sock *nflognl; +#endif + +static LIST_HEAD(counter_list); +static DEFINE_SPINLOCK(counter_list_lock); + +static struct proc_dir_entry *proc_xt_quota; +static unsigned int quota_list_perms = S_IRUGO | S_IWUSR; +static unsigned int quota_list_uid = 0; +static unsigned int quota_list_gid = 0; +module_param_named(perms, quota_list_perms, uint, S_IRUGO | S_IWUSR); +module_param_named(uid, quota_list_uid, uint, S_IRUGO | S_IWUSR); +module_param_named(gid, quota_list_gid, uint, S_IRUGO | S_IWUSR); + + +#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG +static void quota2_log(unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const char *prefix) +{ + ulog_packet_msg_t *pm; + struct sk_buff *log_skb; + size_t size; + struct nlmsghdr *nlh; + + if (!qlog_nl_event) + return; + + size = NLMSG_SPACE(sizeof(*pm)); + size = max(size, (size_t)NLMSG_GOODSIZE); + log_skb = alloc_skb(size, GFP_ATOMIC); + if (!log_skb) { + pr_err("xt_quota2: cannot alloc skb for logging\n"); + return; + } + + /* NLMSG_PUT() uses "goto nlmsg_failure" */ + nlh = NLMSG_PUT(log_skb, /*pid*/0, /*seq*/0, qlog_nl_event, + sizeof(*pm)); + pm = NLMSG_DATA(nlh); + if (skb->tstamp.tv64 == 0) + __net_timestamp((struct sk_buff *)skb); + pm->data_len = 0; + pm->hook = hooknum; + if (prefix != NULL) + strlcpy(pm->prefix, prefix, sizeof(pm->prefix)); + else + *(pm->prefix) = '\0'; + if (in) + strlcpy(pm->indev_name, in->name, sizeof(pm->indev_name)); + else + pm->indev_name[0] = '\0'; + + if (out) + strlcpy(pm->outdev_name, out->name, sizeof(pm->outdev_name)); + else + pm->outdev_name[0] = '\0'; + + NETLINK_CB(log_skb).dst_group = 1; + pr_debug("throwing 1 packets to netlink group 1\n"); + netlink_broadcast(nflognl, log_skb, 0, 1, GFP_ATOMIC); + +nlmsg_failure: /* Used within NLMSG_PUT() */ + pr_debug("xt_quota2: error during NLMSG_PUT\n"); +} +#else +static void quota2_log(unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const char *prefix) +{ +} +#endif /* if+else CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG */ + +static int quota_proc_read(char *page, char **start, off_t offset, + int count, int *eof, void *data) +{ + struct xt_quota_counter *e = data; + int ret; + + spin_lock_bh(&e->lock); + ret = snprintf(page, PAGE_SIZE, "%llu\n", e->quota); + spin_unlock_bh(&e->lock); + return ret; +} + +static int quota_proc_write(struct file *file, const char __user *input, + unsigned long size, void *data) +{ + struct xt_quota_counter *e = data; + char buf[sizeof("18446744073709551616")]; + + if (size > sizeof(buf)) + size = sizeof(buf); + if (copy_from_user(buf, input, size) != 0) + return -EFAULT; + buf[sizeof(buf)-1] = '\0'; + + spin_lock_bh(&e->lock); + e->quota = simple_strtoull(buf, NULL, 0); + spin_unlock_bh(&e->lock); + return size; +} + +static struct xt_quota_counter * +q2_new_counter(const struct xt_quota_mtinfo2 *q, bool anon) +{ + struct xt_quota_counter *e; + unsigned int size; + + /* Do not need all the procfs things for anonymous counters. */ + size = anon ? 
offsetof(typeof(*e), list) : sizeof(*e); + e = kmalloc(size, GFP_KERNEL); + if (e == NULL) + return NULL; + + e->quota = q->quota; + spin_lock_init(&e->lock); + if (!anon) { + INIT_LIST_HEAD(&e->list); + atomic_set(&e->ref, 1); + strlcpy(e->name, q->name, sizeof(e->name)); + } + return e; +} + +/** + * q2_get_counter - get ref to counter or create new + * @name: name of counter + */ +static struct xt_quota_counter * +q2_get_counter(const struct xt_quota_mtinfo2 *q) +{ + struct proc_dir_entry *p; + struct xt_quota_counter *e = NULL; + struct xt_quota_counter *new_e; + + if (*q->name == '\0') + return q2_new_counter(q, true); + + /* No need to hold a lock while getting a new counter */ + new_e = q2_new_counter(q, false); + if (new_e == NULL) + goto out; + + spin_lock_bh(&counter_list_lock); + list_for_each_entry(e, &counter_list, list) + if (strcmp(e->name, q->name) == 0) { + atomic_inc(&e->ref); + spin_unlock_bh(&counter_list_lock); + kfree(new_e); + pr_debug("xt_quota2: old counter name=%s", e->name); + return e; + } + e = new_e; + pr_debug("xt_quota2: new_counter name=%s", e->name); + list_add_tail(&e->list, &counter_list); + /* The entry having a refcount of 1 is not directly destructible. + * This func has not yet returned the new entry, thus iptables + * has not references for destroying this entry. + * For another rule to try to destroy it, it would 1st need for this + * func* to be re-invoked, acquire a new ref for the same named quota. + * Nobody will access the e->procfs_entry either. + * So release the lock. */ + spin_unlock_bh(&counter_list_lock); + + /* create_proc_entry() is not spin_lock happy */ + p = e->procfs_entry = create_proc_entry(e->name, quota_list_perms, + proc_xt_quota); + + if (IS_ERR_OR_NULL(p)) { + spin_lock_bh(&counter_list_lock); + list_del(&e->list); + spin_unlock_bh(&counter_list_lock); + goto out; + } + p->data = e; + p->read_proc = quota_proc_read; + p->write_proc = quota_proc_write; + p->uid = quota_list_uid; + p->gid = quota_list_gid; + return e; + + out: + kfree(e); + return NULL; +} + +static int quota_mt2_check(const struct xt_mtchk_param *par) +{ + struct xt_quota_mtinfo2 *q = par->matchinfo; + + pr_debug("xt_quota2: check() flags=0x%04x", q->flags); + + if (q->flags & ~XT_QUOTA_MASK) + return -EINVAL; + + q->name[sizeof(q->name)-1] = '\0'; + if (*q->name == '.' || strchr(q->name, '/') != NULL) { + printk(KERN_ERR "xt_quota.3: illegal name\n"); + return -EINVAL; + } + + q->master = q2_get_counter(q); + if (q->master == NULL) { + printk(KERN_ERR "xt_quota.3: memory alloc failure\n"); + return -ENOMEM; + } + + return 0; +} + +static void quota_mt2_destroy(const struct xt_mtdtor_param *par) +{ + struct xt_quota_mtinfo2 *q = par->matchinfo; + struct xt_quota_counter *e = q->master; + + if (*q->name == '\0') { + kfree(e); + return; + } + + spin_lock_bh(&counter_list_lock); + if (!atomic_dec_and_test(&e->ref)) { + spin_unlock_bh(&counter_list_lock); + return; + } + + list_del(&e->list); + remove_proc_entry(e->name, proc_xt_quota); + spin_unlock_bh(&counter_list_lock); + kfree(e); +} + +static bool +quota_mt2(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct xt_quota_mtinfo2 *q = (void *)par->matchinfo; + struct xt_quota_counter *e = q->master; + bool ret = q->flags & XT_QUOTA_INVERT; + + spin_lock_bh(&e->lock); + if (q->flags & XT_QUOTA_GROW) { + /* + * While no_change is pointless in "grow" mode, we will + * implement it here simply to have a consistent behavior. 
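+ *
+ * Worked example of the update below (illustrative numbers, assuming
+ * XT_QUOTA_NO_CHANGE is clear): with XT_QUOTA_GROW|XT_QUOTA_PACKET set,
+ * three matched packets leave e->quota == 3; with XT_QUOTA_GROW alone,
+ * three 1500-byte packets leave e->quota == 4500. Without XT_QUOTA_GROW
+ * the counter counts down instead and, once it can no longer cover
+ * skb->len, it is zeroed so the rule stops matching (modulo
+ * XT_QUOTA_INVERT).
+ *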
+ */ + if (!(q->flags & XT_QUOTA_NO_CHANGE)) { + e->quota += (q->flags & XT_QUOTA_PACKET) ? 1 : skb->len; + } + ret = true; + } else { + if (e->quota >= skb->len) { + if (!(q->flags & XT_QUOTA_NO_CHANGE)) + e->quota -= (q->flags & XT_QUOTA_PACKET) ? 1 : skb->len; + ret = !ret; + } else { + /* We are transitioning, log that fact. */ + if (e->quota) { + quota2_log(par->hooknum, + skb, + par->in, + par->out, + q->name); + } + /* we do not allow even small packets from now on */ + e->quota = 0; + } + } + spin_unlock_bh(&e->lock); + return ret; +} + +static struct xt_match quota_mt2_reg[] __read_mostly = { + { + .name = "quota2", + .revision = 3, + .family = NFPROTO_IPV4, + .checkentry = quota_mt2_check, + .match = quota_mt2, + .destroy = quota_mt2_destroy, + .matchsize = sizeof(struct xt_quota_mtinfo2), + .me = THIS_MODULE, + }, + { + .name = "quota2", + .revision = 3, + .family = NFPROTO_IPV6, + .checkentry = quota_mt2_check, + .match = quota_mt2, + .destroy = quota_mt2_destroy, + .matchsize = sizeof(struct xt_quota_mtinfo2), + .me = THIS_MODULE, + }, +}; + +static int __init quota_mt2_init(void) +{ + int ret; + pr_debug("xt_quota2: init()"); + +#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG + nflognl = netlink_kernel_create(&init_net, + NETLINK_NFLOG, 1, NULL, + NULL, THIS_MODULE); + if (!nflognl) + return -ENOMEM; +#endif + + proc_xt_quota = proc_mkdir("xt_quota", init_net.proc_net); + if (proc_xt_quota == NULL) + return -EACCES; + + ret = xt_register_matches(quota_mt2_reg, ARRAY_SIZE(quota_mt2_reg)); + if (ret < 0) + remove_proc_entry("xt_quota", init_net.proc_net); + pr_debug("xt_quota2: init() %d", ret); + return ret; +} + +static void __exit quota_mt2_exit(void) +{ + xt_unregister_matches(quota_mt2_reg, ARRAY_SIZE(quota_mt2_reg)); + remove_proc_entry("xt_quota", init_net.proc_net); +} + +module_init(quota_mt2_init); +module_exit(quota_mt2_exit); +MODULE_DESCRIPTION("Xtables: countdown quota match; up counter"); +MODULE_AUTHOR("Sam Johnston "); +MODULE_AUTHOR("Jan Engelhardt "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_quota2"); +MODULE_ALIAS("ip6t_quota2"); diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c new file mode 100644 index 00000000..76a08318 --- /dev/null +++ b/net/netfilter/xt_rateest.c @@ -0,0 +1,158 @@ +/* + * (C) 2007 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include + +#include +#include +#include + + +static bool +xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_rateest_match_info *info = par->matchinfo; + struct gnet_stats_rate_est *r; + u_int32_t bps1, bps2, pps1, pps2; + bool ret = true; + + spin_lock_bh(&info->est1->lock); + r = &info->est1->rstats; + if (info->flags & XT_RATEEST_MATCH_DELTA) { + bps1 = info->bps1 >= r->bps ? info->bps1 - r->bps : 0; + pps1 = info->pps1 >= r->pps ? info->pps1 - r->pps : 0; + } else { + bps1 = r->bps; + pps1 = r->pps; + } + spin_unlock_bh(&info->est1->lock); + + if (info->flags & XT_RATEEST_MATCH_ABS) { + bps2 = info->bps2; + pps2 = info->pps2; + } else { + spin_lock_bh(&info->est2->lock); + r = &info->est2->rstats; + if (info->flags & XT_RATEEST_MATCH_DELTA) { + bps2 = info->bps2 >= r->bps ? info->bps2 - r->bps : 0; + pps2 = info->pps2 >= r->pps ? 
info->pps2 - r->pps : 0; + } else { + bps2 = r->bps; + pps2 = r->pps; + } + spin_unlock_bh(&info->est2->lock); + } + + switch (info->mode) { + case XT_RATEEST_MATCH_LT: + if (info->flags & XT_RATEEST_MATCH_BPS) + ret &= bps1 < bps2; + if (info->flags & XT_RATEEST_MATCH_PPS) + ret &= pps1 < pps2; + break; + case XT_RATEEST_MATCH_GT: + if (info->flags & XT_RATEEST_MATCH_BPS) + ret &= bps1 > bps2; + if (info->flags & XT_RATEEST_MATCH_PPS) + ret &= pps1 > pps2; + break; + case XT_RATEEST_MATCH_EQ: + if (info->flags & XT_RATEEST_MATCH_BPS) + ret &= bps1 == bps2; + if (info->flags & XT_RATEEST_MATCH_PPS) + ret &= pps1 == pps2; + break; + } + + ret ^= info->flags & XT_RATEEST_MATCH_INVERT ? true : false; + return ret; +} + +static int xt_rateest_mt_checkentry(const struct xt_mtchk_param *par) +{ + struct xt_rateest_match_info *info = par->matchinfo; + struct xt_rateest *est1, *est2; + int ret = false; + + if (hweight32(info->flags & (XT_RATEEST_MATCH_ABS | + XT_RATEEST_MATCH_REL)) != 1) + goto err1; + + if (!(info->flags & (XT_RATEEST_MATCH_BPS | XT_RATEEST_MATCH_PPS))) + goto err1; + + switch (info->mode) { + case XT_RATEEST_MATCH_EQ: + case XT_RATEEST_MATCH_LT: + case XT_RATEEST_MATCH_GT: + break; + default: + goto err1; + } + + ret = -ENOENT; + est1 = xt_rateest_lookup(info->name1); + if (!est1) + goto err1; + + if (info->flags & XT_RATEEST_MATCH_REL) { + est2 = xt_rateest_lookup(info->name2); + if (!est2) + goto err2; + } else + est2 = NULL; + + + info->est1 = est1; + info->est2 = est2; + return 0; + +err2: + xt_rateest_put(est1); +err1: + return -EINVAL; +} + +static void xt_rateest_mt_destroy(const struct xt_mtdtor_param *par) +{ + struct xt_rateest_match_info *info = par->matchinfo; + + xt_rateest_put(info->est1); + if (info->est2) + xt_rateest_put(info->est2); +} + +static struct xt_match xt_rateest_mt_reg __read_mostly = { + .name = "rateest", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = xt_rateest_mt, + .checkentry = xt_rateest_mt_checkentry, + .destroy = xt_rateest_mt_destroy, + .matchsize = sizeof(struct xt_rateest_match_info), + .me = THIS_MODULE, +}; + +static int __init xt_rateest_mt_init(void) +{ + return xt_register_match(&xt_rateest_mt_reg); +} + +static void __exit xt_rateest_mt_fini(void) +{ + xt_unregister_match(&xt_rateest_mt_reg); +} + +MODULE_AUTHOR("Patrick McHardy "); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("xtables rate estimator match"); +MODULE_ALIAS("ipt_rateest"); +MODULE_ALIAS("ip6t_rateest"); +module_init(xt_rateest_mt_init); +module_exit(xt_rateest_mt_fini); diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c new file mode 100644 index 00000000..459a7b25 --- /dev/null +++ b/net/netfilter/xt_realm.c @@ -0,0 +1,54 @@ +/* IP tables module for matching the routing realm + * + * (C) 2003 by Sampsa Ranta + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include + +MODULE_AUTHOR("Sampsa Ranta "); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: Routing realm match"); +MODULE_ALIAS("ipt_realm"); + +static bool +realm_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_realm_info *info = par->matchinfo; + const struct dst_entry *dst = skb_dst(skb); + + return (info->id == (dst->tclassid & info->mask)) ^ info->invert; +} + +static struct xt_match realm_mt_reg __read_mostly = { + .name = "realm", + .match = realm_mt, + .matchsize = sizeof(struct xt_realm_info), + .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), + .family = NFPROTO_UNSPEC, + .me = THIS_MODULE +}; + +static int __init realm_mt_init(void) +{ + return xt_register_match(&realm_mt_reg); +} + +static void __exit realm_mt_exit(void) +{ + xt_unregister_match(&realm_mt_reg); +} + +module_init(realm_mt_init); +module_exit(realm_mt_exit); diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c new file mode 100644 index 00000000..d2ff15a2 --- /dev/null +++ b/net/netfilter/xt_recent.c @@ -0,0 +1,668 @@ +/* + * Copyright (c) 2006 Patrick McHardy + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This is a replacement of the old ipt_recent module, which carried the + * following copyright notice: + * + * Author: Stephen Frost + * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Patrick McHardy "); +MODULE_AUTHOR("Jan Engelhardt "); +MODULE_DESCRIPTION("Xtables: \"recently-seen\" host matching"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_recent"); +MODULE_ALIAS("ip6t_recent"); + +static unsigned int ip_list_tot = 100; +static unsigned int ip_pkt_list_tot = 20; +static unsigned int ip_list_hash_size = 0; +static unsigned int ip_list_perms = 0644; +static unsigned int ip_list_uid = 0; +static unsigned int ip_list_gid = 0; +module_param(ip_list_tot, uint, 0400); +module_param(ip_pkt_list_tot, uint, 0400); +module_param(ip_list_hash_size, uint, 0400); +module_param(ip_list_perms, uint, 0400); +module_param(ip_list_uid, uint, S_IRUGO | S_IWUSR); +module_param(ip_list_gid, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list"); +MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP address to remember (max. 
255)"); +MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs"); +MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/xt_recent/* files"); +MODULE_PARM_DESC(ip_list_uid, "default owner of /proc/net/xt_recent/* files"); +MODULE_PARM_DESC(ip_list_gid, "default owning group of /proc/net/xt_recent/* files"); + +struct recent_entry { + struct list_head list; + struct list_head lru_list; + union nf_inet_addr addr; + u_int16_t family; + u_int8_t ttl; + u_int8_t index; + u_int16_t nstamps; + unsigned long stamps[0]; +}; + +struct recent_table { + struct list_head list; + char name[XT_RECENT_NAME_LEN]; + unsigned int refcnt; + unsigned int entries; + struct list_head lru_list; + struct list_head iphash[0]; +}; + +struct recent_net { + struct list_head tables; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *xt_recent; +#endif +}; + +static int recent_net_id; +static inline struct recent_net *recent_pernet(struct net *net) +{ + return net_generic(net, recent_net_id); +} + +static DEFINE_SPINLOCK(recent_lock); +static DEFINE_MUTEX(recent_mutex); + +#ifdef CONFIG_PROC_FS +static const struct file_operations recent_old_fops, recent_mt_fops; +#endif + +static u_int32_t hash_rnd __read_mostly; +static bool hash_rnd_inited __read_mostly; + +static inline unsigned int recent_entry_hash4(const union nf_inet_addr *addr) +{ + return jhash_1word((__force u32)addr->ip, hash_rnd) & + (ip_list_hash_size - 1); +} + +static inline unsigned int recent_entry_hash6(const union nf_inet_addr *addr) +{ + return jhash2((u32 *)addr->ip6, ARRAY_SIZE(addr->ip6), hash_rnd) & + (ip_list_hash_size - 1); +} + +static struct recent_entry * +recent_entry_lookup(const struct recent_table *table, + const union nf_inet_addr *addrp, u_int16_t family, + u_int8_t ttl) +{ + struct recent_entry *e; + unsigned int h; + + if (family == NFPROTO_IPV4) + h = recent_entry_hash4(addrp); + else + h = recent_entry_hash6(addrp); + + list_for_each_entry(e, &table->iphash[h], list) + if (e->family == family && + memcmp(&e->addr, addrp, sizeof(e->addr)) == 0 && + (ttl == e->ttl || ttl == 0 || e->ttl == 0)) + return e; + return NULL; +} + +static void recent_entry_remove(struct recent_table *t, struct recent_entry *e) +{ + list_del(&e->list); + list_del(&e->lru_list); + kfree(e); + t->entries--; +} + +/* + * Drop entries with timestamps older then 'time'. + */ +static void recent_entry_reap(struct recent_table *t, unsigned long time) +{ + struct recent_entry *e; + + /* + * The head of the LRU list is always the oldest entry. + */ + e = list_entry(t->lru_list.next, struct recent_entry, lru_list); + + /* + * The last time stamp is the most recent. 
+ */ + if (time_after(time, e->stamps[e->index-1])) + recent_entry_remove(t, e); +} + +static struct recent_entry * +recent_entry_init(struct recent_table *t, const union nf_inet_addr *addr, + u_int16_t family, u_int8_t ttl) +{ + struct recent_entry *e; + + if (t->entries >= ip_list_tot) { + e = list_entry(t->lru_list.next, struct recent_entry, lru_list); + recent_entry_remove(t, e); + } + e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * ip_pkt_list_tot, + GFP_ATOMIC); + if (e == NULL) + return NULL; + memcpy(&e->addr, addr, sizeof(e->addr)); + e->ttl = ttl; + e->stamps[0] = jiffies; + e->nstamps = 1; + e->index = 1; + e->family = family; + if (family == NFPROTO_IPV4) + list_add_tail(&e->list, &t->iphash[recent_entry_hash4(addr)]); + else + list_add_tail(&e->list, &t->iphash[recent_entry_hash6(addr)]); + list_add_tail(&e->lru_list, &t->lru_list); + t->entries++; + return e; +} + +static void recent_entry_update(struct recent_table *t, struct recent_entry *e) +{ + e->index %= ip_pkt_list_tot; + e->stamps[e->index++] = jiffies; + if (e->index > e->nstamps) + e->nstamps = e->index; + list_move_tail(&e->lru_list, &t->lru_list); +} + +static struct recent_table *recent_table_lookup(struct recent_net *recent_net, + const char *name) +{ + struct recent_table *t; + + list_for_each_entry(t, &recent_net->tables, list) + if (!strcmp(t->name, name)) + return t; + return NULL; +} + +static void recent_table_flush(struct recent_table *t) +{ + struct recent_entry *e, *next; + unsigned int i; + + for (i = 0; i < ip_list_hash_size; i++) + list_for_each_entry_safe(e, next, &t->iphash[i], list) + recent_entry_remove(t, e); +} + +static bool +recent_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct net *net = dev_net(par->in ? par->in : par->out); + struct recent_net *recent_net = recent_pernet(net); + const struct xt_recent_mtinfo *info = par->matchinfo; + struct recent_table *t; + struct recent_entry *e; + union nf_inet_addr addr = {}; + u_int8_t ttl; + bool ret = info->invert; + + if (par->family == NFPROTO_IPV4) { + const struct iphdr *iph = ip_hdr(skb); + + if (info->side == XT_RECENT_DEST) + addr.ip = iph->daddr; + else + addr.ip = iph->saddr; + + ttl = iph->ttl; + } else { + const struct ipv6hdr *iph = ipv6_hdr(skb); + + if (info->side == XT_RECENT_DEST) + memcpy(&addr.in6, &iph->daddr, sizeof(addr.in6)); + else + memcpy(&addr.in6, &iph->saddr, sizeof(addr.in6)); + + ttl = iph->hop_limit; + } + + /* use TTL as seen before forwarding */ + if (par->out != NULL && skb->sk == NULL) + ttl++; + + spin_lock_bh(&recent_lock); + t = recent_table_lookup(recent_net, info->name); + e = recent_entry_lookup(t, &addr, par->family, + (info->check_set & XT_RECENT_TTL) ? 
ttl : 0); + if (e == NULL) { + if (!(info->check_set & XT_RECENT_SET)) + goto out; + e = recent_entry_init(t, &addr, par->family, ttl); + if (e == NULL) + par->hotdrop = true; + ret = !ret; + goto out; + } + + if (info->check_set & XT_RECENT_SET) + ret = !ret; + else if (info->check_set & XT_RECENT_REMOVE) { + recent_entry_remove(t, e); + ret = !ret; + } else if (info->check_set & (XT_RECENT_CHECK | XT_RECENT_UPDATE)) { + unsigned long time = jiffies - info->seconds * HZ; + unsigned int i, hits = 0; + + for (i = 0; i < e->nstamps; i++) { + if (info->seconds && time_after(time, e->stamps[i])) + continue; + if (!info->hit_count || ++hits >= info->hit_count) { + ret = !ret; + break; + } + } + + /* info->seconds must be non-zero */ + if (info->check_set & XT_RECENT_REAP) + recent_entry_reap(t, time); + } + + if (info->check_set & XT_RECENT_SET || + (info->check_set & XT_RECENT_UPDATE && ret)) { + recent_entry_update(t, e); + e->ttl = ttl; + } +out: + spin_unlock_bh(&recent_lock); + return ret; +} + +static int recent_mt_check(const struct xt_mtchk_param *par) +{ + struct recent_net *recent_net = recent_pernet(par->net); + const struct xt_recent_mtinfo *info = par->matchinfo; + struct recent_table *t; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *pde; +#endif + unsigned i; + int ret = -EINVAL; + + if (unlikely(!hash_rnd_inited)) { + get_random_bytes(&hash_rnd, sizeof(hash_rnd)); + hash_rnd_inited = true; + } + if (info->check_set & ~XT_RECENT_VALID_FLAGS) { + pr_info("Unsupported user space flags (%08x)\n", + info->check_set); + return -EINVAL; + } + if (hweight8(info->check_set & + (XT_RECENT_SET | XT_RECENT_REMOVE | + XT_RECENT_CHECK | XT_RECENT_UPDATE)) != 1) + return -EINVAL; + if ((info->check_set & (XT_RECENT_SET | XT_RECENT_REMOVE)) && + (info->seconds || info->hit_count || + (info->check_set & XT_RECENT_MODIFIERS))) + return -EINVAL; + if ((info->check_set & XT_RECENT_REAP) && !info->seconds) + return -EINVAL; + if (info->hit_count > ip_pkt_list_tot) { + pr_info("hitcount (%u) is larger than " + "packets to be remembered (%u)\n", + info->hit_count, ip_pkt_list_tot); + return -EINVAL; + } + if (info->name[0] == '\0' || + strnlen(info->name, XT_RECENT_NAME_LEN) == XT_RECENT_NAME_LEN) + return -EINVAL; + + mutex_lock(&recent_mutex); + t = recent_table_lookup(recent_net, info->name); + if (t != NULL) { + t->refcnt++; + ret = 0; + goto out; + } + + t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size, + GFP_KERNEL); + if (t == NULL) { + ret = -ENOMEM; + goto out; + } + t->refcnt = 1; + strcpy(t->name, info->name); + INIT_LIST_HEAD(&t->lru_list); + for (i = 0; i < ip_list_hash_size; i++) + INIT_LIST_HEAD(&t->iphash[i]); +#ifdef CONFIG_PROC_FS + pde = proc_create_data(t->name, ip_list_perms, recent_net->xt_recent, + &recent_mt_fops, t); + if (pde == NULL) { + kfree(t); + ret = -ENOMEM; + goto out; + } + pde->uid = ip_list_uid; + pde->gid = ip_list_gid; +#endif + spin_lock_bh(&recent_lock); + list_add_tail(&t->list, &recent_net->tables); + spin_unlock_bh(&recent_lock); + ret = 0; +out: + mutex_unlock(&recent_mutex); + return ret; +} + +static void recent_mt_destroy(const struct xt_mtdtor_param *par) +{ + struct recent_net *recent_net = recent_pernet(par->net); + const struct xt_recent_mtinfo *info = par->matchinfo; + struct recent_table *t; + + mutex_lock(&recent_mutex); + t = recent_table_lookup(recent_net, info->name); + if (--t->refcnt == 0) { + spin_lock_bh(&recent_lock); + list_del(&t->list); + spin_unlock_bh(&recent_lock); +#ifdef CONFIG_PROC_FS + remove_proc_entry(t->name, 
recent_net->xt_recent); +#endif + recent_table_flush(t); + kfree(t); + } + mutex_unlock(&recent_mutex); +} + +#ifdef CONFIG_PROC_FS +struct recent_iter_state { + const struct recent_table *table; + unsigned int bucket; +}; + +static void *recent_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(recent_lock) +{ + struct recent_iter_state *st = seq->private; + const struct recent_table *t = st->table; + struct recent_entry *e; + loff_t p = *pos; + + spin_lock_bh(&recent_lock); + + for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++) + list_for_each_entry(e, &t->iphash[st->bucket], list) + if (p-- == 0) + return e; + return NULL; +} + +static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct recent_iter_state *st = seq->private; + const struct recent_table *t = st->table; + const struct recent_entry *e = v; + const struct list_head *head = e->list.next; + + while (head == &t->iphash[st->bucket]) { + if (++st->bucket >= ip_list_hash_size) + return NULL; + head = t->iphash[st->bucket].next; + } + (*pos)++; + return list_entry(head, struct recent_entry, list); +} + +static void recent_seq_stop(struct seq_file *s, void *v) + __releases(recent_lock) +{ + spin_unlock_bh(&recent_lock); +} + +static int recent_seq_show(struct seq_file *seq, void *v) +{ + const struct recent_entry *e = v; + unsigned int i; + + i = (e->index - 1) % ip_pkt_list_tot; + if (e->family == NFPROTO_IPV4) + seq_printf(seq, "src=%pI4 ttl: %u last_seen: %lu oldest_pkt: %u", + &e->addr.ip, e->ttl, e->stamps[i], e->index); + else + seq_printf(seq, "src=%pI6 ttl: %u last_seen: %lu oldest_pkt: %u", + &e->addr.in6, e->ttl, e->stamps[i], e->index); + for (i = 0; i < e->nstamps; i++) + seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]); + seq_printf(seq, "\n"); + return 0; +} + +static const struct seq_operations recent_seq_ops = { + .start = recent_seq_start, + .next = recent_seq_next, + .stop = recent_seq_stop, + .show = recent_seq_show, +}; + +static int recent_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *pde = PDE(inode); + struct recent_iter_state *st; + + st = __seq_open_private(file, &recent_seq_ops, sizeof(*st)); + if (st == NULL) + return -ENOMEM; + + st->table = pde->data; + return 0; +} + +static ssize_t +recent_mt_proc_write(struct file *file, const char __user *input, + size_t size, loff_t *loff) +{ + const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + struct recent_table *t = pde->data; + struct recent_entry *e; + char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")]; + const char *c = buf; + union nf_inet_addr addr = {}; + u_int16_t family; + bool add, succ; + + if (size == 0) + return 0; + if (size > sizeof(buf)) + size = sizeof(buf); + if (copy_from_user(buf, input, size) != 0) + return -EFAULT; + + /* Strict protocol! 
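For orientation, a minimal userspace sketch of driving this write interface; the table name and the address are assumptions for illustration (tables appear under /proc/net/xt_recent/ with the name given to the match rule):

// One command per write(), as the strict-offset check that follows
// requires: "+addr" adds an entry, "-addr" removes it, "/" flushes.
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int xt_recent_cmd(const char *table, const char *cmd)
{
	char path[128], line[64];
	int fd, ret;

	snprintf(path, sizeof(path), "/proc/net/xt_recent/%s", table);
	snprintf(line, sizeof(line), "%s\n", cmd);

	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	ret = write(fd, line, strlen(line)) < 0 ? -1 : 0;
	close(fd);
	return ret;
}

// xt_recent_cmd("DEFAULT", "+192.0.2.1");   add an address
// xt_recent_cmd("DEFAULT", "-192.0.2.1");   remove it again
// xt_recent_cmd("DEFAULT", "/");            flush the whole table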
*/ + if (*loff != 0) + return -ESPIPE; + switch (*c) { + case '/': /* flush table */ + spin_lock_bh(&recent_lock); + recent_table_flush(t); + spin_unlock_bh(&recent_lock); + return size; + case '-': /* remove address */ + add = false; + break; + case '+': /* add address */ + add = true; + break; + default: + pr_info("Need \"+ip\", \"-ip\" or \"/\"\n"); + return -EINVAL; + } + + ++c; + --size; + if (strnchr(c, size, ':') != NULL) { + family = NFPROTO_IPV6; + succ = in6_pton(c, size, (void *)&addr, '\n', NULL); + } else { + family = NFPROTO_IPV4; + succ = in4_pton(c, size, (void *)&addr, '\n', NULL); + } + + if (!succ) { + pr_info("illegal address written to procfs\n"); + return -EINVAL; + } + + spin_lock_bh(&recent_lock); + e = recent_entry_lookup(t, &addr, family, 0); + if (e == NULL) { + if (add) + recent_entry_init(t, &addr, family, 0); + } else { + if (add) + recent_entry_update(t, e); + else + recent_entry_remove(t, e); + } + spin_unlock_bh(&recent_lock); + /* Note we removed one above */ + *loff += size + 1; + return size + 1; +} + +static const struct file_operations recent_mt_fops = { + .open = recent_seq_open, + .read = seq_read, + .write = recent_mt_proc_write, + .release = seq_release_private, + .owner = THIS_MODULE, + .llseek = seq_lseek, +}; + +static int __net_init recent_proc_net_init(struct net *net) +{ + struct recent_net *recent_net = recent_pernet(net); + + recent_net->xt_recent = proc_mkdir("xt_recent", net->proc_net); + if (!recent_net->xt_recent) + return -ENOMEM; + return 0; +} + +static void __net_exit recent_proc_net_exit(struct net *net) +{ + proc_net_remove(net, "xt_recent"); +} +#else +static inline int recent_proc_net_init(struct net *net) +{ + return 0; +} + +static inline void recent_proc_net_exit(struct net *net) +{ +} +#endif /* CONFIG_PROC_FS */ + +static int __net_init recent_net_init(struct net *net) +{ + struct recent_net *recent_net = recent_pernet(net); + + INIT_LIST_HEAD(&recent_net->tables); + return recent_proc_net_init(net); +} + +static void __net_exit recent_net_exit(struct net *net) +{ + struct recent_net *recent_net = recent_pernet(net); + + BUG_ON(!list_empty(&recent_net->tables)); + recent_proc_net_exit(net); +} + +static struct pernet_operations recent_net_ops = { + .init = recent_net_init, + .exit = recent_net_exit, + .id = &recent_net_id, + .size = sizeof(struct recent_net), +}; + +static struct xt_match recent_mt_reg[] __read_mostly = { + { + .name = "recent", + .revision = 0, + .family = NFPROTO_IPV4, + .match = recent_mt, + .matchsize = sizeof(struct xt_recent_mtinfo), + .checkentry = recent_mt_check, + .destroy = recent_mt_destroy, + .me = THIS_MODULE, + }, + { + .name = "recent", + .revision = 0, + .family = NFPROTO_IPV6, + .match = recent_mt, + .matchsize = sizeof(struct xt_recent_mtinfo), + .checkentry = recent_mt_check, + .destroy = recent_mt_destroy, + .me = THIS_MODULE, + }, +}; + +static int __init recent_mt_init(void) +{ + int err; + + if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255) + return -EINVAL; + ip_list_hash_size = 1 << fls(ip_list_tot); + + err = register_pernet_subsys(&recent_net_ops); + if (err) + return err; + err = xt_register_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)); + if (err) + unregister_pernet_subsys(&recent_net_ops); + return err; +} + +static void __exit recent_mt_exit(void) +{ + xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)); + unregister_pernet_subsys(&recent_net_ops); +} + +module_init(recent_mt_init); +module_exit(recent_mt_exit); diff --git 
a/net/netfilter/xt_repldata.h b/net/netfilter/xt_repldata.h new file mode 100644 index 00000000..6efe4e5a --- /dev/null +++ b/net/netfilter/xt_repldata.h @@ -0,0 +1,35 @@ +/* + * Today's hack: quantum tunneling in structs + * + * 'entries' and 'term' are never anywhere referenced by word in code. In fact, + * they serve as the hanging-off data accessed through repl.data[]. + */ + +#define xt_alloc_initial_table(type, typ2) ({ \ + unsigned int hook_mask = info->valid_hooks; \ + unsigned int nhooks = hweight32(hook_mask); \ + unsigned int bytes = 0, hooknum = 0, i = 0; \ + struct { \ + struct type##_replace repl; \ + struct type##_standard entries[nhooks]; \ + struct type##_error term; \ + } *tbl = kzalloc(sizeof(*tbl), GFP_KERNEL); \ + if (tbl == NULL) \ + return NULL; \ + strncpy(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \ + tbl->term = (struct type##_error)typ2##_ERROR_INIT; \ + tbl->repl.valid_hooks = hook_mask; \ + tbl->repl.num_entries = nhooks + 1; \ + tbl->repl.size = nhooks * sizeof(struct type##_standard) + \ + sizeof(struct type##_error); \ + for (; hook_mask != 0; hook_mask >>= 1, ++hooknum) { \ + if (!(hook_mask & 1)) \ + continue; \ + tbl->repl.hook_entry[hooknum] = bytes; \ + tbl->repl.underflow[hooknum] = bytes; \ + tbl->entries[i++] = (struct type##_standard) \ + typ2##_STANDARD_INIT(NF_ACCEPT); \ + bytes += sizeof(struct type##_standard); \ + } \ + tbl; \ +}) diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c new file mode 100644 index 00000000..ef36a56a --- /dev/null +++ b/net/netfilter/xt_sctp.c @@ -0,0 +1,198 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Kiran Kumar Immidi"); +MODULE_DESCRIPTION("Xtables: SCTP protocol packet match"); +MODULE_ALIAS("ipt_sctp"); +MODULE_ALIAS("ip6t_sctp"); + +#define SCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \ + || (!!((invflag) & (option)) ^ (cond))) + +static bool +match_flags(const struct xt_sctp_flag_info *flag_info, + const int flag_count, + u_int8_t chunktype, + u_int8_t chunkflags) +{ + int i; + + for (i = 0; i < flag_count; i++) + if (flag_info[i].chunktype == chunktype) + return (chunkflags & flag_info[i].flag_mask) == flag_info[i].flag; + + return true; +} + +static inline bool +match_packet(const struct sk_buff *skb, + unsigned int offset, + const struct xt_sctp_info *info, + bool *hotdrop) +{ + u_int32_t chunkmapcopy[256 / sizeof (u_int32_t)]; + const sctp_chunkhdr_t *sch; + sctp_chunkhdr_t _sch; + int chunk_match_type = info->chunk_match_type; + const struct xt_sctp_flag_info *flag_info = info->flag_info; + int flag_count = info->flag_count; + +#ifdef DEBUG + int i = 0; +#endif + + if (chunk_match_type == SCTP_CHUNK_MATCH_ALL) + SCTP_CHUNKMAP_COPY(chunkmapcopy, info->chunkmap); + + do { + sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch); + if (sch == NULL || sch->length == 0) { + pr_debug("Dropping invalid SCTP packet.\n"); + *hotdrop = true; + return false; + } +#ifdef DEBUG + pr_debug("Chunk num: %d\toffset: %d\ttype: %d\tlength: %d" + "\tflags: %x\n", + ++i, offset, sch->type, htons(sch->length), + sch->flags); +#endif + offset += WORD_ROUND(ntohs(sch->length)); + + pr_debug("skb->len: %d\toffset: %d\n", skb->len, offset); + + if (SCTP_CHUNKMAP_IS_SET(info->chunkmap, sch->type)) { + switch (chunk_match_type) { + case SCTP_CHUNK_MATCH_ANY: + if (match_flags(flag_info, flag_count, + sch->type, sch->flags)) { + return true; + } + 
break; + + case SCTP_CHUNK_MATCH_ALL: + if (match_flags(flag_info, flag_count, + sch->type, sch->flags)) + SCTP_CHUNKMAP_CLEAR(chunkmapcopy, sch->type); + break; + + case SCTP_CHUNK_MATCH_ONLY: + if (!match_flags(flag_info, flag_count, + sch->type, sch->flags)) + return false; + break; + } + } else { + switch (chunk_match_type) { + case SCTP_CHUNK_MATCH_ONLY: + return false; + } + } + } while (offset < skb->len); + + switch (chunk_match_type) { + case SCTP_CHUNK_MATCH_ALL: + return SCTP_CHUNKMAP_IS_CLEAR(chunkmapcopy); + case SCTP_CHUNK_MATCH_ANY: + return false; + case SCTP_CHUNK_MATCH_ONLY: + return true; + } + + /* This will never be reached, but required to stop compiler whine */ + return false; +} + +static bool +sctp_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_sctp_info *info = par->matchinfo; + const sctp_sctphdr_t *sh; + sctp_sctphdr_t _sh; + + if (par->fragoff != 0) { + pr_debug("Dropping non-first fragment.. FIXME\n"); + return false; + } + + sh = skb_header_pointer(skb, par->thoff, sizeof(_sh), &_sh); + if (sh == NULL) { + pr_debug("Dropping evil TCP offset=0 tinygram.\n"); + par->hotdrop = true; + return false; + } + pr_debug("spt: %d\tdpt: %d\n", ntohs(sh->source), ntohs(sh->dest)); + + return SCCHECK(ntohs(sh->source) >= info->spts[0] + && ntohs(sh->source) <= info->spts[1], + XT_SCTP_SRC_PORTS, info->flags, info->invflags) + && SCCHECK(ntohs(sh->dest) >= info->dpts[0] + && ntohs(sh->dest) <= info->dpts[1], + XT_SCTP_DEST_PORTS, info->flags, info->invflags) + && SCCHECK(match_packet(skb, par->thoff + sizeof(sctp_sctphdr_t), + info, &par->hotdrop), + XT_SCTP_CHUNK_TYPES, info->flags, info->invflags); +} + +static int sctp_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_sctp_info *info = par->matchinfo; + + if (info->flags & ~XT_SCTP_VALID_FLAGS) + return -EINVAL; + if (info->invflags & ~XT_SCTP_VALID_FLAGS) + return -EINVAL; + if (info->invflags & ~info->flags) + return -EINVAL; + if (!(info->flags & XT_SCTP_CHUNK_TYPES)) + return 0; + if (info->chunk_match_type & (SCTP_CHUNK_MATCH_ALL | + SCTP_CHUNK_MATCH_ANY | SCTP_CHUNK_MATCH_ONLY)) + return 0; + return -EINVAL; +} + +static struct xt_match sctp_mt_reg[] __read_mostly = { + { + .name = "sctp", + .family = NFPROTO_IPV4, + .checkentry = sctp_mt_check, + .match = sctp_mt, + .matchsize = sizeof(struct xt_sctp_info), + .proto = IPPROTO_SCTP, + .me = THIS_MODULE + }, + { + .name = "sctp", + .family = NFPROTO_IPV6, + .checkentry = sctp_mt_check, + .match = sctp_mt, + .matchsize = sizeof(struct xt_sctp_info), + .proto = IPPROTO_SCTP, + .me = THIS_MODULE + }, +}; + +static int __init sctp_mt_init(void) +{ + return xt_register_matches(sctp_mt_reg, ARRAY_SIZE(sctp_mt_reg)); +} + +static void __exit sctp_mt_exit(void) +{ + xt_unregister_matches(sctp_mt_reg, ARRAY_SIZE(sctp_mt_reg)); +} + +module_init(sctp_mt_init); +module_exit(sctp_mt_exit); diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c new file mode 100644 index 00000000..b3babaed --- /dev/null +++ b/net/netfilter/xt_set.c @@ -0,0 +1,373 @@ +/* Copyright (C) 2000-2002 Joakim Axelsson + * Patrick Schaaf + * Martin Josefsson + * Copyright (C) 2003-2011 Jozsef Kadlecsik + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module which implements the set match and SET target + * for netfilter/iptables. 
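As a reading aid for the revision 0 compatibility path below (compat_flags() and the *_v0 handlers), here is one possible userspace encoding and how it is folded; the concrete flag values are assumptions for illustration, the folding rule follows the code:

//   u.flags[0] = IPSET_SRC     first dimension taken from the source
//   u.flags[1] = IPSET_DST     second dimension taken from the destination
//   u.flags[2] = 0             terminates the loop in compat_flags()
//
// compat_flags() turns this into
//   u.compat.dim   = 2                 two non-zero slots
//   u.compat.flags |= (1 << 1)         bit set: dimension 1 is "src";
//                                      no bit for dimension 2, so it
//                                      stays "dst"
// plus IPSET_INV_MATCH if u.flags[0] carried IPSET_MATCH_INV.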
*/ + +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_DESCRIPTION("Xtables: IP set match and target module"); +MODULE_ALIAS("xt_SET"); +MODULE_ALIAS("ipt_set"); +MODULE_ALIAS("ip6t_set"); +MODULE_ALIAS("ipt_SET"); +MODULE_ALIAS("ip6t_SET"); + +static inline int +match_set(ip_set_id_t index, const struct sk_buff *skb, + u8 pf, u8 dim, u8 flags, int inv) +{ + if (ip_set_test(index, skb, pf, dim, flags)) + inv = !inv; + return inv; +} + +/* Revision 0 interface: backward compatible with netfilter/iptables */ + +static bool +set_match_v0(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_set_info_match_v0 *info = par->matchinfo; + + return match_set(info->match_set.index, skb, par->family, + info->match_set.u.compat.dim, + info->match_set.u.compat.flags, + info->match_set.u.compat.flags & IPSET_INV_MATCH); +} + +static void +compat_flags(struct xt_set_info_v0 *info) +{ + u_int8_t i; + + /* Fill out compatibility data according to enum ip_set_kopt */ + info->u.compat.dim = IPSET_DIM_ZERO; + if (info->u.flags[0] & IPSET_MATCH_INV) + info->u.compat.flags |= IPSET_INV_MATCH; + for (i = 0; i < IPSET_DIM_MAX-1 && info->u.flags[i]; i++) { + info->u.compat.dim++; + if (info->u.flags[i] & IPSET_SRC) + info->u.compat.flags |= (1<u.compat.dim); + } +} + +static int +set_match_v0_checkentry(const struct xt_mtchk_param *par) +{ + struct xt_set_info_match_v0 *info = par->matchinfo; + ip_set_id_t index; + + index = ip_set_nfnl_get_byindex(info->match_set.index); + + if (index == IPSET_INVALID_ID) { + pr_warning("Cannot find set indentified by id %u to match\n", + info->match_set.index); + return -ENOENT; + } + if (info->match_set.u.flags[IPSET_DIM_MAX-1] != 0) { + pr_warning("Protocol error: set match dimension " + "is over the limit!\n"); + ip_set_nfnl_put(info->match_set.index); + return -ERANGE; + } + + /* Fill out compatibility data */ + compat_flags(&info->match_set); + + return 0; +} + +static void +set_match_v0_destroy(const struct xt_mtdtor_param *par) +{ + struct xt_set_info_match_v0 *info = par->matchinfo; + + ip_set_nfnl_put(info->match_set.index); +} + +static unsigned int +set_target_v0(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_set_info_target_v0 *info = par->targinfo; + + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_add(info->add_set.index, skb, par->family, + info->add_set.u.compat.dim, + info->add_set.u.compat.flags); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_del(info->del_set.index, skb, par->family, + info->del_set.u.compat.dim, + info->del_set.u.compat.flags); + + return XT_CONTINUE; +} + +static int +set_target_v0_checkentry(const struct xt_tgchk_param *par) +{ + struct xt_set_info_target_v0 *info = par->targinfo; + ip_set_id_t index; + + if (info->add_set.index != IPSET_INVALID_ID) { + index = ip_set_nfnl_get_byindex(info->add_set.index); + if (index == IPSET_INVALID_ID) { + pr_warning("Cannot find add_set index %u as target\n", + info->add_set.index); + return -ENOENT; + } + } + + if (info->del_set.index != IPSET_INVALID_ID) { + index = ip_set_nfnl_get_byindex(info->del_set.index); + if (index == IPSET_INVALID_ID) { + pr_warning("Cannot find del_set index %u as target\n", + info->del_set.index); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(info->add_set.index); + return -ENOENT; + } + } + if (info->add_set.u.flags[IPSET_DIM_MAX-1] != 0 || + info->del_set.u.flags[IPSET_DIM_MAX-1] != 0) { + pr_warning("Protocol 
error: SET target dimension " + "is over the limit!\n"); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(info->del_set.index); + return -ERANGE; + } + + /* Fill out compatibility data */ + compat_flags(&info->add_set); + compat_flags(&info->del_set); + + return 0; +} + +static void +set_target_v0_destroy(const struct xt_tgdtor_param *par) +{ + const struct xt_set_info_target_v0 *info = par->targinfo; + + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(info->del_set.index); +} + +/* Revision 1: current interface to netfilter/iptables */ + +static bool +set_match(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_set_info_match *info = par->matchinfo; + + return match_set(info->match_set.index, skb, par->family, + info->match_set.dim, + info->match_set.flags, + info->match_set.flags & IPSET_INV_MATCH); +} + +static int +set_match_checkentry(const struct xt_mtchk_param *par) +{ + struct xt_set_info_match *info = par->matchinfo; + ip_set_id_t index; + + index = ip_set_nfnl_get_byindex(info->match_set.index); + + if (index == IPSET_INVALID_ID) { + pr_warning("Cannot find set indentified by id %u to match\n", + info->match_set.index); + return -ENOENT; + } + if (info->match_set.dim > IPSET_DIM_MAX) { + pr_warning("Protocol error: set match dimension " + "is over the limit!\n"); + ip_set_nfnl_put(info->match_set.index); + return -ERANGE; + } + + return 0; +} + +static void +set_match_destroy(const struct xt_mtdtor_param *par) +{ + struct xt_set_info_match *info = par->matchinfo; + + ip_set_nfnl_put(info->match_set.index); +} + +static unsigned int +set_target(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_set_info_target *info = par->targinfo; + + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_add(info->add_set.index, + skb, par->family, + info->add_set.dim, + info->add_set.flags); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_del(info->del_set.index, + skb, par->family, + info->del_set.dim, + info->del_set.flags); + + return XT_CONTINUE; +} + +static int +set_target_checkentry(const struct xt_tgchk_param *par) +{ + const struct xt_set_info_target *info = par->targinfo; + ip_set_id_t index; + + if (info->add_set.index != IPSET_INVALID_ID) { + index = ip_set_nfnl_get_byindex(info->add_set.index); + if (index == IPSET_INVALID_ID) { + pr_warning("Cannot find add_set index %u as target\n", + info->add_set.index); + return -ENOENT; + } + } + + if (info->del_set.index != IPSET_INVALID_ID) { + index = ip_set_nfnl_get_byindex(info->del_set.index); + if (index == IPSET_INVALID_ID) { + pr_warning("Cannot find del_set index %u as target\n", + info->del_set.index); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(info->add_set.index); + return -ENOENT; + } + } + if (info->add_set.dim > IPSET_DIM_MAX || + info->del_set.dim > IPSET_DIM_MAX) { + pr_warning("Protocol error: SET target dimension " + "is over the limit!\n"); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(info->del_set.index); + return -ERANGE; + } + + return 0; +} + +static void +set_target_destroy(const struct xt_tgdtor_param *par) +{ + const struct xt_set_info_target *info = par->targinfo; + + if (info->add_set.index != IPSET_INVALID_ID) + 
ip_set_nfnl_put(info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(info->del_set.index); +} + +static struct xt_match set_matches[] __read_mostly = { + { + .name = "set", + .family = NFPROTO_IPV4, + .revision = 0, + .match = set_match_v0, + .matchsize = sizeof(struct xt_set_info_match_v0), + .checkentry = set_match_v0_checkentry, + .destroy = set_match_v0_destroy, + .me = THIS_MODULE + }, + { + .name = "set", + .family = NFPROTO_IPV4, + .revision = 1, + .match = set_match, + .matchsize = sizeof(struct xt_set_info_match), + .checkentry = set_match_checkentry, + .destroy = set_match_destroy, + .me = THIS_MODULE + }, + { + .name = "set", + .family = NFPROTO_IPV6, + .revision = 1, + .match = set_match, + .matchsize = sizeof(struct xt_set_info_match), + .checkentry = set_match_checkentry, + .destroy = set_match_destroy, + .me = THIS_MODULE + }, +}; + +static struct xt_target set_targets[] __read_mostly = { + { + .name = "SET", + .revision = 0, + .family = NFPROTO_IPV4, + .target = set_target_v0, + .targetsize = sizeof(struct xt_set_info_target_v0), + .checkentry = set_target_v0_checkentry, + .destroy = set_target_v0_destroy, + .me = THIS_MODULE + }, + { + .name = "SET", + .revision = 1, + .family = NFPROTO_IPV4, + .target = set_target, + .targetsize = sizeof(struct xt_set_info_target), + .checkentry = set_target_checkentry, + .destroy = set_target_destroy, + .me = THIS_MODULE + }, + { + .name = "SET", + .revision = 1, + .family = NFPROTO_IPV6, + .target = set_target, + .targetsize = sizeof(struct xt_set_info_target), + .checkentry = set_target_checkentry, + .destroy = set_target_destroy, + .me = THIS_MODULE + }, +}; + +static int __init xt_set_init(void) +{ + int ret = xt_register_matches(set_matches, ARRAY_SIZE(set_matches)); + + if (!ret) { + ret = xt_register_targets(set_targets, + ARRAY_SIZE(set_targets)); + if (ret) + xt_unregister_matches(set_matches, + ARRAY_SIZE(set_matches)); + } + return ret; +} + +static void __exit xt_set_fini(void) +{ + xt_unregister_matches(set_matches, ARRAY_SIZE(set_matches)); + xt_unregister_targets(set_targets, ARRAY_SIZE(set_targets)); +} + +module_init(xt_set_init); +module_exit(xt_set_fini); diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c new file mode 100644 index 00000000..ddf5e050 --- /dev/null +++ b/net/netfilter/xt_socket.c @@ -0,0 +1,404 @@ +/* + * Transparent proxy support for Linux/iptables + * + * Copyright (C) 2007-2008 BalaBit IT Ltd. + * Author: Krisztian Kovacs + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +#define XT_SOCKET_HAVE_IPV6 1 +#include +#include +#endif + +#include + +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#define XT_SOCKET_HAVE_CONNTRACK 1 +#include +#endif + +void +xt_socket_put_sk(struct sock *sk) +{ + if (sk->sk_state == TCP_TIME_WAIT) + inet_twsk_put(inet_twsk(sk)); + else + sock_put(sk); +} +EXPORT_SYMBOL(xt_socket_put_sk); + +static int +extract_icmp4_fields(const struct sk_buff *skb, + u8 *protocol, + __be32 *raddr, + __be32 *laddr, + __be16 *rport, + __be16 *lport) +{ + unsigned int outside_hdrlen = ip_hdrlen(skb); + struct iphdr *inside_iph, _inside_iph; + struct icmphdr *icmph, _icmph; + __be16 *ports, _ports[2]; + + icmph = skb_header_pointer(skb, outside_hdrlen, + sizeof(_icmph), &_icmph); + if (icmph == NULL) + return 1; + + switch (icmph->type) { + case ICMP_DEST_UNREACH: + case ICMP_SOURCE_QUENCH: + case ICMP_REDIRECT: + case ICMP_TIME_EXCEEDED: + case ICMP_PARAMETERPROB: + break; + default: + return 1; + } + + inside_iph = skb_header_pointer(skb, outside_hdrlen + + sizeof(struct icmphdr), + sizeof(_inside_iph), &_inside_iph); + if (inside_iph == NULL) + return 1; + + if (inside_iph->protocol != IPPROTO_TCP && + inside_iph->protocol != IPPROTO_UDP) + return 1; + + ports = skb_header_pointer(skb, outside_hdrlen + + sizeof(struct icmphdr) + + (inside_iph->ihl << 2), + sizeof(_ports), &_ports); + if (ports == NULL) + return 1; + + /* the inside IP packet is the one quoted from our side, thus + * its saddr is the local address */ + *protocol = inside_iph->protocol; + *laddr = inside_iph->saddr; + *lport = ports[0]; + *raddr = inside_iph->daddr; + *rport = ports[1]; + + return 0; +} + +struct sock* +xt_socket_get4_sk(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct iphdr *iph = ip_hdr(skb); + struct udphdr _hdr, *hp = NULL; + struct sock *sk; + __be32 daddr, saddr; + __be16 dport, sport; + u8 protocol; +#ifdef XT_SOCKET_HAVE_CONNTRACK + struct nf_conn const *ct; + enum ip_conntrack_info ctinfo; +#endif + + if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) { + hp = skb_header_pointer(skb, ip_hdrlen(skb), + sizeof(_hdr), &_hdr); + if (hp == NULL) + return NULL; + + protocol = iph->protocol; + saddr = iph->saddr; + sport = hp->source; + daddr = iph->daddr; + dport = hp->dest; + + } else if (iph->protocol == IPPROTO_ICMP) { + if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr, + &sport, &dport)) + return NULL; + } else { + return NULL; + } + +#ifdef XT_SOCKET_HAVE_CONNTRACK + /* Do the lookup with the original socket address in case this is a + * reply packet of an established SNAT-ted connection. */ + + ct = nf_ct_get(skb, &ctinfo); + if (ct && !nf_ct_is_untracked(ct) && + ((iph->protocol != IPPROTO_ICMP && + ctinfo == IP_CT_ESTABLISHED_REPLY) || + (iph->protocol == IPPROTO_ICMP && + ctinfo == IP_CT_RELATED_REPLY)) && + (ct->status & IPS_SRC_NAT_DONE)) { + + daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip; + dport = (iph->protocol == IPPROTO_TCP) ? 
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port : + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; + } +#endif + + sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol, + saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY); + + pr_debug("proto %hhu %pI4:%hu -> %pI4:%hu (orig %pI4:%hu) sock %p\n", + protocol, &saddr, ntohs(sport), + &daddr, ntohs(dport), + &iph->daddr, hp ? ntohs(hp->dest) : 0, sk); + + return sk; +} +EXPORT_SYMBOL(xt_socket_get4_sk); + +static bool +socket_match(const struct sk_buff *skb, struct xt_action_param *par, + const struct xt_socket_mtinfo1 *info) +{ + struct sock *sk; + + sk = xt_socket_get4_sk(skb, par); + if (sk != NULL) { + bool wildcard; + bool transparent = true; + + /* Ignore sockets listening on INADDR_ANY */ + wildcard = (sk->sk_state != TCP_TIME_WAIT && + inet_sk(sk)->inet_rcv_saddr == 0); + + /* Ignore non-transparent sockets, + if XT_SOCKET_TRANSPARENT is used */ + if (info && info->flags & XT_SOCKET_TRANSPARENT) + transparent = ((sk->sk_state != TCP_TIME_WAIT && + inet_sk(sk)->transparent) || + (sk->sk_state == TCP_TIME_WAIT && + inet_twsk(sk)->tw_transparent)); + + xt_socket_put_sk(sk); + + if (wildcard || !transparent) + sk = NULL; + } + + return (sk != NULL); +} + +static bool +socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par) +{ + return socket_match(skb, par, NULL); +} + +static bool +socket_mt4_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ + return socket_match(skb, par, par->matchinfo); +} + +#ifdef XT_SOCKET_HAVE_IPV6 + +static int +extract_icmp6_fields(const struct sk_buff *skb, + unsigned int outside_hdrlen, + int *protocol, + struct in6_addr **raddr, + struct in6_addr **laddr, + __be16 *rport, + __be16 *lport) +{ + struct ipv6hdr *inside_iph, _inside_iph; + struct icmp6hdr *icmph, _icmph; + __be16 *ports, _ports[2]; + u8 inside_nexthdr; + int inside_hdrlen; + + icmph = skb_header_pointer(skb, outside_hdrlen, + sizeof(_icmph), &_icmph); + if (icmph == NULL) + return 1; + + if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK) + return 1; + + inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph), sizeof(_inside_iph), &_inside_iph); + if (inside_iph == NULL) + return 1; + inside_nexthdr = inside_iph->nexthdr; + + inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + sizeof(_inside_iph), &inside_nexthdr); + if (inside_hdrlen < 0) + return 1; /* hjm: Packet has no/incomplete transport layer headers. 
*/ + + if (inside_nexthdr != IPPROTO_TCP && + inside_nexthdr != IPPROTO_UDP) + return 1; + + ports = skb_header_pointer(skb, inside_hdrlen, + sizeof(_ports), &_ports); + if (ports == NULL) + return 1; + + /* the inside IP packet is the one quoted from our side, thus + * its saddr is the local address */ + *protocol = inside_nexthdr; + *laddr = &inside_iph->saddr; + *lport = ports[0]; + *raddr = &inside_iph->daddr; + *rport = ports[1]; + + return 0; +} + +struct sock* +xt_socket_get6_sk(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + struct udphdr _hdr, *hp = NULL; + struct sock *sk; + struct in6_addr *daddr, *saddr; + __be16 dport, sport; + int thoff, tproto; + + tproto = ipv6_find_hdr(skb, &thoff, -1, NULL); + if (tproto < 0) { + pr_debug("unable to find transport header in IPv6 packet, dropping\n"); + return NF_DROP; + } + + if (tproto == IPPROTO_UDP || tproto == IPPROTO_TCP) { + hp = skb_header_pointer(skb, thoff, + sizeof(_hdr), &_hdr); + if (hp == NULL) + return NULL; + + saddr = &iph->saddr; + sport = hp->source; + daddr = &iph->daddr; + dport = hp->dest; + + } else if (tproto == IPPROTO_ICMPV6) { + if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr, + &sport, &dport)) + return NULL; + } else { + return NULL; + } + + sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY); + pr_debug("proto %hhd %pI6:%hu -> %pI6:%hu " + "(orig %pI6:%hu) sock %p\n", + tproto, saddr, ntohs(sport), + daddr, ntohs(dport), + &iph->daddr, hp ? ntohs(hp->dest) : 0, sk); + return sk; +} +EXPORT_SYMBOL(xt_socket_get6_sk); + +static bool +socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct sock *sk; + const struct xt_socket_mtinfo1 *info; + + info = (struct xt_socket_mtinfo1 *) par->matchinfo; + sk = xt_socket_get6_sk(skb, par); + if (sk != NULL) { + bool wildcard; + bool transparent = true; + + /* Ignore sockets listening on INADDR_ANY */ + wildcard = (sk->sk_state != TCP_TIME_WAIT && + ipv6_addr_any(&inet6_sk(sk)->rcv_saddr)); + + /* Ignore non-transparent sockets, + if XT_SOCKET_TRANSPARENT is used */ + if (info && info->flags & XT_SOCKET_TRANSPARENT) + transparent = ((sk->sk_state != TCP_TIME_WAIT && + inet_sk(sk)->transparent) || + (sk->sk_state == TCP_TIME_WAIT && + inet_twsk(sk)->tw_transparent)); + + xt_socket_put_sk(sk); + + if (wildcard || !transparent) + sk = NULL; + } + + return (sk != NULL); +} +#endif + +static struct xt_match socket_mt_reg[] __read_mostly = { + { + .name = "socket", + .revision = 0, + .family = NFPROTO_IPV4, + .match = socket_mt4_v0, + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, + { + .name = "socket", + .revision = 1, + .family = NFPROTO_IPV4, + .match = socket_mt4_v1, + .matchsize = sizeof(struct xt_socket_mtinfo1), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, +#ifdef XT_SOCKET_HAVE_IPV6 + { + .name = "socket", + .revision = 1, + .family = NFPROTO_IPV6, + .match = socket_mt6_v1, + .matchsize = sizeof(struct xt_socket_mtinfo1), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, +#endif +}; + +static int __init socket_mt_init(void) +{ + nf_defrag_ipv4_enable(); +#ifdef XT_SOCKET_HAVE_IPV6 + nf_defrag_ipv6_enable(); +#endif + + return xt_register_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg)); +} + +static void __exit socket_mt_exit(void) +{ + xt_unregister_matches(socket_mt_reg, 
ARRAY_SIZE(socket_mt_reg)); +} + +module_init(socket_mt_init); +module_exit(socket_mt_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler"); +MODULE_DESCRIPTION("x_tables socket match module"); +MODULE_ALIAS("ipt_socket"); +MODULE_ALIAS("ip6t_socket"); diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c new file mode 100644 index 00000000..a507922d --- /dev/null +++ b/net/netfilter/xt_state.c @@ -0,0 +1,79 @@ +/* Kernel module to match connection tracking information. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2005 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rusty Russell "); +MODULE_DESCRIPTION("ip[6]_tables connection tracking state match module"); +MODULE_ALIAS("ipt_state"); +MODULE_ALIAS("ip6t_state"); + +static bool +state_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_state_info *sinfo = par->matchinfo; + enum ip_conntrack_info ctinfo; + unsigned int statebit; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + if (!ct) + statebit = XT_STATE_INVALID; + else { + if (nf_ct_is_untracked(ct)) + statebit = XT_STATE_UNTRACKED; + else + statebit = XT_STATE_BIT(ctinfo); + } + return (sinfo->statemask & statebit); +} + +static int state_mt_check(const struct xt_mtchk_param *par) +{ + int ret; + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + return ret; +} + +static void state_mt_destroy(const struct xt_mtdtor_param *par) +{ + nf_ct_l3proto_module_put(par->family); +} + +static struct xt_match state_mt_reg __read_mostly = { + .name = "state", + .family = NFPROTO_UNSPEC, + .checkentry = state_mt_check, + .match = state_mt, + .destroy = state_mt_destroy, + .matchsize = sizeof(struct xt_state_info), + .me = THIS_MODULE, +}; + +static int __init state_mt_init(void) +{ + return xt_register_match(&state_mt_reg); +} + +static void __exit state_mt_exit(void) +{ + xt_unregister_match(&state_mt_reg); +} + +module_init(state_mt_init); +module_exit(state_mt_exit); diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c new file mode 100644 index 00000000..42ecb71d --- /dev/null +++ b/net/netfilter/xt_statistic.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2006 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on ipt_random and ipt_nth by Fabrice MARIE . 
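A short note on the two modes implemented in statistic_mt() below; the example numbers are assumptions for illustration:

//  XT_STATISTIC_MODE_RANDOM: the rule matches when
//      (net_random() & 0x7FFFFFFF) < u.random.probability,
//  i.e. the probability is a fraction scaled to 2^31; a value of
//  0x20000000 matches roughly one packet in four.
//
//  XT_STATISTIC_MODE_NTH: the shared counter cycles
//      0, 1, ..., u.nth.every, 0, 1, ...
//  via the lock-free atomic_cmpxchg() loop, and the rule matches each
//  time the counter wraps to 0 (with u.nth.every == 2, every third
//  packet).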
+ */ + +#include +#include +#include +#include +#include + +#include +#include + +struct xt_statistic_priv { + atomic_t count; +} ____cacheline_aligned_in_smp; + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy "); +MODULE_DESCRIPTION("Xtables: statistics-based matching (\"Nth\", random)"); +MODULE_ALIAS("ipt_statistic"); +MODULE_ALIAS("ip6t_statistic"); + +static bool +statistic_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_statistic_info *info = par->matchinfo; + bool ret = info->flags & XT_STATISTIC_INVERT; + int nval, oval; + + switch (info->mode) { + case XT_STATISTIC_MODE_RANDOM: + if ((net_random() & 0x7FFFFFFF) < info->u.random.probability) + ret = !ret; + break; + case XT_STATISTIC_MODE_NTH: + do { + oval = atomic_read(&info->master->count); + nval = (oval == info->u.nth.every) ? 0 : oval + 1; + } while (atomic_cmpxchg(&info->master->count, oval, nval) != oval); + if (nval == 0) + ret = !ret; + break; + } + + return ret; +} + +static int statistic_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_statistic_info *info = par->matchinfo; + + if (info->mode > XT_STATISTIC_MODE_MAX || + info->flags & ~XT_STATISTIC_MASK) + return -EINVAL; + + info->master = kzalloc(sizeof(*info->master), GFP_KERNEL); + if (info->master == NULL) + return -ENOMEM; + atomic_set(&info->master->count, info->u.nth.count); + + return 0; +} + +static void statistic_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_statistic_info *info = par->matchinfo; + + kfree(info->master); +} + +static struct xt_match xt_statistic_mt_reg __read_mostly = { + .name = "statistic", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = statistic_mt, + .checkentry = statistic_mt_check, + .destroy = statistic_mt_destroy, + .matchsize = sizeof(struct xt_statistic_info), + .me = THIS_MODULE, +}; + +static int __init statistic_mt_init(void) +{ + return xt_register_match(&xt_statistic_mt_reg); +} + +static void __exit statistic_mt_exit(void) +{ + xt_unregister_match(&xt_statistic_mt_reg); +} + +module_init(statistic_mt_init); +module_exit(statistic_mt_exit); diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c new file mode 100644 index 00000000..d3c48b14 --- /dev/null +++ b/net/netfilter/xt_string.c @@ -0,0 +1,96 @@ +/* String matching match for iptables + * + * (C) 2005 Pablo Neira Ayuso + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_DESCRIPTION("Xtables: string-based matching"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_string"); +MODULE_ALIAS("ip6t_string"); + +static bool +string_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_string_info *conf = par->matchinfo; + struct ts_state state; + bool invert; + + memset(&state, 0, sizeof(struct ts_state)); + invert = conf->u.v1.flags & XT_STRING_FLAG_INVERT; + + return (skb_find_text((struct sk_buff *)skb, conf->from_offset, + conf->to_offset, conf->config, &state) + != UINT_MAX) ^ invert; +} + +#define STRING_TEXT_PRIV(m) ((struct xt_string_info *)(m)) + +static int string_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_string_info *conf = par->matchinfo; + struct ts_config *ts_conf; + int flags = TS_AUTOLOAD; + + /* Damn, can't handle this case properly with iptables... 
*/ + if (conf->from_offset > conf->to_offset) + return -EINVAL; + if (conf->algo[XT_STRING_MAX_ALGO_NAME_SIZE - 1] != '\0') + return -EINVAL; + if (conf->patlen > XT_STRING_MAX_PATTERN_SIZE) + return -EINVAL; + if (conf->u.v1.flags & + ~(XT_STRING_FLAG_IGNORECASE | XT_STRING_FLAG_INVERT)) + return -EINVAL; + if (conf->u.v1.flags & XT_STRING_FLAG_IGNORECASE) + flags |= TS_IGNORECASE; + ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen, + GFP_KERNEL, flags); + if (IS_ERR(ts_conf)) + return PTR_ERR(ts_conf); + + conf->config = ts_conf; + return 0; +} + +static void string_mt_destroy(const struct xt_mtdtor_param *par) +{ + textsearch_destroy(STRING_TEXT_PRIV(par->matchinfo)->config); +} + +static struct xt_match xt_string_mt_reg __read_mostly = { + .name = "string", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = string_mt_check, + .match = string_mt, + .destroy = string_mt_destroy, + .matchsize = sizeof(struct xt_string_info), + .me = THIS_MODULE, +}; + +static int __init string_mt_init(void) +{ + return xt_register_match(&xt_string_mt_reg); +} + +static void __exit string_mt_exit(void) +{ + xt_unregister_match(&xt_string_mt_reg); +} + +module_init(string_mt_init); +module_exit(string_mt_exit); diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c new file mode 100644 index 00000000..c53d4d18 --- /dev/null +++ b/net/netfilter/xt_tcpmss.c @@ -0,0 +1,110 @@ +/* Kernel module to match TCP MSS values. */ + +/* Copyright (C) 2000 Marc Boucher + * Portions (C) 2005 by Harald Welte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marc Boucher "); +MODULE_DESCRIPTION("Xtables: TCP MSS match"); +MODULE_ALIAS("ipt_tcpmss"); +MODULE_ALIAS("ip6t_tcpmss"); + +static bool +tcpmss_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_tcpmss_match_info *info = par->matchinfo; + const struct tcphdr *th; + struct tcphdr _tcph; + /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ + const u_int8_t *op; + u8 _opt[15 * 4 - sizeof(_tcph)]; + unsigned int i, optlen; + + /* If we don't have the whole header, drop packet. */ + th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph); + if (th == NULL) + goto dropit; + + /* Malformed. */ + if (th->doff*4 < sizeof(*th)) + goto dropit; + + optlen = th->doff*4 - sizeof(*th); + if (!optlen) + goto out; + + /* Truncated options. */ + op = skb_header_pointer(skb, par->thoff + sizeof(*th), optlen, _opt); + if (op == NULL) + goto dropit; + + for (i = 0; i < optlen; ) { + if (op[i] == TCPOPT_MSS + && (optlen - i) >= TCPOLEN_MSS + && op[i+1] == TCPOLEN_MSS) { + u_int16_t mssval; + + mssval = (op[i+2] << 8) | op[i+3]; + + return (mssval >= info->mss_min && + mssval <= info->mss_max) ^ info->invert; + } + if (op[i] < 2) + i++; + else + i += op[i+1] ? 
: 1; + } +out: + return info->invert; + +dropit: + par->hotdrop = true; + return false; +} + +static struct xt_match tcpmss_mt_reg[] __read_mostly = { + { + .name = "tcpmss", + .family = NFPROTO_IPV4, + .match = tcpmss_mt, + .matchsize = sizeof(struct xt_tcpmss_match_info), + .proto = IPPROTO_TCP, + .me = THIS_MODULE, + }, + { + .name = "tcpmss", + .family = NFPROTO_IPV6, + .match = tcpmss_mt, + .matchsize = sizeof(struct xt_tcpmss_match_info), + .proto = IPPROTO_TCP, + .me = THIS_MODULE, + }, +}; + +static int __init tcpmss_mt_init(void) +{ + return xt_register_matches(tcpmss_mt_reg, ARRAY_SIZE(tcpmss_mt_reg)); +} + +static void __exit tcpmss_mt_exit(void) +{ + xt_unregister_matches(tcpmss_mt_reg, ARRAY_SIZE(tcpmss_mt_reg)); +} + +module_init(tcpmss_mt_init); +module_exit(tcpmss_mt_exit); diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c new file mode 100644 index 00000000..c14d4645 --- /dev/null +++ b/net/netfilter/xt_tcpudp.c @@ -0,0 +1,234 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_DESCRIPTION("Xtables: TCP, UDP and UDP-Lite match"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("xt_tcp"); +MODULE_ALIAS("xt_udp"); +MODULE_ALIAS("ipt_udp"); +MODULE_ALIAS("ipt_tcp"); +MODULE_ALIAS("ip6t_udp"); +MODULE_ALIAS("ip6t_tcp"); + +/* Returns 1 if the port is matched by the range, 0 otherwise */ +static inline bool +port_match(u_int16_t min, u_int16_t max, u_int16_t port, bool invert) +{ + return (port >= min && port <= max) ^ invert; +} + +static bool +tcp_find_option(u_int8_t option, + const struct sk_buff *skb, + unsigned int protoff, + unsigned int optlen, + bool invert, + bool *hotdrop) +{ + /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ + const u_int8_t *op; + u_int8_t _opt[60 - sizeof(struct tcphdr)]; + unsigned int i; + + pr_debug("finding option\n"); + + if (!optlen) + return invert; + + /* If we don't have the whole header, drop packet. */ + op = skb_header_pointer(skb, protoff + sizeof(struct tcphdr), + optlen, _opt); + if (op == NULL) { + *hotdrop = true; + return false; + } + + for (i = 0; i < optlen; ) { + if (op[i] == option) return !invert; + if (op[i] < 2) i++; + else i += op[i+1]?:1; + } + + return invert; +} + +static bool tcp_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct tcphdr *th; + struct tcphdr _tcph; + const struct xt_tcp *tcpinfo = par->matchinfo; + + if (par->fragoff != 0) { + /* To quote Alan: + + Don't allow a fragment of TCP 8 bytes in. Nobody normal + causes this. Its a cracker trying to break in by doing a + flag overwrite to pass the direction checks. + */ + if (par->fragoff == 1) { + pr_debug("Dropping evil TCP offset=1 frag.\n"); + par->hotdrop = true; + } + /* Must not be a fragment. */ + return false; + } + +#define FWINVTCP(bool, invflg) ((bool) ^ !!(tcpinfo->invflags & (invflg))) + + th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph); + if (th == NULL) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. 
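To make the option-walking loops concrete (tcp_find_option() above and the MSS scan in tcpmss_mt() earlier in this patch), here is an assumed option block and how it is traversed; the bytes are illustrative only:

//   op[] = { 0x01,                    NOP  (kind < 2: advance by one byte)
//            0x02, 0x04, 0x05, 0xb4,  MSS  (kind 2, length 4, value 1460)
//            0x00 }                   EOL
//
// tcpmss_mt() stops at kind 2 and reads
//   mssval = (op[i+2] << 8) | op[i+3] = 0x05b4 = 1460,
// while tcp_find_option() only reports whether the requested kind is
// present; both advance past other options by op[i+1] bytes.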
*/ + pr_debug("Dropping evil TCP offset=0 tinygram.\n"); + par->hotdrop = true; + return false; + } + + if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1], + ntohs(th->source), + !!(tcpinfo->invflags & XT_TCP_INV_SRCPT))) + return false; + if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1], + ntohs(th->dest), + !!(tcpinfo->invflags & XT_TCP_INV_DSTPT))) + return false; + if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask) + == tcpinfo->flg_cmp, + XT_TCP_INV_FLAGS)) + return false; + if (tcpinfo->option) { + if (th->doff * 4 < sizeof(_tcph)) { + par->hotdrop = true; + return false; + } + if (!tcp_find_option(tcpinfo->option, skb, par->thoff, + th->doff*4 - sizeof(_tcph), + tcpinfo->invflags & XT_TCP_INV_OPTION, + &par->hotdrop)) + return false; + } + return true; +} + +static int tcp_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_tcp *tcpinfo = par->matchinfo; + + /* Must specify no unknown invflags */ + return (tcpinfo->invflags & ~XT_TCP_INV_MASK) ? -EINVAL : 0; +} + +static bool udp_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct udphdr *uh; + struct udphdr _udph; + const struct xt_udp *udpinfo = par->matchinfo; + + /* Must not be a fragment. */ + if (par->fragoff != 0) + return false; + + uh = skb_header_pointer(skb, par->thoff, sizeof(_udph), &_udph); + if (uh == NULL) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + pr_debug("Dropping evil UDP tinygram.\n"); + par->hotdrop = true; + return false; + } + + return port_match(udpinfo->spts[0], udpinfo->spts[1], + ntohs(uh->source), + !!(udpinfo->invflags & XT_UDP_INV_SRCPT)) + && port_match(udpinfo->dpts[0], udpinfo->dpts[1], + ntohs(uh->dest), + !!(udpinfo->invflags & XT_UDP_INV_DSTPT)); +} + +static int udp_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_udp *udpinfo = par->matchinfo; + + /* Must specify no unknown invflags */ + return (udpinfo->invflags & ~XT_UDP_INV_MASK) ? 
-EINVAL : 0; +} + +static struct xt_match tcpudp_mt_reg[] __read_mostly = { + { + .name = "tcp", + .family = NFPROTO_IPV4, + .checkentry = tcp_mt_check, + .match = tcp_mt, + .matchsize = sizeof(struct xt_tcp), + .proto = IPPROTO_TCP, + .me = THIS_MODULE, + }, + { + .name = "tcp", + .family = NFPROTO_IPV6, + .checkentry = tcp_mt_check, + .match = tcp_mt, + .matchsize = sizeof(struct xt_tcp), + .proto = IPPROTO_TCP, + .me = THIS_MODULE, + }, + { + .name = "udp", + .family = NFPROTO_IPV4, + .checkentry = udp_mt_check, + .match = udp_mt, + .matchsize = sizeof(struct xt_udp), + .proto = IPPROTO_UDP, + .me = THIS_MODULE, + }, + { + .name = "udp", + .family = NFPROTO_IPV6, + .checkentry = udp_mt_check, + .match = udp_mt, + .matchsize = sizeof(struct xt_udp), + .proto = IPPROTO_UDP, + .me = THIS_MODULE, + }, + { + .name = "udplite", + .family = NFPROTO_IPV4, + .checkentry = udp_mt_check, + .match = udp_mt, + .matchsize = sizeof(struct xt_udp), + .proto = IPPROTO_UDPLITE, + .me = THIS_MODULE, + }, + { + .name = "udplite", + .family = NFPROTO_IPV6, + .checkentry = udp_mt_check, + .match = udp_mt, + .matchsize = sizeof(struct xt_udp), + .proto = IPPROTO_UDPLITE, + .me = THIS_MODULE, + }, +}; + +static int __init tcpudp_mt_init(void) +{ + return xt_register_matches(tcpudp_mt_reg, ARRAY_SIZE(tcpudp_mt_reg)); +} + +static void __exit tcpudp_mt_exit(void) +{ + xt_unregister_matches(tcpudp_mt_reg, ARRAY_SIZE(tcpudp_mt_reg)); +} + +module_init(tcpudp_mt_init); +module_exit(tcpudp_mt_exit); diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c new file mode 100644 index 00000000..c48975ff --- /dev/null +++ b/net/netfilter/xt_time.c @@ -0,0 +1,269 @@ +/* + * xt_time + * Copyright © CC Computer Consultants GmbH, 2007 + * + * based on ipt_time by Fabrice MARIE + * This is a module which is used for time matching + * It is using some modified code from dietlibc (localtime() function) + * that you can find at http://www.fefe.de/dietlibc/ + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from gnu.org/gpl. + */ +#include +#include +#include +#include +#include +#include + +struct xtm { + u_int8_t month; /* (1-12) */ + u_int8_t monthday; /* (1-31) */ + u_int8_t weekday; /* (1-7) */ + u_int8_t hour; /* (0-23) */ + u_int8_t minute; /* (0-59) */ + u_int8_t second; /* (0-59) */ + unsigned int dse; +}; + +extern struct timezone sys_tz; /* ouch */ + +static const u_int16_t days_since_year[] = { + 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, +}; + +static const u_int16_t days_since_leapyear[] = { + 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, +}; + +/* + * Since time progresses forward, it is best to organize this array in reverse, + * to minimize lookup time. 
+ */ +enum { + DSE_FIRST = 2039, +}; +static const u_int16_t days_since_epoch[] = { + /* 2039 - 2030 */ + 25202, 24837, 24472, 24106, 23741, 23376, 23011, 22645, 22280, 21915, + /* 2029 - 2020 */ + 21550, 21184, 20819, 20454, 20089, 19723, 19358, 18993, 18628, 18262, + /* 2019 - 2010 */ + 17897, 17532, 17167, 16801, 16436, 16071, 15706, 15340, 14975, 14610, + /* 2009 - 2000 */ + 14245, 13879, 13514, 13149, 12784, 12418, 12053, 11688, 11323, 10957, + /* 1999 - 1990 */ + 10592, 10227, 9862, 9496, 9131, 8766, 8401, 8035, 7670, 7305, + /* 1989 - 1980 */ + 6940, 6574, 6209, 5844, 5479, 5113, 4748, 4383, 4018, 3652, + /* 1979 - 1970 */ + 3287, 2922, 2557, 2191, 1826, 1461, 1096, 730, 365, 0, +}; + +static inline bool is_leap(unsigned int y) +{ + return y % 4 == 0 && (y % 100 != 0 || y % 400 == 0); +} + +/* + * Each network packet has a (nano)seconds-since-the-epoch (SSTE) timestamp. + * Since we match against days and daytime, the SSTE value needs to be + * computed back into human-readable dates. + * + * This is done in three separate functions so that the most expensive + * calculations are done last, in case a "simple match" can be found earlier. + */ +static inline unsigned int localtime_1(struct xtm *r, time_t time) +{ + unsigned int v, w; + + /* Each day has 86400s, so finding the hour/minute is actually easy. */ + v = time % 86400; + r->second = v % 60; + w = v / 60; + r->minute = w % 60; + r->hour = w / 60; + return v; +} + +static inline void localtime_2(struct xtm *r, time_t time) +{ + /* + * Here comes the rest (weekday, monthday). First, divide the SSTE + * by seconds-per-day to get the number of _days_ since the epoch. + */ + r->dse = time / 86400; + + /* + * 1970-01-01 (w=0) was a Thursday (4). + * -1 and +1 map Sunday properly onto 7. + */ + r->weekday = (4 + r->dse - 1) % 7 + 1; +} + +static void localtime_3(struct xtm *r, time_t time) +{ + unsigned int year, i, w = r->dse; + + /* + * In each year, a certain number of days-since-the-epoch have passed. + * Find the year that is closest to said days. + * + * Consider, for example, w=21612 (2029-03-04). Loop will abort on + * dse[i] <= w, which happens when dse[i] == 21550. This implies + * year == 2009. w will then be 62. + */ + for (i = 0, year = DSE_FIRST; days_since_epoch[i] > w; + ++i, --year) + /* just loop */; + + w -= days_since_epoch[i]; + + /* + * By now we have the current year, and the day of the year. + * r->yearday = w; + * + * On to finding the month (like above). In each month, a certain + * number of days-since-New Year have passed, and find the closest + * one. + * + * Consider w=62 (in a non-leap year). Loop will abort on + * dsy[i] < w, which happens when dsy[i] == 31+28 (i == 2). + * Concludes i == 2, i.e. 3rd month => March. + * + * (A different approach to use would be to subtract a monthlength + * from w repeatedly while counting.) 
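One more end-to-end example in the spirit of the comments above; the timestamp is an assumption for illustration:

//   stamp = 1420070400          (2015-01-01 00:00:00 UTC)
//   localtime_1: 1420070400 % 86400 == 0      -> 00:00:00
//   localtime_2: dse = 1420070400 / 86400 = 16436
//                weekday = (4 + 16436 - 1) % 7 + 1 = 4   (Thursday)
//   localtime_3: days_since_epoch[] is scanned until an entry <= 16436
//                is found (16436, i.e. year 2015), leaving w = 0, so
//                month = 1 and monthday = 1.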
+ */ + if (is_leap(year)) { + /* use days_since_leapyear[] in a leap year */ + for (i = ARRAY_SIZE(days_since_leapyear) - 1; + i > 0 && days_since_leapyear[i] > w; --i) + /* just loop */; + r->monthday = w - days_since_leapyear[i] + 1; + } else { + for (i = ARRAY_SIZE(days_since_year) - 1; + i > 0 && days_since_year[i] > w; --i) + /* just loop */; + r->monthday = w - days_since_year[i] + 1; + } + + r->month = i + 1; +} + +static bool +time_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_time_info *info = par->matchinfo; + unsigned int packet_time; + struct xtm current_time; + s64 stamp; + + /* + * We cannot use get_seconds() instead of __net_timestamp() here. + * Suppose you have two rules: + * 1. match before 13:00 + * 2. match after 13:00 + * If you match against processing time (get_seconds) it + * may happen that the same packet matches both rules if + * it arrived at the right moment before 13:00. + */ + if (skb->tstamp.tv64 == 0) + __net_timestamp((struct sk_buff *)skb); + + stamp = ktime_to_ns(skb->tstamp); + stamp = div_s64(stamp, NSEC_PER_SEC); + + if (info->flags & XT_TIME_LOCAL_TZ) + /* Adjust for local timezone */ + stamp -= 60 * sys_tz.tz_minuteswest; + + /* + * xt_time will match when _all_ of the following hold: + * - 'now' is in the global time range date_start..date_end + * - 'now' is in the monthday mask + * - 'now' is in the weekday mask + * - 'now' is in the daytime range time_start..time_end + * (and by default, libxt_time will set these so as to match) + */ + + if (stamp < info->date_start || stamp > info->date_stop) + return false; + + packet_time = localtime_1(¤t_time, stamp); + + if (info->daytime_start < info->daytime_stop) { + if (packet_time < info->daytime_start || + packet_time > info->daytime_stop) + return false; + } else { + if (packet_time < info->daytime_start && + packet_time > info->daytime_stop) + return false; + } + + localtime_2(¤t_time, stamp); + + if (!(info->weekdays_match & (1 << current_time.weekday))) + return false; + + /* Do not spend time computing monthday if all days match anyway */ + if (info->monthdays_match != XT_TIME_ALL_MONTHDAYS) { + localtime_3(¤t_time, stamp); + if (!(info->monthdays_match & (1 << current_time.monthday))) + return false; + } + + return true; +} + +static int time_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_time_info *info = par->matchinfo; + + if (info->daytime_start > XT_TIME_MAX_DAYTIME || + info->daytime_stop > XT_TIME_MAX_DAYTIME) { + pr_info("invalid argument - start or " + "stop time greater than 23:59:59\n"); + return -EDOM; + } + + return 0; +} + +static struct xt_match xt_time_mt_reg __read_mostly = { + .name = "time", + .family = NFPROTO_UNSPEC, + .match = time_mt, + .checkentry = time_mt_check, + .matchsize = sizeof(struct xt_time_info), + .me = THIS_MODULE, +}; + +static int __init time_mt_init(void) +{ + int minutes = sys_tz.tz_minuteswest; + + if (minutes < 0) /* east of Greenwich */ + printk(KERN_INFO KBUILD_MODNAME + ": kernel timezone is +%02d%02d\n", + -minutes / 60, -minutes % 60); + else /* west of Greenwich */ + printk(KERN_INFO KBUILD_MODNAME + ": kernel timezone is -%02d%02d\n", + minutes / 60, minutes % 60); + + return xt_register_match(&xt_time_mt_reg); +} + +static void __exit time_mt_exit(void) +{ + xt_unregister_match(&xt_time_mt_reg); +} + +module_init(time_mt_init); +module_exit(time_mt_exit); +MODULE_AUTHOR("Jan Engelhardt "); +MODULE_DESCRIPTION("Xtables: time-based matching"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_time"); 
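// The daytime window test in time_mt() above also has to handle ranges
// that wrap past midnight. A standalone sketch of that test follows;
// the function name and example values are illustrative, not part of
// the patch.

#include <stdbool.h>

// start, stop and pkt are seconds since local midnight
static bool daytime_in_range(unsigned int start, unsigned int stop,
			     unsigned int pkt)
{
	if (start < stop)                       // e.g. 09:00-17:00
		return pkt >= start && pkt <= stop;
	// start > stop: the range wraps past midnight, e.g. 23:00-06:00
	return pkt >= start || pkt <= stop;
}

// daytime_in_range(82800, 21600, 3600) == true   (01:00 is inside 23:00-06:00)
// daytime_in_range(32400, 61200, 3600) == false  (01:00 is outside 09:00-17:00)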
+MODULE_ALIAS("ip6t_time"); diff --git a/net/netfilter/xt_u32.c b/net/netfilter/xt_u32.c new file mode 100644 index 00000000..a95b5034 --- /dev/null +++ b/net/netfilter/xt_u32.c @@ -0,0 +1,123 @@ +/* + * xt_u32 - kernel module to match u32 packet content + * + * Original author: Don Cohen + * (C) CC Computer Consultants GmbH, 2007 + */ + +#include +#include +#include +#include +#include +#include +#include + +static bool u32_match_it(const struct xt_u32 *data, + const struct sk_buff *skb) +{ + const struct xt_u32_test *ct; + unsigned int testind; + unsigned int nnums; + unsigned int nvals; + unsigned int i; + __be32 n; + u_int32_t pos; + u_int32_t val; + u_int32_t at; + + /* + * Small example: "0 >> 28 == 4 && 8 & 0xFF0000 >> 16 = 6, 17" + * (=IPv4 and (TCP or UDP)). Outer loop runs over the "&&" operands. + */ + for (testind = 0; testind < data->ntests; ++testind) { + ct = &data->tests[testind]; + at = 0; + pos = ct->location[0].number; + + if (skb->len < 4 || pos > skb->len - 4) + return false; + + if (skb_copy_bits(skb, pos, &n, sizeof(n)) < 0) + BUG(); + val = ntohl(n); + nnums = ct->nnums; + + /* Inner loop runs over "&", "<<", ">>" and "@" operands */ + for (i = 1; i < nnums; ++i) { + u_int32_t number = ct->location[i].number; + switch (ct->location[i].nextop) { + case XT_U32_AND: + val &= number; + break; + case XT_U32_LEFTSH: + val <<= number; + break; + case XT_U32_RIGHTSH: + val >>= number; + break; + case XT_U32_AT: + if (at + val < at) + return false; + at += val; + pos = number; + if (at + 4 < at || skb->len < at + 4 || + pos > skb->len - at - 4) + return false; + + if (skb_copy_bits(skb, at + pos, &n, + sizeof(n)) < 0) + BUG(); + val = ntohl(n); + break; + } + } + + /* Run over the "," and ":" operands */ + nvals = ct->nvalues; + for (i = 0; i < nvals; ++i) + if (ct->value[i].min <= val && val <= ct->value[i].max) + break; + + if (i >= ct->nvalues) + return false; + } + + return true; +} + +static bool u32_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_u32 *data = par->matchinfo; + bool ret; + + ret = u32_match_it(data, skb); + return ret ^ data->invert; +} + +static struct xt_match xt_u32_mt_reg __read_mostly = { + .name = "u32", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = u32_mt, + .matchsize = sizeof(struct xt_u32), + .me = THIS_MODULE, +}; + +static int __init u32_mt_init(void) +{ + return xt_register_match(&xt_u32_mt_reg); +} + +static void __exit u32_mt_exit(void) +{ + xt_unregister_match(&xt_u32_mt_reg); +} + +module_init(u32_mt_init); +module_exit(u32_mt_exit); +MODULE_AUTHOR("Jan Engelhardt "); +MODULE_DESCRIPTION("Xtables: arbitrary byte matching"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_u32"); +MODULE_ALIAS("ip6t_u32"); diff --git a/net/netlabel/Kconfig b/net/netlabel/Kconfig new file mode 100644 index 00000000..56958c85 --- /dev/null +++ b/net/netlabel/Kconfig @@ -0,0 +1,17 @@ +# +# NetLabel configuration +# + +config NETLABEL + bool "NetLabel subsystem support" + depends on SECURITY + default n + ---help--- + NetLabel provides support for explicit network packet labeling + protocols such as CIPSO and RIPSO. For more information see + Documentation/netlabel as well as the NetLabel SourceForge project + for configuration tools and additional documentation. + + * http://netlabel.sf.net + + If you are unsure, say N. diff --git a/net/netlabel/Makefile b/net/netlabel/Makefile new file mode 100644 index 00000000..ea750e9d --- /dev/null +++ b/net/netlabel/Makefile @@ -0,0 +1,17 @@ +# +# Makefile for the NetLabel subsystem. 
+# +# Feb 9, 2006, Paul Moore +# + +# base objects +obj-y := netlabel_user.o netlabel_kapi.o +obj-y += netlabel_domainhash.o netlabel_addrlist.o + +# management objects +obj-y += netlabel_mgmt.o + +# protocol modules +obj-y += netlabel_unlabeled.o +obj-y += netlabel_cipso_v4.o + diff --git a/net/netlabel/netlabel_addrlist.c b/net/netlabel/netlabel_addrlist.c new file mode 100644 index 00000000..c0519139 --- /dev/null +++ b/net/netlabel/netlabel_addrlist.c @@ -0,0 +1,384 @@ +/* + * NetLabel Network Address Lists + * + * This file contains network address list functions used to manage ordered + * lists of network addresses for use by the NetLabel subsystem. The NetLabel + * system manages static and dynamic label mappings for network protocols such + * as CIPSO and RIPSO. + * + * Author: Paul Moore + * + */ + +/* + * (c) Copyright Hewlett-Packard Development Company, L.P., 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "netlabel_addrlist.h" + +/* + * Address List Functions + */ + +/** + * netlbl_af4list_search - Search for a matching IPv4 address entry + * @addr: IPv4 address + * @head: the list head + * + * Description: + * Searches the IPv4 address list given by @head. If a matching address entry + * is found it is returned, otherwise NULL is returned. The caller is + * responsible for calling the rcu_read_[un]lock() functions. + * + */ +struct netlbl_af4list *netlbl_af4list_search(__be32 addr, + struct list_head *head) +{ + struct netlbl_af4list *iter; + + list_for_each_entry_rcu(iter, head, list) + if (iter->valid && (addr & iter->mask) == iter->addr) + return iter; + + return NULL; +} + +/** + * netlbl_af4list_search_exact - Search for an exact IPv4 address entry + * @addr: IPv4 address + * @mask: IPv4 address mask + * @head: the list head + * + * Description: + * Searches the IPv4 address list given by @head. If an exact match if found + * it is returned, otherwise NULL is returned. The caller is responsible for + * calling the rcu_read_[un]lock() functions. + * + */ +struct netlbl_af4list *netlbl_af4list_search_exact(__be32 addr, + __be32 mask, + struct list_head *head) +{ + struct netlbl_af4list *iter; + + list_for_each_entry_rcu(iter, head, list) + if (iter->valid && iter->addr == addr && iter->mask == mask) + return iter; + + return NULL; +} + + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +/** + * netlbl_af6list_search - Search for a matching IPv6 address entry + * @addr: IPv6 address + * @head: the list head + * + * Description: + * Searches the IPv6 address list given by @head. If a matching address entry + * is found it is returned, otherwise NULL is returned. The caller is + * responsible for calling the rcu_read_[un]lock() functions. 
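
/*
 * Illustrative aside, not part of the patch above: netlbl_af4list_search()
 * returns the first entry whose network/mask pair covers the address
 * ((addr & mask) == net), so the position of an entry in the list decides
 * which mapping wins. A standalone sketch of the same containment test with
 * a plain array in place of the RCU list; names and addresses are invented.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

struct entry { uint32_t addr, mask; const char *label; };	/* network order */

static const struct entry *lookup(uint32_t addr, const struct entry *tbl, int n)
{
	for (int i = 0; i < n; i++)
		if ((addr & tbl[i].mask) == tbl[i].addr)
			return &tbl[i];		/* first covering entry wins */
	return NULL;
}

int main(void)
{
	struct entry tbl[] = {
		{ htonl(0xC0A80100), htonl(0xFFFFFF00), "192.168.1.0/24" },
		{ htonl(0x00000000), htonl(0x00000000), "default" },
	};
	const struct entry *e = lookup(htonl(0xC0A80105), tbl, 2);

	printf("%s\n", e ? e->label : "no match");	/* prints 192.168.1.0/24 */
	return 0;
}
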
+ * + */ +struct netlbl_af6list *netlbl_af6list_search(const struct in6_addr *addr, + struct list_head *head) +{ + struct netlbl_af6list *iter; + + list_for_each_entry_rcu(iter, head, list) + if (iter->valid && + ipv6_masked_addr_cmp(&iter->addr, &iter->mask, addr) == 0) + return iter; + + return NULL; +} + +/** + * netlbl_af6list_search_exact - Search for an exact IPv6 address entry + * @addr: IPv6 address + * @mask: IPv6 address mask + * @head: the list head + * + * Description: + * Searches the IPv6 address list given by @head. If an exact match if found + * it is returned, otherwise NULL is returned. The caller is responsible for + * calling the rcu_read_[un]lock() functions. + * + */ +struct netlbl_af6list *netlbl_af6list_search_exact(const struct in6_addr *addr, + const struct in6_addr *mask, + struct list_head *head) +{ + struct netlbl_af6list *iter; + + list_for_each_entry_rcu(iter, head, list) + if (iter->valid && + ipv6_addr_equal(&iter->addr, addr) && + ipv6_addr_equal(&iter->mask, mask)) + return iter; + + return NULL; +} +#endif /* IPv6 */ + +/** + * netlbl_af4list_add - Add a new IPv4 address entry to a list + * @entry: address entry + * @head: the list head + * + * Description: + * Add a new address entry to the list pointed to by @head. On success zero is + * returned, otherwise a negative value is returned. The caller is responsible + * for calling the necessary locking functions. + * + */ +int netlbl_af4list_add(struct netlbl_af4list *entry, struct list_head *head) +{ + struct netlbl_af4list *iter; + + iter = netlbl_af4list_search(entry->addr, head); + if (iter != NULL && + iter->addr == entry->addr && iter->mask == entry->mask) + return -EEXIST; + + /* in order to speed up address searches through the list (the common + * case) we need to keep the list in order based on the size of the + * address mask such that the entry with the widest mask (smallest + * numerical value) appears first in the list */ + list_for_each_entry_rcu(iter, head, list) + if (iter->valid && + ntohl(entry->mask) > ntohl(iter->mask)) { + __list_add_rcu(&entry->list, + iter->list.prev, + &iter->list); + return 0; + } + list_add_tail_rcu(&entry->list, head); + return 0; +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +/** + * netlbl_af6list_add - Add a new IPv6 address entry to a list + * @entry: address entry + * @head: the list head + * + * Description: + * Add a new address entry to the list pointed to by @head. On success zero is + * returned, otherwise a negative value is returned. The caller is responsible + * for calling the necessary locking functions. 
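
/*
 * Illustrative aside, not part of the patch above: the add helpers keep each
 * list ordered by mask. A new entry is linked in front of the first valid
 * entry whose mask is numerically smaller, so together with the first-match
 * search this makes the most specific covering entry the one that is found.
 * A compact sketch of that ordered insert using an array of host-order
 * masks; names are invented.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_ENTRIES 8

static uint32_t masks[MAX_ENTRIES];	/* host order to keep the compare obvious */
static int nmasks;

static void insert_sorted(uint32_t mask)
{
	int i = 0;

	if (nmasks >= MAX_ENTRIES)
		return;
	while (i < nmasks && masks[i] >= mask)	/* skip entries at least as specific */
		i++;
	memmove(&masks[i + 1], &masks[i], (nmasks - i) * sizeof(masks[0]));
	masks[i] = mask;
	nmasks++;
}

int main(void)
{
	insert_sorted(0x00000000);		/* /0  */
	insert_sorted(0xFFFFFFFF);		/* /32 */
	insert_sorted(0xFFFFFF00);		/* /24 */
	for (int i = 0; i < nmasks; i++)
		printf("0x%08X\n", masks[i]);	/* /32, /24, /0 */
	return 0;
}
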
+ * + */ +int netlbl_af6list_add(struct netlbl_af6list *entry, struct list_head *head) +{ + struct netlbl_af6list *iter; + + iter = netlbl_af6list_search(&entry->addr, head); + if (iter != NULL && + ipv6_addr_equal(&iter->addr, &entry->addr) && + ipv6_addr_equal(&iter->mask, &entry->mask)) + return -EEXIST; + + /* in order to speed up address searches through the list (the common + * case) we need to keep the list in order based on the size of the + * address mask such that the entry with the widest mask (smallest + * numerical value) appears first in the list */ + list_for_each_entry_rcu(iter, head, list) + if (iter->valid && + ipv6_addr_cmp(&entry->mask, &iter->mask) > 0) { + __list_add_rcu(&entry->list, + iter->list.prev, + &iter->list); + return 0; + } + list_add_tail_rcu(&entry->list, head); + return 0; +} +#endif /* IPv6 */ + +/** + * netlbl_af4list_remove_entry - Remove an IPv4 address entry + * @entry: address entry + * + * Description: + * Remove the specified IP address entry. The caller is responsible for + * calling the necessary locking functions. + * + */ +void netlbl_af4list_remove_entry(struct netlbl_af4list *entry) +{ + entry->valid = 0; + list_del_rcu(&entry->list); +} + +/** + * netlbl_af4list_remove - Remove an IPv4 address entry + * @addr: IP address + * @mask: IP address mask + * @head: the list head + * + * Description: + * Remove an IP address entry from the list pointed to by @head. Returns the + * entry on success, NULL on failure. The caller is responsible for calling + * the necessary locking functions. + * + */ +struct netlbl_af4list *netlbl_af4list_remove(__be32 addr, __be32 mask, + struct list_head *head) +{ + struct netlbl_af4list *entry; + + entry = netlbl_af4list_search_exact(addr, mask, head); + if (entry == NULL) + return NULL; + netlbl_af4list_remove_entry(entry); + return entry; +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +/** + * netlbl_af6list_remove_entry - Remove an IPv6 address entry + * @entry: address entry + * + * Description: + * Remove the specified IP address entry. The caller is responsible for + * calling the necessary locking functions. + * + */ +void netlbl_af6list_remove_entry(struct netlbl_af6list *entry) +{ + entry->valid = 0; + list_del_rcu(&entry->list); +} + +/** + * netlbl_af6list_remove - Remove an IPv6 address entry + * @addr: IP address + * @mask: IP address mask + * @head: the list head + * + * Description: + * Remove an IP address entry from the list pointed to by @head. Returns the + * entry on success, NULL on failure. The caller is responsible for calling + * the necessary locking functions. + * + */ +struct netlbl_af6list *netlbl_af6list_remove(const struct in6_addr *addr, + const struct in6_addr *mask, + struct list_head *head) +{ + struct netlbl_af6list *entry; + + entry = netlbl_af6list_search_exact(addr, mask, head); + if (entry == NULL) + return NULL; + netlbl_af6list_remove_entry(entry); + return entry; +} +#endif /* IPv6 */ + +/* + * Audit Helper Functions + */ + +#ifdef CONFIG_AUDIT +/** + * netlbl_af4list_audit_addr - Audit an IPv4 address + * @audit_buf: audit buffer + * @src: true if source address, false if destination + * @dev: network interface + * @addr: IP address + * @mask: IP address mask + * + * Description: + * Write the IPv4 address and address mask, if necessary, to @audit_buf. + * + */ +void netlbl_af4list_audit_addr(struct audit_buffer *audit_buf, + int src, const char *dev, + __be32 addr, __be32 mask) +{ + u32 mask_val = ntohl(mask); + char *dir = (src ? 
"src" : "dst"); + + if (dev != NULL) + audit_log_format(audit_buf, " netif=%s", dev); + audit_log_format(audit_buf, " %s=%pI4", dir, &addr); + if (mask_val != 0xffffffff) { + u32 mask_len = 0; + while (mask_val > 0) { + mask_val <<= 1; + mask_len++; + } + audit_log_format(audit_buf, " %s_prefixlen=%d", dir, mask_len); + } +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +/** + * netlbl_af6list_audit_addr - Audit an IPv6 address + * @audit_buf: audit buffer + * @src: true if source address, false if destination + * @dev: network interface + * @addr: IP address + * @mask: IP address mask + * + * Description: + * Write the IPv6 address and address mask, if necessary, to @audit_buf. + * + */ +void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf, + int src, + const char *dev, + const struct in6_addr *addr, + const struct in6_addr *mask) +{ + char *dir = (src ? "src" : "dst"); + + if (dev != NULL) + audit_log_format(audit_buf, " netif=%s", dev); + audit_log_format(audit_buf, " %s=%pI6", dir, addr); + if (ntohl(mask->s6_addr32[3]) != 0xffffffff) { + u32 mask_len = 0; + u32 mask_val; + int iter = -1; + while (ntohl(mask->s6_addr32[++iter]) == 0xffffffff) + mask_len += 32; + mask_val = ntohl(mask->s6_addr32[iter]); + while (mask_val > 0) { + mask_val <<= 1; + mask_len++; + } + audit_log_format(audit_buf, " %s_prefixlen=%d", dir, mask_len); + } +} +#endif /* IPv6 */ +#endif /* CONFIG_AUDIT */ diff --git a/net/netlabel/netlabel_addrlist.h b/net/netlabel/netlabel_addrlist.h new file mode 100644 index 00000000..2b9644e1 --- /dev/null +++ b/net/netlabel/netlabel_addrlist.h @@ -0,0 +1,209 @@ +/* + * NetLabel Network Address Lists + * + * This file contains network address list functions used to manage ordered + * lists of network addresses for use by the NetLabel subsystem. The NetLabel + * system manages static and dynamic label mappings for network protocols such + * as CIPSO and RIPSO. + * + * Author: Paul Moore + * + */ + +/* + * (c) Copyright Hewlett-Packard Development Company, L.P., 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#ifndef _NETLABEL_ADDRLIST_H +#define _NETLABEL_ADDRLIST_H + +#include +#include +#include +#include +#include + +/** + * struct netlbl_af4list - NetLabel IPv4 address list + * @addr: IPv4 address + * @mask: IPv4 address mask + * @valid: valid flag + * @list: list structure, used internally + */ +struct netlbl_af4list { + __be32 addr; + __be32 mask; + + u32 valid; + struct list_head list; +}; + +/** + * struct netlbl_af6list - NetLabel IPv6 address list + * @addr: IPv6 address + * @mask: IPv6 address mask + * @valid: valid flag + * @list: list structure, used internally + */ +struct netlbl_af6list { + struct in6_addr addr; + struct in6_addr mask; + + u32 valid; + struct list_head list; +}; + +#define __af4list_entry(ptr) container_of(ptr, struct netlbl_af4list, list) + +static inline struct netlbl_af4list *__af4list_valid(struct list_head *s, + struct list_head *h) +{ + struct list_head *i = s; + struct netlbl_af4list *n = __af4list_entry(s); + while (i != h && !n->valid) { + i = i->next; + n = __af4list_entry(i); + } + return n; +} + +static inline struct netlbl_af4list *__af4list_valid_rcu(struct list_head *s, + struct list_head *h) +{ + struct list_head *i = s; + struct netlbl_af4list *n = __af4list_entry(s); + while (i != h && !n->valid) { + i = rcu_dereference(i->next); + n = __af4list_entry(i); + } + return n; +} + +#define netlbl_af4list_foreach(iter, head) \ + for (iter = __af4list_valid((head)->next, head); \ + &iter->list != (head); \ + iter = __af4list_valid(iter->list.next, head)) + +#define netlbl_af4list_foreach_rcu(iter, head) \ + for (iter = __af4list_valid_rcu((head)->next, head); \ + &iter->list != (head); \ + iter = __af4list_valid_rcu(iter->list.next, head)) + +#define netlbl_af4list_foreach_safe(iter, tmp, head) \ + for (iter = __af4list_valid((head)->next, head), \ + tmp = __af4list_valid(iter->list.next, head); \ + &iter->list != (head); \ + iter = tmp, tmp = __af4list_valid(iter->list.next, head)) + +int netlbl_af4list_add(struct netlbl_af4list *entry, + struct list_head *head); +struct netlbl_af4list *netlbl_af4list_remove(__be32 addr, __be32 mask, + struct list_head *head); +void netlbl_af4list_remove_entry(struct netlbl_af4list *entry); +struct netlbl_af4list *netlbl_af4list_search(__be32 addr, + struct list_head *head); +struct netlbl_af4list *netlbl_af4list_search_exact(__be32 addr, + __be32 mask, + struct list_head *head); + +#ifdef CONFIG_AUDIT +void netlbl_af4list_audit_addr(struct audit_buffer *audit_buf, + int src, const char *dev, + __be32 addr, __be32 mask); +#else +static inline void netlbl_af4list_audit_addr(struct audit_buffer *audit_buf, + int src, const char *dev, + __be32 addr, __be32 mask) +{ +} +#endif + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + +#define __af6list_entry(ptr) container_of(ptr, struct netlbl_af6list, list) + +static inline struct netlbl_af6list *__af6list_valid(struct list_head *s, + struct list_head *h) +{ + struct list_head *i = s; + struct netlbl_af6list *n = __af6list_entry(s); + while (i != h && !n->valid) { + i = i->next; + n = __af6list_entry(i); + } + return n; +} + +static inline struct netlbl_af6list *__af6list_valid_rcu(struct list_head *s, + struct list_head *h) +{ + struct list_head *i = s; + struct netlbl_af6list *n = __af6list_entry(s); + while (i != h && !n->valid) { + i = 
rcu_dereference(i->next); + n = __af6list_entry(i); + } + return n; +} + +#define netlbl_af6list_foreach(iter, head) \ + for (iter = __af6list_valid((head)->next, head); \ + &iter->list != (head); \ + iter = __af6list_valid(iter->list.next, head)) + +#define netlbl_af6list_foreach_rcu(iter, head) \ + for (iter = __af6list_valid_rcu((head)->next, head); \ + &iter->list != (head); \ + iter = __af6list_valid_rcu(iter->list.next, head)) + +#define netlbl_af6list_foreach_safe(iter, tmp, head) \ + for (iter = __af6list_valid((head)->next, head), \ + tmp = __af6list_valid(iter->list.next, head); \ + &iter->list != (head); \ + iter = tmp, tmp = __af6list_valid(iter->list.next, head)) + +int netlbl_af6list_add(struct netlbl_af6list *entry, + struct list_head *head); +struct netlbl_af6list *netlbl_af6list_remove(const struct in6_addr *addr, + const struct in6_addr *mask, + struct list_head *head); +void netlbl_af6list_remove_entry(struct netlbl_af6list *entry); +struct netlbl_af6list *netlbl_af6list_search(const struct in6_addr *addr, + struct list_head *head); +struct netlbl_af6list *netlbl_af6list_search_exact(const struct in6_addr *addr, + const struct in6_addr *mask, + struct list_head *head); + +#ifdef CONFIG_AUDIT +void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf, + int src, + const char *dev, + const struct in6_addr *addr, + const struct in6_addr *mask); +#else +static inline void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf, + int src, + const char *dev, + const struct in6_addr *addr, + const struct in6_addr *mask) +{ +} +#endif +#endif /* IPV6 */ + +#endif diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c new file mode 100644 index 00000000..bae5756b --- /dev/null +++ b/net/netlabel/netlabel_cipso_v4.c @@ -0,0 +1,787 @@ +/* + * NetLabel CIPSO/IPv4 Support + * + * This file defines the CIPSO/IPv4 functions for the NetLabel system. The + * NetLabel system manages static and dynamic label mappings for network + * protocols such as CIPSO and RIPSO. + * + * Author: Paul Moore + * + */ + +/* + * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "netlabel_user.h" +#include "netlabel_cipso_v4.h" +#include "netlabel_mgmt.h" +#include "netlabel_domainhash.h" + +/* Argument struct for cipso_v4_doi_walk() */ +struct netlbl_cipsov4_doiwalk_arg { + struct netlink_callback *nl_cb; + struct sk_buff *skb; + u32 seq; +}; + +/* Argument struct for netlbl_domhsh_walk() */ +struct netlbl_domhsh_walk_arg { + struct netlbl_audit *audit_info; + u32 doi; +}; + +/* NetLabel Generic NETLINK CIPSOv4 family */ +static struct genl_family netlbl_cipsov4_gnl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = NETLBL_NLTYPE_CIPSOV4_NAME, + .version = NETLBL_PROTO_VERSION, + .maxattr = NLBL_CIPSOV4_A_MAX, +}; + +/* NetLabel Netlink attribute policy */ +static const struct nla_policy netlbl_cipsov4_genl_policy[NLBL_CIPSOV4_A_MAX + 1] = { + [NLBL_CIPSOV4_A_DOI] = { .type = NLA_U32 }, + [NLBL_CIPSOV4_A_MTYPE] = { .type = NLA_U32 }, + [NLBL_CIPSOV4_A_TAG] = { .type = NLA_U8 }, + [NLBL_CIPSOV4_A_TAGLST] = { .type = NLA_NESTED }, + [NLBL_CIPSOV4_A_MLSLVLLOC] = { .type = NLA_U32 }, + [NLBL_CIPSOV4_A_MLSLVLREM] = { .type = NLA_U32 }, + [NLBL_CIPSOV4_A_MLSLVL] = { .type = NLA_NESTED }, + [NLBL_CIPSOV4_A_MLSLVLLST] = { .type = NLA_NESTED }, + [NLBL_CIPSOV4_A_MLSCATLOC] = { .type = NLA_U32 }, + [NLBL_CIPSOV4_A_MLSCATREM] = { .type = NLA_U32 }, + [NLBL_CIPSOV4_A_MLSCAT] = { .type = NLA_NESTED }, + [NLBL_CIPSOV4_A_MLSCATLST] = { .type = NLA_NESTED }, +}; + +/* + * Helper Functions + */ + +/** + * netlbl_cipsov4_add_common - Parse the common sections of a ADD message + * @info: the Generic NETLINK info block + * @doi_def: the CIPSO V4 DOI definition + * + * Description: + * Parse the common sections of a ADD message and fill in the related values + * in @doi_def. Returns zero on success, negative values on failure. + * + */ +static int netlbl_cipsov4_add_common(struct genl_info *info, + struct cipso_v4_doi *doi_def) +{ + struct nlattr *nla; + int nla_rem; + u32 iter = 0; + + doi_def->doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]); + + if (nla_validate_nested(info->attrs[NLBL_CIPSOV4_A_TAGLST], + NLBL_CIPSOV4_A_MAX, + netlbl_cipsov4_genl_policy) != 0) + return -EINVAL; + + nla_for_each_nested(nla, info->attrs[NLBL_CIPSOV4_A_TAGLST], nla_rem) + if (nla_type(nla) == NLBL_CIPSOV4_A_TAG) { + if (iter >= CIPSO_V4_TAG_MAXCNT) + return -EINVAL; + doi_def->tags[iter++] = nla_get_u8(nla); + } + while (iter < CIPSO_V4_TAG_MAXCNT) + doi_def->tags[iter++] = CIPSO_V4_TAG_INVALID; + + return 0; +} + +/* + * NetLabel Command Handlers + */ + +/** + * netlbl_cipsov4_add_std - Adds a CIPSO V4 DOI definition + * @info: the Generic NETLINK info block + * @audit_info: NetLabel audit information + * + * Description: + * Create a new CIPSO_V4_MAP_TRANS DOI definition based on the given ADD + * message and add it to the CIPSO V4 engine. Return zero on success and + * non-zero on error. 
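
/*
 * Illustrative aside, not part of the patch above: netlbl_cipsov4_add_common()
 * walks the nested tag-list attribute, copies at most CIPSO_V4_TAG_MAXCNT tag
 * types into doi_def->tags[] and pads the remaining slots with
 * CIPSO_V4_TAG_INVALID so later code can stop at the first invalid slot. The
 * same fill-and-pad pattern over a plain array; the constants and values
 * below are invented stand-ins.
 */
#include <stdio.h>

#define TAG_MAXCNT  5
#define TAG_INVALID 0

int main(void)
{
	const unsigned char wanted[] = { 1, 2 };	/* tags requested by userspace */
	unsigned char tags[TAG_MAXCNT];
	unsigned int i = 0;

	for (unsigned int j = 0; j < sizeof(wanted) && i < TAG_MAXCNT; j++)
		tags[i++] = wanted[j];
	while (i < TAG_MAXCNT)
		tags[i++] = TAG_INVALID;		/* pad unused slots */

	for (i = 0; i < TAG_MAXCNT; i++)
		printf("%u ", tags[i]);			/* 1 2 0 0 0 */
	printf("\n");
	return 0;
}
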
+ * + */ +static int netlbl_cipsov4_add_std(struct genl_info *info, + struct netlbl_audit *audit_info) +{ + int ret_val = -EINVAL; + struct cipso_v4_doi *doi_def = NULL; + struct nlattr *nla_a; + struct nlattr *nla_b; + int nla_a_rem; + int nla_b_rem; + u32 iter; + + if (!info->attrs[NLBL_CIPSOV4_A_TAGLST] || + !info->attrs[NLBL_CIPSOV4_A_MLSLVLLST]) + return -EINVAL; + + if (nla_validate_nested(info->attrs[NLBL_CIPSOV4_A_MLSLVLLST], + NLBL_CIPSOV4_A_MAX, + netlbl_cipsov4_genl_policy) != 0) + return -EINVAL; + + doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL); + if (doi_def == NULL) + return -ENOMEM; + doi_def->map.std = kzalloc(sizeof(*doi_def->map.std), GFP_KERNEL); + if (doi_def->map.std == NULL) { + ret_val = -ENOMEM; + goto add_std_failure; + } + doi_def->type = CIPSO_V4_MAP_TRANS; + + ret_val = netlbl_cipsov4_add_common(info, doi_def); + if (ret_val != 0) + goto add_std_failure; + ret_val = -EINVAL; + + nla_for_each_nested(nla_a, + info->attrs[NLBL_CIPSOV4_A_MLSLVLLST], + nla_a_rem) + if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSLVL) { + if (nla_validate_nested(nla_a, + NLBL_CIPSOV4_A_MAX, + netlbl_cipsov4_genl_policy) != 0) + goto add_std_failure; + nla_for_each_nested(nla_b, nla_a, nla_b_rem) + switch (nla_type(nla_b)) { + case NLBL_CIPSOV4_A_MLSLVLLOC: + if (nla_get_u32(nla_b) > + CIPSO_V4_MAX_LOC_LVLS) + goto add_std_failure; + if (nla_get_u32(nla_b) >= + doi_def->map.std->lvl.local_size) + doi_def->map.std->lvl.local_size = + nla_get_u32(nla_b) + 1; + break; + case NLBL_CIPSOV4_A_MLSLVLREM: + if (nla_get_u32(nla_b) > + CIPSO_V4_MAX_REM_LVLS) + goto add_std_failure; + if (nla_get_u32(nla_b) >= + doi_def->map.std->lvl.cipso_size) + doi_def->map.std->lvl.cipso_size = + nla_get_u32(nla_b) + 1; + break; + } + } + doi_def->map.std->lvl.local = kcalloc(doi_def->map.std->lvl.local_size, + sizeof(u32), + GFP_KERNEL); + if (doi_def->map.std->lvl.local == NULL) { + ret_val = -ENOMEM; + goto add_std_failure; + } + doi_def->map.std->lvl.cipso = kcalloc(doi_def->map.std->lvl.cipso_size, + sizeof(u32), + GFP_KERNEL); + if (doi_def->map.std->lvl.cipso == NULL) { + ret_val = -ENOMEM; + goto add_std_failure; + } + for (iter = 0; iter < doi_def->map.std->lvl.local_size; iter++) + doi_def->map.std->lvl.local[iter] = CIPSO_V4_INV_LVL; + for (iter = 0; iter < doi_def->map.std->lvl.cipso_size; iter++) + doi_def->map.std->lvl.cipso[iter] = CIPSO_V4_INV_LVL; + nla_for_each_nested(nla_a, + info->attrs[NLBL_CIPSOV4_A_MLSLVLLST], + nla_a_rem) + if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSLVL) { + struct nlattr *lvl_loc; + struct nlattr *lvl_rem; + + lvl_loc = nla_find_nested(nla_a, + NLBL_CIPSOV4_A_MLSLVLLOC); + lvl_rem = nla_find_nested(nla_a, + NLBL_CIPSOV4_A_MLSLVLREM); + if (lvl_loc == NULL || lvl_rem == NULL) + goto add_std_failure; + doi_def->map.std->lvl.local[nla_get_u32(lvl_loc)] = + nla_get_u32(lvl_rem); + doi_def->map.std->lvl.cipso[nla_get_u32(lvl_rem)] = + nla_get_u32(lvl_loc); + } + + if (info->attrs[NLBL_CIPSOV4_A_MLSCATLST]) { + if (nla_validate_nested(info->attrs[NLBL_CIPSOV4_A_MLSCATLST], + NLBL_CIPSOV4_A_MAX, + netlbl_cipsov4_genl_policy) != 0) + goto add_std_failure; + + nla_for_each_nested(nla_a, + info->attrs[NLBL_CIPSOV4_A_MLSCATLST], + nla_a_rem) + if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSCAT) { + if (nla_validate_nested(nla_a, + NLBL_CIPSOV4_A_MAX, + netlbl_cipsov4_genl_policy) != 0) + goto add_std_failure; + nla_for_each_nested(nla_b, nla_a, nla_b_rem) + switch (nla_type(nla_b)) { + case NLBL_CIPSOV4_A_MLSCATLOC: + if (nla_get_u32(nla_b) > + CIPSO_V4_MAX_LOC_CATS) + goto add_std_failure; 
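
/*
 * Illustrative aside, not part of the patch above: netlbl_cipsov4_add_std()
 * makes two passes over the MLS level list. The first pass only records the
 * largest local and remote values so lvl.local[] and lvl.cipso[] can be
 * sized; the second fills a forward (local to remote) and a reverse (remote
 * to local) table. A compact userspace sketch of that two-pass pattern;
 * names and numbers are invented, and calloc()'s zero fill stands in for the
 * kernel's seeding of both tables with an invalid marker.
 */
#include <stdio.h>
#include <stdlib.h>

struct pair { unsigned int local, remote; };

int main(void)
{
	const struct pair map[] = { {0, 10}, {1, 20}, {3, 30} };
	const unsigned int n = sizeof(map) / sizeof(map[0]);
	unsigned int max_local = 0, max_remote = 0;

	for (unsigned int i = 0; i < n; i++) {	/* pass 1: find the array sizes */
		if (map[i].local > max_local)
			max_local = map[i].local;
		if (map[i].remote > max_remote)
			max_remote = map[i].remote;
	}

	unsigned int *fwd = calloc(max_local + 1, sizeof(*fwd));
	unsigned int *rev = calloc(max_remote + 1, sizeof(*rev));
	if (!fwd || !rev)
		return 1;

	for (unsigned int i = 0; i < n; i++) {	/* pass 2: fill both lookup tables */
		fwd[map[i].local] = map[i].remote;
		rev[map[i].remote] = map[i].local;
	}

	printf("local 3 -> remote %u, remote 20 -> local %u\n", fwd[3], rev[20]);
	free(fwd);
	free(rev);
	return 0;
}
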
+ if (nla_get_u32(nla_b) >= + doi_def->map.std->cat.local_size) + doi_def->map.std->cat.local_size = + nla_get_u32(nla_b) + 1; + break; + case NLBL_CIPSOV4_A_MLSCATREM: + if (nla_get_u32(nla_b) > + CIPSO_V4_MAX_REM_CATS) + goto add_std_failure; + if (nla_get_u32(nla_b) >= + doi_def->map.std->cat.cipso_size) + doi_def->map.std->cat.cipso_size = + nla_get_u32(nla_b) + 1; + break; + } + } + doi_def->map.std->cat.local = kcalloc( + doi_def->map.std->cat.local_size, + sizeof(u32), + GFP_KERNEL); + if (doi_def->map.std->cat.local == NULL) { + ret_val = -ENOMEM; + goto add_std_failure; + } + doi_def->map.std->cat.cipso = kcalloc( + doi_def->map.std->cat.cipso_size, + sizeof(u32), + GFP_KERNEL); + if (doi_def->map.std->cat.cipso == NULL) { + ret_val = -ENOMEM; + goto add_std_failure; + } + for (iter = 0; iter < doi_def->map.std->cat.local_size; iter++) + doi_def->map.std->cat.local[iter] = CIPSO_V4_INV_CAT; + for (iter = 0; iter < doi_def->map.std->cat.cipso_size; iter++) + doi_def->map.std->cat.cipso[iter] = CIPSO_V4_INV_CAT; + nla_for_each_nested(nla_a, + info->attrs[NLBL_CIPSOV4_A_MLSCATLST], + nla_a_rem) + if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSCAT) { + struct nlattr *cat_loc; + struct nlattr *cat_rem; + + cat_loc = nla_find_nested(nla_a, + NLBL_CIPSOV4_A_MLSCATLOC); + cat_rem = nla_find_nested(nla_a, + NLBL_CIPSOV4_A_MLSCATREM); + if (cat_loc == NULL || cat_rem == NULL) + goto add_std_failure; + doi_def->map.std->cat.local[ + nla_get_u32(cat_loc)] = + nla_get_u32(cat_rem); + doi_def->map.std->cat.cipso[ + nla_get_u32(cat_rem)] = + nla_get_u32(cat_loc); + } + } + + ret_val = cipso_v4_doi_add(doi_def, audit_info); + if (ret_val != 0) + goto add_std_failure; + return 0; + +add_std_failure: + if (doi_def) + cipso_v4_doi_free(doi_def); + return ret_val; +} + +/** + * netlbl_cipsov4_add_pass - Adds a CIPSO V4 DOI definition + * @info: the Generic NETLINK info block + * @audit_info: NetLabel audit information + * + * Description: + * Create a new CIPSO_V4_MAP_PASS DOI definition based on the given ADD message + * and add it to the CIPSO V4 engine. Return zero on success and non-zero on + * error. + * + */ +static int netlbl_cipsov4_add_pass(struct genl_info *info, + struct netlbl_audit *audit_info) +{ + int ret_val; + struct cipso_v4_doi *doi_def = NULL; + + if (!info->attrs[NLBL_CIPSOV4_A_TAGLST]) + return -EINVAL; + + doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL); + if (doi_def == NULL) + return -ENOMEM; + doi_def->type = CIPSO_V4_MAP_PASS; + + ret_val = netlbl_cipsov4_add_common(info, doi_def); + if (ret_val != 0) + goto add_pass_failure; + + ret_val = cipso_v4_doi_add(doi_def, audit_info); + if (ret_val != 0) + goto add_pass_failure; + return 0; + +add_pass_failure: + cipso_v4_doi_free(doi_def); + return ret_val; +} + +/** + * netlbl_cipsov4_add_local - Adds a CIPSO V4 DOI definition + * @info: the Generic NETLINK info block + * @audit_info: NetLabel audit information + * + * Description: + * Create a new CIPSO_V4_MAP_LOCAL DOI definition based on the given ADD + * message and add it to the CIPSO V4 engine. Return zero on success and + * non-zero on error. 
+ * + */ +static int netlbl_cipsov4_add_local(struct genl_info *info, + struct netlbl_audit *audit_info) +{ + int ret_val; + struct cipso_v4_doi *doi_def = NULL; + + if (!info->attrs[NLBL_CIPSOV4_A_TAGLST]) + return -EINVAL; + + doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL); + if (doi_def == NULL) + return -ENOMEM; + doi_def->type = CIPSO_V4_MAP_LOCAL; + + ret_val = netlbl_cipsov4_add_common(info, doi_def); + if (ret_val != 0) + goto add_local_failure; + + ret_val = cipso_v4_doi_add(doi_def, audit_info); + if (ret_val != 0) + goto add_local_failure; + return 0; + +add_local_failure: + cipso_v4_doi_free(doi_def); + return ret_val; +} + +/** + * netlbl_cipsov4_add - Handle an ADD message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Create a new DOI definition based on the given ADD message and add it to the + * CIPSO V4 engine. Returns zero on success, negative values on failure. + * + */ +static int netlbl_cipsov4_add(struct sk_buff *skb, struct genl_info *info) + +{ + int ret_val = -EINVAL; + struct netlbl_audit audit_info; + + if (!info->attrs[NLBL_CIPSOV4_A_DOI] || + !info->attrs[NLBL_CIPSOV4_A_MTYPE]) + return -EINVAL; + + netlbl_netlink_auditinfo(skb, &audit_info); + switch (nla_get_u32(info->attrs[NLBL_CIPSOV4_A_MTYPE])) { + case CIPSO_V4_MAP_TRANS: + ret_val = netlbl_cipsov4_add_std(info, &audit_info); + break; + case CIPSO_V4_MAP_PASS: + ret_val = netlbl_cipsov4_add_pass(info, &audit_info); + break; + case CIPSO_V4_MAP_LOCAL: + ret_val = netlbl_cipsov4_add_local(info, &audit_info); + break; + } + if (ret_val == 0) + atomic_inc(&netlabel_mgmt_protocount); + + return ret_val; +} + +/** + * netlbl_cipsov4_list - Handle a LIST message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Process a user generated LIST message and respond accordingly. While the + * response message generated by the kernel is straightforward, determining + * before hand the size of the buffer to allocate is not (we have to generate + * the message to know the size). In order to keep this function sane what we + * do is allocate a buffer of NLMSG_GOODSIZE and try to fit the response in + * that size, if we fail then we restart with a larger buffer and try again. + * We continue in this manner until we hit a limit of failed attempts then we + * give up and just send an error message. Returns zero on success and + * negative values on error. 
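
/*
 * Illustrative aside, not part of the patch above: the LIST handler described
 * here cannot know the reply size in advance, so it builds the message into a
 * default-sized buffer and, when it does not fit, frees the buffer, doubles
 * the size and starts over, capping the number of attempts. The same bounded
 * grow-and-retry loop with snprintf() standing in for the netlink message
 * builder; names are invented.
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	size_t size = 16;
	char *buf = NULL;

	for (int attempt = 0; attempt < 4; attempt++) {	/* bounded retries */
		buf = malloc(size);
		if (!buf)
			return 1;
		int needed = snprintf(buf, size, "doi=%u type=%s tags=%s",
				      16u, "trans", "1,2,5");
		if (needed >= 0 && (size_t)needed < size)
			break;				/* it fit, done */
		free(buf);				/* too small: retry bigger */
		buf = NULL;
		size *= 2;
	}
	if (buf) {
		puts(buf);
		free(buf);
	}
	return 0;
}
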
+ * + */ +static int netlbl_cipsov4_list(struct sk_buff *skb, struct genl_info *info) +{ + int ret_val; + struct sk_buff *ans_skb = NULL; + u32 nlsze_mult = 1; + void *data; + u32 doi; + struct nlattr *nla_a; + struct nlattr *nla_b; + struct cipso_v4_doi *doi_def; + u32 iter; + + if (!info->attrs[NLBL_CIPSOV4_A_DOI]) { + ret_val = -EINVAL; + goto list_failure; + } + +list_start: + ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE * nlsze_mult, GFP_KERNEL); + if (ans_skb == NULL) { + ret_val = -ENOMEM; + goto list_failure; + } + data = genlmsg_put_reply(ans_skb, info, &netlbl_cipsov4_gnl_family, + 0, NLBL_CIPSOV4_C_LIST); + if (data == NULL) { + ret_val = -ENOMEM; + goto list_failure; + } + + doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]); + + rcu_read_lock(); + doi_def = cipso_v4_doi_getdef(doi); + if (doi_def == NULL) { + ret_val = -EINVAL; + goto list_failure_lock; + } + + ret_val = nla_put_u32(ans_skb, NLBL_CIPSOV4_A_MTYPE, doi_def->type); + if (ret_val != 0) + goto list_failure_lock; + + nla_a = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_TAGLST); + if (nla_a == NULL) { + ret_val = -ENOMEM; + goto list_failure_lock; + } + for (iter = 0; + iter < CIPSO_V4_TAG_MAXCNT && + doi_def->tags[iter] != CIPSO_V4_TAG_INVALID; + iter++) { + ret_val = nla_put_u8(ans_skb, + NLBL_CIPSOV4_A_TAG, + doi_def->tags[iter]); + if (ret_val != 0) + goto list_failure_lock; + } + nla_nest_end(ans_skb, nla_a); + + switch (doi_def->type) { + case CIPSO_V4_MAP_TRANS: + nla_a = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_MLSLVLLST); + if (nla_a == NULL) { + ret_val = -ENOMEM; + goto list_failure_lock; + } + for (iter = 0; + iter < doi_def->map.std->lvl.local_size; + iter++) { + if (doi_def->map.std->lvl.local[iter] == + CIPSO_V4_INV_LVL) + continue; + + nla_b = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_MLSLVL); + if (nla_b == NULL) { + ret_val = -ENOMEM; + goto list_retry; + } + ret_val = nla_put_u32(ans_skb, + NLBL_CIPSOV4_A_MLSLVLLOC, + iter); + if (ret_val != 0) + goto list_retry; + ret_val = nla_put_u32(ans_skb, + NLBL_CIPSOV4_A_MLSLVLREM, + doi_def->map.std->lvl.local[iter]); + if (ret_val != 0) + goto list_retry; + nla_nest_end(ans_skb, nla_b); + } + nla_nest_end(ans_skb, nla_a); + + nla_a = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_MLSCATLST); + if (nla_a == NULL) { + ret_val = -ENOMEM; + goto list_retry; + } + for (iter = 0; + iter < doi_def->map.std->cat.local_size; + iter++) { + if (doi_def->map.std->cat.local[iter] == + CIPSO_V4_INV_CAT) + continue; + + nla_b = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_MLSCAT); + if (nla_b == NULL) { + ret_val = -ENOMEM; + goto list_retry; + } + ret_val = nla_put_u32(ans_skb, + NLBL_CIPSOV4_A_MLSCATLOC, + iter); + if (ret_val != 0) + goto list_retry; + ret_val = nla_put_u32(ans_skb, + NLBL_CIPSOV4_A_MLSCATREM, + doi_def->map.std->cat.local[iter]); + if (ret_val != 0) + goto list_retry; + nla_nest_end(ans_skb, nla_b); + } + nla_nest_end(ans_skb, nla_a); + + break; + } + rcu_read_unlock(); + + genlmsg_end(ans_skb, data); + return genlmsg_reply(ans_skb, info); + +list_retry: + /* XXX - this limit is a guesstimate */ + if (nlsze_mult < 4) { + rcu_read_unlock(); + kfree_skb(ans_skb); + nlsze_mult *= 2; + goto list_start; + } +list_failure_lock: + rcu_read_unlock(); +list_failure: + kfree_skb(ans_skb); + return ret_val; +} + +/** + * netlbl_cipsov4_listall_cb - cipso_v4_doi_walk() callback for LISTALL + * @doi_def: the CIPSOv4 DOI definition + * @arg: the netlbl_cipsov4_doiwalk_arg structure + * + * Description: + * This function is designed to be used as a callback to the + * cipso_v4_doi_walk() 
function for use in generating a response for a LISTALL + * message. Returns the size of the message on success, negative values on + * failure. + * + */ +static int netlbl_cipsov4_listall_cb(struct cipso_v4_doi *doi_def, void *arg) +{ + int ret_val = -ENOMEM; + struct netlbl_cipsov4_doiwalk_arg *cb_arg = arg; + void *data; + + data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).pid, + cb_arg->seq, &netlbl_cipsov4_gnl_family, + NLM_F_MULTI, NLBL_CIPSOV4_C_LISTALL); + if (data == NULL) + goto listall_cb_failure; + + ret_val = nla_put_u32(cb_arg->skb, NLBL_CIPSOV4_A_DOI, doi_def->doi); + if (ret_val != 0) + goto listall_cb_failure; + ret_val = nla_put_u32(cb_arg->skb, + NLBL_CIPSOV4_A_MTYPE, + doi_def->type); + if (ret_val != 0) + goto listall_cb_failure; + + return genlmsg_end(cb_arg->skb, data); + +listall_cb_failure: + genlmsg_cancel(cb_arg->skb, data); + return ret_val; +} + +/** + * netlbl_cipsov4_listall - Handle a LISTALL message + * @skb: the NETLINK buffer + * @cb: the NETLINK callback + * + * Description: + * Process a user generated LISTALL message and respond accordingly. Returns + * zero on success and negative values on error. + * + */ +static int netlbl_cipsov4_listall(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct netlbl_cipsov4_doiwalk_arg cb_arg; + u32 doi_skip = cb->args[0]; + + cb_arg.nl_cb = cb; + cb_arg.skb = skb; + cb_arg.seq = cb->nlh->nlmsg_seq; + + cipso_v4_doi_walk(&doi_skip, netlbl_cipsov4_listall_cb, &cb_arg); + + cb->args[0] = doi_skip; + return skb->len; +} + +/** + * netlbl_cipsov4_remove_cb - netlbl_cipsov4_remove() callback for REMOVE + * @entry: LSM domain mapping entry + * @arg: the netlbl_domhsh_walk_arg structure + * + * Description: + * This function is intended for use by netlbl_cipsov4_remove() as the callback + * for the netlbl_domhsh_walk() function; it removes LSM domain map entries + * which are associated with the CIPSO DOI specified in @arg. Returns zero on + * success, negative values on failure. + * + */ +static int netlbl_cipsov4_remove_cb(struct netlbl_dom_map *entry, void *arg) +{ + struct netlbl_domhsh_walk_arg *cb_arg = arg; + + if (entry->type == NETLBL_NLTYPE_CIPSOV4 && + entry->type_def.cipsov4->doi == cb_arg->doi) + return netlbl_domhsh_remove_entry(entry, cb_arg->audit_info); + + return 0; +} + +/** + * netlbl_cipsov4_remove - Handle a REMOVE message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Process a user generated REMOVE message and respond accordingly. Returns + * zero on success, negative values on failure. 
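
/*
 * Illustrative aside, not part of the patch above: netlbl_cipsov4_listall()
 * follows the usual netlink dump convention of stashing a resume point in
 * cb->args[0] (doi_skip here), so a large listing can be delivered across
 * several callbacks, each picking up where the previous one stopped. A
 * userspace sketch of paging with a saved cursor; all names are invented.
 */
#include <stdio.h>

#define NRECORDS 7
#define PER_PAGE 3

/* emit up to PER_PAGE records starting at *cursor; return how many were sent */
static int dump_page(unsigned int *cursor)
{
	int sent = 0;

	while (*cursor < NRECORDS && sent < PER_PAGE) {
		printf("record %u\n", *cursor);
		(*cursor)++;
		sent++;
	}
	return sent;
}

int main(void)
{
	unsigned int cursor = 0;	/* plays the role of cb->args[0] */

	while (dump_page(&cursor) > 0)
		printf("-- end of one reply --\n");
	return 0;
}
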
+ * + */ +static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info) +{ + int ret_val = -EINVAL; + struct netlbl_domhsh_walk_arg cb_arg; + struct netlbl_audit audit_info; + u32 skip_bkt = 0; + u32 skip_chain = 0; + + if (!info->attrs[NLBL_CIPSOV4_A_DOI]) + return -EINVAL; + + netlbl_netlink_auditinfo(skb, &audit_info); + cb_arg.doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]); + cb_arg.audit_info = &audit_info; + ret_val = netlbl_domhsh_walk(&skip_bkt, &skip_chain, + netlbl_cipsov4_remove_cb, &cb_arg); + if (ret_val == 0 || ret_val == -ENOENT) { + ret_val = cipso_v4_doi_remove(cb_arg.doi, &audit_info); + if (ret_val == 0) + atomic_dec(&netlabel_mgmt_protocount); + } + + return ret_val; +} + +/* + * NetLabel Generic NETLINK Command Definitions + */ + +static struct genl_ops netlbl_cipsov4_ops[] = { + { + .cmd = NLBL_CIPSOV4_C_ADD, + .flags = GENL_ADMIN_PERM, + .policy = netlbl_cipsov4_genl_policy, + .doit = netlbl_cipsov4_add, + .dumpit = NULL, + }, + { + .cmd = NLBL_CIPSOV4_C_REMOVE, + .flags = GENL_ADMIN_PERM, + .policy = netlbl_cipsov4_genl_policy, + .doit = netlbl_cipsov4_remove, + .dumpit = NULL, + }, + { + .cmd = NLBL_CIPSOV4_C_LIST, + .flags = 0, + .policy = netlbl_cipsov4_genl_policy, + .doit = netlbl_cipsov4_list, + .dumpit = NULL, + }, + { + .cmd = NLBL_CIPSOV4_C_LISTALL, + .flags = 0, + .policy = netlbl_cipsov4_genl_policy, + .doit = NULL, + .dumpit = netlbl_cipsov4_listall, + }, +}; + +/* + * NetLabel Generic NETLINK Protocol Functions + */ + +/** + * netlbl_cipsov4_genl_init - Register the CIPSOv4 NetLabel component + * + * Description: + * Register the CIPSOv4 packet NetLabel component with the Generic NETLINK + * mechanism. Returns zero on success, negative values on failure. + * + */ +int __init netlbl_cipsov4_genl_init(void) +{ + return genl_register_family_with_ops(&netlbl_cipsov4_gnl_family, + netlbl_cipsov4_ops, ARRAY_SIZE(netlbl_cipsov4_ops)); +} diff --git a/net/netlabel/netlabel_cipso_v4.h b/net/netlabel/netlabel_cipso_v4.h new file mode 100644 index 00000000..af7f3355 --- /dev/null +++ b/net/netlabel/netlabel_cipso_v4.h @@ -0,0 +1,170 @@ +/* + * NetLabel CIPSO/IPv4 Support + * + * This file defines the CIPSO/IPv4 functions for the NetLabel system. The + * NetLabel system manages static and dynamic label mappings for network + * protocols such as CIPSO and RIPSO. + * + * Author: Paul Moore + * + */ + +/* + * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#ifndef _NETLABEL_CIPSO_V4 +#define _NETLABEL_CIPSO_V4 + +#include + +/* + * The following NetLabel payloads are supported by the CIPSO subsystem. + * + * o ADD: + * Sent by an application to add a new DOI mapping table. 
+ * + * Required attributes: + * + * NLBL_CIPSOV4_A_DOI + * NLBL_CIPSOV4_A_MTYPE + * NLBL_CIPSOV4_A_TAGLST + * + * If using CIPSO_V4_MAP_TRANS the following attributes are required: + * + * NLBL_CIPSOV4_A_MLSLVLLST + * NLBL_CIPSOV4_A_MLSCATLST + * + * If using CIPSO_V4_MAP_PASS or CIPSO_V4_MAP_LOCAL no additional attributes + * are required. + * + * o REMOVE: + * Sent by an application to remove a specific DOI mapping table from the + * CIPSO V4 system. + * + * Required attributes: + * + * NLBL_CIPSOV4_A_DOI + * + * o LIST: + * Sent by an application to list the details of a DOI definition. On + * success the kernel should send a response using the following format. + * + * Required attributes: + * + * NLBL_CIPSOV4_A_DOI + * + * The valid response message format depends on the type of the DOI mapping, + * the defined formats are shown below. + * + * Required attributes: + * + * NLBL_CIPSOV4_A_MTYPE + * NLBL_CIPSOV4_A_TAGLST + * + * If using CIPSO_V4_MAP_TRANS the following attributes are required: + * + * NLBL_CIPSOV4_A_MLSLVLLST + * NLBL_CIPSOV4_A_MLSCATLST + * + * If using CIPSO_V4_MAP_PASS or CIPSO_V4_MAP_LOCAL no additional attributes + * are required. + * + * o LISTALL: + * This message is sent by an application to list the valid DOIs on the + * system. When sent by an application there is no payload and the + * NLM_F_DUMP flag should be set. The kernel should respond with a series of + * the following messages. + * + * Required attributes: + * + * NLBL_CIPSOV4_A_DOI + * NLBL_CIPSOV4_A_MTYPE + * + */ + +/* NetLabel CIPSOv4 commands */ +enum { + NLBL_CIPSOV4_C_UNSPEC, + NLBL_CIPSOV4_C_ADD, + NLBL_CIPSOV4_C_REMOVE, + NLBL_CIPSOV4_C_LIST, + NLBL_CIPSOV4_C_LISTALL, + __NLBL_CIPSOV4_C_MAX, +}; + +/* NetLabel CIPSOv4 attributes */ +enum { + NLBL_CIPSOV4_A_UNSPEC, + NLBL_CIPSOV4_A_DOI, + /* (NLA_U32) + * the DOI value */ + NLBL_CIPSOV4_A_MTYPE, + /* (NLA_U32) + * the mapping table type (defined in the cipso_ipv4.h header as + * CIPSO_V4_MAP_*) */ + NLBL_CIPSOV4_A_TAG, + /* (NLA_U8) + * a CIPSO tag type, meant to be used within a NLBL_CIPSOV4_A_TAGLST + * attribute */ + NLBL_CIPSOV4_A_TAGLST, + /* (NLA_NESTED) + * the CIPSO tag list for the DOI, there must be at least one + * NLBL_CIPSOV4_A_TAG attribute, tags listed first are given higher + * priorirty when sending packets */ + NLBL_CIPSOV4_A_MLSLVLLOC, + /* (NLA_U32) + * the local MLS sensitivity level */ + NLBL_CIPSOV4_A_MLSLVLREM, + /* (NLA_U32) + * the remote MLS sensitivity level */ + NLBL_CIPSOV4_A_MLSLVL, + /* (NLA_NESTED) + * a MLS sensitivity level mapping, must contain only one attribute of + * each of the following types: NLBL_CIPSOV4_A_MLSLVLLOC and + * NLBL_CIPSOV4_A_MLSLVLREM */ + NLBL_CIPSOV4_A_MLSLVLLST, + /* (NLA_NESTED) + * the CIPSO level mappings, there must be at least one + * NLBL_CIPSOV4_A_MLSLVL attribute */ + NLBL_CIPSOV4_A_MLSCATLOC, + /* (NLA_U32) + * the local MLS category */ + NLBL_CIPSOV4_A_MLSCATREM, + /* (NLA_U32) + * the remote MLS category */ + NLBL_CIPSOV4_A_MLSCAT, + /* (NLA_NESTED) + * a MLS category mapping, must contain only one attribute of each of + * the following types: NLBL_CIPSOV4_A_MLSCATLOC and + * NLBL_CIPSOV4_A_MLSCATREM */ + NLBL_CIPSOV4_A_MLSCATLST, + /* (NLA_NESTED) + * the CIPSO category mappings, there must be at least one + * NLBL_CIPSOV4_A_MLSCAT attribute */ + __NLBL_CIPSOV4_A_MAX, +}; +#define NLBL_CIPSOV4_A_MAX (__NLBL_CIPSOV4_A_MAX - 1) + +/* NetLabel protocol functions */ +int netlbl_cipsov4_genl_init(void); + +/* Free the memory associated with a CIPSOv4 DOI definition */ +void 
netlbl_cipsov4_doi_free(struct rcu_head *entry); + +#endif diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c new file mode 100644 index 00000000..de0d8e4c --- /dev/null +++ b/net/netlabel/netlabel_domainhash.c @@ -0,0 +1,729 @@ +/* + * NetLabel Domain Hash Table + * + * This file manages the domain hash table that NetLabel uses to determine + * which network labeling protocol to use for a given domain. The NetLabel + * system manages static and dynamic label mappings for network protocols such + * as CIPSO and RIPSO. + * + * Author: Paul Moore + * + */ + +/* + * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "netlabel_mgmt.h" +#include "netlabel_addrlist.h" +#include "netlabel_domainhash.h" +#include "netlabel_user.h" + +struct netlbl_domhsh_tbl { + struct list_head *tbl; + u32 size; +}; + +/* Domain hash table */ +/* updates should be so rare that having one spinlock for the entire hash table + * should be okay */ +static DEFINE_SPINLOCK(netlbl_domhsh_lock); +#define netlbl_domhsh_rcu_deref(p) \ + rcu_dereference_check(p, rcu_read_lock_held() || \ + lockdep_is_held(&netlbl_domhsh_lock)) +static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL; +static struct netlbl_dom_map *netlbl_domhsh_def = NULL; + +/* + * Domain Hash Table Helper Functions + */ + +/** + * netlbl_domhsh_free_entry - Frees a domain hash table entry + * @entry: the entry's RCU field + * + * Description: + * This function is designed to be used as a callback to the call_rcu() + * function so that the memory allocated to a hash table entry can be released + * safely. 
+ * + */ +static void netlbl_domhsh_free_entry(struct rcu_head *entry) +{ + struct netlbl_dom_map *ptr; + struct netlbl_af4list *iter4; + struct netlbl_af4list *tmp4; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct netlbl_af6list *iter6; + struct netlbl_af6list *tmp6; +#endif /* IPv6 */ + + ptr = container_of(entry, struct netlbl_dom_map, rcu); + if (ptr->type == NETLBL_NLTYPE_ADDRSELECT) { + netlbl_af4list_foreach_safe(iter4, tmp4, + &ptr->type_def.addrsel->list4) { + netlbl_af4list_remove_entry(iter4); + kfree(netlbl_domhsh_addr4_entry(iter4)); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + netlbl_af6list_foreach_safe(iter6, tmp6, + &ptr->type_def.addrsel->list6) { + netlbl_af6list_remove_entry(iter6); + kfree(netlbl_domhsh_addr6_entry(iter6)); + } +#endif /* IPv6 */ + } + kfree(ptr->domain); + kfree(ptr); +} + +/** + * netlbl_domhsh_hash - Hashing function for the domain hash table + * @domain: the domain name to hash + * + * Description: + * This is the hashing function for the domain hash table, it returns the + * correct bucket number for the domain. The caller is responsible for + * ensuring that the hash table is protected with either a RCU read lock or the + * hash table lock. + * + */ +static u32 netlbl_domhsh_hash(const char *key) +{ + u32 iter; + u32 val; + u32 len; + + /* This is taken (with slight modification) from + * security/selinux/ss/symtab.c:symhash() */ + + for (iter = 0, val = 0, len = strlen(key); iter < len; iter++) + val = (val << 4 | (val >> (8 * sizeof(u32) - 4))) ^ key[iter]; + return val & (netlbl_domhsh_rcu_deref(netlbl_domhsh)->size - 1); +} + +/** + * netlbl_domhsh_search - Search for a domain entry + * @domain: the domain + * + * Description: + * Searches the domain hash table and returns a pointer to the hash table + * entry if found, otherwise NULL is returned. The caller is responsible for + * ensuring that the hash table is protected with either a RCU read lock or the + * hash table lock. + * + */ +static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain) +{ + u32 bkt; + struct list_head *bkt_list; + struct netlbl_dom_map *iter; + + if (domain != NULL) { + bkt = netlbl_domhsh_hash(domain); + bkt_list = &netlbl_domhsh_rcu_deref(netlbl_domhsh)->tbl[bkt]; + list_for_each_entry_rcu(iter, bkt_list, list) + if (iter->valid && strcmp(iter->domain, domain) == 0) + return iter; + } + + return NULL; +} + +/** + * netlbl_domhsh_search_def - Search for a domain entry + * @domain: the domain + * @def: return default if no match is found + * + * Description: + * Searches the domain hash table and returns a pointer to the hash table + * entry if an exact match is found, if an exact match is not present in the + * hash table then the default entry is returned if valid otherwise NULL is + * returned. The caller is responsible ensuring that the hash table is + * protected with either a RCU read lock or the hash table lock. 
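
/*
 * Illustrative aside, not part of the patch above: the bucket index in
 * netlbl_domhsh_hash() comes from a rotate-and-xor string hash masked with
 * (table size - 1), which is why netlbl_domhsh_init() forces the table size
 * to a power of two (1 << size). The same hash as a standalone function; the
 * domain string and table size used in main() are made-up inputs.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t domain_hash(const char *key, uint32_t tbl_size_pow2)
{
	uint32_t val = 0;
	size_t len = strlen(key);

	for (size_t i = 0; i < len; i++)
		/* rotate left by 4, then mix in the next character */
		val = (val << 4 | (val >> (8 * sizeof(uint32_t) - 4))) ^ key[i];
	return val & (tbl_size_pow2 - 1);
}

int main(void)
{
	printf("%u\n", domain_hash("sshd_t", 1u << 7));	/* bucket in [0,127] */
	return 0;
}
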
+ * + */ +static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain) +{ + struct netlbl_dom_map *entry; + + entry = netlbl_domhsh_search(domain); + if (entry == NULL) { + entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def); + if (entry != NULL && !entry->valid) + entry = NULL; + } + + return entry; +} + +/** + * netlbl_domhsh_audit_add - Generate an audit entry for an add event + * @entry: the entry being added + * @addr4: the IPv4 address information + * @addr6: the IPv6 address information + * @result: the result code + * @audit_info: NetLabel audit information + * + * Description: + * Generate an audit record for adding a new NetLabel/LSM mapping entry with + * the given information. Caller is responsible for holding the necessary + * locks. + * + */ +static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry, + struct netlbl_af4list *addr4, + struct netlbl_af6list *addr6, + int result, + struct netlbl_audit *audit_info) +{ + struct audit_buffer *audit_buf; + struct cipso_v4_doi *cipsov4 = NULL; + u32 type; + + audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info); + if (audit_buf != NULL) { + audit_log_format(audit_buf, " nlbl_domain=%s", + entry->domain ? entry->domain : "(default)"); + if (addr4 != NULL) { + struct netlbl_domaddr4_map *map4; + map4 = netlbl_domhsh_addr4_entry(addr4); + type = map4->type; + cipsov4 = map4->type_def.cipsov4; + netlbl_af4list_audit_addr(audit_buf, 0, NULL, + addr4->addr, addr4->mask); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + } else if (addr6 != NULL) { + struct netlbl_domaddr6_map *map6; + map6 = netlbl_domhsh_addr6_entry(addr6); + type = map6->type; + netlbl_af6list_audit_addr(audit_buf, 0, NULL, + &addr6->addr, &addr6->mask); +#endif /* IPv6 */ + } else { + type = entry->type; + cipsov4 = entry->type_def.cipsov4; + } + switch (type) { + case NETLBL_NLTYPE_UNLABELED: + audit_log_format(audit_buf, " nlbl_protocol=unlbl"); + break; + case NETLBL_NLTYPE_CIPSOV4: + BUG_ON(cipsov4 == NULL); + audit_log_format(audit_buf, + " nlbl_protocol=cipsov4 cipso_doi=%u", + cipsov4->doi); + break; + } + audit_log_format(audit_buf, " res=%u", result == 0 ? 1 : 0); + audit_log_end(audit_buf); + } +} + +/* + * Domain Hash Table Functions + */ + +/** + * netlbl_domhsh_init - Init for the domain hash + * @size: the number of bits to use for the hash buckets + * + * Description: + * Initializes the domain hash table, should be called only by + * netlbl_user_init() during initialization. Returns zero on success, non-zero + * values on error. + * + */ +int __init netlbl_domhsh_init(u32 size) +{ + u32 iter; + struct netlbl_domhsh_tbl *hsh_tbl; + + if (size == 0) + return -EINVAL; + + hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL); + if (hsh_tbl == NULL) + return -ENOMEM; + hsh_tbl->size = 1 << size; + hsh_tbl->tbl = kcalloc(hsh_tbl->size, + sizeof(struct list_head), + GFP_KERNEL); + if (hsh_tbl->tbl == NULL) { + kfree(hsh_tbl); + return -ENOMEM; + } + for (iter = 0; iter < hsh_tbl->size; iter++) + INIT_LIST_HEAD(&hsh_tbl->tbl[iter]); + + spin_lock(&netlbl_domhsh_lock); + rcu_assign_pointer(netlbl_domhsh, hsh_tbl); + spin_unlock(&netlbl_domhsh_lock); + + return 0; +} + +/** + * netlbl_domhsh_add - Adds a entry to the domain hash table + * @entry: the entry to add + * @audit_info: NetLabel audit information + * + * Description: + * Adds a new entry to the domain hash table and handles any updates to the + * lower level protocol handler (i.e. CIPSO). Returns zero on success, + * negative on failure. 
+ * + */ +int netlbl_domhsh_add(struct netlbl_dom_map *entry, + struct netlbl_audit *audit_info) +{ + int ret_val = 0; + struct netlbl_dom_map *entry_old; + struct netlbl_af4list *iter4; + struct netlbl_af4list *tmp4; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct netlbl_af6list *iter6; + struct netlbl_af6list *tmp6; +#endif /* IPv6 */ + + /* XXX - we can remove this RCU read lock as the spinlock protects the + * entire function, but before we do we need to fixup the + * netlbl_af[4,6]list RCU functions to do "the right thing" with + * respect to rcu_dereference() when only a spinlock is held. */ + rcu_read_lock(); + spin_lock(&netlbl_domhsh_lock); + if (entry->domain != NULL) + entry_old = netlbl_domhsh_search(entry->domain); + else + entry_old = netlbl_domhsh_search_def(entry->domain); + if (entry_old == NULL) { + entry->valid = 1; + + if (entry->domain != NULL) { + u32 bkt = netlbl_domhsh_hash(entry->domain); + list_add_tail_rcu(&entry->list, + &rcu_dereference(netlbl_domhsh)->tbl[bkt]); + } else { + INIT_LIST_HEAD(&entry->list); + rcu_assign_pointer(netlbl_domhsh_def, entry); + } + + if (entry->type == NETLBL_NLTYPE_ADDRSELECT) { + netlbl_af4list_foreach_rcu(iter4, + &entry->type_def.addrsel->list4) + netlbl_domhsh_audit_add(entry, iter4, NULL, + ret_val, audit_info); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + netlbl_af6list_foreach_rcu(iter6, + &entry->type_def.addrsel->list6) + netlbl_domhsh_audit_add(entry, NULL, iter6, + ret_val, audit_info); +#endif /* IPv6 */ + } else + netlbl_domhsh_audit_add(entry, NULL, NULL, + ret_val, audit_info); + } else if (entry_old->type == NETLBL_NLTYPE_ADDRSELECT && + entry->type == NETLBL_NLTYPE_ADDRSELECT) { + struct list_head *old_list4; + struct list_head *old_list6; + + old_list4 = &entry_old->type_def.addrsel->list4; + old_list6 = &entry_old->type_def.addrsel->list6; + + /* we only allow the addition of address selectors if all of + * the selectors do not exist in the existing domain map */ + netlbl_af4list_foreach_rcu(iter4, + &entry->type_def.addrsel->list4) + if (netlbl_af4list_search_exact(iter4->addr, + iter4->mask, + old_list4)) { + ret_val = -EEXIST; + goto add_return; + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + netlbl_af6list_foreach_rcu(iter6, + &entry->type_def.addrsel->list6) + if (netlbl_af6list_search_exact(&iter6->addr, + &iter6->mask, + old_list6)) { + ret_val = -EEXIST; + goto add_return; + } +#endif /* IPv6 */ + + netlbl_af4list_foreach_safe(iter4, tmp4, + &entry->type_def.addrsel->list4) { + netlbl_af4list_remove_entry(iter4); + iter4->valid = 1; + ret_val = netlbl_af4list_add(iter4, old_list4); + netlbl_domhsh_audit_add(entry_old, iter4, NULL, + ret_val, audit_info); + if (ret_val != 0) + goto add_return; + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + netlbl_af6list_foreach_safe(iter6, tmp6, + &entry->type_def.addrsel->list6) { + netlbl_af6list_remove_entry(iter6); + iter6->valid = 1; + ret_val = netlbl_af6list_add(iter6, old_list6); + netlbl_domhsh_audit_add(entry_old, NULL, iter6, + ret_val, audit_info); + if (ret_val != 0) + goto add_return; + } +#endif /* IPv6 */ + } else + ret_val = -EINVAL; + +add_return: + spin_unlock(&netlbl_domhsh_lock); + rcu_read_unlock(); + return ret_val; +} + +/** + * netlbl_domhsh_add_default - Adds the default entry to the domain hash table + * @entry: the entry to add + * @audit_info: NetLabel audit information + * + * Description: + * Adds a new default entry to the domain hash table and handles any updates + * to the lower 
level protocol handler (i.e. CIPSO). Returns zero on success, + * negative on failure. + * + */ +int netlbl_domhsh_add_default(struct netlbl_dom_map *entry, + struct netlbl_audit *audit_info) +{ + return netlbl_domhsh_add(entry, audit_info); +} + +/** + * netlbl_domhsh_remove_entry - Removes a given entry from the domain table + * @entry: the entry to remove + * @audit_info: NetLabel audit information + * + * Description: + * Removes an entry from the domain hash table and handles any updates to the + * lower level protocol handler (i.e. CIPSO). Caller is responsible for + * ensuring that the RCU read lock is held. Returns zero on success, negative + * on failure. + * + */ +int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry, + struct netlbl_audit *audit_info) +{ + int ret_val = 0; + struct audit_buffer *audit_buf; + + if (entry == NULL) + return -ENOENT; + + spin_lock(&netlbl_domhsh_lock); + if (entry->valid) { + entry->valid = 0; + if (entry != rcu_dereference(netlbl_domhsh_def)) + list_del_rcu(&entry->list); + else + rcu_assign_pointer(netlbl_domhsh_def, NULL); + } else + ret_val = -ENOENT; + spin_unlock(&netlbl_domhsh_lock); + + audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info); + if (audit_buf != NULL) { + audit_log_format(audit_buf, + " nlbl_domain=%s res=%u", + entry->domain ? entry->domain : "(default)", + ret_val == 0 ? 1 : 0); + audit_log_end(audit_buf); + } + + if (ret_val == 0) { + struct netlbl_af4list *iter4; + struct netlbl_domaddr4_map *map4; + + switch (entry->type) { + case NETLBL_NLTYPE_ADDRSELECT: + netlbl_af4list_foreach_rcu(iter4, + &entry->type_def.addrsel->list4) { + map4 = netlbl_domhsh_addr4_entry(iter4); + cipso_v4_doi_putdef(map4->type_def.cipsov4); + } + /* no need to check the IPv6 list since we currently + * support only unlabeled protocols for IPv6 */ + break; + case NETLBL_NLTYPE_CIPSOV4: + cipso_v4_doi_putdef(entry->type_def.cipsov4); + break; + } + call_rcu(&entry->rcu, netlbl_domhsh_free_entry); + } + + return ret_val; +} + +/** + * netlbl_domhsh_remove_af4 - Removes an address selector entry + * @domain: the domain + * @addr: IPv4 address + * @mask: IPv4 address mask + * @audit_info: NetLabel audit information + * + * Description: + * Removes an individual address selector from a domain mapping and potentially + * the entire mapping if it is empty. Returns zero on success, negative values + * on failure. 
+ * + */ +int netlbl_domhsh_remove_af4(const char *domain, + const struct in_addr *addr, + const struct in_addr *mask, + struct netlbl_audit *audit_info) +{ + struct netlbl_dom_map *entry_map; + struct netlbl_af4list *entry_addr; + struct netlbl_af4list *iter4; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct netlbl_af6list *iter6; +#endif /* IPv6 */ + struct netlbl_domaddr4_map *entry; + + rcu_read_lock(); + + if (domain) + entry_map = netlbl_domhsh_search(domain); + else + entry_map = netlbl_domhsh_search_def(domain); + if (entry_map == NULL || entry_map->type != NETLBL_NLTYPE_ADDRSELECT) + goto remove_af4_failure; + + spin_lock(&netlbl_domhsh_lock); + entry_addr = netlbl_af4list_remove(addr->s_addr, mask->s_addr, + &entry_map->type_def.addrsel->list4); + spin_unlock(&netlbl_domhsh_lock); + + if (entry_addr == NULL) + goto remove_af4_failure; + netlbl_af4list_foreach_rcu(iter4, &entry_map->type_def.addrsel->list4) + goto remove_af4_single_addr; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + netlbl_af6list_foreach_rcu(iter6, &entry_map->type_def.addrsel->list6) + goto remove_af4_single_addr; +#endif /* IPv6 */ + /* the domain mapping is empty so remove it from the mapping table */ + netlbl_domhsh_remove_entry(entry_map, audit_info); + +remove_af4_single_addr: + rcu_read_unlock(); + /* yick, we can't use call_rcu here because we don't have a rcu head + * pointer but hopefully this should be a rare case so the pause + * shouldn't be a problem */ + synchronize_rcu(); + entry = netlbl_domhsh_addr4_entry(entry_addr); + cipso_v4_doi_putdef(entry->type_def.cipsov4); + kfree(entry); + return 0; + +remove_af4_failure: + rcu_read_unlock(); + return -ENOENT; +} + +/** + * netlbl_domhsh_remove - Removes an entry from the domain hash table + * @domain: the domain to remove + * @audit_info: NetLabel audit information + * + * Description: + * Removes an entry from the domain hash table and handles any updates to the + * lower level protocol handler (i.e. CIPSO). Returns zero on success, + * negative on failure. + * + */ +int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info) +{ + int ret_val; + struct netlbl_dom_map *entry; + + rcu_read_lock(); + if (domain) + entry = netlbl_domhsh_search(domain); + else + entry = netlbl_domhsh_search_def(domain); + ret_val = netlbl_domhsh_remove_entry(entry, audit_info); + rcu_read_unlock(); + + return ret_val; +} + +/** + * netlbl_domhsh_remove_default - Removes the default entry from the table + * @audit_info: NetLabel audit information + * + * Description: + * Removes/resets the default entry for the domain hash table and handles any + * updates to the lower level protocol handler (i.e. CIPSO). Returns zero on + * success, non-zero on failure. + * + */ +int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info) +{ + return netlbl_domhsh_remove(NULL, audit_info); +} + +/** + * netlbl_domhsh_getentry - Get an entry from the domain hash table + * @domain: the domain name to search for + * + * Description: + * Look through the domain hash table searching for an entry to match @domain, + * return a pointer to a copy of the entry or NULL. The caller is responsible + * for ensuring that rcu_read_[un]lock() is called. 
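+ * + * If @domain has no mapping of its own, or if @domain is NULL, the default + * mapping (when one is configured) is returned instead; see + * netlbl_mgmt_listdef() for an example of looking up the default entry.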
+ * + */ +struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain) +{ + return netlbl_domhsh_search_def(domain); +} + +/** + * netlbl_domhsh_getentry_af4 - Get an entry from the domain hash table + * @domain: the domain name to search for + * @addr: the IP address to search for + * + * Description: + * Look through the domain hash table searching for an entry to match @domain + * and @addr, return a pointer to a copy of the entry or NULL. The caller is + * responsible for ensuring that rcu_read_[un]lock() is called. + * + */ +struct netlbl_domaddr4_map *netlbl_domhsh_getentry_af4(const char *domain, + __be32 addr) +{ + struct netlbl_dom_map *dom_iter; + struct netlbl_af4list *addr_iter; + + dom_iter = netlbl_domhsh_search_def(domain); + if (dom_iter == NULL) + return NULL; + if (dom_iter->type != NETLBL_NLTYPE_ADDRSELECT) + return NULL; + + addr_iter = netlbl_af4list_search(addr, + &dom_iter->type_def.addrsel->list4); + if (addr_iter == NULL) + return NULL; + + return netlbl_domhsh_addr4_entry(addr_iter); +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +/** + * netlbl_domhsh_getentry_af6 - Get an entry from the domain hash table + * @domain: the domain name to search for + * @addr: the IP address to search for + * + * Description: + * Look through the domain hash table searching for an entry to match @domain + * and @addr, return a pointer to a copy of the entry or NULL. The caller is + * responsible for ensuring that rcu_read_[un]lock() is called. + * + */ +struct netlbl_domaddr6_map *netlbl_domhsh_getentry_af6(const char *domain, + const struct in6_addr *addr) +{ + struct netlbl_dom_map *dom_iter; + struct netlbl_af6list *addr_iter; + + dom_iter = netlbl_domhsh_search_def(domain); + if (dom_iter == NULL) + return NULL; + if (dom_iter->type != NETLBL_NLTYPE_ADDRSELECT) + return NULL; + + addr_iter = netlbl_af6list_search(addr, + &dom_iter->type_def.addrsel->list6); + if (addr_iter == NULL) + return NULL; + + return netlbl_domhsh_addr6_entry(addr_iter); +} +#endif /* IPv6 */ + +/** + * netlbl_domhsh_walk - Iterate through the domain mapping hash table + * @skip_bkt: the number of buckets to skip at the start + * @skip_chain: the number of entries to skip in the first iterated bucket + * @callback: callback for each entry + * @cb_arg: argument for the callback function + * + * Description: + * Interate over the domain mapping hash table, skipping the first @skip_bkt + * buckets and @skip_chain entries. For each entry in the table call + * @callback, if @callback returns a negative value stop 'walking' through the + * table and return. Updates the values in @skip_bkt and @skip_chain on + * return. Returns zero on success, negative values on failure. 
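+ * + * The updated skip values allow an interrupted walk to be resumed later, + * which is how the netlink dump handlers (e.g. netlbl_mgmt_listall()) + * continue a dump across multiple callbacks.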
+ * + */ +int netlbl_domhsh_walk(u32 *skip_bkt, + u32 *skip_chain, + int (*callback) (struct netlbl_dom_map *entry, void *arg), + void *cb_arg) +{ + int ret_val = -ENOENT; + u32 iter_bkt; + struct list_head *iter_list; + struct netlbl_dom_map *iter_entry; + u32 chain_cnt = 0; + + rcu_read_lock(); + for (iter_bkt = *skip_bkt; + iter_bkt < rcu_dereference(netlbl_domhsh)->size; + iter_bkt++, chain_cnt = 0) { + iter_list = &rcu_dereference(netlbl_domhsh)->tbl[iter_bkt]; + list_for_each_entry_rcu(iter_entry, iter_list, list) + if (iter_entry->valid) { + if (chain_cnt++ < *skip_chain) + continue; + ret_val = callback(iter_entry, cb_arg); + if (ret_val < 0) { + chain_cnt--; + goto walk_return; + } + } + } + +walk_return: + rcu_read_unlock(); + *skip_bkt = iter_bkt; + *skip_chain = chain_cnt; + return ret_val; +} diff --git a/net/netlabel/netlabel_domainhash.h b/net/netlabel/netlabel_domainhash.h new file mode 100644 index 00000000..0261dda3 --- /dev/null +++ b/net/netlabel/netlabel_domainhash.h @@ -0,0 +1,112 @@ +/* + * NetLabel Domain Hash Table + * + * This file manages the domain hash table that NetLabel uses to determine + * which network labeling protocol to use for a given domain. The NetLabel + * system manages static and dynamic label mappings for network protocols such + * as CIPSO and RIPSO. + * + * Author: Paul Moore + * + */ + +/* + * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#ifndef _NETLABEL_DOMAINHASH_H +#define _NETLABEL_DOMAINHASH_H + +#include +#include +#include + +#include "netlabel_addrlist.h" + +/* Domain hash table size */ +/* XXX - currently this number is an uneducated guess */ +#define NETLBL_DOMHSH_BITSIZE 7 + +/* Domain mapping definition structures */ +#define netlbl_domhsh_addr4_entry(iter) \ + container_of(iter, struct netlbl_domaddr4_map, list) +struct netlbl_domaddr4_map { + u32 type; + union { + struct cipso_v4_doi *cipsov4; + } type_def; + + struct netlbl_af4list list; +}; +#define netlbl_domhsh_addr6_entry(iter) \ + container_of(iter, struct netlbl_domaddr6_map, list) +struct netlbl_domaddr6_map { + u32 type; + + /* NOTE: no 'type_def' union needed at present since we don't currently + * support any IPv6 labeling protocols */ + + struct netlbl_af6list list; +}; +struct netlbl_domaddr_map { + struct list_head list4; + struct list_head list6; +}; +struct netlbl_dom_map { + char *domain; + u32 type; + union { + struct cipso_v4_doi *cipsov4; + struct netlbl_domaddr_map *addrsel; + } type_def; + + u32 valid; + struct list_head list; + struct rcu_head rcu; +}; + +/* init function */ +int netlbl_domhsh_init(u32 size); + +/* Manipulate the domain hash table */ +int netlbl_domhsh_add(struct netlbl_dom_map *entry, + struct netlbl_audit *audit_info); +int netlbl_domhsh_add_default(struct netlbl_dom_map *entry, + struct netlbl_audit *audit_info); +int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry, + struct netlbl_audit *audit_info); +int netlbl_domhsh_remove_af4(const char *domain, + const struct in_addr *addr, + const struct in_addr *mask, + struct netlbl_audit *audit_info); +int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info); +int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info); +struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain); +struct netlbl_domaddr4_map *netlbl_domhsh_getentry_af4(const char *domain, + __be32 addr); +int netlbl_domhsh_walk(u32 *skip_bkt, + u32 *skip_chain, + int (*callback) (struct netlbl_dom_map *entry, void *arg), + void *cb_arg); + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +struct netlbl_domaddr6_map *netlbl_domhsh_getentry_af6(const char *domain, + const struct in6_addr *addr); +#endif /* IPv6 */ + +#endif diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c new file mode 100644 index 00000000..1b83e000 --- /dev/null +++ b/net/netlabel/netlabel_kapi.c @@ -0,0 +1,1090 @@ +/* + * NetLabel Kernel API + * + * This file defines the kernel API for the NetLabel system. The NetLabel + * system manages static and dynamic label mappings for network protocols such + * as CIPSO and RIPSO. + * + * Author: Paul Moore + * + */ + +/* + * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "netlabel_domainhash.h" +#include "netlabel_unlabeled.h" +#include "netlabel_cipso_v4.h" +#include "netlabel_user.h" +#include "netlabel_mgmt.h" +#include "netlabel_addrlist.h" + +/* + * Configuration Functions + */ + +/** + * netlbl_cfg_map_del - Remove a NetLabel/LSM domain mapping + * @domain: the domain mapping to remove + * @family: address family + * @addr: IP address + * @mask: IP address mask + * @audit_info: NetLabel audit information + * + * Description: + * Removes a NetLabel/LSM domain mapping. A @domain value of NULL causes the + * default domain mapping to be removed. Returns zero on success, negative + * values on failure. + * + */ +int netlbl_cfg_map_del(const char *domain, + u16 family, + const void *addr, + const void *mask, + struct netlbl_audit *audit_info) +{ + if (addr == NULL && mask == NULL) { + return netlbl_domhsh_remove(domain, audit_info); + } else if (addr != NULL && mask != NULL) { + switch (family) { + case AF_INET: + return netlbl_domhsh_remove_af4(domain, addr, mask, + audit_info); + default: + return -EPFNOSUPPORT; + } + } else + return -EINVAL; +} + +/** + * netlbl_cfg_unlbl_map_add - Add a new unlabeled mapping + * @domain: the domain mapping to add + * @family: address family + * @addr: IP address + * @mask: IP address mask + * @audit_info: NetLabel audit information + * + * Description: + * Adds a new unlabeled NetLabel/LSM domain mapping. A @domain value of NULL + * causes a new default domain mapping to be added. Returns zero on success, + * negative values on failure. 
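+ * + * @addr and @mask must be supplied together or not at all; when present + * they are interpreted as struct in_addr (AF_INET) or struct in6_addr + * (AF_INET6) according to @family, and supplying only one of the two + * returns -EINVAL.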
+ * + */ +int netlbl_cfg_unlbl_map_add(const char *domain, + u16 family, + const void *addr, + const void *mask, + struct netlbl_audit *audit_info) +{ + int ret_val = -ENOMEM; + struct netlbl_dom_map *entry; + struct netlbl_domaddr_map *addrmap = NULL; + struct netlbl_domaddr4_map *map4 = NULL; + struct netlbl_domaddr6_map *map6 = NULL; + const struct in_addr *addr4, *mask4; + const struct in6_addr *addr6, *mask6; + + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) + return -ENOMEM; + if (domain != NULL) { + entry->domain = kstrdup(domain, GFP_ATOMIC); + if (entry->domain == NULL) + goto cfg_unlbl_map_add_failure; + } + + if (addr == NULL && mask == NULL) + entry->type = NETLBL_NLTYPE_UNLABELED; + else if (addr != NULL && mask != NULL) { + addrmap = kzalloc(sizeof(*addrmap), GFP_ATOMIC); + if (addrmap == NULL) + goto cfg_unlbl_map_add_failure; + INIT_LIST_HEAD(&addrmap->list4); + INIT_LIST_HEAD(&addrmap->list6); + + switch (family) { + case AF_INET: + addr4 = addr; + mask4 = mask; + map4 = kzalloc(sizeof(*map4), GFP_ATOMIC); + if (map4 == NULL) + goto cfg_unlbl_map_add_failure; + map4->type = NETLBL_NLTYPE_UNLABELED; + map4->list.addr = addr4->s_addr & mask4->s_addr; + map4->list.mask = mask4->s_addr; + map4->list.valid = 1; + ret_val = netlbl_af4list_add(&map4->list, + &addrmap->list4); + if (ret_val != 0) + goto cfg_unlbl_map_add_failure; + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + addr6 = addr; + mask6 = mask; + map6 = kzalloc(sizeof(*map6), GFP_ATOMIC); + if (map6 == NULL) + goto cfg_unlbl_map_add_failure; + map6->type = NETLBL_NLTYPE_UNLABELED; + ipv6_addr_copy(&map6->list.addr, addr6); + map6->list.addr.s6_addr32[0] &= mask6->s6_addr32[0]; + map6->list.addr.s6_addr32[1] &= mask6->s6_addr32[1]; + map6->list.addr.s6_addr32[2] &= mask6->s6_addr32[2]; + map6->list.addr.s6_addr32[3] &= mask6->s6_addr32[3]; + ipv6_addr_copy(&map6->list.mask, mask6); + map6->list.valid = 1; + ret_val = netlbl_af6list_add(&map6->list, + &addrmap->list6); + if (ret_val != 0) + goto cfg_unlbl_map_add_failure; + break; +#endif /* IPv6 */ + default: + goto cfg_unlbl_map_add_failure; + break; + } + + entry->type_def.addrsel = addrmap; + entry->type = NETLBL_NLTYPE_ADDRSELECT; + } else { + ret_val = -EINVAL; + goto cfg_unlbl_map_add_failure; + } + + ret_val = netlbl_domhsh_add(entry, audit_info); + if (ret_val != 0) + goto cfg_unlbl_map_add_failure; + + return 0; + +cfg_unlbl_map_add_failure: + kfree(entry->domain); + kfree(entry); + kfree(addrmap); + kfree(map4); + kfree(map6); + return ret_val; +} + + +/** + * netlbl_cfg_unlbl_static_add - Adds a new static label + * @net: network namespace + * @dev_name: interface name + * @addr: IP address in network byte order (struct in[6]_addr) + * @mask: address mask in network byte order (struct in[6]_addr) + * @family: address family + * @secid: LSM secid value for the entry + * @audit_info: NetLabel audit information + * + * Description: + * Adds a new NetLabel static label to be used when protocol provided labels + * are not present on incoming traffic. If @dev_name is NULL then the default + * interface will be used. Returns zero on success, negative values on failure. 
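+ * + * The size of @addr and @mask is derived from @family (struct in_addr for + * AF_INET, struct in6_addr for AF_INET6); any other address family returns + * -EPFNOSUPPORT.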
+ * + */ +int netlbl_cfg_unlbl_static_add(struct net *net, + const char *dev_name, + const void *addr, + const void *mask, + u16 family, + u32 secid, + struct netlbl_audit *audit_info) +{ + u32 addr_len; + + switch (family) { + case AF_INET: + addr_len = sizeof(struct in_addr); + break; + case AF_INET6: + addr_len = sizeof(struct in6_addr); + break; + default: + return -EPFNOSUPPORT; + } + + return netlbl_unlhsh_add(net, + dev_name, addr, mask, addr_len, + secid, audit_info); +} + +/** + * netlbl_cfg_unlbl_static_del - Removes an existing static label + * @net: network namespace + * @dev_name: interface name + * @addr: IP address in network byte order (struct in[6]_addr) + * @mask: address mask in network byte order (struct in[6]_addr) + * @family: address family + * @secid: LSM secid value for the entry + * @audit_info: NetLabel audit information + * + * Description: + * Removes an existing NetLabel static label used when protocol provided labels + * are not present on incoming traffic. If @dev_name is NULL then the default + * interface will be used. Returns zero on success, negative values on failure. + * + */ +int netlbl_cfg_unlbl_static_del(struct net *net, + const char *dev_name, + const void *addr, + const void *mask, + u16 family, + struct netlbl_audit *audit_info) +{ + u32 addr_len; + + switch (family) { + case AF_INET: + addr_len = sizeof(struct in_addr); + break; + case AF_INET6: + addr_len = sizeof(struct in6_addr); + break; + default: + return -EPFNOSUPPORT; + } + + return netlbl_unlhsh_remove(net, + dev_name, addr, mask, addr_len, + audit_info); +} + +/** + * netlbl_cfg_cipsov4_add - Add a new CIPSOv4 DOI definition + * @doi_def: CIPSO DOI definition + * @audit_info: NetLabel audit information + * + * Description: + * Add a new CIPSO DOI definition as defined by @doi_def. Returns zero on + * success and negative values on failure. + * + */ +int netlbl_cfg_cipsov4_add(struct cipso_v4_doi *doi_def, + struct netlbl_audit *audit_info) +{ + return cipso_v4_doi_add(doi_def, audit_info); +} + +/** + * netlbl_cfg_cipsov4_del - Remove an existing CIPSOv4 DOI definition + * @doi: CIPSO DOI + * @audit_info: NetLabel audit information + * + * Description: + * Remove an existing CIPSO DOI definition matching @doi. Returns zero on + * success and negative values on failure. + * + */ +void netlbl_cfg_cipsov4_del(u32 doi, struct netlbl_audit *audit_info) +{ + cipso_v4_doi_remove(doi, audit_info); +} + +/** + * netlbl_cfg_cipsov4_map_add - Add a new CIPSOv4 DOI mapping + * @doi: the CIPSO DOI + * @domain: the domain mapping to add + * @addr: IP address + * @mask: IP address mask + * @audit_info: NetLabel audit information + * + * Description: + * Add a new NetLabel/LSM domain mapping for the given CIPSO DOI to the NetLabel + * subsystem. A @domain value of NULL adds a new default domain mapping. + * Returns zero on success, negative values on failure. 
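+ * + * The DOI must already be known to the CIPSOv4 engine, e.g. added via + * netlbl_cfg_cipsov4_add(), otherwise -ENOENT is returned; the DOI reference + * taken here is dropped again if the mapping cannot be added.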
+ * + */ +int netlbl_cfg_cipsov4_map_add(u32 doi, + const char *domain, + const struct in_addr *addr, + const struct in_addr *mask, + struct netlbl_audit *audit_info) +{ + int ret_val = -ENOMEM; + struct cipso_v4_doi *doi_def; + struct netlbl_dom_map *entry; + struct netlbl_domaddr_map *addrmap = NULL; + struct netlbl_domaddr4_map *addrinfo = NULL; + + doi_def = cipso_v4_doi_getdef(doi); + if (doi_def == NULL) + return -ENOENT; + + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) + return -ENOMEM; + if (domain != NULL) { + entry->domain = kstrdup(domain, GFP_ATOMIC); + if (entry->domain == NULL) + goto cfg_cipsov4_map_add_failure; + } + + if (addr == NULL && mask == NULL) { + entry->type_def.cipsov4 = doi_def; + entry->type = NETLBL_NLTYPE_CIPSOV4; + } else if (addr != NULL && mask != NULL) { + addrmap = kzalloc(sizeof(*addrmap), GFP_ATOMIC); + if (addrmap == NULL) + goto cfg_cipsov4_map_add_failure; + INIT_LIST_HEAD(&addrmap->list4); + INIT_LIST_HEAD(&addrmap->list6); + + addrinfo = kzalloc(sizeof(*addrinfo), GFP_ATOMIC); + if (addrinfo == NULL) + goto cfg_cipsov4_map_add_failure; + addrinfo->type_def.cipsov4 = doi_def; + addrinfo->type = NETLBL_NLTYPE_CIPSOV4; + addrinfo->list.addr = addr->s_addr & mask->s_addr; + addrinfo->list.mask = mask->s_addr; + addrinfo->list.valid = 1; + ret_val = netlbl_af4list_add(&addrinfo->list, &addrmap->list4); + if (ret_val != 0) + goto cfg_cipsov4_map_add_failure; + + entry->type_def.addrsel = addrmap; + entry->type = NETLBL_NLTYPE_ADDRSELECT; + } else { + ret_val = -EINVAL; + goto cfg_cipsov4_map_add_failure; + } + + ret_val = netlbl_domhsh_add(entry, audit_info); + if (ret_val != 0) + goto cfg_cipsov4_map_add_failure; + + return 0; + +cfg_cipsov4_map_add_failure: + cipso_v4_doi_putdef(doi_def); + kfree(entry->domain); + kfree(entry); + kfree(addrmap); + kfree(addrinfo); + return ret_val; +} + +/* + * Security Attribute Functions + */ + +/** + * netlbl_secattr_catmap_walk - Walk a LSM secattr catmap looking for a bit + * @catmap: the category bitmap + * @offset: the offset to start searching at, in bits + * + * Description: + * This function walks a LSM secattr category bitmap starting at @offset and + * returns the spot of the first set bit or -ENOENT if no bits are set. 
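+ * + * For example, with only categories 5 and 7 set, a walk starting at offset 0 + * returns 5 and a walk starting at offset 6 returns 7.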
+ * + */ +int netlbl_secattr_catmap_walk(struct netlbl_lsm_secattr_catmap *catmap, + u32 offset) +{ + struct netlbl_lsm_secattr_catmap *iter = catmap; + u32 node_idx; + u32 node_bit; + NETLBL_CATMAP_MAPTYPE bitmap; + + if (offset > iter->startbit) { + while (offset >= (iter->startbit + NETLBL_CATMAP_SIZE)) { + iter = iter->next; + if (iter == NULL) + return -ENOENT; + } + node_idx = (offset - iter->startbit) / NETLBL_CATMAP_MAPSIZE; + node_bit = offset - iter->startbit - + (NETLBL_CATMAP_MAPSIZE * node_idx); + } else { + node_idx = 0; + node_bit = 0; + } + bitmap = iter->bitmap[node_idx] >> node_bit; + + for (;;) { + if (bitmap != 0) { + while ((bitmap & NETLBL_CATMAP_BIT) == 0) { + bitmap >>= 1; + node_bit++; + } + return iter->startbit + + (NETLBL_CATMAP_MAPSIZE * node_idx) + node_bit; + } + if (++node_idx >= NETLBL_CATMAP_MAPCNT) { + if (iter->next != NULL) { + iter = iter->next; + node_idx = 0; + } else + return -ENOENT; + } + bitmap = iter->bitmap[node_idx]; + node_bit = 0; + } + + return -ENOENT; +} + +/** + * netlbl_secattr_catmap_walk_rng - Find the end of a string of set bits + * @catmap: the category bitmap + * @offset: the offset to start searching at, in bits + * + * Description: + * This function walks a LSM secattr category bitmap starting at @offset and + * returns the spot of the first cleared bit or -ENOENT if the offset is past + * the end of the bitmap. + * + */ +int netlbl_secattr_catmap_walk_rng(struct netlbl_lsm_secattr_catmap *catmap, + u32 offset) +{ + struct netlbl_lsm_secattr_catmap *iter = catmap; + u32 node_idx; + u32 node_bit; + NETLBL_CATMAP_MAPTYPE bitmask; + NETLBL_CATMAP_MAPTYPE bitmap; + + if (offset > iter->startbit) { + while (offset >= (iter->startbit + NETLBL_CATMAP_SIZE)) { + iter = iter->next; + if (iter == NULL) + return -ENOENT; + } + node_idx = (offset - iter->startbit) / NETLBL_CATMAP_MAPSIZE; + node_bit = offset - iter->startbit - + (NETLBL_CATMAP_MAPSIZE * node_idx); + } else { + node_idx = 0; + node_bit = 0; + } + bitmask = NETLBL_CATMAP_BIT << node_bit; + + for (;;) { + bitmap = iter->bitmap[node_idx]; + while (bitmask != 0 && (bitmap & bitmask) != 0) { + bitmask <<= 1; + node_bit++; + } + + if (bitmask != 0) + return iter->startbit + + (NETLBL_CATMAP_MAPSIZE * node_idx) + + node_bit - 1; + else if (++node_idx >= NETLBL_CATMAP_MAPCNT) { + if (iter->next == NULL) + return iter->startbit + NETLBL_CATMAP_SIZE - 1; + iter = iter->next; + node_idx = 0; + } + bitmask = NETLBL_CATMAP_BIT; + node_bit = 0; + } + + return -ENOENT; +} + +/** + * netlbl_secattr_catmap_setbit - Set a bit in a LSM secattr catmap + * @catmap: the category bitmap + * @bit: the bit to set + * @flags: memory allocation flags + * + * Description: + * Set the bit specified by @bit in @catmap. Returns zero on success, + * negative values on failure. 
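+ * + * The catmap is a linked list of fixed-size bitmap nodes; if @bit lies beyond + * the last node a new node is allocated using @flags, so the call can fail + * with -ENOMEM.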
+ * + */ +int netlbl_secattr_catmap_setbit(struct netlbl_lsm_secattr_catmap *catmap, + u32 bit, + gfp_t flags) +{ + struct netlbl_lsm_secattr_catmap *iter = catmap; + u32 node_bit; + u32 node_idx; + + while (iter->next != NULL && + bit >= (iter->startbit + NETLBL_CATMAP_SIZE)) + iter = iter->next; + if (bit >= (iter->startbit + NETLBL_CATMAP_SIZE)) { + iter->next = netlbl_secattr_catmap_alloc(flags); + if (iter->next == NULL) + return -ENOMEM; + iter = iter->next; + iter->startbit = bit & ~(NETLBL_CATMAP_SIZE - 1); + } + + /* gcc always rounds to zero when doing integer division */ + node_idx = (bit - iter->startbit) / NETLBL_CATMAP_MAPSIZE; + node_bit = bit - iter->startbit - (NETLBL_CATMAP_MAPSIZE * node_idx); + iter->bitmap[node_idx] |= NETLBL_CATMAP_BIT << node_bit; + + return 0; +} + +/** + * netlbl_secattr_catmap_setrng - Set a range of bits in a LSM secattr catmap + * @catmap: the category bitmap + * @start: the starting bit + * @end: the last bit in the string + * @flags: memory allocation flags + * + * Description: + * Set a range of bits, starting at @start and ending with @end. Returns zero + * on success, negative values on failure. + * + */ +int netlbl_secattr_catmap_setrng(struct netlbl_lsm_secattr_catmap *catmap, + u32 start, + u32 end, + gfp_t flags) +{ + int ret_val = 0; + struct netlbl_lsm_secattr_catmap *iter = catmap; + u32 iter_max_spot; + u32 spot; + + /* XXX - This could probably be made a bit faster by combining writes + * to the catmap instead of setting a single bit each time, but for + * right now skipping to the start of the range in the catmap should + * be a nice improvement over calling the individual setbit function + * repeatedly from a loop. */ + + while (iter->next != NULL && + start >= (iter->startbit + NETLBL_CATMAP_SIZE)) + iter = iter->next; + iter_max_spot = iter->startbit + NETLBL_CATMAP_SIZE; + + for (spot = start; spot <= end && ret_val == 0; spot++) { + if (spot >= iter_max_spot && iter->next != NULL) { + iter = iter->next; + iter_max_spot = iter->startbit + NETLBL_CATMAP_SIZE; + } + ret_val = netlbl_secattr_catmap_setbit(iter, spot, GFP_ATOMIC); + } + + return ret_val; +} + +/* + * LSM Functions + */ + +/** + * netlbl_enabled - Determine if the NetLabel subsystem is enabled + * + * Description: + * The LSM can use this function to determine if it should use NetLabel + * security attributes in it's enforcement mechanism. Currently, NetLabel is + * considered to be enabled when it's configuration contains a valid setup for + * at least one labeled protocol (i.e. NetLabel can understand incoming + * labeled packets of at least one type); otherwise NetLabel is considered to + * be disabled. + * + */ +int netlbl_enabled(void) +{ + /* At some point we probably want to expose this mechanism to the user + * as well so that admins can toggle NetLabel regardless of the + * configuration */ + return (atomic_read(&netlabel_mgmt_protocount) > 0); +} + +/** + * netlbl_sock_setattr - Label a socket using the correct protocol + * @sk: the socket to label + * @family: protocol family + * @secattr: the security attributes + * + * Description: + * Attach the correct label to the given socket using the security attributes + * specified in @secattr. This function requires exclusive access to @sk, + * which means it either needs to be in the process of being created or locked. + * Returns zero on success, -EDESTADDRREQ if the domain is configured to use + * network address selectors (can't blindly label the socket), and negative + * values on all other failures. 
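+ * + * A caller that receives -EDESTADDRREQ typically falls back to labeling at + * connection or request time, e.g. via netlbl_conn_setattr() or + * netlbl_req_setattr(), once the peer address is known.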
+ * + */ +int netlbl_sock_setattr(struct sock *sk, + u16 family, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + struct netlbl_dom_map *dom_entry; + + rcu_read_lock(); + dom_entry = netlbl_domhsh_getentry(secattr->domain); + if (dom_entry == NULL) { + ret_val = -ENOENT; + goto socket_setattr_return; + } + switch (family) { + case AF_INET: + switch (dom_entry->type) { + case NETLBL_NLTYPE_ADDRSELECT: + ret_val = -EDESTADDRREQ; + break; + case NETLBL_NLTYPE_CIPSOV4: + ret_val = cipso_v4_sock_setattr(sk, + dom_entry->type_def.cipsov4, + secattr); + break; + case NETLBL_NLTYPE_UNLABELED: + ret_val = 0; + break; + default: + ret_val = -ENOENT; + } + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + /* since we don't support any IPv6 labeling protocols right + * now we can optimize everything away until we do */ + ret_val = 0; + break; +#endif /* IPv6 */ + default: + ret_val = -EPROTONOSUPPORT; + } + +socket_setattr_return: + rcu_read_unlock(); + return ret_val; +} + +/** + * netlbl_sock_delattr - Delete all the NetLabel labels on a socket + * @sk: the socket + * + * Description: + * Remove all the NetLabel labeling from @sk. The caller is responsible for + * ensuring that @sk is locked. + * + */ +void netlbl_sock_delattr(struct sock *sk) +{ + cipso_v4_sock_delattr(sk); +} + +/** + * netlbl_sock_getattr - Determine the security attributes of a sock + * @sk: the sock + * @secattr: the security attributes + * + * Description: + * Examines the given sock to see if any NetLabel style labeling has been + * applied to the sock, if so it parses the socket label and returns the + * security attributes in @secattr. Returns zero on success, negative values + * on failure. + * + */ +int netlbl_sock_getattr(struct sock *sk, + struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + + switch (sk->sk_family) { + case AF_INET: + ret_val = cipso_v4_sock_getattr(sk, secattr); + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + ret_val = -ENOMSG; + break; +#endif /* IPv6 */ + default: + ret_val = -EPROTONOSUPPORT; + } + + return ret_val; +} + +/** + * netlbl_conn_setattr - Label a connected socket using the correct protocol + * @sk: the socket to label + * @addr: the destination address + * @secattr: the security attributes + * + * Description: + * Attach the correct label to the given connected socket using the security + * attributes specified in @secattr. The caller is responsible for ensuring + * that @sk is locked. Returns zero on success, negative values on failure. 
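+ * + * If the matching mapping is NETLBL_NLTYPE_UNLABELED any CIPSOv4 option + * already present on the socket is simply removed.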
+ * + */ +int netlbl_conn_setattr(struct sock *sk, + struct sockaddr *addr, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + struct sockaddr_in *addr4; + struct netlbl_domaddr4_map *af4_entry; + + rcu_read_lock(); + switch (addr->sa_family) { + case AF_INET: + addr4 = (struct sockaddr_in *)addr; + af4_entry = netlbl_domhsh_getentry_af4(secattr->domain, + addr4->sin_addr.s_addr); + if (af4_entry == NULL) { + ret_val = -ENOENT; + goto conn_setattr_return; + } + switch (af4_entry->type) { + case NETLBL_NLTYPE_CIPSOV4: + ret_val = cipso_v4_sock_setattr(sk, + af4_entry->type_def.cipsov4, + secattr); + break; + case NETLBL_NLTYPE_UNLABELED: + /* just delete the protocols we support for right now + * but we could remove other protocols if needed */ + cipso_v4_sock_delattr(sk); + ret_val = 0; + break; + default: + ret_val = -ENOENT; + } + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + /* since we don't support any IPv6 labeling protocols right + * now we can optimize everything away until we do */ + ret_val = 0; + break; +#endif /* IPv6 */ + default: + ret_val = -EPROTONOSUPPORT; + } + +conn_setattr_return: + rcu_read_unlock(); + return ret_val; +} + +/** + * netlbl_req_setattr - Label a request socket using the correct protocol + * @req: the request socket to label + * @secattr: the security attributes + * + * Description: + * Attach the correct label to the given socket using the security attributes + * specified in @secattr. Returns zero on success, negative values on failure. + * + */ +int netlbl_req_setattr(struct request_sock *req, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + struct netlbl_dom_map *dom_entry; + struct netlbl_domaddr4_map *af4_entry; + u32 proto_type; + struct cipso_v4_doi *proto_cv4; + + rcu_read_lock(); + dom_entry = netlbl_domhsh_getentry(secattr->domain); + if (dom_entry == NULL) { + ret_val = -ENOENT; + goto req_setattr_return; + } + switch (req->rsk_ops->family) { + case AF_INET: + if (dom_entry->type == NETLBL_NLTYPE_ADDRSELECT) { + struct inet_request_sock *req_inet = inet_rsk(req); + af4_entry = netlbl_domhsh_getentry_af4(secattr->domain, + req_inet->rmt_addr); + if (af4_entry == NULL) { + ret_val = -ENOENT; + goto req_setattr_return; + } + proto_type = af4_entry->type; + proto_cv4 = af4_entry->type_def.cipsov4; + } else { + proto_type = dom_entry->type; + proto_cv4 = dom_entry->type_def.cipsov4; + } + switch (proto_type) { + case NETLBL_NLTYPE_CIPSOV4: + ret_val = cipso_v4_req_setattr(req, proto_cv4, secattr); + break; + case NETLBL_NLTYPE_UNLABELED: + /* just delete the protocols we support for right now + * but we could remove other protocols if needed */ + cipso_v4_req_delattr(req); + ret_val = 0; + break; + default: + ret_val = -ENOENT; + } + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + /* since we don't support any IPv6 labeling protocols right + * now we can optimize everything away until we do */ + ret_val = 0; + break; +#endif /* IPv6 */ + default: + ret_val = -EPROTONOSUPPORT; + } + +req_setattr_return: + rcu_read_unlock(); + return ret_val; +} + +/** +* netlbl_req_delattr - Delete all the NetLabel labels on a socket +* @req: the socket +* +* Description: +* Remove all the NetLabel labeling from @req. 
+* +*/ +void netlbl_req_delattr(struct request_sock *req) +{ + cipso_v4_req_delattr(req); +} + +/** + * netlbl_skbuff_setattr - Label a packet using the correct protocol + * @skb: the packet + * @family: protocol family + * @secattr: the security attributes + * + * Description: + * Attach the correct label to the given packet using the security attributes + * specified in @secattr. Returns zero on success, negative values on failure. + * + */ +int netlbl_skbuff_setattr(struct sk_buff *skb, + u16 family, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + struct iphdr *hdr4; + struct netlbl_domaddr4_map *af4_entry; + + rcu_read_lock(); + switch (family) { + case AF_INET: + hdr4 = ip_hdr(skb); + af4_entry = netlbl_domhsh_getentry_af4(secattr->domain, + hdr4->daddr); + if (af4_entry == NULL) { + ret_val = -ENOENT; + goto skbuff_setattr_return; + } + switch (af4_entry->type) { + case NETLBL_NLTYPE_CIPSOV4: + ret_val = cipso_v4_skbuff_setattr(skb, + af4_entry->type_def.cipsov4, + secattr); + break; + case NETLBL_NLTYPE_UNLABELED: + /* just delete the protocols we support for right now + * but we could remove other protocols if needed */ + ret_val = cipso_v4_skbuff_delattr(skb); + break; + default: + ret_val = -ENOENT; + } + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + /* since we don't support any IPv6 labeling protocols right + * now we can optimize everything away until we do */ + ret_val = 0; + break; +#endif /* IPv6 */ + default: + ret_val = -EPROTONOSUPPORT; + } + +skbuff_setattr_return: + rcu_read_unlock(); + return ret_val; +} + +/** + * netlbl_skbuff_getattr - Determine the security attributes of a packet + * @skb: the packet + * @family: protocol family + * @secattr: the security attributes + * + * Description: + * Examines the given packet to see if a recognized form of packet labeling + * is present, if so it parses the packet label and returns the security + * attributes in @secattr. Returns zero on success, negative values on + * failure. + * + */ +int netlbl_skbuff_getattr(const struct sk_buff *skb, + u16 family, + struct netlbl_lsm_secattr *secattr) +{ + switch (family) { + case AF_INET: + if (CIPSO_V4_OPTEXIST(skb) && + cipso_v4_skbuff_getattr(skb, secattr) == 0) + return 0; + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + break; +#endif /* IPv6 */ + } + + return netlbl_unlabel_getattr(skb, family, secattr); +} + +/** + * netlbl_skbuff_err - Handle a LSM error on a sk_buff + * @skb: the packet + * @error: the error code + * @gateway: true if host is acting as a gateway, false otherwise + * + * Description: + * Deal with a LSM problem when handling the packet in @skb, typically this is + * a permission denied problem (-EACCES). The correct action is determined + * according to the packet's labeling protocol. + * + */ +void netlbl_skbuff_err(struct sk_buff *skb, int error, int gateway) +{ + if (CIPSO_V4_OPTEXIST(skb)) + cipso_v4_error(skb, error, gateway); +} + +/** + * netlbl_cache_invalidate - Invalidate all of the NetLabel protocol caches + * + * Description: + * For all of the NetLabel protocols that support some form of label mapping + * cache, invalidate the cache. Returns zero on success, negative values on + * error. 
+ * + */ +void netlbl_cache_invalidate(void) +{ + cipso_v4_cache_invalidate(); +} + +/** + * netlbl_cache_add - Add an entry to a NetLabel protocol cache + * @skb: the packet + * @secattr: the packet's security attributes + * + * Description: + * Add the LSM security attributes for the given packet to the underlying + * NetLabel protocol's label mapping cache. Returns zero on success, negative + * values on error. + * + */ +int netlbl_cache_add(const struct sk_buff *skb, + const struct netlbl_lsm_secattr *secattr) +{ + if ((secattr->flags & NETLBL_SECATTR_CACHE) == 0) + return -ENOMSG; + + if (CIPSO_V4_OPTEXIST(skb)) + return cipso_v4_cache_add(skb, secattr); + + return -ENOMSG; +} + +/* + * Protocol Engine Functions + */ + +/** + * netlbl_audit_start - Start an audit message + * @type: audit message type + * @audit_info: NetLabel audit information + * + * Description: + * Start an audit message using the type specified in @type and fill the audit + * message with some fields common to all NetLabel audit messages. This + * function should only be used by protocol engines, not LSMs. Returns a + * pointer to the audit buffer on success, NULL on failure. + * + */ +struct audit_buffer *netlbl_audit_start(int type, + struct netlbl_audit *audit_info) +{ + return netlbl_audit_start_common(type, audit_info); +} + +/* + * Setup Functions + */ + +/** + * netlbl_init - Initialize NetLabel + * + * Description: + * Perform the required NetLabel initialization before first use. + * + */ +static int __init netlbl_init(void) +{ + int ret_val; + + printk(KERN_INFO "NetLabel: Initializing\n"); + printk(KERN_INFO "NetLabel: domain hash size = %u\n", + (1 << NETLBL_DOMHSH_BITSIZE)); + printk(KERN_INFO "NetLabel: protocols =" + " UNLABELED" + " CIPSOv4" + "\n"); + + ret_val = netlbl_domhsh_init(NETLBL_DOMHSH_BITSIZE); + if (ret_val != 0) + goto init_failure; + + ret_val = netlbl_unlabel_init(NETLBL_UNLHSH_BITSIZE); + if (ret_val != 0) + goto init_failure; + + ret_val = netlbl_netlink_init(); + if (ret_val != 0) + goto init_failure; + + ret_val = netlbl_unlabel_defconf(); + if (ret_val != 0) + goto init_failure; + printk(KERN_INFO "NetLabel: unlabeled traffic allowed by default\n"); + + return 0; + +init_failure: + panic("NetLabel: failed to initialize properly (%d)\n", ret_val); +} + +subsys_initcall(netlbl_init); diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c new file mode 100644 index 00000000..4f251b19 --- /dev/null +++ b/net/netlabel/netlabel_mgmt.c @@ -0,0 +1,785 @@ +/* + * NetLabel Management Support + * + * This file defines the management functions for the NetLabel system. The + * NetLabel system manages static and dynamic label mappings for network + * protocols such as CIPSO and RIPSO. + * + * Author: Paul Moore + * + */ + +/* + * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "netlabel_domainhash.h" +#include "netlabel_user.h" +#include "netlabel_mgmt.h" + +/* NetLabel configured protocol counter */ +atomic_t netlabel_mgmt_protocount = ATOMIC_INIT(0); + +/* Argument struct for netlbl_domhsh_walk() */ +struct netlbl_domhsh_walk_arg { + struct netlink_callback *nl_cb; + struct sk_buff *skb; + u32 seq; +}; + +/* NetLabel Generic NETLINK CIPSOv4 family */ +static struct genl_family netlbl_mgmt_gnl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = NETLBL_NLTYPE_MGMT_NAME, + .version = NETLBL_PROTO_VERSION, + .maxattr = NLBL_MGMT_A_MAX, +}; + +/* NetLabel Netlink attribute policy */ +static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = { + [NLBL_MGMT_A_DOMAIN] = { .type = NLA_NUL_STRING }, + [NLBL_MGMT_A_PROTOCOL] = { .type = NLA_U32 }, + [NLBL_MGMT_A_VERSION] = { .type = NLA_U32 }, + [NLBL_MGMT_A_CV4DOI] = { .type = NLA_U32 }, +}; + +/* + * Helper Functions + */ + +/** + * netlbl_mgmt_add - Handle an ADD message + * @info: the Generic NETLINK info block + * @audit_info: NetLabel audit information + * + * Description: + * Helper function for the ADD and ADDDEF messages to add the domain mappings + * from the message to the hash table. See netlabel.h for a description of the + * message format. Returns zero on success, negative values on failure. + * + */ +static int netlbl_mgmt_add_common(struct genl_info *info, + struct netlbl_audit *audit_info) +{ + int ret_val = -EINVAL; + struct netlbl_dom_map *entry = NULL; + struct netlbl_domaddr_map *addrmap = NULL; + struct cipso_v4_doi *cipsov4 = NULL; + u32 tmp_val; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (entry == NULL) { + ret_val = -ENOMEM; + goto add_failure; + } + entry->type = nla_get_u32(info->attrs[NLBL_MGMT_A_PROTOCOL]); + if (info->attrs[NLBL_MGMT_A_DOMAIN]) { + size_t tmp_size = nla_len(info->attrs[NLBL_MGMT_A_DOMAIN]); + entry->domain = kmalloc(tmp_size, GFP_KERNEL); + if (entry->domain == NULL) { + ret_val = -ENOMEM; + goto add_failure; + } + nla_strlcpy(entry->domain, + info->attrs[NLBL_MGMT_A_DOMAIN], tmp_size); + } + + /* NOTE: internally we allow/use a entry->type value of + * NETLBL_NLTYPE_ADDRSELECT but we don't currently allow users + * to pass that as a protocol value because we need to know the + * "real" protocol */ + + switch (entry->type) { + case NETLBL_NLTYPE_UNLABELED: + break; + case NETLBL_NLTYPE_CIPSOV4: + if (!info->attrs[NLBL_MGMT_A_CV4DOI]) + goto add_failure; + + tmp_val = nla_get_u32(info->attrs[NLBL_MGMT_A_CV4DOI]); + cipsov4 = cipso_v4_doi_getdef(tmp_val); + if (cipsov4 == NULL) + goto add_failure; + entry->type_def.cipsov4 = cipsov4; + break; + default: + goto add_failure; + } + + if (info->attrs[NLBL_MGMT_A_IPV4ADDR]) { + struct in_addr *addr; + struct in_addr *mask; + struct netlbl_domaddr4_map *map; + + addrmap = kzalloc(sizeof(*addrmap), GFP_KERNEL); + if (addrmap == NULL) { + ret_val = -ENOMEM; + goto add_failure; + } + INIT_LIST_HEAD(&addrmap->list4); + INIT_LIST_HEAD(&addrmap->list6); + + if (nla_len(info->attrs[NLBL_MGMT_A_IPV4ADDR]) != + sizeof(struct in_addr)) { + ret_val = -EINVAL; + goto add_failure; + } + if 
(nla_len(info->attrs[NLBL_MGMT_A_IPV4MASK]) != + sizeof(struct in_addr)) { + ret_val = -EINVAL; + goto add_failure; + } + addr = nla_data(info->attrs[NLBL_MGMT_A_IPV4ADDR]); + mask = nla_data(info->attrs[NLBL_MGMT_A_IPV4MASK]); + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (map == NULL) { + ret_val = -ENOMEM; + goto add_failure; + } + map->list.addr = addr->s_addr & mask->s_addr; + map->list.mask = mask->s_addr; + map->list.valid = 1; + map->type = entry->type; + if (cipsov4) + map->type_def.cipsov4 = cipsov4; + + ret_val = netlbl_af4list_add(&map->list, &addrmap->list4); + if (ret_val != 0) { + kfree(map); + goto add_failure; + } + + entry->type = NETLBL_NLTYPE_ADDRSELECT; + entry->type_def.addrsel = addrmap; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + } else if (info->attrs[NLBL_MGMT_A_IPV6ADDR]) { + struct in6_addr *addr; + struct in6_addr *mask; + struct netlbl_domaddr6_map *map; + + addrmap = kzalloc(sizeof(*addrmap), GFP_KERNEL); + if (addrmap == NULL) { + ret_val = -ENOMEM; + goto add_failure; + } + INIT_LIST_HEAD(&addrmap->list4); + INIT_LIST_HEAD(&addrmap->list6); + + if (nla_len(info->attrs[NLBL_MGMT_A_IPV6ADDR]) != + sizeof(struct in6_addr)) { + ret_val = -EINVAL; + goto add_failure; + } + if (nla_len(info->attrs[NLBL_MGMT_A_IPV6MASK]) != + sizeof(struct in6_addr)) { + ret_val = -EINVAL; + goto add_failure; + } + addr = nla_data(info->attrs[NLBL_MGMT_A_IPV6ADDR]); + mask = nla_data(info->attrs[NLBL_MGMT_A_IPV6MASK]); + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (map == NULL) { + ret_val = -ENOMEM; + goto add_failure; + } + ipv6_addr_copy(&map->list.addr, addr); + map->list.addr.s6_addr32[0] &= mask->s6_addr32[0]; + map->list.addr.s6_addr32[1] &= mask->s6_addr32[1]; + map->list.addr.s6_addr32[2] &= mask->s6_addr32[2]; + map->list.addr.s6_addr32[3] &= mask->s6_addr32[3]; + ipv6_addr_copy(&map->list.mask, mask); + map->list.valid = 1; + map->type = entry->type; + + ret_val = netlbl_af6list_add(&map->list, &addrmap->list6); + if (ret_val != 0) { + kfree(map); + goto add_failure; + } + + entry->type = NETLBL_NLTYPE_ADDRSELECT; + entry->type_def.addrsel = addrmap; +#endif /* IPv6 */ + } + + ret_val = netlbl_domhsh_add(entry, audit_info); + if (ret_val != 0) + goto add_failure; + + return 0; + +add_failure: + if (cipsov4) + cipso_v4_doi_putdef(cipsov4); + if (entry) + kfree(entry->domain); + kfree(addrmap); + kfree(entry); + return ret_val; +} + +/** + * netlbl_mgmt_listentry - List a NetLabel/LSM domain map entry + * @skb: the NETLINK buffer + * @entry: the map entry + * + * Description: + * This function is a helper function used by the LISTALL and LISTDEF command + * handlers. The caller is responsible for ensuring that the RCU read lock + * is held. Returns zero on success, negative values on failure. 
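+ * + * Address selector entries are written as a NLBL_MGMT_A_SELECTORLIST nest of + * NLBL_MGMT_A_ADDRSELECTOR nests, each carrying the address, mask, protocol + * and, for CIPSOv4 selectors, the DOI.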
+ * + */ +static int netlbl_mgmt_listentry(struct sk_buff *skb, + struct netlbl_dom_map *entry) +{ + int ret_val = 0; + struct nlattr *nla_a; + struct nlattr *nla_b; + struct netlbl_af4list *iter4; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct netlbl_af6list *iter6; +#endif + + if (entry->domain != NULL) { + ret_val = nla_put_string(skb, + NLBL_MGMT_A_DOMAIN, entry->domain); + if (ret_val != 0) + return ret_val; + } + + switch (entry->type) { + case NETLBL_NLTYPE_ADDRSELECT: + nla_a = nla_nest_start(skb, NLBL_MGMT_A_SELECTORLIST); + if (nla_a == NULL) + return -ENOMEM; + + netlbl_af4list_foreach_rcu(iter4, + &entry->type_def.addrsel->list4) { + struct netlbl_domaddr4_map *map4; + struct in_addr addr_struct; + + nla_b = nla_nest_start(skb, NLBL_MGMT_A_ADDRSELECTOR); + if (nla_b == NULL) + return -ENOMEM; + + addr_struct.s_addr = iter4->addr; + ret_val = nla_put(skb, NLBL_MGMT_A_IPV4ADDR, + sizeof(struct in_addr), + &addr_struct); + if (ret_val != 0) + return ret_val; + addr_struct.s_addr = iter4->mask; + ret_val = nla_put(skb, NLBL_MGMT_A_IPV4MASK, + sizeof(struct in_addr), + &addr_struct); + if (ret_val != 0) + return ret_val; + map4 = netlbl_domhsh_addr4_entry(iter4); + ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, + map4->type); + if (ret_val != 0) + return ret_val; + switch (map4->type) { + case NETLBL_NLTYPE_CIPSOV4: + ret_val = nla_put_u32(skb, NLBL_MGMT_A_CV4DOI, + map4->type_def.cipsov4->doi); + if (ret_val != 0) + return ret_val; + break; + } + + nla_nest_end(skb, nla_b); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + netlbl_af6list_foreach_rcu(iter6, + &entry->type_def.addrsel->list6) { + struct netlbl_domaddr6_map *map6; + + nla_b = nla_nest_start(skb, NLBL_MGMT_A_ADDRSELECTOR); + if (nla_b == NULL) + return -ENOMEM; + + ret_val = nla_put(skb, NLBL_MGMT_A_IPV6ADDR, + sizeof(struct in6_addr), + &iter6->addr); + if (ret_val != 0) + return ret_val; + ret_val = nla_put(skb, NLBL_MGMT_A_IPV6MASK, + sizeof(struct in6_addr), + &iter6->mask); + if (ret_val != 0) + return ret_val; + map6 = netlbl_domhsh_addr6_entry(iter6); + ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, + map6->type); + if (ret_val != 0) + return ret_val; + + nla_nest_end(skb, nla_b); + } +#endif /* IPv6 */ + + nla_nest_end(skb, nla_a); + break; + case NETLBL_NLTYPE_UNLABELED: + ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, entry->type); + break; + case NETLBL_NLTYPE_CIPSOV4: + ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, entry->type); + if (ret_val != 0) + return ret_val; + ret_val = nla_put_u32(skb, NLBL_MGMT_A_CV4DOI, + entry->type_def.cipsov4->doi); + break; + } + + return ret_val; +} + +/* + * NetLabel Command Handlers + */ + +/** + * netlbl_mgmt_add - Handle an ADD message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Process a user generated ADD message and add the domains from the message + * to the hash table. See netlabel.h for a description of the message format. + * Returns zero on success, negative values on failure. 
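+ * + * The message must carry NLBL_MGMT_A_DOMAIN and NLBL_MGMT_A_PROTOCOL and may + * carry a single IPv4 or IPv6 address/mask pair; the address and mask must + * always be supplied together.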
+ * + */ +static int netlbl_mgmt_add(struct sk_buff *skb, struct genl_info *info) +{ + struct netlbl_audit audit_info; + + if ((!info->attrs[NLBL_MGMT_A_DOMAIN]) || + (!info->attrs[NLBL_MGMT_A_PROTOCOL]) || + (info->attrs[NLBL_MGMT_A_IPV4ADDR] && + info->attrs[NLBL_MGMT_A_IPV6ADDR]) || + (info->attrs[NLBL_MGMT_A_IPV4MASK] && + info->attrs[NLBL_MGMT_A_IPV6MASK]) || + ((info->attrs[NLBL_MGMT_A_IPV4ADDR] != NULL) ^ + (info->attrs[NLBL_MGMT_A_IPV4MASK] != NULL)) || + ((info->attrs[NLBL_MGMT_A_IPV6ADDR] != NULL) ^ + (info->attrs[NLBL_MGMT_A_IPV6MASK] != NULL))) + return -EINVAL; + + netlbl_netlink_auditinfo(skb, &audit_info); + + return netlbl_mgmt_add_common(info, &audit_info); +} + +/** + * netlbl_mgmt_remove - Handle a REMOVE message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Process a user generated REMOVE message and remove the specified domain + * mappings. Returns zero on success, negative values on failure. + * + */ +static int netlbl_mgmt_remove(struct sk_buff *skb, struct genl_info *info) +{ + char *domain; + struct netlbl_audit audit_info; + + if (!info->attrs[NLBL_MGMT_A_DOMAIN]) + return -EINVAL; + + netlbl_netlink_auditinfo(skb, &audit_info); + + domain = nla_data(info->attrs[NLBL_MGMT_A_DOMAIN]); + return netlbl_domhsh_remove(domain, &audit_info); +} + +/** + * netlbl_mgmt_listall_cb - netlbl_domhsh_walk() callback for LISTALL + * @entry: the domain mapping hash table entry + * @arg: the netlbl_domhsh_walk_arg structure + * + * Description: + * This function is designed to be used as a callback to the + * netlbl_domhsh_walk() function for use in generating a response for a LISTALL + * message. Returns the size of the message on success, negative values on + * failure. + * + */ +static int netlbl_mgmt_listall_cb(struct netlbl_dom_map *entry, void *arg) +{ + int ret_val = -ENOMEM; + struct netlbl_domhsh_walk_arg *cb_arg = arg; + void *data; + + data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).pid, + cb_arg->seq, &netlbl_mgmt_gnl_family, + NLM_F_MULTI, NLBL_MGMT_C_LISTALL); + if (data == NULL) + goto listall_cb_failure; + + ret_val = netlbl_mgmt_listentry(cb_arg->skb, entry); + if (ret_val != 0) + goto listall_cb_failure; + + cb_arg->seq++; + return genlmsg_end(cb_arg->skb, data); + +listall_cb_failure: + genlmsg_cancel(cb_arg->skb, data); + return ret_val; +} + +/** + * netlbl_mgmt_listall - Handle a LISTALL message + * @skb: the NETLINK buffer + * @cb: the NETLINK callback + * + * Description: + * Process a user generated LISTALL message and dumps the domain hash table in + * a form suitable for use in a kernel generated LISTALL message. Returns zero + * on success, negative values on failure. + * + */ +static int netlbl_mgmt_listall(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct netlbl_domhsh_walk_arg cb_arg; + u32 skip_bkt = cb->args[0]; + u32 skip_chain = cb->args[1]; + + cb_arg.nl_cb = cb; + cb_arg.skb = skb; + cb_arg.seq = cb->nlh->nlmsg_seq; + + netlbl_domhsh_walk(&skip_bkt, + &skip_chain, + netlbl_mgmt_listall_cb, + &cb_arg); + + cb->args[0] = skip_bkt; + cb->args[1] = skip_chain; + return skb->len; +} + +/** + * netlbl_mgmt_adddef - Handle an ADDDEF message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Process a user generated ADDDEF message and respond accordingly. Returns + * zero on success, negative values on failure. 
+ * + */ +static int netlbl_mgmt_adddef(struct sk_buff *skb, struct genl_info *info) +{ + struct netlbl_audit audit_info; + + if ((!info->attrs[NLBL_MGMT_A_PROTOCOL]) || + (info->attrs[NLBL_MGMT_A_IPV4ADDR] && + info->attrs[NLBL_MGMT_A_IPV6ADDR]) || + (info->attrs[NLBL_MGMT_A_IPV4MASK] && + info->attrs[NLBL_MGMT_A_IPV6MASK]) || + ((info->attrs[NLBL_MGMT_A_IPV4ADDR] != NULL) ^ + (info->attrs[NLBL_MGMT_A_IPV4MASK] != NULL)) || + ((info->attrs[NLBL_MGMT_A_IPV6ADDR] != NULL) ^ + (info->attrs[NLBL_MGMT_A_IPV6MASK] != NULL))) + return -EINVAL; + + netlbl_netlink_auditinfo(skb, &audit_info); + + return netlbl_mgmt_add_common(info, &audit_info); +} + +/** + * netlbl_mgmt_removedef - Handle a REMOVEDEF message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Process a user generated REMOVEDEF message and remove the default domain + * mapping. Returns zero on success, negative values on failure. + * + */ +static int netlbl_mgmt_removedef(struct sk_buff *skb, struct genl_info *info) +{ + struct netlbl_audit audit_info; + + netlbl_netlink_auditinfo(skb, &audit_info); + + return netlbl_domhsh_remove_default(&audit_info); +} + +/** + * netlbl_mgmt_listdef - Handle a LISTDEF message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Process a user generated LISTDEF message and dumps the default domain + * mapping in a form suitable for use in a kernel generated LISTDEF message. + * Returns zero on success, negative values on failure. + * + */ +static int netlbl_mgmt_listdef(struct sk_buff *skb, struct genl_info *info) +{ + int ret_val = -ENOMEM; + struct sk_buff *ans_skb = NULL; + void *data; + struct netlbl_dom_map *entry; + + ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (ans_skb == NULL) + return -ENOMEM; + data = genlmsg_put_reply(ans_skb, info, &netlbl_mgmt_gnl_family, + 0, NLBL_MGMT_C_LISTDEF); + if (data == NULL) + goto listdef_failure; + + rcu_read_lock(); + entry = netlbl_domhsh_getentry(NULL); + if (entry == NULL) { + ret_val = -ENOENT; + goto listdef_failure_lock; + } + ret_val = netlbl_mgmt_listentry(ans_skb, entry); + rcu_read_unlock(); + if (ret_val != 0) + goto listdef_failure; + + genlmsg_end(ans_skb, data); + return genlmsg_reply(ans_skb, info); + +listdef_failure_lock: + rcu_read_unlock(); +listdef_failure: + kfree_skb(ans_skb); + return ret_val; +} + +/** + * netlbl_mgmt_protocols_cb - Write an individual PROTOCOL message response + * @skb: the skb to write to + * @cb: the NETLINK callback + * @protocol: the NetLabel protocol to use in the message + * + * Description: + * This function is to be used in conjunction with netlbl_mgmt_protocols() to + * answer a application's PROTOCOLS message. Returns the size of the message + * on success, negative values on failure. 
+ * + */ +static int netlbl_mgmt_protocols_cb(struct sk_buff *skb, + struct netlink_callback *cb, + u32 protocol) +{ + int ret_val = -ENOMEM; + void *data; + + data = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + &netlbl_mgmt_gnl_family, NLM_F_MULTI, + NLBL_MGMT_C_PROTOCOLS); + if (data == NULL) + goto protocols_cb_failure; + + ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, protocol); + if (ret_val != 0) + goto protocols_cb_failure; + + return genlmsg_end(skb, data); + +protocols_cb_failure: + genlmsg_cancel(skb, data); + return ret_val; +} + +/** + * netlbl_mgmt_protocols - Handle a PROTOCOLS message + * @skb: the NETLINK buffer + * @cb: the NETLINK callback + * + * Description: + * Process a user generated PROTOCOLS message and respond accordingly. + * + */ +static int netlbl_mgmt_protocols(struct sk_buff *skb, + struct netlink_callback *cb) +{ + u32 protos_sent = cb->args[0]; + + if (protos_sent == 0) { + if (netlbl_mgmt_protocols_cb(skb, + cb, + NETLBL_NLTYPE_UNLABELED) < 0) + goto protocols_return; + protos_sent++; + } + if (protos_sent == 1) { + if (netlbl_mgmt_protocols_cb(skb, + cb, + NETLBL_NLTYPE_CIPSOV4) < 0) + goto protocols_return; + protos_sent++; + } + +protocols_return: + cb->args[0] = protos_sent; + return skb->len; +} + +/** + * netlbl_mgmt_version - Handle a VERSION message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Process a user generated VERSION message and respond accordingly. Returns + * zero on success, negative values on failure. + * + */ +static int netlbl_mgmt_version(struct sk_buff *skb, struct genl_info *info) +{ + int ret_val = -ENOMEM; + struct sk_buff *ans_skb = NULL; + void *data; + + ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (ans_skb == NULL) + return -ENOMEM; + data = genlmsg_put_reply(ans_skb, info, &netlbl_mgmt_gnl_family, + 0, NLBL_MGMT_C_VERSION); + if (data == NULL) + goto version_failure; + + ret_val = nla_put_u32(ans_skb, + NLBL_MGMT_A_VERSION, + NETLBL_PROTO_VERSION); + if (ret_val != 0) + goto version_failure; + + genlmsg_end(ans_skb, data); + return genlmsg_reply(ans_skb, info); + +version_failure: + kfree_skb(ans_skb); + return ret_val; +} + + +/* + * NetLabel Generic NETLINK Command Definitions + */ + +static struct genl_ops netlbl_mgmt_genl_ops[] = { + { + .cmd = NLBL_MGMT_C_ADD, + .flags = GENL_ADMIN_PERM, + .policy = netlbl_mgmt_genl_policy, + .doit = netlbl_mgmt_add, + .dumpit = NULL, + }, + { + .cmd = NLBL_MGMT_C_REMOVE, + .flags = GENL_ADMIN_PERM, + .policy = netlbl_mgmt_genl_policy, + .doit = netlbl_mgmt_remove, + .dumpit = NULL, + }, + { + .cmd = NLBL_MGMT_C_LISTALL, + .flags = 0, + .policy = netlbl_mgmt_genl_policy, + .doit = NULL, + .dumpit = netlbl_mgmt_listall, + }, + { + .cmd = NLBL_MGMT_C_ADDDEF, + .flags = GENL_ADMIN_PERM, + .policy = netlbl_mgmt_genl_policy, + .doit = netlbl_mgmt_adddef, + .dumpit = NULL, + }, + { + .cmd = NLBL_MGMT_C_REMOVEDEF, + .flags = GENL_ADMIN_PERM, + .policy = netlbl_mgmt_genl_policy, + .doit = netlbl_mgmt_removedef, + .dumpit = NULL, + }, + { + .cmd = NLBL_MGMT_C_LISTDEF, + .flags = 0, + .policy = netlbl_mgmt_genl_policy, + .doit = netlbl_mgmt_listdef, + .dumpit = NULL, + }, + { + .cmd = NLBL_MGMT_C_PROTOCOLS, + .flags = 0, + .policy = netlbl_mgmt_genl_policy, + .doit = NULL, + .dumpit = netlbl_mgmt_protocols, + }, + { + .cmd = NLBL_MGMT_C_VERSION, + .flags = 0, + .policy = netlbl_mgmt_genl_policy, + .doit = netlbl_mgmt_version, + .dumpit = NULL, + }, +}; + +/* + * NetLabel Generic NETLINK Protocol Functions + */ 
+ +/** + * netlbl_mgmt_genl_init - Register the NetLabel management component + * + * Description: + * Register the NetLabel management component with the Generic NETLINK + * mechanism. Returns zero on success, negative values on failure. + * + */ +int __init netlbl_mgmt_genl_init(void) +{ + return genl_register_family_with_ops(&netlbl_mgmt_gnl_family, + netlbl_mgmt_genl_ops, ARRAY_SIZE(netlbl_mgmt_genl_ops)); +} diff --git a/net/netlabel/netlabel_mgmt.h b/net/netlabel/netlabel_mgmt.h new file mode 100644 index 00000000..0a25838b --- /dev/null +++ b/net/netlabel/netlabel_mgmt.h @@ -0,0 +1,223 @@ +/* + * NetLabel Management Support + * + * This file defines the management functions for the NetLabel system. The + * NetLabel system manages static and dynamic label mappings for network + * protocols such as CIPSO and RIPSO. + * + * Author: Paul Moore + * + */ + +/* + * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#ifndef _NETLABEL_MGMT_H +#define _NETLABEL_MGMT_H + +#include +#include + +/* + * The following NetLabel payloads are supported by the management interface. + * + * o ADD: + * Sent by an application to add a domain mapping to the NetLabel system. + * + * Required attributes: + * + * NLBL_MGMT_A_DOMAIN + * NLBL_MGMT_A_PROTOCOL + * + * If IPv4 is specified the following attributes are required: + * + * NLBL_MGMT_A_IPV4ADDR + * NLBL_MGMT_A_IPV4MASK + * + * If IPv6 is specified the following attributes are required: + * + * NLBL_MGMT_A_IPV6ADDR + * NLBL_MGMT_A_IPV6MASK + * + * If using NETLBL_NLTYPE_CIPSOV4 the following attributes are required: + * + * NLBL_MGMT_A_CV4DOI + * + * If using NETLBL_NLTYPE_UNLABELED no other attributes are required. + * + * o REMOVE: + * Sent by an application to remove a domain mapping from the NetLabel + * system. + * + * Required attributes: + * + * NLBL_MGMT_A_DOMAIN + * + * o LISTALL: + * This message can be sent either from an application or by the kernel in + * response to an application generated LISTALL message. When sent by an + * application there is no payload and the NLM_F_DUMP flag should be set. + * The kernel should respond with a series of the following messages. + * + * Required attributes: + * + * NLBL_MGMT_A_DOMAIN + * + * If the IP address selectors are not used the following attribute is + * required: + * + * NLBL_MGMT_A_PROTOCOL + * + * If the IP address selectors are used then the following attritbute is + * required: + * + * NLBL_MGMT_A_SELECTORLIST + * + * If the mapping is using the NETLBL_NLTYPE_CIPSOV4 type then the following + * attributes are required: + * + * NLBL_MGMT_A_CV4DOI + * + * If the mapping is using the NETLBL_NLTYPE_UNLABELED type no other + * attributes are required. + * + * o ADDDEF: + * Sent by an application to set the default domain mapping for the NetLabel + * system. 
+ * + * Required attributes: + * + * NLBL_MGMT_A_PROTOCOL + * + * If using NETLBL_NLTYPE_CIPSOV4 the following attributes are required: + * + * NLBL_MGMT_A_CV4DOI + * + * If using NETLBL_NLTYPE_UNLABELED no other attributes are required. + * + * o REMOVEDEF: + * Sent by an application to remove the default domain mapping from the + * NetLabel system, there is no payload. + * + * o LISTDEF: + * This message can be sent either from an application or by the kernel in + * response to an application generated LISTDEF message. When sent by an + * application there is no payload. On success the kernel should send a + * response using the following format. + * + * If the IP address selectors are not used the following attribute is + * required: + * + * NLBL_MGMT_A_PROTOCOL + * + * If the IP address selectors are used then the following attritbute is + * required: + * + * NLBL_MGMT_A_SELECTORLIST + * + * If the mapping is using the NETLBL_NLTYPE_CIPSOV4 type then the following + * attributes are required: + * + * NLBL_MGMT_A_CV4DOI + * + * If the mapping is using the NETLBL_NLTYPE_UNLABELED type no other + * attributes are required. + * + * o PROTOCOLS: + * Sent by an application to request a list of configured NetLabel protocols + * in the kernel. When sent by an application there is no payload and the + * NLM_F_DUMP flag should be set. The kernel should respond with a series of + * the following messages. + * + * Required attributes: + * + * NLBL_MGMT_A_PROTOCOL + * + * o VERSION: + * Sent by an application to request the NetLabel version. When sent by an + * application there is no payload. This message type is also used by the + * kernel to respond to an VERSION request. + * + * Required attributes: + * + * NLBL_MGMT_A_VERSION + * + */ + +/* NetLabel Management commands */ +enum { + NLBL_MGMT_C_UNSPEC, + NLBL_MGMT_C_ADD, + NLBL_MGMT_C_REMOVE, + NLBL_MGMT_C_LISTALL, + NLBL_MGMT_C_ADDDEF, + NLBL_MGMT_C_REMOVEDEF, + NLBL_MGMT_C_LISTDEF, + NLBL_MGMT_C_PROTOCOLS, + NLBL_MGMT_C_VERSION, + __NLBL_MGMT_C_MAX, +}; + +/* NetLabel Management attributes */ +enum { + NLBL_MGMT_A_UNSPEC, + NLBL_MGMT_A_DOMAIN, + /* (NLA_NUL_STRING) + * the NULL terminated LSM domain string */ + NLBL_MGMT_A_PROTOCOL, + /* (NLA_U32) + * the NetLabel protocol type (defined by NETLBL_NLTYPE_*) */ + NLBL_MGMT_A_VERSION, + /* (NLA_U32) + * the NetLabel protocol version number (defined by + * NETLBL_PROTO_VERSION) */ + NLBL_MGMT_A_CV4DOI, + /* (NLA_U32) + * the CIPSOv4 DOI value */ + NLBL_MGMT_A_IPV6ADDR, + /* (NLA_BINARY, struct in6_addr) + * an IPv6 address */ + NLBL_MGMT_A_IPV6MASK, + /* (NLA_BINARY, struct in6_addr) + * an IPv6 address mask */ + NLBL_MGMT_A_IPV4ADDR, + /* (NLA_BINARY, struct in_addr) + * an IPv4 address */ + NLBL_MGMT_A_IPV4MASK, + /* (NLA_BINARY, struct in_addr) + * and IPv4 address mask */ + NLBL_MGMT_A_ADDRSELECTOR, + /* (NLA_NESTED) + * an IP address selector, must contain an address, mask, and protocol + * attribute plus any protocol specific attributes */ + NLBL_MGMT_A_SELECTORLIST, + /* (NLA_NESTED) + * the selector list, there must be at least one + * NLBL_MGMT_A_ADDRSELECTOR attribute */ + __NLBL_MGMT_A_MAX, +}; +#define NLBL_MGMT_A_MAX (__NLBL_MGMT_A_MAX - 1) + +/* NetLabel protocol functions */ +int netlbl_mgmt_genl_init(void); + +/* NetLabel configured protocol reference counter */ +extern atomic_t netlabel_mgmt_protocount; + +#endif diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c new file mode 100644 index 00000000..9c38658f --- /dev/null +++ 
b/net/netlabel/netlabel_unlabeled.c @@ -0,0 +1,1563 @@ +/* + * NetLabel Unlabeled Support + * + * This file defines functions for dealing with unlabeled packets for the + * NetLabel system. The NetLabel system manages static and dynamic label + * mappings for network protocols such as CIPSO and RIPSO. + * + * Author: Paul Moore + * + */ + +/* + * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "netlabel_user.h" +#include "netlabel_addrlist.h" +#include "netlabel_domainhash.h" +#include "netlabel_unlabeled.h" +#include "netlabel_mgmt.h" + +/* NOTE: at present we always use init's network namespace since we don't + * presently support different namespaces even though the majority of + * the functions in this file are "namespace safe" */ + +/* The unlabeled connection hash table which we use to map network interfaces + * and addresses of unlabeled packets to a user specified secid value for the + * LSM. The hash table is used to lookup the network interface entry + * (struct netlbl_unlhsh_iface) and then the interface entry is used to + * lookup an IP address match from an ordered list. If a network interface + * match can not be found in the hash table then the default entry + * (netlbl_unlhsh_def) is used. The IP address entry list + * (struct netlbl_unlhsh_addr) is ordered such that the entries with a + * larger netmask come first. 
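+ *
+ * A rough sketch of the lookup path described above (see
+ * netlbl_unlhsh_search_iface() and netlbl_unlabel_getattr() below for
+ * the real code):
+ *
+ *   iface = hash table lookup on skb->skb_iif, falling back to
+ *           netlbl_unlhsh_def when no interface entry matches
+ *   addr  = first entry in iface->addr4_list (or addr6_list) where
+ *           (packet saddr & entry mask) == entry addr, largest mask first
+ *   secid = netlbl_unlhsh_addr4_entry(addr)->secid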
+ */ +struct netlbl_unlhsh_tbl { + struct list_head *tbl; + u32 size; +}; +#define netlbl_unlhsh_addr4_entry(iter) \ + container_of(iter, struct netlbl_unlhsh_addr4, list) +struct netlbl_unlhsh_addr4 { + u32 secid; + + struct netlbl_af4list list; + struct rcu_head rcu; +}; +#define netlbl_unlhsh_addr6_entry(iter) \ + container_of(iter, struct netlbl_unlhsh_addr6, list) +struct netlbl_unlhsh_addr6 { + u32 secid; + + struct netlbl_af6list list; + struct rcu_head rcu; +}; +struct netlbl_unlhsh_iface { + int ifindex; + struct list_head addr4_list; + struct list_head addr6_list; + + u32 valid; + struct list_head list; + struct rcu_head rcu; +}; + +/* Argument struct for netlbl_unlhsh_walk() */ +struct netlbl_unlhsh_walk_arg { + struct netlink_callback *nl_cb; + struct sk_buff *skb; + u32 seq; +}; + +/* Unlabeled connection hash table */ +/* updates should be so rare that having one spinlock for the entire + * hash table should be okay */ +static DEFINE_SPINLOCK(netlbl_unlhsh_lock); +#define netlbl_unlhsh_rcu_deref(p) \ + rcu_dereference_check(p, rcu_read_lock_held() || \ + lockdep_is_held(&netlbl_unlhsh_lock)) +static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL; +static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL; + +/* Accept unlabeled packets flag */ +static u8 netlabel_unlabel_acceptflg = 0; + +/* NetLabel Generic NETLINK unlabeled family */ +static struct genl_family netlbl_unlabel_gnl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = NETLBL_NLTYPE_UNLABELED_NAME, + .version = NETLBL_PROTO_VERSION, + .maxattr = NLBL_UNLABEL_A_MAX, +}; + +/* NetLabel Netlink attribute policy */ +static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = { + [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 }, + [NLBL_UNLABEL_A_IPV6ADDR] = { .type = NLA_BINARY, + .len = sizeof(struct in6_addr) }, + [NLBL_UNLABEL_A_IPV6MASK] = { .type = NLA_BINARY, + .len = sizeof(struct in6_addr) }, + [NLBL_UNLABEL_A_IPV4ADDR] = { .type = NLA_BINARY, + .len = sizeof(struct in_addr) }, + [NLBL_UNLABEL_A_IPV4MASK] = { .type = NLA_BINARY, + .len = sizeof(struct in_addr) }, + [NLBL_UNLABEL_A_IFACE] = { .type = NLA_NUL_STRING, + .len = IFNAMSIZ - 1 }, + [NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY } +}; + +/* + * Unlabeled Connection Hash Table Functions + */ + +/** + * netlbl_unlhsh_free_iface - Frees an interface entry from the hash table + * @entry: the entry's RCU field + * + * Description: + * This function is designed to be used as a callback to the call_rcu() + * function so that memory allocated to a hash table interface entry can be + * released safely. It is important to note that this function does not free + * the IPv4 and IPv6 address lists contained as part of an interface entry. It + * is up to the rest of the code to make sure an interface entry is only freed + * once it's address lists are empty. 
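+ *
+ * The expected removal pattern (a sketch of what the callers below,
+ * e.g. netlbl_unlhsh_condremove_iface(), actually do) is:
+ *
+ *   spin_lock(&netlbl_unlhsh_lock);
+ *   iface->valid = 0;
+ *   list_del_rcu(&iface->list);
+ *   spin_unlock(&netlbl_unlhsh_lock);
+ *   call_rcu(&iface->rcu, netlbl_unlhsh_free_iface);
+ *
+ * so the entry is already unreachable by the time this callback runs.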
+ * + */ +static void netlbl_unlhsh_free_iface(struct rcu_head *entry) +{ + struct netlbl_unlhsh_iface *iface; + struct netlbl_af4list *iter4; + struct netlbl_af4list *tmp4; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct netlbl_af6list *iter6; + struct netlbl_af6list *tmp6; +#endif /* IPv6 */ + + iface = container_of(entry, struct netlbl_unlhsh_iface, rcu); + + /* no need for locks here since we are the only one with access to this + * structure */ + + netlbl_af4list_foreach_safe(iter4, tmp4, &iface->addr4_list) { + netlbl_af4list_remove_entry(iter4); + kfree(netlbl_unlhsh_addr4_entry(iter4)); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + netlbl_af6list_foreach_safe(iter6, tmp6, &iface->addr6_list) { + netlbl_af6list_remove_entry(iter6); + kfree(netlbl_unlhsh_addr6_entry(iter6)); + } +#endif /* IPv6 */ + kfree(iface); +} + +/** + * netlbl_unlhsh_hash - Hashing function for the hash table + * @ifindex: the network interface/device to hash + * + * Description: + * This is the hashing function for the unlabeled hash table, it returns the + * bucket number for the given device/interface. The caller is responsible for + * ensuring that the hash table is protected with either a RCU read lock or + * the hash table lock. + * + */ +static u32 netlbl_unlhsh_hash(int ifindex) +{ + return ifindex & (netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->size - 1); +} + +/** + * netlbl_unlhsh_search_iface - Search for a matching interface entry + * @ifindex: the network interface + * + * Description: + * Searches the unlabeled connection hash table and returns a pointer to the + * interface entry which matches @ifindex, otherwise NULL is returned. The + * caller is responsible for ensuring that the hash table is protected with + * either a RCU read lock or the hash table lock. + * + */ +static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex) +{ + u32 bkt; + struct list_head *bkt_list; + struct netlbl_unlhsh_iface *iter; + + bkt = netlbl_unlhsh_hash(ifindex); + bkt_list = &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]; + list_for_each_entry_rcu(iter, bkt_list, list) + if (iter->valid && iter->ifindex == ifindex) + return iter; + + return NULL; +} + +/** + * netlbl_unlhsh_add_addr4 - Add a new IPv4 address entry to the hash table + * @iface: the associated interface entry + * @addr: IPv4 address in network byte order + * @mask: IPv4 address mask in network byte order + * @secid: LSM secid value for entry + * + * Description: + * Add a new address entry into the unlabeled connection hash table using the + * interface entry specified by @iface. On success zero is returned, otherwise + * a negative value is returned. 
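+ *
+ * As a worked example of the normalization performed below: adding
+ * 192.168.1.7 with a mask of 255.255.255.0 stores
+ *
+ *   entry->list.addr = 192.168.1.0   (addr & mask)
+ *   entry->list.mask = 255.255.255.0
+ *
+ * so later lookups only need to test (saddr & mask) == addr.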
+ * + */ +static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface, + const struct in_addr *addr, + const struct in_addr *mask, + u32 secid) +{ + int ret_val; + struct netlbl_unlhsh_addr4 *entry; + + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) + return -ENOMEM; + + entry->list.addr = addr->s_addr & mask->s_addr; + entry->list.mask = mask->s_addr; + entry->list.valid = 1; + entry->secid = secid; + + spin_lock(&netlbl_unlhsh_lock); + ret_val = netlbl_af4list_add(&entry->list, &iface->addr4_list); + spin_unlock(&netlbl_unlhsh_lock); + + if (ret_val != 0) + kfree(entry); + return ret_val; +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +/** + * netlbl_unlhsh_add_addr6 - Add a new IPv6 address entry to the hash table + * @iface: the associated interface entry + * @addr: IPv6 address in network byte order + * @mask: IPv6 address mask in network byte order + * @secid: LSM secid value for entry + * + * Description: + * Add a new address entry into the unlabeled connection hash table using the + * interface entry specified by @iface. On success zero is returned, otherwise + * a negative value is returned. + * + */ +static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface, + const struct in6_addr *addr, + const struct in6_addr *mask, + u32 secid) +{ + int ret_val; + struct netlbl_unlhsh_addr6 *entry; + + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) + return -ENOMEM; + + ipv6_addr_copy(&entry->list.addr, addr); + entry->list.addr.s6_addr32[0] &= mask->s6_addr32[0]; + entry->list.addr.s6_addr32[1] &= mask->s6_addr32[1]; + entry->list.addr.s6_addr32[2] &= mask->s6_addr32[2]; + entry->list.addr.s6_addr32[3] &= mask->s6_addr32[3]; + ipv6_addr_copy(&entry->list.mask, mask); + entry->list.valid = 1; + entry->secid = secid; + + spin_lock(&netlbl_unlhsh_lock); + ret_val = netlbl_af6list_add(&entry->list, &iface->addr6_list); + spin_unlock(&netlbl_unlhsh_lock); + + if (ret_val != 0) + kfree(entry); + return 0; +} +#endif /* IPv6 */ + +/** + * netlbl_unlhsh_add_iface - Adds a new interface entry to the hash table + * @ifindex: network interface + * + * Description: + * Add a new, empty, interface entry into the unlabeled connection hash table. + * On success a pointer to the new interface entry is returned, on failure NULL + * is returned. 
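+ *
+ * A quick sketch of how @ifindex selects where the new entry lives:
+ *
+ *   ifindex > 0   ->  hashed into netlbl_unlhsh->tbl[] using
+ *                     netlbl_unlhsh_hash(ifindex)
+ *   ifindex == 0  ->  installed as the single default entry,
+ *                     netlbl_unlhsh_def
+ *
+ * in both cases the add fails if a matching entry already exists.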
+ * + */ +static struct netlbl_unlhsh_iface *netlbl_unlhsh_add_iface(int ifindex) +{ + u32 bkt; + struct netlbl_unlhsh_iface *iface; + + iface = kzalloc(sizeof(*iface), GFP_ATOMIC); + if (iface == NULL) + return NULL; + + iface->ifindex = ifindex; + INIT_LIST_HEAD(&iface->addr4_list); + INIT_LIST_HEAD(&iface->addr6_list); + iface->valid = 1; + + spin_lock(&netlbl_unlhsh_lock); + if (ifindex > 0) { + bkt = netlbl_unlhsh_hash(ifindex); + if (netlbl_unlhsh_search_iface(ifindex) != NULL) + goto add_iface_failure; + list_add_tail_rcu(&iface->list, + &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]); + } else { + INIT_LIST_HEAD(&iface->list); + if (netlbl_unlhsh_rcu_deref(netlbl_unlhsh_def) != NULL) + goto add_iface_failure; + rcu_assign_pointer(netlbl_unlhsh_def, iface); + } + spin_unlock(&netlbl_unlhsh_lock); + + return iface; + +add_iface_failure: + spin_unlock(&netlbl_unlhsh_lock); + kfree(iface); + return NULL; +} + +/** + * netlbl_unlhsh_add - Adds a new entry to the unlabeled connection hash table + * @net: network namespace + * @dev_name: interface name + * @addr: IP address in network byte order + * @mask: address mask in network byte order + * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) + * @secid: LSM secid value for the entry + * @audit_info: NetLabel audit information + * + * Description: + * Adds a new entry to the unlabeled connection hash table. Returns zero on + * success, negative values on failure. + * + */ +int netlbl_unlhsh_add(struct net *net, + const char *dev_name, + const void *addr, + const void *mask, + u32 addr_len, + u32 secid, + struct netlbl_audit *audit_info) +{ + int ret_val; + int ifindex; + struct net_device *dev; + struct netlbl_unlhsh_iface *iface; + struct audit_buffer *audit_buf = NULL; + char *secctx = NULL; + u32 secctx_len; + + if (addr_len != sizeof(struct in_addr) && + addr_len != sizeof(struct in6_addr)) + return -EINVAL; + + rcu_read_lock(); + if (dev_name != NULL) { + dev = dev_get_by_name_rcu(net, dev_name); + if (dev == NULL) { + ret_val = -ENODEV; + goto unlhsh_add_return; + } + ifindex = dev->ifindex; + iface = netlbl_unlhsh_search_iface(ifindex); + } else { + ifindex = 0; + iface = rcu_dereference(netlbl_unlhsh_def); + } + if (iface == NULL) { + iface = netlbl_unlhsh_add_iface(ifindex); + if (iface == NULL) { + ret_val = -ENOMEM; + goto unlhsh_add_return; + } + } + audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCADD, + audit_info); + switch (addr_len) { + case sizeof(struct in_addr): { + struct in_addr *addr4, *mask4; + + addr4 = (struct in_addr *)addr; + mask4 = (struct in_addr *)mask; + ret_val = netlbl_unlhsh_add_addr4(iface, addr4, mask4, secid); + if (audit_buf != NULL) + netlbl_af4list_audit_addr(audit_buf, 1, + dev_name, + addr4->s_addr, + mask4->s_addr); + break; + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case sizeof(struct in6_addr): { + struct in6_addr *addr6, *mask6; + + addr6 = (struct in6_addr *)addr; + mask6 = (struct in6_addr *)mask; + ret_val = netlbl_unlhsh_add_addr6(iface, addr6, mask6, secid); + if (audit_buf != NULL) + netlbl_af6list_audit_addr(audit_buf, 1, + dev_name, + addr6, mask6); + break; + } +#endif /* IPv6 */ + default: + ret_val = -EINVAL; + } + if (ret_val == 0) + atomic_inc(&netlabel_mgmt_protocount); + +unlhsh_add_return: + rcu_read_unlock(); + if (audit_buf != NULL) { + if (security_secid_to_secctx(secid, + &secctx, + &secctx_len) == 0) { + audit_log_format(audit_buf, " sec_obj=%s", secctx); + security_release_secctx(secctx, secctx_len); + } + 
audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); + audit_log_end(audit_buf); + } + return ret_val; +} + +/** + * netlbl_unlhsh_remove_addr4 - Remove an IPv4 address entry + * @net: network namespace + * @iface: interface entry + * @addr: IP address + * @mask: IP address mask + * @audit_info: NetLabel audit information + * + * Description: + * Remove an IP address entry from the unlabeled connection hash table. + * Returns zero on success, negative values on failure. + * + */ +static int netlbl_unlhsh_remove_addr4(struct net *net, + struct netlbl_unlhsh_iface *iface, + const struct in_addr *addr, + const struct in_addr *mask, + struct netlbl_audit *audit_info) +{ + struct netlbl_af4list *list_entry; + struct netlbl_unlhsh_addr4 *entry; + struct audit_buffer *audit_buf; + struct net_device *dev; + char *secctx; + u32 secctx_len; + + spin_lock(&netlbl_unlhsh_lock); + list_entry = netlbl_af4list_remove(addr->s_addr, mask->s_addr, + &iface->addr4_list); + spin_unlock(&netlbl_unlhsh_lock); + if (list_entry != NULL) + entry = netlbl_unlhsh_addr4_entry(list_entry); + else + entry = NULL; + + audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, + audit_info); + if (audit_buf != NULL) { + dev = dev_get_by_index(net, iface->ifindex); + netlbl_af4list_audit_addr(audit_buf, 1, + (dev != NULL ? dev->name : NULL), + addr->s_addr, mask->s_addr); + if (dev != NULL) + dev_put(dev); + if (entry != NULL && + security_secid_to_secctx(entry->secid, + &secctx, &secctx_len) == 0) { + audit_log_format(audit_buf, " sec_obj=%s", secctx); + security_release_secctx(secctx, secctx_len); + } + audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); + audit_log_end(audit_buf); + } + + if (entry == NULL) + return -ENOENT; + + kfree_rcu(entry, rcu); + return 0; +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +/** + * netlbl_unlhsh_remove_addr6 - Remove an IPv6 address entry + * @net: network namespace + * @iface: interface entry + * @addr: IP address + * @mask: IP address mask + * @audit_info: NetLabel audit information + * + * Description: + * Remove an IP address entry from the unlabeled connection hash table. + * Returns zero on success, negative values on failure. + * + */ +static int netlbl_unlhsh_remove_addr6(struct net *net, + struct netlbl_unlhsh_iface *iface, + const struct in6_addr *addr, + const struct in6_addr *mask, + struct netlbl_audit *audit_info) +{ + struct netlbl_af6list *list_entry; + struct netlbl_unlhsh_addr6 *entry; + struct audit_buffer *audit_buf; + struct net_device *dev; + char *secctx; + u32 secctx_len; + + spin_lock(&netlbl_unlhsh_lock); + list_entry = netlbl_af6list_remove(addr, mask, &iface->addr6_list); + spin_unlock(&netlbl_unlhsh_lock); + if (list_entry != NULL) + entry = netlbl_unlhsh_addr6_entry(list_entry); + else + entry = NULL; + + audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, + audit_info); + if (audit_buf != NULL) { + dev = dev_get_by_index(net, iface->ifindex); + netlbl_af6list_audit_addr(audit_buf, 1, + (dev != NULL ? dev->name : NULL), + addr, mask); + if (dev != NULL) + dev_put(dev); + if (entry != NULL && + security_secid_to_secctx(entry->secid, + &secctx, &secctx_len) == 0) { + audit_log_format(audit_buf, " sec_obj=%s", secctx); + security_release_secctx(secctx, secctx_len); + } + audit_log_format(audit_buf, " res=%u", entry != NULL ? 
1 : 0); + audit_log_end(audit_buf); + } + + if (entry == NULL) + return -ENOENT; + + kfree_rcu(entry, rcu); + return 0; +} +#endif /* IPv6 */ + +/** + * netlbl_unlhsh_condremove_iface - Remove an interface entry + * @iface: the interface entry + * + * Description: + * Remove an interface entry from the unlabeled connection hash table if it is + * empty. An interface entry is considered to be empty if there are no + * address entries assigned to it. + * + */ +static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface) +{ + struct netlbl_af4list *iter4; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct netlbl_af6list *iter6; +#endif /* IPv6 */ + + spin_lock(&netlbl_unlhsh_lock); + netlbl_af4list_foreach_rcu(iter4, &iface->addr4_list) + goto unlhsh_condremove_failure; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + netlbl_af6list_foreach_rcu(iter6, &iface->addr6_list) + goto unlhsh_condremove_failure; +#endif /* IPv6 */ + iface->valid = 0; + if (iface->ifindex > 0) + list_del_rcu(&iface->list); + else + rcu_assign_pointer(netlbl_unlhsh_def, NULL); + spin_unlock(&netlbl_unlhsh_lock); + + call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); + return; + +unlhsh_condremove_failure: + spin_unlock(&netlbl_unlhsh_lock); +} + +/** + * netlbl_unlhsh_remove - Remove an entry from the unlabeled hash table + * @net: network namespace + * @dev_name: interface name + * @addr: IP address in network byte order + * @mask: address mask in network byte order + * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) + * @audit_info: NetLabel audit information + * + * Description: + * Removes and existing entry from the unlabeled connection hash table. + * Returns zero on success, negative values on failure. + * + */ +int netlbl_unlhsh_remove(struct net *net, + const char *dev_name, + const void *addr, + const void *mask, + u32 addr_len, + struct netlbl_audit *audit_info) +{ + int ret_val; + struct net_device *dev; + struct netlbl_unlhsh_iface *iface; + + if (addr_len != sizeof(struct in_addr) && + addr_len != sizeof(struct in6_addr)) + return -EINVAL; + + rcu_read_lock(); + if (dev_name != NULL) { + dev = dev_get_by_name_rcu(net, dev_name); + if (dev == NULL) { + ret_val = -ENODEV; + goto unlhsh_remove_return; + } + iface = netlbl_unlhsh_search_iface(dev->ifindex); + } else + iface = rcu_dereference(netlbl_unlhsh_def); + if (iface == NULL) { + ret_val = -ENOENT; + goto unlhsh_remove_return; + } + switch (addr_len) { + case sizeof(struct in_addr): + ret_val = netlbl_unlhsh_remove_addr4(net, + iface, addr, mask, + audit_info); + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case sizeof(struct in6_addr): + ret_val = netlbl_unlhsh_remove_addr6(net, + iface, addr, mask, + audit_info); + break; +#endif /* IPv6 */ + default: + ret_val = -EINVAL; + } + if (ret_val == 0) { + netlbl_unlhsh_condremove_iface(iface); + atomic_dec(&netlabel_mgmt_protocount); + } + +unlhsh_remove_return: + rcu_read_unlock(); + return ret_val; +} + +/* + * General Helper Functions + */ + +/** + * netlbl_unlhsh_netdev_handler - Network device notification handler + * @this: notifier block + * @event: the event + * @ptr: the network device (cast to void) + * + * Description: + * Handle network device events, although at present all we care about is a + * network device going away. In the case of a device going away we clear any + * related entries from the unlabeled connection hash table. 
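+ *
+ * This handler is wired up in netlbl_unlabel_init() further down; the
+ * registration amounts to:
+ *
+ *   static struct notifier_block netlbl_unlhsh_netdev_notifier = {
+ *           .notifier_call = netlbl_unlhsh_netdev_handler,
+ *   };
+ *   register_netdevice_notifier(&netlbl_unlhsh_netdev_notifier);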
+ * + */ +static int netlbl_unlhsh_netdev_handler(struct notifier_block *this, + unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + struct netlbl_unlhsh_iface *iface = NULL; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + /* XXX - should this be a check for NETDEV_DOWN or _UNREGISTER? */ + if (event == NETDEV_DOWN) { + spin_lock(&netlbl_unlhsh_lock); + iface = netlbl_unlhsh_search_iface(dev->ifindex); + if (iface != NULL && iface->valid) { + iface->valid = 0; + list_del_rcu(&iface->list); + } else + iface = NULL; + spin_unlock(&netlbl_unlhsh_lock); + } + + if (iface != NULL) + call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); + + return NOTIFY_DONE; +} + +/** + * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag + * @value: desired value + * @audit_info: NetLabel audit information + * + * Description: + * Set the value of the unlabeled accept flag to @value. + * + */ +static void netlbl_unlabel_acceptflg_set(u8 value, + struct netlbl_audit *audit_info) +{ + struct audit_buffer *audit_buf; + u8 old_val; + + old_val = netlabel_unlabel_acceptflg; + netlabel_unlabel_acceptflg = value; + audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW, + audit_info); + if (audit_buf != NULL) { + audit_log_format(audit_buf, + " unlbl_accept=%u old=%u", value, old_val); + audit_log_end(audit_buf); + } +} + +/** + * netlbl_unlabel_addrinfo_get - Get the IPv4/6 address information + * @info: the Generic NETLINK info block + * @addr: the IP address + * @mask: the IP address mask + * @len: the address length + * + * Description: + * Examine the Generic NETLINK message and extract the IP address information. + * Returns zero on success, negative values on failure. + * + */ +static int netlbl_unlabel_addrinfo_get(struct genl_info *info, + void **addr, + void **mask, + u32 *len) +{ + u32 addr_len; + + if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR]) { + addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); + if (addr_len != sizeof(struct in_addr) && + addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK])) + return -EINVAL; + *len = addr_len; + *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); + *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4MASK]); + return 0; + } else if (info->attrs[NLBL_UNLABEL_A_IPV6ADDR]) { + addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); + if (addr_len != sizeof(struct in6_addr) && + addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV6MASK])) + return -EINVAL; + *len = addr_len; + *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); + *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6MASK]); + return 0; + } + + return -EINVAL; +} + +/* + * NetLabel Command Handlers + */ + +/** + * netlbl_unlabel_accept - Handle an ACCEPT message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Process a user generated ACCEPT message and set the accept flag accordingly. + * Returns zero on success, negative values on failure. 
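+ *
+ * A minimal sketch of the sender's side: the ACCEPT request carries a
+ * single NLA_U8 attribute,
+ *
+ *   nla_put_u8(skb, NLBL_UNLABEL_A_ACPTFLG, 1);   (or 0 to disable)
+ *
+ * any other value is rejected with -EINVAL below.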
+ * + */ +static int netlbl_unlabel_accept(struct sk_buff *skb, struct genl_info *info) +{ + u8 value; + struct netlbl_audit audit_info; + + if (info->attrs[NLBL_UNLABEL_A_ACPTFLG]) { + value = nla_get_u8(info->attrs[NLBL_UNLABEL_A_ACPTFLG]); + if (value == 1 || value == 0) { + netlbl_netlink_auditinfo(skb, &audit_info); + netlbl_unlabel_acceptflg_set(value, &audit_info); + return 0; + } + } + + return -EINVAL; +} + +/** + * netlbl_unlabel_list - Handle a LIST message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Process a user generated LIST message and respond with the current status. + * Returns zero on success, negative values on failure. + * + */ +static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info) +{ + int ret_val = -EINVAL; + struct sk_buff *ans_skb; + void *data; + + ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (ans_skb == NULL) + goto list_failure; + data = genlmsg_put_reply(ans_skb, info, &netlbl_unlabel_gnl_family, + 0, NLBL_UNLABEL_C_LIST); + if (data == NULL) { + ret_val = -ENOMEM; + goto list_failure; + } + + ret_val = nla_put_u8(ans_skb, + NLBL_UNLABEL_A_ACPTFLG, + netlabel_unlabel_acceptflg); + if (ret_val != 0) + goto list_failure; + + genlmsg_end(ans_skb, data); + return genlmsg_reply(ans_skb, info); + +list_failure: + kfree_skb(ans_skb); + return ret_val; +} + +/** + * netlbl_unlabel_staticadd - Handle a STATICADD message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Process a user generated STATICADD message and add a new unlabeled + * connection entry to the hash table. Returns zero on success, negative + * values on failure. + * + */ +static int netlbl_unlabel_staticadd(struct sk_buff *skb, + struct genl_info *info) +{ + int ret_val; + char *dev_name; + void *addr; + void *mask; + u32 addr_len; + u32 secid; + struct netlbl_audit audit_info; + + /* Don't allow users to add both IPv4 and IPv6 addresses for a + * single entry. However, allow users to create two entries, one each + * for IPv4 and IPv4, with the same LSM security context which should + * achieve the same result. */ + if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || + !info->attrs[NLBL_UNLABEL_A_IFACE] || + !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || + !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ + (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || + !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) + return -EINVAL; + + netlbl_netlink_auditinfo(skb, &audit_info); + + ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); + if (ret_val != 0) + return ret_val; + dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); + ret_val = security_secctx_to_secid( + nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), + nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), + &secid); + if (ret_val != 0) + return ret_val; + + return netlbl_unlhsh_add(&init_net, + dev_name, addr, mask, addr_len, secid, + &audit_info); +} + +/** + * netlbl_unlabel_staticadddef - Handle a STATICADDDEF message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Process a user generated STATICADDDEF message and add a new default + * unlabeled connection entry. Returns zero on success, negative values on + * failure. 
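+ *
+ * For example (an illustrative sketch of the attribute rules checked
+ * below, the security context string being a made-up placeholder): a
+ * default entry labelling every otherwise unmatched IPv4 source could
+ * carry
+ *
+ *   NLBL_UNLABEL_A_SECCTX   = "system_u:object_r:unlabeled_t:s0"
+ *   NLBL_UNLABEL_A_IPV4ADDR = 0.0.0.0
+ *   NLBL_UNLABEL_A_IPV4MASK = 0.0.0.0
+ *
+ * exactly one address family may be supplied per request; the IPv4 and
+ * IPv6 attribute pairs cannot be mixed in a single message.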
+ * + */ +static int netlbl_unlabel_staticadddef(struct sk_buff *skb, + struct genl_info *info) +{ + int ret_val; + void *addr; + void *mask; + u32 addr_len; + u32 secid; + struct netlbl_audit audit_info; + + /* Don't allow users to add both IPv4 and IPv6 addresses for a + * single entry. However, allow users to create two entries, one each + * for IPv4 and IPv6, with the same LSM security context which should + * achieve the same result. */ + if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || + !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || + !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ + (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || + !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) + return -EINVAL; + + netlbl_netlink_auditinfo(skb, &audit_info); + + ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); + if (ret_val != 0) + return ret_val; + ret_val = security_secctx_to_secid( + nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), + nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), + &secid); + if (ret_val != 0) + return ret_val; + + return netlbl_unlhsh_add(&init_net, + NULL, addr, mask, addr_len, secid, + &audit_info); +} + +/** + * netlbl_unlabel_staticremove - Handle a STATICREMOVE message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Process a user generated STATICREMOVE message and remove the specified + * unlabeled connection entry. Returns zero on success, negative values on + * failure. + * + */ +static int netlbl_unlabel_staticremove(struct sk_buff *skb, + struct genl_info *info) +{ + int ret_val; + char *dev_name; + void *addr; + void *mask; + u32 addr_len; + struct netlbl_audit audit_info; + + /* See the note in netlbl_unlabel_staticadd() about not allowing both + * IPv4 and IPv6 in the same entry. */ + if (!info->attrs[NLBL_UNLABEL_A_IFACE] || + !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || + !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ + (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || + !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) + return -EINVAL; + + netlbl_netlink_auditinfo(skb, &audit_info); + + ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); + if (ret_val != 0) + return ret_val; + dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); + + return netlbl_unlhsh_remove(&init_net, + dev_name, addr, mask, addr_len, + &audit_info); +} + +/** + * netlbl_unlabel_staticremovedef - Handle a STATICREMOVEDEF message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Process a user generated STATICREMOVEDEF message and remove the default + * unlabeled connection entry. Returns zero on success, negative values on + * failure. + * + */ +static int netlbl_unlabel_staticremovedef(struct sk_buff *skb, + struct genl_info *info) +{ + int ret_val; + void *addr; + void *mask; + u32 addr_len; + struct netlbl_audit audit_info; + + /* See the note in netlbl_unlabel_staticadd() about not allowing both + * IPv4 and IPv6 in the same entry. 
*/ + if (!((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || + !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ + (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || + !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) + return -EINVAL; + + netlbl_netlink_auditinfo(skb, &audit_info); + + ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); + if (ret_val != 0) + return ret_val; + + return netlbl_unlhsh_remove(&init_net, + NULL, addr, mask, addr_len, + &audit_info); +} + + +/** + * netlbl_unlabel_staticlist_gen - Generate messages for STATICLIST[DEF] + * @cmd: command/message + * @iface: the interface entry + * @addr4: the IPv4 address entry + * @addr6: the IPv6 address entry + * @arg: the netlbl_unlhsh_walk_arg structure + * + * Description: + * This function is designed to be used to generate a response for a + * STATICLIST or STATICLISTDEF message. When called either @addr4 or @addr6 + * can be specified, not both, the other unspecified entry should be set to + * NULL by the caller. Returns the size of the message on success, negative + * values on failure. + * + */ +static int netlbl_unlabel_staticlist_gen(u32 cmd, + const struct netlbl_unlhsh_iface *iface, + const struct netlbl_unlhsh_addr4 *addr4, + const struct netlbl_unlhsh_addr6 *addr6, + void *arg) +{ + int ret_val = -ENOMEM; + struct netlbl_unlhsh_walk_arg *cb_arg = arg; + struct net_device *dev; + void *data; + u32 secid; + char *secctx; + u32 secctx_len; + + data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).pid, + cb_arg->seq, &netlbl_unlabel_gnl_family, + NLM_F_MULTI, cmd); + if (data == NULL) + goto list_cb_failure; + + if (iface->ifindex > 0) { + dev = dev_get_by_index(&init_net, iface->ifindex); + if (!dev) { + ret_val = -ENODEV; + goto list_cb_failure; + } + ret_val = nla_put_string(cb_arg->skb, + NLBL_UNLABEL_A_IFACE, dev->name); + dev_put(dev); + if (ret_val != 0) + goto list_cb_failure; + } + + if (addr4) { + struct in_addr addr_struct; + + addr_struct.s_addr = addr4->list.addr; + ret_val = nla_put(cb_arg->skb, + NLBL_UNLABEL_A_IPV4ADDR, + sizeof(struct in_addr), + &addr_struct); + if (ret_val != 0) + goto list_cb_failure; + + addr_struct.s_addr = addr4->list.mask; + ret_val = nla_put(cb_arg->skb, + NLBL_UNLABEL_A_IPV4MASK, + sizeof(struct in_addr), + &addr_struct); + if (ret_val != 0) + goto list_cb_failure; + + secid = addr4->secid; + } else { + ret_val = nla_put(cb_arg->skb, + NLBL_UNLABEL_A_IPV6ADDR, + sizeof(struct in6_addr), + &addr6->list.addr); + if (ret_val != 0) + goto list_cb_failure; + + ret_val = nla_put(cb_arg->skb, + NLBL_UNLABEL_A_IPV6MASK, + sizeof(struct in6_addr), + &addr6->list.mask); + if (ret_val != 0) + goto list_cb_failure; + + secid = addr6->secid; + } + + ret_val = security_secid_to_secctx(secid, &secctx, &secctx_len); + if (ret_val != 0) + goto list_cb_failure; + ret_val = nla_put(cb_arg->skb, + NLBL_UNLABEL_A_SECCTX, + secctx_len, + secctx); + security_release_secctx(secctx, secctx_len); + if (ret_val != 0) + goto list_cb_failure; + + cb_arg->seq++; + return genlmsg_end(cb_arg->skb, data); + +list_cb_failure: + genlmsg_cancel(cb_arg->skb, data); + return ret_val; +} + +/** + * netlbl_unlabel_staticlist - Handle a STATICLIST message + * @skb: the NETLINK buffer + * @cb: the NETLINK callback + * + * Description: + * Process a user generated STATICLIST message and dump the unlabeled + * connection hash table in a form suitable for use in a kernel generated + * STATICLIST message. Returns the length of @skb. 
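+ *
+ * A rough sketch of how the dump below resumes once an skb fills up:
+ *
+ *   cb->args[0] = hash bucket to restart from         (skip_bkt)
+ *   cb->args[1] = position within that bucket's chain (skip_chain)
+ *   cb->args[2] = IPv4 address entries already sent   (skip_addr4)
+ *   cb->args[3] = IPv6 address entries already sent   (skip_addr6)
+ *
+ * the next invocation skips that many entries before emitting further
+ * NLM_F_MULTI messages.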
+ * + */ +static int netlbl_unlabel_staticlist(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct netlbl_unlhsh_walk_arg cb_arg; + u32 skip_bkt = cb->args[0]; + u32 skip_chain = cb->args[1]; + u32 skip_addr4 = cb->args[2]; + u32 skip_addr6 = cb->args[3]; + u32 iter_bkt; + u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0; + struct netlbl_unlhsh_iface *iface; + struct list_head *iter_list; + struct netlbl_af4list *addr4; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct netlbl_af6list *addr6; +#endif + + cb_arg.nl_cb = cb; + cb_arg.skb = skb; + cb_arg.seq = cb->nlh->nlmsg_seq; + + rcu_read_lock(); + for (iter_bkt = skip_bkt; + iter_bkt < rcu_dereference(netlbl_unlhsh)->size; + iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) { + iter_list = &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt]; + list_for_each_entry_rcu(iface, iter_list, list) { + if (!iface->valid || + iter_chain++ < skip_chain) + continue; + netlbl_af4list_foreach_rcu(addr4, + &iface->addr4_list) { + if (iter_addr4++ < skip_addr4) + continue; + if (netlbl_unlabel_staticlist_gen( + NLBL_UNLABEL_C_STATICLIST, + iface, + netlbl_unlhsh_addr4_entry(addr4), + NULL, + &cb_arg) < 0) { + iter_addr4--; + iter_chain--; + goto unlabel_staticlist_return; + } + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + netlbl_af6list_foreach_rcu(addr6, + &iface->addr6_list) { + if (iter_addr6++ < skip_addr6) + continue; + if (netlbl_unlabel_staticlist_gen( + NLBL_UNLABEL_C_STATICLIST, + iface, + NULL, + netlbl_unlhsh_addr6_entry(addr6), + &cb_arg) < 0) { + iter_addr6--; + iter_chain--; + goto unlabel_staticlist_return; + } + } +#endif /* IPv6 */ + } + } + +unlabel_staticlist_return: + rcu_read_unlock(); + cb->args[0] = skip_bkt; + cb->args[1] = skip_chain; + cb->args[2] = skip_addr4; + cb->args[3] = skip_addr6; + return skb->len; +} + +/** + * netlbl_unlabel_staticlistdef - Handle a STATICLISTDEF message + * @skb: the NETLINK buffer + * @cb: the NETLINK callback + * + * Description: + * Process a user generated STATICLISTDEF message and dump the default + * unlabeled connection entry in a form suitable for use in a kernel generated + * STATICLISTDEF message. Returns the length of @skb. 
+ * + */ +static int netlbl_unlabel_staticlistdef(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct netlbl_unlhsh_walk_arg cb_arg; + struct netlbl_unlhsh_iface *iface; + u32 skip_addr4 = cb->args[0]; + u32 skip_addr6 = cb->args[1]; + u32 iter_addr4 = 0; + struct netlbl_af4list *addr4; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + u32 iter_addr6 = 0; + struct netlbl_af6list *addr6; +#endif + + cb_arg.nl_cb = cb; + cb_arg.skb = skb; + cb_arg.seq = cb->nlh->nlmsg_seq; + + rcu_read_lock(); + iface = rcu_dereference(netlbl_unlhsh_def); + if (iface == NULL || !iface->valid) + goto unlabel_staticlistdef_return; + + netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) { + if (iter_addr4++ < skip_addr4) + continue; + if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, + iface, + netlbl_unlhsh_addr4_entry(addr4), + NULL, + &cb_arg) < 0) { + iter_addr4--; + goto unlabel_staticlistdef_return; + } + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) { + if (iter_addr6++ < skip_addr6) + continue; + if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, + iface, + NULL, + netlbl_unlhsh_addr6_entry(addr6), + &cb_arg) < 0) { + iter_addr6--; + goto unlabel_staticlistdef_return; + } + } +#endif /* IPv6 */ + +unlabel_staticlistdef_return: + rcu_read_unlock(); + cb->args[0] = skip_addr4; + cb->args[1] = skip_addr6; + return skb->len; +} + +/* + * NetLabel Generic NETLINK Command Definitions + */ + +static struct genl_ops netlbl_unlabel_genl_ops[] = { + { + .cmd = NLBL_UNLABEL_C_STATICADD, + .flags = GENL_ADMIN_PERM, + .policy = netlbl_unlabel_genl_policy, + .doit = netlbl_unlabel_staticadd, + .dumpit = NULL, + }, + { + .cmd = NLBL_UNLABEL_C_STATICREMOVE, + .flags = GENL_ADMIN_PERM, + .policy = netlbl_unlabel_genl_policy, + .doit = netlbl_unlabel_staticremove, + .dumpit = NULL, + }, + { + .cmd = NLBL_UNLABEL_C_STATICLIST, + .flags = 0, + .policy = netlbl_unlabel_genl_policy, + .doit = NULL, + .dumpit = netlbl_unlabel_staticlist, + }, + { + .cmd = NLBL_UNLABEL_C_STATICADDDEF, + .flags = GENL_ADMIN_PERM, + .policy = netlbl_unlabel_genl_policy, + .doit = netlbl_unlabel_staticadddef, + .dumpit = NULL, + }, + { + .cmd = NLBL_UNLABEL_C_STATICREMOVEDEF, + .flags = GENL_ADMIN_PERM, + .policy = netlbl_unlabel_genl_policy, + .doit = netlbl_unlabel_staticremovedef, + .dumpit = NULL, + }, + { + .cmd = NLBL_UNLABEL_C_STATICLISTDEF, + .flags = 0, + .policy = netlbl_unlabel_genl_policy, + .doit = NULL, + .dumpit = netlbl_unlabel_staticlistdef, + }, + { + .cmd = NLBL_UNLABEL_C_ACCEPT, + .flags = GENL_ADMIN_PERM, + .policy = netlbl_unlabel_genl_policy, + .doit = netlbl_unlabel_accept, + .dumpit = NULL, + }, + { + .cmd = NLBL_UNLABEL_C_LIST, + .flags = 0, + .policy = netlbl_unlabel_genl_policy, + .doit = netlbl_unlabel_list, + .dumpit = NULL, + }, +}; + +/* + * NetLabel Generic NETLINK Protocol Functions + */ + +/** + * netlbl_unlabel_genl_init - Register the Unlabeled NetLabel component + * + * Description: + * Register the unlabeled packet NetLabel component with the Generic NETLINK + * mechanism. Returns zero on success, negative values on failure. 
+ * + */ +int __init netlbl_unlabel_genl_init(void) +{ + return genl_register_family_with_ops(&netlbl_unlabel_gnl_family, + netlbl_unlabel_genl_ops, ARRAY_SIZE(netlbl_unlabel_genl_ops)); +} + +/* + * NetLabel KAPI Hooks + */ + +static struct notifier_block netlbl_unlhsh_netdev_notifier = { + .notifier_call = netlbl_unlhsh_netdev_handler, +}; + +/** + * netlbl_unlabel_init - Initialize the unlabeled connection hash table + * @size: the number of bits to use for the hash buckets + * + * Description: + * Initializes the unlabeled connection hash table and registers a network + * device notification handler. This function should only be called by the + * NetLabel subsystem itself during initialization. Returns zero on success, + * non-zero values on error. + * + */ +int __init netlbl_unlabel_init(u32 size) +{ + u32 iter; + struct netlbl_unlhsh_tbl *hsh_tbl; + + if (size == 0) + return -EINVAL; + + hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL); + if (hsh_tbl == NULL) + return -ENOMEM; + hsh_tbl->size = 1 << size; + hsh_tbl->tbl = kcalloc(hsh_tbl->size, + sizeof(struct list_head), + GFP_KERNEL); + if (hsh_tbl->tbl == NULL) { + kfree(hsh_tbl); + return -ENOMEM; + } + for (iter = 0; iter < hsh_tbl->size; iter++) + INIT_LIST_HEAD(&hsh_tbl->tbl[iter]); + + rcu_read_lock(); + spin_lock(&netlbl_unlhsh_lock); + rcu_assign_pointer(netlbl_unlhsh, hsh_tbl); + spin_unlock(&netlbl_unlhsh_lock); + rcu_read_unlock(); + + register_netdevice_notifier(&netlbl_unlhsh_netdev_notifier); + + return 0; +} + +/** + * netlbl_unlabel_getattr - Get the security attributes for an unlabled packet + * @skb: the packet + * @family: protocol family + * @secattr: the security attributes + * + * Description: + * Determine the security attributes, if any, for an unlabled packet and return + * them in @secattr. Returns zero on success and negative values on failure. + * + */ +int netlbl_unlabel_getattr(const struct sk_buff *skb, + u16 family, + struct netlbl_lsm_secattr *secattr) +{ + struct netlbl_unlhsh_iface *iface; + + rcu_read_lock(); + iface = netlbl_unlhsh_search_iface(skb->skb_iif); + if (iface == NULL) + iface = rcu_dereference(netlbl_unlhsh_def); + if (iface == NULL || !iface->valid) + goto unlabel_getattr_nolabel; + switch (family) { + case PF_INET: { + struct iphdr *hdr4; + struct netlbl_af4list *addr4; + + hdr4 = ip_hdr(skb); + addr4 = netlbl_af4list_search(hdr4->saddr, + &iface->addr4_list); + if (addr4 == NULL) + goto unlabel_getattr_nolabel; + secattr->attr.secid = netlbl_unlhsh_addr4_entry(addr4)->secid; + break; + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case PF_INET6: { + struct ipv6hdr *hdr6; + struct netlbl_af6list *addr6; + + hdr6 = ipv6_hdr(skb); + addr6 = netlbl_af6list_search(&hdr6->saddr, + &iface->addr6_list); + if (addr6 == NULL) + goto unlabel_getattr_nolabel; + secattr->attr.secid = netlbl_unlhsh_addr6_entry(addr6)->secid; + break; + } +#endif /* IPv6 */ + default: + goto unlabel_getattr_nolabel; + } + rcu_read_unlock(); + + secattr->flags |= NETLBL_SECATTR_SECID; + secattr->type = NETLBL_NLTYPE_UNLABELED; + return 0; + +unlabel_getattr_nolabel: + rcu_read_unlock(); + if (netlabel_unlabel_acceptflg == 0) + return -ENOMSG; + secattr->type = NETLBL_NLTYPE_UNLABELED; + return 0; +} + +/** + * netlbl_unlabel_defconf - Set the default config to allow unlabeled packets + * + * Description: + * Set the default NetLabel configuration to allow incoming unlabeled packets + * and to send unlabeled network traffic by default. 
+ * + */ +int __init netlbl_unlabel_defconf(void) +{ + int ret_val; + struct netlbl_dom_map *entry; + struct netlbl_audit audit_info; + + /* Only the kernel is allowed to call this function and the only time + * it is called is at bootup before the audit subsystem is reporting + * messages so don't worry to much about these values. */ + security_task_getsecid(current, &audit_info.secid); + audit_info.loginuid = 0; + audit_info.sessionid = 0; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (entry == NULL) + return -ENOMEM; + entry->type = NETLBL_NLTYPE_UNLABELED; + ret_val = netlbl_domhsh_add_default(entry, &audit_info); + if (ret_val != 0) + return ret_val; + + netlbl_unlabel_acceptflg_set(1, &audit_info); + + return 0; +} diff --git a/net/netlabel/netlabel_unlabeled.h b/net/netlabel/netlabel_unlabeled.h new file mode 100644 index 00000000..0bc8dc3f --- /dev/null +++ b/net/netlabel/netlabel_unlabeled.h @@ -0,0 +1,246 @@ +/* + * NetLabel Unlabeled Support + * + * This file defines functions for dealing with unlabeled packets for the + * NetLabel system. The NetLabel system manages static and dynamic label + * mappings for network protocols such as CIPSO and RIPSO. + * + * Author: Paul Moore + * + */ + +/* + * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#ifndef _NETLABEL_UNLABELED_H +#define _NETLABEL_UNLABELED_H + +#include + +/* + * The following NetLabel payloads are supported by the Unlabeled subsystem. + * + * o STATICADD + * This message is sent from an application to add a new static label for + * incoming unlabeled connections. + * + * Required attributes: + * + * NLBL_UNLABEL_A_IFACE + * NLBL_UNLABEL_A_SECCTX + * + * If IPv4 is specified the following attributes are required: + * + * NLBL_UNLABEL_A_IPV4ADDR + * NLBL_UNLABEL_A_IPV4MASK + * + * If IPv6 is specified the following attributes are required: + * + * NLBL_UNLABEL_A_IPV6ADDR + * NLBL_UNLABEL_A_IPV6MASK + * + * o STATICREMOVE + * This message is sent from an application to remove an existing static + * label for incoming unlabeled connections. + * + * Required attributes: + * + * NLBL_UNLABEL_A_IFACE + * + * If IPv4 is specified the following attributes are required: + * + * NLBL_UNLABEL_A_IPV4ADDR + * NLBL_UNLABEL_A_IPV4MASK + * + * If IPv6 is specified the following attributes are required: + * + * NLBL_UNLABEL_A_IPV6ADDR + * NLBL_UNLABEL_A_IPV6MASK + * + * o STATICLIST + * This message can be sent either from an application or by the kernel in + * response to an application generated STATICLIST message. When sent by an + * application there is no payload and the NLM_F_DUMP flag should be set. + * The kernel should response with a series of the following messages. 
+ * + * Required attributes: + * + * NLBL_UNLABEL_A_IFACE + * NLBL_UNLABEL_A_SECCTX + * + * If IPv4 is specified the following attributes are required: + * + * NLBL_UNLABEL_A_IPV4ADDR + * NLBL_UNLABEL_A_IPV4MASK + * + * If IPv6 is specified the following attributes are required: + * + * NLBL_UNLABEL_A_IPV6ADDR + * NLBL_UNLABEL_A_IPV6MASK + * + * o STATICADDDEF + * This message is sent from an application to set the default static + * label for incoming unlabeled connections. + * + * Required attribute: + * + * NLBL_UNLABEL_A_SECCTX + * + * If IPv4 is specified the following attributes are required: + * + * NLBL_UNLABEL_A_IPV4ADDR + * NLBL_UNLABEL_A_IPV4MASK + * + * If IPv6 is specified the following attributes are required: + * + * NLBL_UNLABEL_A_IPV6ADDR + * NLBL_UNLABEL_A_IPV6MASK + * + * o STATICREMOVEDEF + * This message is sent from an application to remove the existing default + * static label for incoming unlabeled connections. + * + * If IPv4 is specified the following attributes are required: + * + * NLBL_UNLABEL_A_IPV4ADDR + * NLBL_UNLABEL_A_IPV4MASK + * + * If IPv6 is specified the following attributes are required: + * + * NLBL_UNLABEL_A_IPV6ADDR + * NLBL_UNLABEL_A_IPV6MASK + * + * o STATICLISTDEF + * This message can be sent either from an application or by the kernel in + * response to an application generated STATICLISTDEF message. When sent by + * an application there is no payload and the NLM_F_DUMP flag should be set. + * The kernel should response with the following message. + * + * Required attribute: + * + * NLBL_UNLABEL_A_SECCTX + * + * If IPv4 is specified the following attributes are required: + * + * NLBL_UNLABEL_A_IPV4ADDR + * NLBL_UNLABEL_A_IPV4MASK + * + * If IPv6 is specified the following attributes are required: + * + * NLBL_UNLABEL_A_IPV6ADDR + * NLBL_UNLABEL_A_IPV6MASK + * + * o ACCEPT + * This message is sent from an application to specify if the kernel should + * allow unlabled packets to pass if they do not match any of the static + * mappings defined in the unlabeled module. + * + * Required attributes: + * + * NLBL_UNLABEL_A_ACPTFLG + * + * o LIST + * This message can be sent either from an application or by the kernel in + * response to an application generated LIST message. When sent by an + * application there is no payload. The kernel should respond to a LIST + * message with a LIST message on success. 
+ * + * Required attributes: + * + * NLBL_UNLABEL_A_ACPTFLG + * + */ + +/* NetLabel Unlabeled commands */ +enum { + NLBL_UNLABEL_C_UNSPEC, + NLBL_UNLABEL_C_ACCEPT, + NLBL_UNLABEL_C_LIST, + NLBL_UNLABEL_C_STATICADD, + NLBL_UNLABEL_C_STATICREMOVE, + NLBL_UNLABEL_C_STATICLIST, + NLBL_UNLABEL_C_STATICADDDEF, + NLBL_UNLABEL_C_STATICREMOVEDEF, + NLBL_UNLABEL_C_STATICLISTDEF, + __NLBL_UNLABEL_C_MAX, +}; + +/* NetLabel Unlabeled attributes */ +enum { + NLBL_UNLABEL_A_UNSPEC, + NLBL_UNLABEL_A_ACPTFLG, + /* (NLA_U8) + * if true then unlabeled packets are allowed to pass, else unlabeled + * packets are rejected */ + NLBL_UNLABEL_A_IPV6ADDR, + /* (NLA_BINARY, struct in6_addr) + * an IPv6 address */ + NLBL_UNLABEL_A_IPV6MASK, + /* (NLA_BINARY, struct in6_addr) + * an IPv6 address mask */ + NLBL_UNLABEL_A_IPV4ADDR, + /* (NLA_BINARY, struct in_addr) + * an IPv4 address */ + NLBL_UNLABEL_A_IPV4MASK, + /* (NLA_BINARY, struct in_addr) + * an IPv4 address mask */ + NLBL_UNLABEL_A_IFACE, + /* (NLA_NULL_STRING) + * network interface */ + NLBL_UNLABEL_A_SECCTX, + /* (NLA_BINARY) + * an LSM specific security context */ + __NLBL_UNLABEL_A_MAX, +}; +#define NLBL_UNLABEL_A_MAX (__NLBL_UNLABEL_A_MAX - 1) + +/* NetLabel protocol functions */ +int netlbl_unlabel_genl_init(void); + +/* Unlabeled connection hash table size */ +/* XXX - currently this number is an uneducated guess */ +#define NETLBL_UNLHSH_BITSIZE 7 + +/* General Unlabeled init function */ +int netlbl_unlabel_init(u32 size); + +/* Static/Fallback label management functions */ +int netlbl_unlhsh_add(struct net *net, + const char *dev_name, + const void *addr, + const void *mask, + u32 addr_len, + u32 secid, + struct netlbl_audit *audit_info); +int netlbl_unlhsh_remove(struct net *net, + const char *dev_name, + const void *addr, + const void *mask, + u32 addr_len, + struct netlbl_audit *audit_info); + +/* Process Unlabeled incoming network packets */ +int netlbl_unlabel_getattr(const struct sk_buff *skb, + u16 family, + struct netlbl_lsm_secattr *secattr); + +/* Set the default configuration to allow Unlabeled packets */ +int netlbl_unlabel_defconf(void); + +#endif diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c new file mode 100644 index 00000000..a3fd75ac --- /dev/null +++ b/net/netlabel/netlabel_user.c @@ -0,0 +1,124 @@ +/* + * NetLabel NETLINK Interface + * + * This file defines the NETLINK interface for the NetLabel system. The + * NetLabel system manages static and dynamic label mappings for network + * protocols such as CIPSO and RIPSO. + * + * Author: Paul Moore + * + */ + +/* + * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details.
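
As an illustration only (not part of this patch), here is a minimal userspace sketch that composes a STATICADD request against the command and attribute enums defined in netlabel_unlabeled.h above, using libnl-3. The generic netlink family name "NLBL_UNLBL" and the protocol version are assumptions (the family registration lives in netlabel_unlabeled.c, outside this hunk), and the interface, addresses and security context are placeholders.

/*
 * Userspace sketch (libnl-3 assumed). The numeric values mirror the enums
 * in netlabel_unlabeled.h above; "NLBL_UNLBL" is an assumed family name.
 */
#include <string.h>
#include <arpa/inet.h>
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>

#define NLBL_UNLABEL_C_STATICADD	3	/* from the command enum above */
#define NLBL_UNLABEL_A_IPV4ADDR		4	/* from the attribute enum above */
#define NLBL_UNLABEL_A_IPV4MASK		5
#define NLBL_UNLABEL_A_IFACE		6
#define NLBL_UNLABEL_A_SECCTX		7

static int nlbl_staticadd_ipv4(const char *iface, const char *secctx,
			       const char *addr, const char *mask)
{
	struct nl_sock *sk;
	struct nl_msg *msg;
	struct in_addr a4, m4;
	int family, ret = -1;

	if (inet_pton(AF_INET, addr, &a4) != 1 ||
	    inet_pton(AF_INET, mask, &m4) != 1)
		return -1;

	sk = nl_socket_alloc();
	if (sk == NULL)
		return -1;
	if (genl_connect(sk) < 0)
		goto out_sock;
	family = genl_ctrl_resolve(sk, "NLBL_UNLBL");	/* assumed family name */
	if (family < 0)
		goto out_sock;

	msg = nlmsg_alloc();
	if (msg == NULL)
		goto out_sock;
	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
		    NLBL_UNLABEL_C_STATICADD, 1 /* protocol version assumed */);
	nla_put_string(msg, NLBL_UNLABEL_A_IFACE, iface);
	nla_put(msg, NLBL_UNLABEL_A_SECCTX, strlen(secctx) + 1, secctx);
	nla_put(msg, NLBL_UNLABEL_A_IPV4ADDR, sizeof(a4), &a4);
	nla_put(msg, NLBL_UNLABEL_A_IPV4MASK, sizeof(m4), &m4);

	ret = nl_send_auto(sk, msg) < 0 ? -1 : 0;
	nlmsg_free(msg);
out_sock:
	nl_socket_free(sk);
	return ret;
}
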
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "netlabel_mgmt.h" +#include "netlabel_unlabeled.h" +#include "netlabel_cipso_v4.h" +#include "netlabel_user.h" + +/* + * NetLabel NETLINK Setup Functions + */ + +/** + * netlbl_netlink_init - Initialize the NETLINK communication channel + * + * Description: + * Call out to the NetLabel components so they can register their families and + * commands with the Generic NETLINK mechanism. Returns zero on success and + * non-zero on failure. + * + */ +int __init netlbl_netlink_init(void) +{ + int ret_val; + + ret_val = netlbl_mgmt_genl_init(); + if (ret_val != 0) + return ret_val; + + ret_val = netlbl_cipsov4_genl_init(); + if (ret_val != 0) + return ret_val; + + ret_val = netlbl_unlabel_genl_init(); + if (ret_val != 0) + return ret_val; + + return 0; +} + +/* + * NetLabel Audit Functions + */ + +/** + * netlbl_audit_start_common - Start an audit message + * @type: audit message type + * @audit_info: NetLabel audit information + * + * Description: + * Start an audit message using the type specified in @type and fill the audit + * message with some fields common to all NetLabel audit messages. Returns + * a pointer to the audit buffer on success, NULL on failure. + * + */ +struct audit_buffer *netlbl_audit_start_common(int type, + struct netlbl_audit *audit_info) +{ + struct audit_buffer *audit_buf; + char *secctx; + u32 secctx_len; + + if (audit_enabled == 0) + return NULL; + + audit_buf = audit_log_start(current->audit_context, GFP_ATOMIC, type); + if (audit_buf == NULL) + return NULL; + + audit_log_format(audit_buf, "netlabel: auid=%u ses=%u", + audit_info->loginuid, + audit_info->sessionid); + + if (audit_info->secid != 0 && + security_secid_to_secctx(audit_info->secid, + &secctx, + &secctx_len) == 0) { + audit_log_format(audit_buf, " subj=%s", secctx); + security_release_secctx(secctx, secctx_len); + } + + return audit_buf; +} diff --git a/net/netlabel/netlabel_user.h b/net/netlabel/netlabel_user.h new file mode 100644 index 00000000..f4fc4c9a --- /dev/null +++ b/net/netlabel/netlabel_user.h @@ -0,0 +1,66 @@ +/* + * NetLabel NETLINK Interface + * + * This file defines the NETLINK interface for the NetLabel system. The + * NetLabel system manages static and dynamic label mappings for network + * protocols such as CIPSO and RIPSO. + * + * Author: Paul Moore + * + */ + +/* + * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
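
netlbl_audit_start_common() above only opens the audit record and fills in the common auid/ses/subj fields; callers are expected to append their own fields and close the record. A hypothetical caller sketch (not part of this patch); the AUDIT_MAC_UNLBL_ALLOW type and the extra field name are illustrative assumptions.

/* Hypothetical caller of netlbl_audit_start_common(); illustrative only. */
static void example_audit_acceptflg(u32 value, struct netlbl_audit *audit_info)
{
	struct audit_buffer *audit_buf;

	audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW, audit_info);
	if (audit_buf != NULL) {
		/* append subsystem specific fields, then close the record */
		audit_log_format(audit_buf, " unlbl_accept=%u", value);
		audit_log_end(audit_buf);
	}
}
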
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#ifndef _NETLABEL_USER_H +#define _NETLABEL_USER_H + +#include +#include +#include +#include +#include +#include +#include + +/* NetLabel NETLINK helper functions */ + +/** + * netlbl_netlink_auditinfo - Fetch the audit information from a NETLINK msg + * @skb: the packet + * @audit_info: NetLabel audit information + */ +static inline void netlbl_netlink_auditinfo(struct sk_buff *skb, + struct netlbl_audit *audit_info) +{ + security_task_getsecid(current, &audit_info->secid); + audit_info->loginuid = audit_get_loginuid(current); + audit_info->sessionid = audit_get_sessionid(current); +} + +/* NetLabel NETLINK I/O functions */ + +int netlbl_netlink_init(void); + +/* NetLabel Audit Functions */ + +struct audit_buffer *netlbl_audit_start_common(int type, + struct netlbl_audit *audit_info); + +#endif diff --git a/net/netlink/Makefile b/net/netlink/Makefile new file mode 100644 index 00000000..bdd6ddf4 --- /dev/null +++ b/net/netlink/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the netlink driver. +# + +obj-y := af_netlink.o genetlink.o diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c new file mode 100644 index 00000000..24bc620b --- /dev/null +++ b/net/netlink/af_netlink.c @@ -0,0 +1,2165 @@ +/* + * NETLINK Kernel-user communication protocol. + * + * Authors: Alan Cox + * Alexey Kuznetsov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith + * added netlink_proto_exit + * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. 
de Melo + * use nlk_sk, as sk->protinfo is on a diet 8) + * Fri Jul 22 19:51:12 MEST 2005 Harald Welte + * - inc module use count of module that owns + * the kernel socket in case userspace opens + * socket of same protocol + * - remove all module support, since netlink is + * mandatory if CONFIG_NET=y these days + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) +#define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) + +struct netlink_sock { + /* struct sock has to be the first member of netlink_sock */ + struct sock sk; + u32 pid; + u32 dst_pid; + u32 dst_group; + u32 flags; + u32 subscriptions; + u32 ngroups; + unsigned long *groups; + unsigned long state; + wait_queue_head_t wait; + struct netlink_callback *cb; + struct mutex *cb_mutex; + struct mutex cb_def_mutex; + void (*netlink_rcv)(struct sk_buff *skb); + struct module *module; +}; + +struct listeners { + struct rcu_head rcu; + unsigned long masks[0]; +}; + +#define NETLINK_KERNEL_SOCKET 0x1 +#define NETLINK_RECV_PKTINFO 0x2 +#define NETLINK_BROADCAST_SEND_ERROR 0x4 +#define NETLINK_RECV_NO_ENOBUFS 0x8 + +static inline struct netlink_sock *nlk_sk(struct sock *sk) +{ + return container_of(sk, struct netlink_sock, sk); +} + +static inline int netlink_is_kernel(struct sock *sk) +{ + return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET; +} + +struct nl_pid_hash { + struct hlist_head *table; + unsigned long rehash_time; + + unsigned int mask; + unsigned int shift; + + unsigned int entries; + unsigned int max_shift; + + u32 rnd; +}; + +struct netlink_table { + struct nl_pid_hash hash; + struct hlist_head mc_list; + struct listeners __rcu *listeners; + unsigned int nl_nonroot; + unsigned int groups; + struct mutex *cb_mutex; + struct module *module; + int registered; +}; + +static struct netlink_table *nl_table; + +static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); + +static int netlink_dump(struct sock *sk); +static void netlink_destroy_callback(struct netlink_callback *cb); + +static DEFINE_RWLOCK(nl_table_lock); +static atomic_t nl_table_users = ATOMIC_INIT(0); + +static ATOMIC_NOTIFIER_HEAD(netlink_chain); + +static u32 netlink_group_mask(u32 group) +{ + return group ? 1 << (group - 1) : 0; +} + +static struct hlist_head *nl_pid_hashfn(struct nl_pid_hash *hash, u32 pid) +{ + return &hash->table[jhash_1word(pid, hash->rnd) & hash->mask]; +} + +static void netlink_sock_destruct(struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (nlk->cb) { + if (nlk->cb->done) + nlk->cb->done(nlk->cb); + netlink_destroy_callback(nlk->cb); + } + + skb_queue_purge(&sk->sk_receive_queue); + + if (!sock_flag(sk, SOCK_DEAD)) { + printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); + return; + } + + WARN_ON(atomic_read(&sk->sk_rmem_alloc)); + WARN_ON(atomic_read(&sk->sk_wmem_alloc)); + WARN_ON(nlk_sk(sk)->groups); +} + +/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on + * SMP. Look, when several writers sleep and reader wakes them up, all but one + * immediately hit write lock and grab all the cpus. Exclusive sleep solves + * this, _but_ remember, it adds useless work on UP machines. 
+ */ + +void netlink_table_grab(void) + __acquires(nl_table_lock) +{ + might_sleep(); + + write_lock_irq(&nl_table_lock); + + if (atomic_read(&nl_table_users)) { + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(&nl_table_wait, &wait); + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (atomic_read(&nl_table_users) == 0) + break; + write_unlock_irq(&nl_table_lock); + schedule(); + write_lock_irq(&nl_table_lock); + } + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&nl_table_wait, &wait); + } +} + +void netlink_table_ungrab(void) + __releases(nl_table_lock) +{ + write_unlock_irq(&nl_table_lock); + wake_up(&nl_table_wait); +} + +static inline void +netlink_lock_table(void) +{ + /* read_lock() synchronizes us to netlink_table_grab */ + + read_lock(&nl_table_lock); + atomic_inc(&nl_table_users); + read_unlock(&nl_table_lock); +} + +static inline void +netlink_unlock_table(void) +{ + if (atomic_dec_and_test(&nl_table_users)) + wake_up(&nl_table_wait); +} + +static inline struct sock *netlink_lookup(struct net *net, int protocol, + u32 pid) +{ + struct nl_pid_hash *hash = &nl_table[protocol].hash; + struct hlist_head *head; + struct sock *sk; + struct hlist_node *node; + + read_lock(&nl_table_lock); + head = nl_pid_hashfn(hash, pid); + sk_for_each(sk, node, head) { + if (net_eq(sock_net(sk), net) && (nlk_sk(sk)->pid == pid)) { + sock_hold(sk); + goto found; + } + } + sk = NULL; +found: + read_unlock(&nl_table_lock); + return sk; +} + +static inline struct hlist_head *nl_pid_hash_zalloc(size_t size) +{ + if (size <= PAGE_SIZE) + return kzalloc(size, GFP_ATOMIC); + else + return (struct hlist_head *) + __get_free_pages(GFP_ATOMIC | __GFP_ZERO, + get_order(size)); +} + +static inline void nl_pid_hash_free(struct hlist_head *table, size_t size) +{ + if (size <= PAGE_SIZE) + kfree(table); + else + free_pages((unsigned long)table, get_order(size)); +} + +static int nl_pid_hash_rehash(struct nl_pid_hash *hash, int grow) +{ + unsigned int omask, mask, shift; + size_t osize, size; + struct hlist_head *otable, *table; + int i; + + omask = mask = hash->mask; + osize = size = (mask + 1) * sizeof(*table); + shift = hash->shift; + + if (grow) { + if (++shift > hash->max_shift) + return 0; + mask = mask * 2 + 1; + size *= 2; + } + + table = nl_pid_hash_zalloc(size); + if (!table) + return 0; + + otable = hash->table; + hash->table = table; + hash->mask = mask; + hash->shift = shift; + get_random_bytes(&hash->rnd, sizeof(hash->rnd)); + + for (i = 0; i <= omask; i++) { + struct sock *sk; + struct hlist_node *node, *tmp; + + sk_for_each_safe(sk, node, tmp, &otable[i]) + __sk_add_node(sk, nl_pid_hashfn(hash, nlk_sk(sk)->pid)); + } + + nl_pid_hash_free(otable, osize); + hash->rehash_time = jiffies + 10 * 60 * HZ; + return 1; +} + +static inline int nl_pid_hash_dilute(struct nl_pid_hash *hash, int len) +{ + int avg = hash->entries >> hash->shift; + + if (unlikely(avg > 1) && nl_pid_hash_rehash(hash, 1)) + return 1; + + if (unlikely(len > avg) && time_after(jiffies, hash->rehash_time)) { + nl_pid_hash_rehash(hash, 0); + return 1; + } + + return 0; +} + +static const struct proto_ops netlink_ops; + +static void +netlink_update_listeners(struct sock *sk) +{ + struct netlink_table *tbl = &nl_table[sk->sk_protocol]; + struct hlist_node *node; + unsigned long mask; + unsigned int i; + + for (i = 0; i < NLGRPLONGS(tbl->groups); i++) { + mask = 0; + sk_for_each_bound(sk, node, &tbl->mc_list) { + if (i < NLGRPLONGS(nlk_sk(sk)->ngroups)) + mask |= nlk_sk(sk)->groups[i]; + } + 
tbl->listeners->masks[i] = mask; + } + /* this function is only called with the netlink table "grabbed", which + * makes sure updates are visible before bind or setsockopt return. */ +} + +static int netlink_insert(struct sock *sk, struct net *net, u32 pid) +{ + struct nl_pid_hash *hash = &nl_table[sk->sk_protocol].hash; + struct hlist_head *head; + int err = -EADDRINUSE; + struct sock *osk; + struct hlist_node *node; + int len; + + netlink_table_grab(); + head = nl_pid_hashfn(hash, pid); + len = 0; + sk_for_each(osk, node, head) { + if (net_eq(sock_net(osk), net) && (nlk_sk(osk)->pid == pid)) + break; + len++; + } + if (node) + goto err; + + err = -EBUSY; + if (nlk_sk(sk)->pid) + goto err; + + err = -ENOMEM; + if (BITS_PER_LONG > 32 && unlikely(hash->entries >= UINT_MAX)) + goto err; + + if (len && nl_pid_hash_dilute(hash, len)) + head = nl_pid_hashfn(hash, pid); + hash->entries++; + nlk_sk(sk)->pid = pid; + sk_add_node(sk, head); + err = 0; + +err: + netlink_table_ungrab(); + return err; +} + +static void netlink_remove(struct sock *sk) +{ + netlink_table_grab(); + if (sk_del_node_init(sk)) + nl_table[sk->sk_protocol].hash.entries--; + if (nlk_sk(sk)->subscriptions) + __sk_del_bind_node(sk); + netlink_table_ungrab(); +} + +static struct proto netlink_proto = { + .name = "NETLINK", + .owner = THIS_MODULE, + .obj_size = sizeof(struct netlink_sock), +}; + +static int __netlink_create(struct net *net, struct socket *sock, + struct mutex *cb_mutex, int protocol) +{ + struct sock *sk; + struct netlink_sock *nlk; + + sock->ops = &netlink_ops; + + sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto); + if (!sk) + return -ENOMEM; + + sock_init_data(sock, sk); + + nlk = nlk_sk(sk); + if (cb_mutex) + nlk->cb_mutex = cb_mutex; + else { + nlk->cb_mutex = &nlk->cb_def_mutex; + mutex_init(nlk->cb_mutex); + } + init_waitqueue_head(&nlk->wait); + + sk->sk_destruct = netlink_sock_destruct; + sk->sk_protocol = protocol; + return 0; +} + +static int netlink_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct module *module = NULL; + struct mutex *cb_mutex; + struct netlink_sock *nlk; + int err = 0; + + sock->state = SS_UNCONNECTED; + + if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) + return -ESOCKTNOSUPPORT; + + if (protocol < 0 || protocol >= MAX_LINKS) + return -EPROTONOSUPPORT; + + netlink_lock_table(); +#ifdef CONFIG_MODULES + if (!nl_table[protocol].registered) { + netlink_unlock_table(); + request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol); + netlink_lock_table(); + } +#endif + if (nl_table[protocol].registered && + try_module_get(nl_table[protocol].module)) + module = nl_table[protocol].module; + else + err = -EPROTONOSUPPORT; + cb_mutex = nl_table[protocol].cb_mutex; + netlink_unlock_table(); + + if (err < 0) + goto out; + + err = __netlink_create(net, sock, cb_mutex, protocol); + if (err < 0) + goto out_module; + + local_bh_disable(); + sock_prot_inuse_add(net, &netlink_proto, 1); + local_bh_enable(); + + nlk = nlk_sk(sock->sk); + nlk->module = module; +out: + return err; + +out_module: + module_put(module); + goto out; +} + +static int netlink_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk; + + if (!sk) + return 0; + + netlink_remove(sk); + sock_orphan(sk); + nlk = nlk_sk(sk); + + /* + * OK. Socket is unlinked, any packets that arrive now + * will be purged. 
+ */ + + sock->sk = NULL; + wake_up_interruptible_all(&nlk->wait); + + skb_queue_purge(&sk->sk_write_queue); + + if (nlk->pid) { + struct netlink_notify n = { + .net = sock_net(sk), + .protocol = sk->sk_protocol, + .pid = nlk->pid, + }; + atomic_notifier_call_chain(&netlink_chain, + NETLINK_URELEASE, &n); + } + + module_put(nlk->module); + + netlink_table_grab(); + if (netlink_is_kernel(sk)) { + BUG_ON(nl_table[sk->sk_protocol].registered == 0); + if (--nl_table[sk->sk_protocol].registered == 0) { + kfree(nl_table[sk->sk_protocol].listeners); + nl_table[sk->sk_protocol].module = NULL; + nl_table[sk->sk_protocol].registered = 0; + } + } else if (nlk->subscriptions) + netlink_update_listeners(sk); + netlink_table_ungrab(); + + kfree(nlk->groups); + nlk->groups = NULL; + + local_bh_disable(); + sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); + local_bh_enable(); + sock_put(sk); + return 0; +} + +static int netlink_autobind(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct nl_pid_hash *hash = &nl_table[sk->sk_protocol].hash; + struct hlist_head *head; + struct sock *osk; + struct hlist_node *node; + s32 pid = task_tgid_vnr(current); + int err; + static s32 rover = -4097; + +retry: + cond_resched(); + netlink_table_grab(); + head = nl_pid_hashfn(hash, pid); + sk_for_each(osk, node, head) { + if (!net_eq(sock_net(osk), net)) + continue; + if (nlk_sk(osk)->pid == pid) { + /* Bind collision, search negative pid values. */ + pid = rover--; + if (rover > -4097) + rover = -4097; + netlink_table_ungrab(); + goto retry; + } + } + netlink_table_ungrab(); + + err = netlink_insert(sk, net, pid); + if (err == -EADDRINUSE) + goto retry; + + /* If 2 threads race to autobind, that is fine. */ + if (err == -EBUSY) + err = 0; + + return err; +} + +static inline int netlink_capable(struct socket *sock, unsigned int flag) +{ + return (nl_table[sock->sk->sk_protocol].nl_nonroot & flag) || + capable(CAP_NET_ADMIN); +} + +static void +netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (nlk->subscriptions && !subscriptions) + __sk_del_bind_node(sk); + else if (!nlk->subscriptions && subscriptions) + sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list); + nlk->subscriptions = subscriptions; +} + +static int netlink_realloc_groups(struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + unsigned int groups; + unsigned long *new_groups; + int err = 0; + + netlink_table_grab(); + + groups = nl_table[sk->sk_protocol].groups; + if (!nl_table[sk->sk_protocol].registered) { + err = -ENOENT; + goto out_unlock; + } + + if (nlk->ngroups >= groups) + goto out_unlock; + + new_groups = krealloc(nlk->groups, NLGRPSZ(groups), GFP_ATOMIC); + if (new_groups == NULL) { + err = -ENOMEM; + goto out_unlock; + } + memset((char *)new_groups + NLGRPSZ(nlk->ngroups), 0, + NLGRPSZ(groups) - NLGRPSZ(nlk->ngroups)); + + nlk->groups = new_groups; + nlk->ngroups = groups; + out_unlock: + netlink_table_ungrab(); + return err; +} + +static int netlink_bind(struct socket *sock, struct sockaddr *addr, + int addr_len) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct netlink_sock *nlk = nlk_sk(sk); + struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; + int err; + + if (nladdr->nl_family != AF_NETLINK) + return -EINVAL; + + /* Only superuser is allowed to listen multicasts */ + if (nladdr->nl_groups) { + if (!netlink_capable(sock, NL_NONROOT_RECV)) + return -EPERM; + err = 
netlink_realloc_groups(sk); + if (err) + return err; + } + + if (nlk->pid) { + if (nladdr->nl_pid != nlk->pid) + return -EINVAL; + } else { + err = nladdr->nl_pid ? + netlink_insert(sk, net, nladdr->nl_pid) : + netlink_autobind(sock); + if (err) + return err; + } + + if (!nladdr->nl_groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) + return 0; + + netlink_table_grab(); + netlink_update_subscriptions(sk, nlk->subscriptions + + hweight32(nladdr->nl_groups) - + hweight32(nlk->groups[0])); + nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups; + netlink_update_listeners(sk); + netlink_table_ungrab(); + + return 0; +} + +static int netlink_connect(struct socket *sock, struct sockaddr *addr, + int alen, int flags) +{ + int err = 0; + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; + + if (alen < sizeof(addr->sa_family)) + return -EINVAL; + + if (addr->sa_family == AF_UNSPEC) { + sk->sk_state = NETLINK_UNCONNECTED; + nlk->dst_pid = 0; + nlk->dst_group = 0; + return 0; + } + if (addr->sa_family != AF_NETLINK) + return -EINVAL; + + /* Only superuser is allowed to send multicasts */ + if (nladdr->nl_groups && !netlink_capable(sock, NL_NONROOT_SEND)) + return -EPERM; + + if (!nlk->pid) + err = netlink_autobind(sock); + + if (err == 0) { + sk->sk_state = NETLINK_CONNECTED; + nlk->dst_pid = nladdr->nl_pid; + nlk->dst_group = ffs(nladdr->nl_groups); + } + + return err; +} + +static int netlink_getname(struct socket *sock, struct sockaddr *addr, + int *addr_len, int peer) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr); + + nladdr->nl_family = AF_NETLINK; + nladdr->nl_pad = 0; + *addr_len = sizeof(*nladdr); + + if (peer) { + nladdr->nl_pid = nlk->dst_pid; + nladdr->nl_groups = netlink_group_mask(nlk->dst_group); + } else { + nladdr->nl_pid = nlk->pid; + nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0; + } + return 0; +} + +static void netlink_overrun(struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) { + if (!test_and_set_bit(0, &nlk_sk(sk)->state)) { + sk->sk_err = ENOBUFS; + sk->sk_error_report(sk); + } + } + atomic_inc(&sk->sk_drops); +} + +static struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid) +{ + struct sock *sock; + struct netlink_sock *nlk; + + sock = netlink_lookup(sock_net(ssk), ssk->sk_protocol, pid); + if (!sock) + return ERR_PTR(-ECONNREFUSED); + + /* Don't bother queuing skb if kernel socket has no input function */ + nlk = nlk_sk(sock); + if (sock->sk_state == NETLINK_CONNECTED && + nlk->dst_pid != nlk_sk(ssk)->pid) { + sock_put(sock); + return ERR_PTR(-ECONNREFUSED); + } + return sock; +} + +struct sock *netlink_getsockbyfilp(struct file *filp) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct sock *sock; + + if (!S_ISSOCK(inode->i_mode)) + return ERR_PTR(-ENOTSOCK); + + sock = SOCKET_I(inode)->sk; + if (sock->sk_family != AF_NETLINK) + return ERR_PTR(-EINVAL); + + sock_hold(sock); + return sock; +} + +/* + * Attach a skb to a netlink socket. + * The caller must hold a reference to the destination socket. On error, the + * reference is dropped. The skb is not send to the destination, just all + * all error checks are performed and memory in the queue is reserved. + * Return values: + * < 0: error. skb freed, reference to sock dropped. 
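
For reference (not part of this patch), the bind and getname paths above as seen from userspace: binding with nl_pid == 0 makes the kernel pick a port via netlink_autobind(), nl_groups subscribes to multicast groups (subject to the netlink_capable() check above), and getsockname() lands in netlink_getname(). NETLINK_ROUTE and RTMGRP_LINK are used here only as a familiar example protocol and group.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
	struct sockaddr_nl sa, self;
	socklen_t len = sizeof(self);
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0)
		return 1;

	memset(&sa, 0, sizeof(sa));
	sa.nl_family = AF_NETLINK;
	sa.nl_pid = 0;			/* 0 => kernel assigns a pid (netlink_autobind) */
	sa.nl_groups = RTMGRP_LINK;	/* multicast subscription via the groups bitmap */
	if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
		return 1;

	getsockname(fd, (struct sockaddr *)&self, &len);	/* netlink_getname() */
	printf("bound as pid %u\n", self.nl_pid);
	return 0;
}
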
+ * 0: continue + * 1: repeat lookup - reference dropped while waiting for socket memory. + */ +int netlink_attachskb(struct sock *sk, struct sk_buff *skb, + long *timeo, struct sock *ssk) +{ + struct netlink_sock *nlk; + + nlk = nlk_sk(sk); + + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || + test_bit(0, &nlk->state)) { + DECLARE_WAITQUEUE(wait, current); + if (!*timeo) { + if (!ssk || netlink_is_kernel(ssk)) + netlink_overrun(sk); + sock_put(sk); + kfree_skb(skb); + return -EAGAIN; + } + + __set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&nlk->wait, &wait); + + if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || + test_bit(0, &nlk->state)) && + !sock_flag(sk, SOCK_DEAD)) + *timeo = schedule_timeout(*timeo); + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&nlk->wait, &wait); + sock_put(sk); + + if (signal_pending(current)) { + kfree_skb(skb); + return sock_intr_errno(*timeo); + } + return 1; + } + skb_set_owner_r(skb, sk); + return 0; +} + +static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) +{ + int len = skb->len; + + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, len); + return len; +} + +int netlink_sendskb(struct sock *sk, struct sk_buff *skb) +{ + int len = __netlink_sendskb(sk, skb); + + sock_put(sk); + return len; +} + +void netlink_detachskb(struct sock *sk, struct sk_buff *skb) +{ + kfree_skb(skb); + sock_put(sk); +} + +static inline struct sk_buff *netlink_trim(struct sk_buff *skb, + gfp_t allocation) +{ + int delta; + + skb_orphan(skb); + + delta = skb->end - skb->tail; + if (delta * 2 < skb->truesize) + return skb; + + if (skb_shared(skb)) { + struct sk_buff *nskb = skb_clone(skb, allocation); + if (!nskb) + return skb; + kfree_skb(skb); + skb = nskb; + } + + if (!pskb_expand_head(skb, 0, -delta, allocation)) + skb->truesize -= delta; + + return skb; +} + +static inline void netlink_rcv_wake(struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (skb_queue_empty(&sk->sk_receive_queue)) + clear_bit(0, &nlk->state); + if (!test_bit(0, &nlk->state)) + wake_up_interruptible(&nlk->wait); +} + +static inline int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb) +{ + int ret; + struct netlink_sock *nlk = nlk_sk(sk); + + ret = -ECONNREFUSED; + if (nlk->netlink_rcv != NULL) { + ret = skb->len; + skb_set_owner_r(skb, sk); + nlk->netlink_rcv(skb); + } + kfree_skb(skb); + sock_put(sk); + return ret; +} + +int netlink_unicast(struct sock *ssk, struct sk_buff *skb, + u32 pid, int nonblock) +{ + struct sock *sk; + int err; + long timeo; + + skb = netlink_trim(skb, gfp_any()); + + timeo = sock_sndtimeo(ssk, nonblock); +retry: + sk = netlink_getsockbypid(ssk, pid); + if (IS_ERR(sk)) { + kfree_skb(skb); + return PTR_ERR(sk); + } + if (netlink_is_kernel(sk)) + return netlink_unicast_kernel(sk, skb); + + if (sk_filter(sk, skb)) { + err = skb->len; + kfree_skb(skb); + sock_put(sk); + return err; + } + + err = netlink_attachskb(sk, skb, &timeo, ssk); + if (err == 1) + goto retry; + if (err) + return err; + + return netlink_sendskb(sk, skb); +} +EXPORT_SYMBOL(netlink_unicast); + +int netlink_has_listeners(struct sock *sk, unsigned int group) +{ + int res = 0; + struct listeners *listeners; + + BUG_ON(!netlink_is_kernel(sk)); + + rcu_read_lock(); + listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners); + + if (group - 1 < nl_table[sk->sk_protocol].groups) + res = test_bit(group - 1, listeners->masks); + + rcu_read_unlock(); + + return res; +} +EXPORT_SYMBOL_GPL(netlink_has_listeners); + +static 
inline int netlink_broadcast_deliver(struct sock *sk, + struct sk_buff *skb) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && + !test_bit(0, &nlk->state)) { + skb_set_owner_r(skb, sk); + __netlink_sendskb(sk, skb); + return atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf; + } + return -1; +} + +struct netlink_broadcast_data { + struct sock *exclude_sk; + struct net *net; + u32 pid; + u32 group; + int failure; + int delivery_failure; + int congested; + int delivered; + gfp_t allocation; + struct sk_buff *skb, *skb2; + int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data); + void *tx_data; +}; + +static inline int do_one_broadcast(struct sock *sk, + struct netlink_broadcast_data *p) +{ + struct netlink_sock *nlk = nlk_sk(sk); + int val; + + if (p->exclude_sk == sk) + goto out; + + if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups || + !test_bit(p->group - 1, nlk->groups)) + goto out; + + if (!net_eq(sock_net(sk), p->net)) + goto out; + + if (p->failure) { + netlink_overrun(sk); + goto out; + } + + sock_hold(sk); + if (p->skb2 == NULL) { + if (skb_shared(p->skb)) { + p->skb2 = skb_clone(p->skb, p->allocation); + } else { + p->skb2 = skb_get(p->skb); + /* + * skb ownership may have been set when + * delivered to a previous socket. + */ + skb_orphan(p->skb2); + } + } + if (p->skb2 == NULL) { + netlink_overrun(sk); + /* Clone failed. Notify ALL listeners. */ + p->failure = 1; + if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR) + p->delivery_failure = 1; + } else if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { + kfree_skb(p->skb2); + p->skb2 = NULL; + } else if (sk_filter(sk, p->skb2)) { + kfree_skb(p->skb2); + p->skb2 = NULL; + } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) { + netlink_overrun(sk); + if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR) + p->delivery_failure = 1; + } else { + p->congested |= val; + p->delivered = 1; + p->skb2 = NULL; + } + sock_put(sk); + +out: + return 0; +} + +int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 pid, + u32 group, gfp_t allocation, + int (*filter)(struct sock *dsk, struct sk_buff *skb, void *data), + void *filter_data) +{ + struct net *net = sock_net(ssk); + struct netlink_broadcast_data info; + struct hlist_node *node; + struct sock *sk; + + skb = netlink_trim(skb, allocation); + + info.exclude_sk = ssk; + info.net = net; + info.pid = pid; + info.group = group; + info.failure = 0; + info.delivery_failure = 0; + info.congested = 0; + info.delivered = 0; + info.allocation = allocation; + info.skb = skb; + info.skb2 = NULL; + info.tx_filter = filter; + info.tx_data = filter_data; + + /* While we sleep in clone, do not allow to change socket list */ + + netlink_lock_table(); + + sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list) + do_one_broadcast(sk, &info); + + consume_skb(skb); + + netlink_unlock_table(); + + if (info.delivery_failure) { + kfree_skb(info.skb2); + return -ENOBUFS; + } else + consume_skb(info.skb2); + + if (info.delivered) { + if (info.congested && (allocation & __GFP_WAIT)) + yield(); + return 0; + } + return -ESRCH; +} +EXPORT_SYMBOL(netlink_broadcast_filtered); + +int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, + u32 group, gfp_t allocation) +{ + return netlink_broadcast_filtered(ssk, skb, pid, group, allocation, + NULL, NULL); +} +EXPORT_SYMBOL(netlink_broadcast); + +struct netlink_set_err_data { + struct sock *exclude_sk; + u32 pid; + u32 group; + int code; +}; + +static inline 
int do_one_set_err(struct sock *sk, + struct netlink_set_err_data *p) +{ + struct netlink_sock *nlk = nlk_sk(sk); + int ret = 0; + + if (sk == p->exclude_sk) + goto out; + + if (!net_eq(sock_net(sk), sock_net(p->exclude_sk))) + goto out; + + if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups || + !test_bit(p->group - 1, nlk->groups)) + goto out; + + if (p->code == ENOBUFS && nlk->flags & NETLINK_RECV_NO_ENOBUFS) { + ret = 1; + goto out; + } + + sk->sk_err = p->code; + sk->sk_error_report(sk); +out: + return ret; +} + +/** + * netlink_set_err - report error to broadcast listeners + * @ssk: the kernel netlink socket, as returned by netlink_kernel_create() + * @pid: the PID of a process that we want to skip (if any) + * @groups: the broadcast group that will notice the error + * @code: error code, must be negative (as usual in kernelspace) + * + * This function returns the number of broadcast listeners that have set the + * NETLINK_RECV_NO_ENOBUFS socket option. + */ +int netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code) +{ + struct netlink_set_err_data info; + struct hlist_node *node; + struct sock *sk; + int ret = 0; + + info.exclude_sk = ssk; + info.pid = pid; + info.group = group; + /* sk->sk_err wants a positive error value */ + info.code = -code; + + read_lock(&nl_table_lock); + + sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list) + ret += do_one_set_err(sk, &info); + + read_unlock(&nl_table_lock); + return ret; +} +EXPORT_SYMBOL(netlink_set_err); + +/* must be called with netlink table grabbed */ +static void netlink_update_socket_mc(struct netlink_sock *nlk, + unsigned int group, + int is_new) +{ + int old, new = !!is_new, subscriptions; + + old = test_bit(group - 1, nlk->groups); + subscriptions = nlk->subscriptions - old + new; + if (new) + __set_bit(group - 1, nlk->groups); + else + __clear_bit(group - 1, nlk->groups); + netlink_update_subscriptions(&nlk->sk, subscriptions); + netlink_update_listeners(&nlk->sk); +} + +static int netlink_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + unsigned int val = 0; + int err; + + if (level != SOL_NETLINK) + return -ENOPROTOOPT; + + if (optlen >= sizeof(int) && + get_user(val, (unsigned int __user *)optval)) + return -EFAULT; + + switch (optname) { + case NETLINK_PKTINFO: + if (val) + nlk->flags |= NETLINK_RECV_PKTINFO; + else + nlk->flags &= ~NETLINK_RECV_PKTINFO; + err = 0; + break; + case NETLINK_ADD_MEMBERSHIP: + case NETLINK_DROP_MEMBERSHIP: { + if (!netlink_capable(sock, NL_NONROOT_RECV)) + return -EPERM; + err = netlink_realloc_groups(sk); + if (err) + return err; + if (!val || val - 1 >= nlk->ngroups) + return -EINVAL; + netlink_table_grab(); + netlink_update_socket_mc(nlk, val, + optname == NETLINK_ADD_MEMBERSHIP); + netlink_table_ungrab(); + err = 0; + break; + } + case NETLINK_BROADCAST_ERROR: + if (val) + nlk->flags |= NETLINK_BROADCAST_SEND_ERROR; + else + nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR; + err = 0; + break; + case NETLINK_NO_ENOBUFS: + if (val) { + nlk->flags |= NETLINK_RECV_NO_ENOBUFS; + clear_bit(0, &nlk->state); + wake_up_interruptible(&nlk->wait); + } else + nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS; + err = 0; + break; + default: + err = -ENOPROTOOPT; + } + return err; +} + +static int netlink_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk = 
nlk_sk(sk); + int len, val, err; + + if (level != SOL_NETLINK) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + + switch (optname) { + case NETLINK_PKTINFO: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0; + if (put_user(len, optlen) || + put_user(val, optval)) + return -EFAULT; + err = 0; + break; + case NETLINK_BROADCAST_ERROR: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_BROADCAST_SEND_ERROR ? 1 : 0; + if (put_user(len, optlen) || + put_user(val, optval)) + return -EFAULT; + err = 0; + break; + case NETLINK_NO_ENOBUFS: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_RECV_NO_ENOBUFS ? 1 : 0; + if (put_user(len, optlen) || + put_user(val, optval)) + return -EFAULT; + err = 0; + break; + default: + err = -ENOPROTOOPT; + } + return err; +} + +static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) +{ + struct nl_pktinfo info; + + info.group = NETLINK_CB(skb).dst_group; + put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); +} + +static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock_iocb *siocb = kiocb_to_siocb(kiocb); + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + struct sockaddr_nl *addr = msg->msg_name; + u32 dst_pid; + u32 dst_group; + struct sk_buff *skb; + int err; + struct scm_cookie scm; + + if (msg->msg_flags&MSG_OOB) + return -EOPNOTSUPP; + + if (NULL == siocb->scm) { + siocb->scm = &scm; + memset(&scm, 0, sizeof(scm)); + } + err = scm_send(sock, msg, siocb->scm); + if (err < 0) + return err; + + if (msg->msg_namelen) { + err = -EINVAL; + if (addr->nl_family != AF_NETLINK) + goto out; + dst_pid = addr->nl_pid; + dst_group = ffs(addr->nl_groups); + err = -EPERM; + if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND)) + goto out; + } else { + dst_pid = nlk->dst_pid; + dst_group = nlk->dst_group; + } + + if (!nlk->pid) { + err = netlink_autobind(sock); + if (err) + goto out; + } + + err = -EMSGSIZE; + if (len > sk->sk_sndbuf - 32) + goto out; + err = -ENOBUFS; + skb = alloc_skb(len, GFP_KERNEL); + if (skb == NULL) + goto out; + + NETLINK_CB(skb).pid = nlk->pid; + NETLINK_CB(skb).dst_group = dst_group; + memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); + + err = -EFAULT; + if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { + kfree_skb(skb); + goto out; + } + + err = security_netlink_send(sk, skb); + if (err) { + kfree_skb(skb); + goto out; + } + + if (dst_group) { + atomic_inc(&skb->users); + netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL); + } + err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT); + +out: + scm_destroy(siocb->scm); + return err; +} + +static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, + struct msghdr *msg, size_t len, + int flags) +{ + struct sock_iocb *siocb = kiocb_to_siocb(kiocb); + struct scm_cookie scm; + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + int noblock = flags&MSG_DONTWAIT; + size_t copied; + struct sk_buff *skb, *data_skb; + int err, ret; + + if (flags&MSG_OOB) + return -EOPNOTSUPP; + + copied = 0; + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (skb == NULL) + goto out; + + data_skb = skb; + +#ifdef CONFIG_COMPAT_NETLINK_MESSAGES + if (unlikely(skb_shinfo(skb)->frag_list)) { + /* + * If this skb 
has a frag_list, then here that means that we + * will have to use the frag_list skb's data for compat tasks + * and the regular skb's data for normal (non-compat) tasks. + * + * If we need to send the compat skb, assign it to the + * 'data_skb' variable so that it will be used below for data + * copying. We keep 'skb' for everything else, including + * freeing both later. + */ + if (flags & MSG_CMSG_COMPAT) + data_skb = skb_shinfo(skb)->frag_list; + } +#endif + + msg->msg_namelen = 0; + + copied = data_skb->len; + if (len < copied) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + + skb_reset_transport_header(data_skb); + err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied); + + if (msg->msg_name) { + struct sockaddr_nl *addr = (struct sockaddr_nl *)msg->msg_name; + addr->nl_family = AF_NETLINK; + addr->nl_pad = 0; + addr->nl_pid = NETLINK_CB(skb).pid; + addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group); + msg->msg_namelen = sizeof(*addr); + } + + if (nlk->flags & NETLINK_RECV_PKTINFO) + netlink_cmsg_recv_pktinfo(msg, skb); + + if (NULL == siocb->scm) { + memset(&scm, 0, sizeof(scm)); + siocb->scm = &scm; + } + siocb->scm->creds = *NETLINK_CREDS(skb); + if (flags & MSG_TRUNC) + copied = data_skb->len; + + skb_free_datagram(sk, skb); + + if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { + ret = netlink_dump(sk); + if (ret) { + sk->sk_err = ret; + sk->sk_error_report(sk); + } + } + + scm_recv(sock, msg, siocb->scm, flags); +out: + netlink_rcv_wake(sk); + return err ? : copied; +} + +static void netlink_data_ready(struct sock *sk, int len) +{ + BUG(); +} + +/* + * We export these functions to other modules. They provide a + * complete set of kernel non-blocking support for message + * queueing. + */ + +struct sock * +netlink_kernel_create(struct net *net, int unit, unsigned int groups, + void (*input)(struct sk_buff *skb), + struct mutex *cb_mutex, struct module *module) +{ + struct socket *sock; + struct sock *sk; + struct netlink_sock *nlk; + struct listeners *listeners = NULL; + + BUG_ON(!nl_table); + + if (unit < 0 || unit >= MAX_LINKS) + return NULL; + + if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) + return NULL; + + /* + * We have to just have a reference on the net from sk, but don't + * get_net it. Besides, we cannot get and then put the net here. + * So we create one inside init_net and the move it to net. 
+ */ + + if (__netlink_create(&init_net, sock, cb_mutex, unit) < 0) + goto out_sock_release_nosk; + + sk = sock->sk; + sk_change_net(sk, net); + + if (groups < 32) + groups = 32; + + listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); + if (!listeners) + goto out_sock_release; + + sk->sk_data_ready = netlink_data_ready; + if (input) + nlk_sk(sk)->netlink_rcv = input; + + if (netlink_insert(sk, net, 0)) + goto out_sock_release; + + nlk = nlk_sk(sk); + nlk->flags |= NETLINK_KERNEL_SOCKET; + + netlink_table_grab(); + if (!nl_table[unit].registered) { + nl_table[unit].groups = groups; + rcu_assign_pointer(nl_table[unit].listeners, listeners); + nl_table[unit].cb_mutex = cb_mutex; + nl_table[unit].module = module; + nl_table[unit].registered = 1; + } else { + kfree(listeners); + nl_table[unit].registered++; + } + netlink_table_ungrab(); + return sk; + +out_sock_release: + kfree(listeners); + netlink_kernel_release(sk); + return NULL; + +out_sock_release_nosk: + sock_release(sock); + return NULL; +} +EXPORT_SYMBOL(netlink_kernel_create); + + +void +netlink_kernel_release(struct sock *sk) +{ + sk_release_kernel(sk); +} +EXPORT_SYMBOL(netlink_kernel_release); + +int __netlink_change_ngroups(struct sock *sk, unsigned int groups) +{ + struct listeners *new, *old; + struct netlink_table *tbl = &nl_table[sk->sk_protocol]; + + if (groups < 32) + groups = 32; + + if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) { + new = kzalloc(sizeof(*new) + NLGRPSZ(groups), GFP_ATOMIC); + if (!new) + return -ENOMEM; + old = rcu_dereference_raw(tbl->listeners); + memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups)); + rcu_assign_pointer(tbl->listeners, new); + + kfree_rcu(old, rcu); + } + tbl->groups = groups; + + return 0; +} + +/** + * netlink_change_ngroups - change number of multicast groups + * + * This changes the number of multicast groups that are available + * on a certain netlink family. Note that it is not possible to + * change the number of groups to below 32. Also note that it does + * not implicitly call netlink_clear_multicast_users() when the + * number of groups is reduced. + * + * @sk: The kernel netlink socket, as returned by netlink_kernel_create(). + * @groups: The new number of groups. + */ +int netlink_change_ngroups(struct sock *sk, unsigned int groups) +{ + int err; + + netlink_table_grab(); + err = __netlink_change_ngroups(sk, groups); + netlink_table_ungrab(); + + return err; +} + +void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group) +{ + struct sock *sk; + struct hlist_node *node; + struct netlink_table *tbl = &nl_table[ksk->sk_protocol]; + + sk_for_each_bound(sk, node, &tbl->mc_list) + netlink_update_socket_mc(nlk_sk(sk), group, 0); +} + +/** + * netlink_clear_multicast_users - kick off multicast listeners + * + * This function removes all listeners from the given group. + * @ksk: The kernel netlink socket, as returned by + * netlink_kernel_create(). + * @group: The multicast group to clear. + */ +void netlink_clear_multicast_users(struct sock *ksk, unsigned int group) +{ + netlink_table_grab(); + __netlink_clear_multicast_users(ksk, group); + netlink_table_ungrab(); +} + +void netlink_set_nonroot(int protocol, unsigned int flags) +{ + if ((unsigned int)protocol < MAX_LINKS) + nl_table[protocol].nl_nonroot = flags; +} +EXPORT_SYMBOL(netlink_set_nonroot); + +static void netlink_destroy_callback(struct netlink_callback *cb) +{ + kfree_skb(cb->skb); + kfree(cb); +} + +/* + * It looks a bit ugly. + * It would be better to create kernel thread. 
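
netlink_kernel_create()/netlink_kernel_release() above are the entry points other subsystems use to own a netlink protocol. A hypothetical module-side sketch (not part of this patch): the protocol number and handler are placeholders, and the input callback defers message walking and ACK handling to netlink_rcv_skb(), defined further down in this file.

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/netlink.h>
#include <net/net_namespace.h>
#include <net/sock.h>

#define NETLINK_EXAMPLE 31	/* placeholder protocol number, must be < MAX_LINKS */

static struct sock *exmpl_sk;

static int exmpl_doit(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	pr_info("example: type %u from pid %u\n",
		nlh->nlmsg_type, NETLINK_CB(skb).pid);
	return 0;	/* 0 => netlink_rcv_skb() sends a plain ACK if requested */
}

static void exmpl_rcv(struct sk_buff *skb)
{
	netlink_rcv_skb(skb, &exmpl_doit);
}

static int __init exmpl_init(void)
{
	exmpl_sk = netlink_kernel_create(&init_net, NETLINK_EXAMPLE, 0,
					 exmpl_rcv, NULL, THIS_MODULE);
	return exmpl_sk ? 0 : -ENOMEM;
}

static void __exit exmpl_exit(void)
{
	netlink_kernel_release(exmpl_sk);
}

module_init(exmpl_init);
module_exit(exmpl_exit);
MODULE_LICENSE("GPL");
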
+ */ + +static int netlink_dump(struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + struct netlink_callback *cb; + struct sk_buff *skb; + struct nlmsghdr *nlh; + int len, err = -ENOBUFS; + + skb = sock_rmalloc(sk, NLMSG_GOODSIZE, 0, GFP_KERNEL); + if (!skb) + goto errout; + + mutex_lock(nlk->cb_mutex); + + cb = nlk->cb; + if (cb == NULL) { + err = -EINVAL; + goto errout_skb; + } + + len = cb->dump(skb, cb); + + if (len > 0) { + mutex_unlock(nlk->cb_mutex); + + if (sk_filter(sk, skb)) + kfree_skb(skb); + else + __netlink_sendskb(sk, skb); + return 0; + } + + nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(len), NLM_F_MULTI); + if (!nlh) + goto errout_skb; + + memcpy(nlmsg_data(nlh), &len, sizeof(len)); + + if (sk_filter(sk, skb)) + kfree_skb(skb); + else + __netlink_sendskb(sk, skb); + + if (cb->done) + cb->done(cb); + nlk->cb = NULL; + mutex_unlock(nlk->cb_mutex); + + netlink_destroy_callback(cb); + return 0; + +errout_skb: + mutex_unlock(nlk->cb_mutex); + kfree_skb(skb); +errout: + return err; +} + +int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + int (*dump)(struct sk_buff *skb, + struct netlink_callback *), + int (*done)(struct netlink_callback *)) +{ + struct netlink_callback *cb; + struct sock *sk; + struct netlink_sock *nlk; + int ret; + + cb = kzalloc(sizeof(*cb), GFP_KERNEL); + if (cb == NULL) + return -ENOBUFS; + + cb->dump = dump; + cb->done = done; + cb->nlh = nlh; + atomic_inc(&skb->users); + cb->skb = skb; + + sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).pid); + if (sk == NULL) { + netlink_destroy_callback(cb); + return -ECONNREFUSED; + } + nlk = nlk_sk(sk); + /* A dump is in progress... */ + mutex_lock(nlk->cb_mutex); + if (nlk->cb) { + mutex_unlock(nlk->cb_mutex); + netlink_destroy_callback(cb); + sock_put(sk); + return -EBUSY; + } + nlk->cb = cb; + mutex_unlock(nlk->cb_mutex); + + ret = netlink_dump(sk); + + sock_put(sk); + + if (ret) + return ret; + + /* We successfully started a dump, by returning -EINTR we + * signal not to send ACK even if it was requested. + */ + return -EINTR; +} +EXPORT_SYMBOL(netlink_dump_start); + +void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) +{ + struct sk_buff *skb; + struct nlmsghdr *rep; + struct nlmsgerr *errmsg; + size_t payload = sizeof(*errmsg); + + /* error messages get the original request appened */ + if (err) + payload += nlmsg_len(nlh); + + skb = nlmsg_new(payload, GFP_KERNEL); + if (!skb) { + struct sock *sk; + + sk = netlink_lookup(sock_net(in_skb->sk), + in_skb->sk->sk_protocol, + NETLINK_CB(in_skb).pid); + if (sk) { + sk->sk_err = ENOBUFS; + sk->sk_error_report(sk); + sock_put(sk); + } + return; + } + + rep = __nlmsg_put(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, + NLMSG_ERROR, payload, 0); + errmsg = nlmsg_data(rep); + errmsg->error = err; + memcpy(&errmsg->msg, nlh, err ? 
nlh->nlmsg_len : sizeof(*nlh)); + netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); +} +EXPORT_SYMBOL(netlink_ack); + +int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, + struct nlmsghdr *)) +{ + struct nlmsghdr *nlh; + int err; + + while (skb->len >= nlmsg_total_size(0)) { + int msglen; + + nlh = nlmsg_hdr(skb); + err = 0; + + if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) + return 0; + + /* Only requests are handled by the kernel */ + if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) + goto ack; + + /* Skip control messages */ + if (nlh->nlmsg_type < NLMSG_MIN_TYPE) + goto ack; + + err = cb(skb, nlh); + if (err == -EINTR) + goto skip; + +ack: + if (nlh->nlmsg_flags & NLM_F_ACK || err) + netlink_ack(skb, nlh, err); + +skip: + msglen = NLMSG_ALIGN(nlh->nlmsg_len); + if (msglen > skb->len) + msglen = skb->len; + skb_pull(skb, msglen); + } + + return 0; +} +EXPORT_SYMBOL(netlink_rcv_skb); + +/** + * nlmsg_notify - send a notification netlink message + * @sk: netlink socket to use + * @skb: notification message + * @pid: destination netlink pid for reports or 0 + * @group: destination multicast group or 0 + * @report: 1 to report back, 0 to disable + * @flags: allocation flags + */ +int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 pid, + unsigned int group, int report, gfp_t flags) +{ + int err = 0; + + if (group) { + int exclude_pid = 0; + + if (report) { + atomic_inc(&skb->users); + exclude_pid = pid; + } + + /* errors reported via destination sk->sk_err, but propagate + * delivery errors if NETLINK_BROADCAST_ERROR flag is set */ + err = nlmsg_multicast(sk, skb, exclude_pid, group, flags); + } + + if (report) { + int err2; + + err2 = nlmsg_unicast(sk, skb, pid); + if (!err || err == -ESRCH) + err = err2; + } + + return err; +} +EXPORT_SYMBOL(nlmsg_notify); + +#ifdef CONFIG_PROC_FS +struct nl_seq_iter { + struct seq_net_private p; + int link; + int hash_idx; +}; + +static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos) +{ + struct nl_seq_iter *iter = seq->private; + int i, j; + struct sock *s; + struct hlist_node *node; + loff_t off = 0; + + for (i = 0; i < MAX_LINKS; i++) { + struct nl_pid_hash *hash = &nl_table[i].hash; + + for (j = 0; j <= hash->mask; j++) { + sk_for_each(s, node, &hash->table[j]) { + if (sock_net(s) != seq_file_net(seq)) + continue; + if (off == pos) { + iter->link = i; + iter->hash_idx = j; + return s; + } + ++off; + } + } + } + return NULL; +} + +static void *netlink_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(nl_table_lock) +{ + read_lock(&nl_table_lock); + return *pos ? 
netlink_seq_socket_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock *s; + struct nl_seq_iter *iter; + int i, j; + + ++*pos; + + if (v == SEQ_START_TOKEN) + return netlink_seq_socket_idx(seq, 0); + + iter = seq->private; + s = v; + do { + s = sk_next(s); + } while (s && sock_net(s) != seq_file_net(seq)); + if (s) + return s; + + i = iter->link; + j = iter->hash_idx + 1; + + do { + struct nl_pid_hash *hash = &nl_table[i].hash; + + for (; j <= hash->mask; j++) { + s = sk_head(&hash->table[j]); + while (s && sock_net(s) != seq_file_net(seq)) + s = sk_next(s); + if (s) { + iter->link = i; + iter->hash_idx = j; + return s; + } + } + + j = 0; + } while (++i < MAX_LINKS); + + return NULL; +} + +static void netlink_seq_stop(struct seq_file *seq, void *v) + __releases(nl_table_lock) +{ + read_unlock(&nl_table_lock); +} + + +static int netlink_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "sk Eth Pid Groups " + "Rmem Wmem Dump Locks Drops Inode\n"); + else { + struct sock *s = v; + struct netlink_sock *nlk = nlk_sk(s); + + seq_printf(seq, "%pK %-3d %-6d %08x %-8d %-8d %pK %-8d %-8d %-8lu\n", + s, + s->sk_protocol, + nlk->pid, + nlk->groups ? (u32)nlk->groups[0] : 0, + sk_rmem_alloc_get(s), + sk_wmem_alloc_get(s), + nlk->cb, + atomic_read(&s->sk_refcnt), + atomic_read(&s->sk_drops), + sock_i_ino(s) + ); + + } + return 0; +} + +static const struct seq_operations netlink_seq_ops = { + .start = netlink_seq_start, + .next = netlink_seq_next, + .stop = netlink_seq_stop, + .show = netlink_seq_show, +}; + + +static int netlink_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &netlink_seq_ops, + sizeof(struct nl_seq_iter)); +} + +static const struct file_operations netlink_seq_fops = { + .owner = THIS_MODULE, + .open = netlink_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +#endif + +int netlink_register_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&netlink_chain, nb); +} +EXPORT_SYMBOL(netlink_register_notifier); + +int netlink_unregister_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&netlink_chain, nb); +} +EXPORT_SYMBOL(netlink_unregister_notifier); + +static const struct proto_ops netlink_ops = { + .family = PF_NETLINK, + .owner = THIS_MODULE, + .release = netlink_release, + .bind = netlink_bind, + .connect = netlink_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = netlink_getname, + .poll = datagram_poll, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = netlink_setsockopt, + .getsockopt = netlink_getsockopt, + .sendmsg = netlink_sendmsg, + .recvmsg = netlink_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static const struct net_proto_family netlink_family_ops = { + .family = PF_NETLINK, + .create = netlink_create, + .owner = THIS_MODULE, /* for consistency 8) */ +}; + +static int __net_init netlink_net_init(struct net *net) +{ +#ifdef CONFIG_PROC_FS + if (!proc_net_fops_create(net, "netlink", 0, &netlink_seq_fops)) + return -ENOMEM; +#endif + return 0; +} + +static void __net_exit netlink_net_exit(struct net *net) +{ +#ifdef CONFIG_PROC_FS + proc_net_remove(net, "netlink"); +#endif +} + +static void __init netlink_add_usersock_entry(void) +{ + struct listeners *listeners; + int groups = 32; + + listeners = 
kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); + if (!listeners) + panic("netlink_add_usersock_entry: Cannot allocate listeners\n"); + + netlink_table_grab(); + + nl_table[NETLINK_USERSOCK].groups = groups; + rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners); + nl_table[NETLINK_USERSOCK].module = THIS_MODULE; + nl_table[NETLINK_USERSOCK].registered = 1; + + netlink_table_ungrab(); +} + +static struct pernet_operations __net_initdata netlink_net_ops = { + .init = netlink_net_init, + .exit = netlink_net_exit, +}; + +static int __init netlink_proto_init(void) +{ + struct sk_buff *dummy_skb; + int i; + unsigned long limit; + unsigned int order; + int err = proto_register(&netlink_proto, 0); + + if (err != 0) + goto out; + + BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof(dummy_skb->cb)); + + nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL); + if (!nl_table) + goto panic; + + if (totalram_pages >= (128 * 1024)) + limit = totalram_pages >> (21 - PAGE_SHIFT); + else + limit = totalram_pages >> (23 - PAGE_SHIFT); + + order = get_bitmask_order(limit) - 1 + PAGE_SHIFT; + limit = (1UL << order) / sizeof(struct hlist_head); + order = get_bitmask_order(min(limit, (unsigned long)UINT_MAX)) - 1; + + for (i = 0; i < MAX_LINKS; i++) { + struct nl_pid_hash *hash = &nl_table[i].hash; + + hash->table = nl_pid_hash_zalloc(1 * sizeof(*hash->table)); + if (!hash->table) { + while (i-- > 0) + nl_pid_hash_free(nl_table[i].hash.table, + 1 * sizeof(*hash->table)); + kfree(nl_table); + goto panic; + } + hash->max_shift = order; + hash->shift = 0; + hash->mask = 0; + hash->rehash_time = jiffies; + } + + netlink_add_usersock_entry(); + + sock_register(&netlink_family_ops); + register_pernet_subsys(&netlink_net_ops); + /* The netlink device handler may be needed early. */ + rtnetlink_init(); +out: + return err; +panic: + panic("netlink_init: Cannot allocate nl_table\n"); +} + +core_initcall(netlink_proto_init); diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c new file mode 100644 index 00000000..1781d991 --- /dev/null +++ b/net/netlink/genetlink.c @@ -0,0 +1,948 @@ +/* + * NETLINK Generic Netlink Family + * + * Authors: Jamal Hadi Salim + * Thomas Graf + * Johannes Berg + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_MUTEX(genl_mutex); /* serialization of message processing */ + +void genl_lock(void) +{ + mutex_lock(&genl_mutex); +} +EXPORT_SYMBOL(genl_lock); + +void genl_unlock(void) +{ + mutex_unlock(&genl_mutex); +} +EXPORT_SYMBOL(genl_unlock); + +#define GENL_FAM_TAB_SIZE 16 +#define GENL_FAM_TAB_MASK (GENL_FAM_TAB_SIZE - 1) + +static struct list_head family_ht[GENL_FAM_TAB_SIZE]; +/* + * Bitmap of multicast groups that are currently in use. + * + * To avoid an allocation at boot of just one unsigned long, + * declare it global instead. + * Bit 0 is marked as already used since group 0 is invalid. 
+ */ +static unsigned long mc_group_start = 0x1; +static unsigned long *mc_groups = &mc_group_start; +static unsigned long mc_groups_longs = 1; + +static int genl_ctrl_event(int event, void *data); + +static inline unsigned int genl_family_hash(unsigned int id) +{ + return id & GENL_FAM_TAB_MASK; +} + +static inline struct list_head *genl_family_chain(unsigned int id) +{ + return &family_ht[genl_family_hash(id)]; +} + +static struct genl_family *genl_family_find_byid(unsigned int id) +{ + struct genl_family *f; + + list_for_each_entry(f, genl_family_chain(id), family_list) + if (f->id == id) + return f; + + return NULL; +} + +static struct genl_family *genl_family_find_byname(char *name) +{ + struct genl_family *f; + int i; + + for (i = 0; i < GENL_FAM_TAB_SIZE; i++) + list_for_each_entry(f, genl_family_chain(i), family_list) + if (strcmp(f->name, name) == 0) + return f; + + return NULL; +} + +static struct genl_ops *genl_get_cmd(u8 cmd, struct genl_family *family) +{ + struct genl_ops *ops; + + list_for_each_entry(ops, &family->ops_list, ops_list) + if (ops->cmd == cmd) + return ops; + + return NULL; +} + +/* Of course we are going to have problems once we hit + * 2^16 alive types, but that can only happen by year 2K +*/ +static inline u16 genl_generate_id(void) +{ + static u16 id_gen_idx = GENL_MIN_ID; + int i; + + for (i = 0; i <= GENL_MAX_ID - GENL_MIN_ID; i++) { + if (!genl_family_find_byid(id_gen_idx)) + return id_gen_idx; + if (++id_gen_idx > GENL_MAX_ID) + id_gen_idx = GENL_MIN_ID; + } + + return 0; +} + +static struct genl_multicast_group notify_grp; + +/** + * genl_register_mc_group - register a multicast group + * + * Registers the specified multicast group and notifies userspace + * about the new group. + * + * Returns 0 on success or a negative error code. + * + * @family: The generic netlink family the group shall be registered for. + * @grp: The group to register, must have a name. + */ +int genl_register_mc_group(struct genl_family *family, + struct genl_multicast_group *grp) +{ + int id; + unsigned long *new_groups; + int err = 0; + + BUG_ON(grp->name[0] == '\0'); + + genl_lock(); + + /* special-case our own group */ + if (grp == ¬ify_grp) + id = GENL_ID_CTRL; + else + id = find_first_zero_bit(mc_groups, + mc_groups_longs * BITS_PER_LONG); + + + if (id >= mc_groups_longs * BITS_PER_LONG) { + size_t nlen = (mc_groups_longs + 1) * sizeof(unsigned long); + + if (mc_groups == &mc_group_start) { + new_groups = kzalloc(nlen, GFP_KERNEL); + if (!new_groups) { + err = -ENOMEM; + goto out; + } + mc_groups = new_groups; + *mc_groups = mc_group_start; + } else { + new_groups = krealloc(mc_groups, nlen, GFP_KERNEL); + if (!new_groups) { + err = -ENOMEM; + goto out; + } + mc_groups = new_groups; + mc_groups[mc_groups_longs] = 0; + } + mc_groups_longs++; + } + + if (family->netnsok) { + struct net *net; + + netlink_table_grab(); + rcu_read_lock(); + for_each_net_rcu(net) { + err = __netlink_change_ngroups(net->genl_sock, + mc_groups_longs * BITS_PER_LONG); + if (err) { + /* + * No need to roll back, can only fail if + * memory allocation fails and then the + * number of _possible_ groups has been + * increased on some sockets which is ok. 
+ */ + rcu_read_unlock(); + netlink_table_ungrab(); + goto out; + } + } + rcu_read_unlock(); + netlink_table_ungrab(); + } else { + err = netlink_change_ngroups(init_net.genl_sock, + mc_groups_longs * BITS_PER_LONG); + if (err) + goto out; + } + + grp->id = id; + set_bit(id, mc_groups); + list_add_tail(&grp->list, &family->mcast_groups); + grp->family = family; + + genl_ctrl_event(CTRL_CMD_NEWMCAST_GRP, grp); + out: + genl_unlock(); + return err; +} +EXPORT_SYMBOL(genl_register_mc_group); + +static void __genl_unregister_mc_group(struct genl_family *family, + struct genl_multicast_group *grp) +{ + struct net *net; + BUG_ON(grp->family != family); + + netlink_table_grab(); + rcu_read_lock(); + for_each_net_rcu(net) + __netlink_clear_multicast_users(net->genl_sock, grp->id); + rcu_read_unlock(); + netlink_table_ungrab(); + + clear_bit(grp->id, mc_groups); + list_del(&grp->list); + genl_ctrl_event(CTRL_CMD_DELMCAST_GRP, grp); + grp->id = 0; + grp->family = NULL; +} + +/** + * genl_unregister_mc_group - unregister a multicast group + * + * Unregisters the specified multicast group and notifies userspace + * about it. All current listeners on the group are removed. + * + * Note: It is not necessary to unregister all multicast groups before + * unregistering the family, unregistering the family will cause + * all assigned multicast groups to be unregistered automatically. + * + * @family: Generic netlink family the group belongs to. + * @grp: The group to unregister, must have been registered successfully + * previously. + */ +void genl_unregister_mc_group(struct genl_family *family, + struct genl_multicast_group *grp) +{ + genl_lock(); + __genl_unregister_mc_group(family, grp); + genl_unlock(); +} +EXPORT_SYMBOL(genl_unregister_mc_group); + +static void genl_unregister_mc_groups(struct genl_family *family) +{ + struct genl_multicast_group *grp, *tmp; + + list_for_each_entry_safe(grp, tmp, &family->mcast_groups, list) + __genl_unregister_mc_group(family, grp); +} + +/** + * genl_register_ops - register generic netlink operations + * @family: generic netlink family + * @ops: operations to be registered + * + * Registers the specified operations and assigns them to the specified + * family. Either a doit or dumpit callback must be specified or the + * operation will fail. Only one operation structure per command + * identifier may be registered. + * + * See include/net/genetlink.h for more documenation on the operations + * structure. + * + * Returns 0 on success or a negative error code. + */ +int genl_register_ops(struct genl_family *family, struct genl_ops *ops) +{ + int err = -EINVAL; + + if (ops->dumpit == NULL && ops->doit == NULL) + goto errout; + + if (genl_get_cmd(ops->cmd, family)) { + err = -EEXIST; + goto errout; + } + + if (ops->dumpit) + ops->flags |= GENL_CMD_CAP_DUMP; + if (ops->doit) + ops->flags |= GENL_CMD_CAP_DO; + if (ops->policy) + ops->flags |= GENL_CMD_CAP_HASPOL; + + genl_lock(); + list_add_tail(&ops->ops_list, &family->ops_list); + genl_unlock(); + + genl_ctrl_event(CTRL_CMD_NEWOPS, ops); + err = 0; +errout: + return err; +} +EXPORT_SYMBOL(genl_register_ops); + +/** + * genl_unregister_ops - unregister generic netlink operations + * @family: generic netlink family + * @ops: operations to be unregistered + * + * Unregisters the specified operations and unassigns them from the + * specified family. The operation blocks until the current message + * processing has finished and doesn't start again until the + * unregister process has finished. 
+ * + * Note: It is not necessary to unregister all operations before + * unregistering the family, unregistering the family will cause + * all assigned operations to be unregistered automatically. + * + * Returns 0 on success or a negative error code. + */ +int genl_unregister_ops(struct genl_family *family, struct genl_ops *ops) +{ + struct genl_ops *rc; + + genl_lock(); + list_for_each_entry(rc, &family->ops_list, ops_list) { + if (rc == ops) { + list_del(&ops->ops_list); + genl_unlock(); + genl_ctrl_event(CTRL_CMD_DELOPS, ops); + return 0; + } + } + genl_unlock(); + + return -ENOENT; +} +EXPORT_SYMBOL(genl_unregister_ops); + +/** + * genl_register_family - register a generic netlink family + * @family: generic netlink family + * + * Registers the specified family after validating it first. Only one + * family may be registered with the same family name or identifier. + * The family id may equal GENL_ID_GENERATE causing an unique id to + * be automatically generated and assigned. + * + * Return 0 on success or a negative error code. + */ +int genl_register_family(struct genl_family *family) +{ + int err = -EINVAL; + + if (family->id && family->id < GENL_MIN_ID) + goto errout; + + if (family->id > GENL_MAX_ID) + goto errout; + + INIT_LIST_HEAD(&family->ops_list); + INIT_LIST_HEAD(&family->mcast_groups); + + genl_lock(); + + if (genl_family_find_byname(family->name)) { + err = -EEXIST; + goto errout_locked; + } + + if (family->id == GENL_ID_GENERATE) { + u16 newid = genl_generate_id(); + + if (!newid) { + err = -ENOMEM; + goto errout_locked; + } + + family->id = newid; + } else if (genl_family_find_byid(family->id)) { + err = -EEXIST; + goto errout_locked; + } + + if (family->maxattr) { + family->attrbuf = kmalloc((family->maxattr+1) * + sizeof(struct nlattr *), GFP_KERNEL); + if (family->attrbuf == NULL) { + err = -ENOMEM; + goto errout_locked; + } + } else + family->attrbuf = NULL; + + list_add_tail(&family->family_list, genl_family_chain(family->id)); + genl_unlock(); + + genl_ctrl_event(CTRL_CMD_NEWFAMILY, family); + + return 0; + +errout_locked: + genl_unlock(); +errout: + return err; +} +EXPORT_SYMBOL(genl_register_family); + +/** + * genl_register_family_with_ops - register a generic netlink family + * @family: generic netlink family + * @ops: operations to be registered + * @n_ops: number of elements to register + * + * Registers the specified family and operations from the specified table. + * Only one family may be registered with the same family name or identifier. + * + * The family id may equal GENL_ID_GENERATE causing an unique id to + * be automatically generated and assigned. + * + * Either a doit or dumpit callback must be specified for every registered + * operation or the function will fail. Only one operation structure per + * command identifier may be registered. + * + * See include/net/genetlink.h for more documenation on the operations + * structure. + * + * This is equivalent to calling genl_register_family() followed by + * genl_register_ops() for every operation entry in the table taking + * care to unregister the family on error path. + * + * Return 0 on success or a negative error code. 
+ */ +int genl_register_family_with_ops(struct genl_family *family, + struct genl_ops *ops, size_t n_ops) +{ + int err, i; + + err = genl_register_family(family); + if (err) + return err; + + for (i = 0; i < n_ops; ++i, ++ops) { + err = genl_register_ops(family, ops); + if (err) + goto err_out; + } + return 0; +err_out: + genl_unregister_family(family); + return err; +} +EXPORT_SYMBOL(genl_register_family_with_ops); + +/** + * genl_unregister_family - unregister generic netlink family + * @family: generic netlink family + * + * Unregisters the specified family. + * + * Returns 0 on success or a negative error code. + */ +int genl_unregister_family(struct genl_family *family) +{ + struct genl_family *rc; + + genl_lock(); + + genl_unregister_mc_groups(family); + + list_for_each_entry(rc, genl_family_chain(family->id), family_list) { + if (family->id != rc->id || strcmp(rc->name, family->name)) + continue; + + list_del(&rc->family_list); + INIT_LIST_HEAD(&family->ops_list); + genl_unlock(); + + kfree(family->attrbuf); + genl_ctrl_event(CTRL_CMD_DELFAMILY, family); + return 0; + } + + genl_unlock(); + + return -ENOENT; +} +EXPORT_SYMBOL(genl_unregister_family); + +static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct genl_ops *ops; + struct genl_family *family; + struct net *net = sock_net(skb->sk); + struct genl_info info; + struct genlmsghdr *hdr = nlmsg_data(nlh); + int hdrlen, err; + + family = genl_family_find_byid(nlh->nlmsg_type); + if (family == NULL) + return -ENOENT; + + /* this family doesn't exist in this netns */ + if (!family->netnsok && !net_eq(net, &init_net)) + return -ENOENT; + + hdrlen = GENL_HDRLEN + family->hdrsize; + if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) + return -EINVAL; + + ops = genl_get_cmd(hdr->cmd, family); + if (ops == NULL) + return -EOPNOTSUPP; + + if ((ops->flags & GENL_ADMIN_PERM) && + security_netlink_recv(skb, CAP_NET_ADMIN)) + return -EPERM; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (ops->dumpit == NULL) + return -EOPNOTSUPP; + + genl_unlock(); + err = netlink_dump_start(net->genl_sock, skb, nlh, + ops->dumpit, ops->done); + genl_lock(); + return err; + } + + if (ops->doit == NULL) + return -EOPNOTSUPP; + + if (family->attrbuf) { + err = nlmsg_parse(nlh, hdrlen, family->attrbuf, family->maxattr, + ops->policy); + if (err < 0) + return err; + } + + info.snd_seq = nlh->nlmsg_seq; + info.snd_pid = NETLINK_CB(skb).pid; + info.nlhdr = nlh; + info.genlhdr = nlmsg_data(nlh); + info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; + info.attrs = family->attrbuf; + genl_info_net_set(&info, net); + memset(&info.user_ptr, 0, sizeof(info.user_ptr)); + + if (family->pre_doit) { + err = family->pre_doit(ops, skb, &info); + if (err) + return err; + } + + err = ops->doit(skb, &info); + + if (family->post_doit) + family->post_doit(ops, skb, &info); + + return err; +} + +static void genl_rcv(struct sk_buff *skb) +{ + genl_lock(); + netlink_rcv_skb(skb, &genl_rcv_msg); + genl_unlock(); +} + +/************************************************************************** + * Controller + **************************************************************************/ + +static struct genl_family genl_ctrl = { + .id = GENL_ID_CTRL, + .name = "nlctrl", + .version = 0x2, + .maxattr = CTRL_ATTR_MAX, + .netnsok = true, +}; + +static int ctrl_fill_info(struct genl_family *family, u32 pid, u32 seq, + u32 flags, struct sk_buff *skb, u8 cmd) +{ + void *hdr; + + hdr = genlmsg_put(skb, pid, seq, &genl_ctrl, flags, cmd); + if (hdr == NULL) + return -1; + + 
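+	/*
+	 * Editor's note, not part of the original patch: the NLA_PUT_*()
+	 * macros used below jump to the nla_put_failure label when the
+	 * message runs out of room, which is why the error path ends in
+	 * genlmsg_cancel().  Roughly, a CTRL_CMD_NEWFAMILY reply is laid
+	 * out as:
+	 *
+	 *   CTRL_ATTR_FAMILY_NAME   family->name
+	 *   CTRL_ATTR_FAMILY_ID     family->id
+	 *   CTRL_ATTR_VERSION       family->version
+	 *   CTRL_ATTR_HDRSIZE       family->hdrsize
+	 *   CTRL_ATTR_MAXATTR       family->maxattr
+	 *   CTRL_ATTR_OPS           nested {CTRL_ATTR_OP_ID, CTRL_ATTR_OP_FLAGS} per op
+	 *   CTRL_ATTR_MCAST_GROUPS  nested {CTRL_ATTR_MCAST_GRP_ID, CTRL_ATTR_MCAST_GRP_NAME} per group
+	 */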
NLA_PUT_STRING(skb, CTRL_ATTR_FAMILY_NAME, family->name); + NLA_PUT_U16(skb, CTRL_ATTR_FAMILY_ID, family->id); + NLA_PUT_U32(skb, CTRL_ATTR_VERSION, family->version); + NLA_PUT_U32(skb, CTRL_ATTR_HDRSIZE, family->hdrsize); + NLA_PUT_U32(skb, CTRL_ATTR_MAXATTR, family->maxattr); + + if (!list_empty(&family->ops_list)) { + struct nlattr *nla_ops; + struct genl_ops *ops; + int idx = 1; + + nla_ops = nla_nest_start(skb, CTRL_ATTR_OPS); + if (nla_ops == NULL) + goto nla_put_failure; + + list_for_each_entry(ops, &family->ops_list, ops_list) { + struct nlattr *nest; + + nest = nla_nest_start(skb, idx++); + if (nest == NULL) + goto nla_put_failure; + + NLA_PUT_U32(skb, CTRL_ATTR_OP_ID, ops->cmd); + NLA_PUT_U32(skb, CTRL_ATTR_OP_FLAGS, ops->flags); + + nla_nest_end(skb, nest); + } + + nla_nest_end(skb, nla_ops); + } + + if (!list_empty(&family->mcast_groups)) { + struct genl_multicast_group *grp; + struct nlattr *nla_grps; + int idx = 1; + + nla_grps = nla_nest_start(skb, CTRL_ATTR_MCAST_GROUPS); + if (nla_grps == NULL) + goto nla_put_failure; + + list_for_each_entry(grp, &family->mcast_groups, list) { + struct nlattr *nest; + + nest = nla_nest_start(skb, idx++); + if (nest == NULL) + goto nla_put_failure; + + NLA_PUT_U32(skb, CTRL_ATTR_MCAST_GRP_ID, grp->id); + NLA_PUT_STRING(skb, CTRL_ATTR_MCAST_GRP_NAME, + grp->name); + + nla_nest_end(skb, nest); + } + nla_nest_end(skb, nla_grps); + } + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ctrl_fill_mcgrp_info(struct genl_multicast_group *grp, u32 pid, + u32 seq, u32 flags, struct sk_buff *skb, + u8 cmd) +{ + void *hdr; + struct nlattr *nla_grps; + struct nlattr *nest; + + hdr = genlmsg_put(skb, pid, seq, &genl_ctrl, flags, cmd); + if (hdr == NULL) + return -1; + + NLA_PUT_STRING(skb, CTRL_ATTR_FAMILY_NAME, grp->family->name); + NLA_PUT_U16(skb, CTRL_ATTR_FAMILY_ID, grp->family->id); + + nla_grps = nla_nest_start(skb, CTRL_ATTR_MCAST_GROUPS); + if (nla_grps == NULL) + goto nla_put_failure; + + nest = nla_nest_start(skb, 1); + if (nest == NULL) + goto nla_put_failure; + + NLA_PUT_U32(skb, CTRL_ATTR_MCAST_GRP_ID, grp->id); + NLA_PUT_STRING(skb, CTRL_ATTR_MCAST_GRP_NAME, + grp->name); + + nla_nest_end(skb, nest); + nla_nest_end(skb, nla_grps); + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) +{ + + int i, n = 0; + struct genl_family *rt; + struct net *net = sock_net(skb->sk); + int chains_to_skip = cb->args[0]; + int fams_to_skip = cb->args[1]; + + for (i = chains_to_skip; i < GENL_FAM_TAB_SIZE; i++) { + n = 0; + list_for_each_entry(rt, genl_family_chain(i), family_list) { + if (!rt->netnsok && !net_eq(net, &init_net)) + continue; + if (++n < fams_to_skip) + continue; + if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + skb, CTRL_CMD_NEWFAMILY) < 0) + goto errout; + } + + fams_to_skip = 0; + } + +errout: + cb->args[0] = i; + cb->args[1] = n; + + return skb->len; +} + +static struct sk_buff *ctrl_build_family_msg(struct genl_family *family, + u32 pid, int seq, u8 cmd) +{ + struct sk_buff *skb; + int err; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb == NULL) + return ERR_PTR(-ENOBUFS); + + err = ctrl_fill_info(family, pid, seq, 0, skb, cmd); + if (err < 0) { + nlmsg_free(skb); + return ERR_PTR(err); + } + + return skb; +} + +static struct sk_buff *ctrl_build_mcgrp_msg(struct genl_multicast_group *grp, + 
u32 pid, int seq, u8 cmd) +{ + struct sk_buff *skb; + int err; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb == NULL) + return ERR_PTR(-ENOBUFS); + + err = ctrl_fill_mcgrp_info(grp, pid, seq, 0, skb, cmd); + if (err < 0) { + nlmsg_free(skb); + return ERR_PTR(err); + } + + return skb; +} + +static const struct nla_policy ctrl_policy[CTRL_ATTR_MAX+1] = { + [CTRL_ATTR_FAMILY_ID] = { .type = NLA_U16 }, + [CTRL_ATTR_FAMILY_NAME] = { .type = NLA_NUL_STRING, + .len = GENL_NAMSIZ - 1 }, +}; + +static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + struct genl_family *res = NULL; + int err = -EINVAL; + + if (info->attrs[CTRL_ATTR_FAMILY_ID]) { + u16 id = nla_get_u16(info->attrs[CTRL_ATTR_FAMILY_ID]); + res = genl_family_find_byid(id); + err = -ENOENT; + } + + if (info->attrs[CTRL_ATTR_FAMILY_NAME]) { + char *name; + + name = nla_data(info->attrs[CTRL_ATTR_FAMILY_NAME]); + res = genl_family_find_byname(name); + err = -ENOENT; + } + + if (res == NULL) + return err; + + if (!res->netnsok && !net_eq(genl_info_net(info), &init_net)) { + /* family doesn't exist here */ + return -ENOENT; + } + + msg = ctrl_build_family_msg(res, info->snd_pid, info->snd_seq, + CTRL_CMD_NEWFAMILY); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + return genlmsg_reply(msg, info); +} + +static int genl_ctrl_event(int event, void *data) +{ + struct sk_buff *msg; + struct genl_family *family; + struct genl_multicast_group *grp; + + /* genl is still initialising */ + if (!init_net.genl_sock) + return 0; + + switch (event) { + case CTRL_CMD_NEWFAMILY: + case CTRL_CMD_DELFAMILY: + family = data; + msg = ctrl_build_family_msg(family, 0, 0, event); + break; + case CTRL_CMD_NEWMCAST_GRP: + case CTRL_CMD_DELMCAST_GRP: + grp = data; + family = grp->family; + msg = ctrl_build_mcgrp_msg(data, 0, 0, event); + break; + default: + return -EINVAL; + } + + if (IS_ERR(msg)) + return PTR_ERR(msg); + + if (!family->netnsok) { + genlmsg_multicast_netns(&init_net, msg, 0, + GENL_ID_CTRL, GFP_KERNEL); + } else { + rcu_read_lock(); + genlmsg_multicast_allns(msg, 0, GENL_ID_CTRL, GFP_ATOMIC); + rcu_read_unlock(); + } + + return 0; +} + +static struct genl_ops genl_ctrl_ops = { + .cmd = CTRL_CMD_GETFAMILY, + .doit = ctrl_getfamily, + .dumpit = ctrl_dumpfamily, + .policy = ctrl_policy, +}; + +static struct genl_multicast_group notify_grp = { + .name = "notify", +}; + +static int __net_init genl_pernet_init(struct net *net) +{ + /* we'll bump the group number right afterwards */ + net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, 0, + genl_rcv, &genl_mutex, + THIS_MODULE); + + if (!net->genl_sock && net_eq(net, &init_net)) + panic("GENL: Cannot initialize generic netlink\n"); + + if (!net->genl_sock) + return -ENOMEM; + + return 0; +} + +static void __net_exit genl_pernet_exit(struct net *net) +{ + netlink_kernel_release(net->genl_sock); + net->genl_sock = NULL; +} + +static struct pernet_operations genl_pernet_ops = { + .init = genl_pernet_init, + .exit = genl_pernet_exit, +}; + +static int __init genl_init(void) +{ + int i, err; + + for (i = 0; i < GENL_FAM_TAB_SIZE; i++) + INIT_LIST_HEAD(&family_ht[i]); + + err = genl_register_family_with_ops(&genl_ctrl, &genl_ctrl_ops, 1); + if (err < 0) + goto problem; + + netlink_set_nonroot(NETLINK_GENERIC, NL_NONROOT_RECV); + + err = register_pernet_subsys(&genl_pernet_ops); + if (err) + goto problem; + + err = genl_register_mc_group(&genl_ctrl, ¬ify_grp); + if (err < 0) + goto problem; + + return 0; + +problem: + panic("GENL: Cannot register 
controller: %d\n", err); +} + +subsys_initcall(genl_init); + +static int genlmsg_mcast(struct sk_buff *skb, u32 pid, unsigned long group, + gfp_t flags) +{ + struct sk_buff *tmp; + struct net *net, *prev = NULL; + int err; + + for_each_net_rcu(net) { + if (prev) { + tmp = skb_clone(skb, flags); + if (!tmp) { + err = -ENOMEM; + goto error; + } + err = nlmsg_multicast(prev->genl_sock, tmp, + pid, group, flags); + if (err) + goto error; + } + + prev = net; + } + + return nlmsg_multicast(prev->genl_sock, skb, pid, group, flags); + error: + kfree_skb(skb); + return err; +} + +int genlmsg_multicast_allns(struct sk_buff *skb, u32 pid, unsigned int group, + gfp_t flags) +{ + return genlmsg_mcast(skb, pid, group, flags); +} +EXPORT_SYMBOL(genlmsg_multicast_allns); diff --git a/net/netrom/Makefile b/net/netrom/Makefile new file mode 100644 index 00000000..2660f5a1 --- /dev/null +++ b/net/netrom/Makefile @@ -0,0 +1,9 @@ +# +# Makefile for the Linux NET/ROM layer. +# + +obj-$(CONFIG_NETROM) += netrom.o + +netrom-y := af_netrom.o nr_dev.o nr_in.o nr_loopback.o \ + nr_out.o nr_route.o nr_subr.o nr_timer.o +netrom-$(CONFIG_SYSCTL) += sysctl_net_netrom.o diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c new file mode 100644 index 00000000..732152f7 --- /dev/null +++ b/net/netrom/af_netrom.c @@ -0,0 +1,1509 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) + * Copyright Darryl Miles G7LED (dlm@g7led.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For TIOCINQ/OUTQ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int nr_ndevs = 4; + +int sysctl_netrom_default_path_quality = NR_DEFAULT_QUAL; +int sysctl_netrom_obsolescence_count_initialiser = NR_DEFAULT_OBS; +int sysctl_netrom_network_ttl_initialiser = NR_DEFAULT_TTL; +int sysctl_netrom_transport_timeout = NR_DEFAULT_T1; +int sysctl_netrom_transport_maximum_tries = NR_DEFAULT_N2; +int sysctl_netrom_transport_acknowledge_delay = NR_DEFAULT_T2; +int sysctl_netrom_transport_busy_delay = NR_DEFAULT_T4; +int sysctl_netrom_transport_requested_window_size = NR_DEFAULT_WINDOW; +int sysctl_netrom_transport_no_activity_timeout = NR_DEFAULT_IDLE; +int sysctl_netrom_routing_control = NR_DEFAULT_ROUTING; +int sysctl_netrom_link_fails_count = NR_DEFAULT_FAILS; +int sysctl_netrom_reset_circuit = NR_DEFAULT_RESET; + +static unsigned short circuit = 0x101; + +static HLIST_HEAD(nr_list); +static DEFINE_SPINLOCK(nr_list_lock); + +static const struct proto_ops nr_proto_ops; + +/* + * NETROM network devices are virtual network devices encapsulating NETROM + * frames into AX.25 which will be sent through an AX.25 device, so form a + * special "super class" of normal net devices; split their locks off into a + * separate class since they always nest. 
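+ *
+ * Editor's note, not part of the original comment: without the separate
+ * lock_class_key assigned in nr_set_lockdep_key() below, lockdep would see
+ * two netdev _xmit_locks of the same class held at once whenever a NET/ROM
+ * frame is transmitted over an AX.25 device and would report a
+ * false-positive recursive locking warning.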
+ */ +static struct lock_class_key nr_netdev_xmit_lock_key; +static struct lock_class_key nr_netdev_addr_lock_key; + +static void nr_set_lockdep_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &nr_netdev_xmit_lock_key); +} + +static void nr_set_lockdep_key(struct net_device *dev) +{ + lockdep_set_class(&dev->addr_list_lock, &nr_netdev_addr_lock_key); + netdev_for_each_tx_queue(dev, nr_set_lockdep_one, NULL); +} + +/* + * Socket removal during an interrupt is now safe. + */ +static void nr_remove_socket(struct sock *sk) +{ + spin_lock_bh(&nr_list_lock); + sk_del_node_init(sk); + spin_unlock_bh(&nr_list_lock); +} + +/* + * Kill all bound sockets on a dropped device. + */ +static void nr_kill_by_device(struct net_device *dev) +{ + struct sock *s; + struct hlist_node *node; + + spin_lock_bh(&nr_list_lock); + sk_for_each(s, node, &nr_list) + if (nr_sk(s)->device == dev) + nr_disconnect(s, ENETUNREACH); + spin_unlock_bh(&nr_list_lock); +} + +/* + * Handle device status changes. + */ +static int nr_device_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = (struct net_device *)ptr; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + if (event != NETDEV_DOWN) + return NOTIFY_DONE; + + nr_kill_by_device(dev); + nr_rt_device_down(dev); + + return NOTIFY_DONE; +} + +/* + * Add a socket to the bound sockets list. + */ +static void nr_insert_socket(struct sock *sk) +{ + spin_lock_bh(&nr_list_lock); + sk_add_node(sk, &nr_list); + spin_unlock_bh(&nr_list_lock); +} + +/* + * Find a socket that wants to accept the Connect Request we just + * received. + */ +static struct sock *nr_find_listener(ax25_address *addr) +{ + struct sock *s; + struct hlist_node *node; + + spin_lock_bh(&nr_list_lock); + sk_for_each(s, node, &nr_list) + if (!ax25cmp(&nr_sk(s)->source_addr, addr) && + s->sk_state == TCP_LISTEN) { + bh_lock_sock(s); + goto found; + } + s = NULL; +found: + spin_unlock_bh(&nr_list_lock); + return s; +} + +/* + * Find a connected NET/ROM socket given my circuit IDs. + */ +static struct sock *nr_find_socket(unsigned char index, unsigned char id) +{ + struct sock *s; + struct hlist_node *node; + + spin_lock_bh(&nr_list_lock); + sk_for_each(s, node, &nr_list) { + struct nr_sock *nr = nr_sk(s); + + if (nr->my_index == index && nr->my_id == id) { + bh_lock_sock(s); + goto found; + } + } + s = NULL; +found: + spin_unlock_bh(&nr_list_lock); + return s; +} + +/* + * Find a connected NET/ROM socket given their circuit IDs. + */ +static struct sock *nr_find_peer(unsigned char index, unsigned char id, + ax25_address *dest) +{ + struct sock *s; + struct hlist_node *node; + + spin_lock_bh(&nr_list_lock); + sk_for_each(s, node, &nr_list) { + struct nr_sock *nr = nr_sk(s); + + if (nr->your_index == index && nr->your_id == id && + !ax25cmp(&nr->dest_addr, dest)) { + bh_lock_sock(s); + goto found; + } + } + s = NULL; +found: + spin_unlock_bh(&nr_list_lock); + return s; +} + +/* + * Find next free circuit ID. + */ +static unsigned short nr_find_next_circuit(void) +{ + unsigned short id = circuit; + unsigned char i, j; + struct sock *sk; + + for (;;) { + i = id / 256; + j = id % 256; + + if (i != 0 && j != 0) { + if ((sk=nr_find_socket(i, j)) == NULL) + break; + bh_unlock_sock(sk); + } + + id++; + } + + return id; +} + +/* + * Deferred destroy. + */ +void nr_destroy_socket(struct sock *); + +/* + * Handler for deferred kills. 
+ */ +static void nr_destroy_timer(unsigned long data) +{ + struct sock *sk=(struct sock *)data; + bh_lock_sock(sk); + sock_hold(sk); + nr_destroy_socket(sk); + bh_unlock_sock(sk); + sock_put(sk); +} + +/* + * This is called from user mode and the timers. Thus it protects itself + * against interrupt users but doesn't worry about being called during + * work. Once it is removed from the queue no interrupt or bottom half + * will touch it and we are (fairly 8-) ) safe. + */ +void nr_destroy_socket(struct sock *sk) +{ + struct sk_buff *skb; + + nr_remove_socket(sk); + + nr_stop_heartbeat(sk); + nr_stop_t1timer(sk); + nr_stop_t2timer(sk); + nr_stop_t4timer(sk); + nr_stop_idletimer(sk); + + nr_clear_queues(sk); /* Flush the queues */ + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + if (skb->sk != sk) { /* A pending connection */ + /* Queue the unaccepted socket for death */ + sock_set_flag(skb->sk, SOCK_DEAD); + nr_start_heartbeat(skb->sk); + nr_sk(skb->sk)->state = NR_STATE_0; + } + + kfree_skb(skb); + } + + if (sk_has_allocations(sk)) { + /* Defer: outstanding buffers */ + sk->sk_timer.function = nr_destroy_timer; + sk->sk_timer.expires = jiffies + 2 * HZ; + add_timer(&sk->sk_timer); + } else + sock_put(sk); +} + +/* + * Handling for system calls applied via the various interfaces to a + * NET/ROM socket object. + */ + +static int nr_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct nr_sock *nr = nr_sk(sk); + int opt; + + if (level != SOL_NETROM) + return -ENOPROTOOPT; + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(opt, (int __user *)optval)) + return -EFAULT; + + switch (optname) { + case NETROM_T1: + if (opt < 1) + return -EINVAL; + nr->t1 = opt * HZ; + return 0; + + case NETROM_T2: + if (opt < 1) + return -EINVAL; + nr->t2 = opt * HZ; + return 0; + + case NETROM_N2: + if (opt < 1 || opt > 31) + return -EINVAL; + nr->n2 = opt; + return 0; + + case NETROM_T4: + if (opt < 1) + return -EINVAL; + nr->t4 = opt * HZ; + return 0; + + case NETROM_IDLE: + if (opt < 0) + return -EINVAL; + nr->idle = opt * 60 * HZ; + return 0; + + default: + return -ENOPROTOOPT; + } +} + +static int nr_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct nr_sock *nr = nr_sk(sk); + int val = 0; + int len; + + if (level != SOL_NETROM) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + + if (len < 0) + return -EINVAL; + + switch (optname) { + case NETROM_T1: + val = nr->t1 / HZ; + break; + + case NETROM_T2: + val = nr->t2 / HZ; + break; + + case NETROM_N2: + val = nr->n2; + break; + + case NETROM_T4: + val = nr->t4 / HZ; + break; + + case NETROM_IDLE: + val = nr->idle / (60 * HZ); + break; + + default: + return -ENOPROTOOPT; + } + + len = min_t(unsigned int, len, sizeof(int)); + + if (put_user(len, optlen)) + return -EFAULT; + + return copy_to_user(optval, &val, len) ? 
-EFAULT : 0; +} + +static int nr_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + + lock_sock(sk); + if (sk->sk_state != TCP_LISTEN) { + memset(&nr_sk(sk)->user_addr, 0, AX25_ADDR_LEN); + sk->sk_max_ack_backlog = backlog; + sk->sk_state = TCP_LISTEN; + release_sock(sk); + return 0; + } + release_sock(sk); + + return -EOPNOTSUPP; +} + +static struct proto nr_proto = { + .name = "NETROM", + .owner = THIS_MODULE, + .obj_size = sizeof(struct nr_sock), +}; + +static int nr_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + struct nr_sock *nr; + + if (!net_eq(net, &init_net)) + return -EAFNOSUPPORT; + + if (sock->type != SOCK_SEQPACKET || protocol != 0) + return -ESOCKTNOSUPPORT; + + sk = sk_alloc(net, PF_NETROM, GFP_ATOMIC, &nr_proto); + if (sk == NULL) + return -ENOMEM; + + nr = nr_sk(sk); + + sock_init_data(sock, sk); + + sock->ops = &nr_proto_ops; + sk->sk_protocol = protocol; + + skb_queue_head_init(&nr->ack_queue); + skb_queue_head_init(&nr->reseq_queue); + skb_queue_head_init(&nr->frag_queue); + + nr_init_timers(sk); + + nr->t1 = + msecs_to_jiffies(sysctl_netrom_transport_timeout); + nr->t2 = + msecs_to_jiffies(sysctl_netrom_transport_acknowledge_delay); + nr->n2 = + msecs_to_jiffies(sysctl_netrom_transport_maximum_tries); + nr->t4 = + msecs_to_jiffies(sysctl_netrom_transport_busy_delay); + nr->idle = + msecs_to_jiffies(sysctl_netrom_transport_no_activity_timeout); + nr->window = sysctl_netrom_transport_requested_window_size; + + nr->bpqext = 1; + nr->state = NR_STATE_0; + + return 0; +} + +static struct sock *nr_make_new(struct sock *osk) +{ + struct sock *sk; + struct nr_sock *nr, *onr; + + if (osk->sk_type != SOCK_SEQPACKET) + return NULL; + + sk = sk_alloc(sock_net(osk), PF_NETROM, GFP_ATOMIC, osk->sk_prot); + if (sk == NULL) + return NULL; + + nr = nr_sk(sk); + + sock_init_data(NULL, sk); + + sk->sk_type = osk->sk_type; + sk->sk_priority = osk->sk_priority; + sk->sk_protocol = osk->sk_protocol; + sk->sk_rcvbuf = osk->sk_rcvbuf; + sk->sk_sndbuf = osk->sk_sndbuf; + sk->sk_state = TCP_ESTABLISHED; + sock_copy_flags(sk, osk); + + skb_queue_head_init(&nr->ack_queue); + skb_queue_head_init(&nr->reseq_queue); + skb_queue_head_init(&nr->frag_queue); + + nr_init_timers(sk); + + onr = nr_sk(osk); + + nr->t1 = onr->t1; + nr->t2 = onr->t2; + nr->n2 = onr->n2; + nr->t4 = onr->t4; + nr->idle = onr->idle; + nr->window = onr->window; + + nr->device = onr->device; + nr->bpqext = onr->bpqext; + + return sk; +} + +static int nr_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct nr_sock *nr; + + if (sk == NULL) return 0; + + sock_hold(sk); + sock_orphan(sk); + lock_sock(sk); + nr = nr_sk(sk); + + switch (nr->state) { + case NR_STATE_0: + case NR_STATE_1: + case NR_STATE_2: + nr_disconnect(sk, 0); + nr_destroy_socket(sk); + break; + + case NR_STATE_3: + nr_clear_queues(sk); + nr->n2count = 0; + nr_write_internal(sk, NR_DISCREQ); + nr_start_t1timer(sk); + nr_stop_t2timer(sk); + nr_stop_t4timer(sk); + nr_stop_idletimer(sk); + nr->state = NR_STATE_2; + sk->sk_state = TCP_CLOSE; + sk->sk_shutdown |= SEND_SHUTDOWN; + sk->sk_state_change(sk); + sock_set_flag(sk, SOCK_DESTROY); + break; + + default: + break; + } + + sock->sk = NULL; + release_sock(sk); + sock_put(sk); + + return 0; +} + +static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct nr_sock *nr = nr_sk(sk); + struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr; + struct net_device 
*dev; + ax25_uid_assoc *user; + ax25_address *source; + + lock_sock(sk); + if (!sock_flag(sk, SOCK_ZAPPED)) { + release_sock(sk); + return -EINVAL; + } + if (addr_len < sizeof(struct sockaddr_ax25) || addr_len > sizeof(struct full_sockaddr_ax25)) { + release_sock(sk); + return -EINVAL; + } + if (addr_len < (addr->fsa_ax25.sax25_ndigis * sizeof(ax25_address) + sizeof(struct sockaddr_ax25))) { + release_sock(sk); + return -EINVAL; + } + if (addr->fsa_ax25.sax25_family != AF_NETROM) { + release_sock(sk); + return -EINVAL; + } + if ((dev = nr_dev_get(&addr->fsa_ax25.sax25_call)) == NULL) { + release_sock(sk); + return -EADDRNOTAVAIL; + } + + /* + * Only the super user can set an arbitrary user callsign. + */ + if (addr->fsa_ax25.sax25_ndigis == 1) { + if (!capable(CAP_NET_BIND_SERVICE)) { + dev_put(dev); + release_sock(sk); + return -EACCES; + } + nr->user_addr = addr->fsa_digipeater[0]; + nr->source_addr = addr->fsa_ax25.sax25_call; + } else { + source = &addr->fsa_ax25.sax25_call; + + user = ax25_findbyuid(current_euid()); + if (user) { + nr->user_addr = user->call; + ax25_uid_put(user); + } else { + if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) { + release_sock(sk); + dev_put(dev); + return -EPERM; + } + nr->user_addr = *source; + } + + nr->source_addr = *source; + } + + nr->device = dev; + nr_insert_socket(sk); + + sock_reset_flag(sk, SOCK_ZAPPED); + dev_put(dev); + release_sock(sk); + + return 0; +} + +static int nr_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + struct nr_sock *nr = nr_sk(sk); + struct sockaddr_ax25 *addr = (struct sockaddr_ax25 *)uaddr; + ax25_address *source = NULL; + ax25_uid_assoc *user; + struct net_device *dev; + int err = 0; + + lock_sock(sk); + if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { + sock->state = SS_CONNECTED; + goto out_release; /* Connect completed during a ERESTARTSYS event */ + } + + if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) { + sock->state = SS_UNCONNECTED; + err = -ECONNREFUSED; + goto out_release; + } + + if (sk->sk_state == TCP_ESTABLISHED) { + err = -EISCONN; /* No reconnect on a seqpacket socket */ + goto out_release; + } + + sk->sk_state = TCP_CLOSE; + sock->state = SS_UNCONNECTED; + + if (addr_len != sizeof(struct sockaddr_ax25) && addr_len != sizeof(struct full_sockaddr_ax25)) { + err = -EINVAL; + goto out_release; + } + if (addr->sax25_family != AF_NETROM) { + err = -EINVAL; + goto out_release; + } + if (sock_flag(sk, SOCK_ZAPPED)) { /* Must bind first - autobinding in this may or may not work */ + sock_reset_flag(sk, SOCK_ZAPPED); + + if ((dev = nr_dev_first()) == NULL) { + err = -ENETUNREACH; + goto out_release; + } + source = (ax25_address *)dev->dev_addr; + + user = ax25_findbyuid(current_euid()); + if (user) { + nr->user_addr = user->call; + ax25_uid_put(user); + } else { + if (ax25_uid_policy && !capable(CAP_NET_ADMIN)) { + dev_put(dev); + err = -EPERM; + goto out_release; + } + nr->user_addr = *source; + } + + nr->source_addr = *source; + nr->device = dev; + + dev_put(dev); + nr_insert_socket(sk); /* Finish the bind */ + } + + nr->dest_addr = addr->sax25_call; + + release_sock(sk); + circuit = nr_find_next_circuit(); + lock_sock(sk); + + nr->my_index = circuit / 256; + nr->my_id = circuit % 256; + + circuit++; + + /* Move to connecting socket, start sending Connect Requests */ + sock->state = SS_CONNECTING; + sk->sk_state = TCP_SYN_SENT; + + nr_establish_data_link(sk); + + nr->state = NR_STATE_1; + + 
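+	/*
+	 * Editor's note, not part of the original patch: the circuit number
+	 * chosen above has been split into my_index/my_id (high/low byte)
+	 * and nr_establish_data_link() has started sending Connect Requests.
+	 * From here on this behaves like an ordinary connecting socket:
+	 * start the heartbeat, return -EINPROGRESS for non-blocking callers,
+	 * otherwise sleep until a Connect Ack, choke or timeout moves
+	 * sk_state away from TCP_SYN_SENT.
+	 */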
nr_start_heartbeat(sk); + + /* Now the loop */ + if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) { + err = -EINPROGRESS; + goto out_release; + } + + /* + * A Connect Ack with Choke or timeout or failed routing will go to + * closed. + */ + if (sk->sk_state == TCP_SYN_SENT) { + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + if (sk->sk_state != TCP_SYN_SENT) + break; + if (!signal_pending(current)) { + release_sock(sk); + schedule(); + lock_sock(sk); + continue; + } + err = -ERESTARTSYS; + break; + } + finish_wait(sk_sleep(sk), &wait); + if (err) + goto out_release; + } + + if (sk->sk_state != TCP_ESTABLISHED) { + sock->state = SS_UNCONNECTED; + err = sock_error(sk); /* Always set at this point */ + goto out_release; + } + + sock->state = SS_CONNECTED; + +out_release: + release_sock(sk); + + return err; +} + +static int nr_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct sk_buff *skb; + struct sock *newsk; + DEFINE_WAIT(wait); + struct sock *sk; + int err = 0; + + if ((sk = sock->sk) == NULL) + return -EINVAL; + + lock_sock(sk); + if (sk->sk_type != SOCK_SEQPACKET) { + err = -EOPNOTSUPP; + goto out_release; + } + + if (sk->sk_state != TCP_LISTEN) { + err = -EINVAL; + goto out_release; + } + + /* + * The write queue this time is holding sockets ready to use + * hooked into the SABM we saved + */ + for (;;) { + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + skb = skb_dequeue(&sk->sk_receive_queue); + if (skb) + break; + + if (flags & O_NONBLOCK) { + err = -EWOULDBLOCK; + break; + } + if (!signal_pending(current)) { + release_sock(sk); + schedule(); + lock_sock(sk); + continue; + } + err = -ERESTARTSYS; + break; + } + finish_wait(sk_sleep(sk), &wait); + if (err) + goto out_release; + + newsk = skb->sk; + sock_graft(newsk, newsock); + + /* Now attach up the new socket */ + kfree_skb(skb); + sk_acceptq_removed(sk); + +out_release: + release_sock(sk); + + return err; +} + +static int nr_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct full_sockaddr_ax25 *sax = (struct full_sockaddr_ax25 *)uaddr; + struct sock *sk = sock->sk; + struct nr_sock *nr = nr_sk(sk); + + lock_sock(sk); + if (peer != 0) { + if (sk->sk_state != TCP_ESTABLISHED) { + release_sock(sk); + return -ENOTCONN; + } + sax->fsa_ax25.sax25_family = AF_NETROM; + sax->fsa_ax25.sax25_ndigis = 1; + sax->fsa_ax25.sax25_call = nr->user_addr; + memset(sax->fsa_digipeater, 0, sizeof(sax->fsa_digipeater)); + sax->fsa_digipeater[0] = nr->dest_addr; + *uaddr_len = sizeof(struct full_sockaddr_ax25); + } else { + sax->fsa_ax25.sax25_family = AF_NETROM; + sax->fsa_ax25.sax25_ndigis = 0; + sax->fsa_ax25.sax25_call = nr->source_addr; + *uaddr_len = sizeof(struct sockaddr_ax25); + } + release_sock(sk); + + return 0; +} + +int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) +{ + struct sock *sk; + struct sock *make; + struct nr_sock *nr_make; + ax25_address *src, *dest, *user; + unsigned short circuit_index, circuit_id; + unsigned short peer_circuit_index, peer_circuit_id; + unsigned short frametype, flags, window, timeout; + int ret; + + skb->sk = NULL; /* Initially we don't know who it's for */ + + /* + * skb->data points to the netrom frame start + */ + + src = (ax25_address *)(skb->data + 0); + dest = (ax25_address *)(skb->data + 7); + + circuit_index = skb->data[15]; + circuit_id = skb->data[16]; + peer_circuit_index = skb->data[17]; + peer_circuit_id = skb->data[18]; + frametype = skb->data[19] 
& 0x0F; + flags = skb->data[19] & 0xF0; + + /* + * Check for an incoming IP over NET/ROM frame. + */ + if (frametype == NR_PROTOEXT && + circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) { + skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN); + skb_reset_transport_header(skb); + + return nr_rx_ip(skb, dev); + } + + /* + * Find an existing socket connection, based on circuit ID, if it's + * a Connect Request base it on their circuit ID. + * + * Circuit ID 0/0 is not valid but it could still be a "reset" for a + * circuit that no longer exists at the other end ... + */ + + sk = NULL; + + if (circuit_index == 0 && circuit_id == 0) { + if (frametype == NR_CONNACK && flags == NR_CHOKE_FLAG) + sk = nr_find_peer(peer_circuit_index, peer_circuit_id, src); + } else { + if (frametype == NR_CONNREQ) + sk = nr_find_peer(circuit_index, circuit_id, src); + else + sk = nr_find_socket(circuit_index, circuit_id); + } + + if (sk != NULL) { + skb_reset_transport_header(skb); + + if (frametype == NR_CONNACK && skb->len == 22) + nr_sk(sk)->bpqext = 1; + else + nr_sk(sk)->bpqext = 0; + + ret = nr_process_rx_frame(sk, skb); + bh_unlock_sock(sk); + return ret; + } + + /* + * Now it should be a CONNREQ. + */ + if (frametype != NR_CONNREQ) { + /* + * Here it would be nice to be able to send a reset but + * NET/ROM doesn't have one. We've tried to extend the protocol + * by sending NR_CONNACK | NR_CHOKE_FLAGS replies but that + * apparently kills BPQ boxes... :-( + * So now we try to follow the established behaviour of + * G8PZT's Xrouter which is sending packets with command type 7 + * as an extension of the protocol. + */ + if (sysctl_netrom_reset_circuit && + (frametype != NR_RESET || flags != 0)) + nr_transmit_reset(skb, 1); + + return 0; + } + + sk = nr_find_listener(dest); + + user = (ax25_address *)(skb->data + 21); + + if (sk == NULL || sk_acceptq_is_full(sk) || + (make = nr_make_new(sk)) == NULL) { + nr_transmit_refusal(skb, 0); + if (sk) + bh_unlock_sock(sk); + return 0; + } + + window = skb->data[20]; + + skb->sk = make; + make->sk_state = TCP_ESTABLISHED; + + /* Fill in his circuit details */ + nr_make = nr_sk(make); + nr_make->source_addr = *dest; + nr_make->dest_addr = *src; + nr_make->user_addr = *user; + + nr_make->your_index = circuit_index; + nr_make->your_id = circuit_id; + + bh_unlock_sock(sk); + circuit = nr_find_next_circuit(); + bh_lock_sock(sk); + + nr_make->my_index = circuit / 256; + nr_make->my_id = circuit % 256; + + circuit++; + + /* Window negotiation */ + if (window < nr_make->window) + nr_make->window = window; + + /* L4 timeout negotiation */ + if (skb->len == 37) { + timeout = skb->data[36] * 256 + skb->data[35]; + if (timeout * HZ < nr_make->t1) + nr_make->t1 = timeout * HZ; + nr_make->bpqext = 1; + } else { + nr_make->bpqext = 0; + } + + nr_write_internal(make, NR_CONNACK); + + nr_make->condition = 0x00; + nr_make->vs = 0; + nr_make->va = 0; + nr_make->vr = 0; + nr_make->vl = 0; + nr_make->state = NR_STATE_3; + sk_acceptq_added(sk); + skb_queue_head(&sk->sk_receive_queue, skb); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, skb->len); + + bh_unlock_sock(sk); + + nr_insert_socket(make); + + nr_start_heartbeat(make); + nr_start_idletimer(make); + + return 1; +} + +static int nr_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct nr_sock *nr = nr_sk(sk); + struct sockaddr_ax25 *usax = (struct sockaddr_ax25 *)msg->msg_name; + int err; + struct sockaddr_ax25 sax; + struct sk_buff *skb; + 
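+	/*
+	 * Editor's note, not part of the original patch: the transport
+	 * header pushed further down is five bytes -- the peer's circuit
+	 * index and id, two bytes left as zero to be filled in later by the
+	 * output path (presumably the send/receive sequence numbers), and
+	 * the NR_INFO opcode -- followed by at most 65536 bytes of user
+	 * data copied in with memcpy_fromiovec().
+	 */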
unsigned char *asmptr; + int size; + + if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT)) + return -EINVAL; + + lock_sock(sk); + if (sock_flag(sk, SOCK_ZAPPED)) { + err = -EADDRNOTAVAIL; + goto out; + } + + if (sk->sk_shutdown & SEND_SHUTDOWN) { + send_sig(SIGPIPE, current, 0); + err = -EPIPE; + goto out; + } + + if (nr->device == NULL) { + err = -ENETUNREACH; + goto out; + } + + if (usax) { + if (msg->msg_namelen < sizeof(sax)) { + err = -EINVAL; + goto out; + } + sax = *usax; + if (ax25cmp(&nr->dest_addr, &sax.sax25_call) != 0) { + err = -EISCONN; + goto out; + } + if (sax.sax25_family != AF_NETROM) { + err = -EINVAL; + goto out; + } + } else { + if (sk->sk_state != TCP_ESTABLISHED) { + err = -ENOTCONN; + goto out; + } + sax.sax25_family = AF_NETROM; + sax.sax25_call = nr->dest_addr; + } + + /* Build a packet - the conventional user limit is 236 bytes. We can + do ludicrously large NetROM frames but must not overflow */ + if (len > 65536) { + err = -EMSGSIZE; + goto out; + } + + size = len + NR_NETWORK_LEN + NR_TRANSPORT_LEN; + + if ((skb = sock_alloc_send_skb(sk, size, msg->msg_flags & MSG_DONTWAIT, &err)) == NULL) + goto out; + + skb_reserve(skb, size - len); + skb_reset_transport_header(skb); + + /* + * Push down the NET/ROM header + */ + + asmptr = skb_push(skb, NR_TRANSPORT_LEN); + + /* Build a NET/ROM Transport header */ + + *asmptr++ = nr->your_index; + *asmptr++ = nr->your_id; + *asmptr++ = 0; /* To be filled in later */ + *asmptr++ = 0; /* Ditto */ + *asmptr++ = NR_INFO; + + /* + * Put the data on the end + */ + skb_put(skb, len); + + /* User data follows immediately after the NET/ROM transport header */ + if (memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len)) { + kfree_skb(skb); + err = -EFAULT; + goto out; + } + + if (sk->sk_state != TCP_ESTABLISHED) { + kfree_skb(skb); + err = -ENOTCONN; + goto out; + } + + nr_output(sk, skb); /* Shove it onto the queue */ + + err = len; +out: + release_sock(sk); + return err; +} + +static int nr_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct sockaddr_ax25 *sax = (struct sockaddr_ax25 *)msg->msg_name; + size_t copied; + struct sk_buff *skb; + int er; + + /* + * This works for seqpacket too. The receiver has ordered the queue for + * us! 
We do one quick check first though + */ + + lock_sock(sk); + if (sk->sk_state != TCP_ESTABLISHED) { + release_sock(sk); + return -ENOTCONN; + } + + /* Now we can treat all alike */ + if ((skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, flags & MSG_DONTWAIT, &er)) == NULL) { + release_sock(sk); + return er; + } + + skb_reset_transport_header(skb); + copied = skb->len; + + if (copied > size) { + copied = size; + msg->msg_flags |= MSG_TRUNC; + } + + skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + + if (sax != NULL) { + sax->sax25_family = AF_NETROM; + skb_copy_from_linear_data_offset(skb, 7, sax->sax25_call.ax25_call, + AX25_ADDR_LEN); + } + + msg->msg_namelen = sizeof(*sax); + + skb_free_datagram(sk, skb); + + release_sock(sk); + return copied; +} + + +static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + void __user *argp = (void __user *)arg; + int ret; + + switch (cmd) { + case TIOCOUTQ: { + long amount; + + lock_sock(sk); + amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); + if (amount < 0) + amount = 0; + release_sock(sk); + return put_user(amount, (int __user *)argp); + } + + case TIOCINQ: { + struct sk_buff *skb; + long amount = 0L; + + lock_sock(sk); + /* These two are safe on a single CPU system as only user tasks fiddle here */ + if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) + amount = skb->len; + release_sock(sk); + return put_user(amount, (int __user *)argp); + } + + case SIOCGSTAMP: + lock_sock(sk); + ret = sock_get_timestamp(sk, argp); + release_sock(sk); + return ret; + + case SIOCGSTAMPNS: + lock_sock(sk); + ret = sock_get_timestampns(sk, argp); + release_sock(sk); + return ret; + + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFMETRIC: + case SIOCSIFMETRIC: + return -EINVAL; + + case SIOCADDRT: + case SIOCDELRT: + case SIOCNRDECOBS: + if (!capable(CAP_NET_ADMIN)) return -EPERM; + return nr_rt_ioctl(cmd, argp); + + default: + return -ENOIOCTLCMD; + } + + return 0; +} + +#ifdef CONFIG_PROC_FS + +static void *nr_info_start(struct seq_file *seq, loff_t *pos) +{ + spin_lock_bh(&nr_list_lock); + return seq_hlist_start_head(&nr_list, *pos); +} + +static void *nr_info_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return seq_hlist_next(v, &nr_list, pos); +} + +static void nr_info_stop(struct seq_file *seq, void *v) +{ + spin_unlock_bh(&nr_list_lock); +} + +static int nr_info_show(struct seq_file *seq, void *v) +{ + struct sock *s = sk_entry(v); + struct net_device *dev; + struct nr_sock *nr; + const char *devname; + char buf[11]; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, +"user_addr dest_node src_node dev my your st vs vr va t1 t2 t4 idle n2 wnd Snd-Q Rcv-Q inode\n"); + + else { + + bh_lock_sock(s); + nr = nr_sk(s); + + if ((dev = nr->device) == NULL) + devname = "???"; + else + devname = dev->name; + + seq_printf(seq, "%-9s ", ax2asc(buf, &nr->user_addr)); + seq_printf(seq, "%-9s ", ax2asc(buf, &nr->dest_addr)); + seq_printf(seq, +"%-9s %-3s %02X/%02X %02X/%02X %2d %3d %3d %3d %3lu/%03lu %2lu/%02lu %3lu/%03lu %3lu/%03lu %2d/%02d %3d %5d %5d %ld\n", + ax2asc(buf, &nr->source_addr), + devname, + nr->my_index, + nr->my_id, + nr->your_index, + nr->your_id, + nr->state, + nr->vs, + nr->vr, + nr->va, + ax25_display_timer(&nr->t1timer) / HZ, + nr->t1 / HZ, + ax25_display_timer(&nr->t2timer) / HZ, + nr->t2 / HZ, + ax25_display_timer(&nr->t4timer) / HZ, + nr->t4 / 
HZ, + ax25_display_timer(&nr->idletimer) / (60 * HZ), + nr->idle / (60 * HZ), + nr->n2count, + nr->n2, + nr->window, + sk_wmem_alloc_get(s), + sk_rmem_alloc_get(s), + s->sk_socket ? SOCK_INODE(s->sk_socket)->i_ino : 0L); + + bh_unlock_sock(s); + } + return 0; +} + +static const struct seq_operations nr_info_seqops = { + .start = nr_info_start, + .next = nr_info_next, + .stop = nr_info_stop, + .show = nr_info_show, +}; + +static int nr_info_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &nr_info_seqops); +} + +static const struct file_operations nr_info_fops = { + .owner = THIS_MODULE, + .open = nr_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif /* CONFIG_PROC_FS */ + +static const struct net_proto_family nr_family_ops = { + .family = PF_NETROM, + .create = nr_create, + .owner = THIS_MODULE, +}; + +static const struct proto_ops nr_proto_ops = { + .family = PF_NETROM, + .owner = THIS_MODULE, + .release = nr_release, + .bind = nr_bind, + .connect = nr_connect, + .socketpair = sock_no_socketpair, + .accept = nr_accept, + .getname = nr_getname, + .poll = datagram_poll, + .ioctl = nr_ioctl, + .listen = nr_listen, + .shutdown = sock_no_shutdown, + .setsockopt = nr_setsockopt, + .getsockopt = nr_getsockopt, + .sendmsg = nr_sendmsg, + .recvmsg = nr_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct notifier_block nr_dev_notifier = { + .notifier_call = nr_device_event, +}; + +static struct net_device **dev_nr; + +static struct ax25_protocol nr_pid = { + .pid = AX25_P_NETROM, + .func = nr_route_frame +}; + +static struct ax25_linkfail nr_linkfail_notifier = { + .func = nr_link_failed, +}; + +static int __init nr_proto_init(void) +{ + int i; + int rc = proto_register(&nr_proto, 0); + + if (rc != 0) + goto out; + + if (nr_ndevs > 0x7fffffff/sizeof(struct net_device *)) { + printk(KERN_ERR "NET/ROM: nr_proto_init - nr_ndevs parameter to large\n"); + return -1; + } + + dev_nr = kzalloc(nr_ndevs * sizeof(struct net_device *), GFP_KERNEL); + if (dev_nr == NULL) { + printk(KERN_ERR "NET/ROM: nr_proto_init - unable to allocate device array\n"); + return -1; + } + + for (i = 0; i < nr_ndevs; i++) { + char name[IFNAMSIZ]; + struct net_device *dev; + + sprintf(name, "nr%d", i); + dev = alloc_netdev(0, name, nr_setup); + if (!dev) { + printk(KERN_ERR "NET/ROM: nr_proto_init - unable to allocate device structure\n"); + goto fail; + } + + dev->base_addr = i; + if (register_netdev(dev)) { + printk(KERN_ERR "NET/ROM: nr_proto_init - unable to register network device\n"); + free_netdev(dev); + goto fail; + } + nr_set_lockdep_key(dev); + dev_nr[i] = dev; + } + + if (sock_register(&nr_family_ops)) { + printk(KERN_ERR "NET/ROM: nr_proto_init - unable to register socket family\n"); + goto fail; + } + + register_netdevice_notifier(&nr_dev_notifier); + + ax25_register_pid(&nr_pid); + ax25_linkfail_register(&nr_linkfail_notifier); + +#ifdef CONFIG_SYSCTL + nr_register_sysctl(); +#endif + + nr_loopback_init(); + + proc_net_fops_create(&init_net, "nr", S_IRUGO, &nr_info_fops); + proc_net_fops_create(&init_net, "nr_neigh", S_IRUGO, &nr_neigh_fops); + proc_net_fops_create(&init_net, "nr_nodes", S_IRUGO, &nr_nodes_fops); +out: + return rc; +fail: + while (--i >= 0) { + unregister_netdev(dev_nr[i]); + free_netdev(dev_nr[i]); + } + kfree(dev_nr); + proto_unregister(&nr_proto); + rc = -1; + goto out; +} + +module_init(nr_proto_init); + +module_param(nr_ndevs, int, 0); +MODULE_PARM_DESC(nr_ndevs, "number of NET/ROM devices"); + 
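+/*
+ * Editor's note, not part of the original patch: nr_proto_init() above
+ * registers the virtual interfaces as "nr0" ... "nr<nr_ndevs-1>" (four by
+ * default).  A hypothetical invocation requesting more of them, when the
+ * code is built as a module, would be:
+ *
+ *	modprobe netrom nr_ndevs=8
+ */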
+MODULE_AUTHOR("Jonathan Naylor G4KLX "); +MODULE_DESCRIPTION("The amateur radio NET/ROM network and transport layer protocol"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_NETROM); + +static void __exit nr_exit(void) +{ + int i; + + proc_net_remove(&init_net, "nr"); + proc_net_remove(&init_net, "nr_neigh"); + proc_net_remove(&init_net, "nr_nodes"); + nr_loopback_clear(); + + nr_rt_free(); + +#ifdef CONFIG_SYSCTL + nr_unregister_sysctl(); +#endif + + ax25_linkfail_release(&nr_linkfail_notifier); + ax25_protocol_release(AX25_P_NETROM); + + unregister_netdevice_notifier(&nr_dev_notifier); + + sock_unregister(PF_NETROM); + + for (i = 0; i < nr_ndevs; i++) { + struct net_device *dev = dev_nr[i]; + if (dev) { + unregister_netdev(dev); + free_netdev(dev); + } + } + + kfree(dev_nr); + proto_unregister(&nr_proto); +} +module_exit(nr_exit); diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c new file mode 100644 index 00000000..64e6dde9 --- /dev/null +++ b/net/netrom/nr_dev.c @@ -0,0 +1,213 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For the statistics structure. */ +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +/* + * Only allow IP over NET/ROM frames through if the netrom device is up. + */ + +int nr_rx_ip(struct sk_buff *skb, struct net_device *dev) +{ + struct net_device_stats *stats = &dev->stats; + + if (!netif_running(dev)) { + stats->rx_dropped++; + return 0; + } + + stats->rx_packets++; + stats->rx_bytes += skb->len; + + skb->protocol = htons(ETH_P_IP); + + /* Spoof incoming device */ + skb->dev = dev; + skb->mac_header = skb->network_header; + skb_reset_network_header(skb); + skb->pkt_type = PACKET_HOST; + + netif_rx(skb); + + return 1; +} + +#ifdef CONFIG_INET + +static int nr_rebuild_header(struct sk_buff *skb) +{ + unsigned char *bp = skb->data; + + if (arp_find(bp + 7, skb)) + return 1; + + bp[6] &= ~AX25_CBIT; + bp[6] &= ~AX25_EBIT; + bp[6] |= AX25_SSSID_SPARE; + bp += AX25_ADDR_LEN; + + bp[6] &= ~AX25_CBIT; + bp[6] |= AX25_EBIT; + bp[6] |= AX25_SSSID_SPARE; + + return 0; +} + +#else + +static int nr_rebuild_header(struct sk_buff *skb) +{ + return 1; +} + +#endif + +static int nr_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, + const void *daddr, const void *saddr, unsigned len) +{ + unsigned char *buff = skb_push(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN); + + memcpy(buff, (saddr != NULL) ? 
saddr : dev->dev_addr, dev->addr_len); + buff[6] &= ~AX25_CBIT; + buff[6] &= ~AX25_EBIT; + buff[6] |= AX25_SSSID_SPARE; + buff += AX25_ADDR_LEN; + + if (daddr != NULL) + memcpy(buff, daddr, dev->addr_len); + buff[6] &= ~AX25_CBIT; + buff[6] |= AX25_EBIT; + buff[6] |= AX25_SSSID_SPARE; + buff += AX25_ADDR_LEN; + + *buff++ = sysctl_netrom_network_ttl_initialiser; + + *buff++ = NR_PROTO_IP; + *buff++ = NR_PROTO_IP; + *buff++ = 0; + *buff++ = 0; + *buff++ = NR_PROTOEXT; + + if (daddr != NULL) + return 37; + + return -37; +} + +static int __must_check nr_set_mac_address(struct net_device *dev, void *addr) +{ + struct sockaddr *sa = addr; + int err; + + if (!memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) + return 0; + + if (dev->flags & IFF_UP) { + err = ax25_listen_register((ax25_address *)sa->sa_data, NULL); + if (err) + return err; + + ax25_listen_release((ax25_address *)dev->dev_addr, NULL); + } + + memcpy(dev->dev_addr, sa->sa_data, dev->addr_len); + + return 0; +} + +static int nr_open(struct net_device *dev) +{ + int err; + + err = ax25_listen_register((ax25_address *)dev->dev_addr, NULL); + if (err) + return err; + + netif_start_queue(dev); + + return 0; +} + +static int nr_close(struct net_device *dev) +{ + ax25_listen_release((ax25_address *)dev->dev_addr, NULL); + netif_stop_queue(dev); + return 0; +} + +static netdev_tx_t nr_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct net_device_stats *stats = &dev->stats; + unsigned int len = skb->len; + + if (!nr_route_frame(skb, NULL)) { + kfree_skb(skb); + stats->tx_errors++; + return NETDEV_TX_OK; + } + + stats->tx_packets++; + stats->tx_bytes += len; + + return NETDEV_TX_OK; +} + +static const struct header_ops nr_header_ops = { + .create = nr_header, + .rebuild= nr_rebuild_header, +}; + +static const struct net_device_ops nr_netdev_ops = { + .ndo_open = nr_open, + .ndo_stop = nr_close, + .ndo_start_xmit = nr_xmit, + .ndo_set_mac_address = nr_set_mac_address, +}; + +void nr_setup(struct net_device *dev) +{ + dev->mtu = NR_MAX_PACKET_SIZE; + dev->netdev_ops = &nr_netdev_ops; + dev->header_ops = &nr_header_ops; + dev->hard_header_len = NR_NETWORK_LEN + NR_TRANSPORT_LEN; + dev->addr_len = AX25_ADDR_LEN; + dev->type = ARPHRD_NETROM; + + /* New-style flags. */ + dev->flags = IFF_NOARP; +} diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c new file mode 100644 index 00000000..6d4ef6d6 --- /dev/null +++ b/net/netrom/nr_in.c @@ -0,0 +1,306 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright Darryl Miles G7LED (dlm@g7led.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int nr_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more) +{ + struct sk_buff *skbo, *skbn = skb; + struct nr_sock *nr = nr_sk(sk); + + skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN); + + nr_start_idletimer(sk); + + if (more) { + nr->fraglen += skb->len; + skb_queue_tail(&nr->frag_queue, skb); + return 0; + } + + if (!more && nr->fraglen > 0) { /* End of fragment */ + nr->fraglen += skb->len; + skb_queue_tail(&nr->frag_queue, skb); + + if ((skbn = alloc_skb(nr->fraglen, GFP_ATOMIC)) == NULL) + return 1; + + skb_reset_transport_header(skbn); + + while ((skbo = skb_dequeue(&nr->frag_queue)) != NULL) { + skb_copy_from_linear_data(skbo, + skb_put(skbn, skbo->len), + skbo->len); + kfree_skb(skbo); + } + + nr->fraglen = 0; + } + + return sock_queue_rcv_skb(sk, skbn); +} + +/* + * State machine for state 1, Awaiting Connection State. + * The handling of the timer(s) is in file nr_timer.c. + * Handling of state 0 and connection release is in netrom.c. + */ +static int nr_state1_machine(struct sock *sk, struct sk_buff *skb, + int frametype) +{ + switch (frametype) { + case NR_CONNACK: { + struct nr_sock *nr = nr_sk(sk); + + nr_stop_t1timer(sk); + nr_start_idletimer(sk); + nr->your_index = skb->data[17]; + nr->your_id = skb->data[18]; + nr->vs = 0; + nr->va = 0; + nr->vr = 0; + nr->vl = 0; + nr->state = NR_STATE_3; + nr->n2count = 0; + nr->window = skb->data[20]; + sk->sk_state = TCP_ESTABLISHED; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + break; + } + + case NR_CONNACK | NR_CHOKE_FLAG: + nr_disconnect(sk, ECONNREFUSED); + break; + + case NR_RESET: + if (sysctl_netrom_reset_circuit) + nr_disconnect(sk, ECONNRESET); + break; + + default: + break; + } + return 0; +} + +/* + * State machine for state 2, Awaiting Release State. + * The handling of the timer(s) is in file nr_timer.c + * Handling of state 0 and connection release is in netrom.c. + */ +static int nr_state2_machine(struct sock *sk, struct sk_buff *skb, + int frametype) +{ + switch (frametype) { + case NR_CONNACK | NR_CHOKE_FLAG: + nr_disconnect(sk, ECONNRESET); + break; + + case NR_DISCREQ: + nr_write_internal(sk, NR_DISCACK); + + case NR_DISCACK: + nr_disconnect(sk, 0); + break; + + case NR_RESET: + if (sysctl_netrom_reset_circuit) + nr_disconnect(sk, ECONNRESET); + break; + + default: + break; + } + return 0; +} + +/* + * State machine for state 3, Connected State. + * The handling of the timer(s) is in file nr_timer.c + * Handling of state 0 and connection release is in netrom.c. 
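+ * Frames here carry N(S) in byte 17 and N(R) in byte 18 of the transport
+ * header; NR_CHOKE_FLAG drives the peer-busy condition (and the T4 timer),
+ * while an incoming NR_NAK_FLAG acknowledges up to N(R) and causes the
+ * oldest unacknowledged I-frame to be resent via nr_send_nak_frame().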
+ */ +static int nr_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype) +{ + struct nr_sock *nrom = nr_sk(sk); + struct sk_buff_head temp_queue; + struct sk_buff *skbn; + unsigned short save_vr; + unsigned short nr, ns; + int queued = 0; + + nr = skb->data[18]; + ns = skb->data[17]; + + switch (frametype) { + case NR_CONNREQ: + nr_write_internal(sk, NR_CONNACK); + break; + + case NR_DISCREQ: + nr_write_internal(sk, NR_DISCACK); + nr_disconnect(sk, 0); + break; + + case NR_CONNACK | NR_CHOKE_FLAG: + case NR_DISCACK: + nr_disconnect(sk, ECONNRESET); + break; + + case NR_INFOACK: + case NR_INFOACK | NR_CHOKE_FLAG: + case NR_INFOACK | NR_NAK_FLAG: + case NR_INFOACK | NR_NAK_FLAG | NR_CHOKE_FLAG: + if (frametype & NR_CHOKE_FLAG) { + nrom->condition |= NR_COND_PEER_RX_BUSY; + nr_start_t4timer(sk); + } else { + nrom->condition &= ~NR_COND_PEER_RX_BUSY; + nr_stop_t4timer(sk); + } + if (!nr_validate_nr(sk, nr)) { + break; + } + if (frametype & NR_NAK_FLAG) { + nr_frames_acked(sk, nr); + nr_send_nak_frame(sk); + } else { + if (nrom->condition & NR_COND_PEER_RX_BUSY) { + nr_frames_acked(sk, nr); + } else { + nr_check_iframes_acked(sk, nr); + } + } + break; + + case NR_INFO: + case NR_INFO | NR_NAK_FLAG: + case NR_INFO | NR_CHOKE_FLAG: + case NR_INFO | NR_MORE_FLAG: + case NR_INFO | NR_NAK_FLAG | NR_CHOKE_FLAG: + case NR_INFO | NR_CHOKE_FLAG | NR_MORE_FLAG: + case NR_INFO | NR_NAK_FLAG | NR_MORE_FLAG: + case NR_INFO | NR_NAK_FLAG | NR_CHOKE_FLAG | NR_MORE_FLAG: + if (frametype & NR_CHOKE_FLAG) { + nrom->condition |= NR_COND_PEER_RX_BUSY; + nr_start_t4timer(sk); + } else { + nrom->condition &= ~NR_COND_PEER_RX_BUSY; + nr_stop_t4timer(sk); + } + if (nr_validate_nr(sk, nr)) { + if (frametype & NR_NAK_FLAG) { + nr_frames_acked(sk, nr); + nr_send_nak_frame(sk); + } else { + if (nrom->condition & NR_COND_PEER_RX_BUSY) { + nr_frames_acked(sk, nr); + } else { + nr_check_iframes_acked(sk, nr); + } + } + } + queued = 1; + skb_queue_head(&nrom->reseq_queue, skb); + if (nrom->condition & NR_COND_OWN_RX_BUSY) + break; + skb_queue_head_init(&temp_queue); + do { + save_vr = nrom->vr; + while ((skbn = skb_dequeue(&nrom->reseq_queue)) != NULL) { + ns = skbn->data[17]; + if (ns == nrom->vr) { + if (nr_queue_rx_frame(sk, skbn, frametype & NR_MORE_FLAG) == 0) { + nrom->vr = (nrom->vr + 1) % NR_MODULUS; + } else { + nrom->condition |= NR_COND_OWN_RX_BUSY; + skb_queue_tail(&temp_queue, skbn); + } + } else if (nr_in_rx_window(sk, ns)) { + skb_queue_tail(&temp_queue, skbn); + } else { + kfree_skb(skbn); + } + } + while ((skbn = skb_dequeue(&temp_queue)) != NULL) { + skb_queue_tail(&nrom->reseq_queue, skbn); + } + } while (save_vr != nrom->vr); + /* + * Window is full, ack it immediately. 
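+ * ((vl + window) % NR_MODULUS == vr) means a full window of frames has
+ * arrived since the last acknowledgement, so an NR_INFOACK goes out
+ * straight away; otherwise the acknowledgement is batched behind the
+ * T2 timer.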
+ */ + if (((nrom->vl + nrom->window) % NR_MODULUS) == nrom->vr) { + nr_enquiry_response(sk); + } else { + if (!(nrom->condition & NR_COND_ACK_PENDING)) { + nrom->condition |= NR_COND_ACK_PENDING; + nr_start_t2timer(sk); + } + } + break; + + case NR_RESET: + if (sysctl_netrom_reset_circuit) + nr_disconnect(sk, ECONNRESET); + break; + + default: + break; + } + return queued; +} + +/* Higher level upcall for a LAPB frame - called with sk locked */ +int nr_process_rx_frame(struct sock *sk, struct sk_buff *skb) +{ + struct nr_sock *nr = nr_sk(sk); + int queued = 0, frametype; + + if (nr->state == NR_STATE_0) + return 0; + + frametype = skb->data[19]; + + switch (nr->state) { + case NR_STATE_1: + queued = nr_state1_machine(sk, skb, frametype); + break; + case NR_STATE_2: + queued = nr_state2_machine(sk, skb, frametype); + break; + case NR_STATE_3: + queued = nr_state3_machine(sk, skb, frametype); + break; + } + + nr_kick(sk); + + return queued; +} diff --git a/net/netrom/nr_loopback.c b/net/netrom/nr_loopback.c new file mode 100644 index 00000000..94d4e922 --- /dev/null +++ b/net/netrom/nr_loopback.c @@ -0,0 +1,77 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright Tomi Manninen OH2BNS (oh2bns@sral.fi) + */ +#include +#include +#include +#include +#include +#include +#include +#include + +static void nr_loopback_timer(unsigned long); + +static struct sk_buff_head loopback_queue; +static DEFINE_TIMER(loopback_timer, nr_loopback_timer, 0, 0); + +void __init nr_loopback_init(void) +{ + skb_queue_head_init(&loopback_queue); +} + +static inline int nr_loopback_running(void) +{ + return timer_pending(&loopback_timer); +} + +int nr_loopback_queue(struct sk_buff *skb) +{ + struct sk_buff *skbn; + + if ((skbn = alloc_skb(skb->len, GFP_ATOMIC)) != NULL) { + skb_copy_from_linear_data(skb, skb_put(skbn, skb->len), skb->len); + skb_reset_transport_header(skbn); + + skb_queue_tail(&loopback_queue, skbn); + + if (!nr_loopback_running()) + mod_timer(&loopback_timer, jiffies + 10); + } + + kfree_skb(skb); + return 1; +} + +static void nr_loopback_timer(unsigned long param) +{ + struct sk_buff *skb; + ax25_address *nr_dest; + struct net_device *dev; + + if ((skb = skb_dequeue(&loopback_queue)) != NULL) { + nr_dest = (ax25_address *)(skb->data + 7); + + dev = nr_dev_get(nr_dest); + + if (dev == NULL || nr_rx_frame(skb, dev) == 0) + kfree_skb(skb); + + if (dev != NULL) + dev_put(dev); + + if (!skb_queue_empty(&loopback_queue) && !nr_loopback_running()) + mod_timer(&loopback_timer, jiffies + 10); + } +} + +void __exit nr_loopback_clear(void) +{ + del_timer_sync(&loopback_timer); + skb_queue_purge(&loopback_queue); +} diff --git a/net/netrom/nr_out.c b/net/netrom/nr_out.c new file mode 100644 index 00000000..607fddb4 --- /dev/null +++ b/net/netrom/nr_out.c @@ -0,0 +1,274 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright Darryl Miles G7LED (dlm@g7led.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This is where all NET/ROM frames pass, except for IP-over-NET/ROM which + * cannot be fragmented in this manner. + */ +void nr_output(struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff *skbn; + unsigned char transport[NR_TRANSPORT_LEN]; + int err, frontlen, len; + + if (skb->len - NR_TRANSPORT_LEN > NR_MAX_PACKET_SIZE) { + /* Save a copy of the Transport Header */ + skb_copy_from_linear_data(skb, transport, NR_TRANSPORT_LEN); + skb_pull(skb, NR_TRANSPORT_LEN); + + frontlen = skb_headroom(skb); + + while (skb->len > 0) { + if ((skbn = sock_alloc_send_skb(sk, frontlen + NR_MAX_PACKET_SIZE, 0, &err)) == NULL) + return; + + skb_reserve(skbn, frontlen); + + len = (NR_MAX_PACKET_SIZE > skb->len) ? skb->len : NR_MAX_PACKET_SIZE; + + /* Copy the user data */ + skb_copy_from_linear_data(skb, skb_put(skbn, len), len); + skb_pull(skb, len); + + /* Duplicate the Transport Header */ + skb_push(skbn, NR_TRANSPORT_LEN); + skb_copy_to_linear_data(skbn, transport, + NR_TRANSPORT_LEN); + if (skb->len > 0) + skbn->data[4] |= NR_MORE_FLAG; + + skb_queue_tail(&sk->sk_write_queue, skbn); /* Throw it on the queue */ + } + + kfree_skb(skb); + } else { + skb_queue_tail(&sk->sk_write_queue, skb); /* Throw it on the queue */ + } + + nr_kick(sk); +} + +/* + * This procedure is passed a buffer descriptor for an iframe. It builds + * the rest of the control part of the frame and then writes it out. + */ +static void nr_send_iframe(struct sock *sk, struct sk_buff *skb) +{ + struct nr_sock *nr = nr_sk(sk); + + if (skb == NULL) + return; + + skb->data[2] = nr->vs; + skb->data[3] = nr->vr; + + if (nr->condition & NR_COND_OWN_RX_BUSY) + skb->data[4] |= NR_CHOKE_FLAG; + + nr_start_idletimer(sk); + + nr_transmit_buffer(sk, skb); +} + +void nr_send_nak_frame(struct sock *sk) +{ + struct sk_buff *skb, *skbn; + struct nr_sock *nr = nr_sk(sk); + + if ((skb = skb_peek(&nr->ack_queue)) == NULL) + return; + + if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) + return; + + skbn->data[2] = nr->va; + skbn->data[3] = nr->vr; + + if (nr->condition & NR_COND_OWN_RX_BUSY) + skbn->data[4] |= NR_CHOKE_FLAG; + + nr_transmit_buffer(sk, skbn); + + nr->condition &= ~NR_COND_ACK_PENDING; + nr->vl = nr->vr; + + nr_stop_t1timer(sk); +} + +void nr_kick(struct sock *sk) +{ + struct nr_sock *nr = nr_sk(sk); + struct sk_buff *skb, *skbn; + unsigned short start, end; + + if (nr->state != NR_STATE_3) + return; + + if (nr->condition & NR_COND_PEER_RX_BUSY) + return; + + if (!skb_peek(&sk->sk_write_queue)) + return; + + start = (skb_peek(&nr->ack_queue) == NULL) ? nr->va : nr->vs; + end = (nr->va + nr->window) % NR_MODULUS; + + if (start == end) + return; + + nr->vs = start; + + /* + * Transmit data until either we're out of data to send or + * the window is full. + */ + + /* + * Dequeue the frame and copy it. + */ + skb = skb_dequeue(&sk->sk_write_queue); + + do { + if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { + skb_queue_head(&sk->sk_write_queue, skb); + break; + } + + skb_set_owner_w(skbn, sk); + + /* + * Transmit the frame copy. + */ + nr_send_iframe(sk, skbn); + + nr->vs = (nr->vs + 1) % NR_MODULUS; + + /* + * Requeue the original data frame. 
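+ * The clone built above is what actually goes out; the original is parked
+ * on the ack_queue until nr_frames_acked() advances V(A) past it, or until
+ * nr_requeue_frames() moves it back to the write queue for retransmission
+ * after a T1 timeout.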
+ */ + skb_queue_tail(&nr->ack_queue, skb); + + } while (nr->vs != end && + (skb = skb_dequeue(&sk->sk_write_queue)) != NULL); + + nr->vl = nr->vr; + nr->condition &= ~NR_COND_ACK_PENDING; + + if (!nr_t1timer_running(sk)) + nr_start_t1timer(sk); +} + +void nr_transmit_buffer(struct sock *sk, struct sk_buff *skb) +{ + struct nr_sock *nr = nr_sk(sk); + unsigned char *dptr; + + /* + * Add the protocol byte and network header. + */ + dptr = skb_push(skb, NR_NETWORK_LEN); + + memcpy(dptr, &nr->source_addr, AX25_ADDR_LEN); + dptr[6] &= ~AX25_CBIT; + dptr[6] &= ~AX25_EBIT; + dptr[6] |= AX25_SSSID_SPARE; + dptr += AX25_ADDR_LEN; + + memcpy(dptr, &nr->dest_addr, AX25_ADDR_LEN); + dptr[6] &= ~AX25_CBIT; + dptr[6] |= AX25_EBIT; + dptr[6] |= AX25_SSSID_SPARE; + dptr += AX25_ADDR_LEN; + + *dptr++ = sysctl_netrom_network_ttl_initialiser; + + if (!nr_route_frame(skb, NULL)) { + kfree_skb(skb); + nr_disconnect(sk, ENETUNREACH); + } +} + +/* + * The following routines are taken from page 170 of the 7th ARRL Computer + * Networking Conference paper, as is the whole state machine. + */ + +void nr_establish_data_link(struct sock *sk) +{ + struct nr_sock *nr = nr_sk(sk); + + nr->condition = 0x00; + nr->n2count = 0; + + nr_write_internal(sk, NR_CONNREQ); + + nr_stop_t2timer(sk); + nr_stop_t4timer(sk); + nr_stop_idletimer(sk); + nr_start_t1timer(sk); +} + +/* + * Never send a NAK when we are CHOKEd. + */ +void nr_enquiry_response(struct sock *sk) +{ + struct nr_sock *nr = nr_sk(sk); + int frametype = NR_INFOACK; + + if (nr->condition & NR_COND_OWN_RX_BUSY) { + frametype |= NR_CHOKE_FLAG; + } else { + if (skb_peek(&nr->reseq_queue) != NULL) + frametype |= NR_NAK_FLAG; + } + + nr_write_internal(sk, frametype); + + nr->vl = nr->vr; + nr->condition &= ~NR_COND_ACK_PENDING; +} + +void nr_check_iframes_acked(struct sock *sk, unsigned short nr) +{ + struct nr_sock *nrom = nr_sk(sk); + + if (nrom->vs == nr) { + nr_frames_acked(sk, nr); + nr_stop_t1timer(sk); + nrom->n2count = 0; + } else { + if (nrom->va != nr) { + nr_frames_acked(sk, nr); + nr_start_t1timer(sk); + } + } +} diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c new file mode 100644 index 00000000..44059d0c --- /dev/null +++ b/net/netrom/nr_route.c @@ -0,0 +1,1027 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) + * Copyright Tomi Manninen OH2BNS (oh2bns@sral.fi) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For TIOCINQ/OUTQ */ +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned int nr_neigh_no = 1; + +static HLIST_HEAD(nr_node_list); +static DEFINE_SPINLOCK(nr_node_list_lock); +static HLIST_HEAD(nr_neigh_list); +static DEFINE_SPINLOCK(nr_neigh_list_lock); + +static struct nr_node *nr_node_get(ax25_address *callsign) +{ + struct nr_node *found = NULL; + struct nr_node *nr_node; + struct hlist_node *node; + + spin_lock_bh(&nr_node_list_lock); + nr_node_for_each(nr_node, node, &nr_node_list) + if (ax25cmp(callsign, &nr_node->callsign) == 0) { + nr_node_hold(nr_node); + found = nr_node; + break; + } + spin_unlock_bh(&nr_node_list_lock); + return found; +} + +static struct nr_neigh *nr_neigh_get_dev(ax25_address *callsign, + struct net_device *dev) +{ + struct nr_neigh *found = NULL; + struct nr_neigh *nr_neigh; + struct hlist_node *node; + + spin_lock_bh(&nr_neigh_list_lock); + nr_neigh_for_each(nr_neigh, node, &nr_neigh_list) + if (ax25cmp(callsign, &nr_neigh->callsign) == 0 && + nr_neigh->dev == dev) { + nr_neigh_hold(nr_neigh); + found = nr_neigh; + break; + } + spin_unlock_bh(&nr_neigh_list_lock); + return found; +} + +static void nr_remove_neigh(struct nr_neigh *); + +/* + * Add a new route to a node, and in the process add the node and the + * neighbour if it is new. + */ +static int __must_check nr_add_node(ax25_address *nr, const char *mnemonic, + ax25_address *ax25, ax25_digi *ax25_digi, struct net_device *dev, + int quality, int obs_count) +{ + struct nr_node *nr_node; + struct nr_neigh *nr_neigh; + struct nr_route nr_route; + int i, found; + struct net_device *odev; + + if ((odev=nr_dev_get(nr)) != NULL) { /* Can't add routes to ourself */ + dev_put(odev); + return -EINVAL; + } + + nr_node = nr_node_get(nr); + + nr_neigh = nr_neigh_get_dev(ax25, dev); + + /* + * The L2 link to a neighbour has failed in the past + * and now a frame comes from this neighbour. We assume + * it was a temporary trouble with the link and reset the + * routes now (and not wait for a node broadcast). 
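+ * "Reset" only means pulling nr_node->which back to the first route that
+ * uses this neighbour, making routes skipped after earlier link failures
+ * eligible again; the recorded route qualities are left untouched.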
+ */ + if (nr_neigh != NULL && nr_neigh->failed != 0 && quality == 0) { + struct nr_node *nr_nodet; + struct hlist_node *node; + + spin_lock_bh(&nr_node_list_lock); + nr_node_for_each(nr_nodet, node, &nr_node_list) { + nr_node_lock(nr_nodet); + for (i = 0; i < nr_nodet->count; i++) + if (nr_nodet->routes[i].neighbour == nr_neigh) + if (i < nr_nodet->which) + nr_nodet->which = i; + nr_node_unlock(nr_nodet); + } + spin_unlock_bh(&nr_node_list_lock); + } + + if (nr_neigh != NULL) + nr_neigh->failed = 0; + + if (quality == 0 && nr_neigh != NULL && nr_node != NULL) { + nr_neigh_put(nr_neigh); + nr_node_put(nr_node); + return 0; + } + + if (nr_neigh == NULL) { + if ((nr_neigh = kmalloc(sizeof(*nr_neigh), GFP_ATOMIC)) == NULL) { + if (nr_node) + nr_node_put(nr_node); + return -ENOMEM; + } + + nr_neigh->callsign = *ax25; + nr_neigh->digipeat = NULL; + nr_neigh->ax25 = NULL; + nr_neigh->dev = dev; + nr_neigh->quality = sysctl_netrom_default_path_quality; + nr_neigh->locked = 0; + nr_neigh->count = 0; + nr_neigh->number = nr_neigh_no++; + nr_neigh->failed = 0; + atomic_set(&nr_neigh->refcount, 1); + + if (ax25_digi != NULL && ax25_digi->ndigi > 0) { + nr_neigh->digipeat = kmemdup(ax25_digi, + sizeof(*ax25_digi), + GFP_KERNEL); + if (nr_neigh->digipeat == NULL) { + kfree(nr_neigh); + if (nr_node) + nr_node_put(nr_node); + return -ENOMEM; + } + } + + spin_lock_bh(&nr_neigh_list_lock); + hlist_add_head(&nr_neigh->neigh_node, &nr_neigh_list); + nr_neigh_hold(nr_neigh); + spin_unlock_bh(&nr_neigh_list_lock); + } + + if (quality != 0 && ax25cmp(nr, ax25) == 0 && !nr_neigh->locked) + nr_neigh->quality = quality; + + if (nr_node == NULL) { + if ((nr_node = kmalloc(sizeof(*nr_node), GFP_ATOMIC)) == NULL) { + if (nr_neigh) + nr_neigh_put(nr_neigh); + return -ENOMEM; + } + + nr_node->callsign = *nr; + strcpy(nr_node->mnemonic, mnemonic); + + nr_node->which = 0; + nr_node->count = 1; + atomic_set(&nr_node->refcount, 1); + spin_lock_init(&nr_node->node_lock); + + nr_node->routes[0].quality = quality; + nr_node->routes[0].obs_count = obs_count; + nr_node->routes[0].neighbour = nr_neigh; + + nr_neigh_hold(nr_neigh); + nr_neigh->count++; + + spin_lock_bh(&nr_node_list_lock); + hlist_add_head(&nr_node->node_node, &nr_node_list); + /* refcount initialized at 1 */ + spin_unlock_bh(&nr_node_list_lock); + + return 0; + } + nr_node_lock(nr_node); + + if (quality != 0) + strcpy(nr_node->mnemonic, mnemonic); + + for (found = 0, i = 0; i < nr_node->count; i++) { + if (nr_node->routes[i].neighbour == nr_neigh) { + nr_node->routes[i].quality = quality; + nr_node->routes[i].obs_count = obs_count; + found = 1; + break; + } + } + + if (!found) { + /* We have space at the bottom, slot it in */ + if (nr_node->count < 3) { + nr_node->routes[2] = nr_node->routes[1]; + nr_node->routes[1] = nr_node->routes[0]; + + nr_node->routes[0].quality = quality; + nr_node->routes[0].obs_count = obs_count; + nr_node->routes[0].neighbour = nr_neigh; + + nr_node->which++; + nr_node->count++; + nr_neigh_hold(nr_neigh); + nr_neigh->count++; + } else { + /* It must be better than the worst */ + if (quality > nr_node->routes[2].quality) { + nr_node->routes[2].neighbour->count--; + nr_neigh_put(nr_node->routes[2].neighbour); + + if (nr_node->routes[2].neighbour->count == 0 && !nr_node->routes[2].neighbour->locked) + nr_remove_neigh(nr_node->routes[2].neighbour); + + nr_node->routes[2].quality = quality; + nr_node->routes[2].obs_count = obs_count; + nr_node->routes[2].neighbour = nr_neigh; + + nr_neigh_hold(nr_neigh); + nr_neigh->count++; + } + } + } + + 
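+ /*
+ * Each node keeps at most three routes: a new neighbour either slides the
+ * existing entries down, or, when the table is already full, replaces
+ * routes[2] if it offers a better quality than the current worst entry.
+ */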
/* Now re-sort the routes in quality order */ + switch (nr_node->count) { + case 3: + if (nr_node->routes[1].quality > nr_node->routes[0].quality) { + switch (nr_node->which) { + case 0: nr_node->which = 1; break; + case 1: nr_node->which = 0; break; + default: break; + } + nr_route = nr_node->routes[0]; + nr_node->routes[0] = nr_node->routes[1]; + nr_node->routes[1] = nr_route; + } + if (nr_node->routes[2].quality > nr_node->routes[1].quality) { + switch (nr_node->which) { + case 1: nr_node->which = 2; + break; + + case 2: nr_node->which = 1; + break; + + default: + break; + } + nr_route = nr_node->routes[1]; + nr_node->routes[1] = nr_node->routes[2]; + nr_node->routes[2] = nr_route; + } + case 2: + if (nr_node->routes[1].quality > nr_node->routes[0].quality) { + switch (nr_node->which) { + case 0: nr_node->which = 1; + break; + + case 1: nr_node->which = 0; + break; + + default: break; + } + nr_route = nr_node->routes[0]; + nr_node->routes[0] = nr_node->routes[1]; + nr_node->routes[1] = nr_route; + } + case 1: + break; + } + + for (i = 0; i < nr_node->count; i++) { + if (nr_node->routes[i].neighbour == nr_neigh) { + if (i < nr_node->which) + nr_node->which = i; + break; + } + } + + nr_neigh_put(nr_neigh); + nr_node_unlock(nr_node); + nr_node_put(nr_node); + return 0; +} + +static inline void __nr_remove_node(struct nr_node *nr_node) +{ + hlist_del_init(&nr_node->node_node); + nr_node_put(nr_node); +} + +#define nr_remove_node_locked(__node) \ + __nr_remove_node(__node) + +static void nr_remove_node(struct nr_node *nr_node) +{ + spin_lock_bh(&nr_node_list_lock); + __nr_remove_node(nr_node); + spin_unlock_bh(&nr_node_list_lock); +} + +static inline void __nr_remove_neigh(struct nr_neigh *nr_neigh) +{ + hlist_del_init(&nr_neigh->neigh_node); + nr_neigh_put(nr_neigh); +} + +#define nr_remove_neigh_locked(__neigh) \ + __nr_remove_neigh(__neigh) + +static void nr_remove_neigh(struct nr_neigh *nr_neigh) +{ + spin_lock_bh(&nr_neigh_list_lock); + __nr_remove_neigh(nr_neigh); + spin_unlock_bh(&nr_neigh_list_lock); +} + +/* + * "Delete" a node. Strictly speaking remove a route to a node. The node + * is only deleted if no routes are left to it. + */ +static int nr_del_node(ax25_address *callsign, ax25_address *neighbour, struct net_device *dev) +{ + struct nr_node *nr_node; + struct nr_neigh *nr_neigh; + int i; + + nr_node = nr_node_get(callsign); + + if (nr_node == NULL) + return -EINVAL; + + nr_neigh = nr_neigh_get_dev(neighbour, dev); + + if (nr_neigh == NULL) { + nr_node_put(nr_node); + return -EINVAL; + } + + nr_node_lock(nr_node); + for (i = 0; i < nr_node->count; i++) { + if (nr_node->routes[i].neighbour == nr_neigh) { + nr_neigh->count--; + nr_neigh_put(nr_neigh); + + if (nr_neigh->count == 0 && !nr_neigh->locked) + nr_remove_neigh(nr_neigh); + nr_neigh_put(nr_neigh); + + nr_node->count--; + + if (nr_node->count == 0) { + nr_remove_node(nr_node); + } else { + switch (i) { + case 0: + nr_node->routes[0] = nr_node->routes[1]; + case 1: + nr_node->routes[1] = nr_node->routes[2]; + case 2: + break; + } + nr_node_put(nr_node); + } + nr_node_unlock(nr_node); + + return 0; + } + } + nr_neigh_put(nr_neigh); + nr_node_unlock(nr_node); + nr_node_put(nr_node); + + return -EINVAL; +} + +/* + * Lock a neighbour with a quality. 
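+ * A locked neighbour keeps the quality set here: nr_add_node() skips its
+ * automatic quality update for locked entries, so values learned from node
+ * broadcasts cannot override one configured through the routing ioctl.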
+ */ +static int __must_check nr_add_neigh(ax25_address *callsign, + ax25_digi *ax25_digi, struct net_device *dev, unsigned int quality) +{ + struct nr_neigh *nr_neigh; + + nr_neigh = nr_neigh_get_dev(callsign, dev); + if (nr_neigh) { + nr_neigh->quality = quality; + nr_neigh->locked = 1; + nr_neigh_put(nr_neigh); + return 0; + } + + if ((nr_neigh = kmalloc(sizeof(*nr_neigh), GFP_ATOMIC)) == NULL) + return -ENOMEM; + + nr_neigh->callsign = *callsign; + nr_neigh->digipeat = NULL; + nr_neigh->ax25 = NULL; + nr_neigh->dev = dev; + nr_neigh->quality = quality; + nr_neigh->locked = 1; + nr_neigh->count = 0; + nr_neigh->number = nr_neigh_no++; + nr_neigh->failed = 0; + atomic_set(&nr_neigh->refcount, 1); + + if (ax25_digi != NULL && ax25_digi->ndigi > 0) { + nr_neigh->digipeat = kmemdup(ax25_digi, sizeof(*ax25_digi), + GFP_KERNEL); + if (nr_neigh->digipeat == NULL) { + kfree(nr_neigh); + return -ENOMEM; + } + } + + spin_lock_bh(&nr_neigh_list_lock); + hlist_add_head(&nr_neigh->neigh_node, &nr_neigh_list); + /* refcount is initialized at 1 */ + spin_unlock_bh(&nr_neigh_list_lock); + + return 0; +} + +/* + * "Delete" a neighbour. The neighbour is only removed if the number + * of nodes that may use it is zero. + */ +static int nr_del_neigh(ax25_address *callsign, struct net_device *dev, unsigned int quality) +{ + struct nr_neigh *nr_neigh; + + nr_neigh = nr_neigh_get_dev(callsign, dev); + + if (nr_neigh == NULL) return -EINVAL; + + nr_neigh->quality = quality; + nr_neigh->locked = 0; + + if (nr_neigh->count == 0) + nr_remove_neigh(nr_neigh); + nr_neigh_put(nr_neigh); + + return 0; +} + +/* + * Decrement the obsolescence count by one. If a route is reduced to a + * count of zero, remove it. Also remove any unlocked neighbours with + * zero nodes routing via it. + */ +static int nr_dec_obs(void) +{ + struct nr_neigh *nr_neigh; + struct nr_node *s; + struct hlist_node *node, *nodet; + int i; + + spin_lock_bh(&nr_node_list_lock); + nr_node_for_each_safe(s, node, nodet, &nr_node_list) { + nr_node_lock(s); + for (i = 0; i < s->count; i++) { + switch (s->routes[i].obs_count) { + case 0: /* A locked entry */ + break; + + case 1: /* From 1 -> 0 */ + nr_neigh = s->routes[i].neighbour; + + nr_neigh->count--; + nr_neigh_put(nr_neigh); + + if (nr_neigh->count == 0 && !nr_neigh->locked) + nr_remove_neigh(nr_neigh); + + s->count--; + + switch (i) { + case 0: + s->routes[0] = s->routes[1]; + case 1: + s->routes[1] = s->routes[2]; + case 2: + break; + } + break; + + default: + s->routes[i].obs_count--; + break; + + } + } + + if (s->count <= 0) + nr_remove_node_locked(s); + nr_node_unlock(s); + } + spin_unlock_bh(&nr_node_list_lock); + + return 0; +} + +/* + * A device has been removed. Remove its routes and neighbours. 
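+ * Both lists are walked under their spinlocks; any node that is left with
+ * no remaining routes is removed, and so is every neighbour that was bound
+ * to the vanished device.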
+ */ +void nr_rt_device_down(struct net_device *dev) +{ + struct nr_neigh *s; + struct hlist_node *node, *nodet, *node2, *node2t; + struct nr_node *t; + int i; + + spin_lock_bh(&nr_neigh_list_lock); + nr_neigh_for_each_safe(s, node, nodet, &nr_neigh_list) { + if (s->dev == dev) { + spin_lock_bh(&nr_node_list_lock); + nr_node_for_each_safe(t, node2, node2t, &nr_node_list) { + nr_node_lock(t); + for (i = 0; i < t->count; i++) { + if (t->routes[i].neighbour == s) { + t->count--; + + switch (i) { + case 0: + t->routes[0] = t->routes[1]; + case 1: + t->routes[1] = t->routes[2]; + case 2: + break; + } + } + } + + if (t->count <= 0) + nr_remove_node_locked(t); + nr_node_unlock(t); + } + spin_unlock_bh(&nr_node_list_lock); + + nr_remove_neigh_locked(s); + } + } + spin_unlock_bh(&nr_neigh_list_lock); +} + +/* + * Check that the device given is a valid AX.25 interface that is "up". + * Or a valid ethernet interface with an AX.25 callsign binding. + */ +static struct net_device *nr_ax25_dev_get(char *devname) +{ + struct net_device *dev; + + if ((dev = dev_get_by_name(&init_net, devname)) == NULL) + return NULL; + + if ((dev->flags & IFF_UP) && dev->type == ARPHRD_AX25) + return dev; + + dev_put(dev); + return NULL; +} + +/* + * Find the first active NET/ROM device, usually "nr0". + */ +struct net_device *nr_dev_first(void) +{ + struct net_device *dev, *first = NULL; + + rcu_read_lock(); + for_each_netdev_rcu(&init_net, dev) { + if ((dev->flags & IFF_UP) && dev->type == ARPHRD_NETROM) + if (first == NULL || strncmp(dev->name, first->name, 3) < 0) + first = dev; + } + if (first) + dev_hold(first); + rcu_read_unlock(); + + return first; +} + +/* + * Find the NET/ROM device for the given callsign. + */ +struct net_device *nr_dev_get(ax25_address *addr) +{ + struct net_device *dev; + + rcu_read_lock(); + for_each_netdev_rcu(&init_net, dev) { + if ((dev->flags & IFF_UP) && dev->type == ARPHRD_NETROM && + ax25cmp(addr, (ax25_address *)dev->dev_addr) == 0) { + dev_hold(dev); + goto out; + } + } + dev = NULL; +out: + rcu_read_unlock(); + return dev; +} + +static ax25_digi *nr_call_to_digi(ax25_digi *digi, int ndigis, + ax25_address *digipeaters) +{ + int i; + + if (ndigis == 0) + return NULL; + + for (i = 0; i < ndigis; i++) { + digi->calls[i] = digipeaters[i]; + digi->repeated[i] = 0; + } + + digi->ndigi = ndigis; + digi->lastrepeat = -1; + + return digi; +} + +/* + * Handle the ioctls that control the routing functions. 
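+ * SIOCADDRT and SIOCDELRT take a struct nr_route_struct describing either
+ * a NETROM_NODE or a NETROM_NEIGH entry; SIOCNRDECOBS ages the table by
+ * decrementing each unlocked route's obsolescence counter and dropping
+ * routes that reach zero.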
+ */ +int nr_rt_ioctl(unsigned int cmd, void __user *arg) +{ + struct nr_route_struct nr_route; + struct net_device *dev; + ax25_digi digi; + int ret; + + switch (cmd) { + case SIOCADDRT: + if (copy_from_user(&nr_route, arg, sizeof(struct nr_route_struct))) + return -EFAULT; + if ((dev = nr_ax25_dev_get(nr_route.device)) == NULL) + return -EINVAL; + if (nr_route.ndigis < 0 || nr_route.ndigis > AX25_MAX_DIGIS) { + dev_put(dev); + return -EINVAL; + } + switch (nr_route.type) { + case NETROM_NODE: + ret = nr_add_node(&nr_route.callsign, + nr_route.mnemonic, + &nr_route.neighbour, + nr_call_to_digi(&digi, nr_route.ndigis, + nr_route.digipeaters), + dev, nr_route.quality, + nr_route.obs_count); + break; + case NETROM_NEIGH: + ret = nr_add_neigh(&nr_route.callsign, + nr_call_to_digi(&digi, nr_route.ndigis, + nr_route.digipeaters), + dev, nr_route.quality); + break; + default: + ret = -EINVAL; + } + dev_put(dev); + return ret; + + case SIOCDELRT: + if (copy_from_user(&nr_route, arg, sizeof(struct nr_route_struct))) + return -EFAULT; + if ((dev = nr_ax25_dev_get(nr_route.device)) == NULL) + return -EINVAL; + switch (nr_route.type) { + case NETROM_NODE: + ret = nr_del_node(&nr_route.callsign, + &nr_route.neighbour, dev); + break; + case NETROM_NEIGH: + ret = nr_del_neigh(&nr_route.callsign, + dev, nr_route.quality); + break; + default: + ret = -EINVAL; + } + dev_put(dev); + return ret; + + case SIOCNRDECOBS: + return nr_dec_obs(); + + default: + return -EINVAL; + } + + return 0; +} + +/* + * A level 2 link has timed out, therefore it appears to be a poor link, + * then don't use that neighbour until it is reset. + */ +void nr_link_failed(ax25_cb *ax25, int reason) +{ + struct nr_neigh *s, *nr_neigh = NULL; + struct hlist_node *node; + struct nr_node *nr_node = NULL; + + spin_lock_bh(&nr_neigh_list_lock); + nr_neigh_for_each(s, node, &nr_neigh_list) { + if (s->ax25 == ax25) { + nr_neigh_hold(s); + nr_neigh = s; + break; + } + } + spin_unlock_bh(&nr_neigh_list_lock); + + if (nr_neigh == NULL) + return; + + nr_neigh->ax25 = NULL; + ax25_cb_put(ax25); + + if (++nr_neigh->failed < sysctl_netrom_link_fails_count) { + nr_neigh_put(nr_neigh); + return; + } + spin_lock_bh(&nr_node_list_lock); + nr_node_for_each(nr_node, node, &nr_node_list) { + nr_node_lock(nr_node); + if (nr_node->which < nr_node->count && + nr_node->routes[nr_node->which].neighbour == nr_neigh) + nr_node->which++; + nr_node_unlock(nr_node); + } + spin_unlock_bh(&nr_node_list_lock); + nr_neigh_put(nr_neigh); +} + +/* + * Route a frame to an appropriate AX.25 connection. A NULL ax25_cb + * indicates an internally generated frame. 
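+ * Locally addressed frames are delivered here (or looped back when we
+ * generated them ourselves); anything else has its TTL byte at offset 14
+ * checked and decremented and is handed to ax25_send_frame() towards the
+ * neighbour of the destination node's currently selected route.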
+ */ +int nr_route_frame(struct sk_buff *skb, ax25_cb *ax25) +{ + ax25_address *nr_src, *nr_dest; + struct nr_neigh *nr_neigh; + struct nr_node *nr_node; + struct net_device *dev; + unsigned char *dptr; + ax25_cb *ax25s; + int ret; + struct sk_buff *skbn; + + + nr_src = (ax25_address *)(skb->data + 0); + nr_dest = (ax25_address *)(skb->data + 7); + + if (ax25 != NULL) { + ret = nr_add_node(nr_src, "", &ax25->dest_addr, ax25->digipeat, + ax25->ax25_dev->dev, 0, + sysctl_netrom_obsolescence_count_initialiser); + if (ret) + return ret; + } + + if ((dev = nr_dev_get(nr_dest)) != NULL) { /* Its for me */ + if (ax25 == NULL) /* Its from me */ + ret = nr_loopback_queue(skb); + else + ret = nr_rx_frame(skb, dev); + dev_put(dev); + return ret; + } + + if (!sysctl_netrom_routing_control && ax25 != NULL) + return 0; + + /* Its Time-To-Live has expired */ + if (skb->data[14] == 1) { + return 0; + } + + nr_node = nr_node_get(nr_dest); + if (nr_node == NULL) + return 0; + nr_node_lock(nr_node); + + if (nr_node->which >= nr_node->count) { + nr_node_unlock(nr_node); + nr_node_put(nr_node); + return 0; + } + + nr_neigh = nr_node->routes[nr_node->which].neighbour; + + if ((dev = nr_dev_first()) == NULL) { + nr_node_unlock(nr_node); + nr_node_put(nr_node); + return 0; + } + + /* We are going to change the netrom headers so we should get our + own skb, we also did not know until now how much header space + we had to reserve... - RXQ */ + if ((skbn=skb_copy_expand(skb, dev->hard_header_len, 0, GFP_ATOMIC)) == NULL) { + nr_node_unlock(nr_node); + nr_node_put(nr_node); + dev_put(dev); + return 0; + } + kfree_skb(skb); + skb=skbn; + skb->data[14]--; + + dptr = skb_push(skb, 1); + *dptr = AX25_P_NETROM; + + ax25s = nr_neigh->ax25; + nr_neigh->ax25 = ax25_send_frame(skb, 256, + (ax25_address *)dev->dev_addr, + &nr_neigh->callsign, + nr_neigh->digipeat, nr_neigh->dev); + if (ax25s) + ax25_cb_put(ax25s); + + dev_put(dev); + ret = (nr_neigh->ax25 != NULL); + nr_node_unlock(nr_node); + nr_node_put(nr_node); + + return ret; +} + +#ifdef CONFIG_PROC_FS + +static void *nr_node_start(struct seq_file *seq, loff_t *pos) +{ + spin_lock_bh(&nr_node_list_lock); + return seq_hlist_start_head(&nr_node_list, *pos); +} + +static void *nr_node_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return seq_hlist_next(v, &nr_node_list, pos); +} + +static void nr_node_stop(struct seq_file *seq, void *v) +{ + spin_unlock_bh(&nr_node_list_lock); +} + +static int nr_node_show(struct seq_file *seq, void *v) +{ + char buf[11]; + int i; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "callsign mnemonic w n qual obs neigh qual obs neigh qual obs neigh\n"); + else { + struct nr_node *nr_node = hlist_entry(v, struct nr_node, + node_node); + + nr_node_lock(nr_node); + seq_printf(seq, "%-9s %-7s %d %d", + ax2asc(buf, &nr_node->callsign), + (nr_node->mnemonic[0] == '\0') ? 
"*" : nr_node->mnemonic, + nr_node->which + 1, + nr_node->count); + + for (i = 0; i < nr_node->count; i++) { + seq_printf(seq, " %3d %d %05d", + nr_node->routes[i].quality, + nr_node->routes[i].obs_count, + nr_node->routes[i].neighbour->number); + } + nr_node_unlock(nr_node); + + seq_puts(seq, "\n"); + } + return 0; +} + +static const struct seq_operations nr_node_seqops = { + .start = nr_node_start, + .next = nr_node_next, + .stop = nr_node_stop, + .show = nr_node_show, +}; + +static int nr_node_info_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &nr_node_seqops); +} + +const struct file_operations nr_nodes_fops = { + .owner = THIS_MODULE, + .open = nr_node_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void *nr_neigh_start(struct seq_file *seq, loff_t *pos) +{ + spin_lock_bh(&nr_neigh_list_lock); + return seq_hlist_start_head(&nr_neigh_list, *pos); +} + +static void *nr_neigh_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return seq_hlist_next(v, &nr_neigh_list, pos); +} + +static void nr_neigh_stop(struct seq_file *seq, void *v) +{ + spin_unlock_bh(&nr_neigh_list_lock); +} + +static int nr_neigh_show(struct seq_file *seq, void *v) +{ + char buf[11]; + int i; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, "addr callsign dev qual lock count failed digipeaters\n"); + else { + struct nr_neigh *nr_neigh; + + nr_neigh = hlist_entry(v, struct nr_neigh, neigh_node); + seq_printf(seq, "%05d %-9s %-4s %3d %d %3d %3d", + nr_neigh->number, + ax2asc(buf, &nr_neigh->callsign), + nr_neigh->dev ? nr_neigh->dev->name : "???", + nr_neigh->quality, + nr_neigh->locked, + nr_neigh->count, + nr_neigh->failed); + + if (nr_neigh->digipeat != NULL) { + for (i = 0; i < nr_neigh->digipeat->ndigi; i++) + seq_printf(seq, " %s", + ax2asc(buf, &nr_neigh->digipeat->calls[i])); + } + + seq_puts(seq, "\n"); + } + return 0; +} + +static const struct seq_operations nr_neigh_seqops = { + .start = nr_neigh_start, + .next = nr_neigh_next, + .stop = nr_neigh_stop, + .show = nr_neigh_show, +}; + +static int nr_neigh_info_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &nr_neigh_seqops); +} + +const struct file_operations nr_neigh_fops = { + .owner = THIS_MODULE, + .open = nr_neigh_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif + +/* + * Free all memory associated with the nodes and routes lists. + */ +void __exit nr_rt_free(void) +{ + struct nr_neigh *s = NULL; + struct nr_node *t = NULL; + struct hlist_node *node, *nodet; + + spin_lock_bh(&nr_neigh_list_lock); + spin_lock_bh(&nr_node_list_lock); + nr_node_for_each_safe(t, node, nodet, &nr_node_list) { + nr_node_lock(t); + nr_remove_node_locked(t); + nr_node_unlock(t); + } + nr_neigh_for_each_safe(s, node, nodet, &nr_neigh_list) { + while(s->count) { + s->count--; + nr_neigh_put(s); + } + nr_remove_neigh_locked(s); + } + spin_unlock_bh(&nr_node_list_lock); + spin_unlock_bh(&nr_neigh_list_lock); +} diff --git a/net/netrom/nr_subr.c b/net/netrom/nr_subr.c new file mode 100644 index 00000000..6a947ae5 --- /dev/null +++ b/net/netrom/nr_subr.c @@ -0,0 +1,282 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This routine purges all of the queues of frames. + */ +void nr_clear_queues(struct sock *sk) +{ + struct nr_sock *nr = nr_sk(sk); + + skb_queue_purge(&sk->sk_write_queue); + skb_queue_purge(&nr->ack_queue); + skb_queue_purge(&nr->reseq_queue); + skb_queue_purge(&nr->frag_queue); +} + +/* + * This routine purges the input queue of those frames that have been + * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the + * SDL diagram. + */ +void nr_frames_acked(struct sock *sk, unsigned short nr) +{ + struct nr_sock *nrom = nr_sk(sk); + struct sk_buff *skb; + + /* + * Remove all the ack-ed frames from the ack queue. + */ + if (nrom->va != nr) { + while (skb_peek(&nrom->ack_queue) != NULL && nrom->va != nr) { + skb = skb_dequeue(&nrom->ack_queue); + kfree_skb(skb); + nrom->va = (nrom->va + 1) % NR_MODULUS; + } + } +} + +/* + * Requeue all the un-ack-ed frames on the output queue to be picked + * up by nr_kick called from the timer. This arrangement handles the + * possibility of an empty output queue. + */ +void nr_requeue_frames(struct sock *sk) +{ + struct sk_buff *skb, *skb_prev = NULL; + + while ((skb = skb_dequeue(&nr_sk(sk)->ack_queue)) != NULL) { + if (skb_prev == NULL) + skb_queue_head(&sk->sk_write_queue, skb); + else + skb_append(skb_prev, skb, &sk->sk_write_queue); + skb_prev = skb; + } +} + +/* + * Validate that the value of nr is between va and vs. Return true or + * false for testing. + */ +int nr_validate_nr(struct sock *sk, unsigned short nr) +{ + struct nr_sock *nrom = nr_sk(sk); + unsigned short vc = nrom->va; + + while (vc != nrom->vs) { + if (nr == vc) return 1; + vc = (vc + 1) % NR_MODULUS; + } + + return nr == nrom->vs; +} + +/* + * Check that ns is within the receive window. + */ +int nr_in_rx_window(struct sock *sk, unsigned short ns) +{ + struct nr_sock *nr = nr_sk(sk); + unsigned short vc = nr->vr; + unsigned short vt = (nr->vl + nr->window) % NR_MODULUS; + + while (vc != vt) { + if (ns == vc) return 1; + vc = (vc + 1) % NR_MODULUS; + } + + return 0; +} + +/* + * This routine is called when the HDLC layer internally generates a + * control frame. + */ +void nr_write_internal(struct sock *sk, int frametype) +{ + struct nr_sock *nr = nr_sk(sk); + struct sk_buff *skb; + unsigned char *dptr; + int len, timeout; + + len = NR_NETWORK_LEN + NR_TRANSPORT_LEN; + + switch (frametype & 0x0F) { + case NR_CONNREQ: + len += 17; + break; + case NR_CONNACK: + len += (nr->bpqext) ? 
2 : 1; + break; + case NR_DISCREQ: + case NR_DISCACK: + case NR_INFOACK: + break; + default: + printk(KERN_ERR "NET/ROM: nr_write_internal - invalid frame type %d\n", frametype); + return; + } + + if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) + return; + + /* + * Space for AX.25 and NET/ROM network header + */ + skb_reserve(skb, NR_NETWORK_LEN); + + dptr = skb_put(skb, skb_tailroom(skb)); + + switch (frametype & 0x0F) { + case NR_CONNREQ: + timeout = nr->t1 / HZ; + *dptr++ = nr->my_index; + *dptr++ = nr->my_id; + *dptr++ = 0; + *dptr++ = 0; + *dptr++ = frametype; + *dptr++ = nr->window; + memcpy(dptr, &nr->user_addr, AX25_ADDR_LEN); + dptr[6] &= ~AX25_CBIT; + dptr[6] &= ~AX25_EBIT; + dptr[6] |= AX25_SSSID_SPARE; + dptr += AX25_ADDR_LEN; + memcpy(dptr, &nr->source_addr, AX25_ADDR_LEN); + dptr[6] &= ~AX25_CBIT; + dptr[6] &= ~AX25_EBIT; + dptr[6] |= AX25_SSSID_SPARE; + dptr += AX25_ADDR_LEN; + *dptr++ = timeout % 256; + *dptr++ = timeout / 256; + break; + + case NR_CONNACK: + *dptr++ = nr->your_index; + *dptr++ = nr->your_id; + *dptr++ = nr->my_index; + *dptr++ = nr->my_id; + *dptr++ = frametype; + *dptr++ = nr->window; + if (nr->bpqext) *dptr++ = sysctl_netrom_network_ttl_initialiser; + break; + + case NR_DISCREQ: + case NR_DISCACK: + *dptr++ = nr->your_index; + *dptr++ = nr->your_id; + *dptr++ = 0; + *dptr++ = 0; + *dptr++ = frametype; + break; + + case NR_INFOACK: + *dptr++ = nr->your_index; + *dptr++ = nr->your_id; + *dptr++ = 0; + *dptr++ = nr->vr; + *dptr++ = frametype; + break; + } + + nr_transmit_buffer(sk, skb); +} + +/* + * This routine is called to send an error reply. + */ +void __nr_transmit_reply(struct sk_buff *skb, int mine, unsigned char cmdflags) +{ + struct sk_buff *skbn; + unsigned char *dptr; + int len; + + len = NR_NETWORK_LEN + NR_TRANSPORT_LEN + 1; + + if ((skbn = alloc_skb(len, GFP_ATOMIC)) == NULL) + return; + + skb_reserve(skbn, 0); + + dptr = skb_put(skbn, NR_NETWORK_LEN + NR_TRANSPORT_LEN); + + skb_copy_from_linear_data_offset(skb, 7, dptr, AX25_ADDR_LEN); + dptr[6] &= ~AX25_CBIT; + dptr[6] &= ~AX25_EBIT; + dptr[6] |= AX25_SSSID_SPARE; + dptr += AX25_ADDR_LEN; + + skb_copy_from_linear_data(skb, dptr, AX25_ADDR_LEN); + dptr[6] &= ~AX25_CBIT; + dptr[6] |= AX25_EBIT; + dptr[6] |= AX25_SSSID_SPARE; + dptr += AX25_ADDR_LEN; + + *dptr++ = sysctl_netrom_network_ttl_initialiser; + + if (mine) { + *dptr++ = 0; + *dptr++ = 0; + *dptr++ = skb->data[15]; + *dptr++ = skb->data[16]; + } else { + *dptr++ = skb->data[15]; + *dptr++ = skb->data[16]; + *dptr++ = 0; + *dptr++ = 0; + } + + *dptr++ = cmdflags; + *dptr++ = 0; + + if (!nr_route_frame(skbn, NULL)) + kfree_skb(skbn); +} + +void nr_disconnect(struct sock *sk, int reason) +{ + nr_stop_t1timer(sk); + nr_stop_t2timer(sk); + nr_stop_t4timer(sk); + nr_stop_idletimer(sk); + + nr_clear_queues(sk); + + nr_sk(sk)->state = NR_STATE_0; + + sk->sk_state = TCP_CLOSE; + sk->sk_err = reason; + sk->sk_shutdown |= SEND_SHUTDOWN; + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sock_set_flag(sk, SOCK_DEAD); + } +} diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c new file mode 100644 index 00000000..1cb98e88 --- /dev/null +++ b/net/netrom/nr_timer.c @@ -0,0 +1,249 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) 2002 Ralf Baechle DO1GRB (ralf@gnu.org) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void nr_heartbeat_expiry(unsigned long); +static void nr_t1timer_expiry(unsigned long); +static void nr_t2timer_expiry(unsigned long); +static void nr_t4timer_expiry(unsigned long); +static void nr_idletimer_expiry(unsigned long); + +void nr_init_timers(struct sock *sk) +{ + struct nr_sock *nr = nr_sk(sk); + + setup_timer(&nr->t1timer, nr_t1timer_expiry, (unsigned long)sk); + setup_timer(&nr->t2timer, nr_t2timer_expiry, (unsigned long)sk); + setup_timer(&nr->t4timer, nr_t4timer_expiry, (unsigned long)sk); + setup_timer(&nr->idletimer, nr_idletimer_expiry, (unsigned long)sk); + + /* initialized by sock_init_data */ + sk->sk_timer.data = (unsigned long)sk; + sk->sk_timer.function = &nr_heartbeat_expiry; +} + +void nr_start_t1timer(struct sock *sk) +{ + struct nr_sock *nr = nr_sk(sk); + + mod_timer(&nr->t1timer, jiffies + nr->t1); +} + +void nr_start_t2timer(struct sock *sk) +{ + struct nr_sock *nr = nr_sk(sk); + + mod_timer(&nr->t2timer, jiffies + nr->t2); +} + +void nr_start_t4timer(struct sock *sk) +{ + struct nr_sock *nr = nr_sk(sk); + + mod_timer(&nr->t4timer, jiffies + nr->t4); +} + +void nr_start_idletimer(struct sock *sk) +{ + struct nr_sock *nr = nr_sk(sk); + + if (nr->idle > 0) + mod_timer(&nr->idletimer, jiffies + nr->idle); +} + +void nr_start_heartbeat(struct sock *sk) +{ + mod_timer(&sk->sk_timer, jiffies + 5 * HZ); +} + +void nr_stop_t1timer(struct sock *sk) +{ + del_timer(&nr_sk(sk)->t1timer); +} + +void nr_stop_t2timer(struct sock *sk) +{ + del_timer(&nr_sk(sk)->t2timer); +} + +void nr_stop_t4timer(struct sock *sk) +{ + del_timer(&nr_sk(sk)->t4timer); +} + +void nr_stop_idletimer(struct sock *sk) +{ + del_timer(&nr_sk(sk)->idletimer); +} + +void nr_stop_heartbeat(struct sock *sk) +{ + del_timer(&sk->sk_timer); +} + +int nr_t1timer_running(struct sock *sk) +{ + return timer_pending(&nr_sk(sk)->t1timer); +} + +static void nr_heartbeat_expiry(unsigned long param) +{ + struct sock *sk = (struct sock *)param; + struct nr_sock *nr = nr_sk(sk); + + bh_lock_sock(sk); + switch (nr->state) { + case NR_STATE_0: + /* Magic here: If we listen() and a new link dies before it + is accepted() it isn't 'dead' so doesn't get removed. */ + if (sock_flag(sk, SOCK_DESTROY) || + (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) { + sock_hold(sk); + bh_unlock_sock(sk); + nr_destroy_socket(sk); + sock_put(sk); + return; + } + break; + + case NR_STATE_3: + /* + * Check for the state of the receive buffer. 
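+ * Once the receive queue has drained below half of sk_rcvbuf the own-busy
+ * condition is cleared and an NR_INFOACK (without the choke flag) tells
+ * the peer it may start sending again.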
+ */ + if (atomic_read(&sk->sk_rmem_alloc) < (sk->sk_rcvbuf / 2) && + (nr->condition & NR_COND_OWN_RX_BUSY)) { + nr->condition &= ~NR_COND_OWN_RX_BUSY; + nr->condition &= ~NR_COND_ACK_PENDING; + nr->vl = nr->vr; + nr_write_internal(sk, NR_INFOACK); + break; + } + break; + } + + nr_start_heartbeat(sk); + bh_unlock_sock(sk); +} + +static void nr_t2timer_expiry(unsigned long param) +{ + struct sock *sk = (struct sock *)param; + struct nr_sock *nr = nr_sk(sk); + + bh_lock_sock(sk); + if (nr->condition & NR_COND_ACK_PENDING) { + nr->condition &= ~NR_COND_ACK_PENDING; + nr_enquiry_response(sk); + } + bh_unlock_sock(sk); +} + +static void nr_t4timer_expiry(unsigned long param) +{ + struct sock *sk = (struct sock *)param; + + bh_lock_sock(sk); + nr_sk(sk)->condition &= ~NR_COND_PEER_RX_BUSY; + bh_unlock_sock(sk); +} + +static void nr_idletimer_expiry(unsigned long param) +{ + struct sock *sk = (struct sock *)param; + struct nr_sock *nr = nr_sk(sk); + + bh_lock_sock(sk); + + nr_clear_queues(sk); + + nr->n2count = 0; + nr_write_internal(sk, NR_DISCREQ); + nr->state = NR_STATE_2; + + nr_start_t1timer(sk); + nr_stop_t2timer(sk); + nr_stop_t4timer(sk); + + sk->sk_state = TCP_CLOSE; + sk->sk_err = 0; + sk->sk_shutdown |= SEND_SHUTDOWN; + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sock_set_flag(sk, SOCK_DEAD); + } + bh_unlock_sock(sk); +} + +static void nr_t1timer_expiry(unsigned long param) +{ + struct sock *sk = (struct sock *)param; + struct nr_sock *nr = nr_sk(sk); + + bh_lock_sock(sk); + switch (nr->state) { + case NR_STATE_1: + if (nr->n2count == nr->n2) { + nr_disconnect(sk, ETIMEDOUT); + bh_unlock_sock(sk); + return; + } else { + nr->n2count++; + nr_write_internal(sk, NR_CONNREQ); + } + break; + + case NR_STATE_2: + if (nr->n2count == nr->n2) { + nr_disconnect(sk, ETIMEDOUT); + bh_unlock_sock(sk); + return; + } else { + nr->n2count++; + nr_write_internal(sk, NR_DISCREQ); + } + break; + + case NR_STATE_3: + if (nr->n2count == nr->n2) { + nr_disconnect(sk, ETIMEDOUT); + bh_unlock_sock(sk); + return; + } else { + nr->n2count++; + nr_requeue_frames(sk); + } + break; + } + + nr_start_t1timer(sk); + bh_unlock_sock(sk); +} diff --git a/net/netrom/sysctl_net_netrom.c b/net/netrom/sysctl_net_netrom.c new file mode 100644 index 00000000..1e0fa9e5 --- /dev/null +++ b/net/netrom/sysctl_net_netrom.c @@ -0,0 +1,163 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) 1996 Mike Shaver (shaver@zeroknowledge.com) + */ +#include +#include +#include +#include +#include + +/* + * Values taken from NET/ROM documentation. 
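+ * The limits below clamp what can be written through the corresponding
+ * /proc/sys/net/netrom/ entries; note that the timer values are kept (and
+ * therefore exposed) in jiffies, since proc_dointvec_minmax does no HZ
+ * conversion.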
+ */ +static int min_quality[] = {0}, max_quality[] = {255}; +static int min_obs[] = {0}, max_obs[] = {255}; +static int min_ttl[] = {0}, max_ttl[] = {255}; +static int min_t1[] = {5 * HZ}; +static int max_t1[] = {600 * HZ}; +static int min_n2[] = {2}, max_n2[] = {127}; +static int min_t2[] = {1 * HZ}; +static int max_t2[] = {60 * HZ}; +static int min_t4[] = {1 * HZ}; +static int max_t4[] = {1000 * HZ}; +static int min_window[] = {1}, max_window[] = {127}; +static int min_idle[] = {0 * HZ}; +static int max_idle[] = {65535 * HZ}; +static int min_route[] = {0}, max_route[] = {1}; +static int min_fails[] = {1}, max_fails[] = {10}; +static int min_reset[] = {0}, max_reset[] = {1}; + +static struct ctl_table_header *nr_table_header; + +static ctl_table nr_table[] = { + { + .procname = "default_path_quality", + .data = &sysctl_netrom_default_path_quality, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_quality, + .extra2 = &max_quality + }, + { + .procname = "obsolescence_count_initialiser", + .data = &sysctl_netrom_obsolescence_count_initialiser, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_obs, + .extra2 = &max_obs + }, + { + .procname = "network_ttl_initialiser", + .data = &sysctl_netrom_network_ttl_initialiser, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_ttl, + .extra2 = &max_ttl + }, + { + .procname = "transport_timeout", + .data = &sysctl_netrom_transport_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_t1, + .extra2 = &max_t1 + }, + { + .procname = "transport_maximum_tries", + .data = &sysctl_netrom_transport_maximum_tries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_n2, + .extra2 = &max_n2 + }, + { + .procname = "transport_acknowledge_delay", + .data = &sysctl_netrom_transport_acknowledge_delay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_t2, + .extra2 = &max_t2 + }, + { + .procname = "transport_busy_delay", + .data = &sysctl_netrom_transport_busy_delay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_t4, + .extra2 = &max_t4 + }, + { + .procname = "transport_requested_window_size", + .data = &sysctl_netrom_transport_requested_window_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_window, + .extra2 = &max_window + }, + { + .procname = "transport_no_activity_timeout", + .data = &sysctl_netrom_transport_no_activity_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_idle, + .extra2 = &max_idle + }, + { + .procname = "routing_control", + .data = &sysctl_netrom_routing_control, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_route, + .extra2 = &max_route + }, + { + .procname = "link_fails_count", + .data = &sysctl_netrom_link_fails_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_fails, + .extra2 = &max_fails + }, + { + .procname = "reset", + .data = &sysctl_netrom_reset_circuit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_reset, + .extra2 = &max_reset + }, + { } +}; + +static struct ctl_path nr_path[] = { + { .procname = "net", }, + { .procname = "netrom", }, 
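+ /* the empty entry below terminates the path for register_sysctl_paths() */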
+ { } +}; + +void __init nr_register_sysctl(void) +{ + nr_table_header = register_sysctl_paths(nr_path, nr_table); +} + +void nr_unregister_sysctl(void) +{ + unregister_sysctl_table(nr_table_header); +} diff --git a/net/nonet.c b/net/nonet.c new file mode 100644 index 00000000..b1a73fda --- /dev/null +++ b/net/nonet.c @@ -0,0 +1,26 @@ +/* + * net/nonet.c + * + * Dummy functions to allow us to configure network support entirely + * out of the kernel. + * + * Distributed under the terms of the GNU GPL version 2. + * Copyright (c) Matthew Wilcox 2003 + */ + +#include +#include +#include +#include +#include + +static int sock_no_open(struct inode *irrelevant, struct file *dontcare) +{ + return -ENXIO; +} + +const struct file_operations bad_sock_fops = { + .owner = THIS_MODULE, + .open = sock_no_open, + .llseek = noop_llseek, +}; diff --git a/net/packet/Kconfig b/net/packet/Kconfig new file mode 100644 index 00000000..0060e3b3 --- /dev/null +++ b/net/packet/Kconfig @@ -0,0 +1,16 @@ +# +# Packet configuration +# + +config PACKET + tristate "Packet socket" + ---help--- + The Packet protocol is used by applications which communicate + directly with network devices without an intermediate network + protocol implemented in the kernel, e.g. tcpdump. If you want them + to work, choose Y. + + To compile this driver as a module, choose M here: the module will + be called af_packet. + + If unsure, say Y. diff --git a/net/packet/Makefile b/net/packet/Makefile new file mode 100644 index 00000000..81183eab --- /dev/null +++ b/net/packet/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the packet AF. +# + +obj-$(CONFIG_PACKET) += af_packet.o diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c new file mode 100644 index 00000000..fafb9683 --- /dev/null +++ b/net/packet/af_packet.c @@ -0,0 +1,2808 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * PACKET - implements raw packet sockets. + * + * Authors: Ross Biro + * Fred N. van Kempen, + * Alan Cox, + * + * Fixes: + * Alan Cox : verify_area() now used correctly + * Alan Cox : new skbuff lists, look ma no backlogs! + * Alan Cox : tidied skbuff lists. + * Alan Cox : Now uses generic datagram routines I + * added. Also fixed the peek/read crash + * from all old Linux datagram code. + * Alan Cox : Uses the improved datagram code. + * Alan Cox : Added NULL's for socket options. + * Alan Cox : Re-commented the code. + * Alan Cox : Use new kernel side addressing + * Rob Janssen : Correct MTU usage. + * Dave Platt : Counter leaks caused by incorrect + * interrupt locking and some slightly + * dubious gcc output. Can you read + * compiler: it said _VOLATILE_ + * Richard Kooijman : Timestamp fixes. + * Alan Cox : New buffers. Use sk->mac.raw. + * Alan Cox : sendmsg/recvmsg support. + * Alan Cox : Protocol setting support + * Alexey Kuznetsov : Untied from IPv4 stack. + * Cyrus Durgin : Fixed kerneld for kmod. + * Michal Ostrowski : Module initialization cleanup. + * Ulises Alonso : Frame number limit removal and + * packet_set_ring memory leak. + * Eric Biederman : Allow for > 8 byte hardware addresses. + * The convention is that longer addresses + * will simply extend the hardware address + * byte arrays at the end of sockaddr_ll + * and packet_mreq. + * Johann Baudy : Added TX RING. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_INET +#include +#endif + +/* + Assumptions: + - if device has no dev->hard_header routine, it adds and removes ll header + inside itself. In this case ll header is invisible outside of device, + but higher levels still should reserve dev->hard_header_len. + Some devices are enough clever to reallocate skb, when header + will not fit to reserved space (tunnel), another ones are silly + (PPP). + - packet socket receives packets with pulled ll header, + so that SOCK_RAW should push it back. + +On receive: +----------- + +Incoming, dev->hard_header!=NULL + mac_header -> ll header + data -> data + +Outgoing, dev->hard_header!=NULL + mac_header -> ll header + data -> ll header + +Incoming, dev->hard_header==NULL + mac_header -> UNKNOWN position. It is very likely, that it points to ll + header. PPP makes it, that is wrong, because introduce + assymetry between rx and tx paths. + data -> data + +Outgoing, dev->hard_header==NULL + mac_header -> data. ll header is still not built! + data -> data + +Resume + If dev->hard_header==NULL we are unlikely to restore sensible ll header. + + +On transmit: +------------ + +dev->hard_header != NULL + mac_header -> ll header + data -> ll header + +dev->hard_header == NULL (ll header is added by device, we cannot control it) + mac_header -> data + data -> data + + We should set nh.raw on output to correct posistion, + packet classifier depends on it. + */ + +/* Private packet socket structures. */ + +struct packet_mclist { + struct packet_mclist *next; + int ifindex; + int count; + unsigned short type; + unsigned short alen; + unsigned char addr[MAX_ADDR_LEN]; +}; +/* identical to struct packet_mreq except it has + * a longer address field. 
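/*
 * Hedged aside (not part of the patch): the CONFIG_PACKET help text above
 * mentions tcpdump-style applications; this is a minimal capture loop over
 * an AF_PACKET socket feeding the packet_rcv() path below. It needs
 * CAP_NET_RAW (packet_create() checks it); the buffer size is an arbitrary
 * illustration value.
 */
#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>

int main(void)
{
	unsigned char buf[2048];
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0) {
		perror("socket(AF_PACKET)");
		return 1;
	}
	for (;;) {
		ssize_t n = recv(fd, buf, sizeof(buf), 0);

		if (n < 0)
			break;
		/* SOCK_RAW delivers the link-layer header too, so on an
		 * Ethernet device buf[0..5] is the destination MAC. */
		printf("got %zd bytes, dst %02x:%02x:%02x:%02x:%02x:%02x\n",
		       n, buf[0], buf[1], buf[2], buf[3], buf[4], buf[5]);
	}
	close(fd);
	return 0;
}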
+ */ +struct packet_mreq_max { + int mr_ifindex; + unsigned short mr_type; + unsigned short mr_alen; + unsigned char mr_address[MAX_ADDR_LEN]; +}; + +static int packet_set_ring(struct sock *sk, struct tpacket_req *req, + int closing, int tx_ring); + +struct pgv { + char *buffer; +}; + +struct packet_ring_buffer { + struct pgv *pg_vec; + unsigned int head; + unsigned int frames_per_block; + unsigned int frame_size; + unsigned int frame_max; + + unsigned int pg_vec_order; + unsigned int pg_vec_pages; + unsigned int pg_vec_len; + + atomic_t pending; +}; + +struct packet_sock; +static int tpacket_snd(struct packet_sock *po, struct msghdr *msg); + +static void packet_flush_mclist(struct sock *sk); + +struct packet_sock { + /* struct sock has to be the first member of packet_sock */ + struct sock sk; + struct tpacket_stats stats; + struct packet_ring_buffer rx_ring; + struct packet_ring_buffer tx_ring; + int copy_thresh; + spinlock_t bind_lock; + struct mutex pg_vec_lock; + unsigned int running:1, /* prot_hook is attached*/ + auxdata:1, + origdev:1, + has_vnet_hdr:1; + int ifindex; /* bound device */ + __be16 num; + struct packet_mclist *mclist; + atomic_t mapped; + enum tpacket_versions tp_version; + unsigned int tp_hdrlen; + unsigned int tp_reserve; + unsigned int tp_loss:1; + unsigned int tp_tstamp; + struct packet_type prot_hook ____cacheline_aligned_in_smp; +}; + +struct packet_skb_cb { + unsigned int origlen; + union { + struct sockaddr_pkt pkt; + struct sockaddr_ll ll; + } sa; +}; + +#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) + +static inline __pure struct page *pgv_to_page(void *addr) +{ + if (is_vmalloc_addr(addr)) + return vmalloc_to_page(addr); + return virt_to_page(addr); +} + +static void __packet_set_status(struct packet_sock *po, void *frame, int status) +{ + union { + struct tpacket_hdr *h1; + struct tpacket2_hdr *h2; + void *raw; + } h; + + h.raw = frame; + switch (po->tp_version) { + case TPACKET_V1: + h.h1->tp_status = status; + flush_dcache_page(pgv_to_page(&h.h1->tp_status)); + break; + case TPACKET_V2: + h.h2->tp_status = status; + flush_dcache_page(pgv_to_page(&h.h2->tp_status)); + break; + default: + pr_err("TPACKET version not supported\n"); + BUG(); + } + + smp_wmb(); +} + +static int __packet_get_status(struct packet_sock *po, void *frame) +{ + union { + struct tpacket_hdr *h1; + struct tpacket2_hdr *h2; + void *raw; + } h; + + smp_rmb(); + + h.raw = frame; + switch (po->tp_version) { + case TPACKET_V1: + flush_dcache_page(pgv_to_page(&h.h1->tp_status)); + return h.h1->tp_status; + case TPACKET_V2: + flush_dcache_page(pgv_to_page(&h.h2->tp_status)); + return h.h2->tp_status; + default: + pr_err("TPACKET version not supported\n"); + BUG(); + return 0; + } +} + +static void *packet_lookup_frame(struct packet_sock *po, + struct packet_ring_buffer *rb, + unsigned int position, + int status) +{ + unsigned int pg_vec_pos, frame_offset; + union { + struct tpacket_hdr *h1; + struct tpacket2_hdr *h2; + void *raw; + } h; + + pg_vec_pos = position / rb->frames_per_block; + frame_offset = position % rb->frames_per_block; + + h.raw = rb->pg_vec[pg_vec_pos].buffer + + (frame_offset * rb->frame_size); + + if (status != __packet_get_status(po, h.raw)) + return NULL; + + return h.raw; +} + +static inline void *packet_current_frame(struct packet_sock *po, + struct packet_ring_buffer *rb, + int status) +{ + return packet_lookup_frame(po, rb, rb->head, status); +} + +static inline void *packet_previous_frame(struct packet_sock *po, + struct packet_ring_buffer 
*rb, + int status) +{ + unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max; + return packet_lookup_frame(po, rb, previous, status); +} + +static inline void packet_increment_head(struct packet_ring_buffer *buff) +{ + buff->head = buff->head != buff->frame_max ? buff->head+1 : 0; +} + +static inline struct packet_sock *pkt_sk(struct sock *sk) +{ + return (struct packet_sock *)sk; +} + +static void packet_sock_destruct(struct sock *sk) +{ + skb_queue_purge(&sk->sk_error_queue); + + WARN_ON(atomic_read(&sk->sk_rmem_alloc)); + WARN_ON(atomic_read(&sk->sk_wmem_alloc)); + + if (!sock_flag(sk, SOCK_DEAD)) { + pr_err("Attempt to release alive packet socket: %p\n", sk); + return; + } + + sk_refcnt_debug_dec(sk); +} + + +static const struct proto_ops packet_ops; + +static const struct proto_ops packet_ops_spkt; + +static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct sock *sk; + struct sockaddr_pkt *spkt; + + /* + * When we registered the protocol we saved the socket in the data + * field for just this event. + */ + + sk = pt->af_packet_priv; + + /* + * Yank back the headers [hope the device set this + * right or kerboom...] + * + * Incoming packets have ll header pulled, + * push it back. + * + * For outgoing ones skb->data == skb_mac_header(skb) + * so that this procedure is noop. + */ + + if (skb->pkt_type == PACKET_LOOPBACK) + goto out; + + if (!net_eq(dev_net(dev), sock_net(sk))) + goto out; + + skb = skb_share_check(skb, GFP_ATOMIC); + if (skb == NULL) + goto oom; + + /* drop any routing info */ + skb_dst_drop(skb); + + /* drop conntrack reference */ + nf_reset(skb); + + spkt = &PACKET_SKB_CB(skb)->sa.pkt; + + skb_push(skb, skb->data - skb_mac_header(skb)); + + /* + * The SOCK_PACKET socket receives _all_ frames. + */ + + spkt->spkt_family = dev->type; + strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device)); + spkt->spkt_protocol = skb->protocol; + + /* + * Charge the memory to the socket. This is done specifically + * to prevent sockets using all the memory up. + */ + + if (sock_queue_rcv_skb(sk, skb) == 0) + return 0; + +out: + kfree_skb(skb); +oom: + return 0; +} + + +/* + * Output a raw packet to a device layer. This bypasses all the other + * protocol layers and you must therefore supply it with a complete frame + */ + +static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name; + struct sk_buff *skb = NULL; + struct net_device *dev; + __be16 proto = 0; + int err; + + /* + * Get and verify the address. + */ + + if (saddr) { + if (msg->msg_namelen < sizeof(struct sockaddr)) + return -EINVAL; + if (msg->msg_namelen == sizeof(struct sockaddr_pkt)) + proto = saddr->spkt_protocol; + } else + return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */ + + /* + * Find the device first to size check it + */ + + saddr->spkt_device[13] = 0; +retry: + rcu_read_lock(); + dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device); + err = -ENODEV; + if (dev == NULL) + goto out_unlock; + + err = -ENETDOWN; + if (!(dev->flags & IFF_UP)) + goto out_unlock; + + /* + * You may not queue a frame bigger than the mtu. This is the lowest level + * raw protocol and you must do your own fragmentation at this level. 
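/*
 * Hedged aside (not part of the patch): sending one complete frame through
 * the legacy SOCK_PACKET path handled by packet_sendmsg_spkt() nearby. As
 * the comment says, the caller must build the whole link-layer frame itself;
 * "eth0" and the broadcast frame below are placeholders for illustration.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>

int main(void)
{
	struct sockaddr_pkt spkt;
	unsigned char frame[ETH_ZLEN];	/* 60-byte minimum Ethernet frame */
	int fd = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));

	if (fd < 0) {
		perror("socket(SOCK_PACKET)");
		return 1;
	}
	memset(frame, 0, sizeof(frame));
	memset(frame, 0xff, ETH_ALEN);		/* broadcast destination */
	frame[12] = 0x88;			/* local experimental */
	frame[13] = 0xb5;			/* ethertype 0x88b5 */

	memset(&spkt, 0, sizeof(spkt));
	spkt.spkt_family = AF_PACKET;
	strncpy((char *)spkt.spkt_device, "eth0",
		sizeof(spkt.spkt_device) - 1);
	spkt.spkt_protocol = htons(0x88b5);

	if (sendto(fd, frame, sizeof(frame), 0,
		   (struct sockaddr *)&spkt, sizeof(spkt)) < 0)
		perror("sendto");
	close(fd);
	return 0;
}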
+ */ + + err = -EMSGSIZE; + if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN) + goto out_unlock; + + if (!skb) { + size_t reserved = LL_RESERVED_SPACE(dev); + unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0; + + rcu_read_unlock(); + skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL); + if (skb == NULL) + return -ENOBUFS; + /* FIXME: Save some space for broken drivers that write a hard + * header at transmission time by themselves. PPP is the notable + * one here. This should really be fixed at the driver level. + */ + skb_reserve(skb, reserved); + skb_reset_network_header(skb); + + /* Try to align data part correctly */ + if (hhlen) { + skb->data -= hhlen; + skb->tail -= hhlen; + if (len < hhlen) + skb_reset_network_header(skb); + } + err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); + if (err) + goto out_free; + goto retry; + } + + if (len > (dev->mtu + dev->hard_header_len)) { + /* Earlier code assumed this would be a VLAN pkt, + * double-check this now that we have the actual + * packet in hand. + */ + struct ethhdr *ehdr; + skb_reset_mac_header(skb); + ehdr = eth_hdr(skb); + if (ehdr->h_proto != htons(ETH_P_8021Q)) { + err = -EMSGSIZE; + goto out_unlock; + } + } + + skb->protocol = proto; + skb->dev = dev; + skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; + err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); + if (err < 0) + goto out_unlock; + + dev_queue_xmit(skb); + rcu_read_unlock(); + return len; + +out_unlock: + rcu_read_unlock(); +out_free: + kfree_skb(skb); + return err; +} + +static inline unsigned int run_filter(const struct sk_buff *skb, + const struct sock *sk, + unsigned int res) +{ + struct sk_filter *filter; + + rcu_read_lock(); + filter = rcu_dereference(sk->sk_filter); + if (filter != NULL) + res = SK_RUN_FILTER(filter, skb); + rcu_read_unlock(); + + return res; +} + +/* + * This function makes lazy skb cloning in hope that most of packets + * are discarded by BPF. + * + * Note tricky part: we DO mangle shared skb! skb->data, skb->len + * and skb->cb are mangled. It works because (and until) packets + * falling here are owned by current CPU. Output packets are cloned + * by dev_queue_xmit_nit(), input packets are processed by net_bh + * sequencially, so that if we return skb to original state on exit, + * we will not harm anyone. + */ + +static int packet_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct sock *sk; + struct sockaddr_ll *sll; + struct packet_sock *po; + u8 *skb_head = skb->data; + int skb_len = skb->len; + unsigned int snaplen, res; + + if (skb->pkt_type == PACKET_LOOPBACK) + goto drop; + + sk = pt->af_packet_priv; + po = pkt_sk(sk); + + if (!net_eq(dev_net(dev), sock_net(sk))) + goto drop; + + skb->dev = dev; + + if (dev->header_ops) { + /* The device has an explicit notion of ll header, + * exported to higher levels. + * + * Otherwise, the device hides details of its frame + * structure, so that corresponding packet head is + * never delivered to user. 
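/*
 * Hedged aside (not part of the patch): attaching a classic BPF program so
 * that run_filter() above drops everything except ARP before it reaches the
 * socket queue. The bytecode is the usual "ethertype == ETH_P_ARP" check;
 * socket setup is as in the earlier capture sketch.
 */
#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/if_ether.h>
#include <linux/filter.h>

int main(void)
{
	struct sock_filter code[] = {
		BPF_STMT(BPF_LD  + BPF_H   + BPF_ABS, 12),	     /* load ethertype */
		BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ETH_P_ARP, 0, 1),/* ARP?          */
		BPF_STMT(BPF_RET + BPF_K, 0xffffU),		     /* accept         */
		BPF_STMT(BPF_RET + BPF_K, 0),			     /* drop           */
	};
	struct sock_fprog prog = { .len = 4, .filter = code };
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0 ||
	    setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
		       &prog, sizeof(prog)) < 0) {
		perror("SO_ATTACH_FILTER");
		return 1;
	}
	/* recv() on fd now only ever returns ARP frames */
	close(fd);
	return 0;
}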
+ */ + if (sk->sk_type != SOCK_DGRAM) + skb_push(skb, skb->data - skb_mac_header(skb)); + else if (skb->pkt_type == PACKET_OUTGOING) { + /* Special case: outgoing packets have ll header at head */ + skb_pull(skb, skb_network_offset(skb)); + } + } + + snaplen = skb->len; + + res = run_filter(skb, sk, snaplen); + if (!res) + goto drop_n_restore; + if (snaplen > res) + snaplen = res; + + if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= + (unsigned)sk->sk_rcvbuf) + goto drop_n_acct; + + if (skb_shared(skb)) { + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + if (nskb == NULL) + goto drop_n_acct; + + if (skb_head != skb->data) { + skb->data = skb_head; + skb->len = skb_len; + } + kfree_skb(skb); + skb = nskb; + } + + BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 > + sizeof(skb->cb)); + + sll = &PACKET_SKB_CB(skb)->sa.ll; + sll->sll_family = AF_PACKET; + sll->sll_hatype = dev->type; + sll->sll_protocol = skb->protocol; + sll->sll_pkttype = skb->pkt_type; + if (unlikely(po->origdev)) + sll->sll_ifindex = orig_dev->ifindex; + else + sll->sll_ifindex = dev->ifindex; + + sll->sll_halen = dev_parse_header(skb, sll->sll_addr); + + PACKET_SKB_CB(skb)->origlen = skb->len; + + if (pskb_trim(skb, snaplen)) + goto drop_n_acct; + + skb_set_owner_r(skb, sk); + skb->dev = NULL; + skb_dst_drop(skb); + + /* drop conntrack reference */ + nf_reset(skb); + + spin_lock(&sk->sk_receive_queue.lock); + po->stats.tp_packets++; + skb->dropcount = atomic_read(&sk->sk_drops); + __skb_queue_tail(&sk->sk_receive_queue, skb); + spin_unlock(&sk->sk_receive_queue.lock); + sk->sk_data_ready(sk, skb->len); + return 0; + +drop_n_acct: + spin_lock(&sk->sk_receive_queue.lock); + po->stats.tp_drops++; + atomic_inc(&sk->sk_drops); + spin_unlock(&sk->sk_receive_queue.lock); + +drop_n_restore: + if (skb_head != skb->data && skb_shared(skb)) { + skb->data = skb_head; + skb->len = skb_len; + } +drop: + consume_skb(skb); + return 0; +} + +static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct sock *sk; + struct packet_sock *po; + struct sockaddr_ll *sll; + union { + struct tpacket_hdr *h1; + struct tpacket2_hdr *h2; + void *raw; + } h; + u8 *skb_head = skb->data; + int skb_len = skb->len; + unsigned int snaplen, res; + unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER; + unsigned short macoff, netoff, hdrlen; + struct sk_buff *copy_skb = NULL; + struct timeval tv; + struct timespec ts; + struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); + + if (skb->pkt_type == PACKET_LOOPBACK) + goto drop; + + sk = pt->af_packet_priv; + po = pkt_sk(sk); + + if (!net_eq(dev_net(dev), sock_net(sk))) + goto drop; + + if (dev->header_ops) { + if (sk->sk_type != SOCK_DGRAM) + skb_push(skb, skb->data - skb_mac_header(skb)); + else if (skb->pkt_type == PACKET_OUTGOING) { + /* Special case: outgoing packets have ll header at head */ + skb_pull(skb, skb_network_offset(skb)); + } + } + + if (skb->ip_summed == CHECKSUM_PARTIAL) + status |= TP_STATUS_CSUMNOTREADY; + + snaplen = skb->len; + + res = run_filter(skb, sk, snaplen); + if (!res) + goto drop_n_restore; + if (snaplen > res) + snaplen = res; + + if (sk->sk_type == SOCK_DGRAM) { + macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 + + po->tp_reserve; + } else { + unsigned maclen = skb_network_offset(skb); + netoff = TPACKET_ALIGN(po->tp_hdrlen + + (maclen < 16 ? 
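/*
 * Hedged aside (not part of the patch): recvfrom() on a packet socket hands
 * back the metadata that packet_rcv() nearby stores in
 * PACKET_SKB_CB(skb)->sa.ll - ifindex, packet type and hardware address -
 * as a struct sockaddr_ll. Assumes fd is an AF_PACKET socket created as in
 * the earlier capture sketch.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void read_one(int fd)
{
	unsigned char buf[2048];
	struct sockaddr_ll from;
	socklen_t fromlen = sizeof(from);
	ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
			     (struct sockaddr *)&from, &fromlen);

	if (n < 0)
		return;
	/* sll_pkttype distinguishes host/broadcast/outgoing packets;
	 * sll_ifindex is the receiving device (or, with PACKET_ORIGDEV
	 * enabled, the original device). */
	printf("%zd bytes, ifindex %d, pkttype %d, halen %d\n",
	       n, from.sll_ifindex, from.sll_pkttype, from.sll_halen);
}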
16 : maclen)) + + po->tp_reserve; + macoff = netoff - maclen; + } + + if (macoff + snaplen > po->rx_ring.frame_size) { + if (po->copy_thresh && + atomic_read(&sk->sk_rmem_alloc) + skb->truesize < + (unsigned)sk->sk_rcvbuf) { + if (skb_shared(skb)) { + copy_skb = skb_clone(skb, GFP_ATOMIC); + } else { + copy_skb = skb_get(skb); + skb_head = skb->data; + } + if (copy_skb) + skb_set_owner_r(copy_skb, sk); + } + snaplen = po->rx_ring.frame_size - macoff; + if ((int)snaplen < 0) + snaplen = 0; + } + + spin_lock(&sk->sk_receive_queue.lock); + h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL); + if (!h.raw) + goto ring_is_full; + packet_increment_head(&po->rx_ring); + po->stats.tp_packets++; + if (copy_skb) { + status |= TP_STATUS_COPY; + __skb_queue_tail(&sk->sk_receive_queue, copy_skb); + } + if (!po->stats.tp_drops) + status &= ~TP_STATUS_LOSING; + spin_unlock(&sk->sk_receive_queue.lock); + + skb_copy_bits(skb, 0, h.raw + macoff, snaplen); + + switch (po->tp_version) { + case TPACKET_V1: + h.h1->tp_len = skb->len; + h.h1->tp_snaplen = snaplen; + h.h1->tp_mac = macoff; + h.h1->tp_net = netoff; + if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE) + && shhwtstamps->syststamp.tv64) + tv = ktime_to_timeval(shhwtstamps->syststamp); + else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE) + && shhwtstamps->hwtstamp.tv64) + tv = ktime_to_timeval(shhwtstamps->hwtstamp); + else if (skb->tstamp.tv64) + tv = ktime_to_timeval(skb->tstamp); + else + do_gettimeofday(&tv); + h.h1->tp_sec = tv.tv_sec; + h.h1->tp_usec = tv.tv_usec; + hdrlen = sizeof(*h.h1); + break; + case TPACKET_V2: + h.h2->tp_len = skb->len; + h.h2->tp_snaplen = snaplen; + h.h2->tp_mac = macoff; + h.h2->tp_net = netoff; + if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE) + && shhwtstamps->syststamp.tv64) + ts = ktime_to_timespec(shhwtstamps->syststamp); + else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE) + && shhwtstamps->hwtstamp.tv64) + ts = ktime_to_timespec(shhwtstamps->hwtstamp); + else if (skb->tstamp.tv64) + ts = ktime_to_timespec(skb->tstamp); + else + getnstimeofday(&ts); + h.h2->tp_sec = ts.tv_sec; + h.h2->tp_nsec = ts.tv_nsec; + if (vlan_tx_tag_present(skb)) { + h.h2->tp_vlan_tci = vlan_tx_tag_get(skb); + status |= TP_STATUS_VLAN_VALID; + } else { + h.h2->tp_vlan_tci = 0; + } + h.h2->tp_padding = 0; + hdrlen = sizeof(*h.h2); + break; + default: + BUG(); + } + + sll = h.raw + TPACKET_ALIGN(hdrlen); + sll->sll_halen = dev_parse_header(skb, sll->sll_addr); + sll->sll_family = AF_PACKET; + sll->sll_hatype = dev->type; + sll->sll_protocol = skb->protocol; + sll->sll_pkttype = skb->pkt_type; + if (unlikely(po->origdev)) + sll->sll_ifindex = orig_dev->ifindex; + else + sll->sll_ifindex = dev->ifindex; + + __packet_set_status(po, h.raw, status); + smp_mb(); +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 + { + u8 *start, *end; + + end = (u8 *)PAGE_ALIGN((unsigned long)h.raw + macoff + snaplen); + for (start = h.raw; start < end; start += PAGE_SIZE) + flush_dcache_page(pgv_to_page(start)); + } +#endif + + sk->sk_data_ready(sk, 0); + +drop_n_restore: + if (skb_head != skb->data && skb_shared(skb)) { + skb->data = skb_head; + skb->len = skb_len; + } +drop: + kfree_skb(skb); + return 0; + +ring_is_full: + po->stats.tp_drops++; + spin_unlock(&sk->sk_receive_queue.lock); + + sk->sk_data_ready(sk, 0); + kfree_skb(copy_skb); + goto drop_n_restore; +} + +static void tpacket_destruct_skb(struct sk_buff *skb) +{ + struct packet_sock *po = pkt_sk(skb->sk); + void *ph; + + BUG_ON(skb == NULL); + + if (likely(po->tx_ring.pg_vec)) 
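/*
 * Hedged aside (not part of the patch): what a TPACKET_V2 ring consumer does
 * with the fields that tpacket_rcv() above fills in. It assumes the ring was
 * already configured with PACKET_RX_RING and mmap()ed (see the later setup
 * sketch); `ring`, `frame_nr` and `frame_size` mirror the tpacket_req used.
 */
#include <stdio.h>
#include <linux/if_packet.h>

static void consume_ring(char *ring, unsigned int frame_nr,
			 unsigned int frame_size)
{
	unsigned int i;

	for (i = 0; i < frame_nr; i++) {
		volatile struct tpacket2_hdr *hdr =
			(void *)(ring + (size_t)i * frame_size);

		if (!(hdr->tp_status & TP_STATUS_USER))
			continue;	/* frame still owned by the kernel */

		printf("frame %u: len %u snaplen %u @%u.%09u vlan %u\n",
		       i, hdr->tp_len, hdr->tp_snaplen,
		       hdr->tp_sec, hdr->tp_nsec,
		       (unsigned int)hdr->tp_vlan_tci);
		/* packet data starts tp_mac bytes into the frame */

		hdr->tp_status = TP_STATUS_KERNEL;	/* hand it back */
	}
}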
{ + ph = skb_shinfo(skb)->destructor_arg; + BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING); + BUG_ON(atomic_read(&po->tx_ring.pending) == 0); + atomic_dec(&po->tx_ring.pending); + __packet_set_status(po, ph, TP_STATUS_AVAILABLE); + } + + sock_wfree(skb); +} + +static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, + void *frame, struct net_device *dev, int size_max, + __be16 proto, unsigned char *addr) +{ + union { + struct tpacket_hdr *h1; + struct tpacket2_hdr *h2; + void *raw; + } ph; + int to_write, offset, len, tp_len, nr_frags, len_max; + struct socket *sock = po->sk.sk_socket; + struct page *page; + void *data; + int err; + + ph.raw = frame; + + skb->protocol = proto; + skb->dev = dev; + skb->priority = po->sk.sk_priority; + skb->mark = po->sk.sk_mark; + skb_shinfo(skb)->destructor_arg = ph.raw; + + switch (po->tp_version) { + case TPACKET_V2: + tp_len = ph.h2->tp_len; + break; + default: + tp_len = ph.h1->tp_len; + break; + } + if (unlikely(tp_len > size_max)) { + pr_err("packet size is too long (%d > %d)\n", tp_len, size_max); + return -EMSGSIZE; + } + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb_reset_network_header(skb); + + data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll); + to_write = tp_len; + + if (sock->type == SOCK_DGRAM) { + err = dev_hard_header(skb, dev, ntohs(proto), addr, + NULL, tp_len); + if (unlikely(err < 0)) + return -EINVAL; + } else if (dev->hard_header_len) { + /* net device doesn't like empty head */ + if (unlikely(tp_len <= dev->hard_header_len)) { + pr_err("packet size is too short (%d < %d)\n", + tp_len, dev->hard_header_len); + return -EINVAL; + } + + skb_push(skb, dev->hard_header_len); + err = skb_store_bits(skb, 0, data, + dev->hard_header_len); + if (unlikely(err)) + return err; + + data += dev->hard_header_len; + to_write -= dev->hard_header_len; + } + + err = -EFAULT; + offset = offset_in_page(data); + len_max = PAGE_SIZE - offset; + len = ((to_write > len_max) ? len_max : to_write); + + skb->data_len = to_write; + skb->len += to_write; + skb->truesize += to_write; + atomic_add(to_write, &po->sk.sk_wmem_alloc); + + while (likely(to_write)) { + nr_frags = skb_shinfo(skb)->nr_frags; + + if (unlikely(nr_frags >= MAX_SKB_FRAGS)) { + pr_err("Packet exceed the number of skb frags(%lu)\n", + MAX_SKB_FRAGS); + return -EFAULT; + } + + page = pgv_to_page(data); + data += len; + flush_dcache_page(page); + get_page(page); + skb_fill_page_desc(skb, nr_frags, page, offset, len); + to_write -= len; + offset = 0; + len_max = PAGE_SIZE; + len = ((to_write > len_max) ? 
len_max : to_write); + } + + return tp_len; +} + +static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) +{ + struct sk_buff *skb; + struct net_device *dev; + __be16 proto; + int ifindex, err, reserve = 0; + void *ph; + struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name; + int tp_len, size_max; + unsigned char *addr; + int len_sum = 0; + int status = 0; + + mutex_lock(&po->pg_vec_lock); + + err = -EBUSY; + if (saddr == NULL) { + ifindex = po->ifindex; + proto = po->num; + addr = NULL; + } else { + err = -EINVAL; + if (msg->msg_namelen < sizeof(struct sockaddr_ll)) + goto out; + if (msg->msg_namelen < (saddr->sll_halen + + offsetof(struct sockaddr_ll, + sll_addr))) + goto out; + ifindex = saddr->sll_ifindex; + proto = saddr->sll_protocol; + addr = saddr->sll_addr; + } + + dev = dev_get_by_index(sock_net(&po->sk), ifindex); + err = -ENXIO; + if (unlikely(dev == NULL)) + goto out; + + reserve = dev->hard_header_len; + + err = -ENETDOWN; + if (unlikely(!(dev->flags & IFF_UP))) + goto out_put; + + size_max = po->tx_ring.frame_size + - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); + + if (size_max > dev->mtu + reserve) + size_max = dev->mtu + reserve; + + do { + ph = packet_current_frame(po, &po->tx_ring, + TP_STATUS_SEND_REQUEST); + + if (unlikely(ph == NULL)) { + schedule(); + continue; + } + + status = TP_STATUS_SEND_REQUEST; + skb = sock_alloc_send_skb(&po->sk, + LL_ALLOCATED_SPACE(dev) + + sizeof(struct sockaddr_ll), + 0, &err); + + if (unlikely(skb == NULL)) + goto out_status; + + tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto, + addr); + + if (unlikely(tp_len < 0)) { + if (po->tp_loss) { + __packet_set_status(po, ph, + TP_STATUS_AVAILABLE); + packet_increment_head(&po->tx_ring); + kfree_skb(skb); + continue; + } else { + status = TP_STATUS_WRONG_FORMAT; + err = tp_len; + goto out_status; + } + } + + skb->destructor = tpacket_destruct_skb; + __packet_set_status(po, ph, TP_STATUS_SENDING); + atomic_inc(&po->tx_ring.pending); + + status = TP_STATUS_SEND_REQUEST; + err = dev_queue_xmit(skb); + if (unlikely(err > 0)) { + err = net_xmit_errno(err); + if (err && __packet_get_status(po, ph) == + TP_STATUS_AVAILABLE) { + /* skb was destructed already */ + skb = NULL; + goto out_status; + } + /* + * skb was dropped but not destructed yet; + * let's treat it like congestion or err < 0 + */ + err = 0; + } + packet_increment_head(&po->tx_ring); + len_sum += tp_len; + } while (likely((ph != NULL) || + ((!(msg->msg_flags & MSG_DONTWAIT)) && + (atomic_read(&po->tx_ring.pending)))) + ); + + err = len_sum; + goto out_put; + +out_status: + __packet_set_status(po, ph, status); + kfree_skb(skb); +out_put: + dev_put(dev); +out: + mutex_unlock(&po->pg_vec_lock); + return err; +} + +static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, + size_t reserve, size_t len, + size_t linear, int noblock, + int *err) +{ + struct sk_buff *skb; + + /* Under a page? Don't bother with paged skb. 
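/*
 * Hedged aside (not part of the patch): the userspace half of tpacket_snd()
 * below. Frames are written into a previously configured and mmap()ed
 * PACKET_TX_RING, marked TP_STATUS_SEND_REQUEST, and flushed with a
 * zero-length send(). The payload offset follows tpacket_fill_skb(): the
 * header length minus the sockaddr_ll it accounts for.
 */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int queue_frame(char *ring, unsigned int idx, unsigned int frame_size,
		       const void *pkt, unsigned int len, int fd)
{
	volatile struct tpacket2_hdr *hdr =
		(void *)(ring + (size_t)idx * frame_size);
	char *data = (char *)hdr + TPACKET2_HDRLEN -
		     sizeof(struct sockaddr_ll);

	if (hdr->tp_status != TP_STATUS_AVAILABLE)
		return -1;		/* frame still owned by the kernel */

	memcpy(data, pkt, len);		/* complete link-layer frame */
	hdr->tp_len = len;
	hdr->tp_status = TP_STATUS_SEND_REQUEST;

	/* Kick tpacket_snd(); MSG_DONTWAIT makes it return without waiting
	 * for every queued frame to drain. */
	return (int)send(fd, NULL, 0, MSG_DONTWAIT);
}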
*/ + if (prepad + len < PAGE_SIZE || !linear) + linear = len; + + skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, + err); + if (!skb) + return NULL; + + skb_reserve(skb, reserve); + skb_put(skb, linear); + skb->data_len = len - linear; + skb->len += len - linear; + + return skb; +} + +static int packet_snd(struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name; + struct sk_buff *skb; + struct net_device *dev; + __be16 proto; + unsigned char *addr; + int ifindex, err, reserve = 0; + struct virtio_net_hdr vnet_hdr = { 0 }; + int offset = 0; + int vnet_hdr_len; + struct packet_sock *po = pkt_sk(sk); + unsigned short gso_type = 0; + + /* + * Get and verify the address. + */ + + if (saddr == NULL) { + ifindex = po->ifindex; + proto = po->num; + addr = NULL; + } else { + err = -EINVAL; + if (msg->msg_namelen < sizeof(struct sockaddr_ll)) + goto out; + if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) + goto out; + ifindex = saddr->sll_ifindex; + proto = saddr->sll_protocol; + addr = saddr->sll_addr; + } + + + dev = dev_get_by_index(sock_net(sk), ifindex); + err = -ENXIO; + if (dev == NULL) + goto out_unlock; + if (sock->type == SOCK_RAW) + reserve = dev->hard_header_len; + + err = -ENETDOWN; + if (!(dev->flags & IFF_UP)) + goto out_unlock; + + if (po->has_vnet_hdr) { + vnet_hdr_len = sizeof(vnet_hdr); + + err = -EINVAL; + if (len < vnet_hdr_len) + goto out_unlock; + + len -= vnet_hdr_len; + + err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov, + vnet_hdr_len); + if (err < 0) + goto out_unlock; + + if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && + (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 > + vnet_hdr.hdr_len)) + vnet_hdr.hdr_len = vnet_hdr.csum_start + + vnet_hdr.csum_offset + 2; + + err = -EINVAL; + if (vnet_hdr.hdr_len > len) + goto out_unlock; + + if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) { + switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { + case VIRTIO_NET_HDR_GSO_TCPV4: + gso_type = SKB_GSO_TCPV4; + break; + case VIRTIO_NET_HDR_GSO_TCPV6: + gso_type = SKB_GSO_TCPV6; + break; + case VIRTIO_NET_HDR_GSO_UDP: + gso_type = SKB_GSO_UDP; + break; + default: + goto out_unlock; + } + + if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN) + gso_type |= SKB_GSO_TCP_ECN; + + if (vnet_hdr.gso_size == 0) + goto out_unlock; + + } + } + + err = -EMSGSIZE; + if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN)) + goto out_unlock; + + err = -ENOBUFS; + skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev), + LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len, + msg->msg_flags & MSG_DONTWAIT, &err); + if (skb == NULL) + goto out_unlock; + + skb_set_network_header(skb, reserve); + + err = -EINVAL; + if (sock->type == SOCK_DGRAM && + (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0) + goto out_free; + + /* Returns -EFAULT on error */ + err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len); + if (err) + goto out_free; + err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); + if (err < 0) + goto out_free; + + if (!gso_type && (len > dev->mtu + reserve)) { + /* Earlier code assumed this would be a VLAN pkt, + * double-check this now that we have the actual + * packet in hand. 
+ */ + struct ethhdr *ehdr; + skb_reset_mac_header(skb); + ehdr = eth_hdr(skb); + if (ehdr->h_proto != htons(ETH_P_8021Q)) { + err = -EMSGSIZE; + goto out_free; + } + } + + skb->protocol = proto; + skb->dev = dev; + skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; + + if (po->has_vnet_hdr) { + if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { + if (!skb_partial_csum_set(skb, vnet_hdr.csum_start, + vnet_hdr.csum_offset)) { + err = -EINVAL; + goto out_free; + } + } + + skb_shinfo(skb)->gso_size = vnet_hdr.gso_size; + skb_shinfo(skb)->gso_type = gso_type; + + /* Header must be checked, and gso_segs computed. */ + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; + skb_shinfo(skb)->gso_segs = 0; + + len += vnet_hdr_len; + } + + /* + * Now send it + */ + + err = dev_queue_xmit(skb); + if (err > 0 && (err = net_xmit_errno(err)) != 0) + goto out_unlock; + + dev_put(dev); + + return len; + +out_free: + kfree_skb(skb); +out_unlock: + if (dev) + dev_put(dev); +out: + return err; +} + +static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct packet_sock *po = pkt_sk(sk); + if (po->tx_ring.pg_vec) + return tpacket_snd(po, msg); + else + return packet_snd(sock, msg, len); +} + +/* + * Close a PACKET socket. This is fairly simple. We immediately go + * to 'closed' state and remove our protocol entry in the device list. + */ + +static int packet_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct packet_sock *po; + struct net *net; + struct tpacket_req req; + + if (!sk) + return 0; + + net = sock_net(sk); + po = pkt_sk(sk); + + spin_lock_bh(&net->packet.sklist_lock); + sk_del_node_init_rcu(sk); + sock_prot_inuse_add(net, sk->sk_prot, -1); + spin_unlock_bh(&net->packet.sklist_lock); + + spin_lock(&po->bind_lock); + if (po->running) { + /* + * Remove from protocol table + */ + po->running = 0; + po->num = 0; + __dev_remove_pack(&po->prot_hook); + __sock_put(sk); + } + spin_unlock(&po->bind_lock); + + packet_flush_mclist(sk); + + memset(&req, 0, sizeof(req)); + + if (po->rx_ring.pg_vec) + packet_set_ring(sk, &req, 1, 0); + + if (po->tx_ring.pg_vec) + packet_set_ring(sk, &req, 1, 1); + + synchronize_net(); + /* + * Now the socket is dead. No more input will appear. + */ + sock_orphan(sk); + sock->sk = NULL; + + /* Purge queues */ + + skb_queue_purge(&sk->sk_receive_queue); + sk_refcnt_debug_release(sk); + + sock_put(sk); + return 0; +} + +/* + * Attach a packet hook. + */ + +static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol) +{ + struct packet_sock *po = pkt_sk(sk); + /* + * Detach an existing hook if present. + */ + + lock_sock(sk); + + spin_lock(&po->bind_lock); + if (po->running) { + __sock_put(sk); + po->running = 0; + po->num = 0; + spin_unlock(&po->bind_lock); + dev_remove_pack(&po->prot_hook); + spin_lock(&po->bind_lock); + } + + po->num = protocol; + po->prot_hook.type = protocol; + po->prot_hook.dev = dev; + + po->ifindex = dev ? 
dev->ifindex : 0; + + if (protocol == 0) + goto out_unlock; + + if (!dev || (dev->flags & IFF_UP)) { + dev_add_pack(&po->prot_hook); + sock_hold(sk); + po->running = 1; + } else { + sk->sk_err = ENETDOWN; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + } + +out_unlock: + spin_unlock(&po->bind_lock); + release_sock(sk); + return 0; +} + +/* + * Bind a packet socket to a device + */ + +static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, + int addr_len) +{ + struct sock *sk = sock->sk; + char name[15]; + struct net_device *dev; + int err = -ENODEV; + + /* + * Check legality + */ + + if (addr_len != sizeof(struct sockaddr)) + return -EINVAL; + strlcpy(name, uaddr->sa_data, sizeof(name)); + + dev = dev_get_by_name(sock_net(sk), name); + if (dev) { + err = packet_do_bind(sk, dev, pkt_sk(sk)->num); + dev_put(dev); + } + return err; +} + +static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr; + struct sock *sk = sock->sk; + struct net_device *dev = NULL; + int err; + + + /* + * Check legality + */ + + if (addr_len < sizeof(struct sockaddr_ll)) + return -EINVAL; + if (sll->sll_family != AF_PACKET) + return -EINVAL; + + if (sll->sll_ifindex) { + err = -ENODEV; + dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex); + if (dev == NULL) + goto out; + } + err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num); + if (dev) + dev_put(dev); + +out: + return err; +} + +static struct proto packet_proto = { + .name = "PACKET", + .owner = THIS_MODULE, + .obj_size = sizeof(struct packet_sock), +}; + +/* + * Create a packet of type SOCK_PACKET. + */ + +static int packet_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + struct packet_sock *po; + __be16 proto = (__force __be16)protocol; /* weird, but documented */ + int err; + + if (!capable(CAP_NET_RAW)) + return -EPERM; + if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW && + sock->type != SOCK_PACKET) + return -ESOCKTNOSUPPORT; + + sock->state = SS_UNCONNECTED; + + err = -ENOBUFS; + sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto); + if (sk == NULL) + goto out; + + sock->ops = &packet_ops; + if (sock->type == SOCK_PACKET) + sock->ops = &packet_ops_spkt; + + sock_init_data(sock, sk); + + po = pkt_sk(sk); + sk->sk_family = PF_PACKET; + po->num = proto; + + sk->sk_destruct = packet_sock_destruct; + sk_refcnt_debug_inc(sk); + + /* + * Attach a protocol block + */ + + spin_lock_init(&po->bind_lock); + mutex_init(&po->pg_vec_lock); + po->prot_hook.func = packet_rcv; + + if (sock->type == SOCK_PACKET) + po->prot_hook.func = packet_rcv_spkt; + + po->prot_hook.af_packet_priv = sk; + + if (proto) { + po->prot_hook.type = proto; + dev_add_pack(&po->prot_hook); + sock_hold(sk); + po->running = 1; + } + + spin_lock_bh(&net->packet.sklist_lock); + sk_add_node_rcu(sk, &net->packet.sklist); + sock_prot_inuse_add(net, &packet_proto, 1); + spin_unlock_bh(&net->packet.sklist_lock); + + return 0; +out: + return err; +} + +static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len) +{ + struct sock_exterr_skb *serr; + struct sk_buff *skb, *skb2; + int copied, err; + + err = -EAGAIN; + skb = skb_dequeue(&sk->sk_error_queue); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free_skb; + + sock_recv_timestamp(msg, sk, 
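/*
 * Hedged aside (not part of the patch): binding a packet socket to one
 * interface and one ethertype, which lands in packet_do_bind() above and
 * sets prot_hook.dev/type accordingly. "eth0" is a placeholder name.
 */
#include <stdio.h>
#include <string.h>
#include <net/if.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>

int main(void)
{
	struct sockaddr_ll sll;
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_IP));

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	memset(&sll, 0, sizeof(sll));
	sll.sll_family = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_IP);
	sll.sll_ifindex = if_nametoindex("eth0");

	if (bind(fd, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
		perror("bind");
		return 1;
	}
	return 0;
}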
skb); + + serr = SKB_EXT_ERR(skb); + put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP, + sizeof(serr->ee), &serr->ee); + + msg->msg_flags |= MSG_ERRQUEUE; + err = copied; + + /* Reset and regenerate socket error */ + spin_lock_bh(&sk->sk_error_queue.lock); + sk->sk_err = 0; + if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { + sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; + spin_unlock_bh(&sk->sk_error_queue.lock); + sk->sk_error_report(sk); + } else + spin_unlock_bh(&sk->sk_error_queue.lock); + +out_free_skb: + kfree_skb(skb); +out: + return err; +} + +/* + * Pull a packet from our receive queue and hand it to the user. + * If necessary we block. + */ + +static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len, int flags) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + int copied, err; + struct sockaddr_ll *sll; + int vnet_hdr_len = 0; + + err = -EINVAL; + if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE)) + goto out; + +#if 0 + /* What error should we return now? EUNATTACH? */ + if (pkt_sk(sk)->ifindex < 0) + return -ENODEV; +#endif + + if (flags & MSG_ERRQUEUE) { + err = packet_recv_error(sk, msg, len); + goto out; + } + + /* + * Call the generic datagram receiver. This handles all sorts + * of horrible races and re-entrancy so we can forget about it + * in the protocol layers. + * + * Now it will return ENETDOWN, if device have just gone down, + * but then it will block. + */ + + skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err); + + /* + * An error occurred so return it. Because skb_recv_datagram() + * handles the blocking we don't see and worry about blocking + * retries. + */ + + if (skb == NULL) + goto out; + + if (pkt_sk(sk)->has_vnet_hdr) { + struct virtio_net_hdr vnet_hdr = { 0 }; + + err = -EINVAL; + vnet_hdr_len = sizeof(vnet_hdr); + if (len < vnet_hdr_len) + goto out_free; + + len -= vnet_hdr_len; + + if (skb_is_gso(skb)) { + struct skb_shared_info *sinfo = skb_shinfo(skb); + + /* This is a hint as to how much should be linear. */ + vnet_hdr.hdr_len = skb_headlen(skb); + vnet_hdr.gso_size = sinfo->gso_size; + if (sinfo->gso_type & SKB_GSO_TCPV4) + vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; + else if (sinfo->gso_type & SKB_GSO_TCPV6) + vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6; + else if (sinfo->gso_type & SKB_GSO_UDP) + vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP; + else if (sinfo->gso_type & SKB_GSO_FCOE) + goto out_free; + else + BUG(); + if (sinfo->gso_type & SKB_GSO_TCP_ECN) + vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN; + } else + vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE; + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + vnet_hdr.csum_start = skb_checksum_start_offset(skb); + vnet_hdr.csum_offset = skb->csum_offset; + } /* else everything is zero */ + + err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr, + vnet_hdr_len); + if (err < 0) + goto out_free; + } + + /* + * If the address length field is there to be filled in, we fill + * it in now. + */ + + sll = &PACKET_SKB_CB(skb)->sa.ll; + if (sock->type == SOCK_PACKET) + msg->msg_namelen = sizeof(struct sockaddr_pkt); + else + msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr); + + /* + * You lose any data beyond the buffer you gave. If it worries a + * user program they can ask the device for its MTU anyway. 
+ */ + + copied = skb->len; + if (copied > len) { + copied = len; + msg->msg_flags |= MSG_TRUNC; + } + + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free; + + sock_recv_ts_and_drops(msg, sk, skb); + + if (msg->msg_name) + memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, + msg->msg_namelen); + + if (pkt_sk(sk)->auxdata) { + struct tpacket_auxdata aux; + + aux.tp_status = TP_STATUS_USER; + if (skb->ip_summed == CHECKSUM_PARTIAL) + aux.tp_status |= TP_STATUS_CSUMNOTREADY; + aux.tp_len = PACKET_SKB_CB(skb)->origlen; + aux.tp_snaplen = skb->len; + aux.tp_mac = 0; + aux.tp_net = skb_network_offset(skb); + if (vlan_tx_tag_present(skb)) { + aux.tp_vlan_tci = vlan_tx_tag_get(skb); + aux.tp_status |= TP_STATUS_VLAN_VALID; + } else { + aux.tp_vlan_tci = 0; + } + aux.tp_padding = 0; + put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux); + } + + /* + * Free or return the buffer as appropriate. Again this + * hides all the races and re-entrancy issues from us. + */ + err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied); + +out_free: + skb_free_datagram(sk, skb); +out: + return err; +} + +static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct net_device *dev; + struct sock *sk = sock->sk; + + if (peer) + return -EOPNOTSUPP; + + uaddr->sa_family = AF_PACKET; + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex); + if (dev) + strncpy(uaddr->sa_data, dev->name, 14); + else + memset(uaddr->sa_data, 0, 14); + rcu_read_unlock(); + *uaddr_len = sizeof(*uaddr); + + return 0; +} + +static int packet_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct net_device *dev; + struct sock *sk = sock->sk; + struct packet_sock *po = pkt_sk(sk); + DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr); + + if (peer) + return -EOPNOTSUPP; + + sll->sll_family = AF_PACKET; + sll->sll_ifindex = po->ifindex; + sll->sll_protocol = po->num; + sll->sll_pkttype = 0; + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex); + if (dev) { + sll->sll_hatype = dev->type; + sll->sll_halen = dev->addr_len; + memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len); + } else { + sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */ + sll->sll_halen = 0; + } + rcu_read_unlock(); + *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen; + + return 0; +} + +static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, + int what) +{ + switch (i->type) { + case PACKET_MR_MULTICAST: + if (i->alen != dev->addr_len) + return -EINVAL; + if (what > 0) + return dev_mc_add(dev, i->addr); + else + return dev_mc_del(dev, i->addr); + break; + case PACKET_MR_PROMISC: + return dev_set_promiscuity(dev, what); + break; + case PACKET_MR_ALLMULTI: + return dev_set_allmulti(dev, what); + break; + case PACKET_MR_UNICAST: + if (i->alen != dev->addr_len) + return -EINVAL; + if (what > 0) + return dev_uc_add(dev, i->addr); + else + return dev_uc_del(dev, i->addr); + break; + default: + break; + } + return 0; +} + +static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what) +{ + for ( ; i; i = i->next) { + if (i->ifindex == dev->ifindex) + packet_dev_mc(dev, i, what); + } +} + +static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) +{ + struct packet_sock *po = pkt_sk(sk); + struct packet_mclist *ml, *i; + struct net_device *dev; + int err; + + rtnl_lock(); + + err = -ENODEV; + dev = 
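/*
 * Hedged aside (not part of the patch): enabling PACKET_AUXDATA and picking
 * up the struct tpacket_auxdata control message that packet_recvmsg() above
 * attaches (checksum state, original length, VLAN tag). Assumes fd is an
 * AF_PACKET socket created as in the earlier sketches.
 */
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void recv_with_aux(int fd)
{
	unsigned char buf[2048];
	unsigned char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg;
	int one = 1;

	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));

	if (recvmsg(fd, &msg, 0) < 0)
		return;
	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		struct tpacket_auxdata aux;

		if (cmsg->cmsg_level != SOL_PACKET ||
		    cmsg->cmsg_type != PACKET_AUXDATA)
			continue;
		memcpy(&aux, CMSG_DATA(cmsg), sizeof(aux));
		printf("origlen %u snaplen %u vlan %u csum-not-ready %d\n",
		       aux.tp_len, aux.tp_snaplen,
		       (unsigned int)aux.tp_vlan_tci,
		       !!(aux.tp_status & TP_STATUS_CSUMNOTREADY));
	}
}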
__dev_get_by_index(sock_net(sk), mreq->mr_ifindex); + if (!dev) + goto done; + + err = -EINVAL; + if (mreq->mr_alen > dev->addr_len) + goto done; + + err = -ENOBUFS; + i = kmalloc(sizeof(*i), GFP_KERNEL); + if (i == NULL) + goto done; + + err = 0; + for (ml = po->mclist; ml; ml = ml->next) { + if (ml->ifindex == mreq->mr_ifindex && + ml->type == mreq->mr_type && + ml->alen == mreq->mr_alen && + memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { + ml->count++; + /* Free the new element ... */ + kfree(i); + goto done; + } + } + + i->type = mreq->mr_type; + i->ifindex = mreq->mr_ifindex; + i->alen = mreq->mr_alen; + memcpy(i->addr, mreq->mr_address, i->alen); + i->count = 1; + i->next = po->mclist; + po->mclist = i; + err = packet_dev_mc(dev, i, 1); + if (err) { + po->mclist = i->next; + kfree(i); + } + +done: + rtnl_unlock(); + return err; +} + +static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq) +{ + struct packet_mclist *ml, **mlp; + + rtnl_lock(); + + for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) { + if (ml->ifindex == mreq->mr_ifindex && + ml->type == mreq->mr_type && + ml->alen == mreq->mr_alen && + memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { + if (--ml->count == 0) { + struct net_device *dev; + *mlp = ml->next; + dev = __dev_get_by_index(sock_net(sk), ml->ifindex); + if (dev) + packet_dev_mc(dev, ml, -1); + kfree(ml); + } + rtnl_unlock(); + return 0; + } + } + rtnl_unlock(); + return -EADDRNOTAVAIL; +} + +static void packet_flush_mclist(struct sock *sk) +{ + struct packet_sock *po = pkt_sk(sk); + struct packet_mclist *ml; + + if (!po->mclist) + return; + + rtnl_lock(); + while ((ml = po->mclist) != NULL) { + struct net_device *dev; + + po->mclist = ml->next; + dev = __dev_get_by_index(sock_net(sk), ml->ifindex); + if (dev != NULL) + packet_dev_mc(dev, ml, -1); + kfree(ml); + } + rtnl_unlock(); +} + +static int +packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct packet_sock *po = pkt_sk(sk); + int ret; + + if (level != SOL_PACKET) + return -ENOPROTOOPT; + + switch (optname) { + case PACKET_ADD_MEMBERSHIP: + case PACKET_DROP_MEMBERSHIP: + { + struct packet_mreq_max mreq; + int len = optlen; + memset(&mreq, 0, sizeof(mreq)); + if (len < sizeof(struct packet_mreq)) + return -EINVAL; + if (len > sizeof(mreq)) + len = sizeof(mreq); + if (copy_from_user(&mreq, optval, len)) + return -EFAULT; + if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address))) + return -EINVAL; + if (optname == PACKET_ADD_MEMBERSHIP) + ret = packet_mc_add(sk, &mreq); + else + ret = packet_mc_drop(sk, &mreq); + return ret; + } + + case PACKET_RX_RING: + case PACKET_TX_RING: + { + struct tpacket_req req; + + if (optlen < sizeof(req)) + return -EINVAL; + if (pkt_sk(sk)->has_vnet_hdr) + return -EINVAL; + if (copy_from_user(&req, optval, sizeof(req))) + return -EFAULT; + return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING); + } + case PACKET_COPY_THRESH: + { + int val; + + if (optlen != sizeof(val)) + return -EINVAL; + if (copy_from_user(&val, optval, sizeof(val))) + return -EFAULT; + + pkt_sk(sk)->copy_thresh = val; + return 0; + } + case PACKET_VERSION: + { + int val; + + if (optlen != sizeof(val)) + return -EINVAL; + if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) + return -EBUSY; + if (copy_from_user(&val, optval, sizeof(val))) + return -EFAULT; + switch (val) { + case TPACKET_V1: + case TPACKET_V2: + po->tp_version = val; + return 0; + default: + 
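/*
 * Hedged aside (not part of the patch): putting the interface into
 * promiscuous mode through the membership interface handled by
 * packet_mc_add()/packet_dev_mc() nearby, instead of toggling IFF_PROMISC
 * directly. "eth0" is a placeholder; the reference is dropped again on
 * PACKET_DROP_MEMBERSHIP or when the socket is closed.
 */
#include <stdio.h>
#include <string.h>
#include <net/if.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>

int main(void)
{
	struct packet_mreq mreq;
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = if_nametoindex("eth0");
	mreq.mr_type = PACKET_MR_PROMISC;

	if (setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
		       &mreq, sizeof(mreq)) < 0) {
		perror("PACKET_ADD_MEMBERSHIP");
		return 1;
	}
	return 0;
}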
return -EINVAL; + } + } + case PACKET_RESERVE: + { + unsigned int val; + + if (optlen != sizeof(val)) + return -EINVAL; + if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) + return -EBUSY; + if (copy_from_user(&val, optval, sizeof(val))) + return -EFAULT; + po->tp_reserve = val; + return 0; + } + case PACKET_LOSS: + { + unsigned int val; + + if (optlen != sizeof(val)) + return -EINVAL; + if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) + return -EBUSY; + if (copy_from_user(&val, optval, sizeof(val))) + return -EFAULT; + po->tp_loss = !!val; + return 0; + } + case PACKET_AUXDATA: + { + int val; + + if (optlen < sizeof(val)) + return -EINVAL; + if (copy_from_user(&val, optval, sizeof(val))) + return -EFAULT; + + po->auxdata = !!val; + return 0; + } + case PACKET_ORIGDEV: + { + int val; + + if (optlen < sizeof(val)) + return -EINVAL; + if (copy_from_user(&val, optval, sizeof(val))) + return -EFAULT; + + po->origdev = !!val; + return 0; + } + case PACKET_VNET_HDR: + { + int val; + + if (sock->type != SOCK_RAW) + return -EINVAL; + if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) + return -EBUSY; + if (optlen < sizeof(val)) + return -EINVAL; + if (copy_from_user(&val, optval, sizeof(val))) + return -EFAULT; + + po->has_vnet_hdr = !!val; + return 0; + } + case PACKET_TIMESTAMP: + { + int val; + + if (optlen != sizeof(val)) + return -EINVAL; + if (copy_from_user(&val, optval, sizeof(val))) + return -EFAULT; + + po->tp_tstamp = val; + return 0; + } + default: + return -ENOPROTOOPT; + } +} + +static int packet_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + int len; + int val; + struct sock *sk = sock->sk; + struct packet_sock *po = pkt_sk(sk); + void *data; + struct tpacket_stats st; + + if (level != SOL_PACKET) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + + if (len < 0) + return -EINVAL; + + switch (optname) { + case PACKET_STATISTICS: + if (len > sizeof(struct tpacket_stats)) + len = sizeof(struct tpacket_stats); + spin_lock_bh(&sk->sk_receive_queue.lock); + st = po->stats; + memset(&po->stats, 0, sizeof(st)); + spin_unlock_bh(&sk->sk_receive_queue.lock); + st.tp_packets += st.tp_drops; + + data = &st; + break; + case PACKET_AUXDATA: + if (len > sizeof(int)) + len = sizeof(int); + val = po->auxdata; + + data = &val; + break; + case PACKET_ORIGDEV: + if (len > sizeof(int)) + len = sizeof(int); + val = po->origdev; + + data = &val; + break; + case PACKET_VNET_HDR: + if (len > sizeof(int)) + len = sizeof(int); + val = po->has_vnet_hdr; + + data = &val; + break; + case PACKET_VERSION: + if (len > sizeof(int)) + len = sizeof(int); + val = po->tp_version; + data = &val; + break; + case PACKET_HDRLEN: + if (len > sizeof(int)) + len = sizeof(int); + if (copy_from_user(&val, optval, len)) + return -EFAULT; + switch (val) { + case TPACKET_V1: + val = sizeof(struct tpacket_hdr); + break; + case TPACKET_V2: + val = sizeof(struct tpacket2_hdr); + break; + default: + return -EINVAL; + } + data = &val; + break; + case PACKET_RESERVE: + if (len > sizeof(unsigned int)) + len = sizeof(unsigned int); + val = po->tp_reserve; + data = &val; + break; + case PACKET_LOSS: + if (len > sizeof(unsigned int)) + len = sizeof(unsigned int); + val = po->tp_loss; + data = &val; + break; + case PACKET_TIMESTAMP: + if (len > sizeof(int)) + len = sizeof(int); + val = po->tp_tstamp; + data = &val; + break; + default: + return -ENOPROTOOPT; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, data, len)) + return -EFAULT; + return 
0; +} + + +static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data) +{ + struct sock *sk; + struct hlist_node *node; + struct net_device *dev = data; + struct net *net = dev_net(dev); + + rcu_read_lock(); + sk_for_each_rcu(sk, node, &net->packet.sklist) { + struct packet_sock *po = pkt_sk(sk); + + switch (msg) { + case NETDEV_UNREGISTER: + if (po->mclist) + packet_dev_mclist(dev, po->mclist, -1); + /* fallthrough */ + + case NETDEV_DOWN: + if (dev->ifindex == po->ifindex) { + spin_lock(&po->bind_lock); + if (po->running) { + __dev_remove_pack(&po->prot_hook); + __sock_put(sk); + po->running = 0; + sk->sk_err = ENETDOWN; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + } + if (msg == NETDEV_UNREGISTER) { + po->ifindex = -1; + po->prot_hook.dev = NULL; + } + spin_unlock(&po->bind_lock); + } + break; + case NETDEV_UP: + if (dev->ifindex == po->ifindex) { + spin_lock(&po->bind_lock); + if (po->num && !po->running) { + dev_add_pack(&po->prot_hook); + sock_hold(sk); + po->running = 1; + } + spin_unlock(&po->bind_lock); + } + break; + } + } + rcu_read_unlock(); + return NOTIFY_DONE; +} + + +static int packet_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + struct sock *sk = sock->sk; + + switch (cmd) { + case SIOCOUTQ: + { + int amount = sk_wmem_alloc_get(sk); + + return put_user(amount, (int __user *)arg); + } + case SIOCINQ: + { + struct sk_buff *skb; + int amount = 0; + + spin_lock_bh(&sk->sk_receive_queue.lock); + skb = skb_peek(&sk->sk_receive_queue); + if (skb) + amount = skb->len; + spin_unlock_bh(&sk->sk_receive_queue.lock); + return put_user(amount, (int __user *)arg); + } + case SIOCGSTAMP: + return sock_get_timestamp(sk, (struct timeval __user *)arg); + case SIOCGSTAMPNS: + return sock_get_timestampns(sk, (struct timespec __user *)arg); + +#ifdef CONFIG_INET + case SIOCADDRT: + case SIOCDELRT: + case SIOCDARP: + case SIOCGARP: + case SIOCSARP: + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCSIFFLAGS: + return inet_dgram_ops.ioctl(sock, cmd, arg); +#endif + + default: + return -ENOIOCTLCMD; + } + return 0; +} + +static unsigned int packet_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + struct packet_sock *po = pkt_sk(sk); + unsigned int mask = datagram_poll(file, sock, wait); + + spin_lock_bh(&sk->sk_receive_queue.lock); + if (po->rx_ring.pg_vec) { + if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL)) + mask |= POLLIN | POLLRDNORM; + } + spin_unlock_bh(&sk->sk_receive_queue.lock); + spin_lock_bh(&sk->sk_write_queue.lock); + if (po->tx_ring.pg_vec) { + if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE)) + mask |= POLLOUT | POLLWRNORM; + } + spin_unlock_bh(&sk->sk_write_queue.lock); + return mask; +} + + +/* Dirty? Well, I still did not learn better way to account + * for user mmaps. 
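/*
 * Hedged aside (not part of the patch): reading the receive counters that
 * packet_getsockopt(PACKET_STATISTICS) above copies out. Note the kernel
 * zeroes its own copy on each call and folds drops into tp_packets.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void dump_stats(int fd)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
		printf("seen %u, dropped %u\n", st.tp_packets, st.tp_drops);
}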
+ */ + +static void packet_mm_open(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct socket *sock = file->private_data; + struct sock *sk = sock->sk; + + if (sk) + atomic_inc(&pkt_sk(sk)->mapped); +} + +static void packet_mm_close(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct socket *sock = file->private_data; + struct sock *sk = sock->sk; + + if (sk) + atomic_dec(&pkt_sk(sk)->mapped); +} + +static const struct vm_operations_struct packet_mmap_ops = { + .open = packet_mm_open, + .close = packet_mm_close, +}; + +static void free_pg_vec(struct pgv *pg_vec, unsigned int order, + unsigned int len) +{ + int i; + + for (i = 0; i < len; i++) { + if (likely(pg_vec[i].buffer)) { + if (is_vmalloc_addr(pg_vec[i].buffer)) + vfree(pg_vec[i].buffer); + else + free_pages((unsigned long)pg_vec[i].buffer, + order); + pg_vec[i].buffer = NULL; + } + } + kfree(pg_vec); +} + +static inline char *alloc_one_pg_vec_page(unsigned long order) +{ + char *buffer = NULL; + gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | + __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; + + buffer = (char *) __get_free_pages(gfp_flags, order); + + if (buffer) + return buffer; + + /* + * __get_free_pages failed, fall back to vmalloc + */ + buffer = vzalloc((1 << order) * PAGE_SIZE); + + if (buffer) + return buffer; + + /* + * vmalloc failed, lets dig into swap here + */ + gfp_flags &= ~__GFP_NORETRY; + buffer = (char *)__get_free_pages(gfp_flags, order); + if (buffer) + return buffer; + + /* + * complete and utter failure + */ + return NULL; +} + +static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order) +{ + unsigned int block_nr = req->tp_block_nr; + struct pgv *pg_vec; + int i; + + pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL); + if (unlikely(!pg_vec)) + goto out; + + for (i = 0; i < block_nr; i++) { + pg_vec[i].buffer = alloc_one_pg_vec_page(order); + if (unlikely(!pg_vec[i].buffer)) + goto out_free_pgvec; + } + +out: + return pg_vec; + +out_free_pgvec: + free_pg_vec(pg_vec, order, block_nr); + pg_vec = NULL; + goto out; +} + +static int packet_set_ring(struct sock *sk, struct tpacket_req *req, + int closing, int tx_ring) +{ + struct pgv *pg_vec = NULL; + struct packet_sock *po = pkt_sk(sk); + int was_running, order = 0; + struct packet_ring_buffer *rb; + struct sk_buff_head *rb_queue; + __be16 num; + int err; + + rb = tx_ring ? &po->tx_ring : &po->rx_ring; + rb_queue = tx_ring ? 
&sk->sk_write_queue : &sk->sk_receive_queue; + + err = -EBUSY; + if (!closing) { + if (atomic_read(&po->mapped)) + goto out; + if (atomic_read(&rb->pending)) + goto out; + } + + if (req->tp_block_nr) { + /* Sanity tests and some calculations */ + err = -EBUSY; + if (unlikely(rb->pg_vec)) + goto out; + + switch (po->tp_version) { + case TPACKET_V1: + po->tp_hdrlen = TPACKET_HDRLEN; + break; + case TPACKET_V2: + po->tp_hdrlen = TPACKET2_HDRLEN; + break; + } + + err = -EINVAL; + if (unlikely((int)req->tp_block_size <= 0)) + goto out; + if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) + goto out; + if (unlikely(req->tp_frame_size < po->tp_hdrlen + + po->tp_reserve)) + goto out; + if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) + goto out; + + rb->frames_per_block = req->tp_block_size/req->tp_frame_size; + if (unlikely(rb->frames_per_block <= 0)) + goto out; + if (unlikely((rb->frames_per_block * req->tp_block_nr) != + req->tp_frame_nr)) + goto out; + + err = -ENOMEM; + order = get_order(req->tp_block_size); + pg_vec = alloc_pg_vec(req, order); + if (unlikely(!pg_vec)) + goto out; + } + /* Done */ + else { + err = -EINVAL; + if (unlikely(req->tp_frame_nr)) + goto out; + } + + lock_sock(sk); + + /* Detach socket from network */ + spin_lock(&po->bind_lock); + was_running = po->running; + num = po->num; + if (was_running) { + __dev_remove_pack(&po->prot_hook); + po->num = 0; + po->running = 0; + __sock_put(sk); + } + spin_unlock(&po->bind_lock); + + synchronize_net(); + + err = -EBUSY; + mutex_lock(&po->pg_vec_lock); + if (closing || atomic_read(&po->mapped) == 0) { + err = 0; + spin_lock_bh(&rb_queue->lock); + swap(rb->pg_vec, pg_vec); + rb->frame_max = (req->tp_frame_nr - 1); + rb->head = 0; + rb->frame_size = req->tp_frame_size; + spin_unlock_bh(&rb_queue->lock); + + swap(rb->pg_vec_order, order); + swap(rb->pg_vec_len, req->tp_block_nr); + + rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE; + po->prot_hook.func = (po->rx_ring.pg_vec) ? 
+ tpacket_rcv : packet_rcv; + skb_queue_purge(rb_queue); + if (atomic_read(&po->mapped)) + pr_err("packet_mmap: vma is busy: %d\n", + atomic_read(&po->mapped)); + } + mutex_unlock(&po->pg_vec_lock); + + spin_lock(&po->bind_lock); + if (was_running && !po->running) { + sock_hold(sk); + po->running = 1; + po->num = num; + dev_add_pack(&po->prot_hook); + } + spin_unlock(&po->bind_lock); + + release_sock(sk); + + if (pg_vec) + free_pg_vec(pg_vec, order, req->tp_block_nr); +out: + return err; +} + +static int packet_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma) +{ + struct sock *sk = sock->sk; + struct packet_sock *po = pkt_sk(sk); + unsigned long size, expected_size; + struct packet_ring_buffer *rb; + unsigned long start; + int err = -EINVAL; + int i; + + if (vma->vm_pgoff) + return -EINVAL; + + mutex_lock(&po->pg_vec_lock); + + expected_size = 0; + for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) { + if (rb->pg_vec) { + expected_size += rb->pg_vec_len + * rb->pg_vec_pages + * PAGE_SIZE; + } + } + + if (expected_size == 0) + goto out; + + size = vma->vm_end - vma->vm_start; + if (size != expected_size) + goto out; + + start = vma->vm_start; + for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) { + if (rb->pg_vec == NULL) + continue; + + for (i = 0; i < rb->pg_vec_len; i++) { + struct page *page; + void *kaddr = rb->pg_vec[i].buffer; + int pg_num; + + for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) { + page = pgv_to_page(kaddr); + err = vm_insert_page(vma, start, page); + if (unlikely(err)) + goto out; + start += PAGE_SIZE; + kaddr += PAGE_SIZE; + } + } + } + + atomic_inc(&po->mapped); + vma->vm_ops = &packet_mmap_ops; + err = 0; + +out: + mutex_unlock(&po->pg_vec_lock); + return err; +} + +static const struct proto_ops packet_ops_spkt = { + .family = PF_PACKET, + .owner = THIS_MODULE, + .release = packet_release, + .bind = packet_bind_spkt, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = packet_getname_spkt, + .poll = datagram_poll, + .ioctl = packet_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = packet_sendmsg_spkt, + .recvmsg = packet_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static const struct proto_ops packet_ops = { + .family = PF_PACKET, + .owner = THIS_MODULE, + .release = packet_release, + .bind = packet_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = packet_getname, + .poll = packet_poll, + .ioctl = packet_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = packet_setsockopt, + .getsockopt = packet_getsockopt, + .sendmsg = packet_sendmsg, + .recvmsg = packet_recvmsg, + .mmap = packet_mmap, + .sendpage = sock_no_sendpage, +}; + +static const struct net_proto_family packet_family_ops = { + .family = PF_PACKET, + .create = packet_create, + .owner = THIS_MODULE, +}; + +static struct notifier_block packet_netdev_notifier = { + .notifier_call = packet_notifier, +}; + +#ifdef CONFIG_PROC_FS + +static void *packet_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + struct net *net = seq_file_net(seq); + + rcu_read_lock(); + return seq_hlist_start_head_rcu(&net->packet.sklist, *pos); +} + +static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct net *net = seq_file_net(seq); + return seq_hlist_next_rcu(v, &net->packet.sklist, pos); 
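/*
 * Hedged aside (not part of the patch): a ring layout that satisfies the
 * sanity checks in packet_set_ring() above (block size a multiple of the
 * page size, frame size TPACKET_ALIGNMENT-aligned, tp_frame_nr consistent
 * with the blocks), followed by the single mmap() that packet_mmap() expects
 * to cover every configured ring. The sizes are arbitrary illustration
 * values.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>

int main(void)
{
	struct tpacket_req req;
	int ver = TPACKET_V2;
	char *ring;
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0)
		return 1;
	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));

	req.tp_block_size = getpagesize();	/* must be a page multiple */
	req.tp_frame_size = 2048;		/* TPACKET_ALIGNMENT aligned */
	req.tp_block_nr = 64;
	req.tp_frame_nr = (req.tp_block_size / req.tp_frame_size)
			  * req.tp_block_nr;	/* checked by the kernel */

	if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)) < 0) {
		perror("PACKET_RX_RING");
		return 1;
	}
	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ring == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* frames can now be walked as in the earlier ring-consumer sketch */
	return 0;
}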
+} + +static void packet_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + +static int packet_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n"); + else { + struct sock *s = sk_entry(v); + const struct packet_sock *po = pkt_sk(s); + + seq_printf(seq, + "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n", + s, + atomic_read(&s->sk_refcnt), + s->sk_type, + ntohs(po->num), + po->ifindex, + po->running, + atomic_read(&s->sk_rmem_alloc), + sock_i_uid(s), + sock_i_ino(s)); + } + + return 0; +} + +static const struct seq_operations packet_seq_ops = { + .start = packet_seq_start, + .next = packet_seq_next, + .stop = packet_seq_stop, + .show = packet_seq_show, +}; + +static int packet_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &packet_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations packet_seq_fops = { + .owner = THIS_MODULE, + .open = packet_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +#endif + +static int __net_init packet_net_init(struct net *net) +{ + spin_lock_init(&net->packet.sklist_lock); + INIT_HLIST_HEAD(&net->packet.sklist); + + if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops)) + return -ENOMEM; + + return 0; +} + +static void __net_exit packet_net_exit(struct net *net) +{ + proc_net_remove(net, "packet"); +} + +static struct pernet_operations packet_net_ops = { + .init = packet_net_init, + .exit = packet_net_exit, +}; + + +static void __exit packet_exit(void) +{ + unregister_netdevice_notifier(&packet_netdev_notifier); + unregister_pernet_subsys(&packet_net_ops); + sock_unregister(PF_PACKET); + proto_unregister(&packet_proto); +} + +static int __init packet_init(void) +{ + int rc = proto_register(&packet_proto, 0); + + if (rc != 0) + goto out; + + sock_register(&packet_family_ops); + register_pernet_subsys(&packet_net_ops); + register_netdevice_notifier(&packet_netdev_notifier); +out: + return rc; +} + +module_init(packet_init); +module_exit(packet_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_PACKET); diff --git a/net/phonet/Kconfig b/net/phonet/Kconfig new file mode 100644 index 00000000..6ec7d55b --- /dev/null +++ b/net/phonet/Kconfig @@ -0,0 +1,16 @@ +# +# Phonet protocol +# + +config PHONET + tristate "Phonet protocols family" + help + The Phone Network protocol (PhoNet) is a packet-oriented + communication protocol developed by Nokia for use with its modems. + + This is required for Maemo to use cellular data connectivity (if + supported). It can also be used to control Nokia phones + from a Linux computer, although AT commands may be easier to use. + + To compile this driver as a module, choose M here: the module + will be called phonet. If unsure, say N. diff --git a/net/phonet/Makefile b/net/phonet/Makefile new file mode 100644 index 00000000..e10b1b18 --- /dev/null +++ b/net/phonet/Makefile @@ -0,0 +1,11 @@ +obj-$(CONFIG_PHONET) += phonet.o pn_pep.o + +phonet-y := \ + pn_dev.o \ + pn_netlink.o \ + socket.o \ + datagram.o \ + sysctl.o \ + af_phonet.o + +pn_pep-y := pep.o pep-gprs.o diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c new file mode 100644 index 00000000..c6fffd94 --- /dev/null +++ b/net/phonet/af_phonet.c @@ -0,0 +1,548 @@ +/* + * File: af_phonet.c + * + * Phonet protocols family + * + * Copyright (C) 2008 Nokia Corporation. 
+ * + * Contact: Remi Denis-Courmont + * Original author: Sakari Ailus + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* Transport protocol registration */ +static struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly; + +static struct phonet_protocol *phonet_proto_get(unsigned int protocol) +{ + struct phonet_protocol *pp; + + if (protocol >= PHONET_NPROTO) + return NULL; + + rcu_read_lock(); + pp = rcu_dereference(proto_tab[protocol]); + if (pp && !try_module_get(pp->prot->owner)) + pp = NULL; + rcu_read_unlock(); + + return pp; +} + +static inline void phonet_proto_put(struct phonet_protocol *pp) +{ + module_put(pp->prot->owner); +} + +/* protocol family functions */ + +static int pn_socket_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + struct pn_sock *pn; + struct phonet_protocol *pnp; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (protocol == 0) { + /* Default protocol selection */ + switch (sock->type) { + case SOCK_DGRAM: + protocol = PN_PROTO_PHONET; + break; + case SOCK_SEQPACKET: + protocol = PN_PROTO_PIPE; + break; + default: + return -EPROTONOSUPPORT; + } + } + + pnp = phonet_proto_get(protocol); + if (pnp == NULL && + request_module("net-pf-%d-proto-%d", PF_PHONET, protocol) == 0) + pnp = phonet_proto_get(protocol); + + if (pnp == NULL) + return -EPROTONOSUPPORT; + if (sock->type != pnp->sock_type) { + err = -EPROTONOSUPPORT; + goto out; + } + + sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot); + if (sk == NULL) { + err = -ENOMEM; + goto out; + } + + sock_init_data(sock, sk); + sock->state = SS_UNCONNECTED; + sock->ops = pnp->ops; + sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; + sk->sk_protocol = protocol; + pn = pn_sk(sk); + pn->sobject = 0; + pn->dobject = 0; + pn->resource = 0; + sk->sk_prot->init(sk); + err = 0; + +out: + phonet_proto_put(pnp); + return err; +} + +static const struct net_proto_family phonet_proto_family = { + .family = PF_PHONET, + .create = pn_socket_create, + .owner = THIS_MODULE, +}; + +/* Phonet device header operations */ +static int pn_header_create(struct sk_buff *skb, struct net_device *dev, + unsigned short type, const void *daddr, + const void *saddr, unsigned len) +{ + u8 *media = skb_push(skb, 1); + + if (type != ETH_P_PHONET) + return -1; + + if (!saddr) + saddr = dev->dev_addr; + *media = *(const u8 *)saddr; + return 1; +} + +static int pn_header_parse(const struct sk_buff *skb, unsigned char *haddr) +{ + const u8 *media = skb_mac_header(skb); + *haddr = *media; + return 1; +} + +struct header_ops phonet_header_ops = { + .create = pn_header_create, + .parse = pn_header_parse, +}; +EXPORT_SYMBOL(phonet_header_ops); + +/* + * Prepends an ISI header and sends a datagram. 
+ */ +static int pn_send(struct sk_buff *skb, struct net_device *dev, + u16 dst, u16 src, u8 res, u8 irq) +{ + struct phonethdr *ph; + int err; + + if (skb->len + 2 > 0xffff /* Phonet length field limit */ || + skb->len + sizeof(struct phonethdr) > dev->mtu) { + err = -EMSGSIZE; + goto drop; + } + + /* Broadcast sending is not implemented */ + if (pn_addr(dst) == PNADDR_BROADCAST) { + err = -EOPNOTSUPP; + goto drop; + } + + skb_reset_transport_header(skb); + WARN_ON(skb_headroom(skb) & 1); /* HW assumes word alignment */ + skb_push(skb, sizeof(struct phonethdr)); + skb_reset_network_header(skb); + ph = pn_hdr(skb); + ph->pn_rdev = pn_dev(dst); + ph->pn_sdev = pn_dev(src); + ph->pn_res = res; + ph->pn_length = __cpu_to_be16(skb->len + 2 - sizeof(*ph)); + ph->pn_robj = pn_obj(dst); + ph->pn_sobj = pn_obj(src); + + skb->protocol = htons(ETH_P_PHONET); + skb->priority = 0; + skb->dev = dev; + + if (skb->pkt_type == PACKET_LOOPBACK) { + skb_reset_mac_header(skb); + skb_orphan(skb); + err = (irq ? netif_rx(skb) : netif_rx_ni(skb)) ? -ENOBUFS : 0; + } else { + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + NULL, NULL, skb->len); + if (err < 0) { + err = -EHOSTUNREACH; + goto drop; + } + err = dev_queue_xmit(skb); + if (unlikely(err > 0)) + err = net_xmit_errno(err); + } + + return err; +drop: + kfree_skb(skb); + return err; +} + +static int pn_raw_send(const void *data, int len, struct net_device *dev, + u16 dst, u16 src, u8 res) +{ + struct sk_buff *skb = alloc_skb(MAX_PHONET_HEADER + len, GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + if (phonet_address_lookup(dev_net(dev), pn_addr(dst)) == 0) + skb->pkt_type = PACKET_LOOPBACK; + + skb_reserve(skb, MAX_PHONET_HEADER); + __skb_put(skb, len); + skb_copy_to_linear_data(skb, data, len); + return pn_send(skb, dev, dst, src, res, 1); +} + +/* + * Create a Phonet header for the skb and send it out. Returns + * non-zero error code if failed. The skb is freed then. 
+ */ +int pn_skb_send(struct sock *sk, struct sk_buff *skb, + const struct sockaddr_pn *target) +{ + struct net *net = sock_net(sk); + struct net_device *dev; + struct pn_sock *pn = pn_sk(sk); + int err; + u16 src, dst; + u8 daddr, saddr, res; + + src = pn->sobject; + if (target != NULL) { + dst = pn_sockaddr_get_object(target); + res = pn_sockaddr_get_resource(target); + } else { + dst = pn->dobject; + res = pn->resource; + } + daddr = pn_addr(dst); + + err = -EHOSTUNREACH; + if (sk->sk_bound_dev_if) + dev = dev_get_by_index(net, sk->sk_bound_dev_if); + else if (phonet_address_lookup(net, daddr) == 0) { + dev = phonet_device_get(net); + skb->pkt_type = PACKET_LOOPBACK; + } else if (dst == 0) { + /* Resource routing (small race until phonet_rcv()) */ + struct sock *sk = pn_find_sock_by_res(net, res); + if (sk) { + sock_put(sk); + dev = phonet_device_get(net); + skb->pkt_type = PACKET_LOOPBACK; + } else + dev = phonet_route_output(net, daddr); + } else + dev = phonet_route_output(net, daddr); + + if (!dev || !(dev->flags & IFF_UP)) + goto drop; + + saddr = phonet_address_get(dev, daddr); + if (saddr == PN_NO_ADDR) + goto drop; + + if (!pn_addr(src)) + src = pn_object(saddr, pn_obj(src)); + + err = pn_send(skb, dev, dst, src, res, 0); + dev_put(dev); + return err; + +drop: + kfree_skb(skb); + if (dev) + dev_put(dev); + return err; +} +EXPORT_SYMBOL(pn_skb_send); + +/* Do not send an error message in response to an error message */ +static inline int can_respond(struct sk_buff *skb) +{ + const struct phonethdr *ph; + const struct phonetmsg *pm; + u8 submsg_id; + + if (!pskb_may_pull(skb, 3)) + return 0; + + ph = pn_hdr(skb); + if (ph->pn_res == PN_PREFIX && !pskb_may_pull(skb, 5)) + return 0; + if (ph->pn_res == PN_COMMGR) /* indications */ + return 0; + + ph = pn_hdr(skb); /* re-acquires the pointer */ + pm = pn_msg(skb); + if (pm->pn_msg_id != PN_COMMON_MESSAGE) + return 1; + submsg_id = (ph->pn_res == PN_PREFIX) + ? pm->pn_e_submsg_id : pm->pn_submsg_id; + if (submsg_id != PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP && + pm->pn_e_submsg_id != PN_COMM_SERVICE_NOT_IDENTIFIED_RESP) + return 1; + return 0; +} + +static int send_obj_unreachable(struct sk_buff *rskb) +{ + const struct phonethdr *oph = pn_hdr(rskb); + const struct phonetmsg *opm = pn_msg(rskb); + struct phonetmsg resp; + + memset(&resp, 0, sizeof(resp)); + resp.pn_trans_id = opm->pn_trans_id; + resp.pn_msg_id = PN_COMMON_MESSAGE; + if (oph->pn_res == PN_PREFIX) { + resp.pn_e_res_id = opm->pn_e_res_id; + resp.pn_e_submsg_id = PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP; + resp.pn_e_orig_msg_id = opm->pn_msg_id; + resp.pn_e_status = 0; + } else { + resp.pn_submsg_id = PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP; + resp.pn_orig_msg_id = opm->pn_msg_id; + resp.pn_status = 0; + } + return pn_raw_send(&resp, sizeof(resp), rskb->dev, + pn_object(oph->pn_sdev, oph->pn_sobj), + pn_object(oph->pn_rdev, oph->pn_robj), + oph->pn_res); +} + +static int send_reset_indications(struct sk_buff *rskb) +{ + struct phonethdr *oph = pn_hdr(rskb); + static const u8 data[4] = { + 0x00 /* trans ID */, 0x10 /* subscribe msg */, + 0x00 /* subscription count */, 0x00 /* dummy */ + }; + + return pn_raw_send(data, sizeof(data), rskb->dev, + pn_object(oph->pn_sdev, 0x00), + pn_object(oph->pn_rdev, oph->pn_robj), + PN_COMMGR); +} + + +/* packet type functions */ + +/* + * Stuff received packets to associated sockets. + * On error, returns non-zero and releases the skb. 
+ */ +static int phonet_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pkttype, + struct net_device *orig_dev) +{ + struct net *net = dev_net(dev); + struct phonethdr *ph; + struct sockaddr_pn sa; + u16 len; + + /* check we have at least a full Phonet header */ + if (!pskb_pull(skb, sizeof(struct phonethdr))) + goto out; + + /* check that the advertised length is correct */ + ph = pn_hdr(skb); + len = get_unaligned_be16(&ph->pn_length); + if (len < 2) + goto out; + len -= 2; + if ((len > skb->len) || pskb_trim(skb, len)) + goto out; + skb_reset_transport_header(skb); + + pn_skb_get_dst_sockaddr(skb, &sa); + + /* check if this is broadcasted */ + if (pn_sockaddr_get_addr(&sa) == PNADDR_BROADCAST) { + pn_deliver_sock_broadcast(net, skb); + goto out; + } + + /* resource routing */ + if (pn_sockaddr_get_object(&sa) == 0) { + struct sock *sk = pn_find_sock_by_res(net, sa.spn_resource); + if (sk) + return sk_receive_skb(sk, skb, 0); + } + + /* check if we are the destination */ + if (phonet_address_lookup(net, pn_sockaddr_get_addr(&sa)) == 0) { + /* Phonet packet input */ + struct sock *sk = pn_find_sock_by_sa(net, &sa); + + if (sk) + return sk_receive_skb(sk, skb, 0); + + if (can_respond(skb)) { + send_obj_unreachable(skb); + send_reset_indications(skb); + } + } else if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) + goto out; /* Race between address deletion and loopback */ + else { + /* Phonet packet routing */ + struct net_device *out_dev; + + out_dev = phonet_route_output(net, pn_sockaddr_get_addr(&sa)); + if (!out_dev) { + LIMIT_NETDEBUG(KERN_WARNING"No Phonet route to %02X\n", + pn_sockaddr_get_addr(&sa)); + goto out; + } + + __skb_push(skb, sizeof(struct phonethdr)); + skb->dev = out_dev; + if (out_dev == dev) { + LIMIT_NETDEBUG(KERN_ERR"Phonet loop to %02X on %s\n", + pn_sockaddr_get_addr(&sa), dev->name); + goto out_dev; + } + /* Some drivers (e.g. 
TUN) do not allocate HW header space */ + if (skb_cow_head(skb, out_dev->hard_header_len)) + goto out_dev; + + if (dev_hard_header(skb, out_dev, ETH_P_PHONET, NULL, NULL, + skb->len) < 0) + goto out_dev; + dev_queue_xmit(skb); + dev_put(out_dev); + return NET_RX_SUCCESS; +out_dev: + dev_put(out_dev); + } + +out: + kfree_skb(skb); + return NET_RX_DROP; +} + +static struct packet_type phonet_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_PHONET), + .func = phonet_rcv, +}; + +static DEFINE_MUTEX(proto_tab_lock); + +int __init_or_module phonet_proto_register(unsigned int protocol, + struct phonet_protocol *pp) +{ + int err = 0; + + if (protocol >= PHONET_NPROTO) + return -EINVAL; + + err = proto_register(pp->prot, 1); + if (err) + return err; + + mutex_lock(&proto_tab_lock); + if (proto_tab[protocol]) + err = -EBUSY; + else + rcu_assign_pointer(proto_tab[protocol], pp); + mutex_unlock(&proto_tab_lock); + + return err; +} +EXPORT_SYMBOL(phonet_proto_register); + +void phonet_proto_unregister(unsigned int protocol, struct phonet_protocol *pp) +{ + mutex_lock(&proto_tab_lock); + BUG_ON(proto_tab[protocol] != pp); + rcu_assign_pointer(proto_tab[protocol], NULL); + mutex_unlock(&proto_tab_lock); + synchronize_rcu(); + proto_unregister(pp->prot); +} +EXPORT_SYMBOL(phonet_proto_unregister); + +/* Module registration */ +static int __init phonet_init(void) +{ + int err; + + err = phonet_device_init(); + if (err) + return err; + + pn_sock_init(); + err = sock_register(&phonet_proto_family); + if (err) { + printk(KERN_ALERT + "phonet protocol family initialization failed\n"); + goto err_sock; + } + + dev_add_pack(&phonet_packet_type); + phonet_sysctl_init(); + + err = isi_register(); + if (err) + goto err; + return 0; + +err: + phonet_sysctl_exit(); + sock_unregister(PF_PHONET); + dev_remove_pack(&phonet_packet_type); +err_sock: + phonet_device_exit(); + return err; +} + +static void __exit phonet_exit(void) +{ + isi_unregister(); + phonet_sysctl_exit(); + sock_unregister(PF_PHONET); + dev_remove_pack(&phonet_packet_type); + phonet_device_exit(); +} + +module_init(phonet_init); +module_exit(phonet_exit); +MODULE_DESCRIPTION("Phonet protocol stack for Linux"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_PHONET); diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c new file mode 100644 index 00000000..2f032381 --- /dev/null +++ b/net/phonet/datagram.c @@ -0,0 +1,214 @@ +/* + * File: datagram.c + * + * Datagram (ISI) Phonet sockets + * + * Copyright (C) 2008 Nokia Corporation. + * + * Contact: Remi Denis-Courmont + * Original author: Sakari Ailus + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + */ + +#include +#include +#include +#include +#include + +#include +#include + +static int pn_backlog_rcv(struct sock *sk, struct sk_buff *skb); + +/* associated socket ceases to exist */ +static void pn_sock_close(struct sock *sk, long timeout) +{ + sk_common_release(sk); +} + +static int pn_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + struct sk_buff *skb; + int answ; + + switch (cmd) { + case SIOCINQ: + lock_sock(sk); + skb = skb_peek(&sk->sk_receive_queue); + answ = skb ? skb->len : 0; + release_sock(sk); + return put_user(answ, (int __user *)arg); + + case SIOCPNADDRESOURCE: + case SIOCPNDELRESOURCE: { + u32 res; + if (get_user(res, (u32 __user *)arg)) + return -EFAULT; + if (res >= 256) + return -EINVAL; + if (cmd == SIOCPNADDRESOURCE) + return pn_sock_bind_res(sk, res); + else + return pn_sock_unbind_res(sk, res); + } + } + + return -ENOIOCTLCMD; +} + +/* Destroy socket. All references are gone. */ +static void pn_destruct(struct sock *sk) +{ + skb_queue_purge(&sk->sk_receive_queue); +} + +static int pn_init(struct sock *sk) +{ + sk->sk_destruct = pn_destruct; + return 0; +} + +static int pn_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len) +{ + struct sockaddr_pn *target; + struct sk_buff *skb; + int err; + + if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL| + MSG_CMSG_COMPAT)) + return -EOPNOTSUPP; + + if (msg->msg_name == NULL) + return -EDESTADDRREQ; + + if (msg->msg_namelen < sizeof(struct sockaddr_pn)) + return -EINVAL; + + target = (struct sockaddr_pn *)msg->msg_name; + if (target->spn_family != AF_PHONET) + return -EAFNOSUPPORT; + + skb = sock_alloc_send_skb(sk, MAX_PHONET_HEADER + len, + msg->msg_flags & MSG_DONTWAIT, &err); + if (skb == NULL) + return err; + skb_reserve(skb, MAX_PHONET_HEADER); + + err = memcpy_fromiovec((void *)skb_put(skb, len), msg->msg_iov, len); + if (err < 0) { + kfree_skb(skb); + return err; + } + + /* + * Fill in the Phonet header and + * finally pass the packet forwards. + */ + err = pn_skb_send(sk, skb, target); + + /* If ok, return len. */ + return (err >= 0) ? len : err; +} + +static int pn_recvmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len, int noblock, + int flags, int *addr_len) +{ + struct sk_buff *skb = NULL; + struct sockaddr_pn sa; + int rval = -EOPNOTSUPP; + int copylen; + + if (flags & ~(MSG_PEEK|MSG_TRUNC|MSG_DONTWAIT|MSG_NOSIGNAL| + MSG_CMSG_COMPAT)) + goto out_nofree; + + if (addr_len) + *addr_len = sizeof(sa); + + skb = skb_recv_datagram(sk, flags, noblock, &rval); + if (skb == NULL) + goto out_nofree; + + pn_skb_get_src_sockaddr(skb, &sa); + + copylen = skb->len; + if (len < copylen) { + msg->msg_flags |= MSG_TRUNC; + copylen = len; + } + + rval = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copylen); + if (rval) { + rval = -EFAULT; + goto out; + } + + rval = (flags & MSG_TRUNC) ? skb->len : copylen; + + if (msg->msg_name != NULL) + memcpy(msg->msg_name, &sa, sizeof(struct sockaddr_pn)); + +out: + skb_free_datagram(sk, skb); + +out_nofree: + return rval; +} + +/* Queue an skb for a sock. */ +static int pn_backlog_rcv(struct sock *sk, struct sk_buff *skb) +{ + int err = sock_queue_rcv_skb(sk, skb); + + if (err < 0) + kfree_skb(skb); + return err ? 
NET_RX_DROP : NET_RX_SUCCESS; +} + +/* Module registration */ +static struct proto pn_proto = { + .close = pn_sock_close, + .ioctl = pn_ioctl, + .init = pn_init, + .sendmsg = pn_sendmsg, + .recvmsg = pn_recvmsg, + .backlog_rcv = pn_backlog_rcv, + .hash = pn_sock_hash, + .unhash = pn_sock_unhash, + .get_port = pn_sock_get_port, + .obj_size = sizeof(struct pn_sock), + .owner = THIS_MODULE, + .name = "PHONET", +}; + +static struct phonet_protocol pn_dgram_proto = { + .ops = &phonet_dgram_ops, + .prot = &pn_proto, + .sock_type = SOCK_DGRAM, +}; + +int __init isi_register(void) +{ + return phonet_proto_register(PN_PROTO_PHONET, &pn_dgram_proto); +} + +void __exit isi_unregister(void) +{ + phonet_proto_unregister(PN_PROTO_PHONET, &pn_dgram_proto); +} diff --git a/net/phonet/pep-gprs.c b/net/phonet/pep-gprs.c new file mode 100644 index 00000000..d0120896 --- /dev/null +++ b/net/phonet/pep-gprs.c @@ -0,0 +1,328 @@ +/* + * File: pep-gprs.c + * + * GPRS over Phonet pipe end point socket + * + * Copyright (C) 2008 Nokia Corporation. + * + * Author: Rémi Denis-Courmont + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#define GPRS_DEFAULT_MTU 1400 + +struct gprs_dev { + struct sock *sk; + void (*old_state_change)(struct sock *); + void (*old_data_ready)(struct sock *, int); + void (*old_write_space)(struct sock *); + + struct net_device *dev; +}; + +static __be16 gprs_type_trans(struct sk_buff *skb) +{ + const u8 *pvfc; + u8 buf; + + pvfc = skb_header_pointer(skb, 0, 1, &buf); + if (!pvfc) + return htons(0); + /* Look at IP version field */ + switch (*pvfc >> 4) { + case 4: + return htons(ETH_P_IP); + case 6: + return htons(ETH_P_IPV6); + } + return htons(0); +} + +static void gprs_writeable(struct gprs_dev *gp) +{ + struct net_device *dev = gp->dev; + + if (pep_writeable(gp->sk)) + netif_wake_queue(dev); +} + +/* + * Socket callbacks + */ + +static void gprs_state_change(struct sock *sk) +{ + struct gprs_dev *gp = sk->sk_user_data; + + if (sk->sk_state == TCP_CLOSE_WAIT) { + struct net_device *dev = gp->dev; + + netif_stop_queue(dev); + netif_carrier_off(dev); + } +} + +static int gprs_recv(struct gprs_dev *gp, struct sk_buff *skb) +{ + struct net_device *dev = gp->dev; + int err = 0; + __be16 protocol = gprs_type_trans(skb); + + if (!protocol) { + err = -EINVAL; + goto drop; + } + + if (skb_headroom(skb) & 3) { + struct sk_buff *rskb, *fs; + int flen = 0; + + /* Phonet Pipe data header may be misaligned (3 bytes), + * so wrap the IP packet as a single fragment of an head-less + * socket buffer. The network stack will pull what it needs, + * but at least, the whole IP payload is not memcpy'd. 
*/ + rskb = netdev_alloc_skb(dev, 0); + if (!rskb) { + err = -ENOBUFS; + goto drop; + } + skb_shinfo(rskb)->frag_list = skb; + rskb->len += skb->len; + rskb->data_len += rskb->len; + rskb->truesize += rskb->len; + + /* Avoid nested fragments */ + skb_walk_frags(skb, fs) + flen += fs->len; + skb->next = skb_shinfo(skb)->frag_list; + skb_frag_list_init(skb); + skb->len -= flen; + skb->data_len -= flen; + skb->truesize -= flen; + + skb = rskb; + } + + skb->protocol = protocol; + skb_reset_mac_header(skb); + skb->dev = dev; + + if (likely(dev->flags & IFF_UP)) { + dev->stats.rx_packets++; + dev->stats.rx_bytes += skb->len; + netif_rx(skb); + skb = NULL; + } else + err = -ENODEV; + +drop: + if (skb) { + dev_kfree_skb(skb); + dev->stats.rx_dropped++; + } + return err; +} + +static void gprs_data_ready(struct sock *sk, int len) +{ + struct gprs_dev *gp = sk->sk_user_data; + struct sk_buff *skb; + + while ((skb = pep_read(sk)) != NULL) { + skb_orphan(skb); + gprs_recv(gp, skb); + } +} + +static void gprs_write_space(struct sock *sk) +{ + struct gprs_dev *gp = sk->sk_user_data; + + if (netif_running(gp->dev)) + gprs_writeable(gp); +} + +/* + * Network device callbacks + */ + +static int gprs_open(struct net_device *dev) +{ + struct gprs_dev *gp = netdev_priv(dev); + + gprs_writeable(gp); + return 0; +} + +static int gprs_close(struct net_device *dev) +{ + netif_stop_queue(dev); + return 0; +} + +static netdev_tx_t gprs_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct gprs_dev *gp = netdev_priv(dev); + struct sock *sk = gp->sk; + int len, err; + + switch (skb->protocol) { + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + break; + default: + dev_kfree_skb(skb); + return NETDEV_TX_OK; + } + + skb_orphan(skb); + skb_set_owner_w(skb, sk); + len = skb->len; + err = pep_write(sk, skb); + if (err) { + LIMIT_NETDEBUG(KERN_WARNING"%s: TX error (%d)\n", + dev->name, err); + dev->stats.tx_aborted_errors++; + dev->stats.tx_errors++; + } else { + dev->stats.tx_packets++; + dev->stats.tx_bytes += len; + } + + netif_stop_queue(dev); + if (pep_writeable(sk)) + netif_wake_queue(dev); + return NETDEV_TX_OK; +} + +static int gprs_set_mtu(struct net_device *dev, int new_mtu) +{ + if ((new_mtu < 576) || (new_mtu > (PHONET_MAX_MTU - 11))) + return -EINVAL; + + dev->mtu = new_mtu; + return 0; +} + +static const struct net_device_ops gprs_netdev_ops = { + .ndo_open = gprs_open, + .ndo_stop = gprs_close, + .ndo_start_xmit = gprs_xmit, + .ndo_change_mtu = gprs_set_mtu, +}; + +static void gprs_setup(struct net_device *dev) +{ + dev->features = NETIF_F_FRAGLIST; + dev->type = ARPHRD_PHONET_PIPE; + dev->flags = IFF_POINTOPOINT | IFF_NOARP; + dev->mtu = GPRS_DEFAULT_MTU; + dev->hard_header_len = 0; + dev->addr_len = 0; + dev->tx_queue_len = 10; + + dev->netdev_ops = &gprs_netdev_ops; + dev->destructor = free_netdev; +} + +/* + * External interface + */ + +/* + * Attach a GPRS interface to a datagram socket. + * Returns the interface index on success, negative error code on error. 
+ */ +int gprs_attach(struct sock *sk) +{ + static const char ifname[] = "gprs%d"; + struct gprs_dev *gp; + struct net_device *dev; + int err; + + if (unlikely(sk->sk_type == SOCK_STREAM)) + return -EINVAL; /* need packet boundaries */ + + /* Create net device */ + dev = alloc_netdev(sizeof(*gp), ifname, gprs_setup); + if (!dev) + return -ENOMEM; + gp = netdev_priv(dev); + gp->sk = sk; + gp->dev = dev; + + netif_stop_queue(dev); + err = register_netdev(dev); + if (err) { + free_netdev(dev); + return err; + } + + lock_sock(sk); + if (unlikely(sk->sk_user_data)) { + err = -EBUSY; + goto out_rel; + } + if (unlikely((1 << sk->sk_state & (TCPF_CLOSE|TCPF_LISTEN)) || + sock_flag(sk, SOCK_DEAD))) { + err = -EINVAL; + goto out_rel; + } + sk->sk_user_data = gp; + gp->old_state_change = sk->sk_state_change; + gp->old_data_ready = sk->sk_data_ready; + gp->old_write_space = sk->sk_write_space; + sk->sk_state_change = gprs_state_change; + sk->sk_data_ready = gprs_data_ready; + sk->sk_write_space = gprs_write_space; + release_sock(sk); + sock_hold(sk); + + printk(KERN_DEBUG"%s: attached\n", dev->name); + return dev->ifindex; + +out_rel: + release_sock(sk); + unregister_netdev(dev); + return err; +} + +void gprs_detach(struct sock *sk) +{ + struct gprs_dev *gp = sk->sk_user_data; + struct net_device *dev = gp->dev; + + lock_sock(sk); + sk->sk_user_data = NULL; + sk->sk_state_change = gp->old_state_change; + sk->sk_data_ready = gp->old_data_ready; + sk->sk_write_space = gp->old_write_space; + release_sock(sk); + + printk(KERN_DEBUG"%s: detached\n", dev->name); + unregister_netdev(dev); + sock_put(sk); +} diff --git a/net/phonet/pep.c b/net/phonet/pep.c new file mode 100644 index 00000000..d29a7fb3 --- /dev/null +++ b/net/phonet/pep.c @@ -0,0 +1,1293 @@ +/* + * File: pep.c + * + * Phonet pipe protocol end point socket + * + * Copyright (C) 2008 Nokia Corporation. + * + * Author: Rémi Denis-Courmont + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* sk_state values: + * TCP_CLOSE sock not in use yet + * TCP_CLOSE_WAIT disconnected pipe + * TCP_LISTEN listening pipe endpoint + * TCP_SYN_RECV connected pipe in disabled state + * TCP_ESTABLISHED connected pipe in enabled state + * + * pep_sock locking: + * - sk_state, hlist: sock lock needed + * - listener: read only + * - pipe_handle: read only + */ + +#define CREDITS_MAX 10 +#define CREDITS_THR 7 + +#define pep_sb_size(s) (((s) + 5) & ~3) /* 2-bytes head, 32-bits aligned */ + +/* Get the next TLV sub-block. 
*/ +static unsigned char *pep_get_sb(struct sk_buff *skb, u8 *ptype, u8 *plen, + void *buf) +{ + void *data = NULL; + struct { + u8 sb_type; + u8 sb_len; + } *ph, h; + int buflen = *plen; + + ph = skb_header_pointer(skb, 0, 2, &h); + if (ph == NULL || ph->sb_len < 2 || !pskb_may_pull(skb, ph->sb_len)) + return NULL; + ph->sb_len -= 2; + *ptype = ph->sb_type; + *plen = ph->sb_len; + + if (buflen > ph->sb_len) + buflen = ph->sb_len; + data = skb_header_pointer(skb, 2, buflen, buf); + __skb_pull(skb, 2 + ph->sb_len); + return data; +} + +static struct sk_buff *pep_alloc_skb(struct sock *sk, const void *payload, + int len, gfp_t priority) +{ + struct sk_buff *skb = alloc_skb(MAX_PNPIPE_HEADER + len, priority); + if (!skb) + return NULL; + skb_set_owner_w(skb, sk); + + skb_reserve(skb, MAX_PNPIPE_HEADER); + __skb_put(skb, len); + skb_copy_to_linear_data(skb, payload, len); + __skb_push(skb, sizeof(struct pnpipehdr)); + skb_reset_transport_header(skb); + return skb; +} + +static int pep_reply(struct sock *sk, struct sk_buff *oskb, u8 code, + const void *data, int len, gfp_t priority) +{ + const struct pnpipehdr *oph = pnp_hdr(oskb); + struct pnpipehdr *ph; + struct sk_buff *skb; + struct sockaddr_pn peer; + + skb = pep_alloc_skb(sk, data, len, priority); + if (!skb) + return -ENOMEM; + + ph = pnp_hdr(skb); + ph->utid = oph->utid; + ph->message_id = oph->message_id + 1; /* REQ -> RESP */ + ph->pipe_handle = oph->pipe_handle; + ph->error_code = code; + + pn_skb_get_src_sockaddr(oskb, &peer); + return pn_skb_send(sk, skb, &peer); +} + +static int pep_indicate(struct sock *sk, u8 id, u8 code, + const void *data, int len, gfp_t priority) +{ + struct pep_sock *pn = pep_sk(sk); + struct pnpipehdr *ph; + struct sk_buff *skb; + + skb = pep_alloc_skb(sk, data, len, priority); + if (!skb) + return -ENOMEM; + + ph = pnp_hdr(skb); + ph->utid = 0; + ph->message_id = id; + ph->pipe_handle = pn->pipe_handle; + ph->data[0] = code; + return pn_skb_send(sk, skb, NULL); +} + +#define PAD 0x00 + +static int pipe_handler_request(struct sock *sk, u8 id, u8 code, + const void *data, int len) +{ + struct pep_sock *pn = pep_sk(sk); + struct pnpipehdr *ph; + struct sk_buff *skb; + + skb = pep_alloc_skb(sk, data, len, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + ph = pnp_hdr(skb); + ph->utid = id; /* whatever */ + ph->message_id = id; + ph->pipe_handle = pn->pipe_handle; + ph->data[0] = code; + return pn_skb_send(sk, skb, NULL); +} + +static int pipe_handler_send_created_ind(struct sock *sk) +{ + struct pep_sock *pn = pep_sk(sk); + u8 data[4] = { + PN_PIPE_SB_NEGOTIATED_FC, pep_sb_size(2), + pn->tx_fc, pn->rx_fc, + }; + + return pep_indicate(sk, PNS_PIPE_CREATED_IND, 1 /* sub-blocks */, + data, 4, GFP_ATOMIC); +} + +static int pep_accept_conn(struct sock *sk, struct sk_buff *skb) +{ + static const u8 data[20] = { + PAD, PAD, PAD, 2 /* sub-blocks */, + PN_PIPE_SB_REQUIRED_FC_TX, pep_sb_size(5), 3, PAD, + PN_MULTI_CREDIT_FLOW_CONTROL, + PN_ONE_CREDIT_FLOW_CONTROL, + PN_LEGACY_FLOW_CONTROL, + PAD, + PN_PIPE_SB_PREFERRED_FC_RX, pep_sb_size(5), 3, PAD, + PN_MULTI_CREDIT_FLOW_CONTROL, + PN_ONE_CREDIT_FLOW_CONTROL, + PN_LEGACY_FLOW_CONTROL, + PAD, + }; + + might_sleep(); + return pep_reply(sk, skb, PN_PIPE_NO_ERROR, data, sizeof(data), + GFP_KERNEL); +} + +static int pep_reject_conn(struct sock *sk, struct sk_buff *skb, u8 code, + gfp_t priority) +{ + static const u8 data[4] = { PAD, PAD, PAD, 0 /* sub-blocks */ }; + WARN_ON(code == PN_PIPE_NO_ERROR); + return pep_reply(sk, skb, code, data, sizeof(data), priority); +} + +/* 
Control requests are not sent by the pipe service and have a specific + * message format. */ +static int pep_ctrlreq_error(struct sock *sk, struct sk_buff *oskb, u8 code, + gfp_t priority) +{ + const struct pnpipehdr *oph = pnp_hdr(oskb); + struct sk_buff *skb; + struct pnpipehdr *ph; + struct sockaddr_pn dst; + u8 data[4] = { + oph->data[0], /* PEP type */ + code, /* error code, at an unusual offset */ + PAD, PAD, + }; + + skb = pep_alloc_skb(sk, data, 4, priority); + if (!skb) + return -ENOMEM; + + ph = pnp_hdr(skb); + ph->utid = oph->utid; + ph->message_id = PNS_PEP_CTRL_RESP; + ph->pipe_handle = oph->pipe_handle; + ph->data[0] = oph->data[1]; /* CTRL id */ + + pn_skb_get_src_sockaddr(oskb, &dst); + return pn_skb_send(sk, skb, &dst); +} + +static int pipe_snd_status(struct sock *sk, u8 type, u8 status, gfp_t priority) +{ + u8 data[4] = { type, PAD, PAD, status }; + + return pep_indicate(sk, PNS_PEP_STATUS_IND, PN_PEP_TYPE_COMMON, + data, 4, priority); +} + +/* Send our RX flow control information to the sender. + * Socket must be locked. */ +static void pipe_grant_credits(struct sock *sk, gfp_t priority) +{ + struct pep_sock *pn = pep_sk(sk); + + BUG_ON(sk->sk_state != TCP_ESTABLISHED); + + switch (pn->rx_fc) { + case PN_LEGACY_FLOW_CONTROL: /* TODO */ + break; + case PN_ONE_CREDIT_FLOW_CONTROL: + if (pipe_snd_status(sk, PN_PEP_IND_FLOW_CONTROL, + PEP_IND_READY, priority) == 0) + pn->rx_credits = 1; + break; + case PN_MULTI_CREDIT_FLOW_CONTROL: + if ((pn->rx_credits + CREDITS_THR) > CREDITS_MAX) + break; + if (pipe_snd_status(sk, PN_PEP_IND_ID_MCFC_GRANT_CREDITS, + CREDITS_MAX - pn->rx_credits, + priority) == 0) + pn->rx_credits = CREDITS_MAX; + break; + } +} + +static int pipe_rcv_status(struct sock *sk, struct sk_buff *skb) +{ + struct pep_sock *pn = pep_sk(sk); + struct pnpipehdr *hdr; + int wake = 0; + + if (!pskb_may_pull(skb, sizeof(*hdr) + 4)) + return -EINVAL; + + hdr = pnp_hdr(skb); + if (hdr->data[0] != PN_PEP_TYPE_COMMON) { + LIMIT_NETDEBUG(KERN_DEBUG"Phonet unknown PEP type: %u\n", + (unsigned)hdr->data[0]); + return -EOPNOTSUPP; + } + + switch (hdr->data[1]) { + case PN_PEP_IND_FLOW_CONTROL: + switch (pn->tx_fc) { + case PN_LEGACY_FLOW_CONTROL: + switch (hdr->data[4]) { + case PEP_IND_BUSY: + atomic_set(&pn->tx_credits, 0); + break; + case PEP_IND_READY: + atomic_set(&pn->tx_credits, wake = 1); + break; + } + break; + case PN_ONE_CREDIT_FLOW_CONTROL: + if (hdr->data[4] == PEP_IND_READY) + atomic_set(&pn->tx_credits, wake = 1); + break; + } + break; + + case PN_PEP_IND_ID_MCFC_GRANT_CREDITS: + if (pn->tx_fc != PN_MULTI_CREDIT_FLOW_CONTROL) + break; + atomic_add(wake = hdr->data[4], &pn->tx_credits); + break; + + default: + LIMIT_NETDEBUG(KERN_DEBUG"Phonet unknown PEP indication: %u\n", + (unsigned)hdr->data[1]); + return -EOPNOTSUPP; + } + if (wake) + sk->sk_write_space(sk); + return 0; +} + +static int pipe_rcv_created(struct sock *sk, struct sk_buff *skb) +{ + struct pep_sock *pn = pep_sk(sk); + struct pnpipehdr *hdr = pnp_hdr(skb); + u8 n_sb = hdr->data[0]; + + pn->rx_fc = pn->tx_fc = PN_LEGACY_FLOW_CONTROL; + __skb_pull(skb, sizeof(*hdr)); + while (n_sb > 0) { + u8 type, buf[2], len = sizeof(buf); + u8 *data = pep_get_sb(skb, &type, &len, buf); + + if (data == NULL) + return -EINVAL; + switch (type) { + case PN_PIPE_SB_NEGOTIATED_FC: + if (len < 2 || (data[0] | data[1]) > 3) + break; + pn->tx_fc = data[0] & 3; + pn->rx_fc = data[1] & 3; + break; + } + n_sb--; + } + return 0; +} + +/* Queue an skb to a connected sock. + * Socket lock must be held. 
*/ +static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct pep_sock *pn = pep_sk(sk); + struct pnpipehdr *hdr = pnp_hdr(skb); + struct sk_buff_head *queue; + int err = 0; + + BUG_ON(sk->sk_state == TCP_CLOSE_WAIT); + + switch (hdr->message_id) { + case PNS_PEP_CONNECT_REQ: + pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE, GFP_ATOMIC); + break; + + case PNS_PEP_DISCONNECT_REQ: + pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC); + sk->sk_state = TCP_CLOSE_WAIT; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + break; + + case PNS_PEP_ENABLE_REQ: + /* Wait for PNS_PIPE_(ENABLED|REDIRECTED)_IND */ + pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC); + break; + + case PNS_PEP_RESET_REQ: + switch (hdr->state_after_reset) { + case PN_PIPE_DISABLE: + pn->init_enable = 0; + break; + case PN_PIPE_ENABLE: + pn->init_enable = 1; + break; + default: /* not allowed to send an error here!? */ + err = -EINVAL; + goto out; + } + /* fall through */ + case PNS_PEP_DISABLE_REQ: + atomic_set(&pn->tx_credits, 0); + pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC); + break; + + case PNS_PEP_CTRL_REQ: + if (skb_queue_len(&pn->ctrlreq_queue) >= PNPIPE_CTRLREQ_MAX) { + atomic_inc(&sk->sk_drops); + break; + } + __skb_pull(skb, 4); + queue = &pn->ctrlreq_queue; + goto queue; + + case PNS_PIPE_ALIGNED_DATA: + __skb_pull(skb, 1); + /* fall through */ + case PNS_PIPE_DATA: + __skb_pull(skb, 3); /* Pipe data header */ + if (!pn_flow_safe(pn->rx_fc)) { + err = sock_queue_rcv_skb(sk, skb); + if (!err) + return NET_RX_SUCCESS; + err = -ENOBUFS; + break; + } + + if (pn->rx_credits == 0) { + atomic_inc(&sk->sk_drops); + err = -ENOBUFS; + break; + } + pn->rx_credits--; + queue = &sk->sk_receive_queue; + goto queue; + + case PNS_PEP_STATUS_IND: + pipe_rcv_status(sk, skb); + break; + + case PNS_PIPE_REDIRECTED_IND: + err = pipe_rcv_created(sk, skb); + break; + + case PNS_PIPE_CREATED_IND: + err = pipe_rcv_created(sk, skb); + if (err) + break; + /* fall through */ + case PNS_PIPE_RESET_IND: + if (!pn->init_enable) + break; + /* fall through */ + case PNS_PIPE_ENABLED_IND: + if (!pn_flow_safe(pn->tx_fc)) { + atomic_set(&pn->tx_credits, 1); + sk->sk_write_space(sk); + } + if (sk->sk_state == TCP_ESTABLISHED) + break; /* Nothing to do */ + sk->sk_state = TCP_ESTABLISHED; + pipe_grant_credits(sk, GFP_ATOMIC); + break; + + case PNS_PIPE_DISABLED_IND: + sk->sk_state = TCP_SYN_RECV; + pn->rx_credits = 0; + break; + + default: + LIMIT_NETDEBUG(KERN_DEBUG"Phonet unknown PEP message: %u\n", + hdr->message_id); + err = -EINVAL; + } +out: + kfree_skb(skb); + return (err == -ENOBUFS) ? NET_RX_DROP : NET_RX_SUCCESS; + +queue: + skb->dev = NULL; + skb_set_owner_r(skb, sk); + err = skb->len; + skb_queue_tail(queue, skb); + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, err); + return NET_RX_SUCCESS; +} + +/* Destroy connected sock. 
*/ +static void pipe_destruct(struct sock *sk) +{ + struct pep_sock *pn = pep_sk(sk); + + skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge(&pn->ctrlreq_queue); +} + +static u8 pipe_negotiate_fc(const u8 *fcs, unsigned n) +{ + unsigned i; + u8 final_fc = PN_NO_FLOW_CONTROL; + + for (i = 0; i < n; i++) { + u8 fc = fcs[i]; + + if (fc > final_fc && fc < PN_MAX_FLOW_CONTROL) + final_fc = fc; + } + return final_fc; +} + +static int pep_connresp_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct pep_sock *pn = pep_sk(sk); + struct pnpipehdr *hdr; + u8 n_sb; + + if (!pskb_pull(skb, sizeof(*hdr) + 4)) + return -EINVAL; + + hdr = pnp_hdr(skb); + if (hdr->error_code != PN_PIPE_NO_ERROR) + return -ECONNREFUSED; + + /* Parse sub-blocks */ + n_sb = hdr->data[4]; + while (n_sb > 0) { + u8 type, buf[6], len = sizeof(buf); + const u8 *data = pep_get_sb(skb, &type, &len, buf); + + if (data == NULL) + return -EINVAL; + + switch (type) { + case PN_PIPE_SB_REQUIRED_FC_TX: + if (len < 2 || len < data[0]) + break; + pn->tx_fc = pipe_negotiate_fc(data + 2, len - 2); + break; + + case PN_PIPE_SB_PREFERRED_FC_RX: + if (len < 2 || len < data[0]) + break; + pn->rx_fc = pipe_negotiate_fc(data + 2, len - 2); + break; + + } + n_sb--; + } + + return pipe_handler_send_created_ind(sk); +} + +/* Queue an skb to an actively connected sock. + * Socket lock must be held. */ +static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct pep_sock *pn = pep_sk(sk); + struct pnpipehdr *hdr = pnp_hdr(skb); + int err = NET_RX_SUCCESS; + + switch (hdr->message_id) { + case PNS_PIPE_ALIGNED_DATA: + __skb_pull(skb, 1); + /* fall through */ + case PNS_PIPE_DATA: + __skb_pull(skb, 3); /* Pipe data header */ + if (!pn_flow_safe(pn->rx_fc)) { + err = sock_queue_rcv_skb(sk, skb); + if (!err) + return NET_RX_SUCCESS; + err = NET_RX_DROP; + break; + } + + if (pn->rx_credits == 0) { + atomic_inc(&sk->sk_drops); + err = NET_RX_DROP; + break; + } + pn->rx_credits--; + skb->dev = NULL; + skb_set_owner_r(skb, sk); + err = skb->len; + skb_queue_tail(&sk->sk_receive_queue, skb); + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, err); + return NET_RX_SUCCESS; + + case PNS_PEP_CONNECT_RESP: + if (sk->sk_state != TCP_SYN_SENT) + break; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + if (pep_connresp_rcv(sk, skb)) { + sk->sk_state = TCP_CLOSE_WAIT; + break; + } + + sk->sk_state = TCP_ESTABLISHED; + if (!pn_flow_safe(pn->tx_fc)) { + atomic_set(&pn->tx_credits, 1); + sk->sk_write_space(sk); + } + pipe_grant_credits(sk, GFP_ATOMIC); + break; + + case PNS_PEP_DISCONNECT_RESP: + /* sock should already be dead, nothing to do */ + break; + + case PNS_PEP_STATUS_IND: + pipe_rcv_status(sk, skb); + break; + } + kfree_skb(skb); + return err; +} + +/* Listening sock must be locked */ +static struct sock *pep_find_pipe(const struct hlist_head *hlist, + const struct sockaddr_pn *dst, + u8 pipe_handle) +{ + struct hlist_node *node; + struct sock *sknode; + u16 dobj = pn_sockaddr_get_object(dst); + + sk_for_each(sknode, node, hlist) { + struct pep_sock *pnnode = pep_sk(sknode); + + /* Ports match, but addresses might not: */ + if (pnnode->pn_sk.sobject != dobj) + continue; + if (pnnode->pipe_handle != pipe_handle) + continue; + if (sknode->sk_state == TCP_CLOSE_WAIT) + continue; + + sock_hold(sknode); + return sknode; + } + return NULL; +} + +/* + * Deliver an skb to a listening sock. + * Socket lock must be held. + * We then queue the skb to the right connected sock (if any). 
+ */ +static int pep_do_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct pep_sock *pn = pep_sk(sk); + struct sock *sknode; + struct pnpipehdr *hdr; + struct sockaddr_pn dst; + u8 pipe_handle; + + if (!pskb_may_pull(skb, sizeof(*hdr))) + goto drop; + + hdr = pnp_hdr(skb); + pipe_handle = hdr->pipe_handle; + if (pipe_handle == PN_PIPE_INVALID_HANDLE) + goto drop; + + pn_skb_get_dst_sockaddr(skb, &dst); + + /* Look for an existing pipe handle */ + sknode = pep_find_pipe(&pn->hlist, &dst, pipe_handle); + if (sknode) + return sk_receive_skb(sknode, skb, 1); + + switch (hdr->message_id) { + case PNS_PEP_CONNECT_REQ: + if (sk->sk_state != TCP_LISTEN || sk_acceptq_is_full(sk)) { + pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE, + GFP_ATOMIC); + break; + } + skb_queue_head(&sk->sk_receive_queue, skb); + sk_acceptq_added(sk); + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, 0); + return NET_RX_SUCCESS; + + case PNS_PEP_DISCONNECT_REQ: + pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC); + break; + + case PNS_PEP_CTRL_REQ: + pep_ctrlreq_error(sk, skb, PN_PIPE_INVALID_HANDLE, GFP_ATOMIC); + break; + + case PNS_PEP_RESET_REQ: + case PNS_PEP_ENABLE_REQ: + case PNS_PEP_DISABLE_REQ: + /* invalid handle is not even allowed here! */ + break; + + default: + if ((1 << sk->sk_state) + & ~(TCPF_CLOSE|TCPF_LISTEN|TCPF_CLOSE_WAIT)) + /* actively connected socket */ + return pipe_handler_do_rcv(sk, skb); + } +drop: + kfree_skb(skb); + return NET_RX_SUCCESS; +} + +static int pipe_do_remove(struct sock *sk) +{ + struct pep_sock *pn = pep_sk(sk); + struct pnpipehdr *ph; + struct sk_buff *skb; + + skb = pep_alloc_skb(sk, NULL, 0, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + ph = pnp_hdr(skb); + ph->utid = 0; + ph->message_id = PNS_PIPE_REMOVE_REQ; + ph->pipe_handle = pn->pipe_handle; + ph->data[0] = PAD; + return pn_skb_send(sk, skb, NULL); +} + +/* associated socket ceases to exist */ +static void pep_sock_close(struct sock *sk, long timeout) +{ + struct pep_sock *pn = pep_sk(sk); + int ifindex = 0; + + sock_hold(sk); /* keep a reference after sk_common_release() */ + sk_common_release(sk); + + lock_sock(sk); + if ((1 << sk->sk_state) & (TCPF_SYN_RECV|TCPF_ESTABLISHED)) { + if (sk->sk_backlog_rcv == pipe_do_rcv) + /* Forcefully remove dangling Phonet pipe */ + pipe_do_remove(sk); + else + pipe_handler_request(sk, PNS_PEP_DISCONNECT_REQ, PAD, + NULL, 0); + } + sk->sk_state = TCP_CLOSE; + + ifindex = pn->ifindex; + pn->ifindex = 0; + release_sock(sk); + + if (ifindex) + gprs_detach(sk); + sock_put(sk); +} + +static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp) +{ + struct pep_sock *pn = pep_sk(sk), *newpn; + struct sock *newsk = NULL; + struct sk_buff *skb; + struct pnpipehdr *hdr; + struct sockaddr_pn dst, src; + int err; + u16 peer_type; + u8 pipe_handle, enabled, n_sb; + u8 aligned = 0; + + skb = skb_recv_datagram(sk, 0, flags & O_NONBLOCK, errp); + if (!skb) + return NULL; + + lock_sock(sk); + if (sk->sk_state != TCP_LISTEN) { + err = -EINVAL; + goto drop; + } + sk_acceptq_removed(sk); + + err = -EPROTO; + if (!pskb_may_pull(skb, sizeof(*hdr) + 4)) + goto drop; + + hdr = pnp_hdr(skb); + pipe_handle = hdr->pipe_handle; + switch (hdr->state_after_connect) { + case PN_PIPE_DISABLE: + enabled = 0; + break; + case PN_PIPE_ENABLE: + enabled = 1; + break; + default: + pep_reject_conn(sk, skb, PN_PIPE_ERR_INVALID_PARAM, + GFP_KERNEL); + goto drop; + } + peer_type = hdr->other_pep_type << 8; + + /* Parse sub-blocks (options) */ + n_sb = hdr->data[4]; + while (n_sb > 0) { + u8 
type, buf[1], len = sizeof(buf); + const u8 *data = pep_get_sb(skb, &type, &len, buf); + + if (data == NULL) + goto drop; + switch (type) { + case PN_PIPE_SB_CONNECT_REQ_PEP_SUB_TYPE: + if (len < 1) + goto drop; + peer_type = (peer_type & 0xff00) | data[0]; + break; + case PN_PIPE_SB_ALIGNED_DATA: + aligned = data[0] != 0; + break; + } + n_sb--; + } + + /* Check for duplicate pipe handle */ + newsk = pep_find_pipe(&pn->hlist, &dst, pipe_handle); + if (unlikely(newsk)) { + __sock_put(newsk); + newsk = NULL; + pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE, GFP_KERNEL); + goto drop; + } + + /* Create a new to-be-accepted sock */ + newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot); + if (!newsk) { + pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL); + err = -ENOBUFS; + goto drop; + } + + sock_init_data(NULL, newsk); + newsk->sk_state = TCP_SYN_RECV; + newsk->sk_backlog_rcv = pipe_do_rcv; + newsk->sk_protocol = sk->sk_protocol; + newsk->sk_destruct = pipe_destruct; + + newpn = pep_sk(newsk); + pn_skb_get_dst_sockaddr(skb, &dst); + pn_skb_get_src_sockaddr(skb, &src); + newpn->pn_sk.sobject = pn_sockaddr_get_object(&dst); + newpn->pn_sk.dobject = pn_sockaddr_get_object(&src); + newpn->pn_sk.resource = pn_sockaddr_get_resource(&dst); + sock_hold(sk); + newpn->listener = sk; + skb_queue_head_init(&newpn->ctrlreq_queue); + newpn->pipe_handle = pipe_handle; + atomic_set(&newpn->tx_credits, 0); + newpn->ifindex = 0; + newpn->peer_type = peer_type; + newpn->rx_credits = 0; + newpn->rx_fc = newpn->tx_fc = PN_LEGACY_FLOW_CONTROL; + newpn->init_enable = enabled; + newpn->aligned = aligned; + + err = pep_accept_conn(newsk, skb); + if (err) { + sock_put(newsk); + newsk = NULL; + goto drop; + } + sk_add_node(newsk, &pn->hlist); +drop: + release_sock(sk); + kfree_skb(skb); + *errp = err; + return newsk; +} + +static int pep_sock_connect(struct sock *sk, struct sockaddr *addr, int len) +{ + struct pep_sock *pn = pep_sk(sk); + int err; + u8 data[4] = { 0 /* sub-blocks */, PAD, PAD, PAD }; + + pn->pipe_handle = 1; /* anything but INVALID_HANDLE */ + err = pipe_handler_request(sk, PNS_PEP_CONNECT_REQ, + PN_PIPE_ENABLE, data, 4); + if (err) { + pn->pipe_handle = PN_PIPE_INVALID_HANDLE; + return err; + } + sk->sk_state = TCP_SYN_SENT; + return 0; +} + +static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + struct pep_sock *pn = pep_sk(sk); + int answ; + + switch (cmd) { + case SIOCINQ: + if (sk->sk_state == TCP_LISTEN) + return -EINVAL; + + lock_sock(sk); + if (sock_flag(sk, SOCK_URGINLINE) && + !skb_queue_empty(&pn->ctrlreq_queue)) + answ = skb_peek(&pn->ctrlreq_queue)->len; + else if (!skb_queue_empty(&sk->sk_receive_queue)) + answ = skb_peek(&sk->sk_receive_queue)->len; + else + answ = 0; + release_sock(sk); + return put_user(answ, (int __user *)arg); + } + + return -ENOIOCTLCMD; +} + +static int pep_init(struct sock *sk) +{ + struct pep_sock *pn = pep_sk(sk); + + sk->sk_destruct = pipe_destruct; + INIT_HLIST_HEAD(&pn->hlist); + pn->listener = NULL; + skb_queue_head_init(&pn->ctrlreq_queue); + atomic_set(&pn->tx_credits, 0); + pn->ifindex = 0; + pn->peer_type = 0; + pn->pipe_handle = PN_PIPE_INVALID_HANDLE; + pn->rx_credits = 0; + pn->rx_fc = pn->tx_fc = PN_LEGACY_FLOW_CONTROL; + pn->init_enable = 1; + pn->aligned = 0; + return 0; +} + +static int pep_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct pep_sock *pn = pep_sk(sk); + int val = 0, err = 0; + + if (level != SOL_PNPIPE) + return -ENOPROTOOPT; + if 
(optlen >= sizeof(int)) { + if (get_user(val, (int __user *) optval)) + return -EFAULT; + } + + lock_sock(sk); + switch (optname) { + case PNPIPE_ENCAP: + if (val && val != PNPIPE_ENCAP_IP) { + err = -EINVAL; + break; + } + if (!pn->ifindex == !val) + break; /* Nothing to do! */ + if (!capable(CAP_NET_ADMIN)) { + err = -EPERM; + break; + } + if (val) { + release_sock(sk); + err = gprs_attach(sk); + if (err > 0) { + pn->ifindex = err; + err = 0; + } + } else { + pn->ifindex = 0; + release_sock(sk); + gprs_detach(sk); + err = 0; + } + goto out_norel; + + default: + err = -ENOPROTOOPT; + } + release_sock(sk); + +out_norel: + return err; +} + +static int pep_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct pep_sock *pn = pep_sk(sk); + int len, val; + + if (level != SOL_PNPIPE) + return -ENOPROTOOPT; + if (get_user(len, optlen)) + return -EFAULT; + + switch (optname) { + case PNPIPE_ENCAP: + val = pn->ifindex ? PNPIPE_ENCAP_IP : PNPIPE_ENCAP_NONE; + break; + + case PNPIPE_IFINDEX: + val = pn->ifindex; + break; + + case PNPIPE_HANDLE: + val = pn->pipe_handle; + if (val == PN_PIPE_INVALID_HANDLE) + return -EINVAL; + break; + + default: + return -ENOPROTOOPT; + } + + len = min_t(unsigned int, sizeof(int), len); + if (put_user(len, optlen)) + return -EFAULT; + if (put_user(val, (int __user *) optval)) + return -EFAULT; + return 0; +} + +static int pipe_skb_send(struct sock *sk, struct sk_buff *skb) +{ + struct pep_sock *pn = pep_sk(sk); + struct pnpipehdr *ph; + int err; + + if (pn_flow_safe(pn->tx_fc) && + !atomic_add_unless(&pn->tx_credits, -1, 0)) { + kfree_skb(skb); + return -ENOBUFS; + } + + skb_push(skb, 3 + pn->aligned); + skb_reset_transport_header(skb); + ph = pnp_hdr(skb); + ph->utid = 0; + if (pn->aligned) { + ph->message_id = PNS_PIPE_ALIGNED_DATA; + ph->data[0] = 0; /* padding */ + } else + ph->message_id = PNS_PIPE_DATA; + ph->pipe_handle = pn->pipe_handle; + err = pn_skb_send(sk, skb, NULL); + + if (err && pn_flow_safe(pn->tx_fc)) + atomic_inc(&pn->tx_credits); + return err; + +} + +static int pep_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len) +{ + struct pep_sock *pn = pep_sk(sk); + struct sk_buff *skb; + long timeo; + int flags = msg->msg_flags; + int err, done; + + if (len > USHRT_MAX) + return -EMSGSIZE; + + if ((msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL| + MSG_CMSG_COMPAT)) || + !(msg->msg_flags & MSG_EOR)) + return -EOPNOTSUPP; + + skb = sock_alloc_send_skb(sk, MAX_PNPIPE_HEADER + len, + flags & MSG_DONTWAIT, &err); + if (!skb) + return err; + + skb_reserve(skb, MAX_PHONET_HEADER + 3 + pn->aligned); + err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); + if (err < 0) + goto outfree; + + lock_sock(sk); + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + if ((1 << sk->sk_state) & (TCPF_LISTEN|TCPF_CLOSE)) { + err = -ENOTCONN; + goto out; + } + if (sk->sk_state != TCP_ESTABLISHED) { + /* Wait until the pipe gets to enabled state */ +disabled: + err = sk_stream_wait_connect(sk, &timeo); + if (err) + goto out; + + if (sk->sk_state == TCP_CLOSE_WAIT) { + err = -ECONNRESET; + goto out; + } + } + BUG_ON(sk->sk_state != TCP_ESTABLISHED); + + /* Wait until flow control allows TX */ + done = atomic_read(&pn->tx_credits); + while (!done) { + DEFINE_WAIT(wait); + + if (!timeo) { + err = -EAGAIN; + goto out; + } + if (signal_pending(current)) { + err = sock_intr_errno(timeo); + goto out; + } + + prepare_to_wait(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + done = sk_wait_event(sk, 
&timeo, atomic_read(&pn->tx_credits)); + finish_wait(sk_sleep(sk), &wait); + + if (sk->sk_state != TCP_ESTABLISHED) + goto disabled; + } + + err = pipe_skb_send(sk, skb); + if (err >= 0) + err = len; /* success! */ + skb = NULL; +out: + release_sock(sk); +outfree: + kfree_skb(skb); + return err; +} + +int pep_writeable(struct sock *sk) +{ + struct pep_sock *pn = pep_sk(sk); + + return atomic_read(&pn->tx_credits); +} + +int pep_write(struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff *rskb, *fs; + int flen = 0; + + if (pep_sk(sk)->aligned) + return pipe_skb_send(sk, skb); + + rskb = alloc_skb(MAX_PNPIPE_HEADER, GFP_ATOMIC); + if (!rskb) { + kfree_skb(skb); + return -ENOMEM; + } + skb_shinfo(rskb)->frag_list = skb; + rskb->len += skb->len; + rskb->data_len += rskb->len; + rskb->truesize += rskb->len; + + /* Avoid nested fragments */ + skb_walk_frags(skb, fs) + flen += fs->len; + skb->next = skb_shinfo(skb)->frag_list; + skb_frag_list_init(skb); + skb->len -= flen; + skb->data_len -= flen; + skb->truesize -= flen; + + skb_reserve(rskb, MAX_PHONET_HEADER + 3); + return pipe_skb_send(sk, rskb); +} + +struct sk_buff *pep_read(struct sock *sk) +{ + struct sk_buff *skb = skb_dequeue(&sk->sk_receive_queue); + + if (sk->sk_state == TCP_ESTABLISHED) + pipe_grant_credits(sk, GFP_ATOMIC); + return skb; +} + +static int pep_recvmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len, int noblock, + int flags, int *addr_len) +{ + struct sk_buff *skb; + int err; + + if (flags & ~(MSG_OOB|MSG_PEEK|MSG_TRUNC|MSG_DONTWAIT|MSG_WAITALL| + MSG_NOSIGNAL|MSG_CMSG_COMPAT)) + return -EOPNOTSUPP; + + if (unlikely(1 << sk->sk_state & (TCPF_LISTEN | TCPF_CLOSE))) + return -ENOTCONN; + + if ((flags & MSG_OOB) || sock_flag(sk, SOCK_URGINLINE)) { + /* Dequeue and acknowledge control request */ + struct pep_sock *pn = pep_sk(sk); + + if (flags & MSG_PEEK) + return -EOPNOTSUPP; + skb = skb_dequeue(&pn->ctrlreq_queue); + if (skb) { + pep_ctrlreq_error(sk, skb, PN_PIPE_NO_ERROR, + GFP_KERNEL); + msg->msg_flags |= MSG_OOB; + goto copy; + } + if (flags & MSG_OOB) + return -EINVAL; + } + + skb = skb_recv_datagram(sk, flags, noblock, &err); + lock_sock(sk); + if (skb == NULL) { + if (err == -ENOTCONN && sk->sk_state == TCP_CLOSE_WAIT) + err = -ECONNRESET; + release_sock(sk); + return err; + } + + if (sk->sk_state == TCP_ESTABLISHED) + pipe_grant_credits(sk, GFP_KERNEL); + release_sock(sk); +copy: + msg->msg_flags |= MSG_EOR; + if (skb->len > len) + msg->msg_flags |= MSG_TRUNC; + else + len = skb->len; + + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len); + if (!err) + err = (flags & MSG_TRUNC) ? skb->len : len; + + skb_free_datagram(sk, skb); + return err; +} + +static void pep_sock_unhash(struct sock *sk) +{ + struct pep_sock *pn = pep_sk(sk); + struct sock *skparent = NULL; + + lock_sock(sk); + + if (pn->listener != NULL) { + skparent = pn->listener; + pn->listener = NULL; + release_sock(sk); + + pn = pep_sk(skparent); + lock_sock(skparent); + sk_del_node_init(sk); + sk = skparent; + } + + /* Unhash a listening sock only when it is closed + * and all of its active connected pipes are closed. 
*/ + if (hlist_empty(&pn->hlist)) + pn_sock_unhash(&pn->pn_sk.sk); + release_sock(sk); + + if (skparent) + sock_put(skparent); +} + +static struct proto pep_proto = { + .close = pep_sock_close, + .accept = pep_sock_accept, + .connect = pep_sock_connect, + .ioctl = pep_ioctl, + .init = pep_init, + .setsockopt = pep_setsockopt, + .getsockopt = pep_getsockopt, + .sendmsg = pep_sendmsg, + .recvmsg = pep_recvmsg, + .backlog_rcv = pep_do_rcv, + .hash = pn_sock_hash, + .unhash = pep_sock_unhash, + .get_port = pn_sock_get_port, + .obj_size = sizeof(struct pep_sock), + .owner = THIS_MODULE, + .name = "PNPIPE", +}; + +static struct phonet_protocol pep_pn_proto = { + .ops = &phonet_stream_ops, + .prot = &pep_proto, + .sock_type = SOCK_SEQPACKET, +}; + +static int __init pep_register(void) +{ + return phonet_proto_register(PN_PROTO_PIPE, &pep_pn_proto); +} + +static void __exit pep_unregister(void) +{ + phonet_proto_unregister(PN_PROTO_PIPE, &pep_pn_proto); +} + +module_init(pep_register); +module_exit(pep_unregister); +MODULE_AUTHOR("Remi Denis-Courmont, Nokia"); +MODULE_DESCRIPTION("Phonet pipe protocol"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO(PF_PHONET, PN_PROTO_PIPE); diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c new file mode 100644 index 00000000..d2df8f33 --- /dev/null +++ b/net/phonet/pn_dev.c @@ -0,0 +1,448 @@ +/* + * File: pn_dev.c + * + * Phonet network device + * + * Copyright (C) 2008 Nokia Corporation. + * + * Contact: Remi Denis-Courmont + * Original author: Sakari Ailus + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct phonet_routes { + struct mutex lock; + struct net_device *table[64]; +}; + +struct phonet_net { + struct phonet_device_list pndevs; + struct phonet_routes routes; +}; + +int phonet_net_id __read_mostly; + +static struct phonet_net *phonet_pernet(struct net *net) +{ + BUG_ON(!net); + + return net_generic(net, phonet_net_id); +} + +struct phonet_device_list *phonet_device_list(struct net *net) +{ + struct phonet_net *pnn = phonet_pernet(net); + return &pnn->pndevs; +} + +/* Allocate new Phonet device. 
*/ +static struct phonet_device *__phonet_device_alloc(struct net_device *dev) +{ + struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); + struct phonet_device *pnd = kmalloc(sizeof(*pnd), GFP_ATOMIC); + if (pnd == NULL) + return NULL; + pnd->netdev = dev; + bitmap_zero(pnd->addrs, 64); + + BUG_ON(!mutex_is_locked(&pndevs->lock)); + list_add_rcu(&pnd->list, &pndevs->list); + return pnd; +} + +static struct phonet_device *__phonet_get(struct net_device *dev) +{ + struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); + struct phonet_device *pnd; + + BUG_ON(!mutex_is_locked(&pndevs->lock)); + list_for_each_entry(pnd, &pndevs->list, list) { + if (pnd->netdev == dev) + return pnd; + } + return NULL; +} + +static struct phonet_device *__phonet_get_rcu(struct net_device *dev) +{ + struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); + struct phonet_device *pnd; + + list_for_each_entry_rcu(pnd, &pndevs->list, list) { + if (pnd->netdev == dev) + return pnd; + } + return NULL; +} + +static void phonet_device_destroy(struct net_device *dev) +{ + struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); + struct phonet_device *pnd; + + ASSERT_RTNL(); + + mutex_lock(&pndevs->lock); + pnd = __phonet_get(dev); + if (pnd) + list_del_rcu(&pnd->list); + mutex_unlock(&pndevs->lock); + + if (pnd) { + u8 addr; + + for_each_set_bit(addr, pnd->addrs, 64) + phonet_address_notify(RTM_DELADDR, dev, addr); + kfree(pnd); + } +} + +struct net_device *phonet_device_get(struct net *net) +{ + struct phonet_device_list *pndevs = phonet_device_list(net); + struct phonet_device *pnd; + struct net_device *dev = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(pnd, &pndevs->list, list) { + dev = pnd->netdev; + BUG_ON(!dev); + + if ((dev->reg_state == NETREG_REGISTERED) && + ((pnd->netdev->flags & IFF_UP)) == IFF_UP) + break; + dev = NULL; + } + if (dev) + dev_hold(dev); + rcu_read_unlock(); + return dev; +} + +int phonet_address_add(struct net_device *dev, u8 addr) +{ + struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); + struct phonet_device *pnd; + int err = 0; + + mutex_lock(&pndevs->lock); + /* Find or create Phonet-specific device data */ + pnd = __phonet_get(dev); + if (pnd == NULL) + pnd = __phonet_device_alloc(dev); + if (unlikely(pnd == NULL)) + err = -ENOMEM; + else if (test_and_set_bit(addr >> 2, pnd->addrs)) + err = -EEXIST; + mutex_unlock(&pndevs->lock); + return err; +} + +int phonet_address_del(struct net_device *dev, u8 addr) +{ + struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); + struct phonet_device *pnd; + int err = 0; + + mutex_lock(&pndevs->lock); + pnd = __phonet_get(dev); + if (!pnd || !test_and_clear_bit(addr >> 2, pnd->addrs)) { + err = -EADDRNOTAVAIL; + pnd = NULL; + } else if (bitmap_empty(pnd->addrs, 64)) + list_del_rcu(&pnd->list); + else + pnd = NULL; + mutex_unlock(&pndevs->lock); + + if (pnd) + kfree_rcu(pnd, rcu); + + return err; +} + +/* Gets a source address toward a destination, through a interface. 
*/ +u8 phonet_address_get(struct net_device *dev, u8 daddr) +{ + struct phonet_device *pnd; + u8 saddr; + + rcu_read_lock(); + pnd = __phonet_get_rcu(dev); + if (pnd) { + BUG_ON(bitmap_empty(pnd->addrs, 64)); + + /* Use same source address as destination, if possible */ + if (test_bit(daddr >> 2, pnd->addrs)) + saddr = daddr; + else + saddr = find_first_bit(pnd->addrs, 64) << 2; + } else + saddr = PN_NO_ADDR; + rcu_read_unlock(); + + if (saddr == PN_NO_ADDR) { + /* Fallback to another device */ + struct net_device *def_dev; + + def_dev = phonet_device_get(dev_net(dev)); + if (def_dev) { + if (def_dev != dev) + saddr = phonet_address_get(def_dev, daddr); + dev_put(def_dev); + } + } + return saddr; +} + +int phonet_address_lookup(struct net *net, u8 addr) +{ + struct phonet_device_list *pndevs = phonet_device_list(net); + struct phonet_device *pnd; + int err = -EADDRNOTAVAIL; + + rcu_read_lock(); + list_for_each_entry_rcu(pnd, &pndevs->list, list) { + /* Don't allow unregistering devices! */ + if ((pnd->netdev->reg_state != NETREG_REGISTERED) || + ((pnd->netdev->flags & IFF_UP)) != IFF_UP) + continue; + + if (test_bit(addr >> 2, pnd->addrs)) { + err = 0; + goto found; + } + } +found: + rcu_read_unlock(); + return err; +} + +/* automatically configure a Phonet device, if supported */ +static int phonet_device_autoconf(struct net_device *dev) +{ + struct if_phonet_req req; + int ret; + + if (!dev->netdev_ops->ndo_do_ioctl) + return -EOPNOTSUPP; + + ret = dev->netdev_ops->ndo_do_ioctl(dev, (struct ifreq *)&req, + SIOCPNGAUTOCONF); + if (ret < 0) + return ret; + + ASSERT_RTNL(); + ret = phonet_address_add(dev, req.ifr_phonet_autoconf.device); + if (ret) + return ret; + phonet_address_notify(RTM_NEWADDR, dev, + req.ifr_phonet_autoconf.device); + return 0; +} + +static void phonet_route_autodel(struct net_device *dev) +{ + struct phonet_net *pnn = phonet_pernet(dev_net(dev)); + unsigned i; + DECLARE_BITMAP(deleted, 64); + + /* Remove left-over Phonet routes */ + bitmap_zero(deleted, 64); + mutex_lock(&pnn->routes.lock); + for (i = 0; i < 64; i++) + if (dev == pnn->routes.table[i]) { + rcu_assign_pointer(pnn->routes.table[i], NULL); + set_bit(i, deleted); + } + mutex_unlock(&pnn->routes.lock); + + if (bitmap_empty(deleted, 64)) + return; /* short-circuit RCU */ + synchronize_rcu(); + for_each_set_bit(i, deleted, 64) { + rtm_phonet_notify(RTM_DELROUTE, dev, i); + dev_put(dev); + } +} + +/* notify Phonet of device events */ +static int phonet_device_notify(struct notifier_block *me, unsigned long what, + void *arg) +{ + struct net_device *dev = arg; + + switch (what) { + case NETDEV_REGISTER: + if (dev->type == ARPHRD_PHONET) + phonet_device_autoconf(dev); + break; + case NETDEV_UNREGISTER: + phonet_device_destroy(dev); + phonet_route_autodel(dev); + break; + } + return 0; + +} + +static struct notifier_block phonet_device_notifier = { + .notifier_call = phonet_device_notify, + .priority = 0, +}; + +/* Per-namespace Phonet devices handling */ +static int __net_init phonet_init_net(struct net *net) +{ + struct phonet_net *pnn = phonet_pernet(net); + + if (!proc_net_fops_create(net, "phonet", 0, &pn_sock_seq_fops)) + return -ENOMEM; + + INIT_LIST_HEAD(&pnn->pndevs.list); + mutex_init(&pnn->pndevs.lock); + mutex_init(&pnn->routes.lock); + return 0; +} + +static void __net_exit phonet_exit_net(struct net *net) +{ + struct phonet_net *pnn = phonet_pernet(net); + struct net_device *dev; + unsigned i; + + rtnl_lock(); + for_each_netdev(net, dev) + phonet_device_destroy(dev); + + for (i = 0; i < 64; i++) { + 
dev = pnn->routes.table[i]; + if (dev) { + rtm_phonet_notify(RTM_DELROUTE, dev, i); + dev_put(dev); + } + } + rtnl_unlock(); + + proc_net_remove(net, "phonet"); +} + +static struct pernet_operations phonet_net_ops = { + .init = phonet_init_net, + .exit = phonet_exit_net, + .id = &phonet_net_id, + .size = sizeof(struct phonet_net), +}; + +/* Initialize Phonet devices list */ +int __init phonet_device_init(void) +{ + int err = register_pernet_device(&phonet_net_ops); + if (err) + return err; + + proc_net_fops_create(&init_net, "pnresource", 0, &pn_res_seq_fops); + register_netdevice_notifier(&phonet_device_notifier); + err = phonet_netlink_register(); + if (err) + phonet_device_exit(); + return err; +} + +void phonet_device_exit(void) +{ + rtnl_unregister_all(PF_PHONET); + unregister_netdevice_notifier(&phonet_device_notifier); + unregister_pernet_device(&phonet_net_ops); + proc_net_remove(&init_net, "pnresource"); +} + +int phonet_route_add(struct net_device *dev, u8 daddr) +{ + struct phonet_net *pnn = phonet_pernet(dev_net(dev)); + struct phonet_routes *routes = &pnn->routes; + int err = -EEXIST; + + daddr = daddr >> 2; + mutex_lock(&routes->lock); + if (routes->table[daddr] == NULL) { + rcu_assign_pointer(routes->table[daddr], dev); + dev_hold(dev); + err = 0; + } + mutex_unlock(&routes->lock); + return err; +} + +int phonet_route_del(struct net_device *dev, u8 daddr) +{ + struct phonet_net *pnn = phonet_pernet(dev_net(dev)); + struct phonet_routes *routes = &pnn->routes; + + daddr = daddr >> 2; + mutex_lock(&routes->lock); + if (dev == routes->table[daddr]) + rcu_assign_pointer(routes->table[daddr], NULL); + else + dev = NULL; + mutex_unlock(&routes->lock); + + if (!dev) + return -ENOENT; + synchronize_rcu(); + dev_put(dev); + return 0; +} + +struct net_device *phonet_route_get_rcu(struct net *net, u8 daddr) +{ + struct phonet_net *pnn = phonet_pernet(net); + struct phonet_routes *routes = &pnn->routes; + struct net_device *dev; + + daddr >>= 2; + dev = rcu_dereference(routes->table[daddr]); + return dev; +} + +struct net_device *phonet_route_output(struct net *net, u8 daddr) +{ + struct phonet_net *pnn = phonet_pernet(net); + struct phonet_routes *routes = &pnn->routes; + struct net_device *dev; + + daddr >>= 2; + rcu_read_lock(); + dev = rcu_dereference(routes->table[daddr]); + if (dev) + dev_hold(dev); + rcu_read_unlock(); + + if (!dev) + dev = phonet_device_get(net); /* Default route */ + return dev; +} diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c new file mode 100644 index 00000000..438accb7 --- /dev/null +++ b/net/phonet/pn_netlink.c @@ -0,0 +1,303 @@ +/* + * File: pn_netlink.c + * + * Phonet netlink interface + * + * Copyright (C) 2008 Nokia Corporation. + * + * Contact: Remi Denis-Courmont + * Original author: Sakari Ailus + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + */ + +#include +#include +#include +#include +#include +#include + +/* Device address handling */ + +static int fill_addr(struct sk_buff *skb, struct net_device *dev, u8 addr, + u32 pid, u32 seq, int event); + +void phonet_address_notify(int event, struct net_device *dev, u8 addr) +{ + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + + nla_total_size(1), GFP_KERNEL); + if (skb == NULL) + goto errout; + err = fill_addr(skb, dev, addr, 0, 0, event); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, dev_net(dev), 0, + RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL); + return; +errout: + rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_IFADDR, err); +} + +static const struct nla_policy ifa_phonet_policy[IFA_MAX+1] = { + [IFA_LOCAL] = { .type = NLA_U8 }, +}; + +static int addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *attr) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[IFA_MAX+1]; + struct net_device *dev; + struct ifaddrmsg *ifm; + int err; + u8 pnaddr; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ASSERT_RTNL(); + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_phonet_policy); + if (err < 0) + return err; + + ifm = nlmsg_data(nlh); + if (tb[IFA_LOCAL] == NULL) + return -EINVAL; + pnaddr = nla_get_u8(tb[IFA_LOCAL]); + if (pnaddr & 3) + /* Phonet addresses only have 6 high-order bits */ + return -EINVAL; + + dev = __dev_get_by_index(net, ifm->ifa_index); + if (dev == NULL) + return -ENODEV; + + if (nlh->nlmsg_type == RTM_NEWADDR) + err = phonet_address_add(dev, pnaddr); + else + err = phonet_address_del(dev, pnaddr); + if (!err) + phonet_address_notify(nlh->nlmsg_type, dev, pnaddr); + return err; +} + +static int fill_addr(struct sk_buff *skb, struct net_device *dev, u8 addr, + u32 pid, u32 seq, int event) +{ + struct ifaddrmsg *ifm; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, pid, seq, event, sizeof(*ifm), 0); + if (nlh == NULL) + return -EMSGSIZE; + + ifm = nlmsg_data(nlh); + ifm->ifa_family = AF_PHONET; + ifm->ifa_prefixlen = 0; + ifm->ifa_flags = IFA_F_PERMANENT; + ifm->ifa_scope = RT_SCOPE_LINK; + ifm->ifa_index = dev->ifindex; + NLA_PUT_U8(skb, IFA_LOCAL, addr); + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct phonet_device_list *pndevs; + struct phonet_device *pnd; + int dev_idx = 0, dev_start_idx = cb->args[0]; + int addr_idx = 0, addr_start_idx = cb->args[1]; + + pndevs = phonet_device_list(sock_net(skb->sk)); + rcu_read_lock(); + list_for_each_entry_rcu(pnd, &pndevs->list, list) { + u8 addr; + + if (dev_idx > dev_start_idx) + addr_start_idx = 0; + if (dev_idx++ < dev_start_idx) + continue; + + addr_idx = 0; + for_each_set_bit(addr, pnd->addrs, 64) { + if (addr_idx++ < addr_start_idx) + continue; + + if (fill_addr(skb, pnd->netdev, addr << 2, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWADDR) < 0) + goto out; + } + } + +out: + rcu_read_unlock(); + cb->args[0] = dev_idx; + cb->args[1] = addr_idx; + + return skb->len; +} + +/* Routes handling */ + +static int fill_route(struct sk_buff *skb, struct net_device *dev, u8 dst, + u32 pid, u32 seq, int event) +{ + struct rtmsg *rtm; + 
struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), 0); + if (nlh == NULL) + return -EMSGSIZE; + + rtm = nlmsg_data(nlh); + rtm->rtm_family = AF_PHONET; + rtm->rtm_dst_len = 6; + rtm->rtm_src_len = 0; + rtm->rtm_tos = 0; + rtm->rtm_table = RT_TABLE_MAIN; + rtm->rtm_protocol = RTPROT_STATIC; + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + rtm->rtm_type = RTN_UNICAST; + rtm->rtm_flags = 0; + NLA_PUT_U8(skb, RTA_DST, dst); + NLA_PUT_U32(skb, RTA_OIF, dev->ifindex); + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +void rtm_phonet_notify(int event, struct net_device *dev, u8 dst) +{ + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + + nla_total_size(1) + nla_total_size(4), GFP_KERNEL); + if (skb == NULL) + goto errout; + err = fill_route(skb, dev, dst, 0, 0, event); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, dev_net(dev), 0, + RTNLGRP_PHONET_ROUTE, NULL, GFP_KERNEL); + return; +errout: + rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_ROUTE, err); +} + +static const struct nla_policy rtm_phonet_policy[RTA_MAX+1] = { + [RTA_DST] = { .type = NLA_U8 }, + [RTA_OIF] = { .type = NLA_U32 }, +}; + +static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *attr) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[RTA_MAX+1]; + struct net_device *dev; + struct rtmsg *rtm; + int err; + u8 dst; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ASSERT_RTNL(); + + err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_phonet_policy); + if (err < 0) + return err; + + rtm = nlmsg_data(nlh); + if (rtm->rtm_table != RT_TABLE_MAIN || rtm->rtm_type != RTN_UNICAST) + return -EINVAL; + if (tb[RTA_DST] == NULL || tb[RTA_OIF] == NULL) + return -EINVAL; + dst = nla_get_u8(tb[RTA_DST]); + if (dst & 3) /* Phonet addresses only have 6 high-order bits */ + return -EINVAL; + + dev = __dev_get_by_index(net, nla_get_u32(tb[RTA_OIF])); + if (dev == NULL) + return -ENODEV; + + if (nlh->nlmsg_type == RTM_NEWROUTE) + err = phonet_route_add(dev, dst); + else + err = phonet_route_del(dev, dst); + if (!err) + rtm_phonet_notify(nlh->nlmsg_type, dev, dst); + return err; +} + +static int route_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + u8 addr, addr_idx = 0, addr_start_idx = cb->args[0]; + + rcu_read_lock(); + for (addr = 0; addr < 64; addr++) { + struct net_device *dev; + + dev = phonet_route_get_rcu(net, addr << 2); + if (!dev) + continue; + + if (addr_idx++ < addr_start_idx) + continue; + if (fill_route(skb, dev, addr << 2, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWROUTE)) + goto out; + } + +out: + rcu_read_unlock(); + cb->args[0] = addr_idx; + cb->args[1] = 0; + + return skb->len; +} + +int __init phonet_netlink_register(void) +{ + int err = __rtnl_register(PF_PHONET, RTM_NEWADDR, addr_doit, NULL); + if (err) + return err; + + /* Further __rtnl_register() cannot fail */ + __rtnl_register(PF_PHONET, RTM_DELADDR, addr_doit, NULL); + __rtnl_register(PF_PHONET, RTM_GETADDR, NULL, getaddr_dumpit); + __rtnl_register(PF_PHONET, RTM_NEWROUTE, route_doit, NULL); + __rtnl_register(PF_PHONET, RTM_DELROUTE, route_doit, NULL); + __rtnl_register(PF_PHONET, RTM_GETROUTE, NULL, route_dumpit); + return 0; +} diff --git a/net/phonet/socket.c b/net/phonet/socket.c new file mode 100644 index 00000000..ab07711c --- /dev/null +++ b/net/phonet/socket.c @@ -0,0 +1,825 @@ +/* + * 
File: socket.c + * + * Phonet sockets + * + * Copyright (C) 2008 Nokia Corporation. + * + * Contact: Remi Denis-Courmont + * Original author: Sakari Ailus + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static int pn_socket_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (sk) { + sock->sk = NULL; + sk->sk_prot->close(sk, 0); + } + return 0; +} + +#define PN_HASHSIZE 16 +#define PN_HASHMASK (PN_HASHSIZE-1) + + +static struct { + struct hlist_head hlist[PN_HASHSIZE]; + struct mutex lock; +} pnsocks; + +void __init pn_sock_init(void) +{ + unsigned i; + + for (i = 0; i < PN_HASHSIZE; i++) + INIT_HLIST_HEAD(pnsocks.hlist + i); + mutex_init(&pnsocks.lock); +} + +static struct hlist_head *pn_hash_list(u16 obj) +{ + return pnsocks.hlist + (obj & PN_HASHMASK); +} + +/* + * Find address based on socket address, match only certain fields. + * Also grab sock if it was found. Remember to sock_put it later. + */ +struct sock *pn_find_sock_by_sa(struct net *net, const struct sockaddr_pn *spn) +{ + struct hlist_node *node; + struct sock *sknode; + struct sock *rval = NULL; + u16 obj = pn_sockaddr_get_object(spn); + u8 res = spn->spn_resource; + struct hlist_head *hlist = pn_hash_list(obj); + + rcu_read_lock(); + sk_for_each_rcu(sknode, node, hlist) { + struct pn_sock *pn = pn_sk(sknode); + BUG_ON(!pn->sobject); /* unbound socket */ + + if (!net_eq(sock_net(sknode), net)) + continue; + if (pn_port(obj)) { + /* Look up socket by port */ + if (pn_port(pn->sobject) != pn_port(obj)) + continue; + } else { + /* If port is zero, look up by resource */ + if (pn->resource != res) + continue; + } + if (pn_addr(pn->sobject) && + pn_addr(pn->sobject) != pn_addr(obj)) + continue; + + rval = sknode; + sock_hold(sknode); + break; + } + rcu_read_unlock(); + + return rval; +} + +/* Deliver a broadcast packet (only in bottom-half) */ +void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb) +{ + struct hlist_head *hlist = pnsocks.hlist; + unsigned h; + + rcu_read_lock(); + for (h = 0; h < PN_HASHSIZE; h++) { + struct hlist_node *node; + struct sock *sknode; + + sk_for_each(sknode, node, hlist) { + struct sk_buff *clone; + + if (!net_eq(sock_net(sknode), net)) + continue; + if (!sock_flag(sknode, SOCK_BROADCAST)) + continue; + + clone = skb_clone(skb, GFP_ATOMIC); + if (clone) { + sock_hold(sknode); + sk_receive_skb(sknode, clone, 0); + } + } + hlist++; + } + rcu_read_unlock(); +} + +void pn_sock_hash(struct sock *sk) +{ + struct hlist_head *hlist = pn_hash_list(pn_sk(sk)->sobject); + + mutex_lock(&pnsocks.lock); + sk_add_node_rcu(sk, hlist); + mutex_unlock(&pnsocks.lock); +} +EXPORT_SYMBOL(pn_sock_hash); + +void pn_sock_unhash(struct sock *sk) +{ + mutex_lock(&pnsocks.lock); + sk_del_node_init_rcu(sk); + mutex_unlock(&pnsocks.lock); + pn_sock_unbind_all_res(sk); + 
synchronize_rcu(); +} +EXPORT_SYMBOL(pn_sock_unhash); + +static DEFINE_MUTEX(port_mutex); + +static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len) +{ + struct sock *sk = sock->sk; + struct pn_sock *pn = pn_sk(sk); + struct sockaddr_pn *spn = (struct sockaddr_pn *)addr; + int err; + u16 handle; + u8 saddr; + + if (sk->sk_prot->bind) + return sk->sk_prot->bind(sk, addr, len); + + if (len < sizeof(struct sockaddr_pn)) + return -EINVAL; + if (spn->spn_family != AF_PHONET) + return -EAFNOSUPPORT; + + handle = pn_sockaddr_get_object((struct sockaddr_pn *)addr); + saddr = pn_addr(handle); + if (saddr && phonet_address_lookup(sock_net(sk), saddr)) + return -EADDRNOTAVAIL; + + lock_sock(sk); + if (sk->sk_state != TCP_CLOSE || pn_port(pn->sobject)) { + err = -EINVAL; /* attempt to rebind */ + goto out; + } + WARN_ON(sk_hashed(sk)); + mutex_lock(&port_mutex); + err = sk->sk_prot->get_port(sk, pn_port(handle)); + if (err) + goto out_port; + + /* get_port() sets the port, bind() sets the address if applicable */ + pn->sobject = pn_object(saddr, pn_port(pn->sobject)); + pn->resource = spn->spn_resource; + + /* Enable RX on the socket */ + sk->sk_prot->hash(sk); +out_port: + mutex_unlock(&port_mutex); +out: + release_sock(sk); + return err; +} + +static int pn_socket_autobind(struct socket *sock) +{ + struct sockaddr_pn sa; + int err; + + memset(&sa, 0, sizeof(sa)); + sa.spn_family = AF_PHONET; + err = pn_socket_bind(sock, (struct sockaddr *)&sa, + sizeof(struct sockaddr_pn)); + if (err != -EINVAL) + return err; + BUG_ON(!pn_port(pn_sk(sock->sk)->sobject)); + return 0; /* socket was already bound */ +} + +static int pn_socket_connect(struct socket *sock, struct sockaddr *addr, + int len, int flags) +{ + struct sock *sk = sock->sk; + struct pn_sock *pn = pn_sk(sk); + struct sockaddr_pn *spn = (struct sockaddr_pn *)addr; + struct task_struct *tsk = current; + long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + int err; + + if (pn_socket_autobind(sock)) + return -ENOBUFS; + if (len < sizeof(struct sockaddr_pn)) + return -EINVAL; + if (spn->spn_family != AF_PHONET) + return -EAFNOSUPPORT; + + lock_sock(sk); + + switch (sock->state) { + case SS_UNCONNECTED: + if (sk->sk_state != TCP_CLOSE) { + err = -EISCONN; + goto out; + } + break; + case SS_CONNECTING: + err = -EALREADY; + goto out; + default: + err = -EISCONN; + goto out; + } + + pn->dobject = pn_sockaddr_get_object(spn); + pn->resource = pn_sockaddr_get_resource(spn); + sock->state = SS_CONNECTING; + + err = sk->sk_prot->connect(sk, addr, len); + if (err) { + sock->state = SS_UNCONNECTED; + pn->dobject = 0; + goto out; + } + + while (sk->sk_state == TCP_SYN_SENT) { + DEFINE_WAIT(wait); + + if (!timeo) { + err = -EINPROGRESS; + goto out; + } + if (signal_pending(tsk)) { + err = sock_intr_errno(timeo); + goto out; + } + + prepare_to_wait_exclusive(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + release_sock(sk); + timeo = schedule_timeout(timeo); + lock_sock(sk); + finish_wait(sk_sleep(sk), &wait); + } + + if ((1 << sk->sk_state) & (TCPF_SYN_RECV|TCPF_ESTABLISHED)) + err = 0; + else if (sk->sk_state == TCP_CLOSE_WAIT) + err = -ECONNRESET; + else + err = -ECONNREFUSED; + sock->state = err ? 
SS_UNCONNECTED : SS_CONNECTED; +out: + release_sock(sk); + return err; +} + +static int pn_socket_accept(struct socket *sock, struct socket *newsock, + int flags) +{ + struct sock *sk = sock->sk; + struct sock *newsk; + int err; + + if (unlikely(sk->sk_state != TCP_LISTEN)) + return -EINVAL; + + newsk = sk->sk_prot->accept(sk, flags, &err); + if (!newsk) + return err; + + lock_sock(newsk); + sock_graft(newsk, newsock); + newsock->state = SS_CONNECTED; + release_sock(newsk); + return 0; +} + +static int pn_socket_getname(struct socket *sock, struct sockaddr *addr, + int *sockaddr_len, int peer) +{ + struct sock *sk = sock->sk; + struct pn_sock *pn = pn_sk(sk); + + memset(addr, 0, sizeof(struct sockaddr_pn)); + addr->sa_family = AF_PHONET; + if (!peer) /* Race with bind() here is userland's problem. */ + pn_sockaddr_set_object((struct sockaddr_pn *)addr, + pn->sobject); + + *sockaddr_len = sizeof(struct sockaddr_pn); + return 0; +} + +static unsigned int pn_socket_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + struct pep_sock *pn = pep_sk(sk); + unsigned int mask = 0; + + poll_wait(file, sk_sleep(sk), wait); + + if (sk->sk_state == TCP_CLOSE) + return POLLERR; + if (!skb_queue_empty(&sk->sk_receive_queue)) + mask |= POLLIN | POLLRDNORM; + if (!skb_queue_empty(&pn->ctrlreq_queue)) + mask |= POLLPRI; + if (!mask && sk->sk_state == TCP_CLOSE_WAIT) + return POLLHUP; + + if (sk->sk_state == TCP_ESTABLISHED && + atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf && + atomic_read(&pn->tx_credits)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + + return mask; +} + +static int pn_socket_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + struct sock *sk = sock->sk; + struct pn_sock *pn = pn_sk(sk); + + if (cmd == SIOCPNGETOBJECT) { + struct net_device *dev; + u16 handle; + u8 saddr; + + if (get_user(handle, (__u16 __user *)arg)) + return -EFAULT; + + lock_sock(sk); + if (sk->sk_bound_dev_if) + dev = dev_get_by_index(sock_net(sk), + sk->sk_bound_dev_if); + else + dev = phonet_device_get(sock_net(sk)); + if (dev && (dev->flags & IFF_UP)) + saddr = phonet_address_get(dev, pn_addr(handle)); + else + saddr = PN_NO_ADDR; + release_sock(sk); + + if (dev) + dev_put(dev); + if (saddr == PN_NO_ADDR) + return -EHOSTUNREACH; + + handle = pn_object(saddr, pn_port(pn->sobject)); + return put_user(handle, (__u16 __user *)arg); + } + + return sk->sk_prot->ioctl(sk, cmd, arg); +} + +static int pn_socket_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + int err = 0; + + if (pn_socket_autobind(sock)) + return -ENOBUFS; + + lock_sock(sk); + if (sock->state != SS_UNCONNECTED) { + err = -EINVAL; + goto out; + } + + if (sk->sk_state != TCP_LISTEN) { + sk->sk_state = TCP_LISTEN; + sk->sk_ack_backlog = 0; + } + sk->sk_max_ack_backlog = backlog; +out: + release_sock(sk); + return err; +} + +static int pn_socket_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t total_len) +{ + struct sock *sk = sock->sk; + + if (pn_socket_autobind(sock)) + return -EAGAIN; + + return sk->sk_prot->sendmsg(iocb, sk, m, total_len); +} + +const struct proto_ops phonet_dgram_ops = { + .family = AF_PHONET, + .owner = THIS_MODULE, + .release = pn_socket_release, + .bind = pn_socket_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = pn_socket_getname, + .poll = datagram_poll, + .ioctl = pn_socket_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + 
.setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, +#ifdef CONFIG_COMPAT + .compat_setsockopt = sock_no_setsockopt, + .compat_getsockopt = sock_no_getsockopt, +#endif + .sendmsg = pn_socket_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +const struct proto_ops phonet_stream_ops = { + .family = AF_PHONET, + .owner = THIS_MODULE, + .release = pn_socket_release, + .bind = pn_socket_bind, + .connect = pn_socket_connect, + .socketpair = sock_no_socketpair, + .accept = pn_socket_accept, + .getname = pn_socket_getname, + .poll = pn_socket_poll, + .ioctl = pn_socket_ioctl, + .listen = pn_socket_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, +#endif + .sendmsg = pn_socket_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; +EXPORT_SYMBOL(phonet_stream_ops); + +/* allocate port for a socket */ +int pn_sock_get_port(struct sock *sk, unsigned short sport) +{ + static int port_cur; + struct net *net = sock_net(sk); + struct pn_sock *pn = pn_sk(sk); + struct sockaddr_pn try_sa; + struct sock *tmpsk; + + memset(&try_sa, 0, sizeof(struct sockaddr_pn)); + try_sa.spn_family = AF_PHONET; + WARN_ON(!mutex_is_locked(&port_mutex)); + if (!sport) { + /* search free port */ + int port, pmin, pmax; + + phonet_get_local_port_range(&pmin, &pmax); + for (port = pmin; port <= pmax; port++) { + port_cur++; + if (port_cur < pmin || port_cur > pmax) + port_cur = pmin; + + pn_sockaddr_set_port(&try_sa, port_cur); + tmpsk = pn_find_sock_by_sa(net, &try_sa); + if (tmpsk == NULL) { + sport = port_cur; + goto found; + } else + sock_put(tmpsk); + } + } else { + /* try to find specific port */ + pn_sockaddr_set_port(&try_sa, sport); + tmpsk = pn_find_sock_by_sa(net, &try_sa); + if (tmpsk == NULL) + /* No sock there! We can use that port... */ + goto found; + else + sock_put(tmpsk); + } + /* the port must be in use already */ + return -EADDRINUSE; + +found: + pn->sobject = pn_object(pn_addr(pn->sobject), sport); + return 0; +} +EXPORT_SYMBOL(pn_sock_get_port); + +#ifdef CONFIG_PROC_FS +static struct sock *pn_sock_get_idx(struct seq_file *seq, loff_t pos) +{ + struct net *net = seq_file_net(seq); + struct hlist_head *hlist = pnsocks.hlist; + struct hlist_node *node; + struct sock *sknode; + unsigned h; + + for (h = 0; h < PN_HASHSIZE; h++) { + sk_for_each_rcu(sknode, node, hlist) { + if (!net_eq(net, sock_net(sknode))) + continue; + if (!pos) + return sknode; + pos--; + } + hlist++; + } + return NULL; +} + +static struct sock *pn_sock_get_next(struct seq_file *seq, struct sock *sk) +{ + struct net *net = seq_file_net(seq); + + do + sk = sk_next(sk); + while (sk && !net_eq(net, sock_net(sk))); + + return sk; +} + +static void *pn_sock_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(rcu) +{ + rcu_read_lock(); + return *pos ? 
pn_sock_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *pn_sock_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock *sk; + + if (v == SEQ_START_TOKEN) + sk = pn_sock_get_idx(seq, 0); + else + sk = pn_sock_get_next(seq, v); + (*pos)++; + return sk; +} + +static void pn_sock_seq_stop(struct seq_file *seq, void *v) + __releases(rcu) +{ + rcu_read_unlock(); +} + +static int pn_sock_seq_show(struct seq_file *seq, void *v) +{ + int len; + + if (v == SEQ_START_TOKEN) + seq_printf(seq, "%s%n", "pt loc rem rs st tx_queue rx_queue " + " uid inode ref pointer drops", &len); + else { + struct sock *sk = v; + struct pn_sock *pn = pn_sk(sk); + + seq_printf(seq, "%2d %04X:%04X:%02X %02X %08X:%08X %5d %lu " + "%d %pK %d%n", + sk->sk_protocol, pn->sobject, pn->dobject, + pn->resource, sk->sk_state, + sk_wmem_alloc_get(sk), sk_rmem_alloc_get(sk), + sock_i_uid(sk), sock_i_ino(sk), + atomic_read(&sk->sk_refcnt), sk, + atomic_read(&sk->sk_drops), &len); + } + seq_printf(seq, "%*s\n", 127 - len, ""); + return 0; +} + +static const struct seq_operations pn_sock_seq_ops = { + .start = pn_sock_seq_start, + .next = pn_sock_seq_next, + .stop = pn_sock_seq_stop, + .show = pn_sock_seq_show, +}; + +static int pn_sock_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &pn_sock_seq_ops, + sizeof(struct seq_net_private)); +} + +const struct file_operations pn_sock_seq_fops = { + .owner = THIS_MODULE, + .open = pn_sock_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; +#endif + +static struct { + struct sock *sk[256]; +} pnres; + +/* + * Find and hold socket based on resource. + */ +struct sock *pn_find_sock_by_res(struct net *net, u8 res) +{ + struct sock *sk; + + if (!net_eq(net, &init_net)) + return NULL; + + rcu_read_lock(); + sk = rcu_dereference(pnres.sk[res]); + if (sk) + sock_hold(sk); + rcu_read_unlock(); + return sk; +} + +static DEFINE_MUTEX(resource_mutex); + +int pn_sock_bind_res(struct sock *sk, u8 res) +{ + int ret = -EADDRINUSE; + + if (!net_eq(sock_net(sk), &init_net)) + return -ENOIOCTLCMD; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (pn_socket_autobind(sk->sk_socket)) + return -EAGAIN; + + mutex_lock(&resource_mutex); + if (pnres.sk[res] == NULL) { + sock_hold(sk); + rcu_assign_pointer(pnres.sk[res], sk); + ret = 0; + } + mutex_unlock(&resource_mutex); + return ret; +} + +int pn_sock_unbind_res(struct sock *sk, u8 res) +{ + int ret = -ENOENT; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&resource_mutex); + if (pnres.sk[res] == sk) { + rcu_assign_pointer(pnres.sk[res], NULL); + ret = 0; + } + mutex_unlock(&resource_mutex); + + if (ret == 0) { + synchronize_rcu(); + sock_put(sk); + } + return ret; +} + +void pn_sock_unbind_all_res(struct sock *sk) +{ + unsigned res, match = 0; + + mutex_lock(&resource_mutex); + for (res = 0; res < 256; res++) { + if (pnres.sk[res] == sk) { + rcu_assign_pointer(pnres.sk[res], NULL); + match++; + } + } + mutex_unlock(&resource_mutex); + + while (match > 0) { + __sock_put(sk); + match--; + } + /* Caller is responsible for RCU sync before final sock_put() */ +} + +#ifdef CONFIG_PROC_FS +static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos) +{ + struct net *net = seq_file_net(seq); + unsigned i; + + if (!net_eq(net, &init_net)) + return NULL; + + for (i = 0; i < 256; i++) { + if (pnres.sk[i] == NULL) + continue; + if (!pos) + return pnres.sk + i; + pos--; + } + return NULL; +} + +static struct sock **pn_res_get_next(struct seq_file *seq, 
struct sock **sk) +{ + struct net *net = seq_file_net(seq); + unsigned i; + + BUG_ON(!net_eq(net, &init_net)); + + for (i = (sk - pnres.sk) + 1; i < 256; i++) + if (pnres.sk[i]) + return pnres.sk + i; + return NULL; +} + +static void *pn_res_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(resource_mutex) +{ + mutex_lock(&resource_mutex); + return *pos ? pn_res_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *pn_res_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock **sk; + + if (v == SEQ_START_TOKEN) + sk = pn_res_get_idx(seq, 0); + else + sk = pn_res_get_next(seq, v); + (*pos)++; + return sk; +} + +static void pn_res_seq_stop(struct seq_file *seq, void *v) + __releases(resource_mutex) +{ + mutex_unlock(&resource_mutex); +} + +static int pn_res_seq_show(struct seq_file *seq, void *v) +{ + int len; + + if (v == SEQ_START_TOKEN) + seq_printf(seq, "%s%n", "rs uid inode", &len); + else { + struct sock **psk = v; + struct sock *sk = *psk; + + seq_printf(seq, "%02X %5d %lu%n", + (int) (psk - pnres.sk), sock_i_uid(sk), + sock_i_ino(sk), &len); + } + seq_printf(seq, "%*s\n", 63 - len, ""); + return 0; +} + +static const struct seq_operations pn_res_seq_ops = { + .start = pn_res_seq_start, + .next = pn_res_seq_next, + .stop = pn_res_seq_stop, + .show = pn_res_seq_show, +}; + +static int pn_res_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &pn_res_seq_ops, + sizeof(struct seq_net_private)); +} + +const struct file_operations pn_res_seq_fops = { + .owner = THIS_MODULE, + .open = pn_res_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; +#endif diff --git a/net/phonet/sysctl.c b/net/phonet/sysctl.c new file mode 100644 index 00000000..cea1c7db --- /dev/null +++ b/net/phonet/sysctl.c @@ -0,0 +1,111 @@ +/* + * File: sysctl.c + * + * Phonet /proc/sys/net/phonet interface implementation + * + * Copyright (C) 2008 Nokia Corporation. + * + * Contact: Remi Denis-Courmont + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + */ + +#include +#include +#include +#include + +#define DYNAMIC_PORT_MIN 0x40 +#define DYNAMIC_PORT_MAX 0x7f + +static DEFINE_SEQLOCK(local_port_range_lock); +static int local_port_range_min[2] = {0, 0}; +static int local_port_range_max[2] = {1023, 1023}; +static int local_port_range[2] = {DYNAMIC_PORT_MIN, DYNAMIC_PORT_MAX}; +static struct ctl_table_header *phonet_table_hrd; + +static void set_local_port_range(int range[2]) +{ + write_seqlock(&local_port_range_lock); + local_port_range[0] = range[0]; + local_port_range[1] = range[1]; + write_sequnlock(&local_port_range_lock); +} + +void phonet_get_local_port_range(int *min, int *max) +{ + unsigned seq; + do { + seq = read_seqbegin(&local_port_range_lock); + if (min) + *min = local_port_range[0]; + if (max) + *max = local_port_range[1]; + } while (read_seqretry(&local_port_range_lock, seq)); +} + +static int proc_local_port_range(ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret; + int range[2] = {local_port_range[0], local_port_range[1]}; + ctl_table tmp = { + .data = &range, + .maxlen = sizeof(range), + .mode = table->mode, + .extra1 = &local_port_range_min, + .extra2 = &local_port_range_max, + }; + + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + + if (write && ret == 0) { + if (range[1] < range[0]) + ret = -EINVAL; + else + set_local_port_range(range); + } + + return ret; +} + +static struct ctl_table phonet_table[] = { + { + .procname = "local_port_range", + .data = &local_port_range, + .maxlen = sizeof(local_port_range), + .mode = 0644, + .proc_handler = proc_local_port_range, + }, + { } +}; + +static struct ctl_path phonet_ctl_path[] = { + { .procname = "net", }, + { .procname = "phonet", }, + { }, +}; + +int __init phonet_sysctl_init(void) +{ + phonet_table_hrd = register_sysctl_paths(phonet_ctl_path, phonet_table); + return phonet_table_hrd == NULL ? -ENOMEM : 0; +} + +void phonet_sysctl_exit(void) +{ + unregister_sysctl_table(phonet_table_hrd); +} diff --git a/net/rds/Kconfig b/net/rds/Kconfig new file mode 100644 index 00000000..ec753b3a --- /dev/null +++ b/net/rds/Kconfig @@ -0,0 +1,28 @@ + +config RDS + tristate "The RDS Protocol (EXPERIMENTAL)" + depends on INET && EXPERIMENTAL + ---help--- + The RDS (Reliable Datagram Sockets) protocol provides reliable, + sequenced delivery of datagrams over Infiniband, iWARP, + or TCP. + +config RDS_RDMA + tristate "RDS over Infiniband and iWARP" + depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS + ---help--- + Allow RDS to use Infiniband and iWARP as a transport. + This transport supports RDMA operations. + +config RDS_TCP + tristate "RDS over TCP" + depends on RDS + ---help--- + Allow RDS to use TCP as a transport. + This transport does not support RDMA operations. 
+ +config RDS_DEBUG + bool "RDS debugging messages" + depends on RDS + default n + diff --git a/net/rds/Makefile b/net/rds/Makefile new file mode 100644 index 00000000..56d3f602 --- /dev/null +++ b/net/rds/Makefile @@ -0,0 +1,19 @@ +obj-$(CONFIG_RDS) += rds.o +rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \ + recv.o send.o stats.o sysctl.o threads.o transport.o \ + loop.o page.o rdma.o + +obj-$(CONFIG_RDS_RDMA) += rds_rdma.o +rds_rdma-y := rdma_transport.o \ + ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \ + ib_sysctl.o ib_rdma.o \ + iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \ + iw_sysctl.o iw_rdma.o + + +obj-$(CONFIG_RDS_TCP) += rds_tcp.o +rds_tcp-y := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \ + tcp_send.o tcp_stats.o + +ccflags-$(CONFIG_RDS_DEBUG) := -DDEBUG + diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c new file mode 100644 index 00000000..424ff622 --- /dev/null +++ b/net/rds/af_rds.c @@ -0,0 +1,599 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include +#include +#include + +#include "rds.h" + +char *rds_str_array(char **array, size_t elements, size_t index) +{ + if ((index < elements) && array[index]) + return array[index]; + else + return "unknown"; +} +EXPORT_SYMBOL(rds_str_array); + +/* this is just used for stats gathering :/ */ +static DEFINE_SPINLOCK(rds_sock_lock); +static unsigned long rds_sock_count; +static LIST_HEAD(rds_sock_list); +DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq); + +/* + * This is called as the final descriptor referencing this socket is closed. + * We have to unbind the socket so that another socket can be bound to the + * address it was using. + * + * We have to be careful about racing with the incoming path. sock_orphan() + * sets SOCK_DEAD and we use that as an indicator to the rx path that new + * messages shouldn't be queued. + */ +static int rds_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct rds_sock *rs; + + if (!sk) + goto out; + + rs = rds_sk_to_rs(sk); + + sock_orphan(sk); + /* Note - rds_clear_recv_queue grabs rs_recv_lock, so + * that ensures the recv path has completed messing + * with the socket. 
*/ + rds_clear_recv_queue(rs); + rds_cong_remove_socket(rs); + + /* + * the binding lookup hash uses rcu, we need to + * make sure we sychronize_rcu before we free our + * entry + */ + rds_remove_bound(rs); + synchronize_rcu(); + + rds_send_drop_to(rs, NULL); + rds_rdma_drop_keys(rs); + rds_notify_queue_get(rs, NULL); + + spin_lock_bh(&rds_sock_lock); + list_del_init(&rs->rs_item); + rds_sock_count--; + spin_unlock_bh(&rds_sock_lock); + + rds_trans_put(rs->rs_transport); + + sock->sk = NULL; + sock_put(sk); +out: + return 0; +} + +/* + * Careful not to race with rds_release -> sock_orphan which clears sk_sleep. + * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK + * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but + * this seems more conservative. + * NB - normally, one would use sk_callback_lock for this, but we can + * get here from interrupts, whereas the network code grabs sk_callback_lock + * with _lock_bh only - so relying on sk_callback_lock introduces livelocks. + */ +void rds_wake_sk_sleep(struct rds_sock *rs) +{ + unsigned long flags; + + read_lock_irqsave(&rs->rs_recv_lock, flags); + __rds_wake_sk_sleep(rds_rs_to_sk(rs)); + read_unlock_irqrestore(&rs->rs_recv_lock, flags); +} + +static int rds_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + struct rds_sock *rs = rds_sk_to_rs(sock->sk); + + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + + /* racey, don't care */ + if (peer) { + if (!rs->rs_conn_addr) + return -ENOTCONN; + + sin->sin_port = rs->rs_conn_port; + sin->sin_addr.s_addr = rs->rs_conn_addr; + } else { + sin->sin_port = rs->rs_bound_port; + sin->sin_addr.s_addr = rs->rs_bound_addr; + } + + sin->sin_family = AF_INET; + + *uaddr_len = sizeof(*sin); + return 0; +} + +/* + * RDS' poll is without a doubt the least intuitive part of the interface, + * as POLLIN and POLLOUT do not behave entirely as you would expect from + * a network protocol. + * + * POLLIN is asserted if + * - there is data on the receive queue. + * - to signal that a previously congested destination may have become + * uncongested + * - A notification has been queued to the socket (this can be a congestion + * update, or a RDMA completion). + * + * POLLOUT is asserted if there is room on the send queue. This does not mean + * however, that the next sendmsg() call will succeed. If the application tries + * to send to a congested destination, the system call may still fail (and + * return ENOBUFS). + */ +static unsigned int rds_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + struct rds_sock *rs = rds_sk_to_rs(sk); + unsigned int mask = 0; + unsigned long flags; + + poll_wait(file, sk_sleep(sk), wait); + + if (rs->rs_seen_congestion) + poll_wait(file, &rds_poll_waitq, wait); + + read_lock_irqsave(&rs->rs_recv_lock, flags); + if (!rs->rs_cong_monitor) { + /* When a congestion map was updated, we signal POLLIN for + * "historical" reasons. Applications can also poll for + * WRBAND instead. 
*/ + if (rds_cong_updated_since(&rs->rs_cong_track)) + mask |= (POLLIN | POLLRDNORM | POLLWRBAND); + } else { + spin_lock(&rs->rs_lock); + if (rs->rs_cong_notify) + mask |= (POLLIN | POLLRDNORM); + spin_unlock(&rs->rs_lock); + } + if (!list_empty(&rs->rs_recv_queue) || + !list_empty(&rs->rs_notify_queue)) + mask |= (POLLIN | POLLRDNORM); + if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) + mask |= (POLLOUT | POLLWRNORM); + read_unlock_irqrestore(&rs->rs_recv_lock, flags); + + /* clear state any time we wake a seen-congested socket */ + if (mask) + rs->rs_seen_congestion = 0; + + return mask; +} + +static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + return -ENOIOCTLCMD; +} + +static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, + int len) +{ + struct sockaddr_in sin; + int ret = 0; + + /* racing with another thread binding seems ok here */ + if (rs->rs_bound_addr == 0) { + ret = -ENOTCONN; /* XXX not a great errno */ + goto out; + } + + if (len < sizeof(struct sockaddr_in)) { + ret = -EINVAL; + goto out; + } + + if (copy_from_user(&sin, optval, sizeof(sin))) { + ret = -EFAULT; + goto out; + } + + rds_send_drop_to(rs, &sin); +out: + return ret; +} + +static int rds_set_bool_option(unsigned char *optvar, char __user *optval, + int optlen) +{ + int value; + + if (optlen < sizeof(int)) + return -EINVAL; + if (get_user(value, (int __user *) optval)) + return -EFAULT; + *optvar = !!value; + return 0; +} + +static int rds_cong_monitor(struct rds_sock *rs, char __user *optval, + int optlen) +{ + int ret; + + ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen); + if (ret == 0) { + if (rs->rs_cong_monitor) { + rds_cong_add_socket(rs); + } else { + rds_cong_remove_socket(rs); + rs->rs_cong_mask = 0; + rs->rs_cong_notify = 0; + } + } + return ret; +} + +static int rds_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct rds_sock *rs = rds_sk_to_rs(sock->sk); + int ret; + + if (level != SOL_RDS) { + ret = -ENOPROTOOPT; + goto out; + } + + switch (optname) { + case RDS_CANCEL_SENT_TO: + ret = rds_cancel_sent_to(rs, optval, optlen); + break; + case RDS_GET_MR: + ret = rds_get_mr(rs, optval, optlen); + break; + case RDS_GET_MR_FOR_DEST: + ret = rds_get_mr_for_dest(rs, optval, optlen); + break; + case RDS_FREE_MR: + ret = rds_free_mr(rs, optval, optlen); + break; + case RDS_RECVERR: + ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen); + break; + case RDS_CONG_MONITOR: + ret = rds_cong_monitor(rs, optval, optlen); + break; + default: + ret = -ENOPROTOOPT; + } +out: + return ret; +} + +static int rds_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct rds_sock *rs = rds_sk_to_rs(sock->sk); + int ret = -ENOPROTOOPT, len; + + if (level != SOL_RDS) + goto out; + + if (get_user(len, optlen)) { + ret = -EFAULT; + goto out; + } + + switch (optname) { + case RDS_INFO_FIRST ... 
RDS_INFO_LAST: + ret = rds_info_getsockopt(sock, optname, optval, + optlen); + break; + + case RDS_RECVERR: + if (len < sizeof(int)) + ret = -EINVAL; + else + if (put_user(rs->rs_recverr, (int __user *) optval) || + put_user(sizeof(int), optlen)) + ret = -EFAULT; + else + ret = 0; + break; + default: + break; + } + +out: + return ret; + +} + +static int rds_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + struct rds_sock *rs = rds_sk_to_rs(sk); + int ret = 0; + + lock_sock(sk); + + if (addr_len != sizeof(struct sockaddr_in)) { + ret = -EINVAL; + goto out; + } + + if (sin->sin_family != AF_INET) { + ret = -EAFNOSUPPORT; + goto out; + } + + if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { + ret = -EDESTADDRREQ; + goto out; + } + + rs->rs_conn_addr = sin->sin_addr.s_addr; + rs->rs_conn_port = sin->sin_port; + +out: + release_sock(sk); + return ret; +} + +static struct proto rds_proto = { + .name = "RDS", + .owner = THIS_MODULE, + .obj_size = sizeof(struct rds_sock), +}; + +static const struct proto_ops rds_proto_ops = { + .family = AF_RDS, + .owner = THIS_MODULE, + .release = rds_release, + .bind = rds_bind, + .connect = rds_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = rds_getname, + .poll = rds_poll, + .ioctl = rds_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = rds_setsockopt, + .getsockopt = rds_getsockopt, + .sendmsg = rds_sendmsg, + .recvmsg = rds_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static int __rds_create(struct socket *sock, struct sock *sk, int protocol) +{ + struct rds_sock *rs; + + sock_init_data(sock, sk); + sock->ops = &rds_proto_ops; + sk->sk_protocol = protocol; + + rs = rds_sk_to_rs(sk); + spin_lock_init(&rs->rs_lock); + rwlock_init(&rs->rs_recv_lock); + INIT_LIST_HEAD(&rs->rs_send_queue); + INIT_LIST_HEAD(&rs->rs_recv_queue); + INIT_LIST_HEAD(&rs->rs_notify_queue); + INIT_LIST_HEAD(&rs->rs_cong_list); + spin_lock_init(&rs->rs_rdma_lock); + rs->rs_rdma_keys = RB_ROOT; + + spin_lock_bh(&rds_sock_lock); + list_add_tail(&rs->rs_item, &rds_sock_list); + rds_sock_count++; + spin_unlock_bh(&rds_sock_lock); + + return 0; +} + +static int rds_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + + if (sock->type != SOCK_SEQPACKET || protocol) + return -ESOCKTNOSUPPORT; + + sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto); + if (!sk) + return -ENOMEM; + + return __rds_create(sock, sk, protocol); +} + +void rds_sock_addref(struct rds_sock *rs) +{ + sock_hold(rds_rs_to_sk(rs)); +} + +void rds_sock_put(struct rds_sock *rs) +{ + sock_put(rds_rs_to_sk(rs)); +} + +static const struct net_proto_family rds_family_ops = { + .family = AF_RDS, + .create = rds_create, + .owner = THIS_MODULE, +}; + +static void rds_sock_inc_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds_sock *rs; + struct rds_incoming *inc; + unsigned int total = 0; + + len /= sizeof(struct rds_info_message); + + spin_lock_bh(&rds_sock_lock); + + list_for_each_entry(rs, &rds_sock_list, rs_item) { + read_lock(&rs->rs_recv_lock); + + /* XXX too lazy to maintain counts.. 
*/ + list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { + total++; + if (total <= len) + rds_inc_info_copy(inc, iter, inc->i_saddr, + rs->rs_bound_addr, 1); + } + + read_unlock(&rs->rs_recv_lock); + } + + spin_unlock_bh(&rds_sock_lock); + + lens->nr = total; + lens->each = sizeof(struct rds_info_message); +} + +static void rds_sock_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds_info_socket sinfo; + struct rds_sock *rs; + + len /= sizeof(struct rds_info_socket); + + spin_lock_bh(&rds_sock_lock); + + if (len < rds_sock_count) + goto out; + + list_for_each_entry(rs, &rds_sock_list, rs_item) { + sinfo.sndbuf = rds_sk_sndbuf(rs); + sinfo.rcvbuf = rds_sk_rcvbuf(rs); + sinfo.bound_addr = rs->rs_bound_addr; + sinfo.connected_addr = rs->rs_conn_addr; + sinfo.bound_port = rs->rs_bound_port; + sinfo.connected_port = rs->rs_conn_port; + sinfo.inum = sock_i_ino(rds_rs_to_sk(rs)); + + rds_info_copy(iter, &sinfo, sizeof(sinfo)); + } + +out: + lens->nr = rds_sock_count; + lens->each = sizeof(struct rds_info_socket); + + spin_unlock_bh(&rds_sock_lock); +} + +static void rds_exit(void) +{ + sock_unregister(rds_family_ops.family); + proto_unregister(&rds_proto); + rds_conn_exit(); + rds_cong_exit(); + rds_sysctl_exit(); + rds_threads_exit(); + rds_stats_exit(); + rds_page_exit(); + rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info); + rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); +} +module_exit(rds_exit); + +static int rds_init(void) +{ + int ret; + + ret = rds_conn_init(); + if (ret) + goto out; + ret = rds_threads_init(); + if (ret) + goto out_conn; + ret = rds_sysctl_init(); + if (ret) + goto out_threads; + ret = rds_stats_init(); + if (ret) + goto out_sysctl; + ret = proto_register(&rds_proto, 1); + if (ret) + goto out_stats; + ret = sock_register(&rds_family_ops); + if (ret) + goto out_proto; + + rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info); + rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); + + goto out; + +out_proto: + proto_unregister(&rds_proto); +out_stats: + rds_stats_exit(); +out_sysctl: + rds_sysctl_exit(); +out_threads: + rds_threads_exit(); +out_conn: + rds_conn_exit(); + rds_cong_exit(); + rds_page_exit(); +out: + return ret; +} +module_init(rds_init); + +#define DRV_VERSION "4.0" +#define DRV_RELDATE "Feb 12, 2009" + +MODULE_AUTHOR("Oracle Corporation "); +MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets" + " v" DRV_VERSION " (" DRV_RELDATE ")"); +MODULE_VERSION(DRV_VERSION); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NETPROTO(PF_RDS); diff --git a/net/rds/bind.c b/net/rds/bind.c new file mode 100644 index 00000000..2f6b3fcc --- /dev/null +++ b/net/rds/bind.c @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include +#include "rds.h" + +#define BIND_HASH_SIZE 1024 +static struct hlist_head bind_hash_table[BIND_HASH_SIZE]; +static DEFINE_SPINLOCK(rds_bind_lock); + +static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port) +{ + return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) & + (BIND_HASH_SIZE - 1)); +} + +static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, + struct rds_sock *insert) +{ + struct rds_sock *rs; + struct hlist_node *node; + struct hlist_head *head = hash_to_bucket(addr, port); + u64 cmp; + u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); + + rcu_read_lock(); + hlist_for_each_entry_rcu(rs, node, head, rs_bound_node) { + cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | + be16_to_cpu(rs->rs_bound_port); + + if (cmp == needle) { + rcu_read_unlock(); + return rs; + } + } + rcu_read_unlock(); + + if (insert) { + /* + * make sure our addr and port are set before + * we are added to the list, other people + * in rcu will find us as soon as the + * hlist_add_head_rcu is done + */ + insert->rs_bound_addr = addr; + insert->rs_bound_port = port; + rds_sock_addref(insert); + + hlist_add_head_rcu(&insert->rs_bound_node, head); + } + return NULL; +} + +/* + * Return the rds_sock bound at the given local address. + * + * The rx path can race with rds_release. We notice if rds_release() has + * marked this socket and don't return a rs ref to the rx path. 
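rds_bind_lookup() above keys the bind hash on the (address, port) pair twice over: jhash_2words() on the two values picks the bucket, and a 64-bit "needle" with the address in the high half is compared against each entry on the chain. A standalone sketch of that arithmetic, with a stand-in mixer since jhash_2words() is kernel-only (the byte-order conversions of the original are omitted):

/* Illustrative only: (addr, port) -> hash bucket and 64-bit compare key. */
#include <stdint.h>
#include <stdio.h>

#define BIND_HASH_SIZE 1024                 /* power of two, so the mask works */

/* stand-in for the kernel's jhash_2words(addr, port, 0) */
static uint32_t mix_2words(uint32_t a, uint32_t b)
{
    uint32_t h = a * 0x9e3779b1u ^ b * 0x85ebca77u;
    return h ^ (h >> 16);
}

static unsigned int hash_to_bucket(uint32_t addr, uint16_t port)
{
    return mix_2words(addr, port) & (BIND_HASH_SIZE - 1);
}

/* the "needle": address in the high 32 bits, port in the low bits */
static uint64_t bind_key(uint32_t addr, uint16_t port)
{
    return ((uint64_t)addr << 32) | port;
}

int main(void)
{
    uint32_t addr = 0xc0a80001u;            /* 192.168.0.1 */
    uint16_t port = 4000;

    printf("bucket %u, key %016llx\n", hash_to_bucket(addr, port),
           (unsigned long long)bind_key(addr, port));
    return 0;
}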
+ */ +struct rds_sock *rds_find_bound(__be32 addr, __be16 port) +{ + struct rds_sock *rs; + + rs = rds_bind_lookup(addr, port, NULL); + + if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) + rds_sock_addref(rs); + else + rs = NULL; + + rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, + ntohs(port)); + return rs; +} + +/* returns -ve errno or +ve port */ +static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) +{ + unsigned long flags; + int ret = -EADDRINUSE; + u16 rover, last; + + if (*port != 0) { + rover = be16_to_cpu(*port); + last = rover; + } else { + rover = max_t(u16, net_random(), 2); + last = rover - 1; + } + + spin_lock_irqsave(&rds_bind_lock, flags); + + do { + if (rover == 0) + rover++; + if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) { + *port = rs->rs_bound_port; + ret = 0; + rdsdebug("rs %p binding to %pI4:%d\n", + rs, &addr, (int)ntohs(*port)); + break; + } + } while (rover++ != last); + + spin_unlock_irqrestore(&rds_bind_lock, flags); + + return ret; +} + +void rds_remove_bound(struct rds_sock *rs) +{ + unsigned long flags; + + spin_lock_irqsave(&rds_bind_lock, flags); + + if (rs->rs_bound_addr) { + rdsdebug("rs %p unbinding from %pI4:%d\n", + rs, &rs->rs_bound_addr, + ntohs(rs->rs_bound_port)); + + hlist_del_init_rcu(&rs->rs_bound_node); + rds_sock_put(rs); + rs->rs_bound_addr = 0; + } + + spin_unlock_irqrestore(&rds_bind_lock, flags); +} + +int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + struct rds_sock *rs = rds_sk_to_rs(sk); + struct rds_transport *trans; + int ret = 0; + + lock_sock(sk); + + if (addr_len != sizeof(struct sockaddr_in) || + sin->sin_family != AF_INET || + rs->rs_bound_addr || + sin->sin_addr.s_addr == htonl(INADDR_ANY)) { + ret = -EINVAL; + goto out; + } + + ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port); + if (ret) + goto out; + + trans = rds_trans_get_preferred(sin->sin_addr.s_addr); + if (!trans) { + ret = -EADDRNOTAVAIL; + rds_remove_bound(rs); + if (printk_ratelimit()) + printk(KERN_INFO "RDS: rds_bind() could not find a transport, " + "load rds_tcp or rds_rdma?\n"); + goto out; + } + + rs->rs_transport = trans; + ret = 0; + +out: + release_sock(sk); + + /* we might have called rds_remove_bound on error */ + if (ret) + synchronize_rcu(); + return ret; +} diff --git a/net/rds/cong.c b/net/rds/cong.c new file mode 100644 index 00000000..6daaa49d --- /dev/null +++ b/net/rds/cong.c @@ -0,0 +1,405 @@ +/* + * Copyright (c) 2007 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
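rds_add_bound() above chooses an ephemeral port by starting a 16-bit "rover" at a random value (clamped to at least 2), sweeping the whole port space once while skipping 0, and binding the first value whose (addr, port) pair is not already hashed; an explicit non-zero port is tried exactly once. A simplified sketch with a hypothetical is_taken() standing in for rds_bind_lookup():

/* Illustrative only: rotating search over the 16-bit port space. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static bool is_taken(uint16_t port)         /* stand-in for rds_bind_lookup() */
{
    return port < 4096;                     /* pretend the low ports are busy */
}

/* returns 0 and fills *chosen, or -1 when everything is in use (-EADDRINUSE) */
static int pick_port(uint16_t requested, uint16_t *chosen)
{
    uint16_t rover, last;

    if (requested) {                        /* explicit bind: try only that port */
        rover = requested;
        last = rover;
    } else {                                /* ephemeral: random start, full sweep */
        rover = (uint16_t)rand();
        if (rover < 2)                      /* clamp, like max_t(u16, .., 2) above */
            rover = 2;
        last = rover - 1;
    }

    do {
        if (rover == 0)                     /* port 0 is never handed out */
            rover++;
        if (!is_taken(rover)) {
            *chosen = rover;
            return 0;
        }
    } while (rover++ != last);              /* stop after wrapping back around */

    return -1;
}

int main(void)
{
    uint16_t port;

    if (pick_port(0, &port) == 0)
        printf("bound to port %u\n", port);
    return 0;
}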
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include + +#include "rds.h" + +/* + * This file implements the receive side of the unconventional congestion + * management in RDS. + * + * Messages waiting in the receive queue on the receiving socket are accounted + * against the sockets SO_RCVBUF option value. Only the payload bytes in the + * message are accounted for. If the number of bytes queued equals or exceeds + * rcvbuf then the socket is congested. All sends attempted to this socket's + * address should return block or return -EWOULDBLOCK. + * + * Applications are expected to be reasonably tuned such that this situation + * very rarely occurs. An application encountering this "back-pressure" is + * considered a bug. + * + * This is implemented by having each node maintain bitmaps which indicate + * which ports on bound addresses are congested. As the bitmap changes it is + * sent through all the connections which terminate in the local address of the + * bitmap which changed. + * + * The bitmaps are allocated as connections are brought up. This avoids + * allocation in the interrupt handling path which queues messages on sockets. + * The dense bitmaps let transports send the entire bitmap on any bitmap change + * reasonably efficiently. This is much easier to implement than some + * finer-grained communication of per-port congestion. The sender does a very + * inexpensive bit test to test if the port it's about to send to is congested + * or not. + */ + +/* + * Interaction with poll is a tad tricky. We want all processes stuck in + * poll to wake up and check whether a congested destination became uncongested. + * The really sad thing is we have no idea which destinations the application + * wants to send to - we don't even know which rds_connections are involved. + * So until we implement a more flexible rds poll interface, we have to make + * do with this: + * We maintain a global counter that is incremented each time a congestion map + * update is received. Each rds socket tracks this value, and if rds_poll + * finds that the saved generation number is smaller than the global generation + * number, it wakes up the process. + */ +static atomic_t rds_cong_generation = ATOMIC_INIT(0); + +/* + * Congestion monitoring + */ +static LIST_HEAD(rds_cong_monitor); +static DEFINE_RWLOCK(rds_cong_monitor_lock); + +/* + * Yes, a global lock. It's used so infrequently that it's worth keeping it + * global to simplify the locking. It's only used in the following + * circumstances: + * + * - on connection buildup to associate a conn with its maps + * - on map changes to inform conns of a new map to send + * + * It's sadly ordered under the socket callback lock and the connection lock. + * Receive paths can mark ports congested from interrupt context so the + * lock masks interrupts. 
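The poll interaction described above boils down to one global generation counter: every congestion-map update increments it, each socket remembers the last value it observed, and rds_poll() treats any difference as "something may have become uncongested, re-check". A minimal sketch of that check (rds_cong_updated_since() later in this file is the in-kernel version):

/* Illustrative only: one global generation counter consumed per socket. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_ulong cong_generation;        /* bumped on every map update */

static void map_updated(void)
{
    atomic_fetch_add(&cong_generation, 1);  /* real code also wakes the poll waitqueue */
}

/* 1 if anything changed since *recent (and remember the new value), else 0 */
static int updated_since(unsigned long *recent)
{
    unsigned long gen = atomic_load(&cong_generation);

    if (*recent == gen)
        return 0;
    *recent = gen;
    return 1;
}

int main(void)
{
    unsigned long seen = atomic_load(&cong_generation);
    int first, second;

    map_updated();
    first = updated_since(&seen);           /* 1: the counter moved */
    second = updated_since(&seen);          /* 0: nothing new since */
    printf("%d %d\n", first, second);
    return 0;
}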
+ */ +static DEFINE_SPINLOCK(rds_cong_lock); +static struct rb_root rds_cong_tree = RB_ROOT; + +static struct rds_cong_map *rds_cong_tree_walk(__be32 addr, + struct rds_cong_map *insert) +{ + struct rb_node **p = &rds_cong_tree.rb_node; + struct rb_node *parent = NULL; + struct rds_cong_map *map; + + while (*p) { + parent = *p; + map = rb_entry(parent, struct rds_cong_map, m_rb_node); + + if (addr < map->m_addr) + p = &(*p)->rb_left; + else if (addr > map->m_addr) + p = &(*p)->rb_right; + else + return map; + } + + if (insert) { + rb_link_node(&insert->m_rb_node, parent, p); + rb_insert_color(&insert->m_rb_node, &rds_cong_tree); + } + return NULL; +} + +/* + * There is only ever one bitmap for any address. Connections try and allocate + * these bitmaps in the process getting pointers to them. The bitmaps are only + * ever freed as the module is removed after all connections have been freed. + */ +static struct rds_cong_map *rds_cong_from_addr(__be32 addr) +{ + struct rds_cong_map *map; + struct rds_cong_map *ret = NULL; + unsigned long zp; + unsigned long i; + unsigned long flags; + + map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL); + if (!map) + return NULL; + + map->m_addr = addr; + init_waitqueue_head(&map->m_waitq); + INIT_LIST_HEAD(&map->m_conn_list); + + for (i = 0; i < RDS_CONG_MAP_PAGES; i++) { + zp = get_zeroed_page(GFP_KERNEL); + if (zp == 0) + goto out; + map->m_page_addrs[i] = zp; + } + + spin_lock_irqsave(&rds_cong_lock, flags); + ret = rds_cong_tree_walk(addr, map); + spin_unlock_irqrestore(&rds_cong_lock, flags); + + if (!ret) { + ret = map; + map = NULL; + } + +out: + if (map) { + for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++) + free_page(map->m_page_addrs[i]); + kfree(map); + } + + rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr)); + + return ret; +} + +/* + * Put the conn on its local map's list. This is called when the conn is + * really added to the hash. It's nested under the rds_conn_lock, sadly. 
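rds_cong_tree_walk() just below is a combined lookup/insert: the walk keeps the link pointer as it descends, returns an existing map on an address match, and otherwise links the caller's candidate at the point where the search fell off the tree. The same shape over a plain (unbalanced) binary search tree, leaving out the rb-tree recoloring:

/* Illustrative only: combined find-or-insert walk (no rebalancing). */
#include <stdint.h>
#include <stdio.h>

struct node {
    uint32_t addr;
    struct node *left, *right;
};

/* Returns the existing node for addr, or NULL after linking 'insert'
 * (when non-NULL) at the position where the search ended. */
static struct node *tree_walk(struct node **root, uint32_t addr,
                              struct node *insert)
{
    struct node **p = root;

    while (*p) {
        if (addr < (*p)->addr)
            p = &(*p)->left;
        else if (addr > (*p)->addr)
            p = &(*p)->right;
        else
            return *p;                      /* already present */
    }
    if (insert)
        *p = insert;                        /* rb_link_node()/rb_insert_color() here */
    return NULL;
}

int main(void)
{
    struct node *root = NULL;
    struct node a = { .addr = 10 }, b = { .addr = 10 };
    struct node *first = tree_walk(&root, 10, &a);  /* NULL: 'a' was inserted */
    struct node *again = tree_walk(&root, 10, &b);  /* &a: found, 'b' not linked */

    printf("%d %d\n", first == NULL, again == &a);
    return 0;
}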
+ */ +void rds_cong_add_conn(struct rds_connection *conn) +{ + unsigned long flags; + + rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong); + spin_lock_irqsave(&rds_cong_lock, flags); + list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list); + spin_unlock_irqrestore(&rds_cong_lock, flags); +} + +void rds_cong_remove_conn(struct rds_connection *conn) +{ + unsigned long flags; + + rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong); + spin_lock_irqsave(&rds_cong_lock, flags); + list_del_init(&conn->c_map_item); + spin_unlock_irqrestore(&rds_cong_lock, flags); +} + +int rds_cong_get_maps(struct rds_connection *conn) +{ + conn->c_lcong = rds_cong_from_addr(conn->c_laddr); + conn->c_fcong = rds_cong_from_addr(conn->c_faddr); + + if (!(conn->c_lcong && conn->c_fcong)) + return -ENOMEM; + + return 0; +} + +void rds_cong_queue_updates(struct rds_cong_map *map) +{ + struct rds_connection *conn; + unsigned long flags; + + spin_lock_irqsave(&rds_cong_lock, flags); + + list_for_each_entry(conn, &map->m_conn_list, c_map_item) { + if (!test_and_set_bit(0, &conn->c_map_queued)) { + rds_stats_inc(s_cong_update_queued); + rds_send_xmit(conn); + } + } + + spin_unlock_irqrestore(&rds_cong_lock, flags); +} + +void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask) +{ + rdsdebug("waking map %p for %pI4\n", + map, &map->m_addr); + rds_stats_inc(s_cong_update_received); + atomic_inc(&rds_cong_generation); + if (waitqueue_active(&map->m_waitq)) + wake_up(&map->m_waitq); + if (waitqueue_active(&rds_poll_waitq)) + wake_up_all(&rds_poll_waitq); + + if (portmask && !list_empty(&rds_cong_monitor)) { + unsigned long flags; + struct rds_sock *rs; + + read_lock_irqsave(&rds_cong_monitor_lock, flags); + list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) { + spin_lock(&rs->rs_lock); + rs->rs_cong_notify |= (rs->rs_cong_mask & portmask); + rs->rs_cong_mask &= ~portmask; + spin_unlock(&rs->rs_lock); + if (rs->rs_cong_notify) + rds_wake_sk_sleep(rs); + } + read_unlock_irqrestore(&rds_cong_monitor_lock, flags); + } +} +EXPORT_SYMBOL_GPL(rds_cong_map_updated); + +int rds_cong_updated_since(unsigned long *recent) +{ + unsigned long gen = atomic_read(&rds_cong_generation); + + if (likely(*recent == gen)) + return 0; + *recent = gen; + return 1; +} + +/* + * We're called under the locking that protects the sockets receive buffer + * consumption. This makes it a lot easier for the caller to only call us + * when it knows that an existing set bit needs to be cleared, and vice versa. + * We can't block and we need to deal with concurrent sockets working against + * the same per-address map. 
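The set/clear/test helpers that follow locate a port's bit in the multi-page congestion bitmap by splitting the 16-bit port number into a page index and a bit offset within that page. A standalone sketch of the arithmetic, with an assumed page size standing in for RDS_CONG_MAP_PAGE_BITS (defined in rds.h, not in this hunk):

/* Illustrative only: 16-bit port -> (bitmap page, bit offset). */
#include <stdint.h>
#include <stdio.h>

#define MAP_PAGE_BITS (4096 * 8)            /* assumed stand-in for RDS_CONG_MAP_PAGE_BITS */

static void port_to_bit(uint16_t port, unsigned int *page, unsigned int *off)
{
    *page = port / MAP_PAGE_BITS;           /* which page of the bitmap */
    *off = port % MAP_PAGE_BITS;            /* which bit within that page */
}

int main(void)
{
    unsigned int page, off;

    port_to_bit(40000, &page, &off);
    printf("port 40000 -> page %u, bit %u\n", page, off);   /* page 1, bit 7232 */
    return 0;
}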
+ */ +void rds_cong_set_bit(struct rds_cong_map *map, __be16 port) +{ + unsigned long i; + unsigned long off; + + rdsdebug("setting congestion for %pI4:%u in map %p\n", + &map->m_addr, ntohs(port), map); + + i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; + off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; + + __set_bit_le(off, (void *)map->m_page_addrs[i]); +} + +void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) +{ + unsigned long i; + unsigned long off; + + rdsdebug("clearing congestion for %pI4:%u in map %p\n", + &map->m_addr, ntohs(port), map); + + i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; + off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; + + __clear_bit_le(off, (void *)map->m_page_addrs[i]); +} + +static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port) +{ + unsigned long i; + unsigned long off; + + i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; + off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; + + return test_bit_le(off, (void *)map->m_page_addrs[i]); +} + +void rds_cong_add_socket(struct rds_sock *rs) +{ + unsigned long flags; + + write_lock_irqsave(&rds_cong_monitor_lock, flags); + if (list_empty(&rs->rs_cong_list)) + list_add(&rs->rs_cong_list, &rds_cong_monitor); + write_unlock_irqrestore(&rds_cong_monitor_lock, flags); +} + +void rds_cong_remove_socket(struct rds_sock *rs) +{ + unsigned long flags; + struct rds_cong_map *map; + + write_lock_irqsave(&rds_cong_monitor_lock, flags); + list_del_init(&rs->rs_cong_list); + write_unlock_irqrestore(&rds_cong_monitor_lock, flags); + + /* update congestion map for now-closed port */ + spin_lock_irqsave(&rds_cong_lock, flags); + map = rds_cong_tree_walk(rs->rs_bound_addr, NULL); + spin_unlock_irqrestore(&rds_cong_lock, flags); + + if (map && rds_cong_test_bit(map, rs->rs_bound_port)) { + rds_cong_clear_bit(map, rs->rs_bound_port); + rds_cong_queue_updates(map); + } +} + +int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, + struct rds_sock *rs) +{ + if (!rds_cong_test_bit(map, port)) + return 0; + if (nonblock) { + if (rs && rs->rs_cong_monitor) { + unsigned long flags; + + /* It would have been nice to have an atomic set_bit on + * a uint64_t. */ + spin_lock_irqsave(&rs->rs_lock, flags); + rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port)); + spin_unlock_irqrestore(&rs->rs_lock, flags); + + /* Test again - a congestion update may have arrived in + * the meantime. */ + if (!rds_cong_test_bit(map, port)) + return 0; + } + rds_stats_inc(s_cong_send_error); + return -ENOBUFS; + } + + rds_stats_inc(s_cong_send_blocked); + rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port)); + + return wait_event_interruptible(map->m_waitq, + !rds_cong_test_bit(map, port)); +} + +void rds_cong_exit(void) +{ + struct rb_node *node; + struct rds_cong_map *map; + unsigned long i; + + while ((node = rb_first(&rds_cong_tree))) { + map = rb_entry(node, struct rds_cong_map, m_rb_node); + rdsdebug("freeing map %p\n", map); + rb_erase(&map->m_rb_node, &rds_cong_tree); + for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++) + free_page(map->m_page_addrs[i]); + kfree(map); + } +} + +/* + * Allocate a RDS message containing a congestion update. 
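For a non-blocking send to a congested port, rds_cong_wait() above records the port in the socket's 64-bit monitor mask before failing with -ENOBUFS, so a later map update can flag exactly the ports the application was refused on. RDS_CONG_MONITOR_MASK() itself lives in rds.h; the sketch below assumes it simply folds the port into 64 bits:

/* Illustrative only: 64-bit monitor mask, assuming a mod-64 fold of the port. */
#include <stdint.h>
#include <stdio.h>

#define MONITOR_BIT(port) (1ULL << ((port) % 64))   /* assumption, see rds.h */

struct sock_cong {
    uint64_t mask;          /* ports this socket was refused on */
    uint64_t notify;        /* ports that have since become uncongested */
};

static void send_refused(struct sock_cong *sc, uint16_t port)
{
    sc->mask |= MONITOR_BIT(port);          /* remember before returning -ENOBUFS */
}

static void map_updated(struct sock_cong *sc, uint64_t portmask)
{
    sc->notify |= sc->mask & portmask;      /* notify only matching interests */
    sc->mask &= ~portmask;
}

int main(void)
{
    struct sock_cong sc = { 0, 0 };

    send_refused(&sc, 4000);
    map_updated(&sc, MONITOR_BIT(4000));
    printf("notify mask %#llx\n", (unsigned long long)sc.notify);
    return 0;
}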
+ */ +struct rds_message *rds_cong_update_alloc(struct rds_connection *conn) +{ + struct rds_cong_map *map = conn->c_lcong; + struct rds_message *rm; + + rm = rds_message_map_pages(map->m_page_addrs, RDS_CONG_MAP_BYTES); + if (!IS_ERR(rm)) + rm->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP; + + return rm; +} diff --git a/net/rds/connection.c b/net/rds/connection.c new file mode 100644 index 00000000..9334d892 --- /dev/null +++ b/net/rds/connection.c @@ -0,0 +1,573 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include + +#include "rds.h" +#include "loop.h" + +#define RDS_CONNECTION_HASH_BITS 12 +#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) +#define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1) + +/* converting this to RCU is a chore for another day.. */ +static DEFINE_SPINLOCK(rds_conn_lock); +static unsigned long rds_conn_count; +static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES]; +static struct kmem_cache *rds_conn_slab; + +static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) +{ + /* Pass NULL, don't need struct net for hash */ + unsigned long hash = inet_ehashfn(NULL, + be32_to_cpu(laddr), 0, + be32_to_cpu(faddr), 0); + return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK]; +} + +#define rds_conn_info_set(var, test, suffix) do { \ + if (test) \ + var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ +} while (0) + +/* rcu read lock must be held or the connection spinlock */ +static struct rds_connection *rds_conn_lookup(struct hlist_head *head, + __be32 laddr, __be32 faddr, + struct rds_transport *trans) +{ + struct rds_connection *conn, *ret = NULL; + struct hlist_node *pos; + + hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) { + if (conn->c_faddr == faddr && conn->c_laddr == laddr && + conn->c_trans == trans) { + ret = conn; + break; + } + } + rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret, + &laddr, &faddr); + return ret; +} + +/* + * This is called by transports as they're bringing down a connection. + * It clears partial message state so that the transport can start sending + * and receiving over this connection again in the future. 
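rds_conn_bucket() above funnels the (local, foreign) address pair into one of 4096 hash chains: inet_ehashfn() mixes the two addresses and the power-of-two mask keeps only the low bits. A sketch of that bucket selection with a stand-in mixer (the port arguments to inet_ehashfn() are zero in the real call):

/* Illustrative only: two-address hash bucket behind a power-of-two mask. */
#include <stdint.h>
#include <stdio.h>

#define CONN_HASH_BITS    12
#define CONN_HASH_ENTRIES (1u << CONN_HASH_BITS)
#define CONN_HASH_MASK    (CONN_HASH_ENTRIES - 1)

/* stand-in for inet_ehashfn(NULL, laddr, 0, faddr, 0) */
static uint32_t mix(uint32_t laddr, uint32_t faddr)
{
    uint32_t h = laddr * 0x9e3779b1u ^ faddr * 0xc2b2ae35u;
    return h ^ (h >> 15);
}

static unsigned int conn_bucket(uint32_t laddr, uint32_t faddr)
{
    return mix(laddr, faddr) & CONN_HASH_MASK;      /* 0 .. 4095 */
}

int main(void)
{
    printf("bucket %u of %u\n",
           conn_bucket(0x0a000001u, 0x0a000002u), CONN_HASH_ENTRIES);
    return 0;
}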
It is up to + * the transport to have serialized this call with its send and recv. + */ +static void rds_conn_reset(struct rds_connection *conn) +{ + rdsdebug("connection %pI4 to %pI4 reset\n", + &conn->c_laddr, &conn->c_faddr); + + rds_stats_inc(s_conn_reset); + rds_send_reset(conn); + conn->c_flags = 0; + + /* Do not clear next_rx_seq here, else we cannot distinguish + * retransmitted packets from new packets, and will hand all + * of them to the application. That is not consistent with the + * reliability guarantees of RDS. */ +} + +/* + * There is only every one 'conn' for a given pair of addresses in the + * system at a time. They contain messages to be retransmitted and so + * span the lifetime of the actual underlying transport connections. + * + * For now they are not garbage collected once they're created. They + * are torn down as the module is removed, if ever. + */ +static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp, + int is_outgoing) +{ + struct rds_connection *conn, *parent = NULL; + struct hlist_head *head = rds_conn_bucket(laddr, faddr); + struct rds_transport *loop_trans; + unsigned long flags; + int ret; + + rcu_read_lock(); + conn = rds_conn_lookup(head, laddr, faddr, trans); + if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && + !is_outgoing) { + /* This is a looped back IB connection, and we're + * called by the code handling the incoming connect. + * We need a second connection object into which we + * can stick the other QP. */ + parent = conn; + conn = parent->c_passive; + } + rcu_read_unlock(); + if (conn) + goto out; + + conn = kmem_cache_zalloc(rds_conn_slab, gfp); + if (!conn) { + conn = ERR_PTR(-ENOMEM); + goto out; + } + + INIT_HLIST_NODE(&conn->c_hash_node); + conn->c_laddr = laddr; + conn->c_faddr = faddr; + spin_lock_init(&conn->c_lock); + conn->c_next_tx_seq = 1; + + init_waitqueue_head(&conn->c_waitq); + INIT_LIST_HEAD(&conn->c_send_queue); + INIT_LIST_HEAD(&conn->c_retrans); + + ret = rds_cong_get_maps(conn); + if (ret) { + kmem_cache_free(rds_conn_slab, conn); + conn = ERR_PTR(ret); + goto out; + } + + /* + * This is where a connection becomes loopback. If *any* RDS sockets + * can bind to the destination address then we'd rather the messages + * flow through loopback rather than either transport. + */ + loop_trans = rds_trans_get_preferred(faddr); + if (loop_trans) { + rds_trans_put(loop_trans); + conn->c_loopback = 1; + if (is_outgoing && trans->t_prefer_loopback) { + /* "outgoing" connection - and the transport + * says it wants the connection handled by the + * loopback transport. This is what TCP does. + */ + trans = &rds_loop_transport; + } + } + + conn->c_trans = trans; + + ret = trans->conn_alloc(conn, gfp); + if (ret) { + kmem_cache_free(rds_conn_slab, conn); + conn = ERR_PTR(ret); + goto out; + } + + atomic_set(&conn->c_state, RDS_CONN_DOWN); + conn->c_reconnect_jiffies = 0; + INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker); + INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker); + INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker); + INIT_WORK(&conn->c_down_w, rds_shutdown_worker); + mutex_init(&conn->c_cm_lock); + conn->c_flags = 0; + + rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n", + conn, &laddr, &faddr, + trans->t_name ? trans->t_name : "[unknown]", + is_outgoing ? "(outgoing)" : ""); + + /* + * Since we ran without holding the conn lock, someone could + * have created the same conn (either normal or passive) in the + * interim. 
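This comment is the heart of __rds_conn_create(): the connection is allocated and fully initialized without holding rds_conn_lock, the hash is re-checked under the lock, and whichever path lost the race frees its copy and adopts the one already published. A compact userspace sketch of that "allocate optimistically, re-check, roll back" shape, using a mutex and a single-slot registry as stand-ins for the lock and the hash chain:

/* Illustrative only: allocate without the lock, re-check under it, roll back. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct conn { int key; };

static struct conn *registry;                       /* stand-in for the hash chain */
static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER;

static struct conn *conn_get(int key)
{
    struct conn *fresh, *winner;

    if (registry && registry->key == key)           /* unlocked fast path (RCU in RDS) */
        return registry;

    fresh = calloc(1, sizeof(*fresh));              /* expensive setup, done unlocked */
    if (!fresh)
        return NULL;
    fresh->key = key;

    pthread_mutex_lock(&registry_lock);             /* now re-check while serialized */
    if (registry && registry->key == key) {
        winner = registry;                          /* lost the race: discard our copy */
        free(fresh);
    } else {
        registry = fresh;                           /* won: publish our copy */
        winner = fresh;
    }
    pthread_mutex_unlock(&registry_lock);
    return winner;
}

int main(void)
{
    struct conn *c1 = conn_get(7);
    struct conn *c2 = conn_get(7);

    printf("same object: %d\n", c1 == c2);          /* 1 */
    return 0;
}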
We check while holding the lock. If we won, we complete + * init and return our conn. If we lost, we rollback and return the + * other one. + */ + spin_lock_irqsave(&rds_conn_lock, flags); + if (parent) { + /* Creating passive conn */ + if (parent->c_passive) { + trans->conn_free(conn->c_transport_data); + kmem_cache_free(rds_conn_slab, conn); + conn = parent->c_passive; + } else { + parent->c_passive = conn; + rds_cong_add_conn(conn); + rds_conn_count++; + } + } else { + /* Creating normal conn */ + struct rds_connection *found; + + found = rds_conn_lookup(head, laddr, faddr, trans); + if (found) { + trans->conn_free(conn->c_transport_data); + kmem_cache_free(rds_conn_slab, conn); + conn = found; + } else { + hlist_add_head_rcu(&conn->c_hash_node, head); + rds_cong_add_conn(conn); + rds_conn_count++; + } + } + spin_unlock_irqrestore(&rds_conn_lock, flags); + +out: + return conn; +} + +struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp) +{ + return __rds_conn_create(laddr, faddr, trans, gfp, 0); +} +EXPORT_SYMBOL_GPL(rds_conn_create); + +struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp) +{ + return __rds_conn_create(laddr, faddr, trans, gfp, 1); +} +EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); + +void rds_conn_shutdown(struct rds_connection *conn) +{ + /* shut it down unless it's down already */ + if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) { + /* + * Quiesce the connection mgmt handlers before we start tearing + * things down. We don't hold the mutex for the entire + * duration of the shutdown operation, else we may be + * deadlocking with the CM handler. Instead, the CM event + * handler is supposed to check for state DISCONNECTING + */ + mutex_lock(&conn->c_cm_lock); + if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) + && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) { + rds_conn_error(conn, "shutdown called in state %d\n", + atomic_read(&conn->c_state)); + mutex_unlock(&conn->c_cm_lock); + return; + } + mutex_unlock(&conn->c_cm_lock); + + wait_event(conn->c_waitq, + !test_bit(RDS_IN_XMIT, &conn->c_flags)); + + conn->c_trans->conn_shutdown(conn); + rds_conn_reset(conn); + + if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) { + /* This can happen - eg when we're in the middle of tearing + * down the connection, and someone unloads the rds module. + * Quite reproduceable with loopback connections. + * Mostly harmless. + */ + rds_conn_error(conn, + "%s: failed to transition to state DOWN, " + "current state is %d\n", + __func__, + atomic_read(&conn->c_state)); + return; + } + } + + /* Then reconnect if it's still live. + * The passive side of an IB loopback connection is never added + * to the conn hash, so we never trigger a reconnect on this + * conn - the reconnect is always triggered by the active peer. */ + cancel_delayed_work_sync(&conn->c_conn_w); + rcu_read_lock(); + if (!hlist_unhashed(&conn->c_hash_node)) { + rcu_read_unlock(); + rds_queue_reconnect(conn); + } else { + rcu_read_unlock(); + } +} + +/* + * Stop and free a connection. + * + * This can only be used in very limited circumstances. It assumes that once + * the conn has been shutdown that no one else is referencing the connection. + * We can only ensure this in the rmmod path in the current code. 
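rds_conn_shutdown() above only advances the connection when the current state matches what it expects, via rds_conn_transition() (declared in rds.h); the behaviour is that of a compare-and-swap on the atomic state word. A sketch of such a guarded transition along the shutdown path:

/* Illustrative only: compare-and-swap transition on an atomic state word. */
#include <stdatomic.h>
#include <stdio.h>

enum { CONN_DOWN, CONN_CONNECTING, CONN_UP, CONN_DISCONNECTING, CONN_ERROR };

static atomic_int conn_state = CONN_UP;

/* 1 if the state moved from 'prev' to 'next', 0 if someone else got there first */
static int conn_transition(int prev, int next)
{
    return atomic_compare_exchange_strong(&conn_state, &prev, next);
}

int main(void)
{
    /* shutdown path: UP -> DISCONNECTING, or ERROR -> DISCONNECTING */
    if (!conn_transition(CONN_UP, CONN_DISCONNECTING) &&
        !conn_transition(CONN_ERROR, CONN_DISCONNECTING)) {
        printf("unexpected state %d\n", atomic_load(&conn_state));
        return 1;
    }

    /* ... the transport teardown would run here ... */

    printf("reached DOWN: %d\n",
           conn_transition(CONN_DISCONNECTING, CONN_DOWN));     /* 1 */
    return 0;
}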
+ */ +void rds_conn_destroy(struct rds_connection *conn) +{ + struct rds_message *rm, *rtmp; + unsigned long flags; + + rdsdebug("freeing conn %p for %pI4 -> " + "%pI4\n", conn, &conn->c_laddr, + &conn->c_faddr); + + /* Ensure conn will not be scheduled for reconnect */ + spin_lock_irq(&rds_conn_lock); + hlist_del_init_rcu(&conn->c_hash_node); + spin_unlock_irq(&rds_conn_lock); + synchronize_rcu(); + + /* shut the connection down */ + rds_conn_drop(conn); + flush_work(&conn->c_down_w); + + /* make sure lingering queued work won't try to ref the conn */ + cancel_delayed_work_sync(&conn->c_send_w); + cancel_delayed_work_sync(&conn->c_recv_w); + + /* tear down queued messages */ + list_for_each_entry_safe(rm, rtmp, + &conn->c_send_queue, + m_conn_item) { + list_del_init(&rm->m_conn_item); + BUG_ON(!list_empty(&rm->m_sock_item)); + rds_message_put(rm); + } + if (conn->c_xmit_rm) + rds_message_put(conn->c_xmit_rm); + + conn->c_trans->conn_free(conn->c_transport_data); + + /* + * The congestion maps aren't freed up here. They're + * freed by rds_cong_exit() after all the connections + * have been freed. + */ + rds_cong_remove_conn(conn); + + BUG_ON(!list_empty(&conn->c_retrans)); + kmem_cache_free(rds_conn_slab, conn); + + spin_lock_irqsave(&rds_conn_lock, flags); + rds_conn_count--; + spin_unlock_irqrestore(&rds_conn_lock, flags); +} +EXPORT_SYMBOL_GPL(rds_conn_destroy); + +static void rds_conn_message_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int want_send) +{ + struct hlist_head *head; + struct hlist_node *pos; + struct list_head *list; + struct rds_connection *conn; + struct rds_message *rm; + unsigned int total = 0; + unsigned long flags; + size_t i; + + len /= sizeof(struct rds_info_message); + + rcu_read_lock(); + + for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); + i++, head++) { + hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) { + if (want_send) + list = &conn->c_send_queue; + else + list = &conn->c_retrans; + + spin_lock_irqsave(&conn->c_lock, flags); + + /* XXX too lazy to maintain counts.. */ + list_for_each_entry(rm, list, m_conn_item) { + total++; + if (total <= len) + rds_inc_info_copy(&rm->m_inc, iter, + conn->c_laddr, + conn->c_faddr, 0); + } + + spin_unlock_irqrestore(&conn->c_lock, flags); + } + } + rcu_read_unlock(); + + lens->nr = total; + lens->each = sizeof(struct rds_info_message); +} + +static void rds_conn_message_info_send(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_conn_message_info(sock, len, iter, lens, 1); +} + +static void rds_conn_message_info_retrans(struct socket *sock, + unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_conn_message_info(sock, len, iter, lens, 0); +} + +void rds_for_each_conn_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int (*visitor)(struct rds_connection *, void *), + size_t item_len) +{ + uint64_t buffer[(item_len + 7) / 8]; + struct hlist_head *head; + struct hlist_node *pos; + struct rds_connection *conn; + size_t i; + + rcu_read_lock(); + + lens->nr = 0; + lens->each = item_len; + + for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); + i++, head++) { + hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) { + + /* XXX no c_lock usage.. 
*/ + if (!visitor(conn, buffer)) + continue; + + /* We copy as much as we can fit in the buffer, + * but we count all items so that the caller + * can resize the buffer. */ + if (len >= item_len) { + rds_info_copy(iter, buffer, item_len); + len -= item_len; + } + lens->nr++; + } + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(rds_for_each_conn_info); + +static int rds_conn_info_visitor(struct rds_connection *conn, + void *buffer) +{ + struct rds_info_connection *cinfo = buffer; + + cinfo->next_tx_seq = conn->c_next_tx_seq; + cinfo->next_rx_seq = conn->c_next_rx_seq; + cinfo->laddr = conn->c_laddr; + cinfo->faddr = conn->c_faddr; + strncpy(cinfo->transport, conn->c_trans->t_name, + sizeof(cinfo->transport)); + cinfo->flags = 0; + + rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags), + SENDING); + /* XXX Future: return the state rather than these funky bits */ + rds_conn_info_set(cinfo->flags, + atomic_read(&conn->c_state) == RDS_CONN_CONNECTING, + CONNECTING); + rds_conn_info_set(cinfo->flags, + atomic_read(&conn->c_state) == RDS_CONN_UP, + CONNECTED); + return 1; +} + +static void rds_conn_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_for_each_conn_info(sock, len, iter, lens, + rds_conn_info_visitor, + sizeof(struct rds_info_connection)); +} + +int rds_conn_init(void) +{ + rds_conn_slab = kmem_cache_create("rds_connection", + sizeof(struct rds_connection), + 0, 0, NULL); + if (!rds_conn_slab) + return -ENOMEM; + + rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); + rds_info_register_func(RDS_INFO_SEND_MESSAGES, + rds_conn_message_info_send); + rds_info_register_func(RDS_INFO_RETRANS_MESSAGES, + rds_conn_message_info_retrans); + + return 0; +} + +void rds_conn_exit(void) +{ + rds_loop_exit(); + + WARN_ON(!hlist_empty(rds_conn_hash)); + + kmem_cache_destroy(rds_conn_slab); + + rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info); + rds_info_deregister_func(RDS_INFO_SEND_MESSAGES, + rds_conn_message_info_send); + rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES, + rds_conn_message_info_retrans); +} + +/* + * Force a disconnect + */ +void rds_conn_drop(struct rds_connection *conn) +{ + atomic_set(&conn->c_state, RDS_CONN_ERROR); + queue_work(rds_wq, &conn->c_down_w); +} +EXPORT_SYMBOL_GPL(rds_conn_drop); + +/* + * If the connection is down, trigger a connect. We may have scheduled a + * delayed reconnect however - in this case we should not interfere. + */ +void rds_conn_connect_if_down(struct rds_connection *conn) +{ + if (rds_conn_state(conn) == RDS_CONN_DOWN && + !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) + queue_delayed_work(rds_wq, &conn->c_conn_w, 0); +} +EXPORT_SYMBOL_GPL(rds_conn_connect_if_down); + +/* + * An error occurred on the connection + */ +void +__rds_conn_error(struct rds_connection *conn, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vprintk(fmt, ap); + va_end(ap); + + rds_conn_drop(conn); +} diff --git a/net/rds/ib.c b/net/rds/ib.c new file mode 100644 index 00000000..3b83086b --- /dev/null +++ b/net/rds/ib.c @@ -0,0 +1,433 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rds.h" +#include "ib.h" + +static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; +unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */ +unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; + +module_param(fmr_pool_size, int, 0444); +MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA"); +module_param(fmr_message_size, int, 0444); +MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer"); +module_param(rds_ib_retry_count, int, 0444); +MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); + +/* + * we have a clumsy combination of RCU and a rwsem protecting this list + * because it is used both in the get_mr fast path and while blocking in + * the FMR flushing path. + */ +DECLARE_RWSEM(rds_ib_devices_lock); +struct list_head rds_ib_devices; + +/* NOTE: if also grabbing ibdev lock, grab this first */ +DEFINE_SPINLOCK(ib_nodev_conns_lock); +LIST_HEAD(ib_nodev_conns); + +static void rds_ib_nodev_connect(void) +{ + struct rds_ib_connection *ic; + + spin_lock(&ib_nodev_conns_lock); + list_for_each_entry(ic, &ib_nodev_conns, ib_node) + rds_conn_connect_if_down(ic->conn); + spin_unlock(&ib_nodev_conns_lock); +} + +static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev) +{ + struct rds_ib_connection *ic; + unsigned long flags; + + spin_lock_irqsave(&rds_ibdev->spinlock, flags); + list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node) + rds_conn_drop(ic->conn); + spin_unlock_irqrestore(&rds_ibdev->spinlock, flags); +} + +/* + * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references + * from interrupt context so we push freing off into a work struct in krdsd. 
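The comment above is the reason rds_ib_dev_put() never frees inline: the last reference can be dropped from interrupt context, where destroying the FMR pool would block, so the release path only schedules the free_work item and krdsd does the teardown later. A userspace sketch of that refcount-plus-deferred-free split, with a plain function call standing in for queue_work():

/* Illustrative only: drop the last reference, but defer the blocking teardown. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct dev {
    atomic_int refcount;
    /* the kernel version also embeds the free_work work_struct here */
};

static void schedule_free(struct dev *d)            /* stand-in for queue_work() */
{
    printf("teardown deferred to a worker\n");
    free(d);                                        /* krdsd would do this later */
}

static void dev_get(struct dev *d)
{
    atomic_fetch_add(&d->refcount, 1);
}

static void dev_put(struct dev *d)
{
    /* last reference: never free inline, only schedule the work item */
    if (atomic_fetch_sub(&d->refcount, 1) == 1)
        schedule_free(d);
}

int main(void)
{
    struct dev *d = calloc(1, sizeof(*d));

    atomic_init(&d->refcount, 1);
    dev_get(d);
    dev_put(d);                                     /* still referenced */
    dev_put(d);                                     /* last put: free is deferred */
    return 0;
}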
+ */ +static void rds_ib_dev_free(struct work_struct *work) +{ + struct rds_ib_ipaddr *i_ipaddr, *i_next; + struct rds_ib_device *rds_ibdev = container_of(work, + struct rds_ib_device, free_work); + + if (rds_ibdev->mr_pool) + rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); + if (rds_ibdev->mr) + ib_dereg_mr(rds_ibdev->mr); + if (rds_ibdev->pd) + ib_dealloc_pd(rds_ibdev->pd); + + list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { + list_del(&i_ipaddr->list); + kfree(i_ipaddr); + } + + kfree(rds_ibdev); +} + +void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) +{ + BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0); + if (atomic_dec_and_test(&rds_ibdev->refcount)) + queue_work(rds_wq, &rds_ibdev->free_work); +} + +static void rds_ib_add_one(struct ib_device *device) +{ + struct rds_ib_device *rds_ibdev; + struct ib_device_attr *dev_attr; + + /* Only handle IB (no iWARP) devices */ + if (device->node_type != RDMA_NODE_IB_CA) + return; + + dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); + if (!dev_attr) + return; + + if (ib_query_device(device, dev_attr)) { + rdsdebug("Query device failed for %s\n", device->name); + goto free_attr; + } + + rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL, + ibdev_to_node(device)); + if (!rds_ibdev) + goto free_attr; + + spin_lock_init(&rds_ibdev->spinlock); + atomic_set(&rds_ibdev->refcount, 1); + INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); + + rds_ibdev->max_wrs = dev_attr->max_qp_wr; + rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); + + rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32; + rds_ibdev->max_fmrs = dev_attr->max_fmr ? + min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : + fmr_pool_size; + + rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom; + rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom; + + rds_ibdev->dev = device; + rds_ibdev->pd = ib_alloc_pd(device); + if (IS_ERR(rds_ibdev->pd)) { + rds_ibdev->pd = NULL; + goto put_dev; + } + + rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(rds_ibdev->mr)) { + rds_ibdev->mr = NULL; + goto put_dev; + } + + rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); + if (IS_ERR(rds_ibdev->mr_pool)) { + rds_ibdev->mr_pool = NULL; + goto put_dev; + } + + INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); + INIT_LIST_HEAD(&rds_ibdev->conn_list); + + down_write(&rds_ib_devices_lock); + list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices); + up_write(&rds_ib_devices_lock); + atomic_inc(&rds_ibdev->refcount); + + ib_set_client_data(device, &rds_ib_client, rds_ibdev); + atomic_inc(&rds_ibdev->refcount); + + rds_ib_nodev_connect(); + +put_dev: + rds_ib_dev_put(rds_ibdev); +free_attr: + kfree(dev_attr); +} + +/* + * New connections use this to find the device to associate with the + * connection. It's not in the fast path so we're not concerned about the + * performance of the IB call. (As of this writing, it uses an interrupt + * blocking spinlock to serialize walking a per-device list of all registered + * clients.) + * + * RCU is used to handle incoming connections racing with device teardown. + * Rather than use a lock to serialize removal from the client_data and + * getting a new reference, we use an RCU grace period. The destruction + * path removes the device from client_data and then waits for all RCU + * readers to finish. + * + * A new connection can get NULL from this if its arriving on a + * device that is in the process of being removed. 
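rds_ib_get_client_data() just below is the reader side of that race: look the pointer up inside an RCU read-side section and take a reference only if it is still published, while removal unpublishes first, waits out a grace period, and then drops its own references. The sketch below keeps just the "ref only while published" idiom, with a mutex standing in for rcu_read_lock() and no deferred reclamation:

/* Illustrative only: take a reference only while the pointer is still published.
 * A mutex stands in for the RCU read side; real RCU also defers the reclaim. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct dev { atomic_int refcount; };

static struct dev *client_data;                     /* the published pointer */
static pthread_mutex_t guard = PTHREAD_MUTEX_INITIALIZER;

static struct dev *get_client_data(void)
{
    struct dev *d;

    pthread_mutex_lock(&guard);                     /* rcu_read_lock() in the original */
    d = client_data;
    if (d)                                          /* ref taken while still visible */
        atomic_fetch_add(&d->refcount, 1);
    pthread_mutex_unlock(&guard);
    return d;
}

static void remove_device(struct dev *d)
{
    pthread_mutex_lock(&guard);
    client_data = NULL;                             /* unpublish first ... */
    pthread_mutex_unlock(&guard);                   /* ... synchronize_rcu() here ... */
    atomic_fetch_sub(&d->refcount, 1);              /* ... then drop the device's ref */
}

int main(void)
{
    struct dev d = { 1 };

    client_data = &d;
    printf("%s\n", get_client_data() ? "found" : "gone");   /* found */
    remove_device(&d);
    printf("%s\n", get_client_data() ? "found" : "gone");   /* gone */
    return 0;
}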
+ */ +struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device) +{ + struct rds_ib_device *rds_ibdev; + + rcu_read_lock(); + rds_ibdev = ib_get_client_data(device, &rds_ib_client); + if (rds_ibdev) + atomic_inc(&rds_ibdev->refcount); + rcu_read_unlock(); + return rds_ibdev; +} + +/* + * The IB stack is letting us know that a device is going away. This can + * happen if the underlying HCA driver is removed or if PCI hotplug is removing + * the pci function, for example. + * + * This can be called at any time and can be racing with any other RDS path. + */ +static void rds_ib_remove_one(struct ib_device *device) +{ + struct rds_ib_device *rds_ibdev; + + rds_ibdev = ib_get_client_data(device, &rds_ib_client); + if (!rds_ibdev) + return; + + rds_ib_dev_shutdown(rds_ibdev); + + /* stop connection attempts from getting a reference to this device. */ + ib_set_client_data(device, &rds_ib_client, NULL); + + down_write(&rds_ib_devices_lock); + list_del_rcu(&rds_ibdev->list); + up_write(&rds_ib_devices_lock); + + /* + * This synchronize rcu is waiting for readers of both the ib + * client data and the devices list to finish before we drop + * both of those references. + */ + synchronize_rcu(); + rds_ib_dev_put(rds_ibdev); + rds_ib_dev_put(rds_ibdev); +} + +struct ib_client rds_ib_client = { + .name = "rds_ib", + .add = rds_ib_add_one, + .remove = rds_ib_remove_one +}; + +static int rds_ib_conn_info_visitor(struct rds_connection *conn, + void *buffer) +{ + struct rds_info_rdma_connection *iinfo = buffer; + struct rds_ib_connection *ic; + + /* We will only ever look at IB transports */ + if (conn->c_trans != &rds_ib_transport) + return 0; + + iinfo->src_addr = conn->c_laddr; + iinfo->dst_addr = conn->c_faddr; + + memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); + memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); + if (rds_conn_state(conn) == RDS_CONN_UP) { + struct rds_ib_device *rds_ibdev; + struct rdma_dev_addr *dev_addr; + + ic = conn->c_transport_data; + dev_addr = &ic->i_cm_id->route.addr.dev_addr; + + rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); + rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); + + rds_ibdev = ic->rds_ibdev; + iinfo->max_send_wr = ic->i_send_ring.w_nr; + iinfo->max_recv_wr = ic->i_recv_ring.w_nr; + iinfo->max_send_sge = rds_ibdev->max_sge; + rds_ib_get_mr_info(rds_ibdev, iinfo); + } + return 1; +} + +static void rds_ib_ic_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_for_each_conn_info(sock, len, iter, lens, + rds_ib_conn_info_visitor, + sizeof(struct rds_info_rdma_connection)); +} + + +/* + * Early RDS/IB was built to only bind to an address if there is an IPoIB + * device with that address set. + * + * If it were me, I'd advocate for something more flexible. Sending and + * receiving should be device-agnostic. Transports would try and maintain + * connections between peers who have messages queued. Userspace would be + * allowed to influence which paths have priority. We could call userspace + * asserting this policy "routing". + */ +static int rds_ib_laddr_check(__be32 addr) +{ + int ret; + struct rdma_cm_id *cm_id; + struct sockaddr_in sin; + + /* Create a CMA ID and try to bind it. This catches both + * IB and iWARP capable NICs. 
+ */ + cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(cm_id)) + return PTR_ERR(cm_id); + + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr; + + /* rdma_bind_addr will only succeed for IB & iWARP devices */ + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + /* due to this, we will claim to support iWARP devices unless we + check node_type. */ + if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA) + ret = -EADDRNOTAVAIL; + + rdsdebug("addr %pI4 ret %d node type %d\n", + &addr, ret, + cm_id->device ? cm_id->device->node_type : -1); + + rdma_destroy_id(cm_id); + + return ret; +} + +static void rds_ib_unregister_client(void) +{ + ib_unregister_client(&rds_ib_client); + /* wait for rds_ib_dev_free() to complete */ + flush_workqueue(rds_wq); +} + +void rds_ib_exit(void) +{ + rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); + rds_ib_unregister_client(); + rds_ib_destroy_nodev_conns(); + rds_ib_sysctl_exit(); + rds_ib_recv_exit(); + rds_trans_unregister(&rds_ib_transport); +} + +struct rds_transport rds_ib_transport = { + .laddr_check = rds_ib_laddr_check, + .xmit_complete = rds_ib_xmit_complete, + .xmit = rds_ib_xmit, + .xmit_rdma = rds_ib_xmit_rdma, + .xmit_atomic = rds_ib_xmit_atomic, + .recv = rds_ib_recv, + .conn_alloc = rds_ib_conn_alloc, + .conn_free = rds_ib_conn_free, + .conn_connect = rds_ib_conn_connect, + .conn_shutdown = rds_ib_conn_shutdown, + .inc_copy_to_user = rds_ib_inc_copy_to_user, + .inc_free = rds_ib_inc_free, + .cm_initiate_connect = rds_ib_cm_initiate_connect, + .cm_handle_connect = rds_ib_cm_handle_connect, + .cm_connect_complete = rds_ib_cm_connect_complete, + .stats_info_copy = rds_ib_stats_info_copy, + .exit = rds_ib_exit, + .get_mr = rds_ib_get_mr, + .sync_mr = rds_ib_sync_mr, + .free_mr = rds_ib_free_mr, + .flush_mrs = rds_ib_flush_mrs, + .t_owner = THIS_MODULE, + .t_name = "infiniband", + .t_type = RDS_TRANS_IB +}; + +int rds_ib_init(void) +{ + int ret; + + INIT_LIST_HEAD(&rds_ib_devices); + + ret = ib_register_client(&rds_ib_client); + if (ret) + goto out; + + ret = rds_ib_sysctl_init(); + if (ret) + goto out_ibreg; + + ret = rds_ib_recv_init(); + if (ret) + goto out_sysctl; + + ret = rds_trans_register(&rds_ib_transport); + if (ret) + goto out_recv; + + rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); + + goto out; + +out_recv: + rds_ib_recv_exit(); +out_sysctl: + rds_ib_sysctl_exit(); +out_ibreg: + rds_ib_unregister_client(); +out: + return ret; +} + +MODULE_LICENSE("GPL"); + diff --git a/net/rds/ib.h b/net/rds/ib.h new file mode 100644 index 00000000..4297d927 --- /dev/null +++ b/net/rds/ib.h @@ -0,0 +1,373 @@ +#ifndef _RDS_IB_H +#define _RDS_IB_H + +#include +#include +#include +#include +#include "rds.h" +#include "rdma_transport.h" + +#define RDS_FMR_SIZE 256 +#define RDS_FMR_POOL_SIZE 8192 + +#define RDS_IB_MAX_SGE 8 +#define RDS_IB_RECV_SGE 2 + +#define RDS_IB_DEFAULT_RECV_WR 1024 +#define RDS_IB_DEFAULT_SEND_WR 256 + +#define RDS_IB_DEFAULT_RETRY_COUNT 2 + +#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ + +#define RDS_IB_RECYCLE_BATCH_COUNT 32 + +extern struct rw_semaphore rds_ib_devices_lock; +extern struct list_head rds_ib_devices; + +/* + * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to + * try and minimize the amount of memory tied up both the device and + * socket receive queues. 
+ */ +struct rds_page_frag { + struct list_head f_item; + struct list_head f_cache_entry; + struct scatterlist f_sg; +}; + +struct rds_ib_incoming { + struct list_head ii_frags; + struct list_head ii_cache_entry; + struct rds_incoming ii_inc; +}; + +struct rds_ib_cache_head { + struct list_head *first; + unsigned long count; +}; + +struct rds_ib_refill_cache { + struct rds_ib_cache_head *percpu; + struct list_head *xfer; + struct list_head *ready; +}; + +struct rds_ib_connect_private { + /* Add new fields at the end, and don't permute existing fields. */ + __be32 dp_saddr; + __be32 dp_daddr; + u8 dp_protocol_major; + u8 dp_protocol_minor; + __be16 dp_protocol_minor_mask; /* bitmask */ + __be32 dp_reserved1; + __be64 dp_ack_seq; + __be32 dp_credit; /* non-zero enables flow ctl */ +}; + +struct rds_ib_send_work { + void *s_op; + struct ib_send_wr s_wr; + struct ib_sge s_sge[RDS_IB_MAX_SGE]; + unsigned long s_queued; +}; + +struct rds_ib_recv_work { + struct rds_ib_incoming *r_ibinc; + struct rds_page_frag *r_frag; + struct ib_recv_wr r_wr; + struct ib_sge r_sge[2]; +}; + +struct rds_ib_work_ring { + u32 w_nr; + u32 w_alloc_ptr; + u32 w_alloc_ctr; + u32 w_free_ptr; + atomic_t w_free_ctr; +}; + +struct rds_ib_device; + +struct rds_ib_connection { + + struct list_head ib_node; + struct rds_ib_device *rds_ibdev; + struct rds_connection *conn; + + /* alphabet soup, IBTA style */ + struct rdma_cm_id *i_cm_id; + struct ib_pd *i_pd; + struct ib_mr *i_mr; + struct ib_cq *i_send_cq; + struct ib_cq *i_recv_cq; + + /* tx */ + struct rds_ib_work_ring i_send_ring; + struct rm_data_op *i_data_op; + struct rds_header *i_send_hdrs; + u64 i_send_hdrs_dma; + struct rds_ib_send_work *i_sends; + atomic_t i_signaled_sends; + + /* rx */ + struct tasklet_struct i_recv_tasklet; + struct mutex i_recv_mutex; + struct rds_ib_work_ring i_recv_ring; + struct rds_ib_incoming *i_ibinc; + u32 i_recv_data_rem; + struct rds_header *i_recv_hdrs; + u64 i_recv_hdrs_dma; + struct rds_ib_recv_work *i_recvs; + u64 i_ack_recv; /* last ACK received */ + struct rds_ib_refill_cache i_cache_incs; + struct rds_ib_refill_cache i_cache_frags; + + /* sending acks */ + unsigned long i_ack_flags; +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_t i_ack_next; /* next ACK to send */ +#else + spinlock_t i_ack_lock; /* protect i_ack_next */ + u64 i_ack_next; /* next ACK to send */ +#endif + struct rds_header *i_ack; + struct ib_send_wr i_ack_wr; + struct ib_sge i_ack_sge; + u64 i_ack_dma; + unsigned long i_ack_queued; + + /* Flow control related information + * + * Our algorithm uses a pair variables that we need to access + * atomically - one for the send credits, and one posted + * recv credits we need to transfer to remote. 
+ * Rather than protect them using a slow spinlock, we put both into + * a single atomic_t and update it using cmpxchg + */ + atomic_t i_credits; + + /* Protocol version specific information */ + unsigned int i_flowctl:1; /* enable/disable flow ctl */ + + /* Batched completions */ + unsigned int i_unsignaled_wrs; +}; + +/* This assumes that atomic_t is at least 32 bits */ +#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff) +#define IB_GET_POST_CREDITS(v) ((v) >> 16) +#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff) +#define IB_SET_POST_CREDITS(v) ((v) << 16) + +struct rds_ib_ipaddr { + struct list_head list; + __be32 ipaddr; +}; + +struct rds_ib_device { + struct list_head list; + struct list_head ipaddr_list; + struct list_head conn_list; + struct ib_device *dev; + struct ib_pd *pd; + struct ib_mr *mr; + struct rds_ib_mr_pool *mr_pool; + unsigned int fmr_max_remaps; + unsigned int max_fmrs; + int max_sge; + unsigned int max_wrs; + unsigned int max_initiator_depth; + unsigned int max_responder_resources; + spinlock_t spinlock; /* protect the above */ + atomic_t refcount; + struct work_struct free_work; +}; + +#define pcidev_to_node(pcidev) pcibus_to_node(pcidev->bus) +#define ibdev_to_node(ibdev) pcidev_to_node(to_pci_dev(ibdev->dma_device)) +#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev) + +/* bits for i_ack_flags */ +#define IB_ACK_IN_FLIGHT 0 +#define IB_ACK_REQUESTED 1 + +/* Magic WR_ID for ACKs */ +#define RDS_IB_ACK_WR_ID (~(u64) 0) + +struct rds_ib_statistics { + uint64_t s_ib_connect_raced; + uint64_t s_ib_listen_closed_stale; + uint64_t s_ib_tx_cq_call; + uint64_t s_ib_tx_cq_event; + uint64_t s_ib_tx_ring_full; + uint64_t s_ib_tx_throttle; + uint64_t s_ib_tx_sg_mapping_failure; + uint64_t s_ib_tx_stalled; + uint64_t s_ib_tx_credit_updates; + uint64_t s_ib_rx_cq_call; + uint64_t s_ib_rx_cq_event; + uint64_t s_ib_rx_ring_empty; + uint64_t s_ib_rx_refill_from_cq; + uint64_t s_ib_rx_refill_from_thread; + uint64_t s_ib_rx_alloc_limit; + uint64_t s_ib_rx_credit_updates; + uint64_t s_ib_ack_sent; + uint64_t s_ib_ack_send_failure; + uint64_t s_ib_ack_send_delayed; + uint64_t s_ib_ack_send_piggybacked; + uint64_t s_ib_ack_received; + uint64_t s_ib_rdma_mr_alloc; + uint64_t s_ib_rdma_mr_free; + uint64_t s_ib_rdma_mr_used; + uint64_t s_ib_rdma_mr_pool_flush; + uint64_t s_ib_rdma_mr_pool_wait; + uint64_t s_ib_rdma_mr_pool_depleted; + uint64_t s_ib_atomic_cswp; + uint64_t s_ib_atomic_fadd; +}; + +extern struct workqueue_struct *rds_ib_wq; + +/* + * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h + * doesn't define it. 
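The IB_GET/SET_*_CREDITS macros above pack two 16-bit counters (send credits we may consume, and posted receive credits we owe the peer) into one atomic_t, so both can be updated by a single cmpxchg rather than under a spinlock. A sketch of consuming one send credit that way:

/* Illustrative only: two 16-bit credit counters folded into one 32-bit atomic. */
#include <stdatomic.h>
#include <stdio.h>

#define GET_SEND_CREDITS(v) ((v) & 0xffffu)
#define GET_POST_CREDITS(v) ((v) >> 16)
#define SET_POST_CREDITS(v) ((v) << 16)

static atomic_uint credits;

/* consume one send credit; 0 on success, -1 if the low half is empty */
static int take_send_credit(void)
{
    unsigned int old, want;

    old = atomic_load(&credits);
    do {
        if (GET_SEND_CREDITS(old) == 0)
            return -1;
        want = old - 1;                             /* send credits live in the low half */
    } while (!atomic_compare_exchange_weak(&credits, &old, want));
    return 0;
}

int main(void)
{
    int a, b, c, d;

    atomic_store(&credits, SET_POST_CREDITS(8u) | 3u);      /* 8 posted, 3 send */
    a = take_send_credit();
    b = take_send_credit();
    c = take_send_credit();
    d = take_send_credit();
    printf("%d %d %d %d, post credits still %u\n",
           a, b, c, d, GET_POST_CREDITS(atomic_load(&credits)));    /* 0 0 0 -1, 8 */
    return 0;
}

The failed compare-exchange reloads the current value into old, so a concurrent credit refill or post-credit update is simply retried instead of being lost.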
+ */ +static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev, + struct scatterlist *sg, unsigned int sg_dma_len, int direction) +{ + unsigned int i; + + for (i = 0; i < sg_dma_len; ++i) { + ib_dma_sync_single_for_cpu(dev, + ib_sg_dma_address(dev, &sg[i]), + ib_sg_dma_len(dev, &sg[i]), + direction); + } +} +#define ib_dma_sync_sg_for_cpu rds_ib_dma_sync_sg_for_cpu + +static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev, + struct scatterlist *sg, unsigned int sg_dma_len, int direction) +{ + unsigned int i; + + for (i = 0; i < sg_dma_len; ++i) { + ib_dma_sync_single_for_device(dev, + ib_sg_dma_address(dev, &sg[i]), + ib_sg_dma_len(dev, &sg[i]), + direction); + } +} +#define ib_dma_sync_sg_for_device rds_ib_dma_sync_sg_for_device + + +/* ib.c */ +extern struct rds_transport rds_ib_transport; +struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device); +void rds_ib_dev_put(struct rds_ib_device *rds_ibdev); +extern struct ib_client rds_ib_client; + +extern unsigned int fmr_message_size; +extern unsigned int rds_ib_retry_count; + +extern spinlock_t ib_nodev_conns_lock; +extern struct list_head ib_nodev_conns; + +/* ib_cm.c */ +int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp); +void rds_ib_conn_free(void *arg); +int rds_ib_conn_connect(struct rds_connection *conn); +void rds_ib_conn_shutdown(struct rds_connection *conn); +void rds_ib_state_change(struct sock *sk); +int rds_ib_listen_init(void); +void rds_ib_listen_stop(void); +void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); +int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); +int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id); +void rds_ib_cm_connect_complete(struct rds_connection *conn, + struct rdma_cm_event *event); + + +#define rds_ib_conn_error(conn, fmt...) 
\ + __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt) + +/* ib_rdma.c */ +int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); +void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); +void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); +void rds_ib_destroy_nodev_conns(void); +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *); +void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); +void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); +void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, + struct rds_sock *rs, u32 *key_ret); +void rds_ib_sync_mr(void *trans_private, int dir); +void rds_ib_free_mr(void *trans_private, int invalidate); +void rds_ib_flush_mrs(void); + +/* ib_recv.c */ +int rds_ib_recv_init(void); +void rds_ib_recv_exit(void); +int rds_ib_recv(struct rds_connection *conn); +int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic); +void rds_ib_recv_free_caches(struct rds_ib_connection *ic); +void rds_ib_recv_refill(struct rds_connection *conn, int prefill); +void rds_ib_inc_free(struct rds_incoming *inc); +int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, + size_t size); +void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_ib_recv_tasklet_fn(unsigned long data); +void rds_ib_recv_init_ring(struct rds_ib_connection *ic); +void rds_ib_recv_clear_ring(struct rds_ib_connection *ic); +void rds_ib_recv_init_ack(struct rds_ib_connection *ic); +void rds_ib_attempt_ack(struct rds_ib_connection *ic); +void rds_ib_ack_send_complete(struct rds_ib_connection *ic); +u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic); + +/* ib_ring.c */ +void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr); +void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr); +u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos); +void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val); +void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val); +int rds_ib_ring_empty(struct rds_ib_work_ring *ring); +int rds_ib_ring_low(struct rds_ib_work_ring *ring); +u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring); +u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest); +extern wait_queue_head_t rds_ib_ring_empty_wait; + +/* ib_send.c */ +char *rds_ib_wc_status_str(enum ib_wc_status status); +void rds_ib_xmit_complete(struct rds_connection *conn); +int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off); +void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_ib_send_init_ring(struct rds_ib_connection *ic); +void rds_ib_send_clear_ring(struct rds_ib_connection *ic); +int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); +void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits); +void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted); +int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, + u32 *adv_credits, int need_posted, int max_posted); +int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op); + +/* ib_stats.c */ +DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); +#define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member) +unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail); + +/* 
ib_sysctl.c */ +int rds_ib_sysctl_init(void); +void rds_ib_sysctl_exit(void); +extern unsigned long rds_ib_sysctl_max_send_wr; +extern unsigned long rds_ib_sysctl_max_recv_wr; +extern unsigned long rds_ib_sysctl_max_unsig_wrs; +extern unsigned long rds_ib_sysctl_max_unsig_bytes; +extern unsigned long rds_ib_sysctl_max_recv_allocation; +extern unsigned int rds_ib_sysctl_flow_control; + +#endif diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c new file mode 100644 index 00000000..fd453dd5 --- /dev/null +++ b/net/rds/ib_cm.c @@ -0,0 +1,832 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include + +#include "rds.h" +#include "ib.h" + +static char *rds_ib_event_type_strings[] = { +#define RDS_IB_EVENT_STRING(foo) \ + [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo) + RDS_IB_EVENT_STRING(CQ_ERR), + RDS_IB_EVENT_STRING(QP_FATAL), + RDS_IB_EVENT_STRING(QP_REQ_ERR), + RDS_IB_EVENT_STRING(QP_ACCESS_ERR), + RDS_IB_EVENT_STRING(COMM_EST), + RDS_IB_EVENT_STRING(SQ_DRAINED), + RDS_IB_EVENT_STRING(PATH_MIG), + RDS_IB_EVENT_STRING(PATH_MIG_ERR), + RDS_IB_EVENT_STRING(DEVICE_FATAL), + RDS_IB_EVENT_STRING(PORT_ACTIVE), + RDS_IB_EVENT_STRING(PORT_ERR), + RDS_IB_EVENT_STRING(LID_CHANGE), + RDS_IB_EVENT_STRING(PKEY_CHANGE), + RDS_IB_EVENT_STRING(SM_CHANGE), + RDS_IB_EVENT_STRING(SRQ_ERR), + RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED), + RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED), + RDS_IB_EVENT_STRING(CLIENT_REREGISTER), +#undef RDS_IB_EVENT_STRING +}; + +static char *rds_ib_event_str(enum ib_event_type type) +{ + return rds_str_array(rds_ib_event_type_strings, + ARRAY_SIZE(rds_ib_event_type_strings), type); +}; + +/* + * Set the selected protocol version + */ +static void rds_ib_set_protocol(struct rds_connection *conn, unsigned int version) +{ + conn->c_version = version; +} + +/* + * Set up flow control + */ +static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + if (rds_ib_sysctl_flow_control && credits != 0) { + /* We're doing flow control */ + ic->i_flowctl = 1; + rds_ib_send_add_credits(conn, credits); + } else { + ic->i_flowctl = 0; + } +} + +/* + * Tune RNR behavior. 
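The rds_ib_event_type_strings[] table above is built with a stringify macro plus designated initializers, and rds_ib_event_str() then does a bounds-checked lookup through rds_str_array(), which lives in the RDS core rather than in this hunk. Below is a minimal standalone C sketch of the same technique; the demo enum and DEMO_* names are invented stand-ins for enum ib_event_type, and the lookup helper only approximates what rds_str_array() is assumed to do.

#include <stdio.h>

#define __stringify_1(x)        #x
#define __stringify(x)          __stringify_1(x)

enum demo_event { DEMO_CQ_ERR, DEMO_QP_FATAL, DEMO_PORT_ACTIVE, DEMO_NR_EVENTS };

static const char *demo_event_strings[] = {
#define DEMO_EVENT_STRING(foo) [DEMO_##foo] = __stringify(DEMO_##foo)
        DEMO_EVENT_STRING(CQ_ERR),
        DEMO_EVENT_STRING(QP_FATAL),
        DEMO_EVENT_STRING(PORT_ACTIVE),
#undef DEMO_EVENT_STRING
};

/* bounds-checked lookup, roughly what rds_str_array() is assumed to do */
static const char *demo_event_str(unsigned int type)
{
        if (type < DEMO_NR_EVENTS && demo_event_strings[type])
                return demo_event_strings[type];
        return "unknown";
}

int main(void)
{
        printf("%s\n", demo_event_str(DEMO_QP_FATAL));  /* prints "DEMO_QP_FATAL" */
        return 0;
}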
Without flow control, we use a rather + * low timeout, but not the absolute minimum - this should + * be tunable. + * + * We already set the RNR retry count to 7 (which is the + * smallest infinite number :-) above. + * If flow control is off, we want to change this back to 0 + * so that we learn quickly when our credit accounting is + * buggy. + * + * Caller passes in a qp_attr pointer - don't waste stack spacv + * by allocation this twice. + */ +static void +rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr) +{ + int ret; + + attr->min_rnr_timer = IB_RNR_TIMER_000_32; + ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER); + if (ret) + printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret); +} + +/* + * Connection established. + * We get here for both outgoing and incoming connection. + */ +void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event) +{ + const struct rds_ib_connect_private *dp = NULL; + struct rds_ib_connection *ic = conn->c_transport_data; + struct ib_qp_attr qp_attr; + int err; + + if (event->param.conn.private_data_len >= sizeof(*dp)) { + dp = event->param.conn.private_data; + + /* make sure it isn't empty data */ + if (dp->dp_protocol_major) { + rds_ib_set_protocol(conn, + RDS_PROTOCOL(dp->dp_protocol_major, + dp->dp_protocol_minor)); + rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + } + } + + if (conn->c_version < RDS_PROTOCOL(3,1)) { + printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed," + " no longer supported\n", + &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version)); + rds_conn_destroy(conn); + return; + } else { + printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", + &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version), + ic->i_flowctl ? ", flow control" : ""); + } + + /* + * Init rings and fill recv. this needs to wait until protocol negotiation + * is complete, since ring layout is different from 3.0 to 3.1. + */ + rds_ib_send_init_ring(ic); + rds_ib_recv_init_ring(ic); + /* Post receive buffers - as a side effect, this will update + * the posted credit count. */ + rds_ib_recv_refill(conn, 1); + + /* Tune RNR behavior */ + rds_ib_tune_rnr(ic, &qp_attr); + + qp_attr.qp_state = IB_QPS_RTS; + err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE); + if (err) + printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); + + /* update ib_device with this local ipaddr */ + err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr); + if (err) + printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", + err); + + /* If the peer gave us the last packet it saw, process this as if + * we had received a regular ACK. 
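The floor check above compares conn->c_version against RDS_PROTOCOL(3,1) and tears the connection down for anything older. The RDS_PROTOCOL*() macros come from rds.h, which is outside this hunk; assuming the conventional major-in-the-high-byte packing, the standalone sketch below shows why a plain integer comparison orders the versions the way that check expects.

#include <stdio.h>

/* assumed to match the definitions in rds.h */
#define RDS_PROTOCOL(major, minor)      (((major) << 8) | (minor))
#define RDS_PROTOCOL_MAJOR(v)           ((v) >> 8)
#define RDS_PROTOCOL_MINOR(v)           ((v) & 255)

int main(void)
{
        unsigned int v = RDS_PROTOCOL(3, 1);

        printf("%u.%u\n", RDS_PROTOCOL_MAJOR(v), RDS_PROTOCOL_MINOR(v));
        /* 3.0 packs to a smaller value than 3.1, so the "< RDS_PROTOCOL(3,1)"
         * test rejects 3.0 peers */
        printf("3.0 rejected: %d\n", RDS_PROTOCOL(3, 0) < RDS_PROTOCOL(3, 1));
        return 0;
}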
*/ + if (dp && dp->dp_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + + rds_connect_complete(conn); +} + +static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, + struct rdma_conn_param *conn_param, + struct rds_ib_connect_private *dp, + u32 protocol_version, + u32 max_responder_resources, + u32 max_initiator_depth) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_device *rds_ibdev = ic->rds_ibdev; + + memset(conn_param, 0, sizeof(struct rdma_conn_param)); + + conn_param->responder_resources = + min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources); + conn_param->initiator_depth = + min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth); + conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7); + conn_param->rnr_retry_count = 7; + + if (dp) { + memset(dp, 0, sizeof(*dp)); + dp->dp_saddr = conn->c_laddr; + dp->dp_daddr = conn->c_faddr; + dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); + dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); + dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); + dp->dp_ack_seq = rds_ib_piggyb_ack(ic); + + /* Advertise flow control */ + if (ic->i_flowctl) { + unsigned int credits; + + credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); + dp->dp_credit = cpu_to_be32(credits); + atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); + } + + conn_param->private_data = dp; + conn_param->private_data_len = sizeof(*dp); + } +} + +static void rds_ib_cq_event_handler(struct ib_event *event, void *data) +{ + rdsdebug("event %u (%s) data %p\n", + event->event, rds_ib_event_str(event->event), data); +} + +static void rds_ib_qp_event_handler(struct ib_event *event, void *data) +{ + struct rds_connection *conn = data; + struct rds_ib_connection *ic = conn->c_transport_data; + + rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event, + rds_ib_event_str(event->event)); + + switch (event->event) { + case IB_EVENT_COMM_EST: + rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); + break; + default: + rdsdebug("Fatal QP Event %u (%s) " + "- connection %pI4->%pI4, reconnecting\n", + event->event, rds_ib_event_str(event->event), + &conn->c_laddr, &conn->c_faddr); + rds_conn_drop(conn); + break; + } +} + +/* + * This needs to be very careful to not leave IS_ERR pointers around for + * cleanup to trip over. + */ +static int rds_ib_setup_qp(struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct ib_device *dev = ic->i_cm_id->device; + struct ib_qp_init_attr attr; + struct rds_ib_device *rds_ibdev; + int ret; + + /* + * It's normal to see a null device if an incoming connection races + * with device removal, so we don't print a warning. 
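rds_ib_cm_fill_conn_param() above advertises receive credits by reading the posted count out of ic->i_credits and then subtracting the same amount back out with atomic_sub(). The IB_GET/SET_POST_CREDITS macros are defined in ib.h earlier in this patch; the sketch below assumes they split one counter word into send credits (low 16 bits) and freshly posted receive buffers (high 16 bits), which is only an assumption made for the example.

#include <stdio.h>

/* assumed split of ic->i_credits: send credits low, posted buffers high */
#define SET_SEND_CREDITS(v)     ((v) & 0xffff)
#define SET_POST_CREDITS(v)     ((v) << 16)
#define GET_SEND_CREDITS(v)     ((v) & 0xffff)
#define GET_POST_CREDITS(v)     ((v) >> 16)

int main(void)
{
        unsigned int credits = SET_SEND_CREDITS(4) | SET_POST_CREDITS(16);
        unsigned int adv;

        /* advertise every posted buffer to the peer, then strip them from
         * the local counter, mirroring the atomic_sub() in the code above */
        adv = GET_POST_CREDITS(credits);
        credits -= SET_POST_CREDITS(adv);

        printf("advertised %u, send credits %u, posted now %u\n",
               adv, GET_SEND_CREDITS(credits), GET_POST_CREDITS(credits));
        return 0;
}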
+ */ + rds_ibdev = rds_ib_get_client_data(dev); + if (!rds_ibdev) + return -EOPNOTSUPP; + + /* add the conn now so that connection establishment has the dev */ + rds_ib_add_conn(rds_ibdev, conn); + + if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) + rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); + if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1) + rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1); + + /* Protection domain and memory range */ + ic->i_pd = rds_ibdev->pd; + ic->i_mr = rds_ibdev->mr; + + ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler, + rds_ib_cq_event_handler, conn, + ic->i_send_ring.w_nr + 1, 0); + if (IS_ERR(ic->i_send_cq)) { + ret = PTR_ERR(ic->i_send_cq); + ic->i_send_cq = NULL; + rdsdebug("ib_create_cq send failed: %d\n", ret); + goto out; + } + + ic->i_recv_cq = ib_create_cq(dev, rds_ib_recv_cq_comp_handler, + rds_ib_cq_event_handler, conn, + ic->i_recv_ring.w_nr, 0); + if (IS_ERR(ic->i_recv_cq)) { + ret = PTR_ERR(ic->i_recv_cq); + ic->i_recv_cq = NULL; + rdsdebug("ib_create_cq recv failed: %d\n", ret); + goto out; + } + + ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); + if (ret) { + rdsdebug("ib_req_notify_cq send failed: %d\n", ret); + goto out; + } + + ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); + if (ret) { + rdsdebug("ib_req_notify_cq recv failed: %d\n", ret); + goto out; + } + + /* XXX negotiate max send/recv with remote? */ + memset(&attr, 0, sizeof(attr)); + attr.event_handler = rds_ib_qp_event_handler; + attr.qp_context = conn; + /* + 1 to allow for the single ack message */ + attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1; + attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1; + attr.cap.max_send_sge = rds_ibdev->max_sge; + attr.cap.max_recv_sge = RDS_IB_RECV_SGE; + attr.sq_sig_type = IB_SIGNAL_REQ_WR; + attr.qp_type = IB_QPT_RC; + attr.send_cq = ic->i_send_cq; + attr.recv_cq = ic->i_recv_cq; + + /* + * XXX this can fail if max_*_wr is too large? Are we supposed + * to back off until we get a value that the hardware can support? 
+ */ + ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); + if (ret) { + rdsdebug("rdma_create_qp failed: %d\n", ret); + goto out; + } + + ic->i_send_hdrs = ib_dma_alloc_coherent(dev, + ic->i_send_ring.w_nr * + sizeof(struct rds_header), + &ic->i_send_hdrs_dma, GFP_KERNEL); + if (!ic->i_send_hdrs) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent send failed\n"); + goto out; + } + + ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, + ic->i_recv_ring.w_nr * + sizeof(struct rds_header), + &ic->i_recv_hdrs_dma, GFP_KERNEL); + if (!ic->i_recv_hdrs) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent recv failed\n"); + goto out; + } + + ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), + &ic->i_ack_dma, GFP_KERNEL); + if (!ic->i_ack) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent ack failed\n"); + goto out; + } + + ic->i_sends = vmalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work), + ibdev_to_node(dev)); + if (!ic->i_sends) { + ret = -ENOMEM; + rdsdebug("send allocation failed\n"); + goto out; + } + memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); + + ic->i_recvs = vmalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work), + ibdev_to_node(dev)); + if (!ic->i_recvs) { + ret = -ENOMEM; + rdsdebug("recv allocation failed\n"); + goto out; + } + memset(ic->i_recvs, 0, ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); + + rds_ib_recv_init_ack(ic); + + rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr, + ic->i_send_cq, ic->i_recv_cq); + +out: + rds_ib_dev_put(rds_ibdev); + return ret; +} + +static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event) +{ + const struct rds_ib_connect_private *dp = event->param.conn.private_data; + u16 common; + u32 version = 0; + + /* + * rdma_cm private data is odd - when there is any private data in the + * request, we will be given a pretty large buffer without telling us the + * original size. The only way to tell the difference is by looking at + * the contents, which are initialized to zero. + * If the protocol version fields aren't set, this is a connection attempt + * from an older version. This could could be 3.0 or 2.0 - we can't tell. + * We really should have changed this for OFED 1.3 :-( + */ + + /* Be paranoid. RDS always has privdata */ + if (!event->param.conn.private_data_len) { + printk(KERN_NOTICE "RDS incoming connection has no private data, " + "rejecting\n"); + return 0; + } + + /* Even if len is crap *now* I still want to check it. 
-ASG */ + if (event->param.conn.private_data_len < sizeof (*dp) || + dp->dp_protocol_major == 0) + return RDS_PROTOCOL_3_0; + + common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS; + if (dp->dp_protocol_major == 3 && common) { + version = RDS_PROTOCOL_3_0; + while ((common >>= 1) != 0) + version++; + } else if (printk_ratelimit()) { + printk(KERN_NOTICE "RDS: Connection from %pI4 using " + "incompatible protocol version %u.%u\n", + &dp->dp_saddr, + dp->dp_protocol_major, + dp->dp_protocol_minor); + } + return version; +} + +int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id; + __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id; + const struct rds_ib_connect_private *dp = event->param.conn.private_data; + struct rds_ib_connect_private dp_rep; + struct rds_connection *conn = NULL; + struct rds_ib_connection *ic = NULL; + struct rdma_conn_param conn_param; + u32 version; + int err = 1, destroy = 1; + + /* Check whether the remote protocol version matches ours. */ + version = rds_ib_protocol_compatible(event); + if (!version) + goto out; + + rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid " + "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr, + RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version), + (unsigned long long)be64_to_cpu(lguid), + (unsigned long long)be64_to_cpu(fguid)); + + conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport, + GFP_KERNEL); + if (IS_ERR(conn)) { + rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); + conn = NULL; + goto out; + } + + /* + * The connection request may occur while the + * previous connection exist, e.g. in case of failover. + * But as connections may be initiated simultaneously + * by both hosts, we have a random backoff mechanism - + * see the comment above rds_queue_reconnect() + */ + mutex_lock(&conn->c_cm_lock); + if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { + if (rds_conn_state(conn) == RDS_CONN_UP) { + rdsdebug("incoming connect while connecting\n"); + rds_conn_drop(conn); + rds_ib_stats_inc(s_ib_listen_closed_stale); + } else + if (rds_conn_state(conn) == RDS_CONN_CONNECTING) { + /* Wait and see - our connect may still be succeeding */ + rds_ib_stats_inc(s_ib_connect_raced); + } + goto out; + } + + ic = conn->c_transport_data; + + rds_ib_set_protocol(conn, version); + rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + + /* If the peer gave us the last packet it saw, process this as if + * we had received a regular ACK. */ + if (dp->dp_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + + BUG_ON(cm_id->context); + BUG_ON(ic->i_cm_id); + + ic->i_cm_id = cm_id; + cm_id->context = conn; + + /* We got halfway through setting up the ib_connection, if we + * fail now, we have to take the long route out of this mess. 
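The negotiation above intersects the peer's minor-version bitmask with RDS_IB_SUPPORTED_PROTOCOLS and then bumps the minor once per remaining bit, so the highest common set bit wins. The standalone sketch below re-states that walk; the sample mask values and the 0x0300 packing used for RDS_PROTOCOL_3_0 are assumptions for the example, since both constants are defined outside this hunk.

#include <stdio.h>
#include <stdint.h>

#define RDS_PROTOCOL_3_0        0x0300  /* assumed major << 8 packing */

static unsigned int highest_common_version(uint16_t peer_mask, uint16_t our_mask)
{
        uint16_t common = peer_mask & our_mask;
        unsigned int version = 0;

        if (common) {
                version = RDS_PROTOCOL_3_0;     /* bit 0 corresponds to minor 0 */
                while ((common >>= 1) != 0)
                        version++;              /* highest common set bit wins */
        }
        return version;
}

int main(void)
{
        /* both sides support minors 0 and 1 -> negotiate 3.1 (0x0301) */
        printf("0x%04x\n", highest_common_version(0x0003, 0x0003));
        /* peer only speaks minor 0 -> stay at 3.0 (0x0300) */
        printf("0x%04x\n", highest_common_version(0x0001, 0x0003));
        return 0;
}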
*/ + destroy = 0; + + err = rds_ib_setup_qp(conn); + if (err) { + rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err); + goto out; + } + + rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version, + event->param.conn.responder_resources, + event->param.conn.initiator_depth); + + /* rdma_accept() calls rdma_reject() internally if it fails */ + err = rdma_accept(cm_id, &conn_param); + if (err) + rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err); + +out: + if (conn) + mutex_unlock(&conn->c_cm_lock); + if (err) + rdma_reject(cm_id, NULL, 0); + return destroy; +} + + +int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) +{ + struct rds_connection *conn = cm_id->context; + struct rds_ib_connection *ic = conn->c_transport_data; + struct rdma_conn_param conn_param; + struct rds_ib_connect_private dp; + int ret; + + /* If the peer doesn't do protocol negotiation, we must + * default to RDSv3.0 */ + rds_ib_set_protocol(conn, RDS_PROTOCOL_3_0); + ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */ + + ret = rds_ib_setup_qp(conn); + if (ret) { + rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret); + goto out; + } + + rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION, + UINT_MAX, UINT_MAX); + ret = rdma_connect(cm_id, &conn_param); + if (ret) + rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); + +out: + /* Beware - returning non-zero tells the rdma_cm to destroy + * the cm_id. We should certainly not do it as long as we still + * "own" the cm_id. */ + if (ret) { + if (ic->i_cm_id == cm_id) + ret = 0; + } + return ret; +} + +int rds_ib_conn_connect(struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct sockaddr_in src, dest; + int ret; + + /* XXX I wonder what affect the port space has */ + /* delegate cm event handler to rdma_transport */ + ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(ic->i_cm_id)) { + ret = PTR_ERR(ic->i_cm_id); + ic->i_cm_id = NULL; + rdsdebug("rdma_create_id() failed: %d\n", ret); + goto out; + } + + rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn); + + src.sin_family = AF_INET; + src.sin_addr.s_addr = (__force u32)conn->c_laddr; + src.sin_port = (__force u16)htons(0); + + dest.sin_family = AF_INET; + dest.sin_addr.s_addr = (__force u32)conn->c_faddr; + dest.sin_port = (__force u16)htons(RDS_PORT); + + ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, + (struct sockaddr *)&dest, + RDS_RDMA_RESOLVE_TIMEOUT_MS); + if (ret) { + rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id, + ret); + rdma_destroy_id(ic->i_cm_id); + ic->i_cm_id = NULL; + } + +out: + return ret; +} + +/* + * This is so careful about only cleaning up resources that were built up + * so that it can be called at any point during startup. In fact it + * can be called multiple times for a given connection. + */ +void rds_ib_conn_shutdown(struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + int err = 0; + + rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id, + ic->i_pd, ic->i_send_cq, ic->i_recv_cq, + ic->i_cm_id ? ic->i_cm_id->qp : NULL); + + if (ic->i_cm_id) { + struct ib_device *dev = ic->i_cm_id->device; + + rdsdebug("disconnecting cm %p\n", ic->i_cm_id); + err = rdma_disconnect(ic->i_cm_id); + if (err) { + /* Actually this may happen quite frequently, when + * an outgoing connect raced with an incoming connect. 
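rds_ib_conn_connect() above binds the cm_id to the local address with an ephemeral port and resolves the peer at the RDS service port, with everything in network byte order. A small userspace sketch of that address setup follows; the RDS_PORT value and the addresses below are placeholders, the real constant lives in rds.h.

#include <stdio.h>
#include <string.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#define RDS_PORT 18634          /* placeholder; the real value is in rds.h */

int main(void)
{
        struct sockaddr_in src, dest;

        memset(&src, 0, sizeof(src));
        src.sin_family = AF_INET;
        src.sin_addr.s_addr = inet_addr("192.0.2.1");   /* local c_laddr */
        src.sin_port = htons(0);                        /* any local port */

        memset(&dest, 0, sizeof(dest));
        dest.sin_family = AF_INET;
        dest.sin_addr.s_addr = inet_addr("192.0.2.2");  /* peer c_faddr */
        dest.sin_port = htons(RDS_PORT);                /* peer's RDS listener */

        printf("resolve %s:%u\n", inet_ntoa(dest.sin_addr), ntohs(dest.sin_port));
        return 0;
}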
+ */ + rdsdebug("failed to disconnect, cm: %p err %d\n", + ic->i_cm_id, err); + } + + /* + * We want to wait for tx and rx completion to finish + * before we tear down the connection, but we have to be + * careful not to get stuck waiting on a send ring that + * only has unsignaled sends in it. We've shutdown new + * sends before getting here so by waiting for signaled + * sends to complete we're ensured that there will be no + * more tx processing. + */ + wait_event(rds_ib_ring_empty_wait, + rds_ib_ring_empty(&ic->i_recv_ring) && + (atomic_read(&ic->i_signaled_sends) == 0)); + tasklet_kill(&ic->i_recv_tasklet); + + if (ic->i_send_hdrs) + ib_dma_free_coherent(dev, + ic->i_send_ring.w_nr * + sizeof(struct rds_header), + ic->i_send_hdrs, + ic->i_send_hdrs_dma); + + if (ic->i_recv_hdrs) + ib_dma_free_coherent(dev, + ic->i_recv_ring.w_nr * + sizeof(struct rds_header), + ic->i_recv_hdrs, + ic->i_recv_hdrs_dma); + + if (ic->i_ack) + ib_dma_free_coherent(dev, sizeof(struct rds_header), + ic->i_ack, ic->i_ack_dma); + + if (ic->i_sends) + rds_ib_send_clear_ring(ic); + if (ic->i_recvs) + rds_ib_recv_clear_ring(ic); + + if (ic->i_cm_id->qp) + rdma_destroy_qp(ic->i_cm_id); + if (ic->i_send_cq) + ib_destroy_cq(ic->i_send_cq); + if (ic->i_recv_cq) + ib_destroy_cq(ic->i_recv_cq); + rdma_destroy_id(ic->i_cm_id); + + /* + * Move connection back to the nodev list. + */ + if (ic->rds_ibdev) + rds_ib_remove_conn(ic->rds_ibdev, conn); + + ic->i_cm_id = NULL; + ic->i_pd = NULL; + ic->i_mr = NULL; + ic->i_send_cq = NULL; + ic->i_recv_cq = NULL; + ic->i_send_hdrs = NULL; + ic->i_recv_hdrs = NULL; + ic->i_ack = NULL; + } + BUG_ON(ic->rds_ibdev); + + /* Clear pending transmit */ + if (ic->i_data_op) { + struct rds_message *rm; + + rm = container_of(ic->i_data_op, struct rds_message, data); + rds_message_put(rm); + ic->i_data_op = NULL; + } + + /* Clear the ACK state */ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_set(&ic->i_ack_next, 0); +#else + ic->i_ack_next = 0; +#endif + ic->i_ack_recv = 0; + + /* Clear flow control state */ + ic->i_flowctl = 0; + atomic_set(&ic->i_credits, 0); + + rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); + rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); + + if (ic->i_ibinc) { + rds_inc_put(&ic->i_ibinc->ii_inc); + ic->i_ibinc = NULL; + } + + vfree(ic->i_sends); + ic->i_sends = NULL; + vfree(ic->i_recvs); + ic->i_recvs = NULL; +} + +int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) +{ + struct rds_ib_connection *ic; + unsigned long flags; + int ret; + + /* XXX too lazy? */ + ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL); + if (!ic) + return -ENOMEM; + + ret = rds_ib_recv_alloc_caches(ic); + if (ret) { + kfree(ic); + return ret; + } + + INIT_LIST_HEAD(&ic->ib_node); + tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn, + (unsigned long) ic); + mutex_init(&ic->i_recv_mutex); +#ifndef KERNEL_HAS_ATOMIC64 + spin_lock_init(&ic->i_ack_lock); +#endif + atomic_set(&ic->i_signaled_sends, 0); + + /* + * rds_ib_conn_shutdown() waits for these to be emptied so they + * must be initialized before it can be called. 
+ */ + rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); + rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); + + ic->conn = conn; + conn->c_transport_data = ic; + + spin_lock_irqsave(&ib_nodev_conns_lock, flags); + list_add_tail(&ic->ib_node, &ib_nodev_conns); + spin_unlock_irqrestore(&ib_nodev_conns_lock, flags); + + + rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data); + return 0; +} + +/* + * Free a connection. Connection must be shut down and not set for reconnect. + */ +void rds_ib_conn_free(void *arg) +{ + struct rds_ib_connection *ic = arg; + spinlock_t *lock_ptr; + + rdsdebug("ic %p\n", ic); + + /* + * Conn is either on a dev's list or on the nodev list. + * A race with shutdown() or connect() would cause problems + * (since rds_ibdev would change) but that should never happen. + */ + lock_ptr = ic->rds_ibdev ? &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock; + + spin_lock_irq(lock_ptr); + list_del(&ic->ib_node); + spin_unlock_irq(lock_ptr); + + rds_ib_recv_free_caches(ic); + + kfree(ic); +} + + +/* + * An error occurred on the connection + */ +void +__rds_ib_conn_error(struct rds_connection *conn, const char *fmt, ...) +{ + va_list ap; + + rds_conn_drop(conn); + + va_start(ap, fmt); + vprintk(fmt, ap); + va_end(ap); +} diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c new file mode 100644 index 00000000..819c35a0 --- /dev/null +++ b/net/rds/ib_rdma.c @@ -0,0 +1,794 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" +#include "ib.h" +#include "xlist.h" + +static DEFINE_PER_CPU(unsigned long, clean_list_grace); +#define CLEAN_LIST_BUSY_BIT 0 + +/* + * This is stored as mr->r_trans_private. 
+ */ +struct rds_ib_mr { + struct rds_ib_device *device; + struct rds_ib_mr_pool *pool; + struct ib_fmr *fmr; + + struct xlist_head xlist; + + /* unmap_list is for freeing */ + struct list_head unmap_list; + unsigned int remap_count; + + struct scatterlist *sg; + unsigned int sg_len; + u64 *dma; + int sg_dma_len; +}; + +/* + * Our own little FMR pool + */ +struct rds_ib_mr_pool { + struct mutex flush_lock; /* serialize fmr invalidate */ + struct delayed_work flush_worker; /* flush worker */ + + atomic_t item_count; /* total # of MRs */ + atomic_t dirty_count; /* # dirty of MRs */ + + struct xlist_head drop_list; /* MRs that have reached their max_maps limit */ + struct xlist_head free_list; /* unused MRs */ + struct xlist_head clean_list; /* global unused & unamapped MRs */ + wait_queue_head_t flush_wait; + + atomic_t free_pinned; /* memory pinned by free MRs */ + unsigned long max_items; + unsigned long max_items_soft; + unsigned long max_free_pinned; + struct ib_fmr_attr fmr_attr; +}; + +static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **); +static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr); +static void rds_ib_mr_pool_flush_worker(struct work_struct *work); + +static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) +{ + struct rds_ib_device *rds_ibdev; + struct rds_ib_ipaddr *i_ipaddr; + + rcu_read_lock(); + list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) { + list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { + if (i_ipaddr->ipaddr == ipaddr) { + atomic_inc(&rds_ibdev->refcount); + rcu_read_unlock(); + return rds_ibdev; + } + } + } + rcu_read_unlock(); + + return NULL; +} + +static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) +{ + struct rds_ib_ipaddr *i_ipaddr; + + i_ipaddr = kmalloc(sizeof *i_ipaddr, GFP_KERNEL); + if (!i_ipaddr) + return -ENOMEM; + + i_ipaddr->ipaddr = ipaddr; + + spin_lock_irq(&rds_ibdev->spinlock); + list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list); + spin_unlock_irq(&rds_ibdev->spinlock); + + return 0; +} + +static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) +{ + struct rds_ib_ipaddr *i_ipaddr; + struct rds_ib_ipaddr *to_free = NULL; + + + spin_lock_irq(&rds_ibdev->spinlock); + list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { + if (i_ipaddr->ipaddr == ipaddr) { + list_del_rcu(&i_ipaddr->list); + to_free = i_ipaddr; + break; + } + } + spin_unlock_irq(&rds_ibdev->spinlock); + + if (to_free) { + synchronize_rcu(); + kfree(to_free); + } +} + +int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) +{ + struct rds_ib_device *rds_ibdev_old; + + rds_ibdev_old = rds_ib_get_device(ipaddr); + if (rds_ibdev_old) { + rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); + rds_ib_dev_put(rds_ibdev_old); + } + + return rds_ib_add_ipaddr(rds_ibdev, ipaddr); +} + +void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + /* conn was previously on the nodev_conns_list */ + spin_lock_irq(&ib_nodev_conns_lock); + BUG_ON(list_empty(&ib_nodev_conns)); + BUG_ON(list_empty(&ic->ib_node)); + list_del(&ic->ib_node); + + spin_lock(&rds_ibdev->spinlock); + list_add_tail(&ic->ib_node, &rds_ibdev->conn_list); + spin_unlock(&rds_ibdev->spinlock); + spin_unlock_irq(&ib_nodev_conns_lock); + + ic->rds_ibdev = rds_ibdev; + atomic_inc(&rds_ibdev->refcount); +} + +void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection 
*conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + /* place conn on nodev_conns_list */ + spin_lock(&ib_nodev_conns_lock); + + spin_lock_irq(&rds_ibdev->spinlock); + BUG_ON(list_empty(&ic->ib_node)); + list_del(&ic->ib_node); + spin_unlock_irq(&rds_ibdev->spinlock); + + list_add_tail(&ic->ib_node, &ib_nodev_conns); + + spin_unlock(&ib_nodev_conns_lock); + + ic->rds_ibdev = NULL; + rds_ib_dev_put(rds_ibdev); +} + +void rds_ib_destroy_nodev_conns(void) +{ + struct rds_ib_connection *ic, *_ic; + LIST_HEAD(tmp_list); + + /* avoid calling conn_destroy with irqs off */ + spin_lock_irq(&ib_nodev_conns_lock); + list_splice(&ib_nodev_conns, &tmp_list); + spin_unlock_irq(&ib_nodev_conns_lock); + + list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) + rds_conn_destroy(ic->conn); +} + +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) +{ + struct rds_ib_mr_pool *pool; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return ERR_PTR(-ENOMEM); + + INIT_XLIST_HEAD(&pool->free_list); + INIT_XLIST_HEAD(&pool->drop_list); + INIT_XLIST_HEAD(&pool->clean_list); + mutex_init(&pool->flush_lock); + init_waitqueue_head(&pool->flush_wait); + INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); + + pool->fmr_attr.max_pages = fmr_message_size; + pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; + pool->fmr_attr.page_shift = PAGE_SHIFT; + pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4; + + /* We never allow more than max_items MRs to be allocated. + * When we exceed more than max_items_soft, we start freeing + * items more aggressively. + * Make sure that max_items > max_items_soft > max_items / 2 + */ + pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4; + pool->max_items = rds_ibdev->max_fmrs; + + return pool; +} + +void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo) +{ + struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + + iinfo->rdma_mr_max = pool->max_items; + iinfo->rdma_mr_size = pool->fmr_attr.max_pages; +} + +void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) +{ + cancel_delayed_work_sync(&pool->flush_worker); + rds_ib_flush_mr_pool(pool, 1, NULL); + WARN_ON(atomic_read(&pool->item_count)); + WARN_ON(atomic_read(&pool->free_pinned)); + kfree(pool); +} + +static void refill_local(struct rds_ib_mr_pool *pool, struct xlist_head *xl, + struct rds_ib_mr **ibmr_ret) +{ + struct xlist_head *ibmr_xl; + ibmr_xl = xlist_del_head_fast(xl); + *ibmr_ret = list_entry(ibmr_xl, struct rds_ib_mr, xlist); +} + +static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool) +{ + struct rds_ib_mr *ibmr = NULL; + struct xlist_head *ret; + unsigned long *flag; + + preempt_disable(); + flag = &__get_cpu_var(clean_list_grace); + set_bit(CLEAN_LIST_BUSY_BIT, flag); + ret = xlist_del_head(&pool->clean_list); + if (ret) + ibmr = list_entry(ret, struct rds_ib_mr, xlist); + + clear_bit(CLEAN_LIST_BUSY_BIT, flag); + preempt_enable(); + return ibmr; +} + +static inline void wait_clean_list_grace(void) +{ + int cpu; + unsigned long *flag; + + for_each_online_cpu(cpu) { + flag = &per_cpu(clean_list_grace, cpu); + while (test_bit(CLEAN_LIST_BUSY_BIT, flag)) + cpu_relax(); + } +} + +static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) +{ + struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + struct rds_ib_mr *ibmr = NULL; + int err = 0, iter = 0; + + if (atomic_read(&pool->dirty_count) >= pool->max_items / 10) + schedule_delayed_work(&pool->flush_worker, 
10); + + while (1) { + ibmr = rds_ib_reuse_fmr(pool); + if (ibmr) + return ibmr; + + /* No clean MRs - now we have the choice of either + * allocating a fresh MR up to the limit imposed by the + * driver, or flush any dirty unused MRs. + * We try to avoid stalling in the send path if possible, + * so we allocate as long as we're allowed to. + * + * We're fussy with enforcing the FMR limit, though. If the driver + * tells us we can't use more than N fmrs, we shouldn't start + * arguing with it */ + if (atomic_inc_return(&pool->item_count) <= pool->max_items) + break; + + atomic_dec(&pool->item_count); + + if (++iter > 2) { + rds_ib_stats_inc(s_ib_rdma_mr_pool_depleted); + return ERR_PTR(-EAGAIN); + } + + /* We do have some empty MRs. Flush them out. */ + rds_ib_stats_inc(s_ib_rdma_mr_pool_wait); + rds_ib_flush_mr_pool(pool, 0, &ibmr); + if (ibmr) + return ibmr; + } + + ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev)); + if (!ibmr) { + err = -ENOMEM; + goto out_no_cigar; + } + + memset(ibmr, 0, sizeof(*ibmr)); + + ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, + (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE| + IB_ACCESS_REMOTE_ATOMIC), + &pool->fmr_attr); + if (IS_ERR(ibmr->fmr)) { + err = PTR_ERR(ibmr->fmr); + ibmr->fmr = NULL; + printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err); + goto out_no_cigar; + } + + rds_ib_stats_inc(s_ib_rdma_mr_alloc); + return ibmr; + +out_no_cigar: + if (ibmr) { + if (ibmr->fmr) + ib_dealloc_fmr(ibmr->fmr); + kfree(ibmr); + } + atomic_dec(&pool->item_count); + return ERR_PTR(err); +} + +static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr, + struct scatterlist *sg, unsigned int nents) +{ + struct ib_device *dev = rds_ibdev->dev; + struct scatterlist *scat = sg; + u64 io_addr = 0; + u64 *dma_pages; + u32 len; + int page_cnt, sg_dma_len; + int i, j; + int ret; + + sg_dma_len = ib_dma_map_sg(dev, sg, nents, + DMA_BIDIRECTIONAL); + if (unlikely(!sg_dma_len)) { + printk(KERN_WARNING "RDS/IB: dma_map_sg failed!\n"); + return -EBUSY; + } + + len = 0; + page_cnt = 0; + + for (i = 0; i < sg_dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); + u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); + + if (dma_addr & ~PAGE_MASK) { + if (i > 0) + return -EINVAL; + else + ++page_cnt; + } + if ((dma_addr + dma_len) & ~PAGE_MASK) { + if (i < sg_dma_len - 1) + return -EINVAL; + else + ++page_cnt; + } + + len += dma_len; + } + + page_cnt += len >> PAGE_SHIFT; + if (page_cnt > fmr_message_size) + return -EINVAL; + + dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, + rdsibdev_to_node(rds_ibdev)); + if (!dma_pages) + return -ENOMEM; + + page_cnt = 0; + for (i = 0; i < sg_dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); + u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); + + for (j = 0; j < dma_len; j += PAGE_SIZE) + dma_pages[page_cnt++] = + (dma_addr & PAGE_MASK) + j; + } + + ret = ib_map_phys_fmr(ibmr->fmr, + dma_pages, page_cnt, io_addr); + if (ret) + goto out; + + /* Success - we successfully remapped the MR, so we can + * safely tear down the old mapping. 
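rds_ib_map_fmr() above only accepts scatterlists that tile cleanly into pages: the first segment may start mid-page, the last may end mid-page, everything in between must be page aligned, and the resulting page count is checked against fmr_message_size. The standalone sketch below re-states that counting rule; the segment addresses and lengths are invented and the page size is fixed at 4K for the example.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

struct seg { uint64_t dma_addr; unsigned int dma_len; };

static int count_fmr_pages(const struct seg *sg, int n)
{
        uint64_t len = 0;
        int page_cnt = 0, i;

        for (i = 0; i < n; i++) {
                /* only the first segment may start mid-page ... */
                if (sg[i].dma_addr & ~PAGE_MASK) {
                        if (i > 0)
                                return -1;
                        page_cnt++;
                }
                /* ... and only the last segment may end mid-page */
                if ((sg[i].dma_addr + sg[i].dma_len) & ~PAGE_MASK) {
                        if (i < n - 1)
                                return -1;
                        page_cnt++;
                }
                len += sg[i].dma_len;
        }
        return page_cnt + (int)(len >> PAGE_SHIFT);
}

int main(void)
{
        /* 0x1800 bytes starting mid-page, then one aligned page: 3 pages */
        struct seg sg[] = { { 0x10800, 0x1800 }, { 0x12000, 0x1000 } };

        printf("pages needed: %d\n", count_fmr_pages(sg, 2));
        return 0;
}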
*/ + rds_ib_teardown_mr(ibmr); + + ibmr->sg = scat; + ibmr->sg_len = nents; + ibmr->sg_dma_len = sg_dma_len; + ibmr->remap_count++; + + rds_ib_stats_inc(s_ib_rdma_mr_used); + ret = 0; + +out: + kfree(dma_pages); + + return ret; +} + +void rds_ib_sync_mr(void *trans_private, int direction) +{ + struct rds_ib_mr *ibmr = trans_private; + struct rds_ib_device *rds_ibdev = ibmr->device; + + switch (direction) { + case DMA_FROM_DEVICE: + ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg, + ibmr->sg_dma_len, DMA_BIDIRECTIONAL); + break; + case DMA_TO_DEVICE: + ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg, + ibmr->sg_dma_len, DMA_BIDIRECTIONAL); + break; + } +} + +static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) +{ + struct rds_ib_device *rds_ibdev = ibmr->device; + + if (ibmr->sg_dma_len) { + ib_dma_unmap_sg(rds_ibdev->dev, + ibmr->sg, ibmr->sg_len, + DMA_BIDIRECTIONAL); + ibmr->sg_dma_len = 0; + } + + /* Release the s/g list */ + if (ibmr->sg_len) { + unsigned int i; + + for (i = 0; i < ibmr->sg_len; ++i) { + struct page *page = sg_page(&ibmr->sg[i]); + + /* FIXME we need a way to tell a r/w MR + * from a r/o MR */ + BUG_ON(irqs_disabled()); + set_page_dirty(page); + put_page(page); + } + kfree(ibmr->sg); + + ibmr->sg = NULL; + ibmr->sg_len = 0; + } +} + +static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr) +{ + unsigned int pinned = ibmr->sg_len; + + __rds_ib_teardown_mr(ibmr); + if (pinned) { + struct rds_ib_device *rds_ibdev = ibmr->device; + struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + + atomic_sub(pinned, &pool->free_pinned); + } +} + +static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all) +{ + unsigned int item_count; + + item_count = atomic_read(&pool->item_count); + if (free_all) + return item_count; + + return 0; +} + +/* + * given an xlist of mrs, put them all into the list_head for more processing + */ +static void xlist_append_to_list(struct xlist_head *xlist, struct list_head *list) +{ + struct rds_ib_mr *ibmr; + struct xlist_head splice; + struct xlist_head *cur; + struct xlist_head *next; + + splice.next = NULL; + xlist_splice(xlist, &splice); + cur = splice.next; + while (cur) { + next = cur->next; + ibmr = list_entry(cur, struct rds_ib_mr, xlist); + list_add_tail(&ibmr->unmap_list, list); + cur = next; + } +} + +/* + * this takes a list head of mrs and turns it into an xlist of clusters. + * each cluster has an xlist of MR_CLUSTER_SIZE mrs that are ready for + * reuse. + */ +static void list_append_to_xlist(struct rds_ib_mr_pool *pool, + struct list_head *list, struct xlist_head *xlist, + struct xlist_head **tail_ret) +{ + struct rds_ib_mr *ibmr; + struct xlist_head *cur_mr = xlist; + struct xlist_head *tail_mr = NULL; + + list_for_each_entry(ibmr, list, unmap_list) { + tail_mr = &ibmr->xlist; + tail_mr->next = NULL; + cur_mr->next = tail_mr; + cur_mr = tail_mr; + } + *tail_ret = tail_mr; +} + +/* + * Flush our pool of MRs. + * At a minimum, all currently unused MRs are unmapped. + * If the number of MRs allocated exceeds the limit, we also try + * to free as many MRs as needed to get back to this limit. 
+ */ +static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, + int free_all, struct rds_ib_mr **ibmr_ret) +{ + struct rds_ib_mr *ibmr, *next; + struct xlist_head clean_xlist; + struct xlist_head *clean_tail; + LIST_HEAD(unmap_list); + LIST_HEAD(fmr_list); + unsigned long unpinned = 0; + unsigned int nfreed = 0, ncleaned = 0, free_goal; + int ret = 0; + + rds_ib_stats_inc(s_ib_rdma_mr_pool_flush); + + if (ibmr_ret) { + DEFINE_WAIT(wait); + while(!mutex_trylock(&pool->flush_lock)) { + ibmr = rds_ib_reuse_fmr(pool); + if (ibmr) { + *ibmr_ret = ibmr; + finish_wait(&pool->flush_wait, &wait); + goto out_nolock; + } + + prepare_to_wait(&pool->flush_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (xlist_empty(&pool->clean_list)) + schedule(); + + ibmr = rds_ib_reuse_fmr(pool); + if (ibmr) { + *ibmr_ret = ibmr; + finish_wait(&pool->flush_wait, &wait); + goto out_nolock; + } + } + finish_wait(&pool->flush_wait, &wait); + } else + mutex_lock(&pool->flush_lock); + + if (ibmr_ret) { + ibmr = rds_ib_reuse_fmr(pool); + if (ibmr) { + *ibmr_ret = ibmr; + goto out; + } + } + + /* Get the list of all MRs to be dropped. Ordering matters - + * we want to put drop_list ahead of free_list. + */ + xlist_append_to_list(&pool->drop_list, &unmap_list); + xlist_append_to_list(&pool->free_list, &unmap_list); + if (free_all) + xlist_append_to_list(&pool->clean_list, &unmap_list); + + free_goal = rds_ib_flush_goal(pool, free_all); + + if (list_empty(&unmap_list)) + goto out; + + /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ + list_for_each_entry(ibmr, &unmap_list, unmap_list) + list_add(&ibmr->fmr->list, &fmr_list); + + ret = ib_unmap_fmr(&fmr_list); + if (ret) + printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret); + + /* Now we can destroy the DMA mapping and unpin any pages */ + list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) { + unpinned += ibmr->sg_len; + __rds_ib_teardown_mr(ibmr); + if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) { + rds_ib_stats_inc(s_ib_rdma_mr_free); + list_del(&ibmr->unmap_list); + ib_dealloc_fmr(ibmr->fmr); + kfree(ibmr); + nfreed++; + } + ncleaned++; + } + + if (!list_empty(&unmap_list)) { + /* we have to make sure that none of the things we're about + * to put on the clean list would race with other cpus trying + * to pull items off. The xlist would explode if we managed to + * remove something from the clean list and then add it back again + * while another CPU was spinning on that same item in xlist_del_head. + * + * This is pretty unlikely, but just in case wait for an xlist grace period + * here before adding anything back into the clean list. 
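Once the grace period above has passed, the surviving MRs are re-linked into one chain and spliced back onto pool->clean_list with a single xlist_add() call. The real lock-free list lives in net/rds/xlist.h, which is not part of this hunk; the sketch below is only a rough userspace analogy for that one-CAS splice, using GCC atomic builtins and plain pointers instead of xlist_head.

#include <stdio.h>
#include <stddef.h>

struct xnode { struct xnode *next; int val; };

static struct xnode *head;      /* plays the role of pool->clean_list */

static void push_chain(struct xnode *first, struct xnode *last)
{
        struct xnode *old = __atomic_load_n(&head, __ATOMIC_RELAXED);

        /* splice a whole pre-linked chain in one compare-and-swap, like
         * handing the rebuilt clean chain back in rds_ib_flush_mr_pool() */
        do {
                last->next = old;
        } while (!__atomic_compare_exchange_n(&head, &old, first, 0,
                                              __ATOMIC_RELEASE,
                                              __ATOMIC_RELAXED));
}

int main(void)
{
        struct xnode a = { NULL, 1 }, b = { NULL, 2 };

        a.next = &b;            /* pre-link a -> b */
        push_chain(&a, &b);
        for (struct xnode *n = head; n; n = n->next)
                printf("%d\n", n->val);
        return 0;
}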
+ */ + wait_clean_list_grace(); + + list_append_to_xlist(pool, &unmap_list, &clean_xlist, &clean_tail); + if (ibmr_ret) + refill_local(pool, &clean_xlist, ibmr_ret); + + /* refill_local may have emptied our list */ + if (!xlist_empty(&clean_xlist)) + xlist_add(clean_xlist.next, clean_tail, &pool->clean_list); + + } + + atomic_sub(unpinned, &pool->free_pinned); + atomic_sub(ncleaned, &pool->dirty_count); + atomic_sub(nfreed, &pool->item_count); + +out: + mutex_unlock(&pool->flush_lock); + if (waitqueue_active(&pool->flush_wait)) + wake_up(&pool->flush_wait); +out_nolock: + return ret; +} + +static void rds_ib_mr_pool_flush_worker(struct work_struct *work) +{ + struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work); + + rds_ib_flush_mr_pool(pool, 0, NULL); +} + +void rds_ib_free_mr(void *trans_private, int invalidate) +{ + struct rds_ib_mr *ibmr = trans_private; + struct rds_ib_device *rds_ibdev = ibmr->device; + struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + + rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); + + /* Return it to the pool's free list */ + if (ibmr->remap_count >= pool->fmr_attr.max_maps) + xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->drop_list); + else + xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->free_list); + + atomic_add(ibmr->sg_len, &pool->free_pinned); + atomic_inc(&pool->dirty_count); + + /* If we've pinned too many pages, request a flush */ + if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || + atomic_read(&pool->dirty_count) >= pool->max_items / 10) + schedule_delayed_work(&pool->flush_worker, 10); + + if (invalidate) { + if (likely(!in_interrupt())) { + rds_ib_flush_mr_pool(pool, 0, NULL); + } else { + /* We get here if the user created a MR marked + * as use_once and invalidate at the same time. */ + schedule_delayed_work(&pool->flush_worker, 10); + } + } + + rds_ib_dev_put(rds_ibdev); +} + +void rds_ib_flush_mrs(void) +{ + struct rds_ib_device *rds_ibdev; + + down_read(&rds_ib_devices_lock); + list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { + struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + + if (pool) + rds_ib_flush_mr_pool(pool, 0, NULL); + } + up_read(&rds_ib_devices_lock); +} + +void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, + struct rds_sock *rs, u32 *key_ret) +{ + struct rds_ib_device *rds_ibdev; + struct rds_ib_mr *ibmr = NULL; + int ret; + + rds_ibdev = rds_ib_get_device(rs->rs_bound_addr); + if (!rds_ibdev) { + ret = -ENODEV; + goto out; + } + + if (!rds_ibdev->mr_pool) { + ret = -ENODEV; + goto out; + } + + ibmr = rds_ib_alloc_fmr(rds_ibdev); + if (IS_ERR(ibmr)) + return ibmr; + + ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents); + if (ret == 0) + *key_ret = ibmr->fmr->rkey; + else + printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret); + + ibmr->device = rds_ibdev; + rds_ibdev = NULL; + + out: + if (ret) { + if (ibmr) + rds_ib_free_mr(ibmr, 0); + ibmr = ERR_PTR(ret); + } + if (rds_ibdev) + rds_ib_dev_put(rds_ibdev); + return ibmr; +} + diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c new file mode 100644 index 00000000..e29e0ca3 --- /dev/null +++ b/net/rds/ib_recv.c @@ -0,0 +1,1076 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include + +#include "rds.h" +#include "ib.h" + +static struct kmem_cache *rds_ib_incoming_slab; +static struct kmem_cache *rds_ib_frag_slab; +static atomic_t rds_ib_allocation = ATOMIC_INIT(0); + +void rds_ib_recv_init_ring(struct rds_ib_connection *ic) +{ + struct rds_ib_recv_work *recv; + u32 i; + + for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) { + struct ib_sge *sge; + + recv->r_ibinc = NULL; + recv->r_frag = NULL; + + recv->r_wr.next = NULL; + recv->r_wr.wr_id = i; + recv->r_wr.sg_list = recv->r_sge; + recv->r_wr.num_sge = RDS_IB_RECV_SGE; + + sge = &recv->r_sge[0]; + sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = ic->i_mr->lkey; + + sge = &recv->r_sge[1]; + sge->addr = 0; + sge->length = RDS_FRAG_SIZE; + sge->lkey = ic->i_mr->lkey; + } +} + +/* + * The entire 'from' list, including the from element itself, is put on + * to the tail of the 'to' list. 
+ */ +static void list_splice_entire_tail(struct list_head *from, + struct list_head *to) +{ + struct list_head *from_last = from->prev; + + list_splice_tail(from_last, to); + list_add_tail(from_last, to); +} + +static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache) +{ + struct list_head *tmp; + + tmp = xchg(&cache->xfer, NULL); + if (tmp) { + if (cache->ready) + list_splice_entire_tail(tmp, cache->ready); + else + cache->ready = tmp; + } +} + +static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache) +{ + struct rds_ib_cache_head *head; + int cpu; + + cache->percpu = alloc_percpu(struct rds_ib_cache_head); + if (!cache->percpu) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + head = per_cpu_ptr(cache->percpu, cpu); + head->first = NULL; + head->count = 0; + } + cache->xfer = NULL; + cache->ready = NULL; + + return 0; +} + +int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic) +{ + int ret; + + ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs); + if (!ret) { + ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags); + if (ret) + free_percpu(ic->i_cache_incs.percpu); + } + + return ret; +} + +static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache, + struct list_head *caller_list) +{ + struct rds_ib_cache_head *head; + int cpu; + + for_each_possible_cpu(cpu) { + head = per_cpu_ptr(cache->percpu, cpu); + if (head->first) { + list_splice_entire_tail(head->first, caller_list); + head->first = NULL; + } + } + + if (cache->ready) { + list_splice_entire_tail(cache->ready, caller_list); + cache->ready = NULL; + } +} + +void rds_ib_recv_free_caches(struct rds_ib_connection *ic) +{ + struct rds_ib_incoming *inc; + struct rds_ib_incoming *inc_tmp; + struct rds_page_frag *frag; + struct rds_page_frag *frag_tmp; + LIST_HEAD(list); + + rds_ib_cache_xfer_to_ready(&ic->i_cache_incs); + rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list); + free_percpu(ic->i_cache_incs.percpu); + + list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) { + list_del(&inc->ii_cache_entry); + WARN_ON(!list_empty(&inc->ii_frags)); + kmem_cache_free(rds_ib_incoming_slab, inc); + } + + rds_ib_cache_xfer_to_ready(&ic->i_cache_frags); + rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list); + free_percpu(ic->i_cache_frags.percpu); + + list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) { + list_del(&frag->f_cache_entry); + WARN_ON(!list_empty(&frag->f_item)); + kmem_cache_free(rds_ib_frag_slab, frag); + } +} + +/* fwd decl */ +static void rds_ib_recv_cache_put(struct list_head *new_item, + struct rds_ib_refill_cache *cache); +static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache); + + +/* Recycle frag and attached recv buffer f_sg */ +static void rds_ib_frag_free(struct rds_ib_connection *ic, + struct rds_page_frag *frag) +{ + rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg)); + + rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags); +} + +/* Recycle inc after freeing attached frags */ +void rds_ib_inc_free(struct rds_incoming *inc) +{ + struct rds_ib_incoming *ibinc; + struct rds_page_frag *frag; + struct rds_page_frag *pos; + struct rds_ib_connection *ic = inc->i_conn->c_transport_data; + + ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); + + /* Free attached frags */ + list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) { + list_del_init(&frag->f_item); + rds_ib_frag_free(ic, frag); + } + BUG_ON(!list_empty(&ibinc->ii_frags)); + + rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc); + 
rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs); +} + +static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, + struct rds_ib_recv_work *recv) +{ + if (recv->r_ibinc) { + rds_inc_put(&recv->r_ibinc->ii_inc); + recv->r_ibinc = NULL; + } + if (recv->r_frag) { + ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE); + rds_ib_frag_free(ic, recv->r_frag); + recv->r_frag = NULL; + } +} + +void rds_ib_recv_clear_ring(struct rds_ib_connection *ic) +{ + u32 i; + + for (i = 0; i < ic->i_recv_ring.w_nr; i++) + rds_ib_recv_clear_one(ic, &ic->i_recvs[i]); +} + +static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic, + gfp_t slab_mask) +{ + struct rds_ib_incoming *ibinc; + struct list_head *cache_item; + int avail_allocs; + + cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs); + if (cache_item) { + ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry); + } else { + avail_allocs = atomic_add_unless(&rds_ib_allocation, + 1, rds_ib_sysctl_max_recv_allocation); + if (!avail_allocs) { + rds_ib_stats_inc(s_ib_rx_alloc_limit); + return NULL; + } + ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask); + if (!ibinc) { + atomic_dec(&rds_ib_allocation); + return NULL; + } + } + INIT_LIST_HEAD(&ibinc->ii_frags); + rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr); + + return ibinc; +} + +static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic, + gfp_t slab_mask, gfp_t page_mask) +{ + struct rds_page_frag *frag; + struct list_head *cache_item; + int ret; + + cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags); + if (cache_item) { + frag = container_of(cache_item, struct rds_page_frag, f_cache_entry); + } else { + frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask); + if (!frag) + return NULL; + + sg_init_table(&frag->f_sg, 1); + ret = rds_page_remainder_alloc(&frag->f_sg, + RDS_FRAG_SIZE, page_mask); + if (ret) { + kmem_cache_free(rds_ib_frag_slab, frag); + return NULL; + } + } + + INIT_LIST_HEAD(&frag->f_item); + + return frag; +} + +static int rds_ib_recv_refill_one(struct rds_connection *conn, + struct rds_ib_recv_work *recv, int prefill) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct ib_sge *sge; + int ret = -ENOMEM; + gfp_t slab_mask = GFP_NOWAIT; + gfp_t page_mask = GFP_NOWAIT; + + if (prefill) { + slab_mask = GFP_KERNEL; + page_mask = GFP_HIGHUSER; + } + + if (!ic->i_cache_incs.ready) + rds_ib_cache_xfer_to_ready(&ic->i_cache_incs); + if (!ic->i_cache_frags.ready) + rds_ib_cache_xfer_to_ready(&ic->i_cache_frags); + + /* + * ibinc was taken from recv if recv contained the start of a message. + * recvs that were continuations will still have this allocated. + */ + if (!recv->r_ibinc) { + recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask); + if (!recv->r_ibinc) + goto out; + } + + WARN_ON(recv->r_frag); /* leak! 
*/ + recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask); + if (!recv->r_frag) + goto out; + + ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, + 1, DMA_FROM_DEVICE); + WARN_ON(ret != 1); + + sge = &recv->r_sge[0]; + sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); + sge->length = sizeof(struct rds_header); + + sge = &recv->r_sge[1]; + sge->addr = sg_dma_address(&recv->r_frag->f_sg); + sge->length = sg_dma_len(&recv->r_frag->f_sg); + + ret = 0; +out: + return ret; +} + +/* + * This tries to allocate and post unused work requests after making sure that + * they have all the allocations they need to queue received fragments into + * sockets. + * + * -1 is returned if posting fails due to temporary resource exhaustion. + */ +void rds_ib_recv_refill(struct rds_connection *conn, int prefill) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_recv_work *recv; + struct ib_recv_wr *failed_wr; + unsigned int posted = 0; + int ret = 0; + u32 pos; + + while ((prefill || rds_conn_up(conn)) && + rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) { + if (pos >= ic->i_recv_ring.w_nr) { + printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", + pos); + break; + } + + recv = &ic->i_recvs[pos]; + ret = rds_ib_recv_refill_one(conn, recv, prefill); + if (ret) { + break; + } + + /* XXX when can this fail? */ + ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); + rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv, + recv->r_ibinc, sg_page(&recv->r_frag->f_sg), + (long) sg_dma_address(&recv->r_frag->f_sg), ret); + if (ret) { + rds_ib_conn_error(conn, "recv post on " + "%pI4 returned %d, disconnecting and " + "reconnecting\n", &conn->c_faddr, + ret); + break; + } + + posted++; + } + + /* We're doing flow control - update the window. */ + if (ic->i_flowctl && posted) + rds_ib_advertise_credits(conn, posted); + + if (ret) + rds_ib_ring_unalloc(&ic->i_recv_ring, 1); +} + +/* + * We want to recycle several types of recv allocations, like incs and frags. + * To use this, the *_free() function passes in the ptr to a list_head within + * the recyclee, as well as the cache to put it on. + * + * First, we put the memory on a percpu list. When this reaches a certain size, + * We move it to an intermediate non-percpu list in a lockless manner, with some + * xchg/compxchg wizardry. + * + * N.B. Instead of a list_head as the anchor, we use a single pointer, which can + * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and + * list_empty() will return true with one element is actually present. + */ +static void rds_ib_recv_cache_put(struct list_head *new_item, + struct rds_ib_refill_cache *cache) +{ + unsigned long flags; + struct rds_ib_cache_head *chp; + struct list_head *old; + + local_irq_save(flags); + + chp = per_cpu_ptr(cache->percpu, smp_processor_id()); + if (!chp->first) + INIT_LIST_HEAD(new_item); + else /* put on front */ + list_add_tail(new_item, chp->first); + chp->first = new_item; + chp->count++; + + if (chp->count < RDS_IB_RECYCLE_BATCH_COUNT) + goto end; + + /* + * Return our per-cpu first list to the cache's xfer by atomically + * grabbing the current xfer list, appending it to our per-cpu list, + * and then atomically returning that entire list back to the + * cache's xfer list as long as it's still empty. 
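The hand-off described above (and implemented in the do/while just below) treats cache->xfer as a single-pointer mailbox: grab whatever is already queued, merge it into the local chain, then try to publish the merged chain only if the slot is still empty, retrying on a race. The sketch below is a userspace analogy of that pattern using GCC atomic builtins and plain NULL-terminated chains rather than list_heads; it is not the kernel implementation.

#include <stdio.h>
#include <stddef.h>

struct item { struct item *next; };

static struct item *mailbox;            /* cache->xfer analogue */

static struct item *chain_tail(struct item *c)
{
        while (c->next)
                c = c->next;
        return c;
}

static void publish(struct item *mine)
{
        struct item *old;

        do {
                /* grab whatever is currently queued ... */
                old = __atomic_exchange_n(&mailbox, NULL, __ATOMIC_ACQ_REL);
                if (old)
                        chain_tail(mine)->next = old;   /* ... merge it in */

                /* ... then hand the merged chain back, but only if nobody
                 * refilled the (momentarily empty) slot in the meantime */
                old = NULL;
        } while (!__atomic_compare_exchange_n(&mailbox, &old, mine, 0,
                                              __ATOMIC_RELEASE,
                                              __ATOMIC_RELAXED));
}

int main(void)
{
        struct item a = { NULL }, b = { NULL };

        publish(&a);
        publish(&b);            /* b absorbs a and becomes the new head */
        printf("head is %s\n", mailbox == &b ? "b" : "a");
        return 0;
}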
+ */ + do { + old = xchg(&cache->xfer, NULL); + if (old) + list_splice_entire_tail(old, chp->first); + old = cmpxchg(&cache->xfer, NULL, chp->first); + } while (old); + + chp->first = NULL; + chp->count = 0; +end: + local_irq_restore(flags); +} + +static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache) +{ + struct list_head *head = cache->ready; + + if (head) { + if (!list_empty(head)) { + cache->ready = head->next; + list_del_init(head); + } else + cache->ready = NULL; + } + + return head; +} + +int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, + size_t size) +{ + struct rds_ib_incoming *ibinc; + struct rds_page_frag *frag; + struct iovec *iov = first_iov; + unsigned long to_copy; + unsigned long frag_off = 0; + unsigned long iov_off = 0; + int copied = 0; + int ret; + u32 len; + + ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); + frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item); + len = be32_to_cpu(inc->i_hdr.h_len); + + while (copied < size && copied < len) { + if (frag_off == RDS_FRAG_SIZE) { + frag = list_entry(frag->f_item.next, + struct rds_page_frag, f_item); + frag_off = 0; + } + while (iov_off == iov->iov_len) { + iov_off = 0; + iov++; + } + + to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off); + to_copy = min_t(size_t, to_copy, size - copied); + to_copy = min_t(unsigned long, to_copy, len - copied); + + rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag " + "[%p, %u] + %lu\n", + to_copy, iov->iov_base, iov->iov_len, iov_off, + sg_page(&frag->f_sg), frag->f_sg.offset, frag_off); + + /* XXX needs + offset for multiple recvs per page */ + ret = rds_page_copy_to_user(sg_page(&frag->f_sg), + frag->f_sg.offset + frag_off, + iov->iov_base + iov_off, + to_copy); + if (ret) { + copied = ret; + break; + } + + iov_off += to_copy; + frag_off += to_copy; + copied += to_copy; + } + + return copied; +} + +/* ic starts out kzalloc()ed */ +void rds_ib_recv_init_ack(struct rds_ib_connection *ic) +{ + struct ib_send_wr *wr = &ic->i_ack_wr; + struct ib_sge *sge = &ic->i_ack_sge; + + sge->addr = ic->i_ack_dma; + sge->length = sizeof(struct rds_header); + sge->lkey = ic->i_mr->lkey; + + wr->sg_list = sge; + wr->num_sge = 1; + wr->opcode = IB_WR_SEND; + wr->wr_id = RDS_IB_ACK_WR_ID; + wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED; +} + +/* + * You'd think that with reliable IB connections you wouldn't need to ack + * messages that have been received. The problem is that IB hardware generates + * an ack message before it has DMAed the message into memory. This creates a + * potential message loss if the HCA is disabled for any reason between when it + * sends the ack and before the message is DMAed and processed. This is only a + * potential issue if another HCA is available for fail-over. + * + * When the remote host receives our ack they'll free the sent message from + * their send queue. To decrease the latency of this we always send an ack + * immediately after we've received messages. + * + * For simplicity, we only have one ack in flight at a time. This puts + * pressure on senders to have deep enough send queues to absorb the latency of + * a single ack frame being in flight. This might not be good enough. + * + * This is implemented by have a long-lived send_wr and sge which point to a + * statically allocated ack frame. This ack wr does not fall under the ring + * accounting that the tx and rx wrs do. The QP attribute specifically makes + * room for it beyond the ring size. 
Send completion notices its special + * wr_id and avoids working with the ring in that case. + */ +#ifndef KERNEL_HAS_ATOMIC64 +static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, + int ack_required) +{ + unsigned long flags; + + spin_lock_irqsave(&ic->i_ack_lock, flags); + ic->i_ack_next = seq; + if (ack_required) + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + spin_unlock_irqrestore(&ic->i_ack_lock, flags); +} + +static u64 rds_ib_get_ack(struct rds_ib_connection *ic) +{ + unsigned long flags; + u64 seq; + + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + + spin_lock_irqsave(&ic->i_ack_lock, flags); + seq = ic->i_ack_next; + spin_unlock_irqrestore(&ic->i_ack_lock, flags); + + return seq; +} +#else +static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, + int ack_required) +{ + atomic64_set(&ic->i_ack_next, seq); + if (ack_required) { + smp_mb__before_clear_bit(); + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + } +} + +static u64 rds_ib_get_ack(struct rds_ib_connection *ic) +{ + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + smp_mb__after_clear_bit(); + + return atomic64_read(&ic->i_ack_next); +} +#endif + + +static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits) +{ + struct rds_header *hdr = ic->i_ack; + struct ib_send_wr *failed_wr; + u64 seq; + int ret; + + seq = rds_ib_get_ack(ic); + + rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq); + rds_message_populate_header(hdr, 0, 0, 0); + hdr->h_ack = cpu_to_be64(seq); + hdr->h_credit = adv_credits; + rds_message_make_checksum(hdr); + ic->i_ack_queued = jiffies; + + ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr); + if (unlikely(ret)) { + /* Failed to send. Release the WR, and + * force another ACK. + */ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + + rds_ib_stats_inc(s_ib_ack_send_failure); + + rds_ib_conn_error(ic->conn, "sending ack failed\n"); + } else + rds_ib_stats_inc(s_ib_ack_sent); +} + +/* + * There are 3 ways of getting acknowledgements to the peer: + * 1. We call rds_ib_attempt_ack from the recv completion handler + * to send an ACK-only frame. + * However, there can be only one such frame in the send queue + * at any time, so we may have to postpone it. + * 2. When another (data) packet is transmitted while there's + * an ACK in the queue, we piggyback the ACK sequence number + * on the data packet. + * 3. If the ACK WR is done sending, we get called from the + * send queue completion handler, and check whether there's + * another ACK pending (postponed because the WR was on the + * queue). If so, we transmit it. + * + * We maintain 2 variables: + * - i_ack_flags, which keeps track of whether the ACK WR + * is currently in the send queue or not (IB_ACK_IN_FLIGHT) + * - i_ack_next, which is the last sequence number we received + * + * Potentially, send queue and receive queue handlers can run concurrently. + * It would be nice to not have to use a spinlock to synchronize things, + * but the one problem that rules this out is that 64bit updates are + * not atomic on all platforms. Things would be a lot simpler if + * we had atomic64 or maybe cmpxchg64 everywhere. + * + * Reconnecting complicates this picture just slightly. When we + * reconnect, we may be seeing duplicate packets. The peer + * is retransmitting them, because it hasn't seen an ACK for + * them. It is important that we ACK these. 
+ * + * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with + * this flag set *MUST* be acknowledged immediately. + */ + +/* + * When we get here, we're called from the recv queue handler. + * Check whether we ought to transmit an ACK. + */ +void rds_ib_attempt_ack(struct rds_ib_connection *ic) +{ + unsigned int adv_credits; + + if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) + return; + + if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) { + rds_ib_stats_inc(s_ib_ack_send_delayed); + return; + } + + /* Can we get a send credit? */ + if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) { + rds_ib_stats_inc(s_ib_tx_throttle); + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + return; + } + + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + rds_ib_send_ack(ic, adv_credits); +} + +/* + * We get here from the send completion handler, when the + * adapter tells us the ACK frame was sent. + */ +void rds_ib_ack_send_complete(struct rds_ib_connection *ic) +{ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + rds_ib_attempt_ack(ic); +} + +/* + * This is called by the regular xmit code when it wants to piggyback + * an ACK on an outgoing frame. + */ +u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic) +{ + if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) + rds_ib_stats_inc(s_ib_ack_send_piggybacked); + return rds_ib_get_ack(ic); +} + +/* + * It's kind of lame that we're copying from the posted receive pages into + * long-lived bitmaps. We could have posted the bitmaps and rdma written into + * them. But receiving new congestion bitmaps should be a *rare* event, so + * hopefully we won't need to invest that complexity in making it more + * efficient. By copying we can share a simpler core with TCP which has to + * copy. + */ +static void rds_ib_cong_recv(struct rds_connection *conn, + struct rds_ib_incoming *ibinc) +{ + struct rds_cong_map *map; + unsigned int map_off; + unsigned int map_page; + struct rds_page_frag *frag; + unsigned long frag_off; + unsigned long to_copy; + unsigned long copied; + uint64_t uncongested = 0; + void *addr; + + /* catch completely corrupt packets */ + if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) + return; + + map = conn->c_fcong; + map_page = 0; + map_off = 0; + + frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item); + frag_off = 0; + + copied = 0; + + while (copied < RDS_CONG_MAP_BYTES) { + uint64_t *src, *dst; + unsigned int k; + + to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); + BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ + + addr = kmap_atomic(sg_page(&frag->f_sg), KM_SOFTIRQ0); + + src = addr + frag_off; + dst = (void *)map->m_page_addrs[map_page] + map_off; + for (k = 0; k < to_copy; k += 8) { + /* Record ports that became uncongested, ie + * bits that changed from 0 to 1. */ + uncongested |= ~(*src) & *dst; + *dst++ = *src++; + } + kunmap_atomic(addr, KM_SOFTIRQ0); + + copied += to_copy; + + map_off += to_copy; + if (map_off == PAGE_SIZE) { + map_off = 0; + map_page++; + } + + frag_off += to_copy; + if (frag_off == RDS_FRAG_SIZE) { + frag = list_entry(frag->f_item.next, + struct rds_page_frag, f_item); + frag_off = 0; + } + } + + /* the congestion map is in little endian order */ + uncongested = le64_to_cpu(uncongested); + + rds_cong_map_updated(map, uncongested); +} + +/* + * Rings are posted with all the allocations they'll need to queue the + * incoming message to the receiving socket so this can't fail. 
+ * All fragments start with a header, so we can make sure we're not receiving + * garbage, and we can tell a small 8 byte fragment from an ACK frame. + */ +struct rds_ib_ack_state { + u64 ack_next; + u64 ack_recv; + unsigned int ack_required:1; + unsigned int ack_next_valid:1; + unsigned int ack_recv_valid:1; +}; + +static void rds_ib_process_recv(struct rds_connection *conn, + struct rds_ib_recv_work *recv, u32 data_len, + struct rds_ib_ack_state *state) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_incoming *ibinc = ic->i_ibinc; + struct rds_header *ihdr, *hdr; + + /* XXX shut down the connection if port 0,0 are seen? */ + + rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv, + data_len); + + if (data_len < sizeof(struct rds_header)) { + rds_ib_conn_error(conn, "incoming message " + "from %pI4 didn't inclue a " + "header, disconnecting and " + "reconnecting\n", + &conn->c_faddr); + return; + } + data_len -= sizeof(struct rds_header); + + ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; + + /* Validate the checksum. */ + if (!rds_message_verify_checksum(ihdr)) { + rds_ib_conn_error(conn, "incoming message " + "from %pI4 has corrupted header - " + "forcing a reconnect\n", + &conn->c_faddr); + rds_stats_inc(s_recv_drop_bad_checksum); + return; + } + + /* Process the ACK sequence which comes with every packet */ + state->ack_recv = be64_to_cpu(ihdr->h_ack); + state->ack_recv_valid = 1; + + /* Process the credits update if there was one */ + if (ihdr->h_credit) + rds_ib_send_add_credits(conn, ihdr->h_credit); + + if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) { + /* This is an ACK-only packet. The fact that it gets + * special treatment here is that historically, ACKs + * were rather special beasts. + */ + rds_ib_stats_inc(s_ib_ack_received); + + /* + * Usually the frags make their way on to incs and are then freed as + * the inc is freed. We don't go that route, so we have to drop the + * page ref ourselves. We can't just leave the page on the recv + * because that confuses the dma mapping of pages and each recv's use + * of a partial page. + * + * FIXME: Fold this into the code path below. + */ + rds_ib_frag_free(ic, recv->r_frag); + recv->r_frag = NULL; + return; + } + + /* + * If we don't already have an inc on the connection then this + * fragment has a header and starts a message.. copy its header + * into the inc and save the inc so we can hang upcoming fragments + * off its list. 
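+ *
+ * Illustrative example (the length is hypothetical, assuming the usual
+ * 4096 byte RDS_FRAG_SIZE): a message with h_len == 10000 arrives as
+ * three fragments.  The first recv donates its r_ibinc and sets
+ * i_recv_data_rem to 10000; each fragment is then appended to ii_frags
+ * and i_recv_data_rem drops to 5904, 1808 and finally 0, at which point
+ * the complete inc is handed to rds_recv_incoming() (or to
+ * rds_ib_cong_recv() for a congestion bitmap).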
+ */ + if (!ibinc) { + ibinc = recv->r_ibinc; + recv->r_ibinc = NULL; + ic->i_ibinc = ibinc; + + hdr = &ibinc->ii_inc.i_hdr; + memcpy(hdr, ihdr, sizeof(*hdr)); + ic->i_recv_data_rem = be32_to_cpu(hdr->h_len); + + rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc, + ic->i_recv_data_rem, hdr->h_flags); + } else { + hdr = &ibinc->ii_inc.i_hdr; + /* We can't just use memcmp here; fragments of a + * single message may carry different ACKs */ + if (hdr->h_sequence != ihdr->h_sequence || + hdr->h_len != ihdr->h_len || + hdr->h_sport != ihdr->h_sport || + hdr->h_dport != ihdr->h_dport) { + rds_ib_conn_error(conn, + "fragment header mismatch; forcing reconnect\n"); + return; + } + } + + list_add_tail(&recv->r_frag->f_item, &ibinc->ii_frags); + recv->r_frag = NULL; + + if (ic->i_recv_data_rem > RDS_FRAG_SIZE) + ic->i_recv_data_rem -= RDS_FRAG_SIZE; + else { + ic->i_recv_data_rem = 0; + ic->i_ibinc = NULL; + + if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) + rds_ib_cong_recv(conn, ibinc); + else { + rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, + &ibinc->ii_inc, GFP_ATOMIC, + KM_SOFTIRQ0); + state->ack_next = be64_to_cpu(hdr->h_sequence); + state->ack_next_valid = 1; + } + + /* Evaluate the ACK_REQUIRED flag *after* we received + * the complete frame, and after bumping the next_rx + * sequence. */ + if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) { + rds_stats_inc(s_recv_ack_required); + state->ack_required = 1; + } + + rds_inc_put(&ibinc->ii_inc); + } +} + +/* + * Plucking the oldest entry from the ring can be done concurrently with + * the thread refilling the ring. Each ring operation is protected by + * spinlocks and the transient state of refilling doesn't change the + * recording of which entry is oldest. + * + * This relies on IB only calling one cq comp_handler for each cq so that + * there will only be one caller of rds_recv_incoming() per RDS connection. + */ +void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_ib_connection *ic = conn->c_transport_data; + + rdsdebug("conn %p cq %p\n", conn, cq); + + rds_ib_stats_inc(s_ib_rx_cq_call); + + tasklet_schedule(&ic->i_recv_tasklet); +} + +static inline void rds_poll_cq(struct rds_ib_connection *ic, + struct rds_ib_ack_state *state) +{ + struct rds_connection *conn = ic->conn; + struct ib_wc wc; + struct rds_ib_recv_work *recv; + + while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) { + rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", + (unsigned long long)wc.wr_id, wc.status, + rds_ib_wc_status_str(wc.status), wc.byte_len, + be32_to_cpu(wc.ex.imm_data)); + rds_ib_stats_inc(s_ib_rx_cq_event); + + recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; + + ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE); + + /* + * Also process recvs in connecting state because it is possible + * to get a recv completion _before_ the rdmacm ESTABLISHED + * event is processed. + */ + if (wc.status == IB_WC_SUCCESS) { + rds_ib_process_recv(conn, recv, wc.byte_len, state); + } else { + /* We expect errors as the qp is drained during shutdown */ + if (rds_conn_up(conn) || rds_conn_connecting(conn)) + rds_ib_conn_error(conn, "recv completion on %pI4 had " + "status %u (%s), disconnecting and " + "reconnecting\n", &conn->c_faddr, + wc.status, + rds_ib_wc_status_str(wc.status)); + } + + /* + * It's very important that we only free this ring entry if we've truly + * freed the resources allocated to the entry. 
The refilling path can + * leak if we don't. + */ + rds_ib_ring_free(&ic->i_recv_ring, 1); + } +} + +void rds_ib_recv_tasklet_fn(unsigned long data) +{ + struct rds_ib_connection *ic = (struct rds_ib_connection *) data; + struct rds_connection *conn = ic->conn; + struct rds_ib_ack_state state = { 0, }; + + rds_poll_cq(ic, &state); + ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); + rds_poll_cq(ic, &state); + + if (state.ack_next_valid) + rds_ib_set_ack(ic, state.ack_next, state.ack_required); + if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { + rds_send_drop_acked(conn, state.ack_recv, NULL); + ic->i_ack_recv = state.ack_recv; + } + if (rds_conn_up(conn)) + rds_ib_attempt_ack(ic); + + /* If we ever end up with a really empty receive ring, we're + * in deep trouble, as the sender will definitely see RNR + * timeouts. */ + if (rds_ib_ring_empty(&ic->i_recv_ring)) + rds_ib_stats_inc(s_ib_rx_ring_empty); + + if (rds_ib_ring_low(&ic->i_recv_ring)) + rds_ib_recv_refill(conn, 0); +} + +int rds_ib_recv(struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + int ret = 0; + + rdsdebug("conn %p\n", conn); + if (rds_conn_up(conn)) + rds_ib_attempt_ack(ic); + + return ret; +} + +int rds_ib_recv_init(void) +{ + struct sysinfo si; + int ret = -ENOMEM; + + /* Default to 30% of all available RAM for recv memory */ + si_meminfo(&si); + rds_ib_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE; + + rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming", + sizeof(struct rds_ib_incoming), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!rds_ib_incoming_slab) + goto out; + + rds_ib_frag_slab = kmem_cache_create("rds_ib_frag", + sizeof(struct rds_page_frag), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!rds_ib_frag_slab) + kmem_cache_destroy(rds_ib_incoming_slab); + else + ret = 0; +out: + return ret; +} + +void rds_ib_recv_exit(void) +{ + kmem_cache_destroy(rds_ib_incoming_slab); + kmem_cache_destroy(rds_ib_frag_slab); +} diff --git a/net/rds/ib_ring.c b/net/rds/ib_ring.c new file mode 100644 index 00000000..ff97e8ed --- /dev/null +++ b/net/rds/ib_ring.c @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include + +#include "rds.h" +#include "ib.h" + +/* + * Locking for IB rings. + * We assume that allocation is always protected by a mutex + * in the caller (this is a valid assumption for the current + * implementation). + * + * Freeing always happens in an interrupt, and hence only + * races with allocations, but not with other free()s. + * + * The interaction between allocation and freeing is that + * the alloc code has to determine the number of free entries. + * To this end, we maintain two counters; an allocation counter + * and a free counter. Both are allowed to run freely, and wrap + * around. + * The number of used entries is always (alloc_ctr - free_ctr) % NR. + * + * The current implementation makes free_ctr atomic. When the + * caller finds an allocation fails, it should set an "alloc fail" + * bit and retry the allocation. The "alloc fail" bit essentially tells + * the CQ completion handlers to wake it up after freeing some + * more entries. + */ + +/* + * This only happens on shutdown. + */ +DECLARE_WAIT_QUEUE_HEAD(rds_ib_ring_empty_wait); + +void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr) +{ + memset(ring, 0, sizeof(*ring)); + ring->w_nr = nr; + rdsdebug("ring %p nr %u\n", ring, ring->w_nr); +} + +static inline u32 __rds_ib_ring_used(struct rds_ib_work_ring *ring) +{ + u32 diff; + + /* This assumes that atomic_t has at least as many bits as u32 */ + diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr); + BUG_ON(diff > ring->w_nr); + + return diff; +} + +void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr) +{ + /* We only ever get called from the connection setup code, + * prior to creating the QP. */ + BUG_ON(__rds_ib_ring_used(ring)); + ring->w_nr = nr; +} + +static int __rds_ib_ring_empty(struct rds_ib_work_ring *ring) +{ + return __rds_ib_ring_used(ring) == 0; +} + +u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos) +{ + u32 ret = 0, avail; + + avail = ring->w_nr - __rds_ib_ring_used(ring); + + rdsdebug("ring %p val %u next %u free %u\n", ring, val, + ring->w_alloc_ptr, avail); + + if (val && avail) { + ret = min(val, avail); + *pos = ring->w_alloc_ptr; + + ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr; + ring->w_alloc_ctr += ret; + } + + return ret; +} + +void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val) +{ + ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr; + atomic_add(val, &ring->w_free_ctr); + + if (__rds_ib_ring_empty(ring) && + waitqueue_active(&rds_ib_ring_empty_wait)) + wake_up(&rds_ib_ring_empty_wait); +} + +void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val) +{ + ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr; + ring->w_alloc_ctr -= val; +} + +int rds_ib_ring_empty(struct rds_ib_work_ring *ring) +{ + return __rds_ib_ring_empty(ring); +} + +int rds_ib_ring_low(struct rds_ib_work_ring *ring) +{ + return __rds_ib_ring_used(ring) <= (ring->w_nr >> 1); +} + +/* + * returns the oldest alloced ring entry. This will be the next one + * freed. This can't be called if there are none allocated. + */ +u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring) +{ + return ring->w_free_ptr; +} + +/* + * returns the number of completed work requests. 
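+ *
+ * The +1 accounts for the entry wr_id itself refers to.  As an
+ * illustrative example (the ring size is arbitrary): with w_nr == 256,
+ * oldest == 250 and wr_id == 3 the completions wrapped around the ring,
+ * so 256 - 250 + 3 + 1 == 10 entries completed (250..255 and 0..3);
+ * without a wrap, oldest == 10 and wr_id == 14 give 14 - 10 + 1 == 5.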
+ */ + +u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest) +{ + u32 ret; + + if (oldest <= (unsigned long long)wr_id) + ret = (unsigned long long)wr_id - oldest + 1; + else + ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1; + + rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret, + wr_id, oldest); + return ret; +} diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c new file mode 100644 index 00000000..7c4dce8f --- /dev/null +++ b/net/rds/ib_send.c @@ -0,0 +1,1021 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include + +#include "rds.h" +#include "ib.h" + +static char *rds_ib_wc_status_strings[] = { +#define RDS_IB_WC_STATUS_STR(foo) \ + [IB_WC_##foo] = __stringify(IB_WC_##foo) + RDS_IB_WC_STATUS_STR(SUCCESS), + RDS_IB_WC_STATUS_STR(LOC_LEN_ERR), + RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR), + RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR), + RDS_IB_WC_STATUS_STR(LOC_PROT_ERR), + RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR), + RDS_IB_WC_STATUS_STR(MW_BIND_ERR), + RDS_IB_WC_STATUS_STR(BAD_RESP_ERR), + RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR), + RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR), + RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR), + RDS_IB_WC_STATUS_STR(REM_OP_ERR), + RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR), + RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR), + RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR), + RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR), + RDS_IB_WC_STATUS_STR(REM_ABORT_ERR), + RDS_IB_WC_STATUS_STR(INV_EECN_ERR), + RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR), + RDS_IB_WC_STATUS_STR(FATAL_ERR), + RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR), + RDS_IB_WC_STATUS_STR(GENERAL_ERR), +#undef RDS_IB_WC_STATUS_STR +}; + +char *rds_ib_wc_status_str(enum ib_wc_status status) +{ + return rds_str_array(rds_ib_wc_status_strings, + ARRAY_SIZE(rds_ib_wc_status_strings), status); +} + +/* + * Convert IB-specific error message to RDS error message and call core + * completion handler. 
+ */ +static void rds_ib_send_complete(struct rds_message *rm, + int wc_status, + void (*complete)(struct rds_message *rm, int status)) +{ + int notify_status; + + switch (wc_status) { + case IB_WC_WR_FLUSH_ERR: + return; + + case IB_WC_SUCCESS: + notify_status = RDS_RDMA_SUCCESS; + break; + + case IB_WC_REM_ACCESS_ERR: + notify_status = RDS_RDMA_REMOTE_ERROR; + break; + + default: + notify_status = RDS_RDMA_OTHER_ERROR; + break; + } + complete(rm, notify_status); +} + +static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, + struct rm_data_op *op, + int wc_status) +{ + if (op->op_nents) + ib_dma_unmap_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, + DMA_TO_DEVICE); +} + +static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, + struct rm_rdma_op *op, + int wc_status) +{ + if (op->op_mapped) { + ib_dma_unmap_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, + op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + op->op_mapped = 0; + } + + /* If the user asked for a completion notification on this + * message, we can implement three different semantics: + * 1. Notify when we received the ACK on the RDS message + * that was queued with the RDMA. This provides reliable + * notification of RDMA status at the expense of a one-way + * packet delay. + * 2. Notify when the IB stack gives us the completion event for + * the RDMA operation. + * 3. Notify when the IB stack gives us the completion event for + * the accompanying RDS messages. + * Here, we implement approach #3. To implement approach #2, + * we would need to take an event for the rdma WR. To implement #1, + * don't call rds_rdma_send_complete at all, and fall back to the notify + * handling in the ACK processing code. + * + * Note: There's no need to explicitly sync any RDMA buffers using + * ib_dma_sync_sg_for_cpu - the completion for the RDMA + * operation itself unmapped the RDMA buffers, which takes care + * of synching. + */ + rds_ib_send_complete(container_of(op, struct rds_message, rdma), + wc_status, rds_rdma_send_complete); + + if (op->op_write) + rds_stats_add(s_send_rdma_bytes, op->op_bytes); + else + rds_stats_add(s_recv_rdma_bytes, op->op_bytes); +} + +static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic, + struct rm_atomic_op *op, + int wc_status) +{ + /* unmap atomic recvbuf */ + if (op->op_mapped) { + ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1, + DMA_FROM_DEVICE); + op->op_mapped = 0; + } + + rds_ib_send_complete(container_of(op, struct rds_message, atomic), + wc_status, rds_atomic_send_complete); + + if (op->op_type == RDS_ATOMIC_TYPE_CSWP) + rds_ib_stats_inc(s_ib_atomic_cswp); + else + rds_ib_stats_inc(s_ib_atomic_fadd); +} + +/* + * Unmap the resources associated with a struct send_work. + * + * Returns the rm for no good reason other than it is unobtainable + * other than by switching on wr.opcode, currently, and the caller, + * the event handler, needs it. 
+ */ +static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic, + struct rds_ib_send_work *send, + int wc_status) +{ + struct rds_message *rm = NULL; + + /* In the error case, wc.opcode sometimes contains garbage */ + switch (send->s_wr.opcode) { + case IB_WR_SEND: + if (send->s_op) { + rm = container_of(send->s_op, struct rds_message, data); + rds_ib_send_unmap_data(ic, send->s_op, wc_status); + } + break; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_READ: + if (send->s_op) { + rm = container_of(send->s_op, struct rds_message, rdma); + rds_ib_send_unmap_rdma(ic, send->s_op, wc_status); + } + break; + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_ATOMIC_CMP_AND_SWP: + if (send->s_op) { + rm = container_of(send->s_op, struct rds_message, atomic); + rds_ib_send_unmap_atomic(ic, send->s_op, wc_status); + } + break; + default: + if (printk_ratelimit()) + printk(KERN_NOTICE + "RDS/IB: %s: unexpected opcode 0x%x in WR!\n", + __func__, send->s_wr.opcode); + break; + } + + send->s_wr.opcode = 0xdead; + + return rm; +} + +void rds_ib_send_init_ring(struct rds_ib_connection *ic) +{ + struct rds_ib_send_work *send; + u32 i; + + for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { + struct ib_sge *sge; + + send->s_op = NULL; + + send->s_wr.wr_id = i; + send->s_wr.sg_list = send->s_sge; + send->s_wr.ex.imm_data = 0; + + sge = &send->s_sge[0]; + sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = ic->i_mr->lkey; + + send->s_sge[1].lkey = ic->i_mr->lkey; + } +} + +void rds_ib_send_clear_ring(struct rds_ib_connection *ic) +{ + struct rds_ib_send_work *send; + u32 i; + + for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { + if (send->s_op && send->s_wr.opcode != 0xdead) + rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR); + } +} + +/* + * The only fast path caller always has a non-zero nr, so we don't + * bother testing nr before performing the atomic sub. + */ +static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr) +{ + if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) && + waitqueue_active(&rds_ib_ring_empty_wait)) + wake_up(&rds_ib_ring_empty_wait); + BUG_ON(atomic_read(&ic->i_signaled_sends) < 0); +} + +/* + * The _oldest/_free ring operations here race cleanly with the alloc/unalloc + * operations performed in the send path. As the sender allocs and potentially + * unallocs the next free entry in the ring it doesn't alter which is + * the next to be freed, which is what this is concerned with. 
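+ *
+ * Informal sketch of why this is safe (not a formal argument): the send
+ * path only moves w_alloc_ptr/w_alloc_ctr via rds_ib_ring_alloc() and
+ * rds_ib_ring_unalloc(), while w_free_ptr and the atomic w_free_ctr are
+ * only advanced here via rds_ib_ring_free().  rds_ib_ring_oldest() just
+ * reads w_free_ptr, so a racing alloc or unalloc can never change which
+ * entry this handler sees as the oldest.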
+ */ +void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_message *rm = NULL; + struct ib_wc wc; + struct rds_ib_send_work *send; + u32 completed; + u32 oldest; + u32 i = 0; + int ret; + int nr_sig = 0; + + rdsdebug("cq %p conn %p\n", cq, conn); + rds_ib_stats_inc(s_ib_tx_cq_call); + ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + if (ret) + rdsdebug("ib_req_notify_cq send failed: %d\n", ret); + + while (ib_poll_cq(cq, 1, &wc) > 0) { + rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", + (unsigned long long)wc.wr_id, wc.status, + rds_ib_wc_status_str(wc.status), wc.byte_len, + be32_to_cpu(wc.ex.imm_data)); + rds_ib_stats_inc(s_ib_tx_cq_event); + + if (wc.wr_id == RDS_IB_ACK_WR_ID) { + if (ic->i_ack_queued + HZ/2 < jiffies) + rds_ib_stats_inc(s_ib_tx_stalled); + rds_ib_ack_send_complete(ic); + continue; + } + + oldest = rds_ib_ring_oldest(&ic->i_send_ring); + + completed = rds_ib_ring_completed(&ic->i_send_ring, wc.wr_id, oldest); + + for (i = 0; i < completed; i++) { + send = &ic->i_sends[oldest]; + if (send->s_wr.send_flags & IB_SEND_SIGNALED) + nr_sig++; + + rm = rds_ib_send_unmap_op(ic, send, wc.status); + + if (send->s_queued + HZ/2 < jiffies) + rds_ib_stats_inc(s_ib_tx_stalled); + + if (send->s_op) { + if (send->s_op == rm->m_final_op) { + /* If anyone waited for this message to get flushed out, wake + * them up now */ + rds_message_unmapped(rm); + } + rds_message_put(rm); + send->s_op = NULL; + } + + oldest = (oldest + 1) % ic->i_send_ring.w_nr; + } + + rds_ib_ring_free(&ic->i_send_ring, completed); + rds_ib_sub_signaled(ic, nr_sig); + nr_sig = 0; + + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || + test_bit(0, &conn->c_map_queued)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + /* We expect errors as the qp is drained during shutdown */ + if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { + rds_ib_conn_error(conn, "send completion on %pI4 had status " + "%u (%s), disconnecting and reconnecting\n", + &conn->c_faddr, wc.status, + rds_ib_wc_status_str(wc.status)); + } + } +} + +/* + * This is the main function for allocating credits when sending + * messages. + * + * Conceptually, we have two counters: + * - send credits: this tells us how many WRs we're allowed + * to submit without overruning the receiver's queue. For + * each SEND WR we post, we decrement this by one. + * + * - posted credits: this tells us how many WRs we recently + * posted to the receive queue. This value is transferred + * to the peer as a "credit update" in a RDS header field. + * Every time we transmit credits to the peer, we subtract + * the amount of transferred credits from this counter. + * + * It is essential that we avoid situations where both sides have + * exhausted their send credits, and are unable to send new credits + * to the peer. We achieve this by requiring that we send at least + * one credit update to the peer before exhausting our credits. + * When new credits arrive, we subtract one credit that is withheld + * until we've posted new buffers and are ready to transmit these + * credits (see rds_ib_send_add_credits below). + * + * The RDS send code is essentially single-threaded; rds_send_xmit + * sets RDS_IN_XMIT to ensure exclusive access to the send ring. + * However, the ACK sending code is independent and can race with + * message SENDs. 
+ * + * In the send path, we need to update the counters for send credits + * and the counter of posted buffers atomically - when we use the + * last available credit, we cannot allow another thread to race us + * and grab the posted credits counter. Hence, we have to use a + * spinlock to protect the credit counter, or use atomics. + * + * Spinlocks shared between the send and the receive path are bad, + * because they create unnecessary delays. An early implementation + * using a spinlock showed a 5% degradation in throughput at some + * loads. + * + * This implementation avoids spinlocks completely, putting both + * counters into a single atomic, and updating that atomic using + * atomic_add (in the receive path, when receiving fresh credits), + * and using atomic_cmpxchg when updating the two counters. + */ +int rds_ib_send_grab_credits(struct rds_ib_connection *ic, + u32 wanted, u32 *adv_credits, int need_posted, int max_posted) +{ + unsigned int avail, posted, got = 0, advertise; + long oldval, newval; + + *adv_credits = 0; + if (!ic->i_flowctl) + return wanted; + +try_again: + advertise = 0; + oldval = newval = atomic_read(&ic->i_credits); + posted = IB_GET_POST_CREDITS(oldval); + avail = IB_GET_SEND_CREDITS(oldval); + + rdsdebug("rds_ib_send_grab_credits(%u): credits=%u posted=%u\n", + wanted, avail, posted); + + /* The last credit must be used to send a credit update. */ + if (avail && !posted) + avail--; + + if (avail < wanted) { + struct rds_connection *conn = ic->i_cm_id->context; + + /* Oops, there aren't that many credits left! */ + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + got = avail; + } else { + /* Sometimes you get what you want, lalala. */ + got = wanted; + } + newval -= IB_SET_SEND_CREDITS(got); + + /* + * If need_posted is non-zero, then the caller wants + * the posted regardless of whether any send credits are + * available. + */ + if (posted && (got || need_posted)) { + advertise = min_t(unsigned int, posted, max_posted); + newval -= IB_SET_POST_CREDITS(advertise); + } + + /* Finally bill everything */ + if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval) + goto try_again; + + *adv_credits = advertise; + return got; +} + +void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + if (credits == 0) + return; + + rdsdebug("rds_ib_send_add_credits(%u): current=%u%s\n", + credits, + IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)), + test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : ""); + + atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits); + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384); + + rds_ib_stats_inc(s_ib_rx_credit_updates); +} + +void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + if (posted == 0) + return; + + atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits); + + /* Decide whether to send an update to the peer now. + * If we would send a credit update for every single buffer we + * post, we would end up with an ACK storm (ACK arrives, + * consumes buffer, we refill the ring, send ACK to remote + * advertising the newly posted buffer... ad inf) + * + * Performance pretty much depends on how often we send + * credit updates - too frequent updates mean lots of ACKs. 
+ * Too infrequent updates, and the peer will run out of + * credits and has to throttle. + * For the time being, 16 seems to be a good compromise. + */ + if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16) + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); +} + +static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic, + struct rds_ib_send_work *send, + bool notify) +{ + /* + * We want to delay signaling completions just enough to get + * the batching benefits but not so much that we create dead time + * on the wire. + */ + if (ic->i_unsignaled_wrs-- == 0 || notify) { + ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; + send->s_wr.send_flags |= IB_SEND_SIGNALED; + return 1; + } + return 0; +} + +/* + * This can be called multiple times for a given message. The first time + * we see a message we map its scatterlist into the IB device so that + * we can provide that mapped address to the IB scatter gather entries + * in the IB work requests. We translate the scatterlist into a series + * of work requests that fragment the message. These work requests complete + * in order so we pass ownership of the message to the completion handler + * once we send the final fragment. + * + * The RDS core uses the c_send_lock to only enter this function once + * per connection. This makes sure that the tx ring alloc/unalloc pairs + * don't get out of sync and confuse the ring. + */ +int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct ib_device *dev = ic->i_cm_id->device; + struct rds_ib_send_work *send = NULL; + struct rds_ib_send_work *first; + struct rds_ib_send_work *prev; + struct ib_send_wr *failed_wr; + struct scatterlist *scat; + u32 pos; + u32 i; + u32 work_alloc; + u32 credit_alloc = 0; + u32 posted; + u32 adv_credits = 0; + int send_flags = 0; + int bytes_sent = 0; + int ret; + int flow_controlled = 0; + int nr_sig = 0; + + BUG_ON(off % RDS_FRAG_SIZE); + BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); + + /* Do not send cong updates to IB loopback */ + if (conn->c_loopback + && rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) { + rds_cong_map_updated(conn->c_fcong, ~(u64) 0); + scat = &rm->data.op_sg[sg]; + ret = sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; + ret = min_t(int, ret, scat->length - conn->c_xmit_data_off); + return ret; + } + + /* FIXME we may overallocate here */ + if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) + i = 1; + else + i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE); + + work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); + if (work_alloc == 0) { + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + rds_ib_stats_inc(s_ib_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + if (ic->i_flowctl) { + credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); + adv_credits += posted; + if (credit_alloc < work_alloc) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); + work_alloc = credit_alloc; + flow_controlled = 1; + } + if (work_alloc == 0) { + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + rds_ib_stats_inc(s_ib_tx_throttle); + ret = -ENOMEM; + goto out; + } + } + + /* map the message the first time we see it */ + if (!ic->i_data_op) { + if (rm->data.op_nents) { + rm->data.op_count = ib_dma_map_sg(dev, + rm->data.op_sg, + rm->data.op_nents, + DMA_TO_DEVICE); + rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count); + if 
(rm->data.op_count == 0) { + rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + } else { + rm->data.op_count = 0; + } + + rds_message_addref(rm); + ic->i_data_op = &rm->data; + + /* Finalize the header */ + if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED; + if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED; + + /* If it has a RDMA op, tell the peer we did it. This is + * used by the peer to release use-once RDMA MRs. */ + if (rm->rdma.op_active) { + struct rds_ext_header_rdma ext_hdr; + + ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); + rds_message_add_extension(&rm->m_inc.i_hdr, + RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); + } + if (rm->m_rdma_cookie) { + rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr, + rds_rdma_cookie_key(rm->m_rdma_cookie), + rds_rdma_cookie_offset(rm->m_rdma_cookie)); + } + + /* Note - rds_ib_piggyb_ack clears the ACK_REQUIRED bit, so + * we should not do this unless we have a chance of at least + * sticking the header into the send ring. Which is why we + * should call rds_ib_ring_alloc first. */ + rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_ib_piggyb_ack(ic)); + rds_message_make_checksum(&rm->m_inc.i_hdr); + + /* + * Update adv_credits since we reset the ACK_REQUIRED bit. + */ + if (ic->i_flowctl) { + rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); + adv_credits += posted; + BUG_ON(adv_credits > 255); + } + } + + /* Sometimes you want to put a fence between an RDMA + * READ and the following SEND. + * We could either do this all the time + * or when requested by the user. Right now, we let + * the application choose. + */ + if (rm->rdma.op_active && rm->rdma.op_fence) + send_flags = IB_SEND_FENCE; + + /* Each frag gets a header. Msgs may be 0 bytes */ + send = &ic->i_sends[pos]; + first = send; + prev = NULL; + scat = &ic->i_data_op->op_sg[sg]; + i = 0; + do { + unsigned int len = 0; + + /* Set up the header */ + send->s_wr.send_flags = send_flags; + send->s_wr.opcode = IB_WR_SEND; + send->s_wr.num_sge = 1; + send->s_wr.next = NULL; + send->s_queued = jiffies; + send->s_op = NULL; + + send->s_sge[0].addr = ic->i_send_hdrs_dma + + (pos * sizeof(struct rds_header)); + send->s_sge[0].length = sizeof(struct rds_header); + + memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); + + /* Set up the data, if present */ + if (i < work_alloc + && scat != &rm->data.op_sg[rm->data.op_count]) { + len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); + send->s_wr.num_sge = 2; + + send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off; + send->s_sge[1].length = len; + + bytes_sent += len; + off += len; + if (off == ib_sg_dma_len(dev, scat)) { + scat++; + off = 0; + } + } + + rds_ib_set_wr_signal_state(ic, send, 0); + + /* + * Always signal the last one if we're stopping due to flow control. 
+ */ + if (ic->i_flowctl && flow_controlled && i == (work_alloc-1)) + send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + + if (send->s_wr.send_flags & IB_SEND_SIGNALED) + nr_sig++; + + rdsdebug("send %p wr %p num_sge %u next %p\n", send, + &send->s_wr, send->s_wr.num_sge, send->s_wr.next); + + if (ic->i_flowctl && adv_credits) { + struct rds_header *hdr = &ic->i_send_hdrs[pos]; + + /* add credit and redo the header checksum */ + hdr->h_credit = adv_credits; + rds_message_make_checksum(hdr); + adv_credits = 0; + rds_ib_stats_inc(s_ib_tx_credit_updates); + } + + if (prev) + prev->s_wr.next = &send->s_wr; + prev = send; + + pos = (pos + 1) % ic->i_send_ring.w_nr; + send = &ic->i_sends[pos]; + i++; + + } while (i < work_alloc + && scat != &rm->data.op_sg[rm->data.op_count]); + + /* Account the RDS header in the number of bytes we sent, but just once. + * The caller has no concept of fragmentation. */ + if (hdr_off == 0) + bytes_sent += sizeof(struct rds_header); + + /* if we finished the message then send completion owns it */ + if (scat == &rm->data.op_sg[rm->data.op_count]) { + prev->s_op = ic->i_data_op; + prev->s_wr.send_flags |= IB_SEND_SOLICITED; + ic->i_data_op = NULL; + } + + /* Put back wrs & credits we didn't use */ + if (i < work_alloc) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); + work_alloc = i; + } + if (ic->i_flowctl && i < credit_alloc) + rds_ib_send_add_credits(conn, credit_alloc - i); + + if (nr_sig) + atomic_add(nr_sig, &ic->i_signaled_sends); + + /* XXX need to worry about failed_wr and partial sends. */ + failed_wr = &first->s_wr; + ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); + rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, + first, &first->s_wr, ret, failed_wr); + BUG_ON(failed_wr != &first->s_wr); + if (ret) { + printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_sub_signaled(ic, nr_sig); + if (prev->s_op) { + ic->i_data_op = prev->s_op; + prev->s_op = NULL; + } + + rds_ib_conn_error(ic->conn, "ib_post_send failed\n"); + goto out; + } + + ret = bytes_sent; +out: + BUG_ON(adv_credits); + return ret; +} + +/* + * Issue atomic operation. + * A simplified version of the rdma case, we always map 1 SG, and + * only 8 bytes, for the return value from the atomic operation. 
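+ *
+ * Illustrative sketch (field names as used below): for a CSWP op the WR
+ * carries op_m_cswp.compare/swap plus their masks, for FADD it carries
+ * op_m_fadd.add and its nocarry mask (swap and swap_mask stay zero); in
+ * both cases the HCA hands back the prior 64-bit value at op_remote_addr,
+ * which lands in the single scatterlist entry mapped DMA_FROM_DEVICE
+ * below.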
+ */ +int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_send_work *send = NULL; + struct ib_send_wr *failed_wr; + struct rds_ib_device *rds_ibdev; + u32 pos; + u32 work_alloc; + int ret; + int nr_sig = 0; + + rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); + + work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos); + if (work_alloc != 1) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_stats_inc(s_ib_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + /* address of send request in ring */ + send = &ic->i_sends[pos]; + send->s_queued = jiffies; + + if (op->op_type == RDS_ATOMIC_TYPE_CSWP) { + send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP; + send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare; + send->s_wr.wr.atomic.swap = op->op_m_cswp.swap; + send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask; + send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask; + } else { /* FADD */ + send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD; + send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add; + send->s_wr.wr.atomic.swap = 0; + send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask; + send->s_wr.wr.atomic.swap_mask = 0; + } + nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify); + send->s_wr.num_sge = 1; + send->s_wr.next = NULL; + send->s_wr.wr.atomic.remote_addr = op->op_remote_addr; + send->s_wr.wr.atomic.rkey = op->op_rkey; + send->s_op = op; + rds_message_addref(container_of(send->s_op, struct rds_message, atomic)); + + /* map 8 byte retval buffer to the device */ + ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE); + rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret); + if (ret != 1) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); + ret = -ENOMEM; /* XXX ? 
*/ + goto out; + } + + /* Convert our struct scatterlist to struct ib_sge */ + send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg); + send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg); + send->s_sge[0].lkey = ic->i_mr->lkey; + + rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr, + send->s_sge[0].addr, send->s_sge[0].length); + + if (nr_sig) + atomic_add(nr_sig, &ic->i_signaled_sends); + + failed_wr = &send->s_wr; + ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr); + rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic, + send, &send->s_wr, ret, failed_wr); + BUG_ON(failed_wr != &send->s_wr); + if (ret) { + printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_sub_signaled(ic, nr_sig); + goto out; + } + + if (unlikely(failed_wr != &send->s_wr)) { + printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret); + BUG_ON(failed_wr != &send->s_wr); + } + +out: + return ret; +} + +int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_send_work *send = NULL; + struct rds_ib_send_work *first; + struct rds_ib_send_work *prev; + struct ib_send_wr *failed_wr; + struct scatterlist *scat; + unsigned long len; + u64 remote_addr = op->op_remote_addr; + u32 max_sge = ic->rds_ibdev->max_sge; + u32 pos; + u32 work_alloc; + u32 i; + u32 j; + int sent; + int ret; + int num_sge; + int nr_sig = 0; + + /* map the op the first time we see it */ + if (!op->op_mapped) { + op->op_count = ib_dma_map_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, (op->op_write) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); + if (op->op_count == 0) { + rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + + op->op_mapped = 1; + } + + /* + * Instead of knowing how to return a partial rdma read/write we insist that there + * be enough work requests to send the entire message. + */ + i = ceil(op->op_count, max_sge); + + work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); + if (work_alloc != i) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_stats_inc(s_ib_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + send = &ic->i_sends[pos]; + first = send; + prev = NULL; + scat = &op->op_sg[0]; + sent = 0; + num_sge = op->op_count; + + for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { + send->s_wr.send_flags = 0; + send->s_queued = jiffies; + send->s_op = NULL; + + nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify); + + send->s_wr.opcode = op->op_write ? 
IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; + send->s_wr.wr.rdma.remote_addr = remote_addr; + send->s_wr.wr.rdma.rkey = op->op_rkey; + + if (num_sge > max_sge) { + send->s_wr.num_sge = max_sge; + num_sge -= max_sge; + } else { + send->s_wr.num_sge = num_sge; + } + + send->s_wr.next = NULL; + + if (prev) + prev->s_wr.next = &send->s_wr; + + for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { + len = ib_sg_dma_len(ic->i_cm_id->device, scat); + send->s_sge[j].addr = + ib_sg_dma_address(ic->i_cm_id->device, scat); + send->s_sge[j].length = len; + send->s_sge[j].lkey = ic->i_mr->lkey; + + sent += len; + rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); + + remote_addr += len; + scat++; + } + + rdsdebug("send %p wr %p num_sge %u next %p\n", send, + &send->s_wr, send->s_wr.num_sge, send->s_wr.next); + + prev = send; + if (++send == &ic->i_sends[ic->i_send_ring.w_nr]) + send = ic->i_sends; + } + + /* give a reference to the last op */ + if (scat == &op->op_sg[op->op_count]) { + prev->s_op = op; + rds_message_addref(container_of(op, struct rds_message, rdma)); + } + + if (i < work_alloc) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); + work_alloc = i; + } + + if (nr_sig) + atomic_add(nr_sig, &ic->i_signaled_sends); + + failed_wr = &first->s_wr; + ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); + rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, + first, &first->s_wr, ret, failed_wr); + BUG_ON(failed_wr != &first->s_wr); + if (ret) { + printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_sub_signaled(ic, nr_sig); + goto out; + } + + if (unlikely(failed_wr != &first->s_wr)) { + printk(KERN_WARNING "RDS/IB: ib_post_send() rc=%d, but failed_wqe updated!\n", ret); + BUG_ON(failed_wr != &first->s_wr); + } + + +out: + return ret; +} + +void rds_ib_xmit_complete(struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + /* We may have a pending ACK or window update we were unable + * to send previously (due to flow control). Try again. */ + rds_ib_attempt_ack(ic); +} diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c new file mode 100644 index 00000000..2d5965d6 --- /dev/null +++ b/net/rds/ib_stats.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" +#include "ib.h" + +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); + +static const char *const rds_ib_stat_names[] = { + "ib_connect_raced", + "ib_listen_closed_stale", + "ib_tx_cq_call", + "ib_tx_cq_event", + "ib_tx_ring_full", + "ib_tx_throttle", + "ib_tx_sg_mapping_failure", + "ib_tx_stalled", + "ib_tx_credit_updates", + "ib_rx_cq_call", + "ib_rx_cq_event", + "ib_rx_ring_empty", + "ib_rx_refill_from_cq", + "ib_rx_refill_from_thread", + "ib_rx_alloc_limit", + "ib_rx_credit_updates", + "ib_ack_sent", + "ib_ack_send_failure", + "ib_ack_send_delayed", + "ib_ack_send_piggybacked", + "ib_ack_received", + "ib_rdma_mr_alloc", + "ib_rdma_mr_free", + "ib_rdma_mr_used", + "ib_rdma_mr_pool_flush", + "ib_rdma_mr_pool_wait", + "ib_rdma_mr_pool_depleted", + "ib_atomic_cswp", + "ib_atomic_fadd", +}; + +unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail) +{ + struct rds_ib_statistics stats = {0, }; + uint64_t *src; + uint64_t *sum; + size_t i; + int cpu; + + if (avail < ARRAY_SIZE(rds_ib_stat_names)) + goto out; + + for_each_online_cpu(cpu) { + src = (uint64_t *)&(per_cpu(rds_ib_stats, cpu)); + sum = (uint64_t *)&stats; + for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) + *(sum++) += *(src++); + } + + rds_stats_info_copy(iter, (uint64_t *)&stats, rds_ib_stat_names, + ARRAY_SIZE(rds_ib_stat_names)); +out: + return ARRAY_SIZE(rds_ib_stat_names); +} diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c new file mode 100644 index 00000000..1253b006 --- /dev/null +++ b/net/rds/ib_sysctl.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include +#include +#include + +#include "ib.h" + +static struct ctl_table_header *rds_ib_sysctl_hdr; + +unsigned long rds_ib_sysctl_max_send_wr = RDS_IB_DEFAULT_SEND_WR; +unsigned long rds_ib_sysctl_max_recv_wr = RDS_IB_DEFAULT_RECV_WR; +unsigned long rds_ib_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE; +static unsigned long rds_ib_sysctl_max_wr_min = 1; +/* hardware will fail CQ creation long before this */ +static unsigned long rds_ib_sysctl_max_wr_max = (u32)~0; + +unsigned long rds_ib_sysctl_max_unsig_wrs = 16; +static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1; +static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64; + +/* + * This sysctl does nothing. + * + * Backwards compatibility with RDS 3.0 wire protocol + * disables initial FC credit exchange. + * If it's ever possible to drop 3.0 support, + * setting this to 1 and moving init/refill of send/recv + * rings from ib_cm_connect_complete() back into ib_setup_qp() + * will cause credits to be added before protocol negotiation. + */ +unsigned int rds_ib_sysctl_flow_control = 0; + +static ctl_table rds_ib_sysctl_table[] = { + { + .procname = "max_send_wr", + .data = &rds_ib_sysctl_max_send_wr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &rds_ib_sysctl_max_wr_min, + .extra2 = &rds_ib_sysctl_max_wr_max, + }, + { + .procname = "max_recv_wr", + .data = &rds_ib_sysctl_max_recv_wr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &rds_ib_sysctl_max_wr_min, + .extra2 = &rds_ib_sysctl_max_wr_max, + }, + { + .procname = "max_unsignaled_wr", + .data = &rds_ib_sysctl_max_unsig_wrs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &rds_ib_sysctl_max_unsig_wr_min, + .extra2 = &rds_ib_sysctl_max_unsig_wr_max, + }, + { + .procname = "max_recv_allocation", + .data = &rds_ib_sysctl_max_recv_allocation, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "flow_control", + .data = &rds_ib_sysctl_flow_control, + .maxlen = sizeof(rds_ib_sysctl_flow_control), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static struct ctl_path rds_ib_sysctl_path[] = { + { .procname = "net", }, + { .procname = "rds", }, + { .procname = "ib", }, + { } +}; + +void rds_ib_sysctl_exit(void) +{ + if (rds_ib_sysctl_hdr) + unregister_sysctl_table(rds_ib_sysctl_hdr); +} + +int rds_ib_sysctl_init(void) +{ + rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table); + if (!rds_ib_sysctl_hdr) + return -ENOMEM; + return 0; +} diff --git a/net/rds/info.c b/net/rds/info.c new file mode 100644 index 00000000..4fdf1b6e --- /dev/null +++ b/net/rds/info.c @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
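The ctl_table/ctl_path pair above should surface these tunables as files such as /proc/sys/net/rds/ib/max_send_wr (the exact paths are assumed from the ctl_path array). A stripped-down registration of a single hypothetical knob using the same era's sysctl API:

static unsigned long example_tunable = 16;
static unsigned long example_min = 1;
static unsigned long example_max = 1024;
static struct ctl_table_header *example_hdr;

static ctl_table example_table[] = {
	{
		.procname	= "example_tunable",
		.data		= &example_tunable,
		.maxlen		= sizeof(unsigned long),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &example_min,
		.extra2		= &example_max,
	},
	{ }
};

static struct ctl_path example_path[] = {
	{ .procname = "net", },
	{ .procname = "rds", },
	{ .procname = "example", },
	{ }
};

static int example_sysctl_init(void)
{
	example_hdr = register_sysctl_paths(example_path, example_table);
	return example_hdr ? 0 : -ENOMEM;
}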
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include + +#include "rds.h" + +/* + * This file implements a getsockopt() call which copies a set of fixed + * sized structs into a user-specified buffer as a means of providing + * read-only information about RDS. + * + * For a given information source there are a given number of fixed sized + * structs at a given time. The structs are only copied if the user-specified + * buffer is big enough. The destination pages that make up the buffer + * are pinned for the duration of the copy. + * + * This gives us the following benefits: + * + * - simple implementation, no copy "position" across multiple calls + * - consistent snapshot of an info source + * - atomic copy works well with whatever locking info source has + * - one portable tool to get rds info across implementations + * - long-lived tool can get info without allocating + * + * at the following costs: + * + * - info source copy must be pinned, may be "large" + */ + +struct rds_info_iterator { + struct page **pages; + void *addr; + unsigned long offset; +}; + +static DEFINE_SPINLOCK(rds_info_lock); +static rds_info_func rds_info_funcs[RDS_INFO_LAST - RDS_INFO_FIRST + 1]; + +void rds_info_register_func(int optname, rds_info_func func) +{ + int offset = optname - RDS_INFO_FIRST; + + BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); + + spin_lock(&rds_info_lock); + BUG_ON(rds_info_funcs[offset]); + rds_info_funcs[offset] = func; + spin_unlock(&rds_info_lock); +} +EXPORT_SYMBOL_GPL(rds_info_register_func); + +void rds_info_deregister_func(int optname, rds_info_func func) +{ + int offset = optname - RDS_INFO_FIRST; + + BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); + + spin_lock(&rds_info_lock); + BUG_ON(rds_info_funcs[offset] != func); + rds_info_funcs[offset] = NULL; + spin_unlock(&rds_info_lock); +} +EXPORT_SYMBOL_GPL(rds_info_deregister_func); + +/* + * Typically we hold an atomic kmap across multiple rds_info_copy() calls + * because the kmap is so expensive. This must be called before using blocking + * operations while holding the mapping and as the iterator is torn down. + */ +void rds_info_iter_unmap(struct rds_info_iterator *iter) +{ + if (iter->addr) { + kunmap_atomic(iter->addr, KM_USER0); + iter->addr = NULL; + } +} + +/* + * get_user_pages() called flush_dcache_page() on the pages for us. 
+ */ +void rds_info_copy(struct rds_info_iterator *iter, void *data, + unsigned long bytes) +{ + unsigned long this; + + while (bytes) { + if (!iter->addr) + iter->addr = kmap_atomic(*iter->pages, KM_USER0); + + this = min(bytes, PAGE_SIZE - iter->offset); + + rdsdebug("page %p addr %p offset %lu this %lu data %p " + "bytes %lu\n", *iter->pages, iter->addr, + iter->offset, this, data, bytes); + + memcpy(iter->addr + iter->offset, data, this); + + data += this; + bytes -= this; + iter->offset += this; + + if (iter->offset == PAGE_SIZE) { + kunmap_atomic(iter->addr, KM_USER0); + iter->addr = NULL; + iter->offset = 0; + iter->pages++; + } + } +} +EXPORT_SYMBOL_GPL(rds_info_copy); + +/* + * @optval points to the userspace buffer that the information snapshot + * will be copied into. + * + * @optlen on input is the size of the buffer in userspace. @optlen + * on output is the size of the requested snapshot in bytes. + * + * This function returns -errno if there is a failure, particularly -ENOSPC + * if the given userspace buffer was not large enough to fit the snapshot. + * On success it returns the positive number of bytes of each array element + * in the snapshot. + */ +int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, + int __user *optlen) +{ + struct rds_info_iterator iter; + struct rds_info_lengths lens; + unsigned long nr_pages = 0; + unsigned long start; + unsigned long i; + rds_info_func func; + struct page **pages = NULL; + int ret; + int len; + int total; + + if (get_user(len, optlen)) { + ret = -EFAULT; + goto out; + } + + /* check for all kinds of wrapping and the like */ + start = (unsigned long)optval; + if (len < 0 || len + PAGE_SIZE - 1 < len || start + len < start) { + ret = -EINVAL; + goto out; + } + + /* a 0 len call is just trying to probe its length */ + if (len == 0) + goto call_func; + + nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK)) + >> PAGE_SHIFT; + + pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + goto out; + } + ret = get_user_pages_fast(start, nr_pages, 1, pages); + if (ret != nr_pages) { + if (ret > 0) + nr_pages = ret; + else + nr_pages = 0; + ret = -EAGAIN; /* XXX ? */ + goto out; + } + + rdsdebug("len %d nr_pages %lu\n", len, nr_pages); + +call_func: + func = rds_info_funcs[optname - RDS_INFO_FIRST]; + if (!func) { + ret = -ENOPROTOOPT; + goto out; + } + + iter.pages = pages; + iter.addr = NULL; + iter.offset = start & (PAGE_SIZE - 1); + + func(sock, len, &iter, &lens); + BUG_ON(lens.each == 0); + + total = lens.nr * lens.each; + + rds_info_iter_unmap(&iter); + + if (total > len) { + len = total; + ret = -ENOSPC; + } else { + len = total; + ret = lens.each; + } + + if (put_user(len, optlen)) + ret = -EFAULT; + +out: + for (i = 0; pages && i < nr_pages; i++) + put_page(pages[i]); + kfree(pages); + + return ret; +} diff --git a/net/rds/info.h b/net/rds/info.h new file mode 100644 index 00000000..b6c052ca --- /dev/null +++ b/net/rds/info.h @@ -0,0 +1,30 @@ +#ifndef _RDS_INFO_H +#define _RDS_INFO_H + +struct rds_info_lengths { + unsigned int nr; + unsigned int each; +}; + +struct rds_info_iterator; + +/* + * These functions must fill in the fields of @lens to reflect the size + * of the available info source. If the snapshot fits in @len then it + * should be copied using @iter. The caller will deduce if it was copied + * or not by comparing the lengths. 
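From userspace, rds_info_getsockopt() behaves like an ordinary getsockopt() that returns the per-record size on success, reports the required length through optlen, and fails with ENOSPC when the buffer is too small. A hedged usage sketch: SOL_RDS and the info optnames are assumed to come from the usual RDS ABI, and rds_snapshot() is hypothetical.

#include <errno.h>
#include <stdlib.h>
#include <sys/socket.h>

#ifndef SOL_RDS
#define SOL_RDS 276	/* assumed value of the RDS socket level */
#endif

static void *rds_snapshot(int fd, int optname, socklen_t *total)
{
	socklen_t want = 0;
	void *buf;

	/* a zero-length call just probes the required size (ENOSPC) */
	if (getsockopt(fd, SOL_RDS, optname, NULL, &want) < 0 &&
	    errno != ENOSPC)
		return NULL;

	buf = malloc(want);
	if (!buf)
		return NULL;

	/* a successful call returns the size of each record, not zero */
	if (getsockopt(fd, SOL_RDS, optname, buf, &want) < 0) {
		free(buf);
		return NULL;
	}

	*total = want;
	return buf;
}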
+ */ +typedef void (*rds_info_func)(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens); + +void rds_info_register_func(int optname, rds_info_func func); +void rds_info_deregister_func(int optname, rds_info_func func); +int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, + int __user *optlen); +void rds_info_copy(struct rds_info_iterator *iter, void *data, + unsigned long bytes); +void rds_info_iter_unmap(struct rds_info_iterator *iter); + + +#endif diff --git a/net/rds/iw.c b/net/rds/iw.c new file mode 100644 index 00000000..f7474844 --- /dev/null +++ b/net/rds/iw.c @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
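A hypothetical info source that follows the contract above: advertise nr/each in @lens and only stream the fixed-size records through the iterator when they fit in @len. RDS_INFO_EXAMPLE and struct example_info are made up; rds_info_copy() and the register/deregister calls are the real hooks.

struct example_info {
	__be32	addr;
	__u32	count;
};

static void example_info_func(struct socket *sock, unsigned int len,
			      struct rds_info_iterator *iter,
			      struct rds_info_lengths *lens)
{
	struct example_info info = { 0, 0 };
	unsigned int nr = 1;		/* pretend there is one record */

	lens->nr = nr;
	lens->each = sizeof(info);

	/* copy only if the whole snapshot fits in the caller's buffer */
	if (len >= nr * sizeof(info))
		rds_info_copy(iter, &info, sizeof(info));
}

/* paired in module init/exit:
 *	rds_info_register_func(RDS_INFO_EXAMPLE, example_info_func);
 *	rds_info_deregister_func(RDS_INFO_EXAMPLE, example_info_func);
 */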
+ * + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rds.h" +#include "iw.h" + +unsigned int fastreg_pool_size = RDS_FASTREG_POOL_SIZE; +unsigned int fastreg_message_size = RDS_FASTREG_SIZE + 1; /* +1 allows for unaligned MRs */ + +module_param(fastreg_pool_size, int, 0444); +MODULE_PARM_DESC(fastreg_pool_size, " Max number of fastreg MRs per device"); +module_param(fastreg_message_size, int, 0444); +MODULE_PARM_DESC(fastreg_message_size, " Max size of a RDMA transfer (fastreg MRs)"); + +struct list_head rds_iw_devices; + +/* NOTE: if also grabbing iwdev lock, grab this first */ +DEFINE_SPINLOCK(iw_nodev_conns_lock); +LIST_HEAD(iw_nodev_conns); + +static void rds_iw_add_one(struct ib_device *device) +{ + struct rds_iw_device *rds_iwdev; + struct ib_device_attr *dev_attr; + + /* Only handle iwarp devices */ + if (device->node_type != RDMA_NODE_RNIC) + return; + + dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); + if (!dev_attr) + return; + + if (ib_query_device(device, dev_attr)) { + rdsdebug("Query device failed for %s\n", device->name); + goto free_attr; + } + + rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL); + if (!rds_iwdev) + goto free_attr; + + spin_lock_init(&rds_iwdev->spinlock); + + rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY); + rds_iwdev->max_wrs = dev_attr->max_qp_wr; + rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE); + + rds_iwdev->dev = device; + rds_iwdev->pd = ib_alloc_pd(device); + if (IS_ERR(rds_iwdev->pd)) + goto free_dev; + + if (!rds_iwdev->dma_local_lkey) { + rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd, + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(rds_iwdev->mr)) + goto err_pd; + } else + rds_iwdev->mr = NULL; + + rds_iwdev->mr_pool = rds_iw_create_mr_pool(rds_iwdev); + if (IS_ERR(rds_iwdev->mr_pool)) { + rds_iwdev->mr_pool = NULL; + goto err_mr; + } + + INIT_LIST_HEAD(&rds_iwdev->cm_id_list); + INIT_LIST_HEAD(&rds_iwdev->conn_list); + list_add_tail(&rds_iwdev->list, &rds_iw_devices); + + ib_set_client_data(device, &rds_iw_client, rds_iwdev); + + goto free_attr; + +err_mr: + if (rds_iwdev->mr) + ib_dereg_mr(rds_iwdev->mr); +err_pd: + ib_dealloc_pd(rds_iwdev->pd); +free_dev: + kfree(rds_iwdev); +free_attr: + kfree(dev_attr); +} + +static void rds_iw_remove_one(struct ib_device *device) +{ + struct rds_iw_device *rds_iwdev; + struct rds_iw_cm_id *i_cm_id, *next; + + rds_iwdev = ib_get_client_data(device, &rds_iw_client); + if (!rds_iwdev) + return; + + spin_lock_irq(&rds_iwdev->spinlock); + list_for_each_entry_safe(i_cm_id, next, &rds_iwdev->cm_id_list, list) { + list_del(&i_cm_id->list); + kfree(i_cm_id); + } + spin_unlock_irq(&rds_iwdev->spinlock); + + rds_iw_destroy_conns(rds_iwdev); + + if (rds_iwdev->mr_pool) + rds_iw_destroy_mr_pool(rds_iwdev->mr_pool); + + if (rds_iwdev->mr) + ib_dereg_mr(rds_iwdev->mr); + + while (ib_dealloc_pd(rds_iwdev->pd)) { + rdsdebug("Failed to dealloc pd %p\n", rds_iwdev->pd); + msleep(1); + } + + list_del(&rds_iwdev->list); + kfree(rds_iwdev); +} + +struct ib_client rds_iw_client = { + .name = "rds_iw", + .add = rds_iw_add_one, + .remove = rds_iw_remove_one +}; + +static int rds_iw_conn_info_visitor(struct rds_connection *conn, + void *buffer) +{ + struct rds_info_rdma_connection *iinfo = buffer; + struct rds_iw_connection *ic; + + /* We will only ever look at IB transports */ + if (conn->c_trans != &rds_iw_transport) + return 0; + + iinfo->src_addr = conn->c_laddr; + iinfo->dst_addr = 
conn->c_faddr; + + memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); + memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); + if (rds_conn_state(conn) == RDS_CONN_UP) { + struct rds_iw_device *rds_iwdev; + struct rdma_dev_addr *dev_addr; + + ic = conn->c_transport_data; + dev_addr = &ic->i_cm_id->route.addr.dev_addr; + + rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); + rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); + + rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); + iinfo->max_send_wr = ic->i_send_ring.w_nr; + iinfo->max_recv_wr = ic->i_recv_ring.w_nr; + iinfo->max_send_sge = rds_iwdev->max_sge; + rds_iw_get_mr_info(rds_iwdev, iinfo); + } + return 1; +} + +static void rds_iw_ic_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_for_each_conn_info(sock, len, iter, lens, + rds_iw_conn_info_visitor, + sizeof(struct rds_info_rdma_connection)); +} + + +/* + * Early RDS/IB was built to only bind to an address if there is an IPoIB + * device with that address set. + * + * If it were me, I'd advocate for something more flexible. Sending and + * receiving should be device-agnostic. Transports would try and maintain + * connections between peers who have messages queued. Userspace would be + * allowed to influence which paths have priority. We could call userspace + * asserting this policy "routing". + */ +static int rds_iw_laddr_check(__be32 addr) +{ + int ret; + struct rdma_cm_id *cm_id; + struct sockaddr_in sin; + + /* Create a CMA ID and try to bind it. This catches both + * IB and iWARP capable NICs. + */ + cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(cm_id)) + return PTR_ERR(cm_id); + + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr; + + /* rdma_bind_addr will only succeed for IB & iWARP devices */ + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + /* due to this, we will claim to support IB devices unless we + check node_type. */ + if (ret || cm_id->device->node_type != RDMA_NODE_RNIC) + ret = -EADDRNOTAVAIL; + + rdsdebug("addr %pI4 ret %d node type %d\n", + &addr, ret, + cm_id->device ? 
cm_id->device->node_type : -1); + + rdma_destroy_id(cm_id); + + return ret; +} + +void rds_iw_exit(void) +{ + rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info); + rds_iw_destroy_nodev_conns(); + ib_unregister_client(&rds_iw_client); + rds_iw_sysctl_exit(); + rds_iw_recv_exit(); + rds_trans_unregister(&rds_iw_transport); +} + +struct rds_transport rds_iw_transport = { + .laddr_check = rds_iw_laddr_check, + .xmit_complete = rds_iw_xmit_complete, + .xmit = rds_iw_xmit, + .xmit_rdma = rds_iw_xmit_rdma, + .recv = rds_iw_recv, + .conn_alloc = rds_iw_conn_alloc, + .conn_free = rds_iw_conn_free, + .conn_connect = rds_iw_conn_connect, + .conn_shutdown = rds_iw_conn_shutdown, + .inc_copy_to_user = rds_iw_inc_copy_to_user, + .inc_free = rds_iw_inc_free, + .cm_initiate_connect = rds_iw_cm_initiate_connect, + .cm_handle_connect = rds_iw_cm_handle_connect, + .cm_connect_complete = rds_iw_cm_connect_complete, + .stats_info_copy = rds_iw_stats_info_copy, + .exit = rds_iw_exit, + .get_mr = rds_iw_get_mr, + .sync_mr = rds_iw_sync_mr, + .free_mr = rds_iw_free_mr, + .flush_mrs = rds_iw_flush_mrs, + .t_owner = THIS_MODULE, + .t_name = "iwarp", + .t_type = RDS_TRANS_IWARP, + .t_prefer_loopback = 1, +}; + +int rds_iw_init(void) +{ + int ret; + + INIT_LIST_HEAD(&rds_iw_devices); + + ret = ib_register_client(&rds_iw_client); + if (ret) + goto out; + + ret = rds_iw_sysctl_init(); + if (ret) + goto out_ibreg; + + ret = rds_iw_recv_init(); + if (ret) + goto out_sysctl; + + ret = rds_trans_register(&rds_iw_transport); + if (ret) + goto out_recv; + + rds_info_register_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info); + + goto out; + +out_recv: + rds_iw_recv_exit(); +out_sysctl: + rds_iw_sysctl_exit(); +out_ibreg: + ib_unregister_client(&rds_iw_client); +out: + return ret; +} + +MODULE_LICENSE("GPL"); + diff --git a/net/rds/iw.h b/net/rds/iw.h new file mode 100644 index 00000000..90151922 --- /dev/null +++ b/net/rds/iw.h @@ -0,0 +1,395 @@ +#ifndef _RDS_IW_H +#define _RDS_IW_H + +#include +#include +#include "rds.h" +#include "rdma_transport.h" + +#define RDS_FASTREG_SIZE 20 +#define RDS_FASTREG_POOL_SIZE 2048 + +#define RDS_IW_MAX_SGE 8 +#define RDS_IW_RECV_SGE 2 + +#define RDS_IW_DEFAULT_RECV_WR 1024 +#define RDS_IW_DEFAULT_SEND_WR 256 + +#define RDS_IW_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ + +extern struct list_head rds_iw_devices; + +/* + * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to + * try and minimize the amount of memory tied up both the device and + * socket receive queues. + */ +/* page offset of the final full frag that fits in the page */ +#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE) +struct rds_page_frag { + struct list_head f_item; + struct page *f_page; + unsigned long f_offset; + dma_addr_t f_mapped; +}; + +struct rds_iw_incoming { + struct list_head ii_frags; + struct rds_incoming ii_inc; +}; + +struct rds_iw_connect_private { + /* Add new fields at the end, and don't permute existing fields. 
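rds_iw_transport above is an ops table: the core picks a transport with laddr_check() and then drives every connection through these hooks. A deliberately simplified, hypothetical dispatch sketch (the real core loop is more involved and is not part of this file):

static int example_xmit_one(struct rds_connection *conn,
			    struct rds_message *rm)
{
	struct rds_transport *trans = conn->c_trans;

	/* e.g. trans == &rds_iw_transport for an iWARP peer */
	return trans->xmit(conn, rm, 0, 0, 0);
}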
*/ + __be32 dp_saddr; + __be32 dp_daddr; + u8 dp_protocol_major; + u8 dp_protocol_minor; + __be16 dp_protocol_minor_mask; /* bitmask */ + __be32 dp_reserved1; + __be64 dp_ack_seq; + __be32 dp_credit; /* non-zero enables flow ctl */ +}; + +struct rds_iw_scatterlist { + struct scatterlist *list; + unsigned int len; + int dma_len; + unsigned int dma_npages; + unsigned int bytes; +}; + +struct rds_iw_mapping { + spinlock_t m_lock; /* protect the mapping struct */ + struct list_head m_list; + struct rds_iw_mr *m_mr; + uint32_t m_rkey; + struct rds_iw_scatterlist m_sg; +}; + +struct rds_iw_send_work { + struct rds_message *s_rm; + + /* We should really put these into a union: */ + struct rm_rdma_op *s_op; + struct rds_iw_mapping *s_mapping; + struct ib_mr *s_mr; + struct ib_fast_reg_page_list *s_page_list; + unsigned char s_remap_count; + + struct ib_send_wr s_wr; + struct ib_sge s_sge[RDS_IW_MAX_SGE]; + unsigned long s_queued; +}; + +struct rds_iw_recv_work { + struct rds_iw_incoming *r_iwinc; + struct rds_page_frag *r_frag; + struct ib_recv_wr r_wr; + struct ib_sge r_sge[2]; +}; + +struct rds_iw_work_ring { + u32 w_nr; + u32 w_alloc_ptr; + u32 w_alloc_ctr; + u32 w_free_ptr; + atomic_t w_free_ctr; +}; + +struct rds_iw_device; + +struct rds_iw_connection { + + struct list_head iw_node; + struct rds_iw_device *rds_iwdev; + struct rds_connection *conn; + + /* alphabet soup, IBTA style */ + struct rdma_cm_id *i_cm_id; + struct ib_pd *i_pd; + struct ib_mr *i_mr; + struct ib_cq *i_send_cq; + struct ib_cq *i_recv_cq; + + /* tx */ + struct rds_iw_work_ring i_send_ring; + struct rds_message *i_rm; + struct rds_header *i_send_hdrs; + u64 i_send_hdrs_dma; + struct rds_iw_send_work *i_sends; + + /* rx */ + struct tasklet_struct i_recv_tasklet; + struct mutex i_recv_mutex; + struct rds_iw_work_ring i_recv_ring; + struct rds_iw_incoming *i_iwinc; + u32 i_recv_data_rem; + struct rds_header *i_recv_hdrs; + u64 i_recv_hdrs_dma; + struct rds_iw_recv_work *i_recvs; + struct rds_page_frag i_frag; + u64 i_ack_recv; /* last ACK received */ + + /* sending acks */ + unsigned long i_ack_flags; +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_t i_ack_next; /* next ACK to send */ +#else + spinlock_t i_ack_lock; /* protect i_ack_next */ + u64 i_ack_next; /* next ACK to send */ +#endif + struct rds_header *i_ack; + struct ib_send_wr i_ack_wr; + struct ib_sge i_ack_sge; + u64 i_ack_dma; + unsigned long i_ack_queued; + + /* Flow control related information + * + * Our algorithm uses a pair variables that we need to access + * atomically - one for the send credits, and one posted + * recv credits we need to transfer to remote. 
+ * Rather than protect them using a slow spinlock, we put both into + * a single atomic_t and update it using cmpxchg + */ + atomic_t i_credits; + + /* Protocol version specific information */ + unsigned int i_flowctl:1; /* enable/disable flow ctl */ + unsigned int i_dma_local_lkey:1; + unsigned int i_fastreg_posted:1; /* fastreg posted on this connection */ + /* Batched completions */ + unsigned int i_unsignaled_wrs; + long i_unsignaled_bytes; +}; + +/* This assumes that atomic_t is at least 32 bits */ +#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff) +#define IB_GET_POST_CREDITS(v) ((v) >> 16) +#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff) +#define IB_SET_POST_CREDITS(v) ((v) << 16) + +struct rds_iw_cm_id { + struct list_head list; + struct rdma_cm_id *cm_id; +}; + +struct rds_iw_device { + struct list_head list; + struct list_head cm_id_list; + struct list_head conn_list; + struct ib_device *dev; + struct ib_pd *pd; + struct ib_mr *mr; + struct rds_iw_mr_pool *mr_pool; + int max_sge; + unsigned int max_wrs; + unsigned int dma_local_lkey:1; + spinlock_t spinlock; /* protect the above */ +}; + +/* bits for i_ack_flags */ +#define IB_ACK_IN_FLIGHT 0 +#define IB_ACK_REQUESTED 1 + +/* Magic WR_ID for ACKs */ +#define RDS_IW_ACK_WR_ID ((u64)0xffffffffffffffffULL) +#define RDS_IW_FAST_REG_WR_ID ((u64)0xefefefefefefefefULL) +#define RDS_IW_LOCAL_INV_WR_ID ((u64)0xdfdfdfdfdfdfdfdfULL) + +struct rds_iw_statistics { + uint64_t s_iw_connect_raced; + uint64_t s_iw_listen_closed_stale; + uint64_t s_iw_tx_cq_call; + uint64_t s_iw_tx_cq_event; + uint64_t s_iw_tx_ring_full; + uint64_t s_iw_tx_throttle; + uint64_t s_iw_tx_sg_mapping_failure; + uint64_t s_iw_tx_stalled; + uint64_t s_iw_tx_credit_updates; + uint64_t s_iw_rx_cq_call; + uint64_t s_iw_rx_cq_event; + uint64_t s_iw_rx_ring_empty; + uint64_t s_iw_rx_refill_from_cq; + uint64_t s_iw_rx_refill_from_thread; + uint64_t s_iw_rx_alloc_limit; + uint64_t s_iw_rx_credit_updates; + uint64_t s_iw_ack_sent; + uint64_t s_iw_ack_send_failure; + uint64_t s_iw_ack_send_delayed; + uint64_t s_iw_ack_send_piggybacked; + uint64_t s_iw_ack_received; + uint64_t s_iw_rdma_mr_alloc; + uint64_t s_iw_rdma_mr_free; + uint64_t s_iw_rdma_mr_used; + uint64_t s_iw_rdma_mr_pool_flush; + uint64_t s_iw_rdma_mr_pool_wait; + uint64_t s_iw_rdma_mr_pool_depleted; +}; + +extern struct workqueue_struct *rds_iw_wq; + +/* + * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h + * doesn't define it. + */ +static inline void rds_iw_dma_sync_sg_for_cpu(struct ib_device *dev, + struct scatterlist *sg, unsigned int sg_dma_len, int direction) +{ + unsigned int i; + + for (i = 0; i < sg_dma_len; ++i) { + ib_dma_sync_single_for_cpu(dev, + ib_sg_dma_address(dev, &sg[i]), + ib_sg_dma_len(dev, &sg[i]), + direction); + } +} +#define ib_dma_sync_sg_for_cpu rds_iw_dma_sync_sg_for_cpu + +static inline void rds_iw_dma_sync_sg_for_device(struct ib_device *dev, + struct scatterlist *sg, unsigned int sg_dma_len, int direction) +{ + unsigned int i; + + for (i = 0; i < sg_dma_len; ++i) { + ib_dma_sync_single_for_device(dev, + ib_sg_dma_address(dev, &sg[i]), + ib_sg_dma_len(dev, &sg[i]), + direction); + } +} +#define ib_dma_sync_sg_for_device rds_iw_dma_sync_sg_for_device + +static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic) +{ + return ic->i_dma_local_lkey ? 
ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey; +} + +/* ib.c */ +extern struct rds_transport rds_iw_transport; +extern struct ib_client rds_iw_client; + +extern unsigned int fastreg_pool_size; +extern unsigned int fastreg_message_size; + +extern spinlock_t iw_nodev_conns_lock; +extern struct list_head iw_nodev_conns; + +/* ib_cm.c */ +int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp); +void rds_iw_conn_free(void *arg); +int rds_iw_conn_connect(struct rds_connection *conn); +void rds_iw_conn_shutdown(struct rds_connection *conn); +void rds_iw_state_change(struct sock *sk); +int rds_iw_listen_init(void); +void rds_iw_listen_stop(void); +void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...); +int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); +int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id); +void rds_iw_cm_connect_complete(struct rds_connection *conn, + struct rdma_cm_event *event); + + +#define rds_iw_conn_error(conn, fmt...) \ + __rds_iw_conn_error(conn, KERN_WARNING "RDS/IW: " fmt) + +/* ib_rdma.c */ +int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); +void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn); +void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn); +void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock); +static inline void rds_iw_destroy_nodev_conns(void) +{ + __rds_iw_destroy_conns(&iw_nodev_conns, &iw_nodev_conns_lock); +} +static inline void rds_iw_destroy_conns(struct rds_iw_device *rds_iwdev) +{ + __rds_iw_destroy_conns(&rds_iwdev->conn_list, &rds_iwdev->spinlock); +} +struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *); +void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo); +void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *); +void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, + struct rds_sock *rs, u32 *key_ret); +void rds_iw_sync_mr(void *trans_private, int dir); +void rds_iw_free_mr(void *trans_private, int invalidate); +void rds_iw_flush_mrs(void); + +/* ib_recv.c */ +int rds_iw_recv_init(void); +void rds_iw_recv_exit(void); +int rds_iw_recv(struct rds_connection *conn); +int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, + gfp_t page_gfp, int prefill); +void rds_iw_inc_free(struct rds_incoming *inc); +int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, + size_t size); +void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_iw_recv_tasklet_fn(unsigned long data); +void rds_iw_recv_init_ring(struct rds_iw_connection *ic); +void rds_iw_recv_clear_ring(struct rds_iw_connection *ic); +void rds_iw_recv_init_ack(struct rds_iw_connection *ic); +void rds_iw_attempt_ack(struct rds_iw_connection *ic); +void rds_iw_ack_send_complete(struct rds_iw_connection *ic); +u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic); + +/* ib_ring.c */ +void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr); +void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr); +u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos); +void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val); +void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val); +int rds_iw_ring_empty(struct rds_iw_work_ring *ring); +int rds_iw_ring_low(struct rds_iw_work_ring *ring); +u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring); +u32 
rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest); +extern wait_queue_head_t rds_iw_ring_empty_wait; + +/* ib_send.c */ +void rds_iw_xmit_complete(struct rds_connection *conn); +int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off); +void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_iw_send_init_ring(struct rds_iw_connection *ic); +void rds_iw_send_clear_ring(struct rds_iw_connection *ic); +int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); +void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits); +void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted); +int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted, + u32 *adv_credits, int need_posted, int max_posted); + +/* ib_stats.c */ +DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats); +#define rds_iw_stats_inc(member) rds_stats_inc_which(rds_iw_stats, member) +unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail); + +/* ib_sysctl.c */ +int rds_iw_sysctl_init(void); +void rds_iw_sysctl_exit(void); +extern unsigned long rds_iw_sysctl_max_send_wr; +extern unsigned long rds_iw_sysctl_max_recv_wr; +extern unsigned long rds_iw_sysctl_max_unsig_wrs; +extern unsigned long rds_iw_sysctl_max_unsig_bytes; +extern unsigned long rds_iw_sysctl_max_recv_allocation; +extern unsigned int rds_iw_sysctl_flow_control; + +/* + * Helper functions for getting/setting the header and data SGEs in + * RDS packets (not RDMA) + */ +static inline struct ib_sge * +rds_iw_header_sge(struct rds_iw_connection *ic, struct ib_sge *sge) +{ + return &sge[0]; +} + +static inline struct ib_sge * +rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge) +{ + return &sge[1]; +} + +#endif diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c new file mode 100644 index 00000000..c12db66f --- /dev/null +++ b/net/rds/iw_cm.c @@ -0,0 +1,766 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
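rds_iw_header_sge()/rds_iw_data_sge() above encode the convention that sge[0] always carries the RDS header and sge[1] the payload fragment. A hedged sketch of filling a receive work request that way; the DMA addresses and lkey are assumed to come from the connection's existing mappings, and example_fill_recv_wr() is hypothetical.

static void example_fill_recv_wr(struct rds_iw_connection *ic,
				 struct rds_iw_recv_work *recv,
				 u64 hdr_dma, u64 frag_dma, u32 lkey)
{
	struct ib_sge *hdr = rds_iw_header_sge(ic, recv->r_sge);
	struct ib_sge *data = rds_iw_data_sge(ic, recv->r_sge);

	hdr->addr    = hdr_dma;
	hdr->length  = sizeof(struct rds_header);
	hdr->lkey    = lkey;

	data->addr   = frag_dma;
	data->length = RDS_FRAG_SIZE;
	data->lkey   = lkey;

	recv->r_wr.next    = NULL;
	recv->r_wr.wr_id   = 0;	/* the real code sets a meaningful wr_id */
	recv->r_wr.sg_list = recv->r_sge;
	recv->r_wr.num_sge = RDS_IW_RECV_SGE;
}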
+ * + */ +#include +#include +#include +#include + +#include "rds.h" +#include "iw.h" + +/* + * Set the selected protocol version + */ +static void rds_iw_set_protocol(struct rds_connection *conn, unsigned int version) +{ + conn->c_version = version; +} + +/* + * Set up flow control + */ +static void rds_iw_set_flow_control(struct rds_connection *conn, u32 credits) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + if (rds_iw_sysctl_flow_control && credits != 0) { + /* We're doing flow control */ + ic->i_flowctl = 1; + rds_iw_send_add_credits(conn, credits); + } else { + ic->i_flowctl = 0; + } +} + +/* + * Connection established. + * We get here for both outgoing and incoming connection. + */ +void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event) +{ + const struct rds_iw_connect_private *dp = NULL; + struct rds_iw_connection *ic = conn->c_transport_data; + struct rds_iw_device *rds_iwdev; + int err; + + if (event->param.conn.private_data_len) { + dp = event->param.conn.private_data; + + rds_iw_set_protocol(conn, + RDS_PROTOCOL(dp->dp_protocol_major, + dp->dp_protocol_minor)); + rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + } + + /* update ib_device with this local ipaddr & conn */ + rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); + err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id); + if (err) + printk(KERN_ERR "rds_iw_update_ipaddr failed (%d)\n", err); + rds_iw_add_conn(rds_iwdev, conn); + + /* If the peer gave us the last packet it saw, process this as if + * we had received a regular ACK. */ + if (dp && dp->dp_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + + printk(KERN_NOTICE "RDS/IW: connected to %pI4<->%pI4 version %u.%u%s\n", + &conn->c_laddr, &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version), + ic->i_flowctl ? ", flow control" : ""); + + rds_connect_complete(conn); +} + +static void rds_iw_cm_fill_conn_param(struct rds_connection *conn, + struct rdma_conn_param *conn_param, + struct rds_iw_connect_private *dp, + u32 protocol_version) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + memset(conn_param, 0, sizeof(struct rdma_conn_param)); + /* XXX tune these? 
*/ + conn_param->responder_resources = 1; + conn_param->initiator_depth = 1; + + if (dp) { + memset(dp, 0, sizeof(*dp)); + dp->dp_saddr = conn->c_laddr; + dp->dp_daddr = conn->c_faddr; + dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); + dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); + dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IW_SUPPORTED_PROTOCOLS); + dp->dp_ack_seq = rds_iw_piggyb_ack(ic); + + /* Advertise flow control */ + if (ic->i_flowctl) { + unsigned int credits; + + credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); + dp->dp_credit = cpu_to_be32(credits); + atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); + } + + conn_param->private_data = dp; + conn_param->private_data_len = sizeof(*dp); + } +} + +static void rds_iw_cq_event_handler(struct ib_event *event, void *data) +{ + rdsdebug("event %u data %p\n", event->event, data); +} + +static void rds_iw_qp_event_handler(struct ib_event *event, void *data) +{ + struct rds_connection *conn = data; + struct rds_iw_connection *ic = conn->c_transport_data; + + rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event); + + switch (event->event) { + case IB_EVENT_COMM_EST: + rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); + break; + case IB_EVENT_QP_REQ_ERR: + case IB_EVENT_QP_FATAL: + default: + rdsdebug("Fatal QP Event %u " + "- connection %pI4->%pI4, reconnecting\n", + event->event, &conn->c_laddr, + &conn->c_faddr); + rds_conn_drop(conn); + break; + } +} + +/* + * Create a QP + */ +static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr, + struct rds_iw_device *rds_iwdev, + struct rds_iw_work_ring *send_ring, + void (*send_cq_handler)(struct ib_cq *, void *), + struct rds_iw_work_ring *recv_ring, + void (*recv_cq_handler)(struct ib_cq *, void *), + void *context) +{ + struct ib_device *dev = rds_iwdev->dev; + unsigned int send_size, recv_size; + int ret; + + /* The offset of 1 is to accommodate the additional ACK WR. 
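The dp_credit advertisement above works together with the IB_GET/SET_*_CREDITS macros from iw.h: send credits live in the low 16 bits and newly posted receive credits in the high 16 bits of a single atomic_t, so both halves can be granted or consumed in one atomic update. A small, hypothetical illustration of the packing:

static void example_credit_packing(void)
{
	atomic_t credits = ATOMIC_INIT(0);

	/* grant 7 send credits and record 3 newly posted recv buffers */
	atomic_add(IB_SET_SEND_CREDITS(7) | IB_SET_POST_CREDITS(3), &credits);

	/* both counters come back from a single atomic load */
	pr_info("send=%u posted=%u\n",
		IB_GET_SEND_CREDITS(atomic_read(&credits)),
		IB_GET_POST_CREDITS(atomic_read(&credits)));
}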
*/ + send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1); + recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1); + rds_iw_ring_resize(send_ring, send_size - 1); + rds_iw_ring_resize(recv_ring, recv_size - 1); + + memset(attr, 0, sizeof(*attr)); + attr->event_handler = rds_iw_qp_event_handler; + attr->qp_context = context; + attr->cap.max_send_wr = send_size; + attr->cap.max_recv_wr = recv_size; + attr->cap.max_send_sge = rds_iwdev->max_sge; + attr->cap.max_recv_sge = RDS_IW_RECV_SGE; + attr->sq_sig_type = IB_SIGNAL_REQ_WR; + attr->qp_type = IB_QPT_RC; + + attr->send_cq = ib_create_cq(dev, send_cq_handler, + rds_iw_cq_event_handler, + context, send_size, 0); + if (IS_ERR(attr->send_cq)) { + ret = PTR_ERR(attr->send_cq); + attr->send_cq = NULL; + rdsdebug("ib_create_cq send failed: %d\n", ret); + goto out; + } + + attr->recv_cq = ib_create_cq(dev, recv_cq_handler, + rds_iw_cq_event_handler, + context, recv_size, 0); + if (IS_ERR(attr->recv_cq)) { + ret = PTR_ERR(attr->recv_cq); + attr->recv_cq = NULL; + rdsdebug("ib_create_cq send failed: %d\n", ret); + goto out; + } + + ret = ib_req_notify_cq(attr->send_cq, IB_CQ_NEXT_COMP); + if (ret) { + rdsdebug("ib_req_notify_cq send failed: %d\n", ret); + goto out; + } + + ret = ib_req_notify_cq(attr->recv_cq, IB_CQ_SOLICITED); + if (ret) { + rdsdebug("ib_req_notify_cq recv failed: %d\n", ret); + goto out; + } + +out: + if (ret) { + if (attr->send_cq) + ib_destroy_cq(attr->send_cq); + if (attr->recv_cq) + ib_destroy_cq(attr->recv_cq); + } + return ret; +} + +/* + * This needs to be very careful to not leave IS_ERR pointers around for + * cleanup to trip over. + */ +static int rds_iw_setup_qp(struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct ib_device *dev = ic->i_cm_id->device; + struct ib_qp_init_attr attr; + struct rds_iw_device *rds_iwdev; + int ret; + + /* rds_iw_add_one creates a rds_iw_device object per IB device, + * and allocates a protection domain, memory range and MR pool + * for each. If that fails for any reason, it will not register + * the rds_iwdev at all. + */ + rds_iwdev = ib_get_client_data(dev, &rds_iw_client); + if (!rds_iwdev) { + if (printk_ratelimit()) + printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n", + dev->name); + return -EOPNOTSUPP; + } + + /* Protection domain and memory range */ + ic->i_pd = rds_iwdev->pd; + ic->i_mr = rds_iwdev->mr; + + ret = rds_iw_init_qp_attrs(&attr, rds_iwdev, + &ic->i_send_ring, rds_iw_send_cq_comp_handler, + &ic->i_recv_ring, rds_iw_recv_cq_comp_handler, + conn); + if (ret < 0) + goto out; + + ic->i_send_cq = attr.send_cq; + ic->i_recv_cq = attr.recv_cq; + + /* + * XXX this can fail if max_*_wr is too large? Are we supposed + * to back off until we get a value that the hardware can support? 
+ */ + ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); + if (ret) { + rdsdebug("rdma_create_qp failed: %d\n", ret); + goto out; + } + + ic->i_send_hdrs = ib_dma_alloc_coherent(dev, + ic->i_send_ring.w_nr * + sizeof(struct rds_header), + &ic->i_send_hdrs_dma, GFP_KERNEL); + if (!ic->i_send_hdrs) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent send failed\n"); + goto out; + } + + ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, + ic->i_recv_ring.w_nr * + sizeof(struct rds_header), + &ic->i_recv_hdrs_dma, GFP_KERNEL); + if (!ic->i_recv_hdrs) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent recv failed\n"); + goto out; + } + + ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), + &ic->i_ack_dma, GFP_KERNEL); + if (!ic->i_ack) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent ack failed\n"); + goto out; + } + + ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work)); + if (!ic->i_sends) { + ret = -ENOMEM; + rdsdebug("send allocation failed\n"); + goto out; + } + rds_iw_send_init_ring(ic); + + ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work)); + if (!ic->i_recvs) { + ret = -ENOMEM; + rdsdebug("recv allocation failed\n"); + goto out; + } + + rds_iw_recv_init_ring(ic); + rds_iw_recv_init_ack(ic); + + /* Post receive buffers - as a side effect, this will update + * the posted credit count. */ + rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); + + rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr, + ic->i_send_cq, ic->i_recv_cq); + +out: + return ret; +} + +static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp) +{ + u16 common; + u32 version = 0; + + /* rdma_cm private data is odd - when there is any private data in the + * request, we will be given a pretty large buffer without telling us the + * original size. The only way to tell the difference is by looking at + * the contents, which are initialized to zero. + * If the protocol version fields aren't set, this is a connection attempt + * from an older version. This could could be 3.0 or 2.0 - we can't tell. + * We really should have changed this for OFED 1.3 :-( */ + if (dp->dp_protocol_major == 0) + return RDS_PROTOCOL_3_0; + + common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IW_SUPPORTED_PROTOCOLS; + if (dp->dp_protocol_major == 3 && common) { + version = RDS_PROTOCOL_3_0; + while ((common >>= 1) != 0) + version++; + } else if (printk_ratelimit()) { + printk(KERN_NOTICE "RDS: Connection from %pI4 using " + "incompatible protocol version %u.%u\n", + &dp->dp_saddr, + dp->dp_protocol_major, + dp->dp_protocol_minor); + } + return version; +} + +int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + const struct rds_iw_connect_private *dp = event->param.conn.private_data; + struct rds_iw_connect_private dp_rep; + struct rds_connection *conn = NULL; + struct rds_iw_connection *ic = NULL; + struct rdma_conn_param conn_param; + struct rds_iw_device *rds_iwdev; + u32 version; + int err, destroy = 1; + + /* Check whether the remote protocol version matches ours. 
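rds_iw_protocol_compatible() above picks the highest minor version both sides support by ANDing the peer's advertised bitmask with RDS_IW_SUPPORTED_PROTOCOLS and counting how far the result can be shifted. A standalone illustration of that bit trick (example_highest_common_minor() is hypothetical):

static u32 example_highest_common_minor(u16 ours, u16 theirs)
{
	u16 common = ours & theirs;	/* bit i set => minor version i */
	u32 minor = 0;

	while ((common >>= 1) != 0)
		minor++;

	/* e.g. ours = 0x0003, theirs = 0x0002 -> common = 0x0002 -> minor 1 */
	return minor;
}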
*/ + version = rds_iw_protocol_compatible(dp); + if (!version) + goto out; + + rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u\n", + &dp->dp_saddr, &dp->dp_daddr, + RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version)); + + conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport, + GFP_KERNEL); + if (IS_ERR(conn)) { + rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); + conn = NULL; + goto out; + } + + /* + * The connection request may occur while the + * previous connection exist, e.g. in case of failover. + * But as connections may be initiated simultaneously + * by both hosts, we have a random backoff mechanism - + * see the comment above rds_queue_reconnect() + */ + mutex_lock(&conn->c_cm_lock); + if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { + if (rds_conn_state(conn) == RDS_CONN_UP) { + rdsdebug("incoming connect while connecting\n"); + rds_conn_drop(conn); + rds_iw_stats_inc(s_iw_listen_closed_stale); + } else + if (rds_conn_state(conn) == RDS_CONN_CONNECTING) { + /* Wait and see - our connect may still be succeeding */ + rds_iw_stats_inc(s_iw_connect_raced); + } + mutex_unlock(&conn->c_cm_lock); + goto out; + } + + ic = conn->c_transport_data; + + rds_iw_set_protocol(conn, version); + rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + + /* If the peer gave us the last packet it saw, process this as if + * we had received a regular ACK. */ + if (dp->dp_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + + BUG_ON(cm_id->context); + BUG_ON(ic->i_cm_id); + + ic->i_cm_id = cm_id; + cm_id->context = conn; + + rds_iwdev = ib_get_client_data(cm_id->device, &rds_iw_client); + ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey; + + /* We got halfway through setting up the ib_connection, if we + * fail now, we have to take the long route out of this mess. */ + destroy = 0; + + err = rds_iw_setup_qp(conn); + if (err) { + rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err); + mutex_unlock(&conn->c_cm_lock); + goto out; + } + + rds_iw_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); + + /* rdma_accept() calls rdma_reject() internally if it fails */ + err = rdma_accept(cm_id, &conn_param); + mutex_unlock(&conn->c_cm_lock); + if (err) { + rds_iw_conn_error(conn, "rdma_accept failed (%d)\n", err); + goto out; + } + + return 0; + +out: + rdma_reject(cm_id, NULL, 0); + return destroy; +} + + +int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id) +{ + struct rds_connection *conn = cm_id->context; + struct rds_iw_connection *ic = conn->c_transport_data; + struct rdma_conn_param conn_param; + struct rds_iw_connect_private dp; + int ret; + + /* If the peer doesn't do protocol negotiation, we must + * default to RDSv3.0 */ + rds_iw_set_protocol(conn, RDS_PROTOCOL_3_0); + ic->i_flowctl = rds_iw_sysctl_flow_control; /* advertise flow control */ + + ret = rds_iw_setup_qp(conn); + if (ret) { + rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", ret); + goto out; + } + + rds_iw_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION); + + ret = rdma_connect(cm_id, &conn_param); + if (ret) + rds_iw_conn_error(conn, "rdma_connect failed (%d)\n", ret); + +out: + /* Beware - returning non-zero tells the rdma_cm to destroy + * the cm_id. We should certainly not do it as long as we still + * "own" the cm_id. 
*/ + if (ret) { + struct rds_iw_connection *ic = conn->c_transport_data; + + if (ic->i_cm_id == cm_id) + ret = 0; + } + return ret; +} + +int rds_iw_conn_connect(struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct rds_iw_device *rds_iwdev; + struct sockaddr_in src, dest; + int ret; + + /* XXX I wonder what affect the port space has */ + /* delegate cm event handler to rdma_transport */ + ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(ic->i_cm_id)) { + ret = PTR_ERR(ic->i_cm_id); + ic->i_cm_id = NULL; + rdsdebug("rdma_create_id() failed: %d\n", ret); + goto out; + } + + rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn); + + src.sin_family = AF_INET; + src.sin_addr.s_addr = (__force u32)conn->c_laddr; + src.sin_port = (__force u16)htons(0); + + /* First, bind to the local address and device. */ + ret = rdma_bind_addr(ic->i_cm_id, (struct sockaddr *) &src); + if (ret) { + rdsdebug("rdma_bind_addr(%pI4) failed: %d\n", + &conn->c_laddr, ret); + rdma_destroy_id(ic->i_cm_id); + ic->i_cm_id = NULL; + goto out; + } + + rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); + ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey; + + dest.sin_family = AF_INET; + dest.sin_addr.s_addr = (__force u32)conn->c_faddr; + dest.sin_port = (__force u16)htons(RDS_PORT); + + ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, + (struct sockaddr *)&dest, + RDS_RDMA_RESOLVE_TIMEOUT_MS); + if (ret) { + rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id, + ret); + rdma_destroy_id(ic->i_cm_id); + ic->i_cm_id = NULL; + } + +out: + return ret; +} + +/* + * This is so careful about only cleaning up resources that were built up + * so that it can be called at any point during startup. In fact it + * can be called multiple times for a given connection. + */ +void rds_iw_conn_shutdown(struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + int err = 0; + struct ib_qp_attr qp_attr; + + rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id, + ic->i_pd, ic->i_send_cq, ic->i_recv_cq, + ic->i_cm_id ? ic->i_cm_id->qp : NULL); + + if (ic->i_cm_id) { + struct ib_device *dev = ic->i_cm_id->device; + + rdsdebug("disconnecting cm %p\n", ic->i_cm_id); + err = rdma_disconnect(ic->i_cm_id); + if (err) { + /* Actually this may happen quite frequently, when + * an outgoing connect raced with an incoming connect. 
+ */ + rdsdebug("rds_iw_conn_shutdown: failed to disconnect," + " cm: %p err %d\n", ic->i_cm_id, err); + } + + if (ic->i_cm_id->qp) { + qp_attr.qp_state = IB_QPS_ERR; + ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE); + } + + wait_event(rds_iw_ring_empty_wait, + rds_iw_ring_empty(&ic->i_send_ring) && + rds_iw_ring_empty(&ic->i_recv_ring)); + + if (ic->i_send_hdrs) + ib_dma_free_coherent(dev, + ic->i_send_ring.w_nr * + sizeof(struct rds_header), + ic->i_send_hdrs, + ic->i_send_hdrs_dma); + + if (ic->i_recv_hdrs) + ib_dma_free_coherent(dev, + ic->i_recv_ring.w_nr * + sizeof(struct rds_header), + ic->i_recv_hdrs, + ic->i_recv_hdrs_dma); + + if (ic->i_ack) + ib_dma_free_coherent(dev, sizeof(struct rds_header), + ic->i_ack, ic->i_ack_dma); + + if (ic->i_sends) + rds_iw_send_clear_ring(ic); + if (ic->i_recvs) + rds_iw_recv_clear_ring(ic); + + if (ic->i_cm_id->qp) + rdma_destroy_qp(ic->i_cm_id); + if (ic->i_send_cq) + ib_destroy_cq(ic->i_send_cq); + if (ic->i_recv_cq) + ib_destroy_cq(ic->i_recv_cq); + + /* + * If associated with an rds_iw_device: + * Move connection back to the nodev list. + * Remove cm_id from the device cm_id list. + */ + if (ic->rds_iwdev) + rds_iw_remove_conn(ic->rds_iwdev, conn); + + rdma_destroy_id(ic->i_cm_id); + + ic->i_cm_id = NULL; + ic->i_pd = NULL; + ic->i_mr = NULL; + ic->i_send_cq = NULL; + ic->i_recv_cq = NULL; + ic->i_send_hdrs = NULL; + ic->i_recv_hdrs = NULL; + ic->i_ack = NULL; + } + BUG_ON(ic->rds_iwdev); + + /* Clear pending transmit */ + if (ic->i_rm) { + rds_message_put(ic->i_rm); + ic->i_rm = NULL; + } + + /* Clear the ACK state */ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_set(&ic->i_ack_next, 0); +#else + ic->i_ack_next = 0; +#endif + ic->i_ack_recv = 0; + + /* Clear flow control state */ + ic->i_flowctl = 0; + atomic_set(&ic->i_credits, 0); + + rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr); + rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr); + + if (ic->i_iwinc) { + rds_inc_put(&ic->i_iwinc->ii_inc); + ic->i_iwinc = NULL; + } + + vfree(ic->i_sends); + ic->i_sends = NULL; + vfree(ic->i_recvs); + ic->i_recvs = NULL; + rdsdebug("shutdown complete\n"); +} + +int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp) +{ + struct rds_iw_connection *ic; + unsigned long flags; + + /* XXX too lazy? */ + ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL); + if (!ic) + return -ENOMEM; + + INIT_LIST_HEAD(&ic->iw_node); + tasklet_init(&ic->i_recv_tasklet, rds_iw_recv_tasklet_fn, + (unsigned long) ic); + mutex_init(&ic->i_recv_mutex); +#ifndef KERNEL_HAS_ATOMIC64 + spin_lock_init(&ic->i_ack_lock); +#endif + + /* + * rds_iw_conn_shutdown() waits for these to be emptied so they + * must be initialized before it can be called. + */ + rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr); + rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr); + + ic->conn = conn; + conn->c_transport_data = ic; + + spin_lock_irqsave(&iw_nodev_conns_lock, flags); + list_add_tail(&ic->iw_node, &iw_nodev_conns); + spin_unlock_irqrestore(&iw_nodev_conns_lock, flags); + + + rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data); + return 0; +} + +/* + * Free a connection. Connection must be shut down and not set for reconnect. + */ +void rds_iw_conn_free(void *arg) +{ + struct rds_iw_connection *ic = arg; + spinlock_t *lock_ptr; + + rdsdebug("ic %p\n", ic); + + /* + * Conn is either on a dev's list or on the nodev list. 
+ * A race with shutdown() or connect() would cause problems + * (since rds_iwdev would change) but that should never happen. + */ + lock_ptr = ic->rds_iwdev ? &ic->rds_iwdev->spinlock : &iw_nodev_conns_lock; + + spin_lock_irq(lock_ptr); + list_del(&ic->iw_node); + spin_unlock_irq(lock_ptr); + + kfree(ic); +} + +/* + * An error occurred on the connection + */ +void +__rds_iw_conn_error(struct rds_connection *conn, const char *fmt, ...) +{ + va_list ap; + + rds_conn_drop(conn); + + va_start(ap, fmt); + vprintk(fmt, ap); + va_end(ap); +} diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c new file mode 100644 index 00000000..6deaa774 --- /dev/null +++ b/net/rds/iw_rdma.c @@ -0,0 +1,878 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include + +#include "rds.h" +#include "iw.h" + + +/* + * This is stored as mr->r_trans_private. 
+ */ +struct rds_iw_mr { + struct rds_iw_device *device; + struct rds_iw_mr_pool *pool; + struct rdma_cm_id *cm_id; + + struct ib_mr *mr; + struct ib_fast_reg_page_list *page_list; + + struct rds_iw_mapping mapping; + unsigned char remap_count; +}; + +/* + * Our own little MR pool + */ +struct rds_iw_mr_pool { + struct rds_iw_device *device; /* back ptr to the device that owns us */ + + struct mutex flush_lock; /* serialize fmr invalidate */ + struct work_struct flush_worker; /* flush worker */ + + spinlock_t list_lock; /* protect variables below */ + atomic_t item_count; /* total # of MRs */ + atomic_t dirty_count; /* # dirty of MRs */ + struct list_head dirty_list; /* dirty mappings */ + struct list_head clean_list; /* unused & unamapped MRs */ + atomic_t free_pinned; /* memory pinned by free MRs */ + unsigned long max_message_size; /* in pages */ + unsigned long max_items; + unsigned long max_items_soft; + unsigned long max_free_pinned; + int max_pages; +}; + +static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all); +static void rds_iw_mr_pool_flush_worker(struct work_struct *work); +static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); +static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool, + struct rds_iw_mr *ibmr, + struct scatterlist *sg, unsigned int nents); +static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); +static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, + struct list_head *unmap_list, + struct list_head *kill_list); +static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); + +static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id) +{ + struct rds_iw_device *iwdev; + struct rds_iw_cm_id *i_cm_id; + + *rds_iwdev = NULL; + *cm_id = NULL; + + list_for_each_entry(iwdev, &rds_iw_devices, list) { + spin_lock_irq(&iwdev->spinlock); + list_for_each_entry(i_cm_id, &iwdev->cm_id_list, list) { + struct sockaddr_in *src_addr, *dst_addr; + + src_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.src_addr; + dst_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.dst_addr; + + rdsdebug("local ipaddr = %x port %d, " + "remote ipaddr = %x port %d" + "..looking for %x port %d, " + "remote ipaddr = %x port %d\n", + src_addr->sin_addr.s_addr, + src_addr->sin_port, + dst_addr->sin_addr.s_addr, + dst_addr->sin_port, + rs->rs_bound_addr, + rs->rs_bound_port, + rs->rs_conn_addr, + rs->rs_conn_port); +#ifdef WORKING_TUPLE_DETECTION + if (src_addr->sin_addr.s_addr == rs->rs_bound_addr && + src_addr->sin_port == rs->rs_bound_port && + dst_addr->sin_addr.s_addr == rs->rs_conn_addr && + dst_addr->sin_port == rs->rs_conn_port) { +#else + /* FIXME - needs to compare the local and remote + * ipaddr/port tuple, but the ipaddr is the only + * available information in the rds_sock (as the rest are + * zero'ed. It doesn't appear to be properly populated + * during connection setup... 
+ */ + if (src_addr->sin_addr.s_addr == rs->rs_bound_addr) { +#endif + spin_unlock_irq(&iwdev->spinlock); + *rds_iwdev = iwdev; + *cm_id = i_cm_id->cm_id; + return 0; + } + } + spin_unlock_irq(&iwdev->spinlock); + } + + return 1; +} + +static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id) +{ + struct rds_iw_cm_id *i_cm_id; + + i_cm_id = kmalloc(sizeof *i_cm_id, GFP_KERNEL); + if (!i_cm_id) + return -ENOMEM; + + i_cm_id->cm_id = cm_id; + + spin_lock_irq(&rds_iwdev->spinlock); + list_add_tail(&i_cm_id->list, &rds_iwdev->cm_id_list); + spin_unlock_irq(&rds_iwdev->spinlock); + + return 0; +} + +static void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, + struct rdma_cm_id *cm_id) +{ + struct rds_iw_cm_id *i_cm_id; + + spin_lock_irq(&rds_iwdev->spinlock); + list_for_each_entry(i_cm_id, &rds_iwdev->cm_id_list, list) { + if (i_cm_id->cm_id == cm_id) { + list_del(&i_cm_id->list); + kfree(i_cm_id); + break; + } + } + spin_unlock_irq(&rds_iwdev->spinlock); +} + + +int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id) +{ + struct sockaddr_in *src_addr, *dst_addr; + struct rds_iw_device *rds_iwdev_old; + struct rds_sock rs; + struct rdma_cm_id *pcm_id; + int rc; + + src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr; + dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr; + + rs.rs_bound_addr = src_addr->sin_addr.s_addr; + rs.rs_bound_port = src_addr->sin_port; + rs.rs_conn_addr = dst_addr->sin_addr.s_addr; + rs.rs_conn_port = dst_addr->sin_port; + + rc = rds_iw_get_device(&rs, &rds_iwdev_old, &pcm_id); + if (rc) + rds_iw_remove_cm_id(rds_iwdev, cm_id); + + return rds_iw_add_cm_id(rds_iwdev, cm_id); +} + +void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + /* conn was previously on the nodev_conns_list */ + spin_lock_irq(&iw_nodev_conns_lock); + BUG_ON(list_empty(&iw_nodev_conns)); + BUG_ON(list_empty(&ic->iw_node)); + list_del(&ic->iw_node); + + spin_lock(&rds_iwdev->spinlock); + list_add_tail(&ic->iw_node, &rds_iwdev->conn_list); + spin_unlock(&rds_iwdev->spinlock); + spin_unlock_irq(&iw_nodev_conns_lock); + + ic->rds_iwdev = rds_iwdev; +} + +void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + /* place conn on nodev_conns_list */ + spin_lock(&iw_nodev_conns_lock); + + spin_lock_irq(&rds_iwdev->spinlock); + BUG_ON(list_empty(&ic->iw_node)); + list_del(&ic->iw_node); + spin_unlock_irq(&rds_iwdev->spinlock); + + list_add_tail(&ic->iw_node, &iw_nodev_conns); + + spin_unlock(&iw_nodev_conns_lock); + + rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id); + ic->rds_iwdev = NULL; +} + +void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock) +{ + struct rds_iw_connection *ic, *_ic; + LIST_HEAD(tmp_list); + + /* avoid calling conn_destroy with irqs off */ + spin_lock_irq(list_lock); + list_splice(list, &tmp_list); + INIT_LIST_HEAD(list); + spin_unlock_irq(list_lock); + + list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) + rds_conn_destroy(ic->conn); +} + +static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg, + struct scatterlist *list, unsigned int sg_len) +{ + sg->list = list; + sg->len = sg_len; + sg->dma_len = 0; + sg->dma_npages = 0; + sg->bytes = 0; +} + +static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev, + struct rds_iw_scatterlist *sg) +{ + struct ib_device *dev = rds_iwdev->dev; + 
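/*
 * [Editor's sketch, not part of the patch] The page accounting that
 * follows intends to accept only scatterlists whose interior segments
 * are page aligned: the first segment may start part-way into a page,
 * the last may end part-way into one, and everything in between must
 * cover whole pages.  A simplified userspace model of that rule, with
 * a hypothetical struct seg standing in for the ib_sg_dma_* accessors
 * and 4 KiB pages assumed.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_PAGE_SHIFT	12
#define SKETCH_PAGE_SIZE	(1u << SKETCH_PAGE_SHIFT)
#define SKETCH_PAGE_OFFSET	(SKETCH_PAGE_SIZE - 1)

struct seg { uint64_t addr; uint32_t len; };

/* Returns the page count, or -1 if an interior segment is misaligned. */
static long count_pages(const struct seg *sg, int n)
{
	long pages = 0;
	int i;

	for (i = 0; i < n; i++) {
		uint64_t start = sg[i].addr;
		uint64_t end = sg[i].addr + sg[i].len;

		if (start & SKETCH_PAGE_OFFSET) {
			if (i > 0)
				return -1;	/* only the first may start unaligned */
			start &= ~(uint64_t)SKETCH_PAGE_OFFSET;
		}
		if (end & SKETCH_PAGE_OFFSET) {
			if (i < n - 1)
				return -1;	/* only the last may end unaligned */
			end = (end + SKETCH_PAGE_OFFSET) & ~(uint64_t)SKETCH_PAGE_OFFSET;
		}
		pages += (end - start) >> SKETCH_PAGE_SHIFT;
	}
	return pages;
}

int main(void)
{
	struct seg sg[2] = {
		{ 0x1000 + 512, 4096 - 512 },	/* starts mid-page, ends aligned */
		{ 0x2000, 300 },		/* starts aligned, ends mid-page */
	};

	printf("pages spanned: %ld\n", count_pages(sg, 2));	/* prints 2 */
	return 0;
}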
u64 *dma_pages = NULL; + int i, j, ret; + + WARN_ON(sg->dma_len); + + sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL); + if (unlikely(!sg->dma_len)) { + printk(KERN_WARNING "RDS/IW: dma_map_sg failed!\n"); + return ERR_PTR(-EBUSY); + } + + sg->bytes = 0; + sg->dma_npages = 0; + + ret = -EINVAL; + for (i = 0; i < sg->dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]); + u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]); + u64 end_addr; + + sg->bytes += dma_len; + + end_addr = dma_addr + dma_len; + if (dma_addr & PAGE_MASK) { + if (i > 0) + goto out_unmap; + dma_addr &= ~PAGE_MASK; + } + if (end_addr & PAGE_MASK) { + if (i < sg->dma_len - 1) + goto out_unmap; + end_addr = (end_addr + PAGE_MASK) & ~PAGE_MASK; + } + + sg->dma_npages += (end_addr - dma_addr) >> PAGE_SHIFT; + } + + /* Now gather the dma addrs into one list */ + if (sg->dma_npages > fastreg_message_size) + goto out_unmap; + + dma_pages = kmalloc(sizeof(u64) * sg->dma_npages, GFP_ATOMIC); + if (!dma_pages) { + ret = -ENOMEM; + goto out_unmap; + } + + for (i = j = 0; i < sg->dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]); + u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]); + u64 end_addr; + + end_addr = dma_addr + dma_len; + dma_addr &= ~PAGE_MASK; + for (; dma_addr < end_addr; dma_addr += PAGE_SIZE) + dma_pages[j++] = dma_addr; + BUG_ON(j > sg->dma_npages); + } + + return dma_pages; + +out_unmap: + ib_dma_unmap_sg(rds_iwdev->dev, sg->list, sg->len, DMA_BIDIRECTIONAL); + sg->dma_len = 0; + kfree(dma_pages); + return ERR_PTR(ret); +} + + +struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *rds_iwdev) +{ + struct rds_iw_mr_pool *pool; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) { + printk(KERN_WARNING "RDS/IW: rds_iw_create_mr_pool alloc error\n"); + return ERR_PTR(-ENOMEM); + } + + pool->device = rds_iwdev; + INIT_LIST_HEAD(&pool->dirty_list); + INIT_LIST_HEAD(&pool->clean_list); + mutex_init(&pool->flush_lock); + spin_lock_init(&pool->list_lock); + INIT_WORK(&pool->flush_worker, rds_iw_mr_pool_flush_worker); + + pool->max_message_size = fastreg_message_size; + pool->max_items = fastreg_pool_size; + pool->max_free_pinned = pool->max_items * pool->max_message_size / 4; + pool->max_pages = fastreg_message_size; + + /* We never allow more than max_items MRs to be allocated. + * When we exceed more than max_items_soft, we start freeing + * items more aggressively. 
+ * Make sure that max_items > max_items_soft > max_items / 2 + */ + pool->max_items_soft = pool->max_items * 3 / 4; + + return pool; +} + +void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo) +{ + struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool; + + iinfo->rdma_mr_max = pool->max_items; + iinfo->rdma_mr_size = pool->max_pages; +} + +void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *pool) +{ + flush_workqueue(rds_wq); + rds_iw_flush_mr_pool(pool, 1); + BUG_ON(atomic_read(&pool->item_count)); + BUG_ON(atomic_read(&pool->free_pinned)); + kfree(pool); +} + +static inline struct rds_iw_mr *rds_iw_reuse_fmr(struct rds_iw_mr_pool *pool) +{ + struct rds_iw_mr *ibmr = NULL; + unsigned long flags; + + spin_lock_irqsave(&pool->list_lock, flags); + if (!list_empty(&pool->clean_list)) { + ibmr = list_entry(pool->clean_list.next, struct rds_iw_mr, mapping.m_list); + list_del_init(&ibmr->mapping.m_list); + } + spin_unlock_irqrestore(&pool->list_lock, flags); + + return ibmr; +} + +static struct rds_iw_mr *rds_iw_alloc_mr(struct rds_iw_device *rds_iwdev) +{ + struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool; + struct rds_iw_mr *ibmr = NULL; + int err = 0, iter = 0; + + while (1) { + ibmr = rds_iw_reuse_fmr(pool); + if (ibmr) + return ibmr; + + /* No clean MRs - now we have the choice of either + * allocating a fresh MR up to the limit imposed by the + * driver, or flush any dirty unused MRs. + * We try to avoid stalling in the send path if possible, + * so we allocate as long as we're allowed to. + * + * We're fussy with enforcing the FMR limit, though. If the driver + * tells us we can't use more than N fmrs, we shouldn't start + * arguing with it */ + if (atomic_inc_return(&pool->item_count) <= pool->max_items) + break; + + atomic_dec(&pool->item_count); + + if (++iter > 2) { + rds_iw_stats_inc(s_iw_rdma_mr_pool_depleted); + return ERR_PTR(-EAGAIN); + } + + /* We do have some empty MRs. Flush them out. */ + rds_iw_stats_inc(s_iw_rdma_mr_pool_wait); + rds_iw_flush_mr_pool(pool, 0); + } + + ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); + if (!ibmr) { + err = -ENOMEM; + goto out_no_cigar; + } + + spin_lock_init(&ibmr->mapping.m_lock); + INIT_LIST_HEAD(&ibmr->mapping.m_list); + ibmr->mapping.m_mr = ibmr; + + err = rds_iw_init_fastreg(pool, ibmr); + if (err) + goto out_no_cigar; + + rds_iw_stats_inc(s_iw_rdma_mr_alloc); + return ibmr; + +out_no_cigar: + if (ibmr) { + rds_iw_destroy_fastreg(pool, ibmr); + kfree(ibmr); + } + atomic_dec(&pool->item_count); + return ERR_PTR(err); +} + +void rds_iw_sync_mr(void *trans_private, int direction) +{ + struct rds_iw_mr *ibmr = trans_private; + struct rds_iw_device *rds_iwdev = ibmr->device; + + switch (direction) { + case DMA_FROM_DEVICE: + ib_dma_sync_sg_for_cpu(rds_iwdev->dev, ibmr->mapping.m_sg.list, + ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL); + break; + case DMA_TO_DEVICE: + ib_dma_sync_sg_for_device(rds_iwdev->dev, ibmr->mapping.m_sg.list, + ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL); + break; + } +} + +static inline unsigned int rds_iw_flush_goal(struct rds_iw_mr_pool *pool, int free_all) +{ + unsigned int item_count; + + item_count = atomic_read(&pool->item_count); + if (free_all) + return item_count; + + return 0; +} + +/* + * Flush our pool of MRs. + * At a minimum, all currently unused MRs are unmapped. + * If the number of MRs allocated exceeds the limit, we also try + * to free as many MRs as needed to get back to this limit. 
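/*
 * [Editor's sketch, not part of the patch] rds_iw_alloc_mr() above
 * enforces the pool's hard item limit with the classic "increment
 * first, back out on overshoot" idiom, retrying a couple of times
 * (flushing the pool in between) before giving up.  A minimal
 * userspace model of that idiom with C11 atomics; try_reclaim() is a
 * hypothetical stand-in for rds_iw_flush_mr_pool().
 */
#include <stdatomic.h>
#include <stdio.h>

#define MAX_ITEMS	4

static atomic_int item_count;

static void try_reclaim(void)
{
	/* the real code would flush unused MRs back to the pool here */
}

/* Returns 0 on success, -1 if the pool stayed full after a few tries. */
static int reserve_item(void)
{
	int iter = 0;

	for (;;) {
		/* optimistically claim a slot ... */
		if (atomic_fetch_add(&item_count, 1) + 1 <= MAX_ITEMS)
			return 0;
		/* ... and back out if that pushed us over the limit */
		atomic_fetch_sub(&item_count, 1);

		if (++iter > 2)
			return -1;
		try_reclaim();
	}
}

int main(void)
{
	int i, ok = 0;

	for (i = 0; i < 6; i++)
		ok += (reserve_item() == 0);
	printf("reserved %d of 6 requests (cap %d)\n", ok, MAX_ITEMS);
	return 0;
}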
+ */ +static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all) +{ + struct rds_iw_mr *ibmr, *next; + LIST_HEAD(unmap_list); + LIST_HEAD(kill_list); + unsigned long flags; + unsigned int nfreed = 0, ncleaned = 0, free_goal; + int ret = 0; + + rds_iw_stats_inc(s_iw_rdma_mr_pool_flush); + + mutex_lock(&pool->flush_lock); + + spin_lock_irqsave(&pool->list_lock, flags); + /* Get the list of all mappings to be destroyed */ + list_splice_init(&pool->dirty_list, &unmap_list); + if (free_all) + list_splice_init(&pool->clean_list, &kill_list); + spin_unlock_irqrestore(&pool->list_lock, flags); + + free_goal = rds_iw_flush_goal(pool, free_all); + + /* Batched invalidate of dirty MRs. + * For FMR based MRs, the mappings on the unmap list are + * actually members of an ibmr (ibmr->mapping). They either + * migrate to the kill_list, or have been cleaned and should be + * moved to the clean_list. + * For fastregs, they will be dynamically allocated, and + * will be destroyed by the unmap function. + */ + if (!list_empty(&unmap_list)) { + ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, &kill_list); + /* If we've been asked to destroy all MRs, move those + * that were simply cleaned to the kill list */ + if (free_all) + list_splice_init(&unmap_list, &kill_list); + } + + /* Destroy any MRs that are past their best before date */ + list_for_each_entry_safe(ibmr, next, &kill_list, mapping.m_list) { + rds_iw_stats_inc(s_iw_rdma_mr_free); + list_del(&ibmr->mapping.m_list); + rds_iw_destroy_fastreg(pool, ibmr); + kfree(ibmr); + nfreed++; + } + + /* Anything that remains are laundered ibmrs, which we can add + * back to the clean list. */ + if (!list_empty(&unmap_list)) { + spin_lock_irqsave(&pool->list_lock, flags); + list_splice(&unmap_list, &pool->clean_list); + spin_unlock_irqrestore(&pool->list_lock, flags); + } + + atomic_sub(ncleaned, &pool->dirty_count); + atomic_sub(nfreed, &pool->item_count); + + mutex_unlock(&pool->flush_lock); + return ret; +} + +static void rds_iw_mr_pool_flush_worker(struct work_struct *work) +{ + struct rds_iw_mr_pool *pool = container_of(work, struct rds_iw_mr_pool, flush_worker); + + rds_iw_flush_mr_pool(pool, 0); +} + +void rds_iw_free_mr(void *trans_private, int invalidate) +{ + struct rds_iw_mr *ibmr = trans_private; + struct rds_iw_mr_pool *pool = ibmr->device->mr_pool; + + rdsdebug("RDS/IW: free_mr nents %u\n", ibmr->mapping.m_sg.len); + if (!pool) + return; + + /* Return it to the pool's free list */ + rds_iw_free_fastreg(pool, ibmr); + + /* If we've pinned too many pages, request a flush */ + if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || + atomic_read(&pool->dirty_count) >= pool->max_items / 10) + queue_work(rds_wq, &pool->flush_worker); + + if (invalidate) { + if (likely(!in_interrupt())) { + rds_iw_flush_mr_pool(pool, 0); + } else { + /* We get here if the user created a MR marked + * as use_once and invalidate at the same time. 
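/*
 * [Editor's sketch, not part of the patch] The flush path above uses
 * the usual "splice under the lock, work outside it" pattern: the
 * spinlock is held only long enough to steal the whole dirty list onto
 * a private list head, and the expensive per-entry work runs with the
 * lock dropped.  A minimal userspace model with a pthread mutex and a
 * singly linked list; all names here are hypothetical.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; int id; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *dirty_list;

static void mark_dirty(int id)
{
	struct node *n = malloc(sizeof(*n));

	if (!n)
		return;
	n->id = id;
	pthread_mutex_lock(&list_lock);
	n->next = dirty_list;		/* push under the lock */
	dirty_list = n;
	pthread_mutex_unlock(&list_lock);
}

static void flush_dirty(void)
{
	struct node *batch, *n;

	pthread_mutex_lock(&list_lock);
	batch = dirty_list;		/* splice: steal the whole list */
	dirty_list = NULL;
	pthread_mutex_unlock(&list_lock);

	while ((n = batch)) {		/* expensive work, lock not held */
		batch = n->next;
		printf("cleaning entry %d\n", n->id);
		free(n);
	}
}

int main(void)
{
	mark_dirty(1);
	mark_dirty(2);
	flush_dirty();
	return 0;
}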
*/ + queue_work(rds_wq, &pool->flush_worker); + } + } +} + +void rds_iw_flush_mrs(void) +{ + struct rds_iw_device *rds_iwdev; + + list_for_each_entry(rds_iwdev, &rds_iw_devices, list) { + struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool; + + if (pool) + rds_iw_flush_mr_pool(pool, 0); + } +} + +void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, + struct rds_sock *rs, u32 *key_ret) +{ + struct rds_iw_device *rds_iwdev; + struct rds_iw_mr *ibmr = NULL; + struct rdma_cm_id *cm_id; + int ret; + + ret = rds_iw_get_device(rs, &rds_iwdev, &cm_id); + if (ret || !cm_id) { + ret = -ENODEV; + goto out; + } + + if (!rds_iwdev->mr_pool) { + ret = -ENODEV; + goto out; + } + + ibmr = rds_iw_alloc_mr(rds_iwdev); + if (IS_ERR(ibmr)) + return ibmr; + + ibmr->cm_id = cm_id; + ibmr->device = rds_iwdev; + + ret = rds_iw_map_fastreg(rds_iwdev->mr_pool, ibmr, sg, nents); + if (ret == 0) + *key_ret = ibmr->mr->rkey; + else + printk(KERN_WARNING "RDS/IW: failed to map mr (errno=%d)\n", ret); + +out: + if (ret) { + if (ibmr) + rds_iw_free_mr(ibmr, 0); + ibmr = ERR_PTR(ret); + } + return ibmr; +} + +/* + * iWARP fastreg handling + * + * The life cycle of a fastreg registration is a bit different from + * FMRs. + * The idea behind fastreg is to have one MR, to which we bind different + * mappings over time. To avoid stalling on the expensive map and invalidate + * operations, these operations are pipelined on the same send queue on + * which we want to send the message containing the r_key. + * + * This creates a bit of a problem for us, as we do not have the destination + * IP in GET_MR, so the connection must be setup prior to the GET_MR call for + * RDMA to be correctly setup. If a fastreg request is present, rds_iw_xmit + * will try to queue a LOCAL_INV (if needed) and a FAST_REG_MR work request + * before queuing the SEND. When completions for these arrive, they are + * dispatched to the MR has a bit set showing that RDMa can be performed. + * + * There is another interesting aspect that's related to invalidation. + * The application can request that a mapping is invalidated in FREE_MR. + * The expectation there is that this invalidation step includes ALL + * PREVIOUSLY FREED MRs. + */ +static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, + struct rds_iw_mr *ibmr) +{ + struct rds_iw_device *rds_iwdev = pool->device; + struct ib_fast_reg_page_list *page_list = NULL; + struct ib_mr *mr; + int err; + + mr = ib_alloc_fast_reg_mr(rds_iwdev->pd, pool->max_message_size); + if (IS_ERR(mr)) { + err = PTR_ERR(mr); + + printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed (err=%d)\n", err); + return err; + } + + /* FIXME - this is overkill, but mapping->m_sg.dma_len/mapping->m_sg.dma_npages + * is not filled in. + */ + page_list = ib_alloc_fast_reg_page_list(rds_iwdev->dev, pool->max_message_size); + if (IS_ERR(page_list)) { + err = PTR_ERR(page_list); + + printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed (err=%d)\n", err); + ib_dereg_mr(mr); + return err; + } + + ibmr->page_list = page_list; + ibmr->mr = mr; + return 0; +} + +static int rds_iw_rdma_build_fastreg(struct rds_iw_mapping *mapping) +{ + struct rds_iw_mr *ibmr = mapping->m_mr; + struct ib_send_wr f_wr, *failed_wr; + int ret; + + /* + * Perform a WR for the fast_reg_mr. Each individual page + * in the sg list is added to the fast reg page list and placed + * inside the fast_reg_mr WR. The key used is a rolling 8bit + * counter, which should guarantee uniqueness. 
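/*
 * [Editor's sketch, not part of the patch] The fast-reg path below
 * refreshes the R_Key before every re-registration by replacing its
 * low 8 bits with a rolling per-MR counter, which is what
 * ib_update_fast_reg_key() does on a real MR.  A userspace model of
 * that key rotation; "struct fake_mr" is a hypothetical stand-in for
 * struct ib_mr.
 */
#include <stdint.h>
#include <stdio.h>

struct fake_mr {
	uint32_t rkey;		/* 24-bit index + 8-bit consumer key */
	uint8_t remap_count;	/* rolling counter, wraps at 256 */
};

static void update_fast_reg_key(struct fake_mr *mr, uint8_t newkey)
{
	mr->rkey = (mr->rkey & 0xffffff00u) | newkey;
}

int main(void)
{
	struct fake_mr mr = { .rkey = 0x00abcd00u };
	int i;

	for (i = 0; i < 3; i++) {
		update_fast_reg_key(&mr, mr.remap_count++);
		printf("remap %d -> rkey 0x%08x\n", i, mr.rkey);
	}
	return 0;
}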
+ */ + ib_update_fast_reg_key(ibmr->mr, ibmr->remap_count++); + mapping->m_rkey = ibmr->mr->rkey; + + memset(&f_wr, 0, sizeof(f_wr)); + f_wr.wr_id = RDS_IW_FAST_REG_WR_ID; + f_wr.opcode = IB_WR_FAST_REG_MR; + f_wr.wr.fast_reg.length = mapping->m_sg.bytes; + f_wr.wr.fast_reg.rkey = mapping->m_rkey; + f_wr.wr.fast_reg.page_list = ibmr->page_list; + f_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_len; + f_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + f_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + f_wr.wr.fast_reg.iova_start = 0; + f_wr.send_flags = IB_SEND_SIGNALED; + + failed_wr = &f_wr; + ret = ib_post_send(ibmr->cm_id->qp, &f_wr, &failed_wr); + BUG_ON(failed_wr != &f_wr); + if (ret && printk_ratelimit()) + printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n", + __func__, __LINE__, ret); + return ret; +} + +static int rds_iw_rdma_fastreg_inv(struct rds_iw_mr *ibmr) +{ + struct ib_send_wr s_wr, *failed_wr; + int ret = 0; + + if (!ibmr->cm_id->qp || !ibmr->mr) + goto out; + + memset(&s_wr, 0, sizeof(s_wr)); + s_wr.wr_id = RDS_IW_LOCAL_INV_WR_ID; + s_wr.opcode = IB_WR_LOCAL_INV; + s_wr.ex.invalidate_rkey = ibmr->mr->rkey; + s_wr.send_flags = IB_SEND_SIGNALED; + + failed_wr = &s_wr; + ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr); + if (ret && printk_ratelimit()) { + printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n", + __func__, __LINE__, ret); + goto out; + } +out: + return ret; +} + +static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool, + struct rds_iw_mr *ibmr, + struct scatterlist *sg, + unsigned int sg_len) +{ + struct rds_iw_device *rds_iwdev = pool->device; + struct rds_iw_mapping *mapping = &ibmr->mapping; + u64 *dma_pages; + int i, ret = 0; + + rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len); + + dma_pages = rds_iw_map_scatterlist(rds_iwdev, &mapping->m_sg); + if (IS_ERR(dma_pages)) { + ret = PTR_ERR(dma_pages); + dma_pages = NULL; + goto out; + } + + if (mapping->m_sg.dma_len > pool->max_message_size) { + ret = -EMSGSIZE; + goto out; + } + + for (i = 0; i < mapping->m_sg.dma_npages; ++i) + ibmr->page_list->page_list[i] = dma_pages[i]; + + ret = rds_iw_rdma_build_fastreg(mapping); + if (ret) + goto out; + + rds_iw_stats_inc(s_iw_rdma_mr_used); + +out: + kfree(dma_pages); + + return ret; +} + +/* + * "Free" a fastreg MR. + */ +static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, + struct rds_iw_mr *ibmr) +{ + unsigned long flags; + int ret; + + if (!ibmr->mapping.m_sg.dma_len) + return; + + ret = rds_iw_rdma_fastreg_inv(ibmr); + if (ret) + return; + + /* Try to post the LOCAL_INV WR to the queue. */ + spin_lock_irqsave(&pool->list_lock, flags); + + list_add_tail(&ibmr->mapping.m_list, &pool->dirty_list); + atomic_add(ibmr->mapping.m_sg.len, &pool->free_pinned); + atomic_inc(&pool->dirty_count); + + spin_unlock_irqrestore(&pool->list_lock, flags); +} + +static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, + struct list_head *unmap_list, + struct list_head *kill_list) +{ + struct rds_iw_mapping *mapping, *next; + unsigned int ncleaned = 0; + LIST_HEAD(laundered); + + /* Batched invalidation of fastreg MRs. + * Why do we do it this way, even though we could pipeline unmap + * and remap? The reason is the application semantics - when the + * application requests an invalidation of MRs, it expects all + * previously released R_Keys to become invalid. + * + * If we implement MR reuse naively, we risk memory corruption + * (this has actually been observed). 
So the default behavior + * requires that a MR goes through an explicit unmap operation before + * we can reuse it again. + * + * We could probably improve on this a little, by allowing immediate + * reuse of a MR on the same socket (eg you could add small + * cache of unused MRs to strct rds_socket - GET_MR could grab one + * of these without requiring an explicit invalidate). + */ + while (!list_empty(unmap_list)) { + unsigned long flags; + + spin_lock_irqsave(&pool->list_lock, flags); + list_for_each_entry_safe(mapping, next, unmap_list, m_list) { + list_move(&mapping->m_list, &laundered); + ncleaned++; + } + spin_unlock_irqrestore(&pool->list_lock, flags); + } + + /* Move all laundered mappings back to the unmap list. + * We do not kill any WRs right now - it doesn't seem the + * fastreg API has a max_remap limit. */ + list_splice_init(&laundered, unmap_list); + + return ncleaned; +} + +static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, + struct rds_iw_mr *ibmr) +{ + if (ibmr->page_list) + ib_free_fast_reg_page_list(ibmr->page_list); + if (ibmr->mr) + ib_dereg_mr(ibmr->mr); +} diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c new file mode 100644 index 00000000..5e57347f --- /dev/null +++ b/net/rds/iw_recv.c @@ -0,0 +1,920 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include + +#include "rds.h" +#include "iw.h" + +static struct kmem_cache *rds_iw_incoming_slab; +static struct kmem_cache *rds_iw_frag_slab; +static atomic_t rds_iw_allocation = ATOMIC_INIT(0); + +static void rds_iw_frag_drop_page(struct rds_page_frag *frag) +{ + rdsdebug("frag %p page %p\n", frag, frag->f_page); + __free_page(frag->f_page); + frag->f_page = NULL; +} + +static void rds_iw_frag_free(struct rds_page_frag *frag) +{ + rdsdebug("frag %p page %p\n", frag, frag->f_page); + BUG_ON(frag->f_page); + kmem_cache_free(rds_iw_frag_slab, frag); +} + +/* + * We map a page at a time. Its fragments are posted in order. This + * is called in fragment order as the fragments get send completion events. + * Only the last frag in the page performs the unmapping. 
+ * + * It's OK for ring cleanup to call this in whatever order it likes because + * DMA is not in flight and so we can unmap while other ring entries still + * hold page references in their frags. + */ +static void rds_iw_recv_unmap_page(struct rds_iw_connection *ic, + struct rds_iw_recv_work *recv) +{ + struct rds_page_frag *frag = recv->r_frag; + + rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page); + if (frag->f_mapped) + ib_dma_unmap_page(ic->i_cm_id->device, + frag->f_mapped, + RDS_FRAG_SIZE, DMA_FROM_DEVICE); + frag->f_mapped = 0; +} + +void rds_iw_recv_init_ring(struct rds_iw_connection *ic) +{ + struct rds_iw_recv_work *recv; + u32 i; + + for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) { + struct ib_sge *sge; + + recv->r_iwinc = NULL; + recv->r_frag = NULL; + + recv->r_wr.next = NULL; + recv->r_wr.wr_id = i; + recv->r_wr.sg_list = recv->r_sge; + recv->r_wr.num_sge = RDS_IW_RECV_SGE; + + sge = rds_iw_data_sge(ic, recv->r_sge); + sge->addr = 0; + sge->length = RDS_FRAG_SIZE; + sge->lkey = 0; + + sge = rds_iw_header_sge(ic, recv->r_sge); + sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = 0; + } +} + +static void rds_iw_recv_clear_one(struct rds_iw_connection *ic, + struct rds_iw_recv_work *recv) +{ + if (recv->r_iwinc) { + rds_inc_put(&recv->r_iwinc->ii_inc); + recv->r_iwinc = NULL; + } + if (recv->r_frag) { + rds_iw_recv_unmap_page(ic, recv); + if (recv->r_frag->f_page) + rds_iw_frag_drop_page(recv->r_frag); + rds_iw_frag_free(recv->r_frag); + recv->r_frag = NULL; + } +} + +void rds_iw_recv_clear_ring(struct rds_iw_connection *ic) +{ + u32 i; + + for (i = 0; i < ic->i_recv_ring.w_nr; i++) + rds_iw_recv_clear_one(ic, &ic->i_recvs[i]); + + if (ic->i_frag.f_page) + rds_iw_frag_drop_page(&ic->i_frag); +} + +static int rds_iw_recv_refill_one(struct rds_connection *conn, + struct rds_iw_recv_work *recv, + gfp_t kptr_gfp, gfp_t page_gfp) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + dma_addr_t dma_addr; + struct ib_sge *sge; + int ret = -ENOMEM; + + if (!recv->r_iwinc) { + if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) { + rds_iw_stats_inc(s_iw_rx_alloc_limit); + goto out; + } + recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab, + kptr_gfp); + if (!recv->r_iwinc) { + atomic_dec(&rds_iw_allocation); + goto out; + } + INIT_LIST_HEAD(&recv->r_iwinc->ii_frags); + rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr); + } + + if (!recv->r_frag) { + recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp); + if (!recv->r_frag) + goto out; + INIT_LIST_HEAD(&recv->r_frag->f_item); + recv->r_frag->f_page = NULL; + } + + if (!ic->i_frag.f_page) { + ic->i_frag.f_page = alloc_page(page_gfp); + if (!ic->i_frag.f_page) + goto out; + ic->i_frag.f_offset = 0; + } + + dma_addr = ib_dma_map_page(ic->i_cm_id->device, + ic->i_frag.f_page, + ic->i_frag.f_offset, + RDS_FRAG_SIZE, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr)) + goto out; + + /* + * Once we get the RDS_PAGE_LAST_OFF frag then rds_iw_frag_unmap() + * must be called on this recv. This happens as completions hit + * in order or on connection shutdown. 
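/*
 * [Editor's sketch, not part of the patch] The refill path around this
 * point carves one page into RDS_FRAG_SIZE chunks.  Every posted
 * receive takes its own reference on the page (get_page()), and once
 * the final fragment of the page has been handed out the connection
 * drops its allocator reference, so the page is freed only after every
 * receive that uses it has completed.  A userspace model with a plain
 * reference counter; sizes and names are arbitrary stand-ins.
 */
#include <stdio.h>
#include <stdlib.h>

#define SKETCH_PAGE_SIZE	4096u
#define SKETCH_FRAG_SIZE	1024u
#define LAST_FRAG_OFF		(SKETCH_PAGE_SIZE - SKETCH_FRAG_SIZE)

struct page_ref { int refcount; };

static void page_put(struct page_ref *p)
{
	if (--p->refcount == 0) {
		printf("page freed\n");
		free(p);
	}
}

int main(void)
{
	struct page_ref *recv_pages[6];
	struct page_ref *page = NULL;
	unsigned int offset = 0, i;

	for (i = 0; i < 6; i++) {		/* post six receives */
		if (!page) {			/* connection's allocator reference */
			page = malloc(sizeof(*page));
			if (!page)
				return 1;
			page->refcount = 1;
			offset = 0;
		}
		page->refcount++;		/* get_page() per posted recv */
		recv_pages[i] = page;
		printf("recv %u uses offset %u\n", i, offset);

		if (offset < LAST_FRAG_OFF) {
			offset += SKETCH_FRAG_SIZE;
		} else {
			page_put(page);		/* last fragment: drop allocator ref */
			page = NULL;
		}
	}

	if (page)
		page_put(page);			/* partially used page at shutdown */

	for (i = 0; i < 6; i++)			/* completions release their refs */
		page_put(recv_pages[i]);

	return 0;
}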
+ */ + recv->r_frag->f_page = ic->i_frag.f_page; + recv->r_frag->f_offset = ic->i_frag.f_offset; + recv->r_frag->f_mapped = dma_addr; + + sge = rds_iw_data_sge(ic, recv->r_sge); + sge->addr = dma_addr; + sge->length = RDS_FRAG_SIZE; + + sge = rds_iw_header_sge(ic, recv->r_sge); + sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); + sge->length = sizeof(struct rds_header); + + get_page(recv->r_frag->f_page); + + if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) { + ic->i_frag.f_offset += RDS_FRAG_SIZE; + } else { + put_page(ic->i_frag.f_page); + ic->i_frag.f_page = NULL; + ic->i_frag.f_offset = 0; + } + + ret = 0; +out: + return ret; +} + +/* + * This tries to allocate and post unused work requests after making sure that + * they have all the allocations they need to queue received fragments into + * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc + * pairs don't go unmatched. + * + * -1 is returned if posting fails due to temporary resource exhaustion. + */ +int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, + gfp_t page_gfp, int prefill) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct rds_iw_recv_work *recv; + struct ib_recv_wr *failed_wr; + unsigned int posted = 0; + int ret = 0; + u32 pos; + + while ((prefill || rds_conn_up(conn)) && + rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) { + if (pos >= ic->i_recv_ring.w_nr) { + printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", + pos); + ret = -EINVAL; + break; + } + + recv = &ic->i_recvs[pos]; + ret = rds_iw_recv_refill_one(conn, recv, kptr_gfp, page_gfp); + if (ret) { + ret = -1; + break; + } + + /* XXX when can this fail? */ + ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); + rdsdebug("recv %p iwinc %p page %p addr %lu ret %d\n", recv, + recv->r_iwinc, recv->r_frag->f_page, + (long) recv->r_frag->f_mapped, ret); + if (ret) { + rds_iw_conn_error(conn, "recv post on " + "%pI4 returned %d, disconnecting and " + "reconnecting\n", &conn->c_faddr, + ret); + ret = -1; + break; + } + + posted++; + } + + /* We're doing flow control - update the window. 
*/ + if (ic->i_flowctl && posted) + rds_iw_advertise_credits(conn, posted); + + if (ret) + rds_iw_ring_unalloc(&ic->i_recv_ring, 1); + return ret; +} + +static void rds_iw_inc_purge(struct rds_incoming *inc) +{ + struct rds_iw_incoming *iwinc; + struct rds_page_frag *frag; + struct rds_page_frag *pos; + + iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); + rdsdebug("purging iwinc %p inc %p\n", iwinc, inc); + + list_for_each_entry_safe(frag, pos, &iwinc->ii_frags, f_item) { + list_del_init(&frag->f_item); + rds_iw_frag_drop_page(frag); + rds_iw_frag_free(frag); + } +} + +void rds_iw_inc_free(struct rds_incoming *inc) +{ + struct rds_iw_incoming *iwinc; + + iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); + + rds_iw_inc_purge(inc); + rdsdebug("freeing iwinc %p inc %p\n", iwinc, inc); + BUG_ON(!list_empty(&iwinc->ii_frags)); + kmem_cache_free(rds_iw_incoming_slab, iwinc); + atomic_dec(&rds_iw_allocation); + BUG_ON(atomic_read(&rds_iw_allocation) < 0); +} + +int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, + size_t size) +{ + struct rds_iw_incoming *iwinc; + struct rds_page_frag *frag; + struct iovec *iov = first_iov; + unsigned long to_copy; + unsigned long frag_off = 0; + unsigned long iov_off = 0; + int copied = 0; + int ret; + u32 len; + + iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); + frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item); + len = be32_to_cpu(inc->i_hdr.h_len); + + while (copied < size && copied < len) { + if (frag_off == RDS_FRAG_SIZE) { + frag = list_entry(frag->f_item.next, + struct rds_page_frag, f_item); + frag_off = 0; + } + while (iov_off == iov->iov_len) { + iov_off = 0; + iov++; + } + + to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off); + to_copy = min_t(size_t, to_copy, size - copied); + to_copy = min_t(unsigned long, to_copy, len - copied); + + rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag " + "[%p, %lu] + %lu\n", + to_copy, iov->iov_base, iov->iov_len, iov_off, + frag->f_page, frag->f_offset, frag_off); + + /* XXX needs + offset for multiple recvs per page */ + ret = rds_page_copy_to_user(frag->f_page, + frag->f_offset + frag_off, + iov->iov_base + iov_off, + to_copy); + if (ret) { + copied = ret; + break; + } + + iov_off += to_copy; + frag_off += to_copy; + copied += to_copy; + } + + return copied; +} + +/* ic starts out kzalloc()ed */ +void rds_iw_recv_init_ack(struct rds_iw_connection *ic) +{ + struct ib_send_wr *wr = &ic->i_ack_wr; + struct ib_sge *sge = &ic->i_ack_sge; + + sge->addr = ic->i_ack_dma; + sge->length = sizeof(struct rds_header); + sge->lkey = rds_iw_local_dma_lkey(ic); + + wr->sg_list = sge; + wr->num_sge = 1; + wr->opcode = IB_WR_SEND; + wr->wr_id = RDS_IW_ACK_WR_ID; + wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED; +} + +/* + * You'd think that with reliable IB connections you wouldn't need to ack + * messages that have been received. The problem is that IB hardware generates + * an ack message before it has DMAed the message into memory. This creates a + * potential message loss if the HCA is disabled for any reason between when it + * sends the ack and before the message is DMAed and processed. This is only a + * potential issue if another HCA is available for fail-over. + * + * When the remote host receives our ack they'll free the sent message from + * their send queue. To decrease the latency of this we always send an ack + * immediately after we've received messages. 
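/*
 * [Editor's sketch, not part of the patch] The copy-to-user loop above
 * repeatedly copies the minimum of three remaining quantities -- space
 * left in the current iovec, bytes left in the current fragment, and
 * bytes left in the message -- advancing whichever cursor runs out.  A
 * self-contained userspace model, with memcpy() standing in for
 * rds_page_copy_to_user() and a local struct in place of struct iovec.
 */
#include <stdio.h>
#include <string.h>

#define FRAG_SZ 4u

struct iovec_s { char *base; size_t len; };

static size_t min3(size_t a, size_t b, size_t c)
{
	size_t m = a < b ? a : b;
	return m < c ? m : c;
}

static size_t copy_frags_to_iov(const char frags[][FRAG_SZ], size_t msg_len,
				struct iovec_s *iov, size_t niov)
{
	size_t copied = 0, frag = 0, frag_off = 0, iov_idx = 0, iov_off = 0;

	while (copied < msg_len && iov_idx < niov) {
		size_t n;

		if (frag_off == FRAG_SZ) {		/* next fragment */
			frag++;
			frag_off = 0;
		}
		if (iov_off == iov[iov_idx].len) {	/* next iovec */
			iov_idx++;
			iov_off = 0;
			continue;
		}

		n = min3(iov[iov_idx].len - iov_off,
			 FRAG_SZ - frag_off,
			 msg_len - copied);
		memcpy(iov[iov_idx].base + iov_off, frags[frag] + frag_off, n);

		iov_off += n;
		frag_off += n;
		copied += n;
	}
	return copied;
}

int main(void)
{
	const char frags[3][FRAG_SZ] = {
		{ 'a', 'b', 'c', 'd' },
		{ 'e', 'f', 'g', 'h' },
		{ 'i', 'j' },
	};
	char buf1[5], buf2[8];
	struct iovec_s iov[2] = { { buf1, 5 }, { buf2, 8 } };
	size_t n = copy_frags_to_iov(frags, 10, iov, 2);

	printf("copied %zu bytes: %.5s%.5s\n", n, buf1, buf2);
	return 0;
}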
+ * + * For simplicity, we only have one ack in flight at a time. This puts + * pressure on senders to have deep enough send queues to absorb the latency of + * a single ack frame being in flight. This might not be good enough. + * + * This is implemented by have a long-lived send_wr and sge which point to a + * statically allocated ack frame. This ack wr does not fall under the ring + * accounting that the tx and rx wrs do. The QP attribute specifically makes + * room for it beyond the ring size. Send completion notices its special + * wr_id and avoids working with the ring in that case. + */ +#ifndef KERNEL_HAS_ATOMIC64 +static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, + int ack_required) +{ + unsigned long flags; + + spin_lock_irqsave(&ic->i_ack_lock, flags); + ic->i_ack_next = seq; + if (ack_required) + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + spin_unlock_irqrestore(&ic->i_ack_lock, flags); +} + +static u64 rds_iw_get_ack(struct rds_iw_connection *ic) +{ + unsigned long flags; + u64 seq; + + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + + spin_lock_irqsave(&ic->i_ack_lock, flags); + seq = ic->i_ack_next; + spin_unlock_irqrestore(&ic->i_ack_lock, flags); + + return seq; +} +#else +static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, + int ack_required) +{ + atomic64_set(&ic->i_ack_next, seq); + if (ack_required) { + smp_mb__before_clear_bit(); + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + } +} + +static u64 rds_iw_get_ack(struct rds_iw_connection *ic) +{ + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + smp_mb__after_clear_bit(); + + return atomic64_read(&ic->i_ack_next); +} +#endif + + +static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits) +{ + struct rds_header *hdr = ic->i_ack; + struct ib_send_wr *failed_wr; + u64 seq; + int ret; + + seq = rds_iw_get_ack(ic); + + rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq); + rds_message_populate_header(hdr, 0, 0, 0); + hdr->h_ack = cpu_to_be64(seq); + hdr->h_credit = adv_credits; + rds_message_make_checksum(hdr); + ic->i_ack_queued = jiffies; + + ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr); + if (unlikely(ret)) { + /* Failed to send. Release the WR, and + * force another ACK. + */ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + + rds_iw_stats_inc(s_iw_ack_send_failure); + + rds_iw_conn_error(ic->conn, "sending ack failed\n"); + } else + rds_iw_stats_inc(s_iw_ack_sent); +} + +/* + * There are 3 ways of getting acknowledgements to the peer: + * 1. We call rds_iw_attempt_ack from the recv completion handler + * to send an ACK-only frame. + * However, there can be only one such frame in the send queue + * at any time, so we may have to postpone it. + * 2. When another (data) packet is transmitted while there's + * an ACK in the queue, we piggyback the ACK sequence number + * on the data packet. + * 3. If the ACK WR is done sending, we get called from the + * send queue completion handler, and check whether there's + * another ACK pending (postponed because the WR was on the + * queue). If so, we transmit it. + * + * We maintain 2 variables: + * - i_ack_flags, which keeps track of whether the ACK WR + * is currently in the send queue or not (IB_ACK_IN_FLIGHT) + * - i_ack_next, which is the last sequence number we received + * + * Potentially, send queue and receive queue handlers can run concurrently. 
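/*
 * [Editor's sketch, not part of the patch] The two rds_iw_set_ack() /
 * rds_iw_get_ack() variants above exist only because a 64-bit store is
 * not atomic on every platform: the fallback wraps the sequence number
 * in a lock, while the preferred variant uses an atomic64.  A userspace
 * model of both strategies with C11 primitives; the ACK_REQUESTED flag
 * is kept separate, as in the driver, and is omitted here.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Fallback: 64-bit value guarded by a lock. */
static pthread_mutex_t ack_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t ack_next_locked;

static void set_ack_locked(uint64_t seq)
{
	pthread_mutex_lock(&ack_lock);
	ack_next_locked = seq;
	pthread_mutex_unlock(&ack_lock);
}

static uint64_t get_ack_locked(void)
{
	uint64_t seq;

	pthread_mutex_lock(&ack_lock);
	seq = ack_next_locked;
	pthread_mutex_unlock(&ack_lock);
	return seq;
}

/* Preferred: a true 64-bit atomic, no lock needed. */
static _Atomic uint64_t ack_next_atomic;

static void set_ack_atomic(uint64_t seq)
{
	atomic_store(&ack_next_atomic, seq);
}

static uint64_t get_ack_atomic(void)
{
	return atomic_load(&ack_next_atomic);
}

int main(void)
{
	set_ack_locked(1000);
	set_ack_atomic(1000);
	printf("locked=%llu atomic=%llu\n",
	       (unsigned long long)get_ack_locked(),
	       (unsigned long long)get_ack_atomic());
	return 0;
}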
+ * It would be nice to not have to use a spinlock to synchronize things, + * but the one problem that rules this out is that 64bit updates are + * not atomic on all platforms. Things would be a lot simpler if + * we had atomic64 or maybe cmpxchg64 everywhere. + * + * Reconnecting complicates this picture just slightly. When we + * reconnect, we may be seeing duplicate packets. The peer + * is retransmitting them, because it hasn't seen an ACK for + * them. It is important that we ACK these. + * + * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with + * this flag set *MUST* be acknowledged immediately. + */ + +/* + * When we get here, we're called from the recv queue handler. + * Check whether we ought to transmit an ACK. + */ +void rds_iw_attempt_ack(struct rds_iw_connection *ic) +{ + unsigned int adv_credits; + + if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) + return; + + if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) { + rds_iw_stats_inc(s_iw_ack_send_delayed); + return; + } + + /* Can we get a send credit? */ + if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) { + rds_iw_stats_inc(s_iw_tx_throttle); + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + return; + } + + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + rds_iw_send_ack(ic, adv_credits); +} + +/* + * We get here from the send completion handler, when the + * adapter tells us the ACK frame was sent. + */ +void rds_iw_ack_send_complete(struct rds_iw_connection *ic) +{ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + rds_iw_attempt_ack(ic); +} + +/* + * This is called by the regular xmit code when it wants to piggyback + * an ACK on an outgoing frame. + */ +u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic) +{ + if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) + rds_iw_stats_inc(s_iw_ack_send_piggybacked); + return rds_iw_get_ack(ic); +} + +/* + * It's kind of lame that we're copying from the posted receive pages into + * long-lived bitmaps. We could have posted the bitmaps and rdma written into + * them. But receiving new congestion bitmaps should be a *rare* event, so + * hopefully we won't need to invest that complexity in making it more + * efficient. By copying we can share a simpler core with TCP which has to + * copy. + */ +static void rds_iw_cong_recv(struct rds_connection *conn, + struct rds_iw_incoming *iwinc) +{ + struct rds_cong_map *map; + unsigned int map_off; + unsigned int map_page; + struct rds_page_frag *frag; + unsigned long frag_off; + unsigned long to_copy; + unsigned long copied; + uint64_t uncongested = 0; + void *addr; + + /* catch completely corrupt packets */ + if (be32_to_cpu(iwinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) + return; + + map = conn->c_fcong; + map_page = 0; + map_off = 0; + + frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item); + frag_off = 0; + + copied = 0; + + while (copied < RDS_CONG_MAP_BYTES) { + uint64_t *src, *dst; + unsigned int k; + + to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); + BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ + + addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0); + + src = addr + frag_off; + dst = (void *)map->m_page_addrs[map_page] + map_off; + for (k = 0; k < to_copy; k += 8) { + /* Record ports that became uncongested, ie + * bits that changed from 0 to 1. 
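/*
 * [Editor's sketch, not part of the patch] The word-wise copy just
 * below folds the congestion-map update and the change detection into
 * one pass: for each 64-bit word it records the bits that are set in
 * the stored copy but clear in the incoming update (~new & old) before
 * overwriting the stored word.  A tiny userspace model of that step.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t update_word(uint64_t *stored, uint64_t incoming)
{
	uint64_t cleared = ~incoming & *stored;	/* bits that just went 1 -> 0 */

	*stored = incoming;
	return cleared;
}

int main(void)
{
	uint64_t map = 0xf0f0;			/* currently set bits */
	uint64_t cleared = update_word(&map, 0x00f4);

	/* prints "cleared bits: 0xf000, new map: 0xf4" */
	printf("cleared bits: 0x%llx, new map: 0x%llx\n",
	       (unsigned long long)cleared, (unsigned long long)map);
	return 0;
}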
*/ + uncongested |= ~(*src) & *dst; + *dst++ = *src++; + } + kunmap_atomic(addr, KM_SOFTIRQ0); + + copied += to_copy; + + map_off += to_copy; + if (map_off == PAGE_SIZE) { + map_off = 0; + map_page++; + } + + frag_off += to_copy; + if (frag_off == RDS_FRAG_SIZE) { + frag = list_entry(frag->f_item.next, + struct rds_page_frag, f_item); + frag_off = 0; + } + } + + /* the congestion map is in little endian order */ + uncongested = le64_to_cpu(uncongested); + + rds_cong_map_updated(map, uncongested); +} + +/* + * Rings are posted with all the allocations they'll need to queue the + * incoming message to the receiving socket so this can't fail. + * All fragments start with a header, so we can make sure we're not receiving + * garbage, and we can tell a small 8 byte fragment from an ACK frame. + */ +struct rds_iw_ack_state { + u64 ack_next; + u64 ack_recv; + unsigned int ack_required:1; + unsigned int ack_next_valid:1; + unsigned int ack_recv_valid:1; +}; + +static void rds_iw_process_recv(struct rds_connection *conn, + struct rds_iw_recv_work *recv, u32 byte_len, + struct rds_iw_ack_state *state) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct rds_iw_incoming *iwinc = ic->i_iwinc; + struct rds_header *ihdr, *hdr; + + /* XXX shut down the connection if port 0,0 are seen? */ + + rdsdebug("ic %p iwinc %p recv %p byte len %u\n", ic, iwinc, recv, + byte_len); + + if (byte_len < sizeof(struct rds_header)) { + rds_iw_conn_error(conn, "incoming message " + "from %pI4 didn't inclue a " + "header, disconnecting and " + "reconnecting\n", + &conn->c_faddr); + return; + } + byte_len -= sizeof(struct rds_header); + + ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; + + /* Validate the checksum. */ + if (!rds_message_verify_checksum(ihdr)) { + rds_iw_conn_error(conn, "incoming message " + "from %pI4 has corrupted header - " + "forcing a reconnect\n", + &conn->c_faddr); + rds_stats_inc(s_recv_drop_bad_checksum); + return; + } + + /* Process the ACK sequence which comes with every packet */ + state->ack_recv = be64_to_cpu(ihdr->h_ack); + state->ack_recv_valid = 1; + + /* Process the credits update if there was one */ + if (ihdr->h_credit) + rds_iw_send_add_credits(conn, ihdr->h_credit); + + if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) { + /* This is an ACK-only packet. The fact that it gets + * special treatment here is that historically, ACKs + * were rather special beasts. + */ + rds_iw_stats_inc(s_iw_ack_received); + + /* + * Usually the frags make their way on to incs and are then freed as + * the inc is freed. We don't go that route, so we have to drop the + * page ref ourselves. We can't just leave the page on the recv + * because that confuses the dma mapping of pages and each recv's use + * of a partial page. We can leave the frag, though, it will be + * reused. + * + * FIXME: Fold this into the code path below. + */ + rds_iw_frag_drop_page(recv->r_frag); + return; + } + + /* + * If we don't already have an inc on the connection then this + * fragment has a header and starts a message.. copy its header + * into the inc and save the inc so we can hang upcoming fragments + * off its list. 
+ */ + if (!iwinc) { + iwinc = recv->r_iwinc; + recv->r_iwinc = NULL; + ic->i_iwinc = iwinc; + + hdr = &iwinc->ii_inc.i_hdr; + memcpy(hdr, ihdr, sizeof(*hdr)); + ic->i_recv_data_rem = be32_to_cpu(hdr->h_len); + + rdsdebug("ic %p iwinc %p rem %u flag 0x%x\n", ic, iwinc, + ic->i_recv_data_rem, hdr->h_flags); + } else { + hdr = &iwinc->ii_inc.i_hdr; + /* We can't just use memcmp here; fragments of a + * single message may carry different ACKs */ + if (hdr->h_sequence != ihdr->h_sequence || + hdr->h_len != ihdr->h_len || + hdr->h_sport != ihdr->h_sport || + hdr->h_dport != ihdr->h_dport) { + rds_iw_conn_error(conn, + "fragment header mismatch; forcing reconnect\n"); + return; + } + } + + list_add_tail(&recv->r_frag->f_item, &iwinc->ii_frags); + recv->r_frag = NULL; + + if (ic->i_recv_data_rem > RDS_FRAG_SIZE) + ic->i_recv_data_rem -= RDS_FRAG_SIZE; + else { + ic->i_recv_data_rem = 0; + ic->i_iwinc = NULL; + + if (iwinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) + rds_iw_cong_recv(conn, iwinc); + else { + rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, + &iwinc->ii_inc, GFP_ATOMIC, + KM_SOFTIRQ0); + state->ack_next = be64_to_cpu(hdr->h_sequence); + state->ack_next_valid = 1; + } + + /* Evaluate the ACK_REQUIRED flag *after* we received + * the complete frame, and after bumping the next_rx + * sequence. */ + if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) { + rds_stats_inc(s_recv_ack_required); + state->ack_required = 1; + } + + rds_inc_put(&iwinc->ii_inc); + } +} + +/* + * Plucking the oldest entry from the ring can be done concurrently with + * the thread refilling the ring. Each ring operation is protected by + * spinlocks and the transient state of refilling doesn't change the + * recording of which entry is oldest. + * + * This relies on IB only calling one cq comp_handler for each cq so that + * there will only be one caller of rds_recv_incoming() per RDS connection. + */ +void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_iw_connection *ic = conn->c_transport_data; + + rdsdebug("conn %p cq %p\n", conn, cq); + + rds_iw_stats_inc(s_iw_rx_cq_call); + + tasklet_schedule(&ic->i_recv_tasklet); +} + +static inline void rds_poll_cq(struct rds_iw_connection *ic, + struct rds_iw_ack_state *state) +{ + struct rds_connection *conn = ic->conn; + struct ib_wc wc; + struct rds_iw_recv_work *recv; + + while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) { + rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc.wr_id, wc.status, wc.byte_len, + be32_to_cpu(wc.ex.imm_data)); + rds_iw_stats_inc(s_iw_rx_cq_event); + + recv = &ic->i_recvs[rds_iw_ring_oldest(&ic->i_recv_ring)]; + + rds_iw_recv_unmap_page(ic, recv); + + /* + * Also process recvs in connecting state because it is possible + * to get a recv completion _before_ the rdmacm ESTABLISHED + * event is processed. 
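/*
 * [Editor's sketch, not part of the patch] The receive path above
 * reassembles a message by remembering how many payload bytes are
 * still outstanding and by checking that every later fragment carries
 * the same header fields as the first one.  A compact userspace model
 * of that bookkeeping; the fragment size is arbitrary and the header
 * is reduced to two fields.
 */
#include <stdint.h>
#include <stdio.h>

#define FRAG_SIZE 1024u

struct hdr { uint64_t seq; uint32_t len; };

struct reasm {
	struct hdr hdr;
	uint32_t data_rem;	/* payload bytes still expected */
	int active;
};

/* Returns 1 when the message is complete, 0 to keep going, -1 on mismatch. */
static int take_fragment(struct reasm *r, const struct hdr *h)
{
	if (!r->active) {			/* first fragment starts a message */
		r->hdr = *h;
		r->data_rem = h->len;
		r->active = 1;
	} else if (r->hdr.seq != h->seq || r->hdr.len != h->len) {
		return -1;			/* header mismatch: drop/reconnect */
	}

	if (r->data_rem > FRAG_SIZE) {
		r->data_rem -= FRAG_SIZE;
		return 0;
	}
	r->active = 0;				/* last fragment: deliver */
	return 1;
}

int main(void)
{
	struct reasm r = { .active = 0 };
	struct hdr h = { .seq = 42, .len = 2500 };
	int i, done = 0;

	for (i = 0; !done && i < 4; i++)
		done = (take_fragment(&r, &h) == 1);
	printf("message of %u bytes completed after %d fragments\n",
	       (unsigned)h.len, i);
	return 0;
}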
+ */ + if (rds_conn_up(conn) || rds_conn_connecting(conn)) { + /* We expect errors as the qp is drained during shutdown */ + if (wc.status == IB_WC_SUCCESS) { + rds_iw_process_recv(conn, recv, wc.byte_len, state); + } else { + rds_iw_conn_error(conn, "recv completion on " + "%pI4 had status %u, disconnecting and " + "reconnecting\n", &conn->c_faddr, + wc.status); + } + } + + rds_iw_ring_free(&ic->i_recv_ring, 1); + } +} + +void rds_iw_recv_tasklet_fn(unsigned long data) +{ + struct rds_iw_connection *ic = (struct rds_iw_connection *) data; + struct rds_connection *conn = ic->conn; + struct rds_iw_ack_state state = { 0, }; + + rds_poll_cq(ic, &state); + ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); + rds_poll_cq(ic, &state); + + if (state.ack_next_valid) + rds_iw_set_ack(ic, state.ack_next, state.ack_required); + if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { + rds_send_drop_acked(conn, state.ack_recv, NULL); + ic->i_ack_recv = state.ack_recv; + } + if (rds_conn_up(conn)) + rds_iw_attempt_ack(ic); + + /* If we ever end up with a really empty receive ring, we're + * in deep trouble, as the sender will definitely see RNR + * timeouts. */ + if (rds_iw_ring_empty(&ic->i_recv_ring)) + rds_iw_stats_inc(s_iw_rx_ring_empty); + + /* + * If the ring is running low, then schedule the thread to refill. + */ + if (rds_iw_ring_low(&ic->i_recv_ring)) + queue_delayed_work(rds_wq, &conn->c_recv_w, 0); +} + +int rds_iw_recv(struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + int ret = 0; + + rdsdebug("conn %p\n", conn); + + /* + * If we get a temporary posting failure in this context then + * we're really low and we want the caller to back off for a bit. + */ + mutex_lock(&ic->i_recv_mutex); + if (rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0)) + ret = -ENOMEM; + else + rds_iw_stats_inc(s_iw_rx_refill_from_thread); + mutex_unlock(&ic->i_recv_mutex); + + if (rds_conn_up(conn)) + rds_iw_attempt_ack(ic); + + return ret; +} + +int rds_iw_recv_init(void) +{ + struct sysinfo si; + int ret = -ENOMEM; + + /* Default to 30% of all available RAM for recv memory */ + si_meminfo(&si); + rds_iw_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE; + + rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming", + sizeof(struct rds_iw_incoming), + 0, 0, NULL); + if (!rds_iw_incoming_slab) + goto out; + + rds_iw_frag_slab = kmem_cache_create("rds_iw_frag", + sizeof(struct rds_page_frag), + 0, 0, NULL); + if (!rds_iw_frag_slab) + kmem_cache_destroy(rds_iw_incoming_slab); + else + ret = 0; +out: + return ret; +} + +void rds_iw_recv_exit(void) +{ + kmem_cache_destroy(rds_iw_incoming_slab); + kmem_cache_destroy(rds_iw_frag_slab); +} diff --git a/net/rds/iw_ring.c b/net/rds/iw_ring.c new file mode 100644 index 00000000..da8e3b63 --- /dev/null +++ b/net/rds/iw_ring.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include + +#include "rds.h" +#include "iw.h" + +/* + * Locking for IB rings. + * We assume that allocation is always protected by a mutex + * in the caller (this is a valid assumption for the current + * implementation). + * + * Freeing always happens in an interrupt, and hence only + * races with allocations, but not with other free()s. + * + * The interaction between allocation and freeing is that + * the alloc code has to determine the number of free entries. + * To this end, we maintain two counters; an allocation counter + * and a free counter. Both are allowed to run freely, and wrap + * around. + * The number of used entries is always (alloc_ctr - free_ctr) % NR. + * + * The current implementation makes free_ctr atomic. When the + * caller finds an allocation fails, it should set an "alloc fail" + * bit and retry the allocation. The "alloc fail" bit essentially tells + * the CQ completion handlers to wake it up after freeing some + * more entries. + */ + +/* + * This only happens on shutdown. + */ +DECLARE_WAIT_QUEUE_HEAD(rds_iw_ring_empty_wait); + +void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr) +{ + memset(ring, 0, sizeof(*ring)); + ring->w_nr = nr; + rdsdebug("ring %p nr %u\n", ring, ring->w_nr); +} + +static inline u32 __rds_iw_ring_used(struct rds_iw_work_ring *ring) +{ + u32 diff; + + /* This assumes that atomic_t has at least as many bits as u32 */ + diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr); + BUG_ON(diff > ring->w_nr); + + return diff; +} + +void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr) +{ + /* We only ever get called from the connection setup code, + * prior to creating the QP. 
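/*
 * [Editor's sketch, not part of the patch] The ring code here relies
 * on two free-running 32-bit counters -- one advanced only by the
 * allocating side, one (atomic) advanced from completion context -- so
 * that "entries in use" is simply the unsigned difference of the two,
 * which stays correct across wraparound.  A userspace model of that
 * accounting with C11 atomics; all names are hypothetical stand-ins.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define RING_NR 8u

struct ring {
	uint32_t alloc_ctr;		/* only the allocating side touches this */
	atomic_uint free_ctr;		/* advanced from "completion" context */
	uint32_t alloc_ptr;		/* next slot to hand out */
};

static uint32_t ring_used(struct ring *r)
{
	/* unsigned subtraction gives the right answer even after wrap */
	return r->alloc_ctr - (uint32_t)atomic_load(&r->free_ctr);
}

static int ring_alloc(struct ring *r, uint32_t *pos)
{
	if (ring_used(r) == RING_NR)
		return -1;		/* full */
	*pos = r->alloc_ptr;
	r->alloc_ptr = (r->alloc_ptr + 1) % RING_NR;
	r->alloc_ctr++;
	return 0;
}

static void ring_free(struct ring *r, uint32_t n)
{
	atomic_fetch_add(&r->free_ctr, n);
}

int main(void)
{
	struct ring r = { 0 };
	uint32_t pos, i;

	/* cycle many more entries than RING_NR to exercise the counters */
	for (i = 0; i < 1000; i++) {
		if (ring_alloc(&r, &pos))
			break;
		ring_free(&r, 1);
	}
	printf("cycled %u entries, %u in use\n", i, ring_used(&r));
	return 0;
}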
*/ + BUG_ON(__rds_iw_ring_used(ring)); + ring->w_nr = nr; +} + +static int __rds_iw_ring_empty(struct rds_iw_work_ring *ring) +{ + return __rds_iw_ring_used(ring) == 0; +} + +u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos) +{ + u32 ret = 0, avail; + + avail = ring->w_nr - __rds_iw_ring_used(ring); + + rdsdebug("ring %p val %u next %u free %u\n", ring, val, + ring->w_alloc_ptr, avail); + + if (val && avail) { + ret = min(val, avail); + *pos = ring->w_alloc_ptr; + + ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr; + ring->w_alloc_ctr += ret; + } + + return ret; +} + +void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val) +{ + ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr; + atomic_add(val, &ring->w_free_ctr); + + if (__rds_iw_ring_empty(ring) && + waitqueue_active(&rds_iw_ring_empty_wait)) + wake_up(&rds_iw_ring_empty_wait); +} + +void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val) +{ + ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr; + ring->w_alloc_ctr -= val; +} + +int rds_iw_ring_empty(struct rds_iw_work_ring *ring) +{ + return __rds_iw_ring_empty(ring); +} + +int rds_iw_ring_low(struct rds_iw_work_ring *ring) +{ + return __rds_iw_ring_used(ring) <= (ring->w_nr >> 1); +} + + +/* + * returns the oldest alloced ring entry. This will be the next one + * freed. This can't be called if there are none allocated. + */ +u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring) +{ + return ring->w_free_ptr; +} + +/* + * returns the number of completed work requests. + */ + +u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest) +{ + u32 ret; + + if (oldest <= (unsigned long long)wr_id) + ret = (unsigned long long)wr_id - oldest + 1; + else + ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1; + + rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret, + wr_id, oldest); + return ret; +} diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c new file mode 100644 index 00000000..545d8ee3 --- /dev/null +++ b/net/rds/iw_send.c @@ -0,0 +1,974 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include +#include +#include +#include + +#include "rds.h" +#include "iw.h" + +static void rds_iw_send_rdma_complete(struct rds_message *rm, + int wc_status) +{ + int notify_status; + + switch (wc_status) { + case IB_WC_WR_FLUSH_ERR: + return; + + case IB_WC_SUCCESS: + notify_status = RDS_RDMA_SUCCESS; + break; + + case IB_WC_REM_ACCESS_ERR: + notify_status = RDS_RDMA_REMOTE_ERROR; + break; + + default: + notify_status = RDS_RDMA_OTHER_ERROR; + break; + } + rds_rdma_send_complete(rm, notify_status); +} + +static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic, + struct rm_rdma_op *op) +{ + if (op->op_mapped) { + ib_dma_unmap_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, + op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + op->op_mapped = 0; + } +} + +static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, + struct rds_iw_send_work *send, + int wc_status) +{ + struct rds_message *rm = send->s_rm; + + rdsdebug("ic %p send %p rm %p\n", ic, send, rm); + + ib_dma_unmap_sg(ic->i_cm_id->device, + rm->data.op_sg, rm->data.op_nents, + DMA_TO_DEVICE); + + if (rm->rdma.op_active) { + rds_iw_send_unmap_rdma(ic, &rm->rdma); + + /* If the user asked for a completion notification on this + * message, we can implement three different semantics: + * 1. Notify when we received the ACK on the RDS message + * that was queued with the RDMA. This provides reliable + * notification of RDMA status at the expense of a one-way + * packet delay. + * 2. Notify when the IB stack gives us the completion event for + * the RDMA operation. + * 3. Notify when the IB stack gives us the completion event for + * the accompanying RDS messages. + * Here, we implement approach #3. To implement approach #2, + * call rds_rdma_send_complete from the cq_handler. To implement #1, + * don't call rds_rdma_send_complete at all, and fall back to the notify + * handling in the ACK processing code. + * + * Note: There's no need to explicitly sync any RDMA buffers using + * ib_dma_sync_sg_for_cpu - the completion for the RDMA + * operation itself unmapped the RDMA buffers, which takes care + * of synching. 
+ */ + rds_iw_send_rdma_complete(rm, wc_status); + + if (rm->rdma.op_write) + rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes); + else + rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes); + } + + /* If anyone waited for this message to get flushed out, wake + * them up now */ + rds_message_unmapped(rm); + + rds_message_put(rm); + send->s_rm = NULL; +} + +void rds_iw_send_init_ring(struct rds_iw_connection *ic) +{ + struct rds_iw_send_work *send; + u32 i; + + for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { + struct ib_sge *sge; + + send->s_rm = NULL; + send->s_op = NULL; + send->s_mapping = NULL; + + send->s_wr.next = NULL; + send->s_wr.wr_id = i; + send->s_wr.sg_list = send->s_sge; + send->s_wr.num_sge = 1; + send->s_wr.opcode = IB_WR_SEND; + send->s_wr.send_flags = 0; + send->s_wr.ex.imm_data = 0; + + sge = rds_iw_data_sge(ic, send->s_sge); + sge->lkey = 0; + + sge = rds_iw_header_sge(ic, send->s_sge); + sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = 0; + + send->s_mr = ib_alloc_fast_reg_mr(ic->i_pd, fastreg_message_size); + if (IS_ERR(send->s_mr)) { + printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed\n"); + break; + } + + send->s_page_list = ib_alloc_fast_reg_page_list( + ic->i_cm_id->device, fastreg_message_size); + if (IS_ERR(send->s_page_list)) { + printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed\n"); + break; + } + } +} + +void rds_iw_send_clear_ring(struct rds_iw_connection *ic) +{ + struct rds_iw_send_work *send; + u32 i; + + for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { + BUG_ON(!send->s_mr); + ib_dereg_mr(send->s_mr); + BUG_ON(!send->s_page_list); + ib_free_fast_reg_page_list(send->s_page_list); + if (send->s_wr.opcode == 0xdead) + continue; + if (send->s_rm) + rds_iw_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR); + if (send->s_op) + rds_iw_send_unmap_rdma(ic, send->s_op); + } +} + +/* + * The _oldest/_free ring operations here race cleanly with the alloc/unalloc + * operations performed in the send path. As the sender allocs and potentially + * unallocs the next free entry in the ring it doesn't alter which is + * the next to be freed, which is what this is concerned with. 
+ */ +void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_iw_connection *ic = conn->c_transport_data; + struct ib_wc wc; + struct rds_iw_send_work *send; + u32 completed; + u32 oldest; + u32 i; + int ret; + + rdsdebug("cq %p conn %p\n", cq, conn); + rds_iw_stats_inc(s_iw_tx_cq_call); + ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + if (ret) + rdsdebug("ib_req_notify_cq send failed: %d\n", ret); + + while (ib_poll_cq(cq, 1, &wc) > 0) { + rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc.wr_id, wc.status, wc.byte_len, + be32_to_cpu(wc.ex.imm_data)); + rds_iw_stats_inc(s_iw_tx_cq_event); + + if (wc.status != IB_WC_SUCCESS) { + printk(KERN_ERR "WC Error: status = %d opcode = %d\n", wc.status, wc.opcode); + break; + } + + if (wc.opcode == IB_WC_LOCAL_INV && wc.wr_id == RDS_IW_LOCAL_INV_WR_ID) { + ic->i_fastreg_posted = 0; + continue; + } + + if (wc.opcode == IB_WC_FAST_REG_MR && wc.wr_id == RDS_IW_FAST_REG_WR_ID) { + ic->i_fastreg_posted = 1; + continue; + } + + if (wc.wr_id == RDS_IW_ACK_WR_ID) { + if (ic->i_ack_queued + HZ/2 < jiffies) + rds_iw_stats_inc(s_iw_tx_stalled); + rds_iw_ack_send_complete(ic); + continue; + } + + oldest = rds_iw_ring_oldest(&ic->i_send_ring); + + completed = rds_iw_ring_completed(&ic->i_send_ring, wc.wr_id, oldest); + + for (i = 0; i < completed; i++) { + send = &ic->i_sends[oldest]; + + /* In the error case, wc.opcode sometimes contains garbage */ + switch (send->s_wr.opcode) { + case IB_WR_SEND: + if (send->s_rm) + rds_iw_send_unmap_rm(ic, send, wc.status); + break; + case IB_WR_FAST_REG_MR: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + /* Nothing to be done - the SG list will be unmapped + * when the SEND completes. */ + break; + default: + if (printk_ratelimit()) + printk(KERN_NOTICE + "RDS/IW: %s: unexpected opcode 0x%x in WR!\n", + __func__, send->s_wr.opcode); + break; + } + + send->s_wr.opcode = 0xdead; + send->s_wr.num_sge = 1; + if (send->s_queued + HZ/2 < jiffies) + rds_iw_stats_inc(s_iw_tx_stalled); + + /* If a RDMA operation produced an error, signal this right + * away. If we don't, the subsequent SEND that goes with this + * RDMA will be canceled with ERR_WFLUSH, and the application + * never learn that the RDMA failed. */ + if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) { + struct rds_message *rm; + + rm = rds_send_get_message(conn, send->s_op); + if (rm) + rds_iw_send_rdma_complete(rm, wc.status); + } + + oldest = (oldest + 1) % ic->i_send_ring.w_nr; + } + + rds_iw_ring_free(&ic->i_send_ring, completed); + + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || + test_bit(0, &conn->c_map_queued)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + /* We expect errors as the qp is drained during shutdown */ + if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { + rds_iw_conn_error(conn, + "send completion on %pI4 " + "had status %u, disconnecting and reconnecting\n", + &conn->c_faddr, wc.status); + } + } +} + +/* + * This is the main function for allocating credits when sending + * messages. + * + * Conceptually, we have two counters: + * - send credits: this tells us how many WRs we're allowed + * to submit without overruning the receiver's queue. For + * each SEND WR we post, we decrement this by one. + * + * - posted credits: this tells us how many WRs we recently + * posted to the receive queue. 
This value is transferred + * to the peer as a "credit update" in a RDS header field. + * Every time we transmit credits to the peer, we subtract + * the amount of transferred credits from this counter. + * + * It is essential that we avoid situations where both sides have + * exhausted their send credits, and are unable to send new credits + * to the peer. We achieve this by requiring that we send at least + * one credit update to the peer before exhausting our credits. + * When new credits arrive, we subtract one credit that is withheld + * until we've posted new buffers and are ready to transmit these + * credits (see rds_iw_send_add_credits below). + * + * The RDS send code is essentially single-threaded; rds_send_xmit + * grabs c_send_lock to ensure exclusive access to the send ring. + * However, the ACK sending code is independent and can race with + * message SENDs. + * + * In the send path, we need to update the counters for send credits + * and the counter of posted buffers atomically - when we use the + * last available credit, we cannot allow another thread to race us + * and grab the posted credits counter. Hence, we have to use a + * spinlock to protect the credit counter, or use atomics. + * + * Spinlocks shared between the send and the receive path are bad, + * because they create unnecessary delays. An early implementation + * using a spinlock showed a 5% degradation in throughput at some + * loads. + * + * This implementation avoids spinlocks completely, putting both + * counters into a single atomic, and updating that atomic using + * atomic_add (in the receive path, when receiving fresh credits), + * and using atomic_cmpxchg when updating the two counters. + */ +int rds_iw_send_grab_credits(struct rds_iw_connection *ic, + u32 wanted, u32 *adv_credits, int need_posted, int max_posted) +{ + unsigned int avail, posted, got = 0, advertise; + long oldval, newval; + + *adv_credits = 0; + if (!ic->i_flowctl) + return wanted; + +try_again: + advertise = 0; + oldval = newval = atomic_read(&ic->i_credits); + posted = IB_GET_POST_CREDITS(oldval); + avail = IB_GET_SEND_CREDITS(oldval); + + rdsdebug("rds_iw_send_grab_credits(%u): credits=%u posted=%u\n", + wanted, avail, posted); + + /* The last credit must be used to send a credit update. */ + if (avail && !posted) + avail--; + + if (avail < wanted) { + struct rds_connection *conn = ic->i_cm_id->context; + + /* Oops, there aren't that many credits left! */ + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + got = avail; + } else { + /* Sometimes you get what you want, lalala. */ + got = wanted; + } + newval -= IB_SET_SEND_CREDITS(got); + + /* + * If need_posted is non-zero, then the caller wants + * the posted regardless of whether any send credits are + * available. + */ + if (posted && (got || need_posted)) { + advertise = min_t(unsigned int, posted, max_posted); + newval -= IB_SET_POST_CREDITS(advertise); + } + + /* Finally bill everything */ + if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval) + goto try_again; + + *adv_credits = advertise; + return got; +} + +void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + if (credits == 0) + return; + + rdsdebug("rds_iw_send_add_credits(%u): current=%u%s\n", + credits, + IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)), + test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? 
", ll_send_full" : ""); + + atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits); + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384); + + rds_iw_stats_inc(s_iw_rx_credit_updates); +} + +void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + if (posted == 0) + return; + + atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits); + + /* Decide whether to send an update to the peer now. + * If we would send a credit update for every single buffer we + * post, we would end up with an ACK storm (ACK arrives, + * consumes buffer, we refill the ring, send ACK to remote + * advertising the newly posted buffer... ad inf) + * + * Performance pretty much depends on how often we send + * credit updates - too frequent updates mean lots of ACKs. + * Too infrequent updates, and the peer will run out of + * credits and has to throttle. + * For the time being, 16 seems to be a good compromise. + */ + if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16) + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); +} + +static inline void +rds_iw_xmit_populate_wr(struct rds_iw_connection *ic, + struct rds_iw_send_work *send, unsigned int pos, + unsigned long buffer, unsigned int length, + int send_flags) +{ + struct ib_sge *sge; + + WARN_ON(pos != send - ic->i_sends); + + send->s_wr.send_flags = send_flags; + send->s_wr.opcode = IB_WR_SEND; + send->s_wr.num_sge = 2; + send->s_wr.next = NULL; + send->s_queued = jiffies; + send->s_op = NULL; + + if (length != 0) { + sge = rds_iw_data_sge(ic, send->s_sge); + sge->addr = buffer; + sge->length = length; + sge->lkey = rds_iw_local_dma_lkey(ic); + + sge = rds_iw_header_sge(ic, send->s_sge); + } else { + /* We're sending a packet with no payload. There is only + * one SGE */ + send->s_wr.num_sge = 1; + sge = &send->s_sge[0]; + } + + sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = rds_iw_local_dma_lkey(ic); +} + +/* + * This can be called multiple times for a given message. The first time + * we see a message we map its scatterlist into the IB device so that + * we can provide that mapped address to the IB scatter gather entries + * in the IB work requests. We translate the scatterlist into a series + * of work requests that fragment the message. These work requests complete + * in order so we pass ownership of the message to the completion handler + * once we send the final fragment. + * + * The RDS core uses the c_send_lock to only enter this function once + * per connection. This makes sure that the tx ring alloc/unalloc pairs + * don't get out of sync and confuse the ring. 
+ */ +int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct ib_device *dev = ic->i_cm_id->device; + struct rds_iw_send_work *send = NULL; + struct rds_iw_send_work *first; + struct rds_iw_send_work *prev; + struct ib_send_wr *failed_wr; + struct scatterlist *scat; + u32 pos; + u32 i; + u32 work_alloc; + u32 credit_alloc; + u32 posted; + u32 adv_credits = 0; + int send_flags = 0; + int sent; + int ret; + int flow_controlled = 0; + + BUG_ON(off % RDS_FRAG_SIZE); + BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); + + /* Fastreg support */ + if (rds_rdma_cookie_key(rm->m_rdma_cookie) && !ic->i_fastreg_posted) { + ret = -EAGAIN; + goto out; + } + + /* FIXME we may overallocate here */ + if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) + i = 1; + else + i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE); + + work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); + if (work_alloc == 0) { + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + rds_iw_stats_inc(s_iw_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + credit_alloc = work_alloc; + if (ic->i_flowctl) { + credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); + adv_credits += posted; + if (credit_alloc < work_alloc) { + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); + work_alloc = credit_alloc; + flow_controlled++; + } + if (work_alloc == 0) { + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + rds_iw_stats_inc(s_iw_tx_throttle); + ret = -ENOMEM; + goto out; + } + } + + /* map the message the first time we see it */ + if (!ic->i_rm) { + /* + printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n", + be16_to_cpu(rm->m_inc.i_hdr.h_dport), + rm->m_inc.i_hdr.h_flags, + be32_to_cpu(rm->m_inc.i_hdr.h_len)); + */ + if (rm->data.op_nents) { + rm->data.op_count = ib_dma_map_sg(dev, + rm->data.op_sg, + rm->data.op_nents, + DMA_TO_DEVICE); + rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count); + if (rm->data.op_count == 0) { + rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + } else { + rm->data.op_count = 0; + } + + ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; + ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes; + rds_message_addref(rm); + ic->i_rm = rm; + + /* Finalize the header */ + if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED; + if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED; + + /* If it has a RDMA op, tell the peer we did it. This is + * used by the peer to release use-once RDMA MRs. */ + if (rm->rdma.op_active) { + struct rds_ext_header_rdma ext_hdr; + + ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); + rds_message_add_extension(&rm->m_inc.i_hdr, + RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); + } + if (rm->m_rdma_cookie) { + rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr, + rds_rdma_cookie_key(rm->m_rdma_cookie), + rds_rdma_cookie_offset(rm->m_rdma_cookie)); + } + + /* Note - rds_iw_piggyb_ack clears the ACK_REQUIRED bit, so + * we should not do this unless we have a chance of at least + * sticking the header into the send ring. Which is why we + * should call rds_iw_ring_alloc first. 
*/ + rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic)); + rds_message_make_checksum(&rm->m_inc.i_hdr); + + /* + * Update adv_credits since we reset the ACK_REQUIRED bit. + */ + rds_iw_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); + adv_credits += posted; + BUG_ON(adv_credits > 255); + } + + send = &ic->i_sends[pos]; + first = send; + prev = NULL; + scat = &rm->data.op_sg[sg]; + sent = 0; + i = 0; + + /* Sometimes you want to put a fence between an RDMA + * READ and the following SEND. + * We could either do this all the time + * or when requested by the user. Right now, we let + * the application choose. + */ + if (rm->rdma.op_active && rm->rdma.op_fence) + send_flags = IB_SEND_FENCE; + + /* + * We could be copying the header into the unused tail of the page. + * That would need to be changed in the future when those pages might + * be mapped userspace pages or page cache pages. So instead we always + * use a second sge and our long-lived ring of mapped headers. We send + * the header after the data so that the data payload can be aligned on + * the receiver. + */ + + /* handle a 0-len message */ + if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) { + rds_iw_xmit_populate_wr(ic, send, pos, 0, 0, send_flags); + goto add_header; + } + + /* if there's data reference it with a chain of work reqs */ + for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) { + unsigned int len; + + send = &ic->i_sends[pos]; + + len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); + rds_iw_xmit_populate_wr(ic, send, pos, + ib_sg_dma_address(dev, scat) + off, len, + send_flags); + + /* + * We want to delay signaling completions just enough to get + * the batching benefits but not so much that we create dead time + * on the wire. + */ + if (ic->i_unsignaled_wrs-- == 0) { + ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; + send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + } + + ic->i_unsignaled_bytes -= len; + if (ic->i_unsignaled_bytes <= 0) { + ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes; + send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + } + + /* + * Always signal the last one if we're stopping due to flow control. + */ + if (flow_controlled && i == (work_alloc-1)) + send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + + rdsdebug("send %p wr %p num_sge %u next %p\n", send, + &send->s_wr, send->s_wr.num_sge, send->s_wr.next); + + sent += len; + off += len; + if (off == ib_sg_dma_len(dev, scat)) { + scat++; + off = 0; + } + +add_header: + /* Tack on the header after the data. The header SGE should already + * have been set up to point to the right header buffer. */ + memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); + + if (0) { + struct rds_header *hdr = &ic->i_send_hdrs[pos]; + + printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n", + be16_to_cpu(hdr->h_dport), + hdr->h_flags, + be32_to_cpu(hdr->h_len)); + } + if (adv_credits) { + struct rds_header *hdr = &ic->i_send_hdrs[pos]; + + /* add credit and redo the header checksum */ + hdr->h_credit = adv_credits; + rds_message_make_checksum(hdr); + adv_credits = 0; + rds_iw_stats_inc(s_iw_tx_credit_updates); + } + + if (prev) + prev->s_wr.next = &send->s_wr; + prev = send; + + pos = (pos + 1) % ic->i_send_ring.w_nr; + } + + /* Account the RDS header in the number of bytes we sent, but just once. + * The caller has no concept of fragmentation. 
*/ + if (hdr_off == 0) + sent += sizeof(struct rds_header); + + /* if we finished the message then send completion owns it */ + if (scat == &rm->data.op_sg[rm->data.op_count]) { + prev->s_rm = ic->i_rm; + prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + ic->i_rm = NULL; + } + + if (i < work_alloc) { + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i); + work_alloc = i; + } + if (ic->i_flowctl && i < credit_alloc) + rds_iw_send_add_credits(conn, credit_alloc - i); + + /* XXX need to worry about failed_wr and partial sends. */ + failed_wr = &first->s_wr; + ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); + rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, + first, &first->s_wr, ret, failed_wr); + BUG_ON(failed_wr != &first->s_wr); + if (ret) { + printk(KERN_WARNING "RDS/IW: ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); + if (prev->s_rm) { + ic->i_rm = prev->s_rm; + prev->s_rm = NULL; + } + goto out; + } + + ret = sent; +out: + BUG_ON(adv_credits); + return ret; +} + +static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rds_iw_connection *ic, struct rds_iw_send_work *send, int nent, int len, u64 sg_addr) +{ + BUG_ON(nent > send->s_page_list->max_page_list_len); + /* + * Perform a WR for the fast_reg_mr. Each individual page + * in the sg list is added to the fast reg page list and placed + * inside the fast_reg_mr WR. + */ + send->s_wr.opcode = IB_WR_FAST_REG_MR; + send->s_wr.wr.fast_reg.length = len; + send->s_wr.wr.fast_reg.rkey = send->s_mr->rkey; + send->s_wr.wr.fast_reg.page_list = send->s_page_list; + send->s_wr.wr.fast_reg.page_list_len = nent; + send->s_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + send->s_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE; + send->s_wr.wr.fast_reg.iova_start = sg_addr; + + ib_update_fast_reg_key(send->s_mr, send->s_remap_count++); +} + +int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + struct rds_iw_send_work *send = NULL; + struct rds_iw_send_work *first; + struct rds_iw_send_work *prev; + struct ib_send_wr *failed_wr; + struct rds_iw_device *rds_iwdev; + struct scatterlist *scat; + unsigned long len; + u64 remote_addr = op->op_remote_addr; + u32 pos, fr_pos; + u32 work_alloc; + u32 i; + u32 j; + int sent; + int ret; + int num_sge; + + rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); + + /* map the message the first time we see it */ + if (!op->op_mapped) { + op->op_count = ib_dma_map_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, (op->op_write) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); + if (op->op_count == 0) { + rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + + op->op_mapped = 1; + } + + if (!op->op_write) { + /* Alloc space on the send queue for the fastreg */ + work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos); + if (work_alloc != 1) { + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_iw_stats_inc(s_iw_tx_ring_full); + ret = -ENOMEM; + goto out; + } + } + + /* + * Instead of knowing how to return a partial rdma read/write we insist that there + * be enough work requests to send the entire message. 
+ */ + i = ceil(op->op_count, rds_iwdev->max_sge); + + work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); + if (work_alloc != i) { + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_iw_stats_inc(s_iw_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + send = &ic->i_sends[pos]; + if (!op->op_write) { + first = prev = &ic->i_sends[fr_pos]; + } else { + first = send; + prev = NULL; + } + scat = &op->op_sg[0]; + sent = 0; + num_sge = op->op_count; + + for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { + send->s_wr.send_flags = 0; + send->s_queued = jiffies; + + /* + * We want to delay signaling completions just enough to get + * the batching benefits but not so much that we create dead time on the wire. + */ + if (ic->i_unsignaled_wrs-- == 0) { + ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; + send->s_wr.send_flags = IB_SEND_SIGNALED; + } + + /* To avoid the need to have the plumbing to invalidate the fastreg_mr used + * for local access after RDS is finished with it, using + * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed. + */ + if (op->op_write) + send->s_wr.opcode = IB_WR_RDMA_WRITE; + else + send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV; + + send->s_wr.wr.rdma.remote_addr = remote_addr; + send->s_wr.wr.rdma.rkey = op->op_rkey; + send->s_op = op; + + if (num_sge > rds_iwdev->max_sge) { + send->s_wr.num_sge = rds_iwdev->max_sge; + num_sge -= rds_iwdev->max_sge; + } else + send->s_wr.num_sge = num_sge; + + send->s_wr.next = NULL; + + if (prev) + prev->s_wr.next = &send->s_wr; + + for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { + len = ib_sg_dma_len(ic->i_cm_id->device, scat); + + if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) + send->s_page_list->page_list[j] = ib_sg_dma_address(ic->i_cm_id->device, scat); + else { + send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat); + send->s_sge[j].length = len; + send->s_sge[j].lkey = rds_iw_local_dma_lkey(ic); + } + + sent += len; + rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); + remote_addr += len; + + scat++; + } + + if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) { + send->s_wr.num_sge = 1; + send->s_sge[0].addr = conn->c_xmit_rm->m_rs->rs_user_addr; + send->s_sge[0].length = conn->c_xmit_rm->m_rs->rs_user_bytes; + send->s_sge[0].lkey = ic->i_sends[fr_pos].s_mr->lkey; + } + + rdsdebug("send %p wr %p num_sge %u next %p\n", send, + &send->s_wr, send->s_wr.num_sge, send->s_wr.next); + + prev = send; + if (++send == &ic->i_sends[ic->i_send_ring.w_nr]) + send = ic->i_sends; + } + + /* if we finished the message then send completion owns it */ + if (scat == &op->op_sg[op->op_count]) + first->s_wr.send_flags = IB_SEND_SIGNALED; + + if (i < work_alloc) { + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i); + work_alloc = i; + } + + /* On iWARP, local memory access by a remote system (ie, RDMA Read) is not + * recommended. Putting the lkey on the wire is a security hole, as it can + * allow for memory access to all of memory on the remote system. Some + * adapters do not allow using the lkey for this at all. 
To bypass this use a + * fastreg_mr (or possibly a dma_mr) + */ + if (!op->op_write) { + rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos], + op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr); + work_alloc++; + } + + failed_wr = &first->s_wr; + ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); + rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, + first, &first->s_wr, ret, failed_wr); + BUG_ON(failed_wr != &first->s_wr); + if (ret) { + printk(KERN_WARNING "RDS/IW: rdma ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); + goto out; + } + +out: + return ret; +} + +void rds_iw_xmit_complete(struct rds_connection *conn) +{ + struct rds_iw_connection *ic = conn->c_transport_data; + + /* We may have a pending ACK or window update we were unable + * to send previously (due to flow control). Try again. */ + rds_iw_attempt_ack(ic); +} diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c new file mode 100644 index 00000000..5fe67f6a --- /dev/null +++ b/net/rds/iw_stats.c @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include +#include +#include + +#include "rds.h" +#include "iw.h" + +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats); + +static const char *const rds_iw_stat_names[] = { + "iw_connect_raced", + "iw_listen_closed_stale", + "iw_tx_cq_call", + "iw_tx_cq_event", + "iw_tx_ring_full", + "iw_tx_throttle", + "iw_tx_sg_mapping_failure", + "iw_tx_stalled", + "iw_tx_credit_updates", + "iw_rx_cq_call", + "iw_rx_cq_event", + "iw_rx_ring_empty", + "iw_rx_refill_from_cq", + "iw_rx_refill_from_thread", + "iw_rx_alloc_limit", + "iw_rx_credit_updates", + "iw_ack_sent", + "iw_ack_send_failure", + "iw_ack_send_delayed", + "iw_ack_send_piggybacked", + "iw_ack_received", + "iw_rdma_mr_alloc", + "iw_rdma_mr_free", + "iw_rdma_mr_used", + "iw_rdma_mr_pool_flush", + "iw_rdma_mr_pool_wait", + "iw_rdma_mr_pool_depleted", +}; + +unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail) +{ + struct rds_iw_statistics stats = {0, }; + uint64_t *src; + uint64_t *sum; + size_t i; + int cpu; + + if (avail < ARRAY_SIZE(rds_iw_stat_names)) + goto out; + + for_each_online_cpu(cpu) { + src = (uint64_t *)&(per_cpu(rds_iw_stats, cpu)); + sum = (uint64_t *)&stats; + for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) + *(sum++) += *(src++); + } + + rds_stats_info_copy(iter, (uint64_t *)&stats, rds_iw_stat_names, + ARRAY_SIZE(rds_iw_stat_names)); +out: + return ARRAY_SIZE(rds_iw_stat_names); +} diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c new file mode 100644 index 00000000..e2e47176 --- /dev/null +++ b/net/rds/iw_sysctl.c @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
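
rds_iw_stats_info_copy() above folds the per-CPU copies by treating the statistics structure as a flat array of u64 counters, which only works while every member is a u64 and the name table stays in the same order as the struct fields. A minimal user-space rendition of that fold (toy_stats and the counter names are illustrative, not the real layout):

#include <stdint.h>
#include <stddef.h>

struct toy_stats {
	uint64_t tx_cq_event;
	uint64_t tx_ring_full;
	uint64_t rx_credit_updates;
	/* ... one u64 per counter, nothing else ... */
};

#define NCPUS 4
static struct toy_stats percpu_stats[NCPUS];

static void fold_stats(struct toy_stats *sum)
{
	size_t nwords = sizeof(*sum) / sizeof(uint64_t);
	size_t cpu, i;

	*sum = (struct toy_stats){0};
	for (cpu = 0; cpu < NCPUS; cpu++) {
		const uint64_t *src = (const uint64_t *)&percpu_stats[cpu];
		uint64_t *dst = (uint64_t *)sum;

		for (i = 0; i < nwords; i++)
			dst[i] += src[i];
	}
}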
+ * + */ +#include +#include +#include + +#include "iw.h" + +static struct ctl_table_header *rds_iw_sysctl_hdr; + +unsigned long rds_iw_sysctl_max_send_wr = RDS_IW_DEFAULT_SEND_WR; +unsigned long rds_iw_sysctl_max_recv_wr = RDS_IW_DEFAULT_RECV_WR; +unsigned long rds_iw_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE; +static unsigned long rds_iw_sysctl_max_wr_min = 1; +/* hardware will fail CQ creation long before this */ +static unsigned long rds_iw_sysctl_max_wr_max = (u32)~0; + +unsigned long rds_iw_sysctl_max_unsig_wrs = 16; +static unsigned long rds_iw_sysctl_max_unsig_wr_min = 1; +static unsigned long rds_iw_sysctl_max_unsig_wr_max = 64; + +unsigned long rds_iw_sysctl_max_unsig_bytes = (16 << 20); +static unsigned long rds_iw_sysctl_max_unsig_bytes_min = 1; +static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL; + +unsigned int rds_iw_sysctl_flow_control = 1; + +static ctl_table rds_iw_sysctl_table[] = { + { + .procname = "max_send_wr", + .data = &rds_iw_sysctl_max_send_wr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &rds_iw_sysctl_max_wr_min, + .extra2 = &rds_iw_sysctl_max_wr_max, + }, + { + .procname = "max_recv_wr", + .data = &rds_iw_sysctl_max_recv_wr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &rds_iw_sysctl_max_wr_min, + .extra2 = &rds_iw_sysctl_max_wr_max, + }, + { + .procname = "max_unsignaled_wr", + .data = &rds_iw_sysctl_max_unsig_wrs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &rds_iw_sysctl_max_unsig_wr_min, + .extra2 = &rds_iw_sysctl_max_unsig_wr_max, + }, + { + .procname = "max_unsignaled_bytes", + .data = &rds_iw_sysctl_max_unsig_bytes, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &rds_iw_sysctl_max_unsig_bytes_min, + .extra2 = &rds_iw_sysctl_max_unsig_bytes_max, + }, + { + .procname = "max_recv_allocation", + .data = &rds_iw_sysctl_max_recv_allocation, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "flow_control", + .data = &rds_iw_sysctl_flow_control, + .maxlen = sizeof(rds_iw_sysctl_flow_control), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static struct ctl_path rds_iw_sysctl_path[] = { + { .procname = "net", }, + { .procname = "rds", }, + { .procname = "iw", }, + { } +}; + +void rds_iw_sysctl_exit(void) +{ + if (rds_iw_sysctl_hdr) + unregister_sysctl_table(rds_iw_sysctl_hdr); +} + +int rds_iw_sysctl_init(void) +{ + rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table); + if (!rds_iw_sysctl_hdr) + return -ENOMEM; + return 0; +} diff --git a/net/rds/loop.c b/net/rds/loop.c new file mode 100644 index 00000000..bca6761a --- /dev/null +++ b/net/rds/loop.c @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" +#include "loop.h" + +static DEFINE_SPINLOCK(loop_conns_lock); +static LIST_HEAD(loop_conns); + +/* + * This 'loopback' transport is a special case for flows that originate + * and terminate on the same machine. + * + * Connection build-up notices if the destination address is thought of + * as a local address by a transport. At that time it decides to use the + * loopback transport instead of the bound transport of the sending socket. + * + * The loopback transport's sending path just hands the sent rds_message + * straight to the receiving path via an embedded rds_incoming. + */ + +/* + * Usually a message transits both the sender and receiver's conns as it + * flows to the receiver. In the loopback case, though, the receive path + * is handed the sending conn so the sense of the addresses is reversed. + */ +static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, + unsigned int off) +{ + struct scatterlist *sgp = &rm->data.op_sg[sg]; + int ret = sizeof(struct rds_header) + + be32_to_cpu(rm->m_inc.i_hdr.h_len); + + /* Do not send cong updates to loopback */ + if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) { + rds_cong_map_updated(conn->c_fcong, ~(u64) 0); + ret = min_t(int, ret, sgp->length - conn->c_xmit_data_off); + goto out; + } + + BUG_ON(hdr_off || sg || off); + + rds_inc_init(&rm->m_inc, conn, conn->c_laddr); + /* For the embedded inc. Matching put is in loop_inc_free() */ + rds_message_addref(rm); + + rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, + GFP_KERNEL, KM_USER0); + + rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence), + NULL); + + rds_inc_put(&rm->m_inc); +out: + return ret; +} + +/* + * See rds_loop_xmit(). Since our inc is embedded in the rm, we + * make sure the rm lives at least until the inc is done. 
+ */ +static void rds_loop_inc_free(struct rds_incoming *inc) +{ + struct rds_message *rm = container_of(inc, struct rds_message, m_inc); + rds_message_put(rm); +} + +/* we need to at least give the thread something to succeed */ +static int rds_loop_recv(struct rds_connection *conn) +{ + return 0; +} + +struct rds_loop_connection { + struct list_head loop_node; + struct rds_connection *conn; +}; + +/* + * Even the loopback transport needs to keep track of its connections, + * so it can call rds_conn_destroy() on them on exit. N.B. there are + * 1+ loopback addresses (127.*.*.*) so it's not a bug to have + * multiple loopback conns allocated, although rather useless. + */ +static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp) +{ + struct rds_loop_connection *lc; + unsigned long flags; + + lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL); + if (!lc) + return -ENOMEM; + + INIT_LIST_HEAD(&lc->loop_node); + lc->conn = conn; + conn->c_transport_data = lc; + + spin_lock_irqsave(&loop_conns_lock, flags); + list_add_tail(&lc->loop_node, &loop_conns); + spin_unlock_irqrestore(&loop_conns_lock, flags); + + return 0; +} + +static void rds_loop_conn_free(void *arg) +{ + struct rds_loop_connection *lc = arg; + unsigned long flags; + + rdsdebug("lc %p\n", lc); + spin_lock_irqsave(&loop_conns_lock, flags); + list_del(&lc->loop_node); + spin_unlock_irqrestore(&loop_conns_lock, flags); + kfree(lc); +} + +static int rds_loop_conn_connect(struct rds_connection *conn) +{ + rds_connect_complete(conn); + return 0; +} + +static void rds_loop_conn_shutdown(struct rds_connection *conn) +{ +} + +void rds_loop_exit(void) +{ + struct rds_loop_connection *lc, *_lc; + LIST_HEAD(tmp_list); + + /* avoid calling conn_destroy with irqs off */ + spin_lock_irq(&loop_conns_lock); + list_splice(&loop_conns, &tmp_list); + INIT_LIST_HEAD(&loop_conns); + spin_unlock_irq(&loop_conns_lock); + + list_for_each_entry_safe(lc, _lc, &tmp_list, loop_node) { + WARN_ON(lc->conn->c_passive); + rds_conn_destroy(lc->conn); + } +} + +/* + * This is missing .xmit_* because loop doesn't go through generic + * rds_send_xmit() and doesn't call rds_recv_incoming(). .listen_stop and + * .laddr_check are missing because transport.c doesn't iterate over + * rds_loop_transport. + */ +struct rds_transport rds_loop_transport = { + .xmit = rds_loop_xmit, + .recv = rds_loop_recv, + .conn_alloc = rds_loop_conn_alloc, + .conn_free = rds_loop_conn_free, + .conn_connect = rds_loop_conn_connect, + .conn_shutdown = rds_loop_conn_shutdown, + .inc_copy_to_user = rds_message_inc_copy_to_user, + .inc_free = rds_loop_inc_free, + .t_name = "loopback", +}; diff --git a/net/rds/loop.h b/net/rds/loop.h new file mode 100644 index 00000000..f32b0939 --- /dev/null +++ b/net/rds/loop.h @@ -0,0 +1,9 @@ +#ifndef _RDS_LOOP_H +#define _RDS_LOOP_H + +/* loop.c */ +extern struct rds_transport rds_loop_transport; + +void rds_loop_exit(void); + +#endif diff --git a/net/rds/message.c b/net/rds/message.c new file mode 100644 index 00000000..1fd3d290 --- /dev/null +++ b/net/rds/message.c @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include + +#include "rds.h" + +static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { +[RDS_EXTHDR_NONE] = 0, +[RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version), +[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma), +[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest), +}; + + +void rds_message_addref(struct rds_message *rm) +{ + rdsdebug("addref rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); + atomic_inc(&rm->m_refcount); +} +EXPORT_SYMBOL_GPL(rds_message_addref); + +/* + * This relies on dma_map_sg() not touching sg[].page during merging. 
+ */ +static void rds_message_purge(struct rds_message *rm) +{ + unsigned long i; + + if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) + return; + + for (i = 0; i < rm->data.op_nents; i++) { + rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i])); + /* XXX will have to put_page for page refs */ + __free_page(sg_page(&rm->data.op_sg[i])); + } + rm->data.op_nents = 0; + + if (rm->rdma.op_active) + rds_rdma_free_op(&rm->rdma); + if (rm->rdma.op_rdma_mr) + rds_mr_put(rm->rdma.op_rdma_mr); + + if (rm->atomic.op_active) + rds_atomic_free_op(&rm->atomic); + if (rm->atomic.op_rdma_mr) + rds_mr_put(rm->atomic.op_rdma_mr); +} + +void rds_message_put(struct rds_message *rm) +{ + rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); + if (atomic_read(&rm->m_refcount) == 0) { +printk(KERN_CRIT "danger refcount zero on %p\n", rm); +WARN_ON(1); + } + if (atomic_dec_and_test(&rm->m_refcount)) { + BUG_ON(!list_empty(&rm->m_sock_item)); + BUG_ON(!list_empty(&rm->m_conn_item)); + rds_message_purge(rm); + + kfree(rm); + } +} +EXPORT_SYMBOL_GPL(rds_message_put); + +void rds_message_populate_header(struct rds_header *hdr, __be16 sport, + __be16 dport, u64 seq) +{ + hdr->h_flags = 0; + hdr->h_sport = sport; + hdr->h_dport = dport; + hdr->h_sequence = cpu_to_be64(seq); + hdr->h_exthdr[0] = RDS_EXTHDR_NONE; +} +EXPORT_SYMBOL_GPL(rds_message_populate_header); + +int rds_message_add_extension(struct rds_header *hdr, unsigned int type, + const void *data, unsigned int len) +{ + unsigned int ext_len = sizeof(u8) + len; + unsigned char *dst; + + /* For now, refuse to add more than one extension header */ + if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE) + return 0; + + if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type]) + return 0; + + if (ext_len >= RDS_HEADER_EXT_SPACE) + return 0; + dst = hdr->h_exthdr; + + *dst++ = type; + memcpy(dst, data, len); + + dst[len] = RDS_EXTHDR_NONE; + return 1; +} +EXPORT_SYMBOL_GPL(rds_message_add_extension); + +/* + * If a message has extension headers, retrieve them here. + * Call like this: + * + * unsigned int pos = 0; + * + * while (1) { + * buflen = sizeof(buffer); + * type = rds_message_next_extension(hdr, &pos, buffer, &buflen); + * if (type == RDS_EXTHDR_NONE) + * break; + * ... + * } + */ +int rds_message_next_extension(struct rds_header *hdr, + unsigned int *pos, void *buf, unsigned int *buflen) +{ + unsigned int offset, ext_type, ext_len; + u8 *src = hdr->h_exthdr; + + offset = *pos; + if (offset >= RDS_HEADER_EXT_SPACE) + goto none; + + /* Get the extension type and length. For now, the + * length is implied by the extension type. */ + ext_type = src[offset++]; + + if (ext_type == RDS_EXTHDR_NONE || ext_type >= __RDS_EXTHDR_MAX) + goto none; + ext_len = rds_exthdr_size[ext_type]; + if (offset + ext_len > RDS_HEADER_EXT_SPACE) + goto none; + + *pos = offset + ext_len; + if (ext_len < *buflen) + *buflen = ext_len; + memcpy(buf, src + offset, *buflen); + return ext_type; + +none: + *pos = RDS_HEADER_EXT_SPACE; + *buflen = 0; + return RDS_EXTHDR_NONE; +} + +int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset) +{ + struct rds_ext_header_rdma_dest ext_hdr; + + ext_hdr.h_rdma_rkey = cpu_to_be32(r_key); + ext_hdr.h_rdma_offset = cpu_to_be32(offset); + return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr)); +} +EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension); + +/* + * Each rds_message is allocated with extra space for the scatterlist entries + * rds ops will need. 
This is to minimize memory allocation count. Then, each rds op + * can grab SGs when initializing its part of the rds_message. + */ +struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp) +{ + struct rds_message *rm; + + rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp); + if (!rm) + goto out; + + rm->m_used_sgs = 0; + rm->m_total_sgs = extra_len / sizeof(struct scatterlist); + + atomic_set(&rm->m_refcount, 1); + INIT_LIST_HEAD(&rm->m_sock_item); + INIT_LIST_HEAD(&rm->m_conn_item); + spin_lock_init(&rm->m_rs_lock); + init_waitqueue_head(&rm->m_flush_wait); + +out: + return rm; +} + +/* + * RDS ops use this to grab SG entries from the rm's sg pool. + */ +struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents) +{ + struct scatterlist *sg_first = (struct scatterlist *) &rm[1]; + struct scatterlist *sg_ret; + + WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs); + WARN_ON(!nents); + + if (rm->m_used_sgs + nents > rm->m_total_sgs) + return NULL; + + sg_ret = &sg_first[rm->m_used_sgs]; + sg_init_table(sg_ret, nents); + rm->m_used_sgs += nents; + + return sg_ret; +} + +struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len) +{ + struct rds_message *rm; + unsigned int i; + int num_sgs = ceil(total_len, PAGE_SIZE); + int extra_bytes = num_sgs * sizeof(struct scatterlist); + + rm = rds_message_alloc(extra_bytes, GFP_NOWAIT); + if (!rm) + return ERR_PTR(-ENOMEM); + + set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); + rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); + rm->data.op_nents = ceil(total_len, PAGE_SIZE); + rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); + if (!rm->data.op_sg) { + rds_message_put(rm); + return ERR_PTR(-ENOMEM); + } + + for (i = 0; i < rm->data.op_nents; ++i) { + sg_set_page(&rm->data.op_sg[i], + virt_to_page(page_addrs[i]), + PAGE_SIZE, 0); + } + + return rm; +} + +int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov, + size_t total_len) +{ + unsigned long to_copy; + unsigned long iov_off; + unsigned long sg_off; + struct iovec *iov; + struct scatterlist *sg; + int ret = 0; + + rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); + + /* + * now allocate and copy in the data payload. + */ + sg = rm->data.op_sg; + iov = first_iov; + iov_off = 0; + sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. 
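
rds_message_alloc()/rds_message_alloc_sgs() above carve scatterlist entries out of the same allocation as the message itself, so one kzalloc covers both. A small user-space sketch of that trailing-pool pattern (toy_msg/toy_sg are hypothetical stand-ins; plain C would usually spell the tail as a flexible array member):

#include <stdlib.h>

struct toy_sg { void *page; unsigned int len, off; };

struct toy_msg {
	unsigned int total_sgs;
	unsigned int used_sgs;
	/* the toy_sg pool lives immediately after the struct, like
	 * the scatterlist pool at &rm[1] in rds_message_alloc_sgs() */
};

static struct toy_msg *toy_msg_alloc(unsigned int nsgs)
{
	struct toy_msg *m = calloc(1, sizeof(*m) + nsgs * sizeof(struct toy_sg));

	if (m)
		m->total_sgs = nsgs;
	return m;
}

/* hand out 'nents' entries from the trailing pool */
static struct toy_sg *toy_msg_take_sgs(struct toy_msg *m, unsigned int nents)
{
	struct toy_sg *pool = (struct toy_sg *)(m + 1);
	struct toy_sg *ret;

	if (m->used_sgs + nents > m->total_sgs)
		return NULL;

	ret = &pool[m->used_sgs];
	m->used_sgs += nents;
	return ret;
}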
*/ + + while (total_len) { + if (!sg_page(sg)) { + ret = rds_page_remainder_alloc(sg, total_len, + GFP_HIGHUSER); + if (ret) + goto out; + rm->data.op_nents++; + sg_off = 0; + } + + while (iov_off == iov->iov_len) { + iov_off = 0; + iov++; + } + + to_copy = min(iov->iov_len - iov_off, sg->length - sg_off); + to_copy = min_t(size_t, to_copy, total_len); + + rdsdebug("copying %lu bytes from user iov [%p, %zu] + %lu to " + "sg [%p, %u, %u] + %lu\n", + to_copy, iov->iov_base, iov->iov_len, iov_off, + (void *)sg_page(sg), sg->offset, sg->length, sg_off); + + ret = rds_page_copy_from_user(sg_page(sg), sg->offset + sg_off, + iov->iov_base + iov_off, + to_copy); + if (ret) + goto out; + + iov_off += to_copy; + total_len -= to_copy; + sg_off += to_copy; + + if (sg_off == sg->length) + sg++; + } + +out: + return ret; +} + +int rds_message_inc_copy_to_user(struct rds_incoming *inc, + struct iovec *first_iov, size_t size) +{ + struct rds_message *rm; + struct iovec *iov; + struct scatterlist *sg; + unsigned long to_copy; + unsigned long iov_off; + unsigned long vec_off; + int copied; + int ret; + u32 len; + + rm = container_of(inc, struct rds_message, m_inc); + len = be32_to_cpu(rm->m_inc.i_hdr.h_len); + + iov = first_iov; + iov_off = 0; + sg = rm->data.op_sg; + vec_off = 0; + copied = 0; + + while (copied < size && copied < len) { + while (iov_off == iov->iov_len) { + iov_off = 0; + iov++; + } + + to_copy = min(iov->iov_len - iov_off, sg->length - vec_off); + to_copy = min_t(size_t, to_copy, size - copied); + to_copy = min_t(unsigned long, to_copy, len - copied); + + rdsdebug("copying %lu bytes to user iov [%p, %zu] + %lu to " + "sg [%p, %u, %u] + %lu\n", + to_copy, iov->iov_base, iov->iov_len, iov_off, + sg_page(sg), sg->offset, sg->length, vec_off); + + ret = rds_page_copy_to_user(sg_page(sg), sg->offset + vec_off, + iov->iov_base + iov_off, + to_copy); + if (ret) { + copied = ret; + break; + } + + iov_off += to_copy; + vec_off += to_copy; + copied += to_copy; + + if (vec_off == sg->length) { + vec_off = 0; + sg++; + } + } + + return copied; +} + +/* + * If the message is still on the send queue, wait until the transport + * is done with it. This is particularly important for RDMA operations. + */ +void rds_message_wait(struct rds_message *rm) +{ + wait_event_interruptible(rm->m_flush_wait, + !test_bit(RDS_MSG_MAPPED, &rm->m_flags)); +} + +void rds_message_unmapped(struct rds_message *rm) +{ + clear_bit(RDS_MSG_MAPPED, &rm->m_flags); + wake_up_interruptible(&rm->m_flush_wait); +} +EXPORT_SYMBOL_GPL(rds_message_unmapped); + diff --git a/net/rds/page.c b/net/rds/page.c new file mode 100644 index 00000000..d8acdebe --- /dev/null +++ b/net/rds/page.c @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include + +#include "rds.h" + +struct rds_page_remainder { + struct page *r_page; + unsigned long r_offset; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, + rds_page_remainders); + +/* + * returns 0 on success or -errno on failure. + * + * We don't have to worry about flush_dcache_page() as this only works + * with private pages. If, say, we were to do directed receive to pinned + * user pages we'd have to worry more about cache coherence. (Though + * the flush_dcache_page() in get_user_pages() would probably be enough). + */ +int rds_page_copy_user(struct page *page, unsigned long offset, + void __user *ptr, unsigned long bytes, + int to_user) +{ + unsigned long ret; + void *addr; + + addr = kmap(page); + if (to_user) { + rds_stats_add(s_copy_to_user, bytes); + ret = copy_to_user(ptr, addr + offset, bytes); + } else { + rds_stats_add(s_copy_from_user, bytes); + ret = copy_from_user(addr + offset, ptr, bytes); + } + kunmap(page); + + return ret ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(rds_page_copy_user); + +/* + * Message allocation uses this to build up regions of a message. + * + * @bytes - the number of bytes needed. + * @gfp - the waiting behaviour of the allocation + * + * @gfp is always ored with __GFP_HIGHMEM. Callers must be prepared to + * kmap the pages, etc. + * + * If @bytes is at least a full page then this just returns a page from + * alloc_page(). + * + * If @bytes is a partial page then this stores the unused region of the + * page in a per-cpu structure. Future partial-page allocations may be + * satisfied from that cached region. This lets us waste less memory on + * small allocations with minimal complexity. It works because the transmit + * path passes read-only page regions down to devices. They hold a page + * reference until they are done with the region. 
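
The remainder cache described above can be pictured with a single-threaded stand-in: whole-page requests bypass the cache, partial requests are carved from a cached tail, and a tail too small to be useful is dropped. This sketch ignores the per-CPU handling and page refcounting of the real code, so the discarded tail is simply forgotten rather than released:

#include <stdlib.h>

#define TOY_PAGE_SIZE 4096u

static char *rem_page;          /* cached partially-used page */
static unsigned int rem_offset; /* first unused byte in it    */

/* return a pointer to 'bytes' of page-backed memory */
static void *toy_page_frag_alloc(unsigned int bytes)
{
	void *p;

	if (bytes >= TOY_PAGE_SIZE)
		return malloc(TOY_PAGE_SIZE);     /* whole page: no caching */

	/* toss a cached tail that is too small to satisfy this request
	 * (the real code drops a page reference here; earlier fragments
	 * keep their own references, so this toy just forgets the tail) */
	if (rem_page && bytes > TOY_PAGE_SIZE - rem_offset) {
		rem_page = NULL;
		rem_offset = 0;
	}

	if (!rem_page) {
		rem_page = malloc(TOY_PAGE_SIZE);
		if (!rem_page)
			return NULL;
		rem_offset = 0;
	}

	p = rem_page + rem_offset;
	rem_offset += bytes;
	if (rem_offset == TOY_PAGE_SIZE) {        /* fully consumed */
		rem_page = NULL;
		rem_offset = 0;
	}
	return p;
}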
+ */ +int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, + gfp_t gfp) +{ + struct rds_page_remainder *rem; + unsigned long flags; + struct page *page; + int ret; + + gfp |= __GFP_HIGHMEM; + + /* jump straight to allocation if we're trying for a huge page */ + if (bytes >= PAGE_SIZE) { + page = alloc_page(gfp); + if (!page) { + ret = -ENOMEM; + } else { + sg_set_page(scat, page, PAGE_SIZE, 0); + ret = 0; + } + goto out; + } + + rem = &per_cpu(rds_page_remainders, get_cpu()); + local_irq_save(flags); + + while (1) { + /* avoid a tiny region getting stuck by tossing it */ + if (rem->r_page && bytes > (PAGE_SIZE - rem->r_offset)) { + rds_stats_inc(s_page_remainder_miss); + __free_page(rem->r_page); + rem->r_page = NULL; + } + + /* hand out a fragment from the cached page */ + if (rem->r_page && bytes <= (PAGE_SIZE - rem->r_offset)) { + sg_set_page(scat, rem->r_page, bytes, rem->r_offset); + get_page(sg_page(scat)); + + if (rem->r_offset != 0) + rds_stats_inc(s_page_remainder_hit); + + rem->r_offset += bytes; + if (rem->r_offset == PAGE_SIZE) { + __free_page(rem->r_page); + rem->r_page = NULL; + } + ret = 0; + break; + } + + /* alloc if there is nothing for us to use */ + local_irq_restore(flags); + put_cpu(); + + page = alloc_page(gfp); + + rem = &per_cpu(rds_page_remainders, get_cpu()); + local_irq_save(flags); + + if (!page) { + ret = -ENOMEM; + break; + } + + /* did someone race to fill the remainder before us? */ + if (rem->r_page) { + __free_page(page); + continue; + } + + /* otherwise install our page and loop around to alloc */ + rem->r_page = page; + rem->r_offset = 0; + } + + local_irq_restore(flags); + put_cpu(); +out: + rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret, + ret ? NULL : sg_page(scat), ret ? 0 : scat->offset, + ret ? 0 : scat->length); + return ret; +} +EXPORT_SYMBOL_GPL(rds_page_remainder_alloc); + +static int rds_page_remainder_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + struct rds_page_remainder *rem; + long cpu = (long)hcpu; + + rem = &per_cpu(rds_page_remainders, cpu); + + rdsdebug("cpu %ld action 0x%lx\n", cpu, action); + + switch (action) { + case CPU_DEAD: + if (rem->r_page) + __free_page(rem->r_page); + rem->r_page = NULL; + break; + } + + return 0; +} + +static struct notifier_block rds_page_remainder_nb = { + .notifier_call = rds_page_remainder_cpu_notify, +}; + +void rds_page_exit(void) +{ + int i; + + for_each_possible_cpu(i) + rds_page_remainder_cpu_notify(&rds_page_remainder_nb, + (unsigned long)CPU_DEAD, + (void *)(long)i); +} diff --git a/net/rds/rdma.c b/net/rds/rdma.c new file mode 100644 index 00000000..4e37c1cb --- /dev/null +++ b/net/rds/rdma.c @@ -0,0 +1,858 @@ +/* + * Copyright (c) 2007 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include /* for DMA_*_DEVICE */ + +#include "rds.h" + +/* + * XXX + * - build with sparse + * - should we limit the size of a mr region? let transport return failure? + * - should we detect duplicate keys on a socket? hmm. + * - an rdma is an mlock, apply rlimit? + */ + +/* + * get the number of pages by looking at the page indices that the start and + * end addresses fall in. + * + * Returns 0 if the vec is invalid. It is invalid if the number of bytes + * causes the address to wrap or overflows an unsigned int. This comes + * from being stored in the 'length' member of 'struct scatterlist'. + */ +static unsigned int rds_pages_in_vec(struct rds_iovec *vec) +{ + if ((vec->addr + vec->bytes <= vec->addr) || + (vec->bytes > (u64)UINT_MAX)) + return 0; + + return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) - + (vec->addr >> PAGE_SHIFT); +} + +static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key, + struct rds_mr *insert) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct rds_mr *mr; + + while (*p) { + parent = *p; + mr = rb_entry(parent, struct rds_mr, r_rb_node); + + if (key < mr->r_key) + p = &(*p)->rb_left; + else if (key > mr->r_key) + p = &(*p)->rb_right; + else + return mr; + } + + if (insert) { + rb_link_node(&insert->r_rb_node, parent, p); + rb_insert_color(&insert->r_rb_node, root); + atomic_inc(&insert->r_refcount); + } + return NULL; +} + +/* + * Destroy the transport-specific part of a MR. + */ +static void rds_destroy_mr(struct rds_mr *mr) +{ + struct rds_sock *rs = mr->r_sock; + void *trans_private = NULL; + unsigned long flags; + + rdsdebug("RDS: destroy mr key is %x refcnt %u\n", + mr->r_key, atomic_read(&mr->r_refcount)); + + if (test_and_set_bit(RDS_MR_DEAD, &mr->r_state)) + return; + + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + if (!RB_EMPTY_NODE(&mr->r_rb_node)) + rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); + trans_private = mr->r_trans_private; + mr->r_trans_private = NULL; + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + if (trans_private) + mr->r_trans->free_mr(trans_private, mr->r_invalidate); +} + +void __rds_put_mr_final(struct rds_mr *mr) +{ + rds_destroy_mr(mr); + kfree(mr); +} + +/* + * By the time this is called we can't have any more ioctls called on + * the socket so we don't need to worry about racing with others. 
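
rds_pages_in_vec() above rounds the end address up and the start address down to page boundaries and subtracts the page indices, rejecting vectors that wrap or exceed UINT_MAX bytes. The same calculation stand-alone, with a few worked values (4 KiB pages assumed):

#include <stdint.h>
#include <limits.h>
#include <assert.h>

#define TOY_PAGE_SHIFT 12
#define TOY_PAGE_SIZE  (1u << TOY_PAGE_SHIFT)

static unsigned int toy_pages_in_vec(uint64_t addr, uint64_t bytes)
{
	/* reject wrap-around, zero length, and lengths over UINT_MAX */
	if (addr + bytes <= addr || bytes > (uint64_t)UINT_MAX)
		return 0;

	return ((addr + bytes + TOY_PAGE_SIZE - 1) >> TOY_PAGE_SHIFT) -
	       (addr >> TOY_PAGE_SHIFT);
}

int main(void)
{
	assert(toy_pages_in_vec(0x1000, 0x1000) == 1); /* exactly one page     */
	assert(toy_pages_in_vec(0x1ff0, 0x20)   == 2); /* straddles a boundary */
	assert(toy_pages_in_vec(0x1000, 0)      == 0); /* zero bytes: invalid  */
	return 0;
}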
+ */ +void rds_rdma_drop_keys(struct rds_sock *rs) +{ + struct rds_mr *mr; + struct rb_node *node; + unsigned long flags; + + /* Release any MRs associated with this socket */ + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + while ((node = rb_first(&rs->rs_rdma_keys))) { + mr = container_of(node, struct rds_mr, r_rb_node); + if (mr->r_trans == rs->rs_transport) + mr->r_invalidate = 0; + rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); + RB_CLEAR_NODE(&mr->r_rb_node); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + rds_destroy_mr(mr); + rds_mr_put(mr); + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + } + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + if (rs->rs_transport && rs->rs_transport->flush_mrs) + rs->rs_transport->flush_mrs(); +} + +/* + * Helper function to pin user pages. + */ +static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, + struct page **pages, int write) +{ + int ret; + + ret = get_user_pages_fast(user_addr, nr_pages, write, pages); + + if (ret >= 0 && ret < nr_pages) { + while (ret--) + put_page(pages[ret]); + ret = -EFAULT; + } + + return ret; +} + +static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, + u64 *cookie_ret, struct rds_mr **mr_ret) +{ + struct rds_mr *mr = NULL, *found; + unsigned int nr_pages; + struct page **pages = NULL; + struct scatterlist *sg; + void *trans_private; + unsigned long flags; + rds_rdma_cookie_t cookie; + unsigned int nents; + long i; + int ret; + + if (rs->rs_bound_addr == 0) { + ret = -ENOTCONN; /* XXX not a great errno */ + goto out; + } + + if (!rs->rs_transport->get_mr) { + ret = -EOPNOTSUPP; + goto out; + } + + nr_pages = rds_pages_in_vec(&args->vec); + if (nr_pages == 0) { + ret = -EINVAL; + goto out; + } + + rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n", + args->vec.addr, args->vec.bytes, nr_pages); + + /* XXX clamp nr_pages to limit the size of this alloc? */ + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + goto out; + } + + mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); + if (!mr) { + ret = -ENOMEM; + goto out; + } + + atomic_set(&mr->r_refcount, 1); + RB_CLEAR_NODE(&mr->r_rb_node); + mr->r_trans = rs->rs_transport; + mr->r_sock = rs; + + if (args->flags & RDS_RDMA_USE_ONCE) + mr->r_use_once = 1; + if (args->flags & RDS_RDMA_INVALIDATE) + mr->r_invalidate = 1; + if (args->flags & RDS_RDMA_READWRITE) + mr->r_write = 1; + + /* + * Pin the pages that make up the user buffer and transfer the page + * pointers to the mr's sg array. We check to see if we've mapped + * the whole region after transferring the partial page references + * to the sg array so that we can have one page ref cleanup path. + * + * For now we have no flag that tells us whether the mapping is + * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to + * the zero page. + */ + ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1); + if (ret < 0) + goto out; + + nents = ret; + sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); + if (!sg) { + ret = -ENOMEM; + goto out; + } + WARN_ON(!nents); + sg_init_table(sg, nents); + + /* Stick all pages into the scatterlist */ + for (i = 0 ; i < nents; i++) + sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); + + rdsdebug("RDS: trans_private nents is %u\n", nents); + + /* Obtain a transport specific MR. If this succeeds, the + * s/g list is now owned by the MR. + * Note that dma_map() implies that pending writes are + * flushed to RAM, so no dma_sync is needed here. 
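+ * If get_mr() fails, the pages are still ours: the error path below
+ * drops the page references and frees the scatterlist itself.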
*/ + trans_private = rs->rs_transport->get_mr(sg, nents, rs, + &mr->r_key); + + if (IS_ERR(trans_private)) { + for (i = 0 ; i < nents; i++) + put_page(sg_page(&sg[i])); + kfree(sg); + ret = PTR_ERR(trans_private); + goto out; + } + + mr->r_trans_private = trans_private; + + rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n", + mr->r_key, (void *)(unsigned long) args->cookie_addr); + + /* The user may pass us an unaligned address, but we can only + * map page aligned regions. So we keep the offset, and build + * a 64bit cookie containing and pass that + * around. */ + cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK); + if (cookie_ret) + *cookie_ret = cookie; + + if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long) args->cookie_addr)) { + ret = -EFAULT; + goto out; + } + + /* Inserting the new MR into the rbtree bumps its + * reference count. */ + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + BUG_ON(found && found != mr); + + rdsdebug("RDS: get_mr key is %x\n", mr->r_key); + if (mr_ret) { + atomic_inc(&mr->r_refcount); + *mr_ret = mr; + } + + ret = 0; +out: + kfree(pages); + if (mr) + rds_mr_put(mr); + return ret; +} + +int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen) +{ + struct rds_get_mr_args args; + + if (optlen != sizeof(struct rds_get_mr_args)) + return -EINVAL; + + if (copy_from_user(&args, (struct rds_get_mr_args __user *)optval, + sizeof(struct rds_get_mr_args))) + return -EFAULT; + + return __rds_rdma_map(rs, &args, NULL, NULL); +} + +int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen) +{ + struct rds_get_mr_for_dest_args args; + struct rds_get_mr_args new_args; + + if (optlen != sizeof(struct rds_get_mr_for_dest_args)) + return -EINVAL; + + if (copy_from_user(&args, (struct rds_get_mr_for_dest_args __user *)optval, + sizeof(struct rds_get_mr_for_dest_args))) + return -EFAULT; + + /* + * Initially, just behave like get_mr(). + * TODO: Implement get_mr as wrapper around this + * and deprecate it. + */ + new_args.vec = args.vec; + new_args.cookie_addr = args.cookie_addr; + new_args.flags = args.flags; + + return __rds_rdma_map(rs, &new_args, NULL, NULL); +} + +/* + * Free the MR indicated by the given R_Key + */ +int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen) +{ + struct rds_free_mr_args args; + struct rds_mr *mr; + unsigned long flags; + + if (optlen != sizeof(struct rds_free_mr_args)) + return -EINVAL; + + if (copy_from_user(&args, (struct rds_free_mr_args __user *)optval, + sizeof(struct rds_free_mr_args))) + return -EFAULT; + + /* Special case - a null cookie means flush all unused MRs */ + if (args.cookie == 0) { + if (!rs->rs_transport || !rs->rs_transport->flush_mrs) + return -EINVAL; + rs->rs_transport->flush_mrs(); + return 0; + } + + /* Look up the MR given its R_key and remove it from the rbtree + * so nobody else finds it. + * This should also prevent races with rds_rdma_unuse. 
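+ * (rds_rdma_unuse() walks the same tree under rs_rdma_lock, so once
+ * the node is erased here it can no longer find this MR.)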
+ */ + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL); + if (mr) { + rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); + RB_CLEAR_NODE(&mr->r_rb_node); + if (args.flags & RDS_RDMA_INVALIDATE) + mr->r_invalidate = 1; + } + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + if (!mr) + return -EINVAL; + + /* + * call rds_destroy_mr() ourselves so that we're sure it's done by the time + * we return. If we let rds_mr_put() do it it might not happen until + * someone else drops their ref. + */ + rds_destroy_mr(mr); + rds_mr_put(mr); + return 0; +} + +/* + * This is called when we receive an extension header that + * tells us this MR was used. It allows us to implement + * use_once semantics + */ +void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) +{ + struct rds_mr *mr; + unsigned long flags; + int zot_me = 0; + + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); + if (!mr) { + printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + return; + } + + if (mr->r_use_once || force) { + rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); + RB_CLEAR_NODE(&mr->r_rb_node); + zot_me = 1; + } + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + /* May have to issue a dma_sync on this memory region. + * Note we could avoid this if the operation was a RDMA READ, + * but at this point we can't tell. */ + if (mr->r_trans->sync_mr) + mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); + + /* If the MR was marked as invalidate, this will + * trigger an async flush. */ + if (zot_me) + rds_destroy_mr(mr); + rds_mr_put(mr); +} + +void rds_rdma_free_op(struct rm_rdma_op *ro) +{ + unsigned int i; + + for (i = 0; i < ro->op_nents; i++) { + struct page *page = sg_page(&ro->op_sg[i]); + + /* Mark page dirty if it was possibly modified, which + * is the case for a RDMA_READ which copies from remote + * to local memory */ + if (!ro->op_write) { + BUG_ON(irqs_disabled()); + set_page_dirty(page); + } + put_page(page); + } + + kfree(ro->op_notifier); + ro->op_notifier = NULL; + ro->op_active = 0; +} + +void rds_atomic_free_op(struct rm_atomic_op *ao) +{ + struct page *page = sg_page(ao->op_sg); + + /* Mark page dirty if it was possibly modified, which + * is the case for a RDMA_READ which copies from remote + * to local memory */ + set_page_dirty(page); + put_page(page); + + kfree(ao->op_notifier); + ao->op_notifier = NULL; + ao->op_active = 0; +} + + +/* + * Count the number of pages needed to describe an incoming iovec array. + */ +static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs) +{ + int tot_pages = 0; + unsigned int nr_pages; + unsigned int i; + + /* figure out the number of pages in the vector */ + for (i = 0; i < nr_iovecs; i++) { + nr_pages = rds_pages_in_vec(&iov[i]); + if (nr_pages == 0) + return -EINVAL; + + tot_pages += nr_pages; + + /* + * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, + * so tot_pages cannot overflow without first going negative. 
+ */ + if (tot_pages < 0) + return -EINVAL; + } + + return tot_pages; +} + +int rds_rdma_extra_size(struct rds_rdma_args *args) +{ + struct rds_iovec vec; + struct rds_iovec __user *local_vec; + int tot_pages = 0; + unsigned int nr_pages; + unsigned int i; + + local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; + + /* figure out the number of pages in the vector */ + for (i = 0; i < args->nr_local; i++) { + if (copy_from_user(&vec, &local_vec[i], + sizeof(struct rds_iovec))) + return -EFAULT; + + nr_pages = rds_pages_in_vec(&vec); + if (nr_pages == 0) + return -EINVAL; + + tot_pages += nr_pages; + + /* + * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, + * so tot_pages cannot overflow without first going negative. + */ + if (tot_pages < 0) + return -EINVAL; + } + + return tot_pages * sizeof(struct scatterlist); +} + +/* + * The application asks for a RDMA transfer. + * Extract all arguments and set up the rdma_op + */ +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + struct rds_rdma_args *args; + struct rm_rdma_op *op = &rm->rdma; + int nr_pages; + unsigned int nr_bytes; + struct page **pages = NULL; + struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack; + int iov_size; + unsigned int i, j; + int ret = 0; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) + || rm->rdma.op_active) + return -EINVAL; + + args = CMSG_DATA(cmsg); + + if (rs->rs_bound_addr == 0) { + ret = -ENOTCONN; /* XXX not a great errno */ + goto out; + } + + if (args->nr_local > UIO_MAXIOV) { + ret = -EMSGSIZE; + goto out; + } + + /* Check whether to allocate the iovec area */ + iov_size = args->nr_local * sizeof(struct rds_iovec); + if (args->nr_local > UIO_FASTIOV) { + iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL); + if (!iovs) { + ret = -ENOMEM; + goto out; + } + } + + if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) { + ret = -EFAULT; + goto out; + } + + nr_pages = rds_rdma_pages(iovs, args->nr_local); + if (nr_pages < 0) { + ret = -EINVAL; + goto out; + } + + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + goto out; + } + + op->op_write = !!(args->flags & RDS_RDMA_READWRITE); + op->op_fence = !!(args->flags & RDS_RDMA_FENCE); + op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + op->op_silent = !!(args->flags & RDS_RDMA_SILENT); + op->op_active = 1; + op->op_recverr = rs->rs_recverr; + WARN_ON(!nr_pages); + op->op_sg = rds_message_alloc_sgs(rm, nr_pages); + if (!op->op_sg) { + ret = -ENOMEM; + goto out; + } + + if (op->op_notify || op->op_recverr) { + /* We allocate an uninitialized notifier here, because + * we don't want to do that in the completion handler. We + * would have to use GFP_ATOMIC there, and don't want to deal + * with failed allocations. + */ + op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); + if (!op->op_notifier) { + ret = -ENOMEM; + goto out; + } + op->op_notifier->n_user_token = args->user_token; + op->op_notifier->n_status = RDS_RDMA_SUCCESS; + } + + /* The cookie contains the R_Key of the remote memory region, and + * optionally an offset into it. This is how we implement RDMA into + * unaligned memory. 
+ * When setting up the RDMA, we need to add that offset to the + * destination address (which is really an offset into the MR) + * FIXME: We may want to move this into ib_rdma.c + */ + op->op_rkey = rds_rdma_cookie_key(args->cookie); + op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); + + nr_bytes = 0; + + rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n", + (unsigned long long)args->nr_local, + (unsigned long long)args->remote_vec.addr, + op->op_rkey); + + for (i = 0; i < args->nr_local; i++) { + struct rds_iovec *iov = &iovs[i]; + /* don't need to check, rds_rdma_pages() verified nr will be +nonzero */ + unsigned int nr = rds_pages_in_vec(iov); + + rs->rs_user_addr = iov->addr; + rs->rs_user_bytes = iov->bytes; + + /* If it's a WRITE operation, we want to pin the pages for reading. + * If it's a READ operation, we need to pin the pages for writing. + */ + ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); + if (ret < 0) + goto out; + + rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", + nr_bytes, nr, iov->bytes, iov->addr); + + nr_bytes += iov->bytes; + + for (j = 0; j < nr; j++) { + unsigned int offset = iov->addr & ~PAGE_MASK; + struct scatterlist *sg; + + sg = &op->op_sg[op->op_nents + j]; + sg_set_page(sg, pages[j], + min_t(unsigned int, iov->bytes, PAGE_SIZE - offset), + offset); + + rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n", + sg->offset, sg->length, iov->addr, iov->bytes); + + iov->addr += sg->length; + iov->bytes -= sg->length; + } + + op->op_nents += nr; + } + + if (nr_bytes > args->remote_vec.bytes) { + rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n", + nr_bytes, + (unsigned int) args->remote_vec.bytes); + ret = -EINVAL; + goto out; + } + op->op_bytes = nr_bytes; + +out: + if (iovs != iovstack) + sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size); + kfree(pages); + if (ret) + rds_rdma_free_op(op); + else + rds_stats_inc(s_send_rdma); + + return ret; +} + +/* + * The application wants us to pass an RDMA destination (aka MR) + * to the remote + */ +int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + unsigned long flags; + struct rds_mr *mr; + u32 r_key; + int err = 0; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) || + rm->m_rdma_cookie != 0) + return -EINVAL; + + memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie)); + + /* We are reusing a previously mapped MR here. Most likely, the + * application has written to the buffer, so we need to explicitly + * flush those writes to RAM. Otherwise the HCA may not see them + * when doing a DMA from that buffer. + */ + r_key = rds_rdma_cookie_key(rm->m_rdma_cookie); + + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); + if (!mr) + err = -EINVAL; /* invalid r_key */ + else + atomic_inc(&mr->r_refcount); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + if (mr) { + mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); + rm->rdma.op_rdma_mr = mr; + } + return err; +} + +/* + * The application passes us an address range it wants to enable RDMA + * to/from. We map the area, and save the pair + * in rm->m_rdma_cookie. This causes it to be sent along to the peer + * in an extension header. 
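+ * The saved pair is the MR's R_Key plus the offset of the user
+ * address within its page, packed into one u64 by
+ * rds_rdma_make_cookie() (key in the low 32 bits, offset in the
+ * high 32 bits).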
+ */ +int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) || + rm->m_rdma_cookie != 0) + return -EINVAL; + + return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr); +} + +/* + * Fill in rds_message for an atomic request. + */ +int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + struct page *page = NULL; + struct rds_atomic_args *args; + int ret = 0; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args)) + || rm->atomic.op_active) + return -EINVAL; + + args = CMSG_DATA(cmsg); + + /* Nonmasked & masked cmsg ops converted to masked hw ops */ + switch (cmsg->cmsg_type) { + case RDS_CMSG_ATOMIC_FADD: + rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; + rm->atomic.op_m_fadd.add = args->fadd.add; + rm->atomic.op_m_fadd.nocarry_mask = 0; + break; + case RDS_CMSG_MASKED_ATOMIC_FADD: + rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; + rm->atomic.op_m_fadd.add = args->m_fadd.add; + rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask; + break; + case RDS_CMSG_ATOMIC_CSWP: + rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; + rm->atomic.op_m_cswp.compare = args->cswp.compare; + rm->atomic.op_m_cswp.swap = args->cswp.swap; + rm->atomic.op_m_cswp.compare_mask = ~0; + rm->atomic.op_m_cswp.swap_mask = ~0; + break; + case RDS_CMSG_MASKED_ATOMIC_CSWP: + rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; + rm->atomic.op_m_cswp.compare = args->m_cswp.compare; + rm->atomic.op_m_cswp.swap = args->m_cswp.swap; + rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask; + rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask; + break; + default: + BUG(); /* should never happen */ + } + + rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT); + rm->atomic.op_active = 1; + rm->atomic.op_recverr = rs->rs_recverr; + rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1); + if (!rm->atomic.op_sg) { + ret = -ENOMEM; + goto err; + } + + /* verify 8 byte-aligned */ + if (args->local_addr & 0x7) { + ret = -EFAULT; + goto err; + } + + ret = rds_pin_pages(args->local_addr, 1, &page, 1); + if (ret != 1) + goto err; + ret = 0; + + sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr)); + + if (rm->atomic.op_notify || rm->atomic.op_recverr) { + /* We allocate an uninitialized notifier here, because + * we don't want to do that in the completion handler. We + * would have to use GFP_ATOMIC there, and don't want to deal + * with failed allocations. + */ + rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL); + if (!rm->atomic.op_notifier) { + ret = -ENOMEM; + goto err; + } + + rm->atomic.op_notifier->n_user_token = args->user_token; + rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS; + } + + rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie); + rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie); + + return ret; +err: + if (page) + put_page(page); + kfree(rm->atomic.op_notifier); + + return ret; +} diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c new file mode 100644 index 00000000..f8760e1b --- /dev/null +++ b/net/rds/rdma_transport.c @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2009 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include + +#include "rdma_transport.h" + +static struct rdma_cm_id *rds_rdma_listen_id; + +static char *rds_cm_event_strings[] = { +#define RDS_CM_EVENT_STRING(foo) \ + [RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo) + RDS_CM_EVENT_STRING(ADDR_RESOLVED), + RDS_CM_EVENT_STRING(ADDR_ERROR), + RDS_CM_EVENT_STRING(ROUTE_RESOLVED), + RDS_CM_EVENT_STRING(ROUTE_ERROR), + RDS_CM_EVENT_STRING(CONNECT_REQUEST), + RDS_CM_EVENT_STRING(CONNECT_RESPONSE), + RDS_CM_EVENT_STRING(CONNECT_ERROR), + RDS_CM_EVENT_STRING(UNREACHABLE), + RDS_CM_EVENT_STRING(REJECTED), + RDS_CM_EVENT_STRING(ESTABLISHED), + RDS_CM_EVENT_STRING(DISCONNECTED), + RDS_CM_EVENT_STRING(DEVICE_REMOVAL), + RDS_CM_EVENT_STRING(MULTICAST_JOIN), + RDS_CM_EVENT_STRING(MULTICAST_ERROR), + RDS_CM_EVENT_STRING(ADDR_CHANGE), + RDS_CM_EVENT_STRING(TIMEWAIT_EXIT), +#undef RDS_CM_EVENT_STRING +}; + +static char *rds_cm_event_str(enum rdma_cm_event_type type) +{ + return rds_str_array(rds_cm_event_strings, + ARRAY_SIZE(rds_cm_event_strings), type); +}; + +int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + /* this can be null in the listening path */ + struct rds_connection *conn = cm_id->context; + struct rds_transport *trans; + int ret = 0; + + rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id, + event->event, rds_cm_event_str(event->event)); + + if (cm_id->device->node_type == RDMA_NODE_RNIC) + trans = &rds_iw_transport; + else + trans = &rds_ib_transport; + + /* Prevent shutdown from tearing down the connection + * while we're executing. */ + if (conn) { + mutex_lock(&conn->c_cm_lock); + + /* If the connection is being shut down, bail out + * right away. We return 0 so cm_id doesn't get + * destroyed prematurely */ + if (rds_conn_state(conn) == RDS_CONN_DISCONNECTING) { + /* Reject incoming connections while we're tearing + * down an existing one. */ + if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) + ret = 1; + goto out; + } + } + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + ret = trans->cm_handle_connect(cm_id, event); + break; + + case RDMA_CM_EVENT_ADDR_RESOLVED: + /* XXX do we need to clean up if this fails? 
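+ * (On failure the nonzero return presumably lets the RDMA CM tear
+ * down the cm_id; the conn itself is only dropped if a later
+ * *_ERROR event arrives and hits the cases below.)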
*/ + ret = rdma_resolve_route(cm_id, + RDS_RDMA_RESOLVE_TIMEOUT_MS); + break; + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + /* XXX worry about racing with listen acceptance */ + ret = trans->cm_initiate_connect(cm_id); + break; + + case RDMA_CM_EVENT_ESTABLISHED: + trans->cm_connect_complete(conn, event); + break; + + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + case RDMA_CM_EVENT_DEVICE_REMOVAL: + case RDMA_CM_EVENT_ADDR_CHANGE: + if (conn) + rds_conn_drop(conn); + break; + + case RDMA_CM_EVENT_DISCONNECTED: + rdsdebug("DISCONNECT event - dropping connection " + "%pI4->%pI4\n", &conn->c_laddr, + &conn->c_faddr); + rds_conn_drop(conn); + break; + + default: + /* things like device disconnect? */ + printk(KERN_ERR "RDS: unknown event %u (%s)!\n", + event->event, rds_cm_event_str(event->event)); + break; + } + +out: + if (conn) + mutex_unlock(&conn->c_cm_lock); + + rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event, + rds_cm_event_str(event->event), ret); + + return ret; +} + +static int rds_rdma_listen_init(void) +{ + struct sockaddr_in sin; + struct rdma_cm_id *cm_id; + int ret; + + cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP, + IB_QPT_RC); + if (IS_ERR(cm_id)) { + ret = PTR_ERR(cm_id); + printk(KERN_ERR "RDS/RDMA: failed to setup listener, " + "rdma_create_id() returned %d\n", ret); + return ret; + } + + sin.sin_family = AF_INET, + sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); + sin.sin_port = (__force u16)htons(RDS_PORT); + + /* + * XXX I bet this binds the cm_id to a device. If we want to support + * fail-over we'll have to take this into consideration. + */ + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + if (ret) { + printk(KERN_ERR "RDS/RDMA: failed to setup listener, " + "rdma_bind_addr() returned %d\n", ret); + goto out; + } + + ret = rdma_listen(cm_id, 128); + if (ret) { + printk(KERN_ERR "RDS/RDMA: failed to setup listener, " + "rdma_listen() returned %d\n", ret); + goto out; + } + + rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT); + + rds_rdma_listen_id = cm_id; + cm_id = NULL; +out: + if (cm_id) + rdma_destroy_id(cm_id); + return ret; +} + +static void rds_rdma_listen_stop(void) +{ + if (rds_rdma_listen_id) { + rdsdebug("cm %p\n", rds_rdma_listen_id); + rdma_destroy_id(rds_rdma_listen_id); + rds_rdma_listen_id = NULL; + } +} + +static int rds_rdma_init(void) +{ + int ret; + + ret = rds_rdma_listen_init(); + if (ret) + goto out; + + ret = rds_iw_init(); + if (ret) + goto err_iw_init; + + ret = rds_ib_init(); + if (ret) + goto err_ib_init; + + goto out; + +err_ib_init: + rds_iw_exit(); +err_iw_init: + rds_rdma_listen_stop(); +out: + return ret; +} +module_init(rds_rdma_init); + +static void rds_rdma_exit(void) +{ + /* stop listening first to ensure no new connections are attempted */ + rds_rdma_listen_stop(); + rds_ib_exit(); + rds_iw_exit(); +} +module_exit(rds_rdma_exit); + +MODULE_AUTHOR("Oracle Corporation "); +MODULE_DESCRIPTION("RDS: IB/iWARP transport"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h new file mode 100644 index 00000000..faba4e38 --- /dev/null +++ b/net/rds/rdma_transport.h @@ -0,0 +1,24 @@ +#ifndef _RDMA_TRANSPORT_H +#define _RDMA_TRANSPORT_H + +#include +#include +#include "rds.h" + +#define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000 + +int rds_rdma_conn_connect(struct rds_connection *conn); +int rds_rdma_cm_event_handler(struct 
rdma_cm_id *cm_id, + struct rdma_cm_event *event); + +/* from ib.c */ +extern struct rds_transport rds_ib_transport; +int rds_ib_init(void); +void rds_ib_exit(void); + +/* from iw.c */ +extern struct rds_transport rds_iw_transport; +int rds_iw_init(void); +void rds_iw_exit(void); + +#endif diff --git a/net/rds/rds.h b/net/rds/rds.h new file mode 100644 index 00000000..da8adac2 --- /dev/null +++ b/net/rds/rds.h @@ -0,0 +1,812 @@ +#ifndef _RDS_RDS_H +#define _RDS_RDS_H + +#include +#include +#include +#include +#include +#include + +#include "info.h" + +/* + * RDS Network protocol version + */ +#define RDS_PROTOCOL_3_0 0x0300 +#define RDS_PROTOCOL_3_1 0x0301 +#define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1 +#define RDS_PROTOCOL_MAJOR(v) ((v) >> 8) +#define RDS_PROTOCOL_MINOR(v) ((v) & 255) +#define RDS_PROTOCOL(maj, min) (((maj) << 8) | min) + +/* + * XXX randomly chosen, but at least seems to be unused: + * # 18464-18768 Unassigned + * We should do better. We want a reserved port to discourage unpriv'ed + * userspace from listening. + */ +#define RDS_PORT 18634 + +#ifdef ATOMIC64_INIT +#define KERNEL_HAS_ATOMIC64 +#endif + +#ifdef DEBUG +#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args) +#else +/* sigh, pr_debug() causes unused variable warnings */ +static inline void __attribute__ ((format (printf, 1, 2))) +rdsdebug(char *fmt, ...) +{ +} +#endif + +/* XXX is there one of these somewhere? */ +#define ceil(x, y) \ + ({ unsigned long __x = (x), __y = (y); (__x + __y - 1) / __y; }) + +#define RDS_FRAG_SHIFT 12 +#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) + +#define RDS_CONG_MAP_BYTES (65536 / 8) +#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE) +#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8) + +struct rds_cong_map { + struct rb_node m_rb_node; + __be32 m_addr; + wait_queue_head_t m_waitq; + struct list_head m_conn_list; + unsigned long m_page_addrs[RDS_CONG_MAP_PAGES]; +}; + + +/* + * This is how we will track the connection state: + * A connection is always in one of the following + * states. Updates to the state are atomic and imply + * a memory barrier. 
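+ * A guarded transition therefore looks roughly like
+ *
+ *	if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING))
+ *		ret = conn->c_trans->conn_connect(conn);
+ *
+ * (rds_conn_transition() is the cmpxchg helper declared further down
+ * in this header), so two racing updaters can never both win.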
+ */ +enum { + RDS_CONN_DOWN = 0, + RDS_CONN_CONNECTING, + RDS_CONN_DISCONNECTING, + RDS_CONN_UP, + RDS_CONN_ERROR, +}; + +/* Bits for c_flags */ +#define RDS_LL_SEND_FULL 0 +#define RDS_RECONNECT_PENDING 1 +#define RDS_IN_XMIT 2 + +struct rds_connection { + struct hlist_node c_hash_node; + __be32 c_laddr; + __be32 c_faddr; + unsigned int c_loopback:1; + struct rds_connection *c_passive; + + struct rds_cong_map *c_lcong; + struct rds_cong_map *c_fcong; + + struct rds_message *c_xmit_rm; + unsigned long c_xmit_sg; + unsigned int c_xmit_hdr_off; + unsigned int c_xmit_data_off; + unsigned int c_xmit_atomic_sent; + unsigned int c_xmit_rdma_sent; + unsigned int c_xmit_data_sent; + + spinlock_t c_lock; /* protect msg queues */ + u64 c_next_tx_seq; + struct list_head c_send_queue; + struct list_head c_retrans; + + u64 c_next_rx_seq; + + struct rds_transport *c_trans; + void *c_transport_data; + + atomic_t c_state; + unsigned long c_flags; + unsigned long c_reconnect_jiffies; + struct delayed_work c_send_w; + struct delayed_work c_recv_w; + struct delayed_work c_conn_w; + struct work_struct c_down_w; + struct mutex c_cm_lock; /* protect conn state & cm */ + wait_queue_head_t c_waitq; + + struct list_head c_map_item; + unsigned long c_map_queued; + + unsigned int c_unacked_packets; + unsigned int c_unacked_bytes; + + /* Protocol version */ + unsigned int c_version; +}; + +#define RDS_FLAG_CONG_BITMAP 0x01 +#define RDS_FLAG_ACK_REQUIRED 0x02 +#define RDS_FLAG_RETRANSMITTED 0x04 +#define RDS_MAX_ADV_CREDIT 255 + +/* + * Maximum space available for extension headers. + */ +#define RDS_HEADER_EXT_SPACE 16 + +struct rds_header { + __be64 h_sequence; + __be64 h_ack; + __be32 h_len; + __be16 h_sport; + __be16 h_dport; + u8 h_flags; + u8 h_credit; + u8 h_padding[4]; + __sum16 h_csum; + + u8 h_exthdr[RDS_HEADER_EXT_SPACE]; +}; + +/* + * Reserved - indicates end of extensions + */ +#define RDS_EXTHDR_NONE 0 + +/* + * This extension header is included in the very + * first message that is sent on a new connection, + * and identifies the protocol level. This will help + * rolling updates if a future change requires breaking + * the protocol. + * NB: This is no longer true for IB, where we do a version + * negotiation during the connection setup phase (protocol + * version information is included in the RDMA CM private data). + */ +#define RDS_EXTHDR_VERSION 1 +struct rds_ext_header_version { + __be32 h_version; +}; + +/* + * This extension header is included in the RDS message + * chasing an RDMA operation. + */ +#define RDS_EXTHDR_RDMA 2 +struct rds_ext_header_rdma { + __be32 h_rdma_rkey; +}; + +/* + * This extension header tells the peer about the + * destination of the requested RDMA + * operation. + */ +#define RDS_EXTHDR_RDMA_DEST 3 +struct rds_ext_header_rdma_dest { + __be32 h_rdma_rkey; + __be32 h_rdma_offset; +}; + +#define __RDS_EXTHDR_MAX 16 /* for now */ + +struct rds_incoming { + atomic_t i_refcount; + struct list_head i_item; + struct rds_connection *i_conn; + struct rds_header i_hdr; + unsigned long i_rx_jiffies; + __be32 i_saddr; + + rds_rdma_cookie_t i_rdma_cookie; +}; + +struct rds_mr { + struct rb_node r_rb_node; + atomic_t r_refcount; + u32 r_key; + + /* A copy of the creation flags */ + unsigned int r_use_once:1; + unsigned int r_invalidate:1; + unsigned int r_write:1; + + /* This is for RDS_MR_DEAD. + * It would be nice & consistent to make this part of the above + * bit field here, but we need to use test_and_set_bit. 
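+ * (The atomic bitops work on whole unsigned longs, which a :1
+ * bitfield cannot give us, hence the separate r_state word below.)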
+ */ + unsigned long r_state; + struct rds_sock *r_sock; /* back pointer to the socket that owns us */ + struct rds_transport *r_trans; + void *r_trans_private; +}; + +/* Flags for mr->r_state */ +#define RDS_MR_DEAD 0 + +static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) +{ + return r_key | (((u64) offset) << 32); +} + +static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie) +{ + return cookie; +} + +static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) +{ + return cookie >> 32; +} + +/* atomic operation types */ +#define RDS_ATOMIC_TYPE_CSWP 0 +#define RDS_ATOMIC_TYPE_FADD 1 + +/* + * m_sock_item and m_conn_item are on lists that are serialized under + * conn->c_lock. m_sock_item has additional meaning in that once it is empty + * the message will not be put back on the retransmit list after being sent. + * messages that are canceled while being sent rely on this. + * + * m_inc is used by loopback so that it can pass an incoming message straight + * back up into the rx path. It embeds a wire header which is also used by + * the send path, which is kind of awkward. + * + * m_sock_item indicates the message's presence on a socket's send or receive + * queue. m_rs will point to that socket. + * + * m_daddr is used by cancellation to prune messages to a given destination. + * + * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock + * nesting. As paths iterate over messages on a sock, or conn, they must + * also lock the conn, or sock, to remove the message from those lists too. + * Testing the flag to determine if the message is still on the lists lets + * us avoid testing the list_head directly. That means each path can use + * the message's list_head to keep it on a local list while juggling locks + * without confusing the other path. + * + * m_ack_seq is an optional field set by transports who need a different + * sequence number range to invalidate. They can use this in a callback + * that they pass to rds_send_drop_acked() to see if each message has been + * acked. The HAS_ACK_SEQ flag can be used to detect messages which haven't + * had ack_seq set yet. + */ +#define RDS_MSG_ON_SOCK 1 +#define RDS_MSG_ON_CONN 2 +#define RDS_MSG_HAS_ACK_SEQ 3 +#define RDS_MSG_ACK_REQUIRED 4 +#define RDS_MSG_RETRANSMITTED 5 +#define RDS_MSG_MAPPED 6 +#define RDS_MSG_PAGEVEC 7 + +struct rds_message { + atomic_t m_refcount; + struct list_head m_sock_item; + struct list_head m_conn_item; + struct rds_incoming m_inc; + u64 m_ack_seq; + __be32 m_daddr; + unsigned long m_flags; + + /* Never access m_rs without holding m_rs_lock. 
+ * Lock nesting is + * rm->m_rs_lock + * -> rs->rs_lock + */ + spinlock_t m_rs_lock; + wait_queue_head_t m_flush_wait; + + struct rds_sock *m_rs; + + /* cookie to send to remote, in rds header */ + rds_rdma_cookie_t m_rdma_cookie; + + unsigned int m_used_sgs; + unsigned int m_total_sgs; + + void *m_final_op; + + struct { + struct rm_atomic_op { + int op_type; + union { + struct { + uint64_t compare; + uint64_t swap; + uint64_t compare_mask; + uint64_t swap_mask; + } op_m_cswp; + struct { + uint64_t add; + uint64_t nocarry_mask; + } op_m_fadd; + }; + + u32 op_rkey; + u64 op_remote_addr; + unsigned int op_notify:1; + unsigned int op_recverr:1; + unsigned int op_mapped:1; + unsigned int op_silent:1; + unsigned int op_active:1; + struct scatterlist *op_sg; + struct rds_notifier *op_notifier; + + struct rds_mr *op_rdma_mr; + } atomic; + struct rm_rdma_op { + u32 op_rkey; + u64 op_remote_addr; + unsigned int op_write:1; + unsigned int op_fence:1; + unsigned int op_notify:1; + unsigned int op_recverr:1; + unsigned int op_mapped:1; + unsigned int op_silent:1; + unsigned int op_active:1; + unsigned int op_bytes; + unsigned int op_nents; + unsigned int op_count; + struct scatterlist *op_sg; + struct rds_notifier *op_notifier; + + struct rds_mr *op_rdma_mr; + } rdma; + struct rm_data_op { + unsigned int op_active:1; + unsigned int op_nents; + unsigned int op_count; + struct scatterlist *op_sg; + } data; + }; +}; + +/* + * The RDS notifier is used (optionally) to tell the application about + * completed RDMA operations. Rather than keeping the whole rds message + * around on the queue, we allocate a small notifier that is put on the + * socket's notifier_list. Notifications are delivered to the application + * through control messages. + */ +struct rds_notifier { + struct list_head n_list; + uint64_t n_user_token; + int n_status; +}; + +/** + * struct rds_transport - transport specific behavioural hooks + * + * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send + * part of a message. The caller serializes on the send_sem so this + * doesn't need to be reentrant for a given conn. The header must be + * sent before the data payload. .xmit must be prepared to send a + * message with no data payload. .xmit should return the number of + * bytes that were sent down the connection, including header bytes. + * Returning 0 tells the caller that it doesn't need to perform any + * additional work now. This is usually the case when the transport has + * filled the sending queue for its connection and will handle + * triggering the rds thread to continue the send when space becomes + * available. Returning -EAGAIN tells the caller to retry the send + * immediately. Returning -ENOMEM tells the caller to retry the send at + * some point in the future. + * + * @conn_shutdown: conn_shutdown stops traffic on the given connection. Once + * it returns the connection can not call rds_recv_incoming(). + * This will only be called once after conn_connect returns + * non-zero success and will The caller serializes this with + * the send and connecting paths (xmit_* and conn_*). The + * transport is responsible for other serialization, including + * rds_recv_incoming(). This is called in process context but + * should try hard not to block. 
+ */ + +#define RDS_TRANS_IB 0 +#define RDS_TRANS_IWARP 1 +#define RDS_TRANS_TCP 2 +#define RDS_TRANS_COUNT 3 + +struct rds_transport { + char t_name[TRANSNAMSIZ]; + struct list_head t_item; + struct module *t_owner; + unsigned int t_prefer_loopback:1; + unsigned int t_type; + + int (*laddr_check)(__be32 addr); + int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); + void (*conn_free)(void *data); + int (*conn_connect)(struct rds_connection *conn); + void (*conn_shutdown)(struct rds_connection *conn); + void (*xmit_prepare)(struct rds_connection *conn); + void (*xmit_complete)(struct rds_connection *conn); + int (*xmit)(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off); + int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op); + int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op); + int (*recv)(struct rds_connection *conn); + int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov, + size_t size); + void (*inc_free)(struct rds_incoming *inc); + + int (*cm_handle_connect)(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); + int (*cm_initiate_connect)(struct rdma_cm_id *cm_id); + void (*cm_connect_complete)(struct rds_connection *conn, + struct rdma_cm_event *event); + + unsigned int (*stats_info_copy)(struct rds_info_iterator *iter, + unsigned int avail); + void (*exit)(void); + void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg, + struct rds_sock *rs, u32 *key_ret); + void (*sync_mr)(void *trans_private, int direction); + void (*free_mr)(void *trans_private, int invalidate); + void (*flush_mrs)(void); +}; + +struct rds_sock { + struct sock rs_sk; + + u64 rs_user_addr; + u64 rs_user_bytes; + + /* + * bound_addr used for both incoming and outgoing, no INADDR_ANY + * support. + */ + struct hlist_node rs_bound_node; + __be32 rs_bound_addr; + __be32 rs_conn_addr; + __be16 rs_bound_port; + __be16 rs_conn_port; + struct rds_transport *rs_transport; + + /* + * rds_sendmsg caches the conn it used the last time around. + * This helps avoid costly lookups. + */ + struct rds_connection *rs_conn; + + /* flag indicating we were congested or not */ + int rs_congested; + /* seen congestion (ENOBUFS) when sending? */ + int rs_seen_congestion; + + /* rs_lock protects all these adjacent members before the newline */ + spinlock_t rs_lock; + struct list_head rs_send_queue; + u32 rs_snd_bytes; + int rs_rcv_bytes; + struct list_head rs_notify_queue; /* currently used for failed RDMAs */ + + /* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask + * to decide whether the application should be woken up. + * If not set, we use rs_cong_track to find out whether a cong map + * update arrived. + */ + uint64_t rs_cong_mask; + uint64_t rs_cong_notify; + struct list_head rs_cong_list; + unsigned long rs_cong_track; + + /* + * rs_recv_lock protects the receive queue, and is + * used to serialize with rds_release. 
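+ * (rds_recv_incoming() takes it for writing when queueing a message;
+ * the receive path's rds_next_incoming() needs it only for reading
+ * while it peeks at the head of the queue.)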
+ */ + rwlock_t rs_recv_lock; + struct list_head rs_recv_queue; + + /* just for stats reporting */ + struct list_head rs_item; + + /* these have their own lock */ + spinlock_t rs_rdma_lock; + struct rb_root rs_rdma_keys; + + /* Socket options - in case there will be more */ + unsigned char rs_recverr, + rs_cong_monitor; +}; + +static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) +{ + return container_of(sk, struct rds_sock, rs_sk); +} +static inline struct sock *rds_rs_to_sk(struct rds_sock *rs) +{ + return &rs->rs_sk; +} + +/* + * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value + * to account for overhead. We don't account for overhead, we just apply + * the number of payload bytes to the specified value. + */ +static inline int rds_sk_sndbuf(struct rds_sock *rs) +{ + return rds_rs_to_sk(rs)->sk_sndbuf / 2; +} +static inline int rds_sk_rcvbuf(struct rds_sock *rs) +{ + return rds_rs_to_sk(rs)->sk_rcvbuf / 2; +} + +struct rds_statistics { + uint64_t s_conn_reset; + uint64_t s_recv_drop_bad_checksum; + uint64_t s_recv_drop_old_seq; + uint64_t s_recv_drop_no_sock; + uint64_t s_recv_drop_dead_sock; + uint64_t s_recv_deliver_raced; + uint64_t s_recv_delivered; + uint64_t s_recv_queued; + uint64_t s_recv_immediate_retry; + uint64_t s_recv_delayed_retry; + uint64_t s_recv_ack_required; + uint64_t s_recv_rdma_bytes; + uint64_t s_recv_ping; + uint64_t s_send_queue_empty; + uint64_t s_send_queue_full; + uint64_t s_send_lock_contention; + uint64_t s_send_lock_queue_raced; + uint64_t s_send_immediate_retry; + uint64_t s_send_delayed_retry; + uint64_t s_send_drop_acked; + uint64_t s_send_ack_required; + uint64_t s_send_queued; + uint64_t s_send_rdma; + uint64_t s_send_rdma_bytes; + uint64_t s_send_pong; + uint64_t s_page_remainder_hit; + uint64_t s_page_remainder_miss; + uint64_t s_copy_to_user; + uint64_t s_copy_from_user; + uint64_t s_cong_update_queued; + uint64_t s_cong_update_received; + uint64_t s_cong_send_error; + uint64_t s_cong_send_blocked; +}; + +/* af_rds.c */ +char *rds_str_array(char **array, size_t elements, size_t index); +void rds_sock_addref(struct rds_sock *rs); +void rds_sock_put(struct rds_sock *rs); +void rds_wake_sk_sleep(struct rds_sock *rs); +static inline void __rds_wake_sk_sleep(struct sock *sk) +{ + wait_queue_head_t *waitq = sk_sleep(sk); + + if (!sock_flag(sk, SOCK_DEAD) && waitq) + wake_up(waitq); +} +extern wait_queue_head_t rds_poll_waitq; + + +/* bind.c */ +int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +void rds_remove_bound(struct rds_sock *rs); +struct rds_sock *rds_find_bound(__be32 addr, __be16 port); + +/* cong.c */ +int rds_cong_get_maps(struct rds_connection *conn); +void rds_cong_add_conn(struct rds_connection *conn); +void rds_cong_remove_conn(struct rds_connection *conn); +void rds_cong_set_bit(struct rds_cong_map *map, __be16 port); +void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port); +int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs); +void rds_cong_queue_updates(struct rds_cong_map *map); +void rds_cong_map_updated(struct rds_cong_map *map, uint64_t); +int rds_cong_updated_since(unsigned long *recent); +void rds_cong_add_socket(struct rds_sock *); +void rds_cong_remove_socket(struct rds_sock *); +void rds_cong_exit(void); +struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); + +/* conn.c */ +int rds_conn_init(void); +void rds_conn_exit(void); +struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, + struct 
rds_transport *trans, gfp_t gfp); +struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp); +void rds_conn_shutdown(struct rds_connection *conn); +void rds_conn_destroy(struct rds_connection *conn); +void rds_conn_drop(struct rds_connection *conn); +void rds_conn_connect_if_down(struct rds_connection *conn); +void rds_for_each_conn_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int (*visitor)(struct rds_connection *, void *), + size_t item_len); +void __rds_conn_error(struct rds_connection *conn, const char *, ...) + __attribute__ ((format (printf, 2, 3))); +#define rds_conn_error(conn, fmt...) \ + __rds_conn_error(conn, KERN_WARNING "RDS: " fmt) + +static inline int +rds_conn_transition(struct rds_connection *conn, int old, int new) +{ + return atomic_cmpxchg(&conn->c_state, old, new) == old; +} + +static inline int +rds_conn_state(struct rds_connection *conn) +{ + return atomic_read(&conn->c_state); +} + +static inline int +rds_conn_up(struct rds_connection *conn) +{ + return atomic_read(&conn->c_state) == RDS_CONN_UP; +} + +static inline int +rds_conn_connecting(struct rds_connection *conn) +{ + return atomic_read(&conn->c_state) == RDS_CONN_CONNECTING; +} + +/* message.c */ +struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); +struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents); +int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov, + size_t total_len); +struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); +void rds_message_populate_header(struct rds_header *hdr, __be16 sport, + __be16 dport, u64 seq); +int rds_message_add_extension(struct rds_header *hdr, + unsigned int type, const void *data, unsigned int len); +int rds_message_next_extension(struct rds_header *hdr, + unsigned int *pos, void *buf, unsigned int *buflen); +int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); +int rds_message_inc_copy_to_user(struct rds_incoming *inc, + struct iovec *first_iov, size_t size); +void rds_message_inc_free(struct rds_incoming *inc); +void rds_message_addref(struct rds_message *rm); +void rds_message_put(struct rds_message *rm); +void rds_message_wait(struct rds_message *rm); +void rds_message_unmapped(struct rds_message *rm); + +static inline void rds_message_make_checksum(struct rds_header *hdr) +{ + hdr->h_csum = 0; + hdr->h_csum = ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2); +} + +static inline int rds_message_verify_checksum(const struct rds_header *hdr) +{ + return !hdr->h_csum || ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2) == 0; +} + + +/* page.c */ +int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, + gfp_t gfp); +int rds_page_copy_user(struct page *page, unsigned long offset, + void __user *ptr, unsigned long bytes, + int to_user); +#define rds_page_copy_to_user(page, offset, ptr, bytes) \ + rds_page_copy_user(page, offset, ptr, bytes, 1) +#define rds_page_copy_from_user(page, offset, ptr, bytes) \ + rds_page_copy_user(page, offset, ptr, bytes, 0) +void rds_page_exit(void); + +/* recv.c */ +void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, + __be32 saddr); +void rds_inc_put(struct rds_incoming *inc); +void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, + struct rds_incoming *inc, gfp_t gfp, enum km_type km); +int rds_recvmsg(struct kiocb 
*iocb, struct socket *sock, struct msghdr *msg, + size_t size, int msg_flags); +void rds_clear_recv_queue(struct rds_sock *rs); +int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg); +void rds_inc_info_copy(struct rds_incoming *inc, + struct rds_info_iterator *iter, + __be32 saddr, __be32 daddr, int flip); + +/* send.c */ +int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t payload_len); +void rds_send_reset(struct rds_connection *conn); +int rds_send_xmit(struct rds_connection *conn); +struct sockaddr_in; +void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); +typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); +void rds_send_drop_acked(struct rds_connection *conn, u64 ack, + is_acked_func is_acked); +int rds_send_pong(struct rds_connection *conn, __be16 dport); +struct rds_message *rds_send_get_message(struct rds_connection *, + struct rm_rdma_op *); + +/* rdma.c */ +void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); +int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); +int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen); +int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen); +void rds_rdma_drop_keys(struct rds_sock *rs); +int rds_rdma_extra_size(struct rds_rdma_args *args); +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +void rds_rdma_free_op(struct rm_rdma_op *ro); +void rds_atomic_free_op(struct rm_atomic_op *ao); +void rds_rdma_send_complete(struct rds_message *rm, int wc_status); +void rds_atomic_send_complete(struct rds_message *rm, int wc_status); +int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); + +extern void __rds_put_mr_final(struct rds_mr *mr); +static inline void rds_mr_put(struct rds_mr *mr) +{ + if (atomic_dec_and_test(&mr->r_refcount)) + __rds_put_mr_final(mr); +} + +/* stats.c */ +DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); +#define rds_stats_inc_which(which, member) do { \ + per_cpu(which, get_cpu()).member++; \ + put_cpu(); \ +} while (0) +#define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member) +#define rds_stats_add_which(which, member, count) do { \ + per_cpu(which, get_cpu()).member += count; \ + put_cpu(); \ +} while (0) +#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count) +int rds_stats_init(void); +void rds_stats_exit(void); +void rds_stats_info_copy(struct rds_info_iterator *iter, + uint64_t *values, const char *const *names, + size_t nr); + +/* sysctl.c */ +int rds_sysctl_init(void); +void rds_sysctl_exit(void); +extern unsigned long rds_sysctl_sndbuf_min; +extern unsigned long rds_sysctl_sndbuf_default; +extern unsigned long rds_sysctl_sndbuf_max; +extern unsigned long rds_sysctl_reconnect_min_jiffies; +extern unsigned long rds_sysctl_reconnect_max_jiffies; +extern unsigned int rds_sysctl_max_unacked_packets; +extern unsigned int rds_sysctl_max_unacked_bytes; +extern unsigned int rds_sysctl_ping_enable; +extern unsigned long rds_sysctl_trace_flags; +extern unsigned int rds_sysctl_trace_level; + +/* threads.c */ +int rds_threads_init(void); +void rds_threads_exit(void); +extern struct 
workqueue_struct *rds_wq; +void rds_queue_reconnect(struct rds_connection *conn); +void rds_connect_worker(struct work_struct *); +void rds_shutdown_worker(struct work_struct *); +void rds_send_worker(struct work_struct *); +void rds_recv_worker(struct work_struct *); +void rds_connect_complete(struct rds_connection *conn); + +/* transport.c */ +int rds_trans_register(struct rds_transport *trans); +void rds_trans_unregister(struct rds_transport *trans); +struct rds_transport *rds_trans_get_preferred(__be32 addr); +void rds_trans_put(struct rds_transport *trans); +unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail); +int rds_trans_init(void); +void rds_trans_exit(void); + +#endif diff --git a/net/rds/recv.c b/net/rds/recv.c new file mode 100644 index 00000000..596689e5 --- /dev/null +++ b/net/rds/recv.c @@ -0,0 +1,546 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include +#include +#include +#include + +#include "rds.h" + +void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, + __be32 saddr) +{ + atomic_set(&inc->i_refcount, 1); + INIT_LIST_HEAD(&inc->i_item); + inc->i_conn = conn; + inc->i_saddr = saddr; + inc->i_rdma_cookie = 0; +} +EXPORT_SYMBOL_GPL(rds_inc_init); + +static void rds_inc_addref(struct rds_incoming *inc) +{ + rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount)); + atomic_inc(&inc->i_refcount); +} + +void rds_inc_put(struct rds_incoming *inc) +{ + rdsdebug("put inc %p ref %d\n", inc, atomic_read(&inc->i_refcount)); + if (atomic_dec_and_test(&inc->i_refcount)) { + BUG_ON(!list_empty(&inc->i_item)); + + inc->i_conn->c_trans->inc_free(inc); + } +} +EXPORT_SYMBOL_GPL(rds_inc_put); + +static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk, + struct rds_cong_map *map, + int delta, __be16 port) +{ + int now_congested; + + if (delta == 0) + return; + + rs->rs_rcv_bytes += delta; + now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs); + + rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d " + "now_cong %d delta %d\n", + rs, &rs->rs_bound_addr, + ntohs(rs->rs_bound_port), rs->rs_rcv_bytes, + rds_sk_rcvbuf(rs), now_congested, delta); + + /* wasn't -> am congested */ + if (!rs->rs_congested && now_congested) { + rs->rs_congested = 1; + rds_cong_set_bit(map, port); + rds_cong_queue_updates(map); + } + /* was -> aren't congested */ + /* Require more free space before reporting uncongested to prevent + bouncing cong/uncong state too often */ + else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) { + rs->rs_congested = 0; + rds_cong_clear_bit(map, port); + rds_cong_queue_updates(map); + } + + /* do nothing if no change in cong state */ +} + +/* + * Process all extension headers that come with this message. + */ +static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs) +{ + struct rds_header *hdr = &inc->i_hdr; + unsigned int pos = 0, type, len; + union { + struct rds_ext_header_version version; + struct rds_ext_header_rdma rdma; + struct rds_ext_header_rdma_dest rdma_dest; + } buffer; + + while (1) { + len = sizeof(buffer); + type = rds_message_next_extension(hdr, &pos, &buffer, &len); + if (type == RDS_EXTHDR_NONE) + break; + /* Process extension header here */ + switch (type) { + case RDS_EXTHDR_RDMA: + rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0); + break; + + case RDS_EXTHDR_RDMA_DEST: + /* We ignore the size for now. We could stash it + * somewhere and use it for error checking. */ + inc->i_rdma_cookie = rds_rdma_make_cookie( + be32_to_cpu(buffer.rdma_dest.h_rdma_rkey), + be32_to_cpu(buffer.rdma_dest.h_rdma_offset)); + + break; + } + } +} + +/* + * The transport must make sure that this is serialized against other + * rx and conn reset on this specific conn. + * + * We currently assert that only one fragmented message will be sent + * down a connection at a time. This lets us reassemble in the conn + * instead of per-flow which means that we don't have to go digging through + * flows to tear down partial reassembly progress on conn failure and + * we save flow lookup and locking for each frag arrival. It does mean + * that small messages will wait behind large ones. Fragmenting at all + * is only to reduce the memory consumption of pre-posted buffers. + * + * The caller passes in saddr and daddr instead of us getting it from the + * conn. 
This lets loopback, who only has one conn for both directions, + * tell us which roles the addrs in the conn are playing for this message. + */ +void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, + struct rds_incoming *inc, gfp_t gfp, enum km_type km) +{ + struct rds_sock *rs = NULL; + struct sock *sk; + unsigned long flags; + + inc->i_conn = conn; + inc->i_rx_jiffies = jiffies; + + rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u " + "flags 0x%x rx_jiffies %lu\n", conn, + (unsigned long long)conn->c_next_rx_seq, + inc, + (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence), + be32_to_cpu(inc->i_hdr.h_len), + be16_to_cpu(inc->i_hdr.h_sport), + be16_to_cpu(inc->i_hdr.h_dport), + inc->i_hdr.h_flags, + inc->i_rx_jiffies); + + /* + * Sequence numbers should only increase. Messages get their + * sequence number as they're queued in a sending conn. They + * can be dropped, though, if the sending socket is closed before + * they hit the wire. So sequence numbers can skip forward + * under normal operation. They can also drop back in the conn + * failover case as previously sent messages are resent down the + * new instance of a conn. We drop those, otherwise we have + * to assume that the next valid seq does not come after a + * hole in the fragment stream. + * + * The headers don't give us a way to realize if fragments of + * a message have been dropped. We assume that frags that arrive + * to a flow are part of the current message on the flow that is + * being reassembled. This means that senders can't drop messages + * from the sending conn until all their frags are sent. + * + * XXX we could spend more on the wire to get more robust failure + * detection, arguably worth it to avoid data corruption. + */ + if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq && + (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) { + rds_stats_inc(s_recv_drop_old_seq); + goto out; + } + conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1; + + if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { + rds_stats_inc(s_recv_ping); + rds_send_pong(conn, inc->i_hdr.h_sport); + goto out; + } + + rs = rds_find_bound(daddr, inc->i_hdr.h_dport); + if (!rs) { + rds_stats_inc(s_recv_drop_no_sock); + goto out; + } + + /* Process extension headers */ + rds_recv_incoming_exthdrs(inc, rs); + + /* We can be racing with rds_release() which marks the socket dead. */ + sk = rds_rs_to_sk(rs); + + /* serialize with rds_release -> sock_orphan */ + write_lock_irqsave(&rs->rs_recv_lock, flags); + if (!sock_flag(sk, SOCK_DEAD)) { + rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs); + rds_stats_inc(s_recv_queued); + rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, + be32_to_cpu(inc->i_hdr.h_len), + inc->i_hdr.h_dport); + rds_inc_addref(inc); + list_add_tail(&inc->i_item, &rs->rs_recv_queue); + __rds_wake_sk_sleep(sk); + } else { + rds_stats_inc(s_recv_drop_dead_sock); + } + write_unlock_irqrestore(&rs->rs_recv_lock, flags); + +out: + if (rs) + rds_sock_put(rs); +} +EXPORT_SYMBOL_GPL(rds_recv_incoming); + +/* + * be very careful here. This is being called as the condition in + * wait_event_*() needs to cope with being called many times. 
+ */ +static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc) +{ + unsigned long flags; + + if (!*inc) { + read_lock_irqsave(&rs->rs_recv_lock, flags); + if (!list_empty(&rs->rs_recv_queue)) { + *inc = list_entry(rs->rs_recv_queue.next, + struct rds_incoming, + i_item); + rds_inc_addref(*inc); + } + read_unlock_irqrestore(&rs->rs_recv_lock, flags); + } + + return *inc != NULL; +} + +static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc, + int drop) +{ + struct sock *sk = rds_rs_to_sk(rs); + int ret = 0; + unsigned long flags; + + write_lock_irqsave(&rs->rs_recv_lock, flags); + if (!list_empty(&inc->i_item)) { + ret = 1; + if (drop) { + /* XXX make sure this i_conn is reliable */ + rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, + -be32_to_cpu(inc->i_hdr.h_len), + inc->i_hdr.h_dport); + list_del_init(&inc->i_item); + rds_inc_put(inc); + } + } + write_unlock_irqrestore(&rs->rs_recv_lock, flags); + + rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop); + return ret; +} + +/* + * Pull errors off the error queue. + * If msghdr is NULL, we will just purge the error queue. + */ +int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) +{ + struct rds_notifier *notifier; + struct rds_rdma_notify cmsg = { 0 }; /* fill holes with zero */ + unsigned int count = 0, max_messages = ~0U; + unsigned long flags; + LIST_HEAD(copy); + int err = 0; + + + /* put_cmsg copies to user space and thus may sleep. We can't do this + * with rs_lock held, so first grab as many notifications as we can stuff + * in the user provided cmsg buffer. We don't try to copy more, to avoid + * losing notifications - except when the buffer is so small that it wouldn't + * even hold a single notification. Then we give him as much of this single + * msg as we can squeeze in, and set MSG_CTRUNC. + */ + if (msghdr) { + max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg)); + if (!max_messages) + max_messages = 1; + } + + spin_lock_irqsave(&rs->rs_lock, flags); + while (!list_empty(&rs->rs_notify_queue) && count < max_messages) { + notifier = list_entry(rs->rs_notify_queue.next, + struct rds_notifier, n_list); + list_move(¬ifier->n_list, ©); + count++; + } + spin_unlock_irqrestore(&rs->rs_lock, flags); + + if (!count) + return 0; + + while (!list_empty(©)) { + notifier = list_entry(copy.next, struct rds_notifier, n_list); + + if (msghdr) { + cmsg.user_token = notifier->n_user_token; + cmsg.status = notifier->n_status; + + err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS, + sizeof(cmsg), &cmsg); + if (err) + break; + } + + list_del_init(¬ifier->n_list); + kfree(notifier); + } + + /* If we bailed out because of an error in put_cmsg, + * we may be left with one or more notifications that we + * didn't process. Return them to the head of the list. */ + if (!list_empty(©)) { + spin_lock_irqsave(&rs->rs_lock, flags); + list_splice(©, &rs->rs_notify_queue); + spin_unlock_irqrestore(&rs->rs_lock, flags); + } + + return err; +} + +/* + * Queue a congestion notification + */ +static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr) +{ + uint64_t notify = rs->rs_cong_notify; + unsigned long flags; + int err; + + err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE, + sizeof(notify), ¬ify); + if (err) + return err; + + spin_lock_irqsave(&rs->rs_lock, flags); + rs->rs_cong_notify &= ~notify; + spin_unlock_irqrestore(&rs->rs_lock, flags); + + return 0; +} + +/* + * Receive any control messages. 
+ */ +static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg) +{ + int ret = 0; + + if (inc->i_rdma_cookie) { + ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST, + sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie); + if (ret) + return ret; + } + + return 0; +} + +int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t size, int msg_flags) +{ + struct sock *sk = sock->sk; + struct rds_sock *rs = rds_sk_to_rs(sk); + long timeo; + int ret = 0, nonblock = msg_flags & MSG_DONTWAIT; + struct sockaddr_in *sin; + struct rds_incoming *inc = NULL; + + /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */ + timeo = sock_rcvtimeo(sk, nonblock); + + rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo); + + if (msg_flags & MSG_OOB) + goto out; + + while (1) { + /* If there are pending notifications, do those - and nothing else */ + if (!list_empty(&rs->rs_notify_queue)) { + ret = rds_notify_queue_get(rs, msg); + break; + } + + if (rs->rs_cong_notify) { + ret = rds_notify_cong(rs, msg); + break; + } + + if (!rds_next_incoming(rs, &inc)) { + if (nonblock) { + ret = -EAGAIN; + break; + } + + timeo = wait_event_interruptible_timeout(*sk_sleep(sk), + (!list_empty(&rs->rs_notify_queue) || + rs->rs_cong_notify || + rds_next_incoming(rs, &inc)), timeo); + rdsdebug("recvmsg woke inc %p timeo %ld\n", inc, + timeo); + if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT) + continue; + + ret = timeo; + if (ret == 0) + ret = -ETIMEDOUT; + break; + } + + rdsdebug("copying inc %p from %pI4:%u to user\n", inc, + &inc->i_conn->c_faddr, + ntohs(inc->i_hdr.h_sport)); + ret = inc->i_conn->c_trans->inc_copy_to_user(inc, msg->msg_iov, + size); + if (ret < 0) + break; + + /* + * if the message we just copied isn't at the head of the + * recv queue then someone else raced us to return it, try + * to get the next message. + */ + if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) { + rds_inc_put(inc); + inc = NULL; + rds_stats_inc(s_recv_deliver_raced); + continue; + } + + if (ret < be32_to_cpu(inc->i_hdr.h_len)) { + if (msg_flags & MSG_TRUNC) + ret = be32_to_cpu(inc->i_hdr.h_len); + msg->msg_flags |= MSG_TRUNC; + } + + if (rds_cmsg_recv(inc, msg)) { + ret = -EFAULT; + goto out; + } + + rds_stats_inc(s_recv_delivered); + + sin = (struct sockaddr_in *)msg->msg_name; + if (sin) { + sin->sin_family = AF_INET; + sin->sin_port = inc->i_hdr.h_sport; + sin->sin_addr.s_addr = inc->i_saddr; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + } + break; + } + + if (inc) + rds_inc_put(inc); + +out: + return ret; +} + +/* + * The socket is being shut down and we're asked to drop messages that were + * queued for recvmsg. The caller has unbound the socket so the receive path + * won't queue any more incoming fragments or messages on the socket. + */ +void rds_clear_recv_queue(struct rds_sock *rs) +{ + struct sock *sk = rds_rs_to_sk(rs); + struct rds_incoming *inc, *tmp; + unsigned long flags; + + write_lock_irqsave(&rs->rs_recv_lock, flags); + list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) { + rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, + -be32_to_cpu(inc->i_hdr.h_len), + inc->i_hdr.h_dport); + list_del_init(&inc->i_item); + rds_inc_put(inc); + } + write_unlock_irqrestore(&rs->rs_recv_lock, flags); +} + +/* + * inc->i_saddr isn't used here because it is only set in the receive + * path. 
+ */ +void rds_inc_info_copy(struct rds_incoming *inc, + struct rds_info_iterator *iter, + __be32 saddr, __be32 daddr, int flip) +{ + struct rds_info_message minfo; + + minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence); + minfo.len = be32_to_cpu(inc->i_hdr.h_len); + + if (flip) { + minfo.laddr = daddr; + minfo.faddr = saddr; + minfo.lport = inc->i_hdr.h_dport; + minfo.fport = inc->i_hdr.h_sport; + } else { + minfo.laddr = saddr; + minfo.faddr = daddr; + minfo.lport = inc->i_hdr.h_sport; + minfo.fport = inc->i_hdr.h_dport; + } + + rds_info_copy(iter, &minfo, sizeof(minfo)); +} diff --git a/net/rds/send.c b/net/rds/send.c new file mode 100644 index 00000000..c803341f --- /dev/null +++ b/net/rds/send.c @@ -0,0 +1,1133 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include + +#include "rds.h" + +/* When transmitting messages in rds_send_xmit, we need to emerge from + * time to time and briefly release the CPU. Otherwise the softlock watchdog + * will kick our shin. + * Also, it seems fairer to not let one busy connection stall all the + * others. + * + * send_batch_count is the number of times we'll loop in send_xmit. Setting + * it to 0 will restore the old behavior (where we looped until we had + * drained the queue). + */ +static int send_batch_count = 64; +module_param(send_batch_count, int, 0444); +MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); + +static void rds_send_remove_from_sock(struct list_head *messages, int status); + +/* + * Reset the send state. Callers must ensure that this doesn't race with + * rds_send_xmit(). + */ +void rds_send_reset(struct rds_connection *conn) +{ + struct rds_message *rm, *tmp; + unsigned long flags; + + if (conn->c_xmit_rm) { + rm = conn->c_xmit_rm; + conn->c_xmit_rm = NULL; + /* Tell the user the RDMA op is no longer mapped by the + * transport. 
This isn't entirely true (it's flushed out + * independently) but as the connection is down, there's + * no ongoing RDMA to/from that memory */ + rds_message_unmapped(rm); + rds_message_put(rm); + } + + conn->c_xmit_sg = 0; + conn->c_xmit_hdr_off = 0; + conn->c_xmit_data_off = 0; + conn->c_xmit_atomic_sent = 0; + conn->c_xmit_rdma_sent = 0; + conn->c_xmit_data_sent = 0; + + conn->c_map_queued = 0; + + conn->c_unacked_packets = rds_sysctl_max_unacked_packets; + conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes; + + /* Mark messages as retransmissions, and move them to the send q */ + spin_lock_irqsave(&conn->c_lock, flags); + list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { + set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); + set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags); + } + list_splice_init(&conn->c_retrans, &conn->c_send_queue); + spin_unlock_irqrestore(&conn->c_lock, flags); +} + +static int acquire_in_xmit(struct rds_connection *conn) +{ + return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0; +} + +static void release_in_xmit(struct rds_connection *conn) +{ + clear_bit(RDS_IN_XMIT, &conn->c_flags); + smp_mb__after_clear_bit(); + /* + * We don't use wait_on_bit()/wake_up_bit() because our waking is in a + * hot path and finding waiters is very rare. We don't want to walk + * the system-wide hashed waitqueue buckets in the fast path only to + * almost never find waiters. + */ + if (waitqueue_active(&conn->c_waitq)) + wake_up_all(&conn->c_waitq); +} + +/* + * We're making the conscious trade-off here to only send one message + * down the connection at a time. + * Pro: + * - tx queueing is a simple fifo list + * - reassembly is optional and easily done by transports per conn + * - no per flow rx lookup at all, straight to the socket + * - less per-frag memory and wire overhead + * Con: + * - queued acks can be delayed behind large messages + * Depends: + * - small message latency is higher behind queued large messages + * - large message latency isn't starved by intervening small sends + */ +int rds_send_xmit(struct rds_connection *conn) +{ + struct rds_message *rm; + unsigned long flags; + unsigned int tmp; + struct scatterlist *sg; + int ret = 0; + LIST_HEAD(to_be_dropped); + +restart: + + /* + * sendmsg calls here after having queued its message on the send + * queue. We only have one task feeding the connection at a time. If + * another thread is already feeding the queue then we back off. This + * avoids blocking the caller and trading per-connection data between + * caches per message. + */ + if (!acquire_in_xmit(conn)) { + rds_stats_inc(s_send_lock_contention); + ret = -ENOMEM; + goto out; + } + + /* + * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT, + * we do the opposite to avoid races. + */ + if (!rds_conn_up(conn)) { + release_in_xmit(conn); + ret = 0; + goto out; + } + + if (conn->c_trans->xmit_prepare) + conn->c_trans->xmit_prepare(conn); + + /* + * spin trying to push headers and data down the connection until + * the connection doesn't make forward progress. + */ + while (1) { + + rm = conn->c_xmit_rm; + + /* + * If between sending messages, we can send a pending congestion + * map update. + */ + if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) { + rm = rds_cong_update_alloc(conn); + if (IS_ERR(rm)) { + ret = PTR_ERR(rm); + break; + } + rm->data.op_active = 1; + + conn->c_xmit_rm = rm; + } + + /* + * If not already working on one, grab the next message. 
+ * + * c_xmit_rm holds a ref while we're sending this message down + * the connection. We can use this ref while holding the + * send_sem.. rds_send_reset() is serialized with it. + */ + if (!rm) { + unsigned int len; + + spin_lock_irqsave(&conn->c_lock, flags); + + if (!list_empty(&conn->c_send_queue)) { + rm = list_entry(conn->c_send_queue.next, + struct rds_message, + m_conn_item); + rds_message_addref(rm); + + /* + * Move the message from the send queue to the retransmit + * list right away. + */ + list_move_tail(&rm->m_conn_item, &conn->c_retrans); + } + + spin_unlock_irqrestore(&conn->c_lock, flags); + + if (!rm) + break; + + /* Unfortunately, the way Infiniband deals with + * RDMA to a bad MR key is by moving the entire + * queue pair to error state. We could possibly + * recover from that, but right now we drop the + * connection. + * Therefore, we never retransmit messages with RDMA ops. + */ + if (rm->rdma.op_active && + test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { + spin_lock_irqsave(&conn->c_lock, flags); + if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) + list_move(&rm->m_conn_item, &to_be_dropped); + spin_unlock_irqrestore(&conn->c_lock, flags); + continue; + } + + /* Require an ACK every once in a while */ + len = ntohl(rm->m_inc.i_hdr.h_len); + if (conn->c_unacked_packets == 0 || + conn->c_unacked_bytes < len) { + __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); + + conn->c_unacked_packets = rds_sysctl_max_unacked_packets; + conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes; + rds_stats_inc(s_send_ack_required); + } else { + conn->c_unacked_bytes -= len; + conn->c_unacked_packets--; + } + + conn->c_xmit_rm = rm; + } + + /* The transport either sends the whole rdma or none of it */ + if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) { + rm->m_final_op = &rm->rdma; + ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); + if (ret) + break; + conn->c_xmit_rdma_sent = 1; + + /* The transport owns the mapped memory for now. + * You can't unmap it while it's on the send queue */ + set_bit(RDS_MSG_MAPPED, &rm->m_flags); + } + + if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) { + rm->m_final_op = &rm->atomic; + ret = conn->c_trans->xmit_atomic(conn, &rm->atomic); + if (ret) + break; + conn->c_xmit_atomic_sent = 1; + + /* The transport owns the mapped memory for now. + * You can't unmap it while it's on the send queue */ + set_bit(RDS_MSG_MAPPED, &rm->m_flags); + } + + /* + * A number of cases require an RDS header to be sent + * even if there is no data. + * We permit 0-byte sends; rds-ping depends on this. + * However, if there are exclusively attached silent ops, + * we skip the hdr/data send, to enable silent operation.
+ */ + if (rm->data.op_nents == 0) { + int ops_present; + int all_ops_are_silent = 1; + + ops_present = (rm->atomic.op_active || rm->rdma.op_active); + if (rm->atomic.op_active && !rm->atomic.op_silent) + all_ops_are_silent = 0; + if (rm->rdma.op_active && !rm->rdma.op_silent) + all_ops_are_silent = 0; + + if (ops_present && all_ops_are_silent + && !rm->m_rdma_cookie) + rm->data.op_active = 0; + } + + if (rm->data.op_active && !conn->c_xmit_data_sent) { + rm->m_final_op = &rm->data; + ret = conn->c_trans->xmit(conn, rm, + conn->c_xmit_hdr_off, + conn->c_xmit_sg, + conn->c_xmit_data_off); + if (ret <= 0) + break; + + if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) { + tmp = min_t(int, ret, + sizeof(struct rds_header) - + conn->c_xmit_hdr_off); + conn->c_xmit_hdr_off += tmp; + ret -= tmp; + } + + sg = &rm->data.op_sg[conn->c_xmit_sg]; + while (ret) { + tmp = min_t(int, ret, sg->length - + conn->c_xmit_data_off); + conn->c_xmit_data_off += tmp; + ret -= tmp; + if (conn->c_xmit_data_off == sg->length) { + conn->c_xmit_data_off = 0; + sg++; + conn->c_xmit_sg++; + BUG_ON(ret != 0 && + conn->c_xmit_sg == rm->data.op_nents); + } + } + + if (conn->c_xmit_hdr_off == sizeof(struct rds_header) && + (conn->c_xmit_sg == rm->data.op_nents)) + conn->c_xmit_data_sent = 1; + } + + /* + * A rm will only take multiple times through this loop + * if there is a data op. Thus, if the data is sent (or there was + * none), then we're done with the rm. + */ + if (!rm->data.op_active || conn->c_xmit_data_sent) { + conn->c_xmit_rm = NULL; + conn->c_xmit_sg = 0; + conn->c_xmit_hdr_off = 0; + conn->c_xmit_data_off = 0; + conn->c_xmit_rdma_sent = 0; + conn->c_xmit_atomic_sent = 0; + conn->c_xmit_data_sent = 0; + + rds_message_put(rm); + } + } + + if (conn->c_trans->xmit_complete) + conn->c_trans->xmit_complete(conn); + + release_in_xmit(conn); + + /* Nuke any messages we decided not to retransmit. */ + if (!list_empty(&to_be_dropped)) { + /* irqs on here, so we can put(), unlike above */ + list_for_each_entry(rm, &to_be_dropped, m_conn_item) + rds_message_put(rm); + rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); + } + + /* + * Other senders can queue a message after we last test the send queue + * but before we clear RDS_IN_XMIT. In that case they'd back off and + * not try and send their newly queued message. We need to check the + * send queue after having cleared RDS_IN_XMIT so that their message + * doesn't get stuck on the send queue. + * + * If the transport cannot continue (i.e ret != 0), then it must + * call us when more room is available, such as from the tx + * completion handler. + */ + if (ret == 0) { + smp_mb(); + if (!list_empty(&conn->c_send_queue)) { + rds_stats_inc(s_send_lock_queue_raced); + goto restart; + } + } +out: + return ret; +} + +static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm) +{ + u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len); + + assert_spin_locked(&rs->rs_lock); + + BUG_ON(rs->rs_snd_bytes < len); + rs->rs_snd_bytes -= len; + + if (rs->rs_snd_bytes == 0) + rds_stats_inc(s_send_queue_empty); +} + +static inline int rds_send_is_acked(struct rds_message *rm, u64 ack, + is_acked_func is_acked) +{ + if (is_acked) + return is_acked(rm, ack); + return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack; +} + +/* + * This is pretty similar to what happens below in the ACK + * handling code - except that we call here as soon as we get + * the IB send completion on the RDMA op and the accompanying + * message. 
+ */ +void rds_rdma_send_complete(struct rds_message *rm, int status) +{ + struct rds_sock *rs = NULL; + struct rm_rdma_op *ro; + struct rds_notifier *notifier; + unsigned long flags; + + spin_lock_irqsave(&rm->m_rs_lock, flags); + + ro = &rm->rdma; + if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && + ro->op_active && ro->op_notify && ro->op_notifier) { + notifier = ro->op_notifier; + rs = rm->m_rs; + sock_hold(rds_rs_to_sk(rs)); + + notifier->n_status = status; + spin_lock(&rs->rs_lock); + list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); + spin_unlock(&rs->rs_lock); + + ro->op_notifier = NULL; + } + + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + + if (rs) { + rds_wake_sk_sleep(rs); + sock_put(rds_rs_to_sk(rs)); + } +} +EXPORT_SYMBOL_GPL(rds_rdma_send_complete); + +/* + * Just like above, except looks at atomic op + */ +void rds_atomic_send_complete(struct rds_message *rm, int status) +{ + struct rds_sock *rs = NULL; + struct rm_atomic_op *ao; + struct rds_notifier *notifier; + unsigned long flags; + + spin_lock_irqsave(&rm->m_rs_lock, flags); + + ao = &rm->atomic; + if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) + && ao->op_active && ao->op_notify && ao->op_notifier) { + notifier = ao->op_notifier; + rs = rm->m_rs; + sock_hold(rds_rs_to_sk(rs)); + + notifier->n_status = status; + spin_lock(&rs->rs_lock); + list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); + spin_unlock(&rs->rs_lock); + + ao->op_notifier = NULL; + } + + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + + if (rs) { + rds_wake_sk_sleep(rs); + sock_put(rds_rs_to_sk(rs)); + } +} +EXPORT_SYMBOL_GPL(rds_atomic_send_complete); + +/* + * This is the same as rds_rdma_send_complete except we + * don't do any locking - we have all the ingredients (message, + * socket, socket lock) and can just move the notifier. + */ +static inline void +__rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) +{ + struct rm_rdma_op *ro; + struct rm_atomic_op *ao; + + ro = &rm->rdma; + if (ro->op_active && ro->op_notify && ro->op_notifier) { + ro->op_notifier->n_status = status; + list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue); + ro->op_notifier = NULL; + } + + ao = &rm->atomic; + if (ao->op_active && ao->op_notify && ao->op_notifier) { + ao->op_notifier->n_status = status; + list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue); + ao->op_notifier = NULL; + } + + /* No need to wake the app - caller does this */ +} + +/* + * This is called from the IB send completion when we detect + * a RDMA operation that failed with remote access error. + * So speed is not an issue here. + */ +struct rds_message *rds_send_get_message(struct rds_connection *conn, + struct rm_rdma_op *op) +{ + struct rds_message *rm, *tmp, *found = NULL; + unsigned long flags; + + spin_lock_irqsave(&conn->c_lock, flags); + + list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { + if (&rm->rdma == op) { + atomic_inc(&rm->m_refcount); + found = rm; + goto out; + } + } + + list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { + if (&rm->rdma == op) { + atomic_inc(&rm->m_refcount); + found = rm; + break; + } + } + +out: + spin_unlock_irqrestore(&conn->c_lock, flags); + + return found; +} +EXPORT_SYMBOL_GPL(rds_send_get_message); + +/* + * This removes messages from the socket's list if they're on it. The list + * argument must be private to the caller, we must be able to modify it + * without locks. The messages must have a reference held for their + * position on the list. 
This function will drop that reference after + * removing the messages from the 'messages' list regardless of if it found + * the messages on the socket list or not. + */ +static void rds_send_remove_from_sock(struct list_head *messages, int status) +{ + unsigned long flags; + struct rds_sock *rs = NULL; + struct rds_message *rm; + + while (!list_empty(messages)) { + int was_on_sock = 0; + + rm = list_entry(messages->next, struct rds_message, + m_conn_item); + list_del_init(&rm->m_conn_item); + + /* + * If we see this flag cleared then we're *sure* that someone + * else beat us to removing it from the sock. If we race + * with their flag update we'll get the lock and then really + * see that the flag has been cleared. + * + * The message spinlock makes sure nobody clears rm->m_rs + * while we're messing with it. It does not prevent the + * message from being removed from the socket, though. + */ + spin_lock_irqsave(&rm->m_rs_lock, flags); + if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) + goto unlock_and_drop; + + if (rs != rm->m_rs) { + if (rs) { + rds_wake_sk_sleep(rs); + sock_put(rds_rs_to_sk(rs)); + } + rs = rm->m_rs; + sock_hold(rds_rs_to_sk(rs)); + } + spin_lock(&rs->rs_lock); + + if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { + struct rm_rdma_op *ro = &rm->rdma; + struct rds_notifier *notifier; + + list_del_init(&rm->m_sock_item); + rds_send_sndbuf_remove(rs, rm); + + if (ro->op_active && ro->op_notifier && + (ro->op_notify || (ro->op_recverr && status))) { + notifier = ro->op_notifier; + list_add_tail(¬ifier->n_list, + &rs->rs_notify_queue); + if (!notifier->n_status) + notifier->n_status = status; + rm->rdma.op_notifier = NULL; + } + was_on_sock = 1; + rm->m_rs = NULL; + } + spin_unlock(&rs->rs_lock); + +unlock_and_drop: + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + rds_message_put(rm); + if (was_on_sock) + rds_message_put(rm); + } + + if (rs) { + rds_wake_sk_sleep(rs); + sock_put(rds_rs_to_sk(rs)); + } +} + +/* + * Transports call here when they've determined that the receiver queued + * messages up to, and including, the given sequence number. Messages are + * moved to the retrans queue when rds_send_xmit picks them off the send + * queue. This means that in the TCP case, the message may not have been + * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked + * checks the RDS_MSG_HAS_ACK_SEQ bit. + * + * XXX It's not clear to me how this is safely serialized with socket + * destruction. Maybe it should bail if it sees SOCK_DEAD. 
+ */ +void rds_send_drop_acked(struct rds_connection *conn, u64 ack, + is_acked_func is_acked) +{ + struct rds_message *rm, *tmp; + unsigned long flags; + LIST_HEAD(list); + + spin_lock_irqsave(&conn->c_lock, flags); + + list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { + if (!rds_send_is_acked(rm, ack, is_acked)) + break; + + list_move(&rm->m_conn_item, &list); + clear_bit(RDS_MSG_ON_CONN, &rm->m_flags); + } + + /* order flag updates with spin locks */ + if (!list_empty(&list)) + smp_mb__after_clear_bit(); + + spin_unlock_irqrestore(&conn->c_lock, flags); + + /* now remove the messages from the sock list as needed */ + rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS); +} +EXPORT_SYMBOL_GPL(rds_send_drop_acked); + +void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) +{ + struct rds_message *rm, *tmp; + struct rds_connection *conn; + unsigned long flags; + LIST_HEAD(list); + + /* get all the messages we're dropping under the rs lock */ + spin_lock_irqsave(&rs->rs_lock, flags); + + list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) { + if (dest && (dest->sin_addr.s_addr != rm->m_daddr || + dest->sin_port != rm->m_inc.i_hdr.h_dport)) + continue; + + list_move(&rm->m_sock_item, &list); + rds_send_sndbuf_remove(rs, rm); + clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags); + } + + /* order flag updates with the rs lock */ + smp_mb__after_clear_bit(); + + spin_unlock_irqrestore(&rs->rs_lock, flags); + + if (list_empty(&list)) + return; + + /* Remove the messages from the conn */ + list_for_each_entry(rm, &list, m_sock_item) { + + conn = rm->m_inc.i_conn; + + spin_lock_irqsave(&conn->c_lock, flags); + /* + * Maybe someone else beat us to removing rm from the conn. + * If we race with their flag update we'll get the lock and + * then really see that the flag has been cleared. + */ + if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { + spin_unlock_irqrestore(&conn->c_lock, flags); + continue; + } + list_del_init(&rm->m_conn_item); + spin_unlock_irqrestore(&conn->c_lock, flags); + + /* + * Couldn't grab m_rs_lock in top loop (lock ordering), + * but we can now. + */ + spin_lock_irqsave(&rm->m_rs_lock, flags); + + spin_lock(&rs->rs_lock); + __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); + spin_unlock(&rs->rs_lock); + + rm->m_rs = NULL; + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + + rds_message_put(rm); + } + + rds_wake_sk_sleep(rs); + + while (!list_empty(&list)) { + rm = list_entry(list.next, struct rds_message, m_sock_item); + list_del_init(&rm->m_sock_item); + + rds_message_wait(rm); + rds_message_put(rm); + } +} + +/* + * we only want this to fire once so we use the callers 'queued'. It's + * possible that another thread can race with us and remove the + * message from the flow with RDS_CANCEL_SENT_TO. + */ +static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn, + struct rds_message *rm, __be16 sport, + __be16 dport, int *queued) +{ + unsigned long flags; + u32 len; + + if (*queued) + goto out; + + len = be32_to_cpu(rm->m_inc.i_hdr.h_len); + + /* this is the only place which holds both the socket's rs_lock + * and the connection's c_lock */ + spin_lock_irqsave(&rs->rs_lock, flags); + + /* + * If there is a little space in sndbuf, we don't queue anything, + * and userspace gets -EAGAIN. But poll() indicates there's send + * room. This can lead to bad behavior (spinning) if snd_bytes isn't + * freed up by incoming acks. 
So we check the *old* value of + * rs_snd_bytes here to allow the last msg to exceed the buffer, + * and poll() now knows no more data can be sent. + */ + if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) { + rs->rs_snd_bytes += len; + + /* let recv side know we are close to send space exhaustion. + * This is probably not the optimal way to do it, as this + * means we set the flag on *all* messages as soon as our + * throughput hits a certain threshold. + */ + if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2) + __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); + + list_add_tail(&rm->m_sock_item, &rs->rs_send_queue); + set_bit(RDS_MSG_ON_SOCK, &rm->m_flags); + rds_message_addref(rm); + rm->m_rs = rs; + + /* The code ordering is a little weird, but we're + trying to minimize the time we hold c_lock */ + rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0); + rm->m_inc.i_conn = conn; + rds_message_addref(rm); + + spin_lock(&conn->c_lock); + rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++); + list_add_tail(&rm->m_conn_item, &conn->c_send_queue); + set_bit(RDS_MSG_ON_CONN, &rm->m_flags); + spin_unlock(&conn->c_lock); + + rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n", + rm, len, rs, rs->rs_snd_bytes, + (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence)); + + *queued = 1; + } + + spin_unlock_irqrestore(&rs->rs_lock, flags); +out: + return *queued; +} + +/* + * rds_message is getting to be quite complicated, and we'd like to allocate + * it all in one go. This figures out how big it needs to be up front. + */ +static int rds_rm_size(struct msghdr *msg, int data_len) +{ + struct cmsghdr *cmsg; + int size = 0; + int cmsg_groups = 0; + int retval; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_RDS) + continue; + + switch (cmsg->cmsg_type) { + case RDS_CMSG_RDMA_ARGS: + cmsg_groups |= 1; + retval = rds_rdma_extra_size(CMSG_DATA(cmsg)); + if (retval < 0) + return retval; + size += retval; + + break; + + case RDS_CMSG_RDMA_DEST: + case RDS_CMSG_RDMA_MAP: + cmsg_groups |= 2; + /* these are valid but do not add any size */ + break; + + case RDS_CMSG_ATOMIC_CSWP: + case RDS_CMSG_ATOMIC_FADD: + case RDS_CMSG_MASKED_ATOMIC_CSWP: + case RDS_CMSG_MASKED_ATOMIC_FADD: + cmsg_groups |= 1; + size += sizeof(struct scatterlist); + break; + + default: + return -EINVAL; + } + + } + + size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist); + + /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */ + if (cmsg_groups == 3) + return -EINVAL; + + return size; +} + +static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, + struct msghdr *msg, int *allocated_mr) +{ + struct cmsghdr *cmsg; + int ret = 0; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_RDS) + continue; + + /* As a side effect, RDMA_DEST and RDMA_MAP will set + * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr.
+ */ + switch (cmsg->cmsg_type) { + case RDS_CMSG_RDMA_ARGS: + ret = rds_cmsg_rdma_args(rs, rm, cmsg); + break; + + case RDS_CMSG_RDMA_DEST: + ret = rds_cmsg_rdma_dest(rs, rm, cmsg); + break; + + case RDS_CMSG_RDMA_MAP: + ret = rds_cmsg_rdma_map(rs, rm, cmsg); + if (!ret) + *allocated_mr = 1; + break; + case RDS_CMSG_ATOMIC_CSWP: + case RDS_CMSG_ATOMIC_FADD: + case RDS_CMSG_MASKED_ATOMIC_CSWP: + case RDS_CMSG_MASKED_ATOMIC_FADD: + ret = rds_cmsg_atomic(rs, rm, cmsg); + break; + + default: + return -EINVAL; + } + + if (ret) + break; + } + + return ret; +} + +int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t payload_len) +{ + struct sock *sk = sock->sk; + struct rds_sock *rs = rds_sk_to_rs(sk); + struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; + __be32 daddr; + __be16 dport; + struct rds_message *rm = NULL; + struct rds_connection *conn; + int ret = 0; + int queued = 0, allocated_mr = 0; + int nonblock = msg->msg_flags & MSG_DONTWAIT; + long timeo = sock_sndtimeo(sk, nonblock); + + /* Mirror Linux UDP mirror of BSD error message compatibility */ + /* XXX: Perhaps MSG_MORE someday */ + if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (msg->msg_namelen) { + /* XXX fail non-unicast destination IPs? */ + if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) { + ret = -EINVAL; + goto out; + } + daddr = usin->sin_addr.s_addr; + dport = usin->sin_port; + } else { + /* We only care about consistency with ->connect() */ + lock_sock(sk); + daddr = rs->rs_conn_addr; + dport = rs->rs_conn_port; + release_sock(sk); + } + + /* racing with another thread binding seems ok here */ + if (daddr == 0 || rs->rs_bound_addr == 0) { + ret = -ENOTCONN; /* XXX not a great errno */ + goto out; + } + + /* size of rm including all sgs */ + ret = rds_rm_size(msg, payload_len); + if (ret < 0) + goto out; + + rm = rds_message_alloc(ret, GFP_KERNEL); + if (!rm) { + ret = -ENOMEM; + goto out; + } + + /* Attach data to the rm */ + if (payload_len) { + rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE)); + if (!rm->data.op_sg) { + ret = -ENOMEM; + goto out; + } + ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len); + if (ret) + goto out; + } + rm->data.op_active = 1; + + rm->m_daddr = daddr; + + /* rds_conn_create has a spinlock that runs with IRQ off. + * Caching the conn in the socket helps a lot. */ + if (rs->rs_conn && rs->rs_conn->c_faddr == daddr) + conn = rs->rs_conn; + else { + conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr, + rs->rs_transport, + sock->sk->sk_allocation); + if (IS_ERR(conn)) { + ret = PTR_ERR(conn); + goto out; + } + rs->rs_conn = conn; + } + + /* Parse any control messages the user may have included. 
*/ + ret = rds_cmsg_send(rs, rm, msg, &allocated_mr); + if (ret) + goto out; + + if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) { + if (printk_ratelimit()) + printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", + &rm->rdma, conn->c_trans->xmit_rdma); + ret = -EOPNOTSUPP; + goto out; + } + + if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) { + if (printk_ratelimit()) + printk(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n", + &rm->atomic, conn->c_trans->xmit_atomic); + ret = -EOPNOTSUPP; + goto out; + } + + rds_conn_connect_if_down(conn); + + ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); + if (ret) { + rs->rs_seen_congestion = 1; + goto out; + } + + while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port, + dport, &queued)) { + rds_stats_inc(s_send_queue_full); + /* XXX make sure this is reasonable */ + if (payload_len > rds_sk_sndbuf(rs)) { + ret = -EMSGSIZE; + goto out; + } + if (nonblock) { + ret = -EAGAIN; + goto out; + } + + timeo = wait_event_interruptible_timeout(*sk_sleep(sk), + rds_send_queue_rm(rs, conn, rm, + rs->rs_bound_port, + dport, + &queued), + timeo); + rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo); + if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT) + continue; + + ret = timeo; + if (ret == 0) + ret = -ETIMEDOUT; + goto out; + } + + /* + * By now we've committed to the send. We reuse rds_send_worker() + * to retry sends in the rds thread if the transport asks us to. + */ + rds_stats_inc(s_send_queued); + + if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) + rds_send_xmit(conn); + + rds_message_put(rm); + return payload_len; + +out: + /* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly. + * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN + * or in any other way, we need to destroy the MR again */ + if (allocated_mr) + rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1); + + if (rm) + rds_message_put(rm); + return ret; +} + +/* + * Reply to a ping packet. + */ +int +rds_send_pong(struct rds_connection *conn, __be16 dport) +{ + struct rds_message *rm; + unsigned long flags; + int ret = 0; + + rm = rds_message_alloc(0, GFP_ATOMIC); + if (!rm) { + ret = -ENOMEM; + goto out; + } + + rm->m_daddr = conn->c_faddr; + rm->data.op_active = 1; + + rds_conn_connect_if_down(conn); + + ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL); + if (ret) + goto out; + + spin_lock_irqsave(&conn->c_lock, flags); + list_add_tail(&rm->m_conn_item, &conn->c_send_queue); + set_bit(RDS_MSG_ON_CONN, &rm->m_flags); + rds_message_addref(rm); + rm->m_inc.i_conn = conn; + + rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport, + conn->c_next_tx_seq); + conn->c_next_tx_seq++; + spin_unlock_irqrestore(&conn->c_lock, flags); + + rds_stats_inc(s_send_queued); + rds_stats_inc(s_send_pong); + + if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) + rds_send_xmit(conn); + + rds_message_put(rm); + return 0; + +out: + if (rm) + rds_message_put(rm); + return ret; +} diff --git a/net/rds/stats.c b/net/rds/stats.c new file mode 100644 index 00000000..10c759cc --- /dev/null +++ b/net/rds/stats.c @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" + +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); +EXPORT_PER_CPU_SYMBOL_GPL(rds_stats); + +/* :.,$s/unsigned long\>.*\= sizeof(ctr.name)); + strncpy(ctr.name, names[i], sizeof(ctr.name) - 1); + ctr.value = values[i]; + + rds_info_copy(iter, &ctr, sizeof(ctr)); + } +} +EXPORT_SYMBOL_GPL(rds_stats_info_copy); + +/* + * This gives global counters across all the transports. The strings + * are copied in so that the tool doesn't need knowledge of the specific + * stats that we're exporting. Some are pretty implementation dependent + * and may change over time. That doesn't stop them from being useful. + * + * This is the only function in the chain that knows about the byte granular + * length in userspace. It converts it to number of stat entries that the + * rest of the functions operate in. + */ +static void rds_stats_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds_statistics stats = {0, }; + uint64_t *src; + uint64_t *sum; + size_t i; + int cpu; + unsigned int avail; + + avail = len / sizeof(struct rds_info_counter); + + if (avail < ARRAY_SIZE(rds_stat_names)) { + avail = 0; + goto trans; + } + + for_each_online_cpu(cpu) { + src = (uint64_t *)&(per_cpu(rds_stats, cpu)); + sum = (uint64_t *)&stats; + for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) + *(sum++) += *(src++); + } + + rds_stats_info_copy(iter, (uint64_t *)&stats, rds_stat_names, + ARRAY_SIZE(rds_stat_names)); + avail -= ARRAY_SIZE(rds_stat_names); + +trans: + lens->each = sizeof(struct rds_info_counter); + lens->nr = rds_trans_stats_info_copy(iter, avail) + + ARRAY_SIZE(rds_stat_names); +} + +void rds_stats_exit(void) +{ + rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info); +} + +int rds_stats_init(void) +{ + rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info); + return 0; +} diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c new file mode 100644 index 00000000..25ad0c77 --- /dev/null +++ b/net/rds/sysctl.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" + +static struct ctl_table_header *rds_sysctl_reg_table; + +static unsigned long rds_sysctl_reconnect_min = 1; +static unsigned long rds_sysctl_reconnect_max = ~0UL; + +unsigned long rds_sysctl_reconnect_min_jiffies; +unsigned long rds_sysctl_reconnect_max_jiffies = HZ; + +unsigned int rds_sysctl_max_unacked_packets = 8; +unsigned int rds_sysctl_max_unacked_bytes = (16 << 20); + +unsigned int rds_sysctl_ping_enable = 1; + +static ctl_table rds_sysctl_rds_table[] = { + { + .procname = "reconnect_min_delay_ms", + .data = &rds_sysctl_reconnect_min_jiffies, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, + .extra1 = &rds_sysctl_reconnect_min, + .extra2 = &rds_sysctl_reconnect_max_jiffies, + }, + { + .procname = "reconnect_max_delay_ms", + .data = &rds_sysctl_reconnect_max_jiffies, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, + .extra1 = &rds_sysctl_reconnect_min_jiffies, + .extra2 = &rds_sysctl_reconnect_max, + }, + { + .procname = "max_unacked_packets", + .data = &rds_sysctl_max_unacked_packets, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "max_unacked_bytes", + .data = &rds_sysctl_max_unacked_bytes, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "ping_enable", + .data = &rds_sysctl_ping_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static struct ctl_path rds_sysctl_path[] = { + { .procname = "net", }, + { .procname = "rds", }, + { } +}; + + +void rds_sysctl_exit(void) +{ + if (rds_sysctl_reg_table) + unregister_sysctl_table(rds_sysctl_reg_table); +} + +int rds_sysctl_init(void) +{ + rds_sysctl_reconnect_min = msecs_to_jiffies(1); + rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min; + + rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table); + if (!rds_sysctl_reg_table) + return -ENOMEM; + return 0; +} diff --git a/net/rds/tcp.c b/net/rds/tcp.c new file mode 100644 index 00000000..8e0a3200 --- /dev/null +++ b/net/rds/tcp.c @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2006 Oracle. 
All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include + +#include "rds.h" +#include "tcp.h" + +/* only for info exporting */ +static DEFINE_SPINLOCK(rds_tcp_tc_list_lock); +static LIST_HEAD(rds_tcp_tc_list); +static unsigned int rds_tcp_tc_count; + +/* Track rds_tcp_connection structs so they can be cleaned up */ +static DEFINE_SPINLOCK(rds_tcp_conn_lock); +static LIST_HEAD(rds_tcp_conn_list); + +static struct kmem_cache *rds_tcp_conn_slab; + +#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024) + +/* doing it this way avoids calling tcp_sk() */ +void rds_tcp_nonagle(struct socket *sock) +{ + mm_segment_t oldfs = get_fs(); + int val = 1; + + set_fs(KERNEL_DS); + sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, (char __user *)&val, + sizeof(val)); + set_fs(oldfs); +} + +void rds_tcp_tune(struct socket *sock) +{ + struct sock *sk = sock->sk; + + rds_tcp_nonagle(sock); + + /* + * We're trying to saturate gigabit with the default, + * see svc_sock_setbufsize(). + */ + lock_sock(sk); + sk->sk_sndbuf = RDS_TCP_DEFAULT_BUFSIZE; + sk->sk_rcvbuf = RDS_TCP_DEFAULT_BUFSIZE; + sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK; + release_sock(sk); +} + +u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc) +{ + return tcp_sk(tc->t_sock->sk)->snd_nxt; +} + +u32 rds_tcp_snd_una(struct rds_tcp_connection *tc) +{ + return tcp_sk(tc->t_sock->sk)->snd_una; +} + +void rds_tcp_restore_callbacks(struct socket *sock, + struct rds_tcp_connection *tc) +{ + rdsdebug("restoring sock %p callbacks from tc %p\n", sock, tc); + write_lock_bh(&sock->sk->sk_callback_lock); + + /* done under the callback_lock to serialize with write_space */ + spin_lock(&rds_tcp_tc_list_lock); + list_del_init(&tc->t_list_item); + rds_tcp_tc_count--; + spin_unlock(&rds_tcp_tc_list_lock); + + tc->t_sock = NULL; + + sock->sk->sk_write_space = tc->t_orig_write_space; + sock->sk->sk_data_ready = tc->t_orig_data_ready; + sock->sk->sk_state_change = tc->t_orig_state_change; + sock->sk->sk_user_data = NULL; + + write_unlock_bh(&sock->sk->sk_callback_lock); +} + +/* + * This is the only path that sets tc->t_sock. Send and receive trust that + * it is set. 
The RDS_CONN_CONNECTED bit protects those paths from being + * called while it isn't set. + */ +void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn) +{ + struct rds_tcp_connection *tc = conn->c_transport_data; + + rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc); + write_lock_bh(&sock->sk->sk_callback_lock); + + /* done under the callback_lock to serialize with write_space */ + spin_lock(&rds_tcp_tc_list_lock); + list_add_tail(&tc->t_list_item, &rds_tcp_tc_list); + rds_tcp_tc_count++; + spin_unlock(&rds_tcp_tc_list_lock); + + /* accepted sockets need our listen data ready undone */ + if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready) + sock->sk->sk_data_ready = sock->sk->sk_user_data; + + tc->t_sock = sock; + tc->conn = conn; + tc->t_orig_data_ready = sock->sk->sk_data_ready; + tc->t_orig_write_space = sock->sk->sk_write_space; + tc->t_orig_state_change = sock->sk->sk_state_change; + + sock->sk->sk_user_data = conn; + sock->sk->sk_data_ready = rds_tcp_data_ready; + sock->sk->sk_write_space = rds_tcp_write_space; + sock->sk->sk_state_change = rds_tcp_state_change; + + write_unlock_bh(&sock->sk->sk_callback_lock); +} + +static void rds_tcp_tc_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds_info_tcp_socket tsinfo; + struct rds_tcp_connection *tc; + unsigned long flags; + struct sockaddr_in sin; + int sinlen; + + spin_lock_irqsave(&rds_tcp_tc_list_lock, flags); + + if (len / sizeof(tsinfo) < rds_tcp_tc_count) + goto out; + + list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) { + + sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 0); + tsinfo.local_addr = sin.sin_addr.s_addr; + tsinfo.local_port = sin.sin_port; + sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 1); + tsinfo.peer_addr = sin.sin_addr.s_addr; + tsinfo.peer_port = sin.sin_port; + + tsinfo.hdr_rem = tc->t_tinc_hdr_rem; + tsinfo.data_rem = tc->t_tinc_data_rem; + tsinfo.last_sent_nxt = tc->t_last_sent_nxt; + tsinfo.last_expected_una = tc->t_last_expected_una; + tsinfo.last_seen_una = tc->t_last_seen_una; + + rds_info_copy(iter, &tsinfo, sizeof(tsinfo)); + } + +out: + lens->nr = rds_tcp_tc_count; + lens->each = sizeof(tsinfo); + + spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags); +} + +static int rds_tcp_laddr_check(__be32 addr) +{ + if (inet_addr_type(&init_net, addr) == RTN_LOCAL) + return 0; + return -EADDRNOTAVAIL; +} + +static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) +{ + struct rds_tcp_connection *tc; + + tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); + if (!tc) + return -ENOMEM; + + tc->t_sock = NULL; + tc->t_tinc = NULL; + tc->t_tinc_hdr_rem = sizeof(struct rds_header); + tc->t_tinc_data_rem = 0; + + conn->c_transport_data = tc; + + spin_lock_irq(&rds_tcp_conn_lock); + list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list); + spin_unlock_irq(&rds_tcp_conn_lock); + + rdsdebug("alloced tc %p\n", conn->c_transport_data); + return 0; +} + +static void rds_tcp_conn_free(void *arg) +{ + struct rds_tcp_connection *tc = arg; + unsigned long flags; + rdsdebug("freeing tc %p\n", tc); + + spin_lock_irqsave(&rds_tcp_conn_lock, flags); + list_del(&tc->t_tcp_node); + spin_unlock_irqrestore(&rds_tcp_conn_lock, flags); + + kmem_cache_free(rds_tcp_conn_slab, tc); +} + +static void rds_tcp_destroy_conns(void) +{ + struct rds_tcp_connection *tc, *_tc; + LIST_HEAD(tmp_list); + + /* avoid calling conn_destroy with irqs off */ + spin_lock_irq(&rds_tcp_conn_lock); + 
list_splice(&rds_tcp_conn_list, &tmp_list); + INIT_LIST_HEAD(&rds_tcp_conn_list); + spin_unlock_irq(&rds_tcp_conn_lock); + + list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) { + if (tc->conn->c_passive) + rds_conn_destroy(tc->conn->c_passive); + rds_conn_destroy(tc->conn); + } +} + +static void rds_tcp_exit(void) +{ + rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); + rds_tcp_listen_stop(); + rds_tcp_destroy_conns(); + rds_trans_unregister(&rds_tcp_transport); + rds_tcp_recv_exit(); + kmem_cache_destroy(rds_tcp_conn_slab); +} +module_exit(rds_tcp_exit); + +struct rds_transport rds_tcp_transport = { + .laddr_check = rds_tcp_laddr_check, + .xmit_prepare = rds_tcp_xmit_prepare, + .xmit_complete = rds_tcp_xmit_complete, + .xmit = rds_tcp_xmit, + .recv = rds_tcp_recv, + .conn_alloc = rds_tcp_conn_alloc, + .conn_free = rds_tcp_conn_free, + .conn_connect = rds_tcp_conn_connect, + .conn_shutdown = rds_tcp_conn_shutdown, + .inc_copy_to_user = rds_tcp_inc_copy_to_user, + .inc_free = rds_tcp_inc_free, + .stats_info_copy = rds_tcp_stats_info_copy, + .exit = rds_tcp_exit, + .t_owner = THIS_MODULE, + .t_name = "tcp", + .t_type = RDS_TRANS_TCP, + .t_prefer_loopback = 1, +}; + +static int rds_tcp_init(void) +{ + int ret; + + rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection", + sizeof(struct rds_tcp_connection), + 0, 0, NULL); + if (!rds_tcp_conn_slab) { + ret = -ENOMEM; + goto out; + } + + ret = rds_tcp_recv_init(); + if (ret) + goto out_slab; + + ret = rds_trans_register(&rds_tcp_transport); + if (ret) + goto out_recv; + + ret = rds_tcp_listen_init(); + if (ret) + goto out_register; + + rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); + + goto out; + +out_register: + rds_trans_unregister(&rds_tcp_transport); +out_recv: + rds_tcp_recv_exit(); +out_slab: + kmem_cache_destroy(rds_tcp_conn_slab); +out: + return ret; +} +module_init(rds_tcp_init); + +MODULE_AUTHOR("Oracle Corporation "); +MODULE_DESCRIPTION("RDS: TCP transport"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/net/rds/tcp.h b/net/rds/tcp.h new file mode 100644 index 00000000..9cf2927d --- /dev/null +++ b/net/rds/tcp.h @@ -0,0 +1,88 @@ +#ifndef _RDS_TCP_H +#define _RDS_TCP_H + +#define RDS_TCP_PORT 16385 + +struct rds_tcp_incoming { + struct rds_incoming ti_inc; + struct sk_buff_head ti_skb_list; +}; + +struct rds_tcp_connection { + + struct list_head t_tcp_node; + struct rds_connection *conn; + struct socket *t_sock; + void *t_orig_write_space; + void *t_orig_data_ready; + void *t_orig_state_change; + + struct rds_tcp_incoming *t_tinc; + size_t t_tinc_hdr_rem; + size_t t_tinc_data_rem; + + /* XXX error report? 
*/ + struct work_struct t_conn_w; + struct work_struct t_send_w; + struct work_struct t_down_w; + struct work_struct t_recv_w; + + /* for info exporting only */ + struct list_head t_list_item; + u32 t_last_sent_nxt; + u32 t_last_expected_una; + u32 t_last_seen_una; +}; + +struct rds_tcp_statistics { + uint64_t s_tcp_data_ready_calls; + uint64_t s_tcp_write_space_calls; + uint64_t s_tcp_sndbuf_full; + uint64_t s_tcp_connect_raced; + uint64_t s_tcp_listen_closed_stale; +}; + +/* tcp.c */ +void rds_tcp_tune(struct socket *sock); +void rds_tcp_nonagle(struct socket *sock); +void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn); +void rds_tcp_restore_callbacks(struct socket *sock, + struct rds_tcp_connection *tc); +u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc); +u32 rds_tcp_snd_una(struct rds_tcp_connection *tc); +u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq); +extern struct rds_transport rds_tcp_transport; + +/* tcp_connect.c */ +int rds_tcp_conn_connect(struct rds_connection *conn); +void rds_tcp_conn_shutdown(struct rds_connection *conn); +void rds_tcp_state_change(struct sock *sk); + +/* tcp_listen.c */ +int rds_tcp_listen_init(void); +void rds_tcp_listen_stop(void); +void rds_tcp_listen_data_ready(struct sock *sk, int bytes); + +/* tcp_recv.c */ +int rds_tcp_recv_init(void); +void rds_tcp_recv_exit(void); +void rds_tcp_data_ready(struct sock *sk, int bytes); +int rds_tcp_recv(struct rds_connection *conn); +void rds_tcp_inc_free(struct rds_incoming *inc); +int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, + size_t size); + +/* tcp_send.c */ +void rds_tcp_xmit_prepare(struct rds_connection *conn); +void rds_tcp_xmit_complete(struct rds_connection *conn); +int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off); +void rds_tcp_write_space(struct sock *sk); + +/* tcp_stats.c */ +DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats); +#define rds_tcp_stats_inc(member) rds_stats_inc_which(rds_tcp_stats, member) +unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail); + +#endif diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c new file mode 100644 index 00000000..af95c8e0 --- /dev/null +++ b/net/rds/tcp_connect.c @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" +#include "tcp.h" + +void rds_tcp_state_change(struct sock *sk) +{ + void (*state_change)(struct sock *sk); + struct rds_connection *conn; + struct rds_tcp_connection *tc; + + read_lock_bh(&sk->sk_callback_lock); + conn = sk->sk_user_data; + if (!conn) { + state_change = sk->sk_state_change; + goto out; + } + tc = conn->c_transport_data; + state_change = tc->t_orig_state_change; + + rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state); + + switch(sk->sk_state) { + /* ignore connecting sockets as they make progress */ + case TCP_SYN_SENT: + case TCP_SYN_RECV: + break; + case TCP_ESTABLISHED: + rds_connect_complete(conn); + break; + case TCP_CLOSE: + rds_conn_drop(conn); + default: + break; + } +out: + read_unlock_bh(&sk->sk_callback_lock); + state_change(sk); +} + +int rds_tcp_conn_connect(struct rds_connection *conn) +{ + struct socket *sock = NULL; + struct sockaddr_in src, dest; + int ret; + + ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (ret < 0) + goto out; + + rds_tcp_tune(sock); + + src.sin_family = AF_INET; + src.sin_addr.s_addr = (__force u32)conn->c_laddr; + src.sin_port = (__force u16)htons(0); + + ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src)); + if (ret) { + rdsdebug("bind failed with %d at address %pI4\n", + ret, &conn->c_laddr); + goto out; + } + + dest.sin_family = AF_INET; + dest.sin_addr.s_addr = (__force u32)conn->c_faddr; + dest.sin_port = (__force u16)htons(RDS_TCP_PORT); + + /* + * once we call connect() we can start getting callbacks and they + * own the socket + */ + rds_tcp_set_callbacks(sock, conn); + ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest), + O_NONBLOCK); + sock = NULL; + + rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret); + if (ret == -EINPROGRESS) + ret = 0; + +out: + if (sock) + sock_release(sock); + return ret; +} + +/* + * Before killing the tcp socket this needs to serialize with callbacks. The + * caller has already grabbed the sending sem so we're serialized with other + * senders. + * + * TCP calls the callbacks with the sock lock so we hold it while we reset the + * callbacks to those set by TCP. Our callbacks won't execute again once we + * hold the sock lock. + */ +void rds_tcp_conn_shutdown(struct rds_connection *conn) +{ + struct rds_tcp_connection *tc = conn->c_transport_data; + struct socket *sock = tc->t_sock; + + rdsdebug("shutting down conn %p tc %p sock %p\n", conn, tc, sock); + + if (sock) { + sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN); + lock_sock(sock->sk); + rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */ + + release_sock(sock->sk); + sock_release(sock); + } + + if (tc->t_tinc) { + rds_inc_put(&tc->t_tinc->ti_inc); + tc->t_tinc = NULL; + } + tc->t_tinc_hdr_rem = sizeof(struct rds_header); + tc->t_tinc_data_rem = 0; +} diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c new file mode 100644 index 00000000..8b5cc4aa --- /dev/null +++ b/net/rds/tcp_listen.c @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include + +#include "rds.h" +#include "tcp.h" + +/* + * cheesy, but simple.. + */ +static void rds_tcp_accept_worker(struct work_struct *work); +static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker); +static struct socket *rds_tcp_listen_sock; + +static int rds_tcp_accept_one(struct socket *sock) +{ + struct socket *new_sock = NULL; + struct rds_connection *conn; + int ret; + struct inet_sock *inet; + + ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, + sock->sk->sk_protocol, &new_sock); + if (ret) + goto out; + + new_sock->type = sock->type; + new_sock->ops = sock->ops; + ret = sock->ops->accept(sock, new_sock, O_NONBLOCK); + if (ret < 0) + goto out; + + rds_tcp_tune(new_sock); + + inet = inet_sk(new_sock->sk); + + rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n", + &inet->inet_saddr, ntohs(inet->inet_sport), + &inet->inet_daddr, ntohs(inet->inet_dport)); + + conn = rds_conn_create(inet->inet_saddr, inet->inet_daddr, + &rds_tcp_transport, GFP_KERNEL); + if (IS_ERR(conn)) { + ret = PTR_ERR(conn); + goto out; + } + + /* + * see the comment above rds_queue_delayed_reconnect() + */ + if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { + if (rds_conn_state(conn) == RDS_CONN_UP) + rds_tcp_stats_inc(s_tcp_listen_closed_stale); + else + rds_tcp_stats_inc(s_tcp_connect_raced); + rds_conn_drop(conn); + ret = 0; + goto out; + } + + rds_tcp_set_callbacks(new_sock, conn); + rds_connect_complete(conn); + new_sock = NULL; + ret = 0; + +out: + if (new_sock) + sock_release(new_sock); + return ret; +} + +static void rds_tcp_accept_worker(struct work_struct *work) +{ + while (rds_tcp_accept_one(rds_tcp_listen_sock) == 0) + cond_resched(); +} + +void rds_tcp_listen_data_ready(struct sock *sk, int bytes) +{ + void (*ready)(struct sock *sk, int bytes); + + rdsdebug("listen data ready sk %p\n", sk); + + read_lock_bh(&sk->sk_callback_lock); + ready = sk->sk_user_data; + if (!ready) { /* check for teardown race */ + ready = sk->sk_data_ready; + goto out; + } + + /* + * ->sk_data_ready is also called for a newly established child socket + * before it has been accepted and the accepter has set up their + * data_ready.. 
we only want to queue listen work for our listening + * socket + */ + if (sk->sk_state == TCP_LISTEN) + queue_work(rds_wq, &rds_tcp_listen_work); + +out: + read_unlock_bh(&sk->sk_callback_lock); + ready(sk, bytes); +} + +int rds_tcp_listen_init(void) +{ + struct sockaddr_in sin; + struct socket *sock = NULL; + int ret; + + ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (ret < 0) + goto out; + + sock->sk->sk_reuse = 1; + rds_tcp_nonagle(sock); + + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_user_data = sock->sk->sk_data_ready; + sock->sk->sk_data_ready = rds_tcp_listen_data_ready; + write_unlock_bh(&sock->sk->sk_callback_lock); + + sin.sin_family = PF_INET, + sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); + sin.sin_port = (__force u16)htons(RDS_TCP_PORT); + + ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); + if (ret < 0) + goto out; + + ret = sock->ops->listen(sock, 64); + if (ret < 0) + goto out; + + rds_tcp_listen_sock = sock; + sock = NULL; +out: + if (sock) + sock_release(sock); + return ret; +} + +void rds_tcp_listen_stop(void) +{ + struct socket *sock = rds_tcp_listen_sock; + struct sock *sk; + + if (!sock) + return; + + sk = sock->sk; + + /* serialize with and prevent further callbacks */ + lock_sock(sk); + write_lock_bh(&sk->sk_callback_lock); + if (sk->sk_user_data) { + sk->sk_data_ready = sk->sk_user_data; + sk->sk_user_data = NULL; + } + write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); + + /* wait for accepts to stop and close the socket */ + flush_workqueue(rds_wq); + sock_release(sock); + rds_tcp_listen_sock = NULL; +} diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c new file mode 100644 index 00000000..78205e25 --- /dev/null +++ b/net/rds/tcp_recv.c @@ -0,0 +1,359 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include +#include +#include + +#include "rds.h" +#include "tcp.h" + +static struct kmem_cache *rds_tcp_incoming_slab; + +static void rds_tcp_inc_purge(struct rds_incoming *inc) +{ + struct rds_tcp_incoming *tinc; + tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); + rdsdebug("purging tinc %p inc %p\n", tinc, inc); + skb_queue_purge(&tinc->ti_skb_list); +} + +void rds_tcp_inc_free(struct rds_incoming *inc) +{ + struct rds_tcp_incoming *tinc; + tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); + rds_tcp_inc_purge(inc); + rdsdebug("freeing tinc %p inc %p\n", tinc, inc); + kmem_cache_free(rds_tcp_incoming_slab, tinc); +} + +/* + * this is pretty lame, but, whatever. + */ +int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, + size_t size) +{ + struct rds_tcp_incoming *tinc; + struct iovec *iov, tmp; + struct sk_buff *skb; + unsigned long to_copy, skb_off; + int ret = 0; + + if (size == 0) + goto out; + + tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); + iov = first_iov; + tmp = *iov; + + skb_queue_walk(&tinc->ti_skb_list, skb) { + skb_off = 0; + while (skb_off < skb->len) { + while (tmp.iov_len == 0) { + iov++; + tmp = *iov; + } + + to_copy = min(tmp.iov_len, size); + to_copy = min(to_copy, skb->len - skb_off); + + rdsdebug("ret %d size %zu skb %p skb_off %lu " + "skblen %d iov_base %p iov_len %zu cpy %lu\n", + ret, size, skb, skb_off, skb->len, + tmp.iov_base, tmp.iov_len, to_copy); + + /* modifies tmp as it copies */ + if (skb_copy_datagram_iovec(skb, skb_off, &tmp, + to_copy)) { + ret = -EFAULT; + goto out; + } + + rds_stats_add(s_copy_to_user, to_copy); + size -= to_copy; + ret += to_copy; + skb_off += to_copy; + if (size == 0) + goto out; + } + } +out: + return ret; +} + +/* + * We have a series of skbs that have fragmented pieces of the congestion + * bitmap. They must add up to the exact size of the congestion bitmap. We + * use the skb helpers to copy those into the pages that make up the in-memory + * congestion bitmap for the remote address of this connection. We then tell + * the congestion core that the bitmap has been changed so that it can wake up + * sleepers. + * + * This is racing with sending paths which are using test_bit to see if the + * bitmap indicates that their recipient is congested. 
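+ *
+ * The copy below walks (map_page, map_off) across the bitmap pages as it
+ * walks each skb: to_copy is bounded by both the bytes left in the current
+ * page and the bytes left in the skb, so a fragment may straddle a page
+ * boundary without any special casing, and rds_cong_map_updated() is only
+ * called once the whole map has arrived.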
+ */ + +static void rds_tcp_cong_recv(struct rds_connection *conn, + struct rds_tcp_incoming *tinc) +{ + struct sk_buff *skb; + unsigned int to_copy, skb_off; + unsigned int map_off; + unsigned int map_page; + struct rds_cong_map *map; + int ret; + + /* catch completely corrupt packets */ + if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) + return; + + map_page = 0; + map_off = 0; + map = conn->c_fcong; + + skb_queue_walk(&tinc->ti_skb_list, skb) { + skb_off = 0; + while (skb_off < skb->len) { + to_copy = min_t(unsigned int, PAGE_SIZE - map_off, + skb->len - skb_off); + + BUG_ON(map_page >= RDS_CONG_MAP_PAGES); + + /* only returns 0 or -error */ + ret = skb_copy_bits(skb, skb_off, + (void *)map->m_page_addrs[map_page] + map_off, + to_copy); + BUG_ON(ret != 0); + + skb_off += to_copy; + map_off += to_copy; + if (map_off == PAGE_SIZE) { + map_off = 0; + map_page++; + } + } + } + + rds_cong_map_updated(map, ~(u64) 0); +} + +struct rds_tcp_desc_arg { + struct rds_connection *conn; + gfp_t gfp; + enum km_type km; +}; + +static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct rds_tcp_desc_arg *arg = desc->arg.data; + struct rds_connection *conn = arg->conn; + struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_tcp_incoming *tinc = tc->t_tinc; + struct sk_buff *clone; + size_t left = len, to_copy; + + rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset, + len); + + /* + * tcp_read_sock() interprets partial progress as an indication to stop + * processing. + */ + while (left) { + if (!tinc) { + tinc = kmem_cache_alloc(rds_tcp_incoming_slab, + arg->gfp); + if (!tinc) { + desc->error = -ENOMEM; + goto out; + } + tc->t_tinc = tinc; + rdsdebug("alloced tinc %p\n", tinc); + rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr); + /* + * XXX * we might be able to use the __ variants when + * we've already serialized at a higher level. 
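+			 * (the __ variants skip the skb list spinlock; the
+			 * list is only touched under the sock lock here, so
+			 * that would probably be safe)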
+ */ + skb_queue_head_init(&tinc->ti_skb_list); + } + + if (left && tc->t_tinc_hdr_rem) { + to_copy = min(tc->t_tinc_hdr_rem, left); + rdsdebug("copying %zu header from skb %p\n", to_copy, + skb); + skb_copy_bits(skb, offset, + (char *)&tinc->ti_inc.i_hdr + + sizeof(struct rds_header) - + tc->t_tinc_hdr_rem, + to_copy); + tc->t_tinc_hdr_rem -= to_copy; + left -= to_copy; + offset += to_copy; + + if (tc->t_tinc_hdr_rem == 0) { + /* could be 0 for a 0 len message */ + tc->t_tinc_data_rem = + be32_to_cpu(tinc->ti_inc.i_hdr.h_len); + } + } + + if (left && tc->t_tinc_data_rem) { + clone = skb_clone(skb, arg->gfp); + if (!clone) { + desc->error = -ENOMEM; + goto out; + } + + to_copy = min(tc->t_tinc_data_rem, left); + pskb_pull(clone, offset); + pskb_trim(clone, to_copy); + skb_queue_tail(&tinc->ti_skb_list, clone); + + rdsdebug("skb %p data %p len %d off %u to_copy %zu -> " + "clone %p data %p len %d\n", + skb, skb->data, skb->len, offset, to_copy, + clone, clone->data, clone->len); + + tc->t_tinc_data_rem -= to_copy; + left -= to_copy; + offset += to_copy; + } + + if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) { + if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) + rds_tcp_cong_recv(conn, tinc); + else + rds_recv_incoming(conn, conn->c_faddr, + conn->c_laddr, &tinc->ti_inc, + arg->gfp, arg->km); + + tc->t_tinc_hdr_rem = sizeof(struct rds_header); + tc->t_tinc_data_rem = 0; + tc->t_tinc = NULL; + rds_inc_put(&tinc->ti_inc); + tinc = NULL; + } + } +out: + rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n", + len, left, skb->len, + skb_queue_len(&tc->t_sock->sk->sk_receive_queue)); + return len - left; +} + +/* the caller has to hold the sock lock */ +static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp, + enum km_type km) +{ + struct rds_tcp_connection *tc = conn->c_transport_data; + struct socket *sock = tc->t_sock; + read_descriptor_t desc; + struct rds_tcp_desc_arg arg; + + /* It's like glib in the kernel! */ + arg.conn = conn; + arg.gfp = gfp; + arg.km = km; + desc.arg.data = &arg; + desc.error = 0; + desc.count = 1; /* give more than one skb per call */ + + tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv); + rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp, + desc.error); + + return desc.error; +} + +/* + * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from + * data_ready. + * + * if we fail to allocate we're in trouble.. blindly wait some time before + * trying again to see if the VM can free up something for us. 
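+ * That wait happens in the callers: rds_tcp_data_ready() requeues c_recv_w
+ * as soon as the GFP_ATOMIC read fails with -ENOMEM, and rds_recv_worker()
+ * (threads.c) requeues itself two jiffies later if the GFP_KERNEL retry
+ * fails as well.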
+ */ +int rds_tcp_recv(struct rds_connection *conn) +{ + struct rds_tcp_connection *tc = conn->c_transport_data; + struct socket *sock = tc->t_sock; + int ret = 0; + + rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock); + + lock_sock(sock->sk); + ret = rds_tcp_read_sock(conn, GFP_KERNEL, KM_USER0); + release_sock(sock->sk); + + return ret; +} + +void rds_tcp_data_ready(struct sock *sk, int bytes) +{ + void (*ready)(struct sock *sk, int bytes); + struct rds_connection *conn; + struct rds_tcp_connection *tc; + + rdsdebug("data ready sk %p bytes %d\n", sk, bytes); + + read_lock_bh(&sk->sk_callback_lock); + conn = sk->sk_user_data; + if (!conn) { /* check for teardown race */ + ready = sk->sk_data_ready; + goto out; + } + + tc = conn->c_transport_data; + ready = tc->t_orig_data_ready; + rds_tcp_stats_inc(s_tcp_data_ready_calls); + + if (rds_tcp_read_sock(conn, GFP_ATOMIC, KM_SOFTIRQ0) == -ENOMEM) + queue_delayed_work(rds_wq, &conn->c_recv_w, 0); +out: + read_unlock_bh(&sk->sk_callback_lock); + ready(sk, bytes); +} + +int rds_tcp_recv_init(void) +{ + rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", + sizeof(struct rds_tcp_incoming), + 0, 0, NULL); + if (!rds_tcp_incoming_slab) + return -ENOMEM; + return 0; +} + +void rds_tcp_recv_exit(void) +{ + kmem_cache_destroy(rds_tcp_incoming_slab); +} diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c new file mode 100644 index 00000000..1b4fd68f --- /dev/null +++ b/net/rds/tcp_send.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include +#include +#include + +#include "rds.h" +#include "tcp.h" + +static void rds_tcp_cork(struct socket *sock, int val) +{ + mm_segment_t oldfs; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + sock->ops->setsockopt(sock, SOL_TCP, TCP_CORK, (char __user *)&val, + sizeof(val)); + set_fs(oldfs); +} + +void rds_tcp_xmit_prepare(struct rds_connection *conn) +{ + struct rds_tcp_connection *tc = conn->c_transport_data; + + rds_tcp_cork(tc->t_sock, 1); +} + +void rds_tcp_xmit_complete(struct rds_connection *conn) +{ + struct rds_tcp_connection *tc = conn->c_transport_data; + + rds_tcp_cork(tc->t_sock, 0); +} + +/* the core send_sem serializes this with other xmit and shutdown */ +static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) +{ + struct kvec vec = { + .iov_base = data, + .iov_len = len, + }; + struct msghdr msg = { + .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL, + }; + + return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len); +} + +/* the core send_sem serializes this with other xmit and shutdown */ +int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off) +{ + struct rds_tcp_connection *tc = conn->c_transport_data; + int done = 0; + int ret = 0; + + if (hdr_off == 0) { + /* + * m_ack_seq is set to the sequence number of the last byte of + * header and data. see rds_tcp_is_acked(). + */ + tc->t_last_sent_nxt = rds_tcp_snd_nxt(tc); + rm->m_ack_seq = tc->t_last_sent_nxt + + sizeof(struct rds_header) + + be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1; + smp_mb__before_clear_bit(); + set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags); + tc->t_last_expected_una = rm->m_ack_seq + 1; + + rdsdebug("rm %p tcp nxt %u ack_seq %llu\n", + rm, rds_tcp_snd_nxt(tc), + (unsigned long long)rm->m_ack_seq); + } + + if (hdr_off < sizeof(struct rds_header)) { + /* see rds_tcp_write_space() */ + set_bit(SOCK_NOSPACE, &tc->t_sock->sk->sk_socket->flags); + + ret = rds_tcp_sendmsg(tc->t_sock, + (void *)&rm->m_inc.i_hdr + hdr_off, + sizeof(rm->m_inc.i_hdr) - hdr_off); + if (ret < 0) + goto out; + done += ret; + if (hdr_off + done != sizeof(struct rds_header)) + goto out; + } + + while (sg < rm->data.op_nents) { + ret = tc->t_sock->ops->sendpage(tc->t_sock, + sg_page(&rm->data.op_sg[sg]), + rm->data.op_sg[sg].offset + off, + rm->data.op_sg[sg].length - off, + MSG_DONTWAIT|MSG_NOSIGNAL); + rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]), + rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off, + ret); + if (ret <= 0) + break; + + off += ret; + done += ret; + if (off == rm->data.op_sg[sg].length) { + off = 0; + sg++; + } + } + +out: + if (ret <= 0) { + /* write_space will hit after EAGAIN, all else fatal */ + if (ret == -EAGAIN) { + rds_tcp_stats_inc(s_tcp_sndbuf_full); + ret = 0; + } else { + printk(KERN_WARNING "RDS/tcp: send to %pI4 " + "returned %d, disconnecting and reconnecting\n", + &conn->c_faddr, ret); + rds_conn_drop(conn); + } + } + if (done == 0) + done = ret; + return done; +} + +/* + * rm->m_ack_seq is set to the tcp sequence number that corresponds to the + * last byte of the message, including the header. This means that the + * entire message has been received if rm->m_ack_seq is "before" the next + * unacked byte of the TCP sequence space. We have to do very careful + * wrapping 32bit comparisons here. 
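+ *
+ * A worked example with made-up numbers: if rm->m_ack_seq is 0x00000010
+ * just after the sequence space wrapped while snd_una is still 0xfffffff0,
+ * then (u32)(0x00000010 - 0xfffffff0) == 0x20, positive as an __s32, so the
+ * message is correctly treated as unacked; once snd_una advances past the
+ * wrap to 0x00000020 the same subtraction gives -0x10 and the message is
+ * considered acked.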
+ */ +static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack) +{ + if (!test_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags)) + return 0; + return (__s32)((u32)rm->m_ack_seq - (u32)ack) < 0; +} + +void rds_tcp_write_space(struct sock *sk) +{ + void (*write_space)(struct sock *sk); + struct rds_connection *conn; + struct rds_tcp_connection *tc; + + read_lock_bh(&sk->sk_callback_lock); + conn = sk->sk_user_data; + if (!conn) { + write_space = sk->sk_write_space; + goto out; + } + + tc = conn->c_transport_data; + rdsdebug("write_space for tc %p\n", tc); + write_space = tc->t_orig_write_space; + rds_tcp_stats_inc(s_tcp_write_space_calls); + + rdsdebug("tcp una %u\n", rds_tcp_snd_una(tc)); + tc->t_last_seen_una = rds_tcp_snd_una(tc); + rds_send_drop_acked(conn, rds_tcp_snd_una(tc), rds_tcp_is_acked); + + if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + +out: + read_unlock_bh(&sk->sk_callback_lock); + + /* + * write_space is only called when data leaves tcp's send queue if + * SOCK_NOSPACE is set. We set SOCK_NOSPACE every time we put + * data in tcp's send queue because we use write_space to parse the + * sequence numbers and notice that rds messages have been fully + * received. + * + * tcp's write_space clears SOCK_NOSPACE if the send queue has more + * than a certain amount of space. So we need to set it again *after* + * we call tcp's write_space or else we might only get called on the + * first of a series of incoming tcp acks. + */ + write_space(sk); + + if (sk->sk_socket) + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +} diff --git a/net/rds/tcp_stats.c b/net/rds/tcp_stats.c new file mode 100644 index 00000000..d5898d03 --- /dev/null +++ b/net/rds/tcp_stats.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include +#include +#include + +#include "rds.h" +#include "tcp.h" + +DEFINE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats) + ____cacheline_aligned; + +static const char const *rds_tcp_stat_names[] = { + "tcp_data_ready_calls", + "tcp_write_space_calls", + "tcp_sndbuf_full", + "tcp_connect_raced", + "tcp_listen_closed_stale", +}; + +unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail) +{ + struct rds_tcp_statistics stats = {0, }; + uint64_t *src; + uint64_t *sum; + size_t i; + int cpu; + + if (avail < ARRAY_SIZE(rds_tcp_stat_names)) + goto out; + + for_each_online_cpu(cpu) { + src = (uint64_t *)&(per_cpu(rds_tcp_stats, cpu)); + sum = (uint64_t *)&stats; + for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) + *(sum++) += *(src++); + } + + rds_stats_info_copy(iter, (uint64_t *)&stats, rds_tcp_stat_names, + ARRAY_SIZE(rds_tcp_stat_names)); +out: + return ARRAY_SIZE(rds_tcp_stat_names); +} diff --git a/net/rds/threads.c b/net/rds/threads.c new file mode 100644 index 00000000..0fd90f8c --- /dev/null +++ b/net/rds/threads.c @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include + +#include "rds.h" + +/* + * All of connection management is simplified by serializing it through + * work queues that execute in a connection managing thread. + * + * TCP wants to send acks through sendpage() in response to data_ready(), + * but it needs a process context to do so. + * + * The receive paths need to allocate but can't drop packets (!) so we have + * a thread around to block allocating if the receive fast path sees an + * allocation failure. + */ + +/* Grand Unified Theory of connection life cycle: + * At any point in time, the connection can be in one of these states: + * DOWN, CONNECTING, UP, DISCONNECTING, ERROR + * + * The following transitions are possible: + * ANY -> ERROR + * UP -> DISCONNECTING + * ERROR -> DISCONNECTING + * DISCONNECTING -> DOWN + * DOWN -> CONNECTING + * CONNECTING -> UP + * + * Transition to state DISCONNECTING/DOWN: + * - Inside the shutdown worker; synchronizes with xmit path + * through RDS_IN_XMIT, and with connection management callbacks + * via c_cm_lock. 
+ * + * For receive callbacks, we rely on the underlying transport + * (TCP, IB/RDMA) to provide the necessary synchronisation. + */ +struct workqueue_struct *rds_wq; +EXPORT_SYMBOL_GPL(rds_wq); + +void rds_connect_complete(struct rds_connection *conn) +{ + if (!rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_UP)) { + printk(KERN_WARNING "%s: Cannot transition to state UP, " + "current state is %d\n", + __func__, + atomic_read(&conn->c_state)); + atomic_set(&conn->c_state, RDS_CONN_ERROR); + queue_work(rds_wq, &conn->c_down_w); + return; + } + + rdsdebug("conn %p for %pI4 to %pI4 complete\n", + conn, &conn->c_laddr, &conn->c_faddr); + + conn->c_reconnect_jiffies = 0; + set_bit(0, &conn->c_map_queued); + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + queue_delayed_work(rds_wq, &conn->c_recv_w, 0); +} +EXPORT_SYMBOL_GPL(rds_connect_complete); + +/* + * This random exponential backoff is relied on to eventually resolve racing + * connects. + * + * If connect attempts race then both parties drop both connections and come + * here to wait for a random amount of time before trying again. Eventually + * the backoff range will be so much greater than the time it takes to + * establish a connection that one of the pair will establish the connection + * before the other's random delay fires. + * + * Connection attempts that arrive while a connection is already established + * are also considered to be racing connects. This lets a connection from + * a rebooted machine replace an existing stale connection before the transport + * notices that the connection has failed. + * + * We should *always* start with a random backoff; otherwise a broken connection + * will always take several iterations to be re-established. + */ +void rds_queue_reconnect(struct rds_connection *conn) +{ + unsigned long rand; + + rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n", + conn, &conn->c_laddr, &conn->c_faddr, + conn->c_reconnect_jiffies); + + set_bit(RDS_RECONNECT_PENDING, &conn->c_flags); + if (conn->c_reconnect_jiffies == 0) { + conn->c_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies; + queue_delayed_work(rds_wq, &conn->c_conn_w, 0); + return; + } + + get_random_bytes(&rand, sizeof(rand)); + rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n", + rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies, + conn, &conn->c_laddr, &conn->c_faddr); + queue_delayed_work(rds_wq, &conn->c_conn_w, + rand % conn->c_reconnect_jiffies); + + conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2, + rds_sysctl_reconnect_max_jiffies); +} + +void rds_connect_worker(struct work_struct *work) +{ + struct rds_connection *conn = container_of(work, struct rds_connection, c_conn_w.work); + int ret; + + clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags); + if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { + ret = conn->c_trans->conn_connect(conn); + rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n", + conn, &conn->c_laddr, &conn->c_faddr, ret); + + if (ret) { + if (rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_DOWN)) + rds_queue_reconnect(conn); + else + rds_conn_error(conn, "RDS: connect failed\n"); + } + } +} + +void rds_send_worker(struct work_struct *work) +{ + struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work); + int ret; + + if (rds_conn_state(conn) == RDS_CONN_UP) { + ret = rds_send_xmit(conn); + rdsdebug("conn %p ret %d\n", conn, ret); + switch (ret) { + case -EAGAIN: + rds_stats_inc(s_send_immediate_retry); + 
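+			/* -EAGAIN asks for an immediate retry; -ENOMEM below
+			 * backs off for two jiffies so the allocation has a
+			 * chance to succeed next time
+			 */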
queue_delayed_work(rds_wq, &conn->c_send_w, 0); + break; + case -ENOMEM: + rds_stats_inc(s_send_delayed_retry); + queue_delayed_work(rds_wq, &conn->c_send_w, 2); + default: + break; + } + } +} + +void rds_recv_worker(struct work_struct *work) +{ + struct rds_connection *conn = container_of(work, struct rds_connection, c_recv_w.work); + int ret; + + if (rds_conn_state(conn) == RDS_CONN_UP) { + ret = conn->c_trans->recv(conn); + rdsdebug("conn %p ret %d\n", conn, ret); + switch (ret) { + case -EAGAIN: + rds_stats_inc(s_recv_immediate_retry); + queue_delayed_work(rds_wq, &conn->c_recv_w, 0); + break; + case -ENOMEM: + rds_stats_inc(s_recv_delayed_retry); + queue_delayed_work(rds_wq, &conn->c_recv_w, 2); + default: + break; + } + } +} + +void rds_shutdown_worker(struct work_struct *work) +{ + struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w); + + rds_conn_shutdown(conn); +} + +void rds_threads_exit(void) +{ + destroy_workqueue(rds_wq); +} + +int rds_threads_init(void) +{ + rds_wq = create_singlethread_workqueue("krdsd"); + if (!rds_wq) + return -ENOMEM; + + return 0; +} diff --git a/net/rds/transport.c b/net/rds/transport.c new file mode 100644 index 00000000..7f2ac4fe --- /dev/null +++ b/net/rds/transport.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include +#include +#include + +#include "rds.h" +#include "loop.h" + +static struct rds_transport *transports[RDS_TRANS_COUNT]; +static DECLARE_RWSEM(rds_trans_sem); + +int rds_trans_register(struct rds_transport *trans) +{ + BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ); + + down_write(&rds_trans_sem); + + if (transports[trans->t_type]) + printk(KERN_ERR "RDS Transport type %d already registered\n", + trans->t_type); + else { + transports[trans->t_type] = trans; + printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name); + } + + up_write(&rds_trans_sem); + + return 0; +} +EXPORT_SYMBOL_GPL(rds_trans_register); + +void rds_trans_unregister(struct rds_transport *trans) +{ + down_write(&rds_trans_sem); + + transports[trans->t_type] = NULL; + printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name); + + up_write(&rds_trans_sem); +} +EXPORT_SYMBOL_GPL(rds_trans_unregister); + +void rds_trans_put(struct rds_transport *trans) +{ + if (trans && trans->t_owner) + module_put(trans->t_owner); +} + +struct rds_transport *rds_trans_get_preferred(__be32 addr) +{ + struct rds_transport *ret = NULL; + struct rds_transport *trans; + unsigned int i; + + if (IN_LOOPBACK(ntohl(addr))) + return &rds_loop_transport; + + down_read(&rds_trans_sem); + for (i = 0; i < RDS_TRANS_COUNT; i++) { + trans = transports[i]; + + if (trans && (trans->laddr_check(addr) == 0) && + (!trans->t_owner || try_module_get(trans->t_owner))) { + ret = trans; + break; + } + } + up_read(&rds_trans_sem); + + return ret; +} + +/* + * This returns the number of stats entries in the snapshot and only + * copies them using the iter if there is enough space for them. The + * caller passes in the global stats so that we can size and copy while + * holding the lock. + */ +unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail) + +{ + struct rds_transport *trans; + unsigned int total = 0; + unsigned int part; + int i; + + rds_info_iter_unmap(iter); + down_read(&rds_trans_sem); + + for (i = 0; i < RDS_TRANS_COUNT; i++) + { + trans = transports[i]; + if (!trans || !trans->stats_info_copy) + continue; + + part = trans->stats_info_copy(iter, avail); + avail -= min(avail, part); + total += part; + } + + up_read(&rds_trans_sem); + + return total; +} + diff --git a/net/rds/xlist.h b/net/rds/xlist.h new file mode 100644 index 00000000..e6b5190d --- /dev/null +++ b/net/rds/xlist.h @@ -0,0 +1,80 @@ +#ifndef _LINUX_XLIST_H +#define _LINUX_XLIST_H + +#include +#include +#include +#include + +struct xlist_head { + struct xlist_head *next; +}; + +static inline void INIT_XLIST_HEAD(struct xlist_head *list) +{ + list->next = NULL; +} + +static inline int xlist_empty(struct xlist_head *head) +{ + return head->next == NULL; +} + +static inline void xlist_add(struct xlist_head *new, struct xlist_head *tail, + struct xlist_head *head) +{ + struct xlist_head *cur; + struct xlist_head *check; + + while (1) { + cur = head->next; + tail->next = cur; + check = cmpxchg(&head->next, cur, new); + if (check == cur) + break; + } +} + +static inline struct xlist_head *xlist_del_head(struct xlist_head *head) +{ + struct xlist_head *cur; + struct xlist_head *check; + struct xlist_head *next; + + while (1) { + cur = head->next; + if (!cur) + goto out; + + next = cur->next; + check = cmpxchg(&head->next, cur, next); + if (check == cur) + goto out; + } +out: + return cur; +} + +static inline struct xlist_head *xlist_del_head_fast(struct xlist_head *head) +{ + struct xlist_head *cur; + + cur = head->next; + if 
(!cur) + return NULL; + + head->next = cur->next; + return cur; +} + +static inline void xlist_splice(struct xlist_head *list, + struct xlist_head *head) +{ + struct xlist_head *cur; + + WARN_ON(head->next); + cur = xchg(&list->next, NULL); + head->next = cur; +} + +#endif diff --git a/net/rfkill/Kconfig b/net/rfkill/Kconfig new file mode 100644 index 00000000..8e12c8a2 --- /dev/null +++ b/net/rfkill/Kconfig @@ -0,0 +1,49 @@ +# +# RF switch subsystem configuration +# +menuconfig RFKILL + tristate "RF switch subsystem support" + help + Say Y here if you want to have control over RF switches + found on many WiFi and Bluetooth cards. + + To compile this driver as a module, choose M here: the + module will be called rfkill. + +config RFKILL_PM + bool "Power off on suspend" + depends on RFKILL && PM + default y + +# LED trigger support +config RFKILL_LEDS + bool + depends on RFKILL + depends on LEDS_TRIGGERS = y || RFKILL = LEDS_TRIGGERS + default y + +config RFKILL_INPUT + bool "RF switch input support" if EXPERT + depends on RFKILL + depends on INPUT = y || RFKILL = INPUT + default y if !EXPERT + +config RFKILL_REGULATOR + tristate "Generic rfkill regulator driver" + depends on RFKILL || !RFKILL + depends on REGULATOR + help + This options enable controlling radio transmitters connected to + voltage regulator using the regulator framework. + + To compile this driver as a module, choose M here: the module will + be called rfkill-regulator. + +config RFKILL_GPIO + tristate "GPIO RFKILL driver" + depends on RFKILL && GPIOLIB && HAVE_CLK + default n + help + If you say yes here you get support of a generic gpio RFKILL + driver. The platform should fill in the appropriate fields in the + rfkill_gpio_platform_data structure and pass that to the driver. diff --git a/net/rfkill/Makefile b/net/rfkill/Makefile new file mode 100644 index 00000000..31176878 --- /dev/null +++ b/net/rfkill/Makefile @@ -0,0 +1,9 @@ +# +# Makefile for the RF switch subsystem. +# + +rfkill-y += core.o +rfkill-$(CONFIG_RFKILL_INPUT) += input.o +obj-$(CONFIG_RFKILL) += rfkill.o +obj-$(CONFIG_RFKILL_REGULATOR) += rfkill-regulator.o +obj-$(CONFIG_RFKILL_GPIO) += rfkill-gpio.o diff --git a/net/rfkill/core.c b/net/rfkill/core.c new file mode 100644 index 00000000..df2dae6b --- /dev/null +++ b/net/rfkill/core.c @@ -0,0 +1,1283 @@ +/* + * Copyright (C) 2006 - 2007 Ivo van Doorn + * Copyright (C) 2007 Dmitry Torokhov + * Copyright 2009 Johannes Berg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rfkill.h" + +#define POLL_INTERVAL (5 * HZ) + +#define RFKILL_BLOCK_HW BIT(0) +#define RFKILL_BLOCK_SW BIT(1) +#define RFKILL_BLOCK_SW_PREV BIT(2) +#define RFKILL_BLOCK_ANY (RFKILL_BLOCK_HW |\ + RFKILL_BLOCK_SW |\ + RFKILL_BLOCK_SW_PREV) +#define RFKILL_BLOCK_SW_SETCALL BIT(31) + +struct rfkill { + spinlock_t lock; + + const char *name; + enum rfkill_type type; + + unsigned long state; + + u32 idx; + + bool registered; + bool persistent; + + const struct rfkill_ops *ops; + void *data; + +#ifdef CONFIG_RFKILL_LEDS + struct led_trigger led_trigger; + const char *ledtrigname; +#endif + + struct device dev; + struct list_head node; + + struct delayed_work poll_work; + struct work_struct uevent_work; + struct work_struct sync_work; +}; +#define to_rfkill(d) container_of(d, struct rfkill, dev) + +struct rfkill_int_event { + struct list_head list; + struct rfkill_event ev; +}; + +struct rfkill_data { + struct list_head list; + struct list_head events; + struct mutex mtx; + wait_queue_head_t read_wait; + bool input_handler; +}; + + +MODULE_AUTHOR("Ivo van Doorn "); +MODULE_AUTHOR("Johannes Berg "); +MODULE_DESCRIPTION("RF switch support"); +MODULE_LICENSE("GPL"); + + +/* + * The locking here should be made much smarter, we currently have + * a bit of a stupid situation because drivers might want to register + * the rfkill struct under their own lock, and take this lock during + * rfkill method calls -- which will cause an AB-BA deadlock situation. + * + * To fix that, we need to rework this code here to be mostly lock-free + * and only use the mutex for list manipulations, not to protect the + * various other global variables. Then we can avoid holding the mutex + * around driver operations, and all is happy. + */ +static LIST_HEAD(rfkill_list); /* list of registered rf switches */ +static DEFINE_MUTEX(rfkill_global_mutex); +static LIST_HEAD(rfkill_fds); /* list of open fds of /dev/rfkill */ + +static unsigned int rfkill_default_state = 1; +module_param_named(default_state, rfkill_default_state, uint, 0444); +MODULE_PARM_DESC(default_state, + "Default initial state for all radio types, 0 = radio off"); + +static struct { + bool cur, sav; +} rfkill_global_states[NUM_RFKILL_TYPES]; + +static bool rfkill_epo_lock_active; + + +#ifdef CONFIG_RFKILL_LEDS +static void rfkill_led_trigger_event(struct rfkill *rfkill) +{ + struct led_trigger *trigger; + + if (!rfkill->registered) + return; + + trigger = &rfkill->led_trigger; + + if (rfkill->state & RFKILL_BLOCK_ANY) + led_trigger_event(trigger, LED_OFF); + else + led_trigger_event(trigger, LED_FULL); +} + +static void rfkill_led_trigger_activate(struct led_classdev *led) +{ + struct rfkill *rfkill; + + rfkill = container_of(led->trigger, struct rfkill, led_trigger); + + rfkill_led_trigger_event(rfkill); +} + +static int rfkill_led_trigger_register(struct rfkill *rfkill) +{ + rfkill->led_trigger.name = rfkill->ledtrigname + ? 
: dev_name(&rfkill->dev); + rfkill->led_trigger.activate = rfkill_led_trigger_activate; + return led_trigger_register(&rfkill->led_trigger); +} + +static void rfkill_led_trigger_unregister(struct rfkill *rfkill) +{ + led_trigger_unregister(&rfkill->led_trigger); +} +#else +static void rfkill_led_trigger_event(struct rfkill *rfkill) +{ +} + +static inline int rfkill_led_trigger_register(struct rfkill *rfkill) +{ + return 0; +} + +static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill) +{ +} +#endif /* CONFIG_RFKILL_LEDS */ + +static void rfkill_fill_event(struct rfkill_event *ev, struct rfkill *rfkill, + enum rfkill_operation op) +{ + unsigned long flags; + + ev->idx = rfkill->idx; + ev->type = rfkill->type; + ev->op = op; + + spin_lock_irqsave(&rfkill->lock, flags); + ev->hard = !!(rfkill->state & RFKILL_BLOCK_HW); + ev->soft = !!(rfkill->state & (RFKILL_BLOCK_SW | + RFKILL_BLOCK_SW_PREV)); + spin_unlock_irqrestore(&rfkill->lock, flags); +} + +static void rfkill_send_events(struct rfkill *rfkill, enum rfkill_operation op) +{ + struct rfkill_data *data; + struct rfkill_int_event *ev; + + list_for_each_entry(data, &rfkill_fds, list) { + ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) + continue; + rfkill_fill_event(&ev->ev, rfkill, op); + mutex_lock(&data->mtx); + list_add_tail(&ev->list, &data->events); + mutex_unlock(&data->mtx); + wake_up_interruptible(&data->read_wait); + } +} + +static void rfkill_event(struct rfkill *rfkill) +{ + if (!rfkill->registered) + return; + + kobject_uevent(&rfkill->dev.kobj, KOBJ_CHANGE); + + /* also send event to /dev/rfkill */ + rfkill_send_events(rfkill, RFKILL_OP_CHANGE); +} + +static bool __rfkill_set_hw_state(struct rfkill *rfkill, + bool blocked, bool *change) +{ + unsigned long flags; + bool prev, any; + + BUG_ON(!rfkill); + + spin_lock_irqsave(&rfkill->lock, flags); + prev = !!(rfkill->state & RFKILL_BLOCK_HW); + if (blocked) + rfkill->state |= RFKILL_BLOCK_HW; + else + rfkill->state &= ~RFKILL_BLOCK_HW; + *change = prev != blocked; + any = rfkill->state & RFKILL_BLOCK_ANY; + spin_unlock_irqrestore(&rfkill->lock, flags); + + rfkill_led_trigger_event(rfkill); + + return any; +} + +/** + * rfkill_set_block - wrapper for set_block method + * + * @rfkill: the rfkill struct to use + * @blocked: the new software state + * + * Calls the set_block method (when applicable) and handles notifications + * etc. as well. + */ +static void rfkill_set_block(struct rfkill *rfkill, bool blocked) +{ + unsigned long flags; + int err; + + if (unlikely(rfkill->dev.power.power_state.event & PM_EVENT_SLEEP)) + return; + + /* + * Some platforms (...!) generate input events which affect the + * _hard_ kill state -- whenever something tries to change the + * current software state query the hardware state too. + */ + if (rfkill->ops->query) + rfkill->ops->query(rfkill, rfkill->data); + + spin_lock_irqsave(&rfkill->lock, flags); + if (rfkill->state & RFKILL_BLOCK_SW) + rfkill->state |= RFKILL_BLOCK_SW_PREV; + else + rfkill->state &= ~RFKILL_BLOCK_SW_PREV; + + if (blocked) + rfkill->state |= RFKILL_BLOCK_SW; + else + rfkill->state &= ~RFKILL_BLOCK_SW; + + rfkill->state |= RFKILL_BLOCK_SW_SETCALL; + spin_unlock_irqrestore(&rfkill->lock, flags); + + err = rfkill->ops->set_block(rfkill->data, blocked); + + spin_lock_irqsave(&rfkill->lock, flags); + if (err) { + /* + * Failed -- reset status to _prev, this may be different + * from what set set _PREV to earlier in this function + * if rfkill_set_sw_state was invoked. 
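+		 * (a concurrent rfkill_set_sw_state() sees
+		 * RFKILL_BLOCK_SW_SETCALL and flips the _PREV bit instead of
+		 * _SW, so the value restored here may reflect that newer
+		 * request rather than the one saved at the top of this
+		 * function)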
+ */ + if (rfkill->state & RFKILL_BLOCK_SW_PREV) + rfkill->state |= RFKILL_BLOCK_SW; + else + rfkill->state &= ~RFKILL_BLOCK_SW; + } + rfkill->state &= ~RFKILL_BLOCK_SW_SETCALL; + rfkill->state &= ~RFKILL_BLOCK_SW_PREV; + spin_unlock_irqrestore(&rfkill->lock, flags); + + rfkill_led_trigger_event(rfkill); + rfkill_event(rfkill); +} + +#ifdef CONFIG_RFKILL_INPUT +static atomic_t rfkill_input_disabled = ATOMIC_INIT(0); + +/** + * __rfkill_switch_all - Toggle state of all switches of given type + * @type: type of interfaces to be affected + * @state: the new state + * + * This function sets the state of all switches of given type, + * unless a specific switch is claimed by userspace (in which case, + * that switch is left alone) or suspended. + * + * Caller must have acquired rfkill_global_mutex. + */ +static void __rfkill_switch_all(const enum rfkill_type type, bool blocked) +{ + struct rfkill *rfkill; + + rfkill_global_states[type].cur = blocked; + list_for_each_entry(rfkill, &rfkill_list, node) { + if (rfkill->type != type) + continue; + + rfkill_set_block(rfkill, blocked); + } +} + +/** + * rfkill_switch_all - Toggle state of all switches of given type + * @type: type of interfaces to be affected + * @state: the new state + * + * Acquires rfkill_global_mutex and calls __rfkill_switch_all(@type, @state). + * Please refer to __rfkill_switch_all() for details. + * + * Does nothing if the EPO lock is active. + */ +void rfkill_switch_all(enum rfkill_type type, bool blocked) +{ + if (atomic_read(&rfkill_input_disabled)) + return; + + mutex_lock(&rfkill_global_mutex); + + if (!rfkill_epo_lock_active) + __rfkill_switch_all(type, blocked); + + mutex_unlock(&rfkill_global_mutex); +} + +/** + * rfkill_epo - emergency power off all transmitters + * + * This kicks all non-suspended rfkill devices to RFKILL_STATE_SOFT_BLOCKED, + * ignoring everything in its path but rfkill_global_mutex and rfkill->mutex. + * + * The global state before the EPO is saved and can be restored later + * using rfkill_restore_states(). + */ +void rfkill_epo(void) +{ + struct rfkill *rfkill; + int i; + + if (atomic_read(&rfkill_input_disabled)) + return; + + mutex_lock(&rfkill_global_mutex); + + rfkill_epo_lock_active = true; + list_for_each_entry(rfkill, &rfkill_list, node) + rfkill_set_block(rfkill, true); + + for (i = 0; i < NUM_RFKILL_TYPES; i++) { + rfkill_global_states[i].sav = rfkill_global_states[i].cur; + rfkill_global_states[i].cur = true; + } + + mutex_unlock(&rfkill_global_mutex); +} + +/** + * rfkill_restore_states - restore global states + * + * Restore (and sync switches to) the global state from the + * states in rfkill_default_states. This can undo the effects of + * a call to rfkill_epo(). + */ +void rfkill_restore_states(void) +{ + int i; + + if (atomic_read(&rfkill_input_disabled)) + return; + + mutex_lock(&rfkill_global_mutex); + + rfkill_epo_lock_active = false; + for (i = 0; i < NUM_RFKILL_TYPES; i++) + __rfkill_switch_all(i, rfkill_global_states[i].sav); + mutex_unlock(&rfkill_global_mutex); +} + +/** + * rfkill_remove_epo_lock - unlock state changes + * + * Used by rfkill-input manually unlock state changes, when + * the EPO switch is deactivated. 
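+ * In other words, rfkill-input calls this once the EPO key is released so
+ * that ordinary state changes are allowed again; it only clears
+ * rfkill_epo_lock_active and restores nothing itself (that is what
+ * rfkill_restore_states() does).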
+ */ +void rfkill_remove_epo_lock(void) +{ + if (atomic_read(&rfkill_input_disabled)) + return; + + mutex_lock(&rfkill_global_mutex); + rfkill_epo_lock_active = false; + mutex_unlock(&rfkill_global_mutex); +} + +/** + * rfkill_is_epo_lock_active - returns true EPO is active + * + * Returns 0 (false) if there is NOT an active EPO contidion, + * and 1 (true) if there is an active EPO contition, which + * locks all radios in one of the BLOCKED states. + * + * Can be called in atomic context. + */ +bool rfkill_is_epo_lock_active(void) +{ + return rfkill_epo_lock_active; +} + +/** + * rfkill_get_global_sw_state - returns global state for a type + * @type: the type to get the global state of + * + * Returns the current global state for a given wireless + * device type. + */ +bool rfkill_get_global_sw_state(const enum rfkill_type type) +{ + return rfkill_global_states[type].cur; +} +#endif + + +bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked) +{ + bool ret, change; + + ret = __rfkill_set_hw_state(rfkill, blocked, &change); + + if (!rfkill->registered) + return ret; + + if (change) + schedule_work(&rfkill->uevent_work); + + return ret; +} +EXPORT_SYMBOL(rfkill_set_hw_state); + +static void __rfkill_set_sw_state(struct rfkill *rfkill, bool blocked) +{ + u32 bit = RFKILL_BLOCK_SW; + + /* if in a ops->set_block right now, use other bit */ + if (rfkill->state & RFKILL_BLOCK_SW_SETCALL) + bit = RFKILL_BLOCK_SW_PREV; + + if (blocked) + rfkill->state |= bit; + else + rfkill->state &= ~bit; +} + +bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked) +{ + unsigned long flags; + bool prev, hwblock; + + BUG_ON(!rfkill); + + spin_lock_irqsave(&rfkill->lock, flags); + prev = !!(rfkill->state & RFKILL_BLOCK_SW); + __rfkill_set_sw_state(rfkill, blocked); + hwblock = !!(rfkill->state & RFKILL_BLOCK_HW); + blocked = blocked || hwblock; + spin_unlock_irqrestore(&rfkill->lock, flags); + + if (!rfkill->registered) + return blocked; + + if (prev != blocked && !hwblock) + schedule_work(&rfkill->uevent_work); + + rfkill_led_trigger_event(rfkill); + + return blocked; +} +EXPORT_SYMBOL(rfkill_set_sw_state); + +void rfkill_init_sw_state(struct rfkill *rfkill, bool blocked) +{ + unsigned long flags; + + BUG_ON(!rfkill); + BUG_ON(rfkill->registered); + + spin_lock_irqsave(&rfkill->lock, flags); + __rfkill_set_sw_state(rfkill, blocked); + rfkill->persistent = true; + spin_unlock_irqrestore(&rfkill->lock, flags); +} +EXPORT_SYMBOL(rfkill_init_sw_state); + +void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw) +{ + unsigned long flags; + bool swprev, hwprev; + + BUG_ON(!rfkill); + + spin_lock_irqsave(&rfkill->lock, flags); + + /* + * No need to care about prev/setblock ... this is for uevent only + * and that will get triggered by rfkill_set_block anyway. 
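+	 * swprev/hwprev are snapshotted only so that the uevent work is
+	 * scheduled when one of the bits really changes; for a device that
+	 * is not yet registered the state is simply recorded and the device
+	 * marked persistent.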
+ */ + swprev = !!(rfkill->state & RFKILL_BLOCK_SW); + hwprev = !!(rfkill->state & RFKILL_BLOCK_HW); + __rfkill_set_sw_state(rfkill, sw); + if (hw) + rfkill->state |= RFKILL_BLOCK_HW; + else + rfkill->state &= ~RFKILL_BLOCK_HW; + + spin_unlock_irqrestore(&rfkill->lock, flags); + + if (!rfkill->registered) { + rfkill->persistent = true; + } else { + if (swprev != sw || hwprev != hw) + schedule_work(&rfkill->uevent_work); + + rfkill_led_trigger_event(rfkill); + } +} +EXPORT_SYMBOL(rfkill_set_states); + +static ssize_t rfkill_name_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct rfkill *rfkill = to_rfkill(dev); + + return sprintf(buf, "%s\n", rfkill->name); +} + +static const char *rfkill_get_type_str(enum rfkill_type type) +{ + BUILD_BUG_ON(NUM_RFKILL_TYPES != RFKILL_TYPE_FM + 1); + + switch (type) { + case RFKILL_TYPE_WLAN: + return "wlan"; + case RFKILL_TYPE_BLUETOOTH: + return "bluetooth"; + case RFKILL_TYPE_UWB: + return "ultrawideband"; + case RFKILL_TYPE_WIMAX: + return "wimax"; + case RFKILL_TYPE_WWAN: + return "wwan"; + case RFKILL_TYPE_GPS: + return "gps"; + case RFKILL_TYPE_FM: + return "fm"; + default: + BUG(); + } +} + +static ssize_t rfkill_type_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct rfkill *rfkill = to_rfkill(dev); + + return sprintf(buf, "%s\n", rfkill_get_type_str(rfkill->type)); +} + +static ssize_t rfkill_idx_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct rfkill *rfkill = to_rfkill(dev); + + return sprintf(buf, "%d\n", rfkill->idx); +} + +static ssize_t rfkill_persistent_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct rfkill *rfkill = to_rfkill(dev); + + return sprintf(buf, "%d\n", rfkill->persistent); +} + +static ssize_t rfkill_hard_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct rfkill *rfkill = to_rfkill(dev); + + return sprintf(buf, "%d\n", (rfkill->state & RFKILL_BLOCK_HW) ? 1 : 0 ); +} + +static ssize_t rfkill_soft_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct rfkill *rfkill = to_rfkill(dev); + + return sprintf(buf, "%d\n", (rfkill->state & RFKILL_BLOCK_SW) ? 
1 : 0 ); +} + +static ssize_t rfkill_soft_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct rfkill *rfkill = to_rfkill(dev); + unsigned long state; + int err; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + err = strict_strtoul(buf, 0, &state); + if (err) + return err; + + if (state > 1 ) + return -EINVAL; + + mutex_lock(&rfkill_global_mutex); + rfkill_set_block(rfkill, state); + mutex_unlock(&rfkill_global_mutex); + + return err ?: count; +} + +static u8 user_state_from_blocked(unsigned long state) +{ + if (state & RFKILL_BLOCK_HW) + return RFKILL_USER_STATE_HARD_BLOCKED; + if (state & RFKILL_BLOCK_SW) + return RFKILL_USER_STATE_SOFT_BLOCKED; + + return RFKILL_USER_STATE_UNBLOCKED; +} + +static ssize_t rfkill_state_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct rfkill *rfkill = to_rfkill(dev); + + return sprintf(buf, "%d\n", user_state_from_blocked(rfkill->state)); +} + +static ssize_t rfkill_state_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct rfkill *rfkill = to_rfkill(dev); + unsigned long state; + int err; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + err = strict_strtoul(buf, 0, &state); + if (err) + return err; + + if (state != RFKILL_USER_STATE_SOFT_BLOCKED && + state != RFKILL_USER_STATE_UNBLOCKED) + return -EINVAL; + + mutex_lock(&rfkill_global_mutex); + rfkill_set_block(rfkill, state == RFKILL_USER_STATE_SOFT_BLOCKED); + mutex_unlock(&rfkill_global_mutex); + + return err ?: count; +} + +static ssize_t rfkill_claim_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", 0); +} + +static ssize_t rfkill_claim_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + return -EOPNOTSUPP; +} + +static struct device_attribute rfkill_dev_attrs[] = { + __ATTR(name, S_IRUGO, rfkill_name_show, NULL), + __ATTR(type, S_IRUGO, rfkill_type_show, NULL), + __ATTR(index, S_IRUGO, rfkill_idx_show, NULL), + __ATTR(persistent, S_IRUGO, rfkill_persistent_show, NULL), + __ATTR(state, S_IRUGO|S_IWUSR, rfkill_state_show, rfkill_state_store), + __ATTR(claim, S_IRUGO|S_IWUSR, rfkill_claim_show, rfkill_claim_store), + __ATTR(soft, S_IRUGO|S_IWUSR, rfkill_soft_show, rfkill_soft_store), + __ATTR(hard, S_IRUGO, rfkill_hard_show, NULL), + __ATTR_NULL +}; + +static void rfkill_release(struct device *dev) +{ + struct rfkill *rfkill = to_rfkill(dev); + + kfree(rfkill); +} + +static int rfkill_dev_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct rfkill *rfkill = to_rfkill(dev); + unsigned long flags; + u32 state; + int error; + + error = add_uevent_var(env, "RFKILL_NAME=%s", rfkill->name); + if (error) + return error; + error = add_uevent_var(env, "RFKILL_TYPE=%s", + rfkill_get_type_str(rfkill->type)); + if (error) + return error; + spin_lock_irqsave(&rfkill->lock, flags); + state = rfkill->state; + spin_unlock_irqrestore(&rfkill->lock, flags); + error = add_uevent_var(env, "RFKILL_STATE=%d", + user_state_from_blocked(state)); + return error; +} + +void rfkill_pause_polling(struct rfkill *rfkill) +{ + BUG_ON(!rfkill); + + if (!rfkill->ops->poll) + return; + + cancel_delayed_work_sync(&rfkill->poll_work); +} +EXPORT_SYMBOL(rfkill_pause_polling); + +#ifdef CONFIG_RFKILL_PM +void rfkill_resume_polling(struct rfkill *rfkill) +{ + BUG_ON(!rfkill); + + if (!rfkill->ops->poll) + return; + + schedule_work(&rfkill->poll_work.work); +} 
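/*
 * Illustrative userspace sketch, not part of the kernel patch itself: how the
 * "soft" sysfs attribute handled by rfkill_soft_store() above is typically
 * toggled. The class name "rfkill" and the attribute name come from
 * rfkill_class and rfkill_dev_attrs in this file; the device index 0 in the
 * path is an assumption for the example, and the write requires CAP_NET_ADMIN.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* assumed path: first rfkill device registered on this system */
	int fd = open("/sys/class/rfkill/rfkill0/soft", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* "1" soft-blocks the radio, "0" unblocks it; values > 1 get -EINVAL */
	if (write(fd, "1", 1) != 1)
		perror("write");

	close(fd);
	return 0;
}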
+EXPORT_SYMBOL(rfkill_resume_polling); + +static int rfkill_suspend(struct device *dev, pm_message_t state) +{ + struct rfkill *rfkill = to_rfkill(dev); + + rfkill_pause_polling(rfkill); + + return 0; +} + +static int rfkill_resume(struct device *dev) +{ + struct rfkill *rfkill = to_rfkill(dev); + bool cur; + + if (!rfkill->persistent) { + cur = !!(rfkill->state & RFKILL_BLOCK_SW); + rfkill_set_block(rfkill, cur); + } + + rfkill_resume_polling(rfkill); + + return 0; +} +#endif + +static struct class rfkill_class = { + .name = "rfkill", + .dev_release = rfkill_release, + .dev_attrs = rfkill_dev_attrs, + .dev_uevent = rfkill_dev_uevent, +#ifdef CONFIG_RFKILL_PM + .suspend = rfkill_suspend, + .resume = rfkill_resume, +#endif +}; + +bool rfkill_blocked(struct rfkill *rfkill) +{ + unsigned long flags; + u32 state; + + spin_lock_irqsave(&rfkill->lock, flags); + state = rfkill->state; + spin_unlock_irqrestore(&rfkill->lock, flags); + + return !!(state & RFKILL_BLOCK_ANY); +} +EXPORT_SYMBOL(rfkill_blocked); + + +struct rfkill * __must_check rfkill_alloc(const char *name, + struct device *parent, + const enum rfkill_type type, + const struct rfkill_ops *ops, + void *ops_data) +{ + struct rfkill *rfkill; + struct device *dev; + + if (WARN_ON(!ops)) + return NULL; + + if (WARN_ON(!ops->set_block)) + return NULL; + + if (WARN_ON(!name)) + return NULL; + + if (WARN_ON(type == RFKILL_TYPE_ALL || type >= NUM_RFKILL_TYPES)) + return NULL; + + rfkill = kzalloc(sizeof(*rfkill), GFP_KERNEL); + if (!rfkill) + return NULL; + + spin_lock_init(&rfkill->lock); + INIT_LIST_HEAD(&rfkill->node); + rfkill->type = type; + rfkill->name = name; + rfkill->ops = ops; + rfkill->data = ops_data; + + dev = &rfkill->dev; + dev->class = &rfkill_class; + dev->parent = parent; + device_initialize(dev); + + return rfkill; +} +EXPORT_SYMBOL(rfkill_alloc); + +static void rfkill_poll(struct work_struct *work) +{ + struct rfkill *rfkill; + + rfkill = container_of(work, struct rfkill, poll_work.work); + + /* + * Poll hardware state -- driver will use one of the + * rfkill_set{,_hw,_sw}_state functions and use its + * return value to update the current status. 
+ */ + rfkill->ops->poll(rfkill, rfkill->data); + + schedule_delayed_work(&rfkill->poll_work, + round_jiffies_relative(POLL_INTERVAL)); +} + +static void rfkill_uevent_work(struct work_struct *work) +{ + struct rfkill *rfkill; + + rfkill = container_of(work, struct rfkill, uevent_work); + + mutex_lock(&rfkill_global_mutex); + rfkill_event(rfkill); + mutex_unlock(&rfkill_global_mutex); +} + +static void rfkill_sync_work(struct work_struct *work) +{ + struct rfkill *rfkill; + bool cur; + + rfkill = container_of(work, struct rfkill, sync_work); + + mutex_lock(&rfkill_global_mutex); + cur = rfkill_global_states[rfkill->type].cur; + rfkill_set_block(rfkill, cur); + mutex_unlock(&rfkill_global_mutex); +} + +int __must_check rfkill_register(struct rfkill *rfkill) +{ + static unsigned long rfkill_no; + struct device *dev = &rfkill->dev; + int error; + + BUG_ON(!rfkill); + + mutex_lock(&rfkill_global_mutex); + + if (rfkill->registered) { + error = -EALREADY; + goto unlock; + } + + rfkill->idx = rfkill_no; + dev_set_name(dev, "rfkill%lu", rfkill_no); + rfkill_no++; + + list_add_tail(&rfkill->node, &rfkill_list); + + error = device_add(dev); + if (error) + goto remove; + + error = rfkill_led_trigger_register(rfkill); + if (error) + goto devdel; + + rfkill->registered = true; + + INIT_DELAYED_WORK(&rfkill->poll_work, rfkill_poll); + INIT_WORK(&rfkill->uevent_work, rfkill_uevent_work); + INIT_WORK(&rfkill->sync_work, rfkill_sync_work); + + if (rfkill->ops->poll) + schedule_delayed_work(&rfkill->poll_work, + round_jiffies_relative(POLL_INTERVAL)); + + if (!rfkill->persistent || rfkill_epo_lock_active) { + schedule_work(&rfkill->sync_work); + } else { +#ifdef CONFIG_RFKILL_INPUT + bool soft_blocked = !!(rfkill->state & RFKILL_BLOCK_SW); + + if (!atomic_read(&rfkill_input_disabled)) + __rfkill_switch_all(rfkill->type, soft_blocked); +#endif + } + + rfkill_send_events(rfkill, RFKILL_OP_ADD); + + mutex_unlock(&rfkill_global_mutex); + return 0; + + devdel: + device_del(&rfkill->dev); + remove: + list_del_init(&rfkill->node); + unlock: + mutex_unlock(&rfkill_global_mutex); + return error; +} +EXPORT_SYMBOL(rfkill_register); + +void rfkill_unregister(struct rfkill *rfkill) +{ + BUG_ON(!rfkill); + + if (rfkill->ops->poll) + cancel_delayed_work_sync(&rfkill->poll_work); + + cancel_work_sync(&rfkill->uevent_work); + cancel_work_sync(&rfkill->sync_work); + + rfkill->registered = false; + + device_del(&rfkill->dev); + + mutex_lock(&rfkill_global_mutex); + rfkill_send_events(rfkill, RFKILL_OP_DEL); + list_del_init(&rfkill->node); + mutex_unlock(&rfkill_global_mutex); + + rfkill_led_trigger_unregister(rfkill); +} +EXPORT_SYMBOL(rfkill_unregister); + +void rfkill_destroy(struct rfkill *rfkill) +{ + if (rfkill) + put_device(&rfkill->dev); +} +EXPORT_SYMBOL(rfkill_destroy); + +static int rfkill_fop_open(struct inode *inode, struct file *file) +{ + struct rfkill_data *data; + struct rfkill *rfkill; + struct rfkill_int_event *ev, *tmp; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + INIT_LIST_HEAD(&data->events); + mutex_init(&data->mtx); + init_waitqueue_head(&data->read_wait); + + mutex_lock(&rfkill_global_mutex); + mutex_lock(&data->mtx); + /* + * start getting events from elsewhere but hold mtx to get + * startup events added first + */ + + list_for_each_entry(rfkill, &rfkill_list, node) { + ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) + goto free; + rfkill_fill_event(&ev->ev, rfkill, RFKILL_OP_ADD); + list_add_tail(&ev->list, &data->events); + } + list_add(&data->list, 
&rfkill_fds); + mutex_unlock(&data->mtx); + mutex_unlock(&rfkill_global_mutex); + + file->private_data = data; + + return nonseekable_open(inode, file); + + free: + mutex_unlock(&data->mtx); + mutex_unlock(&rfkill_global_mutex); + mutex_destroy(&data->mtx); + list_for_each_entry_safe(ev, tmp, &data->events, list) + kfree(ev); + kfree(data); + return -ENOMEM; +} + +static unsigned int rfkill_fop_poll(struct file *file, poll_table *wait) +{ + struct rfkill_data *data = file->private_data; + unsigned int res = POLLOUT | POLLWRNORM; + + poll_wait(file, &data->read_wait, wait); + + mutex_lock(&data->mtx); + if (!list_empty(&data->events)) + res = POLLIN | POLLRDNORM; + mutex_unlock(&data->mtx); + + return res; +} + +static bool rfkill_readable(struct rfkill_data *data) +{ + bool r; + + mutex_lock(&data->mtx); + r = !list_empty(&data->events); + mutex_unlock(&data->mtx); + + return r; +} + +static ssize_t rfkill_fop_read(struct file *file, char __user *buf, + size_t count, loff_t *pos) +{ + struct rfkill_data *data = file->private_data; + struct rfkill_int_event *ev; + unsigned long sz; + int ret; + + mutex_lock(&data->mtx); + + while (list_empty(&data->events)) { + if (file->f_flags & O_NONBLOCK) { + ret = -EAGAIN; + goto out; + } + mutex_unlock(&data->mtx); + ret = wait_event_interruptible(data->read_wait, + rfkill_readable(data)); + mutex_lock(&data->mtx); + + if (ret) + goto out; + } + + ev = list_first_entry(&data->events, struct rfkill_int_event, + list); + + sz = min_t(unsigned long, sizeof(ev->ev), count); + ret = sz; + if (copy_to_user(buf, &ev->ev, sz)) + ret = -EFAULT; + + list_del(&ev->list); + kfree(ev); + out: + mutex_unlock(&data->mtx); + return ret; +} + +static ssize_t rfkill_fop_write(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + struct rfkill *rfkill; + struct rfkill_event ev; + + /* we don't need the 'hard' variable but accept it */ + if (count < RFKILL_EVENT_SIZE_V1 - 1) + return -EINVAL; + + /* + * Copy as much data as we can accept into our 'ev' buffer, + * but tell userspace how much we've copied so it can determine + * our API version even in a write() call, if it cares. 
+ */ + count = min(count, sizeof(ev)); + if (copy_from_user(&ev, buf, count)) + return -EFAULT; + + if (ev.op != RFKILL_OP_CHANGE && ev.op != RFKILL_OP_CHANGE_ALL) + return -EINVAL; + + if (ev.type >= NUM_RFKILL_TYPES) + return -EINVAL; + + mutex_lock(&rfkill_global_mutex); + + if (ev.op == RFKILL_OP_CHANGE_ALL) { + if (ev.type == RFKILL_TYPE_ALL) { + enum rfkill_type i; + for (i = 0; i < NUM_RFKILL_TYPES; i++) + rfkill_global_states[i].cur = ev.soft; + } else { + rfkill_global_states[ev.type].cur = ev.soft; + } + } + + list_for_each_entry(rfkill, &rfkill_list, node) { + if (rfkill->idx != ev.idx && ev.op != RFKILL_OP_CHANGE_ALL) + continue; + + if (rfkill->type != ev.type && ev.type != RFKILL_TYPE_ALL) + continue; + + rfkill_set_block(rfkill, ev.soft); + } + mutex_unlock(&rfkill_global_mutex); + + return count; +} + +static int rfkill_fop_release(struct inode *inode, struct file *file) +{ + struct rfkill_data *data = file->private_data; + struct rfkill_int_event *ev, *tmp; + + mutex_lock(&rfkill_global_mutex); + list_del(&data->list); + mutex_unlock(&rfkill_global_mutex); + + mutex_destroy(&data->mtx); + list_for_each_entry_safe(ev, tmp, &data->events, list) + kfree(ev); + +#ifdef CONFIG_RFKILL_INPUT + if (data->input_handler) + if (atomic_dec_return(&rfkill_input_disabled) == 0) + printk(KERN_DEBUG "rfkill: input handler enabled\n"); +#endif + + kfree(data); + + return 0; +} + +#ifdef CONFIG_RFKILL_INPUT +static long rfkill_fop_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct rfkill_data *data = file->private_data; + + if (_IOC_TYPE(cmd) != RFKILL_IOC_MAGIC) + return -ENOSYS; + + if (_IOC_NR(cmd) != RFKILL_IOC_NOINPUT) + return -ENOSYS; + + mutex_lock(&data->mtx); + + if (!data->input_handler) { + if (atomic_inc_return(&rfkill_input_disabled) == 1) + printk(KERN_DEBUG "rfkill: input handler disabled\n"); + data->input_handler = true; + } + + mutex_unlock(&data->mtx); + + return 0; +} +#endif + +static const struct file_operations rfkill_fops = { + .owner = THIS_MODULE, + .open = rfkill_fop_open, + .read = rfkill_fop_read, + .write = rfkill_fop_write, + .poll = rfkill_fop_poll, + .release = rfkill_fop_release, +#ifdef CONFIG_RFKILL_INPUT + .unlocked_ioctl = rfkill_fop_ioctl, + .compat_ioctl = rfkill_fop_ioctl, +#endif + .llseek = no_llseek, +}; + +static struct miscdevice rfkill_miscdev = { + .name = "rfkill", + .fops = &rfkill_fops, + .minor = MISC_DYNAMIC_MINOR, +}; + +static int __init rfkill_init(void) +{ + int error; + int i; + + for (i = 0; i < NUM_RFKILL_TYPES; i++) + rfkill_global_states[i].cur = !rfkill_default_state; + + error = class_register(&rfkill_class); + if (error) + goto out; + + error = misc_register(&rfkill_miscdev); + if (error) { + class_unregister(&rfkill_class); + goto out; + } + +#ifdef CONFIG_RFKILL_INPUT + error = rfkill_handler_init(); + if (error) { + misc_deregister(&rfkill_miscdev); + class_unregister(&rfkill_class); + goto out; + } +#endif + + out: + return error; +} +subsys_initcall(rfkill_init); + +static void __exit rfkill_exit(void) +{ +#ifdef CONFIG_RFKILL_INPUT + rfkill_handler_exit(); +#endif + misc_deregister(&rfkill_miscdev); + class_unregister(&rfkill_class); +} +module_exit(rfkill_exit); diff --git a/net/rfkill/input.c b/net/rfkill/input.c new file mode 100644 index 00000000..1bca6d49 --- /dev/null +++ b/net/rfkill/input.c @@ -0,0 +1,350 @@ +/* + * Input layer to RF Kill interface connector + * + * Copyright (c) 2007 Dmitry Torokhov + * Copyright 2009 Johannes Berg + * + * This program is free software; you can 
redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * If you ever run into a situation in which you have a SW_ type rfkill + * input device, then you can revive code that was removed in the patch + * "rfkill-input: remove unused code". + */ + +#include +#include +#include +#include +#include +#include + +#include "rfkill.h" + +enum rfkill_input_master_mode { + RFKILL_INPUT_MASTER_UNLOCK = 0, + RFKILL_INPUT_MASTER_RESTORE = 1, + RFKILL_INPUT_MASTER_UNBLOCKALL = 2, + NUM_RFKILL_INPUT_MASTER_MODES +}; + +/* Delay (in ms) between consecutive switch ops */ +#define RFKILL_OPS_DELAY 200 + +static enum rfkill_input_master_mode rfkill_master_switch_mode = + RFKILL_INPUT_MASTER_UNBLOCKALL; +module_param_named(master_switch_mode, rfkill_master_switch_mode, uint, 0); +MODULE_PARM_DESC(master_switch_mode, + "SW_RFKILL_ALL ON should: 0=do nothing (only unlock); 1=restore; 2=unblock all"); + +static spinlock_t rfkill_op_lock; +static bool rfkill_op_pending; +static unsigned long rfkill_sw_pending[BITS_TO_LONGS(NUM_RFKILL_TYPES)]; +static unsigned long rfkill_sw_state[BITS_TO_LONGS(NUM_RFKILL_TYPES)]; + +enum rfkill_sched_op { + RFKILL_GLOBAL_OP_EPO = 0, + RFKILL_GLOBAL_OP_RESTORE, + RFKILL_GLOBAL_OP_UNLOCK, + RFKILL_GLOBAL_OP_UNBLOCK, +}; + +static enum rfkill_sched_op rfkill_master_switch_op; +static enum rfkill_sched_op rfkill_op; + +static void __rfkill_handle_global_op(enum rfkill_sched_op op) +{ + unsigned int i; + + switch (op) { + case RFKILL_GLOBAL_OP_EPO: + rfkill_epo(); + break; + case RFKILL_GLOBAL_OP_RESTORE: + rfkill_restore_states(); + break; + case RFKILL_GLOBAL_OP_UNLOCK: + rfkill_remove_epo_lock(); + break; + case RFKILL_GLOBAL_OP_UNBLOCK: + rfkill_remove_epo_lock(); + for (i = 0; i < NUM_RFKILL_TYPES; i++) + rfkill_switch_all(i, false); + break; + default: + /* memory corruption or bug, fail safely */ + rfkill_epo(); + WARN(1, "Unknown requested operation %d! " + "rfkill Emergency Power Off activated\n", + op); + } +} + +static void __rfkill_handle_normal_op(const enum rfkill_type type, + const bool complement) +{ + bool blocked; + + blocked = rfkill_get_global_sw_state(type); + if (complement) + blocked = !blocked; + + rfkill_switch_all(type, blocked); +} + +static void rfkill_op_handler(struct work_struct *work) +{ + unsigned int i; + bool c; + + spin_lock_irq(&rfkill_op_lock); + do { + if (rfkill_op_pending) { + enum rfkill_sched_op op = rfkill_op; + rfkill_op_pending = false; + memset(rfkill_sw_pending, 0, + sizeof(rfkill_sw_pending)); + spin_unlock_irq(&rfkill_op_lock); + + __rfkill_handle_global_op(op); + + spin_lock_irq(&rfkill_op_lock); + + /* + * handle global ops first -- during unlocked period + * we might have gotten a new global op. + */ + if (rfkill_op_pending) + continue; + } + + if (rfkill_is_epo_lock_active()) + continue; + + for (i = 0; i < NUM_RFKILL_TYPES; i++) { + if (__test_and_clear_bit(i, rfkill_sw_pending)) { + c = __test_and_clear_bit(i, rfkill_sw_state); + spin_unlock_irq(&rfkill_op_lock); + + __rfkill_handle_normal_op(i, c); + + spin_lock_irq(&rfkill_op_lock); + } + } + } while (rfkill_op_pending); + spin_unlock_irq(&rfkill_op_lock); +} + +static DECLARE_DELAYED_WORK(rfkill_op_work, rfkill_op_handler); +static unsigned long rfkill_last_scheduled; + +static unsigned long rfkill_ratelimit(const unsigned long last) +{ + const unsigned long delay = msecs_to_jiffies(RFKILL_OPS_DELAY); + return time_after(jiffies, last + delay) ? 
0 : delay; +} + +static void rfkill_schedule_ratelimited(void) +{ + if (delayed_work_pending(&rfkill_op_work)) + return; + schedule_delayed_work(&rfkill_op_work, + rfkill_ratelimit(rfkill_last_scheduled)); + rfkill_last_scheduled = jiffies; +} + +static void rfkill_schedule_global_op(enum rfkill_sched_op op) +{ + unsigned long flags; + + spin_lock_irqsave(&rfkill_op_lock, flags); + rfkill_op = op; + rfkill_op_pending = true; + if (op == RFKILL_GLOBAL_OP_EPO && !rfkill_is_epo_lock_active()) { + /* bypass the limiter for EPO */ + cancel_delayed_work(&rfkill_op_work); + schedule_delayed_work(&rfkill_op_work, 0); + rfkill_last_scheduled = jiffies; + } else + rfkill_schedule_ratelimited(); + spin_unlock_irqrestore(&rfkill_op_lock, flags); +} + +static void rfkill_schedule_toggle(enum rfkill_type type) +{ + unsigned long flags; + + if (rfkill_is_epo_lock_active()) + return; + + spin_lock_irqsave(&rfkill_op_lock, flags); + if (!rfkill_op_pending) { + __set_bit(type, rfkill_sw_pending); + __change_bit(type, rfkill_sw_state); + rfkill_schedule_ratelimited(); + } + spin_unlock_irqrestore(&rfkill_op_lock, flags); +} + +static void rfkill_schedule_evsw_rfkillall(int state) +{ + if (state) + rfkill_schedule_global_op(rfkill_master_switch_op); + else + rfkill_schedule_global_op(RFKILL_GLOBAL_OP_EPO); +} + +static void rfkill_event(struct input_handle *handle, unsigned int type, + unsigned int code, int data) +{ + if (type == EV_KEY && data == 1) { + switch (code) { + case KEY_WLAN: + rfkill_schedule_toggle(RFKILL_TYPE_WLAN); + break; + case KEY_BLUETOOTH: + rfkill_schedule_toggle(RFKILL_TYPE_BLUETOOTH); + break; + case KEY_UWB: + rfkill_schedule_toggle(RFKILL_TYPE_UWB); + break; + case KEY_WIMAX: + rfkill_schedule_toggle(RFKILL_TYPE_WIMAX); + break; + case KEY_RFKILL: + rfkill_schedule_toggle(RFKILL_TYPE_ALL); + break; + } + } else if (type == EV_SW && code == SW_RFKILL_ALL) + rfkill_schedule_evsw_rfkillall(data); +} + +static int rfkill_connect(struct input_handler *handler, struct input_dev *dev, + const struct input_device_id *id) +{ + struct input_handle *handle; + int error; + + handle = kzalloc(sizeof(struct input_handle), GFP_KERNEL); + if (!handle) + return -ENOMEM; + + handle->dev = dev; + handle->handler = handler; + handle->name = "rfkill"; + + /* causes rfkill_start() to be called */ + error = input_register_handle(handle); + if (error) + goto err_free_handle; + + error = input_open_device(handle); + if (error) + goto err_unregister_handle; + + return 0; + + err_unregister_handle: + input_unregister_handle(handle); + err_free_handle: + kfree(handle); + return error; +} + +static void rfkill_start(struct input_handle *handle) +{ + /* + * Take event_lock to guard against configuration changes, we + * should be able to deal with concurrency with rfkill_event() + * just fine (which event_lock will also avoid). 
+ */ + spin_lock_irq(&handle->dev->event_lock); + + if (test_bit(EV_SW, handle->dev->evbit) && + test_bit(SW_RFKILL_ALL, handle->dev->swbit)) + rfkill_schedule_evsw_rfkillall(test_bit(SW_RFKILL_ALL, + handle->dev->sw)); + + spin_unlock_irq(&handle->dev->event_lock); +} + +static void rfkill_disconnect(struct input_handle *handle) +{ + input_close_device(handle); + input_unregister_handle(handle); + kfree(handle); +} + +static const struct input_device_id rfkill_ids[] = { + { + .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, + .evbit = { BIT_MASK(EV_KEY) }, + .keybit = { [BIT_WORD(KEY_WLAN)] = BIT_MASK(KEY_WLAN) }, + }, + { + .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, + .evbit = { BIT_MASK(EV_KEY) }, + .keybit = { [BIT_WORD(KEY_BLUETOOTH)] = BIT_MASK(KEY_BLUETOOTH) }, + }, + { + .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, + .evbit = { BIT_MASK(EV_KEY) }, + .keybit = { [BIT_WORD(KEY_UWB)] = BIT_MASK(KEY_UWB) }, + }, + { + .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, + .evbit = { BIT_MASK(EV_KEY) }, + .keybit = { [BIT_WORD(KEY_WIMAX)] = BIT_MASK(KEY_WIMAX) }, + }, + { + .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT, + .evbit = { BIT_MASK(EV_KEY) }, + .keybit = { [BIT_WORD(KEY_RFKILL)] = BIT_MASK(KEY_RFKILL) }, + }, + { + .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_SWBIT, + .evbit = { BIT(EV_SW) }, + .swbit = { [BIT_WORD(SW_RFKILL_ALL)] = BIT_MASK(SW_RFKILL_ALL) }, + }, + { } +}; + +static struct input_handler rfkill_handler = { + .name = "rfkill", + .event = rfkill_event, + .connect = rfkill_connect, + .start = rfkill_start, + .disconnect = rfkill_disconnect, + .id_table = rfkill_ids, +}; + +int __init rfkill_handler_init(void) +{ + switch (rfkill_master_switch_mode) { + case RFKILL_INPUT_MASTER_UNBLOCKALL: + rfkill_master_switch_op = RFKILL_GLOBAL_OP_UNBLOCK; + break; + case RFKILL_INPUT_MASTER_RESTORE: + rfkill_master_switch_op = RFKILL_GLOBAL_OP_RESTORE; + break; + case RFKILL_INPUT_MASTER_UNLOCK: + rfkill_master_switch_op = RFKILL_GLOBAL_OP_UNLOCK; + break; + default: + return -EINVAL; + } + + spin_lock_init(&rfkill_op_lock); + + /* Avoid delay at first schedule */ + rfkill_last_scheduled = + jiffies - msecs_to_jiffies(RFKILL_OPS_DELAY) - 1; + return input_register_handler(&rfkill_handler); +} + +void __exit rfkill_handler_exit(void) +{ + input_unregister_handler(&rfkill_handler); + cancel_delayed_work_sync(&rfkill_op_work); +} diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c new file mode 100644 index 00000000..256c5ddd --- /dev/null +++ b/net/rfkill/rfkill-gpio.c @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2011, NVIDIA Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +enum rfkill_gpio_clk_state { + UNSPECIFIED = 0, + PWR_ENABLED, + PWR_DISABLED +}; + +#define PWR_CLK_SET(_RF, _EN) \ + ((_RF)->pwr_clk_enabled = (!(_EN) ? PWR_ENABLED : PWR_DISABLED)) +#define PWR_CLK_ENABLED(_RF) ((_RF)->pwr_clk_enabled == PWR_ENABLED) +#define PWR_CLK_DISABLED(_RF) ((_RF)->pwr_clk_enabled != PWR_ENABLED) + +struct rfkill_gpio_data { + struct rfkill_gpio_platform_data *pdata; + struct rfkill *rfkill_dev; + char *reset_name; + char *shutdown_name; + enum rfkill_gpio_clk_state pwr_clk_enabled; + struct clk *pwr_clk; +}; + +static int rfkill_gpio_set_power(void *data, bool blocked) +{ + struct rfkill_gpio_data *rfkill = data; + + if (blocked) { + if (gpio_is_valid(rfkill->pdata->shutdown_gpio)) + gpio_direction_output(rfkill->pdata->shutdown_gpio, 0); + if (gpio_is_valid(rfkill->pdata->reset_gpio)) + gpio_direction_output(rfkill->pdata->reset_gpio, 0); + if (rfkill->pwr_clk && PWR_CLK_ENABLED(rfkill)) + clk_disable(rfkill->pwr_clk); + } else { + if (rfkill->pwr_clk && PWR_CLK_DISABLED(rfkill)) + clk_enable(rfkill->pwr_clk); + if (gpio_is_valid(rfkill->pdata->reset_gpio)) + gpio_direction_output(rfkill->pdata->reset_gpio, 1); + if (gpio_is_valid(rfkill->pdata->shutdown_gpio)) + gpio_direction_output(rfkill->pdata->shutdown_gpio, 1); + } + + if (rfkill->pwr_clk) + PWR_CLK_SET(rfkill, blocked); + + return 0; +} + +static const struct rfkill_ops rfkill_gpio_ops = { + .set_block = rfkill_gpio_set_power, +}; + +static int rfkill_gpio_probe(struct platform_device *pdev) +{ + struct rfkill_gpio_data *rfkill; + struct rfkill_gpio_platform_data *pdata = pdev->dev.platform_data; + int ret = 0; + int len = 0; + + if (!pdata) { + pr_warn("%s: No platform data specified\n", __func__); + return -EINVAL; + } + + /* make sure at-least one of the GPIO is defined and that + * a name is specified for this instance */ + if (!pdata->name || (!gpio_is_valid(pdata->reset_gpio) && + !gpio_is_valid(pdata->shutdown_gpio))) { + pr_warn("%s: invalid platform data\n", __func__); + return -EINVAL; + } + + rfkill = kzalloc(sizeof(*rfkill), GFP_KERNEL); + if (!rfkill) + return -ENOMEM; + + rfkill->pdata = pdata; + + len = strlen(pdata->name); + rfkill->reset_name = kzalloc(len + 7, GFP_KERNEL); + if (!rfkill->reset_name) { + ret = -ENOMEM; + goto fail_alloc; + } + + rfkill->shutdown_name = kzalloc(len + 10, GFP_KERNEL); + if (!rfkill->shutdown_name) { + ret = -ENOMEM; + goto fail_reset_name; + } + + snprintf(rfkill->reset_name, len + 6 , "%s_reset", pdata->name); + snprintf(rfkill->shutdown_name, len + 9, "%s_shutdown", pdata->name); + + if (pdata->power_clk_name) { + rfkill->pwr_clk = clk_get(&pdev->dev, pdata->power_clk_name); + if (IS_ERR(rfkill->pwr_clk)) { + pr_warn("%s: can't find pwr_clk.\n", __func__); + goto fail_shutdown_name; + } + } + + if (gpio_is_valid(pdata->reset_gpio)) { + ret = gpio_request(pdata->reset_gpio, rfkill->reset_name); + if (ret) { + pr_warn("%s: failed to get reset gpio.\n", __func__); + goto fail_clock; + } + } + + if (gpio_is_valid(pdata->shutdown_gpio)) { + ret = gpio_request(pdata->shutdown_gpio, rfkill->shutdown_name); + if (ret) { + pr_warn("%s: failed to get shutdown gpio.\n", __func__); + goto fail_reset; + } + } + + rfkill->rfkill_dev = rfkill_alloc(pdata->name, &pdev->dev, pdata->type, + &rfkill_gpio_ops, rfkill); + if (!rfkill->rfkill_dev) + goto fail_shutdown; + + ret = rfkill_register(rfkill->rfkill_dev); + if (ret < 0) + goto fail_rfkill; + + 
platform_set_drvdata(pdev, rfkill); + + dev_info(&pdev->dev, "%s device registered.\n", pdata->name); + + return 0; + +fail_rfkill: + rfkill_destroy(rfkill->rfkill_dev); +fail_shutdown: + if (gpio_is_valid(pdata->shutdown_gpio)) + gpio_free(pdata->shutdown_gpio); +fail_reset: + if (gpio_is_valid(pdata->reset_gpio)) + gpio_free(pdata->reset_gpio); +fail_clock: + if (rfkill->pwr_clk) + clk_put(rfkill->pwr_clk); +fail_shutdown_name: + kfree(rfkill->shutdown_name); +fail_reset_name: + kfree(rfkill->reset_name); +fail_alloc: + kfree(rfkill); + + return ret; +} + +static int rfkill_gpio_remove(struct platform_device *pdev) +{ + struct rfkill_gpio_data *rfkill = platform_get_drvdata(pdev); + + rfkill_unregister(rfkill->rfkill_dev); + rfkill_destroy(rfkill->rfkill_dev); + if (gpio_is_valid(rfkill->pdata->shutdown_gpio)) + gpio_free(rfkill->pdata->shutdown_gpio); + if (gpio_is_valid(rfkill->pdata->reset_gpio)) + gpio_free(rfkill->pdata->reset_gpio); + if (rfkill->pwr_clk && PWR_CLK_ENABLED(rfkill)) + clk_disable(rfkill->pwr_clk); + if (rfkill->pwr_clk) + clk_put(rfkill->pwr_clk); + kfree(rfkill->shutdown_name); + kfree(rfkill->reset_name); + kfree(rfkill); + + return 0; +} + +static struct platform_driver rfkill_gpio_driver = { + .probe = rfkill_gpio_probe, + .remove = __devexit_p(rfkill_gpio_remove), + .driver = { + .name = "rfkill_gpio", + .owner = THIS_MODULE, + }, +}; + +static int __init rfkill_gpio_init(void) +{ + return platform_driver_register(&rfkill_gpio_driver); +} + +static void __exit rfkill_gpio_exit(void) +{ + platform_driver_unregister(&rfkill_gpio_driver); +} + +module_init(rfkill_gpio_init); +module_exit(rfkill_gpio_exit); + +MODULE_DESCRIPTION("gpio rfkill"); +MODULE_AUTHOR("NVIDIA"); +MODULE_LICENSE("GPL"); diff --git a/net/rfkill/rfkill-regulator.c b/net/rfkill/rfkill-regulator.c new file mode 100644 index 00000000..18dc512a --- /dev/null +++ b/net/rfkill/rfkill-regulator.c @@ -0,0 +1,164 @@ +/* + * rfkill-regulator.c - Regulator consumer driver for rfkill + * + * Copyright (C) 2009 Guiming Zhuo + * Copyright (C) 2011 Antonio Ospite + * + * Implementation inspired by leds-regulator driver. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include + +struct rfkill_regulator_data { + struct rfkill *rf_kill; + bool reg_enabled; + + struct regulator *vcc; +}; + +static int rfkill_regulator_set_block(void *data, bool blocked) +{ + struct rfkill_regulator_data *rfkill_data = data; + + pr_debug("%s: blocked: %d\n", __func__, blocked); + + if (blocked) { + if (rfkill_data->reg_enabled) { + regulator_disable(rfkill_data->vcc); + rfkill_data->reg_enabled = 0; + } + } else { + if (!rfkill_data->reg_enabled) { + regulator_enable(rfkill_data->vcc); + rfkill_data->reg_enabled = 1; + } + } + + pr_debug("%s: regulator_is_enabled after set_block: %d\n", __func__, + regulator_is_enabled(rfkill_data->vcc)); + + return 0; +} + +struct rfkill_ops rfkill_regulator_ops = { + .set_block = rfkill_regulator_set_block, +}; + +static int __devinit rfkill_regulator_probe(struct platform_device *pdev) +{ + struct rfkill_regulator_platform_data *pdata = pdev->dev.platform_data; + struct rfkill_regulator_data *rfkill_data; + struct regulator *vcc; + struct rfkill *rf_kill; + int ret = 0; + + if (pdata == NULL) { + dev_err(&pdev->dev, "no platform data\n"); + return -ENODEV; + } + + if (pdata->name == NULL || pdata->type == 0) { + dev_err(&pdev->dev, "invalid name or type in platform data\n"); + return -EINVAL; + } + + vcc = regulator_get_exclusive(&pdev->dev, "vrfkill"); + if (IS_ERR(vcc)) { + dev_err(&pdev->dev, "Cannot get vcc for %s\n", pdata->name); + ret = PTR_ERR(vcc); + goto out; + } + + rfkill_data = kzalloc(sizeof(*rfkill_data), GFP_KERNEL); + if (rfkill_data == NULL) { + ret = -ENOMEM; + goto err_data_alloc; + } + + rf_kill = rfkill_alloc(pdata->name, &pdev->dev, + pdata->type, + &rfkill_regulator_ops, rfkill_data); + if (rf_kill == NULL) { + dev_err(&pdev->dev, "Cannot alloc rfkill device\n"); + ret = -ENOMEM; + goto err_rfkill_alloc; + } + + if (regulator_is_enabled(vcc)) { + dev_dbg(&pdev->dev, "Regulator already enabled\n"); + rfkill_data->reg_enabled = 1; + } + rfkill_data->vcc = vcc; + rfkill_data->rf_kill = rf_kill; + + ret = rfkill_register(rf_kill); + if (ret) { + dev_err(&pdev->dev, "Cannot register rfkill device\n"); + goto err_rfkill_register; + } + + platform_set_drvdata(pdev, rfkill_data); + dev_info(&pdev->dev, "%s initialized\n", pdata->name); + + return 0; + +err_rfkill_register: + rfkill_destroy(rf_kill); +err_rfkill_alloc: + kfree(rfkill_data); +err_data_alloc: + regulator_put(vcc); +out: + return ret; +} + +static int __devexit rfkill_regulator_remove(struct platform_device *pdev) +{ + struct rfkill_regulator_data *rfkill_data = platform_get_drvdata(pdev); + struct rfkill *rf_kill = rfkill_data->rf_kill; + + rfkill_unregister(rf_kill); + rfkill_destroy(rf_kill); + regulator_put(rfkill_data->vcc); + kfree(rfkill_data); + + return 0; +} + +static struct platform_driver rfkill_regulator_driver = { + .probe = rfkill_regulator_probe, + .remove = __devexit_p(rfkill_regulator_remove), + .driver = { + .name = "rfkill-regulator", + .owner = THIS_MODULE, + }, +}; + +static int __init rfkill_regulator_init(void) +{ + return platform_driver_register(&rfkill_regulator_driver); +} +module_init(rfkill_regulator_init); + +static void __exit rfkill_regulator_exit(void) +{ + platform_driver_unregister(&rfkill_regulator_driver); +} +module_exit(rfkill_regulator_exit); + +MODULE_AUTHOR("Guiming Zhuo "); +MODULE_AUTHOR("Antonio Ospite "); +MODULE_DESCRIPTION("Regulator consumer driver for rfkill"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:rfkill-regulator"); diff 
--git a/net/rfkill/rfkill.h b/net/rfkill/rfkill.h new file mode 100644 index 00000000..d1117cb6 --- /dev/null +++ b/net/rfkill/rfkill.h @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2007 Ivo van Doorn + * Copyright 2009 Johannes Berg + */ + +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#ifndef __RFKILL_INPUT_H +#define __RFKILL_INPUT_H + +/* core code */ +void rfkill_switch_all(const enum rfkill_type type, bool blocked); +void rfkill_epo(void); +void rfkill_restore_states(void); +void rfkill_remove_epo_lock(void); +bool rfkill_is_epo_lock_active(void); +bool rfkill_get_global_sw_state(const enum rfkill_type type); + +/* input handler */ +int rfkill_handler_init(void); +void rfkill_handler_exit(void); + +#endif /* __RFKILL_INPUT_H */ diff --git a/net/rose/Makefile b/net/rose/Makefile new file mode 100644 index 00000000..fa248116 --- /dev/null +++ b/net/rose/Makefile @@ -0,0 +1,9 @@ +# +# Makefile for the Linux Rose (X.25 PLP) layer. +# + +obj-$(CONFIG_ROSE) += rose.o + +rose-y := af_rose.o rose_dev.o rose_in.o rose_link.o rose_loopback.o \ + rose_out.o rose_route.o rose_subr.o rose_timer.o +rose-$(CONFIG_SYSCTL) += sysctl_net_rose.o diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c new file mode 100644 index 00000000..f9ea925a --- /dev/null +++ b/net/rose/af_rose.c @@ -0,0 +1,1643 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) + * Copyright (C) Terry Dawson VK2KTJ (terry@animats.net) + * Copyright (C) Tomi Manninen OH2BNS (oh2bns@sral.fi) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int rose_ndevs = 10; + +int sysctl_rose_restart_request_timeout = ROSE_DEFAULT_T0; +int sysctl_rose_call_request_timeout = ROSE_DEFAULT_T1; +int sysctl_rose_reset_request_timeout = ROSE_DEFAULT_T2; +int sysctl_rose_clear_request_timeout = ROSE_DEFAULT_T3; +int sysctl_rose_no_activity_timeout = ROSE_DEFAULT_IDLE; +int sysctl_rose_ack_hold_back_timeout = ROSE_DEFAULT_HB; +int sysctl_rose_routing_control = ROSE_DEFAULT_ROUTING; +int sysctl_rose_link_fail_timeout = ROSE_DEFAULT_FAIL_TIMEOUT; +int sysctl_rose_maximum_vcs = ROSE_DEFAULT_MAXVC; +int sysctl_rose_window_size = ROSE_DEFAULT_WINDOW_SIZE; + +static HLIST_HEAD(rose_list); +static DEFINE_SPINLOCK(rose_list_lock); + +static const struct proto_ops rose_proto_ops; + +ax25_address rose_callsign; + +/* + * ROSE network devices are virtual network devices encapsulating ROSE + * frames into AX.25 which will be sent through an AX.25 device, so form a + * special "super class" of normal net devices; split their locks off into a + * separate class since they always nest. 
+ */ +static struct lock_class_key rose_netdev_xmit_lock_key; +static struct lock_class_key rose_netdev_addr_lock_key; + +static void rose_set_lockdep_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &rose_netdev_xmit_lock_key); +} + +static void rose_set_lockdep_key(struct net_device *dev) +{ + lockdep_set_class(&dev->addr_list_lock, &rose_netdev_addr_lock_key); + netdev_for_each_tx_queue(dev, rose_set_lockdep_one, NULL); +} + +/* + * Convert a ROSE address into text. + */ +char *rose2asc(char *buf, const rose_address *addr) +{ + if (addr->rose_addr[0] == 0x00 && addr->rose_addr[1] == 0x00 && + addr->rose_addr[2] == 0x00 && addr->rose_addr[3] == 0x00 && + addr->rose_addr[4] == 0x00) { + strcpy(buf, "*"); + } else { + sprintf(buf, "%02X%02X%02X%02X%02X", addr->rose_addr[0] & 0xFF, + addr->rose_addr[1] & 0xFF, + addr->rose_addr[2] & 0xFF, + addr->rose_addr[3] & 0xFF, + addr->rose_addr[4] & 0xFF); + } + + return buf; +} + +/* + * Compare two ROSE addresses, 0 == equal. + */ +int rosecmp(rose_address *addr1, rose_address *addr2) +{ + int i; + + for (i = 0; i < 5; i++) + if (addr1->rose_addr[i] != addr2->rose_addr[i]) + return 1; + + return 0; +} + +/* + * Compare two ROSE addresses for only mask digits, 0 == equal. + */ +int rosecmpm(rose_address *addr1, rose_address *addr2, unsigned short mask) +{ + unsigned int i, j; + + if (mask > 10) + return 1; + + for (i = 0; i < mask; i++) { + j = i / 2; + + if ((i % 2) != 0) { + if ((addr1->rose_addr[j] & 0x0F) != (addr2->rose_addr[j] & 0x0F)) + return 1; + } else { + if ((addr1->rose_addr[j] & 0xF0) != (addr2->rose_addr[j] & 0xF0)) + return 1; + } + } + + return 0; +} + +/* + * Socket removal during an interrupt is now safe. + */ +static void rose_remove_socket(struct sock *sk) +{ + spin_lock_bh(&rose_list_lock); + sk_del_node_init(sk); + spin_unlock_bh(&rose_list_lock); +} + +/* + * Kill all bound sockets on a broken link layer connection to a + * particular neighbour. + */ +void rose_kill_by_neigh(struct rose_neigh *neigh) +{ + struct sock *s; + struct hlist_node *node; + + spin_lock_bh(&rose_list_lock); + sk_for_each(s, node, &rose_list) { + struct rose_sock *rose = rose_sk(s); + + if (rose->neighbour == neigh) { + rose_disconnect(s, ENETUNREACH, ROSE_OUT_OF_ORDER, 0); + rose->neighbour->use--; + rose->neighbour = NULL; + } + } + spin_unlock_bh(&rose_list_lock); +} + +/* + * Kill all bound sockets on a dropped device. + */ +static void rose_kill_by_device(struct net_device *dev) +{ + struct sock *s; + struct hlist_node *node; + + spin_lock_bh(&rose_list_lock); + sk_for_each(s, node, &rose_list) { + struct rose_sock *rose = rose_sk(s); + + if (rose->device == dev) { + rose_disconnect(s, ENETUNREACH, ROSE_OUT_OF_ORDER, 0); + rose->neighbour->use--; + rose->device = NULL; + } + } + spin_unlock_bh(&rose_list_lock); +} + +/* + * Handle device status changes. + */ +static int rose_device_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = (struct net_device *)ptr; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + if (event != NETDEV_DOWN) + return NOTIFY_DONE; + + switch (dev->type) { + case ARPHRD_ROSE: + rose_kill_by_device(dev); + break; + case ARPHRD_AX25: + rose_link_device_down(dev); + rose_rt_device_down(dev); + break; + } + + return NOTIFY_DONE; +} + +/* + * Add a socket to the bound sockets list. 
+ */ +static void rose_insert_socket(struct sock *sk) +{ + + spin_lock_bh(&rose_list_lock); + sk_add_node(sk, &rose_list); + spin_unlock_bh(&rose_list_lock); +} + +/* + * Find a socket that wants to accept the Call Request we just + * received. + */ +static struct sock *rose_find_listener(rose_address *addr, ax25_address *call) +{ + struct sock *s; + struct hlist_node *node; + + spin_lock_bh(&rose_list_lock); + sk_for_each(s, node, &rose_list) { + struct rose_sock *rose = rose_sk(s); + + if (!rosecmp(&rose->source_addr, addr) && + !ax25cmp(&rose->source_call, call) && + !rose->source_ndigis && s->sk_state == TCP_LISTEN) + goto found; + } + + sk_for_each(s, node, &rose_list) { + struct rose_sock *rose = rose_sk(s); + + if (!rosecmp(&rose->source_addr, addr) && + !ax25cmp(&rose->source_call, &null_ax25_address) && + s->sk_state == TCP_LISTEN) + goto found; + } + s = NULL; +found: + spin_unlock_bh(&rose_list_lock); + return s; +} + +/* + * Find a connected ROSE socket given my LCI and device. + */ +struct sock *rose_find_socket(unsigned int lci, struct rose_neigh *neigh) +{ + struct sock *s; + struct hlist_node *node; + + spin_lock_bh(&rose_list_lock); + sk_for_each(s, node, &rose_list) { + struct rose_sock *rose = rose_sk(s); + + if (rose->lci == lci && rose->neighbour == neigh) + goto found; + } + s = NULL; +found: + spin_unlock_bh(&rose_list_lock); + return s; +} + +/* + * Find a unique LCI for a given device. + */ +unsigned int rose_new_lci(struct rose_neigh *neigh) +{ + int lci; + + if (neigh->dce_mode) { + for (lci = 1; lci <= sysctl_rose_maximum_vcs; lci++) + if (rose_find_socket(lci, neigh) == NULL && rose_route_free_lci(lci, neigh) == NULL) + return lci; + } else { + for (lci = sysctl_rose_maximum_vcs; lci > 0; lci--) + if (rose_find_socket(lci, neigh) == NULL && rose_route_free_lci(lci, neigh) == NULL) + return lci; + } + + return 0; +} + +/* + * Deferred destroy. + */ +void rose_destroy_socket(struct sock *); + +/* + * Handler for deferred kills. + */ +static void rose_destroy_timer(unsigned long data) +{ + rose_destroy_socket((struct sock *)data); +} + +/* + * This is called from user mode and the timers. Thus it protects itself + * against interrupt users but doesn't worry about being called during + * work. Once it is removed from the queue no interrupt or bottom half + * will touch it and we are (fairly 8-) ) safe. + */ +void rose_destroy_socket(struct sock *sk) +{ + struct sk_buff *skb; + + rose_remove_socket(sk); + rose_stop_heartbeat(sk); + rose_stop_idletimer(sk); + rose_stop_timer(sk); + + rose_clear_queues(sk); /* Flush the queues */ + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + if (skb->sk != sk) { /* A pending connection */ + /* Queue the unaccepted socket for death */ + sock_set_flag(skb->sk, SOCK_DEAD); + rose_start_heartbeat(skb->sk); + rose_sk(skb->sk)->state = ROSE_STATE_0; + } + + kfree_skb(skb); + } + + if (sk_has_allocations(sk)) { + /* Defer: outstanding buffers */ + setup_timer(&sk->sk_timer, rose_destroy_timer, + (unsigned long)sk); + sk->sk_timer.expires = jiffies + 10 * HZ; + add_timer(&sk->sk_timer); + } else + sock_put(sk); +} + +/* + * Handling for system calls applied via the various interfaces to a + * ROSE socket object. 
+ */ + +static int rose_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct rose_sock *rose = rose_sk(sk); + int opt; + + if (level != SOL_ROSE) + return -ENOPROTOOPT; + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(opt, (int __user *)optval)) + return -EFAULT; + + switch (optname) { + case ROSE_DEFER: + rose->defer = opt ? 1 : 0; + return 0; + + case ROSE_T1: + if (opt < 1) + return -EINVAL; + rose->t1 = opt * HZ; + return 0; + + case ROSE_T2: + if (opt < 1) + return -EINVAL; + rose->t2 = opt * HZ; + return 0; + + case ROSE_T3: + if (opt < 1) + return -EINVAL; + rose->t3 = opt * HZ; + return 0; + + case ROSE_HOLDBACK: + if (opt < 1) + return -EINVAL; + rose->hb = opt * HZ; + return 0; + + case ROSE_IDLE: + if (opt < 0) + return -EINVAL; + rose->idle = opt * 60 * HZ; + return 0; + + case ROSE_QBITINCL: + rose->qbitincl = opt ? 1 : 0; + return 0; + + default: + return -ENOPROTOOPT; + } +} + +static int rose_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct rose_sock *rose = rose_sk(sk); + int val = 0; + int len; + + if (level != SOL_ROSE) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + + if (len < 0) + return -EINVAL; + + switch (optname) { + case ROSE_DEFER: + val = rose->defer; + break; + + case ROSE_T1: + val = rose->t1 / HZ; + break; + + case ROSE_T2: + val = rose->t2 / HZ; + break; + + case ROSE_T3: + val = rose->t3 / HZ; + break; + + case ROSE_HOLDBACK: + val = rose->hb / HZ; + break; + + case ROSE_IDLE: + val = rose->idle / (60 * HZ); + break; + + case ROSE_QBITINCL: + val = rose->qbitincl; + break; + + default: + return -ENOPROTOOPT; + } + + len = min_t(unsigned int, len, sizeof(int)); + + if (put_user(len, optlen)) + return -EFAULT; + + return copy_to_user(optval, &val, len) ? 
-EFAULT : 0; +} + +static int rose_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + + if (sk->sk_state != TCP_LISTEN) { + struct rose_sock *rose = rose_sk(sk); + + rose->dest_ndigis = 0; + memset(&rose->dest_addr, 0, ROSE_ADDR_LEN); + memset(&rose->dest_call, 0, AX25_ADDR_LEN); + memset(rose->dest_digis, 0, AX25_ADDR_LEN * ROSE_MAX_DIGIS); + sk->sk_max_ack_backlog = backlog; + sk->sk_state = TCP_LISTEN; + return 0; + } + + return -EOPNOTSUPP; +} + +static struct proto rose_proto = { + .name = "ROSE", + .owner = THIS_MODULE, + .obj_size = sizeof(struct rose_sock), +}; + +static int rose_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + struct rose_sock *rose; + + if (!net_eq(net, &init_net)) + return -EAFNOSUPPORT; + + if (sock->type != SOCK_SEQPACKET || protocol != 0) + return -ESOCKTNOSUPPORT; + + sk = sk_alloc(net, PF_ROSE, GFP_ATOMIC, &rose_proto); + if (sk == NULL) + return -ENOMEM; + + rose = rose_sk(sk); + + sock_init_data(sock, sk); + + skb_queue_head_init(&rose->ack_queue); +#ifdef M_BIT + skb_queue_head_init(&rose->frag_queue); + rose->fraglen = 0; +#endif + + sock->ops = &rose_proto_ops; + sk->sk_protocol = protocol; + + init_timer(&rose->timer); + init_timer(&rose->idletimer); + + rose->t1 = msecs_to_jiffies(sysctl_rose_call_request_timeout); + rose->t2 = msecs_to_jiffies(sysctl_rose_reset_request_timeout); + rose->t3 = msecs_to_jiffies(sysctl_rose_clear_request_timeout); + rose->hb = msecs_to_jiffies(sysctl_rose_ack_hold_back_timeout); + rose->idle = msecs_to_jiffies(sysctl_rose_no_activity_timeout); + + rose->state = ROSE_STATE_0; + + return 0; +} + +static struct sock *rose_make_new(struct sock *osk) +{ + struct sock *sk; + struct rose_sock *rose, *orose; + + if (osk->sk_type != SOCK_SEQPACKET) + return NULL; + + sk = sk_alloc(sock_net(osk), PF_ROSE, GFP_ATOMIC, &rose_proto); + if (sk == NULL) + return NULL; + + rose = rose_sk(sk); + + sock_init_data(NULL, sk); + + skb_queue_head_init(&rose->ack_queue); +#ifdef M_BIT + skb_queue_head_init(&rose->frag_queue); + rose->fraglen = 0; +#endif + + sk->sk_type = osk->sk_type; + sk->sk_priority = osk->sk_priority; + sk->sk_protocol = osk->sk_protocol; + sk->sk_rcvbuf = osk->sk_rcvbuf; + sk->sk_sndbuf = osk->sk_sndbuf; + sk->sk_state = TCP_ESTABLISHED; + sock_copy_flags(sk, osk); + + init_timer(&rose->timer); + init_timer(&rose->idletimer); + + orose = rose_sk(osk); + rose->t1 = orose->t1; + rose->t2 = orose->t2; + rose->t3 = orose->t3; + rose->hb = orose->hb; + rose->idle = orose->idle; + rose->defer = orose->defer; + rose->device = orose->device; + rose->qbitincl = orose->qbitincl; + + return sk; +} + +static int rose_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct rose_sock *rose; + + if (sk == NULL) return 0; + + sock_hold(sk); + sock_orphan(sk); + lock_sock(sk); + rose = rose_sk(sk); + + switch (rose->state) { + case ROSE_STATE_0: + release_sock(sk); + rose_disconnect(sk, 0, -1, -1); + lock_sock(sk); + rose_destroy_socket(sk); + break; + + case ROSE_STATE_2: + rose->neighbour->use--; + release_sock(sk); + rose_disconnect(sk, 0, -1, -1); + lock_sock(sk); + rose_destroy_socket(sk); + break; + + case ROSE_STATE_1: + case ROSE_STATE_3: + case ROSE_STATE_4: + case ROSE_STATE_5: + rose_clear_queues(sk); + rose_stop_idletimer(sk); + rose_write_internal(sk, ROSE_CLEAR_REQUEST); + rose_start_t3timer(sk); + rose->state = ROSE_STATE_2; + sk->sk_state = TCP_CLOSE; + sk->sk_shutdown |= SEND_SHUTDOWN; + sk->sk_state_change(sk); + sock_set_flag(sk, 
SOCK_DEAD); + sock_set_flag(sk, SOCK_DESTROY); + break; + + default: + break; + } + + sock->sk = NULL; + release_sock(sk); + sock_put(sk); + + return 0; +} + +static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct rose_sock *rose = rose_sk(sk); + struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr; + struct net_device *dev; + ax25_address *source; + ax25_uid_assoc *user; + int n; + + if (!sock_flag(sk, SOCK_ZAPPED)) + return -EINVAL; + + if (addr_len != sizeof(struct sockaddr_rose) && addr_len != sizeof(struct full_sockaddr_rose)) + return -EINVAL; + + if (addr->srose_family != AF_ROSE) + return -EINVAL; + + if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1) + return -EINVAL; + + if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS) + return -EINVAL; + + if ((dev = rose_dev_get(&addr->srose_addr)) == NULL) + return -EADDRNOTAVAIL; + + source = &addr->srose_call; + + user = ax25_findbyuid(current_euid()); + if (user) { + rose->source_call = user->call; + ax25_uid_put(user); + } else { + if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) + return -EACCES; + rose->source_call = *source; + } + + rose->source_addr = addr->srose_addr; + rose->device = dev; + rose->source_ndigis = addr->srose_ndigis; + + if (addr_len == sizeof(struct full_sockaddr_rose)) { + struct full_sockaddr_rose *full_addr = (struct full_sockaddr_rose *)uaddr; + for (n = 0 ; n < addr->srose_ndigis ; n++) + rose->source_digis[n] = full_addr->srose_digis[n]; + } else { + if (rose->source_ndigis == 1) { + rose->source_digis[0] = addr->srose_digi; + } + } + + rose_insert_socket(sk); + + sock_reset_flag(sk, SOCK_ZAPPED); + + return 0; +} + +static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) +{ + struct sock *sk = sock->sk; + struct rose_sock *rose = rose_sk(sk); + struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr; + unsigned char cause, diagnostic; + struct net_device *dev; + ax25_uid_assoc *user; + int n, err = 0; + + if (addr_len != sizeof(struct sockaddr_rose) && addr_len != sizeof(struct full_sockaddr_rose)) + return -EINVAL; + + if (addr->srose_family != AF_ROSE) + return -EINVAL; + + if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1) + return -EINVAL; + + if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS) + return -EINVAL; + + /* Source + Destination digis should not exceed ROSE_MAX_DIGIS */ + if ((rose->source_ndigis + addr->srose_ndigis) > ROSE_MAX_DIGIS) + return -EINVAL; + + lock_sock(sk); + + if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { + /* Connect completed during a ERESTARTSYS event */ + sock->state = SS_CONNECTED; + goto out_release; + } + + if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) { + sock->state = SS_UNCONNECTED; + err = -ECONNREFUSED; + goto out_release; + } + + if (sk->sk_state == TCP_ESTABLISHED) { + /* No reconnect on a seqpacket socket */ + err = -EISCONN; + goto out_release; + } + + sk->sk_state = TCP_CLOSE; + sock->state = SS_UNCONNECTED; + + rose->neighbour = rose_get_neigh(&addr->srose_addr, &cause, + &diagnostic, 0); + if (!rose->neighbour) { + err = -ENETUNREACH; + goto out_release; + } + + rose->lci = rose_new_lci(rose->neighbour); + if (!rose->lci) { + err = -ENETUNREACH; + goto out_release; + } + + if (sock_flag(sk, SOCK_ZAPPED)) { /* Must bind first - autobinding in this may or may not work */ + sock_reset_flag(sk, SOCK_ZAPPED); + + if ((dev = rose_dev_first()) == NULL) { + 
err = -ENETUNREACH; + goto out_release; + } + + user = ax25_findbyuid(current_euid()); + if (!user) { + err = -EINVAL; + goto out_release; + } + + memcpy(&rose->source_addr, dev->dev_addr, ROSE_ADDR_LEN); + rose->source_call = user->call; + rose->device = dev; + ax25_uid_put(user); + + rose_insert_socket(sk); /* Finish the bind */ + } + rose->dest_addr = addr->srose_addr; + rose->dest_call = addr->srose_call; + rose->rand = ((long)rose & 0xFFFF) + rose->lci; + rose->dest_ndigis = addr->srose_ndigis; + + if (addr_len == sizeof(struct full_sockaddr_rose)) { + struct full_sockaddr_rose *full_addr = (struct full_sockaddr_rose *)uaddr; + for (n = 0 ; n < addr->srose_ndigis ; n++) + rose->dest_digis[n] = full_addr->srose_digis[n]; + } else { + if (rose->dest_ndigis == 1) { + rose->dest_digis[0] = addr->srose_digi; + } + } + + /* Move to connecting socket, start sending Connect Requests */ + sock->state = SS_CONNECTING; + sk->sk_state = TCP_SYN_SENT; + + rose->state = ROSE_STATE_1; + + rose->neighbour->use++; + + rose_write_internal(sk, ROSE_CALL_REQUEST); + rose_start_heartbeat(sk); + rose_start_t1timer(sk); + + /* Now the loop */ + if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) { + err = -EINPROGRESS; + goto out_release; + } + + /* + * A Connect Ack with Choke or timeout or failed routing will go to + * closed. + */ + if (sk->sk_state == TCP_SYN_SENT) { + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + if (sk->sk_state != TCP_SYN_SENT) + break; + if (!signal_pending(current)) { + release_sock(sk); + schedule(); + lock_sock(sk); + continue; + } + err = -ERESTARTSYS; + break; + } + finish_wait(sk_sleep(sk), &wait); + + if (err) + goto out_release; + } + + if (sk->sk_state != TCP_ESTABLISHED) { + sock->state = SS_UNCONNECTED; + err = sock_error(sk); /* Always set at this point */ + goto out_release; + } + + sock->state = SS_CONNECTED; + +out_release: + release_sock(sk); + + return err; +} + +static int rose_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct sk_buff *skb; + struct sock *newsk; + DEFINE_WAIT(wait); + struct sock *sk; + int err = 0; + + if ((sk = sock->sk) == NULL) + return -EINVAL; + + lock_sock(sk); + if (sk->sk_type != SOCK_SEQPACKET) { + err = -EOPNOTSUPP; + goto out_release; + } + + if (sk->sk_state != TCP_LISTEN) { + err = -EINVAL; + goto out_release; + } + + /* + * The write queue this time is holding sockets ready to use + * hooked into the SABM we saved + */ + for (;;) { + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + + skb = skb_dequeue(&sk->sk_receive_queue); + if (skb) + break; + + if (flags & O_NONBLOCK) { + err = -EWOULDBLOCK; + break; + } + if (!signal_pending(current)) { + release_sock(sk); + schedule(); + lock_sock(sk); + continue; + } + err = -ERESTARTSYS; + break; + } + finish_wait(sk_sleep(sk), &wait); + if (err) + goto out_release; + + newsk = skb->sk; + sock_graft(newsk, newsock); + + /* Now attach up the new socket */ + skb->sk = NULL; + kfree_skb(skb); + sk->sk_ack_backlog--; + +out_release: + release_sock(sk); + + return err; +} + +static int rose_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct full_sockaddr_rose *srose = (struct full_sockaddr_rose *)uaddr; + struct sock *sk = sock->sk; + struct rose_sock *rose = rose_sk(sk); + int n; + + memset(srose, 0, sizeof(*srose)); + if (peer != 0) { + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + srose->srose_family = AF_ROSE; + srose->srose_addr = 
rose->dest_addr; + srose->srose_call = rose->dest_call; + srose->srose_ndigis = rose->dest_ndigis; + for (n = 0; n < rose->dest_ndigis; n++) + srose->srose_digis[n] = rose->dest_digis[n]; + } else { + srose->srose_family = AF_ROSE; + srose->srose_addr = rose->source_addr; + srose->srose_call = rose->source_call; + srose->srose_ndigis = rose->source_ndigis; + for (n = 0; n < rose->source_ndigis; n++) + srose->srose_digis[n] = rose->source_digis[n]; + } + + *uaddr_len = sizeof(struct full_sockaddr_rose); + return 0; +} + +int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct rose_neigh *neigh, unsigned int lci) +{ + struct sock *sk; + struct sock *make; + struct rose_sock *make_rose; + struct rose_facilities_struct facilities; + int n; + + skb->sk = NULL; /* Initially we don't know who it's for */ + + /* + * skb->data points to the rose frame start + */ + memset(&facilities, 0x00, sizeof(struct rose_facilities_struct)); + + if (!rose_parse_facilities(skb->data + ROSE_CALL_REQ_FACILITIES_OFF, + skb->len - ROSE_CALL_REQ_FACILITIES_OFF, + &facilities)) { + rose_transmit_clear_request(neigh, lci, ROSE_INVALID_FACILITY, 76); + return 0; + } + + sk = rose_find_listener(&facilities.source_addr, &facilities.source_call); + + /* + * We can't accept the Call Request. + */ + if (sk == NULL || sk_acceptq_is_full(sk) || + (make = rose_make_new(sk)) == NULL) { + rose_transmit_clear_request(neigh, lci, ROSE_NETWORK_CONGESTION, 120); + return 0; + } + + skb->sk = make; + make->sk_state = TCP_ESTABLISHED; + make_rose = rose_sk(make); + + make_rose->lci = lci; + make_rose->dest_addr = facilities.dest_addr; + make_rose->dest_call = facilities.dest_call; + make_rose->dest_ndigis = facilities.dest_ndigis; + for (n = 0 ; n < facilities.dest_ndigis ; n++) + make_rose->dest_digis[n] = facilities.dest_digis[n]; + make_rose->source_addr = facilities.source_addr; + make_rose->source_call = facilities.source_call; + make_rose->source_ndigis = facilities.source_ndigis; + for (n = 0 ; n < facilities.source_ndigis ; n++) + make_rose->source_digis[n]= facilities.source_digis[n]; + make_rose->neighbour = neigh; + make_rose->device = dev; + make_rose->facilities = facilities; + + make_rose->neighbour->use++; + + if (rose_sk(sk)->defer) { + make_rose->state = ROSE_STATE_5; + } else { + rose_write_internal(make, ROSE_CALL_ACCEPTED); + make_rose->state = ROSE_STATE_3; + rose_start_idletimer(make); + } + + make_rose->condition = 0x00; + make_rose->vs = 0; + make_rose->va = 0; + make_rose->vr = 0; + make_rose->vl = 0; + sk->sk_ack_backlog++; + + rose_insert_socket(make); + + skb_queue_head(&sk->sk_receive_queue, skb); + + rose_start_heartbeat(make); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, skb->len); + + return 1; +} + +static int rose_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct rose_sock *rose = rose_sk(sk); + struct sockaddr_rose *usrose = (struct sockaddr_rose *)msg->msg_name; + int err; + struct full_sockaddr_rose srose; + struct sk_buff *skb; + unsigned char *asmptr; + int n, size, qbit = 0; + + if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT)) + return -EINVAL; + + if (sock_flag(sk, SOCK_ZAPPED)) + return -EADDRNOTAVAIL; + + if (sk->sk_shutdown & SEND_SHUTDOWN) { + send_sig(SIGPIPE, current, 0); + return -EPIPE; + } + + if (rose->neighbour == NULL || rose->device == NULL) + return -ENETUNREACH; + + if (usrose != NULL) { + if (msg->msg_namelen != sizeof(struct sockaddr_rose) && 
msg->msg_namelen != sizeof(struct full_sockaddr_rose)) + return -EINVAL; + memset(&srose, 0, sizeof(struct full_sockaddr_rose)); + memcpy(&srose, usrose, msg->msg_namelen); + if (rosecmp(&rose->dest_addr, &srose.srose_addr) != 0 || + ax25cmp(&rose->dest_call, &srose.srose_call) != 0) + return -EISCONN; + if (srose.srose_ndigis != rose->dest_ndigis) + return -EISCONN; + if (srose.srose_ndigis == rose->dest_ndigis) { + for (n = 0 ; n < srose.srose_ndigis ; n++) + if (ax25cmp(&rose->dest_digis[n], + &srose.srose_digis[n])) + return -EISCONN; + } + if (srose.srose_family != AF_ROSE) + return -EINVAL; + } else { + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + + srose.srose_family = AF_ROSE; + srose.srose_addr = rose->dest_addr; + srose.srose_call = rose->dest_call; + srose.srose_ndigis = rose->dest_ndigis; + for (n = 0 ; n < rose->dest_ndigis ; n++) + srose.srose_digis[n] = rose->dest_digis[n]; + } + + /* Build a packet */ + /* Sanity check the packet size */ + if (len > 65535) + return -EMSGSIZE; + + size = len + AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN; + + if ((skb = sock_alloc_send_skb(sk, size, msg->msg_flags & MSG_DONTWAIT, &err)) == NULL) + return err; + + skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN); + + /* + * Put the data on the end + */ + + skb_reset_transport_header(skb); + skb_put(skb, len); + + err = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len); + if (err) { + kfree_skb(skb); + return err; + } + + /* + * If the Q BIT Include socket option is in force, the first + * byte of the user data is the logical value of the Q Bit. + */ + if (rose->qbitincl) { + qbit = skb->data[0]; + skb_pull(skb, 1); + } + + /* + * Push down the ROSE header + */ + asmptr = skb_push(skb, ROSE_MIN_LEN); + + /* Build a ROSE Network header */ + asmptr[0] = ((rose->lci >> 8) & 0x0F) | ROSE_GFI; + asmptr[1] = (rose->lci >> 0) & 0xFF; + asmptr[2] = ROSE_DATA; + + if (qbit) + asmptr[0] |= ROSE_Q_BIT; + + if (sk->sk_state != TCP_ESTABLISHED) { + kfree_skb(skb); + return -ENOTCONN; + } + +#ifdef M_BIT +#define ROSE_PACLEN (256-ROSE_MIN_LEN) + if (skb->len - ROSE_MIN_LEN > ROSE_PACLEN) { + unsigned char header[ROSE_MIN_LEN]; + struct sk_buff *skbn; + int frontlen; + int lg; + + /* Save a copy of the Header */ + skb_copy_from_linear_data(skb, header, ROSE_MIN_LEN); + skb_pull(skb, ROSE_MIN_LEN); + + frontlen = skb_headroom(skb); + + while (skb->len > 0) { + if ((skbn = sock_alloc_send_skb(sk, frontlen + ROSE_PACLEN, 0, &err)) == NULL) { + kfree_skb(skb); + return err; + } + + skbn->sk = sk; + skbn->free = 1; + skbn->arp = 1; + + skb_reserve(skbn, frontlen); + + lg = (ROSE_PACLEN > skb->len) ? 
skb->len : ROSE_PACLEN; + + /* Copy the user data */ + skb_copy_from_linear_data(skb, skb_put(skbn, lg), lg); + skb_pull(skb, lg); + + /* Duplicate the Header */ + skb_push(skbn, ROSE_MIN_LEN); + skb_copy_to_linear_data(skbn, header, ROSE_MIN_LEN); + + if (skb->len > 0) + skbn->data[2] |= M_BIT; + + skb_queue_tail(&sk->sk_write_queue, skbn); /* Throw it on the queue */ + } + + skb->free = 1; + kfree_skb(skb); + } else { + skb_queue_tail(&sk->sk_write_queue, skb); /* Throw it on the queue */ + } +#else + skb_queue_tail(&sk->sk_write_queue, skb); /* Shove it onto the queue */ +#endif + + rose_kick(sk); + + return len; +} + + +static int rose_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct rose_sock *rose = rose_sk(sk); + struct sockaddr_rose *srose = (struct sockaddr_rose *)msg->msg_name; + size_t copied; + unsigned char *asmptr; + struct sk_buff *skb; + int n, er, qbit; + + /* + * This works for seqpacket too. The receiver has ordered the queue for + * us! We do one quick check first though + */ + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + + /* Now we can treat all alike */ + if ((skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, flags & MSG_DONTWAIT, &er)) == NULL) + return er; + + qbit = (skb->data[0] & ROSE_Q_BIT) == ROSE_Q_BIT; + + skb_pull(skb, ROSE_MIN_LEN); + + if (rose->qbitincl) { + asmptr = skb_push(skb, 1); + *asmptr = qbit; + } + + skb_reset_transport_header(skb); + copied = skb->len; + + if (copied > size) { + copied = size; + msg->msg_flags |= MSG_TRUNC; + } + + skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + + if (srose != NULL) { + srose->srose_family = AF_ROSE; + srose->srose_addr = rose->dest_addr; + srose->srose_call = rose->dest_call; + srose->srose_ndigis = rose->dest_ndigis; + if (msg->msg_namelen >= sizeof(struct full_sockaddr_rose)) { + struct full_sockaddr_rose *full_srose = (struct full_sockaddr_rose *)msg->msg_name; + for (n = 0 ; n < rose->dest_ndigis ; n++) + full_srose->srose_digis[n] = rose->dest_digis[n]; + msg->msg_namelen = sizeof(struct full_sockaddr_rose); + } else { + if (rose->dest_ndigis >= 1) { + srose->srose_ndigis = 1; + srose->srose_digi = rose->dest_digis[0]; + } + msg->msg_namelen = sizeof(struct sockaddr_rose); + } + } + + skb_free_datagram(sk, skb); + + return copied; +} + + +static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + struct rose_sock *rose = rose_sk(sk); + void __user *argp = (void __user *)arg; + + switch (cmd) { + case TIOCOUTQ: { + long amount; + + amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); + if (amount < 0) + amount = 0; + return put_user(amount, (unsigned int __user *) argp); + } + + case TIOCINQ: { + struct sk_buff *skb; + long amount = 0L; + /* These two are safe on a single CPU system as only user tasks fiddle here */ + if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) + amount = skb->len; + return put_user(amount, (unsigned int __user *) argp); + } + + case SIOCGSTAMP: + return sock_get_timestamp(sk, (struct timeval __user *) argp); + + case SIOCGSTAMPNS: + return sock_get_timestampns(sk, (struct timespec __user *) argp); + + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFMETRIC: + case SIOCSIFMETRIC: + return -EINVAL; + + case SIOCADDRT: + case SIOCDELRT: + case SIOCRSCLRRT: + if 
(!capable(CAP_NET_ADMIN)) + return -EPERM; + return rose_rt_ioctl(cmd, argp); + + case SIOCRSGCAUSE: { + struct rose_cause_struct rose_cause; + rose_cause.cause = rose->cause; + rose_cause.diagnostic = rose->diagnostic; + return copy_to_user(argp, &rose_cause, sizeof(struct rose_cause_struct)) ? -EFAULT : 0; + } + + case SIOCRSSCAUSE: { + struct rose_cause_struct rose_cause; + if (copy_from_user(&rose_cause, argp, sizeof(struct rose_cause_struct))) + return -EFAULT; + rose->cause = rose_cause.cause; + rose->diagnostic = rose_cause.diagnostic; + return 0; + } + + case SIOCRSSL2CALL: + if (!capable(CAP_NET_ADMIN)) return -EPERM; + if (ax25cmp(&rose_callsign, &null_ax25_address) != 0) + ax25_listen_release(&rose_callsign, NULL); + if (copy_from_user(&rose_callsign, argp, sizeof(ax25_address))) + return -EFAULT; + if (ax25cmp(&rose_callsign, &null_ax25_address) != 0) + return ax25_listen_register(&rose_callsign, NULL); + + return 0; + + case SIOCRSGL2CALL: + return copy_to_user(argp, &rose_callsign, sizeof(ax25_address)) ? -EFAULT : 0; + + case SIOCRSACCEPT: + if (rose->state == ROSE_STATE_5) { + rose_write_internal(sk, ROSE_CALL_ACCEPTED); + rose_start_idletimer(sk); + rose->condition = 0x00; + rose->vs = 0; + rose->va = 0; + rose->vr = 0; + rose->vl = 0; + rose->state = ROSE_STATE_3; + } + return 0; + + default: + return -ENOIOCTLCMD; + } + + return 0; +} + +#ifdef CONFIG_PROC_FS +static void *rose_info_start(struct seq_file *seq, loff_t *pos) + __acquires(rose_list_lock) +{ + spin_lock_bh(&rose_list_lock); + return seq_hlist_start_head(&rose_list, *pos); +} + +static void *rose_info_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return seq_hlist_next(v, &rose_list, pos); +} + +static void rose_info_stop(struct seq_file *seq, void *v) + __releases(rose_list_lock) +{ + spin_unlock_bh(&rose_list_lock); +} + +static int rose_info_show(struct seq_file *seq, void *v) +{ + char buf[11], rsbuf[11]; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "dest_addr dest_call src_addr src_call dev lci neigh st vs vr va t t1 t2 t3 hb idle Snd-Q Rcv-Q inode\n"); + + else { + struct sock *s = sk_entry(v); + struct rose_sock *rose = rose_sk(s); + const char *devname, *callsign; + const struct net_device *dev = rose->device; + + if (!dev) + devname = "???"; + else + devname = dev->name; + + seq_printf(seq, "%-10s %-9s ", + rose2asc(rsbuf, &rose->dest_addr), + ax2asc(buf, &rose->dest_call)); + + if (ax25cmp(&rose->source_call, &null_ax25_address) == 0) + callsign = "??????-?"; + else + callsign = ax2asc(buf, &rose->source_call); + + seq_printf(seq, + "%-10s %-9s %-5s %3.3X %05d %d %d %d %d %3lu %3lu %3lu %3lu %3lu %3lu/%03lu %5d %5d %ld\n", + rose2asc(rsbuf, &rose->source_addr), + callsign, + devname, + rose->lci & 0x0FFF, + (rose->neighbour) ? rose->neighbour->number : 0, + rose->state, + rose->vs, + rose->vr, + rose->va, + ax25_display_timer(&rose->timer) / HZ, + rose->t1 / HZ, + rose->t2 / HZ, + rose->t3 / HZ, + rose->hb / HZ, + ax25_display_timer(&rose->idletimer) / (60 * HZ), + rose->idle / (60 * HZ), + sk_wmem_alloc_get(s), + sk_rmem_alloc_get(s), + s->sk_socket ? 
SOCK_INODE(s->sk_socket)->i_ino : 0L); + } + + return 0; +} + +static const struct seq_operations rose_info_seqops = { + .start = rose_info_start, + .next = rose_info_next, + .stop = rose_info_stop, + .show = rose_info_show, +}; + +static int rose_info_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rose_info_seqops); +} + +static const struct file_operations rose_info_fops = { + .owner = THIS_MODULE, + .open = rose_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif /* CONFIG_PROC_FS */ + +static const struct net_proto_family rose_family_ops = { + .family = PF_ROSE, + .create = rose_create, + .owner = THIS_MODULE, +}; + +static const struct proto_ops rose_proto_ops = { + .family = PF_ROSE, + .owner = THIS_MODULE, + .release = rose_release, + .bind = rose_bind, + .connect = rose_connect, + .socketpair = sock_no_socketpair, + .accept = rose_accept, + .getname = rose_getname, + .poll = datagram_poll, + .ioctl = rose_ioctl, + .listen = rose_listen, + .shutdown = sock_no_shutdown, + .setsockopt = rose_setsockopt, + .getsockopt = rose_getsockopt, + .sendmsg = rose_sendmsg, + .recvmsg = rose_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct notifier_block rose_dev_notifier = { + .notifier_call = rose_device_event, +}; + +static struct net_device **dev_rose; + +static struct ax25_protocol rose_pid = { + .pid = AX25_P_ROSE, + .func = rose_route_frame +}; + +static struct ax25_linkfail rose_linkfail_notifier = { + .func = rose_link_failed +}; + +static int __init rose_proto_init(void) +{ + int i; + int rc; + + if (rose_ndevs > 0x7FFFFFFF/sizeof(struct net_device *)) { + printk(KERN_ERR "ROSE: rose_proto_init - rose_ndevs parameter to large\n"); + rc = -EINVAL; + goto out; + } + + rc = proto_register(&rose_proto, 0); + if (rc != 0) + goto out; + + rose_callsign = null_ax25_address; + + dev_rose = kzalloc(rose_ndevs * sizeof(struct net_device *), GFP_KERNEL); + if (dev_rose == NULL) { + printk(KERN_ERR "ROSE: rose_proto_init - unable to allocate device structure\n"); + rc = -ENOMEM; + goto out_proto_unregister; + } + + for (i = 0; i < rose_ndevs; i++) { + struct net_device *dev; + char name[IFNAMSIZ]; + + sprintf(name, "rose%d", i); + dev = alloc_netdev(0, name, rose_setup); + if (!dev) { + printk(KERN_ERR "ROSE: rose_proto_init - unable to allocate memory\n"); + rc = -ENOMEM; + goto fail; + } + rc = register_netdev(dev); + if (rc) { + printk(KERN_ERR "ROSE: netdevice registration failed\n"); + free_netdev(dev); + goto fail; + } + rose_set_lockdep_key(dev); + dev_rose[i] = dev; + } + + sock_register(&rose_family_ops); + register_netdevice_notifier(&rose_dev_notifier); + + ax25_register_pid(&rose_pid); + ax25_linkfail_register(&rose_linkfail_notifier); + +#ifdef CONFIG_SYSCTL + rose_register_sysctl(); +#endif + rose_loopback_init(); + + rose_add_loopback_neigh(); + + proc_net_fops_create(&init_net, "rose", S_IRUGO, &rose_info_fops); + proc_net_fops_create(&init_net, "rose_neigh", S_IRUGO, &rose_neigh_fops); + proc_net_fops_create(&init_net, "rose_nodes", S_IRUGO, &rose_nodes_fops); + proc_net_fops_create(&init_net, "rose_routes", S_IRUGO, &rose_routes_fops); +out: + return rc; +fail: + while (--i >= 0) { + unregister_netdev(dev_rose[i]); + free_netdev(dev_rose[i]); + } + kfree(dev_rose); +out_proto_unregister: + proto_unregister(&rose_proto); + goto out; +} +module_init(rose_proto_init); + +module_param(rose_ndevs, int, 0); +MODULE_PARM_DESC(rose_ndevs, "number of ROSE devices"); + 
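rose_proto_init() above registers rose_ndevs network devices and, on any failure, unwinds only the devices it has already registered before unregistering the protocol. A minimal user-space sketch of the same unwind idiom, for illustration only (register_thing() and unregister_thing() are hypothetical stand-ins, not kernel APIs):

    #include <stdio.h>

    static int register_thing(int i)    { return i == 3 ? -1 : 0; }  /* pretend the 4th registration fails */
    static void unregister_thing(int i) { printf("unwound %d\n", i); }

    static int init_all(int n)
    {
        int i, rc = 0;

        for (i = 0; i < n; i++) {
            rc = register_thing(i);
            if (rc)
                goto fail;              /* stop at the first failure */
        }
        return 0;

    fail:
        while (--i >= 0)                /* walk back over what succeeded */
            unregister_thing(i);
        return rc;
    }

    int main(void)
    {
        return init_all(5) ? 1 : 0;
    }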
+MODULE_AUTHOR("Jonathan Naylor G4KLX "); +MODULE_DESCRIPTION("The amateur radio ROSE network layer protocol"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_ROSE); + +static void __exit rose_exit(void) +{ + int i; + + proc_net_remove(&init_net, "rose"); + proc_net_remove(&init_net, "rose_neigh"); + proc_net_remove(&init_net, "rose_nodes"); + proc_net_remove(&init_net, "rose_routes"); + rose_loopback_clear(); + + rose_rt_free(); + + ax25_protocol_release(AX25_P_ROSE); + ax25_linkfail_release(&rose_linkfail_notifier); + + if (ax25cmp(&rose_callsign, &null_ax25_address) != 0) + ax25_listen_release(&rose_callsign, NULL); + +#ifdef CONFIG_SYSCTL + rose_unregister_sysctl(); +#endif + unregister_netdevice_notifier(&rose_dev_notifier); + + sock_unregister(PF_ROSE); + + for (i = 0; i < rose_ndevs; i++) { + struct net_device *dev = dev_rose[i]; + + if (dev) { + unregister_netdev(dev); + free_netdev(dev); + } + } + + kfree(dev_rose); + proto_unregister(&rose_proto); +} + +module_exit(rose_exit); diff --git a/net/rose/rose_dev.c b/net/rose/rose_dev.c new file mode 100644 index 00000000..2679507a --- /dev/null +++ b/net/rose/rose_dev.c @@ -0,0 +1,172 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +static int rose_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, + const void *daddr, const void *saddr, unsigned len) +{ + unsigned char *buff = skb_push(skb, ROSE_MIN_LEN + 2); + + *buff++ = ROSE_GFI | ROSE_Q_BIT; + *buff++ = 0x00; + *buff++ = ROSE_DATA; + *buff++ = 0x7F; + *buff++ = AX25_P_IP; + + if (daddr != NULL) + return 37; + + return -37; +} + +static int rose_rebuild_header(struct sk_buff *skb) +{ +#ifdef CONFIG_INET + struct net_device *dev = skb->dev; + struct net_device_stats *stats = &dev->stats; + unsigned char *bp = (unsigned char *)skb->data; + struct sk_buff *skbn; + unsigned int len; + + if (arp_find(bp + 7, skb)) { + return 1; + } + + if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { + kfree_skb(skb); + return 1; + } + + if (skb->sk != NULL) + skb_set_owner_w(skbn, skb->sk); + + kfree_skb(skb); + + len = skbn->len; + + if (!rose_route_frame(skbn, NULL)) { + kfree_skb(skbn); + stats->tx_errors++; + return 1; + } + + stats->tx_packets++; + stats->tx_bytes += len; +#endif + return 1; +} + +static int rose_set_mac_address(struct net_device *dev, void *addr) +{ + struct sockaddr *sa = addr; + int err; + + if (!memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) + return 0; + + if (dev->flags & IFF_UP) { + err = rose_add_loopback_node((rose_address *)sa->sa_data); + if (err) + return err; + + rose_del_loopback_node((rose_address *)dev->dev_addr); + } + + memcpy(dev->dev_addr, sa->sa_data, dev->addr_len); + + return 0; +} + +static int rose_open(struct net_device *dev) +{ + int err; + + err = rose_add_loopback_node((rose_address *)dev->dev_addr); + if (err) + return err; + + netif_start_queue(dev); + + return 0; +} + +static int rose_close(struct net_device *dev) +{ + netif_stop_queue(dev); + 
rose_del_loopback_node((rose_address *)dev->dev_addr); + return 0; +} + +static netdev_tx_t rose_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct net_device_stats *stats = &dev->stats; + + if (!netif_running(dev)) { + printk(KERN_ERR "ROSE: rose_xmit - called when iface is down\n"); + return NETDEV_TX_BUSY; + } + dev_kfree_skb(skb); + stats->tx_errors++; + return NETDEV_TX_OK; +} + +static const struct header_ops rose_header_ops = { + .create = rose_header, + .rebuild= rose_rebuild_header, +}; + +static const struct net_device_ops rose_netdev_ops = { + .ndo_open = rose_open, + .ndo_stop = rose_close, + .ndo_start_xmit = rose_xmit, + .ndo_set_mac_address = rose_set_mac_address, +}; + +void rose_setup(struct net_device *dev) +{ + dev->mtu = ROSE_MAX_PACKET_SIZE - 2; + dev->netdev_ops = &rose_netdev_ops; + + dev->header_ops = &rose_header_ops; + dev->hard_header_len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN; + dev->addr_len = ROSE_ADDR_LEN; + dev->type = ARPHRD_ROSE; + + /* New-style flags. */ + dev->flags = IFF_NOARP; +} diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c new file mode 100644 index 00000000..7f7fcb46 --- /dev/null +++ b/net/rose/rose_in.c @@ -0,0 +1,295 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * + * Most of this code is based on the SDL diagrams published in the 7th ARRL + * Computer Networking Conference papers. The diagrams have mistakes in them, + * but are mostly correct. Before you modify the code could you read the SDL + * diagrams as the code is not obvious and probably very easy to break. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * State machine for state 1, Awaiting Call Accepted State. + * The handling of the timer(s) is in file rose_timer.c. + * Handling of state 0 and connection release is in af_rose.c. + */ +static int rose_state1_machine(struct sock *sk, struct sk_buff *skb, int frametype) +{ + struct rose_sock *rose = rose_sk(sk); + + switch (frametype) { + case ROSE_CALL_ACCEPTED: + rose_stop_timer(sk); + rose_start_idletimer(sk); + rose->condition = 0x00; + rose->vs = 0; + rose->va = 0; + rose->vr = 0; + rose->vl = 0; + rose->state = ROSE_STATE_3; + sk->sk_state = TCP_ESTABLISHED; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + break; + + case ROSE_CLEAR_REQUEST: + rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); + rose_disconnect(sk, ECONNREFUSED, skb->data[3], skb->data[4]); + rose->neighbour->use--; + break; + + default: + break; + } + + return 0; +} + +/* + * State machine for state 2, Awaiting Clear Confirmation State. + * The handling of the timer(s) is in file rose_timer.c + * Handling of state 0 and connection release is in af_rose.c. 
+ */ +static int rose_state2_machine(struct sock *sk, struct sk_buff *skb, int frametype) +{ + struct rose_sock *rose = rose_sk(sk); + + switch (frametype) { + case ROSE_CLEAR_REQUEST: + rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); + rose_disconnect(sk, 0, skb->data[3], skb->data[4]); + rose->neighbour->use--; + break; + + case ROSE_CLEAR_CONFIRMATION: + rose_disconnect(sk, 0, -1, -1); + rose->neighbour->use--; + break; + + default: + break; + } + + return 0; +} + +/* + * State machine for state 3, Connected State. + * The handling of the timer(s) is in file rose_timer.c + * Handling of state 0 and connection release is in af_rose.c. + */ +static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype, int ns, int nr, int q, int d, int m) +{ + struct rose_sock *rose = rose_sk(sk); + int queued = 0; + + switch (frametype) { + case ROSE_RESET_REQUEST: + rose_stop_timer(sk); + rose_start_idletimer(sk); + rose_write_internal(sk, ROSE_RESET_CONFIRMATION); + rose->condition = 0x00; + rose->vs = 0; + rose->vr = 0; + rose->va = 0; + rose->vl = 0; + rose_requeue_frames(sk); + break; + + case ROSE_CLEAR_REQUEST: + rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); + rose_disconnect(sk, 0, skb->data[3], skb->data[4]); + rose->neighbour->use--; + break; + + case ROSE_RR: + case ROSE_RNR: + if (!rose_validate_nr(sk, nr)) { + rose_write_internal(sk, ROSE_RESET_REQUEST); + rose->condition = 0x00; + rose->vs = 0; + rose->vr = 0; + rose->va = 0; + rose->vl = 0; + rose->state = ROSE_STATE_4; + rose_start_t2timer(sk); + rose_stop_idletimer(sk); + } else { + rose_frames_acked(sk, nr); + if (frametype == ROSE_RNR) { + rose->condition |= ROSE_COND_PEER_RX_BUSY; + } else { + rose->condition &= ~ROSE_COND_PEER_RX_BUSY; + } + } + break; + + case ROSE_DATA: /* XXX */ + rose->condition &= ~ROSE_COND_PEER_RX_BUSY; + if (!rose_validate_nr(sk, nr)) { + rose_write_internal(sk, ROSE_RESET_REQUEST); + rose->condition = 0x00; + rose->vs = 0; + rose->vr = 0; + rose->va = 0; + rose->vl = 0; + rose->state = ROSE_STATE_4; + rose_start_t2timer(sk); + rose_stop_idletimer(sk); + break; + } + rose_frames_acked(sk, nr); + if (ns == rose->vr) { + rose_start_idletimer(sk); + if (sock_queue_rcv_skb(sk, skb) == 0) { + rose->vr = (rose->vr + 1) % ROSE_MODULUS; + queued = 1; + } else { + /* Should never happen ! */ + rose_write_internal(sk, ROSE_RESET_REQUEST); + rose->condition = 0x00; + rose->vs = 0; + rose->vr = 0; + rose->va = 0; + rose->vl = 0; + rose->state = ROSE_STATE_4; + rose_start_t2timer(sk); + rose_stop_idletimer(sk); + break; + } + if (atomic_read(&sk->sk_rmem_alloc) > + (sk->sk_rcvbuf >> 1)) + rose->condition |= ROSE_COND_OWN_RX_BUSY; + } + /* + * If the window is full, ack the frame, else start the + * acknowledge hold back timer. + */ + if (((rose->vl + sysctl_rose_window_size) % ROSE_MODULUS) == rose->vr) { + rose->condition &= ~ROSE_COND_ACK_PENDING; + rose_stop_timer(sk); + rose_enquiry_response(sk); + } else { + rose->condition |= ROSE_COND_ACK_PENDING; + rose_start_hbtimer(sk); + } + break; + + default: + printk(KERN_WARNING "ROSE: unknown %02X in state 3\n", frametype); + break; + } + + return queued; +} + +/* + * State machine for state 4, Awaiting Reset Confirmation State. + * The handling of the timer(s) is in file rose_timer.c + * Handling of state 0 and connection release is in af_rose.c. 
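 *
 * An aside on the window handling in rose_state3_machine() above: all
 * sequence numbers are kept modulo ROSE_MODULUS, and an acknowledgement is
 * sent immediately once the receive window closes, otherwise the hold-back
 * timer is started.  A minimal sketch of the same test, with the window
 * size passed in instead of read from sysctl_rose_window_size:
 *
 *	static int window_full(unsigned int vl, unsigned int vr,
 *			       unsigned int window, unsigned int modulus)
 *	{
 *		// the window is full when the lower edge plus the window
 *		// size has wrapped around to the next expected frame
 *		return ((vl + window) % modulus) == vr;
 *	}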
+ */ +static int rose_state4_machine(struct sock *sk, struct sk_buff *skb, int frametype) +{ + struct rose_sock *rose = rose_sk(sk); + + switch (frametype) { + case ROSE_RESET_REQUEST: + rose_write_internal(sk, ROSE_RESET_CONFIRMATION); + case ROSE_RESET_CONFIRMATION: + rose_stop_timer(sk); + rose_start_idletimer(sk); + rose->condition = 0x00; + rose->va = 0; + rose->vr = 0; + rose->vs = 0; + rose->vl = 0; + rose->state = ROSE_STATE_3; + rose_requeue_frames(sk); + break; + + case ROSE_CLEAR_REQUEST: + rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); + rose_disconnect(sk, 0, skb->data[3], skb->data[4]); + rose->neighbour->use--; + break; + + default: + break; + } + + return 0; +} + +/* + * State machine for state 5, Awaiting Call Acceptance State. + * The handling of the timer(s) is in file rose_timer.c + * Handling of state 0 and connection release is in af_rose.c. + */ +static int rose_state5_machine(struct sock *sk, struct sk_buff *skb, int frametype) +{ + if (frametype == ROSE_CLEAR_REQUEST) { + rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); + rose_disconnect(sk, 0, skb->data[3], skb->data[4]); + rose_sk(sk)->neighbour->use--; + } + + return 0; +} + +/* Higher level upcall for a LAPB frame */ +int rose_process_rx_frame(struct sock *sk, struct sk_buff *skb) +{ + struct rose_sock *rose = rose_sk(sk); + int queued = 0, frametype, ns, nr, q, d, m; + + if (rose->state == ROSE_STATE_0) + return 0; + + frametype = rose_decode(skb, &ns, &nr, &q, &d, &m); + + switch (rose->state) { + case ROSE_STATE_1: + queued = rose_state1_machine(sk, skb, frametype); + break; + case ROSE_STATE_2: + queued = rose_state2_machine(sk, skb, frametype); + break; + case ROSE_STATE_3: + queued = rose_state3_machine(sk, skb, frametype, ns, nr, q, d, m); + break; + case ROSE_STATE_4: + queued = rose_state4_machine(sk, skb, frametype); + break; + case ROSE_STATE_5: + queued = rose_state5_machine(sk, skb, frametype); + break; + } + + rose_kick(sk); + + return queued; +} diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c new file mode 100644 index 00000000..fa5f5641 --- /dev/null +++ b/net/rose/rose_link.c @@ -0,0 +1,299 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void rose_ftimer_expiry(unsigned long); +static void rose_t0timer_expiry(unsigned long); + +static void rose_transmit_restart_confirmation(struct rose_neigh *neigh); +static void rose_transmit_restart_request(struct rose_neigh *neigh); + +void rose_start_ftimer(struct rose_neigh *neigh) +{ + del_timer(&neigh->ftimer); + + neigh->ftimer.data = (unsigned long)neigh; + neigh->ftimer.function = &rose_ftimer_expiry; + neigh->ftimer.expires = + jiffies + msecs_to_jiffies(sysctl_rose_link_fail_timeout); + + add_timer(&neigh->ftimer); +} + +static void rose_start_t0timer(struct rose_neigh *neigh) +{ + del_timer(&neigh->t0timer); + + neigh->t0timer.data = (unsigned long)neigh; + neigh->t0timer.function = &rose_t0timer_expiry; + neigh->t0timer.expires = + jiffies + msecs_to_jiffies(sysctl_rose_restart_request_timeout); + + add_timer(&neigh->t0timer); +} + +void rose_stop_ftimer(struct rose_neigh *neigh) +{ + del_timer(&neigh->ftimer); +} + +void rose_stop_t0timer(struct rose_neigh *neigh) +{ + del_timer(&neigh->t0timer); +} + +int rose_ftimer_running(struct rose_neigh *neigh) +{ + return timer_pending(&neigh->ftimer); +} + +static int rose_t0timer_running(struct rose_neigh *neigh) +{ + return timer_pending(&neigh->t0timer); +} + +static void rose_ftimer_expiry(unsigned long param) +{ +} + +static void rose_t0timer_expiry(unsigned long param) +{ + struct rose_neigh *neigh = (struct rose_neigh *)param; + + rose_transmit_restart_request(neigh); + + neigh->dce_mode = 0; + + rose_start_t0timer(neigh); +} + +/* + * Interface to ax25_send_frame. Changes my level 2 callsign depending + * on whether we have a global ROSE callsign or use the default port + * callsign. + */ +static int rose_send_frame(struct sk_buff *skb, struct rose_neigh *neigh) +{ + ax25_address *rose_call; + ax25_cb *ax25s; + + if (ax25cmp(&rose_callsign, &null_ax25_address) == 0) + rose_call = (ax25_address *)neigh->dev->dev_addr; + else + rose_call = &rose_callsign; + + ax25s = neigh->ax25; + neigh->ax25 = ax25_send_frame(skb, 260, rose_call, &neigh->callsign, neigh->digipeat, neigh->dev); + if (ax25s) + ax25_cb_put(ax25s); + + return neigh->ax25 != NULL; +} + +/* + * Interface to ax25_link_up. Changes my level 2 callsign depending + * on whether we have a global ROSE callsign or use the default port + * callsign. + */ +static int rose_link_up(struct rose_neigh *neigh) +{ + ax25_address *rose_call; + ax25_cb *ax25s; + + if (ax25cmp(&rose_callsign, &null_ax25_address) == 0) + rose_call = (ax25_address *)neigh->dev->dev_addr; + else + rose_call = &rose_callsign; + + ax25s = neigh->ax25; + neigh->ax25 = ax25_find_cb(rose_call, &neigh->callsign, neigh->digipeat, neigh->dev); + if (ax25s) + ax25_cb_put(ax25s); + + return neigh->ax25 != NULL; +} + +/* + * This handles all restart and diagnostic frames. 
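 *
 * Until a Restart Confirmation (or Request) arrives and sets ->restarted,
 * rose_t0timer_expiry() above keeps retransmitting the Restart Request and
 * re-arming T0.  All the timers in this file use the classic
 * data/function/expires idiom; re-arming one boils down to the sketch
 * below (taken from rose_start_t0timer() above):
 *
 *	del_timer(&neigh->t0timer);		// stop it if already pending
 *	neigh->t0timer.data     = (unsigned long)neigh;
 *	neigh->t0timer.function = &rose_t0timer_expiry;
 *	neigh->t0timer.expires  = jiffies +
 *		msecs_to_jiffies(sysctl_rose_restart_request_timeout);
 *	add_timer(&neigh->t0timer);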
+ */ +void rose_link_rx_restart(struct sk_buff *skb, struct rose_neigh *neigh, unsigned short frametype) +{ + struct sk_buff *skbn; + + switch (frametype) { + case ROSE_RESTART_REQUEST: + rose_stop_t0timer(neigh); + neigh->restarted = 1; + neigh->dce_mode = (skb->data[3] == ROSE_DTE_ORIGINATED); + rose_transmit_restart_confirmation(neigh); + break; + + case ROSE_RESTART_CONFIRMATION: + rose_stop_t0timer(neigh); + neigh->restarted = 1; + break; + + case ROSE_DIAGNOSTIC: + printk(KERN_WARNING "ROSE: received diagnostic #%d - %02X %02X %02X\n", skb->data[3], skb->data[4], skb->data[5], skb->data[6]); + break; + + default: + printk(KERN_WARNING "ROSE: received unknown %02X with LCI 000\n", frametype); + break; + } + + if (neigh->restarted) { + while ((skbn = skb_dequeue(&neigh->queue)) != NULL) + if (!rose_send_frame(skbn, neigh)) + kfree_skb(skbn); + } +} + +/* + * This routine is called when a Restart Request is needed + */ +static void rose_transmit_restart_request(struct rose_neigh *neigh) +{ + struct sk_buff *skb; + unsigned char *dptr; + int len; + + len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN + 3; + + if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) + return; + + skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN); + + dptr = skb_put(skb, ROSE_MIN_LEN + 3); + + *dptr++ = AX25_P_ROSE; + *dptr++ = ROSE_GFI; + *dptr++ = 0x00; + *dptr++ = ROSE_RESTART_REQUEST; + *dptr++ = ROSE_DTE_ORIGINATED; + *dptr++ = 0; + + if (!rose_send_frame(skb, neigh)) + kfree_skb(skb); +} + +/* + * This routine is called when a Restart Confirmation is needed + */ +static void rose_transmit_restart_confirmation(struct rose_neigh *neigh) +{ + struct sk_buff *skb; + unsigned char *dptr; + int len; + + len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN + 1; + + if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) + return; + + skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN); + + dptr = skb_put(skb, ROSE_MIN_LEN + 1); + + *dptr++ = AX25_P_ROSE; + *dptr++ = ROSE_GFI; + *dptr++ = 0x00; + *dptr++ = ROSE_RESTART_CONFIRMATION; + + if (!rose_send_frame(skb, neigh)) + kfree_skb(skb); +} + +/* + * This routine is called when a Clear Request is needed outside of the context + * of a connected socket. 
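 *
 * The control packets built in this file share one prefix: an AX.25 PID
 * byte (AX25_P_ROSE), then the GFI nibble combined with the top four bits
 * of the 12-bit LCI, then the low eight LCI bits, then the packet type.
 * The restart packets above simply use LCI 0.  A sketch of the encoding
 * used by rose_transmit_clear_request() below:
 *
 *	dptr[0] = AX25_P_ROSE;
 *	dptr[1] = ((lci >> 8) & 0x0F) | ROSE_GFI;	// LCI bits 11..8 plus GFI
 *	dptr[2] = (lci >> 0) & 0xFF;			// LCI bits 7..0
 *	dptr[3] = ROSE_CLEAR_REQUEST;			// packet type
 *	dptr[4] = cause;
 *	dptr[5] = diagnostic;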
+ */ +void rose_transmit_clear_request(struct rose_neigh *neigh, unsigned int lci, unsigned char cause, unsigned char diagnostic) +{ + struct sk_buff *skb; + unsigned char *dptr; + int len; + + len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN + 3; + + if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) + return; + + skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN); + + dptr = skb_put(skb, ROSE_MIN_LEN + 3); + + *dptr++ = AX25_P_ROSE; + *dptr++ = ((lci >> 8) & 0x0F) | ROSE_GFI; + *dptr++ = ((lci >> 0) & 0xFF); + *dptr++ = ROSE_CLEAR_REQUEST; + *dptr++ = cause; + *dptr++ = diagnostic; + + if (!rose_send_frame(skb, neigh)) + kfree_skb(skb); +} + +void rose_transmit_link(struct sk_buff *skb, struct rose_neigh *neigh) +{ + unsigned char *dptr; + +#if 0 + if (call_fw_firewall(PF_ROSE, skb->dev, skb->data, NULL, &skb) != FW_ACCEPT) { + kfree_skb(skb); + return; + } +#endif + + if (neigh->loopback) { + rose_loopback_queue(skb, neigh); + return; + } + + if (!rose_link_up(neigh)) + neigh->restarted = 0; + + dptr = skb_push(skb, 1); + *dptr++ = AX25_P_ROSE; + + if (neigh->restarted) { + if (!rose_send_frame(skb, neigh)) + kfree_skb(skb); + } else { + skb_queue_tail(&neigh->queue, skb); + + if (!rose_t0timer_running(neigh)) { + rose_transmit_restart_request(neigh); + neigh->dce_mode = 0; + rose_start_t0timer(neigh); + } + } +} diff --git a/net/rose/rose_loopback.c b/net/rose/rose_loopback.c new file mode 100644 index 00000000..34445620 --- /dev/null +++ b/net/rose/rose_loopback.c @@ -0,0 +1,124 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include + +static struct sk_buff_head loopback_queue; +static struct timer_list loopback_timer; + +static void rose_set_loopback_timer(void); + +void rose_loopback_init(void) +{ + skb_queue_head_init(&loopback_queue); + + init_timer(&loopback_timer); +} + +static int rose_loopback_running(void) +{ + return timer_pending(&loopback_timer); +} + +int rose_loopback_queue(struct sk_buff *skb, struct rose_neigh *neigh) +{ + struct sk_buff *skbn; + + skbn = skb_clone(skb, GFP_ATOMIC); + + kfree_skb(skb); + + if (skbn != NULL) { + skb_queue_tail(&loopback_queue, skbn); + + if (!rose_loopback_running()) + rose_set_loopback_timer(); + } + + return 1; +} + +static void rose_loopback_timer(unsigned long); + +static void rose_set_loopback_timer(void) +{ + del_timer(&loopback_timer); + + loopback_timer.data = 0; + loopback_timer.function = &rose_loopback_timer; + loopback_timer.expires = jiffies + 10; + + add_timer(&loopback_timer); +} + +static void rose_loopback_timer(unsigned long param) +{ + struct sk_buff *skb; + struct net_device *dev; + rose_address *dest; + struct sock *sk; + unsigned short frametype; + unsigned int lci_i, lci_o; + + while ((skb = skb_dequeue(&loopback_queue)) != NULL) { + if (skb->len < ROSE_MIN_LEN) { + kfree_skb(skb); + continue; + } + lci_i = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF); + frametype = skb->data[2]; + if (frametype == ROSE_CALL_REQUEST && + (skb->len <= ROSE_CALL_REQ_FACILITIES_OFF || + skb->data[ROSE_CALL_REQ_ADDR_LEN_OFF] != + ROSE_CALL_REQ_ADDR_LEN_VAL)) { + kfree_skb(skb); + continue; + } + dest = (rose_address *)(skb->data + ROSE_CALL_REQ_DEST_ADDR_OFF); + lci_o = ROSE_DEFAULT_MAXVC + 1 - lci_i; + + skb_reset_transport_header(skb); + + sk = rose_find_socket(lci_o, rose_loopback_neigh); + if (sk) { + if (rose_process_rx_frame(sk, skb) == 0) + kfree_skb(skb); + continue; + } + + if (frametype == ROSE_CALL_REQUEST) { + if ((dev = rose_dev_get(dest)) != NULL) { + if (rose_rx_call_request(skb, dev, rose_loopback_neigh, lci_o) == 0) + kfree_skb(skb); + } else { + kfree_skb(skb); + } + } else { + kfree_skb(skb); + } + } +} + +void __exit rose_loopback_clear(void) +{ + struct sk_buff *skb; + + del_timer(&loopback_timer); + + while ((skb = skb_dequeue(&loopback_queue)) != NULL) { + skb->sk = NULL; + kfree_skb(skb); + } +} diff --git a/net/rose/rose_out.c b/net/rose/rose_out.c new file mode 100644 index 00000000..4ebf33af --- /dev/null +++ b/net/rose/rose_out.c @@ -0,0 +1,126 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This procedure is passed a buffer descriptor for an iframe. It builds + * the rest of the control part of the frame and then writes it out. 
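 *
 * "The rest of the control part" is the third header byte: af_rose.c has
 * already written the two GFI/LCI bytes and the ROSE_DATA type, and
 * rose_send_iframe() below folds the current sequence counts into that
 * same byte, N(R) into bits 7..5 and N(S) into bits 3..1 (the M_BIT used
 * by the conditionally compiled fragmentation code in af_rose.c lands in
 * this byte too).  A minimal sketch of the packing, with an illustrative
 * helper name:
 *
 *	static unsigned char rose_pack_counts(unsigned char type_byte,
 *					      unsigned int vr, unsigned int vs)
 *	{
 *		type_byte |= (vr << 5) & 0xE0;	// N(R)
 *		type_byte |= (vs << 1) & 0x0E;	// N(S)
 *		return type_byte;
 *	}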
+ */ +static void rose_send_iframe(struct sock *sk, struct sk_buff *skb) +{ + struct rose_sock *rose = rose_sk(sk); + + if (skb == NULL) + return; + + skb->data[2] |= (rose->vr << 5) & 0xE0; + skb->data[2] |= (rose->vs << 1) & 0x0E; + + rose_start_idletimer(sk); + + rose_transmit_link(skb, rose->neighbour); +} + +void rose_kick(struct sock *sk) +{ + struct rose_sock *rose = rose_sk(sk); + struct sk_buff *skb, *skbn; + unsigned short start, end; + + if (rose->state != ROSE_STATE_3) + return; + + if (rose->condition & ROSE_COND_PEER_RX_BUSY) + return; + + if (!skb_peek(&sk->sk_write_queue)) + return; + + start = (skb_peek(&rose->ack_queue) == NULL) ? rose->va : rose->vs; + end = (rose->va + sysctl_rose_window_size) % ROSE_MODULUS; + + if (start == end) + return; + + rose->vs = start; + + /* + * Transmit data until either we're out of data to send or + * the window is full. + */ + + skb = skb_dequeue(&sk->sk_write_queue); + + do { + if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { + skb_queue_head(&sk->sk_write_queue, skb); + break; + } + + skb_set_owner_w(skbn, sk); + + /* + * Transmit the frame copy. + */ + rose_send_iframe(sk, skbn); + + rose->vs = (rose->vs + 1) % ROSE_MODULUS; + + /* + * Requeue the original data frame. + */ + skb_queue_tail(&rose->ack_queue, skb); + + } while (rose->vs != end && + (skb = skb_dequeue(&sk->sk_write_queue)) != NULL); + + rose->vl = rose->vr; + rose->condition &= ~ROSE_COND_ACK_PENDING; + + rose_stop_timer(sk); +} + +/* + * The following routines are taken from page 170 of the 7th ARRL Computer + * Networking Conference paper, as is the whole state machine. + */ + +void rose_enquiry_response(struct sock *sk) +{ + struct rose_sock *rose = rose_sk(sk); + + if (rose->condition & ROSE_COND_OWN_RX_BUSY) + rose_write_internal(sk, ROSE_RNR); + else + rose_write_internal(sk, ROSE_RR); + + rose->vl = rose->vr; + rose->condition &= ~ROSE_COND_ACK_PENDING; + + rose_stop_timer(sk); +} diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c new file mode 100644 index 00000000..479cae57 --- /dev/null +++ b/net/rose/rose_route.c @@ -0,0 +1,1371 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Terry Dawson VK2KTJ (terry@animats.net) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For TIOCINQ/OUTQ */ +#include +#include +#include +#include +#include +#include +#include + +static unsigned int rose_neigh_no = 1; + +static struct rose_node *rose_node_list; +static DEFINE_SPINLOCK(rose_node_list_lock); +static struct rose_neigh *rose_neigh_list; +static DEFINE_SPINLOCK(rose_neigh_list_lock); +static struct rose_route *rose_route_list; +static DEFINE_SPINLOCK(rose_route_list_lock); + +struct rose_neigh *rose_loopback_neigh; + +/* + * Add a new route to a node, and in the process add the node and the + * neighbour if it is new. 
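 *
 * The node list is kept sorted by descending mask length, so the first
 * rosecmpm() match found by a later search is automatically the most
 * specific (longest-prefix) route, as the comment inside the function
 * notes.  A sketch of the insertion rule used below, with generic
 * prev/cur names standing in for rose_tmpp/rose_tmpn:
 *
 *	// walk past every node whose mask is longer than the new one,
 *	// then splice the new node in just before the first shorter mask
 *	while (cur != NULL && cur->mask > new_mask) {
 *		prev = cur;
 *		cur  = cur->next;
 *	}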
+ */ +static int __must_check rose_add_node(struct rose_route_struct *rose_route, + struct net_device *dev) +{ + struct rose_node *rose_node, *rose_tmpn, *rose_tmpp; + struct rose_neigh *rose_neigh; + int i, res = 0; + + spin_lock_bh(&rose_node_list_lock); + spin_lock_bh(&rose_neigh_list_lock); + + rose_node = rose_node_list; + while (rose_node != NULL) { + if ((rose_node->mask == rose_route->mask) && + (rosecmpm(&rose_route->address, &rose_node->address, + rose_route->mask) == 0)) + break; + rose_node = rose_node->next; + } + + if (rose_node != NULL && rose_node->loopback) { + res = -EINVAL; + goto out; + } + + rose_neigh = rose_neigh_list; + while (rose_neigh != NULL) { + if (ax25cmp(&rose_route->neighbour, + &rose_neigh->callsign) == 0 && + rose_neigh->dev == dev) + break; + rose_neigh = rose_neigh->next; + } + + if (rose_neigh == NULL) { + rose_neigh = kmalloc(sizeof(*rose_neigh), GFP_ATOMIC); + if (rose_neigh == NULL) { + res = -ENOMEM; + goto out; + } + + rose_neigh->callsign = rose_route->neighbour; + rose_neigh->digipeat = NULL; + rose_neigh->ax25 = NULL; + rose_neigh->dev = dev; + rose_neigh->count = 0; + rose_neigh->use = 0; + rose_neigh->dce_mode = 0; + rose_neigh->loopback = 0; + rose_neigh->number = rose_neigh_no++; + rose_neigh->restarted = 0; + + skb_queue_head_init(&rose_neigh->queue); + + init_timer(&rose_neigh->ftimer); + init_timer(&rose_neigh->t0timer); + + if (rose_route->ndigis != 0) { + rose_neigh->digipeat = + kmalloc(sizeof(ax25_digi), GFP_ATOMIC); + if (rose_neigh->digipeat == NULL) { + kfree(rose_neigh); + res = -ENOMEM; + goto out; + } + + rose_neigh->digipeat->ndigi = rose_route->ndigis; + rose_neigh->digipeat->lastrepeat = -1; + + for (i = 0; i < rose_route->ndigis; i++) { + rose_neigh->digipeat->calls[i] = + rose_route->digipeaters[i]; + rose_neigh->digipeat->repeated[i] = 0; + } + } + + rose_neigh->next = rose_neigh_list; + rose_neigh_list = rose_neigh; + } + + /* + * This is a new node to be inserted into the list. Find where it needs + * to be inserted into the list, and insert it. We want to be sure + * to order the list in descending order of mask size to ensure that + * later when we are searching this list the first match will be the + * best match. + */ + if (rose_node == NULL) { + rose_tmpn = rose_node_list; + rose_tmpp = NULL; + + while (rose_tmpn != NULL) { + if (rose_tmpn->mask > rose_route->mask) { + rose_tmpp = rose_tmpn; + rose_tmpn = rose_tmpn->next; + } else { + break; + } + } + + /* create new node */ + rose_node = kmalloc(sizeof(*rose_node), GFP_ATOMIC); + if (rose_node == NULL) { + res = -ENOMEM; + goto out; + } + + rose_node->address = rose_route->address; + rose_node->mask = rose_route->mask; + rose_node->count = 1; + rose_node->loopback = 0; + rose_node->neighbour[0] = rose_neigh; + + if (rose_tmpn == NULL) { + if (rose_tmpp == NULL) { /* Empty list */ + rose_node_list = rose_node; + rose_node->next = NULL; + } else { + rose_tmpp->next = rose_node; + rose_node->next = NULL; + } + } else { + if (rose_tmpp == NULL) { /* 1st node */ + rose_node->next = rose_node_list; + rose_node_list = rose_node; + } else { + rose_tmpp->next = rose_node; + rose_node->next = rose_tmpn; + } + } + rose_neigh->count++; + + goto out; + } + + /* We have space, slot it in */ + if (rose_node->count < 3) { + rose_node->neighbour[rose_node->count] = rose_neigh; + rose_node->count++; + rose_neigh->count++; + } + +out: + spin_unlock_bh(&rose_neigh_list_lock); + spin_unlock_bh(&rose_node_list_lock); + + return res; +} + +/* + * Caller is holding rose_node_list_lock. 
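 *
 * rose_remove_node(), rose_remove_neigh() and rose_remove_route() below
 * all unlink from their singly linked lists the same way: special-case the
 * head, otherwise walk until s->next is the victim.  A generic sketch of
 * the pattern (head and victim are illustrative names):
 *
 *	if (*head == victim) {
 *		*head = victim->next;
 *	} else {
 *		for (s = *head; s != NULL && s->next != NULL; s = s->next) {
 *			if (s->next == victim) {
 *				s->next = victim->next;
 *				break;
 *			}
 *		}
 *	}
 *	kfree(victim);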
+ */ +static void rose_remove_node(struct rose_node *rose_node) +{ + struct rose_node *s; + + if ((s = rose_node_list) == rose_node) { + rose_node_list = rose_node->next; + kfree(rose_node); + return; + } + + while (s != NULL && s->next != NULL) { + if (s->next == rose_node) { + s->next = rose_node->next; + kfree(rose_node); + return; + } + + s = s->next; + } +} + +/* + * Caller is holding rose_neigh_list_lock. + */ +static void rose_remove_neigh(struct rose_neigh *rose_neigh) +{ + struct rose_neigh *s; + + rose_stop_ftimer(rose_neigh); + rose_stop_t0timer(rose_neigh); + + skb_queue_purge(&rose_neigh->queue); + + if ((s = rose_neigh_list) == rose_neigh) { + rose_neigh_list = rose_neigh->next; + if (rose_neigh->ax25) + ax25_cb_put(rose_neigh->ax25); + kfree(rose_neigh->digipeat); + kfree(rose_neigh); + return; + } + + while (s != NULL && s->next != NULL) { + if (s->next == rose_neigh) { + s->next = rose_neigh->next; + if (rose_neigh->ax25) + ax25_cb_put(rose_neigh->ax25); + kfree(rose_neigh->digipeat); + kfree(rose_neigh); + return; + } + + s = s->next; + } +} + +/* + * Caller is holding rose_route_list_lock. + */ +static void rose_remove_route(struct rose_route *rose_route) +{ + struct rose_route *s; + + if (rose_route->neigh1 != NULL) + rose_route->neigh1->use--; + + if (rose_route->neigh2 != NULL) + rose_route->neigh2->use--; + + if ((s = rose_route_list) == rose_route) { + rose_route_list = rose_route->next; + kfree(rose_route); + return; + } + + while (s != NULL && s->next != NULL) { + if (s->next == rose_route) { + s->next = rose_route->next; + kfree(rose_route); + return; + } + + s = s->next; + } +} + +/* + * "Delete" a node. Strictly speaking remove a route to a node. The node + * is only deleted if no routes are left to it. + */ +static int rose_del_node(struct rose_route_struct *rose_route, + struct net_device *dev) +{ + struct rose_node *rose_node; + struct rose_neigh *rose_neigh; + int i, err = 0; + + spin_lock_bh(&rose_node_list_lock); + spin_lock_bh(&rose_neigh_list_lock); + + rose_node = rose_node_list; + while (rose_node != NULL) { + if ((rose_node->mask == rose_route->mask) && + (rosecmpm(&rose_route->address, &rose_node->address, + rose_route->mask) == 0)) + break; + rose_node = rose_node->next; + } + + if (rose_node == NULL || rose_node->loopback) { + err = -EINVAL; + goto out; + } + + rose_neigh = rose_neigh_list; + while (rose_neigh != NULL) { + if (ax25cmp(&rose_route->neighbour, + &rose_neigh->callsign) == 0 && + rose_neigh->dev == dev) + break; + rose_neigh = rose_neigh->next; + } + + if (rose_neigh == NULL) { + err = -EINVAL; + goto out; + } + + for (i = 0; i < rose_node->count; i++) { + if (rose_node->neighbour[i] == rose_neigh) { + rose_neigh->count--; + + if (rose_neigh->count == 0 && rose_neigh->use == 0) + rose_remove_neigh(rose_neigh); + + rose_node->count--; + + if (rose_node->count == 0) { + rose_remove_node(rose_node); + } else { + switch (i) { + case 0: + rose_node->neighbour[0] = + rose_node->neighbour[1]; + case 1: + rose_node->neighbour[1] = + rose_node->neighbour[2]; + case 2: + break; + } + } + goto out; + } + } + err = -EINVAL; + +out: + spin_unlock_bh(&rose_neigh_list_lock); + spin_unlock_bh(&rose_node_list_lock); + + return err; +} + +/* + * Add the loopback neighbour. 
+ */ +void rose_add_loopback_neigh(void) +{ + struct rose_neigh *sn; + + rose_loopback_neigh = kmalloc(sizeof(struct rose_neigh), GFP_KERNEL); + if (!rose_loopback_neigh) + return; + sn = rose_loopback_neigh; + + sn->callsign = null_ax25_address; + sn->digipeat = NULL; + sn->ax25 = NULL; + sn->dev = NULL; + sn->count = 0; + sn->use = 0; + sn->dce_mode = 1; + sn->loopback = 1; + sn->number = rose_neigh_no++; + sn->restarted = 1; + + skb_queue_head_init(&sn->queue); + + init_timer(&sn->ftimer); + init_timer(&sn->t0timer); + + spin_lock_bh(&rose_neigh_list_lock); + sn->next = rose_neigh_list; + rose_neigh_list = sn; + spin_unlock_bh(&rose_neigh_list_lock); +} + +/* + * Add a loopback node. + */ +int rose_add_loopback_node(rose_address *address) +{ + struct rose_node *rose_node; + int err = 0; + + spin_lock_bh(&rose_node_list_lock); + + rose_node = rose_node_list; + while (rose_node != NULL) { + if ((rose_node->mask == 10) && + (rosecmpm(address, &rose_node->address, 10) == 0) && + rose_node->loopback) + break; + rose_node = rose_node->next; + } + + if (rose_node != NULL) + goto out; + + if ((rose_node = kmalloc(sizeof(*rose_node), GFP_ATOMIC)) == NULL) { + err = -ENOMEM; + goto out; + } + + rose_node->address = *address; + rose_node->mask = 10; + rose_node->count = 1; + rose_node->loopback = 1; + rose_node->neighbour[0] = rose_loopback_neigh; + + /* Insert at the head of list. Address is always mask=10 */ + rose_node->next = rose_node_list; + rose_node_list = rose_node; + + rose_loopback_neigh->count++; + +out: + spin_unlock_bh(&rose_node_list_lock); + + return err; +} + +/* + * Delete a loopback node. + */ +void rose_del_loopback_node(rose_address *address) +{ + struct rose_node *rose_node; + + spin_lock_bh(&rose_node_list_lock); + + rose_node = rose_node_list; + while (rose_node != NULL) { + if ((rose_node->mask == 10) && + (rosecmpm(address, &rose_node->address, 10) == 0) && + rose_node->loopback) + break; + rose_node = rose_node->next; + } + + if (rose_node == NULL) + goto out; + + rose_remove_node(rose_node); + + rose_loopback_neigh->count--; + +out: + spin_unlock_bh(&rose_node_list_lock); +} + +/* + * A device has been removed. Remove its routes and neighbours. + */ +void rose_rt_device_down(struct net_device *dev) +{ + struct rose_neigh *s, *rose_neigh; + struct rose_node *t, *rose_node; + int i; + + spin_lock_bh(&rose_node_list_lock); + spin_lock_bh(&rose_neigh_list_lock); + rose_neigh = rose_neigh_list; + while (rose_neigh != NULL) { + s = rose_neigh; + rose_neigh = rose_neigh->next; + + if (s->dev != dev) + continue; + + rose_node = rose_node_list; + + while (rose_node != NULL) { + t = rose_node; + rose_node = rose_node->next; + + for (i = 0; i < t->count; i++) { + if (t->neighbour[i] != s) + continue; + + t->count--; + + switch (i) { + case 0: + t->neighbour[0] = t->neighbour[1]; + case 1: + t->neighbour[1] = t->neighbour[2]; + case 2: + break; + } + } + + if (t->count <= 0) + rose_remove_node(t); + } + + rose_remove_neigh(s); + } + spin_unlock_bh(&rose_neigh_list_lock); + spin_unlock_bh(&rose_node_list_lock); +} + +#if 0 /* Currently unused */ +/* + * A device has been removed. Remove its links. 
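 *
 * A note on rose_rt_device_down() above (and rose_del_node() earlier in
 * this file): a node keeps at most three neighbours in neighbour[0..2],
 * and when entry i is dropped the switch statement deliberately falls
 * through so the later entries shift down one slot.  Expanded, the
 * fall-through is equivalent to:
 *
 *	if (i == 0)
 *		t->neighbour[0] = t->neighbour[1];
 *	if (i <= 1)
 *		t->neighbour[1] = t->neighbour[2];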
+ */ +void rose_route_device_down(struct net_device *dev) +{ + struct rose_route *s, *rose_route; + + spin_lock_bh(&rose_route_list_lock); + rose_route = rose_route_list; + while (rose_route != NULL) { + s = rose_route; + rose_route = rose_route->next; + + if (s->neigh1->dev == dev || s->neigh2->dev == dev) + rose_remove_route(s); + } + spin_unlock_bh(&rose_route_list_lock); +} +#endif + +/* + * Clear all nodes and neighbours out, except for neighbours with + * active connections going through them. + * Do not clear loopback neighbour and nodes. + */ +static int rose_clear_routes(void) +{ + struct rose_neigh *s, *rose_neigh; + struct rose_node *t, *rose_node; + + spin_lock_bh(&rose_node_list_lock); + spin_lock_bh(&rose_neigh_list_lock); + + rose_neigh = rose_neigh_list; + rose_node = rose_node_list; + + while (rose_node != NULL) { + t = rose_node; + rose_node = rose_node->next; + if (!t->loopback) + rose_remove_node(t); + } + + while (rose_neigh != NULL) { + s = rose_neigh; + rose_neigh = rose_neigh->next; + + if (s->use == 0 && !s->loopback) { + s->count = 0; + rose_remove_neigh(s); + } + } + + spin_unlock_bh(&rose_neigh_list_lock); + spin_unlock_bh(&rose_node_list_lock); + + return 0; +} + +/* + * Check that the device given is a valid AX.25 interface that is "up". + * called with RTNL + */ +static struct net_device *rose_ax25_dev_find(char *devname) +{ + struct net_device *dev; + + if ((dev = __dev_get_by_name(&init_net, devname)) == NULL) + return NULL; + + if ((dev->flags & IFF_UP) && dev->type == ARPHRD_AX25) + return dev; + + return NULL; +} + +/* + * Find the first active ROSE device, usually "rose0". + */ +struct net_device *rose_dev_first(void) +{ + struct net_device *dev, *first = NULL; + + rcu_read_lock(); + for_each_netdev_rcu(&init_net, dev) { + if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE) + if (first == NULL || strncmp(dev->name, first->name, 3) < 0) + first = dev; + } + rcu_read_unlock(); + + return first; +} + +/* + * Find the ROSE device for the given address. + */ +struct net_device *rose_dev_get(rose_address *addr) +{ + struct net_device *dev; + + rcu_read_lock(); + for_each_netdev_rcu(&init_net, dev) { + if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE && rosecmp(addr, (rose_address *)dev->dev_addr) == 0) { + dev_hold(dev); + goto out; + } + } + dev = NULL; +out: + rcu_read_unlock(); + return dev; +} + +static int rose_dev_exists(rose_address *addr) +{ + struct net_device *dev; + + rcu_read_lock(); + for_each_netdev_rcu(&init_net, dev) { + if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE && rosecmp(addr, (rose_address *)dev->dev_addr) == 0) + goto out; + } + dev = NULL; +out: + rcu_read_unlock(); + return dev != NULL; +} + + + + +struct rose_route *rose_route_free_lci(unsigned int lci, struct rose_neigh *neigh) +{ + struct rose_route *rose_route; + + for (rose_route = rose_route_list; rose_route != NULL; rose_route = rose_route->next) + if ((rose_route->neigh1 == neigh && rose_route->lci1 == lci) || + (rose_route->neigh2 == neigh && rose_route->lci2 == lci)) + return rose_route; + + return NULL; +} + +/* + * Find a neighbour or a route given a ROSE address. 
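 *
 * The lookup below makes two passes for a connect request: first it looks
 * for a matching node with a neighbour whose link is already restarted;
 * failing that, it accepts any neighbour that is not sitting out its
 * failure (ftimer) hold-off.  The cause reported on failure distinguishes
 * the two outcomes:
 *
 *	if (failed)			// a node matched, but every link is held off
 *		*cause = ROSE_OUT_OF_ORDER;
 *	else				// no matching node with a usable link
 *		*cause = ROSE_NOT_OBTAINABLE;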
+ */ +struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause, + unsigned char *diagnostic, int route_frame) +{ + struct rose_neigh *res = NULL; + struct rose_node *node; + int failed = 0; + int i; + + if (!route_frame) spin_lock_bh(&rose_node_list_lock); + for (node = rose_node_list; node != NULL; node = node->next) { + if (rosecmpm(addr, &node->address, node->mask) == 0) { + for (i = 0; i < node->count; i++) { + if (node->neighbour[i]->restarted) { + res = node->neighbour[i]; + goto out; + } + } + } + } + if (!route_frame) { /* connect request */ + for (node = rose_node_list; node != NULL; node = node->next) { + if (rosecmpm(addr, &node->address, node->mask) == 0) { + for (i = 0; i < node->count; i++) { + if (!rose_ftimer_running(node->neighbour[i])) { + res = node->neighbour[i]; + failed = 0; + goto out; + } + failed = 1; + } + } + } + } + + if (failed) { + *cause = ROSE_OUT_OF_ORDER; + *diagnostic = 0; + } else { + *cause = ROSE_NOT_OBTAINABLE; + *diagnostic = 0; + } + +out: + if (!route_frame) spin_unlock_bh(&rose_node_list_lock); + return res; +} + +/* + * Handle the ioctls that control the routing functions. + */ +int rose_rt_ioctl(unsigned int cmd, void __user *arg) +{ + struct rose_route_struct rose_route; + struct net_device *dev; + int err; + + switch (cmd) { + case SIOCADDRT: + if (copy_from_user(&rose_route, arg, sizeof(struct rose_route_struct))) + return -EFAULT; + if ((dev = rose_ax25_dev_find(rose_route.device)) == NULL) + return -EINVAL; + if (rose_dev_exists(&rose_route.address)) /* Can't add routes to ourself */ + return -EINVAL; + if (rose_route.mask > 10) /* Mask can't be more than 10 digits */ + return -EINVAL; + if (rose_route.ndigis > AX25_MAX_DIGIS) + return -EINVAL; + err = rose_add_node(&rose_route, dev); + return err; + + case SIOCDELRT: + if (copy_from_user(&rose_route, arg, sizeof(struct rose_route_struct))) + return -EFAULT; + if ((dev = rose_ax25_dev_find(rose_route.device)) == NULL) + return -EINVAL; + err = rose_del_node(&rose_route, dev); + return err; + + case SIOCRSCLRRT: + return rose_clear_routes(); + + default: + return -EINVAL; + } + + return 0; +} + +static void rose_del_route_by_neigh(struct rose_neigh *rose_neigh) +{ + struct rose_route *rose_route, *s; + + rose_neigh->restarted = 0; + + rose_stop_t0timer(rose_neigh); + rose_start_ftimer(rose_neigh); + + skb_queue_purge(&rose_neigh->queue); + + spin_lock_bh(&rose_route_list_lock); + + rose_route = rose_route_list; + + while (rose_route != NULL) { + if ((rose_route->neigh1 == rose_neigh && rose_route->neigh2 == rose_neigh) || + (rose_route->neigh1 == rose_neigh && rose_route->neigh2 == NULL) || + (rose_route->neigh2 == rose_neigh && rose_route->neigh1 == NULL)) { + s = rose_route->next; + rose_remove_route(rose_route); + rose_route = s; + continue; + } + + if (rose_route->neigh1 == rose_neigh) { + rose_route->neigh1->use--; + rose_route->neigh1 = NULL; + rose_transmit_clear_request(rose_route->neigh2, rose_route->lci2, ROSE_OUT_OF_ORDER, 0); + } + + if (rose_route->neigh2 == rose_neigh) { + rose_route->neigh2->use--; + rose_route->neigh2 = NULL; + rose_transmit_clear_request(rose_route->neigh1, rose_route->lci1, ROSE_OUT_OF_ORDER, 0); + } + + rose_route = rose_route->next; + } + spin_unlock_bh(&rose_route_list_lock); +} + +/* + * A level 2 link has timed out, therefore it appears to be a poor link, + * then don't use that neighbour until it is reset. Blow away all through + * routes and connections using this route. 
+ */ +void rose_link_failed(ax25_cb *ax25, int reason) +{ + struct rose_neigh *rose_neigh; + + spin_lock_bh(&rose_neigh_list_lock); + rose_neigh = rose_neigh_list; + while (rose_neigh != NULL) { + if (rose_neigh->ax25 == ax25) + break; + rose_neigh = rose_neigh->next; + } + + if (rose_neigh != NULL) { + rose_neigh->ax25 = NULL; + ax25_cb_put(ax25); + + rose_del_route_by_neigh(rose_neigh); + rose_kill_by_neigh(rose_neigh); + } + spin_unlock_bh(&rose_neigh_list_lock); +} + +/* + * A device has been "downed" remove its link status. Blow away all + * through routes and connections that use this device. + */ +void rose_link_device_down(struct net_device *dev) +{ + struct rose_neigh *rose_neigh; + + for (rose_neigh = rose_neigh_list; rose_neigh != NULL; rose_neigh = rose_neigh->next) { + if (rose_neigh->dev == dev) { + rose_del_route_by_neigh(rose_neigh); + rose_kill_by_neigh(rose_neigh); + } + } +} + +/* + * Route a frame to an appropriate AX.25 connection. + */ +int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) +{ + struct rose_neigh *rose_neigh, *new_neigh; + struct rose_route *rose_route; + struct rose_facilities_struct facilities; + rose_address *src_addr, *dest_addr; + struct sock *sk; + unsigned short frametype; + unsigned int lci, new_lci; + unsigned char cause, diagnostic; + struct net_device *dev; + int res = 0; + char buf[11]; + +#if 0 + if (call_in_firewall(PF_ROSE, skb->dev, skb->data, NULL, &skb) != FW_ACCEPT) + return res; +#endif + + if (skb->len < ROSE_MIN_LEN) + return res; + frametype = skb->data[2]; + lci = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF); + if (frametype == ROSE_CALL_REQUEST && + (skb->len <= ROSE_CALL_REQ_FACILITIES_OFF || + skb->data[ROSE_CALL_REQ_ADDR_LEN_OFF] != + ROSE_CALL_REQ_ADDR_LEN_VAL)) + return res; + src_addr = (rose_address *)(skb->data + ROSE_CALL_REQ_SRC_ADDR_OFF); + dest_addr = (rose_address *)(skb->data + ROSE_CALL_REQ_DEST_ADDR_OFF); + + spin_lock_bh(&rose_neigh_list_lock); + spin_lock_bh(&rose_route_list_lock); + + rose_neigh = rose_neigh_list; + while (rose_neigh != NULL) { + if (ax25cmp(&ax25->dest_addr, &rose_neigh->callsign) == 0 && + ax25->ax25_dev->dev == rose_neigh->dev) + break; + rose_neigh = rose_neigh->next; + } + + if (rose_neigh == NULL) { + printk("rose_route : unknown neighbour or device %s\n", + ax2asc(buf, &ax25->dest_addr)); + goto out; + } + + /* + * Obviously the link is working, halt the ftimer. + */ + rose_stop_ftimer(rose_neigh); + + /* + * LCI of zero is always for us, and its always a restart + * frame. + */ + if (lci == 0) { + rose_link_rx_restart(skb, rose_neigh, frametype); + goto out; + } + + /* + * Find an existing socket. + */ + if ((sk = rose_find_socket(lci, rose_neigh)) != NULL) { + if (frametype == ROSE_CALL_REQUEST) { + struct rose_sock *rose = rose_sk(sk); + + /* Remove an existing unused socket */ + rose_clear_queues(sk); + rose->cause = ROSE_NETWORK_CONGESTION; + rose->diagnostic = 0; + rose->neighbour->use--; + rose->neighbour = NULL; + rose->lci = 0; + rose->state = ROSE_STATE_0; + sk->sk_state = TCP_CLOSE; + sk->sk_err = 0; + sk->sk_shutdown |= SEND_SHUTDOWN; + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sock_set_flag(sk, SOCK_DEAD); + } + } + else { + skb_reset_transport_header(skb); + res = rose_process_rx_frame(sk, skb); + goto out; + } + } + + /* + * Is is a Call Request and is it for us ? 
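/*
 * A small stand-alone sketch (not taken from this patch) of the header
 * arithmetic used by rose_route_frame() above: byte 0 carries the GFI in the
 * top nibble and the high four bits of the 12-bit LCI in the low nibble,
 * byte 1 carries the low eight LCI bits, and byte 2 is the packet type.
 * The sample bytes are invented for illustration.
 */
#include <stdio.h>

int main(void)
{
        unsigned char hdr[3] = { 0x3A, 0x5C, 0x0B };    /* GFI|LCI-high, LCI-low, frametype */
        unsigned int lci = ((hdr[0] << 8) & 0xF00) | (hdr[1] & 0x0FF);
        unsigned int frametype = hdr[2];

        printf("lci=0x%03X frametype=0x%02X\n", lci, frametype);  /* lci=0xA5C frametype=0x0B */
        return 0;
}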
+ */ + if (frametype == ROSE_CALL_REQUEST) + if ((dev = rose_dev_get(dest_addr)) != NULL) { + res = rose_rx_call_request(skb, dev, rose_neigh, lci); + dev_put(dev); + goto out; + } + + if (!sysctl_rose_routing_control) { + rose_transmit_clear_request(rose_neigh, lci, ROSE_NOT_OBTAINABLE, 0); + goto out; + } + + /* + * Route it to the next in line if we have an entry for it. + */ + rose_route = rose_route_list; + while (rose_route != NULL) { + if (rose_route->lci1 == lci && + rose_route->neigh1 == rose_neigh) { + if (frametype == ROSE_CALL_REQUEST) { + /* F6FBB - Remove an existing unused route */ + rose_remove_route(rose_route); + break; + } else if (rose_route->neigh2 != NULL) { + skb->data[0] &= 0xF0; + skb->data[0] |= (rose_route->lci2 >> 8) & 0x0F; + skb->data[1] = (rose_route->lci2 >> 0) & 0xFF; + rose_transmit_link(skb, rose_route->neigh2); + if (frametype == ROSE_CLEAR_CONFIRMATION) + rose_remove_route(rose_route); + res = 1; + goto out; + } else { + if (frametype == ROSE_CLEAR_CONFIRMATION) + rose_remove_route(rose_route); + goto out; + } + } + if (rose_route->lci2 == lci && + rose_route->neigh2 == rose_neigh) { + if (frametype == ROSE_CALL_REQUEST) { + /* F6FBB - Remove an existing unused route */ + rose_remove_route(rose_route); + break; + } else if (rose_route->neigh1 != NULL) { + skb->data[0] &= 0xF0; + skb->data[0] |= (rose_route->lci1 >> 8) & 0x0F; + skb->data[1] = (rose_route->lci1 >> 0) & 0xFF; + rose_transmit_link(skb, rose_route->neigh1); + if (frametype == ROSE_CLEAR_CONFIRMATION) + rose_remove_route(rose_route); + res = 1; + goto out; + } else { + if (frametype == ROSE_CLEAR_CONFIRMATION) + rose_remove_route(rose_route); + goto out; + } + } + rose_route = rose_route->next; + } + + /* + * We know that: + * 1. The frame isn't for us, + * 2. It isn't "owned" by any existing route. + */ + if (frametype != ROSE_CALL_REQUEST) { /* XXX */ + res = 0; + goto out; + } + + memset(&facilities, 0x00, sizeof(struct rose_facilities_struct)); + + if (!rose_parse_facilities(skb->data + ROSE_CALL_REQ_FACILITIES_OFF, + skb->len - ROSE_CALL_REQ_FACILITIES_OFF, + &facilities)) { + rose_transmit_clear_request(rose_neigh, lci, ROSE_INVALID_FACILITY, 76); + goto out; + } + + /* + * Check for routing loops. 
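/*
 * A sketch (not taken from this patch) of the LCI re-stamping the forwarding
 * path above performs before handing a frame to the second neighbour: the
 * GFI nibble in byte 0 is preserved and only the 12-bit LCI is replaced.
 * The values are illustrative.
 */
#include <stdio.h>

static void restamp_lci(unsigned char *hdr, unsigned int new_lci)
{
        hdr[0] &= 0xF0;                         /* keep the GFI nibble       */
        hdr[0] |= (new_lci >> 8) & 0x0F;        /* high four bits of the LCI */
        hdr[1]  = new_lci & 0xFF;               /* low eight bits of the LCI */
}

int main(void)
{
        unsigned char hdr[2] = { 0x3A, 0x5C };  /* currently LCI 0xA5C       */

        restamp_lci(hdr, 0x123);
        printf("%02X %02X\n", hdr[0], hdr[1]);  /* prints "31 23"            */
        return 0;
}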
+ */ + rose_route = rose_route_list; + while (rose_route != NULL) { + if (rose_route->rand == facilities.rand && + rosecmp(src_addr, &rose_route->src_addr) == 0 && + ax25cmp(&facilities.dest_call, &rose_route->src_call) == 0 && + ax25cmp(&facilities.source_call, &rose_route->dest_call) == 0) { + rose_transmit_clear_request(rose_neigh, lci, ROSE_NOT_OBTAINABLE, 120); + goto out; + } + rose_route = rose_route->next; + } + + if ((new_neigh = rose_get_neigh(dest_addr, &cause, &diagnostic, 1)) == NULL) { + rose_transmit_clear_request(rose_neigh, lci, cause, diagnostic); + goto out; + } + + if ((new_lci = rose_new_lci(new_neigh)) == 0) { + rose_transmit_clear_request(rose_neigh, lci, ROSE_NETWORK_CONGESTION, 71); + goto out; + } + + if ((rose_route = kmalloc(sizeof(*rose_route), GFP_ATOMIC)) == NULL) { + rose_transmit_clear_request(rose_neigh, lci, ROSE_NETWORK_CONGESTION, 120); + goto out; + } + + rose_route->lci1 = lci; + rose_route->src_addr = *src_addr; + rose_route->dest_addr = *dest_addr; + rose_route->src_call = facilities.dest_call; + rose_route->dest_call = facilities.source_call; + rose_route->rand = facilities.rand; + rose_route->neigh1 = rose_neigh; + rose_route->lci2 = new_lci; + rose_route->neigh2 = new_neigh; + + rose_route->neigh1->use++; + rose_route->neigh2->use++; + + rose_route->next = rose_route_list; + rose_route_list = rose_route; + + skb->data[0] &= 0xF0; + skb->data[0] |= (rose_route->lci2 >> 8) & 0x0F; + skb->data[1] = (rose_route->lci2 >> 0) & 0xFF; + + rose_transmit_link(skb, rose_route->neigh2); + res = 1; + +out: + spin_unlock_bh(&rose_route_list_lock); + spin_unlock_bh(&rose_neigh_list_lock); + + return res; +} + +#ifdef CONFIG_PROC_FS + +static void *rose_node_start(struct seq_file *seq, loff_t *pos) + __acquires(rose_node_list_lock) +{ + struct rose_node *rose_node; + int i = 1; + + spin_lock_bh(&rose_node_list_lock); + if (*pos == 0) + return SEQ_START_TOKEN; + + for (rose_node = rose_node_list; rose_node && i < *pos; + rose_node = rose_node->next, ++i); + + return (i == *pos) ? rose_node : NULL; +} + +static void *rose_node_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + + return (v == SEQ_START_TOKEN) ? 
rose_node_list + : ((struct rose_node *)v)->next; +} + +static void rose_node_stop(struct seq_file *seq, void *v) + __releases(rose_node_list_lock) +{ + spin_unlock_bh(&rose_node_list_lock); +} + +static int rose_node_show(struct seq_file *seq, void *v) +{ + char rsbuf[11]; + int i; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, "address mask n neigh neigh neigh\n"); + else { + const struct rose_node *rose_node = v; + /* if (rose_node->loopback) { + seq_printf(seq, "%-10s %04d 1 loopback\n", + rose2asc(rsbuf, &rose_node->address), + rose_node->mask); + } else { */ + seq_printf(seq, "%-10s %04d %d", + rose2asc(rsbuf, &rose_node->address), + rose_node->mask, + rose_node->count); + + for (i = 0; i < rose_node->count; i++) + seq_printf(seq, " %05d", + rose_node->neighbour[i]->number); + + seq_puts(seq, "\n"); + /* } */ + } + return 0; +} + +static const struct seq_operations rose_node_seqops = { + .start = rose_node_start, + .next = rose_node_next, + .stop = rose_node_stop, + .show = rose_node_show, +}; + +static int rose_nodes_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rose_node_seqops); +} + +const struct file_operations rose_nodes_fops = { + .owner = THIS_MODULE, + .open = rose_nodes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void *rose_neigh_start(struct seq_file *seq, loff_t *pos) + __acquires(rose_neigh_list_lock) +{ + struct rose_neigh *rose_neigh; + int i = 1; + + spin_lock_bh(&rose_neigh_list_lock); + if (*pos == 0) + return SEQ_START_TOKEN; + + for (rose_neigh = rose_neigh_list; rose_neigh && i < *pos; + rose_neigh = rose_neigh->next, ++i); + + return (i == *pos) ? rose_neigh : NULL; +} + +static void *rose_neigh_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + + return (v == SEQ_START_TOKEN) ? rose_neigh_list + : ((struct rose_neigh *)v)->next; +} + +static void rose_neigh_stop(struct seq_file *seq, void *v) + __releases(rose_neigh_list_lock) +{ + spin_unlock_bh(&rose_neigh_list_lock); +} + +static int rose_neigh_show(struct seq_file *seq, void *v) +{ + char buf[11]; + int i; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "addr callsign dev count use mode restart t0 tf digipeaters\n"); + else { + struct rose_neigh *rose_neigh = v; + + /* if (!rose_neigh->loopback) { */ + seq_printf(seq, "%05d %-9s %-4s %3d %3d %3s %3s %3lu %3lu", + rose_neigh->number, + (rose_neigh->loopback) ? "RSLOOP-0" : ax2asc(buf, &rose_neigh->callsign), + rose_neigh->dev ? rose_neigh->dev->name : "???", + rose_neigh->count, + rose_neigh->use, + (rose_neigh->dce_mode) ? "DCE" : "DTE", + (rose_neigh->restarted) ? 
"yes" : "no", + ax25_display_timer(&rose_neigh->t0timer) / HZ, + ax25_display_timer(&rose_neigh->ftimer) / HZ); + + if (rose_neigh->digipeat != NULL) { + for (i = 0; i < rose_neigh->digipeat->ndigi; i++) + seq_printf(seq, " %s", ax2asc(buf, &rose_neigh->digipeat->calls[i])); + } + + seq_puts(seq, "\n"); + } + return 0; +} + + +static const struct seq_operations rose_neigh_seqops = { + .start = rose_neigh_start, + .next = rose_neigh_next, + .stop = rose_neigh_stop, + .show = rose_neigh_show, +}; + +static int rose_neigh_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rose_neigh_seqops); +} + +const struct file_operations rose_neigh_fops = { + .owner = THIS_MODULE, + .open = rose_neigh_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + + +static void *rose_route_start(struct seq_file *seq, loff_t *pos) + __acquires(rose_route_list_lock) +{ + struct rose_route *rose_route; + int i = 1; + + spin_lock_bh(&rose_route_list_lock); + if (*pos == 0) + return SEQ_START_TOKEN; + + for (rose_route = rose_route_list; rose_route && i < *pos; + rose_route = rose_route->next, ++i); + + return (i == *pos) ? rose_route : NULL; +} + +static void *rose_route_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + + return (v == SEQ_START_TOKEN) ? rose_route_list + : ((struct rose_route *)v)->next; +} + +static void rose_route_stop(struct seq_file *seq, void *v) + __releases(rose_route_list_lock) +{ + spin_unlock_bh(&rose_route_list_lock); +} + +static int rose_route_show(struct seq_file *seq, void *v) +{ + char buf[11], rsbuf[11]; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "lci address callsign neigh <-> lci address callsign neigh\n"); + else { + struct rose_route *rose_route = v; + + if (rose_route->neigh1) + seq_printf(seq, + "%3.3X %-10s %-9s %05d ", + rose_route->lci1, + rose2asc(rsbuf, &rose_route->src_addr), + ax2asc(buf, &rose_route->src_call), + rose_route->neigh1->number); + else + seq_puts(seq, + "000 * * 00000 "); + + if (rose_route->neigh2) + seq_printf(seq, + "%3.3X %-10s %-9s %05d\n", + rose_route->lci2, + rose2asc(rsbuf, &rose_route->dest_addr), + ax2asc(buf, &rose_route->dest_call), + rose_route->neigh2->number); + else + seq_puts(seq, + "000 * * 00000\n"); + } + return 0; +} + +static const struct seq_operations rose_route_seqops = { + .start = rose_route_start, + .next = rose_route_next, + .stop = rose_route_stop, + .show = rose_route_show, +}; + +static int rose_route_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rose_route_seqops); +} + +const struct file_operations rose_routes_fops = { + .owner = THIS_MODULE, + .open = rose_route_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif /* CONFIG_PROC_FS */ + +/* + * Release all memory associated with ROSE routing structures. 
+ */ +void __exit rose_rt_free(void) +{ + struct rose_neigh *s, *rose_neigh = rose_neigh_list; + struct rose_node *t, *rose_node = rose_node_list; + struct rose_route *u, *rose_route = rose_route_list; + + while (rose_neigh != NULL) { + s = rose_neigh; + rose_neigh = rose_neigh->next; + + rose_remove_neigh(s); + } + + while (rose_node != NULL) { + t = rose_node; + rose_node = rose_node->next; + + rose_remove_node(t); + } + + while (rose_route != NULL) { + u = rose_route; + rose_route = rose_route->next; + + rose_remove_route(u); + } +} diff --git a/net/rose/rose_subr.c b/net/rose/rose_subr.c new file mode 100644 index 00000000..f6c71caa --- /dev/null +++ b/net/rose/rose_subr.c @@ -0,0 +1,557 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int rose_create_facilities(unsigned char *buffer, struct rose_sock *rose); + +/* + * This routine purges all of the queues of frames. + */ +void rose_clear_queues(struct sock *sk) +{ + skb_queue_purge(&sk->sk_write_queue); + skb_queue_purge(&rose_sk(sk)->ack_queue); +} + +/* + * This routine purges the input queue of those frames that have been + * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the + * SDL diagram. + */ +void rose_frames_acked(struct sock *sk, unsigned short nr) +{ + struct sk_buff *skb; + struct rose_sock *rose = rose_sk(sk); + + /* + * Remove all the ack-ed frames from the ack queue. + */ + if (rose->va != nr) { + while (skb_peek(&rose->ack_queue) != NULL && rose->va != nr) { + skb = skb_dequeue(&rose->ack_queue); + kfree_skb(skb); + rose->va = (rose->va + 1) % ROSE_MODULUS; + } + } +} + +void rose_requeue_frames(struct sock *sk) +{ + struct sk_buff *skb, *skb_prev = NULL; + + /* + * Requeue all the un-ack-ed frames on the output queue to be picked + * up by rose_kick. This arrangement handles the possibility of an + * empty output queue. + */ + while ((skb = skb_dequeue(&rose_sk(sk)->ack_queue)) != NULL) { + if (skb_prev == NULL) + skb_queue_head(&sk->sk_write_queue, skb); + else + skb_append(skb_prev, skb, &sk->sk_write_queue); + skb_prev = skb; + } +} + +/* + * Validate that the value of nr is between va and vs. Return true or + * false for testing. + */ +int rose_validate_nr(struct sock *sk, unsigned short nr) +{ + struct rose_sock *rose = rose_sk(sk); + unsigned short vc = rose->va; + + while (vc != rose->vs) { + if (nr == vc) return 1; + vc = (vc + 1) % ROSE_MODULUS; + } + + return nr == rose->vs; +} + +/* + * This routine is called when the packet layer internally generates a + * control frame. 
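/*
 * A stand-alone sketch (not taken from this patch) of the modulo-window test
 * performed by rose_validate_nr() above: an N(R) is acceptable when it lies
 * on the circular path from V(A) (oldest unacknowledged frame) up to and
 * including V(S).  ROSE_MODULUS is assumed to be 8, matching the 3-bit
 * sequence numbers extracted by rose_decode().
 */
#include <stdio.h>

#define MODULUS 8

static int nr_valid(unsigned short va, unsigned short vs, unsigned short nr)
{
        unsigned short vc = va;

        while (vc != vs) {
                if (nr == vc)
                        return 1;
                vc = (vc + 1) % MODULUS;
        }
        return nr == vs;
}

int main(void)
{
        /* With V(A)=6 and V(S)=1 the window wraps: 6, 7, 0 and 1 are valid. */
        for (unsigned short nr = 0; nr < MODULUS; nr++)
                printf("nr=%u %s\n", nr, nr_valid(6, 1, nr) ? "valid" : "invalid");
        return 0;
}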
+ */ +void rose_write_internal(struct sock *sk, int frametype) +{ + struct rose_sock *rose = rose_sk(sk); + struct sk_buff *skb; + unsigned char *dptr; + unsigned char lci1, lci2; + char buffer[100]; + int len, faclen = 0; + + len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN + 1; + + switch (frametype) { + case ROSE_CALL_REQUEST: + len += 1 + ROSE_ADDR_LEN + ROSE_ADDR_LEN; + faclen = rose_create_facilities(buffer, rose); + len += faclen; + break; + case ROSE_CALL_ACCEPTED: + case ROSE_CLEAR_REQUEST: + case ROSE_RESET_REQUEST: + len += 2; + break; + } + + if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) + return; + + /* + * Space for AX.25 header and PID. + */ + skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + 1); + + dptr = skb_put(skb, skb_tailroom(skb)); + + lci1 = (rose->lci >> 8) & 0x0F; + lci2 = (rose->lci >> 0) & 0xFF; + + switch (frametype) { + case ROSE_CALL_REQUEST: + *dptr++ = ROSE_GFI | lci1; + *dptr++ = lci2; + *dptr++ = frametype; + *dptr++ = ROSE_CALL_REQ_ADDR_LEN_VAL; + memcpy(dptr, &rose->dest_addr, ROSE_ADDR_LEN); + dptr += ROSE_ADDR_LEN; + memcpy(dptr, &rose->source_addr, ROSE_ADDR_LEN); + dptr += ROSE_ADDR_LEN; + memcpy(dptr, buffer, faclen); + dptr += faclen; + break; + + case ROSE_CALL_ACCEPTED: + *dptr++ = ROSE_GFI | lci1; + *dptr++ = lci2; + *dptr++ = frametype; + *dptr++ = 0x00; /* Address length */ + *dptr++ = 0; /* Facilities length */ + break; + + case ROSE_CLEAR_REQUEST: + *dptr++ = ROSE_GFI | lci1; + *dptr++ = lci2; + *dptr++ = frametype; + *dptr++ = rose->cause; + *dptr++ = rose->diagnostic; + break; + + case ROSE_RESET_REQUEST: + *dptr++ = ROSE_GFI | lci1; + *dptr++ = lci2; + *dptr++ = frametype; + *dptr++ = ROSE_DTE_ORIGINATED; + *dptr++ = 0; + break; + + case ROSE_RR: + case ROSE_RNR: + *dptr++ = ROSE_GFI | lci1; + *dptr++ = lci2; + *dptr = frametype; + *dptr++ |= (rose->vr << 5) & 0xE0; + break; + + case ROSE_CLEAR_CONFIRMATION: + case ROSE_RESET_CONFIRMATION: + *dptr++ = ROSE_GFI | lci1; + *dptr++ = lci2; + *dptr++ = frametype; + break; + + default: + printk(KERN_ERR "ROSE: rose_write_internal - invalid frametype %02X\n", frametype); + kfree_skb(skb); + return; + } + + rose_transmit_link(skb, rose->neighbour); +} + +int rose_decode(struct sk_buff *skb, int *ns, int *nr, int *q, int *d, int *m) +{ + unsigned char *frame; + + frame = skb->data; + + *ns = *nr = *q = *d = *m = 0; + + switch (frame[2]) { + case ROSE_CALL_REQUEST: + case ROSE_CALL_ACCEPTED: + case ROSE_CLEAR_REQUEST: + case ROSE_CLEAR_CONFIRMATION: + case ROSE_RESET_REQUEST: + case ROSE_RESET_CONFIRMATION: + return frame[2]; + default: + break; + } + + if ((frame[2] & 0x1F) == ROSE_RR || + (frame[2] & 0x1F) == ROSE_RNR) { + *nr = (frame[2] >> 5) & 0x07; + return frame[2] & 0x1F; + } + + if ((frame[2] & 0x01) == ROSE_DATA) { + *q = (frame[0] & ROSE_Q_BIT) == ROSE_Q_BIT; + *d = (frame[0] & ROSE_D_BIT) == ROSE_D_BIT; + *m = (frame[2] & ROSE_M_BIT) == ROSE_M_BIT; + *nr = (frame[2] >> 5) & 0x07; + *ns = (frame[2] >> 1) & 0x07; + return ROSE_DATA; + } + + return ROSE_ILLEGAL; +} + +static int rose_parse_national(unsigned char *p, struct rose_facilities_struct *facilities, int len) +{ + unsigned char *pt; + unsigned char l, lg, n = 0; + int fac_national_digis_received = 0; + + do { + switch (*p & 0xC0) { + case 0x00: + if (len < 2) + return -1; + p += 2; + n += 2; + len -= 2; + break; + + case 0x40: + if (len < 3) + return -1; + if (*p == FAC_NATIONAL_RAND) + facilities->rand = ((p[1] << 8) & 0xFF00) + ((p[2] << 0) & 0x00FF); + p += 3; + n += 3; + len -= 3; + break; + + case 
0x80: + if (len < 4) + return -1; + p += 4; + n += 4; + len -= 4; + break; + + case 0xC0: + if (len < 2) + return -1; + l = p[1]; + if (len < 2 + l) + return -1; + if (*p == FAC_NATIONAL_DEST_DIGI) { + if (!fac_national_digis_received) { + if (l < AX25_ADDR_LEN) + return -1; + memcpy(&facilities->source_digis[0], p + 2, AX25_ADDR_LEN); + facilities->source_ndigis = 1; + } + } + else if (*p == FAC_NATIONAL_SRC_DIGI) { + if (!fac_national_digis_received) { + if (l < AX25_ADDR_LEN) + return -1; + memcpy(&facilities->dest_digis[0], p + 2, AX25_ADDR_LEN); + facilities->dest_ndigis = 1; + } + } + else if (*p == FAC_NATIONAL_FAIL_CALL) { + if (l < AX25_ADDR_LEN) + return -1; + memcpy(&facilities->fail_call, p + 2, AX25_ADDR_LEN); + } + else if (*p == FAC_NATIONAL_FAIL_ADD) { + if (l < 1 + ROSE_ADDR_LEN) + return -1; + memcpy(&facilities->fail_addr, p + 3, ROSE_ADDR_LEN); + } + else if (*p == FAC_NATIONAL_DIGIS) { + if (l % AX25_ADDR_LEN) + return -1; + fac_national_digis_received = 1; + facilities->source_ndigis = 0; + facilities->dest_ndigis = 0; + for (pt = p + 2, lg = 0 ; lg < l ; pt += AX25_ADDR_LEN, lg += AX25_ADDR_LEN) { + if (pt[6] & AX25_HBIT) { + if (facilities->dest_ndigis >= ROSE_MAX_DIGIS) + return -1; + memcpy(&facilities->dest_digis[facilities->dest_ndigis++], pt, AX25_ADDR_LEN); + } else { + if (facilities->source_ndigis >= ROSE_MAX_DIGIS) + return -1; + memcpy(&facilities->source_digis[facilities->source_ndigis++], pt, AX25_ADDR_LEN); + } + } + } + p += l + 2; + n += l + 2; + len -= l + 2; + break; + } + } while (*p != 0x00 && len > 0); + + return n; +} + +static int rose_parse_ccitt(unsigned char *p, struct rose_facilities_struct *facilities, int len) +{ + unsigned char l, n = 0; + char callsign[11]; + + do { + switch (*p & 0xC0) { + case 0x00: + if (len < 2) + return -1; + p += 2; + n += 2; + len -= 2; + break; + + case 0x40: + if (len < 3) + return -1; + p += 3; + n += 3; + len -= 3; + break; + + case 0x80: + if (len < 4) + return -1; + p += 4; + n += 4; + len -= 4; + break; + + case 0xC0: + if (len < 2) + return -1; + l = p[1]; + + /* Prevent overflows*/ + if (l < 10 || l > 20) + return -1; + + if (*p == FAC_CCITT_DEST_NSAP) { + memcpy(&facilities->source_addr, p + 7, ROSE_ADDR_LEN); + memcpy(callsign, p + 12, l - 10); + callsign[l - 10] = '\0'; + asc2ax(&facilities->source_call, callsign); + } + if (*p == FAC_CCITT_SRC_NSAP) { + memcpy(&facilities->dest_addr, p + 7, ROSE_ADDR_LEN); + memcpy(callsign, p + 12, l - 10); + callsign[l - 10] = '\0'; + asc2ax(&facilities->dest_call, callsign); + } + p += l + 2; + n += l + 2; + len -= l + 2; + break; + } + } while (*p != 0x00 && len > 0); + + return n; +} + +int rose_parse_facilities(unsigned char *p, unsigned packet_len, + struct rose_facilities_struct *facilities) +{ + int facilities_len, len; + + facilities_len = *p++; + + if (facilities_len == 0 || (unsigned)facilities_len > packet_len) + return 0; + + while (facilities_len >= 3 && *p == 0x00) { + facilities_len--; + p++; + + switch (*p) { + case FAC_NATIONAL: /* National */ + len = rose_parse_national(p + 1, facilities, facilities_len - 1); + break; + + case FAC_CCITT: /* CCITT */ + len = rose_parse_ccitt(p + 1, facilities, facilities_len - 1); + break; + + default: + printk(KERN_DEBUG "ROSE: rose_parse_facilities - unknown facilities family %02X\n", *p); + len = 1; + break; + } + + if (len < 0) + return 0; + if (WARN_ON(len >= facilities_len)) + return 0; + facilities_len -= len + 1; + p += len + 1; + } + + return facilities_len == 0; +} + +static int 
rose_create_facilities(unsigned char *buffer, struct rose_sock *rose) +{ + unsigned char *p = buffer + 1; + char *callsign; + char buf[11]; + int len, nb; + + /* National Facilities */ + if (rose->rand != 0 || rose->source_ndigis == 1 || rose->dest_ndigis == 1) { + *p++ = 0x00; + *p++ = FAC_NATIONAL; + + if (rose->rand != 0) { + *p++ = FAC_NATIONAL_RAND; + *p++ = (rose->rand >> 8) & 0xFF; + *p++ = (rose->rand >> 0) & 0xFF; + } + + /* Sent before older facilities */ + if ((rose->source_ndigis > 0) || (rose->dest_ndigis > 0)) { + int maxdigi = 0; + *p++ = FAC_NATIONAL_DIGIS; + *p++ = AX25_ADDR_LEN * (rose->source_ndigis + rose->dest_ndigis); + for (nb = 0 ; nb < rose->source_ndigis ; nb++) { + if (++maxdigi >= ROSE_MAX_DIGIS) + break; + memcpy(p, &rose->source_digis[nb], AX25_ADDR_LEN); + p[6] |= AX25_HBIT; + p += AX25_ADDR_LEN; + } + for (nb = 0 ; nb < rose->dest_ndigis ; nb++) { + if (++maxdigi >= ROSE_MAX_DIGIS) + break; + memcpy(p, &rose->dest_digis[nb], AX25_ADDR_LEN); + p[6] &= ~AX25_HBIT; + p += AX25_ADDR_LEN; + } + } + + /* For compatibility */ + if (rose->source_ndigis > 0) { + *p++ = FAC_NATIONAL_SRC_DIGI; + *p++ = AX25_ADDR_LEN; + memcpy(p, &rose->source_digis[0], AX25_ADDR_LEN); + p += AX25_ADDR_LEN; + } + + /* For compatibility */ + if (rose->dest_ndigis > 0) { + *p++ = FAC_NATIONAL_DEST_DIGI; + *p++ = AX25_ADDR_LEN; + memcpy(p, &rose->dest_digis[0], AX25_ADDR_LEN); + p += AX25_ADDR_LEN; + } + } + + *p++ = 0x00; + *p++ = FAC_CCITT; + + *p++ = FAC_CCITT_DEST_NSAP; + + callsign = ax2asc(buf, &rose->dest_call); + + *p++ = strlen(callsign) + 10; + *p++ = (strlen(callsign) + 9) * 2; /* ??? */ + + *p++ = 0x47; *p++ = 0x00; *p++ = 0x11; + *p++ = ROSE_ADDR_LEN * 2; + memcpy(p, &rose->dest_addr, ROSE_ADDR_LEN); + p += ROSE_ADDR_LEN; + + memcpy(p, callsign, strlen(callsign)); + p += strlen(callsign); + + *p++ = FAC_CCITT_SRC_NSAP; + + callsign = ax2asc(buf, &rose->source_call); + + *p++ = strlen(callsign) + 10; + *p++ = (strlen(callsign) + 9) * 2; /* ??? */ + + *p++ = 0x47; *p++ = 0x00; *p++ = 0x11; + *p++ = ROSE_ADDR_LEN * 2; + memcpy(p, &rose->source_addr, ROSE_ADDR_LEN); + p += ROSE_ADDR_LEN; + + memcpy(p, callsign, strlen(callsign)); + p += strlen(callsign); + + len = p - buffer; + buffer[0] = len - 1; + + return len; +} + +void rose_disconnect(struct sock *sk, int reason, int cause, int diagnostic) +{ + struct rose_sock *rose = rose_sk(sk); + + rose_stop_timer(sk); + rose_stop_idletimer(sk); + + rose_clear_queues(sk); + + rose->lci = 0; + rose->state = ROSE_STATE_0; + + if (cause != -1) + rose->cause = cause; + + if (diagnostic != -1) + rose->diagnostic = diagnostic; + + sk->sk_state = TCP_CLOSE; + sk->sk_err = reason; + sk->sk_shutdown |= SEND_SHUTDOWN; + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sock_set_flag(sk, SOCK_DEAD); + } +} diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c new file mode 100644 index 00000000..b6c8f38c --- /dev/null +++ b/net/rose/rose_timer.c @@ -0,0 +1,217 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
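/*
 * A sketch (not taken from this patch) of the X.25-style facility length
 * classes that rose_parse_national() and rose_parse_ccitt() above key off
 * the top two bits of the facility code: 0x00 is code plus one parameter
 * octet, 0x40 code plus two, 0x80 code plus three, and 0xC0 carries an
 * explicit length octet.  The return value is the total number of octets
 * the field occupies, or -1 if it does not fit in the remaining buffer.
 */
int facility_field_len(const unsigned char *p, int remaining)
{
        int need;

        switch (*p & 0xC0) {
        case 0x00: need = 2; break;                     /* code + 1 value octet     */
        case 0x40: need = 3; break;                     /* code + 2 value octets    */
        case 0x80: need = 4; break;                     /* code + 3 value octets    */
        default:                                        /* 0xC0: code + len + value */
                if (remaining < 2)
                        return -1;
                need = 2 + p[1];
                break;
        }
        return (remaining < need) ? -1 : need;
}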
+ * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) 2002 Ralf Baechle DO1GRB (ralf@gnu.org) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void rose_heartbeat_expiry(unsigned long); +static void rose_timer_expiry(unsigned long); +static void rose_idletimer_expiry(unsigned long); + +void rose_start_heartbeat(struct sock *sk) +{ + del_timer(&sk->sk_timer); + + sk->sk_timer.data = (unsigned long)sk; + sk->sk_timer.function = &rose_heartbeat_expiry; + sk->sk_timer.expires = jiffies + 5 * HZ; + + add_timer(&sk->sk_timer); +} + +void rose_start_t1timer(struct sock *sk) +{ + struct rose_sock *rose = rose_sk(sk); + + del_timer(&rose->timer); + + rose->timer.data = (unsigned long)sk; + rose->timer.function = &rose_timer_expiry; + rose->timer.expires = jiffies + rose->t1; + + add_timer(&rose->timer); +} + +void rose_start_t2timer(struct sock *sk) +{ + struct rose_sock *rose = rose_sk(sk); + + del_timer(&rose->timer); + + rose->timer.data = (unsigned long)sk; + rose->timer.function = &rose_timer_expiry; + rose->timer.expires = jiffies + rose->t2; + + add_timer(&rose->timer); +} + +void rose_start_t3timer(struct sock *sk) +{ + struct rose_sock *rose = rose_sk(sk); + + del_timer(&rose->timer); + + rose->timer.data = (unsigned long)sk; + rose->timer.function = &rose_timer_expiry; + rose->timer.expires = jiffies + rose->t3; + + add_timer(&rose->timer); +} + +void rose_start_hbtimer(struct sock *sk) +{ + struct rose_sock *rose = rose_sk(sk); + + del_timer(&rose->timer); + + rose->timer.data = (unsigned long)sk; + rose->timer.function = &rose_timer_expiry; + rose->timer.expires = jiffies + rose->hb; + + add_timer(&rose->timer); +} + +void rose_start_idletimer(struct sock *sk) +{ + struct rose_sock *rose = rose_sk(sk); + + del_timer(&rose->idletimer); + + if (rose->idle > 0) { + rose->idletimer.data = (unsigned long)sk; + rose->idletimer.function = &rose_idletimer_expiry; + rose->idletimer.expires = jiffies + rose->idle; + + add_timer(&rose->idletimer); + } +} + +void rose_stop_heartbeat(struct sock *sk) +{ + del_timer(&sk->sk_timer); +} + +void rose_stop_timer(struct sock *sk) +{ + del_timer(&rose_sk(sk)->timer); +} + +void rose_stop_idletimer(struct sock *sk) +{ + del_timer(&rose_sk(sk)->idletimer); +} + +static void rose_heartbeat_expiry(unsigned long param) +{ + struct sock *sk = (struct sock *)param; + struct rose_sock *rose = rose_sk(sk); + + bh_lock_sock(sk); + switch (rose->state) { + case ROSE_STATE_0: + /* Magic here: If we listen() and a new link dies before it + is accepted() it isn't 'dead' so doesn't get removed. */ + if (sock_flag(sk, SOCK_DESTROY) || + (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) { + bh_unlock_sock(sk); + rose_destroy_socket(sk); + return; + } + break; + + case ROSE_STATE_3: + /* + * Check for the state of the receive buffer. 
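/*
 * A condensed in-kernel sketch (not taken from this patch) of the timer
 * idiom the rose_start_*timer() helpers above all follow; only the expiry
 * interval differs between them.  It is written against the old
 * struct timer_list API (data/function/expires) used throughout this code
 * base; current kernels would use timer_setup()/from_timer() instead.
 */
#include <linux/jiffies.h>
#include <linux/timer.h>
#include <net/sock.h>

static void rose_start_sock_timer(struct sock *sk, struct timer_list *t,
                                  void (*expiry)(unsigned long),
                                  unsigned long interval)
{
        del_timer(t);                    /* harmless if the timer is not pending */

        t->data     = (unsigned long)sk; /* expiry handler recovers the socket   */
        t->function = expiry;
        t->expires  = jiffies + interval;

        add_timer(t);
}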
+ */ + if (atomic_read(&sk->sk_rmem_alloc) < (sk->sk_rcvbuf / 2) && + (rose->condition & ROSE_COND_OWN_RX_BUSY)) { + rose->condition &= ~ROSE_COND_OWN_RX_BUSY; + rose->condition &= ~ROSE_COND_ACK_PENDING; + rose->vl = rose->vr; + rose_write_internal(sk, ROSE_RR); + rose_stop_timer(sk); /* HB */ + break; + } + break; + } + + rose_start_heartbeat(sk); + bh_unlock_sock(sk); +} + +static void rose_timer_expiry(unsigned long param) +{ + struct sock *sk = (struct sock *)param; + struct rose_sock *rose = rose_sk(sk); + + bh_lock_sock(sk); + switch (rose->state) { + case ROSE_STATE_1: /* T1 */ + case ROSE_STATE_4: /* T2 */ + rose_write_internal(sk, ROSE_CLEAR_REQUEST); + rose->state = ROSE_STATE_2; + rose_start_t3timer(sk); + break; + + case ROSE_STATE_2: /* T3 */ + rose->neighbour->use--; + rose_disconnect(sk, ETIMEDOUT, -1, -1); + break; + + case ROSE_STATE_3: /* HB */ + if (rose->condition & ROSE_COND_ACK_PENDING) { + rose->condition &= ~ROSE_COND_ACK_PENDING; + rose_enquiry_response(sk); + } + break; + } + bh_unlock_sock(sk); +} + +static void rose_idletimer_expiry(unsigned long param) +{ + struct sock *sk = (struct sock *)param; + + bh_lock_sock(sk); + rose_clear_queues(sk); + + rose_write_internal(sk, ROSE_CLEAR_REQUEST); + rose_sk(sk)->state = ROSE_STATE_2; + + rose_start_t3timer(sk); + + sk->sk_state = TCP_CLOSE; + sk->sk_err = 0; + sk->sk_shutdown |= SEND_SHUTDOWN; + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sock_set_flag(sk, SOCK_DEAD); + } + bh_unlock_sock(sk); +} diff --git a/net/rose/sysctl_net_rose.c b/net/rose/sysctl_net_rose.c new file mode 100644 index 00000000..df6d9dac --- /dev/null +++ b/net/rose/sysctl_net_rose.c @@ -0,0 +1,135 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Copyright (C) 1996 Mike Shaver (shaver@zeroknowledge.com) + */ +#include +#include +#include +#include +#include + +static int min_timer[] = {1 * HZ}; +static int max_timer[] = {300 * HZ}; +static int min_idle[] = {0 * HZ}; +static int max_idle[] = {65535 * HZ}; +static int min_route[1], max_route[] = {1}; +static int min_ftimer[] = {60 * HZ}; +static int max_ftimer[] = {600 * HZ}; +static int min_maxvcs[] = {1}, max_maxvcs[] = {254}; +static int min_window[] = {1}, max_window[] = {7}; + +static struct ctl_table_header *rose_table_header; + +static ctl_table rose_table[] = { + { + .procname = "restart_request_timeout", + .data = &sysctl_rose_restart_request_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_timer, + .extra2 = &max_timer + }, + { + .procname = "call_request_timeout", + .data = &sysctl_rose_call_request_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_timer, + .extra2 = &max_timer + }, + { + .procname = "reset_request_timeout", + .data = &sysctl_rose_reset_request_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_timer, + .extra2 = &max_timer + }, + { + .procname = "clear_request_timeout", + .data = &sysctl_rose_clear_request_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_timer, + .extra2 = &max_timer + }, + { + .procname = "no_activity_timeout", + .data = &sysctl_rose_no_activity_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_idle, + .extra2 = &max_idle + }, + { + .procname = "acknowledge_hold_back_timeout", + .data = &sysctl_rose_ack_hold_back_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_timer, + .extra2 = &max_timer + }, + { + .procname = "routing_control", + .data = &sysctl_rose_routing_control, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_route, + .extra2 = &max_route + }, + { + .procname = "link_fail_timeout", + .data = &sysctl_rose_link_fail_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_ftimer, + .extra2 = &max_ftimer + }, + { + .procname = "maximum_virtual_circuits", + .data = &sysctl_rose_maximum_vcs, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_maxvcs, + .extra2 = &max_maxvcs + }, + { + .procname = "window_size", + .data = &sysctl_rose_window_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_window, + .extra2 = &max_window + }, + { } +}; + +static struct ctl_path rose_path[] = { + { .procname = "net", }, + { .procname = "rose", }, + { } +}; + +void __init rose_register_sysctl(void) +{ + rose_table_header = register_sysctl_paths(rose_path, rose_table); +} + +void rose_unregister_sysctl(void) +{ + unregister_sysctl_table(rose_table_header); +} diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig new file mode 100644 index 00000000..0d3103c4 --- /dev/null +++ b/net/rxrpc/Kconfig @@ -0,0 +1,44 @@ +# +# RxRPC session sockets +# + +config AF_RXRPC + tristate "RxRPC session sockets" + depends on INET && EXPERIMENTAL + select CRYPTO + select KEYS + help + Say Y or M here to include support for RxRPC session sockets (just + the transport part, not the presentation part: (un)marshalling is + left to the 
application). + + These are used for AFS kernel filesystem and userspace utilities. + + This module at the moment only supports client operations and is + currently incomplete. + + See Documentation/networking/rxrpc.txt. + + +config AF_RXRPC_DEBUG + bool "RxRPC dynamic debugging" + depends on AF_RXRPC + help + Say Y here to make runtime controllable debugging messages appear. + + See Documentation/networking/rxrpc.txt. + + +config RXKAD + tristate "RxRPC Kerberos security" + depends on AF_RXRPC + select CRYPTO + select CRYPTO_MANAGER + select CRYPTO_BLKCIPHER + select CRYPTO_PCBC + select CRYPTO_FCRYPT + help + Provide kerberos 4 and AFS kaserver security handling for AF_RXRPC + through the use of the key retention service. + + See Documentation/networking/rxrpc.txt. diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile new file mode 100644 index 00000000..d1c3429b --- /dev/null +++ b/net/rxrpc/Makefile @@ -0,0 +1,29 @@ +# +# Makefile for Linux kernel RxRPC +# + +af-rxrpc-y := \ + af_rxrpc.o \ + ar-accept.o \ + ar-ack.o \ + ar-call.o \ + ar-connection.o \ + ar-connevent.o \ + ar-error.o \ + ar-input.o \ + ar-key.o \ + ar-local.o \ + ar-output.o \ + ar-peer.o \ + ar-recvmsg.o \ + ar-security.o \ + ar-skbuff.o \ + ar-transport.o + +ifeq ($(CONFIG_PROC_FS),y) +af-rxrpc-y += ar-proc.o +endif + +obj-$(CONFIG_AF_RXRPC) += af-rxrpc.o + +obj-$(CONFIG_RXKAD) += rxkad.o diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c new file mode 100644 index 00000000..74c064c0 --- /dev/null +++ b/net/rxrpc/af_rxrpc.c @@ -0,0 +1,889 @@ +/* AF_RXRPC implementation + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
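/*
 * A user-space sketch (not taken from this patch) of poking the ROSE sysctl
 * table registered in sysctl_net_rose.c above.  The file name follows the
 * ctl_path ("net", "rose") plus the procname entries, e.g.
 * /proc/sys/net/rose/window_size, and writes need root; the value written
 * must respect the min/max bounds wired into the table (1..7 for the
 * window size).
 */
#include <stdio.h>

int main(void)
{
        const char *path = "/proc/sys/net/rose/window_size";
        FILE *f = fopen(path, "r+");
        int val;

        if (!f) {
                perror(path);
                return 1;
        }
        if (fscanf(f, "%d", &val) == 1)
                printf("current window size: %d\n", val);

        rewind(f);
        fprintf(f, "%d\n", 3);          /* any value in 1..7 is accepted */
        fclose(f);
        return 0;
}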
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +MODULE_DESCRIPTION("RxRPC network protocol"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_RXRPC); + +unsigned rxrpc_debug; // = RXRPC_DEBUG_KPROTO; +module_param_named(debug, rxrpc_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(debug, "RxRPC debugging mask"); + +static int sysctl_rxrpc_max_qlen __read_mostly = 10; + +static struct proto rxrpc_proto; +static const struct proto_ops rxrpc_rpc_ops; + +/* local epoch for detecting local-end reset */ +__be32 rxrpc_epoch; + +/* current debugging ID */ +atomic_t rxrpc_debug_id; + +/* count of skbs currently in use */ +atomic_t rxrpc_n_skbs; + +struct workqueue_struct *rxrpc_workqueue; + +static void rxrpc_sock_destructor(struct sock *); + +/* + * see if an RxRPC socket is currently writable + */ +static inline int rxrpc_writable(struct sock *sk) +{ + return atomic_read(&sk->sk_wmem_alloc) < (size_t) sk->sk_sndbuf; +} + +/* + * wait for write bufferage to become available + */ +static void rxrpc_write_space(struct sock *sk) +{ + _enter("%p", sk); + rcu_read_lock(); + if (rxrpc_writable(sk)) { + struct socket_wq *wq = rcu_dereference(sk->sk_wq); + + if (wq_has_sleeper(wq)) + wake_up_interruptible(&wq->wait); + sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + } + rcu_read_unlock(); +} + +/* + * validate an RxRPC address + */ +static int rxrpc_validate_address(struct rxrpc_sock *rx, + struct sockaddr_rxrpc *srx, + int len) +{ + if (len < sizeof(struct sockaddr_rxrpc)) + return -EINVAL; + + if (srx->srx_family != AF_RXRPC) + return -EAFNOSUPPORT; + + if (srx->transport_type != SOCK_DGRAM) + return -ESOCKTNOSUPPORT; + + len -= offsetof(struct sockaddr_rxrpc, transport); + if (srx->transport_len < sizeof(sa_family_t) || + srx->transport_len > len) + return -EINVAL; + + if (srx->transport.family != rx->proto) + return -EAFNOSUPPORT; + + switch (srx->transport.family) { + case AF_INET: + _debug("INET: %x @ %pI4", + ntohs(srx->transport.sin.sin_port), + &srx->transport.sin.sin_addr); + if (srx->transport_len > 8) + memset((void *)&srx->transport + 8, 0, + srx->transport_len - 8); + break; + + case AF_INET6: + default: + return -EAFNOSUPPORT; + } + + return 0; +} + +/* + * bind a local address to an RxRPC socket + */ +static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) +{ + struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) saddr; + struct sock *sk = sock->sk; + struct rxrpc_local *local; + struct rxrpc_sock *rx = rxrpc_sk(sk), *prx; + __be16 service_id; + int ret; + + _enter("%p,%p,%d", rx, saddr, len); + + ret = rxrpc_validate_address(rx, srx, len); + if (ret < 0) + goto error; + + lock_sock(&rx->sk); + + if (rx->sk.sk_state != RXRPC_UNCONNECTED) { + ret = -EINVAL; + goto error_unlock; + } + + memcpy(&rx->srx, srx, sizeof(rx->srx)); + + /* find a local transport endpoint if we don't have one already */ + local = rxrpc_lookup_local(&rx->srx); + if (IS_ERR(local)) { + ret = PTR_ERR(local); + goto error_unlock; + } + + rx->local = local; + if (srx->srx_service) { + service_id = htons(srx->srx_service); + write_lock_bh(&local->services_lock); + list_for_each_entry(prx, &local->services, listen_link) { + if (prx->service_id == service_id) + goto service_in_use; + } + + rx->service_id = service_id; + list_add_tail(&rx->listen_link, &local->services); + write_unlock_bh(&local->services_lock); + + rx->sk.sk_state = RXRPC_SERVER_BOUND; + } else { + 
rx->sk.sk_state = RXRPC_CLIENT_BOUND; + } + + release_sock(&rx->sk); + _leave(" = 0"); + return 0; + +service_in_use: + ret = -EADDRINUSE; + write_unlock_bh(&local->services_lock); +error_unlock: + release_sock(&rx->sk); +error: + _leave(" = %d", ret); + return ret; +} + +/* + * set the number of pending calls permitted on a listening socket + */ +static int rxrpc_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + struct rxrpc_sock *rx = rxrpc_sk(sk); + int ret; + + _enter("%p,%d", rx, backlog); + + lock_sock(&rx->sk); + + switch (rx->sk.sk_state) { + case RXRPC_UNCONNECTED: + ret = -EADDRNOTAVAIL; + break; + case RXRPC_CLIENT_BOUND: + case RXRPC_CLIENT_CONNECTED: + default: + ret = -EBUSY; + break; + case RXRPC_SERVER_BOUND: + ASSERT(rx->local != NULL); + sk->sk_max_ack_backlog = backlog; + rx->sk.sk_state = RXRPC_SERVER_LISTENING; + ret = 0; + break; + } + + release_sock(&rx->sk); + _leave(" = %d", ret); + return ret; +} + +/* + * find a transport by address + */ +static struct rxrpc_transport *rxrpc_name_to_transport(struct socket *sock, + struct sockaddr *addr, + int addr_len, int flags, + gfp_t gfp) +{ + struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) addr; + struct rxrpc_transport *trans; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct rxrpc_peer *peer; + + _enter("%p,%p,%d,%d", rx, addr, addr_len, flags); + + ASSERT(rx->local != NULL); + ASSERT(rx->sk.sk_state > RXRPC_UNCONNECTED); + + if (rx->srx.transport_type != srx->transport_type) + return ERR_PTR(-ESOCKTNOSUPPORT); + if (rx->srx.transport.family != srx->transport.family) + return ERR_PTR(-EAFNOSUPPORT); + + /* find a remote transport endpoint from the local one */ + peer = rxrpc_get_peer(srx, gfp); + if (IS_ERR(peer)) + return ERR_CAST(peer); + + /* find a transport */ + trans = rxrpc_get_transport(rx->local, peer, gfp); + rxrpc_put_peer(peer); + _leave(" = %p", trans); + return trans; +} + +/** + * rxrpc_kernel_begin_call - Allow a kernel service to begin a call + * @sock: The socket on which to make the call + * @srx: The address of the peer to contact (defaults to socket setting) + * @key: The security context to use (defaults to socket setting) + * @user_call_ID: The ID to use + * + * Allow a kernel service to begin a call on the nominated socket. This just + * sets up all the internal tracking structures and allocates connection and + * call IDs as appropriate. The call to be used is returned. + * + * The default socket destination address and security may be overridden by + * supplying @srx and @key. 
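/*
 * A user-space sketch (not taken from this patch) of binding an AF_RXRPC
 * server socket, following the checks in rxrpc_validate_address(),
 * rxrpc_bind() and rxrpc_listen() above: the transport must be an
 * AF_INET/SOCK_DGRAM address, and a non-zero srx_service turns the socket
 * into a server bound to that service ID.  The struct layout is quoted from
 * <linux/rxrpc.h> as recalled, and the service/port numbers are invented
 * for illustration.
 */
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/rxrpc.h>

int main(void)
{
        struct sockaddr_rxrpc srx;
        int fd = socket(AF_RXRPC, SOCK_DGRAM, PF_INET);  /* UDP transport only */

        if (fd < 0) {
                perror("socket(AF_RXRPC)");
                return 1;
        }

        memset(&srx, 0, sizeof(srx));
        srx.srx_family                    = AF_RXRPC;
        srx.srx_service                   = 1234;        /* non-zero => server socket */
        srx.transport_type                = SOCK_DGRAM;
        srx.transport_len                 = sizeof(srx.transport.sin);
        srx.transport.sin.sin_family      = AF_INET;
        srx.transport.sin.sin_port        = htons(7001);
        srx.transport.sin.sin_addr.s_addr = htonl(INADDR_ANY);

        if (bind(fd, (struct sockaddr *)&srx, sizeof(srx)) < 0) {
                perror("bind");
                return 1;
        }
        if (listen(fd, 10) < 0) {                        /* moves to RXRPC_SERVER_LISTENING */
                perror("listen");
                return 1;
        }
        return 0;
}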
+ */ +struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, + struct sockaddr_rxrpc *srx, + struct key *key, + unsigned long user_call_ID, + gfp_t gfp) +{ + struct rxrpc_conn_bundle *bundle; + struct rxrpc_transport *trans; + struct rxrpc_call *call; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + __be16 service_id; + + _enter(",,%x,%lx", key_serial(key), user_call_ID); + + lock_sock(&rx->sk); + + if (srx) { + trans = rxrpc_name_to_transport(sock, (struct sockaddr *) srx, + sizeof(*srx), 0, gfp); + if (IS_ERR(trans)) { + call = ERR_CAST(trans); + trans = NULL; + goto out_notrans; + } + } else { + trans = rx->trans; + if (!trans) { + call = ERR_PTR(-ENOTCONN); + goto out_notrans; + } + atomic_inc(&trans->usage); + } + + service_id = rx->service_id; + if (srx) + service_id = htons(srx->srx_service); + + if (!key) + key = rx->key; + if (key && !key->payload.data) + key = NULL; /* a no-security key */ + + bundle = rxrpc_get_bundle(rx, trans, key, service_id, gfp); + if (IS_ERR(bundle)) { + call = ERR_CAST(bundle); + goto out; + } + + call = rxrpc_get_client_call(rx, trans, bundle, user_call_ID, true, + gfp); + rxrpc_put_bundle(trans, bundle); +out: + rxrpc_put_transport(trans); +out_notrans: + release_sock(&rx->sk); + _leave(" = %p", call); + return call; +} + +EXPORT_SYMBOL(rxrpc_kernel_begin_call); + +/** + * rxrpc_kernel_end_call - Allow a kernel service to end a call it was using + * @call: The call to end + * + * Allow a kernel service to end a call it was using. The call must be + * complete before this is called (the call should be aborted if necessary). + */ +void rxrpc_kernel_end_call(struct rxrpc_call *call) +{ + _enter("%d{%d}", call->debug_id, atomic_read(&call->usage)); + rxrpc_remove_user_ID(call->socket, call); + rxrpc_put_call(call); +} + +EXPORT_SYMBOL(rxrpc_kernel_end_call); + +/** + * rxrpc_kernel_intercept_rx_messages - Intercept received RxRPC messages + * @sock: The socket to intercept received messages on + * @interceptor: The function to pass the messages to + * + * Allow a kernel service to intercept messages heading for the Rx queue on an + * RxRPC socket. They get passed to the specified function instead. + * @interceptor should free the socket buffers it is given. @interceptor is + * called with the socket receive queue spinlock held and softirqs disabled - + * this ensures that the messages will be delivered in the right order. 
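/*
 * A condensed in-kernel sketch (not taken from this patch) of how a kernel
 * service such as AFS would drive the two helpers above: start a call
 * against the socket's default destination, do the send/receive work, and
 * then always end the call.  The rxrpc_socket pointer, the user_call_ID
 * convention and the GFP flags are assumptions for illustration; sending
 * and receiving on the call use other interfaces not shown here.
 */
#include <linux/err.h>
#include <linux/gfp.h>
#include <net/af_rxrpc.h>

int example_issue_call(struct socket *rxrpc_socket, unsigned long call_id)
{
        struct rxrpc_call *call;

        call = rxrpc_kernel_begin_call(rxrpc_socket, NULL /* default dest */,
                                       NULL /* socket's key */, call_id,
                                       GFP_NOFS);
        if (IS_ERR(call))
                return PTR_ERR(call);

        /* ... queue the request data and collect the reply here ... */

        rxrpc_kernel_end_call(call);
        return 0;
}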
+ */ +void rxrpc_kernel_intercept_rx_messages(struct socket *sock, + rxrpc_interceptor_t interceptor) +{ + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + + _enter(""); + rx->interceptor = interceptor; +} + +EXPORT_SYMBOL(rxrpc_kernel_intercept_rx_messages); + +/* + * connect an RxRPC socket + * - this just targets it at a specific destination; no actual connection + * negotiation takes place + */ +static int rxrpc_connect(struct socket *sock, struct sockaddr *addr, + int addr_len, int flags) +{ + struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) addr; + struct sock *sk = sock->sk; + struct rxrpc_transport *trans; + struct rxrpc_local *local; + struct rxrpc_sock *rx = rxrpc_sk(sk); + int ret; + + _enter("%p,%p,%d,%d", rx, addr, addr_len, flags); + + ret = rxrpc_validate_address(rx, srx, addr_len); + if (ret < 0) { + _leave(" = %d [bad addr]", ret); + return ret; + } + + lock_sock(&rx->sk); + + switch (rx->sk.sk_state) { + case RXRPC_UNCONNECTED: + /* find a local transport endpoint if we don't have one already */ + ASSERTCMP(rx->local, ==, NULL); + rx->srx.srx_family = AF_RXRPC; + rx->srx.srx_service = 0; + rx->srx.transport_type = srx->transport_type; + rx->srx.transport_len = sizeof(sa_family_t); + rx->srx.transport.family = srx->transport.family; + local = rxrpc_lookup_local(&rx->srx); + if (IS_ERR(local)) { + release_sock(&rx->sk); + return PTR_ERR(local); + } + rx->local = local; + rx->sk.sk_state = RXRPC_CLIENT_BOUND; + case RXRPC_CLIENT_BOUND: + break; + case RXRPC_CLIENT_CONNECTED: + release_sock(&rx->sk); + return -EISCONN; + default: + release_sock(&rx->sk); + return -EBUSY; /* server sockets can't connect as well */ + } + + trans = rxrpc_name_to_transport(sock, addr, addr_len, flags, + GFP_KERNEL); + if (IS_ERR(trans)) { + release_sock(&rx->sk); + _leave(" = %ld", PTR_ERR(trans)); + return PTR_ERR(trans); + } + + rx->trans = trans; + rx->service_id = htons(srx->srx_service); + rx->sk.sk_state = RXRPC_CLIENT_CONNECTED; + + release_sock(&rx->sk); + return 0; +} + +/* + * send a message through an RxRPC socket + * - in a client this does a number of things: + * - finds/sets up a connection for the security specified (if any) + * - initiates a call (ID in control data) + * - ends the request phase of a call (if MSG_MORE is not set) + * - sends a call data packet + * - may send an abort (abort code in control data) + */ +static int rxrpc_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t len) +{ + struct rxrpc_transport *trans; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + int ret; + + _enter(",{%d},,%zu", rx->sk.sk_state, len); + + if (m->msg_flags & MSG_OOB) + return -EOPNOTSUPP; + + if (m->msg_name) { + ret = rxrpc_validate_address(rx, m->msg_name, m->msg_namelen); + if (ret < 0) { + _leave(" = %d [bad addr]", ret); + return ret; + } + } + + trans = NULL; + lock_sock(&rx->sk); + + if (m->msg_name) { + ret = -EISCONN; + trans = rxrpc_name_to_transport(sock, m->msg_name, + m->msg_namelen, 0, GFP_KERNEL); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; + } + } else { + trans = rx->trans; + if (trans) + atomic_inc(&trans->usage); + } + + switch (rx->sk.sk_state) { + case RXRPC_SERVER_LISTENING: + if (!m->msg_name) { + ret = rxrpc_server_sendmsg(iocb, rx, m, len); + break; + } + case RXRPC_SERVER_BOUND: + case RXRPC_CLIENT_BOUND: + if (!m->msg_name) { + ret = -ENOTCONN; + break; + } + case RXRPC_CLIENT_CONNECTED: + ret = rxrpc_client_sendmsg(iocb, rx, trans, m, len); + break; + default: + ret = -ENOTCONN; + break; + } + +out: + 
release_sock(&rx->sk); + if (trans) + rxrpc_put_transport(trans); + _leave(" = %d", ret); + return ret; +} + +/* + * set RxRPC socket options + */ +static int rxrpc_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + unsigned min_sec_level; + int ret; + + _enter(",%d,%d,,%d", level, optname, optlen); + + lock_sock(&rx->sk); + ret = -EOPNOTSUPP; + + if (level == SOL_RXRPC) { + switch (optname) { + case RXRPC_EXCLUSIVE_CONNECTION: + ret = -EINVAL; + if (optlen != 0) + goto error; + ret = -EISCONN; + if (rx->sk.sk_state != RXRPC_UNCONNECTED) + goto error; + set_bit(RXRPC_SOCK_EXCLUSIVE_CONN, &rx->flags); + goto success; + + case RXRPC_SECURITY_KEY: + ret = -EINVAL; + if (rx->key) + goto error; + ret = -EISCONN; + if (rx->sk.sk_state != RXRPC_UNCONNECTED) + goto error; + ret = rxrpc_request_key(rx, optval, optlen); + goto error; + + case RXRPC_SECURITY_KEYRING: + ret = -EINVAL; + if (rx->key) + goto error; + ret = -EISCONN; + if (rx->sk.sk_state != RXRPC_UNCONNECTED) + goto error; + ret = rxrpc_server_keyring(rx, optval, optlen); + goto error; + + case RXRPC_MIN_SECURITY_LEVEL: + ret = -EINVAL; + if (optlen != sizeof(unsigned)) + goto error; + ret = -EISCONN; + if (rx->sk.sk_state != RXRPC_UNCONNECTED) + goto error; + ret = get_user(min_sec_level, + (unsigned __user *) optval); + if (ret < 0) + goto error; + ret = -EINVAL; + if (min_sec_level > RXRPC_SECURITY_MAX) + goto error; + rx->min_sec_level = min_sec_level; + goto success; + + default: + break; + } + } + +success: + ret = 0; +error: + release_sock(&rx->sk); + return ret; +} + +/* + * permit an RxRPC socket to be polled + */ +static unsigned int rxrpc_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + unsigned int mask; + struct sock *sk = sock->sk; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; + + /* the socket is readable if there are any messages waiting on the Rx + * queue */ + if (!skb_queue_empty(&sk->sk_receive_queue)) + mask |= POLLIN | POLLRDNORM; + + /* the socket is writable if there is space to add new data to the + * socket; there is no guarantee that any particular call in progress + * on the socket may have space in the Tx ACK window */ + if (rxrpc_writable(sk)) + mask |= POLLOUT | POLLWRNORM; + + return mask; +} + +/* + * create an RxRPC socket + */ +static int rxrpc_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct rxrpc_sock *rx; + struct sock *sk; + + _enter("%p,%d", sock, protocol); + + if (!net_eq(net, &init_net)) + return -EAFNOSUPPORT; + + /* we support transport protocol UDP only */ + if (protocol != PF_INET) + return -EPROTONOSUPPORT; + + if (sock->type != SOCK_DGRAM) + return -ESOCKTNOSUPPORT; + + sock->ops = &rxrpc_rpc_ops; + sock->state = SS_UNCONNECTED; + + sk = sk_alloc(net, PF_RXRPC, GFP_KERNEL, &rxrpc_proto); + if (!sk) + return -ENOMEM; + + sock_init_data(sock, sk); + sk->sk_state = RXRPC_UNCONNECTED; + sk->sk_write_space = rxrpc_write_space; + sk->sk_max_ack_backlog = sysctl_rxrpc_max_qlen; + sk->sk_destruct = rxrpc_sock_destructor; + + rx = rxrpc_sk(sk); + rx->proto = protocol; + rx->calls = RB_ROOT; + + INIT_LIST_HEAD(&rx->listen_link); + INIT_LIST_HEAD(&rx->secureq); + INIT_LIST_HEAD(&rx->acceptq); + rwlock_init(&rx->call_lock); + memset(&rx->srx, 0, sizeof(rx->srx)); + + _leave(" = 0 [%p]", rx); + return 0; +} + +/* + * RxRPC socket destructor + */ +static void rxrpc_sock_destructor(struct sock *sk) +{ + _enter("%p", sk); + + 
rxrpc_purge_queue(&sk->sk_receive_queue); + + WARN_ON(atomic_read(&sk->sk_wmem_alloc)); + WARN_ON(!sk_unhashed(sk)); + WARN_ON(sk->sk_socket); + + if (!sock_flag(sk, SOCK_DEAD)) { + printk("Attempt to release alive rxrpc socket: %p\n", sk); + return; + } +} + +/* + * release an RxRPC socket + */ +static int rxrpc_release_sock(struct sock *sk) +{ + struct rxrpc_sock *rx = rxrpc_sk(sk); + + _enter("%p{%d,%d}", sk, sk->sk_state, atomic_read(&sk->sk_refcnt)); + + /* declare the socket closed for business */ + sock_orphan(sk); + sk->sk_shutdown = SHUTDOWN_MASK; + + spin_lock_bh(&sk->sk_receive_queue.lock); + sk->sk_state = RXRPC_CLOSE; + spin_unlock_bh(&sk->sk_receive_queue.lock); + + ASSERTCMP(rx->listen_link.next, !=, LIST_POISON1); + + if (!list_empty(&rx->listen_link)) { + write_lock_bh(&rx->local->services_lock); + list_del(&rx->listen_link); + write_unlock_bh(&rx->local->services_lock); + } + + /* try to flush out this socket */ + rxrpc_release_calls_on_socket(rx); + flush_workqueue(rxrpc_workqueue); + rxrpc_purge_queue(&sk->sk_receive_queue); + + if (rx->conn) { + rxrpc_put_connection(rx->conn); + rx->conn = NULL; + } + + if (rx->bundle) { + rxrpc_put_bundle(rx->trans, rx->bundle); + rx->bundle = NULL; + } + if (rx->trans) { + rxrpc_put_transport(rx->trans); + rx->trans = NULL; + } + if (rx->local) { + rxrpc_put_local(rx->local); + rx->local = NULL; + } + + key_put(rx->key); + rx->key = NULL; + key_put(rx->securities); + rx->securities = NULL; + sock_put(sk); + + _leave(" = 0"); + return 0; +} + +/* + * release an RxRPC BSD socket on close() or equivalent + */ +static int rxrpc_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + _enter("%p{%p}", sock, sk); + + if (!sk) + return 0; + + sock->sk = NULL; + + return rxrpc_release_sock(sk); +} + +/* + * RxRPC network protocol + */ +static const struct proto_ops rxrpc_rpc_ops = { + .family = PF_UNIX, + .owner = THIS_MODULE, + .release = rxrpc_release, + .bind = rxrpc_bind, + .connect = rxrpc_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = rxrpc_poll, + .ioctl = sock_no_ioctl, + .listen = rxrpc_listen, + .shutdown = sock_no_shutdown, + .setsockopt = rxrpc_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = rxrpc_sendmsg, + .recvmsg = rxrpc_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct proto rxrpc_proto = { + .name = "RXRPC", + .owner = THIS_MODULE, + .obj_size = sizeof(struct rxrpc_sock), + .max_header = sizeof(struct rxrpc_header), +}; + +static const struct net_proto_family rxrpc_family_ops = { + .family = PF_RXRPC, + .create = rxrpc_create, + .owner = THIS_MODULE, +}; + +/* + * initialise and register the RxRPC protocol + */ +static int __init af_rxrpc_init(void) +{ + struct sk_buff *dummy_skb; + int ret = -1; + + BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > sizeof(dummy_skb->cb)); + + rxrpc_epoch = htonl(get_seconds()); + + ret = -ENOMEM; + rxrpc_call_jar = kmem_cache_create( + "rxrpc_call_jar", sizeof(struct rxrpc_call), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!rxrpc_call_jar) { + printk(KERN_NOTICE "RxRPC: Failed to allocate call jar\n"); + goto error_call_jar; + } + + rxrpc_workqueue = alloc_workqueue("krxrpcd", 0, 1); + if (!rxrpc_workqueue) { + printk(KERN_NOTICE "RxRPC: Failed to allocate work queue\n"); + goto error_work_queue; + } + + ret = proto_register(&rxrpc_proto, 1); + if (ret < 0) { + printk(KERN_CRIT "RxRPC: Cannot register protocol\n"); + goto error_proto; + } + + ret = 
sock_register(&rxrpc_family_ops); + if (ret < 0) { + printk(KERN_CRIT "RxRPC: Cannot register socket family\n"); + goto error_sock; + } + + ret = register_key_type(&key_type_rxrpc); + if (ret < 0) { + printk(KERN_CRIT "RxRPC: Cannot register client key type\n"); + goto error_key_type; + } + + ret = register_key_type(&key_type_rxrpc_s); + if (ret < 0) { + printk(KERN_CRIT "RxRPC: Cannot register server key type\n"); + goto error_key_type_s; + } + +#ifdef CONFIG_PROC_FS + proc_net_fops_create(&init_net, "rxrpc_calls", 0, &rxrpc_call_seq_fops); + proc_net_fops_create(&init_net, "rxrpc_conns", 0, &rxrpc_connection_seq_fops); +#endif + return 0; + +error_key_type_s: + unregister_key_type(&key_type_rxrpc); +error_key_type: + sock_unregister(PF_RXRPC); +error_sock: + proto_unregister(&rxrpc_proto); +error_proto: + destroy_workqueue(rxrpc_workqueue); +error_work_queue: + kmem_cache_destroy(rxrpc_call_jar); +error_call_jar: + return ret; +} + +/* + * unregister the RxRPC protocol + */ +static void __exit af_rxrpc_exit(void) +{ + _enter(""); + unregister_key_type(&key_type_rxrpc_s); + unregister_key_type(&key_type_rxrpc); + sock_unregister(PF_RXRPC); + proto_unregister(&rxrpc_proto); + rxrpc_destroy_all_calls(); + rxrpc_destroy_all_connections(); + rxrpc_destroy_all_transports(); + rxrpc_destroy_all_peers(); + rxrpc_destroy_all_locals(); + + ASSERTCMP(atomic_read(&rxrpc_n_skbs), ==, 0); + + _debug("flush scheduled work"); + flush_workqueue(rxrpc_workqueue); + proc_net_remove(&init_net, "rxrpc_conns"); + proc_net_remove(&init_net, "rxrpc_calls"); + destroy_workqueue(rxrpc_workqueue); + kmem_cache_destroy(rxrpc_call_jar); + _leave(""); +} + +module_init(af_rxrpc_init); +module_exit(af_rxrpc_exit); diff --git a/net/rxrpc/ar-accept.c b/net/rxrpc/ar-accept.c new file mode 100644 index 00000000..6d79310f --- /dev/null +++ b/net/rxrpc/ar-accept.c @@ -0,0 +1,510 @@ +/* incoming call handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
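/*
 * A user-space sketch (not taken from this patch) of the socket-option path
 * handled by rxrpc_setsockopt() earlier in af_rxrpc.c: the minimum security
 * level must be supplied as an unsigned int while the socket is still
 * unconnected.  SOL_RXRPC and RXRPC_MIN_SECURITY_LEVEL come from
 * <linux/rxrpc.h>; the level value 2 is assumed to mean "encrypt" and is
 * only illustrative.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <linux/rxrpc.h>

int set_min_security(int fd)
{
        unsigned int level = 2;         /* must be <= RXRPC_SECURITY_MAX */

        if (setsockopt(fd, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL,
                       &level, sizeof(level)) < 0) {
                perror("RXRPC_MIN_SECURITY_LEVEL");
                return -1;
        }
        return 0;
}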
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * generate a connection-level abort + */ +static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, + struct rxrpc_header *hdr) +{ + struct msghdr msg; + struct kvec iov[1]; + size_t len; + int ret; + + _enter("%d,,", local->debug_id); + + msg.msg_name = &srx->transport.sin; + msg.msg_namelen = sizeof(srx->transport.sin); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + hdr->seq = 0; + hdr->type = RXRPC_PACKET_TYPE_BUSY; + hdr->flags = 0; + hdr->userStatus = 0; + hdr->_rsvd = 0; + + iov[0].iov_base = hdr; + iov[0].iov_len = sizeof(*hdr); + + len = iov[0].iov_len; + + hdr->serial = htonl(1); + _proto("Tx BUSY %%%u", ntohl(hdr->serial)); + + ret = kernel_sendmsg(local->socket, &msg, iov, 1, len); + if (ret < 0) { + _leave(" = -EAGAIN [sendmsg failed: %d]", ret); + return -EAGAIN; + } + + _leave(" = 0"); + return 0; +} + +/* + * accept an incoming call that needs peer, transport and/or connection setting + * up + */ +static int rxrpc_accept_incoming_call(struct rxrpc_local *local, + struct rxrpc_sock *rx, + struct sk_buff *skb, + struct sockaddr_rxrpc *srx) +{ + struct rxrpc_connection *conn; + struct rxrpc_transport *trans; + struct rxrpc_skb_priv *sp, *nsp; + struct rxrpc_peer *peer; + struct rxrpc_call *call; + struct sk_buff *notification; + int ret; + + _enter(""); + + sp = rxrpc_skb(skb); + + /* get a notification message to send to the server app */ + notification = alloc_skb(0, GFP_NOFS); + if (!notification) { + _debug("no memory"); + ret = -ENOMEM; + goto error_nofree; + } + rxrpc_new_skb(notification); + notification->mark = RXRPC_SKB_MARK_NEW_CALL; + + peer = rxrpc_get_peer(srx, GFP_NOIO); + if (IS_ERR(peer)) { + _debug("no peer"); + ret = -EBUSY; + goto error; + } + + trans = rxrpc_get_transport(local, peer, GFP_NOIO); + rxrpc_put_peer(peer); + if (IS_ERR(trans)) { + _debug("no trans"); + ret = -EBUSY; + goto error; + } + + conn = rxrpc_incoming_connection(trans, &sp->hdr, GFP_NOIO); + rxrpc_put_transport(trans); + if (IS_ERR(conn)) { + _debug("no conn"); + ret = PTR_ERR(conn); + goto error; + } + + call = rxrpc_incoming_call(rx, conn, &sp->hdr, GFP_NOIO); + rxrpc_put_connection(conn); + if (IS_ERR(call)) { + _debug("no call"); + ret = PTR_ERR(call); + goto error; + } + + /* attach the call to the socket */ + read_lock_bh(&local->services_lock); + if (rx->sk.sk_state == RXRPC_CLOSE) + goto invalid_service; + + write_lock(&rx->call_lock); + if (!test_and_set_bit(RXRPC_CALL_INIT_ACCEPT, &call->flags)) { + rxrpc_get_call(call); + + spin_lock(&call->conn->state_lock); + if (sp->hdr.securityIndex > 0 && + call->conn->state == RXRPC_CONN_SERVER_UNSECURED) { + _debug("await conn sec"); + list_add_tail(&call->accept_link, &rx->secureq); + call->conn->state = RXRPC_CONN_SERVER_CHALLENGING; + atomic_inc(&call->conn->usage); + set_bit(RXRPC_CONN_CHALLENGE, &call->conn->events); + rxrpc_queue_conn(call->conn); + } else { + _debug("conn ready"); + call->state = RXRPC_CALL_SERVER_ACCEPTING; + list_add_tail(&call->accept_link, &rx->acceptq); + rxrpc_get_call(call); + nsp = rxrpc_skb(notification); + nsp->call = call; + + ASSERTCMP(atomic_read(&call->usage), >=, 3); + + _debug("notify"); + spin_lock(&call->lock); + ret = rxrpc_queue_rcv_skb(call, notification, true, + false); + spin_unlock(&call->lock); + notification = NULL; + BUG_ON(ret < 0); + } + spin_unlock(&call->conn->state_lock); + + 
_debug("queued"); + } + write_unlock(&rx->call_lock); + + _debug("process"); + rxrpc_fast_process_packet(call, skb); + + _debug("done"); + read_unlock_bh(&local->services_lock); + rxrpc_free_skb(notification); + rxrpc_put_call(call); + _leave(" = 0"); + return 0; + +invalid_service: + _debug("invalid"); + read_unlock_bh(&local->services_lock); + + read_lock_bh(&call->state_lock); + if (!test_bit(RXRPC_CALL_RELEASE, &call->flags) && + !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) { + rxrpc_get_call(call); + rxrpc_queue_call(call); + } + read_unlock_bh(&call->state_lock); + rxrpc_put_call(call); + ret = -ECONNREFUSED; +error: + rxrpc_free_skb(notification); +error_nofree: + _leave(" = %d", ret); + return ret; +} + +/* + * accept incoming calls that need peer, transport and/or connection setting up + * - the packets we get are all incoming client DATA packets that have seq == 1 + */ +void rxrpc_accept_incoming_calls(struct work_struct *work) +{ + struct rxrpc_local *local = + container_of(work, struct rxrpc_local, acceptor); + struct rxrpc_skb_priv *sp; + struct sockaddr_rxrpc srx; + struct rxrpc_sock *rx; + struct sk_buff *skb; + __be16 service_id; + int ret; + + _enter("%d", local->debug_id); + + read_lock_bh(&rxrpc_local_lock); + if (atomic_read(&local->usage) > 0) + rxrpc_get_local(local); + else + local = NULL; + read_unlock_bh(&rxrpc_local_lock); + if (!local) { + _leave(" [local dead]"); + return; + } + +process_next_packet: + skb = skb_dequeue(&local->accept_queue); + if (!skb) { + rxrpc_put_local(local); + _leave("\n"); + return; + } + + _net("incoming call skb %p", skb); + + sp = rxrpc_skb(skb); + + /* determine the remote address */ + memset(&srx, 0, sizeof(srx)); + srx.srx_family = AF_RXRPC; + srx.transport.family = local->srx.transport.family; + srx.transport_type = local->srx.transport_type; + switch (srx.transport.family) { + case AF_INET: + srx.transport_len = sizeof(struct sockaddr_in); + srx.transport.sin.sin_port = udp_hdr(skb)->source; + srx.transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; + break; + default: + goto busy; + } + + /* get the socket providing the service */ + service_id = sp->hdr.serviceId; + read_lock_bh(&local->services_lock); + list_for_each_entry(rx, &local->services, listen_link) { + if (rx->service_id == service_id && + rx->sk.sk_state != RXRPC_CLOSE) + goto found_service; + } + read_unlock_bh(&local->services_lock); + goto invalid_service; + +found_service: + _debug("found service %hd", ntohs(rx->service_id)); + if (sk_acceptq_is_full(&rx->sk)) + goto backlog_full; + sk_acceptq_added(&rx->sk); + sock_hold(&rx->sk); + read_unlock_bh(&local->services_lock); + + ret = rxrpc_accept_incoming_call(local, rx, skb, &srx); + if (ret < 0) + sk_acceptq_removed(&rx->sk); + sock_put(&rx->sk); + switch (ret) { + case -ECONNRESET: /* old calls are ignored */ + case -ECONNABORTED: /* aborted calls are reaborted or ignored */ + case 0: + goto process_next_packet; + case -ECONNREFUSED: + goto invalid_service; + case -EBUSY: + goto busy; + case -EKEYREJECTED: + goto security_mismatch; + default: + BUG(); + } + +backlog_full: + read_unlock_bh(&local->services_lock); +busy: + rxrpc_busy(local, &srx, &sp->hdr); + rxrpc_free_skb(skb); + goto process_next_packet; + +invalid_service: + skb->priority = RX_INVALID_OPERATION; + rxrpc_reject_packet(local, skb); + goto process_next_packet; + + /* can't change connection security type mid-flow */ +security_mismatch: + skb->priority = RX_PROTOCOL_ERROR; + rxrpc_reject_packet(local, skb); + goto process_next_packet; +} + 
+/* + * handle acceptance of a call by userspace + * - assign the user call ID to the call at the front of the queue + */ +struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, + unsigned long user_call_ID) +{ + struct rxrpc_call *call; + struct rb_node *parent, **pp; + int ret; + + _enter(",%lx", user_call_ID); + + ASSERT(!irqs_disabled()); + + write_lock(&rx->call_lock); + + ret = -ENODATA; + if (list_empty(&rx->acceptq)) + goto out; + + /* check the user ID isn't already in use */ + ret = -EBADSLT; + pp = &rx->calls.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + call = rb_entry(parent, struct rxrpc_call, sock_node); + + if (user_call_ID < call->user_call_ID) + pp = &(*pp)->rb_left; + else if (user_call_ID > call->user_call_ID) + pp = &(*pp)->rb_right; + else + goto out; + } + + /* dequeue the first call and check it's still valid */ + call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link); + list_del_init(&call->accept_link); + sk_acceptq_removed(&rx->sk); + + write_lock_bh(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_SERVER_ACCEPTING: + call->state = RXRPC_CALL_SERVER_RECV_REQUEST; + break; + case RXRPC_CALL_REMOTELY_ABORTED: + case RXRPC_CALL_LOCALLY_ABORTED: + ret = -ECONNABORTED; + goto out_release; + case RXRPC_CALL_NETWORK_ERROR: + ret = call->conn->error; + goto out_release; + case RXRPC_CALL_DEAD: + ret = -ETIME; + goto out_discard; + default: + BUG(); + } + + /* formalise the acceptance */ + call->user_call_ID = user_call_ID; + rb_link_node(&call->sock_node, parent, pp); + rb_insert_color(&call->sock_node, &rx->calls); + if (test_and_set_bit(RXRPC_CALL_HAS_USERID, &call->flags)) + BUG(); + if (test_and_set_bit(RXRPC_CALL_ACCEPTED, &call->events)) + BUG(); + rxrpc_queue_call(call); + + rxrpc_get_call(call); + write_unlock_bh(&call->state_lock); + write_unlock(&rx->call_lock); + _leave(" = %p{%d}", call, call->debug_id); + return call; + + /* if the call is already dying or dead, then we leave the socket's ref + * on it to be released by rxrpc_dead_call_expired() as induced by + * rxrpc_release_call() */ +out_release: + _debug("release %p", call); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && + !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) + rxrpc_queue_call(call); +out_discard: + write_unlock_bh(&call->state_lock); + _debug("discard %p", call); +out: + write_unlock(&rx->call_lock); + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * handle rejectance of a call by userspace + * - reject the call at the front of the queue + */ +int rxrpc_reject_call(struct rxrpc_sock *rx) +{ + struct rxrpc_call *call; + int ret; + + _enter(""); + + ASSERT(!irqs_disabled()); + + write_lock(&rx->call_lock); + + ret = -ENODATA; + if (list_empty(&rx->acceptq)) + goto out; + + /* dequeue the first call and check it's still valid */ + call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link); + list_del_init(&call->accept_link); + sk_acceptq_removed(&rx->sk); + + write_lock_bh(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_SERVER_ACCEPTING: + call->state = RXRPC_CALL_SERVER_BUSY; + if (test_and_set_bit(RXRPC_CALL_REJECT_BUSY, &call->events)) + rxrpc_queue_call(call); + ret = 0; + goto out_release; + case RXRPC_CALL_REMOTELY_ABORTED: + case RXRPC_CALL_LOCALLY_ABORTED: + ret = -ECONNABORTED; + goto out_release; + case RXRPC_CALL_NETWORK_ERROR: + ret = call->conn->error; + goto out_release; + case RXRPC_CALL_DEAD: + ret = -ETIME; + goto out_discard; + default: + BUG(); + } + + /* if the call is already dying 
or dead, then we leave the socket's ref + * on it to be released by rxrpc_dead_call_expired() as induced by + * rxrpc_release_call() */ +out_release: + _debug("release %p", call); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && + !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) + rxrpc_queue_call(call); +out_discard: + write_unlock_bh(&call->state_lock); + _debug("discard %p", call); +out: + write_unlock(&rx->call_lock); + _leave(" = %d", ret); + return ret; +} + +/** + * rxrpc_kernel_accept_call - Allow a kernel service to accept an incoming call + * @sock: The socket on which the impending call is waiting + * @user_call_ID: The tag to attach to the call + * + * Allow a kernel service to accept an incoming call, assuming the incoming + * call is still valid. + */ +struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *sock, + unsigned long user_call_ID) +{ + struct rxrpc_call *call; + + _enter(",%lx", user_call_ID); + call = rxrpc_accept_call(rxrpc_sk(sock->sk), user_call_ID); + _leave(" = %p", call); + return call; +} + +EXPORT_SYMBOL(rxrpc_kernel_accept_call); + +/** + * rxrpc_kernel_reject_call - Allow a kernel service to reject an incoming call + * @sock: The socket on which the impending call is waiting + * + * Allow a kernel service to reject an incoming call with a BUSY message, + * assuming the incoming call is still valid. + */ +int rxrpc_kernel_reject_call(struct socket *sock) +{ + int ret; + + _enter(""); + ret = rxrpc_reject_call(rxrpc_sk(sock->sk)); + _leave(" = %d", ret); + return ret; +} + +EXPORT_SYMBOL(rxrpc_kernel_reject_call); diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c new file mode 100644 index 00000000..f99cfce7 --- /dev/null +++ b/net/rxrpc/ar-ack.c @@ -0,0 +1,1307 @@ +/* Management of Tx window, Tx resend, ACKs and out-of-sequence reception + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +static unsigned rxrpc_ack_defer = 1; + +static const char *const rxrpc_acks[] = { + "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY", "IDL", + "-?-" +}; + +static const s8 rxrpc_ack_priority[] = { + [0] = 0, + [RXRPC_ACK_DELAY] = 1, + [RXRPC_ACK_REQUESTED] = 2, + [RXRPC_ACK_IDLE] = 3, + [RXRPC_ACK_PING_RESPONSE] = 4, + [RXRPC_ACK_DUPLICATE] = 5, + [RXRPC_ACK_OUT_OF_SEQUENCE] = 6, + [RXRPC_ACK_EXCEEDS_WINDOW] = 7, + [RXRPC_ACK_NOSPACE] = 8, +}; + +/* + * propose an ACK be sent + */ +void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, + __be32 serial, bool immediate) +{ + unsigned long expiry; + s8 prior = rxrpc_ack_priority[ack_reason]; + + ASSERTCMP(prior, >, 0); + + _enter("{%d},%s,%%%x,%u", + call->debug_id, rxrpc_acks[ack_reason], ntohl(serial), + immediate); + + if (prior < rxrpc_ack_priority[call->ackr_reason]) { + if (immediate) + goto cancel_timer; + return; + } + + /* update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial + * numbers */ + if (prior == rxrpc_ack_priority[call->ackr_reason]) { + if (prior <= 4) + call->ackr_serial = serial; + if (immediate) + goto cancel_timer; + return; + } + + call->ackr_reason = ack_reason; + call->ackr_serial = serial; + + switch (ack_reason) { + case RXRPC_ACK_DELAY: + _debug("run delay timer"); + call->ack_timer.expires = jiffies + rxrpc_ack_timeout * HZ; + add_timer(&call->ack_timer); + return; + + case RXRPC_ACK_IDLE: + if (!immediate) { + _debug("run defer timer"); + expiry = 1; + goto run_timer; + } + goto cancel_timer; + + case RXRPC_ACK_REQUESTED: + if (!rxrpc_ack_defer) + goto cancel_timer; + if (!immediate || serial == cpu_to_be32(1)) { + _debug("run defer timer"); + expiry = rxrpc_ack_defer; + goto run_timer; + } + + default: + _debug("immediate ACK"); + goto cancel_timer; + } + +run_timer: + expiry += jiffies; + if (!timer_pending(&call->ack_timer) || + time_after(call->ack_timer.expires, expiry)) + mod_timer(&call->ack_timer, expiry); + return; + +cancel_timer: + _debug("cancel timer %%%u", ntohl(serial)); + try_to_del_timer_sync(&call->ack_timer); + read_lock_bh(&call->state_lock); + if (call->state <= RXRPC_CALL_COMPLETE && + !test_and_set_bit(RXRPC_CALL_ACK, &call->events)) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); +} + +/* + * propose an ACK be sent, locking the call structure + */ +void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, + __be32 serial, bool immediate) +{ + s8 prior = rxrpc_ack_priority[ack_reason]; + + if (prior > rxrpc_ack_priority[call->ackr_reason]) { + spin_lock_bh(&call->lock); + __rxrpc_propose_ACK(call, ack_reason, serial, immediate); + spin_unlock_bh(&call->lock); + } +} + +/* + * set the resend timer + */ +static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend, + unsigned long resend_at) +{ + read_lock_bh(&call->state_lock); + if (call->state >= RXRPC_CALL_COMPLETE) + resend = 0; + + if (resend & 1) { + _debug("SET RESEND"); + set_bit(RXRPC_CALL_RESEND, &call->events); + } + + if (resend & 2) { + _debug("MODIFY RESEND TIMER"); + set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + mod_timer(&call->resend_timer, resend_at); + } else { + _debug("KILL RESEND TIMER"); + del_timer_sync(&call->resend_timer); + clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events); + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + } + read_unlock_bh(&call->state_lock); +} + +/* + * resend packets + */ +static void rxrpc_resend(struct rxrpc_call *call) 
+{ + struct rxrpc_skb_priv *sp; + struct rxrpc_header *hdr; + struct sk_buff *txb; + unsigned long *p_txb, resend_at; + int loop, stop; + u8 resend; + + _enter("{%d,%d,%d,%d},", + call->acks_hard, call->acks_unacked, + atomic_read(&call->sequence), + CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz)); + + stop = 0; + resend = 0; + resend_at = 0; + + for (loop = call->acks_tail; + loop != call->acks_head || stop; + loop = (loop + 1) & (call->acks_winsz - 1) + ) { + p_txb = call->acks_window + loop; + smp_read_barrier_depends(); + if (*p_txb & 1) + continue; + + txb = (struct sk_buff *) *p_txb; + sp = rxrpc_skb(txb); + + if (sp->need_resend) { + sp->need_resend = 0; + + /* each Tx packet has a new serial number */ + sp->hdr.serial = + htonl(atomic_inc_return(&call->conn->serial)); + + hdr = (struct rxrpc_header *) txb->head; + hdr->serial = sp->hdr.serial; + + _proto("Tx DATA %%%u { #%d }", + ntohl(sp->hdr.serial), ntohl(sp->hdr.seq)); + if (rxrpc_send_packet(call->conn->trans, txb) < 0) { + stop = 0; + sp->resend_at = jiffies + 3; + } else { + sp->resend_at = + jiffies + rxrpc_resend_timeout * HZ; + } + } + + if (time_after_eq(jiffies + 1, sp->resend_at)) { + sp->need_resend = 1; + resend |= 1; + } else if (resend & 2) { + if (time_before(sp->resend_at, resend_at)) + resend_at = sp->resend_at; + } else { + resend_at = sp->resend_at; + resend |= 2; + } + } + + rxrpc_set_resend(call, resend, resend_at); + _leave(""); +} + +/* + * handle resend timer expiry + */ +static void rxrpc_resend_timer(struct rxrpc_call *call) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *txb; + unsigned long *p_txb, resend_at; + int loop; + u8 resend; + + _enter("%d,%d,%d", + call->acks_tail, call->acks_unacked, call->acks_head); + + if (call->state >= RXRPC_CALL_COMPLETE) + return; + + resend = 0; + resend_at = 0; + + for (loop = call->acks_unacked; + loop != call->acks_head; + loop = (loop + 1) & (call->acks_winsz - 1) + ) { + p_txb = call->acks_window + loop; + smp_read_barrier_depends(); + txb = (struct sk_buff *) (*p_txb & ~1); + sp = rxrpc_skb(txb); + + ASSERT(!(*p_txb & 1)); + + if (sp->need_resend) { + ; + } else if (time_after_eq(jiffies + 1, sp->resend_at)) { + sp->need_resend = 1; + resend |= 1; + } else if (resend & 2) { + if (time_before(sp->resend_at, resend_at)) + resend_at = sp->resend_at; + } else { + resend_at = sp->resend_at; + resend |= 2; + } + } + + rxrpc_set_resend(call, resend, resend_at); + _leave(""); +} + +/* + * process soft ACKs of our transmitted packets + * - these indicate packets the peer has or has not received, but hasn't yet + * given to the consumer, and so can still be discarded and re-requested + */ +static int rxrpc_process_soft_ACKs(struct rxrpc_call *call, + struct rxrpc_ackpacket *ack, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *txb; + unsigned long *p_txb, resend_at; + int loop; + u8 sacks[RXRPC_MAXACKS], resend; + + _enter("{%d,%d},{%d},", + call->acks_hard, + CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz), + ack->nAcks); + + if (skb_copy_bits(skb, 0, sacks, ack->nAcks) < 0) + goto protocol_error; + + resend = 0; + resend_at = 0; + for (loop = 0; loop < ack->nAcks; loop++) { + p_txb = call->acks_window; + p_txb += (call->acks_tail + loop) & (call->acks_winsz - 1); + smp_read_barrier_depends(); + txb = (struct sk_buff *) (*p_txb & ~1); + sp = rxrpc_skb(txb); + + switch (sacks[loop]) { + case RXRPC_ACK_TYPE_ACK: + sp->need_resend = 0; + *p_txb |= 1; + break; + case RXRPC_ACK_TYPE_NACK: + sp->need_resend = 1; + *p_txb 
&= ~1; + resend = 1; + break; + default: + _debug("Unsupported ACK type %d", sacks[loop]); + goto protocol_error; + } + } + + smp_mb(); + call->acks_unacked = (call->acks_tail + loop) & (call->acks_winsz - 1); + + /* anything not explicitly ACK'd is implicitly NACK'd, but may just not + * have been received or processed yet by the far end */ + for (loop = call->acks_unacked; + loop != call->acks_head; + loop = (loop + 1) & (call->acks_winsz - 1) + ) { + p_txb = call->acks_window + loop; + smp_read_barrier_depends(); + txb = (struct sk_buff *) (*p_txb & ~1); + sp = rxrpc_skb(txb); + + if (*p_txb & 1) { + /* packet must have been discarded */ + sp->need_resend = 1; + *p_txb &= ~1; + resend |= 1; + } else if (sp->need_resend) { + ; + } else if (time_after_eq(jiffies + 1, sp->resend_at)) { + sp->need_resend = 1; + resend |= 1; + } else if (resend & 2) { + if (time_before(sp->resend_at, resend_at)) + resend_at = sp->resend_at; + } else { + resend_at = sp->resend_at; + resend |= 2; + } + } + + rxrpc_set_resend(call, resend, resend_at); + _leave(" = 0"); + return 0; + +protocol_error: + _leave(" = -EPROTO"); + return -EPROTO; +} + +/* + * discard hard-ACK'd packets from the Tx window + */ +static void rxrpc_rotate_tx_window(struct rxrpc_call *call, u32 hard) +{ + unsigned long _skb; + int tail = call->acks_tail, old_tail; + int win = CIRC_CNT(call->acks_head, tail, call->acks_winsz); + + _enter("{%u,%u},%u", call->acks_hard, win, hard); + + ASSERTCMP(hard - call->acks_hard, <=, win); + + while (call->acks_hard < hard) { + smp_read_barrier_depends(); + _skb = call->acks_window[tail] & ~1; + rxrpc_free_skb((struct sk_buff *) _skb); + old_tail = tail; + tail = (tail + 1) & (call->acks_winsz - 1); + call->acks_tail = tail; + if (call->acks_unacked == old_tail) + call->acks_unacked = tail; + call->acks_hard++; + } + + wake_up(&call->tx_waitq); +} + +/* + * clear the Tx window in the event of a failure + */ +static void rxrpc_clear_tx_window(struct rxrpc_call *call) +{ + rxrpc_rotate_tx_window(call, atomic_read(&call->sequence)); +} + +/* + * drain the out of sequence received packet queue into the packet Rx queue + */ +static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + bool terminal; + int ret; + + _enter("{%d,%d}", call->rx_data_post, call->rx_first_oos); + + spin_lock_bh(&call->lock); + + ret = -ECONNRESET; + if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) + goto socket_unavailable; + + skb = skb_dequeue(&call->rx_oos_queue); + if (skb) { + sp = rxrpc_skb(skb); + + _debug("drain OOS packet %d [%d]", + ntohl(sp->hdr.seq), call->rx_first_oos); + + if (ntohl(sp->hdr.seq) != call->rx_first_oos) { + skb_queue_head(&call->rx_oos_queue, skb); + call->rx_first_oos = ntohl(rxrpc_skb(skb)->hdr.seq); + _debug("requeue %p {%u}", skb, call->rx_first_oos); + } else { + skb->mark = RXRPC_SKB_MARK_DATA; + terminal = ((sp->hdr.flags & RXRPC_LAST_PACKET) && + !(sp->hdr.flags & RXRPC_CLIENT_INITIATED)); + ret = rxrpc_queue_rcv_skb(call, skb, true, terminal); + BUG_ON(ret < 0); + _debug("drain #%u", call->rx_data_post); + call->rx_data_post++; + + /* find out what the next packet is */ + skb = skb_peek(&call->rx_oos_queue); + if (skb) + call->rx_first_oos = + ntohl(rxrpc_skb(skb)->hdr.seq); + else + call->rx_first_oos = 0; + _debug("peek %p {%u}", skb, call->rx_first_oos); + } + } + + ret = 0; +socket_unavailable: + spin_unlock_bh(&call->lock); + _leave(" = %d", ret); + return ret; +} + +/* + * insert an out of sequence packet into the buffer + */ 
+static void rxrpc_insert_oos_packet(struct rxrpc_call *call, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp, *psp; + struct sk_buff *p; + u32 seq; + + sp = rxrpc_skb(skb); + seq = ntohl(sp->hdr.seq); + _enter(",,{%u}", seq); + + skb->destructor = rxrpc_packet_destructor; + ASSERTCMP(sp->call, ==, NULL); + sp->call = call; + rxrpc_get_call(call); + + /* insert into the buffer in sequence order */ + spin_lock_bh(&call->lock); + + skb_queue_walk(&call->rx_oos_queue, p) { + psp = rxrpc_skb(p); + if (ntohl(psp->hdr.seq) > seq) { + _debug("insert oos #%u before #%u", + seq, ntohl(psp->hdr.seq)); + skb_insert(p, skb, &call->rx_oos_queue); + goto inserted; + } + } + + _debug("append oos #%u", seq); + skb_queue_tail(&call->rx_oos_queue, skb); +inserted: + + /* we might now have a new front to the queue */ + if (call->rx_first_oos == 0 || seq < call->rx_first_oos) + call->rx_first_oos = seq; + + read_lock(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE && + call->rx_data_post == call->rx_first_oos) { + _debug("drain rx oos now"); + set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events); + } + read_unlock(&call->state_lock); + + spin_unlock_bh(&call->lock); + _leave(" [stored #%u]", call->rx_first_oos); +} + +/* + * clear the Tx window on final ACK reception + */ +static void rxrpc_zap_tx_window(struct rxrpc_call *call) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + unsigned long _skb, *acks_window; + u8 winsz = call->acks_winsz; + int tail; + + acks_window = call->acks_window; + call->acks_window = NULL; + + while (CIRC_CNT(call->acks_head, call->acks_tail, winsz) > 0) { + tail = call->acks_tail; + smp_read_barrier_depends(); + _skb = acks_window[tail] & ~1; + smp_mb(); + call->acks_tail = (call->acks_tail + 1) & (winsz - 1); + + skb = (struct sk_buff *) _skb; + sp = rxrpc_skb(skb); + _debug("+++ clear Tx %u", ntohl(sp->hdr.seq)); + rxrpc_free_skb(skb); + } + + kfree(acks_window); +} + +/* + * process the extra information that may be appended to an ACK packet + */ +static void rxrpc_extract_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, + unsigned latest, int nAcks) +{ + struct rxrpc_ackinfo ackinfo; + struct rxrpc_peer *peer; + unsigned mtu; + + if (skb_copy_bits(skb, nAcks + 3, &ackinfo, sizeof(ackinfo)) < 0) { + _leave(" [no ackinfo]"); + return; + } + + _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }", + latest, + ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU), + ntohl(ackinfo.rwind), ntohl(ackinfo.jumbo_max)); + + mtu = min(ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU)); + + peer = call->conn->trans->peer; + if (mtu < peer->maxdata) { + spin_lock_bh(&peer->lock); + peer->maxdata = mtu; + peer->mtu = mtu + peer->hdrsize; + spin_unlock_bh(&peer->lock); + _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata); + } +} + +/* + * process packets in the reception queue + */ +static int rxrpc_process_rx_queue(struct rxrpc_call *call, + u32 *_abort_code) +{ + struct rxrpc_ackpacket ack; + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + bool post_ACK; + int latest; + u32 hard, tx; + + _enter(""); + +process_further: + skb = skb_dequeue(&call->rx_queue); + if (!skb) + return -EAGAIN; + + _net("deferred skb %p", skb); + + sp = rxrpc_skb(skb); + + _debug("process %s [st %d]", rxrpc_pkts[sp->hdr.type], call->state); + + post_ACK = false; + + switch (sp->hdr.type) { + /* data packets that wind up here have been received out of + * order, need security processing or are jumbo packets */ + case RXRPC_PACKET_TYPE_DATA: + _proto("OOSQ DATA %%%u { #%u }", + 
ntohl(sp->hdr.serial), ntohl(sp->hdr.seq)); + + /* secured packets must be verified and possibly decrypted */ + if (rxrpc_verify_packet(call, skb, _abort_code) < 0) + goto protocol_error; + + rxrpc_insert_oos_packet(call, skb); + goto process_further; + + /* partial ACK to process */ + case RXRPC_PACKET_TYPE_ACK: + if (skb_copy_bits(skb, 0, &ack, sizeof(ack)) < 0) { + _debug("extraction failure"); + goto protocol_error; + } + if (!skb_pull(skb, sizeof(ack))) + BUG(); + + latest = ntohl(sp->hdr.serial); + hard = ntohl(ack.firstPacket); + tx = atomic_read(&call->sequence); + + _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", + latest, + ntohs(ack.maxSkew), + hard, + ntohl(ack.previousPacket), + ntohl(ack.serial), + rxrpc_acks[ack.reason], + ack.nAcks); + + rxrpc_extract_ackinfo(call, skb, latest, ack.nAcks); + + if (ack.reason == RXRPC_ACK_PING) { + _proto("Rx ACK %%%u PING Request", latest); + rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE, + sp->hdr.serial, true); + } + + /* discard any out-of-order or duplicate ACKs */ + if (latest - call->acks_latest <= 0) { + _debug("discard ACK %d <= %d", + latest, call->acks_latest); + goto discard; + } + call->acks_latest = latest; + + if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST && + call->state != RXRPC_CALL_CLIENT_AWAIT_REPLY && + call->state != RXRPC_CALL_SERVER_SEND_REPLY && + call->state != RXRPC_CALL_SERVER_AWAIT_ACK) + goto discard; + + _debug("Tx=%d H=%u S=%d", tx, call->acks_hard, call->state); + + if (hard > 0) { + if (hard - 1 > tx) { + _debug("hard-ACK'd packet %d not transmitted" + " (%d top)", + hard - 1, tx); + goto protocol_error; + } + + if ((call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY || + call->state == RXRPC_CALL_SERVER_AWAIT_ACK) && + hard > tx) + goto all_acked; + + smp_rmb(); + rxrpc_rotate_tx_window(call, hard - 1); + } + + if (ack.nAcks > 0) { + if (hard - 1 + ack.nAcks > tx) { + _debug("soft-ACK'd packet %d+%d not" + " transmitted (%d top)", + hard - 1, ack.nAcks, tx); + goto protocol_error; + } + + if (rxrpc_process_soft_ACKs(call, &ack, skb) < 0) + goto protocol_error; + } + goto discard; + + /* complete ACK to process */ + case RXRPC_PACKET_TYPE_ACKALL: + goto all_acked; + + /* abort and busy are handled elsewhere */ + case RXRPC_PACKET_TYPE_BUSY: + case RXRPC_PACKET_TYPE_ABORT: + BUG(); + + /* connection level events - also handled elsewhere */ + case RXRPC_PACKET_TYPE_CHALLENGE: + case RXRPC_PACKET_TYPE_RESPONSE: + case RXRPC_PACKET_TYPE_DEBUG: + BUG(); + } + + /* if we've had a hard ACK that covers all the packets we've sent, then + * that ends that phase of the operation */ +all_acked: + write_lock_bh(&call->state_lock); + _debug("ack all %d", call->state); + + switch (call->state) { + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + call->state = RXRPC_CALL_CLIENT_RECV_REPLY; + break; + case RXRPC_CALL_SERVER_AWAIT_ACK: + _debug("srv complete"); + call->state = RXRPC_CALL_COMPLETE; + post_ACK = true; + break; + case RXRPC_CALL_CLIENT_SEND_REQUEST: + case RXRPC_CALL_SERVER_RECV_REQUEST: + goto protocol_error_unlock; /* can't occur yet */ + default: + write_unlock_bh(&call->state_lock); + goto discard; /* assume packet left over from earlier phase */ + } + + write_unlock_bh(&call->state_lock); + + /* if all the packets we sent are hard-ACK'd, then we can discard + * whatever we've got left */ + _debug("clear Tx %d", + CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz)); + + del_timer_sync(&call->resend_timer); + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + clear_bit(RXRPC_CALL_RESEND_TIMER, 
&call->events); + + if (call->acks_window) + rxrpc_zap_tx_window(call); + + if (post_ACK) { + /* post the final ACK message for userspace to pick up */ + _debug("post ACK"); + skb->mark = RXRPC_SKB_MARK_FINAL_ACK; + sp->call = call; + rxrpc_get_call(call); + spin_lock_bh(&call->lock); + if (rxrpc_queue_rcv_skb(call, skb, true, true) < 0) + BUG(); + spin_unlock_bh(&call->lock); + goto process_further; + } + +discard: + rxrpc_free_skb(skb); + goto process_further; + +protocol_error_unlock: + write_unlock_bh(&call->state_lock); +protocol_error: + rxrpc_free_skb(skb); + _leave(" = -EPROTO"); + return -EPROTO; +} + +/* + * post a message to the socket Rx queue for recvmsg() to pick up + */ +static int rxrpc_post_message(struct rxrpc_call *call, u32 mark, u32 error, + bool fatal) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + int ret; + + _enter("{%d,%lx},%u,%u,%d", + call->debug_id, call->flags, mark, error, fatal); + + /* remove timers and things for fatal messages */ + if (fatal) { + del_timer_sync(&call->resend_timer); + del_timer_sync(&call->ack_timer); + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + } + + if (mark != RXRPC_SKB_MARK_NEW_CALL && + !test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { + _leave("[no userid]"); + return 0; + } + + if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) { + skb = alloc_skb(0, GFP_NOFS); + if (!skb) + return -ENOMEM; + + rxrpc_new_skb(skb); + + skb->mark = mark; + + sp = rxrpc_skb(skb); + memset(sp, 0, sizeof(*sp)); + sp->error = error; + sp->call = call; + rxrpc_get_call(call); + + spin_lock_bh(&call->lock); + ret = rxrpc_queue_rcv_skb(call, skb, true, fatal); + spin_unlock_bh(&call->lock); + BUG_ON(ret < 0); + } + + return 0; +} + +/* + * handle background processing of incoming call packets and ACK / abort + * generation + */ +void rxrpc_process_call(struct work_struct *work) +{ + struct rxrpc_call *call = + container_of(work, struct rxrpc_call, processor); + struct rxrpc_ackpacket ack; + struct rxrpc_ackinfo ackinfo; + struct rxrpc_header hdr; + struct msghdr msg; + struct kvec iov[5]; + unsigned long bits; + __be32 data, pad; + size_t len; + int genbit, loop, nbit, ioc, ret, mtu; + u32 abort_code = RX_PROTOCOL_ERROR; + u8 *acks = NULL; + + //printk("\n--------------------\n"); + _enter("{%d,%s,%lx} [%lu]", + call->debug_id, rxrpc_call_states[call->state], call->events, + (jiffies - call->creation_jif) / (HZ / 10)); + + if (test_and_set_bit(RXRPC_CALL_PROC_BUSY, &call->flags)) { + _debug("XXXXXXXXXXXXX RUNNING ON MULTIPLE CPUS XXXXXXXXXXXXX"); + return; + } + + /* there's a good chance we're going to have to send a message, so set + * one up in advance */ + msg.msg_name = &call->conn->trans->peer->srx.transport.sin; + msg.msg_namelen = sizeof(call->conn->trans->peer->srx.transport.sin); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + hdr.epoch = call->conn->epoch; + hdr.cid = call->cid; + hdr.callNumber = call->call_id; + hdr.seq = 0; + hdr.type = RXRPC_PACKET_TYPE_ACK; + hdr.flags = call->conn->out_clientflag; + hdr.userStatus = 0; + hdr.securityIndex = call->conn->security_ix; + hdr._rsvd = 0; + hdr.serviceId = call->conn->service_id; + + memset(iov, 0, sizeof(iov)); + iov[0].iov_base = &hdr; + iov[0].iov_len = sizeof(hdr); + + /* deal with events of a final nature */ + if (test_bit(RXRPC_CALL_RELEASE, &call->events)) { + rxrpc_release_call(call); + clear_bit(RXRPC_CALL_RELEASE, &call->events); + } + + if (test_bit(RXRPC_CALL_RCVD_ERROR, &call->events)) { + int error; + + clear_bit(RXRPC_CALL_CONN_ABORT, 
&call->events); + clear_bit(RXRPC_CALL_REJECT_BUSY, &call->events); + clear_bit(RXRPC_CALL_ABORT, &call->events); + + error = call->conn->trans->peer->net_error; + _debug("post net error %d", error); + + if (rxrpc_post_message(call, RXRPC_SKB_MARK_NET_ERROR, + error, true) < 0) + goto no_mem; + clear_bit(RXRPC_CALL_RCVD_ERROR, &call->events); + goto kill_ACKs; + } + + if (test_bit(RXRPC_CALL_CONN_ABORT, &call->events)) { + ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE); + + clear_bit(RXRPC_CALL_REJECT_BUSY, &call->events); + clear_bit(RXRPC_CALL_ABORT, &call->events); + + _debug("post conn abort"); + + if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, + call->conn->error, true) < 0) + goto no_mem; + clear_bit(RXRPC_CALL_CONN_ABORT, &call->events); + goto kill_ACKs; + } + + if (test_bit(RXRPC_CALL_REJECT_BUSY, &call->events)) { + hdr.type = RXRPC_PACKET_TYPE_BUSY; + genbit = RXRPC_CALL_REJECT_BUSY; + goto send_message; + } + + if (test_bit(RXRPC_CALL_ABORT, &call->events)) { + ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE); + + if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, + ECONNABORTED, true) < 0) + goto no_mem; + hdr.type = RXRPC_PACKET_TYPE_ABORT; + data = htonl(call->abort_code); + iov[1].iov_base = &data; + iov[1].iov_len = sizeof(data); + genbit = RXRPC_CALL_ABORT; + goto send_message; + } + + if (test_bit(RXRPC_CALL_ACK_FINAL, &call->events)) { + genbit = RXRPC_CALL_ACK_FINAL; + + ack.bufferSpace = htons(8); + ack.maxSkew = 0; + ack.serial = 0; + ack.reason = RXRPC_ACK_IDLE; + ack.nAcks = 0; + call->ackr_reason = 0; + + spin_lock_bh(&call->lock); + ack.serial = call->ackr_serial; + ack.previousPacket = call->ackr_prev_seq; + ack.firstPacket = htonl(call->rx_data_eaten + 1); + spin_unlock_bh(&call->lock); + + pad = 0; + + iov[1].iov_base = &ack; + iov[1].iov_len = sizeof(ack); + iov[2].iov_base = &pad; + iov[2].iov_len = 3; + iov[3].iov_base = &ackinfo; + iov[3].iov_len = sizeof(ackinfo); + goto send_ACK; + } + + if (call->events & ((1 << RXRPC_CALL_RCVD_BUSY) | + (1 << RXRPC_CALL_RCVD_ABORT)) + ) { + u32 mark; + + if (test_bit(RXRPC_CALL_RCVD_ABORT, &call->events)) + mark = RXRPC_SKB_MARK_REMOTE_ABORT; + else + mark = RXRPC_SKB_MARK_BUSY; + + _debug("post abort/busy"); + rxrpc_clear_tx_window(call); + if (rxrpc_post_message(call, mark, ECONNABORTED, true) < 0) + goto no_mem; + + clear_bit(RXRPC_CALL_RCVD_BUSY, &call->events); + clear_bit(RXRPC_CALL_RCVD_ABORT, &call->events); + goto kill_ACKs; + } + + if (test_and_clear_bit(RXRPC_CALL_RCVD_ACKALL, &call->events)) { + _debug("do implicit ackall"); + rxrpc_clear_tx_window(call); + } + + if (test_bit(RXRPC_CALL_LIFE_TIMER, &call->events)) { + write_lock_bh(&call->state_lock); + if (call->state <= RXRPC_CALL_COMPLETE) { + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->abort_code = RX_CALL_TIMEOUT; + set_bit(RXRPC_CALL_ABORT, &call->events); + } + write_unlock_bh(&call->state_lock); + + _debug("post timeout"); + if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, + ETIME, true) < 0) + goto no_mem; + + clear_bit(RXRPC_CALL_LIFE_TIMER, &call->events); + goto kill_ACKs; + } + + /* deal with assorted inbound messages */ + if (!skb_queue_empty(&call->rx_queue)) { + switch (rxrpc_process_rx_queue(call, &abort_code)) { + case 0: + case -EAGAIN: + break; + case -ENOMEM: + goto no_mem; + case -EKEYEXPIRED: + case -EKEYREJECTED: + case -EPROTO: + rxrpc_abort_call(call, abort_code); + goto kill_ACKs; + } + } + + /* handle resending */ + if (test_and_clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events)) + 
rxrpc_resend_timer(call); + if (test_and_clear_bit(RXRPC_CALL_RESEND, &call->events)) + rxrpc_resend(call); + + /* consider sending an ordinary ACK */ + if (test_bit(RXRPC_CALL_ACK, &call->events)) { + _debug("send ACK: window: %d - %d { %lx }", + call->rx_data_eaten, call->ackr_win_top, + call->ackr_window[0]); + + if (call->state > RXRPC_CALL_SERVER_ACK_REQUEST && + call->ackr_reason != RXRPC_ACK_PING_RESPONSE) { + /* ACK by sending reply DATA packet in this state */ + clear_bit(RXRPC_CALL_ACK, &call->events); + goto maybe_reschedule; + } + + genbit = RXRPC_CALL_ACK; + + acks = kzalloc(call->ackr_win_top - call->rx_data_eaten, + GFP_NOFS); + if (!acks) + goto no_mem; + + //hdr.flags = RXRPC_SLOW_START_OK; + ack.bufferSpace = htons(8); + ack.maxSkew = 0; + ack.serial = 0; + ack.reason = 0; + + spin_lock_bh(&call->lock); + ack.reason = call->ackr_reason; + ack.serial = call->ackr_serial; + ack.previousPacket = call->ackr_prev_seq; + ack.firstPacket = htonl(call->rx_data_eaten + 1); + + ack.nAcks = 0; + for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) { + nbit = loop * BITS_PER_LONG; + for (bits = call->ackr_window[loop]; bits; bits >>= 1 + ) { + _debug("- l=%d n=%d b=%lx", loop, nbit, bits); + if (bits & 1) { + acks[nbit] = RXRPC_ACK_TYPE_ACK; + ack.nAcks = nbit + 1; + } + nbit++; + } + } + call->ackr_reason = 0; + spin_unlock_bh(&call->lock); + + pad = 0; + + iov[1].iov_base = &ack; + iov[1].iov_len = sizeof(ack); + iov[2].iov_base = acks; + iov[2].iov_len = ack.nAcks; + iov[3].iov_base = &pad; + iov[3].iov_len = 3; + iov[4].iov_base = &ackinfo; + iov[4].iov_len = sizeof(ackinfo); + + switch (ack.reason) { + case RXRPC_ACK_REQUESTED: + case RXRPC_ACK_DUPLICATE: + case RXRPC_ACK_OUT_OF_SEQUENCE: + case RXRPC_ACK_EXCEEDS_WINDOW: + case RXRPC_ACK_NOSPACE: + case RXRPC_ACK_PING: + case RXRPC_ACK_PING_RESPONSE: + goto send_ACK_with_skew; + case RXRPC_ACK_DELAY: + case RXRPC_ACK_IDLE: + goto send_ACK; + } + } + + /* handle completion of security negotiations on an incoming + * connection */ + if (test_and_clear_bit(RXRPC_CALL_SECURED, &call->events)) { + _debug("secured"); + spin_lock_bh(&call->lock); + + if (call->state == RXRPC_CALL_SERVER_SECURING) { + _debug("securing"); + write_lock(&call->conn->lock); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && + !test_bit(RXRPC_CALL_RELEASE, &call->events)) { + _debug("not released"); + call->state = RXRPC_CALL_SERVER_ACCEPTING; + list_move_tail(&call->accept_link, + &call->socket->acceptq); + } + write_unlock(&call->conn->lock); + read_lock(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE) + set_bit(RXRPC_CALL_POST_ACCEPT, &call->events); + read_unlock(&call->state_lock); + } + + spin_unlock_bh(&call->lock); + if (!test_bit(RXRPC_CALL_POST_ACCEPT, &call->events)) + goto maybe_reschedule; + } + + /* post a notification of an acceptable connection to the app */ + if (test_bit(RXRPC_CALL_POST_ACCEPT, &call->events)) { + _debug("post accept"); + if (rxrpc_post_message(call, RXRPC_SKB_MARK_NEW_CALL, + 0, false) < 0) + goto no_mem; + clear_bit(RXRPC_CALL_POST_ACCEPT, &call->events); + goto maybe_reschedule; + } + + /* handle incoming call acceptance */ + if (test_and_clear_bit(RXRPC_CALL_ACCEPTED, &call->events)) { + _debug("accepted"); + ASSERTCMP(call->rx_data_post, ==, 0); + call->rx_data_post = 1; + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE) + set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events); + read_unlock_bh(&call->state_lock); + } + + /* drain the out of sequence received packet queue into the 
packet Rx + * queue */ + if (test_and_clear_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events)) { + while (call->rx_data_post == call->rx_first_oos) + if (rxrpc_drain_rx_oos_queue(call) < 0) + break; + goto maybe_reschedule; + } + + /* other events may have been raised since we started checking */ + goto maybe_reschedule; + +send_ACK_with_skew: + ack.maxSkew = htons(atomic_read(&call->conn->hi_serial) - + ntohl(ack.serial)); +send_ACK: + mtu = call->conn->trans->peer->if_mtu; + mtu -= call->conn->trans->peer->hdrsize; + ackinfo.maxMTU = htonl(mtu); + ackinfo.rwind = htonl(32); + + /* permit the peer to send us jumbo packets if it wants to */ + ackinfo.rxMTU = htonl(5692); + ackinfo.jumbo_max = htonl(4); + + hdr.serial = htonl(atomic_inc_return(&call->conn->serial)); + _proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", + ntohl(hdr.serial), + ntohs(ack.maxSkew), + ntohl(ack.firstPacket), + ntohl(ack.previousPacket), + ntohl(ack.serial), + rxrpc_acks[ack.reason], + ack.nAcks); + + del_timer_sync(&call->ack_timer); + if (ack.nAcks > 0) + set_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags); + goto send_message_2; + +send_message: + _debug("send message"); + + hdr.serial = htonl(atomic_inc_return(&call->conn->serial)); + _proto("Tx %s %%%u", rxrpc_pkts[hdr.type], ntohl(hdr.serial)); +send_message_2: + + len = iov[0].iov_len; + ioc = 1; + if (iov[4].iov_len) { + ioc = 5; + len += iov[4].iov_len; + len += iov[3].iov_len; + len += iov[2].iov_len; + len += iov[1].iov_len; + } else if (iov[3].iov_len) { + ioc = 4; + len += iov[3].iov_len; + len += iov[2].iov_len; + len += iov[1].iov_len; + } else if (iov[2].iov_len) { + ioc = 3; + len += iov[2].iov_len; + len += iov[1].iov_len; + } else if (iov[1].iov_len) { + ioc = 2; + len += iov[1].iov_len; + } + + ret = kernel_sendmsg(call->conn->trans->local->socket, + &msg, iov, ioc, len); + if (ret < 0) { + _debug("sendmsg failed: %d", ret); + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_DEAD) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); + goto error; + } + + switch (genbit) { + case RXRPC_CALL_ABORT: + clear_bit(genbit, &call->events); + clear_bit(RXRPC_CALL_RCVD_ABORT, &call->events); + goto kill_ACKs; + + case RXRPC_CALL_ACK_FINAL: + write_lock_bh(&call->state_lock); + if (call->state == RXRPC_CALL_CLIENT_FINAL_ACK) + call->state = RXRPC_CALL_COMPLETE; + write_unlock_bh(&call->state_lock); + goto kill_ACKs; + + default: + clear_bit(genbit, &call->events); + switch (call->state) { + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + case RXRPC_CALL_CLIENT_RECV_REPLY: + case RXRPC_CALL_SERVER_RECV_REQUEST: + case RXRPC_CALL_SERVER_ACK_REQUEST: + _debug("start ACK timer"); + rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, + call->ackr_serial, false); + default: + break; + } + goto maybe_reschedule; + } + +kill_ACKs: + del_timer_sync(&call->ack_timer); + if (test_and_clear_bit(RXRPC_CALL_ACK_FINAL, &call->events)) + rxrpc_put_call(call); + clear_bit(RXRPC_CALL_ACK, &call->events); + +maybe_reschedule: + if (call->events || !skb_queue_empty(&call->rx_queue)) { + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_DEAD) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); + } + + /* don't leave aborted connections on the accept queue */ + if (call->state >= RXRPC_CALL_COMPLETE && + !list_empty(&call->accept_link)) { + _debug("X unlinking once-pending call %p { e=%lx f=%lx c=%x }", + call, call->events, call->flags, + ntohl(call->conn->cid)); + + read_lock_bh(&call->state_lock); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) 
&& + !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); + } + +error: + clear_bit(RXRPC_CALL_PROC_BUSY, &call->flags); + kfree(acks); + + /* because we don't want two CPUs both processing the work item for one + * call at the same time, we use a flag to note when it's busy; however + * this means there's a race between clearing the flag and setting the + * work pending bit and the work item being processed again */ + if (call->events && !work_pending(&call->processor)) { + _debug("jumpstart %x", ntohl(call->conn->cid)); + rxrpc_queue_call(call); + } + + _leave(""); + return; + +no_mem: + _debug("out of memory"); + goto maybe_reschedule; +} diff --git a/net/rxrpc/ar-call.c b/net/rxrpc/ar-call.c new file mode 100644 index 00000000..bf656c23 --- /dev/null +++ b/net/rxrpc/ar-call.c @@ -0,0 +1,822 @@ +/* RxRPC individual remote procedure call handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include "ar-internal.h" + +const char *const rxrpc_call_states[] = { + [RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq", + [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", + [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", + [RXRPC_CALL_CLIENT_FINAL_ACK] = "ClFnlACK", + [RXRPC_CALL_SERVER_SECURING] = "SvSecure", + [RXRPC_CALL_SERVER_ACCEPTING] = "SvAccept", + [RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq", + [RXRPC_CALL_SERVER_ACK_REQUEST] = "SvAckReq", + [RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl", + [RXRPC_CALL_SERVER_AWAIT_ACK] = "SvAwtACK", + [RXRPC_CALL_COMPLETE] = "Complete", + [RXRPC_CALL_SERVER_BUSY] = "SvBusy ", + [RXRPC_CALL_REMOTELY_ABORTED] = "RmtAbort", + [RXRPC_CALL_LOCALLY_ABORTED] = "LocAbort", + [RXRPC_CALL_NETWORK_ERROR] = "NetError", + [RXRPC_CALL_DEAD] = "Dead ", +}; + +struct kmem_cache *rxrpc_call_jar; +LIST_HEAD(rxrpc_calls); +DEFINE_RWLOCK(rxrpc_call_lock); +static unsigned rxrpc_call_max_lifetime = 60; +static unsigned rxrpc_dead_call_timeout = 2; + +static void rxrpc_destroy_call(struct work_struct *work); +static void rxrpc_call_life_expired(unsigned long _call); +static void rxrpc_dead_call_expired(unsigned long _call); +static void rxrpc_ack_time_expired(unsigned long _call); +static void rxrpc_resend_time_expired(unsigned long _call); + +/* + * allocate a new call + */ +static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) +{ + struct rxrpc_call *call; + + call = kmem_cache_zalloc(rxrpc_call_jar, gfp); + if (!call) + return NULL; + + call->acks_winsz = 16; + call->acks_window = kmalloc(call->acks_winsz * sizeof(unsigned long), + gfp); + if (!call->acks_window) { + kmem_cache_free(rxrpc_call_jar, call); + return NULL; + } + + setup_timer(&call->lifetimer, &rxrpc_call_life_expired, + (unsigned long) call); + setup_timer(&call->deadspan, &rxrpc_dead_call_expired, + (unsigned long) call); + setup_timer(&call->ack_timer, &rxrpc_ack_time_expired, + (unsigned long) call); + setup_timer(&call->resend_timer, &rxrpc_resend_time_expired, + (unsigned long) call); + INIT_WORK(&call->destroyer, &rxrpc_destroy_call); + INIT_WORK(&call->processor, &rxrpc_process_call); + INIT_LIST_HEAD(&call->accept_link); + skb_queue_head_init(&call->rx_queue); + 
skb_queue_head_init(&call->rx_oos_queue); + init_waitqueue_head(&call->tx_waitq); + spin_lock_init(&call->lock); + rwlock_init(&call->state_lock); + atomic_set(&call->usage, 1); + call->debug_id = atomic_inc_return(&rxrpc_debug_id); + call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; + + memset(&call->sock_node, 0xed, sizeof(call->sock_node)); + + call->rx_data_expect = 1; + call->rx_data_eaten = 0; + call->rx_first_oos = 0; + call->ackr_win_top = call->rx_data_eaten + 1 + RXRPC_MAXACKS; + call->creation_jif = jiffies; + return call; +} + +/* + * allocate a new client call and attempt to get a connection slot for it + */ +static struct rxrpc_call *rxrpc_alloc_client_call( + struct rxrpc_sock *rx, + struct rxrpc_transport *trans, + struct rxrpc_conn_bundle *bundle, + gfp_t gfp) +{ + struct rxrpc_call *call; + int ret; + + _enter(""); + + ASSERT(rx != NULL); + ASSERT(trans != NULL); + ASSERT(bundle != NULL); + + call = rxrpc_alloc_call(gfp); + if (!call) + return ERR_PTR(-ENOMEM); + + sock_hold(&rx->sk); + call->socket = rx; + call->rx_data_post = 1; + + ret = rxrpc_connect_call(rx, trans, bundle, call, gfp); + if (ret < 0) { + kmem_cache_free(rxrpc_call_jar, call); + return ERR_PTR(ret); + } + + spin_lock(&call->conn->trans->peer->lock); + list_add(&call->error_link, &call->conn->trans->peer->error_targets); + spin_unlock(&call->conn->trans->peer->lock); + + call->lifetimer.expires = jiffies + rxrpc_call_max_lifetime * HZ; + add_timer(&call->lifetimer); + + _leave(" = %p", call); + return call; +} + +/* + * set up a call for the given data + * - called in process context with IRQs enabled + */ +struct rxrpc_call *rxrpc_get_client_call(struct rxrpc_sock *rx, + struct rxrpc_transport *trans, + struct rxrpc_conn_bundle *bundle, + unsigned long user_call_ID, + int create, + gfp_t gfp) +{ + struct rxrpc_call *call, *candidate; + struct rb_node *p, *parent, **pp; + + _enter("%p,%d,%d,%lx,%d", + rx, trans ? trans->debug_id : -1, bundle ? 
bundle->debug_id : -1, + user_call_ID, create); + + /* search the extant calls first for one that matches the specified + * user ID */ + read_lock(&rx->call_lock); + + p = rx->calls.rb_node; + while (p) { + call = rb_entry(p, struct rxrpc_call, sock_node); + + if (user_call_ID < call->user_call_ID) + p = p->rb_left; + else if (user_call_ID > call->user_call_ID) + p = p->rb_right; + else + goto found_extant_call; + } + + read_unlock(&rx->call_lock); + + if (!create || !trans) + return ERR_PTR(-EBADSLT); + + /* not yet present - create a candidate for a new record and then + * redo the search */ + candidate = rxrpc_alloc_client_call(rx, trans, bundle, gfp); + if (IS_ERR(candidate)) { + _leave(" = %ld", PTR_ERR(candidate)); + return candidate; + } + + candidate->user_call_ID = user_call_ID; + __set_bit(RXRPC_CALL_HAS_USERID, &candidate->flags); + + write_lock(&rx->call_lock); + + pp = &rx->calls.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + call = rb_entry(parent, struct rxrpc_call, sock_node); + + if (user_call_ID < call->user_call_ID) + pp = &(*pp)->rb_left; + else if (user_call_ID > call->user_call_ID) + pp = &(*pp)->rb_right; + else + goto found_extant_second; + } + + /* second search also failed; add the new call */ + call = candidate; + candidate = NULL; + rxrpc_get_call(call); + + rb_link_node(&call->sock_node, parent, pp); + rb_insert_color(&call->sock_node, &rx->calls); + write_unlock(&rx->call_lock); + + write_lock_bh(&rxrpc_call_lock); + list_add_tail(&call->link, &rxrpc_calls); + write_unlock_bh(&rxrpc_call_lock); + + _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id); + + _leave(" = %p [new]", call); + return call; + + /* we found the call in the list immediately */ +found_extant_call: + rxrpc_get_call(call); + read_unlock(&rx->call_lock); + _leave(" = %p [extant %d]", call, atomic_read(&call->usage)); + return call; + + /* we found the call on the second time through the list */ +found_extant_second: + rxrpc_get_call(call); + write_unlock(&rx->call_lock); + rxrpc_put_call(candidate); + _leave(" = %p [second %d]", call, atomic_read(&call->usage)); + return call; +} + +/* + * set up an incoming call + * - called in process context with IRQs enabled + */ +struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, + struct rxrpc_connection *conn, + struct rxrpc_header *hdr, + gfp_t gfp) +{ + struct rxrpc_call *call, *candidate; + struct rb_node **p, *parent; + __be32 call_id; + + _enter(",%d,,%x", conn->debug_id, gfp); + + ASSERT(rx != NULL); + + candidate = rxrpc_alloc_call(gfp); + if (!candidate) + return ERR_PTR(-EBUSY); + + candidate->socket = rx; + candidate->conn = conn; + candidate->cid = hdr->cid; + candidate->call_id = hdr->callNumber; + candidate->channel = ntohl(hdr->cid) & RXRPC_CHANNELMASK; + candidate->rx_data_post = 0; + candidate->state = RXRPC_CALL_SERVER_ACCEPTING; + if (conn->security_ix > 0) + candidate->state = RXRPC_CALL_SERVER_SECURING; + + write_lock_bh(&conn->lock); + + /* set the channel for this call */ + call = conn->channels[candidate->channel]; + _debug("channel[%u] is %p", candidate->channel, call); + if (call && call->call_id == hdr->callNumber) { + /* already set; must've been a duplicate packet */ + _debug("extant call [%d]", call->state); + ASSERTCMP(call->conn, ==, conn); + + read_lock(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_LOCALLY_ABORTED: + if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events)) + rxrpc_queue_call(call); + case RXRPC_CALL_REMOTELY_ABORTED: + 
read_unlock(&call->state_lock); + goto aborted_call; + default: + rxrpc_get_call(call); + read_unlock(&call->state_lock); + goto extant_call; + } + } + + if (call) { + /* it seems the channel is still in use from the previous call + * - ditch the old binding if its call is now complete */ + _debug("CALL: %u { %s }", + call->debug_id, rxrpc_call_states[call->state]); + + if (call->state >= RXRPC_CALL_COMPLETE) { + conn->channels[call->channel] = NULL; + } else { + write_unlock_bh(&conn->lock); + kmem_cache_free(rxrpc_call_jar, candidate); + _leave(" = -EBUSY"); + return ERR_PTR(-EBUSY); + } + } + + /* check the call number isn't duplicate */ + _debug("check dup"); + call_id = hdr->callNumber; + p = &conn->calls.rb_node; + parent = NULL; + while (*p) { + parent = *p; + call = rb_entry(parent, struct rxrpc_call, conn_node); + + if (call_id < call->call_id) + p = &(*p)->rb_left; + else if (call_id > call->call_id) + p = &(*p)->rb_right; + else + goto old_call; + } + + /* make the call available */ + _debug("new call"); + call = candidate; + candidate = NULL; + rb_link_node(&call->conn_node, parent, p); + rb_insert_color(&call->conn_node, &conn->calls); + conn->channels[call->channel] = call; + sock_hold(&rx->sk); + atomic_inc(&conn->usage); + write_unlock_bh(&conn->lock); + + spin_lock(&conn->trans->peer->lock); + list_add(&call->error_link, &conn->trans->peer->error_targets); + spin_unlock(&conn->trans->peer->lock); + + write_lock_bh(&rxrpc_call_lock); + list_add_tail(&call->link, &rxrpc_calls); + write_unlock_bh(&rxrpc_call_lock); + + _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id); + + call->lifetimer.expires = jiffies + rxrpc_call_max_lifetime * HZ; + add_timer(&call->lifetimer); + _leave(" = %p {%d} [new]", call, call->debug_id); + return call; + +extant_call: + write_unlock_bh(&conn->lock); + kmem_cache_free(rxrpc_call_jar, candidate); + _leave(" = %p {%d} [extant]", call, call ? 
call->debug_id : -1); + return call; + +aborted_call: + write_unlock_bh(&conn->lock); + kmem_cache_free(rxrpc_call_jar, candidate); + _leave(" = -ECONNABORTED"); + return ERR_PTR(-ECONNABORTED); + +old_call: + write_unlock_bh(&conn->lock); + kmem_cache_free(rxrpc_call_jar, candidate); + _leave(" = -ECONNRESET [old]"); + return ERR_PTR(-ECONNRESET); +} + +/* + * find an extant server call + * - called in process context with IRQs enabled + */ +struct rxrpc_call *rxrpc_find_server_call(struct rxrpc_sock *rx, + unsigned long user_call_ID) +{ + struct rxrpc_call *call; + struct rb_node *p; + + _enter("%p,%lx", rx, user_call_ID); + + /* search the extant calls for one that matches the specified user + * ID */ + read_lock(&rx->call_lock); + + p = rx->calls.rb_node; + while (p) { + call = rb_entry(p, struct rxrpc_call, sock_node); + + if (user_call_ID < call->user_call_ID) + p = p->rb_left; + else if (user_call_ID > call->user_call_ID) + p = p->rb_right; + else + goto found_extant_call; + } + + read_unlock(&rx->call_lock); + _leave(" = NULL"); + return NULL; + + /* we found the call in the list immediately */ +found_extant_call: + rxrpc_get_call(call); + read_unlock(&rx->call_lock); + _leave(" = %p [%d]", call, atomic_read(&call->usage)); + return call; +} + +/* + * detach a call from a socket and set up for release + */ +void rxrpc_release_call(struct rxrpc_call *call) +{ + struct rxrpc_connection *conn = call->conn; + struct rxrpc_sock *rx = call->socket; + + _enter("{%d,%d,%d,%d}", + call->debug_id, atomic_read(&call->usage), + atomic_read(&call->ackr_not_idle), + call->rx_first_oos); + + spin_lock_bh(&call->lock); + if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags)) + BUG(); + spin_unlock_bh(&call->lock); + + /* dissociate from the socket + * - the socket's ref on the call is passed to the death timer + */ + _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn); + + write_lock_bh(&rx->call_lock); + if (!list_empty(&call->accept_link)) { + _debug("unlinking once-pending call %p { e=%lx f=%lx }", + call, call->events, call->flags); + ASSERT(!test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); + list_del_init(&call->accept_link); + sk_acceptq_removed(&rx->sk); + } else if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { + rb_erase(&call->sock_node, &rx->calls); + memset(&call->sock_node, 0xdd, sizeof(call->sock_node)); + clear_bit(RXRPC_CALL_HAS_USERID, &call->flags); + } + write_unlock_bh(&rx->call_lock); + + /* free up the channel for reuse */ + spin_lock(&conn->trans->client_lock); + write_lock_bh(&conn->lock); + write_lock(&call->state_lock); + + if (conn->channels[call->channel] == call) + conn->channels[call->channel] = NULL; + + if (conn->out_clientflag && conn->bundle) { + conn->avail_calls++; + switch (conn->avail_calls) { + case 1: + list_move_tail(&conn->bundle_link, + &conn->bundle->avail_conns); + case 2 ... 
RXRPC_MAXCALLS - 1: + ASSERT(conn->channels[0] == NULL || + conn->channels[1] == NULL || + conn->channels[2] == NULL || + conn->channels[3] == NULL); + break; + case RXRPC_MAXCALLS: + list_move_tail(&conn->bundle_link, + &conn->bundle->unused_conns); + ASSERT(conn->channels[0] == NULL && + conn->channels[1] == NULL && + conn->channels[2] == NULL && + conn->channels[3] == NULL); + break; + default: + printk(KERN_ERR "RxRPC: conn->avail_calls=%d\n", + conn->avail_calls); + BUG(); + } + } + + spin_unlock(&conn->trans->client_lock); + + if (call->state < RXRPC_CALL_COMPLETE && + call->state != RXRPC_CALL_CLIENT_FINAL_ACK) { + _debug("+++ ABORTING STATE %d +++\n", call->state); + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->abort_code = RX_CALL_DEAD; + set_bit(RXRPC_CALL_ABORT, &call->events); + rxrpc_queue_call(call); + } + write_unlock(&call->state_lock); + write_unlock_bh(&conn->lock); + + /* clean up the Rx queue */ + if (!skb_queue_empty(&call->rx_queue) || + !skb_queue_empty(&call->rx_oos_queue)) { + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + + _debug("purge Rx queues"); + + spin_lock_bh(&call->lock); + while ((skb = skb_dequeue(&call->rx_queue)) || + (skb = skb_dequeue(&call->rx_oos_queue))) { + sp = rxrpc_skb(skb); + if (sp->call) { + ASSERTCMP(sp->call, ==, call); + rxrpc_put_call(call); + sp->call = NULL; + } + skb->destructor = NULL; + spin_unlock_bh(&call->lock); + + _debug("- zap %s %%%u #%u", + rxrpc_pkts[sp->hdr.type], + ntohl(sp->hdr.serial), + ntohl(sp->hdr.seq)); + rxrpc_free_skb(skb); + spin_lock_bh(&call->lock); + } + spin_unlock_bh(&call->lock); + + ASSERTCMP(call->state, !=, RXRPC_CALL_COMPLETE); + } + + del_timer_sync(&call->resend_timer); + del_timer_sync(&call->ack_timer); + del_timer_sync(&call->lifetimer); + call->deadspan.expires = jiffies + rxrpc_dead_call_timeout * HZ; + add_timer(&call->deadspan); + + _leave(""); +} + +/* + * handle a dead call being ready for reaping + */ +static void rxrpc_dead_call_expired(unsigned long _call) +{ + struct rxrpc_call *call = (struct rxrpc_call *) _call; + + _enter("{%d}", call->debug_id); + + write_lock_bh(&call->state_lock); + call->state = RXRPC_CALL_DEAD; + write_unlock_bh(&call->state_lock); + rxrpc_put_call(call); +} + +/* + * mark a call as to be released, aborting it if it's still in progress + * - called with softirqs disabled + */ +static void rxrpc_mark_call_released(struct rxrpc_call *call) +{ + bool sched; + + write_lock(&call->state_lock); + if (call->state < RXRPC_CALL_DEAD) { + sched = false; + if (call->state < RXRPC_CALL_COMPLETE) { + _debug("abort call %p", call); + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->abort_code = RX_CALL_DEAD; + if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events)) + sched = true; + } + if (!test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) + sched = true; + if (sched) + rxrpc_queue_call(call); + } + write_unlock(&call->state_lock); +} + +/* + * release all the calls associated with a socket + */ +void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) +{ + struct rxrpc_call *call; + struct rb_node *p; + + _enter("%p", rx); + + read_lock_bh(&rx->call_lock); + + /* mark all the calls as no longer wanting incoming packets */ + for (p = rb_first(&rx->calls); p; p = rb_next(p)) { + call = rb_entry(p, struct rxrpc_call, sock_node); + rxrpc_mark_call_released(call); + } + + /* kill the not-yet-accepted incoming calls */ + list_for_each_entry(call, &rx->secureq, accept_link) { + rxrpc_mark_call_released(call); + } + + list_for_each_entry(call, &rx->acceptq, 
accept_link) { + rxrpc_mark_call_released(call); + } + + read_unlock_bh(&rx->call_lock); + _leave(""); +} + +/* + * release a call + */ +void __rxrpc_put_call(struct rxrpc_call *call) +{ + ASSERT(call != NULL); + + _enter("%p{u=%d}", call, atomic_read(&call->usage)); + + ASSERTCMP(atomic_read(&call->usage), >, 0); + + if (atomic_dec_and_test(&call->usage)) { + _debug("call %d dead", call->debug_id); + ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); + rxrpc_queue_work(&call->destroyer); + } + _leave(""); +} + +/* + * clean up a call + */ +static void rxrpc_cleanup_call(struct rxrpc_call *call) +{ + _net("DESTROY CALL %d", call->debug_id); + + ASSERT(call->socket); + + memset(&call->sock_node, 0xcd, sizeof(call->sock_node)); + + del_timer_sync(&call->lifetimer); + del_timer_sync(&call->deadspan); + del_timer_sync(&call->ack_timer); + del_timer_sync(&call->resend_timer); + + ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); + ASSERTCMP(call->events, ==, 0); + if (work_pending(&call->processor)) { + _debug("defer destroy"); + rxrpc_queue_work(&call->destroyer); + return; + } + + if (call->conn) { + spin_lock(&call->conn->trans->peer->lock); + list_del(&call->error_link); + spin_unlock(&call->conn->trans->peer->lock); + + write_lock_bh(&call->conn->lock); + rb_erase(&call->conn_node, &call->conn->calls); + write_unlock_bh(&call->conn->lock); + rxrpc_put_connection(call->conn); + } + + if (call->acks_window) { + _debug("kill Tx window %d", + CIRC_CNT(call->acks_head, call->acks_tail, + call->acks_winsz)); + smp_mb(); + while (CIRC_CNT(call->acks_head, call->acks_tail, + call->acks_winsz) > 0) { + struct rxrpc_skb_priv *sp; + unsigned long _skb; + + _skb = call->acks_window[call->acks_tail] & ~1; + sp = rxrpc_skb((struct sk_buff *) _skb); + _debug("+++ clear Tx %u", ntohl(sp->hdr.seq)); + rxrpc_free_skb((struct sk_buff *) _skb); + call->acks_tail = + (call->acks_tail + 1) & (call->acks_winsz - 1); + } + + kfree(call->acks_window); + } + + rxrpc_free_skb(call->tx_pending); + + rxrpc_purge_queue(&call->rx_queue); + ASSERT(skb_queue_empty(&call->rx_oos_queue)); + sock_put(&call->socket->sk); + kmem_cache_free(rxrpc_call_jar, call); +} + +/* + * destroy a call + */ +static void rxrpc_destroy_call(struct work_struct *work) +{ + struct rxrpc_call *call = + container_of(work, struct rxrpc_call, destroyer); + + _enter("%p{%d,%d,%p}", + call, atomic_read(&call->usage), call->channel, call->conn); + + ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); + + write_lock_bh(&rxrpc_call_lock); + list_del_init(&call->link); + write_unlock_bh(&rxrpc_call_lock); + + rxrpc_cleanup_call(call); + _leave(""); +} + +/* + * preemptively destroy all the call records from a transport endpoint rather + * than waiting for them to time out + */ +void __exit rxrpc_destroy_all_calls(void) +{ + struct rxrpc_call *call; + + _enter(""); + write_lock_bh(&rxrpc_call_lock); + + while (!list_empty(&rxrpc_calls)) { + call = list_entry(rxrpc_calls.next, struct rxrpc_call, link); + _debug("Zapping call %p", call); + + list_del_init(&call->link); + + switch (atomic_read(&call->usage)) { + case 0: + ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); + break; + case 1: + if (del_timer_sync(&call->deadspan) != 0 && + call->state != RXRPC_CALL_DEAD) + rxrpc_dead_call_expired((unsigned long) call); + if (call->state != RXRPC_CALL_DEAD) + break; + default: + printk(KERN_ERR "RXRPC:" + " Call %p still in use (%d,%d,%s,%lx,%lx)!\n", + call, atomic_read(&call->usage), + atomic_read(&call->ackr_not_idle), + rxrpc_call_states[call->state], + call->flags, 
call->events); + if (!skb_queue_empty(&call->rx_queue)) + printk(KERN_ERR"RXRPC: Rx queue occupied\n"); + if (!skb_queue_empty(&call->rx_oos_queue)) + printk(KERN_ERR"RXRPC: OOS queue occupied\n"); + break; + } + + write_unlock_bh(&rxrpc_call_lock); + cond_resched(); + write_lock_bh(&rxrpc_call_lock); + } + + write_unlock_bh(&rxrpc_call_lock); + _leave(""); +} + +/* + * handle call lifetime being exceeded + */ +static void rxrpc_call_life_expired(unsigned long _call) +{ + struct rxrpc_call *call = (struct rxrpc_call *) _call; + + if (call->state >= RXRPC_CALL_COMPLETE) + return; + + _enter("{%d}", call->debug_id); + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE) { + set_bit(RXRPC_CALL_LIFE_TIMER, &call->events); + rxrpc_queue_call(call); + } + read_unlock_bh(&call->state_lock); +} + +/* + * handle resend timer expiry + * - may not take call->state_lock as this can deadlock against del_timer_sync() + */ +static void rxrpc_resend_time_expired(unsigned long _call) +{ + struct rxrpc_call *call = (struct rxrpc_call *) _call; + + _enter("{%d}", call->debug_id); + + if (call->state >= RXRPC_CALL_COMPLETE) + return; + + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + if (!test_and_set_bit(RXRPC_CALL_RESEND_TIMER, &call->events)) + rxrpc_queue_call(call); +} + +/* + * handle ACK timer expiry + */ +static void rxrpc_ack_time_expired(unsigned long _call) +{ + struct rxrpc_call *call = (struct rxrpc_call *) _call; + + _enter("{%d}", call->debug_id); + + if (call->state >= RXRPC_CALL_COMPLETE) + return; + + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE && + !test_and_set_bit(RXRPC_CALL_ACK, &call->events)) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); +} diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c new file mode 100644 index 00000000..4106ca95 --- /dev/null +++ b/net/rxrpc/ar-connection.c @@ -0,0 +1,922 @@ +/* RxRPC virtual connection handler + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +static void rxrpc_connection_reaper(struct work_struct *work); + +LIST_HEAD(rxrpc_connections); +DEFINE_RWLOCK(rxrpc_connection_lock); +static unsigned long rxrpc_connection_timeout = 10 * 60; +static DECLARE_DELAYED_WORK(rxrpc_connection_reap, rxrpc_connection_reaper); + +/* + * allocate a new client connection bundle + */ +static struct rxrpc_conn_bundle *rxrpc_alloc_bundle(gfp_t gfp) +{ + struct rxrpc_conn_bundle *bundle; + + _enter(""); + + bundle = kzalloc(sizeof(struct rxrpc_conn_bundle), gfp); + if (bundle) { + INIT_LIST_HEAD(&bundle->unused_conns); + INIT_LIST_HEAD(&bundle->avail_conns); + INIT_LIST_HEAD(&bundle->busy_conns); + init_waitqueue_head(&bundle->chanwait); + atomic_set(&bundle->usage, 1); + } + + _leave(" = %p", bundle); + return bundle; +} + +/* + * compare bundle parameters with what we're looking for + * - return -ve, 0 or +ve + */ +static inline +int rxrpc_cmp_bundle(const struct rxrpc_conn_bundle *bundle, + struct key *key, __be16 service_id) +{ + return (bundle->service_id - service_id) ?: + ((unsigned long) bundle->key - (unsigned long) key); +} + +/* + * get bundle of client connections that a client socket can make use of + */ +struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *rx, + struct rxrpc_transport *trans, + struct key *key, + __be16 service_id, + gfp_t gfp) +{ + struct rxrpc_conn_bundle *bundle, *candidate; + struct rb_node *p, *parent, **pp; + + _enter("%p{%x},%x,%hx,", + rx, key_serial(key), trans->debug_id, ntohs(service_id)); + + if (rx->trans == trans && rx->bundle) { + atomic_inc(&rx->bundle->usage); + return rx->bundle; + } + + /* search the extant bundles first for one that matches the specified + * user ID */ + spin_lock(&trans->client_lock); + + p = trans->bundles.rb_node; + while (p) { + bundle = rb_entry(p, struct rxrpc_conn_bundle, node); + + if (rxrpc_cmp_bundle(bundle, key, service_id) < 0) + p = p->rb_left; + else if (rxrpc_cmp_bundle(bundle, key, service_id) > 0) + p = p->rb_right; + else + goto found_extant_bundle; + } + + spin_unlock(&trans->client_lock); + + /* not yet present - create a candidate for a new record and then + * redo the search */ + candidate = rxrpc_alloc_bundle(gfp); + if (!candidate) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + candidate->key = key_get(key); + candidate->service_id = service_id; + + spin_lock(&trans->client_lock); + + pp = &trans->bundles.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + bundle = rb_entry(parent, struct rxrpc_conn_bundle, node); + + if (rxrpc_cmp_bundle(bundle, key, service_id) < 0) + pp = &(*pp)->rb_left; + else if (rxrpc_cmp_bundle(bundle, key, service_id) > 0) + pp = &(*pp)->rb_right; + else + goto found_extant_second; + } + + /* second search also failed; add the new bundle */ + bundle = candidate; + candidate = NULL; + + rb_link_node(&bundle->node, parent, pp); + rb_insert_color(&bundle->node, &trans->bundles); + spin_unlock(&trans->client_lock); + _net("BUNDLE new on trans %d", trans->debug_id); + if (!rx->bundle && rx->sk.sk_state == RXRPC_CLIENT_CONNECTED) { + atomic_inc(&bundle->usage); + rx->bundle = bundle; + } + _leave(" = %p [new]", bundle); + return bundle; + + /* we found the bundle in the list immediately */ +found_extant_bundle: + atomic_inc(&bundle->usage); + spin_unlock(&trans->client_lock); + _net("BUNDLE old on trans %d", trans->debug_id); + if (!rx->bundle && rx->sk.sk_state == RXRPC_CLIENT_CONNECTED) { + 
atomic_inc(&bundle->usage); + rx->bundle = bundle; + } + _leave(" = %p [extant %d]", bundle, atomic_read(&bundle->usage)); + return bundle; + + /* we found the bundle on the second time through the list */ +found_extant_second: + atomic_inc(&bundle->usage); + spin_unlock(&trans->client_lock); + kfree(candidate); + _net("BUNDLE old2 on trans %d", trans->debug_id); + if (!rx->bundle && rx->sk.sk_state == RXRPC_CLIENT_CONNECTED) { + atomic_inc(&bundle->usage); + rx->bundle = bundle; + } + _leave(" = %p [second %d]", bundle, atomic_read(&bundle->usage)); + return bundle; +} + +/* + * release a bundle + */ +void rxrpc_put_bundle(struct rxrpc_transport *trans, + struct rxrpc_conn_bundle *bundle) +{ + _enter("%p,%p{%d}",trans, bundle, atomic_read(&bundle->usage)); + + if (atomic_dec_and_lock(&bundle->usage, &trans->client_lock)) { + _debug("Destroy bundle"); + rb_erase(&bundle->node, &trans->bundles); + spin_unlock(&trans->client_lock); + ASSERT(list_empty(&bundle->unused_conns)); + ASSERT(list_empty(&bundle->avail_conns)); + ASSERT(list_empty(&bundle->busy_conns)); + ASSERTCMP(bundle->num_conns, ==, 0); + key_put(bundle->key); + kfree(bundle); + } + + _leave(""); +} + +/* + * allocate a new connection + */ +static struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) +{ + struct rxrpc_connection *conn; + + _enter(""); + + conn = kzalloc(sizeof(struct rxrpc_connection), gfp); + if (conn) { + INIT_WORK(&conn->processor, &rxrpc_process_connection); + INIT_LIST_HEAD(&conn->bundle_link); + conn->calls = RB_ROOT; + skb_queue_head_init(&conn->rx_queue); + rwlock_init(&conn->lock); + spin_lock_init(&conn->state_lock); + atomic_set(&conn->usage, 1); + conn->debug_id = atomic_inc_return(&rxrpc_debug_id); + conn->avail_calls = RXRPC_MAXCALLS; + conn->size_align = 4; + conn->header_size = sizeof(struct rxrpc_header); + } + + _leave(" = %p{%d}", conn, conn ? 
conn->debug_id : 0); + return conn; +} + +/* + * assign a connection ID to a connection and add it to the transport's + * connection lookup tree + * - called with transport client lock held + */ +static void rxrpc_assign_connection_id(struct rxrpc_connection *conn) +{ + struct rxrpc_connection *xconn; + struct rb_node *parent, **p; + __be32 epoch; + u32 real_conn_id; + + _enter(""); + + epoch = conn->epoch; + + write_lock_bh(&conn->trans->conn_lock); + + conn->trans->conn_idcounter += RXRPC_CID_INC; + if (conn->trans->conn_idcounter < RXRPC_CID_INC) + conn->trans->conn_idcounter = RXRPC_CID_INC; + real_conn_id = conn->trans->conn_idcounter; + +attempt_insertion: + parent = NULL; + p = &conn->trans->client_conns.rb_node; + + while (*p) { + parent = *p; + xconn = rb_entry(parent, struct rxrpc_connection, node); + + if (epoch < xconn->epoch) + p = &(*p)->rb_left; + else if (epoch > xconn->epoch) + p = &(*p)->rb_right; + else if (real_conn_id < xconn->real_conn_id) + p = &(*p)->rb_left; + else if (real_conn_id > xconn->real_conn_id) + p = &(*p)->rb_right; + else + goto id_exists; + } + + /* we've found a suitable hole - arrange for this connection to occupy + * it */ + rb_link_node(&conn->node, parent, p); + rb_insert_color(&conn->node, &conn->trans->client_conns); + + conn->real_conn_id = real_conn_id; + conn->cid = htonl(real_conn_id); + write_unlock_bh(&conn->trans->conn_lock); + _leave(" [CONNID %x CID %x]", real_conn_id, ntohl(conn->cid)); + return; + + /* we found a connection with the proposed ID - walk the tree from that + * point looking for the next unused ID */ +id_exists: + for (;;) { + real_conn_id += RXRPC_CID_INC; + if (real_conn_id < RXRPC_CID_INC) { + real_conn_id = RXRPC_CID_INC; + conn->trans->conn_idcounter = real_conn_id; + goto attempt_insertion; + } + + parent = rb_next(parent); + if (!parent) + goto attempt_insertion; + + xconn = rb_entry(parent, struct rxrpc_connection, node); + if (epoch < xconn->epoch || + real_conn_id < xconn->real_conn_id) + goto attempt_insertion; + } +} + +/* + * add a call to a connection's call-by-ID tree + */ +static void rxrpc_add_call_ID_to_conn(struct rxrpc_connection *conn, + struct rxrpc_call *call) +{ + struct rxrpc_call *xcall; + struct rb_node *parent, **p; + __be32 call_id; + + write_lock_bh(&conn->lock); + + call_id = call->call_id; + p = &conn->calls.rb_node; + parent = NULL; + while (*p) { + parent = *p; + xcall = rb_entry(parent, struct rxrpc_call, conn_node); + + if (call_id < xcall->call_id) + p = &(*p)->rb_left; + else if (call_id > xcall->call_id) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&call->conn_node, parent, p); + rb_insert_color(&call->conn_node, &conn->calls); + + write_unlock_bh(&conn->lock); +} + +/* + * connect a call on an exclusive connection + */ +static int rxrpc_connect_exclusive(struct rxrpc_sock *rx, + struct rxrpc_transport *trans, + __be16 service_id, + struct rxrpc_call *call, + gfp_t gfp) +{ + struct rxrpc_connection *conn; + int chan, ret; + + _enter(""); + + conn = rx->conn; + if (!conn) { + /* not yet present - create a candidate for a new connection + * and then redo the check */ + conn = rxrpc_alloc_connection(gfp); + if (!conn) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + conn->trans = trans; + conn->bundle = NULL; + conn->service_id = service_id; + conn->epoch = rxrpc_epoch; + conn->in_clientflag = 0; + conn->out_clientflag = RXRPC_CLIENT_INITIATED; + conn->cid = 0; + conn->state = RXRPC_CONN_CLIENT; + conn->avail_calls = RXRPC_MAXCALLS - 1; + conn->security_level = 
rx->min_sec_level; + conn->key = key_get(rx->key); + + ret = rxrpc_init_client_conn_security(conn); + if (ret < 0) { + key_put(conn->key); + kfree(conn); + _leave(" = %d [key]", ret); + return ret; + } + + write_lock_bh(&rxrpc_connection_lock); + list_add_tail(&conn->link, &rxrpc_connections); + write_unlock_bh(&rxrpc_connection_lock); + + spin_lock(&trans->client_lock); + atomic_inc(&trans->usage); + + _net("CONNECT EXCL new %d on TRANS %d", + conn->debug_id, conn->trans->debug_id); + + rxrpc_assign_connection_id(conn); + rx->conn = conn; + } + + /* we've got a connection with a free channel and we can now attach the + * call to it + * - we're holding the transport's client lock + * - we're holding a reference on the connection + */ + for (chan = 0; chan < RXRPC_MAXCALLS; chan++) + if (!conn->channels[chan]) + goto found_channel; + goto no_free_channels; + +found_channel: + atomic_inc(&conn->usage); + conn->channels[chan] = call; + call->conn = conn; + call->channel = chan; + call->cid = conn->cid | htonl(chan); + call->call_id = htonl(++conn->call_counter); + + _net("CONNECT client on conn %d chan %d as call %x", + conn->debug_id, chan, ntohl(call->call_id)); + + spin_unlock(&trans->client_lock); + + rxrpc_add_call_ID_to_conn(conn, call); + _leave(" = 0"); + return 0; + +no_free_channels: + spin_unlock(&trans->client_lock); + _leave(" = -ENOSR"); + return -ENOSR; +} + +/* + * find a connection for a call + * - called in process context with IRQs enabled + */ +int rxrpc_connect_call(struct rxrpc_sock *rx, + struct rxrpc_transport *trans, + struct rxrpc_conn_bundle *bundle, + struct rxrpc_call *call, + gfp_t gfp) +{ + struct rxrpc_connection *conn, *candidate; + int chan, ret; + + DECLARE_WAITQUEUE(myself, current); + + _enter("%p,%lx,", rx, call->user_call_ID); + + if (test_bit(RXRPC_SOCK_EXCLUSIVE_CONN, &rx->flags)) + return rxrpc_connect_exclusive(rx, trans, bundle->service_id, + call, gfp); + + spin_lock(&trans->client_lock); + for (;;) { + /* see if the bundle has a call slot available */ + if (!list_empty(&bundle->avail_conns)) { + _debug("avail"); + conn = list_entry(bundle->avail_conns.next, + struct rxrpc_connection, + bundle_link); + if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { + list_del_init(&conn->bundle_link); + bundle->num_conns--; + continue; + } + if (--conn->avail_calls == 0) + list_move(&conn->bundle_link, + &bundle->busy_conns); + ASSERTCMP(conn->avail_calls, <, RXRPC_MAXCALLS); + ASSERT(conn->channels[0] == NULL || + conn->channels[1] == NULL || + conn->channels[2] == NULL || + conn->channels[3] == NULL); + atomic_inc(&conn->usage); + break; + } + + if (!list_empty(&bundle->unused_conns)) { + _debug("unused"); + conn = list_entry(bundle->unused_conns.next, + struct rxrpc_connection, + bundle_link); + if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { + list_del_init(&conn->bundle_link); + bundle->num_conns--; + continue; + } + ASSERTCMP(conn->avail_calls, ==, RXRPC_MAXCALLS); + conn->avail_calls = RXRPC_MAXCALLS - 1; + ASSERT(conn->channels[0] == NULL && + conn->channels[1] == NULL && + conn->channels[2] == NULL && + conn->channels[3] == NULL); + atomic_inc(&conn->usage); + list_move(&conn->bundle_link, &bundle->avail_conns); + break; + } + + /* need to allocate a new connection */ + _debug("get new conn [%d]", bundle->num_conns); + + spin_unlock(&trans->client_lock); + + if (signal_pending(current)) + goto interrupted; + + if (bundle->num_conns >= 20) { + _debug("too many conns"); + + if (!(gfp & __GFP_WAIT)) { + _leave(" = -EAGAIN"); + return -EAGAIN; + } + + 
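/* ----------------------------------------------------------------------
 * Editorial aside, not part of the patch: a minimal userspace sketch of
 * the CID/channel arithmetic used when a call is attached to a connection
 * above (call->cid = conn->cid | htonl(chan); call->call_id bumped from a
 * per-connection counter).  The OR works because both operands are in
 * network byte order and the connection's CID has its low channel bits
 * clear.  Four channels per connection is assumed, as RXRPC_MAXCALLS
 * implies; all type and function names below are invented for the example.
 * ---------------------------------------------------------------------- */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>		/* htonl()/ntohl() */

#define EX_MAXCALLS	4			/* channels per connection */
#define EX_CHANNELMASK	(EX_MAXCALLS - 1)	/* low bits of the CID */

struct ex_conn {
	uint32_t cid;			/* network-order CID, channel bits clear */
	uint32_t call_counter;		/* host-order, bumped per new call */
	void *channels[EX_MAXCALLS];
};

/* pick a free channel and derive the call's wire CID and call number */
static int ex_attach_call(struct ex_conn *conn, void *call,
			  uint32_t *cid, uint32_t *call_id)
{
	int chan;

	for (chan = 0; chan < EX_MAXCALLS; chan++) {
		if (!conn->channels[chan]) {
			conn->channels[chan] = call;
			*cid = conn->cid | htonl(chan);
			*call_id = htonl(++conn->call_counter);
			return chan;
		}
	}
	return -1;	/* all channels busy: caller must pick another conn */
}

int main(void)
{
	struct ex_conn conn = { .cid = htonl(0x2a2a2a00) };
	int a, b, chan;
	uint32_t cid, call_id;

	chan = ex_attach_call(&conn, &a, &cid, &call_id);
	printf("chan=%d cid=%08x call=%u\n",
	       chan, (unsigned)ntohl(cid), (unsigned)ntohl(call_id));
	chan = ex_attach_call(&conn, &b, &cid, &call_id);
	printf("chan=%d cid=%08x call=%u\n",
	       chan, (unsigned)ntohl(cid), (unsigned)ntohl(call_id));
	return 0;
}
/* ------------------------------- end of aside ------------------------- */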
add_wait_queue(&bundle->chanwait, &myself); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (bundle->num_conns < 20 || + !list_empty(&bundle->unused_conns) || + !list_empty(&bundle->avail_conns)) + break; + if (signal_pending(current)) + goto interrupted_dequeue; + schedule(); + } + remove_wait_queue(&bundle->chanwait, &myself); + __set_current_state(TASK_RUNNING); + spin_lock(&trans->client_lock); + continue; + } + + /* not yet present - create a candidate for a new connection and then + * redo the check */ + candidate = rxrpc_alloc_connection(gfp); + if (!candidate) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + candidate->trans = trans; + candidate->bundle = bundle; + candidate->service_id = bundle->service_id; + candidate->epoch = rxrpc_epoch; + candidate->in_clientflag = 0; + candidate->out_clientflag = RXRPC_CLIENT_INITIATED; + candidate->cid = 0; + candidate->state = RXRPC_CONN_CLIENT; + candidate->avail_calls = RXRPC_MAXCALLS; + candidate->security_level = rx->min_sec_level; + candidate->key = key_get(bundle->key); + + ret = rxrpc_init_client_conn_security(candidate); + if (ret < 0) { + key_put(candidate->key); + kfree(candidate); + _leave(" = %d [key]", ret); + return ret; + } + + write_lock_bh(&rxrpc_connection_lock); + list_add_tail(&candidate->link, &rxrpc_connections); + write_unlock_bh(&rxrpc_connection_lock); + + spin_lock(&trans->client_lock); + + list_add(&candidate->bundle_link, &bundle->unused_conns); + bundle->num_conns++; + atomic_inc(&bundle->usage); + atomic_inc(&trans->usage); + + _net("CONNECT new %d on TRANS %d", + candidate->debug_id, candidate->trans->debug_id); + + rxrpc_assign_connection_id(candidate); + if (candidate->security) + candidate->security->prime_packet_security(candidate); + + /* leave the candidate lurking in zombie mode attached to the + * bundle until we're ready for it */ + rxrpc_put_connection(candidate); + candidate = NULL; + } + + /* we've got a connection with a free channel and we can now attach the + * call to it + * - we're holding the transport's client lock + * - we're holding a reference on the connection + * - we're holding a reference on the bundle + */ + for (chan = 0; chan < RXRPC_MAXCALLS; chan++) + if (!conn->channels[chan]) + goto found_channel; + ASSERT(conn->channels[0] == NULL || + conn->channels[1] == NULL || + conn->channels[2] == NULL || + conn->channels[3] == NULL); + BUG(); + +found_channel: + conn->channels[chan] = call; + call->conn = conn; + call->channel = chan; + call->cid = conn->cid | htonl(chan); + call->call_id = htonl(++conn->call_counter); + + _net("CONNECT client on conn %d chan %d as call %x", + conn->debug_id, chan, ntohl(call->call_id)); + + ASSERTCMP(conn->avail_calls, <, RXRPC_MAXCALLS); + spin_unlock(&trans->client_lock); + + rxrpc_add_call_ID_to_conn(conn, call); + + _leave(" = 0"); + return 0; + +interrupted_dequeue: + remove_wait_queue(&bundle->chanwait, &myself); + __set_current_state(TASK_RUNNING); +interrupted: + _leave(" = -ERESTARTSYS"); + return -ERESTARTSYS; +} + +/* + * get a record of an incoming connection + */ +struct rxrpc_connection * +rxrpc_incoming_connection(struct rxrpc_transport *trans, + struct rxrpc_header *hdr, + gfp_t gfp) +{ + struct rxrpc_connection *conn, *candidate = NULL; + struct rb_node *p, **pp; + const char *new = "old"; + __be32 epoch; + u32 conn_id; + + _enter(""); + + ASSERT(hdr->flags & RXRPC_CLIENT_INITIATED); + + epoch = hdr->epoch; + conn_id = ntohl(hdr->cid) & RXRPC_CIDMASK; + + /* search the connection list first */ + 
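/* ----------------------------------------------------------------------
 * Editorial aside, not part of the patch: rxrpc_get_bundle(),
 * rxrpc_connect_call() and rxrpc_incoming_connection() all share the same
 * "look up under the lock, miss, drop the lock, allocate a candidate
 * (which may sleep), retake the lock, look up again, then insert or
 * discard the candidate" shape, because allocation cannot happen under the
 * spinlock.  Below is a minimal userspace analogue of that pattern using a
 * pthread mutex and a sorted list; every name here is invented for the
 * example and carries no meaning in the kernel code.
 * ---------------------------------------------------------------------- */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct ex_node {
	int key;
	struct ex_node *next;
};

static pthread_mutex_t ex_lock = PTHREAD_MUTEX_INITIALIZER;
static struct ex_node *ex_list;		/* sorted by key */

static struct ex_node *ex_find(int key)
{
	struct ex_node *p;

	for (p = ex_list; p && p->key <= key; p = p->next)
		if (p->key == key)
			return p;
	return NULL;
}

/* look up key, creating the entry if it does not exist yet */
static struct ex_node *ex_get(int key)
{
	struct ex_node *found, *candidate;

	/* first pass: cheap lookup under the lock */
	pthread_mutex_lock(&ex_lock);
	found = ex_find(key);
	pthread_mutex_unlock(&ex_lock);
	if (found)
		return found;

	/* allocation may block, so it happens outside the lock */
	candidate = calloc(1, sizeof(*candidate));
	if (!candidate)
		return NULL;
	candidate->key = key;

	/* second pass: someone may have inserted the key meanwhile */
	pthread_mutex_lock(&ex_lock);
	found = ex_find(key);
	if (!found) {
		struct ex_node **pp = &ex_list;

		while (*pp && (*pp)->key < key)
			pp = &(*pp)->next;
		candidate->next = *pp;
		*pp = candidate;
		found = candidate;
		candidate = NULL;
	}
	pthread_mutex_unlock(&ex_lock);

	free(candidate);	/* no-op if the candidate was inserted */
	return found;
}

int main(void)
{
	/* both calls return the same node; the second hits the fast path */
	printf("%p %p\n", (void *)ex_get(7), (void *)ex_get(7));
	return 0;
}
/* ------------------------------- end of aside ------------------------- */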
read_lock_bh(&trans->conn_lock); + + p = trans->server_conns.rb_node; + while (p) { + conn = rb_entry(p, struct rxrpc_connection, node); + + _debug("maybe %x", conn->real_conn_id); + + if (epoch < conn->epoch) + p = p->rb_left; + else if (epoch > conn->epoch) + p = p->rb_right; + else if (conn_id < conn->real_conn_id) + p = p->rb_left; + else if (conn_id > conn->real_conn_id) + p = p->rb_right; + else + goto found_extant_connection; + } + read_unlock_bh(&trans->conn_lock); + + /* not yet present - create a candidate for a new record and then + * redo the search */ + candidate = rxrpc_alloc_connection(gfp); + if (!candidate) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + candidate->trans = trans; + candidate->epoch = hdr->epoch; + candidate->cid = hdr->cid & cpu_to_be32(RXRPC_CIDMASK); + candidate->service_id = hdr->serviceId; + candidate->security_ix = hdr->securityIndex; + candidate->in_clientflag = RXRPC_CLIENT_INITIATED; + candidate->out_clientflag = 0; + candidate->real_conn_id = conn_id; + candidate->state = RXRPC_CONN_SERVER; + if (candidate->service_id) + candidate->state = RXRPC_CONN_SERVER_UNSECURED; + + write_lock_bh(&trans->conn_lock); + + pp = &trans->server_conns.rb_node; + p = NULL; + while (*pp) { + p = *pp; + conn = rb_entry(p, struct rxrpc_connection, node); + + if (epoch < conn->epoch) + pp = &(*pp)->rb_left; + else if (epoch > conn->epoch) + pp = &(*pp)->rb_right; + else if (conn_id < conn->real_conn_id) + pp = &(*pp)->rb_left; + else if (conn_id > conn->real_conn_id) + pp = &(*pp)->rb_right; + else + goto found_extant_second; + } + + /* we can now add the new candidate to the list */ + conn = candidate; + candidate = NULL; + rb_link_node(&conn->node, p, pp); + rb_insert_color(&conn->node, &trans->server_conns); + atomic_inc(&conn->trans->usage); + + write_unlock_bh(&trans->conn_lock); + + write_lock_bh(&rxrpc_connection_lock); + list_add_tail(&conn->link, &rxrpc_connections); + write_unlock_bh(&rxrpc_connection_lock); + + new = "new"; + +success: + _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->real_conn_id); + + _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); + return conn; + + /* we found the connection in the list immediately */ +found_extant_connection: + if (hdr->securityIndex != conn->security_ix) { + read_unlock_bh(&trans->conn_lock); + goto security_mismatch; + } + atomic_inc(&conn->usage); + read_unlock_bh(&trans->conn_lock); + goto success; + + /* we found the connection on the second time through the list */ +found_extant_second: + if (hdr->securityIndex != conn->security_ix) { + write_unlock_bh(&trans->conn_lock); + goto security_mismatch; + } + atomic_inc(&conn->usage); + write_unlock_bh(&trans->conn_lock); + kfree(candidate); + goto success; + +security_mismatch: + kfree(candidate); + _leave(" = -EKEYREJECTED"); + return ERR_PTR(-EKEYREJECTED); +} + +/* + * find a connection based on transport and RxRPC connection ID for an incoming + * packet + */ +struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans, + struct rxrpc_header *hdr) +{ + struct rxrpc_connection *conn; + struct rb_node *p; + __be32 epoch; + u32 conn_id; + + _enter(",{%x,%x}", ntohl(hdr->cid), hdr->flags); + + read_lock_bh(&trans->conn_lock); + + conn_id = ntohl(hdr->cid) & RXRPC_CIDMASK; + epoch = hdr->epoch; + + if (hdr->flags & RXRPC_CLIENT_INITIATED) + p = trans->server_conns.rb_node; + else + p = trans->client_conns.rb_node; + + while (p) { + conn = rb_entry(p, struct rxrpc_connection, node); + + _debug("maybe %x", 
conn->real_conn_id); + + if (epoch < conn->epoch) + p = p->rb_left; + else if (epoch > conn->epoch) + p = p->rb_right; + else if (conn_id < conn->real_conn_id) + p = p->rb_left; + else if (conn_id > conn->real_conn_id) + p = p->rb_right; + else + goto found; + } + + read_unlock_bh(&trans->conn_lock); + _leave(" = NULL"); + return NULL; + +found: + atomic_inc(&conn->usage); + read_unlock_bh(&trans->conn_lock); + _leave(" = %p", conn); + return conn; +} + +/* + * release a virtual connection + */ +void rxrpc_put_connection(struct rxrpc_connection *conn) +{ + _enter("%p{u=%d,d=%d}", + conn, atomic_read(&conn->usage), conn->debug_id); + + ASSERTCMP(atomic_read(&conn->usage), >, 0); + + conn->put_time = get_seconds(); + if (atomic_dec_and_test(&conn->usage)) { + _debug("zombie"); + rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); + } + + _leave(""); +} + +/* + * destroy a virtual connection + */ +static void rxrpc_destroy_connection(struct rxrpc_connection *conn) +{ + _enter("%p{%d}", conn, atomic_read(&conn->usage)); + + ASSERTCMP(atomic_read(&conn->usage), ==, 0); + + _net("DESTROY CONN %d", conn->debug_id); + + if (conn->bundle) + rxrpc_put_bundle(conn->trans, conn->bundle); + + ASSERT(RB_EMPTY_ROOT(&conn->calls)); + rxrpc_purge_queue(&conn->rx_queue); + + rxrpc_clear_conn_security(conn); + rxrpc_put_transport(conn->trans); + kfree(conn); + _leave(""); +} + +/* + * reap dead connections + */ +static void rxrpc_connection_reaper(struct work_struct *work) +{ + struct rxrpc_connection *conn, *_p; + unsigned long now, earliest, reap_time; + + LIST_HEAD(graveyard); + + _enter(""); + + now = get_seconds(); + earliest = ULONG_MAX; + + write_lock_bh(&rxrpc_connection_lock); + list_for_each_entry_safe(conn, _p, &rxrpc_connections, link) { + _debug("reap CONN %d { u=%d,t=%ld }", + conn->debug_id, atomic_read(&conn->usage), + (long) now - (long) conn->put_time); + + if (likely(atomic_read(&conn->usage) > 0)) + continue; + + spin_lock(&conn->trans->client_lock); + write_lock(&conn->trans->conn_lock); + reap_time = conn->put_time + rxrpc_connection_timeout; + + if (atomic_read(&conn->usage) > 0) { + ; + } else if (reap_time <= now) { + list_move_tail(&conn->link, &graveyard); + if (conn->out_clientflag) + rb_erase(&conn->node, + &conn->trans->client_conns); + else + rb_erase(&conn->node, + &conn->trans->server_conns); + if (conn->bundle) { + list_del_init(&conn->bundle_link); + conn->bundle->num_conns--; + } + + } else if (reap_time < earliest) { + earliest = reap_time; + } + + write_unlock(&conn->trans->conn_lock); + spin_unlock(&conn->trans->client_lock); + } + write_unlock_bh(&rxrpc_connection_lock); + + if (earliest != ULONG_MAX) { + _debug("reschedule reaper %ld", (long) earliest - now); + ASSERTCMP(earliest, >, now); + rxrpc_queue_delayed_work(&rxrpc_connection_reap, + (earliest - now) * HZ); + } + + /* then destroy all those pulled out */ + while (!list_empty(&graveyard)) { + conn = list_entry(graveyard.next, struct rxrpc_connection, + link); + list_del_init(&conn->link); + + ASSERTCMP(atomic_read(&conn->usage), ==, 0); + rxrpc_destroy_connection(conn); + } + + _leave(""); +} + +/* + * preemptively destroy all the connection records rather than waiting for them + * to time out + */ +void __exit rxrpc_destroy_all_connections(void) +{ + _enter(""); + + rxrpc_connection_timeout = 0; + cancel_delayed_work(&rxrpc_connection_reap); + rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); + + _leave(""); +} diff --git a/net/rxrpc/ar-connevent.c b/net/rxrpc/ar-connevent.c new file mode 100644 index 
00000000..e7ed43a5 --- /dev/null +++ b/net/rxrpc/ar-connevent.c @@ -0,0 +1,405 @@ +/* connection-level event handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * pass a connection-level abort onto all calls on that connection + */ +static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state, + u32 abort_code) +{ + struct rxrpc_call *call; + struct rb_node *p; + + _enter("{%d},%x", conn->debug_id, abort_code); + + read_lock_bh(&conn->lock); + + for (p = rb_first(&conn->calls); p; p = rb_next(p)) { + call = rb_entry(p, struct rxrpc_call, conn_node); + write_lock(&call->state_lock); + if (call->state <= RXRPC_CALL_COMPLETE) { + call->state = state; + call->abort_code = abort_code; + if (state == RXRPC_CALL_LOCALLY_ABORTED) + set_bit(RXRPC_CALL_CONN_ABORT, &call->events); + else + set_bit(RXRPC_CALL_RCVD_ABORT, &call->events); + rxrpc_queue_call(call); + } + write_unlock(&call->state_lock); + } + + read_unlock_bh(&conn->lock); + _leave(""); +} + +/* + * generate a connection-level abort + */ +static int rxrpc_abort_connection(struct rxrpc_connection *conn, + u32 error, u32 abort_code) +{ + struct rxrpc_header hdr; + struct msghdr msg; + struct kvec iov[2]; + __be32 word; + size_t len; + int ret; + + _enter("%d,,%u,%u", conn->debug_id, error, abort_code); + + /* generate a connection-level abort */ + spin_lock_bh(&conn->state_lock); + if (conn->state < RXRPC_CONN_REMOTELY_ABORTED) { + conn->state = RXRPC_CONN_LOCALLY_ABORTED; + conn->error = error; + spin_unlock_bh(&conn->state_lock); + } else { + spin_unlock_bh(&conn->state_lock); + _leave(" = 0 [already dead]"); + return 0; + } + + rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, abort_code); + + msg.msg_name = &conn->trans->peer->srx.transport.sin; + msg.msg_namelen = sizeof(conn->trans->peer->srx.transport.sin); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + hdr.epoch = conn->epoch; + hdr.cid = conn->cid; + hdr.callNumber = 0; + hdr.seq = 0; + hdr.type = RXRPC_PACKET_TYPE_ABORT; + hdr.flags = conn->out_clientflag; + hdr.userStatus = 0; + hdr.securityIndex = conn->security_ix; + hdr._rsvd = 0; + hdr.serviceId = conn->service_id; + + word = htonl(abort_code); + + iov[0].iov_base = &hdr; + iov[0].iov_len = sizeof(hdr); + iov[1].iov_base = &word; + iov[1].iov_len = sizeof(word); + + len = iov[0].iov_len + iov[1].iov_len; + + hdr.serial = htonl(atomic_inc_return(&conn->serial)); + _proto("Tx CONN ABORT %%%u { %d }", ntohl(hdr.serial), abort_code); + + ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len); + if (ret < 0) { + _debug("sendmsg failed: %d", ret); + return -EAGAIN; + } + + _leave(" = 0"); + return 0; +} + +/* + * mark a call as being on a now-secured channel + * - must be called with softirqs disabled + */ +static void rxrpc_call_is_secure(struct rxrpc_call *call) +{ + _enter("%p", call); + if (call) { + read_lock(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE && + !test_and_set_bit(RXRPC_CALL_SECURED, &call->events)) + rxrpc_queue_call(call); + read_unlock(&call->state_lock); + } +} + +/* + 
* connection-level Rx packet processor + */ +static int rxrpc_process_event(struct rxrpc_connection *conn, + struct sk_buff *skb, + u32 *_abort_code) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + __be32 tmp; + u32 serial; + int loop, ret; + + if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { + kleave(" = -ECONNABORTED [%u]", conn->state); + return -ECONNABORTED; + } + + serial = ntohl(sp->hdr.serial); + + _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, serial); + + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_ABORT: + if (skb_copy_bits(skb, 0, &tmp, sizeof(tmp)) < 0) + return -EPROTO; + _proto("Rx ABORT %%%u { ac=%d }", serial, ntohl(tmp)); + + conn->state = RXRPC_CONN_REMOTELY_ABORTED; + rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, + ntohl(tmp)); + return -ECONNABORTED; + + case RXRPC_PACKET_TYPE_CHALLENGE: + if (conn->security) + return conn->security->respond_to_challenge( + conn, skb, _abort_code); + return -EPROTO; + + case RXRPC_PACKET_TYPE_RESPONSE: + if (!conn->security) + return -EPROTO; + + ret = conn->security->verify_response(conn, skb, _abort_code); + if (ret < 0) + return ret; + + ret = conn->security->init_connection_security(conn); + if (ret < 0) + return ret; + + conn->security->prime_packet_security(conn); + read_lock_bh(&conn->lock); + spin_lock(&conn->state_lock); + + if (conn->state == RXRPC_CONN_SERVER_CHALLENGING) { + conn->state = RXRPC_CONN_SERVER; + for (loop = 0; loop < RXRPC_MAXCALLS; loop++) + rxrpc_call_is_secure(conn->channels[loop]); + } + + spin_unlock(&conn->state_lock); + read_unlock_bh(&conn->lock); + return 0; + + default: + _leave(" = -EPROTO [%u]", sp->hdr.type); + return -EPROTO; + } +} + +/* + * set up security and issue a challenge + */ +static void rxrpc_secure_connection(struct rxrpc_connection *conn) +{ + u32 abort_code; + int ret; + + _enter("{%d}", conn->debug_id); + + ASSERT(conn->security_ix != 0); + + if (!conn->key) { + _debug("set up security"); + ret = rxrpc_init_server_conn_security(conn); + switch (ret) { + case 0: + break; + case -ENOENT: + abort_code = RX_CALL_DEAD; + goto abort; + default: + abort_code = RXKADNOAUTH; + goto abort; + } + } + + ASSERT(conn->security != NULL); + + if (conn->security->issue_challenge(conn) < 0) { + abort_code = RX_CALL_DEAD; + ret = -ENOMEM; + goto abort; + } + + _leave(""); + return; + +abort: + _debug("abort %d, %d", ret, abort_code); + rxrpc_abort_connection(conn, -ret, abort_code); + _leave(" [aborted]"); +} + +/* + * connection-level event processor + */ +void rxrpc_process_connection(struct work_struct *work) +{ + struct rxrpc_connection *conn = + container_of(work, struct rxrpc_connection, processor); + struct sk_buff *skb; + u32 abort_code = RX_PROTOCOL_ERROR; + int ret; + + _enter("{%d}", conn->debug_id); + + atomic_inc(&conn->usage); + + if (test_and_clear_bit(RXRPC_CONN_CHALLENGE, &conn->events)) { + rxrpc_secure_connection(conn); + rxrpc_put_connection(conn); + } + + /* go through the conn-level event packets, releasing the ref on this + * connection that each one has when we've finished with it */ + while ((skb = skb_dequeue(&conn->rx_queue))) { + ret = rxrpc_process_event(conn, skb, &abort_code); + switch (ret) { + case -EPROTO: + case -EKEYEXPIRED: + case -EKEYREJECTED: + goto protocol_error; + case -EAGAIN: + goto requeue_and_leave; + case -ECONNABORTED: + default: + rxrpc_put_connection(conn); + rxrpc_free_skb(skb); + break; + } + } + +out: + rxrpc_put_connection(conn); + _leave(""); + return; + +requeue_and_leave: + skb_queue_head(&conn->rx_queue, skb); + goto 
out; + +protocol_error: + if (rxrpc_abort_connection(conn, -ret, abort_code) < 0) + goto requeue_and_leave; + rxrpc_put_connection(conn); + rxrpc_free_skb(skb); + _leave(" [EPROTO]"); + goto out; +} + +/* + * put a packet up for transport-level abort + */ +void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) +{ + CHECK_SLAB_OKAY(&local->usage); + + if (!atomic_inc_not_zero(&local->usage)) { + printk("resurrected on reject\n"); + BUG(); + } + + skb_queue_tail(&local->reject_queue, skb); + rxrpc_queue_work(&local->rejecter); +} + +/* + * reject packets through the local endpoint + */ +void rxrpc_reject_packets(struct work_struct *work) +{ + union { + struct sockaddr sa; + struct sockaddr_in sin; + } sa; + struct rxrpc_skb_priv *sp; + struct rxrpc_header hdr; + struct rxrpc_local *local; + struct sk_buff *skb; + struct msghdr msg; + struct kvec iov[2]; + size_t size; + __be32 code; + + local = container_of(work, struct rxrpc_local, rejecter); + rxrpc_get_local(local); + + _enter("%d", local->debug_id); + + iov[0].iov_base = &hdr; + iov[0].iov_len = sizeof(hdr); + iov[1].iov_base = &code; + iov[1].iov_len = sizeof(code); + size = sizeof(hdr) + sizeof(code); + + msg.msg_name = &sa; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + memset(&sa, 0, sizeof(sa)); + sa.sa.sa_family = local->srx.transport.family; + switch (sa.sa.sa_family) { + case AF_INET: + msg.msg_namelen = sizeof(sa.sin); + break; + default: + msg.msg_namelen = 0; + break; + } + + memset(&hdr, 0, sizeof(hdr)); + hdr.type = RXRPC_PACKET_TYPE_ABORT; + + while ((skb = skb_dequeue(&local->reject_queue))) { + sp = rxrpc_skb(skb); + switch (sa.sa.sa_family) { + case AF_INET: + sa.sin.sin_port = udp_hdr(skb)->source; + sa.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; + code = htonl(skb->priority); + + hdr.epoch = sp->hdr.epoch; + hdr.cid = sp->hdr.cid; + hdr.callNumber = sp->hdr.callNumber; + hdr.serviceId = sp->hdr.serviceId; + hdr.flags = sp->hdr.flags; + hdr.flags ^= RXRPC_CLIENT_INITIATED; + hdr.flags &= RXRPC_CLIENT_INITIATED; + + kernel_sendmsg(local->socket, &msg, iov, 2, size); + break; + + default: + break; + } + + rxrpc_free_skb(skb); + rxrpc_put_local(local); + } + + rxrpc_put_local(local); + _leave(""); +} diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c new file mode 100644 index 00000000..5d6b572a --- /dev/null +++ b/net/rxrpc/ar-error.c @@ -0,0 +1,251 @@ +/* Error message handling (ICMP) + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * handle an error received on the local endpoint + */ +void rxrpc_UDP_error_report(struct sock *sk) +{ + struct sock_exterr_skb *serr; + struct rxrpc_transport *trans; + struct rxrpc_local *local = sk->sk_user_data; + struct rxrpc_peer *peer; + struct sk_buff *skb; + __be32 addr; + __be16 port; + + _enter("%p{%d}", sk, local->debug_id); + + skb = skb_dequeue(&sk->sk_error_queue); + if (!skb) { + _leave("UDP socket errqueue empty"); + return; + } + + rxrpc_new_skb(skb); + + serr = SKB_EXT_ERR(skb); + addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset); + port = serr->port; + + _net("Rx UDP Error from %pI4:%hu", &addr, ntohs(port)); + _debug("Msg l:%d d:%d", skb->len, skb->data_len); + + peer = rxrpc_find_peer(local, addr, port); + if (IS_ERR(peer)) { + rxrpc_free_skb(skb); + _leave(" [no peer]"); + return; + } + + trans = rxrpc_find_transport(local, peer); + if (!trans) { + rxrpc_put_peer(peer); + rxrpc_free_skb(skb); + _leave(" [no trans]"); + return; + } + + if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP && + serr->ee.ee_type == ICMP_DEST_UNREACH && + serr->ee.ee_code == ICMP_FRAG_NEEDED + ) { + u32 mtu = serr->ee.ee_info; + + _net("Rx Received ICMP Fragmentation Needed (%d)", mtu); + + /* wind down the local interface MTU */ + if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu) { + peer->if_mtu = mtu; + _net("I/F MTU %u", mtu); + } + + /* ip_rt_frag_needed() may have eaten the info */ + if (mtu == 0) + mtu = ntohs(icmp_hdr(skb)->un.frag.mtu); + + if (mtu == 0) { + /* they didn't give us a size, estimate one */ + if (mtu > 1500) { + mtu >>= 1; + if (mtu < 1500) + mtu = 1500; + } else { + mtu -= 100; + if (mtu < peer->hdrsize) + mtu = peer->hdrsize + 4; + } + } + + if (mtu < peer->mtu) { + spin_lock_bh(&peer->lock); + peer->mtu = mtu; + peer->maxdata = peer->mtu - peer->hdrsize; + spin_unlock_bh(&peer->lock); + _net("Net MTU %u (maxdata %u)", + peer->mtu, peer->maxdata); + } + } + + rxrpc_put_peer(peer); + + /* pass the transport ref to error_handler to release */ + skb_queue_tail(&trans->error_queue, skb); + rxrpc_queue_work(&trans->error_handler); + + /* reset and regenerate socket error */ + spin_lock_bh(&sk->sk_error_queue.lock); + sk->sk_err = 0; + skb = skb_peek(&sk->sk_error_queue); + if (skb) { + sk->sk_err = SKB_EXT_ERR(skb)->ee.ee_errno; + spin_unlock_bh(&sk->sk_error_queue.lock); + sk->sk_error_report(sk); + } else { + spin_unlock_bh(&sk->sk_error_queue.lock); + } + + _leave(""); +} + +/* + * deal with UDP error messages + */ +void rxrpc_UDP_error_handler(struct work_struct *work) +{ + struct sock_extended_err *ee; + struct sock_exterr_skb *serr; + struct rxrpc_transport *trans = + container_of(work, struct rxrpc_transport, error_handler); + struct sk_buff *skb; + int err; + + _enter(""); + + skb = skb_dequeue(&trans->error_queue); + if (!skb) + return; + + serr = SKB_EXT_ERR(skb); + ee = &serr->ee; + + _net("Rx Error o=%d t=%d c=%d e=%d", + ee->ee_origin, ee->ee_type, ee->ee_code, ee->ee_errno); + + err = ee->ee_errno; + + switch (ee->ee_origin) { + case SO_EE_ORIGIN_ICMP: + switch (ee->ee_type) { + case ICMP_DEST_UNREACH: + switch (ee->ee_code) { + case ICMP_NET_UNREACH: + _net("Rx Received ICMP Network Unreachable"); + err = ENETUNREACH; + break; + case ICMP_HOST_UNREACH: + _net("Rx Received ICMP Host Unreachable"); + err = EHOSTUNREACH; + break; + case ICMP_PORT_UNREACH: + _net("Rx Received ICMP Port 
Unreachable"); + err = ECONNREFUSED; + break; + case ICMP_FRAG_NEEDED: + _net("Rx Received ICMP Fragmentation Needed (%d)", + ee->ee_info); + err = 0; /* dealt with elsewhere */ + break; + case ICMP_NET_UNKNOWN: + _net("Rx Received ICMP Unknown Network"); + err = ENETUNREACH; + break; + case ICMP_HOST_UNKNOWN: + _net("Rx Received ICMP Unknown Host"); + err = EHOSTUNREACH; + break; + default: + _net("Rx Received ICMP DestUnreach code=%u", + ee->ee_code); + break; + } + break; + + case ICMP_TIME_EXCEEDED: + _net("Rx Received ICMP TTL Exceeded"); + break; + + default: + _proto("Rx Received ICMP error { type=%u code=%u }", + ee->ee_type, ee->ee_code); + break; + } + break; + + case SO_EE_ORIGIN_LOCAL: + _proto("Rx Received local error { error=%d }", + ee->ee_errno); + break; + + case SO_EE_ORIGIN_NONE: + case SO_EE_ORIGIN_ICMP6: + default: + _proto("Rx Received error report { orig=%u }", + ee->ee_origin); + break; + } + + /* terminate all the affected calls if there's an unrecoverable + * error */ + if (err) { + struct rxrpc_call *call, *_n; + + _debug("ISSUE ERROR %d", err); + + spin_lock_bh(&trans->peer->lock); + trans->peer->net_error = err; + + list_for_each_entry_safe(call, _n, &trans->peer->error_targets, + error_link) { + write_lock(&call->state_lock); + if (call->state != RXRPC_CALL_COMPLETE && + call->state < RXRPC_CALL_NETWORK_ERROR) { + call->state = RXRPC_CALL_NETWORK_ERROR; + set_bit(RXRPC_CALL_RCVD_ERROR, &call->events); + rxrpc_queue_call(call); + } + write_unlock(&call->state_lock); + list_del_init(&call->error_link); + } + + spin_unlock_bh(&trans->peer->lock); + } + + if (!skb_queue_empty(&trans->error_queue)) + rxrpc_queue_work(&trans->error_handler); + + rxrpc_free_skb(skb); + rxrpc_put_transport(trans); + _leave(""); +} diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c new file mode 100644 index 00000000..1a2b0633 --- /dev/null +++ b/net/rxrpc/ar-input.c @@ -0,0 +1,804 @@ +/* RxRPC packet reception + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +unsigned long rxrpc_ack_timeout = 1; + +const char *rxrpc_pkts[] = { + "?00", + "DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG", + "?09", "?10", "?11", "?12", "?13", "?14", "?15" +}; + +/* + * queue a packet for recvmsg to pass to userspace + * - the caller must hold a lock on call->lock + * - must not be called with interrupts disabled (sk_filter() disables BH's) + * - eats the packet whether successful or not + * - there must be just one reference to the packet, which the caller passes to + * this function + */ +int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, + bool force, bool terminal) +{ + struct rxrpc_skb_priv *sp; + struct rxrpc_sock *rx = call->socket; + struct sock *sk; + int skb_len, ret; + + _enter(",,%d,%d", force, terminal); + + ASSERT(!irqs_disabled()); + + sp = rxrpc_skb(skb); + ASSERTCMP(sp->call, ==, call); + + /* if we've already posted the terminal message for a call, then we + * don't post any more */ + if (test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) { + _debug("already terminated"); + ASSERTCMP(call->state, >=, RXRPC_CALL_COMPLETE); + skb->destructor = NULL; + sp->call = NULL; + rxrpc_put_call(call); + rxrpc_free_skb(skb); + return 0; + } + + sk = &rx->sk; + + if (!force) { + /* cast skb->rcvbuf to unsigned... It's pointless, but + * reduces number of warnings when compiling with -W + * --ANK */ +// ret = -ENOBUFS; +// if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= +// (unsigned) sk->sk_rcvbuf) +// goto out; + + ret = sk_filter(sk, skb); + if (ret < 0) + goto out; + } + + spin_lock_bh(&sk->sk_receive_queue.lock); + if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags) && + !test_bit(RXRPC_CALL_RELEASED, &call->flags) && + call->socket->sk.sk_state != RXRPC_CLOSE) { + skb->destructor = rxrpc_packet_destructor; + skb->dev = NULL; + skb->sk = sk; + atomic_add(skb->truesize, &sk->sk_rmem_alloc); + + if (terminal) { + _debug("<<<< TERMINAL MESSAGE >>>>"); + set_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags); + } + + /* allow interception by a kernel service */ + if (rx->interceptor) { + rx->interceptor(sk, call->user_call_ID, skb); + spin_unlock_bh(&sk->sk_receive_queue.lock); + } else { + + /* Cache the SKB length before we tack it onto the + * receive queue. 
Once it is added it no longer + * belongs to us and may be freed by other threads of + * control pulling packets from the queue */ + skb_len = skb->len; + + _net("post skb %p", skb); + __skb_queue_tail(&sk->sk_receive_queue, skb); + spin_unlock_bh(&sk->sk_receive_queue.lock); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, skb_len); + } + skb = NULL; + } else { + spin_unlock_bh(&sk->sk_receive_queue.lock); + } + ret = 0; + +out: + /* release the socket buffer */ + if (skb) { + skb->destructor = NULL; + sp->call = NULL; + rxrpc_put_call(call); + rxrpc_free_skb(skb); + } + + _leave(" = %d", ret); + return ret; +} + +/* + * process a DATA packet, posting the packet to the appropriate queue + * - eats the packet if successful + */ +static int rxrpc_fast_process_data(struct rxrpc_call *call, + struct sk_buff *skb, u32 seq) +{ + struct rxrpc_skb_priv *sp; + bool terminal; + int ret, ackbit, ack; + + _enter("{%u,%u},,{%u}", call->rx_data_post, call->rx_first_oos, seq); + + sp = rxrpc_skb(skb); + ASSERTCMP(sp->call, ==, NULL); + + spin_lock(&call->lock); + + if (call->state > RXRPC_CALL_COMPLETE) + goto discard; + + ASSERTCMP(call->rx_data_expect, >=, call->rx_data_post); + ASSERTCMP(call->rx_data_post, >=, call->rx_data_recv); + ASSERTCMP(call->rx_data_recv, >=, call->rx_data_eaten); + + if (seq < call->rx_data_post) { + _debug("dup #%u [-%u]", seq, call->rx_data_post); + ack = RXRPC_ACK_DUPLICATE; + ret = -ENOBUFS; + goto discard_and_ack; + } + + /* we may already have the packet in the out of sequence queue */ + ackbit = seq - (call->rx_data_eaten + 1); + ASSERTCMP(ackbit, >=, 0); + if (__test_and_set_bit(ackbit, call->ackr_window)) { + _debug("dup oos #%u [%u,%u]", + seq, call->rx_data_eaten, call->rx_data_post); + ack = RXRPC_ACK_DUPLICATE; + goto discard_and_ack; + } + + if (seq >= call->ackr_win_top) { + _debug("exceed #%u [%u]", seq, call->ackr_win_top); + __clear_bit(ackbit, call->ackr_window); + ack = RXRPC_ACK_EXCEEDS_WINDOW; + goto discard_and_ack; + } + + if (seq == call->rx_data_expect) { + clear_bit(RXRPC_CALL_EXPECT_OOS, &call->flags); + call->rx_data_expect++; + } else if (seq > call->rx_data_expect) { + _debug("oos #%u [%u]", seq, call->rx_data_expect); + call->rx_data_expect = seq + 1; + if (test_and_set_bit(RXRPC_CALL_EXPECT_OOS, &call->flags)) { + ack = RXRPC_ACK_OUT_OF_SEQUENCE; + goto enqueue_and_ack; + } + goto enqueue_packet; + } + + if (seq != call->rx_data_post) { + _debug("ahead #%u [%u]", seq, call->rx_data_post); + goto enqueue_packet; + } + + if (test_bit(RXRPC_CALL_RCVD_LAST, &call->flags)) + goto protocol_error; + + /* if the packet need security things doing to it, then it goes down + * the slow path */ + if (call->conn->security) + goto enqueue_packet; + + sp->call = call; + rxrpc_get_call(call); + terminal = ((sp->hdr.flags & RXRPC_LAST_PACKET) && + !(sp->hdr.flags & RXRPC_CLIENT_INITIATED)); + ret = rxrpc_queue_rcv_skb(call, skb, false, terminal); + if (ret < 0) { + if (ret == -ENOMEM || ret == -ENOBUFS) { + __clear_bit(ackbit, call->ackr_window); + ack = RXRPC_ACK_NOSPACE; + goto discard_and_ack; + } + goto out; + } + + skb = NULL; + + _debug("post #%u", seq); + ASSERTCMP(call->rx_data_post, ==, seq); + call->rx_data_post++; + + if (sp->hdr.flags & RXRPC_LAST_PACKET) + set_bit(RXRPC_CALL_RCVD_LAST, &call->flags); + + /* if we've reached an out of sequence packet then we need to drain + * that queue into the socket Rx queue now */ + if (call->rx_data_post == call->rx_first_oos) { + _debug("drain rx oos now"); + read_lock(&call->state_lock); + if 
(call->state < RXRPC_CALL_COMPLETE && + !test_and_set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events)) + rxrpc_queue_call(call); + read_unlock(&call->state_lock); + } + + spin_unlock(&call->lock); + atomic_inc(&call->ackr_not_idle); + rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, sp->hdr.serial, false); + _leave(" = 0 [posted]"); + return 0; + +protocol_error: + ret = -EBADMSG; +out: + spin_unlock(&call->lock); + _leave(" = %d", ret); + return ret; + +discard_and_ack: + _debug("discard and ACK packet %p", skb); + __rxrpc_propose_ACK(call, ack, sp->hdr.serial, true); +discard: + spin_unlock(&call->lock); + rxrpc_free_skb(skb); + _leave(" = 0 [discarded]"); + return 0; + +enqueue_and_ack: + __rxrpc_propose_ACK(call, ack, sp->hdr.serial, true); +enqueue_packet: + _net("defer skb %p", skb); + spin_unlock(&call->lock); + skb_queue_tail(&call->rx_queue, skb); + atomic_inc(&call->ackr_not_idle); + read_lock(&call->state_lock); + if (call->state < RXRPC_CALL_DEAD) + rxrpc_queue_call(call); + read_unlock(&call->state_lock); + _leave(" = 0 [queued]"); + return 0; +} + +/* + * assume an implicit ACKALL of the transmission phase of a client socket upon + * reception of the first reply packet + */ +static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial) +{ + write_lock_bh(&call->state_lock); + + switch (call->state) { + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + call->state = RXRPC_CALL_CLIENT_RECV_REPLY; + call->acks_latest = serial; + + _debug("implicit ACKALL %%%u", call->acks_latest); + set_bit(RXRPC_CALL_RCVD_ACKALL, &call->events); + write_unlock_bh(&call->state_lock); + + if (try_to_del_timer_sync(&call->resend_timer) >= 0) { + clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events); + clear_bit(RXRPC_CALL_RESEND, &call->events); + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + } + break; + + default: + write_unlock_bh(&call->state_lock); + break; + } +} + +/* + * post an incoming packet to the nominated call to deal with + * - must get rid of the sk_buff, either by freeing it or by queuing it + */ +void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + __be32 _abort_code; + u32 serial, hi_serial, seq, abort_code; + + _enter("%p,%p", call, skb); + + ASSERT(!irqs_disabled()); + +#if 0 // INJECT RX ERROR + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) { + static int skip = 0; + if (++skip == 3) { + printk("DROPPED 3RD PACKET!!!!!!!!!!!!!\n"); + skip = 0; + goto free_packet; + } + } +#endif + + /* track the latest serial number on this connection for ACK packet + * information */ + serial = ntohl(sp->hdr.serial); + hi_serial = atomic_read(&call->conn->hi_serial); + while (serial > hi_serial) + hi_serial = atomic_cmpxchg(&call->conn->hi_serial, hi_serial, + serial); + + /* request ACK generation for any ACK or DATA packet that requests + * it */ + if (sp->hdr.flags & RXRPC_REQUEST_ACK) { + _proto("ACK Requested on %%%u", serial); + rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, sp->hdr.serial, + !(sp->hdr.flags & RXRPC_MORE_PACKETS)); + } + + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_ABORT: + _debug("abort"); + + if (skb_copy_bits(skb, 0, &_abort_code, + sizeof(_abort_code)) < 0) + goto protocol_error; + + abort_code = ntohl(_abort_code); + _proto("Rx ABORT %%%u { %x }", serial, abort_code); + + write_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE) { + call->state = RXRPC_CALL_REMOTELY_ABORTED; + call->abort_code = abort_code; + set_bit(RXRPC_CALL_RCVD_ABORT, &call->events); + 
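/* ----------------------------------------------------------------------
 * Editorial aside, not part of the patch: rxrpc_fast_process_data() above
 * classifies each arriving DATA sequence number against a receive window
 * kept as a bitmap (duplicate, beyond the window, in order, or out of
 * sequence).  The sketch below is a simplified userspace rendering of that
 * decision only: the real code tests and re-clears the window bit around
 * the window check, and advancing rx_data_post/rx_data_eaten as packets
 * are posted and consumed is omitted here.  All names are invented for the
 * example.
 * ---------------------------------------------------------------------- */
#include <stdio.h>
#include <stdint.h>

#define EX_WINSZ 32	/* receive window, one bit per pending sequence number */

struct ex_rx {
	uint32_t data_eaten;	/* last seq consumed by the application */
	uint32_t data_post;	/* next seq expected to be posted in order */
	uint32_t win_top;	/* first seq beyond the advertised window */
	uint32_t window;	/* bit n set => seq data_eaten + 1 + n received */
};

enum ex_verdict { EX_DUP, EX_EXCEEDS, EX_IN_ORDER, EX_OUT_OF_SEQ };

/* classify an arriving DATA sequence number against the window */
static enum ex_verdict ex_classify(struct ex_rx *rx, uint32_t seq)
{
	uint32_t bit;

	if (seq < rx->data_post)
		return EX_DUP;			/* already posted upwards */
	if (seq >= rx->win_top)
		return EX_EXCEEDS;		/* peer overran the window */

	bit = seq - (rx->data_eaten + 1);
	if (rx->window & (1u << bit))
		return EX_DUP;			/* already held out of sequence */

	rx->window |= 1u << bit;
	return seq == rx->data_post ? EX_IN_ORDER : EX_OUT_OF_SEQ;
}

int main(void)
{
	struct ex_rx rx = {
		.data_eaten = 0, .data_post = 1, .win_top = 1 + EX_WINSZ,
	};

	/* prints 2 3 0: in order, out of sequence, duplicate */
	printf("%d %d %d\n", ex_classify(&rx, 1), ex_classify(&rx, 3),
	       ex_classify(&rx, 1));
	return 0;
}
/* ------------------------------- end of aside ------------------------- */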
rxrpc_queue_call(call); + } + goto free_packet_unlock; + + case RXRPC_PACKET_TYPE_BUSY: + _proto("Rx BUSY %%%u", serial); + + if (call->conn->out_clientflag) + goto protocol_error; + + write_lock_bh(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_CLIENT_SEND_REQUEST: + call->state = RXRPC_CALL_SERVER_BUSY; + set_bit(RXRPC_CALL_RCVD_BUSY, &call->events); + rxrpc_queue_call(call); + case RXRPC_CALL_SERVER_BUSY: + goto free_packet_unlock; + default: + goto protocol_error_locked; + } + + default: + _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], serial); + goto protocol_error; + + case RXRPC_PACKET_TYPE_DATA: + seq = ntohl(sp->hdr.seq); + + _proto("Rx DATA %%%u { #%u }", serial, seq); + + if (seq == 0) + goto protocol_error; + + call->ackr_prev_seq = sp->hdr.seq; + + /* received data implicitly ACKs all of the request packets we + * sent when we're acting as a client */ + if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) + rxrpc_assume_implicit_ackall(call, serial); + + switch (rxrpc_fast_process_data(call, skb, seq)) { + case 0: + skb = NULL; + goto done; + + default: + BUG(); + + /* data packet received beyond the last packet */ + case -EBADMSG: + goto protocol_error; + } + + case RXRPC_PACKET_TYPE_ACKALL: + case RXRPC_PACKET_TYPE_ACK: + /* ACK processing is done in process context */ + read_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_DEAD) { + skb_queue_tail(&call->rx_queue, skb); + rxrpc_queue_call(call); + skb = NULL; + } + read_unlock_bh(&call->state_lock); + goto free_packet; + } + +protocol_error: + _debug("protocol error"); + write_lock_bh(&call->state_lock); +protocol_error_locked: + if (call->state <= RXRPC_CALL_COMPLETE) { + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->abort_code = RX_PROTOCOL_ERROR; + set_bit(RXRPC_CALL_ABORT, &call->events); + rxrpc_queue_call(call); + } +free_packet_unlock: + write_unlock_bh(&call->state_lock); +free_packet: + rxrpc_free_skb(skb); +done: + _leave(""); +} + +/* + * split up a jumbo data packet + */ +static void rxrpc_process_jumbo_packet(struct rxrpc_call *call, + struct sk_buff *jumbo) +{ + struct rxrpc_jumbo_header jhdr; + struct rxrpc_skb_priv *sp; + struct sk_buff *part; + + _enter(",{%u,%u}", jumbo->data_len, jumbo->len); + + sp = rxrpc_skb(jumbo); + + do { + sp->hdr.flags &= ~RXRPC_JUMBO_PACKET; + + /* make a clone to represent the first subpacket in what's left + * of the jumbo packet */ + part = skb_clone(jumbo, GFP_ATOMIC); + if (!part) { + /* simply ditch the tail in the event of ENOMEM */ + pskb_trim(jumbo, RXRPC_JUMBO_DATALEN); + break; + } + rxrpc_new_skb(part); + + pskb_trim(part, RXRPC_JUMBO_DATALEN); + + if (!pskb_pull(jumbo, RXRPC_JUMBO_DATALEN)) + goto protocol_error; + + if (skb_copy_bits(jumbo, 0, &jhdr, sizeof(jhdr)) < 0) + goto protocol_error; + if (!pskb_pull(jumbo, sizeof(jhdr))) + BUG(); + + sp->hdr.seq = htonl(ntohl(sp->hdr.seq) + 1); + sp->hdr.serial = htonl(ntohl(sp->hdr.serial) + 1); + sp->hdr.flags = jhdr.flags; + sp->hdr._rsvd = jhdr._rsvd; + + _proto("Rx DATA Jumbo %%%u", ntohl(sp->hdr.serial) - 1); + + rxrpc_fast_process_packet(call, part); + part = NULL; + + } while (sp->hdr.flags & RXRPC_JUMBO_PACKET); + + rxrpc_fast_process_packet(call, jumbo); + _leave(""); + return; + +protocol_error: + _debug("protocol error"); + rxrpc_free_skb(part); + rxrpc_free_skb(jumbo); + write_lock_bh(&call->state_lock); + if (call->state <= RXRPC_CALL_COMPLETE) { + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->abort_code = RX_PROTOCOL_ERROR; + set_bit(RXRPC_CALL_ABORT, &call->events); + 
rxrpc_queue_call(call); + } + write_unlock_bh(&call->state_lock); + _leave(""); +} + +/* + * post an incoming packet to the appropriate call/socket to deal with + * - must get rid of the sk_buff, either by freeing it or by queuing it + */ +static void rxrpc_post_packet_to_call(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp; + struct rxrpc_call *call; + struct rb_node *p; + __be32 call_id; + + _enter("%p,%p", conn, skb); + + read_lock_bh(&conn->lock); + + sp = rxrpc_skb(skb); + + /* look at extant calls by channel number first */ + call = conn->channels[ntohl(sp->hdr.cid) & RXRPC_CHANNELMASK]; + if (!call || call->call_id != sp->hdr.callNumber) + goto call_not_extant; + + _debug("extant call [%d]", call->state); + ASSERTCMP(call->conn, ==, conn); + + read_lock(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_LOCALLY_ABORTED: + if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events)) + rxrpc_queue_call(call); + case RXRPC_CALL_REMOTELY_ABORTED: + case RXRPC_CALL_NETWORK_ERROR: + case RXRPC_CALL_DEAD: + goto free_unlock; + default: + break; + } + + read_unlock(&call->state_lock); + rxrpc_get_call(call); + read_unlock_bh(&conn->lock); + + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && + sp->hdr.flags & RXRPC_JUMBO_PACKET) + rxrpc_process_jumbo_packet(call, skb); + else + rxrpc_fast_process_packet(call, skb); + + rxrpc_put_call(call); + goto done; + +call_not_extant: + /* search the completed calls in case what we're dealing with is + * there */ + _debug("call not extant"); + + call_id = sp->hdr.callNumber; + p = conn->calls.rb_node; + while (p) { + call = rb_entry(p, struct rxrpc_call, conn_node); + + if (call_id < call->call_id) + p = p->rb_left; + else if (call_id > call->call_id) + p = p->rb_right; + else + goto found_completed_call; + } + +dead_call: + /* it's a either a really old call that we no longer remember or its a + * new incoming call */ + read_unlock_bh(&conn->lock); + + if (sp->hdr.flags & RXRPC_CLIENT_INITIATED && + sp->hdr.seq == cpu_to_be32(1)) { + _debug("incoming call"); + skb_queue_tail(&conn->trans->local->accept_queue, skb); + rxrpc_queue_work(&conn->trans->local->acceptor); + goto done; + } + + _debug("dead call"); + skb->priority = RX_CALL_DEAD; + rxrpc_reject_packet(conn->trans->local, skb); + goto done; + + /* resend last packet of a completed call + * - client calls may have been aborted or ACK'd + * - server calls may have been aborted + */ +found_completed_call: + _debug("completed call"); + + if (atomic_read(&call->usage) == 0) + goto dead_call; + + /* synchronise any state changes */ + read_lock(&call->state_lock); + ASSERTIFCMP(call->state != RXRPC_CALL_CLIENT_FINAL_ACK, + call->state, >=, RXRPC_CALL_COMPLETE); + + if (call->state == RXRPC_CALL_LOCALLY_ABORTED || + call->state == RXRPC_CALL_REMOTELY_ABORTED || + call->state == RXRPC_CALL_DEAD) { + read_unlock(&call->state_lock); + goto dead_call; + } + + if (call->conn->in_clientflag) { + read_unlock(&call->state_lock); + goto dead_call; /* complete server call */ + } + + _debug("final ack again"); + rxrpc_get_call(call); + set_bit(RXRPC_CALL_ACK_FINAL, &call->events); + rxrpc_queue_call(call); + +free_unlock: + read_unlock(&call->state_lock); + read_unlock_bh(&conn->lock); + rxrpc_free_skb(skb); +done: + _leave(""); +} + +/* + * post connection-level events to the connection + * - this includes challenges, responses and some aborts + */ +static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + _enter("%p,%p", conn, skb); 
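The jumbo handling in rxrpc_process_jumbo_packet() above peels one fixed-size subpacket off the front of the payload, reads the 4-byte secondary header that follows it, and keeps going while that header's jumbo flag remains set. The sketch below is a standalone model of that walk which merely counts the subpackets; the two constants are assumed stand-ins for RXRPC_JUMBO_DATALEN and the RXRPC_JUMBO_PACKET flag, not values taken from this patch.

#include <stddef.h>
#include <stdint.h>

#define SKETCH_JUMBO_DATALEN 1412	/* assumed RXRPC_JUMBO_DATALEN */
#define SKETCH_JUMBO_FLAG    0x20	/* assumed RXRPC_JUMBO_PACKET flag */

/* Count the DATA subpackets in a received payload, given the flags byte of
 * the leading RxRPC header.  Returns 0 if a jumbo packet is truncated. */
static size_t sketch_count_subpackets(uint8_t hdr_flags,
				      const uint8_t *payload, size_t len)
{
	size_t n = 1;

	while (hdr_flags & SKETCH_JUMBO_FLAG) {
		/* each non-terminal subpacket is followed by a 4-byte
		 * secondary header holding the next subpacket's flags */
		if (len < SKETCH_JUMBO_DATALEN + 4)
			return 0;
		hdr_flags = payload[SKETCH_JUMBO_DATALEN];
		payload += SKETCH_JUMBO_DATALEN + 4;
		len -= SKETCH_JUMBO_DATALEN + 4;
		n++;
	}
	return n;
}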
+ + atomic_inc(&conn->usage); + skb_queue_tail(&conn->rx_queue, skb); + rxrpc_queue_conn(conn); +} + +/* + * handle data received on the local endpoint + * - may be called in interrupt context + */ +void rxrpc_data_ready(struct sock *sk, int count) +{ + struct rxrpc_connection *conn; + struct rxrpc_transport *trans; + struct rxrpc_skb_priv *sp; + struct rxrpc_local *local; + struct rxrpc_peer *peer; + struct sk_buff *skb; + int ret; + + _enter("%p, %d", sk, count); + + ASSERT(!irqs_disabled()); + + read_lock_bh(&rxrpc_local_lock); + local = sk->sk_user_data; + if (local && atomic_read(&local->usage) > 0) + rxrpc_get_local(local); + else + local = NULL; + read_unlock_bh(&rxrpc_local_lock); + if (!local) { + _leave(" [local dead]"); + return; + } + + skb = skb_recv_datagram(sk, 0, 1, &ret); + if (!skb) { + rxrpc_put_local(local); + if (ret == -EAGAIN) + return; + _debug("UDP socket error %d", ret); + return; + } + + rxrpc_new_skb(skb); + + _net("recv skb %p", skb); + + /* we'll probably need to checksum it (didn't call sock_recvmsg) */ + if (skb_checksum_complete(skb)) { + rxrpc_free_skb(skb); + rxrpc_put_local(local); + UDP_INC_STATS_BH(&init_net, UDP_MIB_INERRORS, 0); + _leave(" [CSUM failed]"); + return; + } + + UDP_INC_STATS_BH(&init_net, UDP_MIB_INDATAGRAMS, 0); + + /* the socket buffer we have is owned by UDP, with UDP's data all over + * it, but we really want our own */ + skb_orphan(skb); + sp = rxrpc_skb(skb); + memset(sp, 0, sizeof(*sp)); + + _net("Rx UDP packet from %08x:%04hu", + ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source)); + + /* dig out the RxRPC connection details */ + if (skb_copy_bits(skb, sizeof(struct udphdr), &sp->hdr, + sizeof(sp->hdr)) < 0) + goto bad_message; + if (!pskb_pull(skb, sizeof(struct udphdr) + sizeof(sp->hdr))) + BUG(); + + _net("Rx RxRPC %s ep=%x call=%x:%x", + sp->hdr.flags & RXRPC_CLIENT_INITIATED ? 
"ToServer" : "ToClient", + ntohl(sp->hdr.epoch), + ntohl(sp->hdr.cid), + ntohl(sp->hdr.callNumber)); + + if (sp->hdr.type == 0 || sp->hdr.type >= RXRPC_N_PACKET_TYPES) { + _proto("Rx Bad Packet Type %u", sp->hdr.type); + goto bad_message; + } + + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && + (sp->hdr.callNumber == 0 || sp->hdr.seq == 0)) + goto bad_message; + + peer = rxrpc_find_peer(local, ip_hdr(skb)->saddr, udp_hdr(skb)->source); + if (IS_ERR(peer)) + goto cant_route_call; + + trans = rxrpc_find_transport(local, peer); + rxrpc_put_peer(peer); + if (!trans) + goto cant_route_call; + + conn = rxrpc_find_connection(trans, &sp->hdr); + rxrpc_put_transport(trans); + if (!conn) + goto cant_route_call; + + _debug("CONN %p {%d}", conn, conn->debug_id); + + if (sp->hdr.callNumber == 0) + rxrpc_post_packet_to_conn(conn, skb); + else + rxrpc_post_packet_to_call(conn, skb); + rxrpc_put_connection(conn); + rxrpc_put_local(local); + return; + +cant_route_call: + _debug("can't route call"); + if (sp->hdr.flags & RXRPC_CLIENT_INITIATED && + sp->hdr.type == RXRPC_PACKET_TYPE_DATA) { + if (sp->hdr.seq == cpu_to_be32(1)) { + _debug("first packet"); + skb_queue_tail(&local->accept_queue, skb); + rxrpc_queue_work(&local->acceptor); + rxrpc_put_local(local); + _leave(" [incoming]"); + return; + } + skb->priority = RX_INVALID_OPERATION; + } else { + skb->priority = RX_CALL_DEAD; + } + + _debug("reject"); + rxrpc_reject_packet(local, skb); + rxrpc_put_local(local); + _leave(" [no call]"); + return; + +bad_message: + skb->priority = RX_PROTOCOL_ERROR; + rxrpc_reject_packet(local, skb); + rxrpc_put_local(local); + _leave(" [badmsg]"); +} diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h new file mode 100644 index 00000000..8e22bd34 --- /dev/null +++ b/net/rxrpc/ar-internal.h @@ -0,0 +1,786 @@ +/* AF_RXRPC internal definitions + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include + +#if 0 +#define CHECK_SLAB_OKAY(X) \ + BUG_ON(atomic_read((X)) >> (sizeof(atomic_t) - 2) == \ + (POISON_FREE << 8 | POISON_FREE)) +#else +#define CHECK_SLAB_OKAY(X) do {} while(0) +#endif + +#define FCRYPT_BSIZE 8 +struct rxrpc_crypt { + union { + u8 x[FCRYPT_BSIZE]; + __be32 n[2]; + }; +} __attribute__((aligned(8))); + +#define rxrpc_queue_work(WS) queue_work(rxrpc_workqueue, (WS)) +#define rxrpc_queue_delayed_work(WS,D) \ + queue_delayed_work(rxrpc_workqueue, (WS), (D)) + +#define rxrpc_queue_call(CALL) rxrpc_queue_work(&(CALL)->processor) +#define rxrpc_queue_conn(CONN) rxrpc_queue_work(&(CONN)->processor) + +/* + * sk_state for RxRPC sockets + */ +enum { + RXRPC_UNCONNECTED = 0, + RXRPC_CLIENT_BOUND, /* client local address bound */ + RXRPC_CLIENT_CONNECTED, /* client is connected */ + RXRPC_SERVER_BOUND, /* server local address bound */ + RXRPC_SERVER_LISTENING, /* server listening for connections */ + RXRPC_CLOSE, /* socket is being closed */ +}; + +/* + * RxRPC socket definition + */ +struct rxrpc_sock { + /* WARNING: sk has to be the first member */ + struct sock sk; + rxrpc_interceptor_t interceptor; /* kernel service Rx interceptor function */ + struct rxrpc_local *local; /* local endpoint */ + struct rxrpc_transport *trans; /* transport handler */ + struct rxrpc_conn_bundle *bundle; /* virtual connection bundle */ + struct rxrpc_connection *conn; /* exclusive virtual connection */ + struct list_head listen_link; /* link in the local endpoint's listen list */ + struct list_head secureq; /* calls awaiting connection security clearance */ + struct list_head acceptq; /* calls awaiting acceptance */ + struct key *key; /* security for this socket */ + struct key *securities; /* list of server security descriptors */ + struct rb_root calls; /* outstanding calls on this socket */ + unsigned long flags; +#define RXRPC_SOCK_EXCLUSIVE_CONN 1 /* exclusive connection for a client socket */ + rwlock_t call_lock; /* lock for calls */ + u32 min_sec_level; /* minimum security level */ +#define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT + struct sockaddr_rxrpc srx; /* local address */ + sa_family_t proto; /* protocol created with */ + __be16 service_id; /* service ID of local/remote service */ +}; + +#define rxrpc_sk(__sk) container_of((__sk), struct rxrpc_sock, sk) + +/* + * RxRPC socket buffer private variables + * - max 48 bytes (struct sk_buff::cb) + */ +struct rxrpc_skb_priv { + struct rxrpc_call *call; /* call with which associated */ + unsigned long resend_at; /* time in jiffies at which to resend */ + union { + unsigned offset; /* offset into buffer of next read */ + int remain; /* amount of space remaining for next write */ + u32 error; /* network error code */ + bool need_resend; /* T if needs resending */ + }; + + struct rxrpc_header hdr; /* RxRPC packet header from this packet */ +}; + +#define rxrpc_skb(__skb) ((struct rxrpc_skb_priv *) &(__skb)->cb) + +enum rxrpc_command { + RXRPC_CMD_SEND_DATA, /* send data message */ + RXRPC_CMD_SEND_ABORT, /* request abort generation */ + RXRPC_CMD_ACCEPT, /* [server] accept incoming call */ + RXRPC_CMD_REJECT_BUSY, /* [server] reject a call as busy */ +}; + +/* + * RxRPC security module interface + */ +struct rxrpc_security { + struct module *owner; /* providing module */ + struct list_head link; /* link in master list */ + const char *name; /* name of this service */ + u8 security_index; /* security type provided */ + + /* initialise a connection's security */ + int (*init_connection_security)(struct rxrpc_connection *); + + 
/* prime a connection's packet security */ + void (*prime_packet_security)(struct rxrpc_connection *); + + /* impose security on a packet */ + int (*secure_packet)(const struct rxrpc_call *, + struct sk_buff *, + size_t, + void *); + + /* verify the security on a received packet */ + int (*verify_packet)(const struct rxrpc_call *, struct sk_buff *, + u32 *); + + /* issue a challenge */ + int (*issue_challenge)(struct rxrpc_connection *); + + /* respond to a challenge */ + int (*respond_to_challenge)(struct rxrpc_connection *, + struct sk_buff *, + u32 *); + + /* verify a response */ + int (*verify_response)(struct rxrpc_connection *, + struct sk_buff *, + u32 *); + + /* clear connection security */ + void (*clear)(struct rxrpc_connection *); +}; + +/* + * RxRPC local transport endpoint definition + * - matched by local port, address and protocol type + */ +struct rxrpc_local { + struct socket *socket; /* my UDP socket */ + struct work_struct destroyer; /* endpoint destroyer */ + struct work_struct acceptor; /* incoming call processor */ + struct work_struct rejecter; /* packet reject writer */ + struct list_head services; /* services listening on this endpoint */ + struct list_head link; /* link in endpoint list */ + struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ + struct sk_buff_head accept_queue; /* incoming calls awaiting acceptance */ + struct sk_buff_head reject_queue; /* packets awaiting rejection */ + spinlock_t lock; /* access lock */ + rwlock_t services_lock; /* lock for services list */ + atomic_t usage; + int debug_id; /* debug ID for printks */ + volatile char error_rcvd; /* T if received ICMP error outstanding */ + struct sockaddr_rxrpc srx; /* local address */ +}; + +/* + * RxRPC remote transport endpoint definition + * - matched by remote port, address and protocol type + * - holds the connection ID counter for connections between the two endpoints + */ +struct rxrpc_peer { + struct work_struct destroyer; /* peer destroyer */ + struct list_head link; /* link in master peer list */ + struct list_head error_targets; /* targets for net error distribution */ + spinlock_t lock; /* access lock */ + atomic_t usage; + unsigned if_mtu; /* interface MTU for this peer */ + unsigned mtu; /* network MTU for this peer */ + unsigned maxdata; /* data size (MTU - hdrsize) */ + unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ + int debug_id; /* debug ID for printks */ + int net_error; /* network error distributed */ + struct sockaddr_rxrpc srx; /* remote address */ + + /* calculated RTT cache */ +#define RXRPC_RTT_CACHE_SIZE 32 + suseconds_t rtt; /* current RTT estimate (in uS) */ + unsigned rtt_point; /* next entry at which to insert */ + unsigned rtt_usage; /* amount of cache actually used */ + suseconds_t rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* calculated RTT cache */ +}; + +/* + * RxRPC point-to-point transport / connection manager definition + * - handles a bundle of connections between two endpoints + * - matched by { local, peer } + */ +struct rxrpc_transport { + struct rxrpc_local *local; /* local transport endpoint */ + struct rxrpc_peer *peer; /* remote transport endpoint */ + struct work_struct error_handler; /* network error distributor */ + struct rb_root bundles; /* client connection bundles on this transport */ + struct rb_root client_conns; /* client connections on this transport */ + struct rb_root server_conns; /* server connections on this transport */ + struct list_head link; /* link in master session list */ + struct sk_buff_head 
error_queue; /* error packets awaiting processing */ + time_t put_time; /* time at which to reap */ + spinlock_t client_lock; /* client connection allocation lock */ + rwlock_t conn_lock; /* lock for active/dead connections */ + atomic_t usage; + int debug_id; /* debug ID for printks */ + unsigned int conn_idcounter; /* connection ID counter (client) */ +}; + +/* + * RxRPC client connection bundle + * - matched by { transport, service_id, key } + */ +struct rxrpc_conn_bundle { + struct rb_node node; /* node in transport's lookup tree */ + struct list_head unused_conns; /* unused connections in this bundle */ + struct list_head avail_conns; /* available connections in this bundle */ + struct list_head busy_conns; /* busy connections in this bundle */ + struct key *key; /* security for this bundle */ + wait_queue_head_t chanwait; /* wait for channel to become available */ + atomic_t usage; + int debug_id; /* debug ID for printks */ + unsigned short num_conns; /* number of connections in this bundle */ + __be16 service_id; /* service ID */ + u8 security_ix; /* security type */ +}; + +/* + * RxRPC connection definition + * - matched by { transport, service_id, conn_id, direction, key } + * - each connection can only handle four simultaneous calls + */ +struct rxrpc_connection { + struct rxrpc_transport *trans; /* transport session */ + struct rxrpc_conn_bundle *bundle; /* connection bundle (client) */ + struct work_struct processor; /* connection event processor */ + struct rb_node node; /* node in transport's lookup tree */ + struct list_head link; /* link in master connection list */ + struct list_head bundle_link; /* link in bundle */ + struct rb_root calls; /* calls on this connection */ + struct sk_buff_head rx_queue; /* received conn-level packets */ + struct rxrpc_call *channels[RXRPC_MAXCALLS]; /* channels (active calls) */ + struct rxrpc_security *security; /* applied security module */ + struct key *key; /* security for this connection (client) */ + struct key *server_key; /* security for this service */ + struct crypto_blkcipher *cipher; /* encryption handle */ + struct rxrpc_crypt csum_iv; /* packet checksum base */ + unsigned long events; +#define RXRPC_CONN_CHALLENGE 0 /* send challenge packet */ + time_t put_time; /* time at which to reap */ + rwlock_t lock; /* access lock */ + spinlock_t state_lock; /* state-change lock */ + atomic_t usage; + u32 real_conn_id; /* connection ID (host-endian) */ + enum { /* current state of connection */ + RXRPC_CONN_UNUSED, /* - connection not yet attempted */ + RXRPC_CONN_CLIENT, /* - client connection */ + RXRPC_CONN_SERVER_UNSECURED, /* - server unsecured connection */ + RXRPC_CONN_SERVER_CHALLENGING, /* - server challenging for security */ + RXRPC_CONN_SERVER, /* - server secured connection */ + RXRPC_CONN_REMOTELY_ABORTED, /* - conn aborted by peer */ + RXRPC_CONN_LOCALLY_ABORTED, /* - conn aborted locally */ + RXRPC_CONN_NETWORK_ERROR, /* - conn terminated by network error */ + } state; + int error; /* error code for local abort */ + int debug_id; /* debug ID for printks */ + unsigned call_counter; /* call ID counter */ + atomic_t serial; /* packet serial number counter */ + atomic_t hi_serial; /* highest serial number received */ + u8 avail_calls; /* number of calls available */ + u8 size_align; /* data size alignment (for security) */ + u8 header_size; /* rxrpc + security header size */ + u8 security_size; /* security header size */ + u32 security_level; /* security level negotiated */ + u32 security_nonce; /* response re-use preventer 
*/ + + /* the following are all in net order */ + __be32 epoch; /* epoch of this connection */ + __be32 cid; /* connection ID */ + __be16 service_id; /* service ID */ + u8 security_ix; /* security type */ + u8 in_clientflag; /* RXRPC_CLIENT_INITIATED if we are server */ + u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */ +}; + +/* + * RxRPC call definition + * - matched by { connection, call_id } + */ +struct rxrpc_call { + struct rxrpc_connection *conn; /* connection carrying call */ + struct rxrpc_sock *socket; /* socket responsible */ + struct timer_list lifetimer; /* lifetime remaining on call */ + struct timer_list deadspan; /* reap timer for re-ACK'ing, etc */ + struct timer_list ack_timer; /* ACK generation timer */ + struct timer_list resend_timer; /* Tx resend timer */ + struct work_struct destroyer; /* call destroyer */ + struct work_struct processor; /* packet processor and ACK generator */ + struct list_head link; /* link in master call list */ + struct list_head error_link; /* link in error distribution list */ + struct list_head accept_link; /* calls awaiting acceptance */ + struct rb_node sock_node; /* node in socket call tree */ + struct rb_node conn_node; /* node in connection call tree */ + struct sk_buff_head rx_queue; /* received packets */ + struct sk_buff_head rx_oos_queue; /* packets received out of sequence */ + struct sk_buff *tx_pending; /* Tx socket buffer being filled */ + wait_queue_head_t tx_waitq; /* wait for Tx window space to become available */ + unsigned long user_call_ID; /* user-defined call ID */ + unsigned long creation_jif; /* time of call creation */ + unsigned long flags; +#define RXRPC_CALL_RELEASED 0 /* call has been released - no more message to userspace */ +#define RXRPC_CALL_TERMINAL_MSG 1 /* call has given the socket its final message */ +#define RXRPC_CALL_RCVD_LAST 2 /* all packets received */ +#define RXRPC_CALL_RUN_RTIMER 3 /* Tx resend timer started */ +#define RXRPC_CALL_TX_SOFT_ACK 4 /* sent some soft ACKs */ +#define RXRPC_CALL_PROC_BUSY 5 /* the processor is busy */ +#define RXRPC_CALL_INIT_ACCEPT 6 /* acceptance was initiated */ +#define RXRPC_CALL_HAS_USERID 7 /* has a user ID attached */ +#define RXRPC_CALL_EXPECT_OOS 8 /* expect out of sequence packets */ + unsigned long events; +#define RXRPC_CALL_RCVD_ACKALL 0 /* ACKALL or reply received */ +#define RXRPC_CALL_RCVD_BUSY 1 /* busy packet received */ +#define RXRPC_CALL_RCVD_ABORT 2 /* abort packet received */ +#define RXRPC_CALL_RCVD_ERROR 3 /* network error received */ +#define RXRPC_CALL_ACK_FINAL 4 /* need to generate final ACK (and release call) */ +#define RXRPC_CALL_ACK 5 /* need to generate ACK */ +#define RXRPC_CALL_REJECT_BUSY 6 /* need to generate busy message */ +#define RXRPC_CALL_ABORT 7 /* need to generate abort */ +#define RXRPC_CALL_CONN_ABORT 8 /* local connection abort generated */ +#define RXRPC_CALL_RESEND_TIMER 9 /* Tx resend timer expired */ +#define RXRPC_CALL_RESEND 10 /* Tx resend required */ +#define RXRPC_CALL_DRAIN_RX_OOS 11 /* drain the Rx out of sequence queue */ +#define RXRPC_CALL_LIFE_TIMER 12 /* call's lifetimer ran out */ +#define RXRPC_CALL_ACCEPTED 13 /* incoming call accepted by userspace app */ +#define RXRPC_CALL_SECURED 14 /* incoming call's connection is now secure */ +#define RXRPC_CALL_POST_ACCEPT 15 /* need to post an "accept?" 
message to the app */ +#define RXRPC_CALL_RELEASE 16 /* need to release the call's resources */ + + spinlock_t lock; + rwlock_t state_lock; /* lock for state transition */ + atomic_t usage; + atomic_t sequence; /* Tx data packet sequence counter */ + u32 abort_code; /* local/remote abort code */ + enum { /* current state of call */ + RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */ + RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ + RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ + RXRPC_CALL_CLIENT_FINAL_ACK, /* - client sending final ACK phase */ + RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */ + RXRPC_CALL_SERVER_ACCEPTING, /* - server accepting request */ + RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */ + RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */ + RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */ + RXRPC_CALL_SERVER_AWAIT_ACK, /* - server awaiting final ACK */ + RXRPC_CALL_COMPLETE, /* - call completed */ + RXRPC_CALL_SERVER_BUSY, /* - call rejected by busy server */ + RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ + RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ + RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */ + RXRPC_CALL_DEAD, /* - call is dead */ + } state; + int debug_id; /* debug ID for printks */ + u8 channel; /* connection channel occupied by this call */ + + /* transmission-phase ACK management */ + u8 acks_head; /* offset into window of first entry */ + u8 acks_tail; /* offset into window of last entry */ + u8 acks_winsz; /* size of un-ACK'd window */ + u8 acks_unacked; /* lowest unacked packet in last ACK received */ + int acks_latest; /* serial number of latest ACK received */ + rxrpc_seq_t acks_hard; /* highest definitively ACK'd msg seq */ + unsigned long *acks_window; /* sent packet window + * - elements are pointers with LSB set if ACK'd + */ + + /* receive-phase ACK management */ + rxrpc_seq_t rx_data_expect; /* next data seq ID expected to be received */ + rxrpc_seq_t rx_data_post; /* next data seq ID expected to be posted */ + rxrpc_seq_t rx_data_recv; /* last data seq ID encountered by recvmsg */ + rxrpc_seq_t rx_data_eaten; /* last data seq ID consumed by recvmsg */ + rxrpc_seq_t rx_first_oos; /* first packet in rx_oos_queue (or 0) */ + rxrpc_seq_t ackr_win_top; /* top of ACK window (rx_data_eaten is bottom) */ + rxrpc_seq_net_t ackr_prev_seq; /* previous sequence number received */ + u8 ackr_reason; /* reason to ACK */ + __be32 ackr_serial; /* serial of packet being ACK'd */ + atomic_t ackr_not_idle; /* number of packets in Rx queue */ + + /* received packet records, 1 bit per record */ +#define RXRPC_ACKR_WINDOW_ASZ DIV_ROUND_UP(RXRPC_MAXACKS, BITS_PER_LONG) + unsigned long ackr_window[RXRPC_ACKR_WINDOW_ASZ + 1]; + + /* the following should all be in net order */ + __be32 cid; /* connection ID + channel index */ + __be32 call_id; /* call ID on connection */ +}; + +/* + * locally abort an RxRPC call + */ +static inline void rxrpc_abort_call(struct rxrpc_call *call, u32 abort_code) +{ + write_lock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE) { + call->abort_code = abort_code; + call->state = RXRPC_CALL_LOCALLY_ABORTED; + set_bit(RXRPC_CALL_ABORT, &call->events); + } + write_unlock_bh(&call->state_lock); +} + +/* + * af_rxrpc.c + */ +extern atomic_t rxrpc_n_skbs; +extern __be32 rxrpc_epoch; +extern atomic_t rxrpc_debug_id; +extern struct workqueue_struct 
*rxrpc_workqueue; + +/* + * ar-accept.c + */ +extern void rxrpc_accept_incoming_calls(struct work_struct *); +extern struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, + unsigned long); +extern int rxrpc_reject_call(struct rxrpc_sock *); + +/* + * ar-ack.c + */ +extern void __rxrpc_propose_ACK(struct rxrpc_call *, u8, __be32, bool); +extern void rxrpc_propose_ACK(struct rxrpc_call *, u8, __be32, bool); +extern void rxrpc_process_call(struct work_struct *); + +/* + * ar-call.c + */ +extern struct kmem_cache *rxrpc_call_jar; +extern struct list_head rxrpc_calls; +extern rwlock_t rxrpc_call_lock; + +extern struct rxrpc_call *rxrpc_get_client_call(struct rxrpc_sock *, + struct rxrpc_transport *, + struct rxrpc_conn_bundle *, + unsigned long, int, gfp_t); +extern struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *, + struct rxrpc_connection *, + struct rxrpc_header *, gfp_t); +extern struct rxrpc_call *rxrpc_find_server_call(struct rxrpc_sock *, + unsigned long); +extern void rxrpc_release_call(struct rxrpc_call *); +extern void rxrpc_release_calls_on_socket(struct rxrpc_sock *); +extern void __rxrpc_put_call(struct rxrpc_call *); +extern void __exit rxrpc_destroy_all_calls(void); + +/* + * ar-connection.c + */ +extern struct list_head rxrpc_connections; +extern rwlock_t rxrpc_connection_lock; + +extern struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *, + struct rxrpc_transport *, + struct key *, + __be16, gfp_t); +extern void rxrpc_put_bundle(struct rxrpc_transport *, + struct rxrpc_conn_bundle *); +extern int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_transport *, + struct rxrpc_conn_bundle *, struct rxrpc_call *, + gfp_t); +extern void rxrpc_put_connection(struct rxrpc_connection *); +extern void __exit rxrpc_destroy_all_connections(void); +extern struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *, + struct rxrpc_header *); +extern struct rxrpc_connection * +rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_header *, + gfp_t); + +/* + * ar-connevent.c + */ +extern void rxrpc_process_connection(struct work_struct *); +extern void rxrpc_reject_packet(struct rxrpc_local *, struct sk_buff *); +extern void rxrpc_reject_packets(struct work_struct *); + +/* + * ar-error.c + */ +extern void rxrpc_UDP_error_report(struct sock *); +extern void rxrpc_UDP_error_handler(struct work_struct *); + +/* + * ar-input.c + */ +extern unsigned long rxrpc_ack_timeout; +extern const char *rxrpc_pkts[]; + +extern void rxrpc_data_ready(struct sock *, int); +extern int rxrpc_queue_rcv_skb(struct rxrpc_call *, struct sk_buff *, bool, + bool); +extern void rxrpc_fast_process_packet(struct rxrpc_call *, struct sk_buff *); + +/* + * ar-local.c + */ +extern rwlock_t rxrpc_local_lock; +extern struct rxrpc_local *rxrpc_lookup_local(struct sockaddr_rxrpc *); +extern void rxrpc_put_local(struct rxrpc_local *); +extern void __exit rxrpc_destroy_all_locals(void); + +/* + * ar-key.c + */ +extern struct key_type key_type_rxrpc; +extern struct key_type key_type_rxrpc_s; + +extern int rxrpc_request_key(struct rxrpc_sock *, char __user *, int); +extern int rxrpc_server_keyring(struct rxrpc_sock *, char __user *, int); +extern int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, + time_t, u32); + +/* + * ar-output.c + */ +extern int rxrpc_resend_timeout; + +extern int rxrpc_send_packet(struct rxrpc_transport *, struct sk_buff *); +extern int rxrpc_client_sendmsg(struct kiocb *, struct rxrpc_sock *, + struct rxrpc_transport *, struct msghdr *, + 
size_t); +extern int rxrpc_server_sendmsg(struct kiocb *, struct rxrpc_sock *, + struct msghdr *, size_t); + +/* + * ar-peer.c + */ +extern struct rxrpc_peer *rxrpc_get_peer(struct sockaddr_rxrpc *, gfp_t); +extern void rxrpc_put_peer(struct rxrpc_peer *); +extern struct rxrpc_peer *rxrpc_find_peer(struct rxrpc_local *, + __be32, __be16); +extern void __exit rxrpc_destroy_all_peers(void); + +/* + * ar-proc.c + */ +extern const char *const rxrpc_call_states[]; +extern const struct file_operations rxrpc_call_seq_fops; +extern const struct file_operations rxrpc_connection_seq_fops; + +/* + * ar-recvmsg.c + */ +extern void rxrpc_remove_user_ID(struct rxrpc_sock *, struct rxrpc_call *); +extern int rxrpc_recvmsg(struct kiocb *, struct socket *, struct msghdr *, + size_t, int); + +/* + * ar-security.c + */ +extern int rxrpc_register_security(struct rxrpc_security *); +extern void rxrpc_unregister_security(struct rxrpc_security *); +extern int rxrpc_init_client_conn_security(struct rxrpc_connection *); +extern int rxrpc_init_server_conn_security(struct rxrpc_connection *); +extern int rxrpc_secure_packet(const struct rxrpc_call *, struct sk_buff *, + size_t, void *); +extern int rxrpc_verify_packet(const struct rxrpc_call *, struct sk_buff *, + u32 *); +extern void rxrpc_clear_conn_security(struct rxrpc_connection *); + +/* + * ar-skbuff.c + */ +extern void rxrpc_packet_destructor(struct sk_buff *); + +/* + * ar-transport.c + */ +extern struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *, + struct rxrpc_peer *, + gfp_t); +extern void rxrpc_put_transport(struct rxrpc_transport *); +extern void __exit rxrpc_destroy_all_transports(void); +extern struct rxrpc_transport *rxrpc_find_transport(struct rxrpc_local *, + struct rxrpc_peer *); + +/* + * debug tracing + */ +extern unsigned rxrpc_debug; + +#define dbgprintk(FMT,...) \ + printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__) + +#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) +#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) +#define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__) +#define kproto(FMT,...) dbgprintk("### "FMT ,##__VA_ARGS__) +#define knet(FMT,...) dbgprintk("@@@ "FMT ,##__VA_ARGS__) + + +#if defined(__KDEBUG) +#define _enter(FMT,...) kenter(FMT,##__VA_ARGS__) +#define _leave(FMT,...) kleave(FMT,##__VA_ARGS__) +#define _debug(FMT,...) kdebug(FMT,##__VA_ARGS__) +#define _proto(FMT,...) kproto(FMT,##__VA_ARGS__) +#define _net(FMT,...) knet(FMT,##__VA_ARGS__) + +#elif defined(CONFIG_AF_RXRPC_DEBUG) +#define RXRPC_DEBUG_KENTER 0x01 +#define RXRPC_DEBUG_KLEAVE 0x02 +#define RXRPC_DEBUG_KDEBUG 0x04 +#define RXRPC_DEBUG_KPROTO 0x08 +#define RXRPC_DEBUG_KNET 0x10 + +#define _enter(FMT,...) \ +do { \ + if (unlikely(rxrpc_debug & RXRPC_DEBUG_KENTER)) \ + kenter(FMT,##__VA_ARGS__); \ +} while (0) + +#define _leave(FMT,...) \ +do { \ + if (unlikely(rxrpc_debug & RXRPC_DEBUG_KLEAVE)) \ + kleave(FMT,##__VA_ARGS__); \ +} while (0) + +#define _debug(FMT,...) \ +do { \ + if (unlikely(rxrpc_debug & RXRPC_DEBUG_KDEBUG)) \ + kdebug(FMT,##__VA_ARGS__); \ +} while (0) + +#define _proto(FMT,...) \ +do { \ + if (unlikely(rxrpc_debug & RXRPC_DEBUG_KPROTO)) \ + kproto(FMT,##__VA_ARGS__); \ +} while (0) + +#define _net(FMT,...) \ +do { \ + if (unlikely(rxrpc_debug & RXRPC_DEBUG_KNET)) \ + knet(FMT,##__VA_ARGS__); \ +} while (0) + +#else +#define _enter(FMT,...) no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__) +#define _leave(FMT,...) 
no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) +#define _debug(FMT,...) no_printk(" "FMT ,##__VA_ARGS__) +#define _proto(FMT,...) no_printk("### "FMT ,##__VA_ARGS__) +#define _net(FMT,...) no_printk("@@@ "FMT ,##__VA_ARGS__) +#endif + +/* + * debug assertion checking + */ +#if 1 // defined(__KDEBUGALL) + +#define ASSERT(X) \ +do { \ + if (unlikely(!(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "RxRPC: Assertion failed\n"); \ + BUG(); \ + } \ +} while(0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ + if (unlikely(!((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "RxRPC: Assertion failed\n"); \ + printk(KERN_ERR "%lu " #OP " %lu is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while(0) + +#define ASSERTIF(C, X) \ +do { \ + if (unlikely((C) && !(X))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "RxRPC: Assertion failed\n"); \ + BUG(); \ + } \ +} while(0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ + if (unlikely((C) && !((X) OP (Y)))) { \ + printk(KERN_ERR "\n"); \ + printk(KERN_ERR "RxRPC: Assertion failed\n"); \ + printk(KERN_ERR "%lu " #OP " %lu is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while(0) + +#else + +#define ASSERT(X) \ +do { \ +} while(0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ +} while(0) + +#define ASSERTIF(C, X) \ +do { \ +} while(0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ +} while(0) + +#endif /* __KDEBUGALL */ + +/* + * socket buffer accounting / leak finding + */ +static inline void __rxrpc_new_skb(struct sk_buff *skb, const char *fn) +{ + //_net("new skb %p %s [%d]", skb, fn, atomic_read(&rxrpc_n_skbs)); + //atomic_inc(&rxrpc_n_skbs); +} + +#define rxrpc_new_skb(skb) __rxrpc_new_skb((skb), __func__) + +static inline void __rxrpc_kill_skb(struct sk_buff *skb, const char *fn) +{ + //_net("kill skb %p %s [%d]", skb, fn, atomic_read(&rxrpc_n_skbs)); + //atomic_dec(&rxrpc_n_skbs); +} + +#define rxrpc_kill_skb(skb) __rxrpc_kill_skb((skb), __func__) + +static inline void __rxrpc_free_skb(struct sk_buff *skb, const char *fn) +{ + if (skb) { + CHECK_SLAB_OKAY(&skb->users); + //_net("free skb %p %s [%d]", + // skb, fn, atomic_read(&rxrpc_n_skbs)); + //atomic_dec(&rxrpc_n_skbs); + kfree_skb(skb); + } +} + +#define rxrpc_free_skb(skb) __rxrpc_free_skb((skb), __func__) + +static inline void rxrpc_purge_queue(struct sk_buff_head *list) +{ + struct sk_buff *skb; + while ((skb = skb_dequeue((list))) != NULL) + rxrpc_free_skb(skb); +} + +static inline void __rxrpc_get_local(struct rxrpc_local *local, const char *f) +{ + CHECK_SLAB_OKAY(&local->usage); + if (atomic_inc_return(&local->usage) == 1) + printk("resurrected (%s)\n", f); +} + +#define rxrpc_get_local(LOCAL) __rxrpc_get_local((LOCAL), __func__) + +#define rxrpc_get_call(CALL) \ +do { \ + CHECK_SLAB_OKAY(&(CALL)->usage); \ + if (atomic_inc_return(&(CALL)->usage) == 1) \ + BUG(); \ +} while(0) + +#define rxrpc_put_call(CALL) \ +do { \ + __rxrpc_put_call(CALL); \ +} while(0) diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c new file mode 100644 index 00000000..43ea7de2 --- /dev/null +++ b/net/rxrpc/ar-key.c @@ -0,0 +1,1220 @@ +/* RxRPC key management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * RxRPC keys should have a description of describing their purpose: + * "afs@CAMBRIDGE.REDHAT.COM> + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +static int rxrpc_vet_description_s(const char *); +static int rxrpc_instantiate(struct key *, const void *, size_t); +static int rxrpc_instantiate_s(struct key *, const void *, size_t); +static void rxrpc_destroy(struct key *); +static void rxrpc_destroy_s(struct key *); +static void rxrpc_describe(const struct key *, struct seq_file *); +static long rxrpc_read(const struct key *, char __user *, size_t); + +/* + * rxrpc defined keys take an arbitrary string as the description and an + * arbitrary blob of data as the payload + */ +struct key_type key_type_rxrpc = { + .name = "rxrpc", + .instantiate = rxrpc_instantiate, + .match = user_match, + .destroy = rxrpc_destroy, + .describe = rxrpc_describe, + .read = rxrpc_read, +}; +EXPORT_SYMBOL(key_type_rxrpc); + +/* + * rxrpc server defined keys take ":" as the + * description and an 8-byte decryption key as the payload + */ +struct key_type key_type_rxrpc_s = { + .name = "rxrpc_s", + .vet_description = rxrpc_vet_description_s, + .instantiate = rxrpc_instantiate_s, + .match = user_match, + .destroy = rxrpc_destroy_s, + .describe = rxrpc_describe, +}; + +/* + * Vet the description for an RxRPC server key + */ +static int rxrpc_vet_description_s(const char *desc) +{ + unsigned long num; + char *p; + + num = simple_strtoul(desc, &p, 10); + if (*p != ':' || num > 65535) + return -EINVAL; + num = simple_strtoul(p + 1, &p, 10); + if (*p || num < 1 || num > 255) + return -EINVAL; + return 0; +} + +/* + * parse an RxKAD type XDR format token + * - the caller guarantees we have at least 4 words + */ +static int rxrpc_instantiate_xdr_rxkad(struct key *key, const __be32 *xdr, + unsigned toklen) +{ + struct rxrpc_key_token *token, **pptoken; + size_t plen; + u32 tktlen; + int ret; + + _enter(",{%x,%x,%x,%x},%u", + ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), + toklen); + + if (toklen <= 8 * 4) + return -EKEYREJECTED; + tktlen = ntohl(xdr[7]); + _debug("tktlen: %x", tktlen); + if (tktlen > AFSTOKEN_RK_TIX_MAX) + return -EKEYREJECTED; + if (8 * 4 + tktlen != toklen) + return -EKEYREJECTED; + + plen = sizeof(*token) + sizeof(*token->kad) + tktlen; + ret = key_payload_reserve(key, key->datalen + plen); + if (ret < 0) + return ret; + + plen -= sizeof(*token); + token = kzalloc(sizeof(*token), GFP_KERNEL); + if (!token) + return -ENOMEM; + + token->kad = kzalloc(plen, GFP_KERNEL); + if (!token->kad) { + kfree(token); + return -ENOMEM; + } + + token->security_index = RXRPC_SECURITY_RXKAD; + token->kad->ticket_len = tktlen; + token->kad->vice_id = ntohl(xdr[0]); + token->kad->kvno = ntohl(xdr[1]); + token->kad->start = ntohl(xdr[4]); + token->kad->expiry = ntohl(xdr[5]); + token->kad->primary_flag = ntohl(xdr[6]); + memcpy(&token->kad->session_key, &xdr[2], 8); + memcpy(&token->kad->ticket, &xdr[8], tktlen); + + _debug("SCIX: %u", token->security_index); + _debug("TLEN: %u", token->kad->ticket_len); + _debug("EXPY: %x", token->kad->expiry); + _debug("KVNO: %u", token->kad->kvno); + _debug("PRIM: 
%u", token->kad->primary_flag); + _debug("SKEY: %02x%02x%02x%02x%02x%02x%02x%02x", + token->kad->session_key[0], token->kad->session_key[1], + token->kad->session_key[2], token->kad->session_key[3], + token->kad->session_key[4], token->kad->session_key[5], + token->kad->session_key[6], token->kad->session_key[7]); + if (token->kad->ticket_len >= 8) + _debug("TCKT: %02x%02x%02x%02x%02x%02x%02x%02x", + token->kad->ticket[0], token->kad->ticket[1], + token->kad->ticket[2], token->kad->ticket[3], + token->kad->ticket[4], token->kad->ticket[5], + token->kad->ticket[6], token->kad->ticket[7]); + + /* count the number of tokens attached */ + key->type_data.x[0]++; + + /* attach the data */ + for (pptoken = (struct rxrpc_key_token **)&key->payload.data; + *pptoken; + pptoken = &(*pptoken)->next) + continue; + *pptoken = token; + if (token->kad->expiry < key->expiry) + key->expiry = token->kad->expiry; + + _leave(" = 0"); + return 0; +} + +static void rxrpc_free_krb5_principal(struct krb5_principal *princ) +{ + int loop; + + if (princ->name_parts) { + for (loop = princ->n_name_parts - 1; loop >= 0; loop--) + kfree(princ->name_parts[loop]); + kfree(princ->name_parts); + } + kfree(princ->realm); +} + +static void rxrpc_free_krb5_tagged(struct krb5_tagged_data *td) +{ + kfree(td->data); +} + +/* + * free up an RxK5 token + */ +static void rxrpc_rxk5_free(struct rxk5_key *rxk5) +{ + int loop; + + rxrpc_free_krb5_principal(&rxk5->client); + rxrpc_free_krb5_principal(&rxk5->server); + rxrpc_free_krb5_tagged(&rxk5->session); + + if (rxk5->addresses) { + for (loop = rxk5->n_addresses - 1; loop >= 0; loop--) + rxrpc_free_krb5_tagged(&rxk5->addresses[loop]); + kfree(rxk5->addresses); + } + if (rxk5->authdata) { + for (loop = rxk5->n_authdata - 1; loop >= 0; loop--) + rxrpc_free_krb5_tagged(&rxk5->authdata[loop]); + kfree(rxk5->authdata); + } + + kfree(rxk5->ticket); + kfree(rxk5->ticket2); + kfree(rxk5); +} + +/* + * extract a krb5 principal + */ +static int rxrpc_krb5_decode_principal(struct krb5_principal *princ, + const __be32 **_xdr, + unsigned *_toklen) +{ + const __be32 *xdr = *_xdr; + unsigned toklen = *_toklen, n_parts, loop, tmp; + + /* there must be at least one name, and at least #names+1 length + * words */ + if (toklen <= 12) + return -EINVAL; + + _enter(",{%x,%x,%x},%u", + ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), toklen); + + n_parts = ntohl(*xdr++); + toklen -= 4; + if (n_parts <= 0 || n_parts > AFSTOKEN_K5_COMPONENTS_MAX) + return -EINVAL; + princ->n_name_parts = n_parts; + + if (toklen <= (n_parts + 1) * 4) + return -EINVAL; + + princ->name_parts = kcalloc(sizeof(char *), n_parts, GFP_KERNEL); + if (!princ->name_parts) + return -ENOMEM; + + for (loop = 0; loop < n_parts; loop++) { + if (toklen < 4) + return -EINVAL; + tmp = ntohl(*xdr++); + toklen -= 4; + if (tmp <= 0 || tmp > AFSTOKEN_STRING_MAX) + return -EINVAL; + if (tmp > toklen) + return -EINVAL; + princ->name_parts[loop] = kmalloc(tmp + 1, GFP_KERNEL); + if (!princ->name_parts[loop]) + return -ENOMEM; + memcpy(princ->name_parts[loop], xdr, tmp); + princ->name_parts[loop][tmp] = 0; + tmp = (tmp + 3) & ~3; + toklen -= tmp; + xdr += tmp >> 2; + } + + if (toklen < 4) + return -EINVAL; + tmp = ntohl(*xdr++); + toklen -= 4; + if (tmp <= 0 || tmp > AFSTOKEN_K5_REALM_MAX) + return -EINVAL; + if (tmp > toklen) + return -EINVAL; + princ->realm = kmalloc(tmp + 1, GFP_KERNEL); + if (!princ->realm) + return -ENOMEM; + memcpy(princ->realm, xdr, tmp); + princ->realm[tmp] = 0; + tmp = (tmp + 3) & ~3; + toklen -= tmp; + xdr += tmp >> 2; + + 
_debug("%s/...@%s", princ->name_parts[0], princ->realm); + + *_xdr = xdr; + *_toklen = toklen; + _leave(" = 0 [toklen=%u]", toklen); + return 0; +} + +/* + * extract a piece of krb5 tagged data + */ +static int rxrpc_krb5_decode_tagged_data(struct krb5_tagged_data *td, + size_t max_data_size, + const __be32 **_xdr, + unsigned *_toklen) +{ + const __be32 *xdr = *_xdr; + unsigned toklen = *_toklen, len; + + /* there must be at least one tag and one length word */ + if (toklen <= 8) + return -EINVAL; + + _enter(",%zu,{%x,%x},%u", + max_data_size, ntohl(xdr[0]), ntohl(xdr[1]), toklen); + + td->tag = ntohl(*xdr++); + len = ntohl(*xdr++); + toklen -= 8; + if (len > max_data_size) + return -EINVAL; + td->data_len = len; + + if (len > 0) { + td->data = kmalloc(len, GFP_KERNEL); + if (!td->data) + return -ENOMEM; + memcpy(td->data, xdr, len); + len = (len + 3) & ~3; + toklen -= len; + xdr += len >> 2; + } + + _debug("tag %x len %x", td->tag, td->data_len); + + *_xdr = xdr; + *_toklen = toklen; + _leave(" = 0 [toklen=%u]", toklen); + return 0; +} + +/* + * extract an array of tagged data + */ +static int rxrpc_krb5_decode_tagged_array(struct krb5_tagged_data **_td, + u8 *_n_elem, + u8 max_n_elem, + size_t max_elem_size, + const __be32 **_xdr, + unsigned *_toklen) +{ + struct krb5_tagged_data *td; + const __be32 *xdr = *_xdr; + unsigned toklen = *_toklen, n_elem, loop; + int ret; + + /* there must be at least one count */ + if (toklen < 4) + return -EINVAL; + + _enter(",,%u,%zu,{%x},%u", + max_n_elem, max_elem_size, ntohl(xdr[0]), toklen); + + n_elem = ntohl(*xdr++); + toklen -= 4; + if (n_elem < 0 || n_elem > max_n_elem) + return -EINVAL; + *_n_elem = n_elem; + if (n_elem > 0) { + if (toklen <= (n_elem + 1) * 4) + return -EINVAL; + + _debug("n_elem %d", n_elem); + + td = kcalloc(sizeof(struct krb5_tagged_data), n_elem, + GFP_KERNEL); + if (!td) + return -ENOMEM; + *_td = td; + + for (loop = 0; loop < n_elem; loop++) { + ret = rxrpc_krb5_decode_tagged_data(&td[loop], + max_elem_size, + &xdr, &toklen); + if (ret < 0) + return ret; + } + } + + *_xdr = xdr; + *_toklen = toklen; + _leave(" = 0 [toklen=%u]", toklen); + return 0; +} + +/* + * extract a krb5 ticket + */ +static int rxrpc_krb5_decode_ticket(u8 **_ticket, u16 *_tktlen, + const __be32 **_xdr, unsigned *_toklen) +{ + const __be32 *xdr = *_xdr; + unsigned toklen = *_toklen, len; + + /* there must be at least one length word */ + if (toklen <= 4) + return -EINVAL; + + _enter(",{%x},%u", ntohl(xdr[0]), toklen); + + len = ntohl(*xdr++); + toklen -= 4; + if (len > AFSTOKEN_K5_TIX_MAX) + return -EINVAL; + *_tktlen = len; + + _debug("ticket len %u", len); + + if (len > 0) { + *_ticket = kmalloc(len, GFP_KERNEL); + if (!*_ticket) + return -ENOMEM; + memcpy(*_ticket, xdr, len); + len = (len + 3) & ~3; + toklen -= len; + xdr += len >> 2; + } + + *_xdr = xdr; + *_toklen = toklen; + _leave(" = 0 [toklen=%u]", toklen); + return 0; +} + +/* + * parse an RxK5 type XDR format token + * - the caller guarantees we have at least 4 words + */ +static int rxrpc_instantiate_xdr_rxk5(struct key *key, const __be32 *xdr, + unsigned toklen) +{ + struct rxrpc_key_token *token, **pptoken; + struct rxk5_key *rxk5; + const __be32 *end_xdr = xdr + (toklen >> 2); + int ret; + + _enter(",{%x,%x,%x,%x},%u", + ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), + toklen); + + /* reserve some payload space for this subkey - the length of the token + * is a reasonable approximation */ + ret = key_payload_reserve(key, key->datalen + toklen); + if (ret < 0) + return ret; + + 
token = kzalloc(sizeof(*token), GFP_KERNEL); + if (!token) + return -ENOMEM; + + rxk5 = kzalloc(sizeof(*rxk5), GFP_KERNEL); + if (!rxk5) { + kfree(token); + return -ENOMEM; + } + + token->security_index = RXRPC_SECURITY_RXK5; + token->k5 = rxk5; + + /* extract the principals */ + ret = rxrpc_krb5_decode_principal(&rxk5->client, &xdr, &toklen); + if (ret < 0) + goto error; + ret = rxrpc_krb5_decode_principal(&rxk5->server, &xdr, &toklen); + if (ret < 0) + goto error; + + /* extract the session key and the encoding type (the tag field -> + * ENCTYPE_xxx) */ + ret = rxrpc_krb5_decode_tagged_data(&rxk5->session, AFSTOKEN_DATA_MAX, + &xdr, &toklen); + if (ret < 0) + goto error; + + if (toklen < 4 * 8 + 2 * 4) + goto inval; + rxk5->authtime = be64_to_cpup((const __be64 *) xdr); + xdr += 2; + rxk5->starttime = be64_to_cpup((const __be64 *) xdr); + xdr += 2; + rxk5->endtime = be64_to_cpup((const __be64 *) xdr); + xdr += 2; + rxk5->renew_till = be64_to_cpup((const __be64 *) xdr); + xdr += 2; + rxk5->is_skey = ntohl(*xdr++); + rxk5->flags = ntohl(*xdr++); + toklen -= 4 * 8 + 2 * 4; + + _debug("times: a=%llx s=%llx e=%llx rt=%llx", + rxk5->authtime, rxk5->starttime, rxk5->endtime, + rxk5->renew_till); + _debug("is_skey=%x flags=%x", rxk5->is_skey, rxk5->flags); + + /* extract the permitted client addresses */ + ret = rxrpc_krb5_decode_tagged_array(&rxk5->addresses, + &rxk5->n_addresses, + AFSTOKEN_K5_ADDRESSES_MAX, + AFSTOKEN_DATA_MAX, + &xdr, &toklen); + if (ret < 0) + goto error; + + ASSERTCMP((end_xdr - xdr) << 2, ==, toklen); + + /* extract the tickets */ + ret = rxrpc_krb5_decode_ticket(&rxk5->ticket, &rxk5->ticket_len, + &xdr, &toklen); + if (ret < 0) + goto error; + ret = rxrpc_krb5_decode_ticket(&rxk5->ticket2, &rxk5->ticket2_len, + &xdr, &toklen); + if (ret < 0) + goto error; + + ASSERTCMP((end_xdr - xdr) << 2, ==, toklen); + + /* extract the typed auth data */ + ret = rxrpc_krb5_decode_tagged_array(&rxk5->authdata, + &rxk5->n_authdata, + AFSTOKEN_K5_AUTHDATA_MAX, + AFSTOKEN_BDATALN_MAX, + &xdr, &toklen); + if (ret < 0) + goto error; + + ASSERTCMP((end_xdr - xdr) << 2, ==, toklen); + + if (toklen != 0) + goto inval; + + /* attach the payload to the key */ + for (pptoken = (struct rxrpc_key_token **)&key->payload.data; + *pptoken; + pptoken = &(*pptoken)->next) + continue; + *pptoken = token; + if (token->kad->expiry < key->expiry) + key->expiry = token->kad->expiry; + + _leave(" = 0"); + return 0; + +inval: + ret = -EINVAL; +error: + rxrpc_rxk5_free(rxk5); + kfree(token); + _leave(" = %d", ret); + return ret; +} + +/* + * attempt to parse the data as the XDR format + * - the caller guarantees we have more than 7 words + */ +static int rxrpc_instantiate_xdr(struct key *key, const void *data, size_t datalen) +{ + const __be32 *xdr = data, *token; + const char *cp; + unsigned len, tmp, loop, ntoken, toklen, sec_ix; + int ret; + + _enter(",{%x,%x,%x,%x},%zu", + ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), + datalen); + + if (datalen > AFSTOKEN_LENGTH_MAX) + goto not_xdr; + + /* XDR is an array of __be32's */ + if (datalen & 3) + goto not_xdr; + + /* the flags should be 0 (the setpag bit must be handled by + * userspace) */ + if (ntohl(*xdr++) != 0) + goto not_xdr; + datalen -= 4; + + /* check the cell name */ + len = ntohl(*xdr++); + if (len < 1 || len > AFSTOKEN_CELL_MAX) + goto not_xdr; + datalen -= 4; + tmp = (len + 3) & ~3; + if (tmp > datalen) + goto not_xdr; + + cp = (const char *) xdr; + for (loop = 0; loop < len; loop++) + if (!isprint(cp[loop])) + goto not_xdr; + if (len 
< tmp) + for (; loop < tmp; loop++) + if (cp[loop]) + goto not_xdr; + _debug("cellname: [%u/%u] '%*.*s'", + len, tmp, len, len, (const char *) xdr); + datalen -= tmp; + xdr += tmp >> 2; + + /* get the token count */ + if (datalen < 12) + goto not_xdr; + ntoken = ntohl(*xdr++); + datalen -= 4; + _debug("ntoken: %x", ntoken); + if (ntoken < 1 || ntoken > AFSTOKEN_MAX) + goto not_xdr; + + /* check each token wrapper */ + token = xdr; + loop = ntoken; + do { + if (datalen < 8) + goto not_xdr; + toklen = ntohl(*xdr++); + sec_ix = ntohl(*xdr); + datalen -= 4; + _debug("token: [%x/%zx] %x", toklen, datalen, sec_ix); + if (toklen < 20 || toklen > datalen) + goto not_xdr; + datalen -= (toklen + 3) & ~3; + xdr += (toklen + 3) >> 2; + + } while (--loop > 0); + + _debug("remainder: %zu", datalen); + if (datalen != 0) + goto not_xdr; + + /* okay: we're going to assume it's valid XDR format + * - we ignore the cellname, relying on the key to be correctly named + */ + do { + xdr = token; + toklen = ntohl(*xdr++); + token = xdr + ((toklen + 3) >> 2); + sec_ix = ntohl(*xdr++); + toklen -= 4; + + _debug("TOKEN type=%u [%p-%p]", sec_ix, xdr, token); + + switch (sec_ix) { + case RXRPC_SECURITY_RXKAD: + ret = rxrpc_instantiate_xdr_rxkad(key, xdr, toklen); + if (ret != 0) + goto error; + break; + + case RXRPC_SECURITY_RXK5: + ret = rxrpc_instantiate_xdr_rxk5(key, xdr, toklen); + if (ret != 0) + goto error; + break; + + default: + ret = -EPROTONOSUPPORT; + goto error; + } + + } while (--ntoken > 0); + + _leave(" = 0"); + return 0; + +not_xdr: + _leave(" = -EPROTO"); + return -EPROTO; +error: + _leave(" = %d", ret); + return ret; +} + +/* + * instantiate an rxrpc defined key + * data should be of the form: + * OFFSET LEN CONTENT + * 0 4 key interface version number + * 4 2 security index (type) + * 6 2 ticket length + * 8 4 key expiry time (time_t) + * 12 4 kvno + * 16 8 session key + * 24 [len] ticket + * + * if no data is provided, then a no-security key is made + */ +static int rxrpc_instantiate(struct key *key, const void *data, size_t datalen) +{ + const struct rxrpc_key_data_v1 *v1; + struct rxrpc_key_token *token, **pp; + size_t plen; + u32 kver; + int ret; + + _enter("{%x},,%zu", key_serial(key), datalen); + + /* handle a no-security key */ + if (!data && datalen == 0) + return 0; + + /* determine if the XDR payload format is being used */ + if (datalen > 7 * 4) { + ret = rxrpc_instantiate_xdr(key, data, datalen); + if (ret != -EPROTO) + return ret; + } + + /* get the key interface version number */ + ret = -EINVAL; + if (datalen <= 4 || !data) + goto error; + memcpy(&kver, data, sizeof(kver)); + data += sizeof(kver); + datalen -= sizeof(kver); + + _debug("KEY I/F VERSION: %u", kver); + + ret = -EKEYREJECTED; + if (kver != 1) + goto error; + + /* deal with a version 1 key */ + ret = -EINVAL; + if (datalen < sizeof(*v1)) + goto error; + + v1 = data; + if (datalen != sizeof(*v1) + v1->ticket_length) + goto error; + + _debug("SCIX: %u", v1->security_index); + _debug("TLEN: %u", v1->ticket_length); + _debug("EXPY: %x", v1->expiry); + _debug("KVNO: %u", v1->kvno); + _debug("SKEY: %02x%02x%02x%02x%02x%02x%02x%02x", + v1->session_key[0], v1->session_key[1], + v1->session_key[2], v1->session_key[3], + v1->session_key[4], v1->session_key[5], + v1->session_key[6], v1->session_key[7]); + if (v1->ticket_length >= 8) + _debug("TCKT: %02x%02x%02x%02x%02x%02x%02x%02x", + v1->ticket[0], v1->ticket[1], + v1->ticket[2], v1->ticket[3], + v1->ticket[4], v1->ticket[5], + v1->ticket[6], v1->ticket[7]); + + ret = 
-EPROTONOSUPPORT; + if (v1->security_index != RXRPC_SECURITY_RXKAD) + goto error; + + plen = sizeof(*token->kad) + v1->ticket_length; + ret = key_payload_reserve(key, plen + sizeof(*token)); + if (ret < 0) + goto error; + + ret = -ENOMEM; + token = kzalloc(sizeof(*token), GFP_KERNEL); + if (!token) + goto error; + token->kad = kzalloc(plen, GFP_KERNEL); + if (!token->kad) + goto error_free; + + token->security_index = RXRPC_SECURITY_RXKAD; + token->kad->ticket_len = v1->ticket_length; + token->kad->expiry = v1->expiry; + token->kad->kvno = v1->kvno; + memcpy(&token->kad->session_key, &v1->session_key, 8); + memcpy(&token->kad->ticket, v1->ticket, v1->ticket_length); + + /* attach the data */ + key->type_data.x[0]++; + + pp = (struct rxrpc_key_token **)&key->payload.data; + while (*pp) + pp = &(*pp)->next; + *pp = token; + if (token->kad->expiry < key->expiry) + key->expiry = token->kad->expiry; + token = NULL; + ret = 0; + +error_free: + kfree(token); +error: + return ret; +} + +/* + * instantiate a server secret key + * data should be a pointer to the 8-byte secret key + */ +static int rxrpc_instantiate_s(struct key *key, const void *data, + size_t datalen) +{ + struct crypto_blkcipher *ci; + + _enter("{%x},,%zu", key_serial(key), datalen); + + if (datalen != 8) + return -EINVAL; + + memcpy(&key->type_data, data, 8); + + ci = crypto_alloc_blkcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(ci)) { + _leave(" = %ld", PTR_ERR(ci)); + return PTR_ERR(ci); + } + + if (crypto_blkcipher_setkey(ci, data, 8) < 0) + BUG(); + + key->payload.data = ci; + _leave(" = 0"); + return 0; +} + +/* + * dispose of the data dangling from the corpse of a rxrpc key + */ +static void rxrpc_destroy(struct key *key) +{ + struct rxrpc_key_token *token; + + while ((token = key->payload.data)) { + key->payload.data = token->next; + switch (token->security_index) { + case RXRPC_SECURITY_RXKAD: + kfree(token->kad); + break; + case RXRPC_SECURITY_RXK5: + if (token->k5) + rxrpc_rxk5_free(token->k5); + break; + default: + printk(KERN_ERR "Unknown token type %x on rxrpc key\n", + token->security_index); + BUG(); + } + + kfree(token); + } +} + +/* + * dispose of the data dangling from the corpse of a rxrpc key + */ +static void rxrpc_destroy_s(struct key *key) +{ + if (key->payload.data) { + crypto_free_blkcipher(key->payload.data); + key->payload.data = NULL; + } +} + +/* + * describe the rxrpc key + */ +static void rxrpc_describe(const struct key *key, struct seq_file *m) +{ + seq_puts(m, key->description); +} + +/* + * grab the security key for a socket + */ +int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen) +{ + struct key *key; + char *description; + + _enter(""); + + if (optlen <= 0 || optlen > PAGE_SIZE - 1) + return -EINVAL; + + description = kmalloc(optlen + 1, GFP_KERNEL); + if (!description) + return -ENOMEM; + + if (copy_from_user(description, optval, optlen)) { + kfree(description); + return -EFAULT; + } + description[optlen] = 0; + + key = request_key(&key_type_rxrpc, description, NULL); + if (IS_ERR(key)) { + kfree(description); + _leave(" = %ld", PTR_ERR(key)); + return PTR_ERR(key); + } + + rx->key = key; + kfree(description); + _leave(" = 0 [key %x]", key->serial); + return 0; +} + +/* + * grab the security keyring for a server socket + */ +int rxrpc_server_keyring(struct rxrpc_sock *rx, char __user *optval, + int optlen) +{ + struct key *key; + char *description; + + _enter(""); + + if (optlen <= 0 || optlen > PAGE_SIZE - 1) + return -EINVAL; + + description = 
kmalloc(optlen + 1, GFP_KERNEL); + if (!description) + return -ENOMEM; + + if (copy_from_user(description, optval, optlen)) { + kfree(description); + return -EFAULT; + } + description[optlen] = 0; + + key = request_key(&key_type_keyring, description, NULL); + if (IS_ERR(key)) { + kfree(description); + _leave(" = %ld", PTR_ERR(key)); + return PTR_ERR(key); + } + + rx->securities = key; + kfree(description); + _leave(" = 0 [key %x]", key->serial); + return 0; +} + +/* + * generate a server data key + */ +int rxrpc_get_server_data_key(struct rxrpc_connection *conn, + const void *session_key, + time_t expiry, + u32 kvno) +{ + const struct cred *cred = current_cred(); + struct key *key; + int ret; + + struct { + u32 kver; + struct rxrpc_key_data_v1 v1; + } data; + + _enter(""); + + key = key_alloc(&key_type_rxrpc, "x", 0, 0, cred, 0, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(key)) { + _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key)); + return -ENOMEM; + } + + _debug("key %d", key_serial(key)); + + data.kver = 1; + data.v1.security_index = RXRPC_SECURITY_RXKAD; + data.v1.ticket_length = 0; + data.v1.expiry = expiry; + data.v1.kvno = 0; + + memcpy(&data.v1.session_key, session_key, sizeof(data.v1.session_key)); + + ret = key_instantiate_and_link(key, &data, sizeof(data), NULL, NULL); + if (ret < 0) + goto error; + + conn->key = key; + _leave(" = 0 [%d]", key_serial(key)); + return 0; + +error: + key_revoke(key); + key_put(key); + _leave(" = -ENOMEM [ins %d]", ret); + return -ENOMEM; +} +EXPORT_SYMBOL(rxrpc_get_server_data_key); + +/** + * rxrpc_get_null_key - Generate a null RxRPC key + * @keyname: The name to give the key. + * + * Generate a null RxRPC key that can be used to indicate anonymous security is + * required for a particular domain. + */ +struct key *rxrpc_get_null_key(const char *keyname) +{ + const struct cred *cred = current_cred(); + struct key *key; + int ret; + + key = key_alloc(&key_type_rxrpc, keyname, 0, 0, cred, + KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(key)) + return key; + + ret = key_instantiate_and_link(key, NULL, 0, NULL, NULL); + if (ret < 0) { + key_revoke(key); + key_put(key); + return ERR_PTR(ret); + } + + return key; +} +EXPORT_SYMBOL(rxrpc_get_null_key); + +/* + * read the contents of an rxrpc key + * - this returns the result in XDR form + */ +static long rxrpc_read(const struct key *key, + char __user *buffer, size_t buflen) +{ + const struct rxrpc_key_token *token; + const struct krb5_principal *princ; + size_t size; + __be32 __user *xdr, *oldxdr; + u32 cnlen, toksize, ntoks, tok, zero; + u16 toksizes[AFSTOKEN_MAX]; + int loop; + + _enter(""); + + /* we don't know what form we should return non-AFS keys in */ + if (memcmp(key->description, "afs@", 4) != 0) + return -EOPNOTSUPP; + cnlen = strlen(key->description + 4); + +#define RND(X) (((X) + 3) & ~3) + + /* AFS keys we return in XDR form, so we need to work out the size of + * the XDR */ + size = 2 * 4; /* flags, cellname len */ + size += RND(cnlen); /* cellname */ + size += 1 * 4; /* token count */ + + ntoks = 0; + for (token = key->payload.data; token; token = token->next) { + toksize = 4; /* sec index */ + + switch (token->security_index) { + case RXRPC_SECURITY_RXKAD: + toksize += 8 * 4; /* viceid, kvno, key*2, begin, + * end, primary, tktlen */ + toksize += RND(token->kad->ticket_len); + break; + + case RXRPC_SECURITY_RXK5: + princ = &token->k5->client; + toksize += 4 + princ->n_name_parts * 4; + for (loop = 0; loop < princ->n_name_parts; loop++) + toksize += 
RND(strlen(princ->name_parts[loop])); + toksize += 4 + RND(strlen(princ->realm)); + + princ = &token->k5->server; + toksize += 4 + princ->n_name_parts * 4; + for (loop = 0; loop < princ->n_name_parts; loop++) + toksize += RND(strlen(princ->name_parts[loop])); + toksize += 4 + RND(strlen(princ->realm)); + + toksize += 8 + RND(token->k5->session.data_len); + + toksize += 4 * 8 + 2 * 4; + + toksize += 4 + token->k5->n_addresses * 8; + for (loop = 0; loop < token->k5->n_addresses; loop++) + toksize += RND(token->k5->addresses[loop].data_len); + + toksize += 4 + RND(token->k5->ticket_len); + toksize += 4 + RND(token->k5->ticket2_len); + + toksize += 4 + token->k5->n_authdata * 8; + for (loop = 0; loop < token->k5->n_authdata; loop++) + toksize += RND(token->k5->authdata[loop].data_len); + break; + + default: /* we have a ticket we can't encode */ + BUG(); + continue; + } + + _debug("token[%u]: toksize=%u", ntoks, toksize); + ASSERTCMP(toksize, <=, AFSTOKEN_LENGTH_MAX); + + toksizes[ntoks++] = toksize; + size += toksize + 4; /* each token has a length word */ + } + +#undef RND + + if (!buffer || buflen < size) + return size; + + xdr = (__be32 __user *) buffer; + zero = 0; +#define ENCODE(x) \ + do { \ + __be32 y = htonl(x); \ + if (put_user(y, xdr++) < 0) \ + goto fault; \ + } while(0) +#define ENCODE_DATA(l, s) \ + do { \ + u32 _l = (l); \ + ENCODE(l); \ + if (copy_to_user(xdr, (s), _l) != 0) \ + goto fault; \ + if (_l & 3 && \ + copy_to_user((u8 *)xdr + _l, &zero, 4 - (_l & 3)) != 0) \ + goto fault; \ + xdr += (_l + 3) >> 2; \ + } while(0) +#define ENCODE64(x) \ + do { \ + __be64 y = cpu_to_be64(x); \ + if (copy_to_user(xdr, &y, 8) != 0) \ + goto fault; \ + xdr += 8 >> 2; \ + } while(0) +#define ENCODE_STR(s) \ + do { \ + const char *_s = (s); \ + ENCODE_DATA(strlen(_s), _s); \ + } while(0) + + ENCODE(0); /* flags */ + ENCODE_DATA(cnlen, key->description + 4); /* cellname */ + ENCODE(ntoks); + + tok = 0; + for (token = key->payload.data; token; token = token->next) { + toksize = toksizes[tok++]; + ENCODE(toksize); + oldxdr = xdr; + ENCODE(token->security_index); + + switch (token->security_index) { + case RXRPC_SECURITY_RXKAD: + ENCODE(token->kad->vice_id); + ENCODE(token->kad->kvno); + ENCODE_DATA(8, token->kad->session_key); + ENCODE(token->kad->start); + ENCODE(token->kad->expiry); + ENCODE(token->kad->primary_flag); + ENCODE_DATA(token->kad->ticket_len, token->kad->ticket); + break; + + case RXRPC_SECURITY_RXK5: + princ = &token->k5->client; + ENCODE(princ->n_name_parts); + for (loop = 0; loop < princ->n_name_parts; loop++) + ENCODE_STR(princ->name_parts[loop]); + ENCODE_STR(princ->realm); + + princ = &token->k5->server; + ENCODE(princ->n_name_parts); + for (loop = 0; loop < princ->n_name_parts; loop++) + ENCODE_STR(princ->name_parts[loop]); + ENCODE_STR(princ->realm); + + ENCODE(token->k5->session.tag); + ENCODE_DATA(token->k5->session.data_len, + token->k5->session.data); + + ENCODE64(token->k5->authtime); + ENCODE64(token->k5->starttime); + ENCODE64(token->k5->endtime); + ENCODE64(token->k5->renew_till); + ENCODE(token->k5->is_skey); + ENCODE(token->k5->flags); + + ENCODE(token->k5->n_addresses); + for (loop = 0; loop < token->k5->n_addresses; loop++) { + ENCODE(token->k5->addresses[loop].tag); + ENCODE_DATA(token->k5->addresses[loop].data_len, + token->k5->addresses[loop].data); + } + + ENCODE_DATA(token->k5->ticket_len, token->k5->ticket); + ENCODE_DATA(token->k5->ticket2_len, token->k5->ticket2); + + ENCODE(token->k5->n_authdata); + for (loop = 0; loop < token->k5->n_authdata; 
loop++) { + ENCODE(token->k5->authdata[loop].tag); + ENCODE_DATA(token->k5->authdata[loop].data_len, + token->k5->authdata[loop].data); + } + break; + + default: + BUG(); + break; + } + + ASSERTCMP((unsigned long)xdr - (unsigned long)oldxdr, ==, + toksize); + } + +#undef ENCODE_STR +#undef ENCODE_DATA +#undef ENCODE64 +#undef ENCODE + + ASSERTCMP(tok, ==, ntoks); + ASSERTCMP((char __user *) xdr - buffer, ==, size); + _leave(" = %zu", size); + return size; + +fault: + _leave(" = -EFAULT"); + return -EFAULT; +} diff --git a/net/rxrpc/ar-local.c b/net/rxrpc/ar-local.c new file mode 100644 index 00000000..87f7135d --- /dev/null +++ b/net/rxrpc/ar-local.c @@ -0,0 +1,310 @@ +/* AF_RXRPC local endpoint management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +static LIST_HEAD(rxrpc_locals); +DEFINE_RWLOCK(rxrpc_local_lock); +static DECLARE_RWSEM(rxrpc_local_sem); +static DECLARE_WAIT_QUEUE_HEAD(rxrpc_local_wq); + +static void rxrpc_destroy_local(struct work_struct *work); + +/* + * allocate a new local + */ +static +struct rxrpc_local *rxrpc_alloc_local(struct sockaddr_rxrpc *srx) +{ + struct rxrpc_local *local; + + local = kzalloc(sizeof(struct rxrpc_local), GFP_KERNEL); + if (local) { + INIT_WORK(&local->destroyer, &rxrpc_destroy_local); + INIT_WORK(&local->acceptor, &rxrpc_accept_incoming_calls); + INIT_WORK(&local->rejecter, &rxrpc_reject_packets); + INIT_LIST_HEAD(&local->services); + INIT_LIST_HEAD(&local->link); + init_rwsem(&local->defrag_sem); + skb_queue_head_init(&local->accept_queue); + skb_queue_head_init(&local->reject_queue); + spin_lock_init(&local->lock); + rwlock_init(&local->services_lock); + atomic_set(&local->usage, 1); + local->debug_id = atomic_inc_return(&rxrpc_debug_id); + memcpy(&local->srx, srx, sizeof(*srx)); + } + + _leave(" = %p", local); + return local; +} + +/* + * create the local socket + * - must be called with rxrpc_local_sem writelocked + */ +static int rxrpc_create_local(struct rxrpc_local *local) +{ + struct sock *sock; + int ret, opt; + + _enter("%p{%d}", local, local->srx.transport_type); + + /* create a socket to represent the local endpoint */ + ret = sock_create_kern(PF_INET, local->srx.transport_type, IPPROTO_UDP, + &local->socket); + if (ret < 0) { + _leave(" = %d [socket]", ret); + return ret; + } + + /* if a local address was supplied then bind it */ + if (local->srx.transport_len > sizeof(sa_family_t)) { + _debug("bind"); + ret = kernel_bind(local->socket, + (struct sockaddr *) &local->srx.transport, + local->srx.transport_len); + if (ret < 0) { + _debug("bind failed"); + goto error; + } + } + + /* we want to receive ICMP errors */ + opt = 1; + ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR, + (char *) &opt, sizeof(opt)); + if (ret < 0) { + _debug("setsockopt failed"); + goto error; + } + + /* we want to set the don't fragment bit */ + opt = IP_PMTUDISC_DO; + ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER, + (char *) &opt, sizeof(opt)); + if (ret < 0) { + _debug("setsockopt failed"); + goto error; + } + + write_lock_bh(&rxrpc_local_lock); + list_add(&local->link, &rxrpc_locals); + 
write_unlock_bh(&rxrpc_local_lock); + + /* set the socket up */ + sock = local->socket->sk; + sock->sk_user_data = local; + sock->sk_data_ready = rxrpc_data_ready; + sock->sk_error_report = rxrpc_UDP_error_report; + _leave(" = 0"); + return 0; + +error: + kernel_sock_shutdown(local->socket, SHUT_RDWR); + local->socket->sk->sk_user_data = NULL; + sock_release(local->socket); + local->socket = NULL; + + _leave(" = %d", ret); + return ret; +} + +/* + * create a new local endpoint using the specified UDP address + */ +struct rxrpc_local *rxrpc_lookup_local(struct sockaddr_rxrpc *srx) +{ + struct rxrpc_local *local; + int ret; + + _enter("{%d,%u,%pI4+%hu}", + srx->transport_type, + srx->transport.family, + &srx->transport.sin.sin_addr, + ntohs(srx->transport.sin.sin_port)); + + down_write(&rxrpc_local_sem); + + /* see if we have a suitable local local endpoint already */ + read_lock_bh(&rxrpc_local_lock); + + list_for_each_entry(local, &rxrpc_locals, link) { + _debug("CMP {%d,%u,%pI4+%hu}", + local->srx.transport_type, + local->srx.transport.family, + &local->srx.transport.sin.sin_addr, + ntohs(local->srx.transport.sin.sin_port)); + + if (local->srx.transport_type != srx->transport_type || + local->srx.transport.family != srx->transport.family) + continue; + + switch (srx->transport.family) { + case AF_INET: + if (local->srx.transport.sin.sin_port != + srx->transport.sin.sin_port) + continue; + if (memcmp(&local->srx.transport.sin.sin_addr, + &srx->transport.sin.sin_addr, + sizeof(struct in_addr)) != 0) + continue; + goto found_local; + + default: + BUG(); + } + } + + read_unlock_bh(&rxrpc_local_lock); + + /* we didn't find one, so we need to create one */ + local = rxrpc_alloc_local(srx); + if (!local) { + up_write(&rxrpc_local_sem); + return ERR_PTR(-ENOMEM); + } + + ret = rxrpc_create_local(local); + if (ret < 0) { + up_write(&rxrpc_local_sem); + kfree(local); + _leave(" = %d", ret); + return ERR_PTR(ret); + } + + up_write(&rxrpc_local_sem); + + _net("LOCAL new %d {%d,%u,%pI4+%hu}", + local->debug_id, + local->srx.transport_type, + local->srx.transport.family, + &local->srx.transport.sin.sin_addr, + ntohs(local->srx.transport.sin.sin_port)); + + _leave(" = %p [new]", local); + return local; + +found_local: + rxrpc_get_local(local); + read_unlock_bh(&rxrpc_local_lock); + up_write(&rxrpc_local_sem); + + _net("LOCAL old %d {%d,%u,%pI4+%hu}", + local->debug_id, + local->srx.transport_type, + local->srx.transport.family, + &local->srx.transport.sin.sin_addr, + ntohs(local->srx.transport.sin.sin_port)); + + _leave(" = %p [reuse]", local); + return local; +} + +/* + * release a local endpoint + */ +void rxrpc_put_local(struct rxrpc_local *local) +{ + _enter("%p{u=%d}", local, atomic_read(&local->usage)); + + ASSERTCMP(atomic_read(&local->usage), >, 0); + + /* to prevent a race, the decrement and the dequeue must be effectively + * atomic */ + write_lock_bh(&rxrpc_local_lock); + if (unlikely(atomic_dec_and_test(&local->usage))) { + _debug("destroy local"); + rxrpc_queue_work(&local->destroyer); + } + write_unlock_bh(&rxrpc_local_lock); + _leave(""); +} + +/* + * destroy a local endpoint + */ +static void rxrpc_destroy_local(struct work_struct *work) +{ + struct rxrpc_local *local = + container_of(work, struct rxrpc_local, destroyer); + + _enter("%p{%d}", local, atomic_read(&local->usage)); + + down_write(&rxrpc_local_sem); + + write_lock_bh(&rxrpc_local_lock); + if (atomic_read(&local->usage) > 0) { + write_unlock_bh(&rxrpc_local_lock); + up_read(&rxrpc_local_sem); + _leave(" [resurrected]"); + 
return; + } + + list_del(&local->link); + local->socket->sk->sk_user_data = NULL; + write_unlock_bh(&rxrpc_local_lock); + + downgrade_write(&rxrpc_local_sem); + + ASSERT(list_empty(&local->services)); + ASSERT(!work_pending(&local->acceptor)); + ASSERT(!work_pending(&local->rejecter)); + + /* finish cleaning up the local descriptor */ + rxrpc_purge_queue(&local->accept_queue); + rxrpc_purge_queue(&local->reject_queue); + kernel_sock_shutdown(local->socket, SHUT_RDWR); + sock_release(local->socket); + + up_read(&rxrpc_local_sem); + + _net("DESTROY LOCAL %d", local->debug_id); + kfree(local); + + if (list_empty(&rxrpc_locals)) + wake_up_all(&rxrpc_local_wq); + + _leave(""); +} + +/* + * preemptively destroy all local endpoints rather than waiting for + * them to be destroyed + */ +void __exit rxrpc_destroy_all_locals(void) +{ + DECLARE_WAITQUEUE(myself,current); + + _enter(""); + + /* we simply have to wait for them to go away */ + if (!list_empty(&rxrpc_locals)) { + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&rxrpc_local_wq, &myself); + + while (!list_empty(&rxrpc_locals)) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + + remove_wait_queue(&rxrpc_local_wq, &myself); + set_current_state(TASK_RUNNING); + } + + _leave(""); +} diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c new file mode 100644 index 00000000..5f22e263 --- /dev/null +++ b/net/rxrpc/ar-output.c @@ -0,0 +1,738 @@ +/* RxRPC packet transmission + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
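A minimal kernel-side sketch of how a service is expected to drive the senders defined later in this file; rxrpc_kernel_send_data() and rxrpc_kernel_abort_call() are taken from the code below, while the helper name, the abort code and the way the call was obtained (e.g. via rxrpc_kernel_begin_call()) are illustrative assumptions, not part of this patch:

/* Illustrative only: send one complete request on an existing call and
 * abort the call if the send fails.  A real caller also needs
 * <linux/uio.h> and <net/af_rxrpc.h>. */
static int example_send_whole_request(struct rxrpc_call *call,
				      void *request, size_t len)
{
	struct kvec iov = { .iov_base = request, .iov_len = len };
	struct msghdr msg = {
		.msg_iov    = (struct iovec *)&iov,
		.msg_iovlen = 1,
		/* msg_flags left clear: no MSG_MORE, so this buffer ends the
		 * transmission phase */
	};
	int ret;

	ret = rxrpc_kernel_send_data(call, &msg, len);
	if (ret < 0)
		rxrpc_kernel_abort_call(call, 1 /* illustrative abort code */);
	return ret;
}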
+ */ + +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +int rxrpc_resend_timeout = 4; + +static int rxrpc_send_data(struct kiocb *iocb, + struct rxrpc_sock *rx, + struct rxrpc_call *call, + struct msghdr *msg, size_t len); + +/* + * extract control messages from the sendmsg() control buffer + */ +static int rxrpc_sendmsg_cmsg(struct rxrpc_sock *rx, struct msghdr *msg, + unsigned long *user_call_ID, + enum rxrpc_command *command, + u32 *abort_code, + bool server) +{ + struct cmsghdr *cmsg; + int len; + + *command = RXRPC_CMD_SEND_DATA; + + if (msg->msg_controllen == 0) + return -EINVAL; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + len = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); + _debug("CMSG %d, %d, %d", + cmsg->cmsg_level, cmsg->cmsg_type, len); + + if (cmsg->cmsg_level != SOL_RXRPC) + continue; + + switch (cmsg->cmsg_type) { + case RXRPC_USER_CALL_ID: + if (msg->msg_flags & MSG_CMSG_COMPAT) { + if (len != sizeof(u32)) + return -EINVAL; + *user_call_ID = *(u32 *) CMSG_DATA(cmsg); + } else { + if (len != sizeof(unsigned long)) + return -EINVAL; + *user_call_ID = *(unsigned long *) + CMSG_DATA(cmsg); + } + _debug("User Call ID %lx", *user_call_ID); + break; + + case RXRPC_ABORT: + if (*command != RXRPC_CMD_SEND_DATA) + return -EINVAL; + *command = RXRPC_CMD_SEND_ABORT; + if (len != sizeof(*abort_code)) + return -EINVAL; + *abort_code = *(unsigned int *) CMSG_DATA(cmsg); + _debug("Abort %x", *abort_code); + if (*abort_code == 0) + return -EINVAL; + break; + + case RXRPC_ACCEPT: + if (*command != RXRPC_CMD_SEND_DATA) + return -EINVAL; + *command = RXRPC_CMD_ACCEPT; + if (len != 0) + return -EINVAL; + if (!server) + return -EISCONN; + break; + + default: + return -EINVAL; + } + } + + _leave(" = 0"); + return 0; +} + +/* + * abort a call, sending an ABORT packet to the peer + */ +static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code) +{ + write_lock_bh(&call->state_lock); + + if (call->state <= RXRPC_CALL_COMPLETE) { + call->state = RXRPC_CALL_LOCALLY_ABORTED; + call->abort_code = abort_code; + set_bit(RXRPC_CALL_ABORT, &call->events); + del_timer_sync(&call->resend_timer); + del_timer_sync(&call->ack_timer); + clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events); + clear_bit(RXRPC_CALL_ACK, &call->events); + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + rxrpc_queue_call(call); + } + + write_unlock_bh(&call->state_lock); +} + +/* + * send a message forming part of a client call through an RxRPC socket + * - caller holds the socket locked + * - the socket may be either a client socket or a server socket + */ +int rxrpc_client_sendmsg(struct kiocb *iocb, struct rxrpc_sock *rx, + struct rxrpc_transport *trans, struct msghdr *msg, + size_t len) +{ + struct rxrpc_conn_bundle *bundle; + enum rxrpc_command cmd; + struct rxrpc_call *call; + unsigned long user_call_ID = 0; + struct key *key; + __be16 service_id; + u32 abort_code = 0; + int ret; + + _enter(""); + + ASSERT(trans != NULL); + + ret = rxrpc_sendmsg_cmsg(rx, msg, &user_call_ID, &cmd, &abort_code, + false); + if (ret < 0) + return ret; + + bundle = NULL; + if (trans) { + service_id = rx->service_id; + if (msg->msg_name) { + struct sockaddr_rxrpc *srx = + (struct sockaddr_rxrpc *) msg->msg_name; + service_id = htons(srx->srx_service); + } + key = rx->key; + if (key && !rx->key->payload.data) + key = NULL; + bundle = rxrpc_get_bundle(rx, trans, key, service_id, + GFP_KERNEL); + if (IS_ERR(bundle)) + 
return PTR_ERR(bundle); + } + + call = rxrpc_get_client_call(rx, trans, bundle, user_call_ID, + abort_code == 0, GFP_KERNEL); + if (trans) + rxrpc_put_bundle(trans, bundle); + if (IS_ERR(call)) { + _leave(" = %ld", PTR_ERR(call)); + return PTR_ERR(call); + } + + _debug("CALL %d USR %lx ST %d on CONN %p", + call->debug_id, call->user_call_ID, call->state, call->conn); + + if (call->state >= RXRPC_CALL_COMPLETE) { + /* it's too late for this call */ + ret = -ESHUTDOWN; + } else if (cmd == RXRPC_CMD_SEND_ABORT) { + rxrpc_send_abort(call, abort_code); + } else if (cmd != RXRPC_CMD_SEND_DATA) { + ret = -EINVAL; + } else if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST) { + /* request phase complete for this client call */ + ret = -EPROTO; + } else { + ret = rxrpc_send_data(iocb, rx, call, msg, len); + } + + rxrpc_put_call(call); + _leave(" = %d", ret); + return ret; +} + +/** + * rxrpc_kernel_send_data - Allow a kernel service to send data on a call + * @call: The call to send data through + * @msg: The data to send + * @len: The amount of data to send + * + * Allow a kernel service to send data on a call. The call must be in an state + * appropriate to sending data. No control data should be supplied in @msg, + * nor should an address be supplied. MSG_MORE should be flagged if there's + * more data to come, otherwise this data will end the transmission phase. + */ +int rxrpc_kernel_send_data(struct rxrpc_call *call, struct msghdr *msg, + size_t len) +{ + int ret; + + _enter("{%d,%s},", call->debug_id, rxrpc_call_states[call->state]); + + ASSERTCMP(msg->msg_name, ==, NULL); + ASSERTCMP(msg->msg_control, ==, NULL); + + lock_sock(&call->socket->sk); + + _debug("CALL %d USR %lx ST %d on CONN %p", + call->debug_id, call->user_call_ID, call->state, call->conn); + + if (call->state >= RXRPC_CALL_COMPLETE) { + ret = -ESHUTDOWN; /* it's too late for this call */ + } else if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST && + call->state != RXRPC_CALL_SERVER_ACK_REQUEST && + call->state != RXRPC_CALL_SERVER_SEND_REPLY) { + ret = -EPROTO; /* request phase complete for this client call */ + } else { + mm_segment_t oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = rxrpc_send_data(NULL, call->socket, call, msg, len); + set_fs(oldfs); + } + + release_sock(&call->socket->sk); + _leave(" = %d", ret); + return ret; +} + +EXPORT_SYMBOL(rxrpc_kernel_send_data); + +/* + * rxrpc_kernel_abort_call - Allow a kernel service to abort a call + * @call: The call to be aborted + * @abort_code: The abort code to stick into the ABORT packet + * + * Allow a kernel service to abort a call, if it's still in an abortable state. 
+ */ +void rxrpc_kernel_abort_call(struct rxrpc_call *call, u32 abort_code) +{ + _enter("{%d},%d", call->debug_id, abort_code); + + lock_sock(&call->socket->sk); + + _debug("CALL %d USR %lx ST %d on CONN %p", + call->debug_id, call->user_call_ID, call->state, call->conn); + + if (call->state < RXRPC_CALL_COMPLETE) + rxrpc_send_abort(call, abort_code); + + release_sock(&call->socket->sk); + _leave(""); +} + +EXPORT_SYMBOL(rxrpc_kernel_abort_call); + +/* + * send a message through a server socket + * - caller holds the socket locked + */ +int rxrpc_server_sendmsg(struct kiocb *iocb, struct rxrpc_sock *rx, + struct msghdr *msg, size_t len) +{ + enum rxrpc_command cmd; + struct rxrpc_call *call; + unsigned long user_call_ID = 0; + u32 abort_code = 0; + int ret; + + _enter(""); + + ret = rxrpc_sendmsg_cmsg(rx, msg, &user_call_ID, &cmd, &abort_code, + true); + if (ret < 0) + return ret; + + if (cmd == RXRPC_CMD_ACCEPT) { + call = rxrpc_accept_call(rx, user_call_ID); + if (IS_ERR(call)) + return PTR_ERR(call); + rxrpc_put_call(call); + return 0; + } + + call = rxrpc_find_server_call(rx, user_call_ID); + if (!call) + return -EBADSLT; + if (call->state >= RXRPC_CALL_COMPLETE) { + ret = -ESHUTDOWN; + goto out; + } + + switch (cmd) { + case RXRPC_CMD_SEND_DATA: + if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST && + call->state != RXRPC_CALL_SERVER_ACK_REQUEST && + call->state != RXRPC_CALL_SERVER_SEND_REPLY) { + /* Tx phase not yet begun for this call */ + ret = -EPROTO; + break; + } + + ret = rxrpc_send_data(iocb, rx, call, msg, len); + break; + + case RXRPC_CMD_SEND_ABORT: + rxrpc_send_abort(call, abort_code); + break; + default: + BUG(); + } + + out: + rxrpc_put_call(call); + _leave(" = %d", ret); + return ret; +} + +/* + * send a packet through the transport endpoint + */ +int rxrpc_send_packet(struct rxrpc_transport *trans, struct sk_buff *skb) +{ + struct kvec iov[1]; + struct msghdr msg; + int ret, opt; + + _enter(",{%d}", skb->len); + + iov[0].iov_base = skb->head; + iov[0].iov_len = skb->len; + + msg.msg_name = &trans->peer->srx.transport.sin; + msg.msg_namelen = sizeof(trans->peer->srx.transport.sin); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + /* send the packet with the don't fragment bit set if we currently + * think it's small enough */ + if (skb->len - sizeof(struct rxrpc_header) < trans->peer->maxdata) { + down_read(&trans->local->defrag_sem); + /* send the packet by UDP + * - returns -EMSGSIZE if UDP would have to fragment the packet + * to go out of the interface + * - in which case, we'll have processed the ICMP error + * message and update the peer record + */ + ret = kernel_sendmsg(trans->local->socket, &msg, iov, 1, + iov[0].iov_len); + + up_read(&trans->local->defrag_sem); + if (ret == -EMSGSIZE) + goto send_fragmentable; + + _leave(" = %d [%u]", ret, trans->peer->maxdata); + return ret; + } + +send_fragmentable: + /* attempt to send this message with fragmentation enabled */ + _debug("send fragment"); + + down_write(&trans->local->defrag_sem); + opt = IP_PMTUDISC_DONT; + ret = kernel_setsockopt(trans->local->socket, SOL_IP, IP_MTU_DISCOVER, + (char *) &opt, sizeof(opt)); + if (ret == 0) { + ret = kernel_sendmsg(trans->local->socket, &msg, iov, 1, + iov[0].iov_len); + + opt = IP_PMTUDISC_DO; + kernel_setsockopt(trans->local->socket, SOL_IP, + IP_MTU_DISCOVER, (char *) &opt, sizeof(opt)); + } + + up_write(&trans->local->defrag_sem); + _leave(" = %d [frag %u]", ret, trans->peer->maxdata); + return ret; +} + +/* + * wait for space to appear in 
the transmit/ACK window + * - caller holds the socket locked + */ +static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, + struct rxrpc_call *call, + long *timeo) +{ + DECLARE_WAITQUEUE(myself, current); + int ret; + + _enter(",{%d},%ld", + CIRC_SPACE(call->acks_head, call->acks_tail, call->acks_winsz), + *timeo); + + add_wait_queue(&call->tx_waitq, &myself); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + ret = 0; + if (CIRC_SPACE(call->acks_head, call->acks_tail, + call->acks_winsz) > 0) + break; + if (signal_pending(current)) { + ret = sock_intr_errno(*timeo); + break; + } + + release_sock(&rx->sk); + *timeo = schedule_timeout(*timeo); + lock_sock(&rx->sk); + } + + remove_wait_queue(&call->tx_waitq, &myself); + set_current_state(TASK_RUNNING); + _leave(" = %d", ret); + return ret; +} + +/* + * attempt to schedule an instant Tx resend + */ +static inline void rxrpc_instant_resend(struct rxrpc_call *call) +{ + read_lock_bh(&call->state_lock); + if (try_to_del_timer_sync(&call->resend_timer) >= 0) { + clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); + if (call->state < RXRPC_CALL_COMPLETE && + !test_and_set_bit(RXRPC_CALL_RESEND_TIMER, &call->events)) + rxrpc_queue_call(call); + } + read_unlock_bh(&call->state_lock); +} + +/* + * queue a packet for transmission, set the resend timer and attempt + * to send the packet immediately + */ +static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, + bool last) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + int ret; + + _net("queue skb %p [%d]", skb, call->acks_head); + + ASSERT(call->acks_window != NULL); + call->acks_window[call->acks_head] = (unsigned long) skb; + smp_wmb(); + call->acks_head = (call->acks_head + 1) & (call->acks_winsz - 1); + + if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) { + _debug("________awaiting reply/ACK__________"); + write_lock_bh(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_CLIENT_SEND_REQUEST: + call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY; + break; + case RXRPC_CALL_SERVER_ACK_REQUEST: + call->state = RXRPC_CALL_SERVER_SEND_REPLY; + if (!last) + break; + case RXRPC_CALL_SERVER_SEND_REPLY: + call->state = RXRPC_CALL_SERVER_AWAIT_ACK; + break; + default: + break; + } + write_unlock_bh(&call->state_lock); + } + + _proto("Tx DATA %%%u { #%u }", + ntohl(sp->hdr.serial), ntohl(sp->hdr.seq)); + + sp->need_resend = 0; + sp->resend_at = jiffies + rxrpc_resend_timeout * HZ; + if (!test_and_set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags)) { + _debug("run timer"); + call->resend_timer.expires = sp->resend_at; + add_timer(&call->resend_timer); + } + + /* attempt to cancel the rx-ACK timer, deferring reply transmission if + * we're ACK'ing the request phase of an incoming call */ + ret = -EAGAIN; + if (try_to_del_timer_sync(&call->ack_timer) >= 0) { + /* the packet may be freed by rxrpc_process_call() before this + * returns */ + ret = rxrpc_send_packet(call->conn->trans, skb); + _net("sent skb %p", skb); + } else { + _debug("failed to delete ACK timer"); + } + + if (ret < 0) { + _debug("need instant resend %d", ret); + sp->need_resend = 1; + rxrpc_instant_resend(call); + } + + _leave(""); +} + +/* + * send data through a socket + * - must be called in process context + * - caller holds the socket locked + */ +static int rxrpc_send_data(struct kiocb *iocb, + struct rxrpc_sock *rx, + struct rxrpc_call *call, + struct msghdr *msg, size_t len) +{ + struct rxrpc_skb_priv *sp; + unsigned char __user *from; + struct sk_buff *skb; + struct iovec *iov; + struct sock *sk = 
&rx->sk; + long timeo; + bool more; + int ret, ioc, segment, copied; + + _enter(",,,{%zu},%zu", msg->msg_iovlen, len); + + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + + /* this should be in poll */ + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + return -EPIPE; + + iov = msg->msg_iov; + ioc = msg->msg_iovlen - 1; + from = iov->iov_base; + segment = iov->iov_len; + iov++; + more = msg->msg_flags & MSG_MORE; + + skb = call->tx_pending; + call->tx_pending = NULL; + + copied = 0; + do { + int copy; + + if (segment > len) + segment = len; + + _debug("SEGMENT %d @%p", segment, from); + + if (!skb) { + size_t size, chunk, max, space; + + _debug("alloc"); + + if (CIRC_SPACE(call->acks_head, call->acks_tail, + call->acks_winsz) <= 0) { + ret = -EAGAIN; + if (msg->msg_flags & MSG_DONTWAIT) + goto maybe_error; + ret = rxrpc_wait_for_tx_window(rx, call, + &timeo); + if (ret < 0) + goto maybe_error; + } + + max = call->conn->trans->peer->maxdata; + max -= call->conn->security_size; + max &= ~(call->conn->size_align - 1UL); + + chunk = max; + if (chunk > len && !more) + chunk = len; + + space = chunk + call->conn->size_align; + space &= ~(call->conn->size_align - 1UL); + + size = space + call->conn->header_size; + + _debug("SIZE: %zu/%zu/%zu", chunk, space, size); + + /* create a buffer that we can retain until it's ACK'd */ + skb = sock_alloc_send_skb( + sk, size, msg->msg_flags & MSG_DONTWAIT, &ret); + if (!skb) + goto maybe_error; + + rxrpc_new_skb(skb); + + _debug("ALLOC SEND %p", skb); + + ASSERTCMP(skb->mark, ==, 0); + + _debug("HS: %u", call->conn->header_size); + skb_reserve(skb, call->conn->header_size); + skb->len += call->conn->header_size; + + sp = rxrpc_skb(skb); + sp->remain = chunk; + if (sp->remain > skb_tailroom(skb)) + sp->remain = skb_tailroom(skb); + + _net("skb: hr %d, tr %d, hl %d, rm %d", + skb_headroom(skb), + skb_tailroom(skb), + skb_headlen(skb), + sp->remain); + + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + + _debug("append"); + sp = rxrpc_skb(skb); + + /* append next segment of data to the current buffer */ + copy = skb_tailroom(skb); + ASSERTCMP(copy, >, 0); + if (copy > segment) + copy = segment; + if (copy > sp->remain) + copy = sp->remain; + + _debug("add"); + ret = skb_add_data(skb, from, copy); + _debug("added"); + if (ret < 0) + goto efault; + sp->remain -= copy; + skb->mark += copy; + copied += copy; + + len -= copy; + segment -= copy; + from += copy; + while (segment == 0 && ioc > 0) { + from = iov->iov_base; + segment = iov->iov_len; + iov++; + ioc--; + } + if (len == 0) { + segment = 0; + ioc = 0; + } + + /* check for the far side aborting the call or a network error + * occurring */ + if (call->state > RXRPC_CALL_COMPLETE) + goto call_aborted; + + /* add the packet to the send queue if it's now full */ + if (sp->remain <= 0 || (segment == 0 && !more)) { + struct rxrpc_connection *conn = call->conn; + size_t pad; + + /* pad out if we're using security */ + if (conn->security) { + pad = conn->security_size + skb->mark; + pad = conn->size_align - pad; + pad &= conn->size_align - 1; + _debug("pad %zu", pad); + if (pad) + memset(skb_put(skb, pad), 0, pad); + } + + sp->hdr.epoch = conn->epoch; + sp->hdr.cid = call->cid; + sp->hdr.callNumber = call->call_id; + sp->hdr.seq = + htonl(atomic_inc_return(&call->sequence)); + sp->hdr.serial = + htonl(atomic_inc_return(&conn->serial)); + sp->hdr.type = RXRPC_PACKET_TYPE_DATA; + sp->hdr.userStatus = 0; + sp->hdr.securityIndex = conn->security_ix; 
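For contrast with the in-kernel path above, a hedged userspace sketch of the sendmsg() convention parsed by rxrpc_sendmsg_cmsg(): one RXRPC_USER_CALL_ID control message selects the call, and MSG_MORE keeps the request phase open. The AF_RXRPC socket fd, the header supplying SOL_RXRPC and RXRPC_USER_CALL_ID, and the helper name are assumptions, not part of this patch:

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static ssize_t send_request_chunk(int fd, unsigned long call_id,
				  const void *data, size_t len, int more)
{
	struct iovec iov = { .iov_base = (void *)data, .iov_len = len };
	char control[CMSG_SPACE(sizeof(call_id))];
	struct msghdr msg = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= control,
		.msg_controllen	= sizeof(control),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	/* the call is identified by an unsigned long user call ID */
	cmsg->cmsg_level = SOL_RXRPC;
	cmsg->cmsg_type	 = RXRPC_USER_CALL_ID;
	cmsg->cmsg_len	 = CMSG_LEN(sizeof(call_id));
	memcpy(CMSG_DATA(cmsg), &call_id, sizeof(call_id));

	/* without MSG_MORE this segment becomes the last data packet */
	return sendmsg(fd, &msg, more ? MSG_MORE : 0);
}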
+ sp->hdr._rsvd = 0; + sp->hdr.serviceId = conn->service_id; + + sp->hdr.flags = conn->out_clientflag; + if (len == 0 && !more) + sp->hdr.flags |= RXRPC_LAST_PACKET; + else if (CIRC_SPACE(call->acks_head, call->acks_tail, + call->acks_winsz) > 1) + sp->hdr.flags |= RXRPC_MORE_PACKETS; + + ret = rxrpc_secure_packet( + call, skb, skb->mark, + skb->head + sizeof(struct rxrpc_header)); + if (ret < 0) + goto out; + + memcpy(skb->head, &sp->hdr, + sizeof(struct rxrpc_header)); + rxrpc_queue_packet(call, skb, segment == 0 && !more); + skb = NULL; + } + + } while (segment > 0); + +success: + ret = copied; +out: + call->tx_pending = skb; + _leave(" = %d", ret); + return ret; + +call_aborted: + rxrpc_free_skb(skb); + if (call->state == RXRPC_CALL_NETWORK_ERROR) + ret = call->conn->trans->peer->net_error; + else + ret = -ECONNABORTED; + _leave(" = %d", ret); + return ret; + +maybe_error: + if (copied) + goto success; + goto out; + +efault: + ret = -EFAULT; + goto out; +} diff --git a/net/rxrpc/ar-peer.c b/net/rxrpc/ar-peer.c new file mode 100644 index 00000000..2754f098 --- /dev/null +++ b/net/rxrpc/ar-peer.c @@ -0,0 +1,303 @@ +/* RxRPC remote transport endpoint management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +static LIST_HEAD(rxrpc_peers); +static DEFINE_RWLOCK(rxrpc_peer_lock); +static DECLARE_WAIT_QUEUE_HEAD(rxrpc_peer_wq); + +static void rxrpc_destroy_peer(struct work_struct *work); + +/* + * assess the MTU size for the network interface through which this peer is + * reached + */ +static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer) +{ + struct rtable *rt; + struct flowi4 fl4; + + peer->if_mtu = 1500; + + rt = ip_route_output_ports(&init_net, &fl4, NULL, + peer->srx.transport.sin.sin_addr.s_addr, 0, + htons(7000), htons(7001), + IPPROTO_UDP, 0, 0); + if (IS_ERR(rt)) { + _leave(" [route err %ld]", PTR_ERR(rt)); + return; + } + + peer->if_mtu = dst_mtu(&rt->dst); + dst_release(&rt->dst); + + _leave(" [if_mtu %u]", peer->if_mtu); +} + +/* + * allocate a new peer + */ +static struct rxrpc_peer *rxrpc_alloc_peer(struct sockaddr_rxrpc *srx, + gfp_t gfp) +{ + struct rxrpc_peer *peer; + + _enter(""); + + peer = kzalloc(sizeof(struct rxrpc_peer), gfp); + if (peer) { + INIT_WORK(&peer->destroyer, &rxrpc_destroy_peer); + INIT_LIST_HEAD(&peer->link); + INIT_LIST_HEAD(&peer->error_targets); + spin_lock_init(&peer->lock); + atomic_set(&peer->usage, 1); + peer->debug_id = atomic_inc_return(&rxrpc_debug_id); + memcpy(&peer->srx, srx, sizeof(*srx)); + + rxrpc_assess_MTU_size(peer); + peer->mtu = peer->if_mtu; + + if (srx->transport.family == AF_INET) { + peer->hdrsize = sizeof(struct iphdr); + switch (srx->transport_type) { + case SOCK_DGRAM: + peer->hdrsize += sizeof(struct udphdr); + break; + default: + BUG(); + break; + } + } else { + BUG(); + } + + peer->hdrsize += sizeof(struct rxrpc_header); + peer->maxdata = peer->mtu - peer->hdrsize; + } + + _leave(" = %p", peer); + return peer; +} + +/* + * obtain a remote transport endpoint for the specified address + */ +struct rxrpc_peer *rxrpc_get_peer(struct sockaddr_rxrpc 
*srx, gfp_t gfp) +{ + struct rxrpc_peer *peer, *candidate; + const char *new = "old"; + int usage; + + _enter("{%d,%d,%pI4+%hu}", + srx->transport_type, + srx->transport_len, + &srx->transport.sin.sin_addr, + ntohs(srx->transport.sin.sin_port)); + + /* search the peer list first */ + read_lock_bh(&rxrpc_peer_lock); + list_for_each_entry(peer, &rxrpc_peers, link) { + _debug("check PEER %d { u=%d t=%d l=%d }", + peer->debug_id, + atomic_read(&peer->usage), + peer->srx.transport_type, + peer->srx.transport_len); + + if (atomic_read(&peer->usage) > 0 && + peer->srx.transport_type == srx->transport_type && + peer->srx.transport_len == srx->transport_len && + memcmp(&peer->srx.transport, + &srx->transport, + srx->transport_len) == 0) + goto found_extant_peer; + } + read_unlock_bh(&rxrpc_peer_lock); + + /* not yet present - create a candidate for a new record and then + * redo the search */ + candidate = rxrpc_alloc_peer(srx, gfp); + if (!candidate) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + write_lock_bh(&rxrpc_peer_lock); + + list_for_each_entry(peer, &rxrpc_peers, link) { + if (atomic_read(&peer->usage) > 0 && + peer->srx.transport_type == srx->transport_type && + peer->srx.transport_len == srx->transport_len && + memcmp(&peer->srx.transport, + &srx->transport, + srx->transport_len) == 0) + goto found_extant_second; + } + + /* we can now add the new candidate to the list */ + peer = candidate; + candidate = NULL; + usage = atomic_read(&peer->usage); + + list_add_tail(&peer->link, &rxrpc_peers); + write_unlock_bh(&rxrpc_peer_lock); + new = "new"; + +success: + _net("PEER %s %d {%d,%u,%pI4+%hu}", + new, + peer->debug_id, + peer->srx.transport_type, + peer->srx.transport.family, + &peer->srx.transport.sin.sin_addr, + ntohs(peer->srx.transport.sin.sin_port)); + + _leave(" = %p {u=%d}", peer, usage); + return peer; + + /* we found the peer in the list immediately */ +found_extant_peer: + usage = atomic_inc_return(&peer->usage); + read_unlock_bh(&rxrpc_peer_lock); + goto success; + + /* we found the peer on the second time through the list */ +found_extant_second: + usage = atomic_inc_return(&peer->usage); + write_unlock_bh(&rxrpc_peer_lock); + kfree(candidate); + goto success; +} + +/* + * find the peer associated with a packet + */ +struct rxrpc_peer *rxrpc_find_peer(struct rxrpc_local *local, + __be32 addr, __be16 port) +{ + struct rxrpc_peer *peer; + + _enter(""); + + /* search the peer list */ + read_lock_bh(&rxrpc_peer_lock); + + if (local->srx.transport.family == AF_INET && + local->srx.transport_type == SOCK_DGRAM + ) { + list_for_each_entry(peer, &rxrpc_peers, link) { + if (atomic_read(&peer->usage) > 0 && + peer->srx.transport_type == SOCK_DGRAM && + peer->srx.transport.family == AF_INET && + peer->srx.transport.sin.sin_port == port && + peer->srx.transport.sin.sin_addr.s_addr == addr) + goto found_UDP_peer; + } + + goto new_UDP_peer; + } + + read_unlock_bh(&rxrpc_peer_lock); + _leave(" = -EAFNOSUPPORT"); + return ERR_PTR(-EAFNOSUPPORT); + +found_UDP_peer: + _net("Rx UDP DGRAM from peer %d", peer->debug_id); + atomic_inc(&peer->usage); + read_unlock_bh(&rxrpc_peer_lock); + _leave(" = %p", peer); + return peer; + +new_UDP_peer: + _net("Rx UDP DGRAM from NEW peer %d", peer->debug_id); + read_unlock_bh(&rxrpc_peer_lock); + _leave(" = -EBUSY [new]"); + return ERR_PTR(-EBUSY); +} + +/* + * release a remote transport endpoint + */ +void rxrpc_put_peer(struct rxrpc_peer *peer) +{ + _enter("%p{u=%d}", peer, atomic_read(&peer->usage)); + + ASSERTCMP(atomic_read(&peer->usage), >, 
0); + + if (likely(!atomic_dec_and_test(&peer->usage))) { + _leave(" [in use]"); + return; + } + + rxrpc_queue_work(&peer->destroyer); + _leave(""); +} + +/* + * destroy a remote transport endpoint + */ +static void rxrpc_destroy_peer(struct work_struct *work) +{ + struct rxrpc_peer *peer = + container_of(work, struct rxrpc_peer, destroyer); + + _enter("%p{%d}", peer, atomic_read(&peer->usage)); + + write_lock_bh(&rxrpc_peer_lock); + list_del(&peer->link); + write_unlock_bh(&rxrpc_peer_lock); + + _net("DESTROY PEER %d", peer->debug_id); + kfree(peer); + + if (list_empty(&rxrpc_peers)) + wake_up_all(&rxrpc_peer_wq); + _leave(""); +} + +/* + * preemptively destroy all the peer records from a transport endpoint rather + * than waiting for them to time out + */ +void __exit rxrpc_destroy_all_peers(void) +{ + DECLARE_WAITQUEUE(myself,current); + + _enter(""); + + /* we simply have to wait for them to go away */ + if (!list_empty(&rxrpc_peers)) { + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&rxrpc_peer_wq, &myself); + + while (!list_empty(&rxrpc_peers)) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + + remove_wait_queue(&rxrpc_peer_wq, &myself); + set_current_state(TASK_RUNNING); + } + + _leave(""); +} diff --git a/net/rxrpc/ar-proc.c b/net/rxrpc/ar-proc.c new file mode 100644 index 00000000..38047f71 --- /dev/null +++ b/net/rxrpc/ar-proc.c @@ -0,0 +1,192 @@ +/* /proc/net/ support for AF_RXRPC + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
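A worked example of the header and MTU arithmetic done by rxrpc_alloc_peer() above, assuming a standard 1500-byte Ethernet MTU and taking the on-the-wire RxRPC packet header (the fields filled in by rxrpc_send_data() earlier) as 28 bytes:

/*
 *   peer->hdrsize = sizeof(struct iphdr) + sizeof(struct udphdr)
 *                 + sizeof(struct rxrpc_header)
 *                 = 20 + 8 + 28 = 56 bytes
 *   peer->maxdata = peer->mtu - peer->hdrsize = 1500 - 56 = 1444 bytes
 *
 * so rxrpc_send_packet() keeps the don't-fragment bit set for data
 * packets whose payload stays below 1444 bytes and only falls back to
 * fragmentable sends beyond that.
 */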
+ */ + +#include +#include +#include +#include "ar-internal.h" + +static const char *const rxrpc_conn_states[] = { + [RXRPC_CONN_UNUSED] = "Unused ", + [RXRPC_CONN_CLIENT] = "Client ", + [RXRPC_CONN_SERVER_UNSECURED] = "SvUnsec ", + [RXRPC_CONN_SERVER_CHALLENGING] = "SvChall ", + [RXRPC_CONN_SERVER] = "SvSecure", + [RXRPC_CONN_REMOTELY_ABORTED] = "RmtAbort", + [RXRPC_CONN_LOCALLY_ABORTED] = "LocAbort", + [RXRPC_CONN_NETWORK_ERROR] = "NetError", +}; + +/* + * generate a list of extant and dead calls in /proc/net/rxrpc_calls + */ +static void *rxrpc_call_seq_start(struct seq_file *seq, loff_t *_pos) +{ + read_lock(&rxrpc_call_lock); + return seq_list_start_head(&rxrpc_calls, *_pos); +} + +static void *rxrpc_call_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return seq_list_next(v, &rxrpc_calls, pos); +} + +static void rxrpc_call_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&rxrpc_call_lock); +} + +static int rxrpc_call_seq_show(struct seq_file *seq, void *v) +{ + struct rxrpc_transport *trans; + struct rxrpc_call *call; + char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1]; + + if (v == &rxrpc_calls) { + seq_puts(seq, + "Proto Local Remote " + " SvID ConnID CallID End Use State Abort " + " UserID\n"); + return 0; + } + + call = list_entry(v, struct rxrpc_call, link); + trans = call->conn->trans; + + sprintf(lbuff, "%pI4:%u", + &trans->local->srx.transport.sin.sin_addr, + ntohs(trans->local->srx.transport.sin.sin_port)); + + sprintf(rbuff, "%pI4:%u", + &trans->peer->srx.transport.sin.sin_addr, + ntohs(trans->peer->srx.transport.sin.sin_port)); + + seq_printf(seq, + "UDP %-22.22s %-22.22s %4x %08x %08x %s %3u" + " %-8.8s %08x %lx\n", + lbuff, + rbuff, + ntohs(call->conn->service_id), + ntohl(call->conn->cid), + ntohl(call->call_id), + call->conn->in_clientflag ? 
"Svc" : "Clt", + atomic_read(&call->usage), + rxrpc_call_states[call->state], + call->abort_code, + call->user_call_ID); + + return 0; +} + +static const struct seq_operations rxrpc_call_seq_ops = { + .start = rxrpc_call_seq_start, + .next = rxrpc_call_seq_next, + .stop = rxrpc_call_seq_stop, + .show = rxrpc_call_seq_show, +}; + +static int rxrpc_call_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rxrpc_call_seq_ops); +} + +const struct file_operations rxrpc_call_seq_fops = { + .owner = THIS_MODULE, + .open = rxrpc_call_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * generate a list of extant virtual connections in /proc/net/rxrpc_conns + */ +static void *rxrpc_connection_seq_start(struct seq_file *seq, loff_t *_pos) +{ + read_lock(&rxrpc_connection_lock); + return seq_list_start_head(&rxrpc_connections, *_pos); +} + +static void *rxrpc_connection_seq_next(struct seq_file *seq, void *v, + loff_t *pos) +{ + return seq_list_next(v, &rxrpc_connections, pos); +} + +static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&rxrpc_connection_lock); +} + +static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) +{ + struct rxrpc_connection *conn; + struct rxrpc_transport *trans; + char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1]; + + if (v == &rxrpc_connections) { + seq_puts(seq, + "Proto Local Remote " + " SvID ConnID Calls End Use State Key " + " Serial ISerial\n" + ); + return 0; + } + + conn = list_entry(v, struct rxrpc_connection, link); + trans = conn->trans; + + sprintf(lbuff, "%pI4:%u", + &trans->local->srx.transport.sin.sin_addr, + ntohs(trans->local->srx.transport.sin.sin_port)); + + sprintf(rbuff, "%pI4:%u", + &trans->peer->srx.transport.sin.sin_addr, + ntohs(trans->peer->srx.transport.sin.sin_port)); + + seq_printf(seq, + "UDP %-22.22s %-22.22s %4x %08x %08x %s %3u" + " %s %08x %08x %08x\n", + lbuff, + rbuff, + ntohs(conn->service_id), + ntohl(conn->cid), + conn->call_counter, + conn->in_clientflag ? "Svc" : "Clt", + atomic_read(&conn->usage), + rxrpc_conn_states[conn->state], + key_serial(conn->key), + atomic_read(&conn->serial), + atomic_read(&conn->hi_serial)); + + return 0; +} + +static const struct seq_operations rxrpc_connection_seq_ops = { + .start = rxrpc_connection_seq_start, + .next = rxrpc_connection_seq_next, + .stop = rxrpc_connection_seq_stop, + .show = rxrpc_connection_seq_show, +}; + + +static int rxrpc_connection_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rxrpc_connection_seq_ops); +} + +const struct file_operations rxrpc_connection_seq_fops = { + .owner = THIS_MODULE, + .open = rxrpc_connection_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c new file mode 100644 index 00000000..0c65013e --- /dev/null +++ b/net/rxrpc/ar-recvmsg.c @@ -0,0 +1,438 @@ +/* RxRPC recvmsg() implementation + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * removal a call's user ID from the socket tree to make the user ID available + * again and so that it won't be seen again in association with that call + */ +void rxrpc_remove_user_ID(struct rxrpc_sock *rx, struct rxrpc_call *call) +{ + _debug("RELEASE CALL %d", call->debug_id); + + if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { + write_lock_bh(&rx->call_lock); + rb_erase(&call->sock_node, &call->socket->calls); + clear_bit(RXRPC_CALL_HAS_USERID, &call->flags); + write_unlock_bh(&rx->call_lock); + } + + read_lock_bh(&call->state_lock); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && + !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) + rxrpc_queue_call(call); + read_unlock_bh(&call->state_lock); +} + +/* + * receive a message from an RxRPC socket + * - we need to be careful about two or more threads calling recvmsg + * simultaneously + */ +int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len, int flags) +{ + struct rxrpc_skb_priv *sp; + struct rxrpc_call *call = NULL, *continue_call = NULL; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct sk_buff *skb; + long timeo; + int copy, ret, ullen, offset, copied = 0; + u32 abort_code; + + DEFINE_WAIT(wait); + + _enter(",,,%zu,%d", len, flags); + + if (flags & (MSG_OOB | MSG_TRUNC)) + return -EOPNOTSUPP; + + ullen = msg->msg_flags & MSG_CMSG_COMPAT ? 4 : sizeof(unsigned long); + + timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT); + msg->msg_flags |= MSG_MORE; + + lock_sock(&rx->sk); + + for (;;) { + /* return immediately if a client socket has no outstanding + * calls */ + if (RB_EMPTY_ROOT(&rx->calls)) { + if (copied) + goto out; + if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) { + release_sock(&rx->sk); + if (continue_call) + rxrpc_put_call(continue_call); + return -ENODATA; + } + } + + /* get the next message on the Rx queue */ + skb = skb_peek(&rx->sk.sk_receive_queue); + if (!skb) { + /* nothing remains on the queue */ + if (copied && + (msg->msg_flags & MSG_PEEK || timeo == 0)) + goto out; + + /* wait for a message to turn up */ + release_sock(&rx->sk); + prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait, + TASK_INTERRUPTIBLE); + ret = sock_error(&rx->sk); + if (ret) + goto wait_error; + + if (skb_queue_empty(&rx->sk.sk_receive_queue)) { + if (signal_pending(current)) + goto wait_interrupted; + timeo = schedule_timeout(timeo); + } + finish_wait(sk_sleep(&rx->sk), &wait); + lock_sock(&rx->sk); + continue; + } + + peek_next_packet: + sp = rxrpc_skb(skb); + call = sp->call; + ASSERT(call != NULL); + + _debug("next pkt %s", rxrpc_pkts[sp->hdr.type]); + + /* make sure we wait for the state to be updated in this call */ + spin_lock_bh(&call->lock); + spin_unlock_bh(&call->lock); + + if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) { + _debug("packet from released call"); + if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) + BUG(); + rxrpc_free_skb(skb); + continue; + } + + /* determine whether to continue last data receive */ + if (continue_call) { + _debug("maybe cont"); + if (call != continue_call || + skb->mark != RXRPC_SKB_MARK_DATA) { + release_sock(&rx->sk); + rxrpc_put_call(continue_call); + _leave(" = %d [noncont]", copied); + return copied; + } + } + + rxrpc_get_call(call); + + /* copy the peer address and timestamp */ + if (!continue_call) { + if (msg->msg_name && msg->msg_namelen > 0) + memcpy(msg->msg_name, + &call->conn->trans->peer->srx, + sizeof(call->conn->trans->peer->srx)); + 
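A hedged userspace sketch of consuming what rxrpc_recvmsg() delivers: the payload arrives in the iovec, the owning call is identified by an RXRPC_USER_CALL_ID control message, and MSG_EOR marks the terminal message of the call. The AF_RXRPC socket fd, the header supplying the RXRPC_* constants and the helper name are assumptions, not part of this patch:

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static ssize_t recv_reply_chunk(int fd, void *buf, size_t len,
				unsigned long *call_id, int *last)
{
	char control[128];
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= control,
		.msg_controllen	= sizeof(control),
	};
	struct cmsghdr *cmsg;
	ssize_t n = recvmsg(fd, &msg, 0);

	if (n < 0)
		return n;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level != SOL_RXRPC)
			continue;
		if (cmsg->cmsg_type == RXRPC_USER_CALL_ID)
			memcpy(call_id, CMSG_DATA(cmsg), sizeof(*call_id));
		/* RXRPC_ABORT, RXRPC_NET_ERROR and RXRPC_LOCAL_ERROR carry a
		 * 32-bit code instead of data */
	}

	*last = (msg.msg_flags & MSG_EOR) != 0;
	return n;
}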
sock_recv_ts_and_drops(msg, &rx->sk, skb); + } + + /* receive the message */ + if (skb->mark != RXRPC_SKB_MARK_DATA) + goto receive_non_data_message; + + _debug("recvmsg DATA #%u { %d, %d }", + ntohl(sp->hdr.seq), skb->len, sp->offset); + + if (!continue_call) { + /* only set the control data once per recvmsg() */ + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + ullen, &call->user_call_ID); + if (ret < 0) + goto copy_error; + ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); + } + + ASSERTCMP(ntohl(sp->hdr.seq), >=, call->rx_data_recv); + ASSERTCMP(ntohl(sp->hdr.seq), <=, call->rx_data_recv + 1); + call->rx_data_recv = ntohl(sp->hdr.seq); + + ASSERTCMP(ntohl(sp->hdr.seq), >, call->rx_data_eaten); + + offset = sp->offset; + copy = skb->len - offset; + if (copy > len - copied) + copy = len - copied; + + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + ret = skb_copy_datagram_iovec(skb, offset, + msg->msg_iov, copy); + } else { + ret = skb_copy_and_csum_datagram_iovec(skb, offset, + msg->msg_iov); + if (ret == -EINVAL) + goto csum_copy_error; + } + + if (ret < 0) + goto copy_error; + + /* handle piecemeal consumption of data packets */ + _debug("copied %d+%d", copy, copied); + + offset += copy; + copied += copy; + + if (!(flags & MSG_PEEK)) + sp->offset = offset; + + if (sp->offset < skb->len) { + _debug("buffer full"); + ASSERTCMP(copied, ==, len); + break; + } + + /* we transferred the whole data packet */ + if (sp->hdr.flags & RXRPC_LAST_PACKET) { + _debug("last"); + if (call->conn->out_clientflag) { + /* last byte of reply received */ + ret = copied; + goto terminal_message; + } + + /* last bit of request received */ + if (!(flags & MSG_PEEK)) { + _debug("eat packet"); + if (skb_dequeue(&rx->sk.sk_receive_queue) != + skb) + BUG(); + rxrpc_free_skb(skb); + } + msg->msg_flags &= ~MSG_MORE; + break; + } + + /* move on to the next data message */ + _debug("next"); + if (!continue_call) + continue_call = sp->call; + else + rxrpc_put_call(call); + call = NULL; + + if (flags & MSG_PEEK) { + _debug("peek next"); + skb = skb->next; + if (skb == (struct sk_buff *) &rx->sk.sk_receive_queue) + break; + goto peek_next_packet; + } + + _debug("eat packet"); + if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) + BUG(); + rxrpc_free_skb(skb); + } + + /* end of non-terminal data packet reception for the moment */ + _debug("end rcv data"); +out: + release_sock(&rx->sk); + if (call) + rxrpc_put_call(call); + if (continue_call) + rxrpc_put_call(continue_call); + _leave(" = %d [data]", copied); + return copied; + + /* handle non-DATA messages such as aborts, incoming connections and + * final ACKs */ +receive_non_data_message: + _debug("non-data"); + + if (skb->mark == RXRPC_SKB_MARK_NEW_CALL) { + _debug("RECV NEW CALL"); + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &abort_code); + if (ret < 0) + goto copy_error; + if (!(flags & MSG_PEEK)) { + if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) + BUG(); + rxrpc_free_skb(skb); + } + goto out; + } + + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + ullen, &call->user_call_ID); + if (ret < 0) + goto copy_error; + ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); + + switch (skb->mark) { + case RXRPC_SKB_MARK_DATA: + BUG(); + case RXRPC_SKB_MARK_FINAL_ACK: + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &abort_code); + break; + case RXRPC_SKB_MARK_BUSY: + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_BUSY, 0, &abort_code); + break; + case RXRPC_SKB_MARK_REMOTE_ABORT: + abort_code = call->abort_code; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, 
&abort_code); + break; + case RXRPC_SKB_MARK_NET_ERROR: + _debug("RECV NET ERROR %d", sp->error); + abort_code = sp->error; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &abort_code); + break; + case RXRPC_SKB_MARK_LOCAL_ERROR: + _debug("RECV LOCAL ERROR %d", sp->error); + abort_code = sp->error; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, + &abort_code); + break; + default: + BUG(); + break; + } + + if (ret < 0) + goto copy_error; + +terminal_message: + _debug("terminal"); + msg->msg_flags &= ~MSG_MORE; + msg->msg_flags |= MSG_EOR; + + if (!(flags & MSG_PEEK)) { + _net("free terminal skb %p", skb); + if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) + BUG(); + rxrpc_free_skb(skb); + rxrpc_remove_user_ID(rx, call); + } + + release_sock(&rx->sk); + rxrpc_put_call(call); + if (continue_call) + rxrpc_put_call(continue_call); + _leave(" = %d", ret); + return ret; + +copy_error: + _debug("copy error"); + release_sock(&rx->sk); + rxrpc_put_call(call); + if (continue_call) + rxrpc_put_call(continue_call); + _leave(" = %d", ret); + return ret; + +csum_copy_error: + _debug("csum error"); + release_sock(&rx->sk); + if (continue_call) + rxrpc_put_call(continue_call); + rxrpc_kill_skb(skb); + skb_kill_datagram(&rx->sk, skb, flags); + rxrpc_put_call(call); + return -EAGAIN; + +wait_interrupted: + ret = sock_intr_errno(timeo); +wait_error: + finish_wait(sk_sleep(&rx->sk), &wait); + if (continue_call) + rxrpc_put_call(continue_call); + if (copied) + copied = ret; + _leave(" = %d [waitfail %d]", copied, ret); + return copied; + +} + +/** + * rxrpc_kernel_data_delivered - Record delivery of data message + * @skb: Message holding data + * + * Record the delivery of a data message. This permits RxRPC to keep its + * tracking correct. The socket buffer will be deleted. + */ +void rxrpc_kernel_data_delivered(struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_call *call = sp->call; + + ASSERTCMP(ntohl(sp->hdr.seq), >=, call->rx_data_recv); + ASSERTCMP(ntohl(sp->hdr.seq), <=, call->rx_data_recv + 1); + call->rx_data_recv = ntohl(sp->hdr.seq); + + ASSERTCMP(ntohl(sp->hdr.seq), >, call->rx_data_eaten); + rxrpc_free_skb(skb); +} + +EXPORT_SYMBOL(rxrpc_kernel_data_delivered); + +/** + * rxrpc_kernel_is_data_last - Determine if data message is last one + * @skb: Message holding data + * + * Determine if data message is last one for the parent call. + */ +bool rxrpc_kernel_is_data_last(struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + ASSERTCMP(skb->mark, ==, RXRPC_SKB_MARK_DATA); + + return sp->hdr.flags & RXRPC_LAST_PACKET; +} + +EXPORT_SYMBOL(rxrpc_kernel_is_data_last); + +/** + * rxrpc_kernel_get_abort_code - Get the abort code from an RxRPC abort message + * @skb: Message indicating an abort + * + * Get the abort code from an RxRPC abort message. + */ +u32 rxrpc_kernel_get_abort_code(struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + ASSERTCMP(skb->mark, ==, RXRPC_SKB_MARK_REMOTE_ABORT); + + return sp->call->abort_code; +} + +EXPORT_SYMBOL(rxrpc_kernel_get_abort_code); + +/** + * rxrpc_kernel_get_error - Get the error number from an RxRPC error message + * @skb: Message indicating an error + * + * Get the error number from an RxRPC error message. 
+ */ +int rxrpc_kernel_get_error_number(struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + return sp->error; +} + +EXPORT_SYMBOL(rxrpc_kernel_get_error_number); diff --git a/net/rxrpc/ar-security.c b/net/rxrpc/ar-security.c new file mode 100644 index 00000000..49b3cc31 --- /dev/null +++ b/net/rxrpc/ar-security.c @@ -0,0 +1,264 @@ +/* RxRPC security handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +static LIST_HEAD(rxrpc_security_methods); +static DECLARE_RWSEM(rxrpc_security_sem); + +/* + * get an RxRPC security module + */ +static struct rxrpc_security *rxrpc_security_get(struct rxrpc_security *sec) +{ + return try_module_get(sec->owner) ? sec : NULL; +} + +/* + * release an RxRPC security module + */ +static void rxrpc_security_put(struct rxrpc_security *sec) +{ + module_put(sec->owner); +} + +/* + * look up an rxrpc security module + */ +static struct rxrpc_security *rxrpc_security_lookup(u8 security_index) +{ + struct rxrpc_security *sec = NULL; + + _enter(""); + + down_read(&rxrpc_security_sem); + + list_for_each_entry(sec, &rxrpc_security_methods, link) { + if (sec->security_index == security_index) { + if (unlikely(!rxrpc_security_get(sec))) + break; + goto out; + } + } + + sec = NULL; +out: + up_read(&rxrpc_security_sem); + _leave(" = %p [%s]", sec, sec ? sec->name : ""); + return sec; +} + +/** + * rxrpc_register_security - register an RxRPC security handler + * @sec: security module + * + * register an RxRPC security handler for use by RxRPC + */ +int rxrpc_register_security(struct rxrpc_security *sec) +{ + struct rxrpc_security *psec; + int ret; + + _enter(""); + down_write(&rxrpc_security_sem); + + ret = -EEXIST; + list_for_each_entry(psec, &rxrpc_security_methods, link) { + if (psec->security_index == sec->security_index) + goto out; + } + + list_add(&sec->link, &rxrpc_security_methods); + + printk(KERN_NOTICE "RxRPC: Registered security type %d '%s'\n", + sec->security_index, sec->name); + ret = 0; + +out: + up_write(&rxrpc_security_sem); + _leave(" = %d", ret); + return ret; +} + +EXPORT_SYMBOL_GPL(rxrpc_register_security); + +/** + * rxrpc_unregister_security - unregister an RxRPC security handler + * @sec: security module + * + * unregister an RxRPC security handler + */ +void rxrpc_unregister_security(struct rxrpc_security *sec) +{ + + _enter(""); + down_write(&rxrpc_security_sem); + list_del_init(&sec->link); + up_write(&rxrpc_security_sem); + + printk(KERN_NOTICE "RxRPC: Unregistered security type %d '%s'\n", + sec->security_index, sec->name); +} + +EXPORT_SYMBOL_GPL(rxrpc_unregister_security); + +/* + * initialise the security on a client connection + */ +int rxrpc_init_client_conn_security(struct rxrpc_connection *conn) +{ + struct rxrpc_key_token *token; + struct rxrpc_security *sec; + struct key *key = conn->key; + int ret; + + _enter("{%d},{%x}", conn->debug_id, key_serial(key)); + + if (!key) + return 0; + + ret = key_validate(key); + if (ret < 0) + return ret; + + if (!key->payload.data) + return -EKEYREJECTED; + token = key->payload.data; + + sec = rxrpc_security_lookup(token->security_index); + if 
(!sec) + return -EKEYREJECTED; + conn->security = sec; + + ret = conn->security->init_connection_security(conn); + if (ret < 0) { + rxrpc_security_put(conn->security); + conn->security = NULL; + return ret; + } + + _leave(" = 0"); + return 0; +} + +/* + * initialise the security on a server connection + */ +int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) +{ + struct rxrpc_security *sec; + struct rxrpc_local *local = conn->trans->local; + struct rxrpc_sock *rx; + struct key *key; + key_ref_t kref; + char kdesc[5+1+3+1]; + + _enter(""); + + sprintf(kdesc, "%u:%u", ntohs(conn->service_id), conn->security_ix); + + sec = rxrpc_security_lookup(conn->security_ix); + if (!sec) { + _leave(" = -ENOKEY [lookup]"); + return -ENOKEY; + } + + /* find the service */ + read_lock_bh(&local->services_lock); + list_for_each_entry(rx, &local->services, listen_link) { + if (rx->service_id == conn->service_id) + goto found_service; + } + + /* the service appears to have died */ + read_unlock_bh(&local->services_lock); + rxrpc_security_put(sec); + _leave(" = -ENOENT"); + return -ENOENT; + +found_service: + if (!rx->securities) { + read_unlock_bh(&local->services_lock); + rxrpc_security_put(sec); + _leave(" = -ENOKEY"); + return -ENOKEY; + } + + /* look through the service's keyring */ + kref = keyring_search(make_key_ref(rx->securities, 1UL), + &key_type_rxrpc_s, kdesc); + if (IS_ERR(kref)) { + read_unlock_bh(&local->services_lock); + rxrpc_security_put(sec); + _leave(" = %ld [search]", PTR_ERR(kref)); + return PTR_ERR(kref); + } + + key = key_ref_to_ptr(kref); + read_unlock_bh(&local->services_lock); + + conn->server_key = key; + conn->security = sec; + + _leave(" = 0"); + return 0; +} + +/* + * secure a packet prior to transmission + */ +int rxrpc_secure_packet(const struct rxrpc_call *call, + struct sk_buff *skb, + size_t data_size, + void *sechdr) +{ + if (call->conn->security) + return call->conn->security->secure_packet( + call, skb, data_size, sechdr); + return 0; +} + +/* + * secure a packet prior to transmission + */ +int rxrpc_verify_packet(const struct rxrpc_call *call, struct sk_buff *skb, + u32 *_abort_code) +{ + if (call->conn->security) + return call->conn->security->verify_packet( + call, skb, _abort_code); + return 0; +} + +/* + * clear connection security + */ +void rxrpc_clear_conn_security(struct rxrpc_connection *conn) +{ + _enter("{%d}", conn->debug_id); + + if (conn->security) { + conn->security->clear(conn); + rxrpc_security_put(conn->security); + conn->security = NULL; + } + + key_put(conn->key); + key_put(conn->server_key); +} diff --git a/net/rxrpc/ar-skbuff.c b/net/rxrpc/ar-skbuff.c new file mode 100644 index 00000000..de755e04 --- /dev/null +++ b/net/rxrpc/ar-skbuff.c @@ -0,0 +1,132 @@ +/* ar-skbuff.c: socket buffer destruction handling + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include "ar-internal.h" + +/* + * set up for the ACK at the end of the receive phase when we discard the final + * receive phase data packet + * - called with softirqs disabled + */ +static void rxrpc_request_final_ACK(struct rxrpc_call *call) +{ + /* the call may be aborted before we have a chance to ACK it */ + write_lock(&call->state_lock); + + switch (call->state) { + case RXRPC_CALL_CLIENT_RECV_REPLY: + call->state = RXRPC_CALL_CLIENT_FINAL_ACK; + _debug("request final ACK"); + + /* get an extra ref on the call for the final-ACK generator to + * release */ + rxrpc_get_call(call); + set_bit(RXRPC_CALL_ACK_FINAL, &call->events); + if (try_to_del_timer_sync(&call->ack_timer) >= 0) + rxrpc_queue_call(call); + break; + + case RXRPC_CALL_SERVER_RECV_REQUEST: + call->state = RXRPC_CALL_SERVER_ACK_REQUEST; + default: + break; + } + + write_unlock(&call->state_lock); +} + +/* + * drop the bottom ACK off of the call ACK window and advance the window + */ +static void rxrpc_hard_ACK_data(struct rxrpc_call *call, + struct rxrpc_skb_priv *sp) +{ + int loop; + u32 seq; + + spin_lock_bh(&call->lock); + + _debug("hard ACK #%u", ntohl(sp->hdr.seq)); + + for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) { + call->ackr_window[loop] >>= 1; + call->ackr_window[loop] |= + call->ackr_window[loop + 1] << (BITS_PER_LONG - 1); + } + + seq = ntohl(sp->hdr.seq); + ASSERTCMP(seq, ==, call->rx_data_eaten + 1); + call->rx_data_eaten = seq; + + if (call->ackr_win_top < UINT_MAX) + call->ackr_win_top++; + + ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE, + call->rx_data_post, >=, call->rx_data_recv); + ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE, + call->rx_data_recv, >=, call->rx_data_eaten); + + if (sp->hdr.flags & RXRPC_LAST_PACKET) { + rxrpc_request_final_ACK(call); + } else if (atomic_dec_and_test(&call->ackr_not_idle) && + test_and_clear_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags)) { + _debug("send Rx idle ACK"); + __rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, sp->hdr.serial, + true); + } + + spin_unlock_bh(&call->lock); +} + +/* + * destroy a packet that has an RxRPC control buffer + * - advance the hard-ACK state of the parent call (done here in case something + * in the kernel bypasses recvmsg() and steals the packet directly off of the + * socket receive queue) + */ +void rxrpc_packet_destructor(struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_call *call = sp->call; + + _enter("%p{%p}", skb, call); + + if (call) { + /* send the final ACK on a client call */ + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) + rxrpc_hard_ACK_data(call, sp); + rxrpc_put_call(call); + sp->call = NULL; + } + + if (skb->sk) + sock_rfree(skb); + _leave(""); +} + +/** + * rxrpc_kernel_free_skb - Free an RxRPC socket buffer + * @skb: The socket buffer to be freed + * + * Let RxRPC free its own socket buffer, permitting it to maintain debug + * accounting. + */ +void rxrpc_kernel_free_skb(struct sk_buff *skb) +{ + rxrpc_free_skb(skb); +} + +EXPORT_SYMBOL(rxrpc_kernel_free_skb); diff --git a/net/rxrpc/ar-transport.c b/net/rxrpc/ar-transport.c new file mode 100644 index 00000000..92df5669 --- /dev/null +++ b/net/rxrpc/ar-transport.c @@ -0,0 +1,279 @@ +/* RxRPC point-to-point transport session management + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +static void rxrpc_transport_reaper(struct work_struct *work); + +static LIST_HEAD(rxrpc_transports); +static DEFINE_RWLOCK(rxrpc_transport_lock); +static unsigned long rxrpc_transport_timeout = 3600 * 24; +static DECLARE_DELAYED_WORK(rxrpc_transport_reap, rxrpc_transport_reaper); + +/* + * allocate a new transport session manager + */ +static struct rxrpc_transport *rxrpc_alloc_transport(struct rxrpc_local *local, + struct rxrpc_peer *peer, + gfp_t gfp) +{ + struct rxrpc_transport *trans; + + _enter(""); + + trans = kzalloc(sizeof(struct rxrpc_transport), gfp); + if (trans) { + trans->local = local; + trans->peer = peer; + INIT_LIST_HEAD(&trans->link); + trans->bundles = RB_ROOT; + trans->client_conns = RB_ROOT; + trans->server_conns = RB_ROOT; + skb_queue_head_init(&trans->error_queue); + spin_lock_init(&trans->client_lock); + rwlock_init(&trans->conn_lock); + atomic_set(&trans->usage, 1); + trans->debug_id = atomic_inc_return(&rxrpc_debug_id); + + if (peer->srx.transport.family == AF_INET) { + switch (peer->srx.transport_type) { + case SOCK_DGRAM: + INIT_WORK(&trans->error_handler, + rxrpc_UDP_error_handler); + break; + default: + BUG(); + break; + } + } else { + BUG(); + } + } + + _leave(" = %p", trans); + return trans; +} + +/* + * obtain a transport session for the nominated endpoints + */ +struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *local, + struct rxrpc_peer *peer, + gfp_t gfp) +{ + struct rxrpc_transport *trans, *candidate; + const char *new = "old"; + int usage; + + _enter("{%pI4+%hu},{%pI4+%hu},", + &local->srx.transport.sin.sin_addr, + ntohs(local->srx.transport.sin.sin_port), + &peer->srx.transport.sin.sin_addr, + ntohs(peer->srx.transport.sin.sin_port)); + + /* search the transport list first */ + read_lock_bh(&rxrpc_transport_lock); + list_for_each_entry(trans, &rxrpc_transports, link) { + if (trans->local == local && trans->peer == peer) + goto found_extant_transport; + } + read_unlock_bh(&rxrpc_transport_lock); + + /* not yet present - create a candidate for a new record and then + * redo the search */ + candidate = rxrpc_alloc_transport(local, peer, gfp); + if (!candidate) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + write_lock_bh(&rxrpc_transport_lock); + + list_for_each_entry(trans, &rxrpc_transports, link) { + if (trans->local == local && trans->peer == peer) + goto found_extant_second; + } + + /* we can now add the new candidate to the list */ + trans = candidate; + candidate = NULL; + usage = atomic_read(&trans->usage); + + rxrpc_get_local(trans->local); + atomic_inc(&trans->peer->usage); + list_add_tail(&trans->link, &rxrpc_transports); + write_unlock_bh(&rxrpc_transport_lock); + new = "new"; + +success: + _net("TRANSPORT %s %d local %d -> peer %d", + new, + trans->debug_id, + trans->local->debug_id, + trans->peer->debug_id); + + _leave(" = %p {u=%d}", trans, usage); + return trans; + + /* we found the transport in the list immediately */ +found_extant_transport: + usage = atomic_inc_return(&trans->usage); + read_unlock_bh(&rxrpc_transport_lock); + goto success; + + /* we found the transport on the second time through the list */ 
+found_extant_second: + usage = atomic_inc_return(&trans->usage); + write_unlock_bh(&rxrpc_transport_lock); + kfree(candidate); + goto success; +} + +/* + * find the transport connecting two endpoints + */ +struct rxrpc_transport *rxrpc_find_transport(struct rxrpc_local *local, + struct rxrpc_peer *peer) +{ + struct rxrpc_transport *trans; + + _enter("{%pI4+%hu},{%pI4+%hu},", + &local->srx.transport.sin.sin_addr, + ntohs(local->srx.transport.sin.sin_port), + &peer->srx.transport.sin.sin_addr, + ntohs(peer->srx.transport.sin.sin_port)); + + /* search the transport list */ + read_lock_bh(&rxrpc_transport_lock); + + list_for_each_entry(trans, &rxrpc_transports, link) { + if (trans->local == local && trans->peer == peer) + goto found_extant_transport; + } + + read_unlock_bh(&rxrpc_transport_lock); + _leave(" = NULL"); + return NULL; + +found_extant_transport: + atomic_inc(&trans->usage); + read_unlock_bh(&rxrpc_transport_lock); + _leave(" = %p", trans); + return trans; +} + +/* + * release a transport session + */ +void rxrpc_put_transport(struct rxrpc_transport *trans) +{ + _enter("%p{u=%d}", trans, atomic_read(&trans->usage)); + + ASSERTCMP(atomic_read(&trans->usage), >, 0); + + trans->put_time = get_seconds(); + if (unlikely(atomic_dec_and_test(&trans->usage))) { + _debug("zombie"); + /* let the reaper determine the timeout to avoid a race with + * overextending the timeout if the reaper is running at the + * same time */ + rxrpc_queue_delayed_work(&rxrpc_transport_reap, 0); + } + _leave(""); +} + +/* + * clean up a transport session + */ +static void rxrpc_cleanup_transport(struct rxrpc_transport *trans) +{ + _net("DESTROY TRANS %d", trans->debug_id); + + rxrpc_purge_queue(&trans->error_queue); + + rxrpc_put_local(trans->local); + rxrpc_put_peer(trans->peer); + kfree(trans); +} + +/* + * reap dead transports that have passed their expiry date + */ +static void rxrpc_transport_reaper(struct work_struct *work) +{ + struct rxrpc_transport *trans, *_p; + unsigned long now, earliest, reap_time; + + LIST_HEAD(graveyard); + + _enter(""); + + now = get_seconds(); + earliest = ULONG_MAX; + + /* extract all the transports that have been dead too long */ + write_lock_bh(&rxrpc_transport_lock); + list_for_each_entry_safe(trans, _p, &rxrpc_transports, link) { + _debug("reap TRANS %d { u=%d t=%ld }", + trans->debug_id, atomic_read(&trans->usage), + (long) now - (long) trans->put_time); + + if (likely(atomic_read(&trans->usage) > 0)) + continue; + + reap_time = trans->put_time + rxrpc_transport_timeout; + if (reap_time <= now) + list_move_tail(&trans->link, &graveyard); + else if (reap_time < earliest) + earliest = reap_time; + } + write_unlock_bh(&rxrpc_transport_lock); + + if (earliest != ULONG_MAX) { + _debug("reschedule reaper %ld", (long) earliest - now); + ASSERTCMP(earliest, >, now); + rxrpc_queue_delayed_work(&rxrpc_transport_reap, + (earliest - now) * HZ); + } + + /* then destroy all those pulled out */ + while (!list_empty(&graveyard)) { + trans = list_entry(graveyard.next, struct rxrpc_transport, + link); + list_del_init(&trans->link); + + ASSERTCMP(atomic_read(&trans->usage), ==, 0); + rxrpc_cleanup_transport(trans); + } + + _leave(""); +} + +/* + * preemptively destroy all the transport session records rather than waiting + * for them to time out + */ +void __exit rxrpc_destroy_all_transports(void) +{ + _enter(""); + + rxrpc_transport_timeout = 0; + cancel_delayed_work(&rxrpc_transport_reap); + rxrpc_queue_delayed_work(&rxrpc_transport_reap, 0); + + _leave(""); +} diff --git 
a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c new file mode 100644 index 00000000..76351077 --- /dev/null +++ b/net/rxrpc/rxkad.c @@ -0,0 +1,1161 @@ +/* Kerberos-based RxRPC security + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define rxrpc_debug rxkad_debug +#include "ar-internal.h" + +#define RXKAD_VERSION 2 +#define MAXKRB5TICKETLEN 1024 +#define RXKAD_TKT_TYPE_KERBEROS_V5 256 +#define ANAME_SZ 40 /* size of authentication name */ +#define INST_SZ 40 /* size of principal's instance */ +#define REALM_SZ 40 /* size of principal's auth domain */ +#define SNAME_SZ 40 /* size of service name */ + +unsigned rxrpc_debug; +module_param_named(debug, rxrpc_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(debug, "rxkad debugging mask"); + +struct rxkad_level1_hdr { + __be32 data_size; /* true data size (excluding padding) */ +}; + +struct rxkad_level2_hdr { + __be32 data_size; /* true data size (excluding padding) */ + __be32 checksum; /* decrypted data checksum */ +}; + +MODULE_DESCRIPTION("RxRPC network protocol type-2 security (Kerberos 4)"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +/* + * this holds a pinned cipher so that keventd doesn't get called by the cipher + * alloc routine, but since we have it to hand, we use it to decrypt RESPONSE + * packets + */ +static struct crypto_blkcipher *rxkad_ci; +static DEFINE_MUTEX(rxkad_ci_mutex); + +/* + * initialise connection security + */ +static int rxkad_init_connection_security(struct rxrpc_connection *conn) +{ + struct crypto_blkcipher *ci; + struct rxrpc_key_token *token; + int ret; + + _enter("{%d},{%x}", conn->debug_id, key_serial(conn->key)); + + token = conn->key->payload.data; + conn->security_ix = token->security_index; + + ci = crypto_alloc_blkcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(ci)) { + _debug("no cipher"); + ret = PTR_ERR(ci); + goto error; + } + + if (crypto_blkcipher_setkey(ci, token->kad->session_key, + sizeof(token->kad->session_key)) < 0) + BUG(); + + switch (conn->security_level) { + case RXRPC_SECURITY_PLAIN: + break; + case RXRPC_SECURITY_AUTH: + conn->size_align = 8; + conn->security_size = sizeof(struct rxkad_level1_hdr); + conn->header_size += sizeof(struct rxkad_level1_hdr); + break; + case RXRPC_SECURITY_ENCRYPT: + conn->size_align = 8; + conn->security_size = sizeof(struct rxkad_level2_hdr); + conn->header_size += sizeof(struct rxkad_level2_hdr); + break; + default: + ret = -EKEYREJECTED; + goto error; + } + + conn->cipher = ci; + ret = 0; +error: + _leave(" = %d", ret); + return ret; +} + +/* + * prime the encryption state with the invariant parts of a connection's + * description + */ +static void rxkad_prime_packet_security(struct rxrpc_connection *conn) +{ + struct rxrpc_key_token *token; + struct blkcipher_desc desc; + struct scatterlist sg[2]; + struct rxrpc_crypt iv; + struct { + __be32 x[4]; + } tmpbuf __attribute__((aligned(16))); /* must all be in same page */ + + _enter(""); + + if (!conn->key) + return; + + token = conn->key->payload.data; + memcpy(&iv, token->kad->session_key, sizeof(iv)); + + desc.tfm = conn->cipher; + desc.info = 
iv.x; + desc.flags = 0; + + tmpbuf.x[0] = conn->epoch; + tmpbuf.x[1] = conn->cid; + tmpbuf.x[2] = 0; + tmpbuf.x[3] = htonl(conn->security_ix); + + sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf)); + sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf)); + crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf)); + + memcpy(&conn->csum_iv, &tmpbuf.x[2], sizeof(conn->csum_iv)); + ASSERTCMP(conn->csum_iv.n[0], ==, tmpbuf.x[2]); + + _leave(""); +} + +/* + * partially encrypt a packet (level 1 security) + */ +static int rxkad_secure_packet_auth(const struct rxrpc_call *call, + struct sk_buff *skb, + u32 data_size, + void *sechdr) +{ + struct rxrpc_skb_priv *sp; + struct blkcipher_desc desc; + struct rxrpc_crypt iv; + struct scatterlist sg[2]; + struct { + struct rxkad_level1_hdr hdr; + __be32 first; /* first four bytes of data and padding */ + } tmpbuf __attribute__((aligned(8))); /* must all be in same page */ + u16 check; + + sp = rxrpc_skb(skb); + + _enter(""); + + check = ntohl(sp->hdr.seq ^ sp->hdr.callNumber); + data_size |= (u32) check << 16; + + tmpbuf.hdr.data_size = htonl(data_size); + memcpy(&tmpbuf.first, sechdr + 4, sizeof(tmpbuf.first)); + + /* start the encryption afresh */ + memset(&iv, 0, sizeof(iv)); + desc.tfm = call->conn->cipher; + desc.info = iv.x; + desc.flags = 0; + + sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf)); + sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf)); + crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf)); + + memcpy(sechdr, &tmpbuf, sizeof(tmpbuf)); + + _leave(" = 0"); + return 0; +} + +/* + * wholly encrypt a packet (level 2 security) + */ +static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, + struct sk_buff *skb, + u32 data_size, + void *sechdr) +{ + const struct rxrpc_key_token *token; + struct rxkad_level2_hdr rxkhdr + __attribute__((aligned(8))); /* must be all on one page */ + struct rxrpc_skb_priv *sp; + struct blkcipher_desc desc; + struct rxrpc_crypt iv; + struct scatterlist sg[16]; + struct sk_buff *trailer; + unsigned len; + u16 check; + int nsg; + + sp = rxrpc_skb(skb); + + _enter(""); + + check = ntohl(sp->hdr.seq ^ sp->hdr.callNumber); + + rxkhdr.data_size = htonl(data_size | (u32) check << 16); + rxkhdr.checksum = 0; + + /* encrypt from the session key */ + token = call->conn->key->payload.data; + memcpy(&iv, token->kad->session_key, sizeof(iv)); + desc.tfm = call->conn->cipher; + desc.info = iv.x; + desc.flags = 0; + + sg_init_one(&sg[0], sechdr, sizeof(rxkhdr)); + sg_init_one(&sg[1], &rxkhdr, sizeof(rxkhdr)); + crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(rxkhdr)); + + /* we want to encrypt the skbuff in-place */ + nsg = skb_cow_data(skb, 0, &trailer); + if (nsg < 0 || nsg > 16) + return -ENOMEM; + + len = data_size + call->conn->size_align - 1; + len &= ~(call->conn->size_align - 1); + + sg_init_table(sg, nsg); + skb_to_sgvec(skb, sg, 0, len); + crypto_blkcipher_encrypt_iv(&desc, sg, sg, len); + + _leave(" = 0"); + return 0; +} + +/* + * checksum an RxRPC packet header + */ +static int rxkad_secure_packet(const struct rxrpc_call *call, + struct sk_buff *skb, + size_t data_size, + void *sechdr) +{ + struct rxrpc_skb_priv *sp; + struct blkcipher_desc desc; + struct rxrpc_crypt iv; + struct scatterlist sg[2]; + struct { + __be32 x[2]; + } tmpbuf __attribute__((aligned(8))); /* must all be in same page */ + __be32 x; + u32 y; + int ret; + + sp = rxrpc_skb(skb); + + _enter("{%d{%x}},{#%u},%zu,", + call->debug_id, key_serial(call->conn->key), ntohl(sp->hdr.seq), + data_size); + + if (!call->conn->cipher) + 
return 0; + + ret = key_validate(call->conn->key); + if (ret < 0) + return ret; + + /* continue encrypting from where we left off */ + memcpy(&iv, call->conn->csum_iv.x, sizeof(iv)); + desc.tfm = call->conn->cipher; + desc.info = iv.x; + desc.flags = 0; + + /* calculate the security checksum */ + x = htonl(call->channel << (32 - RXRPC_CIDSHIFT)); + x |= sp->hdr.seq & cpu_to_be32(0x3fffffff); + tmpbuf.x[0] = sp->hdr.callNumber; + tmpbuf.x[1] = x; + + sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf)); + sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf)); + crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf)); + + y = ntohl(tmpbuf.x[1]); + y = (y >> 16) & 0xffff; + if (y == 0) + y = 1; /* zero checksums are not permitted */ + sp->hdr.cksum = htons(y); + + switch (call->conn->security_level) { + case RXRPC_SECURITY_PLAIN: + ret = 0; + break; + case RXRPC_SECURITY_AUTH: + ret = rxkad_secure_packet_auth(call, skb, data_size, sechdr); + break; + case RXRPC_SECURITY_ENCRYPT: + ret = rxkad_secure_packet_encrypt(call, skb, data_size, + sechdr); + break; + default: + ret = -EPERM; + break; + } + + _leave(" = %d [set %hx]", ret, y); + return ret; +} + +/* + * decrypt partial encryption on a packet (level 1 security) + */ +static int rxkad_verify_packet_auth(const struct rxrpc_call *call, + struct sk_buff *skb, + u32 *_abort_code) +{ + struct rxkad_level1_hdr sechdr; + struct rxrpc_skb_priv *sp; + struct blkcipher_desc desc; + struct rxrpc_crypt iv; + struct scatterlist sg[16]; + struct sk_buff *trailer; + u32 data_size, buf; + u16 check; + int nsg; + + _enter(""); + + sp = rxrpc_skb(skb); + + /* we want to decrypt the skbuff in-place */ + nsg = skb_cow_data(skb, 0, &trailer); + if (nsg < 0 || nsg > 16) + goto nomem; + + sg_init_table(sg, nsg); + skb_to_sgvec(skb, sg, 0, 8); + + /* start the decryption afresh */ + memset(&iv, 0, sizeof(iv)); + desc.tfm = call->conn->cipher; + desc.info = iv.x; + desc.flags = 0; + + crypto_blkcipher_decrypt_iv(&desc, sg, sg, 8); + + /* remove the decrypted packet length */ + if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) + goto datalen_error; + if (!skb_pull(skb, sizeof(sechdr))) + BUG(); + + buf = ntohl(sechdr.data_size); + data_size = buf & 0xffff; + + check = buf >> 16; + check ^= ntohl(sp->hdr.seq ^ sp->hdr.callNumber); + check &= 0xffff; + if (check != 0) { + *_abort_code = RXKADSEALEDINCON; + goto protocol_error; + } + + /* shorten the packet to remove the padding */ + if (data_size > skb->len) + goto datalen_error; + else if (data_size < skb->len) + skb->len = data_size; + + _leave(" = 0 [dlen=%x]", data_size); + return 0; + +datalen_error: + *_abort_code = RXKADDATALEN; +protocol_error: + _leave(" = -EPROTO"); + return -EPROTO; + +nomem: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * wholly decrypt a packet (level 2 security) + */ +static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, + struct sk_buff *skb, + u32 *_abort_code) +{ + const struct rxrpc_key_token *token; + struct rxkad_level2_hdr sechdr; + struct rxrpc_skb_priv *sp; + struct blkcipher_desc desc; + struct rxrpc_crypt iv; + struct scatterlist _sg[4], *sg; + struct sk_buff *trailer; + u32 data_size, buf; + u16 check; + int nsg; + + _enter(",{%d}", skb->len); + + sp = rxrpc_skb(skb); + + /* we want to decrypt the skbuff in-place */ + nsg = skb_cow_data(skb, 0, &trailer); + if (nsg < 0) + goto nomem; + + sg = _sg; + if (unlikely(nsg > 4)) { + sg = kmalloc(sizeof(*sg) * nsg, GFP_NOIO); + if (!sg) + goto nomem; + } + + sg_init_table(sg, nsg); + skb_to_sgvec(skb, sg, 0, 
skb->len); + + /* decrypt from the session key */ + token = call->conn->key->payload.data; + memcpy(&iv, token->kad->session_key, sizeof(iv)); + desc.tfm = call->conn->cipher; + desc.info = iv.x; + desc.flags = 0; + + crypto_blkcipher_decrypt_iv(&desc, sg, sg, skb->len); + if (sg != _sg) + kfree(sg); + + /* remove the decrypted packet length */ + if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) + goto datalen_error; + if (!skb_pull(skb, sizeof(sechdr))) + BUG(); + + buf = ntohl(sechdr.data_size); + data_size = buf & 0xffff; + + check = buf >> 16; + check ^= ntohl(sp->hdr.seq ^ sp->hdr.callNumber); + check &= 0xffff; + if (check != 0) { + *_abort_code = RXKADSEALEDINCON; + goto protocol_error; + } + + /* shorten the packet to remove the padding */ + if (data_size > skb->len) + goto datalen_error; + else if (data_size < skb->len) + skb->len = data_size; + + _leave(" = 0 [dlen=%x]", data_size); + return 0; + +datalen_error: + *_abort_code = RXKADDATALEN; +protocol_error: + _leave(" = -EPROTO"); + return -EPROTO; + +nomem: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * verify the security on a received packet + */ +static int rxkad_verify_packet(const struct rxrpc_call *call, + struct sk_buff *skb, + u32 *_abort_code) +{ + struct blkcipher_desc desc; + struct rxrpc_skb_priv *sp; + struct rxrpc_crypt iv; + struct scatterlist sg[2]; + struct { + __be32 x[2]; + } tmpbuf __attribute__((aligned(8))); /* must all be in same page */ + __be32 x; + __be16 cksum; + u32 y; + int ret; + + sp = rxrpc_skb(skb); + + _enter("{%d{%x}},{#%u}", + call->debug_id, key_serial(call->conn->key), + ntohl(sp->hdr.seq)); + + if (!call->conn->cipher) + return 0; + + if (sp->hdr.securityIndex != RXRPC_SECURITY_RXKAD) { + *_abort_code = RXKADINCONSISTENCY; + _leave(" = -EPROTO [not rxkad]"); + return -EPROTO; + } + + /* continue encrypting from where we left off */ + memcpy(&iv, call->conn->csum_iv.x, sizeof(iv)); + desc.tfm = call->conn->cipher; + desc.info = iv.x; + desc.flags = 0; + + /* validate the security checksum */ + x = htonl(call->channel << (32 - RXRPC_CIDSHIFT)); + x |= sp->hdr.seq & cpu_to_be32(0x3fffffff); + tmpbuf.x[0] = call->call_id; + tmpbuf.x[1] = x; + + sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf)); + sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf)); + crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf)); + + y = ntohl(tmpbuf.x[1]); + y = (y >> 16) & 0xffff; + if (y == 0) + y = 1; /* zero checksums are not permitted */ + + cksum = htons(y); + if (sp->hdr.cksum != cksum) { + *_abort_code = RXKADSEALEDINCON; + _leave(" = -EPROTO [csum failed]"); + return -EPROTO; + } + + switch (call->conn->security_level) { + case RXRPC_SECURITY_PLAIN: + ret = 0; + break; + case RXRPC_SECURITY_AUTH: + ret = rxkad_verify_packet_auth(call, skb, _abort_code); + break; + case RXRPC_SECURITY_ENCRYPT: + ret = rxkad_verify_packet_encrypt(call, skb, _abort_code); + break; + default: + ret = -ENOANO; + break; + } + + _leave(" = %d", ret); + return ret; +} + +/* + * issue a challenge + */ +static int rxkad_issue_challenge(struct rxrpc_connection *conn) +{ + struct rxkad_challenge challenge; + struct rxrpc_header hdr; + struct msghdr msg; + struct kvec iov[2]; + size_t len; + int ret; + + _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); + + ret = key_validate(conn->key); + if (ret < 0) + return ret; + + get_random_bytes(&conn->security_nonce, sizeof(conn->security_nonce)); + + challenge.version = htonl(2); + challenge.nonce = htonl(conn->security_nonce); + challenge.min_level = htonl(0); + 
challenge.__padding = 0; + + msg.msg_name = &conn->trans->peer->srx.transport.sin; + msg.msg_namelen = sizeof(conn->trans->peer->srx.transport.sin); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + hdr.epoch = conn->epoch; + hdr.cid = conn->cid; + hdr.callNumber = 0; + hdr.seq = 0; + hdr.type = RXRPC_PACKET_TYPE_CHALLENGE; + hdr.flags = conn->out_clientflag; + hdr.userStatus = 0; + hdr.securityIndex = conn->security_ix; + hdr._rsvd = 0; + hdr.serviceId = conn->service_id; + + iov[0].iov_base = &hdr; + iov[0].iov_len = sizeof(hdr); + iov[1].iov_base = &challenge; + iov[1].iov_len = sizeof(challenge); + + len = iov[0].iov_len + iov[1].iov_len; + + hdr.serial = htonl(atomic_inc_return(&conn->serial)); + _proto("Tx CHALLENGE %%%u", ntohl(hdr.serial)); + + ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len); + if (ret < 0) { + _debug("sendmsg failed: %d", ret); + return -EAGAIN; + } + + _leave(" = 0"); + return 0; +} + +/* + * send a Kerberos security response + */ +static int rxkad_send_response(struct rxrpc_connection *conn, + struct rxrpc_header *hdr, + struct rxkad_response *resp, + const struct rxkad_key *s2) +{ + struct msghdr msg; + struct kvec iov[3]; + size_t len; + int ret; + + _enter(""); + + msg.msg_name = &conn->trans->peer->srx.transport.sin; + msg.msg_namelen = sizeof(conn->trans->peer->srx.transport.sin); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + hdr->epoch = conn->epoch; + hdr->seq = 0; + hdr->type = RXRPC_PACKET_TYPE_RESPONSE; + hdr->flags = conn->out_clientflag; + hdr->userStatus = 0; + hdr->_rsvd = 0; + + iov[0].iov_base = hdr; + iov[0].iov_len = sizeof(*hdr); + iov[1].iov_base = resp; + iov[1].iov_len = sizeof(*resp); + iov[2].iov_base = (void *) s2->ticket; + iov[2].iov_len = s2->ticket_len; + + len = iov[0].iov_len + iov[1].iov_len + iov[2].iov_len; + + hdr->serial = htonl(atomic_inc_return(&conn->serial)); + _proto("Tx RESPONSE %%%u", ntohl(hdr->serial)); + + ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 3, len); + if (ret < 0) { + _debug("sendmsg failed: %d", ret); + return -EAGAIN; + } + + _leave(" = 0"); + return 0; +} + +/* + * calculate the response checksum + */ +static void rxkad_calc_response_checksum(struct rxkad_response *response) +{ + u32 csum = 1000003; + int loop; + u8 *p = (u8 *) response; + + for (loop = sizeof(*response); loop > 0; loop--) + csum = csum * 0x10204081 + *p++; + + response->encrypted.checksum = htonl(csum); +} + +/* + * load a scatterlist with a potentially split-page buffer + */ +static void rxkad_sg_set_buf2(struct scatterlist sg[2], + void *buf, size_t buflen) +{ + int nsg = 1; + + sg_init_table(sg, 2); + + sg_set_buf(&sg[0], buf, buflen); + if (sg[0].offset + buflen > PAGE_SIZE) { + /* the buffer was split over two pages */ + sg[0].length = PAGE_SIZE - sg[0].offset; + sg_set_buf(&sg[1], buf + sg[0].length, buflen - sg[0].length); + nsg++; + } + + sg_mark_end(&sg[nsg - 1]); + + ASSERTCMP(sg[0].length + sg[1].length, ==, buflen); +} + +/* + * encrypt the response packet + */ +static void rxkad_encrypt_response(struct rxrpc_connection *conn, + struct rxkad_response *resp, + const struct rxkad_key *s2) +{ + struct blkcipher_desc desc; + struct rxrpc_crypt iv; + struct scatterlist sg[2]; + + /* continue encrypting from where we left off */ + memcpy(&iv, s2->session_key, sizeof(iv)); + desc.tfm = conn->cipher; + desc.info = iv.x; + desc.flags = 0; + + rxkad_sg_set_buf2(sg, &resp->encrypted, sizeof(resp->encrypted)); + 
crypto_blkcipher_encrypt_iv(&desc, sg, sg, sizeof(resp->encrypted)); +} + +/* + * respond to a challenge packet + */ +static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb, + u32 *_abort_code) +{ + const struct rxrpc_key_token *token; + struct rxkad_challenge challenge; + struct rxkad_response resp + __attribute__((aligned(8))); /* must be aligned for crypto */ + struct rxrpc_skb_priv *sp; + u32 version, nonce, min_level, abort_code; + int ret; + + _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); + + if (!conn->key) { + _leave(" = -EPROTO [no key]"); + return -EPROTO; + } + + ret = key_validate(conn->key); + if (ret < 0) { + *_abort_code = RXKADEXPIRED; + return ret; + } + + abort_code = RXKADPACKETSHORT; + sp = rxrpc_skb(skb); + if (skb_copy_bits(skb, 0, &challenge, sizeof(challenge)) < 0) + goto protocol_error; + + version = ntohl(challenge.version); + nonce = ntohl(challenge.nonce); + min_level = ntohl(challenge.min_level); + + _proto("Rx CHALLENGE %%%u { v=%u n=%u ml=%u }", + ntohl(sp->hdr.serial), version, nonce, min_level); + + abort_code = RXKADINCONSISTENCY; + if (version != RXKAD_VERSION) + goto protocol_error; + + abort_code = RXKADLEVELFAIL; + if (conn->security_level < min_level) + goto protocol_error; + + token = conn->key->payload.data; + + /* build the response packet */ + memset(&resp, 0, sizeof(resp)); + + resp.version = RXKAD_VERSION; + resp.encrypted.epoch = conn->epoch; + resp.encrypted.cid = conn->cid; + resp.encrypted.securityIndex = htonl(conn->security_ix); + resp.encrypted.call_id[0] = + (conn->channels[0] ? conn->channels[0]->call_id : 0); + resp.encrypted.call_id[1] = + (conn->channels[1] ? conn->channels[1]->call_id : 0); + resp.encrypted.call_id[2] = + (conn->channels[2] ? conn->channels[2]->call_id : 0); + resp.encrypted.call_id[3] = + (conn->channels[3] ? 
conn->channels[3]->call_id : 0); + resp.encrypted.inc_nonce = htonl(nonce + 1); + resp.encrypted.level = htonl(conn->security_level); + resp.kvno = htonl(token->kad->kvno); + resp.ticket_len = htonl(token->kad->ticket_len); + + /* calculate the response checksum and then do the encryption */ + rxkad_calc_response_checksum(&resp); + rxkad_encrypt_response(conn, &resp, token->kad); + return rxkad_send_response(conn, &sp->hdr, &resp, token->kad); + +protocol_error: + *_abort_code = abort_code; + _leave(" = -EPROTO [%d]", abort_code); + return -EPROTO; +} + +/* + * decrypt the kerberos IV ticket in the response + */ +static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, + void *ticket, size_t ticket_len, + struct rxrpc_crypt *_session_key, + time_t *_expiry, + u32 *_abort_code) +{ + struct blkcipher_desc desc; + struct rxrpc_crypt iv, key; + struct scatterlist sg[1]; + struct in_addr addr; + unsigned life; + time_t issue, now; + bool little_endian; + int ret; + u8 *p, *q, *name, *end; + + _enter("{%d},{%x}", conn->debug_id, key_serial(conn->server_key)); + + *_expiry = 0; + + ret = key_validate(conn->server_key); + if (ret < 0) { + switch (ret) { + case -EKEYEXPIRED: + *_abort_code = RXKADEXPIRED; + goto error; + default: + *_abort_code = RXKADNOAUTH; + goto error; + } + } + + ASSERT(conn->server_key->payload.data != NULL); + ASSERTCMP((unsigned long) ticket & 7UL, ==, 0); + + memcpy(&iv, &conn->server_key->type_data, sizeof(iv)); + + desc.tfm = conn->server_key->payload.data; + desc.info = iv.x; + desc.flags = 0; + + sg_init_one(&sg[0], ticket, ticket_len); + crypto_blkcipher_decrypt_iv(&desc, sg, sg, ticket_len); + + p = ticket; + end = p + ticket_len; + +#define Z(size) \ + ({ \ + u8 *__str = p; \ + q = memchr(p, 0, end - p); \ + if (!q || q - p > (size)) \ + goto bad_ticket; \ + for (; p < q; p++) \ + if (!isprint(*p)) \ + goto bad_ticket; \ + p++; \ + __str; \ + }) + + /* extract the ticket flags */ + _debug("KIV FLAGS: %x", *p); + little_endian = *p & 1; + p++; + + /* extract the authentication name */ + name = Z(ANAME_SZ); + _debug("KIV ANAME: %s", name); + + /* extract the principal's instance */ + name = Z(INST_SZ); + _debug("KIV INST : %s", name); + + /* extract the principal's authentication domain */ + name = Z(REALM_SZ); + _debug("KIV REALM: %s", name); + + if (end - p < 4 + 8 + 4 + 2) + goto bad_ticket; + + /* get the IPv4 address of the entity that requested the ticket */ + memcpy(&addr, p, sizeof(addr)); + p += 4; + _debug("KIV ADDR : %pI4", &addr); + + /* get the session key from the ticket */ + memcpy(&key, p, sizeof(key)); + p += 8; + _debug("KIV KEY : %08x %08x", ntohl(key.n[0]), ntohl(key.n[1])); + memcpy(_session_key, &key, sizeof(key)); + + /* get the ticket's lifetime */ + life = *p++ * 5 * 60; + _debug("KIV LIFE : %u", life); + + /* get the issue time of the ticket */ + if (little_endian) { + __le32 stamp; + memcpy(&stamp, p, 4); + issue = le32_to_cpu(stamp); + } else { + __be32 stamp; + memcpy(&stamp, p, 4); + issue = be32_to_cpu(stamp); + } + p += 4; + now = get_seconds(); + _debug("KIV ISSUE: %lx [%lx]", issue, now); + + /* check the ticket is in date */ + if (issue > now) { + *_abort_code = RXKADNOAUTH; + ret = -EKEYREJECTED; + goto error; + } + + if (issue < now - life) { + *_abort_code = RXKADEXPIRED; + ret = -EKEYEXPIRED; + goto error; + } + + *_expiry = issue + life; + + /* get the service name */ + name = Z(SNAME_SZ); + _debug("KIV SNAME: %s", name); + + /* get the service instance name */ + name = Z(INST_SZ); + _debug("KIV SINST: %s", name); + + ret 
= 0; +error: + _leave(" = %d", ret); + return ret; + +bad_ticket: + *_abort_code = RXKADBADTICKET; + ret = -EBADMSG; + goto error; +} + +/* + * decrypt the response packet + */ +static void rxkad_decrypt_response(struct rxrpc_connection *conn, + struct rxkad_response *resp, + const struct rxrpc_crypt *session_key) +{ + struct blkcipher_desc desc; + struct scatterlist sg[2]; + struct rxrpc_crypt iv; + + _enter(",,%08x%08x", + ntohl(session_key->n[0]), ntohl(session_key->n[1])); + + ASSERT(rxkad_ci != NULL); + + mutex_lock(&rxkad_ci_mutex); + if (crypto_blkcipher_setkey(rxkad_ci, session_key->x, + sizeof(*session_key)) < 0) + BUG(); + + memcpy(&iv, session_key, sizeof(iv)); + desc.tfm = rxkad_ci; + desc.info = iv.x; + desc.flags = 0; + + rxkad_sg_set_buf2(sg, &resp->encrypted, sizeof(resp->encrypted)); + crypto_blkcipher_decrypt_iv(&desc, sg, sg, sizeof(resp->encrypted)); + mutex_unlock(&rxkad_ci_mutex); + + _leave(""); +} + +/* + * verify a response + */ +static int rxkad_verify_response(struct rxrpc_connection *conn, + struct sk_buff *skb, + u32 *_abort_code) +{ + struct rxkad_response response + __attribute__((aligned(8))); /* must be aligned for crypto */ + struct rxrpc_skb_priv *sp; + struct rxrpc_crypt session_key; + time_t expiry; + void *ticket; + u32 abort_code, version, kvno, ticket_len, level; + __be32 csum; + int ret; + + _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key)); + + abort_code = RXKADPACKETSHORT; + if (skb_copy_bits(skb, 0, &response, sizeof(response)) < 0) + goto protocol_error; + if (!pskb_pull(skb, sizeof(response))) + BUG(); + + version = ntohl(response.version); + ticket_len = ntohl(response.ticket_len); + kvno = ntohl(response.kvno); + sp = rxrpc_skb(skb); + _proto("Rx RESPONSE %%%u { v=%u kv=%u tl=%u }", + ntohl(sp->hdr.serial), version, kvno, ticket_len); + + abort_code = RXKADINCONSISTENCY; + if (version != RXKAD_VERSION) + goto protocol_error; + + abort_code = RXKADTICKETLEN; + if (ticket_len < 4 || ticket_len > MAXKRB5TICKETLEN) + goto protocol_error; + + abort_code = RXKADUNKNOWNKEY; + if (kvno >= RXKAD_TKT_TYPE_KERBEROS_V5) + goto protocol_error; + + /* extract the kerberos ticket and decrypt and decode it */ + ticket = kmalloc(ticket_len, GFP_NOFS); + if (!ticket) + return -ENOMEM; + + abort_code = RXKADPACKETSHORT; + if (skb_copy_bits(skb, 0, ticket, ticket_len) < 0) + goto protocol_error_free; + + ret = rxkad_decrypt_ticket(conn, ticket, ticket_len, &session_key, + &expiry, &abort_code); + if (ret < 0) { + *_abort_code = abort_code; + kfree(ticket); + return ret; + } + + /* use the session key from inside the ticket to decrypt the + * response */ + rxkad_decrypt_response(conn, &response, &session_key); + + abort_code = RXKADSEALEDINCON; + if (response.encrypted.epoch != conn->epoch) + goto protocol_error_free; + if (response.encrypted.cid != conn->cid) + goto protocol_error_free; + if (ntohl(response.encrypted.securityIndex) != conn->security_ix) + goto protocol_error_free; + csum = response.encrypted.checksum; + response.encrypted.checksum = 0; + rxkad_calc_response_checksum(&response); + if (response.encrypted.checksum != csum) + goto protocol_error_free; + + if (ntohl(response.encrypted.call_id[0]) > INT_MAX || + ntohl(response.encrypted.call_id[1]) > INT_MAX || + ntohl(response.encrypted.call_id[2]) > INT_MAX || + ntohl(response.encrypted.call_id[3]) > INT_MAX) + goto protocol_error_free; + + abort_code = RXKADOUTOFSEQUENCE; + if (response.encrypted.inc_nonce != htonl(conn->security_nonce + 1)) + goto protocol_error_free; + + abort_code 
= RXKADLEVELFAIL; + level = ntohl(response.encrypted.level); + if (level > RXRPC_SECURITY_ENCRYPT) + goto protocol_error_free; + conn->security_level = level; + + /* create a key to hold the security data and expiration time - after + * this the connection security can be handled in exactly the same way + * as for a client connection */ + ret = rxrpc_get_server_data_key(conn, &session_key, expiry, kvno); + if (ret < 0) { + kfree(ticket); + return ret; + } + + kfree(ticket); + _leave(" = 0"); + return 0; + +protocol_error_free: + kfree(ticket); +protocol_error: + *_abort_code = abort_code; + _leave(" = -EPROTO [%d]", abort_code); + return -EPROTO; +} + +/* + * clear the connection security + */ +static void rxkad_clear(struct rxrpc_connection *conn) +{ + _enter(""); + + if (conn->cipher) + crypto_free_blkcipher(conn->cipher); +} + +/* + * RxRPC Kerberos-based security + */ +static struct rxrpc_security rxkad = { + .owner = THIS_MODULE, + .name = "rxkad", + .security_index = RXRPC_SECURITY_RXKAD, + .init_connection_security = rxkad_init_connection_security, + .prime_packet_security = rxkad_prime_packet_security, + .secure_packet = rxkad_secure_packet, + .verify_packet = rxkad_verify_packet, + .issue_challenge = rxkad_issue_challenge, + .respond_to_challenge = rxkad_respond_to_challenge, + .verify_response = rxkad_verify_response, + .clear = rxkad_clear, +}; + +static __init int rxkad_init(void) +{ + _enter(""); + + /* pin the cipher we need so that the crypto layer doesn't invoke + * keventd to go get it */ + rxkad_ci = crypto_alloc_blkcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(rxkad_ci)) + return PTR_ERR(rxkad_ci); + + return rxrpc_register_security(&rxkad); +} + +module_init(rxkad_init); + +static __exit void rxkad_exit(void) +{ + _enter(""); + + rxrpc_unregister_security(&rxkad); + crypto_free_blkcipher(rxkad_ci); +} + +module_exit(rxkad_exit); diff --git a/net/sched/Kconfig b/net/sched/Kconfig new file mode 100644 index 00000000..2590e91b --- /dev/null +++ b/net/sched/Kconfig @@ -0,0 +1,585 @@ +# +# Traffic control configuration. +# + +menuconfig NET_SCHED + bool "QoS and/or fair queueing" + select NET_SCH_FIFO + ---help--- + When the kernel has several packets to send out over a network + device, it has to decide which ones to send first, which ones to + delay, and which ones to drop. This is the job of the queueing + disciplines, several different algorithms for how to do this + "fairly" have been proposed. + + If you say N here, you will get the standard packet scheduler, which + is a FIFO (first come, first served). If you say Y here, you will be + able to choose from among several alternative algorithms which can + then be attached to different network devices. This is useful for + example if some of your network devices are real time devices that + need a certain minimum data flow rate, or if you need to limit the + maximum data flow rate for traffic which matches specified criteria. + This code is considered to be experimental. + + To administer these schedulers, you'll need the user-level utilities + from the package iproute2+tc at . + That package also contains some documentation; for more, check out + . + + This Quality of Service (QoS) support will enable you to use + Differentiated Services (diffserv) and Resource Reservation Protocol + (RSVP) on your Linux router if you also say Y to the corresponding + classifiers below. Documentation and software is at + . 
+ + If you say Y here and to "/proc file system" below, you will be able + to read status information about packet schedulers from the file + /proc/net/psched. + + The available schedulers are listed in the following questions; you + can say Y to as many as you like. If unsure, say N now. + +if NET_SCHED + +comment "Queueing/Scheduling" + +config NET_SCH_CBQ + tristate "Class Based Queueing (CBQ)" + ---help--- + Say Y here if you want to use the Class-Based Queueing (CBQ) packet + scheduling algorithm. This algorithm classifies the waiting packets + into a tree-like hierarchy of classes; the leaves of this tree are + in turn scheduled by separate algorithms. + + See the top of for more details. + + CBQ is a commonly used scheduler, so if you're unsure, you should + say Y here. Then say Y to all the queueing algorithms below that you + want to use as leaf disciplines. + + To compile this code as a module, choose M here: the + module will be called sch_cbq. + +config NET_SCH_HTB + tristate "Hierarchical Token Bucket (HTB)" + ---help--- + Say Y here if you want to use the Hierarchical Token Buckets (HTB) + packet scheduling algorithm. See + for complete manual and + in-depth articles. + + HTB is very similar to CBQ regarding its goals, however it has + different properties and a different algorithm. + + To compile this code as a module, choose M here: the + module will be called sch_htb. + +config NET_SCH_HFSC + tristate "Hierarchical Fair Service Curve (HFSC)" + ---help--- + Say Y here if you want to use the Hierarchical Fair Service Curve + (HFSC) packet scheduling algorithm. + + To compile this code as a module, choose M here: the + module will be called sch_hfsc. + +config NET_SCH_ATM + tristate "ATM Virtual Circuits (ATM)" + depends on ATM + ---help--- + Say Y here if you want to use the ATM pseudo-scheduler. This + provides a framework for invoking classifiers, which in turn + select classes of this queuing discipline. Each class maps + the flow(s) it is handling to a given virtual circuit. + + See the top of for more details. + + To compile this code as a module, choose M here: the + module will be called sch_atm. + +config NET_SCH_PRIO + tristate "Multi Band Priority Queueing (PRIO)" + ---help--- + Say Y here if you want to use an n-band priority queue packet + scheduler. + + To compile this code as a module, choose M here: the + module will be called sch_prio. + +config NET_SCH_MULTIQ + tristate "Hardware Multiqueue-aware Multi Band Queuing (MULTIQ)" + ---help--- + Say Y here if you want to use an n-band queue packet scheduler + to support devices that have multiple hardware transmit queues. + + To compile this code as a module, choose M here: the + module will be called sch_multiq. + +config NET_SCH_RED + tristate "Random Early Detection (RED)" + ---help--- + Say Y here if you want to use the Random Early Detection (RED) + packet scheduling algorithm. + + See the top of for more details. + + To compile this code as a module, choose M here: the + module will be called sch_red. + +config NET_SCH_SFB + tristate "Stochastic Fair Blue (SFB)" + ---help--- + Say Y here if you want to use the Stochastic Fair Blue (SFB) + packet scheduling algorithm. + + See the top of for more details. + + To compile this code as a module, choose M here: the + module will be called sch_sfb. + +config NET_SCH_SFQ + tristate "Stochastic Fairness Queueing (SFQ)" + ---help--- + Say Y here if you want to use the Stochastic Fairness Queueing (SFQ) + packet scheduling algorithm. + + See the top of for more details.
+ + To compile this code as a module, choose M here: the + module will be called sch_sfq. + +config NET_SCH_TEQL + tristate "True Link Equalizer (TEQL)" + ---help--- + Say Y here if you want to use the True Link Equalizer (TEQL) packet + scheduling algorithm. This queueing discipline allows the combination + of several physical devices into one virtual device. + + See the top of for more details. + + To compile this code as a module, choose M here: the + module will be called sch_teql. + +config NET_SCH_TBF + tristate "Token Bucket Filter (TBF)" + ---help--- + Say Y here if you want to use the Token Bucket Filter (TBF) packet + scheduling algorithm. + + See the top of for more details. + + To compile this code as a module, choose M here: the + module will be called sch_tbf. + +config NET_SCH_GRED + tristate "Generic Random Early Detection (GRED)" + ---help--- + Say Y here if you want to use the Generic Random Early Detection + (GRED) packet scheduling algorithm for some of your network devices + (see the top of for details and + references about the algorithm). + + To compile this code as a module, choose M here: the + module will be called sch_gred. + +config NET_SCH_DSMARK + tristate "Differentiated Services marker (DSMARK)" + ---help--- + Say Y if you want to schedule packets according to the + Differentiated Services architecture proposed in RFC 2475. + Technical information on this method, with pointers to associated + RFCs, is available at . + + To compile this code as a module, choose M here: the + module will be called sch_dsmark. + +config NET_SCH_NETEM + tristate "Network emulator (NETEM)" + ---help--- + Say Y if you want to emulate network delay, loss, and packet + re-ordering. This is often useful to simulate networks when + testing applications or protocols. + + To compile this driver as a module, choose M here: the module + will be called sch_netem. + + If unsure, say N. + +config NET_SCH_DRR + tristate "Deficit Round Robin scheduler (DRR)" + help + Say Y here if you want to use the Deficit Round Robin (DRR) packet + scheduling algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_drr. + + If unsure, say N. + +config NET_SCH_MQPRIO + tristate "Multi-queue priority scheduler (MQPRIO)" + help + Say Y here if you want to use the Multi-queue Priority scheduler. + This scheduler allows QOS to be offloaded on NICs that have support + for offloading QOS schedulers. + + To compile this driver as a module, choose M here: the module will + be called sch_mqprio. + + If unsure, say N. + +config NET_SCH_CHOKE + tristate "CHOose and Keep responsive flow scheduler (CHOKE)" + help + Say Y here if you want to use the CHOKe packet scheduler (CHOose + and Keep for responsive flows, CHOose and Kill for unresponsive + flows). This is a variation of RED which tries to penalize flows + that monopolize the queue. + + To compile this code as a module, choose M here: the + module will be called sch_choke. + +config NET_SCH_QFQ + tristate "Quick Fair Queueing scheduler (QFQ)" + help + Say Y here if you want to use the Quick Fair Queueing Scheduler (QFQ) + packet scheduling algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_qfq. + + If unsure, say N. + +config NET_SCH_INGRESS + tristate "Ingress Qdisc" + depends on NET_CLS_ACT + ---help--- + Say Y here if you want to use classifiers for incoming packets. + If unsure, say Y. + + To compile this code as a module, choose M here: the + module will be called sch_ingress.
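[Editor's note] The sch_* modules named in the help texts above all plug into the scheduler core the same way: each one fills in a struct Qdisc_ops and registers it. The fragment below is a minimal sketch of that pattern only, not part of this patch; the "example" name is made up, the FIFO-style helpers are the ones the simple FIFO disciplines use, and a real qdisc would also enforce a queue limit and provide ->init/->reset/->dump.

/* Hypothetical sch_example.c: illustrates how a sch_* module registers a
 * queueing discipline.  Loosely mirrors what the built-in FIFO qdiscs do. */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>

static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	/* append to the qdisc's built-in queue; a real qdisc would check a
	 * length limit here and drop (reshape) when it is exceeded */
	return qdisc_enqueue_tail(skb, sch);
}

static struct sk_buff *example_dequeue(struct Qdisc *sch)
{
	/* hand the oldest queued packet back to the device */
	return qdisc_dequeue_head(sch);
}

static struct Qdisc_ops example_qdisc_ops __read_mostly = {
	.id		= "example",
	.enqueue	= example_enqueue,
	.dequeue	= example_dequeue,
	.peek		= qdisc_peek_head,
	.owner		= THIS_MODULE,
};

static int __init example_init(void)
{
	return register_qdisc(&example_qdisc_ops);
}

static void __exit example_exit(void)
{
	unregister_qdisc(&example_qdisc_ops);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

Once such a module is loaded, the ".id" string is what userspace tc refers to when attaching the discipline to a device.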
+ +comment "Classification" + +config NET_CLS + boolean + +config NET_CLS_BASIC + tristate "Elementary classification (BASIC)" + select NET_CLS + ---help--- + Say Y here if you want to be able to classify packets using + only extended matches and actions. + + To compile this code as a module, choose M here: the + module will be called cls_basic. + +config NET_CLS_TCINDEX + tristate "Traffic-Control Index (TCINDEX)" + select NET_CLS + ---help--- + Say Y here if you want to be able to classify packets based on + traffic control indices. You will want this feature if you want + to implement Differentiated Services together with DSMARK. + + To compile this code as a module, choose M here: the + module will be called cls_tcindex. + +config NET_CLS_ROUTE4 + tristate "Routing decision (ROUTE)" + depends on INET + select IP_ROUTE_CLASSID + select NET_CLS + ---help--- + If you say Y here, you will be able to classify packets + according to the route table entry they matched. + + To compile this code as a module, choose M here: the + module will be called cls_route. + +config NET_CLS_FW + tristate "Netfilter mark (FW)" + select NET_CLS + ---help--- + If you say Y here, you will be able to classify packets + according to netfilter/firewall marks. + + To compile this code as a module, choose M here: the + module will be called cls_fw. + +config NET_CLS_U32 + tristate "Universal 32bit comparisons w/ hashing (U32)" + select NET_CLS + ---help--- + Say Y here to be able to classify packets using a universal + 32bit pieces based comparison scheme. + + To compile this code as a module, choose M here: the + module will be called cls_u32. + +config CLS_U32_PERF + bool "Performance counters support" + depends on NET_CLS_U32 + ---help--- + Say Y here to make u32 gather additional statistics useful for + fine tuning u32 classifiers. + +config CLS_U32_MARK + bool "Netfilter marks support" + depends on NET_CLS_U32 + ---help--- + Say Y here to be able to use netfilter marks as u32 key. + +config NET_CLS_RSVP + tristate "IPv4 Resource Reservation Protocol (RSVP)" + select NET_CLS + ---help--- + The Resource Reservation Protocol (RSVP) permits end systems to + request a minimum and maximum data flow rate for a connection; this + is important for real time data such as streaming sound or video. + + Say Y here if you want to be able to classify outgoing packets based + on their RSVP requests. + + To compile this code as a module, choose M here: the + module will be called cls_rsvp. + +config NET_CLS_RSVP6 + tristate "IPv6 Resource Reservation Protocol (RSVP6)" + select NET_CLS + ---help--- + The Resource Reservation Protocol (RSVP) permits end systems to + request a minimum and maximum data flow rate for a connection; this + is important for real time data such as streaming sound or video. + + Say Y here if you want to be able to classify outgoing packets based + on their RSVP requests and you are using the IPv6 protocol. + + To compile this code as a module, choose M here: the + module will be called cls_rsvp6. + +config NET_CLS_FLOW + tristate "Flow classifier" + select NET_CLS + ---help--- + If you say Y here, you will be able to classify packets based on + a configurable combination of packet keys. This is mostly useful + in combination with SFQ. + + To compile this code as a module, choose M here: the + module will be called cls_flow. 
+ +config NET_CLS_CGROUP + tristate "Control Group Classifier" + select NET_CLS + depends on CGROUPS + ---help--- + Say Y here if you want to classify packets based on the control + cgroup of their process. + + To compile this code as a module, choose M here: the + module will be called cls_cgroup. + +config NET_EMATCH + bool "Extended Matches" + select NET_CLS + ---help--- + Say Y here if you want to use extended matches on top of classifiers + and select the extended matches below. + + Extended matches are small classification helpers not worth writing + a separate classifier for. + + A recent version of the iproute2 package is required to use + extended matches. + +config NET_EMATCH_STACK + int "Stack size" + depends on NET_EMATCH + default "32" + ---help--- + Size of the local stack variable used while evaluating the tree of + ematches. Limits the depth of the tree, i.e. the number of + encapsulated precedences. Every level requires 4 bytes of additional + stack space. + +config NET_EMATCH_CMP + tristate "Simple packet data comparison" + depends on NET_EMATCH + ---help--- + Say Y here if you want to be able to classify packets based on + simple packet data comparisons for 8, 16, and 32bit values. + + To compile this code as a module, choose M here: the + module will be called em_cmp. + +config NET_EMATCH_NBYTE + tristate "Multi byte comparison" + depends on NET_EMATCH + ---help--- + Say Y here if you want to be able to classify packets based on + multiple byte comparisons mainly useful for IPv6 address comparisons. + + To compile this code as a module, choose M here: the + module will be called em_nbyte. + +config NET_EMATCH_U32 + tristate "U32 key" + depends on NET_EMATCH + ---help--- + Say Y here if you want to be able to classify packets using + the famous u32 key in combination with logic relations. + + To compile this code as a module, choose M here: the + module will be called em_u32. + +config NET_EMATCH_META + tristate "Metadata" + depends on NET_EMATCH + ---help--- + Say Y here if you want to be able to classify packets based on + metadata such as load average, netfilter attributes, socket + attributes and routing decisions. + + To compile this code as a module, choose M here: the + module will be called em_meta. + +config NET_EMATCH_TEXT + tristate "Textsearch" + depends on NET_EMATCH + select TEXTSEARCH + select TEXTSEARCH_KMP + select TEXTSEARCH_BM + select TEXTSEARCH_FSM + ---help--- + Say Y here if you want to be able to classify packets based on + textsearch comparisons. + + To compile this code as a module, choose M here: the + module will be called em_text. + +config NET_CLS_ACT + bool "Actions" + ---help--- + Say Y here if you want to use traffic control actions. Actions + get attached to classifiers and are invoked after a successful + classification. They are used to overwrite the classification + result, instantly drop or redirect packets, etc. + + A recent version of the iproute2 package is required to use + extended matches. + +config NET_ACT_POLICE + tristate "Traffic Policing" + depends on NET_CLS_ACT + ---help--- + Say Y here if you want to do traffic policing, i.e. strict + bandwidth limiting. This action replaces the existing policing + module. + + To compile this code as a module, choose M here: the + module will be called act_police. + +config NET_ACT_GACT + tristate "Generic actions" + depends on NET_CLS_ACT + ---help--- + Say Y here to take generic actions such as dropping and + accepting packets. 
+ + To compile this code as a module, choose M here: the + module will be called act_gact. + +config GACT_PROB + bool "Probability support" + depends on NET_ACT_GACT + ---help--- + Say Y here to use the generic action randomly or deterministically. + +config NET_ACT_MIRRED + tristate "Redirecting and Mirroring" + depends on NET_CLS_ACT + ---help--- + Say Y here to allow packets to be mirrored or redirected to + other devices. + + To compile this code as a module, choose M here: the + module will be called act_mirred. + +config NET_ACT_IPT + tristate "IPtables targets" + depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES + ---help--- + Say Y here to be able to invoke iptables targets after successful + classification. + + To compile this code as a module, choose M here: the + module will be called act_ipt. + +config NET_ACT_NAT + tristate "Stateless NAT" + depends on NET_CLS_ACT + ---help--- + Say Y here to do stateless NAT on IPv4 packets. You should use + netfilter for NAT unless you know what you are doing. + + To compile this code as a module, choose M here: the + module will be called act_nat. + +config NET_ACT_PEDIT + tristate "Packet Editing" + depends on NET_CLS_ACT + ---help--- + Say Y here if you want to mangle the content of packets. + + To compile this code as a module, choose M here: the + module will be called act_pedit. + +config NET_ACT_SIMP + tristate "Simple Example (Debug)" + depends on NET_CLS_ACT + ---help--- + Say Y here to add a simple action for demonstration purposes. + It is meant as an example and for debugging purposes. It will + print a configured policy string followed by the packet count + to the console for every packet that passes by. + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called act_simple. + +config NET_ACT_SKBEDIT + tristate "SKB Editing" + depends on NET_CLS_ACT + ---help--- + Say Y here to change skb priority or queue_mapping settings. + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called act_skbedit. + +config NET_ACT_CSUM + tristate "Checksum Updating" + depends on NET_CLS_ACT && INET + ---help--- + Say Y here to update some common checksum after some direct + packet alterations. + + To compile this code as a module, choose M here: the + module will be called act_csum. + +config NET_CLS_IND + bool "Incoming device classification" + depends on NET_CLS_U32 || NET_CLS_FW + ---help--- + Say Y here to extend the u32 and fw classifier to support + classification based on the incoming device. This option is + likely to disappear in favour of the metadata ematch. + +endif # NET_SCHED + +config NET_SCH_FIFO + bool diff --git a/net/sched/Makefile b/net/sched/Makefile new file mode 100644 index 00000000..dc5889c0 --- /dev/null +++ b/net/sched/Makefile @@ -0,0 +1,54 @@ +# +# Makefile for the Linux Traffic Control Unit. 
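+# Each obj-$(CONFIG_FOO) += foo.o line below builds foo.o into the kernel
+# when the corresponding Kconfig option is set to y, or as the module foo.ko
+# when it is set to m.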
+# + +obj-y := sch_generic.o sch_mq.o + +obj-$(CONFIG_NET_SCHED) += sch_api.o sch_blackhole.o +obj-$(CONFIG_NET_CLS) += cls_api.o +obj-$(CONFIG_NET_CLS_ACT) += act_api.o +obj-$(CONFIG_NET_ACT_POLICE) += act_police.o +obj-$(CONFIG_NET_ACT_GACT) += act_gact.o +obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o +obj-$(CONFIG_NET_ACT_IPT) += act_ipt.o +obj-$(CONFIG_NET_ACT_NAT) += act_nat.o +obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o +obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o +obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o +obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o +obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o +obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o +obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o +obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o +obj-$(CONFIG_NET_SCH_RED) += sch_red.o +obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o +obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o +obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o +obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o +obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o +obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o +obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o +obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o +obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o +obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o +obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o +obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o +obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o +obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o +obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o + +obj-$(CONFIG_NET_CLS_U32) += cls_u32.o +obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o +obj-$(CONFIG_NET_CLS_FW) += cls_fw.o +obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o +obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o +obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o +obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o +obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o +obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o +obj-$(CONFIG_NET_EMATCH) += ematch.o +obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o +obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o +obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o +obj-$(CONFIG_NET_EMATCH_META) += em_meta.o +obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c new file mode 100644 index 00000000..a6060258 --- /dev/null +++ b/net/sched/act_api.c @@ -0,0 +1,1125 @@ +/* + * net/sched/act_api.c Packet action API. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Author: Jamal Hadi Salim + * + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo) +{ + unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); + struct tcf_common **p1p; + + for (p1p = &hinfo->htab[h]; *p1p; p1p = &(*p1p)->tcfc_next) { + if (*p1p == p) { + write_lock_bh(hinfo->lock); + *p1p = p->tcfc_next; + write_unlock_bh(hinfo->lock); + gen_kill_estimator(&p->tcfc_bstats, + &p->tcfc_rate_est); + /* + * gen_estimator est_timer() might access p->tcfc_lock + * or bstats, wait a RCU grace period before freeing p + */ + kfree_rcu(p, tcfc_rcu); + return; + } + } + WARN_ON(1); +} +EXPORT_SYMBOL(tcf_hash_destroy); + +int tcf_hash_release(struct tcf_common *p, int bind, + struct tcf_hashinfo *hinfo) +{ + int ret = 0; + + if (p) { + if (bind) + p->tcfc_bindcnt--; + + p->tcfc_refcnt--; + if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) { + tcf_hash_destroy(p, hinfo); + ret = 1; + } + } + return ret; +} +EXPORT_SYMBOL(tcf_hash_release); + +static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb, + struct tc_action *a, struct tcf_hashinfo *hinfo) +{ + struct tcf_common *p; + int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; + struct nlattr *nest; + + read_lock_bh(hinfo->lock); + + s_i = cb->args[0]; + + for (i = 0; i < (hinfo->hmask + 1); i++) { + p = hinfo->htab[tcf_hash(i, hinfo->hmask)]; + + for (; p; p = p->tcfc_next) { + index++; + if (index < s_i) + continue; + a->priv = p; + a->order = n_i; + + nest = nla_nest_start(skb, a->order); + if (nest == NULL) + goto nla_put_failure; + err = tcf_action_dump_1(skb, a, 0, 0); + if (err < 0) { + index--; + nlmsg_trim(skb, nest); + goto done; + } + nla_nest_end(skb, nest); + n_i++; + if (n_i >= TCA_ACT_MAX_PRIO) + goto done; + } + } +done: + read_unlock_bh(hinfo->lock); + if (n_i) + cb->args[0] += n_i; + return n_i; + +nla_put_failure: + nla_nest_cancel(skb, nest); + goto done; +} + +static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a, + struct tcf_hashinfo *hinfo) +{ + struct tcf_common *p, *s_p; + struct nlattr *nest; + int i = 0, n_i = 0; + + nest = nla_nest_start(skb, a->order); + if (nest == NULL) + goto nla_put_failure; + NLA_PUT_STRING(skb, TCA_KIND, a->ops->kind); + for (i = 0; i < (hinfo->hmask + 1); i++) { + p = hinfo->htab[tcf_hash(i, hinfo->hmask)]; + + while (p != NULL) { + s_p = p->tcfc_next; + if (ACT_P_DELETED == tcf_hash_release(p, 0, hinfo)) + module_put(a->ops->owner); + n_i++; + p = s_p; + } + } + NLA_PUT_U32(skb, TCA_FCNT, n_i); + nla_nest_end(skb, nest); + + return n_i; +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EINVAL; +} + +int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb, + int type, struct tc_action *a) +{ + struct tcf_hashinfo *hinfo = a->ops->hinfo; + + if (type == RTM_DELACTION) { + return tcf_del_walker(skb, a, hinfo); + } else if (type == RTM_GETACTION) { + return tcf_dump_walker(skb, cb, a, hinfo); + } else { + WARN(1, "tcf_generic_walker: unknown action %d\n", type); + return -EINVAL; + } +} +EXPORT_SYMBOL(tcf_generic_walker); + +struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo) +{ + struct tcf_common *p; + + read_lock_bh(hinfo->lock); + for (p = hinfo->htab[tcf_hash(index, hinfo->hmask)]; p; + p = p->tcfc_next) { + if (p->tcfc_index == index) + break; + } + read_unlock_bh(hinfo->lock); + + return p; +} +EXPORT_SYMBOL(tcf_hash_lookup); + +u32 
tcf_hash_new_index(u32 *idx_gen, struct tcf_hashinfo *hinfo) +{ + u32 val = *idx_gen; + + do { + if (++val == 0) + val = 1; + } while (tcf_hash_lookup(val, hinfo)); + + return (*idx_gen = val); +} +EXPORT_SYMBOL(tcf_hash_new_index); + +int tcf_hash_search(struct tc_action *a, u32 index) +{ + struct tcf_hashinfo *hinfo = a->ops->hinfo; + struct tcf_common *p = tcf_hash_lookup(index, hinfo); + + if (p) { + a->priv = p; + return 1; + } + return 0; +} +EXPORT_SYMBOL(tcf_hash_search); + +struct tcf_common *tcf_hash_check(u32 index, struct tc_action *a, int bind, + struct tcf_hashinfo *hinfo) +{ + struct tcf_common *p = NULL; + if (index && (p = tcf_hash_lookup(index, hinfo)) != NULL) { + if (bind) + p->tcfc_bindcnt++; + p->tcfc_refcnt++; + a->priv = p; + } + return p; +} +EXPORT_SYMBOL(tcf_hash_check); + +struct tcf_common *tcf_hash_create(u32 index, struct nlattr *est, + struct tc_action *a, int size, int bind, + u32 *idx_gen, struct tcf_hashinfo *hinfo) +{ + struct tcf_common *p = kzalloc(size, GFP_KERNEL); + + if (unlikely(!p)) + return ERR_PTR(-ENOMEM); + p->tcfc_refcnt = 1; + if (bind) + p->tcfc_bindcnt = 1; + + spin_lock_init(&p->tcfc_lock); + p->tcfc_index = index ? index : tcf_hash_new_index(idx_gen, hinfo); + p->tcfc_tm.install = jiffies; + p->tcfc_tm.lastuse = jiffies; + if (est) { + int err = gen_new_estimator(&p->tcfc_bstats, &p->tcfc_rate_est, + &p->tcfc_lock, est); + if (err) { + kfree(p); + return ERR_PTR(err); + } + } + + a->priv = (void *) p; + return p; +} +EXPORT_SYMBOL(tcf_hash_create); + +void tcf_hash_insert(struct tcf_common *p, struct tcf_hashinfo *hinfo) +{ + unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); + + write_lock_bh(hinfo->lock); + p->tcfc_next = hinfo->htab[h]; + hinfo->htab[h] = p; + write_unlock_bh(hinfo->lock); +} +EXPORT_SYMBOL(tcf_hash_insert); + +static struct tc_action_ops *act_base = NULL; +static DEFINE_RWLOCK(act_mod_lock); + +int tcf_register_action(struct tc_action_ops *act) +{ + struct tc_action_ops *a, **ap; + + write_lock(&act_mod_lock); + for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) { + if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) { + write_unlock(&act_mod_lock); + return -EEXIST; + } + } + act->next = NULL; + *ap = act; + write_unlock(&act_mod_lock); + return 0; +} +EXPORT_SYMBOL(tcf_register_action); + +int tcf_unregister_action(struct tc_action_ops *act) +{ + struct tc_action_ops *a, **ap; + int err = -ENOENT; + + write_lock(&act_mod_lock); + for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) + if (a == act) + break; + if (a) { + *ap = a->next; + a->next = NULL; + err = 0; + } + write_unlock(&act_mod_lock); + return err; +} +EXPORT_SYMBOL(tcf_unregister_action); + +/* lookup by name */ +static struct tc_action_ops *tc_lookup_action_n(char *kind) +{ + struct tc_action_ops *a = NULL; + + if (kind) { + read_lock(&act_mod_lock); + for (a = act_base; a; a = a->next) { + if (strcmp(kind, a->kind) == 0) { + if (!try_module_get(a->owner)) { + read_unlock(&act_mod_lock); + return NULL; + } + break; + } + } + read_unlock(&act_mod_lock); + } + return a; +} + +/* lookup by nlattr */ +static struct tc_action_ops *tc_lookup_action(struct nlattr *kind) +{ + struct tc_action_ops *a = NULL; + + if (kind) { + read_lock(&act_mod_lock); + for (a = act_base; a; a = a->next) { + if (nla_strcmp(kind, a->kind) == 0) { + if (!try_module_get(a->owner)) { + read_unlock(&act_mod_lock); + return NULL; + } + break; + } + } + read_unlock(&act_mod_lock); + } + return a; +} + +#if 0 +/* lookup by id */ +static struct tc_action_ops 
*tc_lookup_action_id(u32 type) +{ + struct tc_action_ops *a = NULL; + + if (type) { + read_lock(&act_mod_lock); + for (a = act_base; a; a = a->next) { + if (a->type == type) { + if (!try_module_get(a->owner)) { + read_unlock(&act_mod_lock); + return NULL; + } + break; + } + } + read_unlock(&act_mod_lock); + } + return a; +} +#endif + +int tcf_action_exec(struct sk_buff *skb, struct tc_action *act, + struct tcf_result *res) +{ + struct tc_action *a; + int ret = -1; + + if (skb->tc_verd & TC_NCLS) { + skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); + ret = TC_ACT_OK; + goto exec_done; + } + while ((a = act) != NULL) { +repeat: + if (a->ops && a->ops->act) { + ret = a->ops->act(skb, a, res); + if (TC_MUNGED & skb->tc_verd) { + /* copied already, allow trampling */ + skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); + skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd); + } + if (ret == TC_ACT_REPEAT) + goto repeat; /* we need a ttl - JHS */ + if (ret != TC_ACT_PIPE) + goto exec_done; + } + act = a->next; + } +exec_done: + return ret; +} +EXPORT_SYMBOL(tcf_action_exec); + +void tcf_action_destroy(struct tc_action *act, int bind) +{ + struct tc_action *a; + + for (a = act; a; a = act) { + if (a->ops && a->ops->cleanup) { + if (a->ops->cleanup(a, bind) == ACT_P_DELETED) + module_put(a->ops->owner); + act = act->next; + kfree(a); + } else { + /*FIXME: Remove later - catch insertion bugs*/ + WARN(1, "tcf_action_destroy: BUG? destroying NULL ops\n"); + act = act->next; + kfree(a); + } + } +} + +int +tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + int err = -EINVAL; + + if (a->ops == NULL || a->ops->dump == NULL) + return err; + return a->ops->dump(skb, a, bind, ref); +} + +int +tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + int err = -EINVAL; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; + + if (a->ops == NULL || a->ops->dump == NULL) + return err; + + NLA_PUT_STRING(skb, TCA_KIND, a->ops->kind); + if (tcf_action_copy_stats(skb, a, 0)) + goto nla_put_failure; + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + err = tcf_action_dump_old(skb, a, bind, ref); + if (err > 0) { + nla_nest_end(skb, nest); + return err; + } + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} +EXPORT_SYMBOL(tcf_action_dump_1); + +int +tcf_action_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref) +{ + struct tc_action *a; + int err = -EINVAL; + struct nlattr *nest; + + while ((a = act) != NULL) { + act = a->next; + nest = nla_nest_start(skb, a->order); + if (nest == NULL) + goto nla_put_failure; + err = tcf_action_dump_1(skb, a, bind, ref); + if (err < 0) + goto errout; + nla_nest_end(skb, nest); + } + + return 0; + +nla_put_failure: + err = -EINVAL; +errout: + nla_nest_cancel(skb, nest); + return err; +} + +struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est, + char *name, int ovr, int bind) +{ + struct tc_action *a; + struct tc_action_ops *a_o; + char act_name[IFNAMSIZ]; + struct nlattr *tb[TCA_ACT_MAX + 1]; + struct nlattr *kind; + int err; + + if (name == NULL) { + err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL); + if (err < 0) + goto err_out; + err = -EINVAL; + kind = tb[TCA_ACT_KIND]; + if (kind == NULL) + goto err_out; + if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) + goto err_out; + } else { + err = -EINVAL; + if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) + goto err_out; + } + + a_o = tc_lookup_action_n(act_name); + if (a_o == NULL) { +#ifdef 
CONFIG_MODULES + rtnl_unlock(); + request_module("act_%s", act_name); + rtnl_lock(); + + a_o = tc_lookup_action_n(act_name); + + /* We dropped the RTNL semaphore in order to + * perform the module load. So, even if we + * succeeded in loading the module we have to + * tell the caller to replay the request. We + * indicate this using -EAGAIN. + */ + if (a_o != NULL) { + err = -EAGAIN; + goto err_mod; + } +#endif + err = -ENOENT; + goto err_out; + } + + err = -ENOMEM; + a = kzalloc(sizeof(*a), GFP_KERNEL); + if (a == NULL) + goto err_mod; + + /* backward compatibility for policer */ + if (name == NULL) + err = a_o->init(tb[TCA_ACT_OPTIONS], est, a, ovr, bind); + else + err = a_o->init(nla, est, a, ovr, bind); + if (err < 0) + goto err_free; + + /* module count goes up only when brand new policy is created + * if it exists and is only bound to in a_o->init() then + * ACT_P_CREATED is not returned (a zero is). + */ + if (err != ACT_P_CREATED) + module_put(a_o->owner); + a->ops = a_o; + + return a; + +err_free: + kfree(a); +err_mod: + module_put(a_o->owner); +err_out: + return ERR_PTR(err); +} + +struct tc_action *tcf_action_init(struct nlattr *nla, struct nlattr *est, + char *name, int ovr, int bind) +{ + struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; + struct tc_action *head = NULL, *act, *act_prev = NULL; + int err; + int i; + + err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL); + if (err < 0) + return ERR_PTR(err); + + for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { + act = tcf_action_init_1(tb[i], est, name, ovr, bind); + if (IS_ERR(act)) + goto err; + act->order = i; + + if (head == NULL) + head = act; + else + act_prev->next = act; + act_prev = act; + } + return head; + +err: + if (head != NULL) + tcf_action_destroy(head, bind); + return act; +} + +int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, + int compat_mode) +{ + int err = 0; + struct gnet_dump d; + struct tcf_act_hdr *h = a->priv; + + if (h == NULL) + goto errout; + + /* compat_mode being true specifies a call that is supposed + * to add additional backward compatibility statistic TLVs. 
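+ * (TCA_STATS/TCA_XSTATS at the top level for TCA_OLD_COMPAT actions,
+ * instead of the nested TCA_ACT_STATS block used in the normal case).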
+ */ + if (compat_mode) { + if (a->type == TCA_OLD_COMPAT) + err = gnet_stats_start_copy_compat(skb, 0, + TCA_STATS, TCA_XSTATS, &h->tcf_lock, &d); + else + return 0; + } else + err = gnet_stats_start_copy(skb, TCA_ACT_STATS, + &h->tcf_lock, &d); + + if (err < 0) + goto errout; + + if (a->ops != NULL && a->ops->get_stats != NULL) + if (a->ops->get_stats(skb, a) < 0) + goto errout; + + if (gnet_stats_copy_basic(&d, &h->tcf_bstats) < 0 || + gnet_stats_copy_rate_est(&d, &h->tcf_bstats, + &h->tcf_rate_est) < 0 || + gnet_stats_copy_queue(&d, &h->tcf_qstats) < 0) + goto errout; + + if (gnet_stats_finish_copy(&d) < 0) + goto errout; + + return 0; + +errout: + return -1; +} + +static int +tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq, + u16 flags, int event, int bind, int ref) +{ + struct tcamsg *t; + struct nlmsghdr *nlh; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; + + nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags); + + t = NLMSG_DATA(nlh); + t->tca_family = AF_UNSPEC; + t->tca__pad1 = 0; + t->tca__pad2 = 0; + + nest = nla_nest_start(skb, TCA_ACT_TAB); + if (nest == NULL) + goto nla_put_failure; + + if (tcf_action_dump(skb, a, bind, ref) < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; + +nla_put_failure: +nlmsg_failure: + nlmsg_trim(skb, b); + return -1; +} + +static int +act_get_notify(struct net *net, u32 pid, struct nlmsghdr *n, + struct tc_action *a, int event) +{ + struct sk_buff *skb; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + if (tca_get_fill(skb, a, pid, n->nlmsg_seq, 0, event, 0, 0) <= 0) { + kfree_skb(skb); + return -EINVAL; + } + + return rtnl_unicast(skb, net, pid); +} + +static struct tc_action * +tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 pid) +{ + struct nlattr *tb[TCA_ACT_MAX + 1]; + struct tc_action *a; + int index; + int err; + + err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL); + if (err < 0) + goto err_out; + + err = -EINVAL; + if (tb[TCA_ACT_INDEX] == NULL || + nla_len(tb[TCA_ACT_INDEX]) < sizeof(index)) + goto err_out; + index = nla_get_u32(tb[TCA_ACT_INDEX]); + + err = -ENOMEM; + a = kzalloc(sizeof(struct tc_action), GFP_KERNEL); + if (a == NULL) + goto err_out; + + err = -EINVAL; + a->ops = tc_lookup_action(tb[TCA_ACT_KIND]); + if (a->ops == NULL) + goto err_free; + if (a->ops->lookup == NULL) + goto err_mod; + err = -ENOENT; + if (a->ops->lookup(a, index) == 0) + goto err_mod; + + module_put(a->ops->owner); + return a; + +err_mod: + module_put(a->ops->owner); +err_free: + kfree(a); +err_out: + return ERR_PTR(err); +} + +static void cleanup_a(struct tc_action *act) +{ + struct tc_action *a; + + for (a = act; a; a = act) { + act = a->next; + kfree(a); + } +} + +static struct tc_action *create_a(int i) +{ + struct tc_action *act; + + act = kzalloc(sizeof(*act), GFP_KERNEL); + if (act == NULL) { + pr_debug("create_a: failed to alloc!\n"); + return NULL; + } + act->order = i; + return act; +} + +static int tca_action_flush(struct net *net, struct nlattr *nla, + struct nlmsghdr *n, u32 pid) +{ + struct sk_buff *skb; + unsigned char *b; + struct nlmsghdr *nlh; + struct tcamsg *t; + struct netlink_callback dcb; + struct nlattr *nest; + struct nlattr *tb[TCA_ACT_MAX + 1]; + struct nlattr *kind; + struct tc_action *a = create_a(0); + int err = -ENOMEM; + + if (a == NULL) { + pr_debug("tca_action_flush: couldnt create tc_action\n"); + return err; + } + + skb = alloc_skb(NLMSG_GOODSIZE, 
GFP_KERNEL); + if (!skb) { + pr_debug("tca_action_flush: failed skb alloc\n"); + kfree(a); + return err; + } + + b = skb_tail_pointer(skb); + + err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL); + if (err < 0) + goto err_out; + + err = -EINVAL; + kind = tb[TCA_ACT_KIND]; + a->ops = tc_lookup_action(kind); + if (a->ops == NULL) + goto err_out; + + nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t)); + t = NLMSG_DATA(nlh); + t->tca_family = AF_UNSPEC; + t->tca__pad1 = 0; + t->tca__pad2 = 0; + + nest = nla_nest_start(skb, TCA_ACT_TAB); + if (nest == NULL) + goto nla_put_failure; + + err = a->ops->walk(skb, &dcb, RTM_DELACTION, a); + if (err < 0) + goto nla_put_failure; + if (err == 0) + goto noflush_out; + + nla_nest_end(skb, nest); + + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + nlh->nlmsg_flags |= NLM_F_ROOT; + module_put(a->ops->owner); + kfree(a); + err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); + if (err > 0) + return 0; + + return err; + +nla_put_failure: +nlmsg_failure: + module_put(a->ops->owner); +err_out: +noflush_out: + kfree_skb(skb); + kfree(a); + return err; +} + +static int +tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, + u32 pid, int event) +{ + int i, ret; + struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; + struct tc_action *head = NULL, *act, *act_prev = NULL; + + ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL); + if (ret < 0) + return ret; + + if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) { + if (tb[1] != NULL) + return tca_action_flush(net, tb[1], n, pid); + else + return -EINVAL; + } + + for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { + act = tcf_action_get_1(tb[i], n, pid); + if (IS_ERR(act)) { + ret = PTR_ERR(act); + goto err; + } + act->order = i; + + if (head == NULL) + head = act; + else + act_prev->next = act; + act_prev = act; + } + + if (event == RTM_GETACTION) + ret = act_get_notify(net, pid, n, head, event); + else { /* delete */ + struct sk_buff *skb; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) { + ret = -ENOBUFS; + goto err; + } + + if (tca_get_fill(skb, head, pid, n->nlmsg_seq, 0, event, + 0, 1) <= 0) { + kfree_skb(skb); + ret = -EINVAL; + goto err; + } + + /* now do the delete */ + tcf_action_destroy(head, 0); + ret = rtnetlink_send(skb, net, pid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); + if (ret > 0) + return 0; + return ret; + } +err: + cleanup_a(head); + return ret; +} + +static int tcf_add_notify(struct net *net, struct tc_action *a, + u32 pid, u32 seq, int event, u16 flags) +{ + struct tcamsg *t; + struct nlmsghdr *nlh; + struct sk_buff *skb; + struct nlattr *nest; + unsigned char *b; + int err = 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + b = skb_tail_pointer(skb); + + nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags); + t = NLMSG_DATA(nlh); + t->tca_family = AF_UNSPEC; + t->tca__pad1 = 0; + t->tca__pad2 = 0; + + nest = nla_nest_start(skb, TCA_ACT_TAB); + if (nest == NULL) + goto nla_put_failure; + + if (tcf_action_dump(skb, a, 0, 0) < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + NETLINK_CB(skb).dst_group = RTNLGRP_TC; + + err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, flags & NLM_F_ECHO); + if (err > 0) + err = 0; + return err; + +nla_put_failure: +nlmsg_failure: + kfree_skb(skb); + return -1; +} + + +static int +tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n, + u32 pid, int ovr) +{ + int ret = 0; + struct 
tc_action *act; + struct tc_action *a; + u32 seq = n->nlmsg_seq; + + act = tcf_action_init(nla, NULL, NULL, ovr, 0); + if (act == NULL) + goto done; + if (IS_ERR(act)) { + ret = PTR_ERR(act); + goto done; + } + + /* dump then free all the actions after update; inserted policy + * stays intact + */ + ret = tcf_add_notify(net, act, pid, seq, RTM_NEWACTION, n->nlmsg_flags); + for (a = act; a; a = act) { + act = a->next; + kfree(a); + } +done: + return ret; +} + +static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tca[TCA_ACT_MAX + 1]; + u32 pid = skb ? NETLINK_CB(skb).pid : 0; + int ret = 0, ovr = 0; + + ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL); + if (ret < 0) + return ret; + + if (tca[TCA_ACT_TAB] == NULL) { + pr_notice("tc_ctl_action: received NO action attribs\n"); + return -EINVAL; + } + + /* n->nlmsg_flags & NLM_F_CREATE */ + switch (n->nlmsg_type) { + case RTM_NEWACTION: + /* we are going to assume all other flags + * imply create only if it doesn't exist + * Note that CREATE | EXCL implies that + * but since we want avoid ambiguity (eg when flags + * is zero) then just set this + */ + if (n->nlmsg_flags & NLM_F_REPLACE) + ovr = 1; +replay: + ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, pid, ovr); + if (ret == -EAGAIN) + goto replay; + break; + case RTM_DELACTION: + ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, + pid, RTM_DELACTION); + break; + case RTM_GETACTION: + ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, + pid, RTM_GETACTION); + break; + default: + BUG(); + } + + return ret; +} + +static struct nlattr * +find_dump_kind(const struct nlmsghdr *n) +{ + struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1]; + struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; + struct nlattr *nla[TCAA_MAX + 1]; + struct nlattr *kind; + + if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX, NULL) < 0) + return NULL; + tb1 = nla[TCA_ACT_TAB]; + if (tb1 == NULL) + return NULL; + + if (nla_parse(tb, TCA_ACT_MAX_PRIO, nla_data(tb1), + NLMSG_ALIGN(nla_len(tb1)), NULL) < 0) + return NULL; + + if (tb[1] == NULL) + return NULL; + if (nla_parse(tb2, TCA_ACT_MAX, nla_data(tb[1]), + nla_len(tb[1]), NULL) < 0) + return NULL; + kind = tb2[TCA_ACT_KIND]; + + return kind; +} + +static int +tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlmsghdr *nlh; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; + struct tc_action_ops *a_o; + struct tc_action a; + int ret = 0; + struct tcamsg *t = (struct tcamsg *) NLMSG_DATA(cb->nlh); + struct nlattr *kind = find_dump_kind(cb->nlh); + + if (kind == NULL) { + pr_info("tc_dump_action: action bad kind\n"); + return 0; + } + + a_o = tc_lookup_action(kind); + if (a_o == NULL) + return 0; + + memset(&a, 0, sizeof(struct tc_action)); + a.ops = a_o; + + if (a_o->walk == NULL) { + WARN(1, "tc_dump_action: %s !capable of dumping table\n", + a_o->kind); + goto nla_put_failure; + } + + nlh = NLMSG_PUT(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + cb->nlh->nlmsg_type, sizeof(*t)); + t = NLMSG_DATA(nlh); + t->tca_family = AF_UNSPEC; + t->tca__pad1 = 0; + t->tca__pad2 = 0; + + nest = nla_nest_start(skb, TCA_ACT_TAB); + if (nest == NULL) + goto nla_put_failure; + + ret = a_o->walk(skb, cb, RTM_GETACTION, &a); + if (ret < 0) + goto nla_put_failure; + + if (ret > 0) { + nla_nest_end(skb, nest); + ret = skb->len; + } else + nla_nest_cancel(skb, nest); + + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + if (NETLINK_CB(cb->skb).pid && ret) + 
nlh->nlmsg_flags |= NLM_F_MULTI; + module_put(a_o->owner); + return skb->len; + +nla_put_failure: +nlmsg_failure: + module_put(a_o->owner); + nlmsg_trim(skb, b); + return skb->len; +} + +static int __init tc_action_init(void) +{ + rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL); + rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL); + rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action); + + return 0; +} + +subsys_initcall(tc_action_init); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c new file mode 100644 index 00000000..6cdf9abe --- /dev/null +++ b/net/sched/act_csum.c @@ -0,0 +1,594 @@ +/* + * Checksum updating actions + * + * Copyright (c) 2010 Gregoire Baron + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#define CSUM_TAB_MASK 15 +static struct tcf_common *tcf_csum_ht[CSUM_TAB_MASK + 1]; +static u32 csum_idx_gen; +static DEFINE_RWLOCK(csum_lock); + +static struct tcf_hashinfo csum_hash_info = { + .htab = tcf_csum_ht, + .hmask = CSUM_TAB_MASK, + .lock = &csum_lock, +}; + +static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { + [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), }, +}; + +static int tcf_csum_init(struct nlattr *nla, struct nlattr *est, + struct tc_action *a, int ovr, int bind) +{ + struct nlattr *tb[TCA_CSUM_MAX + 1]; + struct tc_csum *parm; + struct tcf_common *pc; + struct tcf_csum *p; + int ret = 0, err; + + if (nla == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_CSUM_MAX, nla, csum_policy); + if (err < 0) + return err; + + if (tb[TCA_CSUM_PARMS] == NULL) + return -EINVAL; + parm = nla_data(tb[TCA_CSUM_PARMS]); + + pc = tcf_hash_check(parm->index, a, bind, &csum_hash_info); + if (!pc) { + pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, + &csum_idx_gen, &csum_hash_info); + if (IS_ERR(pc)) + return PTR_ERR(pc); + p = to_tcf_csum(pc); + ret = ACT_P_CREATED; + } else { + p = to_tcf_csum(pc); + if (!ovr) { + tcf_hash_release(pc, bind, &csum_hash_info); + return -EEXIST; + } + } + + spin_lock_bh(&p->tcf_lock); + p->tcf_action = parm->action; + p->update_flags = parm->update_flags; + spin_unlock_bh(&p->tcf_lock); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(pc, &csum_hash_info); + + return ret; +} + +static int tcf_csum_cleanup(struct tc_action *a, int bind) +{ + struct tcf_csum *p = a->priv; + return tcf_hash_release(&p->common, bind, &csum_hash_info); +} + +/** + * tcf_csum_skb_nextlayer - Get next layer pointer + * @skb: sk_buff to use + * @ihl: previous summed headers length + * @ipl: complete packet length + * @jhl: next header length + * + * Check the expected next layer availability in the specified sk_buff. + * Return the next layer pointer if pass, NULL otherwise. 
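+ * A cloned skb is made writable when necessary (via pskb_expand_head),
+ * so callers can update checksum fields in the returned header in place.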
+ */ +static void *tcf_csum_skb_nextlayer(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl, + unsigned int jhl) +{ + int ntkoff = skb_network_offset(skb); + int hl = ihl + jhl; + + if (!pskb_may_pull(skb, ipl + ntkoff) || (ipl < hl) || + (skb_cloned(skb) && + !skb_clone_writable(skb, hl + ntkoff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + return NULL; + else + return (void *)(skb_network_header(skb) + ihl); +} + +static int tcf_csum_ipv4_icmp(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl) +{ + struct icmphdr *icmph; + + icmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmph)); + if (icmph == NULL) + return 0; + + icmph->checksum = 0; + skb->csum = csum_partial(icmph, ipl - ihl, 0); + icmph->checksum = csum_fold(skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv4_igmp(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl) +{ + struct igmphdr *igmph; + + igmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*igmph)); + if (igmph == NULL) + return 0; + + igmph->csum = 0; + skb->csum = csum_partial(igmph, ipl - ihl, 0); + igmph->csum = csum_fold(skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv6_icmp(struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int ihl, unsigned int ipl) +{ + struct icmp6hdr *icmp6h; + + icmp6h = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmp6h)); + if (icmp6h == NULL) + return 0; + + icmp6h->icmp6_cksum = 0; + skb->csum = csum_partial(icmp6h, ipl - ihl, 0); + icmp6h->icmp6_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + ipl - ihl, IPPROTO_ICMPV6, + skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv4_tcp(struct sk_buff *skb, struct iphdr *iph, + unsigned int ihl, unsigned int ipl) +{ + struct tcphdr *tcph; + + tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph)); + if (tcph == NULL) + return 0; + + tcph->check = 0; + skb->csum = csum_partial(tcph, ipl - ihl, 0); + tcph->check = tcp_v4_check(ipl - ihl, + iph->saddr, iph->daddr, skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv6_tcp(struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int ihl, unsigned int ipl) +{ + struct tcphdr *tcph; + + tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph)); + if (tcph == NULL) + return 0; + + tcph->check = 0; + skb->csum = csum_partial(tcph, ipl - ihl, 0); + tcph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + ipl - ihl, IPPROTO_TCP, + skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv4_udp(struct sk_buff *skb, struct iphdr *iph, + unsigned int ihl, unsigned int ipl, int udplite) +{ + struct udphdr *udph; + u16 ul; + + /* + * Support both UDP and UDPLITE checksum algorithms, Don't use + * udph->len to get the real length without any protocol check, + * UDPLITE uses udph->len for another thing, + * Use iph->tot_len, or just ipl. 
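+ * (in UDP-Lite the length field carries the checksum coverage, see RFC 3828).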
+ */ + + udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph)); + if (udph == NULL) + return 0; + + ul = ntohs(udph->len); + + if (udplite || udph->check) { + + udph->check = 0; + + if (udplite) { + if (ul == 0) + skb->csum = csum_partial(udph, ipl - ihl, 0); + else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl)) + skb->csum = csum_partial(udph, ul, 0); + else + goto ignore_obscure_skb; + } else { + if (ul != ipl - ihl) + goto ignore_obscure_skb; + + skb->csum = csum_partial(udph, ul, 0); + } + + udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + ul, iph->protocol, + skb->csum); + + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } + + skb->ip_summed = CHECKSUM_NONE; + +ignore_obscure_skb: + return 1; +} + +static int tcf_csum_ipv6_udp(struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int ihl, unsigned int ipl, int udplite) +{ + struct udphdr *udph; + u16 ul; + + /* + * Support both UDP and UDPLITE checksum algorithms, Don't use + * udph->len to get the real length without any protocol check, + * UDPLITE uses udph->len for another thing, + * Use ip6h->payload_len + sizeof(*ip6h) ... , or just ipl. + */ + + udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph)); + if (udph == NULL) + return 0; + + ul = ntohs(udph->len); + + udph->check = 0; + + if (udplite) { + if (ul == 0) + skb->csum = csum_partial(udph, ipl - ihl, 0); + + else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl)) + skb->csum = csum_partial(udph, ul, 0); + + else + goto ignore_obscure_skb; + } else { + if (ul != ipl - ihl) + goto ignore_obscure_skb; + + skb->csum = csum_partial(udph, ul, 0); + } + + udph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, ul, + udplite ? IPPROTO_UDPLITE : IPPROTO_UDP, + skb->csum); + + if (!udph->check) + udph->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_NONE; + +ignore_obscure_skb: + return 1; +} + +static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) +{ + struct iphdr *iph; + int ntkoff; + + ntkoff = skb_network_offset(skb); + + if (!pskb_may_pull(skb, sizeof(*iph) + ntkoff)) + goto fail; + + iph = ip_hdr(skb); + + switch (iph->frag_off & htons(IP_OFFSET) ? 
0 : iph->protocol) { + case IPPROTO_ICMP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP) + if (!tcf_csum_ipv4_icmp(skb, iph->ihl * 4, + ntohs(iph->tot_len))) + goto fail; + break; + case IPPROTO_IGMP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_IGMP) + if (!tcf_csum_ipv4_igmp(skb, iph->ihl * 4, + ntohs(iph->tot_len))) + goto fail; + break; + case IPPROTO_TCP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP) + if (!tcf_csum_ipv4_tcp(skb, iph, iph->ihl * 4, + ntohs(iph->tot_len))) + goto fail; + break; + case IPPROTO_UDP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP) + if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4, + ntohs(iph->tot_len), 0)) + goto fail; + break; + case IPPROTO_UDPLITE: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE) + if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4, + ntohs(iph->tot_len), 1)) + goto fail; + break; + } + + if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) { + if (skb_cloned(skb) && + !skb_clone_writable(skb, sizeof(*iph) + ntkoff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto fail; + + ip_send_check(iph); + } + + return 1; + +fail: + return 0; +} + +static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh, + unsigned int ixhl, unsigned int *pl) +{ + int off, len, optlen; + unsigned char *xh = (void *)ip6xh; + + off = sizeof(*ip6xh); + len = ixhl - off; + + while (len > 1) { + switch (xh[off]) { + case IPV6_TLV_PAD0: + optlen = 1; + break; + case IPV6_TLV_JUMBO: + optlen = xh[off + 1] + 2; + if (optlen != 6 || len < 6 || (off & 3) != 2) + /* wrong jumbo option length/alignment */ + return 0; + *pl = ntohl(*(__be32 *)(xh + off + 2)); + goto done; + default: + optlen = xh[off + 1] + 2; + if (optlen > len) + /* ignore obscure options */ + goto done; + break; + } + off += optlen; + len -= optlen; + } + +done: + return 1; +} + +static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags) +{ + struct ipv6hdr *ip6h; + struct ipv6_opt_hdr *ip6xh; + unsigned int hl, ixhl; + unsigned int pl; + int ntkoff; + u8 nexthdr; + + ntkoff = skb_network_offset(skb); + + hl = sizeof(*ip6h); + + if (!pskb_may_pull(skb, hl + ntkoff)) + goto fail; + + ip6h = ipv6_hdr(skb); + + pl = ntohs(ip6h->payload_len); + nexthdr = ip6h->nexthdr; + + do { + switch (nexthdr) { + case NEXTHDR_FRAGMENT: + goto ignore_skb; + case NEXTHDR_ROUTING: + case NEXTHDR_HOP: + case NEXTHDR_DEST: + if (!pskb_may_pull(skb, hl + sizeof(*ip6xh) + ntkoff)) + goto fail; + ip6xh = (void *)(skb_network_header(skb) + hl); + ixhl = ipv6_optlen(ip6xh); + if (!pskb_may_pull(skb, hl + ixhl + ntkoff)) + goto fail; + if ((nexthdr == NEXTHDR_HOP) && + !(tcf_csum_ipv6_hopopts(ip6xh, ixhl, &pl))) + goto fail; + nexthdr = ip6xh->nexthdr; + hl += ixhl; + break; + case IPPROTO_ICMPV6: + if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP) + if (!tcf_csum_ipv6_icmp(skb, ip6h, + hl, pl + sizeof(*ip6h))) + goto fail; + goto done; + case IPPROTO_TCP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP) + if (!tcf_csum_ipv6_tcp(skb, ip6h, + hl, pl + sizeof(*ip6h))) + goto fail; + goto done; + case IPPROTO_UDP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP) + if (!tcf_csum_ipv6_udp(skb, ip6h, hl, + pl + sizeof(*ip6h), 0)) + goto fail; + goto done; + case IPPROTO_UDPLITE: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE) + if (!tcf_csum_ipv6_udp(skb, ip6h, hl, + pl + sizeof(*ip6h), 1)) + goto fail; + goto done; + default: + goto ignore_skb; + } + } while (pskb_may_pull(skb, hl + 1 + ntkoff)); + +done: +ignore_skb: + return 1; + +fail: + return 0; +} + +static int tcf_csum(struct sk_buff *skb, + struct tc_action *a, struct tcf_result 
*res) +{ + struct tcf_csum *p = a->priv; + int action; + u32 update_flags; + + spin_lock(&p->tcf_lock); + p->tcf_tm.lastuse = jiffies; + bstats_update(&p->tcf_bstats, skb); + action = p->tcf_action; + update_flags = p->update_flags; + spin_unlock(&p->tcf_lock); + + if (unlikely(action == TC_ACT_SHOT)) + goto drop; + + switch (skb->protocol) { + case cpu_to_be16(ETH_P_IP): + if (!tcf_csum_ipv4(skb, update_flags)) + goto drop; + break; + case cpu_to_be16(ETH_P_IPV6): + if (!tcf_csum_ipv6(skb, update_flags)) + goto drop; + break; + } + + return action; + +drop: + spin_lock(&p->tcf_lock); + p->tcf_qstats.drops++; + spin_unlock(&p->tcf_lock); + return TC_ACT_SHOT; +} + +static int tcf_csum_dump(struct sk_buff *skb, + struct tc_action *a, int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_csum *p = a->priv; + struct tc_csum opt = { + .update_flags = p->update_flags, + .index = p->tcf_index, + .action = p->tcf_action, + .refcnt = p->tcf_refcnt - ref, + .bindcnt = p->tcf_bindcnt - bind, + }; + struct tcf_t t; + + NLA_PUT(skb, TCA_CSUM_PARMS, sizeof(opt), &opt); + t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(p->tcf_tm.expires); + NLA_PUT(skb, TCA_CSUM_TM, sizeof(t), &t); + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tc_action_ops act_csum_ops = { + .kind = "csum", + .hinfo = &csum_hash_info, + .type = TCA_ACT_CSUM, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_csum, + .dump = tcf_csum_dump, + .cleanup = tcf_csum_cleanup, + .lookup = tcf_hash_search, + .init = tcf_csum_init, + .walk = tcf_generic_walker +}; + +MODULE_DESCRIPTION("Checksum updating actions"); +MODULE_LICENSE("GPL"); + +static int __init csum_init_module(void) +{ + return tcf_register_action(&act_csum_ops); +} + +static void __exit csum_cleanup_module(void) +{ + tcf_unregister_action(&act_csum_ops); +} + +module_init(csum_init_module); +module_exit(csum_cleanup_module); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c new file mode 100644 index 00000000..2b4ab4b0 --- /dev/null +++ b/net/sched/act_gact.c @@ -0,0 +1,221 @@ +/* + * net/sched/gact.c Generic actions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * copyright Jamal Hadi Salim (2002-4) + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define GACT_TAB_MASK 15 +static struct tcf_common *tcf_gact_ht[GACT_TAB_MASK + 1]; +static u32 gact_idx_gen; +static DEFINE_RWLOCK(gact_lock); + +static struct tcf_hashinfo gact_hash_info = { + .htab = tcf_gact_ht, + .hmask = GACT_TAB_MASK, + .lock = &gact_lock, +}; + +#ifdef CONFIG_GACT_PROB +static int gact_net_rand(struct tcf_gact *gact) +{ + if (!gact->tcfg_pval || net_random() % gact->tcfg_pval) + return gact->tcf_action; + return gact->tcfg_paction; +} + +static int gact_determ(struct tcf_gact *gact) +{ + if (!gact->tcfg_pval || gact->tcf_bstats.packets % gact->tcfg_pval) + return gact->tcf_action; + return gact->tcfg_paction; +} + +typedef int (*g_rand)(struct tcf_gact *gact); +static g_rand gact_rand[MAX_RAND] = { NULL, gact_net_rand, gact_determ }; +#endif /* CONFIG_GACT_PROB */ + +static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { + [TCA_GACT_PARMS] = { .len = sizeof(struct tc_gact) }, + [TCA_GACT_PROB] = { .len = sizeof(struct tc_gact_p) }, +}; + +static int tcf_gact_init(struct nlattr *nla, struct nlattr *est, + struct tc_action *a, int ovr, int bind) +{ + struct nlattr *tb[TCA_GACT_MAX + 1]; + struct tc_gact *parm; + struct tcf_gact *gact; + struct tcf_common *pc; + int ret = 0; + int err; + + if (nla == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_GACT_MAX, nla, gact_policy); + if (err < 0) + return err; + + if (tb[TCA_GACT_PARMS] == NULL) + return -EINVAL; + parm = nla_data(tb[TCA_GACT_PARMS]); + +#ifndef CONFIG_GACT_PROB + if (tb[TCA_GACT_PROB] != NULL) + return -EOPNOTSUPP; +#endif + + pc = tcf_hash_check(parm->index, a, bind, &gact_hash_info); + if (!pc) { + pc = tcf_hash_create(parm->index, est, a, sizeof(*gact), + bind, &gact_idx_gen, &gact_hash_info); + if (IS_ERR(pc)) + return PTR_ERR(pc); + ret = ACT_P_CREATED; + } else { + if (!ovr) { + tcf_hash_release(pc, bind, &gact_hash_info); + return -EEXIST; + } + } + + gact = to_gact(pc); + + spin_lock_bh(&gact->tcf_lock); + gact->tcf_action = parm->action; +#ifdef CONFIG_GACT_PROB + if (tb[TCA_GACT_PROB] != NULL) { + struct tc_gact_p *p_parm = nla_data(tb[TCA_GACT_PROB]); + gact->tcfg_paction = p_parm->paction; + gact->tcfg_pval = p_parm->pval; + gact->tcfg_ptype = p_parm->ptype; + } +#endif + spin_unlock_bh(&gact->tcf_lock); + if (ret == ACT_P_CREATED) + tcf_hash_insert(pc, &gact_hash_info); + return ret; +} + +static int tcf_gact_cleanup(struct tc_action *a, int bind) +{ + struct tcf_gact *gact = a->priv; + + if (gact) + return tcf_hash_release(&gact->common, bind, &gact_hash_info); + return 0; +} + +static int tcf_gact(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res) +{ + struct tcf_gact *gact = a->priv; + int action = TC_ACT_SHOT; + + spin_lock(&gact->tcf_lock); +#ifdef CONFIG_GACT_PROB + if (gact->tcfg_ptype && gact_rand[gact->tcfg_ptype] != NULL) + action = gact_rand[gact->tcfg_ptype](gact); + else + action = gact->tcf_action; +#else + action = gact->tcf_action; +#endif + gact->tcf_bstats.bytes += qdisc_pkt_len(skb); + gact->tcf_bstats.packets++; + if (action == TC_ACT_SHOT) + gact->tcf_qstats.drops++; + gact->tcf_tm.lastuse = jiffies; + spin_unlock(&gact->tcf_lock); + + return action; +} + +static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_gact *gact = a->priv; + struct tc_gact opt = { + .index = 
gact->tcf_index, + .refcnt = gact->tcf_refcnt - ref, + .bindcnt = gact->tcf_bindcnt - bind, + .action = gact->tcf_action, + }; + struct tcf_t t; + + NLA_PUT(skb, TCA_GACT_PARMS, sizeof(opt), &opt); +#ifdef CONFIG_GACT_PROB + if (gact->tcfg_ptype) { + struct tc_gact_p p_opt = { + .paction = gact->tcfg_paction, + .pval = gact->tcfg_pval, + .ptype = gact->tcfg_ptype, + }; + + NLA_PUT(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt); + } +#endif + t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(gact->tcf_tm.expires); + NLA_PUT(skb, TCA_GACT_TM, sizeof(t), &t); + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tc_action_ops act_gact_ops = { + .kind = "gact", + .hinfo = &gact_hash_info, + .type = TCA_ACT_GACT, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_gact, + .dump = tcf_gact_dump, + .cleanup = tcf_gact_cleanup, + .lookup = tcf_hash_search, + .init = tcf_gact_init, + .walk = tcf_generic_walker +}; + +MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); +MODULE_DESCRIPTION("Generic Classifier actions"); +MODULE_LICENSE("GPL"); + +static int __init gact_init_module(void) +{ +#ifdef CONFIG_GACT_PROB + pr_info("GACT probability on\n"); +#else + pr_info("GACT probability NOT on\n"); +#endif + return tcf_register_action(&act_gact_ops); +} + +static void __exit gact_cleanup_module(void) +{ + tcf_unregister_action(&act_gact_ops); +} + +module_init(gact_init_module); +module_exit(gact_cleanup_module); diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c new file mode 100644 index 00000000..9fc211a1 --- /dev/null +++ b/net/sched/act_ipt.c @@ -0,0 +1,317 @@ +/* + * net/sched/ipt.c iptables target interface + * + *TODO: Add other tables. For now we only support the ipv4 table targets + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Copyright: Jamal Hadi Salim (2002-4) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +#define IPT_TAB_MASK 15 +static struct tcf_common *tcf_ipt_ht[IPT_TAB_MASK + 1]; +static u32 ipt_idx_gen; +static DEFINE_RWLOCK(ipt_lock); + +static struct tcf_hashinfo ipt_hash_info = { + .htab = tcf_ipt_ht, + .hmask = IPT_TAB_MASK, + .lock = &ipt_lock, +}; + +static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook) +{ + struct xt_tgchk_param par; + struct xt_target *target; + int ret = 0; + + target = xt_request_find_target(AF_INET, t->u.user.name, + t->u.user.revision); + if (IS_ERR(target)) + return PTR_ERR(target); + + t->u.kernel.target = target; + par.table = table; + par.entryinfo = NULL; + par.target = target; + par.targinfo = t->data; + par.hook_mask = hook; + par.family = NFPROTO_IPV4; + + ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); + if (ret < 0) { + module_put(t->u.kernel.target->me); + return ret; + } + return 0; +} + +static void ipt_destroy_target(struct xt_entry_target *t) +{ + struct xt_tgdtor_param par = { + .target = t->u.kernel.target, + .targinfo = t->data, + }; + if (par.target->destroy != NULL) + par.target->destroy(&par); + module_put(par.target->me); +} + +static int tcf_ipt_release(struct tcf_ipt *ipt, int bind) +{ + int ret = 0; + if (ipt) { + if (bind) + ipt->tcf_bindcnt--; + ipt->tcf_refcnt--; + if (ipt->tcf_bindcnt <= 0 && ipt->tcf_refcnt <= 0) { + ipt_destroy_target(ipt->tcfi_t); + kfree(ipt->tcfi_tname); + kfree(ipt->tcfi_t); + tcf_hash_destroy(&ipt->common, &ipt_hash_info); + ret = ACT_P_DELETED; + } + } + return ret; +} + +static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { + [TCA_IPT_TABLE] = { .type = NLA_STRING, .len = IFNAMSIZ }, + [TCA_IPT_HOOK] = { .type = NLA_U32 }, + [TCA_IPT_INDEX] = { .type = NLA_U32 }, + [TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) }, +}; + +static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est, + struct tc_action *a, int ovr, int bind) +{ + struct nlattr *tb[TCA_IPT_MAX + 1]; + struct tcf_ipt *ipt; + struct tcf_common *pc; + struct xt_entry_target *td, *t; + char *tname; + int ret = 0, err; + u32 hook = 0; + u32 index = 0; + + if (nla == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_IPT_MAX, nla, ipt_policy); + if (err < 0) + return err; + + if (tb[TCA_IPT_HOOK] == NULL) + return -EINVAL; + if (tb[TCA_IPT_TARG] == NULL) + return -EINVAL; + + td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]); + if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) + return -EINVAL; + + if (tb[TCA_IPT_INDEX] != NULL) + index = nla_get_u32(tb[TCA_IPT_INDEX]); + + pc = tcf_hash_check(index, a, bind, &ipt_hash_info); + if (!pc) { + pc = tcf_hash_create(index, est, a, sizeof(*ipt), bind, + &ipt_idx_gen, &ipt_hash_info); + if (IS_ERR(pc)) + return PTR_ERR(pc); + ret = ACT_P_CREATED; + } else { + if (!ovr) { + tcf_ipt_release(to_ipt(pc), bind); + return -EEXIST; + } + } + ipt = to_ipt(pc); + + hook = nla_get_u32(tb[TCA_IPT_HOOK]); + + err = -ENOMEM; + tname = kmalloc(IFNAMSIZ, GFP_KERNEL); + if (unlikely(!tname)) + goto err1; + if (tb[TCA_IPT_TABLE] == NULL || + nla_strlcpy(tname, tb[TCA_IPT_TABLE], IFNAMSIZ) >= IFNAMSIZ) + strcpy(tname, "mangle"); + + t = kmemdup(td, td->u.target_size, GFP_KERNEL); + if (unlikely(!t)) + goto err2; + + err = ipt_init_target(t, tname, hook); + if (err < 0) + goto err3; + + spin_lock_bh(&ipt->tcf_lock); + if (ret != 
ACT_P_CREATED) { + ipt_destroy_target(ipt->tcfi_t); + kfree(ipt->tcfi_tname); + kfree(ipt->tcfi_t); + } + ipt->tcfi_tname = tname; + ipt->tcfi_t = t; + ipt->tcfi_hook = hook; + spin_unlock_bh(&ipt->tcf_lock); + if (ret == ACT_P_CREATED) + tcf_hash_insert(pc, &ipt_hash_info); + return ret; + +err3: + kfree(t); +err2: + kfree(tname); +err1: + kfree(pc); + return err; +} + +static int tcf_ipt_cleanup(struct tc_action *a, int bind) +{ + struct tcf_ipt *ipt = a->priv; + return tcf_ipt_release(ipt, bind); +} + +static int tcf_ipt(struct sk_buff *skb, struct tc_action *a, + struct tcf_result *res) +{ + int ret = 0, result = 0; + struct tcf_ipt *ipt = a->priv; + struct xt_action_param par; + + if (skb_cloned(skb)) { + if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + return TC_ACT_UNSPEC; + } + + spin_lock(&ipt->tcf_lock); + + ipt->tcf_tm.lastuse = jiffies; + bstats_update(&ipt->tcf_bstats, skb); + + /* yes, we have to worry about both in and out dev + * worry later - danger - this API seems to have changed + * from earlier kernels + */ + par.in = skb->dev; + par.out = NULL; + par.hooknum = ipt->tcfi_hook; + par.target = ipt->tcfi_t->u.kernel.target; + par.targinfo = ipt->tcfi_t->data; + ret = par.target->target(skb, &par); + + switch (ret) { + case NF_ACCEPT: + result = TC_ACT_OK; + break; + case NF_DROP: + result = TC_ACT_SHOT; + ipt->tcf_qstats.drops++; + break; + case XT_CONTINUE: + result = TC_ACT_PIPE; + break; + default: + if (net_ratelimit()) + pr_notice("tc filter: Bogus netfilter code" + " %d assume ACCEPT\n", ret); + result = TC_POLICE_OK; + break; + } + spin_unlock(&ipt->tcf_lock); + return result; + +} + +static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_ipt *ipt = a->priv; + struct xt_entry_target *t; + struct tcf_t tm; + struct tc_cnt c; + + /* for simple targets kernel size == user size + * user name = target name + * for foolproof you need to not assume this + */ + + t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC); + if (unlikely(!t)) + goto nla_put_failure; + + c.bindcnt = ipt->tcf_bindcnt - bind; + c.refcnt = ipt->tcf_refcnt - ref; + strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name); + + NLA_PUT(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t); + NLA_PUT_U32(skb, TCA_IPT_INDEX, ipt->tcf_index); + NLA_PUT_U32(skb, TCA_IPT_HOOK, ipt->tcfi_hook); + NLA_PUT(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c); + NLA_PUT_STRING(skb, TCA_IPT_TABLE, ipt->tcfi_tname); + tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install); + tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse); + tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires); + NLA_PUT(skb, TCA_IPT_TM, sizeof (tm), &tm); + kfree(t); + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + kfree(t); + return -1; +} + +static struct tc_action_ops act_ipt_ops = { + .kind = "ipt", + .hinfo = &ipt_hash_info, + .type = TCA_ACT_IPT, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_ipt, + .dump = tcf_ipt_dump, + .cleanup = tcf_ipt_cleanup, + .lookup = tcf_hash_search, + .init = tcf_ipt_init, + .walk = tcf_generic_walker +}; + +MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); +MODULE_DESCRIPTION("Iptables target actions"); +MODULE_LICENSE("GPL"); + +static int __init ipt_init_module(void) +{ + return tcf_register_action(&act_ipt_ops); +} + +static void __exit ipt_cleanup_module(void) +{ + tcf_unregister_action(&act_ipt_ops); +} + +module_init(ipt_init_module); 
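+/* Note on the act_ipt flow above: ipt_init_target() resolves the requested
+ * xtables target for NFPROTO_IPV4 and validates it with xt_check_target(),
+ * while tcf_ipt() runs that target per packet under tcf_lock and maps the
+ * netfilter verdict (NF_ACCEPT, NF_DROP, XT_CONTINUE) onto tc action codes.
+ */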
+module_exit(ipt_cleanup_module); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c new file mode 100644 index 00000000..961386e2 --- /dev/null +++ b/net/sched/act_mirred.c @@ -0,0 +1,300 @@ +/* + * net/sched/mirred.c packet mirroring and redirect actions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Jamal Hadi Salim (2002-4) + * + * TODO: Add ingress support (and socket redirect support) + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define MIRRED_TAB_MASK 7 +static struct tcf_common *tcf_mirred_ht[MIRRED_TAB_MASK + 1]; +static u32 mirred_idx_gen; +static DEFINE_RWLOCK(mirred_lock); +static LIST_HEAD(mirred_list); + +static struct tcf_hashinfo mirred_hash_info = { + .htab = tcf_mirred_ht, + .hmask = MIRRED_TAB_MASK, + .lock = &mirred_lock, +}; + +static int tcf_mirred_release(struct tcf_mirred *m, int bind) +{ + if (m) { + if (bind) + m->tcf_bindcnt--; + m->tcf_refcnt--; + if (!m->tcf_bindcnt && m->tcf_refcnt <= 0) { + list_del(&m->tcfm_list); + if (m->tcfm_dev) + dev_put(m->tcfm_dev); + tcf_hash_destroy(&m->common, &mirred_hash_info); + return 1; + } + } + return 0; +} + +static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { + [TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) }, +}; + +static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est, + struct tc_action *a, int ovr, int bind) +{ + struct nlattr *tb[TCA_MIRRED_MAX + 1]; + struct tc_mirred *parm; + struct tcf_mirred *m; + struct tcf_common *pc; + struct net_device *dev; + int ret, ok_push = 0; + + if (nla == NULL) + return -EINVAL; + ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy); + if (ret < 0) + return ret; + if (tb[TCA_MIRRED_PARMS] == NULL) + return -EINVAL; + parm = nla_data(tb[TCA_MIRRED_PARMS]); + switch (parm->eaction) { + case TCA_EGRESS_MIRROR: + case TCA_EGRESS_REDIR: + break; + default: + return -EINVAL; + } + if (parm->ifindex) { + dev = __dev_get_by_index(&init_net, parm->ifindex); + if (dev == NULL) + return -ENODEV; + switch (dev->type) { + case ARPHRD_TUNNEL: + case ARPHRD_TUNNEL6: + case ARPHRD_SIT: + case ARPHRD_IPGRE: + case ARPHRD_VOID: + case ARPHRD_NONE: + ok_push = 0; + break; + default: + ok_push = 1; + break; + } + } else { + dev = NULL; + } + + pc = tcf_hash_check(parm->index, a, bind, &mirred_hash_info); + if (!pc) { + if (dev == NULL) + return -EINVAL; + pc = tcf_hash_create(parm->index, est, a, sizeof(*m), bind, + &mirred_idx_gen, &mirred_hash_info); + if (IS_ERR(pc)) + return PTR_ERR(pc); + ret = ACT_P_CREATED; + } else { + if (!ovr) { + tcf_mirred_release(to_mirred(pc), bind); + return -EEXIST; + } + } + m = to_mirred(pc); + + spin_lock_bh(&m->tcf_lock); + m->tcf_action = parm->action; + m->tcfm_eaction = parm->eaction; + if (dev != NULL) { + m->tcfm_ifindex = parm->ifindex; + if (ret != ACT_P_CREATED) + dev_put(m->tcfm_dev); + dev_hold(dev); + m->tcfm_dev = dev; + m->tcfm_ok_push = ok_push; + } + spin_unlock_bh(&m->tcf_lock); + if (ret == ACT_P_CREATED) { + list_add(&m->tcfm_list, &mirred_list); + tcf_hash_insert(pc, &mirred_hash_info); + } + + return ret; +} + +static int tcf_mirred_cleanup(struct tc_action *a, int bind) +{ + struct tcf_mirred *m = a->priv; + + if (m) + return tcf_mirred_release(m, 
bind); + return 0; +} + +static int tcf_mirred(struct sk_buff *skb, struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_mirred *m = a->priv; + struct net_device *dev; + struct sk_buff *skb2; + u32 at; + int retval, err = 1; + + spin_lock(&m->tcf_lock); + m->tcf_tm.lastuse = jiffies; + bstats_update(&m->tcf_bstats, skb); + + dev = m->tcfm_dev; + if (!dev) { + printk_once(KERN_NOTICE "tc mirred: target device is gone\n"); + goto out; + } + + if (!(dev->flags & IFF_UP)) { + if (net_ratelimit()) + pr_notice("tc mirred to Houston: device %s is down\n", + dev->name); + goto out; + } + + at = G_TC_AT(skb->tc_verd); + skb2 = skb_act_clone(skb, GFP_ATOMIC, m->tcf_action); + if (skb2 == NULL) + goto out; + + if (!(at & AT_EGRESS)) { + if (m->tcfm_ok_push) + skb_push(skb2, skb2->dev->hard_header_len); + } + + /* mirror is always swallowed */ + if (m->tcfm_eaction != TCA_EGRESS_MIRROR) + skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at); + + skb2->skb_iif = skb->dev->ifindex; + skb2->dev = dev; + dev_queue_xmit(skb2); + err = 0; + +out: + if (err) { + m->tcf_qstats.overlimits++; + /* should we be asking for packet to be dropped? + * may make sense for redirect case only + */ + retval = TC_ACT_SHOT; + } else { + retval = m->tcf_action; + } + spin_unlock(&m->tcf_lock); + + return retval; +} + +static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_mirred *m = a->priv; + struct tc_mirred opt = { + .index = m->tcf_index, + .action = m->tcf_action, + .refcnt = m->tcf_refcnt - ref, + .bindcnt = m->tcf_bindcnt - bind, + .eaction = m->tcfm_eaction, + .ifindex = m->tcfm_ifindex, + }; + struct tcf_t t; + + NLA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt); + t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(m->tcf_tm.expires); + NLA_PUT(skb, TCA_MIRRED_TM, sizeof(t), &t); + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static int mirred_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct tcf_mirred *m; + + if (event == NETDEV_UNREGISTER) + list_for_each_entry(m, &mirred_list, tcfm_list) { + if (m->tcfm_dev == dev) { + dev_put(dev); + m->tcfm_dev = NULL; + } + } + + return NOTIFY_DONE; +} + +static struct notifier_block mirred_device_notifier = { + .notifier_call = mirred_device_event, +}; + + +static struct tc_action_ops act_mirred_ops = { + .kind = "mirred", + .hinfo = &mirred_hash_info, + .type = TCA_ACT_MIRRED, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_mirred, + .dump = tcf_mirred_dump, + .cleanup = tcf_mirred_cleanup, + .lookup = tcf_hash_search, + .init = tcf_mirred_init, + .walk = tcf_generic_walker +}; + +MODULE_AUTHOR("Jamal Hadi Salim(2002)"); +MODULE_DESCRIPTION("Device Mirror/redirect actions"); +MODULE_LICENSE("GPL"); + +static int __init mirred_init_module(void) +{ + int err = register_netdevice_notifier(&mirred_device_notifier); + if (err) + return err; + + pr_info("Mirror/redirect action on\n"); + return tcf_register_action(&act_mirred_ops); +} + +static void __exit mirred_cleanup_module(void) +{ + unregister_netdevice_notifier(&mirred_device_notifier); + tcf_unregister_action(&act_mirred_ops); +} + +module_init(mirred_init_module); +module_exit(mirred_cleanup_module); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c new file mode 100644 index 00000000..762b0276 --- 
/dev/null +++ b/net/sched/act_nat.c @@ -0,0 +1,328 @@ +/* + * Stateless NAT actions + * + * Copyright (c) 2007 Herbert Xu + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define NAT_TAB_MASK 15 +static struct tcf_common *tcf_nat_ht[NAT_TAB_MASK + 1]; +static u32 nat_idx_gen; +static DEFINE_RWLOCK(nat_lock); + +static struct tcf_hashinfo nat_hash_info = { + .htab = tcf_nat_ht, + .hmask = NAT_TAB_MASK, + .lock = &nat_lock, +}; + +static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { + [TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) }, +}; + +static int tcf_nat_init(struct nlattr *nla, struct nlattr *est, + struct tc_action *a, int ovr, int bind) +{ + struct nlattr *tb[TCA_NAT_MAX + 1]; + struct tc_nat *parm; + int ret = 0, err; + struct tcf_nat *p; + struct tcf_common *pc; + + if (nla == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_NAT_MAX, nla, nat_policy); + if (err < 0) + return err; + + if (tb[TCA_NAT_PARMS] == NULL) + return -EINVAL; + parm = nla_data(tb[TCA_NAT_PARMS]); + + pc = tcf_hash_check(parm->index, a, bind, &nat_hash_info); + if (!pc) { + pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, + &nat_idx_gen, &nat_hash_info); + if (IS_ERR(pc)) + return PTR_ERR(pc); + p = to_tcf_nat(pc); + ret = ACT_P_CREATED; + } else { + p = to_tcf_nat(pc); + if (!ovr) { + tcf_hash_release(pc, bind, &nat_hash_info); + return -EEXIST; + } + } + + spin_lock_bh(&p->tcf_lock); + p->old_addr = parm->old_addr; + p->new_addr = parm->new_addr; + p->mask = parm->mask; + p->flags = parm->flags; + + p->tcf_action = parm->action; + spin_unlock_bh(&p->tcf_lock); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(pc, &nat_hash_info); + + return ret; +} + +static int tcf_nat_cleanup(struct tc_action *a, int bind) +{ + struct tcf_nat *p = a->priv; + + return tcf_hash_release(&p->common, bind, &nat_hash_info); +} + +static int tcf_nat(struct sk_buff *skb, struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_nat *p = a->priv; + struct iphdr *iph; + __be32 old_addr; + __be32 new_addr; + __be32 mask; + __be32 addr; + int egress; + int action; + int ihl; + int noff; + + spin_lock(&p->tcf_lock); + + p->tcf_tm.lastuse = jiffies; + old_addr = p->old_addr; + new_addr = p->new_addr; + mask = p->mask; + egress = p->flags & TCA_NAT_FLAG_EGRESS; + action = p->tcf_action; + + bstats_update(&p->tcf_bstats, skb); + + spin_unlock(&p->tcf_lock); + + if (unlikely(action == TC_ACT_SHOT)) + goto drop; + + noff = skb_network_offset(skb); + if (!pskb_may_pull(skb, sizeof(*iph) + noff)) + goto drop; + + iph = ip_hdr(skb); + + if (egress) + addr = iph->saddr; + else + addr = iph->daddr; + + if (!((old_addr ^ addr) & mask)) { + if (skb_cloned(skb) && + !skb_clone_writable(skb, sizeof(*iph) + noff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto drop; + + new_addr &= mask; + new_addr |= addr & ~mask; + + /* Rewrite IP header */ + iph = ip_hdr(skb); + if (egress) + iph->saddr = new_addr; + else + iph->daddr = new_addr; + + csum_replace4(&iph->check, addr, new_addr); + } else if ((iph->frag_off & htons(IP_OFFSET)) || + iph->protocol != IPPROTO_ICMP) { + goto out; + } + + ihl = iph->ihl * 4; + + /* It 
would be nice to share code with stateful NAT. */ + switch (iph->frag_off & htons(IP_OFFSET) ? 0 : iph->protocol) { + case IPPROTO_TCP: + { + struct tcphdr *tcph; + + if (!pskb_may_pull(skb, ihl + sizeof(*tcph) + noff) || + (skb_cloned(skb) && + !skb_clone_writable(skb, ihl + sizeof(*tcph) + noff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + goto drop; + + tcph = (void *)(skb_network_header(skb) + ihl); + inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, 1); + break; + } + case IPPROTO_UDP: + { + struct udphdr *udph; + + if (!pskb_may_pull(skb, ihl + sizeof(*udph) + noff) || + (skb_cloned(skb) && + !skb_clone_writable(skb, ihl + sizeof(*udph) + noff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + goto drop; + + udph = (void *)(skb_network_header(skb) + ihl); + if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace4(&udph->check, skb, addr, + new_addr, 1); + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } + break; + } + case IPPROTO_ICMP: + { + struct icmphdr *icmph; + + if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + noff)) + goto drop; + + icmph = (void *)(skb_network_header(skb) + ihl); + + if ((icmph->type != ICMP_DEST_UNREACH) && + (icmph->type != ICMP_TIME_EXCEEDED) && + (icmph->type != ICMP_PARAMETERPROB)) + break; + + if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + sizeof(*iph) + + noff)) + goto drop; + + icmph = (void *)(skb_network_header(skb) + ihl); + iph = (void *)(icmph + 1); + if (egress) + addr = iph->daddr; + else + addr = iph->saddr; + + if ((old_addr ^ addr) & mask) + break; + + if (skb_cloned(skb) && + !skb_clone_writable(skb, ihl + sizeof(*icmph) + + sizeof(*iph) + noff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto drop; + + icmph = (void *)(skb_network_header(skb) + ihl); + iph = (void *)(icmph + 1); + + new_addr &= mask; + new_addr |= addr & ~mask; + + /* XXX Fix up the inner checksums. 
*/ + if (egress) + iph->daddr = new_addr; + else + iph->saddr = new_addr; + + inet_proto_csum_replace4(&icmph->checksum, skb, addr, new_addr, + 0); + break; + } + default: + break; + } + +out: + return action; + +drop: + spin_lock(&p->tcf_lock); + p->tcf_qstats.drops++; + spin_unlock(&p->tcf_lock); + return TC_ACT_SHOT; +} + +static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_nat *p = a->priv; + struct tc_nat opt = { + .old_addr = p->old_addr, + .new_addr = p->new_addr, + .mask = p->mask, + .flags = p->flags, + + .index = p->tcf_index, + .action = p->tcf_action, + .refcnt = p->tcf_refcnt - ref, + .bindcnt = p->tcf_bindcnt - bind, + }; + struct tcf_t t; + + NLA_PUT(skb, TCA_NAT_PARMS, sizeof(opt), &opt); + t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(p->tcf_tm.expires); + NLA_PUT(skb, TCA_NAT_TM, sizeof(t), &t); + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tc_action_ops act_nat_ops = { + .kind = "nat", + .hinfo = &nat_hash_info, + .type = TCA_ACT_NAT, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_nat, + .dump = tcf_nat_dump, + .cleanup = tcf_nat_cleanup, + .lookup = tcf_hash_search, + .init = tcf_nat_init, + .walk = tcf_generic_walker +}; + +MODULE_DESCRIPTION("Stateless NAT actions"); +MODULE_LICENSE("GPL"); + +static int __init nat_init_module(void) +{ + return tcf_register_action(&act_nat_ops); +} + +static void __exit nat_cleanup_module(void) +{ + tcf_unregister_action(&act_nat_ops); +} + +module_init(nat_init_module); +module_exit(nat_cleanup_module); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c new file mode 100644 index 00000000..7affe9a9 --- /dev/null +++ b/net/sched/act_pedit.c @@ -0,0 +1,262 @@ +/* + * net/sched/pedit.c Generic packet editor + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Jamal Hadi Salim (2002-4) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PEDIT_TAB_MASK 15 +static struct tcf_common *tcf_pedit_ht[PEDIT_TAB_MASK + 1]; +static u32 pedit_idx_gen; +static DEFINE_RWLOCK(pedit_lock); + +static struct tcf_hashinfo pedit_hash_info = { + .htab = tcf_pedit_ht, + .hmask = PEDIT_TAB_MASK, + .lock = &pedit_lock, +}; + +static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { + [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, +}; + +static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est, + struct tc_action *a, int ovr, int bind) +{ + struct nlattr *tb[TCA_PEDIT_MAX + 1]; + struct tc_pedit *parm; + int ret = 0, err; + struct tcf_pedit *p; + struct tcf_common *pc; + struct tc_pedit_key *keys = NULL; + int ksize; + + if (nla == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy); + if (err < 0) + return err; + + if (tb[TCA_PEDIT_PARMS] == NULL) + return -EINVAL; + parm = nla_data(tb[TCA_PEDIT_PARMS]); + ksize = parm->nkeys * sizeof(struct tc_pedit_key); + if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize) + return -EINVAL; + + pc = tcf_hash_check(parm->index, a, bind, &pedit_hash_info); + if (!pc) { + if (!parm->nkeys) + return -EINVAL; + pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, + &pedit_idx_gen, &pedit_hash_info); + if (IS_ERR(pc)) + return PTR_ERR(pc); + p = to_pedit(pc); + keys = kmalloc(ksize, GFP_KERNEL); + if (keys == NULL) { + kfree(pc); + return -ENOMEM; + } + ret = ACT_P_CREATED; + } else { + p = to_pedit(pc); + if (!ovr) { + tcf_hash_release(pc, bind, &pedit_hash_info); + return -EEXIST; + } + if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) { + keys = kmalloc(ksize, GFP_KERNEL); + if (keys == NULL) + return -ENOMEM; + } + } + + spin_lock_bh(&p->tcf_lock); + p->tcfp_flags = parm->flags; + p->tcf_action = parm->action; + if (keys) { + kfree(p->tcfp_keys); + p->tcfp_keys = keys; + p->tcfp_nkeys = parm->nkeys; + } + memcpy(p->tcfp_keys, parm->keys, ksize); + spin_unlock_bh(&p->tcf_lock); + if (ret == ACT_P_CREATED) + tcf_hash_insert(pc, &pedit_hash_info); + return ret; +} + +static int tcf_pedit_cleanup(struct tc_action *a, int bind) +{ + struct tcf_pedit *p = a->priv; + + if (p) { + struct tc_pedit_key *keys = p->tcfp_keys; + if (tcf_hash_release(&p->common, bind, &pedit_hash_info)) { + kfree(keys); + return 1; + } + } + return 0; +} + +static int tcf_pedit(struct sk_buff *skb, struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_pedit *p = a->priv; + int i, munged = 0; + unsigned int off; + + if (skb_cloned(skb) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + return p->tcf_action; + + off = skb_network_offset(skb); + + spin_lock(&p->tcf_lock); + + p->tcf_tm.lastuse = jiffies; + + if (p->tcfp_nkeys > 0) { + struct tc_pedit_key *tkey = p->tcfp_keys; + + for (i = p->tcfp_nkeys; i > 0; i--, tkey++) { + u32 *ptr, _data; + int offset = tkey->off; + + if (tkey->offmask) { + char *d, _d; + + d = skb_header_pointer(skb, off + tkey->at, 1, + &_d); + if (!d) + goto bad; + offset += (*d & tkey->offmask) >> tkey->shift; + } + + if (offset % 4) { + pr_info("tc filter pedit" + " offset must be on 32 bit boundaries\n"); + goto bad; + } + if (offset > 0 && offset > skb->len) { + pr_info("tc filter pedit" + " offset %d can't exceed pkt length %d\n", + offset, skb->len); + goto bad; + } + + ptr = skb_header_pointer(skb, off + offset, 4, &_data); + if (!ptr) + goto 
bad; + /* just do it, baby */ + *ptr = ((*ptr & tkey->mask) ^ tkey->val); + if (ptr == &_data) + skb_store_bits(skb, off + offset, ptr, 4); + munged++; + } + + if (munged) + skb->tc_verd = SET_TC_MUNGED(skb->tc_verd); + goto done; + } else + WARN(1, "pedit BUG: index %d\n", p->tcf_index); + +bad: + p->tcf_qstats.overlimits++; +done: + bstats_update(&p->tcf_bstats, skb); + spin_unlock(&p->tcf_lock); + return p->tcf_action; +} + +static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_pedit *p = a->priv; + struct tc_pedit *opt; + struct tcf_t t; + int s; + + s = sizeof(*opt) + p->tcfp_nkeys * sizeof(struct tc_pedit_key); + + /* netlink spinlocks held above us - must use ATOMIC */ + opt = kzalloc(s, GFP_ATOMIC); + if (unlikely(!opt)) + return -ENOBUFS; + + memcpy(opt->keys, p->tcfp_keys, + p->tcfp_nkeys * sizeof(struct tc_pedit_key)); + opt->index = p->tcf_index; + opt->nkeys = p->tcfp_nkeys; + opt->flags = p->tcfp_flags; + opt->action = p->tcf_action; + opt->refcnt = p->tcf_refcnt - ref; + opt->bindcnt = p->tcf_bindcnt - bind; + + NLA_PUT(skb, TCA_PEDIT_PARMS, s, opt); + t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(p->tcf_tm.expires); + NLA_PUT(skb, TCA_PEDIT_TM, sizeof(t), &t); + kfree(opt); + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + kfree(opt); + return -1; +} + +static struct tc_action_ops act_pedit_ops = { + .kind = "pedit", + .hinfo = &pedit_hash_info, + .type = TCA_ACT_PEDIT, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_pedit, + .dump = tcf_pedit_dump, + .cleanup = tcf_pedit_cleanup, + .lookup = tcf_hash_search, + .init = tcf_pedit_init, + .walk = tcf_generic_walker +}; + +MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); +MODULE_DESCRIPTION("Generic Packet Editor actions"); +MODULE_LICENSE("GPL"); + +static int __init pedit_init_module(void) +{ + return tcf_register_action(&act_pedit_ops); +} + +static void __exit pedit_cleanup_module(void) +{ + tcf_unregister_action(&act_pedit_ops); +} + +module_init(pedit_init_module); +module_exit(pedit_cleanup_module); + diff --git a/net/sched/act_police.c b/net/sched/act_police.c new file mode 100644 index 00000000..b3b9b32f --- /dev/null +++ b/net/sched/act_police.c @@ -0,0 +1,402 @@ +/* + * net/sched/police.c Input police filter. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, + * J Hadi Salim (action changes) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define L2T(p, L) qdisc_l2t((p)->tcfp_R_tab, L) +#define L2T_P(p, L) qdisc_l2t((p)->tcfp_P_tab, L) + +#define POL_TAB_MASK 15 +static struct tcf_common *tcf_police_ht[POL_TAB_MASK + 1]; +static u32 police_idx_gen; +static DEFINE_RWLOCK(police_lock); + +static struct tcf_hashinfo police_hash_info = { + .htab = tcf_police_ht, + .hmask = POL_TAB_MASK, + .lock = &police_lock, +}; + +/* old policer structure from before tc actions */ +struct tc_police_compat { + u32 index; + int action; + u32 limit; + u32 burst; + u32 mtu; + struct tc_ratespec rate; + struct tc_ratespec peakrate; +}; + +/* Each policer is serialized by its individual spinlock */ + +static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *cb, + int type, struct tc_action *a) +{ + struct tcf_common *p; + int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; + struct nlattr *nest; + + read_lock_bh(&police_lock); + + s_i = cb->args[0]; + + for (i = 0; i < (POL_TAB_MASK + 1); i++) { + p = tcf_police_ht[tcf_hash(i, POL_TAB_MASK)]; + + for (; p; p = p->tcfc_next) { + index++; + if (index < s_i) + continue; + a->priv = p; + a->order = index; + nest = nla_nest_start(skb, a->order); + if (nest == NULL) + goto nla_put_failure; + if (type == RTM_DELACTION) + err = tcf_action_dump_1(skb, a, 0, 1); + else + err = tcf_action_dump_1(skb, a, 0, 0); + if (err < 0) { + index--; + nla_nest_cancel(skb, nest); + goto done; + } + nla_nest_end(skb, nest); + n_i++; + } + } +done: + read_unlock_bh(&police_lock); + if (n_i) + cb->args[0] += n_i; + return n_i; + +nla_put_failure: + nla_nest_cancel(skb, nest); + goto done; +} + +static void tcf_police_destroy(struct tcf_police *p) +{ + unsigned int h = tcf_hash(p->tcf_index, POL_TAB_MASK); + struct tcf_common **p1p; + + for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->tcfc_next) { + if (*p1p == &p->common) { + write_lock_bh(&police_lock); + *p1p = p->tcf_next; + write_unlock_bh(&police_lock); + gen_kill_estimator(&p->tcf_bstats, + &p->tcf_rate_est); + if (p->tcfp_R_tab) + qdisc_put_rtab(p->tcfp_R_tab); + if (p->tcfp_P_tab) + qdisc_put_rtab(p->tcfp_P_tab); + /* + * gen_estimator est_timer() might access p->tcf_lock + * or bstats, wait a RCU grace period before freeing p + */ + kfree_rcu(p, tcf_rcu); + return; + } + } + WARN_ON(1); +} + +static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { + [TCA_POLICE_RATE] = { .len = TC_RTAB_SIZE }, + [TCA_POLICE_PEAKRATE] = { .len = TC_RTAB_SIZE }, + [TCA_POLICE_AVRATE] = { .type = NLA_U32 }, + [TCA_POLICE_RESULT] = { .type = NLA_U32 }, +}; + +static int tcf_act_police_locate(struct nlattr *nla, struct nlattr *est, + struct tc_action *a, int ovr, int bind) +{ + unsigned int h; + int ret = 0, err; + struct nlattr *tb[TCA_POLICE_MAX + 1]; + struct tc_police *parm; + struct tcf_police *police; + struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; + int size; + + if (nla == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_POLICE_MAX, nla, police_policy); + if (err < 0) + return err; + + if (tb[TCA_POLICE_TBF] == NULL) + return -EINVAL; + size = nla_len(tb[TCA_POLICE_TBF]); + if (size != sizeof(*parm) && size != sizeof(struct tc_police_compat)) + return -EINVAL; + parm = nla_data(tb[TCA_POLICE_TBF]); + + if (parm->index) { + struct tcf_common *pc; + + pc = tcf_hash_lookup(parm->index, &police_hash_info); + if (pc != NULL) { + a->priv = pc; + 
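+ /* An existing policer was found by index: take bind/ref counts as
+ * requested and only fall through to "override" below when replace
+ * (ovr) was asked for; otherwise hand back the existing entry.
+ */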
police = to_police(pc); + if (bind) { + police->tcf_bindcnt += 1; + police->tcf_refcnt += 1; + } + if (ovr) + goto override; + return ret; + } + } + + police = kzalloc(sizeof(*police), GFP_KERNEL); + if (police == NULL) + return -ENOMEM; + ret = ACT_P_CREATED; + police->tcf_refcnt = 1; + spin_lock_init(&police->tcf_lock); + if (bind) + police->tcf_bindcnt = 1; +override: + if (parm->rate.rate) { + err = -ENOMEM; + R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE]); + if (R_tab == NULL) + goto failure; + + if (parm->peakrate.rate) { + P_tab = qdisc_get_rtab(&parm->peakrate, + tb[TCA_POLICE_PEAKRATE]); + if (P_tab == NULL) + goto failure; + } + } + + spin_lock_bh(&police->tcf_lock); + if (est) { + err = gen_replace_estimator(&police->tcf_bstats, + &police->tcf_rate_est, + &police->tcf_lock, est); + if (err) + goto failure_unlock; + } else if (tb[TCA_POLICE_AVRATE] && + (ret == ACT_P_CREATED || + !gen_estimator_active(&police->tcf_bstats, + &police->tcf_rate_est))) { + err = -EINVAL; + goto failure_unlock; + } + + /* No failure allowed after this point */ + if (R_tab != NULL) { + qdisc_put_rtab(police->tcfp_R_tab); + police->tcfp_R_tab = R_tab; + } + if (P_tab != NULL) { + qdisc_put_rtab(police->tcfp_P_tab); + police->tcfp_P_tab = P_tab; + } + + if (tb[TCA_POLICE_RESULT]) + police->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]); + police->tcfp_toks = police->tcfp_burst = parm->burst; + police->tcfp_mtu = parm->mtu; + if (police->tcfp_mtu == 0) { + police->tcfp_mtu = ~0; + if (police->tcfp_R_tab) + police->tcfp_mtu = 255<tcfp_R_tab->rate.cell_log; + } + if (police->tcfp_P_tab) + police->tcfp_ptoks = L2T_P(police, police->tcfp_mtu); + police->tcf_action = parm->action; + + if (tb[TCA_POLICE_AVRATE]) + police->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]); + + spin_unlock_bh(&police->tcf_lock); + if (ret != ACT_P_CREATED) + return ret; + + police->tcfp_t_c = psched_get_time(); + police->tcf_index = parm->index ? 
parm->index : + tcf_hash_new_index(&police_idx_gen, &police_hash_info); + h = tcf_hash(police->tcf_index, POL_TAB_MASK); + write_lock_bh(&police_lock); + police->tcf_next = tcf_police_ht[h]; + tcf_police_ht[h] = &police->common; + write_unlock_bh(&police_lock); + + a->priv = police; + return ret; + +failure_unlock: + spin_unlock_bh(&police->tcf_lock); +failure: + if (P_tab) + qdisc_put_rtab(P_tab); + if (R_tab) + qdisc_put_rtab(R_tab); + if (ret == ACT_P_CREATED) + kfree(police); + return err; +} + +static int tcf_act_police_cleanup(struct tc_action *a, int bind) +{ + struct tcf_police *p = a->priv; + int ret = 0; + + if (p != NULL) { + if (bind) + p->tcf_bindcnt--; + + p->tcf_refcnt--; + if (p->tcf_refcnt <= 0 && !p->tcf_bindcnt) { + tcf_police_destroy(p); + ret = 1; + } + } + return ret; +} + +static int tcf_act_police(struct sk_buff *skb, struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_police *police = a->priv; + psched_time_t now; + long toks; + long ptoks = 0; + + spin_lock(&police->tcf_lock); + + bstats_update(&police->tcf_bstats, skb); + + if (police->tcfp_ewma_rate && + police->tcf_rate_est.bps >= police->tcfp_ewma_rate) { + police->tcf_qstats.overlimits++; + if (police->tcf_action == TC_ACT_SHOT) + police->tcf_qstats.drops++; + spin_unlock(&police->tcf_lock); + return police->tcf_action; + } + + if (qdisc_pkt_len(skb) <= police->tcfp_mtu) { + if (police->tcfp_R_tab == NULL) { + spin_unlock(&police->tcf_lock); + return police->tcfp_result; + } + + now = psched_get_time(); + toks = psched_tdiff_bounded(now, police->tcfp_t_c, + police->tcfp_burst); + if (police->tcfp_P_tab) { + ptoks = toks + police->tcfp_ptoks; + if (ptoks > (long)L2T_P(police, police->tcfp_mtu)) + ptoks = (long)L2T_P(police, police->tcfp_mtu); + ptoks -= L2T_P(police, qdisc_pkt_len(skb)); + } + toks += police->tcfp_toks; + if (toks > (long)police->tcfp_burst) + toks = police->tcfp_burst; + toks -= L2T(police, qdisc_pkt_len(skb)); + if ((toks|ptoks) >= 0) { + police->tcfp_t_c = now; + police->tcfp_toks = toks; + police->tcfp_ptoks = ptoks; + spin_unlock(&police->tcf_lock); + return police->tcfp_result; + } + } + + police->tcf_qstats.overlimits++; + if (police->tcf_action == TC_ACT_SHOT) + police->tcf_qstats.drops++; + spin_unlock(&police->tcf_lock); + return police->tcf_action; +} + +static int +tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_police *police = a->priv; + struct tc_police opt = { + .index = police->tcf_index, + .action = police->tcf_action, + .mtu = police->tcfp_mtu, + .burst = police->tcfp_burst, + .refcnt = police->tcf_refcnt - ref, + .bindcnt = police->tcf_bindcnt - bind, + }; + + if (police->tcfp_R_tab) + opt.rate = police->tcfp_R_tab->rate; + if (police->tcfp_P_tab) + opt.peakrate = police->tcfp_P_tab->rate; + NLA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt); + if (police->tcfp_result) + NLA_PUT_U32(skb, TCA_POLICE_RESULT, police->tcfp_result); + if (police->tcfp_ewma_rate) + NLA_PUT_U32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate); + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +MODULE_AUTHOR("Alexey Kuznetsov"); +MODULE_DESCRIPTION("Policing actions"); +MODULE_LICENSE("GPL"); + +static struct tc_action_ops act_police_ops = { + .kind = "police", + .hinfo = &police_hash_info, + .type = TCA_ID_POLICE, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_act_police, + .dump = tcf_act_police_dump, + .cleanup = tcf_act_police_cleanup, + .lookup = 
tcf_hash_search, + .init = tcf_act_police_locate, + .walk = tcf_act_police_walker +}; + +static int __init +police_init_module(void) +{ + return tcf_register_action(&act_police_ops); +} + +static void __exit +police_cleanup_module(void) +{ + tcf_unregister_action(&act_police_ops); +} + +module_init(police_init_module); +module_exit(police_cleanup_module); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c new file mode 100644 index 00000000..a34a22de --- /dev/null +++ b/net/sched/act_simple.c @@ -0,0 +1,218 @@ +/* + * net/sched/simp.c Simple example of an action + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Jamal Hadi Salim (2005-8) + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define TCA_ACT_SIMP 22 + +#include +#include + +#define SIMP_TAB_MASK 7 +static struct tcf_common *tcf_simp_ht[SIMP_TAB_MASK + 1]; +static u32 simp_idx_gen; +static DEFINE_RWLOCK(simp_lock); + +static struct tcf_hashinfo simp_hash_info = { + .htab = tcf_simp_ht, + .hmask = SIMP_TAB_MASK, + .lock = &simp_lock, +}; + +#define SIMP_MAX_DATA 32 +static int tcf_simp(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res) +{ + struct tcf_defact *d = a->priv; + + spin_lock(&d->tcf_lock); + d->tcf_tm.lastuse = jiffies; + bstats_update(&d->tcf_bstats, skb); + + /* print policy string followed by _ then packet count + * Example if this was the 3rd packet and the string was "hello" + * then it would look like "hello_3" (without quotes) + */ + pr_info("simple: %s_%d\n", + (char *)d->tcfd_defdata, d->tcf_bstats.packets); + spin_unlock(&d->tcf_lock); + return d->tcf_action; +} + +static int tcf_simp_release(struct tcf_defact *d, int bind) +{ + int ret = 0; + if (d) { + if (bind) + d->tcf_bindcnt--; + d->tcf_refcnt--; + if (d->tcf_bindcnt <= 0 && d->tcf_refcnt <= 0) { + kfree(d->tcfd_defdata); + tcf_hash_destroy(&d->common, &simp_hash_info); + ret = 1; + } + } + return ret; +} + +static int alloc_defdata(struct tcf_defact *d, char *defdata) +{ + d->tcfd_defdata = kzalloc(SIMP_MAX_DATA, GFP_KERNEL); + if (unlikely(!d->tcfd_defdata)) + return -ENOMEM; + strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); + return 0; +} + +static void reset_policy(struct tcf_defact *d, char *defdata, + struct tc_defact *p) +{ + spin_lock_bh(&d->tcf_lock); + d->tcf_action = p->action; + memset(d->tcfd_defdata, 0, SIMP_MAX_DATA); + strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); + spin_unlock_bh(&d->tcf_lock); +} + +static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { + [TCA_DEF_PARMS] = { .len = sizeof(struct tc_defact) }, + [TCA_DEF_DATA] = { .type = NLA_STRING, .len = SIMP_MAX_DATA }, +}; + +static int tcf_simp_init(struct nlattr *nla, struct nlattr *est, + struct tc_action *a, int ovr, int bind) +{ + struct nlattr *tb[TCA_DEF_MAX + 1]; + struct tc_defact *parm; + struct tcf_defact *d; + struct tcf_common *pc; + char *defdata; + int ret = 0, err; + + if (nla == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_DEF_MAX, nla, simple_policy); + if (err < 0) + return err; + + if (tb[TCA_DEF_PARMS] == NULL) + return -EINVAL; + + if (tb[TCA_DEF_DATA] == NULL) + return -EINVAL; + + parm = nla_data(tb[TCA_DEF_PARMS]); + defdata = nla_data(tb[TCA_DEF_DATA]); + + pc = tcf_hash_check(parm->index, a, bind, &simp_hash_info); + if (!pc) { + pc = 
tcf_hash_create(parm->index, est, a, sizeof(*d), bind, + &simp_idx_gen, &simp_hash_info); + if (IS_ERR(pc)) + return PTR_ERR(pc); + + d = to_defact(pc); + ret = alloc_defdata(d, defdata); + if (ret < 0) { + kfree(pc); + return ret; + } + d->tcf_action = parm->action; + ret = ACT_P_CREATED; + } else { + d = to_defact(pc); + if (!ovr) { + tcf_simp_release(d, bind); + return -EEXIST; + } + reset_policy(d, defdata, parm); + } + + if (ret == ACT_P_CREATED) + tcf_hash_insert(pc, &simp_hash_info); + return ret; +} + +static int tcf_simp_cleanup(struct tc_action *a, int bind) +{ + struct tcf_defact *d = a->priv; + + if (d) + return tcf_simp_release(d, bind); + return 0; +} + +static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_defact *d = a->priv; + struct tc_defact opt = { + .index = d->tcf_index, + .refcnt = d->tcf_refcnt - ref, + .bindcnt = d->tcf_bindcnt - bind, + .action = d->tcf_action, + }; + struct tcf_t t; + + NLA_PUT(skb, TCA_DEF_PARMS, sizeof(opt), &opt); + NLA_PUT_STRING(skb, TCA_DEF_DATA, d->tcfd_defdata); + t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(d->tcf_tm.expires); + NLA_PUT(skb, TCA_DEF_TM, sizeof(t), &t); + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tc_action_ops act_simp_ops = { + .kind = "simple", + .hinfo = &simp_hash_info, + .type = TCA_ACT_SIMP, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_simp, + .dump = tcf_simp_dump, + .cleanup = tcf_simp_cleanup, + .init = tcf_simp_init, + .walk = tcf_generic_walker, +}; + +MODULE_AUTHOR("Jamal Hadi Salim(2005)"); +MODULE_DESCRIPTION("Simple example action"); +MODULE_LICENSE("GPL"); + +static int __init simp_init_module(void) +{ + int ret = tcf_register_action(&act_simp_ops); + if (!ret) + pr_info("Simple TC action Loaded\n"); + return ret; +} + +static void __exit simp_cleanup_module(void) +{ + tcf_unregister_action(&act_simp_ops); +} + +module_init(simp_init_module); +module_exit(simp_cleanup_module); diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c new file mode 100644 index 00000000..5f6f0c7c --- /dev/null +++ b/net/sched/act_skbedit.c @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2008, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + * Author: Alexander Duyck + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define SKBEDIT_TAB_MASK 15 +static struct tcf_common *tcf_skbedit_ht[SKBEDIT_TAB_MASK + 1]; +static u32 skbedit_idx_gen; +static DEFINE_RWLOCK(skbedit_lock); + +static struct tcf_hashinfo skbedit_hash_info = { + .htab = tcf_skbedit_ht, + .hmask = SKBEDIT_TAB_MASK, + .lock = &skbedit_lock, +}; + +static int tcf_skbedit(struct sk_buff *skb, struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_skbedit *d = a->priv; + + spin_lock(&d->tcf_lock); + d->tcf_tm.lastuse = jiffies; + bstats_update(&d->tcf_bstats, skb); + + if (d->flags & SKBEDIT_F_PRIORITY) + skb->priority = d->priority; + if (d->flags & SKBEDIT_F_QUEUE_MAPPING && + skb->dev->real_num_tx_queues > d->queue_mapping) + skb_set_queue_mapping(skb, d->queue_mapping); + if (d->flags & SKBEDIT_F_MARK) + skb->mark = d->mark; + + spin_unlock(&d->tcf_lock); + return d->tcf_action; +} + +static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { + [TCA_SKBEDIT_PARMS] = { .len = sizeof(struct tc_skbedit) }, + [TCA_SKBEDIT_PRIORITY] = { .len = sizeof(u32) }, + [TCA_SKBEDIT_QUEUE_MAPPING] = { .len = sizeof(u16) }, + [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) }, +}; + +static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est, + struct tc_action *a, int ovr, int bind) +{ + struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; + struct tc_skbedit *parm; + struct tcf_skbedit *d; + struct tcf_common *pc; + u32 flags = 0, *priority = NULL, *mark = NULL; + u16 *queue_mapping = NULL; + int ret = 0, err; + + if (nla == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_SKBEDIT_MAX, nla, skbedit_policy); + if (err < 0) + return err; + + if (tb[TCA_SKBEDIT_PARMS] == NULL) + return -EINVAL; + + if (tb[TCA_SKBEDIT_PRIORITY] != NULL) { + flags |= SKBEDIT_F_PRIORITY; + priority = nla_data(tb[TCA_SKBEDIT_PRIORITY]); + } + + if (tb[TCA_SKBEDIT_QUEUE_MAPPING] != NULL) { + flags |= SKBEDIT_F_QUEUE_MAPPING; + queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]); + } + + if (tb[TCA_SKBEDIT_MARK] != NULL) { + flags |= SKBEDIT_F_MARK; + mark = nla_data(tb[TCA_SKBEDIT_MARK]); + } + + if (!flags) + return -EINVAL; + + parm = nla_data(tb[TCA_SKBEDIT_PARMS]); + + pc = tcf_hash_check(parm->index, a, bind, &skbedit_hash_info); + if (!pc) { + pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind, + &skbedit_idx_gen, &skbedit_hash_info); + if (IS_ERR(pc)) + return PTR_ERR(pc); + + d = to_skbedit(pc); + ret = ACT_P_CREATED; + } else { + d = to_skbedit(pc); + if (!ovr) { + tcf_hash_release(pc, bind, &skbedit_hash_info); + return -EEXIST; + } + } + + spin_lock_bh(&d->tcf_lock); + + d->flags = flags; + if (flags & SKBEDIT_F_PRIORITY) + d->priority = *priority; + if (flags & SKBEDIT_F_QUEUE_MAPPING) + d->queue_mapping = *queue_mapping; + if (flags & SKBEDIT_F_MARK) + d->mark = *mark; + + d->tcf_action = parm->action; + + spin_unlock_bh(&d->tcf_lock); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(pc, &skbedit_hash_info); + return ret; +} + +static int tcf_skbedit_cleanup(struct tc_action *a, int bind) +{ + struct tcf_skbedit *d = a->priv; + + if (d) + return tcf_hash_release(&d->common, bind, &skbedit_hash_info); + return 0; +} + +static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_skbedit *d = a->priv; + struct tc_skbedit opt = { + .index = d->tcf_index, + .refcnt = d->tcf_refcnt - ref, + .bindcnt = d->tcf_bindcnt - 
bind, + .action = d->tcf_action, + }; + struct tcf_t t; + + NLA_PUT(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt); + if (d->flags & SKBEDIT_F_PRIORITY) + NLA_PUT(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority), + &d->priority); + if (d->flags & SKBEDIT_F_QUEUE_MAPPING) + NLA_PUT(skb, TCA_SKBEDIT_QUEUE_MAPPING, + sizeof(d->queue_mapping), &d->queue_mapping); + if (d->flags & SKBEDIT_F_MARK) + NLA_PUT(skb, TCA_SKBEDIT_MARK, sizeof(d->mark), + &d->mark); + t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(d->tcf_tm.expires); + NLA_PUT(skb, TCA_SKBEDIT_TM, sizeof(t), &t); + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tc_action_ops act_skbedit_ops = { + .kind = "skbedit", + .hinfo = &skbedit_hash_info, + .type = TCA_ACT_SKBEDIT, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_skbedit, + .dump = tcf_skbedit_dump, + .cleanup = tcf_skbedit_cleanup, + .init = tcf_skbedit_init, + .walk = tcf_generic_walker, +}; + +MODULE_AUTHOR("Alexander Duyck, "); +MODULE_DESCRIPTION("SKB Editing"); +MODULE_LICENSE("GPL"); + +static int __init skbedit_init_module(void) +{ + return tcf_register_action(&act_skbedit_ops); +} + +static void __exit skbedit_cleanup_module(void) +{ + tcf_unregister_action(&act_skbedit_ops); +} + +module_init(skbedit_init_module); +module_exit(skbedit_cleanup_module); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c new file mode 100644 index 00000000..bb2c523f --- /dev/null +++ b/net/sched/cls_api.c @@ -0,0 +1,621 @@ +/* + * net/sched/cls_api.c Packet classifier API. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * + * Changes: + * + * Eduardo J. Blanco :990222: kmod support + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* The list of all installed classifier types */ + +static struct tcf_proto_ops *tcf_proto_base __read_mostly; + +/* Protects list of registered TC modules. It is pure SMP lock. 
*/ +static DEFINE_RWLOCK(cls_mod_lock); + +/* Find classifier type by string name */ + +static struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind) +{ + struct tcf_proto_ops *t = NULL; + + if (kind) { + read_lock(&cls_mod_lock); + for (t = tcf_proto_base; t; t = t->next) { + if (nla_strcmp(kind, t->kind) == 0) { + if (!try_module_get(t->owner)) + t = NULL; + break; + } + } + read_unlock(&cls_mod_lock); + } + return t; +} + +/* Register(unregister) new classifier type */ + +int register_tcf_proto_ops(struct tcf_proto_ops *ops) +{ + struct tcf_proto_ops *t, **tp; + int rc = -EEXIST; + + write_lock(&cls_mod_lock); + for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next) + if (!strcmp(ops->kind, t->kind)) + goto out; + + ops->next = NULL; + *tp = ops; + rc = 0; +out: + write_unlock(&cls_mod_lock); + return rc; +} +EXPORT_SYMBOL(register_tcf_proto_ops); + +int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) +{ + struct tcf_proto_ops *t, **tp; + int rc = -ENOENT; + + write_lock(&cls_mod_lock); + for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next) + if (t == ops) + break; + + if (!t) + goto out; + *tp = t->next; + rc = 0; +out: + write_unlock(&cls_mod_lock); + return rc; +} +EXPORT_SYMBOL(unregister_tcf_proto_ops); + +static int tfilter_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, struct tcf_proto *tp, + unsigned long fh, int event); + + +/* Select new prio value from the range, managed by kernel. */ + +static inline u32 tcf_auto_prio(struct tcf_proto *tp) +{ + u32 first = TC_H_MAKE(0xC0000000U, 0U); + + if (tp) + first = tp->prio - 1; + + return first; +} + +/* Add/change/delete/get a filter node */ + +static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tca[TCA_MAX + 1]; + spinlock_t *root_lock; + struct tcmsg *t; + u32 protocol; + u32 prio; + u32 nprio; + u32 parent; + struct net_device *dev; + struct Qdisc *q; + struct tcf_proto **back, **chain; + struct tcf_proto *tp; + struct tcf_proto_ops *tp_ops; + const struct Qdisc_class_ops *cops; + unsigned long cl; + unsigned long fh; + int err; + int tp_created = 0; + +replay: + t = NLMSG_DATA(n); + protocol = TC_H_MIN(t->tcm_info); + prio = TC_H_MAJ(t->tcm_info); + nprio = prio; + parent = t->tcm_parent; + cl = 0; + + if (prio == 0) { + /* If no priority is given, user wants we allocated it. */ + if (n->nlmsg_type != RTM_NEWTFILTER || + !(n->nlmsg_flags & NLM_F_CREATE)) + return -ENOENT; + prio = TC_H_MAKE(0x80000000U, 0U); + } + + /* Find head of filter chain. */ + + /* Find link */ + dev = __dev_get_by_index(net, t->tcm_ifindex); + if (dev == NULL) + return -ENODEV; + + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL); + if (err < 0) + return err; + + /* Find qdisc */ + if (!parent) { + q = dev->qdisc; + parent = q->handle; + } else { + q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent)); + if (q == NULL) + return -EINVAL; + } + + /* Is it classful? */ + cops = q->ops->cl_ops; + if (!cops) + return -EINVAL; + + if (cops->tcf_chain == NULL) + return -EOPNOTSUPP; + + /* Do we search for filter, attached to class? 
*/ + if (TC_H_MIN(parent)) { + cl = cops->get(q, parent); + if (cl == 0) + return -ENOENT; + } + + /* And the last stroke */ + chain = cops->tcf_chain(q, cl); + err = -EINVAL; + if (chain == NULL) + goto errout; + + /* Check the chain for existence of proto-tcf with this priority */ + for (back = chain; (tp = *back) != NULL; back = &tp->next) { + if (tp->prio >= prio) { + if (tp->prio == prio) { + if (!nprio || + (tp->protocol != protocol && protocol)) + goto errout; + } else + tp = NULL; + break; + } + } + + root_lock = qdisc_root_sleeping_lock(q); + + if (tp == NULL) { + /* Proto-tcf does not exist, create new one */ + + if (tca[TCA_KIND] == NULL || !protocol) + goto errout; + + err = -ENOENT; + if (n->nlmsg_type != RTM_NEWTFILTER || + !(n->nlmsg_flags & NLM_F_CREATE)) + goto errout; + + + /* Create new proto tcf */ + + err = -ENOBUFS; + tp = kzalloc(sizeof(*tp), GFP_KERNEL); + if (tp == NULL) + goto errout; + err = -ENOENT; + tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]); + if (tp_ops == NULL) { +#ifdef CONFIG_MODULES + struct nlattr *kind = tca[TCA_KIND]; + char name[IFNAMSIZ]; + + if (kind != NULL && + nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { + rtnl_unlock(); + request_module("cls_%s", name); + rtnl_lock(); + tp_ops = tcf_proto_lookup_ops(kind); + /* We dropped the RTNL semaphore in order to + * perform the module load. So, even if we + * succeeded in loading the module we have to + * replay the request. We indicate this using + * -EAGAIN. + */ + if (tp_ops != NULL) { + module_put(tp_ops->owner); + err = -EAGAIN; + } + } +#endif + kfree(tp); + goto errout; + } + tp->ops = tp_ops; + tp->protocol = protocol; + tp->prio = nprio ? : TC_H_MAJ(tcf_auto_prio(*back)); + tp->q = q; + tp->classify = tp_ops->classify; + tp->classid = parent; + + err = tp_ops->init(tp); + if (err != 0) { + module_put(tp_ops->owner); + kfree(tp); + goto errout; + } + + tp_created = 1; + + } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) + goto errout; + + fh = tp->ops->get(tp, t->tcm_handle); + + if (fh == 0) { + if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { + spin_lock_bh(root_lock); + *back = tp->next; + spin_unlock_bh(root_lock); + + tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER); + tcf_destroy(tp); + err = 0; + goto errout; + } + + err = -ENOENT; + if (n->nlmsg_type != RTM_NEWTFILTER || + !(n->nlmsg_flags & NLM_F_CREATE)) + goto errout; + } else { + switch (n->nlmsg_type) { + case RTM_NEWTFILTER: + err = -EEXIST; + if (n->nlmsg_flags & NLM_F_EXCL) { + if (tp_created) + tcf_destroy(tp); + goto errout; + } + break; + case RTM_DELTFILTER: + err = tp->ops->delete(tp, fh); + if (err == 0) + tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER); + goto errout; + case RTM_GETTFILTER: + err = tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER); + goto errout; + default: + err = -EINVAL; + goto errout; + } + } + + err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh); + if (err == 0) { + if (tp_created) { + spin_lock_bh(root_lock); + tp->next = *back; + *back = tp; + spin_unlock_bh(root_lock); + } + tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER); + } else { + if (tp_created) + tcf_destroy(tp); + } + +errout: + if (cl) + cops->put(q, cl); + if (err == -EAGAIN) + /* Replay the request. 
*/ + goto replay; + return err; +} + +static int tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, + unsigned long fh, u32 pid, u32 seq, u16 flags, int event) +{ + struct tcmsg *tcm; + struct nlmsghdr *nlh; + unsigned char *b = skb_tail_pointer(skb); + + nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); + tcm = NLMSG_DATA(nlh); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm__pad1 = 0; + tcm->tcm__pad2 = 0; + tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex; + tcm->tcm_parent = tp->classid; + tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); + NLA_PUT_STRING(skb, TCA_KIND, tp->ops->kind); + tcm->tcm_handle = fh; + if (RTM_DELTFILTER != event) { + tcm->tcm_handle = 0; + if (tp->ops->dump && tp->ops->dump(tp, fh, skb, tcm) < 0) + goto nla_put_failure; + } + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; + +nlmsg_failure: +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static int tfilter_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, struct tcf_proto *tp, + unsigned long fh, int event) +{ + struct sk_buff *skb; + u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (tcf_fill_node(skb, tp, fh, pid, n->nlmsg_seq, 0, event) <= 0) { + kfree_skb(skb); + return -EINVAL; + } + + return rtnetlink_send(skb, net, pid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); +} + +struct tcf_dump_args { + struct tcf_walker w; + struct sk_buff *skb; + struct netlink_callback *cb; +}; + +static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, + struct tcf_walker *arg) +{ + struct tcf_dump_args *a = (void *)arg; + + return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).pid, + a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER); +} + +/* called with RTNL */ +static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + int t; + int s_t; + struct net_device *dev; + struct Qdisc *q; + struct tcf_proto *tp, **chain; + struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh); + unsigned long cl = 0; + const struct Qdisc_class_ops *cops; + struct tcf_dump_args arg; + + if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) + return skb->len; + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return skb->len; + + if (!tcm->tcm_parent) + q = dev->qdisc; + else + q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); + if (!q) + goto out; + cops = q->ops->cl_ops; + if (!cops) + goto errout; + if (cops->tcf_chain == NULL) + goto errout; + if (TC_H_MIN(tcm->tcm_parent)) { + cl = cops->get(q, tcm->tcm_parent); + if (cl == 0) + goto errout; + } + chain = cops->tcf_chain(q, cl); + if (chain == NULL) + goto errout; + + s_t = cb->args[0]; + + for (tp = *chain, t = 0; tp; tp = tp->next, t++) { + if (t < s_t) + continue; + if (TC_H_MAJ(tcm->tcm_info) && + TC_H_MAJ(tcm->tcm_info) != tp->prio) + continue; + if (TC_H_MIN(tcm->tcm_info) && + TC_H_MIN(tcm->tcm_info) != tp->protocol) + continue; + if (t > s_t) + memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); + if (cb->args[1] == 0) { + if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + RTM_NEWTFILTER) <= 0) + break; + + cb->args[1] = 1; + } + if (tp->ops->walk == NULL) + continue; + arg.w.fn = tcf_node_dump; + arg.skb = skb; + arg.cb = cb; + arg.w.stop = 0; + arg.w.skip = cb->args[1] - 1; + arg.w.count = 0; + tp->ops->walk(tp, &arg.w); + cb->args[1] = arg.w.count + 1; + if (arg.w.stop) + break; + } + + cb->args[0] = t; + +errout: 
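+ /* Drop the class reference taken via cops->get() before returning
+ * however much of the filter dump fit into the skb.
+ */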
+ if (cl) + cops->put(q, cl); +out: + return skb->len; +} + +void tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts) +{ +#ifdef CONFIG_NET_CLS_ACT + if (exts->action) { + tcf_action_destroy(exts->action, TCA_ACT_UNBIND); + exts->action = NULL; + } +#endif +} +EXPORT_SYMBOL(tcf_exts_destroy); + +int tcf_exts_validate(struct tcf_proto *tp, struct nlattr **tb, + struct nlattr *rate_tlv, struct tcf_exts *exts, + const struct tcf_ext_map *map) +{ + memset(exts, 0, sizeof(*exts)); + +#ifdef CONFIG_NET_CLS_ACT + { + struct tc_action *act; + + if (map->police && tb[map->police]) { + act = tcf_action_init_1(tb[map->police], rate_tlv, + "police", TCA_ACT_NOREPLACE, + TCA_ACT_BIND); + if (IS_ERR(act)) + return PTR_ERR(act); + + act->type = TCA_OLD_COMPAT; + exts->action = act; + } else if (map->action && tb[map->action]) { + act = tcf_action_init(tb[map->action], rate_tlv, NULL, + TCA_ACT_NOREPLACE, TCA_ACT_BIND); + if (IS_ERR(act)) + return PTR_ERR(act); + + exts->action = act; + } + } +#else + if ((map->action && tb[map->action]) || + (map->police && tb[map->police])) + return -EOPNOTSUPP; +#endif + + return 0; +} +EXPORT_SYMBOL(tcf_exts_validate); + +void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, + struct tcf_exts *src) +{ +#ifdef CONFIG_NET_CLS_ACT + if (src->action) { + struct tc_action *act; + tcf_tree_lock(tp); + act = dst->action; + dst->action = src->action; + tcf_tree_unlock(tp); + if (act) + tcf_action_destroy(act, TCA_ACT_UNBIND); + } +#endif +} +EXPORT_SYMBOL(tcf_exts_change); + +int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts, + const struct tcf_ext_map *map) +{ +#ifdef CONFIG_NET_CLS_ACT + if (map->action && exts->action) { + /* + * again for backward compatible mode - we want + * to work with both old and new modes of entering + * tc data even if iproute2 was newer - jhs + */ + struct nlattr *nest; + + if (exts->action->type != TCA_OLD_COMPAT) { + nest = nla_nest_start(skb, map->action); + if (nest == NULL) + goto nla_put_failure; + if (tcf_action_dump(skb, exts->action, 0, 0) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest); + } else if (map->police) { + nest = nla_nest_start(skb, map->police); + if (nest == NULL) + goto nla_put_failure; + if (tcf_action_dump_old(skb, exts->action, 0, 0) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest); + } + } +#endif + return 0; +nla_put_failure: __attribute__ ((unused)) + return -1; +} +EXPORT_SYMBOL(tcf_exts_dump); + + +int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts, + const struct tcf_ext_map *map) +{ +#ifdef CONFIG_NET_CLS_ACT + if (exts->action) + if (tcf_action_copy_stats(skb, exts->action, 1) < 0) + goto nla_put_failure; +#endif + return 0; +nla_put_failure: __attribute__ ((unused)) + return -1; +} +EXPORT_SYMBOL(tcf_exts_dump_stats); + +static int __init tc_filter_init(void) +{ + rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL); + rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL); + rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter, + tc_dump_tfilter); + + return 0; +} + +subsys_initcall(tc_filter_init); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c new file mode 100644 index 00000000..8be8872d --- /dev/null +++ b/net/sched/cls_basic.c @@ -0,0 +1,306 @@ +/* + * net/sched/cls_basic.c Basic Packet Classifier. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct basic_head { + u32 hgenerator; + struct list_head flist; +}; + +struct basic_filter { + u32 handle; + struct tcf_exts exts; + struct tcf_ematch_tree ematches; + struct tcf_result res; + struct list_head link; +}; + +static const struct tcf_ext_map basic_ext_map = { + .action = TCA_BASIC_ACT, + .police = TCA_BASIC_POLICE +}; + +static int basic_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + int r; + struct basic_head *head = (struct basic_head *) tp->root; + struct basic_filter *f; + + list_for_each_entry(f, &head->flist, link) { + if (!tcf_em_tree_match(skb, &f->ematches, NULL)) + continue; + *res = f->res; + r = tcf_exts_exec(skb, &f->exts, res); + if (r < 0) + continue; + return r; + } + return -1; +} + +static unsigned long basic_get(struct tcf_proto *tp, u32 handle) +{ + unsigned long l = 0UL; + struct basic_head *head = (struct basic_head *) tp->root; + struct basic_filter *f; + + if (head == NULL) + return 0UL; + + list_for_each_entry(f, &head->flist, link) + if (f->handle == handle) + l = (unsigned long) f; + + return l; +} + +static void basic_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int basic_init(struct tcf_proto *tp) +{ + struct basic_head *head; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (head == NULL) + return -ENOBUFS; + INIT_LIST_HEAD(&head->flist); + tp->root = head; + return 0; +} + +static void basic_delete_filter(struct tcf_proto *tp, struct basic_filter *f) +{ + tcf_unbind_filter(tp, &f->res); + tcf_exts_destroy(tp, &f->exts); + tcf_em_tree_destroy(tp, &f->ematches); + kfree(f); +} + +static void basic_destroy(struct tcf_proto *tp) +{ + struct basic_head *head = tp->root; + struct basic_filter *f, *n; + + list_for_each_entry_safe(f, n, &head->flist, link) { + list_del(&f->link); + basic_delete_filter(tp, f); + } + kfree(head); +} + +static int basic_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct basic_head *head = (struct basic_head *) tp->root; + struct basic_filter *t, *f = (struct basic_filter *) arg; + + list_for_each_entry(t, &head->flist, link) + if (t == f) { + tcf_tree_lock(tp); + list_del(&t->link); + tcf_tree_unlock(tp); + basic_delete_filter(tp, t); + return 0; + } + + return -ENOENT; +} + +static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = { + [TCA_BASIC_CLASSID] = { .type = NLA_U32 }, + [TCA_BASIC_EMATCHES] = { .type = NLA_NESTED }, +}; + +static int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f, + unsigned long base, struct nlattr **tb, + struct nlattr *est) +{ + int err = -EINVAL; + struct tcf_exts e; + struct tcf_ematch_tree t; + + err = tcf_exts_validate(tp, tb, est, &e, &basic_ext_map); + if (err < 0) + return err; + + err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES], &t); + if (err < 0) + goto errout; + + if (tb[TCA_BASIC_CLASSID]) { + f->res.classid = nla_get_u32(tb[TCA_BASIC_CLASSID]); + tcf_bind_filter(tp, &f->res, base); + } + + tcf_exts_change(tp, &f->exts, &e); + tcf_em_tree_change(tp, &f->ematches, &t); + + return 0; +errout: + tcf_exts_destroy(tp, &e); + return err; +} + +static int basic_change(struct tcf_proto *tp, unsigned 
long base, u32 handle, + struct nlattr **tca, unsigned long *arg) +{ + int err; + struct basic_head *head = (struct basic_head *) tp->root; + struct nlattr *tb[TCA_BASIC_MAX + 1]; + struct basic_filter *f = (struct basic_filter *) *arg; + + if (tca[TCA_OPTIONS] == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS], + basic_policy); + if (err < 0) + return err; + + if (f != NULL) { + if (handle && f->handle != handle) + return -EINVAL; + return basic_set_parms(tp, f, base, tb, tca[TCA_RATE]); + } + + err = -ENOBUFS; + f = kzalloc(sizeof(*f), GFP_KERNEL); + if (f == NULL) + goto errout; + + err = -EINVAL; + if (handle) + f->handle = handle; + else { + unsigned int i = 0x80000000; + do { + if (++head->hgenerator == 0x7FFFFFFF) + head->hgenerator = 1; + } while (--i > 0 && basic_get(tp, head->hgenerator)); + + if (i <= 0) { + pr_err("Insufficient number of handles\n"); + goto errout; + } + + f->handle = head->hgenerator; + } + + err = basic_set_parms(tp, f, base, tb, tca[TCA_RATE]); + if (err < 0) + goto errout; + + tcf_tree_lock(tp); + list_add(&f->link, &head->flist); + tcf_tree_unlock(tp); + *arg = (unsigned long) f; + + return 0; +errout: + if (*arg == 0UL && f) + kfree(f); + + return err; +} + +static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct basic_head *head = (struct basic_head *) tp->root; + struct basic_filter *f; + + list_for_each_entry(f, &head->flist, link) { + if (arg->count < arg->skip) + goto skip; + + if (arg->fn(tp, (unsigned long) f, arg) < 0) { + arg->stop = 1; + break; + } +skip: + arg->count++; + } +} + +static int basic_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct basic_filter *f = (struct basic_filter *) fh; + struct nlattr *nest; + + if (f == NULL) + return skb->len; + + t->tcm_handle = f->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + if (f->res.classid) + NLA_PUT_U32(skb, TCA_BASIC_CLASSID, f->res.classid); + + if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 || + tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &f->exts, &basic_ext_map) < 0) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static struct tcf_proto_ops cls_basic_ops __read_mostly = { + .kind = "basic", + .classify = basic_classify, + .init = basic_init, + .destroy = basic_destroy, + .get = basic_get, + .put = basic_put, + .change = basic_change, + .delete = basic_delete, + .walk = basic_walk, + .dump = basic_dump, + .owner = THIS_MODULE, +}; + +static int __init init_basic(void) +{ + return register_tcf_proto_ops(&cls_basic_ops); +} + +static void __exit exit_basic(void) +{ + unregister_tcf_proto_ops(&cls_basic_ops); +} + +module_init(init_basic) +module_exit(exit_basic) +MODULE_LICENSE("GPL"); + diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c new file mode 100644 index 00000000..32a33519 --- /dev/null +++ b/net/sched/cls_cgroup.c @@ -0,0 +1,325 @@ +/* + * net/sched/cls_cgroup.c Control Group Classifier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Thomas Graf + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, + struct cgroup *cgrp); +static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); +static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp); + +struct cgroup_subsys net_cls_subsys = { + .name = "net_cls", + .create = cgrp_create, + .destroy = cgrp_destroy, + .populate = cgrp_populate, +#ifdef CONFIG_NET_CLS_CGROUP + .subsys_id = net_cls_subsys_id, +#endif + .module = THIS_MODULE, +}; + + +static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id), + struct cgroup_cls_state, css); +} + +static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) +{ + return container_of(task_subsys_state(p, net_cls_subsys_id), + struct cgroup_cls_state, css); +} + +static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, + struct cgroup *cgrp) +{ + struct cgroup_cls_state *cs; + + cs = kzalloc(sizeof(*cs), GFP_KERNEL); + if (!cs) + return ERR_PTR(-ENOMEM); + + if (cgrp->parent) + cs->classid = cgrp_cls_state(cgrp->parent)->classid; + + return &cs->css; +} + +static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + kfree(cgrp_cls_state(cgrp)); +} + +static u64 read_classid(struct cgroup *cgrp, struct cftype *cft) +{ + return cgrp_cls_state(cgrp)->classid; +} + +static int write_classid(struct cgroup *cgrp, struct cftype *cft, u64 value) +{ + cgrp_cls_state(cgrp)->classid = (u32) value; + return 0; +} + +static struct cftype ss_files[] = { + { + .name = "classid", + .read_u64 = read_classid, + .write_u64 = write_classid, + }, +}; + +static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files)); +} + +struct cls_cgroup_head { + u32 handle; + struct tcf_exts exts; + struct tcf_ematch_tree ematches; +}; + +static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + struct cls_cgroup_head *head = tp->root; + u32 classid; + + rcu_read_lock(); + classid = task_cls_state(current)->classid; + rcu_read_unlock(); + + /* + * Due to the nature of the classifier it is required to ignore all + * packets originating from softirq context as accessing `current' + * would lead to false results. + * + * This test assumes that all callers of dev_queue_xmit() explicitely + * disable bh. Knowing this, it is possible to detect softirq based + * calls by looking at the number of nested bh disable calls because + * softirqs always disables bh. + */ + if (in_serving_softirq()) { + /* If there is an sk_classid we'll use that. 
*/ + if (!skb->sk) + return -1; + classid = skb->sk->sk_classid; + } + + if (!classid) + return -1; + + if (!tcf_em_tree_match(skb, &head->ematches, NULL)) + return -1; + + res->classid = classid; + res->class = 0; + return tcf_exts_exec(skb, &head->exts, res); +} + +static unsigned long cls_cgroup_get(struct tcf_proto *tp, u32 handle) +{ + return 0UL; +} + +static void cls_cgroup_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int cls_cgroup_init(struct tcf_proto *tp) +{ + return 0; +} + +static const struct tcf_ext_map cgroup_ext_map = { + .action = TCA_CGROUP_ACT, + .police = TCA_CGROUP_POLICE, +}; + +static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = { + [TCA_CGROUP_EMATCHES] = { .type = NLA_NESTED }, +}; + +static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base, + u32 handle, struct nlattr **tca, + unsigned long *arg) +{ + struct nlattr *tb[TCA_CGROUP_MAX + 1]; + struct cls_cgroup_head *head = tp->root; + struct tcf_ematch_tree t; + struct tcf_exts e; + int err; + + if (!tca[TCA_OPTIONS]) + return -EINVAL; + + if (head == NULL) { + if (!handle) + return -EINVAL; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (head == NULL) + return -ENOBUFS; + + head->handle = handle; + + tcf_tree_lock(tp); + tp->root = head; + tcf_tree_unlock(tp); + } + + if (handle != head->handle) + return -ENOENT; + + err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS], + cgroup_policy); + if (err < 0) + return err; + + err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &cgroup_ext_map); + if (err < 0) + return err; + + err = tcf_em_tree_validate(tp, tb[TCA_CGROUP_EMATCHES], &t); + if (err < 0) + return err; + + tcf_exts_change(tp, &head->exts, &e); + tcf_em_tree_change(tp, &head->ematches, &t); + + return 0; +} + +static void cls_cgroup_destroy(struct tcf_proto *tp) +{ + struct cls_cgroup_head *head = tp->root; + + if (head) { + tcf_exts_destroy(tp, &head->exts); + tcf_em_tree_destroy(tp, &head->ematches); + kfree(head); + } +} + +static int cls_cgroup_delete(struct tcf_proto *tp, unsigned long arg) +{ + return -EOPNOTSUPP; +} + +static void cls_cgroup_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct cls_cgroup_head *head = tp->root; + + if (arg->count < arg->skip) + goto skip; + + if (arg->fn(tp, (unsigned long) head, arg) < 0) { + arg->stop = 1; + return; + } +skip: + arg->count++; +} + +static int cls_cgroup_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct cls_cgroup_head *head = tp->root; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; + + t->tcm_handle = head->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + if (tcf_exts_dump(skb, &head->exts, &cgroup_ext_map) < 0 || + tcf_em_tree_dump(skb, &head->ematches, TCA_CGROUP_EMATCHES) < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &head->exts, &cgroup_ext_map) < 0) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tcf_proto_ops cls_cgroup_ops __read_mostly = { + .kind = "cgroup", + .init = cls_cgroup_init, + .change = cls_cgroup_change, + .classify = cls_cgroup_classify, + .destroy = cls_cgroup_destroy, + .get = cls_cgroup_get, + .put = cls_cgroup_put, + .delete = cls_cgroup_delete, + .walk = cls_cgroup_walk, + .dump = cls_cgroup_dump, + .owner = THIS_MODULE, +}; + +static int __init init_cgroup_cls(void) +{ + int ret; + + ret = cgroup_load_subsys(&net_cls_subsys); + 
if (ret) + goto out; + +#ifndef CONFIG_NET_CLS_CGROUP + /* We can't use rcu_assign_pointer because this is an int. */ + smp_wmb(); + net_cls_subsys_id = net_cls_subsys.subsys_id; +#endif + + ret = register_tcf_proto_ops(&cls_cgroup_ops); + if (ret) + cgroup_unload_subsys(&net_cls_subsys); + +out: + return ret; +} + +static void __exit exit_cgroup_cls(void) +{ + unregister_tcf_proto_ops(&cls_cgroup_ops); + +#ifndef CONFIG_NET_CLS_CGROUP + net_cls_subsys_id = -1; + synchronize_rcu(); +#endif + + cgroup_unload_subsys(&net_cls_subsys); +} + +module_init(init_cgroup_cls); +module_exit(exit_cgroup_cls); +MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c new file mode 100644 index 00000000..8ec01391 --- /dev/null +++ b/net/sched/cls_flow.c @@ -0,0 +1,739 @@ +/* + * net/sched/cls_flow.c Generic flow classifier + * + * Copyright (c) 2007, 2008 Patrick McHardy + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#include +#endif + +struct flow_head { + struct list_head filters; +}; + +struct flow_filter { + struct list_head list; + struct tcf_exts exts; + struct tcf_ematch_tree ematches; + struct timer_list perturb_timer; + u32 perturb_period; + u32 handle; + + u32 nkeys; + u32 keymask; + u32 mode; + u32 mask; + u32 xor; + u32 rshift; + u32 addend; + u32 divisor; + u32 baseclass; + u32 hashrnd; +}; + +static const struct tcf_ext_map flow_ext_map = { + .action = TCA_FLOW_ACT, + .police = TCA_FLOW_POLICE, +}; + +static inline u32 addr_fold(void *addr) +{ + unsigned long a = (unsigned long)addr; + + return (a & 0xFFFFFFFF) ^ (BITS_PER_LONG > 32 ? a >> 32 : 0); +} + +static u32 flow_get_src(struct sk_buff *skb) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): + if (pskb_network_may_pull(skb, sizeof(struct iphdr))) + return ntohl(ip_hdr(skb)->saddr); + break; + case htons(ETH_P_IPV6): + if (pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) + return ntohl(ipv6_hdr(skb)->saddr.s6_addr32[3]); + break; + } + + return addr_fold(skb->sk); +} + +static u32 flow_get_dst(struct sk_buff *skb) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): + if (pskb_network_may_pull(skb, sizeof(struct iphdr))) + return ntohl(ip_hdr(skb)->daddr); + break; + case htons(ETH_P_IPV6): + if (pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) + return ntohl(ipv6_hdr(skb)->daddr.s6_addr32[3]); + break; + } + + return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol; +} + +static u32 flow_get_proto(struct sk_buff *skb) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): + return pskb_network_may_pull(skb, sizeof(struct iphdr)) ? + ip_hdr(skb)->protocol : 0; + case htons(ETH_P_IPV6): + return pskb_network_may_pull(skb, sizeof(struct ipv6hdr)) ? 
+ ipv6_hdr(skb)->nexthdr : 0; + default: + return 0; + } +} + +static u32 flow_get_proto_src(struct sk_buff *skb) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): { + struct iphdr *iph; + int poff; + + if (!pskb_network_may_pull(skb, sizeof(*iph))) + break; + iph = ip_hdr(skb); + if (iph->frag_off & htons(IP_MF | IP_OFFSET)) + break; + poff = proto_ports_offset(iph->protocol); + if (poff >= 0 && + pskb_network_may_pull(skb, iph->ihl * 4 + 2 + poff)) { + iph = ip_hdr(skb); + return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + + poff)); + } + break; + } + case htons(ETH_P_IPV6): { + struct ipv6hdr *iph; + int poff; + + if (!pskb_network_may_pull(skb, sizeof(*iph))) + break; + iph = ipv6_hdr(skb); + poff = proto_ports_offset(iph->nexthdr); + if (poff >= 0 && + pskb_network_may_pull(skb, sizeof(*iph) + poff + 2)) { + iph = ipv6_hdr(skb); + return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) + + poff)); + } + break; + } + } + + return addr_fold(skb->sk); +} + +static u32 flow_get_proto_dst(struct sk_buff *skb) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): { + struct iphdr *iph; + int poff; + + if (!pskb_network_may_pull(skb, sizeof(*iph))) + break; + iph = ip_hdr(skb); + if (iph->frag_off & htons(IP_MF | IP_OFFSET)) + break; + poff = proto_ports_offset(iph->protocol); + if (poff >= 0 && + pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) { + iph = ip_hdr(skb); + return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + + 2 + poff)); + } + break; + } + case htons(ETH_P_IPV6): { + struct ipv6hdr *iph; + int poff; + + if (!pskb_network_may_pull(skb, sizeof(*iph))) + break; + iph = ipv6_hdr(skb); + poff = proto_ports_offset(iph->nexthdr); + if (poff >= 0 && + pskb_network_may_pull(skb, sizeof(*iph) + poff + 4)) { + iph = ipv6_hdr(skb); + return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) + + poff + 2)); + } + break; + } + } + + return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol; +} + +static u32 flow_get_iif(const struct sk_buff *skb) +{ + return skb->skb_iif; +} + +static u32 flow_get_priority(const struct sk_buff *skb) +{ + return skb->priority; +} + +static u32 flow_get_mark(const struct sk_buff *skb) +{ + return skb->mark; +} + +static u32 flow_get_nfct(const struct sk_buff *skb) +{ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + return addr_fold(skb->nfct); +#else + return 0; +#endif +} + +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#define CTTUPLE(skb, member) \ +({ \ + enum ip_conntrack_info ctinfo; \ + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); \ + if (ct == NULL) \ + goto fallback; \ + ct->tuplehash[CTINFO2DIR(ctinfo)].tuple.member; \ +}) +#else +#define CTTUPLE(skb, member) \ +({ \ + goto fallback; \ + 0; \ +}) +#endif + +static u32 flow_get_nfct_src(struct sk_buff *skb) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): + return ntohl(CTTUPLE(skb, src.u3.ip)); + case htons(ETH_P_IPV6): + return ntohl(CTTUPLE(skb, src.u3.ip6[3])); + } +fallback: + return flow_get_src(skb); +} + +static u32 flow_get_nfct_dst(struct sk_buff *skb) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): + return ntohl(CTTUPLE(skb, dst.u3.ip)); + case htons(ETH_P_IPV6): + return ntohl(CTTUPLE(skb, dst.u3.ip6[3])); + } +fallback: + return flow_get_dst(skb); +} + +static u32 flow_get_nfct_proto_src(struct sk_buff *skb) +{ + return ntohs(CTTUPLE(skb, src.u.all)); +fallback: + return flow_get_proto_src(skb); +} + +static u32 flow_get_nfct_proto_dst(struct sk_buff *skb) +{ + return ntohs(CTTUPLE(skb, dst.u.all)); +fallback: + return 
flow_get_proto_dst(skb); +} + +static u32 flow_get_rtclassid(const struct sk_buff *skb) +{ +#ifdef CONFIG_IP_ROUTE_CLASSID + if (skb_dst(skb)) + return skb_dst(skb)->tclassid; +#endif + return 0; +} + +static u32 flow_get_skuid(const struct sk_buff *skb) +{ + if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) + return skb->sk->sk_socket->file->f_cred->fsuid; + return 0; +} + +static u32 flow_get_skgid(const struct sk_buff *skb) +{ + if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) + return skb->sk->sk_socket->file->f_cred->fsgid; + return 0; +} + +static u32 flow_get_vlan_tag(const struct sk_buff *skb) +{ + u16 uninitialized_var(tag); + + if (vlan_get_tag(skb, &tag) < 0) + return 0; + return tag & VLAN_VID_MASK; +} + +static u32 flow_get_rxhash(struct sk_buff *skb) +{ + return skb_get_rxhash(skb); +} + +static u32 flow_key_get(struct sk_buff *skb, int key) +{ + switch (key) { + case FLOW_KEY_SRC: + return flow_get_src(skb); + case FLOW_KEY_DST: + return flow_get_dst(skb); + case FLOW_KEY_PROTO: + return flow_get_proto(skb); + case FLOW_KEY_PROTO_SRC: + return flow_get_proto_src(skb); + case FLOW_KEY_PROTO_DST: + return flow_get_proto_dst(skb); + case FLOW_KEY_IIF: + return flow_get_iif(skb); + case FLOW_KEY_PRIORITY: + return flow_get_priority(skb); + case FLOW_KEY_MARK: + return flow_get_mark(skb); + case FLOW_KEY_NFCT: + return flow_get_nfct(skb); + case FLOW_KEY_NFCT_SRC: + return flow_get_nfct_src(skb); + case FLOW_KEY_NFCT_DST: + return flow_get_nfct_dst(skb); + case FLOW_KEY_NFCT_PROTO_SRC: + return flow_get_nfct_proto_src(skb); + case FLOW_KEY_NFCT_PROTO_DST: + return flow_get_nfct_proto_dst(skb); + case FLOW_KEY_RTCLASSID: + return flow_get_rtclassid(skb); + case FLOW_KEY_SKUID: + return flow_get_skuid(skb); + case FLOW_KEY_SKGID: + return flow_get_skgid(skb); + case FLOW_KEY_VLAN_TAG: + return flow_get_vlan_tag(skb); + case FLOW_KEY_RXHASH: + return flow_get_rxhash(skb); + default: + WARN_ON(1); + return 0; + } +} + +static int flow_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + struct flow_head *head = tp->root; + struct flow_filter *f; + u32 keymask; + u32 classid; + unsigned int n, key; + int r; + + list_for_each_entry(f, &head->filters, list) { + u32 keys[f->nkeys]; + + if (!tcf_em_tree_match(skb, &f->ematches, NULL)) + continue; + + keymask = f->keymask; + + for (n = 0; n < f->nkeys; n++) { + key = ffs(keymask) - 1; + keymask &= ~(1 << key); + keys[n] = flow_key_get(skb, key); + } + + if (f->mode == FLOW_MODE_HASH) + classid = jhash2(keys, f->nkeys, f->hashrnd); + else { + classid = keys[0]; + classid = (classid & f->mask) ^ f->xor; + classid = (classid >> f->rshift) + f->addend; + } + + if (f->divisor) + classid %= f->divisor; + + res->class = 0; + res->classid = TC_H_MAKE(f->baseclass, f->baseclass + classid); + + r = tcf_exts_exec(skb, &f->exts, res); + if (r < 0) + continue; + return r; + } + return -1; +} + +static void flow_perturbation(unsigned long arg) +{ + struct flow_filter *f = (struct flow_filter *)arg; + + get_random_bytes(&f->hashrnd, 4); + if (f->perturb_period) + mod_timer(&f->perturb_timer, jiffies + f->perturb_period); +} + +static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { + [TCA_FLOW_KEYS] = { .type = NLA_U32 }, + [TCA_FLOW_MODE] = { .type = NLA_U32 }, + [TCA_FLOW_BASECLASS] = { .type = NLA_U32 }, + [TCA_FLOW_RSHIFT] = { .type = NLA_U32 }, + [TCA_FLOW_ADDEND] = { .type = NLA_U32 }, + [TCA_FLOW_MASK] = { .type = NLA_U32 }, + [TCA_FLOW_XOR] = { .type = NLA_U32 }, + 
[TCA_FLOW_DIVISOR] = { .type = NLA_U32 }, + [TCA_FLOW_ACT] = { .type = NLA_NESTED }, + [TCA_FLOW_POLICE] = { .type = NLA_NESTED }, + [TCA_FLOW_EMATCHES] = { .type = NLA_NESTED }, + [TCA_FLOW_PERTURB] = { .type = NLA_U32 }, +}; + +static int flow_change(struct tcf_proto *tp, unsigned long base, + u32 handle, struct nlattr **tca, + unsigned long *arg) +{ + struct flow_head *head = tp->root; + struct flow_filter *f; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_FLOW_MAX + 1]; + struct tcf_exts e; + struct tcf_ematch_tree t; + unsigned int nkeys = 0; + unsigned int perturb_period = 0; + u32 baseclass = 0; + u32 keymask = 0; + u32 mode; + int err; + + if (opt == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FLOW_MAX, opt, flow_policy); + if (err < 0) + return err; + + if (tb[TCA_FLOW_BASECLASS]) { + baseclass = nla_get_u32(tb[TCA_FLOW_BASECLASS]); + if (TC_H_MIN(baseclass) == 0) + return -EINVAL; + } + + if (tb[TCA_FLOW_KEYS]) { + keymask = nla_get_u32(tb[TCA_FLOW_KEYS]); + + nkeys = hweight32(keymask); + if (nkeys == 0) + return -EINVAL; + + if (fls(keymask) - 1 > FLOW_KEY_MAX) + return -EOPNOTSUPP; + } + + err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &flow_ext_map); + if (err < 0) + return err; + + err = tcf_em_tree_validate(tp, tb[TCA_FLOW_EMATCHES], &t); + if (err < 0) + goto err1; + + f = (struct flow_filter *)*arg; + if (f != NULL) { + err = -EINVAL; + if (f->handle != handle && handle) + goto err2; + + mode = f->mode; + if (tb[TCA_FLOW_MODE]) + mode = nla_get_u32(tb[TCA_FLOW_MODE]); + if (mode != FLOW_MODE_HASH && nkeys > 1) + goto err2; + + if (mode == FLOW_MODE_HASH) + perturb_period = f->perturb_period; + if (tb[TCA_FLOW_PERTURB]) { + if (mode != FLOW_MODE_HASH) + goto err2; + perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ; + } + } else { + err = -EINVAL; + if (!handle) + goto err2; + if (!tb[TCA_FLOW_KEYS]) + goto err2; + + mode = FLOW_MODE_MAP; + if (tb[TCA_FLOW_MODE]) + mode = nla_get_u32(tb[TCA_FLOW_MODE]); + if (mode != FLOW_MODE_HASH && nkeys > 1) + goto err2; + + if (tb[TCA_FLOW_PERTURB]) { + if (mode != FLOW_MODE_HASH) + goto err2; + perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ; + } + + if (TC_H_MAJ(baseclass) == 0) + baseclass = TC_H_MAKE(tp->q->handle, baseclass); + if (TC_H_MIN(baseclass) == 0) + baseclass = TC_H_MAKE(baseclass, 1); + + err = -ENOBUFS; + f = kzalloc(sizeof(*f), GFP_KERNEL); + if (f == NULL) + goto err2; + + f->handle = handle; + f->mask = ~0U; + + get_random_bytes(&f->hashrnd, 4); + f->perturb_timer.function = flow_perturbation; + f->perturb_timer.data = (unsigned long)f; + init_timer_deferrable(&f->perturb_timer); + } + + tcf_exts_change(tp, &f->exts, &e); + tcf_em_tree_change(tp, &f->ematches, &t); + + tcf_tree_lock(tp); + + if (tb[TCA_FLOW_KEYS]) { + f->keymask = keymask; + f->nkeys = nkeys; + } + + f->mode = mode; + + if (tb[TCA_FLOW_MASK]) + f->mask = nla_get_u32(tb[TCA_FLOW_MASK]); + if (tb[TCA_FLOW_XOR]) + f->xor = nla_get_u32(tb[TCA_FLOW_XOR]); + if (tb[TCA_FLOW_RSHIFT]) + f->rshift = nla_get_u32(tb[TCA_FLOW_RSHIFT]); + if (tb[TCA_FLOW_ADDEND]) + f->addend = nla_get_u32(tb[TCA_FLOW_ADDEND]); + + if (tb[TCA_FLOW_DIVISOR]) + f->divisor = nla_get_u32(tb[TCA_FLOW_DIVISOR]); + if (baseclass) + f->baseclass = baseclass; + + f->perturb_period = perturb_period; + del_timer(&f->perturb_timer); + if (perturb_period) + mod_timer(&f->perturb_timer, jiffies + perturb_period); + + if (*arg == 0) + list_add_tail(&f->list, &head->filters); + + tcf_tree_unlock(tp); + + *arg = (unsigned long)f; + return 0; + 
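+	/* Error unwind: free the ematch tree validated above, then the tentative extensions. */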
+err2: + tcf_em_tree_destroy(tp, &t); +err1: + tcf_exts_destroy(tp, &e); + return err; +} + +static void flow_destroy_filter(struct tcf_proto *tp, struct flow_filter *f) +{ + del_timer_sync(&f->perturb_timer); + tcf_exts_destroy(tp, &f->exts); + tcf_em_tree_destroy(tp, &f->ematches); + kfree(f); +} + +static int flow_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct flow_filter *f = (struct flow_filter *)arg; + + tcf_tree_lock(tp); + list_del(&f->list); + tcf_tree_unlock(tp); + flow_destroy_filter(tp, f); + return 0; +} + +static int flow_init(struct tcf_proto *tp) +{ + struct flow_head *head; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (head == NULL) + return -ENOBUFS; + INIT_LIST_HEAD(&head->filters); + tp->root = head; + return 0; +} + +static void flow_destroy(struct tcf_proto *tp) +{ + struct flow_head *head = tp->root; + struct flow_filter *f, *next; + + list_for_each_entry_safe(f, next, &head->filters, list) { + list_del(&f->list); + flow_destroy_filter(tp, f); + } + kfree(head); +} + +static unsigned long flow_get(struct tcf_proto *tp, u32 handle) +{ + struct flow_head *head = tp->root; + struct flow_filter *f; + + list_for_each_entry(f, &head->filters, list) + if (f->handle == handle) + return (unsigned long)f; + return 0; +} + +static void flow_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int flow_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct flow_filter *f = (struct flow_filter *)fh; + struct nlattr *nest; + + if (f == NULL) + return skb->len; + + t->tcm_handle = f->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + NLA_PUT_U32(skb, TCA_FLOW_KEYS, f->keymask); + NLA_PUT_U32(skb, TCA_FLOW_MODE, f->mode); + + if (f->mask != ~0 || f->xor != 0) { + NLA_PUT_U32(skb, TCA_FLOW_MASK, f->mask); + NLA_PUT_U32(skb, TCA_FLOW_XOR, f->xor); + } + if (f->rshift) + NLA_PUT_U32(skb, TCA_FLOW_RSHIFT, f->rshift); + if (f->addend) + NLA_PUT_U32(skb, TCA_FLOW_ADDEND, f->addend); + + if (f->divisor) + NLA_PUT_U32(skb, TCA_FLOW_DIVISOR, f->divisor); + if (f->baseclass) + NLA_PUT_U32(skb, TCA_FLOW_BASECLASS, f->baseclass); + + if (f->perturb_period) + NLA_PUT_U32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ); + + if (tcf_exts_dump(skb, &f->exts, &flow_ext_map) < 0) + goto nla_put_failure; +#ifdef CONFIG_NET_EMATCH + if (f->ematches.hdr.nmatches && + tcf_em_tree_dump(skb, &f->ematches, TCA_FLOW_EMATCHES) < 0) + goto nla_put_failure; +#endif + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &f->exts, &flow_ext_map) < 0) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, nest); + return -1; +} + +static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct flow_head *head = tp->root; + struct flow_filter *f; + + list_for_each_entry(f, &head->filters, list) { + if (arg->count < arg->skip) + goto skip; + if (arg->fn(tp, (unsigned long)f, arg) < 0) { + arg->stop = 1; + break; + } +skip: + arg->count++; + } +} + +static struct tcf_proto_ops cls_flow_ops __read_mostly = { + .kind = "flow", + .classify = flow_classify, + .init = flow_init, + .destroy = flow_destroy, + .change = flow_change, + .delete = flow_delete, + .get = flow_get, + .put = flow_put, + .dump = flow_dump, + .walk = flow_walk, + .owner = THIS_MODULE, +}; + +static int __init cls_flow_init(void) +{ + return register_tcf_proto_ops(&cls_flow_ops); +} + +static void __exit cls_flow_exit(void) +{ + unregister_tcf_proto_ops(&cls_flow_ops); +} + 
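Editorial aside, not part of the original patch: the sketch below restates the FLOW_MODE_MAP arithmetic that flow_classify() applies above as a minimal standalone C program, so the order of operations (mask, then xor, then right shift, then addend, then the optional divisor) is easy to follow. The names flow_map_classid and struct flow_map_params are illustrative only and do not exist in the kernel source.

#include <stdint.h>
#include <stdio.h>

/* Reduced stand-in for the mapping fields of struct flow_filter above. */
struct flow_map_params {
	uint32_t mask;     /* AND mask applied first (kernel default is ~0) */
	uint32_t xor_val;  /* XOR pattern applied after the mask            */
	uint32_t rshift;   /* right shift applied after the XOR             */
	uint32_t addend;   /* constant added after the shift                */
	uint32_t divisor;  /* optional modulus, 0 means "unused"            */
};

/* Same arithmetic as the FLOW_MODE_MAP branch of flow_classify(). */
static uint32_t flow_map_classid(uint32_t key, const struct flow_map_params *p)
{
	uint32_t classid = (key & p->mask) ^ p->xor_val;

	classid = (classid >> p->rshift) + p->addend;
	if (p->divisor)
		classid %= p->divisor;
	return classid;
}

int main(void)
{
	/* Example: fold a key such as a firewall mark into 16 minor classes. */
	struct flow_map_params p = { .mask = 0xff, .divisor = 16 };

	printf("key 0x1234 -> minor class %u\n",
	       (unsigned)flow_map_classid(0x1234, &p));
	return 0;
}

In FLOW_MODE_HASH the collected keys are instead folded with jhash2() using the periodically perturbed hashrnd, and only the divisor step from the sketch still applies; in both modes flow_classify() then builds the final result as TC_H_MAKE(baseclass, baseclass + classid).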
+module_init(cls_flow_init); +module_exit(cls_flow_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy "); +MODULE_DESCRIPTION("TC flow classifier"); diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c new file mode 100644 index 00000000..26e7bc4f --- /dev/null +++ b/net/sched/cls_fw.c @@ -0,0 +1,399 @@ +/* + * net/sched/cls_fw.c Classifier mapping ipchains' fwmark to traffic class. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * + * Changes: + * Karlis Peisenieks : 990415 : fw_walk off by one + * Karlis Peisenieks : 990415 : fw_delete killed all the filter (and kernel). + * Alex : 2004xxyy: Added Action extension + * + * JHS: We should remove the CONFIG_NET_CLS_IND from here + * eventually when the meta match extension is made available + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define HTSIZE (PAGE_SIZE/sizeof(struct fw_filter *)) + +struct fw_head { + struct fw_filter *ht[HTSIZE]; + u32 mask; +}; + +struct fw_filter { + struct fw_filter *next; + u32 id; + struct tcf_result res; +#ifdef CONFIG_NET_CLS_IND + char indev[IFNAMSIZ]; +#endif /* CONFIG_NET_CLS_IND */ + struct tcf_exts exts; +}; + +static const struct tcf_ext_map fw_ext_map = { + .action = TCA_FW_ACT, + .police = TCA_FW_POLICE +}; + +static inline int fw_hash(u32 handle) +{ + if (HTSIZE == 4096) + return ((handle >> 24) & 0xFFF) ^ + ((handle >> 12) & 0xFFF) ^ + (handle & 0xFFF); + else if (HTSIZE == 2048) + return ((handle >> 22) & 0x7FF) ^ + ((handle >> 11) & 0x7FF) ^ + (handle & 0x7FF); + else if (HTSIZE == 1024) + return ((handle >> 20) & 0x3FF) ^ + ((handle >> 10) & 0x3FF) ^ + (handle & 0x3FF); + else if (HTSIZE == 512) + return (handle >> 27) ^ + ((handle >> 18) & 0x1FF) ^ + ((handle >> 9) & 0x1FF) ^ + (handle & 0x1FF); + else if (HTSIZE == 256) { + u8 *t = (u8 *) &handle; + return t[0] ^ t[1] ^ t[2] ^ t[3]; + } else + return handle & (HTSIZE - 1); +} + +static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + struct fw_head *head = (struct fw_head *)tp->root; + struct fw_filter *f; + int r; + u32 id = skb->mark; + + if (head != NULL) { + id &= head->mask; + for (f = head->ht[fw_hash(id)]; f; f = f->next) { + if (f->id == id) { + *res = f->res; +#ifdef CONFIG_NET_CLS_IND + if (!tcf_match_indev(skb, f->indev)) + continue; +#endif /* CONFIG_NET_CLS_IND */ + r = tcf_exts_exec(skb, &f->exts, res); + if (r < 0) + continue; + + return r; + } + } + } else { + /* old method */ + if (id && (TC_H_MAJ(id) == 0 || + !(TC_H_MAJ(id ^ tp->q->handle)))) { + res->classid = id; + res->class = 0; + return 0; + } + } + + return -1; +} + +static unsigned long fw_get(struct tcf_proto *tp, u32 handle) +{ + struct fw_head *head = (struct fw_head *)tp->root; + struct fw_filter *f; + + if (head == NULL) + return 0; + + for (f = head->ht[fw_hash(handle)]; f; f = f->next) { + if (f->id == handle) + return (unsigned long)f; + } + return 0; +} + +static void fw_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int fw_init(struct tcf_proto *tp) +{ + return 0; +} + +static void fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f) +{ + tcf_unbind_filter(tp, &f->res); + tcf_exts_destroy(tp, &f->exts); + kfree(f); +} + +static void fw_destroy(struct tcf_proto *tp) +{ + 
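+	/* Free every filter chained in each of the HTSIZE hash buckets, then the table itself. */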
struct fw_head *head = tp->root; + struct fw_filter *f; + int h; + + if (head == NULL) + return; + + for (h = 0; h < HTSIZE; h++) { + while ((f = head->ht[h]) != NULL) { + head->ht[h] = f->next; + fw_delete_filter(tp, f); + } + } + kfree(head); +} + +static int fw_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct fw_head *head = (struct fw_head *)tp->root; + struct fw_filter *f = (struct fw_filter *)arg; + struct fw_filter **fp; + + if (head == NULL || f == NULL) + goto out; + + for (fp = &head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) { + if (*fp == f) { + tcf_tree_lock(tp); + *fp = f->next; + tcf_tree_unlock(tp); + fw_delete_filter(tp, f); + return 0; + } + } +out: + return -EINVAL; +} + +static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = { + [TCA_FW_CLASSID] = { .type = NLA_U32 }, + [TCA_FW_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ }, + [TCA_FW_MASK] = { .type = NLA_U32 }, +}; + +static int +fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f, + struct nlattr **tb, struct nlattr **tca, unsigned long base) +{ + struct fw_head *head = (struct fw_head *)tp->root; + struct tcf_exts e; + u32 mask; + int err; + + err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &fw_ext_map); + if (err < 0) + return err; + + err = -EINVAL; + if (tb[TCA_FW_CLASSID]) { + f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]); + tcf_bind_filter(tp, &f->res, base); + } + +#ifdef CONFIG_NET_CLS_IND + if (tb[TCA_FW_INDEV]) { + err = tcf_change_indev(tp, f->indev, tb[TCA_FW_INDEV]); + if (err < 0) + goto errout; + } +#endif /* CONFIG_NET_CLS_IND */ + + if (tb[TCA_FW_MASK]) { + mask = nla_get_u32(tb[TCA_FW_MASK]); + if (mask != head->mask) + goto errout; + } else if (head->mask != 0xFFFFFFFF) + goto errout; + + tcf_exts_change(tp, &f->exts, &e); + + return 0; +errout: + tcf_exts_destroy(tp, &e); + return err; +} + +static int fw_change(struct tcf_proto *tp, unsigned long base, + u32 handle, + struct nlattr **tca, + unsigned long *arg) +{ + struct fw_head *head = (struct fw_head *)tp->root; + struct fw_filter *f = (struct fw_filter *) *arg; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_FW_MAX + 1]; + int err; + + if (!opt) + return handle ? 
-EINVAL : 0; + + err = nla_parse_nested(tb, TCA_FW_MAX, opt, fw_policy); + if (err < 0) + return err; + + if (f != NULL) { + if (f->id != handle && handle) + return -EINVAL; + return fw_change_attrs(tp, f, tb, tca, base); + } + + if (!handle) + return -EINVAL; + + if (head == NULL) { + u32 mask = 0xFFFFFFFF; + if (tb[TCA_FW_MASK]) + mask = nla_get_u32(tb[TCA_FW_MASK]); + + head = kzalloc(sizeof(struct fw_head), GFP_KERNEL); + if (head == NULL) + return -ENOBUFS; + head->mask = mask; + + tcf_tree_lock(tp); + tp->root = head; + tcf_tree_unlock(tp); + } + + f = kzalloc(sizeof(struct fw_filter), GFP_KERNEL); + if (f == NULL) + return -ENOBUFS; + + f->id = handle; + + err = fw_change_attrs(tp, f, tb, tca, base); + if (err < 0) + goto errout; + + f->next = head->ht[fw_hash(handle)]; + tcf_tree_lock(tp); + head->ht[fw_hash(handle)] = f; + tcf_tree_unlock(tp); + + *arg = (unsigned long)f; + return 0; + +errout: + kfree(f); + return err; +} + +static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct fw_head *head = (struct fw_head *)tp->root; + int h; + + if (head == NULL) + arg->stop = 1; + + if (arg->stop) + return; + + for (h = 0; h < HTSIZE; h++) { + struct fw_filter *f; + + for (f = head->ht[h]; f; f = f->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(tp, (unsigned long)f, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } +} + +static int fw_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct fw_head *head = (struct fw_head *)tp->root; + struct fw_filter *f = (struct fw_filter *)fh; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; + + if (f == NULL) + return skb->len; + + t->tcm_handle = f->id; + + if (!f->res.classid && !tcf_exts_is_available(&f->exts)) + return skb->len; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + if (f->res.classid) + NLA_PUT_U32(skb, TCA_FW_CLASSID, f->res.classid); +#ifdef CONFIG_NET_CLS_IND + if (strlen(f->indev)) + NLA_PUT_STRING(skb, TCA_FW_INDEV, f->indev); +#endif /* CONFIG_NET_CLS_IND */ + if (head->mask != 0xFFFFFFFF) + NLA_PUT_U32(skb, TCA_FW_MASK, head->mask); + + if (tcf_exts_dump(skb, &f->exts, &fw_ext_map) < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &f->exts, &fw_ext_map) < 0) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tcf_proto_ops cls_fw_ops __read_mostly = { + .kind = "fw", + .classify = fw_classify, + .init = fw_init, + .destroy = fw_destroy, + .get = fw_get, + .put = fw_put, + .change = fw_change, + .delete = fw_delete, + .walk = fw_walk, + .dump = fw_dump, + .owner = THIS_MODULE, +}; + +static int __init init_fw(void) +{ + return register_tcf_proto_ops(&cls_fw_ops); +} + +static void __exit exit_fw(void) +{ + unregister_tcf_proto_ops(&cls_fw_ops); +} + +module_init(init_fw) +module_exit(exit_fw) +MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c new file mode 100644 index 00000000..a9079053 --- /dev/null +++ b/net/sched/cls_route.c @@ -0,0 +1,627 @@ +/* + * net/sched/cls_route.c ROUTE4 classifier. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * 1. For now we assume that route tags < 256. + * It allows to use direct table lookups, instead of hash tables. + * 2. For now we assume that "from TAG" and "fromdev DEV" statements + * are mutually exclusive. + * 3. "to TAG from ANY" has higher priority, than "to ANY from XXX" + */ + +struct route4_fastmap { + struct route4_filter *filter; + u32 id; + int iif; +}; + +struct route4_head { + struct route4_fastmap fastmap[16]; + struct route4_bucket *table[256 + 1]; +}; + +struct route4_bucket { + /* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */ + struct route4_filter *ht[16 + 16 + 1]; +}; + +struct route4_filter { + struct route4_filter *next; + u32 id; + int iif; + + struct tcf_result res; + struct tcf_exts exts; + u32 handle; + struct route4_bucket *bkt; +}; + +#define ROUTE4_FAILURE ((struct route4_filter *)(-1L)) + +static const struct tcf_ext_map route_ext_map = { + .police = TCA_ROUTE4_POLICE, + .action = TCA_ROUTE4_ACT +}; + +static inline int route4_fastmap_hash(u32 id, int iif) +{ + return id & 0xF; +} + +static void +route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id) +{ + spinlock_t *root_lock = qdisc_root_sleeping_lock(q); + + spin_lock_bh(root_lock); + memset(head->fastmap, 0, sizeof(head->fastmap)); + spin_unlock_bh(root_lock); +} + +static void +route4_set_fastmap(struct route4_head *head, u32 id, int iif, + struct route4_filter *f) +{ + int h = route4_fastmap_hash(id, iif); + + head->fastmap[h].id = id; + head->fastmap[h].iif = iif; + head->fastmap[h].filter = f; +} + +static inline int route4_hash_to(u32 id) +{ + return id & 0xFF; +} + +static inline int route4_hash_from(u32 id) +{ + return (id >> 16) & 0xF; +} + +static inline int route4_hash_iif(int iif) +{ + return 16 + ((iif >> 16) & 0xF); +} + +static inline int route4_hash_wild(void) +{ + return 32; +} + +#define ROUTE4_APPLY_RESULT() \ +{ \ + *res = f->res; \ + if (tcf_exts_is_available(&f->exts)) { \ + int r = tcf_exts_exec(skb, &f->exts, res); \ + if (r < 0) { \ + dont_cache = 1; \ + continue; \ + } \ + return r; \ + } else if (!dont_cache) \ + route4_set_fastmap(head, id, iif, f); \ + return 0; \ +} + +static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + struct route4_head *head = (struct route4_head *)tp->root; + struct dst_entry *dst; + struct route4_bucket *b; + struct route4_filter *f; + u32 id, h; + int iif, dont_cache = 0; + + dst = skb_dst(skb); + if (!dst) + goto failure; + + id = dst->tclassid; + if (head == NULL) + goto old_method; + + iif = ((struct rtable *)dst)->rt_iif; + + h = route4_fastmap_hash(id, iif); + if (id == head->fastmap[h].id && + iif == head->fastmap[h].iif && + (f = head->fastmap[h].filter) != NULL) { + if (f == ROUTE4_FAILURE) + goto failure; + + *res = f->res; + return 0; + } + + h = route4_hash_to(id); + +restart: + b = head->table[h]; + if (b) { + for (f = b->ht[route4_hash_from(id)]; f; f = f->next) + if (f->id == id) + ROUTE4_APPLY_RESULT(); + + for (f = b->ht[route4_hash_iif(iif)]; f; f = f->next) + if (f->iif == iif) + ROUTE4_APPLY_RESULT(); + + for (f = b->ht[route4_hash_wild()]; f; f = f->next) + ROUTE4_APPLY_RESULT(); + + } + if (h < 256) { + h = 256; + id &= ~0xFFFF; + goto restart; + } + + if (!dont_cache) + route4_set_fastmap(head, id, iif, ROUTE4_FAILURE); +failure: + return -1; + +old_method: + if (id && (TC_H_MAJ(id) == 0 || 
+ !(TC_H_MAJ(id^tp->q->handle)))) { + res->classid = id; + res->class = 0; + return 0; + } + return -1; +} + +static inline u32 to_hash(u32 id) +{ + u32 h = id & 0xFF; + + if (id & 0x8000) + h += 256; + return h; +} + +static inline u32 from_hash(u32 id) +{ + id &= 0xFFFF; + if (id == 0xFFFF) + return 32; + if (!(id & 0x8000)) { + if (id > 255) + return 256; + return id & 0xF; + } + return 16 + (id & 0xF); +} + +static unsigned long route4_get(struct tcf_proto *tp, u32 handle) +{ + struct route4_head *head = (struct route4_head *)tp->root; + struct route4_bucket *b; + struct route4_filter *f; + unsigned int h1, h2; + + if (!head) + return 0; + + h1 = to_hash(handle); + if (h1 > 256) + return 0; + + h2 = from_hash(handle >> 16); + if (h2 > 32) + return 0; + + b = head->table[h1]; + if (b) { + for (f = b->ht[h2]; f; f = f->next) + if (f->handle == handle) + return (unsigned long)f; + } + return 0; +} + +static void route4_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int route4_init(struct tcf_proto *tp) +{ + return 0; +} + +static void +route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f) +{ + tcf_unbind_filter(tp, &f->res); + tcf_exts_destroy(tp, &f->exts); + kfree(f); +} + +static void route4_destroy(struct tcf_proto *tp) +{ + struct route4_head *head = tp->root; + int h1, h2; + + if (head == NULL) + return; + + for (h1 = 0; h1 <= 256; h1++) { + struct route4_bucket *b; + + b = head->table[h1]; + if (b) { + for (h2 = 0; h2 <= 32; h2++) { + struct route4_filter *f; + + while ((f = b->ht[h2]) != NULL) { + b->ht[h2] = f->next; + route4_delete_filter(tp, f); + } + } + kfree(b); + } + } + kfree(head); +} + +static int route4_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct route4_head *head = (struct route4_head *)tp->root; + struct route4_filter **fp, *f = (struct route4_filter *)arg; + unsigned int h = 0; + struct route4_bucket *b; + int i; + + if (!head || !f) + return -EINVAL; + + h = f->handle; + b = f->bkt; + + for (fp = &b->ht[from_hash(h >> 16)]; *fp; fp = &(*fp)->next) { + if (*fp == f) { + tcf_tree_lock(tp); + *fp = f->next; + tcf_tree_unlock(tp); + + route4_reset_fastmap(tp->q, head, f->id); + route4_delete_filter(tp, f); + + /* Strip tree */ + + for (i = 0; i <= 32; i++) + if (b->ht[i]) + return 0; + + /* OK, session has no flows */ + tcf_tree_lock(tp); + head->table[to_hash(h)] = NULL; + tcf_tree_unlock(tp); + + kfree(b); + return 0; + } + } + return 0; +} + +static const struct nla_policy route4_policy[TCA_ROUTE4_MAX + 1] = { + [TCA_ROUTE4_CLASSID] = { .type = NLA_U32 }, + [TCA_ROUTE4_TO] = { .type = NLA_U32 }, + [TCA_ROUTE4_FROM] = { .type = NLA_U32 }, + [TCA_ROUTE4_IIF] = { .type = NLA_U32 }, +}; + +static int route4_set_parms(struct tcf_proto *tp, unsigned long base, + struct route4_filter *f, u32 handle, struct route4_head *head, + struct nlattr **tb, struct nlattr *est, int new) +{ + int err; + u32 id = 0, to = 0, nhandle = 0x8000; + struct route4_filter *fp; + unsigned int h1; + struct route4_bucket *b; + struct tcf_exts e; + + err = tcf_exts_validate(tp, tb, est, &e, &route_ext_map); + if (err < 0) + return err; + + err = -EINVAL; + if (tb[TCA_ROUTE4_TO]) { + if (new && handle & 0x8000) + goto errout; + to = nla_get_u32(tb[TCA_ROUTE4_TO]); + if (to > 0xFF) + goto errout; + nhandle = to; + } + + if (tb[TCA_ROUTE4_FROM]) { + if (tb[TCA_ROUTE4_IIF]) + goto errout; + id = nla_get_u32(tb[TCA_ROUTE4_FROM]); + if (id > 0xFF) + goto errout; + nhandle |= id << 16; + } else if (tb[TCA_ROUTE4_IIF]) { + id = nla_get_u32(tb[TCA_ROUTE4_IIF]); + if 
(id > 0x7FFF) + goto errout; + nhandle |= (id | 0x8000) << 16; + } else + nhandle |= 0xFFFF << 16; + + if (handle && new) { + nhandle |= handle & 0x7F00; + if (nhandle != handle) + goto errout; + } + + h1 = to_hash(nhandle); + b = head->table[h1]; + if (!b) { + err = -ENOBUFS; + b = kzalloc(sizeof(struct route4_bucket), GFP_KERNEL); + if (b == NULL) + goto errout; + + tcf_tree_lock(tp); + head->table[h1] = b; + tcf_tree_unlock(tp); + } else { + unsigned int h2 = from_hash(nhandle >> 16); + + err = -EEXIST; + for (fp = b->ht[h2]; fp; fp = fp->next) + if (fp->handle == f->handle) + goto errout; + } + + tcf_tree_lock(tp); + if (tb[TCA_ROUTE4_TO]) + f->id = to; + + if (tb[TCA_ROUTE4_FROM]) + f->id = to | id<<16; + else if (tb[TCA_ROUTE4_IIF]) + f->iif = id; + + f->handle = nhandle; + f->bkt = b; + tcf_tree_unlock(tp); + + if (tb[TCA_ROUTE4_CLASSID]) { + f->res.classid = nla_get_u32(tb[TCA_ROUTE4_CLASSID]); + tcf_bind_filter(tp, &f->res, base); + } + + tcf_exts_change(tp, &f->exts, &e); + + return 0; +errout: + tcf_exts_destroy(tp, &e); + return err; +} + +static int route4_change(struct tcf_proto *tp, unsigned long base, + u32 handle, + struct nlattr **tca, + unsigned long *arg) +{ + struct route4_head *head = tp->root; + struct route4_filter *f, *f1, **fp; + struct route4_bucket *b; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_ROUTE4_MAX + 1]; + unsigned int h, th; + u32 old_handle = 0; + int err; + + if (opt == NULL) + return handle ? -EINVAL : 0; + + err = nla_parse_nested(tb, TCA_ROUTE4_MAX, opt, route4_policy); + if (err < 0) + return err; + + f = (struct route4_filter *)*arg; + if (f) { + if (f->handle != handle && handle) + return -EINVAL; + + if (f->bkt) + old_handle = f->handle; + + err = route4_set_parms(tp, base, f, handle, head, tb, + tca[TCA_RATE], 0); + if (err < 0) + return err; + + goto reinsert; + } + + err = -ENOBUFS; + if (head == NULL) { + head = kzalloc(sizeof(struct route4_head), GFP_KERNEL); + if (head == NULL) + goto errout; + + tcf_tree_lock(tp); + tp->root = head; + tcf_tree_unlock(tp); + } + + f = kzalloc(sizeof(struct route4_filter), GFP_KERNEL); + if (f == NULL) + goto errout; + + err = route4_set_parms(tp, base, f, handle, head, tb, + tca[TCA_RATE], 1); + if (err < 0) + goto errout; + +reinsert: + h = from_hash(f->handle >> 16); + for (fp = &f->bkt->ht[h]; (f1 = *fp) != NULL; fp = &f1->next) + if (f->handle < f1->handle) + break; + + f->next = f1; + tcf_tree_lock(tp); + *fp = f; + + if (old_handle && f->handle != old_handle) { + th = to_hash(old_handle); + h = from_hash(old_handle >> 16); + b = head->table[th]; + if (b) { + for (fp = &b->ht[h]; *fp; fp = &(*fp)->next) { + if (*fp == f) { + *fp = f->next; + break; + } + } + } + } + tcf_tree_unlock(tp); + + route4_reset_fastmap(tp->q, head, f->id); + *arg = (unsigned long)f; + return 0; + +errout: + kfree(f); + return err; +} + +static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct route4_head *head = tp->root; + unsigned int h, h1; + + if (head == NULL) + arg->stop = 1; + + if (arg->stop) + return; + + for (h = 0; h <= 256; h++) { + struct route4_bucket *b = head->table[h]; + + if (b) { + for (h1 = 0; h1 <= 32; h1++) { + struct route4_filter *f; + + for (f = b->ht[h1]; f; f = f->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(tp, (unsigned long)f, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } + } + } +} + +static int route4_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ 
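+	/*
+	 * Handle layout (set up in route4_set_parms): bits 0-7 carry the TO
+	 * id, with bit 15 set when TO is wildcard; bits 16-31 carry the FROM
+	 * id, the IIF id with bit 31 set, or 0xFFFF for a wildcard source.
+	 */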
+ struct route4_filter *f = (struct route4_filter *)fh; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; + u32 id; + + if (f == NULL) + return skb->len; + + t->tcm_handle = f->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + if (!(f->handle & 0x8000)) { + id = f->id & 0xFF; + NLA_PUT_U32(skb, TCA_ROUTE4_TO, id); + } + if (f->handle & 0x80000000) { + if ((f->handle >> 16) != 0xFFFF) + NLA_PUT_U32(skb, TCA_ROUTE4_IIF, f->iif); + } else { + id = f->id >> 16; + NLA_PUT_U32(skb, TCA_ROUTE4_FROM, id); + } + if (f->res.classid) + NLA_PUT_U32(skb, TCA_ROUTE4_CLASSID, f->res.classid); + + if (tcf_exts_dump(skb, &f->exts, &route_ext_map) < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &f->exts, &route_ext_map) < 0) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tcf_proto_ops cls_route4_ops __read_mostly = { + .kind = "route", + .classify = route4_classify, + .init = route4_init, + .destroy = route4_destroy, + .get = route4_get, + .put = route4_put, + .change = route4_change, + .delete = route4_delete, + .walk = route4_walk, + .dump = route4_dump, + .owner = THIS_MODULE, +}; + +static int __init init_route4(void) +{ + return register_tcf_proto_ops(&cls_route4_ops); +} + +static void __exit exit_route4(void) +{ + unregister_tcf_proto_ops(&cls_route4_ops); +} + +module_init(init_route4) +module_exit(exit_route4) +MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_rsvp.c b/net/sched/cls_rsvp.c new file mode 100644 index 00000000..cbb5e0d6 --- /dev/null +++ b/net/sched/cls_rsvp.c @@ -0,0 +1,28 @@ +/* + * net/sched/cls_rsvp.c Special RSVP packet classifier for IPv4. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RSVP_DST_LEN 1 +#define RSVP_ID "rsvp" +#define RSVP_OPS cls_rsvp_ops + +#include "cls_rsvp.h" +MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h new file mode 100644 index 00000000..402c44b2 --- /dev/null +++ b/net/sched/cls_rsvp.h @@ -0,0 +1,671 @@ +/* + * net/sched/cls_rsvp.h Template file for RSVPv[46] classifiers. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + */ + +/* + Comparing to general packet classification problem, + RSVP needs only sevaral relatively simple rules: + + * (dst, protocol) are always specified, + so that we are able to hash them. + * src may be exact, or may be wildcard, so that + we can keep a hash table plus one wildcard entry. + * source port (or flow label) is important only if src is given. + + IMPLEMENTATION. + + We use a two level hash table: The top level is keyed by + destination address and protocol ID, every bucket contains a list + of "rsvp sessions", identified by destination address, protocol and + DPI(="Destination Port ID"): triple (key, mask, offset). + + Every bucket has a smaller hash table keyed by source address + (cf. 
RSVP flowspec) and one wildcard entry for wildcard reservations. + Every bucket is again a list of "RSVP flows", selected by + source address and SPI(="Source Port ID" here rather than + "security parameter index"): triple (key, mask, offset). + + + NOTE 1. All the packets with IPv6 extension headers (but AH and ESP) + and all fragmented packets go to the best-effort traffic class. + + + NOTE 2. Two "port id"'s seems to be redundant, rfc2207 requires + only one "Generalized Port Identifier". So that for classic + ah, esp (and udp,tcp) both *pi should coincide or one of them + should be wildcard. + + At first sight, this redundancy is just a waste of CPU + resources. But DPI and SPI add the possibility to assign different + priorities to GPIs. Look also at note 4 about tunnels below. + + + NOTE 3. One complication is the case of tunneled packets. + We implement it as following: if the first lookup + matches a special session with "tunnelhdr" value not zero, + flowid doesn't contain the true flow ID, but the tunnel ID (1...255). + In this case, we pull tunnelhdr bytes and restart lookup + with tunnel ID added to the list of keys. Simple and stupid 8)8) + It's enough for PIMREG and IPIP. + + + NOTE 4. Two GPIs make it possible to parse even GRE packets. + F.e. DPI can select ETH_P_IP (and necessary flags to make + tunnelhdr correct) in GRE protocol field and SPI matches + GRE key. Is it not nice? 8)8) + + + Well, as result, despite its simplicity, we get a pretty + powerful classification engine. */ + + +struct rsvp_head { + u32 tmap[256/32]; + u32 hgenerator; + u8 tgenerator; + struct rsvp_session *ht[256]; +}; + +struct rsvp_session { + struct rsvp_session *next; + __be32 dst[RSVP_DST_LEN]; + struct tc_rsvp_gpi dpi; + u8 protocol; + u8 tunnelid; + /* 16 (src,sport) hash slots, and one wildcard source slot */ + struct rsvp_filter *ht[16 + 1]; +}; + + +struct rsvp_filter { + struct rsvp_filter *next; + __be32 src[RSVP_DST_LEN]; + struct tc_rsvp_gpi spi; + u8 tunnelhdr; + + struct tcf_result res; + struct tcf_exts exts; + + u32 handle; + struct rsvp_session *sess; +}; + +static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid) +{ + unsigned int h = (__force __u32)dst[RSVP_DST_LEN - 1]; + + h ^= h>>16; + h ^= h>>8; + return (h ^ protocol ^ tunnelid) & 0xFF; +} + +static inline unsigned int hash_src(__be32 *src) +{ + unsigned int h = (__force __u32)src[RSVP_DST_LEN-1]; + + h ^= h>>16; + h ^= h>>8; + h ^= h>>4; + return h & 0xF; +} + +static struct tcf_ext_map rsvp_ext_map = { + .police = TCA_RSVP_POLICE, + .action = TCA_RSVP_ACT +}; + +#define RSVP_APPLY_RESULT() \ +{ \ + int r = tcf_exts_exec(skb, &f->exts, res); \ + if (r < 0) \ + continue; \ + else if (r > 0) \ + return r; \ +} + +static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht; + struct rsvp_session *s; + struct rsvp_filter *f; + unsigned int h1, h2; + __be32 *dst, *src; + u8 protocol; + u8 tunnelid = 0; + u8 *xprt; +#if RSVP_DST_LEN == 4 + struct ipv6hdr *nhptr; + + if (!pskb_network_may_pull(skb, sizeof(*nhptr))) + return -1; + nhptr = ipv6_hdr(skb); +#else + struct iphdr *nhptr; + + if (!pskb_network_may_pull(skb, sizeof(*nhptr))) + return -1; + nhptr = ip_hdr(skb); +#endif + +restart: + +#if RSVP_DST_LEN == 4 + src = &nhptr->saddr.s6_addr32[0]; + dst = &nhptr->daddr.s6_addr32[0]; + protocol = nhptr->nexthdr; + xprt = ((u8 *)nhptr) + sizeof(struct ipv6hdr); +#else + src = &nhptr->saddr; + dst = &nhptr->daddr; 
+ protocol = nhptr->protocol; + xprt = ((u8 *)nhptr) + (nhptr->ihl<<2); + if (nhptr->frag_off & htons(IP_MF | IP_OFFSET)) + return -1; +#endif + + h1 = hash_dst(dst, protocol, tunnelid); + h2 = hash_src(src); + + for (s = sht[h1]; s; s = s->next) { + if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN - 1] && + protocol == s->protocol && + !(s->dpi.mask & + (*(u32 *)(xprt + s->dpi.offset) ^ s->dpi.key)) && +#if RSVP_DST_LEN == 4 + dst[0] == s->dst[0] && + dst[1] == s->dst[1] && + dst[2] == s->dst[2] && +#endif + tunnelid == s->tunnelid) { + + for (f = s->ht[h2]; f; f = f->next) { + if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN - 1] && + !(f->spi.mask & (*(u32 *)(xprt + f->spi.offset) ^ f->spi.key)) +#if RSVP_DST_LEN == 4 + && + src[0] == f->src[0] && + src[1] == f->src[1] && + src[2] == f->src[2] +#endif + ) { + *res = f->res; + RSVP_APPLY_RESULT(); + +matched: + if (f->tunnelhdr == 0) + return 0; + + tunnelid = f->res.classid; + nhptr = (void *)(xprt + f->tunnelhdr - sizeof(*nhptr)); + goto restart; + } + } + + /* And wildcard bucket... */ + for (f = s->ht[16]; f; f = f->next) { + *res = f->res; + RSVP_APPLY_RESULT(); + goto matched; + } + return -1; + } + } + return -1; +} + +static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle) +{ + struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht; + struct rsvp_session *s; + struct rsvp_filter *f; + unsigned int h1 = handle & 0xFF; + unsigned int h2 = (handle >> 8) & 0xFF; + + if (h2 > 16) + return 0; + + for (s = sht[h1]; s; s = s->next) { + for (f = s->ht[h2]; f; f = f->next) { + if (f->handle == handle) + return (unsigned long)f; + } + } + return 0; +} + +static void rsvp_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int rsvp_init(struct tcf_proto *tp) +{ + struct rsvp_head *data; + + data = kzalloc(sizeof(struct rsvp_head), GFP_KERNEL); + if (data) { + tp->root = data; + return 0; + } + return -ENOBUFS; +} + +static void +rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) +{ + tcf_unbind_filter(tp, &f->res); + tcf_exts_destroy(tp, &f->exts); + kfree(f); +} + +static void rsvp_destroy(struct tcf_proto *tp) +{ + struct rsvp_head *data = xchg(&tp->root, NULL); + struct rsvp_session **sht; + int h1, h2; + + if (data == NULL) + return; + + sht = data->ht; + + for (h1 = 0; h1 < 256; h1++) { + struct rsvp_session *s; + + while ((s = sht[h1]) != NULL) { + sht[h1] = s->next; + + for (h2 = 0; h2 <= 16; h2++) { + struct rsvp_filter *f; + + while ((f = s->ht[h2]) != NULL) { + s->ht[h2] = f->next; + rsvp_delete_filter(tp, f); + } + } + kfree(s); + } + } + kfree(data); +} + +static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct rsvp_filter **fp, *f = (struct rsvp_filter *)arg; + unsigned int h = f->handle; + struct rsvp_session **sp; + struct rsvp_session *s = f->sess; + int i; + + for (fp = &s->ht[(h >> 8) & 0xFF]; *fp; fp = &(*fp)->next) { + if (*fp == f) { + tcf_tree_lock(tp); + *fp = f->next; + tcf_tree_unlock(tp); + rsvp_delete_filter(tp, f); + + /* Strip tree */ + + for (i = 0; i <= 16; i++) + if (s->ht[i]) + return 0; + + /* OK, session has no flows */ + for (sp = &((struct rsvp_head *)tp->root)->ht[h & 0xFF]; + *sp; sp = &(*sp)->next) { + if (*sp == s) { + tcf_tree_lock(tp); + *sp = s->next; + tcf_tree_unlock(tp); + + kfree(s); + return 0; + } + } + + return 0; + } + } + return 0; +} + +static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt) +{ + struct rsvp_head *data = tp->root; + int i = 0xFFFF; + + while (i-- > 0) { + u32 h; + + if ((data->hgenerator += 0x10000) == 0) + 
data->hgenerator = 0x10000; + h = data->hgenerator|salt; + if (rsvp_get(tp, h) == 0) + return h; + } + return 0; +} + +static int tunnel_bts(struct rsvp_head *data) +{ + int n = data->tgenerator >> 5; + u32 b = 1 << (data->tgenerator & 0x1F); + + if (data->tmap[n] & b) + return 0; + data->tmap[n] |= b; + return 1; +} + +static void tunnel_recycle(struct rsvp_head *data) +{ + struct rsvp_session **sht = data->ht; + u32 tmap[256/32]; + int h1, h2; + + memset(tmap, 0, sizeof(tmap)); + + for (h1 = 0; h1 < 256; h1++) { + struct rsvp_session *s; + for (s = sht[h1]; s; s = s->next) { + for (h2 = 0; h2 <= 16; h2++) { + struct rsvp_filter *f; + + for (f = s->ht[h2]; f; f = f->next) { + if (f->tunnelhdr == 0) + continue; + data->tgenerator = f->res.classid; + tunnel_bts(data); + } + } + } + } + + memcpy(data->tmap, tmap, sizeof(tmap)); +} + +static u32 gen_tunnel(struct rsvp_head *data) +{ + int i, k; + + for (k = 0; k < 2; k++) { + for (i = 255; i > 0; i--) { + if (++data->tgenerator == 0) + data->tgenerator = 1; + if (tunnel_bts(data)) + return data->tgenerator; + } + tunnel_recycle(data); + } + return 0; +} + +static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = { + [TCA_RSVP_CLASSID] = { .type = NLA_U32 }, + [TCA_RSVP_DST] = { .type = NLA_BINARY, + .len = RSVP_DST_LEN * sizeof(u32) }, + [TCA_RSVP_SRC] = { .type = NLA_BINARY, + .len = RSVP_DST_LEN * sizeof(u32) }, + [TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) }, +}; + +static int rsvp_change(struct tcf_proto *tp, unsigned long base, + u32 handle, + struct nlattr **tca, + unsigned long *arg) +{ + struct rsvp_head *data = tp->root; + struct rsvp_filter *f, **fp; + struct rsvp_session *s, **sp; + struct tc_rsvp_pinfo *pinfo = NULL; + struct nlattr *opt = tca[TCA_OPTIONS-1]; + struct nlattr *tb[TCA_RSVP_MAX + 1]; + struct tcf_exts e; + unsigned int h1, h2; + __be32 *dst; + int err; + + if (opt == NULL) + return handle ? -EINVAL : 0; + + err = nla_parse_nested(tb, TCA_RSVP_MAX, opt, rsvp_policy); + if (err < 0) + return err; + + err = tcf_exts_validate(tp, tb, tca[TCA_RATE-1], &e, &rsvp_ext_map); + if (err < 0) + return err; + + f = (struct rsvp_filter *)*arg; + if (f) { + /* Node exists: adjust only classid */ + + if (f->handle != handle && handle) + goto errout2; + if (tb[TCA_RSVP_CLASSID-1]) { + f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID-1]); + tcf_bind_filter(tp, &f->res, base); + } + + tcf_exts_change(tp, &f->exts, &e); + return 0; + } + + /* Now more serious part... */ + err = -EINVAL; + if (handle) + goto errout2; + if (tb[TCA_RSVP_DST-1] == NULL) + goto errout2; + + err = -ENOBUFS; + f = kzalloc(sizeof(struct rsvp_filter), GFP_KERNEL); + if (f == NULL) + goto errout2; + + h2 = 16; + if (tb[TCA_RSVP_SRC-1]) { + memcpy(f->src, nla_data(tb[TCA_RSVP_SRC-1]), sizeof(f->src)); + h2 = hash_src(f->src); + } + if (tb[TCA_RSVP_PINFO-1]) { + pinfo = nla_data(tb[TCA_RSVP_PINFO-1]); + f->spi = pinfo->spi; + f->tunnelhdr = pinfo->tunnelhdr; + } + if (tb[TCA_RSVP_CLASSID-1]) + f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID-1]); + + dst = nla_data(tb[TCA_RSVP_DST-1]); + h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? 
pinfo->tunnelid : 0); + + err = -ENOMEM; + if ((f->handle = gen_handle(tp, h1 | (h2<<8))) == 0) + goto errout; + + if (f->tunnelhdr) { + err = -EINVAL; + if (f->res.classid > 255) + goto errout; + + err = -ENOMEM; + if (f->res.classid == 0 && + (f->res.classid = gen_tunnel(data)) == 0) + goto errout; + } + + for (sp = &data->ht[h1]; (s = *sp) != NULL; sp = &s->next) { + if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && + pinfo && pinfo->protocol == s->protocol && + memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 && +#if RSVP_DST_LEN == 4 + dst[0] == s->dst[0] && + dst[1] == s->dst[1] && + dst[2] == s->dst[2] && +#endif + pinfo->tunnelid == s->tunnelid) { + +insert: + /* OK, we found appropriate session */ + + fp = &s->ht[h2]; + + f->sess = s; + if (f->tunnelhdr == 0) + tcf_bind_filter(tp, &f->res, base); + + tcf_exts_change(tp, &f->exts, &e); + + for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next) + if (((*fp)->spi.mask & f->spi.mask) != f->spi.mask) + break; + f->next = *fp; + wmb(); + *fp = f; + + *arg = (unsigned long)f; + return 0; + } + } + + /* No session found. Create new one. */ + + err = -ENOBUFS; + s = kzalloc(sizeof(struct rsvp_session), GFP_KERNEL); + if (s == NULL) + goto errout; + memcpy(s->dst, dst, sizeof(s->dst)); + + if (pinfo) { + s->dpi = pinfo->dpi; + s->protocol = pinfo->protocol; + s->tunnelid = pinfo->tunnelid; + } + for (sp = &data->ht[h1]; *sp; sp = &(*sp)->next) { + if (((*sp)->dpi.mask&s->dpi.mask) != s->dpi.mask) + break; + } + s->next = *sp; + wmb(); + *sp = s; + + goto insert; + +errout: + kfree(f); +errout2: + tcf_exts_destroy(tp, &e); + return err; +} + +static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct rsvp_head *head = tp->root; + unsigned int h, h1; + + if (arg->stop) + return; + + for (h = 0; h < 256; h++) { + struct rsvp_session *s; + + for (s = head->ht[h]; s; s = s->next) { + for (h1 = 0; h1 <= 16; h1++) { + struct rsvp_filter *f; + + for (f = s->ht[h1]; f; f = f->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(tp, (unsigned long)f, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } + } + } +} + +static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct rsvp_filter *f = (struct rsvp_filter *)fh; + struct rsvp_session *s; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; + struct tc_rsvp_pinfo pinfo; + + if (f == NULL) + return skb->len; + s = f->sess; + + t->tcm_handle = f->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + NLA_PUT(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst); + pinfo.dpi = s->dpi; + pinfo.spi = f->spi; + pinfo.protocol = s->protocol; + pinfo.tunnelid = s->tunnelid; + pinfo.tunnelhdr = f->tunnelhdr; + pinfo.pad = 0; + NLA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); + if (f->res.classid) + NLA_PUT_U32(skb, TCA_RSVP_CLASSID, f->res.classid); + if (((f->handle >> 8) & 0xFF) != 16) + NLA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src); + + if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &f->exts, &rsvp_ext_map) < 0) + goto nla_put_failure; + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tcf_proto_ops RSVP_OPS = { + .next = NULL, + .kind = RSVP_ID, + .classify = rsvp_classify, + .init = rsvp_init, + .destroy = rsvp_destroy, + .get = rsvp_get, + .put = rsvp_put, + .change = rsvp_change, + .delete = 
rsvp_delete, + .walk = rsvp_walk, + .dump = rsvp_dump, + .owner = THIS_MODULE, +}; + +static int __init init_rsvp(void) +{ + return register_tcf_proto_ops(&RSVP_OPS); +} + +static void __exit exit_rsvp(void) +{ + unregister_tcf_proto_ops(&RSVP_OPS); +} + +module_init(init_rsvp) +module_exit(exit_rsvp) diff --git a/net/sched/cls_rsvp6.c b/net/sched/cls_rsvp6.c new file mode 100644 index 00000000..dd08aea2 --- /dev/null +++ b/net/sched/cls_rsvp6.c @@ -0,0 +1,28 @@ +/* + * net/sched/cls_rsvp6.c Special RSVP packet classifier for IPv6. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RSVP_DST_LEN 4 +#define RSVP_ID "rsvp6" +#define RSVP_OPS cls_rsvp6_ops + +#include "cls_rsvp.h" +MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c new file mode 100644 index 00000000..36667fa6 --- /dev/null +++ b/net/sched/cls_tcindex.c @@ -0,0 +1,507 @@ +/* + * net/sched/cls_tcindex.c Packet classifier for skb->tc_index + * + * Written 1998,1999 by Werner Almesberger, EPFL ICA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Passing parameters to the root seems to be done more awkwardly than really + * necessary. At least, u32 doesn't seem to use such dirty hacks. To be + * verified. FIXME. + */ + +#define PERFECT_HASH_THRESHOLD 64 /* use perfect hash if not bigger */ +#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */ + + +#define PRIV(tp) ((struct tcindex_data *) (tp)->root) + + +struct tcindex_filter_result { + struct tcf_exts exts; + struct tcf_result res; +}; + +struct tcindex_filter { + u16 key; + struct tcindex_filter_result result; + struct tcindex_filter *next; +}; + + +struct tcindex_data { + struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */ + struct tcindex_filter **h; /* imperfect hash; only used if !perfect; + NULL if unused */ + u16 mask; /* AND key with mask */ + int shift; /* shift ANDed key to the right */ + int hash; /* hash table size; 0 if undefined */ + int alloc_hash; /* allocated size */ + int fall_through; /* 0: only classify if explicit match */ +}; + +static const struct tcf_ext_map tcindex_ext_map = { + .police = TCA_TCINDEX_POLICE, + .action = TCA_TCINDEX_ACT +}; + +static inline int +tcindex_filter_is_set(struct tcindex_filter_result *r) +{ + return tcf_exts_is_predicative(&r->exts) || r->res.classid; +} + +static struct tcindex_filter_result * +tcindex_lookup(struct tcindex_data *p, u16 key) +{ + struct tcindex_filter *f; + + if (p->perfect) + return tcindex_filter_is_set(p->perfect + key) ? 
+ p->perfect + key : NULL; + else if (p->h) { + for (f = p->h[key % p->hash]; f; f = f->next) + if (f->key == key) + return &f->result; + } + + return NULL; +} + + +static int tcindex_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + struct tcindex_data *p = PRIV(tp); + struct tcindex_filter_result *f; + int key = (skb->tc_index & p->mask) >> p->shift; + + pr_debug("tcindex_classify(skb %p,tp %p,res %p),p %p\n", + skb, tp, res, p); + + f = tcindex_lookup(p, key); + if (!f) { + if (!p->fall_through) + return -1; + res->classid = TC_H_MAKE(TC_H_MAJ(tp->q->handle), key); + res->class = 0; + pr_debug("alg 0x%x\n", res->classid); + return 0; + } + *res = f->res; + pr_debug("map 0x%x\n", res->classid); + + return tcf_exts_exec(skb, &f->exts, res); +} + + +static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle) +{ + struct tcindex_data *p = PRIV(tp); + struct tcindex_filter_result *r; + + pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle); + if (p->perfect && handle >= p->alloc_hash) + return 0; + r = tcindex_lookup(p, handle); + return r && tcindex_filter_is_set(r) ? (unsigned long) r : 0UL; +} + + +static void tcindex_put(struct tcf_proto *tp, unsigned long f) +{ + pr_debug("tcindex_put(tp %p,f 0x%lx)\n", tp, f); +} + + +static int tcindex_init(struct tcf_proto *tp) +{ + struct tcindex_data *p; + + pr_debug("tcindex_init(tp %p)\n", tp); + p = kzalloc(sizeof(struct tcindex_data), GFP_KERNEL); + if (!p) + return -ENOMEM; + + p->mask = 0xffff; + p->hash = DEFAULT_HASH_SIZE; + p->fall_through = 1; + + tp->root = p; + return 0; +} + + +static int +__tcindex_delete(struct tcf_proto *tp, unsigned long arg, int lock) +{ + struct tcindex_data *p = PRIV(tp); + struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg; + struct tcindex_filter *f = NULL; + + pr_debug("tcindex_delete(tp %p,arg 0x%lx),p %p,f %p\n", tp, arg, p, f); + if (p->perfect) { + if (!r->res.class) + return -ENOENT; + } else { + int i; + struct tcindex_filter **walk = NULL; + + for (i = 0; i < p->hash; i++) + for (walk = p->h+i; *walk; walk = &(*walk)->next) + if (&(*walk)->result == r) + goto found; + return -ENOENT; + +found: + f = *walk; + if (lock) + tcf_tree_lock(tp); + *walk = f->next; + if (lock) + tcf_tree_unlock(tp); + } + tcf_unbind_filter(tp, &r->res); + tcf_exts_destroy(tp, &r->exts); + kfree(f); + return 0; +} + +static int tcindex_delete(struct tcf_proto *tp, unsigned long arg) +{ + return __tcindex_delete(tp, arg, 1); +} + +static inline int +valid_perfect_hash(struct tcindex_data *p) +{ + return p->hash > (p->mask >> p->shift); +} + +static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = { + [TCA_TCINDEX_HASH] = { .type = NLA_U32 }, + [TCA_TCINDEX_MASK] = { .type = NLA_U16 }, + [TCA_TCINDEX_SHIFT] = { .type = NLA_U32 }, + [TCA_TCINDEX_FALL_THROUGH] = { .type = NLA_U32 }, + [TCA_TCINDEX_CLASSID] = { .type = NLA_U32 }, +}; + +static int +tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, + struct tcindex_data *p, struct tcindex_filter_result *r, + struct nlattr **tb, struct nlattr *est) +{ + int err, balloc = 0; + struct tcindex_filter_result new_filter_result, *old_r = r; + struct tcindex_filter_result cr; + struct tcindex_data cp; + struct tcindex_filter *f = NULL; /* make gcc behave */ + struct tcf_exts e; + + err = tcf_exts_validate(tp, tb, est, &e, &tcindex_ext_map); + if (err < 0) + return err; + + memcpy(&cp, p, sizeof(cp)); + memset(&new_filter_result, 0, sizeof(new_filter_result)); + + if (old_r) + memcpy(&cr, r, 
sizeof(cr)); + else + memset(&cr, 0, sizeof(cr)); + + if (tb[TCA_TCINDEX_HASH]) + cp.hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); + + if (tb[TCA_TCINDEX_MASK]) + cp.mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); + + if (tb[TCA_TCINDEX_SHIFT]) + cp.shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); + + err = -EBUSY; + /* Hash already allocated, make sure that we still meet the + * requirements for the allocated hash. + */ + if (cp.perfect) { + if (!valid_perfect_hash(&cp) || + cp.hash > cp.alloc_hash) + goto errout; + } else if (cp.h && cp.hash != cp.alloc_hash) + goto errout; + + err = -EINVAL; + if (tb[TCA_TCINDEX_FALL_THROUGH]) + cp.fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]); + + if (!cp.hash) { + /* Hash not specified, use perfect hash if the upper limit + * of the hashing index is below the threshold. + */ + if ((cp.mask >> cp.shift) < PERFECT_HASH_THRESHOLD) + cp.hash = (cp.mask >> cp.shift) + 1; + else + cp.hash = DEFAULT_HASH_SIZE; + } + + if (!cp.perfect && !cp.h) + cp.alloc_hash = cp.hash; + + /* Note: this could be as restrictive as if (handle & ~(mask >> shift)) + * but then, we'd fail handles that may become valid after some future + * mask change. While this is extremely unlikely to ever matter, + * the check below is safer (and also more backwards-compatible). + */ + if (cp.perfect || valid_perfect_hash(&cp)) + if (handle >= cp.alloc_hash) + goto errout; + + + err = -ENOMEM; + if (!cp.perfect && !cp.h) { + if (valid_perfect_hash(&cp)) { + cp.perfect = kcalloc(cp.hash, sizeof(*r), GFP_KERNEL); + if (!cp.perfect) + goto errout; + balloc = 1; + } else { + cp.h = kcalloc(cp.hash, sizeof(f), GFP_KERNEL); + if (!cp.h) + goto errout; + balloc = 2; + } + } + + if (cp.perfect) + r = cp.perfect + handle; + else + r = tcindex_lookup(&cp, handle) ? : &new_filter_result; + + if (r == &new_filter_result) { + f = kzalloc(sizeof(*f), GFP_KERNEL); + if (!f) + goto errout_alloc; + } + + if (tb[TCA_TCINDEX_CLASSID]) { + cr.res.classid = nla_get_u32(tb[TCA_TCINDEX_CLASSID]); + tcf_bind_filter(tp, &cr.res, base); + } + + tcf_exts_change(tp, &cr.exts, &e); + + tcf_tree_lock(tp); + if (old_r && old_r != r) + memset(old_r, 0, sizeof(*old_r)); + + memcpy(p, &cp, sizeof(cp)); + memcpy(r, &cr, sizeof(cr)); + + if (r == &new_filter_result) { + struct tcindex_filter **fp; + + f->key = handle; + f->result = new_filter_result; + f->next = NULL; + for (fp = p->h+(handle % p->hash); *fp; fp = &(*fp)->next) + /* nothing */; + *fp = f; + } + tcf_tree_unlock(tp); + + return 0; + +errout_alloc: + if (balloc == 1) + kfree(cp.perfect); + else if (balloc == 2) + kfree(cp.h); +errout: + tcf_exts_destroy(tp, &e); + return err; +} + +static int +tcindex_change(struct tcf_proto *tp, unsigned long base, u32 handle, + struct nlattr **tca, unsigned long *arg) +{ + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_TCINDEX_MAX + 1]; + struct tcindex_data *p = PRIV(tp); + struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg; + int err; + + pr_debug("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p," + "p %p,r %p,*arg 0x%lx\n", + tp, handle, tca, arg, opt, p, r, arg ? 
*arg : 0L); + + if (!opt) + return 0; + + err = nla_parse_nested(tb, TCA_TCINDEX_MAX, opt, tcindex_policy); + if (err < 0) + return err; + + return tcindex_set_parms(tp, base, handle, p, r, tb, tca[TCA_RATE]); +} + + +static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) +{ + struct tcindex_data *p = PRIV(tp); + struct tcindex_filter *f, *next; + int i; + + pr_debug("tcindex_walk(tp %p,walker %p),p %p\n", tp, walker, p); + if (p->perfect) { + for (i = 0; i < p->hash; i++) { + if (!p->perfect[i].res.class) + continue; + if (walker->count >= walker->skip) { + if (walker->fn(tp, + (unsigned long) (p->perfect+i), walker) + < 0) { + walker->stop = 1; + return; + } + } + walker->count++; + } + } + if (!p->h) + return; + for (i = 0; i < p->hash; i++) { + for (f = p->h[i]; f; f = next) { + next = f->next; + if (walker->count >= walker->skip) { + if (walker->fn(tp, (unsigned long) &f->result, + walker) < 0) { + walker->stop = 1; + return; + } + } + walker->count++; + } + } +} + + +static int tcindex_destroy_element(struct tcf_proto *tp, + unsigned long arg, struct tcf_walker *walker) +{ + return __tcindex_delete(tp, arg, 0); +} + + +static void tcindex_destroy(struct tcf_proto *tp) +{ + struct tcindex_data *p = PRIV(tp); + struct tcf_walker walker; + + pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p); + walker.count = 0; + walker.skip = 0; + walker.fn = &tcindex_destroy_element; + tcindex_walk(tp, &walker); + kfree(p->perfect); + kfree(p->h); + kfree(p); + tp->root = NULL; +} + + +static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct tcindex_data *p = PRIV(tp); + struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; + + pr_debug("tcindex_dump(tp %p,fh 0x%lx,skb %p,t %p),p %p,r %p,b %p\n", + tp, fh, skb, t, p, r, b); + pr_debug("p->perfect %p p->h %p\n", p->perfect, p->h); + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + if (!fh) { + t->tcm_handle = ~0; /* whatever ... 
*/ + NLA_PUT_U32(skb, TCA_TCINDEX_HASH, p->hash); + NLA_PUT_U16(skb, TCA_TCINDEX_MASK, p->mask); + NLA_PUT_U32(skb, TCA_TCINDEX_SHIFT, p->shift); + NLA_PUT_U32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through); + nla_nest_end(skb, nest); + } else { + if (p->perfect) { + t->tcm_handle = r-p->perfect; + } else { + struct tcindex_filter *f; + int i; + + t->tcm_handle = 0; + for (i = 0; !t->tcm_handle && i < p->hash; i++) { + for (f = p->h[i]; !t->tcm_handle && f; + f = f->next) { + if (&f->result == r) + t->tcm_handle = f->key; + } + } + } + pr_debug("handle = %d\n", t->tcm_handle); + if (r->res.class) + NLA_PUT_U32(skb, TCA_TCINDEX_CLASSID, r->res.classid); + + if (tcf_exts_dump(skb, &r->exts, &tcindex_ext_map) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &r->exts, &tcindex_ext_map) < 0) + goto nla_put_failure; + } + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tcf_proto_ops cls_tcindex_ops __read_mostly = { + .kind = "tcindex", + .classify = tcindex_classify, + .init = tcindex_init, + .destroy = tcindex_destroy, + .get = tcindex_get, + .put = tcindex_put, + .change = tcindex_change, + .delete = tcindex_delete, + .walk = tcindex_walk, + .dump = tcindex_dump, + .owner = THIS_MODULE, +}; + +static int __init init_tcindex(void) +{ + return register_tcf_proto_ops(&cls_tcindex_ops); +} + +static void __exit exit_tcindex(void) +{ + unregister_tcf_proto_ops(&cls_tcindex_ops); +} + +module_init(init_tcindex) +module_exit(exit_tcindex) +MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c new file mode 100644 index 00000000..3b93fc0c --- /dev/null +++ b/net/sched/cls_u32.c @@ -0,0 +1,817 @@ +/* + * net/sched/cls_u32.c Ugly (or Universal) 32bit key Packet Classifier. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * + * The filters are packed to hash tables of key nodes + * with a set of 32bit key/mask pairs at every node. + * Nodes reference next level hash tables etc. + * + * This scheme is the best universal classifier I managed to + * invent; it is not super-fast, but it is not slow (provided you + * program it correctly), and general enough. And its relative + * speed grows as the number of rules becomes larger. + * + * It seems that it represents the best middle point between + * speed and manageability both by human and by machine. + * + * It is especially useful for link sharing combined with QoS; + * pure RSVP doesn't need such a general approach and can use + * much simpler (and faster) schemes, sort of cls_rsvp.c. 
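+ *
+ * Roughly, every key node carries (val, mask, off) triples, and a
+ * packet matches a key when the 32-bit word at the key's offset
+ * satisfies ((word ^ val) & mask) == 0, with val and mask kept in
+ * network byte order. For instance, a key with off = 8,
+ * mask = htonl(0x00ff0000) and val = htonl(0x00110000) selects IPv4
+ * packets whose protocol byte is 17 (UDP), assuming the selector
+ * starts at the IP header.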
+ * + * JHS: We should remove the CONFIG_NET_CLS_IND from here + * eventually when the meta match extension is made available + * + * nfmark match added by Catalin(ux aka Dino) BOIE + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct tc_u_knode { + struct tc_u_knode *next; + u32 handle; + struct tc_u_hnode *ht_up; + struct tcf_exts exts; +#ifdef CONFIG_NET_CLS_IND + char indev[IFNAMSIZ]; +#endif + u8 fshift; + struct tcf_result res; + struct tc_u_hnode *ht_down; +#ifdef CONFIG_CLS_U32_PERF + struct tc_u32_pcnt *pf; +#endif +#ifdef CONFIG_CLS_U32_MARK + struct tc_u32_mark mark; +#endif + struct tc_u32_sel sel; +}; + +struct tc_u_hnode { + struct tc_u_hnode *next; + u32 handle; + u32 prio; + struct tc_u_common *tp_c; + int refcnt; + unsigned int divisor; + struct tc_u_knode *ht[1]; +}; + +struct tc_u_common { + struct tc_u_hnode *hlist; + struct Qdisc *q; + int refcnt; + u32 hgenerator; +}; + +static const struct tcf_ext_map u32_ext_map = { + .action = TCA_U32_ACT, + .police = TCA_U32_POLICE +}; + +static inline unsigned int u32_hash_fold(__be32 key, + const struct tc_u32_sel *sel, + u8 fshift) +{ + unsigned int h = ntohl(key & sel->hmask) >> fshift; + + return h; +} + +static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res) +{ + struct { + struct tc_u_knode *knode; + unsigned int off; + } stack[TC_U32_MAXDEPTH]; + + struct tc_u_hnode *ht = (struct tc_u_hnode *)tp->root; + unsigned int off = skb_network_offset(skb); + struct tc_u_knode *n; + int sdepth = 0; + int off2 = 0; + int sel = 0; +#ifdef CONFIG_CLS_U32_PERF + int j; +#endif + int i, r; + +next_ht: + n = ht->ht[sel]; + +next_knode: + if (n) { + struct tc_u32_key *key = n->sel.keys; + +#ifdef CONFIG_CLS_U32_PERF + n->pf->rcnt += 1; + j = 0; +#endif + +#ifdef CONFIG_CLS_U32_MARK + if ((skb->mark & n->mark.mask) != n->mark.val) { + n = n->next; + goto next_knode; + } else { + n->mark.success++; + } +#endif + + for (i = n->sel.nkeys; i > 0; i--, key++) { + int toff = off + key->off + (off2 & key->offmask); + __be32 *data, hdata; + + if (skb_headroom(skb) + toff > INT_MAX) + goto out; + + data = skb_header_pointer(skb, toff, 4, &hdata); + if (!data) + goto out; + if ((*data ^ key->val) & key->mask) { + n = n->next; + goto next_knode; + } +#ifdef CONFIG_CLS_U32_PERF + n->pf->kcnts[j] += 1; + j++; +#endif + } + if (n->ht_down == NULL) { +check_terminal: + if (n->sel.flags & TC_U32_TERMINAL) { + + *res = n->res; +#ifdef CONFIG_NET_CLS_IND + if (!tcf_match_indev(skb, n->indev)) { + n = n->next; + goto next_knode; + } +#endif +#ifdef CONFIG_CLS_U32_PERF + n->pf->rhit += 1; +#endif + r = tcf_exts_exec(skb, &n->exts, res); + if (r < 0) { + n = n->next; + goto next_knode; + } + + return r; + } + n = n->next; + goto next_knode; + } + + /* PUSH */ + if (sdepth >= TC_U32_MAXDEPTH) + goto deadloop; + stack[sdepth].knode = n; + stack[sdepth].off = off; + sdepth++; + + ht = n->ht_down; + sel = 0; + if (ht->divisor) { + __be32 *data, hdata; + + data = skb_header_pointer(skb, off + n->sel.hoff, 4, + &hdata); + if (!data) + goto out; + sel = ht->divisor & u32_hash_fold(*data, &n->sel, + n->fshift); + } + if (!(n->sel.flags & (TC_U32_VAROFFSET | TC_U32_OFFSET | TC_U32_EAT))) + goto next_ht; + + if (n->sel.flags & (TC_U32_OFFSET | TC_U32_VAROFFSET)) { + off2 = n->sel.off + 3; + if (n->sel.flags & TC_U32_VAROFFSET) { + __be16 *data, hdata; + + data = skb_header_pointer(skb, + off + n->sel.offoff, + 2, &hdata); + if (!data) + goto out; + off2 += 
ntohs(n->sel.offmask & *data) >> + n->sel.offshift; + } + off2 &= ~3; + } + if (n->sel.flags & TC_U32_EAT) { + off += off2; + off2 = 0; + } + + if (off < skb->len) + goto next_ht; + } + + /* POP */ + if (sdepth--) { + n = stack[sdepth].knode; + ht = n->ht_up; + off = stack[sdepth].off; + goto check_terminal; + } +out: + return -1; + +deadloop: + if (net_ratelimit()) + pr_warning("cls_u32: dead loop\n"); + return -1; +} + +static struct tc_u_hnode * +u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) +{ + struct tc_u_hnode *ht; + + for (ht = tp_c->hlist; ht; ht = ht->next) + if (ht->handle == handle) + break; + + return ht; +} + +static struct tc_u_knode * +u32_lookup_key(struct tc_u_hnode *ht, u32 handle) +{ + unsigned int sel; + struct tc_u_knode *n = NULL; + + sel = TC_U32_HASH(handle); + if (sel > ht->divisor) + goto out; + + for (n = ht->ht[sel]; n; n = n->next) + if (n->handle == handle) + break; +out: + return n; +} + + +static unsigned long u32_get(struct tcf_proto *tp, u32 handle) +{ + struct tc_u_hnode *ht; + struct tc_u_common *tp_c = tp->data; + + if (TC_U32_HTID(handle) == TC_U32_ROOT) + ht = tp->root; + else + ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle)); + + if (!ht) + return 0; + + if (TC_U32_KEY(handle) == 0) + return (unsigned long)ht; + + return (unsigned long)u32_lookup_key(ht, handle); +} + +static void u32_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static u32 gen_new_htid(struct tc_u_common *tp_c) +{ + int i = 0x800; + + do { + if (++tp_c->hgenerator == 0x7FF) + tp_c->hgenerator = 1; + } while (--i > 0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20)); + + return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0; +} + +static int u32_init(struct tcf_proto *tp) +{ + struct tc_u_hnode *root_ht; + struct tc_u_common *tp_c; + + tp_c = tp->q->u32_node; + + root_ht = kzalloc(sizeof(*root_ht), GFP_KERNEL); + if (root_ht == NULL) + return -ENOBUFS; + + root_ht->divisor = 0; + root_ht->refcnt++; + root_ht->handle = tp_c ? 
gen_new_htid(tp_c) : 0x80000000; + root_ht->prio = tp->prio; + + if (tp_c == NULL) { + tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL); + if (tp_c == NULL) { + kfree(root_ht); + return -ENOBUFS; + } + tp_c->q = tp->q; + tp->q->u32_node = tp_c; + } + + tp_c->refcnt++; + root_ht->next = tp_c->hlist; + tp_c->hlist = root_ht; + root_ht->tp_c = tp_c; + + tp->root = root_ht; + tp->data = tp_c; + return 0; +} + +static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n) +{ + tcf_unbind_filter(tp, &n->res); + tcf_exts_destroy(tp, &n->exts); + if (n->ht_down) + n->ht_down->refcnt--; +#ifdef CONFIG_CLS_U32_PERF + kfree(n->pf); +#endif + kfree(n); + return 0; +} + +static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key) +{ + struct tc_u_knode **kp; + struct tc_u_hnode *ht = key->ht_up; + + if (ht) { + for (kp = &ht->ht[TC_U32_HASH(key->handle)]; *kp; kp = &(*kp)->next) { + if (*kp == key) { + tcf_tree_lock(tp); + *kp = key->next; + tcf_tree_unlock(tp); + + u32_destroy_key(tp, key); + return 0; + } + } + } + WARN_ON(1); + return 0; +} + +static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) +{ + struct tc_u_knode *n; + unsigned int h; + + for (h = 0; h <= ht->divisor; h++) { + while ((n = ht->ht[h]) != NULL) { + ht->ht[h] = n->next; + + u32_destroy_key(tp, n); + } + } +} + +static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode **hn; + + WARN_ON(ht->refcnt); + + u32_clear_hnode(tp, ht); + + for (hn = &tp_c->hlist; *hn; hn = &(*hn)->next) { + if (*hn == ht) { + *hn = ht->next; + kfree(ht); + return 0; + } + } + + WARN_ON(1); + return -ENOENT; +} + +static void u32_destroy(struct tcf_proto *tp) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode *root_ht = tp->root; + + WARN_ON(root_ht == NULL); + + if (root_ht && --root_ht->refcnt == 0) + u32_destroy_hnode(tp, root_ht); + + if (--tp_c->refcnt == 0) { + struct tc_u_hnode *ht; + + tp->q->u32_node = NULL; + + for (ht = tp_c->hlist; ht; ht = ht->next) { + ht->refcnt--; + u32_clear_hnode(tp, ht); + } + + while ((ht = tp_c->hlist) != NULL) { + tp_c->hlist = ht->next; + + WARN_ON(ht->refcnt != 0); + + kfree(ht); + } + + kfree(tp_c); + } + + tp->data = NULL; +} + +static int u32_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct tc_u_hnode *ht = (struct tc_u_hnode *)arg; + + if (ht == NULL) + return 0; + + if (TC_U32_KEY(ht->handle)) + return u32_delete_key(tp, (struct tc_u_knode *)ht); + + if (tp->root == ht) + return -EINVAL; + + if (ht->refcnt == 1) { + ht->refcnt--; + u32_destroy_hnode(tp, ht); + } else { + return -EBUSY; + } + + return 0; +} + +static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) +{ + struct tc_u_knode *n; + unsigned int i = 0x7FF; + + for (n = ht->ht[TC_U32_HASH(handle)]; n; n = n->next) + if (i < TC_U32_NODE(n->handle)) + i = TC_U32_NODE(n->handle); + i++; + + return handle | (i > 0xFFF ? 
0xFFF : i); +} + +static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { + [TCA_U32_CLASSID] = { .type = NLA_U32 }, + [TCA_U32_HASH] = { .type = NLA_U32 }, + [TCA_U32_LINK] = { .type = NLA_U32 }, + [TCA_U32_DIVISOR] = { .type = NLA_U32 }, + [TCA_U32_SEL] = { .len = sizeof(struct tc_u32_sel) }, + [TCA_U32_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ }, + [TCA_U32_MARK] = { .len = sizeof(struct tc_u32_mark) }, +}; + +static int u32_set_parms(struct tcf_proto *tp, unsigned long base, + struct tc_u_hnode *ht, + struct tc_u_knode *n, struct nlattr **tb, + struct nlattr *est) +{ + int err; + struct tcf_exts e; + + err = tcf_exts_validate(tp, tb, est, &e, &u32_ext_map); + if (err < 0) + return err; + + err = -EINVAL; + if (tb[TCA_U32_LINK]) { + u32 handle = nla_get_u32(tb[TCA_U32_LINK]); + struct tc_u_hnode *ht_down = NULL, *ht_old; + + if (TC_U32_KEY(handle)) + goto errout; + + if (handle) { + ht_down = u32_lookup_ht(ht->tp_c, handle); + + if (ht_down == NULL) + goto errout; + ht_down->refcnt++; + } + + tcf_tree_lock(tp); + ht_old = n->ht_down; + n->ht_down = ht_down; + tcf_tree_unlock(tp); + + if (ht_old) + ht_old->refcnt--; + } + if (tb[TCA_U32_CLASSID]) { + n->res.classid = nla_get_u32(tb[TCA_U32_CLASSID]); + tcf_bind_filter(tp, &n->res, base); + } + +#ifdef CONFIG_NET_CLS_IND + if (tb[TCA_U32_INDEV]) { + err = tcf_change_indev(tp, n->indev, tb[TCA_U32_INDEV]); + if (err < 0) + goto errout; + } +#endif + tcf_exts_change(tp, &n->exts, &e); + + return 0; +errout: + tcf_exts_destroy(tp, &e); + return err; +} + +static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, + struct nlattr **tca, + unsigned long *arg) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode *ht; + struct tc_u_knode *n; + struct tc_u32_sel *s; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_U32_MAX + 1]; + u32 htid; + int err; + + if (opt == NULL) + return handle ? 
-EINVAL : 0; + + err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy); + if (err < 0) + return err; + + n = (struct tc_u_knode *)*arg; + if (n) { + if (TC_U32_KEY(n->handle) == 0) + return -EINVAL; + + return u32_set_parms(tp, base, n->ht_up, n, tb, tca[TCA_RATE]); + } + + if (tb[TCA_U32_DIVISOR]) { + unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]); + + if (--divisor > 0x100) + return -EINVAL; + if (TC_U32_KEY(handle)) + return -EINVAL; + if (handle == 0) { + handle = gen_new_htid(tp->data); + if (handle == 0) + return -ENOMEM; + } + ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL); + if (ht == NULL) + return -ENOBUFS; + ht->tp_c = tp_c; + ht->refcnt = 1; + ht->divisor = divisor; + ht->handle = handle; + ht->prio = tp->prio; + ht->next = tp_c->hlist; + tp_c->hlist = ht; + *arg = (unsigned long)ht; + return 0; + } + + if (tb[TCA_U32_HASH]) { + htid = nla_get_u32(tb[TCA_U32_HASH]); + if (TC_U32_HTID(htid) == TC_U32_ROOT) { + ht = tp->root; + htid = ht->handle; + } else { + ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid)); + if (ht == NULL) + return -EINVAL; + } + } else { + ht = tp->root; + htid = ht->handle; + } + + if (ht->divisor < TC_U32_HASH(htid)) + return -EINVAL; + + if (handle) { + if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid)) + return -EINVAL; + handle = htid | TC_U32_NODE(handle); + } else + handle = gen_new_kid(ht, htid); + + if (tb[TCA_U32_SEL] == NULL) + return -EINVAL; + + s = nla_data(tb[TCA_U32_SEL]); + + n = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL); + if (n == NULL) + return -ENOBUFS; + +#ifdef CONFIG_CLS_U32_PERF + n->pf = kzalloc(sizeof(struct tc_u32_pcnt) + s->nkeys*sizeof(u64), GFP_KERNEL); + if (n->pf == NULL) { + kfree(n); + return -ENOBUFS; + } +#endif + + memcpy(&n->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); + n->ht_up = ht; + n->handle = handle; + n->fshift = s->hmask ? 
ffs(ntohl(s->hmask)) - 1 : 0; + +#ifdef CONFIG_CLS_U32_MARK + if (tb[TCA_U32_MARK]) { + struct tc_u32_mark *mark; + + mark = nla_data(tb[TCA_U32_MARK]); + memcpy(&n->mark, mark, sizeof(struct tc_u32_mark)); + n->mark.success = 0; + } +#endif + + err = u32_set_parms(tp, base, ht, n, tb, tca[TCA_RATE]); + if (err == 0) { + struct tc_u_knode **ins; + for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next) + if (TC_U32_NODE(handle) < TC_U32_NODE((*ins)->handle)) + break; + + n->next = *ins; + tcf_tree_lock(tp); + *ins = n; + tcf_tree_unlock(tp); + + *arg = (unsigned long)n; + return 0; + } +#ifdef CONFIG_CLS_U32_PERF + kfree(n->pf); +#endif + kfree(n); + return err; +} + +static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode *ht; + struct tc_u_knode *n; + unsigned int h; + + if (arg->stop) + return; + + for (ht = tp_c->hlist; ht; ht = ht->next) { + if (ht->prio != tp->prio) + continue; + if (arg->count >= arg->skip) { + if (arg->fn(tp, (unsigned long)ht, arg) < 0) { + arg->stop = 1; + return; + } + } + arg->count++; + for (h = 0; h <= ht->divisor; h++) { + for (n = ht->ht[h]; n; n = n->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(tp, (unsigned long)n, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } + } +} + +static int u32_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct tc_u_knode *n = (struct tc_u_knode *)fh; + struct nlattr *nest; + + if (n == NULL) + return skb->len; + + t->tcm_handle = n->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + if (TC_U32_KEY(n->handle) == 0) { + struct tc_u_hnode *ht = (struct tc_u_hnode *)fh; + u32 divisor = ht->divisor + 1; + + NLA_PUT_U32(skb, TCA_U32_DIVISOR, divisor); + } else { + NLA_PUT(skb, TCA_U32_SEL, + sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), + &n->sel); + if (n->ht_up) { + u32 htid = n->handle & 0xFFFFF000; + NLA_PUT_U32(skb, TCA_U32_HASH, htid); + } + if (n->res.classid) + NLA_PUT_U32(skb, TCA_U32_CLASSID, n->res.classid); + if (n->ht_down) + NLA_PUT_U32(skb, TCA_U32_LINK, n->ht_down->handle); + +#ifdef CONFIG_CLS_U32_MARK + if (n->mark.val || n->mark.mask) + NLA_PUT(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark); +#endif + + if (tcf_exts_dump(skb, &n->exts, &u32_ext_map) < 0) + goto nla_put_failure; + +#ifdef CONFIG_NET_CLS_IND + if (strlen(n->indev)) + NLA_PUT_STRING(skb, TCA_U32_INDEV, n->indev); +#endif +#ifdef CONFIG_CLS_U32_PERF + NLA_PUT(skb, TCA_U32_PCNT, + sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64), + n->pf); +#endif + } + + nla_nest_end(skb, nest); + + if (TC_U32_KEY(n->handle)) + if (tcf_exts_dump_stats(skb, &n->exts, &u32_ext_map) < 0) + goto nla_put_failure; + return skb->len; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static struct tcf_proto_ops cls_u32_ops __read_mostly = { + .kind = "u32", + .classify = u32_classify, + .init = u32_init, + .destroy = u32_destroy, + .get = u32_get, + .put = u32_put, + .change = u32_change, + .delete = u32_delete, + .walk = u32_walk, + .dump = u32_dump, + .owner = THIS_MODULE, +}; + +static int __init init_u32(void) +{ + pr_info("u32 classifier\n"); +#ifdef CONFIG_CLS_U32_PERF + pr_info(" Performance counters on\n"); +#endif +#ifdef CONFIG_NET_CLS_IND + pr_info(" input device check on\n"); +#endif +#ifdef CONFIG_NET_CLS_ACT + pr_info(" Actions configured\n"); +#endif + return 
register_tcf_proto_ops(&cls_u32_ops); +} + +static void __exit exit_u32(void) +{ + unregister_tcf_proto_ops(&cls_u32_ops); +} + +module_init(init_u32) +module_exit(exit_u32) +MODULE_LICENSE("GPL"); diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c new file mode 100644 index 00000000..1c8360a2 --- /dev/null +++ b/net/sched/em_cmp.c @@ -0,0 +1,99 @@ +/* + * net/sched/em_cmp.c Simple packet data comparison ematch + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf + */ + +#include +#include +#include +#include +#include +#include +#include + +static inline int cmp_needs_transformation(struct tcf_em_cmp *cmp) +{ + return unlikely(cmp->flags & TCF_EM_CMP_TRANS); +} + +static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em, + struct tcf_pkt_info *info) +{ + struct tcf_em_cmp *cmp = (struct tcf_em_cmp *) em->data; + unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer) + cmp->off; + u32 val = 0; + + if (!tcf_valid_offset(skb, ptr, cmp->align)) + return 0; + + switch (cmp->align) { + case TCF_EM_ALIGN_U8: + val = *ptr; + break; + + case TCF_EM_ALIGN_U16: + val = get_unaligned_be16(ptr); + + if (cmp_needs_transformation(cmp)) + val = be16_to_cpu(val); + break; + + case TCF_EM_ALIGN_U32: + /* Worth checking boundries? The branching seems + * to get worse. Visit again. + */ + val = get_unaligned_be32(ptr); + + if (cmp_needs_transformation(cmp)) + val = be32_to_cpu(val); + break; + + default: + return 0; + } + + if (cmp->mask) + val &= cmp->mask; + + switch (cmp->opnd) { + case TCF_EM_OPND_EQ: + return val == cmp->val; + case TCF_EM_OPND_LT: + return val < cmp->val; + case TCF_EM_OPND_GT: + return val > cmp->val; + } + + return 0; +} + +static struct tcf_ematch_ops em_cmp_ops = { + .kind = TCF_EM_CMP, + .datalen = sizeof(struct tcf_em_cmp), + .match = em_cmp_match, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_cmp_ops.link) +}; + +static int __init init_em_cmp(void) +{ + return tcf_em_register(&em_cmp_ops); +} + +static void __exit exit_em_cmp(void) +{ + tcf_em_unregister(&em_cmp_ops); +} + +MODULE_LICENSE("GPL"); + +module_init(init_em_cmp); +module_exit(exit_em_cmp); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_CMP); diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c new file mode 100644 index 00000000..49130e8a --- /dev/null +++ b/net/sched/em_meta.c @@ -0,0 +1,879 @@ +/* + * net/sched/em_meta.c Metadata ematch + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf + * + * ========================================================================== + * + * The metadata ematch compares two meta objects where each object + * represents either a meta value stored in the kernel or a static + * value provided by userspace. The objects are not provided by + * userspace itself but rather a definition providing the information + * to build them. Every object is of a certain type which must be + * equal to the object it is being compared to. + * + * The definition of a objects conists of the type (meta type), a + * identifier (meta id) and additional type specific information. 
+ * The meta id is either TCF_META_TYPE_VALUE for values provided by + * userspace or a index to the meta operations table consisting of + * function pointers to type specific meta data collectors returning + * the value of the requested meta value. + * + * lvalue rvalue + * +-----------+ +-----------+ + * | type: INT | | type: INT | + * def | id: DEV | | id: VALUE | + * | data: | | data: 3 | + * +-----------+ +-----------+ + * | | + * ---> meta_ops[INT][DEV](...) | + * | | + * ----------- | + * V V + * +-----------+ +-----------+ + * | type: INT | | type: INT | + * obj | id: DEV | | id: VALUE | + * | data: 2 |<--data got filled out | data: 3 | + * +-----------+ +-----------+ + * | | + * --------------> 2 equals 3 <-------------- + * + * This is a simplified schema, the complexity varies depending + * on the meta type. Obviously, the length of the data must also + * be provided for non-numeric types. + * + * Additionally, type dependent modifiers such as shift operators + * or mask may be applied to extend the functionaliy. As of now, + * the variable length type supports shifting the byte string to + * the right, eating up any number of octets and thus supporting + * wildcard interface name comparisons such as "ppp%" matching + * ppp0..9. + * + * NOTE: Certain meta values depend on other subsystems and are + * only available if that subsystem is enabled in the kernel. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct meta_obj { + unsigned long value; + unsigned int len; +}; + +struct meta_value { + struct tcf_meta_val hdr; + unsigned long val; + unsigned int len; +}; + +struct meta_match { + struct meta_value lvalue; + struct meta_value rvalue; +}; + +static inline int meta_id(struct meta_value *v) +{ + return TCF_META_ID(v->hdr.kind); +} + +static inline int meta_type(struct meta_value *v) +{ + return TCF_META_TYPE(v->hdr.kind); +} + +#define META_COLLECTOR(FUNC) static void meta_##FUNC(struct sk_buff *skb, \ + struct tcf_pkt_info *info, struct meta_value *v, \ + struct meta_obj *dst, int *err) + +/************************************************************************** + * System status & misc + **************************************************************************/ + +META_COLLECTOR(int_random) +{ + get_random_bytes(&dst->value, sizeof(dst->value)); +} + +static inline unsigned long fixed_loadavg(int load) +{ + int rnd_load = load + (FIXED_1/200); + int rnd_frac = ((rnd_load & (FIXED_1-1)) * 100) >> FSHIFT; + + return ((rnd_load >> FSHIFT) * 100) + rnd_frac; +} + +META_COLLECTOR(int_loadavg_0) +{ + dst->value = fixed_loadavg(avenrun[0]); +} + +META_COLLECTOR(int_loadavg_1) +{ + dst->value = fixed_loadavg(avenrun[1]); +} + +META_COLLECTOR(int_loadavg_2) +{ + dst->value = fixed_loadavg(avenrun[2]); +} + +/************************************************************************** + * Device names & indices + **************************************************************************/ + +static inline int int_dev(struct net_device *dev, struct meta_obj *dst) +{ + if (unlikely(dev == NULL)) + return -1; + + dst->value = dev->ifindex; + return 0; +} + +static inline int var_dev(struct net_device *dev, struct meta_obj *dst) +{ + if (unlikely(dev == NULL)) + return -1; + + dst->value = (unsigned long) dev->name; + dst->len = strlen(dev->name); + return 0; +} + +META_COLLECTOR(int_dev) +{ + *err = int_dev(skb->dev, dst); +} + +META_COLLECTOR(var_dev) +{ + *err = var_dev(skb->dev, 
dst); +} + +/************************************************************************** + * vlan tag + **************************************************************************/ + +META_COLLECTOR(int_vlan_tag) +{ + unsigned short tag; + + tag = vlan_tx_tag_get(skb); + if (!tag && __vlan_get_tag(skb, &tag)) + *err = -1; + else + dst->value = tag; +} + + + +/************************************************************************** + * skb attributes + **************************************************************************/ + +META_COLLECTOR(int_priority) +{ + dst->value = skb->priority; +} + +META_COLLECTOR(int_protocol) +{ + /* Let userspace take care of the byte ordering */ + dst->value = skb->protocol; +} + +META_COLLECTOR(int_pkttype) +{ + dst->value = skb->pkt_type; +} + +META_COLLECTOR(int_pktlen) +{ + dst->value = skb->len; +} + +META_COLLECTOR(int_datalen) +{ + dst->value = skb->data_len; +} + +META_COLLECTOR(int_maclen) +{ + dst->value = skb->mac_len; +} + +META_COLLECTOR(int_rxhash) +{ + dst->value = skb_get_rxhash(skb); +} + +/************************************************************************** + * Netfilter + **************************************************************************/ + +META_COLLECTOR(int_mark) +{ + dst->value = skb->mark; +} + +/************************************************************************** + * Traffic Control + **************************************************************************/ + +META_COLLECTOR(int_tcindex) +{ + dst->value = skb->tc_index; +} + +/************************************************************************** + * Routing + **************************************************************************/ + +META_COLLECTOR(int_rtclassid) +{ + if (unlikely(skb_dst(skb) == NULL)) + *err = -1; + else +#ifdef CONFIG_IP_ROUTE_CLASSID + dst->value = skb_dst(skb)->tclassid; +#else + dst->value = 0; +#endif +} + +META_COLLECTOR(int_rtiif) +{ + if (unlikely(skb_rtable(skb) == NULL)) + *err = -1; + else + dst->value = skb_rtable(skb)->rt_iif; +} + +/************************************************************************** + * Socket Attributes + **************************************************************************/ + +#define SKIP_NONLOCAL(skb) \ + if (unlikely(skb->sk == NULL)) { \ + *err = -1; \ + return; \ + } + +META_COLLECTOR(int_sk_family) +{ + SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_family; +} + +META_COLLECTOR(int_sk_state) +{ + SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_state; +} + +META_COLLECTOR(int_sk_reuse) +{ + SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_reuse; +} + +META_COLLECTOR(int_sk_bound_if) +{ + SKIP_NONLOCAL(skb); + /* No error if bound_dev_if is 0, legal userspace check */ + dst->value = skb->sk->sk_bound_dev_if; +} + +META_COLLECTOR(var_sk_bound_if) +{ + SKIP_NONLOCAL(skb); + + if (skb->sk->sk_bound_dev_if == 0) { + dst->value = (unsigned long) "any"; + dst->len = 3; + } else { + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(skb->sk), + skb->sk->sk_bound_dev_if); + *err = var_dev(dev, dst); + rcu_read_unlock(); + } +} + +META_COLLECTOR(int_sk_refcnt) +{ + SKIP_NONLOCAL(skb); + dst->value = atomic_read(&skb->sk->sk_refcnt); +} + +META_COLLECTOR(int_sk_rcvbuf) +{ + SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_rcvbuf; +} + +META_COLLECTOR(int_sk_shutdown) +{ + SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_shutdown; +} + +META_COLLECTOR(int_sk_proto) +{ + SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_protocol; +} + +META_COLLECTOR(int_sk_type) +{ + 
SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_type; +} + +META_COLLECTOR(int_sk_rmem_alloc) +{ + SKIP_NONLOCAL(skb); + dst->value = sk_rmem_alloc_get(skb->sk); +} + +META_COLLECTOR(int_sk_wmem_alloc) +{ + SKIP_NONLOCAL(skb); + dst->value = sk_wmem_alloc_get(skb->sk); +} + +META_COLLECTOR(int_sk_omem_alloc) +{ + SKIP_NONLOCAL(skb); + dst->value = atomic_read(&skb->sk->sk_omem_alloc); +} + +META_COLLECTOR(int_sk_rcv_qlen) +{ + SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_receive_queue.qlen; +} + +META_COLLECTOR(int_sk_snd_qlen) +{ + SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_write_queue.qlen; +} + +META_COLLECTOR(int_sk_wmem_queued) +{ + SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_wmem_queued; +} + +META_COLLECTOR(int_sk_fwd_alloc) +{ + SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_forward_alloc; +} + +META_COLLECTOR(int_sk_sndbuf) +{ + SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_sndbuf; +} + +META_COLLECTOR(int_sk_alloc) +{ + SKIP_NONLOCAL(skb); + dst->value = (__force int) skb->sk->sk_allocation; +} + +META_COLLECTOR(int_sk_route_caps) +{ + SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_route_caps; +} + +META_COLLECTOR(int_sk_hash) +{ + SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_hash; +} + +META_COLLECTOR(int_sk_lingertime) +{ + SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_lingertime / HZ; +} + +META_COLLECTOR(int_sk_err_qlen) +{ + SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_error_queue.qlen; +} + +META_COLLECTOR(int_sk_ack_bl) +{ + SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_ack_backlog; +} + +META_COLLECTOR(int_sk_max_ack_bl) +{ + SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_max_ack_backlog; +} + +META_COLLECTOR(int_sk_prio) +{ + SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_priority; +} + +META_COLLECTOR(int_sk_rcvlowat) +{ + SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_rcvlowat; +} + +META_COLLECTOR(int_sk_rcvtimeo) +{ + SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_rcvtimeo / HZ; +} + +META_COLLECTOR(int_sk_sndtimeo) +{ + SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_sndtimeo / HZ; +} + +META_COLLECTOR(int_sk_sendmsg_off) +{ + SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_sndmsg_off; +} + +META_COLLECTOR(int_sk_write_pend) +{ + SKIP_NONLOCAL(skb); + dst->value = skb->sk->sk_write_pending; +} + +/************************************************************************** + * Meta value collectors assignment table + **************************************************************************/ + +struct meta_ops { + void (*get)(struct sk_buff *, struct tcf_pkt_info *, + struct meta_value *, struct meta_obj *, int *); +}; + +#define META_ID(name) TCF_META_ID_##name +#define META_FUNC(name) { .get = meta_##name } + +/* Meta value operations table listing all meta value collectors and + * assigns them to a type and meta id. 
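+ * Entries are indexed as __meta_ops[meta type][meta id]; the
+ * meta_ops() helper below resolves a struct meta_value to its
+ * collector.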
*/ +static struct meta_ops __meta_ops[TCF_META_TYPE_MAX + 1][TCF_META_ID_MAX + 1] = { + [TCF_META_TYPE_VAR] = { + [META_ID(DEV)] = META_FUNC(var_dev), + [META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if), + }, + [TCF_META_TYPE_INT] = { + [META_ID(RANDOM)] = META_FUNC(int_random), + [META_ID(LOADAVG_0)] = META_FUNC(int_loadavg_0), + [META_ID(LOADAVG_1)] = META_FUNC(int_loadavg_1), + [META_ID(LOADAVG_2)] = META_FUNC(int_loadavg_2), + [META_ID(DEV)] = META_FUNC(int_dev), + [META_ID(PRIORITY)] = META_FUNC(int_priority), + [META_ID(PROTOCOL)] = META_FUNC(int_protocol), + [META_ID(PKTTYPE)] = META_FUNC(int_pkttype), + [META_ID(PKTLEN)] = META_FUNC(int_pktlen), + [META_ID(DATALEN)] = META_FUNC(int_datalen), + [META_ID(MACLEN)] = META_FUNC(int_maclen), + [META_ID(NFMARK)] = META_FUNC(int_mark), + [META_ID(TCINDEX)] = META_FUNC(int_tcindex), + [META_ID(RTCLASSID)] = META_FUNC(int_rtclassid), + [META_ID(RTIIF)] = META_FUNC(int_rtiif), + [META_ID(SK_FAMILY)] = META_FUNC(int_sk_family), + [META_ID(SK_STATE)] = META_FUNC(int_sk_state), + [META_ID(SK_REUSE)] = META_FUNC(int_sk_reuse), + [META_ID(SK_BOUND_IF)] = META_FUNC(int_sk_bound_if), + [META_ID(SK_REFCNT)] = META_FUNC(int_sk_refcnt), + [META_ID(SK_RCVBUF)] = META_FUNC(int_sk_rcvbuf), + [META_ID(SK_SNDBUF)] = META_FUNC(int_sk_sndbuf), + [META_ID(SK_SHUTDOWN)] = META_FUNC(int_sk_shutdown), + [META_ID(SK_PROTO)] = META_FUNC(int_sk_proto), + [META_ID(SK_TYPE)] = META_FUNC(int_sk_type), + [META_ID(SK_RMEM_ALLOC)] = META_FUNC(int_sk_rmem_alloc), + [META_ID(SK_WMEM_ALLOC)] = META_FUNC(int_sk_wmem_alloc), + [META_ID(SK_OMEM_ALLOC)] = META_FUNC(int_sk_omem_alloc), + [META_ID(SK_WMEM_QUEUED)] = META_FUNC(int_sk_wmem_queued), + [META_ID(SK_RCV_QLEN)] = META_FUNC(int_sk_rcv_qlen), + [META_ID(SK_SND_QLEN)] = META_FUNC(int_sk_snd_qlen), + [META_ID(SK_ERR_QLEN)] = META_FUNC(int_sk_err_qlen), + [META_ID(SK_FORWARD_ALLOCS)] = META_FUNC(int_sk_fwd_alloc), + [META_ID(SK_ALLOCS)] = META_FUNC(int_sk_alloc), + [META_ID(SK_ROUTE_CAPS)] = META_FUNC(int_sk_route_caps), + [META_ID(SK_HASH)] = META_FUNC(int_sk_hash), + [META_ID(SK_LINGERTIME)] = META_FUNC(int_sk_lingertime), + [META_ID(SK_ACK_BACKLOG)] = META_FUNC(int_sk_ack_bl), + [META_ID(SK_MAX_ACK_BACKLOG)] = META_FUNC(int_sk_max_ack_bl), + [META_ID(SK_PRIO)] = META_FUNC(int_sk_prio), + [META_ID(SK_RCVLOWAT)] = META_FUNC(int_sk_rcvlowat), + [META_ID(SK_RCVTIMEO)] = META_FUNC(int_sk_rcvtimeo), + [META_ID(SK_SNDTIMEO)] = META_FUNC(int_sk_sndtimeo), + [META_ID(SK_SENDMSG_OFF)] = META_FUNC(int_sk_sendmsg_off), + [META_ID(SK_WRITE_PENDING)] = META_FUNC(int_sk_write_pend), + [META_ID(VLAN_TAG)] = META_FUNC(int_vlan_tag), + [META_ID(RXHASH)] = META_FUNC(int_rxhash), + } +}; + +static inline struct meta_ops *meta_ops(struct meta_value *val) +{ + return &__meta_ops[meta_type(val)][meta_id(val)]; +} + +/************************************************************************** + * Type specific operations for TCF_META_TYPE_VAR + **************************************************************************/ + +static int meta_var_compare(struct meta_obj *a, struct meta_obj *b) +{ + int r = a->len - b->len; + + if (r == 0) + r = memcmp((void *) a->value, (void *) b->value, a->len); + + return r; +} + +static int meta_var_change(struct meta_value *dst, struct nlattr *nla) +{ + int len = nla_len(nla); + + dst->val = (unsigned long)kmemdup(nla_data(nla), len, GFP_KERNEL); + if (dst->val == 0UL) + return -ENOMEM; + dst->len = len; + return 0; +} + +static void meta_var_destroy(struct meta_value *v) +{ + kfree((void *) v->val); +} + 
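+
+/* TCF_META_TYPE_VAR values are variable-length byte strings:
+ * meta_var_change() duplicates the netlink payload with kmemdup(),
+ * meta_var_compare() orders by length and then by memcmp(), and
+ * meta_var_destroy() above frees the copy. The shift operand handled
+ * by meta_var_apply_extras() below shortens the compared length, so
+ * that, for instance, a collected device name "ppp0" with shift 1
+ * compares equal to the static value "ppp".
+ */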
+static void meta_var_apply_extras(struct meta_value *v, + struct meta_obj *dst) +{ + int shift = v->hdr.shift; + + if (shift && shift < dst->len) + dst->len -= shift; +} + +static int meta_var_dump(struct sk_buff *skb, struct meta_value *v, int tlv) +{ + if (v->val && v->len) + NLA_PUT(skb, tlv, v->len, (void *) v->val); + return 0; + +nla_put_failure: + return -1; +} + +/************************************************************************** + * Type specific operations for TCF_META_TYPE_INT + **************************************************************************/ + +static int meta_int_compare(struct meta_obj *a, struct meta_obj *b) +{ + /* Let gcc optimize it, the unlikely is not really based on + * some numbers but jump free code for mismatches seems + * more logical. */ + if (unlikely(a->value == b->value)) + return 0; + else if (a->value < b->value) + return -1; + else + return 1; +} + +static int meta_int_change(struct meta_value *dst, struct nlattr *nla) +{ + if (nla_len(nla) >= sizeof(unsigned long)) { + dst->val = *(unsigned long *) nla_data(nla); + dst->len = sizeof(unsigned long); + } else if (nla_len(nla) == sizeof(u32)) { + dst->val = nla_get_u32(nla); + dst->len = sizeof(u32); + } else + return -EINVAL; + + return 0; +} + +static void meta_int_apply_extras(struct meta_value *v, + struct meta_obj *dst) +{ + if (v->hdr.shift) + dst->value >>= v->hdr.shift; + + if (v->val) + dst->value &= v->val; +} + +static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv) +{ + if (v->len == sizeof(unsigned long)) + NLA_PUT(skb, tlv, sizeof(unsigned long), &v->val); + else if (v->len == sizeof(u32)) + NLA_PUT_U32(skb, tlv, v->val); + + return 0; + +nla_put_failure: + return -1; +} + +/************************************************************************** + * Type specific operations table + **************************************************************************/ + +struct meta_type_ops { + void (*destroy)(struct meta_value *); + int (*compare)(struct meta_obj *, struct meta_obj *); + int (*change)(struct meta_value *, struct nlattr *); + void (*apply_extras)(struct meta_value *, struct meta_obj *); + int (*dump)(struct sk_buff *, struct meta_value *, int); +}; + +static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = { + [TCF_META_TYPE_VAR] = { + .destroy = meta_var_destroy, + .compare = meta_var_compare, + .change = meta_var_change, + .apply_extras = meta_var_apply_extras, + .dump = meta_var_dump + }, + [TCF_META_TYPE_INT] = { + .compare = meta_int_compare, + .change = meta_int_change, + .apply_extras = meta_int_apply_extras, + .dump = meta_int_dump + } +}; + +static inline struct meta_type_ops *meta_type_ops(struct meta_value *v) +{ + return &__meta_type_ops[meta_type(v)]; +} + +/************************************************************************** + * Core + **************************************************************************/ + +static int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info, + struct meta_value *v, struct meta_obj *dst) +{ + int err = 0; + + if (meta_id(v) == TCF_META_ID_VALUE) { + dst->value = v->val; + dst->len = v->len; + return 0; + } + + meta_ops(v)->get(skb, info, v, dst, &err); + if (err < 0) + return err; + + if (meta_type_ops(v)->apply_extras) + meta_type_ops(v)->apply_extras(v, dst); + + return 0; +} + +static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m, + struct tcf_pkt_info *info) +{ + int r; + struct meta_match *meta = (struct meta_match *) m->data; + struct meta_obj l_value, 
r_value; + + if (meta_get(skb, info, &meta->lvalue, &l_value) < 0 || + meta_get(skb, info, &meta->rvalue, &r_value) < 0) + return 0; + + r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value); + + switch (meta->lvalue.hdr.op) { + case TCF_EM_OPND_EQ: + return !r; + case TCF_EM_OPND_LT: + return r < 0; + case TCF_EM_OPND_GT: + return r > 0; + } + + return 0; +} + +static void meta_delete(struct meta_match *meta) +{ + if (meta) { + struct meta_type_ops *ops = meta_type_ops(&meta->lvalue); + + if (ops && ops->destroy) { + ops->destroy(&meta->lvalue); + ops->destroy(&meta->rvalue); + } + } + + kfree(meta); +} + +static inline int meta_change_data(struct meta_value *dst, struct nlattr *nla) +{ + if (nla) { + if (nla_len(nla) == 0) + return -EINVAL; + + return meta_type_ops(dst)->change(dst, nla); + } + + return 0; +} + +static inline int meta_is_supported(struct meta_value *val) +{ + return !meta_id(val) || meta_ops(val)->get; +} + +static const struct nla_policy meta_policy[TCA_EM_META_MAX + 1] = { + [TCA_EM_META_HDR] = { .len = sizeof(struct tcf_meta_hdr) }, +}; + +static int em_meta_change(struct tcf_proto *tp, void *data, int len, + struct tcf_ematch *m) +{ + int err; + struct nlattr *tb[TCA_EM_META_MAX + 1]; + struct tcf_meta_hdr *hdr; + struct meta_match *meta = NULL; + + err = nla_parse(tb, TCA_EM_META_MAX, data, len, meta_policy); + if (err < 0) + goto errout; + + err = -EINVAL; + if (tb[TCA_EM_META_HDR] == NULL) + goto errout; + hdr = nla_data(tb[TCA_EM_META_HDR]); + + if (TCF_META_TYPE(hdr->left.kind) != TCF_META_TYPE(hdr->right.kind) || + TCF_META_TYPE(hdr->left.kind) > TCF_META_TYPE_MAX || + TCF_META_ID(hdr->left.kind) > TCF_META_ID_MAX || + TCF_META_ID(hdr->right.kind) > TCF_META_ID_MAX) + goto errout; + + meta = kzalloc(sizeof(*meta), GFP_KERNEL); + if (meta == NULL) + goto errout; + + memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left)); + memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right)); + + if (!meta_is_supported(&meta->lvalue) || + !meta_is_supported(&meta->rvalue)) { + err = -EOPNOTSUPP; + goto errout; + } + + if (meta_change_data(&meta->lvalue, tb[TCA_EM_META_LVALUE]) < 0 || + meta_change_data(&meta->rvalue, tb[TCA_EM_META_RVALUE]) < 0) + goto errout; + + m->datalen = sizeof(*meta); + m->data = (unsigned long) meta; + + err = 0; +errout: + if (err && meta) + meta_delete(meta); + return err; +} + +static void em_meta_destroy(struct tcf_proto *tp, struct tcf_ematch *m) +{ + if (m) + meta_delete((struct meta_match *) m->data); +} + +static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em) +{ + struct meta_match *meta = (struct meta_match *) em->data; + struct tcf_meta_hdr hdr; + struct meta_type_ops *ops; + + memset(&hdr, 0, sizeof(hdr)); + memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left)); + memcpy(&hdr.right, &meta->rvalue.hdr, sizeof(hdr.right)); + + NLA_PUT(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr); + + ops = meta_type_ops(&meta->lvalue); + if (ops->dump(skb, &meta->lvalue, TCA_EM_META_LVALUE) < 0 || + ops->dump(skb, &meta->rvalue, TCA_EM_META_RVALUE) < 0) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static struct tcf_ematch_ops em_meta_ops = { + .kind = TCF_EM_META, + .change = em_meta_change, + .match = em_meta_match, + .destroy = em_meta_destroy, + .dump = em_meta_dump, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_meta_ops.link) +}; + +static int __init init_em_meta(void) +{ + return tcf_em_register(&em_meta_ops); +} + +static void __exit exit_em_meta(void) +{ + tcf_em_unregister(&em_meta_ops); +} 
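
Pulling the em_meta pieces together: em_meta_change() parses a TCA_EM_META_HDR attribute plus optional LVALUE/RVALUE payloads, and em_meta_match() then compares the two sides per packet. The sketch below shows roughly what a userspace tool would encode for "skb priority equals 6". The EX_META_KIND() helper is an assumption that mirrors the TCF_META_TYPE()/TCF_META_ID() decoders used above; the uapi header linux/tc_ematch/tc_em_meta.h is authoritative for the real packing.

    #include <linux/tc_ematch/tc_em_meta.h>

    /* assumption: inverse of TCF_META_TYPE()/TCF_META_ID(); check the uapi masks */
    #define EX_META_KIND(type, id)  (((type) << 12) | (id))

    static const struct tcf_meta_hdr ex_hdr = {
            /* left side: collected per packet by meta_int_priority() */
            .left = {
                    .kind = EX_META_KIND(TCF_META_TYPE_INT, TCF_META_ID_PRIORITY),
                    .op   = TCF_EM_OPND_EQ,
            },
            /* right side: a literal constant, carried in TCA_EM_META_RVALUE */
            .right = {
                    .kind = EX_META_KIND(TCF_META_TYPE_INT, TCF_META_ID_VALUE),
            },
    };

    /* Resulting TLV:
     *   TCA_EM_META_HDR    -> ex_hdr
     *   TCA_EM_META_RVALUE -> u32 6        (stored by meta_int_change())
     * Per packet, em_meta_match() resolves both sides via meta_get(),
     * compares them with meta_int_compare() and applies left.op (EQ here).
     */
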
+ +MODULE_LICENSE("GPL"); + +module_init(init_em_meta); +module_exit(exit_em_meta); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_META); diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c new file mode 100644 index 00000000..a3bed07a --- /dev/null +++ b/net/sched/em_nbyte.c @@ -0,0 +1,80 @@ +/* + * net/sched/em_nbyte.c N-Byte ematch + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct nbyte_data { + struct tcf_em_nbyte hdr; + char pattern[0]; +}; + +static int em_nbyte_change(struct tcf_proto *tp, void *data, int data_len, + struct tcf_ematch *em) +{ + struct tcf_em_nbyte *nbyte = data; + + if (data_len < sizeof(*nbyte) || + data_len < (sizeof(*nbyte) + nbyte->len)) + return -EINVAL; + + em->datalen = sizeof(*nbyte) + nbyte->len; + em->data = (unsigned long)kmemdup(data, em->datalen, GFP_KERNEL); + if (em->data == 0UL) + return -ENOBUFS; + + return 0; +} + +static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em, + struct tcf_pkt_info *info) +{ + struct nbyte_data *nbyte = (struct nbyte_data *) em->data; + unsigned char *ptr = tcf_get_base_ptr(skb, nbyte->hdr.layer); + + ptr += nbyte->hdr.off; + + if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len)) + return 0; + + return !memcmp(ptr + nbyte->hdr.off, nbyte->pattern, nbyte->hdr.len); +} + +static struct tcf_ematch_ops em_nbyte_ops = { + .kind = TCF_EM_NBYTE, + .change = em_nbyte_change, + .match = em_nbyte_match, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_nbyte_ops.link) +}; + +static int __init init_em_nbyte(void) +{ + return tcf_em_register(&em_nbyte_ops); +} + +static void __exit exit_em_nbyte(void) +{ + tcf_em_unregister(&em_nbyte_ops); +} + +MODULE_LICENSE("GPL"); + +module_init(init_em_nbyte); +module_exit(exit_em_nbyte); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_NBYTE); diff --git a/net/sched/em_text.c b/net/sched/em_text.c new file mode 100644 index 00000000..15d353d2 --- /dev/null +++ b/net/sched/em_text.c @@ -0,0 +1,158 @@ +/* + * net/sched/em_text.c Textsearch ematch + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Thomas Graf + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct text_match { + u16 from_offset; + u16 to_offset; + u8 from_layer; + u8 to_layer; + struct ts_config *config; +}; + +#define EM_TEXT_PRIV(m) ((struct text_match *) (m)->data) + +static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m, + struct tcf_pkt_info *info) +{ + struct text_match *tm = EM_TEXT_PRIV(m); + int from, to; + struct ts_state state; + + from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data; + from += tm->from_offset; + + to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data; + to += tm->to_offset; + + return skb_find_text(skb, from, to, tm->config, &state) != UINT_MAX; +} + +static int em_text_change(struct tcf_proto *tp, void *data, int len, + struct tcf_ematch *m) +{ + struct text_match *tm; + struct tcf_em_text *conf = data; + struct ts_config *ts_conf; + int flags = 0; + + if (len < sizeof(*conf) || len < (sizeof(*conf) + conf->pattern_len)) + return -EINVAL; + + if (conf->from_layer > conf->to_layer) + return -EINVAL; + + if (conf->from_layer == conf->to_layer && + conf->from_offset > conf->to_offset) + return -EINVAL; + +retry: + ts_conf = textsearch_prepare(conf->algo, (u8 *) conf + sizeof(*conf), + conf->pattern_len, GFP_KERNEL, flags); + + if (flags & TS_AUTOLOAD) + rtnl_lock(); + + if (IS_ERR(ts_conf)) { + if (PTR_ERR(ts_conf) == -ENOENT && !(flags & TS_AUTOLOAD)) { + rtnl_unlock(); + flags |= TS_AUTOLOAD; + goto retry; + } else + return PTR_ERR(ts_conf); + } else if (flags & TS_AUTOLOAD) { + textsearch_destroy(ts_conf); + return -EAGAIN; + } + + tm = kmalloc(sizeof(*tm), GFP_KERNEL); + if (tm == NULL) { + textsearch_destroy(ts_conf); + return -ENOBUFS; + } + + tm->from_offset = conf->from_offset; + tm->to_offset = conf->to_offset; + tm->from_layer = conf->from_layer; + tm->to_layer = conf->to_layer; + tm->config = ts_conf; + + m->datalen = sizeof(*tm); + m->data = (unsigned long) tm; + + return 0; +} + +static void em_text_destroy(struct tcf_proto *tp, struct tcf_ematch *m) +{ + if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config) + textsearch_destroy(EM_TEXT_PRIV(m)->config); +} + +static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m) +{ + struct text_match *tm = EM_TEXT_PRIV(m); + struct tcf_em_text conf; + + strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo) - 1); + conf.from_offset = tm->from_offset; + conf.to_offset = tm->to_offset; + conf.from_layer = tm->from_layer; + conf.to_layer = tm->to_layer; + conf.pattern_len = textsearch_get_pattern_len(tm->config); + conf.pad = 0; + + if (nla_put_nohdr(skb, sizeof(conf), &conf) < 0) + goto nla_put_failure; + if (nla_append(skb, conf.pattern_len, + textsearch_get_pattern(tm->config)) < 0) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct tcf_ematch_ops em_text_ops = { + .kind = TCF_EM_TEXT, + .change = em_text_change, + .match = em_text_match, + .destroy = em_text_destroy, + .dump = em_text_dump, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_text_ops.link) +}; + +static int __init init_em_text(void) +{ + return tcf_em_register(&em_text_ops); +} + +static void __exit exit_em_text(void) +{ + tcf_em_unregister(&em_text_ops); +} + +MODULE_LICENSE("GPL"); + +module_init(init_em_text); +module_exit(exit_em_text); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_TEXT); diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c new file mode 100644 index 00000000..797bdb88 --- /dev/null +++ b/net/sched/em_u32.c @@ -0,0 +1,64 @@ +/* + * 
net/sched/em_u32.c U32 Ematch + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf + * Alexey Kuznetsov, + * + * Based on net/sched/cls_u32.c + */ + +#include +#include +#include +#include +#include + +static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em, + struct tcf_pkt_info *info) +{ + struct tc_u32_key *key = (struct tc_u32_key *) em->data; + const unsigned char *ptr = skb_network_header(skb); + + if (info) { + if (info->ptr) + ptr = info->ptr; + ptr += (info->nexthdr & key->offmask); + } + + ptr += key->off; + + if (!tcf_valid_offset(skb, ptr, sizeof(u32))) + return 0; + + return !(((*(__be32 *) ptr) ^ key->val) & key->mask); +} + +static struct tcf_ematch_ops em_u32_ops = { + .kind = TCF_EM_U32, + .datalen = sizeof(struct tc_u32_key), + .match = em_u32_match, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_u32_ops.link) +}; + +static int __init init_em_u32(void) +{ + return tcf_em_register(&em_u32_ops); +} + +static void __exit exit_em_u32(void) +{ + tcf_em_unregister(&em_u32_ops); +} + +MODULE_LICENSE("GPL"); + +module_init(init_em_u32); +module_exit(exit_em_u32); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_U32); diff --git a/net/sched/ematch.c b/net/sched/ematch.c new file mode 100644 index 00000000..88d93eb9 --- /dev/null +++ b/net/sched/ematch.c @@ -0,0 +1,543 @@ +/* + * net/sched/ematch.c Extended Match API + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf + * + * ========================================================================== + * + * An extended match (ematch) is a small classification tool not worth + * writing a full classifier for. Ematches can be interconnected to form + * a logic expression and get attached to classifiers to extend their + * functionatlity. + * + * The userspace part transforms the logic expressions into an array + * consisting of multiple sequences of interconnected ematches separated + * by markers. Precedence is implemented by a special ematch kind + * referencing a sequence beyond the marker of the current sequence + * causing the current position in the sequence to be pushed onto a stack + * to allow the current position to be overwritten by the position referenced + * in the special ematch. Matching continues in the new sequence until a + * marker is reached causing the position to be restored from the stack. + * + * Example: + * A AND (B1 OR B2) AND C AND D + * + * ------->-PUSH------- + * -->-- / -->-- \ -->-- + * / \ / / \ \ / \ + * +-------+-------+-------+-------+-------+--------+ + * | A AND | B AND | C AND | D END | B1 OR | B2 END | + * +-------+-------+-------+-------+-------+--------+ + * \ / + * --------<-POP--------- + * + * where B is a virtual ematch referencing to sequence starting with B1. 
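
In array terms (a sketch; the concrete indices are implied by the diagram above rather than stated), userspace emits six tcf_ematch_hdr records for that expression. Entry 1 is the virtual match B: a TCF_EM_CONTAINER whose payload is the index of B1, which tcf_em_validate() below stores in em->data and __tcf_em_tree_match() later jumps to:

    index:     0     1              2     3     4     5
    match:     A     B (container)  C     D     B1    B2
    relation:  AND   AND            AND   END   OR    END
    em->data:  -     4 (index of B1; must be > 1, backward jumps are rejected)

Evaluation visits 0, then 1 (index 1 is pushed and matching jumps to 4), then 4 (and 5 only if B1 fails), pops back to 1, and continues with 2 and 3.
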
+ * + * ========================================================================== + * + * How to write an ematch in 60 seconds + * ------------------------------------ + * + * 1) Provide a matcher function: + * static int my_match(struct sk_buff *skb, struct tcf_ematch *m, + * struct tcf_pkt_info *info) + * { + * struct mydata *d = (struct mydata *) m->data; + * + * if (...matching goes here...) + * return 1; + * else + * return 0; + * } + * + * 2) Fill out a struct tcf_ematch_ops: + * static struct tcf_ematch_ops my_ops = { + * .kind = unique id, + * .datalen = sizeof(struct mydata), + * .match = my_match, + * .owner = THIS_MODULE, + * }; + * + * 3) Register/Unregister your ematch: + * static int __init init_my_ematch(void) + * { + * return tcf_em_register(&my_ops); + * } + * + * static void __exit exit_my_ematch(void) + * { + * tcf_em_unregister(&my_ops); + * } + * + * module_init(init_my_ematch); + * module_exit(exit_my_ematch); + * + * 4) By now you should have two more seconds left, barely enough to + * open up a beer to watch the compilation going. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static LIST_HEAD(ematch_ops); +static DEFINE_RWLOCK(ematch_mod_lock); + +static struct tcf_ematch_ops *tcf_em_lookup(u16 kind) +{ + struct tcf_ematch_ops *e = NULL; + + read_lock(&ematch_mod_lock); + list_for_each_entry(e, &ematch_ops, link) { + if (kind == e->kind) { + if (!try_module_get(e->owner)) + e = NULL; + read_unlock(&ematch_mod_lock); + return e; + } + } + read_unlock(&ematch_mod_lock); + + return NULL; +} + +/** + * tcf_em_register - register an extended match + * + * @ops: ematch operations lookup table + * + * This function must be called by ematches to announce their presence. + * The given @ops must have kind set to a unique identifier and the + * callback match() must be implemented. All other callbacks are optional + * and a fallback implementation is used instead. + * + * Returns -EEXISTS if an ematch of the same kind has already registered. + */ +int tcf_em_register(struct tcf_ematch_ops *ops) +{ + int err = -EEXIST; + struct tcf_ematch_ops *e; + + if (ops->match == NULL) + return -EINVAL; + + write_lock(&ematch_mod_lock); + list_for_each_entry(e, &ematch_ops, link) + if (ops->kind == e->kind) + goto errout; + + list_add_tail(&ops->link, &ematch_ops); + err = 0; +errout: + write_unlock(&ematch_mod_lock); + return err; +} +EXPORT_SYMBOL(tcf_em_register); + +/** + * tcf_em_unregister - unregster and extended match + * + * @ops: ematch operations lookup table + * + * This function must be called by ematches to announce their disappearance + * for examples when the module gets unloaded. The @ops parameter must be + * the same as the one used for registration. + * + * Returns -ENOENT if no matching ematch was found. 
+ */ +void tcf_em_unregister(struct tcf_ematch_ops *ops) +{ + write_lock(&ematch_mod_lock); + list_del(&ops->link); + write_unlock(&ematch_mod_lock); +} +EXPORT_SYMBOL(tcf_em_unregister); + +static inline struct tcf_ematch *tcf_em_get_match(struct tcf_ematch_tree *tree, + int index) +{ + return &tree->matches[index]; +} + + +static int tcf_em_validate(struct tcf_proto *tp, + struct tcf_ematch_tree_hdr *tree_hdr, + struct tcf_ematch *em, struct nlattr *nla, int idx) +{ + int err = -EINVAL; + struct tcf_ematch_hdr *em_hdr = nla_data(nla); + int data_len = nla_len(nla) - sizeof(*em_hdr); + void *data = (void *) em_hdr + sizeof(*em_hdr); + + if (!TCF_EM_REL_VALID(em_hdr->flags)) + goto errout; + + if (em_hdr->kind == TCF_EM_CONTAINER) { + /* Special ematch called "container", carries an index + * referencing an external ematch sequence. + */ + u32 ref; + + if (data_len < sizeof(ref)) + goto errout; + ref = *(u32 *) data; + + if (ref >= tree_hdr->nmatches) + goto errout; + + /* We do not allow backward jumps to avoid loops and jumps + * to our own position are of course illegal. + */ + if (ref <= idx) + goto errout; + + + em->data = ref; + } else { + /* Note: This lookup will increase the module refcnt + * of the ematch module referenced. In case of a failure, + * a destroy function is called by the underlying layer + * which automatically releases the reference again, therefore + * the module MUST not be given back under any circumstances + * here. Be aware, the destroy function assumes that the + * module is held if the ops field is non zero. + */ + em->ops = tcf_em_lookup(em_hdr->kind); + + if (em->ops == NULL) { + err = -ENOENT; +#ifdef CONFIG_MODULES + __rtnl_unlock(); + request_module("ematch-kind-%u", em_hdr->kind); + rtnl_lock(); + em->ops = tcf_em_lookup(em_hdr->kind); + if (em->ops) { + /* We dropped the RTNL mutex in order to + * perform the module load. Tell the caller + * to replay the request. + */ + module_put(em->ops->owner); + err = -EAGAIN; + } +#endif + goto errout; + } + + /* ematch module provides expected length of data, so we + * can do a basic sanity check. + */ + if (em->ops->datalen && data_len < em->ops->datalen) + goto errout; + + if (em->ops->change) { + err = em->ops->change(tp, data, data_len, em); + if (err < 0) + goto errout; + } else if (data_len > 0) { + /* ematch module doesn't provide an own change + * procedure and expects us to allocate and copy + * the ematch data. + * + * TCF_EM_SIMPLE may be specified stating that the + * data only consists of a u32 integer and the module + * does not expected a memory reference but rather + * the value carried. + */ + if (em_hdr->flags & TCF_EM_SIMPLE) { + if (data_len < sizeof(u32)) + goto errout; + em->data = *(u32 *) data; + } else { + void *v = kmemdup(data, data_len, GFP_KERNEL); + if (v == NULL) { + err = -ENOBUFS; + goto errout; + } + em->data = (unsigned long) v; + } + } + } + + em->matchid = em_hdr->matchid; + em->flags = em_hdr->flags; + em->datalen = data_len; + + err = 0; +errout: + return err; +} + +static const struct nla_policy em_policy[TCA_EMATCH_TREE_MAX + 1] = { + [TCA_EMATCH_TREE_HDR] = { .len = sizeof(struct tcf_ematch_tree_hdr) }, + [TCA_EMATCH_TREE_LIST] = { .type = NLA_NESTED }, +}; + +/** + * tcf_em_tree_validate - validate ematch config TLV and build ematch tree + * + * @tp: classifier kind handle + * @nla: ematch tree configuration TLV + * @tree: destination ematch tree variable to store the resulting + * ematch tree. 
+ * + * This function validates the given configuration TLV @nla and builds an + * ematch tree in @tree. The resulting tree must later be copied into + * the private classifier data using tcf_em_tree_change(). You MUST NOT + * provide the ematch tree variable of the private classifier data directly, + * the changes would not be locked properly. + * + * Returns a negative error code if the configuration TLV contains errors. + */ +int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla, + struct tcf_ematch_tree *tree) +{ + int idx, list_len, matches_len, err; + struct nlattr *tb[TCA_EMATCH_TREE_MAX + 1]; + struct nlattr *rt_match, *rt_hdr, *rt_list; + struct tcf_ematch_tree_hdr *tree_hdr; + struct tcf_ematch *em; + + memset(tree, 0, sizeof(*tree)); + if (!nla) + return 0; + + err = nla_parse_nested(tb, TCA_EMATCH_TREE_MAX, nla, em_policy); + if (err < 0) + goto errout; + + err = -EINVAL; + rt_hdr = tb[TCA_EMATCH_TREE_HDR]; + rt_list = tb[TCA_EMATCH_TREE_LIST]; + + if (rt_hdr == NULL || rt_list == NULL) + goto errout; + + tree_hdr = nla_data(rt_hdr); + memcpy(&tree->hdr, tree_hdr, sizeof(*tree_hdr)); + + rt_match = nla_data(rt_list); + list_len = nla_len(rt_list); + matches_len = tree_hdr->nmatches * sizeof(*em); + + tree->matches = kzalloc(matches_len, GFP_KERNEL); + if (tree->matches == NULL) + goto errout; + + /* We do not use nla_parse_nested here because the maximum + * number of attributes is unknown. This saves us the allocation + * for a tb buffer which would serve no purpose at all. + * + * The array of rt attributes is parsed in the order as they are + * provided, their type must be incremental from 1 to n. Even + * if it does not serve any real purpose, a failure of sticking + * to this policy will result in parsing failure. + */ + for (idx = 0; nla_ok(rt_match, list_len); idx++) { + err = -EINVAL; + + if (rt_match->nla_type != (idx + 1)) + goto errout_abort; + + if (idx >= tree_hdr->nmatches) + goto errout_abort; + + if (nla_len(rt_match) < sizeof(struct tcf_ematch_hdr)) + goto errout_abort; + + em = tcf_em_get_match(tree, idx); + + err = tcf_em_validate(tp, tree_hdr, em, rt_match, idx); + if (err < 0) + goto errout_abort; + + rt_match = nla_next(rt_match, &list_len); + } + + /* Check if the number of matches provided by userspace actually + * complies with the array of matches. The number was used for + * the validation of references and a mismatch could lead to + * undefined references during the matching process. + */ + if (idx != tree_hdr->nmatches) { + err = -EINVAL; + goto errout_abort; + } + + err = 0; +errout: + return err; + +errout_abort: + tcf_em_tree_destroy(tp, tree); + return err; +} +EXPORT_SYMBOL(tcf_em_tree_validate); + +/** + * tcf_em_tree_destroy - destroy an ematch tree + * + * @tp: classifier kind handle + * @tree: ematch tree to be deleted + * + * This functions destroys an ematch tree previously created by + * tcf_em_tree_validate()/tcf_em_tree_change(). You must ensure that + * the ematch tree is not in use before calling this function. 
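
The lifecycle spelled out in the surrounding kernel-doc (validate into a local tree, install it with tcf_em_tree_change(), evaluate with tcf_em_tree_match(), tear down with tcf_em_tree_destroy()) looks as follows from a classifier's point of view. This is only a sketch: my_filter, my_change and my_classify are hypothetical names, and cls_basic.c is the in-tree user to consult for the real thing.

    struct my_filter {
            struct tcf_ematch_tree  ematches;
            /* ... classifier-private fields ... */
    };

    static int my_change(struct tcf_proto *tp, struct my_filter *f,
                         struct nlattr *ematch_attr)
    {
            struct tcf_ematch_tree t;
            int err;

            /* 1. Build the tree into a local variable first... */
            err = tcf_em_tree_validate(tp, ematch_attr, &t);
            if (err < 0)
                    return err;

            /* 2. ...then swap it into the live filter; tcf_em_tree_change()
             *    handles the locking the comment above warns about. */
            tcf_em_tree_change(tp, &f->ematches, &t);
            return 0;
    }

    static int my_classify(struct sk_buff *skb, struct my_filter *f,
                           struct tcf_pkt_info *info)
    {
            /* evaluate the logic expression against the packet */
            return tcf_em_tree_match(skb, &f->ematches, info);
    }

    /* and on filter deletion: tcf_em_tree_destroy(tp, &f->ematches); */
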
+ */ +void tcf_em_tree_destroy(struct tcf_proto *tp, struct tcf_ematch_tree *tree) +{ + int i; + + if (tree->matches == NULL) + return; + + for (i = 0; i < tree->hdr.nmatches; i++) { + struct tcf_ematch *em = tcf_em_get_match(tree, i); + + if (em->ops) { + if (em->ops->destroy) + em->ops->destroy(tp, em); + else if (!tcf_em_is_simple(em)) + kfree((void *) em->data); + module_put(em->ops->owner); + } + } + + tree->hdr.nmatches = 0; + kfree(tree->matches); + tree->matches = NULL; +} +EXPORT_SYMBOL(tcf_em_tree_destroy); + +/** + * tcf_em_tree_dump - dump ematch tree into a rtnl message + * + * @skb: skb holding the rtnl message + * @t: ematch tree to be dumped + * @tlv: TLV type to be used to encapsulate the tree + * + * This function dumps a ematch tree into a rtnl message. It is valid to + * call this function while the ematch tree is in use. + * + * Returns -1 if the skb tailroom is insufficient. + */ +int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv) +{ + int i; + u8 *tail; + struct nlattr *top_start; + struct nlattr *list_start; + + top_start = nla_nest_start(skb, tlv); + if (top_start == NULL) + goto nla_put_failure; + + NLA_PUT(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr); + + list_start = nla_nest_start(skb, TCA_EMATCH_TREE_LIST); + if (list_start == NULL) + goto nla_put_failure; + + tail = skb_tail_pointer(skb); + for (i = 0; i < tree->hdr.nmatches; i++) { + struct nlattr *match_start = (struct nlattr *)tail; + struct tcf_ematch *em = tcf_em_get_match(tree, i); + struct tcf_ematch_hdr em_hdr = { + .kind = em->ops ? em->ops->kind : TCF_EM_CONTAINER, + .matchid = em->matchid, + .flags = em->flags + }; + + NLA_PUT(skb, i + 1, sizeof(em_hdr), &em_hdr); + + if (em->ops && em->ops->dump) { + if (em->ops->dump(skb, em) < 0) + goto nla_put_failure; + } else if (tcf_em_is_container(em) || tcf_em_is_simple(em)) { + u32 u = em->data; + nla_put_nohdr(skb, sizeof(u), &u); + } else if (em->datalen > 0) + nla_put_nohdr(skb, em->datalen, (void *) em->data); + + tail = skb_tail_pointer(skb); + match_start->nla_len = tail - (u8 *)match_start; + } + + nla_nest_end(skb, list_start); + nla_nest_end(skb, top_start); + + return 0; + +nla_put_failure: + return -1; +} +EXPORT_SYMBOL(tcf_em_tree_dump); + +static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em, + struct tcf_pkt_info *info) +{ + int r = em->ops->match(skb, em, info); + + return tcf_em_is_inverted(em) ? 
!r : r; +} + +/* Do not use this function directly, use tcf_em_tree_match instead */ +int __tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree, + struct tcf_pkt_info *info) +{ + int stackp = 0, match_idx = 0, res = 0; + struct tcf_ematch *cur_match; + int stack[CONFIG_NET_EMATCH_STACK]; + +proceed: + while (match_idx < tree->hdr.nmatches) { + cur_match = tcf_em_get_match(tree, match_idx); + + if (tcf_em_is_container(cur_match)) { + if (unlikely(stackp >= CONFIG_NET_EMATCH_STACK)) + goto stack_overflow; + + stack[stackp++] = match_idx; + match_idx = cur_match->data; + goto proceed; + } + + res = tcf_em_match(skb, cur_match, info); + + if (tcf_em_early_end(cur_match, res)) + break; + + match_idx++; + } + +pop_stack: + if (stackp > 0) { + match_idx = stack[--stackp]; + cur_match = tcf_em_get_match(tree, match_idx); + + if (tcf_em_early_end(cur_match, res)) + goto pop_stack; + else { + match_idx++; + goto proceed; + } + } + + return res; + +stack_overflow: + if (net_ratelimit()) + pr_warning("tc ematch: local stack overflow," + " increase NET_EMATCH_STACK\n"); + return -1; +} +EXPORT_SYMBOL(__tcf_em_tree_match); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c new file mode 100644 index 00000000..6b862766 --- /dev/null +++ b/net/sched/sch_api.c @@ -0,0 +1,1805 @@ +/* + * net/sched/sch_api.c Packet scheduler API. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * + * Fixes: + * + * Rani Assaf :980802: JIFFIES and CPU clock sources are repaired. + * Eduardo J. Blanco :990222: kmod support + * Jamal Hadi Salim : 990601: ingress support + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static int qdisc_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, u32 clid, + struct Qdisc *old, struct Qdisc *new); +static int tclass_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, struct Qdisc *q, + unsigned long cl, int event); + +/* + + Short review. + ------------- + + This file consists of two interrelated parts: + + 1. queueing disciplines manager frontend. + 2. traffic classes manager frontend. + + Generally, queueing discipline ("qdisc") is a black box, + which is able to enqueue packets and to dequeue them (when + device is ready to send something) in order and at times + determined by algorithm hidden in it. + + qdisc's are divided to two categories: + - "queues", which have no internal structure visible from outside. + - "schedulers", which split all the packets to "traffic classes", + using "packet classifiers" (look at cls_api.c) + + In turn, classes may have child qdiscs (as rule, queues) + attached to them etc. etc. etc. + + The goal of the routines in this file is to translate + information supplied by user in the form of handles + to more intelligible for kernel form, to make some sanity + checks and part of work, which is common to all qdiscs + and to provide rtnetlink notifications. + + All real intelligent work is done inside qdisc modules. + + + + Every discipline has two major routines: enqueue and dequeue. + + ---dequeue + + dequeue usually returns a skb to send. 
It is allowed to return NULL, + but it does not mean that queue is empty, it just means that + discipline does not want to send anything this time. + Queue is really empty if q->q.qlen == 0. + For complicated disciplines with multiple queues q->q is not + real packet queue, but however q->q.qlen must be valid. + + ---enqueue + + enqueue returns 0, if packet was enqueued successfully. + If packet (this one or another one) was dropped, it returns + not zero error code. + NET_XMIT_DROP - this packet dropped + Expected action: do not backoff, but wait until queue will clear. + NET_XMIT_CN - probably this packet enqueued, but another one dropped. + Expected action: backoff or ignore + NET_XMIT_POLICED - dropped by police. + Expected action: backoff or error to real-time apps. + + Auxiliary routines: + + ---peek + + like dequeue but without removing a packet from the queue + + ---reset + + returns qdisc to initial state: purge all buffers, clear all + timers, counters (except for statistics) etc. + + ---init + + initializes newly created qdisc. + + ---destroy + + destroys resources allocated by init and during lifetime of qdisc. + + ---change + + changes qdisc parameters. + */ + +/* Protects list of registered TC modules. It is pure SMP lock. */ +static DEFINE_RWLOCK(qdisc_mod_lock); + + +/************************************************ + * Queueing disciplines manipulation. * + ************************************************/ + + +/* The list of all installed queueing disciplines. */ + +static struct Qdisc_ops *qdisc_base; + +/* Register/uregister queueing discipline */ + +int register_qdisc(struct Qdisc_ops *qops) +{ + struct Qdisc_ops *q, **qp; + int rc = -EEXIST; + + write_lock(&qdisc_mod_lock); + for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next) + if (!strcmp(qops->id, q->id)) + goto out; + + if (qops->enqueue == NULL) + qops->enqueue = noop_qdisc_ops.enqueue; + if (qops->peek == NULL) { + if (qops->dequeue == NULL) + qops->peek = noop_qdisc_ops.peek; + else + goto out_einval; + } + if (qops->dequeue == NULL) + qops->dequeue = noop_qdisc_ops.dequeue; + + if (qops->cl_ops) { + const struct Qdisc_class_ops *cops = qops->cl_ops; + + if (!(cops->get && cops->put && cops->walk && cops->leaf)) + goto out_einval; + + if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf)) + goto out_einval; + } + + qops->next = NULL; + *qp = qops; + rc = 0; +out: + write_unlock(&qdisc_mod_lock); + return rc; + +out_einval: + rc = -EINVAL; + goto out; +} +EXPORT_SYMBOL(register_qdisc); + +int unregister_qdisc(struct Qdisc_ops *qops) +{ + struct Qdisc_ops *q, **qp; + int err = -ENOENT; + + write_lock(&qdisc_mod_lock); + for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next) + if (q == qops) + break; + if (q) { + *qp = q->next; + q->next = NULL; + err = 0; + } + write_unlock(&qdisc_mod_lock); + return err; +} +EXPORT_SYMBOL(unregister_qdisc); + +/* We know handle. Find qdisc among all qdisc's attached to device + (root qdisc, all its children, children of children etc.) 
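
register_qdisc() above fills in noop defaults for a missing enqueue or dequeue, insists that a qdisc providing dequeue also provides peek, and sanity-checks the class-ops block. A minimal ops table satisfying that, and honouring the enqueue/dequeue contract from the "Short review" comment, is sketched below. It is loosely modelled on the in-tree pfifo qdisc; the sketch_* names are hypothetical and init/reset/dump are omitted for brevity.

    static int sketch_enqueue(struct sk_buff *skb, struct Qdisc *sch)
    {
            /* tail-drop FIFO bounded by the device's tx_queue_len */
            if (likely(skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len))
                    return qdisc_enqueue_tail(skb, sch);

            return qdisc_reshape_fail(skb, sch);    /* queue full: drop */
    }

    static struct sk_buff *sketch_dequeue(struct Qdisc *sch)
    {
            return qdisc_dequeue_head(sch);         /* NULL when empty */
    }

    static struct Qdisc_ops sketch_fifo_qdisc_ops __read_mostly = {
            .id             = "sketch_fifo",
            .priv_size      = 0,
            .enqueue        = sketch_enqueue,
            .dequeue        = sketch_dequeue,
            .peek           = qdisc_peek_head,
            .owner          = THIS_MODULE,
    };

    /* module init/exit then reduce to:
     *      register_qdisc(&sketch_fifo_qdisc_ops);
     *      unregister_qdisc(&sketch_fifo_qdisc_ops);
     */
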
+ */ + +static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) +{ + struct Qdisc *q; + + if (!(root->flags & TCQ_F_BUILTIN) && + root->handle == handle) + return root; + + list_for_each_entry(q, &root->list, list) { + if (q->handle == handle) + return q; + } + return NULL; +} + +static void qdisc_list_add(struct Qdisc *q) +{ + if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) + list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list); +} + +void qdisc_list_del(struct Qdisc *q) +{ + if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) + list_del(&q->list); +} +EXPORT_SYMBOL(qdisc_list_del); + +struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) +{ + struct Qdisc *q; + + q = qdisc_match_from_root(dev->qdisc, handle); + if (q) + goto out; + + if (dev_ingress_queue(dev)) + q = qdisc_match_from_root( + dev_ingress_queue(dev)->qdisc_sleeping, + handle); +out: + return q; +} + +static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) +{ + unsigned long cl; + struct Qdisc *leaf; + const struct Qdisc_class_ops *cops = p->ops->cl_ops; + + if (cops == NULL) + return NULL; + cl = cops->get(p, classid); + + if (cl == 0) + return NULL; + leaf = cops->leaf(p, cl); + cops->put(p, cl); + return leaf; +} + +/* Find queueing discipline by name */ + +static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind) +{ + struct Qdisc_ops *q = NULL; + + if (kind) { + read_lock(&qdisc_mod_lock); + for (q = qdisc_base; q; q = q->next) { + if (nla_strcmp(kind, q->id) == 0) { + if (!try_module_get(q->owner)) + q = NULL; + break; + } + } + read_unlock(&qdisc_mod_lock); + } + return q; +} + +static struct qdisc_rate_table *qdisc_rtab_list; + +struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab) +{ + struct qdisc_rate_table *rtab; + + for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { + if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) { + rtab->refcnt++; + return rtab; + } + } + + if (tab == NULL || r->rate == 0 || r->cell_log == 0 || + nla_len(tab) != TC_RTAB_SIZE) + return NULL; + + rtab = kmalloc(sizeof(*rtab), GFP_KERNEL); + if (rtab) { + rtab->rate = *r; + rtab->refcnt = 1; + memcpy(rtab->data, nla_data(tab), 1024); + rtab->next = qdisc_rtab_list; + qdisc_rtab_list = rtab; + } + return rtab; +} +EXPORT_SYMBOL(qdisc_get_rtab); + +void qdisc_put_rtab(struct qdisc_rate_table *tab) +{ + struct qdisc_rate_table *rtab, **rtabp; + + if (!tab || --tab->refcnt) + return; + + for (rtabp = &qdisc_rtab_list; + (rtab = *rtabp) != NULL; + rtabp = &rtab->next) { + if (rtab == tab) { + *rtabp = rtab->next; + kfree(rtab); + return; + } + } +} +EXPORT_SYMBOL(qdisc_put_rtab); + +static LIST_HEAD(qdisc_stab_list); +static DEFINE_SPINLOCK(qdisc_stab_lock); + +static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = { + [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) }, + [TCA_STAB_DATA] = { .type = NLA_BINARY }, +}; + +static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt) +{ + struct nlattr *tb[TCA_STAB_MAX + 1]; + struct qdisc_size_table *stab; + struct tc_sizespec *s; + unsigned int tsize = 0; + u16 *tab = NULL; + int err; + + err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy); + if (err < 0) + return ERR_PTR(err); + if (!tb[TCA_STAB_BASE]) + return ERR_PTR(-EINVAL); + + s = nla_data(tb[TCA_STAB_BASE]); + + if (s->tsize > 0) { + if (!tb[TCA_STAB_DATA]) + return ERR_PTR(-EINVAL); + tab = nla_data(tb[TCA_STAB_DATA]); + tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16); + } + + if (tsize != s->tsize || (!tab && 
tsize > 0)) + return ERR_PTR(-EINVAL); + + spin_lock(&qdisc_stab_lock); + + list_for_each_entry(stab, &qdisc_stab_list, list) { + if (memcmp(&stab->szopts, s, sizeof(*s))) + continue; + if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16))) + continue; + stab->refcnt++; + spin_unlock(&qdisc_stab_lock); + return stab; + } + + spin_unlock(&qdisc_stab_lock); + + stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL); + if (!stab) + return ERR_PTR(-ENOMEM); + + stab->refcnt = 1; + stab->szopts = *s; + if (tsize > 0) + memcpy(stab->data, tab, tsize * sizeof(u16)); + + spin_lock(&qdisc_stab_lock); + list_add_tail(&stab->list, &qdisc_stab_list); + spin_unlock(&qdisc_stab_lock); + + return stab; +} + +static void stab_kfree_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct qdisc_size_table, rcu)); +} + +void qdisc_put_stab(struct qdisc_size_table *tab) +{ + if (!tab) + return; + + spin_lock(&qdisc_stab_lock); + + if (--tab->refcnt == 0) { + list_del(&tab->list); + call_rcu_bh(&tab->rcu, stab_kfree_rcu); + } + + spin_unlock(&qdisc_stab_lock); +} +EXPORT_SYMBOL(qdisc_put_stab); + +static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, TCA_STAB); + if (nest == NULL) + goto nla_put_failure; + NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts); + nla_nest_end(skb, nest); + + return skb->len; + +nla_put_failure: + return -1; +} + +void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab) +{ + int pkt_len, slot; + + pkt_len = skb->len + stab->szopts.overhead; + if (unlikely(!stab->szopts.tsize)) + goto out; + + slot = pkt_len + stab->szopts.cell_align; + if (unlikely(slot < 0)) + slot = 0; + + slot >>= stab->szopts.cell_log; + if (likely(slot < stab->szopts.tsize)) + pkt_len = stab->data[slot]; + else + pkt_len = stab->data[stab->szopts.tsize - 1] * + (slot / stab->szopts.tsize) + + stab->data[slot % stab->szopts.tsize]; + + pkt_len <<= stab->szopts.size_log; +out: + if (unlikely(pkt_len < 1)) + pkt_len = 1; + qdisc_skb_cb(skb)->pkt_len = pkt_len; +} +EXPORT_SYMBOL(__qdisc_calculate_pkt_len); + +void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc) +{ + if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { + pr_warn("%s: %s qdisc %X: is non-work-conserving?\n", + txt, qdisc->ops->id, qdisc->handle >> 16); + qdisc->flags |= TCQ_F_WARN_NONWC; + } +} +EXPORT_SYMBOL(qdisc_warn_nonwc); + +static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) +{ + struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog, + timer); + + qdisc_unthrottled(wd->qdisc); + __netif_schedule(qdisc_root(wd->qdisc)); + + return HRTIMER_NORESTART; +} + +void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) +{ + hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + wd->timer.function = qdisc_watchdog; + wd->qdisc = qdisc; +} +EXPORT_SYMBOL(qdisc_watchdog_init); + +void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires) +{ + ktime_t time; + + if (test_bit(__QDISC_STATE_DEACTIVATED, + &qdisc_root_sleeping(wd->qdisc)->state)) + return; + + qdisc_throttled(wd->qdisc); + time = ktime_set(0, 0); + time = ktime_add_ns(time, PSCHED_TICKS2NS(expires)); + hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS); +} +EXPORT_SYMBOL(qdisc_watchdog_schedule); + +void qdisc_watchdog_cancel(struct qdisc_watchdog *wd) +{ + hrtimer_cancel(&wd->timer); + qdisc_unthrottled(wd->qdisc); +} +EXPORT_SYMBOL(qdisc_watchdog_cancel); + +static struct 
hlist_head *qdisc_class_hash_alloc(unsigned int n) +{ + unsigned int size = n * sizeof(struct hlist_head), i; + struct hlist_head *h; + + if (size <= PAGE_SIZE) + h = kmalloc(size, GFP_KERNEL); + else + h = (struct hlist_head *) + __get_free_pages(GFP_KERNEL, get_order(size)); + + if (h != NULL) { + for (i = 0; i < n; i++) + INIT_HLIST_HEAD(&h[i]); + } + return h; +} + +static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n) +{ + unsigned int size = n * sizeof(struct hlist_head); + + if (size <= PAGE_SIZE) + kfree(h); + else + free_pages((unsigned long)h, get_order(size)); +} + +void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash) +{ + struct Qdisc_class_common *cl; + struct hlist_node *n, *next; + struct hlist_head *nhash, *ohash; + unsigned int nsize, nmask, osize; + unsigned int i, h; + + /* Rehash when load factor exceeds 0.75 */ + if (clhash->hashelems * 4 <= clhash->hashsize * 3) + return; + nsize = clhash->hashsize * 2; + nmask = nsize - 1; + nhash = qdisc_class_hash_alloc(nsize); + if (nhash == NULL) + return; + + ohash = clhash->hash; + osize = clhash->hashsize; + + sch_tree_lock(sch); + for (i = 0; i < osize; i++) { + hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) { + h = qdisc_class_hash(cl->classid, nmask); + hlist_add_head(&cl->hnode, &nhash[h]); + } + } + clhash->hash = nhash; + clhash->hashsize = nsize; + clhash->hashmask = nmask; + sch_tree_unlock(sch); + + qdisc_class_hash_free(ohash, osize); +} +EXPORT_SYMBOL(qdisc_class_hash_grow); + +int qdisc_class_hash_init(struct Qdisc_class_hash *clhash) +{ + unsigned int size = 4; + + clhash->hash = qdisc_class_hash_alloc(size); + if (clhash->hash == NULL) + return -ENOMEM; + clhash->hashsize = size; + clhash->hashmask = size - 1; + clhash->hashelems = 0; + return 0; +} +EXPORT_SYMBOL(qdisc_class_hash_init); + +void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash) +{ + qdisc_class_hash_free(clhash->hash, clhash->hashsize); +} +EXPORT_SYMBOL(qdisc_class_hash_destroy); + +void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash, + struct Qdisc_class_common *cl) +{ + unsigned int h; + + INIT_HLIST_NODE(&cl->hnode); + h = qdisc_class_hash(cl->classid, clhash->hashmask); + hlist_add_head(&cl->hnode, &clhash->hash[h]); + clhash->hashelems++; +} +EXPORT_SYMBOL(qdisc_class_hash_insert); + +void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash, + struct Qdisc_class_common *cl) +{ + hlist_del(&cl->hnode); + clhash->hashelems--; +} +EXPORT_SYMBOL(qdisc_class_hash_remove); + +/* Allocate an unique handle from space managed by kernel */ + +static u32 qdisc_alloc_handle(struct net_device *dev) +{ + int i = 0x10000; + static u32 autohandle = TC_H_MAKE(0x80000000U, 0); + + do { + autohandle += TC_H_MAKE(0x10000U, 0); + if (autohandle == TC_H_MAKE(TC_H_ROOT, 0)) + autohandle = TC_H_MAKE(0x80000000U, 0); + } while (qdisc_lookup(dev, autohandle) && --i > 0); + + return i > 0 ? 
autohandle : 0; +} + +void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) +{ + const struct Qdisc_class_ops *cops; + unsigned long cl; + u32 parentid; + + if (n == 0) + return; + while ((parentid = sch->parent)) { + if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS)) + return; + + sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid)); + if (sch == NULL) { + WARN_ON(parentid != TC_H_ROOT); + return; + } + cops = sch->ops->cl_ops; + if (cops->qlen_notify) { + cl = cops->get(sch, parentid); + cops->qlen_notify(sch, cl); + cops->put(sch, cl); + } + sch->q.qlen -= n; + } +} +EXPORT_SYMBOL(qdisc_tree_decrease_qlen); + +static void notify_and_destroy(struct net *net, struct sk_buff *skb, + struct nlmsghdr *n, u32 clid, + struct Qdisc *old, struct Qdisc *new) +{ + if (new || old) + qdisc_notify(net, skb, n, clid, old, new); + + if (old) + qdisc_destroy(old); +} + +/* Graft qdisc "new" to class "classid" of qdisc "parent" or + * to device "dev". + * + * When appropriate send a netlink notification using 'skb' + * and "n". + * + * On success, destroy old qdisc. + */ + +static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, + struct sk_buff *skb, struct nlmsghdr *n, u32 classid, + struct Qdisc *new, struct Qdisc *old) +{ + struct Qdisc *q = old; + struct net *net = dev_net(dev); + int err = 0; + + if (parent == NULL) { + unsigned int i, num_q, ingress; + + ingress = 0; + num_q = dev->num_tx_queues; + if ((q && q->flags & TCQ_F_INGRESS) || + (new && new->flags & TCQ_F_INGRESS)) { + num_q = 1; + ingress = 1; + if (!dev_ingress_queue(dev)) + return -ENOENT; + } + + if (dev->flags & IFF_UP) + dev_deactivate(dev); + + if (new && new->ops->attach) { + new->ops->attach(new); + num_q = 0; + } + + for (i = 0; i < num_q; i++) { + struct netdev_queue *dev_queue = dev_ingress_queue(dev); + + if (!ingress) + dev_queue = netdev_get_tx_queue(dev, i); + + old = dev_graft_qdisc(dev_queue, new); + if (new && i > 0) + atomic_inc(&new->refcnt); + + if (!ingress) + qdisc_destroy(old); + } + + if (!ingress) { + notify_and_destroy(net, skb, n, classid, + dev->qdisc, new); + if (new && !new->ops->attach) + atomic_inc(&new->refcnt); + dev->qdisc = new ? : &noop_qdisc; + } else { + notify_and_destroy(net, skb, n, classid, old, new); + } + + if (dev->flags & IFF_UP) + dev_activate(dev); + } else { + const struct Qdisc_class_ops *cops = parent->ops->cl_ops; + + err = -EOPNOTSUPP; + if (cops && cops->graft) { + unsigned long cl = cops->get(parent, classid); + if (cl) { + err = cops->graft(parent, cl, new, &old); + cops->put(parent, cl); + } else + err = -ENOENT; + } + if (!err) + notify_and_destroy(net, skb, n, classid, old, new); + } + return err; +} + +/* lockdep annotation is needed for ingress; egress gets it only for name */ +static struct lock_class_key qdisc_tx_lock; +static struct lock_class_key qdisc_rx_lock; + +/* + Allocate and initialize new qdisc. + + Parameters are passed via opt. + */ + +static struct Qdisc * +qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, + struct Qdisc *p, u32 parent, u32 handle, + struct nlattr **tca, int *errp) +{ + int err; + struct nlattr *kind = tca[TCA_KIND]; + struct Qdisc *sch; + struct Qdisc_ops *ops; + struct qdisc_size_table *stab; + + ops = qdisc_lookup_ops(kind); +#ifdef CONFIG_MODULES + if (ops == NULL && kind != NULL) { + char name[IFNAMSIZ]; + if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { + /* We dropped the RTNL semaphore in order to + * perform the module load. 
So, even if we + * succeeded in loading the module we have to + * tell the caller to replay the request. We + * indicate this using -EAGAIN. + * We replay the request because the device may + * go away in the mean time. + */ + rtnl_unlock(); + request_module("sch_%s", name); + rtnl_lock(); + ops = qdisc_lookup_ops(kind); + if (ops != NULL) { + /* We will try again qdisc_lookup_ops, + * so don't keep a reference. + */ + module_put(ops->owner); + err = -EAGAIN; + goto err_out; + } + } + } +#endif + + err = -ENOENT; + if (ops == NULL) + goto err_out; + + sch = qdisc_alloc(dev_queue, ops); + if (IS_ERR(sch)) { + err = PTR_ERR(sch); + goto err_out2; + } + + sch->parent = parent; + + if (handle == TC_H_INGRESS) { + sch->flags |= TCQ_F_INGRESS; + handle = TC_H_MAKE(TC_H_INGRESS, 0); + lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock); + } else { + if (handle == 0) { + handle = qdisc_alloc_handle(dev); + err = -ENOMEM; + if (handle == 0) + goto err_out3; + } + lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock); + } + + sch->handle = handle; + + if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) { + if (tca[TCA_STAB]) { + stab = qdisc_get_stab(tca[TCA_STAB]); + if (IS_ERR(stab)) { + err = PTR_ERR(stab); + goto err_out4; + } + rcu_assign_pointer(sch->stab, stab); + } + if (tca[TCA_RATE]) { + spinlock_t *root_lock; + + err = -EOPNOTSUPP; + if (sch->flags & TCQ_F_MQROOT) + goto err_out4; + + if ((sch->parent != TC_H_ROOT) && + !(sch->flags & TCQ_F_INGRESS) && + (!p || !(p->flags & TCQ_F_MQROOT))) + root_lock = qdisc_root_sleeping_lock(sch); + else + root_lock = qdisc_lock(sch); + + err = gen_new_estimator(&sch->bstats, &sch->rate_est, + root_lock, tca[TCA_RATE]); + if (err) + goto err_out4; + } + + qdisc_list_add(sch); + + return sch; + } +err_out3: + dev_put(dev); + kfree((char *) sch - sch->padded); +err_out2: + module_put(ops->owner); +err_out: + *errp = err; + return NULL; + +err_out4: + /* + * Any broken qdiscs that would require a ops->reset() here? + * The qdisc was never in action so it shouldn't be necessary. + */ + qdisc_put_stab(rtnl_dereference(sch->stab)); + if (ops->destroy) + ops->destroy(sch); + goto err_out3; +} + +static int qdisc_change(struct Qdisc *sch, struct nlattr **tca) +{ + struct qdisc_size_table *ostab, *stab = NULL; + int err = 0; + + if (tca[TCA_OPTIONS]) { + if (sch->ops->change == NULL) + return -EINVAL; + err = sch->ops->change(sch, tca[TCA_OPTIONS]); + if (err) + return err; + } + + if (tca[TCA_STAB]) { + stab = qdisc_get_stab(tca[TCA_STAB]); + if (IS_ERR(stab)) + return PTR_ERR(stab); + } + + ostab = rtnl_dereference(sch->stab); + rcu_assign_pointer(sch->stab, stab); + qdisc_put_stab(ostab); + + if (tca[TCA_RATE]) { + /* NB: ignores errors from replace_estimator + because change can't be undone. */ + if (sch->flags & TCQ_F_MQROOT) + goto out; + gen_replace_estimator(&sch->bstats, &sch->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + } +out: + return 0; +} + +struct check_loop_arg { + struct qdisc_walker w; + struct Qdisc *p; + int depth; +}; + +static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w); + +static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth) +{ + struct check_loop_arg arg; + + if (q->ops->cl_ops == NULL) + return 0; + + arg.w.stop = arg.w.skip = arg.w.count = 0; + arg.w.fn = check_loop_fn; + arg.depth = depth; + arg.p = p; + q->ops->cl_ops->walk(q, &arg.w); + return arg.w.stop ? 
-ELOOP : 0; +} + +static int +check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) +{ + struct Qdisc *leaf; + const struct Qdisc_class_ops *cops = q->ops->cl_ops; + struct check_loop_arg *arg = (struct check_loop_arg *)w; + + leaf = cops->leaf(q, cl); + if (leaf) { + if (leaf == arg->p || arg->depth > 7) + return -ELOOP; + return check_loop(leaf, arg->p, arg->depth + 1); + } + return 0; +} + +/* + * Delete/get qdisc. + */ + +static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct tcmsg *tcm = NLMSG_DATA(n); + struct nlattr *tca[TCA_MAX + 1]; + struct net_device *dev; + u32 clid = tcm->tcm_parent; + struct Qdisc *q = NULL; + struct Qdisc *p = NULL; + int err; + + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return -ENODEV; + + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); + if (err < 0) + return err; + + if (clid) { + if (clid != TC_H_ROOT) { + if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) { + p = qdisc_lookup(dev, TC_H_MAJ(clid)); + if (!p) + return -ENOENT; + q = qdisc_leaf(p, clid); + } else if (dev_ingress_queue(dev)) { + q = dev_ingress_queue(dev)->qdisc_sleeping; + } + } else { + q = dev->qdisc; + } + if (!q) + return -ENOENT; + + if (tcm->tcm_handle && q->handle != tcm->tcm_handle) + return -EINVAL; + } else { + q = qdisc_lookup(dev, tcm->tcm_handle); + if (!q) + return -ENOENT; + } + + if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) + return -EINVAL; + + if (n->nlmsg_type == RTM_DELQDISC) { + if (!clid) + return -EINVAL; + if (q->handle == 0) + return -ENOENT; + err = qdisc_graft(dev, p, skb, n, clid, NULL, q); + if (err != 0) + return err; + } else { + qdisc_notify(net, skb, n, clid, NULL, q); + } + return 0; +} + +/* + * Create/change qdisc. + */ + +static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct tcmsg *tcm; + struct nlattr *tca[TCA_MAX + 1]; + struct net_device *dev; + u32 clid; + struct Qdisc *q, *p; + int err; + +replay: + /* Reinit, just in case something touches this. */ + tcm = NLMSG_DATA(n); + clid = tcm->tcm_parent; + q = p = NULL; + + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return -ENODEV; + + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); + if (err < 0) + return err; + + if (clid) { + if (clid != TC_H_ROOT) { + if (clid != TC_H_INGRESS) { + p = qdisc_lookup(dev, TC_H_MAJ(clid)); + if (!p) + return -ENOENT; + q = qdisc_leaf(p, clid); + } else if (dev_ingress_queue_create(dev)) { + q = dev_ingress_queue(dev)->qdisc_sleeping; + } + } else { + q = dev->qdisc; + } + + /* It may be default qdisc, ignore it */ + if (q && q->handle == 0) + q = NULL; + + if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) { + if (tcm->tcm_handle) { + if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) + return -EEXIST; + if (TC_H_MIN(tcm->tcm_handle)) + return -EINVAL; + q = qdisc_lookup(dev, tcm->tcm_handle); + if (!q) + goto create_n_graft; + if (n->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) + return -EINVAL; + if (q == p || + (p && check_loop(q, p, 0))) + return -ELOOP; + atomic_inc(&q->refcnt); + goto graft; + } else { + if (!q) + goto create_n_graft; + + /* This magic test requires explanation. + * + * We know, that some child q is already + * attached to this parent and have choice: + * either to change it or to create/graft new one. + * + * 1. 
We are allowed to create/graft only + * if CREATE and REPLACE flags are set. + * + * 2. If EXCL is set, requestor wanted to say, + * that qdisc tcm_handle is not expected + * to exist, so that we choose create/graft too. + * + * 3. The last case is when no flags are set. + * Alas, it is sort of hole in API, we + * cannot decide what to do unambiguously. + * For now we select create/graft, if + * user gave KIND, which does not match existing. + */ + if ((n->nlmsg_flags & NLM_F_CREATE) && + (n->nlmsg_flags & NLM_F_REPLACE) && + ((n->nlmsg_flags & NLM_F_EXCL) || + (tca[TCA_KIND] && + nla_strcmp(tca[TCA_KIND], q->ops->id)))) + goto create_n_graft; + } + } + } else { + if (!tcm->tcm_handle) + return -EINVAL; + q = qdisc_lookup(dev, tcm->tcm_handle); + } + + /* Change qdisc parameters */ + if (q == NULL) + return -ENOENT; + if (n->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) + return -EINVAL; + err = qdisc_change(q, tca); + if (err == 0) + qdisc_notify(net, skb, n, clid, NULL, q); + return err; + +create_n_graft: + if (!(n->nlmsg_flags & NLM_F_CREATE)) + return -ENOENT; + if (clid == TC_H_INGRESS) { + if (dev_ingress_queue(dev)) + q = qdisc_create(dev, dev_ingress_queue(dev), p, + tcm->tcm_parent, tcm->tcm_parent, + tca, &err); + else + err = -ENOENT; + } else { + struct netdev_queue *dev_queue; + + if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue) + dev_queue = p->ops->cl_ops->select_queue(p, tcm); + else if (p) + dev_queue = p->dev_queue; + else + dev_queue = netdev_get_tx_queue(dev, 0); + + q = qdisc_create(dev, dev_queue, p, + tcm->tcm_parent, tcm->tcm_handle, + tca, &err); + } + if (q == NULL) { + if (err == -EAGAIN) + goto replay; + return err; + } + +graft: + err = qdisc_graft(dev, p, skb, n, clid, q, NULL); + if (err) { + if (q) + qdisc_destroy(q); + return err; + } + + return 0; +} + +static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, + u32 pid, u32 seq, u16 flags, int event) +{ + struct tcmsg *tcm; + struct nlmsghdr *nlh; + unsigned char *b = skb_tail_pointer(skb); + struct gnet_dump d; + struct qdisc_size_table *stab; + + nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); + tcm = NLMSG_DATA(nlh); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm__pad1 = 0; + tcm->tcm__pad2 = 0; + tcm->tcm_ifindex = qdisc_dev(q)->ifindex; + tcm->tcm_parent = clid; + tcm->tcm_handle = q->handle; + tcm->tcm_info = atomic_read(&q->refcnt); + NLA_PUT_STRING(skb, TCA_KIND, q->ops->id); + if (q->ops->dump && q->ops->dump(q, skb) < 0) + goto nla_put_failure; + q->qstats.qlen = q->q.qlen; + + stab = rtnl_dereference(q->stab); + if (stab && qdisc_dump_stab(skb, stab) < 0) + goto nla_put_failure; + + if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, + qdisc_root_sleeping_lock(q), &d) < 0) + goto nla_put_failure; + + if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) + goto nla_put_failure; + + if (gnet_stats_copy_basic(&d, &q->bstats) < 0 || + gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 || + gnet_stats_copy_queue(&d, &q->qstats) < 0) + goto nla_put_failure; + + if (gnet_stats_finish_copy(&d) < 0) + goto nla_put_failure; + + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; + +nlmsg_failure: +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static bool tc_qdisc_dump_ignore(struct Qdisc *q) +{ + return (q->flags & TCQ_F_BUILTIN) ? 
true : false; +} + +static int qdisc_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, u32 clid, + struct Qdisc *old, struct Qdisc *new) +{ + struct sk_buff *skb; + u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (old && !tc_qdisc_dump_ignore(old)) { + if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, + 0, RTM_DELQDISC) < 0) + goto err_out; + } + if (new && !tc_qdisc_dump_ignore(new)) { + if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, + old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) + goto err_out; + } + + if (skb->len) + return rtnetlink_send(skb, net, pid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); + +err_out: + kfree_skb(skb); + return -EINVAL; +} + +static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, + struct netlink_callback *cb, + int *q_idx_p, int s_q_idx) +{ + int ret = 0, q_idx = *q_idx_p; + struct Qdisc *q; + + if (!root) + return 0; + + q = root; + if (q_idx < s_q_idx) { + q_idx++; + } else { + if (!tc_qdisc_dump_ignore(q) && + tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) + goto done; + q_idx++; + } + list_for_each_entry(q, &root->list, list) { + if (q_idx < s_q_idx) { + q_idx++; + continue; + } + if (!tc_qdisc_dump_ignore(q) && + tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) + goto done; + q_idx++; + } + +out: + *q_idx_p = q_idx; + return ret; +done: + ret = -1; + goto out; +} + +static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + int idx, q_idx; + int s_idx, s_q_idx; + struct net_device *dev; + + s_idx = cb->args[0]; + s_q_idx = q_idx = cb->args[1]; + + rcu_read_lock(); + idx = 0; + for_each_netdev_rcu(net, dev) { + struct netdev_queue *dev_queue; + + if (idx < s_idx) + goto cont; + if (idx > s_idx) + s_q_idx = 0; + q_idx = 0; + + if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0) + goto done; + + dev_queue = dev_ingress_queue(dev); + if (dev_queue && + tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, + &q_idx, s_q_idx) < 0) + goto done; + +cont: + idx++; + } + +done: + rcu_read_unlock(); + + cb->args[0] = idx; + cb->args[1] = q_idx; + + return skb->len; +} + + + +/************************************************ + * Traffic classes manipulation. * + ************************************************/ + + + +static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct tcmsg *tcm = NLMSG_DATA(n); + struct nlattr *tca[TCA_MAX + 1]; + struct net_device *dev; + struct Qdisc *q = NULL; + const struct Qdisc_class_ops *cops; + unsigned long cl = 0; + unsigned long new_cl; + u32 pid = tcm->tcm_parent; + u32 clid = tcm->tcm_handle; + u32 qid = TC_H_MAJ(clid); + int err; + + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return -ENODEV; + + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); + if (err < 0) + return err; + + /* + parent == TC_H_UNSPEC - unspecified parent. + parent == TC_H_ROOT - class is root, which has no parent. + parent == X:0 - parent is root class. + parent == X:Y - parent is a node in hierarchy. + parent == 0:Y - parent is X:Y, where X:0 is qdisc. + + handle == 0:0 - generate handle from kernel pool. + handle == 0:Y - class is X:Y, where X:0 is qdisc. + handle == X:Y - clear. + handle == X:0 - root class. + */ + + /* Step 1. 
Determine qdisc handle X:0 */ + + if (pid != TC_H_ROOT) { + u32 qid1 = TC_H_MAJ(pid); + + if (qid && qid1) { + /* If both majors are known, they must be identical. */ + if (qid != qid1) + return -EINVAL; + } else if (qid1) { + qid = qid1; + } else if (qid == 0) + qid = dev->qdisc->handle; + + /* Now qid is genuine qdisc handle consistent + * both with parent and child. + * + * TC_H_MAJ(pid) still may be unspecified, complete it now. + */ + if (pid) + pid = TC_H_MAKE(qid, pid); + } else { + if (qid == 0) + qid = dev->qdisc->handle; + } + + /* OK. Locate qdisc */ + q = qdisc_lookup(dev, qid); + if (!q) + return -ENOENT; + + /* An check that it supports classes */ + cops = q->ops->cl_ops; + if (cops == NULL) + return -EINVAL; + + /* Now try to get class */ + if (clid == 0) { + if (pid == TC_H_ROOT) + clid = qid; + } else + clid = TC_H_MAKE(qid, clid); + + if (clid) + cl = cops->get(q, clid); + + if (cl == 0) { + err = -ENOENT; + if (n->nlmsg_type != RTM_NEWTCLASS || + !(n->nlmsg_flags & NLM_F_CREATE)) + goto out; + } else { + switch (n->nlmsg_type) { + case RTM_NEWTCLASS: + err = -EEXIST; + if (n->nlmsg_flags & NLM_F_EXCL) + goto out; + break; + case RTM_DELTCLASS: + err = -EOPNOTSUPP; + if (cops->delete) + err = cops->delete(q, cl); + if (err == 0) + tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS); + goto out; + case RTM_GETTCLASS: + err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS); + goto out; + default: + err = -EINVAL; + goto out; + } + } + + new_cl = cl; + err = -EOPNOTSUPP; + if (cops->change) + err = cops->change(q, clid, pid, tca, &new_cl); + if (err == 0) + tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS); + +out: + if (cl) + cops->put(q, cl); + + return err; +} + + +static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, + unsigned long cl, + u32 pid, u32 seq, u16 flags, int event) +{ + struct tcmsg *tcm; + struct nlmsghdr *nlh; + unsigned char *b = skb_tail_pointer(skb); + struct gnet_dump d; + const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; + + nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); + tcm = NLMSG_DATA(nlh); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm__pad1 = 0; + tcm->tcm__pad2 = 0; + tcm->tcm_ifindex = qdisc_dev(q)->ifindex; + tcm->tcm_parent = q->handle; + tcm->tcm_handle = q->handle; + tcm->tcm_info = 0; + NLA_PUT_STRING(skb, TCA_KIND, q->ops->id); + if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0) + goto nla_put_failure; + + if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, + qdisc_root_sleeping_lock(q), &d) < 0) + goto nla_put_failure; + + if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0) + goto nla_put_failure; + + if (gnet_stats_finish_copy(&d) < 0) + goto nla_put_failure; + + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; + +nlmsg_failure: +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static int tclass_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, struct Qdisc *q, + unsigned long cl, int event) +{ + struct sk_buff *skb; + u32 pid = oskb ? 
NETLINK_CB(oskb).pid : 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) { + kfree_skb(skb); + return -EINVAL; + } + + return rtnetlink_send(skb, net, pid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); +} + +struct qdisc_dump_args { + struct qdisc_walker w; + struct sk_buff *skb; + struct netlink_callback *cb; +}; + +static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg) +{ + struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg; + + return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid, + a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS); +} + +static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb, + struct tcmsg *tcm, struct netlink_callback *cb, + int *t_p, int s_t) +{ + struct qdisc_dump_args arg; + + if (tc_qdisc_dump_ignore(q) || + *t_p < s_t || !q->ops->cl_ops || + (tcm->tcm_parent && + TC_H_MAJ(tcm->tcm_parent) != q->handle)) { + (*t_p)++; + return 0; + } + if (*t_p > s_t) + memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); + arg.w.fn = qdisc_class_dump; + arg.skb = skb; + arg.cb = cb; + arg.w.stop = 0; + arg.w.skip = cb->args[1]; + arg.w.count = 0; + q->ops->cl_ops->walk(q, &arg.w); + cb->args[1] = arg.w.count; + if (arg.w.stop) + return -1; + (*t_p)++; + return 0; +} + +static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, + struct tcmsg *tcm, struct netlink_callback *cb, + int *t_p, int s_t) +{ + struct Qdisc *q; + + if (!root) + return 0; + + if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0) + return -1; + + list_for_each_entry(q, &root->list, list) { + if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0) + return -1; + } + + return 0; +} + +static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh); + struct net *net = sock_net(skb->sk); + struct netdev_queue *dev_queue; + struct net_device *dev; + int t, s_t; + + if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) + return 0; + dev = dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return 0; + + s_t = cb->args[0]; + t = 0; + + if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0) + goto done; + + dev_queue = dev_ingress_queue(dev); + if (dev_queue && + tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, + &t, s_t) < 0) + goto done; + +done: + cb->args[0] = t; + + dev_put(dev); + return skb->len; +} + +/* Main classifier routine: scans classifier chain attached + * to this qdisc, (optionally) tests for protocol and asks + * specific classifiers. 
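As a reading aid for the parent/handle bookkeeping in tc_ctl_tclass above: a tc handle is a 32-bit value split into a 16-bit major (qdisc) part and a 16-bit minor (class) part. A standalone sketch of that arithmetic follows; the masks mirror the TC_H_* macros from <linux/pkt_sched.h>, and the example values are made up.

#include <stdio.h>
#include <stdint.h>

#define TC_H_MAJ_MASK 0xFFFF0000U
#define TC_H_MIN_MASK 0x0000FFFFU
#define TC_H_MAJ(h)   ((h) & TC_H_MAJ_MASK)
#define TC_H_MIN(h)   ((h) & TC_H_MIN_MASK)
#define TC_H_MAKE(maj, min) (((maj) & TC_H_MAJ_MASK) | ((min) & TC_H_MIN_MASK))

int main(void)
{
	uint32_t qdisc = 0x00010000;            /* qdisc "1:"  (X:0) */
	uint32_t cls   = TC_H_MAKE(qdisc, 0x2); /* class "1:2" (X:Y) */

	printf("qdisc %x:, class %x:%x\n",
	       TC_H_MAJ(qdisc) >> 16, TC_H_MAJ(cls) >> 16, TC_H_MIN(cls));
	return 0;
}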
+ */ +int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + __be16 protocol = skb->protocol; + int err; + + for (; tp; tp = tp->next) { + if (tp->protocol != protocol && + tp->protocol != htons(ETH_P_ALL)) + continue; + err = tp->classify(skb, tp, res); + + if (err >= 0) { +#ifdef CONFIG_NET_CLS_ACT + if (err != TC_ACT_RECLASSIFY && skb->tc_verd) + skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0); +#endif + return err; + } + } + return -1; +} +EXPORT_SYMBOL(tc_classify_compat); + +int tc_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + int err = 0; +#ifdef CONFIG_NET_CLS_ACT + struct tcf_proto *otp = tp; +reclassify: +#endif + + err = tc_classify_compat(skb, tp, res); +#ifdef CONFIG_NET_CLS_ACT + if (err == TC_ACT_RECLASSIFY) { + u32 verd = G_TC_VERD(skb->tc_verd); + tp = otp; + + if (verd++ >= MAX_REC_LOOP) { + if (net_ratelimit()) + pr_notice("%s: packet reclassify loop" + " rule prio %u protocol %02x\n", + tp->q->ops->id, + tp->prio & 0xffff, + ntohs(tp->protocol)); + return TC_ACT_SHOT; + } + skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd); + goto reclassify; + } +#endif + return err; +} +EXPORT_SYMBOL(tc_classify); + +void tcf_destroy(struct tcf_proto *tp) +{ + tp->ops->destroy(tp); + module_put(tp->ops->owner); + kfree(tp); +} + +void tcf_destroy_chain(struct tcf_proto **fl) +{ + struct tcf_proto *tp; + + while ((tp = *fl) != NULL) { + *fl = tp->next; + tcf_destroy(tp); + } +} +EXPORT_SYMBOL(tcf_destroy_chain); + +#ifdef CONFIG_PROC_FS +static int psched_show(struct seq_file *seq, void *v) +{ + struct timespec ts; + + hrtimer_get_res(CLOCK_MONOTONIC, &ts); + seq_printf(seq, "%08x %08x %08x %08x\n", + (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1), + 1000000, + (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts))); + + return 0; +} + +static int psched_open(struct inode *inode, struct file *file) +{ + return single_open(file, psched_show, NULL); +} + +static const struct file_operations psched_fops = { + .owner = THIS_MODULE, + .open = psched_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __net_init psched_net_init(struct net *net) +{ + struct proc_dir_entry *e; + + e = proc_net_fops_create(net, "psched", 0, &psched_fops); + if (e == NULL) + return -ENOMEM; + + return 0; +} + +static void __net_exit psched_net_exit(struct net *net) +{ + proc_net_remove(net, "psched"); +} +#else +static int __net_init psched_net_init(struct net *net) +{ + return 0; +} + +static void __net_exit psched_net_exit(struct net *net) +{ +} +#endif + +static struct pernet_operations psched_net_ops = { + .init = psched_net_init, + .exit = psched_net_exit, +}; + +static int __init pktsched_init(void) +{ + int err; + + err = register_pernet_subsys(&psched_net_ops); + if (err) { + pr_err("pktsched_init: " + "cannot initialize per netns operations\n"); + return err; + } + + register_qdisc(&pfifo_qdisc_ops); + register_qdisc(&bfifo_qdisc_ops); + register_qdisc(&pfifo_head_drop_qdisc_ops); + register_qdisc(&mq_qdisc_ops); + + rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL); + rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL); + rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc); + rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL); + rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL); + rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass); + + return 0; +} + +subsys_initcall(pktsched_init); diff --git 
a/net/sched/sch_atm.c b/net/sched/sch_atm.c new file mode 100644 index 00000000..3f08158b --- /dev/null +++ b/net/sched/sch_atm.c @@ -0,0 +1,690 @@ +/* net/sched/sch_atm.c - ATM VC selection "queueing discipline" */ + +/* Written 1998-2000 by Werner Almesberger, EPFL ICA */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for fput */ +#include +#include + +extern struct socket *sockfd_lookup(int fd, int *err); /* @@@ fix this */ + +/* + * The ATM queuing discipline provides a framework for invoking classifiers + * (aka "filters"), which in turn select classes of this queuing discipline. + * Each class maps the flow(s) it is handling to a given VC. Multiple classes + * may share the same VC. + * + * When creating a class, VCs are specified by passing the number of the open + * socket descriptor by which the calling process references the VC. The kernel + * keeps the VC open at least until all classes using it are removed. + * + * In this file, most functions are named atm_tc_* to avoid confusion with all + * the atm_* in net/atm. This naming convention differs from what's used in the + * rest of net/sched. + * + * Known bugs: + * - sometimes messes up the IP stack + * - any manipulations besides the few operations described in the README, are + * untested and likely to crash the system + * - should lock the flow while there is data in the queue (?) + */ + +#define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back)) + +struct atm_flow_data { + struct Qdisc *q; /* FIFO, TBF, etc. */ + struct tcf_proto *filter_list; + struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */ + void (*old_pop)(struct atm_vcc *vcc, + struct sk_buff *skb); /* chaining */ + struct atm_qdisc_data *parent; /* parent qdisc */ + struct socket *sock; /* for closing */ + u32 classid; /* x:y type ID */ + int ref; /* reference count */ + struct gnet_stats_basic_packed bstats; + struct gnet_stats_queue qstats; + struct list_head list; + struct atm_flow_data *excess; /* flow for excess traffic; + NULL to set CLP instead */ + int hdr_len; + unsigned char hdr[0]; /* header data; MUST BE LAST */ +}; + +struct atm_qdisc_data { + struct atm_flow_data link; /* unclassified skbs go here */ + struct list_head flows; /* NB: "link" is also on this + list */ + struct tasklet_struct task; /* dequeue tasklet */ +}; + +/* ------------------------- Class/flow operations ------------------------- */ + +static inline struct atm_flow_data *lookup_flow(struct Qdisc *sch, u32 classid) +{ + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow; + + list_for_each_entry(flow, &p->flows, list) { + if (flow->classid == classid) + return flow; + } + return NULL; +} + +static int atm_tc_graft(struct Qdisc *sch, unsigned long arg, + struct Qdisc *new, struct Qdisc **old) +{ + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow = (struct atm_flow_data *)arg; + + pr_debug("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n", + sch, p, flow, new, old); + if (list_empty(&flow->list)) + return -EINVAL; + if (!new) + new = &noop_qdisc; + *old = flow->q; + flow->q = new; + if (*old) + qdisc_reset(*old); + return 0; +} + +static struct Qdisc *atm_tc_leaf(struct Qdisc *sch, unsigned long cl) +{ + struct atm_flow_data *flow = (struct atm_flow_data *)cl; + + pr_debug("atm_tc_leaf(sch %p,flow %p)\n", sch, flow); + return flow ? 
flow->q : NULL; +} + +static unsigned long atm_tc_get(struct Qdisc *sch, u32 classid) +{ + struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch); + struct atm_flow_data *flow; + + pr_debug("atm_tc_get(sch %p,[qdisc %p],classid %x)\n", sch, p, classid); + flow = lookup_flow(sch, classid); + if (flow) + flow->ref++; + pr_debug("atm_tc_get: flow %p\n", flow); + return (unsigned long)flow; +} + +static unsigned long atm_tc_bind_filter(struct Qdisc *sch, + unsigned long parent, u32 classid) +{ + return atm_tc_get(sch, classid); +} + +/* + * atm_tc_put handles all destructions, including the ones that are explicitly + * requested (atm_tc_destroy, etc.). The assumption here is that we never drop + * anything that still seems to be in use. + */ +static void atm_tc_put(struct Qdisc *sch, unsigned long cl) +{ + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow = (struct atm_flow_data *)cl; + + pr_debug("atm_tc_put(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); + if (--flow->ref) + return; + pr_debug("atm_tc_put: destroying\n"); + list_del_init(&flow->list); + pr_debug("atm_tc_put: qdisc %p\n", flow->q); + qdisc_destroy(flow->q); + tcf_destroy_chain(&flow->filter_list); + if (flow->sock) { + pr_debug("atm_tc_put: f_count %ld\n", + file_count(flow->sock->file)); + flow->vcc->pop = flow->old_pop; + sockfd_put(flow->sock); + } + if (flow->excess) + atm_tc_put(sch, (unsigned long)flow->excess); + if (flow != &p->link) + kfree(flow); + /* + * If flow == &p->link, the qdisc no longer works at this point and + * needs to be removed. (By the caller of atm_tc_put.) + */ +} + +static void sch_atm_pop(struct atm_vcc *vcc, struct sk_buff *skb) +{ + struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent; + + pr_debug("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n", vcc, skb, p); + VCC2FLOW(vcc)->old_pop(vcc, skb); + tasklet_schedule(&p->task); +} + +static const u8 llc_oui_ip[] = { + 0xaa, /* DSAP: non-ISO */ + 0xaa, /* SSAP: non-ISO */ + 0x03, /* Ctrl: Unnumbered Information Command PDU */ + 0x00, /* OUI: EtherType */ + 0x00, 0x00, + 0x08, 0x00 +}; /* Ethertype IP (0800) */ + +static const struct nla_policy atm_policy[TCA_ATM_MAX + 1] = { + [TCA_ATM_FD] = { .type = NLA_U32 }, + [TCA_ATM_EXCESS] = { .type = NLA_U32 }, +}; + +static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, + struct nlattr **tca, unsigned long *arg) +{ + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow = (struct atm_flow_data *)*arg; + struct atm_flow_data *excess = NULL; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_ATM_MAX + 1]; + struct socket *sock; + int fd, error, hdr_len; + void *hdr; + + pr_debug("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x," + "flow %p,opt %p)\n", sch, p, classid, parent, flow, opt); + /* + * The concept of parents doesn't apply for this qdisc. + */ + if (parent && parent != TC_H_ROOT && parent != sch->handle) + return -EINVAL; + /* + * ATM classes cannot be changed. In order to change properties of the + * ATM connection, that socket needs to be modified directly (via the + * native ATM API. In order to send a flow to a different VC, the old + * class needs to be removed and a new one added. (This may be changed + * later.) 
+ */ + if (flow) + return -EBUSY; + if (opt == NULL) + return -EINVAL; + + error = nla_parse_nested(tb, TCA_ATM_MAX, opt, atm_policy); + if (error < 0) + return error; + + if (!tb[TCA_ATM_FD]) + return -EINVAL; + fd = nla_get_u32(tb[TCA_ATM_FD]); + pr_debug("atm_tc_change: fd %d\n", fd); + if (tb[TCA_ATM_HDR]) { + hdr_len = nla_len(tb[TCA_ATM_HDR]); + hdr = nla_data(tb[TCA_ATM_HDR]); + } else { + hdr_len = RFC1483LLC_LEN; + hdr = NULL; /* default LLC/SNAP for IP */ + } + if (!tb[TCA_ATM_EXCESS]) + excess = NULL; + else { + excess = (struct atm_flow_data *) + atm_tc_get(sch, nla_get_u32(tb[TCA_ATM_EXCESS])); + if (!excess) + return -ENOENT; + } + pr_debug("atm_tc_change: type %d, payload %d, hdr_len %d\n", + opt->nla_type, nla_len(opt), hdr_len); + sock = sockfd_lookup(fd, &error); + if (!sock) + return error; /* f_count++ */ + pr_debug("atm_tc_change: f_count %ld\n", file_count(sock->file)); + if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) { + error = -EPROTOTYPE; + goto err_out; + } + /* @@@ should check if the socket is really operational or we'll crash + on vcc->send */ + if (classid) { + if (TC_H_MAJ(classid ^ sch->handle)) { + pr_debug("atm_tc_change: classid mismatch\n"); + error = -EINVAL; + goto err_out; + } + } else { + int i; + unsigned long cl; + + for (i = 1; i < 0x8000; i++) { + classid = TC_H_MAKE(sch->handle, 0x8000 | i); + cl = atm_tc_get(sch, classid); + if (!cl) + break; + atm_tc_put(sch, cl); + } + } + pr_debug("atm_tc_change: new id %x\n", classid); + flow = kzalloc(sizeof(struct atm_flow_data) + hdr_len, GFP_KERNEL); + pr_debug("atm_tc_change: flow %p\n", flow); + if (!flow) { + error = -ENOBUFS; + goto err_out; + } + flow->filter_list = NULL; + flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); + if (!flow->q) + flow->q = &noop_qdisc; + pr_debug("atm_tc_change: qdisc %p\n", flow->q); + flow->sock = sock; + flow->vcc = ATM_SD(sock); /* speedup */ + flow->vcc->user_back = flow; + pr_debug("atm_tc_change: vcc %p\n", flow->vcc); + flow->old_pop = flow->vcc->pop; + flow->parent = p; + flow->vcc->pop = sch_atm_pop; + flow->classid = classid; + flow->ref = 1; + flow->excess = excess; + list_add(&flow->list, &p->link.list); + flow->hdr_len = hdr_len; + if (hdr) + memcpy(flow->hdr, hdr, hdr_len); + else + memcpy(flow->hdr, llc_oui_ip, sizeof(llc_oui_ip)); + *arg = (unsigned long)flow; + return 0; +err_out: + if (excess) + atm_tc_put(sch, (unsigned long)excess); + sockfd_put(sock); + return error; +} + +static int atm_tc_delete(struct Qdisc *sch, unsigned long arg) +{ + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow = (struct atm_flow_data *)arg; + + pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); + if (list_empty(&flow->list)) + return -EINVAL; + if (flow->filter_list || flow == &p->link) + return -EBUSY; + /* + * Reference count must be 2: one for "keepalive" (set at class + * creation), and one for the reference held when calling delete. + */ + if (flow->ref < 2) { + pr_err("atm_tc_delete: flow->ref == %d\n", flow->ref); + return -EINVAL; + } + if (flow->ref > 2) + return -EBUSY; /* catch references via excess, etc. 
*/ + atm_tc_put(sch, arg); + return 0; +} + +static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow; + + pr_debug("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); + if (walker->stop) + return; + list_for_each_entry(flow, &p->flows, list) { + if (walker->count >= walker->skip && + walker->fn(sch, (unsigned long)flow, walker) < 0) { + walker->stop = 1; + break; + } + walker->count++; + } +} + +static struct tcf_proto **atm_tc_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow = (struct atm_flow_data *)cl; + + pr_debug("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); + return flow ? &flow->filter_list : &p->link.filter_list; +} + +/* --------------------------- Qdisc operations ---------------------------- */ + +static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow; + struct tcf_result res; + int result; + int ret = NET_XMIT_POLICED; + + pr_debug("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); + result = TC_POLICE_OK; /* be nice to gcc */ + flow = NULL; + if (TC_H_MAJ(skb->priority) != sch->handle || + !(flow = (struct atm_flow_data *)atm_tc_get(sch, skb->priority))) { + list_for_each_entry(flow, &p->flows, list) { + if (flow->filter_list) { + result = tc_classify_compat(skb, + flow->filter_list, + &res); + if (result < 0) + continue; + flow = (struct atm_flow_data *)res.class; + if (!flow) + flow = lookup_flow(sch, res.classid); + goto done; + } + } + flow = NULL; +done: + ; + } + if (!flow) { + flow = &p->link; + } else { + if (flow->vcc) + ATM_SKB(skb)->atm_options = flow->vcc->atm_options; + /*@@@ looks good ... but it's not supposed to work :-) */ +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_QUEUED: + case TC_ACT_STOLEN: + kfree_skb(skb); + return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + kfree_skb(skb); + goto drop; + case TC_POLICE_RECLASSIFY: + if (flow->excess) + flow = flow->excess; + else + ATM_SKB(skb)->atm_options |= ATM_ATMOPT_CLP; + break; + } +#endif + } + + ret = qdisc_enqueue(skb, flow->q); + if (ret != NET_XMIT_SUCCESS) { +drop: __maybe_unused + if (net_xmit_drop_count(ret)) { + sch->qstats.drops++; + if (flow) + flow->qstats.drops++; + } + return ret; + } + qdisc_bstats_update(sch, skb); + bstats_update(&flow->bstats, skb); + /* + * Okay, this may seem weird. We pretend we've dropped the packet if + * it goes via ATM. The reason for this is that the outer qdisc + * expects to be able to q->dequeue the packet later on if we return + * success at this place. Also, sch->q.qdisc needs to reflect whether + * there is a packet egligible for dequeuing or not. Note that the + * statistics of the outer qdisc are necessarily wrong because of all + * this. There's currently no correct solution for this. + */ + if (flow == &p->link) { + sch->q.qlen++; + return NET_XMIT_SUCCESS; + } + tasklet_schedule(&p->task); + return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; +} + +/* + * Dequeue packets and send them over ATM. Note that we quite deliberately + * avoid checking net_device's flow control here, simply because sch_atm + * uses its own channels, which have nothing to do with any CLIP/LANE/or + * non-ATM interfaces. 
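A note on the return-value juggling in atm_tc_enqueue above: qdisc enqueue results carry extra bits in the upper half of the code, so a parent can tell "child stole/bypassed the packet" apart from a real drop. The sketch below is illustrative only; the constant values mirror include/net/sch_generic.h of this kernel generation and should be treated as assumptions rather than an authoritative copy.

#include <stdio.h>

#define NET_XMIT_SUCCESS  0x00
#define NET_XMIT_DROP     0x01
#define __NET_XMIT_STOLEN 0x00010000
#define __NET_XMIT_BYPASS 0x00020000

/* A parent only counts a drop when the child did not "steal" the skb. */
#define net_xmit_drop_count(e) (((e) & __NET_XMIT_STOLEN) ? 0 : 1)

int main(void)
{
	int stolen = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
	int bypass = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	int drop   = NET_XMIT_DROP;

	printf("stolen -> drop_count %d\n", net_xmit_drop_count(stolen)); /* 0 */
	printf("bypass -> drop_count %d\n", net_xmit_drop_count(bypass)); /* 1, but success to the caller */
	printf("drop   -> drop_count %d\n", net_xmit_drop_count(drop));   /* 1 */
	return 0;
}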
+ */ + +static void sch_atm_dequeue(unsigned long data) +{ + struct Qdisc *sch = (struct Qdisc *)data; + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow; + struct sk_buff *skb; + + pr_debug("sch_atm_dequeue(sch %p,[qdisc %p])\n", sch, p); + list_for_each_entry(flow, &p->flows, list) { + if (flow == &p->link) + continue; + /* + * If traffic is properly shaped, this won't generate nasty + * little bursts. Otherwise, it may ... (but that's okay) + */ + while ((skb = flow->q->ops->peek(flow->q))) { + if (!atm_may_send(flow->vcc, skb->truesize)) + break; + + skb = qdisc_dequeue_peeked(flow->q); + if (unlikely(!skb)) + break; + + pr_debug("atm_tc_dequeue: sending on class %p\n", flow); + /* remove any LL header somebody else has attached */ + skb_pull(skb, skb_network_offset(skb)); + if (skb_headroom(skb) < flow->hdr_len) { + struct sk_buff *new; + + new = skb_realloc_headroom(skb, flow->hdr_len); + dev_kfree_skb(skb); + if (!new) + continue; + skb = new; + } + pr_debug("sch_atm_dequeue: ip %p, data %p\n", + skb_network_header(skb), skb->data); + ATM_SKB(skb)->vcc = flow->vcc; + memcpy(skb_push(skb, flow->hdr_len), flow->hdr, + flow->hdr_len); + atomic_add(skb->truesize, + &sk_atm(flow->vcc)->sk_wmem_alloc); + /* atm.atm_options are already set by atm_tc_enqueue */ + flow->vcc->send(flow->vcc, skb); + } + } +} + +static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch) +{ + struct atm_qdisc_data *p = qdisc_priv(sch); + struct sk_buff *skb; + + pr_debug("atm_tc_dequeue(sch %p,[qdisc %p])\n", sch, p); + tasklet_schedule(&p->task); + skb = qdisc_dequeue_peeked(p->link.q); + if (skb) + sch->q.qlen--; + return skb; +} + +static struct sk_buff *atm_tc_peek(struct Qdisc *sch) +{ + struct atm_qdisc_data *p = qdisc_priv(sch); + + pr_debug("atm_tc_peek(sch %p,[qdisc %p])\n", sch, p); + + return p->link.q->ops->peek(p->link.q); +} + +static unsigned int atm_tc_drop(struct Qdisc *sch) +{ + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow; + unsigned int len; + + pr_debug("atm_tc_drop(sch %p,[qdisc %p])\n", sch, p); + list_for_each_entry(flow, &p->flows, list) { + if (flow->q->ops->drop && (len = flow->q->ops->drop(flow->q))) + return len; + } + return 0; +} + +static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct atm_qdisc_data *p = qdisc_priv(sch); + + pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); + INIT_LIST_HEAD(&p->flows); + INIT_LIST_HEAD(&p->link.list); + list_add(&p->link.list, &p->flows); + p->link.q = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, sch->handle); + if (!p->link.q) + p->link.q = &noop_qdisc; + pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q); + p->link.filter_list = NULL; + p->link.vcc = NULL; + p->link.sock = NULL; + p->link.classid = sch->handle; + p->link.ref = 1; + tasklet_init(&p->task, sch_atm_dequeue, (unsigned long)sch); + return 0; +} + +static void atm_tc_reset(struct Qdisc *sch) +{ + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow; + + pr_debug("atm_tc_reset(sch %p,[qdisc %p])\n", sch, p); + list_for_each_entry(flow, &p->flows, list) + qdisc_reset(flow->q); + sch->q.qlen = 0; +} + +static void atm_tc_destroy(struct Qdisc *sch) +{ + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow, *tmp; + + pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p); + list_for_each_entry(flow, &p->flows, list) + tcf_destroy_chain(&flow->filter_list); + + list_for_each_entry_safe(flow, tmp, &p->flows, list) { + if (flow->ref > 1) + 
pr_err("atm_destroy: %p->ref = %d\n", flow, flow->ref); + atm_tc_put(sch, (unsigned long)flow); + } + tasklet_kill(&p->task); +} + +static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct atm_qdisc_data *p = qdisc_priv(sch); + struct atm_flow_data *flow = (struct atm_flow_data *)cl; + struct nlattr *nest; + + pr_debug("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n", + sch, p, flow, skb, tcm); + if (list_empty(&flow->list)) + return -EINVAL; + tcm->tcm_handle = flow->classid; + tcm->tcm_info = flow->q->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + NLA_PUT(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr); + if (flow->vcc) { + struct sockaddr_atmpvc pvc; + int state; + + pvc.sap_family = AF_ATMPVC; + pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1; + pvc.sap_addr.vpi = flow->vcc->vpi; + pvc.sap_addr.vci = flow->vcc->vci; + NLA_PUT(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc); + state = ATM_VF2VS(flow->vcc->flags); + NLA_PUT_U32(skb, TCA_ATM_STATE, state); + } + if (flow->excess) + NLA_PUT_U32(skb, TCA_ATM_EXCESS, flow->classid); + else + NLA_PUT_U32(skb, TCA_ATM_EXCESS, 0); + + nla_nest_end(skb, nest); + return skb->len; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} +static int +atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, + struct gnet_dump *d) +{ + struct atm_flow_data *flow = (struct atm_flow_data *)arg; + + flow->qstats.qlen = flow->q->q.qlen; + + if (gnet_stats_copy_basic(d, &flow->bstats) < 0 || + gnet_stats_copy_queue(d, &flow->qstats) < 0) + return -1; + + return 0; +} + +static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + return 0; +} + +static const struct Qdisc_class_ops atm_class_ops = { + .graft = atm_tc_graft, + .leaf = atm_tc_leaf, + .get = atm_tc_get, + .put = atm_tc_put, + .change = atm_tc_change, + .delete = atm_tc_delete, + .walk = atm_tc_walk, + .tcf_chain = atm_tc_find_tcf, + .bind_tcf = atm_tc_bind_filter, + .unbind_tcf = atm_tc_put, + .dump = atm_tc_dump_class, + .dump_stats = atm_tc_dump_class_stats, +}; + +static struct Qdisc_ops atm_qdisc_ops __read_mostly = { + .cl_ops = &atm_class_ops, + .id = "atm", + .priv_size = sizeof(struct atm_qdisc_data), + .enqueue = atm_tc_enqueue, + .dequeue = atm_tc_dequeue, + .peek = atm_tc_peek, + .drop = atm_tc_drop, + .init = atm_tc_init, + .reset = atm_tc_reset, + .destroy = atm_tc_destroy, + .dump = atm_tc_dump, + .owner = THIS_MODULE, +}; + +static int __init atm_init(void) +{ + return register_qdisc(&atm_qdisc_ops); +} + +static void __exit atm_exit(void) +{ + unregister_qdisc(&atm_qdisc_ops); +} + +module_init(atm_init) +module_exit(atm_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c new file mode 100644 index 00000000..094a874b --- /dev/null +++ b/net/sched/sch_blackhole.c @@ -0,0 +1,53 @@ +/* + * net/sched/sch_blackhole.c Black hole queue + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf + * + * Note: Quantum tunneling is not supported. 
+ */ + +#include +#include +#include +#include +#include + +static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + qdisc_drop(skb, sch); + return NET_XMIT_SUCCESS; +} + +static struct sk_buff *blackhole_dequeue(struct Qdisc *sch) +{ + return NULL; +} + +static struct Qdisc_ops blackhole_qdisc_ops __read_mostly = { + .id = "blackhole", + .priv_size = 0, + .enqueue = blackhole_enqueue, + .dequeue = blackhole_dequeue, + .peek = blackhole_dequeue, + .owner = THIS_MODULE, +}; + +static int __init blackhole_module_init(void) +{ + return register_qdisc(&blackhole_qdisc_ops); +} + +static void __exit blackhole_module_exit(void) +{ + unregister_qdisc(&blackhole_qdisc_ops); +} + +module_init(blackhole_module_init) +module_exit(blackhole_module_exit) + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c new file mode 100644 index 00000000..24d94c09 --- /dev/null +++ b/net/sched/sch_cbq.c @@ -0,0 +1,2075 @@ +/* + * net/sched/sch_cbq.c Class-Based Queueing discipline. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Class-Based Queueing (CBQ) algorithm. + ======================================= + + Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource + Management Models for Packet Networks", + IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995 + + [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995 + + [3] Sally Floyd, "Notes on Class-Based Queueing: Setting + Parameters", 1996 + + [4] Sally Floyd and Michael Speer, "Experimental Results + for Class-Based Queueing", 1998, not published. + + ----------------------------------------------------------------------- + + Algorithm skeleton was taken from NS simulator cbq.cc. + If someone wants to check this code against the LBL version, + he should take into account that ONLY the skeleton was borrowed, + the implementation is different. Particularly: + + --- The WRR algorithm is different. Our version looks more + reasonable (I hope) and works when quanta are allowed to be + less than MTU, which is always the case when real time classes + have small rates. Note, that the statement of [3] is + incomplete, delay may actually be estimated even if class + per-round allotment is less than MTU. Namely, if per-round + allotment is W*r_i, and r_1+...+r_k = r < 1 + + delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B + + In the worst case we have IntServ estimate with D = W*r+k*MTU + and C = MTU*r. The proof (if correct at all) is trivial. + + + --- It seems that cbq-2.0 is not very accurate. At least, I cannot + interpret some places, which look like wrong translations + from NS. Anyone is advised to find these differences + and explain to me, why I am wrong 8). + + --- Linux has no EOI event, so that we cannot estimate true class + idle time. Workaround is to consider the next dequeue event + as sign that previous packet is finished. This is wrong because of + internal device queueing, but on a permanently loaded link it is true. + Moreover, combined with clock integrator, this scheme looks + very close to an ideal solution. 
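A quick numeric illustration of the delay bound quoted in the CBQ notes above, delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B, reading [x] as a ceiling. All values below are made up purely to show the arithmetic; this is not derived from the patch.

#include <math.h>
#include <stdio.h>

int main(void)
{
	double MTU = 1500 * 8;  /* maximum packet size, in bits            */
	double B   = 10e6;      /* link bandwidth, bits per second         */
	double W   = 1000;      /* per-round allotment scale, in bits      */
	double r_i = 0.05;      /* this class's share of the link          */
	double r   = 0.8;       /* r_1 + ... + r_k, total allocated share  */
	double k   = 4;         /* number of classes                       */

	double delay = (ceil(MTU / (W * r_i)) * W * r + W * r + k * MTU) / B;

	printf("worst-case delay bound: %.2f ms\n", delay * 1e3); /* ~24.08 ms */
	return 0;
}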
*/ + +struct cbq_sched_data; + + +struct cbq_class { + struct Qdisc_class_common common; + struct cbq_class *next_alive; /* next class with backlog in this priority band */ + +/* Parameters */ + unsigned char priority; /* class priority */ + unsigned char priority2; /* priority to be used after overlimit */ + unsigned char ewma_log; /* time constant for idle time calculation */ + unsigned char ovl_strategy; +#ifdef CONFIG_NET_CLS_ACT + unsigned char police; +#endif + + u32 defmap; + + /* Link-sharing scheduler parameters */ + long maxidle; /* Class parameters: see below. */ + long offtime; + long minidle; + u32 avpkt; + struct qdisc_rate_table *R_tab; + + /* Overlimit strategy parameters */ + void (*overlimit)(struct cbq_class *cl); + psched_tdiff_t penalty; + + /* General scheduler (WRR) parameters */ + long allot; + long quantum; /* Allotment per WRR round */ + long weight; /* Relative allotment: see below */ + + struct Qdisc *qdisc; /* Ptr to CBQ discipline */ + struct cbq_class *split; /* Ptr to split node */ + struct cbq_class *share; /* Ptr to LS parent in the class tree */ + struct cbq_class *tparent; /* Ptr to tree parent in the class tree */ + struct cbq_class *borrow; /* NULL if class is bandwidth limited; + parent otherwise */ + struct cbq_class *sibling; /* Sibling chain */ + struct cbq_class *children; /* Pointer to children chain */ + + struct Qdisc *q; /* Elementary queueing discipline */ + + +/* Variables */ + unsigned char cpriority; /* Effective priority */ + unsigned char delayed; + unsigned char level; /* level of the class in hierarchy: + 0 for leaf classes, and maximal + level of children + 1 for nodes. + */ + + psched_time_t last; /* Last end of service */ + psched_time_t undertime; + long avgidle; + long deficit; /* Saved deficit for WRR */ + psched_time_t penalized; + struct gnet_stats_basic_packed bstats; + struct gnet_stats_queue qstats; + struct gnet_stats_rate_est rate_est; + struct tc_cbq_xstats xstats; + + struct tcf_proto *filter_list; + + int refcnt; + int filters; + + struct cbq_class *defaults[TC_PRIO_MAX + 1]; +}; + +struct cbq_sched_data { + struct Qdisc_class_hash clhash; /* Hash table of all classes */ + int nclasses[TC_CBQ_MAXPRIO + 1]; + unsigned int quanta[TC_CBQ_MAXPRIO + 1]; + + struct cbq_class link; + + unsigned int activemask; + struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes + with backlog */ + +#ifdef CONFIG_NET_CLS_ACT + struct cbq_class *rx_class; +#endif + struct cbq_class *tx_class; + struct cbq_class *tx_borrowed; + int tx_len; + psched_time_t now; /* Cached timestamp */ + psched_time_t now_rt; /* Cached real time */ + unsigned int pmask; + + struct hrtimer delay_timer; + struct qdisc_watchdog watchdog; /* Watchdog timer, + started when CBQ has + backlog, but cannot + transmit just now */ + psched_tdiff_t wd_expires; + int toplevel; + u32 hgenerator; +}; + + +#define L2T(cl, len) qdisc_l2t((cl)->R_tab, len) + +static inline struct cbq_class * +cbq_class_lookup(struct cbq_sched_data *q, u32 classid) +{ + struct Qdisc_class_common *clc; + + clc = qdisc_class_find(&q->clhash, classid); + if (clc == NULL) + return NULL; + return container_of(clc, struct cbq_class, common); +} + +#ifdef CONFIG_NET_CLS_ACT + +static struct cbq_class * +cbq_reclassify(struct sk_buff *skb, struct cbq_class *this) +{ + struct cbq_class *cl; + + for (cl = this->tparent; cl; cl = cl->tparent) { + struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT]; + + if (new != NULL && new != this) + return new; + } + return NULL; +} + +#endif + +/* 
Classify packet. The procedure is pretty complicated, but + * it allows us to combine link sharing and priority scheduling + * transparently. + * + * Namely, you can put link sharing rules (f.e. route based) at root of CBQ, + * so that it resolves to split nodes. Then packets are classified + * by logical priority, or a more specific classifier may be attached + * to the split node. + */ + +static struct cbq_class * +cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *head = &q->link; + struct cbq_class **defmap; + struct cbq_class *cl = NULL; + u32 prio = skb->priority; + struct tcf_result res; + + /* + * Step 1. If skb->priority points to one of our classes, use it. + */ + if (TC_H_MAJ(prio ^ sch->handle) == 0 && + (cl = cbq_class_lookup(q, prio)) != NULL) + return cl; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + for (;;) { + int result = 0; + defmap = head->defaults; + + /* + * Step 2+n. Apply classifier. + */ + if (!head->filter_list || + (result = tc_classify_compat(skb, head->filter_list, &res)) < 0) + goto fallback; + + cl = (void *)res.class; + if (!cl) { + if (TC_H_MAJ(res.classid)) + cl = cbq_class_lookup(q, res.classid); + else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL) + cl = defmap[TC_PRIO_BESTEFFORT]; + + if (cl == NULL || cl->level >= head->level) + goto fallback; + } + +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_QUEUED: + case TC_ACT_STOLEN: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return NULL; + case TC_ACT_RECLASSIFY: + return cbq_reclassify(skb, cl); + } +#endif + if (cl->level == 0) + return cl; + + /* + * Step 3+n. If classifier selected a link sharing class, + * apply agency specific classifier. + * Repeat this procdure until we hit a leaf node. + */ + head = cl; + } + +fallback: + cl = head; + + /* + * Step 4. No success... + */ + if (TC_H_MAJ(prio) == 0 && + !(cl = head->defaults[prio & TC_PRIO_MAX]) && + !(cl = head->defaults[TC_PRIO_BESTEFFORT])) + return head; + + return cl; +} + +/* + * A packet has just been enqueued on the empty class. + * cbq_activate_class adds it to the tail of active class list + * of its priority band. 
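The "active" list that cbq_activate_class (described just above and defined just below) maintains is a circular, singly linked ring per priority band: q->active[prio] points at the tail, and tail->next_alive is the head, i.e. the next class to be serviced. A standalone sketch of the tail insertion, using simplified stand-in types rather than the kernel's:

#include <stdio.h>

struct cls {
	int id;
	struct cls *next_alive;    /* ring of classes with backlog */
};

/* Insert cl at the tail of the ring; returns the new tail pointer. */
static struct cls *activate(struct cls *tail, struct cls *cl)
{
	if (tail) {
		cl->next_alive = tail->next_alive;  /* new tail points at the head */
		tail->next_alive = cl;
	} else {
		cl->next_alive = cl;                /* ring of one */
	}
	return cl;
}

int main(void)
{
	struct cls a = { 1, NULL }, b = { 2, NULL };
	struct cls *tail = NULL;

	tail = activate(tail, &a);
	tail = activate(tail, &b);
	printf("tail=%d head=%d\n", tail->id, tail->next_alive->id); /* tail=2 head=1 */
	return 0;
}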
+ */ + +static inline void cbq_activate_class(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + int prio = cl->cpriority; + struct cbq_class *cl_tail; + + cl_tail = q->active[prio]; + q->active[prio] = cl; + + if (cl_tail != NULL) { + cl->next_alive = cl_tail->next_alive; + cl_tail->next_alive = cl; + } else { + cl->next_alive = cl; + q->activemask |= (1<qdisc); + int prio = this->cpriority; + struct cbq_class *cl; + struct cbq_class *cl_prev = q->active[prio]; + + do { + cl = cl_prev->next_alive; + if (cl == this) { + cl_prev->next_alive = cl->next_alive; + cl->next_alive = NULL; + + if (cl == q->active[prio]) { + q->active[prio] = cl_prev; + if (cl == q->active[prio]) { + q->active[prio] = NULL; + q->activemask &= ~(1<active[prio]); +} + +static void +cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) +{ + int toplevel = q->toplevel; + + if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) { + psched_time_t now; + psched_tdiff_t incr; + + now = psched_get_time(); + incr = now - q->now_rt; + now = q->now + incr; + + do { + if (cl->undertime < now) { + q->toplevel = cl->level; + return; + } + } while ((cl = cl->borrow) != NULL && toplevel > cl->level); + } +} + +static int +cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + int uninitialized_var(ret); + struct cbq_class *cl = cbq_classify(skb, sch, &ret); + +#ifdef CONFIG_NET_CLS_ACT + q->rx_class = cl; +#endif + if (cl == NULL) { + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } + +#ifdef CONFIG_NET_CLS_ACT + cl->q->__parent = sch; +#endif + ret = qdisc_enqueue(skb, cl->q); + if (ret == NET_XMIT_SUCCESS) { + sch->q.qlen++; + cbq_mark_toplevel(q, cl); + if (!cl->next_alive) + cbq_activate_class(cl); + return ret; + } + + if (net_xmit_drop_count(ret)) { + sch->qstats.drops++; + cbq_mark_toplevel(q, cl); + cl->qstats.drops++; + } + return ret; +} + +/* Overlimit actions */ + +/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */ + +static void cbq_ovl_classic(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + psched_tdiff_t delay = cl->undertime - q->now; + + if (!cl->delayed) { + delay += cl->offtime; + + /* + * Class goes to sleep, so that it will have no + * chance to work avgidle. Let's forgive it 8) + * + * BTW cbq-2.0 has a crap in this + * place, apparently they forgot to shift it by cl->ewma_log. + */ + if (cl->avgidle < 0) + delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); + if (cl->avgidle < cl->minidle) + cl->avgidle = cl->minidle; + if (delay <= 0) + delay = 1; + cl->undertime = q->now + delay; + + cl->xstats.overactions++; + cl->delayed = 1; + } + if (q->wd_expires == 0 || q->wd_expires > delay) + q->wd_expires = delay; + + /* Dirty work! We must schedule wakeups based on + * real available rate, rather than leaf rate, + * which may be tiny (even zero). 
+ */ + if (q->toplevel == TC_CBQ_MAXLEVEL) { + struct cbq_class *b; + psched_tdiff_t base_delay = q->wd_expires; + + for (b = cl->borrow; b; b = b->borrow) { + delay = b->undertime - q->now; + if (delay < base_delay) { + if (delay <= 0) + delay = 1; + base_delay = delay; + } + } + + q->wd_expires = base_delay; + } +} + +/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when + * they go overlimit + */ + +static void cbq_ovl_rclassic(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + struct cbq_class *this = cl; + + do { + if (cl->level > q->toplevel) { + cl = NULL; + break; + } + } while ((cl = cl->borrow) != NULL); + + if (cl == NULL) + cl = this; + cbq_ovl_classic(cl); +} + +/* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */ + +static void cbq_ovl_delay(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + psched_tdiff_t delay = cl->undertime - q->now; + + if (test_bit(__QDISC_STATE_DEACTIVATED, + &qdisc_root_sleeping(cl->qdisc)->state)) + return; + + if (!cl->delayed) { + psched_time_t sched = q->now; + ktime_t expires; + + delay += cl->offtime; + if (cl->avgidle < 0) + delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); + if (cl->avgidle < cl->minidle) + cl->avgidle = cl->minidle; + cl->undertime = q->now + delay; + + if (delay > 0) { + sched += delay + cl->penalty; + cl->penalized = sched; + cl->cpriority = TC_CBQ_MAXPRIO; + q->pmask |= (1<delay_timer) && + ktime_to_ns(ktime_sub( + hrtimer_get_expires(&q->delay_timer), + expires)) > 0) + hrtimer_set_expires(&q->delay_timer, expires); + hrtimer_restart(&q->delay_timer); + cl->delayed = 1; + cl->xstats.overactions++; + return; + } + delay = 1; + } + if (q->wd_expires == 0 || q->wd_expires > delay) + q->wd_expires = delay; +} + +/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */ + +static void cbq_ovl_lowprio(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + + cl->penalized = q->now + cl->penalty; + + if (cl->cpriority != cl->priority2) { + cl->cpriority = cl->priority2; + q->pmask |= (1<cpriority); + cl->xstats.overactions++; + } + cbq_ovl_classic(cl); +} + +/* TC_CBQ_OVL_DROP: penalize class by dropping */ + +static void cbq_ovl_drop(struct cbq_class *cl) +{ + if (cl->q->ops->drop) + if (cl->q->ops->drop(cl->q)) + cl->qdisc->q.qlen--; + cl->xstats.overactions++; + cbq_ovl_classic(cl); +} + +static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio, + psched_time_t now) +{ + struct cbq_class *cl; + struct cbq_class *cl_prev = q->active[prio]; + psched_time_t sched = now; + + if (cl_prev == NULL) + return 0; + + do { + cl = cl_prev->next_alive; + if (now - cl->penalized > 0) { + cl_prev->next_alive = cl->next_alive; + cl->next_alive = NULL; + cl->cpriority = cl->priority; + cl->delayed = 0; + cbq_activate_class(cl); + + if (cl == q->active[prio]) { + q->active[prio] = cl_prev; + if (cl == q->active[prio]) { + q->active[prio] = NULL; + return 0; + } + } + + cl = cl_prev->next_alive; + } else if (sched - cl->penalized > 0) + sched = cl->penalized; + } while ((cl_prev = cl) != q->active[prio]); + + return sched - now; +} + +static enum hrtimer_restart cbq_undelay(struct hrtimer *timer) +{ + struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data, + delay_timer); + struct Qdisc *sch = q->watchdog.qdisc; + psched_time_t now; + psched_tdiff_t delay = 0; + unsigned int pmask; + + now = psched_get_time(); + + pmask = q->pmask; + q->pmask = 0; + + while (pmask) { + int prio = 
ffz(~pmask); + psched_tdiff_t tmp; + + pmask &= ~(1< 0) { + q->pmask |= 1<delay_timer, time, HRTIMER_MODE_ABS); + } + + qdisc_unthrottled(sch); + __netif_schedule(qdisc_root(sch)); + return HRTIMER_NORESTART; +} + +#ifdef CONFIG_NET_CLS_ACT +static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) +{ + struct Qdisc *sch = child->__parent; + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = q->rx_class; + + q->rx_class = NULL; + + if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) { + int ret; + + cbq_mark_toplevel(q, cl); + + q->rx_class = cl; + cl->q->__parent = sch; + + ret = qdisc_enqueue(skb, cl->q); + if (ret == NET_XMIT_SUCCESS) { + sch->q.qlen++; + if (!cl->next_alive) + cbq_activate_class(cl); + return 0; + } + if (net_xmit_drop_count(ret)) + sch->qstats.drops++; + return 0; + } + + sch->qstats.drops++; + return -1; +} +#endif + +/* + * It is mission critical procedure. + * + * We "regenerate" toplevel cutoff, if transmitting class + * has backlog and it is not regulated. It is not part of + * original CBQ description, but looks more reasonable. + * Probably, it is wrong. This question needs further investigation. + */ + +static inline void +cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl, + struct cbq_class *borrowed) +{ + if (cl && q->toplevel >= borrowed->level) { + if (cl->q->q.qlen > 1) { + do { + if (borrowed->undertime == PSCHED_PASTPERFECT) { + q->toplevel = borrowed->level; + return; + } + } while ((borrowed = borrowed->borrow) != NULL); + } +#if 0 + /* It is not necessary now. Uncommenting it + will save CPU cycles, but decrease fairness. + */ + q->toplevel = TC_CBQ_MAXLEVEL; +#endif + } +} + +static void +cbq_update(struct cbq_sched_data *q) +{ + struct cbq_class *this = q->tx_class; + struct cbq_class *cl = this; + int len = q->tx_len; + + q->tx_class = NULL; + + for ( ; cl; cl = cl->share) { + long avgidle = cl->avgidle; + long idle; + + cl->bstats.packets++; + cl->bstats.bytes += len; + + /* + * (now - last) is total time between packet right edges. + * (last_pktlen/rate) is "virtual" busy time, so that + * + * idle = (now - last) - last_pktlen/rate + */ + + idle = q->now - cl->last; + if ((unsigned long)idle > 128*1024*1024) { + avgidle = cl->maxidle; + } else { + idle -= L2T(cl, len); + + /* true_avgidle := (1-W)*true_avgidle + W*idle, + * where W=2^{-ewma_log}. But cl->avgidle is scaled: + * cl->avgidle == true_avgidle/W, + * hence: + */ + avgidle += idle - (avgidle>>cl->ewma_log); + } + + if (avgidle <= 0) { + /* Overlimit or at-limit */ + + if (avgidle < cl->minidle) + avgidle = cl->minidle; + + cl->avgidle = avgidle; + + /* Calculate expected time, when this class + * will be allowed to send. + * It will occur, when: + * (1-W)*true_avgidle + W*delay = 0, i.e. + * idle = (1/W - 1)*(-true_avgidle) + * or + * idle = (1 - W)*(-cl->avgidle); + */ + idle = (-avgidle) - ((-avgidle) >> cl->ewma_log); + + /* + * That is not all. + * To maintain the rate allocated to the class, + * we add to undertime virtual clock, + * necessary to complete transmitted packet. 
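The scaled average-idle bookkeeping explained in the cbq_update comments above boils down to avgidle += idle - (avgidle >> ewma_log), which tracks true_avgidle/W with W = 2^-ewma_log and needs no division. A tiny numeric illustration with made-up idle samples (not taken from the patch):

#include <stdio.h>

int main(void)
{
	long avgidle = 0;          /* scaled average idle time */
	int ewma_log = 5;          /* W = 1/32 */
	long samples[] = { 100, 100, -50, 100 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		long idle = samples[i];

		avgidle += idle - (avgidle >> ewma_log);
		printf("idle=%4ld  scaled avgidle=%5ld  (true avg ~ %ld)\n",
		       idle, avgidle, avgidle >> ewma_log);
	}
	return 0;
}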
+ * (len/phys_bandwidth has been already passed + * to the moment of cbq_update) + */ + + idle -= L2T(&q->link, len); + idle += L2T(cl, len); + + cl->undertime = q->now + idle; + } else { + /* Underlimit */ + + cl->undertime = PSCHED_PASTPERFECT; + if (avgidle > cl->maxidle) + cl->avgidle = cl->maxidle; + else + cl->avgidle = avgidle; + } + cl->last = q->now; + } + + cbq_update_toplevel(q, this, q->tx_borrowed); +} + +static inline struct cbq_class * +cbq_under_limit(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + struct cbq_class *this_cl = cl; + + if (cl->tparent == NULL) + return cl; + + if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) { + cl->delayed = 0; + return cl; + } + + do { + /* It is very suspicious place. Now overlimit + * action is generated for not bounded classes + * only if link is completely congested. + * Though it is in agree with ancestor-only paradigm, + * it looks very stupid. Particularly, + * it means that this chunk of code will either + * never be called or result in strong amplification + * of burstiness. Dangerous, silly, and, however, + * no another solution exists. + */ + cl = cl->borrow; + if (!cl) { + this_cl->qstats.overlimits++; + this_cl->overlimit(this_cl); + return NULL; + } + if (cl->level > q->toplevel) + return NULL; + } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime); + + cl->delayed = 0; + return cl; +} + +static inline struct sk_buff * +cbq_dequeue_prio(struct Qdisc *sch, int prio) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl_tail, *cl_prev, *cl; + struct sk_buff *skb; + int deficit; + + cl_tail = cl_prev = q->active[prio]; + cl = cl_prev->next_alive; + + do { + deficit = 0; + + /* Start round */ + do { + struct cbq_class *borrow = cl; + + if (cl->q->q.qlen && + (borrow = cbq_under_limit(cl)) == NULL) + goto skip_class; + + if (cl->deficit <= 0) { + /* Class exhausted its allotment per + * this round. Switch to the next one. + */ + deficit = 1; + cl->deficit += cl->quantum; + goto next_class; + } + + skb = cl->q->dequeue(cl->q); + + /* Class did not give us any skb :-( + * It could occur even if cl->q->q.qlen != 0 + * f.e. if cl->q == "tbf" + */ + if (skb == NULL) + goto skip_class; + + cl->deficit -= qdisc_pkt_len(skb); + q->tx_class = cl; + q->tx_borrowed = borrow; + if (borrow != cl) { +#ifndef CBQ_XSTATS_BORROWS_BYTES + borrow->xstats.borrows++; + cl->xstats.borrows++; +#else + borrow->xstats.borrows += qdisc_pkt_len(skb); + cl->xstats.borrows += qdisc_pkt_len(skb); +#endif + } + q->tx_len = qdisc_pkt_len(skb); + + if (cl->deficit <= 0) { + q->active[prio] = cl; + cl = cl->next_alive; + cl->deficit += cl->quantum; + } + return skb; + +skip_class: + if (cl->q->q.qlen == 0 || prio != cl->cpriority) { + /* Class is empty or penalized. + * Unlink it from active chain. + */ + cl_prev->next_alive = cl->next_alive; + cl->next_alive = NULL; + + /* Did cl_tail point to it? */ + if (cl == cl_tail) { + /* Repair it! */ + cl_tail = cl_prev; + + /* Was it the last class in this band? */ + if (cl == cl_tail) { + /* Kill the band! 
*/ + q->active[prio] = NULL; + q->activemask &= ~(1<q->q.qlen) + cbq_activate_class(cl); + return NULL; + } + + q->active[prio] = cl_tail; + } + if (cl->q->q.qlen) + cbq_activate_class(cl); + + cl = cl_prev; + } + +next_class: + cl_prev = cl; + cl = cl->next_alive; + } while (cl_prev != cl_tail); + } while (deficit); + + q->active[prio] = cl_prev; + + return NULL; +} + +static inline struct sk_buff * +cbq_dequeue_1(struct Qdisc *sch) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + unsigned int activemask; + + activemask = q->activemask & 0xFF; + while (activemask) { + int prio = ffz(~activemask); + activemask &= ~(1<now_rt; + + if (q->tx_class) { + psched_tdiff_t incr2; + /* Time integrator. We calculate EOS time + * by adding expected packet transmission time. + * If real time is greater, we warp artificial clock, + * so that: + * + * cbq_time = max(real_time, work); + */ + incr2 = L2T(&q->link, q->tx_len); + q->now += incr2; + cbq_update(q); + if ((incr -= incr2) < 0) + incr = 0; + } + q->now += incr; + q->now_rt = now; + + for (;;) { + q->wd_expires = 0; + + skb = cbq_dequeue_1(sch); + if (skb) { + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + qdisc_unthrottled(sch); + return skb; + } + + /* All the classes are overlimit. + * + * It is possible, if: + * + * 1. Scheduler is empty. + * 2. Toplevel cutoff inhibited borrowing. + * 3. Root class is overlimit. + * + * Reset 2d and 3d conditions and retry. + * + * Note, that NS and cbq-2.0 are buggy, peeking + * an arbitrary class is appropriate for ancestor-only + * sharing, but not for toplevel algorithm. + * + * Our version is better, but slower, because it requires + * two passes, but it is unavoidable with top-level sharing. + */ + + if (q->toplevel == TC_CBQ_MAXLEVEL && + q->link.undertime == PSCHED_PASTPERFECT) + break; + + q->toplevel = TC_CBQ_MAXLEVEL; + q->link.undertime = PSCHED_PASTPERFECT; + } + + /* No packets in scheduler or nobody wants to give them to us :-( + * Sigh... start watchdog timer in the last case. + */ + + if (sch->q.qlen) { + sch->qstats.overlimits++; + if (q->wd_expires) + qdisc_watchdog_schedule(&q->watchdog, + now + q->wd_expires); + } + return NULL; +} + +/* CBQ class maintanance routines */ + +static void cbq_adjust_levels(struct cbq_class *this) +{ + if (this == NULL) + return; + + do { + int level = 0; + struct cbq_class *cl; + + cl = this->children; + if (cl) { + do { + if (cl->level > level) + level = cl->level; + } while ((cl = cl->sibling) != this->children); + } + this->level = level + 1; + } while ((this = this->tparent) != NULL); +} + +static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) +{ + struct cbq_class *cl; + struct hlist_node *n; + unsigned int h; + + if (q->quanta[prio] == 0) + return; + + for (h = 0; h < q->clhash.hashsize; h++) { + hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { + /* BUGGGG... Beware! This expression suffer of + * arithmetic overflows! 
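For the weighted-round-robin normalisation performed just below (quantum = weight * allot * nclasses / sum-of-weights-in-band), a made-up worked example showing that each band's total allotment is split in proportion to class weight. It also hints at why the comment above warns about overflow: the full product weight*allot*nclasses is formed before the division.

#include <stdio.h>

int main(void)
{
	/* Two classes in one priority band, made-up parameters. */
	long allot = 1514;                     /* bytes per class per round  */
	long weights[] = { 1, 3 };
	long nclasses = 2;
	long quanta = weights[0] + weights[1]; /* sum of weights in the band */
	int i;

	for (i = 0; i < 2; i++) {
		long quantum = weights[i] * allot * nclasses / quanta;

		printf("class %d: quantum = %ld bytes\n", i, quantum);
	}
	/* Prints 757 and 2271: roughly 2 * allot bytes per round,
	 * shared 1:3 between the two classes. */
	return 0;
}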
+ */ + if (cl->priority == prio) { + cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ + q->quanta[prio]; + } + if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) { + pr_warning("CBQ: class %08x has bad quantum==%ld, repaired.\n", + cl->common.classid, cl->quantum); + cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1; + } + } + } +} + +static void cbq_sync_defmap(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + struct cbq_class *split = cl->split; + unsigned int h; + int i; + + if (split == NULL) + return; + + for (i = 0; i <= TC_PRIO_MAX; i++) { + if (split->defaults[i] == cl && !(cl->defmap & (1<defaults[i] = NULL; + } + + for (i = 0; i <= TC_PRIO_MAX; i++) { + int level = split->level; + + if (split->defaults[i]) + continue; + + for (h = 0; h < q->clhash.hashsize; h++) { + struct hlist_node *n; + struct cbq_class *c; + + hlist_for_each_entry(c, n, &q->clhash.hash[h], + common.hnode) { + if (c->split == split && c->level < level && + c->defmap & (1<defaults[i] = c; + level = c->level; + } + } + } + } +} + +static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask) +{ + struct cbq_class *split = NULL; + + if (splitid == 0) { + split = cl->split; + if (!split) + return; + splitid = split->common.classid; + } + + if (split == NULL || split->common.classid != splitid) { + for (split = cl->tparent; split; split = split->tparent) + if (split->common.classid == splitid) + break; + } + + if (split == NULL) + return; + + if (cl->split != split) { + cl->defmap = 0; + cbq_sync_defmap(cl); + cl->split = split; + cl->defmap = def & mask; + } else + cl->defmap = (cl->defmap & ~mask) | (def & mask); + + cbq_sync_defmap(cl); +} + +static void cbq_unlink_class(struct cbq_class *this) +{ + struct cbq_class *cl, **clp; + struct cbq_sched_data *q = qdisc_priv(this->qdisc); + + qdisc_class_hash_remove(&q->clhash, &this->common); + + if (this->tparent) { + clp = &this->sibling; + cl = *clp; + do { + if (cl == this) { + *clp = cl->sibling; + break; + } + clp = &cl->sibling; + } while ((cl = *clp) != this->sibling); + + if (this->tparent->children == this) { + this->tparent->children = this->sibling; + if (this->sibling == this) + this->tparent->children = NULL; + } + } else { + WARN_ON(this->sibling != this); + } +} + +static void cbq_link_class(struct cbq_class *this) +{ + struct cbq_sched_data *q = qdisc_priv(this->qdisc); + struct cbq_class *parent = this->tparent; + + this->sibling = this; + qdisc_class_hash_insert(&q->clhash, &this->common); + + if (parent == NULL) + return; + + if (parent->children == NULL) { + parent->children = this; + } else { + this->sibling = parent->children->sibling; + parent->children->sibling = this; + } +} + +static unsigned int cbq_drop(struct Qdisc *sch) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl, *cl_head; + int prio; + unsigned int len; + + for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) { + cl_head = q->active[prio]; + if (!cl_head) + continue; + + cl = cl_head; + do { + if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) { + sch->q.qlen--; + if (!cl->q->q.qlen) + cbq_deactivate_class(cl); + return len; + } + } while ((cl = cl->next_alive) != cl_head); + } + return 0; +} + +static void +cbq_reset(struct Qdisc *sch) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl; + struct hlist_node *n; + int prio; + unsigned int h; + + q->activemask = 0; + q->pmask = 0; + q->tx_class = NULL; + q->tx_borrowed = NULL; + qdisc_watchdog_cancel(&q->watchdog); + 
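	/* Below, the per-class delay timer is cancelled as well, the
	 * artificial clock is resynchronized with real time, and every
	 * class is returned to an idle state: no undertime, a full
	 * avgidle and a fresh deficit.
	 */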
hrtimer_cancel(&q->delay_timer); + q->toplevel = TC_CBQ_MAXLEVEL; + q->now = psched_get_time(); + q->now_rt = q->now; + + for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++) + q->active[prio] = NULL; + + for (h = 0; h < q->clhash.hashsize; h++) { + hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { + qdisc_reset(cl->q); + + cl->next_alive = NULL; + cl->undertime = PSCHED_PASTPERFECT; + cl->avgidle = cl->maxidle; + cl->deficit = cl->quantum; + cl->cpriority = cl->priority; + } + } + sch->q.qlen = 0; +} + + +static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss) +{ + if (lss->change & TCF_CBQ_LSS_FLAGS) { + cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; + cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent; + } + if (lss->change & TCF_CBQ_LSS_EWMA) + cl->ewma_log = lss->ewma_log; + if (lss->change & TCF_CBQ_LSS_AVPKT) + cl->avpkt = lss->avpkt; + if (lss->change & TCF_CBQ_LSS_MINIDLE) + cl->minidle = -(long)lss->minidle; + if (lss->change & TCF_CBQ_LSS_MAXIDLE) { + cl->maxidle = lss->maxidle; + cl->avgidle = lss->maxidle; + } + if (lss->change & TCF_CBQ_LSS_OFFTIME) + cl->offtime = lss->offtime; + return 0; +} + +static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl) +{ + q->nclasses[cl->priority]--; + q->quanta[cl->priority] -= cl->weight; + cbq_normalize_quanta(q, cl->priority); +} + +static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl) +{ + q->nclasses[cl->priority]++; + q->quanta[cl->priority] += cl->weight; + cbq_normalize_quanta(q, cl->priority); +} + +static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + + if (wrr->allot) + cl->allot = wrr->allot; + if (wrr->weight) + cl->weight = wrr->weight; + if (wrr->priority) { + cl->priority = wrr->priority - 1; + cl->cpriority = cl->priority; + if (cl->priority >= cl->priority2) + cl->priority2 = TC_CBQ_MAXPRIO - 1; + } + + cbq_addprio(q, cl); + return 0; +} + +static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl) +{ + switch (ovl->strategy) { + case TC_CBQ_OVL_CLASSIC: + cl->overlimit = cbq_ovl_classic; + break; + case TC_CBQ_OVL_DELAY: + cl->overlimit = cbq_ovl_delay; + break; + case TC_CBQ_OVL_LOWPRIO: + if (ovl->priority2 - 1 >= TC_CBQ_MAXPRIO || + ovl->priority2 - 1 <= cl->priority) + return -EINVAL; + cl->priority2 = ovl->priority2 - 1; + cl->overlimit = cbq_ovl_lowprio; + break; + case TC_CBQ_OVL_DROP: + cl->overlimit = cbq_ovl_drop; + break; + case TC_CBQ_OVL_RCLASSIC: + cl->overlimit = cbq_ovl_rclassic; + break; + default: + return -EINVAL; + } + cl->penalty = ovl->penalty; + return 0; +} + +#ifdef CONFIG_NET_CLS_ACT +static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p) +{ + cl->police = p->police; + + if (cl->q->handle) { + if (p->police == TC_POLICE_RECLASSIFY) + cl->q->reshape_fail = cbq_reshape_fail; + else + cl->q->reshape_fail = NULL; + } + return 0; +} +#endif + +static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt) +{ + cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange); + return 0; +} + +static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = { + [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) }, + [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) }, + [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) }, + [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) }, + [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) }, + [TCA_CBQ_RTAB] = { .type = 
NLA_BINARY, .len = TC_RTAB_SIZE }, + [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) }, +}; + +static int cbq_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_CBQ_MAX + 1]; + struct tc_ratespec *r; + int err; + + err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy); + if (err < 0) + return err; + + if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL) + return -EINVAL; + + r = nla_data(tb[TCA_CBQ_RATE]); + + if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL) + return -EINVAL; + + err = qdisc_class_hash_init(&q->clhash); + if (err < 0) + goto put_rtab; + + q->link.refcnt = 1; + q->link.sibling = &q->link; + q->link.common.classid = sch->handle; + q->link.qdisc = sch; + q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + sch->handle); + if (!q->link.q) + q->link.q = &noop_qdisc; + + q->link.priority = TC_CBQ_MAXPRIO - 1; + q->link.priority2 = TC_CBQ_MAXPRIO - 1; + q->link.cpriority = TC_CBQ_MAXPRIO - 1; + q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC; + q->link.overlimit = cbq_ovl_classic; + q->link.allot = psched_mtu(qdisc_dev(sch)); + q->link.quantum = q->link.allot; + q->link.weight = q->link.R_tab->rate.rate; + + q->link.ewma_log = TC_CBQ_DEF_EWMA; + q->link.avpkt = q->link.allot/2; + q->link.minidle = -0x7FFFFFFF; + + qdisc_watchdog_init(&q->watchdog, sch); + hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + q->delay_timer.function = cbq_undelay; + q->toplevel = TC_CBQ_MAXLEVEL; + q->now = psched_get_time(); + q->now_rt = q->now; + + cbq_link_class(&q->link); + + if (tb[TCA_CBQ_LSSOPT]) + cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT])); + + cbq_addprio(q, &q->link); + return 0; + +put_rtab: + qdisc_put_rtab(q->link.R_tab); + return err; +} + +static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb_tail_pointer(skb); + + NLA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate); + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tc_cbq_lssopt opt; + + opt.flags = 0; + if (cl->borrow == NULL) + opt.flags |= TCF_CBQ_LSS_BOUNDED; + if (cl->share == NULL) + opt.flags |= TCF_CBQ_LSS_ISOLATED; + opt.ewma_log = cl->ewma_log; + opt.level = cl->level; + opt.avpkt = cl->avpkt; + opt.maxidle = cl->maxidle; + opt.minidle = (u32)(-cl->minidle); + opt.offtime = cl->offtime; + opt.change = ~0; + NLA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt); + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tc_cbq_wrropt opt; + + opt.flags = 0; + opt.allot = cl->allot; + opt.priority = cl->priority + 1; + opt.cpriority = cl->cpriority + 1; + opt.weight = cl->weight; + NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt); + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tc_cbq_ovl opt; + + opt.strategy = cl->ovl_strategy; + opt.priority2 = cl->priority2 + 1; + opt.pad = 0; + opt.penalty = cl->penalty; + NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static int cbq_dump_fopt(struct sk_buff *skb, struct 
cbq_class *cl) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tc_cbq_fopt opt; + + if (cl->split || cl->defmap) { + opt.split = cl->split ? cl->split->common.classid : 0; + opt.defmap = cl->defmap; + opt.defchange = ~0; + NLA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt); + } + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +#ifdef CONFIG_NET_CLS_ACT +static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tc_cbq_police opt; + + if (cl->police) { + opt.police = cl->police; + opt.__res1 = 0; + opt.__res2 = 0; + NLA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); + } + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} +#endif + +static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) +{ + if (cbq_dump_lss(skb, cl) < 0 || + cbq_dump_rate(skb, cl) < 0 || + cbq_dump_wrr(skb, cl) < 0 || + cbq_dump_ovl(skb, cl) < 0 || +#ifdef CONFIG_NET_CLS_ACT + cbq_dump_police(skb, cl) < 0 || +#endif + cbq_dump_fopt(skb, cl) < 0) + return -1; + return 0; +} + +static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct nlattr *nest; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + if (cbq_dump_attr(skb, &q->link) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest); + return skb->len; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static int +cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + + q->link.xstats.avgidle = q->link.avgidle; + return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats)); +} + +static int +cbq_dump_class(struct Qdisc *sch, unsigned long arg, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct cbq_class *cl = (struct cbq_class *)arg; + struct nlattr *nest; + + if (cl->tparent) + tcm->tcm_parent = cl->tparent->common.classid; + else + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle = cl->common.classid; + tcm->tcm_info = cl->q->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + if (cbq_dump_attr(skb, cl) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest); + return skb->len; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static int +cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, + struct gnet_dump *d) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = (struct cbq_class *)arg; + + cl->qstats.qlen = cl->q->q.qlen; + cl->xstats.avgidle = cl->avgidle; + cl->xstats.undertime = 0; + + if (cl->undertime != PSCHED_PASTPERFECT) + cl->xstats.undertime = cl->undertime - q->now; + + if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || + gnet_stats_copy_queue(d, &cl->qstats) < 0) + return -1; + + return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); +} + +static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct cbq_class *cl = (struct cbq_class *)arg; + + if (new == NULL) { + new = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, cl->common.classid); + if (new == NULL) + return -ENOBUFS; + } else { +#ifdef CONFIG_NET_CLS_ACT + if (cl->police == TC_POLICE_RECLASSIFY) + new->reshape_fail = cbq_reshape_fail; +#endif + } + sch_tree_lock(sch); + *old = cl->q; + cl->q = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + 
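	/* Only detach the old child here; ownership of *old passes to
	 * the caller, which destroys it after the qdisc tree lock has
	 * been released.
	 */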
sch_tree_unlock(sch); + + return 0; +} + +static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_class *cl = (struct cbq_class *)arg; + + return cl->q; +} + +static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_class *cl = (struct cbq_class *)arg; + + if (cl->q->q.qlen == 0) + cbq_deactivate_class(cl); +} + +static unsigned long cbq_get(struct Qdisc *sch, u32 classid) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = cbq_class_lookup(q, classid); + + if (cl) { + cl->refcnt++; + return (unsigned long)cl; + } + return 0; +} + +static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + + WARN_ON(cl->filters); + + tcf_destroy_chain(&cl->filter_list); + qdisc_destroy(cl->q); + qdisc_put_rtab(cl->R_tab); + gen_kill_estimator(&cl->bstats, &cl->rate_est); + if (cl != &q->link) + kfree(cl); +} + +static void cbq_destroy(struct Qdisc *sch) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct hlist_node *n, *next; + struct cbq_class *cl; + unsigned int h; + +#ifdef CONFIG_NET_CLS_ACT + q->rx_class = NULL; +#endif + /* + * Filters must be destroyed first because we don't destroy the + * classes from root to leafs which means that filters can still + * be bound to classes which have been destroyed already. --TGR '04 + */ + for (h = 0; h < q->clhash.hashsize; h++) { + hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) + tcf_destroy_chain(&cl->filter_list); + } + for (h = 0; h < q->clhash.hashsize; h++) { + hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[h], + common.hnode) + cbq_destroy_class(sch, cl); + } + qdisc_class_hash_destroy(&q->clhash); +} + +static void cbq_put(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_class *cl = (struct cbq_class *)arg; + + if (--cl->refcnt == 0) { +#ifdef CONFIG_NET_CLS_ACT + spinlock_t *root_lock = qdisc_root_sleeping_lock(sch); + struct cbq_sched_data *q = qdisc_priv(sch); + + spin_lock_bh(root_lock); + if (q->rx_class == cl) + q->rx_class = NULL; + spin_unlock_bh(root_lock); +#endif + + cbq_destroy_class(sch, cl); + } +} + +static int +cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, + unsigned long *arg) +{ + int err; + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = (struct cbq_class *)*arg; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_CBQ_MAX + 1]; + struct cbq_class *parent; + struct qdisc_rate_table *rtab = NULL; + + if (opt == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy); + if (err < 0) + return err; + + if (cl) { + /* Check parent */ + if (parentid) { + if (cl->tparent && + cl->tparent->common.classid != parentid) + return -EINVAL; + if (!cl->tparent && parentid != TC_H_ROOT) + return -EINVAL; + } + + if (tb[TCA_CBQ_RATE]) { + rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), + tb[TCA_CBQ_RTAB]); + if (rtab == NULL) + return -EINVAL; + } + + if (tca[TCA_RATE]) { + err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) { + if (rtab) + qdisc_put_rtab(rtab); + return err; + } + } + + /* Change class parameters */ + sch_tree_lock(sch); + + if (cl->next_alive != NULL) + cbq_deactivate_class(cl); + + if (rtab) { + qdisc_put_rtab(cl->R_tab); + cl->R_tab = rtab; + } + + if (tb[TCA_CBQ_LSSOPT]) + cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT])); + + if (tb[TCA_CBQ_WRROPT]) { + cbq_rmprio(q, cl); + cbq_set_wrr(cl, 
nla_data(tb[TCA_CBQ_WRROPT])); + } + + if (tb[TCA_CBQ_OVL_STRATEGY]) + cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY])); + +#ifdef CONFIG_NET_CLS_ACT + if (tb[TCA_CBQ_POLICE]) + cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE])); +#endif + + if (tb[TCA_CBQ_FOPT]) + cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); + + if (cl->q->q.qlen) + cbq_activate_class(cl); + + sch_tree_unlock(sch); + + return 0; + } + + if (parentid == TC_H_ROOT) + return -EINVAL; + + if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL || + tb[TCA_CBQ_LSSOPT] == NULL) + return -EINVAL; + + rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]); + if (rtab == NULL) + return -EINVAL; + + if (classid) { + err = -EINVAL; + if (TC_H_MAJ(classid ^ sch->handle) || + cbq_class_lookup(q, classid)) + goto failure; + } else { + int i; + classid = TC_H_MAKE(sch->handle, 0x8000); + + for (i = 0; i < 0x8000; i++) { + if (++q->hgenerator >= 0x8000) + q->hgenerator = 1; + if (cbq_class_lookup(q, classid|q->hgenerator) == NULL) + break; + } + err = -ENOSR; + if (i >= 0x8000) + goto failure; + classid = classid|q->hgenerator; + } + + parent = &q->link; + if (parentid) { + parent = cbq_class_lookup(q, parentid); + err = -EINVAL; + if (parent == NULL) + goto failure; + } + + err = -ENOBUFS; + cl = kzalloc(sizeof(*cl), GFP_KERNEL); + if (cl == NULL) + goto failure; + + if (tca[TCA_RATE]) { + err = gen_new_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) { + kfree(cl); + goto failure; + } + } + + cl->R_tab = rtab; + rtab = NULL; + cl->refcnt = 1; + cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); + if (!cl->q) + cl->q = &noop_qdisc; + cl->common.classid = classid; + cl->tparent = parent; + cl->qdisc = sch; + cl->allot = parent->allot; + cl->quantum = cl->allot; + cl->weight = cl->R_tab->rate.rate; + + sch_tree_lock(sch); + cbq_link_class(cl); + cl->borrow = cl->tparent; + if (cl->tparent != &q->link) + cl->share = cl->tparent; + cbq_adjust_levels(parent); + cl->minidle = -0x7FFFFFFF; + cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT])); + cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); + if (cl->ewma_log == 0) + cl->ewma_log = q->link.ewma_log; + if (cl->maxidle == 0) + cl->maxidle = q->link.maxidle; + if (cl->avpkt == 0) + cl->avpkt = q->link.avpkt; + cl->overlimit = cbq_ovl_classic; + if (tb[TCA_CBQ_OVL_STRATEGY]) + cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY])); +#ifdef CONFIG_NET_CLS_ACT + if (tb[TCA_CBQ_POLICE]) + cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE])); +#endif + if (tb[TCA_CBQ_FOPT]) + cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); + sch_tree_unlock(sch); + + qdisc_class_hash_grow(sch, &q->clhash); + + *arg = (unsigned long)cl; + return 0; + +failure: + qdisc_put_rtab(rtab); + return err; +} + +static int cbq_delete(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = (struct cbq_class *)arg; + unsigned int qlen; + + if (cl->filters || cl->children || cl == &q->link) + return -EBUSY; + + sch_tree_lock(sch); + + qlen = cl->q->q.qlen; + qdisc_reset(cl->q); + qdisc_tree_decrease_qlen(cl->q, qlen); + + if (cl->next_alive) + cbq_deactivate_class(cl); + + if (q->tx_borrowed == cl) + q->tx_borrowed = q->tx_class; + if (q->tx_class == cl) { + q->tx_class = NULL; + q->tx_borrowed = NULL; + } +#ifdef CONFIG_NET_CLS_ACT + if (q->rx_class == cl) + q->rx_class = NULL; +#endif + + cbq_unlink_class(cl); + cbq_adjust_levels(cl->tparent); + cl->defmap = 0; + cbq_sync_defmap(cl); + + 
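	/* Last step under the tree lock: drop the class from the
	 * per-priority weight/quantum accounting. The reference taken
	 * by cops->get() is released by the caller through cops->put(),
	 * which does the actual destruction.
	 */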
cbq_rmprio(q, cl); + sch_tree_unlock(sch); + + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). + */ + + return 0; +} + +static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = (struct cbq_class *)arg; + + if (cl == NULL) + cl = &q->link; + + return &cl->filter_list; +} + +static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *p = (struct cbq_class *)parent; + struct cbq_class *cl = cbq_class_lookup(q, classid); + + if (cl) { + if (p && p->level <= cl->level) + return 0; + cl->filters++; + return (unsigned long)cl; + } + return 0; +} + +static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_class *cl = (struct cbq_class *)arg; + + cl->filters--; +} + +static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl; + struct hlist_node *n; + unsigned int h; + + if (arg->stop) + return; + + for (h = 0; h < q->clhash.hashsize; h++) { + hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, (unsigned long)cl, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } +} + +static const struct Qdisc_class_ops cbq_class_ops = { + .graft = cbq_graft, + .leaf = cbq_leaf, + .qlen_notify = cbq_qlen_notify, + .get = cbq_get, + .put = cbq_put, + .change = cbq_change_class, + .delete = cbq_delete, + .walk = cbq_walk, + .tcf_chain = cbq_find_tcf, + .bind_tcf = cbq_bind_filter, + .unbind_tcf = cbq_unbind_filter, + .dump = cbq_dump_class, + .dump_stats = cbq_dump_class_stats, +}; + +static struct Qdisc_ops cbq_qdisc_ops __read_mostly = { + .next = NULL, + .cl_ops = &cbq_class_ops, + .id = "cbq", + .priv_size = sizeof(struct cbq_sched_data), + .enqueue = cbq_enqueue, + .dequeue = cbq_dequeue, + .peek = qdisc_peek_dequeued, + .drop = cbq_drop, + .init = cbq_init, + .reset = cbq_reset, + .destroy = cbq_destroy, + .change = NULL, + .dump = cbq_dump, + .dump_stats = cbq_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init cbq_module_init(void) +{ + return register_qdisc(&cbq_qdisc_ops); +} +static void __exit cbq_module_exit(void) +{ + unregister_qdisc(&cbq_qdisc_ops); +} +module_init(cbq_module_init) +module_exit(cbq_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c new file mode 100644 index 00000000..178ee831 --- /dev/null +++ b/net/sched/sch_choke.c @@ -0,0 +1,687 @@ +/* + * net/sched/sch_choke.c CHOKE scheduler + * + * Copyright (c) 2011 Stephen Hemminger + * Copyright (c) 2011 Eric Dumazet + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + CHOKe stateless AQM for fair bandwidth allocation + ================================================= + + CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for + unresponsive flows) is a variant of RED that penalizes misbehaving flows but + maintains no flow state. 
The difference from RED is an additional step + during the enqueuing process. If average queue size is over the + low threshold (qmin), a packet is chosen at random from the queue. + If both the new and chosen packet are from the same flow, both + are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it + needs to access packets in queue randomly. It has a minimal class + interface to allow overriding the builtin flow classifier with + filters. + + Source: + R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless + Active Queue Management Scheme for Approximating Fair Bandwidth Allocation", + IEEE INFOCOM, 2000. + + A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial + Characteristics", IEEE/ACM Transactions on Networking, 2004 + + */ + +/* Upper bound on size of sk_buff table (packets) */ +#define CHOKE_MAX_QUEUE (128*1024 - 1) + +struct choke_sched_data { +/* Parameters */ + u32 limit; + unsigned char flags; + + struct red_parms parms; + +/* Variables */ + struct tcf_proto *filter_list; + struct { + u32 prob_drop; /* Early probability drops */ + u32 prob_mark; /* Early probability marks */ + u32 forced_drop; /* Forced drops, qavg > max_thresh */ + u32 forced_mark; /* Forced marks, qavg > max_thresh */ + u32 pdrop; /* Drops due to queue limits */ + u32 other; /* Drops due to drop() calls */ + u32 matched; /* Drops to flow match */ + } stats; + + unsigned int head; + unsigned int tail; + + unsigned int tab_mask; /* size - 1 */ + + struct sk_buff **tab; +}; + +/* deliver a random number between 0 and N - 1 */ +static u32 random_N(unsigned int N) +{ + return reciprocal_divide(random32(), N); +} + +/* number of elements in queue including holes */ +static unsigned int choke_len(const struct choke_sched_data *q) +{ + return (q->tail - q->head) & q->tab_mask; +} + +/* Is ECN parameter configured */ +static int use_ecn(const struct choke_sched_data *q) +{ + return q->flags & TC_RED_ECN; +} + +/* Should packets over max just be dropped (versus marked) */ +static int use_harddrop(const struct choke_sched_data *q) +{ + return q->flags & TC_RED_HARDDROP; +} + +/* Move head pointer forward to skip over holes */ +static void choke_zap_head_holes(struct choke_sched_data *q) +{ + do { + q->head = (q->head + 1) & q->tab_mask; + if (q->head == q->tail) + break; + } while (q->tab[q->head] == NULL); +} + +/* Move tail pointer backwards to reuse holes */ +static void choke_zap_tail_holes(struct choke_sched_data *q) +{ + do { + q->tail = (q->tail - 1) & q->tab_mask; + if (q->head == q->tail) + break; + } while (q->tab[q->tail] == NULL); +} + +/* Drop packet from queue array by creating a "hole" */ +static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = q->tab[idx]; + + q->tab[idx] = NULL; + + if (idx == q->head) + choke_zap_head_holes(q); + if (idx == q->tail) + choke_zap_tail_holes(q); + + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_drop(skb, sch); + qdisc_tree_decrease_qlen(sch, 1); + --sch->q.qlen; +} + +/* + * Compare flow of two packets + * Returns true only if source and destination address and port match. 
+ * false for special cases + */ +static bool choke_match_flow(struct sk_buff *skb1, + struct sk_buff *skb2) +{ + int off1, off2, poff; + const u32 *ports1, *ports2; + u8 ip_proto; + __u32 hash1; + + if (skb1->protocol != skb2->protocol) + return false; + + /* Use hash value as quick check + * Assumes that __skb_get_rxhash makes IP header and ports linear + */ + hash1 = skb_get_rxhash(skb1); + if (!hash1 || hash1 != skb_get_rxhash(skb2)) + return false; + + /* Probably match, but be sure to avoid hash collisions */ + off1 = skb_network_offset(skb1); + off2 = skb_network_offset(skb2); + + switch (skb1->protocol) { + case __constant_htons(ETH_P_IP): { + const struct iphdr *ip1, *ip2; + + ip1 = (const struct iphdr *) (skb1->data + off1); + ip2 = (const struct iphdr *) (skb2->data + off2); + + ip_proto = ip1->protocol; + if (ip_proto != ip2->protocol || + ip1->saddr != ip2->saddr || ip1->daddr != ip2->daddr) + return false; + + if ((ip1->frag_off | ip2->frag_off) & htons(IP_MF | IP_OFFSET)) + ip_proto = 0; + off1 += ip1->ihl * 4; + off2 += ip2->ihl * 4; + break; + } + + case __constant_htons(ETH_P_IPV6): { + const struct ipv6hdr *ip1, *ip2; + + ip1 = (const struct ipv6hdr *) (skb1->data + off1); + ip2 = (const struct ipv6hdr *) (skb2->data + off2); + + ip_proto = ip1->nexthdr; + if (ip_proto != ip2->nexthdr || + ipv6_addr_cmp(&ip1->saddr, &ip2->saddr) || + ipv6_addr_cmp(&ip1->daddr, &ip2->daddr)) + return false; + off1 += 40; + off2 += 40; + } + + default: /* Maybe compare MAC header here? */ + return false; + } + + poff = proto_ports_offset(ip_proto); + if (poff < 0) + return true; + + off1 += poff; + off2 += poff; + + ports1 = (__force u32 *)(skb1->data + off1); + ports2 = (__force u32 *)(skb2->data + off2); + return *ports1 == *ports2; +} + +struct choke_skb_cb { + u16 classid; +}; + +static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb)); + return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data; +} + +static inline void choke_set_classid(struct sk_buff *skb, u16 classid) +{ + choke_skb_cb(skb)->classid = classid; +} + +static u16 choke_get_classid(const struct sk_buff *skb) +{ + return choke_skb_cb(skb)->classid; +} + +/* + * Classify flow using either: + * 1. pre-existing classification result in skb + * 2. fast internal classification + * 3. 
use TC filter based classification + */ +static bool choke_classify(struct sk_buff *skb, + struct Qdisc *sch, int *qerr) + +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct tcf_result res; + int result; + + result = tc_classify(skb, q->filter_list, &res); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return false; + } +#endif + choke_set_classid(skb, TC_H_MIN(res.classid)); + return true; + } + + return false; +} + +/* + * Select a packet at random from queue + * HACK: since queue can have holes from previous deletion; retry several + * times to find a random skb but then just give up and return the head + * Will return NULL if queue is empty (q->head == q->tail) + */ +static struct sk_buff *choke_peek_random(const struct choke_sched_data *q, + unsigned int *pidx) +{ + struct sk_buff *skb; + int retrys = 3; + + do { + *pidx = (q->head + random_N(choke_len(q))) & q->tab_mask; + skb = q->tab[*pidx]; + if (skb) + return skb; + } while (--retrys > 0); + + return q->tab[*pidx = q->head]; +} + +/* + * Compare new packet with random packet in queue + * returns true if matched and sets *pidx + */ +static bool choke_match_random(const struct choke_sched_data *q, + struct sk_buff *nskb, + unsigned int *pidx) +{ + struct sk_buff *oskb; + + if (q->head == q->tail) + return false; + + oskb = choke_peek_random(q, pidx); + if (q->filter_list) + return choke_get_classid(nskb) == choke_get_classid(oskb); + + return choke_match_flow(oskb, nskb); +} + +static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct red_parms *p = &q->parms; + int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + + if (q->filter_list) { + /* If using external classifiers, get result and record it. */ + if (!choke_classify(skb, sch, &ret)) + goto other_drop; /* Packet was eaten by filter */ + } + + /* Compute average queue usage (see RED) */ + p->qavg = red_calc_qavg(p, sch->q.qlen); + if (red_is_idling(p)) + red_end_of_idle_period(p); + + /* Is queue small? 
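 * I.e. is the average backlog at or below qth_min?  If so the
 * packet is admitted without the CHOKe match or RED marking.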
*/ + if (p->qavg <= p->qth_min) + p->qcount = -1; + else { + unsigned int idx; + + /* Draw a packet at random from queue and compare flow */ + if (choke_match_random(q, skb, &idx)) { + q->stats.matched++; + choke_drop_by_idx(sch, idx); + goto congestion_drop; + } + + /* Queue is large, always mark/drop */ + if (p->qavg > p->qth_max) { + p->qcount = -1; + + sch->qstats.overlimits++; + if (use_harddrop(q) || !use_ecn(q) || + !INET_ECN_set_ce(skb)) { + q->stats.forced_drop++; + goto congestion_drop; + } + + q->stats.forced_mark++; + } else if (++p->qcount) { + if (red_mark_probability(p, p->qavg)) { + p->qcount = 0; + p->qR = red_random(p); + + sch->qstats.overlimits++; + if (!use_ecn(q) || !INET_ECN_set_ce(skb)) { + q->stats.prob_drop++; + goto congestion_drop; + } + + q->stats.prob_mark++; + } + } else + p->qR = red_random(p); + } + + /* Admit new packet */ + if (sch->q.qlen < q->limit) { + q->tab[q->tail] = skb; + q->tail = (q->tail + 1) & q->tab_mask; + ++sch->q.qlen; + sch->qstats.backlog += qdisc_pkt_len(skb); + return NET_XMIT_SUCCESS; + } + + q->stats.pdrop++; + sch->qstats.drops++; + kfree_skb(skb); + return NET_XMIT_DROP; + + congestion_drop: + qdisc_drop(skb, sch); + return NET_XMIT_CN; + + other_drop: + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; +} + +static struct sk_buff *choke_dequeue(struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + + if (q->head == q->tail) { + if (!red_is_idling(&q->parms)) + red_start_of_idle_period(&q->parms); + return NULL; + } + + skb = q->tab[q->head]; + q->tab[q->head] = NULL; + choke_zap_head_holes(q); + --sch->q.qlen; + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_bstats_update(sch, skb); + + return skb; +} + +static unsigned int choke_drop(struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + unsigned int len; + + len = qdisc_queue_drop(sch); + if (len > 0) + q->stats.other++; + else { + if (!red_is_idling(&q->parms)) + red_start_of_idle_period(&q->parms); + } + + return len; +} + +static void choke_reset(struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + + red_restart(&q->parms); +} + +static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = { + [TCA_CHOKE_PARMS] = { .len = sizeof(struct tc_red_qopt) }, + [TCA_CHOKE_STAB] = { .len = RED_STAB_SIZE }, +}; + + +static void choke_free(void *addr) +{ + if (addr) { + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); + } +} + +static int choke_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_CHOKE_MAX + 1]; + const struct tc_red_qopt *ctl; + int err; + struct sk_buff **old = NULL; + unsigned int mask; + + if (opt == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy); + if (err < 0) + return err; + + if (tb[TCA_CHOKE_PARMS] == NULL || + tb[TCA_CHOKE_STAB] == NULL) + return -EINVAL; + + ctl = nla_data(tb[TCA_CHOKE_PARMS]); + + if (ctl->limit > CHOKE_MAX_QUEUE) + return -EINVAL; + + mask = roundup_pow_of_two(ctl->limit + 1) - 1; + if (mask != q->tab_mask) { + struct sk_buff **ntab; + + ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL); + if (!ntab) + ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *)); + if (!ntab) + return -ENOMEM; + + sch_tree_lock(sch); + old = q->tab; + if (old) { + unsigned int oqlen = sch->q.qlen, tail = 0; + + while (q->head != q->tail) { + struct sk_buff *skb = q->tab[q->head]; + + q->head = (q->head + 1) & 
q->tab_mask; + if (!skb) + continue; + if (tail < mask) { + ntab[tail++] = skb; + continue; + } + sch->qstats.backlog -= qdisc_pkt_len(skb); + --sch->q.qlen; + qdisc_drop(skb, sch); + } + qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen); + q->head = 0; + q->tail = tail; + } + + q->tab_mask = mask; + q->tab = ntab; + } else + sch_tree_lock(sch); + + q->flags = ctl->flags; + q->limit = ctl->limit; + + red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, + ctl->Plog, ctl->Scell_log, + nla_data(tb[TCA_CHOKE_STAB])); + + if (q->head == q->tail) + red_end_of_idle_period(&q->parms); + + sch_tree_unlock(sch); + choke_free(old); + return 0; +} + +static int choke_init(struct Qdisc *sch, struct nlattr *opt) +{ + return choke_change(sch, opt); +} + +static int choke_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct nlattr *opts = NULL; + struct tc_red_qopt opt = { + .limit = q->limit, + .flags = q->flags, + .qth_min = q->parms.qth_min >> q->parms.Wlog, + .qth_max = q->parms.qth_max >> q->parms.Wlog, + .Wlog = q->parms.Wlog, + .Plog = q->parms.Plog, + .Scell_log = q->parms.Scell_log, + }; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + NLA_PUT(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt); + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; +} + +static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct choke_sched_data *q = qdisc_priv(sch); + struct tc_choke_xstats st = { + .early = q->stats.prob_drop + q->stats.forced_drop, + .marked = q->stats.prob_mark + q->stats.forced_mark, + .pdrop = q->stats.pdrop, + .other = q->stats.other, + .matched = q->stats.matched, + }; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static void choke_destroy(struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + + tcf_destroy_chain(&q->filter_list); + choke_free(q->tab); +} + +static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long choke_get(struct Qdisc *sch, u32 classid) +{ + return 0; +} + +static void choke_put(struct Qdisc *q, unsigned long cl) +{ +} + +static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return 0; +} + +static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct choke_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return &q->filter_list; +} + +static int choke_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + tcm->tcm_handle |= TC_H_MIN(cl); + return 0; +} + +static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + if (!arg->stop) { + if (arg->fn(sch, 1, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } +} + +static const struct Qdisc_class_ops choke_class_ops = { + .leaf = choke_leaf, + .get = choke_get, + .put = choke_put, + .tcf_chain = choke_find_tcf, + .bind_tcf = choke_bind, + .unbind_tcf = choke_put, + .dump = choke_dump_class, + .walk = choke_walk, +}; + +static struct sk_buff *choke_peek_head(struct Qdisc *sch) +{ + struct choke_sched_data *q = qdisc_priv(sch); + + return (q->head != q->tail) ? 
q->tab[q->head] : NULL; +} + +static struct Qdisc_ops choke_qdisc_ops __read_mostly = { + .id = "choke", + .priv_size = sizeof(struct choke_sched_data), + + .enqueue = choke_enqueue, + .dequeue = choke_dequeue, + .peek = choke_peek_head, + .drop = choke_drop, + .init = choke_init, + .destroy = choke_destroy, + .reset = choke_reset, + .change = choke_change, + .dump = choke_dump, + .dump_stats = choke_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init choke_module_init(void) +{ + return register_qdisc(&choke_qdisc_ops); +} + +static void __exit choke_module_exit(void) +{ + unregister_qdisc(&choke_qdisc_ops); +} + +module_init(choke_module_init) +module_exit(choke_module_exit) + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c new file mode 100644 index 00000000..6b7fe4a8 --- /dev/null +++ b/net/sched/sch_drr.c @@ -0,0 +1,525 @@ +/* + * net/sched/sch_drr.c Deficit Round Robin scheduler + * + * Copyright (c) 2008 Patrick McHardy + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct drr_class { + struct Qdisc_class_common common; + unsigned int refcnt; + unsigned int filter_cnt; + + struct gnet_stats_basic_packed bstats; + struct gnet_stats_queue qstats; + struct gnet_stats_rate_est rate_est; + struct list_head alist; + struct Qdisc *qdisc; + + u32 quantum; + u32 deficit; +}; + +struct drr_sched { + struct list_head active; + struct tcf_proto *filter_list; + struct Qdisc_class_hash clhash; +}; + +static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid) +{ + struct drr_sched *q = qdisc_priv(sch); + struct Qdisc_class_common *clc; + + clc = qdisc_class_find(&q->clhash, classid); + if (clc == NULL) + return NULL; + return container_of(clc, struct drr_class, common); +} + +static void drr_purge_queue(struct drr_class *cl) +{ + unsigned int len = cl->qdisc->q.qlen; + + qdisc_reset(cl->qdisc); + qdisc_tree_decrease_qlen(cl->qdisc, len); +} + +static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = { + [TCA_DRR_QUANTUM] = { .type = NLA_U32 }, +}; + +static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + struct nlattr **tca, unsigned long *arg) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl = (struct drr_class *)*arg; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_DRR_MAX + 1]; + u32 quantum; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_DRR_MAX, opt, drr_policy); + if (err < 0) + return err; + + if (tb[TCA_DRR_QUANTUM]) { + quantum = nla_get_u32(tb[TCA_DRR_QUANTUM]); + if (quantum == 0) + return -EINVAL; + } else + quantum = psched_mtu(qdisc_dev(sch)); + + if (cl != NULL) { + if (tca[TCA_RATE]) { + err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) + return err; + } + + sch_tree_lock(sch); + if (tb[TCA_DRR_QUANTUM]) + cl->quantum = quantum; + sch_tree_unlock(sch); + + return 0; + } + + cl = kzalloc(sizeof(struct drr_class), GFP_KERNEL); + if (cl == NULL) + return -ENOBUFS; + + cl->refcnt = 1; + cl->common.classid = classid; + cl->quantum = quantum; + cl->qdisc = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, classid); + if (cl->qdisc == NULL) + cl->qdisc = &noop_qdisc; + + if (tca[TCA_RATE]) { + err = gen_replace_estimator(&cl->bstats, 
&cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) { + qdisc_destroy(cl->qdisc); + kfree(cl); + return err; + } + } + + sch_tree_lock(sch); + qdisc_class_hash_insert(&q->clhash, &cl->common); + sch_tree_unlock(sch); + + qdisc_class_hash_grow(sch, &q->clhash); + + *arg = (unsigned long)cl; + return 0; +} + +static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl) +{ + gen_kill_estimator(&cl->bstats, &cl->rate_est); + qdisc_destroy(cl->qdisc); + kfree(cl); +} + +static int drr_delete_class(struct Qdisc *sch, unsigned long arg) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl = (struct drr_class *)arg; + + if (cl->filter_cnt > 0) + return -EBUSY; + + sch_tree_lock(sch); + + drr_purge_queue(cl); + qdisc_class_hash_remove(&q->clhash, &cl->common); + + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). + */ + + sch_tree_unlock(sch); + return 0; +} + +static unsigned long drr_get_class(struct Qdisc *sch, u32 classid) +{ + struct drr_class *cl = drr_find_class(sch, classid); + + if (cl != NULL) + cl->refcnt++; + + return (unsigned long)cl; +} + +static void drr_put_class(struct Qdisc *sch, unsigned long arg) +{ + struct drr_class *cl = (struct drr_class *)arg; + + if (--cl->refcnt == 0) + drr_destroy_class(sch, cl); +} + +static struct tcf_proto **drr_tcf_chain(struct Qdisc *sch, unsigned long cl) +{ + struct drr_sched *q = qdisc_priv(sch); + + if (cl) + return NULL; + + return &q->filter_list; +} + +static unsigned long drr_bind_tcf(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + struct drr_class *cl = drr_find_class(sch, classid); + + if (cl != NULL) + cl->filter_cnt++; + + return (unsigned long)cl; +} + +static void drr_unbind_tcf(struct Qdisc *sch, unsigned long arg) +{ + struct drr_class *cl = (struct drr_class *)arg; + + cl->filter_cnt--; +} + +static int drr_graft_class(struct Qdisc *sch, unsigned long arg, + struct Qdisc *new, struct Qdisc **old) +{ + struct drr_class *cl = (struct drr_class *)arg; + + if (new == NULL) { + new = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, cl->common.classid); + if (new == NULL) + new = &noop_qdisc; + } + + sch_tree_lock(sch); + drr_purge_queue(cl); + *old = cl->qdisc; + cl->qdisc = new; + sch_tree_unlock(sch); + return 0; +} + +static struct Qdisc *drr_class_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct drr_class *cl = (struct drr_class *)arg; + + return cl->qdisc; +} + +static void drr_qlen_notify(struct Qdisc *csh, unsigned long arg) +{ + struct drr_class *cl = (struct drr_class *)arg; + + if (cl->qdisc->q.qlen == 0) + list_del(&cl->alist); +} + +static int drr_dump_class(struct Qdisc *sch, unsigned long arg, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct drr_class *cl = (struct drr_class *)arg; + struct nlattr *nest; + + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle = cl->common.classid; + tcm->tcm_info = cl->qdisc->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + NLA_PUT_U32(skb, TCA_DRR_QUANTUM, cl->quantum); + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, + struct gnet_dump *d) +{ + struct drr_class *cl = (struct drr_class *)arg; + struct tc_drr_stats xstats; + + memset(&xstats, 0, sizeof(xstats)); + if (cl->qdisc->q.qlen) { + xstats.deficit = cl->deficit; + 
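		/* Only a backlogged class has a meaningful deficit;
		 * idle classes report zero via the memset above.
		 */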
cl->qdisc->qstats.qlen = cl->qdisc->q.qlen; + } + + if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || + gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0) + return -1; + + return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); +} + +static void drr_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl; + struct hlist_node *n; + unsigned int i; + + if (arg->stop) + return; + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, (unsigned long)cl, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } +} + +static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl; + struct tcf_result res; + int result; + + if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) { + cl = drr_find_class(sch, skb->priority); + if (cl != NULL) + return cl; + } + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + result = tc_classify(skb, q->filter_list, &res); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_QUEUED: + case TC_ACT_STOLEN: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return NULL; + } +#endif + cl = (struct drr_class *)res.class; + if (cl == NULL) + cl = drr_find_class(sch, res.classid); + return cl; + } + return NULL; +} + +static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl; + int err; + + cl = drr_classify(skb, sch, &err); + if (cl == NULL) { + if (err & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return err; + } + + err = qdisc_enqueue(skb, cl->qdisc); + if (unlikely(err != NET_XMIT_SUCCESS)) { + if (net_xmit_drop_count(err)) { + cl->qstats.drops++; + sch->qstats.drops++; + } + return err; + } + + if (cl->qdisc->q.qlen == 1) { + list_add_tail(&cl->alist, &q->active); + cl->deficit = cl->quantum; + } + + bstats_update(&cl->bstats, skb); + + sch->q.qlen++; + return err; +} + +static struct sk_buff *drr_dequeue(struct Qdisc *sch) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl; + struct sk_buff *skb; + unsigned int len; + + if (list_empty(&q->active)) + goto out; + while (1) { + cl = list_first_entry(&q->active, struct drr_class, alist); + skb = cl->qdisc->ops->peek(cl->qdisc); + if (skb == NULL) + goto out; + + len = qdisc_pkt_len(skb); + if (len <= cl->deficit) { + cl->deficit -= len; + skb = qdisc_dequeue_peeked(cl->qdisc); + if (cl->qdisc->q.qlen == 0) + list_del(&cl->alist); + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + return skb; + } + + cl->deficit += cl->quantum; + list_move_tail(&cl->alist, &q->active); + } +out: + return NULL; +} + +static unsigned int drr_drop(struct Qdisc *sch) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl; + unsigned int len; + + list_for_each_entry(cl, &q->active, alist) { + if (cl->qdisc->ops->drop) { + len = cl->qdisc->ops->drop(cl->qdisc); + if (len > 0) { + sch->q.qlen--; + if (cl->qdisc->q.qlen == 0) + list_del(&cl->alist); + return len; + } + } + } + return 0; +} + +static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt) +{ + struct drr_sched *q = qdisc_priv(sch); + int err; + + err = qdisc_class_hash_init(&q->clhash); + if (err < 0) + return err; + INIT_LIST_HEAD(&q->active); + return 0; +} 
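/*
 * A minimal user-space sketch of the deficit round-robin policy that
 * drr_dequeue() above implements: every backlogged flow earns its
 * quantum once per round and may transmit as long as the head packet
 * fits in the accumulated deficit.  The flow names, quanta and packet
 * sizes are illustrative assumptions only; the kernel version
 * additionally peeks the child qdisc and maintains the active list
 * incrementally.
 */
#include <stdio.h>

struct flow {
	const char *name;
	unsigned int quantum;	/* bytes credited per round */
	int deficit;		/* bytes the flow may still send */
	unsigned int pkts[4];	/* queued packet sizes, 0 terminates */
	unsigned int head;
};

static void drr_rounds(struct flow *flows, int nflows)
{
	int backlogged = 1;

	while (backlogged) {
		backlogged = 0;
		for (int i = 0; i < nflows; i++) {
			struct flow *f = &flows[i];

			if (f->head >= 4 || !f->pkts[f->head])
				continue;	/* nothing queued */

			f->deficit += f->quantum;
			while (f->head < 4 && f->pkts[f->head] &&
			       f->pkts[f->head] <= (unsigned int)f->deficit) {
				f->deficit -= f->pkts[f->head];
				printf("%s: sent %u bytes, deficit now %d\n",
				       f->name, f->pkts[f->head], f->deficit);
				f->head++;
			}
			if (f->head < 4 && f->pkts[f->head])
				backlogged = 1;
			else
				f->deficit = 0;	/* idle flows keep no credit */
		}
	}
}

int main(void)
{
	struct flow flows[] = {
		{ "flow A", 1500, 0, { 1500, 1500, 300, 0 }, 0 },
		{ "flow B",  500, 0, {  200,  200, 200, 0 }, 0 },
	};

	drr_rounds(flows, 2);
	return 0;
}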
+ +static void drr_reset_qdisc(struct Qdisc *sch) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl; + struct hlist_node *n; + unsigned int i; + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { + if (cl->qdisc->q.qlen) + list_del(&cl->alist); + qdisc_reset(cl->qdisc); + } + } + sch->q.qlen = 0; +} + +static void drr_destroy_qdisc(struct Qdisc *sch) +{ + struct drr_sched *q = qdisc_priv(sch); + struct drr_class *cl; + struct hlist_node *n, *next; + unsigned int i; + + tcf_destroy_chain(&q->filter_list); + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i], + common.hnode) + drr_destroy_class(sch, cl); + } + qdisc_class_hash_destroy(&q->clhash); +} + +static const struct Qdisc_class_ops drr_class_ops = { + .change = drr_change_class, + .delete = drr_delete_class, + .get = drr_get_class, + .put = drr_put_class, + .tcf_chain = drr_tcf_chain, + .bind_tcf = drr_bind_tcf, + .unbind_tcf = drr_unbind_tcf, + .graft = drr_graft_class, + .leaf = drr_class_leaf, + .qlen_notify = drr_qlen_notify, + .dump = drr_dump_class, + .dump_stats = drr_dump_class_stats, + .walk = drr_walk, +}; + +static struct Qdisc_ops drr_qdisc_ops __read_mostly = { + .cl_ops = &drr_class_ops, + .id = "drr", + .priv_size = sizeof(struct drr_sched), + .enqueue = drr_enqueue, + .dequeue = drr_dequeue, + .peek = qdisc_peek_dequeued, + .drop = drr_drop, + .init = drr_init_qdisc, + .reset = drr_reset_qdisc, + .destroy = drr_destroy_qdisc, + .owner = THIS_MODULE, +}; + +static int __init drr_init(void) +{ + return register_qdisc(&drr_qdisc_ops); +} + +static void __exit drr_exit(void) +{ + unregister_qdisc(&drr_qdisc_ops); +} + +module_init(drr_init); +module_exit(drr_exit); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c new file mode 100644 index 00000000..2c790204 --- /dev/null +++ b/net/sched/sch_dsmark.c @@ -0,0 +1,509 @@ +/* net/sched/sch_dsmark.c - Differentiated Services field marker */ + +/* Written 1998-2000 by Werner Almesberger, EPFL ICA */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * classid class marking + * ------- ----- ------- + * n/a 0 n/a + * x:0 1 use entry [0] + * ... ... ... + * x:y y>0 y+1 use entry [y] + * ... ... ... + * x:indices-1 indices use entry [indices-1] + * ... ... ... + * x:y y+1 use entry [y & (indices-1)] + * ... ... ... 
+ * 0xffff 0x10000 use entry [indices-1] + */ + + +#define NO_DEFAULT_INDEX (1 << 16) + +struct dsmark_qdisc_data { + struct Qdisc *q; + struct tcf_proto *filter_list; + u8 *mask; /* "owns" the array */ + u8 *value; + u16 indices; + u32 default_index; /* index range is 0...0xffff */ + int set_tc_index; +}; + +static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index) +{ + return (index <= p->indices && index > 0); +} + +/* ------------------------- Class/flow operations ------------------------- */ + +static int dsmark_graft(struct Qdisc *sch, unsigned long arg, + struct Qdisc *new, struct Qdisc **old) +{ + struct dsmark_qdisc_data *p = qdisc_priv(sch); + + pr_debug("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n", + sch, p, new, old); + + if (new == NULL) { + new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + sch->handle); + if (new == NULL) + new = &noop_qdisc; + } + + sch_tree_lock(sch); + *old = p->q; + p->q = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + sch_tree_unlock(sch); + + return 0; +} + +static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct dsmark_qdisc_data *p = qdisc_priv(sch); + return p->q; +} + +static unsigned long dsmark_get(struct Qdisc *sch, u32 classid) +{ + pr_debug("dsmark_get(sch %p,[qdisc %p],classid %x)\n", + sch, qdisc_priv(sch), classid); + + return TC_H_MIN(classid) + 1; +} + +static unsigned long dsmark_bind_filter(struct Qdisc *sch, + unsigned long parent, u32 classid) +{ + return dsmark_get(sch, classid); +} + +static void dsmark_put(struct Qdisc *sch, unsigned long cl) +{ +} + +static const struct nla_policy dsmark_policy[TCA_DSMARK_MAX + 1] = { + [TCA_DSMARK_INDICES] = { .type = NLA_U16 }, + [TCA_DSMARK_DEFAULT_INDEX] = { .type = NLA_U16 }, + [TCA_DSMARK_SET_TC_INDEX] = { .type = NLA_FLAG }, + [TCA_DSMARK_MASK] = { .type = NLA_U8 }, + [TCA_DSMARK_VALUE] = { .type = NLA_U8 }, +}; + +static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, + struct nlattr **tca, unsigned long *arg) +{ + struct dsmark_qdisc_data *p = qdisc_priv(sch); + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_DSMARK_MAX + 1]; + int err = -EINVAL; + u8 mask = 0; + + pr_debug("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x)," + "arg 0x%lx\n", sch, p, classid, parent, *arg); + + if (!dsmark_valid_index(p, *arg)) { + err = -ENOENT; + goto errout; + } + + if (!opt) + goto errout; + + err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy); + if (err < 0) + goto errout; + + if (tb[TCA_DSMARK_MASK]) + mask = nla_get_u8(tb[TCA_DSMARK_MASK]); + + if (tb[TCA_DSMARK_VALUE]) + p->value[*arg - 1] = nla_get_u8(tb[TCA_DSMARK_VALUE]); + + if (tb[TCA_DSMARK_MASK]) + p->mask[*arg - 1] = mask; + + err = 0; + +errout: + return err; +} + +static int dsmark_delete(struct Qdisc *sch, unsigned long arg) +{ + struct dsmark_qdisc_data *p = qdisc_priv(sch); + + if (!dsmark_valid_index(p, arg)) + return -EINVAL; + + p->mask[arg - 1] = 0xff; + p->value[arg - 1] = 0; + + return 0; +} + +static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ + struct dsmark_qdisc_data *p = qdisc_priv(sch); + int i; + + pr_debug("dsmark_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); + + if (walker->stop) + return; + + for (i = 0; i < p->indices; i++) { + if (p->mask[i] == 0xff && !p->value[i]) + goto ignore; + if (walker->count >= walker->skip) { + if (walker->fn(sch, i + 1, walker) < 0) { + walker->stop = 1; + break; + } + } +ignore: + walker->count++; + } +} + +static inline 
struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch, + unsigned long cl) +{ + struct dsmark_qdisc_data *p = qdisc_priv(sch); + return &p->filter_list; +} + +/* --------------------------- Qdisc operations ---------------------------- */ + +static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct dsmark_qdisc_data *p = qdisc_priv(sch); + int err; + + pr_debug("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); + + if (p->set_tc_index) { + switch (skb->protocol) { + case htons(ETH_P_IP): + if (skb_cow_head(skb, sizeof(struct iphdr))) + goto drop; + + skb->tc_index = ipv4_get_dsfield(ip_hdr(skb)) + & ~INET_ECN_MASK; + break; + + case htons(ETH_P_IPV6): + if (skb_cow_head(skb, sizeof(struct ipv6hdr))) + goto drop; + + skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb)) + & ~INET_ECN_MASK; + break; + default: + skb->tc_index = 0; + break; + } + } + + if (TC_H_MAJ(skb->priority) == sch->handle) + skb->tc_index = TC_H_MIN(skb->priority); + else { + struct tcf_result res; + int result = tc_classify(skb, p->filter_list, &res); + + pr_debug("result %d class 0x%04x\n", result, res.classid); + + switch (result) { +#ifdef CONFIG_NET_CLS_ACT + case TC_ACT_QUEUED: + case TC_ACT_STOLEN: + kfree_skb(skb); + return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + + case TC_ACT_SHOT: + goto drop; +#endif + case TC_ACT_OK: + skb->tc_index = TC_H_MIN(res.classid); + break; + + default: + if (p->default_index != NO_DEFAULT_INDEX) + skb->tc_index = p->default_index; + break; + } + } + + err = qdisc_enqueue(skb, p->q); + if (err != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(err)) + sch->qstats.drops++; + return err; + } + + sch->q.qlen++; + + return NET_XMIT_SUCCESS; + +drop: + kfree_skb(skb); + sch->qstats.drops++; + return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; +} + +static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) +{ + struct dsmark_qdisc_data *p = qdisc_priv(sch); + struct sk_buff *skb; + u32 index; + + pr_debug("dsmark_dequeue(sch %p,[qdisc %p])\n", sch, p); + + skb = p->q->ops->dequeue(p->q); + if (skb == NULL) + return NULL; + + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + + index = skb->tc_index & (p->indices - 1); + pr_debug("index %d->%d\n", skb->tc_index, index); + + switch (skb->protocol) { + case htons(ETH_P_IP): + ipv4_change_dsfield(ip_hdr(skb), p->mask[index], + p->value[index]); + break; + case htons(ETH_P_IPV6): + ipv6_change_dsfield(ipv6_hdr(skb), p->mask[index], + p->value[index]); + break; + default: + /* + * Only complain if a change was actually attempted. + * This way, we can send non-IP traffic through dsmark + * and don't need yet another qdisc as a bypass. 
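 * A mask of 0xff combined with a value of 0 rewrites the field to
 * itself ((old & 0xff) | 0), so only genuine rewrite attempts on
 * unsupported protocols are reported below.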
+ */ + if (p->mask[index] != 0xff || p->value[index]) + pr_warning("dsmark_dequeue: unsupported protocol %d\n", + ntohs(skb->protocol)); + break; + } + + return skb; +} + +static struct sk_buff *dsmark_peek(struct Qdisc *sch) +{ + struct dsmark_qdisc_data *p = qdisc_priv(sch); + + pr_debug("dsmark_peek(sch %p,[qdisc %p])\n", sch, p); + + return p->q->ops->peek(p->q); +} + +static unsigned int dsmark_drop(struct Qdisc *sch) +{ + struct dsmark_qdisc_data *p = qdisc_priv(sch); + unsigned int len; + + pr_debug("dsmark_reset(sch %p,[qdisc %p])\n", sch, p); + + if (p->q->ops->drop == NULL) + return 0; + + len = p->q->ops->drop(p->q); + if (len) + sch->q.qlen--; + + return len; +} + +static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct dsmark_qdisc_data *p = qdisc_priv(sch); + struct nlattr *tb[TCA_DSMARK_MAX + 1]; + int err = -EINVAL; + u32 default_index = NO_DEFAULT_INDEX; + u16 indices; + u8 *mask; + + pr_debug("dsmark_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); + + if (!opt) + goto errout; + + err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy); + if (err < 0) + goto errout; + + err = -EINVAL; + indices = nla_get_u16(tb[TCA_DSMARK_INDICES]); + + if (hweight32(indices) != 1) + goto errout; + + if (tb[TCA_DSMARK_DEFAULT_INDEX]) + default_index = nla_get_u16(tb[TCA_DSMARK_DEFAULT_INDEX]); + + mask = kmalloc(indices * 2, GFP_KERNEL); + if (mask == NULL) { + err = -ENOMEM; + goto errout; + } + + p->mask = mask; + memset(p->mask, 0xff, indices); + + p->value = p->mask + indices; + memset(p->value, 0, indices); + + p->indices = indices; + p->default_index = default_index; + p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]); + + p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle); + if (p->q == NULL) + p->q = &noop_qdisc; + + pr_debug("dsmark_init: qdisc %p\n", p->q); + + err = 0; +errout: + return err; +} + +static void dsmark_reset(struct Qdisc *sch) +{ + struct dsmark_qdisc_data *p = qdisc_priv(sch); + + pr_debug("dsmark_reset(sch %p,[qdisc %p])\n", sch, p); + qdisc_reset(p->q); + sch->q.qlen = 0; +} + +static void dsmark_destroy(struct Qdisc *sch) +{ + struct dsmark_qdisc_data *p = qdisc_priv(sch); + + pr_debug("dsmark_destroy(sch %p,[qdisc %p])\n", sch, p); + + tcf_destroy_chain(&p->filter_list); + qdisc_destroy(p->q); + kfree(p->mask); +} + +static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct dsmark_qdisc_data *p = qdisc_priv(sch); + struct nlattr *opts = NULL; + + pr_debug("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n", sch, p, cl); + + if (!dsmark_valid_index(p, cl)) + return -EINVAL; + + tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl - 1); + tcm->tcm_info = p->q->handle; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + NLA_PUT_U8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]); + NLA_PUT_U8(skb, TCA_DSMARK_VALUE, p->value[cl - 1]); + + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; +} + +static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct dsmark_qdisc_data *p = qdisc_priv(sch); + struct nlattr *opts = NULL; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + NLA_PUT_U16(skb, TCA_DSMARK_INDICES, p->indices); + + if (p->default_index != NO_DEFAULT_INDEX) + NLA_PUT_U16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index); + + if (p->set_tc_index) + NLA_PUT_FLAG(skb, TCA_DSMARK_SET_TC_INDEX); + + return 
nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; +} + +static const struct Qdisc_class_ops dsmark_class_ops = { + .graft = dsmark_graft, + .leaf = dsmark_leaf, + .get = dsmark_get, + .put = dsmark_put, + .change = dsmark_change, + .delete = dsmark_delete, + .walk = dsmark_walk, + .tcf_chain = dsmark_find_tcf, + .bind_tcf = dsmark_bind_filter, + .unbind_tcf = dsmark_put, + .dump = dsmark_dump_class, +}; + +static struct Qdisc_ops dsmark_qdisc_ops __read_mostly = { + .next = NULL, + .cl_ops = &dsmark_class_ops, + .id = "dsmark", + .priv_size = sizeof(struct dsmark_qdisc_data), + .enqueue = dsmark_enqueue, + .dequeue = dsmark_dequeue, + .peek = dsmark_peek, + .drop = dsmark_drop, + .init = dsmark_init, + .reset = dsmark_reset, + .destroy = dsmark_destroy, + .change = NULL, + .dump = dsmark_dump, + .owner = THIS_MODULE, +}; + +static int __init dsmark_module_init(void) +{ + return register_qdisc(&dsmark_qdisc_ops); +} + +static void __exit dsmark_module_exit(void) +{ + unregister_qdisc(&dsmark_qdisc_ops); +} + +module_init(dsmark_module_init) +module_exit(dsmark_module_exit) + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c new file mode 100644 index 00000000..66effe2d --- /dev/null +++ b/net/sched/sch_fifo.c @@ -0,0 +1,179 @@ +/* + * net/sched/sch_fifo.c The simplest FIFO queue. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include + +/* 1 band FIFO pseudo-"scheduler" */ + +static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit)) + return qdisc_enqueue_tail(skb, sch); + + return qdisc_reshape_fail(skb, sch); +} + +static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + if (likely(skb_queue_len(&sch->q) < sch->limit)) + return qdisc_enqueue_tail(skb, sch); + + return qdisc_reshape_fail(skb, sch); +} + +static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + if (likely(skb_queue_len(&sch->q) < sch->limit)) + return qdisc_enqueue_tail(skb, sch); + + /* queue full, remove one skb to fulfill the limit */ + __qdisc_queue_drop_head(sch, &sch->q); + sch->qstats.drops++; + qdisc_enqueue_tail(skb, sch); + + return NET_XMIT_CN; +} + +static int fifo_init(struct Qdisc *sch, struct nlattr *opt) +{ + bool bypass; + bool is_bfifo = sch->ops == &bfifo_qdisc_ops; + + if (opt == NULL) { + u32 limit = qdisc_dev(sch)->tx_queue_len ? 
: 1; + + if (is_bfifo) + limit *= psched_mtu(qdisc_dev(sch)); + + sch->limit = limit; + } else { + struct tc_fifo_qopt *ctl = nla_data(opt); + + if (nla_len(opt) < sizeof(*ctl)) + return -EINVAL; + + sch->limit = ctl->limit; + } + + if (is_bfifo) + bypass = sch->limit >= psched_mtu(qdisc_dev(sch)); + else + bypass = sch->limit >= 1; + + if (bypass) + sch->flags |= TCQ_F_CAN_BYPASS; + else + sch->flags &= ~TCQ_F_CAN_BYPASS; + return 0; +} + +static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct tc_fifo_qopt opt = { .limit = sch->limit }; + + NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + return skb->len; + +nla_put_failure: + return -1; +} + +struct Qdisc_ops pfifo_qdisc_ops __read_mostly = { + .id = "pfifo", + .priv_size = 0, + .enqueue = pfifo_enqueue, + .dequeue = qdisc_dequeue_head, + .peek = qdisc_peek_head, + .drop = qdisc_queue_drop, + .init = fifo_init, + .reset = qdisc_reset_queue, + .change = fifo_init, + .dump = fifo_dump, + .owner = THIS_MODULE, +}; +EXPORT_SYMBOL(pfifo_qdisc_ops); + +struct Qdisc_ops bfifo_qdisc_ops __read_mostly = { + .id = "bfifo", + .priv_size = 0, + .enqueue = bfifo_enqueue, + .dequeue = qdisc_dequeue_head, + .peek = qdisc_peek_head, + .drop = qdisc_queue_drop, + .init = fifo_init, + .reset = qdisc_reset_queue, + .change = fifo_init, + .dump = fifo_dump, + .owner = THIS_MODULE, +}; +EXPORT_SYMBOL(bfifo_qdisc_ops); + +struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = { + .id = "pfifo_head_drop", + .priv_size = 0, + .enqueue = pfifo_tail_enqueue, + .dequeue = qdisc_dequeue_head, + .peek = qdisc_peek_head, + .drop = qdisc_queue_drop_head, + .init = fifo_init, + .reset = qdisc_reset_queue, + .change = fifo_init, + .dump = fifo_dump, + .owner = THIS_MODULE, +}; + +/* Pass size change message down to embedded FIFO */ +int fifo_set_limit(struct Qdisc *q, unsigned int limit) +{ + struct nlattr *nla; + int ret = -ENOMEM; + + /* Hack to avoid sending change message to non-FIFO */ + if (strncmp(q->ops->id + 1, "fifo", 4) != 0) + return 0; + + nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); + if (nla) { + nla->nla_type = RTM_NEWQDISC; + nla->nla_len = nla_attr_size(sizeof(struct tc_fifo_qopt)); + ((struct tc_fifo_qopt *)nla_data(nla))->limit = limit; + + ret = q->ops->change(q, nla); + kfree(nla); + } + return ret; +} +EXPORT_SYMBOL(fifo_set_limit); + +struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, + unsigned int limit) +{ + struct Qdisc *q; + int err = -ENOMEM; + + q = qdisc_create_dflt(sch->dev_queue, ops, TC_H_MAKE(sch->handle, 1)); + if (q) { + err = fifo_set_limit(q, limit); + if (err < 0) { + qdisc_destroy(q); + q = NULL; + } + } + + return q ? : ERR_PTR(err); +} +EXPORT_SYMBOL(fifo_create_dflt); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c new file mode 100644 index 00000000..b4c68090 --- /dev/null +++ b/net/sched/sch_generic.c @@ -0,0 +1,907 @@ +/* + * net/sched/sch_generic.c Generic packet scheduler routines. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * Jamal Hadi Salim, 990601 + * - Ingress support + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Main transmission queue. 
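fifo_init() above picks different default limits when no TCA_OPTIONS are supplied: pfifo counts packets and takes tx_queue_len directly, while bfifo counts bytes and scales tx_queue_len by psched_mtu(). A small standalone calculation, not part of the patch; the 1514-byte figure assumes psched_mtu() works out to MTU plus link-layer header on Ethernet.

/* Standalone sketch of fifo_init()'s default limits.  The 1514-byte
 * per-packet figure is an Ethernet assumption (1500-byte MTU plus a
 * 14-byte link-layer header).
 */
#include <stdio.h>

int main(void)
{
	unsigned int tx_queue_len = 1000;
	unsigned int per_packet_bytes = 1500 + 14;

	printf("pfifo default limit: %u packets\n", tx_queue_len);
	printf("bfifo default limit: %u bytes\n",
	       tx_queue_len * per_packet_bytes);
	return 0;
}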
*/ + +/* Modifications to data participating in scheduling must be protected with + * qdisc_lock(qdisc) spinlock. + * + * The idea is the following: + * - enqueue, dequeue are serialized via qdisc root lock + * - ingress filtering is also serialized via qdisc root lock + * - updates to tree and tree walking are only done under the rtnl mutex. + */ + +static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) +{ + skb_dst_force(skb); + q->gso_skb = skb; + q->qstats.requeues++; + q->q.qlen++; /* it's still part of the queue */ + __netif_schedule(q); + + return 0; +} + +static inline struct sk_buff *dequeue_skb(struct Qdisc *q) +{ + struct sk_buff *skb = q->gso_skb; + + if (unlikely(skb)) { + struct net_device *dev = qdisc_dev(q); + struct netdev_queue *txq; + + /* check the reason of requeuing without tx lock first */ + txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + if (!netif_tx_queue_frozen_or_stopped(txq)) { + q->gso_skb = NULL; + q->q.qlen--; + } else + skb = NULL; + } else { + skb = q->dequeue(q); + } + + return skb; +} + +static inline int handle_dev_cpu_collision(struct sk_buff *skb, + struct netdev_queue *dev_queue, + struct Qdisc *q) +{ + int ret; + + if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) { + /* + * Same CPU holding the lock. It may be a transient + * configuration error, when hard_start_xmit() recurses. We + * detect it by checking xmit owner and drop the packet when + * deadloop is detected. Return OK to try the next skb. + */ + kfree_skb(skb); + if (net_ratelimit()) + pr_warning("Dead loop on netdevice %s, fix it urgently!\n", + dev_queue->dev->name); + ret = qdisc_qlen(q); + } else { + /* + * Another cpu is holding lock, requeue & delay xmits for + * some time. + */ + __this_cpu_inc(softnet_data.cpu_collision); + ret = dev_requeue_skb(skb, q); + } + + return ret; +} + +/* + * Transmit one skb, and handle the return status as required. Holding the + * __QDISC_STATE_RUNNING bit guarantees that only one CPU can execute this + * function. + * + * Returns to the caller: + * 0 - queue is empty or throttled. + * >0 - queue is not empty. + */ +int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, + struct net_device *dev, struct netdev_queue *txq, + spinlock_t *root_lock) +{ + int ret = NETDEV_TX_BUSY; + + /* And release qdisc */ + spin_unlock(root_lock); + + HARD_TX_LOCK(dev, txq, smp_processor_id()); + if (!netif_tx_queue_frozen_or_stopped(txq)) + ret = dev_hard_start_xmit(skb, dev, txq); + + HARD_TX_UNLOCK(dev, txq); + + spin_lock(root_lock); + + if (dev_xmit_complete(ret)) { + /* Driver sent out skb successfully or skb was consumed */ + ret = qdisc_qlen(q); + } else if (ret == NETDEV_TX_LOCKED) { + /* Driver try lock failed */ + ret = handle_dev_cpu_collision(skb, txq, q); + } else { + /* Driver returned NETDEV_TX_BUSY - requeue skb */ + if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit())) + pr_warning("BUG %s code %d qlen %d\n", + dev->name, ret, q->q.qlen); + + ret = dev_requeue_skb(skb, q); + } + + if (ret && netif_tx_queue_frozen_or_stopped(txq)) + ret = 0; + + return ret; +} + +/* + * NOTE: Called under qdisc_lock(q) with locally disabled BH. + * + * __QDISC_STATE_RUNNING guarantees only one CPU can process + * this qdisc at a time. qdisc_lock(q) serializes queue accesses for + * this queue. + * + * netif_tx_lock serializes accesses to device driver. + * + * qdisc_lock(q) and netif_tx_lock are mutually exclusive, + * if one is grabbed, another must be free. 
+ * + * Note, that this procedure can be called by a watchdog timer + * + * Returns to the caller: + * 0 - queue is empty or throttled. + * >0 - queue is not empty. + * + */ +static inline int qdisc_restart(struct Qdisc *q) +{ + struct netdev_queue *txq; + struct net_device *dev; + spinlock_t *root_lock; + struct sk_buff *skb; + + /* Dequeue packet */ + skb = dequeue_skb(q); + if (unlikely(!skb)) + return 0; + WARN_ON_ONCE(skb_dst_is_noref(skb)); + root_lock = qdisc_lock(q); + dev = qdisc_dev(q); + txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + + return sch_direct_xmit(skb, q, dev, txq, root_lock); +} + +void __qdisc_run(struct Qdisc *q) +{ + unsigned long start_time = jiffies; + + while (qdisc_restart(q)) { + /* + * Postpone processing if + * 1. another process needs the CPU; + * 2. we've been doing it for too long. + */ + if (need_resched() || jiffies != start_time) { + __netif_schedule(q); + break; + } + } + + qdisc_run_end(q); +} + +unsigned long dev_trans_start(struct net_device *dev) +{ + unsigned long val, res = dev->trans_start; + unsigned int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + val = netdev_get_tx_queue(dev, i)->trans_start; + if (val && time_after(val, res)) + res = val; + } + dev->trans_start = res; + return res; +} +EXPORT_SYMBOL(dev_trans_start); + +static void dev_watchdog(unsigned long arg) +{ + struct net_device *dev = (struct net_device *)arg; + + netif_tx_lock(dev); + if (!qdisc_tx_is_noop(dev)) { + if (netif_device_present(dev) && + netif_running(dev) && + netif_carrier_ok(dev)) { + int some_queue_timedout = 0; + unsigned int i; + unsigned long trans_start; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *txq; + + txq = netdev_get_tx_queue(dev, i); + /* + * old device drivers set dev->trans_start + */ + trans_start = txq->trans_start ? : dev->trans_start; + if (netif_tx_queue_stopped(txq) && + time_after(jiffies, (trans_start + + dev->watchdog_timeo))) { + some_queue_timedout = 1; + break; + } + } + + if (some_queue_timedout) { + WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n", + dev->name, netdev_drivername(dev), i); + dev->netdev_ops->ndo_tx_timeout(dev); + } + if (!mod_timer(&dev->watchdog_timer, + round_jiffies(jiffies + + dev->watchdog_timeo))) + dev_hold(dev); + } + } + netif_tx_unlock(dev); + + dev_put(dev); +} + +void __netdev_watchdog_up(struct net_device *dev) +{ + if (dev->netdev_ops->ndo_tx_timeout) { + if (dev->watchdog_timeo <= 0) + dev->watchdog_timeo = 5*HZ; + if (!mod_timer(&dev->watchdog_timer, + round_jiffies(jiffies + dev->watchdog_timeo))) + dev_hold(dev); + } +} + +static void dev_watchdog_up(struct net_device *dev) +{ + __netdev_watchdog_up(dev); +} + +static void dev_watchdog_down(struct net_device *dev) +{ + netif_tx_lock_bh(dev); + if (del_timer(&dev->watchdog_timer)) + dev_put(dev); + netif_tx_unlock_bh(dev); +} + +/** + * netif_carrier_on - set carrier + * @dev: network device + * + * Device has detected that carrier. + */ +void netif_carrier_on(struct net_device *dev) +{ + if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) { + if (dev->reg_state == NETREG_UNINITIALIZED) + return; + linkwatch_fire_event(dev); + if (netif_running(dev)) + __netdev_watchdog_up(dev); + } +} +EXPORT_SYMBOL(netif_carrier_on); + +/** + * netif_carrier_off - clear carrier + * @dev: network device + * + * Device has detected loss of carrier. 
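dev_watchdog() above compares txq->trans_start against jiffies with time_after() rather than a plain '>', so the timeout check stays correct when the jiffies counter wraps around. A standalone sketch, not part of the patch; my_time_after() is a local stand-in written on the assumption that it matches the kernel macro's signed-subtraction trick.

/* Standalone sketch: why wrapping timestamps are compared via a signed
 * subtraction.  my_time_after() is a local stand-in for time_after().
 */
#include <stdio.h>

#define my_time_after(a, b)	((long)((b) - (a)) < 0)

int main(void)
{
	unsigned long trans_start = (unsigned long)-10; /* just before wraparound */
	unsigned long now = 5;                          /* just after wraparound  */

	printf("plain '>'   : %d\n", now > trans_start ? 1 : 0);       /* 0, wrong */
	printf("time_after(): %d\n", my_time_after(now, trans_start)); /* 1, right */
	return 0;
}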
+ */ +void netif_carrier_off(struct net_device *dev) +{ + if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) { + if (dev->reg_state == NETREG_UNINITIALIZED) + return; + linkwatch_fire_event(dev); + } +} +EXPORT_SYMBOL(netif_carrier_off); + +/** + * netif_notify_peers - notify network peers about existence of @dev + * @dev: network device + * + * Generate traffic such that interested network peers are aware of + * @dev, such as by generating a gratuitous ARP. This may be used when + * a device wants to inform the rest of the network about some sort of + * reconfiguration such as a failover event or virtual machine + * migration. + */ +void netif_notify_peers(struct net_device *dev) +{ + rtnl_lock(); + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); + rtnl_unlock(); +} +EXPORT_SYMBOL(netif_notify_peers); + +/* "NOOP" scheduler: the best scheduler, recommended for all interfaces + under all circumstances. It is difficult to invent anything faster or + cheaper. + */ + +static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc) +{ + kfree_skb(skb); + return NET_XMIT_CN; +} + +static struct sk_buff *noop_dequeue(struct Qdisc * qdisc) +{ + return NULL; +} + +struct Qdisc_ops noop_qdisc_ops __read_mostly = { + .id = "noop", + .priv_size = 0, + .enqueue = noop_enqueue, + .dequeue = noop_dequeue, + .peek = noop_dequeue, + .owner = THIS_MODULE, +}; + +static struct netdev_queue noop_netdev_queue = { + .qdisc = &noop_qdisc, + .qdisc_sleeping = &noop_qdisc, +}; + +struct Qdisc noop_qdisc = { + .enqueue = noop_enqueue, + .dequeue = noop_dequeue, + .flags = TCQ_F_BUILTIN, + .ops = &noop_qdisc_ops, + .list = LIST_HEAD_INIT(noop_qdisc.list), + .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), + .dev_queue = &noop_netdev_queue, + .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), +}; +EXPORT_SYMBOL(noop_qdisc); + +static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = { + .id = "noqueue", + .priv_size = 0, + .enqueue = noop_enqueue, + .dequeue = noop_dequeue, + .peek = noop_dequeue, + .owner = THIS_MODULE, +}; + +static struct Qdisc noqueue_qdisc; +static struct netdev_queue noqueue_netdev_queue = { + .qdisc = &noqueue_qdisc, + .qdisc_sleeping = &noqueue_qdisc, +}; + +static struct Qdisc noqueue_qdisc = { + .enqueue = NULL, + .dequeue = noop_dequeue, + .flags = TCQ_F_BUILTIN, + .ops = &noqueue_qdisc_ops, + .list = LIST_HEAD_INIT(noqueue_qdisc.list), + .q.lock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock), + .dev_queue = &noqueue_netdev_queue, + .busylock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.busylock), +}; + + +static const u8 prio2band[TC_PRIO_MAX + 1] = { + 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 +}; + +/* 3-band FIFO queue: old style, but should be a bit faster than + generic prio+fifo combination. + */ + +#define PFIFO_FAST_BANDS 3 + +/* + * Private data for a pfifo_fast scheduler containing: + * - queues for the three band + * - bitmap indicating which of the bands contain skbs + */ +struct pfifo_fast_priv { + u32 bitmap; + struct sk_buff_head q[PFIFO_FAST_BANDS]; +}; + +/* + * Convert a bitmap to the first band number where an skb is queued, where: + * bitmap=0 means there are no skbs on any band. + * bitmap=1 means there is an skb on band 0. + * bitmap=7 means there are skbs on all 3 bands, etc. 
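The bitmap2band[] table defined just below encodes exactly the mapping described above: for a non-zero bitmap it returns the index of the lowest set bit, i.e. the highest-priority band that still holds packets. A standalone check, not part of the patch; it assumes a GCC/Clang toolchain for __builtin_ctz().

/* Standalone sketch: bitmap2band[] is "index of the lowest set bit",
 * so the lowest-numbered (highest-priority) non-empty band wins.
 * Assumes GCC/Clang for __builtin_ctz().
 */
#include <assert.h>
#include <stdio.h>

static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};

int main(void)
{
	int bitmap;

	for (bitmap = 1; bitmap < 8; bitmap++)
		assert(bitmap2band[bitmap] == __builtin_ctz(bitmap));

	/* bands 1 and 2 hold skbs (bitmap 0b110): band 1 is served first */
	printf("bitmap=6 -> band %d\n", bitmap2band[6]);
	return 0;
}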
+ */ +static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0}; + +static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv, + int band) +{ + return priv->q + band; +} + +static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc) +{ + if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) { + int band = prio2band[skb->priority & TC_PRIO_MAX]; + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); + struct sk_buff_head *list = band2list(priv, band); + + priv->bitmap |= (1 << band); + qdisc->q.qlen++; + return __qdisc_enqueue_tail(skb, qdisc, list); + } + + return qdisc_drop(skb, qdisc); +} + +static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) +{ + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); + int band = bitmap2band[priv->bitmap]; + + if (likely(band >= 0)) { + struct sk_buff_head *list = band2list(priv, band); + struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list); + + qdisc->q.qlen--; + if (skb_queue_empty(list)) + priv->bitmap &= ~(1 << band); + + return skb; + } + + return NULL; +} + +static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc) +{ + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); + int band = bitmap2band[priv->bitmap]; + + if (band >= 0) { + struct sk_buff_head *list = band2list(priv, band); + + return skb_peek(list); + } + + return NULL; +} + +static void pfifo_fast_reset(struct Qdisc *qdisc) +{ + int prio; + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); + + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) + __qdisc_reset_queue(qdisc, band2list(priv, prio)); + + priv->bitmap = 0; + qdisc->qstats.backlog = 0; + qdisc->q.qlen = 0; +} + +static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) +{ + struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS }; + + memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1); + NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + return skb->len; + +nla_put_failure: + return -1; +} + +static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt) +{ + int prio; + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); + + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) + skb_queue_head_init(band2list(priv, prio)); + + /* Can by-pass the queue discipline */ + qdisc->flags |= TCQ_F_CAN_BYPASS; + return 0; +} + +struct Qdisc_ops pfifo_fast_ops __read_mostly = { + .id = "pfifo_fast", + .priv_size = sizeof(struct pfifo_fast_priv), + .enqueue = pfifo_fast_enqueue, + .dequeue = pfifo_fast_dequeue, + .peek = pfifo_fast_peek, + .init = pfifo_fast_init, + .reset = pfifo_fast_reset, + .dump = pfifo_fast_dump, + .owner = THIS_MODULE, +}; +EXPORT_SYMBOL(pfifo_fast_ops); + +struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, + struct Qdisc_ops *ops) +{ + void *p; + struct Qdisc *sch; + unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size; + int err = -ENOBUFS; + + p = kzalloc_node(size, GFP_KERNEL, + netdev_queue_numa_node_read(dev_queue)); + + if (!p) + goto errout; + sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); + /* if we got non aligned memory, ask more and do alignment ourself */ + if (sch != p) { + kfree(p); + p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL, + netdev_queue_numa_node_read(dev_queue)); + if (!p) + goto errout; + sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); + sch->padded = (char *) sch - (char *) p; + } + INIT_LIST_HEAD(&sch->list); + skb_queue_head_init(&sch->q); + spin_lock_init(&sch->busylock); + sch->ops = ops; + sch->enqueue = ops->enqueue; + sch->dequeue = ops->dequeue; + sch->dev_queue = dev_queue; + 
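qdisc_alloc() above rounds the returned pointer up with QDISC_ALIGN() and, if kzalloc_node() handed back unaligned memory, re-allocates with enough slack to align by hand. The rounding itself is the usual add-then-mask idiom for power-of-two alignments; a standalone sketch follows, not part of the patch, with ALIGN_UP() and the alignment of 64 chosen purely for illustration.

/* Standalone sketch of the align-up idiom behind QDISC_ALIGN():
 * add (alignment - 1), then clear the low bits.  ALIGN_UP() and the
 * alignment value are illustrative, not the kernel's definitions.
 */
#include <stdio.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
	printf("%lu\n", ALIGN_UP(100UL, 64));	/* 128: rounded up   */
	printf("%lu\n", ALIGN_UP(128UL, 64));	/* 128: already fine */
	return 0;
}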
dev_hold(qdisc_dev(sch)); + atomic_set(&sch->refcnt, 1); + + return sch; +errout: + return ERR_PTR(err); +} + +struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, + struct Qdisc_ops *ops, unsigned int parentid) +{ + struct Qdisc *sch; + + sch = qdisc_alloc(dev_queue, ops); + if (IS_ERR(sch)) + goto errout; + sch->parent = parentid; + + if (!ops->init || ops->init(sch, NULL) == 0) + return sch; + + qdisc_destroy(sch); +errout: + return NULL; +} +EXPORT_SYMBOL(qdisc_create_dflt); + +/* Under qdisc_lock(qdisc) and BH! */ + +void qdisc_reset(struct Qdisc *qdisc) +{ + const struct Qdisc_ops *ops = qdisc->ops; + + if (ops->reset) + ops->reset(qdisc); + + if (qdisc->gso_skb) { + kfree_skb(qdisc->gso_skb); + qdisc->gso_skb = NULL; + qdisc->q.qlen = 0; + } +} +EXPORT_SYMBOL(qdisc_reset); + +static void qdisc_rcu_free(struct rcu_head *head) +{ + struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head); + + kfree((char *) qdisc - qdisc->padded); +} + +void qdisc_destroy(struct Qdisc *qdisc) +{ + const struct Qdisc_ops *ops = qdisc->ops; + + if (qdisc->flags & TCQ_F_BUILTIN || + !atomic_dec_and_test(&qdisc->refcnt)) + return; + +#ifdef CONFIG_NET_SCHED + qdisc_list_del(qdisc); + + qdisc_put_stab(rtnl_dereference(qdisc->stab)); +#endif + gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); + if (ops->reset) + ops->reset(qdisc); + if (ops->destroy) + ops->destroy(qdisc); + + module_put(ops->owner); + dev_put(qdisc_dev(qdisc)); + + kfree_skb(qdisc->gso_skb); + /* + * gen_estimator est_timer() might access qdisc->q.lock, + * wait a RCU grace period before freeing qdisc. + */ + call_rcu(&qdisc->rcu_head, qdisc_rcu_free); +} +EXPORT_SYMBOL(qdisc_destroy); + +/* Attach toplevel qdisc to device queue. */ +struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, + struct Qdisc *qdisc) +{ + struct Qdisc *oqdisc = dev_queue->qdisc_sleeping; + spinlock_t *root_lock; + + root_lock = qdisc_lock(oqdisc); + spin_lock_bh(root_lock); + + /* Prune old scheduler */ + if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) + qdisc_reset(oqdisc); + + /* ... 
and graft new one */ + if (qdisc == NULL) + qdisc = &noop_qdisc; + dev_queue->qdisc_sleeping = qdisc; + rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc); + + spin_unlock_bh(root_lock); + + return oqdisc; +} +EXPORT_SYMBOL(dev_graft_qdisc); + +static void attach_one_default_qdisc(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_unused) +{ + struct Qdisc *qdisc = &noqueue_qdisc; + + if (dev->tx_queue_len) { + qdisc = qdisc_create_dflt(dev_queue, + &pfifo_fast_ops, TC_H_ROOT); + if (!qdisc) { + netdev_info(dev, "activation failed\n"); + return; + } + } + dev_queue->qdisc_sleeping = qdisc; +} + +static void attach_default_qdiscs(struct net_device *dev) +{ + struct netdev_queue *txq; + struct Qdisc *qdisc; + + txq = netdev_get_tx_queue(dev, 0); + + if (!netif_is_multiqueue(dev) || dev->tx_queue_len == 0) { + netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); + dev->qdisc = txq->qdisc_sleeping; + atomic_inc(&dev->qdisc->refcnt); + } else { + qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT); + if (qdisc) { + qdisc->ops->attach(qdisc); + dev->qdisc = qdisc; + } + } +} + +static void transition_one_qdisc(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_need_watchdog) +{ + struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping; + int *need_watchdog_p = _need_watchdog; + + if (!(new_qdisc->flags & TCQ_F_BUILTIN)) + clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state); + + rcu_assign_pointer(dev_queue->qdisc, new_qdisc); + if (need_watchdog_p && new_qdisc != &noqueue_qdisc) { + dev_queue->trans_start = 0; + *need_watchdog_p = 1; + } +} + +void dev_activate(struct net_device *dev) +{ + int need_watchdog; + + /* No queueing discipline is attached to device; + create default one i.e. pfifo_fast for devices, + which need queueing and noqueue_qdisc for + virtual interfaces + */ + + if (dev->qdisc == &noop_qdisc) + attach_default_qdiscs(dev); + + if (!netif_carrier_ok(dev)) + /* Delay activation until next carrier-on event */ + return; + + need_watchdog = 0; + netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog); + if (dev_ingress_queue(dev)) + transition_one_qdisc(dev, dev_ingress_queue(dev), NULL); + + if (need_watchdog) { + dev->trans_start = jiffies; + dev_watchdog_up(dev); + } +} +EXPORT_SYMBOL(dev_activate); + +static void dev_deactivate_queue(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_qdisc_default) +{ + struct Qdisc *qdisc_default = _qdisc_default; + struct Qdisc *qdisc; + + qdisc = dev_queue->qdisc; + if (qdisc) { + spin_lock_bh(qdisc_lock(qdisc)); + + if (!(qdisc->flags & TCQ_F_BUILTIN)) + set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state); + + rcu_assign_pointer(dev_queue->qdisc, qdisc_default); + qdisc_reset(qdisc); + + spin_unlock_bh(qdisc_lock(qdisc)); + } +} + +static bool some_qdisc_is_busy(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *dev_queue; + spinlock_t *root_lock; + struct Qdisc *q; + int val; + + dev_queue = netdev_get_tx_queue(dev, i); + q = dev_queue->qdisc_sleeping; + root_lock = qdisc_lock(q); + + spin_lock_bh(root_lock); + + val = (qdisc_is_running(q) || + test_bit(__QDISC_STATE_SCHED, &q->state)); + + spin_unlock_bh(root_lock); + + if (val) + return true; + } + return false; +} + +/** + * dev_deactivate_many - deactivate transmissions on several devices + * @head: list of devices to deactivate + * + * This function returns only when all outstanding transmissions + * have completed, unless all devices are in dismantle 
phase. + */ +void dev_deactivate_many(struct list_head *head) +{ + struct net_device *dev; + bool sync_needed = false; + + list_for_each_entry(dev, head, unreg_list) { + netdev_for_each_tx_queue(dev, dev_deactivate_queue, + &noop_qdisc); + if (dev_ingress_queue(dev)) + dev_deactivate_queue(dev, dev_ingress_queue(dev), + &noop_qdisc); + + dev_watchdog_down(dev); + sync_needed |= !dev->dismantle; + } + + /* Wait for outstanding qdisc-less dev_queue_xmit calls. + * This is avoided if all devices are in dismantle phase : + * Caller will call synchronize_net() for us + */ + if (sync_needed) + synchronize_net(); + + /* Wait for outstanding qdisc_run calls. */ + list_for_each_entry(dev, head, unreg_list) + while (some_qdisc_is_busy(dev)) + yield(); +} + +void dev_deactivate(struct net_device *dev) +{ + LIST_HEAD(single); + + list_add(&dev->unreg_list, &single); + dev_deactivate_many(&single); + list_del(&single); +} +EXPORT_SYMBOL(dev_deactivate); + +static void dev_init_scheduler_queue(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_qdisc) +{ + struct Qdisc *qdisc = _qdisc; + + dev_queue->qdisc = qdisc; + dev_queue->qdisc_sleeping = qdisc; +} + +void dev_init_scheduler(struct net_device *dev) +{ + dev->qdisc = &noop_qdisc; + netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc); + if (dev_ingress_queue(dev)) + dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); + + setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev); +} + +static void shutdown_scheduler_queue(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_qdisc_default) +{ + struct Qdisc *qdisc = dev_queue->qdisc_sleeping; + struct Qdisc *qdisc_default = _qdisc_default; + + if (qdisc) { + rcu_assign_pointer(dev_queue->qdisc, qdisc_default); + dev_queue->qdisc_sleeping = qdisc_default; + + qdisc_destroy(qdisc); + } +} + +void dev_shutdown(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); + if (dev_ingress_queue(dev)) + shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); + qdisc_destroy(dev->qdisc); + dev->qdisc = &noop_qdisc; + + WARN_ON(timer_pending(&dev->watchdog_timer)); +} diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c new file mode 100644 index 00000000..e1afe0c2 --- /dev/null +++ b/net/sched/sch_gred.c @@ -0,0 +1,605 @@ +/* + * net/sched/sch_gred.c Generic Random Early Detection queue. + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: J Hadi Salim (hadi@cyberus.ca) 1998-2002 + * + * 991129: - Bug fix with grio mode + * - a better sing. 
AvgQ mode with Grio(WRED) + * - A finer grained VQ dequeue based on sugestion + * from Ren Liu + * - More error checks + * + * For all the glorious comments look at include/net/red.h + */ + +#include +#include +#include +#include +#include +#include +#include + +#define GRED_DEF_PRIO (MAX_DPs / 2) +#define GRED_VQ_MASK (MAX_DPs - 1) + +struct gred_sched_data; +struct gred_sched; + +struct gred_sched_data { + u32 limit; /* HARD maximal queue length */ + u32 DP; /* the drop pramaters */ + u32 bytesin; /* bytes seen on virtualQ so far*/ + u32 packetsin; /* packets seen on virtualQ so far*/ + u32 backlog; /* bytes on the virtualQ */ + u8 prio; /* the prio of this vq */ + + struct red_parms parms; + struct red_stats stats; +}; + +enum { + GRED_WRED_MODE = 1, + GRED_RIO_MODE, +}; + +struct gred_sched { + struct gred_sched_data *tab[MAX_DPs]; + unsigned long flags; + u32 red_flags; + u32 DPs; + u32 def; + struct red_parms wred_set; +}; + +static inline int gred_wred_mode(struct gred_sched *table) +{ + return test_bit(GRED_WRED_MODE, &table->flags); +} + +static inline void gred_enable_wred_mode(struct gred_sched *table) +{ + __set_bit(GRED_WRED_MODE, &table->flags); +} + +static inline void gred_disable_wred_mode(struct gred_sched *table) +{ + __clear_bit(GRED_WRED_MODE, &table->flags); +} + +static inline int gred_rio_mode(struct gred_sched *table) +{ + return test_bit(GRED_RIO_MODE, &table->flags); +} + +static inline void gred_enable_rio_mode(struct gred_sched *table) +{ + __set_bit(GRED_RIO_MODE, &table->flags); +} + +static inline void gred_disable_rio_mode(struct gred_sched *table) +{ + __clear_bit(GRED_RIO_MODE, &table->flags); +} + +static inline int gred_wred_mode_check(struct Qdisc *sch) +{ + struct gred_sched *table = qdisc_priv(sch); + int i; + + /* Really ugly O(n^2) but shouldn't be necessary too frequent. */ + for (i = 0; i < table->DPs; i++) { + struct gred_sched_data *q = table->tab[i]; + int n; + + if (q == NULL) + continue; + + for (n = 0; n < table->DPs; n++) + if (table->tab[n] && table->tab[n] != q && + table->tab[n]->prio == q->prio) + return 1; + } + + return 0; +} + +static inline unsigned int gred_backlog(struct gred_sched *table, + struct gred_sched_data *q, + struct Qdisc *sch) +{ + if (gred_wred_mode(table)) + return sch->qstats.backlog; + else + return q->backlog; +} + +static inline u16 tc_index_to_dp(struct sk_buff *skb) +{ + return skb->tc_index & GRED_VQ_MASK; +} + +static inline void gred_load_wred_set(struct gred_sched *table, + struct gred_sched_data *q) +{ + q->parms.qavg = table->wred_set.qavg; + q->parms.qidlestart = table->wred_set.qidlestart; +} + +static inline void gred_store_wred_set(struct gred_sched *table, + struct gred_sched_data *q) +{ + table->wred_set.qavg = q->parms.qavg; +} + +static inline int gred_use_ecn(struct gred_sched *t) +{ + return t->red_flags & TC_RED_ECN; +} + +static inline int gred_use_harddrop(struct gred_sched *t) +{ + return t->red_flags & TC_RED_HARDDROP; +} + +static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct gred_sched_data *q = NULL; + struct gred_sched *t = qdisc_priv(sch); + unsigned long qavg = 0; + u16 dp = tc_index_to_dp(skb); + + if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { + dp = t->def; + + q = t->tab[dp]; + if (!q) { + /* Pass through packets not assigned to a DP + * if no default DP has been configured. This + * allows for DP flows to be left untouched. 
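gred_enqueue() above starts by mapping the packet to a virtual queue with tc_index_to_dp(), which simply masks skb->tc_index with GRED_VQ_MASK; that only works because MAX_DPs is a power of two. A standalone sketch, not part of the patch; it assumes MAX_DPs is 16 as in the pkt_sched UAPI header, and the tc_index value is made up.

/* Standalone sketch: GRED selects the virtual queue (DP) by masking
 * tc_index.  MAX_DPs = 16 is an assumption taken from the pkt_sched
 * UAPI header; the tc_index value is made up.
 */
#include <stdio.h>

#define MAX_DPs		16
#define GRED_VQ_MASK	(MAX_DPs - 1)

int main(void)
{
	unsigned short tc_index = 0x123;

	/* 0x123 & 0x0f == 0x03: the packet lands on virtual queue 3 */
	printf("dp = %u\n", (unsigned int)(tc_index & GRED_VQ_MASK));
	return 0;
}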
+ */ + if (skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len) + return qdisc_enqueue_tail(skb, sch); + else + goto drop; + } + + /* fix tc_index? --could be controvesial but needed for + requeueing */ + skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp; + } + + /* sum up all the qaves of prios <= to ours to get the new qave */ + if (!gred_wred_mode(t) && gred_rio_mode(t)) { + int i; + + for (i = 0; i < t->DPs; i++) { + if (t->tab[i] && t->tab[i]->prio < q->prio && + !red_is_idling(&t->tab[i]->parms)) + qavg += t->tab[i]->parms.qavg; + } + + } + + q->packetsin++; + q->bytesin += qdisc_pkt_len(skb); + + if (gred_wred_mode(t)) + gred_load_wred_set(t, q); + + q->parms.qavg = red_calc_qavg(&q->parms, gred_backlog(t, q, sch)); + + if (red_is_idling(&q->parms)) + red_end_of_idle_period(&q->parms); + + if (gred_wred_mode(t)) + gred_store_wred_set(t, q); + + switch (red_action(&q->parms, q->parms.qavg + qavg)) { + case RED_DONT_MARK: + break; + + case RED_PROB_MARK: + sch->qstats.overlimits++; + if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) { + q->stats.prob_drop++; + goto congestion_drop; + } + + q->stats.prob_mark++; + break; + + case RED_HARD_MARK: + sch->qstats.overlimits++; + if (gred_use_harddrop(t) || !gred_use_ecn(t) || + !INET_ECN_set_ce(skb)) { + q->stats.forced_drop++; + goto congestion_drop; + } + q->stats.forced_mark++; + break; + } + + if (q->backlog + qdisc_pkt_len(skb) <= q->limit) { + q->backlog += qdisc_pkt_len(skb); + return qdisc_enqueue_tail(skb, sch); + } + + q->stats.pdrop++; +drop: + return qdisc_drop(skb, sch); + +congestion_drop: + qdisc_drop(skb, sch); + return NET_XMIT_CN; +} + +static struct sk_buff *gred_dequeue(struct Qdisc *sch) +{ + struct sk_buff *skb; + struct gred_sched *t = qdisc_priv(sch); + + skb = qdisc_dequeue_head(sch); + + if (skb) { + struct gred_sched_data *q; + u16 dp = tc_index_to_dp(skb); + + if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { + if (net_ratelimit()) + pr_warning("GRED: Unable to relocate VQ 0x%x " + "after dequeue, screwing up " + "backlog.\n", tc_index_to_dp(skb)); + } else { + q->backlog -= qdisc_pkt_len(skb); + + if (!q->backlog && !gred_wred_mode(t)) + red_start_of_idle_period(&q->parms); + } + + return skb; + } + + if (gred_wred_mode(t) && !red_is_idling(&t->wred_set)) + red_start_of_idle_period(&t->wred_set); + + return NULL; +} + +static unsigned int gred_drop(struct Qdisc *sch) +{ + struct sk_buff *skb; + struct gred_sched *t = qdisc_priv(sch); + + skb = qdisc_dequeue_tail(sch); + if (skb) { + unsigned int len = qdisc_pkt_len(skb); + struct gred_sched_data *q; + u16 dp = tc_index_to_dp(skb); + + if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { + if (net_ratelimit()) + pr_warning("GRED: Unable to relocate VQ 0x%x " + "while dropping, screwing up " + "backlog.\n", tc_index_to_dp(skb)); + } else { + q->backlog -= len; + q->stats.other++; + + if (!q->backlog && !gred_wred_mode(t)) + red_start_of_idle_period(&q->parms); + } + + qdisc_drop(skb, sch); + return len; + } + + if (gred_wred_mode(t) && !red_is_idling(&t->wred_set)) + red_start_of_idle_period(&t->wred_set); + + return 0; + +} + +static void gred_reset(struct Qdisc *sch) +{ + int i; + struct gred_sched *t = qdisc_priv(sch); + + qdisc_reset_queue(sch); + + for (i = 0; i < t->DPs; i++) { + struct gred_sched_data *q = t->tab[i]; + + if (!q) + continue; + + red_restart(&q->parms); + q->backlog = 0; + } +} + +static inline void gred_destroy_vq(struct gred_sched_data *q) +{ + kfree(q); +} + +static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps) +{ + 
struct gred_sched *table = qdisc_priv(sch); + struct tc_gred_sopt *sopt; + int i; + + if (dps == NULL) + return -EINVAL; + + sopt = nla_data(dps); + + if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || sopt->def_DP >= sopt->DPs) + return -EINVAL; + + sch_tree_lock(sch); + table->DPs = sopt->DPs; + table->def = sopt->def_DP; + table->red_flags = sopt->flags; + + /* + * Every entry point to GRED is synchronized with the above code + * and the DP is checked against DPs, i.e. shadowed VQs can no + * longer be found so we can unlock right here. + */ + sch_tree_unlock(sch); + + if (sopt->grio) { + gred_enable_rio_mode(table); + gred_disable_wred_mode(table); + if (gred_wred_mode_check(sch)) + gred_enable_wred_mode(table); + } else { + gred_disable_rio_mode(table); + gred_disable_wred_mode(table); + } + + for (i = table->DPs; i < MAX_DPs; i++) { + if (table->tab[i]) { + pr_warning("GRED: Warning: Destroying " + "shadowed VQ 0x%x\n", i); + gred_destroy_vq(table->tab[i]); + table->tab[i] = NULL; + } + } + + return 0; +} + +static inline int gred_change_vq(struct Qdisc *sch, int dp, + struct tc_gred_qopt *ctl, int prio, u8 *stab) +{ + struct gred_sched *table = qdisc_priv(sch); + struct gred_sched_data *q; + + if (table->tab[dp] == NULL) { + table->tab[dp] = kzalloc(sizeof(*q), GFP_ATOMIC); + if (table->tab[dp] == NULL) + return -ENOMEM; + } + + q = table->tab[dp]; + q->DP = dp; + q->prio = prio; + q->limit = ctl->limit; + + if (q->backlog == 0) + red_end_of_idle_period(&q->parms); + + red_set_parms(&q->parms, + ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog, + ctl->Scell_log, stab); + + return 0; +} + +static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = { + [TCA_GRED_PARMS] = { .len = sizeof(struct tc_gred_qopt) }, + [TCA_GRED_STAB] = { .len = 256 }, + [TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) }, +}; + +static int gred_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct gred_sched *table = qdisc_priv(sch); + struct tc_gred_qopt *ctl; + struct nlattr *tb[TCA_GRED_MAX + 1]; + int err, prio = GRED_DEF_PRIO; + u8 *stab; + + if (opt == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy); + if (err < 0) + return err; + + if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) + return gred_change_table_def(sch, opt); + + if (tb[TCA_GRED_PARMS] == NULL || + tb[TCA_GRED_STAB] == NULL) + return -EINVAL; + + err = -EINVAL; + ctl = nla_data(tb[TCA_GRED_PARMS]); + stab = nla_data(tb[TCA_GRED_STAB]); + + if (ctl->DP >= table->DPs) + goto errout; + + if (gred_rio_mode(table)) { + if (ctl->prio == 0) { + int def_prio = GRED_DEF_PRIO; + + if (table->tab[table->def]) + def_prio = table->tab[table->def]->prio; + + printk(KERN_DEBUG "GRED: DP %u does not have a prio " + "setting default to %d\n", ctl->DP, def_prio); + + prio = def_prio; + } else + prio = ctl->prio; + } + + sch_tree_lock(sch); + + err = gred_change_vq(sch, ctl->DP, ctl, prio, stab); + if (err < 0) + goto errout_locked; + + if (gred_rio_mode(table)) { + gred_disable_wred_mode(table); + if (gred_wred_mode_check(sch)) + gred_enable_wred_mode(table); + } + + err = 0; + +errout_locked: + sch_tree_unlock(sch); +errout: + return err; +} + +static int gred_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct nlattr *tb[TCA_GRED_MAX + 1]; + int err; + + if (opt == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy); + if (err < 0) + return err; + + if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB]) + return -EINVAL; + + return gred_change_table_def(sch, 
tb[TCA_GRED_DPS]); +} + +static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct gred_sched *table = qdisc_priv(sch); + struct nlattr *parms, *opts = NULL; + int i; + struct tc_gred_sopt sopt = { + .DPs = table->DPs, + .def_DP = table->def, + .grio = gred_rio_mode(table), + .flags = table->red_flags, + }; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + NLA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt); + parms = nla_nest_start(skb, TCA_GRED_PARMS); + if (parms == NULL) + goto nla_put_failure; + + for (i = 0; i < MAX_DPs; i++) { + struct gred_sched_data *q = table->tab[i]; + struct tc_gred_qopt opt; + + memset(&opt, 0, sizeof(opt)); + + if (!q) { + /* hack -- fix at some point with proper message + This is how we indicate to tc that there is no VQ + at this DP */ + + opt.DP = MAX_DPs + i; + goto append_opt; + } + + opt.limit = q->limit; + opt.DP = q->DP; + opt.backlog = q->backlog; + opt.prio = q->prio; + opt.qth_min = q->parms.qth_min >> q->parms.Wlog; + opt.qth_max = q->parms.qth_max >> q->parms.Wlog; + opt.Wlog = q->parms.Wlog; + opt.Plog = q->parms.Plog; + opt.Scell_log = q->parms.Scell_log; + opt.other = q->stats.other; + opt.early = q->stats.prob_drop; + opt.forced = q->stats.forced_drop; + opt.pdrop = q->stats.pdrop; + opt.packets = q->packetsin; + opt.bytesin = q->bytesin; + + if (gred_wred_mode(table)) + gred_load_wred_set(table, q); + + opt.qave = red_calc_qavg(&q->parms, q->parms.qavg); + +append_opt: + if (nla_append(skb, sizeof(opt), &opt) < 0) + goto nla_put_failure; + } + + nla_nest_end(skb, parms); + + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; +} + +static void gred_destroy(struct Qdisc *sch) +{ + struct gred_sched *table = qdisc_priv(sch); + int i; + + for (i = 0; i < table->DPs; i++) { + if (table->tab[i]) + gred_destroy_vq(table->tab[i]); + } +} + +static struct Qdisc_ops gred_qdisc_ops __read_mostly = { + .id = "gred", + .priv_size = sizeof(struct gred_sched), + .enqueue = gred_enqueue, + .dequeue = gred_dequeue, + .peek = qdisc_peek_head, + .drop = gred_drop, + .init = gred_init, + .reset = gred_reset, + .destroy = gred_destroy, + .change = gred_change, + .dump = gred_dump, + .owner = THIS_MODULE, +}; + +static int __init gred_module_init(void) +{ + return register_qdisc(&gred_qdisc_ops); +} + +static void __exit gred_module_exit(void) +{ + unregister_qdisc(&gred_qdisc_ops); +} + +module_init(gred_module_init) +module_exit(gred_module_exit) + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c new file mode 100644 index 00000000..6488e642 --- /dev/null +++ b/net/sched/sch_hfsc.c @@ -0,0 +1,1744 @@ +/* + * Copyright (c) 2003 Patrick McHardy, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * 2003-10-17 - Ported from altq + */ +/* + * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software and + * its documentation is hereby granted (including for commercial or + * for-profit use), provided that both the copyright notice and this + * permission notice appear in all copies of the software, derivative + * works, or modified versions, and any portions thereof. 
+ * + * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF + * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS + * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Carnegie Mellon encourages (but does not require) users of this + * software to return any improvements or extensions that they make, + * and to grant Carnegie Mellon the rights to redistribute these + * changes without encumbrance. + */ +/* + * H-FSC is described in Proceedings of SIGCOMM'97, + * "A Hierarchical Fair Service Curve Algorithm for Link-Sharing, + * Real-Time and Priority Service" + * by Ion Stoica, Hui Zhang, and T. S. Eugene Ng. + * + * Oleg Cherevko added the upperlimit for link-sharing. + * when a class has an upperlimit, the fit-time is computed from the + * upperlimit service curve. the link-sharing scheduler does not schedule + * a class whose fit-time exceeds the current time. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * kernel internal service curve representation: + * coordinates are given by 64 bit unsigned integers. + * x-axis: unit is clock count. + * y-axis: unit is byte. + * + * The service curve parameters are converted to the internal + * representation. The slope values are scaled to avoid overflow. + * the inverse slope values as well as the y-projection of the 1st + * segment are kept in order to avoid 64-bit divide operations + * that are expensive on 32-bit architectures. 
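Before the scaled fixed-point representation below, it helps to see the model in plain units: a service curve is two linear segments, slope m1 for the first d microseconds of a backlog period and slope m2 afterwards, and its value at time t is the number of bytes the class is entitled to. A standalone sketch, not part of the patch; sc_value() and the 2 Mbit / 10 ms / 1 Mbit numbers are illustrative.

/* Standalone sketch: a two-segment service curve (m1, d, m2) in plain
 * units, the model that struct internal_sc stores in scaled fixed
 * point.  sc_value() and the sample curve are illustrative only.
 */
#include <stdio.h>

/* bytes of service the curve grants after t_us microseconds */
static double sc_value(double m1_bps, double d_us, double m2_bps, double t_us)
{
	double seg1 = m1_bps / 8 / 1e6;		/* bytes per microsecond */
	double seg2 = m2_bps / 8 / 1e6;

	if (t_us <= d_us)
		return seg1 * t_us;
	return seg1 * d_us + seg2 * (t_us - d_us);
}

int main(void)
{
	/* 2 Mbit/s for the first 10 ms of a backlog period, 1 Mbit/s after */
	printf("%.0f bytes after  5 ms\n", sc_value(2e6, 10000, 1e6, 5000));
	printf("%.0f bytes after 20 ms\n", sc_value(2e6, 10000, 1e6, 20000));
	return 0;
}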
+ */ + +struct internal_sc { + u64 sm1; /* scaled slope of the 1st segment */ + u64 ism1; /* scaled inverse-slope of the 1st segment */ + u64 dx; /* the x-projection of the 1st segment */ + u64 dy; /* the y-projection of the 1st segment */ + u64 sm2; /* scaled slope of the 2nd segment */ + u64 ism2; /* scaled inverse-slope of the 2nd segment */ +}; + +/* runtime service curve */ +struct runtime_sc { + u64 x; /* current starting position on x-axis */ + u64 y; /* current starting position on y-axis */ + u64 sm1; /* scaled slope of the 1st segment */ + u64 ism1; /* scaled inverse-slope of the 1st segment */ + u64 dx; /* the x-projection of the 1st segment */ + u64 dy; /* the y-projection of the 1st segment */ + u64 sm2; /* scaled slope of the 2nd segment */ + u64 ism2; /* scaled inverse-slope of the 2nd segment */ +}; + +enum hfsc_class_flags { + HFSC_RSC = 0x1, + HFSC_FSC = 0x2, + HFSC_USC = 0x4 +}; + +struct hfsc_class { + struct Qdisc_class_common cl_common; + unsigned int refcnt; /* usage count */ + + struct gnet_stats_basic_packed bstats; + struct gnet_stats_queue qstats; + struct gnet_stats_rate_est rate_est; + unsigned int level; /* class level in hierarchy */ + struct tcf_proto *filter_list; /* filter list */ + unsigned int filter_cnt; /* filter count */ + + struct hfsc_sched *sched; /* scheduler data */ + struct hfsc_class *cl_parent; /* parent class */ + struct list_head siblings; /* sibling classes */ + struct list_head children; /* child classes */ + struct Qdisc *qdisc; /* leaf qdisc */ + + struct rb_node el_node; /* qdisc's eligible tree member */ + struct rb_root vt_tree; /* active children sorted by cl_vt */ + struct rb_node vt_node; /* parent's vt_tree member */ + struct rb_root cf_tree; /* active children sorted by cl_f */ + struct rb_node cf_node; /* parent's cf_heap member */ + struct list_head dlist; /* drop list member */ + + u64 cl_total; /* total work in bytes */ + u64 cl_cumul; /* cumulative work in bytes done by + real-time criteria */ + + u64 cl_d; /* deadline*/ + u64 cl_e; /* eligible time */ + u64 cl_vt; /* virtual time */ + u64 cl_f; /* time when this class will fit for + link-sharing, max(myf, cfmin) */ + u64 cl_myf; /* my fit-time (calculated from this + class's own upperlimit curve) */ + u64 cl_myfadj; /* my fit-time adjustment (to cancel + history dependence) */ + u64 cl_cfmin; /* earliest children's fit-time (used + with cl_myf to obtain cl_f) */ + u64 cl_cvtmin; /* minimal virtual time among the + children fit for link-sharing + (monotonic within a period) */ + u64 cl_vtadj; /* intra-period cumulative vt + adjustment */ + u64 cl_vtoff; /* inter-period cumulative vt offset */ + u64 cl_cvtmax; /* max child's vt in the last period */ + u64 cl_cvtoff; /* cumulative cvtmax of all periods */ + u64 cl_pcvtoff; /* parent's cvtoff at initialization + time */ + + struct internal_sc cl_rsc; /* internal real-time service curve */ + struct internal_sc cl_fsc; /* internal fair service curve */ + struct internal_sc cl_usc; /* internal upperlimit service curve */ + struct runtime_sc cl_deadline; /* deadline curve */ + struct runtime_sc cl_eligible; /* eligible curve */ + struct runtime_sc cl_virtual; /* virtual curve */ + struct runtime_sc cl_ulimit; /* upperlimit curve */ + + unsigned long cl_flags; /* which curves are valid */ + unsigned long cl_vtperiod; /* vt period sequence number */ + unsigned long cl_parentperiod;/* parent's vt period sequence number*/ + unsigned long cl_nactive; /* number of active children */ +}; + +struct hfsc_sched { + u16 defcls; /* default 
class id */ + struct hfsc_class root; /* root class */ + struct Qdisc_class_hash clhash; /* class hash */ + struct rb_root eligible; /* eligible tree */ + struct list_head droplist; /* active leaf class list (for + dropping) */ + struct qdisc_watchdog watchdog; /* watchdog timer */ +}; + +#define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */ + + +/* + * eligible tree holds backlogged classes being sorted by their eligible times. + * there is one eligible tree per hfsc instance. + */ + +static void +eltree_insert(struct hfsc_class *cl) +{ + struct rb_node **p = &cl->sched->eligible.rb_node; + struct rb_node *parent = NULL; + struct hfsc_class *cl1; + + while (*p != NULL) { + parent = *p; + cl1 = rb_entry(parent, struct hfsc_class, el_node); + if (cl->cl_e >= cl1->cl_e) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&cl->el_node, parent, p); + rb_insert_color(&cl->el_node, &cl->sched->eligible); +} + +static inline void +eltree_remove(struct hfsc_class *cl) +{ + rb_erase(&cl->el_node, &cl->sched->eligible); +} + +static inline void +eltree_update(struct hfsc_class *cl) +{ + eltree_remove(cl); + eltree_insert(cl); +} + +/* find the class with the minimum deadline among the eligible classes */ +static inline struct hfsc_class * +eltree_get_mindl(struct hfsc_sched *q, u64 cur_time) +{ + struct hfsc_class *p, *cl = NULL; + struct rb_node *n; + + for (n = rb_first(&q->eligible); n != NULL; n = rb_next(n)) { + p = rb_entry(n, struct hfsc_class, el_node); + if (p->cl_e > cur_time) + break; + if (cl == NULL || p->cl_d < cl->cl_d) + cl = p; + } + return cl; +} + +/* find the class with minimum eligible time among the eligible classes */ +static inline struct hfsc_class * +eltree_get_minel(struct hfsc_sched *q) +{ + struct rb_node *n; + + n = rb_first(&q->eligible); + if (n == NULL) + return NULL; + return rb_entry(n, struct hfsc_class, el_node); +} + +/* + * vttree holds holds backlogged child classes being sorted by their virtual + * time. each intermediate class has one vttree. + */ +static void +vttree_insert(struct hfsc_class *cl) +{ + struct rb_node **p = &cl->cl_parent->vt_tree.rb_node; + struct rb_node *parent = NULL; + struct hfsc_class *cl1; + + while (*p != NULL) { + parent = *p; + cl1 = rb_entry(parent, struct hfsc_class, vt_node); + if (cl->cl_vt >= cl1->cl_vt) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&cl->vt_node, parent, p); + rb_insert_color(&cl->vt_node, &cl->cl_parent->vt_tree); +} + +static inline void +vttree_remove(struct hfsc_class *cl) +{ + rb_erase(&cl->vt_node, &cl->cl_parent->vt_tree); +} + +static inline void +vttree_update(struct hfsc_class *cl) +{ + vttree_remove(cl); + vttree_insert(cl); +} + +static inline struct hfsc_class * +vttree_firstfit(struct hfsc_class *cl, u64 cur_time) +{ + struct hfsc_class *p; + struct rb_node *n; + + for (n = rb_first(&cl->vt_tree); n != NULL; n = rb_next(n)) { + p = rb_entry(n, struct hfsc_class, vt_node); + if (p->cl_f <= cur_time) + return p; + } + return NULL; +} + +/* + * get the leaf class with the minimum vt in the hierarchy + */ +static struct hfsc_class * +vttree_get_minvt(struct hfsc_class *cl, u64 cur_time) +{ + /* if root-class's cfmin is bigger than cur_time nothing to do */ + if (cl->cl_cfmin > cur_time) + return NULL; + + while (cl->level > 0) { + cl = vttree_firstfit(cl, cur_time); + if (cl == NULL) + return NULL; + /* + * update parent's cl_cvtmin. 
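eltree_get_mindl() above implements the real-time selection rule: walk the classes in eligible-time order, skip any whose eligible time lies in the future, and among the rest pick the smallest deadline. A standalone sketch of the same rule with a flat array standing in for the rbtree, not part of the patch; the struct name and sample numbers are made up.

/* Standalone sketch of the eltree_get_mindl() rule: among classes
 * whose eligible time (e) has passed, pick the smallest deadline (d).
 * A flat array stands in for the rbtree; all numbers are made up.
 */
#include <stdio.h>

struct toy_class {
	unsigned long long e;	/* eligible time */
	unsigned long long d;	/* deadline      */
};

int main(void)
{
	struct toy_class cl[] = { { 10, 40 }, { 15, 25 }, { 70, 20 } };
	unsigned long long cur_time = 30;
	int i, best = -1;

	for (i = 0; i < 3; i++) {
		if (cl[i].e > cur_time)
			continue;	/* not yet eligible */
		if (best < 0 || cl[i].d < cl[best].d)
			best = i;
	}

	printf("dequeue class %d (deadline %llu)\n", best, cl[best].d);
	return 0;
}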
+ */ + if (cl->cl_parent->cl_cvtmin < cl->cl_vt) + cl->cl_parent->cl_cvtmin = cl->cl_vt; + } + return cl; +} + +static void +cftree_insert(struct hfsc_class *cl) +{ + struct rb_node **p = &cl->cl_parent->cf_tree.rb_node; + struct rb_node *parent = NULL; + struct hfsc_class *cl1; + + while (*p != NULL) { + parent = *p; + cl1 = rb_entry(parent, struct hfsc_class, cf_node); + if (cl->cl_f >= cl1->cl_f) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&cl->cf_node, parent, p); + rb_insert_color(&cl->cf_node, &cl->cl_parent->cf_tree); +} + +static inline void +cftree_remove(struct hfsc_class *cl) +{ + rb_erase(&cl->cf_node, &cl->cl_parent->cf_tree); +} + +static inline void +cftree_update(struct hfsc_class *cl) +{ + cftree_remove(cl); + cftree_insert(cl); +} + +/* + * service curve support functions + * + * external service curve parameters + * m: bps + * d: us + * internal service curve parameters + * sm: (bytes/psched_us) << SM_SHIFT + * ism: (psched_us/byte) << ISM_SHIFT + * dx: psched_us + * + * The clock source resolution with ktime and PSCHED_SHIFT 10 is 1.024us. + * + * sm and ism are scaled in order to keep effective digits. + * SM_SHIFT and ISM_SHIFT are selected to keep at least 4 effective + * digits in decimal using the following table. + * + * bits/sec 100Kbps 1Mbps 10Mbps 100Mbps 1Gbps + * ------------+------------------------------------------------------- + * bytes/1.024us 12.8e-3 128e-3 1280e-3 12800e-3 128000e-3 + * + * 1.024us/byte 78.125 7.8125 0.78125 0.078125 0.0078125 + * + * So, for PSCHED_SHIFT 10 we need: SM_SHIFT 20, ISM_SHIFT 18. + */ +#define SM_SHIFT (30 - PSCHED_SHIFT) +#define ISM_SHIFT (8 + PSCHED_SHIFT) + +#define SM_MASK ((1ULL << SM_SHIFT) - 1) +#define ISM_MASK ((1ULL << ISM_SHIFT) - 1) + +static inline u64 +seg_x2y(u64 x, u64 sm) +{ + u64 y; + + /* + * compute + * y = x * sm >> SM_SHIFT + * but divide it for the upper and lower bits to avoid overflow + */ + y = (x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT); + return y; +} + +static inline u64 +seg_y2x(u64 y, u64 ism) +{ + u64 x; + + if (y == 0) + x = 0; + else if (ism == HT_INFINITY) + x = HT_INFINITY; + else { + x = (y >> ISM_SHIFT) * ism + + (((y & ISM_MASK) * ism) >> ISM_SHIFT); + } + return x; +} + +/* Convert m (bps) into sm (bytes/psched us) */ +static u64 +m2sm(u32 m) +{ + u64 sm; + + sm = ((u64)m << SM_SHIFT); + sm += PSCHED_TICKS_PER_SEC - 1; + do_div(sm, PSCHED_TICKS_PER_SEC); + return sm; +} + +/* convert m (bps) into ism (psched us/byte) */ +static u64 +m2ism(u32 m) +{ + u64 ism; + + if (m == 0) + ism = HT_INFINITY; + else { + ism = ((u64)PSCHED_TICKS_PER_SEC << ISM_SHIFT); + ism += m - 1; + do_div(ism, m); + } + return ism; +} + +/* convert d (us) into dx (psched us) */ +static u64 +d2dx(u32 d) +{ + u64 dx; + + dx = ((u64)d * PSCHED_TICKS_PER_SEC); + dx += USEC_PER_SEC - 1; + do_div(dx, USEC_PER_SEC); + return dx; +} + +/* convert sm (bytes/psched us) into m (bps) */ +static u32 +sm2m(u64 sm) +{ + u64 m; + + m = (sm * PSCHED_TICKS_PER_SEC) >> SM_SHIFT; + return (u32)m; +} + +/* convert dx (psched us) into d (us) */ +static u32 +dx2d(u64 dx) +{ + u64 d; + + d = dx * USEC_PER_SEC; + do_div(d, PSCHED_TICKS_PER_SEC); + return (u32)d; +} + +static void +sc2isc(struct tc_service_curve *sc, struct internal_sc *isc) +{ + isc->sm1 = m2sm(sc->m1); + isc->ism1 = m2ism(sc->m1); + isc->dx = d2dx(sc->d); + isc->dy = seg_x2y(isc->dx, isc->sm1); + isc->sm2 = m2sm(sc->m2); + isc->ism2 = m2ism(sc->m2); +} + +/* + * initialize the runtime service curve with the given internal 
+ * service curve starting at (x, y). + */ +static void +rtsc_init(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y) +{ + rtsc->x = x; + rtsc->y = y; + rtsc->sm1 = isc->sm1; + rtsc->ism1 = isc->ism1; + rtsc->dx = isc->dx; + rtsc->dy = isc->dy; + rtsc->sm2 = isc->sm2; + rtsc->ism2 = isc->ism2; +} + +/* + * calculate the y-projection of the runtime service curve by the + * given x-projection value + */ +static u64 +rtsc_y2x(struct runtime_sc *rtsc, u64 y) +{ + u64 x; + + if (y < rtsc->y) + x = rtsc->x; + else if (y <= rtsc->y + rtsc->dy) { + /* x belongs to the 1st segment */ + if (rtsc->dy == 0) + x = rtsc->x + rtsc->dx; + else + x = rtsc->x + seg_y2x(y - rtsc->y, rtsc->ism1); + } else { + /* x belongs to the 2nd segment */ + x = rtsc->x + rtsc->dx + + seg_y2x(y - rtsc->y - rtsc->dy, rtsc->ism2); + } + return x; +} + +static u64 +rtsc_x2y(struct runtime_sc *rtsc, u64 x) +{ + u64 y; + + if (x <= rtsc->x) + y = rtsc->y; + else if (x <= rtsc->x + rtsc->dx) + /* y belongs to the 1st segment */ + y = rtsc->y + seg_x2y(x - rtsc->x, rtsc->sm1); + else + /* y belongs to the 2nd segment */ + y = rtsc->y + rtsc->dy + + seg_x2y(x - rtsc->x - rtsc->dx, rtsc->sm2); + return y; +} + +/* + * update the runtime service curve by taking the minimum of the current + * runtime service curve and the service curve starting at (x, y). + */ +static void +rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y) +{ + u64 y1, y2, dx, dy; + u32 dsm; + + if (isc->sm1 <= isc->sm2) { + /* service curve is convex */ + y1 = rtsc_x2y(rtsc, x); + if (y1 < y) + /* the current rtsc is smaller */ + return; + rtsc->x = x; + rtsc->y = y; + return; + } + + /* + * service curve is concave + * compute the two y values of the current rtsc + * y1: at x + * y2: at (x + dx) + */ + y1 = rtsc_x2y(rtsc, x); + if (y1 <= y) { + /* rtsc is below isc, no change to rtsc */ + return; + } + + y2 = rtsc_x2y(rtsc, x + isc->dx); + if (y2 >= y + isc->dy) { + /* rtsc is above isc, replace rtsc by isc */ + rtsc->x = x; + rtsc->y = y; + rtsc->dx = isc->dx; + rtsc->dy = isc->dy; + return; + } + + /* + * the two curves intersect + * compute the offsets (dx, dy) using the reverse + * function of seg_x2y() + * seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y) + */ + dx = (y1 - y) << SM_SHIFT; + dsm = isc->sm1 - isc->sm2; + do_div(dx, dsm); + /* + * check if (x, y1) belongs to the 1st segment of rtsc. + * if so, add the offset. + */ + if (rtsc->x + rtsc->dx > x) + dx += rtsc->x + rtsc->dx - x; + dy = seg_x2y(dx, isc->sm1); + + rtsc->x = x; + rtsc->y = y; + rtsc->dx = dx; + rtsc->dy = dy; +} + +static void +init_ed(struct hfsc_class *cl, unsigned int next_len) +{ + u64 cur_time = psched_get_time(); + + /* update the deadline curve */ + rtsc_min(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul); + + /* + * update the eligible curve. + * for concave, it is equal to the deadline curve. + * for convex, it is a linear curve with slope m2. 
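To make the fixed-point arithmetic above concrete, here is a minimal userspace sketch (illustrative only, not part of the patch) that converts an (m1, d, m2) service curve with the same SM_SHIFT scaling used by m2sm()/d2dx()/sc2isc() and then evaluates it the way rtsc_x2y() walks the two segments. It assumes PSCHED_SHIFT == 10 (one tick = 1.024 us, as in the table above) and takes rates as bytes per second, which is how m2sm() consumes them; sc_x2y() and the sample numbers are invented for illustration.

#include <stdio.h>
#include <stdint.h>

#define PSCHED_SHIFT	10
#define SM_SHIFT	(30 - PSCHED_SHIFT)
#define SM_MASK		((1ULL << SM_SHIFT) - 1)
/* one scheduler tick is 2^PSCHED_SHIFT ns = 1.024 us */
#define TICKS_PER_SEC	(1000000000ULL >> PSCHED_SHIFT)

/* same scaling as m2sm(): bytes/s -> (bytes/tick) << SM_SHIFT, rounded up */
static uint64_t m2sm(uint32_t m)
{
	return (((uint64_t)m << SM_SHIFT) + TICKS_PER_SEC - 1) / TICKS_PER_SEC;
}

/* same as d2dx(): microseconds -> scheduler ticks, rounded up */
static uint64_t d2dx(uint32_t d)
{
	return ((uint64_t)d * TICKS_PER_SEC + 1000000 - 1) / 1000000;
}

/* y = x * sm >> SM_SHIFT, split like seg_x2y() to avoid 64-bit overflow */
static uint64_t seg_x2y(uint64_t x, uint64_t sm)
{
	return (x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT);
}

/* bytes of service the curve (m1, d, m2) allows after "ticks" of time */
static uint64_t sc_x2y(uint32_t m1, uint32_t d_us, uint32_t m2, uint64_t ticks)
{
	uint64_t sm1 = m2sm(m1), sm2 = m2sm(m2);
	uint64_t dx = d2dx(d_us), dy = seg_x2y(dx, sm1);

	if (ticks <= dx)			/* first (burst) segment */
		return seg_x2y(ticks, sm1);
	return dy + seg_x2y(ticks - dx, sm2);	/* second (steady) segment */
}

int main(void)
{
	/* sample curve: 1 Mbit/s (125000 bytes/s) for 10 ms, then 100 kbit/s */
	uint32_t m1 = 125000, d = 10000, m2 = 12500;

	printf("after  5 ms: %llu bytes\n",
	       (unsigned long long)sc_x2y(m1, d, m2, d2dx(5000)));
	printf("after 20 ms: %llu bytes\n",
	       (unsigned long long)sc_x2y(m1, d, m2, d2dx(20000)));
	return 0;
}

For this sample curve the sketch prints roughly 625 bytes at 5 ms and 1375 bytes at 20 ms: the burst segment contributes up to 1250 bytes in the first 10 ms, after which the steady-state slope m2 takes over.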
+ */ + cl->cl_eligible = cl->cl_deadline; + if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) { + cl->cl_eligible.dx = 0; + cl->cl_eligible.dy = 0; + } + + /* compute e and d */ + cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); + + eltree_insert(cl); +} + +static void +update_ed(struct hfsc_class *cl, unsigned int next_len) +{ + cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); + + eltree_update(cl); +} + +static inline void +update_d(struct hfsc_class *cl, unsigned int next_len) +{ + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); +} + +static inline void +update_cfmin(struct hfsc_class *cl) +{ + struct rb_node *n = rb_first(&cl->cf_tree); + struct hfsc_class *p; + + if (n == NULL) { + cl->cl_cfmin = 0; + return; + } + p = rb_entry(n, struct hfsc_class, cf_node); + cl->cl_cfmin = p->cl_f; +} + +static void +init_vf(struct hfsc_class *cl, unsigned int len) +{ + struct hfsc_class *max_cl; + struct rb_node *n; + u64 vt, f, cur_time; + int go_active; + + cur_time = 0; + go_active = 1; + for (; cl->cl_parent != NULL; cl = cl->cl_parent) { + if (go_active && cl->cl_nactive++ == 0) + go_active = 1; + else + go_active = 0; + + if (go_active) { + n = rb_last(&cl->cl_parent->vt_tree); + if (n != NULL) { + max_cl = rb_entry(n, struct hfsc_class, vt_node); + /* + * set vt to the average of the min and max + * classes. if the parent's period didn't + * change, don't decrease vt of the class. + */ + vt = max_cl->cl_vt; + if (cl->cl_parent->cl_cvtmin != 0) + vt = (cl->cl_parent->cl_cvtmin + vt)/2; + + if (cl->cl_parent->cl_vtperiod != + cl->cl_parentperiod || vt > cl->cl_vt) + cl->cl_vt = vt; + } else { + /* + * first child for a new parent backlog period. + * add parent's cvtmax to cvtoff to make a new + * vt (vtoff + vt) larger than the vt in the + * last period for all children. 
+ */ + vt = cl->cl_parent->cl_cvtmax; + cl->cl_parent->cl_cvtoff += vt; + cl->cl_parent->cl_cvtmax = 0; + cl->cl_parent->cl_cvtmin = 0; + cl->cl_vt = 0; + } + + cl->cl_vtoff = cl->cl_parent->cl_cvtoff - + cl->cl_pcvtoff; + + /* update the virtual curve */ + vt = cl->cl_vt + cl->cl_vtoff; + rtsc_min(&cl->cl_virtual, &cl->cl_fsc, vt, + cl->cl_total); + if (cl->cl_virtual.x == vt) { + cl->cl_virtual.x -= cl->cl_vtoff; + cl->cl_vtoff = 0; + } + cl->cl_vtadj = 0; + + cl->cl_vtperiod++; /* increment vt period */ + cl->cl_parentperiod = cl->cl_parent->cl_vtperiod; + if (cl->cl_parent->cl_nactive == 0) + cl->cl_parentperiod++; + cl->cl_f = 0; + + vttree_insert(cl); + cftree_insert(cl); + + if (cl->cl_flags & HFSC_USC) { + /* class has upper limit curve */ + if (cur_time == 0) + cur_time = psched_get_time(); + + /* update the ulimit curve */ + rtsc_min(&cl->cl_ulimit, &cl->cl_usc, cur_time, + cl->cl_total); + /* compute myf */ + cl->cl_myf = rtsc_y2x(&cl->cl_ulimit, + cl->cl_total); + cl->cl_myfadj = 0; + } + } + + f = max(cl->cl_myf, cl->cl_cfmin); + if (f != cl->cl_f) { + cl->cl_f = f; + cftree_update(cl); + } + update_cfmin(cl->cl_parent); + } +} + +static void +update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) +{ + u64 f; /* , myf_bound, delta; */ + int go_passive = 0; + + if (cl->qdisc->q.qlen == 0 && cl->cl_flags & HFSC_FSC) + go_passive = 1; + + for (; cl->cl_parent != NULL; cl = cl->cl_parent) { + cl->cl_total += len; + + if (!(cl->cl_flags & HFSC_FSC) || cl->cl_nactive == 0) + continue; + + if (go_passive && --cl->cl_nactive == 0) + go_passive = 1; + else + go_passive = 0; + + if (go_passive) { + /* no more active child, going passive */ + + /* update cvtmax of the parent class */ + if (cl->cl_vt > cl->cl_parent->cl_cvtmax) + cl->cl_parent->cl_cvtmax = cl->cl_vt; + + /* remove this class from the vt tree */ + vttree_remove(cl); + + cftree_remove(cl); + update_cfmin(cl->cl_parent); + + continue; + } + + /* + * update vt and f + */ + cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) + - cl->cl_vtoff + cl->cl_vtadj; + + /* + * if vt of the class is smaller than cvtmin, + * the class was skipped in the past due to non-fit. + * if so, we need to adjust vtadj. + */ + if (cl->cl_vt < cl->cl_parent->cl_cvtmin) { + cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt; + cl->cl_vt = cl->cl_parent->cl_cvtmin; + } + + /* update the vt tree */ + vttree_update(cl); + + if (cl->cl_flags & HFSC_USC) { + cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit, + cl->cl_total); +#if 0 + /* + * This code causes classes to stay way under their + * limit when multiple classes are used at gigabit + * speed. needs investigation. -kaber + */ + /* + * if myf lags behind by more than one clock tick + * from the current time, adjust myfadj to prevent + * a rate-limited class from going greedy. + * in a steady state under rate-limiting, myf + * fluctuates within one clock tick. 
+ */ + myf_bound = cur_time - PSCHED_JIFFIE2US(1); + if (cl->cl_myf < myf_bound) { + delta = cur_time - cl->cl_myf; + cl->cl_myfadj += delta; + cl->cl_myf += delta; + } +#endif + } + + f = max(cl->cl_myf, cl->cl_cfmin); + if (f != cl->cl_f) { + cl->cl_f = f; + cftree_update(cl); + update_cfmin(cl->cl_parent); + } + } +} + +static void +set_active(struct hfsc_class *cl, unsigned int len) +{ + if (cl->cl_flags & HFSC_RSC) + init_ed(cl, len); + if (cl->cl_flags & HFSC_FSC) + init_vf(cl, len); + + list_add_tail(&cl->dlist, &cl->sched->droplist); +} + +static void +set_passive(struct hfsc_class *cl) +{ + if (cl->cl_flags & HFSC_RSC) + eltree_remove(cl); + + list_del(&cl->dlist); + + /* + * vttree is now handled in update_vf() so that update_vf(cl, 0, 0) + * needs to be called explicitly to remove a class from vttree. + */ +} + +static unsigned int +qdisc_peek_len(struct Qdisc *sch) +{ + struct sk_buff *skb; + unsigned int len; + + skb = sch->ops->peek(sch); + if (skb == NULL) { + qdisc_warn_nonwc("qdisc_peek_len", sch); + return 0; + } + len = qdisc_pkt_len(skb); + + return len; +} + +static void +hfsc_purge_queue(struct Qdisc *sch, struct hfsc_class *cl) +{ + unsigned int len = cl->qdisc->q.qlen; + + qdisc_reset(cl->qdisc); + qdisc_tree_decrease_qlen(cl->qdisc, len); +} + +static void +hfsc_adjust_levels(struct hfsc_class *cl) +{ + struct hfsc_class *p; + unsigned int level; + + do { + level = 0; + list_for_each_entry(p, &cl->children, siblings) { + if (p->level >= level) + level = p->level + 1; + } + cl->level = level; + } while ((cl = cl->cl_parent) != NULL); +} + +static inline struct hfsc_class * +hfsc_find_class(u32 classid, struct Qdisc *sch) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct Qdisc_class_common *clc; + + clc = qdisc_class_find(&q->clhash, classid); + if (clc == NULL) + return NULL; + return container_of(clc, struct hfsc_class, cl_common); +} + +static void +hfsc_change_rsc(struct hfsc_class *cl, struct tc_service_curve *rsc, + u64 cur_time) +{ + sc2isc(rsc, &cl->cl_rsc); + rtsc_init(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul); + cl->cl_eligible = cl->cl_deadline; + if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) { + cl->cl_eligible.dx = 0; + cl->cl_eligible.dy = 0; + } + cl->cl_flags |= HFSC_RSC; +} + +static void +hfsc_change_fsc(struct hfsc_class *cl, struct tc_service_curve *fsc) +{ + sc2isc(fsc, &cl->cl_fsc); + rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total); + cl->cl_flags |= HFSC_FSC; +} + +static void +hfsc_change_usc(struct hfsc_class *cl, struct tc_service_curve *usc, + u64 cur_time) +{ + sc2isc(usc, &cl->cl_usc); + rtsc_init(&cl->cl_ulimit, &cl->cl_usc, cur_time, cl->cl_total); + cl->cl_flags |= HFSC_USC; +} + +static const struct nla_policy hfsc_policy[TCA_HFSC_MAX + 1] = { + [TCA_HFSC_RSC] = { .len = sizeof(struct tc_service_curve) }, + [TCA_HFSC_FSC] = { .len = sizeof(struct tc_service_curve) }, + [TCA_HFSC_USC] = { .len = sizeof(struct tc_service_curve) }, +}; + +static int +hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + struct nlattr **tca, unsigned long *arg) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl = (struct hfsc_class *)*arg; + struct hfsc_class *parent = NULL; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_HFSC_MAX + 1]; + struct tc_service_curve *rsc = NULL, *fsc = NULL, *usc = NULL; + u64 cur_time; + int err; + + if (opt == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_HFSC_MAX, opt, hfsc_policy); + if (err < 0) + return err; + + if (tb[TCA_HFSC_RSC]) { 
+ rsc = nla_data(tb[TCA_HFSC_RSC]); + if (rsc->m1 == 0 && rsc->m2 == 0) + rsc = NULL; + } + + if (tb[TCA_HFSC_FSC]) { + fsc = nla_data(tb[TCA_HFSC_FSC]); + if (fsc->m1 == 0 && fsc->m2 == 0) + fsc = NULL; + } + + if (tb[TCA_HFSC_USC]) { + usc = nla_data(tb[TCA_HFSC_USC]); + if (usc->m1 == 0 && usc->m2 == 0) + usc = NULL; + } + + if (cl != NULL) { + if (parentid) { + if (cl->cl_parent && + cl->cl_parent->cl_common.classid != parentid) + return -EINVAL; + if (cl->cl_parent == NULL && parentid != TC_H_ROOT) + return -EINVAL; + } + cur_time = psched_get_time(); + + if (tca[TCA_RATE]) { + err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) + return err; + } + + sch_tree_lock(sch); + if (rsc != NULL) + hfsc_change_rsc(cl, rsc, cur_time); + if (fsc != NULL) + hfsc_change_fsc(cl, fsc); + if (usc != NULL) + hfsc_change_usc(cl, usc, cur_time); + + if (cl->qdisc->q.qlen != 0) { + if (cl->cl_flags & HFSC_RSC) + update_ed(cl, qdisc_peek_len(cl->qdisc)); + if (cl->cl_flags & HFSC_FSC) + update_vf(cl, 0, cur_time); + } + sch_tree_unlock(sch); + + return 0; + } + + if (parentid == TC_H_ROOT) + return -EEXIST; + + parent = &q->root; + if (parentid) { + parent = hfsc_find_class(parentid, sch); + if (parent == NULL) + return -ENOENT; + } + + if (classid == 0 || TC_H_MAJ(classid ^ sch->handle) != 0) + return -EINVAL; + if (hfsc_find_class(classid, sch)) + return -EEXIST; + + if (rsc == NULL && fsc == NULL) + return -EINVAL; + + cl = kzalloc(sizeof(struct hfsc_class), GFP_KERNEL); + if (cl == NULL) + return -ENOBUFS; + + if (tca[TCA_RATE]) { + err = gen_new_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) { + kfree(cl); + return err; + } + } + + if (rsc != NULL) + hfsc_change_rsc(cl, rsc, 0); + if (fsc != NULL) + hfsc_change_fsc(cl, fsc); + if (usc != NULL) + hfsc_change_usc(cl, usc, 0); + + cl->cl_common.classid = classid; + cl->refcnt = 1; + cl->sched = q; + cl->cl_parent = parent; + cl->qdisc = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, classid); + if (cl->qdisc == NULL) + cl->qdisc = &noop_qdisc; + INIT_LIST_HEAD(&cl->children); + cl->vt_tree = RB_ROOT; + cl->cf_tree = RB_ROOT; + + sch_tree_lock(sch); + qdisc_class_hash_insert(&q->clhash, &cl->cl_common); + list_add_tail(&cl->siblings, &parent->children); + if (parent->level == 0) + hfsc_purge_queue(sch, parent); + hfsc_adjust_levels(parent); + cl->cl_pcvtoff = parent->cl_cvtoff; + sch_tree_unlock(sch); + + qdisc_class_hash_grow(sch, &q->clhash); + + *arg = (unsigned long)cl; + return 0; +} + +static void +hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl) +{ + struct hfsc_sched *q = qdisc_priv(sch); + + tcf_destroy_chain(&cl->filter_list); + qdisc_destroy(cl->qdisc); + gen_kill_estimator(&cl->bstats, &cl->rate_est); + if (cl != &q->root) + kfree(cl); +} + +static int +hfsc_delete_class(struct Qdisc *sch, unsigned long arg) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl = (struct hfsc_class *)arg; + + if (cl->level > 0 || cl->filter_cnt > 0 || cl == &q->root) + return -EBUSY; + + sch_tree_lock(sch); + + list_del(&cl->siblings); + hfsc_adjust_levels(cl->cl_parent); + + hfsc_purge_queue(sch, cl); + qdisc_class_hash_remove(&q->clhash, &cl->cl_common); + + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). 
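For reference while reading hfsc_change_class() above: each of TCA_HFSC_RSC/FSC/USC carries a struct tc_service_curve, a two-piece curve given as (m1, d, m2). The userspace sketch below is illustrative only and not part of the patch; fill_sc() and the numbers are made up, and the rates are written as bytes per second, matching how m2sm() consumes them. It also mirrors the rules above that an all-zero curve counts as "not supplied" and that a new class needs at least one of the real-time and link-sharing curves.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* mirrors struct tc_service_curve from linux/pkt_sched.h */
struct tc_service_curve {
	uint32_t m1;	/* slope of the first segment */
	uint32_t d;	/* x-projection of the first segment, in us */
	uint32_t m2;	/* slope of the second segment */
};

/* hypothetical helper: "burst rate for at most burst_ms, then steady rate" */
static void fill_sc(struct tc_service_curve *sc,
		    uint32_t burst_bytes_per_s, uint32_t burst_ms,
		    uint32_t steady_bytes_per_s)
{
	sc->m1 = burst_bytes_per_s;
	sc->d  = burst_ms * 1000;	/* ms -> us */
	sc->m2 = steady_bytes_per_s;
}

int main(void)
{
	struct tc_service_curve rsc, fsc;

	/* real-time: 1 Mbit/s for 20 ms, then 500 kbit/s (bytes/s) */
	fill_sc(&rsc, 125000, 20, 62500);

	/* link-sharing: flat 500 kbit/s; m1/d left zero (no burst piece) */
	memset(&fsc, 0, sizeof(fsc));
	fsc.m2 = 62500;

	/* hfsc_change_class() would accept this pair: neither curve is
	 * all-zero, so both RSC and FSC are considered supplied */
	printf("rt: m1=%u d=%u us m2=%u; ls: m2=%u\n",
	       (unsigned)rsc.m1, (unsigned)rsc.d, (unsigned)rsc.m2,
	       (unsigned)fsc.m2);
	return 0;
}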
+ */ + + sch_tree_unlock(sch); + return 0; +} + +static struct hfsc_class * +hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *head, *cl; + struct tcf_result res; + struct tcf_proto *tcf; + int result; + + if (TC_H_MAJ(skb->priority ^ sch->handle) == 0 && + (cl = hfsc_find_class(skb->priority, sch)) != NULL) + if (cl->level == 0) + return cl; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + head = &q->root; + tcf = q->root.filter_list; + while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_QUEUED: + case TC_ACT_STOLEN: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return NULL; + } +#endif + cl = (struct hfsc_class *)res.class; + if (!cl) { + cl = hfsc_find_class(res.classid, sch); + if (!cl) + break; /* filter selected invalid classid */ + if (cl->level >= head->level) + break; /* filter may only point downwards */ + } + + if (cl->level == 0) + return cl; /* hit leaf class */ + + /* apply inner filter chain */ + tcf = cl->filter_list; + head = cl; + } + + /* classification failed, try default class */ + cl = hfsc_find_class(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch); + if (cl == NULL || cl->level > 0) + return NULL; + + return cl; +} + +static int +hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + + if (cl->level > 0) + return -EINVAL; + if (new == NULL) { + new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + cl->cl_common.classid); + if (new == NULL) + new = &noop_qdisc; + } + + sch_tree_lock(sch); + hfsc_purge_queue(sch, cl); + *old = cl->qdisc; + cl->qdisc = new; + sch_tree_unlock(sch); + return 0; +} + +static struct Qdisc * +hfsc_class_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + + if (cl->level == 0) + return cl->qdisc; + + return NULL; +} + +static void +hfsc_qlen_notify(struct Qdisc *sch, unsigned long arg) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + + if (cl->qdisc->q.qlen == 0) { + update_vf(cl, 0, 0); + set_passive(cl); + } +} + +static unsigned long +hfsc_get_class(struct Qdisc *sch, u32 classid) +{ + struct hfsc_class *cl = hfsc_find_class(classid, sch); + + if (cl != NULL) + cl->refcnt++; + + return (unsigned long)cl; +} + +static void +hfsc_put_class(struct Qdisc *sch, unsigned long arg) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + + if (--cl->refcnt == 0) + hfsc_destroy_class(sch, cl); +} + +static unsigned long +hfsc_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid) +{ + struct hfsc_class *p = (struct hfsc_class *)parent; + struct hfsc_class *cl = hfsc_find_class(classid, sch); + + if (cl != NULL) { + if (p != NULL && p->level <= cl->level) + return 0; + cl->filter_cnt++; + } + + return (unsigned long)cl; +} + +static void +hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + + cl->filter_cnt--; +} + +static struct tcf_proto ** +hfsc_tcf_chain(struct Qdisc *sch, unsigned long arg) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl = (struct hfsc_class *)arg; + + if (cl == NULL) + cl = &q->root; + + return &cl->filter_list; +} + +static int +hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc) +{ + struct tc_service_curve tsc; + + tsc.m1 = sm2m(sc->sm1); + tsc.d = dx2d(sc->dx); + tsc.m2 = sm2m(sc->sm2); 
+ NLA_PUT(skb, attr, sizeof(tsc), &tsc); + + return skb->len; + + nla_put_failure: + return -1; +} + +static int +hfsc_dump_curves(struct sk_buff *skb, struct hfsc_class *cl) +{ + if ((cl->cl_flags & HFSC_RSC) && + (hfsc_dump_sc(skb, TCA_HFSC_RSC, &cl->cl_rsc) < 0)) + goto nla_put_failure; + + if ((cl->cl_flags & HFSC_FSC) && + (hfsc_dump_sc(skb, TCA_HFSC_FSC, &cl->cl_fsc) < 0)) + goto nla_put_failure; + + if ((cl->cl_flags & HFSC_USC) && + (hfsc_dump_sc(skb, TCA_HFSC_USC, &cl->cl_usc) < 0)) + goto nla_put_failure; + + return skb->len; + + nla_put_failure: + return -1; +} + +static int +hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, + struct tcmsg *tcm) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + struct nlattr *nest; + + tcm->tcm_parent = cl->cl_parent ? cl->cl_parent->cl_common.classid : + TC_H_ROOT; + tcm->tcm_handle = cl->cl_common.classid; + if (cl->level == 0) + tcm->tcm_info = cl->qdisc->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + if (hfsc_dump_curves(skb, cl) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest); + return skb->len; + + nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int +hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, + struct gnet_dump *d) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + struct tc_hfsc_stats xstats; + + cl->qstats.qlen = cl->qdisc->q.qlen; + xstats.level = cl->level; + xstats.period = cl->cl_vtperiod; + xstats.work = cl->cl_total; + xstats.rtwork = cl->cl_cumul; + + if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || + gnet_stats_copy_queue(d, &cl->qstats) < 0) + return -1; + + return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); +} + + + +static void +hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hlist_node *n; + struct hfsc_class *cl; + unsigned int i; + + if (arg->stop) + return; + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, n, &q->clhash.hash[i], + cl_common.hnode) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, (unsigned long)cl, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } +} + +static void +hfsc_schedule_watchdog(struct Qdisc *sch) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl; + u64 next_time = 0; + + cl = eltree_get_minel(q); + if (cl) + next_time = cl->cl_e; + if (q->root.cl_cfmin != 0) { + if (next_time == 0 || next_time > q->root.cl_cfmin) + next_time = q->root.cl_cfmin; + } + WARN_ON(next_time == 0); + qdisc_watchdog_schedule(&q->watchdog, next_time); +} + +static int +hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct tc_hfsc_qopt *qopt; + int err; + + if (opt == NULL || nla_len(opt) < sizeof(*qopt)) + return -EINVAL; + qopt = nla_data(opt); + + q->defcls = qopt->defcls; + err = qdisc_class_hash_init(&q->clhash); + if (err < 0) + return err; + q->eligible = RB_ROOT; + INIT_LIST_HEAD(&q->droplist); + + q->root.cl_common.classid = sch->handle; + q->root.refcnt = 1; + q->root.sched = q; + q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + sch->handle); + if (q->root.qdisc == NULL) + q->root.qdisc = &noop_qdisc; + INIT_LIST_HEAD(&q->root.children); + q->root.vt_tree = RB_ROOT; + q->root.cf_tree = RB_ROOT; + + qdisc_class_hash_insert(&q->clhash, &q->root.cl_common); + 
qdisc_class_hash_grow(sch, &q->clhash); + + qdisc_watchdog_init(&q->watchdog, sch); + + return 0; +} + +static int +hfsc_change_qdisc(struct Qdisc *sch, struct nlattr *opt) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct tc_hfsc_qopt *qopt; + + if (opt == NULL || nla_len(opt) < sizeof(*qopt)) + return -EINVAL; + qopt = nla_data(opt); + + sch_tree_lock(sch); + q->defcls = qopt->defcls; + sch_tree_unlock(sch); + + return 0; +} + +static void +hfsc_reset_class(struct hfsc_class *cl) +{ + cl->cl_total = 0; + cl->cl_cumul = 0; + cl->cl_d = 0; + cl->cl_e = 0; + cl->cl_vt = 0; + cl->cl_vtadj = 0; + cl->cl_vtoff = 0; + cl->cl_cvtmin = 0; + cl->cl_cvtmax = 0; + cl->cl_cvtoff = 0; + cl->cl_pcvtoff = 0; + cl->cl_vtperiod = 0; + cl->cl_parentperiod = 0; + cl->cl_f = 0; + cl->cl_myf = 0; + cl->cl_myfadj = 0; + cl->cl_cfmin = 0; + cl->cl_nactive = 0; + + cl->vt_tree = RB_ROOT; + cl->cf_tree = RB_ROOT; + qdisc_reset(cl->qdisc); + + if (cl->cl_flags & HFSC_RSC) + rtsc_init(&cl->cl_deadline, &cl->cl_rsc, 0, 0); + if (cl->cl_flags & HFSC_FSC) + rtsc_init(&cl->cl_virtual, &cl->cl_fsc, 0, 0); + if (cl->cl_flags & HFSC_USC) + rtsc_init(&cl->cl_ulimit, &cl->cl_usc, 0, 0); +} + +static void +hfsc_reset_qdisc(struct Qdisc *sch) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl; + struct hlist_node *n; + unsigned int i; + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, n, &q->clhash.hash[i], cl_common.hnode) + hfsc_reset_class(cl); + } + q->eligible = RB_ROOT; + INIT_LIST_HEAD(&q->droplist); + qdisc_watchdog_cancel(&q->watchdog); + sch->q.qlen = 0; +} + +static void +hfsc_destroy_qdisc(struct Qdisc *sch) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hlist_node *n, *next; + struct hfsc_class *cl; + unsigned int i; + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, n, &q->clhash.hash[i], cl_common.hnode) + tcf_destroy_chain(&cl->filter_list); + } + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i], + cl_common.hnode) + hfsc_destroy_class(sch, cl); + } + qdisc_class_hash_destroy(&q->clhash); + qdisc_watchdog_cancel(&q->watchdog); +} + +static int +hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) +{ + struct hfsc_sched *q = qdisc_priv(sch); + unsigned char *b = skb_tail_pointer(skb); + struct tc_hfsc_qopt qopt; + + qopt.defcls = q->defcls; + NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); + return skb->len; + + nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static int +hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct hfsc_class *cl; + int uninitialized_var(err); + + cl = hfsc_classify(skb, sch, &err); + if (cl == NULL) { + if (err & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return err; + } + + err = qdisc_enqueue(skb, cl->qdisc); + if (unlikely(err != NET_XMIT_SUCCESS)) { + if (net_xmit_drop_count(err)) { + cl->qstats.drops++; + sch->qstats.drops++; + } + return err; + } + + if (cl->qdisc->q.qlen == 1) + set_active(cl, qdisc_pkt_len(skb)); + + bstats_update(&cl->bstats, skb); + sch->q.qlen++; + + return NET_XMIT_SUCCESS; +} + +static struct sk_buff * +hfsc_dequeue(struct Qdisc *sch) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl; + struct sk_buff *skb; + u64 cur_time; + unsigned int next_len; + int realtime = 0; + + if (sch->q.qlen == 0) + return NULL; + + cur_time = psched_get_time(); + + /* + * if there are eligible classes, use real-time criteria. 
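The selection that the remainder of hfsc_dequeue() performs below can be sketched in a few lines of plain C (illustrative only, not part of the patch): among the backlogged classes whose eligible time has arrived, take the smallest deadline; if none is eligible, fall back to the smallest virtual time. The toy_class structure and the numbers are invented, and the per-level rb-tree walk and the upper-limit/fit-time checks of the real code are left out.

#include <stdio.h>
#include <stdint.h>

/* toy stand-in for a backlogged leaf class */
struct toy_class {
	const char *name;
	uint64_t e;	/* eligible time (cl_e) */
	uint64_t d;	/* deadline      (cl_d) */
	uint64_t v;	/* virtual time  (cl_vt) */
};

/* same rule as hfsc_dequeue(): real-time first, else link-sharing */
static const struct toy_class *pick(const struct toy_class *c, int n,
				    uint64_t now)
{
	const struct toy_class *best = NULL;
	int i;

	/* eltree_get_mindl(): minimum deadline among eligible classes */
	for (i = 0; i < n; i++)
		if (c[i].e <= now && (!best || c[i].d < best->d))
			best = &c[i];
	if (best)
		return best;

	/* vttree_get_minvt(): fall back to the minimum virtual time */
	for (i = 0; i < n; i++)
		if (!best || c[i].v < best->v)
			best = &c[i];
	return best;
}

int main(void)
{
	struct toy_class cls[] = {
		{ "rt-a", 100, 300, 50 },
		{ "rt-b", 250, 280, 10 },
		{ "ls-c", 400, 900,  5 },
	};

	/* at now=120 only rt-a is eligible; at now=260 rt-b wins on deadline */
	printf("now=120 -> %s\n", pick(cls, 3, 120)->name);
	printf("now=260 -> %s\n", pick(cls, 3, 260)->name);
	return 0;
}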
+ * find the class with the minimum deadline among + * the eligible classes. + */ + cl = eltree_get_mindl(q, cur_time); + if (cl) { + realtime = 1; + } else { + /* + * use link-sharing criteria + * get the class with the minimum vt in the hierarchy + */ + cl = vttree_get_minvt(&q->root, cur_time); + if (cl == NULL) { + sch->qstats.overlimits++; + hfsc_schedule_watchdog(sch); + return NULL; + } + } + + skb = qdisc_dequeue_peeked(cl->qdisc); + if (skb == NULL) { + qdisc_warn_nonwc("HFSC", cl->qdisc); + return NULL; + } + + update_vf(cl, qdisc_pkt_len(skb), cur_time); + if (realtime) + cl->cl_cumul += qdisc_pkt_len(skb); + + if (cl->qdisc->q.qlen != 0) { + if (cl->cl_flags & HFSC_RSC) { + /* update ed */ + next_len = qdisc_peek_len(cl->qdisc); + if (realtime) + update_ed(cl, next_len); + else + update_d(cl, next_len); + } + } else { + /* the class becomes passive */ + set_passive(cl); + } + + qdisc_unthrottled(sch); + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + + return skb; +} + +static unsigned int +hfsc_drop(struct Qdisc *sch) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl; + unsigned int len; + + list_for_each_entry(cl, &q->droplist, dlist) { + if (cl->qdisc->ops->drop != NULL && + (len = cl->qdisc->ops->drop(cl->qdisc)) > 0) { + if (cl->qdisc->q.qlen == 0) { + update_vf(cl, 0, 0); + set_passive(cl); + } else { + list_move_tail(&cl->dlist, &q->droplist); + } + cl->qstats.drops++; + sch->qstats.drops++; + sch->q.qlen--; + return len; + } + } + return 0; +} + +static const struct Qdisc_class_ops hfsc_class_ops = { + .change = hfsc_change_class, + .delete = hfsc_delete_class, + .graft = hfsc_graft_class, + .leaf = hfsc_class_leaf, + .qlen_notify = hfsc_qlen_notify, + .get = hfsc_get_class, + .put = hfsc_put_class, + .bind_tcf = hfsc_bind_tcf, + .unbind_tcf = hfsc_unbind_tcf, + .tcf_chain = hfsc_tcf_chain, + .dump = hfsc_dump_class, + .dump_stats = hfsc_dump_class_stats, + .walk = hfsc_walk +}; + +static struct Qdisc_ops hfsc_qdisc_ops __read_mostly = { + .id = "hfsc", + .init = hfsc_init_qdisc, + .change = hfsc_change_qdisc, + .reset = hfsc_reset_qdisc, + .destroy = hfsc_destroy_qdisc, + .dump = hfsc_dump_qdisc, + .enqueue = hfsc_enqueue, + .dequeue = hfsc_dequeue, + .peek = qdisc_peek_dequeued, + .drop = hfsc_drop, + .cl_ops = &hfsc_class_ops, + .priv_size = sizeof(struct hfsc_sched), + .owner = THIS_MODULE +}; + +static int __init +hfsc_init(void) +{ + return register_qdisc(&hfsc_qdisc_ops); +} + +static void __exit +hfsc_cleanup(void) +{ + unregister_qdisc(&hfsc_qdisc_ops); +} + +MODULE_LICENSE("GPL"); +module_init(hfsc_init); +module_exit(hfsc_cleanup); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c new file mode 100644 index 00000000..29b942ce --- /dev/null +++ b/net/sched/sch_htb.c @@ -0,0 +1,1587 @@ +/* + * net/sched/sch_htb.c Hierarchical token bucket, feed tree version + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ *
+ * Authors:	Martin Devera,
+ *
+ * Credits (in time order) for older HTB versions:
+ *		Stef Coene
+ *			HTB support at LARTC mailing list
+ *		Ondrej Kraus,
+ *			found missing INIT_QDISC(htb)
+ *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
+ *			helped a lot to locate nasty class stall bug
+ *		Andi Kleen, Jamal Hadi, Bert Hubert
+ *			code review and helpful comments on shaping
+ *		Tomasz Wrona,
+ *			created test case so that I was able to fix nasty bug
+ *		Wilfried Weissmann
+ *			spotted bug in dequeue code and helped with fix
+ *		Jiri Fojtasek
+ *			fixed requeue routine
+ *		and many others. thanks.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/compiler.h>
+#include <linux/rbtree.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+/* HTB algorithm.
+    Author: devik@cdi.cz
+    ========================================================================
+    HTB is like TBF with multiple classes. It is also similar to CBQ because
+    it allows assigning a priority to each class in the hierarchy.
+    In fact it is another implementation of Floyd's formal sharing.
+
+    Levels:
+    Each class is assigned a level. A leaf ALWAYS has level 0 and root
+    classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
+    one less than their parent.
+*/
+
+static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
+#define HTB_VER 0x30011		/* major must be matched with number supplied by TC as version */
+
+#if HTB_VER >> 16 != TC_HTB_PROTOVER
+#error "Mismatched sch_htb.c and pkt_sch.h"
+#endif
+
+/* Module parameter and sysfs export */
+module_param (htb_hysteresis, int, 0640);
+MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
+
+/* used internally to keep status of single class */
+enum htb_cmode {
+	HTB_CANT_SEND,		/* class can't send and can't borrow */
+	HTB_MAY_BORROW,		/* class can't send but may borrow */
+	HTB_CAN_SEND		/* class can send */
+};
+
+/* interior & leaf nodes; props specific to leaves are marked L: */
+struct htb_class {
+	struct Qdisc_class_common common;
+	/* general class parameters */
+	struct gnet_stats_basic_packed bstats;
+	struct gnet_stats_queue qstats;
+	struct gnet_stats_rate_est rate_est;
+	struct tc_htb_xstats xstats;	/* our special stats */
+	int refcnt;		/* usage count of this class */
+
+	/* topology */
+	int level;		/* our level (see above) */
+	unsigned int children;
+	struct htb_class *parent;	/* parent class */
+
+	int prio;		/* these two are used only by leaves... */
+	int quantum;		/* but stored for parent-to-leaf return */
+
+	union {
+		struct htb_class_leaf {
+			struct Qdisc *q;
+			int deficit[TC_HTB_MAXDEPTH];
+			struct list_head drop_list;
+		} leaf;
+		struct htb_class_inner {
+			struct rb_root feed[TC_HTB_NUMPRIO];	/* feed trees */
+			struct rb_node *ptr[TC_HTB_NUMPRIO];	/* current class ptr */
+			/* When class changes from state 1->2 and disconnects from
+			 * parent's feed then we lose the ptr value and start from the
+			 * first child again. Here we store classid of the
+			 * last valid ptr (used when ptr is NULL).
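The last_ptr_id recovery described in the comment above, implemented later in the file by htb_id_find_next_upper(), amounts to "resume the round at the first remaining class whose classid is >= the remembered one". A toy userspace sketch (not part of the patch) with a sorted array standing in for the rb-tree; find_next_upper() and the classids are made up for illustration.

#include <stdio.h>
#include <stdint.h>

/* first index whose classid is >= the saved one, or -1 if none remains */
static int find_next_upper(const uint32_t *ids, int n, uint32_t saved)
{
	int i;

	for (i = 0; i < n; i++)
		if (ids[i] >= saved)
			return i;
	return -1;	/* the real code rewinds to the leftmost node */
}

int main(void)
{
	/* classids still present in the feed tree, in classid order */
	uint32_t ids[] = { 0x10001, 0x10003, 0x10007 };

	/* class 0x10005 left the tree while it was the current pointer:
	 * the round continues at 0x10007 instead of restarting */
	int i = find_next_upper(ids, 3, 0x10005);

	printf("resume at %#x\n", (unsigned)(i < 0 ? ids[0] : ids[i]));
	return 0;
}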
+ */ + u32 last_ptr_id[TC_HTB_NUMPRIO]; + } inner; + } un; + struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */ + struct rb_node pq_node; /* node for event queue */ + psched_time_t pq_key; + + int prio_activity; /* for which prios are we active */ + enum htb_cmode cmode; /* current mode of the class */ + + /* class attached filters */ + struct tcf_proto *filter_list; + int filter_cnt; + + /* token bucket parameters */ + struct qdisc_rate_table *rate; /* rate table of the class itself */ + struct qdisc_rate_table *ceil; /* ceiling rate (limits borrows too) */ + long buffer, cbuffer; /* token bucket depth/rate */ + psched_tdiff_t mbuffer; /* max wait time */ + long tokens, ctokens; /* current number of tokens */ + psched_time_t t_c; /* checkpoint time */ +}; + +struct htb_sched { + struct Qdisc_class_hash clhash; + struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */ + + /* self list - roots of self generating tree */ + struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; + int row_mask[TC_HTB_MAXDEPTH]; + struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; + u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; + + /* self wait list - roots of wait PQs per row */ + struct rb_root wait_pq[TC_HTB_MAXDEPTH]; + + /* time of nearest event per level (row) */ + psched_time_t near_ev_cache[TC_HTB_MAXDEPTH]; + + int defcls; /* class where unclassified flows go to */ + + /* filters for qdisc itself */ + struct tcf_proto *filter_list; + + int rate2quantum; /* quant = rate / rate2quantum */ + psched_time_t now; /* cached dequeue time */ + struct qdisc_watchdog watchdog; + + /* non shaped skbs; let them go directly thru */ + struct sk_buff_head direct_queue; + int direct_qlen; /* max qlen of above */ + + long direct_pkts; + +#define HTB_WARN_TOOMANYEVENTS 0x1 + unsigned int warned; /* only one warning */ + struct work_struct work; +}; + +/* find class in global hash table using given handle */ +static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch) +{ + struct htb_sched *q = qdisc_priv(sch); + struct Qdisc_class_common *clc; + + clc = qdisc_class_find(&q->clhash, handle); + if (clc == NULL) + return NULL; + return container_of(clc, struct htb_class, common); +} + +/** + * htb_classify - classify a packet into class + * + * It returns NULL if the packet should be dropped or -1 if the packet + * should be passed directly thru. In all other cases leaf class is returned. + * We allow direct class selection by classid in priority. The we examine + * filters in qdisc and in inner nodes (if higher filter points to the inner + * node). If we end up with classid MAJOR:0 we enqueue the skb into special + * internal fifo (direct). These packets then go directly thru. If we still + * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessful + * then finish and return direct queue. 
+ */ +#define HTB_DIRECT ((struct htb_class *)-1L) + +static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct htb_sched *q = qdisc_priv(sch); + struct htb_class *cl; + struct tcf_result res; + struct tcf_proto *tcf; + int result; + + /* allow to select class by setting skb->priority to valid classid; + * note that nfmark can be used too by attaching filter fw with no + * rules in it + */ + if (skb->priority == sch->handle) + return HTB_DIRECT; /* X:0 (direct flow) selected */ + cl = htb_find(skb->priority, sch); + if (cl && cl->level == 0) + return cl; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + tcf = q->filter_list; + while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_QUEUED: + case TC_ACT_STOLEN: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return NULL; + } +#endif + cl = (void *)res.class; + if (!cl) { + if (res.classid == sch->handle) + return HTB_DIRECT; /* X:0 (direct flow) */ + cl = htb_find(res.classid, sch); + if (!cl) + break; /* filter selected invalid classid */ + } + if (!cl->level) + return cl; /* we hit leaf; return it */ + + /* we have got inner class; apply inner filter chain */ + tcf = cl->filter_list; + } + /* classification failed; try to use default class */ + cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch); + if (!cl || cl->level) + return HTB_DIRECT; /* bad default .. this is safe bet */ + return cl; +} + +/** + * htb_add_to_id_tree - adds class to the round robin list + * + * Routine adds class to the list (actually tree) sorted by classid. + * Make sure that class is not already on such list for given prio. + */ +static void htb_add_to_id_tree(struct rb_root *root, + struct htb_class *cl, int prio) +{ + struct rb_node **p = &root->rb_node, *parent = NULL; + + while (*p) { + struct htb_class *c; + parent = *p; + c = rb_entry(parent, struct htb_class, node[prio]); + + if (cl->common.classid > c->common.classid) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&cl->node[prio], parent, p); + rb_insert_color(&cl->node[prio], root); +} + +/** + * htb_add_to_wait_tree - adds class to the event queue with delay + * + * The class is added to priority event queue to indicate that class will + * change its mode in cl->pq_key microseconds. Make sure that class is not + * already in the queue. + */ +static void htb_add_to_wait_tree(struct htb_sched *q, + struct htb_class *cl, long delay) +{ + struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL; + + cl->pq_key = q->now + delay; + if (cl->pq_key == q->now) + cl->pq_key++; + + /* update the nearest event cache */ + if (q->near_ev_cache[cl->level] > cl->pq_key) + q->near_ev_cache[cl->level] = cl->pq_key; + + while (*p) { + struct htb_class *c; + parent = *p; + c = rb_entry(parent, struct htb_class, pq_node); + if (cl->pq_key >= c->pq_key) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&cl->pq_node, parent, p); + rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]); +} + +/** + * htb_next_rb_node - finds next node in binary tree + * + * When we are past last key we return NULL. + * Average complexity is 2 steps per call. + */ +static inline void htb_next_rb_node(struct rb_node **n) +{ + *n = rb_next(*n); +} + +/** + * htb_add_class_to_row - add class to its row + * + * The class is added to row at priorities marked in mask. + * It does nothing if mask == 0. 
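The mask loops used by htb_add_class_to_row() just below, and likewise by htb_remove_class_from_row() and htb_activate_prios(), simply enumerate the set priority bits from lowest to highest; ffz(~mask) is the index of the lowest set bit. A minimal userspace equivalent (illustrative only), using the GCC/Clang __builtin_ctz() builtin in its place:

#include <stdio.h>

int main(void)
{
	unsigned int mask = 0x05;	/* class active on prio 0 and prio 2 */

	while (mask) {
		int prio = __builtin_ctz(mask);	/* lowest set bit, like ffz(~mask) */

		mask &= ~(1u << prio);
		printf("add to row, prio %d\n", prio);	/* prints prio 0, then 2 */
	}
	return 0;
}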
+ */ +static inline void htb_add_class_to_row(struct htb_sched *q, + struct htb_class *cl, int mask) +{ + q->row_mask[cl->level] |= mask; + while (mask) { + int prio = ffz(~mask); + mask &= ~(1 << prio); + htb_add_to_id_tree(q->row[cl->level] + prio, cl, prio); + } +} + +/* If this triggers, it is a bug in this code, but it need not be fatal */ +static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root) +{ + if (RB_EMPTY_NODE(rb)) { + WARN_ON(1); + } else { + rb_erase(rb, root); + RB_CLEAR_NODE(rb); + } +} + + +/** + * htb_remove_class_from_row - removes class from its row + * + * The class is removed from row at priorities marked in mask. + * It does nothing if mask == 0. + */ +static inline void htb_remove_class_from_row(struct htb_sched *q, + struct htb_class *cl, int mask) +{ + int m = 0; + + while (mask) { + int prio = ffz(~mask); + + mask &= ~(1 << prio); + if (q->ptr[cl->level][prio] == cl->node + prio) + htb_next_rb_node(q->ptr[cl->level] + prio); + + htb_safe_rb_erase(cl->node + prio, q->row[cl->level] + prio); + if (!q->row[cl->level][prio].rb_node) + m |= 1 << prio; + } + q->row_mask[cl->level] &= ~m; +} + +/** + * htb_activate_prios - creates active classe's feed chain + * + * The class is connected to ancestors and/or appropriate rows + * for priorities it is participating on. cl->cmode must be new + * (activated) mode. It does nothing if cl->prio_activity == 0. + */ +static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl) +{ + struct htb_class *p = cl->parent; + long m, mask = cl->prio_activity; + + while (cl->cmode == HTB_MAY_BORROW && p && mask) { + m = mask; + while (m) { + int prio = ffz(~m); + m &= ~(1 << prio); + + if (p->un.inner.feed[prio].rb_node) + /* parent already has its feed in use so that + * reset bit in mask as parent is already ok + */ + mask &= ~(1 << prio); + + htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio); + } + p->prio_activity |= mask; + cl = p; + p = cl->parent; + + } + if (cl->cmode == HTB_CAN_SEND && mask) + htb_add_class_to_row(q, cl, mask); +} + +/** + * htb_deactivate_prios - remove class from feed chain + * + * cl->cmode must represent old mode (before deactivation). It does + * nothing if cl->prio_activity == 0. Class is removed from all feed + * chains and rows. + */ +static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl) +{ + struct htb_class *p = cl->parent; + long m, mask = cl->prio_activity; + + while (cl->cmode == HTB_MAY_BORROW && p && mask) { + m = mask; + mask = 0; + while (m) { + int prio = ffz(~m); + m &= ~(1 << prio); + + if (p->un.inner.ptr[prio] == cl->node + prio) { + /* we are removing child which is pointed to from + * parent feed - forget the pointer but remember + * classid + */ + p->un.inner.last_ptr_id[prio] = cl->common.classid; + p->un.inner.ptr[prio] = NULL; + } + + htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio); + + if (!p->un.inner.feed[prio].rb_node) + mask |= 1 << prio; + } + + p->prio_activity &= ~mask; + cl = p; + p = cl->parent; + + } + if (cl->cmode == HTB_CAN_SEND && mask) + htb_remove_class_from_row(q, cl, mask); +} + +static inline long htb_lowater(const struct htb_class *cl) +{ + if (htb_hysteresis) + return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0; + else + return 0; +} +static inline long htb_hiwater(const struct htb_class *cl) +{ + if (htb_hysteresis) + return cl->cmode == HTB_CAN_SEND ? 
-cl->buffer : 0; + else + return 0; +} + + +/** + * htb_class_mode - computes and returns current class mode + * + * It computes cl's mode at time cl->t_c+diff and returns it. If mode + * is not HTB_CAN_SEND then cl->pq_key is updated to time difference + * from now to time when cl will change its state. + * Also it is worth to note that class mode doesn't change simply + * at cl->{c,}tokens == 0 but there can rather be hysteresis of + * 0 .. -cl->{c,}buffer range. It is meant to limit number of + * mode transitions per time unit. The speed gain is about 1/6. + */ +static inline enum htb_cmode +htb_class_mode(struct htb_class *cl, long *diff) +{ + long toks; + + if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) { + *diff = -toks; + return HTB_CANT_SEND; + } + + if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl)) + return HTB_CAN_SEND; + + *diff = -toks; + return HTB_MAY_BORROW; +} + +/** + * htb_change_class_mode - changes classe's mode + * + * This should be the only way how to change classe's mode under normal + * cirsumstances. Routine will update feed lists linkage, change mode + * and add class to the wait event queue if appropriate. New mode should + * be different from old one and cl->pq_key has to be valid if changing + * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree). + */ +static void +htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff) +{ + enum htb_cmode new_mode = htb_class_mode(cl, diff); + + if (new_mode == cl->cmode) + return; + + if (cl->prio_activity) { /* not necessary: speed optimization */ + if (cl->cmode != HTB_CANT_SEND) + htb_deactivate_prios(q, cl); + cl->cmode = new_mode; + if (new_mode != HTB_CANT_SEND) + htb_activate_prios(q, cl); + } else + cl->cmode = new_mode; +} + +/** + * htb_activate - inserts leaf cl into appropriate active feeds + * + * Routine learns (new) priority of leaf and activates feed chain + * for the prio. It can be called on already active leaf safely. + * It also adds leaf into droplist. + */ +static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) +{ + WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen); + + if (!cl->prio_activity) { + cl->prio_activity = 1 << cl->prio; + htb_activate_prios(q, cl); + list_add_tail(&cl->un.leaf.drop_list, + q->drops + cl->prio); + } +} + +/** + * htb_deactivate - remove leaf cl from active feeds + * + * Make sure that leaf is active. In the other words it can't be called + * with non-active leaf. It also removes class from the drop list. 
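A condensed userspace rendering of the decision in htb_class_mode() above, with hysteresis left at its default of 0 so both water marks are zero (illustrative only; the token values are invented and stand for cl->tokens/cl->ctokens in scheduler ticks):

#include <stdio.h>

enum cmode { CANT_SEND, MAY_BORROW, CAN_SEND };

/* ceil bucket overdrawn -> CANT_SEND (must wait);
 * rate bucket non-negative -> CAN_SEND (may send at its own rate);
 * otherwise -> MAY_BORROW (ask the ancestors for spare rate). */
static enum cmode class_mode(long tokens, long ctokens, long *diff)
{
	long toks;

	if ((toks = ctokens + *diff) < 0) {
		*diff = -toks;		/* time until the ceil bucket refills */
		return CANT_SEND;
	}
	if ((toks = tokens + *diff) >= 0)
		return CAN_SEND;

	*diff = -toks;			/* time until the rate bucket refills */
	return MAY_BORROW;
}

int main(void)
{
	static const char *name[] = { "CANT_SEND", "MAY_BORROW", "CAN_SEND" };
	long diff;

	diff = 0;	/* rate bucket empty, ceil bucket still positive */
	printf("%s\n", name[class_mode(-5000, 20000, &diff)]);	/* MAY_BORROW */

	diff = 0;	/* both buckets overdrawn */
	printf("%s\n", name[class_mode(-5000, -3000, &diff)]);	/* CANT_SEND */
	return 0;
}

With htb_hysteresis set to 1 the comparison points shift by the buffer sizes (see htb_lowater()/htb_hiwater() above), which reduces mode flapping at a small cost in accuracy.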
+ */ +static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) +{ + WARN_ON(!cl->prio_activity); + + htb_deactivate_prios(q, cl); + cl->prio_activity = 0; + list_del_init(&cl->un.leaf.drop_list); +} + +static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + int uninitialized_var(ret); + struct htb_sched *q = qdisc_priv(sch); + struct htb_class *cl = htb_classify(skb, sch, &ret); + + if (cl == HTB_DIRECT) { + /* enqueue to helper queue */ + if (q->direct_queue.qlen < q->direct_qlen) { + __skb_queue_tail(&q->direct_queue, skb); + q->direct_pkts++; + } else { + kfree_skb(skb); + sch->qstats.drops++; + return NET_XMIT_DROP; + } +#ifdef CONFIG_NET_CLS_ACT + } else if (!cl) { + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; +#endif + } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(ret)) { + sch->qstats.drops++; + cl->qstats.drops++; + } + return ret; + } else { + bstats_update(&cl->bstats, skb); + htb_activate(q, cl); + } + + sch->q.qlen++; + return NET_XMIT_SUCCESS; +} + +static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, long diff) +{ + long toks = diff + cl->tokens; + + if (toks > cl->buffer) + toks = cl->buffer; + toks -= (long) qdisc_l2t(cl->rate, bytes); + if (toks <= -cl->mbuffer) + toks = 1 - cl->mbuffer; + + cl->tokens = toks; +} + +static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, long diff) +{ + long toks = diff + cl->ctokens; + + if (toks > cl->cbuffer) + toks = cl->cbuffer; + toks -= (long) qdisc_l2t(cl->ceil, bytes); + if (toks <= -cl->mbuffer) + toks = 1 - cl->mbuffer; + + cl->ctokens = toks; +} + +/** + * htb_charge_class - charges amount "bytes" to leaf and ancestors + * + * Routine assumes that packet "bytes" long was dequeued from leaf cl + * borrowing from "level". It accounts bytes to ceil leaky bucket for + * leaf and all ancestors and to rate bucket for ancestors at levels + * "level" and higher. It also handles possible change of mode resulting + * from the update. Note that mode can also increase here (MAY_BORROW to + * CAN_SEND) because we can use more precise clock that event queue here. + * In such case we remove class from event queue first. + */ +static void htb_charge_class(struct htb_sched *q, struct htb_class *cl, + int level, struct sk_buff *skb) +{ + int bytes = qdisc_pkt_len(skb); + enum htb_cmode old_mode; + long diff; + + while (cl) { + diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer); + if (cl->level >= level) { + if (cl->level == level) + cl->xstats.lends++; + htb_accnt_tokens(cl, bytes, diff); + } else { + cl->xstats.borrows++; + cl->tokens += diff; /* we moved t_c; update tokens */ + } + htb_accnt_ctokens(cl, bytes, diff); + cl->t_c = q->now; + + old_mode = cl->cmode; + diff = 0; + htb_change_class_mode(q, cl, &diff); + if (old_mode != cl->cmode) { + if (old_mode != HTB_CAN_SEND) + htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level); + if (cl->cmode != HTB_CAN_SEND) + htb_add_to_wait_tree(q, cl, diff); + } + + /* update basic stats except for leaves which are already updated */ + if (cl->level) + bstats_update(&cl->bstats, skb); + + cl = cl->parent; + } +} + +/** + * htb_do_events - make mode changes to classes at the level + * + * Scans event queue for pending events and applies them. Returns time of + * next pending event (0 for no event in pq, q->now for too many events). + * Note: Applied are events whose have cl->pq_key <= q->now. 
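The bucket update in htb_accnt_tokens()/htb_accnt_ctokens() above boils down to: add the time that has passed, cap the result at the bucket size, subtract the cost of the packet, and never let the bucket go further negative than mbuffer. A standalone sketch, illustrative only: account(), the tick values, and the "cost" argument standing in for qdisc_l2t() are made up.

#include <stdio.h>

static long account(long tokens, long diff, long buffer, long cost,
		    long mbuffer)
{
	long toks = diff + tokens;

	if (toks > buffer)
		toks = buffer;		/* the bucket never overflows */
	toks -= cost;			/* charge the packet just sent */
	if (toks <= -mbuffer)
		toks = 1 - mbuffer;	/* and never goes too far into debt */
	return toks;
}

int main(void)
{
	/* 1600-tick bucket, 500 ticks elapsed, packet costs 1200 ticks */
	printf("tokens: %ld\n", account(800, 500, 1600, 1200, 60000));

	/* long idle period: the credit is clamped to the bucket size first */
	printf("tokens: %ld\n", account(800, 1000000, 1600, 1200, 60000));
	return 0;
}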
+ */ +static psched_time_t htb_do_events(struct htb_sched *q, int level, + unsigned long start) +{ + /* don't run for longer than 2 jiffies; 2 is used instead of + * 1 to simplify things when jiffy is going to be incremented + * too soon + */ + unsigned long stop_at = start + 2; + while (time_before(jiffies, stop_at)) { + struct htb_class *cl; + long diff; + struct rb_node *p = rb_first(&q->wait_pq[level]); + + if (!p) + return 0; + + cl = rb_entry(p, struct htb_class, pq_node); + if (cl->pq_key > q->now) + return cl->pq_key; + + htb_safe_rb_erase(p, q->wait_pq + level); + diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer); + htb_change_class_mode(q, cl, &diff); + if (cl->cmode != HTB_CAN_SEND) + htb_add_to_wait_tree(q, cl, diff); + } + + /* too much load - let's continue after a break for scheduling */ + if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) { + pr_warning("htb: too many events!\n"); + q->warned |= HTB_WARN_TOOMANYEVENTS; + } + + return q->now; +} + +/* Returns class->node+prio from id-tree where classe's id is >= id. NULL + * is no such one exists. + */ +static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n, + u32 id) +{ + struct rb_node *r = NULL; + while (n) { + struct htb_class *cl = + rb_entry(n, struct htb_class, node[prio]); + + if (id > cl->common.classid) { + n = n->rb_right; + } else if (id < cl->common.classid) { + r = n; + n = n->rb_left; + } else { + return n; + } + } + return r; +} + +/** + * htb_lookup_leaf - returns next leaf class in DRR order + * + * Find leaf where current feed pointers points to. + */ +static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio, + struct rb_node **pptr, u32 * pid) +{ + int i; + struct { + struct rb_node *root; + struct rb_node **pptr; + u32 *pid; + } stk[TC_HTB_MAXDEPTH], *sp = stk; + + BUG_ON(!tree->rb_node); + sp->root = tree->rb_node; + sp->pptr = pptr; + sp->pid = pid; + + for (i = 0; i < 65535; i++) { + if (!*sp->pptr && *sp->pid) { + /* ptr was invalidated but id is valid - try to recover + * the original or next ptr + */ + *sp->pptr = + htb_id_find_next_upper(prio, sp->root, *sp->pid); + } + *sp->pid = 0; /* ptr is valid now so that remove this hint as it + * can become out of date quickly + */ + if (!*sp->pptr) { /* we are at right end; rewind & go up */ + *sp->pptr = sp->root; + while ((*sp->pptr)->rb_left) + *sp->pptr = (*sp->pptr)->rb_left; + if (sp > stk) { + sp--; + if (!*sp->pptr) { + WARN_ON(1); + return NULL; + } + htb_next_rb_node(sp->pptr); + } + } else { + struct htb_class *cl; + cl = rb_entry(*sp->pptr, struct htb_class, node[prio]); + if (!cl->level) + return cl; + (++sp)->root = cl->un.inner.feed[prio].rb_node; + sp->pptr = cl->un.inner.ptr + prio; + sp->pid = cl->un.inner.last_ptr_id + prio; + } + } + WARN_ON(1); + return NULL; +} + +/* dequeues packet at given priority and level; call only if + * you are sure that there is active class at prio/level + */ +static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio, + int level) +{ + struct sk_buff *skb = NULL; + struct htb_class *cl, *start; + /* look initial class up in the row */ + start = cl = htb_lookup_leaf(q->row[level] + prio, prio, + q->ptr[level] + prio, + q->last_ptr_id[level] + prio); + + do { +next: + if (unlikely(!cl)) + return NULL; + + /* class can be empty - it is unlikely but can be true if leaf + * qdisc drops packets in enqueue routine or if someone used + * graft operation on the leaf since last dequeue; + * simply deactivate and skip such class + */ + if (unlikely(cl->un.leaf.q->q.qlen == 
0)) { + struct htb_class *next; + htb_deactivate(q, cl); + + /* row/level might become empty */ + if ((q->row_mask[level] & (1 << prio)) == 0) + return NULL; + + next = htb_lookup_leaf(q->row[level] + prio, + prio, q->ptr[level] + prio, + q->last_ptr_id[level] + prio); + + if (cl == start) /* fix start if we just deleted it */ + start = next; + cl = next; + goto next; + } + + skb = cl->un.leaf.q->dequeue(cl->un.leaf.q); + if (likely(skb != NULL)) + break; + + qdisc_warn_nonwc("htb", cl->un.leaf.q); + htb_next_rb_node((level ? cl->parent->un.inner.ptr : q-> + ptr[0]) + prio); + cl = htb_lookup_leaf(q->row[level] + prio, prio, + q->ptr[level] + prio, + q->last_ptr_id[level] + prio); + + } while (cl != start); + + if (likely(skb != NULL)) { + cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb); + if (cl->un.leaf.deficit[level] < 0) { + cl->un.leaf.deficit[level] += cl->quantum; + htb_next_rb_node((level ? cl->parent->un.inner.ptr : q-> + ptr[0]) + prio); + } + /* this used to be after charge_class but this constelation + * gives us slightly better performance + */ + if (!cl->un.leaf.q->q.qlen) + htb_deactivate(q, cl); + htb_charge_class(q, cl, level, skb); + } + return skb; +} + +static struct sk_buff *htb_dequeue(struct Qdisc *sch) +{ + struct sk_buff *skb; + struct htb_sched *q = qdisc_priv(sch); + int level; + psched_time_t next_event; + unsigned long start_at; + + /* try to dequeue direct packets as high prio (!) to minimize cpu work */ + skb = __skb_dequeue(&q->direct_queue); + if (skb != NULL) { +ok: + qdisc_bstats_update(sch, skb); + qdisc_unthrottled(sch); + sch->q.qlen--; + return skb; + } + + if (!sch->q.qlen) + goto fin; + q->now = psched_get_time(); + start_at = jiffies; + + next_event = q->now + 5 * PSCHED_TICKS_PER_SEC; + + for (level = 0; level < TC_HTB_MAXDEPTH; level++) { + /* common case optimization - skip event handler quickly */ + int m; + psched_time_t event; + + if (q->now >= q->near_ev_cache[level]) { + event = htb_do_events(q, level, start_at); + if (!event) + event = q->now + PSCHED_TICKS_PER_SEC; + q->near_ev_cache[level] = event; + } else + event = q->near_ev_cache[level]; + + if (next_event > event) + next_event = event; + + m = ~q->row_mask[level]; + while (m != (int)(-1)) { + int prio = ffz(m); + + m |= 1 << prio; + skb = htb_dequeue_tree(q, prio, level); + if (likely(skb != NULL)) + goto ok; + } + } + sch->qstats.overlimits++; + if (likely(next_event > q->now)) + qdisc_watchdog_schedule(&q->watchdog, next_event); + else + schedule_work(&q->work); +fin: + return skb; +} + +/* try to drop from each class (by prio) until one succeed */ +static unsigned int htb_drop(struct Qdisc *sch) +{ + struct htb_sched *q = qdisc_priv(sch); + int prio; + + for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) { + struct list_head *p; + list_for_each(p, q->drops + prio) { + struct htb_class *cl = list_entry(p, struct htb_class, + un.leaf.drop_list); + unsigned int len; + if (cl->un.leaf.q->ops->drop && + (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) { + sch->q.qlen--; + if (!cl->un.leaf.q->q.qlen) + htb_deactivate(q, cl); + return len; + } + } + } + return 0; +} + +/* reset all classes */ +/* always caled under BH & queue lock */ +static void htb_reset(struct Qdisc *sch) +{ + struct htb_sched *q = qdisc_priv(sch); + struct htb_class *cl; + struct hlist_node *n; + unsigned int i; + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { + if (cl->level) + memset(&cl->un.inner, 0, sizeof(cl->un.inner)); + else { + if 
(cl->un.leaf.q) + qdisc_reset(cl->un.leaf.q); + INIT_LIST_HEAD(&cl->un.leaf.drop_list); + } + cl->prio_activity = 0; + cl->cmode = HTB_CAN_SEND; + + } + } + qdisc_watchdog_cancel(&q->watchdog); + __skb_queue_purge(&q->direct_queue); + sch->q.qlen = 0; + memset(q->row, 0, sizeof(q->row)); + memset(q->row_mask, 0, sizeof(q->row_mask)); + memset(q->wait_pq, 0, sizeof(q->wait_pq)); + memset(q->ptr, 0, sizeof(q->ptr)); + for (i = 0; i < TC_HTB_NUMPRIO; i++) + INIT_LIST_HEAD(q->drops + i); +} + +static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = { + [TCA_HTB_PARMS] = { .len = sizeof(struct tc_htb_opt) }, + [TCA_HTB_INIT] = { .len = sizeof(struct tc_htb_glob) }, + [TCA_HTB_CTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, + [TCA_HTB_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, +}; + +static void htb_work_func(struct work_struct *work) +{ + struct htb_sched *q = container_of(work, struct htb_sched, work); + struct Qdisc *sch = q->watchdog.qdisc; + + __netif_schedule(qdisc_root(sch)); +} + +static int htb_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct htb_sched *q = qdisc_priv(sch); + struct nlattr *tb[TCA_HTB_INIT + 1]; + struct tc_htb_glob *gopt; + int err; + int i; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_HTB_INIT, opt, htb_policy); + if (err < 0) + return err; + + if (tb[TCA_HTB_INIT] == NULL) { + pr_err("HTB: hey probably you have bad tc tool ?\n"); + return -EINVAL; + } + gopt = nla_data(tb[TCA_HTB_INIT]); + if (gopt->version != HTB_VER >> 16) { + pr_err("HTB: need tc/htb version %d (minor is %d), you have %d\n", + HTB_VER >> 16, HTB_VER & 0xffff, gopt->version); + return -EINVAL; + } + + err = qdisc_class_hash_init(&q->clhash); + if (err < 0) + return err; + for (i = 0; i < TC_HTB_NUMPRIO; i++) + INIT_LIST_HEAD(q->drops + i); + + qdisc_watchdog_init(&q->watchdog, sch); + INIT_WORK(&q->work, htb_work_func); + skb_queue_head_init(&q->direct_queue); + + q->direct_qlen = qdisc_dev(sch)->tx_queue_len; + if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */ + q->direct_qlen = 2; + + if ((q->rate2quantum = gopt->rate2quantum) < 1) + q->rate2quantum = 1; + q->defcls = gopt->defcls; + + return 0; +} + +static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + spinlock_t *root_lock = qdisc_root_sleeping_lock(sch); + struct htb_sched *q = qdisc_priv(sch); + struct nlattr *nest; + struct tc_htb_glob gopt; + + spin_lock_bh(root_lock); + + gopt.direct_pkts = q->direct_pkts; + gopt.version = HTB_VER; + gopt.rate2quantum = q->rate2quantum; + gopt.defcls = q->defcls; + gopt.debug = 0; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + NLA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt); + nla_nest_end(skb, nest); + + spin_unlock_bh(root_lock); + return skb->len; + +nla_put_failure: + spin_unlock_bh(root_lock); + nla_nest_cancel(skb, nest); + return -1; +} + +static int htb_dump_class(struct Qdisc *sch, unsigned long arg, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct htb_class *cl = (struct htb_class *)arg; + spinlock_t *root_lock = qdisc_root_sleeping_lock(sch); + struct nlattr *nest; + struct tc_htb_opt opt; + + spin_lock_bh(root_lock); + tcm->tcm_parent = cl->parent ? 
cl->parent->common.classid : TC_H_ROOT; + tcm->tcm_handle = cl->common.classid; + if (!cl->level && cl->un.leaf.q) + tcm->tcm_info = cl->un.leaf.q->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + memset(&opt, 0, sizeof(opt)); + + opt.rate = cl->rate->rate; + opt.buffer = cl->buffer; + opt.ceil = cl->ceil->rate; + opt.cbuffer = cl->cbuffer; + opt.quantum = cl->quantum; + opt.prio = cl->prio; + opt.level = cl->level; + NLA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt); + + nla_nest_end(skb, nest); + spin_unlock_bh(root_lock); + return skb->len; + +nla_put_failure: + spin_unlock_bh(root_lock); + nla_nest_cancel(skb, nest); + return -1; +} + +static int +htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) +{ + struct htb_class *cl = (struct htb_class *)arg; + + if (!cl->level && cl->un.leaf.q) + cl->qstats.qlen = cl->un.leaf.q->q.qlen; + cl->xstats.tokens = cl->tokens; + cl->xstats.ctokens = cl->ctokens; + + if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 || + gnet_stats_copy_queue(d, &cl->qstats) < 0) + return -1; + + return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); +} + +static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct htb_class *cl = (struct htb_class *)arg; + + if (cl->level) + return -EINVAL; + if (new == NULL && + (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + cl->common.classid)) == NULL) + return -ENOBUFS; + + sch_tree_lock(sch); + *old = cl->un.leaf.q; + cl->un.leaf.q = new; + if (*old != NULL) { + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + } + sch_tree_unlock(sch); + return 0; +} + +static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct htb_class *cl = (struct htb_class *)arg; + return !cl->level ? cl->un.leaf.q : NULL; +} + +static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg) +{ + struct htb_class *cl = (struct htb_class *)arg; + + if (cl->un.leaf.q->q.qlen == 0) + htb_deactivate(qdisc_priv(sch), cl); +} + +static unsigned long htb_get(struct Qdisc *sch, u32 classid) +{ + struct htb_class *cl = htb_find(classid, sch); + if (cl) + cl->refcnt++; + return (unsigned long)cl; +} + +static inline int htb_parent_last_child(struct htb_class *cl) +{ + if (!cl->parent) + /* the root class */ + return 0; + if (cl->parent->children > 1) + /* not the last child */ + return 0; + return 1; +} + +static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, + struct Qdisc *new_q) +{ + struct htb_class *parent = cl->parent; + + WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity); + + if (parent->cmode != HTB_CAN_SEND) + htb_safe_rb_erase(&parent->pq_node, q->wait_pq + parent->level); + + parent->level = 0; + memset(&parent->un.inner, 0, sizeof(parent->un.inner)); + INIT_LIST_HEAD(&parent->un.leaf.drop_list); + parent->un.leaf.q = new_q ? 
new_q : &noop_qdisc; + parent->tokens = parent->buffer; + parent->ctokens = parent->cbuffer; + parent->t_c = psched_get_time(); + parent->cmode = HTB_CAN_SEND; +} + +static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl) +{ + if (!cl->level) { + WARN_ON(!cl->un.leaf.q); + qdisc_destroy(cl->un.leaf.q); + } + gen_kill_estimator(&cl->bstats, &cl->rate_est); + qdisc_put_rtab(cl->rate); + qdisc_put_rtab(cl->ceil); + + tcf_destroy_chain(&cl->filter_list); + kfree(cl); +} + +static void htb_destroy(struct Qdisc *sch) +{ + struct htb_sched *q = qdisc_priv(sch); + struct hlist_node *n, *next; + struct htb_class *cl; + unsigned int i; + + cancel_work_sync(&q->work); + qdisc_watchdog_cancel(&q->watchdog); + /* This line used to be after htb_destroy_class call below + * and surprisingly it worked in 2.4. But it must precede it + * because filter need its target class alive to be able to call + * unbind_filter on it (without Oops). + */ + tcf_destroy_chain(&q->filter_list); + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) + tcf_destroy_chain(&cl->filter_list); + } + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i], + common.hnode) + htb_destroy_class(sch, cl); + } + qdisc_class_hash_destroy(&q->clhash); + __skb_queue_purge(&q->direct_queue); +} + +static int htb_delete(struct Qdisc *sch, unsigned long arg) +{ + struct htb_sched *q = qdisc_priv(sch); + struct htb_class *cl = (struct htb_class *)arg; + unsigned int qlen; + struct Qdisc *new_q = NULL; + int last_child = 0; + + // TODO: why don't allow to delete subtree ? references ? does + // tc subsys quarantee us that in htb_destroy it holds no class + // refs so that we can remove children safely there ? + if (cl->children || cl->filter_cnt) + return -EBUSY; + + if (!cl->level && htb_parent_last_child(cl)) { + new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + cl->parent->common.classid); + last_child = 1; + } + + sch_tree_lock(sch); + + if (!cl->level) { + qlen = cl->un.leaf.q->q.qlen; + qdisc_reset(cl->un.leaf.q); + qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen); + } + + /* delete from hash and active; remainder in destroy_class */ + qdisc_class_hash_remove(&q->clhash, &cl->common); + if (cl->parent) + cl->parent->children--; + + if (cl->prio_activity) + htb_deactivate(q, cl); + + if (cl->cmode != HTB_CAN_SEND) + htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level); + + if (last_child) + htb_parent_to_leaf(q, cl, new_q); + + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). 
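+ * Note: because of that held reference the decrement above can never + * reach zero here; the class is actually freed later, when htb_put() + * drops the last reference and calls htb_destroy_class().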
+ */ + + sch_tree_unlock(sch); + return 0; +} + +static void htb_put(struct Qdisc *sch, unsigned long arg) +{ + struct htb_class *cl = (struct htb_class *)arg; + + if (--cl->refcnt == 0) + htb_destroy_class(sch, cl); +} + +static int htb_change_class(struct Qdisc *sch, u32 classid, + u32 parentid, struct nlattr **tca, + unsigned long *arg) +{ + int err = -EINVAL; + struct htb_sched *q = qdisc_priv(sch); + struct htb_class *cl = (struct htb_class *)*arg, *parent; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct qdisc_rate_table *rtab = NULL, *ctab = NULL; + struct nlattr *tb[__TCA_HTB_MAX]; + struct tc_htb_opt *hopt; + + /* extract all subattrs from opt attr */ + if (!opt) + goto failure; + + err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy); + if (err < 0) + goto failure; + + err = -EINVAL; + if (tb[TCA_HTB_PARMS] == NULL) + goto failure; + + parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch); + + hopt = nla_data(tb[TCA_HTB_PARMS]); + + rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]); + ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]); + if (!rtab || !ctab) + goto failure; + + if (!cl) { /* new class */ + struct Qdisc *new_q; + int prio; + struct { + struct nlattr nla; + struct gnet_estimator opt; + } est = { + .nla = { + .nla_len = nla_attr_size(sizeof(est.opt)), + .nla_type = TCA_RATE, + }, + .opt = { + /* 4s interval, 16s averaging constant */ + .interval = 2, + .ewma_log = 2, + }, + }; + + /* check for valid classid */ + if (!classid || TC_H_MAJ(classid ^ sch->handle) || + htb_find(classid, sch)) + goto failure; + + /* check maximal depth */ + if (parent && parent->parent && parent->parent->level < 2) { + pr_err("htb: tree is too deep\n"); + goto failure; + } + err = -ENOBUFS; + cl = kzalloc(sizeof(*cl), GFP_KERNEL); + if (!cl) + goto failure; + + err = gen_new_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE] ? : &est.nla); + if (err) { + kfree(cl); + goto failure; + } + + cl->refcnt = 1; + cl->children = 0; + INIT_LIST_HEAD(&cl->un.leaf.drop_list); + RB_CLEAR_NODE(&cl->pq_node); + + for (prio = 0; prio < TC_HTB_NUMPRIO; prio++) + RB_CLEAR_NODE(&cl->node[prio]); + + /* create leaf qdisc early because it uses kmalloc(GFP_KERNEL) + * so that can't be used inside of sch_tree_lock + * -- thanks to Karlis Peisenieks + */ + new_q = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, classid); + sch_tree_lock(sch); + if (parent && !parent->level) { + unsigned int qlen = parent->un.leaf.q->q.qlen; + + /* turn parent into inner node */ + qdisc_reset(parent->un.leaf.q); + qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen); + qdisc_destroy(parent->un.leaf.q); + if (parent->prio_activity) + htb_deactivate(q, parent); + + /* remove from evt list because of level change */ + if (parent->cmode != HTB_CAN_SEND) { + htb_safe_rb_erase(&parent->pq_node, q->wait_pq); + parent->cmode = HTB_CAN_SEND; + } + parent->level = (parent->parent ? parent->parent->level + : TC_HTB_MAXDEPTH) - 1; + memset(&parent->un.inner, 0, sizeof(parent->un.inner)); + } + /* leaf (we) needs elementary qdisc */ + cl->un.leaf.q = new_q ? 
new_q : &noop_qdisc; + + cl->common.classid = classid; + cl->parent = parent; + + /* set class to be in HTB_CAN_SEND state */ + cl->tokens = hopt->buffer; + cl->ctokens = hopt->cbuffer; + cl->mbuffer = 60 * PSCHED_TICKS_PER_SEC; /* 1min */ + cl->t_c = psched_get_time(); + cl->cmode = HTB_CAN_SEND; + + /* attach to the hash list and parent's family */ + qdisc_class_hash_insert(&q->clhash, &cl->common); + if (parent) + parent->children++; + } else { + if (tca[TCA_RATE]) { + err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) + return err; + } + sch_tree_lock(sch); + } + + /* it used to be a nasty bug here, we have to check that node + * is really leaf before changing cl->un.leaf ! + */ + if (!cl->level) { + cl->quantum = rtab->rate.rate / q->rate2quantum; + if (!hopt->quantum && cl->quantum < 1000) { + pr_warning( + "HTB: quantum of class %X is small. Consider r2q change.\n", + cl->common.classid); + cl->quantum = 1000; + } + if (!hopt->quantum && cl->quantum > 200000) { + pr_warning( + "HTB: quantum of class %X is big. Consider r2q change.\n", + cl->common.classid); + cl->quantum = 200000; + } + if (hopt->quantum) + cl->quantum = hopt->quantum; + if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO) + cl->prio = TC_HTB_NUMPRIO - 1; + } + + cl->buffer = hopt->buffer; + cl->cbuffer = hopt->cbuffer; + if (cl->rate) + qdisc_put_rtab(cl->rate); + cl->rate = rtab; + if (cl->ceil) + qdisc_put_rtab(cl->ceil); + cl->ceil = ctab; + sch_tree_unlock(sch); + + qdisc_class_hash_grow(sch, &q->clhash); + + *arg = (unsigned long)cl; + return 0; + +failure: + if (rtab) + qdisc_put_rtab(rtab); + if (ctab) + qdisc_put_rtab(ctab); + return err; +} + +static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg) +{ + struct htb_sched *q = qdisc_priv(sch); + struct htb_class *cl = (struct htb_class *)arg; + struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list; + + return fl; +} + +static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + struct htb_class *cl = htb_find(classid, sch); + + /*if (cl && !cl->level) return 0; + * The line above used to be there to prevent attaching filters to + * leaves. But at least tc_index filter uses this just to get class + * for other reasons so that we have to allow for it. + * ---- + * 19.6.2002 As Werner explained it is ok - bind filter is just + * another way to "lock" the class - unlike "get" this lock can + * be broken by class during destroy IIUC. 
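+ * The practical effect of this "lock" is the filter_cnt check in + * htb_delete() above: while a filter is bound the class cannot be + * deleted (-EBUSY), and htb_unbind_filter() releases it again.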
+ */ + if (cl) + cl->filter_cnt++; + return (unsigned long)cl; +} + +static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg) +{ + struct htb_class *cl = (struct htb_class *)arg; + + if (cl) + cl->filter_cnt--; +} + +static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct htb_sched *q = qdisc_priv(sch); + struct htb_class *cl; + struct hlist_node *n; + unsigned int i; + + if (arg->stop) + return; + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, (unsigned long)cl, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } +} + +static const struct Qdisc_class_ops htb_class_ops = { + .graft = htb_graft, + .leaf = htb_leaf, + .qlen_notify = htb_qlen_notify, + .get = htb_get, + .put = htb_put, + .change = htb_change_class, + .delete = htb_delete, + .walk = htb_walk, + .tcf_chain = htb_find_tcf, + .bind_tcf = htb_bind_filter, + .unbind_tcf = htb_unbind_filter, + .dump = htb_dump_class, + .dump_stats = htb_dump_class_stats, +}; + +static struct Qdisc_ops htb_qdisc_ops __read_mostly = { + .cl_ops = &htb_class_ops, + .id = "htb", + .priv_size = sizeof(struct htb_sched), + .enqueue = htb_enqueue, + .dequeue = htb_dequeue, + .peek = qdisc_peek_dequeued, + .drop = htb_drop, + .init = htb_init, + .reset = htb_reset, + .destroy = htb_destroy, + .dump = htb_dump, + .owner = THIS_MODULE, +}; + +static int __init htb_module_init(void) +{ + return register_qdisc(&htb_qdisc_ops); +} +static void __exit htb_module_exit(void) +{ + unregister_qdisc(&htb_qdisc_ops); +} + +module_init(htb_module_init) +module_exit(htb_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c new file mode 100644 index 00000000..bce16652 --- /dev/null +++ b/net/sched/sch_ingress.c @@ -0,0 +1,143 @@ +/* net/sched/sch_ingress.c - Ingress qdisc + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Jamal Hadi Salim 1999 + */ + +#include +#include +#include +#include +#include +#include +#include + + +struct ingress_qdisc_data { + struct tcf_proto *filter_list; +}; + +/* ------------------------- Class/flow operations ------------------------- */ + +static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long ingress_get(struct Qdisc *sch, u32 classid) +{ + return TC_H_MIN(classid) + 1; +} + +static unsigned long ingress_bind_filter(struct Qdisc *sch, + unsigned long parent, u32 classid) +{ + return ingress_get(sch, classid); +} + +static void ingress_put(struct Qdisc *sch, unsigned long cl) +{ +} + +static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ +} + +static struct tcf_proto **ingress_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct ingress_qdisc_data *p = qdisc_priv(sch); + + return &p->filter_list; +} + +/* --------------------------- Qdisc operations ---------------------------- */ + +static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct ingress_qdisc_data *p = qdisc_priv(sch); + struct tcf_result res; + int result; + + result = tc_classify(skb, p->filter_list, &res); + + qdisc_bstats_update(sch, skb); + switch (result) { + case TC_ACT_SHOT: + result = TC_ACT_SHOT; + sch->qstats.drops++; + break; + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + result = TC_ACT_STOLEN; + break; + case TC_ACT_RECLASSIFY: + case TC_ACT_OK: + skb->tc_index = TC_H_MIN(res.classid); + default: + result = TC_ACT_OK; + break; + } + + return result; +} + +/* ------------------------------------------------------------- */ + +static void ingress_destroy(struct Qdisc *sch) +{ + struct ingress_qdisc_data *p = qdisc_priv(sch); + + tcf_destroy_chain(&p->filter_list); +} + +static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + nla_nest_end(skb, nest); + return skb->len; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static const struct Qdisc_class_ops ingress_class_ops = { + .leaf = ingress_leaf, + .get = ingress_get, + .put = ingress_put, + .walk = ingress_walk, + .tcf_chain = ingress_find_tcf, + .bind_tcf = ingress_bind_filter, + .unbind_tcf = ingress_put, +}; + +static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { + .cl_ops = &ingress_class_ops, + .id = "ingress", + .priv_size = sizeof(struct ingress_qdisc_data), + .enqueue = ingress_enqueue, + .destroy = ingress_destroy, + .dump = ingress_dump, + .owner = THIS_MODULE, +}; + +static int __init ingress_module_init(void) +{ + return register_qdisc(&ingress_qdisc_ops); +} + +static void __exit ingress_module_exit(void) +{ + unregister_qdisc(&ingress_qdisc_ops); +} + +module_init(ingress_module_init) +module_exit(ingress_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c new file mode 100644 index 00000000..ec5cbc84 --- /dev/null +++ b/net/sched/sch_mq.c @@ -0,0 +1,240 @@ +/* + * net/sched/sch_mq.c Classful multiqueue dummy scheduler + * + * Copyright (c) 2009 Patrick McHardy + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct mq_sched { + struct Qdisc **qdiscs; +}; + +static void mq_destroy(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct mq_sched *priv = qdisc_priv(sch); + unsigned int ntx; + + if (!priv->qdiscs) + return; + for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++) + qdisc_destroy(priv->qdiscs[ntx]); + kfree(priv->qdiscs); +} + +static int mq_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct net_device *dev = qdisc_dev(sch); + struct mq_sched *priv = qdisc_priv(sch); + struct netdev_queue *dev_queue; + struct Qdisc *qdisc; + unsigned int ntx; + + if (sch->parent != TC_H_ROOT) + return -EOPNOTSUPP; + + if (!netif_is_multiqueue(dev)) + return -EOPNOTSUPP; + + /* pre-allocate qdiscs, attachment can't fail */ + priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), + GFP_KERNEL); + if (priv->qdiscs == NULL) + return -ENOMEM; + + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + dev_queue = netdev_get_tx_queue(dev, ntx); + qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops, + TC_H_MAKE(TC_H_MAJ(sch->handle), + TC_H_MIN(ntx + 1))); + if (qdisc == NULL) + goto err; + priv->qdiscs[ntx] = qdisc; + } + + sch->flags |= TCQ_F_MQROOT; + return 0; + +err: + mq_destroy(sch); + return -ENOMEM; +} + +static void mq_attach(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct mq_sched *priv = qdisc_priv(sch); + struct Qdisc *qdisc; + unsigned int ntx; + + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + qdisc = priv->qdiscs[ntx]; + qdisc = dev_graft_qdisc(qdisc->dev_queue, qdisc); + if (qdisc) + qdisc_destroy(qdisc); + } + kfree(priv->qdiscs); + priv->qdiscs = NULL; +} + +static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct net_device *dev = qdisc_dev(sch); + struct Qdisc *qdisc; + unsigned int ntx; + + sch->q.qlen = 0; + memset(&sch->bstats, 0, sizeof(sch->bstats)); + memset(&sch->qstats, 0, sizeof(sch->qstats)); + + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; + spin_lock_bh(qdisc_lock(qdisc)); + sch->q.qlen += qdisc->q.qlen; + sch->bstats.bytes += qdisc->bstats.bytes; + sch->bstats.packets += qdisc->bstats.packets; + sch->qstats.qlen += qdisc->qstats.qlen; + sch->qstats.backlog += qdisc->qstats.backlog; + sch->qstats.drops += qdisc->qstats.drops; + sch->qstats.requeues += qdisc->qstats.requeues; + sch->qstats.overlimits += qdisc->qstats.overlimits; + spin_unlock_bh(qdisc_lock(qdisc)); + } + return 0; +} + +static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned long ntx = cl - 1; + + if (ntx >= dev->num_tx_queues) + return NULL; + return netdev_get_tx_queue(dev, ntx); +} + +static struct netdev_queue *mq_select_queue(struct Qdisc *sch, + struct tcmsg *tcm) +{ + unsigned int ntx = TC_H_MIN(tcm->tcm_parent); + struct netdev_queue *dev_queue = mq_queue_get(sch, ntx); + + if (!dev_queue) { + struct net_device *dev = qdisc_dev(sch); + + return netdev_get_tx_queue(dev, 0); + } + return dev_queue; +} + +static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, + struct Qdisc **old) +{ + struct netdev_queue *dev_queue = mq_queue_get(sch, cl); + struct net_device *dev = qdisc_dev(sch); + + if (dev->flags & IFF_UP) + dev_deactivate(dev); + + *old = dev_graft_qdisc(dev_queue, new); + + if (dev->flags & IFF_UP) + dev_activate(dev); + return 0; +} + +static struct Qdisc 
*mq_leaf(struct Qdisc *sch, unsigned long cl) +{ + struct netdev_queue *dev_queue = mq_queue_get(sch, cl); + + return dev_queue->qdisc_sleeping; +} + +static unsigned long mq_get(struct Qdisc *sch, u32 classid) +{ + unsigned int ntx = TC_H_MIN(classid); + + if (!mq_queue_get(sch, ntx)) + return 0; + return ntx; +} + +static void mq_put(struct Qdisc *sch, unsigned long cl) +{ +} + +static int mq_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct netdev_queue *dev_queue = mq_queue_get(sch, cl); + + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle |= TC_H_MIN(cl); + tcm->tcm_info = dev_queue->qdisc_sleeping->handle; + return 0; +} + +static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct netdev_queue *dev_queue = mq_queue_get(sch, cl); + + sch = dev_queue->qdisc_sleeping; + sch->qstats.qlen = sch->q.qlen; + if (gnet_stats_copy_basic(d, &sch->bstats) < 0 || + gnet_stats_copy_queue(d, &sch->qstats) < 0) + return -1; + return 0; +} + +static void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned int ntx; + + if (arg->stop) + return; + + arg->count = arg->skip; + for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) { + if (arg->fn(sch, ntx + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static const struct Qdisc_class_ops mq_class_ops = { + .select_queue = mq_select_queue, + .graft = mq_graft, + .leaf = mq_leaf, + .get = mq_get, + .put = mq_put, + .walk = mq_walk, + .dump = mq_dump_class, + .dump_stats = mq_dump_class_stats, +}; + +struct Qdisc_ops mq_qdisc_ops __read_mostly = { + .cl_ops = &mq_class_ops, + .id = "mq", + .priv_size = sizeof(struct mq_sched), + .init = mq_init, + .destroy = mq_destroy, + .attach = mq_attach, + .dump = mq_dump, + .owner = THIS_MODULE, +}; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c new file mode 100644 index 00000000..59b26b8f --- /dev/null +++ b/net/sched/sch_mqprio.c @@ -0,0 +1,418 @@ +/* + * net/sched/sch_mqprio.c + * + * Copyright (c) 2010 John Fastabend + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct mqprio_sched { + struct Qdisc **qdiscs; + int hw_owned; +}; + +static void mqprio_destroy(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct mqprio_sched *priv = qdisc_priv(sch); + unsigned int ntx; + + if (priv->qdiscs) { + for (ntx = 0; + ntx < dev->num_tx_queues && priv->qdiscs[ntx]; + ntx++) + qdisc_destroy(priv->qdiscs[ntx]); + kfree(priv->qdiscs); + } + + if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc) + dev->netdev_ops->ndo_setup_tc(dev, 0); + else + netdev_set_num_tc(dev, 0); +} + +static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt) +{ + int i, j; + + /* Verify num_tc is not out of max range */ + if (qopt->num_tc > TC_MAX_QUEUE) + return -EINVAL; + + /* Verify priority mapping uses valid tcs */ + for (i = 0; i < TC_BITMASK + 1; i++) { + if (qopt->prio_tc_map[i] >= qopt->num_tc) + return -EINVAL; + } + + /* net_device does not support requested operation */ + if (qopt->hw && !dev->netdev_ops->ndo_setup_tc) + return -EINVAL; + + /* if hw owned qcount and qoffset are taken from LLD so + * no reason to verify them here + */ + if (qopt->hw) + return 0; + + for (i = 0; i < qopt->num_tc; i++) { + unsigned int last = qopt->offset[i] + qopt->count[i]; + + /* Verify the queue count is in tx range being equal to the + * real_num_tx_queues indicates the last queue is in use. + */ + if (qopt->offset[i] >= dev->real_num_tx_queues || + !qopt->count[i] || + last > dev->real_num_tx_queues) + return -EINVAL; + + /* Verify that the offset and counts do not overlap */ + for (j = i + 1; j < qopt->num_tc; j++) { + if (last > qopt->offset[j]) + return -EINVAL; + } + } + + return 0; +} + +static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct net_device *dev = qdisc_dev(sch); + struct mqprio_sched *priv = qdisc_priv(sch); + struct netdev_queue *dev_queue; + struct Qdisc *qdisc; + int i, err = -EOPNOTSUPP; + struct tc_mqprio_qopt *qopt = NULL; + + BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE); + BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK); + + if (sch->parent != TC_H_ROOT) + return -EOPNOTSUPP; + + if (!netif_is_multiqueue(dev)) + return -EOPNOTSUPP; + + if (!opt || nla_len(opt) < sizeof(*qopt)) + return -EINVAL; + + qopt = nla_data(opt); + if (mqprio_parse_opt(dev, qopt)) + return -EINVAL; + + /* pre-allocate qdisc, attachment can't fail */ + priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), + GFP_KERNEL); + if (priv->qdiscs == NULL) { + err = -ENOMEM; + goto err; + } + + for (i = 0; i < dev->num_tx_queues; i++) { + dev_queue = netdev_get_tx_queue(dev, i); + qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops, + TC_H_MAKE(TC_H_MAJ(sch->handle), + TC_H_MIN(i + 1))); + if (qdisc == NULL) { + err = -ENOMEM; + goto err; + } + priv->qdiscs[i] = qdisc; + } + + /* If the mqprio options indicate that hardware should own + * the queue mapping then run ndo_setup_tc otherwise use the + * supplied and verified mapping + */ + if (qopt->hw) { + priv->hw_owned = 1; + err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc); + if (err) + goto err; + } else { + netdev_set_num_tc(dev, qopt->num_tc); + for (i = 0; i < qopt->num_tc; i++) + netdev_set_tc_queue(dev, i, + qopt->count[i], qopt->offset[i]); + } + + /* Always use supplied priority mappings */ + for (i = 0; i < TC_BITMASK + 1; i++) + netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i]); + + sch->flags |= TCQ_F_MQROOT; + return 0; + +err: + mqprio_destroy(sch); + 
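+ /* mqprio_destroy() above copes with partial initialization on this + * error path: the kcalloc'ed array is zero-filled, so only the child + * qdiscs created so far are destroyed, and whichever tc mapping + * (hardware offload or software) was already applied is unwound. + */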
return err; +} + +static void mqprio_attach(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct mqprio_sched *priv = qdisc_priv(sch); + struct Qdisc *qdisc; + unsigned int ntx; + + /* Attach underlying qdisc */ + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + qdisc = priv->qdiscs[ntx]; + qdisc = dev_graft_qdisc(qdisc->dev_queue, qdisc); + if (qdisc) + qdisc_destroy(qdisc); + } + kfree(priv->qdiscs); + priv->qdiscs = NULL; +} + +static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch, + unsigned long cl) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned long ntx = cl - 1 - netdev_get_num_tc(dev); + + if (ntx >= dev->num_tx_queues) + return NULL; + return netdev_get_tx_queue(dev, ntx); +} + +static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, + struct Qdisc **old) +{ + struct net_device *dev = qdisc_dev(sch); + struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); + + if (!dev_queue) + return -EINVAL; + + if (dev->flags & IFF_UP) + dev_deactivate(dev); + + *old = dev_graft_qdisc(dev_queue, new); + + if (dev->flags & IFF_UP) + dev_activate(dev); + + return 0; +} + +static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct net_device *dev = qdisc_dev(sch); + struct mqprio_sched *priv = qdisc_priv(sch); + unsigned char *b = skb_tail_pointer(skb); + struct tc_mqprio_qopt opt = { 0 }; + struct Qdisc *qdisc; + unsigned int i; + + sch->q.qlen = 0; + memset(&sch->bstats, 0, sizeof(sch->bstats)); + memset(&sch->qstats, 0, sizeof(sch->qstats)); + + for (i = 0; i < dev->num_tx_queues; i++) { + qdisc = netdev_get_tx_queue(dev, i)->qdisc; + spin_lock_bh(qdisc_lock(qdisc)); + sch->q.qlen += qdisc->q.qlen; + sch->bstats.bytes += qdisc->bstats.bytes; + sch->bstats.packets += qdisc->bstats.packets; + sch->qstats.qlen += qdisc->qstats.qlen; + sch->qstats.backlog += qdisc->qstats.backlog; + sch->qstats.drops += qdisc->qstats.drops; + sch->qstats.requeues += qdisc->qstats.requeues; + sch->qstats.overlimits += qdisc->qstats.overlimits; + spin_unlock_bh(qdisc_lock(qdisc)); + } + + opt.num_tc = netdev_get_num_tc(dev); + memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); + opt.hw = priv->hw_owned; + + for (i = 0; i < netdev_get_num_tc(dev); i++) { + opt.count[i] = dev->tc_to_txq[i].count; + opt.offset[i] = dev->tc_to_txq[i].offset; + } + + NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + + return skb->len; +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl) +{ + struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); + + if (!dev_queue) + return NULL; + + return dev_queue->qdisc_sleeping; +} + +static unsigned long mqprio_get(struct Qdisc *sch, u32 classid) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned int ntx = TC_H_MIN(classid); + + if (ntx > dev->num_tx_queues + netdev_get_num_tc(dev)) + return 0; + return ntx; +} + +static void mqprio_put(struct Qdisc *sch, unsigned long cl) +{ +} + +static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct net_device *dev = qdisc_dev(sch); + + if (cl <= netdev_get_num_tc(dev)) { + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_info = 0; + } else { + int i; + struct netdev_queue *dev_queue; + + dev_queue = mqprio_queue_get(sch, cl); + tcm->tcm_parent = 0; + for (i = 0; i < netdev_get_num_tc(dev); i++) { + struct netdev_tc_txq tc = dev->tc_to_txq[i]; + int q_idx = cl - netdev_get_num_tc(dev); + + if (q_idx > tc.offset && + q_idx 
<= tc.offset + tc.count) { + tcm->tcm_parent = + TC_H_MAKE(TC_H_MAJ(sch->handle), + TC_H_MIN(i + 1)); + break; + } + } + tcm->tcm_info = dev_queue->qdisc_sleeping->handle; + } + tcm->tcm_handle |= TC_H_MIN(cl); + return 0; +} + +static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) + __releases(d->lock) + __acquires(d->lock) +{ + struct net_device *dev = qdisc_dev(sch); + + if (cl <= netdev_get_num_tc(dev)) { + int i; + struct Qdisc *qdisc; + struct gnet_stats_queue qstats = {0}; + struct gnet_stats_basic_packed bstats = {0}; + struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1]; + + /* Drop lock here it will be reclaimed before touching + * statistics this is required because the d->lock we + * hold here is the look on dev_queue->qdisc_sleeping + * also acquired below. + */ + spin_unlock_bh(d->lock); + + for (i = tc.offset; i < tc.offset + tc.count; i++) { + qdisc = netdev_get_tx_queue(dev, i)->qdisc; + spin_lock_bh(qdisc_lock(qdisc)); + bstats.bytes += qdisc->bstats.bytes; + bstats.packets += qdisc->bstats.packets; + qstats.qlen += qdisc->qstats.qlen; + qstats.backlog += qdisc->qstats.backlog; + qstats.drops += qdisc->qstats.drops; + qstats.requeues += qdisc->qstats.requeues; + qstats.overlimits += qdisc->qstats.overlimits; + spin_unlock_bh(qdisc_lock(qdisc)); + } + /* Reclaim root sleeping lock before completing stats */ + spin_lock_bh(d->lock); + if (gnet_stats_copy_basic(d, &bstats) < 0 || + gnet_stats_copy_queue(d, &qstats) < 0) + return -1; + } else { + struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); + + sch = dev_queue->qdisc_sleeping; + sch->qstats.qlen = sch->q.qlen; + if (gnet_stats_copy_basic(d, &sch->bstats) < 0 || + gnet_stats_copy_queue(d, &sch->qstats) < 0) + return -1; + } + return 0; +} + +static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned long ntx; + + if (arg->stop) + return; + + /* Walk hierarchy with a virtual class per tc */ + arg->count = arg->skip; + for (ntx = arg->skip; + ntx < dev->num_tx_queues + netdev_get_num_tc(dev); + ntx++) { + if (arg->fn(sch, ntx + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static const struct Qdisc_class_ops mqprio_class_ops = { + .graft = mqprio_graft, + .leaf = mqprio_leaf, + .get = mqprio_get, + .put = mqprio_put, + .walk = mqprio_walk, + .dump = mqprio_dump_class, + .dump_stats = mqprio_dump_class_stats, +}; + +static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = { + .cl_ops = &mqprio_class_ops, + .id = "mqprio", + .priv_size = sizeof(struct mqprio_sched), + .init = mqprio_init, + .destroy = mqprio_destroy, + .attach = mqprio_attach, + .dump = mqprio_dump, + .owner = THIS_MODULE, +}; + +static int __init mqprio_module_init(void) +{ + return register_qdisc(&mqprio_qdisc_ops); +} + +static void __exit mqprio_module_exit(void) +{ + unregister_qdisc(&mqprio_qdisc_ops); +} + +module_init(mqprio_module_init); +module_exit(mqprio_module_exit); + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c new file mode 100644 index 00000000..edc1950e --- /dev/null +++ b/net/sched/sch_multiq.c @@ -0,0 +1,441 @@ +/* + * Copyright (c) 2008, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Author: Alexander Duyck + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +struct multiq_sched_data { + u16 bands; + u16 max_bands; + u16 curband; + struct tcf_proto *filter_list; + struct Qdisc **queues; +}; + + +static struct Qdisc * +multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + u32 band; + struct tcf_result res; + int err; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + err = tc_classify(skb, q->filter_list, &res); +#ifdef CONFIG_NET_CLS_ACT + switch (err) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return NULL; + } +#endif + band = skb_get_queue_mapping(skb); + + if (band >= q->bands) + return q->queues[0]; + + return q->queues[band]; +} + +static int +multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct Qdisc *qdisc; + int ret; + + qdisc = multiq_classify(skb, sch, &ret); +#ifdef CONFIG_NET_CLS_ACT + if (qdisc == NULL) { + + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } +#endif + + ret = qdisc_enqueue(skb, qdisc); + if (ret == NET_XMIT_SUCCESS) { + sch->q.qlen++; + return NET_XMIT_SUCCESS; + } + if (net_xmit_drop_count(ret)) + sch->qstats.drops++; + return ret; +} + +static struct sk_buff *multiq_dequeue(struct Qdisc *sch) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + struct Qdisc *qdisc; + struct sk_buff *skb; + int band; + + for (band = 0; band < q->bands; band++) { + /* cycle through bands to ensure fairness */ + q->curband++; + if (q->curband >= q->bands) + q->curband = 0; + + /* Check that target subqueue is available before + * pulling an skb to avoid head-of-line blocking. + */ + if (!__netif_subqueue_stopped(qdisc_dev(sch), q->curband)) { + qdisc = q->queues[q->curband]; + skb = qdisc->dequeue(qdisc); + if (skb) { + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + return skb; + } + } + } + return NULL; + +} + +static struct sk_buff *multiq_peek(struct Qdisc *sch) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + unsigned int curband = q->curband; + struct Qdisc *qdisc; + struct sk_buff *skb; + int band; + + for (band = 0; band < q->bands; band++) { + /* cycle through bands to ensure fairness */ + curband++; + if (curband >= q->bands) + curband = 0; + + /* Check that target subqueue is available before + * pulling an skb to avoid head-of-line blocking. 
+ */ + if (!__netif_subqueue_stopped(qdisc_dev(sch), curband)) { + qdisc = q->queues[curband]; + skb = qdisc->ops->peek(qdisc); + if (skb) + return skb; + } + } + return NULL; + +} + +static unsigned int multiq_drop(struct Qdisc *sch) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + int band; + unsigned int len; + struct Qdisc *qdisc; + + for (band = q->bands - 1; band >= 0; band--) { + qdisc = q->queues[band]; + if (qdisc->ops->drop) { + len = qdisc->ops->drop(qdisc); + if (len != 0) { + sch->q.qlen--; + return len; + } + } + } + return 0; +} + + +static void +multiq_reset(struct Qdisc *sch) +{ + u16 band; + struct multiq_sched_data *q = qdisc_priv(sch); + + for (band = 0; band < q->bands; band++) + qdisc_reset(q->queues[band]); + sch->q.qlen = 0; + q->curband = 0; +} + +static void +multiq_destroy(struct Qdisc *sch) +{ + int band; + struct multiq_sched_data *q = qdisc_priv(sch); + + tcf_destroy_chain(&q->filter_list); + for (band = 0; band < q->bands; band++) + qdisc_destroy(q->queues[band]); + + kfree(q->queues); +} + +static int multiq_tune(struct Qdisc *sch, struct nlattr *opt) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + struct tc_multiq_qopt *qopt; + int i; + + if (!netif_is_multiqueue(qdisc_dev(sch))) + return -EOPNOTSUPP; + if (nla_len(opt) < sizeof(*qopt)) + return -EINVAL; + + qopt = nla_data(opt); + + qopt->bands = qdisc_dev(sch)->real_num_tx_queues; + + sch_tree_lock(sch); + q->bands = qopt->bands; + for (i = q->bands; i < q->max_bands; i++) { + if (q->queues[i] != &noop_qdisc) { + struct Qdisc *child = q->queues[i]; + q->queues[i] = &noop_qdisc; + qdisc_tree_decrease_qlen(child, child->q.qlen); + qdisc_destroy(child); + } + } + + sch_tree_unlock(sch); + + for (i = 0; i < q->bands; i++) { + if (q->queues[i] == &noop_qdisc) { + struct Qdisc *child, *old; + child = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, + TC_H_MAKE(sch->handle, + i + 1)); + if (child) { + sch_tree_lock(sch); + old = q->queues[i]; + q->queues[i] = child; + + if (old != &noop_qdisc) { + qdisc_tree_decrease_qlen(old, + old->q.qlen); + qdisc_destroy(old); + } + sch_tree_unlock(sch); + } + } + } + return 0; +} + +static int multiq_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + int i, err; + + q->queues = NULL; + + if (opt == NULL) + return -EINVAL; + + q->max_bands = qdisc_dev(sch)->num_tx_queues; + + q->queues = kcalloc(q->max_bands, sizeof(struct Qdisc *), GFP_KERNEL); + if (!q->queues) + return -ENOBUFS; + for (i = 0; i < q->max_bands; i++) + q->queues[i] = &noop_qdisc; + + err = multiq_tune(sch, opt); + + if (err) + kfree(q->queues); + + return err; +} + +static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb_tail_pointer(skb); + struct tc_multiq_qopt opt; + + opt.bands = q->bands; + opt.max_bands = q->max_bands; + + NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + unsigned long band = arg - 1; + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = q->queues[band]; + q->queues[band] = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + sch_tree_unlock(sch); + + return 0; +} + +static struct Qdisc * +multiq_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct 
multiq_sched_data *q = qdisc_priv(sch); + unsigned long band = arg - 1; + + return q->queues[band]; +} + +static unsigned long multiq_get(struct Qdisc *sch, u32 classid) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + unsigned long band = TC_H_MIN(classid); + + if (band - 1 >= q->bands) + return 0; + return band; +} + +static unsigned long multiq_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return multiq_get(sch, classid); +} + + +static void multiq_put(struct Qdisc *q, unsigned long cl) +{ +} + +static int multiq_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + + tcm->tcm_handle |= TC_H_MIN(cl); + tcm->tcm_info = q->queues[cl - 1]->handle; + return 0; +} + +static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + struct Qdisc *cl_q; + + cl_q = q->queues[cl - 1]; + cl_q->qstats.qlen = cl_q->q.qlen; + if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 || + gnet_stats_copy_queue(d, &cl_q->qstats) < 0) + return -1; + + return 0; +} + +static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + int band; + + if (arg->stop) + return; + + for (band = 0; band < q->bands; band++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, band + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static struct tcf_proto **multiq_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return &q->filter_list; +} + +static const struct Qdisc_class_ops multiq_class_ops = { + .graft = multiq_graft, + .leaf = multiq_leaf, + .get = multiq_get, + .put = multiq_put, + .walk = multiq_walk, + .tcf_chain = multiq_find_tcf, + .bind_tcf = multiq_bind, + .unbind_tcf = multiq_put, + .dump = multiq_dump_class, + .dump_stats = multiq_dump_class_stats, +}; + +static struct Qdisc_ops multiq_qdisc_ops __read_mostly = { + .next = NULL, + .cl_ops = &multiq_class_ops, + .id = "multiq", + .priv_size = sizeof(struct multiq_sched_data), + .enqueue = multiq_enqueue, + .dequeue = multiq_dequeue, + .peek = multiq_peek, + .drop = multiq_drop, + .init = multiq_init, + .reset = multiq_reset, + .destroy = multiq_destroy, + .change = multiq_tune, + .dump = multiq_dump, + .owner = THIS_MODULE, +}; + +static int __init multiq_module_init(void) +{ + return register_qdisc(&multiq_qdisc_ops); +} + +static void __exit multiq_module_exit(void) +{ + unregister_qdisc(&multiq_qdisc_ops); +} + +module_init(multiq_module_init) +module_exit(multiq_module_exit) + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c new file mode 100644 index 00000000..f0913ffc --- /dev/null +++ b/net/sched/sch_netem.c @@ -0,0 +1,974 @@ +/* + * net/sched/sch_netem.c Network emulator + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License. + * + * Many of the algorithms and ideas for this came from + * NIST Net which is not copyrighted. + * + * Authors: Stephen Hemminger + * Catalin(ux aka Dino) BOIE + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define VERSION "1.3" + +/* Network Emulation Queuing algorithm. 
+ ==================================== + + Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based + Network Emulation Tool + [2] Luigi Rizzo, DummyNet for FreeBSD + + ---------------------------------------------------------------- + + This started out as a simple way to delay outgoing packets to + test TCP but has grown to include most of the functionality + of a full blown network emulator like NISTnet. It can delay + packets and add random jitter (and correlation). The random + distribution can be loaded from a table as well to provide + normal, Pareto, or experimental curves. Packet loss, + duplication, and reordering can also be emulated. + + This qdisc does not do classification that can be handled in + layering other disciplines. It does not need to do bandwidth + control either since that can be handled by using token + bucket or other rate control. + + Correlated Loss Generator models + + Added generation of correlated loss according to the + "Gilbert-Elliot" model, a 4-state markov model. + + References: + [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG + [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general + and intuitive loss model for packet networks and its implementation + in the Netem module in the Linux kernel", available in [1] + + Authors: Stefano Salsano +*/ + +struct netem_sched_data { + struct Qdisc *qdisc; + struct qdisc_watchdog watchdog; + + psched_tdiff_t latency; + psched_tdiff_t jitter; + + u32 loss; + u32 limit; + u32 counter; + u32 gap; + u32 duplicate; + u32 reorder; + u32 corrupt; + + struct crndstate { + u32 last; + u32 rho; + } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor; + + struct disttable { + u32 size; + s16 table[0]; + } *delay_dist; + + enum { + CLG_RANDOM, + CLG_4_STATES, + CLG_GILB_ELL, + } loss_model; + + /* Correlated Loss Generation models */ + struct clgstate { + /* state of the Markov chain */ + u8 state; + + /* 4-states and Gilbert-Elliot models */ + u32 a1; /* p13 for 4-states or p for GE */ + u32 a2; /* p31 for 4-states or r for GE */ + u32 a3; /* p32 for 4-states or h for GE */ + u32 a4; /* p14 for 4-states or 1-k for GE */ + u32 a5; /* p23 used only in 4-states */ + } clg; + +}; + +/* Time stamp put into socket buffer control block */ +struct netem_skb_cb { + psched_time_t time_to_send; +}; + +static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb)); + return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data; +} + +/* init_crandom - initialize correlated random number generator + * Use entropy source for initial seed. + */ +static void init_crandom(struct crndstate *state, unsigned long rho) +{ + state->rho = rho; + state->last = net_random(); +} + +/* get_crandom - correlated random number generator + * Next number depends on last value. + * rho is scaled to avoid floating point. + */ +static u32 get_crandom(struct crndstate *state) +{ + u64 value, rho; + unsigned long answer; + + if (state->rho == 0) /* no correlation */ + return net_random(); + + value = net_random(); + rho = (u64)state->rho + 1; + answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32; + state->last = answer; + return answer; +} + +/* loss_4state - 4-state model loss generator + * Generates losses according to the 4-state Markov chain adopted in + * the GI (General and Intuitive) loss model. 
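+ * (clg.a1..a5 hold the scaled transition probabilities p13, p31, p32, + * p14 and p23, filled in by get_loss_clg() below; each packet draws one + * net_random() value and the comparisons below decide both the next + * state and whether the packet is lost.)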
+ */ +static bool loss_4state(struct netem_sched_data *q) +{ + struct clgstate *clg = &q->clg; + u32 rnd = net_random(); + + /* + * Makes a comparison between rnd and the transition + * probabilities outgoing from the current state, then decides the + * next state and if the next packet has to be transmitted or lost. + * The four states correspond to: + * 1 => successfully transmitted packets within a gap period + * 4 => isolated losses within a gap period + * 3 => lost packets within a burst period + * 2 => successfully transmitted packets within a burst period + */ + switch (clg->state) { + case 1: + if (rnd < clg->a4) { + clg->state = 4; + return true; + } else if (clg->a4 < rnd && rnd < clg->a1) { + clg->state = 3; + return true; + } else if (clg->a1 < rnd) + clg->state = 1; + + break; + case 2: + if (rnd < clg->a5) { + clg->state = 3; + return true; + } else + clg->state = 2; + + break; + case 3: + if (rnd < clg->a3) + clg->state = 2; + else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) { + clg->state = 1; + return true; + } else if (clg->a2 + clg->a3 < rnd) { + clg->state = 3; + return true; + } + break; + case 4: + clg->state = 1; + break; + } + + return false; +} + +/* loss_gilb_ell - Gilbert-Elliot model loss generator + * Generates losses according to the Gilbert-Elliot loss model or + * its special cases (Gilbert or Simple Gilbert) + * + * Makes a comparison between random number and the transition + * probabilities outgoing from the current state, then decides the + * next state. A second random number is extracted and the comparison + * with the loss probability of the current state decides if the next + * packet will be transmitted or lost. + */ +static bool loss_gilb_ell(struct netem_sched_data *q) +{ + struct clgstate *clg = &q->clg; + + switch (clg->state) { + case 1: + if (net_random() < clg->a1) + clg->state = 2; + if (net_random() < clg->a4) + return true; + case 2: + if (net_random() < clg->a2) + clg->state = 1; + if (clg->a3 > net_random()) + return true; + } + + return false; +} + +static bool loss_event(struct netem_sched_data *q) +{ + switch (q->loss_model) { + case CLG_RANDOM: + /* Random packet drop 0 => none, ~0 => all */ + return q->loss && q->loss >= get_crandom(&q->loss_cor); + + case CLG_4_STATES: + /* 4state loss model algorithm (used also for GI model) + * Extracts a value from the markov 4 state loss generator, + * if it is 1 drops a packet and if needed writes the event in + * the kernel logs + */ + return loss_4state(q); + + case CLG_GILB_ELL: + /* Gilbert-Elliot loss model algorithm + * Extracts a value from the Gilbert-Elliot loss generator, + * if it is 1 drops a packet and if needed writes the event in + * the kernel logs + */ + return loss_gilb_ell(q); + } + + return false; /* not reached */ +} + + +/* tabledist - return a pseudo-randomly distributed value with mean mu and + * std deviation sigma. Uses table lookup to approximate the desired + * distribution, and a uniformly-distributed pseudo-random source. 
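+ * With no table loaded the fallback below is uniform: the result lies + * roughly in [mu - sigma, mu + sigma). With a table, a signed 16-bit + * sample t is picked and the result is approximately + * mu + sigma * t / NETEM_DIST_SCALE; the quotient/remainder split below + * just keeps the intermediate multiplication small.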
+ */ +static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma, + struct crndstate *state, + const struct disttable *dist) +{ + psched_tdiff_t x; + long t; + u32 rnd; + + if (sigma == 0) + return mu; + + rnd = get_crandom(state); + + /* default uniform distribution */ + if (dist == NULL) + return (rnd % (2*sigma)) - sigma + mu; + + t = dist->table[rnd % dist->size]; + x = (sigma % NETEM_DIST_SCALE) * t; + if (x >= 0) + x += NETEM_DIST_SCALE/2; + else + x -= NETEM_DIST_SCALE/2; + + return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu; +} + +/* + * Insert one skb into qdisc. + * Note: parent depends on return value to account for queue length. + * NET_XMIT_DROP: queue length didn't change. + * NET_XMIT_SUCCESS: one skb was queued. + */ +static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + /* We don't fill cb now as skb_unshare() may invalidate it */ + struct netem_skb_cb *cb; + struct sk_buff *skb2; + int ret; + int count = 1; + + /* Random duplication */ + if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) + ++count; + + /* Drop packet? */ + if (loss_event(q)) + --count; + + if (count == 0) { + sch->qstats.drops++; + kfree_skb(skb); + return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + } + + skb_orphan(skb); + + /* + * If we need to duplicate packet, then re-insert at top of the + * qdisc tree, since parent queuer expects that only one + * skb will be queued. + */ + if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) { + struct Qdisc *rootq = qdisc_root(sch); + u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ + q->duplicate = 0; + + qdisc_enqueue_root(skb2, rootq); + q->duplicate = dupsave; + } + + /* + * Randomized packet corruption. + * Make copy if needed since we are modifying + * If packet is going to be hardware checksummed, then + * do it now in software before we mangle it. + */ + if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) { + if (!(skb = skb_unshare(skb, GFP_ATOMIC)) || + (skb->ip_summed == CHECKSUM_PARTIAL && + skb_checksum_help(skb))) + return qdisc_drop(skb, sch); + + skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8); + } + + cb = netem_skb_cb(skb); + if (q->gap == 0 || /* not doing reordering */ + q->counter < q->gap || /* inside last reordering gap */ + q->reorder < get_crandom(&q->reorder_cor)) { + psched_time_t now; + psched_tdiff_t delay; + + delay = tabledist(q->latency, q->jitter, + &q->delay_cor, q->delay_dist); + + now = psched_get_time(); + cb->time_to_send = now + delay; + ++q->counter; + ret = qdisc_enqueue(skb, q->qdisc); + } else { + /* + * Do re-ordering by putting one out of N packets at the front + * of the queue. 
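+ * (A packet is queued at the head, with no extra delay, only when the + * gap count has been reached and the reorder probability check also + * succeeds; q->counter then restarts from zero.)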
+ */ + cb->time_to_send = psched_get_time(); + q->counter = 0; + + __skb_queue_head(&q->qdisc->q, skb); + sch->qstats.backlog += qdisc_pkt_len(skb); + sch->qstats.requeues++; + ret = NET_XMIT_SUCCESS; + } + + if (ret != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(ret)) { + sch->qstats.drops++; + return ret; + } + } + + sch->q.qlen++; + return NET_XMIT_SUCCESS; +} + +static unsigned int netem_drop(struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + unsigned int len = 0; + + if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) { + sch->q.qlen--; + sch->qstats.drops++; + } + return len; +} + +static struct sk_buff *netem_dequeue(struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + + if (qdisc_is_throttled(sch)) + return NULL; + + skb = q->qdisc->ops->peek(q->qdisc); + if (skb) { + const struct netem_skb_cb *cb = netem_skb_cb(skb); + psched_time_t now = psched_get_time(); + + /* if more time remaining? */ + if (cb->time_to_send <= now) { + skb = qdisc_dequeue_peeked(q->qdisc); + if (unlikely(!skb)) + return NULL; + +#ifdef CONFIG_NET_CLS_ACT + /* + * If it's at ingress let's pretend the delay is + * from the network (tstamp will be updated). + */ + if (G_TC_FROM(skb->tc_verd) & AT_INGRESS) + skb->tstamp.tv64 = 0; +#endif + + sch->q.qlen--; + qdisc_unthrottled(sch); + qdisc_bstats_update(sch, skb); + return skb; + } + + qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send); + } + + return NULL; +} + +static void netem_reset(struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + + qdisc_reset(q->qdisc); + sch->q.qlen = 0; + qdisc_watchdog_cancel(&q->watchdog); +} + +static void dist_free(struct disttable *d) +{ + if (d) { + if (is_vmalloc_addr(d)) + vfree(d); + else + kfree(d); + } +} + +/* + * Distribution data is a variable size payload containing + * signed 16 bit values. 
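+ * These samples are what tabledist() above consumes; as noted in the + * header comment they can describe normal, Pareto, or experimental + * curves. The new table is installed and the old one freed under the + * root qdisc lock, so readers of q->delay_dist never see a half-built + * table.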
+ */ +static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) +{ + struct netem_sched_data *q = qdisc_priv(sch); + size_t n = nla_len(attr)/sizeof(__s16); + const __s16 *data = nla_data(attr); + spinlock_t *root_lock; + struct disttable *d; + int i; + size_t s; + + if (n > NETEM_DIST_MAX) + return -EINVAL; + + s = sizeof(struct disttable) + n * sizeof(s16); + d = kmalloc(s, GFP_KERNEL); + if (!d) + d = vmalloc(s); + if (!d) + return -ENOMEM; + + d->size = n; + for (i = 0; i < n; i++) + d->table[i] = data[i]; + + root_lock = qdisc_root_sleeping_lock(sch); + + spin_lock_bh(root_lock); + dist_free(q->delay_dist); + q->delay_dist = d; + spin_unlock_bh(root_lock); + return 0; +} + +static void get_correlation(struct Qdisc *sch, const struct nlattr *attr) +{ + struct netem_sched_data *q = qdisc_priv(sch); + const struct tc_netem_corr *c = nla_data(attr); + + init_crandom(&q->delay_cor, c->delay_corr); + init_crandom(&q->loss_cor, c->loss_corr); + init_crandom(&q->dup_cor, c->dup_corr); +} + +static void get_reorder(struct Qdisc *sch, const struct nlattr *attr) +{ + struct netem_sched_data *q = qdisc_priv(sch); + const struct tc_netem_reorder *r = nla_data(attr); + + q->reorder = r->probability; + init_crandom(&q->reorder_cor, r->correlation); +} + +static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr) +{ + struct netem_sched_data *q = qdisc_priv(sch); + const struct tc_netem_corrupt *r = nla_data(attr); + + q->corrupt = r->probability; + init_crandom(&q->corrupt_cor, r->correlation); +} + +static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr) +{ + struct netem_sched_data *q = qdisc_priv(sch); + const struct nlattr *la; + int rem; + + nla_for_each_nested(la, attr, rem) { + u16 type = nla_type(la); + + switch(type) { + case NETEM_LOSS_GI: { + const struct tc_netem_gimodel *gi = nla_data(la); + + if (nla_len(la) != sizeof(struct tc_netem_gimodel)) { + pr_info("netem: incorrect gi model size\n"); + return -EINVAL; + } + + q->loss_model = CLG_4_STATES; + + q->clg.state = 1; + q->clg.a1 = gi->p13; + q->clg.a2 = gi->p31; + q->clg.a3 = gi->p32; + q->clg.a4 = gi->p14; + q->clg.a5 = gi->p23; + break; + } + + case NETEM_LOSS_GE: { + const struct tc_netem_gemodel *ge = nla_data(la); + + if (nla_len(la) != sizeof(struct tc_netem_gemodel)) { + pr_info("netem: incorrect gi model size\n"); + return -EINVAL; + } + + q->loss_model = CLG_GILB_ELL; + q->clg.state = 1; + q->clg.a1 = ge->p; + q->clg.a2 = ge->r; + q->clg.a3 = ge->h; + q->clg.a4 = ge->k1; + break; + } + + default: + pr_info("netem: unknown loss type %u\n", type); + return -EINVAL; + } + } + + return 0; +} + +static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { + [TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) }, + [TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) }, + [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) }, + [TCA_NETEM_LOSS] = { .type = NLA_NESTED }, +}; + +static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, + const struct nla_policy *policy, int len) +{ + int nested_len = nla_len(nla) - NLA_ALIGN(len); + + if (nested_len < 0) { + pr_info("netem: invalid attributes len %d\n", nested_len); + return -EINVAL; + } + + if (nested_len >= nla_attr_size(0)) + return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len), + nested_len, policy); + + memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); + return 0; +} + +/* Parse netlink message to set options */ +static int netem_change(struct Qdisc *sch, struct nlattr *opt) +{ 
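+ /* The fixed tc_netem_qopt header sits at the start of the TCA_OPTIONS + * payload; parse_attr() above walks the nested attributes (correlation, + * delay distribution, reorder, corrupt, loss model) that follow it. + */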
+ struct netem_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_NETEM_MAX + 1]; + struct tc_netem_qopt *qopt; + int ret; + + if (opt == NULL) + return -EINVAL; + + qopt = nla_data(opt); + ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt)); + if (ret < 0) + return ret; + + ret = fifo_set_limit(q->qdisc, qopt->limit); + if (ret) { + pr_info("netem: can't set fifo limit\n"); + return ret; + } + + q->latency = qopt->latency; + q->jitter = qopt->jitter; + q->limit = qopt->limit; + q->gap = qopt->gap; + q->counter = 0; + q->loss = qopt->loss; + q->duplicate = qopt->duplicate; + + /* for compatibility with earlier versions. + * if gap is set, need to assume 100% probability + */ + if (q->gap) + q->reorder = ~0; + + if (tb[TCA_NETEM_CORR]) + get_correlation(sch, tb[TCA_NETEM_CORR]); + + if (tb[TCA_NETEM_DELAY_DIST]) { + ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]); + if (ret) + return ret; + } + + if (tb[TCA_NETEM_REORDER]) + get_reorder(sch, tb[TCA_NETEM_REORDER]); + + if (tb[TCA_NETEM_CORRUPT]) + get_corrupt(sch, tb[TCA_NETEM_CORRUPT]); + + q->loss_model = CLG_RANDOM; + if (tb[TCA_NETEM_LOSS]) + ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]); + + return ret; +} + +/* + * Special case version of FIFO queue for use by netem. + * It queues in order based on timestamps in skb's + */ +struct fifo_sched_data { + u32 limit; + psched_time_t oldest; +}; + +static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) +{ + struct fifo_sched_data *q = qdisc_priv(sch); + struct sk_buff_head *list = &sch->q; + psched_time_t tnext = netem_skb_cb(nskb)->time_to_send; + struct sk_buff *skb; + + if (likely(skb_queue_len(list) < q->limit)) { + /* Optimize for add at tail */ + if (likely(skb_queue_empty(list) || tnext >= q->oldest)) { + q->oldest = tnext; + return qdisc_enqueue_tail(nskb, sch); + } + + skb_queue_reverse_walk(list, skb) { + const struct netem_skb_cb *cb = netem_skb_cb(skb); + + if (tnext >= cb->time_to_send) + break; + } + + __skb_queue_after(list, skb, nskb); + + sch->qstats.backlog += qdisc_pkt_len(nskb); + + return NET_XMIT_SUCCESS; + } + + return qdisc_reshape_fail(nskb, sch); +} + +static int tfifo_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct fifo_sched_data *q = qdisc_priv(sch); + + if (opt) { + struct tc_fifo_qopt *ctl = nla_data(opt); + if (nla_len(opt) < sizeof(*ctl)) + return -EINVAL; + + q->limit = ctl->limit; + } else + q->limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1); + + q->oldest = PSCHED_PASTPERFECT; + return 0; +} + +static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct fifo_sched_data *q = qdisc_priv(sch); + struct tc_fifo_qopt opt = { .limit = q->limit }; + + NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + return skb->len; + +nla_put_failure: + return -1; +} + +static struct Qdisc_ops tfifo_qdisc_ops __read_mostly = { + .id = "tfifo", + .priv_size = sizeof(struct fifo_sched_data), + .enqueue = tfifo_enqueue, + .dequeue = qdisc_dequeue_head, + .peek = qdisc_peek_head, + .drop = qdisc_queue_drop, + .init = tfifo_init, + .reset = qdisc_reset_queue, + .change = tfifo_init, + .dump = tfifo_dump, +}; + +static int netem_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct netem_sched_data *q = qdisc_priv(sch); + int ret; + + if (!opt) + return -EINVAL; + + qdisc_watchdog_init(&q->watchdog, sch); + + q->loss_model = CLG_RANDOM; + q->qdisc = qdisc_create_dflt(sch->dev_queue, &tfifo_qdisc_ops, + TC_H_MAKE(sch->handle, 1)); + if (!q->qdisc) { + pr_notice("netem: qdisc create tfifo qdisc failed\n"); + return -ENOMEM; 
+ } + + ret = netem_change(sch, opt); + if (ret) { + pr_info("netem: change failed\n"); + qdisc_destroy(q->qdisc); + } + return ret; +} + +static void netem_destroy(struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + + qdisc_watchdog_cancel(&q->watchdog); + qdisc_destroy(q->qdisc); + dist_free(q->delay_dist); +} + +static int dump_loss_model(const struct netem_sched_data *q, + struct sk_buff *skb) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, TCA_NETEM_LOSS); + if (nest == NULL) + goto nla_put_failure; + + switch (q->loss_model) { + case CLG_RANDOM: + /* legacy loss model */ + nla_nest_cancel(skb, nest); + return 0; /* no data */ + + case CLG_4_STATES: { + struct tc_netem_gimodel gi = { + .p13 = q->clg.a1, + .p31 = q->clg.a2, + .p32 = q->clg.a3, + .p14 = q->clg.a4, + .p23 = q->clg.a5, + }; + + NLA_PUT(skb, NETEM_LOSS_GI, sizeof(gi), &gi); + break; + } + case CLG_GILB_ELL: { + struct tc_netem_gemodel ge = { + .p = q->clg.a1, + .r = q->clg.a2, + .h = q->clg.a3, + .k1 = q->clg.a4, + }; + + NLA_PUT(skb, NETEM_LOSS_GE, sizeof(ge), &ge); + break; + } + } + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + const struct netem_sched_data *q = qdisc_priv(sch); + struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb); + struct tc_netem_qopt qopt; + struct tc_netem_corr cor; + struct tc_netem_reorder reorder; + struct tc_netem_corrupt corrupt; + + qopt.latency = q->latency; + qopt.jitter = q->jitter; + qopt.limit = q->limit; + qopt.loss = q->loss; + qopt.gap = q->gap; + qopt.duplicate = q->duplicate; + NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); + + cor.delay_corr = q->delay_cor.rho; + cor.loss_corr = q->loss_cor.rho; + cor.dup_corr = q->dup_cor.rho; + NLA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor); + + reorder.probability = q->reorder; + reorder.correlation = q->reorder_cor.rho; + NLA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder); + + corrupt.probability = q->corrupt; + corrupt.correlation = q->corrupt_cor.rho; + NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt); + + if (dump_loss_model(q, skb) != 0) + goto nla_put_failure; + + return nla_nest_end(skb, nla); + +nla_put_failure: + nlmsg_trim(skb, nla); + return -1; +} + +static int netem_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct netem_sched_data *q = qdisc_priv(sch); + + if (cl != 1) /* only one class */ + return -ENOENT; + + tcm->tcm_handle |= TC_H_MIN(1); + tcm->tcm_info = q->qdisc->handle; + + return 0; +} + +static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct netem_sched_data *q = qdisc_priv(sch); + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = q->qdisc; + q->qdisc = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + sch_tree_unlock(sch); + + return 0; +} + +static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct netem_sched_data *q = qdisc_priv(sch); + return q->qdisc; +} + +static unsigned long netem_get(struct Qdisc *sch, u32 classid) +{ + return 1; +} + +static void netem_put(struct Qdisc *sch, unsigned long arg) +{ +} + +static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ + if (!walker->stop) { + if (walker->count >= walker->skip) + if (walker->fn(sch, 1, walker) < 0) { + walker->stop = 1; + return; + } + walker->count++; + } +} + +static 
const struct Qdisc_class_ops netem_class_ops = { + .graft = netem_graft, + .leaf = netem_leaf, + .get = netem_get, + .put = netem_put, + .walk = netem_walk, + .dump = netem_dump_class, +}; + +static struct Qdisc_ops netem_qdisc_ops __read_mostly = { + .id = "netem", + .cl_ops = &netem_class_ops, + .priv_size = sizeof(struct netem_sched_data), + .enqueue = netem_enqueue, + .dequeue = netem_dequeue, + .peek = qdisc_peek_dequeued, + .drop = netem_drop, + .init = netem_init, + .reset = netem_reset, + .destroy = netem_destroy, + .change = netem_change, + .dump = netem_dump, + .owner = THIS_MODULE, +}; + + +static int __init netem_module_init(void) +{ + pr_info("netem: version " VERSION "\n"); + return register_qdisc(&netem_qdisc_ops); +} +static void __exit netem_module_exit(void) +{ + unregister_qdisc(&netem_qdisc_ops); +} +module_init(netem_module_init) +module_exit(netem_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c new file mode 100644 index 00000000..b5d56a22 --- /dev/null +++ b/net/sched/sch_prio.c @@ -0,0 +1,405 @@ +/* + * net/sched/sch_prio.c Simple 3-band priority "scheduler". + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * Fixes: 19990609: J Hadi Salim : + * Init -- EINVAL when opt undefined + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +struct prio_sched_data { + int bands; + struct tcf_proto *filter_list; + u8 prio2band[TC_PRIO_MAX+1]; + struct Qdisc *queues[TCQ_PRIO_BANDS]; +}; + + +static struct Qdisc * +prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) +{ + struct prio_sched_data *q = qdisc_priv(sch); + u32 band = skb->priority; + struct tcf_result res; + int err; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + if (TC_H_MAJ(skb->priority) != sch->handle) { + err = tc_classify(skb, q->filter_list, &res); +#ifdef CONFIG_NET_CLS_ACT + switch (err) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return NULL; + } +#endif + if (!q->filter_list || err < 0) { + if (TC_H_MAJ(band)) + band = 0; + return q->queues[q->prio2band[band & TC_PRIO_MAX]]; + } + band = res.classid; + } + band = TC_H_MIN(band) - 1; + if (band >= q->bands) + return q->queues[q->prio2band[0]]; + + return q->queues[band]; +} + +static int +prio_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct Qdisc *qdisc; + int ret; + + qdisc = prio_classify(skb, sch, &ret); +#ifdef CONFIG_NET_CLS_ACT + if (qdisc == NULL) { + + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } +#endif + + ret = qdisc_enqueue(skb, qdisc); + if (ret == NET_XMIT_SUCCESS) { + sch->q.qlen++; + return NET_XMIT_SUCCESS; + } + if (net_xmit_drop_count(ret)) + sch->qstats.drops++; + return ret; +} + +static struct sk_buff *prio_peek(struct Qdisc *sch) +{ + struct prio_sched_data *q = qdisc_priv(sch); + int prio; + + for (prio = 0; prio < q->bands; prio++) { + struct Qdisc *qdisc = q->queues[prio]; + struct sk_buff *skb = qdisc->ops->peek(qdisc); + if (skb) + return skb; + } + return NULL; +} + +static struct sk_buff *prio_dequeue(struct Qdisc *sch) +{ + struct prio_sched_data *q = qdisc_priv(sch); + int prio; + + for (prio = 0; prio < q->bands; prio++) { + struct Qdisc *qdisc = 
q->queues[prio]; + struct sk_buff *skb = qdisc_dequeue_peeked(qdisc); + if (skb) { + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + return skb; + } + } + return NULL; + +} + +static unsigned int prio_drop(struct Qdisc *sch) +{ + struct prio_sched_data *q = qdisc_priv(sch); + int prio; + unsigned int len; + struct Qdisc *qdisc; + + for (prio = q->bands-1; prio >= 0; prio--) { + qdisc = q->queues[prio]; + if (qdisc->ops->drop && (len = qdisc->ops->drop(qdisc)) != 0) { + sch->q.qlen--; + return len; + } + } + return 0; +} + + +static void +prio_reset(struct Qdisc *sch) +{ + int prio; + struct prio_sched_data *q = qdisc_priv(sch); + + for (prio = 0; prio < q->bands; prio++) + qdisc_reset(q->queues[prio]); + sch->q.qlen = 0; +} + +static void +prio_destroy(struct Qdisc *sch) +{ + int prio; + struct prio_sched_data *q = qdisc_priv(sch); + + tcf_destroy_chain(&q->filter_list); + for (prio = 0; prio < q->bands; prio++) + qdisc_destroy(q->queues[prio]); +} + +static int prio_tune(struct Qdisc *sch, struct nlattr *opt) +{ + struct prio_sched_data *q = qdisc_priv(sch); + struct tc_prio_qopt *qopt; + int i; + + if (nla_len(opt) < sizeof(*qopt)) + return -EINVAL; + qopt = nla_data(opt); + + if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2) + return -EINVAL; + + for (i = 0; i <= TC_PRIO_MAX; i++) { + if (qopt->priomap[i] >= qopt->bands) + return -EINVAL; + } + + sch_tree_lock(sch); + q->bands = qopt->bands; + memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); + + for (i = q->bands; i < TCQ_PRIO_BANDS; i++) { + struct Qdisc *child = q->queues[i]; + q->queues[i] = &noop_qdisc; + if (child != &noop_qdisc) { + qdisc_tree_decrease_qlen(child, child->q.qlen); + qdisc_destroy(child); + } + } + sch_tree_unlock(sch); + + for (i = 0; i < q->bands; i++) { + if (q->queues[i] == &noop_qdisc) { + struct Qdisc *child, *old; + + child = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, + TC_H_MAKE(sch->handle, i + 1)); + if (child) { + sch_tree_lock(sch); + old = q->queues[i]; + q->queues[i] = child; + + if (old != &noop_qdisc) { + qdisc_tree_decrease_qlen(old, + old->q.qlen); + qdisc_destroy(old); + } + sch_tree_unlock(sch); + } + } + } + return 0; +} + +static int prio_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct prio_sched_data *q = qdisc_priv(sch); + int i; + + for (i = 0; i < TCQ_PRIO_BANDS; i++) + q->queues[i] = &noop_qdisc; + + if (opt == NULL) { + return -EINVAL; + } else { + int err; + + if ((err = prio_tune(sch, opt)) != 0) + return err; + } + return 0; +} + +static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct prio_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb_tail_pointer(skb); + struct tc_prio_qopt opt; + + opt.bands = q->bands; + memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX + 1); + + NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct prio_sched_data *q = qdisc_priv(sch); + unsigned long band = arg - 1; + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = q->queues[band]; + q->queues[band] = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + sch_tree_unlock(sch); + + return 0; +} + +static struct Qdisc * +prio_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct prio_sched_data *q = qdisc_priv(sch); + unsigned long band = arg - 1; + + return q->queues[band]; +} + +static unsigned long prio_get(struct Qdisc *sch, 
u32 classid) +{ + struct prio_sched_data *q = qdisc_priv(sch); + unsigned long band = TC_H_MIN(classid); + + if (band - 1 >= q->bands) + return 0; + return band; +} + +static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 classid) +{ + return prio_get(sch, classid); +} + + +static void prio_put(struct Qdisc *q, unsigned long cl) +{ +} + +static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, + struct tcmsg *tcm) +{ + struct prio_sched_data *q = qdisc_priv(sch); + + tcm->tcm_handle |= TC_H_MIN(cl); + tcm->tcm_info = q->queues[cl-1]->handle; + return 0; +} + +static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct prio_sched_data *q = qdisc_priv(sch); + struct Qdisc *cl_q; + + cl_q = q->queues[cl - 1]; + cl_q->qstats.qlen = cl_q->q.qlen; + if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 || + gnet_stats_copy_queue(d, &cl_q->qstats) < 0) + return -1; + + return 0; +} + +static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct prio_sched_data *q = qdisc_priv(sch); + int prio; + + if (arg->stop) + return; + + for (prio = 0; prio < q->bands; prio++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, prio + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static struct tcf_proto **prio_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct prio_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return &q->filter_list; +} + +static const struct Qdisc_class_ops prio_class_ops = { + .graft = prio_graft, + .leaf = prio_leaf, + .get = prio_get, + .put = prio_put, + .walk = prio_walk, + .tcf_chain = prio_find_tcf, + .bind_tcf = prio_bind, + .unbind_tcf = prio_put, + .dump = prio_dump_class, + .dump_stats = prio_dump_class_stats, +}; + +static struct Qdisc_ops prio_qdisc_ops __read_mostly = { + .next = NULL, + .cl_ops = &prio_class_ops, + .id = "prio", + .priv_size = sizeof(struct prio_sched_data), + .enqueue = prio_enqueue, + .dequeue = prio_dequeue, + .peek = prio_peek, + .drop = prio_drop, + .init = prio_init, + .reset = prio_reset, + .destroy = prio_destroy, + .change = prio_tune, + .dump = prio_dump, + .owner = THIS_MODULE, +}; + +static int __init prio_module_init(void) +{ + return register_qdisc(&prio_qdisc_ops); +} + +static void __exit prio_module_exit(void) +{ + unregister_qdisc(&prio_qdisc_ops); +} + +module_init(prio_module_init) +module_exit(prio_module_exit) + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c new file mode 100644 index 00000000..10334340 --- /dev/null +++ b/net/sched/sch_qfq.c @@ -0,0 +1,1137 @@ +/* + * net/sched/sch_qfq.c Quick Fair Queueing Scheduler. + * + * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Quick Fair Queueing + =================== + + Sources: + + Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient + Packet Scheduling with Tight Bandwidth Distribution Guarantees." + + See also: + http://retis.sssup.it/~fabio/linux/qfq/ + */ + +/* + + Virtual time computations. + + S, F and V are all computed in fixed point arithmetic with + FRAC_BITS decimal bits. + + QFQ_MAX_INDEX is the maximum index allowed for a group. 
We need + one bit per index. + QFQ_MAX_WSHIFT is the maximum power of two supported as a weight. + + The layout of the bits is as below: + + [ MTU_SHIFT ][ FRAC_BITS ] + [ MAX_INDEX ][ MIN_SLOT_SHIFT ] + ^.__grp->index = 0 + *.__grp->slot_shift + + where MIN_SLOT_SHIFT is derived by difference from the others. + + The max group index corresponds to Lmax/w_min, where + Lmax=1<group mapping. We allow class weights that are + * in the range [1, 2^MAX_WSHIFT], and we try to map each class i to the + * group with the smallest index that can support the L_i / r_i configured + * for the class. + * + * grp->index is the index of the group; and grp->slot_shift + * is the shift for the corresponding (scaled) sigma_i. + */ +#define QFQ_MAX_INDEX 19 +#define QFQ_MAX_WSHIFT 16 + +#define QFQ_MAX_WEIGHT (1<clhash, classid); + if (clc == NULL) + return NULL; + return container_of(clc, struct qfq_class, common); +} + +static void qfq_purge_queue(struct qfq_class *cl) +{ + unsigned int len = cl->qdisc->q.qlen; + + qdisc_reset(cl->qdisc); + qdisc_tree_decrease_qlen(cl->qdisc, len); +} + +static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = { + [TCA_QFQ_WEIGHT] = { .type = NLA_U32 }, + [TCA_QFQ_LMAX] = { .type = NLA_U32 }, +}; + +/* + * Calculate a flow index, given its weight and maximum packet length. + * index = log_2(maxlen/weight) but we need to apply the scaling. + * This is used only once at flow creation. + */ +static int qfq_calc_index(u32 inv_w, unsigned int maxlen) +{ + u64 slot_size = (u64)maxlen * inv_w; + unsigned long size_map; + int index = 0; + + size_map = slot_size >> QFQ_MIN_SLOT_SHIFT; + if (!size_map) + goto out; + + index = __fls(size_map) + 1; /* basically a log_2 */ + index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1))); + + if (index < 0) + index = 0; +out: + pr_debug("qfq calc_index: W = %lu, L = %u, I = %d\n", + (unsigned long) ONE_FP/inv_w, maxlen, index); + + return index; +} + +static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + struct nlattr **tca, unsigned long *arg) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl = (struct qfq_class *)*arg; + struct nlattr *tb[TCA_QFQ_MAX + 1]; + u32 weight, lmax, inv_w; + int i, err; + + if (tca[TCA_OPTIONS] == NULL) { + pr_notice("qfq: no options\n"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], qfq_policy); + if (err < 0) + return err; + + if (tb[TCA_QFQ_WEIGHT]) { + weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]); + if (!weight || weight > (1UL << QFQ_MAX_WSHIFT)) { + pr_notice("qfq: invalid weight %u\n", weight); + return -EINVAL; + } + } else + weight = 1; + + inv_w = ONE_FP / weight; + weight = ONE_FP / inv_w; + if (q->wsum + weight > QFQ_MAX_WSUM) { + pr_notice("qfq: total weight out of range (%u + %u)\n", + weight, q->wsum); + return -EINVAL; + } + + if (tb[TCA_QFQ_LMAX]) { + lmax = nla_get_u32(tb[TCA_QFQ_LMAX]); + if (!lmax || lmax > (1UL << QFQ_MTU_SHIFT)) { + pr_notice("qfq: invalid max length %u\n", lmax); + return -EINVAL; + } + } else + lmax = 1UL << QFQ_MTU_SHIFT; + + if (cl != NULL) { + if (tca[TCA_RATE]) { + err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) + return err; + } + + sch_tree_lock(sch); + if (tb[TCA_QFQ_WEIGHT]) { + q->wsum = weight - ONE_FP / cl->inv_w; + cl->inv_w = inv_w; + } + sch_tree_unlock(sch); + + return 0; + } + + cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL); + if (cl == NULL) + return -ENOBUFS; + + cl->refcnt = 1; + 
cl->common.classid = classid; + cl->lmax = lmax; + cl->inv_w = inv_w; + i = qfq_calc_index(cl->inv_w, cl->lmax); + + cl->grp = &q->groups[i]; + q->wsum += weight; + + cl->qdisc = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, classid); + if (cl->qdisc == NULL) + cl->qdisc = &noop_qdisc; + + if (tca[TCA_RATE]) { + err = gen_new_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) { + qdisc_destroy(cl->qdisc); + kfree(cl); + return err; + } + } + + sch_tree_lock(sch); + qdisc_class_hash_insert(&q->clhash, &cl->common); + sch_tree_unlock(sch); + + qdisc_class_hash_grow(sch, &q->clhash); + + *arg = (unsigned long)cl; + return 0; +} + +static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl) +{ + struct qfq_sched *q = qdisc_priv(sch); + + if (cl->inv_w) { + q->wsum -= ONE_FP / cl->inv_w; + cl->inv_w = 0; + } + + gen_kill_estimator(&cl->bstats, &cl->rate_est); + qdisc_destroy(cl->qdisc); + kfree(cl); +} + +static int qfq_delete_class(struct Qdisc *sch, unsigned long arg) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl = (struct qfq_class *)arg; + + if (cl->filter_cnt > 0) + return -EBUSY; + + sch_tree_lock(sch); + + qfq_purge_queue(cl); + qdisc_class_hash_remove(&q->clhash, &cl->common); + + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). + */ + + sch_tree_unlock(sch); + return 0; +} + +static unsigned long qfq_get_class(struct Qdisc *sch, u32 classid) +{ + struct qfq_class *cl = qfq_find_class(sch, classid); + + if (cl != NULL) + cl->refcnt++; + + return (unsigned long)cl; +} + +static void qfq_put_class(struct Qdisc *sch, unsigned long arg) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + + if (--cl->refcnt == 0) + qfq_destroy_class(sch, cl); +} + +static struct tcf_proto **qfq_tcf_chain(struct Qdisc *sch, unsigned long cl) +{ + struct qfq_sched *q = qdisc_priv(sch); + + if (cl) + return NULL; + + return &q->filter_list; +} + +static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + struct qfq_class *cl = qfq_find_class(sch, classid); + + if (cl != NULL) + cl->filter_cnt++; + + return (unsigned long)cl; +} + +static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + + cl->filter_cnt--; +} + +static int qfq_graft_class(struct Qdisc *sch, unsigned long arg, + struct Qdisc *new, struct Qdisc **old) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + + if (new == NULL) { + new = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, cl->common.classid); + if (new == NULL) + new = &noop_qdisc; + } + + sch_tree_lock(sch); + qfq_purge_queue(cl); + *old = cl->qdisc; + cl->qdisc = new; + sch_tree_unlock(sch); + return 0; +} + +static struct Qdisc *qfq_class_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + + return cl->qdisc; +} + +static int qfq_dump_class(struct Qdisc *sch, unsigned long arg, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + struct nlattr *nest; + + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle = cl->common.classid; + tcm->tcm_info = cl->qdisc->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + NLA_PUT_U32(skb, TCA_QFQ_WEIGHT, ONE_FP/cl->inv_w); + NLA_PUT_U32(skb, TCA_QFQ_LMAX, cl->lmax); + return nla_nest_end(skb, nest); + 
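+ /* Editorial aside (comment only): each NLA_PUT*() above expands to a
+  * goto nla_put_failure when the skb runs out of tailroom, which is what
+  * the label below catches.
+  */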
+nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, + struct gnet_dump *d) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + struct tc_qfq_stats xstats; + + memset(&xstats, 0, sizeof(xstats)); + cl->qdisc->qstats.qlen = cl->qdisc->q.qlen; + + xstats.weight = ONE_FP/cl->inv_w; + xstats.lmax = cl->lmax; + + if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || + gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0) + return -1; + + return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); +} + +static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl; + struct hlist_node *n; + unsigned int i; + + if (arg->stop) + return; + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, (unsigned long)cl, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } +} + +static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl; + struct tcf_result res; + int result; + + if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) { + pr_debug("qfq_classify: found %d\n", skb->priority); + cl = qfq_find_class(sch, skb->priority); + if (cl != NULL) + return cl; + } + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + result = tc_classify(skb, q->filter_list, &res); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_QUEUED: + case TC_ACT_STOLEN: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return NULL; + } +#endif + cl = (struct qfq_class *)res.class; + if (cl == NULL) + cl = qfq_find_class(sch, res.classid); + return cl; + } + + return NULL; +} + +/* Generic comparison function, handling wraparound. */ +static inline int qfq_gt(u64 a, u64 b) +{ + return (s64)(a - b) > 0; +} + +/* Round a precise timestamp to its slotted value. */ +static inline u64 qfq_round_down(u64 ts, unsigned int shift) +{ + return ts & ~((1ULL << shift) - 1); +} + +/* return the pointer to the group with lowest index in the bitmap */ +static inline struct qfq_group *qfq_ffs(struct qfq_sched *q, + unsigned long bitmap) +{ + int index = __ffs(bitmap); + return &q->groups[index]; +} +/* Calculate a mask to mimic what would be ffs_from(). 
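+ * Editorial example: mask_from(0x2c, 3) == 0x28 -- the bits below
+ * 'from' are cleared while the rest of the bitmap is preserved.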
*/ +static inline unsigned long mask_from(unsigned long bitmap, int from) +{ + return bitmap & ~((1UL << from) - 1); +} + +/* + * The state computation relies on ER=0, IR=1, EB=2, IB=3 + * First compute eligibility comparing grp->S, q->V, + * then check if someone is blocking us and possibly add EB + */ +static int qfq_calc_state(struct qfq_sched *q, const struct qfq_group *grp) +{ + /* if S > V we are not eligible */ + unsigned int state = qfq_gt(grp->S, q->V); + unsigned long mask = mask_from(q->bitmaps[ER], grp->index); + struct qfq_group *next; + + if (mask) { + next = qfq_ffs(q, mask); + if (qfq_gt(grp->F, next->F)) + state |= EB; + } + + return state; +} + + +/* + * In principle + * q->bitmaps[dst] |= q->bitmaps[src] & mask; + * q->bitmaps[src] &= ~mask; + * but we should make sure that src != dst + */ +static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask, + int src, int dst) +{ + q->bitmaps[dst] |= q->bitmaps[src] & mask; + q->bitmaps[src] &= ~mask; +} + +static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F) +{ + unsigned long mask = mask_from(q->bitmaps[ER], index + 1); + struct qfq_group *next; + + if (mask) { + next = qfq_ffs(q, mask); + if (!qfq_gt(next->F, old_F)) + return; + } + + mask = (1UL << index) - 1; + qfq_move_groups(q, mask, EB, ER); + qfq_move_groups(q, mask, IB, IR); +} + +/* + * perhaps + * + old_V ^= q->V; + old_V >>= QFQ_MIN_SLOT_SHIFT; + if (old_V) { + ... + } + * + */ +static void qfq_make_eligible(struct qfq_sched *q, u64 old_V) +{ + unsigned long vslot = q->V >> QFQ_MIN_SLOT_SHIFT; + unsigned long old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT; + + if (vslot != old_vslot) { + unsigned long mask = (1UL << fls(vslot ^ old_vslot)) - 1; + qfq_move_groups(q, mask, IR, ER); + qfq_move_groups(q, mask, IB, EB); + } +} + + +/* + * XXX we should make sure that slot becomes less than 32. + * This is guaranteed by the input values. + * roundedS is always cl->S rounded on grp->slot_shift bits. + */ +static void qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, + u64 roundedS) +{ + u64 slot = (roundedS - grp->S) >> grp->slot_shift; + unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS; + + hlist_add_head(&cl->next, &grp->slots[i]); + __set_bit(slot, &grp->full_slots); +} + +/* Maybe introduce hlist_first_entry?? */ +static struct qfq_class *qfq_slot_head(struct qfq_group *grp) +{ + return hlist_entry(grp->slots[grp->front].first, + struct qfq_class, next); +} + +/* + * remove the entry from the slot + */ +static void qfq_front_slot_remove(struct qfq_group *grp) +{ + struct qfq_class *cl = qfq_slot_head(grp); + + BUG_ON(!cl); + hlist_del(&cl->next); + if (hlist_empty(&grp->slots[grp->front])) + __clear_bit(0, &grp->full_slots); +} + +/* + * Returns the first full queue in a group. As a side effect, + * adjust the bucket list so the first non-empty bucket is at + * position 0 in full_slots. + */ +static struct qfq_class *qfq_slot_scan(struct qfq_group *grp) +{ + unsigned int i; + + pr_debug("qfq slot_scan: grp %u full %#lx\n", + grp->index, grp->full_slots); + + if (grp->full_slots == 0) + return NULL; + + i = __ffs(grp->full_slots); /* zero based */ + if (i > 0) { + grp->front = (grp->front + i) % QFQ_MAX_SLOTS; + grp->full_slots >>= i; + } + + return qfq_slot_head(grp); +} + +/* + * adjust the bucket list. When the start time of a group decreases, + * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to + * move the objects. 
The mask of occupied slots must be shifted + * because we use ffs() to find the first non-empty slot. + * This covers decreases in the group's start time, but what about + * increases of the start time ? + * Here too we should make sure that i is less than 32 + */ +static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS) +{ + unsigned int i = (grp->S - roundedS) >> grp->slot_shift; + + grp->full_slots <<= i; + grp->front = (grp->front - i) % QFQ_MAX_SLOTS; +} + +static void qfq_update_eligible(struct qfq_sched *q, u64 old_V) +{ + struct qfq_group *grp; + unsigned long ineligible; + + ineligible = q->bitmaps[IR] | q->bitmaps[IB]; + if (ineligible) { + if (!q->bitmaps[ER]) { + grp = qfq_ffs(q, ineligible); + if (qfq_gt(grp->S, q->V)) + q->V = grp->S; + } + qfq_make_eligible(q, old_V); + } +} + +/* What is length of next packet in queue (0 if queue is empty) */ +static unsigned int qdisc_peek_len(struct Qdisc *sch) +{ + struct sk_buff *skb; + + skb = sch->ops->peek(sch); + return skb ? qdisc_pkt_len(skb) : 0; +} + +/* + * Updates the class, returns true if also the group needs to be updated. + */ +static bool qfq_update_class(struct qfq_group *grp, struct qfq_class *cl) +{ + unsigned int len = qdisc_peek_len(cl->qdisc); + + cl->S = cl->F; + if (!len) + qfq_front_slot_remove(grp); /* queue is empty */ + else { + u64 roundedS; + + cl->F = cl->S + (u64)len * cl->inv_w; + roundedS = qfq_round_down(cl->S, grp->slot_shift); + if (roundedS == grp->S) + return false; + + qfq_front_slot_remove(grp); + qfq_slot_insert(grp, cl, roundedS); + } + + return true; +} + +static struct sk_buff *qfq_dequeue(struct Qdisc *sch) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_group *grp; + struct qfq_class *cl; + struct sk_buff *skb; + unsigned int len; + u64 old_V; + + if (!q->bitmaps[ER]) + return NULL; + + grp = qfq_ffs(q, q->bitmaps[ER]); + + cl = qfq_slot_head(grp); + skb = qdisc_dequeue_peeked(cl->qdisc); + if (!skb) { + WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n"); + return NULL; + } + + sch->q.qlen--; + qdisc_bstats_update(sch, skb); + + old_V = q->V; + len = qdisc_pkt_len(skb); + q->V += (u64)len * IWSUM; + pr_debug("qfq dequeue: len %u F %lld now %lld\n", + len, (unsigned long long) cl->F, (unsigned long long) q->V); + + if (qfq_update_class(grp, cl)) { + u64 old_F = grp->F; + + cl = qfq_slot_scan(grp); + if (!cl) + __clear_bit(grp->index, &q->bitmaps[ER]); + else { + u64 roundedS = qfq_round_down(cl->S, grp->slot_shift); + unsigned int s; + + if (grp->S == roundedS) + goto skip_unblock; + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); + __clear_bit(grp->index, &q->bitmaps[ER]); + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + } + + qfq_unblock_groups(q, grp->index, old_F); + } + +skip_unblock: + qfq_update_eligible(q, old_V); + + return skb; +} + +/* + * Assign a reasonable start time for a new flow k in group i. + * Admissible values for \hat(F) are multiples of \sigma_i + * no greater than V+\sigma_i . Larger values mean that + * we had a wraparound so we consider the timestamp to be stale. + * + * If F is not stale and F >= V then we set S = F. + * Otherwise we should assign S = V, but this may violate + * the ordering in ER. So, if we have groups in ER, set S to + * the F_j of the first group j which would be blocking us. + * We are guaranteed not to move S backward because + * otherwise our group i would still be blocked. 
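+ * (Editorial restatement: in the code below, F is treated as stale when
+ * it does not exceed V, or when its slot-rounded value exceeds
+ * qfq_round_down(V, slot_shift) plus one full slot.)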
+ */ +static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) +{ + unsigned long mask; + uint32_t limit, roundedF; + int slot_shift = cl->grp->slot_shift; + + roundedF = qfq_round_down(cl->F, slot_shift); + limit = qfq_round_down(q->V, slot_shift) + (1UL << slot_shift); + + if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) { + /* timestamp was stale */ + mask = mask_from(q->bitmaps[ER], cl->grp->index); + if (mask) { + struct qfq_group *next = qfq_ffs(q, mask); + if (qfq_gt(roundedF, next->F)) { + cl->S = next->F; + return; + } + } + cl->S = q->V; + } else /* timestamp is not stale */ + cl->S = cl->F; +} + +static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_group *grp; + struct qfq_class *cl; + int err; + u64 roundedS; + int s; + + cl = qfq_classify(skb, sch, &err); + if (cl == NULL) { + if (err & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return err; + } + pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid); + + err = qdisc_enqueue(skb, cl->qdisc); + if (unlikely(err != NET_XMIT_SUCCESS)) { + pr_debug("qfq_enqueue: enqueue failed %d\n", err); + if (net_xmit_drop_count(err)) { + cl->qstats.drops++; + sch->qstats.drops++; + } + return err; + } + + bstats_update(&cl->bstats, skb); + ++sch->q.qlen; + + /* If the new skb is not the head of queue, then done here. */ + if (cl->qdisc->q.qlen != 1) + return err; + + /* If reach this point, queue q was idle */ + grp = cl->grp; + qfq_update_start(q, cl); + + /* compute new finish time and rounded start. */ + cl->F = cl->S + (u64)qdisc_pkt_len(skb) * cl->inv_w; + roundedS = qfq_round_down(cl->S, grp->slot_shift); + + /* + * insert cl in the correct bucket. + * If cl->S >= grp->S we don't need to adjust the + * bucket list and simply go to the insertion phase. + * Otherwise grp->S is decreasing, we must make room + * in the bucket list, and also recompute the group state. + * Finally, if there were no flows in this group and nobody + * was in ER make sure to adjust V. + */ + if (grp->full_slots) { + if (!qfq_gt(grp->S, cl->S)) + goto skip_update; + + /* create a slot for this cl->S */ + qfq_slot_rotate(grp, roundedS); + /* group was surely ineligible, remove */ + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[IB]); + } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V)) + q->V = roundedS; + + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + + pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n", + s, q->bitmaps[s], + (unsigned long long) cl->S, + (unsigned long long) cl->F, + (unsigned long long) q->V); + +skip_update: + qfq_slot_insert(grp, cl, roundedS); + + return err; +} + + +static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, + struct qfq_class *cl) +{ + unsigned int i, offset; + u64 roundedS; + + roundedS = qfq_round_down(cl->S, grp->slot_shift); + offset = (roundedS - grp->S) >> grp->slot_shift; + i = (grp->front + offset) % QFQ_MAX_SLOTS; + + hlist_del(&cl->next); + if (hlist_empty(&grp->slots[i])) + __clear_bit(offset, &grp->full_slots); +} + +/* + * called to forcibly destroy a queue. + * If the queue is not in the front bucket, or if it has + * other queues in the front bucket, we can simply remove + * the queue with no other side effects. + * Otherwise we must propagate the event up. 
+ */ +static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) +{ + struct qfq_group *grp = cl->grp; + unsigned long mask; + u64 roundedS; + int s; + + cl->F = cl->S; + qfq_slot_remove(q, grp, cl); + + if (!grp->full_slots) { + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[EB]); + __clear_bit(grp->index, &q->bitmaps[IB]); + + if (test_bit(grp->index, &q->bitmaps[ER]) && + !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) { + mask = q->bitmaps[ER] & ((1UL << grp->index) - 1); + if (mask) + mask = ~((1UL << __fls(mask)) - 1); + else + mask = ~0UL; + qfq_move_groups(q, mask, EB, ER); + qfq_move_groups(q, mask, IB, IR); + } + __clear_bit(grp->index, &q->bitmaps[ER]); + } else if (hlist_empty(&grp->slots[grp->front])) { + cl = qfq_slot_scan(grp); + roundedS = qfq_round_down(cl->S, grp->slot_shift); + if (grp->S != roundedS) { + __clear_bit(grp->index, &q->bitmaps[ER]); + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[EB]); + __clear_bit(grp->index, &q->bitmaps[IB]); + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + } + } + + qfq_update_eligible(q, q->V); +} + +static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl = (struct qfq_class *)arg; + + if (cl->qdisc->q.qlen == 0) + qfq_deactivate_class(q, cl); +} + +static unsigned int qfq_drop(struct Qdisc *sch) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_group *grp; + unsigned int i, j, len; + + for (i = 0; i <= QFQ_MAX_INDEX; i++) { + grp = &q->groups[i]; + for (j = 0; j < QFQ_MAX_SLOTS; j++) { + struct qfq_class *cl; + struct hlist_node *n; + + hlist_for_each_entry(cl, n, &grp->slots[j], next) { + + if (!cl->qdisc->ops->drop) + continue; + + len = cl->qdisc->ops->drop(cl->qdisc); + if (len > 0) { + sch->q.qlen--; + if (!cl->qdisc->q.qlen) + qfq_deactivate_class(q, cl); + + return len; + } + } + } + } + + return 0; +} + +static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_group *grp; + int i, j, err; + + err = qdisc_class_hash_init(&q->clhash); + if (err < 0) + return err; + + for (i = 0; i <= QFQ_MAX_INDEX; i++) { + grp = &q->groups[i]; + grp->index = i; + grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS + - (QFQ_MAX_INDEX - i); + for (j = 0; j < QFQ_MAX_SLOTS; j++) + INIT_HLIST_HEAD(&grp->slots[j]); + } + + return 0; +} + +static void qfq_reset_qdisc(struct Qdisc *sch) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_group *grp; + struct qfq_class *cl; + struct hlist_node *n, *tmp; + unsigned int i, j; + + for (i = 0; i <= QFQ_MAX_INDEX; i++) { + grp = &q->groups[i]; + for (j = 0; j < QFQ_MAX_SLOTS; j++) { + hlist_for_each_entry_safe(cl, n, tmp, + &grp->slots[j], next) { + qfq_deactivate_class(q, cl); + } + } + } + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) + qdisc_reset(cl->qdisc); + } + sch->q.qlen = 0; +} + +static void qfq_destroy_qdisc(struct Qdisc *sch) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl; + struct hlist_node *n, *next; + unsigned int i; + + tcf_destroy_chain(&q->filter_list); + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i], + common.hnode) { + qfq_destroy_class(sch, cl); + } + } + qdisc_class_hash_destroy(&q->clhash); +} + +static const struct 
Qdisc_class_ops qfq_class_ops = { + .change = qfq_change_class, + .delete = qfq_delete_class, + .get = qfq_get_class, + .put = qfq_put_class, + .tcf_chain = qfq_tcf_chain, + .bind_tcf = qfq_bind_tcf, + .unbind_tcf = qfq_unbind_tcf, + .graft = qfq_graft_class, + .leaf = qfq_class_leaf, + .qlen_notify = qfq_qlen_notify, + .dump = qfq_dump_class, + .dump_stats = qfq_dump_class_stats, + .walk = qfq_walk, +}; + +static struct Qdisc_ops qfq_qdisc_ops __read_mostly = { + .cl_ops = &qfq_class_ops, + .id = "qfq", + .priv_size = sizeof(struct qfq_sched), + .enqueue = qfq_enqueue, + .dequeue = qfq_dequeue, + .peek = qdisc_peek_dequeued, + .drop = qfq_drop, + .init = qfq_init_qdisc, + .reset = qfq_reset_qdisc, + .destroy = qfq_destroy_qdisc, + .owner = THIS_MODULE, +}; + +static int __init qfq_init(void) +{ + return register_qdisc(&qfq_qdisc_ops); +} + +static void __exit qfq_exit(void) +{ + unregister_qdisc(&qfq_qdisc_ops); +} + +module_init(qfq_init); +module_exit(qfq_exit); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c new file mode 100644 index 00000000..6649463d --- /dev/null +++ b/net/sched/sch_red.c @@ -0,0 +1,359 @@ +/* + * net/sched/sch_red.c Random Early Detection queue. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * + * Changes: + * J Hadi Salim 980914: computation fixes + * Alexey Makarenko 990814: qave on idle link was calculated incorrectly. + * J Hadi Salim 980816: ECN support + */ + +#include +#include +#include +#include +#include +#include +#include + + +/* Parameters, settable by user: + ----------------------------- + + limit - bytes (must be > qth_max + burst) + + Hard limit on queue length, should be chosen >qth_max + to allow packet bursts. This parameter does not + affect the algorithms behaviour and can be chosen + arbitrarily high (well, less than ram size) + Really, this limit will never be reached + if RED works correctly. 
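+
+	Editorial illustration (device and parameter values are only a
+	plausible sketch, not a recommendation):
+
+	tc qdisc add dev eth0 root red limit 400000 min 30000 max 90000 \
+		avpkt 1000 burst 55 ecn probability 0.02 bandwidth 10mbit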
+ */ + +struct red_sched_data { + u32 limit; /* HARD maximal queue length */ + unsigned char flags; + struct red_parms parms; + struct red_stats stats; + struct Qdisc *qdisc; +}; + +static inline int red_use_ecn(struct red_sched_data *q) +{ + return q->flags & TC_RED_ECN; +} + +static inline int red_use_harddrop(struct red_sched_data *q) +{ + return q->flags & TC_RED_HARDDROP; +} + +static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct red_sched_data *q = qdisc_priv(sch); + struct Qdisc *child = q->qdisc; + int ret; + + q->parms.qavg = red_calc_qavg(&q->parms, child->qstats.backlog); + + if (red_is_idling(&q->parms)) + red_end_of_idle_period(&q->parms); + + switch (red_action(&q->parms, q->parms.qavg)) { + case RED_DONT_MARK: + break; + + case RED_PROB_MARK: + sch->qstats.overlimits++; + if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) { + q->stats.prob_drop++; + goto congestion_drop; + } + + q->stats.prob_mark++; + break; + + case RED_HARD_MARK: + sch->qstats.overlimits++; + if (red_use_harddrop(q) || !red_use_ecn(q) || + !INET_ECN_set_ce(skb)) { + q->stats.forced_drop++; + goto congestion_drop; + } + + q->stats.forced_mark++; + break; + } + + ret = qdisc_enqueue(skb, child); + if (likely(ret == NET_XMIT_SUCCESS)) { + sch->q.qlen++; + } else if (net_xmit_drop_count(ret)) { + q->stats.pdrop++; + sch->qstats.drops++; + } + return ret; + +congestion_drop: + qdisc_drop(skb, sch); + return NET_XMIT_CN; +} + +static struct sk_buff *red_dequeue(struct Qdisc *sch) +{ + struct sk_buff *skb; + struct red_sched_data *q = qdisc_priv(sch); + struct Qdisc *child = q->qdisc; + + skb = child->dequeue(child); + if (skb) { + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + } else { + if (!red_is_idling(&q->parms)) + red_start_of_idle_period(&q->parms); + } + return skb; +} + +static struct sk_buff *red_peek(struct Qdisc *sch) +{ + struct red_sched_data *q = qdisc_priv(sch); + struct Qdisc *child = q->qdisc; + + return child->ops->peek(child); +} + +static unsigned int red_drop(struct Qdisc *sch) +{ + struct red_sched_data *q = qdisc_priv(sch); + struct Qdisc *child = q->qdisc; + unsigned int len; + + if (child->ops->drop && (len = child->ops->drop(child)) > 0) { + q->stats.other++; + sch->qstats.drops++; + sch->q.qlen--; + return len; + } + + if (!red_is_idling(&q->parms)) + red_start_of_idle_period(&q->parms); + + return 0; +} + +static void red_reset(struct Qdisc *sch) +{ + struct red_sched_data *q = qdisc_priv(sch); + + qdisc_reset(q->qdisc); + sch->q.qlen = 0; + red_restart(&q->parms); +} + +static void red_destroy(struct Qdisc *sch) +{ + struct red_sched_data *q = qdisc_priv(sch); + qdisc_destroy(q->qdisc); +} + +static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { + [TCA_RED_PARMS] = { .len = sizeof(struct tc_red_qopt) }, + [TCA_RED_STAB] = { .len = RED_STAB_SIZE }, +}; + +static int red_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct red_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_RED_MAX + 1]; + struct tc_red_qopt *ctl; + struct Qdisc *child = NULL; + int err; + + if (opt == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_RED_MAX, opt, red_policy); + if (err < 0) + return err; + + if (tb[TCA_RED_PARMS] == NULL || + tb[TCA_RED_STAB] == NULL) + return -EINVAL; + + ctl = nla_data(tb[TCA_RED_PARMS]); + + if (ctl->limit > 0) { + child = fifo_create_dflt(sch, &bfifo_qdisc_ops, ctl->limit); + if (IS_ERR(child)) + return PTR_ERR(child); + } + + sch_tree_lock(sch); + q->flags = ctl->flags; + q->limit = ctl->limit; + if (child) { + 
qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen); + qdisc_destroy(q->qdisc); + q->qdisc = child; + } + + red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, + ctl->Plog, ctl->Scell_log, + nla_data(tb[TCA_RED_STAB])); + + if (skb_queue_empty(&sch->q)) + red_end_of_idle_period(&q->parms); + + sch_tree_unlock(sch); + return 0; +} + +static int red_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct red_sched_data *q = qdisc_priv(sch); + + q->qdisc = &noop_qdisc; + return red_change(sch, opt); +} + +static int red_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct red_sched_data *q = qdisc_priv(sch); + struct nlattr *opts = NULL; + struct tc_red_qopt opt = { + .limit = q->limit, + .flags = q->flags, + .qth_min = q->parms.qth_min >> q->parms.Wlog, + .qth_max = q->parms.qth_max >> q->parms.Wlog, + .Wlog = q->parms.Wlog, + .Plog = q->parms.Plog, + .Scell_log = q->parms.Scell_log, + }; + + sch->qstats.backlog = q->qdisc->qstats.backlog; + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + NLA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt); + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; +} + +static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct red_sched_data *q = qdisc_priv(sch); + struct tc_red_xstats st = { + .early = q->stats.prob_drop + q->stats.forced_drop, + .pdrop = q->stats.pdrop, + .other = q->stats.other, + .marked = q->stats.prob_mark + q->stats.forced_mark, + }; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static int red_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct red_sched_data *q = qdisc_priv(sch); + + tcm->tcm_handle |= TC_H_MIN(1); + tcm->tcm_info = q->qdisc->handle; + return 0; +} + +static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct red_sched_data *q = qdisc_priv(sch); + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = q->qdisc; + q->qdisc = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + sch_tree_unlock(sch); + return 0; +} + +static struct Qdisc *red_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct red_sched_data *q = qdisc_priv(sch); + return q->qdisc; +} + +static unsigned long red_get(struct Qdisc *sch, u32 classid) +{ + return 1; +} + +static void red_put(struct Qdisc *sch, unsigned long arg) +{ +} + +static void red_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ + if (!walker->stop) { + if (walker->count >= walker->skip) + if (walker->fn(sch, 1, walker) < 0) { + walker->stop = 1; + return; + } + walker->count++; + } +} + +static const struct Qdisc_class_ops red_class_ops = { + .graft = red_graft, + .leaf = red_leaf, + .get = red_get, + .put = red_put, + .walk = red_walk, + .dump = red_dump_class, +}; + +static struct Qdisc_ops red_qdisc_ops __read_mostly = { + .id = "red", + .priv_size = sizeof(struct red_sched_data), + .cl_ops = &red_class_ops, + .enqueue = red_enqueue, + .dequeue = red_dequeue, + .peek = red_peek, + .drop = red_drop, + .init = red_init, + .reset = red_reset, + .destroy = red_destroy, + .change = red_change, + .dump = red_dump, + .dump_stats = red_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init red_module_init(void) +{ + return register_qdisc(&red_qdisc_ops); +} + +static void __exit red_module_exit(void) +{ + unregister_qdisc(&red_qdisc_ops); +} + +module_init(red_module_init) +module_exit(red_module_exit) 
+ +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c new file mode 100644 index 00000000..47ee29fa --- /dev/null +++ b/net/sched/sch_sfb.c @@ -0,0 +1,708 @@ +/* + * net/sched/sch_sfb.c Stochastic Fair Blue + * + * Copyright (c) 2008-2011 Juliusz Chroboczek + * Copyright (c) 2011 Eric Dumazet + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * W. Feng, D. Kandlur, D. Saha, K. Shin. Blue: + * A New Class of Active Queue Management Algorithms. + * U. Michigan CSE-TR-387-99, April 1999. + * + * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level) + * This implementation uses L = 8 and N = 16 + * This permits us to split one 32bit hash (provided per packet by rxhash or + * external classifier) into 8 subhashes of 4 bits. + */ +#define SFB_BUCKET_SHIFT 4 +#define SFB_NUMBUCKETS (1 << SFB_BUCKET_SHIFT) /* N bins per Level */ +#define SFB_BUCKET_MASK (SFB_NUMBUCKETS - 1) +#define SFB_LEVELS (32 / SFB_BUCKET_SHIFT) /* L */ + +/* SFB algo uses a virtual queue, named "bin" */ +struct sfb_bucket { + u16 qlen; /* length of virtual queue */ + u16 p_mark; /* marking probability */ +}; + +/* We use a double buffering right before hash change + * (Section 4.4 of SFB reference : moving hash functions) + */ +struct sfb_bins { + u32 perturbation; /* jhash perturbation */ + struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS]; +}; + +struct sfb_sched_data { + struct Qdisc *qdisc; + struct tcf_proto *filter_list; + unsigned long rehash_interval; + unsigned long warmup_time; /* double buffering warmup time in jiffies */ + u32 max; + u32 bin_size; /* maximum queue length per bin */ + u32 increment; /* d1 */ + u32 decrement; /* d2 */ + u32 limit; /* HARD maximal queue length */ + u32 penalty_rate; + u32 penalty_burst; + u32 tokens_avail; + unsigned long rehash_time; + unsigned long token_time; + + u8 slot; /* current active bins (0 or 1) */ + bool double_buffering; + struct sfb_bins bins[2]; + + struct { + u32 earlydrop; + u32 penaltydrop; + u32 bucketdrop; + u32 queuedrop; + u32 childdrop; /* drops in child qdisc */ + u32 marked; /* ECN mark */ + } stats; +}; + +/* + * Each queued skb might be hashed on one or two bins + * We store in skb_cb the two hash values. + * (A zero value means double buffering was not used) + */ +struct sfb_skb_cb { + u32 hashes[2]; +}; + +static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct sfb_skb_cb)); + return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data; +} + +/* + * If using 'internal' SFB flow classifier, hash comes from skb rxhash + * If using external classifier, hash comes from the classid. + */ +static u32 sfb_hash(const struct sk_buff *skb, u32 slot) +{ + return sfb_skb_cb(skb)->hashes[slot]; +} + +/* Probabilities are coded as Q0.16 fixed-point values, + * with 0xFFFF representing 65535/65536 (almost 1.0) + * Addition and subtraction are saturating in [0, 65535] + */ +static u32 prob_plus(u32 p1, u32 p2) +{ + u32 res = p1 + p2; + + return min_t(u32, res, SFB_MAX_PROB); +} + +static u32 prob_minus(u32 p1, u32 p2) +{ + return p1 > p2 ? 
p1 - p2 : 0; +} + +static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q) +{ + int i; + struct sfb_bucket *b = &q->bins[slot].bins[0][0]; + + for (i = 0; i < SFB_LEVELS; i++) { + u32 hash = sfbhash & SFB_BUCKET_MASK; + + sfbhash >>= SFB_BUCKET_SHIFT; + if (b[hash].qlen < 0xFFFF) + b[hash].qlen++; + b += SFB_NUMBUCKETS; /* next level */ + } +} + +static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q) +{ + u32 sfbhash; + + sfbhash = sfb_hash(skb, 0); + if (sfbhash) + increment_one_qlen(sfbhash, 0, q); + + sfbhash = sfb_hash(skb, 1); + if (sfbhash) + increment_one_qlen(sfbhash, 1, q); +} + +static void decrement_one_qlen(u32 sfbhash, u32 slot, + struct sfb_sched_data *q) +{ + int i; + struct sfb_bucket *b = &q->bins[slot].bins[0][0]; + + for (i = 0; i < SFB_LEVELS; i++) { + u32 hash = sfbhash & SFB_BUCKET_MASK; + + sfbhash >>= SFB_BUCKET_SHIFT; + if (b[hash].qlen > 0) + b[hash].qlen--; + b += SFB_NUMBUCKETS; /* next level */ + } +} + +static void decrement_qlen(const struct sk_buff *skb, struct sfb_sched_data *q) +{ + u32 sfbhash; + + sfbhash = sfb_hash(skb, 0); + if (sfbhash) + decrement_one_qlen(sfbhash, 0, q); + + sfbhash = sfb_hash(skb, 1); + if (sfbhash) + decrement_one_qlen(sfbhash, 1, q); +} + +static void decrement_prob(struct sfb_bucket *b, struct sfb_sched_data *q) +{ + b->p_mark = prob_minus(b->p_mark, q->decrement); +} + +static void increment_prob(struct sfb_bucket *b, struct sfb_sched_data *q) +{ + b->p_mark = prob_plus(b->p_mark, q->increment); +} + +static void sfb_zero_all_buckets(struct sfb_sched_data *q) +{ + memset(&q->bins, 0, sizeof(q->bins)); +} + +/* + * compute max qlen, max p_mark, and avg p_mark + */ +static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_data *q) +{ + int i; + u32 qlen = 0, prob = 0, totalpm = 0; + const struct sfb_bucket *b = &q->bins[q->slot].bins[0][0]; + + for (i = 0; i < SFB_LEVELS * SFB_NUMBUCKETS; i++) { + if (qlen < b->qlen) + qlen = b->qlen; + totalpm += b->p_mark; + if (prob < b->p_mark) + prob = b->p_mark; + b++; + } + *prob_r = prob; + *avgpm_r = totalpm / (SFB_LEVELS * SFB_NUMBUCKETS); + return qlen; +} + + +static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q) +{ + q->bins[slot].perturbation = net_random(); +} + +static void sfb_swap_slot(struct sfb_sched_data *q) +{ + sfb_init_perturbation(q->slot, q); + q->slot ^= 1; + q->double_buffering = false; +} + +/* Non elastic flows are allowed to use part of the bandwidth, expressed + * in "penalty_rate" packets per second, with "penalty_burst" burst + */ +static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q) +{ + if (q->penalty_rate == 0 || q->penalty_burst == 0) + return true; + + if (q->tokens_avail < 1) { + unsigned long age = min(10UL * HZ, jiffies - q->token_time); + + q->tokens_avail = (age * q->penalty_rate) / HZ; + if (q->tokens_avail > q->penalty_burst) + q->tokens_avail = q->penalty_burst; + q->token_time = jiffies; + if (q->tokens_avail < 1) + return true; + } + + q->tokens_avail--; + return false; +} + +static bool sfb_classify(struct sk_buff *skb, struct sfb_sched_data *q, + int *qerr, u32 *salt) +{ + struct tcf_result res; + int result; + + result = tc_classify(skb, q->filter_list, &res); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return false; + } +#endif + *salt = TC_H_MIN(res.classid); + return true; + } + return false; +} + 
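+/*
+ * Editorial aside (illustration only): at enqueue time the 32 bit flow
+ * hash is consumed SFB_BUCKET_SHIFT (4) bits per level.  For example a
+ * hash of 0xA3C1F052 selects bin 0x2 at level 0, bin 0x5 at level 1,
+ * bin 0x0 at level 2, and so on up to bin 0xA at level 7, and the packet
+ * is accounted in every one of those SFB_LEVELS bins.
+ */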
+static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + + struct sfb_sched_data *q = qdisc_priv(sch); + struct Qdisc *child = q->qdisc; + int i; + u32 p_min = ~0; + u32 minqlen = ~0; + u32 r, slot, salt, sfbhash; + int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + + if (q->rehash_interval > 0) { + unsigned long limit = q->rehash_time + q->rehash_interval; + + if (unlikely(time_after(jiffies, limit))) { + sfb_swap_slot(q); + q->rehash_time = jiffies; + } else if (unlikely(!q->double_buffering && q->warmup_time > 0 && + time_after(jiffies, limit - q->warmup_time))) { + q->double_buffering = true; + } + } + + if (q->filter_list) { + /* If using external classifiers, get result and record it. */ + if (!sfb_classify(skb, q, &ret, &salt)) + goto other_drop; + } else { + salt = skb_get_rxhash(skb); + } + + slot = q->slot; + + sfbhash = jhash_1word(salt, q->bins[slot].perturbation); + if (!sfbhash) + sfbhash = 1; + sfb_skb_cb(skb)->hashes[slot] = sfbhash; + + for (i = 0; i < SFB_LEVELS; i++) { + u32 hash = sfbhash & SFB_BUCKET_MASK; + struct sfb_bucket *b = &q->bins[slot].bins[i][hash]; + + sfbhash >>= SFB_BUCKET_SHIFT; + if (b->qlen == 0) + decrement_prob(b, q); + else if (b->qlen >= q->bin_size) + increment_prob(b, q); + if (minqlen > b->qlen) + minqlen = b->qlen; + if (p_min > b->p_mark) + p_min = b->p_mark; + } + + slot ^= 1; + sfb_skb_cb(skb)->hashes[slot] = 0; + + if (unlikely(minqlen >= q->max || sch->q.qlen >= q->limit)) { + sch->qstats.overlimits++; + if (minqlen >= q->max) + q->stats.bucketdrop++; + else + q->stats.queuedrop++; + goto drop; + } + + if (unlikely(p_min >= SFB_MAX_PROB)) { + /* Inelastic flow */ + if (q->double_buffering) { + sfbhash = jhash_1word(salt, q->bins[slot].perturbation); + if (!sfbhash) + sfbhash = 1; + sfb_skb_cb(skb)->hashes[slot] = sfbhash; + + for (i = 0; i < SFB_LEVELS; i++) { + u32 hash = sfbhash & SFB_BUCKET_MASK; + struct sfb_bucket *b = &q->bins[slot].bins[i][hash]; + + sfbhash >>= SFB_BUCKET_SHIFT; + if (b->qlen == 0) + decrement_prob(b, q); + else if (b->qlen >= q->bin_size) + increment_prob(b, q); + } + } + if (sfb_rate_limit(skb, q)) { + sch->qstats.overlimits++; + q->stats.penaltydrop++; + goto drop; + } + goto enqueue; + } + + r = net_random() & SFB_MAX_PROB; + + if (unlikely(r < p_min)) { + if (unlikely(p_min > SFB_MAX_PROB / 2)) { + /* If we're marking that many packets, then either + * this flow is unresponsive, or we're badly congested. + * In either case, we want to start dropping packets. 
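+			 * The drop probability below ramps up linearly with the
+			 * excess marking probability: it is 0 when p_min is exactly
+			 * SFB_MAX_PROB / 2 and reaches 1 when p_min hits
+			 * SFB_MAX_PROB, which is what the
+			 * (p_min - SFB_MAX_PROB / 2) * 2 scaling computes.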
+ */ + if (r < (p_min - SFB_MAX_PROB / 2) * 2) { + q->stats.earlydrop++; + goto drop; + } + } + if (INET_ECN_set_ce(skb)) { + q->stats.marked++; + } else { + q->stats.earlydrop++; + goto drop; + } + } + +enqueue: + ret = qdisc_enqueue(skb, child); + if (likely(ret == NET_XMIT_SUCCESS)) { + sch->q.qlen++; + increment_qlen(skb, q); + } else if (net_xmit_drop_count(ret)) { + q->stats.childdrop++; + sch->qstats.drops++; + } + return ret; + +drop: + qdisc_drop(skb, sch); + return NET_XMIT_CN; +other_drop: + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; +} + +static struct sk_buff *sfb_dequeue(struct Qdisc *sch) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + struct Qdisc *child = q->qdisc; + struct sk_buff *skb; + + skb = child->dequeue(q->qdisc); + + if (skb) { + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + decrement_qlen(skb, q); + } + + return skb; +} + +static struct sk_buff *sfb_peek(struct Qdisc *sch) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + struct Qdisc *child = q->qdisc; + + return child->ops->peek(child); +} + +/* No sfb_drop -- impossible since the child doesn't return the dropped skb. */ + +static void sfb_reset(struct Qdisc *sch) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + qdisc_reset(q->qdisc); + sch->q.qlen = 0; + q->slot = 0; + q->double_buffering = false; + sfb_zero_all_buckets(q); + sfb_init_perturbation(0, q); +} + +static void sfb_destroy(struct Qdisc *sch) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + tcf_destroy_chain(&q->filter_list); + qdisc_destroy(q->qdisc); +} + +static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = { + [TCA_SFB_PARMS] = { .len = sizeof(struct tc_sfb_qopt) }, +}; + +static const struct tc_sfb_qopt sfb_default_ops = { + .rehash_interval = 600 * MSEC_PER_SEC, + .warmup_time = 60 * MSEC_PER_SEC, + .limit = 0, + .max = 25, + .bin_size = 20, + .increment = (SFB_MAX_PROB + 500) / 1000, /* 0.1 % */ + .decrement = (SFB_MAX_PROB + 3000) / 6000, + .penalty_rate = 10, + .penalty_burst = 20, +}; + +static int sfb_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + struct Qdisc *child; + struct nlattr *tb[TCA_SFB_MAX + 1]; + const struct tc_sfb_qopt *ctl = &sfb_default_ops; + u32 limit; + int err; + + if (opt) { + err = nla_parse_nested(tb, TCA_SFB_MAX, opt, sfb_policy); + if (err < 0) + return -EINVAL; + + if (tb[TCA_SFB_PARMS] == NULL) + return -EINVAL; + + ctl = nla_data(tb[TCA_SFB_PARMS]); + } + + limit = ctl->limit; + if (limit == 0) + limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1); + + child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit); + if (IS_ERR(child)) + return PTR_ERR(child); + + sch_tree_lock(sch); + + qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen); + qdisc_destroy(q->qdisc); + q->qdisc = child; + + q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval); + q->warmup_time = msecs_to_jiffies(ctl->warmup_time); + q->rehash_time = jiffies; + q->limit = limit; + q->increment = ctl->increment; + q->decrement = ctl->decrement; + q->max = ctl->max; + q->bin_size = ctl->bin_size; + q->penalty_rate = ctl->penalty_rate; + q->penalty_burst = ctl->penalty_burst; + q->tokens_avail = ctl->penalty_burst; + q->token_time = jiffies; + + q->slot = 0; + q->double_buffering = false; + sfb_zero_all_buckets(q); + sfb_init_perturbation(0, q); + sfb_init_perturbation(1, q); + + sch_tree_unlock(sch); + + return 0; +} + +static int sfb_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + 
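+	/* Start with the no-op child; sfb_change() below replaces it with
+	 * a pfifo sized from the configured limit (or from the device
+	 * tx_queue_len when no limit is given).
+	 */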
q->qdisc = &noop_qdisc; + return sfb_change(sch, opt); +} + +static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + struct tc_sfb_qopt opt = { + .rehash_interval = jiffies_to_msecs(q->rehash_interval), + .warmup_time = jiffies_to_msecs(q->warmup_time), + .limit = q->limit, + .max = q->max, + .bin_size = q->bin_size, + .increment = q->increment, + .decrement = q->decrement, + .penalty_rate = q->penalty_rate, + .penalty_burst = q->penalty_burst, + }; + + sch->qstats.backlog = q->qdisc->qstats.backlog; + opts = nla_nest_start(skb, TCA_OPTIONS); + NLA_PUT(skb, TCA_SFB_PARMS, sizeof(opt), &opt); + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -EMSGSIZE; +} + +static int sfb_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + struct tc_sfb_xstats st = { + .earlydrop = q->stats.earlydrop, + .penaltydrop = q->stats.penaltydrop, + .bucketdrop = q->stats.bucketdrop, + .queuedrop = q->stats.queuedrop, + .childdrop = q->stats.childdrop, + .marked = q->stats.marked, + }; + + st.maxqlen = sfb_compute_qlen(&st.maxprob, &st.avgprob, q); + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static int sfb_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + return -ENOSYS; +} + +static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = q->qdisc; + q->qdisc = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + sch_tree_unlock(sch); + return 0; +} + +static struct Qdisc *sfb_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + return q->qdisc; +} + +static unsigned long sfb_get(struct Qdisc *sch, u32 classid) +{ + return 1; +} + +static void sfb_put(struct Qdisc *sch, unsigned long arg) +{ +} + +static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + struct nlattr **tca, unsigned long *arg) +{ + return -ENOSYS; +} + +static int sfb_delete(struct Qdisc *sch, unsigned long cl) +{ + return -ENOSYS; +} + +static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ + if (!walker->stop) { + if (walker->count >= walker->skip) + if (walker->fn(sch, 1, walker) < 0) { + walker->stop = 1; + return; + } + walker->count++; + } +} + +static struct tcf_proto **sfb_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct sfb_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return &q->filter_list; +} + +static unsigned long sfb_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return 0; +} + + +static const struct Qdisc_class_ops sfb_class_ops = { + .graft = sfb_graft, + .leaf = sfb_leaf, + .get = sfb_get, + .put = sfb_put, + .change = sfb_change_class, + .delete = sfb_delete, + .walk = sfb_walk, + .tcf_chain = sfb_find_tcf, + .bind_tcf = sfb_bind, + .unbind_tcf = sfb_put, + .dump = sfb_dump_class, +}; + +static struct Qdisc_ops sfb_qdisc_ops __read_mostly = { + .id = "sfb", + .priv_size = sizeof(struct sfb_sched_data), + .cl_ops = &sfb_class_ops, + .enqueue = sfb_enqueue, + .dequeue = sfb_dequeue, + .peek = sfb_peek, + .init = sfb_init, + .reset = sfb_reset, + .destroy = sfb_destroy, + .change = sfb_change, + .dump = sfb_dump, + .dump_stats = sfb_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init 
sfb_module_init(void) +{ + return register_qdisc(&sfb_qdisc_ops); +} + +static void __exit sfb_module_exit(void) +{ + unregister_qdisc(&sfb_qdisc_ops); +} + +module_init(sfb_module_init) +module_exit(sfb_module_exit) + +MODULE_DESCRIPTION("Stochastic Fair Blue queue discipline"); +MODULE_AUTHOR("Juliusz Chroboczek"); +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c new file mode 100644 index 00000000..69400e3c --- /dev/null +++ b/net/sched/sch_sfq.c @@ -0,0 +1,722 @@ +/* + * net/sched/sch_sfq.c Stochastic Fairness Queueing discipline. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Stochastic Fairness Queuing algorithm. + ======================================= + + Source: + Paul E. McKenney "Stochastic Fairness Queuing", + IEEE INFOCOMM'90 Proceedings, San Francisco, 1990. + + Paul E. McKenney "Stochastic Fairness Queuing", + "Interworking: Research and Experience", v.2, 1991, p.113-131. + + + See also: + M. Shreedhar and George Varghese "Efficient Fair + Queuing using Deficit Round Robin", Proc. SIGCOMM 95. + + + This is not the thing that is usually called (W)FQ nowadays. + It does not use any timestamp mechanism, but instead + processes queues in round-robin order. + + ADVANTAGE: + + - It is very cheap. Both CPU and memory requirements are minimal. + + DRAWBACKS: + + - "Stochastic" -> It is not 100% fair. + When hash collisions occur, several flows are considered as one. + + - "Round-robin" -> It introduces larger delays than virtual clock + based schemes, and should not be used for isolating interactive + traffic from non-interactive. It means, that this scheduler + should be used as leaf of CBQ or P3, which put interactive traffic + to higher priority band. + + We still need true WFQ for top level CSZ, but using WFQ + for the best effort traffic is absolutely pointless: + SFQ is superior for this purpose. + + IMPLEMENTATION: + This implementation limits maximal queue length to 128; + max mtu to 2^18-1; max 128 flows, number of hash buckets to 1024. + The only goal of this restrictions was that all data + fit into one 4K page on 32bit arches. + + It is easy to increase these values, but not in flight. */ + +#define SFQ_DEPTH 128 /* max number of packets per flow */ +#define SFQ_SLOTS 128 /* max number of flows */ +#define SFQ_EMPTY_SLOT 255 +#define SFQ_DEFAULT_HASH_DIVISOR 1024 + +/* We use 16 bits to store allot, and want to handle packets up to 64K + * Scale allot by 8 (1<<3) so that no overflow occurs. + */ +#define SFQ_ALLOT_SHIFT 3 +#define SFQ_ALLOT_SIZE(X) DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT) + +/* This type should contain at least SFQ_DEPTH + SFQ_SLOTS values */ +typedef unsigned char sfq_index; + +/* + * We dont use pointers to save space. + * Small indexes [0 ... SFQ_SLOTS - 1] are 'pointers' to slots[] array + * while following values [SFQ_SLOTS ... 
SFQ_SLOTS + SFQ_DEPTH - 1] + * are 'pointers' to dep[] array + */ +struct sfq_head { + sfq_index next; + sfq_index prev; +}; + +struct sfq_slot { + struct sk_buff *skblist_next; + struct sk_buff *skblist_prev; + sfq_index qlen; /* number of skbs in skblist */ + sfq_index next; /* next slot in sfq chain */ + struct sfq_head dep; /* anchor in dep[] chains */ + unsigned short hash; /* hash value (index in ht[]) */ + short allot; /* credit for this slot */ +}; + +struct sfq_sched_data { +/* Parameters */ + int perturb_period; + unsigned int quantum; /* Allotment per round: MUST BE >= MTU */ + int limit; + unsigned int divisor; /* number of slots in hash table */ +/* Variables */ + struct tcf_proto *filter_list; + struct timer_list perturb_timer; + u32 perturbation; + sfq_index cur_depth; /* depth of longest slot */ + unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */ + struct sfq_slot *tail; /* current slot in round */ + sfq_index *ht; /* Hash table (divisor slots) */ + struct sfq_slot slots[SFQ_SLOTS]; + struct sfq_head dep[SFQ_DEPTH]; /* Linked list of slots, indexed by depth */ +}; + +/* + * sfq_head are either in a sfq_slot or in dep[] array + */ +static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val) +{ + if (val < SFQ_SLOTS) + return &q->slots[val].dep; + return &q->dep[val - SFQ_SLOTS]; +} + +static unsigned int sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1) +{ + return jhash_2words(h, h1, q->perturbation) & (q->divisor - 1); +} + +static unsigned int sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) +{ + u32 h, h2; + + switch (skb->protocol) { + case htons(ETH_P_IP): + { + const struct iphdr *iph; + int poff; + + if (!pskb_network_may_pull(skb, sizeof(*iph))) + goto err; + iph = ip_hdr(skb); + h = (__force u32)iph->daddr; + h2 = (__force u32)iph->saddr ^ iph->protocol; + if (iph->frag_off & htons(IP_MF | IP_OFFSET)) + break; + poff = proto_ports_offset(iph->protocol); + if (poff >= 0 && + pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) { + iph = ip_hdr(skb); + h2 ^= *(u32 *)((void *)iph + iph->ihl * 4 + poff); + } + break; + } + case htons(ETH_P_IPV6): + { + const struct ipv6hdr *iph; + int poff; + + if (!pskb_network_may_pull(skb, sizeof(*iph))) + goto err; + iph = ipv6_hdr(skb); + h = (__force u32)iph->daddr.s6_addr32[3]; + h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr; + poff = proto_ports_offset(iph->nexthdr); + if (poff >= 0 && + pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) { + iph = ipv6_hdr(skb); + h2 ^= *(u32 *)((void *)iph + sizeof(*iph) + poff); + } + break; + } + default: +err: + h = (unsigned long)skb_dst(skb) ^ (__force u32)skb->protocol; + h2 = (unsigned long)skb->sk; + } + + return sfq_fold_hash(q, h, h2); +} + +static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + struct tcf_result res; + int result; + + if (TC_H_MAJ(skb->priority) == sch->handle && + TC_H_MIN(skb->priority) > 0 && + TC_H_MIN(skb->priority) <= q->divisor) + return TC_H_MIN(skb->priority); + + if (!q->filter_list) + return sfq_hash(q, skb) + 1; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + result = tc_classify(skb, q->filter_list, &res); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return 0; + } +#endif + if (TC_H_MIN(res.classid) <= q->divisor) + return TC_H_MIN(res.classid); + } + return 0; +} + +/* + * 
x : slot number [0 .. SFQ_SLOTS - 1] + */ +static inline void sfq_link(struct sfq_sched_data *q, sfq_index x) +{ + sfq_index p, n; + int qlen = q->slots[x].qlen; + + p = qlen + SFQ_SLOTS; + n = q->dep[qlen].next; + + q->slots[x].dep.next = n; + q->slots[x].dep.prev = p; + + q->dep[qlen].next = x; /* sfq_dep_head(q, p)->next = x */ + sfq_dep_head(q, n)->prev = x; +} + +#define sfq_unlink(q, x, n, p) \ + n = q->slots[x].dep.next; \ + p = q->slots[x].dep.prev; \ + sfq_dep_head(q, p)->next = n; \ + sfq_dep_head(q, n)->prev = p + + +static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x) +{ + sfq_index p, n; + int d; + + sfq_unlink(q, x, n, p); + + d = q->slots[x].qlen--; + if (n == p && q->cur_depth == d) + q->cur_depth--; + sfq_link(q, x); +} + +static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x) +{ + sfq_index p, n; + int d; + + sfq_unlink(q, x, n, p); + + d = ++q->slots[x].qlen; + if (q->cur_depth < d) + q->cur_depth = d; + sfq_link(q, x); +} + +/* helper functions : might be changed when/if skb use a standard list_head */ + +/* remove one skb from tail of slot queue */ +static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot) +{ + struct sk_buff *skb = slot->skblist_prev; + + slot->skblist_prev = skb->prev; + skb->prev->next = (struct sk_buff *)slot; + skb->next = skb->prev = NULL; + return skb; +} + +/* remove one skb from head of slot queue */ +static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot) +{ + struct sk_buff *skb = slot->skblist_next; + + slot->skblist_next = skb->next; + skb->next->prev = (struct sk_buff *)slot; + skb->next = skb->prev = NULL; + return skb; +} + +static inline void slot_queue_init(struct sfq_slot *slot) +{ + slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot; +} + +/* add skb to slot queue (tail add) */ +static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb) +{ + skb->prev = slot->skblist_prev; + skb->next = (struct sk_buff *)slot; + slot->skblist_prev->next = skb; + slot->skblist_prev = skb; +} + +#define slot_queue_walk(slot, skb) \ + for (skb = slot->skblist_next; \ + skb != (struct sk_buff *)slot; \ + skb = skb->next) + +static unsigned int sfq_drop(struct Qdisc *sch) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + sfq_index x, d = q->cur_depth; + struct sk_buff *skb; + unsigned int len; + struct sfq_slot *slot; + + /* Queue is full! Find the longest slot and drop tail packet from it */ + if (d > 1) { + x = q->dep[d].next; + slot = &q->slots[x]; +drop: + skb = slot_dequeue_tail(slot); + len = qdisc_pkt_len(skb); + sfq_dec(q, x); + kfree_skb(skb); + sch->q.qlen--; + sch->qstats.drops++; + sch->qstats.backlog -= len; + return len; + } + + if (d == 1) { + /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. 
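+		 * In that case every active slot holds exactly one packet, so any
+		 * victim is as good as another: take the slot at the head of the
+		 * round-robin ring, unlink it from the hash table and fall through
+		 * to the same tail-drop path as above.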
*/ + x = q->tail->next; + slot = &q->slots[x]; + q->tail->next = slot->next; + q->ht[slot->hash] = SFQ_EMPTY_SLOT; + goto drop; + } + + return 0; +} + +static int +sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + unsigned int hash; + sfq_index x, qlen; + struct sfq_slot *slot; + int uninitialized_var(ret); + + hash = sfq_classify(skb, sch, &ret); + if (hash == 0) { + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } + hash--; + + x = q->ht[hash]; + slot = &q->slots[x]; + if (x == SFQ_EMPTY_SLOT) { + x = q->dep[0].next; /* get a free slot */ + q->ht[hash] = x; + slot = &q->slots[x]; + slot->hash = hash; + } + + /* If selected queue has length q->limit, do simple tail drop, + * i.e. drop _this_ packet. + */ + if (slot->qlen >= q->limit) + return qdisc_drop(skb, sch); + + sch->qstats.backlog += qdisc_pkt_len(skb); + slot_queue_add(slot, skb); + sfq_inc(q, x); + if (slot->qlen == 1) { /* The flow is new */ + if (q->tail == NULL) { /* It is the first flow */ + slot->next = x; + } else { + slot->next = q->tail->next; + q->tail->next = x; + } + q->tail = slot; + slot->allot = q->scaled_quantum; + } + if (++sch->q.qlen <= q->limit) + return NET_XMIT_SUCCESS; + + qlen = slot->qlen; + sfq_drop(sch); + /* Return Congestion Notification only if we dropped a packet + * from this flow. + */ + if (qlen != slot->qlen) + return NET_XMIT_CN; + + /* As we dropped a packet, better let upper stack know this */ + qdisc_tree_decrease_qlen(sch, 1); + return NET_XMIT_SUCCESS; +} + +static struct sk_buff * +sfq_dequeue(struct Qdisc *sch) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + sfq_index a, next_a; + struct sfq_slot *slot; + + /* No active slots */ + if (q->tail == NULL) + return NULL; + +next_slot: + a = q->tail->next; + slot = &q->slots[a]; + if (slot->allot <= 0) { + q->tail = slot; + slot->allot += q->scaled_quantum; + goto next_slot; + } + skb = slot_dequeue_head(slot); + sfq_dec(q, a); + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + sch->qstats.backlog -= qdisc_pkt_len(skb); + + /* Is the slot empty? */ + if (slot->qlen == 0) { + q->ht[slot->hash] = SFQ_EMPTY_SLOT; + next_a = slot->next; + if (a == next_a) { + q->tail = NULL; /* no more active slots */ + return skb; + } + q->tail->next = next_a; + } else { + slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb)); + } + return skb; +} + +static void +sfq_reset(struct Qdisc *sch) +{ + struct sk_buff *skb; + + while ((skb = sfq_dequeue(sch)) != NULL) + kfree_skb(skb); +} + +static void sfq_perturbation(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc *)arg; + struct sfq_sched_data *q = qdisc_priv(sch); + + q->perturbation = net_random(); + + if (q->perturb_period) + mod_timer(&q->perturb_timer, jiffies + q->perturb_period); +} + +static int sfq_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + struct tc_sfq_qopt *ctl = nla_data(opt); + unsigned int qlen; + + if (opt->nla_len < nla_attr_size(sizeof(*ctl))) + return -EINVAL; + + if (ctl->divisor && + (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)) + return -EINVAL; + + sch_tree_lock(sch); + q->quantum = ctl->quantum ? 
: psched_mtu(qdisc_dev(sch)); + q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); + q->perturb_period = ctl->perturb_period * HZ; + if (ctl->limit) + q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1); + if (ctl->divisor) + q->divisor = ctl->divisor; + qlen = sch->q.qlen; + while (sch->q.qlen > q->limit) + sfq_drop(sch); + qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + + del_timer(&q->perturb_timer); + if (q->perturb_period) { + mod_timer(&q->perturb_timer, jiffies + q->perturb_period); + q->perturbation = net_random(); + } + sch_tree_unlock(sch); + return 0; +} + +static int sfq_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + size_t sz; + int i; + + q->perturb_timer.function = sfq_perturbation; + q->perturb_timer.data = (unsigned long)sch; + init_timer_deferrable(&q->perturb_timer); + + for (i = 0; i < SFQ_DEPTH; i++) { + q->dep[i].next = i + SFQ_SLOTS; + q->dep[i].prev = i + SFQ_SLOTS; + } + + q->limit = SFQ_DEPTH - 1; + q->cur_depth = 0; + q->tail = NULL; + q->divisor = SFQ_DEFAULT_HASH_DIVISOR; + if (opt == NULL) { + q->quantum = psched_mtu(qdisc_dev(sch)); + q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); + q->perturb_period = 0; + q->perturbation = net_random(); + } else { + int err = sfq_change(sch, opt); + if (err) + return err; + } + + sz = sizeof(q->ht[0]) * q->divisor; + q->ht = kmalloc(sz, GFP_KERNEL); + if (!q->ht && sz > PAGE_SIZE) + q->ht = vmalloc(sz); + if (!q->ht) + return -ENOMEM; + for (i = 0; i < q->divisor; i++) + q->ht[i] = SFQ_EMPTY_SLOT; + + for (i = 0; i < SFQ_SLOTS; i++) { + slot_queue_init(&q->slots[i]); + sfq_link(q, i); + } + if (q->limit >= 1) + sch->flags |= TCQ_F_CAN_BYPASS; + else + sch->flags &= ~TCQ_F_CAN_BYPASS; + return 0; +} + +static void sfq_destroy(struct Qdisc *sch) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + + tcf_destroy_chain(&q->filter_list); + q->perturb_period = 0; + del_timer_sync(&q->perturb_timer); + if (is_vmalloc_addr(q->ht)) + vfree(q->ht); + else + kfree(q->ht); +} + +static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb_tail_pointer(skb); + struct tc_sfq_qopt opt; + + opt.quantum = q->quantum; + opt.perturb_period = q->perturb_period / HZ; + + opt.limit = q->limit; + opt.divisor = q->divisor; + opt.flows = q->limit; + + NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long sfq_get(struct Qdisc *sch, u32 classid) +{ + return 0; +} + +static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + /* we cannot bypass queue discipline anymore */ + sch->flags &= ~TCQ_F_CAN_BYPASS; + return 0; +} + +static void sfq_put(struct Qdisc *q, unsigned long cl) +{ +} + +static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return &q->filter_list; +} + +static int sfq_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + tcm->tcm_handle |= TC_H_MIN(cl); + return 0; +} + +static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + sfq_index idx = q->ht[cl - 1]; + struct gnet_stats_queue qs = { 0 }; + struct tc_sfq_xstats xstats = { 0 }; + struct sk_buff *skb; + + if (idx != 
SFQ_EMPTY_SLOT) { + const struct sfq_slot *slot = &q->slots[idx]; + + xstats.allot = slot->allot << SFQ_ALLOT_SHIFT; + qs.qlen = slot->qlen; + slot_queue_walk(slot, skb) + qs.backlog += qdisc_pkt_len(skb); + } + if (gnet_stats_copy_queue(d, &qs) < 0) + return -1; + return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); +} + +static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + unsigned int i; + + if (arg->stop) + return; + + for (i = 0; i < q->divisor; i++) { + if (q->ht[i] == SFQ_EMPTY_SLOT || + arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, i + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static const struct Qdisc_class_ops sfq_class_ops = { + .leaf = sfq_leaf, + .get = sfq_get, + .put = sfq_put, + .tcf_chain = sfq_find_tcf, + .bind_tcf = sfq_bind, + .unbind_tcf = sfq_put, + .dump = sfq_dump_class, + .dump_stats = sfq_dump_class_stats, + .walk = sfq_walk, +}; + +static struct Qdisc_ops sfq_qdisc_ops __read_mostly = { + .cl_ops = &sfq_class_ops, + .id = "sfq", + .priv_size = sizeof(struct sfq_sched_data), + .enqueue = sfq_enqueue, + .dequeue = sfq_dequeue, + .peek = qdisc_peek_dequeued, + .drop = sfq_drop, + .init = sfq_init, + .reset = sfq_reset, + .destroy = sfq_destroy, + .change = NULL, + .dump = sfq_dump, + .owner = THIS_MODULE, +}; + +static int __init sfq_module_init(void) +{ + return register_qdisc(&sfq_qdisc_ops); +} +static void __exit sfq_module_exit(void) +{ + unregister_qdisc(&sfq_qdisc_ops); +} +module_init(sfq_module_init) +module_exit(sfq_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c new file mode 100644 index 00000000..1dcfb522 --- /dev/null +++ b/net/sched/sch_tbf.c @@ -0,0 +1,464 @@ +/* + * net/sched/sch_tbf.c Token Bucket Filter queue. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * Dmitry Torokhov - allow attaching inner qdiscs - + * original idea by Martin Devera + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Simple Token Bucket Filter. + ======================================= + + SOURCE. + ------- + + None. + + Description. + ------------ + + A data flow obeys TBF with rate R and depth B, if for any + time interval t_i...t_f the number of transmitted bits + does not exceed B + R*(t_f-t_i). + + Packetized version of this definition: + The sequence of packets of sizes s_i served at moments t_i + obeys TBF, if for any i<=k: + + s_i+....+s_k <= B + R*(t_k - t_i) + + Algorithm. + ---------- + + Let N(t_i) be B/R initially and N(t) grow continuously with time as: + + N(t+delta) = min{B/R, N(t) + delta} + + If the first packet in queue has length S, it may be + transmitted only at the time t_* when S/R <= N(t_*), + and in this case N(t) jumps: + + N(t_* + 0) = N(t_* - 0) - S/R. + + + + Actually, QoS requires two TBF to be applied to a data stream. + One of them controls steady state burst size, another + one with rate P (peak rate) and depth M (equal to link MTU) + limits bursts at a smaller time scale. + + It is easy to see that P>R, and B>M. If P is infinity, this double + TBF is equivalent to a single one. 
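+
+	A small worked example (illustrative numbers, not taken from the
+	text above): with R = 1 Mbit/sec and B = 10 Kbit, N starts at
+	B/R = 10 msec.  Two back-to-back 5 Kbit packets pass immediately,
+	each lowering N by S/R = 5 msec; a third 5 Kbit packet then has to
+	wait 5 msec for N to climb back to S/R.  Bursts up to B go out
+	unshaped, while the sustained rate converges to R.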
+ + When TBF works in reshaping mode, latency is estimated as: + + lat = max ((L-B)/R, (L-M)/P) + + + NOTES. + ------ + + If TBF throttles, it starts a watchdog timer, which will wake it up + when it is ready to transmit. + Note that the minimal timer resolution is 1/HZ. + If no new packets arrive during this period, + or if the device is not awaken by EOI for some previous packet, + TBF can stop its activity for 1/HZ. + + + This means, that with depth B, the maximal rate is + + R_crit = B*HZ + + F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes. + + Note that the peak rate TBF is much more tough: with MTU 1500 + P_crit = 150Kbytes/sec. So, if you need greater peak + rates, use alpha with HZ=1000 :-) + + With classful TBF, limit is just kept for backwards compatibility. + It is passed to the default bfifo qdisc - if the inner qdisc is + changed the limit is not effective anymore. +*/ + +struct tbf_sched_data { +/* Parameters */ + u32 limit; /* Maximal length of backlog: bytes */ + u32 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */ + u32 mtu; + u32 max_size; + struct qdisc_rate_table *R_tab; + struct qdisc_rate_table *P_tab; + +/* Variables */ + long tokens; /* Current number of B tokens */ + long ptokens; /* Current number of P tokens */ + psched_time_t t_c; /* Time check-point */ + struct Qdisc *qdisc; /* Inner qdisc, default - bfifo queue */ + struct qdisc_watchdog watchdog; /* Watchdog timer */ +}; + +#define L2T(q, L) qdisc_l2t((q)->R_tab, L) +#define L2T_P(q, L) qdisc_l2t((q)->P_tab, L) + +static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + int ret; + + if (qdisc_pkt_len(skb) > q->max_size) + return qdisc_reshape_fail(skb, sch); + + ret = qdisc_enqueue(skb, q->qdisc); + if (ret != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(ret)) + sch->qstats.drops++; + return ret; + } + + sch->q.qlen++; + return NET_XMIT_SUCCESS; +} + +static unsigned int tbf_drop(struct Qdisc *sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + unsigned int len = 0; + + if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) { + sch->q.qlen--; + sch->qstats.drops++; + } + return len; +} + +static struct sk_buff *tbf_dequeue(struct Qdisc *sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + + skb = q->qdisc->ops->peek(q->qdisc); + + if (skb) { + psched_time_t now; + long toks; + long ptoks = 0; + unsigned int len = qdisc_pkt_len(skb); + + now = psched_get_time(); + toks = psched_tdiff_bounded(now, q->t_c, q->buffer); + + if (q->P_tab) { + ptoks = toks + q->ptokens; + if (ptoks > (long)q->mtu) + ptoks = q->mtu; + ptoks -= L2T_P(q, len); + } + toks += q->tokens; + if (toks > (long)q->buffer) + toks = q->buffer; + toks -= L2T(q, len); + + if ((toks|ptoks) >= 0) { + skb = qdisc_dequeue_peeked(q->qdisc); + if (unlikely(!skb)) + return NULL; + + q->t_c = now; + q->tokens = toks; + q->ptokens = ptoks; + sch->q.qlen--; + qdisc_unthrottled(sch); + qdisc_bstats_update(sch, skb); + return skb; + } + + qdisc_watchdog_schedule(&q->watchdog, + now + max_t(long, -toks, -ptoks)); + + /* Maybe we have a shorter packet in the queue, + which can be sent now. It sounds cool, + but, however, this is wrong in principle. + We MUST NOT reorder packets under these circumstances. + + Really, if we split the flow into independent + subflows, it would be a very good solution. + This is the main idea of all FQ algorithms + (cf. 
CSZ, HPFQ, HFSC) + */ + + sch->qstats.overlimits++; + } + return NULL; +} + +static void tbf_reset(struct Qdisc *sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + + qdisc_reset(q->qdisc); + sch->q.qlen = 0; + q->t_c = psched_get_time(); + q->tokens = q->buffer; + q->ptokens = q->mtu; + qdisc_watchdog_cancel(&q->watchdog); +} + +static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = { + [TCA_TBF_PARMS] = { .len = sizeof(struct tc_tbf_qopt) }, + [TCA_TBF_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, + [TCA_TBF_PTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, +}; + +static int tbf_change(struct Qdisc *sch, struct nlattr *opt) +{ + int err; + struct tbf_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_TBF_PTAB + 1]; + struct tc_tbf_qopt *qopt; + struct qdisc_rate_table *rtab = NULL; + struct qdisc_rate_table *ptab = NULL; + struct Qdisc *child = NULL; + int max_size, n; + + err = nla_parse_nested(tb, TCA_TBF_PTAB, opt, tbf_policy); + if (err < 0) + return err; + + err = -EINVAL; + if (tb[TCA_TBF_PARMS] == NULL) + goto done; + + qopt = nla_data(tb[TCA_TBF_PARMS]); + rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]); + if (rtab == NULL) + goto done; + + if (qopt->peakrate.rate) { + if (qopt->peakrate.rate > qopt->rate.rate) + ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]); + if (ptab == NULL) + goto done; + } + + for (n = 0; n < 256; n++) + if (rtab->data[n] > qopt->buffer) + break; + max_size = (n << qopt->rate.cell_log) - 1; + if (ptab) { + int size; + + for (n = 0; n < 256; n++) + if (ptab->data[n] > qopt->mtu) + break; + size = (n << qopt->peakrate.cell_log) - 1; + if (size < max_size) + max_size = size; + } + if (max_size < 0) + goto done; + + if (q->qdisc != &noop_qdisc) { + err = fifo_set_limit(q->qdisc, qopt->limit); + if (err) + goto done; + } else if (qopt->limit > 0) { + child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit); + if (IS_ERR(child)) { + err = PTR_ERR(child); + goto done; + } + } + + sch_tree_lock(sch); + if (child) { + qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen); + qdisc_destroy(q->qdisc); + q->qdisc = child; + } + q->limit = qopt->limit; + q->mtu = qopt->mtu; + q->max_size = max_size; + q->buffer = qopt->buffer; + q->tokens = q->buffer; + q->ptokens = q->mtu; + + swap(q->R_tab, rtab); + swap(q->P_tab, ptab); + + sch_tree_unlock(sch); + err = 0; +done: + if (rtab) + qdisc_put_rtab(rtab); + if (ptab) + qdisc_put_rtab(ptab); + return err; +} + +static int tbf_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + + if (opt == NULL) + return -EINVAL; + + q->t_c = psched_get_time(); + qdisc_watchdog_init(&q->watchdog, sch); + q->qdisc = &noop_qdisc; + + return tbf_change(sch, opt); +} + +static void tbf_destroy(struct Qdisc *sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + + qdisc_watchdog_cancel(&q->watchdog); + + if (q->P_tab) + qdisc_put_rtab(q->P_tab); + if (q->R_tab) + qdisc_put_rtab(q->R_tab); + + qdisc_destroy(q->qdisc); +} + +static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + struct nlattr *nest; + struct tc_tbf_qopt opt; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + opt.limit = q->limit; + opt.rate = q->R_tab->rate; + if (q->P_tab) + opt.peakrate = q->P_tab->rate; + else + memset(&opt.peakrate, 0, sizeof(opt.peakrate)); + opt.mtu = q->mtu; + opt.buffer = q->buffer; + NLA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt); + + nla_nest_end(skb, nest); + return 
skb->len; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static int tbf_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + + tcm->tcm_handle |= TC_H_MIN(1); + tcm->tcm_info = q->qdisc->handle; + + return 0; +} + +static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = q->qdisc; + q->qdisc = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + sch_tree_unlock(sch); + + return 0; +} + +static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + return q->qdisc; +} + +static unsigned long tbf_get(struct Qdisc *sch, u32 classid) +{ + return 1; +} + +static void tbf_put(struct Qdisc *sch, unsigned long arg) +{ +} + +static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ + if (!walker->stop) { + if (walker->count >= walker->skip) + if (walker->fn(sch, 1, walker) < 0) { + walker->stop = 1; + return; + } + walker->count++; + } +} + +static const struct Qdisc_class_ops tbf_class_ops = { + .graft = tbf_graft, + .leaf = tbf_leaf, + .get = tbf_get, + .put = tbf_put, + .walk = tbf_walk, + .dump = tbf_dump_class, +}; + +static struct Qdisc_ops tbf_qdisc_ops __read_mostly = { + .next = NULL, + .cl_ops = &tbf_class_ops, + .id = "tbf", + .priv_size = sizeof(struct tbf_sched_data), + .enqueue = tbf_enqueue, + .dequeue = tbf_dequeue, + .peek = qdisc_peek_dequeued, + .drop = tbf_drop, + .init = tbf_init, + .reset = tbf_reset, + .destroy = tbf_destroy, + .change = tbf_change, + .dump = tbf_dump, + .owner = THIS_MODULE, +}; + +static int __init tbf_module_init(void) +{ + return register_qdisc(&tbf_qdisc_ops); +} + +static void __exit tbf_module_exit(void) +{ + unregister_qdisc(&tbf_qdisc_ops); +} +module_init(tbf_module_init) +module_exit(tbf_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c new file mode 100644 index 00000000..4f4c52c0 --- /dev/null +++ b/net/sched/sch_teql.c @@ -0,0 +1,538 @@ +/* net/sched/sch_teql.c "True" (or "trivial") link equalizer. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + How to setup it. + ---------------- + + After loading this module you will find a new device teqlN + and new qdisc with the same name. To join a slave to the equalizer + you should just set this qdisc on a device f.e. + + # tc qdisc add dev eth0 root teql0 + # tc qdisc add dev eth1 root teql0 + + That's all. Full PnP 8) + + Applicability. + -------------- + + 1. Slave devices MUST be active devices, i.e., they must raise the tbusy + signal and generate EOI events. If you want to equalize virtual devices + like tunnels, use a normal eql device. + 2. This device puts no limitations on physical slave characteristics + f.e. 
it will equalize 9600baud line and 100Mb ethernet perfectly :-) + Certainly, large difference in link speeds will make the resulting + eqalized link unusable, because of huge packet reordering. + I estimate an upper useful difference as ~10 times. + 3. If the slave requires address resolution, only protocols using + neighbour cache (IPv4/IPv6) will work over the equalized link. + Other protocols are still allowed to use the slave device directly, + which will not break load balancing, though native slave + traffic will have the highest priority. */ + +struct teql_master { + struct Qdisc_ops qops; + struct net_device *dev; + struct Qdisc *slaves; + struct list_head master_list; + unsigned long tx_bytes; + unsigned long tx_packets; + unsigned long tx_errors; + unsigned long tx_dropped; +}; + +struct teql_sched_data { + struct Qdisc *next; + struct teql_master *m; + struct neighbour *ncache; + struct sk_buff_head q; +}; + +#define NEXT_SLAVE(q) (((struct teql_sched_data *)qdisc_priv(q))->next) + +#define FMASK (IFF_BROADCAST | IFF_POINTOPOINT) + +/* "teql*" qdisc routines */ + +static int +teql_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct teql_sched_data *q = qdisc_priv(sch); + + if (q->q.qlen < dev->tx_queue_len) { + __skb_queue_tail(&q->q, skb); + return NET_XMIT_SUCCESS; + } + + kfree_skb(skb); + sch->qstats.drops++; + return NET_XMIT_DROP; +} + +static struct sk_buff * +teql_dequeue(struct Qdisc *sch) +{ + struct teql_sched_data *dat = qdisc_priv(sch); + struct netdev_queue *dat_queue; + struct sk_buff *skb; + + skb = __skb_dequeue(&dat->q); + dat_queue = netdev_get_tx_queue(dat->m->dev, 0); + if (skb == NULL) { + struct net_device *m = qdisc_dev(dat_queue->qdisc); + if (m) { + dat->m->slaves = sch; + netif_wake_queue(m); + } + } else { + qdisc_bstats_update(sch, skb); + } + sch->q.qlen = dat->q.qlen + dat_queue->qdisc->q.qlen; + return skb; +} + +static struct sk_buff * +teql_peek(struct Qdisc *sch) +{ + /* teql is meant to be used as root qdisc */ + return NULL; +} + +static inline void +teql_neigh_release(struct neighbour *n) +{ + if (n) + neigh_release(n); +} + +static void +teql_reset(struct Qdisc *sch) +{ + struct teql_sched_data *dat = qdisc_priv(sch); + + skb_queue_purge(&dat->q); + sch->q.qlen = 0; + teql_neigh_release(xchg(&dat->ncache, NULL)); +} + +static void +teql_destroy(struct Qdisc *sch) +{ + struct Qdisc *q, *prev; + struct teql_sched_data *dat = qdisc_priv(sch); + struct teql_master *master = dat->m; + + prev = master->slaves; + if (prev) { + do { + q = NEXT_SLAVE(prev); + if (q == sch) { + NEXT_SLAVE(prev) = NEXT_SLAVE(q); + if (q == master->slaves) { + master->slaves = NEXT_SLAVE(q); + if (q == master->slaves) { + struct netdev_queue *txq; + spinlock_t *root_lock; + + txq = netdev_get_tx_queue(master->dev, 0); + master->slaves = NULL; + + root_lock = qdisc_root_sleeping_lock(txq->qdisc); + spin_lock_bh(root_lock); + qdisc_reset(txq->qdisc); + spin_unlock_bh(root_lock); + } + } + skb_queue_purge(&dat->q); + teql_neigh_release(xchg(&dat->ncache, NULL)); + break; + } + + } while ((prev = q) != master->slaves); + } +} + +static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct net_device *dev = qdisc_dev(sch); + struct teql_master *m = (struct teql_master *)sch->ops; + struct teql_sched_data *q = qdisc_priv(sch); + + if (dev->hard_header_len > m->dev->hard_header_len) + return -EINVAL; + + if (m->dev == dev) + return -ELOOP; + + q->m = m; + + skb_queue_head_init(&q->q); + + if (m->slaves) { + 
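+		/* A new slave can only tighten the master's properties: if
+		 * the master is already up, the slave must be at least as
+		 * capable (point-to-point, broadcast, multicast, MTU);
+		 * otherwise the master's flags and MTU are shrunk to what
+		 * all of its slaves support.
+		 */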
if (m->dev->flags & IFF_UP) { + if ((m->dev->flags & IFF_POINTOPOINT && + !(dev->flags & IFF_POINTOPOINT)) || + (m->dev->flags & IFF_BROADCAST && + !(dev->flags & IFF_BROADCAST)) || + (m->dev->flags & IFF_MULTICAST && + !(dev->flags & IFF_MULTICAST)) || + dev->mtu < m->dev->mtu) + return -EINVAL; + } else { + if (!(dev->flags&IFF_POINTOPOINT)) + m->dev->flags &= ~IFF_POINTOPOINT; + if (!(dev->flags&IFF_BROADCAST)) + m->dev->flags &= ~IFF_BROADCAST; + if (!(dev->flags&IFF_MULTICAST)) + m->dev->flags &= ~IFF_MULTICAST; + if (dev->mtu < m->dev->mtu) + m->dev->mtu = dev->mtu; + } + q->next = NEXT_SLAVE(m->slaves); + NEXT_SLAVE(m->slaves) = sch; + } else { + q->next = sch; + m->slaves = sch; + m->dev->mtu = dev->mtu; + m->dev->flags = (m->dev->flags&~FMASK)|(dev->flags&FMASK); + } + return 0; +} + + +static int +__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, + struct net_device *dev, struct netdev_queue *txq, + struct neighbour *mn) +{ + struct teql_sched_data *q = qdisc_priv(txq->qdisc); + struct neighbour *n = q->ncache; + + if (mn->tbl == NULL) + return -EINVAL; + if (n && n->tbl == mn->tbl && + memcmp(n->primary_key, mn->primary_key, mn->tbl->key_len) == 0) { + atomic_inc(&n->refcnt); + } else { + n = __neigh_lookup_errno(mn->tbl, mn->primary_key, dev); + if (IS_ERR(n)) + return PTR_ERR(n); + } + if (neigh_event_send(n, skb_res) == 0) { + int err; + char haddr[MAX_ADDR_LEN]; + + neigh_ha_snapshot(haddr, n, dev); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), haddr, + NULL, skb->len); + + if (err < 0) { + neigh_release(n); + return -EINVAL; + } + teql_neigh_release(xchg(&q->ncache, n)); + return 0; + } + neigh_release(n); + return (skb_res == NULL) ? -EAGAIN : 1; +} + +static inline int teql_resolve(struct sk_buff *skb, + struct sk_buff *skb_res, + struct net_device *dev, + struct netdev_queue *txq) +{ + struct dst_entry *dst = skb_dst(skb); + struct neighbour *mn; + int res; + + if (txq->qdisc == &noop_qdisc) + return -ENODEV; + + if (!dev->header_ops || !dst) + return 0; + + rcu_read_lock(); + mn = dst_get_neighbour(dst); + res = mn ? 
__teql_resolve(skb, skb_res, dev, txq, mn) : 0; + rcu_read_unlock(); + + return res; +} + +static netdev_tx_t teql_master_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct teql_master *master = netdev_priv(dev); + struct Qdisc *start, *q; + int busy; + int nores; + int subq = skb_get_queue_mapping(skb); + struct sk_buff *skb_res = NULL; + + start = master->slaves; + +restart: + nores = 0; + busy = 0; + + q = start; + if (!q) + goto drop; + + do { + struct net_device *slave = qdisc_dev(q); + struct netdev_queue *slave_txq = netdev_get_tx_queue(slave, 0); + const struct net_device_ops *slave_ops = slave->netdev_ops; + + if (slave_txq->qdisc_sleeping != q) + continue; + if (__netif_subqueue_stopped(slave, subq) || + !netif_running(slave)) { + busy = 1; + continue; + } + + switch (teql_resolve(skb, skb_res, slave, slave_txq)) { + case 0: + if (__netif_tx_trylock(slave_txq)) { + unsigned int length = qdisc_pkt_len(skb); + + if (!netif_tx_queue_frozen_or_stopped(slave_txq) && + slave_ops->ndo_start_xmit(skb, slave) == NETDEV_TX_OK) { + txq_trans_update(slave_txq); + __netif_tx_unlock(slave_txq); + master->slaves = NEXT_SLAVE(q); + netif_wake_queue(dev); + master->tx_packets++; + master->tx_bytes += length; + return NETDEV_TX_OK; + } + __netif_tx_unlock(slave_txq); + } + if (netif_queue_stopped(dev)) + busy = 1; + break; + case 1: + master->slaves = NEXT_SLAVE(q); + return NETDEV_TX_OK; + default: + nores = 1; + break; + } + __skb_pull(skb, skb_network_offset(skb)); + } while ((q = NEXT_SLAVE(q)) != start); + + if (nores && skb_res == NULL) { + skb_res = skb; + goto restart; + } + + if (busy) { + netif_stop_queue(dev); + return NETDEV_TX_BUSY; + } + master->tx_errors++; + +drop: + master->tx_dropped++; + dev_kfree_skb(skb); + return NETDEV_TX_OK; +} + +static int teql_master_open(struct net_device *dev) +{ + struct Qdisc *q; + struct teql_master *m = netdev_priv(dev); + int mtu = 0xFFFE; + unsigned int flags = IFF_NOARP | IFF_MULTICAST; + + if (m->slaves == NULL) + return -EUNATCH; + + flags = FMASK; + + q = m->slaves; + do { + struct net_device *slave = qdisc_dev(q); + + if (slave == NULL) + return -EUNATCH; + + if (slave->mtu < mtu) + mtu = slave->mtu; + if (slave->hard_header_len > LL_MAX_HEADER) + return -EINVAL; + + /* If all the slaves are BROADCAST, master is BROADCAST + If all the slaves are PtP, master is PtP + Otherwise, master is NBMA. 
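+	   In other words the master keeps only the capability bits that
+	   every slave shares, which is what the flag clearing below
+	   computes.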
+ */ + if (!(slave->flags&IFF_POINTOPOINT)) + flags &= ~IFF_POINTOPOINT; + if (!(slave->flags&IFF_BROADCAST)) + flags &= ~IFF_BROADCAST; + if (!(slave->flags&IFF_MULTICAST)) + flags &= ~IFF_MULTICAST; + } while ((q = NEXT_SLAVE(q)) != m->slaves); + + m->dev->mtu = mtu; + m->dev->flags = (m->dev->flags&~FMASK) | flags; + netif_start_queue(m->dev); + return 0; +} + +static int teql_master_close(struct net_device *dev) +{ + netif_stop_queue(dev); + return 0; +} + +static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev, + struct rtnl_link_stats64 *stats) +{ + struct teql_master *m = netdev_priv(dev); + + stats->tx_packets = m->tx_packets; + stats->tx_bytes = m->tx_bytes; + stats->tx_errors = m->tx_errors; + stats->tx_dropped = m->tx_dropped; + return stats; +} + +static int teql_master_mtu(struct net_device *dev, int new_mtu) +{ + struct teql_master *m = netdev_priv(dev); + struct Qdisc *q; + + if (new_mtu < 68) + return -EINVAL; + + q = m->slaves; + if (q) { + do { + if (new_mtu > qdisc_dev(q)->mtu) + return -EINVAL; + } while ((q = NEXT_SLAVE(q)) != m->slaves); + } + + dev->mtu = new_mtu; + return 0; +} + +static const struct net_device_ops teql_netdev_ops = { + .ndo_open = teql_master_open, + .ndo_stop = teql_master_close, + .ndo_start_xmit = teql_master_xmit, + .ndo_get_stats64 = teql_master_stats64, + .ndo_change_mtu = teql_master_mtu, +}; + +static __init void teql_master_setup(struct net_device *dev) +{ + struct teql_master *master = netdev_priv(dev); + struct Qdisc_ops *ops = &master->qops; + + master->dev = dev; + ops->priv_size = sizeof(struct teql_sched_data); + + ops->enqueue = teql_enqueue; + ops->dequeue = teql_dequeue; + ops->peek = teql_peek; + ops->init = teql_qdisc_init; + ops->reset = teql_reset; + ops->destroy = teql_destroy; + ops->owner = THIS_MODULE; + + dev->netdev_ops = &teql_netdev_ops; + dev->type = ARPHRD_VOID; + dev->mtu = 1500; + dev->tx_queue_len = 100; + dev->flags = IFF_NOARP; + dev->hard_header_len = LL_MAX_HEADER; + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; +} + +static LIST_HEAD(master_dev_list); +static int max_equalizers = 1; +module_param(max_equalizers, int, 0); +MODULE_PARM_DESC(max_equalizers, "Max number of link equalizers"); + +static int __init teql_init(void) +{ + int i; + int err = -ENODEV; + + for (i = 0; i < max_equalizers; i++) { + struct net_device *dev; + struct teql_master *master; + + dev = alloc_netdev(sizeof(struct teql_master), + "teql%d", teql_master_setup); + if (!dev) { + err = -ENOMEM; + break; + } + + if ((err = register_netdev(dev))) { + free_netdev(dev); + break; + } + + master = netdev_priv(dev); + + strlcpy(master->qops.id, dev->name, IFNAMSIZ); + err = register_qdisc(&master->qops); + + if (err) { + unregister_netdev(dev); + free_netdev(dev); + break; + } + + list_add_tail(&master->master_list, &master_dev_list); + } + return i ? 
0 : err; +} + +static void __exit teql_exit(void) +{ + struct teql_master *master, *nxt; + + list_for_each_entry_safe(master, nxt, &master_dev_list, master_list) { + + list_del(&master->master_list); + + unregister_qdisc(&master->qops); + unregister_netdev(master->dev); + free_netdev(master->dev); + } +} + +module_init(teql_init); +module_exit(teql_exit); + +MODULE_LICENSE("GPL"); diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig new file mode 100644 index 00000000..126b014e --- /dev/null +++ b/net/sctp/Kconfig @@ -0,0 +1,100 @@ +# +# SCTP configuration +# + +menuconfig IP_SCTP + tristate "The SCTP Protocol (EXPERIMENTAL)" + depends on INET && EXPERIMENTAL + depends on IPV6 || IPV6=n + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_SHA1 + select CRYPTO_MD5 if SCTP_HMAC_MD5 + select LIBCRC32C + ---help--- + Stream Control Transmission Protocol + + From RFC 2960 . + + "SCTP is a reliable transport protocol operating on top of a + connectionless packet network such as IP. It offers the following + services to its users: + + -- acknowledged error-free non-duplicated transfer of user data, + -- data fragmentation to conform to discovered path MTU size, + -- sequenced delivery of user messages within multiple streams, + with an option for order-of-arrival delivery of individual user + messages, + -- optional bundling of multiple user messages into a single SCTP + packet, and + -- network-level fault tolerance through supporting of multi- + homing at either or both ends of an association." + + To compile this protocol support as a module, choose M here: the + module will be called sctp. + + If in doubt, say N. + +if IP_SCTP + +config NET_SCTPPROBE + tristate "SCTP: Association probing" + depends on PROC_FS && KPROBES + ---help--- + This module allows for capturing the changes to SCTP association + state in response to incoming packets. It is used for debugging + SCTP congestion control algorithms. If you don't understand + what was just said, you don't need it: say N. + + To compile this code as a module, choose M here: the + module will be called sctp_probe. + +config SCTP_DBG_MSG + bool "SCTP: Debug messages" + help + If you say Y, this will enable verbose debugging messages. + + If unsure, say N. However, if you are running into problems, use + this option to gather detailed trace information + +config SCTP_DBG_OBJCNT + bool "SCTP: Debug object counts" + depends on PROC_FS + help + If you say Y, this will enable debugging support for counting the + type of objects that are currently allocated. This is useful for + identifying memory leaks. This debug information can be viewed by + 'cat /proc/net/sctp/sctp_dbg_objcnt' + + If unsure, say N + +choice + prompt "SCTP: Cookie HMAC Algorithm" + default SCTP_HMAC_MD5 + help + HMAC algorithm to be used during association initialization. It + is strongly recommended to use HMAC-SHA1 or HMAC-MD5. See + configuration for Cryptographic API and enable those algorithms + to make usable by SCTP. + +config SCTP_HMAC_NONE + bool "None" + help + Choosing this disables the use of an HMAC during association + establishment. It is advised to use either HMAC-MD5 or HMAC-SHA1. + +config SCTP_HMAC_SHA1 + bool "HMAC-SHA1" + help + Enable the use of HMAC-SHA1 during association establishment. It + is advised to use either HMAC-MD5 or HMAC-SHA1. + +config SCTP_HMAC_MD5 + bool "HMAC-MD5" + help + Enable the use of HMAC-MD5 during association establishment. It is + advised to use either HMAC-MD5 or HMAC-SHA1. 
+ +endchoice + +endif # IP_SCTP diff --git a/net/sctp/Makefile b/net/sctp/Makefile new file mode 100644 index 00000000..5c30b7a8 --- /dev/null +++ b/net/sctp/Makefile @@ -0,0 +1,21 @@ +# +# Makefile for SCTP support code. +# + +obj-$(CONFIG_IP_SCTP) += sctp.o +obj-$(CONFIG_NET_SCTPPROBE) += sctp_probe.o + +sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ + protocol.o endpointola.o associola.o \ + transport.o chunk.o sm_make_chunk.o ulpevent.o \ + inqueue.o outqueue.o ulpqueue.o command.o \ + tsnmap.o bind_addr.o socket.o primitive.o \ + output.o input.o debug.o ssnmap.o auth.o + +sctp_probe-y := probe.o + +sctp-$(CONFIG_SCTP_DBG_OBJCNT) += objcnt.o +sctp-$(CONFIG_PROC_FS) += proc.o +sctp-$(CONFIG_SYSCTL) += sysctl.o + +sctp-$(subst m,y,$(CONFIG_IPV6)) += ipv6.o diff --git a/net/sctp/associola.c b/net/sctp/associola.c new file mode 100644 index 00000000..17a6e658 --- /dev/null +++ b/net/sctp/associola.c @@ -0,0 +1,1653 @@ +/* SCTP kernel implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001 Intel Corp. + * Copyright (c) 2001 La Monte H.P. Yarroll + * + * This file is part of the SCTP kernel implementation + * + * This module provides the abstraction for an SCTP association. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Jon Grimm + * Xingang Guo + * Hui Huang + * Sridhar Samudrala + * Daisy Chang + * Ryan Layer + * Kevin Gao + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* Forward declarations for internal functions. */ +static void sctp_assoc_bh_rcv(struct work_struct *work); +static void sctp_assoc_free_asconf_acks(struct sctp_association *asoc); +static void sctp_assoc_free_asconf_queue(struct sctp_association *asoc); + +/* Keep track of the new idr low so that we don't re-use association id + * numbers too fast. It is protected by they idr spin lock is in the + * range of 1 - INT_MAX. + */ +static u32 idr_low = 1; + + +/* 1st Level Abstractions. */ + +/* Initialize a new association from provided memory. 
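+ * The caller passes in the memory; everything else, including the timers,
+ * window sizes, verification tag and initial TSN, is seeded below from the
+ * parent socket's and endpoint's current defaults.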
*/ +static struct sctp_association *sctp_association_init(struct sctp_association *asoc, + const struct sctp_endpoint *ep, + const struct sock *sk, + sctp_scope_t scope, + gfp_t gfp) +{ + struct sctp_sock *sp; + int i; + sctp_paramhdr_t *p; + int err; + + /* Retrieve the SCTP per socket area. */ + sp = sctp_sk((struct sock *)sk); + + /* Discarding const is appropriate here. */ + asoc->ep = (struct sctp_endpoint *)ep; + sctp_endpoint_hold(asoc->ep); + + /* Hold the sock. */ + asoc->base.sk = (struct sock *)sk; + sock_hold(asoc->base.sk); + + /* Initialize the common base substructure. */ + asoc->base.type = SCTP_EP_TYPE_ASSOCIATION; + + /* Initialize the object handling fields. */ + atomic_set(&asoc->base.refcnt, 1); + asoc->base.dead = 0; + asoc->base.malloced = 0; + + /* Initialize the bind addr area. */ + sctp_bind_addr_init(&asoc->base.bind_addr, ep->base.bind_addr.port); + + asoc->state = SCTP_STATE_CLOSED; + + /* Set these values from the socket values, a conversion between + * millsecons to seconds/microseconds must also be done. + */ + asoc->cookie_life.tv_sec = sp->assocparams.sasoc_cookie_life / 1000; + asoc->cookie_life.tv_usec = (sp->assocparams.sasoc_cookie_life % 1000) + * 1000; + asoc->frag_point = 0; + asoc->user_frag = sp->user_frag; + + /* Set the association max_retrans and RTO values from the + * socket values. + */ + asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt; + asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial); + asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max); + asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min); + + asoc->overall_error_count = 0; + + /* Initialize the association's heartbeat interval based on the + * sock configured value. + */ + asoc->hbinterval = msecs_to_jiffies(sp->hbinterval); + + /* Initialize path max retrans value. */ + asoc->pathmaxrxt = sp->pathmaxrxt; + + /* Initialize default path MTU. */ + asoc->pathmtu = sp->pathmtu; + + /* Set association default SACK delay */ + asoc->sackdelay = msecs_to_jiffies(sp->sackdelay); + asoc->sackfreq = sp->sackfreq; + + /* Set the association default flags controlling + * Heartbeat, SACK delay, and Path MTU Discovery. + */ + asoc->param_flags = sp->param_flags; + + /* Initialize the maximum mumber of new data packets that can be sent + * in a burst. + */ + asoc->max_burst = sp->max_burst; + + /* initialize association timers */ + asoc->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = asoc->rto_initial; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = 0; + + /* sctpimpguide Section 2.12.2 + * If the 'T5-shutdown-guard' timer is used, it SHOULD be set to the + * recommended value of 5 times 'RTO.Max'. + */ + asoc->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] + = 5 * asoc->rto_max; + + asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0; + asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay; + asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = + min_t(unsigned long, sp->autoclose, sctp_max_autoclose) * HZ; + + /* Initializes the timers */ + for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) + setup_timer(&asoc->timers[i], sctp_timer_events[i], + (unsigned long)asoc); + + /* Pull default initialization values from the sock options. + * Note: This assumes that the values have already been + * validated in the sock. 
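The cookie_life assignment above splits a millisecond tunable into a whole-seconds part and a leftover-microseconds part. A small standalone sketch of the same arithmetic, with 60000 ms used purely as an example input:

#include <stdio.h>

struct cookie_life {
        long sec;
        long usec;
};

/* Split a millisecond count into whole seconds plus remaining microseconds,
 * mirroring the tv_sec/tv_usec assignment above. */
static struct cookie_life split_msecs(unsigned long msecs)
{
        struct cookie_life cl;

        cl.sec  = msecs / 1000;
        cl.usec = (msecs % 1000) * 1000;
        return cl;
}

int main(void)
{
        struct cookie_life cl = split_msecs(60000);     /* example: 60000 ms */

        printf("60000 ms -> %ld s + %ld us\n", cl.sec, cl.usec);  /* 60 s + 0 us */
        return 0;
}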
+ */ + asoc->c.sinit_max_instreams = sp->initmsg.sinit_max_instreams; + asoc->c.sinit_num_ostreams = sp->initmsg.sinit_num_ostreams; + asoc->max_init_attempts = sp->initmsg.sinit_max_attempts; + + asoc->max_init_timeo = + msecs_to_jiffies(sp->initmsg.sinit_max_init_timeo); + + /* Allocate storage for the ssnmap after the inbound and outbound + * streams have been negotiated during Init. + */ + asoc->ssnmap = NULL; + + /* Set the local window size for receive. + * This is also the rcvbuf space per association. + * RFC 6 - A SCTP receiver MUST be able to receive a minimum of + * 1500 bytes in one SCTP packet. + */ + if ((sk->sk_rcvbuf/2) < SCTP_DEFAULT_MINWINDOW) + asoc->rwnd = SCTP_DEFAULT_MINWINDOW; + else + asoc->rwnd = sk->sk_rcvbuf/2; + + asoc->a_rwnd = asoc->rwnd; + + asoc->rwnd_over = 0; + asoc->rwnd_press = 0; + + /* Use my own max window until I learn something better. */ + asoc->peer.rwnd = SCTP_DEFAULT_MAXWINDOW; + + /* Set the sndbuf size for transmit. */ + asoc->sndbuf_used = 0; + + /* Initialize the receive memory counter */ + atomic_set(&asoc->rmem_alloc, 0); + + init_waitqueue_head(&asoc->wait); + + asoc->c.my_vtag = sctp_generate_tag(ep); + asoc->peer.i.init_tag = 0; /* INIT needs a vtag of 0. */ + asoc->c.peer_vtag = 0; + asoc->c.my_ttag = 0; + asoc->c.peer_ttag = 0; + asoc->c.my_port = ep->base.bind_addr.port; + + asoc->c.initial_tsn = sctp_generate_tsn(ep); + + asoc->next_tsn = asoc->c.initial_tsn; + + asoc->ctsn_ack_point = asoc->next_tsn - 1; + asoc->adv_peer_ack_point = asoc->ctsn_ack_point; + asoc->highest_sacked = asoc->ctsn_ack_point; + asoc->last_cwr_tsn = asoc->ctsn_ack_point; + asoc->unack_data = 0; + + /* ADDIP Section 4.1 Asconf Chunk Procedures + * + * When an endpoint has an ASCONF signaled change to be sent to the + * remote endpoint it should do the following: + * ... + * A2) a serial number should be assigned to the chunk. The serial + * number SHOULD be a monotonically increasing number. The serial + * numbers SHOULD be initialized at the start of the + * association to the same value as the initial TSN. + */ + asoc->addip_serial = asoc->c.initial_tsn; + + INIT_LIST_HEAD(&asoc->addip_chunk_list); + INIT_LIST_HEAD(&asoc->asconf_ack_list); + + /* Make an empty list of remote transport addresses. */ + INIT_LIST_HEAD(&asoc->peer.transport_addr_list); + asoc->peer.transport_count = 0; + + /* RFC 2960 5.1 Normal Establishment of an Association + * + * After the reception of the first data chunk in an + * association the endpoint must immediately respond with a + * sack to acknowledge the data chunk. Subsequent + * acknowledgements should be done as described in Section + * 6.2. + * + * [We implement this by telling a new association that it + * already received one packet.] + */ + asoc->peer.sack_needed = 1; + asoc->peer.sack_cnt = 0; + + /* Assume that the peer will tell us if he recognizes ASCONF + * as part of INIT exchange. + * The sctp_addip_noauth option is there for backward compatibilty + * and will revert old behavior. + */ + asoc->peer.asconf_capable = 0; + if (sctp_addip_noauth) + asoc->peer.asconf_capable = 1; + + /* Create an input queue. */ + sctp_inq_init(&asoc->base.inqueue); + sctp_inq_set_th_handler(&asoc->base.inqueue, sctp_assoc_bh_rcv); + + /* Create an output queue. 
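The rwnd initialisation above is simply "half of the socket receive buffer, but never below the minimum window". The 1500-byte floor in the standalone restatement below mirrors the comment in the code; the buffer sizes fed to it are only example values.

#include <stdio.h>

#define MIN_WINDOW 1500         /* "MUST be able to receive a minimum of 1500 bytes" */

/* Half of the receive buffer, clamped so the advertised window never
 * starts below the minimum. */
static unsigned int initial_rwnd(unsigned int sk_rcvbuf)
{
        unsigned int half = sk_rcvbuf / 2;

        return half < MIN_WINDOW ? MIN_WINDOW : half;
}

int main(void)
{
        printf("%u\n", initial_rwnd(212992));   /* large buffer -> 106496 */
        printf("%u\n", initial_rwnd(2048));     /* tiny buffer  -> clamped to 1500 */
        return 0;
}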
*/ + sctp_outq_init(asoc, &asoc->outqueue); + + if (!sctp_ulpq_init(&asoc->ulpq, asoc)) + goto fail_init; + + memset(&asoc->peer.tsn_map, 0, sizeof(struct sctp_tsnmap)); + + asoc->need_ecne = 0; + + asoc->assoc_id = 0; + + /* Assume that peer would support both address types unless we are + * told otherwise. + */ + asoc->peer.ipv4_address = 1; + if (asoc->base.sk->sk_family == PF_INET6) + asoc->peer.ipv6_address = 1; + INIT_LIST_HEAD(&asoc->asocs); + + asoc->autoclose = sp->autoclose; + + asoc->default_stream = sp->default_stream; + asoc->default_ppid = sp->default_ppid; + asoc->default_flags = sp->default_flags; + asoc->default_context = sp->default_context; + asoc->default_timetolive = sp->default_timetolive; + asoc->default_rcv_context = sp->default_rcv_context; + + /* AUTH related initializations */ + INIT_LIST_HEAD(&asoc->endpoint_shared_keys); + err = sctp_auth_asoc_copy_shkeys(ep, asoc, gfp); + if (err) + goto fail_init; + + asoc->active_key_id = ep->active_key_id; + asoc->asoc_shared_key = NULL; + + asoc->default_hmac_id = 0; + /* Save the hmacs and chunks list into this association */ + if (ep->auth_hmacs_list) + memcpy(asoc->c.auth_hmacs, ep->auth_hmacs_list, + ntohs(ep->auth_hmacs_list->param_hdr.length)); + if (ep->auth_chunk_list) + memcpy(asoc->c.auth_chunks, ep->auth_chunk_list, + ntohs(ep->auth_chunk_list->param_hdr.length)); + + /* Get the AUTH random number for this association */ + p = (sctp_paramhdr_t *)asoc->c.auth_random; + p->type = SCTP_PARAM_RANDOM; + p->length = htons(sizeof(sctp_paramhdr_t) + SCTP_AUTH_RANDOM_LENGTH); + get_random_bytes(p+1, SCTP_AUTH_RANDOM_LENGTH); + + return asoc; + +fail_init: + sctp_endpoint_put(asoc->ep); + sock_put(asoc->base.sk); + return NULL; +} + +/* Allocate and initialize a new association */ +struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep, + const struct sock *sk, + sctp_scope_t scope, + gfp_t gfp) +{ + struct sctp_association *asoc; + + asoc = t_new(struct sctp_association, gfp); + if (!asoc) + goto fail; + + if (!sctp_association_init(asoc, ep, sk, scope, gfp)) + goto fail_init; + + asoc->base.malloced = 1; + SCTP_DBG_OBJCNT_INC(assoc); + SCTP_DEBUG_PRINTK("Created asoc %p\n", asoc); + + return asoc; + +fail_init: + kfree(asoc); +fail: + return NULL; +} + +/* Free this association if possible. There may still be users, so + * the actual deallocation may be delayed. + */ +void sctp_association_free(struct sctp_association *asoc) +{ + struct sock *sk = asoc->base.sk; + struct sctp_transport *transport; + struct list_head *pos, *temp; + int i; + + /* Only real associations count against the endpoint, so + * don't bother for if this is a temporary association. + */ + if (!asoc->temp) { + list_del(&asoc->asocs); + + /* Decrement the backlog value for a TCP-style listening + * socket. + */ + if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) + sk->sk_ack_backlog--; + } + + /* Mark as dead, so other users can know this structure is + * going away. + */ + asoc->base.dead = 1; + + /* Dispose of any data lying around in the outqueue. */ + sctp_outq_free(&asoc->outqueue); + + /* Dispose of any pending messages for the upper layer. */ + sctp_ulpq_free(&asoc->ulpq); + + /* Dispose of any pending chunks on the inqueue. */ + sctp_inq_free(&asoc->base.inqueue); + + sctp_tsnmap_free(&asoc->peer.tsn_map); + + /* Free ssnmap storage. */ + sctp_ssnmap_free(asoc->ssnmap); + + /* Clean up the bound address list. 
*/ + sctp_bind_addr_free(&asoc->base.bind_addr); + + /* Do we need to go through all of our timers and + * delete them? To be safe we will try to delete all, but we + * should be able to go through and make a guess based + * on our state. + */ + for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) { + if (timer_pending(&asoc->timers[i]) && + del_timer(&asoc->timers[i])) + sctp_association_put(asoc); + } + + /* Free peer's cached cookie. */ + kfree(asoc->peer.cookie); + kfree(asoc->peer.peer_random); + kfree(asoc->peer.peer_chunks); + kfree(asoc->peer.peer_hmacs); + + /* Release the transport structures. */ + list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { + transport = list_entry(pos, struct sctp_transport, transports); + list_del(pos); + sctp_transport_free(transport); + } + + asoc->peer.transport_count = 0; + + sctp_asconf_queue_teardown(asoc); + + /* AUTH - Free the endpoint shared keys */ + sctp_auth_destroy_keys(&asoc->endpoint_shared_keys); + + /* AUTH - Free the association shared key */ + sctp_auth_key_put(asoc->asoc_shared_key); + + sctp_association_put(asoc); +} + +/* Cleanup and free up an association. */ +static void sctp_association_destroy(struct sctp_association *asoc) +{ + SCTP_ASSERT(asoc->base.dead, "Assoc is not dead", return); + + sctp_endpoint_put(asoc->ep); + sock_put(asoc->base.sk); + + if (asoc->assoc_id != 0) { + spin_lock_bh(&sctp_assocs_id_lock); + idr_remove(&sctp_assocs_id, asoc->assoc_id); + spin_unlock_bh(&sctp_assocs_id_lock); + } + + WARN_ON(atomic_read(&asoc->rmem_alloc)); + + if (asoc->base.malloced) { + kfree(asoc); + SCTP_DBG_OBJCNT_DEC(assoc); + } +} + +/* Change the primary destination address for the peer. */ +void sctp_assoc_set_primary(struct sctp_association *asoc, + struct sctp_transport *transport) +{ + int changeover = 0; + + /* it's a changeover only if we already have a primary path + * that we are changing + */ + if (asoc->peer.primary_path != NULL && + asoc->peer.primary_path != transport) + changeover = 1 ; + + asoc->peer.primary_path = transport; + + /* Set a default msg_name for events. */ + memcpy(&asoc->peer.primary_addr, &transport->ipaddr, + sizeof(union sctp_addr)); + + /* If the primary path is changing, assume that the + * user wants to use this new path. + */ + if ((transport->state == SCTP_ACTIVE) || + (transport->state == SCTP_UNKNOWN)) + asoc->peer.active_path = transport; + + /* + * SFR-CACC algorithm: + * Upon the receipt of a request to change the primary + * destination address, on the data structure for the new + * primary destination, the sender MUST do the following: + * + * 1) If CHANGEOVER_ACTIVE is set, then there was a switch + * to this destination address earlier. The sender MUST set + * CYCLING_CHANGEOVER to indicate that this switch is a + * double switch to the same destination address. + * + * Really, only bother is we have data queued or outstanding on + * the association. + */ + if (!asoc->outqueue.outstanding_bytes && !asoc->outqueue.out_qlen) + return; + + if (transport->cacc.changeover_active) + transport->cacc.cycling_changeover = changeover; + + /* 2) The sender MUST set CHANGEOVER_ACTIVE to indicate that + * a changeover has occurred. + */ + transport->cacc.changeover_active = changeover; + + /* 3) The sender MUST store the next TSN to be sent in + * next_tsn_at_change. + */ + transport->cacc.next_tsn_at_change = asoc->next_tsn; +} + +/* Remove a transport from an association. 
*/ +void sctp_assoc_rm_peer(struct sctp_association *asoc, + struct sctp_transport *peer) +{ + struct list_head *pos; + struct sctp_transport *transport; + + SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_rm_peer:association %p addr: ", + " port: %d\n", + asoc, + (&peer->ipaddr), + ntohs(peer->ipaddr.v4.sin_port)); + + /* If we are to remove the current retran_path, update it + * to the next peer before removing this peer from the list. + */ + if (asoc->peer.retran_path == peer) + sctp_assoc_update_retran_path(asoc); + + /* Remove this peer from the list. */ + list_del(&peer->transports); + + /* Get the first transport of asoc. */ + pos = asoc->peer.transport_addr_list.next; + transport = list_entry(pos, struct sctp_transport, transports); + + /* Update any entries that match the peer to be deleted. */ + if (asoc->peer.primary_path == peer) + sctp_assoc_set_primary(asoc, transport); + if (asoc->peer.active_path == peer) + asoc->peer.active_path = transport; + if (asoc->peer.retran_path == peer) + asoc->peer.retran_path = transport; + if (asoc->peer.last_data_from == peer) + asoc->peer.last_data_from = transport; + + /* If we remove the transport an INIT was last sent to, set it to + * NULL. Combined with the update of the retran path above, this + * will cause the next INIT to be sent to the next available + * transport, maintaining the cycle. + */ + if (asoc->init_last_sent_to == peer) + asoc->init_last_sent_to = NULL; + + /* If we remove the transport an SHUTDOWN was last sent to, set it + * to NULL. Combined with the update of the retran path above, this + * will cause the next SHUTDOWN to be sent to the next available + * transport, maintaining the cycle. + */ + if (asoc->shutdown_last_sent_to == peer) + asoc->shutdown_last_sent_to = NULL; + + /* If we remove the transport an ASCONF was last sent to, set it to + * NULL. + */ + if (asoc->addip_last_asconf && + asoc->addip_last_asconf->transport == peer) + asoc->addip_last_asconf->transport = NULL; + + /* If we have something on the transmitted list, we have to + * save it off. The best place is the active path. + */ + if (!list_empty(&peer->transmitted)) { + struct sctp_transport *active = asoc->peer.active_path; + struct sctp_chunk *ch; + + /* Reset the transport of each chunk on this list */ + list_for_each_entry(ch, &peer->transmitted, + transmitted_list) { + ch->transport = NULL; + ch->rtt_in_progress = 0; + } + + list_splice_tail_init(&peer->transmitted, + &active->transmitted); + + /* Start a T3 timer here in case it wasn't running so + * that these migrated packets have a chance to get + * retrnasmitted. + */ + if (!timer_pending(&active->T3_rtx_timer)) + if (!mod_timer(&active->T3_rtx_timer, + jiffies + active->rto)) + sctp_transport_hold(active); + } + + asoc->peer.transport_count--; + + sctp_transport_free(peer); +} + +/* Add a transport address to an association. */ +struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, + const union sctp_addr *addr, + const gfp_t gfp, + const int peer_state) +{ + struct sctp_transport *peer; + struct sctp_sock *sp; + unsigned short port; + + sp = sctp_sk(asoc->base.sk); + + /* AF_INET and AF_INET6 share common port field. */ + port = ntohs(addr->v4.sin_port); + + SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_add_peer:association %p addr: ", + " port: %d state:%d\n", + asoc, + addr, + port, + peer_state); + + /* Set the port if it has not been set yet. */ + if (0 == asoc->peer.port) + asoc->peer.port = port; + + /* Check to see if this is a duplicate. 
*/ + peer = sctp_assoc_lookup_paddr(asoc, addr); + if (peer) { + /* An UNKNOWN state is only set on transports added by + * user in sctp_connectx() call. Such transports should be + * considered CONFIRMED per RFC 4960, Section 5.4. + */ + if (peer->state == SCTP_UNKNOWN) { + peer->state = SCTP_ACTIVE; + } + return peer; + } + + peer = sctp_transport_new(addr, gfp); + if (!peer) + return NULL; + + sctp_transport_set_owner(peer, asoc); + + /* Initialize the peer's heartbeat interval based on the + * association configured value. + */ + peer->hbinterval = asoc->hbinterval; + + /* Set the path max_retrans. */ + peer->pathmaxrxt = asoc->pathmaxrxt; + + /* Initialize the peer's SACK delay timeout based on the + * association configured value. + */ + peer->sackdelay = asoc->sackdelay; + peer->sackfreq = asoc->sackfreq; + + /* Enable/disable heartbeat, SACK delay, and path MTU discovery + * based on association setting. + */ + peer->param_flags = asoc->param_flags; + + sctp_transport_route(peer, NULL, sp); + + /* Initialize the pmtu of the transport. */ + if (peer->param_flags & SPP_PMTUD_DISABLE) { + if (asoc->pathmtu) + peer->pathmtu = asoc->pathmtu; + else + peer->pathmtu = SCTP_DEFAULT_MAXSEGMENT; + } + + /* If this is the first transport addr on this association, + * initialize the association PMTU to the peer's PMTU. + * If not and the current association PMTU is higher than the new + * peer's PMTU, reset the association PMTU to the new peer's PMTU. + */ + if (asoc->pathmtu) + asoc->pathmtu = min_t(int, peer->pathmtu, asoc->pathmtu); + else + asoc->pathmtu = peer->pathmtu; + + SCTP_DEBUG_PRINTK("sctp_assoc_add_peer:association %p PMTU set to " + "%d\n", asoc, asoc->pathmtu); + peer->pmtu_pending = 0; + + asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu); + + /* The asoc->peer.port might not be meaningful yet, but + * initialize the packet structure anyway. + */ + sctp_packet_init(&peer->packet, peer, asoc->base.bind_addr.port, + asoc->peer.port); + + /* 7.2.1 Slow-Start + * + * o The initial cwnd before DATA transmission or after a sufficiently + * long idle period MUST be set to + * min(4*MTU, max(2*MTU, 4380 bytes)) + * + * o The initial value of ssthresh MAY be arbitrarily high + * (for example, implementations MAY use the size of the + * receiver advertised window). + */ + peer->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380)); + + /* At this point, we may not have the receiver's advertised window, + * so initialize ssthresh to the default value and it will be set + * later when we process the INIT. + */ + peer->ssthresh = SCTP_DEFAULT_MAXWINDOW; + + peer->partial_bytes_acked = 0; + peer->flight_size = 0; + peer->burst_limited = 0; + + /* Set the transport's RTO.initial value */ + peer->rto = asoc->rto_initial; + + /* Set the peer's active state. */ + peer->state = peer_state; + + /* Attach the remote transport to our asoc. */ + list_add_tail(&peer->transports, &asoc->peer.transport_addr_list); + asoc->peer.transport_count++; + + /* If we do not yet have a primary path, set one. */ + if (!asoc->peer.primary_path) { + sctp_assoc_set_primary(asoc, peer); + asoc->peer.retran_path = peer; + } + + if (asoc->peer.active_path == asoc->peer.retran_path && + peer->state != SCTP_UNCONFIRMED) { + asoc->peer.retran_path = peer; + } + + return peer; +} + +/* Delete a transport address from an association. 
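The initial congestion window given to a new peer above follows the quoted slow-start rule, min(4*MTU, max(2*MTU, 4380 bytes)). The standalone sketch below just evaluates that expression for a few illustrative path MTUs:

#include <stdio.h>

static unsigned int max_u(unsigned int a, unsigned int b) { return a > b ? a : b; }
static unsigned int min_u(unsigned int a, unsigned int b) { return a < b ? a : b; }

/* Initial congestion window: min(4*MTU, max(2*MTU, 4380 bytes)). */
static unsigned int initial_cwnd(unsigned int pmtu)
{
        return min_u(4 * pmtu, max_u(2 * pmtu, 4380));
}

int main(void)
{
        printf("pmtu 1500 -> cwnd %u\n", initial_cwnd(1500));   /* 4380  */
        printf("pmtu  512 -> cwnd %u\n", initial_cwnd(512));    /* 2048  */
        printf("pmtu 9000 -> cwnd %u\n", initial_cwnd(9000));   /* 18000 */
        return 0;
}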
*/ +void sctp_assoc_del_peer(struct sctp_association *asoc, + const union sctp_addr *addr) +{ + struct list_head *pos; + struct list_head *temp; + struct sctp_transport *transport; + + list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { + transport = list_entry(pos, struct sctp_transport, transports); + if (sctp_cmp_addr_exact(addr, &transport->ipaddr)) { + /* Do book keeping for removing the peer and free it. */ + sctp_assoc_rm_peer(asoc, transport); + break; + } + } +} + +/* Lookup a transport by address. */ +struct sctp_transport *sctp_assoc_lookup_paddr( + const struct sctp_association *asoc, + const union sctp_addr *address) +{ + struct sctp_transport *t; + + /* Cycle through all transports searching for a peer address. */ + + list_for_each_entry(t, &asoc->peer.transport_addr_list, + transports) { + if (sctp_cmp_addr_exact(address, &t->ipaddr)) + return t; + } + + return NULL; +} + +/* Remove all transports except a give one */ +void sctp_assoc_del_nonprimary_peers(struct sctp_association *asoc, + struct sctp_transport *primary) +{ + struct sctp_transport *temp; + struct sctp_transport *t; + + list_for_each_entry_safe(t, temp, &asoc->peer.transport_addr_list, + transports) { + /* if the current transport is not the primary one, delete it */ + if (t != primary) + sctp_assoc_rm_peer(asoc, t); + } +} + +/* Engage in transport control operations. + * Mark the transport up or down and send a notification to the user. + * Select and update the new active and retran paths. + */ +void sctp_assoc_control_transport(struct sctp_association *asoc, + struct sctp_transport *transport, + sctp_transport_cmd_t command, + sctp_sn_error_t error) +{ + struct sctp_transport *t = NULL; + struct sctp_transport *first; + struct sctp_transport *second; + struct sctp_ulpevent *event; + struct sockaddr_storage addr; + int spc_state = 0; + + /* Record the transition on the transport. */ + switch (command) { + case SCTP_TRANSPORT_UP: + /* If we are moving from UNCONFIRMED state due + * to heartbeat success, report the SCTP_ADDR_CONFIRMED + * state to the user, otherwise report SCTP_ADDR_AVAILABLE. + */ + if (SCTP_UNCONFIRMED == transport->state && + SCTP_HEARTBEAT_SUCCESS == error) + spc_state = SCTP_ADDR_CONFIRMED; + else + spc_state = SCTP_ADDR_AVAILABLE; + transport->state = SCTP_ACTIVE; + break; + + case SCTP_TRANSPORT_DOWN: + /* If the transport was never confirmed, do not transition it + * to inactive state. Also, release the cached route since + * there may be a better route next time. + */ + if (transport->state != SCTP_UNCONFIRMED) + transport->state = SCTP_INACTIVE; + else { + dst_release(transport->dst); + transport->dst = NULL; + } + + spc_state = SCTP_ADDR_UNREACHABLE; + break; + + default: + return; + } + + /* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the + * user. + */ + memset(&addr, 0, sizeof(struct sockaddr_storage)); + memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len); + event = sctp_ulpevent_make_peer_addr_change(asoc, &addr, + 0, spc_state, error, GFP_ATOMIC); + if (event) + sctp_ulpq_tail_event(&asoc->ulpq, event); + + /* Select new active and retran paths. */ + + /* Look for the two most recently used active transports. + * + * This code produces the wrong ordering whenever jiffies + * rolls over, but we still get usable transports, so we don't + * worry about it. 
+ */ + first = NULL; second = NULL; + + list_for_each_entry(t, &asoc->peer.transport_addr_list, + transports) { + + if ((t->state == SCTP_INACTIVE) || + (t->state == SCTP_UNCONFIRMED)) + continue; + if (!first || t->last_time_heard > first->last_time_heard) { + second = first; + first = t; + } + if (!second || t->last_time_heard > second->last_time_heard) + second = t; + } + + /* RFC 2960 6.4 Multi-Homed SCTP Endpoints + * + * By default, an endpoint should always transmit to the + * primary path, unless the SCTP user explicitly specifies the + * destination transport address (and possibly source + * transport address) to use. + * + * [If the primary is active but not most recent, bump the most + * recently used transport.] + */ + if (((asoc->peer.primary_path->state == SCTP_ACTIVE) || + (asoc->peer.primary_path->state == SCTP_UNKNOWN)) && + first != asoc->peer.primary_path) { + second = first; + first = asoc->peer.primary_path; + } + + /* If we failed to find a usable transport, just camp on the + * primary, even if it is inactive. + */ + if (!first) { + first = asoc->peer.primary_path; + second = asoc->peer.primary_path; + } + + /* Set the active and retran transports. */ + asoc->peer.active_path = first; + asoc->peer.retran_path = second; +} + +/* Hold a reference to an association. */ +void sctp_association_hold(struct sctp_association *asoc) +{ + atomic_inc(&asoc->base.refcnt); +} + +/* Release a reference to an association and cleanup + * if there are no more references. + */ +void sctp_association_put(struct sctp_association *asoc) +{ + if (atomic_dec_and_test(&asoc->base.refcnt)) + sctp_association_destroy(asoc); +} + +/* Allocate the next TSN, Transmission Sequence Number, for the given + * association. + */ +__u32 sctp_association_get_next_tsn(struct sctp_association *asoc) +{ + /* From Section 1.6 Serial Number Arithmetic: + * Transmission Sequence Numbers wrap around when they reach + * 2**32 - 1. That is, the next TSN a DATA chunk MUST use + * after transmitting TSN = 2*32 - 1 is TSN = 0. + */ + __u32 retval = asoc->next_tsn; + asoc->next_tsn++; + asoc->unack_data++; + + return retval; +} + +/* Compare two addresses to see if they match. Wildcard addresses + * only match themselves. + */ +int sctp_cmp_addr_exact(const union sctp_addr *ss1, + const union sctp_addr *ss2) +{ + struct sctp_af *af; + + af = sctp_get_af_specific(ss1->sa.sa_family); + if (unlikely(!af)) + return 0; + + return af->cmp_addr(ss1, ss2); +} + +/* Return an ecne chunk to get prepended to a packet. + * Note: We are sly and return a shared, prealloced chunk. FIXME: + * No we don't, but we could/should. + */ +struct sctp_chunk *sctp_get_ecne_prepend(struct sctp_association *asoc) +{ + struct sctp_chunk *chunk; + + /* Send ECNE if needed. + * Not being able to allocate a chunk here is not deadly. + */ + if (asoc->need_ecne) + chunk = sctp_make_ecne(asoc, asoc->last_ecne_tsn); + else + chunk = NULL; + + return chunk; +} + +/* + * Find which transport this TSN was sent on. + */ +struct sctp_transport *sctp_assoc_lookup_tsn(struct sctp_association *asoc, + __u32 tsn) +{ + struct sctp_transport *active; + struct sctp_transport *match; + struct sctp_transport *transport; + struct sctp_chunk *chunk; + __be32 key = htonl(tsn); + + match = NULL; + + /* + * FIXME: In general, find a more efficient data structure for + * searching. + */ + + /* + * The general strategy is to search each transport's transmitted + * list. Return which transport this TSN lives on. + * + * Let's be hopeful and check the active_path first. 
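sctp_association_get_next_tsn() above relies on ordinary 32-bit unsigned wrap-around to implement the serial-number rule in its comment: the TSN used after 2^32 - 1 is 0. A tiny standalone demonstration of that wrap:

#include <stdio.h>
#include <stdint.h>

/* Hand out the next TSN; uint32_t arithmetic wraps from 2^32 - 1 to 0. */
static uint32_t get_next_tsn(uint32_t *next_tsn)
{
        return (*next_tsn)++;
}

int main(void)
{
        uint32_t next_tsn = 0xfffffffeu;        /* start two TSNs before the wrap */

        printf("%u\n", get_next_tsn(&next_tsn));        /* 4294967294 */
        printf("%u\n", get_next_tsn(&next_tsn));        /* 4294967295 */
        printf("%u\n", get_next_tsn(&next_tsn));        /* 0          */
        return 0;
}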
+ * Another optimization would be to know if there is only one + * outbound path and not have to look for the TSN at all. + * + */ + + active = asoc->peer.active_path; + + list_for_each_entry(chunk, &active->transmitted, + transmitted_list) { + + if (key == chunk->subh.data_hdr->tsn) { + match = active; + goto out; + } + } + + /* If not found, go search all the other transports. */ + list_for_each_entry(transport, &asoc->peer.transport_addr_list, + transports) { + + if (transport == active) + break; + list_for_each_entry(chunk, &transport->transmitted, + transmitted_list) { + if (key == chunk->subh.data_hdr->tsn) { + match = transport; + goto out; + } + } + } +out: + return match; +} + +/* Is this the association we are looking for? */ +struct sctp_transport *sctp_assoc_is_match(struct sctp_association *asoc, + const union sctp_addr *laddr, + const union sctp_addr *paddr) +{ + struct sctp_transport *transport; + + if ((htons(asoc->base.bind_addr.port) == laddr->v4.sin_port) && + (htons(asoc->peer.port) == paddr->v4.sin_port)) { + transport = sctp_assoc_lookup_paddr(asoc, paddr); + if (!transport) + goto out; + + if (sctp_bind_addr_match(&asoc->base.bind_addr, laddr, + sctp_sk(asoc->base.sk))) + goto out; + } + transport = NULL; + +out: + return transport; +} + +/* Do delayed input processing. This is scheduled by sctp_rcv(). */ +static void sctp_assoc_bh_rcv(struct work_struct *work) +{ + struct sctp_association *asoc = + container_of(work, struct sctp_association, + base.inqueue.immediate); + struct sctp_endpoint *ep; + struct sctp_chunk *chunk; + struct sctp_inq *inqueue; + int state; + sctp_subtype_t subtype; + int error = 0; + + /* The association should be held so we should be safe. */ + ep = asoc->ep; + + inqueue = &asoc->base.inqueue; + sctp_association_hold(asoc); + while (NULL != (chunk = sctp_inq_pop(inqueue))) { + state = asoc->state; + subtype = SCTP_ST_CHUNK(chunk->chunk_hdr->type); + + /* SCTP-AUTH, Section 6.3: + * The receiver has a list of chunk types which it expects + * to be received only after an AUTH-chunk. This list has + * been sent to the peer during the association setup. It + * MUST silently discard these chunks if they are not placed + * after an AUTH chunk in the packet. + */ + if (sctp_auth_recv_cid(subtype.chunk, asoc) && !chunk->auth) + continue; + + /* Remember where the last DATA chunk came from so we + * know where to send the SACK. + */ + if (sctp_chunk_is_data(chunk)) + asoc->peer.last_data_from = chunk->transport; + else + SCTP_INC_STATS(SCTP_MIB_INCTRLCHUNKS); + + if (chunk->transport) + chunk->transport->last_time_heard = jiffies; + + /* Run through the state machine. */ + error = sctp_do_sm(SCTP_EVENT_T_CHUNK, subtype, + state, ep, asoc, chunk, GFP_ATOMIC); + + /* Check to see if the association is freed in response to + * the incoming chunk. If so, get out of the while loop. + */ + if (asoc->base.dead) + break; + + /* If there is an error on chunk, discard this packet. */ + if (error && chunk) + chunk->pdiscard = 1; + } + sctp_association_put(asoc); +} + +/* This routine moves an association from its old sk to a new sk. */ +void sctp_assoc_migrate(struct sctp_association *assoc, struct sock *newsk) +{ + struct sctp_sock *newsp = sctp_sk(newsk); + struct sock *oldsk = assoc->base.sk; + + /* Delete the association from the old endpoint's list of + * associations. + */ + list_del_init(&assoc->asocs); + + /* Decrement the backlog value for a TCP-style socket. 
*/ + if (sctp_style(oldsk, TCP)) + oldsk->sk_ack_backlog--; + + /* Release references to the old endpoint and the sock. */ + sctp_endpoint_put(assoc->ep); + sock_put(assoc->base.sk); + + /* Get a reference to the new endpoint. */ + assoc->ep = newsp->ep; + sctp_endpoint_hold(assoc->ep); + + /* Get a reference to the new sock. */ + assoc->base.sk = newsk; + sock_hold(assoc->base.sk); + + /* Add the association to the new endpoint's list of associations. */ + sctp_endpoint_add_asoc(newsp->ep, assoc); +} + +/* Update an association (possibly from unexpected COOKIE-ECHO processing). */ +void sctp_assoc_update(struct sctp_association *asoc, + struct sctp_association *new) +{ + struct sctp_transport *trans; + struct list_head *pos, *temp; + + /* Copy in new parameters of peer. */ + asoc->c = new->c; + asoc->peer.rwnd = new->peer.rwnd; + asoc->peer.sack_needed = new->peer.sack_needed; + asoc->peer.i = new->peer.i; + sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, + asoc->peer.i.initial_tsn, GFP_ATOMIC); + + /* Remove any peer addresses not present in the new association. */ + list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { + trans = list_entry(pos, struct sctp_transport, transports); + if (!sctp_assoc_lookup_paddr(new, &trans->ipaddr)) { + sctp_assoc_rm_peer(asoc, trans); + continue; + } + + if (asoc->state >= SCTP_STATE_ESTABLISHED) + sctp_transport_reset(trans); + } + + /* If the case is A (association restart), use + * initial_tsn as next_tsn. If the case is B, use + * current next_tsn in case data sent to peer + * has been discarded and needs retransmission. + */ + if (asoc->state >= SCTP_STATE_ESTABLISHED) { + asoc->next_tsn = new->next_tsn; + asoc->ctsn_ack_point = new->ctsn_ack_point; + asoc->adv_peer_ack_point = new->adv_peer_ack_point; + + /* Reinitialize SSN for both local streams + * and peer's streams. + */ + sctp_ssnmap_clear(asoc->ssnmap); + + /* Flush the ULP reassembly and ordered queue. + * Any data there will now be stale and will + * cause problems. + */ + sctp_ulpq_flush(&asoc->ulpq); + + /* reset the overall association error count so + * that the restarted association doesn't get torn + * down on the next retransmission timer. + */ + asoc->overall_error_count = 0; + + } else { + /* Add any peer addresses from the new association. */ + list_for_each_entry(trans, &new->peer.transport_addr_list, + transports) { + if (!sctp_assoc_lookup_paddr(asoc, &trans->ipaddr)) + sctp_assoc_add_peer(asoc, &trans->ipaddr, + GFP_ATOMIC, trans->state); + } + + asoc->ctsn_ack_point = asoc->next_tsn - 1; + asoc->adv_peer_ack_point = asoc->ctsn_ack_point; + if (!asoc->ssnmap) { + /* Move the ssnmap. */ + asoc->ssnmap = new->ssnmap; + new->ssnmap = NULL; + } + + if (!asoc->assoc_id) { + /* get a new association id since we don't have one + * yet. + */ + sctp_assoc_set_id(asoc, GFP_ATOMIC); + } + } + + /* SCTP-AUTH: Save the peer parameters from the new assocaitions + * and also move the association shared keys over + */ + kfree(asoc->peer.peer_random); + asoc->peer.peer_random = new->peer.peer_random; + new->peer.peer_random = NULL; + + kfree(asoc->peer.peer_chunks); + asoc->peer.peer_chunks = new->peer.peer_chunks; + new->peer.peer_chunks = NULL; + + kfree(asoc->peer.peer_hmacs); + asoc->peer.peer_hmacs = new->peer.peer_hmacs; + new->peer.peer_hmacs = NULL; + + sctp_auth_key_put(asoc->asoc_shared_key); + sctp_auth_asoc_init_active_key(asoc, GFP_ATOMIC); +} + +/* Update the retran path for sending a retransmitted packet. 
+ * Round-robin through the active transports, else round-robin + * through the inactive transports as this is the next best thing + * we can try. + */ +void sctp_assoc_update_retran_path(struct sctp_association *asoc) +{ + struct sctp_transport *t, *next; + struct list_head *head = &asoc->peer.transport_addr_list; + struct list_head *pos; + + if (asoc->peer.transport_count == 1) + return; + + /* Find the next transport in a round-robin fashion. */ + t = asoc->peer.retran_path; + pos = &t->transports; + next = NULL; + + while (1) { + /* Skip the head. */ + if (pos->next == head) + pos = head->next; + else + pos = pos->next; + + t = list_entry(pos, struct sctp_transport, transports); + + /* We have exhausted the list, but didn't find any + * other active transports. If so, use the next + * transport. + */ + if (t == asoc->peer.retran_path) { + t = next; + break; + } + + /* Try to find an active transport. */ + + if ((t->state == SCTP_ACTIVE) || + (t->state == SCTP_UNKNOWN)) { + break; + } else { + /* Keep track of the next transport in case + * we don't find any active transport. + */ + if (t->state != SCTP_UNCONFIRMED && !next) + next = t; + } + } + + if (t) + asoc->peer.retran_path = t; + else + t = asoc->peer.retran_path; + + SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_update_retran_path:association" + " %p addr: ", + " port: %d\n", + asoc, + (&t->ipaddr), + ntohs(t->ipaddr.v4.sin_port)); +} + +/* Choose the transport for sending retransmit packet. */ +struct sctp_transport *sctp_assoc_choose_alter_transport( + struct sctp_association *asoc, struct sctp_transport *last_sent_to) +{ + /* If this is the first time packet is sent, use the active path, + * else use the retran path. If the last packet was sent over the + * retran path, update the retran path and use it. + */ + if (!last_sent_to) + return asoc->peer.active_path; + else { + if (last_sent_to == asoc->peer.retran_path) + sctp_assoc_update_retran_path(asoc); + return asoc->peer.retran_path; + } +} + +/* Update the association's pmtu and frag_point by going through all the + * transports. This routine is called when a transport's PMTU has changed. + */ +void sctp_assoc_sync_pmtu(struct sctp_association *asoc) +{ + struct sctp_transport *t; + __u32 pmtu = 0; + + if (!asoc) + return; + + /* Get the lowest pmtu of all the transports. */ + list_for_each_entry(t, &asoc->peer.transport_addr_list, + transports) { + if (t->pmtu_pending && t->dst) { + sctp_transport_update_pmtu(t, dst_mtu(t->dst)); + t->pmtu_pending = 0; + } + if (!pmtu || (t->pathmtu < pmtu)) + pmtu = t->pathmtu; + } + + if (pmtu) { + asoc->pathmtu = pmtu; + asoc->frag_point = sctp_frag_point(asoc, pmtu); + } + + SCTP_DEBUG_PRINTK("%s: asoc:%p, pmtu:%d, frag_point:%d\n", + __func__, asoc, asoc->pathmtu, asoc->frag_point); +} + +/* Should we send a SACK to update our peer? */ +static inline int sctp_peer_needs_update(struct sctp_association *asoc) +{ + switch (asoc->state) { + case SCTP_STATE_ESTABLISHED: + case SCTP_STATE_SHUTDOWN_PENDING: + case SCTP_STATE_SHUTDOWN_RECEIVED: + case SCTP_STATE_SHUTDOWN_SENT: + if ((asoc->rwnd > asoc->a_rwnd) && + ((asoc->rwnd - asoc->a_rwnd) >= max_t(__u32, + (asoc->base.sk->sk_rcvbuf >> sctp_rwnd_upd_shift), + asoc->pathmtu))) + return 1; + break; + default: + break; + } + return 0; +} + +/* Increase asoc's rwnd by len and send any window update SACK if needed. 
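The sctp_peer_needs_update() test above reduces to: only advertise a bigger window once it has grown, relative to the last advertised a_rwnd, by at least max(receive buffer >> shift, one path MTU). The standalone sketch below shows just that arithmetic; the shift of 4 and the buffer sizes are assumptions for illustration (the real shift is the code's sctp_rwnd_upd_shift value) and the association-state checks are omitted.

#include <stdio.h>
#include <stdint.h>

/* Worth sending a window-update SACK only if the window grew by at least
 * max(rcvbuf >> shift, pmtu) since the last advertised value. */
static int peer_needs_update(uint32_t rwnd, uint32_t a_rwnd,
                             uint32_t sk_rcvbuf, uint32_t pmtu,
                             unsigned int upd_shift)
{
        uint32_t min_growth = sk_rcvbuf >> upd_shift;

        if (min_growth < pmtu)
                min_growth = pmtu;

        return rwnd > a_rwnd && (rwnd - a_rwnd) >= min_growth;
}

int main(void)
{
        /* 64 KiB buffer with a shift of 4: growth must reach 4096 bytes. */
        printf("%d\n", peer_needs_update(20000, 17000, 65536, 1500, 4));  /* 0 */
        printf("%d\n", peer_needs_update(22000, 17000, 65536, 1500, 4));  /* 1 */
        return 0;
}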
*/ +void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned len) +{ + struct sctp_chunk *sack; + struct timer_list *timer; + + if (asoc->rwnd_over) { + if (asoc->rwnd_over >= len) { + asoc->rwnd_over -= len; + } else { + asoc->rwnd += (len - asoc->rwnd_over); + asoc->rwnd_over = 0; + } + } else { + asoc->rwnd += len; + } + + /* If we had window pressure, start recovering it + * once our rwnd had reached the accumulated pressure + * threshold. The idea is to recover slowly, but up + * to the initial advertised window. + */ + if (asoc->rwnd_press && asoc->rwnd >= asoc->rwnd_press) { + int change = min(asoc->pathmtu, asoc->rwnd_press); + asoc->rwnd += change; + asoc->rwnd_press -= change; + } + + SCTP_DEBUG_PRINTK("%s: asoc %p rwnd increased by %d to (%u, %u) " + "- %u\n", __func__, asoc, len, asoc->rwnd, + asoc->rwnd_over, asoc->a_rwnd); + + /* Send a window update SACK if the rwnd has increased by at least the + * minimum of the association's PMTU and half of the receive buffer. + * The algorithm used is similar to the one described in + * Section 4.2.3.3 of RFC 1122. + */ + if (sctp_peer_needs_update(asoc)) { + asoc->a_rwnd = asoc->rwnd; + SCTP_DEBUG_PRINTK("%s: Sending window update SACK- asoc: %p " + "rwnd: %u a_rwnd: %u\n", __func__, + asoc, asoc->rwnd, asoc->a_rwnd); + sack = sctp_make_sack(asoc); + if (!sack) + return; + + asoc->peer.sack_needed = 0; + + sctp_outq_tail(&asoc->outqueue, sack); + + /* Stop the SACK timer. */ + timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK]; + if (timer_pending(timer) && del_timer(timer)) + sctp_association_put(asoc); + } +} + +/* Decrease asoc's rwnd by len. */ +void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned len) +{ + int rx_count; + int over = 0; + + SCTP_ASSERT(asoc->rwnd, "rwnd zero", return); + SCTP_ASSERT(!asoc->rwnd_over, "rwnd_over not zero", return); + + if (asoc->ep->rcvbuf_policy) + rx_count = atomic_read(&asoc->rmem_alloc); + else + rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc); + + /* If we've reached or overflowed our receive buffer, announce + * a 0 rwnd if rwnd would still be positive. Store the + * the pottential pressure overflow so that the window can be restored + * back to original value. + */ + if (rx_count >= asoc->base.sk->sk_rcvbuf) + over = 1; + + if (asoc->rwnd >= len) { + asoc->rwnd -= len; + if (over) { + asoc->rwnd_press += asoc->rwnd; + asoc->rwnd = 0; + } + } else { + asoc->rwnd_over = len - asoc->rwnd; + asoc->rwnd = 0; + } + SCTP_DEBUG_PRINTK("%s: asoc %p rwnd decreased by %d to (%u, %u, %u)\n", + __func__, asoc, len, asoc->rwnd, + asoc->rwnd_over, asoc->rwnd_press); +} + +/* Build the bind address list for the association based on info from the + * local endpoint and the remote peer. + */ +int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, + sctp_scope_t scope, gfp_t gfp) +{ + int flags; + + /* Use scoping rules to determine the subset of addresses from + * the endpoint. + */ + flags = (PF_INET6 == asoc->base.sk->sk_family) ? SCTP_ADDR6_ALLOWED : 0; + if (asoc->peer.ipv4_address) + flags |= SCTP_ADDR4_PEERSUPP; + if (asoc->peer.ipv6_address) + flags |= SCTP_ADDR6_PEERSUPP; + + return sctp_bind_addr_copy(&asoc->base.bind_addr, + &asoc->ep->base.bind_addr, + scope, gfp, flags); +} + +/* Build the association's bind address list from the cookie. 
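The rwnd decrease path above keeps two extra counters: rwnd_over records how far an arriving chunk overshot the advertised window, and rwnd_press parks whatever window is left when the receive buffer itself is full, so the increase path can hand it back gradually. A compressed standalone sketch of that bookkeeping (field names reused only for readability; the increase path and the assertions are left out):

#include <stdio.h>

struct rwnd_state {
        unsigned int rwnd;              /* window we could advertise      */
        unsigned int rwnd_over;         /* bytes received beyond the window */
        unsigned int rwnd_press;        /* window parked under buffer pressure */
};

static void rwnd_decrease(struct rwnd_state *s, unsigned int len, int rcvbuf_full)
{
        if (s->rwnd >= len) {
                s->rwnd -= len;
                if (rcvbuf_full) {      /* announce 0 and remember what we gave up */
                        s->rwnd_press += s->rwnd;
                        s->rwnd = 0;
                }
        } else {                        /* overshoot: remember by how much */
                s->rwnd_over = len - s->rwnd;
                s->rwnd = 0;
        }
}

int main(void)
{
        struct rwnd_state s = { 4000, 0, 0 };

        rwnd_decrease(&s, 1000, 0);
        printf("%u %u %u\n", s.rwnd, s.rwnd_over, s.rwnd_press);  /* 3000 0 0 */
        rwnd_decrease(&s, 1000, 1);
        printf("%u %u %u\n", s.rwnd, s.rwnd_over, s.rwnd_press);  /* 0 0 2000 */
        return 0;
}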
*/ +int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc, + struct sctp_cookie *cookie, + gfp_t gfp) +{ + int var_size2 = ntohs(cookie->peer_init->chunk_hdr.length); + int var_size3 = cookie->raw_addr_list_len; + __u8 *raw = (__u8 *)cookie->peer_init + var_size2; + + return sctp_raw_to_bind_addrs(&asoc->base.bind_addr, raw, var_size3, + asoc->ep->base.bind_addr.port, gfp); +} + +/* Lookup laddr in the bind address list of an association. */ +int sctp_assoc_lookup_laddr(struct sctp_association *asoc, + const union sctp_addr *laddr) +{ + int found = 0; + + if ((asoc->base.bind_addr.port == ntohs(laddr->v4.sin_port)) && + sctp_bind_addr_match(&asoc->base.bind_addr, laddr, + sctp_sk(asoc->base.sk))) + found = 1; + + return found; +} + +/* Set an association id for a given association */ +int sctp_assoc_set_id(struct sctp_association *asoc, gfp_t gfp) +{ + int assoc_id; + int error = 0; + + /* If the id is already assigned, keep it. */ + if (asoc->assoc_id) + return error; +retry: + if (unlikely(!idr_pre_get(&sctp_assocs_id, gfp))) + return -ENOMEM; + + spin_lock_bh(&sctp_assocs_id_lock); + error = idr_get_new_above(&sctp_assocs_id, (void *)asoc, + idr_low, &assoc_id); + if (!error) { + idr_low = assoc_id + 1; + if (idr_low == INT_MAX) + idr_low = 1; + } + spin_unlock_bh(&sctp_assocs_id_lock); + if (error == -EAGAIN) + goto retry; + else if (error) + return error; + + asoc->assoc_id = (sctp_assoc_t) assoc_id; + return error; +} + +/* Free the ASCONF queue */ +static void sctp_assoc_free_asconf_queue(struct sctp_association *asoc) +{ + struct sctp_chunk *asconf; + struct sctp_chunk *tmp; + + list_for_each_entry_safe(asconf, tmp, &asoc->addip_chunk_list, list) { + list_del_init(&asconf->list); + sctp_chunk_free(asconf); + } +} + +/* Free asconf_ack cache */ +static void sctp_assoc_free_asconf_acks(struct sctp_association *asoc) +{ + struct sctp_chunk *ack; + struct sctp_chunk *tmp; + + list_for_each_entry_safe(ack, tmp, &asoc->asconf_ack_list, + transmitted_list) { + list_del_init(&ack->transmitted_list); + sctp_chunk_free(ack); + } +} + +/* Clean up the ASCONF_ACK queue */ +void sctp_assoc_clean_asconf_ack_cache(const struct sctp_association *asoc) +{ + struct sctp_chunk *ack; + struct sctp_chunk *tmp; + + /* We can remove all the entries from the queue up to + * the "Peer-Sequence-Number". + */ + list_for_each_entry_safe(ack, tmp, &asoc->asconf_ack_list, + transmitted_list) { + if (ack->subh.addip_hdr->serial == + htonl(asoc->peer.addip_serial)) + break; + + list_del_init(&ack->transmitted_list); + sctp_chunk_free(ack); + } +} + +/* Find the ASCONF_ACK whose serial number matches ASCONF */ +struct sctp_chunk *sctp_assoc_lookup_asconf_ack( + const struct sctp_association *asoc, + __be32 serial) +{ + struct sctp_chunk *ack; + + /* Walk through the list of cached ASCONF-ACKs and find the + * ack chunk whose serial number matches that of the request. + */ + list_for_each_entry(ack, &asoc->asconf_ack_list, transmitted_list) { + if (ack->subh.addip_hdr->serial == serial) { + sctp_chunk_hold(ack); + return ack; + } + } + + return NULL; +} + +void sctp_asconf_queue_teardown(struct sctp_association *asoc) +{ + /* Free any cached ASCONF_ACK chunk. */ + sctp_assoc_free_asconf_acks(asoc); + + /* Free the ASCONF queue. */ + sctp_assoc_free_asconf_queue(asoc); + + /* Free any cached ASCONF chunk. 
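In sctp_assoc_set_id() above the idr library does the hard part, returning some free id at or above idr_low; the SCTP-specific piece is only the low-water counter that discourages quick re-use of association ids. The sketch below mimics that counter in isolation and deliberately ignores collision handling, which in the real code is the idr's job.

#include <stdio.h>
#include <limits.h>

static int id_low = 1;  /* next id we would like to hand out; 0 is never used */

/* Toy allocator: hand out id_low and advance it, wrapping back to 1 before
 * reaching INT_MAX.  The in-kernel code additionally asks the idr for
 * "some free id >= id_low", so ids still in use are never handed out twice. */
static int alloc_assoc_id(void)
{
        int id = id_low++;

        if (id_low == INT_MAX)
                id_low = 1;
        return id;
}

int main(void)
{
        int a = alloc_assoc_id();
        int b = alloc_assoc_id();

        printf("%d %d\n", a, b);        /* 1 2 */
        return 0;
}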
*/ + if (asoc->addip_last_asconf) + sctp_chunk_free(asoc->addip_last_asconf); +} diff --git a/net/sctp/auth.c b/net/sctp/auth.c new file mode 100644 index 00000000..865e68fe --- /dev/null +++ b/net/sctp/auth.c @@ -0,0 +1,950 @@ +/* SCTP kernel implementation + * (C) Copyright 2007 Hewlett-Packard Development Company, L.P. + * + * This file is part of the SCTP kernel implementation + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * Vlad Yasevich + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include +#include +#include +#include + +static struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = { + { + /* id 0 is reserved. as all 0 */ + .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_0, + }, + { + .hmac_id = SCTP_AUTH_HMAC_ID_SHA1, + .hmac_name="hmac(sha1)", + .hmac_len = SCTP_SHA1_SIG_SIZE, + }, + { + /* id 2 is reserved as well */ + .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_2, + }, +#if defined (CONFIG_CRYPTO_SHA256) || defined (CONFIG_CRYPTO_SHA256_MODULE) + { + .hmac_id = SCTP_AUTH_HMAC_ID_SHA256, + .hmac_name="hmac(sha256)", + .hmac_len = SCTP_SHA256_SIG_SIZE, + } +#endif +}; + + +void sctp_auth_key_put(struct sctp_auth_bytes *key) +{ + if (!key) + return; + + if (atomic_dec_and_test(&key->refcnt)) { + kfree(key); + SCTP_DBG_OBJCNT_DEC(keys); + } +} + +/* Create a new key structure of a given length */ +static struct sctp_auth_bytes *sctp_auth_create_key(__u32 key_len, gfp_t gfp) +{ + struct sctp_auth_bytes *key; + + /* Verify that we are not going to overflow INT_MAX */ + if ((INT_MAX - key_len) < sizeof(struct sctp_auth_bytes)) + return NULL; + + /* Allocate the shared key */ + key = kmalloc(sizeof(struct sctp_auth_bytes) + key_len, gfp); + if (!key) + return NULL; + + key->len = key_len; + atomic_set(&key->refcnt, 1); + SCTP_DBG_OBJCNT_INC(keys); + + return key; +} + +/* Create a new shared key container with a give key id */ +struct sctp_shared_key *sctp_auth_shkey_create(__u16 key_id, gfp_t gfp) +{ + struct sctp_shared_key *new; + + /* Allocate the shared key container */ + new = kzalloc(sizeof(struct sctp_shared_key), gfp); + if (!new) + return NULL; + + INIT_LIST_HEAD(&new->key_list); + new->key_id = key_id; + + return new; +} + +/* Free the shared key structure */ +static void sctp_auth_shkey_free(struct sctp_shared_key *sh_key) +{ + BUG_ON(!list_empty(&sh_key->key_list)); + sctp_auth_key_put(sh_key->key); + sh_key->key = NULL; + kfree(sh_key); +} + +/* Destroy the entire key list. 
This is done during the + * associon and endpoint free process. + */ +void sctp_auth_destroy_keys(struct list_head *keys) +{ + struct sctp_shared_key *ep_key; + struct sctp_shared_key *tmp; + + if (list_empty(keys)) + return; + + key_for_each_safe(ep_key, tmp, keys) { + list_del_init(&ep_key->key_list); + sctp_auth_shkey_free(ep_key); + } +} + +/* Compare two byte vectors as numbers. Return values + * are: + * 0 - vectors are equal + * < 0 - vector 1 is smaller than vector2 + * > 0 - vector 1 is greater than vector2 + * + * Algorithm is: + * This is performed by selecting the numerically smaller key vector... + * If the key vectors are equal as numbers but differ in length ... + * the shorter vector is considered smaller + * + * Examples (with small values): + * 000123456789 > 123456789 (first number is longer) + * 000123456789 < 234567891 (second number is larger numerically) + * 123456789 > 2345678 (first number is both larger & longer) + */ +static int sctp_auth_compare_vectors(struct sctp_auth_bytes *vector1, + struct sctp_auth_bytes *vector2) +{ + int diff; + int i; + const __u8 *longer; + + diff = vector1->len - vector2->len; + if (diff) { + longer = (diff > 0) ? vector1->data : vector2->data; + + /* Check to see if the longer number is + * lead-zero padded. If it is not, it + * is automatically larger numerically. + */ + for (i = 0; i < abs(diff); i++ ) { + if (longer[i] != 0) + return diff; + } + } + + /* lengths are the same, compare numbers */ + return memcmp(vector1->data, vector2->data, vector1->len); +} + +/* + * Create a key vector as described in SCTP-AUTH, Section 6.1 + * The RANDOM parameter, the CHUNKS parameter and the HMAC-ALGO + * parameter sent by each endpoint are concatenated as byte vectors. + * These parameters include the parameter type, parameter length, and + * the parameter value, but padding is omitted; all padding MUST be + * removed from this concatenation before proceeding with further + * computation of keys. Parameters which were not sent are simply + * omitted from the concatenation process. The resulting two vectors + * are called the two key vectors. 
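The comparison rule documented above (treat the vectors as big-endian numbers; extra leading zeros do not make a vector larger, but on a numeric tie the shorter vector counts as smaller) can be exercised on plain byte arrays. The helper below is a standalone restatement of that rule for illustration, not the in-tree function itself:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/* Compare two big-endian byte vectors as numbers.
 * Returns <0 if a is smaller, >0 if a is larger, 0 if identical. */
static int cmp_vectors(const unsigned char *a, int alen,
                       const unsigned char *b, int blen)
{
        int diff = alen - blen;
        const unsigned char *longer = diff > 0 ? a : b;
        int i, r;

        /* A non-zero byte in the longer vector's extra leading part makes
         * it numerically larger; otherwise that part is just zero padding. */
        for (i = 0; i < abs(diff); i++)
                if (longer[i] != 0)
                        return diff;

        /* Same significant length: compare the overlapping low-order bytes,
         * and on a numeric tie treat the shorter vector as the smaller one. */
        r = memcmp(a + (diff > 0 ? diff : 0),
                   b + (diff < 0 ? -diff : 0),
                   alen < blen ? alen : blen);
        return r ? r : diff;
}

int main(void)
{
        unsigned char x[] = { 0x00, 0x00, 0x12, 0x34 };
        unsigned char y[] = { 0x12, 0x34 };
        unsigned char z[] = { 0x23, 0x45 };

        printf("%d\n", cmp_vectors(x, 4, y, 2) > 0);  /* 1: longer zero-padded vector wins the tie */
        printf("%d\n", cmp_vectors(x, 4, z, 2) < 0);  /* 1: 0x1234 < 0x2345 despite the padding */
        return 0;
}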
+ */ +static struct sctp_auth_bytes *sctp_auth_make_key_vector( + sctp_random_param_t *random, + sctp_chunks_param_t *chunks, + sctp_hmac_algo_param_t *hmacs, + gfp_t gfp) +{ + struct sctp_auth_bytes *new; + __u32 len; + __u32 offset = 0; + + len = ntohs(random->param_hdr.length) + ntohs(hmacs->param_hdr.length); + if (chunks) + len += ntohs(chunks->param_hdr.length); + + new = kmalloc(sizeof(struct sctp_auth_bytes) + len, gfp); + if (!new) + return NULL; + + new->len = len; + + memcpy(new->data, random, ntohs(random->param_hdr.length)); + offset += ntohs(random->param_hdr.length); + + if (chunks) { + memcpy(new->data + offset, chunks, + ntohs(chunks->param_hdr.length)); + offset += ntohs(chunks->param_hdr.length); + } + + memcpy(new->data + offset, hmacs, ntohs(hmacs->param_hdr.length)); + + return new; +} + + +/* Make a key vector based on our local parameters */ +static struct sctp_auth_bytes *sctp_auth_make_local_vector( + const struct sctp_association *asoc, + gfp_t gfp) +{ + return sctp_auth_make_key_vector( + (sctp_random_param_t*)asoc->c.auth_random, + (sctp_chunks_param_t*)asoc->c.auth_chunks, + (sctp_hmac_algo_param_t*)asoc->c.auth_hmacs, + gfp); +} + +/* Make a key vector based on peer's parameters */ +static struct sctp_auth_bytes *sctp_auth_make_peer_vector( + const struct sctp_association *asoc, + gfp_t gfp) +{ + return sctp_auth_make_key_vector(asoc->peer.peer_random, + asoc->peer.peer_chunks, + asoc->peer.peer_hmacs, + gfp); +} + + +/* Set the value of the association shared key base on the parameters + * given. The algorithm is: + * From the endpoint pair shared keys and the key vectors the + * association shared keys are computed. This is performed by selecting + * the numerically smaller key vector and concatenating it to the + * endpoint pair shared key, and then concatenating the numerically + * larger key vector to that. The result of the concatenation is the + * association shared key. + */ +static struct sctp_auth_bytes *sctp_auth_asoc_set_secret( + struct sctp_shared_key *ep_key, + struct sctp_auth_bytes *first_vector, + struct sctp_auth_bytes *last_vector, + gfp_t gfp) +{ + struct sctp_auth_bytes *secret; + __u32 offset = 0; + __u32 auth_len; + + auth_len = first_vector->len + last_vector->len; + if (ep_key->key) + auth_len += ep_key->key->len; + + secret = sctp_auth_create_key(auth_len, gfp); + if (!secret) + return NULL; + + if (ep_key->key) { + memcpy(secret->data, ep_key->key->data, ep_key->key->len); + offset += ep_key->key->len; + } + + memcpy(secret->data + offset, first_vector->data, first_vector->len); + offset += first_vector->len; + + memcpy(secret->data + offset, last_vector->data, last_vector->len); + + return secret; +} + +/* Create an association shared key. Follow the algorithm + * described in SCTP-AUTH, Section 6.1 + */ +static struct sctp_auth_bytes *sctp_auth_asoc_create_secret( + const struct sctp_association *asoc, + struct sctp_shared_key *ep_key, + gfp_t gfp) +{ + struct sctp_auth_bytes *local_key_vector; + struct sctp_auth_bytes *peer_key_vector; + struct sctp_auth_bytes *first_vector, + *last_vector; + struct sctp_auth_bytes *secret = NULL; + int cmp; + + + /* Now we need to build the key vectors + * SCTP-AUTH , Section 6.1 + * The RANDOM parameter, the CHUNKS parameter and the HMAC-ALGO + * parameter sent by each endpoint are concatenated as byte vectors. 
+ * These parameters include the parameter type, parameter length, and + * the parameter value, but padding is omitted; all padding MUST be + * removed from this concatenation before proceeding with further + * computation of keys. Parameters which were not sent are simply + * omitted from the concatenation process. The resulting two vectors + * are called the two key vectors. + */ + + local_key_vector = sctp_auth_make_local_vector(asoc, gfp); + peer_key_vector = sctp_auth_make_peer_vector(asoc, gfp); + + if (!peer_key_vector || !local_key_vector) + goto out; + + /* Figure out the order in which the key_vectors will be + * added to the endpoint shared key. + * SCTP-AUTH, Section 6.1: + * This is performed by selecting the numerically smaller key + * vector and concatenating it to the endpoint pair shared + * key, and then concatenating the numerically larger key + * vector to that. If the key vectors are equal as numbers + * but differ in length, then the concatenation order is the + * endpoint shared key, followed by the shorter key vector, + * followed by the longer key vector. Otherwise, the key + * vectors are identical, and may be concatenated to the + * endpoint pair key in any order. + */ + cmp = sctp_auth_compare_vectors(local_key_vector, + peer_key_vector); + if (cmp < 0) { + first_vector = local_key_vector; + last_vector = peer_key_vector; + } else { + first_vector = peer_key_vector; + last_vector = local_key_vector; + } + + secret = sctp_auth_asoc_set_secret(ep_key, first_vector, last_vector, + gfp); +out: + kfree(local_key_vector); + kfree(peer_key_vector); + + return secret; +} + +/* + * Populate the association overlay list with the list + * from the endpoint. + */ +int sctp_auth_asoc_copy_shkeys(const struct sctp_endpoint *ep, + struct sctp_association *asoc, + gfp_t gfp) +{ + struct sctp_shared_key *sh_key; + struct sctp_shared_key *new; + + BUG_ON(!list_empty(&asoc->endpoint_shared_keys)); + + key_for_each(sh_key, &ep->endpoint_shared_keys) { + new = sctp_auth_shkey_create(sh_key->key_id, gfp); + if (!new) + goto nomem; + + new->key = sh_key->key; + sctp_auth_key_hold(new->key); + list_add(&new->key_list, &asoc->endpoint_shared_keys); + } + + return 0; + +nomem: + sctp_auth_destroy_keys(&asoc->endpoint_shared_keys); + return -ENOMEM; +} + + +/* Public interface to creat the association shared key. + * See code above for the algorithm. + */ +int sctp_auth_asoc_init_active_key(struct sctp_association *asoc, gfp_t gfp) +{ + struct sctp_auth_bytes *secret; + struct sctp_shared_key *ep_key; + + /* If we don't support AUTH, or peer is not capable + * we don't need to do anything. + */ + if (!sctp_auth_enable || !asoc->peer.auth_capable) + return 0; + + /* If the key_id is non-zero and we couldn't find an + * endpoint pair shared key, we can't compute the + * secret. + * For key_id 0, endpoint pair shared key is a NULL key. 
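Taken together, sctp_auth_asoc_set_secret() and sctp_auth_asoc_create_secret() above amount to one concatenation: endpoint pair key, then the numerically smaller key vector, then the larger one. The standalone sketch below performs only that concatenation, leaving the ordering decision and error handling to the caller; the byte strings in main() are placeholders.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* association shared key = endpoint pair key || smaller vector || larger vector */
static unsigned char *make_secret(const unsigned char *ep_key, size_t ep_len,
                                  const unsigned char *small, size_t small_len,
                                  const unsigned char *large, size_t large_len,
                                  size_t *out_len)
{
        unsigned char *secret;
        size_t off = 0;

        *out_len = ep_len + small_len + large_len;
        secret = malloc(*out_len);
        if (!secret)
                return NULL;

        memcpy(secret + off, ep_key, ep_len);   off += ep_len;
        memcpy(secret + off, small, small_len); off += small_len;
        memcpy(secret + off, large, large_len);
        return secret;
}

int main(void)
{
        unsigned char ep[] = "pair-key", v1[] = "AAA", v2[] = "BBB";
        size_t len;
        unsigned char *s = make_secret(ep, 8, v1, 3, v2, 3, &len);

        if (s) {
                printf("secret is %zu bytes\n", len);   /* 14 */
                free(s);
        }
        return 0;
}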
+ */ + ep_key = sctp_auth_get_shkey(asoc, asoc->active_key_id); + BUG_ON(!ep_key); + + secret = sctp_auth_asoc_create_secret(asoc, ep_key, gfp); + if (!secret) + return -ENOMEM; + + sctp_auth_key_put(asoc->asoc_shared_key); + asoc->asoc_shared_key = secret; + + return 0; +} + + +/* Find the endpoint pair shared key based on the key_id */ +struct sctp_shared_key *sctp_auth_get_shkey( + const struct sctp_association *asoc, + __u16 key_id) +{ + struct sctp_shared_key *key; + + /* First search associations set of endpoint pair shared keys */ + key_for_each(key, &asoc->endpoint_shared_keys) { + if (key->key_id == key_id) + return key; + } + + return NULL; +} + +/* + * Initialize all the possible digest transforms that we can use. Right now + * now, the supported digests are SHA1 and SHA256. We do this here once + * because of the restrictiong that transforms may only be allocated in + * user context. This forces us to pre-allocated all possible transforms + * at the endpoint init time. + */ +int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp) +{ + struct crypto_hash *tfm = NULL; + __u16 id; + + /* if the transforms are already allocted, we are done */ + if (!sctp_auth_enable) { + ep->auth_hmacs = NULL; + return 0; + } + + if (ep->auth_hmacs) + return 0; + + /* Allocated the array of pointers to transorms */ + ep->auth_hmacs = kzalloc( + sizeof(struct crypto_hash *) * SCTP_AUTH_NUM_HMACS, + gfp); + if (!ep->auth_hmacs) + return -ENOMEM; + + for (id = 0; id < SCTP_AUTH_NUM_HMACS; id++) { + + /* See is we support the id. Supported IDs have name and + * length fields set, so that we can allocated and use + * them. We can safely just check for name, for without the + * name, we can't allocate the TFM. + */ + if (!sctp_hmac_list[id].hmac_name) + continue; + + /* If this TFM has been allocated, we are all set */ + if (ep->auth_hmacs[id]) + continue; + + /* Allocate the ID */ + tfm = crypto_alloc_hash(sctp_hmac_list[id].hmac_name, 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + goto out_err; + + ep->auth_hmacs[id] = tfm; + } + + return 0; + +out_err: + /* Clean up any successful allocations */ + sctp_auth_destroy_hmacs(ep->auth_hmacs); + return -ENOMEM; +} + +/* Destroy the hmac tfm array */ +void sctp_auth_destroy_hmacs(struct crypto_hash *auth_hmacs[]) +{ + int i; + + if (!auth_hmacs) + return; + + for (i = 0; i < SCTP_AUTH_NUM_HMACS; i++) + { + if (auth_hmacs[i]) + crypto_free_hash(auth_hmacs[i]); + } + kfree(auth_hmacs); +} + + +struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id) +{ + return &sctp_hmac_list[hmac_id]; +} + +/* Get an hmac description information that we can use to build + * the AUTH chunk + */ +struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc) +{ + struct sctp_hmac_algo_param *hmacs; + __u16 n_elt; + __u16 id = 0; + int i; + + /* If we have a default entry, use it */ + if (asoc->default_hmac_id) + return &sctp_hmac_list[asoc->default_hmac_id]; + + /* Since we do not have a default entry, find the first entry + * we support and return that. Do not cache that id. + */ + hmacs = asoc->peer.peer_hmacs; + if (!hmacs) + return NULL; + + n_elt = (ntohs(hmacs->param_hdr.length) - sizeof(sctp_paramhdr_t)) >> 1; + for (i = 0; i < n_elt; i++) { + id = ntohs(hmacs->hmac_ids[i]); + + /* Check the id is in the supported range */ + if (id > SCTP_AUTH_HMAC_ID_MAX) { + id = 0; + continue; + } + + /* See is we support the id. Supported IDs have name and + * length fields set, so that we can allocated and use + * them. 
We can safely just check for name, for without the + * name, we can't allocate the TFM. + */ + if (!sctp_hmac_list[id].hmac_name) { + id = 0; + continue; + } + + break; + } + + if (id == 0) + return NULL; + + return &sctp_hmac_list[id]; +} + +static int __sctp_auth_find_hmacid(__be16 *hmacs, int n_elts, __be16 hmac_id) +{ + int found = 0; + int i; + + for (i = 0; i < n_elts; i++) { + if (hmac_id == hmacs[i]) { + found = 1; + break; + } + } + + return found; +} + +/* See if the HMAC_ID is one that we claim as supported */ +int sctp_auth_asoc_verify_hmac_id(const struct sctp_association *asoc, + __be16 hmac_id) +{ + struct sctp_hmac_algo_param *hmacs; + __u16 n_elt; + + if (!asoc) + return 0; + + hmacs = (struct sctp_hmac_algo_param *)asoc->c.auth_hmacs; + n_elt = (ntohs(hmacs->param_hdr.length) - sizeof(sctp_paramhdr_t)) >> 1; + + return __sctp_auth_find_hmacid(hmacs->hmac_ids, n_elt, hmac_id); +} + + +/* Cache the default HMAC id. This to follow this text from SCTP-AUTH: + * Section 6.1: + * The receiver of a HMAC-ALGO parameter SHOULD use the first listed + * algorithm it supports. + */ +void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc, + struct sctp_hmac_algo_param *hmacs) +{ + struct sctp_endpoint *ep; + __u16 id; + int i; + int n_params; + + /* if the default id is already set, use it */ + if (asoc->default_hmac_id) + return; + + n_params = (ntohs(hmacs->param_hdr.length) + - sizeof(sctp_paramhdr_t)) >> 1; + ep = asoc->ep; + for (i = 0; i < n_params; i++) { + id = ntohs(hmacs->hmac_ids[i]); + + /* Check the id is in the supported range */ + if (id > SCTP_AUTH_HMAC_ID_MAX) + continue; + + /* If this TFM has been allocated, use this id */ + if (ep->auth_hmacs[id]) { + asoc->default_hmac_id = id; + break; + } + } +} + + +/* Check to see if the given chunk is supposed to be authenticated */ +static int __sctp_auth_cid(sctp_cid_t chunk, struct sctp_chunks_param *param) +{ + unsigned short len; + int found = 0; + int i; + + if (!param || param->param_hdr.length == 0) + return 0; + + len = ntohs(param->param_hdr.length) - sizeof(sctp_paramhdr_t); + + /* SCTP-AUTH, Section 3.2 + * The chunk types for INIT, INIT-ACK, SHUTDOWN-COMPLETE and AUTH + * chunks MUST NOT be listed in the CHUNKS parameter. However, if + * a CHUNKS parameter is received then the types for INIT, INIT-ACK, + * SHUTDOWN-COMPLETE and AUTH chunks MUST be ignored. + */ + for (i = 0; !found && i < len; i++) { + switch (param->chunks[i]) { + case SCTP_CID_INIT: + case SCTP_CID_INIT_ACK: + case SCTP_CID_SHUTDOWN_COMPLETE: + case SCTP_CID_AUTH: + break; + + default: + if (param->chunks[i] == chunk) + found = 1; + break; + } + } + + return found; +} + +/* Check if peer requested that this chunk is authenticated */ +int sctp_auth_send_cid(sctp_cid_t chunk, const struct sctp_association *asoc) +{ + if (!sctp_auth_enable || !asoc || !asoc->peer.auth_capable) + return 0; + + return __sctp_auth_cid(chunk, asoc->peer.peer_chunks); +} + +/* Check if we requested that peer authenticate this chunk. */ +int sctp_auth_recv_cid(sctp_cid_t chunk, const struct sctp_association *asoc) +{ + if (!sctp_auth_enable || !asoc) + return 0; + + return __sctp_auth_cid(chunk, + (struct sctp_chunks_param *)asoc->c.auth_chunks); +} + +/* SCTP-AUTH: Section 6.2: + * The sender MUST calculate the MAC as described in RFC2104 [2] using + * the hash function H as described by the MAC Identifier and the shared + * association key K based on the endpoint pair shared key described by + * the shared key identifier. 
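Both sctp_auth_asoc_get_hmac() and sctp_auth_asoc_set_default_hmac() scan the peer's HMAC-ALGO parameter and settle on the first identifier the local side implements; the element count is (parameter length minus header length) / 2 because each identifier is two bytes on the wire. A stand-alone sketch of that selection, with assumed identifier values (1 for SHA-1, 3 for SHA-256, as in RFC 4895) and a made-up local support table:

#include <stdint.h>
#include <stddef.h>

#define HMAC_ID_MAX 3   /* assumption for the sketch: 1 = SHA-1, 3 = SHA-256 */

/* NULL means "not implemented locally" (mirrors the hmac_name check). */
static const char *const hmac_name[HMAC_ID_MAX + 1] = {
	NULL, "sha1", NULL, "sha256",
};

/* Return the first peer-listed identifier we support, or 0 for none. */
static uint16_t pick_hmac(const uint16_t *ids, size_t n_ids)
{
	size_t i;

	for (i = 0; i < n_ids; i++) {
		uint16_t id = ids[i];

		if (id > HMAC_ID_MAX || !hmac_name[id])
			continue;       /* out of range or unsupported */
		return id;              /* first usable entry wins */
	}
	return 0;
}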
The 'data' used for the computation of + * the AUTH-chunk is given by the AUTH chunk with its HMAC field set to + * zero (as shown in Figure 6) followed by all chunks that are placed + * after the AUTH chunk in the SCTP packet. + */ +void sctp_auth_calculate_hmac(const struct sctp_association *asoc, + struct sk_buff *skb, + struct sctp_auth_chunk *auth, + gfp_t gfp) +{ + struct scatterlist sg; + struct hash_desc desc; + struct sctp_auth_bytes *asoc_key; + __u16 key_id, hmac_id; + __u8 *digest; + unsigned char *end; + int free_key = 0; + + /* Extract the info we need: + * - hmac id + * - key id + */ + key_id = ntohs(auth->auth_hdr.shkey_id); + hmac_id = ntohs(auth->auth_hdr.hmac_id); + + if (key_id == asoc->active_key_id) + asoc_key = asoc->asoc_shared_key; + else { + struct sctp_shared_key *ep_key; + + ep_key = sctp_auth_get_shkey(asoc, key_id); + if (!ep_key) + return; + + asoc_key = sctp_auth_asoc_create_secret(asoc, ep_key, gfp); + if (!asoc_key) + return; + + free_key = 1; + } + + /* set up scatter list */ + end = skb_tail_pointer(skb); + sg_init_one(&sg, auth, end - (unsigned char *)auth); + + desc.tfm = asoc->ep->auth_hmacs[hmac_id]; + desc.flags = 0; + + digest = auth->auth_hdr.hmac; + if (crypto_hash_setkey(desc.tfm, &asoc_key->data[0], asoc_key->len)) + goto free; + + crypto_hash_digest(&desc, &sg, sg.length, digest); + +free: + if (free_key) + sctp_auth_key_put(asoc_key); +} + +/* API Helpers */ + +/* Add a chunk to the endpoint authenticated chunk list */ +int sctp_auth_ep_add_chunkid(struct sctp_endpoint *ep, __u8 chunk_id) +{ + struct sctp_chunks_param *p = ep->auth_chunk_list; + __u16 nchunks; + __u16 param_len; + + /* If this chunk is already specified, we are done */ + if (__sctp_auth_cid(chunk_id, p)) + return 0; + + /* Check if we can add this chunk to the array */ + param_len = ntohs(p->param_hdr.length); + nchunks = param_len - sizeof(sctp_paramhdr_t); + if (nchunks == SCTP_NUM_CHUNK_TYPES) + return -EINVAL; + + p->chunks[nchunks] = chunk_id; + p->param_hdr.length = htons(param_len + 1); + return 0; +} + +/* Add hmac identifires to the endpoint list of supported hmac ids */ +int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep, + struct sctp_hmacalgo *hmacs) +{ + int has_sha1 = 0; + __u16 id; + int i; + + /* Scan the list looking for unsupported id. Also make sure that + * SHA1 is specified. + */ + for (i = 0; i < hmacs->shmac_num_idents; i++) { + id = hmacs->shmac_idents[i]; + + if (id > SCTP_AUTH_HMAC_ID_MAX) + return -EOPNOTSUPP; + + if (SCTP_AUTH_HMAC_ID_SHA1 == id) + has_sha1 = 1; + + if (!sctp_hmac_list[id].hmac_name) + return -EOPNOTSUPP; + } + + if (!has_sha1) + return -EINVAL; + + memcpy(ep->auth_hmacs_list->hmac_ids, &hmacs->shmac_idents[0], + hmacs->shmac_num_idents * sizeof(__u16)); + ep->auth_hmacs_list->param_hdr.length = htons(sizeof(sctp_paramhdr_t) + + hmacs->shmac_num_idents * sizeof(__u16)); + return 0; +} + +/* Set a new shared key on either endpoint or association. If the + * the key with a same ID already exists, replace the key (remove the + * old key and add a new one). 
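sctp_auth_calculate_hmac() above keys the chosen transform with the association shared key and digests everything from the start of the AUTH chunk (with its HMAC field zeroed) to the end of the packet, then the digest is written into that field. Outside the kernel crypto API, the same shape of computation can be sketched with OpenSSL's one-shot HMAC(); the buffer-layout arguments here are hypothetical, not taken from the patch:

#include <string.h>
#include <openssl/evp.h>
#include <openssl/hmac.h>

/* Sketch only: MAC over "AUTH chunk with zeroed HMAC field + all chunks
 * that follow it".  key/keylen stand in for the association shared key;
 * buf/len cover the AUTH chunk through the end of the packet; hmac_off
 * and hmac_len locate the HMAC field inside buf. */
static void sketch_auth_mac(const unsigned char *key, size_t keylen,
			    unsigned char *buf, size_t len,
			    size_t hmac_off, size_t hmac_len,
			    unsigned char *digest, unsigned int *dlen)
{
	memset(buf + hmac_off, 0, hmac_len);      /* HMAC field must be zero */
	HMAC(EVP_sha1(), key, (int)keylen, buf, len, digest, dlen);
	/* The sender then copies the digest back into the HMAC field. */
}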
+ */ +int sctp_auth_set_key(struct sctp_endpoint *ep, + struct sctp_association *asoc, + struct sctp_authkey *auth_key) +{ + struct sctp_shared_key *cur_key = NULL; + struct sctp_auth_bytes *key; + struct list_head *sh_keys; + int replace = 0; + + /* Try to find the given key id to see if + * we are doing a replace, or adding a new key + */ + if (asoc) + sh_keys = &asoc->endpoint_shared_keys; + else + sh_keys = &ep->endpoint_shared_keys; + + key_for_each(cur_key, sh_keys) { + if (cur_key->key_id == auth_key->sca_keynumber) { + replace = 1; + break; + } + } + + /* If we are not replacing a key id, we need to allocate + * a shared key. + */ + if (!replace) { + cur_key = sctp_auth_shkey_create(auth_key->sca_keynumber, + GFP_KERNEL); + if (!cur_key) + return -ENOMEM; + } + + /* Create a new key data based on the info passed in */ + key = sctp_auth_create_key(auth_key->sca_keylength, GFP_KERNEL); + if (!key) + goto nomem; + + memcpy(key->data, &auth_key->sca_key[0], auth_key->sca_keylength); + + /* If we are replacing, remove the old keys data from the + * key id. If we are adding new key id, add it to the + * list. + */ + if (replace) + sctp_auth_key_put(cur_key->key); + else + list_add(&cur_key->key_list, sh_keys); + + cur_key->key = key; + sctp_auth_key_hold(key); + + return 0; +nomem: + if (!replace) + sctp_auth_shkey_free(cur_key); + + return -ENOMEM; +} + +int sctp_auth_set_active_key(struct sctp_endpoint *ep, + struct sctp_association *asoc, + __u16 key_id) +{ + struct sctp_shared_key *key; + struct list_head *sh_keys; + int found = 0; + + /* The key identifier MUST correst to an existing key */ + if (asoc) + sh_keys = &asoc->endpoint_shared_keys; + else + sh_keys = &ep->endpoint_shared_keys; + + key_for_each(key, sh_keys) { + if (key->key_id == key_id) { + found = 1; + break; + } + } + + if (!found) + return -EINVAL; + + if (asoc) { + asoc->active_key_id = key_id; + sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL); + } else + ep->active_key_id = key_id; + + return 0; +} + +int sctp_auth_del_key_id(struct sctp_endpoint *ep, + struct sctp_association *asoc, + __u16 key_id) +{ + struct sctp_shared_key *key; + struct list_head *sh_keys; + int found = 0; + + /* The key identifier MUST NOT be the current active key + * The key identifier MUST correst to an existing key + */ + if (asoc) { + if (asoc->active_key_id == key_id) + return -EINVAL; + + sh_keys = &asoc->endpoint_shared_keys; + } else { + if (ep->active_key_id == key_id) + return -EINVAL; + + sh_keys = &ep->endpoint_shared_keys; + } + + key_for_each(key, sh_keys) { + if (key->key_id == key_id) { + found = 1; + break; + } + } + + if (!found) + return -EINVAL; + + /* Delete the shared key */ + list_del_init(&key->key_list); + sctp_auth_shkey_free(key); + + return 0; +} diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c new file mode 100644 index 00000000..83e3011c --- /dev/null +++ b/net/sctp/bind_addr.c @@ -0,0 +1,551 @@ +/* SCTP kernel implementation + * (C) Copyright IBM Corp. 2001, 2003 + * Copyright (c) Cisco 1999,2000 + * Copyright (c) Motorola 1999,2000,2001 + * Copyright (c) La Monte H.P. Yarroll 2001 + * + * This file is part of the SCTP kernel implementation. + * + * A collection class to handle the storage of transport addresses. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. 
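sctp_auth_set_key(), whose body follows below, implements a replace-or-add rule keyed by the key number: if the identifier already exists only its key data is swapped out, otherwise a new shared-key entry is linked in; sctp_auth_del_key_id() additionally refuses to remove the currently active key. The same rule on a plain singly linked list (hypothetical user-space types, no reference counting, key data assumed non-empty):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct shkey {                          /* stand-in for sctp_shared_key */
	uint16_t key_id;
	size_t len;
	unsigned char *data;
	struct shkey *next;
};

/* Replace the data of key_id if present, otherwise prepend a new entry.
 * Returns 0 on success, -1 on allocation failure. */
static int set_key(struct shkey **head, uint16_t key_id,
		   const unsigned char *data, size_t len)
{
	unsigned char *copy = malloc(len);
	struct shkey *k;

	if (!copy)
		return -1;
	memcpy(copy, data, len);

	for (k = *head; k; k = k->next) {
		if (k->key_id != key_id)
			continue;
		free(k->data);          /* replace: drop only the old data */
		k->data = copy;
		k->len = len;
		return 0;
	}

	k = calloc(1, sizeof(*k));      /* add: fresh entry at the front */
	if (!k) {
		free(copy);
		return -1;
	}
	k->key_id = key_id;
	k->data = copy;
	k->len = len;
	k->next = *head;
	*head = k;
	return 0;
}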
+ * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Jon Grimm + * Daisy Chang + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Forward declarations for internal helpers. */ +static int sctp_copy_one_addr(struct sctp_bind_addr *, union sctp_addr *, + sctp_scope_t scope, gfp_t gfp, + int flags); +static void sctp_bind_addr_clean(struct sctp_bind_addr *); + +/* First Level Abstractions. */ + +/* Copy 'src' to 'dest' taking 'scope' into account. Omit addresses + * in 'src' which have a broader scope than 'scope'. + */ +int sctp_bind_addr_copy(struct sctp_bind_addr *dest, + const struct sctp_bind_addr *src, + sctp_scope_t scope, gfp_t gfp, + int flags) +{ + struct sctp_sockaddr_entry *addr; + int error = 0; + + /* All addresses share the same port. */ + dest->port = src->port; + + /* Extract the addresses which are relevant for this scope. */ + list_for_each_entry(addr, &src->address_list, list) { + error = sctp_copy_one_addr(dest, &addr->a, scope, + gfp, flags); + if (error < 0) + goto out; + } + + /* If there are no addresses matching the scope and + * this is global scope, try to get a link scope address, with + * the assumption that we must be sitting behind a NAT. + */ + if (list_empty(&dest->address_list) && (SCTP_SCOPE_GLOBAL == scope)) { + list_for_each_entry(addr, &src->address_list, list) { + error = sctp_copy_one_addr(dest, &addr->a, + SCTP_SCOPE_LINK, gfp, + flags); + if (error < 0) + goto out; + } + } + +out: + if (error) + sctp_bind_addr_clean(dest); + + return error; +} + +/* Exactly duplicate the address lists. This is necessary when doing + * peer-offs and accepts. We don't want to put all the current system + * addresses into the endpoint. That's useless. But we do want duplicat + * the list of bound addresses that the older endpoint used. + */ +int sctp_bind_addr_dup(struct sctp_bind_addr *dest, + const struct sctp_bind_addr *src, + gfp_t gfp) +{ + struct sctp_sockaddr_entry *addr; + int error = 0; + + /* All addresses share the same port. */ + dest->port = src->port; + + list_for_each_entry(addr, &src->address_list, list) { + error = sctp_add_bind_addr(dest, &addr->a, 1, gfp); + if (error < 0) + break; + } + + return error; +} + +/* Initialize the SCTP_bind_addr structure for either an endpoint or + * an association. + */ +void sctp_bind_addr_init(struct sctp_bind_addr *bp, __u16 port) +{ + bp->malloced = 0; + + INIT_LIST_HEAD(&bp->address_list); + bp->port = port; +} + +/* Dispose of the address list. */ +static void sctp_bind_addr_clean(struct sctp_bind_addr *bp) +{ + struct sctp_sockaddr_entry *addr, *temp; + + /* Empty the bind address list. 
*/ + list_for_each_entry_safe(addr, temp, &bp->address_list, list) { + list_del_rcu(&addr->list); + kfree_rcu(addr, rcu); + SCTP_DBG_OBJCNT_DEC(addr); + } +} + +/* Dispose of an SCTP_bind_addr structure */ +void sctp_bind_addr_free(struct sctp_bind_addr *bp) +{ + /* Empty the bind address list. */ + sctp_bind_addr_clean(bp); + + if (bp->malloced) { + kfree(bp); + SCTP_DBG_OBJCNT_DEC(bind_addr); + } +} + +/* Add an address to the bind address list in the SCTP_bind_addr structure. */ +int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new, + __u8 addr_state, gfp_t gfp) +{ + struct sctp_sockaddr_entry *addr; + + /* Add the address to the bind address list. */ + addr = t_new(struct sctp_sockaddr_entry, gfp); + if (!addr) + return -ENOMEM; + + memcpy(&addr->a, new, sizeof(*new)); + + /* Fix up the port if it has not yet been set. + * Both v4 and v6 have the port at the same offset. + */ + if (!addr->a.v4.sin_port) + addr->a.v4.sin_port = htons(bp->port); + + addr->state = addr_state; + addr->valid = 1; + + INIT_LIST_HEAD(&addr->list); + + /* We always hold a socket lock when calling this function, + * and that acts as a writer synchronizing lock. + */ + list_add_tail_rcu(&addr->list, &bp->address_list); + SCTP_DBG_OBJCNT_INC(addr); + + return 0; +} + +/* Delete an address from the bind address list in the SCTP_bind_addr + * structure. + */ +int sctp_del_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *del_addr) +{ + struct sctp_sockaddr_entry *addr, *temp; + int found = 0; + + /* We hold the socket lock when calling this function, + * and that acts as a writer synchronizing lock. + */ + list_for_each_entry_safe(addr, temp, &bp->address_list, list) { + if (sctp_cmp_addr_exact(&addr->a, del_addr)) { + /* Found the exact match. */ + found = 1; + addr->valid = 0; + list_del_rcu(&addr->list); + break; + } + } + + if (found) { + kfree_rcu(addr, rcu); + SCTP_DBG_OBJCNT_DEC(addr); + return 0; + } + + return -EINVAL; +} + +/* Create a network byte-order representation of all the addresses + * formated as SCTP parameters. + * + * The second argument is the return value for the length. + */ +union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp, + int *addrs_len, + gfp_t gfp) +{ + union sctp_params addrparms; + union sctp_params retval; + int addrparms_len; + union sctp_addr_param rawaddr; + int len; + struct sctp_sockaddr_entry *addr; + struct list_head *pos; + struct sctp_af *af; + + addrparms_len = 0; + len = 0; + + /* Allocate enough memory at once. */ + list_for_each(pos, &bp->address_list) { + len += sizeof(union sctp_addr_param); + } + + /* Don't even bother embedding an address if there + * is only one. + */ + if (len == sizeof(union sctp_addr_param)) { + retval.v = NULL; + goto end_raw; + } + + retval.v = kmalloc(len, gfp); + if (!retval.v) + goto end_raw; + + addrparms = retval; + + list_for_each_entry(addr, &bp->address_list, list) { + af = sctp_get_af_specific(addr->a.v4.sin_family); + len = af->to_addr_param(&addr->a, &rawaddr); + memcpy(addrparms.v, &rawaddr, len); + addrparms.v += len; + addrparms_len += len; + } + +end_raw: + *addrs_len = addrparms_len; + return retval; +} + +/* + * Create an address list out of the raw address list format (IPv4 and IPv6 + * address parameters). 
+ */ +int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list, + int addrs_len, __u16 port, gfp_t gfp) +{ + union sctp_addr_param *rawaddr; + struct sctp_paramhdr *param; + union sctp_addr addr; + int retval = 0; + int len; + struct sctp_af *af; + + /* Convert the raw address to standard address format */ + while (addrs_len) { + param = (struct sctp_paramhdr *)raw_addr_list; + rawaddr = (union sctp_addr_param *)raw_addr_list; + + af = sctp_get_af_specific(param_type2af(param->type)); + if (unlikely(!af)) { + retval = -EINVAL; + sctp_bind_addr_clean(bp); + break; + } + + af->from_addr_param(&addr, rawaddr, htons(port), 0); + retval = sctp_add_bind_addr(bp, &addr, SCTP_ADDR_SRC, gfp); + if (retval) { + /* Can't finish building the list, clean up. */ + sctp_bind_addr_clean(bp); + break; + } + + len = ntohs(param->length); + addrs_len -= len; + raw_addr_list += len; + } + + return retval; +} + +/******************************************************************** + * 2nd Level Abstractions + ********************************************************************/ + +/* Does this contain a specified address? Allow wildcarding. */ +int sctp_bind_addr_match(struct sctp_bind_addr *bp, + const union sctp_addr *addr, + struct sctp_sock *opt) +{ + struct sctp_sockaddr_entry *laddr; + int match = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(laddr, &bp->address_list, list) { + if (!laddr->valid) + continue; + if (opt->pf->cmp_addr(&laddr->a, addr, opt)) { + match = 1; + break; + } + } + rcu_read_unlock(); + + return match; +} + +/* Does the address 'addr' conflict with any addresses in + * the bp. + */ +int sctp_bind_addr_conflict(struct sctp_bind_addr *bp, + const union sctp_addr *addr, + struct sctp_sock *bp_sp, + struct sctp_sock *addr_sp) +{ + struct sctp_sockaddr_entry *laddr; + int conflict = 0; + struct sctp_sock *sp; + + /* Pick the IPv6 socket as the basis of comparison + * since it's usually a superset of the IPv4. + * If there is no IPv6 socket, then default to bind_addr. + */ + if (sctp_opt2sk(bp_sp)->sk_family == AF_INET6) + sp = bp_sp; + else if (sctp_opt2sk(addr_sp)->sk_family == AF_INET6) + sp = addr_sp; + else + sp = bp_sp; + + rcu_read_lock(); + list_for_each_entry_rcu(laddr, &bp->address_list, list) { + if (!laddr->valid) + continue; + + conflict = sp->pf->cmp_addr(&laddr->a, addr, sp); + if (conflict) + break; + } + rcu_read_unlock(); + + return conflict; +} + +/* Get the state of the entry in the bind_addr_list */ +int sctp_bind_addr_state(const struct sctp_bind_addr *bp, + const union sctp_addr *addr) +{ + struct sctp_sockaddr_entry *laddr; + struct sctp_af *af; + int state = -1; + + af = sctp_get_af_specific(addr->sa.sa_family); + if (unlikely(!af)) + return state; + + rcu_read_lock(); + list_for_each_entry_rcu(laddr, &bp->address_list, list) { + if (!laddr->valid) + continue; + if (af->cmp_addr(&laddr->a, addr)) { + state = laddr->state; + break; + } + } + rcu_read_unlock(); + + return state; +} + +/* Find the first address in the bind address list that is not present in + * the addrs packed array. + */ +union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr *bp, + const union sctp_addr *addrs, + int addrcnt, + struct sctp_sock *opt) +{ + struct sctp_sockaddr_entry *laddr; + union sctp_addr *addr; + void *addr_buf; + struct sctp_af *af; + int i; + + /* This is only called sctp_send_asconf_del_ip() and we hold + * the socket lock in that code patch, so that address list + * can't change. 
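sctp_raw_to_bind_addrs() above walks a packed run of type-length-value address parameters, advancing by each parameter's 16-bit, network-order length field. Stripped of the SCTP address types, the traversal looks like the sketch below; unlike the kernel loop, which trusts earlier validation, the sketch also bounds-checks each length against the remaining buffer.

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <arpa/inet.h>          /* ntohs() */

#define TLV_HDR_LEN 4           /* 2-byte type + 2-byte length */

/* Count well-formed parameters in a buffer; return -1 on a malformed one. */
static int count_params(const unsigned char *p, size_t remaining)
{
	int n = 0;

	while (remaining >= TLV_HDR_LEN) {
		uint16_t len_n;
		size_t len;

		memcpy(&len_n, p + 2, sizeof(len_n));   /* length field */
		len = ntohs(len_n);
		if (len < TLV_HDR_LEN || len > remaining)
			return -1;                      /* truncated/garbled */
		p += len;
		remaining -= len;
		n++;
	}
	return remaining ? -1 : n;
}

SCTP parameters are padded to 4-byte boundaries on the wire, but the IPv4 (8-byte) and IPv6 (20-byte) address parameters handled here are already aligned, which is why the loop can advance by the raw length without rounding.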
+ */ + list_for_each_entry(laddr, &bp->address_list, list) { + addr_buf = (union sctp_addr *)addrs; + for (i = 0; i < addrcnt; i++) { + addr = (union sctp_addr *)addr_buf; + af = sctp_get_af_specific(addr->v4.sin_family); + if (!af) + break; + + if (opt->pf->cmp_addr(&laddr->a, addr, opt)) + break; + + addr_buf += af->sockaddr_len; + } + if (i == addrcnt) + return &laddr->a; + } + + return NULL; +} + +/* Copy out addresses from the global local address list. */ +static int sctp_copy_one_addr(struct sctp_bind_addr *dest, + union sctp_addr *addr, + sctp_scope_t scope, gfp_t gfp, + int flags) +{ + int error = 0; + + if (sctp_is_any(NULL, addr)) { + error = sctp_copy_local_addr_list(dest, scope, gfp, flags); + } else if (sctp_in_scope(addr, scope)) { + /* Now that the address is in scope, check to see if + * the address type is supported by local sock as + * well as the remote peer. + */ + if ((((AF_INET == addr->sa.sa_family) && + (flags & SCTP_ADDR4_PEERSUPP))) || + (((AF_INET6 == addr->sa.sa_family) && + (flags & SCTP_ADDR6_ALLOWED) && + (flags & SCTP_ADDR6_PEERSUPP)))) + error = sctp_add_bind_addr(dest, addr, SCTP_ADDR_SRC, + gfp); + } + + return error; +} + +/* Is this a wildcard address? */ +int sctp_is_any(struct sock *sk, const union sctp_addr *addr) +{ + unsigned short fam = 0; + struct sctp_af *af; + + /* Try to get the right address family */ + if (addr->sa.sa_family != AF_UNSPEC) + fam = addr->sa.sa_family; + else if (sk) + fam = sk->sk_family; + + af = sctp_get_af_specific(fam); + if (!af) + return 0; + + return af->is_any(addr); +} + +/* Is 'addr' valid for 'scope'? */ +int sctp_in_scope(const union sctp_addr *addr, sctp_scope_t scope) +{ + sctp_scope_t addr_scope = sctp_scope(addr); + + /* The unusable SCTP addresses will not be considered with + * any defined scopes. + */ + if (SCTP_SCOPE_UNUSABLE == addr_scope) + return 0; + /* + * For INIT and INIT-ACK address list, let L be the level of + * of requested destination address, sender and receiver + * SHOULD include all of its addresses with level greater + * than or equal to L. + * + * Address scoping can be selectively controlled via sysctl + * option + */ + switch (sctp_scope_policy) { + case SCTP_SCOPE_POLICY_DISABLE: + return 1; + case SCTP_SCOPE_POLICY_ENABLE: + if (addr_scope <= scope) + return 1; + break; + case SCTP_SCOPE_POLICY_PRIVATE: + if (addr_scope <= scope || SCTP_SCOPE_PRIVATE == addr_scope) + return 1; + break; + case SCTP_SCOPE_POLICY_LINK: + if (addr_scope <= scope || SCTP_SCOPE_LINK == addr_scope) + return 1; + break; + default: + break; + } + + return 0; +} + +/******************************************************************** + * 3rd Level Abstractions + ********************************************************************/ + +/* What is the scope of 'addr'? */ +sctp_scope_t sctp_scope(const union sctp_addr *addr) +{ + struct sctp_af *af; + + af = sctp_get_af_specific(addr->sa.sa_family); + if (!af) + return SCTP_SCOPE_UNUSABLE; + + return af->scope((union sctp_addr *)addr); +} diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c new file mode 100644 index 00000000..6c855645 --- /dev/null +++ b/net/sctp/chunk.c @@ -0,0 +1,364 @@ +/* SCTP kernel implementation + * (C) Copyright IBM Corp. 2003, 2004 + * + * This file is part of the SCTP kernel implementation + * + * This file contains the code relating the chunk abstraction. 
+ * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * Jon Grimm + * Sridhar Samudrala + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* This file is mostly in anticipation of future work, but initially + * populate with fragment tracking for an outbound message. + */ + +/* Initialize datamsg from memory. */ +static void sctp_datamsg_init(struct sctp_datamsg *msg) +{ + atomic_set(&msg->refcnt, 1); + msg->send_failed = 0; + msg->send_error = 0; + msg->can_abandon = 0; + msg->can_delay = 1; + msg->expires_at = 0; + INIT_LIST_HEAD(&msg->chunks); +} + +/* Allocate and initialize datamsg. */ +SCTP_STATIC struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp) +{ + struct sctp_datamsg *msg; + msg = kmalloc(sizeof(struct sctp_datamsg), gfp); + if (msg) { + sctp_datamsg_init(msg); + SCTP_DBG_OBJCNT_INC(datamsg); + } + return msg; +} + +void sctp_datamsg_free(struct sctp_datamsg *msg) +{ + struct sctp_chunk *chunk; + + /* This doesn't have to be a _safe vairant because + * sctp_chunk_free() only drops the refs. + */ + list_for_each_entry(chunk, &msg->chunks, frag_list) + sctp_chunk_free(chunk); + + sctp_datamsg_put(msg); +} + +/* Final destructruction of datamsg memory. */ +static void sctp_datamsg_destroy(struct sctp_datamsg *msg) +{ + struct list_head *pos, *temp; + struct sctp_chunk *chunk; + struct sctp_sock *sp; + struct sctp_ulpevent *ev; + struct sctp_association *asoc = NULL; + int error = 0, notify; + + /* If we failed, we may need to notify. */ + notify = msg->send_failed ? -1 : 0; + + /* Release all references. */ + list_for_each_safe(pos, temp, &msg->chunks) { + list_del_init(pos); + chunk = list_entry(pos, struct sctp_chunk, frag_list); + /* Check whether we _really_ need to notify. */ + if (notify < 0) { + asoc = chunk->asoc; + if (msg->send_error) + error = msg->send_error; + else + error = asoc->outqueue.error; + + sp = sctp_sk(asoc->base.sk); + notify = sctp_ulpevent_type_enabled(SCTP_SEND_FAILED, + &sp->subscribe); + } + + /* Generate a SEND FAILED event only if enabled. 
*/ + if (notify > 0) { + int sent; + if (chunk->has_tsn) + sent = SCTP_DATA_SENT; + else + sent = SCTP_DATA_UNSENT; + + ev = sctp_ulpevent_make_send_failed(asoc, chunk, sent, + error, GFP_ATOMIC); + if (ev) + sctp_ulpq_tail_event(&asoc->ulpq, ev); + } + + sctp_chunk_put(chunk); + } + + SCTP_DBG_OBJCNT_DEC(datamsg); + kfree(msg); +} + +/* Hold a reference. */ +static void sctp_datamsg_hold(struct sctp_datamsg *msg) +{ + atomic_inc(&msg->refcnt); +} + +/* Release a reference. */ +void sctp_datamsg_put(struct sctp_datamsg *msg) +{ + if (atomic_dec_and_test(&msg->refcnt)) + sctp_datamsg_destroy(msg); +} + +/* Assign a chunk to this datamsg. */ +static void sctp_datamsg_assign(struct sctp_datamsg *msg, struct sctp_chunk *chunk) +{ + sctp_datamsg_hold(msg); + chunk->msg = msg; +} + + +/* A data chunk can have a maximum payload of (2^16 - 20). Break + * down any such message into smaller chunks. Opportunistically, fragment + * the chunks down to the current MTU constraints. We may get refragmented + * later if the PMTU changes, but it is _much better_ to fragment immediately + * with a reasonable guess than always doing our fragmentation on the + * soft-interrupt. + */ +struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, + struct sctp_sndrcvinfo *sinfo, + struct msghdr *msgh, int msg_len) +{ + int max, whole, i, offset, over, err; + int len, first_len; + int max_data; + struct sctp_chunk *chunk; + struct sctp_datamsg *msg; + struct list_head *pos, *temp; + __u8 frag; + + msg = sctp_datamsg_new(GFP_KERNEL); + if (!msg) + return NULL; + + /* Note: Calculate this outside of the loop, so that all fragments + * have the same expiration. + */ + if (sinfo->sinfo_timetolive) { + /* sinfo_timetolive is in milliseconds */ + msg->expires_at = jiffies + + msecs_to_jiffies(sinfo->sinfo_timetolive); + msg->can_abandon = 1; + SCTP_DEBUG_PRINTK("%s: msg:%p expires_at: %ld jiffies:%ld\n", + __func__, msg, msg->expires_at, jiffies); + } + + /* This is the biggest possible DATA chunk that can fit into + * the packet + */ + max_data = asoc->pathmtu - + sctp_sk(asoc->base.sk)->pf->af->net_header_len - + sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk); + + max = asoc->frag_point; + /* If the the peer requested that we authenticate DATA chunks + * we need to accound for bundling of the AUTH chunks along with + * DATA. + */ + if (sctp_auth_send_cid(SCTP_CID_DATA, asoc)) { + struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc); + + if (hmac_desc) + max_data -= WORD_ROUND(sizeof(sctp_auth_chunk_t) + + hmac_desc->hmac_len); + } + + /* Now, check if we need to reduce our max */ + if (max > max_data) + max = max_data; + + whole = 0; + first_len = max; + + /* Check to see if we have a pending SACK and try to let it be bundled + * with this message. Do this if we don't have any data queued already. + * To check that, look at out_qlen and retransmit list. + * NOTE: we will not reduce to account for SACK, if the message would + * not have been fragmented. + */ + if (timer_pending(&asoc->timers[SCTP_EVENT_TIMEOUT_SACK]) && + asoc->outqueue.out_qlen == 0 && + list_empty(&asoc->outqueue.retransmit) && + msg_len > max) + max_data -= WORD_ROUND(sizeof(sctp_sack_chunk_t)); + + /* Encourage Cookie-ECHO bundling. 
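The arithmetic above budgets how much user data fits into a single DATA chunk: the path MTU minus the network header, the SCTP common header and the DATA chunk header, reduced further when an AUTH chunk (and possibly a bundled SACK or COOKIE-ECHO) must share the packet. A stand-alone illustration with assumed IPv4 sizes, not taken from the patch:

#include <stdio.h>

int main(void)
{
	/* Assumed sizes for the sketch (IPv4, no options). */
	const int pmtu       = 1500;
	const int ip_hdr     = 20;   /* IPv4 header                */
	const int sctp_hdr   = 12;   /* SCTP common header         */
	const int data_chunk = 16;   /* DATA chunk header          */
	const int auth_chunk = 8;    /* AUTH chunk header, no HMAC */
	const int sha1_hmac  = 20;   /* HMAC-SHA-1 digest length   */

	int max_data = pmtu - ip_hdr - sctp_hdr - data_chunk;
	/* When DATA must be authenticated, the AUTH chunk rides along,
	 * padded to a 4-byte boundary like WORD_ROUND in the patch. */
	int auth_overhead = (auth_chunk + sha1_hmac + 3) & ~3;

	printf("payload per fragment: %d (plain), %d (with AUTH)\n",
	       max_data, max_data - auth_overhead);
	return 0;
}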
*/ + if (asoc->state < SCTP_STATE_COOKIE_ECHOED) + max_data -= SCTP_ARBITRARY_COOKIE_ECHO_LEN; + + /* Now that we adjusted completely, reset first_len */ + if (first_len > max_data) + first_len = max_data; + + /* Account for a different sized first fragment */ + if (msg_len >= first_len) { + msg_len -= first_len; + whole = 1; + msg->can_delay = 0; + } + + /* How many full sized? How many bytes leftover? */ + whole += msg_len / max; + over = msg_len % max; + offset = 0; + + if ((whole > 1) || (whole && over)) + SCTP_INC_STATS_USER(SCTP_MIB_FRAGUSRMSGS); + + /* Create chunks for all the full sized DATA chunks. */ + for (i=0, len=first_len; i < whole; i++) { + frag = SCTP_DATA_MIDDLE_FRAG; + + if (0 == i) + frag |= SCTP_DATA_FIRST_FRAG; + + if ((i == (whole - 1)) && !over) { + frag |= SCTP_DATA_LAST_FRAG; + + /* The application requests to set the I-bit of the + * last DATA chunk of a user message when providing + * the user message to the SCTP implementation. + */ + if ((sinfo->sinfo_flags & SCTP_EOF) || + (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY)) + frag |= SCTP_DATA_SACK_IMM; + } + + chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag, 0); + + if (!chunk) + goto errout; + err = sctp_user_addto_chunk(chunk, offset, len, msgh->msg_iov); + if (err < 0) + goto errout; + + offset += len; + + /* Put the chunk->skb back into the form expected by send. */ + __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr + - (__u8 *)chunk->skb->data); + + sctp_datamsg_assign(msg, chunk); + list_add_tail(&chunk->frag_list, &msg->chunks); + + /* The first chunk, the first chunk was likely short + * to allow bundling, so reset to full size. + */ + if (0 == i) + len = max; + } + + /* .. now the leftover bytes. */ + if (over) { + if (!whole) + frag = SCTP_DATA_NOT_FRAG; + else + frag = SCTP_DATA_LAST_FRAG; + + if ((sinfo->sinfo_flags & SCTP_EOF) || + (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY)) + frag |= SCTP_DATA_SACK_IMM; + + chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag, 0); + + if (!chunk) + goto errout; + + err = sctp_user_addto_chunk(chunk, offset, over,msgh->msg_iov); + + /* Put the chunk->skb back into the form expected by send. */ + __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr + - (__u8 *)chunk->skb->data); + if (err < 0) + goto errout; + + sctp_datamsg_assign(msg, chunk); + list_add_tail(&chunk->frag_list, &msg->chunks); + } + + return msg; + +errout: + list_for_each_safe(pos, temp, &msg->chunks) { + list_del_init(pos); + chunk = list_entry(pos, struct sctp_chunk, frag_list); + sctp_chunk_free(chunk); + } + sctp_datamsg_put(msg); + return NULL; +} + +/* Check whether this message has expired. */ +int sctp_chunk_abandoned(struct sctp_chunk *chunk) +{ + struct sctp_datamsg *msg = chunk->msg; + + if (!msg->can_abandon) + return 0; + + if (time_after(jiffies, msg->expires_at)) + return 1; + + return 0; +} + +/* This chunk (and consequently entire message) has failed in its sending. */ +void sctp_chunk_fail(struct sctp_chunk *chunk, int error) +{ + chunk->msg->send_failed = 1; + chunk->msg->send_error = error; +} diff --git a/net/sctp/command.c b/net/sctp/command.c new file mode 100644 index 00000000..c0044019 --- /dev/null +++ b/net/sctp/command.c @@ -0,0 +1,75 @@ +/* SCTP kernel implementation Copyright (C) 1999-2001 + * Cisco, Motorola, and IBM + * Copyright 2001 La Monte H.P. Yarroll + * + * This file is part of the SCTP kernel implementation + * + * These functions manipulate sctp command sequences. 
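Once the first (possibly shorter) fragment is accounted for, sctp_datamsg_from_user() above cuts the rest of the message into `whole` full-size pieces plus an `over` byte remainder and marks the pieces with FIRST/MIDDLE/LAST flags. The bookkeeping reduces to integer division, sketched here with assumed sizes:

#include <stdio.h>

/* Print how a msg_len-byte message is fragmented when the first fragment
 * may carry only first_len bytes and the rest carry max bytes each.
 * Mirrors the whole/over arithmetic in sctp_datamsg_from_user(). */
static void show_fragments(int msg_len, int first_len, int max)
{
	int whole = 0, over, i;

	if (msg_len >= first_len) {     /* dedicated first fragment */
		msg_len -= first_len;
		whole = 1;
	}
	whole += msg_len / max;         /* remaining full-size fragments */
	over = msg_len % max;           /* leftover bytes, if any */

	for (i = 0; i < whole; i++)
		printf("fragment %d: %d bytes\n", i, i ? max : first_len);
	if (over)
		printf("fragment %d: %d bytes (leftover)\n", whole, over);
}

int main(void)
{
	show_fragments(4000, 1400, 1452);       /* assumed message and sizes */
	return 0;
}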
+ * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include + +/* Initialize a block of memory as a command sequence. */ +int sctp_init_cmd_seq(sctp_cmd_seq_t *seq) +{ + memset(seq, 0, sizeof(sctp_cmd_seq_t)); + return 1; /* We always succeed. */ +} + +/* Add a command to a sctp_cmd_seq_t. + * Return 0 if the command sequence is full. + */ +void sctp_add_cmd_sf(sctp_cmd_seq_t *seq, sctp_verb_t verb, sctp_arg_t obj) +{ + BUG_ON(seq->next_free_slot >= SCTP_MAX_NUM_COMMANDS); + + seq->cmds[seq->next_free_slot].verb = verb; + seq->cmds[seq->next_free_slot++].obj = obj; +} + +/* Return the next command structure in a sctp_cmd_seq. + * Returns NULL at the end of the sequence. + */ +sctp_cmd_t *sctp_next_cmd(sctp_cmd_seq_t *seq) +{ + sctp_cmd_t *retval = NULL; + + if (seq->next_cmd < seq->next_free_slot) + retval = &seq->cmds[seq->next_cmd++]; + + return retval; +} + diff --git a/net/sctp/debug.c b/net/sctp/debug.c new file mode 100644 index 00000000..ec997cfe --- /dev/null +++ b/net/sctp/debug.c @@ -0,0 +1,183 @@ +/* SCTP kernel implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001 Intel Corp. + * + * This file is part of the SCTP kernel implementation + * + * This file converts numerical ID value to alphabetical names for SCTP + * terms such as chunk type, parameter time, event type, etc. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. 
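The command sequence in command.c above is a small fixed-capacity array with a producer cursor (next_free_slot) and a consumer cursor (next_cmd); overflowing it is treated as a programming error rather than a recoverable condition. A stand-alone equivalent with hypothetical verb and argument types:

#include <assert.h>
#include <string.h>

#define MAX_CMDS 16              /* assumed capacity for the sketch */

struct cmd     { int verb; long arg; };
struct cmd_seq { struct cmd cmds[MAX_CMDS]; int next_free_slot; int next_cmd; };

static void seq_init(struct cmd_seq *seq)
{
	memset(seq, 0, sizeof(*seq));
}

static void seq_add(struct cmd_seq *seq, int verb, long arg)
{
	assert(seq->next_free_slot < MAX_CMDS);        /* overflow is a bug */
	seq->cmds[seq->next_free_slot].verb = verb;
	seq->cmds[seq->next_free_slot++].arg = arg;
}

static struct cmd *seq_next(struct cmd_seq *seq)
{
	if (seq->next_cmd < seq->next_free_slot)
		return &seq->cmds[seq->next_cmd++];
	return NULL;                                   /* end of sequence */
}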
+ * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Xingang Guo + * Jon Grimm + * Daisy Chang + * Sridhar Samudrala + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include + +#if SCTP_DEBUG +int sctp_debug_flag = 1; /* Initially enable DEBUG */ +#endif /* SCTP_DEBUG */ + +/* These are printable forms of Chunk ID's from section 3.1. */ +static const char *const sctp_cid_tbl[SCTP_NUM_BASE_CHUNK_TYPES] = { + "DATA", + "INIT", + "INIT_ACK", + "SACK", + "HEARTBEAT", + "HEARTBEAT_ACK", + "ABORT", + "SHUTDOWN", + "SHUTDOWN_ACK", + "ERROR", + "COOKIE_ECHO", + "COOKIE_ACK", + "ECN_ECNE", + "ECN_CWR", + "SHUTDOWN_COMPLETE", +}; + +/* Lookup "chunk type" debug name. */ +const char *sctp_cname(const sctp_subtype_t cid) +{ + if (cid.chunk <= SCTP_CID_BASE_MAX) + return sctp_cid_tbl[cid.chunk]; + + switch (cid.chunk) { + case SCTP_CID_ASCONF: + return "ASCONF"; + + case SCTP_CID_ASCONF_ACK: + return "ASCONF_ACK"; + + case SCTP_CID_FWD_TSN: + return "FWD_TSN"; + + case SCTP_CID_AUTH: + return "AUTH"; + + default: + break; + } + + return "unknown chunk"; +} + +/* These are printable forms of the states. */ +const char *const sctp_state_tbl[SCTP_STATE_NUM_STATES] = { + "STATE_CLOSED", + "STATE_COOKIE_WAIT", + "STATE_COOKIE_ECHOED", + "STATE_ESTABLISHED", + "STATE_SHUTDOWN_PENDING", + "STATE_SHUTDOWN_SENT", + "STATE_SHUTDOWN_RECEIVED", + "STATE_SHUTDOWN_ACK_SENT", +}; + +/* Events that could change the state of an association. */ +const char *const sctp_evttype_tbl[] = { + "EVENT_T_unknown", + "EVENT_T_CHUNK", + "EVENT_T_TIMEOUT", + "EVENT_T_OTHER", + "EVENT_T_PRIMITIVE" +}; + +/* Return value of a state function */ +const char *const sctp_status_tbl[] = { + "DISPOSITION_DISCARD", + "DISPOSITION_CONSUME", + "DISPOSITION_NOMEM", + "DISPOSITION_DELETE_TCB", + "DISPOSITION_ABORT", + "DISPOSITION_VIOLATION", + "DISPOSITION_NOT_IMPL", + "DISPOSITION_ERROR", + "DISPOSITION_BUG" +}; + +/* Printable forms of primitives */ +static const char *const sctp_primitive_tbl[SCTP_NUM_PRIMITIVE_TYPES] = { + "PRIMITIVE_ASSOCIATE", + "PRIMITIVE_SHUTDOWN", + "PRIMITIVE_ABORT", + "PRIMITIVE_SEND", + "PRIMITIVE_REQUESTHEARTBEAT", + "PRIMITIVE_ASCONF", +}; + +/* Lookup primitive debug name. */ +const char *sctp_pname(const sctp_subtype_t id) +{ + if (id.primitive <= SCTP_EVENT_PRIMITIVE_MAX) + return sctp_primitive_tbl[id.primitive]; + return "unknown_primitive"; +} + +static const char *const sctp_other_tbl[] = { + "NO_PENDING_TSN", + "ICMP_PROTO_UNREACH", +}; + +/* Lookup "other" debug name. */ +const char *sctp_oname(const sctp_subtype_t id) +{ + if (id.other <= SCTP_EVENT_OTHER_MAX) + return sctp_other_tbl[id.other]; + return "unknown 'other' event"; +} + +static const char *const sctp_timer_tbl[] = { + "TIMEOUT_NONE", + "TIMEOUT_T1_COOKIE", + "TIMEOUT_T1_INIT", + "TIMEOUT_T2_SHUTDOWN", + "TIMEOUT_T3_RTX", + "TIMEOUT_T4_RTO", + "TIMEOUT_T5_SHUTDOWN_GUARD", + "TIMEOUT_HEARTBEAT", + "TIMEOUT_SACK", + "TIMEOUT_AUTOCLOSE", +}; + +/* Lookup timer debug name. 
*/ +const char *sctp_tname(const sctp_subtype_t id) +{ + if (id.timeout <= SCTP_EVENT_TIMEOUT_MAX) + return sctp_timer_tbl[id.timeout]; + return "unknown_timer"; +} diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c new file mode 100644 index 00000000..c8cc24e2 --- /dev/null +++ b/net/sctp/endpointola.c @@ -0,0 +1,497 @@ +/* SCTP kernel implementation + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001-2002 International Business Machines, Corp. + * Copyright (c) 2001 Intel Corp. + * Copyright (c) 2001 Nokia, Inc. + * Copyright (c) 2001 La Monte H.P. Yarroll + * + * This file is part of the SCTP kernel implementation + * + * This abstraction represents an SCTP endpoint. + * + * The SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Jon Grimm + * Daisy Chang + * Dajiang Zhang + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include +#include /* get_random_bytes() */ +#include +#include +#include +#include +#include + +/* Forward declarations for internal helpers. */ +static void sctp_endpoint_bh_rcv(struct work_struct *work); + +/* + * Initialize the base fields of the endpoint structure. + */ +static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, + struct sock *sk, + gfp_t gfp) +{ + struct sctp_hmac_algo_param *auth_hmacs = NULL; + struct sctp_chunks_param *auth_chunks = NULL; + struct sctp_shared_key *null_key; + int err; + + ep->digest = kzalloc(SCTP_SIGNATURE_SIZE, gfp); + if (!ep->digest) + return NULL; + + if (sctp_auth_enable) { + /* Allocate space for HMACS and CHUNKS authentication + * variables. There are arrays that we encode directly + * into parameters to make the rest of the operations easier. + */ + auth_hmacs = kzalloc(sizeof(sctp_hmac_algo_param_t) + + sizeof(__u16) * SCTP_AUTH_NUM_HMACS, gfp); + if (!auth_hmacs) + goto nomem; + + auth_chunks = kzalloc(sizeof(sctp_chunks_param_t) + + SCTP_NUM_CHUNK_TYPES, gfp); + if (!auth_chunks) + goto nomem; + + /* Initialize the HMACS parameter. + * SCTP-AUTH: Section 3.3 + * Every endpoint supporting SCTP chunk authentication MUST + * support the HMAC based on the SHA-1 algorithm. 
+ */ + auth_hmacs->param_hdr.type = SCTP_PARAM_HMAC_ALGO; + auth_hmacs->param_hdr.length = + htons(sizeof(sctp_paramhdr_t) + 2); + auth_hmacs->hmac_ids[0] = htons(SCTP_AUTH_HMAC_ID_SHA1); + + /* Initialize the CHUNKS parameter */ + auth_chunks->param_hdr.type = SCTP_PARAM_CHUNKS; + auth_chunks->param_hdr.length = htons(sizeof(sctp_paramhdr_t)); + + /* If the Add-IP functionality is enabled, we must + * authenticate, ASCONF and ASCONF-ACK chunks + */ + if (sctp_addip_enable) { + auth_chunks->chunks[0] = SCTP_CID_ASCONF; + auth_chunks->chunks[1] = SCTP_CID_ASCONF_ACK; + auth_chunks->param_hdr.length = + htons(sizeof(sctp_paramhdr_t) + 2); + } + } + + /* Initialize the base structure. */ + /* What type of endpoint are we? */ + ep->base.type = SCTP_EP_TYPE_SOCKET; + + /* Initialize the basic object fields. */ + atomic_set(&ep->base.refcnt, 1); + ep->base.dead = 0; + ep->base.malloced = 1; + + /* Create an input queue. */ + sctp_inq_init(&ep->base.inqueue); + + /* Set its top-half handler */ + sctp_inq_set_th_handler(&ep->base.inqueue, sctp_endpoint_bh_rcv); + + /* Initialize the bind addr area */ + sctp_bind_addr_init(&ep->base.bind_addr, 0); + + /* Remember who we are attached to. */ + ep->base.sk = sk; + sock_hold(ep->base.sk); + + /* Create the lists of associations. */ + INIT_LIST_HEAD(&ep->asocs); + + /* Use SCTP specific send buffer space queues. */ + ep->sndbuf_policy = sctp_sndbuf_policy; + + sk->sk_data_ready = sctp_data_ready; + sk->sk_write_space = sctp_write_space; + sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + + /* Get the receive buffer policy for this endpoint */ + ep->rcvbuf_policy = sctp_rcvbuf_policy; + + /* Initialize the secret key used with cookie. */ + get_random_bytes(&ep->secret_key[0], SCTP_SECRET_SIZE); + ep->last_key = ep->current_key = 0; + ep->key_changed_at = jiffies; + + /* SCTP-AUTH extensions*/ + INIT_LIST_HEAD(&ep->endpoint_shared_keys); + null_key = sctp_auth_shkey_create(0, GFP_KERNEL); + if (!null_key) + goto nomem; + + list_add(&null_key->key_list, &ep->endpoint_shared_keys); + + /* Allocate and initialize transorms arrays for suported HMACs. */ + err = sctp_auth_init_hmacs(ep, gfp); + if (err) + goto nomem_hmacs; + + /* Add the null key to the endpoint shared keys list and + * set the hmcas and chunks pointers. + */ + ep->auth_hmacs_list = auth_hmacs; + ep->auth_chunk_list = auth_chunks; + + return ep; + +nomem_hmacs: + sctp_auth_destroy_keys(&ep->endpoint_shared_keys); +nomem: + /* Free all allocations */ + kfree(auth_hmacs); + kfree(auth_chunks); + kfree(ep->digest); + return NULL; + +} + +/* Create a sctp_endpoint with all that boring stuff initialized. + * Returns NULL if there isn't enough memory. + */ +struct sctp_endpoint *sctp_endpoint_new(struct sock *sk, gfp_t gfp) +{ + struct sctp_endpoint *ep; + + /* Build a local endpoint. */ + ep = t_new(struct sctp_endpoint, gfp); + if (!ep) + goto fail; + if (!sctp_endpoint_init(ep, sk, gfp)) + goto fail_init; + ep->base.malloced = 1; + SCTP_DBG_OBJCNT_INC(ep); + return ep; + +fail_init: + kfree(ep); +fail: + return NULL; +} + +/* Add an association to an endpoint. */ +void sctp_endpoint_add_asoc(struct sctp_endpoint *ep, + struct sctp_association *asoc) +{ + struct sock *sk = ep->base.sk; + + /* If this is a temporary association, don't bother + * since we'll be removing it shortly and don't + * want anyone to find it anyway. 
+ */ + if (asoc->temp) + return; + + /* Now just add it to our list of asocs */ + list_add_tail(&asoc->asocs, &ep->asocs); + + /* Increment the backlog value for a TCP-style listening socket. */ + if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) + sk->sk_ack_backlog++; +} + +/* Free the endpoint structure. Delay cleanup until + * all users have released their reference count on this structure. + */ +void sctp_endpoint_free(struct sctp_endpoint *ep) +{ + ep->base.dead = 1; + + ep->base.sk->sk_state = SCTP_SS_CLOSED; + + /* Unlink this endpoint, so we can't find it again! */ + sctp_unhash_endpoint(ep); + + sctp_endpoint_put(ep); +} + +/* Final destructor for endpoint. */ +static void sctp_endpoint_destroy(struct sctp_endpoint *ep) +{ + SCTP_ASSERT(ep->base.dead, "Endpoint is not dead", return); + + /* Free up the HMAC transform. */ + crypto_free_hash(sctp_sk(ep->base.sk)->hmac); + + /* Free the digest buffer */ + kfree(ep->digest); + + /* SCTP-AUTH: Free up AUTH releated data such as shared keys + * chunks and hmacs arrays that were allocated + */ + sctp_auth_destroy_keys(&ep->endpoint_shared_keys); + kfree(ep->auth_hmacs_list); + kfree(ep->auth_chunk_list); + + /* AUTH - Free any allocated HMAC transform containers */ + sctp_auth_destroy_hmacs(ep->auth_hmacs); + + /* Cleanup. */ + sctp_inq_free(&ep->base.inqueue); + sctp_bind_addr_free(&ep->base.bind_addr); + + /* Remove and free the port */ + if (sctp_sk(ep->base.sk)->bind_hash) + sctp_put_port(ep->base.sk); + + /* Give up our hold on the sock. */ + if (ep->base.sk) + sock_put(ep->base.sk); + + /* Finally, free up our memory. */ + if (ep->base.malloced) { + kfree(ep); + SCTP_DBG_OBJCNT_DEC(ep); + } +} + +/* Hold a reference to an endpoint. */ +void sctp_endpoint_hold(struct sctp_endpoint *ep) +{ + atomic_inc(&ep->base.refcnt); +} + +/* Release a reference to an endpoint and clean up if there are + * no more references. + */ +void sctp_endpoint_put(struct sctp_endpoint *ep) +{ + if (atomic_dec_and_test(&ep->base.refcnt)) + sctp_endpoint_destroy(ep); +} + +/* Is this the endpoint we are looking for? */ +struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep, + const union sctp_addr *laddr) +{ + struct sctp_endpoint *retval = NULL; + + if (htons(ep->base.bind_addr.port) == laddr->v4.sin_port) { + if (sctp_bind_addr_match(&ep->base.bind_addr, laddr, + sctp_sk(ep->base.sk))) + retval = ep; + } + + return retval; +} + +/* Find the association that goes with this chunk. + * We do a linear search of the associations for this endpoint. + * We return the matching transport address too. + */ +static struct sctp_association *__sctp_endpoint_lookup_assoc( + const struct sctp_endpoint *ep, + const union sctp_addr *paddr, + struct sctp_transport **transport) +{ + struct sctp_association *asoc = NULL; + struct sctp_association *tmp; + struct sctp_transport *t = NULL; + struct sctp_hashbucket *head; + struct sctp_ep_common *epb; + struct hlist_node *node; + int hash; + int rport; + + *transport = NULL; + + /* If the local port is not set, there can't be any associations + * on this endpoint. 
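sctp_endpoint_hold() and sctp_endpoint_put() above follow the usual reference-counting pattern: every holder bumps an atomic counter, and the last put runs the destructor exactly once. The same idea in portable C11 atomics (a sketch; the kernel uses its own atomic_t helpers and a more involved teardown):

#include <stdatomic.h>
#include <stdlib.h>

struct object {
	atomic_int refcnt;
	/* ... payload ... */
};

static struct object *object_new(void)
{
	struct object *obj = calloc(1, sizeof(*obj));

	if (obj)
		atomic_init(&obj->refcnt, 1);   /* creator holds one reference */
	return obj;
}

static void object_hold(struct object *obj)
{
	atomic_fetch_add(&obj->refcnt, 1);
}

static void object_put(struct object *obj)
{
	/* fetch_sub returns the old value; 1 means we dropped the last ref */
	if (atomic_fetch_sub(&obj->refcnt, 1) == 1)
		free(obj);                      /* destructor runs exactly once */
}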
+ */ + if (!ep->base.bind_addr.port) + goto out; + + rport = ntohs(paddr->v4.sin_port); + + hash = sctp_assoc_hashfn(ep->base.bind_addr.port, rport); + head = &sctp_assoc_hashtable[hash]; + read_lock(&head->lock); + sctp_for_each_hentry(epb, node, &head->chain) { + tmp = sctp_assoc(epb); + if (tmp->ep != ep || rport != tmp->peer.port) + continue; + + t = sctp_assoc_lookup_paddr(tmp, paddr); + if (t) { + asoc = tmp; + *transport = t; + break; + } + } + read_unlock(&head->lock); +out: + return asoc; +} + +/* Lookup association on an endpoint based on a peer address. BH-safe. */ +struct sctp_association *sctp_endpoint_lookup_assoc( + const struct sctp_endpoint *ep, + const union sctp_addr *paddr, + struct sctp_transport **transport) +{ + struct sctp_association *asoc; + + sctp_local_bh_disable(); + asoc = __sctp_endpoint_lookup_assoc(ep, paddr, transport); + sctp_local_bh_enable(); + + return asoc; +} + +/* Look for any peeled off association from the endpoint that matches the + * given peer address. + */ +int sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, + const union sctp_addr *paddr) +{ + struct sctp_sockaddr_entry *addr; + struct sctp_bind_addr *bp; + + bp = &ep->base.bind_addr; + /* This function is called with the socket lock held, + * so the address_list can not change. + */ + list_for_each_entry(addr, &bp->address_list, list) { + if (sctp_has_association(&addr->a, paddr)) + return 1; + } + + return 0; +} + +/* Do delayed input processing. This is scheduled by sctp_rcv(). + * This may be called on BH or task time. + */ +static void sctp_endpoint_bh_rcv(struct work_struct *work) +{ + struct sctp_endpoint *ep = + container_of(work, struct sctp_endpoint, + base.inqueue.immediate); + struct sctp_association *asoc; + struct sock *sk; + struct sctp_transport *transport; + struct sctp_chunk *chunk; + struct sctp_inq *inqueue; + sctp_subtype_t subtype; + sctp_state_t state; + int error = 0; + int first_time = 1; /* is this the first time through the looop */ + + if (ep->base.dead) + return; + + asoc = NULL; + inqueue = &ep->base.inqueue; + sk = ep->base.sk; + + while (NULL != (chunk = sctp_inq_pop(inqueue))) { + subtype = SCTP_ST_CHUNK(chunk->chunk_hdr->type); + + /* If the first chunk in the packet is AUTH, do special + * processing specified in Section 6.3 of SCTP-AUTH spec + */ + if (first_time && (subtype.chunk == SCTP_CID_AUTH)) { + struct sctp_chunkhdr *next_hdr; + + next_hdr = sctp_inq_peek(inqueue); + if (!next_hdr) + goto normal; + + /* If the next chunk is COOKIE-ECHO, skip the AUTH + * chunk while saving a pointer to it so we can do + * Authentication later (during cookie-echo + * processing). + */ + if (next_hdr->type == SCTP_CID_COOKIE_ECHO) { + chunk->auth_chunk = skb_clone(chunk->skb, + GFP_ATOMIC); + chunk->auth = 1; + continue; + } + } +normal: + /* We might have grown an association since last we + * looked, so try again. + * + * This happens when we've just processed our + * COOKIE-ECHO chunk. + */ + if (NULL == chunk->asoc) { + asoc = sctp_endpoint_lookup_assoc(ep, + sctp_source(chunk), + &transport); + chunk->asoc = asoc; + chunk->transport = transport; + } + + state = asoc ? asoc->state : SCTP_STATE_CLOSED; + if (sctp_auth_recv_cid(subtype.chunk, asoc) && !chunk->auth) + continue; + + /* Remember where the last DATA chunk came from so we + * know where to send the SACK. 
+ */ + if (asoc && sctp_chunk_is_data(chunk)) + asoc->peer.last_data_from = chunk->transport; + else + SCTP_INC_STATS(SCTP_MIB_INCTRLCHUNKS); + + if (chunk->transport) + chunk->transport->last_time_heard = jiffies; + + error = sctp_do_sm(SCTP_EVENT_T_CHUNK, subtype, state, + ep, asoc, chunk, GFP_ATOMIC); + + if (error && chunk) + chunk->pdiscard = 1; + + /* Check to see if the endpoint is freed in response to + * the incoming chunk. If so, get out of the while loop. + */ + if (!sctp_sk(sk)->ep) + break; + + if (first_time) + first_time = 0; + } +} diff --git a/net/sctp/input.c b/net/sctp/input.c new file mode 100644 index 00000000..741ed164 --- /dev/null +++ b/net/sctp/input.c @@ -0,0 +1,1139 @@ +/* SCTP kernel implementation + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001-2003 International Business Machines, Corp. + * Copyright (c) 2001 Intel Corp. + * Copyright (c) 2001 Nokia, Inc. + * Copyright (c) 2001 La Monte H.P. Yarroll + * + * This file is part of the SCTP kernel implementation + * + * These functions handle all input from the IP layer into SCTP. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Xingang Guo + * Jon Grimm + * Hui Huang + * Daisy Chang + * Sridhar Samudrala + * Ardelle Fan + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include /* For struct list_head */ +#include +#include +#include /* For struct timeval */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Forward declarations for internal helpers. */ +static int sctp_rcv_ootb(struct sk_buff *); +static struct sctp_association *__sctp_rcv_lookup(struct sk_buff *skb, + const union sctp_addr *laddr, + const union sctp_addr *paddr, + struct sctp_transport **transportp); +static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(const union sctp_addr *laddr); +static struct sctp_association *__sctp_lookup_association( + const union sctp_addr *local, + const union sctp_addr *peer, + struct sctp_transport **pt); + +static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb); + + +/* Calculate the SCTP checksum of an SCTP packet. 
*/ +static inline int sctp_rcv_checksum(struct sk_buff *skb) +{ + struct sctphdr *sh = sctp_hdr(skb); + __le32 cmp = sh->checksum; + struct sk_buff *list; + __le32 val; + __u32 tmp = sctp_start_cksum((__u8 *)sh, skb_headlen(skb)); + + skb_walk_frags(skb, list) + tmp = sctp_update_cksum((__u8 *)list->data, skb_headlen(list), + tmp); + + val = sctp_end_cksum(tmp); + + if (val != cmp) { + /* CRC failure, dump it. */ + SCTP_INC_STATS_BH(SCTP_MIB_CHECKSUMERRORS); + return -1; + } + return 0; +} + +struct sctp_input_cb { + union { + struct inet_skb_parm h4; +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + struct inet6_skb_parm h6; +#endif + } header; + struct sctp_chunk *chunk; +}; +#define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0])) + +/* + * This is the routine which IP calls when receiving an SCTP packet. + */ +int sctp_rcv(struct sk_buff *skb) +{ + struct sock *sk; + struct sctp_association *asoc; + struct sctp_endpoint *ep = NULL; + struct sctp_ep_common *rcvr; + struct sctp_transport *transport = NULL; + struct sctp_chunk *chunk; + struct sctphdr *sh; + union sctp_addr src; + union sctp_addr dest; + int family; + struct sctp_af *af; + + if (skb->pkt_type!=PACKET_HOST) + goto discard_it; + + SCTP_INC_STATS_BH(SCTP_MIB_INSCTPPACKS); + + if (skb_linearize(skb)) + goto discard_it; + + sh = sctp_hdr(skb); + + /* Pull up the IP and SCTP headers. */ + __skb_pull(skb, skb_transport_offset(skb)); + if (skb->len < sizeof(struct sctphdr)) + goto discard_it; + if (!sctp_checksum_disable && !skb_csum_unnecessary(skb) && + sctp_rcv_checksum(skb) < 0) + goto discard_it; + + skb_pull(skb, sizeof(struct sctphdr)); + + /* Make sure we at least have chunk headers worth of data left. */ + if (skb->len < sizeof(struct sctp_chunkhdr)) + goto discard_it; + + family = ipver2af(ip_hdr(skb)->version); + af = sctp_get_af_specific(family); + if (unlikely(!af)) + goto discard_it; + + /* Initialize local addresses for lookups. */ + af->from_skb(&src, skb, 1); + af->from_skb(&dest, skb, 0); + + /* If the packet is to or from a non-unicast address, + * silently discard the packet. + * + * This is not clearly defined in the RFC except in section + * 8.4 - OOTB handling. However, based on the book "Stream Control + * Transmission Protocol" 2.1, "It is important to note that the + * IP address of an SCTP transport address must be a routable + * unicast address. In other words, IP multicast addresses and + * IP broadcast addresses cannot be used in an SCTP transport + * address." + */ + if (!af->addr_valid(&src, NULL, skb) || + !af->addr_valid(&dest, NULL, skb)) + goto discard_it; + + asoc = __sctp_rcv_lookup(skb, &src, &dest, &transport); + + if (!asoc) + ep = __sctp_rcv_lookup_endpoint(&dest); + + /* Retrieve the common input handling substructure. */ + rcvr = asoc ? &asoc->base : &ep->base; + sk = rcvr->sk; + + /* + * If a frame arrives on an interface and the receiving socket is + * bound to another interface, via SO_BINDTODEVICE, treat it as OOTB + */ + if (sk->sk_bound_dev_if && (sk->sk_bound_dev_if != af->skb_iif(skb))) + { + if (asoc) { + sctp_association_put(asoc); + asoc = NULL; + } else { + sctp_endpoint_put(ep); + ep = NULL; + } + sk = sctp_get_ctl_sock(); + ep = sctp_sk(sk)->ep; + sctp_endpoint_hold(ep); + rcvr = &ep->base; + } + + /* + * RFC 2960, 8.4 - Handle "Out of the blue" Packets. 
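sctp_rcv_checksum() recomputes the packet checksum over the SCTP common header and all chunks and compares it with the value carried in the header; RFC 4960 specifies CRC32c (the Castagnoli polynomial), computed with the checksum field set to zero. A slow but dependency-free bitwise version, for illustration only (the kernel delegates to the optimized sctp_start_cksum()/sctp_update_cksum()/sctp_end_cksum() helpers and walks the skb fragment list, as shown above):

#include <stdint.h>
#include <stddef.h>

/* Bitwise CRC32c, reflected polynomial 0x82F63B78. */
static uint32_t crc32c(const unsigned char *buf, size_t len)
{
	uint32_t crc = 0xFFFFFFFFu;
	size_t i;
	int bit;

	for (i = 0; i < len; i++) {
		crc ^= buf[i];
		for (bit = 0; bit < 8; bit++)
			crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1u));
	}
	return ~crc;
}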
+ * An SCTP packet is called an "out of the blue" (OOTB) + * packet if it is correctly formed, i.e., passed the + * receiver's checksum check, but the receiver is not + * able to identify the association to which this + * packet belongs. + */ + if (!asoc) { + if (sctp_rcv_ootb(skb)) { + SCTP_INC_STATS_BH(SCTP_MIB_OUTOFBLUES); + goto discard_release; + } + } + + if (!xfrm_policy_check(sk, XFRM_POLICY_IN, skb, family)) + goto discard_release; + nf_reset(skb); + + if (sk_filter(sk, skb)) + goto discard_release; + + /* Create an SCTP packet structure. */ + chunk = sctp_chunkify(skb, asoc, sk); + if (!chunk) + goto discard_release; + SCTP_INPUT_CB(skb)->chunk = chunk; + + /* Remember what endpoint is to handle this packet. */ + chunk->rcvr = rcvr; + + /* Remember the SCTP header. */ + chunk->sctp_hdr = sh; + + /* Set the source and destination addresses of the incoming chunk. */ + sctp_init_addrs(chunk, &src, &dest); + + /* Remember where we came from. */ + chunk->transport = transport; + + /* Acquire access to the sock lock. Note: We are safe from other + * bottom halves on this lock, but a user may be in the lock too, + * so check if it is busy. + */ + sctp_bh_lock_sock(sk); + + if (sk != rcvr->sk) { + /* Our cached sk is different from the rcvr->sk. This is + * because migrate()/accept() may have moved the association + * to a new socket and released all the sockets. So now we + * are holding a lock on the old socket while the user may + * be doing something with the new socket. Switch our veiw + * of the current sk. + */ + sctp_bh_unlock_sock(sk); + sk = rcvr->sk; + sctp_bh_lock_sock(sk); + } + + if (sock_owned_by_user(sk)) { + if (sctp_add_backlog(sk, skb)) { + sctp_bh_unlock_sock(sk); + sctp_chunk_free(chunk); + skb = NULL; /* sctp_chunk_free already freed the skb */ + goto discard_release; + } + SCTP_INC_STATS_BH(SCTP_MIB_IN_PKT_BACKLOG); + } else { + SCTP_INC_STATS_BH(SCTP_MIB_IN_PKT_SOFTIRQ); + sctp_inq_push(&chunk->rcvr->inqueue, chunk); + } + + sctp_bh_unlock_sock(sk); + + /* Release the asoc/ep ref we took in the lookup calls. */ + if (asoc) + sctp_association_put(asoc); + else + sctp_endpoint_put(ep); + + return 0; + +discard_it: + SCTP_INC_STATS_BH(SCTP_MIB_IN_PKT_DISCARDS); + kfree_skb(skb); + return 0; + +discard_release: + /* Release the asoc/ep ref we took in the lookup calls. */ + if (asoc) + sctp_association_put(asoc); + else + sctp_endpoint_put(ep); + + goto discard_it; +} + +/* Process the backlog queue of the socket. Every skb on + * the backlog holds a ref on an association or endpoint. + * We hold this ref throughout the state machine to make + * sure that the structure we need is still around. + */ +int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; + struct sctp_inq *inqueue = &chunk->rcvr->inqueue; + struct sctp_ep_common *rcvr = NULL; + int backloged = 0; + + rcvr = chunk->rcvr; + + /* If the rcvr is dead then the association or endpoint + * has been deleted and we can safely drop the chunk + * and refs that we are holding. + */ + if (rcvr->dead) { + sctp_chunk_free(chunk); + goto done; + } + + if (unlikely(rcvr->sk != sk)) { + /* In this case, the association moved from one socket to + * another. We are currently sitting on the backlog of the + * old socket, so we need to move. + * However, since we are here in the process context we + * need to take make sure that the user doesn't own + * the new socket when we process the packet. 
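The sock_owned_by_user() handling in sctp_rcv() and sctp_backlog_rcv() follows the usual socket-backlog discipline: softirq work is either processed immediately or parked until the lock owner lets go. A rough standalone sketch of that idea with a pthread mutex (illustration only, hypothetical names, LIFO order for brevity):

#include <pthread.h>

struct item { struct item *next; int data; };

struct owner {
	pthread_mutex_t lock;	/* plays the role of the socket lock   */
	int owned_by_user;	/* set while a "user" holds the owner   */
	struct item *backlog;	/* parked work, drained on release      */
};

/* Deliver one item: process it now if the owner is free, otherwise
 * park it on the backlog.
 */
static void deliver(struct owner *o, struct item *it,
		    void (*process)(struct item *))
{
	pthread_mutex_lock(&o->lock);
	if (o->owned_by_user) {
		it->next = o->backlog;	/* park for later */
		o->backlog = it;
	} else {
		process(it);		/* handle immediately */
	}
	pthread_mutex_unlock(&o->lock);
}

/* Called when the user releases the owner: drain everything parked. */
static void release(struct owner *o, void (*process)(struct item *))
{
	pthread_mutex_lock(&o->lock);
	o->owned_by_user = 0;
	while (o->backlog) {
		struct item *it = o->backlog;

		o->backlog = it->next;
		process(it);
	}
	pthread_mutex_unlock(&o->lock);
}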
+ * If the new socket is user-owned, queue the chunk to the + * backlog of the new socket without dropping any refs. + * Otherwise, we can safely push the chunk on the inqueue. + */ + + sk = rcvr->sk; + sctp_bh_lock_sock(sk); + + if (sock_owned_by_user(sk)) { + if (sk_add_backlog(sk, skb)) + sctp_chunk_free(chunk); + else + backloged = 1; + } else + sctp_inq_push(inqueue, chunk); + + sctp_bh_unlock_sock(sk); + + /* If the chunk was backloged again, don't drop refs */ + if (backloged) + return 0; + } else { + sctp_inq_push(inqueue, chunk); + } + +done: + /* Release the refs we took in sctp_add_backlog */ + if (SCTP_EP_TYPE_ASSOCIATION == rcvr->type) + sctp_association_put(sctp_assoc(rcvr)); + else if (SCTP_EP_TYPE_SOCKET == rcvr->type) + sctp_endpoint_put(sctp_ep(rcvr)); + else + BUG(); + + return 0; +} + +static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb) +{ + struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; + struct sctp_ep_common *rcvr = chunk->rcvr; + int ret; + + ret = sk_add_backlog(sk, skb); + if (!ret) { + /* Hold the assoc/ep while hanging on the backlog queue. + * This way, we know structures we need will not disappear + * from us + */ + if (SCTP_EP_TYPE_ASSOCIATION == rcvr->type) + sctp_association_hold(sctp_assoc(rcvr)); + else if (SCTP_EP_TYPE_SOCKET == rcvr->type) + sctp_endpoint_hold(sctp_ep(rcvr)); + else + BUG(); + } + return ret; + +} + +/* Handle icmp frag needed error. */ +void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, + struct sctp_transport *t, __u32 pmtu) +{ + if (!t || (t->pathmtu <= pmtu)) + return; + + if (sock_owned_by_user(sk)) { + asoc->pmtu_pending = 1; + t->pmtu_pending = 1; + return; + } + + if (t->param_flags & SPP_PMTUD_ENABLE) { + /* Update transports view of the MTU */ + sctp_transport_update_pmtu(t, pmtu); + + /* Update association pmtu. */ + sctp_assoc_sync_pmtu(asoc); + } + + /* Retransmit with the new pmtu setting. + * Normally, if PMTU discovery is disabled, an ICMP Fragmentation + * Needed will never be sent, but if a message was sent before + * PMTU discovery was disabled that was larger than the PMTU, it + * would not be fragmented, so it must be re-transmitted fragmented. + */ + sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD); +} + +/* + * SCTP Implementer's Guide, 2.37 ICMP handling procedures + * + * ICMP8) If the ICMP code is a "Unrecognized next header type encountered" + * or a "Protocol Unreachable" treat this message as an abort + * with the T bit set. + * + * This function sends an event to the state machine, which will abort the + * association. + * + */ +void sctp_icmp_proto_unreachable(struct sock *sk, + struct sctp_association *asoc, + struct sctp_transport *t) +{ + SCTP_DEBUG_PRINTK("%s\n", __func__); + + if (sock_owned_by_user(sk)) { + if (timer_pending(&t->proto_unreach_timer)) + return; + else { + if (!mod_timer(&t->proto_unreach_timer, + jiffies + (HZ/20))) + sctp_association_hold(asoc); + } + + } else { + if (timer_pending(&t->proto_unreach_timer) && + del_timer(&t->proto_unreach_timer)) + sctp_association_put(asoc); + + sctp_do_sm(SCTP_EVENT_T_OTHER, + SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH), + asoc->state, asoc->ep, asoc, t, + GFP_ATOMIC); + } +} + +/* Common lookup code for icmp/icmpv6 error handler. 
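For illustration only, the effect of the PMTU update performed through sctp_icmp_frag_needed() can be summarized as a clamp. The 512-byte floor is an assumption of this sketch (mirroring the minimum segment size used by this tree), not a quote of the kernel code:

/* Accept an ICMP-reported MTU only if it actually shrinks the path MTU,
 * and never drop below the minimum segment size.
 */
static unsigned int clamp_pmtu(unsigned int cur_pmtu, unsigned int reported)
{
	const unsigned int floor = 512;	/* assumed minimum segment size */

	if (reported >= cur_pmtu)
		return cur_pmtu;	/* not a reduction: keep current */
	return reported < floor ? floor : reported;
}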
*/ +struct sock *sctp_err_lookup(int family, struct sk_buff *skb, + struct sctphdr *sctphdr, + struct sctp_association **app, + struct sctp_transport **tpp) +{ + union sctp_addr saddr; + union sctp_addr daddr; + struct sctp_af *af; + struct sock *sk = NULL; + struct sctp_association *asoc; + struct sctp_transport *transport = NULL; + struct sctp_init_chunk *chunkhdr; + __u32 vtag = ntohl(sctphdr->vtag); + int len = skb->len - ((void *)sctphdr - (void *)skb->data); + + *app = NULL; *tpp = NULL; + + af = sctp_get_af_specific(family); + if (unlikely(!af)) { + return NULL; + } + + /* Initialize local addresses for lookups. */ + af->from_skb(&saddr, skb, 1); + af->from_skb(&daddr, skb, 0); + + /* Look for an association that matches the incoming ICMP error + * packet. + */ + asoc = __sctp_lookup_association(&saddr, &daddr, &transport); + if (!asoc) + return NULL; + + sk = asoc->base.sk; + + /* RFC 4960, Appendix C. ICMP Handling + * + * ICMP6) An implementation MUST validate that the Verification Tag + * contained in the ICMP message matches the Verification Tag of + * the peer. If the Verification Tag is not 0 and does NOT + * match, discard the ICMP message. If it is 0 and the ICMP + * message contains enough bytes to verify that the chunk type is + * an INIT chunk and that the Initiate Tag matches the tag of the + * peer, continue with ICMP7. If the ICMP message is too short + * or the chunk type or the Initiate Tag does not match, silently + * discard the packet. + */ + if (vtag == 0) { + chunkhdr = (struct sctp_init_chunk *)((void *)sctphdr + + sizeof(struct sctphdr)); + if (len < sizeof(struct sctphdr) + sizeof(sctp_chunkhdr_t) + + sizeof(__be32) || + chunkhdr->chunk_hdr.type != SCTP_CID_INIT || + ntohl(chunkhdr->init_hdr.init_tag) != asoc->c.my_vtag) { + goto out; + } + } else if (vtag != asoc->c.peer_vtag) { + goto out; + } + + sctp_bh_lock_sock(sk); + + /* If too many ICMPs get dropped on busy + * servers this needs to be solved differently. + */ + if (sock_owned_by_user(sk)) + NET_INC_STATS_BH(&init_net, LINUX_MIB_LOCKDROPPEDICMPS); + + *app = asoc; + *tpp = transport; + return sk; + +out: + if (asoc) + sctp_association_put(asoc); + return NULL; +} + +/* Common cleanup code for icmp/icmpv6 error handler. */ +void sctp_err_finish(struct sock *sk, struct sctp_association *asoc) +{ + sctp_bh_unlock_sock(sk); + if (asoc) + sctp_association_put(asoc); +} + +/* + * This routine is called by the ICMP module when it gets some + * sort of error condition. If err < 0 then the socket should + * be closed and the error returned to the user. If err > 0 + * it's just the icmp type << 8 | icmp code. After adjustment + * header points to the first 8 bytes of the sctp header. We need + * to find the appropriate port. + * + * The locking strategy used here is very "optimistic". When + * someone else accesses the socket the ICMP is just dropped + * and for some paths there is no check at all. + * A more general error queue to queue errors for later handling + * is probably better. 
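The RFC 4960 Appendix C verification-tag rule applied in sctp_err_lookup() can be stated as a small pure function. A sketch with a hypothetical helper name, not kernel code:

#include <stdint.h>

/* Return 1 if an ICMP-embedded SCTP header may be matched to an
 * association: a non-zero vtag must equal the peer's tag; vtag 0 is
 * only acceptable when the embedded chunk is an INIT whose Initiate
 * Tag equals our own tag.
 */
static int icmp_vtag_acceptable(uint32_t vtag, uint32_t peer_vtag,
				int embeds_init, uint32_t init_tag,
				uint32_t my_vtag)
{
	if (vtag != 0)
		return vtag == peer_vtag;
	return embeds_init && init_tag == my_vtag;
}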
+ * + */ +void sctp_v4_err(struct sk_buff *skb, __u32 info) +{ + const struct iphdr *iph = (const struct iphdr *)skb->data; + const int ihlen = iph->ihl * 4; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; + struct sock *sk; + struct sctp_association *asoc = NULL; + struct sctp_transport *transport; + struct inet_sock *inet; + sk_buff_data_t saveip, savesctp; + int err; + + if (skb->len < ihlen + 8) { + ICMP_INC_STATS_BH(&init_net, ICMP_MIB_INERRORS); + return; + } + + /* Fix up skb to look at the embedded net header. */ + saveip = skb->network_header; + savesctp = skb->transport_header; + skb_reset_network_header(skb); + skb_set_transport_header(skb, ihlen); + sk = sctp_err_lookup(AF_INET, skb, sctp_hdr(skb), &asoc, &transport); + /* Put back, the original values. */ + skb->network_header = saveip; + skb->transport_header = savesctp; + if (!sk) { + ICMP_INC_STATS_BH(&init_net, ICMP_MIB_INERRORS); + return; + } + /* Warning: The sock lock is held. Remember to call + * sctp_err_finish! + */ + + switch (type) { + case ICMP_PARAMETERPROB: + err = EPROTO; + break; + case ICMP_DEST_UNREACH: + if (code > NR_ICMP_UNREACH) + goto out_unlock; + + /* PMTU discovery (RFC1191) */ + if (ICMP_FRAG_NEEDED == code) { + sctp_icmp_frag_needed(sk, asoc, transport, info); + goto out_unlock; + } + else { + if (ICMP_PROT_UNREACH == code) { + sctp_icmp_proto_unreachable(sk, asoc, + transport); + goto out_unlock; + } + } + err = icmp_err_convert[code].errno; + break; + case ICMP_TIME_EXCEEDED: + /* Ignore any time exceeded errors due to fragment reassembly + * timeouts. + */ + if (ICMP_EXC_FRAGTIME == code) + goto out_unlock; + + err = EHOSTUNREACH; + break; + default: + goto out_unlock; + } + + inet = inet_sk(sk); + if (!sock_owned_by_user(sk) && inet->recverr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } else { /* Only an error on timeout */ + sk->sk_err_soft = err; + } + +out_unlock: + sctp_err_finish(sk, asoc); +} + +/* + * RFC 2960, 8.4 - Handle "Out of the blue" Packets. + * + * This function scans all the chunks in the OOTB packet to determine if + * the packet should be discarded right away. If a response might be needed + * for this packet, or, if further processing is possible, the packet will + * be queued to a proper inqueue for the next phase of handling. + * + * Output: + * Return 0 - If further processing is needed. + * Return 1 - If the packet can be discarded right away. + */ +static int sctp_rcv_ootb(struct sk_buff *skb) +{ + sctp_chunkhdr_t *ch; + __u8 *ch_end; + + ch = (sctp_chunkhdr_t *) skb->data; + + /* Scan through all the chunks in the packet. */ + do { + /* Break out if chunk length is less then minimal. */ + if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t)) + break; + + ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); + if (ch_end > skb_tail_pointer(skb)) + break; + + /* RFC 8.4, 2) If the OOTB packet contains an ABORT chunk, the + * receiver MUST silently discard the OOTB packet and take no + * further action. + */ + if (SCTP_CID_ABORT == ch->type) + goto discard; + + /* RFC 8.4, 6) If the packet contains a SHUTDOWN COMPLETE + * chunk, the receiver should silently discard the packet + * and take no further action. + */ + if (SCTP_CID_SHUTDOWN_COMPLETE == ch->type) + goto discard; + + /* RFC 4460, 2.11.2 + * This will discard packets with INIT chunk bundled as + * subsequent chunks in the packet. When INIT is first, + * the normal INIT processing will discard the chunk. 
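The chunk walk in sctp_rcv_ootb(), like the similar walkers later in this file, iterates a simple TLV layout: a 4-byte header whose 16-bit length covers the header, each chunk padded to a 4-byte boundary. A standalone sketch of that walk over a flat buffer (illustrative only):

#include <stdint.h>
#include <stddef.h>

/* Count the well-formed chunks in an SCTP packet body (everything after
 * the common header).  Stops at the first truncated or malformed chunk,
 * much as the kernel loops bail out when ch_end passes skb_tail_pointer().
 */
static size_t count_chunks(const uint8_t *buf, size_t len)
{
	size_t off = 0, n = 0;

	while (off + 4 <= len) {
		/* chunk length is big-endian and includes the 4-byte header */
		size_t clen = ((size_t)buf[off + 2] << 8) | buf[off + 3];

		if (clen < 4)
			break;			/* malformed */
		clen = (clen + 3) & ~(size_t)3;	/* WORD_ROUND() */
		if (off + clen > len)
			break;			/* partial chunk */
		n++;
		off += clen;
	}
	return n;
}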
+ */ + if (SCTP_CID_INIT == ch->type && (void *)ch != skb->data) + goto discard; + + ch = (sctp_chunkhdr_t *) ch_end; + } while (ch_end < skb_tail_pointer(skb)); + + return 0; + +discard: + return 1; +} + +/* Insert endpoint into the hash table. */ +static void __sctp_hash_endpoint(struct sctp_endpoint *ep) +{ + struct sctp_ep_common *epb; + struct sctp_hashbucket *head; + + epb = &ep->base; + + epb->hashent = sctp_ep_hashfn(epb->bind_addr.port); + head = &sctp_ep_hashtable[epb->hashent]; + + sctp_write_lock(&head->lock); + hlist_add_head(&epb->node, &head->chain); + sctp_write_unlock(&head->lock); +} + +/* Add an endpoint to the hash. Local BH-safe. */ +void sctp_hash_endpoint(struct sctp_endpoint *ep) +{ + sctp_local_bh_disable(); + __sctp_hash_endpoint(ep); + sctp_local_bh_enable(); +} + +/* Remove endpoint from the hash table. */ +static void __sctp_unhash_endpoint(struct sctp_endpoint *ep) +{ + struct sctp_hashbucket *head; + struct sctp_ep_common *epb; + + epb = &ep->base; + + if (hlist_unhashed(&epb->node)) + return; + + epb->hashent = sctp_ep_hashfn(epb->bind_addr.port); + + head = &sctp_ep_hashtable[epb->hashent]; + + sctp_write_lock(&head->lock); + __hlist_del(&epb->node); + sctp_write_unlock(&head->lock); +} + +/* Remove endpoint from the hash. Local BH-safe. */ +void sctp_unhash_endpoint(struct sctp_endpoint *ep) +{ + sctp_local_bh_disable(); + __sctp_unhash_endpoint(ep); + sctp_local_bh_enable(); +} + +/* Look up an endpoint. */ +static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(const union sctp_addr *laddr) +{ + struct sctp_hashbucket *head; + struct sctp_ep_common *epb; + struct sctp_endpoint *ep; + struct hlist_node *node; + int hash; + + hash = sctp_ep_hashfn(ntohs(laddr->v4.sin_port)); + head = &sctp_ep_hashtable[hash]; + read_lock(&head->lock); + sctp_for_each_hentry(epb, node, &head->chain) { + ep = sctp_ep(epb); + if (sctp_endpoint_is_match(ep, laddr)) + goto hit; + } + + ep = sctp_sk((sctp_get_ctl_sock()))->ep; + +hit: + sctp_endpoint_hold(ep); + read_unlock(&head->lock); + return ep; +} + +/* Insert association into the hash table. */ +static void __sctp_hash_established(struct sctp_association *asoc) +{ + struct sctp_ep_common *epb; + struct sctp_hashbucket *head; + + epb = &asoc->base; + + /* Calculate which chain this entry will belong to. */ + epb->hashent = sctp_assoc_hashfn(epb->bind_addr.port, asoc->peer.port); + + head = &sctp_assoc_hashtable[epb->hashent]; + + sctp_write_lock(&head->lock); + hlist_add_head(&epb->node, &head->chain); + sctp_write_unlock(&head->lock); +} + +/* Add an association to the hash. Local BH-safe. */ +void sctp_hash_established(struct sctp_association *asoc) +{ + if (asoc->temp) + return; + + sctp_local_bh_disable(); + __sctp_hash_established(asoc); + sctp_local_bh_enable(); +} + +/* Remove association from the hash table. */ +static void __sctp_unhash_established(struct sctp_association *asoc) +{ + struct sctp_hashbucket *head; + struct sctp_ep_common *epb; + + epb = &asoc->base; + + epb->hashent = sctp_assoc_hashfn(epb->bind_addr.port, + asoc->peer.port); + + head = &sctp_assoc_hashtable[epb->hashent]; + + sctp_write_lock(&head->lock); + __hlist_del(&epb->node); + sctp_write_unlock(&head->lock); +} + +/* Remove association from the hash table. Local BH-safe. */ +void sctp_unhash_established(struct sctp_association *asoc) +{ + if (asoc->temp) + return; + + sctp_local_bh_disable(); + __sctp_unhash_established(asoc); + sctp_local_bh_enable(); +} + +/* Look up an association. 
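The association lookup that follows hashes the local/peer port pair into a bucket and walks a chain. A toy version of that structure, illustrative only (the real hash function, locking, and table sizing differ):

#include <stdint.h>
#include <stddef.h>

struct assoc {
	struct assoc *next;		/* hash chain           */
	uint16_t lport, pport;		/* local and peer ports */
};

#define HASH_SIZE 64			/* must be a power of two */
static struct assoc *table[HASH_SIZE];

static unsigned int assoc_hash(uint16_t lport, uint16_t pport)
{
	return ((unsigned int)lport + pport) & (HASH_SIZE - 1);
}

/* Walk one bucket looking for an exact port-pair match, the same
 * direct-hit idea noted in the comment inside __sctp_lookup_association().
 */
static struct assoc *assoc_lookup(uint16_t lport, uint16_t pport)
{
	struct assoc *a;

	for (a = table[assoc_hash(lport, pport)]; a; a = a->next)
		if (a->lport == lport && a->pport == pport)
			return a;
	return NULL;
}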
*/ +static struct sctp_association *__sctp_lookup_association( + const union sctp_addr *local, + const union sctp_addr *peer, + struct sctp_transport **pt) +{ + struct sctp_hashbucket *head; + struct sctp_ep_common *epb; + struct sctp_association *asoc; + struct sctp_transport *transport; + struct hlist_node *node; + int hash; + + /* Optimize here for direct hit, only listening connections can + * have wildcards anyways. + */ + hash = sctp_assoc_hashfn(ntohs(local->v4.sin_port), ntohs(peer->v4.sin_port)); + head = &sctp_assoc_hashtable[hash]; + read_lock(&head->lock); + sctp_for_each_hentry(epb, node, &head->chain) { + asoc = sctp_assoc(epb); + transport = sctp_assoc_is_match(asoc, local, peer); + if (transport) + goto hit; + } + + read_unlock(&head->lock); + + return NULL; + +hit: + *pt = transport; + sctp_association_hold(asoc); + read_unlock(&head->lock); + return asoc; +} + +/* Look up an association. BH-safe. */ +SCTP_STATIC +struct sctp_association *sctp_lookup_association(const union sctp_addr *laddr, + const union sctp_addr *paddr, + struct sctp_transport **transportp) +{ + struct sctp_association *asoc; + + sctp_local_bh_disable(); + asoc = __sctp_lookup_association(laddr, paddr, transportp); + sctp_local_bh_enable(); + + return asoc; +} + +/* Is there an association matching the given local and peer addresses? */ +int sctp_has_association(const union sctp_addr *laddr, + const union sctp_addr *paddr) +{ + struct sctp_association *asoc; + struct sctp_transport *transport; + + if ((asoc = sctp_lookup_association(laddr, paddr, &transport))) { + sctp_association_put(asoc); + return 1; + } + + return 0; +} + +/* + * SCTP Implementors Guide, 2.18 Handling of address + * parameters within the INIT or INIT-ACK. + * + * D) When searching for a matching TCB upon reception of an INIT + * or INIT-ACK chunk the receiver SHOULD use not only the + * source address of the packet (containing the INIT or + * INIT-ACK) but the receiver SHOULD also use all valid + * address parameters contained within the chunk. + * + * 2.18.3 Solution description + * + * This new text clearly specifies to an implementor the need + * to look within the INIT or INIT-ACK. Any implementation that + * does not do this, may not be able to establish associations + * in certain circumstances. + * + */ +static struct sctp_association *__sctp_rcv_init_lookup(struct sk_buff *skb, + const union sctp_addr *laddr, struct sctp_transport **transportp) +{ + struct sctp_association *asoc; + union sctp_addr addr; + union sctp_addr *paddr = &addr; + struct sctphdr *sh = sctp_hdr(skb); + union sctp_params params; + sctp_init_chunk_t *init; + struct sctp_transport *transport; + struct sctp_af *af; + + /* + * This code will NOT touch anything inside the chunk--it is + * strictly READ-ONLY. + * + * RFC 2960 3 SCTP packet Format + * + * Multiple chunks can be bundled into one SCTP packet up to + * the MTU size, except for the INIT, INIT ACK, and SHUTDOWN + * COMPLETE chunks. These chunks MUST NOT be bundled with any + * other chunk in a packet. See Section 6.10 for more details + * on chunk bundling. + */ + + /* Find the start of the TLVs and the end of the chunk. This is + * the region we search for address parameters. + */ + init = (sctp_init_chunk_t *)skb->data; + + /* Walk the parameters looking for embedded addresses. */ + sctp_walk_params(params, init, init_hdr.params) { + + /* Note: Ignoring hostname addresses. 
*/ + af = sctp_get_af_specific(param_type2af(params.p->type)); + if (!af) + continue; + + af->from_addr_param(paddr, params.addr, sh->source, 0); + + asoc = __sctp_lookup_association(laddr, paddr, &transport); + if (asoc) + return asoc; + } + + return NULL; +} + +/* ADD-IP, Section 5.2 + * When an endpoint receives an ASCONF Chunk from the remote peer + * special procedures may be needed to identify the association the + * ASCONF Chunk is associated with. To properly find the association + * the following procedures SHOULD be followed: + * + * D2) If the association is not found, use the address found in the + * Address Parameter TLV combined with the port number found in the + * SCTP common header. If found proceed to rule D4. + * + * D2-ext) If more than one ASCONF Chunks are packed together, use the + * address found in the ASCONF Address Parameter TLV of each of the + * subsequent ASCONF Chunks. If found, proceed to rule D4. + */ +static struct sctp_association *__sctp_rcv_asconf_lookup( + sctp_chunkhdr_t *ch, + const union sctp_addr *laddr, + __be16 peer_port, + struct sctp_transport **transportp) +{ + sctp_addip_chunk_t *asconf = (struct sctp_addip_chunk *)ch; + struct sctp_af *af; + union sctp_addr_param *param; + union sctp_addr paddr; + + /* Skip over the ADDIP header and find the Address parameter */ + param = (union sctp_addr_param *)(asconf + 1); + + af = sctp_get_af_specific(param_type2af(param->p.type)); + if (unlikely(!af)) + return NULL; + + af->from_addr_param(&paddr, param, peer_port, 0); + + return __sctp_lookup_association(laddr, &paddr, transportp); +} + + +/* SCTP-AUTH, Section 6.3: +* If the receiver does not find a STCB for a packet containing an AUTH +* chunk as the first chunk and not a COOKIE-ECHO chunk as the second +* chunk, it MUST use the chunks after the AUTH chunk to look up an existing +* association. +* +* This means that any chunks that can help us identify the association need +* to be looked at to find this association. +*/ +static struct sctp_association *__sctp_rcv_walk_lookup(struct sk_buff *skb, + const union sctp_addr *laddr, + struct sctp_transport **transportp) +{ + struct sctp_association *asoc = NULL; + sctp_chunkhdr_t *ch; + int have_auth = 0; + unsigned int chunk_num = 1; + __u8 *ch_end; + + /* Walk through the chunks looking for AUTH or ASCONF chunks + * to help us find the association. + */ + ch = (sctp_chunkhdr_t *) skb->data; + do { + /* Break out if chunk length is less then minimal. */ + if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t)) + break; + + ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); + if (ch_end > skb_tail_pointer(skb)) + break; + + switch(ch->type) { + case SCTP_CID_AUTH: + have_auth = chunk_num; + break; + + case SCTP_CID_COOKIE_ECHO: + /* If a packet arrives containing an AUTH chunk as + * a first chunk, a COOKIE-ECHO chunk as the second + * chunk, and possibly more chunks after them, and + * the receiver does not have an STCB for that + * packet, then authentication is based on + * the contents of the COOKIE- ECHO chunk. 
+ */ + if (have_auth == 1 && chunk_num == 2) + return NULL; + break; + + case SCTP_CID_ASCONF: + if (have_auth || sctp_addip_noauth) + asoc = __sctp_rcv_asconf_lookup(ch, laddr, + sctp_hdr(skb)->source, + transportp); + default: + break; + } + + if (asoc) + break; + + ch = (sctp_chunkhdr_t *) ch_end; + chunk_num++; + } while (ch_end < skb_tail_pointer(skb)); + + return asoc; +} + +/* + * There are circumstances when we need to look inside the SCTP packet + * for information to help us find the association. Examples + * include looking inside of INIT/INIT-ACK chunks or after the AUTH + * chunks. + */ +static struct sctp_association *__sctp_rcv_lookup_harder(struct sk_buff *skb, + const union sctp_addr *laddr, + struct sctp_transport **transportp) +{ + sctp_chunkhdr_t *ch; + + ch = (sctp_chunkhdr_t *) skb->data; + + /* The code below will attempt to walk the chunk and extract + * parameter information. Before we do that, we need to verify + * that the chunk length doesn't cause overflow. Otherwise, we'll + * walk off the end. + */ + if (WORD_ROUND(ntohs(ch->length)) > skb->len) + return NULL; + + /* If this is INIT/INIT-ACK look inside the chunk too. */ + switch (ch->type) { + case SCTP_CID_INIT: + case SCTP_CID_INIT_ACK: + return __sctp_rcv_init_lookup(skb, laddr, transportp); + break; + + default: + return __sctp_rcv_walk_lookup(skb, laddr, transportp); + break; + } + + + return NULL; +} + +/* Lookup an association for an inbound skb. */ +static struct sctp_association *__sctp_rcv_lookup(struct sk_buff *skb, + const union sctp_addr *paddr, + const union sctp_addr *laddr, + struct sctp_transport **transportp) +{ + struct sctp_association *asoc; + + asoc = __sctp_lookup_association(laddr, paddr, transportp); + + /* Further lookup for INIT/INIT-ACK packets. + * SCTP Implementors Guide, 2.18 Handling of address + * parameters within the INIT or INIT-ACK. + */ + if (!asoc) + asoc = __sctp_rcv_lookup_harder(skb, laddr, transportp); + + return asoc; +} diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c new file mode 100644 index 00000000..397296fb --- /dev/null +++ b/net/sctp/inqueue.c @@ -0,0 +1,246 @@ +/* SCTP kernel implementation + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2002 International Business Machines, Corp. + * + * This file is part of the SCTP kernel implementation + * + * These functions are the methods for accessing the SCTP inqueue. + * + * An SCTP inqueue is a queue into which you push SCTP packets + * (which might be bundles or fragments of chunks) and out of which you + * pop SCTP whole chunks. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. 
+ * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include + +/* Initialize an SCTP inqueue. */ +void sctp_inq_init(struct sctp_inq *queue) +{ + INIT_LIST_HEAD(&queue->in_chunk_list); + queue->in_progress = NULL; + + /* Create a task for delivering data. */ + INIT_WORK(&queue->immediate, NULL); + + queue->malloced = 0; +} + +/* Release the memory associated with an SCTP inqueue. */ +void sctp_inq_free(struct sctp_inq *queue) +{ + struct sctp_chunk *chunk, *tmp; + + /* Empty the queue. */ + list_for_each_entry_safe(chunk, tmp, &queue->in_chunk_list, list) { + list_del_init(&chunk->list); + sctp_chunk_free(chunk); + } + + /* If there is a packet which is currently being worked on, + * free it as well. + */ + if (queue->in_progress) { + sctp_chunk_free(queue->in_progress); + queue->in_progress = NULL; + } + + if (queue->malloced) { + /* Dump the master memory segment. */ + kfree(queue); + } +} + +/* Put a new packet in an SCTP inqueue. + * We assume that packet->sctp_hdr is set and in host byte order. + */ +void sctp_inq_push(struct sctp_inq *q, struct sctp_chunk *chunk) +{ + /* Directly call the packet handling routine. */ + if (chunk->rcvr->dead) { + sctp_chunk_free(chunk); + return; + } + + /* We are now calling this either from the soft interrupt + * or from the backlog processing. + * Eventually, we should clean up inqueue to not rely + * on the BH related data structures. + */ + list_add_tail(&chunk->list, &q->in_chunk_list); + q->immediate.func(&q->immediate); +} + +/* Peek at the next chunk on the inqeue. */ +struct sctp_chunkhdr *sctp_inq_peek(struct sctp_inq *queue) +{ + struct sctp_chunk *chunk; + sctp_chunkhdr_t *ch = NULL; + + chunk = queue->in_progress; + /* If there is no more chunks in this packet, say so */ + if (chunk->singleton || + chunk->end_of_packet || + chunk->pdiscard) + return NULL; + + ch = (sctp_chunkhdr_t *)chunk->chunk_end; + + return ch; +} + + +/* Extract a chunk from an SCTP inqueue. + * + * WARNING: If you need to put the chunk on another queue, you need to + * make a shallow copy (clone) of it. + */ +struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) +{ + struct sctp_chunk *chunk; + sctp_chunkhdr_t *ch = NULL; + + /* The assumption is that we are safe to process the chunks + * at this time. + */ + + if ((chunk = queue->in_progress)) { + /* There is a packet that we have been working on. + * Any post processing work to do before we move on? + */ + if (chunk->singleton || + chunk->end_of_packet || + chunk->pdiscard) { + sctp_chunk_free(chunk); + chunk = queue->in_progress = NULL; + } else { + /* Nothing to do. Next chunk in the packet, please. */ + ch = (sctp_chunkhdr_t *) chunk->chunk_end; + + /* Force chunk->skb->data to chunk->chunk_end. */ + skb_pull(chunk->skb, + chunk->chunk_end - chunk->skb->data); + + /* Verify that we have at least chunk headers + * worth of buffer left. + */ + if (skb_headlen(chunk->skb) < sizeof(sctp_chunkhdr_t)) { + sctp_chunk_free(chunk); + chunk = queue->in_progress = NULL; + } + } + } + + /* Do we need to take the next packet out of the queue to process? 
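sctp_inq_pop() alternates between finishing the packet in progress and, as the next branch shows, pulling a fresh packet off the list. A reduced sketch of that queue discipline (hypothetical types, no locking):

#include <stddef.h>

struct packet { struct packet *next; /* raw chunks follow */ };

struct inq {
	struct packet *head, *tail;	/* packets waiting      */
	struct packet *in_progress;	/* packet being parsed  */
};

/* Push at the tail, as sctp_inq_push() does with a list_head. */
static void inq_push(struct inq *q, struct packet *p)
{
	p->next = NULL;
	if (q->tail)
		q->tail->next = p;
	else
		q->head = p;
	q->tail = p;
}

/* Return the packet currently being parsed, taking the next queued one
 * once the previous packet has been exhausted.
 */
static struct packet *inq_next(struct inq *q)
{
	if (q->in_progress)
		return q->in_progress;
	if (!q->head)
		return NULL;
	q->in_progress = q->head;
	q->head = q->head->next;
	if (!q->head)
		q->tail = NULL;
	return q->in_progress;
}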
*/ + if (!chunk) { + struct list_head *entry; + + /* Is the queue empty? */ + if (list_empty(&queue->in_chunk_list)) + return NULL; + + entry = queue->in_chunk_list.next; + chunk = queue->in_progress = + list_entry(entry, struct sctp_chunk, list); + list_del_init(entry); + + /* This is the first chunk in the packet. */ + chunk->singleton = 1; + ch = (sctp_chunkhdr_t *) chunk->skb->data; + chunk->data_accepted = 0; + } + + chunk->chunk_hdr = ch; + chunk->chunk_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); + /* In the unlikely case of an IP reassembly, the skb could be + * non-linear. If so, update chunk_end so that it doesn't go past + * the skb->tail. + */ + if (unlikely(skb_is_nonlinear(chunk->skb))) { + if (chunk->chunk_end > skb_tail_pointer(chunk->skb)) + chunk->chunk_end = skb_tail_pointer(chunk->skb); + } + skb_pull(chunk->skb, sizeof(sctp_chunkhdr_t)); + chunk->subh.v = NULL; /* Subheader is no longer valid. */ + + if (chunk->chunk_end < skb_tail_pointer(chunk->skb)) { + /* This is not a singleton */ + chunk->singleton = 0; + } else if (chunk->chunk_end > skb_tail_pointer(chunk->skb)) { + /* RFC 2960, Section 6.10 Bundling + * + * Partial chunks MUST NOT be placed in an SCTP packet. + * If the receiver detects a partial chunk, it MUST drop + * the chunk. + * + * Since the end of the chunk is past the end of our buffer + * (which contains the whole packet, we can freely discard + * the whole packet. + */ + sctp_chunk_free(chunk); + chunk = queue->in_progress = NULL; + + return NULL; + } else { + /* We are at the end of the packet, so mark the chunk + * in case we need to send a SACK. + */ + chunk->end_of_packet = 1; + } + + SCTP_DEBUG_PRINTK("+++sctp_inq_pop+++ chunk %p[%s]," + " length %d, skb->len %d\n",chunk, + sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)), + ntohs(chunk->chunk_hdr->length), chunk->skb->len); + return chunk; +} + +/* Set a top-half handler. + * + * Originally, we the top-half handler was scheduled as a BH. We now + * call the handler directly in sctp_inq_push() at a time that + * we know we are lock safe. + * The intent is that this routine will pull stuff out of the + * inqueue and process it. + */ +void sctp_inq_set_th_handler(struct sctp_inq *q, work_func_t callback) +{ + INIT_WORK(&q->immediate, callback); +} + diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c new file mode 100644 index 00000000..0bb0d7cb --- /dev/null +++ b/net/sctp/ipv6.c @@ -0,0 +1,1079 @@ +/* SCTP kernel implementation + * (C) Copyright IBM Corp. 2002, 2004 + * Copyright (c) 2001 Nokia, Inc. + * Copyright (c) 2001 La Monte H.P. Yarroll + * Copyright (c) 2002-2003 Intel Corp. + * + * This file is part of the SCTP kernel implementation + * + * SCTP over IPv6. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. 
+ * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * Le Yanqun + * Hui Huang + * La Monte H.P. Yarroll + * Sridhar Samudrala + * Jon Grimm + * Ardelle Fan + * + * Based on: + * linux/net/ipv6/tcp_ipv6.c + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static inline int sctp_v6_addr_match_len(union sctp_addr *s1, + union sctp_addr *s2); +static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr, + __be16 port); +static int sctp_v6_cmp_addr(const union sctp_addr *addr1, + const union sctp_addr *addr2); + +/* Event handler for inet6 address addition/deletion events. + * The sctp_local_addr_list needs to be protocted by a spin lock since + * multiple notifiers (say IPv4 and IPv6) may be running at the same + * time and thus corrupt the list. + * The reader side is protected with RCU. + */ +static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev, + void *ptr) +{ + struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; + struct sctp_sockaddr_entry *addr = NULL; + struct sctp_sockaddr_entry *temp; + int found = 0; + + switch (ev) { + case NETDEV_UP: + addr = kmalloc(sizeof(struct sctp_sockaddr_entry), GFP_ATOMIC); + if (addr) { + addr->a.v6.sin6_family = AF_INET6; + addr->a.v6.sin6_port = 0; + ipv6_addr_copy(&addr->a.v6.sin6_addr, &ifa->addr); + addr->a.v6.sin6_scope_id = ifa->idev->dev->ifindex; + addr->valid = 1; + spin_lock_bh(&sctp_local_addr_lock); + list_add_tail_rcu(&addr->list, &sctp_local_addr_list); + spin_unlock_bh(&sctp_local_addr_lock); + } + break; + case NETDEV_DOWN: + spin_lock_bh(&sctp_local_addr_lock); + list_for_each_entry_safe(addr, temp, + &sctp_local_addr_list, list) { + if (addr->a.sa.sa_family == AF_INET6 && + ipv6_addr_equal(&addr->a.v6.sin6_addr, + &ifa->addr)) { + found = 1; + addr->valid = 0; + list_del_rcu(&addr->list); + break; + } + } + spin_unlock_bh(&sctp_local_addr_lock); + if (found) + kfree_rcu(addr, rcu); + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block sctp_inet6addr_notifier = { + .notifier_call = sctp_inet6addr_event, +}; + +/* ICMP error handler. */ +SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct inet6_dev *idev; + struct sock *sk; + struct sctp_association *asoc; + struct sctp_transport *transport; + struct ipv6_pinfo *np; + sk_buff_data_t saveip, savesctp; + int err; + + idev = in6_dev_get(skb->dev); + + /* Fix up skb to look at the embedded net header. */ + saveip = skb->network_header; + savesctp = skb->transport_header; + skb_reset_network_header(skb); + skb_set_transport_header(skb, offset); + sk = sctp_err_lookup(AF_INET6, skb, sctp_hdr(skb), &asoc, &transport); + /* Put back, the original pointers. */ + skb->network_header = saveip; + skb->transport_header = savesctp; + if (!sk) { + ICMP6_INC_STATS_BH(dev_net(skb->dev), idev, ICMP6_MIB_INERRORS); + goto out; + } + + /* Warning: The sock lock is held. 
Remember to call + * sctp_err_finish! + */ + + switch (type) { + case ICMPV6_PKT_TOOBIG: + sctp_icmp_frag_needed(sk, asoc, transport, ntohl(info)); + goto out_unlock; + case ICMPV6_PARAMPROB: + if (ICMPV6_UNK_NEXTHDR == code) { + sctp_icmp_proto_unreachable(sk, asoc, transport); + goto out_unlock; + } + break; + default: + break; + } + + np = inet6_sk(sk); + icmpv6_err_convert(type, code, &err); + if (!sock_owned_by_user(sk) && np->recverr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } else { /* Only an error on timeout */ + sk->sk_err_soft = err; + } + +out_unlock: + sctp_err_finish(sk, asoc); +out: + if (likely(idev != NULL)) + in6_dev_put(idev); +} + +/* Based on tcp_v6_xmit() in tcp_ipv6.c. */ +static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) +{ + struct sock *sk = skb->sk; + struct ipv6_pinfo *np = inet6_sk(sk); + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + + fl6.flowi6_proto = sk->sk_protocol; + + /* Fill in the dest address from the route entry passed with the skb + * and the source address from the transport. + */ + ipv6_addr_copy(&fl6.daddr, &transport->ipaddr.v6.sin6_addr); + ipv6_addr_copy(&fl6.saddr, &transport->saddr.v6.sin6_addr); + + fl6.flowlabel = np->flow_label; + IP6_ECN_flow_xmit(sk, fl6.flowlabel); + if (ipv6_addr_type(&fl6.saddr) & IPV6_ADDR_LINKLOCAL) + fl6.flowi6_oif = transport->saddr.v6.sin6_scope_id; + else + fl6.flowi6_oif = sk->sk_bound_dev_if; + + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; + ipv6_addr_copy(&fl6.daddr, rt0->addr); + } + + SCTP_DEBUG_PRINTK("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", + __func__, skb, skb->len, + &fl6.saddr, &fl6.daddr); + + SCTP_INC_STATS(SCTP_MIB_OUTSCTPPACKS); + + if (!(transport->param_flags & SPP_PMTUD_ENABLE)) + skb->local_df = 1; + + return ip6_xmit(sk, skb, &fl6, np->opt); +} + +/* Returns the dst cache entry for the given source and destination ip + * addresses. + */ +static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, + struct flowi *fl, struct sock *sk) +{ + struct sctp_association *asoc = t->asoc; + struct dst_entry *dst = NULL; + struct flowi6 *fl6 = &fl->u.ip6; + struct sctp_bind_addr *bp; + struct sctp_sockaddr_entry *laddr; + union sctp_addr *baddr = NULL; + union sctp_addr *daddr = &t->ipaddr; + union sctp_addr dst_saddr; + __u8 matchlen = 0; + __u8 bmatchlen; + sctp_scope_t scope; + + memset(fl6, 0, sizeof(struct flowi6)); + ipv6_addr_copy(&fl6->daddr, &daddr->v6.sin6_addr); + fl6->fl6_dport = daddr->v6.sin6_port; + fl6->flowi6_proto = IPPROTO_SCTP; + if (ipv6_addr_type(&daddr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) + fl6->flowi6_oif = daddr->v6.sin6_scope_id; + + SCTP_DEBUG_PRINTK("%s: DST=%pI6 ", __func__, &fl6->daddr); + + if (asoc) + fl6->fl6_sport = htons(asoc->base.bind_addr.port); + + if (saddr) { + ipv6_addr_copy(&fl6->saddr, &saddr->v6.sin6_addr); + fl6->fl6_sport = saddr->v6.sin6_port; + SCTP_DEBUG_PRINTK("SRC=%pI6 - ", &fl6->saddr); + } + + dst = ip6_dst_lookup_flow(sk, fl6, NULL, false); + if (!asoc || saddr) + goto out; + + bp = &asoc->base.bind_addr; + scope = sctp_scope(daddr); + /* ip6_dst_lookup has filled in the fl6->saddr for us. Check + * to see if we can use it. + */ + if (!IS_ERR(dst)) { + /* Walk through the bind address list and look for a bind + * address that matches the source address of the returned dst. 
+ */ + sctp_v6_to_addr(&dst_saddr, &fl6->saddr, htons(bp->port)); + rcu_read_lock(); + list_for_each_entry_rcu(laddr, &bp->address_list, list) { + if (!laddr->valid || (laddr->state != SCTP_ADDR_SRC)) + continue; + + /* Do not compare against v4 addrs */ + if ((laddr->a.sa.sa_family == AF_INET6) && + (sctp_v6_cmp_addr(&dst_saddr, &laddr->a))) { + rcu_read_unlock(); + goto out; + } + } + rcu_read_unlock(); + /* None of the bound addresses match the source address of the + * dst. So release it. + */ + dst_release(dst); + dst = NULL; + } + + /* Walk through the bind address list and try to get the + * best source address for a given destination. + */ + rcu_read_lock(); + list_for_each_entry_rcu(laddr, &bp->address_list, list) { + if (!laddr->valid && laddr->state != SCTP_ADDR_SRC) + continue; + if ((laddr->a.sa.sa_family == AF_INET6) && + (scope <= sctp_scope(&laddr->a))) { + bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a); + if (!baddr || (matchlen < bmatchlen)) { + baddr = &laddr->a; + matchlen = bmatchlen; + } + } + } + rcu_read_unlock(); + if (baddr) { + ipv6_addr_copy(&fl6->saddr, &baddr->v6.sin6_addr); + fl6->fl6_sport = baddr->v6.sin6_port; + dst = ip6_dst_lookup_flow(sk, fl6, NULL, false); + } + +out: + if (!IS_ERR(dst)) { + struct rt6_info *rt; + rt = (struct rt6_info *)dst; + t->dst = dst; + SCTP_DEBUG_PRINTK("rt6_dst:%pI6 rt6_src:%pI6\n", + &rt->rt6i_dst.addr, &fl6->saddr); + } else { + t->dst = NULL; + SCTP_DEBUG_PRINTK("NO ROUTE\n"); + } +} + +/* Returns the number of consecutive initial bits that match in the 2 ipv6 + * addresses. + */ +static inline int sctp_v6_addr_match_len(union sctp_addr *s1, + union sctp_addr *s2) +{ + return ipv6_addr_diff(&s1->v6.sin6_addr, &s2->v6.sin6_addr); +} + +/* Fills in the source address(saddr) based on the destination address(daddr) + * and asoc's bind address list. + */ +static void sctp_v6_get_saddr(struct sctp_sock *sk, + struct sctp_transport *t, + struct flowi *fl) +{ + struct flowi6 *fl6 = &fl->u.ip6; + union sctp_addr *saddr = &t->saddr; + + SCTP_DEBUG_PRINTK("%s: asoc:%p dst:%p\n", __func__, t->asoc, t->dst); + + if (t->dst) { + saddr->v6.sin6_family = AF_INET6; + ipv6_addr_copy(&saddr->v6.sin6_addr, &fl6->saddr); + } +} + +/* Make a copy of all potential local addresses. */ +static void sctp_v6_copy_addrlist(struct list_head *addrlist, + struct net_device *dev) +{ + struct inet6_dev *in6_dev; + struct inet6_ifaddr *ifp; + struct sctp_sockaddr_entry *addr; + + rcu_read_lock(); + if ((in6_dev = __in6_dev_get(dev)) == NULL) { + rcu_read_unlock(); + return; + } + + read_lock_bh(&in6_dev->lock); + list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { + /* Add the address to the local list. */ + addr = t_new(struct sctp_sockaddr_entry, GFP_ATOMIC); + if (addr) { + addr->a.v6.sin6_family = AF_INET6; + addr->a.v6.sin6_port = 0; + ipv6_addr_copy(&addr->a.v6.sin6_addr, &ifp->addr); + addr->a.v6.sin6_scope_id = dev->ifindex; + addr->valid = 1; + INIT_LIST_HEAD(&addr->list); + list_add_tail(&addr->list, addrlist); + } + } + + read_unlock_bh(&in6_dev->lock); + rcu_read_unlock(); +} + +/* Initialize a sockaddr_storage from in incoming skb. 
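sctp_v6_addr_match_len() above counts how many leading bits two addresses share, which the routing code uses to pick the best-matching source address. A standalone equivalent over raw 16-byte addresses (sketch only):

#include <stdint.h>

/* Return the number of consecutive initial bits that match in two
 * IPv6 addresses given as raw byte arrays (0..128).
 */
static int addr_match_len(const uint8_t a[16], const uint8_t b[16])
{
	int i, bits = 0;

	for (i = 0; i < 16; i++) {
		uint8_t x = a[i] ^ b[i];

		if (!x) {
			bits += 8;
			continue;
		}
		while (!(x & 0x80)) {	/* count matching high bits */
			bits++;
			x <<= 1;
		}
		break;
	}
	return bits;
}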
*/ +static void sctp_v6_from_skb(union sctp_addr *addr,struct sk_buff *skb, + int is_saddr) +{ + void *from; + __be16 *port; + struct sctphdr *sh; + + port = &addr->v6.sin6_port; + addr->v6.sin6_family = AF_INET6; + addr->v6.sin6_flowinfo = 0; /* FIXME */ + addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif; + + sh = sctp_hdr(skb); + if (is_saddr) { + *port = sh->source; + from = &ipv6_hdr(skb)->saddr; + } else { + *port = sh->dest; + from = &ipv6_hdr(skb)->daddr; + } + ipv6_addr_copy(&addr->v6.sin6_addr, from); +} + +/* Initialize an sctp_addr from a socket. */ +static void sctp_v6_from_sk(union sctp_addr *addr, struct sock *sk) +{ + addr->v6.sin6_family = AF_INET6; + addr->v6.sin6_port = 0; + ipv6_addr_copy(&addr->v6.sin6_addr, &inet6_sk(sk)->rcv_saddr); +} + +/* Initialize sk->sk_rcv_saddr from sctp_addr. */ +static void sctp_v6_to_sk_saddr(union sctp_addr *addr, struct sock *sk) +{ + if (addr->sa.sa_family == AF_INET && sctp_sk(sk)->v4mapped) { + inet6_sk(sk)->rcv_saddr.s6_addr32[0] = 0; + inet6_sk(sk)->rcv_saddr.s6_addr32[1] = 0; + inet6_sk(sk)->rcv_saddr.s6_addr32[2] = htonl(0x0000ffff); + inet6_sk(sk)->rcv_saddr.s6_addr32[3] = + addr->v4.sin_addr.s_addr; + } else { + ipv6_addr_copy(&inet6_sk(sk)->rcv_saddr, &addr->v6.sin6_addr); + } +} + +/* Initialize sk->sk_daddr from sctp_addr. */ +static void sctp_v6_to_sk_daddr(union sctp_addr *addr, struct sock *sk) +{ + if (addr->sa.sa_family == AF_INET && sctp_sk(sk)->v4mapped) { + inet6_sk(sk)->daddr.s6_addr32[0] = 0; + inet6_sk(sk)->daddr.s6_addr32[1] = 0; + inet6_sk(sk)->daddr.s6_addr32[2] = htonl(0x0000ffff); + inet6_sk(sk)->daddr.s6_addr32[3] = addr->v4.sin_addr.s_addr; + } else { + ipv6_addr_copy(&inet6_sk(sk)->daddr, &addr->v6.sin6_addr); + } +} + +/* Initialize a sctp_addr from an address parameter. */ +static void sctp_v6_from_addr_param(union sctp_addr *addr, + union sctp_addr_param *param, + __be16 port, int iif) +{ + addr->v6.sin6_family = AF_INET6; + addr->v6.sin6_port = port; + addr->v6.sin6_flowinfo = 0; /* BUG */ + ipv6_addr_copy(&addr->v6.sin6_addr, ¶m->v6.addr); + addr->v6.sin6_scope_id = iif; +} + +/* Initialize an address parameter from a sctp_addr and return the length + * of the address parameter. + */ +static int sctp_v6_to_addr_param(const union sctp_addr *addr, + union sctp_addr_param *param) +{ + int length = sizeof(sctp_ipv6addr_param_t); + + param->v6.param_hdr.type = SCTP_PARAM_IPV6_ADDRESS; + param->v6.param_hdr.length = htons(length); + ipv6_addr_copy(¶m->v6.addr, &addr->v6.sin6_addr); + + return length; +} + +/* Initialize a sctp_addr from struct in6_addr. */ +static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr, + __be16 port) +{ + addr->sa.sa_family = AF_INET6; + addr->v6.sin6_port = port; + ipv6_addr_copy(&addr->v6.sin6_addr, saddr); +} + +/* Compare addresses exactly. + * v4-mapped-v6 is also in consideration. 
+ */ +static int sctp_v6_cmp_addr(const union sctp_addr *addr1, + const union sctp_addr *addr2) +{ + if (addr1->sa.sa_family != addr2->sa.sa_family) { + if (addr1->sa.sa_family == AF_INET && + addr2->sa.sa_family == AF_INET6 && + ipv6_addr_v4mapped(&addr2->v6.sin6_addr)) { + if (addr2->v6.sin6_port == addr1->v4.sin_port && + addr2->v6.sin6_addr.s6_addr32[3] == + addr1->v4.sin_addr.s_addr) + return 1; + } + if (addr2->sa.sa_family == AF_INET && + addr1->sa.sa_family == AF_INET6 && + ipv6_addr_v4mapped(&addr1->v6.sin6_addr)) { + if (addr1->v6.sin6_port == addr2->v4.sin_port && + addr1->v6.sin6_addr.s6_addr32[3] == + addr2->v4.sin_addr.s_addr) + return 1; + } + return 0; + } + if (!ipv6_addr_equal(&addr1->v6.sin6_addr, &addr2->v6.sin6_addr)) + return 0; + /* If this is a linklocal address, compare the scope_id. */ + if (ipv6_addr_type(&addr1->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) { + if (addr1->v6.sin6_scope_id && addr2->v6.sin6_scope_id && + (addr1->v6.sin6_scope_id != addr2->v6.sin6_scope_id)) { + return 0; + } + } + + return 1; +} + +/* Initialize addr struct to INADDR_ANY. */ +static void sctp_v6_inaddr_any(union sctp_addr *addr, __be16 port) +{ + memset(addr, 0x00, sizeof(union sctp_addr)); + addr->v6.sin6_family = AF_INET6; + addr->v6.sin6_port = port; +} + +/* Is this a wildcard address? */ +static int sctp_v6_is_any(const union sctp_addr *addr) +{ + return ipv6_addr_any(&addr->v6.sin6_addr); +} + +/* Should this be available for binding? */ +static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp) +{ + int type; + const struct in6_addr *in6 = (const struct in6_addr *)&addr->v6.sin6_addr; + + type = ipv6_addr_type(in6); + if (IPV6_ADDR_ANY == type) + return 1; + if (type == IPV6_ADDR_MAPPED) { + if (sp && !sp->v4mapped) + return 0; + if (sp && ipv6_only_sock(sctp_opt2sk(sp))) + return 0; + sctp_v6_map_v4(addr); + return sctp_get_af_specific(AF_INET)->available(addr, sp); + } + if (!(type & IPV6_ADDR_UNICAST)) + return 0; + + return ipv6_chk_addr(&init_net, in6, NULL, 0); +} + +/* This function checks if the address is a valid address to be used for + * SCTP. + * + * Output: + * Return 0 - If the address is a non-unicast or an illegal address. + * Return 1 - If the address is a unicast. + */ +static int sctp_v6_addr_valid(union sctp_addr *addr, + struct sctp_sock *sp, + const struct sk_buff *skb) +{ + int ret = ipv6_addr_type(&addr->v6.sin6_addr); + + /* Support v4-mapped-v6 address. */ + if (ret == IPV6_ADDR_MAPPED) { + /* Note: This routine is used in input, so v4-mapped-v6 + * are disallowed here when there is no sctp_sock. + */ + if (!sp || !sp->v4mapped) + return 0; + if (sp && ipv6_only_sock(sctp_opt2sk(sp))) + return 0; + sctp_v6_map_v4(addr); + return sctp_get_af_specific(AF_INET)->addr_valid(addr, sp, skb); + } + + /* Is this a non-unicast address */ + if (!(ret & IPV6_ADDR_UNICAST)) + return 0; + + return 1; +} + +/* What is the scope of 'addr'? */ +static sctp_scope_t sctp_v6_scope(union sctp_addr *addr) +{ + int v6scope; + sctp_scope_t retval; + + /* The IPv6 scope is really a set of bit fields. + * See IFA_* in . Map to a generic SCTP scope. + */ + + v6scope = ipv6_addr_scope(&addr->v6.sin6_addr); + switch (v6scope) { + case IFA_HOST: + retval = SCTP_SCOPE_LOOPBACK; + break; + case IFA_LINK: + retval = SCTP_SCOPE_LINK; + break; + case IFA_SITE: + retval = SCTP_SCOPE_PRIVATE; + break; + default: + retval = SCTP_SCOPE_GLOBAL; + break; + } + + return retval; +} + +/* Create and initialize a new sk for the socket to be returned by accept(). 
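The cross-family branch of sctp_v6_cmp_addr() boils down to recognizing the ::ffff:a.b.c.d form. A minimal check over raw bytes, with a hypothetical helper name (illustration only):

#include <stdint.h>
#include <string.h>

/* Is this 16-byte IPv6 address the v4-mapped form ::ffff:a.b.c.d of
 * the given IPv4 address?
 */
static int is_v4mapped_of(const uint8_t v6[16], const uint8_t v4[4])
{
	static const uint8_t prefix[12] = {
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff
	};

	return memcmp(v6, prefix, 12) == 0 &&
	       memcmp(v6 + 12, v4, 4) == 0;
}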
*/ +static struct sock *sctp_v6_create_accept_sk(struct sock *sk, + struct sctp_association *asoc) +{ + struct sock *newsk; + struct ipv6_pinfo *newnp, *np = inet6_sk(sk); + struct sctp6_sock *newsctp6sk; + + newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot); + if (!newsk) + goto out; + + sock_init_data(NULL, newsk); + + sctp_copy_sock(newsk, sk, asoc); + sock_reset_flag(sk, SOCK_ZAPPED); + + newsctp6sk = (struct sctp6_sock *)newsk; + inet_sk(newsk)->pinet6 = &newsctp6sk->inet6; + + sctp_sk(newsk)->v4mapped = sctp_sk(sk)->v4mapped; + + newnp = inet6_sk(newsk); + + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + /* Initialize sk's sport, dport, rcv_saddr and daddr for getsockname() + * and getpeername(). + */ + sctp_v6_to_sk_daddr(&asoc->peer.primary_addr, newsk); + + sk_refcnt_debug_inc(newsk); + + if (newsk->sk_prot->init(newsk)) { + sk_common_release(newsk); + newsk = NULL; + } + +out: + return newsk; +} + +/* Map v4 address to mapped v6 address */ +static void sctp_v6_addr_v4map(struct sctp_sock *sp, union sctp_addr *addr) +{ + if (sp->v4mapped && AF_INET == addr->sa.sa_family) + sctp_v4_map_v6(addr); +} + +/* Where did this skb come from? */ +static int sctp_v6_skb_iif(const struct sk_buff *skb) +{ + struct inet6_skb_parm *opt = (struct inet6_skb_parm *) skb->cb; + return opt->iif; +} + +/* Was this packet marked by Explicit Congestion Notification? */ +static int sctp_v6_is_ce(const struct sk_buff *skb) +{ + return *((__u32 *)(ipv6_hdr(skb))) & htonl(1 << 20); +} + +/* Dump the v6 addr to the seq file. */ +static void sctp_v6_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr) +{ + seq_printf(seq, "%pI6 ", &addr->v6.sin6_addr); +} + +static void sctp_v6_ecn_capable(struct sock *sk) +{ + inet6_sk(sk)->tclass |= INET_ECN_ECT_0; +} + +/* Initialize a PF_INET6 socket msg_name. */ +static void sctp_inet6_msgname(char *msgname, int *addr_len) +{ + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)msgname; + sin6->sin6_family = AF_INET6; + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = 0; /*FIXME */ + *addr_len = sizeof(struct sockaddr_in6); +} + +/* Initialize a PF_INET msgname from a ulpevent. */ +static void sctp_inet6_event_msgname(struct sctp_ulpevent *event, + char *msgname, int *addrlen) +{ + struct sockaddr_in6 *sin6, *sin6from; + + if (msgname) { + union sctp_addr *addr; + struct sctp_association *asoc; + + asoc = event->asoc; + sctp_inet6_msgname(msgname, addrlen); + sin6 = (struct sockaddr_in6 *)msgname; + sin6->sin6_port = htons(asoc->peer.port); + addr = &asoc->peer.primary_addr; + + /* Note: If we go to a common v6 format, this code + * will change. + */ + + /* Map ipv4 address into v4-mapped-on-v6 address. */ + if (sctp_sk(asoc->base.sk)->v4mapped && + AF_INET == addr->sa.sa_family) { + sctp_v4_map_v6((union sctp_addr *)sin6); + sin6->sin6_addr.s6_addr32[3] = + addr->v4.sin_addr.s_addr; + return; + } + + sin6from = &asoc->peer.primary_addr.v6; + ipv6_addr_copy(&sin6->sin6_addr, &sin6from->sin6_addr); + if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6->sin6_scope_id = sin6from->sin6_scope_id; + } +} + +/* Initialize a msg_name from an inbound skb. */ +static void sctp_inet6_skb_msgname(struct sk_buff *skb, char *msgname, + int *addr_len) +{ + struct sctphdr *sh; + struct sockaddr_in6 *sin6; + + if (msgname) { + sctp_inet6_msgname(msgname, addr_len); + sin6 = (struct sockaddr_in6 *)msgname; + sh = sctp_hdr(skb); + sin6->sin6_port = sh->source; + + /* Map ipv4 address into v4-mapped-on-v6 address. 
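sctp_v6_is_ce() above tests an ECN bit in the first 32-bit word of the IPv6 header (version:4, traffic class:8, flow label:20). For reference, a sketch that extracts the 2-bit ECN codepoint from those bytes (not the kernel helper):

#include <stdint.h>

/* Pull the Traffic Class out of the first four header bytes and return
 * its low two bits: 0 = Not-ECT, 1 = ECT(1), 2 = ECT(0), 3 = CE.
 */
static unsigned int ipv6_ecn(const uint8_t hdr[4])
{
	unsigned int tclass = ((hdr[0] & 0x0f) << 4) | (hdr[1] >> 4);

	return tclass & 0x3;
}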
*/ + if (sctp_sk(skb->sk)->v4mapped && + ip_hdr(skb)->version == 4) { + sctp_v4_map_v6((union sctp_addr *)sin6); + sin6->sin6_addr.s6_addr32[3] = ip_hdr(skb)->saddr; + return; + } + + /* Otherwise, just copy the v6 address. */ + ipv6_addr_copy(&sin6->sin6_addr, &ipv6_hdr(skb)->saddr); + if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) { + struct sctp_ulpevent *ev = sctp_skb2event(skb); + sin6->sin6_scope_id = ev->iif; + } + } +} + +/* Do we support this AF? */ +static int sctp_inet6_af_supported(sa_family_t family, struct sctp_sock *sp) +{ + switch (family) { + case AF_INET6: + return 1; + /* v4-mapped-v6 addresses */ + case AF_INET: + if (!__ipv6_only_sock(sctp_opt2sk(sp))) + return 1; + default: + return 0; + } +} + +/* Address matching with wildcards allowed. This extra level + * of indirection lets us choose whether a PF_INET6 should + * disallow any v4 addresses if we so choose. + */ +static int sctp_inet6_cmp_addr(const union sctp_addr *addr1, + const union sctp_addr *addr2, + struct sctp_sock *opt) +{ + struct sctp_af *af1, *af2; + struct sock *sk = sctp_opt2sk(opt); + + af1 = sctp_get_af_specific(addr1->sa.sa_family); + af2 = sctp_get_af_specific(addr2->sa.sa_family); + + if (!af1 || !af2) + return 0; + + /* If the socket is IPv6 only, v4 addrs will not match */ + if (__ipv6_only_sock(sk) && af1 != af2) + return 0; + + /* Today, wildcard AF_INET/AF_INET6. */ + if (sctp_is_any(sk, addr1) || sctp_is_any(sk, addr2)) + return 1; + + if (addr1->sa.sa_family != addr2->sa.sa_family) + return 0; + + return af1->cmp_addr(addr1, addr2); +} + +/* Verify that the provided sockaddr looks bindable. Common verification, + * has already been taken care of. + */ +static int sctp_inet6_bind_verify(struct sctp_sock *opt, union sctp_addr *addr) +{ + struct sctp_af *af; + + /* ASSERT: address family has already been verified. */ + if (addr->sa.sa_family != AF_INET6) + af = sctp_get_af_specific(addr->sa.sa_family); + else { + int type = ipv6_addr_type(&addr->v6.sin6_addr); + struct net_device *dev; + + if (type & IPV6_ADDR_LINKLOCAL) { + if (!addr->v6.sin6_scope_id) + return 0; + rcu_read_lock(); + dev = dev_get_by_index_rcu(&init_net, + addr->v6.sin6_scope_id); + if (!dev || + !ipv6_chk_addr(&init_net, &addr->v6.sin6_addr, + dev, 0)) { + rcu_read_unlock(); + return 0; + } + rcu_read_unlock(); + } else if (type == IPV6_ADDR_MAPPED) { + if (!opt->v4mapped) + return 0; + } + + af = opt->pf->af; + } + return af->available(addr, opt); +} + +/* Verify that the provided sockaddr looks sendable. Common verification, + * has already been taken care of. + */ +static int sctp_inet6_send_verify(struct sctp_sock *opt, union sctp_addr *addr) +{ + struct sctp_af *af = NULL; + + /* ASSERT: address family has already been verified. */ + if (addr->sa.sa_family != AF_INET6) + af = sctp_get_af_specific(addr->sa.sa_family); + else { + int type = ipv6_addr_type(&addr->v6.sin6_addr); + struct net_device *dev; + + if (type & IPV6_ADDR_LINKLOCAL) { + if (!addr->v6.sin6_scope_id) + return 0; + rcu_read_lock(); + dev = dev_get_by_index_rcu(&init_net, + addr->v6.sin6_scope_id); + rcu_read_unlock(); + if (!dev) + return 0; + } + af = opt->pf->af; + } + + return af != NULL; +} + +/* Fill in Supported Address Type information for INIT and INIT-ACK + * chunks. Note: In the future, we may want to look at sock options + * to determine whether a PF_INET6 socket really wants to have IPV4 + * addresses. + * Returns number of addresses supported. 
+ */ +static int sctp_inet6_supported_addrs(const struct sctp_sock *opt, + __be16 *types) +{ + types[0] = SCTP_PARAM_IPV6_ADDRESS; + if (!opt || !ipv6_only_sock(sctp_opt2sk(opt))) { + types[1] = SCTP_PARAM_IPV4_ADDRESS; + return 2; + } + return 1; +} + +static const struct proto_ops inet6_seqpacket_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = inet6_bind, + .connect = inet_dgram_connect, + .socketpair = sock_no_socketpair, + .accept = inet_accept, + .getname = inet6_getname, + .poll = sctp_poll, + .ioctl = inet6_ioctl, + .listen = sctp_inet_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, +#endif +}; + +static struct inet_protosw sctpv6_seqpacket_protosw = { + .type = SOCK_SEQPACKET, + .protocol = IPPROTO_SCTP, + .prot = &sctpv6_prot, + .ops = &inet6_seqpacket_ops, + .no_check = 0, + .flags = SCTP_PROTOSW_FLAG +}; +static struct inet_protosw sctpv6_stream_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_SCTP, + .prot = &sctpv6_prot, + .ops = &inet6_seqpacket_ops, + .no_check = 0, + .flags = SCTP_PROTOSW_FLAG, +}; + +static int sctp6_rcv(struct sk_buff *skb) +{ + return sctp_rcv(skb) ? -1 : 0; +} + +static const struct inet6_protocol sctpv6_protocol = { + .handler = sctp6_rcv, + .err_handler = sctp_v6_err, + .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, +}; + +static struct sctp_af sctp_af_inet6 = { + .sa_family = AF_INET6, + .sctp_xmit = sctp_v6_xmit, + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .get_dst = sctp_v6_get_dst, + .get_saddr = sctp_v6_get_saddr, + .copy_addrlist = sctp_v6_copy_addrlist, + .from_skb = sctp_v6_from_skb, + .from_sk = sctp_v6_from_sk, + .to_sk_saddr = sctp_v6_to_sk_saddr, + .to_sk_daddr = sctp_v6_to_sk_daddr, + .from_addr_param = sctp_v6_from_addr_param, + .to_addr_param = sctp_v6_to_addr_param, + .cmp_addr = sctp_v6_cmp_addr, + .scope = sctp_v6_scope, + .addr_valid = sctp_v6_addr_valid, + .inaddr_any = sctp_v6_inaddr_any, + .is_any = sctp_v6_is_any, + .available = sctp_v6_available, + .skb_iif = sctp_v6_skb_iif, + .is_ce = sctp_v6_is_ce, + .seq_dump_addr = sctp_v6_seq_dump_addr, + .ecn_capable = sctp_v6_ecn_capable, + .net_header_len = sizeof(struct ipv6hdr), + .sockaddr_len = sizeof(struct sockaddr_in6), +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_ipv6_setsockopt, + .compat_getsockopt = compat_ipv6_getsockopt, +#endif +}; + +static struct sctp_pf sctp_pf_inet6 = { + .event_msgname = sctp_inet6_event_msgname, + .skb_msgname = sctp_inet6_skb_msgname, + .af_supported = sctp_inet6_af_supported, + .cmp_addr = sctp_inet6_cmp_addr, + .bind_verify = sctp_inet6_bind_verify, + .send_verify = sctp_inet6_send_verify, + .supported_addrs = sctp_inet6_supported_addrs, + .create_accept_sk = sctp_v6_create_accept_sk, + .addr_v4map = sctp_v6_addr_v4map, + .af = &sctp_af_inet6, +}; + +/* Initialize IPv6 support and register with socket layer. */ +void sctp_v6_pf_init(void) +{ + /* Register the SCTP specific PF_INET6 functions. */ + sctp_register_pf(&sctp_pf_inet6, PF_INET6); + + /* Register the SCTP specific AF_INET6 functions. */ + sctp_register_af(&sctp_af_inet6); +} + +void sctp_v6_pf_exit(void) +{ + list_del(&sctp_af_inet6.list); +} + +/* Initialize IPv6 support and register with socket layer. 
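+ * Registers sctpv6_prot with the protocol core and then adds both the
+ * SOCK_SEQPACKET (UDP-style) and SOCK_STREAM (TCP-style) protosw
+ * entries to the inetsw6 list.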
*/ +int sctp_v6_protosw_init(void) +{ + int rc; + + rc = proto_register(&sctpv6_prot, 1); + if (rc) + return rc; + + /* Add SCTPv6(UDP and TCP style) to inetsw6 linked list. */ + inet6_register_protosw(&sctpv6_seqpacket_protosw); + inet6_register_protosw(&sctpv6_stream_protosw); + + return 0; +} + +void sctp_v6_protosw_exit(void) +{ + inet6_unregister_protosw(&sctpv6_seqpacket_protosw); + inet6_unregister_protosw(&sctpv6_stream_protosw); + proto_unregister(&sctpv6_prot); +} + + +/* Register with inet6 layer. */ +int sctp_v6_add_protocol(void) +{ + /* Register notifier for inet6 address additions/deletions. */ + register_inet6addr_notifier(&sctp_inet6addr_notifier); + + if (inet6_add_protocol(&sctpv6_protocol, IPPROTO_SCTP) < 0) + return -EAGAIN; + + return 0; +} + +/* Unregister with inet6 layer. */ +void sctp_v6_del_protocol(void) +{ + inet6_del_protocol(&sctpv6_protocol, IPPROTO_SCTP); + unregister_inet6addr_notifier(&sctp_inet6addr_notifier); +} diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c new file mode 100644 index 00000000..8ef8e7d9 --- /dev/null +++ b/net/sctp/objcnt.c @@ -0,0 +1,148 @@ +/* SCTP kernel implementation + * (C) Copyright IBM Corp. 2001, 2004 + * + * This file is part of the SCTP kernel implementation + * + * Support for memory object debugging. This allows one to monitor the + * object allocations/deallocations for types instrumented for this + * via the proc fs. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * Jon Grimm + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include + +/* + * Global counters to count raw object allocation counts. + * To add new counters, choose a unique suffix for the variable + * name as the helper macros key off this suffix to make + * life easier for the programmer. + */ + +SCTP_DBG_OBJCNT(sock); +SCTP_DBG_OBJCNT(ep); +SCTP_DBG_OBJCNT(transport); +SCTP_DBG_OBJCNT(assoc); +SCTP_DBG_OBJCNT(bind_addr); +SCTP_DBG_OBJCNT(bind_bucket); +SCTP_DBG_OBJCNT(chunk); +SCTP_DBG_OBJCNT(addr); +SCTP_DBG_OBJCNT(ssnmap); +SCTP_DBG_OBJCNT(datamsg); +SCTP_DBG_OBJCNT(keys); + +/* An array to make it easy to pretty print the debug information + * to the proc fs. 
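+ * Each entry pairs a label with the matching atomic counter declared
+ * above; sctp_objcnt_seq_show() prints one "label: count" record per
+ * array entry.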
+ */ +static sctp_dbg_objcnt_entry_t sctp_dbg_objcnt[] = { + SCTP_DBG_OBJCNT_ENTRY(sock), + SCTP_DBG_OBJCNT_ENTRY(ep), + SCTP_DBG_OBJCNT_ENTRY(assoc), + SCTP_DBG_OBJCNT_ENTRY(transport), + SCTP_DBG_OBJCNT_ENTRY(chunk), + SCTP_DBG_OBJCNT_ENTRY(bind_addr), + SCTP_DBG_OBJCNT_ENTRY(bind_bucket), + SCTP_DBG_OBJCNT_ENTRY(addr), + SCTP_DBG_OBJCNT_ENTRY(ssnmap), + SCTP_DBG_OBJCNT_ENTRY(datamsg), + SCTP_DBG_OBJCNT_ENTRY(keys), +}; + +/* Callback from procfs to read out objcount information. + * Walk through the entries in the sctp_dbg_objcnt array, dumping + * the raw object counts for each monitored type. + */ +static int sctp_objcnt_seq_show(struct seq_file *seq, void *v) +{ + int i, len; + + i = (int)*(loff_t *)v; + seq_printf(seq, "%s: %d%n", sctp_dbg_objcnt[i].label, + atomic_read(sctp_dbg_objcnt[i].counter), &len); + seq_printf(seq, "%*s\n", 127 - len, ""); + return 0; +} + +static void *sctp_objcnt_seq_start(struct seq_file *seq, loff_t *pos) +{ + return (*pos >= ARRAY_SIZE(sctp_dbg_objcnt)) ? NULL : (void *)pos; +} + +static void sctp_objcnt_seq_stop(struct seq_file *seq, void *v) +{ +} + +static void * sctp_objcnt_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return (*pos >= ARRAY_SIZE(sctp_dbg_objcnt)) ? NULL : (void *)pos; +} + +static const struct seq_operations sctp_objcnt_seq_ops = { + .start = sctp_objcnt_seq_start, + .next = sctp_objcnt_seq_next, + .stop = sctp_objcnt_seq_stop, + .show = sctp_objcnt_seq_show, +}; + +static int sctp_objcnt_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &sctp_objcnt_seq_ops); +} + +static const struct file_operations sctp_objcnt_ops = { + .open = sctp_objcnt_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* Initialize the objcount in the proc filesystem. */ +void sctp_dbg_objcnt_init(void) +{ + struct proc_dir_entry *ent; + + ent = proc_create("sctp_dbg_objcnt", 0, + proc_net_sctp, &sctp_objcnt_ops); + if (!ent) + pr_warn("sctp_dbg_objcnt: Unable to create /proc entry.\n"); +} + +/* Cleanup the objcount entry in the proc filesystem. */ +void sctp_dbg_objcnt_exit(void) +{ + remove_proc_entry("sctp_dbg_objcnt", proc_net_sctp); +} + + diff --git a/net/sctp/output.c b/net/sctp/output.c new file mode 100644 index 00000000..8fc4dcd2 --- /dev/null +++ b/net/sctp/output.c @@ -0,0 +1,751 @@ +/* SCTP kernel implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * + * This file is part of the SCTP kernel implementation + * + * These functions handle output processing. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. 
+ * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Jon Grimm + * Sridhar Samudrala + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* for sa_family_t */ +#include + +#include +#include +#include + +/* Forward declarations for private helpers. */ +static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet, + struct sctp_chunk *chunk); +static void sctp_packet_append_data(struct sctp_packet *packet, + struct sctp_chunk *chunk); +static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet, + struct sctp_chunk *chunk, + u16 chunk_len); + +static void sctp_packet_reset(struct sctp_packet *packet) +{ + packet->size = packet->overhead; + packet->has_cookie_echo = 0; + packet->has_sack = 0; + packet->has_data = 0; + packet->has_auth = 0; + packet->ipfragok = 0; + packet->auth = NULL; +} + +/* Config a packet. + * This appears to be a followup set of initializations. + */ +struct sctp_packet *sctp_packet_config(struct sctp_packet *packet, + __u32 vtag, int ecn_capable) +{ + struct sctp_chunk *chunk = NULL; + + SCTP_DEBUG_PRINTK("%s: packet:%p vtag:0x%x\n", __func__, + packet, vtag); + + packet->vtag = vtag; + + if (ecn_capable && sctp_packet_empty(packet)) { + chunk = sctp_get_ecne_prepend(packet->transport->asoc); + + /* If there a is a prepend chunk stick it on the list before + * any other chunks get appended. + */ + if (chunk) + sctp_packet_append_chunk(packet, chunk); + } + + return packet; +} + +/* Initialize the packet structure. */ +struct sctp_packet *sctp_packet_init(struct sctp_packet *packet, + struct sctp_transport *transport, + __u16 sport, __u16 dport) +{ + struct sctp_association *asoc = transport->asoc; + size_t overhead; + + SCTP_DEBUG_PRINTK("%s: packet:%p transport:%p\n", __func__, + packet, transport); + + packet->transport = transport; + packet->source_port = sport; + packet->destination_port = dport; + INIT_LIST_HEAD(&packet->chunk_list); + if (asoc) { + struct sctp_sock *sp = sctp_sk(asoc->base.sk); + overhead = sp->pf->af->net_header_len; + } else { + overhead = sizeof(struct ipv6hdr); + } + overhead += sizeof(struct sctphdr); + packet->overhead = overhead; + sctp_packet_reset(packet); + packet->vtag = 0; + packet->malloced = 0; + return packet; +} + +/* Free a packet. */ +void sctp_packet_free(struct sctp_packet *packet) +{ + struct sctp_chunk *chunk, *tmp; + + SCTP_DEBUG_PRINTK("%s: packet:%p\n", __func__, packet); + + list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { + list_del_init(&chunk->list); + sctp_chunk_free(chunk); + } + + if (packet->malloced) + kfree(packet); +} + +/* This routine tries to append the chunk to the offered packet. If adding + * the chunk causes the packet to exceed the path MTU and COOKIE_ECHO chunk + * is not present in the packet, it transmits the input packet. + * Data can be bundled with a packet containing a COOKIE_ECHO chunk as long + * as it can fit in the packet, but any more data that does not fit in this + * packet can be sent only after receiving the COOKIE_ACK. 
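+ *
+ * The sctp_xmit_t result of the final append attempt is returned to
+ * the caller; when 'one_packet' is set, SCTP_XMIT_PMTU_FULL is passed
+ * back rather than retrying the append.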
+ */ +sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet, + struct sctp_chunk *chunk, + int one_packet) +{ + sctp_xmit_t retval; + int error = 0; + + SCTP_DEBUG_PRINTK("%s: packet:%p chunk:%p\n", __func__, + packet, chunk); + + switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) { + case SCTP_XMIT_PMTU_FULL: + if (!packet->has_cookie_echo) { + error = sctp_packet_transmit(packet); + if (error < 0) + chunk->skb->sk->sk_err = -error; + + /* If we have an empty packet, then we can NOT ever + * return PMTU_FULL. + */ + if (!one_packet) + retval = sctp_packet_append_chunk(packet, + chunk); + } + break; + + case SCTP_XMIT_RWND_FULL: + case SCTP_XMIT_OK: + case SCTP_XMIT_NAGLE_DELAY: + break; + } + + return retval; +} + +/* Try to bundle an auth chunk into the packet. */ +static sctp_xmit_t sctp_packet_bundle_auth(struct sctp_packet *pkt, + struct sctp_chunk *chunk) +{ + struct sctp_association *asoc = pkt->transport->asoc; + struct sctp_chunk *auth; + sctp_xmit_t retval = SCTP_XMIT_OK; + + /* if we don't have an association, we can't do authentication */ + if (!asoc) + return retval; + + /* See if this is an auth chunk we are bundling or if + * auth is already bundled. + */ + if (chunk->chunk_hdr->type == SCTP_CID_AUTH || pkt->has_auth) + return retval; + + /* if the peer did not request this chunk to be authenticated, + * don't do it + */ + if (!chunk->auth) + return retval; + + auth = sctp_make_auth(asoc); + if (!auth) + return retval; + + retval = sctp_packet_append_chunk(pkt, auth); + + return retval; +} + +/* Try to bundle a SACK with the packet. */ +static sctp_xmit_t sctp_packet_bundle_sack(struct sctp_packet *pkt, + struct sctp_chunk *chunk) +{ + sctp_xmit_t retval = SCTP_XMIT_OK; + + /* If sending DATA and haven't aleady bundled a SACK, try to + * bundle one in to the packet. + */ + if (sctp_chunk_is_data(chunk) && !pkt->has_sack && + !pkt->has_cookie_echo) { + struct sctp_association *asoc; + struct timer_list *timer; + asoc = pkt->transport->asoc; + timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK]; + + /* If the SACK timer is running, we have a pending SACK */ + if (timer_pending(timer)) { + struct sctp_chunk *sack; + asoc->a_rwnd = asoc->rwnd; + sack = sctp_make_sack(asoc); + if (sack) { + retval = sctp_packet_append_chunk(pkt, sack); + asoc->peer.sack_needed = 0; + if (del_timer(timer)) + sctp_association_put(asoc); + } + } + } + return retval; +} + +/* Append a chunk to the offered packet reporting back any inability to do + * so. + */ +sctp_xmit_t sctp_packet_append_chunk(struct sctp_packet *packet, + struct sctp_chunk *chunk) +{ + sctp_xmit_t retval = SCTP_XMIT_OK; + __u16 chunk_len = WORD_ROUND(ntohs(chunk->chunk_hdr->length)); + + SCTP_DEBUG_PRINTK("%s: packet:%p chunk:%p\n", __func__, packet, + chunk); + + /* Data chunks are special. Before seeing what else we can + * bundle into this packet, check to see if we are allowed to + * send this DATA. 
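+ * The checks below run in order: congestion/rwnd limits for DATA,
+ * AUTH bundling, SACK bundling, and finally the PMTU fit test; the
+ * first check that does not return SCTP_XMIT_OK aborts the append.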
+ */ + if (sctp_chunk_is_data(chunk)) { + retval = sctp_packet_can_append_data(packet, chunk); + if (retval != SCTP_XMIT_OK) + goto finish; + } + + /* Try to bundle AUTH chunk */ + retval = sctp_packet_bundle_auth(packet, chunk); + if (retval != SCTP_XMIT_OK) + goto finish; + + /* Try to bundle SACK chunk */ + retval = sctp_packet_bundle_sack(packet, chunk); + if (retval != SCTP_XMIT_OK) + goto finish; + + /* Check to see if this chunk will fit into the packet */ + retval = sctp_packet_will_fit(packet, chunk, chunk_len); + if (retval != SCTP_XMIT_OK) + goto finish; + + /* We believe that this chunk is OK to add to the packet */ + switch (chunk->chunk_hdr->type) { + case SCTP_CID_DATA: + /* Account for the data being in the packet */ + sctp_packet_append_data(packet, chunk); + /* Disallow SACK bundling after DATA. */ + packet->has_sack = 1; + /* Disallow AUTH bundling after DATA */ + packet->has_auth = 1; + /* Let it be knows that packet has DATA in it */ + packet->has_data = 1; + /* timestamp the chunk for rtx purposes */ + chunk->sent_at = jiffies; + break; + case SCTP_CID_COOKIE_ECHO: + packet->has_cookie_echo = 1; + break; + + case SCTP_CID_SACK: + packet->has_sack = 1; + break; + + case SCTP_CID_AUTH: + packet->has_auth = 1; + packet->auth = chunk; + break; + } + + /* It is OK to send this chunk. */ + list_add_tail(&chunk->list, &packet->chunk_list); + packet->size += chunk_len; + chunk->transport = packet->transport; +finish: + return retval; +} + +/* All packets are sent to the network through this function from + * sctp_outq_tail(). + * + * The return value is a normal kernel error return value. + */ +int sctp_packet_transmit(struct sctp_packet *packet) +{ + struct sctp_transport *tp = packet->transport; + struct sctp_association *asoc = tp->asoc; + struct sctphdr *sh; + struct sk_buff *nskb; + struct sctp_chunk *chunk, *tmp; + struct sock *sk; + int err = 0; + int padding; /* How much padding do we need? */ + __u8 has_data = 0; + struct dst_entry *dst = tp->dst; + unsigned char *auth = NULL; /* pointer to auth in skb data */ + __u32 cksum_buf_len = sizeof(struct sctphdr); + + SCTP_DEBUG_PRINTK("%s: packet:%p\n", __func__, packet); + + /* Do NOT generate a chunkless packet. */ + if (list_empty(&packet->chunk_list)) + return err; + + /* Set up convenience variables... */ + chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list); + sk = chunk->skb->sk; + + /* Allocate the new skb. */ + nskb = alloc_skb(packet->size + LL_MAX_HEADER, GFP_ATOMIC); + if (!nskb) + goto nomem; + + /* Make sure the outbound skb has enough header room reserved. */ + skb_reserve(nskb, packet->overhead + LL_MAX_HEADER); + + /* Set the owning socket so that we know where to get the + * destination IP address. + */ + skb_set_owner_w(nskb, sk); + + if (!sctp_transport_dst_check(tp)) { + sctp_transport_route(tp, NULL, sctp_sk(sk)); + if (asoc && (asoc->param_flags & SPP_PMTUD_ENABLE)) { + sctp_assoc_sync_pmtu(asoc); + } + } + dst = dst_clone(tp->dst); + skb_dst_set(nskb, dst); + if (!dst) + goto no_route; + + /* Build the SCTP header. 
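+ * (12 bytes: source port, destination port, verification tag and
+ * checksum)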
*/ + sh = (struct sctphdr *)skb_push(nskb, sizeof(struct sctphdr)); + skb_reset_transport_header(nskb); + sh->source = htons(packet->source_port); + sh->dest = htons(packet->destination_port); + + /* From 6.8 Adler-32 Checksum Calculation: + * After the packet is constructed (containing the SCTP common + * header and one or more control or DATA chunks), the + * transmitter shall: + * + * 1) Fill in the proper Verification Tag in the SCTP common + * header and initialize the checksum field to 0's. + */ + sh->vtag = htonl(packet->vtag); + sh->checksum = 0; + + /** + * 6.10 Bundling + * + * An endpoint bundles chunks by simply including multiple + * chunks in one outbound SCTP packet. ... + */ + + /** + * 3.2 Chunk Field Descriptions + * + * The total length of a chunk (including Type, Length and + * Value fields) MUST be a multiple of 4 bytes. If the length + * of the chunk is not a multiple of 4 bytes, the sender MUST + * pad the chunk with all zero bytes and this padding is not + * included in the chunk length field. The sender should + * never pad with more than 3 bytes. + * + * [This whole comment explains WORD_ROUND() below.] + */ + SCTP_DEBUG_PRINTK("***sctp_transmit_packet***\n"); + list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { + list_del_init(&chunk->list); + if (sctp_chunk_is_data(chunk)) { + /* 6.3.1 C4) When data is in flight and when allowed + * by rule C5, a new RTT measurement MUST be made each + * round trip. Furthermore, new RTT measurements + * SHOULD be made no more than once per round-trip + * for a given destination transport address. + */ + + if (!tp->rto_pending) { + chunk->rtt_in_progress = 1; + tp->rto_pending = 1; + } + has_data = 1; + } + + padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len; + if (padding) + memset(skb_put(chunk->skb, padding), 0, padding); + + /* if this is the auth chunk that we are adding, + * store pointer where it will be added and put + * the auth into the packet. + */ + if (chunk == packet->auth) + auth = skb_tail_pointer(nskb); + + cksum_buf_len += chunk->skb->len; + memcpy(skb_put(nskb, chunk->skb->len), + chunk->skb->data, chunk->skb->len); + + SCTP_DEBUG_PRINTK("%s %p[%s] %s 0x%x, %s %d, %s %d, %s %d\n", + "*** Chunk", chunk, + sctp_cname(SCTP_ST_CHUNK( + chunk->chunk_hdr->type)), + chunk->has_tsn ? "TSN" : "No TSN", + chunk->has_tsn ? + ntohl(chunk->subh.data_hdr->tsn) : 0, + "length", ntohs(chunk->chunk_hdr->length), + "chunk->skb->len", chunk->skb->len, + "rtt_in_progress", chunk->rtt_in_progress); + + /* + * If this is a control chunk, this is our last + * reference. Free data chunks after they've been + * acknowledged or have failed. + */ + if (!sctp_chunk_is_data(chunk)) + sctp_chunk_free(chunk); + } + + /* SCTP-AUTH, Section 6.2 + * The sender MUST calculate the MAC as described in RFC2104 [2] + * using the hash function H as described by the MAC Identifier and + * the shared association key K based on the endpoint pair shared key + * described by the shared key identifier. The 'data' used for the + * computation of the AUTH-chunk is given by the AUTH chunk with its + * HMAC field set to zero (as shown in Figure 6) followed by all + * chunks that are placed after the AUTH chunk in the SCTP packet. + */ + if (auth) + sctp_auth_calculate_hmac(asoc, nskb, + (struct sctp_auth_chunk *)auth, + GFP_ATOMIC); + + /* 2) Calculate the Adler-32 checksum of the whole packet, + * including the SCTP common header and all the + * chunks. 
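+ * (Below, the software path does this with sctp_start_cksum() and
+ * sctp_end_cksum(); when the device advertises NETIF_F_SCTP_CSUM the
+ * checksum is instead left to hardware offload.)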
+ * + * Note: Adler-32 is no longer applicable, as has been replaced + * by CRC32-C as described in . + */ + if (!sctp_checksum_disable) { + if (!(dst->dev->features & NETIF_F_SCTP_CSUM)) { + __u32 crc32 = sctp_start_cksum((__u8 *)sh, cksum_buf_len); + + /* 3) Put the resultant value into the checksum field in the + * common header, and leave the rest of the bits unchanged. + */ + sh->checksum = sctp_end_cksum(crc32); + } else { + /* no need to seed pseudo checksum for SCTP */ + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (skb_transport_header(nskb) - + nskb->head); + nskb->csum_offset = offsetof(struct sctphdr, checksum); + } + } + + /* IP layer ECN support + * From RFC 2481 + * "The ECN-Capable Transport (ECT) bit would be set by the + * data sender to indicate that the end-points of the + * transport protocol are ECN-capable." + * + * Now setting the ECT bit all the time, as it should not cause + * any problems protocol-wise even if our peer ignores it. + * + * Note: The works for IPv6 layer checks this bit too later + * in transmission. See IP6_ECN_flow_xmit(). + */ + (*tp->af_specific->ecn_capable)(nskb->sk); + + /* Set up the IP options. */ + /* BUG: not implemented + * For v4 this all lives somewhere in sk->sk_opt... + */ + + /* Dump that on IP! */ + if (asoc && asoc->peer.last_sent_to != tp) { + /* Considering the multiple CPU scenario, this is a + * "correcter" place for last_sent_to. --xguo + */ + asoc->peer.last_sent_to = tp; + } + + if (has_data) { + struct timer_list *timer; + unsigned long timeout; + + /* Restart the AUTOCLOSE timer when sending data. */ + if (sctp_state(asoc, ESTABLISHED) && asoc->autoclose) { + timer = &asoc->timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE]; + timeout = asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]; + + if (!mod_timer(timer, jiffies + timeout)) + sctp_association_hold(asoc); + } + } + + SCTP_DEBUG_PRINTK("***sctp_transmit_packet*** skb len %d\n", + nskb->len); + + nskb->local_df = packet->ipfragok; + (*tp->af_specific->sctp_xmit)(nskb, tp); + +out: + sctp_packet_reset(packet); + return err; +no_route: + kfree_skb(nskb); + IP_INC_STATS_BH(&init_net, IPSTATS_MIB_OUTNOROUTES); + + /* FIXME: Returning the 'err' will effect all the associations + * associated with a socket, although only one of the paths of the + * association is unreachable. + * The real failure of a transport or association can be passed on + * to the user via notifications. So setting this error may not be + * required. + */ + /* err = -EHOSTUNREACH; */ +err: + /* Control chunks are unreliable so just drop them. DATA chunks + * will get resent or dropped later. 
+ */ + + list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { + list_del_init(&chunk->list); + if (!sctp_chunk_is_data(chunk)) + sctp_chunk_free(chunk); + } + goto out; +nomem: + err = -ENOMEM; + goto err; +} + +/******************************************************************** + * 2nd Level Abstractions + ********************************************************************/ + +/* This private function check to see if a chunk can be added */ +static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet, + struct sctp_chunk *chunk) +{ + sctp_xmit_t retval = SCTP_XMIT_OK; + size_t datasize, rwnd, inflight, flight_size; + struct sctp_transport *transport = packet->transport; + struct sctp_association *asoc = transport->asoc; + struct sctp_outq *q = &asoc->outqueue; + + /* RFC 2960 6.1 Transmission of DATA Chunks + * + * A) At any given time, the data sender MUST NOT transmit new data to + * any destination transport address if its peer's rwnd indicates + * that the peer has no buffer space (i.e. rwnd is 0, see Section + * 6.2.1). However, regardless of the value of rwnd (including if it + * is 0), the data sender can always have one DATA chunk in flight to + * the receiver if allowed by cwnd (see rule B below). This rule + * allows the sender to probe for a change in rwnd that the sender + * missed due to the SACK having been lost in transit from the data + * receiver to the data sender. + */ + + rwnd = asoc->peer.rwnd; + inflight = q->outstanding_bytes; + flight_size = transport->flight_size; + + datasize = sctp_data_size(chunk); + + if (datasize > rwnd) { + if (inflight > 0) { + /* We have (at least) one data chunk in flight, + * so we can't fall back to rule 6.1 B). + */ + retval = SCTP_XMIT_RWND_FULL; + goto finish; + } + } + + /* RFC 2960 6.1 Transmission of DATA Chunks + * + * B) At any given time, the sender MUST NOT transmit new data + * to a given transport address if it has cwnd or more bytes + * of data outstanding to that transport address. + */ + /* RFC 7.2.4 & the Implementers Guide 2.8. + * + * 3) ... + * When a Fast Retransmit is being performed the sender SHOULD + * ignore the value of cwnd and SHOULD NOT delay retransmission. + */ + if (chunk->fast_retransmit != SCTP_NEED_FRTX) + if (flight_size >= transport->cwnd) { + retval = SCTP_XMIT_RWND_FULL; + goto finish; + } + + /* Nagle's algorithm to solve small-packet problem: + * Inhibit the sending of new chunks when new outgoing data arrives + * if any previously transmitted data on the connection remains + * unacknowledged. + */ + if (!sctp_sk(asoc->base.sk)->nodelay && sctp_packet_empty(packet) && + inflight && sctp_state(asoc, ESTABLISHED)) { + unsigned max = transport->pathmtu - packet->overhead; + unsigned len = chunk->skb->len + q->out_qlen; + + /* Check whether this chunk and all the rest of pending + * data will fit or delay in hopes of bundling a full + * sized packet. + * Don't delay large message writes that may have been + * fragmeneted into small peices. 
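+ * For example, with a 1500-byte path MTU and 32 bytes of IPv4 plus
+ * SCTP header overhead, max is 1468: a lone 100-byte chunk whose
+ * message allows delay gets SCTP_XMIT_NAGLE_DELAY here, while chunks
+ * of a message that had to be fragmented (can_delay cleared) are not
+ * held back by this check.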
+ */ + if ((len < max) && chunk->msg->can_delay) { + retval = SCTP_XMIT_NAGLE_DELAY; + goto finish; + } + } + +finish: + return retval; +} + +/* This private function does management things when adding DATA chunk */ +static void sctp_packet_append_data(struct sctp_packet *packet, + struct sctp_chunk *chunk) +{ + struct sctp_transport *transport = packet->transport; + size_t datasize = sctp_data_size(chunk); + struct sctp_association *asoc = transport->asoc; + u32 rwnd = asoc->peer.rwnd; + + /* Keep track of how many bytes are in flight over this transport. */ + transport->flight_size += datasize; + + /* Keep track of how many bytes are in flight to the receiver. */ + asoc->outqueue.outstanding_bytes += datasize; + + /* Update our view of the receiver's rwnd. */ + if (datasize < rwnd) + rwnd -= datasize; + else + rwnd = 0; + + asoc->peer.rwnd = rwnd; + /* Has been accepted for transmission. */ + if (!asoc->peer.prsctp_capable) + chunk->msg->can_abandon = 0; + sctp_chunk_assign_tsn(chunk); + sctp_chunk_assign_ssn(chunk); +} + +static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet, + struct sctp_chunk *chunk, + u16 chunk_len) +{ + size_t psize; + size_t pmtu; + int too_big; + sctp_xmit_t retval = SCTP_XMIT_OK; + + psize = packet->size; + pmtu = ((packet->transport->asoc) ? + (packet->transport->asoc->pathmtu) : + (packet->transport->pathmtu)); + + too_big = (psize + chunk_len > pmtu); + + /* Decide if we need to fragment or resubmit later. */ + if (too_big) { + /* It's OK to fragmet at IP level if any one of the following + * is true: + * 1. The packet is empty (meaning this chunk is greater + * the MTU) + * 2. The chunk we are adding is a control chunk + * 3. The packet doesn't have any data in it yet and data + * requires authentication. + */ + if (sctp_packet_empty(packet) || !sctp_chunk_is_data(chunk) || + (!packet->has_data && chunk->auth)) { + /* We no longer do re-fragmentation. + * Just fragment at the IP layer, if we + * actually hit this condition + */ + packet->ipfragok = 1; + } else { + retval = SCTP_XMIT_PMTU_FULL; + } + } + + return retval; +} diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c new file mode 100644 index 00000000..1f2938fb --- /dev/null +++ b/net/sctp/outqueue.c @@ -0,0 +1,1900 @@ +/* SCTP kernel implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001-2003 Intel Corp. + * + * This file is part of the SCTP kernel implementation + * + * These functions implement the sctp_outq class. The outqueue handles + * bundling and queueing of outgoing SCTP chunks. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. 
+ * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Perry Melange + * Xingang Guo + * Hui Huang + * Sridhar Samudrala + * Jon Grimm + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include /* For struct list_head */ +#include +#include +#include +#include /* For skb_set_owner_w */ + +#include +#include + +/* Declare internal functions here. */ +static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn); +static void sctp_check_transmitted(struct sctp_outq *q, + struct list_head *transmitted_queue, + struct sctp_transport *transport, + struct sctp_sackhdr *sack, + __u32 *highest_new_tsn); + +static void sctp_mark_missing(struct sctp_outq *q, + struct list_head *transmitted_queue, + struct sctp_transport *transport, + __u32 highest_new_tsn, + int count_of_newacks); + +static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn); + +static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout); + +/* Add data to the front of the queue. */ +static inline void sctp_outq_head_data(struct sctp_outq *q, + struct sctp_chunk *ch) +{ + list_add(&ch->list, &q->out_chunk_list); + q->out_qlen += ch->skb->len; +} + +/* Take data from the front of the queue. */ +static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q) +{ + struct sctp_chunk *ch = NULL; + + if (!list_empty(&q->out_chunk_list)) { + struct list_head *entry = q->out_chunk_list.next; + + ch = list_entry(entry, struct sctp_chunk, list); + list_del_init(entry); + q->out_qlen -= ch->skb->len; + } + return ch; +} +/* Add data chunk to the end of the queue. */ +static inline void sctp_outq_tail_data(struct sctp_outq *q, + struct sctp_chunk *ch) +{ + list_add_tail(&ch->list, &q->out_chunk_list); + q->out_qlen += ch->skb->len; +} + +/* + * SFR-CACC algorithm: + * D) If count_of_newacks is greater than or equal to 2 + * and t was not sent to the current primary then the + * sender MUST NOT increment missing report count for t. + */ +static inline int sctp_cacc_skip_3_1_d(struct sctp_transport *primary, + struct sctp_transport *transport, + int count_of_newacks) +{ + if (count_of_newacks >=2 && transport != primary) + return 1; + return 0; +} + +/* + * SFR-CACC algorithm: + * F) If count_of_newacks is less than 2, let d be the + * destination to which t was sent. If cacc_saw_newack + * is 0 for destination d, then the sender MUST NOT + * increment missing report count for t. + */ +static inline int sctp_cacc_skip_3_1_f(struct sctp_transport *transport, + int count_of_newacks) +{ + if (count_of_newacks < 2 && + (transport && !transport->cacc.cacc_saw_newack)) + return 1; + return 0; +} + +/* + * SFR-CACC algorithm: + * 3.1) If CYCLING_CHANGEOVER is 0, the sender SHOULD + * execute steps C, D, F. 
+ * + * C has been implemented in sctp_outq_sack + */ +static inline int sctp_cacc_skip_3_1(struct sctp_transport *primary, + struct sctp_transport *transport, + int count_of_newacks) +{ + if (!primary->cacc.cycling_changeover) { + if (sctp_cacc_skip_3_1_d(primary, transport, count_of_newacks)) + return 1; + if (sctp_cacc_skip_3_1_f(transport, count_of_newacks)) + return 1; + return 0; + } + return 0; +} + +/* + * SFR-CACC algorithm: + * 3.2) Else if CYCLING_CHANGEOVER is 1, and t is less + * than next_tsn_at_change of the current primary, then + * the sender MUST NOT increment missing report count + * for t. + */ +static inline int sctp_cacc_skip_3_2(struct sctp_transport *primary, __u32 tsn) +{ + if (primary->cacc.cycling_changeover && + TSN_lt(tsn, primary->cacc.next_tsn_at_change)) + return 1; + return 0; +} + +/* + * SFR-CACC algorithm: + * 3) If the missing report count for TSN t is to be + * incremented according to [RFC2960] and + * [SCTP_STEWART-2002], and CHANGEOVER_ACTIVE is set, + * then the sender MUST further execute steps 3.1 and + * 3.2 to determine if the missing report count for + * TSN t SHOULD NOT be incremented. + * + * 3.3) If 3.1 and 3.2 do not dictate that the missing + * report count for t should not be incremented, then + * the sender SHOULD increment missing report count for + * t (according to [RFC2960] and [SCTP_STEWART_2002]). + */ +static inline int sctp_cacc_skip(struct sctp_transport *primary, + struct sctp_transport *transport, + int count_of_newacks, + __u32 tsn) +{ + if (primary->cacc.changeover_active && + (sctp_cacc_skip_3_1(primary, transport, count_of_newacks) || + sctp_cacc_skip_3_2(primary, tsn))) + return 1; + return 0; +} + +/* Initialize an existing sctp_outq. This does the boring stuff. + * You still need to define handlers if you really want to DO + * something with this structure... + */ +void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) +{ + q->asoc = asoc; + INIT_LIST_HEAD(&q->out_chunk_list); + INIT_LIST_HEAD(&q->control_chunk_list); + INIT_LIST_HEAD(&q->retransmit); + INIT_LIST_HEAD(&q->sacked); + INIT_LIST_HEAD(&q->abandoned); + + q->fast_rtx = 0; + q->outstanding_bytes = 0; + q->empty = 1; + q->cork = 0; + + q->malloced = 0; + q->out_qlen = 0; +} + +/* Free the outqueue structure and any related pending chunks. + */ +void sctp_outq_teardown(struct sctp_outq *q) +{ + struct sctp_transport *transport; + struct list_head *lchunk, *temp; + struct sctp_chunk *chunk, *tmp; + + /* Throw away unacknowledged chunks. */ + list_for_each_entry(transport, &q->asoc->peer.transport_addr_list, + transports) { + while ((lchunk = sctp_list_dequeue(&transport->transmitted)) != NULL) { + chunk = list_entry(lchunk, struct sctp_chunk, + transmitted_list); + /* Mark as part of a failed message. */ + sctp_chunk_fail(chunk, q->error); + sctp_chunk_free(chunk); + } + } + + /* Throw away chunks that have been gap ACKed. */ + list_for_each_safe(lchunk, temp, &q->sacked) { + list_del_init(lchunk); + chunk = list_entry(lchunk, struct sctp_chunk, + transmitted_list); + sctp_chunk_fail(chunk, q->error); + sctp_chunk_free(chunk); + } + + /* Throw away any chunks in the retransmit queue. */ + list_for_each_safe(lchunk, temp, &q->retransmit) { + list_del_init(lchunk); + chunk = list_entry(lchunk, struct sctp_chunk, + transmitted_list); + sctp_chunk_fail(chunk, q->error); + sctp_chunk_free(chunk); + } + + /* Throw away any chunks that are in the abandoned queue. 
*/ + list_for_each_safe(lchunk, temp, &q->abandoned) { + list_del_init(lchunk); + chunk = list_entry(lchunk, struct sctp_chunk, + transmitted_list); + sctp_chunk_fail(chunk, q->error); + sctp_chunk_free(chunk); + } + + /* Throw away any leftover data chunks. */ + while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { + + /* Mark as send failure. */ + sctp_chunk_fail(chunk, q->error); + sctp_chunk_free(chunk); + } + + q->error = 0; + + /* Throw away any leftover control chunks. */ + list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) { + list_del_init(&chunk->list); + sctp_chunk_free(chunk); + } +} + +/* Free the outqueue structure and any related pending chunks. */ +void sctp_outq_free(struct sctp_outq *q) +{ + /* Throw away leftover chunks. */ + sctp_outq_teardown(q); + + /* If we were kmalloc()'d, free the memory. */ + if (q->malloced) + kfree(q); +} + +/* Put a new chunk in an sctp_outq. */ +int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk) +{ + int error = 0; + + SCTP_DEBUG_PRINTK("sctp_outq_tail(%p, %p[%s])\n", + q, chunk, chunk && chunk->chunk_hdr ? + sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) + : "Illegal Chunk"); + + /* If it is data, queue it up, otherwise, send it + * immediately. + */ + if (sctp_chunk_is_data(chunk)) { + /* Is it OK to queue data chunks? */ + /* From 9. Termination of Association + * + * When either endpoint performs a shutdown, the + * association on each peer will stop accepting new + * data from its user and only deliver data in queue + * at the time of sending or receiving the SHUTDOWN + * chunk. + */ + switch (q->asoc->state) { + case SCTP_STATE_CLOSED: + case SCTP_STATE_SHUTDOWN_PENDING: + case SCTP_STATE_SHUTDOWN_SENT: + case SCTP_STATE_SHUTDOWN_RECEIVED: + case SCTP_STATE_SHUTDOWN_ACK_SENT: + /* Cannot send after transport endpoint shutdown */ + error = -ESHUTDOWN; + break; + + default: + SCTP_DEBUG_PRINTK("outqueueing (%p, %p[%s])\n", + q, chunk, chunk && chunk->chunk_hdr ? + sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) + : "Illegal Chunk"); + + sctp_outq_tail_data(q, chunk); + if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) + SCTP_INC_STATS(SCTP_MIB_OUTUNORDERCHUNKS); + else + SCTP_INC_STATS(SCTP_MIB_OUTORDERCHUNKS); + q->empty = 0; + break; + } + } else { + list_add_tail(&chunk->list, &q->control_chunk_list); + SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); + } + + if (error < 0) + return error; + + if (!q->cork) + error = sctp_outq_flush(q, 0); + + return error; +} + +/* Insert a chunk into the sorted list based on the TSNs. The retransmit list + * and the abandoned list are in ascending order. + */ +static void sctp_insert_list(struct list_head *head, struct list_head *new) +{ + struct list_head *pos; + struct sctp_chunk *nchunk, *lchunk; + __u32 ntsn, ltsn; + int done = 0; + + nchunk = list_entry(new, struct sctp_chunk, transmitted_list); + ntsn = ntohl(nchunk->subh.data_hdr->tsn); + + list_for_each(pos, head) { + lchunk = list_entry(pos, struct sctp_chunk, transmitted_list); + ltsn = ntohl(lchunk->subh.data_hdr->tsn); + if (TSN_lt(ntsn, ltsn)) { + list_add(new, pos->prev); + done = 1; + break; + } + } + if (!done) + list_add_tail(new, head); +} + +/* Mark all the eligible packets on a transport for retransmission. */ +void sctp_retransmit_mark(struct sctp_outq *q, + struct sctp_transport *transport, + __u8 reason) +{ + struct list_head *lchunk, *ltemp; + struct sctp_chunk *chunk; + + /* Walk through the specified transmitted queue. 
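+ * Abandoned chunks are moved to the abandoned list, chunks that are
+ * eligible for this retransmission reason are moved (kept in TSN
+ * order) to the retransmit queue, and everything else stays on the
+ * transmitted queue.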
*/ + list_for_each_safe(lchunk, ltemp, &transport->transmitted) { + chunk = list_entry(lchunk, struct sctp_chunk, + transmitted_list); + + /* If the chunk is abandoned, move it to abandoned list. */ + if (sctp_chunk_abandoned(chunk)) { + list_del_init(lchunk); + sctp_insert_list(&q->abandoned, lchunk); + + /* If this chunk has not been previousely acked, + * stop considering it 'outstanding'. Our peer + * will most likely never see it since it will + * not be retransmitted + */ + if (!chunk->tsn_gap_acked) { + if (chunk->transport) + chunk->transport->flight_size -= + sctp_data_size(chunk); + q->outstanding_bytes -= sctp_data_size(chunk); + q->asoc->peer.rwnd += sctp_data_size(chunk); + } + continue; + } + + /* If we are doing retransmission due to a timeout or pmtu + * discovery, only the chunks that are not yet acked should + * be added to the retransmit queue. + */ + if ((reason == SCTP_RTXR_FAST_RTX && + (chunk->fast_retransmit == SCTP_NEED_FRTX)) || + (reason != SCTP_RTXR_FAST_RTX && !chunk->tsn_gap_acked)) { + /* RFC 2960 6.2.1 Processing a Received SACK + * + * C) Any time a DATA chunk is marked for + * retransmission (via either T3-rtx timer expiration + * (Section 6.3.3) or via fast retransmit + * (Section 7.2.4)), add the data size of those + * chunks to the rwnd. + */ + q->asoc->peer.rwnd += sctp_data_size(chunk); + q->outstanding_bytes -= sctp_data_size(chunk); + if (chunk->transport) + transport->flight_size -= sctp_data_size(chunk); + + /* sctpimpguide-05 Section 2.8.2 + * M5) If a T3-rtx timer expires, the + * 'TSN.Missing.Report' of all affected TSNs is set + * to 0. + */ + chunk->tsn_missing_report = 0; + + /* If a chunk that is being used for RTT measurement + * has to be retransmitted, we cannot use this chunk + * anymore for RTT measurements. Reset rto_pending so + * that a new RTT measurement is started when a new + * data chunk is sent. + */ + if (chunk->rtt_in_progress) { + chunk->rtt_in_progress = 0; + transport->rto_pending = 0; + } + + /* Move the chunk to the retransmit queue. The chunks + * on the retransmit queue are always kept in order. + */ + list_del_init(lchunk); + sctp_insert_list(&q->retransmit, lchunk); + } + } + + SCTP_DEBUG_PRINTK("%s: transport: %p, reason: %d, " + "cwnd: %d, ssthresh: %d, flight_size: %d, " + "pba: %d\n", __func__, + transport, reason, + transport->cwnd, transport->ssthresh, + transport->flight_size, + transport->partial_bytes_acked); + +} + +/* Mark all the eligible packets on a transport for retransmission and force + * one packet out. + */ +void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, + sctp_retransmit_reason_t reason) +{ + int error = 0; + + switch(reason) { + case SCTP_RTXR_T3_RTX: + SCTP_INC_STATS(SCTP_MIB_T3_RETRANSMITS); + sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_T3_RTX); + /* Update the retran path if the T3-rtx timer has expired for + * the current retran path. 
+ */ + if (transport == transport->asoc->peer.retran_path) + sctp_assoc_update_retran_path(transport->asoc); + transport->asoc->rtx_data_chunks += + transport->asoc->unack_data; + break; + case SCTP_RTXR_FAST_RTX: + SCTP_INC_STATS(SCTP_MIB_FAST_RETRANSMITS); + sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_FAST_RTX); + q->fast_rtx = 1; + break; + case SCTP_RTXR_PMTUD: + SCTP_INC_STATS(SCTP_MIB_PMTUD_RETRANSMITS); + break; + case SCTP_RTXR_T1_RTX: + SCTP_INC_STATS(SCTP_MIB_T1_RETRANSMITS); + transport->asoc->init_retries++; + break; + default: + BUG(); + } + + sctp_retransmit_mark(q, transport, reason); + + /* PR-SCTP A5) Any time the T3-rtx timer expires, on any destination, + * the sender SHOULD try to advance the "Advanced.Peer.Ack.Point" by + * following the procedures outlined in C1 - C5. + */ + if (reason == SCTP_RTXR_T3_RTX) + sctp_generate_fwdtsn(q, q->asoc->ctsn_ack_point); + + /* Flush the queues only on timeout, since fast_rtx is only + * triggered during sack processing and the queue + * will be flushed at the end. + */ + if (reason != SCTP_RTXR_FAST_RTX) + error = sctp_outq_flush(q, /* rtx_timeout */ 1); + + if (error) + q->asoc->base.sk->sk_err = -error; +} + +/* + * Transmit DATA chunks on the retransmit queue. Upon return from + * sctp_outq_flush_rtx() the packet 'pkt' may contain chunks which + * need to be transmitted by the caller. + * We assume that pkt->transport has already been set. + * + * The return value is a normal kernel error return value. + */ +static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, + int rtx_timeout, int *start_timer) +{ + struct list_head *lqueue; + struct sctp_transport *transport = pkt->transport; + sctp_xmit_t status; + struct sctp_chunk *chunk, *chunk1; + int fast_rtx; + int error = 0; + int timer = 0; + int done = 0; + + lqueue = &q->retransmit; + fast_rtx = q->fast_rtx; + + /* This loop handles time-out retransmissions, fast retransmissions, + * and retransmissions due to opening of whindow. + * + * RFC 2960 6.3.3 Handle T3-rtx Expiration + * + * E3) Determine how many of the earliest (i.e., lowest TSN) + * outstanding DATA chunks for the address for which the + * T3-rtx has expired will fit into a single packet, subject + * to the MTU constraint for the path corresponding to the + * destination transport address to which the retransmission + * is being sent (this may be different from the address for + * which the timer expires [see Section 6.4]). Call this value + * K. Bundle and retransmit those K DATA chunks in a single + * packet to the destination endpoint. + * + * [Just to be painfully clear, if we are retransmitting + * because a timeout just happened, we should send only ONE + * packet of retransmitted data.] + * + * For fast retransmissions we also send only ONE packet. However, + * if we are just flushing the queue due to open window, we'll + * try to send as much as possible. + */ + list_for_each_entry_safe(chunk, chunk1, lqueue, transmitted_list) { + /* If the chunk is abandoned, move it to abandoned list. */ + if (sctp_chunk_abandoned(chunk)) { + list_del_init(&chunk->transmitted_list); + sctp_insert_list(&q->abandoned, + &chunk->transmitted_list); + continue; + } + + /* Make sure that Gap Acked TSNs are not retransmitted. A + * simple approach is just to move such TSNs out of the + * way and into a 'transmitted' queue and skip to the + * next chunk. 
+ */ + if (chunk->tsn_gap_acked) { + list_del(&chunk->transmitted_list); + list_add_tail(&chunk->transmitted_list, + &transport->transmitted); + continue; + } + + /* If we are doing fast retransmit, ignore non-fast_rtransmit + * chunks + */ + if (fast_rtx && !chunk->fast_retransmit) + continue; + +redo: + /* Attempt to append this chunk to the packet. */ + status = sctp_packet_append_chunk(pkt, chunk); + + switch (status) { + case SCTP_XMIT_PMTU_FULL: + if (!pkt->has_data && !pkt->has_cookie_echo) { + /* If this packet did not contain DATA then + * retransmission did not happen, so do it + * again. We'll ignore the error here since + * control chunks are already freed so there + * is nothing we can do. + */ + sctp_packet_transmit(pkt); + goto redo; + } + + /* Send this packet. */ + error = sctp_packet_transmit(pkt); + + /* If we are retransmitting, we should only + * send a single packet. + * Otherwise, try appending this chunk again. + */ + if (rtx_timeout || fast_rtx) + done = 1; + else + goto redo; + + /* Bundle next chunk in the next round. */ + break; + + case SCTP_XMIT_RWND_FULL: + /* Send this packet. */ + error = sctp_packet_transmit(pkt); + + /* Stop sending DATA as there is no more room + * at the receiver. + */ + done = 1; + break; + + case SCTP_XMIT_NAGLE_DELAY: + /* Send this packet. */ + error = sctp_packet_transmit(pkt); + + /* Stop sending DATA because of nagle delay. */ + done = 1; + break; + + default: + /* The append was successful, so add this chunk to + * the transmitted list. + */ + list_del(&chunk->transmitted_list); + list_add_tail(&chunk->transmitted_list, + &transport->transmitted); + + /* Mark the chunk as ineligible for fast retransmit + * after it is retransmitted. + */ + if (chunk->fast_retransmit == SCTP_NEED_FRTX) + chunk->fast_retransmit = SCTP_DONT_FRTX; + + q->empty = 0; + break; + } + + /* Set the timer if there were no errors */ + if (!error && !timer) + timer = 1; + + if (done) + break; + } + + /* If we are here due to a retransmit timeout or a fast + * retransmit and if there are any chunks left in the retransmit + * queue that could not fit in the PMTU sized packet, they need + * to be marked as ineligible for a subsequent fast retransmit. + */ + if (rtx_timeout || fast_rtx) { + list_for_each_entry(chunk1, lqueue, transmitted_list) { + if (chunk1->fast_retransmit == SCTP_NEED_FRTX) + chunk1->fast_retransmit = SCTP_DONT_FRTX; + } + } + + *start_timer = timer; + + /* Clear fast retransmit hint */ + if (fast_rtx) + q->fast_rtx = 0; + + return error; +} + +/* Cork the outqueue so queued chunks are really queued. */ +int sctp_outq_uncork(struct sctp_outq *q) +{ + int error = 0; + if (q->cork) + q->cork = 0; + error = sctp_outq_flush(q, 0); + return error; +} + + +/* + * Try to flush an outqueue. + * + * Description: Send everything in q which we legally can, subject to + * congestion limitations. + * * Note: This function can be called from multiple contexts so appropriate + * locking concerns must be made. Today we use the sock lock to protect + * this function. 
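+ *
+ * The flush order below is: control chunks first, then anything on
+ * the retransmit queue, and only then new DATA from the output queue;
+ * packets are built per transport and any that are still non-empty
+ * are transmitted at the end of the function.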
+ */ +static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) +{ + struct sctp_packet *packet; + struct sctp_packet singleton; + struct sctp_association *asoc = q->asoc; + __u16 sport = asoc->base.bind_addr.port; + __u16 dport = asoc->peer.port; + __u32 vtag = asoc->peer.i.init_tag; + struct sctp_transport *transport = NULL; + struct sctp_transport *new_transport; + struct sctp_chunk *chunk, *tmp; + sctp_xmit_t status; + int error = 0; + int start_timer = 0; + int one_packet = 0; + + /* These transports have chunks to send. */ + struct list_head transport_list; + struct list_head *ltransport; + + INIT_LIST_HEAD(&transport_list); + packet = NULL; + + /* + * 6.10 Bundling + * ... + * When bundling control chunks with DATA chunks, an + * endpoint MUST place control chunks first in the outbound + * SCTP packet. The transmitter MUST transmit DATA chunks + * within a SCTP packet in increasing order of TSN. + * ... + */ + + list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) { + list_del_init(&chunk->list); + + /* Pick the right transport to use. */ + new_transport = chunk->transport; + + if (!new_transport) { + /* + * If we have a prior transport pointer, see if + * the destination address of the chunk + * matches the destination address of the + * current transport. If not a match, then + * try to look up the transport with a given + * destination address. We do this because + * after processing ASCONFs, we may have new + * transports created. + */ + if (transport && + sctp_cmp_addr_exact(&chunk->dest, + &transport->ipaddr)) + new_transport = transport; + else + new_transport = sctp_assoc_lookup_paddr(asoc, + &chunk->dest); + + /* if we still don't have a new transport, then + * use the current active path. + */ + if (!new_transport) + new_transport = asoc->peer.active_path; + } else if ((new_transport->state == SCTP_INACTIVE) || + (new_transport->state == SCTP_UNCONFIRMED)) { + /* If the chunk is Heartbeat or Heartbeat Ack, + * send it to chunk->transport, even if it's + * inactive. + * + * 3.3.6 Heartbeat Acknowledgement: + * ... + * A HEARTBEAT ACK is always sent to the source IP + * address of the IP datagram containing the + * HEARTBEAT chunk to which this ack is responding. + * ... + * + * ASCONF_ACKs also must be sent to the source. + */ + if (chunk->chunk_hdr->type != SCTP_CID_HEARTBEAT && + chunk->chunk_hdr->type != SCTP_CID_HEARTBEAT_ACK && + chunk->chunk_hdr->type != SCTP_CID_ASCONF_ACK) + new_transport = asoc->peer.active_path; + } + + /* Are we switching transports? + * Take care of transport locks. + */ + if (new_transport != transport) { + transport = new_transport; + if (list_empty(&transport->send_ready)) { + list_add_tail(&transport->send_ready, + &transport_list); + } + packet = &transport->packet; + sctp_packet_config(packet, vtag, + asoc->peer.ecn_capable); + } + + switch (chunk->chunk_hdr->type) { + /* + * 6.10 Bundling + * ... + * An endpoint MUST NOT bundle INIT, INIT ACK or SHUTDOWN + * COMPLETE with any other chunks. [Send them immediately.] + */ + case SCTP_CID_INIT: + case SCTP_CID_INIT_ACK: + case SCTP_CID_SHUTDOWN_COMPLETE: + sctp_packet_init(&singleton, transport, sport, dport); + sctp_packet_config(&singleton, vtag, 0); + sctp_packet_append_chunk(&singleton, chunk); + error = sctp_packet_transmit(&singleton); + if (error < 0) + return error; + break; + + case SCTP_CID_ABORT: + if (sctp_test_T_bit(chunk)) { + packet->vtag = asoc->c.my_vtag; + } + /* The following chunks are "response" chunks, i.e. 
+ * they are generated in response to something we + * received. If we are sending these, then we can + * send only 1 packet containing these chunks. + */ + case SCTP_CID_HEARTBEAT_ACK: + case SCTP_CID_SHUTDOWN_ACK: + case SCTP_CID_COOKIE_ACK: + case SCTP_CID_COOKIE_ECHO: + case SCTP_CID_ERROR: + case SCTP_CID_ECN_CWR: + case SCTP_CID_ASCONF_ACK: + one_packet = 1; + /* Fall through */ + + case SCTP_CID_SACK: + case SCTP_CID_HEARTBEAT: + case SCTP_CID_SHUTDOWN: + case SCTP_CID_ECN_ECNE: + case SCTP_CID_ASCONF: + case SCTP_CID_FWD_TSN: + status = sctp_packet_transmit_chunk(packet, chunk, + one_packet); + if (status != SCTP_XMIT_OK) { + /* put the chunk back */ + list_add(&chunk->list, &q->control_chunk_list); + } else if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) { + /* PR-SCTP C5) If a FORWARD TSN is sent, the + * sender MUST assure that at least one T3-rtx + * timer is running. + */ + sctp_transport_reset_timers(transport); + } + break; + + default: + /* We built a chunk with an illegal type! */ + BUG(); + } + } + + /* Is it OK to send data chunks? */ + switch (asoc->state) { + case SCTP_STATE_COOKIE_ECHOED: + /* Only allow bundling when this packet has a COOKIE-ECHO + * chunk. + */ + if (!packet || !packet->has_cookie_echo) + break; + + /* fallthru */ + case SCTP_STATE_ESTABLISHED: + case SCTP_STATE_SHUTDOWN_PENDING: + case SCTP_STATE_SHUTDOWN_RECEIVED: + /* + * RFC 2960 6.1 Transmission of DATA Chunks + * + * C) When the time comes for the sender to transmit, + * before sending new DATA chunks, the sender MUST + * first transmit any outstanding DATA chunks which + * are marked for retransmission (limited by the + * current cwnd). + */ + if (!list_empty(&q->retransmit)) { + if (transport == asoc->peer.retran_path) + goto retran; + + /* Switch transports & prepare the packet. */ + + transport = asoc->peer.retran_path; + + if (list_empty(&transport->send_ready)) { + list_add_tail(&transport->send_ready, + &transport_list); + } + + packet = &transport->packet; + sctp_packet_config(packet, vtag, + asoc->peer.ecn_capable); + retran: + error = sctp_outq_flush_rtx(q, packet, + rtx_timeout, &start_timer); + + if (start_timer) + sctp_transport_reset_timers(transport); + + /* This can happen on COOKIE-ECHO resend. Only + * one chunk can get bundled with a COOKIE-ECHO. + */ + if (packet->has_cookie_echo) + goto sctp_flush_out; + + /* Don't send new data if there is still data + * waiting to retransmit. + */ + if (!list_empty(&q->retransmit)) + goto sctp_flush_out; + } + + /* Apply Max.Burst limitation to the current transport in + * case it will be used for new data. We are going to + * rest it before we return, but we want to apply the limit + * to the currently queued data. + */ + if (transport) + sctp_transport_burst_limited(transport); + + /* Finally, transmit new packets. */ + while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { + /* RFC 2960 6.5 Every DATA chunk MUST carry a valid + * stream identifier. + */ + if (chunk->sinfo.sinfo_stream >= + asoc->c.sinit_num_ostreams) { + + /* Mark as failed send. */ + sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM); + sctp_chunk_free(chunk); + continue; + } + + /* Has this chunk expired? */ + if (sctp_chunk_abandoned(chunk)) { + sctp_chunk_fail(chunk, 0); + sctp_chunk_free(chunk); + continue; + } + + /* If there is a specified transport, use it. + * Otherwise, we want to use the active path. 
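+ * (A transport in the INACTIVE or UNCONFIRMED state is likewise
+ * skipped in favor of the active path.)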
+ */ + new_transport = chunk->transport; + if (!new_transport || + ((new_transport->state == SCTP_INACTIVE) || + (new_transport->state == SCTP_UNCONFIRMED))) + new_transport = asoc->peer.active_path; + + /* Change packets if necessary. */ + if (new_transport != transport) { + transport = new_transport; + + /* Schedule to have this transport's + * packet flushed. + */ + if (list_empty(&transport->send_ready)) { + list_add_tail(&transport->send_ready, + &transport_list); + } + + packet = &transport->packet; + sctp_packet_config(packet, vtag, + asoc->peer.ecn_capable); + /* We've switched transports, so apply the + * Burst limit to the new transport. + */ + sctp_transport_burst_limited(transport); + } + + SCTP_DEBUG_PRINTK("sctp_outq_flush(%p, %p[%s]), ", + q, chunk, + chunk && chunk->chunk_hdr ? + sctp_cname(SCTP_ST_CHUNK( + chunk->chunk_hdr->type)) + : "Illegal Chunk"); + + SCTP_DEBUG_PRINTK("TX TSN 0x%x skb->head " + "%p skb->users %d.\n", + ntohl(chunk->subh.data_hdr->tsn), + chunk->skb ?chunk->skb->head : NULL, + chunk->skb ? + atomic_read(&chunk->skb->users) : -1); + + /* Add the chunk to the packet. */ + status = sctp_packet_transmit_chunk(packet, chunk, 0); + + switch (status) { + case SCTP_XMIT_PMTU_FULL: + case SCTP_XMIT_RWND_FULL: + case SCTP_XMIT_NAGLE_DELAY: + /* We could not append this chunk, so put + * the chunk back on the output queue. + */ + SCTP_DEBUG_PRINTK("sctp_outq_flush: could " + "not transmit TSN: 0x%x, status: %d\n", + ntohl(chunk->subh.data_hdr->tsn), + status); + sctp_outq_head_data(q, chunk); + goto sctp_flush_out; + break; + + case SCTP_XMIT_OK: + /* The sender is in the SHUTDOWN-PENDING state, + * The sender MAY set the I-bit in the DATA + * chunk header. + */ + if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING) + chunk->chunk_hdr->flags |= SCTP_DATA_SACK_IMM; + + break; + + default: + BUG(); + } + + /* BUG: We assume that the sctp_packet_transmit() + * call below will succeed all the time and add the + * chunk to the transmitted list and restart the + * timers. + * It is possible that the call can fail under OOM + * conditions. + * + * Is this really a problem? Won't this behave + * like a lost TSN? + */ + list_add_tail(&chunk->transmitted_list, + &transport->transmitted); + + sctp_transport_reset_timers(transport); + + q->empty = 0; + + /* Only let one DATA chunk get bundled with a + * COOKIE-ECHO chunk. + */ + if (packet->has_cookie_echo) + goto sctp_flush_out; + } + break; + + default: + /* Do nothing. */ + break; + } + +sctp_flush_out: + + /* Before returning, examine all the transports touched in + * this call. Right now, we bluntly force clear all the + * transports. Things might change after we implement Nagle. + * But such an examination is still required. 
+ * + * --xguo + */ + while ((ltransport = sctp_list_dequeue(&transport_list)) != NULL ) { + struct sctp_transport *t = list_entry(ltransport, + struct sctp_transport, + send_ready); + packet = &t->packet; + if (!sctp_packet_empty(packet)) + error = sctp_packet_transmit(packet); + + /* Clear the burst limited state, if any */ + sctp_transport_burst_reset(t); + } + + return error; +} + +/* Update unack_data based on the incoming SACK chunk */ +static void sctp_sack_update_unack_data(struct sctp_association *assoc, + struct sctp_sackhdr *sack) +{ + sctp_sack_variable_t *frags; + __u16 unack_data; + int i; + + unack_data = assoc->next_tsn - assoc->ctsn_ack_point - 1; + + frags = sack->variable; + for (i = 0; i < ntohs(sack->num_gap_ack_blocks); i++) { + unack_data -= ((ntohs(frags[i].gab.end) - + ntohs(frags[i].gab.start) + 1)); + } + + assoc->unack_data = unack_data; +} + +/* This is where we REALLY process a SACK. + * + * Process the SACK against the outqueue. Mostly, this just frees + * things off the transmitted queue. + */ +int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack) +{ + struct sctp_association *asoc = q->asoc; + struct sctp_transport *transport; + struct sctp_chunk *tchunk = NULL; + struct list_head *lchunk, *transport_list, *temp; + sctp_sack_variable_t *frags = sack->variable; + __u32 sack_ctsn, ctsn, tsn; + __u32 highest_tsn, highest_new_tsn; + __u32 sack_a_rwnd; + unsigned outstanding; + struct sctp_transport *primary = asoc->peer.primary_path; + int count_of_newacks = 0; + int gap_ack_blocks; + u8 accum_moved = 0; + + /* Grab the association's destination address list. */ + transport_list = &asoc->peer.transport_addr_list; + + sack_ctsn = ntohl(sack->cum_tsn_ack); + gap_ack_blocks = ntohs(sack->num_gap_ack_blocks); + /* + * SFR-CACC algorithm: + * On receipt of a SACK the sender SHOULD execute the + * following statements. + * + * 1) If the cumulative ack in the SACK passes next tsn_at_change + * on the current primary, the CHANGEOVER_ACTIVE flag SHOULD be + * cleared. The CYCLING_CHANGEOVER flag SHOULD also be cleared for + * all destinations. + * 2) If the SACK contains gap acks and the flag CHANGEOVER_ACTIVE + * is set the receiver of the SACK MUST take the following actions: + * + * A) Initialize the cacc_saw_newack to 0 for all destination + * addresses. + * + * Only bother if changeover_active is set. Otherwise, this is + * totally suboptimal to do on every SACK. + */ + if (primary->cacc.changeover_active) { + u8 clear_cycling = 0; + + if (TSN_lte(primary->cacc.next_tsn_at_change, sack_ctsn)) { + primary->cacc.changeover_active = 0; + clear_cycling = 1; + } + + if (clear_cycling || gap_ack_blocks) { + list_for_each_entry(transport, transport_list, + transports) { + if (clear_cycling) + transport->cacc.cycling_changeover = 0; + if (gap_ack_blocks) + transport->cacc.cacc_saw_newack = 0; + } + } + } + + /* Get the highest TSN in the sack. */ + highest_tsn = sack_ctsn; + if (gap_ack_blocks) + highest_tsn += ntohs(frags[gap_ack_blocks - 1].gab.end); + + if (TSN_lt(asoc->highest_sacked, highest_tsn)) + asoc->highest_sacked = highest_tsn; + + highest_new_tsn = sack_ctsn; + + /* Run through the retransmit queue. Credit bytes received + * and free those chunks that we can. + */ + sctp_check_transmitted(q, &q->retransmit, NULL, sack, &highest_new_tsn); + + /* Run through the transmitted queue. + * Credit bytes received and free those chunks which we can. + * + * This is a MASSIVE candidate for optimization. 
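+	 * (Each SACK walks every chunk still held on every transport's
+	 * transmitted list, which is why it is such a candidate.)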
+ */ + list_for_each_entry(transport, transport_list, transports) { + sctp_check_transmitted(q, &transport->transmitted, + transport, sack, &highest_new_tsn); + /* + * SFR-CACC algorithm: + * C) Let count_of_newacks be the number of + * destinations for which cacc_saw_newack is set. + */ + if (transport->cacc.cacc_saw_newack) + count_of_newacks ++; + } + + /* Move the Cumulative TSN Ack Point if appropriate. */ + if (TSN_lt(asoc->ctsn_ack_point, sack_ctsn)) { + asoc->ctsn_ack_point = sack_ctsn; + accum_moved = 1; + } + + if (gap_ack_blocks) { + + if (asoc->fast_recovery && accum_moved) + highest_new_tsn = highest_tsn; + + list_for_each_entry(transport, transport_list, transports) + sctp_mark_missing(q, &transport->transmitted, transport, + highest_new_tsn, count_of_newacks); + } + + /* Update unack_data field in the assoc. */ + sctp_sack_update_unack_data(asoc, sack); + + ctsn = asoc->ctsn_ack_point; + + /* Throw away stuff rotting on the sack queue. */ + list_for_each_safe(lchunk, temp, &q->sacked) { + tchunk = list_entry(lchunk, struct sctp_chunk, + transmitted_list); + tsn = ntohl(tchunk->subh.data_hdr->tsn); + if (TSN_lte(tsn, ctsn)) { + list_del_init(&tchunk->transmitted_list); + sctp_chunk_free(tchunk); + } + } + + /* ii) Set rwnd equal to the newly received a_rwnd minus the + * number of bytes still outstanding after processing the + * Cumulative TSN Ack and the Gap Ack Blocks. + */ + + sack_a_rwnd = ntohl(sack->a_rwnd); + outstanding = q->outstanding_bytes; + + if (outstanding < sack_a_rwnd) + sack_a_rwnd -= outstanding; + else + sack_a_rwnd = 0; + + asoc->peer.rwnd = sack_a_rwnd; + + sctp_generate_fwdtsn(q, sack_ctsn); + + SCTP_DEBUG_PRINTK("%s: sack Cumulative TSN Ack is 0x%x.\n", + __func__, sack_ctsn); + SCTP_DEBUG_PRINTK("%s: Cumulative TSN Ack of association, " + "%p is 0x%x. Adv peer ack point: 0x%x\n", + __func__, asoc, ctsn, asoc->adv_peer_ack_point); + + /* See if all chunks are acked. + * Make sure the empty queue handler will get run later. + */ + q->empty = (list_empty(&q->out_chunk_list) && + list_empty(&q->retransmit)); + if (!q->empty) + goto finish; + + list_for_each_entry(transport, transport_list, transports) { + q->empty = q->empty && list_empty(&transport->transmitted); + if (!q->empty) + goto finish; + } + + SCTP_DEBUG_PRINTK("sack queue is empty.\n"); +finish: + return q->empty; +} + +/* Is the outqueue empty? */ +int sctp_outq_is_empty(const struct sctp_outq *q) +{ + return q->empty; +} + +/******************************************************************** + * 2nd Level Abstractions + ********************************************************************/ + +/* Go through a transport's transmitted list or the association's retransmit + * list and move chunks that are acked by the Cumulative TSN Ack to q->sacked. + * The retransmit list will not have an associated transport. + * + * I added coherent debug information output. --xguo + * + * Instead of printing 'sacked' or 'kept' for each TSN on the + * transmitted_queue, we print a range: SACKED: TSN1-TSN2, TSN3, TSN4-TSN5. + * KEPT TSN6-TSN7, etc. + */ +static void sctp_check_transmitted(struct sctp_outq *q, + struct list_head *transmitted_queue, + struct sctp_transport *transport, + struct sctp_sackhdr *sack, + __u32 *highest_new_tsn_in_sack) +{ + struct list_head *lchunk; + struct sctp_chunk *tchunk; + struct list_head tlist; + __u32 tsn; + __u32 sack_ctsn; + __u32 rtt; + __u8 restart_timer = 0; + int bytes_acked = 0; + int migrate_bytes = 0; + + /* These state variables are for coherent debug output. 
--xguo */ + +#if SCTP_DEBUG + __u32 dbg_ack_tsn = 0; /* An ACKed TSN range starts here... */ + __u32 dbg_last_ack_tsn = 0; /* ...and finishes here. */ + __u32 dbg_kept_tsn = 0; /* An un-ACKed range starts here... */ + __u32 dbg_last_kept_tsn = 0; /* ...and finishes here. */ + + /* 0 : The last TSN was ACKed. + * 1 : The last TSN was NOT ACKed (i.e. KEPT). + * -1: We need to initialize. + */ + int dbg_prt_state = -1; +#endif /* SCTP_DEBUG */ + + sack_ctsn = ntohl(sack->cum_tsn_ack); + + INIT_LIST_HEAD(&tlist); + + /* The while loop will skip empty transmitted queues. */ + while (NULL != (lchunk = sctp_list_dequeue(transmitted_queue))) { + tchunk = list_entry(lchunk, struct sctp_chunk, + transmitted_list); + + if (sctp_chunk_abandoned(tchunk)) { + /* Move the chunk to abandoned list. */ + sctp_insert_list(&q->abandoned, lchunk); + + /* If this chunk has not been acked, stop + * considering it as 'outstanding'. + */ + if (!tchunk->tsn_gap_acked) { + if (tchunk->transport) + tchunk->transport->flight_size -= + sctp_data_size(tchunk); + q->outstanding_bytes -= sctp_data_size(tchunk); + } + continue; + } + + tsn = ntohl(tchunk->subh.data_hdr->tsn); + if (sctp_acked(sack, tsn)) { + /* If this queue is the retransmit queue, the + * retransmit timer has already reclaimed + * the outstanding bytes for this chunk, so only + * count bytes associated with a transport. + */ + if (transport) { + /* If this chunk is being used for RTT + * measurement, calculate the RTT and update + * the RTO using this value. + * + * 6.3.1 C5) Karn's algorithm: RTT measurements + * MUST NOT be made using packets that were + * retransmitted (and thus for which it is + * ambiguous whether the reply was for the + * first instance of the packet or a later + * instance). + */ + if (!tchunk->tsn_gap_acked && + tchunk->rtt_in_progress) { + tchunk->rtt_in_progress = 0; + rtt = jiffies - tchunk->sent_at; + sctp_transport_update_rto(transport, + rtt); + } + } + + /* If the chunk hasn't been marked as ACKED, + * mark it and account bytes_acked if the + * chunk had a valid transport (it will not + * have a transport if ASCONF had deleted it + * while DATA was outstanding). + */ + if (!tchunk->tsn_gap_acked) { + tchunk->tsn_gap_acked = 1; + *highest_new_tsn_in_sack = tsn; + bytes_acked += sctp_data_size(tchunk); + if (!tchunk->transport) + migrate_bytes += sctp_data_size(tchunk); + } + + if (TSN_lte(tsn, sack_ctsn)) { + /* RFC 2960 6.3.2 Retransmission Timer Rules + * + * R3) Whenever a SACK is received + * that acknowledges the DATA chunk + * with the earliest outstanding TSN + * for that address, restart T3-rtx + * timer for that address with its + * current RTO. + */ + restart_timer = 1; + + if (!tchunk->tsn_gap_acked) { + /* + * SFR-CACC algorithm: + * 2) If the SACK contains gap acks + * and the flag CHANGEOVER_ACTIVE is + * set the receiver of the SACK MUST + * take the following action: + * + * B) For each TSN t being acked that + * has not been acked in any SACK so + * far, set cacc_saw_newack to 1 for + * the destination that the TSN was + * sent to. + */ + if (transport && + sack->num_gap_ack_blocks && + q->asoc->peer.primary_path->cacc. + changeover_active) + transport->cacc.cacc_saw_newack + = 1; + } + + list_add_tail(&tchunk->transmitted_list, + &q->sacked); + } else { + /* RFC2960 7.2.4, sctpimpguide-05 2.8.2 + * M2) Each time a SACK arrives reporting + * 'Stray DATA chunk(s)' record the highest TSN + * reported as newly acknowledged, call this + * value 'HighestTSNinSack'. 
A newly + * acknowledged DATA chunk is one not + * previously acknowledged in a SACK. + * + * When the SCTP sender of data receives a SACK + * chunk that acknowledges, for the first time, + * the receipt of a DATA chunk, all the still + * unacknowledged DATA chunks whose TSN is + * older than that newly acknowledged DATA + * chunk, are qualified as 'Stray DATA chunks'. + */ + list_add_tail(lchunk, &tlist); + } + +#if SCTP_DEBUG + switch (dbg_prt_state) { + case 0: /* last TSN was ACKed */ + if (dbg_last_ack_tsn + 1 == tsn) { + /* This TSN belongs to the + * current ACK range. + */ + break; + } + + if (dbg_last_ack_tsn != dbg_ack_tsn) { + /* Display the end of the + * current range. + */ + SCTP_DEBUG_PRINTK_CONT("-%08x", + dbg_last_ack_tsn); + } + + /* Start a new range. */ + SCTP_DEBUG_PRINTK_CONT(",%08x", tsn); + dbg_ack_tsn = tsn; + break; + + case 1: /* The last TSN was NOT ACKed. */ + if (dbg_last_kept_tsn != dbg_kept_tsn) { + /* Display the end of current range. */ + SCTP_DEBUG_PRINTK_CONT("-%08x", + dbg_last_kept_tsn); + } + + SCTP_DEBUG_PRINTK_CONT("\n"); + + /* FALL THROUGH... */ + default: + /* This is the first-ever TSN we examined. */ + /* Start a new range of ACK-ed TSNs. */ + SCTP_DEBUG_PRINTK("ACKed: %08x", tsn); + dbg_prt_state = 0; + dbg_ack_tsn = tsn; + } + + dbg_last_ack_tsn = tsn; +#endif /* SCTP_DEBUG */ + + } else { + if (tchunk->tsn_gap_acked) { + SCTP_DEBUG_PRINTK("%s: Receiver reneged on " + "data TSN: 0x%x\n", + __func__, + tsn); + tchunk->tsn_gap_acked = 0; + + if (tchunk->transport) + bytes_acked -= sctp_data_size(tchunk); + + /* RFC 2960 6.3.2 Retransmission Timer Rules + * + * R4) Whenever a SACK is received missing a + * TSN that was previously acknowledged via a + * Gap Ack Block, start T3-rtx for the + * destination address to which the DATA + * chunk was originally + * transmitted if it is not already running. + */ + restart_timer = 1; + } + + list_add_tail(lchunk, &tlist); + +#if SCTP_DEBUG + /* See the above comments on ACK-ed TSNs. */ + switch (dbg_prt_state) { + case 1: + if (dbg_last_kept_tsn + 1 == tsn) + break; + + if (dbg_last_kept_tsn != dbg_kept_tsn) + SCTP_DEBUG_PRINTK_CONT("-%08x", + dbg_last_kept_tsn); + + SCTP_DEBUG_PRINTK_CONT(",%08x", tsn); + dbg_kept_tsn = tsn; + break; + + case 0: + if (dbg_last_ack_tsn != dbg_ack_tsn) + SCTP_DEBUG_PRINTK_CONT("-%08x", + dbg_last_ack_tsn); + SCTP_DEBUG_PRINTK_CONT("\n"); + + /* FALL THROUGH... */ + default: + SCTP_DEBUG_PRINTK("KEPT: %08x",tsn); + dbg_prt_state = 1; + dbg_kept_tsn = tsn; + } + + dbg_last_kept_tsn = tsn; +#endif /* SCTP_DEBUG */ + } + } + +#if SCTP_DEBUG + /* Finish off the last range, displaying its ending TSN. */ + switch (dbg_prt_state) { + case 0: + if (dbg_last_ack_tsn != dbg_ack_tsn) { + SCTP_DEBUG_PRINTK_CONT("-%08x\n", dbg_last_ack_tsn); + } else { + SCTP_DEBUG_PRINTK_CONT("\n"); + } + break; + + case 1: + if (dbg_last_kept_tsn != dbg_kept_tsn) { + SCTP_DEBUG_PRINTK_CONT("-%08x\n", dbg_last_kept_tsn); + } else { + SCTP_DEBUG_PRINTK_CONT("\n"); + } + } +#endif /* SCTP_DEBUG */ + if (transport) { + if (bytes_acked) { + struct sctp_association *asoc = transport->asoc; + + /* We may have counted DATA that was migrated + * to this transport due to DEL-IP operation. + * Subtract those bytes, since the were never + * send on this transport and shouldn't be + * credited to this transport. + */ + bytes_acked -= migrate_bytes; + + /* 8.2. 
When an outstanding TSN is acknowledged, + * the endpoint shall clear the error counter of + * the destination transport address to which the + * DATA chunk was last sent. + * The association's overall error counter is + * also cleared. + */ + transport->error_count = 0; + transport->asoc->overall_error_count = 0; + + /* + * While in SHUTDOWN PENDING, we may have started + * the T5 shutdown guard timer after reaching the + * retransmission limit. Stop that timer as soon + * as the receiver acknowledged any data. + */ + if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING && + del_timer(&asoc->timers + [SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD])) + sctp_association_put(asoc); + + /* Mark the destination transport address as + * active if it is not so marked. + */ + if ((transport->state == SCTP_INACTIVE) || + (transport->state == SCTP_UNCONFIRMED)) { + sctp_assoc_control_transport( + transport->asoc, + transport, + SCTP_TRANSPORT_UP, + SCTP_RECEIVED_SACK); + } + + sctp_transport_raise_cwnd(transport, sack_ctsn, + bytes_acked); + + transport->flight_size -= bytes_acked; + if (transport->flight_size == 0) + transport->partial_bytes_acked = 0; + q->outstanding_bytes -= bytes_acked + migrate_bytes; + } else { + /* RFC 2960 6.1, sctpimpguide-06 2.15.2 + * When a sender is doing zero window probing, it + * should not timeout the association if it continues + * to receive new packets from the receiver. The + * reason is that the receiver MAY keep its window + * closed for an indefinite time. + * A sender is doing zero window probing when the + * receiver's advertised window is zero, and there is + * only one data chunk in flight to the receiver. + * + * Allow the association to timeout while in SHUTDOWN + * PENDING or SHUTDOWN RECEIVED in case the receiver + * stays in zero window mode forever. + */ + if (!q->asoc->peer.rwnd && + !list_empty(&tlist) && + (sack_ctsn+2 == q->asoc->next_tsn) && + q->asoc->state < SCTP_STATE_SHUTDOWN_PENDING) { + SCTP_DEBUG_PRINTK("%s: SACK received for zero " + "window probe: %u\n", + __func__, sack_ctsn); + q->asoc->overall_error_count = 0; + transport->error_count = 0; + } + } + + /* RFC 2960 6.3.2 Retransmission Timer Rules + * + * R2) Whenever all outstanding data sent to an address have + * been acknowledged, turn off the T3-rtx timer of that + * address. + */ + if (!transport->flight_size) { + if (timer_pending(&transport->T3_rtx_timer) && + del_timer(&transport->T3_rtx_timer)) { + sctp_transport_put(transport); + } + } else if (restart_timer) { + if (!mod_timer(&transport->T3_rtx_timer, + jiffies + transport->rto)) + sctp_transport_hold(transport); + } + } + + list_splice(&tlist, transmitted_queue); +} + +/* Mark chunks as missing and consequently may get retransmitted. */ +static void sctp_mark_missing(struct sctp_outq *q, + struct list_head *transmitted_queue, + struct sctp_transport *transport, + __u32 highest_new_tsn_in_sack, + int count_of_newacks) +{ + struct sctp_chunk *chunk; + __u32 tsn; + char do_fast_retransmit = 0; + struct sctp_association *asoc = q->asoc; + struct sctp_transport *primary = asoc->peer.primary_path; + + list_for_each_entry(chunk, transmitted_queue, transmitted_list) { + + tsn = ntohl(chunk->subh.data_hdr->tsn); + + /* RFC 2960 7.2.4, sctpimpguide-05 2.8.2 M3) Examine all + * 'Unacknowledged TSN's', if the TSN number of an + * 'Unacknowledged TSN' is smaller than the 'HighestTSNinSack' + * value, increment the 'TSN.Missing.Report' count on that + * chunk if it has NOT been fast retransmitted or marked for + * fast retransmit already. 
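+	 * As an illustration: with HighestTSNinSack at 105, an
+	 * unacknowledged TSN 101 still marked SCTP_CAN_FRTX gets its
+	 * tsn_missing_report bumped here (unless the SFR-CACC check
+	 * below says to skip it); once the count reaches 3, rule M4
+	 * marks the chunk SCTP_NEED_FRTX for fast retransmit.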
+ */ + if (chunk->fast_retransmit == SCTP_CAN_FRTX && + !chunk->tsn_gap_acked && + TSN_lt(tsn, highest_new_tsn_in_sack)) { + + /* SFR-CACC may require us to skip marking + * this chunk as missing. + */ + if (!transport || !sctp_cacc_skip(primary, + chunk->transport, + count_of_newacks, tsn)) { + chunk->tsn_missing_report++; + + SCTP_DEBUG_PRINTK( + "%s: TSN 0x%x missing counter: %d\n", + __func__, tsn, + chunk->tsn_missing_report); + } + } + /* + * M4) If any DATA chunk is found to have a + * 'TSN.Missing.Report' + * value larger than or equal to 3, mark that chunk for + * retransmission and start the fast retransmit procedure. + */ + + if (chunk->tsn_missing_report >= 3) { + chunk->fast_retransmit = SCTP_NEED_FRTX; + do_fast_retransmit = 1; + } + } + + if (transport) { + if (do_fast_retransmit) + sctp_retransmit(q, transport, SCTP_RTXR_FAST_RTX); + + SCTP_DEBUG_PRINTK("%s: transport: %p, cwnd: %d, " + "ssthresh: %d, flight_size: %d, pba: %d\n", + __func__, transport, transport->cwnd, + transport->ssthresh, transport->flight_size, + transport->partial_bytes_acked); + } +} + +/* Is the given TSN acked by this packet? */ +static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn) +{ + int i; + sctp_sack_variable_t *frags; + __u16 gap; + __u32 ctsn = ntohl(sack->cum_tsn_ack); + + if (TSN_lte(tsn, ctsn)) + goto pass; + + /* 3.3.4 Selective Acknowledgement (SACK) (3): + * + * Gap Ack Blocks: + * These fields contain the Gap Ack Blocks. They are repeated + * for each Gap Ack Block up to the number of Gap Ack Blocks + * defined in the Number of Gap Ack Blocks field. All DATA + * chunks with TSNs greater than or equal to (Cumulative TSN + * Ack + Gap Ack Block Start) and less than or equal to + * (Cumulative TSN Ack + Gap Ack Block End) of each Gap Ack + * Block are assumed to have been received correctly. + */ + + frags = sack->variable; + gap = tsn - ctsn; + for (i = 0; i < ntohs(sack->num_gap_ack_blocks); ++i) { + if (TSN_lte(ntohs(frags[i].gab.start), gap) && + TSN_lte(gap, ntohs(frags[i].gab.end))) + goto pass; + } + + return 0; +pass: + return 1; +} + +static inline int sctp_get_skip_pos(struct sctp_fwdtsn_skip *skiplist, + int nskips, __be16 stream) +{ + int i; + + for (i = 0; i < nskips; i++) { + if (skiplist[i].stream == stream) + return i; + } + return i; +} + +/* Create and add a fwdtsn chunk to the outq's control queue if needed. */ +static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn) +{ + struct sctp_association *asoc = q->asoc; + struct sctp_chunk *ftsn_chunk = NULL; + struct sctp_fwdtsn_skip ftsn_skip_arr[10]; + int nskips = 0; + int skip_pos = 0; + __u32 tsn; + struct sctp_chunk *chunk; + struct list_head *lchunk, *temp; + + if (!asoc->peer.prsctp_capable) + return; + + /* PR-SCTP C1) Let SackCumAck be the Cumulative TSN ACK carried in the + * received SACK. + * + * If (Advanced.Peer.Ack.Point < SackCumAck), then update + * Advanced.Peer.Ack.Point to be equal to SackCumAck. + */ + if (TSN_lt(asoc->adv_peer_ack_point, ctsn)) + asoc->adv_peer_ack_point = ctsn; + + /* PR-SCTP C2) Try to further advance the "Advanced.Peer.Ack.Point" + * locally, that is, to move "Advanced.Peer.Ack.Point" up as long as + * the chunk next in the out-queue space is marked as "abandoned" as + * shown in the following example: + * + * Assuming that a SACK arrived with the Cumulative TSN ACK 102 + * and the Advanced.Peer.Ack.Point is updated to this value: + * + * out-queue at the end of ==> out-queue after Adv.Ack.Point + * normal SACK processing local advancement + * ... ... 
+ * Adv.Ack.Pt-> 102 acked 102 acked + * 103 abandoned 103 abandoned + * 104 abandoned Adv.Ack.P-> 104 abandoned + * 105 105 + * 106 acked 106 acked + * ... ... + * + * In this example, the data sender successfully advanced the + * "Advanced.Peer.Ack.Point" from 102 to 104 locally. + */ + list_for_each_safe(lchunk, temp, &q->abandoned) { + chunk = list_entry(lchunk, struct sctp_chunk, + transmitted_list); + tsn = ntohl(chunk->subh.data_hdr->tsn); + + /* Remove any chunks in the abandoned queue that are acked by + * the ctsn. + */ + if (TSN_lte(tsn, ctsn)) { + list_del_init(lchunk); + sctp_chunk_free(chunk); + } else { + if (TSN_lte(tsn, asoc->adv_peer_ack_point+1)) { + asoc->adv_peer_ack_point = tsn; + if (chunk->chunk_hdr->flags & + SCTP_DATA_UNORDERED) + continue; + skip_pos = sctp_get_skip_pos(&ftsn_skip_arr[0], + nskips, + chunk->subh.data_hdr->stream); + ftsn_skip_arr[skip_pos].stream = + chunk->subh.data_hdr->stream; + ftsn_skip_arr[skip_pos].ssn = + chunk->subh.data_hdr->ssn; + if (skip_pos == nskips) + nskips++; + if (nskips == 10) + break; + } else + break; + } + } + + /* PR-SCTP C3) If, after step C1 and C2, the "Advanced.Peer.Ack.Point" + * is greater than the Cumulative TSN ACK carried in the received + * SACK, the data sender MUST send the data receiver a FORWARD TSN + * chunk containing the latest value of the + * "Advanced.Peer.Ack.Point". + * + * C4) For each "abandoned" TSN the sender of the FORWARD TSN SHOULD + * list each stream and sequence number in the forwarded TSN. This + * information will enable the receiver to easily find any + * stranded TSN's waiting on stream reorder queues. Each stream + * SHOULD only be reported once; this means that if multiple + * abandoned messages occur in the same stream then only the + * highest abandoned stream sequence number is reported. If the + * total size of the FORWARD TSN does NOT fit in a single MTU then + * the sender of the FORWARD TSN SHOULD lower the + * Advanced.Peer.Ack.Point to the last TSN that will fit in a + * single MTU. + */ + if (asoc->adv_peer_ack_point > ctsn) + ftsn_chunk = sctp_make_fwdtsn(asoc, asoc->adv_peer_ack_point, + nskips, &ftsn_skip_arr[0]); + + if (ftsn_chunk) { + list_add_tail(&ftsn_chunk->list, &q->control_chunk_list); + SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); + } +} diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c new file mode 100644 index 00000000..534c7eae --- /dev/null +++ b/net/sctp/primitive.c @@ -0,0 +1,220 @@ +/* SCTP kernel implementation + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * + * This file is part of the SCTP kernel implementation + * + * These functions implement the SCTP primitive functions from Section 10. + * + * Note that the descriptions from the specification are USER level + * functions--this file is the functions which populate the struct proto + * for SCTP which is the BOTTOM of the sockets interface. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Narasimha Budihal + * Karl Knutson + * Ardelle Fan + * Kevin Gao + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include /* For struct list_head */ +#include +#include +#include /* For struct timeval */ +#include +#include +#include +#include + +#define DECLARE_PRIMITIVE(name) \ +/* This is called in the code as sctp_primitive_ ## name. */ \ +int sctp_primitive_ ## name(struct sctp_association *asoc, \ + void *arg) { \ + int error = 0; \ + sctp_event_t event_type; sctp_subtype_t subtype; \ + sctp_state_t state; \ + struct sctp_endpoint *ep; \ + \ + event_type = SCTP_EVENT_T_PRIMITIVE; \ + subtype = SCTP_ST_PRIMITIVE(SCTP_PRIMITIVE_ ## name); \ + state = asoc ? asoc->state : SCTP_STATE_CLOSED; \ + ep = asoc ? asoc->ep : NULL; \ + \ + error = sctp_do_sm(event_type, subtype, state, ep, asoc, \ + arg, GFP_KERNEL); \ + return error; \ +} + +/* 10.1 ULP-to-SCTP + * B) Associate + * + * Format: ASSOCIATE(local SCTP instance name, destination transport addr, + * outbound stream count) + * -> association id [,destination transport addr list] [,outbound stream + * count] + * + * This primitive allows the upper layer to initiate an association to a + * specific peer endpoint. + * + * This version assumes that asoc is fully populated with the initial + * parameters. We then return a traditional kernel indicator of + * success or failure. + */ + +/* This is called in the code as sctp_primitive_ASSOCIATE. */ + +DECLARE_PRIMITIVE(ASSOCIATE) + +/* 10.1 ULP-to-SCTP + * C) Shutdown + * + * Format: SHUTDOWN(association id) + * -> result + * + * Gracefully closes an association. Any locally queued user data + * will be delivered to the peer. The association will be terminated only + * after the peer acknowledges all the SCTP packets sent. A success code + * will be returned on successful termination of the association. If + * attempting to terminate the association results in a failure, an error + * code shall be returned. + */ + +DECLARE_PRIMITIVE(SHUTDOWN); + +/* 10.1 ULP-to-SCTP + * C) Abort + * + * Format: Abort(association id [, cause code]) + * -> result + * + * Ungracefully closes an association. Any locally queued user data + * will be discarded and an ABORT chunk is sent to the peer. A success + * code will be returned on successful abortion of the association. If + * attempting to abort the association results in a failure, an error + * code shall be returned. + */ + +DECLARE_PRIMITIVE(ABORT); + +/* 10.1 ULP-to-SCTP + * E) Send + * + * Format: SEND(association id, buffer address, byte count [,context] + * [,stream id] [,life time] [,destination transport address] + * [,unorder flag] [,no-bundle flag] [,payload protocol-id] ) + * -> result + * + * This is the main method to send user data via SCTP. 
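+ * (In this file the primitive is generated by DECLARE_PRIMITIVE(SEND)
+ * below, i.e. it is invoked as sctp_primitive_SEND(asoc, arg) and,
+ * like the other primitives, dispatched through sctp_do_sm().)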
+ * + * Mandatory attributes: + * + * o association id - local handle to the SCTP association + * + * o buffer address - the location where the user message to be + * transmitted is stored; + * + * o byte count - The size of the user data in number of bytes; + * + * Optional attributes: + * + * o context - an optional 32 bit integer that will be carried in the + * sending failure notification to the ULP if the transportation of + * this User Message fails. + * + * o stream id - to indicate which stream to send the data on. If not + * specified, stream 0 will be used. + * + * o life time - specifies the life time of the user data. The user data + * will not be sent by SCTP after the life time expires. This + * parameter can be used to avoid efforts to transmit stale + * user messages. SCTP notifies the ULP if the data cannot be + * initiated to transport (i.e. sent to the destination via SCTP's + * send primitive) within the life time variable. However, the + * user data will be transmitted if SCTP has attempted to transmit a + * chunk before the life time expired. + * + * o destination transport address - specified as one of the destination + * transport addresses of the peer endpoint to which this packet + * should be sent. Whenever possible, SCTP should use this destination + * transport address for sending the packets, instead of the current + * primary path. + * + * o unorder flag - this flag, if present, indicates that the user + * would like the data delivered in an unordered fashion to the peer + * (i.e., the U flag is set to 1 on all DATA chunks carrying this + * message). + * + * o no-bundle flag - instructs SCTP not to bundle this user data with + * other outbound DATA chunks. SCTP MAY still bundle even when + * this flag is present, when faced with network congestion. + * + * o payload protocol-id - A 32 bit unsigned integer that is to be + * passed to the peer indicating the type of payload protocol data + * being transmitted. This value is passed as opaque data by SCTP. + */ + +DECLARE_PRIMITIVE(SEND); + +/* 10.1 ULP-to-SCTP + * J) Request Heartbeat + * + * Format: REQUESTHEARTBEAT(association id, destination transport address) + * + * -> result + * + * Instructs the local endpoint to perform a HeartBeat on the specified + * destination transport address of the given association. The returned + * result should indicate whether the transmission of the HEARTBEAT + * chunk to the destination address is successful. + * + * Mandatory attributes: + * + * o association id - local handle to the SCTP association + * + * o destination transport address - the transport address of the + * association on which a heartbeat should be issued. + */ + +DECLARE_PRIMITIVE(REQUESTHEARTBEAT); + +/* ADDIP +* 3.1.1 Address Configuration Change Chunk (ASCONF) +* +* This chunk is used to communicate to the remote endpoint one of the +* configuration change requests that MUST be acknowledged. The +* information carried in the ASCONF Chunk uses the form of a +* Type-Length-Value (TLV), as described in "3.2.1 Optional/ +* Variable-length Parameter Format" in RFC2960 [5], forall variable +* parameters. +*/ + +DECLARE_PRIMITIVE(ASCONF); diff --git a/net/sctp/probe.c b/net/sctp/probe.c new file mode 100644 index 00000000..bc6cd75c --- /dev/null +++ b/net/sctp/probe.c @@ -0,0 +1,217 @@ +/* + * sctp_probe - Observe the SCTP flow with kprobes. 
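+ * A jprobe is attached to sctp_sf_eat_sack_6_2; each matching SACK is
+ * logged as one text line and can be read back from /proc/net/sctpprobe.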
+ * + * The idea for this came from Werner Almesberger's umlsim + * Copyright (C) 2004, Stephen Hemminger + * + * Modified for SCTP from Stephen Hemminger's code + * Copyright (C) 2010, Wei Yongjun + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Wei Yongjun "); +MODULE_DESCRIPTION("SCTP snooper"); +MODULE_LICENSE("GPL"); + +static int port __read_mostly = 0; +MODULE_PARM_DESC(port, "Port to match (0=all)"); +module_param(port, int, 0); + +static int bufsize __read_mostly = 64 * 1024; +MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)"); +module_param(bufsize, int, 0); + +static int full __read_mostly = 1; +MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)"); +module_param(full, int, 0); + +static const char procname[] = "sctpprobe"; + +static struct { + struct kfifo fifo; + spinlock_t lock; + wait_queue_head_t wait; + struct timespec tstart; +} sctpw; + +static void printl(const char *fmt, ...) +{ + va_list args; + int len; + char tbuf[256]; + + va_start(args, fmt); + len = vscnprintf(tbuf, sizeof(tbuf), fmt, args); + va_end(args); + + kfifo_in_locked(&sctpw.fifo, tbuf, len, &sctpw.lock); + wake_up(&sctpw.wait); +} + +static int sctpprobe_open(struct inode *inode, struct file *file) +{ + kfifo_reset(&sctpw.fifo); + getnstimeofday(&sctpw.tstart); + + return 0; +} + +static ssize_t sctpprobe_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + int error = 0, cnt = 0; + unsigned char *tbuf; + + if (!buf) + return -EINVAL; + + if (len == 0) + return 0; + + tbuf = vmalloc(len); + if (!tbuf) + return -ENOMEM; + + error = wait_event_interruptible(sctpw.wait, + kfifo_len(&sctpw.fifo) != 0); + if (error) + goto out_free; + + cnt = kfifo_out_locked(&sctpw.fifo, tbuf, len, &sctpw.lock); + error = copy_to_user(buf, tbuf, cnt) ? -EFAULT : 0; + +out_free: + vfree(tbuf); + + return error ? 
error : cnt; +} + +static const struct file_operations sctpprobe_fops = { + .owner = THIS_MODULE, + .open = sctpprobe_open, + .read = sctpprobe_read, + .llseek = noop_llseek, +}; + +sctp_disposition_t jsctp_sf_eat_sack(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_transport *sp; + static __u32 lcwnd = 0; + struct timespec now; + + sp = asoc->peer.primary_path; + + if ((full || sp->cwnd != lcwnd) && + (!port || asoc->peer.port == port || + ep->base.bind_addr.port == port)) { + lcwnd = sp->cwnd; + + getnstimeofday(&now); + now = timespec_sub(now, sctpw.tstart); + + printl("%lu.%06lu ", (unsigned long) now.tv_sec, + (unsigned long) now.tv_nsec / NSEC_PER_USEC); + + printl("%p %5d %5d %5d %8d %5d ", asoc, + ep->base.bind_addr.port, asoc->peer.port, + asoc->pathmtu, asoc->peer.rwnd, asoc->unack_data); + + list_for_each_entry(sp, &asoc->peer.transport_addr_list, + transports) { + if (sp == asoc->peer.primary_path) + printl("*"); + + if (sp->ipaddr.sa.sa_family == AF_INET) + printl("%pI4 ", &sp->ipaddr.v4.sin_addr); + else + printl("%pI6 ", &sp->ipaddr.v6.sin6_addr); + + printl("%2u %8u %8u %8u %8u %8u ", + sp->state, sp->cwnd, sp->ssthresh, + sp->flight_size, sp->partial_bytes_acked, + sp->pathmtu); + } + printl("\n"); + } + + jprobe_return(); + return 0; +} + +static struct jprobe sctp_recv_probe = { + .kp = { + .symbol_name = "sctp_sf_eat_sack_6_2", + }, + .entry = jsctp_sf_eat_sack, +}; + +static __init int sctpprobe_init(void) +{ + int ret = -ENOMEM; + + init_waitqueue_head(&sctpw.wait); + spin_lock_init(&sctpw.lock); + if (kfifo_alloc(&sctpw.fifo, bufsize, GFP_KERNEL)) + return ret; + + if (!proc_net_fops_create(&init_net, procname, S_IRUSR, + &sctpprobe_fops)) + goto free_kfifo; + + ret = register_jprobe(&sctp_recv_probe); + if (ret) + goto remove_proc; + + pr_info("probe registered (port=%d)\n", port); + + return 0; + +remove_proc: + proc_net_remove(&init_net, procname); +free_kfifo: + kfifo_free(&sctpw.fifo); + return ret; +} + +static __exit void sctpprobe_exit(void) +{ + kfifo_free(&sctpw.fifo); + proc_net_remove(&init_net, procname); + unregister_jprobe(&sctp_recv_probe); +} + +module_init(sctpprobe_init); +module_exit(sctpprobe_exit); diff --git a/net/sctp/proc.c b/net/sctp/proc.c new file mode 100644 index 00000000..05a6ce21 --- /dev/null +++ b/net/sctp/proc.c @@ -0,0 +1,516 @@ +/* SCTP kernel implementation + * Copyright (c) 2003 International Business Machines, Corp. + * + * This file is part of the SCTP kernel implementation + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. 
+ * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * Sridhar Samudrala + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include +#include +#include /* for snmp_fold_field */ + +static const struct snmp_mib sctp_snmp_list[] = { + SNMP_MIB_ITEM("SctpCurrEstab", SCTP_MIB_CURRESTAB), + SNMP_MIB_ITEM("SctpActiveEstabs", SCTP_MIB_ACTIVEESTABS), + SNMP_MIB_ITEM("SctpPassiveEstabs", SCTP_MIB_PASSIVEESTABS), + SNMP_MIB_ITEM("SctpAborteds", SCTP_MIB_ABORTEDS), + SNMP_MIB_ITEM("SctpShutdowns", SCTP_MIB_SHUTDOWNS), + SNMP_MIB_ITEM("SctpOutOfBlues", SCTP_MIB_OUTOFBLUES), + SNMP_MIB_ITEM("SctpChecksumErrors", SCTP_MIB_CHECKSUMERRORS), + SNMP_MIB_ITEM("SctpOutCtrlChunks", SCTP_MIB_OUTCTRLCHUNKS), + SNMP_MIB_ITEM("SctpOutOrderChunks", SCTP_MIB_OUTORDERCHUNKS), + SNMP_MIB_ITEM("SctpOutUnorderChunks", SCTP_MIB_OUTUNORDERCHUNKS), + SNMP_MIB_ITEM("SctpInCtrlChunks", SCTP_MIB_INCTRLCHUNKS), + SNMP_MIB_ITEM("SctpInOrderChunks", SCTP_MIB_INORDERCHUNKS), + SNMP_MIB_ITEM("SctpInUnorderChunks", SCTP_MIB_INUNORDERCHUNKS), + SNMP_MIB_ITEM("SctpFragUsrMsgs", SCTP_MIB_FRAGUSRMSGS), + SNMP_MIB_ITEM("SctpReasmUsrMsgs", SCTP_MIB_REASMUSRMSGS), + SNMP_MIB_ITEM("SctpOutSCTPPacks", SCTP_MIB_OUTSCTPPACKS), + SNMP_MIB_ITEM("SctpInSCTPPacks", SCTP_MIB_INSCTPPACKS), + SNMP_MIB_ITEM("SctpT1InitExpireds", SCTP_MIB_T1_INIT_EXPIREDS), + SNMP_MIB_ITEM("SctpT1CookieExpireds", SCTP_MIB_T1_COOKIE_EXPIREDS), + SNMP_MIB_ITEM("SctpT2ShutdownExpireds", SCTP_MIB_T2_SHUTDOWN_EXPIREDS), + SNMP_MIB_ITEM("SctpT3RtxExpireds", SCTP_MIB_T3_RTX_EXPIREDS), + SNMP_MIB_ITEM("SctpT4RtoExpireds", SCTP_MIB_T4_RTO_EXPIREDS), + SNMP_MIB_ITEM("SctpT5ShutdownGuardExpireds", SCTP_MIB_T5_SHUTDOWN_GUARD_EXPIREDS), + SNMP_MIB_ITEM("SctpDelaySackExpireds", SCTP_MIB_DELAY_SACK_EXPIREDS), + SNMP_MIB_ITEM("SctpAutocloseExpireds", SCTP_MIB_AUTOCLOSE_EXPIREDS), + SNMP_MIB_ITEM("SctpT3Retransmits", SCTP_MIB_T3_RETRANSMITS), + SNMP_MIB_ITEM("SctpPmtudRetransmits", SCTP_MIB_PMTUD_RETRANSMITS), + SNMP_MIB_ITEM("SctpFastRetransmits", SCTP_MIB_FAST_RETRANSMITS), + SNMP_MIB_ITEM("SctpInPktSoftirq", SCTP_MIB_IN_PKT_SOFTIRQ), + SNMP_MIB_ITEM("SctpInPktBacklog", SCTP_MIB_IN_PKT_BACKLOG), + SNMP_MIB_ITEM("SctpInPktDiscards", SCTP_MIB_IN_PKT_DISCARDS), + SNMP_MIB_ITEM("SctpInDataChunkDiscards", SCTP_MIB_IN_DATA_CHUNK_DISCARDS), + SNMP_MIB_SENTINEL +}; + +/* Display sctp snmp mib statistics(/proc/net/sctp/snmp). */ +static int sctp_snmp_seq_show(struct seq_file *seq, void *v) +{ + int i; + + for (i = 0; sctp_snmp_list[i].name != NULL; i++) + seq_printf(seq, "%-32s\t%ld\n", sctp_snmp_list[i].name, + snmp_fold_field((void __percpu **)sctp_statistics, + sctp_snmp_list[i].entry)); + + return 0; +} + +/* Initialize the seq file operations for 'snmp' object. */ +static int sctp_snmp_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, sctp_snmp_seq_show, NULL); +} + +static const struct file_operations sctp_snmp_seq_fops = { + .owner = THIS_MODULE, + .open = sctp_snmp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* Set up the proc fs entry for 'snmp' object. 
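+ * The resulting /proc/net/sctp/snmp file prints one line per counter
+ * in sctp_snmp_list (name and current value), as produced by
+ * sctp_snmp_seq_show() above.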
*/ +int __init sctp_snmp_proc_init(void) +{ + struct proc_dir_entry *p; + + p = proc_create("snmp", S_IRUGO, proc_net_sctp, &sctp_snmp_seq_fops); + if (!p) + return -ENOMEM; + + return 0; +} + +/* Cleanup the proc fs entry for 'snmp' object. */ +void sctp_snmp_proc_exit(void) +{ + remove_proc_entry("snmp", proc_net_sctp); +} + +/* Dump local addresses of an association/endpoint. */ +static void sctp_seq_dump_local_addrs(struct seq_file *seq, struct sctp_ep_common *epb) +{ + struct sctp_association *asoc; + struct sctp_sockaddr_entry *laddr; + struct sctp_transport *peer; + union sctp_addr *addr, *primary = NULL; + struct sctp_af *af; + + if (epb->type == SCTP_EP_TYPE_ASSOCIATION) { + asoc = sctp_assoc(epb); + peer = asoc->peer.primary_path; + primary = &peer->saddr; + } + + list_for_each_entry(laddr, &epb->bind_addr.address_list, list) { + addr = &laddr->a; + af = sctp_get_af_specific(addr->sa.sa_family); + if (primary && af->cmp_addr(addr, primary)) { + seq_printf(seq, "*"); + } + af->seq_dump_addr(seq, addr); + } +} + +/* Dump remote addresses of an association. */ +static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_association *assoc) +{ + struct sctp_transport *transport; + union sctp_addr *addr, *primary; + struct sctp_af *af; + + primary = &assoc->peer.primary_addr; + list_for_each_entry(transport, &assoc->peer.transport_addr_list, + transports) { + addr = &transport->ipaddr; + af = sctp_get_af_specific(addr->sa.sa_family); + if (af->cmp_addr(addr, primary)) { + seq_printf(seq, "*"); + } + af->seq_dump_addr(seq, addr); + } +} + +static void * sctp_eps_seq_start(struct seq_file *seq, loff_t *pos) +{ + if (*pos >= sctp_ep_hashsize) + return NULL; + + if (*pos < 0) + *pos = 0; + + if (*pos == 0) + seq_printf(seq, " ENDPT SOCK STY SST HBKT LPORT UID INODE LADDRS\n"); + + return (void *)pos; +} + +static void sctp_eps_seq_stop(struct seq_file *seq, void *v) +{ +} + + +static void * sctp_eps_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + if (++*pos >= sctp_ep_hashsize) + return NULL; + + return pos; +} + + +/* Display sctp endpoints (/proc/net/sctp/eps). */ +static int sctp_eps_seq_show(struct seq_file *seq, void *v) +{ + struct sctp_hashbucket *head; + struct sctp_ep_common *epb; + struct sctp_endpoint *ep; + struct sock *sk; + struct hlist_node *node; + int hash = *(loff_t *)v; + + if (hash >= sctp_ep_hashsize) + return -ENOMEM; + + head = &sctp_ep_hashtable[hash]; + sctp_local_bh_disable(); + read_lock(&head->lock); + sctp_for_each_hentry(epb, node, &head->chain) { + ep = sctp_ep(epb); + sk = epb->sk; + seq_printf(seq, "%8pK %8pK %-3d %-3d %-4d %-5d %5d %5lu ", ep, sk, + sctp_sk(sk)->type, sk->sk_state, hash, + epb->bind_addr.port, + sock_i_uid(sk), sock_i_ino(sk)); + + sctp_seq_dump_local_addrs(seq, epb); + seq_printf(seq, "\n"); + } + read_unlock(&head->lock); + sctp_local_bh_enable(); + + return 0; +} + +static const struct seq_operations sctp_eps_ops = { + .start = sctp_eps_seq_start, + .next = sctp_eps_seq_next, + .stop = sctp_eps_seq_stop, + .show = sctp_eps_seq_show, +}; + + +/* Initialize the seq file operations for 'eps' object. */ +static int sctp_eps_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &sctp_eps_ops); +} + +static const struct file_operations sctp_eps_seq_fops = { + .open = sctp_eps_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* Set up the proc fs entry for 'eps' object. 
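+ * /proc/net/sctp/eps prints one line per endpoint: socket, style,
+ * state, hash bucket, local port, uid, inode and the bound local
+ * addresses. The seq iterator above walks sctp_ep_hashtable bucket
+ * by bucket.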
*/ +int __init sctp_eps_proc_init(void) +{ + struct proc_dir_entry *p; + + p = proc_create("eps", S_IRUGO, proc_net_sctp, &sctp_eps_seq_fops); + if (!p) + return -ENOMEM; + + return 0; +} + +/* Cleanup the proc fs entry for 'eps' object. */ +void sctp_eps_proc_exit(void) +{ + remove_proc_entry("eps", proc_net_sctp); +} + + +static void * sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos) +{ + if (*pos >= sctp_assoc_hashsize) + return NULL; + + if (*pos < 0) + *pos = 0; + + if (*pos == 0) + seq_printf(seq, " ASSOC SOCK STY SST ST HBKT " + "ASSOC-ID TX_QUEUE RX_QUEUE UID INODE LPORT " + "RPORT LADDRS <-> RADDRS " + "HBINT INS OUTS MAXRT T1X T2X RTXC\n"); + + return (void *)pos; +} + +static void sctp_assocs_seq_stop(struct seq_file *seq, void *v) +{ +} + + +static void * sctp_assocs_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + if (++*pos >= sctp_assoc_hashsize) + return NULL; + + return pos; +} + +/* Display sctp associations (/proc/net/sctp/assocs). */ +static int sctp_assocs_seq_show(struct seq_file *seq, void *v) +{ + struct sctp_hashbucket *head; + struct sctp_ep_common *epb; + struct sctp_association *assoc; + struct sock *sk; + struct hlist_node *node; + int hash = *(loff_t *)v; + + if (hash >= sctp_assoc_hashsize) + return -ENOMEM; + + head = &sctp_assoc_hashtable[hash]; + sctp_local_bh_disable(); + read_lock(&head->lock); + sctp_for_each_hentry(epb, node, &head->chain) { + assoc = sctp_assoc(epb); + sk = epb->sk; + seq_printf(seq, + "%8pK %8pK %-3d %-3d %-2d %-4d " + "%4d %8d %8d %7d %5lu %-5d %5d ", + assoc, sk, sctp_sk(sk)->type, sk->sk_state, + assoc->state, hash, + assoc->assoc_id, + assoc->sndbuf_used, + atomic_read(&assoc->rmem_alloc), + sock_i_uid(sk), sock_i_ino(sk), + epb->bind_addr.port, + assoc->peer.port); + seq_printf(seq, " "); + sctp_seq_dump_local_addrs(seq, epb); + seq_printf(seq, "<-> "); + sctp_seq_dump_remote_addrs(seq, assoc); + seq_printf(seq, "\t%8lu %5d %5d %4d %4d %4d %8d ", + assoc->hbinterval, assoc->c.sinit_max_instreams, + assoc->c.sinit_num_ostreams, assoc->max_retrans, + assoc->init_retries, assoc->shutdown_retries, + assoc->rtx_data_chunks); + seq_printf(seq, "\n"); + } + read_unlock(&head->lock); + sctp_local_bh_enable(); + + return 0; +} + +static const struct seq_operations sctp_assoc_ops = { + .start = sctp_assocs_seq_start, + .next = sctp_assocs_seq_next, + .stop = sctp_assocs_seq_stop, + .show = sctp_assocs_seq_show, +}; + +/* Initialize the seq file operations for 'assocs' object. */ +static int sctp_assocs_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &sctp_assoc_ops); +} + +static const struct file_operations sctp_assocs_seq_fops = { + .open = sctp_assocs_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* Set up the proc fs entry for 'assocs' object. */ +int __init sctp_assocs_proc_init(void) +{ + struct proc_dir_entry *p; + + p = proc_create("assocs", S_IRUGO, proc_net_sctp, + &sctp_assocs_seq_fops); + if (!p) + return -ENOMEM; + + return 0; +} + +/* Cleanup the proc fs entry for 'assocs' object. 
*/ +void sctp_assocs_proc_exit(void) +{ + remove_proc_entry("assocs", proc_net_sctp); +} + +static void *sctp_remaddr_seq_start(struct seq_file *seq, loff_t *pos) +{ + if (*pos >= sctp_assoc_hashsize) + return NULL; + + if (*pos < 0) + *pos = 0; + + if (*pos == 0) + seq_printf(seq, "ADDR ASSOC_ID HB_ACT RTO MAX_PATH_RTX " + "REM_ADDR_RTX START\n"); + + return (void *)pos; +} + +static void *sctp_remaddr_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + if (++*pos >= sctp_assoc_hashsize) + return NULL; + + return pos; +} + +static void sctp_remaddr_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) +{ + struct sctp_hashbucket *head; + struct sctp_ep_common *epb; + struct sctp_association *assoc; + struct hlist_node *node; + struct sctp_transport *tsp; + int hash = *(loff_t *)v; + + if (hash >= sctp_assoc_hashsize) + return -ENOMEM; + + head = &sctp_assoc_hashtable[hash]; + sctp_local_bh_disable(); + read_lock(&head->lock); + sctp_for_each_hentry(epb, node, &head->chain) { + assoc = sctp_assoc(epb); + list_for_each_entry(tsp, &assoc->peer.transport_addr_list, + transports) { + /* + * The remote address (ADDR) + */ + tsp->af_specific->seq_dump_addr(seq, &tsp->ipaddr); + seq_printf(seq, " "); + + /* + * The association ID (ASSOC_ID) + */ + seq_printf(seq, "%d ", tsp->asoc->assoc_id); + + /* + * If the Heartbeat is active (HB_ACT) + * Note: 1 = Active, 0 = Inactive + */ + seq_printf(seq, "%d ", timer_pending(&tsp->hb_timer)); + + /* + * Retransmit time out (RTO) + */ + seq_printf(seq, "%lu ", tsp->rto); + + /* + * Maximum path retransmit count (PATH_MAX_RTX) + */ + seq_printf(seq, "%d ", tsp->pathmaxrxt); + + /* + * remote address retransmit count (REM_ADDR_RTX) + * Note: We don't have a way to tally this at the moment + * so lets just leave it as zero for the moment + */ + seq_printf(seq, "0 "); + + /* + * remote address start time (START). This is also not + * currently implemented, but we can record it with a + * jiffies marker in a subsequent patch + */ + seq_printf(seq, "0"); + + seq_printf(seq, "\n"); + } + } + + read_unlock(&head->lock); + sctp_local_bh_enable(); + + return 0; + +} + +static const struct seq_operations sctp_remaddr_ops = { + .start = sctp_remaddr_seq_start, + .next = sctp_remaddr_seq_next, + .stop = sctp_remaddr_seq_stop, + .show = sctp_remaddr_seq_show, +}; + +/* Cleanup the proc fs entry for 'remaddr' object. */ +void sctp_remaddr_proc_exit(void) +{ + remove_proc_entry("remaddr", proc_net_sctp); +} + +static int sctp_remaddr_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &sctp_remaddr_ops); +} + +static const struct file_operations sctp_remaddr_seq_fops = { + .open = sctp_remaddr_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +int __init sctp_remaddr_proc_init(void) +{ + struct proc_dir_entry *p; + + p = proc_create("remaddr", S_IRUGO, proc_net_sctp, &sctp_remaddr_seq_fops); + if (!p) + return -ENOMEM; + return 0; +} diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c new file mode 100644 index 00000000..946afd60 --- /dev/null +++ b/net/sctp/protocol.c @@ -0,0 +1,1381 @@ +/* SCTP kernel implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001 Intel Corp. + * Copyright (c) 2001 Nokia, Inc. + * Copyright (c) 2001 La Monte H.P. 
Yarroll + * + * This file is part of the SCTP kernel implementation + * + * Initialization/cleanup for SCTP protocol support. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Jon Grimm + * Sridhar Samudrala + * Daisy Chang + * Ardelle Fan + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Global data structures. */ +struct sctp_globals sctp_globals __read_mostly; +DEFINE_SNMP_STAT(struct sctp_mib, sctp_statistics) __read_mostly; + +#ifdef CONFIG_PROC_FS +struct proc_dir_entry *proc_net_sctp; +#endif + +struct idr sctp_assocs_id; +DEFINE_SPINLOCK(sctp_assocs_id_lock); + +/* This is the global socket data structure used for responding to + * the Out-of-the-blue (OOTB) packets. A control sock will be created + * for this socket at the initialization time. + */ +static struct sock *sctp_ctl_sock; + +static struct sctp_pf *sctp_pf_inet6_specific; +static struct sctp_pf *sctp_pf_inet_specific; +static struct sctp_af *sctp_af_v4_specific; +static struct sctp_af *sctp_af_v6_specific; + +struct kmem_cache *sctp_chunk_cachep __read_mostly; +struct kmem_cache *sctp_bucket_cachep __read_mostly; + +long sysctl_sctp_mem[3]; +int sysctl_sctp_rmem[3]; +int sysctl_sctp_wmem[3]; + +/* Return the address of the control sock. */ +struct sock *sctp_get_ctl_sock(void) +{ + return sctp_ctl_sock; +} + +/* Set up the proc fs entry for the SCTP protocol. 
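+ * This creates the /proc/net/sctp directory and registers the snmp,
+ * eps, assocs and remaddr files beneath it, unwinding in reverse
+ * order if any step fails.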
*/ +static __init int sctp_proc_init(void) +{ + if (percpu_counter_init(&sctp_sockets_allocated, 0)) + goto out_nomem; +#ifdef CONFIG_PROC_FS + if (!proc_net_sctp) { + proc_net_sctp = proc_mkdir("sctp", init_net.proc_net); + if (!proc_net_sctp) + goto out_free_percpu; + } + + if (sctp_snmp_proc_init()) + goto out_snmp_proc_init; + if (sctp_eps_proc_init()) + goto out_eps_proc_init; + if (sctp_assocs_proc_init()) + goto out_assocs_proc_init; + if (sctp_remaddr_proc_init()) + goto out_remaddr_proc_init; + + return 0; + +out_remaddr_proc_init: + sctp_assocs_proc_exit(); +out_assocs_proc_init: + sctp_eps_proc_exit(); +out_eps_proc_init: + sctp_snmp_proc_exit(); +out_snmp_proc_init: + if (proc_net_sctp) { + proc_net_sctp = NULL; + remove_proc_entry("sctp", init_net.proc_net); + } +out_free_percpu: + percpu_counter_destroy(&sctp_sockets_allocated); +#else + return 0; +#endif /* CONFIG_PROC_FS */ + +out_nomem: + return -ENOMEM; +} + +/* Clean up the proc fs entry for the SCTP protocol. + * Note: Do not make this __exit as it is used in the init error + * path. + */ +static void sctp_proc_exit(void) +{ +#ifdef CONFIG_PROC_FS + sctp_snmp_proc_exit(); + sctp_eps_proc_exit(); + sctp_assocs_proc_exit(); + sctp_remaddr_proc_exit(); + + if (proc_net_sctp) { + proc_net_sctp = NULL; + remove_proc_entry("sctp", init_net.proc_net); + } +#endif + percpu_counter_destroy(&sctp_sockets_allocated); +} + +/* Private helper to extract ipv4 address and stash them in + * the protocol structure. + */ +static void sctp_v4_copy_addrlist(struct list_head *addrlist, + struct net_device *dev) +{ + struct in_device *in_dev; + struct in_ifaddr *ifa; + struct sctp_sockaddr_entry *addr; + + rcu_read_lock(); + if ((in_dev = __in_dev_get_rcu(dev)) == NULL) { + rcu_read_unlock(); + return; + } + + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + /* Add the address to the local list. */ + addr = t_new(struct sctp_sockaddr_entry, GFP_ATOMIC); + if (addr) { + addr->a.v4.sin_family = AF_INET; + addr->a.v4.sin_port = 0; + addr->a.v4.sin_addr.s_addr = ifa->ifa_local; + addr->valid = 1; + INIT_LIST_HEAD(&addr->list); + list_add_tail(&addr->list, addrlist); + } + } + + rcu_read_unlock(); +} + +/* Extract our IP addresses from the system and stash them in the + * protocol structure. + */ +static void sctp_get_local_addr_list(void) +{ + struct net_device *dev; + struct list_head *pos; + struct sctp_af *af; + + rcu_read_lock(); + for_each_netdev_rcu(&init_net, dev) { + __list_for_each(pos, &sctp_address_families) { + af = list_entry(pos, struct sctp_af, list); + af->copy_addrlist(&sctp_local_addr_list, dev); + } + } + rcu_read_unlock(); +} + +/* Free the existing local addresses. */ +static void sctp_free_local_addr_list(void) +{ + struct sctp_sockaddr_entry *addr; + struct list_head *pos, *temp; + + list_for_each_safe(pos, temp, &sctp_local_addr_list) { + addr = list_entry(pos, struct sctp_sockaddr_entry, list); + list_del(pos); + kfree(addr); + } +} + +/* Copy the local addresses which are valid for 'scope' into 'bp'. */ +int sctp_copy_local_addr_list(struct sctp_bind_addr *bp, sctp_scope_t scope, + gfp_t gfp, int copy_flags) +{ + struct sctp_sockaddr_entry *addr; + int error = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(addr, &sctp_local_addr_list, list) { + if (!addr->valid) + continue; + if (sctp_in_scope(&addr->a, scope)) { + /* Now that the address is in scope, check to see if + * the address type is really supported by the local + * sock as well as the remote peer. 
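+			 * (An IPv4 address needs SCTP_ADDR4_PEERSUPP in
+			 * copy_flags; an IPv6 address needs both
+			 * SCTP_ADDR6_ALLOWED and SCTP_ADDR6_PEERSUPP.)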
+ */ + if ((((AF_INET == addr->a.sa.sa_family) && + (copy_flags & SCTP_ADDR4_PEERSUPP))) || + (((AF_INET6 == addr->a.sa.sa_family) && + (copy_flags & SCTP_ADDR6_ALLOWED) && + (copy_flags & SCTP_ADDR6_PEERSUPP)))) { + error = sctp_add_bind_addr(bp, &addr->a, + SCTP_ADDR_SRC, GFP_ATOMIC); + if (error) + goto end_copy; + } + } + } + +end_copy: + rcu_read_unlock(); + return error; +} + +/* Initialize a sctp_addr from in incoming skb. */ +static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb, + int is_saddr) +{ + void *from; + __be16 *port; + struct sctphdr *sh; + + port = &addr->v4.sin_port; + addr->v4.sin_family = AF_INET; + + sh = sctp_hdr(skb); + if (is_saddr) { + *port = sh->source; + from = &ip_hdr(skb)->saddr; + } else { + *port = sh->dest; + from = &ip_hdr(skb)->daddr; + } + memcpy(&addr->v4.sin_addr.s_addr, from, sizeof(struct in_addr)); +} + +/* Initialize an sctp_addr from a socket. */ +static void sctp_v4_from_sk(union sctp_addr *addr, struct sock *sk) +{ + addr->v4.sin_family = AF_INET; + addr->v4.sin_port = 0; + addr->v4.sin_addr.s_addr = inet_sk(sk)->inet_rcv_saddr; +} + +/* Initialize sk->sk_rcv_saddr from sctp_addr. */ +static void sctp_v4_to_sk_saddr(union sctp_addr *addr, struct sock *sk) +{ + inet_sk(sk)->inet_rcv_saddr = addr->v4.sin_addr.s_addr; +} + +/* Initialize sk->sk_daddr from sctp_addr. */ +static void sctp_v4_to_sk_daddr(union sctp_addr *addr, struct sock *sk) +{ + inet_sk(sk)->inet_daddr = addr->v4.sin_addr.s_addr; +} + +/* Initialize a sctp_addr from an address parameter. */ +static void sctp_v4_from_addr_param(union sctp_addr *addr, + union sctp_addr_param *param, + __be16 port, int iif) +{ + addr->v4.sin_family = AF_INET; + addr->v4.sin_port = port; + addr->v4.sin_addr.s_addr = param->v4.addr.s_addr; +} + +/* Initialize an address parameter from a sctp_addr and return the length + * of the address parameter. + */ +static int sctp_v4_to_addr_param(const union sctp_addr *addr, + union sctp_addr_param *param) +{ + int length = sizeof(sctp_ipv4addr_param_t); + + param->v4.param_hdr.type = SCTP_PARAM_IPV4_ADDRESS; + param->v4.param_hdr.length = htons(length); + param->v4.addr.s_addr = addr->v4.sin_addr.s_addr; + + return length; +} + +/* Initialize a sctp_addr from a dst_entry. */ +static void sctp_v4_dst_saddr(union sctp_addr *saddr, struct flowi4 *fl4, + __be16 port) +{ + saddr->v4.sin_family = AF_INET; + saddr->v4.sin_port = port; + saddr->v4.sin_addr.s_addr = fl4->saddr; +} + +/* Compare two addresses exactly. */ +static int sctp_v4_cmp_addr(const union sctp_addr *addr1, + const union sctp_addr *addr2) +{ + if (addr1->sa.sa_family != addr2->sa.sa_family) + return 0; + if (addr1->v4.sin_port != addr2->v4.sin_port) + return 0; + if (addr1->v4.sin_addr.s_addr != addr2->v4.sin_addr.s_addr) + return 0; + + return 1; +} + +/* Initialize addr struct to INADDR_ANY. */ +static void sctp_v4_inaddr_any(union sctp_addr *addr, __be16 port) +{ + addr->v4.sin_family = AF_INET; + addr->v4.sin_addr.s_addr = htonl(INADDR_ANY); + addr->v4.sin_port = port; +} + +/* Is this a wildcard address? */ +static int sctp_v4_is_any(const union sctp_addr *addr) +{ + return htonl(INADDR_ANY) == addr->v4.sin_addr.s_addr; +} + +/* This function checks if the address is a valid address to be used for + * SCTP binding. + * + * Output: + * Return 0 - If the address is a non-unicast or an illegal address. + * Return 1 - If the address is a unicast. 
+ */ +static int sctp_v4_addr_valid(union sctp_addr *addr, + struct sctp_sock *sp, + const struct sk_buff *skb) +{ + /* IPv4 addresses not allowed */ + if (sp && ipv6_only_sock(sctp_opt2sk(sp))) + return 0; + + /* Is this a non-unicast address or a unusable SCTP address? */ + if (IS_IPV4_UNUSABLE_ADDRESS(addr->v4.sin_addr.s_addr)) + return 0; + + /* Is this a broadcast address? */ + if (skb && skb_rtable(skb)->rt_flags & RTCF_BROADCAST) + return 0; + + return 1; +} + +/* Should this be available for binding? */ +static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp) +{ + int ret = inet_addr_type(&init_net, addr->v4.sin_addr.s_addr); + + + if (addr->v4.sin_addr.s_addr != htonl(INADDR_ANY) && + ret != RTN_LOCAL && + !sp->inet.freebind && + !sysctl_ip_nonlocal_bind) + return 0; + + if (ipv6_only_sock(sctp_opt2sk(sp))) + return 0; + + return 1; +} + +/* Checking the loopback, private and other address scopes as defined in + * RFC 1918. The IPv4 scoping is based on the draft for SCTP IPv4 + * scoping . + * + * Level 0 - unusable SCTP addresses + * Level 1 - loopback address + * Level 2 - link-local addresses + * Level 3 - private addresses. + * Level 4 - global addresses + * For INIT and INIT-ACK address list, let L be the level of + * of requested destination address, sender and receiver + * SHOULD include all of its addresses with level greater + * than or equal to L. + * + * IPv4 scoping can be controlled through sysctl option + * net.sctp.addr_scope_policy + */ +static sctp_scope_t sctp_v4_scope(union sctp_addr *addr) +{ + sctp_scope_t retval; + + /* Check for unusable SCTP addresses. */ + if (IS_IPV4_UNUSABLE_ADDRESS(addr->v4.sin_addr.s_addr)) { + retval = SCTP_SCOPE_UNUSABLE; + } else if (ipv4_is_loopback(addr->v4.sin_addr.s_addr)) { + retval = SCTP_SCOPE_LOOPBACK; + } else if (ipv4_is_linklocal_169(addr->v4.sin_addr.s_addr)) { + retval = SCTP_SCOPE_LINK; + } else if (ipv4_is_private_10(addr->v4.sin_addr.s_addr) || + ipv4_is_private_172(addr->v4.sin_addr.s_addr) || + ipv4_is_private_192(addr->v4.sin_addr.s_addr)) { + retval = SCTP_SCOPE_PRIVATE; + } else { + retval = SCTP_SCOPE_GLOBAL; + } + + return retval; +} + +/* Returns a valid dst cache entry for the given source and destination ip + * addresses. If an association is passed, trys to get a dst entry with a + * source address that matches an address in the bind address list. + */ +static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, + struct flowi *fl, struct sock *sk) +{ + struct sctp_association *asoc = t->asoc; + struct rtable *rt; + struct flowi4 *fl4 = &fl->u.ip4; + struct sctp_bind_addr *bp; + struct sctp_sockaddr_entry *laddr; + struct dst_entry *dst = NULL; + union sctp_addr *daddr = &t->ipaddr; + union sctp_addr dst_saddr; + + memset(fl4, 0x0, sizeof(struct flowi4)); + fl4->daddr = daddr->v4.sin_addr.s_addr; + fl4->fl4_dport = daddr->v4.sin_port; + fl4->flowi4_proto = IPPROTO_SCTP; + if (asoc) { + fl4->flowi4_tos = RT_CONN_FLAGS(asoc->base.sk); + fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if; + fl4->fl4_sport = htons(asoc->base.bind_addr.port); + } + if (saddr) { + fl4->saddr = saddr->v4.sin_addr.s_addr; + fl4->fl4_sport = saddr->v4.sin_port; + } + + SCTP_DEBUG_PRINTK("%s: DST:%pI4, SRC:%pI4 - ", + __func__, &fl4->daddr, &fl4->saddr); + + rt = ip_route_output_key(&init_net, fl4); + if (!IS_ERR(rt)) + dst = &rt->dst; + + /* If there is no association or if a source address is passed, no + * more validation is required. 
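[Editorial sketch, not part of the patch] The scope rules spelled out in the comment above reduce to a handful of prefix tests. As a point of comparison, here is a minimal userspace sketch of the same RFC 1918 classification; the helper below is illustrative, skips the "unusable" level (INADDR_ANY, broadcast) that IS_IPV4_UNUSABLE_ADDRESS() covers, and does not use the kernel's ipv4_is_*() predicates.

#include <stdio.h>
#include <stdint.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

enum scope { SCOPE_LOOPBACK, SCOPE_LINK, SCOPE_PRIVATE, SCOPE_GLOBAL };

/* Classify a host-order IPv4 address along the levels described above.
 * The prefix tests mirror 127/8, 169.254/16, 10/8, 172.16/12, 192.168/16. */
static enum scope classify_ipv4(uint32_t a)
{
    if ((a >> 24) == 127)
        return SCOPE_LOOPBACK;
    if ((a >> 16) == ((169 << 8) | 254))
        return SCOPE_LINK;
    if ((a >> 24) == 10 ||
        (a >> 20) == ((172 << 4) | 1) ||   /* 172.16.0.0/12 */
        (a >> 16) == ((192 << 8) | 168))
        return SCOPE_PRIVATE;
    return SCOPE_GLOBAL;
}

int main(void)
{
    const char *names[] = { "loopback", "link", "private", "global" };
    const char *samples[] = { "127.0.0.1", "169.254.1.1", "172.20.0.5", "8.8.8.8" };

    for (int i = 0; i < 4; i++) {
        struct in_addr in;
        inet_pton(AF_INET, samples[i], &in);
        printf("%-12s -> %s\n", samples[i], names[classify_ipv4(ntohl(in.s_addr))]);
    }
    return 0;
}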
+ */ + if (!asoc || saddr) + goto out; + + bp = &asoc->base.bind_addr; + + if (dst) { + /* Walk through the bind address list and look for a bind + * address that matches the source address of the returned dst. + */ + sctp_v4_dst_saddr(&dst_saddr, fl4, htons(bp->port)); + rcu_read_lock(); + list_for_each_entry_rcu(laddr, &bp->address_list, list) { + if (!laddr->valid || (laddr->state != SCTP_ADDR_SRC)) + continue; + if (sctp_v4_cmp_addr(&dst_saddr, &laddr->a)) + goto out_unlock; + } + rcu_read_unlock(); + + /* None of the bound addresses match the source address of the + * dst. So release it. + */ + dst_release(dst); + dst = NULL; + } + + /* Walk through the bind address list and try to get a dst that + * matches a bind address as the source address. + */ + rcu_read_lock(); + list_for_each_entry_rcu(laddr, &bp->address_list, list) { + if (!laddr->valid) + continue; + if ((laddr->state == SCTP_ADDR_SRC) && + (AF_INET == laddr->a.sa.sa_family)) { + fl4->saddr = laddr->a.v4.sin_addr.s_addr; + fl4->fl4_sport = laddr->a.v4.sin_port; + rt = ip_route_output_key(&init_net, fl4); + if (!IS_ERR(rt)) { + dst = &rt->dst; + goto out_unlock; + } + } + } + +out_unlock: + rcu_read_unlock(); +out: + t->dst = dst; + if (dst) + SCTP_DEBUG_PRINTK("rt_dst:%pI4, rt_src:%pI4\n", + &fl4->daddr, &fl4->saddr); + else + SCTP_DEBUG_PRINTK("NO ROUTE\n"); +} + +/* For v4, the source address is cached in the route entry(dst). So no need + * to cache it separately and hence this is an empty routine. + */ +static void sctp_v4_get_saddr(struct sctp_sock *sk, + struct sctp_transport *t, + struct flowi *fl) +{ + union sctp_addr *saddr = &t->saddr; + struct rtable *rt = (struct rtable *)t->dst; + + if (rt) { + saddr->v4.sin_family = AF_INET; + saddr->v4.sin_addr.s_addr = fl->u.ip4.saddr; + } +} + +/* What interface did this skb arrive on? */ +static int sctp_v4_skb_iif(const struct sk_buff *skb) +{ + return skb_rtable(skb)->rt_iif; +} + +/* Was this packet marked by Explicit Congestion Notification? */ +static int sctp_v4_is_ce(const struct sk_buff *skb) +{ + return INET_ECN_is_ce(ip_hdr(skb)->tos); +} + +/* Create and initialize a new sk for the socket returned by accept(). */ +static struct sock *sctp_v4_create_accept_sk(struct sock *sk, + struct sctp_association *asoc) +{ + struct sock *newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL, + sk->sk_prot); + struct inet_sock *newinet; + + if (!newsk) + goto out; + + sock_init_data(NULL, newsk); + + sctp_copy_sock(newsk, sk, asoc); + sock_reset_flag(newsk, SOCK_ZAPPED); + + newinet = inet_sk(newsk); + + newinet->inet_daddr = asoc->peer.primary_addr.v4.sin_addr.s_addr; + + sk_refcnt_debug_inc(newsk); + + if (newsk->sk_prot->init(newsk)) { + sk_common_release(newsk); + newsk = NULL; + } + +out: + return newsk; +} + +/* Map address, empty for v4 family */ +static void sctp_v4_addr_v4map(struct sctp_sock *sp, union sctp_addr *addr) +{ + /* Empty */ +} + +/* Dump the v4 addr to the seq file. */ +static void sctp_v4_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr) +{ + seq_printf(seq, "%pI4 ", &addr->v4.sin_addr); +} + +static void sctp_v4_ecn_capable(struct sock *sk) +{ + INET_ECN_xmit(sk); +} + +/* Event handler for inet address addition/deletion events. + * The sctp_local_addr_list needs to be protocted by a spin lock since + * multiple notifiers (say IPv4 and IPv6) may be running at the same + * time and thus corrupt the list. + * The reader side is protected with RCU. 
+ */ +static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev, + void *ptr) +{ + struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; + struct sctp_sockaddr_entry *addr = NULL; + struct sctp_sockaddr_entry *temp; + int found = 0; + + if (!net_eq(dev_net(ifa->ifa_dev->dev), &init_net)) + return NOTIFY_DONE; + + switch (ev) { + case NETDEV_UP: + addr = kmalloc(sizeof(struct sctp_sockaddr_entry), GFP_ATOMIC); + if (addr) { + addr->a.v4.sin_family = AF_INET; + addr->a.v4.sin_port = 0; + addr->a.v4.sin_addr.s_addr = ifa->ifa_local; + addr->valid = 1; + spin_lock_bh(&sctp_local_addr_lock); + list_add_tail_rcu(&addr->list, &sctp_local_addr_list); + spin_unlock_bh(&sctp_local_addr_lock); + } + break; + case NETDEV_DOWN: + spin_lock_bh(&sctp_local_addr_lock); + list_for_each_entry_safe(addr, temp, + &sctp_local_addr_list, list) { + if (addr->a.sa.sa_family == AF_INET && + addr->a.v4.sin_addr.s_addr == + ifa->ifa_local) { + found = 1; + addr->valid = 0; + list_del_rcu(&addr->list); + break; + } + } + spin_unlock_bh(&sctp_local_addr_lock); + if (found) + kfree_rcu(addr, rcu); + break; + } + + return NOTIFY_DONE; +} + +/* + * Initialize the control inode/socket with a control endpoint data + * structure. This endpoint is reserved exclusively for the OOTB processing. + */ +static int sctp_ctl_sock_init(void) +{ + int err; + sa_family_t family = PF_INET; + + if (sctp_get_pf_specific(PF_INET6)) + family = PF_INET6; + + err = inet_ctl_sock_create(&sctp_ctl_sock, family, + SOCK_SEQPACKET, IPPROTO_SCTP, &init_net); + + /* If IPv6 socket could not be created, try the IPv4 socket */ + if (err < 0 && family == PF_INET6) + err = inet_ctl_sock_create(&sctp_ctl_sock, AF_INET, + SOCK_SEQPACKET, IPPROTO_SCTP, + &init_net); + + if (err < 0) { + pr_err("Failed to create the SCTP control socket\n"); + return err; + } + return 0; +} + +/* Register address family specific functions. */ +int sctp_register_af(struct sctp_af *af) +{ + switch (af->sa_family) { + case AF_INET: + if (sctp_af_v4_specific) + return 0; + sctp_af_v4_specific = af; + break; + case AF_INET6: + if (sctp_af_v6_specific) + return 0; + sctp_af_v6_specific = af; + break; + default: + return 0; + } + + INIT_LIST_HEAD(&af->list); + list_add_tail(&af->list, &sctp_address_families); + return 1; +} + +/* Get the table of functions for manipulating a particular address + * family. + */ +struct sctp_af *sctp_get_af_specific(sa_family_t family) +{ + switch (family) { + case AF_INET: + return sctp_af_v4_specific; + case AF_INET6: + return sctp_af_v6_specific; + default: + return NULL; + } +} + +/* Common code to initialize a AF_INET msg_name. */ +static void sctp_inet_msgname(char *msgname, int *addr_len) +{ + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)msgname; + *addr_len = sizeof(struct sockaddr_in); + sin->sin_family = AF_INET; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); +} + +/* Copy the primary address of the peer primary address as the msg_name. */ +static void sctp_inet_event_msgname(struct sctp_ulpevent *event, char *msgname, + int *addr_len) +{ + struct sockaddr_in *sin, *sinfrom; + + if (msgname) { + struct sctp_association *asoc; + + asoc = event->asoc; + sctp_inet_msgname(msgname, addr_len); + sin = (struct sockaddr_in *)msgname; + sinfrom = &asoc->peer.primary_addr.v4; + sin->sin_port = htons(asoc->peer.port); + sin->sin_addr.s_addr = sinfrom->sin_addr.s_addr; + } +} + +/* Initialize and copy out a msgname from an inbound skb. 
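[Editorial sketch, not part of the patch] In the NETDEV_DOWN branch above, the entry is first marked invalid and unlinked, and only then handed to kfree_rcu(), so RCU readers still holding the pointer see a dead entry rather than freed memory. The single-threaded sketch below models only the list bookkeeping; the deferred free and the grace period are noted in comments but not reproduced.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct addr_entry {
    uint32_t addr;
    int valid;                  /* readers skip entries with valid == 0 */
    struct addr_entry *next;
};

static struct addr_entry *local_addrs;

static void addr_up(uint32_t a)
{
    struct addr_entry *e = malloc(sizeof(*e));

    if (!e)
        return;
    e->addr = a;
    e->valid = 1;
    e->next = local_addrs;
    local_addrs = e;
}

static void addr_down(uint32_t a)
{
    struct addr_entry **p = &local_addrs;

    for (; *p; p = &(*p)->next) {
        if ((*p)->addr == a) {
            struct addr_entry *e = *p;

            e->valid = 0;   /* readers still holding e skip it          */
            *p = e->next;   /* unlink...                                */
            free(e);        /* ...the kernel instead defers this to RCU */
            return;
        }
    }
}

int main(void)
{
    addr_up(0x0a000001);        /* 10.0.0.1 */
    addr_up(0x0a000002);        /* 10.0.0.2 */
    addr_down(0x0a000001);

    for (struct addr_entry *e = local_addrs; e; e = e->next)
        printf("local address 0x%08x valid=%d\n", e->addr, e->valid);
    return 0;
}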
*/ +static void sctp_inet_skb_msgname(struct sk_buff *skb, char *msgname, int *len) +{ + if (msgname) { + struct sctphdr *sh = sctp_hdr(skb); + struct sockaddr_in *sin = (struct sockaddr_in *)msgname; + + sctp_inet_msgname(msgname, len); + sin->sin_port = sh->source; + sin->sin_addr.s_addr = ip_hdr(skb)->saddr; + } +} + +/* Do we support this AF? */ +static int sctp_inet_af_supported(sa_family_t family, struct sctp_sock *sp) +{ + /* PF_INET only supports AF_INET addresses. */ + return AF_INET == family; +} + +/* Address matching with wildcards allowed. */ +static int sctp_inet_cmp_addr(const union sctp_addr *addr1, + const union sctp_addr *addr2, + struct sctp_sock *opt) +{ + /* PF_INET only supports AF_INET addresses. */ + if (addr1->sa.sa_family != addr2->sa.sa_family) + return 0; + if (htonl(INADDR_ANY) == addr1->v4.sin_addr.s_addr || + htonl(INADDR_ANY) == addr2->v4.sin_addr.s_addr) + return 1; + if (addr1->v4.sin_addr.s_addr == addr2->v4.sin_addr.s_addr) + return 1; + + return 0; +} + +/* Verify that provided sockaddr looks bindable. Common verification has + * already been taken care of. + */ +static int sctp_inet_bind_verify(struct sctp_sock *opt, union sctp_addr *addr) +{ + return sctp_v4_available(addr, opt); +} + +/* Verify that sockaddr looks sendable. Common verification has already + * been taken care of. + */ +static int sctp_inet_send_verify(struct sctp_sock *opt, union sctp_addr *addr) +{ + return 1; +} + +/* Fill in Supported Address Type information for INIT and INIT-ACK + * chunks. Returns number of addresses supported. + */ +static int sctp_inet_supported_addrs(const struct sctp_sock *opt, + __be16 *types) +{ + types[0] = SCTP_PARAM_IPV4_ADDRESS; + return 1; +} + +/* Wrapper routine that calls the ip transmit routine. */ +static inline int sctp_v4_xmit(struct sk_buff *skb, + struct sctp_transport *transport) +{ + struct inet_sock *inet = inet_sk(skb->sk); + + SCTP_DEBUG_PRINTK("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", + __func__, skb, skb->len, + &transport->fl.u.ip4.saddr, + &transport->fl.u.ip4.daddr); + + inet->pmtudisc = transport->param_flags & SPP_PMTUD_ENABLE ? + IP_PMTUDISC_DO : IP_PMTUDISC_DONT; + + SCTP_INC_STATS(SCTP_MIB_OUTSCTPPACKS); + return ip_queue_xmit(skb, &transport->fl); +} + +static struct sctp_af sctp_af_inet; + +static struct sctp_pf sctp_pf_inet = { + .event_msgname = sctp_inet_event_msgname, + .skb_msgname = sctp_inet_skb_msgname, + .af_supported = sctp_inet_af_supported, + .cmp_addr = sctp_inet_cmp_addr, + .bind_verify = sctp_inet_bind_verify, + .send_verify = sctp_inet_send_verify, + .supported_addrs = sctp_inet_supported_addrs, + .create_accept_sk = sctp_v4_create_accept_sk, + .addr_v4map = sctp_v4_addr_v4map, + .af = &sctp_af_inet +}; + +/* Notifier for inetaddr addition/deletion events. */ +static struct notifier_block sctp_inetaddr_notifier = { + .notifier_call = sctp_inetaddr_event, +}; + +/* Socket operations. */ +static const struct proto_ops inet_seqpacket_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = inet_release, /* Needs to be wrapped... */ + .bind = inet_bind, + .connect = inet_dgram_connect, + .socketpair = sock_no_socketpair, + .accept = inet_accept, + .getname = inet_getname, /* Semantics are different. */ + .poll = sctp_poll, + .ioctl = inet_ioctl, + .listen = sctp_inet_listen, + .shutdown = inet_shutdown, /* Looks harmless. 
*/ + .setsockopt = sock_common_setsockopt, /* IP_SOL IP_OPTION is a problem */ + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, +#endif +}; + +/* Registration with AF_INET family. */ +static struct inet_protosw sctp_seqpacket_protosw = { + .type = SOCK_SEQPACKET, + .protocol = IPPROTO_SCTP, + .prot = &sctp_prot, + .ops = &inet_seqpacket_ops, + .no_check = 0, + .flags = SCTP_PROTOSW_FLAG +}; +static struct inet_protosw sctp_stream_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_SCTP, + .prot = &sctp_prot, + .ops = &inet_seqpacket_ops, + .no_check = 0, + .flags = SCTP_PROTOSW_FLAG +}; + +/* Register with IP layer. */ +static const struct net_protocol sctp_protocol = { + .handler = sctp_rcv, + .err_handler = sctp_v4_err, + .no_policy = 1, +}; + +/* IPv4 address related functions. */ +static struct sctp_af sctp_af_inet = { + .sa_family = AF_INET, + .sctp_xmit = sctp_v4_xmit, + .setsockopt = ip_setsockopt, + .getsockopt = ip_getsockopt, + .get_dst = sctp_v4_get_dst, + .get_saddr = sctp_v4_get_saddr, + .copy_addrlist = sctp_v4_copy_addrlist, + .from_skb = sctp_v4_from_skb, + .from_sk = sctp_v4_from_sk, + .to_sk_saddr = sctp_v4_to_sk_saddr, + .to_sk_daddr = sctp_v4_to_sk_daddr, + .from_addr_param = sctp_v4_from_addr_param, + .to_addr_param = sctp_v4_to_addr_param, + .cmp_addr = sctp_v4_cmp_addr, + .addr_valid = sctp_v4_addr_valid, + .inaddr_any = sctp_v4_inaddr_any, + .is_any = sctp_v4_is_any, + .available = sctp_v4_available, + .scope = sctp_v4_scope, + .skb_iif = sctp_v4_skb_iif, + .is_ce = sctp_v4_is_ce, + .seq_dump_addr = sctp_v4_seq_dump_addr, + .ecn_capable = sctp_v4_ecn_capable, + .net_header_len = sizeof(struct iphdr), + .sockaddr_len = sizeof(struct sockaddr_in), +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_ip_setsockopt, + .compat_getsockopt = compat_ip_getsockopt, +#endif +}; + +struct sctp_pf *sctp_get_pf_specific(sa_family_t family) { + + switch (family) { + case PF_INET: + return sctp_pf_inet_specific; + case PF_INET6: + return sctp_pf_inet6_specific; + default: + return NULL; + } +} + +/* Register the PF specific function table. */ +int sctp_register_pf(struct sctp_pf *pf, sa_family_t family) +{ + switch (family) { + case PF_INET: + if (sctp_pf_inet_specific) + return 0; + sctp_pf_inet_specific = pf; + break; + case PF_INET6: + if (sctp_pf_inet6_specific) + return 0; + sctp_pf_inet6_specific = pf; + break; + default: + return 0; + } + return 1; +} + +static inline int init_sctp_mibs(void) +{ + return snmp_mib_init((void __percpu **)sctp_statistics, + sizeof(struct sctp_mib), + __alignof__(struct sctp_mib)); +} + +static inline void cleanup_sctp_mibs(void) +{ + snmp_mib_free((void __percpu **)sctp_statistics); +} + +static void sctp_v4_pf_init(void) +{ + /* Initialize the SCTP specific PF functions. */ + sctp_register_pf(&sctp_pf_inet, PF_INET); + sctp_register_af(&sctp_af_inet); +} + +static void sctp_v4_pf_exit(void) +{ + list_del(&sctp_af_inet.list); +} + +static int sctp_v4_protosw_init(void) +{ + int rc; + + rc = proto_register(&sctp_prot, 1); + if (rc) + return rc; + + /* Register SCTP(UDP and TCP style) with socket layer. 
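[Editorial sketch, not part of the patch] sctp_af_inet and sctp_pf_inet above are per-family operations tables: registration stores one pointer per address family and lookup simply returns it. A reduced, self-contained sketch of that dispatch pattern follows; struct af_ops and its members are invented for the example and are not kernel API.

#include <stdio.h>
#include <sys/socket.h>         /* AF_INET, AF_INET6, sa_family_t */

/* A cut-down "address family operations" table, one instance per family. */
struct af_ops {
    sa_family_t family;
    int (*header_len)(void);
};

static int v4_header_len(void) { return 20; }   /* sizeof(struct iphdr)   */
static int v6_header_len(void) { return 40; }   /* sizeof(struct ipv6hdr) */

static const struct af_ops af_v4 = { AF_INET,  v4_header_len };
static const struct af_ops af_v6 = { AF_INET6, v6_header_len };

/* Same shape as sctp_get_af_specific(): return the table for a family, or NULL. */
static const struct af_ops *get_af_specific(sa_family_t family)
{
    switch (family) {
    case AF_INET:  return &af_v4;
    case AF_INET6: return &af_v6;
    default:       return NULL;
    }
}

int main(void)
{
    const struct af_ops *af = get_af_specific(AF_INET);

    if (af)
        printf("AF %d network header length: %d bytes\n",
               af->family, af->header_len());
    return 0;
}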
*/ + inet_register_protosw(&sctp_seqpacket_protosw); + inet_register_protosw(&sctp_stream_protosw); + + return 0; +} + +static void sctp_v4_protosw_exit(void) +{ + inet_unregister_protosw(&sctp_stream_protosw); + inet_unregister_protosw(&sctp_seqpacket_protosw); + proto_unregister(&sctp_prot); +} + +static int sctp_v4_add_protocol(void) +{ + /* Register notifier for inet address additions/deletions. */ + register_inetaddr_notifier(&sctp_inetaddr_notifier); + + /* Register SCTP with inet layer. */ + if (inet_add_protocol(&sctp_protocol, IPPROTO_SCTP) < 0) + return -EAGAIN; + + return 0; +} + +static void sctp_v4_del_protocol(void) +{ + inet_del_protocol(&sctp_protocol, IPPROTO_SCTP); + unregister_inetaddr_notifier(&sctp_inetaddr_notifier); +} + +/* Initialize the universe into something sensible. */ +SCTP_STATIC __init int sctp_init(void) +{ + int i; + int status = -EINVAL; + unsigned long goal; + unsigned long limit; + int max_share; + int order; + + /* SCTP_DEBUG sanity check. */ + if (!sctp_sanity_check()) + goto out; + + /* Allocate bind_bucket and chunk caches. */ + status = -ENOBUFS; + sctp_bucket_cachep = kmem_cache_create("sctp_bind_bucket", + sizeof(struct sctp_bind_bucket), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (!sctp_bucket_cachep) + goto out; + + sctp_chunk_cachep = kmem_cache_create("sctp_chunk", + sizeof(struct sctp_chunk), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (!sctp_chunk_cachep) + goto err_chunk_cachep; + + /* Allocate and initialise sctp mibs. */ + status = init_sctp_mibs(); + if (status) + goto err_init_mibs; + + /* Initialize proc fs directory. */ + status = sctp_proc_init(); + if (status) + goto err_init_proc; + + /* Initialize object count debugging. */ + sctp_dbg_objcnt_init(); + + /* + * 14. Suggested SCTP Protocol Parameter Values + */ + /* The following protocol parameters are RECOMMENDED: */ + /* RTO.Initial - 3 seconds */ + sctp_rto_initial = SCTP_RTO_INITIAL; + /* RTO.Min - 1 second */ + sctp_rto_min = SCTP_RTO_MIN; + /* RTO.Max - 60 seconds */ + sctp_rto_max = SCTP_RTO_MAX; + /* RTO.Alpha - 1/8 */ + sctp_rto_alpha = SCTP_RTO_ALPHA; + /* RTO.Beta - 1/4 */ + sctp_rto_beta = SCTP_RTO_BETA; + + /* Valid.Cookie.Life - 60 seconds */ + sctp_valid_cookie_life = SCTP_DEFAULT_COOKIE_LIFE; + + /* Whether Cookie Preservative is enabled(1) or not(0) */ + sctp_cookie_preserve_enable = 1; + + /* Max.Burst - 4 */ + sctp_max_burst = SCTP_DEFAULT_MAX_BURST; + + /* Association.Max.Retrans - 10 attempts + * Path.Max.Retrans - 5 attempts (per destination address) + * Max.Init.Retransmits - 8 attempts + */ + sctp_max_retrans_association = 10; + sctp_max_retrans_path = 5; + sctp_max_retrans_init = 8; + + /* Sendbuffer growth - do per-socket accounting */ + sctp_sndbuf_policy = 0; + + /* Rcvbuffer growth - do per-socket accounting */ + sctp_rcvbuf_policy = 0; + + /* HB.interval - 30 seconds */ + sctp_hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT; + + /* delayed SACK timeout */ + sctp_sack_timeout = SCTP_DEFAULT_TIMEOUT_SACK; + + /* Implementation specific variables. */ + + /* Initialize default stream count setup information. */ + sctp_max_instreams = SCTP_DEFAULT_INSTREAMS; + sctp_max_outstreams = SCTP_DEFAULT_OUTSTREAMS; + + /* Initialize maximum autoclose timeout. */ + sctp_max_autoclose = INT_MAX / HZ; + + /* Initialize handle used for association ids. 
*/ + idr_init(&sctp_assocs_id); + + limit = nr_free_buffer_pages() / 8; + limit = max(limit, 128UL); + sysctl_sctp_mem[0] = limit / 4 * 3; + sysctl_sctp_mem[1] = limit; + sysctl_sctp_mem[2] = sysctl_sctp_mem[0] * 2; + + /* Set per-socket limits to no more than 1/128 the pressure threshold*/ + limit = (sysctl_sctp_mem[1]) << (PAGE_SHIFT - 7); + max_share = min(4UL*1024*1024, limit); + + sysctl_sctp_rmem[0] = SK_MEM_QUANTUM; /* give each asoc 1 page min */ + sysctl_sctp_rmem[1] = (1500 *(sizeof(struct sk_buff) + 1)); + sysctl_sctp_rmem[2] = max(sysctl_sctp_rmem[1], max_share); + + sysctl_sctp_wmem[0] = SK_MEM_QUANTUM; + sysctl_sctp_wmem[1] = 16*1024; + sysctl_sctp_wmem[2] = max(64*1024, max_share); + + /* Size and allocate the association hash table. + * The methodology is similar to that of the tcp hash tables. + */ + if (totalram_pages >= (128 * 1024)) + goal = totalram_pages >> (22 - PAGE_SHIFT); + else + goal = totalram_pages >> (24 - PAGE_SHIFT); + + for (order = 0; (1UL << order) < goal; order++) + ; + + do { + sctp_assoc_hashsize = (1UL << order) * PAGE_SIZE / + sizeof(struct sctp_hashbucket); + if ((sctp_assoc_hashsize > (64 * 1024)) && order > 0) + continue; + sctp_assoc_hashtable = (struct sctp_hashbucket *) + __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, order); + } while (!sctp_assoc_hashtable && --order > 0); + if (!sctp_assoc_hashtable) { + pr_err("Failed association hash alloc\n"); + status = -ENOMEM; + goto err_ahash_alloc; + } + for (i = 0; i < sctp_assoc_hashsize; i++) { + rwlock_init(&sctp_assoc_hashtable[i].lock); + INIT_HLIST_HEAD(&sctp_assoc_hashtable[i].chain); + } + + /* Allocate and initialize the endpoint hash table. */ + sctp_ep_hashsize = 64; + sctp_ep_hashtable = (struct sctp_hashbucket *) + kmalloc(64 * sizeof(struct sctp_hashbucket), GFP_KERNEL); + if (!sctp_ep_hashtable) { + pr_err("Failed endpoint_hash alloc\n"); + status = -ENOMEM; + goto err_ehash_alloc; + } + for (i = 0; i < sctp_ep_hashsize; i++) { + rwlock_init(&sctp_ep_hashtable[i].lock); + INIT_HLIST_HEAD(&sctp_ep_hashtable[i].chain); + } + + /* Allocate and initialize the SCTP port hash table. */ + do { + sctp_port_hashsize = (1UL << order) * PAGE_SIZE / + sizeof(struct sctp_bind_hashbucket); + if ((sctp_port_hashsize > (64 * 1024)) && order > 0) + continue; + sctp_port_hashtable = (struct sctp_bind_hashbucket *) + __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, order); + } while (!sctp_port_hashtable && --order > 0); + if (!sctp_port_hashtable) { + pr_err("Failed bind hash alloc\n"); + status = -ENOMEM; + goto err_bhash_alloc; + } + for (i = 0; i < sctp_port_hashsize; i++) { + spin_lock_init(&sctp_port_hashtable[i].lock); + INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain); + } + + pr_info("Hash tables configured (established %d bind %d)\n", + sctp_assoc_hashsize, sctp_port_hashsize); + + /* Disable ADDIP by default. */ + sctp_addip_enable = 0; + sctp_addip_noauth = 0; + + /* Enable PR-SCTP by default. */ + sctp_prsctp_enable = 1; + + /* Disable AUTH by default. */ + sctp_auth_enable = 0; + + /* Set SCOPE policy to enabled */ + sctp_scope_policy = SCTP_SCOPE_POLICY_ENABLE; + + /* Set the default rwnd update threshold */ + sctp_rwnd_upd_shift = SCTP_DEFAULT_RWND_SHIFT; + + sctp_sysctl_register(); + + INIT_LIST_HEAD(&sctp_address_families); + sctp_v4_pf_init(); + sctp_v6_pf_init(); + + /* Initialize the local address list. 
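[Editorial sketch, not part of the patch] The hash-table sizing in sctp_init() above turns a RAM-derived goal into an allocation order (a power-of-two number of pages) and then into a bucket count. A standalone sketch of that arithmetic, assuming a 4 KiB page and a 16-byte stand-in bucket, and ignoring the 64K-entry cap and the retry-on-allocation-failure loop:

#include <stdio.h>

#define PAGE_SHIFT  12                      /* assume 4 KiB pages */
#define PAGE_SIZE   (1UL << PAGE_SHIFT)
#define BUCKET_SIZE 16UL                    /* stand-in for sizeof(struct sctp_hashbucket) */

int main(void)
{
    unsigned long totalram_pages = 512UL * 1024;    /* pretend 2 GiB of RAM */
    unsigned long goal, hashsize;
    int order;

    /* Larger machines: one page of hash per 4 MiB of RAM; smaller: per 16 MiB. */
    if (totalram_pages >= 128 * 1024)
        goal = totalram_pages >> (22 - PAGE_SHIFT);
    else
        goal = totalram_pages >> (24 - PAGE_SHIFT);

    /* Smallest power-of-two page count that reaches the goal. */
    for (order = 0; (1UL << order) < goal; order++)
        ;

    hashsize = (1UL << order) * PAGE_SIZE / BUCKET_SIZE;

    printf("goal=%lu pages, order=%d, buckets=%lu\n", goal, order, hashsize);
    return 0;
}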
*/ + INIT_LIST_HEAD(&sctp_local_addr_list); + spin_lock_init(&sctp_local_addr_lock); + sctp_get_local_addr_list(); + + status = sctp_v4_protosw_init(); + + if (status) + goto err_protosw_init; + + status = sctp_v6_protosw_init(); + if (status) + goto err_v6_protosw_init; + + /* Initialize the control inode/socket for handling OOTB packets. */ + if ((status = sctp_ctl_sock_init())) { + pr_err("Failed to initialize the SCTP control sock\n"); + goto err_ctl_sock_init; + } + + status = sctp_v4_add_protocol(); + if (status) + goto err_add_protocol; + + /* Register SCTP with inet6 layer. */ + status = sctp_v6_add_protocol(); + if (status) + goto err_v6_add_protocol; + + status = 0; +out: + return status; +err_v6_add_protocol: + sctp_v4_del_protocol(); +err_add_protocol: + inet_ctl_sock_destroy(sctp_ctl_sock); +err_ctl_sock_init: + sctp_v6_protosw_exit(); +err_v6_protosw_init: + sctp_v4_protosw_exit(); +err_protosw_init: + sctp_free_local_addr_list(); + sctp_v4_pf_exit(); + sctp_v6_pf_exit(); + sctp_sysctl_unregister(); + free_pages((unsigned long)sctp_port_hashtable, + get_order(sctp_port_hashsize * + sizeof(struct sctp_bind_hashbucket))); +err_bhash_alloc: + kfree(sctp_ep_hashtable); +err_ehash_alloc: + free_pages((unsigned long)sctp_assoc_hashtable, + get_order(sctp_assoc_hashsize * + sizeof(struct sctp_hashbucket))); +err_ahash_alloc: + sctp_dbg_objcnt_exit(); + sctp_proc_exit(); +err_init_proc: + cleanup_sctp_mibs(); +err_init_mibs: + kmem_cache_destroy(sctp_chunk_cachep); +err_chunk_cachep: + kmem_cache_destroy(sctp_bucket_cachep); + goto out; +} + +/* Exit handler for the SCTP protocol. */ +SCTP_STATIC __exit void sctp_exit(void) +{ + /* BUG. This should probably do something useful like clean + * up all the remaining associations and all that memory. + */ + + /* Unregister with inet6/inet layers. */ + sctp_v6_del_protocol(); + sctp_v4_del_protocol(); + + /* Free the control endpoint. */ + inet_ctl_sock_destroy(sctp_ctl_sock); + + /* Free protosw registrations */ + sctp_v6_protosw_exit(); + sctp_v4_protosw_exit(); + + /* Free the local address list. */ + sctp_free_local_addr_list(); + + /* Unregister with socket layer. */ + sctp_v6_pf_exit(); + sctp_v4_pf_exit(); + + sctp_sysctl_unregister(); + + free_pages((unsigned long)sctp_assoc_hashtable, + get_order(sctp_assoc_hashsize * + sizeof(struct sctp_hashbucket))); + kfree(sctp_ep_hashtable); + free_pages((unsigned long)sctp_port_hashtable, + get_order(sctp_port_hashsize * + sizeof(struct sctp_bind_hashbucket))); + + sctp_dbg_objcnt_exit(); + sctp_proc_exit(); + cleanup_sctp_mibs(); + + rcu_barrier(); /* Wait for completion of call_rcu()'s */ + + kmem_cache_destroy(sctp_chunk_cachep); + kmem_cache_destroy(sctp_bucket_cachep); +} + +module_init(sctp_init); +module_exit(sctp_exit); + +/* + * __stringify doesn't likes enums, so use IPPROTO_SCTP value (132) directly. + */ +MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-132"); +MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-132"); +MODULE_AUTHOR("Linux Kernel SCTP developers "); +MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)"); +module_param_named(no_checksums, sctp_checksum_disable, bool, 0644); +MODULE_PARM_DESC(no_checksums, "Disable checksums computing and verification"); +MODULE_LICENSE("GPL"); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c new file mode 100644 index 00000000..58eb27fe --- /dev/null +++ b/net/sctp/sm_make_chunk.c @@ -0,0 +1,3421 @@ +/* SCTP kernel implementation + * (C) Copyright IBM Corp. 
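[Editorial sketch, not part of the patch] The error path of sctp_init() above unwinds every successfully completed step in reverse order through a ladder of goto labels, so each failure point has exactly one exit route. A compact userspace illustration of the same idiom; the three malloc()'d resources stand in for the caches, proc entries and protocol registrations.

#include <stdio.h>
#include <stdlib.h>

/* Acquire three resources in order; on any failure, release only what was
 * already acquired, in reverse order, using the same goto ladder shape. */
static int setup(void)
{
    char *a, *b, *c;
    int status = -1;

    a = malloc(32);
    if (!a)
        goto out;
    b = malloc(32);
    if (!b)
        goto err_free_a;
    c = malloc(32);
    if (!c)
        goto err_free_b;

    printf("all three resources acquired\n");
    free(c);
    free(b);
    free(a);
    return 0;

err_free_b:
    free(b);
err_free_a:
    free(a);
out:
    return status;
}

int main(void)
{
    return setup() ? EXIT_FAILURE : EXIT_SUCCESS;
}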
2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001-2002 Intel Corp. + * + * This file is part of the SCTP kernel implementation + * + * These functions work with the state functions in sctp_sm_statefuns.c + * to implement the state operations. These functions implement the + * steps which require modifying existing data structures. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * C. Robin + * Jon Grimm + * Xingang Guo + * Dajiang Zhang + * Sridhar Samudrala + * Daisy Chang + * Ardelle Fan + * Kevin Gao + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include /* for get_random_bytes */ +#include +#include + +SCTP_STATIC +struct sctp_chunk *sctp_make_chunk(const struct sctp_association *asoc, + __u8 type, __u8 flags, int paylen); +static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const struct sctp_chunk *init_chunk, + int *cookie_len, + const __u8 *raw_addrs, int addrs_len); +static int sctp_process_param(struct sctp_association *asoc, + union sctp_params param, + const union sctp_addr *peer_addr, + gfp_t gfp); +static void *sctp_addto_param(struct sctp_chunk *chunk, int len, + const void *data); + +/* What was the inbound interface for this chunk? */ +int sctp_chunk_iif(const struct sctp_chunk *chunk) +{ + struct sctp_af *af; + int iif = 0; + + af = sctp_get_af_specific(ipver2af(ip_hdr(chunk->skb)->version)); + if (af) + iif = af->skb_iif(chunk->skb); + + return iif; +} + +/* RFC 2960 3.3.2 Initiation (INIT) (1) + * + * Note 2: The ECN capable field is reserved for future use of + * Explicit Congestion Notification. + */ +static const struct sctp_paramhdr ecap_param = { + SCTP_PARAM_ECN_CAPABLE, + cpu_to_be16(sizeof(struct sctp_paramhdr)), +}; +static const struct sctp_paramhdr prsctp_param = { + SCTP_PARAM_FWD_TSN_SUPPORT, + cpu_to_be16(sizeof(struct sctp_paramhdr)), +}; + +/* A helper to initialize an op error inside a + * provided chunk, as most cause codes will be embedded inside an + * abort chunk. + */ +void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code, + size_t paylen) +{ + sctp_errhdr_t err; + __u16 len; + + /* Cause code constants are now defined in network order. 
*/ + err.cause = cause_code; + len = sizeof(sctp_errhdr_t) + paylen; + err.length = htons(len); + chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(sctp_errhdr_t), &err); +} + +/* A helper to initialize an op error inside a + * provided chunk, as most cause codes will be embedded inside an + * abort chunk. Differs from sctp_init_cause in that it won't oops + * if there isn't enough space in the op error chunk + */ +int sctp_init_cause_fixed(struct sctp_chunk *chunk, __be16 cause_code, + size_t paylen) +{ + sctp_errhdr_t err; + __u16 len; + + /* Cause code constants are now defined in network order. */ + err.cause = cause_code; + len = sizeof(sctp_errhdr_t) + paylen; + err.length = htons(len); + + if (skb_tailroom(chunk->skb) < len) + return -ENOSPC; + chunk->subh.err_hdr = sctp_addto_chunk_fixed(chunk, + sizeof(sctp_errhdr_t), + &err); + return 0; +} +/* 3.3.2 Initiation (INIT) (1) + * + * This chunk is used to initiate a SCTP association between two + * endpoints. The format of the INIT chunk is shown below: + * + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Type = 1 | Chunk Flags | Chunk Length | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Initiate Tag | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Advertised Receiver Window Credit (a_rwnd) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Number of Outbound Streams | Number of Inbound Streams | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Initial TSN | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * \ \ + * / Optional/Variable-Length Parameters / + * \ \ + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * + * The INIT chunk contains the following parameters. Unless otherwise + * noted, each parameter MUST only be included once in the INIT chunk. + * + * Fixed Parameters Status + * ---------------------------------------------- + * Initiate Tag Mandatory + * Advertised Receiver Window Credit Mandatory + * Number of Outbound Streams Mandatory + * Number of Inbound Streams Mandatory + * Initial TSN Mandatory + * + * Variable Parameters Status Type Value + * ------------------------------------------------------------- + * IPv4 Address (Note 1) Optional 5 + * IPv6 Address (Note 1) Optional 6 + * Cookie Preservative Optional 9 + * Reserved for ECN Capable (Note 2) Optional 32768 (0x8000) + * Host Name Address (Note 3) Optional 11 + * Supported Address Types (Note 4) Optional 12 + */ +struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, + const struct sctp_bind_addr *bp, + gfp_t gfp, int vparam_len) +{ + sctp_inithdr_t init; + union sctp_params addrs; + size_t chunksize; + struct sctp_chunk *retval = NULL; + int num_types, addrs_len = 0; + struct sctp_sock *sp; + sctp_supported_addrs_param_t sat; + __be16 types[2]; + sctp_adaptation_ind_param_t aiparam; + sctp_supported_ext_param_t ext_param; + int num_ext = 0; + __u8 extensions[3]; + sctp_paramhdr_t *auth_chunks = NULL, + *auth_hmacs = NULL; + + /* RFC 2960 3.3.2 Initiation (INIT) (1) + * + * Note 1: The INIT chunks can contain multiple addresses that + * can be IPv4 and/or IPv6 in any combination. + */ + retval = NULL; + + /* Convert the provided bind address list to raw format. 
*/ + addrs = sctp_bind_addrs_to_raw(bp, &addrs_len, gfp); + + init.init_tag = htonl(asoc->c.my_vtag); + init.a_rwnd = htonl(asoc->rwnd); + init.num_outbound_streams = htons(asoc->c.sinit_num_ostreams); + init.num_inbound_streams = htons(asoc->c.sinit_max_instreams); + init.initial_tsn = htonl(asoc->c.initial_tsn); + + /* How many address types are needed? */ + sp = sctp_sk(asoc->base.sk); + num_types = sp->pf->supported_addrs(sp, types); + + chunksize = sizeof(init) + addrs_len; + chunksize += WORD_ROUND(SCTP_SAT_LEN(num_types)); + chunksize += sizeof(ecap_param); + + if (sctp_prsctp_enable) + chunksize += sizeof(prsctp_param); + + /* ADDIP: Section 4.2.7: + * An implementation supporting this extension [ADDIP] MUST list + * the ASCONF,the ASCONF-ACK, and the AUTH chunks in its INIT and + * INIT-ACK parameters. + */ + if (sctp_addip_enable) { + extensions[num_ext] = SCTP_CID_ASCONF; + extensions[num_ext+1] = SCTP_CID_ASCONF_ACK; + num_ext += 2; + } + + if (sp->adaptation_ind) + chunksize += sizeof(aiparam); + + chunksize += vparam_len; + + /* Account for AUTH related parameters */ + if (sctp_auth_enable) { + /* Add random parameter length*/ + chunksize += sizeof(asoc->c.auth_random); + + /* Add HMACS parameter length if any were defined */ + auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs; + if (auth_hmacs->length) + chunksize += WORD_ROUND(ntohs(auth_hmacs->length)); + else + auth_hmacs = NULL; + + /* Add CHUNKS parameter length */ + auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks; + if (auth_chunks->length) + chunksize += WORD_ROUND(ntohs(auth_chunks->length)); + else + auth_chunks = NULL; + + extensions[num_ext] = SCTP_CID_AUTH; + num_ext += 1; + } + + /* If we have any extensions to report, account for that */ + if (num_ext) + chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) + + num_ext); + + /* RFC 2960 3.3.2 Initiation (INIT) (1) + * + * Note 3: An INIT chunk MUST NOT contain more than one Host + * Name address parameter. Moreover, the sender of the INIT + * MUST NOT combine any other address types with the Host Name + * address in the INIT. The receiver of INIT MUST ignore any + * other address types if the Host Name address parameter is + * present in the received INIT chunk. + * + * PLEASE DO NOT FIXME [This version does not support Host Name.] + */ + + retval = sctp_make_chunk(asoc, SCTP_CID_INIT, 0, chunksize); + if (!retval) + goto nodata; + + retval->subh.init_hdr = + sctp_addto_chunk(retval, sizeof(init), &init); + retval->param_hdr.v = + sctp_addto_chunk(retval, addrs_len, addrs.v); + + /* RFC 2960 3.3.2 Initiation (INIT) (1) + * + * Note 4: This parameter, when present, specifies all the + * address types the sending endpoint can support. The absence + * of this parameter indicates that the sending endpoint can + * support any address type. + */ + sat.param_hdr.type = SCTP_PARAM_SUPPORTED_ADDRESS_TYPES; + sat.param_hdr.length = htons(SCTP_SAT_LEN(num_types)); + sctp_addto_chunk(retval, sizeof(sat), &sat); + sctp_addto_chunk(retval, num_types * sizeof(__u16), &types); + + sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param); + + /* Add the supported extensions parameter. 
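[Editorial sketch, not part of the patch] The chunk size accounting in sctp_make_init() above wraps variable-length parameters in WORD_ROUND() because SCTP parameters are padded to 4-byte boundaries on the wire while their Length field excludes the padding. A tiny sketch of that rounding, with a WORD_ROUND definition assumed to match the kernel macro's ((len + 3) & ~3) behaviour:

#include <stdio.h>
#include <stdint.h>

/* Round a length up to the next 4-byte boundary, as SCTP parameters are
 * padded on the wire while their Length field excludes the padding. */
#define WORD_ROUND(s)   (((s) + 3U) & ~3U)

int main(void)
{
    /* Example parameter lengths: 4-byte header plus value bytes. */
    uint16_t lengths[] = { 4, 6, 7, 8, 20 };
    uint32_t chunksize = 0;

    for (unsigned i = 0; i < sizeof(lengths) / sizeof(lengths[0]); i++) {
        printf("param length %2u -> %2u bytes on the wire\n",
               lengths[i], WORD_ROUND(lengths[i]));
        chunksize += WORD_ROUND(lengths[i]);
    }
    printf("total space accounted in the chunk: %u bytes\n", chunksize);
    return 0;
}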
Be nice and add this + * fist before addiding the parameters for the extensions themselves + */ + if (num_ext) { + ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT; + ext_param.param_hdr.length = + htons(sizeof(sctp_supported_ext_param_t) + num_ext); + sctp_addto_chunk(retval, sizeof(sctp_supported_ext_param_t), + &ext_param); + sctp_addto_param(retval, num_ext, extensions); + } + + if (sctp_prsctp_enable) + sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); + + if (sp->adaptation_ind) { + aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND; + aiparam.param_hdr.length = htons(sizeof(aiparam)); + aiparam.adaptation_ind = htonl(sp->adaptation_ind); + sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); + } + + /* Add SCTP-AUTH chunks to the parameter list */ + if (sctp_auth_enable) { + sctp_addto_chunk(retval, sizeof(asoc->c.auth_random), + asoc->c.auth_random); + if (auth_hmacs) + sctp_addto_chunk(retval, ntohs(auth_hmacs->length), + auth_hmacs); + if (auth_chunks) + sctp_addto_chunk(retval, ntohs(auth_chunks->length), + auth_chunks); + } +nodata: + kfree(addrs.v); + return retval; +} + +struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + gfp_t gfp, int unkparam_len) +{ + sctp_inithdr_t initack; + struct sctp_chunk *retval; + union sctp_params addrs; + struct sctp_sock *sp; + int addrs_len; + sctp_cookie_param_t *cookie; + int cookie_len; + size_t chunksize; + sctp_adaptation_ind_param_t aiparam; + sctp_supported_ext_param_t ext_param; + int num_ext = 0; + __u8 extensions[3]; + sctp_paramhdr_t *auth_chunks = NULL, + *auth_hmacs = NULL, + *auth_random = NULL; + + retval = NULL; + + /* Note: there may be no addresses to embed. */ + addrs = sctp_bind_addrs_to_raw(&asoc->base.bind_addr, &addrs_len, gfp); + + initack.init_tag = htonl(asoc->c.my_vtag); + initack.a_rwnd = htonl(asoc->rwnd); + initack.num_outbound_streams = htons(asoc->c.sinit_num_ostreams); + initack.num_inbound_streams = htons(asoc->c.sinit_max_instreams); + initack.initial_tsn = htonl(asoc->c.initial_tsn); + + /* FIXME: We really ought to build the cookie right + * into the packet instead of allocating more fresh memory. + */ + cookie = sctp_pack_cookie(asoc->ep, asoc, chunk, &cookie_len, + addrs.v, addrs_len); + if (!cookie) + goto nomem_cookie; + + /* Calculate the total size of allocation, include the reserved + * space for reporting unknown parameters if it is specified. + */ + sp = sctp_sk(asoc->base.sk); + chunksize = sizeof(initack) + addrs_len + cookie_len + unkparam_len; + + /* Tell peer that we'll do ECN only if peer advertised such cap. 
*/ + if (asoc->peer.ecn_capable) + chunksize += sizeof(ecap_param); + + if (asoc->peer.prsctp_capable) + chunksize += sizeof(prsctp_param); + + if (asoc->peer.asconf_capable) { + extensions[num_ext] = SCTP_CID_ASCONF; + extensions[num_ext+1] = SCTP_CID_ASCONF_ACK; + num_ext += 2; + } + + if (sp->adaptation_ind) + chunksize += sizeof(aiparam); + + if (asoc->peer.auth_capable) { + auth_random = (sctp_paramhdr_t *)asoc->c.auth_random; + chunksize += ntohs(auth_random->length); + + auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs; + if (auth_hmacs->length) + chunksize += WORD_ROUND(ntohs(auth_hmacs->length)); + else + auth_hmacs = NULL; + + auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks; + if (auth_chunks->length) + chunksize += WORD_ROUND(ntohs(auth_chunks->length)); + else + auth_chunks = NULL; + + extensions[num_ext] = SCTP_CID_AUTH; + num_ext += 1; + } + + if (num_ext) + chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) + + num_ext); + + /* Now allocate and fill out the chunk. */ + retval = sctp_make_chunk(asoc, SCTP_CID_INIT_ACK, 0, chunksize); + if (!retval) + goto nomem_chunk; + + /* RFC 2960 6.4 Multi-homed SCTP Endpoints + * + * An endpoint SHOULD transmit reply chunks (e.g., SACK, + * HEARTBEAT ACK, * etc.) to the same destination transport + * address from which it received the DATA or control chunk + * to which it is replying. + * + * [INIT ACK back to where the INIT came from.] + */ + retval->transport = chunk->transport; + + retval->subh.init_hdr = + sctp_addto_chunk(retval, sizeof(initack), &initack); + retval->param_hdr.v = sctp_addto_chunk(retval, addrs_len, addrs.v); + sctp_addto_chunk(retval, cookie_len, cookie); + if (asoc->peer.ecn_capable) + sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param); + if (num_ext) { + ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT; + ext_param.param_hdr.length = + htons(sizeof(sctp_supported_ext_param_t) + num_ext); + sctp_addto_chunk(retval, sizeof(sctp_supported_ext_param_t), + &ext_param); + sctp_addto_param(retval, num_ext, extensions); + } + if (asoc->peer.prsctp_capable) + sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); + + if (sp->adaptation_ind) { + aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND; + aiparam.param_hdr.length = htons(sizeof(aiparam)); + aiparam.adaptation_ind = htonl(sp->adaptation_ind); + sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); + } + + if (asoc->peer.auth_capable) { + sctp_addto_chunk(retval, ntohs(auth_random->length), + auth_random); + if (auth_hmacs) + sctp_addto_chunk(retval, ntohs(auth_hmacs->length), + auth_hmacs); + if (auth_chunks) + sctp_addto_chunk(retval, ntohs(auth_chunks->length), + auth_chunks); + } + + /* We need to remove the const qualifier at this point. */ + retval->asoc = (struct sctp_association *) asoc; + +nomem_chunk: + kfree(cookie); +nomem_cookie: + kfree(addrs.v); + return retval; +} + +/* 3.3.11 Cookie Echo (COOKIE ECHO) (10): + * + * This chunk is used only during the initialization of an association. + * It is sent by the initiator of an association to its peer to complete + * the initialization process. This chunk MUST precede any DATA chunk + * sent within the association, but MAY be bundled with one or more DATA + * chunks in the same packet. 
+ * + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Type = 10 |Chunk Flags | Length | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * / Cookie / + * \ \ + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Chunk Flags: 8 bit + * + * Set to zero on transmit and ignored on receipt. + * + * Length: 16 bits (unsigned integer) + * + * Set to the size of the chunk in bytes, including the 4 bytes of + * the chunk header and the size of the Cookie. + * + * Cookie: variable size + * + * This field must contain the exact cookie received in the + * State Cookie parameter from the previous INIT ACK. + * + * An implementation SHOULD make the cookie as small as possible + * to insure interoperability. + */ +struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *asoc, + const struct sctp_chunk *chunk) +{ + struct sctp_chunk *retval; + void *cookie; + int cookie_len; + + cookie = asoc->peer.cookie; + cookie_len = asoc->peer.cookie_len; + + /* Build a cookie echo chunk. */ + retval = sctp_make_chunk(asoc, SCTP_CID_COOKIE_ECHO, 0, cookie_len); + if (!retval) + goto nodata; + retval->subh.cookie_hdr = + sctp_addto_chunk(retval, cookie_len, cookie); + + /* RFC 2960 6.4 Multi-homed SCTP Endpoints + * + * An endpoint SHOULD transmit reply chunks (e.g., SACK, + * HEARTBEAT ACK, * etc.) to the same destination transport + * address from which it * received the DATA or control chunk + * to which it is replying. + * + * [COOKIE ECHO back to where the INIT ACK came from.] + */ + if (chunk) + retval->transport = chunk->transport; + +nodata: + return retval; +} + +/* 3.3.12 Cookie Acknowledgement (COOKIE ACK) (11): + * + * This chunk is used only during the initialization of an + * association. It is used to acknowledge the receipt of a COOKIE + * ECHO chunk. This chunk MUST precede any DATA or SACK chunk sent + * within the association, but MAY be bundled with one or more DATA + * chunks or SACK chunk in the same SCTP packet. + * + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Type = 11 |Chunk Flags | Length = 4 | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Chunk Flags: 8 bits + * + * Set to zero on transmit and ignored on receipt. + */ +struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc, + const struct sctp_chunk *chunk) +{ + struct sctp_chunk *retval; + + retval = sctp_make_chunk(asoc, SCTP_CID_COOKIE_ACK, 0, 0); + + /* RFC 2960 6.4 Multi-homed SCTP Endpoints + * + * An endpoint SHOULD transmit reply chunks (e.g., SACK, + * HEARTBEAT ACK, * etc.) to the same destination transport + * address from which it * received the DATA or control chunk + * to which it is replying. + * + * [COOKIE ACK back to where the COOKIE ECHO came from.] + */ + if (retval && chunk) + retval->transport = chunk->transport; + + return retval; +} + +/* + * Appendix A: Explicit Congestion Notification: + * CWR: + * + * RFC 2481 details a specific bit for a sender to send in the header of + * its next outbound TCP segment to indicate to its peer that it has + * reduced its congestion window. This is termed the CWR bit. For + * SCTP the same indication is made by including the CWR chunk. + * This chunk contains one data element, i.e. the TSN number that + * was sent in the ECNE chunk. 
This element represents the lowest + * TSN number in the datagram that was originally marked with the + * CE bit. + * + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Chunk Type=13 | Flags=00000000| Chunk Length = 8 | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Lowest TSN Number | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Note: The CWR is considered a Control chunk. + */ +struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc, + const __u32 lowest_tsn, + const struct sctp_chunk *chunk) +{ + struct sctp_chunk *retval; + sctp_cwrhdr_t cwr; + + cwr.lowest_tsn = htonl(lowest_tsn); + retval = sctp_make_chunk(asoc, SCTP_CID_ECN_CWR, 0, + sizeof(sctp_cwrhdr_t)); + + if (!retval) + goto nodata; + + retval->subh.ecn_cwr_hdr = + sctp_addto_chunk(retval, sizeof(cwr), &cwr); + + /* RFC 2960 6.4 Multi-homed SCTP Endpoints + * + * An endpoint SHOULD transmit reply chunks (e.g., SACK, + * HEARTBEAT ACK, * etc.) to the same destination transport + * address from which it * received the DATA or control chunk + * to which it is replying. + * + * [Report a reduced congestion window back to where the ECNE + * came from.] + */ + if (chunk) + retval->transport = chunk->transport; + +nodata: + return retval; +} + +/* Make an ECNE chunk. This is a congestion experienced report. */ +struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc, + const __u32 lowest_tsn) +{ + struct sctp_chunk *retval; + sctp_ecnehdr_t ecne; + + ecne.lowest_tsn = htonl(lowest_tsn); + retval = sctp_make_chunk(asoc, SCTP_CID_ECN_ECNE, 0, + sizeof(sctp_ecnehdr_t)); + if (!retval) + goto nodata; + retval->subh.ecne_hdr = + sctp_addto_chunk(retval, sizeof(ecne), &ecne); + +nodata: + return retval; +} + +/* Make a DATA chunk for the given association from the provided + * parameters. However, do not populate the data payload. + */ +struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc, + const struct sctp_sndrcvinfo *sinfo, + int data_len, __u8 flags, __u16 ssn) +{ + struct sctp_chunk *retval; + struct sctp_datahdr dp; + int chunk_len; + + /* We assign the TSN as LATE as possible, not here when + * creating the chunk. + */ + dp.tsn = 0; + dp.stream = htons(sinfo->sinfo_stream); + dp.ppid = sinfo->sinfo_ppid; + + /* Set the flags for an unordered send. */ + if (sinfo->sinfo_flags & SCTP_UNORDERED) { + flags |= SCTP_DATA_UNORDERED; + dp.ssn = 0; + } else + dp.ssn = htons(ssn); + + chunk_len = sizeof(dp) + data_len; + retval = sctp_make_chunk(asoc, SCTP_CID_DATA, flags, chunk_len); + if (!retval) + goto nodata; + + retval->subh.data_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp); + memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo)); + +nodata: + return retval; +} + +/* Create a selective ackowledgement (SACK) for the given + * association. This reports on which TSN's we've seen to date, + * including duplicates and gaps. + */ +struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc) +{ + struct sctp_chunk *retval; + struct sctp_sackhdr sack; + int len; + __u32 ctsn; + __u16 num_gabs, num_dup_tsns; + struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map; + struct sctp_gap_ack_block gabs[SCTP_MAX_GABS]; + + memset(gabs, 0, sizeof(gabs)); + ctsn = sctp_tsnmap_get_ctsn(map); + SCTP_DEBUG_PRINTK("sackCTSNAck sent: 0x%x.\n", ctsn); + + /* How much room is needed in the chunk? 
*/ + num_gabs = sctp_tsnmap_num_gabs(map, gabs); + num_dup_tsns = sctp_tsnmap_num_dups(map); + + /* Initialize the SACK header. */ + sack.cum_tsn_ack = htonl(ctsn); + sack.a_rwnd = htonl(asoc->a_rwnd); + sack.num_gap_ack_blocks = htons(num_gabs); + sack.num_dup_tsns = htons(num_dup_tsns); + + len = sizeof(sack) + + sizeof(struct sctp_gap_ack_block) * num_gabs + + sizeof(__u32) * num_dup_tsns; + + /* Create the chunk. */ + retval = sctp_make_chunk(asoc, SCTP_CID_SACK, 0, len); + if (!retval) + goto nodata; + + /* RFC 2960 6.4 Multi-homed SCTP Endpoints + * + * An endpoint SHOULD transmit reply chunks (e.g., SACK, + * HEARTBEAT ACK, etc.) to the same destination transport + * address from which it received the DATA or control chunk to + * which it is replying. This rule should also be followed if + * the endpoint is bundling DATA chunks together with the + * reply chunk. + * + * However, when acknowledging multiple DATA chunks received + * in packets from different source addresses in a single + * SACK, the SACK chunk may be transmitted to one of the + * destination transport addresses from which the DATA or + * control chunks being acknowledged were received. + * + * [BUG: We do not implement the following paragraph. + * Perhaps we should remember the last transport we used for a + * SACK and avoid that (if possible) if we have seen any + * duplicates. --piggy] + * + * When a receiver of a duplicate DATA chunk sends a SACK to a + * multi- homed endpoint it MAY be beneficial to vary the + * destination address and not use the source address of the + * DATA chunk. The reason being that receiving a duplicate + * from a multi-homed endpoint might indicate that the return + * path (as specified in the source address of the DATA chunk) + * for the SACK is broken. + * + * [Send to the address from which we last received a DATA chunk.] + */ + retval->transport = asoc->peer.last_data_from; + + retval->subh.sack_hdr = + sctp_addto_chunk(retval, sizeof(sack), &sack); + + /* Add the gap ack block information. */ + if (num_gabs) + sctp_addto_chunk(retval, sizeof(__u32) * num_gabs, + gabs); + + /* Add the duplicate TSN information. */ + if (num_dup_tsns) + sctp_addto_chunk(retval, sizeof(__u32) * num_dup_tsns, + sctp_tsnmap_get_dups(map)); + +nodata: + return retval; +} + +/* Make a SHUTDOWN chunk. */ +struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc, + const struct sctp_chunk *chunk) +{ + struct sctp_chunk *retval; + sctp_shutdownhdr_t shut; + __u32 ctsn; + + ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); + shut.cum_tsn_ack = htonl(ctsn); + + retval = sctp_make_chunk(asoc, SCTP_CID_SHUTDOWN, 0, + sizeof(sctp_shutdownhdr_t)); + if (!retval) + goto nodata; + + retval->subh.shutdown_hdr = + sctp_addto_chunk(retval, sizeof(shut), &shut); + + if (chunk) + retval->transport = chunk->transport; +nodata: + return retval; +} + +struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc, + const struct sctp_chunk *chunk) +{ + struct sctp_chunk *retval; + + retval = sctp_make_chunk(asoc, SCTP_CID_SHUTDOWN_ACK, 0, 0); + + /* RFC 2960 6.4 Multi-homed SCTP Endpoints + * + * An endpoint SHOULD transmit reply chunks (e.g., SACK, + * HEARTBEAT ACK, * etc.) to the same destination transport + * address from which it * received the DATA or control chunk + * to which it is replying. + * + * [ACK back to where the SHUTDOWN came from.] 
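[Editorial sketch, not part of the patch] Each gap ack block carried by the SACK built above describes a contiguous run of TSNs received beyond the cumulative TSN ack point, expressed as start/end offsets from it. The sketch below derives such blocks from a sorted list of received TSNs; it is a simplification of what sctp_tsnmap_num_gabs() computes from the tsnmap, and the struct here only mirrors the start/end layout of struct sctp_gap_ack_block.

#include <stdio.h>
#include <stdint.h>

struct gap_ack_block {      /* offsets are relative to the cumulative TSN ack */
    uint16_t start;
    uint16_t end;
};

int main(void)
{
    uint32_t ctsn = 100;                                 /* cumulative TSN ack    */
    uint32_t recvd[] = { 102, 103, 104, 107, 110, 111 }; /* out-of-order arrivals */
    int n = sizeof(recvd) / sizeof(recvd[0]);
    struct gap_ack_block gabs[8];
    int num_gabs = 0;

    for (int i = 0; i < n; i++) {
        uint16_t off = (uint16_t)(recvd[i] - ctsn);

        if (num_gabs && gabs[num_gabs - 1].end + 1 == off) {
            gabs[num_gabs - 1].end = off;   /* extend the current run */
        } else {
            gabs[num_gabs].start = off;     /* open a new run         */
            gabs[num_gabs].end = off;
            num_gabs++;
        }
    }

    for (int i = 0; i < num_gabs; i++)
        printf("gap ack block %d: TSNs %u..%u\n", i,
               ctsn + gabs[i].start, ctsn + gabs[i].end);
    return 0;
}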
+ */ + if (retval && chunk) + retval->transport = chunk->transport; + + return retval; +} + +struct sctp_chunk *sctp_make_shutdown_complete( + const struct sctp_association *asoc, + const struct sctp_chunk *chunk) +{ + struct sctp_chunk *retval; + __u8 flags = 0; + + /* Set the T-bit if we have no association (vtag will be + * reflected) + */ + flags |= asoc ? 0 : SCTP_CHUNK_FLAG_T; + + retval = sctp_make_chunk(asoc, SCTP_CID_SHUTDOWN_COMPLETE, flags, 0); + + /* RFC 2960 6.4 Multi-homed SCTP Endpoints + * + * An endpoint SHOULD transmit reply chunks (e.g., SACK, + * HEARTBEAT ACK, * etc.) to the same destination transport + * address from which it * received the DATA or control chunk + * to which it is replying. + * + * [Report SHUTDOWN COMPLETE back to where the SHUTDOWN ACK + * came from.] + */ + if (retval && chunk) + retval->transport = chunk->transport; + + return retval; +} + +/* Create an ABORT. Note that we set the T bit if we have no + * association, except when responding to an INIT (sctpimpguide 2.41). + */ +struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + const size_t hint) +{ + struct sctp_chunk *retval; + __u8 flags = 0; + + /* Set the T-bit if we have no association and 'chunk' is not + * an INIT (vtag will be reflected). + */ + if (!asoc) { + if (chunk && chunk->chunk_hdr && + chunk->chunk_hdr->type == SCTP_CID_INIT) + flags = 0; + else + flags = SCTP_CHUNK_FLAG_T; + } + + retval = sctp_make_chunk(asoc, SCTP_CID_ABORT, flags, hint); + + /* RFC 2960 6.4 Multi-homed SCTP Endpoints + * + * An endpoint SHOULD transmit reply chunks (e.g., SACK, + * HEARTBEAT ACK, * etc.) to the same destination transport + * address from which it * received the DATA or control chunk + * to which it is replying. + * + * [ABORT back to where the offender came from.] + */ + if (retval && chunk) + retval->transport = chunk->transport; + + return retval; +} + +/* Helper to create ABORT with a NO_USER_DATA error. */ +struct sctp_chunk *sctp_make_abort_no_data( + const struct sctp_association *asoc, + const struct sctp_chunk *chunk, __u32 tsn) +{ + struct sctp_chunk *retval; + __be32 payload; + + retval = sctp_make_abort(asoc, chunk, sizeof(sctp_errhdr_t) + + sizeof(tsn)); + + if (!retval) + goto no_mem; + + /* Put the tsn back into network byte order. */ + payload = htonl(tsn); + sctp_init_cause(retval, SCTP_ERROR_NO_DATA, sizeof(payload)); + sctp_addto_chunk(retval, sizeof(payload), (const void *)&payload); + + /* RFC 2960 6.4 Multi-homed SCTP Endpoints + * + * An endpoint SHOULD transmit reply chunks (e.g., SACK, + * HEARTBEAT ACK, * etc.) to the same destination transport + * address from which it * received the DATA or control chunk + * to which it is replying. + * + * [ABORT back to where the offender came from.] + */ + if (chunk) + retval->transport = chunk->transport; + +no_mem: + return retval; +} + +/* Helper to create ABORT with a SCTP_ERROR_USER_ABORT error. */ +struct sctp_chunk *sctp_make_abort_user(const struct sctp_association *asoc, + const struct msghdr *msg, + size_t paylen) +{ + struct sctp_chunk *retval; + void *payload = NULL; + int err; + + retval = sctp_make_abort(asoc, NULL, sizeof(sctp_errhdr_t) + paylen); + if (!retval) + goto err_chunk; + + if (paylen) { + /* Put the msg_iov together into payload. 
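+ * The user's iovec is flattened into a temporary kernel buffer so
+ * that it can be appended as the cause-specific data of the
+ * USER_ABORT error cause; the buffer is freed again once it has
+ * been copied into the chunk.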
*/ + payload = kmalloc(paylen, GFP_KERNEL); + if (!payload) + goto err_payload; + + err = memcpy_fromiovec(payload, msg->msg_iov, paylen); + if (err < 0) + goto err_copy; + } + + sctp_init_cause(retval, SCTP_ERROR_USER_ABORT, paylen); + sctp_addto_chunk(retval, paylen, payload); + + if (paylen) + kfree(payload); + + return retval; + +err_copy: + kfree(payload); +err_payload: + sctp_chunk_free(retval); + retval = NULL; +err_chunk: + return retval; +} + +/* Append bytes to the end of a parameter. Will panic if chunk is not big + * enough. + */ +static void *sctp_addto_param(struct sctp_chunk *chunk, int len, + const void *data) +{ + void *target; + int chunklen = ntohs(chunk->chunk_hdr->length); + + target = skb_put(chunk->skb, len); + + if (data) + memcpy(target, data, len); + else + memset(target, 0, len); + + /* Adjust the chunk length field. */ + chunk->chunk_hdr->length = htons(chunklen + len); + chunk->chunk_end = skb_tail_pointer(chunk->skb); + + return target; +} + +/* Make an ABORT chunk with a PROTOCOL VIOLATION cause code. */ +struct sctp_chunk *sctp_make_abort_violation( + const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + const __u8 *payload, + const size_t paylen) +{ + struct sctp_chunk *retval; + struct sctp_paramhdr phdr; + + retval = sctp_make_abort(asoc, chunk, sizeof(sctp_errhdr_t) + paylen + + sizeof(sctp_paramhdr_t)); + if (!retval) + goto end; + + sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, paylen + + sizeof(sctp_paramhdr_t)); + + phdr.type = htons(chunk->chunk_hdr->type); + phdr.length = chunk->chunk_hdr->length; + sctp_addto_chunk(retval, paylen, payload); + sctp_addto_param(retval, sizeof(sctp_paramhdr_t), &phdr); + +end: + return retval; +} + +struct sctp_chunk *sctp_make_violation_paramlen( + const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + struct sctp_paramhdr *param) +{ + struct sctp_chunk *retval; + static const char error[] = "The following parameter had invalid length:"; + size_t payload_len = sizeof(error) + sizeof(sctp_errhdr_t) + + sizeof(sctp_paramhdr_t); + + retval = sctp_make_abort(asoc, chunk, payload_len); + if (!retval) + goto nodata; + + sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, + sizeof(error) + sizeof(sctp_paramhdr_t)); + sctp_addto_chunk(retval, sizeof(error), error); + sctp_addto_param(retval, sizeof(sctp_paramhdr_t), param); + +nodata: + return retval; +} + +/* Make a HEARTBEAT chunk. */ +struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, + const struct sctp_transport *transport) +{ + struct sctp_chunk *retval; + sctp_sender_hb_info_t hbinfo; + + retval = sctp_make_chunk(asoc, SCTP_CID_HEARTBEAT, 0, sizeof(hbinfo)); + + if (!retval) + goto nodata; + + hbinfo.param_hdr.type = SCTP_PARAM_HEARTBEAT_INFO; + hbinfo.param_hdr.length = htons(sizeof(sctp_sender_hb_info_t)); + hbinfo.daddr = transport->ipaddr; + hbinfo.sent_at = jiffies; + hbinfo.hb_nonce = transport->hb_nonce; + + /* Cast away the 'const', as this is just telling the chunk + * what transport it belongs to. 
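+ * The hbinfo parameter is opaque to the peer, which echoes it back
+ * verbatim in the HEARTBEAT ACK; daddr and hb_nonce let us match
+ * that ACK to this transport, and sent_at lets us estimate the RTT.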
+ */ + retval->transport = (struct sctp_transport *) transport; + retval->subh.hbs_hdr = sctp_addto_chunk(retval, sizeof(hbinfo), + &hbinfo); + +nodata: + return retval; +} + +struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + const void *payload, const size_t paylen) +{ + struct sctp_chunk *retval; + + retval = sctp_make_chunk(asoc, SCTP_CID_HEARTBEAT_ACK, 0, paylen); + if (!retval) + goto nodata; + + retval->subh.hbs_hdr = sctp_addto_chunk(retval, paylen, payload); + + /* RFC 2960 6.4 Multi-homed SCTP Endpoints + * + * An endpoint SHOULD transmit reply chunks (e.g., SACK, + * HEARTBEAT ACK, * etc.) to the same destination transport + * address from which it * received the DATA or control chunk + * to which it is replying. + * + * [HBACK back to where the HEARTBEAT came from.] + */ + if (chunk) + retval->transport = chunk->transport; + +nodata: + return retval; +} + +/* Create an Operation Error chunk with the specified space reserved. + * This routine can be used for containing multiple causes in the chunk. + */ +static struct sctp_chunk *sctp_make_op_error_space( + const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + size_t size) +{ + struct sctp_chunk *retval; + + retval = sctp_make_chunk(asoc, SCTP_CID_ERROR, 0, + sizeof(sctp_errhdr_t) + size); + if (!retval) + goto nodata; + + /* RFC 2960 6.4 Multi-homed SCTP Endpoints + * + * An endpoint SHOULD transmit reply chunks (e.g., SACK, + * HEARTBEAT ACK, etc.) to the same destination transport + * address from which it received the DATA or control chunk + * to which it is replying. + * + */ + if (chunk) + retval->transport = chunk->transport; + +nodata: + return retval; +} + +/* Create an Operation Error chunk of a fixed size, + * specifically, max(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT) + * This is a helper function to allocate an error chunk for + * for those invalid parameter codes in which we may not want + * to report all the errors, if the incomming chunk is large + */ +static inline struct sctp_chunk *sctp_make_op_error_fixed( + const struct sctp_association *asoc, + const struct sctp_chunk *chunk) +{ + size_t size = asoc ? asoc->pathmtu : 0; + + if (!size) + size = SCTP_DEFAULT_MAXSEGMENT; + + return sctp_make_op_error_space(asoc, chunk, size); +} + +/* Create an Operation Error chunk. 
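+ *
+ * Each cause carried by an ERROR chunk is a TLV (RFC 2960 3.3.10):
+ * a 16-bit cause code, a 16-bit cause length, then cause-specific
+ * data.  sctp_init_cause() writes the cause header and the data is
+ * appended with sctp_addto_chunk().  A rough usage sketch, where
+ * 'data' and 'datalen' stand for whatever cause-specific payload
+ * the caller has:
+ *
+ *     err = sctp_make_op_error_space(asoc, chunk, datalen);
+ *     if (err) {
+ *             sctp_init_cause(err, SCTP_ERROR_UNKNOWN_PARAM, datalen);
+ *             sctp_addto_chunk(err, datalen, data);
+ *     }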
*/ +struct sctp_chunk *sctp_make_op_error(const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + __be16 cause_code, const void *payload, + size_t paylen, size_t reserve_tail) +{ + struct sctp_chunk *retval; + + retval = sctp_make_op_error_space(asoc, chunk, paylen + reserve_tail); + if (!retval) + goto nodata; + + sctp_init_cause(retval, cause_code, paylen + reserve_tail); + sctp_addto_chunk(retval, paylen, payload); + if (reserve_tail) + sctp_addto_param(retval, reserve_tail, NULL); + +nodata: + return retval; +} + +struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc) +{ + struct sctp_chunk *retval; + struct sctp_hmac *hmac_desc; + struct sctp_authhdr auth_hdr; + __u8 *hmac; + + /* Get the first hmac that the peer told us to use */ + hmac_desc = sctp_auth_asoc_get_hmac(asoc); + if (unlikely(!hmac_desc)) + return NULL; + + retval = sctp_make_chunk(asoc, SCTP_CID_AUTH, 0, + hmac_desc->hmac_len + sizeof(sctp_authhdr_t)); + if (!retval) + return NULL; + + auth_hdr.hmac_id = htons(hmac_desc->hmac_id); + auth_hdr.shkey_id = htons(asoc->active_key_id); + + retval->subh.auth_hdr = sctp_addto_chunk(retval, sizeof(sctp_authhdr_t), + &auth_hdr); + + hmac = skb_put(retval->skb, hmac_desc->hmac_len); + memset(hmac, 0, hmac_desc->hmac_len); + + /* Adjust the chunk header to include the empty MAC */ + retval->chunk_hdr->length = + htons(ntohs(retval->chunk_hdr->length) + hmac_desc->hmac_len); + retval->chunk_end = skb_tail_pointer(retval->skb); + + return retval; +} + + +/******************************************************************** + * 2nd Level Abstractions + ********************************************************************/ + +/* Turn an skb into a chunk. + * FIXME: Eventually move the structure directly inside the skb->cb[]. + */ +struct sctp_chunk *sctp_chunkify(struct sk_buff *skb, + const struct sctp_association *asoc, + struct sock *sk) +{ + struct sctp_chunk *retval; + + retval = kmem_cache_zalloc(sctp_chunk_cachep, GFP_ATOMIC); + + if (!retval) + goto nodata; + + if (!sk) { + SCTP_DEBUG_PRINTK("chunkifying skb %p w/o an sk\n", skb); + } + + INIT_LIST_HEAD(&retval->list); + retval->skb = skb; + retval->asoc = (struct sctp_association *)asoc; + retval->has_tsn = 0; + retval->has_ssn = 0; + retval->rtt_in_progress = 0; + retval->sent_at = 0; + retval->singleton = 1; + retval->end_of_packet = 0; + retval->ecn_ce_done = 0; + retval->pdiscard = 0; + + /* sctpimpguide-05.txt Section 2.8.2 + * M1) Each time a new DATA chunk is transmitted + * set the 'TSN.Missing.Report' count for that TSN to 0. The + * 'TSN.Missing.Report' count will be used to determine missing chunks + * and when to fast retransmit. + */ + retval->tsn_missing_report = 0; + retval->tsn_gap_acked = 0; + retval->fast_retransmit = SCTP_CAN_FRTX; + + /* If this is a fragmented message, track all fragments + * of the message (for SEND_FAILED). + */ + retval->msg = NULL; + + /* Polish the bead hole. */ + INIT_LIST_HEAD(&retval->transmitted_list); + INIT_LIST_HEAD(&retval->frag_list); + SCTP_DBG_OBJCNT_INC(chunk); + atomic_set(&retval->refcnt, 1); + +nodata: + return retval; +} + +/* Set chunk->source and dest based on the IP header in chunk->skb. */ +void sctp_init_addrs(struct sctp_chunk *chunk, union sctp_addr *src, + union sctp_addr *dest) +{ + memcpy(&chunk->source, src, sizeof(union sctp_addr)); + memcpy(&chunk->dest, dest, sizeof(union sctp_addr)); +} + +/* Extract the source address from a chunk. 
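+ * Prefer the transport's address when the chunk is already bound to
+ * a transport; otherwise fall back to the address saved from the IP
+ * header by sctp_init_addrs() above.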
*/ +const union sctp_addr *sctp_source(const struct sctp_chunk *chunk) +{ + /* If we have a known transport, use that. */ + if (chunk->transport) { + return &chunk->transport->ipaddr; + } else { + /* Otherwise, extract it from the IP header. */ + return &chunk->source; + } +} + +/* Create a new chunk, setting the type and flags headers from the + * arguments, reserving enough space for a 'paylen' byte payload. + */ +SCTP_STATIC +struct sctp_chunk *sctp_make_chunk(const struct sctp_association *asoc, + __u8 type, __u8 flags, int paylen) +{ + struct sctp_chunk *retval; + sctp_chunkhdr_t *chunk_hdr; + struct sk_buff *skb; + struct sock *sk; + + /* No need to allocate LL here, as this is only a chunk. */ + skb = alloc_skb(WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen), + GFP_ATOMIC); + if (!skb) + goto nodata; + + /* Make room for the chunk header. */ + chunk_hdr = (sctp_chunkhdr_t *)skb_put(skb, sizeof(sctp_chunkhdr_t)); + chunk_hdr->type = type; + chunk_hdr->flags = flags; + chunk_hdr->length = htons(sizeof(sctp_chunkhdr_t)); + + sk = asoc ? asoc->base.sk : NULL; + retval = sctp_chunkify(skb, asoc, sk); + if (!retval) { + kfree_skb(skb); + goto nodata; + } + + retval->chunk_hdr = chunk_hdr; + retval->chunk_end = ((__u8 *)chunk_hdr) + sizeof(struct sctp_chunkhdr); + + /* Determine if the chunk needs to be authenticated */ + if (sctp_auth_send_cid(type, asoc)) + retval->auth = 1; + + /* Set the skb to the belonging sock for accounting. */ + skb->sk = sk; + + return retval; +nodata: + return NULL; +} + + +/* Release the memory occupied by a chunk. */ +static void sctp_chunk_destroy(struct sctp_chunk *chunk) +{ + BUG_ON(!list_empty(&chunk->list)); + list_del_init(&chunk->transmitted_list); + + /* Free the chunk skb data and the SCTP_chunk stub itself. */ + dev_kfree_skb(chunk->skb); + + SCTP_DBG_OBJCNT_DEC(chunk); + kmem_cache_free(sctp_chunk_cachep, chunk); +} + +/* Possibly, free the chunk. */ +void sctp_chunk_free(struct sctp_chunk *chunk) +{ + /* Release our reference on the message tracker. */ + if (chunk->msg) + sctp_datamsg_put(chunk->msg); + + sctp_chunk_put(chunk); +} + +/* Grab a reference to the chunk. */ +void sctp_chunk_hold(struct sctp_chunk *ch) +{ + atomic_inc(&ch->refcnt); +} + +/* Release a reference to the chunk. */ +void sctp_chunk_put(struct sctp_chunk *ch) +{ + if (atomic_dec_and_test(&ch->refcnt)) + sctp_chunk_destroy(ch); +} + +/* Append bytes to the end of a chunk. Will panic if chunk is not big + * enough. + */ +void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data) +{ + void *target; + void *padding; + int chunklen = ntohs(chunk->chunk_hdr->length); + int padlen = WORD_ROUND(chunklen) - chunklen; + + padding = skb_put(chunk->skb, padlen); + target = skb_put(chunk->skb, len); + + memset(padding, 0, padlen); + memcpy(target, data, len); + + /* Adjust the chunk length field. */ + chunk->chunk_hdr->length = htons(chunklen + padlen + len); + chunk->chunk_end = skb_tail_pointer(chunk->skb); + + return target; +} + +/* Append bytes to the end of a chunk. Returns NULL if there isn't sufficient + * space in the chunk + */ +void *sctp_addto_chunk_fixed(struct sctp_chunk *chunk, + int len, const void *data) +{ + if (skb_tailroom(chunk->skb) >= len) + return sctp_addto_chunk(chunk, len, data); + else + return NULL; +} + +/* Append bytes from user space to the end of a chunk. Will panic if + * chunk is not big enough. + * Returns a kernel err value. 
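+ * Returns 0 on success, or the negative error propagated from
+ * memcpy_fromiovecend() (e.g. -EFAULT on a bad user pointer); the
+ * chunk length field is only adjusted when the copy succeeded.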
+ */ +int sctp_user_addto_chunk(struct sctp_chunk *chunk, int off, int len, + struct iovec *data) +{ + __u8 *target; + int err = 0; + + /* Make room in chunk for data. */ + target = skb_put(chunk->skb, len); + + /* Copy data (whole iovec) into chunk */ + if ((err = memcpy_fromiovecend(target, data, off, len))) + goto out; + + /* Adjust the chunk length field. */ + chunk->chunk_hdr->length = + htons(ntohs(chunk->chunk_hdr->length) + len); + chunk->chunk_end = skb_tail_pointer(chunk->skb); + +out: + return err; +} + +/* Helper function to assign a TSN if needed. This assumes that both + * the data_hdr and association have already been assigned. + */ +void sctp_chunk_assign_ssn(struct sctp_chunk *chunk) +{ + struct sctp_datamsg *msg; + struct sctp_chunk *lchunk; + struct sctp_stream *stream; + __u16 ssn; + __u16 sid; + + if (chunk->has_ssn) + return; + + /* All fragments will be on the same stream */ + sid = ntohs(chunk->subh.data_hdr->stream); + stream = &chunk->asoc->ssnmap->out; + + /* Now assign the sequence number to the entire message. + * All fragments must have the same stream sequence number. + */ + msg = chunk->msg; + list_for_each_entry(lchunk, &msg->chunks, frag_list) { + if (lchunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { + ssn = 0; + } else { + if (lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG) + ssn = sctp_ssn_next(stream, sid); + else + ssn = sctp_ssn_peek(stream, sid); + } + + lchunk->subh.data_hdr->ssn = htons(ssn); + lchunk->has_ssn = 1; + } +} + +/* Helper function to assign a TSN if needed. This assumes that both + * the data_hdr and association have already been assigned. + */ +void sctp_chunk_assign_tsn(struct sctp_chunk *chunk) +{ + if (!chunk->has_tsn) { + /* This is the last possible instant to + * assign a TSN. + */ + chunk->subh.data_hdr->tsn = + htonl(sctp_association_get_next_tsn(chunk->asoc)); + chunk->has_tsn = 1; + } +} + +/* Create a CLOSED association to use with an incoming packet. */ +struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, + struct sctp_chunk *chunk, + gfp_t gfp) +{ + struct sctp_association *asoc; + struct sk_buff *skb; + sctp_scope_t scope; + struct sctp_af *af; + + /* Create the bare association. */ + scope = sctp_scope(sctp_source(chunk)); + asoc = sctp_association_new(ep, ep->base.sk, scope, gfp); + if (!asoc) + goto nodata; + asoc->temp = 1; + skb = chunk->skb; + /* Create an entry for the source address of the packet. */ + af = sctp_get_af_specific(ipver2af(ip_hdr(skb)->version)); + if (unlikely(!af)) + goto fail; + af->from_skb(&asoc->c.peer_addr, skb, 1); +nodata: + return asoc; + +fail: + sctp_association_free(asoc); + return NULL; +} + +/* Build a cookie representing asoc. + * This INCLUDES the param header needed to put the cookie in the INIT ACK. + */ +static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const struct sctp_chunk *init_chunk, + int *cookie_len, + const __u8 *raw_addrs, int addrs_len) +{ + sctp_cookie_param_t *retval; + struct sctp_signed_cookie *cookie; + struct scatterlist sg; + int headersize, bodysize; + unsigned int keylen; + char *key; + + /* Header size is static data prior to the actual cookie, including + * any padding. 
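+ *
+ * Roughly, the STATE COOKIE parameter built here is laid out as:
+ *
+ *     [ sctp_paramhdr: SCTP_PARAM_STATE_COOKIE, length   ]
+ *     [ signature: HMAC over everything below            ]
+ *     [ struct sctp_cookie (association snapshot)        ]
+ *     [ copy of the peer's INIT chunk                    ]
+ *     [ raw local address list                           ]
+ *
+ * with the body padded out to a multiple of SCTP_COOKIE_MULTIPLE.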
+ */ + headersize = sizeof(sctp_paramhdr_t) + + (sizeof(struct sctp_signed_cookie) - + sizeof(struct sctp_cookie)); + bodysize = sizeof(struct sctp_cookie) + + ntohs(init_chunk->chunk_hdr->length) + addrs_len; + + /* Pad out the cookie to a multiple to make the signature + * functions simpler to write. + */ + if (bodysize % SCTP_COOKIE_MULTIPLE) + bodysize += SCTP_COOKIE_MULTIPLE + - (bodysize % SCTP_COOKIE_MULTIPLE); + *cookie_len = headersize + bodysize; + + /* Clear this memory since we are sending this data structure + * out on the network. + */ + retval = kzalloc(*cookie_len, GFP_ATOMIC); + if (!retval) + goto nodata; + + cookie = (struct sctp_signed_cookie *) retval->body; + + /* Set up the parameter header. */ + retval->p.type = SCTP_PARAM_STATE_COOKIE; + retval->p.length = htons(*cookie_len); + + /* Copy the cookie part of the association itself. */ + cookie->c = asoc->c; + /* Save the raw address list length in the cookie. */ + cookie->c.raw_addr_list_len = addrs_len; + + /* Remember PR-SCTP capability. */ + cookie->c.prsctp_capable = asoc->peer.prsctp_capable; + + /* Save adaptation indication in the cookie. */ + cookie->c.adaptation_ind = asoc->peer.adaptation_ind; + + /* Set an expiration time for the cookie. */ + do_gettimeofday(&cookie->c.expiration); + TIMEVAL_ADD(asoc->cookie_life, cookie->c.expiration); + + /* Copy the peer's init packet. */ + memcpy(&cookie->c.peer_init[0], init_chunk->chunk_hdr, + ntohs(init_chunk->chunk_hdr->length)); + + /* Copy the raw local address list of the association. */ + memcpy((__u8 *)&cookie->c.peer_init[0] + + ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len); + + if (sctp_sk(ep->base.sk)->hmac) { + struct hash_desc desc; + + /* Sign the message. */ + sg_init_one(&sg, &cookie->c, bodysize); + keylen = SCTP_SECRET_SIZE; + key = (char *)ep->secret_key[ep->current_key]; + desc.tfm = sctp_sk(ep->base.sk)->hmac; + desc.flags = 0; + + if (crypto_hash_setkey(desc.tfm, key, keylen) || + crypto_hash_digest(&desc, &sg, bodysize, cookie->signature)) + goto free_cookie; + } + + return retval; + +free_cookie: + kfree(retval); +nodata: + *cookie_len = 0; + return NULL; +} + +/* Unpack the cookie from COOKIE ECHO chunk, recreating the association. */ +struct sctp_association *sctp_unpack_cookie( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + struct sctp_chunk *chunk, gfp_t gfp, + int *error, struct sctp_chunk **errp) +{ + struct sctp_association *retval = NULL; + struct sctp_signed_cookie *cookie; + struct sctp_cookie *bear_cookie; + int headersize, bodysize, fixed_size; + __u8 *digest = ep->digest; + struct scatterlist sg; + unsigned int keylen, len; + char *key; + sctp_scope_t scope; + struct sk_buff *skb = chunk->skb; + struct timeval tv; + struct hash_desc desc; + + /* Header size is static data prior to the actual cookie, including + * any padding. + */ + headersize = sizeof(sctp_chunkhdr_t) + + (sizeof(struct sctp_signed_cookie) - + sizeof(struct sctp_cookie)); + bodysize = ntohs(chunk->chunk_hdr->length) - headersize; + fixed_size = headersize + sizeof(struct sctp_cookie); + + /* Verify that the chunk looks like it even has a cookie. + * There must be enough room for our cookie and our peer's + * INIT chunk. + */ + len = ntohs(chunk->chunk_hdr->length); + if (len < fixed_size + sizeof(struct sctp_chunkhdr)) + goto malformed; + + /* Verify that the cookie has been padded out. */ + if (bodysize % SCTP_COOKIE_MULTIPLE) + goto malformed; + + /* Process the cookie. 
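+ * The checks below run in this order: verify the HMAC signature
+ * with the current secret key and, failing that, the previous one;
+ * compare the verification tag and port numbers against the values
+ * baked into the cookie; check the cookie for staleness; only then
+ * is a new association rebuilt from the embedded sctp_cookie.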
*/ + cookie = chunk->subh.cookie_hdr; + bear_cookie = &cookie->c; + + if (!sctp_sk(ep->base.sk)->hmac) + goto no_hmac; + + /* Check the signature. */ + keylen = SCTP_SECRET_SIZE; + sg_init_one(&sg, bear_cookie, bodysize); + key = (char *)ep->secret_key[ep->current_key]; + desc.tfm = sctp_sk(ep->base.sk)->hmac; + desc.flags = 0; + + memset(digest, 0x00, SCTP_SIGNATURE_SIZE); + if (crypto_hash_setkey(desc.tfm, key, keylen) || + crypto_hash_digest(&desc, &sg, bodysize, digest)) { + *error = -SCTP_IERROR_NOMEM; + goto fail; + } + + if (memcmp(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) { + /* Try the previous key. */ + key = (char *)ep->secret_key[ep->last_key]; + memset(digest, 0x00, SCTP_SIGNATURE_SIZE); + if (crypto_hash_setkey(desc.tfm, key, keylen) || + crypto_hash_digest(&desc, &sg, bodysize, digest)) { + *error = -SCTP_IERROR_NOMEM; + goto fail; + } + + if (memcmp(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) { + /* Yikes! Still bad signature! */ + *error = -SCTP_IERROR_BAD_SIG; + goto fail; + } + } + +no_hmac: + /* IG Section 2.35.2: + * 3) Compare the port numbers and the verification tag contained + * within the COOKIE ECHO chunk to the actual port numbers and the + * verification tag within the SCTP common header of the received + * packet. If these values do not match the packet MUST be silently + * discarded, + */ + if (ntohl(chunk->sctp_hdr->vtag) != bear_cookie->my_vtag) { + *error = -SCTP_IERROR_BAD_TAG; + goto fail; + } + + if (chunk->sctp_hdr->source != bear_cookie->peer_addr.v4.sin_port || + ntohs(chunk->sctp_hdr->dest) != bear_cookie->my_port) { + *error = -SCTP_IERROR_BAD_PORTS; + goto fail; + } + + /* Check to see if the cookie is stale. If there is already + * an association, there is no need to check cookie's expiration + * for init collision case of lost COOKIE ACK. + * If skb has been timestamped, then use the stamp, otherwise + * use current time. This introduces a small possibility that + * that a cookie may be considered expired, but his would only slow + * down the new association establishment instead of every packet. + */ + if (sock_flag(ep->base.sk, SOCK_TIMESTAMP)) + skb_get_timestamp(skb, &tv); + else + do_gettimeofday(&tv); + + if (!asoc && tv_lt(bear_cookie->expiration, tv)) { + /* + * Section 3.3.10.3 Stale Cookie Error (3) + * + * Cause of error + * --------------- + * Stale Cookie Error: Indicates the receipt of a valid State + * Cookie that has expired. + */ + len = ntohs(chunk->chunk_hdr->length); + *errp = sctp_make_op_error_space(asoc, chunk, len); + if (*errp) { + suseconds_t usecs = (tv.tv_sec - + bear_cookie->expiration.tv_sec) * 1000000L + + tv.tv_usec - bear_cookie->expiration.tv_usec; + __be32 n = htonl(usecs); + + sctp_init_cause(*errp, SCTP_ERROR_STALE_COOKIE, + sizeof(n)); + sctp_addto_chunk(*errp, sizeof(n), &n); + *error = -SCTP_IERROR_STALE_COOKIE; + } else + *error = -SCTP_IERROR_NOMEM; + + goto fail; + } + + /* Make a new base association. */ + scope = sctp_scope(sctp_source(chunk)); + retval = sctp_association_new(ep, ep->base.sk, scope, gfp); + if (!retval) { + *error = -SCTP_IERROR_NOMEM; + goto fail; + } + + /* Set up our peer's port number. */ + retval->peer.port = ntohs(chunk->sctp_hdr->source); + + /* Populate the association from the cookie. */ + memcpy(&retval->c, bear_cookie, sizeof(*bear_cookie)); + + if (sctp_assoc_set_bind_addr_from_cookie(retval, bear_cookie, + GFP_ATOMIC) < 0) { + *error = -SCTP_IERROR_NOMEM; + goto fail; + } + + /* Also, add the destination address. 
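+ * If the cookie did not yield any local bind addresses, fall back
+ * to the address the COOKIE ECHO was actually delivered to.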
*/ + if (list_empty(&retval->base.bind_addr.address_list)) { + sctp_add_bind_addr(&retval->base.bind_addr, &chunk->dest, + SCTP_ADDR_SRC, GFP_ATOMIC); + } + + retval->next_tsn = retval->c.initial_tsn; + retval->ctsn_ack_point = retval->next_tsn - 1; + retval->addip_serial = retval->c.initial_tsn; + retval->adv_peer_ack_point = retval->ctsn_ack_point; + retval->peer.prsctp_capable = retval->c.prsctp_capable; + retval->peer.adaptation_ind = retval->c.adaptation_ind; + + /* The INIT stuff will be done by the side effects. */ + return retval; + +fail: + if (retval) + sctp_association_free(retval); + + return NULL; + +malformed: + /* Yikes! The packet is either corrupt or deliberately + * malformed. + */ + *error = -SCTP_IERROR_MALFORMED; + goto fail; +} + +/******************************************************************** + * 3rd Level Abstractions + ********************************************************************/ + +struct __sctp_missing { + __be32 num_missing; + __be16 type; +} __packed; + +/* + * Report a missing mandatory parameter. + */ +static int sctp_process_missing_param(const struct sctp_association *asoc, + sctp_param_t paramtype, + struct sctp_chunk *chunk, + struct sctp_chunk **errp) +{ + struct __sctp_missing report; + __u16 len; + + len = WORD_ROUND(sizeof(report)); + + /* Make an ERROR chunk, preparing enough room for + * returning multiple unknown parameters. + */ + if (!*errp) + *errp = sctp_make_op_error_space(asoc, chunk, len); + + if (*errp) { + report.num_missing = htonl(1); + report.type = paramtype; + sctp_init_cause(*errp, SCTP_ERROR_MISS_PARAM, + sizeof(report)); + sctp_addto_chunk(*errp, sizeof(report), &report); + } + + /* Stop processing this chunk. */ + return 0; +} + +/* Report an Invalid Mandatory Parameter. */ +static int sctp_process_inv_mandatory(const struct sctp_association *asoc, + struct sctp_chunk *chunk, + struct sctp_chunk **errp) +{ + /* Invalid Mandatory Parameter Error has no payload. */ + + if (!*errp) + *errp = sctp_make_op_error_space(asoc, chunk, 0); + + if (*errp) + sctp_init_cause(*errp, SCTP_ERROR_INV_PARAM, 0); + + /* Stop processing this chunk. */ + return 0; +} + +static int sctp_process_inv_paramlength(const struct sctp_association *asoc, + struct sctp_paramhdr *param, + const struct sctp_chunk *chunk, + struct sctp_chunk **errp) +{ + /* This is a fatal error. Any accumulated non-fatal errors are + * not reported. + */ + if (*errp) + sctp_chunk_free(*errp); + + /* Create an error chunk and fill it in with our payload. */ + *errp = sctp_make_violation_paramlen(asoc, chunk, param); + + return 0; +} + + +/* Do not attempt to handle the HOST_NAME parm. However, do + * send back an indicator to the peer. + */ +static int sctp_process_hn_param(const struct sctp_association *asoc, + union sctp_params param, + struct sctp_chunk *chunk, + struct sctp_chunk **errp) +{ + __u16 len = ntohs(param.p->length); + + /* Processing of the HOST_NAME parameter will generate an + * ABORT. If we've accumulated any non-fatal errors, they + * would be unrecognized parameters and we should not include + * them in the ABORT. + */ + if (*errp) + sctp_chunk_free(*errp); + + *errp = sctp_make_op_error_space(asoc, chunk, len); + + if (*errp) { + sctp_init_cause(*errp, SCTP_ERROR_DNS_FAILED, len); + sctp_addto_chunk(*errp, len, param.v); + } + + /* Stop processing this chunk. 
*/ + return 0; +} + +static int sctp_verify_ext_param(union sctp_params param) +{ + __u16 num_ext = ntohs(param.p->length) - sizeof(sctp_paramhdr_t); + int have_auth = 0; + int have_asconf = 0; + int i; + + for (i = 0; i < num_ext; i++) { + switch (param.ext->chunks[i]) { + case SCTP_CID_AUTH: + have_auth = 1; + break; + case SCTP_CID_ASCONF: + case SCTP_CID_ASCONF_ACK: + have_asconf = 1; + break; + } + } + + /* ADD-IP Security: The draft requires us to ABORT or ignore the + * INIT/INIT-ACK if ADD-IP is listed, but AUTH is not. Do this + * only if ADD-IP is turned on and we are not backward-compatible + * mode. + */ + if (sctp_addip_noauth) + return 1; + + if (sctp_addip_enable && !have_auth && have_asconf) + return 0; + + return 1; +} + +static void sctp_process_ext_param(struct sctp_association *asoc, + union sctp_params param) +{ + __u16 num_ext = ntohs(param.p->length) - sizeof(sctp_paramhdr_t); + int i; + + for (i = 0; i < num_ext; i++) { + switch (param.ext->chunks[i]) { + case SCTP_CID_FWD_TSN: + if (sctp_prsctp_enable && + !asoc->peer.prsctp_capable) + asoc->peer.prsctp_capable = 1; + break; + case SCTP_CID_AUTH: + /* if the peer reports AUTH, assume that he + * supports AUTH. + */ + if (sctp_auth_enable) + asoc->peer.auth_capable = 1; + break; + case SCTP_CID_ASCONF: + case SCTP_CID_ASCONF_ACK: + if (sctp_addip_enable) + asoc->peer.asconf_capable = 1; + break; + default: + break; + } + } +} + +/* RFC 3.2.1 & the Implementers Guide 2.2. + * + * The Parameter Types are encoded such that the + * highest-order two bits specify the action that must be + * taken if the processing endpoint does not recognize the + * Parameter Type. + * + * 00 - Stop processing this parameter; do not process any further + * parameters within this chunk + * + * 01 - Stop processing this parameter, do not process any further + * parameters within this chunk, and report the unrecognized + * parameter in an 'Unrecognized Parameter' ERROR chunk. + * + * 10 - Skip this parameter and continue processing. + * + * 11 - Skip this parameter and continue processing but + * report the unrecognized parameter in an + * 'Unrecognized Parameter' ERROR chunk. + * + * Return value: + * SCTP_IERROR_NO_ERROR - continue with the chunk + * SCTP_IERROR_ERROR - stop and report an error. + * SCTP_IERROR_NOMEME - out of memory. + */ +static sctp_ierror_t sctp_process_unk_param(const struct sctp_association *asoc, + union sctp_params param, + struct sctp_chunk *chunk, + struct sctp_chunk **errp) +{ + int retval = SCTP_IERROR_NO_ERROR; + + switch (param.p->type & SCTP_PARAM_ACTION_MASK) { + case SCTP_PARAM_ACTION_DISCARD: + retval = SCTP_IERROR_ERROR; + break; + case SCTP_PARAM_ACTION_SKIP: + break; + case SCTP_PARAM_ACTION_DISCARD_ERR: + retval = SCTP_IERROR_ERROR; + /* Fall through */ + case SCTP_PARAM_ACTION_SKIP_ERR: + /* Make an ERROR chunk, preparing enough room for + * returning multiple unknown parameters. + */ + if (NULL == *errp) + *errp = sctp_make_op_error_fixed(asoc, chunk); + + if (*errp) { + if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM, + WORD_ROUND(ntohs(param.p->length)))) + sctp_addto_chunk_fixed(*errp, + WORD_ROUND(ntohs(param.p->length)), + param.v); + } else { + /* If there is no memory for generating the ERROR + * report as specified, an ABORT will be triggered + * to the peer and the association won't be + * established. 
+ */ + retval = SCTP_IERROR_NOMEM; + } + break; + default: + break; + } + + return retval; +} + +/* Verify variable length parameters + * Return values: + * SCTP_IERROR_ABORT - trigger an ABORT + * SCTP_IERROR_NOMEM - out of memory (abort) + * SCTP_IERROR_ERROR - stop processing, trigger an ERROR + * SCTP_IERROR_NO_ERROR - continue with the chunk + */ +static sctp_ierror_t sctp_verify_param(const struct sctp_association *asoc, + union sctp_params param, + sctp_cid_t cid, + struct sctp_chunk *chunk, + struct sctp_chunk **err_chunk) +{ + struct sctp_hmac_algo_param *hmacs; + int retval = SCTP_IERROR_NO_ERROR; + __u16 n_elt, id = 0; + int i; + + /* FIXME - This routine is not looking at each parameter per the + * chunk type, i.e., unrecognized parameters should be further + * identified based on the chunk id. + */ + + switch (param.p->type) { + case SCTP_PARAM_IPV4_ADDRESS: + case SCTP_PARAM_IPV6_ADDRESS: + case SCTP_PARAM_COOKIE_PRESERVATIVE: + case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES: + case SCTP_PARAM_STATE_COOKIE: + case SCTP_PARAM_HEARTBEAT_INFO: + case SCTP_PARAM_UNRECOGNIZED_PARAMETERS: + case SCTP_PARAM_ECN_CAPABLE: + case SCTP_PARAM_ADAPTATION_LAYER_IND: + break; + + case SCTP_PARAM_SUPPORTED_EXT: + if (!sctp_verify_ext_param(param)) + return SCTP_IERROR_ABORT; + break; + + case SCTP_PARAM_SET_PRIMARY: + if (sctp_addip_enable) + break; + goto fallthrough; + + case SCTP_PARAM_HOST_NAME_ADDRESS: + /* Tell the peer, we won't support this param. */ + sctp_process_hn_param(asoc, param, chunk, err_chunk); + retval = SCTP_IERROR_ABORT; + break; + + case SCTP_PARAM_FWD_TSN_SUPPORT: + if (sctp_prsctp_enable) + break; + goto fallthrough; + + case SCTP_PARAM_RANDOM: + if (!sctp_auth_enable) + goto fallthrough; + + /* SCTP-AUTH: Secion 6.1 + * If the random number is not 32 byte long the association + * MUST be aborted. The ABORT chunk SHOULD contain the error + * cause 'Protocol Violation'. + */ + if (SCTP_AUTH_RANDOM_LENGTH != + ntohs(param.p->length) - sizeof(sctp_paramhdr_t)) { + sctp_process_inv_paramlength(asoc, param.p, + chunk, err_chunk); + retval = SCTP_IERROR_ABORT; + } + break; + + case SCTP_PARAM_CHUNKS: + if (!sctp_auth_enable) + goto fallthrough; + + /* SCTP-AUTH: Section 3.2 + * The CHUNKS parameter MUST be included once in the INIT or + * INIT-ACK chunk if the sender wants to receive authenticated + * chunks. Its maximum length is 260 bytes. + */ + if (260 < ntohs(param.p->length)) { + sctp_process_inv_paramlength(asoc, param.p, + chunk, err_chunk); + retval = SCTP_IERROR_ABORT; + } + break; + + case SCTP_PARAM_HMAC_ALGO: + if (!sctp_auth_enable) + goto fallthrough; + + hmacs = (struct sctp_hmac_algo_param *)param.p; + n_elt = (ntohs(param.p->length) - sizeof(sctp_paramhdr_t)) >> 1; + + /* SCTP-AUTH: Section 6.1 + * The HMAC algorithm based on SHA-1 MUST be supported and + * included in the HMAC-ALGO parameter. + */ + for (i = 0; i < n_elt; i++) { + id = ntohs(hmacs->hmac_ids[i]); + + if (id == SCTP_AUTH_HMAC_ID_SHA1) + break; + } + + if (id != SCTP_AUTH_HMAC_ID_SHA1) { + sctp_process_inv_paramlength(asoc, param.p, chunk, + err_chunk); + retval = SCTP_IERROR_ABORT; + } + break; +fallthrough: + default: + SCTP_DEBUG_PRINTK("Unrecognized param: %d for chunk %d.\n", + ntohs(param.p->type), cid); + retval = sctp_process_unk_param(asoc, param, chunk, err_chunk); + break; + } + return retval; +} + +/* Verify the INIT packet before we process it. 
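+ * Returns 1 when processing may continue (a non-fatal ERROR chunk
+ * may have been queued in *errp), and 0 when the INIT/INIT ACK must
+ * be rejected: zero stream counts, a zero init tag, a too-small
+ * a_rwnd, a bad parameter length, a missing state cookie in an
+ * INIT ACK, or a fatal/out-of-memory parameter error.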
*/ +int sctp_verify_init(const struct sctp_association *asoc, + sctp_cid_t cid, + sctp_init_chunk_t *peer_init, + struct sctp_chunk *chunk, + struct sctp_chunk **errp) +{ + union sctp_params param; + int has_cookie = 0; + int result; + + /* Verify stream values are non-zero. */ + if ((0 == peer_init->init_hdr.num_outbound_streams) || + (0 == peer_init->init_hdr.num_inbound_streams) || + (0 == peer_init->init_hdr.init_tag) || + (SCTP_DEFAULT_MINWINDOW > ntohl(peer_init->init_hdr.a_rwnd))) { + + return sctp_process_inv_mandatory(asoc, chunk, errp); + } + + /* Check for missing mandatory parameters. */ + sctp_walk_params(param, peer_init, init_hdr.params) { + + if (SCTP_PARAM_STATE_COOKIE == param.p->type) + has_cookie = 1; + + } /* for (loop through all parameters) */ + + /* There is a possibility that a parameter length was bad and + * in that case we would have stoped walking the parameters. + * The current param.p would point at the bad one. + * Current consensus on the mailing list is to generate a PROTOCOL + * VIOLATION error. We build the ERROR chunk here and let the normal + * error handling code build and send the packet. + */ + if (param.v != (void*)chunk->chunk_end) + return sctp_process_inv_paramlength(asoc, param.p, chunk, errp); + + /* The only missing mandatory param possible today is + * the state cookie for an INIT-ACK chunk. + */ + if ((SCTP_CID_INIT_ACK == cid) && !has_cookie) + return sctp_process_missing_param(asoc, SCTP_PARAM_STATE_COOKIE, + chunk, errp); + + /* Verify all the variable length parameters */ + sctp_walk_params(param, peer_init, init_hdr.params) { + + result = sctp_verify_param(asoc, param, cid, chunk, errp); + switch (result) { + case SCTP_IERROR_ABORT: + case SCTP_IERROR_NOMEM: + return 0; + case SCTP_IERROR_ERROR: + return 1; + case SCTP_IERROR_NO_ERROR: + default: + break; + } + + } /* for (loop through all parameters) */ + + return 1; +} + +/* Unpack the parameters in an INIT packet into an association. + * Returns 0 on failure, else success. + * FIXME: This is an association method. + */ +int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, + const union sctp_addr *peer_addr, + sctp_init_chunk_t *peer_init, gfp_t gfp) +{ + union sctp_params param; + struct sctp_transport *transport; + struct list_head *pos, *temp; + struct sctp_af *af; + union sctp_addr addr; + char *cookie; + int src_match = 0; + + /* We must include the address that the INIT packet came from. + * This is the only address that matters for an INIT packet. + * When processing a COOKIE ECHO, we retrieve the from address + * of the INIT from the cookie. + */ + + /* This implementation defaults to making the first transport + * added as the primary transport. The source address seems to + * be a a better choice than any of the embedded addresses. + */ + if(!sctp_assoc_add_peer(asoc, peer_addr, gfp, SCTP_ACTIVE)) + goto nomem; + + if (sctp_cmp_addr_exact(sctp_source(chunk), peer_addr)) + src_match = 1; + + /* Process the initialization parameters. 
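+ * Each parameter is handed to sctp_process_param(); address
+ * parameters are additionally compared against the packet's source
+ * so we can confirm the peer listed the address it actually sent
+ * from (src_match).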
*/ + sctp_walk_params(param, peer_init, init_hdr.params) { + if (!src_match && (param.p->type == SCTP_PARAM_IPV4_ADDRESS || + param.p->type == SCTP_PARAM_IPV6_ADDRESS)) { + af = sctp_get_af_specific(param_type2af(param.p->type)); + af->from_addr_param(&addr, param.addr, + chunk->sctp_hdr->source, 0); + if (sctp_cmp_addr_exact(sctp_source(chunk), &addr)) + src_match = 1; + } + + if (!sctp_process_param(asoc, param, peer_addr, gfp)) + goto clean_up; + } + + /* source address of chunk may not match any valid address */ + if (!src_match) + goto clean_up; + + /* AUTH: After processing the parameters, make sure that we + * have all the required info to potentially do authentications. + */ + if (asoc->peer.auth_capable && (!asoc->peer.peer_random || + !asoc->peer.peer_hmacs)) + asoc->peer.auth_capable = 0; + + /* In a non-backward compatible mode, if the peer claims + * support for ADD-IP but not AUTH, the ADD-IP spec states + * that we MUST ABORT the association. Section 6. The section + * also give us an option to silently ignore the packet, which + * is what we'll do here. + */ + if (!sctp_addip_noauth && + (asoc->peer.asconf_capable && !asoc->peer.auth_capable)) { + asoc->peer.addip_disabled_mask |= (SCTP_PARAM_ADD_IP | + SCTP_PARAM_DEL_IP | + SCTP_PARAM_SET_PRIMARY); + asoc->peer.asconf_capable = 0; + goto clean_up; + } + + /* Walk list of transports, removing transports in the UNKNOWN state. */ + list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { + transport = list_entry(pos, struct sctp_transport, transports); + if (transport->state == SCTP_UNKNOWN) { + sctp_assoc_rm_peer(asoc, transport); + } + } + + /* The fixed INIT headers are always in network byte + * order. + */ + asoc->peer.i.init_tag = + ntohl(peer_init->init_hdr.init_tag); + asoc->peer.i.a_rwnd = + ntohl(peer_init->init_hdr.a_rwnd); + asoc->peer.i.num_outbound_streams = + ntohs(peer_init->init_hdr.num_outbound_streams); + asoc->peer.i.num_inbound_streams = + ntohs(peer_init->init_hdr.num_inbound_streams); + asoc->peer.i.initial_tsn = + ntohl(peer_init->init_hdr.initial_tsn); + + /* Apply the upper bounds for output streams based on peer's + * number of inbound streams. + */ + if (asoc->c.sinit_num_ostreams > + ntohs(peer_init->init_hdr.num_inbound_streams)) { + asoc->c.sinit_num_ostreams = + ntohs(peer_init->init_hdr.num_inbound_streams); + } + + if (asoc->c.sinit_max_instreams > + ntohs(peer_init->init_hdr.num_outbound_streams)) { + asoc->c.sinit_max_instreams = + ntohs(peer_init->init_hdr.num_outbound_streams); + } + + /* Copy Initiation tag from INIT to VT_peer in cookie. */ + asoc->c.peer_vtag = asoc->peer.i.init_tag; + + /* Peer Rwnd : Current calculated value of the peer's rwnd. */ + asoc->peer.rwnd = asoc->peer.i.a_rwnd; + + /* Copy cookie in case we need to resend COOKIE-ECHO. */ + cookie = asoc->peer.cookie; + if (cookie) { + asoc->peer.cookie = kmemdup(cookie, asoc->peer.cookie_len, gfp); + if (!asoc->peer.cookie) + goto clean_up; + } + + /* RFC 2960 7.2.1 The initial value of ssthresh MAY be arbitrarily + * high (for example, implementations MAY use the size of the receiver + * advertised window). + */ + list_for_each_entry(transport, &asoc->peer.transport_addr_list, + transports) { + transport->ssthresh = asoc->peer.i.a_rwnd; + } + + /* Set up the TSN tracking pieces. 
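+ * The peer's tsn_map is seeded with the peer's initial TSN so that
+ * SACK generation (sctp_make_sack() above) has a base from which to
+ * compute the cumulative TSN ack and gap ack blocks.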
*/ + if (!sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, + asoc->peer.i.initial_tsn, gfp)) + goto clean_up; + + /* RFC 2960 6.5 Stream Identifier and Stream Sequence Number + * + * The stream sequence number in all the streams shall start + * from 0 when the association is established. Also, when the + * stream sequence number reaches the value 65535 the next + * stream sequence number shall be set to 0. + */ + + /* Allocate storage for the negotiated streams if it is not a temporary + * association. + */ + if (!asoc->temp) { + int error; + + asoc->ssnmap = sctp_ssnmap_new(asoc->c.sinit_max_instreams, + asoc->c.sinit_num_ostreams, gfp); + if (!asoc->ssnmap) + goto clean_up; + + error = sctp_assoc_set_id(asoc, gfp); + if (error) + goto clean_up; + } + + /* ADDIP Section 4.1 ASCONF Chunk Procedures + * + * When an endpoint has an ASCONF signaled change to be sent to the + * remote endpoint it should do the following: + * ... + * A2) A serial number should be assigned to the Chunk. The serial + * number should be a monotonically increasing number. All serial + * numbers are defined to be initialized at the start of the + * association to the same value as the Initial TSN. + */ + asoc->peer.addip_serial = asoc->peer.i.initial_tsn - 1; + return 1; + +clean_up: + /* Release the transport structures. */ + list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { + transport = list_entry(pos, struct sctp_transport, transports); + if (transport->state != SCTP_ACTIVE) + sctp_assoc_rm_peer(asoc, transport); + } + +nomem: + return 0; +} + + +/* Update asoc with the option described in param. + * + * RFC2960 3.3.2.1 Optional/Variable Length Parameters in INIT + * + * asoc is the association to update. + * param is the variable length parameter to use for update. + * cid tells us if this is an INIT, INIT ACK or COOKIE ECHO. + * If the current packet is an INIT we want to minimize the amount of + * work we do. In particular, we should not build transport + * structures for the addresses. + */ +static int sctp_process_param(struct sctp_association *asoc, + union sctp_params param, + const union sctp_addr *peer_addr, + gfp_t gfp) +{ + union sctp_addr addr; + int i; + __u16 sat; + int retval = 1; + sctp_scope_t scope; + time_t stale; + struct sctp_af *af; + union sctp_addr_param *addr_param; + struct sctp_transport *t; + + /* We maintain all INIT parameters in network byte order all the + * time. This allows us to not worry about whether the parameters + * came from a fresh INIT, and INIT ACK, or were stored in a cookie. + */ + switch (param.p->type) { + case SCTP_PARAM_IPV6_ADDRESS: + if (PF_INET6 != asoc->base.sk->sk_family) + break; + goto do_addr_param; + + case SCTP_PARAM_IPV4_ADDRESS: + /* v4 addresses are not allowed on v6-only socket */ + if (ipv6_only_sock(asoc->base.sk)) + break; +do_addr_param: + af = sctp_get_af_specific(param_type2af(param.p->type)); + af->from_addr_param(&addr, param.addr, htons(asoc->peer.port), 0); + scope = sctp_scope(peer_addr); + if (sctp_in_scope(&addr, scope)) + if (!sctp_assoc_add_peer(asoc, &addr, gfp, SCTP_UNCONFIRMED)) + return 0; + break; + + case SCTP_PARAM_COOKIE_PRESERVATIVE: + if (!sctp_cookie_preserve_enable) + break; + + stale = ntohl(param.life->lifespan_increment); + + /* Suggested Cookie Life span increment's unit is msec, + * (1/1000sec). 
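+ * For example, a lifespan increment of 2500 adds 2 seconds and
+ * 500000 microseconds to asoc->cookie_life below.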
+ */ + asoc->cookie_life.tv_sec += stale / 1000; + asoc->cookie_life.tv_usec += (stale % 1000) * 1000; + break; + + case SCTP_PARAM_HOST_NAME_ADDRESS: + SCTP_DEBUG_PRINTK("unimplemented SCTP_HOST_NAME_ADDRESS\n"); + break; + + case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES: + /* Turn off the default values first so we'll know which + * ones are really set by the peer. + */ + asoc->peer.ipv4_address = 0; + asoc->peer.ipv6_address = 0; + + /* Assume that peer supports the address family + * by which it sends a packet. + */ + if (peer_addr->sa.sa_family == AF_INET6) + asoc->peer.ipv6_address = 1; + else if (peer_addr->sa.sa_family == AF_INET) + asoc->peer.ipv4_address = 1; + + /* Cycle through address types; avoid divide by 0. */ + sat = ntohs(param.p->length) - sizeof(sctp_paramhdr_t); + if (sat) + sat /= sizeof(__u16); + + for (i = 0; i < sat; ++i) { + switch (param.sat->types[i]) { + case SCTP_PARAM_IPV4_ADDRESS: + asoc->peer.ipv4_address = 1; + break; + + case SCTP_PARAM_IPV6_ADDRESS: + if (PF_INET6 == asoc->base.sk->sk_family) + asoc->peer.ipv6_address = 1; + break; + + case SCTP_PARAM_HOST_NAME_ADDRESS: + asoc->peer.hostname_address = 1; + break; + + default: /* Just ignore anything else. */ + break; + } + } + break; + + case SCTP_PARAM_STATE_COOKIE: + asoc->peer.cookie_len = + ntohs(param.p->length) - sizeof(sctp_paramhdr_t); + asoc->peer.cookie = param.cookie->body; + break; + + case SCTP_PARAM_HEARTBEAT_INFO: + /* Would be odd to receive, but it causes no problems. */ + break; + + case SCTP_PARAM_UNRECOGNIZED_PARAMETERS: + /* Rejected during verify stage. */ + break; + + case SCTP_PARAM_ECN_CAPABLE: + asoc->peer.ecn_capable = 1; + break; + + case SCTP_PARAM_ADAPTATION_LAYER_IND: + asoc->peer.adaptation_ind = ntohl(param.aind->adaptation_ind); + break; + + case SCTP_PARAM_SET_PRIMARY: + if (!sctp_addip_enable) + goto fall_through; + + addr_param = param.v + sizeof(sctp_addip_param_t); + + af = sctp_get_af_specific(param_type2af(param.p->type)); + af->from_addr_param(&addr, addr_param, + htons(asoc->peer.port), 0); + + /* if the address is invalid, we can't process it. + * XXX: see spec for what to do. + */ + if (!af->addr_valid(&addr, NULL, NULL)) + break; + + t = sctp_assoc_lookup_paddr(asoc, &addr); + if (!t) + break; + + sctp_assoc_set_primary(asoc, t); + break; + + case SCTP_PARAM_SUPPORTED_EXT: + sctp_process_ext_param(asoc, param); + break; + + case SCTP_PARAM_FWD_TSN_SUPPORT: + if (sctp_prsctp_enable) { + asoc->peer.prsctp_capable = 1; + break; + } + /* Fall Through */ + goto fall_through; + + case SCTP_PARAM_RANDOM: + if (!sctp_auth_enable) + goto fall_through; + + /* Save peer's random parameter */ + asoc->peer.peer_random = kmemdup(param.p, + ntohs(param.p->length), gfp); + if (!asoc->peer.peer_random) { + retval = 0; + break; + } + break; + + case SCTP_PARAM_HMAC_ALGO: + if (!sctp_auth_enable) + goto fall_through; + + /* Save peer's HMAC list */ + asoc->peer.peer_hmacs = kmemdup(param.p, + ntohs(param.p->length), gfp); + if (!asoc->peer.peer_hmacs) { + retval = 0; + break; + } + + /* Set the default HMAC the peer requested*/ + sctp_auth_asoc_set_default_hmac(asoc, param.hmac_algo); + break; + + case SCTP_PARAM_CHUNKS: + if (!sctp_auth_enable) + goto fall_through; + + asoc->peer.peer_chunks = kmemdup(param.p, + ntohs(param.p->length), gfp); + if (!asoc->peer.peer_chunks) + retval = 0; + break; +fall_through: + default: + /* Any unrecognized parameters should have been caught + * and handled by sctp_verify_param() which should be + * called prior to this routine. 
Simply log the error + * here. + */ + SCTP_DEBUG_PRINTK("Ignoring param: %d for association %p.\n", + ntohs(param.p->type), asoc); + break; + } + + return retval; +} + +/* Select a new verification tag. */ +__u32 sctp_generate_tag(const struct sctp_endpoint *ep) +{ + /* I believe that this random number generator complies with RFC1750. + * A tag of 0 is reserved for special cases (e.g. INIT). + */ + __u32 x; + + do { + get_random_bytes(&x, sizeof(__u32)); + } while (x == 0); + + return x; +} + +/* Select an initial TSN to send during startup. */ +__u32 sctp_generate_tsn(const struct sctp_endpoint *ep) +{ + __u32 retval; + + get_random_bytes(&retval, sizeof(__u32)); + return retval; +} + +/* + * ADDIP 3.1.1 Address Configuration Change Chunk (ASCONF) + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Type = 0xC1 | Chunk Flags | Chunk Length | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Serial Number | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Address Parameter | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | ASCONF Parameter #1 | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * \ \ + * / .... / + * \ \ + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | ASCONF Parameter #N | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Address Parameter and other parameter will not be wrapped in this function + */ +static struct sctp_chunk *sctp_make_asconf(struct sctp_association *asoc, + union sctp_addr *addr, + int vparam_len) +{ + sctp_addiphdr_t asconf; + struct sctp_chunk *retval; + int length = sizeof(asconf) + vparam_len; + union sctp_addr_param addrparam; + int addrlen; + struct sctp_af *af = sctp_get_af_specific(addr->v4.sin_family); + + addrlen = af->to_addr_param(addr, &addrparam); + if (!addrlen) + return NULL; + length += addrlen; + + /* Create the chunk. 
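+ * The chunk is sized for the addip header, the Address Parameter
+ * and vparam_len bytes of ASCONF parameter TLVs; the parameters
+ * themselves are appended by the callers below
+ * (sctp_make_asconf_update_ip, sctp_make_asconf_set_prim).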
*/ + retval = sctp_make_chunk(asoc, SCTP_CID_ASCONF, 0, length); + if (!retval) + return NULL; + + asconf.serial = htonl(asoc->addip_serial++); + + retval->subh.addip_hdr = + sctp_addto_chunk(retval, sizeof(asconf), &asconf); + retval->param_hdr.v = + sctp_addto_chunk(retval, addrlen, &addrparam); + + return retval; +} + +/* ADDIP + * 3.2.1 Add IP Address + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Type = 0xC001 | Length = Variable | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | ASCONF-Request Correlation ID | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Address Parameter | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * 3.2.2 Delete IP Address + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Type = 0xC002 | Length = Variable | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | ASCONF-Request Correlation ID | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Address Parameter | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + */ +struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc, + union sctp_addr *laddr, + struct sockaddr *addrs, + int addrcnt, + __be16 flags) +{ + sctp_addip_param_t param; + struct sctp_chunk *retval; + union sctp_addr_param addr_param; + union sctp_addr *addr; + void *addr_buf; + struct sctp_af *af; + int paramlen = sizeof(param); + int addr_param_len = 0; + int totallen = 0; + int i; + + /* Get total length of all the address parameters. */ + addr_buf = addrs; + for (i = 0; i < addrcnt; i++) { + addr = (union sctp_addr *)addr_buf; + af = sctp_get_af_specific(addr->v4.sin_family); + addr_param_len = af->to_addr_param(addr, &addr_param); + + totallen += paramlen; + totallen += addr_param_len; + + addr_buf += af->sockaddr_len; + } + + /* Create an asconf chunk with the required length. */ + retval = sctp_make_asconf(asoc, laddr, totallen); + if (!retval) + return NULL; + + /* Add the address parameters to the asconf chunk. */ + addr_buf = addrs; + for (i = 0; i < addrcnt; i++) { + addr = (union sctp_addr *)addr_buf; + af = sctp_get_af_specific(addr->v4.sin_family); + addr_param_len = af->to_addr_param(addr, &addr_param); + param.param_hdr.type = flags; + param.param_hdr.length = htons(paramlen + addr_param_len); + param.crr_id = i; + + sctp_addto_chunk(retval, paramlen, ¶m); + sctp_addto_chunk(retval, addr_param_len, &addr_param); + + addr_buf += af->sockaddr_len; + } + return retval; +} + +/* ADDIP + * 3.2.4 Set Primary IP Address + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Type =0xC004 | Length = Variable | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | ASCONF-Request Correlation ID | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Address Parameter | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Create an ASCONF chunk with Set Primary IP address parameter. 
+ */ +struct sctp_chunk *sctp_make_asconf_set_prim(struct sctp_association *asoc, + union sctp_addr *addr) +{ + sctp_addip_param_t param; + struct sctp_chunk *retval; + int len = sizeof(param); + union sctp_addr_param addrparam; + int addrlen; + struct sctp_af *af = sctp_get_af_specific(addr->v4.sin_family); + + addrlen = af->to_addr_param(addr, &addrparam); + if (!addrlen) + return NULL; + len += addrlen; + + /* Create the chunk and make asconf header. */ + retval = sctp_make_asconf(asoc, addr, len); + if (!retval) + return NULL; + + param.param_hdr.type = SCTP_PARAM_SET_PRIMARY; + param.param_hdr.length = htons(len); + param.crr_id = 0; + + sctp_addto_chunk(retval, sizeof(param), ¶m); + sctp_addto_chunk(retval, addrlen, &addrparam); + + return retval; +} + +/* ADDIP 3.1.2 Address Configuration Acknowledgement Chunk (ASCONF-ACK) + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Type = 0x80 | Chunk Flags | Chunk Length | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Serial Number | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | ASCONF Parameter Response#1 | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * \ \ + * / .... / + * \ \ + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | ASCONF Parameter Response#N | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Create an ASCONF_ACK chunk with enough space for the parameter responses. + */ +static struct sctp_chunk *sctp_make_asconf_ack(const struct sctp_association *asoc, + __u32 serial, int vparam_len) +{ + sctp_addiphdr_t asconf; + struct sctp_chunk *retval; + int length = sizeof(asconf) + vparam_len; + + /* Create the chunk. */ + retval = sctp_make_chunk(asoc, SCTP_CID_ASCONF_ACK, 0, length); + if (!retval) + return NULL; + + asconf.serial = htonl(serial); + + retval->subh.addip_hdr = + sctp_addto_chunk(retval, sizeof(asconf), &asconf); + + return retval; +} + +/* Add response parameters to an ASCONF_ACK chunk. */ +static void sctp_add_asconf_response(struct sctp_chunk *chunk, __be32 crr_id, + __be16 err_code, sctp_addip_param_t *asconf_param) +{ + sctp_addip_param_t ack_param; + sctp_errhdr_t err_param; + int asconf_param_len = 0; + int err_param_len = 0; + __be16 response_type; + + if (SCTP_ERROR_NO_ERROR == err_code) { + response_type = SCTP_PARAM_SUCCESS_REPORT; + } else { + response_type = SCTP_PARAM_ERR_CAUSE; + err_param_len = sizeof(err_param); + if (asconf_param) + asconf_param_len = + ntohs(asconf_param->param_hdr.length); + } + + /* Add Success Indication or Error Cause Indication parameter. */ + ack_param.param_hdr.type = response_type; + ack_param.param_hdr.length = htons(sizeof(ack_param) + + err_param_len + + asconf_param_len); + ack_param.crr_id = crr_id; + sctp_addto_chunk(chunk, sizeof(ack_param), &ack_param); + + if (SCTP_ERROR_NO_ERROR == err_code) + return; + + /* Add Error Cause parameter. */ + err_param.cause = err_code; + err_param.length = htons(err_param_len + asconf_param_len); + sctp_addto_chunk(chunk, err_param_len, &err_param); + + /* Add the failed TLV copied from ASCONF chunk. */ + if (asconf_param) + sctp_addto_chunk(chunk, asconf_param_len, asconf_param); +} + +/* Process a asconf parameter. 
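+ *
+ * After validating the embedded address parameter, ADD_IP adds a
+ * new (unconfirmed) transport, DEL_IP removes one (refusing to
+ * delete either the last remaining address or the packet's source
+ * address), and SET_PRIMARY switches the primary path.  A wildcard
+ * address (0.0.0.0 or ::) refers to the source address of the
+ * ASCONF packet.  Returns an SCTP error cause code, or
+ * SCTP_ERROR_NO_ERROR on success.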
*/ +static __be16 sctp_process_asconf_param(struct sctp_association *asoc, + struct sctp_chunk *asconf, + sctp_addip_param_t *asconf_param) +{ + struct sctp_transport *peer; + struct sctp_af *af; + union sctp_addr addr; + union sctp_addr_param *addr_param; + + addr_param = (union sctp_addr_param *) + ((void *)asconf_param + sizeof(sctp_addip_param_t)); + + if (asconf_param->param_hdr.type != SCTP_PARAM_ADD_IP && + asconf_param->param_hdr.type != SCTP_PARAM_DEL_IP && + asconf_param->param_hdr.type != SCTP_PARAM_SET_PRIMARY) + return SCTP_ERROR_UNKNOWN_PARAM; + + switch (addr_param->p.type) { + case SCTP_PARAM_IPV6_ADDRESS: + if (!asoc->peer.ipv6_address) + return SCTP_ERROR_DNS_FAILED; + break; + case SCTP_PARAM_IPV4_ADDRESS: + if (!asoc->peer.ipv4_address) + return SCTP_ERROR_DNS_FAILED; + break; + default: + return SCTP_ERROR_DNS_FAILED; + } + + af = sctp_get_af_specific(param_type2af(addr_param->p.type)); + if (unlikely(!af)) + return SCTP_ERROR_DNS_FAILED; + + af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0); + + /* ADDIP 4.2.1 This parameter MUST NOT contain a broadcast + * or multicast address. + * (note: wildcard is permitted and requires special handling so + * make sure we check for that) + */ + if (!af->is_any(&addr) && !af->addr_valid(&addr, NULL, asconf->skb)) + return SCTP_ERROR_DNS_FAILED; + + switch (asconf_param->param_hdr.type) { + case SCTP_PARAM_ADD_IP: + /* Section 4.2.1: + * If the address 0.0.0.0 or ::0 is provided, the source + * address of the packet MUST be added. + */ + if (af->is_any(&addr)) + memcpy(&addr, &asconf->source, sizeof(addr)); + + /* ADDIP 4.3 D9) If an endpoint receives an ADD IP address + * request and does not have the local resources to add this + * new address to the association, it MUST return an Error + * Cause TLV set to the new error code 'Operation Refused + * Due to Resource Shortage'. + */ + + peer = sctp_assoc_add_peer(asoc, &addr, GFP_ATOMIC, SCTP_UNCONFIRMED); + if (!peer) + return SCTP_ERROR_RSRC_LOW; + + /* Start the heartbeat timer. */ + if (!mod_timer(&peer->hb_timer, sctp_transport_timeout(peer))) + sctp_transport_hold(peer); + break; + case SCTP_PARAM_DEL_IP: + /* ADDIP 4.3 D7) If a request is received to delete the + * last remaining IP address of a peer endpoint, the receiver + * MUST send an Error Cause TLV with the error cause set to the + * new error code 'Request to Delete Last Remaining IP Address'. + */ + if (asoc->peer.transport_count == 1) + return SCTP_ERROR_DEL_LAST_IP; + + /* ADDIP 4.3 D8) If a request is received to delete an IP + * address which is also the source address of the IP packet + * which contained the ASCONF chunk, the receiver MUST reject + * this request. To reject the request the receiver MUST send + * an Error Cause TLV set to the new error code 'Request to + * Delete Source IP Address' + */ + if (sctp_cmp_addr_exact(sctp_source(asconf), &addr)) + return SCTP_ERROR_DEL_SRC_IP; + + /* Section 4.2.2 + * If the address 0.0.0.0 or ::0 is provided, all + * addresses of the peer except the source address of the + * packet MUST be deleted. + */ + if (af->is_any(&addr)) { + sctp_assoc_set_primary(asoc, asconf->transport); + sctp_assoc_del_nonprimary_peers(asoc, + asconf->transport); + } else + sctp_assoc_del_peer(asoc, &addr); + break; + case SCTP_PARAM_SET_PRIMARY: + /* ADDIP Section 4.2.4 + * If the address 0.0.0.0 or ::0 is provided, the receiver + * MAY mark the source address of the packet as its + * primary. 
+ */ + if (af->is_any(&addr)) + memcpy(&addr.v4, sctp_source(asconf), sizeof(addr)); + + peer = sctp_assoc_lookup_paddr(asoc, &addr); + if (!peer) + return SCTP_ERROR_DNS_FAILED; + + sctp_assoc_set_primary(asoc, peer); + break; + } + + return SCTP_ERROR_NO_ERROR; +} + +/* Verify the ASCONF packet before we process it. */ +int sctp_verify_asconf(const struct sctp_association *asoc, + struct sctp_paramhdr *param_hdr, void *chunk_end, + struct sctp_paramhdr **errp) { + sctp_addip_param_t *asconf_param; + union sctp_params param; + int length, plen; + + param.v = (sctp_paramhdr_t *) param_hdr; + while (param.v <= chunk_end - sizeof(sctp_paramhdr_t)) { + length = ntohs(param.p->length); + *errp = param.p; + + if (param.v > chunk_end - length || + length < sizeof(sctp_paramhdr_t)) + return 0; + + switch (param.p->type) { + case SCTP_PARAM_ADD_IP: + case SCTP_PARAM_DEL_IP: + case SCTP_PARAM_SET_PRIMARY: + asconf_param = (sctp_addip_param_t *)param.v; + plen = ntohs(asconf_param->param_hdr.length); + if (plen < sizeof(sctp_addip_param_t) + + sizeof(sctp_paramhdr_t)) + return 0; + break; + case SCTP_PARAM_SUCCESS_REPORT: + case SCTP_PARAM_ADAPTATION_LAYER_IND: + if (length != sizeof(sctp_addip_param_t)) + return 0; + + break; + default: + break; + } + + param.v += WORD_ROUND(length); + } + + if (param.v != chunk_end) + return 0; + + return 1; +} + +/* Process an incoming ASCONF chunk with the next expected serial no. and + * return an ASCONF_ACK chunk to be sent in response. + */ +struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc, + struct sctp_chunk *asconf) +{ + sctp_addiphdr_t *hdr; + union sctp_addr_param *addr_param; + sctp_addip_param_t *asconf_param; + struct sctp_chunk *asconf_ack; + + __be16 err_code; + int length = 0; + int chunk_len; + __u32 serial; + int all_param_pass = 1; + + chunk_len = ntohs(asconf->chunk_hdr->length) - sizeof(sctp_chunkhdr_t); + hdr = (sctp_addiphdr_t *)asconf->skb->data; + serial = ntohl(hdr->serial); + + /* Skip the addiphdr and store a pointer to address parameter. */ + length = sizeof(sctp_addiphdr_t); + addr_param = (union sctp_addr_param *)(asconf->skb->data + length); + chunk_len -= length; + + /* Skip the address parameter and store a pointer to the first + * asconf parameter. + */ + length = ntohs(addr_param->p.length); + asconf_param = (sctp_addip_param_t *)((void *)addr_param + length); + chunk_len -= length; + + /* create an ASCONF_ACK chunk. + * Based on the definitions of parameters, we know that the size of + * ASCONF_ACK parameters are less than or equal to the fourfold of ASCONF + * parameters. + */ + asconf_ack = sctp_make_asconf_ack(asoc, serial, chunk_len * 4); + if (!asconf_ack) + goto done; + + /* Process the TLVs contained within the ASCONF chunk. */ + while (chunk_len > 0) { + err_code = sctp_process_asconf_param(asoc, asconf, + asconf_param); + /* ADDIP 4.1 A7) + * If an error response is received for a TLV parameter, + * all TLVs with no response before the failed TLV are + * considered successful if not reported. All TLVs after + * the failed response are considered unsuccessful unless + * a specific success indication is present for the parameter. 
+ */ + if (SCTP_ERROR_NO_ERROR != err_code) + all_param_pass = 0; + + if (!all_param_pass) + sctp_add_asconf_response(asconf_ack, + asconf_param->crr_id, err_code, + asconf_param); + + /* ADDIP 4.3 D11) When an endpoint receiving an ASCONF to add + * an IP address sends an 'Out of Resource' in its response, it + * MUST also fail any subsequent add or delete requests bundled + * in the ASCONF. + */ + if (SCTP_ERROR_RSRC_LOW == err_code) + goto done; + + /* Move to the next ASCONF param. */ + length = ntohs(asconf_param->param_hdr.length); + asconf_param = (sctp_addip_param_t *)((void *)asconf_param + + length); + chunk_len -= length; + } + +done: + asoc->peer.addip_serial++; + + /* If we are sending a new ASCONF_ACK hold a reference to it in assoc + * after freeing the reference to old asconf ack if any. + */ + if (asconf_ack) { + sctp_chunk_hold(asconf_ack); + list_add_tail(&asconf_ack->transmitted_list, + &asoc->asconf_ack_list); + } + + return asconf_ack; +} + +/* Process a asconf parameter that is successfully acked. */ +static void sctp_asconf_param_success(struct sctp_association *asoc, + sctp_addip_param_t *asconf_param) +{ + struct sctp_af *af; + union sctp_addr addr; + struct sctp_bind_addr *bp = &asoc->base.bind_addr; + union sctp_addr_param *addr_param; + struct sctp_transport *transport; + struct sctp_sockaddr_entry *saddr; + + addr_param = (union sctp_addr_param *) + ((void *)asconf_param + sizeof(sctp_addip_param_t)); + + /* We have checked the packet before, so we do not check again. */ + af = sctp_get_af_specific(param_type2af(addr_param->p.type)); + af->from_addr_param(&addr, addr_param, htons(bp->port), 0); + + switch (asconf_param->param_hdr.type) { + case SCTP_PARAM_ADD_IP: + /* This is always done in BH context with a socket lock + * held, so the list can not change. + */ + local_bh_disable(); + list_for_each_entry(saddr, &bp->address_list, list) { + if (sctp_cmp_addr_exact(&saddr->a, &addr)) + saddr->state = SCTP_ADDR_SRC; + } + local_bh_enable(); + list_for_each_entry(transport, &asoc->peer.transport_addr_list, + transports) { + dst_release(transport->dst); + transport->dst = NULL; + } + break; + case SCTP_PARAM_DEL_IP: + local_bh_disable(); + sctp_del_bind_addr(bp, &addr); + local_bh_enable(); + list_for_each_entry(transport, &asoc->peer.transport_addr_list, + transports) { + dst_release(transport->dst); + transport->dst = NULL; + } + break; + default: + break; + } +} + +/* Get the corresponding ASCONF response error code from the ASCONF_ACK chunk + * for the given asconf parameter. If there is no response for this parameter, + * return the error code based on the third argument 'no_err'. + * ADDIP 4.1 + * A7) If an error response is received for a TLV parameter, all TLVs with no + * response before the failed TLV are considered successful if not reported. + * All TLVs after the failed response are considered unsuccessful unless a + * specific success indication is present for the parameter. + */ +static __be16 sctp_get_asconf_response(struct sctp_chunk *asconf_ack, + sctp_addip_param_t *asconf_param, + int no_err) +{ + sctp_addip_param_t *asconf_ack_param; + sctp_errhdr_t *err_param; + int length; + int asconf_ack_len; + __be16 err_code; + + if (no_err) + err_code = SCTP_ERROR_NO_ERROR; + else + err_code = SCTP_ERROR_REQ_REFUSED; + + asconf_ack_len = ntohs(asconf_ack->chunk_hdr->length) - + sizeof(sctp_chunkhdr_t); + + /* Skip the addiphdr from the asconf_ack chunk and store a pointer to + * the first asconf_ack parameter. 
+ */ + length = sizeof(sctp_addiphdr_t); + asconf_ack_param = (sctp_addip_param_t *)(asconf_ack->skb->data + + length); + asconf_ack_len -= length; + + while (asconf_ack_len > 0) { + if (asconf_ack_param->crr_id == asconf_param->crr_id) { + switch(asconf_ack_param->param_hdr.type) { + case SCTP_PARAM_SUCCESS_REPORT: + return SCTP_ERROR_NO_ERROR; + case SCTP_PARAM_ERR_CAUSE: + length = sizeof(sctp_addip_param_t); + err_param = (sctp_errhdr_t *) + ((void *)asconf_ack_param + length); + asconf_ack_len -= length; + if (asconf_ack_len > 0) + return err_param->cause; + else + return SCTP_ERROR_INV_PARAM; + break; + default: + return SCTP_ERROR_INV_PARAM; + } + } + + length = ntohs(asconf_ack_param->param_hdr.length); + asconf_ack_param = (sctp_addip_param_t *) + ((void *)asconf_ack_param + length); + asconf_ack_len -= length; + } + + return err_code; +} + +/* Process an incoming ASCONF_ACK chunk against the cached last ASCONF chunk. */ +int sctp_process_asconf_ack(struct sctp_association *asoc, + struct sctp_chunk *asconf_ack) +{ + struct sctp_chunk *asconf = asoc->addip_last_asconf; + union sctp_addr_param *addr_param; + sctp_addip_param_t *asconf_param; + int length = 0; + int asconf_len = asconf->skb->len; + int all_param_pass = 0; + int no_err = 1; + int retval = 0; + __be16 err_code = SCTP_ERROR_NO_ERROR; + + /* Skip the chunkhdr and addiphdr from the last asconf sent and store + * a pointer to address parameter. + */ + length = sizeof(sctp_addip_chunk_t); + addr_param = (union sctp_addr_param *)(asconf->skb->data + length); + asconf_len -= length; + + /* Skip the address parameter in the last asconf sent and store a + * pointer to the first asconf parameter. + */ + length = ntohs(addr_param->p.length); + asconf_param = (sctp_addip_param_t *)((void *)addr_param + length); + asconf_len -= length; + + /* ADDIP 4.1 + * A8) If there is no response(s) to specific TLV parameter(s), and no + * failures are indicated, then all request(s) are considered + * successful. + */ + if (asconf_ack->skb->len == sizeof(sctp_addiphdr_t)) + all_param_pass = 1; + + /* Process the TLVs contained in the last sent ASCONF chunk. */ + while (asconf_len > 0) { + if (all_param_pass) + err_code = SCTP_ERROR_NO_ERROR; + else { + err_code = sctp_get_asconf_response(asconf_ack, + asconf_param, + no_err); + if (no_err && (SCTP_ERROR_NO_ERROR != err_code)) + no_err = 0; + } + + switch (err_code) { + case SCTP_ERROR_NO_ERROR: + sctp_asconf_param_success(asoc, asconf_param); + break; + + case SCTP_ERROR_RSRC_LOW: + retval = 1; + break; + + case SCTP_ERROR_UNKNOWN_PARAM: + /* Disable sending this type of asconf parameter in + * future. + */ + asoc->peer.addip_disabled_mask |= + asconf_param->param_hdr.type; + break; + + case SCTP_ERROR_REQ_REFUSED: + case SCTP_ERROR_DEL_LAST_IP: + case SCTP_ERROR_DEL_SRC_IP: + default: + break; + } + + /* Skip the processed asconf parameter and move to the next + * one. + */ + length = ntohs(asconf_param->param_hdr.length); + asconf_param = (sctp_addip_param_t *)((void *)asconf_param + + length); + asconf_len -= length; + } + + /* Free the cached last sent asconf chunk. */ + list_del_init(&asconf->transmitted_list); + sctp_chunk_free(asconf); + asoc->addip_last_asconf = NULL; + + return retval; +} + +/* Make a FWD TSN chunk. 
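+ *
+ * A minimal illustrative call (hedged sketch; 'asoc', 'new_cum_tsn' and the
+ * stream/SSN pair are placeholders, not taken from a real call site):
+ *
+ *	struct sctp_fwdtsn_skip skips[1] = {
+ *		{ .stream = htons(2), .ssn = htons(5) },
+ *	};
+ *	struct sctp_chunk *fwdtsn;
+ *
+ *	fwdtsn = sctp_make_fwdtsn(asoc, new_cum_tsn, 1, skips);
+ *	if (fwdtsn)
+ *		sctp_outq_tail(&asoc->outqueue, fwdtsn);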
*/ +struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc, + __u32 new_cum_tsn, size_t nstreams, + struct sctp_fwdtsn_skip *skiplist) +{ + struct sctp_chunk *retval = NULL; + struct sctp_fwdtsn_hdr ftsn_hdr; + struct sctp_fwdtsn_skip skip; + size_t hint; + int i; + + hint = (nstreams + 1) * sizeof(__u32); + + retval = sctp_make_chunk(asoc, SCTP_CID_FWD_TSN, 0, hint); + + if (!retval) + return NULL; + + ftsn_hdr.new_cum_tsn = htonl(new_cum_tsn); + retval->subh.fwdtsn_hdr = + sctp_addto_chunk(retval, sizeof(ftsn_hdr), &ftsn_hdr); + + for (i = 0; i < nstreams; i++) { + skip.stream = skiplist[i].stream; + skip.ssn = skiplist[i].ssn; + sctp_addto_chunk(retval, sizeof(skip), &skip); + } + + return retval; +} diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c new file mode 100644 index 00000000..6e0f8829 --- /dev/null +++ b/net/sctp/sm_sideeffect.c @@ -0,0 +1,1717 @@ +/* SCTP kernel implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * + * This file is part of the SCTP kernel implementation + * + * These functions work with the state functions in sctp_sm_statefuns.c + * to implement that state operations. These functions implement the + * steps which require modifying existing data structures. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Jon Grimm + * Hui Huang + * Dajiang Zhang + * Daisy Chang + * Sridhar Samudrala + * Ardelle Fan + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +static int sctp_cmd_interpreter(sctp_event_t event_type, + sctp_subtype_t subtype, + sctp_state_t state, + struct sctp_endpoint *ep, + struct sctp_association *asoc, + void *event_arg, + sctp_disposition_t status, + sctp_cmd_seq_t *commands, + gfp_t gfp); +static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, + sctp_state_t state, + struct sctp_endpoint *ep, + struct sctp_association *asoc, + void *event_arg, + sctp_disposition_t status, + sctp_cmd_seq_t *commands, + gfp_t gfp); + +/******************************************************************** + * Helper functions + ********************************************************************/ + +/* A helper function for delayed processing of INET ECN CE bit. 
*/ +static void sctp_do_ecn_ce_work(struct sctp_association *asoc, + __u32 lowest_tsn) +{ + /* Save the TSN away for comparison when we receive CWR */ + + asoc->last_ecne_tsn = lowest_tsn; + asoc->need_ecne = 1; +} + +/* Helper function for delayed processing of SCTP ECNE chunk. */ +/* RFC 2960 Appendix A + * + * RFC 2481 details a specific bit for a sender to send in + * the header of its next outbound TCP segment to indicate to + * its peer that it has reduced its congestion window. This + * is termed the CWR bit. For SCTP the same indication is made + * by including the CWR chunk. This chunk contains one data + * element, i.e. the TSN number that was sent in the ECNE chunk. + * This element represents the lowest TSN number in the datagram + * that was originally marked with the CE bit. + */ +static struct sctp_chunk *sctp_do_ecn_ecne_work(struct sctp_association *asoc, + __u32 lowest_tsn, + struct sctp_chunk *chunk) +{ + struct sctp_chunk *repl; + + /* Our previously transmitted packet ran into some congestion + * so we should take action by reducing cwnd and ssthresh + * and then ACK our peer that we we've done so by + * sending a CWR. + */ + + /* First, try to determine if we want to actually lower + * our cwnd variables. Only lower them if the ECNE looks more + * recent than the last response. + */ + if (TSN_lt(asoc->last_cwr_tsn, lowest_tsn)) { + struct sctp_transport *transport; + + /* Find which transport's congestion variables + * need to be adjusted. + */ + transport = sctp_assoc_lookup_tsn(asoc, lowest_tsn); + + /* Update the congestion variables. */ + if (transport) + sctp_transport_lower_cwnd(transport, + SCTP_LOWER_CWND_ECNE); + asoc->last_cwr_tsn = lowest_tsn; + } + + /* Always try to quiet the other end. In case of lost CWR, + * resend last_cwr_tsn. + */ + repl = sctp_make_cwr(asoc, asoc->last_cwr_tsn, chunk); + + /* If we run out of memory, it will look like a lost CWR. We'll + * get back in sync eventually. + */ + return repl; +} + +/* Helper function to do delayed processing of ECN CWR chunk. */ +static void sctp_do_ecn_cwr_work(struct sctp_association *asoc, + __u32 lowest_tsn) +{ + /* Turn off ECNE getting auto-prepended to every outgoing + * packet + */ + asoc->need_ecne = 0; +} + +/* Generate SACK if necessary. We call this at the end of a packet. */ +static int sctp_gen_sack(struct sctp_association *asoc, int force, + sctp_cmd_seq_t *commands) +{ + __u32 ctsn, max_tsn_seen; + struct sctp_chunk *sack; + struct sctp_transport *trans = asoc->peer.last_data_from; + int error = 0; + + if (force || + (!trans && (asoc->param_flags & SPP_SACKDELAY_DISABLE)) || + (trans && (trans->param_flags & SPP_SACKDELAY_DISABLE))) + asoc->peer.sack_needed = 1; + + ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); + max_tsn_seen = sctp_tsnmap_get_max_tsn_seen(&asoc->peer.tsn_map); + + /* From 12.2 Parameters necessary per association (i.e. the TCB): + * + * Ack State : This flag indicates if the next received packet + * : is to be responded to with a SACK. ... + * : When DATA chunks are out of order, SACK's + * : are not delayed (see Section 6). + * + * [This is actually not mentioned in Section 6, but we + * implement it here anyway. --piggy] + */ + if (max_tsn_seen != ctsn) + asoc->peer.sack_needed = 1; + + /* From 6.2 Acknowledgement on Reception of DATA Chunks: + * + * Section 4.2 of [RFC2581] SHOULD be followed. 
Specifically, + * an acknowledgement SHOULD be generated for at least every + * second packet (not every second DATA chunk) received, and + * SHOULD be generated within 200 ms of the arrival of any + * unacknowledged DATA chunk. ... + */ + if (!asoc->peer.sack_needed) { + asoc->peer.sack_cnt++; + + /* Set the SACK delay timeout based on the + * SACK delay for the last transport + * data was received from, or the default + * for the association. + */ + if (trans) { + /* We will need a SACK for the next packet. */ + if (asoc->peer.sack_cnt >= trans->sackfreq - 1) + asoc->peer.sack_needed = 1; + + asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = + trans->sackdelay; + } else { + /* We will need a SACK for the next packet. */ + if (asoc->peer.sack_cnt >= asoc->sackfreq - 1) + asoc->peer.sack_needed = 1; + + asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = + asoc->sackdelay; + } + + /* Restart the SACK timer. */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); + } else { + asoc->a_rwnd = asoc->rwnd; + sack = sctp_make_sack(asoc); + if (!sack) + goto nomem; + + asoc->peer.sack_needed = 0; + asoc->peer.sack_cnt = 0; + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(sack)); + + /* Stop the SACK timer. */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); + } + + return error; +nomem: + error = -ENOMEM; + return error; +} + +/* When the T3-RTX timer expires, it calls this function to create the + * relevant state machine event. + */ +void sctp_generate_t3_rtx_event(unsigned long peer) +{ + int error; + struct sctp_transport *transport = (struct sctp_transport *) peer; + struct sctp_association *asoc = transport->asoc; + + /* Check whether a task is in the sock. */ + + sctp_bh_lock_sock(asoc->base.sk); + if (sock_owned_by_user(asoc->base.sk)) { + SCTP_DEBUG_PRINTK("%s:Sock is busy.\n", __func__); + + /* Try again later. */ + if (!mod_timer(&transport->T3_rtx_timer, jiffies + (HZ/20))) + sctp_transport_hold(transport); + goto out_unlock; + } + + /* Is this transport really dead and just waiting around for + * the timer to let go of the reference? + */ + if (transport->dead) + goto out_unlock; + + /* Run through the state machine. */ + error = sctp_do_sm(SCTP_EVENT_T_TIMEOUT, + SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_T3_RTX), + asoc->state, + asoc->ep, asoc, + transport, GFP_ATOMIC); + + if (error) + asoc->base.sk->sk_err = -error; + +out_unlock: + sctp_bh_unlock_sock(asoc->base.sk); + sctp_transport_put(transport); +} + +/* This is a sa interface for producing timeout events. It works + * for timeouts which use the association as their parameter. + */ +static void sctp_generate_timeout_event(struct sctp_association *asoc, + sctp_event_timeout_t timeout_type) +{ + int error = 0; + + sctp_bh_lock_sock(asoc->base.sk); + if (sock_owned_by_user(asoc->base.sk)) { + SCTP_DEBUG_PRINTK("%s:Sock is busy: timer %d\n", + __func__, + timeout_type); + + /* Try again later. */ + if (!mod_timer(&asoc->timers[timeout_type], jiffies + (HZ/20))) + sctp_association_hold(asoc); + goto out_unlock; + } + + /* Is this association really dead and just waiting around for + * the timer to let go of the reference? + */ + if (asoc->base.dead) + goto out_unlock; + + /* Run through the state machine. 
*/ + error = sctp_do_sm(SCTP_EVENT_T_TIMEOUT, + SCTP_ST_TIMEOUT(timeout_type), + asoc->state, asoc->ep, asoc, + (void *)timeout_type, GFP_ATOMIC); + + if (error) + asoc->base.sk->sk_err = -error; + +out_unlock: + sctp_bh_unlock_sock(asoc->base.sk); + sctp_association_put(asoc); +} + +static void sctp_generate_t1_cookie_event(unsigned long data) +{ + struct sctp_association *asoc = (struct sctp_association *) data; + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T1_COOKIE); +} + +static void sctp_generate_t1_init_event(unsigned long data) +{ + struct sctp_association *asoc = (struct sctp_association *) data; + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T1_INIT); +} + +static void sctp_generate_t2_shutdown_event(unsigned long data) +{ + struct sctp_association *asoc = (struct sctp_association *) data; + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T2_SHUTDOWN); +} + +static void sctp_generate_t4_rto_event(unsigned long data) +{ + struct sctp_association *asoc = (struct sctp_association *) data; + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T4_RTO); +} + +static void sctp_generate_t5_shutdown_guard_event(unsigned long data) +{ + struct sctp_association *asoc = (struct sctp_association *)data; + sctp_generate_timeout_event(asoc, + SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD); + +} /* sctp_generate_t5_shutdown_guard_event() */ + +static void sctp_generate_autoclose_event(unsigned long data) +{ + struct sctp_association *asoc = (struct sctp_association *) data; + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_AUTOCLOSE); +} + +/* Generate a heart beat event. If the sock is busy, reschedule. Make + * sure that the transport is still valid. + */ +void sctp_generate_heartbeat_event(unsigned long data) +{ + int error = 0; + struct sctp_transport *transport = (struct sctp_transport *) data; + struct sctp_association *asoc = transport->asoc; + + sctp_bh_lock_sock(asoc->base.sk); + if (sock_owned_by_user(asoc->base.sk)) { + SCTP_DEBUG_PRINTK("%s:Sock is busy.\n", __func__); + + /* Try again later. */ + if (!mod_timer(&transport->hb_timer, jiffies + (HZ/20))) + sctp_transport_hold(transport); + goto out_unlock; + } + + /* Is this structure just waiting around for us to actually + * get destroyed? + */ + if (transport->dead) + goto out_unlock; + + error = sctp_do_sm(SCTP_EVENT_T_TIMEOUT, + SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_HEARTBEAT), + asoc->state, asoc->ep, asoc, + transport, GFP_ATOMIC); + + if (error) + asoc->base.sk->sk_err = -error; + +out_unlock: + sctp_bh_unlock_sock(asoc->base.sk); + sctp_transport_put(transport); +} + +/* Handle the timeout of the ICMP protocol unreachable timer. Trigger + * the correct state machine transition that will close the association. + */ +void sctp_generate_proto_unreach_event(unsigned long data) +{ + struct sctp_transport *transport = (struct sctp_transport *) data; + struct sctp_association *asoc = transport->asoc; + + sctp_bh_lock_sock(asoc->base.sk); + if (sock_owned_by_user(asoc->base.sk)) { + SCTP_DEBUG_PRINTK("%s:Sock is busy.\n", __func__); + + /* Try again later. */ + if (!mod_timer(&transport->proto_unreach_timer, + jiffies + (HZ/20))) + sctp_association_hold(asoc); + goto out_unlock; + } + + /* Is this structure just waiting around for us to actually + * get destroyed? 
+ */ + if (asoc->base.dead) + goto out_unlock; + + sctp_do_sm(SCTP_EVENT_T_OTHER, + SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH), + asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC); + +out_unlock: + sctp_bh_unlock_sock(asoc->base.sk); + sctp_association_put(asoc); +} + + +/* Inject a SACK Timeout event into the state machine. */ +static void sctp_generate_sack_event(unsigned long data) +{ + struct sctp_association *asoc = (struct sctp_association *) data; + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_SACK); +} + +sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = { + NULL, + sctp_generate_t1_cookie_event, + sctp_generate_t1_init_event, + sctp_generate_t2_shutdown_event, + NULL, + sctp_generate_t4_rto_event, + sctp_generate_t5_shutdown_guard_event, + NULL, + sctp_generate_sack_event, + sctp_generate_autoclose_event, +}; + + +/* RFC 2960 8.2 Path Failure Detection + * + * When its peer endpoint is multi-homed, an endpoint should keep a + * error counter for each of the destination transport addresses of the + * peer endpoint. + * + * Each time the T3-rtx timer expires on any address, or when a + * HEARTBEAT sent to an idle address is not acknowledged within a RTO, + * the error counter of that destination address will be incremented. + * When the value in the error counter exceeds the protocol parameter + * 'Path.Max.Retrans' of that destination address, the endpoint should + * mark the destination transport address as inactive, and a + * notification SHOULD be sent to the upper layer. + * + */ +static void sctp_do_8_2_transport_strike(struct sctp_association *asoc, + struct sctp_transport *transport, + int is_hb) +{ + /* The check for association's overall error counter exceeding the + * threshold is done in the state function. + */ + /* We are here due to a timer expiration. If the timer was + * not a HEARTBEAT, then normal error tracking is done. + * If the timer was a heartbeat, we only increment error counts + * when we already have an outstanding HEARTBEAT that has not + * been acknowledged. + * Additionally, some tranport states inhibit error increments. + */ + if (!is_hb) { + asoc->overall_error_count++; + if (transport->state != SCTP_INACTIVE) + transport->error_count++; + } else if (transport->hb_sent) { + if (transport->state != SCTP_UNCONFIRMED) + asoc->overall_error_count++; + if (transport->state != SCTP_INACTIVE) + transport->error_count++; + } + + if (transport->state != SCTP_INACTIVE && + (transport->error_count > transport->pathmaxrxt)) { + SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p", + " transport IP: port:%d failed.\n", + asoc, + (&transport->ipaddr), + ntohs(transport->ipaddr.v4.sin_port)); + sctp_assoc_control_transport(asoc, transport, + SCTP_TRANSPORT_DOWN, + SCTP_FAILED_THRESHOLD); + } + + /* E2) For the destination address for which the timer + * expires, set RTO <- RTO * 2 ("back off the timer"). The + * maximum value discussed in rule C7 above (RTO.max) may be + * used to provide an upper bound to this doubling operation. + * + * Special Case: the first HB doesn't trigger exponential backoff. + * The first unacknowledged HB triggers it. We do this with a flag + * that indicates that we have an outstanding HB. + */ + if (!is_hb || transport->hb_sent) { + transport->rto = min((transport->rto * 2), transport->asoc->rto_max); + } +} + +/* Worker routine to handle INIT command failure. 
*/ +static void sctp_cmd_init_failed(sctp_cmd_seq_t *commands, + struct sctp_association *asoc, + unsigned error) +{ + struct sctp_ulpevent *event; + + event = sctp_ulpevent_make_assoc_change(asoc,0, SCTP_CANT_STR_ASSOC, + (__u16)error, 0, 0, NULL, + GFP_ATOMIC); + + if (event) + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(event)); + + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_CLOSED)); + + /* SEND_FAILED sent later when cleaning up the association. */ + asoc->outqueue.error = error; + sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); +} + +/* Worker routine to handle SCTP_CMD_ASSOC_FAILED. */ +static void sctp_cmd_assoc_failed(sctp_cmd_seq_t *commands, + struct sctp_association *asoc, + sctp_event_t event_type, + sctp_subtype_t subtype, + struct sctp_chunk *chunk, + unsigned error) +{ + struct sctp_ulpevent *event; + + /* Cancel any partial delivery in progress. */ + sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC); + + if (event_type == SCTP_EVENT_T_CHUNK && subtype.chunk == SCTP_CID_ABORT) + event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST, + (__u16)error, 0, 0, chunk, + GFP_ATOMIC); + else + event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST, + (__u16)error, 0, 0, NULL, + GFP_ATOMIC); + if (event) + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(event)); + + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_CLOSED)); + + /* SEND_FAILED sent later when cleaning up the association. */ + asoc->outqueue.error = error; + sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); +} + +/* Process an init chunk (may be real INIT/INIT-ACK or an embedded INIT + * inside the cookie. In reality, this is only used for INIT-ACK processing + * since all other cases use "temporary" associations and can do all + * their work in statefuns directly. + */ +static int sctp_cmd_process_init(sctp_cmd_seq_t *commands, + struct sctp_association *asoc, + struct sctp_chunk *chunk, + sctp_init_chunk_t *peer_init, + gfp_t gfp) +{ + int error; + + /* We only process the init as a sideeffect in a single + * case. This is when we process the INIT-ACK. If we + * fail during INIT processing (due to malloc problems), + * just return the error and stop processing the stack. + */ + if (!sctp_process_init(asoc, chunk, sctp_source(chunk), peer_init, gfp)) + error = -ENOMEM; + else + error = 0; + + return error; +} + +/* Helper function to break out starting up of heartbeat timers. */ +static void sctp_cmd_hb_timers_start(sctp_cmd_seq_t *cmds, + struct sctp_association *asoc) +{ + struct sctp_transport *t; + + /* Start a heartbeat timer for each transport on the association. + * hold a reference on the transport to make sure none of + * the needed data structures go away. + */ + list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { + + if (!mod_timer(&t->hb_timer, sctp_transport_timeout(t))) + sctp_transport_hold(t); + } +} + +static void sctp_cmd_hb_timers_stop(sctp_cmd_seq_t *cmds, + struct sctp_association *asoc) +{ + struct sctp_transport *t; + + /* Stop all heartbeat timers. 
*/ + + list_for_each_entry(t, &asoc->peer.transport_addr_list, + transports) { + if (del_timer(&t->hb_timer)) + sctp_transport_put(t); + } +} + +/* Helper function to stop any pending T3-RTX timers */ +static void sctp_cmd_t3_rtx_timers_stop(sctp_cmd_seq_t *cmds, + struct sctp_association *asoc) +{ + struct sctp_transport *t; + + list_for_each_entry(t, &asoc->peer.transport_addr_list, + transports) { + if (timer_pending(&t->T3_rtx_timer) && + del_timer(&t->T3_rtx_timer)) { + sctp_transport_put(t); + } + } +} + + +/* Helper function to update the heartbeat timer. */ +static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds, + struct sctp_transport *t) +{ + /* Update the heartbeat timer. */ + if (!mod_timer(&t->hb_timer, sctp_transport_timeout(t))) + sctp_transport_hold(t); +} + +/* Helper function to handle the reception of an HEARTBEAT ACK. */ +static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds, + struct sctp_association *asoc, + struct sctp_transport *t, + struct sctp_chunk *chunk) +{ + sctp_sender_hb_info_t *hbinfo; + + /* 8.3 Upon the receipt of the HEARTBEAT ACK, the sender of the + * HEARTBEAT should clear the error counter of the destination + * transport address to which the HEARTBEAT was sent. + */ + t->error_count = 0; + + /* + * Although RFC4960 specifies that the overall error count must + * be cleared when a HEARTBEAT ACK is received, we make an + * exception while in SHUTDOWN PENDING. If the peer keeps its + * window shut forever, we may never be able to transmit our + * outstanding data and rely on the retransmission limit be reached + * to shutdown the association. + */ + if (t->asoc->state != SCTP_STATE_SHUTDOWN_PENDING) + t->asoc->overall_error_count = 0; + + /* Clear the hb_sent flag to signal that we had a good + * acknowledgement. + */ + t->hb_sent = 0; + + /* Mark the destination transport address as active if it is not so + * marked. + */ + if ((t->state == SCTP_INACTIVE) || (t->state == SCTP_UNCONFIRMED)) + sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP, + SCTP_HEARTBEAT_SUCCESS); + + /* The receiver of the HEARTBEAT ACK should also perform an + * RTT measurement for that destination transport address + * using the time value carried in the HEARTBEAT ACK chunk. + * If the transport's rto_pending variable has been cleared, + * it was most likely due to a retransmit. However, we want + * to re-enable it to properly update the rto. + */ + if (t->rto_pending == 0) + t->rto_pending = 1; + + hbinfo = (sctp_sender_hb_info_t *) chunk->skb->data; + sctp_transport_update_rto(t, (jiffies - hbinfo->sent_at)); + + /* Update the heartbeat timer. */ + if (!mod_timer(&t->hb_timer, sctp_transport_timeout(t))) + sctp_transport_hold(t); +} + + +/* Helper function to process the process SACK command. */ +static int sctp_cmd_process_sack(sctp_cmd_seq_t *cmds, + struct sctp_association *asoc, + struct sctp_sackhdr *sackh) +{ + int err = 0; + + if (sctp_outq_sack(&asoc->outqueue, sackh)) { + /* There are no more TSNs awaiting SACK. */ + err = sctp_do_sm(SCTP_EVENT_T_OTHER, + SCTP_ST_OTHER(SCTP_EVENT_NO_PENDING_TSN), + asoc->state, asoc->ep, asoc, NULL, + GFP_ATOMIC); + } + + return err; +} + +/* Helper function to set the timeout value for T2-SHUTDOWN timer and to set + * the transport for a shutdown chunk. 
+ */ +static void sctp_cmd_setup_t2(sctp_cmd_seq_t *cmds, + struct sctp_association *asoc, + struct sctp_chunk *chunk) +{ + struct sctp_transport *t; + + if (chunk->transport) + t = chunk->transport; + else { + t = sctp_assoc_choose_alter_transport(asoc, + asoc->shutdown_last_sent_to); + chunk->transport = t; + } + asoc->shutdown_last_sent_to = t; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = t->rto; +} + +/* Helper function to change the state of an association. */ +static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds, + struct sctp_association *asoc, + sctp_state_t state) +{ + struct sock *sk = asoc->base.sk; + + asoc->state = state; + + SCTP_DEBUG_PRINTK("sctp_cmd_new_state: asoc %p[%s]\n", + asoc, sctp_state_tbl[state]); + + if (sctp_style(sk, TCP)) { + /* Change the sk->sk_state of a TCP-style socket that has + * successfully completed a connect() call. + */ + if (sctp_state(asoc, ESTABLISHED) && sctp_sstate(sk, CLOSED)) + sk->sk_state = SCTP_SS_ESTABLISHED; + + /* Set the RCV_SHUTDOWN flag when a SHUTDOWN is received. */ + if (sctp_state(asoc, SHUTDOWN_RECEIVED) && + sctp_sstate(sk, ESTABLISHED)) + sk->sk_shutdown |= RCV_SHUTDOWN; + } + + if (sctp_state(asoc, COOKIE_WAIT)) { + /* Reset init timeouts since they may have been + * increased due to timer expirations. + */ + asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = + asoc->rto_initial; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = + asoc->rto_initial; + } + + if (sctp_state(asoc, ESTABLISHED) || + sctp_state(asoc, CLOSED) || + sctp_state(asoc, SHUTDOWN_RECEIVED)) { + /* Wake up any processes waiting in the asoc's wait queue in + * sctp_wait_for_connect() or sctp_wait_for_sndbuf(). + */ + if (waitqueue_active(&asoc->wait)) + wake_up_interruptible(&asoc->wait); + + /* Wake up any processes waiting in the sk's sleep queue of + * a TCP-style or UDP-style peeled-off socket in + * sctp_wait_for_accept() or sctp_wait_for_packet(). + * For a UDP-style socket, the waiters are woken up by the + * notifications. + */ + if (!sctp_style(sk, UDP)) + sk->sk_state_change(sk); + } +} + +/* Helper function to delete an association. */ +static void sctp_cmd_delete_tcb(sctp_cmd_seq_t *cmds, + struct sctp_association *asoc) +{ + struct sock *sk = asoc->base.sk; + + /* If it is a non-temporary association belonging to a TCP-style + * listening socket that is not closed, do not free it so that accept() + * can pick it up later. + */ + if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING) && + (!asoc->temp) && (sk->sk_shutdown != SHUTDOWN_MASK)) + return; + + sctp_unhash_established(asoc); + sctp_association_free(asoc); +} + +/* + * ADDIP Section 4.1 ASCONF Chunk Procedures + * A4) Start a T-4 RTO timer, using the RTO value of the selected + * destination address (we use active path instead of primary path just + * because primary path may be inactive. + */ +static void sctp_cmd_setup_t4(sctp_cmd_seq_t *cmds, + struct sctp_association *asoc, + struct sctp_chunk *chunk) +{ + struct sctp_transport *t; + + t = sctp_assoc_choose_alter_transport(asoc, chunk->transport); + asoc->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = t->rto; + chunk->transport = t; +} + +/* Process an incoming Operation Error Chunk. 
*/ +static void sctp_cmd_process_operr(sctp_cmd_seq_t *cmds, + struct sctp_association *asoc, + struct sctp_chunk *chunk) +{ + struct sctp_errhdr *err_hdr; + struct sctp_ulpevent *ev; + + while (chunk->chunk_end > chunk->skb->data) { + err_hdr = (struct sctp_errhdr *)(chunk->skb->data); + + ev = sctp_ulpevent_make_remote_error(asoc, chunk, 0, + GFP_ATOMIC); + if (!ev) + return; + + sctp_ulpq_tail_event(&asoc->ulpq, ev); + + switch (err_hdr->cause) { + case SCTP_ERROR_UNKNOWN_CHUNK: + { + sctp_chunkhdr_t *unk_chunk_hdr; + + unk_chunk_hdr = (sctp_chunkhdr_t *)err_hdr->variable; + switch (unk_chunk_hdr->type) { + /* ADDIP 4.1 A9) If the peer responds to an ASCONF with + * an ERROR chunk reporting that it did not recognized + * the ASCONF chunk type, the sender of the ASCONF MUST + * NOT send any further ASCONF chunks and MUST stop its + * T-4 timer. + */ + case SCTP_CID_ASCONF: + if (asoc->peer.asconf_capable == 0) + break; + + asoc->peer.asconf_capable = 0; + sctp_add_cmd_sf(cmds, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); + break; + default: + break; + } + break; + } + default: + break; + } + } +} + +/* Process variable FWDTSN chunk information. */ +static void sctp_cmd_process_fwdtsn(struct sctp_ulpq *ulpq, + struct sctp_chunk *chunk) +{ + struct sctp_fwdtsn_skip *skip; + /* Walk through all the skipped SSNs */ + sctp_walk_fwdtsn(skip, chunk) { + sctp_ulpq_skip(ulpq, ntohs(skip->stream), ntohs(skip->ssn)); + } +} + +/* Helper function to remove the association non-primary peer + * transports. + */ +static void sctp_cmd_del_non_primary(struct sctp_association *asoc) +{ + struct sctp_transport *t; + struct list_head *pos; + struct list_head *temp; + + list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { + t = list_entry(pos, struct sctp_transport, transports); + if (!sctp_cmp_addr_exact(&t->ipaddr, + &asoc->peer.primary_addr)) { + sctp_assoc_del_peer(asoc, &t->ipaddr); + } + } +} + +/* Helper function to set sk_err on a 1-1 style socket. 
*/ +static void sctp_cmd_set_sk_err(struct sctp_association *asoc, int error) +{ + struct sock *sk = asoc->base.sk; + + if (!sctp_style(sk, UDP)) + sk->sk_err = error; +} + +/* Helper function to generate an association change event */ +static void sctp_cmd_assoc_change(sctp_cmd_seq_t *commands, + struct sctp_association *asoc, + u8 state) +{ + struct sctp_ulpevent *ev; + + ev = sctp_ulpevent_make_assoc_change(asoc, 0, state, 0, + asoc->c.sinit_num_ostreams, + asoc->c.sinit_max_instreams, + NULL, GFP_ATOMIC); + if (ev) + sctp_ulpq_tail_event(&asoc->ulpq, ev); +} + +/* Helper function to generate an adaptation indication event */ +static void sctp_cmd_adaptation_ind(sctp_cmd_seq_t *commands, + struct sctp_association *asoc) +{ + struct sctp_ulpevent *ev; + + ev = sctp_ulpevent_make_adaptation_indication(asoc, GFP_ATOMIC); + + if (ev) + sctp_ulpq_tail_event(&asoc->ulpq, ev); +} + + +static void sctp_cmd_t1_timer_update(struct sctp_association *asoc, + sctp_event_timeout_t timer, + char *name) +{ + struct sctp_transport *t; + + t = asoc->init_last_sent_to; + asoc->init_err_counter++; + + if (t->init_sent_count > (asoc->init_cycle + 1)) { + asoc->timeouts[timer] *= 2; + if (asoc->timeouts[timer] > asoc->max_init_timeo) { + asoc->timeouts[timer] = asoc->max_init_timeo; + } + asoc->init_cycle++; + SCTP_DEBUG_PRINTK( + "T1 %s Timeout adjustment" + " init_err_counter: %d" + " cycle: %d" + " timeout: %ld\n", + name, + asoc->init_err_counter, + asoc->init_cycle, + asoc->timeouts[timer]); + } + +} + +/* Send the whole message, chunk by chunk, to the outqueue. + * This way the whole message is queued up and bundling if + * encouraged for small fragments. + */ +static int sctp_cmd_send_msg(struct sctp_association *asoc, + struct sctp_datamsg *msg) +{ + struct sctp_chunk *chunk; + int error = 0; + + list_for_each_entry(chunk, &msg->chunks, frag_list) { + error = sctp_outq_tail(&asoc->outqueue, chunk); + if (error) + break; + } + + return error; +} + + +/* Sent the next ASCONF packet currently stored in the association. + * This happens after the ASCONF_ACK was succeffully processed. + */ +static void sctp_cmd_send_asconf(struct sctp_association *asoc) +{ + /* Send the next asconf chunk from the addip chunk + * queue. + */ + if (!list_empty(&asoc->addip_chunk_list)) { + struct list_head *entry = asoc->addip_chunk_list.next; + struct sctp_chunk *asconf = list_entry(entry, + struct sctp_chunk, list); + list_del_init(entry); + + /* Hold the chunk until an ASCONF_ACK is received. */ + sctp_chunk_hold(asconf); + if (sctp_primitive_ASCONF(asoc, asconf)) + sctp_chunk_free(asconf); + else + asoc->addip_last_asconf = asconf; + } +} + + +/* These three macros allow us to pull the debugging code out of the + * main flow of sctp_do_sm() to keep attention focused on the real + * functionality there. + */ +#define DEBUG_PRE \ + SCTP_DEBUG_PRINTK("sctp_do_sm prefn: " \ + "ep %p, %s, %s, asoc %p[%s], %s\n", \ + ep, sctp_evttype_tbl[event_type], \ + (*debug_fn)(subtype), asoc, \ + sctp_state_tbl[state], state_fn->name) + +#define DEBUG_POST \ + SCTP_DEBUG_PRINTK("sctp_do_sm postfn: " \ + "asoc %p, status: %s\n", \ + asoc, sctp_status_tbl[status]) + +#define DEBUG_POST_SFX \ + SCTP_DEBUG_PRINTK("sctp_do_sm post sfx: error %d, asoc %p[%s]\n", \ + error, asoc, \ + sctp_state_tbl[(asoc && sctp_id2assoc(ep->base.sk, \ + sctp_assoc2id(asoc)))?asoc->state:SCTP_STATE_CLOSED]) + +/* + * This is the master state machine processing function. + * + * If you want to understand all of lksctp, this is a + * good place to start. 
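+ *
+ * An illustrative invocation (hedged; the chunk-event arguments below are
+ * placeholders, modelled structurally on the timer callers earlier in this
+ * file):
+ *
+ *	error = sctp_do_sm(SCTP_EVENT_T_CHUNK,
+ *			   SCTP_ST_CHUNK(chunk->chunk_hdr->type),
+ *			   asoc->state, asoc->ep, asoc, chunk, GFP_ATOMIC);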
+ */ +int sctp_do_sm(sctp_event_t event_type, sctp_subtype_t subtype, + sctp_state_t state, + struct sctp_endpoint *ep, + struct sctp_association *asoc, + void *event_arg, + gfp_t gfp) +{ + sctp_cmd_seq_t commands; + const sctp_sm_table_entry_t *state_fn; + sctp_disposition_t status; + int error = 0; + typedef const char *(printfn_t)(sctp_subtype_t); + + static printfn_t *table[] = { + NULL, sctp_cname, sctp_tname, sctp_oname, sctp_pname, + }; + printfn_t *debug_fn __attribute__ ((unused)) = table[event_type]; + + /* Look up the state function, run it, and then process the + * side effects. These three steps are the heart of lksctp. + */ + state_fn = sctp_sm_lookup_event(event_type, state, subtype); + + sctp_init_cmd_seq(&commands); + + DEBUG_PRE; + status = (*state_fn->fn)(ep, asoc, subtype, event_arg, &commands); + DEBUG_POST; + + error = sctp_side_effects(event_type, subtype, state, + ep, asoc, event_arg, status, + &commands, gfp); + DEBUG_POST_SFX; + + return error; +} + +#undef DEBUG_PRE +#undef DEBUG_POST + +/***************************************************************** + * This the master state function side effect processing function. + *****************************************************************/ +static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, + sctp_state_t state, + struct sctp_endpoint *ep, + struct sctp_association *asoc, + void *event_arg, + sctp_disposition_t status, + sctp_cmd_seq_t *commands, + gfp_t gfp) +{ + int error; + + /* FIXME - Most of the dispositions left today would be categorized + * as "exceptional" dispositions. For those dispositions, it + * may not be proper to run through any of the commands at all. + * For example, the command interpreter might be run only with + * disposition SCTP_DISPOSITION_CONSUME. + */ + if (0 != (error = sctp_cmd_interpreter(event_type, subtype, state, + ep, asoc, + event_arg, status, + commands, gfp))) + goto bail; + + switch (status) { + case SCTP_DISPOSITION_DISCARD: + SCTP_DEBUG_PRINTK("Ignored sctp protocol event - state %d, " + "event_type %d, event_id %d\n", + state, event_type, subtype.chunk); + break; + + case SCTP_DISPOSITION_NOMEM: + /* We ran out of memory, so we need to discard this + * packet. + */ + /* BUG--we should now recover some memory, probably by + * reneging... + */ + error = -ENOMEM; + break; + + case SCTP_DISPOSITION_DELETE_TCB: + /* This should now be a command. */ + break; + + case SCTP_DISPOSITION_CONSUME: + case SCTP_DISPOSITION_ABORT: + /* + * We should no longer have much work to do here as the + * real work has been done as explicit commands above. + */ + break; + + case SCTP_DISPOSITION_VIOLATION: + if (net_ratelimit()) + pr_err("protocol violation state %d chunkid %d\n", + state, subtype.chunk); + break; + + case SCTP_DISPOSITION_NOT_IMPL: + pr_warn("unimplemented feature in state %d, event_type %d, event_id %d\n", + state, event_type, subtype.chunk); + break; + + case SCTP_DISPOSITION_BUG: + pr_err("bug in state %d, event_type %d, event_id %d\n", + state, event_type, subtype.chunk); + BUG(); + break; + + default: + pr_err("impossible disposition %d in state %d, event_type %d, event_id %d\n", + status, state, event_type, subtype.chunk); + BUG(); + break; + } + +bail: + return error; +} + +/******************************************************************** + * 2nd Level Abstractions + ********************************************************************/ + +/* This is the side-effect interpreter. 
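+ *
+ * For orientation (an illustrative pairing, not a real call site): a state
+ * function queues its side effects as verbs, for example
+ *
+ *	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));
+ *	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+ *			SCTP_STATE(SCTP_STATE_ESTABLISHED));
+ *
+ * and the loop below pops each queued command and performs it.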
*/ +static int sctp_cmd_interpreter(sctp_event_t event_type, + sctp_subtype_t subtype, + sctp_state_t state, + struct sctp_endpoint *ep, + struct sctp_association *asoc, + void *event_arg, + sctp_disposition_t status, + sctp_cmd_seq_t *commands, + gfp_t gfp) +{ + int error = 0; + int force; + sctp_cmd_t *cmd; + struct sctp_chunk *new_obj; + struct sctp_chunk *chunk = NULL; + struct sctp_packet *packet; + struct timer_list *timer; + unsigned long timeout; + struct sctp_transport *t; + struct sctp_sackhdr sackh; + int local_cork = 0; + + if (SCTP_EVENT_T_TIMEOUT != event_type) + chunk = (struct sctp_chunk *) event_arg; + + /* Note: This whole file is a huge candidate for rework. + * For example, each command could either have its own handler, so + * the loop would look like: + * while (cmds) + * cmd->handle(x, y, z) + * --jgrimm + */ + while (NULL != (cmd = sctp_next_cmd(commands))) { + switch (cmd->verb) { + case SCTP_CMD_NOP: + /* Do nothing. */ + break; + + case SCTP_CMD_NEW_ASOC: + /* Register a new association. */ + if (local_cork) { + sctp_outq_uncork(&asoc->outqueue); + local_cork = 0; + } + asoc = cmd->obj.ptr; + /* Register with the endpoint. */ + sctp_endpoint_add_asoc(ep, asoc); + sctp_hash_established(asoc); + break; + + case SCTP_CMD_UPDATE_ASSOC: + sctp_assoc_update(asoc, cmd->obj.ptr); + break; + + case SCTP_CMD_PURGE_OUTQUEUE: + sctp_outq_teardown(&asoc->outqueue); + break; + + case SCTP_CMD_DELETE_TCB: + if (local_cork) { + sctp_outq_uncork(&asoc->outqueue); + local_cork = 0; + } + /* Delete the current association. */ + sctp_cmd_delete_tcb(commands, asoc); + asoc = NULL; + break; + + case SCTP_CMD_NEW_STATE: + /* Enter a new state. */ + sctp_cmd_new_state(commands, asoc, cmd->obj.state); + break; + + case SCTP_CMD_REPORT_TSN: + /* Record the arrival of a TSN. */ + error = sctp_tsnmap_mark(&asoc->peer.tsn_map, + cmd->obj.u32); + break; + + case SCTP_CMD_REPORT_FWDTSN: + /* Move the Cumulattive TSN Ack ahead. */ + sctp_tsnmap_skip(&asoc->peer.tsn_map, cmd->obj.u32); + + /* purge the fragmentation queue */ + sctp_ulpq_reasm_flushtsn(&asoc->ulpq, cmd->obj.u32); + + /* Abort any in progress partial delivery. */ + sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC); + break; + + case SCTP_CMD_PROCESS_FWDTSN: + sctp_cmd_process_fwdtsn(&asoc->ulpq, cmd->obj.ptr); + break; + + case SCTP_CMD_GEN_SACK: + /* Generate a Selective ACK. + * The argument tells us whether to just count + * the packet and MAYBE generate a SACK, or + * force a SACK out. + */ + force = cmd->obj.i32; + error = sctp_gen_sack(asoc, force, commands); + break; + + case SCTP_CMD_PROCESS_SACK: + /* Process an inbound SACK. */ + error = sctp_cmd_process_sack(commands, asoc, + cmd->obj.ptr); + break; + + case SCTP_CMD_GEN_INIT_ACK: + /* Generate an INIT ACK chunk. */ + new_obj = sctp_make_init_ack(asoc, chunk, GFP_ATOMIC, + 0); + if (!new_obj) + goto nomem; + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(new_obj)); + break; + + case SCTP_CMD_PEER_INIT: + /* Process a unified INIT from the peer. + * Note: Only used during INIT-ACK processing. If + * there is an error just return to the outter + * layer which will bail. + */ + error = sctp_cmd_process_init(commands, asoc, chunk, + cmd->obj.ptr, gfp); + break; + + case SCTP_CMD_GEN_COOKIE_ECHO: + /* Generate a COOKIE ECHO chunk. 
*/ + new_obj = sctp_make_cookie_echo(asoc, chunk); + if (!new_obj) { + if (cmd->obj.ptr) + sctp_chunk_free(cmd->obj.ptr); + goto nomem; + } + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(new_obj)); + + /* If there is an ERROR chunk to be sent along with + * the COOKIE_ECHO, send it, too. + */ + if (cmd->obj.ptr) + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(cmd->obj.ptr)); + + if (new_obj->transport) { + new_obj->transport->init_sent_count++; + asoc->init_last_sent_to = new_obj->transport; + } + + /* FIXME - Eventually come up with a cleaner way to + * enabling COOKIE-ECHO + DATA bundling during + * multihoming stale cookie scenarios, the following + * command plays with asoc->peer.retran_path to + * avoid the problem of sending the COOKIE-ECHO and + * DATA in different paths, which could result + * in the association being ABORTed if the DATA chunk + * is processed first by the server. Checking the + * init error counter simply causes this command + * to be executed only during failed attempts of + * association establishment. + */ + if ((asoc->peer.retran_path != + asoc->peer.primary_path) && + (asoc->init_err_counter > 0)) { + sctp_add_cmd_sf(commands, + SCTP_CMD_FORCE_PRIM_RETRAN, + SCTP_NULL()); + } + + break; + + case SCTP_CMD_GEN_SHUTDOWN: + /* Generate SHUTDOWN when in SHUTDOWN_SENT state. + * Reset error counts. + */ + asoc->overall_error_count = 0; + + /* Generate a SHUTDOWN chunk. */ + new_obj = sctp_make_shutdown(asoc, chunk); + if (!new_obj) + goto nomem; + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(new_obj)); + break; + + case SCTP_CMD_CHUNK_ULP: + /* Send a chunk to the sockets layer. */ + SCTP_DEBUG_PRINTK("sm_sideff: %s %p, %s %p.\n", + "chunk_up:", cmd->obj.ptr, + "ulpq:", &asoc->ulpq); + sctp_ulpq_tail_data(&asoc->ulpq, cmd->obj.ptr, + GFP_ATOMIC); + break; + + case SCTP_CMD_EVENT_ULP: + /* Send a notification to the sockets layer. */ + SCTP_DEBUG_PRINTK("sm_sideff: %s %p, %s %p.\n", + "event_up:",cmd->obj.ptr, + "ulpq:",&asoc->ulpq); + sctp_ulpq_tail_event(&asoc->ulpq, cmd->obj.ptr); + break; + + case SCTP_CMD_REPLY: + /* If an caller has not already corked, do cork. */ + if (!asoc->outqueue.cork) { + sctp_outq_cork(&asoc->outqueue); + local_cork = 1; + } + /* Send a chunk to our peer. */ + error = sctp_outq_tail(&asoc->outqueue, cmd->obj.ptr); + break; + + case SCTP_CMD_SEND_PKT: + /* Send a full packet to our peer. */ + packet = cmd->obj.ptr; + sctp_packet_transmit(packet); + sctp_ootb_pkt_free(packet); + break; + + case SCTP_CMD_T1_RETRAN: + /* Mark a transport for retransmission. */ + sctp_retransmit(&asoc->outqueue, cmd->obj.transport, + SCTP_RTXR_T1_RTX); + break; + + case SCTP_CMD_RETRAN: + /* Mark a transport for retransmission. */ + sctp_retransmit(&asoc->outqueue, cmd->obj.transport, + SCTP_RTXR_T3_RTX); + break; + + case SCTP_CMD_ECN_CE: + /* Do delayed CE processing. */ + sctp_do_ecn_ce_work(asoc, cmd->obj.u32); + break; + + case SCTP_CMD_ECN_ECNE: + /* Do delayed ECNE processing. */ + new_obj = sctp_do_ecn_ecne_work(asoc, cmd->obj.u32, + chunk); + if (new_obj) + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(new_obj)); + break; + + case SCTP_CMD_ECN_CWR: + /* Do delayed CWR processing. 
*/ + sctp_do_ecn_cwr_work(asoc, cmd->obj.u32); + break; + + case SCTP_CMD_SETUP_T2: + sctp_cmd_setup_t2(commands, asoc, cmd->obj.ptr); + break; + + case SCTP_CMD_TIMER_START_ONCE: + timer = &asoc->timers[cmd->obj.to]; + + if (timer_pending(timer)) + break; + /* fall through */ + + case SCTP_CMD_TIMER_START: + timer = &asoc->timers[cmd->obj.to]; + timeout = asoc->timeouts[cmd->obj.to]; + BUG_ON(!timeout); + + timer->expires = jiffies + timeout; + sctp_association_hold(asoc); + add_timer(timer); + break; + + case SCTP_CMD_TIMER_RESTART: + timer = &asoc->timers[cmd->obj.to]; + timeout = asoc->timeouts[cmd->obj.to]; + if (!mod_timer(timer, jiffies + timeout)) + sctp_association_hold(asoc); + break; + + case SCTP_CMD_TIMER_STOP: + timer = &asoc->timers[cmd->obj.to]; + if (timer_pending(timer) && del_timer(timer)) + sctp_association_put(asoc); + break; + + case SCTP_CMD_INIT_CHOOSE_TRANSPORT: + chunk = cmd->obj.ptr; + t = sctp_assoc_choose_alter_transport(asoc, + asoc->init_last_sent_to); + asoc->init_last_sent_to = t; + chunk->transport = t; + t->init_sent_count++; + /* Set the new transport as primary */ + sctp_assoc_set_primary(asoc, t); + break; + + case SCTP_CMD_INIT_RESTART: + /* Do the needed accounting and updates + * associated with restarting an initialization + * timer. Only multiply the timeout by two if + * all transports have been tried at the current + * timeout. + */ + sctp_cmd_t1_timer_update(asoc, + SCTP_EVENT_TIMEOUT_T1_INIT, + "INIT"); + + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); + break; + + case SCTP_CMD_COOKIEECHO_RESTART: + /* Do the needed accounting and updates + * associated with restarting an initialization + * timer. Only multiply the timeout by two if + * all transports have been tried at the current + * timeout. + */ + sctp_cmd_t1_timer_update(asoc, + SCTP_EVENT_TIMEOUT_T1_COOKIE, + "COOKIE"); + + /* If we've sent any data bundled with + * COOKIE-ECHO we need to resend. + */ + list_for_each_entry(t, &asoc->peer.transport_addr_list, + transports) { + sctp_retransmit_mark(&asoc->outqueue, t, + SCTP_RTXR_T1_RTX); + } + + sctp_add_cmd_sf(commands, + SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE)); + break; + + case SCTP_CMD_INIT_FAILED: + sctp_cmd_init_failed(commands, asoc, cmd->obj.err); + break; + + case SCTP_CMD_ASSOC_FAILED: + sctp_cmd_assoc_failed(commands, asoc, event_type, + subtype, chunk, cmd->obj.err); + break; + + case SCTP_CMD_INIT_COUNTER_INC: + asoc->init_err_counter++; + break; + + case SCTP_CMD_INIT_COUNTER_RESET: + asoc->init_err_counter = 0; + asoc->init_cycle = 0; + list_for_each_entry(t, &asoc->peer.transport_addr_list, + transports) { + t->init_sent_count = 0; + } + break; + + case SCTP_CMD_REPORT_DUP: + sctp_tsnmap_mark_dup(&asoc->peer.tsn_map, + cmd->obj.u32); + break; + + case SCTP_CMD_REPORT_BAD_TAG: + SCTP_DEBUG_PRINTK("vtag mismatch!\n"); + break; + + case SCTP_CMD_STRIKE: + /* Mark one strike against a transport. 
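+ *
+ * (Worked example of the RTO backoff the strike applies, assuming an
+ * RTO of 3s and an rto_max of 60s: successive strikes double it to
+ * 6s, 12s, 24s, 48s, then clamp at 60s.)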
*/ + sctp_do_8_2_transport_strike(asoc, cmd->obj.transport, + 0); + break; + + case SCTP_CMD_TRANSPORT_IDLE: + t = cmd->obj.transport; + sctp_transport_lower_cwnd(t, SCTP_LOWER_CWND_INACTIVE); + break; + + case SCTP_CMD_TRANSPORT_HB_SENT: + t = cmd->obj.transport; + sctp_do_8_2_transport_strike(asoc, t, 1); + t->hb_sent = 1; + break; + + case SCTP_CMD_TRANSPORT_ON: + t = cmd->obj.transport; + sctp_cmd_transport_on(commands, asoc, t, chunk); + break; + + case SCTP_CMD_HB_TIMERS_START: + sctp_cmd_hb_timers_start(commands, asoc); + break; + + case SCTP_CMD_HB_TIMER_UPDATE: + t = cmd->obj.transport; + sctp_cmd_hb_timer_update(commands, t); + break; + + case SCTP_CMD_HB_TIMERS_STOP: + sctp_cmd_hb_timers_stop(commands, asoc); + break; + + case SCTP_CMD_REPORT_ERROR: + error = cmd->obj.error; + break; + + case SCTP_CMD_PROCESS_CTSN: + /* Dummy up a SACK for processing. */ + sackh.cum_tsn_ack = cmd->obj.be32; + sackh.a_rwnd = asoc->peer.rwnd + + asoc->outqueue.outstanding_bytes; + sackh.num_gap_ack_blocks = 0; + sackh.num_dup_tsns = 0; + sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_SACK, + SCTP_SACKH(&sackh)); + break; + + case SCTP_CMD_DISCARD_PACKET: + /* We need to discard the whole packet. + * Uncork the queue since there might be + * responses pending + */ + chunk->pdiscard = 1; + if (asoc) { + sctp_outq_uncork(&asoc->outqueue); + local_cork = 0; + } + break; + + case SCTP_CMD_RTO_PENDING: + t = cmd->obj.transport; + t->rto_pending = 1; + break; + + case SCTP_CMD_PART_DELIVER: + sctp_ulpq_partial_delivery(&asoc->ulpq, cmd->obj.ptr, + GFP_ATOMIC); + break; + + case SCTP_CMD_RENEGE: + sctp_ulpq_renege(&asoc->ulpq, cmd->obj.ptr, + GFP_ATOMIC); + break; + + case SCTP_CMD_SETUP_T4: + sctp_cmd_setup_t4(commands, asoc, cmd->obj.ptr); + break; + + case SCTP_CMD_PROCESS_OPERR: + sctp_cmd_process_operr(commands, asoc, chunk); + break; + case SCTP_CMD_CLEAR_INIT_TAG: + asoc->peer.i.init_tag = 0; + break; + case SCTP_CMD_DEL_NON_PRIMARY: + sctp_cmd_del_non_primary(asoc); + break; + case SCTP_CMD_T3_RTX_TIMERS_STOP: + sctp_cmd_t3_rtx_timers_stop(commands, asoc); + break; + case SCTP_CMD_FORCE_PRIM_RETRAN: + t = asoc->peer.retran_path; + asoc->peer.retran_path = asoc->peer.primary_path; + error = sctp_outq_uncork(&asoc->outqueue); + local_cork = 0; + asoc->peer.retran_path = t; + break; + case SCTP_CMD_SET_SK_ERR: + sctp_cmd_set_sk_err(asoc, cmd->obj.error); + break; + case SCTP_CMD_ASSOC_CHANGE: + sctp_cmd_assoc_change(commands, asoc, + cmd->obj.u8); + break; + case SCTP_CMD_ADAPTATION_IND: + sctp_cmd_adaptation_ind(commands, asoc); + break; + + case SCTP_CMD_ASSOC_SHKEY: + error = sctp_auth_asoc_init_active_key(asoc, + GFP_ATOMIC); + break; + case SCTP_CMD_UPDATE_INITTAG: + asoc->peer.i.init_tag = cmd->obj.u32; + break; + case SCTP_CMD_SEND_MSG: + if (!asoc->outqueue.cork) { + sctp_outq_cork(&asoc->outqueue); + local_cork = 1; + } + error = sctp_cmd_send_msg(asoc, cmd->obj.msg); + break; + case SCTP_CMD_SEND_NEXT_ASCONF: + sctp_cmd_send_asconf(asoc); + break; + case SCTP_CMD_PURGE_ASCONF_QUEUE: + sctp_asconf_queue_teardown(asoc); + break; + default: + pr_warn("Impossible command: %u, %p\n", + cmd->verb, cmd->obj.ptr); + break; + } + + if (error) + break; + } + +out: + /* If this is in response to a received chunk, wait until + * we are done with the packet to open the queue so that we don't + * send multiple packets in response to a single request. 
+ */ + if (asoc && SCTP_EVENT_T_CHUNK == event_type && chunk) { + if (chunk->end_of_packet || chunk->singleton) + error = sctp_outq_uncork(&asoc->outqueue); + } else if (local_cork) + error = sctp_outq_uncork(&asoc->outqueue); + return error; +nomem: + error = -ENOMEM; + goto out; +} + diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c new file mode 100644 index 00000000..24611714 --- /dev/null +++ b/net/sctp/sm_statefuns.c @@ -0,0 +1,6161 @@ +/* SCTP kernel implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001-2002 Intel Corp. + * Copyright (c) 2002 Nokia Corp. + * + * This is part of the SCTP Linux Kernel Implementation. + * + * These are the state functions for the state machine. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Mathew Kotowsky + * Sridhar Samudrala + * Jon Grimm + * Hui Huang + * Dajiang Zhang + * Daisy Chang + * Ardelle Fan + * Ryan Layer + * Kevin Gao + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct sctp_packet *sctp_abort_pkt_new(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + struct sctp_chunk *chunk, + const void *payload, + size_t paylen); +static int sctp_eat_data(const struct sctp_association *asoc, + struct sctp_chunk *chunk, + sctp_cmd_seq_t *commands); +static struct sctp_packet *sctp_ootb_pkt_new(const struct sctp_association *asoc, + const struct sctp_chunk *chunk); +static void sctp_send_stale_cookie_err(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + sctp_cmd_seq_t *commands, + struct sctp_chunk *err_chunk); +static sctp_disposition_t sctp_sf_do_5_2_6_stale(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands); +static sctp_disposition_t sctp_sf_shut_8_4_5(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands); +static sctp_disposition_t sctp_sf_tabort_8_4_8(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands); +static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk); + +static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands, + __be16 error, int sk_err, + const struct sctp_association *asoc, + struct sctp_transport *transport); + +static sctp_disposition_t sctp_sf_abort_violation( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + void *arg, + sctp_cmd_seq_t *commands, + const __u8 *payload, + const size_t paylen); + +static sctp_disposition_t sctp_sf_violation_chunklen( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands); + +static sctp_disposition_t sctp_sf_violation_paramlen( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, void *ext, + sctp_cmd_seq_t *commands); + +static sctp_disposition_t sctp_sf_violation_ctsn( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands); + +static sctp_disposition_t sctp_sf_violation_chunk( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands); + +static sctp_ierror_t sctp_sf_authenticate(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + struct sctp_chunk *chunk); + +static sctp_disposition_t __sctp_sf_do_9_1_abort(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands); + +/* Small helper function that checks if the chunk length + * is of the appropriate length. The 'required_length' argument + * is set to be the size of a specific chunk we are testing. 
+ * Return Values: 1 = Valid length + * 0 = Invalid length + * + */ +static inline int +sctp_chunk_length_valid(struct sctp_chunk *chunk, + __u16 required_length) +{ + __u16 chunk_length = ntohs(chunk->chunk_hdr->length); + + if (unlikely(chunk_length < required_length)) + return 0; + + return 1; +} + +/********************************************************** + * These are the state functions for handling chunk events. + **********************************************************/ + +/* + * Process the final SHUTDOWN COMPLETE. + * + * Section: 4 (C) (diagram), 9.2 + * Upon reception of the SHUTDOWN COMPLETE chunk the endpoint will verify + * that it is in SHUTDOWN-ACK-SENT state, if it is not the chunk should be + * discarded. If the endpoint is in the SHUTDOWN-ACK-SENT state the endpoint + * should stop the T2-shutdown timer and remove all knowledge of the + * association (and thus the association enters the CLOSED state). + * + * Verification Tag: 8.5.1(C), sctpimpguide 2.41. + * C) Rules for packet carrying SHUTDOWN COMPLETE: + * ... + * - The receiver of a SHUTDOWN COMPLETE shall accept the packet + * if the Verification Tag field of the packet matches its own tag and + * the T bit is not set + * OR + * it is set to its peer's tag and the T bit is set in the Chunk + * Flags. + * Otherwise, the receiver MUST silently discard the packet + * and take no further action. An endpoint MUST ignore the + * SHUTDOWN COMPLETE if it is not in the SHUTDOWN-ACK-SENT state. + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_4_C(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + struct sctp_ulpevent *ev; + + if (!sctp_vtag_verify_either(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* RFC 2960 6.10 Bundling + * + * An endpoint MUST NOT bundle INIT, INIT ACK or + * SHUTDOWN COMPLETE with any other chunks. + */ + if (!chunk->singleton) + return sctp_sf_violation_chunk(ep, asoc, type, arg, commands); + + /* Make sure that the SHUTDOWN_COMPLETE chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + /* RFC 2960 10.2 SCTP-to-ULP + * + * H) SHUTDOWN COMPLETE notification + * + * When SCTP completes the shutdown procedures (section 9.2) this + * notification is passed to the upper layer. + */ + ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_SHUTDOWN_COMP, + 0, 0, 0, NULL, GFP_ATOMIC); + if (ev) + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(ev)); + + /* Upon reception of the SHUTDOWN COMPLETE chunk the endpoint + * will verify that it is in SHUTDOWN-ACK-SENT state, if it is + * not the chunk should be discarded. If the endpoint is in + * the SHUTDOWN-ACK-SENT state the endpoint should stop the + * T2-shutdown timer and remove all knowledge of the + * association (and thus the association enters the CLOSED + * state). 
+ */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); + + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD)); + + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_CLOSED)); + + SCTP_INC_STATS(SCTP_MIB_SHUTDOWNS); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + + sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); + + return SCTP_DISPOSITION_DELETE_TCB; +} + +/* + * Respond to a normal INIT chunk. + * We are the side that is being asked for an association. + * + * Section: 5.1 Normal Establishment of an Association, B + * B) "Z" shall respond immediately with an INIT ACK chunk. The + * destination IP address of the INIT ACK MUST be set to the source + * IP address of the INIT to which this INIT ACK is responding. In + * the response, besides filling in other parameters, "Z" must set the + * Verification Tag field to Tag_A, and also provide its own + * Verification Tag (Tag_Z) in the Initiate Tag field. + * + * Verification Tag: Must be 0. + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_5_1B_init(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + struct sctp_chunk *repl; + struct sctp_association *new_asoc; + struct sctp_chunk *err_chunk; + struct sctp_packet *packet; + sctp_unrecognized_param_t *unk_param; + int len; + + /* 6.10 Bundling + * An endpoint MUST NOT bundle INIT, INIT ACK or + * SHUTDOWN COMPLETE with any other chunks. + * + * IG Section 2.11.2 + * Furthermore, we require that the receiver of an INIT chunk MUST + * enforce these rules by silently discarding an arriving packet + * with an INIT chunk that is bundled with other chunks. + */ + if (!chunk->singleton) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* If the packet is an OOTB packet which is temporarily on the + * control endpoint, respond with an ABORT. + */ + if (ep == sctp_sk((sctp_get_ctl_sock()))->ep) { + SCTP_INC_STATS(SCTP_MIB_OUTOFBLUES); + return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, commands); + } + + /* 3.1 A packet containing an INIT chunk MUST have a zero Verification + * Tag. + */ + if (chunk->sctp_hdr->vtag != 0) + return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, commands); + + /* Make sure that the INIT chunk has a valid length. + * Normally, this would cause an ABORT with a Protocol Violation + * error, but since we don't have an association, we'll + * just discard the packet. + */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_init_chunk_t))) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* If the INIT is coming toward a closing socket, we'll send back + * an ABORT. Essentially, this catches the race of INIT being + * backlogged to the socket at the same time as the user issues close(). + * Since the socket and all its associations are going away, we + * can treat this as OOTB. + */ + if (sctp_sstate(ep->base.sk, CLOSING)) + return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, commands); + + /* Verify the INIT chunk before processing it. */ + err_chunk = NULL; + if (!sctp_verify_init(asoc, chunk->chunk_hdr->type, + (sctp_init_chunk_t *)chunk->chunk_hdr, chunk, + &err_chunk)) { + /* This chunk contains a fatal error. It is to be discarded. + * Send an ABORT, with causes if there are any.
+ */ + if (err_chunk) { + packet = sctp_abort_pkt_new(ep, asoc, arg, + (__u8 *)(err_chunk->chunk_hdr) + + sizeof(sctp_chunkhdr_t), + ntohs(err_chunk->chunk_hdr->length) - + sizeof(sctp_chunkhdr_t)); + + sctp_chunk_free(err_chunk); + + if (packet) { + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, + SCTP_PACKET(packet)); + SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); + return SCTP_DISPOSITION_CONSUME; + } else { + return SCTP_DISPOSITION_NOMEM; + } + } else { + return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, + commands); + } + } + + /* Grab the INIT header. */ + chunk->subh.init_hdr = (sctp_inithdr_t *)chunk->skb->data; + + /* Tag the variable length parameters. */ + chunk->param_hdr.v = skb_pull(chunk->skb, sizeof(sctp_inithdr_t)); + + new_asoc = sctp_make_temp_asoc(ep, chunk, GFP_ATOMIC); + if (!new_asoc) + goto nomem; + + if (sctp_assoc_set_bind_addr_from_ep(new_asoc, + sctp_scope(sctp_source(chunk)), + GFP_ATOMIC) < 0) + goto nomem_init; + + /* The call, sctp_process_init(), can fail on memory allocation. */ + if (!sctp_process_init(new_asoc, chunk, sctp_source(chunk), + (sctp_init_chunk_t *)chunk->chunk_hdr, + GFP_ATOMIC)) + goto nomem_init; + + /* B) "Z" shall respond immediately with an INIT ACK chunk. */ + + /* If there are errors need to be reported for unknown parameters, + * make sure to reserve enough room in the INIT ACK for them. + */ + len = 0; + if (err_chunk) + len = ntohs(err_chunk->chunk_hdr->length) - + sizeof(sctp_chunkhdr_t); + + repl = sctp_make_init_ack(new_asoc, chunk, GFP_ATOMIC, len); + if (!repl) + goto nomem_init; + + /* If there are errors need to be reported for unknown parameters, + * include them in the outgoing INIT ACK as "Unrecognized parameter" + * parameter. + */ + if (err_chunk) { + /* Get the "Unrecognized parameter" parameter(s) out of the + * ERROR chunk generated by sctp_verify_init(). Since the + * error cause code for "unknown parameter" and the + * "Unrecognized parameter" type is the same, we can + * construct the parameters in INIT ACK by copying the + * ERROR causes over. + */ + unk_param = (sctp_unrecognized_param_t *) + ((__u8 *)(err_chunk->chunk_hdr) + + sizeof(sctp_chunkhdr_t)); + /* Replace the cause code with the "Unrecognized parameter" + * parameter type. + */ + sctp_addto_chunk(repl, len, unk_param); + sctp_chunk_free(err_chunk); + } + + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc)); + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + + /* + * Note: After sending out INIT ACK with the State Cookie parameter, + * "Z" MUST NOT allocate any resources, nor keep any states for the + * new association. Otherwise, "Z" will be vulnerable to resource + * attacks. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); + + return SCTP_DISPOSITION_DELETE_TCB; + +nomem_init: + sctp_association_free(new_asoc); +nomem: + if (err_chunk) + sctp_chunk_free(err_chunk); + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Respond to a normal INIT ACK chunk. + * We are the side that is initiating the association. + * + * Section: 5.1 Normal Establishment of an Association, C + * C) Upon reception of the INIT ACK from "Z", "A" shall stop the T1-init + * timer and leave COOKIE-WAIT state. "A" shall then send the State + * Cookie received in the INIT ACK chunk in a COOKIE ECHO chunk, start + * the T1-cookie timer, and enter the COOKIE-ECHOED state. 
+ * + * Note: The COOKIE ECHO chunk can be bundled with any pending outbound + * DATA chunks, but it MUST be the first chunk in the packet and + * until the COOKIE ACK is returned the sender MUST NOT send any + * other packets to the peer. + * + * Verification Tag: 3.3.3 + * If the value of the Initiate Tag in a received INIT ACK chunk is + * found to be 0, the receiver MUST treat it as an error and close the + * association by transmitting an ABORT. + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_5_1C_ack(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + sctp_init_chunk_t *initchunk; + struct sctp_chunk *err_chunk; + struct sctp_packet *packet; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* 6.10 Bundling + * An endpoint MUST NOT bundle INIT, INIT ACK or + * SHUTDOWN COMPLETE with any other chunks. + */ + if (!chunk->singleton) + return sctp_sf_violation_chunk(ep, asoc, type, arg, commands); + + /* Make sure that the INIT-ACK chunk has a valid length */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_initack_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + /* Grab the INIT header. */ + chunk->subh.init_hdr = (sctp_inithdr_t *) chunk->skb->data; + + /* Verify the INIT chunk before processing it. */ + err_chunk = NULL; + if (!sctp_verify_init(asoc, chunk->chunk_hdr->type, + (sctp_init_chunk_t *)chunk->chunk_hdr, chunk, + &err_chunk)) { + + sctp_error_t error = SCTP_ERROR_NO_RESOURCE; + + /* This chunk contains fatal error. It is to be discarded. + * Send an ABORT, with causes. If there are no causes, + * then there wasn't enough memory. Just terminate + * the association. + */ + if (err_chunk) { + packet = sctp_abort_pkt_new(ep, asoc, arg, + (__u8 *)(err_chunk->chunk_hdr) + + sizeof(sctp_chunkhdr_t), + ntohs(err_chunk->chunk_hdr->length) - + sizeof(sctp_chunkhdr_t)); + + sctp_chunk_free(err_chunk); + + if (packet) { + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, + SCTP_PACKET(packet)); + SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); + error = SCTP_ERROR_INV_PARAM; + } + } + + /* SCTP-AUTH, Section 6.3: + * It should be noted that if the receiver wants to tear + * down an association in an authenticated way only, the + * handling of malformed packets should not result in + * tearing down the association. + * + * This means that if we only want to abort associations + * in an authenticated way (i.e AUTH+ABORT), then we + * can't destroy this association just because the packet + * was malformed. + */ + if (sctp_auth_recv_cid(SCTP_CID_ABORT, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + return sctp_stop_t1_and_abort(commands, error, ECONNREFUSED, + asoc, chunk->transport); + } + + /* Tag the variable length parameters. Note that we never + * convert the parameters in an INIT chunk. + */ + chunk->param_hdr.v = skb_pull(chunk->skb, sizeof(sctp_inithdr_t)); + + initchunk = (sctp_init_chunk_t *) chunk->chunk_hdr; + + sctp_add_cmd_sf(commands, SCTP_CMD_PEER_INIT, + SCTP_PEER_INIT(initchunk)); + + /* Reset init error count upon receipt of INIT-ACK. 
*/ + sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL()); + + /* 5.1 C) "A" shall stop the T1-init timer and leave + * COOKIE-WAIT state. "A" shall then ... start the T1-cookie + * timer, and enter the COOKIE-ECHOED state. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE)); + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_COOKIE_ECHOED)); + + /* SCTP-AUTH: generate the association shared keys so that + * we can potentially sign the COOKIE-ECHO. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_SHKEY, SCTP_NULL()); + + /* 5.1 C) "A" shall then send the State Cookie received in the + * INIT ACK chunk in a COOKIE ECHO chunk, ... + */ + /* If there are any errors to report, send the ERROR chunk generated + * for unknown parameters as well. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_GEN_COOKIE_ECHO, + SCTP_CHUNK(err_chunk)); + + return SCTP_DISPOSITION_CONSUME; +} + +/* + * Respond to a normal COOKIE ECHO chunk. + * We are the side that is being asked for an association. + * + * Section: 5.1 Normal Establishment of an Association, D + * D) Upon reception of the COOKIE ECHO chunk, Endpoint "Z" will reply + * with a COOKIE ACK chunk after building a TCB and moving to + * the ESTABLISHED state. A COOKIE ACK chunk may be bundled with + * any pending DATA chunks (and/or SACK chunks), but the COOKIE ACK + * chunk MUST be the first chunk in the packet. + * + * IMPLEMENTATION NOTE: An implementation may choose to send the + * Communication Up notification to the SCTP user upon reception + * of a valid COOKIE ECHO chunk. + * + * Verification Tag: 8.5.1 Exceptions in Verification Tag Rules + * D) Rules for packet carrying a COOKIE ECHO + * + * - When sending a COOKIE ECHO, the endpoint MUST use the value of the + * Initial Tag received in the INIT ACK. + * + * - The receiver of a COOKIE ECHO follows the procedures in Section 5. + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_5_1D_ce(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + struct sctp_association *new_asoc; + sctp_init_chunk_t *peer_init; + struct sctp_chunk *repl; + struct sctp_ulpevent *ev, *ai_ev = NULL; + int error = 0; + struct sctp_chunk *err_chk_p; + struct sock *sk; + + /* If the packet is an OOTB packet which is temporarily on the + * control endpoint, respond with an ABORT. + */ + if (ep == sctp_sk((sctp_get_ctl_sock()))->ep) { + SCTP_INC_STATS(SCTP_MIB_OUTOFBLUES); + return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, commands); + } + + /* Make sure that the COOKIE_ECHO chunk has a valid length. + * In this case, we check that we have enough for at least a + * chunk header. More detailed verification is done + * in sctp_unpack_cookie(). + */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t))) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* If the endpoint is not listening or if the number of associations + * on the TCP-style socket exceeds the max backlog, respond with an + * ABORT.
+ */ + sk = ep->base.sk; + if (!sctp_sstate(sk, LISTENING) || + (sctp_style(sk, TCP) && sk_acceptq_is_full(sk))) + return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, commands); + + /* "Decode" the chunk. We have no optional parameters so we + * are in good shape. + */ + chunk->subh.cookie_hdr = + (struct sctp_signed_cookie *)chunk->skb->data; + if (!pskb_pull(chunk->skb, ntohs(chunk->chunk_hdr->length) - + sizeof(sctp_chunkhdr_t))) + goto nomem; + + /* 5.1 D) Upon reception of the COOKIE ECHO chunk, Endpoint + * "Z" will reply with a COOKIE ACK chunk after building a TCB + * and moving to the ESTABLISHED state. + */ + new_asoc = sctp_unpack_cookie(ep, asoc, chunk, GFP_ATOMIC, &error, + &err_chk_p); + + /* FIXME: + * If the re-build failed, what is the proper error path + * from here? + * + * [We should abort the association. --piggy] + */ + if (!new_asoc) { + /* FIXME: Several errors are possible. A bad cookie should + * be silently discarded, but think about logging it too. + */ + switch (error) { + case -SCTP_IERROR_NOMEM: + goto nomem; + + case -SCTP_IERROR_STALE_COOKIE: + sctp_send_stale_cookie_err(ep, asoc, chunk, commands, + err_chk_p); + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + case -SCTP_IERROR_BAD_SIG: + default: + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + } + } + + + /* Delay state machine commands until later. + * + * Re-building the bind address for the association is done in + * sctp_unpack_cookie() already. + */ + /* This is a brand-new association, so these are not yet side + * effects--it is safe to run them here. + */ + peer_init = &chunk->subh.cookie_hdr->c.peer_init[0]; + + if (!sctp_process_init(new_asoc, chunk, + &chunk->subh.cookie_hdr->c.peer_addr, + peer_init, GFP_ATOMIC)) + goto nomem_init; + + /* SCTP-AUTH: Now that we've populated the required fields in + * sctp_process_init, set up the association shared keys as + * necessary so that we can potentially authenticate the ACK + */ + error = sctp_auth_asoc_init_active_key(new_asoc, GFP_ATOMIC); + if (error) + goto nomem_init; + + /* SCTP-AUTH: auth_chunk pointer is only set when the cookie-echo + * is supposed to be authenticated and we have to do delayed + * authentication. We've just recreated the association using + * the information in the cookie and now it's much easier to + * do the authentication. + */ + if (chunk->auth_chunk) { + struct sctp_chunk auth; + sctp_ierror_t ret; + + /* set-up our fake chunk so that we can process it */ + auth.skb = chunk->auth_chunk; + auth.asoc = chunk->asoc; + auth.sctp_hdr = chunk->sctp_hdr; + auth.chunk_hdr = (sctp_chunkhdr_t *)skb_push(chunk->auth_chunk, + sizeof(sctp_chunkhdr_t)); + skb_pull(chunk->auth_chunk, sizeof(sctp_chunkhdr_t)); + auth.transport = chunk->transport; + + ret = sctp_sf_authenticate(ep, new_asoc, type, &auth); + + /* We can now safely free the auth_chunk clone */ + kfree_skb(chunk->auth_chunk); + + if (ret != SCTP_IERROR_NO_ERROR) { + sctp_association_free(new_asoc); + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + } + } + + repl = sctp_make_cookie_ack(new_asoc, chunk); + if (!repl) + goto nomem_init; + + /* RFC 2960 5.1 Normal Establishment of an Association + * + * D) IMPLEMENTATION NOTE: An implementation may choose to + * send the Communication Up notification to the SCTP user + * upon reception of a valid COOKIE ECHO chunk.
+ */ + ev = sctp_ulpevent_make_assoc_change(new_asoc, 0, SCTP_COMM_UP, 0, + new_asoc->c.sinit_num_ostreams, + new_asoc->c.sinit_max_instreams, + NULL, GFP_ATOMIC); + if (!ev) + goto nomem_ev; + + /* Sockets API Draft Section 5.3.1.6 + * When a peer sends an Adaptation Layer Indication parameter, SCTP + * delivers this notification to inform the application of the + * peer's requested adaptation layer. + */ + if (new_asoc->peer.adaptation_ind) { + ai_ev = sctp_ulpevent_make_adaptation_indication(new_asoc, + GFP_ATOMIC); + if (!ai_ev) + goto nomem_aiev; + } + + /* Add all the state machine commands now since we've created + * everything. This way we don't introduce memory corruptions + * during side-effect processing and correctly count established + * associations. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc)); + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_ESTABLISHED)); + SCTP_INC_STATS(SCTP_MIB_CURRESTAB); + SCTP_INC_STATS(SCTP_MIB_PASSIVEESTABS); + sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL()); + + if (new_asoc->autoclose) + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START, + SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); + + /* This will send the COOKIE ACK */ + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + + /* Queue the ASSOC_CHANGE event */ + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); + + /* Send up the Adaptation Layer Indication event */ + if (ai_ev) + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(ai_ev)); + + return SCTP_DISPOSITION_CONSUME; + +nomem_aiev: + sctp_ulpevent_free(ev); +nomem_ev: + sctp_chunk_free(repl); +nomem_init: + sctp_association_free(new_asoc); +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Respond to a normal COOKIE ACK chunk. + * We are the side that initiated the association. + * + * RFC 2960 5.1 Normal Establishment of an Association + * + * E) Upon reception of the COOKIE ACK, endpoint "A" will move from the + * COOKIE-ECHOED state to the ESTABLISHED state, stopping the T1-cookie + * timer. It may also notify its ULP about the successful + * establishment of the association with a Communication Up + * notification (see Section 10). + * + * Verification Tag: + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_5_1E_ca(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + struct sctp_ulpevent *ev; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Verify that the chunk length for the COOKIE-ACK is OK. + * If we don't do this, any bundled chunks may be junked. + */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + /* Reset init error count upon receipt of COOKIE-ACK, + * to avoid problems with the management of this + * counter in stale cookie situations when a transition back + * from the COOKIE-ECHOED state to the COOKIE-WAIT + * state is performed.
+ */ + sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL()); + + /* RFC 2960 5.1 Normal Establishment of an Association + * + * E) Upon reception of the COOKIE ACK, endpoint "A" will move + * from the COOKIE-ECHOED state to the ESTABLISHED state, + * stopping the T1-cookie timer. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE)); + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_ESTABLISHED)); + SCTP_INC_STATS(SCTP_MIB_CURRESTAB); + SCTP_INC_STATS(SCTP_MIB_ACTIVEESTABS); + sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL()); + if (asoc->autoclose) + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START, + SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); + + /* It may also notify its ULP about the successful + * establishment of the association with a Communication Up + * notification (see Section 10). + */ + ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_UP, + 0, asoc->c.sinit_num_ostreams, + asoc->c.sinit_max_instreams, + NULL, GFP_ATOMIC); + + if (!ev) + goto nomem; + + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); + + /* Sockets API Draft Section 5.3.1.6 + * When a peer sends a Adaptation Layer Indication parameter , SCTP + * delivers this notification to inform the application that of the + * peers requested adaptation layer. + */ + if (asoc->peer.adaptation_ind) { + ev = sctp_ulpevent_make_adaptation_indication(asoc, GFP_ATOMIC); + if (!ev) + goto nomem; + + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(ev)); + } + + return SCTP_DISPOSITION_CONSUME; +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* Generate and sendout a heartbeat packet. */ +static sctp_disposition_t sctp_sf_heartbeat(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_transport *transport = (struct sctp_transport *) arg; + struct sctp_chunk *reply; + + /* Send a heartbeat to our peer. */ + reply = sctp_make_heartbeat(asoc, transport); + if (!reply) + return SCTP_DISPOSITION_NOMEM; + + /* Set rto_pending indicating that an RTT measurement + * is started with this heartbeat chunk. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_RTO_PENDING, + SCTP_TRANSPORT(transport)); + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply)); + return SCTP_DISPOSITION_CONSUME; +} + +/* Generate a HEARTBEAT packet on the given transport. */ +sctp_disposition_t sctp_sf_sendbeat_8_3(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_transport *transport = (struct sctp_transport *) arg; + + if (asoc->overall_error_count >= asoc->max_retrans) { + sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, + SCTP_ERROR(ETIMEDOUT)); + /* CMD_ASSOC_FAILED calls CMD_DELETE_TCB. */ + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_PERR(SCTP_ERROR_NO_ERROR)); + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + return SCTP_DISPOSITION_DELETE_TCB; + } + + /* Section 3.3.5. + * The Sender-specific Heartbeat Info field should normally include + * information about the sender's current time when this HEARTBEAT + * chunk is sent and the destination transport address to which this + * HEARTBEAT is sent (see Section 8.3). 
+ */ + + if (transport->param_flags & SPP_HB_ENABLE) { + if (SCTP_DISPOSITION_NOMEM == + sctp_sf_heartbeat(ep, asoc, type, arg, + commands)) + return SCTP_DISPOSITION_NOMEM; + + /* Set transport error counter and association error counter + * when sending heartbeat. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_HB_SENT, + SCTP_TRANSPORT(transport)); + } + sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_IDLE, + SCTP_TRANSPORT(transport)); + sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMER_UPDATE, + SCTP_TRANSPORT(transport)); + + return SCTP_DISPOSITION_CONSUME; +} + +/* + * Process an heartbeat request. + * + * Section: 8.3 Path Heartbeat + * The receiver of the HEARTBEAT should immediately respond with a + * HEARTBEAT ACK that contains the Heartbeat Information field copied + * from the received HEARTBEAT chunk. + * + * Verification Tag: 8.5 Verification Tag [Normal verification] + * When receiving an SCTP packet, the endpoint MUST ensure that the + * value in the Verification Tag field of the received SCTP packet + * matches its own Tag. If the received Verification Tag value does not + * match the receiver's own tag value, the receiver shall silently + * discard the packet and shall not process it any further except for + * those cases listed in Section 8.5.1 below. + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_beat_8_3(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + struct sctp_chunk *reply; + size_t paylen = 0; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the HEARTBEAT chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_heartbeat_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + /* 8.3 The receiver of the HEARTBEAT should immediately + * respond with a HEARTBEAT ACK that contains the Heartbeat + * Information field copied from the received HEARTBEAT chunk. + */ + chunk->subh.hb_hdr = (sctp_heartbeathdr_t *) chunk->skb->data; + paylen = ntohs(chunk->chunk_hdr->length) - sizeof(sctp_chunkhdr_t); + if (!pskb_pull(chunk->skb, paylen)) + goto nomem; + + reply = sctp_make_heartbeat_ack(asoc, chunk, + chunk->subh.hb_hdr, paylen); + if (!reply) + goto nomem; + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply)); + return SCTP_DISPOSITION_CONSUME; + +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Process the returning HEARTBEAT ACK. + * + * Section: 8.3 Path Heartbeat + * Upon the receipt of the HEARTBEAT ACK, the sender of the HEARTBEAT + * should clear the error counter of the destination transport + * address to which the HEARTBEAT was sent, and mark the destination + * transport address as active if it is not so marked. The endpoint may + * optionally report to the upper layer when an inactive destination + * address is marked as active due to the reception of the latest + * HEARTBEAT ACK. The receiver of the HEARTBEAT ACK must also + * clear the association overall error count as well (as defined + * in section 8.1). + * + * The receiver of the HEARTBEAT ACK should also perform an RTT + * measurement for that destination transport address using the time + * value carried in the HEARTBEAT ACK chunk. 
+ * + * Verification Tag: 8.5 Verification Tag [Normal verification] + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + union sctp_addr from_addr; + struct sctp_transport *link; + sctp_sender_hb_info_t *hbinfo; + unsigned long max_interval; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the HEARTBEAT-ACK chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t) + + sizeof(sctp_sender_hb_info_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + hbinfo = (sctp_sender_hb_info_t *) chunk->skb->data; + /* Make sure that the length of the parameter is what we expect */ + if (ntohs(hbinfo->param_hdr.length) != + sizeof(sctp_sender_hb_info_t)) { + return SCTP_DISPOSITION_DISCARD; + } + + from_addr = hbinfo->daddr; + link = sctp_assoc_lookup_paddr(asoc, &from_addr); + + /* This should never happen, but lets log it if so. */ + if (unlikely(!link)) { + if (from_addr.sa.sa_family == AF_INET6) { + if (net_ratelimit()) + pr_warn("%s association %p could not find address %pI6\n", + __func__, + asoc, + &from_addr.v6.sin6_addr); + } else { + if (net_ratelimit()) + pr_warn("%s association %p could not find address %pI4\n", + __func__, + asoc, + &from_addr.v4.sin_addr.s_addr); + } + return SCTP_DISPOSITION_DISCARD; + } + + /* Validate the 64-bit random nonce. */ + if (hbinfo->hb_nonce != link->hb_nonce) + return SCTP_DISPOSITION_DISCARD; + + max_interval = link->hbinterval + link->rto; + + /* Check if the timestamp looks valid. */ + if (time_after(hbinfo->sent_at, jiffies) || + time_after(jiffies, hbinfo->sent_at + max_interval)) { + SCTP_DEBUG_PRINTK("%s: HEARTBEAT ACK with invalid timestamp " + "received for transport: %p\n", + __func__, link); + return SCTP_DISPOSITION_DISCARD; + } + + /* 8.3 Upon the receipt of the HEARTBEAT ACK, the sender of + * the HEARTBEAT should clear the error counter of the + * destination transport address to which the HEARTBEAT was + * sent and mark the destination transport address as active if + * it is not so marked. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_ON, SCTP_TRANSPORT(link)); + + return SCTP_DISPOSITION_CONSUME; +} + +/* Helper function to send out an abort for the restart + * condition. + */ +static int sctp_sf_send_restart_abort(union sctp_addr *ssa, + struct sctp_chunk *init, + sctp_cmd_seq_t *commands) +{ + int len; + struct sctp_packet *pkt; + union sctp_addr_param *addrparm; + struct sctp_errhdr *errhdr; + struct sctp_endpoint *ep; + char buffer[sizeof(struct sctp_errhdr)+sizeof(union sctp_addr_param)]; + struct sctp_af *af = sctp_get_af_specific(ssa->v4.sin_family); + + /* Build the error on the stack. We are way to malloc crazy + * throughout the code today. + */ + errhdr = (struct sctp_errhdr *)buffer; + addrparm = (union sctp_addr_param *)errhdr->variable; + + /* Copy into a parm format. */ + len = af->to_addr_param(ssa, addrparm); + len += sizeof(sctp_errhdr_t); + + errhdr->cause = SCTP_ERROR_RESTART; + errhdr->length = htons(len); + + /* Assign to the control socket. 
*/ + ep = sctp_sk((sctp_get_ctl_sock()))->ep; + + /* Association is NULL since this may be a restart attack and we + * want to send back the attacker's vtag. + */ + pkt = sctp_abort_pkt_new(ep, NULL, init, errhdr, len); + + if (!pkt) + goto out; + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, SCTP_PACKET(pkt)); + + SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); + + /* Discard the rest of the inbound packet. */ + sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET, SCTP_NULL()); + +out: + /* Even if there is no memory, treat as a failure so + * the packet will get dropped. + */ + return 0; +} + +static bool list_has_sctp_addr(const struct list_head *list, + union sctp_addr *ipaddr) +{ + struct sctp_transport *addr; + + list_for_each_entry(addr, list, transports) { + if (sctp_cmp_addr_exact(ipaddr, &addr->ipaddr)) + return true; + } + + return false; +} +/* A restart is occurring, check to make sure no new addresses + * are being added as we may be under a takeover attack. + */ +static int sctp_sf_check_restart_addrs(const struct sctp_association *new_asoc, + const struct sctp_association *asoc, + struct sctp_chunk *init, + sctp_cmd_seq_t *commands) +{ + struct sctp_transport *new_addr; + int ret = 1; + + /* Implementor's Guide - Section 5.2.2 + * ... + * Before responding the endpoint MUST check to see if the + * unexpected INIT adds new addresses to the association. If new + * addresses are added to the association, the endpoint MUST respond + * with an ABORT.. + */ + + /* Search through all current addresses and make sure + * we aren't adding any new ones. + */ + list_for_each_entry(new_addr, &new_asoc->peer.transport_addr_list, + transports) { + if (!list_has_sctp_addr(&asoc->peer.transport_addr_list, + &new_addr->ipaddr)) { + sctp_sf_send_restart_abort(&new_addr->ipaddr, init, + commands); + ret = 0; + break; + } + } + + /* Return success if all addresses were found. */ + return ret; +} + +/* Populate the verification/tie tags based on overlapping INIT + * scenario. + * + * Note: Do not use in CLOSED or SHUTDOWN-ACK-SENT state. + */ +static void sctp_tietags_populate(struct sctp_association *new_asoc, + const struct sctp_association *asoc) +{ + switch (asoc->state) { + + /* 5.2.1 INIT received in COOKIE-WAIT or COOKIE-ECHOED State */ + + case SCTP_STATE_COOKIE_WAIT: + new_asoc->c.my_vtag = asoc->c.my_vtag; + new_asoc->c.my_ttag = asoc->c.my_vtag; + new_asoc->c.peer_ttag = 0; + break; + + case SCTP_STATE_COOKIE_ECHOED: + new_asoc->c.my_vtag = asoc->c.my_vtag; + new_asoc->c.my_ttag = asoc->c.my_vtag; + new_asoc->c.peer_ttag = asoc->c.peer_vtag; + break; + + /* 5.2.2 Unexpected INIT in States Other than CLOSED, COOKIE-ECHOED, + * COOKIE-WAIT and SHUTDOWN-ACK-SENT + */ + default: + new_asoc->c.my_ttag = asoc->c.my_vtag; + new_asoc->c.peer_ttag = asoc->c.peer_vtag; + break; + } + + /* Other parameters for the endpoint SHOULD be copied from the + * existing parameters of the association (e.g. number of + * outbound streams) into the INIT ACK and cookie. + */ + new_asoc->rwnd = asoc->rwnd; + new_asoc->c.sinit_num_ostreams = asoc->c.sinit_num_ostreams; + new_asoc->c.sinit_max_instreams = asoc->c.sinit_max_instreams; + new_asoc->c.initial_tsn = asoc->c.initial_tsn; +} + +/* + * Compare vtag/tietag values to determine unexpected COOKIE-ECHO + * handling action. + * + * RFC 2960 5.2.4 Handle a COOKIE ECHO when a TCB exists. + * + * Returns value representing action to be taken. These action values + * correspond to Action/Description values in RFC 2960, Table 2. 
+ */ +static char sctp_tietags_compare(struct sctp_association *new_asoc, + const struct sctp_association *asoc) +{ + /* In this case, the peer may have restarted. */ + if ((asoc->c.my_vtag != new_asoc->c.my_vtag) && + (asoc->c.peer_vtag != new_asoc->c.peer_vtag) && + (asoc->c.my_vtag == new_asoc->c.my_ttag) && + (asoc->c.peer_vtag == new_asoc->c.peer_ttag)) + return 'A'; + + /* Collision case B. */ + if ((asoc->c.my_vtag == new_asoc->c.my_vtag) && + ((asoc->c.peer_vtag != new_asoc->c.peer_vtag) || + (0 == asoc->c.peer_vtag))) { + return 'B'; + } + + /* Collision case D. */ + if ((asoc->c.my_vtag == new_asoc->c.my_vtag) && + (asoc->c.peer_vtag == new_asoc->c.peer_vtag)) + return 'D'; + + /* Collision case C. */ + if ((asoc->c.my_vtag != new_asoc->c.my_vtag) && + (asoc->c.peer_vtag == new_asoc->c.peer_vtag) && + (0 == new_asoc->c.my_ttag) && + (0 == new_asoc->c.peer_ttag)) + return 'C'; + + /* No match to any of the special cases; discard this packet. */ + return 'E'; +} + +/* Common helper routine for both duplicate and simulataneous INIT + * chunk handling. + */ +static sctp_disposition_t sctp_sf_do_unexpected_init( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, sctp_cmd_seq_t *commands) +{ + sctp_disposition_t retval; + struct sctp_chunk *chunk = arg; + struct sctp_chunk *repl; + struct sctp_association *new_asoc; + struct sctp_chunk *err_chunk; + struct sctp_packet *packet; + sctp_unrecognized_param_t *unk_param; + int len; + + /* 6.10 Bundling + * An endpoint MUST NOT bundle INIT, INIT ACK or + * SHUTDOWN COMPLETE with any other chunks. + * + * IG Section 2.11.2 + * Furthermore, we require that the receiver of an INIT chunk MUST + * enforce these rules by silently discarding an arriving packet + * with an INIT chunk that is bundled with other chunks. + */ + if (!chunk->singleton) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* 3.1 A packet containing an INIT chunk MUST have a zero Verification + * Tag. + */ + if (chunk->sctp_hdr->vtag != 0) + return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, commands); + + /* Make sure that the INIT chunk has a valid length. + * In this case, we generate a protocol violation since we have + * an association established. + */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_init_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + /* Grab the INIT header. */ + chunk->subh.init_hdr = (sctp_inithdr_t *) chunk->skb->data; + + /* Tag the variable length parameters. */ + chunk->param_hdr.v = skb_pull(chunk->skb, sizeof(sctp_inithdr_t)); + + /* Verify the INIT chunk before processing it. */ + err_chunk = NULL; + if (!sctp_verify_init(asoc, chunk->chunk_hdr->type, + (sctp_init_chunk_t *)chunk->chunk_hdr, chunk, + &err_chunk)) { + /* This chunk contains fatal error. It is to be discarded. + * Send an ABORT, with causes if there is any. 
+ */ + if (err_chunk) { + packet = sctp_abort_pkt_new(ep, asoc, arg, + (__u8 *)(err_chunk->chunk_hdr) + + sizeof(sctp_chunkhdr_t), + ntohs(err_chunk->chunk_hdr->length) - + sizeof(sctp_chunkhdr_t)); + + if (packet) { + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, + SCTP_PACKET(packet)); + SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); + retval = SCTP_DISPOSITION_CONSUME; + } else { + retval = SCTP_DISPOSITION_NOMEM; + } + goto cleanup; + } else { + return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, + commands); + } + } + + /* + * Other parameters for the endpoint SHOULD be copied from the + * existing parameters of the association (e.g. number of + * outbound streams) into the INIT ACK and cookie. + * FIXME: We are copying parameters from the endpoint not the + * association. + */ + new_asoc = sctp_make_temp_asoc(ep, chunk, GFP_ATOMIC); + if (!new_asoc) + goto nomem; + + if (sctp_assoc_set_bind_addr_from_ep(new_asoc, + sctp_scope(sctp_source(chunk)), GFP_ATOMIC) < 0) + goto nomem; + + /* In the outbound INIT ACK the endpoint MUST copy its current + * Verification Tag and Peers Verification tag into a reserved + * place (local tie-tag and per tie-tag) within the state cookie. + */ + if (!sctp_process_init(new_asoc, chunk, sctp_source(chunk), + (sctp_init_chunk_t *)chunk->chunk_hdr, + GFP_ATOMIC)) + goto nomem; + + /* Make sure no new addresses are being added during the + * restart. Do not do this check for COOKIE-WAIT state, + * since there are no peer addresses to check against. + * Upon return an ABORT will have been sent if needed. + */ + if (!sctp_state(asoc, COOKIE_WAIT)) { + if (!sctp_sf_check_restart_addrs(new_asoc, asoc, chunk, + commands)) { + retval = SCTP_DISPOSITION_CONSUME; + goto nomem_retval; + } + } + + sctp_tietags_populate(new_asoc, asoc); + + /* B) "Z" shall respond immediately with an INIT ACK chunk. */ + + /* If there are errors need to be reported for unknown parameters, + * make sure to reserve enough room in the INIT ACK for them. + */ + len = 0; + if (err_chunk) { + len = ntohs(err_chunk->chunk_hdr->length) - + sizeof(sctp_chunkhdr_t); + } + + repl = sctp_make_init_ack(new_asoc, chunk, GFP_ATOMIC, len); + if (!repl) + goto nomem; + + /* If there are errors need to be reported for unknown parameters, + * include them in the outgoing INIT ACK as "Unrecognized parameter" + * parameter. + */ + if (err_chunk) { + /* Get the "Unrecognized parameter" parameter(s) out of the + * ERROR chunk generated by sctp_verify_init(). Since the + * error cause code for "unknown parameter" and the + * "Unrecognized parameter" type is the same, we can + * construct the parameters in INIT ACK by copying the + * ERROR causes over. + */ + unk_param = (sctp_unrecognized_param_t *) + ((__u8 *)(err_chunk->chunk_hdr) + + sizeof(sctp_chunkhdr_t)); + /* Replace the cause code with the "Unrecognized parameter" + * parameter type. + */ + sctp_addto_chunk(repl, len, unk_param); + } + + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc)); + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + + /* + * Note: After sending out INIT ACK with the State Cookie parameter, + * "Z" MUST NOT allocate any resources for this new association. + * Otherwise, "Z" will be vulnerable to resource attacks. 
+ */ + sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); + retval = SCTP_DISPOSITION_CONSUME; + + return retval; + +nomem: + retval = SCTP_DISPOSITION_NOMEM; +nomem_retval: + if (new_asoc) + sctp_association_free(new_asoc); +cleanup: + if (err_chunk) + sctp_chunk_free(err_chunk); + return retval; +} + +/* + * Handle simultaneous INIT. + * This means we started an INIT and then we got an INIT request from + * our peer. + * + * Section: 5.2.1 INIT received in COOKIE-WAIT or COOKIE-ECHOED State (Item B) + * This usually indicates an initialization collision, i.e., each + * endpoint is attempting, at about the same time, to establish an + * association with the other endpoint. + * + * Upon receipt of an INIT in the COOKIE-WAIT or COOKIE-ECHOED state, an + * endpoint MUST respond with an INIT ACK using the same parameters it + * sent in its original INIT chunk (including its Verification Tag, + * unchanged). These original parameters are combined with those from the + * newly received INIT chunk. The endpoint shall also generate a State + * Cookie with the INIT ACK. The endpoint uses the parameters sent in its + * INIT to calculate the State Cookie. + * + * After that, the endpoint MUST NOT change its state, the T1-init + * timer shall be left running and the corresponding TCB MUST NOT be + * destroyed. The normal procedures for handling State Cookies when + * a TCB exists will resolve the duplicate INITs to a single association. + * + * For an endpoint that is in the COOKIE-ECHOED state it MUST populate + * its Tie-Tags with the Tag information of itself and its peer (see + * section 5.2.2 for a description of the Tie-Tags). + * + * Verification Tag: Not explicit, but an INIT can not have a valid + * verification tag, so we skip the check. + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_5_2_1_siminit(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + /* Call helper to do the real work for both simulataneous and + * duplicate INIT chunk handling. + */ + return sctp_sf_do_unexpected_init(ep, asoc, type, arg, commands); +} + +/* + * Handle duplicated INIT messages. These are usually delayed + * restransmissions. + * + * Section: 5.2.2 Unexpected INIT in States Other than CLOSED, + * COOKIE-ECHOED and COOKIE-WAIT + * + * Unless otherwise stated, upon reception of an unexpected INIT for + * this association, the endpoint shall generate an INIT ACK with a + * State Cookie. In the outbound INIT ACK the endpoint MUST copy its + * current Verification Tag and peer's Verification Tag into a reserved + * place within the state cookie. We shall refer to these locations as + * the Peer's-Tie-Tag and the Local-Tie-Tag. The outbound SCTP packet + * containing this INIT ACK MUST carry a Verification Tag value equal to + * the Initiation Tag found in the unexpected INIT. And the INIT ACK + * MUST contain a new Initiation Tag (randomly generated see Section + * 5.3.1). Other parameters for the endpoint SHOULD be copied from the + * existing parameters of the association (e.g. number of outbound + * streams) into the INIT ACK and cookie. + * + * After sending out the INIT ACK, the endpoint shall take no further + * actions, i.e., the existing association, including its current state, + * and the corresponding TCB MUST NOT be changed. 
+ * + * Note: Only when a TCB exists and the association is not in a COOKIE- + * WAIT state are the Tie-Tags populated. For a normal association INIT + * (i.e. the endpoint is in a COOKIE-WAIT state), the Tie-Tags MUST be + * set to 0 (indicating that no previous TCB existed). The INIT ACK and + * State Cookie are populated as specified in section 5.2.1. + * + * Verification Tag: Not specified, but an INIT has no way of knowing + * what the verification tag could be, so we ignore it. + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_5_2_2_dupinit(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + /* Call helper to do the real work for both simulataneous and + * duplicate INIT chunk handling. + */ + return sctp_sf_do_unexpected_init(ep, asoc, type, arg, commands); +} + + +/* + * Unexpected INIT-ACK handler. + * + * Section 5.2.3 + * If an INIT ACK received by an endpoint in any state other than the + * COOKIE-WAIT state, the endpoint should discard the INIT ACK chunk. + * An unexpected INIT ACK usually indicates the processing of an old or + * duplicated INIT chunk. +*/ +sctp_disposition_t sctp_sf_do_5_2_3_initack(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, sctp_cmd_seq_t *commands) +{ + /* Per the above section, we'll discard the chunk if we have an + * endpoint. If this is an OOTB INIT-ACK, treat it as such. + */ + if (ep == sctp_sk((sctp_get_ctl_sock()))->ep) + return sctp_sf_ootb(ep, asoc, type, arg, commands); + else + return sctp_sf_discard_chunk(ep, asoc, type, arg, commands); +} + +/* Unexpected COOKIE-ECHO handler for peer restart (Table 2, action 'A') + * + * Section 5.2.4 + * A) In this case, the peer may have restarted. + */ +static sctp_disposition_t sctp_sf_do_dupcook_a(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + struct sctp_chunk *chunk, + sctp_cmd_seq_t *commands, + struct sctp_association *new_asoc) +{ + sctp_init_chunk_t *peer_init; + struct sctp_ulpevent *ev; + struct sctp_chunk *repl; + struct sctp_chunk *err; + sctp_disposition_t disposition; + + /* new_asoc is a brand-new association, so these are not yet + * side effects--it is safe to run them here. + */ + peer_init = &chunk->subh.cookie_hdr->c.peer_init[0]; + + if (!sctp_process_init(new_asoc, chunk, sctp_source(chunk), peer_init, + GFP_ATOMIC)) + goto nomem; + + /* Make sure no new addresses are being added during the + * restart. Though this is a pretty complicated attack + * since you'd have to get inside the cookie. + */ + if (!sctp_sf_check_restart_addrs(new_asoc, asoc, chunk, commands)) { + return SCTP_DISPOSITION_CONSUME; + } + + /* If the endpoint is in the SHUTDOWN-ACK-SENT state and recognizes + * the peer has restarted (Action A), it MUST NOT setup a new + * association but instead resend the SHUTDOWN ACK and send an ERROR + * chunk with a "Cookie Received while Shutting Down" error cause to + * its peer. 
+ */ + if (sctp_state(asoc, SHUTDOWN_ACK_SENT)) { + disposition = sctp_sf_do_9_2_reshutack(ep, asoc, + SCTP_ST_CHUNK(chunk->chunk_hdr->type), + chunk, commands); + if (SCTP_DISPOSITION_NOMEM == disposition) + goto nomem; + + err = sctp_make_op_error(asoc, chunk, + SCTP_ERROR_COOKIE_IN_SHUTDOWN, + NULL, 0, 0); + if (err) + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(err)); + + return SCTP_DISPOSITION_CONSUME; + } + + /* For now, stop pending T3-rtx and SACK timers, fail any unsent/unacked + * data. Consider the optional choice of resending of this data. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_T3_RTX_TIMERS_STOP, SCTP_NULL()); + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); + sctp_add_cmd_sf(commands, SCTP_CMD_PURGE_OUTQUEUE, SCTP_NULL()); + + /* Stop pending T4-rto timer, teardown ASCONF queue, ASCONF-ACK queue + * and ASCONF-ACK cache. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); + sctp_add_cmd_sf(commands, SCTP_CMD_PURGE_ASCONF_QUEUE, SCTP_NULL()); + + repl = sctp_make_cookie_ack(new_asoc, chunk); + if (!repl) + goto nomem; + + /* Report association restart to upper layer. */ + ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_RESTART, 0, + new_asoc->c.sinit_num_ostreams, + new_asoc->c.sinit_max_instreams, + NULL, GFP_ATOMIC); + if (!ev) + goto nomem_ev; + + /* Update the content of current association. */ + sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc)); + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); + return SCTP_DISPOSITION_CONSUME; + +nomem_ev: + sctp_chunk_free(repl); +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* Unexpected COOKIE-ECHO handler for setup collision (Table 2, action 'B') + * + * Section 5.2.4 + * B) In this case, both sides may be attempting to start an association + * at about the same time but the peer endpoint started its INIT + * after responding to the local endpoint's INIT + */ +/* This case represents an initialization collision. */ +static sctp_disposition_t sctp_sf_do_dupcook_b(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + struct sctp_chunk *chunk, + sctp_cmd_seq_t *commands, + struct sctp_association *new_asoc) +{ + sctp_init_chunk_t *peer_init; + struct sctp_chunk *repl; + + /* new_asoc is a brand-new association, so these are not yet + * side effects--it is safe to run them here. + */ + peer_init = &chunk->subh.cookie_hdr->c.peer_init[0]; + if (!sctp_process_init(new_asoc, chunk, sctp_source(chunk), peer_init, + GFP_ATOMIC)) + goto nomem; + + /* Update the content of current association. */ + sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc)); + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_ESTABLISHED)); + SCTP_INC_STATS(SCTP_MIB_CURRESTAB); + sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL()); + + repl = sctp_make_cookie_ack(new_asoc, chunk); + if (!repl) + goto nomem; + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + + /* RFC 2960 5.1 Normal Establishment of an Association + * + * D) IMPLEMENTATION NOTE: An implementation may choose to + * send the Communication Up notification to the SCTP user + * upon reception of a valid COOKIE ECHO chunk. 
+ * + * Sadly, this needs to be implemented as a side-effect, because + * we are not guaranteed to have set the association id of the real + * association and so these notifications need to be delayed until + * the association id is allocated. + */ + + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_CHANGE, SCTP_U8(SCTP_COMM_UP)); + + /* Sockets API Draft Section 5.3.1.6 + * When a peer sends a Adaptation Layer Indication parameter , SCTP + * delivers this notification to inform the application that of the + * peers requested adaptation layer. + * + * This also needs to be done as a side effect for the same reason as + * above. + */ + if (asoc->peer.adaptation_ind) + sctp_add_cmd_sf(commands, SCTP_CMD_ADAPTATION_IND, SCTP_NULL()); + + return SCTP_DISPOSITION_CONSUME; + +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* Unexpected COOKIE-ECHO handler for setup collision (Table 2, action 'C') + * + * Section 5.2.4 + * C) In this case, the local endpoint's cookie has arrived late. + * Before it arrived, the local endpoint sent an INIT and received an + * INIT-ACK and finally sent a COOKIE ECHO with the peer's same tag + * but a new tag of its own. + */ +/* This case represents an initialization collision. */ +static sctp_disposition_t sctp_sf_do_dupcook_c(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + struct sctp_chunk *chunk, + sctp_cmd_seq_t *commands, + struct sctp_association *new_asoc) +{ + /* The cookie should be silently discarded. + * The endpoint SHOULD NOT change states and should leave + * any timers running. + */ + return SCTP_DISPOSITION_DISCARD; +} + +/* Unexpected COOKIE-ECHO handler lost chunk (Table 2, action 'D') + * + * Section 5.2.4 + * + * D) When both local and remote tags match the endpoint should always + * enter the ESTABLISHED state, if it has not already done so. + */ +/* This case represents an initialization collision. */ +static sctp_disposition_t sctp_sf_do_dupcook_d(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + struct sctp_chunk *chunk, + sctp_cmd_seq_t *commands, + struct sctp_association *new_asoc) +{ + struct sctp_ulpevent *ev = NULL, *ai_ev = NULL; + struct sctp_chunk *repl; + + /* Clarification from Implementor's Guide: + * D) When both local and remote tags match the endpoint should + * enter the ESTABLISHED state, if it is in the COOKIE-ECHOED state. + * It should stop any cookie timer that may be running and send + * a COOKIE ACK. + */ + + /* Don't accidentally move back into established state. */ + if (asoc->state < SCTP_STATE_ESTABLISHED) { + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE)); + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_ESTABLISHED)); + SCTP_INC_STATS(SCTP_MIB_CURRESTAB); + sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, + SCTP_NULL()); + + /* RFC 2960 5.1 Normal Establishment of an Association + * + * D) IMPLEMENTATION NOTE: An implementation may choose + * to send the Communication Up notification to the + * SCTP user upon reception of a valid COOKIE + * ECHO chunk. + */ + ev = sctp_ulpevent_make_assoc_change(asoc, 0, + SCTP_COMM_UP, 0, + asoc->c.sinit_num_ostreams, + asoc->c.sinit_max_instreams, + NULL, GFP_ATOMIC); + if (!ev) + goto nomem; + + /* Sockets API Draft Section 5.3.1.6 + * When a peer sends a Adaptation Layer Indication parameter, + * SCTP delivers this notification to inform the application + * that of the peers requested adaptation layer. 
+ */ + if (asoc->peer.adaptation_ind) { + ai_ev = sctp_ulpevent_make_adaptation_indication(asoc, + GFP_ATOMIC); + if (!ai_ev) + goto nomem; + + } + } + + repl = sctp_make_cookie_ack(new_asoc, chunk); + if (!repl) + goto nomem; + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + + if (ev) + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(ev)); + if (ai_ev) + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(ai_ev)); + + return SCTP_DISPOSITION_CONSUME; + +nomem: + if (ai_ev) + sctp_ulpevent_free(ai_ev); + if (ev) + sctp_ulpevent_free(ev); + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Handle a duplicate COOKIE-ECHO. This usually means a cookie-carrying + * chunk was retransmitted and then delayed in the network. + * + * Section: 5.2.4 Handle a COOKIE ECHO when a TCB exists + * + * Verification Tag: None. Do cookie validation. + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_5_2_4_dupcook(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + sctp_disposition_t retval; + struct sctp_chunk *chunk = arg; + struct sctp_association *new_asoc; + int error = 0; + char action; + struct sctp_chunk *err_chk_p; + + /* Make sure that the chunk has a valid length from the protocol + * perspective. In this case check to make sure we have at least + * enough for the chunk header. Cookie length verification is + * done later. + */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + /* "Decode" the chunk. We have no optional parameters so we + * are in good shape. + */ + chunk->subh.cookie_hdr = (struct sctp_signed_cookie *)chunk->skb->data; + if (!pskb_pull(chunk->skb, ntohs(chunk->chunk_hdr->length) - + sizeof(sctp_chunkhdr_t))) + goto nomem; + + /* In RFC 2960 5.2.4 3, if both Verification Tags in the State Cookie + * of a duplicate COOKIE ECHO match the Verification Tags of the + * current association, consider the State Cookie valid even if + * the lifespan is exceeded. + */ + new_asoc = sctp_unpack_cookie(ep, asoc, chunk, GFP_ATOMIC, &error, + &err_chk_p); + + /* FIXME: + * If the re-build failed, what is the proper error path + * from here? + * + * [We should abort the association. --piggy] + */ + if (!new_asoc) { + /* FIXME: Several errors are possible. A bad cookie should + * be silently discarded, but think about logging it too. + */ + switch (error) { + case -SCTP_IERROR_NOMEM: + goto nomem; + + case -SCTP_IERROR_STALE_COOKIE: + sctp_send_stale_cookie_err(ep, asoc, chunk, commands, + err_chk_p); + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + case -SCTP_IERROR_BAD_SIG: + default: + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + } + } + + /* Compare the tie_tag in cookie with the verification tag of + * current association. + */ + action = sctp_tietags_compare(new_asoc, asoc); + + switch (action) { + case 'A': /* Association restart. */ + retval = sctp_sf_do_dupcook_a(ep, asoc, chunk, commands, + new_asoc); + break; + + case 'B': /* Collision case B. */ + retval = sctp_sf_do_dupcook_b(ep, asoc, chunk, commands, + new_asoc); + break; + + case 'C': /* Collision case C. */ + retval = sctp_sf_do_dupcook_c(ep, asoc, chunk, commands, + new_asoc); + break; + + case 'D': /* Collision case D. 
*/ + retval = sctp_sf_do_dupcook_d(ep, asoc, chunk, commands, + new_asoc); + break; + + default: /* Discard packet for all others. */ + retval = sctp_sf_pdiscard(ep, asoc, type, arg, commands); + break; + } + + /* Delete the tempory new association. */ + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc)); + sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); + + return retval; + +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Process an ABORT. (SHUTDOWN-PENDING state) + * + * See sctp_sf_do_9_1_abort(). + */ +sctp_disposition_t sctp_sf_shutdown_pending_abort( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + + if (!sctp_vtag_verify_either(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the ABORT chunk has a valid length. + * Since this is an ABORT chunk, we have to discard it + * because of the following text: + * RFC 2960, Section 3.3.7 + * If an endpoint receives an ABORT with a format error or for an + * association that doesn't exist, it MUST silently discard it. + * Because the length is "invalid", we can't really discard just + * as we do not know its true length. So, to be safe, discard the + * packet. + */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t))) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* ADD-IP: Special case for ABORT chunks + * F4) One special consideration is that ABORT Chunks arriving + * destined to the IP address being deleted MUST be + * ignored (see Section 5.3.1 for further details). + */ + if (SCTP_ADDR_DEL == + sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) + return sctp_sf_discard_chunk(ep, asoc, type, arg, commands); + + return __sctp_sf_do_9_1_abort(ep, asoc, type, arg, commands); +} + +/* + * Process an ABORT. (SHUTDOWN-SENT state) + * + * See sctp_sf_do_9_1_abort(). + */ +sctp_disposition_t sctp_sf_shutdown_sent_abort(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + + if (!sctp_vtag_verify_either(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the ABORT chunk has a valid length. + * Since this is an ABORT chunk, we have to discard it + * because of the following text: + * RFC 2960, Section 3.3.7 + * If an endpoint receives an ABORT with a format error or for an + * association that doesn't exist, it MUST silently discard it. + * Because the length is "invalid", we can't really discard just + * as we do not know its true length. So, to be safe, discard the + * packet. + */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t))) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* ADD-IP: Special case for ABORT chunks + * F4) One special consideration is that ABORT Chunks arriving + * destined to the IP address being deleted MUST be + * ignored (see Section 5.3.1 for further details). + */ + if (SCTP_ADDR_DEL == + sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) + return sctp_sf_discard_chunk(ep, asoc, type, arg, commands); + + /* Stop the T2-shutdown timer. */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); + + /* Stop the T5-shutdown guard timer. 
*/ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD)); + + return __sctp_sf_do_9_1_abort(ep, asoc, type, arg, commands); +} + +/* + * Process an ABORT. (SHUTDOWN-ACK-SENT state) + * + * See sctp_sf_do_9_1_abort(). + */ +sctp_disposition_t sctp_sf_shutdown_ack_sent_abort( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + /* The same T2 timer, so we should be able to use + * common function with the SHUTDOWN-SENT state. + */ + return sctp_sf_shutdown_sent_abort(ep, asoc, type, arg, commands); +} + +/* + * Handle an Error received in COOKIE_ECHOED state. + * + * Only handle the error type of stale COOKIE Error, the other errors will + * be ignored. + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_cookie_echoed_err(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + sctp_errhdr_t *err; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the ERROR chunk has a valid length. + * The parameter walking depends on this as well. + */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_operr_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + /* Process the error here */ + /* FUTURE FIXME: When PR-SCTP related and other optional + * parms are emitted, this will have to change to handle multiple + * errors. + */ + sctp_walk_errors(err, chunk->chunk_hdr) { + if (SCTP_ERROR_STALE_COOKIE == err->cause) + return sctp_sf_do_5_2_6_stale(ep, asoc, type, + arg, commands); + } + + /* It is possible to have malformed error causes, and that + * will cause us to end the walk early. However, since + * we are discarding the packet, there should be no adverse + * affects. + */ + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); +} + +/* + * Handle a Stale COOKIE Error + * + * Section: 5.2.6 Handle Stale COOKIE Error + * If the association is in the COOKIE-ECHOED state, the endpoint may elect + * one of the following three alternatives. + * ... + * 3) Send a new INIT chunk to the endpoint, adding a Cookie + * Preservative parameter requesting an extension to the lifetime of + * the State Cookie. When calculating the time extension, an + * implementation SHOULD use the RTT information measured based on the + * previous COOKIE ECHO / ERROR exchange, and should add no more + * than 1 second beyond the measured RTT, due to long State Cookie + * lifetimes making the endpoint more subject to a replay attack. + * + * Verification Tag: Not explicit, but safe to ignore. + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. 
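Illustrative aside (not part of the patch): the stale-cookie handler that follows converts the reported Measure of Staleness from microseconds into a Suggested Cookie Life-span Increment in milliseconds, doubling it as the comment above recommends. A minimal user-space sketch of that arithmetic, with a helper name of my own choosing:

#include <stdint.h>
#include <stdio.h>

/* Mirrors the conversion done by the handler below: staleness arrives in
 * microseconds, the Cookie Preservative parameter carries a life-span
 * increment in milliseconds, and the value is doubled to leave time for
 * the retransmitted cookie.
 */
static uint32_t cookie_lifespan_increment_ms(uint32_t staleness_usec)
{
	return (staleness_usec * 2) / 1000;
}

int main(void)
{
	/* A cookie reported 150000 usec stale asks for a 300 ms extension. */
	printf("%u ms\n", (unsigned)cookie_lifespan_increment_ms(150000));
	return 0;
}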
+ */ +static sctp_disposition_t sctp_sf_do_5_2_6_stale(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + time_t stale; + sctp_cookie_preserve_param_t bht; + sctp_errhdr_t *err; + struct sctp_chunk *reply; + struct sctp_bind_addr *bp; + int attempts = asoc->init_err_counter + 1; + + if (attempts > asoc->max_init_attempts) { + sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, + SCTP_ERROR(ETIMEDOUT)); + sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED, + SCTP_PERR(SCTP_ERROR_STALE_COOKIE)); + return SCTP_DISPOSITION_DELETE_TCB; + } + + err = (sctp_errhdr_t *)(chunk->skb->data); + + /* When calculating the time extension, an implementation + * SHOULD use the RTT information measured based on the + * previous COOKIE ECHO / ERROR exchange, and should add no + * more than 1 second beyond the measured RTT, due to long + * State Cookie lifetimes making the endpoint more subject to + * a replay attack. + * Measure of Staleness's unit is usec. (1/1000000 sec) + * Suggested Cookie Life-span Increment's unit is msec. + * (1/1000 sec) + * In general, if you use the suggested cookie life, the value + * found in the field of measure of staleness should be doubled + * to give ample time to retransmit the new cookie and thus + * yield a higher probability of success on the reattempt. + */ + stale = ntohl(*(__be32 *)((u8 *)err + sizeof(sctp_errhdr_t))); + stale = (stale * 2) / 1000; + + bht.param_hdr.type = SCTP_PARAM_COOKIE_PRESERVATIVE; + bht.param_hdr.length = htons(sizeof(bht)); + bht.lifespan_increment = htonl(stale); + + /* Build that new INIT chunk. */ + bp = (struct sctp_bind_addr *) &asoc->base.bind_addr; + reply = sctp_make_init(asoc, bp, GFP_ATOMIC, sizeof(bht)); + if (!reply) + goto nomem; + + sctp_addto_chunk(reply, sizeof(bht), &bht); + + /* Clear peer's init_tag cached in assoc as we are sending a new INIT */ + sctp_add_cmd_sf(commands, SCTP_CMD_CLEAR_INIT_TAG, SCTP_NULL()); + + /* Stop pending T3-rtx and heartbeat timers */ + sctp_add_cmd_sf(commands, SCTP_CMD_T3_RTX_TIMERS_STOP, SCTP_NULL()); + sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_STOP, SCTP_NULL()); + + /* Delete non-primary peer ip addresses since we are transitioning + * back to the COOKIE-WAIT state + */ + sctp_add_cmd_sf(commands, SCTP_CMD_DEL_NON_PRIMARY, SCTP_NULL()); + + /* If we've sent any data bundled with COOKIE-ECHO we will need to + * resend + */ + sctp_add_cmd_sf(commands, SCTP_CMD_T1_RETRAN, + SCTP_TRANSPORT(asoc->peer.primary_path)); + + /* Cast away the const modifier, as we want to just + * rerun it through as a sideffect. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_INC, SCTP_NULL()); + + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE)); + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_COOKIE_WAIT)); + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply)); + + return SCTP_DISPOSITION_CONSUME; + +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Process an ABORT. + * + * Section: 9.1 + * After checking the Verification Tag, the receiving endpoint shall + * remove the association from its record, and shall report the + * termination to its upper layer. 
+ * + * Verification Tag: 8.5.1 Exceptions in Verification Tag Rules + * B) Rules for packet carrying ABORT: + * + * - The endpoint shall always fill in the Verification Tag field of the + * outbound packet with the destination endpoint's tag value if it + * is known. + * + * - If the ABORT is sent in response to an OOTB packet, the endpoint + * MUST follow the procedure described in Section 8.4. + * + * - The receiver MUST accept the packet if the Verification Tag + * matches either its own tag, OR the tag of its peer. Otherwise, the + * receiver MUST silently discard the packet and take no further + * action. + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_9_1_abort(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + + if (!sctp_vtag_verify_either(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the ABORT chunk has a valid length. + * Since this is an ABORT chunk, we have to discard it + * because of the following text: + * RFC 2960, Section 3.3.7 + * If an endpoint receives an ABORT with a format error or for an + * association that doesn't exist, it MUST silently discard it. + * Because the length is "invalid", we can't really discard just + * as we do not know its true length. So, to be safe, discard the + * packet. + */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t))) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* ADD-IP: Special case for ABORT chunks + * F4) One special consideration is that ABORT Chunks arriving + * destined to the IP address being deleted MUST be + * ignored (see Section 5.3.1 for further details). + */ + if (SCTP_ADDR_DEL == + sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) + return sctp_sf_discard_chunk(ep, asoc, type, arg, commands); + + return __sctp_sf_do_9_1_abort(ep, asoc, type, arg, commands); +} + +static sctp_disposition_t __sctp_sf_do_9_1_abort(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + unsigned len; + __be16 error = SCTP_ERROR_NO_ERROR; + + /* See if we have an error cause code in the chunk. */ + len = ntohs(chunk->chunk_hdr->length); + if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr)) { + + sctp_errhdr_t *err; + sctp_walk_errors(err, chunk->chunk_hdr); + if ((void *)err != (void *)chunk->chunk_end) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + error = ((sctp_errhdr_t *)chunk->skb->data)->cause; + } + + sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(ECONNRESET)); + /* ASSOC_FAILED will DELETE_TCB. */ + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, SCTP_PERR(error)); + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + + return SCTP_DISPOSITION_ABORT; +} + +/* + * Process an ABORT. (COOKIE-WAIT state) + * + * See sctp_sf_do_9_1_abort() above. 
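Illustrative aside (not part of the patch): the length test in __sctp_sf_do_9_1_abort() above, and again in the COOKIE-WAIT abort handler below, only reads a cause code once the ABORT is at least one chunk header plus one error-cause header long. A standalone sketch of that minimum, using struct names of my own (the kernel uses sctp_chunkhdr_t and sctp_errhdr_t):

#include <stdint.h>
#include <stdio.h>

/* Both wire headers are 4 bytes, so an ABORT must be at least 8 bytes
 * before its first error cause may be inspected.
 */
struct chunk_hdr { uint8_t type; uint8_t flags; uint16_t length; };
struct err_hdr   { uint16_t cause; uint16_t length; };

int main(void)
{
	size_t min_abort_with_cause =
		sizeof(struct chunk_hdr) + sizeof(struct err_hdr);
	printf("minimum ABORT length carrying an error cause: %zu bytes\n",
	       min_abort_with_cause);
	return 0;
}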
+ */ +sctp_disposition_t sctp_sf_cookie_wait_abort(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + unsigned len; + __be16 error = SCTP_ERROR_NO_ERROR; + + if (!sctp_vtag_verify_either(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the ABORT chunk has a valid length. + * Since this is an ABORT chunk, we have to discard it + * because of the following text: + * RFC 2960, Section 3.3.7 + * If an endpoint receives an ABORT with a format error or for an + * association that doesn't exist, it MUST silently discard it. + * Because the length is "invalid", we can't really discard just + * as we do not know its true length. So, to be safe, discard the + * packet. + */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t))) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* See if we have an error cause code in the chunk. */ + len = ntohs(chunk->chunk_hdr->length); + if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr)) + error = ((sctp_errhdr_t *)chunk->skb->data)->cause; + + return sctp_stop_t1_and_abort(commands, error, ECONNREFUSED, asoc, + chunk->transport); +} + +/* + * Process an incoming ICMP as an ABORT. (COOKIE-WAIT state) + */ +sctp_disposition_t sctp_sf_cookie_wait_icmp_abort(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + return sctp_stop_t1_and_abort(commands, SCTP_ERROR_NO_ERROR, + ENOPROTOOPT, asoc, + (struct sctp_transport *)arg); +} + +/* + * Process an ABORT. (COOKIE-ECHOED state) + */ +sctp_disposition_t sctp_sf_cookie_echoed_abort(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + /* There is a single T1 timer, so we should be able to use + * common function with the COOKIE-WAIT state. + */ + return sctp_sf_cookie_wait_abort(ep, asoc, type, arg, commands); +} + +/* + * Stop T1 timer and abort association with "INIT failed". + * + * This is common code called by several sctp_sf_*_abort() functions above. + */ +static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands, + __be16 error, int sk_err, + const struct sctp_association *asoc, + struct sctp_transport *transport) +{ + SCTP_DEBUG_PRINTK("ABORT received (INIT).\n"); + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_CLOSED)); + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); + sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(sk_err)); + /* CMD_INIT_FAILED will DELETE_TCB. */ + sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED, + SCTP_PERR(error)); + return SCTP_DISPOSITION_ABORT; +} + +/* + * sctp_sf_do_9_2_shut + * + * Section: 9.2 + * Upon the reception of the SHUTDOWN, the peer endpoint shall + * - enter the SHUTDOWN-RECEIVED state, + * + * - stop accepting new data from its SCTP user + * + * - verify, by checking the Cumulative TSN Ack field of the chunk, + * that all its outstanding DATA chunks have been received by the + * SHUTDOWN sender. + * + * Once an endpoint as reached the SHUTDOWN-RECEIVED state it MUST NOT + * send a SHUTDOWN in response to a ULP request. And should discard + * subsequent SHUTDOWN chunks. 
+ * + * If there are still outstanding DATA chunks left, the SHUTDOWN + * receiver shall continue to follow normal data transmission + * procedures defined in Section 6 until all outstanding DATA chunks + * are acknowledged; however, the SHUTDOWN receiver MUST NOT accept + * new data from its SCTP user. + * + * Verification Tag: 8.5 Verification Tag [Normal verification] + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_9_2_shutdown(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + sctp_shutdownhdr_t *sdh; + sctp_disposition_t disposition; + struct sctp_ulpevent *ev; + __u32 ctsn; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the SHUTDOWN chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, + sizeof(struct sctp_shutdown_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + /* Convert the elaborate header. */ + sdh = (sctp_shutdownhdr_t *)chunk->skb->data; + skb_pull(chunk->skb, sizeof(sctp_shutdownhdr_t)); + chunk->subh.shutdown_hdr = sdh; + ctsn = ntohl(sdh->cum_tsn_ack); + + if (TSN_lt(ctsn, asoc->ctsn_ack_point)) { + SCTP_DEBUG_PRINTK("ctsn %x\n", ctsn); + SCTP_DEBUG_PRINTK("ctsn_ack_point %x\n", asoc->ctsn_ack_point); + return SCTP_DISPOSITION_DISCARD; + } + + /* If Cumulative TSN Ack beyond the max tsn currently + * send, terminating the association and respond to the + * sender with an ABORT. + */ + if (!TSN_lt(ctsn, asoc->next_tsn)) + return sctp_sf_violation_ctsn(ep, asoc, type, arg, commands); + + /* API 5.3.1.5 SCTP_SHUTDOWN_EVENT + * When a peer sends a SHUTDOWN, SCTP delivers this notification to + * inform the application that it should cease sending data. + */ + ev = sctp_ulpevent_make_shutdown_event(asoc, 0, GFP_ATOMIC); + if (!ev) { + disposition = SCTP_DISPOSITION_NOMEM; + goto out; + } + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); + + /* Upon the reception of the SHUTDOWN, the peer endpoint shall + * - enter the SHUTDOWN-RECEIVED state, + * - stop accepting new data from its SCTP user + * + * [This is implicit in the new state.] + */ + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_SHUTDOWN_RECEIVED)); + disposition = SCTP_DISPOSITION_CONSUME; + + if (sctp_outq_is_empty(&asoc->outqueue)) { + disposition = sctp_sf_do_9_2_shutdown_ack(ep, asoc, type, + arg, commands); + } + + if (SCTP_DISPOSITION_NOMEM == disposition) + goto out; + + /* - verify, by checking the Cumulative TSN Ack field of the + * chunk, that all its outstanding DATA chunks have been + * received by the SHUTDOWN sender. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_CTSN, + SCTP_BE32(chunk->subh.shutdown_hdr->cum_tsn_ack)); + +out: + return disposition; +} + +/* + * sctp_sf_do_9_2_shut_ctsn + * + * Once an endpoint has reached the SHUTDOWN-RECEIVED state, + * it MUST NOT send a SHUTDOWN in response to a ULP request. + * The Cumulative TSN Ack of the received SHUTDOWN chunk + * MUST be processed. 
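Illustrative aside (not part of the patch): the Cumulative TSN Ack checks in sctp_sf_do_9_2_shutdown() above and sctp_sf_do_9_2_shut_ctsn() below depend on TSN comparisons that stay correct across 32-bit wrap-around. A self-contained sketch of the signed-difference idiom the kernel's TSN_lt()/TSN_lte() macros use (function names here are mine):

#include <stdint.h>
#include <stdio.h>

/* Compare 32-bit TSNs via a signed difference so ordering survives the
 * wrap of the TSN space; re-implemented here only for illustration.
 */
static int tsn_lt(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

static int tsn_lte(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) <= 0;
}

int main(void)
{
	printf("%d\n", tsn_lt(5, 10));           /* 1: plainly older          */
	printf("%d\n", tsn_lt(0xfffffffeu, 3));  /* 1: older across the wrap  */
	printf("%d\n", tsn_lt(10, 5));           /* 0: newer, not discarded   */
	printf("%d\n", tsn_lte(7, 7));           /* 1: equal counts as "lte"  */
	return 0;
}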
+ */ +sctp_disposition_t sctp_sf_do_9_2_shut_ctsn(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + sctp_shutdownhdr_t *sdh; + __u32 ctsn; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the SHUTDOWN chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, + sizeof(struct sctp_shutdown_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + sdh = (sctp_shutdownhdr_t *)chunk->skb->data; + ctsn = ntohl(sdh->cum_tsn_ack); + + if (TSN_lt(ctsn, asoc->ctsn_ack_point)) { + SCTP_DEBUG_PRINTK("ctsn %x\n", ctsn); + SCTP_DEBUG_PRINTK("ctsn_ack_point %x\n", asoc->ctsn_ack_point); + return SCTP_DISPOSITION_DISCARD; + } + + /* If Cumulative TSN Ack beyond the max tsn currently + * send, terminating the association and respond to the + * sender with an ABORT. + */ + if (!TSN_lt(ctsn, asoc->next_tsn)) + return sctp_sf_violation_ctsn(ep, asoc, type, arg, commands); + + /* verify, by checking the Cumulative TSN Ack field of the + * chunk, that all its outstanding DATA chunks have been + * received by the SHUTDOWN sender. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_CTSN, + SCTP_BE32(sdh->cum_tsn_ack)); + + return SCTP_DISPOSITION_CONSUME; +} + +/* RFC 2960 9.2 + * If an endpoint is in SHUTDOWN-ACK-SENT state and receives an INIT chunk + * (e.g., if the SHUTDOWN COMPLETE was lost) with source and destination + * transport addresses (either in the IP addresses or in the INIT chunk) + * that belong to this association, it should discard the INIT chunk and + * retransmit the SHUTDOWN ACK chunk. + */ +sctp_disposition_t sctp_sf_do_9_2_reshutack(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = (struct sctp_chunk *) arg; + struct sctp_chunk *reply; + + /* Make sure that the chunk has a valid length */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + /* Since we are not going to really process this INIT, there + * is no point in verifying chunk boundries. Just generate + * the SHUTDOWN ACK. + */ + reply = sctp_make_shutdown_ack(asoc, chunk); + if (NULL == reply) + goto nomem; + + /* Set the transport for the SHUTDOWN ACK chunk and the timeout for + * the T2-SHUTDOWN timer. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T2, SCTP_CHUNK(reply)); + + /* and restart the T2-shutdown timer. */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply)); + + return SCTP_DISPOSITION_CONSUME; +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * sctp_sf_do_ecn_cwr + * + * Section: Appendix A: Explicit Congestion Notification + * + * CWR: + * + * RFC 2481 details a specific bit for a sender to send in the header of + * its next outbound TCP segment to indicate to its peer that it has + * reduced its congestion window. This is termed the CWR bit. For + * SCTP the same indication is made by including the CWR chunk. + * This chunk contains one data element, i.e. the TSN number that + * was sent in the ECNE chunk. This element represents the lowest + * TSN number in the datagram that was originally marked with the + * CE bit. 
+ * + * Verification Tag: 8.5 Verification Tag [Normal verification] + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_ecn_cwr(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + sctp_cwrhdr_t *cwr; + struct sctp_chunk *chunk = arg; + u32 lowest_tsn; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_ecne_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + cwr = (sctp_cwrhdr_t *) chunk->skb->data; + skb_pull(chunk->skb, sizeof(sctp_cwrhdr_t)); + + lowest_tsn = ntohl(cwr->lowest_tsn); + + /* Does this CWR ack the last sent congestion notification? */ + if (TSN_lte(asoc->last_ecne_tsn, lowest_tsn)) { + /* Stop sending ECNE. */ + sctp_add_cmd_sf(commands, + SCTP_CMD_ECN_CWR, + SCTP_U32(lowest_tsn)); + } + return SCTP_DISPOSITION_CONSUME; +} + +/* + * sctp_sf_do_ecne + * + * Section: Appendix A: Explicit Congestion Notification + * + * ECN-Echo + * + * RFC 2481 details a specific bit for a receiver to send back in its + * TCP acknowledgements to notify the sender of the Congestion + * Experienced (CE) bit having arrived from the network. For SCTP this + * same indication is made by including the ECNE chunk. This chunk + * contains one data element, i.e. the lowest TSN associated with the IP + * datagram marked with the CE bit..... + * + * Verification Tag: 8.5 Verification Tag [Normal verification] + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_ecne(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + sctp_ecnehdr_t *ecne; + struct sctp_chunk *chunk = arg; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_ecne_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + ecne = (sctp_ecnehdr_t *) chunk->skb->data; + skb_pull(chunk->skb, sizeof(sctp_ecnehdr_t)); + + /* If this is a newer ECNE than the last CWR packet we sent out */ + sctp_add_cmd_sf(commands, SCTP_CMD_ECN_ECNE, + SCTP_U32(ntohl(ecne->lowest_tsn))); + + return SCTP_DISPOSITION_CONSUME; +} + +/* + * Section: 6.2 Acknowledgement on Reception of DATA Chunks + * + * The SCTP endpoint MUST always acknowledge the reception of each valid + * DATA chunk. + * + * The guidelines on delayed acknowledgement algorithm specified in + * Section 4.2 of [RFC2581] SHOULD be followed. Specifically, an + * acknowledgement SHOULD be generated for at least every second packet + * (not every second DATA chunk) received, and SHOULD be generated within + * 200 ms of the arrival of any unacknowledged DATA chunk. In some + * situations it may be beneficial for an SCTP transmitter to be more + * conservative than the algorithms detailed in this document allow. + * However, an SCTP transmitter MUST NOT be more aggressive than the + * following algorithms allow. 
+ * + * A SCTP receiver MUST NOT generate more than one SACK for every + * incoming packet, other than to update the offered window as the + * receiving application consumes new data. + * + * Verification Tag: 8.5 Verification Tag [Normal verification] + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_eat_data_6_2(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + sctp_arg_t force = SCTP_NOFORCE(); + int error; + + if (!sctp_vtag_verify(chunk, asoc)) { + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG, + SCTP_NULL()); + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + } + + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_data_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + error = sctp_eat_data(asoc, chunk, commands ); + switch (error) { + case SCTP_IERROR_NO_ERROR: + break; + case SCTP_IERROR_HIGH_TSN: + case SCTP_IERROR_BAD_STREAM: + SCTP_INC_STATS(SCTP_MIB_IN_DATA_CHUNK_DISCARDS); + goto discard_noforce; + case SCTP_IERROR_DUP_TSN: + case SCTP_IERROR_IGNORE_TSN: + SCTP_INC_STATS(SCTP_MIB_IN_DATA_CHUNK_DISCARDS); + goto discard_force; + case SCTP_IERROR_NO_DATA: + goto consume; + case SCTP_IERROR_PROTO_VIOLATION: + return sctp_sf_abort_violation(ep, asoc, chunk, commands, + (u8 *)chunk->subh.data_hdr, sizeof(sctp_datahdr_t)); + default: + BUG(); + } + + if (chunk->chunk_hdr->flags & SCTP_DATA_SACK_IMM) + force = SCTP_FORCE(); + + if (asoc->autoclose) { + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); + } + + /* If this is the last chunk in a packet, we need to count it + * toward sack generation. Note that we need to SACK every + * OTHER packet containing data chunks, EVEN IF WE DISCARD + * THEM. We elect to NOT generate SACK's if the chunk fails + * the verification tag test. + * + * RFC 2960 6.2 Acknowledgement on Reception of DATA Chunks + * + * The SCTP endpoint MUST always acknowledge the reception of + * each valid DATA chunk. + * + * The guidelines on delayed acknowledgement algorithm + * specified in Section 4.2 of [RFC2581] SHOULD be followed. + * Specifically, an acknowledgement SHOULD be generated for at + * least every second packet (not every second DATA chunk) + * received, and SHOULD be generated within 200 ms of the + * arrival of any unacknowledged DATA chunk. In some + * situations it may be beneficial for an SCTP transmitter to + * be more conservative than the algorithms detailed in this + * document allow. However, an SCTP transmitter MUST NOT be + * more aggressive than the following algorithms allow. + */ + if (chunk->end_of_packet) + sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, force); + + return SCTP_DISPOSITION_CONSUME; + +discard_force: + /* RFC 2960 6.2 Acknowledgement on Reception of DATA Chunks + * + * When a packet arrives with duplicate DATA chunk(s) and with + * no new DATA chunk(s), the endpoint MUST immediately send a + * SACK with no delay. If a packet arrives with duplicate + * DATA chunk(s) bundled with new DATA chunks, the endpoint + * MAY immediately send a SACK. Normally receipt of duplicate + * DATA chunks will occur when the original SACK chunk was lost + * and the peer's RTO has expired. The duplicate TSN number(s) + * SHOULD be reported in the SACK as duplicate. 
+ */ + /* In our case, we split the MAY SACK advice up whether or not + * the last chunk is a duplicate.' + */ + if (chunk->end_of_packet) + sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_FORCE()); + return SCTP_DISPOSITION_DISCARD; + +discard_noforce: + if (chunk->end_of_packet) + sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, force); + + return SCTP_DISPOSITION_DISCARD; +consume: + return SCTP_DISPOSITION_CONSUME; + +} + +/* + * sctp_sf_eat_data_fast_4_4 + * + * Section: 4 (4) + * (4) In SHUTDOWN-SENT state the endpoint MUST acknowledge any received + * DATA chunks without delay. + * + * Verification Tag: 8.5 Verification Tag [Normal verification] + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_eat_data_fast_4_4(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + int error; + + if (!sctp_vtag_verify(chunk, asoc)) { + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG, + SCTP_NULL()); + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + } + + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_data_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + error = sctp_eat_data(asoc, chunk, commands ); + switch (error) { + case SCTP_IERROR_NO_ERROR: + case SCTP_IERROR_HIGH_TSN: + case SCTP_IERROR_DUP_TSN: + case SCTP_IERROR_IGNORE_TSN: + case SCTP_IERROR_BAD_STREAM: + break; + case SCTP_IERROR_NO_DATA: + goto consume; + case SCTP_IERROR_PROTO_VIOLATION: + return sctp_sf_abort_violation(ep, asoc, chunk, commands, + (u8 *)chunk->subh.data_hdr, sizeof(sctp_datahdr_t)); + default: + BUG(); + } + + /* Go a head and force a SACK, since we are shutting down. */ + + /* Implementor's Guide. + * + * While in SHUTDOWN-SENT state, the SHUTDOWN sender MUST immediately + * respond to each received packet containing one or more DATA chunk(s) + * with a SACK, a SHUTDOWN chunk, and restart the T2-shutdown timer + */ + if (chunk->end_of_packet) { + /* We must delay the chunk creation since the cumulative + * TSN has not been updated yet. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SHUTDOWN, SCTP_NULL()); + sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_FORCE()); + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); + } + +consume: + return SCTP_DISPOSITION_CONSUME; +} + +/* + * Section: 6.2 Processing a Received SACK + * D) Any time a SACK arrives, the endpoint performs the following: + * + * i) If Cumulative TSN Ack is less than the Cumulative TSN Ack Point, + * then drop the SACK. Since Cumulative TSN Ack is monotonically + * increasing, a SACK whose Cumulative TSN Ack is less than the + * Cumulative TSN Ack Point indicates an out-of-order SACK. + * + * ii) Set rwnd equal to the newly received a_rwnd minus the number + * of bytes still outstanding after processing the Cumulative TSN Ack + * and the Gap Ack Blocks. 
+ * + * iii) If the SACK is missing a TSN that was previously + * acknowledged via a Gap Ack Block (e.g., the data receiver + * reneged on the data), then mark the corresponding DATA chunk + * as available for retransmit: Mark it as missing for fast + * retransmit as described in Section 7.2.4 and if no retransmit + * timer is running for the destination address to which the DATA + * chunk was originally transmitted, then T3-rtx is started for + * that destination address. + * + * Verification Tag: 8.5 Verification Tag [Normal verification] + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_eat_sack_6_2(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + sctp_sackhdr_t *sackh; + __u32 ctsn; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the SACK chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_sack_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + /* Pull the SACK chunk from the data buffer */ + sackh = sctp_sm_pull_sack(chunk); + /* Was this a bogus SACK? */ + if (!sackh) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + chunk->subh.sack_hdr = sackh; + ctsn = ntohl(sackh->cum_tsn_ack); + + /* i) If Cumulative TSN Ack is less than the Cumulative TSN + * Ack Point, then drop the SACK. Since Cumulative TSN + * Ack is monotonically increasing, a SACK whose + * Cumulative TSN Ack is less than the Cumulative TSN Ack + * Point indicates an out-of-order SACK. + */ + if (TSN_lt(ctsn, asoc->ctsn_ack_point)) { + SCTP_DEBUG_PRINTK("ctsn %x\n", ctsn); + SCTP_DEBUG_PRINTK("ctsn_ack_point %x\n", asoc->ctsn_ack_point); + return SCTP_DISPOSITION_DISCARD; + } + + /* If Cumulative TSN Ack beyond the max tsn currently + * send, terminating the association and respond to the + * sender with an ABORT. + */ + if (!TSN_lt(ctsn, asoc->next_tsn)) + return sctp_sf_violation_ctsn(ep, asoc, type, arg, commands); + + /* Return this SACK for further processing. */ + sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_SACK, SCTP_SACKH(sackh)); + + /* Note: We do the rest of the work on the PROCESS_SACK + * sideeffect. + */ + return SCTP_DISPOSITION_CONSUME; +} + +/* + * Generate an ABORT in response to a packet. + * + * Section: 8.4 Handle "Out of the blue" Packets, sctpimpguide 2.41 + * + * 8) The receiver should respond to the sender of the OOTB packet with + * an ABORT. When sending the ABORT, the receiver of the OOTB packet + * MUST fill in the Verification Tag field of the outbound packet + * with the value found in the Verification Tag field of the OOTB + * packet and set the T-bit in the Chunk Flags to indicate that the + * Verification Tag is reflected. After sending this ABORT, the + * receiver of the OOTB packet shall discard the OOTB packet and take + * no further action. + * + * Verification Tag: + * + * The return value is the disposition of the chunk. 
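Illustrative aside (not part of the patch): rule 8) above amounts to copying the offending packet's Verification Tag into the ABORT and setting the T bit of the chunk flags to mark the tag as reflected. A hypothetical, simplified sketch of that shape; field and helper names are mine, and the function that follows does the real work via sctp_make_abort(), sctp_test_T_bit() and sctp_ootb_pkt_new():

#include <stdint.h>
#include <stdio.h>

/* Chunk type 6 (ABORT) and T-bit flag 0x01 per RFC 2960. */
struct ootb_abort {
	uint32_t vtag;        /* reflected from the offending packet   */
	uint8_t  chunk_type;  /* 6 == ABORT                            */
	uint8_t  chunk_flags; /* T bit: Verification Tag is reflected  */
};

static struct ootb_abort make_ootb_abort(uint32_t ootb_vtag)
{
	struct ootb_abort a = { .vtag = ootb_vtag,
				.chunk_type = 6,
				.chunk_flags = 0x01 };
	return a;
}

int main(void)
{
	struct ootb_abort a = make_ootb_abort(0xdeadbeefu);
	printf("vtag=%08x type=%u flags=%02x\n",
	       (unsigned)a.vtag, (unsigned)a.chunk_type,
	       (unsigned)a.chunk_flags);
	return 0;
}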
+*/ +static sctp_disposition_t sctp_sf_tabort_8_4_8(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_packet *packet = NULL; + struct sctp_chunk *chunk = arg; + struct sctp_chunk *abort; + + packet = sctp_ootb_pkt_new(asoc, chunk); + + if (packet) { + /* Make an ABORT. The T bit will be set if the asoc + * is NULL. + */ + abort = sctp_make_abort(asoc, chunk, 0); + if (!abort) { + sctp_ootb_pkt_free(packet); + return SCTP_DISPOSITION_NOMEM; + } + + /* Reflect vtag if T-Bit is set */ + if (sctp_test_T_bit(abort)) + packet->vtag = ntohl(chunk->sctp_hdr->vtag); + + /* Set the skb to the belonging sock for accounting. */ + abort->skb->sk = ep->base.sk; + + sctp_packet_append_chunk(packet, abort); + + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, + SCTP_PACKET(packet)); + + SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); + + sctp_sf_pdiscard(ep, asoc, type, arg, commands); + return SCTP_DISPOSITION_CONSUME; + } + + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Received an ERROR chunk from peer. Generate SCTP_REMOTE_ERROR + * event as ULP notification for each cause included in the chunk. + * + * API 5.3.1.3 - SCTP_REMOTE_ERROR + * + * The return value is the disposition of the chunk. +*/ +sctp_disposition_t sctp_sf_operr_notify(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + sctp_errhdr_t *err; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the ERROR chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_operr_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + sctp_walk_errors(err, chunk->chunk_hdr); + if ((void *)err != (void *)chunk->chunk_end) + return sctp_sf_violation_paramlen(ep, asoc, type, arg, + (void *)err, commands); + + sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_OPERR, + SCTP_CHUNK(chunk)); + + return SCTP_DISPOSITION_CONSUME; +} + +/* + * Process an inbound SHUTDOWN ACK. + * + * From Section 9.2: + * Upon the receipt of the SHUTDOWN ACK, the SHUTDOWN sender shall + * stop the T2-shutdown timer, send a SHUTDOWN COMPLETE chunk to its + * peer, and remove all record of the association. + * + * The return value is the disposition. + */ +sctp_disposition_t sctp_sf_do_9_2_final(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + struct sctp_chunk *reply; + struct sctp_ulpevent *ev; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the SHUTDOWN_ACK chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + /* 10.2 H) SHUTDOWN COMPLETE notification + * + * When SCTP completes the shutdown procedures (section 9.2) this + * notification is passed to the upper layer. 
+ */ + ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_SHUTDOWN_COMP, + 0, 0, 0, NULL, GFP_ATOMIC); + if (!ev) + goto nomem; + + /* ...send a SHUTDOWN COMPLETE chunk to its peer, */ + reply = sctp_make_shutdown_complete(asoc, chunk); + if (!reply) + goto nomem_chunk; + + /* Do all the commands now (after allocation), so that we + * have consistent state if memory allocation failes + */ + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); + + /* Upon the receipt of the SHUTDOWN ACK, the SHUTDOWN sender shall + * stop the T2-shutdown timer, + */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); + + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD)); + + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_CLOSED)); + SCTP_INC_STATS(SCTP_MIB_SHUTDOWNS); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply)); + + /* ...and remove all record of the association. */ + sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); + return SCTP_DISPOSITION_DELETE_TCB; + +nomem_chunk: + sctp_ulpevent_free(ev); +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * RFC 2960, 8.4 - Handle "Out of the blue" Packets, sctpimpguide 2.41. + * + * 5) If the packet contains a SHUTDOWN ACK chunk, the receiver should + * respond to the sender of the OOTB packet with a SHUTDOWN COMPLETE. + * When sending the SHUTDOWN COMPLETE, the receiver of the OOTB + * packet must fill in the Verification Tag field of the outbound + * packet with the Verification Tag received in the SHUTDOWN ACK and + * set the T-bit in the Chunk Flags to indicate that the Verification + * Tag is reflected. + * + * 8) The receiver should respond to the sender of the OOTB packet with + * an ABORT. When sending the ABORT, the receiver of the OOTB packet + * MUST fill in the Verification Tag field of the outbound packet + * with the value found in the Verification Tag field of the OOTB + * packet and set the T-bit in the Chunk Flags to indicate that the + * Verification Tag is reflected. After sending this ABORT, the + * receiver of the OOTB packet shall discard the OOTB packet and take + * no further action. + */ +sctp_disposition_t sctp_sf_ootb(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + struct sk_buff *skb = chunk->skb; + sctp_chunkhdr_t *ch; + sctp_errhdr_t *err; + __u8 *ch_end; + int ootb_shut_ack = 0; + int ootb_cookie_ack = 0; + + SCTP_INC_STATS(SCTP_MIB_OUTOFBLUES); + + ch = (sctp_chunkhdr_t *) chunk->chunk_hdr; + do { + /* Report violation if the chunk is less then minimal */ + if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t)) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + /* Now that we know we at least have a chunk header, + * do things that are type appropriate. + */ + if (SCTP_CID_SHUTDOWN_ACK == ch->type) + ootb_shut_ack = 1; + + /* RFC 2960, Section 3.3.7 + * Moreover, under any circumstances, an endpoint that + * receives an ABORT MUST NOT respond to that ABORT by + * sending an ABORT of its own. + */ + if (SCTP_CID_ABORT == ch->type) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* RFC 8.4, 7) If the packet contains a "Stale cookie" ERROR + * or a COOKIE ACK the SCTP Packet should be silently + * discarded. 
+ */ + + if (SCTP_CID_COOKIE_ACK == ch->type) + ootb_cookie_ack = 1; + + if (SCTP_CID_ERROR == ch->type) { + sctp_walk_errors(err, ch) { + if (SCTP_ERROR_STALE_COOKIE == err->cause) { + ootb_cookie_ack = 1; + break; + } + } + } + + /* Report violation if chunk len overflows */ + ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); + if (ch_end > skb_tail_pointer(skb)) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + ch = (sctp_chunkhdr_t *) ch_end; + } while (ch_end < skb_tail_pointer(skb)); + + if (ootb_shut_ack) + return sctp_sf_shut_8_4_5(ep, asoc, type, arg, commands); + else if (ootb_cookie_ack) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + else + return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, commands); +} + +/* + * Handle an "Out of the blue" SHUTDOWN ACK. + * + * Section: 8.4 5, sctpimpguide 2.41. + * + * 5) If the packet contains a SHUTDOWN ACK chunk, the receiver should + * respond to the sender of the OOTB packet with a SHUTDOWN COMPLETE. + * When sending the SHUTDOWN COMPLETE, the receiver of the OOTB + * packet must fill in the Verification Tag field of the outbound + * packet with the Verification Tag received in the SHUTDOWN ACK and + * set the T-bit in the Chunk Flags to indicate that the Verification + * Tag is reflected. + * + * Inputs + * (endpoint, asoc, type, arg, commands) + * + * Outputs + * (sctp_disposition_t) + * + * The return value is the disposition of the chunk. + */ +static sctp_disposition_t sctp_sf_shut_8_4_5(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_packet *packet = NULL; + struct sctp_chunk *chunk = arg; + struct sctp_chunk *shut; + + packet = sctp_ootb_pkt_new(asoc, chunk); + + if (packet) { + /* Make an SHUTDOWN_COMPLETE. + * The T bit will be set if the asoc is NULL. + */ + shut = sctp_make_shutdown_complete(asoc, chunk); + if (!shut) { + sctp_ootb_pkt_free(packet); + return SCTP_DISPOSITION_NOMEM; + } + + /* Reflect vtag if T-Bit is set */ + if (sctp_test_T_bit(shut)) + packet->vtag = ntohl(chunk->sctp_hdr->vtag); + + /* Set the skb to the belonging sock for accounting. */ + shut->skb->sk = ep->base.sk; + + sctp_packet_append_chunk(packet, shut); + + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, + SCTP_PACKET(packet)); + + SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); + + /* If the chunk length is invalid, we don't want to process + * the reset of the packet. + */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t))) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* We need to discard the rest of the packet to prevent + * potential bomming attacks from additional bundled chunks. + * This is documented in SCTP Threats ID. + */ + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + } + + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Handle SHUTDOWN ACK in COOKIE_ECHOED or COOKIE_WAIT state. + * + * Verification Tag: 8.5.1 E) Rules for packet carrying a SHUTDOWN ACK + * If the receiver is in COOKIE-ECHOED or COOKIE-WAIT state the + * procedures in section 8.4 SHOULD be followed, in other words it + * should be treated as an Out Of The Blue packet. + * [This means that we do NOT check the Verification Tag on these + * chunks. 
--piggy ] + * + */ +sctp_disposition_t sctp_sf_do_8_5_1_E_sa(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + + /* Make sure that the SHUTDOWN_ACK chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + /* Although we do have an association in this case, it corresponds + * to a restarted association. So the packet is treated as an OOTB + * packet and the state function that handles OOTB SHUTDOWN_ACK is + * called with a NULL association. + */ + SCTP_INC_STATS(SCTP_MIB_OUTOFBLUES); + + return sctp_sf_shut_8_4_5(ep, NULL, type, arg, commands); +} + +/* ADDIP Section 4.2 Upon reception of an ASCONF Chunk. */ +sctp_disposition_t sctp_sf_do_asconf(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + struct sctp_chunk *asconf_ack = NULL; + struct sctp_paramhdr *err_param = NULL; + sctp_addiphdr_t *hdr; + union sctp_addr_param *addr_param; + __u32 serial; + int length; + + if (!sctp_vtag_verify(chunk, asoc)) { + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG, + SCTP_NULL()); + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + } + + /* ADD-IP: Section 4.1.1 + * This chunk MUST be sent in an authenticated way by using + * the mechanism defined in [I-D.ietf-tsvwg-sctp-auth]. If this chunk + * is received unauthenticated it MUST be silently discarded as + * described in [I-D.ietf-tsvwg-sctp-auth]. + */ + if (!sctp_addip_noauth && !chunk->auth) + return sctp_sf_discard_chunk(ep, asoc, type, arg, commands); + + /* Make sure that the ASCONF ADDIP chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_addip_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + hdr = (sctp_addiphdr_t *)chunk->skb->data; + serial = ntohl(hdr->serial); + + addr_param = (union sctp_addr_param *)hdr->params; + length = ntohs(addr_param->p.length); + if (length < sizeof(sctp_paramhdr_t)) + return sctp_sf_violation_paramlen(ep, asoc, type, arg, + (void *)addr_param, commands); + + /* Verify the ASCONF chunk before processing it. */ + if (!sctp_verify_asconf(asoc, + (sctp_paramhdr_t *)((void *)addr_param + length), + (void *)chunk->chunk_end, + &err_param)) + return sctp_sf_violation_paramlen(ep, asoc, type, arg, + (void *)err_param, commands); + + /* ADDIP 5.2 E1) Compare the value of the serial number to the value + * the endpoint stored in a new association variable + * 'Peer-Serial-Number'. + */ + if (serial == asoc->peer.addip_serial + 1) { + /* If this is the first instance of ASCONF in the packet, + * we can clean our old ASCONF-ACKs. + */ + if (!chunk->has_asconf) + sctp_assoc_clean_asconf_ack_cache(asoc); + + /* ADDIP 5.2 E4) When the Sequence Number matches the next one + * expected, process the ASCONF as described below and after + * processing the ASCONF Chunk, append an ASCONF-ACK Chunk to + * the response packet and cache a copy of it (in the event it + * later needs to be retransmitted). + * + * Essentially, do V1-V5. 
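Illustrative aside (not part of the patch): around this point sctp_sf_do_asconf() dispatches on the received serial number following ADDIP 5.2 rules E2, E4 and E5. A compact sketch of that three-way decision, using my own enum and helper names and the same plain comparisons as the function (wrap-around is only treated specially on the ASCONF-ACK side, via ADDIP_SERIAL_gte()):

#include <stdint.h>
#include <stdio.h>

enum asconf_action { ASCONF_PROCESS, ASCONF_REPLAY_CACHED_ACK, ASCONF_DISCARD };

static enum asconf_action classify_asconf(uint32_t rcvd, uint32_t peer_serial)
{
	if (rcvd == peer_serial + 1)
		return ASCONF_PROCESS;           /* E4: the expected next serial */
	if (rcvd < peer_serial + 1)
		return ASCONF_REPLAY_CACHED_ACK; /* E2: old serial, resend ACK  */
	return ASCONF_DISCARD;                   /* E5: ahead of schedule, drop */
}

int main(void)
{
	uint32_t peer_serial = 41;
	printf("%d %d %d\n",
	       classify_asconf(42, peer_serial),   /* process              */
	       classify_asconf(40, peer_serial),   /* replay cached ACK    */
	       classify_asconf(100, peer_serial)); /* discard              */
	return 0;
}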
+ */ + asconf_ack = sctp_process_asconf((struct sctp_association *) + asoc, chunk); + if (!asconf_ack) + return SCTP_DISPOSITION_NOMEM; + } else if (serial < asoc->peer.addip_serial + 1) { + /* ADDIP 5.2 E2) + * If the value found in the Sequence Number is less than the + * ('Peer- Sequence-Number' + 1), simply skip to the next + * ASCONF, and include in the outbound response packet + * any previously cached ASCONF-ACK response that was + * sent and saved that matches the Sequence Number of the + * ASCONF. Note: It is possible that no cached ASCONF-ACK + * Chunk exists. This will occur when an older ASCONF + * arrives out of order. In such a case, the receiver + * should skip the ASCONF Chunk and not include ASCONF-ACK + * Chunk for that chunk. + */ + asconf_ack = sctp_assoc_lookup_asconf_ack(asoc, hdr->serial); + if (!asconf_ack) + return SCTP_DISPOSITION_DISCARD; + + /* Reset the transport so that we select the correct one + * this time around. This is to make sure that we don't + * accidentally use a stale transport that's been removed. + */ + asconf_ack->transport = NULL; + } else { + /* ADDIP 5.2 E5) Otherwise, the ASCONF Chunk is discarded since + * it must be either a stale packet or from an attacker. + */ + return SCTP_DISPOSITION_DISCARD; + } + + /* ADDIP 5.2 E6) The destination address of the SCTP packet + * containing the ASCONF-ACK Chunks MUST be the source address of + * the SCTP packet that held the ASCONF Chunks. + * + * To do this properly, we'll set the destination address of the chunk + * and at the transmit time, will try look up the transport to use. + * Since ASCONFs may be bundled, the correct transport may not be + * created until we process the entire packet, thus this workaround. + */ + asconf_ack->dest = chunk->source; + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(asconf_ack)); + + return SCTP_DISPOSITION_CONSUME; +} + +/* + * ADDIP Section 4.3 General rules for address manipulation + * When building TLV parameters for the ASCONF Chunk that will add or + * delete IP addresses the D0 to D13 rules should be applied: + */ +sctp_disposition_t sctp_sf_do_asconf_ack(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *asconf_ack = arg; + struct sctp_chunk *last_asconf = asoc->addip_last_asconf; + struct sctp_chunk *abort; + struct sctp_paramhdr *err_param = NULL; + sctp_addiphdr_t *addip_hdr; + __u32 sent_serial, rcvd_serial; + + if (!sctp_vtag_verify(asconf_ack, asoc)) { + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG, + SCTP_NULL()); + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + } + + /* ADD-IP, Section 4.1.2: + * This chunk MUST be sent in an authenticated way by using + * the mechanism defined in [I-D.ietf-tsvwg-sctp-auth]. If this chunk + * is received unauthenticated it MUST be silently discarded as + * described in [I-D.ietf-tsvwg-sctp-auth]. + */ + if (!sctp_addip_noauth && !asconf_ack->auth) + return sctp_sf_discard_chunk(ep, asoc, type, arg, commands); + + /* Make sure that the ADDIP chunk has a valid length. */ + if (!sctp_chunk_length_valid(asconf_ack, sizeof(sctp_addip_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + addip_hdr = (sctp_addiphdr_t *)asconf_ack->skb->data; + rcvd_serial = ntohl(addip_hdr->serial); + + /* Verify the ASCONF-ACK chunk before processing it. 
*/ + if (!sctp_verify_asconf(asoc, + (sctp_paramhdr_t *)addip_hdr->params, + (void *)asconf_ack->chunk_end, + &err_param)) + return sctp_sf_violation_paramlen(ep, asoc, type, arg, + (void *)err_param, commands); + + if (last_asconf) { + addip_hdr = (sctp_addiphdr_t *)last_asconf->subh.addip_hdr; + sent_serial = ntohl(addip_hdr->serial); + } else { + sent_serial = asoc->addip_serial - 1; + } + + /* D0) If an endpoint receives an ASCONF-ACK that is greater than or + * equal to the next serial number to be used but no ASCONF chunk is + * outstanding the endpoint MUST ABORT the association. Note that a + * sequence number is greater than if it is no more than 2^^31-1 + * larger than the current sequence number (using serial arithmetic). + */ + if (ADDIP_SERIAL_gte(rcvd_serial, sent_serial + 1) && + !(asoc->addip_last_asconf)) { + abort = sctp_make_abort(asoc, asconf_ack, + sizeof(sctp_errhdr_t)); + if (abort) { + sctp_init_cause(abort, SCTP_ERROR_ASCONF_ACK, 0); + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(abort)); + } + /* We are going to ABORT, so we might as well stop + * processing the rest of the chunks in the packet. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); + sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET,SCTP_NULL()); + sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, + SCTP_ERROR(ECONNABORTED)); + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_PERR(SCTP_ERROR_ASCONF_ACK)); + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + return SCTP_DISPOSITION_ABORT; + } + + if ((rcvd_serial == sent_serial) && asoc->addip_last_asconf) { + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); + + if (!sctp_process_asconf_ack((struct sctp_association *)asoc, + asconf_ack)) { + /* Successfully processed ASCONF_ACK. We can + * release the next asconf if we have one. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_NEXT_ASCONF, + SCTP_NULL()); + return SCTP_DISPOSITION_CONSUME; + } + + abort = sctp_make_abort(asoc, asconf_ack, + sizeof(sctp_errhdr_t)); + if (abort) { + sctp_init_cause(abort, SCTP_ERROR_RSRC_LOW, 0); + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(abort)); + } + /* We are going to ABORT, so we might as well stop + * processing the rest of the chunks in the packet. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET,SCTP_NULL()); + sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, + SCTP_ERROR(ECONNABORTED)); + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_PERR(SCTP_ERROR_ASCONF_ACK)); + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + return SCTP_DISPOSITION_ABORT; + } + + return SCTP_DISPOSITION_DISCARD; +} + +/* + * PR-SCTP Section 3.6 Receiver Side Implementation of PR-SCTP + * + * When a FORWARD TSN chunk arrives, the data receiver MUST first update + * its cumulative TSN point to the value carried in the FORWARD TSN + * chunk, and then MUST further advance its cumulative TSN point locally + * if possible. + * After the above processing, the data receiver MUST stop reporting any + * missing TSNs earlier than or equal to the new cumulative TSN point. + * + * Verification Tag: 8.5 Verification Tag [Normal verification] + * + * The return value is the disposition of the chunk. 
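The two handlers below walk a FORWARD TSN chunk, whose body is simply the new cumulative TSN followed by optional (stream, SSN) skip entries. For orientation, a sketch of that wire layout per RFC 3758, Section 3.2; the struct names are illustrative stand-ins for the kernel's own sctp_fwdtsn_hdr/sctp_fwdtsn_skip definitions:

    #include <stdint.h>

    /* FORWARD TSN chunk body (RFC 3758, Section 3.2). All fields are
     * big-endian on the wire, hence the ntohl()/ntohs() conversions in
     * the handlers below. Names here are illustrative only.
     */
    struct fwdtsn_skip_sketch {
        uint16_t stream;    /* stream whose SSNs are being skipped */
        uint16_t ssn;       /* largest skipped SSN in that stream  */
    } __attribute__((packed));

    struct fwdtsn_body_sketch {
        uint32_t new_cum_tsn;              /* new cumulative TSN point  */
        struct fwdtsn_skip_sketch skip[];  /* zero or more skip entries */
    } __attribute__((packed));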
+ */ +sctp_disposition_t sctp_sf_eat_fwd_tsn(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + struct sctp_fwdtsn_hdr *fwdtsn_hdr; + struct sctp_fwdtsn_skip *skip; + __u16 len; + __u32 tsn; + + if (!sctp_vtag_verify(chunk, asoc)) { + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG, + SCTP_NULL()); + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + } + + /* Make sure that the FORWARD_TSN chunk has valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + fwdtsn_hdr = (struct sctp_fwdtsn_hdr *)chunk->skb->data; + chunk->subh.fwdtsn_hdr = fwdtsn_hdr; + len = ntohs(chunk->chunk_hdr->length); + len -= sizeof(struct sctp_chunkhdr); + skb_pull(chunk->skb, len); + + tsn = ntohl(fwdtsn_hdr->new_cum_tsn); + SCTP_DEBUG_PRINTK("%s: TSN 0x%x.\n", __func__, tsn); + + /* The TSN is too high--silently discard the chunk and count on it + * getting retransmitted later. + */ + if (sctp_tsnmap_check(&asoc->peer.tsn_map, tsn) < 0) + goto discard_noforce; + + /* Silently discard the chunk if stream-id is not valid */ + sctp_walk_fwdtsn(skip, chunk) { + if (ntohs(skip->stream) >= asoc->c.sinit_max_instreams) + goto discard_noforce; + } + + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_FWDTSN, SCTP_U32(tsn)); + if (len > sizeof(struct sctp_fwdtsn_hdr)) + sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_FWDTSN, + SCTP_CHUNK(chunk)); + + /* Count this as receiving DATA. */ + if (asoc->autoclose) { + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); + } + + /* FIXME: For now send a SACK, but DATA processing may + * send another. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); + + return SCTP_DISPOSITION_CONSUME; + +discard_noforce: + return SCTP_DISPOSITION_DISCARD; +} + +sctp_disposition_t sctp_sf_eat_fwd_tsn_fast( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + struct sctp_fwdtsn_hdr *fwdtsn_hdr; + struct sctp_fwdtsn_skip *skip; + __u16 len; + __u32 tsn; + + if (!sctp_vtag_verify(chunk, asoc)) { + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG, + SCTP_NULL()); + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + } + + /* Make sure that the FORWARD_TSN chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + fwdtsn_hdr = (struct sctp_fwdtsn_hdr *)chunk->skb->data; + chunk->subh.fwdtsn_hdr = fwdtsn_hdr; + len = ntohs(chunk->chunk_hdr->length); + len -= sizeof(struct sctp_chunkhdr); + skb_pull(chunk->skb, len); + + tsn = ntohl(fwdtsn_hdr->new_cum_tsn); + SCTP_DEBUG_PRINTK("%s: TSN 0x%x.\n", __func__, tsn); + + /* The TSN is too high--silently discard the chunk and count on it + * getting retransmitted later. 
+ */ + if (sctp_tsnmap_check(&asoc->peer.tsn_map, tsn) < 0) + goto gen_shutdown; + + /* Silently discard the chunk if stream-id is not valid */ + sctp_walk_fwdtsn(skip, chunk) { + if (ntohs(skip->stream) >= asoc->c.sinit_max_instreams) + goto gen_shutdown; + } + + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_FWDTSN, SCTP_U32(tsn)); + if (len > sizeof(struct sctp_fwdtsn_hdr)) + sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_FWDTSN, + SCTP_CHUNK(chunk)); + + /* Go a head and force a SACK, since we are shutting down. */ +gen_shutdown: + /* Implementor's Guide. + * + * While in SHUTDOWN-SENT state, the SHUTDOWN sender MUST immediately + * respond to each received packet containing one or more DATA chunk(s) + * with a SACK, a SHUTDOWN chunk, and restart the T2-shutdown timer + */ + sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SHUTDOWN, SCTP_NULL()); + sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_FORCE()); + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); + + return SCTP_DISPOSITION_CONSUME; +} + +/* + * SCTP-AUTH Section 6.3 Receiving authenticated chukns + * + * The receiver MUST use the HMAC algorithm indicated in the HMAC + * Identifier field. If this algorithm was not specified by the + * receiver in the HMAC-ALGO parameter in the INIT or INIT-ACK chunk + * during association setup, the AUTH chunk and all chunks after it MUST + * be discarded and an ERROR chunk SHOULD be sent with the error cause + * defined in Section 4.1. + * + * If an endpoint with no shared key receives a Shared Key Identifier + * other than 0, it MUST silently discard all authenticated chunks. If + * the endpoint has at least one endpoint pair shared key for the peer, + * it MUST use the key specified by the Shared Key Identifier if a + * key has been configured for that Shared Key Identifier. If no + * endpoint pair shared key has been configured for that Shared Key + * Identifier, all authenticated chunks MUST be silently discarded. + * + * Verification Tag: 8.5 Verification Tag [Normal verification] + * + * The return value is the disposition of the chunk. + */ +static sctp_ierror_t sctp_sf_authenticate(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + struct sctp_chunk *chunk) +{ + struct sctp_authhdr *auth_hdr; + struct sctp_hmac *hmac; + unsigned int sig_len; + __u16 key_id; + __u8 *save_digest; + __u8 *digest; + + /* Pull in the auth header, so we can do some more verification */ + auth_hdr = (struct sctp_authhdr *)chunk->skb->data; + chunk->subh.auth_hdr = auth_hdr; + skb_pull(chunk->skb, sizeof(struct sctp_authhdr)); + + /* Make sure that we suport the HMAC algorithm from the auth + * chunk. + */ + if (!sctp_auth_asoc_verify_hmac_id(asoc, auth_hdr->hmac_id)) + return SCTP_IERROR_AUTH_BAD_HMAC; + + /* Make sure that the provided shared key identifier has been + * configured + */ + key_id = ntohs(auth_hdr->shkey_id); + if (key_id != asoc->active_key_id && !sctp_auth_get_shkey(asoc, key_id)) + return SCTP_IERROR_AUTH_BAD_KEYID; + + + /* Make sure that the length of the signature matches what + * we expect. + */ + sig_len = ntohs(chunk->chunk_hdr->length) - sizeof(sctp_auth_chunk_t); + hmac = sctp_auth_get_hmac(ntohs(auth_hdr->hmac_id)); + if (sig_len != hmac->hmac_len) + return SCTP_IERROR_PROTO_VIOLATION; + + /* Now that we've done validation checks, we can compute and + * verify the hmac. The steps involved are: + * 1. Save the digest from the chunk. + * 2. Zero out the digest in the chunk. + * 3. 
Compute the new digest + * 4. Compare saved and new digests. + */ + digest = auth_hdr->hmac; + skb_pull(chunk->skb, sig_len); + + save_digest = kmemdup(digest, sig_len, GFP_ATOMIC); + if (!save_digest) + goto nomem; + + memset(digest, 0, sig_len); + + sctp_auth_calculate_hmac(asoc, chunk->skb, + (struct sctp_auth_chunk *)chunk->chunk_hdr, + GFP_ATOMIC); + + /* Discard the packet if the digests do not match */ + if (memcmp(save_digest, digest, sig_len)) { + kfree(save_digest); + return SCTP_IERROR_BAD_SIG; + } + + kfree(save_digest); + chunk->auth = 1; + + return SCTP_IERROR_NO_ERROR; +nomem: + return SCTP_IERROR_NOMEM; +} + +sctp_disposition_t sctp_sf_eat_auth(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_authhdr *auth_hdr; + struct sctp_chunk *chunk = arg; + struct sctp_chunk *err_chunk; + sctp_ierror_t error; + + /* Make sure that the peer has AUTH capable */ + if (!asoc->peer.auth_capable) + return sctp_sf_unk_chunk(ep, asoc, type, arg, commands); + + if (!sctp_vtag_verify(chunk, asoc)) { + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG, + SCTP_NULL()); + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + } + + /* Make sure that the AUTH chunk has valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_auth_chunk))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + auth_hdr = (struct sctp_authhdr *)chunk->skb->data; + error = sctp_sf_authenticate(ep, asoc, type, chunk); + switch (error) { + case SCTP_IERROR_AUTH_BAD_HMAC: + /* Generate the ERROR chunk and discard the rest + * of the packet + */ + err_chunk = sctp_make_op_error(asoc, chunk, + SCTP_ERROR_UNSUP_HMAC, + &auth_hdr->hmac_id, + sizeof(__u16), 0); + if (err_chunk) { + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(err_chunk)); + } + /* Fall Through */ + case SCTP_IERROR_AUTH_BAD_KEYID: + case SCTP_IERROR_BAD_SIG: + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + break; + case SCTP_IERROR_PROTO_VIOLATION: + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + break; + case SCTP_IERROR_NOMEM: + return SCTP_DISPOSITION_NOMEM; + default: + break; + } + + if (asoc->active_key_id != ntohs(auth_hdr->shkey_id)) { + struct sctp_ulpevent *ev; + + ev = sctp_ulpevent_make_authkey(asoc, ntohs(auth_hdr->shkey_id), + SCTP_AUTH_NEWKEY, GFP_ATOMIC); + + if (!ev) + return -ENOMEM; + + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(ev)); + } + + return SCTP_DISPOSITION_CONSUME; +} + +/* + * Process an unknown chunk. + * + * Section: 3.2. Also, 2.1 in the implementor's guide. + * + * Chunk Types are encoded such that the highest-order two bits specify + * the action that must be taken if the processing endpoint does not + * recognize the Chunk Type. + * + * 00 - Stop processing this SCTP packet and discard it, do not process + * any further chunks within it. + * + * 01 - Stop processing this SCTP packet and discard it, do not process + * any further chunks within it, and report the unrecognized + * chunk in an 'Unrecognized Chunk Type'. + * + * 10 - Skip this chunk and continue processing. + * + * 11 - Skip this chunk and continue processing, but report in an ERROR + * Chunk using the 'Unrecognized Chunk Type' cause of error. + * + * The return value is the disposition of the chunk. 
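The switch in sctp_sf_unk_chunk() below implements exactly this table by masking the two highest-order bits of the chunk type (SCTP_CID_ACTION_MASK). The same decoding, as a small stand-alone illustration with hypothetical names in place of the kernel's SCTP_CID_ACTION_* constants:

    #include <stdint.h>

    /* RFC 4960, Section 3.2: the top two bits of an unrecognized chunk
     * type select what to do with it.
     */
    enum unk_chunk_action {
        UNK_DISCARD     = 0x00,  /* 00: stop processing, drop the packet    */
        UNK_DISCARD_ERR = 0x40,  /* 01: drop the packet and report in ERROR */
        UNK_SKIP        = 0x80,  /* 10: skip the chunk, keep processing     */
        UNK_SKIP_ERR    = 0xc0,  /* 11: skip the chunk and report in ERROR  */
    };

    static inline enum unk_chunk_action classify_unknown_chunk(uint8_t type)
    {
        return (enum unk_chunk_action)(type & 0xc0);
    }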
+ */ +sctp_disposition_t sctp_sf_unk_chunk(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *unk_chunk = arg; + struct sctp_chunk *err_chunk; + sctp_chunkhdr_t *hdr; + + SCTP_DEBUG_PRINTK("Processing the unknown chunk id %d.\n", type.chunk); + + if (!sctp_vtag_verify(unk_chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the chunk has a valid length. + * Since we don't know the chunk type, we use a general + * chunkhdr structure to make a comparison. + */ + if (!sctp_chunk_length_valid(unk_chunk, sizeof(sctp_chunkhdr_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + switch (type.chunk & SCTP_CID_ACTION_MASK) { + case SCTP_CID_ACTION_DISCARD: + /* Discard the packet. */ + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + break; + case SCTP_CID_ACTION_DISCARD_ERR: + /* Generate an ERROR chunk as response. */ + hdr = unk_chunk->chunk_hdr; + err_chunk = sctp_make_op_error(asoc, unk_chunk, + SCTP_ERROR_UNKNOWN_CHUNK, hdr, + WORD_ROUND(ntohs(hdr->length)), + 0); + if (err_chunk) { + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(err_chunk)); + } + + /* Discard the packet. */ + sctp_sf_pdiscard(ep, asoc, type, arg, commands); + return SCTP_DISPOSITION_CONSUME; + break; + case SCTP_CID_ACTION_SKIP: + /* Skip the chunk. */ + return SCTP_DISPOSITION_DISCARD; + break; + case SCTP_CID_ACTION_SKIP_ERR: + /* Generate an ERROR chunk as response. */ + hdr = unk_chunk->chunk_hdr; + err_chunk = sctp_make_op_error(asoc, unk_chunk, + SCTP_ERROR_UNKNOWN_CHUNK, hdr, + WORD_ROUND(ntohs(hdr->length)), + 0); + if (err_chunk) { + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(err_chunk)); + } + /* Skip the chunk. */ + return SCTP_DISPOSITION_CONSUME; + break; + default: + break; + } + + return SCTP_DISPOSITION_DISCARD; +} + +/* + * Discard the chunk. + * + * Section: 0.2, 5.2.3, 5.2.5, 5.2.6, 6.0, 8.4.6, 8.5.1c, 9.2 + * [Too numerous to mention...] + * Verification Tag: No verification needed. + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_discard_chunk(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + + /* Make sure that the chunk has a valid length. + * Since we don't know the chunk type, we use a general + * chunkhdr structure to make a comparison. + */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + SCTP_DEBUG_PRINTK("Chunk %d is discarded\n", type.chunk); + return SCTP_DISPOSITION_DISCARD; +} + +/* + * Discard the whole packet. + * + * Section: 8.4 2) + * + * 2) If the OOTB packet contains an ABORT chunk, the receiver MUST + * silently discard the OOTB packet and take no further action. + * + * Verification Tag: No verification necessary + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. 
+ */ +sctp_disposition_t sctp_sf_pdiscard(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + SCTP_INC_STATS(SCTP_MIB_IN_PKT_DISCARDS); + sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET, SCTP_NULL()); + + return SCTP_DISPOSITION_CONSUME; +} + + +/* + * The other end is violating protocol. + * + * Section: Not specified + * Verification Tag: Not specified + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * We simply tag the chunk as a violation. The state machine will log + * the violation and continue. + */ +sctp_disposition_t sctp_sf_violation(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + + /* Make sure that the chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + return SCTP_DISPOSITION_VIOLATION; +} + +/* + * Common function to handle a protocol violation. + */ +static sctp_disposition_t sctp_sf_abort_violation( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + void *arg, + sctp_cmd_seq_t *commands, + const __u8 *payload, + const size_t paylen) +{ + struct sctp_packet *packet = NULL; + struct sctp_chunk *chunk = arg; + struct sctp_chunk *abort = NULL; + + /* SCTP-AUTH, Section 6.3: + * It should be noted that if the receiver wants to tear + * down an association in an authenticated way only, the + * handling of malformed packets should not result in + * tearing down the association. + * + * This means that if we only want to abort associations + * in an authenticated way (i.e AUTH+ABORT), then we + * can't destroy this association just because the packet + * was malformed. + */ + if (sctp_auth_recv_cid(SCTP_CID_ABORT, asoc)) + goto discard; + + /* Make the abort chunk. */ + abort = sctp_make_abort_violation(asoc, chunk, payload, paylen); + if (!abort) + goto nomem; + + if (asoc) { + /* Treat INIT-ACK as a special case during COOKIE-WAIT. 
*/ + if (chunk->chunk_hdr->type == SCTP_CID_INIT_ACK && + !asoc->peer.i.init_tag) { + sctp_initack_chunk_t *initack; + + initack = (sctp_initack_chunk_t *)chunk->chunk_hdr; + if (!sctp_chunk_length_valid(chunk, + sizeof(sctp_initack_chunk_t))) + abort->chunk_hdr->flags |= SCTP_CHUNK_FLAG_T; + else { + unsigned int inittag; + + inittag = ntohl(initack->init_hdr.init_tag); + sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_INITTAG, + SCTP_U32(inittag)); + } + } + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); + SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); + + if (asoc->state <= SCTP_STATE_COOKIE_ECHOED) { + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); + sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, + SCTP_ERROR(ECONNREFUSED)); + sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED, + SCTP_PERR(SCTP_ERROR_PROTO_VIOLATION)); + } else { + sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, + SCTP_ERROR(ECONNABORTED)); + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_PERR(SCTP_ERROR_PROTO_VIOLATION)); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + } + } else { + packet = sctp_ootb_pkt_new(asoc, chunk); + + if (!packet) + goto nomem_pkt; + + if (sctp_test_T_bit(abort)) + packet->vtag = ntohl(chunk->sctp_hdr->vtag); + + abort->skb->sk = ep->base.sk; + + sctp_packet_append_chunk(packet, abort); + + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, + SCTP_PACKET(packet)); + + SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); + } + + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + +discard: + sctp_sf_pdiscard(ep, asoc, SCTP_ST_CHUNK(0), arg, commands); + return SCTP_DISPOSITION_ABORT; + +nomem_pkt: + sctp_chunk_free(abort); +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Handle a protocol violation when the chunk length is invalid. + * "Invalid" length is identified as smaller than the minimal length a + * given chunk can be. For example, a SACK chunk has invalid length + * if its length is set to be smaller than the size of sctp_sack_chunk_t. + * + * We inform the other end by sending an ABORT with a Protocol Violation + * error code. + * + * Section: Not specified + * Verification Tag: Nothing to do + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (reply_msg, msg_up, counters) + * + * Generate an ABORT chunk and terminate the association. + */ +static sctp_disposition_t sctp_sf_violation_chunklen( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + static const char err_str[]="The following chunk had invalid length:"; + + return sctp_sf_abort_violation(ep, asoc, arg, commands, err_str, + sizeof(err_str)); +} + +/* + * Handle a protocol violation when the parameter length is invalid. + * If the length is smaller than the minimum length of a given parameter, + * or accumulated length in multi parameters exceeds the end of the chunk, + * the length is considered as invalid. + */ +static sctp_disposition_t sctp_sf_violation_paramlen( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, void *ext, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + struct sctp_paramhdr *param = ext; + struct sctp_chunk *abort = NULL; + + if (sctp_auth_recv_cid(SCTP_CID_ABORT, asoc)) + goto discard; + + /* Make the abort chunk. 
*/ + abort = sctp_make_violation_paramlen(asoc, chunk, param); + if (!abort) + goto nomem; + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); + SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); + + sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, + SCTP_ERROR(ECONNABORTED)); + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_PERR(SCTP_ERROR_PROTO_VIOLATION)); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + +discard: + sctp_sf_pdiscard(ep, asoc, SCTP_ST_CHUNK(0), arg, commands); + return SCTP_DISPOSITION_ABORT; +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* Handle a protocol violation when the peer trying to advance the + * cumulative tsn ack to a point beyond the max tsn currently sent. + * + * We inform the other end by sending an ABORT with a Protocol Violation + * error code. + */ +static sctp_disposition_t sctp_sf_violation_ctsn( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + static const char err_str[]="The cumulative tsn ack beyond the max tsn currently sent:"; + + return sctp_sf_abort_violation(ep, asoc, arg, commands, err_str, + sizeof(err_str)); +} + +/* Handle protocol violation of an invalid chunk bundling. For example, + * when we have an association and we receive bundled INIT-ACK, or + * SHUDOWN-COMPLETE, our peer is clearly violationg the "MUST NOT bundle" + * statement from the specs. Additionally, there might be an attacker + * on the path and we may not want to continue this communication. + */ +static sctp_disposition_t sctp_sf_violation_chunk( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + static const char err_str[]="The following chunk violates protocol:"; + + if (!asoc) + return sctp_sf_violation(ep, asoc, type, arg, commands); + + return sctp_sf_abort_violation(ep, asoc, arg, commands, err_str, + sizeof(err_str)); +} +/*************************************************************************** + * These are the state functions for handling primitive (Section 10) events. + ***************************************************************************/ +/* + * sctp_sf_do_prm_asoc + * + * Section: 10.1 ULP-to-SCTP + * B) Associate + * + * Format: ASSOCIATE(local SCTP instance name, destination transport addr, + * outbound stream count) + * -> association id [,destination transport addr list] [,outbound stream + * count] + * + * This primitive allows the upper layer to initiate an association to a + * specific peer endpoint. + * + * The peer endpoint shall be specified by one of the transport addresses + * which defines the endpoint (see Section 1.4). If the local SCTP + * instance has not been initialized, the ASSOCIATE is considered an + * error. + * [This is not relevant for the kernel implementation since we do all + * initialization at boot time. It we hadn't initialized we wouldn't + * get anywhere near this code.] + * + * An association id, which is a local handle to the SCTP association, + * will be returned on successful establishment of the association. If + * SCTP is not able to open an SCTP association with the peer endpoint, + * an error is returned. + * [In the kernel implementation, the struct sctp_association needs to + * be created BEFORE causing this primitive to run.] 
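The remainder of this comment continues below; from user space, the ASSOCIATE primitive is what a connect() on an SCTP socket ultimately drives, with the association created before this state function runs. A hedged illustration of that ULP side using the standard sockets API (address and port are placeholders, error handling trimmed):

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <unistd.h>

    /* Sketch: issue ASSOCIATE from user space by connect()ing a
     * one-to-one style SCTP socket. 127.0.0.1:5000 is a placeholder.
     */
    static int sctp_associate_sketch(void)
    {
        struct sockaddr_in peer;
        int sd = socket(AF_INET, SOCK_STREAM, IPPROTO_SCTP);

        if (sd < 0)
            return -1;

        memset(&peer, 0, sizeof(peer));
        peer.sin_family = AF_INET;
        peer.sin_port = htons(5000);
        peer.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

        if (connect(sd, (struct sockaddr *)&peer, sizeof(peer)) < 0) {
            close(sd);
            return -1;
        }
        return sd;    /* INIT sent, association being established */
    }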
+ * + * Other association parameters may be returned, including the + * complete destination transport addresses of the peer as well as the + * outbound stream count of the local endpoint. One of the transport + * address from the returned destination addresses will be selected by + * the local endpoint as default primary path for sending SCTP packets + * to this peer. The returned "destination transport addr list" can + * be used by the ULP to change the default primary path or to force + * sending a packet to a specific transport address. [All of this + * stuff happens when the INIT ACK arrives. This is a NON-BLOCKING + * function.] + * + * Mandatory attributes: + * + * o local SCTP instance name - obtained from the INITIALIZE operation. + * [This is the argument asoc.] + * o destination transport addr - specified as one of the transport + * addresses of the peer endpoint with which the association is to be + * established. + * [This is asoc->peer.active_path.] + * o outbound stream count - the number of outbound streams the ULP + * would like to open towards this peer endpoint. + * [BUG: This is not currently implemented.] + * Optional attributes: + * + * None. + * + * The return value is a disposition. + */ +sctp_disposition_t sctp_sf_do_prm_asoc(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *repl; + struct sctp_association* my_asoc; + + /* The comment below says that we enter COOKIE-WAIT AFTER + * sending the INIT, but that doesn't actually work in our + * implementation... + */ + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_COOKIE_WAIT)); + + /* RFC 2960 5.1 Normal Establishment of an Association + * + * A) "A" first sends an INIT chunk to "Z". In the INIT, "A" + * must provide its Verification Tag (Tag_A) in the Initiate + * Tag field. Tag_A SHOULD be a random number in the range of + * 1 to 4294967295 (see 5.3.1 for Tag value selection). ... + */ + + repl = sctp_make_init(asoc, &asoc->base.bind_addr, GFP_ATOMIC, 0); + if (!repl) + goto nomem; + + /* Cast away the const modifier, as we want to just + * rerun it through as a sideffect. + */ + my_asoc = (struct sctp_association *)asoc; + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(my_asoc)); + + /* Choose transport for INIT. */ + sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT, + SCTP_CHUNK(repl)); + + /* After sending the INIT, "A" starts the T1-init timer and + * enters the COOKIE-WAIT state. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + return SCTP_DISPOSITION_CONSUME; + +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Process the SEND primitive. + * + * Section: 10.1 ULP-to-SCTP + * E) Send + * + * Format: SEND(association id, buffer address, byte count [,context] + * [,stream id] [,life time] [,destination transport address] + * [,unorder flag] [,no-bundle flag] [,payload protocol-id] ) + * -> result + * + * This is the main method to send user data via SCTP. 
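The SEND attributes are enumerated in the rest of this comment below. As a user-space illustration of how they surface, assuming the lksctp-tools library (-lsctp) is available, sctp_sendmsg() takes most of them directly; the values here are placeholders:

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <netinet/sctp.h>    /* lksctp-tools: sctp_sendmsg(), SCTP_UNORDERED */
    #include <stddef.h>

    /* Sketch: map the SEND primitive's attributes onto sctp_sendmsg().
     * sd is a connected one-to-one SCTP socket.
     */
    static int send_sketch(int sd, const char *buf, size_t len)
    {
        return sctp_sendmsg(sd, buf, len,
                            NULL, 0,         /* destination: the association's peer */
                            htonl(42),       /* payload protocol-id (opaque to SCTP) */
                            SCTP_UNORDERED,  /* unorder flag                         */
                            3,               /* stream id                            */
                            5000,            /* life time, milliseconds              */
                            0);              /* context                              */
    }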
+ * + * Mandatory attributes: + * + * o association id - local handle to the SCTP association + * + * o buffer address - the location where the user message to be + * transmitted is stored; + * + * o byte count - The size of the user data in number of bytes; + * + * Optional attributes: + * + * o context - an optional 32 bit integer that will be carried in the + * sending failure notification to the ULP if the transportation of + * this User Message fails. + * + * o stream id - to indicate which stream to send the data on. If not + * specified, stream 0 will be used. + * + * o life time - specifies the life time of the user data. The user data + * will not be sent by SCTP after the life time expires. This + * parameter can be used to avoid efforts to transmit stale + * user messages. SCTP notifies the ULP if the data cannot be + * initiated to transport (i.e. sent to the destination via SCTP's + * send primitive) within the life time variable. However, the + * user data will be transmitted if SCTP has attempted to transmit a + * chunk before the life time expired. + * + * o destination transport address - specified as one of the destination + * transport addresses of the peer endpoint to which this packet + * should be sent. Whenever possible, SCTP should use this destination + * transport address for sending the packets, instead of the current + * primary path. + * + * o unorder flag - this flag, if present, indicates that the user + * would like the data delivered in an unordered fashion to the peer + * (i.e., the U flag is set to 1 on all DATA chunks carrying this + * message). + * + * o no-bundle flag - instructs SCTP not to bundle this user data with + * other outbound DATA chunks. SCTP MAY still bundle even when + * this flag is present, when faced with network congestion. + * + * o payload protocol-id - A 32 bit unsigned integer that is to be + * passed to the peer indicating the type of payload protocol data + * being transmitted. This value is passed as opaque data by SCTP. + * + * The return value is the disposition. + */ +sctp_disposition_t sctp_sf_do_prm_send(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_datamsg *msg = arg; + + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_MSG, SCTP_DATAMSG(msg)); + return SCTP_DISPOSITION_CONSUME; +} + +/* + * Process the SHUTDOWN primitive. + * + * Section: 10.1: + * C) Shutdown + * + * Format: SHUTDOWN(association id) + * -> result + * + * Gracefully closes an association. Any locally queued user data + * will be delivered to the peer. The association will be terminated only + * after the peer acknowledges all the SCTP packets sent. A success code + * will be returned on successful termination of the association. If + * attempting to terminate the association results in a failure, an error + * code shall be returned. + * + * Mandatory attributes: + * + * o association id - local handle to the SCTP association + * + * Optional attributes: + * + * None. + * + * The return value is the disposition. 
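From the socket API side, this primitive is reached through an ordinary close() on a one-to-one SCTP socket, while an abortive close (the ABORT primitive handled next) can be requested with SO_LINGER and a zero linger time, per RFC 6458, Section 4.1.6. A brief sketch:

    #include <sys/socket.h>
    #include <unistd.h>

    /* Sketch: graceful vs. abortive close of a one-to-one SCTP socket. */
    static void close_gracefully(int sd)
    {
        close(sd);    /* SHUTDOWN: queued data is delivered before teardown */
    }

    static void close_abortively(int sd)
    {
        struct linger lg = { .l_onoff = 1, .l_linger = 0 };

        setsockopt(sd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
        close(sd);    /* ABORT: queued data is discarded */
    }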
+ */ +sctp_disposition_t sctp_sf_do_9_2_prm_shutdown( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + int disposition; + + /* From 9.2 Shutdown of an Association + * Upon receipt of the SHUTDOWN primitive from its upper + * layer, the endpoint enters SHUTDOWN-PENDING state and + * remains there until all outstanding data has been + * acknowledged by its peer. The endpoint accepts no new data + * from its upper layer, but retransmits data to the far end + * if necessary to fill gaps. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_SHUTDOWN_PENDING)); + + disposition = SCTP_DISPOSITION_CONSUME; + if (sctp_outq_is_empty(&asoc->outqueue)) { + disposition = sctp_sf_do_9_2_start_shutdown(ep, asoc, type, + arg, commands); + } + return disposition; +} + +/* + * Process the ABORT primitive. + * + * Section: 10.1: + * C) Abort + * + * Format: Abort(association id [, cause code]) + * -> result + * + * Ungracefully closes an association. Any locally queued user data + * will be discarded and an ABORT chunk is sent to the peer. A success code + * will be returned on successful abortion of the association. If + * attempting to abort the association results in a failure, an error + * code shall be returned. + * + * Mandatory attributes: + * + * o association id - local handle to the SCTP association + * + * Optional attributes: + * + * o cause code - reason of the abort to be passed to the peer + * + * None. + * + * The return value is the disposition. + */ +sctp_disposition_t sctp_sf_do_9_1_prm_abort( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + /* From 9.1 Abort of an Association + * Upon receipt of the ABORT primitive from its upper + * layer, the endpoint enters CLOSED state and + * discard all outstanding data has been + * acknowledged by its peer. The endpoint accepts no new data + * from its upper layer, but retransmits data to the far end + * if necessary to fill gaps. + */ + struct sctp_chunk *abort = arg; + sctp_disposition_t retval; + + retval = SCTP_DISPOSITION_CONSUME; + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); + + /* Even if we can't send the ABORT due to low memory delete the + * TCB. This is a departure from our typical NOMEM handling. + */ + + sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, + SCTP_ERROR(ECONNABORTED)); + /* Delete the established association. */ + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_PERR(SCTP_ERROR_USER_ABORT)); + + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + + return retval; +} + +/* We tried an illegal operation on an association which is closed. */ +sctp_disposition_t sctp_sf_error_closed(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_ERROR, SCTP_ERROR(-EINVAL)); + return SCTP_DISPOSITION_CONSUME; +} + +/* We tried an illegal operation on an association which is shutting + * down. 
+ */ +sctp_disposition_t sctp_sf_error_shutdown(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_ERROR, + SCTP_ERROR(-ESHUTDOWN)); + return SCTP_DISPOSITION_CONSUME; +} + +/* + * sctp_cookie_wait_prm_shutdown + * + * Section: 4 Note: 2 + * Verification Tag: + * Inputs + * (endpoint, asoc) + * + * The RFC does not explicitly address this issue, but is the route through the + * state table when someone issues a shutdown while in COOKIE_WAIT state. + * + * Outputs + * (timers) + */ +sctp_disposition_t sctp_sf_cookie_wait_prm_shutdown( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); + + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_CLOSED)); + + SCTP_INC_STATS(SCTP_MIB_SHUTDOWNS); + + sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); + + return SCTP_DISPOSITION_DELETE_TCB; +} + +/* + * sctp_cookie_echoed_prm_shutdown + * + * Section: 4 Note: 2 + * Verification Tag: + * Inputs + * (endpoint, asoc) + * + * The RFC does not explcitly address this issue, but is the route through the + * state table when someone issues a shutdown while in COOKIE_ECHOED state. + * + * Outputs + * (timers) + */ +sctp_disposition_t sctp_sf_cookie_echoed_prm_shutdown( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, sctp_cmd_seq_t *commands) +{ + /* There is a single T1 timer, so we should be able to use + * common function with the COOKIE-WAIT state. + */ + return sctp_sf_cookie_wait_prm_shutdown(ep, asoc, type, arg, commands); +} + +/* + * sctp_sf_cookie_wait_prm_abort + * + * Section: 4 Note: 2 + * Verification Tag: + * Inputs + * (endpoint, asoc) + * + * The RFC does not explicitly address this issue, but is the route through the + * state table when someone issues an abort while in COOKIE_WAIT state. + * + * Outputs + * (timers) + */ +sctp_disposition_t sctp_sf_cookie_wait_prm_abort( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *abort = arg; + sctp_disposition_t retval; + + /* Stop T1-init timer */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); + retval = SCTP_DISPOSITION_CONSUME; + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); + + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_CLOSED)); + + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + + /* Even if we can't send the ABORT due to low memory delete the + * TCB. This is a departure from our typical NOMEM handling. + */ + + sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, + SCTP_ERROR(ECONNREFUSED)); + /* Delete the established association. */ + sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED, + SCTP_PERR(SCTP_ERROR_USER_ABORT)); + + return retval; +} + +/* + * sctp_sf_cookie_echoed_prm_abort + * + * Section: 4 Note: 3 + * Verification Tag: + * Inputs + * (endpoint, asoc) + * + * The RFC does not explcitly address this issue, but is the route through the + * state table when someone issues an abort while in COOKIE_ECHOED state. 
+ * + * Outputs + * (timers) + */ +sctp_disposition_t sctp_sf_cookie_echoed_prm_abort( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + /* There is a single T1 timer, so we should be able to use + * common function with the COOKIE-WAIT state. + */ + return sctp_sf_cookie_wait_prm_abort(ep, asoc, type, arg, commands); +} + +/* + * sctp_sf_shutdown_pending_prm_abort + * + * Inputs + * (endpoint, asoc) + * + * The RFC does not explicitly address this issue, but is the route through the + * state table when someone issues an abort while in SHUTDOWN-PENDING state. + * + * Outputs + * (timers) + */ +sctp_disposition_t sctp_sf_shutdown_pending_prm_abort( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + /* Stop the T5-shutdown guard timer. */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD)); + + return sctp_sf_do_9_1_prm_abort(ep, asoc, type, arg, commands); +} + +/* + * sctp_sf_shutdown_sent_prm_abort + * + * Inputs + * (endpoint, asoc) + * + * The RFC does not explicitly address this issue, but is the route through the + * state table when someone issues an abort while in SHUTDOWN-SENT state. + * + * Outputs + * (timers) + */ +sctp_disposition_t sctp_sf_shutdown_sent_prm_abort( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + /* Stop the T2-shutdown timer. */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); + + /* Stop the T5-shutdown guard timer. */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD)); + + return sctp_sf_do_9_1_prm_abort(ep, asoc, type, arg, commands); +} + +/* + * sctp_sf_cookie_echoed_prm_abort + * + * Inputs + * (endpoint, asoc) + * + * The RFC does not explcitly address this issue, but is the route through the + * state table when someone issues an abort while in COOKIE_ECHOED state. + * + * Outputs + * (timers) + */ +sctp_disposition_t sctp_sf_shutdown_ack_sent_prm_abort( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + /* The same T2 timer, so we should be able to use + * common function with the SHUTDOWN-SENT state. + */ + return sctp_sf_shutdown_sent_prm_abort(ep, asoc, type, arg, commands); +} + +/* + * Process the REQUESTHEARTBEAT primitive + * + * 10.1 ULP-to-SCTP + * J) Request Heartbeat + * + * Format: REQUESTHEARTBEAT(association id, destination transport address) + * + * -> result + * + * Instructs the local endpoint to perform a HeartBeat on the specified + * destination transport address of the given association. The returned + * result should indicate whether the transmission of the HEARTBEAT + * chunk to the destination address is successful. + * + * Mandatory attributes: + * + * o association id - local handle to the SCTP association + * + * o destination transport address - the transport address of the + * association on which a heartbeat should be issued. 
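From user space this primitive is reached through the SCTP_PEER_ADDR_PARAMS socket option with the SPP_HB_DEMAND flag (RFC 6458, Section 8.1.12), which demands a single HEARTBEAT on the named peer address. An illustrative sketch, assuming the usual Linux SCTP headers:

    #include <netinet/in.h>
    #include <netinet/sctp.h>
    #include <string.h>
    #include <sys/socket.h>

    /* Sketch: request one on-demand HEARTBEAT toward a given peer
     * transport address of a one-to-one SCTP socket.
     */
    static int demand_heartbeat(int sd, const struct sockaddr *peer, socklen_t len)
    {
        struct sctp_paddrparams params;

        memset(&params, 0, sizeof(params));
        memcpy(&params.spp_address, peer, len);
        params.spp_flags = SPP_HB_DEMAND;

        return setsockopt(sd, IPPROTO_SCTP, SCTP_PEER_ADDR_PARAMS,
                          &params, sizeof(params));
    }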
+ */ +sctp_disposition_t sctp_sf_do_prm_requestheartbeat( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + if (SCTP_DISPOSITION_NOMEM == sctp_sf_heartbeat(ep, asoc, type, + (struct sctp_transport *)arg, commands)) + return SCTP_DISPOSITION_NOMEM; + + /* + * RFC 2960 (bis), section 8.3 + * + * D) Request an on-demand HEARTBEAT on a specific destination + * transport address of a given association. + * + * The endpoint should increment the respective error counter of + * the destination transport address each time a HEARTBEAT is sent + * to that address and not acknowledged within one RTO. + * + */ + sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_HB_SENT, + SCTP_TRANSPORT(arg)); + return SCTP_DISPOSITION_CONSUME; +} + +/* + * ADDIP Section 4.1 ASCONF Chunk Procedures + * When an endpoint has an ASCONF signaled change to be sent to the + * remote endpoint it should do A1 to A9 + */ +sctp_disposition_t sctp_sf_do_prm_asconf(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + + sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T4, SCTP_CHUNK(chunk)); + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START, + SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(chunk)); + return SCTP_DISPOSITION_CONSUME; +} + +/* + * Ignore the primitive event + * + * The return value is the disposition of the primitive. + */ +sctp_disposition_t sctp_sf_ignore_primitive( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + SCTP_DEBUG_PRINTK("Primitive type %d is ignored.\n", type.primitive); + return SCTP_DISPOSITION_DISCARD; +} + +/*************************************************************************** + * These are the state functions for the OTHER events. + ***************************************************************************/ + +/* + * When the SCTP stack has no more user data to send or retransmit, this + * notification is given to the user. Also, at the time when a user app + * subscribes to this event, if there is no data to be sent or + * retransmit, the stack will immediately send up this notification. + */ +sctp_disposition_t sctp_sf_do_no_pending_tsn( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_ulpevent *event; + + event = sctp_ulpevent_make_sender_dry_event(asoc, GFP_ATOMIC); + if (!event) + return SCTP_DISPOSITION_NOMEM; + + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(event)); + + return SCTP_DISPOSITION_CONSUME; +} + +/* + * Start the shutdown negotiation. + * + * From Section 9.2: + * Once all its outstanding data has been acknowledged, the endpoint + * shall send a SHUTDOWN chunk to its peer including in the Cumulative + * TSN Ack field the last sequential TSN it has received from the peer. + * It shall then start the T2-shutdown timer and enter the SHUTDOWN-SENT + * state. If the timer expires, the endpoint must re-send the SHUTDOWN + * with the updated last sequential TSN received from its peer. + * + * The return value is the disposition. 
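Before the shutdown machinery below, note that the sender-dry notification built in sctp_sf_do_no_pending_tsn() above only reaches an application that has subscribed to it. An illustrative sketch of that subscription via the SCTP_EVENTS socket option:

    #include <netinet/in.h>
    #include <netinet/sctp.h>
    #include <string.h>
    #include <sys/socket.h>

    /* Sketch: ask the kernel to deliver SCTP_SENDER_DRY_EVENT
     * notifications on this socket.
     */
    static int subscribe_sender_dry(int sd)
    {
        struct sctp_event_subscribe events;

        memset(&events, 0, sizeof(events));
        events.sctp_sender_dry_event = 1;

        return setsockopt(sd, IPPROTO_SCTP, SCTP_EVENTS,
                          &events, sizeof(events));
    }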
+ */ +sctp_disposition_t sctp_sf_do_9_2_start_shutdown( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *reply; + + /* Once all its outstanding data has been acknowledged, the + * endpoint shall send a SHUTDOWN chunk to its peer including + * in the Cumulative TSN Ack field the last sequential TSN it + * has received from the peer. + */ + reply = sctp_make_shutdown(asoc, NULL); + if (!reply) + goto nomem; + + /* Set the transport for the SHUTDOWN chunk and the timeout for the + * T2-shutdown timer. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T2, SCTP_CHUNK(reply)); + + /* It shall then start the T2-shutdown timer */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START, + SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); + + /* RFC 4960 Section 9.2 + * The sender of the SHUTDOWN MAY also start an overall guard timer + * 'T5-shutdown-guard' to bound the overall time for shutdown sequence. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD)); + + if (asoc->autoclose) + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); + + /* and enter the SHUTDOWN-SENT state. */ + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_SHUTDOWN_SENT)); + + /* sctp-implguide 2.10 Issues with Heartbeating and failover + * + * HEARTBEAT ... is discontinued after sending either SHUTDOWN + * or SHUTDOWN-ACK. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_STOP, SCTP_NULL()); + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply)); + + return SCTP_DISPOSITION_CONSUME; + +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Generate a SHUTDOWN ACK now that everything is SACK'd. + * + * From Section 9.2: + * + * If it has no more outstanding DATA chunks, the SHUTDOWN receiver + * shall send a SHUTDOWN ACK and start a T2-shutdown timer of its own, + * entering the SHUTDOWN-ACK-SENT state. If the timer expires, the + * endpoint must re-send the SHUTDOWN ACK. + * + * The return value is the disposition. + */ +sctp_disposition_t sctp_sf_do_9_2_shutdown_ack( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = (struct sctp_chunk *) arg; + struct sctp_chunk *reply; + + /* There are 2 ways of getting here: + * 1) called in response to a SHUTDOWN chunk + * 2) called when SCTP_EVENT_NO_PENDING_TSN event is issued. + * + * For the case (2), the arg parameter is set to NULL. We need + * to check that we have a chunk before accessing it's fields. + */ + if (chunk) { + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the SHUTDOWN chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_shutdown_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + } + + /* If it has no more outstanding DATA chunks, the SHUTDOWN receiver + * shall send a SHUTDOWN ACK ... + */ + reply = sctp_make_shutdown_ack(asoc, chunk); + if (!reply) + goto nomem; + + /* Set the transport for the SHUTDOWN ACK chunk and the timeout for + * the T2-shutdown timer. 
+ */ + sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T2, SCTP_CHUNK(reply)); + + /* and start/restart a T2-shutdown timer of its own, */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); + + if (asoc->autoclose) + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); + + /* Enter the SHUTDOWN-ACK-SENT state. */ + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_SHUTDOWN_ACK_SENT)); + + /* sctp-implguide 2.10 Issues with Heartbeating and failover + * + * HEARTBEAT ... is discontinued after sending either SHUTDOWN + * or SHUTDOWN-ACK. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_STOP, SCTP_NULL()); + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply)); + + return SCTP_DISPOSITION_CONSUME; + +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Ignore the event defined as other + * + * The return value is the disposition of the event. + */ +sctp_disposition_t sctp_sf_ignore_other(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + SCTP_DEBUG_PRINTK("The event other type %d is ignored\n", type.other); + return SCTP_DISPOSITION_DISCARD; +} + +/************************************************************ + * These are the state functions for handling timeout events. + ************************************************************/ + +/* + * RTX Timeout + * + * Section: 6.3.3 Handle T3-rtx Expiration + * + * Whenever the retransmission timer T3-rtx expires for a destination + * address, do the following: + * [See below] + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_6_3_3_rtx(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_transport *transport = arg; + + SCTP_INC_STATS(SCTP_MIB_T3_RTX_EXPIREDS); + + if (asoc->overall_error_count >= asoc->max_retrans) { + if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING) { + /* + * We are here likely because the receiver had its rwnd + * closed for a while and we have not been able to + * transmit the locally queued data within the maximum + * retransmission attempts limit. Start the T5 + * shutdown guard timer to give the receiver one last + * chance and some additional time to recover before + * aborting. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START_ONCE, + SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD)); + } else { + sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, + SCTP_ERROR(ETIMEDOUT)); + /* CMD_ASSOC_FAILED calls CMD_DELETE_TCB. */ + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_PERR(SCTP_ERROR_NO_ERROR)); + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + return SCTP_DISPOSITION_DELETE_TCB; + } + } + + /* E1) For the destination address for which the timer + * expires, adjust its ssthresh with rules defined in Section + * 7.2.3 and set the cwnd <- MTU. + */ + + /* E2) For the destination address for which the timer + * expires, set RTO <- RTO * 2 ("back off the timer"). The + * maximum value discussed in rule C7 above (RTO.max) may be + * used to provide an upper bound to this doubling operation. 
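Rule E3 continues in the comment below. As a plain-arithmetic illustration of E2 (not part of this patch; the names are hypothetical), the per-destination RTO is doubled on expiry and clamped by RTO.max:

    /* Sketch of rule E2: back off the retransmission timeout for the
     * destination whose timer expired, bounded by RTO.max. Values are
     * in whatever unit the caller keeps its timers in.
     */
    static unsigned long backoff_rto(unsigned long rto, unsigned long rto_max)
    {
        rto *= 2;
        if (rto > rto_max)
            rto = rto_max;
        return rto;
    }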
+ */ + + /* E3) Determine how many of the earliest (i.e., lowest TSN) + * outstanding DATA chunks for the address for which the + * T3-rtx has expired will fit into a single packet, subject + * to the MTU constraint for the path corresponding to the + * destination transport address to which the retransmission + * is being sent (this may be different from the address for + * which the timer expires [see Section 6.4]). Call this + * value K. Bundle and retransmit those K DATA chunks in a + * single packet to the destination endpoint. + * + * Note: Any DATA chunks that were sent to the address for + * which the T3-rtx timer expired but did not fit in one MTU + * (rule E3 above), should be marked for retransmission and + * sent as soon as cwnd allows (normally when a SACK arrives). + */ + + /* Do some failure management (Section 8.2). */ + sctp_add_cmd_sf(commands, SCTP_CMD_STRIKE, SCTP_TRANSPORT(transport)); + + /* NB: Rules E4 and F1 are implicit in R1. */ + sctp_add_cmd_sf(commands, SCTP_CMD_RETRAN, SCTP_TRANSPORT(transport)); + + return SCTP_DISPOSITION_CONSUME; +} + +/* + * Generate delayed SACK on timeout + * + * Section: 6.2 Acknowledgement on Reception of DATA Chunks + * + * The guidelines on delayed acknowledgement algorithm specified in + * Section 4.2 of [RFC2581] SHOULD be followed. Specifically, an + * acknowledgement SHOULD be generated for at least every second packet + * (not every second DATA chunk) received, and SHOULD be generated + * within 200 ms of the arrival of any unacknowledged DATA chunk. In + * some situations it may be beneficial for an SCTP transmitter to be + * more conservative than the algorithms detailed in this document + * allow. However, an SCTP transmitter MUST NOT be more aggressive than + * the following algorithms allow. + */ +sctp_disposition_t sctp_sf_do_6_2_sack(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + SCTP_INC_STATS(SCTP_MIB_DELAY_SACK_EXPIREDS); + sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_FORCE()); + return SCTP_DISPOSITION_CONSUME; +} + +/* + * sctp_sf_t1_init_timer_expire + * + * Section: 4 Note: 2 + * Verification Tag: + * Inputs + * (endpoint, asoc) + * + * RFC 2960 Section 4 Notes + * 2) If the T1-init timer expires, the endpoint MUST retransmit INIT + * and re-start the T1-init timer without changing state. This MUST + * be repeated up to 'Max.Init.Retransmits' times. After that, the + * endpoint MUST abort the initialization process and report the + * error to SCTP user. + * + * Outputs + * (timers, events) + * + */ +sctp_disposition_t sctp_sf_t1_init_timer_expire(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *repl = NULL; + struct sctp_bind_addr *bp; + int attempts = asoc->init_err_counter + 1; + + SCTP_DEBUG_PRINTK("Timer T1 expired (INIT).\n"); + SCTP_INC_STATS(SCTP_MIB_T1_INIT_EXPIREDS); + + if (attempts <= asoc->max_init_attempts) { + bp = (struct sctp_bind_addr *) &asoc->base.bind_addr; + repl = sctp_make_init(asoc, bp, GFP_ATOMIC, 0); + if (!repl) + return SCTP_DISPOSITION_NOMEM; + + /* Choose transport for INIT. */ + sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT, + SCTP_CHUNK(repl)); + + /* Issue a sideeffect to do the needed accounting. 
*/ + sctp_add_cmd_sf(commands, SCTP_CMD_INIT_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + } else { + SCTP_DEBUG_PRINTK("Giving up on INIT, attempts: %d" + " max_init_attempts: %d\n", + attempts, asoc->max_init_attempts); + sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, + SCTP_ERROR(ETIMEDOUT)); + sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED, + SCTP_PERR(SCTP_ERROR_NO_ERROR)); + return SCTP_DISPOSITION_DELETE_TCB; + } + + return SCTP_DISPOSITION_CONSUME; +} + +/* + * sctp_sf_t1_cookie_timer_expire + * + * Section: 4 Note: 2 + * Verification Tag: + * Inputs + * (endpoint, asoc) + * + * RFC 2960 Section 4 Notes + * 3) If the T1-cookie timer expires, the endpoint MUST retransmit + * COOKIE ECHO and re-start the T1-cookie timer without changing + * state. This MUST be repeated up to 'Max.Init.Retransmits' times. + * After that, the endpoint MUST abort the initialization process and + * report the error to SCTP user. + * + * Outputs + * (timers, events) + * + */ +sctp_disposition_t sctp_sf_t1_cookie_timer_expire(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *repl = NULL; + int attempts = asoc->init_err_counter + 1; + + SCTP_DEBUG_PRINTK("Timer T1 expired (COOKIE-ECHO).\n"); + SCTP_INC_STATS(SCTP_MIB_T1_COOKIE_EXPIREDS); + + if (attempts <= asoc->max_init_attempts) { + repl = sctp_make_cookie_echo(asoc, NULL); + if (!repl) + return SCTP_DISPOSITION_NOMEM; + + sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT, + SCTP_CHUNK(repl)); + /* Issue a sideeffect to do the needed accounting. */ + sctp_add_cmd_sf(commands, SCTP_CMD_COOKIEECHO_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE)); + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + } else { + sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, + SCTP_ERROR(ETIMEDOUT)); + sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED, + SCTP_PERR(SCTP_ERROR_NO_ERROR)); + return SCTP_DISPOSITION_DELETE_TCB; + } + + return SCTP_DISPOSITION_CONSUME; +} + +/* RFC2960 9.2 If the timer expires, the endpoint must re-send the SHUTDOWN + * with the updated last sequential TSN received from its peer. + * + * An endpoint should limit the number of retransmissions of the + * SHUTDOWN chunk to the protocol parameter 'Association.Max.Retrans'. + * If this threshold is exceeded the endpoint should destroy the TCB and + * MUST report the peer endpoint unreachable to the upper layer (and + * thus the association enters the CLOSED state). The reception of any + * packet from its peer (i.e. as the peer sends all of its queued DATA + * chunks) should clear the endpoint's retransmission count and restart + * the T2-Shutdown timer, giving its peer ample opportunity to transmit + * all of its queued DATA chunks that have not yet been sent. + */ +sctp_disposition_t sctp_sf_t2_timer_expire(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *reply = NULL; + + SCTP_DEBUG_PRINTK("Timer T2 expired.\n"); + SCTP_INC_STATS(SCTP_MIB_T2_SHUTDOWN_EXPIREDS); + + ((struct sctp_association *)asoc)->shutdown_retries++; + + if (asoc->overall_error_count >= asoc->max_retrans) { + sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, + SCTP_ERROR(ETIMEDOUT)); + /* Note: CMD_ASSOC_FAILED calls CMD_DELETE_TCB. 
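Both T1 handlers above keep retrying until init_err_counter exceeds max_init_attempts, and each retry waits on a timer that backs off between expiries (the sketch below assumes doubling bounded by RTO.Max; the exact cap used by the restart side effect lives elsewhere). A small standalone illustration of the resulting worst-case setup latency:

#include <stdio.h>

/* Worst-case time spent in COOKIE-WAIT/COOKIE-ECHOED before giving up:
 * every expiry costs one timer interval, and the interval doubles after
 * each expiry up to rto_max.
 */
static unsigned worst_case_setup_ms(unsigned rto_initial_ms,
				    unsigned rto_max_ms,
				    unsigned max_init_attempts)
{
	unsigned rto = rto_initial_ms, total = 0, i;

	/* One wait for the original send plus one per allowed retransmit. */
	for (i = 0; i <= max_init_attempts; i++) {
		total += rto;
		rto = 2 * rto < rto_max_ms ? 2 * rto : rto_max_ms;
	}
	return total;
}

int main(void)
{
	/* RFC 4960 suggested defaults: RTO.Initial 3 s, RTO.Max 60 s,
	 * Max.Init.Retransmits 8.
	 */
	printf("%u ms\n", worst_case_setup_ms(3000, 60000, 8));
	return 0;
}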
*/ + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_PERR(SCTP_ERROR_NO_ERROR)); + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + return SCTP_DISPOSITION_DELETE_TCB; + } + + switch (asoc->state) { + case SCTP_STATE_SHUTDOWN_SENT: + reply = sctp_make_shutdown(asoc, NULL); + break; + + case SCTP_STATE_SHUTDOWN_ACK_SENT: + reply = sctp_make_shutdown_ack(asoc, NULL); + break; + + default: + BUG(); + break; + } + + if (!reply) + goto nomem; + + /* Do some failure management (Section 8.2). + * If we remove the transport an SHUTDOWN was last sent to, don't + * do failure management. + */ + if (asoc->shutdown_last_sent_to) + sctp_add_cmd_sf(commands, SCTP_CMD_STRIKE, + SCTP_TRANSPORT(asoc->shutdown_last_sent_to)); + + /* Set the transport for the SHUTDOWN/ACK chunk and the timeout for + * the T2-shutdown timer. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T2, SCTP_CHUNK(reply)); + + /* Restart the T2-shutdown timer. */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply)); + return SCTP_DISPOSITION_CONSUME; + +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * ADDIP Section 4.1 ASCONF CHunk Procedures + * If the T4 RTO timer expires the endpoint should do B1 to B5 + */ +sctp_disposition_t sctp_sf_t4_timer_expire( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = asoc->addip_last_asconf; + struct sctp_transport *transport = chunk->transport; + + SCTP_INC_STATS(SCTP_MIB_T4_RTO_EXPIREDS); + + /* ADDIP 4.1 B1) Increment the error counters and perform path failure + * detection on the appropriate destination address as defined in + * RFC2960 [5] section 8.1 and 8.2. + */ + if (transport) + sctp_add_cmd_sf(commands, SCTP_CMD_STRIKE, + SCTP_TRANSPORT(transport)); + + /* Reconfig T4 timer and transport. */ + sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T4, SCTP_CHUNK(chunk)); + + /* ADDIP 4.1 B2) Increment the association error counters and perform + * endpoint failure detection on the association as defined in + * RFC2960 [5] section 8.1 and 8.2. + * association error counter is incremented in SCTP_CMD_STRIKE. + */ + if (asoc->overall_error_count >= asoc->max_retrans) { + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); + sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, + SCTP_ERROR(ETIMEDOUT)); + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_PERR(SCTP_ERROR_NO_ERROR)); + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + return SCTP_DISPOSITION_ABORT; + } + + /* ADDIP 4.1 B3) Back-off the destination address RTO value to which + * the ASCONF chunk was sent by doubling the RTO timer value. + * This is done in SCTP_CMD_STRIKE. + */ + + /* ADDIP 4.1 B4) Re-transmit the ASCONF Chunk last sent and if possible + * choose an alternate destination address (please refer to RFC2960 + * [5] section 6.4.1). An endpoint MUST NOT add new parameters to this + * chunk, it MUST be the same (including its serial number) as the last + * ASCONF sent. + */ + sctp_chunk_hold(asoc->addip_last_asconf); + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(asoc->addip_last_asconf)); + + /* ADDIP 4.1 B5) Restart the T-4 RTO timer. Note that if a different + * destination is selected, then the RTO used will be that of the new + * destination address. 
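Rule B4 above allows the retransmitted ASCONF to leave on a different destination address when one is available. A simplified sketch of that selection (a plain linked-list walk over made-up types; the real transport list and its active/inactive state are maintained elsewhere in the stack):

#include <stdbool.h>

struct peer_path {
	struct peer_path *next;
	bool active;		/* path currently considered reachable */
};

/* Prefer any other active path; fall back to the current one if it is
 * the only usable choice.
 */
static struct peer_path *pick_retransmit_path(struct peer_path *paths,
					      struct peer_path *current)
{
	struct peer_path *p;

	for (p = paths; p; p = p->next)
		if (p != current && p->active)
			return p;
	return current;
}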
+ */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); + + return SCTP_DISPOSITION_CONSUME; +} + +/* sctpimpguide-05 Section 2.12.2 + * The sender of the SHUTDOWN MAY also start an overall guard timer + * 'T5-shutdown-guard' to bound the overall time for shutdown sequence. + * At the expiration of this timer the sender SHOULD abort the association + * by sending an ABORT chunk. + */ +sctp_disposition_t sctp_sf_t5_timer_expire(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *reply = NULL; + + SCTP_DEBUG_PRINTK("Timer T5 expired.\n"); + SCTP_INC_STATS(SCTP_MIB_T5_SHUTDOWN_GUARD_EXPIREDS); + + reply = sctp_make_abort(asoc, NULL, 0); + if (!reply) + goto nomem; + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply)); + sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, + SCTP_ERROR(ETIMEDOUT)); + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_PERR(SCTP_ERROR_NO_ERROR)); + + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + + return SCTP_DISPOSITION_DELETE_TCB; +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* Handle expiration of AUTOCLOSE timer. When the autoclose timer expires, + * the association is automatically closed by starting the shutdown process. + * The work that needs to be done is same as when SHUTDOWN is initiated by + * the user. So this routine looks same as sctp_sf_do_9_2_prm_shutdown(). + */ +sctp_disposition_t sctp_sf_autoclose_timer_expire( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + int disposition; + + SCTP_INC_STATS(SCTP_MIB_AUTOCLOSE_EXPIREDS); + + /* From 9.2 Shutdown of an Association + * Upon receipt of the SHUTDOWN primitive from its upper + * layer, the endpoint enters SHUTDOWN-PENDING state and + * remains there until all outstanding data has been + * acknowledged by its peer. The endpoint accepts no new data + * from its upper layer, but retransmits data to the far end + * if necessary to fill gaps. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_SHUTDOWN_PENDING)); + + disposition = SCTP_DISPOSITION_CONSUME; + if (sctp_outq_is_empty(&asoc->outqueue)) { + disposition = sctp_sf_do_9_2_start_shutdown(ep, asoc, type, + arg, commands); + } + return disposition; +} + +/***************************************************************************** + * These are sa state functions which could apply to all types of events. + ****************************************************************************/ + +/* + * This table entry is not implemented. + * + * Inputs + * (endpoint, asoc, chunk) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_not_impl(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + return SCTP_DISPOSITION_NOT_IMPL; +} + +/* + * This table entry represents a bug. + * + * Inputs + * (endpoint, asoc, chunk) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_bug(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + return SCTP_DISPOSITION_BUG; +} + +/* + * This table entry represents the firing of a timer in the wrong state. 
+ * Since timer deletion cannot be guaranteed a timer 'may' end up firing + * when the association is in the wrong state. This event should + * be ignored, so as to prevent any rearming of the timer. + * + * Inputs + * (endpoint, asoc, chunk) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_timer_ignore(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + SCTP_DEBUG_PRINTK("Timer %d ignored.\n", type.chunk); + return SCTP_DISPOSITION_CONSUME; +} + +/******************************************************************** + * 2nd Level Abstractions + ********************************************************************/ + +/* Pull the SACK chunk based on the SACK header. */ +static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk) +{ + struct sctp_sackhdr *sack; + unsigned int len; + __u16 num_blocks; + __u16 num_dup_tsns; + + /* Protect ourselves from reading too far into + * the skb from a bogus sender. + */ + sack = (struct sctp_sackhdr *) chunk->skb->data; + + num_blocks = ntohs(sack->num_gap_ack_blocks); + num_dup_tsns = ntohs(sack->num_dup_tsns); + len = sizeof(struct sctp_sackhdr); + len += (num_blocks + num_dup_tsns) * sizeof(__u32); + if (len > chunk->skb->len) + return NULL; + + skb_pull(chunk->skb, len); + + return sack; +} + +/* Create an ABORT packet to be sent as a response, with the specified + * error causes. + */ +static struct sctp_packet *sctp_abort_pkt_new(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + struct sctp_chunk *chunk, + const void *payload, + size_t paylen) +{ + struct sctp_packet *packet; + struct sctp_chunk *abort; + + packet = sctp_ootb_pkt_new(asoc, chunk); + + if (packet) { + /* Make an ABORT. + * The T bit will be set if the asoc is NULL. + */ + abort = sctp_make_abort(asoc, chunk, paylen); + if (!abort) { + sctp_ootb_pkt_free(packet); + return NULL; + } + + /* Reflect vtag if T-Bit is set */ + if (sctp_test_T_bit(abort)) + packet->vtag = ntohl(chunk->sctp_hdr->vtag); + + /* Add specified error causes, i.e., payload, to the + * end of the chunk. + */ + sctp_addto_chunk(abort, paylen, payload); + + /* Set the skb to the belonging sock for accounting. */ + abort->skb->sk = ep->base.sk; + + sctp_packet_append_chunk(packet, abort); + + } + + return packet; +} + +/* Allocate a packet for responding in the OOTB conditions. */ +static struct sctp_packet *sctp_ootb_pkt_new(const struct sctp_association *asoc, + const struct sctp_chunk *chunk) +{ + struct sctp_packet *packet; + struct sctp_transport *transport; + __u16 sport; + __u16 dport; + __u32 vtag; + + /* Get the source and destination port from the inbound packet. */ + sport = ntohs(chunk->sctp_hdr->dest); + dport = ntohs(chunk->sctp_hdr->source); + + /* The V-tag is going to be the same as the inbound packet if no + * association exists, otherwise, use the peer's vtag. + */ + if (asoc) { + /* Special case the INIT-ACK as there is no peer's vtag + * yet. + */ + switch(chunk->chunk_hdr->type) { + case SCTP_CID_INIT_ACK: + { + sctp_initack_chunk_t *initack; + + initack = (sctp_initack_chunk_t *)chunk->chunk_hdr; + vtag = ntohl(initack->init_hdr.init_tag); + break; + } + default: + vtag = asoc->peer.i.init_tag; + break; + } + } else { + /* Special case the INIT and stale COOKIE_ECHO as there is no + * vtag yet. 
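The bounds check in sctp_sm_pull_sack above is just arithmetic on the advertised counts versus what actually arrived: a 12-byte fixed part (cumulative TSN ack, a_rwnd, and the two 16-bit counts), then four bytes per gap-ack block and per duplicate TSN. A standalone restatement of the same validation:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Fixed part of a SACK: cum TSN ack (4) + a_rwnd (4) + two 16-bit counts. */
#define SACK_FIXED_LEN	12u

/* Each gap-ack block (start/end offsets) and each duplicate TSN is 4 bytes. */
static bool sack_fits(uint16_t num_gap_blocks, uint16_t num_dup_tsns,
		      size_t bytes_available)
{
	size_t need = SACK_FIXED_LEN +
		      ((size_t)num_gap_blocks + num_dup_tsns) * 4;

	return need <= bytes_available;
}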
+ */ + switch(chunk->chunk_hdr->type) { + case SCTP_CID_INIT: + { + sctp_init_chunk_t *init; + + init = (sctp_init_chunk_t *)chunk->chunk_hdr; + vtag = ntohl(init->init_hdr.init_tag); + break; + } + default: + vtag = ntohl(chunk->sctp_hdr->vtag); + break; + } + } + + /* Make a transport for the bucket, Eliza... */ + transport = sctp_transport_new(sctp_source(chunk), GFP_ATOMIC); + if (!transport) + goto nomem; + + /* Cache a route for the transport with the chunk's destination as + * the source address. + */ + sctp_transport_route(transport, (union sctp_addr *)&chunk->dest, + sctp_sk(sctp_get_ctl_sock())); + + packet = sctp_packet_init(&transport->packet, transport, sport, dport); + packet = sctp_packet_config(packet, vtag, 0); + + return packet; + +nomem: + return NULL; +} + +/* Free the packet allocated earlier for responding in the OOTB condition. */ +void sctp_ootb_pkt_free(struct sctp_packet *packet) +{ + sctp_transport_free(packet->transport); +} + +/* Send a stale cookie error when a invalid COOKIE ECHO chunk is found */ +static void sctp_send_stale_cookie_err(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + sctp_cmd_seq_t *commands, + struct sctp_chunk *err_chunk) +{ + struct sctp_packet *packet; + + if (err_chunk) { + packet = sctp_ootb_pkt_new(asoc, chunk); + if (packet) { + struct sctp_signed_cookie *cookie; + + /* Override the OOTB vtag from the cookie. */ + cookie = chunk->subh.cookie_hdr; + packet->vtag = cookie->c.peer_vtag; + + /* Set the skb to the belonging sock for accounting. */ + err_chunk->skb->sk = ep->base.sk; + sctp_packet_append_chunk(packet, err_chunk); + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, + SCTP_PACKET(packet)); + SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); + } else + sctp_chunk_free (err_chunk); + } +} + + +/* Process a data chunk */ +static int sctp_eat_data(const struct sctp_association *asoc, + struct sctp_chunk *chunk, + sctp_cmd_seq_t *commands) +{ + sctp_datahdr_t *data_hdr; + struct sctp_chunk *err; + size_t datalen; + sctp_verb_t deliver; + int tmp; + __u32 tsn; + struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map; + struct sock *sk = asoc->base.sk; + u16 ssn; + u16 sid; + u8 ordered = 0; + + data_hdr = chunk->subh.data_hdr = (sctp_datahdr_t *)chunk->skb->data; + skb_pull(chunk->skb, sizeof(sctp_datahdr_t)); + + tsn = ntohl(data_hdr->tsn); + SCTP_DEBUG_PRINTK("eat_data: TSN 0x%x.\n", tsn); + + /* ASSERT: Now skb->data is really the user data. */ + + /* Process ECN based congestion. + * + * Since the chunk structure is reused for all chunks within + * a packet, we use ecn_ce_done to track if we've already + * done CE processing for this packet. + * + * We need to do ECN processing even if we plan to discard the + * chunk later. + */ + + if (!chunk->ecn_ce_done) { + struct sctp_af *af; + chunk->ecn_ce_done = 1; + + af = sctp_get_af_specific( + ipver2af(ip_hdr(chunk->skb)->version)); + + if (af && af->is_ce(chunk->skb) && asoc->peer.ecn_capable) { + /* Do real work as sideffect. */ + sctp_add_cmd_sf(commands, SCTP_CMD_ECN_CE, + SCTP_U32(tsn)); + } + } + + tmp = sctp_tsnmap_check(&asoc->peer.tsn_map, tsn); + if (tmp < 0) { + /* The TSN is too high--silently discard the chunk and + * count on it getting retransmitted later. + */ + return SCTP_IERROR_HIGH_TSN; + } else if (tmp > 0) { + /* This is a duplicate. Record it. */ + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_DUP, SCTP_U32(tsn)); + return SCTP_IERROR_DUP_TSN; + } + + /* This is a new TSN. 
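The three-way outcome of the TSN map check above (too far ahead, duplicate, or new) ultimately rests on serial-number arithmetic over the 32-bit TSN space. A compact sketch of that comparison and of a simplified classification against a receive-side tracking window (the window parameter is a stand-in for the real TSN map, which also tracks per-TSN duplicates):

#include <stdint.h>

/* RFC 1982-style serial comparison: a < b in modulo-2^32 space. */
static int tsn_lt(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

enum tsn_class { TSN_NEW, TSN_DUPLICATE, TSN_TOO_HIGH };

/* ctsn: cumulative TSN ack point; window: how far ahead we can track. */
static enum tsn_class classify_tsn(uint32_t tsn, uint32_t ctsn,
				   uint32_t window)
{
	if (!tsn_lt(ctsn, tsn))		/* tsn <= ctsn: already covered */
		return TSN_DUPLICATE;
	if (tsn_lt(ctsn + window, tsn))	/* beyond what the map can hold */
		return TSN_TOO_HIGH;
	return TSN_NEW;			/* may still be a tracked duplicate */
}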
*/ + + /* Discard if there is no room in the receive window. + * Actually, allow a little bit of overflow (up to a MTU). + */ + datalen = ntohs(chunk->chunk_hdr->length); + datalen -= sizeof(sctp_data_chunk_t); + + deliver = SCTP_CMD_CHUNK_ULP; + + /* Think about partial delivery. */ + if ((datalen >= asoc->rwnd) && (!asoc->ulpq.pd_mode)) { + + /* Even if we don't accept this chunk there is + * memory pressure. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_PART_DELIVER, SCTP_NULL()); + } + + /* Spill over rwnd a little bit. Note: While allowed, this spill over + * seems a bit troublesome in that frag_point varies based on + * PMTU. In cases, such as loopback, this might be a rather + * large spill over. + */ + if ((!chunk->data_accepted) && (!asoc->rwnd || asoc->rwnd_over || + (datalen > asoc->rwnd + asoc->frag_point))) { + + /* If this is the next TSN, consider reneging to make + * room. Note: Playing nice with a confused sender. A + * malicious sender can still eat up all our buffer + * space and in the future we may want to detect and + * do more drastic reneging. + */ + if (sctp_tsnmap_has_gap(map) && + (sctp_tsnmap_get_ctsn(map) + 1) == tsn) { + SCTP_DEBUG_PRINTK("Reneging for tsn:%u\n", tsn); + deliver = SCTP_CMD_RENEGE; + } else { + SCTP_DEBUG_PRINTK("Discard tsn: %u len: %Zd, " + "rwnd: %d\n", tsn, datalen, + asoc->rwnd); + return SCTP_IERROR_IGNORE_TSN; + } + } + + /* + * Also try to renege to limit our memory usage in the event that + * we are under memory pressure + * If we can't renege, don't worry about it, the sk_rmem_schedule + * in sctp_ulpevent_make_rcvmsg will drop the frame if we grow our + * memory usage too much + */ + if (*sk->sk_prot_creator->memory_pressure) { + if (sctp_tsnmap_has_gap(map) && + (sctp_tsnmap_get_ctsn(map) + 1) == tsn) { + SCTP_DEBUG_PRINTK("Under Pressure! Reneging for tsn:%u\n", tsn); + deliver = SCTP_CMD_RENEGE; + } + } + + /* + * Section 3.3.10.9 No User Data (9) + * + * Cause of error + * --------------- + * No User Data: This error cause is returned to the originator of a + * DATA chunk if a received DATA chunk has no user data. + */ + if (unlikely(0 == datalen)) { + err = sctp_make_abort_no_data(asoc, chunk, tsn); + if (err) { + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(err)); + } + /* We are going to ABORT, so we might as well stop + * processing the rest of the chunks in the packet. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET,SCTP_NULL()); + sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, + SCTP_ERROR(ECONNABORTED)); + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_PERR(SCTP_ERROR_NO_DATA)); + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + return SCTP_IERROR_NO_DATA; + } + + chunk->data_accepted = 1; + + /* Note: Some chunks may get overcounted (if we drop) or overcounted + * if we renege and the chunk arrives again. + */ + if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) + SCTP_INC_STATS(SCTP_MIB_INUNORDERCHUNKS); + else { + SCTP_INC_STATS(SCTP_MIB_INORDERCHUNKS); + ordered = 1; + } + + /* RFC 2960 6.5 Stream Identifier and Stream Sequence Number + * + * If an endpoint receive a DATA chunk with an invalid stream + * identifier, it shall acknowledge the reception of the DATA chunk + * following the normal procedure, immediately send an ERROR chunk + * with cause set to "Invalid Stream Identifier" (See Section 3.3.10) + * and discard the DATA chunk. 
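Condensing the receive-window logic above: a chunk that was already accepted once is always kept, a small spill past rwnd (roughly up to the fragmentation point) is tolerated, and reneging is only considered for the very next expected TSN. A boiled-down sketch of that decision, leaving out the rwnd_over and memory-pressure bookkeeping and using simplified fields in place of the association state:

#include <stdbool.h>
#include <stdint.h>

enum rx_verdict { RX_DELIVER, RX_RENEGE, RX_DISCARD };

struct rx_window {
	uint32_t rwnd;		/* advertised receive window left, bytes */
	uint32_t frag_point;	/* allowed spill-over, roughly one MTU */
	uint32_t next_tsn;	/* cumulative TSN ack point + 1 */
	bool	 has_gap;	/* out-of-order data is being held */
};

static enum rx_verdict judge_data(const struct rx_window *w,
				  uint32_t tsn, uint32_t datalen,
				  bool already_accepted)
{
	if (already_accepted)
		return RX_DELIVER;
	if (w->rwnd && datalen <= w->rwnd + w->frag_point)
		return RX_DELIVER;
	/* Over the window: renege only to make room for the next TSN. */
	if (w->has_gap && tsn == w->next_tsn)
		return RX_RENEGE;
	return RX_DISCARD;
}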
+ */ + sid = ntohs(data_hdr->stream); + if (sid >= asoc->c.sinit_max_instreams) { + /* Mark tsn as received even though we drop it */ + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_TSN, SCTP_U32(tsn)); + + err = sctp_make_op_error(asoc, chunk, SCTP_ERROR_INV_STRM, + &data_hdr->stream, + sizeof(data_hdr->stream), + sizeof(u16)); + if (err) + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(err)); + return SCTP_IERROR_BAD_STREAM; + } + + /* Check to see if the SSN is possible for this TSN. + * The biggest gap we can record is 4K wide. Since SSNs wrap + * at an unsigned short, there is no way that an SSN can + * wrap and for a valid TSN. We can simply check if the current + * SSN is smaller then the next expected one. If it is, it wrapped + * and is invalid. + */ + ssn = ntohs(data_hdr->ssn); + if (ordered && SSN_lt(ssn, sctp_ssn_peek(&asoc->ssnmap->in, sid))) { + return SCTP_IERROR_PROTO_VIOLATION; + } + + /* Send the data up to the user. Note: Schedule the + * SCTP_CMD_CHUNK_ULP cmd before the SCTP_CMD_GEN_SACK, as the SACK + * chunk needs the updated rwnd. + */ + sctp_add_cmd_sf(commands, deliver, SCTP_CHUNK(chunk)); + + return SCTP_IERROR_NO_ERROR; +} diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c new file mode 100644 index 00000000..7c211a7f --- /dev/null +++ b/net/sctp/sm_statetable.c @@ -0,0 +1,937 @@ +/* SCTP kernel implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001 Intel Corp. + * Copyright (c) 2001 Nokia, Inc. + * + * This file is part of the SCTP kernel implementation + * + * These are the state tables for the SCTP state machine. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Jon Grimm + * Hui Huang + * Daisy Chang + * Ardelle Fan + * Sridhar Samudrala + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +static const sctp_sm_table_entry_t +primitive_event_table[SCTP_NUM_PRIMITIVE_TYPES][SCTP_STATE_NUM_STATES]; +static const sctp_sm_table_entry_t +other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES]; +static const sctp_sm_table_entry_t +timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES]; + +static const sctp_sm_table_entry_t *sctp_chunk_event_lookup(sctp_cid_t cid, + sctp_state_t state); + + +static const sctp_sm_table_entry_t bug = { + .fn = sctp_sf_bug, + .name = "sctp_sf_bug" +}; + +#define DO_LOOKUP(_max, _type, _table) \ +({ \ + const sctp_sm_table_entry_t *rtn; \ + \ + if ((event_subtype._type > (_max))) { \ + pr_warn("table %p possible attack: event %d exceeds max %d\n", \ + _table, event_subtype._type, _max); \ + rtn = &bug; \ + } else \ + rtn = &_table[event_subtype._type][(int)state]; \ + \ + rtn; \ +}) + +const sctp_sm_table_entry_t *sctp_sm_lookup_event(sctp_event_t event_type, + sctp_state_t state, + sctp_subtype_t event_subtype) +{ + switch (event_type) { + case SCTP_EVENT_T_CHUNK: + return sctp_chunk_event_lookup(event_subtype.chunk, state); + case SCTP_EVENT_T_TIMEOUT: + return DO_LOOKUP(SCTP_EVENT_TIMEOUT_MAX, timeout, + timeout_event_table); + case SCTP_EVENT_T_OTHER: + return DO_LOOKUP(SCTP_EVENT_OTHER_MAX, other, + other_event_table); + case SCTP_EVENT_T_PRIMITIVE: + return DO_LOOKUP(SCTP_EVENT_PRIMITIVE_MAX, primitive, + primitive_event_table); + default: + /* Yikes! We got an illegal event type. */ + return &bug; + } +} + +#define TYPE_SCTP_FUNC(func) {.fn = func, .name = #func} + +#define TYPE_SCTP_DATA { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_ootb), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_eat_data_6_2), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_eat_data_6_2), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_eat_data_fast_4_4), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ +} /* TYPE_SCTP_DATA */ + +#define TYPE_SCTP_INIT { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_5_1B_init), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_do_5_2_1_siminit), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_5_2_1_siminit), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_do_9_2_reshutack), \ +} /* TYPE_SCTP_INIT */ + +#define TYPE_SCTP_INIT_ACK { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_5_2_3_initack), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_do_5_1C_ack), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + 
TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ +} /* TYPE_SCTP_INIT_ACK */ + +#define TYPE_SCTP_SACK { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_ootb), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ +} /* TYPE_SCTP_SACK */ + +#define TYPE_SCTP_HEARTBEAT { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_ootb), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + /* This should not happen, but we are nice. */ \ + TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \ +} /* TYPE_SCTP_HEARTBEAT */ + +#define TYPE_SCTP_HEARTBEAT_ACK { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_ootb), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_violation), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ +} /* TYPE_SCTP_HEARTBEAT_ACK */ + +#define TYPE_SCTP_ABORT { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_pdiscard), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_cookie_wait_abort), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_abort), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_9_1_abort), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_shutdown_pending_abort), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_shutdown_sent_abort), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_9_1_abort), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_shutdown_ack_sent_abort), \ +} /* TYPE_SCTP_ABORT */ + +#define TYPE_SCTP_SHUTDOWN { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_ootb), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown_ack), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_9_2_shut_ctsn), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + 
TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ +} /* TYPE_SCTP_SHUTDOWN */ + +#define TYPE_SCTP_SHUTDOWN_ACK { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_ootb), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_do_8_5_1_E_sa), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_8_5_1_E_sa), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_violation), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_violation), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_do_9_2_final), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_violation), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_do_9_2_final), \ +} /* TYPE_SCTP_SHUTDOWN_ACK */ + +#define TYPE_SCTP_ERROR { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_ootb), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_err), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_operr_notify), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_operr_notify), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_operr_notify), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ +} /* TYPE_SCTP_ERROR */ + +#define TYPE_SCTP_COOKIE_ECHO { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_5_1D_ce), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \ +} /* TYPE_SCTP_COOKIE_ECHO */ + +#define TYPE_SCTP_COOKIE_ACK { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_5_1E_ca), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ +} /* TYPE_SCTP_COOKIE_ACK */ + +#define TYPE_SCTP_ECN_ECNE { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_ecne), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ +} /* TYPE_SCTP_ECN_ECNE */ + +#define TYPE_SCTP_ECN_CWR { \ + /* SCTP_STATE_CLOSED */ \ + 
TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ +} /* TYPE_SCTP_ECN_CWR */ + +#define TYPE_SCTP_SHUTDOWN_COMPLETE { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_do_4_C), \ +} /* TYPE_SCTP_SHUTDOWN_COMPLETE */ + +/* The primary index for this table is the chunk type. + * The secondary index for this table is the state. + * + * For base protocol (RFC 2960). + */ +static const sctp_sm_table_entry_t chunk_event_table[SCTP_NUM_BASE_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { + TYPE_SCTP_DATA, + TYPE_SCTP_INIT, + TYPE_SCTP_INIT_ACK, + TYPE_SCTP_SACK, + TYPE_SCTP_HEARTBEAT, + TYPE_SCTP_HEARTBEAT_ACK, + TYPE_SCTP_ABORT, + TYPE_SCTP_SHUTDOWN, + TYPE_SCTP_SHUTDOWN_ACK, + TYPE_SCTP_ERROR, + TYPE_SCTP_COOKIE_ECHO, + TYPE_SCTP_COOKIE_ACK, + TYPE_SCTP_ECN_ECNE, + TYPE_SCTP_ECN_CWR, + TYPE_SCTP_SHUTDOWN_COMPLETE, +}; /* state_fn_t chunk_event_table[][] */ + +#define TYPE_SCTP_ASCONF { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ +} /* TYPE_SCTP_ASCONF */ + +#define TYPE_SCTP_ASCONF_ACK { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ +} /* TYPE_SCTP_ASCONF_ACK */ + +/* The primary index for this table is the chunk type. + * The secondary index for this table is the state. 
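These tables are meant to be read exactly as the comment says: one row per chunk type, one column per association state, and each cell names the state function to run. A stripped-down model of the same dispatch idea, with toy enums and handlers standing in for the sctp_* types (the two rows mirror the DATA and SACK rows of chunk_event_table above):

#include <stdio.h>

enum toy_state { ST_CLOSED, ST_ESTABLISHED, ST_NUM_STATES };
enum toy_chunk { CK_DATA, CK_SACK, CK_NUM_CHUNKS };

typedef const char *(*toy_handler)(void);

static const char *h_ootb(void)     { return "ootb"; }
static const char *h_eat_data(void) { return "eat_data"; }
static const char *h_eat_sack(void) { return "eat_sack"; }

/* rows: chunk type, columns: state, same shape as chunk_event_table */
static const toy_handler table[CK_NUM_CHUNKS][ST_NUM_STATES] = {
	[CK_DATA] = { [ST_CLOSED] = h_ootb, [ST_ESTABLISHED] = h_eat_data },
	[CK_SACK] = { [ST_CLOSED] = h_ootb, [ST_ESTABLISHED] = h_eat_sack },
};

int main(void)
{
	/* A SACK arriving on an established association... */
	printf("%s\n", table[CK_SACK][ST_ESTABLISHED]());	/* eat_sack */
	return 0;
}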
+ */ +static const sctp_sm_table_entry_t addip_chunk_event_table[SCTP_NUM_ADDIP_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { + TYPE_SCTP_ASCONF, + TYPE_SCTP_ASCONF_ACK, +}; /*state_fn_t addip_chunk_event_table[][] */ + +#define TYPE_SCTP_FWD_TSN { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_ootb), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn_fast), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ +} /* TYPE_SCTP_FWD_TSN */ + +/* The primary index for this table is the chunk type. + * The secondary index for this table is the state. + */ +static const sctp_sm_table_entry_t prsctp_chunk_event_table[SCTP_NUM_PRSCTP_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { + TYPE_SCTP_FWD_TSN, +}; /*state_fn_t prsctp_chunk_event_table[][] */ + +#define TYPE_SCTP_AUTH { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_ootb), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_eat_auth), \ +} /* TYPE_SCTP_AUTH */ + +/* The primary index for this table is the chunk type. + * The secondary index for this table is the state. 
+ */ +static const sctp_sm_table_entry_t auth_chunk_event_table[SCTP_NUM_AUTH_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { + TYPE_SCTP_AUTH, +}; /*state_fn_t auth_chunk_event_table[][] */ + +static const sctp_sm_table_entry_t +chunk_event_table_unknown[SCTP_STATE_NUM_STATES] = { + /* SCTP_STATE_CLOSED */ + TYPE_SCTP_FUNC(sctp_sf_ootb), + /* SCTP_STATE_COOKIE_WAIT */ + TYPE_SCTP_FUNC(sctp_sf_unk_chunk), + /* SCTP_STATE_COOKIE_ECHOED */ + TYPE_SCTP_FUNC(sctp_sf_unk_chunk), + /* SCTP_STATE_ESTABLISHED */ + TYPE_SCTP_FUNC(sctp_sf_unk_chunk), + /* SCTP_STATE_SHUTDOWN_PENDING */ + TYPE_SCTP_FUNC(sctp_sf_unk_chunk), + /* SCTP_STATE_SHUTDOWN_SENT */ + TYPE_SCTP_FUNC(sctp_sf_unk_chunk), + /* SCTP_STATE_SHUTDOWN_RECEIVED */ + TYPE_SCTP_FUNC(sctp_sf_unk_chunk), + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ + TYPE_SCTP_FUNC(sctp_sf_unk_chunk), +}; /* chunk unknown */ + + +#define TYPE_SCTP_PRIMITIVE_ASSOCIATE { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_prm_asoc), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_not_impl), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_not_impl), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_not_impl), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_not_impl), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_not_impl), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_not_impl), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_not_impl), \ +} /* TYPE_SCTP_PRIMITIVE_ASSOCIATE */ + +#define TYPE_SCTP_PRIMITIVE_SHUTDOWN { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_error_closed), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_cookie_wait_prm_shutdown), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_prm_shutdown),\ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_9_2_prm_shutdown), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \ +} /* TYPE_SCTP_PRIMITIVE_SHUTDOWN */ + +#define TYPE_SCTP_PRIMITIVE_ABORT { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_error_closed), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_cookie_wait_prm_abort), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_prm_abort), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_9_1_prm_abort), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_shutdown_pending_prm_abort), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_shutdown_sent_prm_abort), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_9_1_prm_abort), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_shutdown_ack_sent_prm_abort), \ +} /* TYPE_SCTP_PRIMITIVE_ABORT */ + +#define TYPE_SCTP_PRIMITIVE_SEND { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_error_closed), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ + /* 
SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ +} /* TYPE_SCTP_PRIMITIVE_SEND */ + +#define TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_error_closed), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \ +} /* TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT */ + +#define TYPE_SCTP_PRIMITIVE_ASCONF { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_error_closed), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_error_closed), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_error_closed), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ +} /* TYPE_SCTP_PRIMITIVE_ASCONF */ + +/* The primary index for this table is the primitive type. + * The secondary index for this table is the state. 
+ */ +static const sctp_sm_table_entry_t primitive_event_table[SCTP_NUM_PRIMITIVE_TYPES][SCTP_STATE_NUM_STATES] = { + TYPE_SCTP_PRIMITIVE_ASSOCIATE, + TYPE_SCTP_PRIMITIVE_SHUTDOWN, + TYPE_SCTP_PRIMITIVE_ABORT, + TYPE_SCTP_PRIMITIVE_SEND, + TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT, + TYPE_SCTP_PRIMITIVE_ASCONF, +}; + +#define TYPE_SCTP_OTHER_NO_PENDING_TSN { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_no_pending_tsn), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_do_9_2_start_shutdown), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown_ack), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ +} + +#define TYPE_SCTP_OTHER_ICMP_PROTO_UNREACH { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_cookie_wait_icmp_abort), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ +} + +static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES] = { + TYPE_SCTP_OTHER_NO_PENDING_TSN, + TYPE_SCTP_OTHER_ICMP_PROTO_UNREACH, +}; + +#define TYPE_SCTP_EVENT_TIMEOUT_NONE { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_bug), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_bug), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_bug), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_bug), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_bug), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_bug), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_bug), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_bug), \ +} + +#define TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_bug), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_t1_cookie_timer_expire), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ +} + +#define TYPE_SCTP_EVENT_TIMEOUT_T1_INIT { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_t1_init_timer_expire), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* 
SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ +} + +#define TYPE_SCTP_EVENT_TIMEOUT_T2_SHUTDOWN { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_t2_timer_expire), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_t2_timer_expire), \ +} + +#define TYPE_SCTP_EVENT_TIMEOUT_T3_RTX { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ +} + +#define TYPE_SCTP_EVENT_TIMEOUT_T4_RTO { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_t4_timer_expire), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ +} + +#define TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_t5_timer_expire), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_t5_timer_expire), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ +} + +#define TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + 
TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ +} + +#define TYPE_SCTP_EVENT_TIMEOUT_SACK { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ +} + +#define TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE { \ + /* SCTP_STATE_CLOSED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_COOKIE_WAIT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_ESTABLISHED */ \ + TYPE_SCTP_FUNC(sctp_sf_autoclose_timer_expire), \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ +} + +static const sctp_sm_table_entry_t timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES] = { + TYPE_SCTP_EVENT_TIMEOUT_NONE, + TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE, + TYPE_SCTP_EVENT_TIMEOUT_T1_INIT, + TYPE_SCTP_EVENT_TIMEOUT_T2_SHUTDOWN, + TYPE_SCTP_EVENT_TIMEOUT_T3_RTX, + TYPE_SCTP_EVENT_TIMEOUT_T4_RTO, + TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD, + TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT, + TYPE_SCTP_EVENT_TIMEOUT_SACK, + TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE, +}; + +static const sctp_sm_table_entry_t *sctp_chunk_event_lookup(sctp_cid_t cid, + sctp_state_t state) +{ + if (state > SCTP_STATE_MAX) + return &bug; + + if (cid <= SCTP_CID_BASE_MAX) + return &chunk_event_table[cid][state]; + + if (sctp_prsctp_enable) { + if (cid == SCTP_CID_FWD_TSN) + return &prsctp_chunk_event_table[0][state]; + } + + if (sctp_addip_enable) { + if (cid == SCTP_CID_ASCONF) + return &addip_chunk_event_table[0][state]; + + if (cid == SCTP_CID_ASCONF_ACK) + return &addip_chunk_event_table[1][state]; + } + + if (sctp_auth_enable) { + if (cid == SCTP_CID_AUTH) + return &auth_chunk_event_table[0][state]; + } + + return &chunk_event_table_unknown[state]; +} diff --git a/net/sctp/socket.c b/net/sctp/socket.c new file mode 100644 index 00000000..4434853a --- /dev/null +++ b/net/sctp/socket.c @@ -0,0 +1,6719 @@ +/* SCTP kernel implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001-2003 Intel Corp. + * Copyright (c) 2001-2002 Nokia, Inc. + * Copyright (c) 2001 La Monte H.P. Yarroll + * + * This file is part of the SCTP kernel implementation + * + * These functions interface with the sockets layer to implement the + * SCTP Extensions for the Sockets API. + * + * Note that the descriptions from the specification are USER level + * functions--this file is the functions which populate the struct proto + * for SCTP which is the BOTTOM of the sockets interface. 
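The note above is about layering: the generic socket layer calls through a table of function pointers (struct proto), and this file supplies the SCTP entries for that table. A toy, self-contained illustration of that pattern, with invented names used purely for the example:

#include <stdio.h>

/* The "top": a generic front end that only knows an ops table. */
struct sock_ops {
	int (*bind)(int port);
	int (*send)(const char *msg);
};

static int generic_bind(const struct sock_ops *ops, int port)
{
	return ops->bind(port);	/* dispatch to whichever protocol filled this in */
}

/* The "bottom": one protocol's implementation of the table. */
static int toy_sctp_bind(int port)
{
	printf("bound to port %d\n", port);
	return 0;
}

static int toy_sctp_send(const char *msg)
{
	printf("sent: %s\n", msg);
	return 0;
}

static const struct sock_ops toy_sctp_ops = {
	.bind = toy_sctp_bind,
	.send = toy_sctp_send,
};

int main(void)
{
	generic_bind(&toy_sctp_ops, 9899);
	toy_sctp_ops.send("hello");
	return 0;
}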
+ * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Narasimha Budihal + * Karl Knutson + * Jon Grimm + * Xingang Guo + * Daisy Chang + * Sridhar Samudrala + * Inaky Perez-Gonzalez + * Ardelle Fan + * Ryan Layer + * Anup Pemmaiah + * Kevin Gao + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include /* for sa_family_t */ +#include +#include +#include + +/* WARNING: Please do not remove the SCTP_STATIC attribute to + * any of the functions below as they are used to export functions + * used by a project regression testsuite. + */ + +/* Forward declarations for internal helper functions. */ +static int sctp_writeable(struct sock *sk); +static void sctp_wfree(struct sk_buff *skb); +static int sctp_wait_for_sndbuf(struct sctp_association *, long *timeo_p, + size_t msg_len); +static int sctp_wait_for_packet(struct sock * sk, int *err, long *timeo_p); +static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p); +static int sctp_wait_for_accept(struct sock *sk, long timeo); +static void sctp_wait_for_close(struct sock *sk, long timeo); +static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt, + union sctp_addr *addr, int len); +static int sctp_bindx_add(struct sock *, struct sockaddr *, int); +static int sctp_bindx_rem(struct sock *, struct sockaddr *, int); +static int sctp_send_asconf_add_ip(struct sock *, struct sockaddr *, int); +static int sctp_send_asconf_del_ip(struct sock *, struct sockaddr *, int); +static int sctp_send_asconf(struct sctp_association *asoc, + struct sctp_chunk *chunk); +static int sctp_do_bind(struct sock *, union sctp_addr *, int); +static int sctp_autobind(struct sock *sk); +static void sctp_sock_migrate(struct sock *, struct sock *, + struct sctp_association *, sctp_socket_type_t); +static char *sctp_hmac_alg = SCTP_COOKIE_HMAC_ALG; + +extern struct kmem_cache *sctp_bucket_cachep; +extern long sysctl_sctp_mem[3]; +extern int sysctl_sctp_rmem[3]; +extern int sysctl_sctp_wmem[3]; + +static int sctp_memory_pressure; +static atomic_long_t sctp_memory_allocated; +struct percpu_counter sctp_sockets_allocated; + +static void sctp_enter_memory_pressure(struct sock *sk) +{ + sctp_memory_pressure = 1; +} + + +/* Get the sndbuf space available at the time on the association. 
*/ +static inline int sctp_wspace(struct sctp_association *asoc) +{ + int amt; + + if (asoc->ep->sndbuf_policy) + amt = asoc->sndbuf_used; + else + amt = sk_wmem_alloc_get(asoc->base.sk); + + if (amt >= asoc->base.sk->sk_sndbuf) { + if (asoc->base.sk->sk_userlocks & SOCK_SNDBUF_LOCK) + amt = 0; + else { + amt = sk_stream_wspace(asoc->base.sk); + if (amt < 0) + amt = 0; + } + } else { + amt = asoc->base.sk->sk_sndbuf - amt; + } + return amt; +} + +/* Increment the used sndbuf space count of the corresponding association by + * the size of the outgoing data chunk. + * Also, set the skb destructor for sndbuf accounting later. + * + * Since it is always 1-1 between chunk and skb, and also a new skb is always + * allocated for chunk bundling in sctp_packet_transmit(), we can use the + * destructor in the data chunk skb for the purpose of the sndbuf space + * tracking. + */ +static inline void sctp_set_owner_w(struct sctp_chunk *chunk) +{ + struct sctp_association *asoc = chunk->asoc; + struct sock *sk = asoc->base.sk; + + /* The sndbuf space is tracked per association. */ + sctp_association_hold(asoc); + + skb_set_owner_w(chunk->skb, sk); + + chunk->skb->destructor = sctp_wfree; + /* Save the chunk pointer in skb for sctp_wfree to use later. */ + *((struct sctp_chunk **)(chunk->skb->cb)) = chunk; + + asoc->sndbuf_used += SCTP_DATA_SNDSIZE(chunk) + + sizeof(struct sk_buff) + + sizeof(struct sctp_chunk); + + atomic_add(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc); + sk->sk_wmem_queued += chunk->skb->truesize; + sk_mem_charge(sk, chunk->skb->truesize); +} + +/* Verify that this is a valid address. */ +static inline int sctp_verify_addr(struct sock *sk, union sctp_addr *addr, + int len) +{ + struct sctp_af *af; + + /* Verify basic sockaddr. */ + af = sctp_sockaddr_af(sctp_sk(sk), addr, len); + if (!af) + return -EINVAL; + + /* Is this a valid SCTP address? */ + if (!af->addr_valid(addr, sctp_sk(sk), NULL)) + return -EINVAL; + + if (!sctp_sk(sk)->pf->send_verify(sctp_sk(sk), (addr))) + return -EINVAL; + + return 0; +} + +/* Look up the association by its id. If this is not a UDP-style + * socket, the ID field is always ignored. + */ +struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id) +{ + struct sctp_association *asoc = NULL; + + /* If this is not a UDP-style socket, assoc id should be ignored. */ + if (!sctp_style(sk, UDP)) { + /* Return NULL if the socket state is not ESTABLISHED. It + * could be a TCP-style listening socket or a socket which + * hasn't yet called connect() to establish an association. + */ + if (!sctp_sstate(sk, ESTABLISHED)) + return NULL; + + /* Get the first and the only association from the list. */ + if (!list_empty(&sctp_sk(sk)->ep->asocs)) + asoc = list_entry(sctp_sk(sk)->ep->asocs.next, + struct sctp_association, asocs); + return asoc; + } + + /* Otherwise this is a UDP-style socket. */ + if (!id || (id == (sctp_assoc_t)-1)) + return NULL; + + spin_lock_bh(&sctp_assocs_id_lock); + asoc = (struct sctp_association *)idr_find(&sctp_assocs_id, (int)id); + spin_unlock_bh(&sctp_assocs_id_lock); + + if (!asoc || (asoc->base.sk != sk) || asoc->base.dead) + return NULL; + + return asoc; +} + +/* Look up the transport from an address and an assoc id. If both address and + * id are specified, the associations matching the address and the id should be + * the same. 
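The sndbuf accounting in sctp_wspace() and sctp_set_owner_w() above charges each queued data chunk not only for its payload but also for the sk_buff and sctp_chunk bookkeeping it drags along. A rough, standalone sketch of that arithmetic follows; the overhead constants are hypothetical stand-ins, since the real values are sizeof(struct sk_buff) and sizeof(struct sctp_chunk) for the running kernel.

#include <stdio.h>

/* Illustrative placeholders for sizeof(struct sk_buff) and
 * sizeof(struct sctp_chunk); the real numbers are build-dependent.
 */
#define SKB_OVERHEAD   232UL
#define CHUNK_OVERHEAD 192UL

/* Estimate how much association sndbuf a user message consumes once it
 * has been fragmented into chunks of at most frag_point bytes, mirroring
 * the per-chunk charge made in sctp_set_owner_w().
 */
static unsigned long sndbuf_charge(unsigned long msg_len, unsigned long frag_point)
{
	unsigned long nchunks;

	if (!frag_point)
		return 0;

	nchunks = (msg_len + frag_point - 1) / frag_point;
	return msg_len + nchunks * (SKB_OVERHEAD + CHUNK_OVERHEAD);
}

int main(void)
{
	printf("8000-byte message, 1452-byte frag point: ~%lu bytes of sndbuf\n",
	       sndbuf_charge(8000, 1452));
	return 0;
}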
+ */ +static struct sctp_transport *sctp_addr_id2transport(struct sock *sk, + struct sockaddr_storage *addr, + sctp_assoc_t id) +{ + struct sctp_association *addr_asoc = NULL, *id_asoc = NULL; + struct sctp_transport *transport; + union sctp_addr *laddr = (union sctp_addr *)addr; + + addr_asoc = sctp_endpoint_lookup_assoc(sctp_sk(sk)->ep, + laddr, + &transport); + + if (!addr_asoc) + return NULL; + + id_asoc = sctp_id2assoc(sk, id); + if (id_asoc && (id_asoc != addr_asoc)) + return NULL; + + sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk), + (union sctp_addr *)addr); + + return transport; +} + +/* API 3.1.2 bind() - UDP Style Syntax + * The syntax of bind() is, + * + * ret = bind(int sd, struct sockaddr *addr, int addrlen); + * + * sd - the socket descriptor returned by socket(). + * addr - the address structure (struct sockaddr_in or struct + * sockaddr_in6 [RFC 2553]), + * addr_len - the size of the address structure. + */ +SCTP_STATIC int sctp_bind(struct sock *sk, struct sockaddr *addr, int addr_len) +{ + int retval = 0; + + sctp_lock_sock(sk); + + SCTP_DEBUG_PRINTK("sctp_bind(sk: %p, addr: %p, addr_len: %d)\n", + sk, addr, addr_len); + + /* Disallow binding twice. */ + if (!sctp_sk(sk)->ep->base.bind_addr.port) + retval = sctp_do_bind(sk, (union sctp_addr *)addr, + addr_len); + else + retval = -EINVAL; + + sctp_release_sock(sk); + + return retval; +} + +static long sctp_get_port_local(struct sock *, union sctp_addr *); + +/* Verify this is a valid sockaddr. */ +static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt, + union sctp_addr *addr, int len) +{ + struct sctp_af *af; + + /* Check minimum size. */ + if (len < sizeof (struct sockaddr)) + return NULL; + + /* V4 mapped address are really of AF_INET family */ + if (addr->sa.sa_family == AF_INET6 && + ipv6_addr_v4mapped(&addr->v6.sin6_addr)) { + if (!opt->pf->af_supported(AF_INET, opt)) + return NULL; + } else { + /* Does this PF support this AF? */ + if (!opt->pf->af_supported(addr->sa.sa_family, opt)) + return NULL; + } + + /* If we get this far, af is valid. */ + af = sctp_get_af_specific(addr->sa.sa_family); + + if (len < af->sockaddr_len) + return NULL; + + return af; +} + +/* Bind a local address either to an endpoint or to an association. */ +SCTP_STATIC int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len) +{ + struct sctp_sock *sp = sctp_sk(sk); + struct sctp_endpoint *ep = sp->ep; + struct sctp_bind_addr *bp = &ep->base.bind_addr; + struct sctp_af *af; + unsigned short snum; + int ret = 0; + + /* Common sockaddr verification. */ + af = sctp_sockaddr_af(sp, addr, len); + if (!af) { + SCTP_DEBUG_PRINTK("sctp_do_bind(sk: %p, newaddr: %p, len: %d) EINVAL\n", + sk, addr, len); + return -EINVAL; + } + + snum = ntohs(addr->v4.sin_port); + + SCTP_DEBUG_PRINTK_IPADDR("sctp_do_bind(sk: %p, new addr: ", + ", port: %d, new port: %d, len: %d)\n", + sk, + addr, + bp->port, snum, + len); + + /* PF specific bind() address verification. */ + if (!sp->pf->bind_verify(sp, addr)) + return -EADDRNOTAVAIL; + + /* We must either be unbound, or bind to the same port. + * It's OK to allow 0 ports if we are already bound. 
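For reference, a minimal userspace sketch of the UDP-style bind() that sctp_bind()/sctp_do_bind() implement, assuming only the standard socket headers and a one-to-many SCTP socket; error handling is trimmed to the essentials.

#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int sctp_listen_sock(unsigned short port)
{
	struct sockaddr_in addr;
	int sd = socket(AF_INET, SOCK_SEQPACKET, IPPROTO_SCTP);

	if (sd < 0)
		return -1;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);	/* wildcard address */
	addr.sin_port = htons(port);			/* 0 picks an ephemeral port */

	/* A second bind() on the same socket is rejected with EINVAL,
	 * matching the "disallow binding twice" check in sctp_bind(). */
	if (bind(sd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		close(sd);
		return -1;
	}

	return sd;
}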
+ * We'll just inhert an already bound port in this case + */ + if (bp->port) { + if (!snum) + snum = bp->port; + else if (snum != bp->port) { + SCTP_DEBUG_PRINTK("sctp_do_bind:" + " New port %d does not match existing port " + "%d.\n", snum, bp->port); + return -EINVAL; + } + } + + if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) + return -EACCES; + + /* See if the address matches any of the addresses we may have + * already bound before checking against other endpoints. + */ + if (sctp_bind_addr_match(bp, addr, sp)) + return -EINVAL; + + /* Make sure we are allowed to bind here. + * The function sctp_get_port_local() does duplicate address + * detection. + */ + addr->v4.sin_port = htons(snum); + if ((ret = sctp_get_port_local(sk, addr))) { + return -EADDRINUSE; + } + + /* Refresh ephemeral port. */ + if (!bp->port) + bp->port = inet_sk(sk)->inet_num; + + /* Add the address to the bind address list. + * Use GFP_ATOMIC since BHs will be disabled. + */ + ret = sctp_add_bind_addr(bp, addr, SCTP_ADDR_SRC, GFP_ATOMIC); + + /* Copy back into socket for getsockname() use. */ + if (!ret) { + inet_sk(sk)->inet_sport = htons(inet_sk(sk)->inet_num); + af->to_sk_saddr(addr, sk); + } + + return ret; +} + + /* ADDIP Section 4.1.1 Congestion Control of ASCONF Chunks + * + * R1) One and only one ASCONF Chunk MAY be in transit and unacknowledged + * at any one time. If a sender, after sending an ASCONF chunk, decides + * it needs to transfer another ASCONF Chunk, it MUST wait until the + * ASCONF-ACK Chunk returns from the previous ASCONF Chunk before sending a + * subsequent ASCONF. Note this restriction binds each side, so at any + * time two ASCONF may be in-transit on any given association (one sent + * from each endpoint). + */ +static int sctp_send_asconf(struct sctp_association *asoc, + struct sctp_chunk *chunk) +{ + int retval = 0; + + /* If there is an outstanding ASCONF chunk, queue it for later + * transmission. + */ + if (asoc->addip_last_asconf) { + list_add_tail(&chunk->list, &asoc->addip_chunk_list); + goto out; + } + + /* Hold the chunk until an ASCONF_ACK is received. */ + sctp_chunk_hold(chunk); + retval = sctp_primitive_ASCONF(asoc, chunk); + if (retval) + sctp_chunk_free(chunk); + else + asoc->addip_last_asconf = chunk; + +out: + return retval; +} + +/* Add a list of addresses as bind addresses to local endpoint or + * association. + * + * Basically run through each address specified in the addrs/addrcnt + * array/length pair, determine if it is IPv6 or IPv4 and call + * sctp_do_bind() on it. + * + * If any of them fails, then the operation will be reversed and the + * ones that were added will be removed. + * + * Only sctp_setsockopt_bindx() is supposed to call this function. + */ +static int sctp_bindx_add(struct sock *sk, struct sockaddr *addrs, int addrcnt) +{ + int cnt; + int retval = 0; + void *addr_buf; + struct sockaddr *sa_addr; + struct sctp_af *af; + + SCTP_DEBUG_PRINTK("sctp_bindx_add (sk: %p, addrs: %p, addrcnt: %d)\n", + sk, addrs, addrcnt); + + addr_buf = addrs; + for (cnt = 0; cnt < addrcnt; cnt++) { + /* The list may contain either IPv4 or IPv6 address; + * determine the address length for walking thru the list. + */ + sa_addr = (struct sockaddr *)addr_buf; + af = sctp_get_af_specific(sa_addr->sa_family); + if (!af) { + retval = -EINVAL; + goto err_bindx_add; + } + + retval = sctp_do_bind(sk, (union sctp_addr *)sa_addr, + af->sockaddr_len); + + addr_buf += af->sockaddr_len; + +err_bindx_add: + if (retval < 0) { + /* Failed. 
Cleanup the ones that have been added */ + if (cnt > 0) + sctp_bindx_rem(sk, addrs, cnt); + return retval; + } + } + + return retval; +} + +/* Send an ASCONF chunk with Add IP address parameters to all the peers of the + * associations that are part of the endpoint indicating that a list of local + * addresses are added to the endpoint. + * + * If any of the addresses is already in the bind address list of the + * association, we do not send the chunk for that association. But it will not + * affect other associations. + * + * Only sctp_setsockopt_bindx() is supposed to call this function. + */ +static int sctp_send_asconf_add_ip(struct sock *sk, + struct sockaddr *addrs, + int addrcnt) +{ + struct sctp_sock *sp; + struct sctp_endpoint *ep; + struct sctp_association *asoc; + struct sctp_bind_addr *bp; + struct sctp_chunk *chunk; + struct sctp_sockaddr_entry *laddr; + union sctp_addr *addr; + union sctp_addr saveaddr; + void *addr_buf; + struct sctp_af *af; + struct list_head *p; + int i; + int retval = 0; + + if (!sctp_addip_enable) + return retval; + + sp = sctp_sk(sk); + ep = sp->ep; + + SCTP_DEBUG_PRINTK("%s: (sk: %p, addrs: %p, addrcnt: %d)\n", + __func__, sk, addrs, addrcnt); + + list_for_each_entry(asoc, &ep->asocs, asocs) { + + if (!asoc->peer.asconf_capable) + continue; + + if (asoc->peer.addip_disabled_mask & SCTP_PARAM_ADD_IP) + continue; + + if (!sctp_state(asoc, ESTABLISHED)) + continue; + + /* Check if any address in the packed array of addresses is + * in the bind address list of the association. If so, + * do not send the asconf chunk to its peer, but continue with + * other associations. + */ + addr_buf = addrs; + for (i = 0; i < addrcnt; i++) { + addr = (union sctp_addr *)addr_buf; + af = sctp_get_af_specific(addr->v4.sin_family); + if (!af) { + retval = -EINVAL; + goto out; + } + + if (sctp_assoc_lookup_laddr(asoc, addr)) + break; + + addr_buf += af->sockaddr_len; + } + if (i < addrcnt) + continue; + + /* Use the first valid address in bind addr list of + * association as Address Parameter of ASCONF CHUNK. + */ + bp = &asoc->base.bind_addr; + p = bp->address_list.next; + laddr = list_entry(p, struct sctp_sockaddr_entry, list); + chunk = sctp_make_asconf_update_ip(asoc, &laddr->a, addrs, + addrcnt, SCTP_PARAM_ADD_IP); + if (!chunk) { + retval = -ENOMEM; + goto out; + } + + retval = sctp_send_asconf(asoc, chunk); + if (retval) + goto out; + + /* Add the new addresses to the bind address list with + * use_as_src set to 0. + */ + addr_buf = addrs; + for (i = 0; i < addrcnt; i++) { + addr = (union sctp_addr *)addr_buf; + af = sctp_get_af_specific(addr->v4.sin_family); + memcpy(&saveaddr, addr, af->sockaddr_len); + retval = sctp_add_bind_addr(bp, &saveaddr, + SCTP_ADDR_NEW, GFP_ATOMIC); + addr_buf += af->sockaddr_len; + } + } + +out: + return retval; +} + +/* Remove a list of addresses from bind addresses list. Do not remove the + * last address. + * + * Basically run through each address specified in the addrs/addrcnt + * array/length pair, determine if it is IPv6 or IPv4 and call + * sctp_del_bind() on it. + * + * If any of them fails, then the operation will be reversed and the + * ones that were removed will be added back. + * + * At least one address has to be left; if only one address is + * available, the operation will return -EBUSY. + * + * Only sctp_setsockopt_bindx() is supposed to call this function. 
+ */ +static int sctp_bindx_rem(struct sock *sk, struct sockaddr *addrs, int addrcnt) +{ + struct sctp_sock *sp = sctp_sk(sk); + struct sctp_endpoint *ep = sp->ep; + int cnt; + struct sctp_bind_addr *bp = &ep->base.bind_addr; + int retval = 0; + void *addr_buf; + union sctp_addr *sa_addr; + struct sctp_af *af; + + SCTP_DEBUG_PRINTK("sctp_bindx_rem (sk: %p, addrs: %p, addrcnt: %d)\n", + sk, addrs, addrcnt); + + addr_buf = addrs; + for (cnt = 0; cnt < addrcnt; cnt++) { + /* If the bind address list is empty or if there is only one + * bind address, there is nothing more to be removed (we need + * at least one address here). + */ + if (list_empty(&bp->address_list) || + (sctp_list_single_entry(&bp->address_list))) { + retval = -EBUSY; + goto err_bindx_rem; + } + + sa_addr = (union sctp_addr *)addr_buf; + af = sctp_get_af_specific(sa_addr->sa.sa_family); + if (!af) { + retval = -EINVAL; + goto err_bindx_rem; + } + + if (!af->addr_valid(sa_addr, sp, NULL)) { + retval = -EADDRNOTAVAIL; + goto err_bindx_rem; + } + + if (sa_addr->v4.sin_port && + sa_addr->v4.sin_port != htons(bp->port)) { + retval = -EINVAL; + goto err_bindx_rem; + } + + if (!sa_addr->v4.sin_port) + sa_addr->v4.sin_port = htons(bp->port); + + /* FIXME - There is probably a need to check if sk->sk_saddr and + * sk->sk_rcv_addr are currently set to one of the addresses to + * be removed. This is something which needs to be looked into + * when we are fixing the outstanding issues with multi-homing + * socket routing and failover schemes. Refer to comments in + * sctp_do_bind(). -daisy + */ + retval = sctp_del_bind_addr(bp, sa_addr); + + addr_buf += af->sockaddr_len; +err_bindx_rem: + if (retval < 0) { + /* Failed. Add the ones that has been removed back */ + if (cnt > 0) + sctp_bindx_add(sk, addrs, cnt); + return retval; + } + } + + return retval; +} + +/* Send an ASCONF chunk with Delete IP address parameters to all the peers of + * the associations that are part of the endpoint indicating that a list of + * local addresses are removed from the endpoint. + * + * If any of the addresses is already in the bind address list of the + * association, we do not send the chunk for that association. But it will not + * affect other associations. + * + * Only sctp_setsockopt_bindx() is supposed to call this function. + */ +static int sctp_send_asconf_del_ip(struct sock *sk, + struct sockaddr *addrs, + int addrcnt) +{ + struct sctp_sock *sp; + struct sctp_endpoint *ep; + struct sctp_association *asoc; + struct sctp_transport *transport; + struct sctp_bind_addr *bp; + struct sctp_chunk *chunk; + union sctp_addr *laddr; + void *addr_buf; + struct sctp_af *af; + struct sctp_sockaddr_entry *saddr; + int i; + int retval = 0; + + if (!sctp_addip_enable) + return retval; + + sp = sctp_sk(sk); + ep = sp->ep; + + SCTP_DEBUG_PRINTK("%s: (sk: %p, addrs: %p, addrcnt: %d)\n", + __func__, sk, addrs, addrcnt); + + list_for_each_entry(asoc, &ep->asocs, asocs) { + + if (!asoc->peer.asconf_capable) + continue; + + if (asoc->peer.addip_disabled_mask & SCTP_PARAM_DEL_IP) + continue; + + if (!sctp_state(asoc, ESTABLISHED)) + continue; + + /* Check if any address in the packed array of addresses is + * not present in the bind address list of the association. + * If so, do not send the asconf chunk to its peer, but + * continue with other associations. 
+ */ + addr_buf = addrs; + for (i = 0; i < addrcnt; i++) { + laddr = (union sctp_addr *)addr_buf; + af = sctp_get_af_specific(laddr->v4.sin_family); + if (!af) { + retval = -EINVAL; + goto out; + } + + if (!sctp_assoc_lookup_laddr(asoc, laddr)) + break; + + addr_buf += af->sockaddr_len; + } + if (i < addrcnt) + continue; + + /* Find one address in the association's bind address list + * that is not in the packed array of addresses. This is to + * make sure that we do not delete all the addresses in the + * association. + */ + bp = &asoc->base.bind_addr; + laddr = sctp_find_unmatch_addr(bp, (union sctp_addr *)addrs, + addrcnt, sp); + if (!laddr) + continue; + + /* We do not need RCU protection throughout this loop + * because this is done under a socket lock from the + * setsockopt call. + */ + chunk = sctp_make_asconf_update_ip(asoc, laddr, addrs, addrcnt, + SCTP_PARAM_DEL_IP); + if (!chunk) { + retval = -ENOMEM; + goto out; + } + + /* Reset use_as_src flag for the addresses in the bind address + * list that are to be deleted. + */ + addr_buf = addrs; + for (i = 0; i < addrcnt; i++) { + laddr = (union sctp_addr *)addr_buf; + af = sctp_get_af_specific(laddr->v4.sin_family); + list_for_each_entry(saddr, &bp->address_list, list) { + if (sctp_cmp_addr_exact(&saddr->a, laddr)) + saddr->state = SCTP_ADDR_DEL; + } + addr_buf += af->sockaddr_len; + } + + /* Update the route and saddr entries for all the transports + * as some of the addresses in the bind address list are + * about to be deleted and cannot be used as source addresses. + */ + list_for_each_entry(transport, &asoc->peer.transport_addr_list, + transports) { + dst_release(transport->dst); + sctp_transport_route(transport, NULL, + sctp_sk(asoc->base.sk)); + } + + retval = sctp_send_asconf(asoc, chunk); + } +out: + return retval; +} + +/* Helper for tunneling sctp_bindx() requests through sctp_setsockopt() + * + * API 8.1 + * int sctp_bindx(int sd, struct sockaddr *addrs, int addrcnt, + * int flags); + * + * If sd is an IPv4 socket, the addresses passed must be IPv4 addresses. + * If the sd is an IPv6 socket, the addresses passed can either be IPv4 + * or IPv6 addresses. + * + * A single address may be specified as INADDR_ANY or IN6ADDR_ANY, see + * Section 3.1.2 for this usage. + * + * addrs is a pointer to an array of one or more socket addresses. Each + * address is contained in its appropriate structure (i.e. struct + * sockaddr_in or struct sockaddr_in6) the family of the address type + * must be used to distinguish the address length (note that this + * representation is termed a "packed array" of addresses). The caller + * specifies the number of addresses in the array with addrcnt. + * + * On success, sctp_bindx() returns 0. On failure, sctp_bindx() returns + * -1, and sets errno to the appropriate error code. + * + * For SCTP, the port given in each socket address must be the same, or + * sctp_bindx() will fail, setting errno to EINVAL. + * + * The flags parameter is formed from the bitwise OR of zero or more of + * the following currently defined flags: + * + * SCTP_BINDX_ADD_ADDR + * + * SCTP_BINDX_REM_ADDR + * + * SCTP_BINDX_ADD_ADDR directs SCTP to add the given addresses to the + * association, and SCTP_BINDX_REM_ADDR directs SCTP to remove the given + * addresses from the association. The two flags are mutually exclusive; + * if both are given, sctp_bindx() will fail with EINVAL. A caller may + * not remove all addresses from an association; sctp_bindx() will + * reject such an attempt with EINVAL. 
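A short userspace sketch of the sctp_bindx() usage documented above, assuming the lksctp-tools <netinet/sctp.h> header and library. It adds one extra local address to an already-bound socket and then removes it again; with mixed IPv4/IPv6 addresses the "packed array" is simply the sockaddr structures laid out back to back.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/sctp.h>
#include <string.h>

/* Add extra_ip to a socket already bound on the given port, then drop it.
 * Every address in the array must carry the same port, or EINVAL results.
 */
int rebind_example(int sd, const char *extra_ip, unsigned short port)
{
	struct sockaddr_in extra;

	memset(&extra, 0, sizeof(extra));
	extra.sin_family = AF_INET;
	extra.sin_port = htons(port);	/* must match the port already bound */
	if (inet_pton(AF_INET, extra_ip, &extra.sin_addr) != 1)
		return -1;

	/* SCTP_BINDX_ADD_ADDR: bind one more local address to the endpoint. */
	if (sctp_bindx(sd, (struct sockaddr *)&extra, 1, SCTP_BINDX_ADD_ADDR) < 0)
		return -1;

	/* SCTP_BINDX_REM_ADDR: drop it again; removing the last remaining
	 * address would instead fail with EBUSY. */
	return sctp_bindx(sd, (struct sockaddr *)&extra, 1, SCTP_BINDX_REM_ADDR);
}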
+ * + * An application can use sctp_bindx(SCTP_BINDX_ADD_ADDR) to associate + * additional addresses with an endpoint after calling bind(). Or use + * sctp_bindx(SCTP_BINDX_REM_ADDR) to remove some addresses a listening + * socket is associated with so that no new association accepted will be + * associated with those addresses. If the endpoint supports dynamic + * address a SCTP_BINDX_REM_ADDR or SCTP_BINDX_ADD_ADDR may cause a + * endpoint to send the appropriate message to the peer to change the + * peers address lists. + * + * Adding and removing addresses from a connected association is + * optional functionality. Implementations that do not support this + * functionality should return EOPNOTSUPP. + * + * Basically do nothing but copying the addresses from user to kernel + * land and invoking either sctp_bindx_add() or sctp_bindx_rem() on the sk. + * This is used for tunneling the sctp_bindx() request through sctp_setsockopt() + * from userspace. + * + * We don't use copy_from_user() for optimization: we first do the + * sanity checks (buffer size -fast- and access check-healthy + * pointer); if all of those succeed, then we can alloc the memory + * (expensive operation) needed to copy the data to kernel. Then we do + * the copying without checking the user space area + * (__copy_from_user()). + * + * On exit there is no need to do sockfd_put(), sys_setsockopt() does + * it. + * + * sk The sk of the socket + * addrs The pointer to the addresses in user land + * addrssize Size of the addrs buffer + * op Operation to perform (add or remove, see the flags of + * sctp_bindx) + * + * Returns 0 if ok, <0 errno code on error. + */ +SCTP_STATIC int sctp_setsockopt_bindx(struct sock* sk, + struct sockaddr __user *addrs, + int addrs_size, int op) +{ + struct sockaddr *kaddrs; + int err; + int addrcnt = 0; + int walk_size = 0; + struct sockaddr *sa_addr; + void *addr_buf; + struct sctp_af *af; + + SCTP_DEBUG_PRINTK("sctp_setsocktopt_bindx: sk %p addrs %p" + " addrs_size %d opt %d\n", sk, addrs, addrs_size, op); + + if (unlikely(addrs_size <= 0)) + return -EINVAL; + + /* Check the user passed a healthy pointer. */ + if (unlikely(!access_ok(VERIFY_READ, addrs, addrs_size))) + return -EFAULT; + + /* Alloc space for the address array in kernel memory. */ + kaddrs = kmalloc(addrs_size, GFP_KERNEL); + if (unlikely(!kaddrs)) + return -ENOMEM; + + if (__copy_from_user(kaddrs, addrs, addrs_size)) { + kfree(kaddrs); + return -EFAULT; + } + + /* Walk through the addrs buffer and count the number of addresses. */ + addr_buf = kaddrs; + while (walk_size < addrs_size) { + if (walk_size + sizeof(sa_family_t) > addrs_size) { + kfree(kaddrs); + return -EINVAL; + } + + sa_addr = (struct sockaddr *)addr_buf; + af = sctp_get_af_specific(sa_addr->sa_family); + + /* If the address family is not supported or if this address + * causes the address buffer to overflow return EINVAL. + */ + if (!af || (walk_size + af->sockaddr_len) > addrs_size) { + kfree(kaddrs); + return -EINVAL; + } + addrcnt++; + addr_buf += af->sockaddr_len; + walk_size += af->sockaddr_len; + } + + /* Do the work. 
*/ + switch (op) { + case SCTP_BINDX_ADD_ADDR: + err = sctp_bindx_add(sk, kaddrs, addrcnt); + if (err) + goto out; + err = sctp_send_asconf_add_ip(sk, kaddrs, addrcnt); + break; + + case SCTP_BINDX_REM_ADDR: + err = sctp_bindx_rem(sk, kaddrs, addrcnt); + if (err) + goto out; + err = sctp_send_asconf_del_ip(sk, kaddrs, addrcnt); + break; + + default: + err = -EINVAL; + break; + } + +out: + kfree(kaddrs); + + return err; +} + +/* __sctp_connect(struct sock* sk, struct sockaddr *kaddrs, int addrs_size) + * + * Common routine for handling connect() and sctp_connectx(). + * Connect will come in with just a single address. + */ +static int __sctp_connect(struct sock* sk, + struct sockaddr *kaddrs, + int addrs_size, + sctp_assoc_t *assoc_id) +{ + struct sctp_sock *sp; + struct sctp_endpoint *ep; + struct sctp_association *asoc = NULL; + struct sctp_association *asoc2; + struct sctp_transport *transport; + union sctp_addr to; + struct sctp_af *af; + sctp_scope_t scope; + long timeo; + int err = 0; + int addrcnt = 0; + int walk_size = 0; + union sctp_addr *sa_addr = NULL; + void *addr_buf; + unsigned short port; + unsigned int f_flags = 0; + + sp = sctp_sk(sk); + ep = sp->ep; + + /* connect() cannot be done on a socket that is already in ESTABLISHED + * state - UDP-style peeled off socket or a TCP-style socket that + * is already connected. + * It cannot be done even on a TCP-style listening socket. + */ + if (sctp_sstate(sk, ESTABLISHED) || + (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))) { + err = -EISCONN; + goto out_free; + } + + /* Walk through the addrs buffer and count the number of addresses. */ + addr_buf = kaddrs; + while (walk_size < addrs_size) { + if (walk_size + sizeof(sa_family_t) > addrs_size) { + err = -EINVAL; + goto out_free; + } + + sa_addr = (union sctp_addr *)addr_buf; + af = sctp_get_af_specific(sa_addr->sa.sa_family); + + /* If the address family is not supported or if this address + * causes the address buffer to overflow return EINVAL. + */ + if (!af || (walk_size + af->sockaddr_len) > addrs_size) { + err = -EINVAL; + goto out_free; + } + + port = ntohs(sa_addr->v4.sin_port); + + /* Save current address so we can work with it */ + memcpy(&to, sa_addr, af->sockaddr_len); + + err = sctp_verify_addr(sk, &to, af->sockaddr_len); + if (err) + goto out_free; + + /* Make sure the destination port is correctly set + * in all addresses. + */ + if (asoc && asoc->peer.port && asoc->peer.port != port) + goto out_free; + + + /* Check if there already is a matching association on the + * endpoint (other than the one created here). + */ + asoc2 = sctp_endpoint_lookup_assoc(ep, &to, &transport); + if (asoc2 && asoc2 != asoc) { + if (asoc2->state >= SCTP_STATE_ESTABLISHED) + err = -EISCONN; + else + err = -EALREADY; + goto out_free; + } + + /* If we could not find a matching association on the endpoint, + * make sure that there is no peeled-off association matching + * the peer address even on another socket. + */ + if (sctp_endpoint_is_peeled_off(ep, &to)) { + err = -EADDRNOTAVAIL; + goto out_free; + } + + if (!asoc) { + /* If a bind() or sctp_bindx() is not called prior to + * an sctp_connectx() call, the system picks an + * ephemeral port and will choose an address set + * equivalent to binding with a wildcard address. 
+ */ + if (!ep->base.bind_addr.port) { + if (sctp_autobind(sk)) { + err = -EAGAIN; + goto out_free; + } + } else { + /* + * If an unprivileged user inherits a 1-many + * style socket with open associations on a + * privileged port, it MAY be permitted to + * accept new associations, but it SHOULD NOT + * be permitted to open new associations. + */ + if (ep->base.bind_addr.port < PROT_SOCK && + !capable(CAP_NET_BIND_SERVICE)) { + err = -EACCES; + goto out_free; + } + } + + scope = sctp_scope(&to); + asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL); + if (!asoc) { + err = -ENOMEM; + goto out_free; + } + + err = sctp_assoc_set_bind_addr_from_ep(asoc, scope, + GFP_KERNEL); + if (err < 0) { + goto out_free; + } + + } + + /* Prime the peer's transport structures. */ + transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL, + SCTP_UNKNOWN); + if (!transport) { + err = -ENOMEM; + goto out_free; + } + + addrcnt++; + addr_buf += af->sockaddr_len; + walk_size += af->sockaddr_len; + } + + /* In case the user of sctp_connectx() wants an association + * id back, assign one now. + */ + if (assoc_id) { + err = sctp_assoc_set_id(asoc, GFP_KERNEL); + if (err < 0) + goto out_free; + } + + err = sctp_primitive_ASSOCIATE(asoc, NULL); + if (err < 0) { + goto out_free; + } + + /* Initialize sk's dport and daddr for getpeername() */ + inet_sk(sk)->inet_dport = htons(asoc->peer.port); + af = sctp_get_af_specific(sa_addr->sa.sa_family); + af->to_sk_daddr(sa_addr, sk); + sk->sk_err = 0; + + /* in-kernel sockets don't generally have a file allocated to them + * if all they do is call sock_create_kern(). + */ + if (sk->sk_socket->file) + f_flags = sk->sk_socket->file->f_flags; + + timeo = sock_sndtimeo(sk, f_flags & O_NONBLOCK); + + err = sctp_wait_for_connect(asoc, &timeo); + if ((err == 0 || err == -EINPROGRESS) && assoc_id) + *assoc_id = asoc->assoc_id; + + /* Don't free association on exit. */ + asoc = NULL; + +out_free: + + SCTP_DEBUG_PRINTK("About to exit __sctp_connect() free asoc: %p" + " kaddrs: %p err: %d\n", + asoc, kaddrs, err); + if (asoc) + sctp_association_free(asoc); + return err; +} + +/* Helper for tunneling sctp_connectx() requests through sctp_setsockopt() + * + * API 8.9 + * int sctp_connectx(int sd, struct sockaddr *addrs, int addrcnt, + * sctp_assoc_t *asoc); + * + * If sd is an IPv4 socket, the addresses passed must be IPv4 addresses. + * If the sd is an IPv6 socket, the addresses passed can either be IPv4 + * or IPv6 addresses. + * + * A single address may be specified as INADDR_ANY or IN6ADDR_ANY, see + * Section 3.1.2 for this usage. + * + * addrs is a pointer to an array of one or more socket addresses. Each + * address is contained in its appropriate structure (i.e. struct + * sockaddr_in or struct sockaddr_in6) the family of the address type + * must be used to distengish the address length (note that this + * representation is termed a "packed array" of addresses). The caller + * specifies the number of addresses in the array with addrcnt. + * + * On success, sctp_connectx() returns 0. It also sets the assoc_id to + * the association id of the new association. On failure, sctp_connectx() + * returns -1, and sets errno to the appropriate error code. The assoc_id + * is not touched by the kernel. + * + * For SCTP, the port given in each socket address must be the same, or + * sctp_connectx() will fail, setting errno to EINVAL. + * + * An application can use sctp_connectx to initiate an association with + * an endpoint that is multi-homed. 
Much like sctp_bindx() this call + * allows a caller to specify multiple addresses at which a peer can be + * reached. The way the SCTP stack uses the list of addresses to set up + * the association is implementation dependent. This function only + * specifies that the stack will try to make use of all the addresses in + * the list when needed. + * + * Note that the list of addresses passed in is only used for setting up + * the association. It does not necessarily equal the set of addresses + * the peer uses for the resulting association. If the caller wants to + * find out the set of peer addresses, it must use sctp_getpaddrs() to + * retrieve them after the association has been set up. + * + * Basically do nothing but copying the addresses from user to kernel + * land and invoking either sctp_connectx(). This is used for tunneling + * the sctp_connectx() request through sctp_setsockopt() from userspace. + * + * We don't use copy_from_user() for optimization: we first do the + * sanity checks (buffer size -fast- and access check-healthy + * pointer); if all of those succeed, then we can alloc the memory + * (expensive operation) needed to copy the data to kernel. Then we do + * the copying without checking the user space area + * (__copy_from_user()). + * + * On exit there is no need to do sockfd_put(), sys_setsockopt() does + * it. + * + * sk The sk of the socket + * addrs The pointer to the addresses in user land + * addrssize Size of the addrs buffer + * + * Returns >=0 if ok, <0 errno code on error. + */ +SCTP_STATIC int __sctp_setsockopt_connectx(struct sock* sk, + struct sockaddr __user *addrs, + int addrs_size, + sctp_assoc_t *assoc_id) +{ + int err = 0; + struct sockaddr *kaddrs; + + SCTP_DEBUG_PRINTK("%s - sk %p addrs %p addrs_size %d\n", + __func__, sk, addrs, addrs_size); + + if (unlikely(addrs_size <= 0)) + return -EINVAL; + + /* Check the user passed a healthy pointer. */ + if (unlikely(!access_ok(VERIFY_READ, addrs, addrs_size))) + return -EFAULT; + + /* Alloc space for the address array in kernel memory. */ + kaddrs = kmalloc(addrs_size, GFP_KERNEL); + if (unlikely(!kaddrs)) + return -ENOMEM; + + if (__copy_from_user(kaddrs, addrs, addrs_size)) { + err = -EFAULT; + } else { + err = __sctp_connect(sk, kaddrs, addrs_size, assoc_id); + } + + kfree(kaddrs); + + return err; +} + +/* + * This is an older interface. It's kept for backward compatibility + * to the option that doesn't provide association id. + */ +SCTP_STATIC int sctp_setsockopt_connectx_old(struct sock* sk, + struct sockaddr __user *addrs, + int addrs_size) +{ + return __sctp_setsockopt_connectx(sk, addrs, addrs_size, NULL); +} + +/* + * New interface for the API. The since the API is done with a socket + * option, to make it simple we feed back the association id is as a return + * indication to the call. Error is always negative and association id is + * always positive. + */ +SCTP_STATIC int sctp_setsockopt_connectx(struct sock* sk, + struct sockaddr __user *addrs, + int addrs_size) +{ + sctp_assoc_t assoc_id = 0; + int err = 0; + + err = __sctp_setsockopt_connectx(sk, addrs, addrs_size, &assoc_id); + + if (err) + return err; + else + return assoc_id; +} + +/* + * New (hopefully final) interface for the API. + * We use the sctp_getaddrs_old structure so that use-space library + * can avoid any unnecessary allocations. The only defferent part + * is that we store the actual length of the address buffer into the + * addrs_num structure member. That way we can re-use the existing + * code. 
+ */ +SCTP_STATIC int sctp_getsockopt_connectx3(struct sock* sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_getaddrs_old param; + sctp_assoc_t assoc_id = 0; + int err = 0; + + if (len < sizeof(param)) + return -EINVAL; + + if (copy_from_user(¶m, optval, sizeof(param))) + return -EFAULT; + + err = __sctp_setsockopt_connectx(sk, + (struct sockaddr __user *)param.addrs, + param.addr_num, &assoc_id); + + if (err == 0 || err == -EINPROGRESS) { + if (copy_to_user(optval, &assoc_id, sizeof(assoc_id))) + return -EFAULT; + if (put_user(sizeof(assoc_id), optlen)) + return -EFAULT; + } + + return err; +} + +/* API 3.1.4 close() - UDP Style Syntax + * Applications use close() to perform graceful shutdown (as described in + * Section 10.1 of [SCTP]) on ALL the associations currently represented + * by a UDP-style socket. + * + * The syntax is + * + * ret = close(int sd); + * + * sd - the socket descriptor of the associations to be closed. + * + * To gracefully shutdown a specific association represented by the + * UDP-style socket, an application should use the sendmsg() call, + * passing no user data, but including the appropriate flag in the + * ancillary data (see Section xxxx). + * + * If sd in the close() call is a branched-off socket representing only + * one association, the shutdown is performed on that association only. + * + * 4.1.6 close() - TCP Style Syntax + * + * Applications use close() to gracefully close down an association. + * + * The syntax is: + * + * int close(int sd); + * + * sd - the socket descriptor of the association to be closed. + * + * After an application calls close() on a socket descriptor, no further + * socket operations will succeed on that descriptor. + * + * API 7.1.4 SO_LINGER + * + * An application using the TCP-style socket can use this option to + * perform the SCTP ABORT primitive. The linger option structure is: + * + * struct linger { + * int l_onoff; // option on/off + * int l_linger; // linger time + * }; + * + * To enable the option, set l_onoff to 1. If the l_linger value is set + * to 0, calling close() is the same as the ABORT primitive. If the + * value is set to a negative value, the setsockopt() call will return + * an error. If the value is set to a positive value linger_time, the + * close() can be blocked for at most linger_time ms. If the graceful + * shutdown phase does not finish during this period, close() will + * return but the graceful shutdown phase continues in the system. + */ +SCTP_STATIC void sctp_close(struct sock *sk, long timeout) +{ + struct sctp_endpoint *ep; + struct sctp_association *asoc; + struct list_head *pos, *temp; + unsigned int data_was_unread; + + SCTP_DEBUG_PRINTK("sctp_close(sk: 0x%p, timeout:%ld)\n", sk, timeout); + + sctp_lock_sock(sk); + sk->sk_shutdown = SHUTDOWN_MASK; + sk->sk_state = SCTP_SS_CLOSING; + + ep = sctp_sk(sk)->ep; + + /* Clean up any skbs sitting on the receive queue. */ + data_was_unread = sctp_queue_purge_ulpevents(&sk->sk_receive_queue); + data_was_unread += sctp_queue_purge_ulpevents(&sctp_sk(sk)->pd_lobby); + + /* Walk all associations on an endpoint. */ + list_for_each_safe(pos, temp, &ep->asocs) { + asoc = list_entry(pos, struct sctp_association, asocs); + + if (sctp_style(sk, TCP)) { + /* A closed association can still be in the list if + * it belongs to a TCP-style listening socket that is + * not yet accepted. If so, free it. If not, send an + * ABORT or SHUTDOWN based on the linger options. 
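The SO_LINGER behaviour described in the close() comment above can be exercised from userspace as in the following sketch for a TCP-style (SOCK_STREAM) SCTP socket: l_onoff = 1 with l_linger = 0 turns close() into the ABORT primitive, while a positive l_linger bounds how long close() may block waiting for the graceful shutdown.

#include <sys/socket.h>
#include <unistd.h>

/* Abort the association instead of shutting it down gracefully. */
int abort_on_close(int sd)
{
	struct linger lin = {
		.l_onoff  = 1,
		.l_linger = 0,	/* 0: close() behaves like the ABORT primitive */
	};

	if (setsockopt(sd, SOL_SOCKET, SO_LINGER, &lin, sizeof(lin)) < 0)
		return -1;

	return close(sd);
}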
+ */ + if (sctp_state(asoc, CLOSED)) { + sctp_unhash_established(asoc); + sctp_association_free(asoc); + continue; + } + } + + if (data_was_unread || !skb_queue_empty(&asoc->ulpq.lobby) || + !skb_queue_empty(&asoc->ulpq.reasm) || + (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime)) { + struct sctp_chunk *chunk; + + chunk = sctp_make_abort_user(asoc, NULL, 0); + if (chunk) + sctp_primitive_ABORT(asoc, chunk); + } else + sctp_primitive_SHUTDOWN(asoc, NULL); + } + + /* On a TCP-style socket, block for at most linger_time if set. */ + if (sctp_style(sk, TCP) && timeout) + sctp_wait_for_close(sk, timeout); + + /* This will run the backlog queue. */ + sctp_release_sock(sk); + + /* Supposedly, no process has access to the socket, but + * the net layers still may. + */ + sctp_local_bh_disable(); + sctp_bh_lock_sock(sk); + + /* Hold the sock, since sk_common_release() will put sock_put() + * and we have just a little more cleanup. + */ + sock_hold(sk); + sk_common_release(sk); + + sctp_bh_unlock_sock(sk); + sctp_local_bh_enable(); + + sock_put(sk); + + SCTP_DBG_OBJCNT_DEC(sock); +} + +/* Handle EPIPE error. */ +static int sctp_error(struct sock *sk, int flags, int err) +{ + if (err == -EPIPE) + err = sock_error(sk) ? : -EPIPE; + if (err == -EPIPE && !(flags & MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); + return err; +} + +/* API 3.1.3 sendmsg() - UDP Style Syntax + * + * An application uses sendmsg() and recvmsg() calls to transmit data to + * and receive data from its peer. + * + * ssize_t sendmsg(int socket, const struct msghdr *message, + * int flags); + * + * socket - the socket descriptor of the endpoint. + * message - pointer to the msghdr structure which contains a single + * user message and possibly some ancillary data. + * + * See Section 5 for complete description of the data + * structures. + * + * flags - flags sent or received with the user message, see Section + * 5 for complete description of the flags. + * + * Note: This function could use a rewrite especially when explicit + * connect support comes in. + */ +/* BUG: We do not implement the equivalent of sk_stream_wait_memory(). */ + +SCTP_STATIC int sctp_msghdr_parse(const struct msghdr *, sctp_cmsgs_t *); + +SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t msg_len) +{ + struct sctp_sock *sp; + struct sctp_endpoint *ep; + struct sctp_association *new_asoc=NULL, *asoc=NULL; + struct sctp_transport *transport, *chunk_tp; + struct sctp_chunk *chunk; + union sctp_addr to; + struct sockaddr *msg_name = NULL; + struct sctp_sndrcvinfo default_sinfo; + struct sctp_sndrcvinfo *sinfo; + struct sctp_initmsg *sinit; + sctp_assoc_t associd = 0; + sctp_cmsgs_t cmsgs = { NULL }; + int err; + sctp_scope_t scope; + long timeo; + __u16 sinfo_flags = 0; + struct sctp_datamsg *datamsg; + int msg_flags = msg->msg_flags; + + SCTP_DEBUG_PRINTK("sctp_sendmsg(sk: %p, msg: %p, msg_len: %zu)\n", + sk, msg, msg_len); + + err = 0; + sp = sctp_sk(sk); + ep = sp->ep; + + SCTP_DEBUG_PRINTK("Using endpoint: %p.\n", ep); + + /* We cannot send a message over a TCP-style listening socket. */ + if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) { + err = -EPIPE; + goto out_nounlock; + } + + /* Parse out the SCTP CMSGs. */ + err = sctp_msghdr_parse(msg, &cmsgs); + + if (err) { + SCTP_DEBUG_PRINTK("msghdr parse err = %x\n", err); + goto out_nounlock; + } + + /* Fetch the destination address for this packet. This + * address only selects the association--it is not necessarily + * the address we will send to. 
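sctp_sendmsg() parses SCTP-specific ancillary data through sctp_msghdr_parse(); from userspace that ancillary data is built as in the hedged sketch below, which selects the outgoing stream via a struct sctp_sndrcvinfo cmsg (assuming <netinet/sctp.h>). The lksctp-tools sctp_sendmsg() helper assembles an equivalent control message internally.

#include <netinet/in.h>
#include <netinet/sctp.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Send one message on a given stream of a one-to-many socket. */
ssize_t send_on_stream(int sd, const struct sockaddr_in *to,
		       const void *data, size_t len, unsigned short stream)
{
	char cbuf[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
	struct sctp_sndrcvinfo *sinfo;
	struct cmsghdr *cmsg;
	struct iovec iov = { (void *)data, len };
	struct msghdr msg;

	memset(&msg, 0, sizeof(msg));
	memset(cbuf, 0, sizeof(cbuf));
	msg.msg_name = (void *)to;		/* selects (or creates) the association */
	msg.msg_namelen = sizeof(*to);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = IPPROTO_SCTP;
	cmsg->cmsg_type = SCTP_SNDRCV;
	cmsg->cmsg_len = CMSG_LEN(sizeof(*sinfo));
	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
	sinfo->sinfo_stream = stream;		/* must be below sinit_num_ostreams */

	return sendmsg(sd, &msg, 0);
}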
+ * For a peeled-off socket, msg_name is ignored. + */ + if (!sctp_style(sk, UDP_HIGH_BANDWIDTH) && msg->msg_name) { + int msg_namelen = msg->msg_namelen; + + err = sctp_verify_addr(sk, (union sctp_addr *)msg->msg_name, + msg_namelen); + if (err) + return err; + + if (msg_namelen > sizeof(to)) + msg_namelen = sizeof(to); + memcpy(&to, msg->msg_name, msg_namelen); + msg_name = msg->msg_name; + } + + sinfo = cmsgs.info; + sinit = cmsgs.init; + + /* Did the user specify SNDRCVINFO? */ + if (sinfo) { + sinfo_flags = sinfo->sinfo_flags; + associd = sinfo->sinfo_assoc_id; + } + + SCTP_DEBUG_PRINTK("msg_len: %zu, sinfo_flags: 0x%x\n", + msg_len, sinfo_flags); + + /* SCTP_EOF or SCTP_ABORT cannot be set on a TCP-style socket. */ + if (sctp_style(sk, TCP) && (sinfo_flags & (SCTP_EOF | SCTP_ABORT))) { + err = -EINVAL; + goto out_nounlock; + } + + /* If SCTP_EOF is set, no data can be sent. Disallow sending zero + * length messages when SCTP_EOF|SCTP_ABORT is not set. + * If SCTP_ABORT is set, the message length could be non zero with + * the msg_iov set to the user abort reason. + */ + if (((sinfo_flags & SCTP_EOF) && (msg_len > 0)) || + (!(sinfo_flags & (SCTP_EOF|SCTP_ABORT)) && (msg_len == 0))) { + err = -EINVAL; + goto out_nounlock; + } + + /* If SCTP_ADDR_OVER is set, there must be an address + * specified in msg_name. + */ + if ((sinfo_flags & SCTP_ADDR_OVER) && (!msg->msg_name)) { + err = -EINVAL; + goto out_nounlock; + } + + transport = NULL; + + SCTP_DEBUG_PRINTK("About to look up association.\n"); + + sctp_lock_sock(sk); + + /* If a msg_name has been specified, assume this is to be used. */ + if (msg_name) { + /* Look for a matching association on the endpoint. */ + asoc = sctp_endpoint_lookup_assoc(ep, &to, &transport); + if (!asoc) { + /* If we could not find a matching association on the + * endpoint, make sure that it is not a TCP-style + * socket that already has an association or there is + * no peeled-off association on another socket. + */ + if ((sctp_style(sk, TCP) && + sctp_sstate(sk, ESTABLISHED)) || + sctp_endpoint_is_peeled_off(ep, &to)) { + err = -EADDRNOTAVAIL; + goto out_unlock; + } + } + } else { + asoc = sctp_id2assoc(sk, associd); + if (!asoc) { + err = -EPIPE; + goto out_unlock; + } + } + + if (asoc) { + SCTP_DEBUG_PRINTK("Just looked up association: %p.\n", asoc); + + /* We cannot send a message on a TCP-style SCTP_SS_ESTABLISHED + * socket that has an association in CLOSED state. This can + * happen when an accepted socket has an association that is + * already CLOSED. + */ + if (sctp_state(asoc, CLOSED) && sctp_style(sk, TCP)) { + err = -EPIPE; + goto out_unlock; + } + + if (sinfo_flags & SCTP_EOF) { + SCTP_DEBUG_PRINTK("Shutting down association: %p\n", + asoc); + sctp_primitive_SHUTDOWN(asoc, NULL); + err = 0; + goto out_unlock; + } + if (sinfo_flags & SCTP_ABORT) { + + chunk = sctp_make_abort_user(asoc, msg, msg_len); + if (!chunk) { + err = -ENOMEM; + goto out_unlock; + } + + SCTP_DEBUG_PRINTK("Aborting association: %p\n", asoc); + sctp_primitive_ABORT(asoc, chunk); + err = 0; + goto out_unlock; + } + } + + /* Do we need to create the association? */ + if (!asoc) { + SCTP_DEBUG_PRINTK("There is no association yet.\n"); + + if (sinfo_flags & (SCTP_EOF | SCTP_ABORT)) { + err = -EINVAL; + goto out_unlock; + } + + /* Check for invalid stream against the stream counts, + * either the default or the user specified stream counts. + */ + if (sinfo) { + if (!sinit || (sinit && !sinit->sinit_num_ostreams)) { + /* Check against the defaults. 
*/ + if (sinfo->sinfo_stream >= + sp->initmsg.sinit_num_ostreams) { + err = -EINVAL; + goto out_unlock; + } + } else { + /* Check against the requested. */ + if (sinfo->sinfo_stream >= + sinit->sinit_num_ostreams) { + err = -EINVAL; + goto out_unlock; + } + } + } + + /* + * API 3.1.2 bind() - UDP Style Syntax + * If a bind() or sctp_bindx() is not called prior to a + * sendmsg() call that initiates a new association, the + * system picks an ephemeral port and will choose an address + * set equivalent to binding with a wildcard address. + */ + if (!ep->base.bind_addr.port) { + if (sctp_autobind(sk)) { + err = -EAGAIN; + goto out_unlock; + } + } else { + /* + * If an unprivileged user inherits a one-to-many + * style socket with open associations on a privileged + * port, it MAY be permitted to accept new associations, + * but it SHOULD NOT be permitted to open new + * associations. + */ + if (ep->base.bind_addr.port < PROT_SOCK && + !capable(CAP_NET_BIND_SERVICE)) { + err = -EACCES; + goto out_unlock; + } + } + + scope = sctp_scope(&to); + new_asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL); + if (!new_asoc) { + err = -ENOMEM; + goto out_unlock; + } + asoc = new_asoc; + err = sctp_assoc_set_bind_addr_from_ep(asoc, scope, GFP_KERNEL); + if (err < 0) { + err = -ENOMEM; + goto out_free; + } + + /* If the SCTP_INIT ancillary data is specified, set all + * the association init values accordingly. + */ + if (sinit) { + if (sinit->sinit_num_ostreams) { + asoc->c.sinit_num_ostreams = + sinit->sinit_num_ostreams; + } + if (sinit->sinit_max_instreams) { + asoc->c.sinit_max_instreams = + sinit->sinit_max_instreams; + } + if (sinit->sinit_max_attempts) { + asoc->max_init_attempts + = sinit->sinit_max_attempts; + } + if (sinit->sinit_max_init_timeo) { + asoc->max_init_timeo = + msecs_to_jiffies(sinit->sinit_max_init_timeo); + } + } + + /* Prime the peer's transport structures. */ + transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL, SCTP_UNKNOWN); + if (!transport) { + err = -ENOMEM; + goto out_free; + } + } + + /* ASSERT: we have a valid association at this point. */ + SCTP_DEBUG_PRINTK("We have a valid association.\n"); + + if (!sinfo) { + /* If the user didn't specify SNDRCVINFO, make up one with + * some defaults. + */ + memset(&default_sinfo, 0, sizeof(default_sinfo)); + default_sinfo.sinfo_stream = asoc->default_stream; + default_sinfo.sinfo_flags = asoc->default_flags; + default_sinfo.sinfo_ppid = asoc->default_ppid; + default_sinfo.sinfo_context = asoc->default_context; + default_sinfo.sinfo_timetolive = asoc->default_timetolive; + default_sinfo.sinfo_assoc_id = sctp_assoc2id(asoc); + sinfo = &default_sinfo; + } + + /* API 7.1.7, the sndbuf size per association bounds the + * maximum size of data that can be sent in a single send call. + */ + if (msg_len > sk->sk_sndbuf) { + err = -EMSGSIZE; + goto out_free; + } + + if (asoc->pmtu_pending) + sctp_assoc_pending_pmtu(asoc); + + /* If fragmentation is disabled and the message length exceeds the + * association fragmentation point, return EMSGSIZE. The I-D + * does not specify what this error is, but this looks like + * a great fit. + */ + if (sctp_sk(sk)->disable_fragments && (msg_len > asoc->frag_point)) { + err = -EMSGSIZE; + goto out_free; + } + + /* Check for invalid stream. 
*/ + if (sinfo->sinfo_stream >= asoc->c.sinit_num_ostreams) { + err = -EINVAL; + goto out_free; + } + + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + if (!sctp_wspace(asoc)) { + err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); + if (err) + goto out_free; + } + + /* If an address is passed with the sendto/sendmsg call, it is used + * to override the primary destination address in the TCP model, or + * when SCTP_ADDR_OVER flag is set in the UDP model. + */ + if ((sctp_style(sk, TCP) && msg_name) || + (sinfo_flags & SCTP_ADDR_OVER)) { + chunk_tp = sctp_assoc_lookup_paddr(asoc, &to); + if (!chunk_tp) { + err = -EINVAL; + goto out_free; + } + } else + chunk_tp = NULL; + + /* Auto-connect, if we aren't connected already. */ + if (sctp_state(asoc, CLOSED)) { + err = sctp_primitive_ASSOCIATE(asoc, NULL); + if (err < 0) + goto out_free; + SCTP_DEBUG_PRINTK("We associated primitively.\n"); + } + + /* Break the message into multiple chunks of maximum size. */ + datamsg = sctp_datamsg_from_user(asoc, sinfo, msg, msg_len); + if (!datamsg) { + err = -ENOMEM; + goto out_free; + } + + /* Now send the (possibly) fragmented message. */ + list_for_each_entry(chunk, &datamsg->chunks, frag_list) { + sctp_chunk_hold(chunk); + + /* Do accounting for the write space. */ + sctp_set_owner_w(chunk); + + chunk->transport = chunk_tp; + } + + /* Send it to the lower layers. Note: all chunks + * must either fail or succeed. The lower layer + * works that way today. Keep it that way or this + * breaks. + */ + err = sctp_primitive_SEND(asoc, datamsg); + /* Did the lower layer accept the chunk? */ + if (err) + sctp_datamsg_free(datamsg); + else + sctp_datamsg_put(datamsg); + + SCTP_DEBUG_PRINTK("We sent primitively.\n"); + + if (err) + goto out_free; + else + err = msg_len; + + /* If we are already past ASSOCIATE, the lower + * layers are responsible for association cleanup. + */ + goto out_unlock; + +out_free: + if (new_asoc) + sctp_association_free(asoc); +out_unlock: + sctp_release_sock(sk); + +out_nounlock: + return sctp_error(sk, msg_flags, err); + +#if 0 +do_sock_err: + if (msg_len) + err = msg_len; + else + err = sock_error(sk); + goto out; + +do_interrupted: + if (msg_len) + err = msg_len; + goto out; +#endif /* 0 */ +} + +/* This is an extended version of skb_pull() that removes the data from the + * start of a skb even when data is spread across the list of skb's in the + * frag_list. len specifies the total amount of data that needs to be removed. + * when 'len' bytes could be removed from the skb, it returns 0. + * If 'len' exceeds the total skb length, it returns the no. of bytes that + * could not be removed. + */ +static int sctp_skb_pull(struct sk_buff *skb, int len) +{ + struct sk_buff *list; + int skb_len = skb_headlen(skb); + int rlen; + + if (len <= skb_len) { + __skb_pull(skb, len); + return 0; + } + len -= skb_len; + __skb_pull(skb, skb_len); + + skb_walk_frags(skb, list) { + rlen = sctp_skb_pull(list, len); + skb->len -= (len-rlen); + skb->data_len -= (len-rlen); + + if (!rlen) + return 0; + + len = rlen; + } + + return len; +} + +/* API 3.1.3 recvmsg() - UDP Style Syntax + * + * ssize_t recvmsg(int socket, struct msghdr *message, + * int flags); + * + * socket - the socket descriptor of the endpoint. + * message - pointer to the msghdr structure which contains a single + * user message and possibly some ancillary data. + * + * See Section 5 for complete description of the data + * structures. 
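A userspace sketch of the recvmsg() semantics implemented here: notifications come back with MSG_NOTIFICATION set, and a cleared MSG_EOR means only part of a user message was delivered, so another recvmsg() call is needed for the remainder (assuming the lksctp-tools header for MSG_NOTIFICATION).

#include <netinet/sctp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Read one (possibly partial) message and report what arrived. */
ssize_t recv_one(int sd, void *buf, size_t len)
{
	struct iovec iov = { buf, len };
	struct msghdr msg;
	ssize_t n;

	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	n = recvmsg(sd, &msg, 0);
	if (n < 0)
		return n;

	if (msg.msg_flags & MSG_NOTIFICATION)
		printf("event notification, %zd bytes\n", n);	/* union sctp_notification */
	else if (!(msg.msg_flags & MSG_EOR))
		printf("partial user message, %zd bytes, more follows\n", n);
	else
		printf("complete user message, %zd bytes\n", n);

	return n;
}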
+ * + * flags - flags sent or received with the user message, see Section + * 5 for complete description of the flags. + */ +static struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int, int *); + +SCTP_STATIC int sctp_recvmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len, int noblock, + int flags, int *addr_len) +{ + struct sctp_ulpevent *event = NULL; + struct sctp_sock *sp = sctp_sk(sk); + struct sk_buff *skb; + int copied; + int err = 0; + int skb_len; + + SCTP_DEBUG_PRINTK("sctp_recvmsg(%s: %p, %s: %p, %s: %zd, %s: %d, %s: " + "0x%x, %s: %p)\n", "sk", sk, "msghdr", msg, + "len", len, "knoblauch", noblock, + "flags", flags, "addr_len", addr_len); + + sctp_lock_sock(sk); + + if (sctp_style(sk, TCP) && !sctp_sstate(sk, ESTABLISHED)) { + err = -ENOTCONN; + goto out; + } + + skb = sctp_skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; + + /* Get the total length of the skb including any skb's in the + * frag_list. + */ + skb_len = skb->len; + + copied = skb_len; + if (copied > len) + copied = len; + + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + + event = sctp_skb2event(skb); + + if (err) + goto out_free; + + sock_recv_ts_and_drops(msg, sk, skb); + if (sctp_ulpevent_is_notification(event)) { + msg->msg_flags |= MSG_NOTIFICATION; + sp->pf->event_msgname(event, msg->msg_name, addr_len); + } else { + sp->pf->skb_msgname(skb, msg->msg_name, addr_len); + } + + /* Check if we allow SCTP_SNDRCVINFO. */ + if (sp->subscribe.sctp_data_io_event) + sctp_ulpevent_read_sndrcvinfo(event, msg); +#if 0 + /* FIXME: we should be calling IP/IPv6 layers. */ + if (sk->sk_protinfo.af_inet.cmsg_flags) + ip_cmsg_recv(msg, skb); +#endif + + err = copied; + + /* If skb's length exceeds the user's buffer, update the skb and + * push it back to the receive_queue so that the next call to + * recvmsg() will return the remaining data. Don't set MSG_EOR. + */ + if (skb_len > copied) { + msg->msg_flags &= ~MSG_EOR; + if (flags & MSG_PEEK) + goto out_free; + sctp_skb_pull(skb, copied); + skb_queue_head(&sk->sk_receive_queue, skb); + + /* When only partial message is copied to the user, increase + * rwnd by that amount. If all the data in the skb is read, + * rwnd is updated when the event is freed. + */ + if (!sctp_ulpevent_is_notification(event)) + sctp_assoc_rwnd_increase(event->asoc, copied); + goto out; + } else if ((event->msg_flags & MSG_NOTIFICATION) || + (event->msg_flags & MSG_EOR)) + msg->msg_flags |= MSG_EOR; + else + msg->msg_flags &= ~MSG_EOR; + +out_free: + if (flags & MSG_PEEK) { + /* Release the skb reference acquired after peeking the skb in + * sctp_skb_recv_datagram(). + */ + kfree_skb(skb); + } else { + /* Free the event which includes releasing the reference to + * the owner of the skb, freeing the skb and updating the + * rwnd. + */ + sctp_ulpevent_free(event); + } +out: + sctp_release_sock(sk); + return err; +} + +/* 7.1.12 Enable/Disable message fragmentation (SCTP_DISABLE_FRAGMENTS) + * + * This option is a on/off flag. If enabled no SCTP message + * fragmentation will be performed. Instead if a message being sent + * exceeds the current PMTU size, the message will NOT be sent and + * instead a error will be indicated to the user. + */ +static int sctp_setsockopt_disable_fragments(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + int val; + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + sctp_sk(sk)->disable_fragments = (val == 0) ? 
0 : 1; + + return 0; +} + +static int sctp_setsockopt_events(struct sock *sk, char __user *optval, + unsigned int optlen) +{ + struct sctp_association *asoc; + struct sctp_ulpevent *event; + + if (optlen > sizeof(struct sctp_event_subscribe)) + return -EINVAL; + if (copy_from_user(&sctp_sk(sk)->subscribe, optval, optlen)) + return -EFAULT; + + /* + * At the time when a user app subscribes to SCTP_SENDER_DRY_EVENT, + * if there is no data to be sent or retransmit, the stack will + * immediately send up this notification. + */ + if (sctp_ulpevent_type_enabled(SCTP_SENDER_DRY_EVENT, + &sctp_sk(sk)->subscribe)) { + asoc = sctp_id2assoc(sk, 0); + + if (asoc && sctp_outq_is_empty(&asoc->outqueue)) { + event = sctp_ulpevent_make_sender_dry_event(asoc, + GFP_ATOMIC); + if (!event) + return -ENOMEM; + + sctp_ulpq_tail_event(&asoc->ulpq, event); + } + } + + return 0; +} + +/* 7.1.8 Automatic Close of associations (SCTP_AUTOCLOSE) + * + * This socket option is applicable to the UDP-style socket only. When + * set it will cause associations that are idle for more than the + * specified number of seconds to automatically close. An association + * being idle is defined an association that has NOT sent or received + * user data. The special value of '0' indicates that no automatic + * close of any associations should be performed. The option expects an + * integer defining the number of seconds of idle time before an + * association is closed. + */ +static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, + unsigned int optlen) +{ + struct sctp_sock *sp = sctp_sk(sk); + + /* Applicable to UDP-style socket only */ + if (sctp_style(sk, TCP)) + return -EOPNOTSUPP; + if (optlen != sizeof(int)) + return -EINVAL; + if (copy_from_user(&sp->autoclose, optval, optlen)) + return -EFAULT; + + return 0; +} + +/* 7.1.13 Peer Address Parameters (SCTP_PEER_ADDR_PARAMS) + * + * Applications can enable or disable heartbeats for any peer address of + * an association, modify an address's heartbeat interval, force a + * heartbeat to be sent immediately, and adjust the address's maximum + * number of retransmissions sent before an address is considered + * unreachable. The following structure is used to access and modify an + * address's parameters: + * + * struct sctp_paddrparams { + * sctp_assoc_t spp_assoc_id; + * struct sockaddr_storage spp_address; + * uint32_t spp_hbinterval; + * uint16_t spp_pathmaxrxt; + * uint32_t spp_pathmtu; + * uint32_t spp_sackdelay; + * uint32_t spp_flags; + * }; + * + * spp_assoc_id - (one-to-many style socket) This is filled in the + * application, and identifies the association for + * this query. + * spp_address - This specifies which address is of interest. + * spp_hbinterval - This contains the value of the heartbeat interval, + * in milliseconds. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. + * spp_pathmaxrxt - This contains the maximum number of + * retransmissions before this address shall be + * considered unreachable. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. + * spp_pathmtu - When Path MTU discovery is disabled the value + * specified here will be the "fixed" path mtu. + * Note that if the spp_address field is empty + * then all associations on this address will + * have this fixed path mtu set upon them. + * + * spp_sackdelay - When delayed sack is enabled, this value specifies + * the number of milliseconds that sacks will be delayed + * for. 
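The SCTP_EVENTS and SCTP_AUTOCLOSE handlers above are both driven by plain setsockopt() calls from userspace; a hedged sketch, assuming <netinet/sctp.h>, is shown here before the peer-address parameter handling continues below.

#include <netinet/in.h>
#include <netinet/sctp.h>
#include <string.h>
#include <sys/socket.h>

/* Subscribe to data-io and association-change events, and auto-close
 * idle associations after two minutes (UDP-style sockets only).
 */
int configure_events(int sd)
{
	struct sctp_event_subscribe ev;
	int autoclose = 120;	/* seconds; 0 disables automatic close */

	memset(&ev, 0, sizeof(ev));
	ev.sctp_data_io_event = 1;	/* deliver sctp_sndrcvinfo with recvmsg() */
	ev.sctp_association_event = 1;	/* SCTP_ASSOC_CHANGE notifications */

	if (setsockopt(sd, IPPROTO_SCTP, SCTP_EVENTS, &ev, sizeof(ev)) < 0)
		return -1;

	return setsockopt(sd, IPPROTO_SCTP, SCTP_AUTOCLOSE,
			  &autoclose, sizeof(autoclose));
}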
This value will apply to all addresses of an + * association if the spp_address field is empty. Note + * also, that if delayed sack is enabled and this + * value is set to 0, no change is made to the last + * recorded delayed sack timer value. + * + * spp_flags - These flags are used to control various features + * on an association. The flag field may contain + * zero or more of the following options. + * + * SPP_HB_ENABLE - Enable heartbeats on the + * specified address. Note that if the address + * field is empty all addresses for the association + * have heartbeats enabled upon them. + * + * SPP_HB_DISABLE - Disable heartbeats on the + * speicifed address. Note that if the address + * field is empty all addresses for the association + * will have their heartbeats disabled. Note also + * that SPP_HB_ENABLE and SPP_HB_DISABLE are + * mutually exclusive, only one of these two should + * be specified. Enabling both fields will have + * undetermined results. + * + * SPP_HB_DEMAND - Request a user initiated heartbeat + * to be made immediately. + * + * SPP_HB_TIME_IS_ZERO - Specify's that the time for + * heartbeat delayis to be set to the value of 0 + * milliseconds. + * + * SPP_PMTUD_ENABLE - This field will enable PMTU + * discovery upon the specified address. Note that + * if the address feild is empty then all addresses + * on the association are effected. + * + * SPP_PMTUD_DISABLE - This field will disable PMTU + * discovery upon the specified address. Note that + * if the address feild is empty then all addresses + * on the association are effected. Not also that + * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually + * exclusive. Enabling both will have undetermined + * results. + * + * SPP_SACKDELAY_ENABLE - Setting this flag turns + * on delayed sack. The time specified in spp_sackdelay + * is used to specify the sack delay for this address. Note + * that if spp_address is empty then all addresses will + * enable delayed sack and take on the sack delay + * value specified in spp_sackdelay. + * SPP_SACKDELAY_DISABLE - Setting this flag turns + * off delayed sack. If the spp_address field is blank then + * delayed sack is disabled for the entire association. Note + * also that this field is mutually exclusive to + * SPP_SACKDELAY_ENABLE, setting both will have undefined + * results. + */ +static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, + struct sctp_transport *trans, + struct sctp_association *asoc, + struct sctp_sock *sp, + int hb_change, + int pmtud_change, + int sackdelay_change) +{ + int error; + + if (params->spp_flags & SPP_HB_DEMAND && trans) { + error = sctp_primitive_REQUESTHEARTBEAT (trans->asoc, trans); + if (error) + return error; + } + + /* Note that unless the spp_flag is set to SPP_HB_ENABLE the value of + * this field is ignored. Note also that a value of zero indicates + * the current setting should be left unchanged. + */ + if (params->spp_flags & SPP_HB_ENABLE) { + + /* Re-zero the interval if the SPP_HB_TIME_IS_ZERO is + * set. This lets us use 0 value when this flag + * is set. 
+ */ + if (params->spp_flags & SPP_HB_TIME_IS_ZERO) + params->spp_hbinterval = 0; + + if (params->spp_hbinterval || + (params->spp_flags & SPP_HB_TIME_IS_ZERO)) { + if (trans) { + trans->hbinterval = + msecs_to_jiffies(params->spp_hbinterval); + } else if (asoc) { + asoc->hbinterval = + msecs_to_jiffies(params->spp_hbinterval); + } else { + sp->hbinterval = params->spp_hbinterval; + } + } + } + + if (hb_change) { + if (trans) { + trans->param_flags = + (trans->param_flags & ~SPP_HB) | hb_change; + } else if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_HB) | hb_change; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_HB) | hb_change; + } + } + + /* When Path MTU discovery is disabled the value specified here will + * be the "fixed" path mtu (i.e. the value of the spp_flags field must + * include the flag SPP_PMTUD_DISABLE for this field to have any + * effect). + */ + if ((params->spp_flags & SPP_PMTUD_DISABLE) && params->spp_pathmtu) { + if (trans) { + trans->pathmtu = params->spp_pathmtu; + sctp_assoc_sync_pmtu(asoc); + } else if (asoc) { + asoc->pathmtu = params->spp_pathmtu; + sctp_frag_point(asoc, params->spp_pathmtu); + } else { + sp->pathmtu = params->spp_pathmtu; + } + } + + if (pmtud_change) { + if (trans) { + int update = (trans->param_flags & SPP_PMTUD_DISABLE) && + (params->spp_flags & SPP_PMTUD_ENABLE); + trans->param_flags = + (trans->param_flags & ~SPP_PMTUD) | pmtud_change; + if (update) { + sctp_transport_pmtu(trans, sctp_opt2sk(sp)); + sctp_assoc_sync_pmtu(asoc); + } + } else if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_PMTUD) | pmtud_change; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_PMTUD) | pmtud_change; + } + } + + /* Note that unless the spp_flag is set to SPP_SACKDELAY_ENABLE the + * value of this field is ignored. Note also that a value of zero + * indicates the current setting should be left unchanged. + */ + if ((params->spp_flags & SPP_SACKDELAY_ENABLE) && params->spp_sackdelay) { + if (trans) { + trans->sackdelay = + msecs_to_jiffies(params->spp_sackdelay); + } else if (asoc) { + asoc->sackdelay = + msecs_to_jiffies(params->spp_sackdelay); + } else { + sp->sackdelay = params->spp_sackdelay; + } + } + + if (sackdelay_change) { + if (trans) { + trans->param_flags = + (trans->param_flags & ~SPP_SACKDELAY) | + sackdelay_change; + } else if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_SACKDELAY) | + sackdelay_change; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_SACKDELAY) | + sackdelay_change; + } + } + + /* Note that a value of zero indicates the current setting should be + left unchanged. + */ + if (params->spp_pathmaxrxt) { + if (trans) { + trans->pathmaxrxt = params->spp_pathmaxrxt; + } else if (asoc) { + asoc->pathmaxrxt = params->spp_pathmaxrxt; + } else { + sp->pathmaxrxt = params->spp_pathmaxrxt; + } + } + + return 0; +} + +static int sctp_setsockopt_peer_addr_params(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_paddrparams params; + struct sctp_transport *trans = NULL; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); + int error; + int hb_change, pmtud_change, sackdelay_change; + + if (optlen != sizeof(struct sctp_paddrparams)) + return - EINVAL; + + if (copy_from_user(¶ms, optval, optlen)) + return -EFAULT; + + /* Validate flags and value parameters. 
*/ + hb_change = params.spp_flags & SPP_HB; + pmtud_change = params.spp_flags & SPP_PMTUD; + sackdelay_change = params.spp_flags & SPP_SACKDELAY; + + if (hb_change == SPP_HB || + pmtud_change == SPP_PMTUD || + sackdelay_change == SPP_SACKDELAY || + params.spp_sackdelay > 500 || + (params.spp_pathmtu && + params.spp_pathmtu < SCTP_DEFAULT_MINSEGMENT)) + return -EINVAL; + + /* If an address other than INADDR_ANY is specified, and + * no transport is found, then the request is invalid. + */ + if (!sctp_is_any(sk, ( union sctp_addr *)¶ms.spp_address)) { + trans = sctp_addr_id2transport(sk, ¶ms.spp_address, + params.spp_assoc_id); + if (!trans) + return -EINVAL; + } + + /* Get association, if assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid. + */ + asoc = sctp_id2assoc(sk, params.spp_assoc_id); + if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + /* Heartbeat demand can only be sent on a transport or + * association, but not a socket. + */ + if (params.spp_flags & SPP_HB_DEMAND && !trans && !asoc) + return -EINVAL; + + /* Process parameters. */ + error = sctp_apply_peer_addr_params(¶ms, trans, asoc, sp, + hb_change, pmtud_change, + sackdelay_change); + + if (error) + return error; + + /* If changes are for association, also apply parameters to each + * transport. + */ + if (!trans && asoc) { + list_for_each_entry(trans, &asoc->peer.transport_addr_list, + transports) { + sctp_apply_peer_addr_params(¶ms, trans, asoc, sp, + hb_change, pmtud_change, + sackdelay_change); + } + } + + return 0; +} + +/* + * 7.1.23. Get or set delayed ack timer (SCTP_DELAYED_SACK) + * + * This option will effect the way delayed acks are performed. This + * option allows you to get or set the delayed ack time, in + * milliseconds. It also allows changing the delayed ack frequency. + * Changing the frequency to 1 disables the delayed sack algorithm. If + * the assoc_id is 0, then this sets or gets the endpoints default + * values. If the assoc_id field is non-zero, then the set or get + * effects the specified association for the one to many model (the + * assoc_id field is ignored by the one to one model). Note that if + * sack_delay or sack_freq are 0 when setting this option, then the + * current values will remain unchanged. + * + * struct sctp_sack_info { + * sctp_assoc_t sack_assoc_id; + * uint32_t sack_delay; + * uint32_t sack_freq; + * }; + * + * sack_assoc_id - This parameter, indicates which association the user + * is performing an action upon. Note that if this field's value is + * zero then the endpoints default value is changed (effecting future + * associations only). + * + * sack_delay - This parameter contains the number of milliseconds that + * the user is requesting the delayed ACK timer be set to. Note that + * this value is defined in the standard to be between 200 and 500 + * milliseconds. + * + * sack_freq - This parameter contains the number of packets that must + * be received before a sack is sent without waiting for the delay + * timer to expire. The default value for this is 2, setting this + * value to 1 will disable the delayed sack algorithm. 
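+ *
+ * As an illustrative userspace sketch only (not part of this patch; it
+ * assumes an existing SCTP socket descriptor sd and the usual
+ * <netinet/sctp.h> definitions), setting a 200ms delay and the default
+ * frequency of 2 on the endpoint (sack_assoc_id of 0) could look like:
+ *
+ *   struct sctp_sack_info si;
+ *   memset(&si, 0, sizeof(si));
+ *   si.sack_assoc_id = 0;
+ *   si.sack_delay = 200;
+ *   si.sack_freq = 2;
+ *   if (setsockopt(sd, IPPROTO_SCTP, SCTP_DELAYED_SACK,
+ *                  &si, sizeof(si)) < 0)
+ *       perror("setsockopt(SCTP_DELAYED_SACK)");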
+ */ + +static int sctp_setsockopt_delayed_ack(struct sock *sk, + char __user *optval, unsigned int optlen) +{ + struct sctp_sack_info params; + struct sctp_transport *trans = NULL; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); + + if (optlen == sizeof(struct sctp_sack_info)) { + if (copy_from_user(¶ms, optval, optlen)) + return -EFAULT; + + if (params.sack_delay == 0 && params.sack_freq == 0) + return 0; + } else if (optlen == sizeof(struct sctp_assoc_value)) { + pr_warn("Use of struct sctp_assoc_value in delayed_ack socket option deprecated\n"); + pr_warn("Use struct sctp_sack_info instead\n"); + if (copy_from_user(¶ms, optval, optlen)) + return -EFAULT; + + if (params.sack_delay == 0) + params.sack_freq = 1; + else + params.sack_freq = 0; + } else + return - EINVAL; + + /* Validate value parameter. */ + if (params.sack_delay > 500) + return -EINVAL; + + /* Get association, if sack_assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid. + */ + asoc = sctp_id2assoc(sk, params.sack_assoc_id); + if (!asoc && params.sack_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + if (params.sack_delay) { + if (asoc) { + asoc->sackdelay = + msecs_to_jiffies(params.sack_delay); + asoc->param_flags = + (asoc->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_ENABLE; + } else { + sp->sackdelay = params.sack_delay; + sp->param_flags = + (sp->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_ENABLE; + } + } + + if (params.sack_freq == 1) { + if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_DISABLE; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_DISABLE; + } + } else if (params.sack_freq > 1) { + if (asoc) { + asoc->sackfreq = params.sack_freq; + asoc->param_flags = + (asoc->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_ENABLE; + } else { + sp->sackfreq = params.sack_freq; + sp->param_flags = + (sp->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_ENABLE; + } + } + + /* If change is for association, also apply to each transport. */ + if (asoc) { + list_for_each_entry(trans, &asoc->peer.transport_addr_list, + transports) { + if (params.sack_delay) { + trans->sackdelay = + msecs_to_jiffies(params.sack_delay); + trans->param_flags = + (trans->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_ENABLE; + } + if (params.sack_freq == 1) { + trans->param_flags = + (trans->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_DISABLE; + } else if (params.sack_freq > 1) { + trans->sackfreq = params.sack_freq; + trans->param_flags = + (trans->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_ENABLE; + } + } + } + + return 0; +} + +/* 7.1.3 Initialization Parameters (SCTP_INITMSG) + * + * Applications can specify protocol parameters for the default association + * initialization. The option name argument to setsockopt() and getsockopt() + * is SCTP_INITMSG. + * + * Setting initialization parameters is effective only on an unconnected + * socket (for UDP-style sockets only future associations are effected + * by the change). With TCP-style sockets, this option is inherited by + * sockets derived from a listener socket. 
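+ *
+ * A rough userspace sketch (not part of this patch; it assumes an
+ * unconnected SCTP socket descriptor sd and the usual <netinet/sctp.h>
+ * definitions) requesting 10 outbound streams, at most 10 inbound
+ * streams and 4 INIT attempts for future associations might be:
+ *
+ *   struct sctp_initmsg init;
+ *   memset(&init, 0, sizeof(init));
+ *   init.sinit_num_ostreams = 10;
+ *   init.sinit_max_instreams = 10;
+ *   init.sinit_max_attempts = 4;
+ *   setsockopt(sd, IPPROTO_SCTP, SCTP_INITMSG, &init, sizeof(init));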
+ */ +static int sctp_setsockopt_initmsg(struct sock *sk, char __user *optval, unsigned int optlen) +{ + struct sctp_initmsg sinit; + struct sctp_sock *sp = sctp_sk(sk); + + if (optlen != sizeof(struct sctp_initmsg)) + return -EINVAL; + if (copy_from_user(&sinit, optval, optlen)) + return -EFAULT; + + if (sinit.sinit_num_ostreams) + sp->initmsg.sinit_num_ostreams = sinit.sinit_num_ostreams; + if (sinit.sinit_max_instreams) + sp->initmsg.sinit_max_instreams = sinit.sinit_max_instreams; + if (sinit.sinit_max_attempts) + sp->initmsg.sinit_max_attempts = sinit.sinit_max_attempts; + if (sinit.sinit_max_init_timeo) + sp->initmsg.sinit_max_init_timeo = sinit.sinit_max_init_timeo; + + return 0; +} + +/* + * 7.1.14 Set default send parameters (SCTP_DEFAULT_SEND_PARAM) + * + * Applications that wish to use the sendto() system call may wish to + * specify a default set of parameters that would normally be supplied + * through the inclusion of ancillary data. This socket option allows + * such an application to set the default sctp_sndrcvinfo structure. + * The application that wishes to use this socket option simply passes + * in to this call the sctp_sndrcvinfo structure defined in Section + * 5.2.2) The input parameters accepted by this call include + * sinfo_stream, sinfo_flags, sinfo_ppid, sinfo_context, + * sinfo_timetolive. The user must provide the sinfo_assoc_id field in + * to this call if the caller is using the UDP model. + */ +static int sctp_setsockopt_default_send_param(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_sndrcvinfo info; + struct sctp_association *asoc; + struct sctp_sock *sp = sctp_sk(sk); + + if (optlen != sizeof(struct sctp_sndrcvinfo)) + return -EINVAL; + if (copy_from_user(&info, optval, optlen)) + return -EFAULT; + + asoc = sctp_id2assoc(sk, info.sinfo_assoc_id); + if (!asoc && info.sinfo_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + if (asoc) { + asoc->default_stream = info.sinfo_stream; + asoc->default_flags = info.sinfo_flags; + asoc->default_ppid = info.sinfo_ppid; + asoc->default_context = info.sinfo_context; + asoc->default_timetolive = info.sinfo_timetolive; + } else { + sp->default_stream = info.sinfo_stream; + sp->default_flags = info.sinfo_flags; + sp->default_ppid = info.sinfo_ppid; + sp->default_context = info.sinfo_context; + sp->default_timetolive = info.sinfo_timetolive; + } + + return 0; +} + +/* 7.1.10 Set Primary Address (SCTP_PRIMARY_ADDR) + * + * Requests that the local SCTP stack use the enclosed peer address as + * the association primary. The enclosed address must be one of the + * association peer's addresses. + */ +static int sctp_setsockopt_primary_addr(struct sock *sk, char __user *optval, + unsigned int optlen) +{ + struct sctp_prim prim; + struct sctp_transport *trans; + + if (optlen != sizeof(struct sctp_prim)) + return -EINVAL; + + if (copy_from_user(&prim, optval, sizeof(struct sctp_prim))) + return -EFAULT; + + trans = sctp_addr_id2transport(sk, &prim.ssp_addr, prim.ssp_assoc_id); + if (!trans) + return -EINVAL; + + sctp_assoc_set_primary(trans->asoc, trans); + + return 0; +} + +/* + * 7.1.5 SCTP_NODELAY + * + * Turn on/off any Nagle-like algorithm. This means that packets are + * generally sent as soon as possible and no unnecessary delays are + * introduced, at the cost of more packets in the network. Expects an + * integer boolean flag. 
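+ *
+ * Illustrative userspace sketch only (assumes an SCTP socket descriptor
+ * sd and the usual <netinet/sctp.h> definitions); any non-zero integer
+ * enables the option and zero disables it:
+ *
+ *   int one = 1;
+ *   setsockopt(sd, IPPROTO_SCTP, SCTP_NODELAY, &one, sizeof(one));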
+ */ +static int sctp_setsockopt_nodelay(struct sock *sk, char __user *optval, + unsigned int optlen) +{ + int val; + + if (optlen < sizeof(int)) + return -EINVAL; + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + sctp_sk(sk)->nodelay = (val == 0) ? 0 : 1; + return 0; +} + +/* + * + * 7.1.1 SCTP_RTOINFO + * + * The protocol parameters used to initialize and bound retransmission + * timeout (RTO) are tunable. sctp_rtoinfo structure is used to access + * and modify these parameters. + * All parameters are time values, in milliseconds. A value of 0, when + * modifying the parameters, indicates that the current value should not + * be changed. + * + */ +static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user *optval, unsigned int optlen) +{ + struct sctp_rtoinfo rtoinfo; + struct sctp_association *asoc; + + if (optlen != sizeof (struct sctp_rtoinfo)) + return -EINVAL; + + if (copy_from_user(&rtoinfo, optval, optlen)) + return -EFAULT; + + asoc = sctp_id2assoc(sk, rtoinfo.srto_assoc_id); + + /* Set the values to the specific association */ + if (!asoc && rtoinfo.srto_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + if (asoc) { + if (rtoinfo.srto_initial != 0) + asoc->rto_initial = + msecs_to_jiffies(rtoinfo.srto_initial); + if (rtoinfo.srto_max != 0) + asoc->rto_max = msecs_to_jiffies(rtoinfo.srto_max); + if (rtoinfo.srto_min != 0) + asoc->rto_min = msecs_to_jiffies(rtoinfo.srto_min); + } else { + /* If there is no association or the association-id = 0 + * set the values to the endpoint. + */ + struct sctp_sock *sp = sctp_sk(sk); + + if (rtoinfo.srto_initial != 0) + sp->rtoinfo.srto_initial = rtoinfo.srto_initial; + if (rtoinfo.srto_max != 0) + sp->rtoinfo.srto_max = rtoinfo.srto_max; + if (rtoinfo.srto_min != 0) + sp->rtoinfo.srto_min = rtoinfo.srto_min; + } + + return 0; +} + +/* + * + * 7.1.2 SCTP_ASSOCINFO + * + * This option is used to tune the maximum retransmission attempts + * of the association. + * Returns an error if the new association retransmission value is + * greater than the sum of the retransmission value of the peer. + * See [SCTP] for more information. + * + */ +static int sctp_setsockopt_associnfo(struct sock *sk, char __user *optval, unsigned int optlen) +{ + + struct sctp_assocparams assocparams; + struct sctp_association *asoc; + + if (optlen != sizeof(struct sctp_assocparams)) + return -EINVAL; + if (copy_from_user(&assocparams, optval, optlen)) + return -EFAULT; + + asoc = sctp_id2assoc(sk, assocparams.sasoc_assoc_id); + + if (!asoc && assocparams.sasoc_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + /* Set the values to the specific association */ + if (asoc) { + if (assocparams.sasoc_asocmaxrxt != 0) { + __u32 path_sum = 0; + int paths = 0; + struct sctp_transport *peer_addr; + + list_for_each_entry(peer_addr, &asoc->peer.transport_addr_list, + transports) { + path_sum += peer_addr->pathmaxrxt; + paths++; + } + + /* Only validate asocmaxrxt if we have more than + * one path/transport. We do this because path + * retransmissions are only counted when we have more + * then one path. 
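+ * For example (purely illustrative), with three reachable transports
+ * whose pathmaxrxt is 5 each, path_sum is 15, and any
+ * sasoc_asocmaxrxt larger than 15 is rejected below with -EINVAL.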
+ */ + if (paths > 1 && + assocparams.sasoc_asocmaxrxt > path_sum) + return -EINVAL; + + asoc->max_retrans = assocparams.sasoc_asocmaxrxt; + } + + if (assocparams.sasoc_cookie_life != 0) { + asoc->cookie_life.tv_sec = + assocparams.sasoc_cookie_life / 1000; + asoc->cookie_life.tv_usec = + (assocparams.sasoc_cookie_life % 1000) + * 1000; + } + } else { + /* Set the values to the endpoint */ + struct sctp_sock *sp = sctp_sk(sk); + + if (assocparams.sasoc_asocmaxrxt != 0) + sp->assocparams.sasoc_asocmaxrxt = + assocparams.sasoc_asocmaxrxt; + if (assocparams.sasoc_cookie_life != 0) + sp->assocparams.sasoc_cookie_life = + assocparams.sasoc_cookie_life; + } + return 0; +} + +/* + * 7.1.16 Set/clear IPv4 mapped addresses (SCTP_I_WANT_MAPPED_V4_ADDR) + * + * This socket option is a boolean flag which turns on or off mapped V4 + * addresses. If this option is turned on and the socket is type + * PF_INET6, then IPv4 addresses will be mapped to V6 representation. + * If this option is turned off, then no mapping will be done of V4 + * addresses and a user will receive both PF_INET6 and PF_INET type + * addresses on the socket. + */ +static int sctp_setsockopt_mappedv4(struct sock *sk, char __user *optval, unsigned int optlen) +{ + int val; + struct sctp_sock *sp = sctp_sk(sk); + + if (optlen < sizeof(int)) + return -EINVAL; + if (get_user(val, (int __user *)optval)) + return -EFAULT; + if (val) + sp->v4mapped = 1; + else + sp->v4mapped = 0; + + return 0; +} + +/* + * 8.1.16. Get or Set the Maximum Fragmentation Size (SCTP_MAXSEG) + * This option will get or set the maximum size to put in any outgoing + * SCTP DATA chunk. If a message is larger than this size it will be + * fragmented by SCTP into the specified size. Note that the underlying + * SCTP implementation may fragment into smaller sized chunks when the + * PMTU of the underlying association is smaller than the value set by + * the user. The default value for this option is '0' which indicates + * the user is NOT limiting fragmentation and only the PMTU will effect + * SCTP's choice of DATA chunk size. Note also that values set larger + * than the maximum size of an IP datagram will effectively let SCTP + * control fragmentation (i.e. the same as setting this option to 0). + * + * The following structure is used to access and modify this parameter: + * + * struct sctp_assoc_value { + * sctp_assoc_t assoc_id; + * uint32_t assoc_value; + * }; + * + * assoc_id: This parameter is ignored for one-to-one style sockets. + * For one-to-many style sockets this parameter indicates which + * association the user is performing an action upon. Note that if + * this field's value is zero then the endpoints default value is + * changed (effecting future associations only). + * assoc_value: This parameter specifies the maximum size in bytes. 
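+ *
+ * An illustrative userspace sketch (not part of this patch; assumes an
+ * SCTP socket descriptor sd and the usual <netinet/sctp.h> definitions)
+ * limiting outgoing DATA chunks to 1200 bytes on the endpoint default:
+ *
+ *   struct sctp_assoc_value av;
+ *   memset(&av, 0, sizeof(av));
+ *   av.assoc_id = 0;
+ *   av.assoc_value = 1200;
+ *   setsockopt(sd, IPPROTO_SCTP, SCTP_MAXSEG, &av, sizeof(av));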
+ */ +static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned int optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + struct sctp_sock *sp = sctp_sk(sk); + int val; + + if (optlen == sizeof(int)) { + pr_warn("Use of int in maxseg socket option deprecated\n"); + pr_warn("Use struct sctp_assoc_value instead\n"); + if (copy_from_user(&val, optval, optlen)) + return -EFAULT; + params.assoc_id = 0; + } else if (optlen == sizeof(struct sctp_assoc_value)) { + if (copy_from_user(¶ms, optval, optlen)) + return -EFAULT; + val = params.assoc_value; + } else + return -EINVAL; + + if ((val != 0) && ((val < 8) || (val > SCTP_MAX_CHUNK_LEN))) + return -EINVAL; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + if (asoc) { + if (val == 0) { + val = asoc->pathmtu; + val -= sp->pf->af->net_header_len; + val -= sizeof(struct sctphdr) + + sizeof(struct sctp_data_chunk); + } + asoc->user_frag = val; + asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu); + } else { + sp->user_frag = val; + } + + return 0; +} + + +/* + * 7.1.9 Set Peer Primary Address (SCTP_SET_PEER_PRIMARY_ADDR) + * + * Requests that the peer mark the enclosed address as the association + * primary. The enclosed address must be one of the association's + * locally bound addresses. The following structure is used to make a + * set primary request: + */ +static int sctp_setsockopt_peer_primary_addr(struct sock *sk, char __user *optval, + unsigned int optlen) +{ + struct sctp_sock *sp; + struct sctp_association *asoc = NULL; + struct sctp_setpeerprim prim; + struct sctp_chunk *chunk; + struct sctp_af *af; + int err; + + sp = sctp_sk(sk); + + if (!sctp_addip_enable) + return -EPERM; + + if (optlen != sizeof(struct sctp_setpeerprim)) + return -EINVAL; + + if (copy_from_user(&prim, optval, optlen)) + return -EFAULT; + + asoc = sctp_id2assoc(sk, prim.sspp_assoc_id); + if (!asoc) + return -EINVAL; + + if (!asoc->peer.asconf_capable) + return -EPERM; + + if (asoc->peer.addip_disabled_mask & SCTP_PARAM_SET_PRIMARY) + return -EPERM; + + if (!sctp_state(asoc, ESTABLISHED)) + return -ENOTCONN; + + af = sctp_get_af_specific(prim.sspp_addr.ss_family); + if (!af) + return -EINVAL; + + if (!af->addr_valid((union sctp_addr *)&prim.sspp_addr, sp, NULL)) + return -EADDRNOTAVAIL; + + if (!sctp_assoc_lookup_laddr(asoc, (union sctp_addr *)&prim.sspp_addr)) + return -EADDRNOTAVAIL; + + /* Create an ASCONF chunk with SET_PRIMARY parameter */ + chunk = sctp_make_asconf_set_prim(asoc, + (union sctp_addr *)&prim.sspp_addr); + if (!chunk) + return -ENOMEM; + + err = sctp_send_asconf(asoc, chunk); + + SCTP_DEBUG_PRINTK("We set peer primary addr primitively.\n"); + + return err; +} + +static int sctp_setsockopt_adaptation_layer(struct sock *sk, char __user *optval, + unsigned int optlen) +{ + struct sctp_setadaptation adaptation; + + if (optlen != sizeof(struct sctp_setadaptation)) + return -EINVAL; + if (copy_from_user(&adaptation, optval, optlen)) + return -EFAULT; + + sctp_sk(sk)->adaptation_ind = adaptation.ssb_adaptation_ind; + + return 0; +} + +/* + * 7.1.29. Set or Get the default context (SCTP_CONTEXT) + * + * The context field in the sctp_sndrcvinfo structure is normally only + * used when a failed message is retrieved holding the value that was + * sent down on the actual send call. This option allows the setting of + * a default context on an association basis that will be received on + * reading messages from the peer. 
This is especially helpful in the + * one-2-many model for an application to keep some reference to an + * internal state machine that is processing messages on the + * association. Note that the setting of this value only effects + * received messages from the peer and does not effect the value that is + * saved with outbound messages. + */ +static int sctp_setsockopt_context(struct sock *sk, char __user *optval, + unsigned int optlen) +{ + struct sctp_assoc_value params; + struct sctp_sock *sp; + struct sctp_association *asoc; + + if (optlen != sizeof(struct sctp_assoc_value)) + return -EINVAL; + if (copy_from_user(¶ms, optval, optlen)) + return -EFAULT; + + sp = sctp_sk(sk); + + if (params.assoc_id != 0) { + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc) + return -EINVAL; + asoc->default_rcv_context = params.assoc_value; + } else { + sp->default_rcv_context = params.assoc_value; + } + + return 0; +} + +/* + * 7.1.24. Get or set fragmented interleave (SCTP_FRAGMENT_INTERLEAVE) + * + * This options will at a minimum specify if the implementation is doing + * fragmented interleave. Fragmented interleave, for a one to many + * socket, is when subsequent calls to receive a message may return + * parts of messages from different associations. Some implementations + * may allow you to turn this value on or off. If so, when turned off, + * no fragment interleave will occur (which will cause a head of line + * blocking amongst multiple associations sharing the same one to many + * socket). When this option is turned on, then each receive call may + * come from a different association (thus the user must receive data + * with the extended calls (e.g. sctp_recvmsg) to keep track of which + * association each receive belongs to. + * + * This option takes a boolean value. A non-zero value indicates that + * fragmented interleave is on. A value of zero indicates that + * fragmented interleave is off. + * + * Note that it is important that an implementation that allows this + * option to be turned on, have it off by default. Otherwise an unaware + * application using the one to many model may become confused and act + * incorrectly. + */ +static int sctp_setsockopt_fragment_interleave(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + int val; + + if (optlen != sizeof(int)) + return -EINVAL; + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + sctp_sk(sk)->frag_interleave = (val == 0) ? 0 : 1; + + return 0; +} + +/* + * 8.1.21. Set or Get the SCTP Partial Delivery Point + * (SCTP_PARTIAL_DELIVERY_POINT) + * + * This option will set or get the SCTP partial delivery point. This + * point is the size of a message where the partial delivery API will be + * invoked to help free up rwnd space for the peer. Setting this to a + * lower value will cause partial deliveries to happen more often. The + * calls argument is an integer that sets or gets the partial delivery + * point. Note also that the call will fail if the user attempts to set + * this value larger than the socket receive buffer size. + * + * Note that any single message having a length smaller than or equal to + * the SCTP partial delivery point will be delivered in one single read + * call as long as the user provided buffer is large enough to hold the + * message. 
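+ *
+ * Illustrative userspace sketch only (assumes an SCTP socket descriptor
+ * sd and the usual <netinet/sctp.h> definitions); note that the
+ * implementation below rejects values larger than half of the socket
+ * receive buffer:
+ *
+ *   uint32_t pdpoint = 8192;
+ *   setsockopt(sd, IPPROTO_SCTP, SCTP_PARTIAL_DELIVERY_POINT,
+ *              &pdpoint, sizeof(pdpoint));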
+ */ +static int sctp_setsockopt_partial_delivery_point(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + u32 val; + + if (optlen != sizeof(u32)) + return -EINVAL; + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + /* Note: We double the receive buffer from what the user sets + * it to be, also initial rwnd is based on rcvbuf/2. + */ + if (val > (sk->sk_rcvbuf >> 1)) + return -EINVAL; + + sctp_sk(sk)->pd_point = val; + + return 0; /* is this the right error code? */ +} + +/* + * 7.1.28. Set or Get the maximum burst (SCTP_MAX_BURST) + * + * This option will allow a user to change the maximum burst of packets + * that can be emitted by this association. Note that the default value + * is 4, and some implementations may restrict this setting so that it + * can only be lowered. + * + * NOTE: This text doesn't seem right. Do this on a socket basis with + * future associations inheriting the socket value. + */ +static int sctp_setsockopt_maxburst(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_assoc_value params; + struct sctp_sock *sp; + struct sctp_association *asoc; + int val; + int assoc_id = 0; + + if (optlen == sizeof(int)) { + pr_warn("Use of int in max_burst socket option deprecated\n"); + pr_warn("Use struct sctp_assoc_value instead\n"); + if (copy_from_user(&val, optval, optlen)) + return -EFAULT; + } else if (optlen == sizeof(struct sctp_assoc_value)) { + if (copy_from_user(¶ms, optval, optlen)) + return -EFAULT; + val = params.assoc_value; + assoc_id = params.assoc_id; + } else + return -EINVAL; + + sp = sctp_sk(sk); + + if (assoc_id != 0) { + asoc = sctp_id2assoc(sk, assoc_id); + if (!asoc) + return -EINVAL; + asoc->max_burst = val; + } else + sp->max_burst = val; + + return 0; +} + +/* + * 7.1.18. Add a chunk that must be authenticated (SCTP_AUTH_CHUNK) + * + * This set option adds a chunk type that the user is requesting to be + * received only in an authenticated way. Changes to the list of chunks + * will only effect future associations on the socket. + */ +static int sctp_setsockopt_auth_chunk(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_authchunk val; + + if (!sctp_auth_enable) + return -EACCES; + + if (optlen != sizeof(struct sctp_authchunk)) + return -EINVAL; + if (copy_from_user(&val, optval, optlen)) + return -EFAULT; + + switch (val.sauth_chunk) { + case SCTP_CID_INIT: + case SCTP_CID_INIT_ACK: + case SCTP_CID_SHUTDOWN_COMPLETE: + case SCTP_CID_AUTH: + return -EINVAL; + } + + /* add this chunk id to the endpoint */ + return sctp_auth_ep_add_chunkid(sctp_sk(sk)->ep, val.sauth_chunk); +} + +/* + * 7.1.19. Get or set the list of supported HMAC Identifiers (SCTP_HMAC_IDENT) + * + * This option gets or sets the list of HMAC algorithms that the local + * endpoint requires the peer to use. + */ +static int sctp_setsockopt_hmac_ident(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_hmacalgo *hmacs; + u32 idents; + int err; + + if (!sctp_auth_enable) + return -EACCES; + + if (optlen < sizeof(struct sctp_hmacalgo)) + return -EINVAL; + + hmacs= memdup_user(optval, optlen); + if (IS_ERR(hmacs)) + return PTR_ERR(hmacs); + + idents = hmacs->shmac_num_idents; + if (idents == 0 || idents > SCTP_AUTH_NUM_HMACS || + (idents * sizeof(u16)) > (optlen - sizeof(struct sctp_hmacalgo))) { + err = -EINVAL; + goto out; + } + + err = sctp_auth_ep_set_hmacs(sctp_sk(sk)->ep, hmacs); +out: + kfree(hmacs); + return err; +} + +/* + * 7.1.20. 
Set a shared key (SCTP_AUTH_KEY) + * + * This option will set a shared secret key which is used to build an + * association shared key. + */ +static int sctp_setsockopt_auth_key(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_authkey *authkey; + struct sctp_association *asoc; + int ret; + + if (!sctp_auth_enable) + return -EACCES; + + if (optlen <= sizeof(struct sctp_authkey)) + return -EINVAL; + + authkey= memdup_user(optval, optlen); + if (IS_ERR(authkey)) + return PTR_ERR(authkey); + + if (authkey->sca_keylength > optlen - sizeof(struct sctp_authkey)) { + ret = -EINVAL; + goto out; + } + + asoc = sctp_id2assoc(sk, authkey->sca_assoc_id); + if (!asoc && authkey->sca_assoc_id && sctp_style(sk, UDP)) { + ret = -EINVAL; + goto out; + } + + ret = sctp_auth_set_key(sctp_sk(sk)->ep, asoc, authkey); +out: + kfree(authkey); + return ret; +} + +/* + * 7.1.21. Get or set the active shared key (SCTP_AUTH_ACTIVE_KEY) + * + * This option will get or set the active shared key to be used to build + * the association shared key. + */ +static int sctp_setsockopt_active_key(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_authkeyid val; + struct sctp_association *asoc; + + if (!sctp_auth_enable) + return -EACCES; + + if (optlen != sizeof(struct sctp_authkeyid)) + return -EINVAL; + if (copy_from_user(&val, optval, optlen)) + return -EFAULT; + + asoc = sctp_id2assoc(sk, val.scact_assoc_id); + if (!asoc && val.scact_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + return sctp_auth_set_active_key(sctp_sk(sk)->ep, asoc, + val.scact_keynumber); +} + +/* + * 7.1.22. Delete a shared key (SCTP_AUTH_DELETE_KEY) + * + * This set option will delete a shared secret key from use. + */ +static int sctp_setsockopt_del_key(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_authkeyid val; + struct sctp_association *asoc; + + if (!sctp_auth_enable) + return -EACCES; + + if (optlen != sizeof(struct sctp_authkeyid)) + return -EINVAL; + if (copy_from_user(&val, optval, optlen)) + return -EFAULT; + + asoc = sctp_id2assoc(sk, val.scact_assoc_id); + if (!asoc && val.scact_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + return sctp_auth_del_key_id(sctp_sk(sk)->ep, asoc, + val.scact_keynumber); + +} + + +/* API 6.2 setsockopt(), getsockopt() + * + * Applications use setsockopt() and getsockopt() to set or retrieve + * socket options. Socket options are used to change the default + * behavior of sockets calls. They are described in Section 7. + * + * The syntax is: + * + * ret = getsockopt(int sd, int level, int optname, void __user *optval, + * int __user *optlen); + * ret = setsockopt(int sd, int level, int optname, const void __user *optval, + * int optlen); + * + * sd - the socket descript. + * level - set to IPPROTO_SCTP for all SCTP options. + * optname - the option name. + * optval - the buffer to store the value of the option. + * optlen - the size of the buffer. + */ +SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + int retval = 0; + + SCTP_DEBUG_PRINTK("sctp_setsockopt(sk: %p... optname: %d)\n", + sk, optname); + + /* I can hardly begin to describe how wrong this is. This is + * so broken as to be worse than useless. The API draft + * REALLY is NOT helpful here... I am not convinced that the + * semantics of setsockopt() with a level OTHER THAN SOL_SCTP + * are at all well-founded. 
+ */ + if (level != SOL_SCTP) { + struct sctp_af *af = sctp_sk(sk)->pf->af; + retval = af->setsockopt(sk, level, optname, optval, optlen); + goto out_nounlock; + } + + sctp_lock_sock(sk); + + switch (optname) { + case SCTP_SOCKOPT_BINDX_ADD: + /* 'optlen' is the size of the addresses buffer. */ + retval = sctp_setsockopt_bindx(sk, (struct sockaddr __user *)optval, + optlen, SCTP_BINDX_ADD_ADDR); + break; + + case SCTP_SOCKOPT_BINDX_REM: + /* 'optlen' is the size of the addresses buffer. */ + retval = sctp_setsockopt_bindx(sk, (struct sockaddr __user *)optval, + optlen, SCTP_BINDX_REM_ADDR); + break; + + case SCTP_SOCKOPT_CONNECTX_OLD: + /* 'optlen' is the size of the addresses buffer. */ + retval = sctp_setsockopt_connectx_old(sk, + (struct sockaddr __user *)optval, + optlen); + break; + + case SCTP_SOCKOPT_CONNECTX: + /* 'optlen' is the size of the addresses buffer. */ + retval = sctp_setsockopt_connectx(sk, + (struct sockaddr __user *)optval, + optlen); + break; + + case SCTP_DISABLE_FRAGMENTS: + retval = sctp_setsockopt_disable_fragments(sk, optval, optlen); + break; + + case SCTP_EVENTS: + retval = sctp_setsockopt_events(sk, optval, optlen); + break; + + case SCTP_AUTOCLOSE: + retval = sctp_setsockopt_autoclose(sk, optval, optlen); + break; + + case SCTP_PEER_ADDR_PARAMS: + retval = sctp_setsockopt_peer_addr_params(sk, optval, optlen); + break; + + case SCTP_DELAYED_SACK: + retval = sctp_setsockopt_delayed_ack(sk, optval, optlen); + break; + case SCTP_PARTIAL_DELIVERY_POINT: + retval = sctp_setsockopt_partial_delivery_point(sk, optval, optlen); + break; + + case SCTP_INITMSG: + retval = sctp_setsockopt_initmsg(sk, optval, optlen); + break; + case SCTP_DEFAULT_SEND_PARAM: + retval = sctp_setsockopt_default_send_param(sk, optval, + optlen); + break; + case SCTP_PRIMARY_ADDR: + retval = sctp_setsockopt_primary_addr(sk, optval, optlen); + break; + case SCTP_SET_PEER_PRIMARY_ADDR: + retval = sctp_setsockopt_peer_primary_addr(sk, optval, optlen); + break; + case SCTP_NODELAY: + retval = sctp_setsockopt_nodelay(sk, optval, optlen); + break; + case SCTP_RTOINFO: + retval = sctp_setsockopt_rtoinfo(sk, optval, optlen); + break; + case SCTP_ASSOCINFO: + retval = sctp_setsockopt_associnfo(sk, optval, optlen); + break; + case SCTP_I_WANT_MAPPED_V4_ADDR: + retval = sctp_setsockopt_mappedv4(sk, optval, optlen); + break; + case SCTP_MAXSEG: + retval = sctp_setsockopt_maxseg(sk, optval, optlen); + break; + case SCTP_ADAPTATION_LAYER: + retval = sctp_setsockopt_adaptation_layer(sk, optval, optlen); + break; + case SCTP_CONTEXT: + retval = sctp_setsockopt_context(sk, optval, optlen); + break; + case SCTP_FRAGMENT_INTERLEAVE: + retval = sctp_setsockopt_fragment_interleave(sk, optval, optlen); + break; + case SCTP_MAX_BURST: + retval = sctp_setsockopt_maxburst(sk, optval, optlen); + break; + case SCTP_AUTH_CHUNK: + retval = sctp_setsockopt_auth_chunk(sk, optval, optlen); + break; + case SCTP_HMAC_IDENT: + retval = sctp_setsockopt_hmac_ident(sk, optval, optlen); + break; + case SCTP_AUTH_KEY: + retval = sctp_setsockopt_auth_key(sk, optval, optlen); + break; + case SCTP_AUTH_ACTIVE_KEY: + retval = sctp_setsockopt_active_key(sk, optval, optlen); + break; + case SCTP_AUTH_DELETE_KEY: + retval = sctp_setsockopt_del_key(sk, optval, optlen); + break; + default: + retval = -ENOPROTOOPT; + break; + } + + sctp_release_sock(sk); + +out_nounlock: + return retval; +} + +/* API 3.1.6 connect() - UDP Style Syntax + * + * An application may use the connect() call in the UDP model to initiate an + * association without 
sending data. + * + * The syntax is: + * + * ret = connect(int sd, const struct sockaddr *nam, socklen_t len); + * + * sd: the socket descriptor to have a new association added to. + * + * nam: the address structure (either struct sockaddr_in or struct + * sockaddr_in6 defined in RFC2553 [7]). + * + * len: the size of the address. + */ +SCTP_STATIC int sctp_connect(struct sock *sk, struct sockaddr *addr, + int addr_len) +{ + int err = 0; + struct sctp_af *af; + + sctp_lock_sock(sk); + + SCTP_DEBUG_PRINTK("%s - sk: %p, sockaddr: %p, addr_len: %d\n", + __func__, sk, addr, addr_len); + + /* Validate addr_len before calling common connect/connectx routine. */ + af = sctp_get_af_specific(addr->sa_family); + if (!af || addr_len < af->sockaddr_len) { + err = -EINVAL; + } else { + /* Pass correct addr len to common routine (so it knows there + * is only one address being passed. + */ + err = __sctp_connect(sk, addr, af->sockaddr_len, NULL); + } + + sctp_release_sock(sk); + return err; +} + +/* FIXME: Write comments. */ +SCTP_STATIC int sctp_disconnect(struct sock *sk, int flags) +{ + return -EOPNOTSUPP; /* STUB */ +} + +/* 4.1.4 accept() - TCP Style Syntax + * + * Applications use accept() call to remove an established SCTP + * association from the accept queue of the endpoint. A new socket + * descriptor will be returned from accept() to represent the newly + * formed association. + */ +SCTP_STATIC struct sock *sctp_accept(struct sock *sk, int flags, int *err) +{ + struct sctp_sock *sp; + struct sctp_endpoint *ep; + struct sock *newsk = NULL; + struct sctp_association *asoc; + long timeo; + int error = 0; + + sctp_lock_sock(sk); + + sp = sctp_sk(sk); + ep = sp->ep; + + if (!sctp_style(sk, TCP)) { + error = -EOPNOTSUPP; + goto out; + } + + if (!sctp_sstate(sk, LISTENING)) { + error = -EINVAL; + goto out; + } + + timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + + error = sctp_wait_for_accept(sk, timeo); + if (error) + goto out; + + /* We treat the list of associations on the endpoint as the accept + * queue and pick the first association on the list. + */ + asoc = list_entry(ep->asocs.next, struct sctp_association, asocs); + + newsk = sp->pf->create_accept_sk(sk, asoc); + if (!newsk) { + error = -ENOMEM; + goto out; + } + + /* Populate the fields of the newsk from the oldsk and migrate the + * asoc to the newsk. + */ + sctp_sock_migrate(sk, newsk, asoc, SCTP_SOCKET_TCP); + +out: + sctp_release_sock(sk); + *err = error; + return newsk; +} + +/* The SCTP ioctl handler. */ +SCTP_STATIC int sctp_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + int rc = -ENOTCONN; + + sctp_lock_sock(sk); + + /* + * SEQPACKET-style sockets in LISTENING state are valid, for + * SCTP, so only discard TCP-style sockets in LISTENING state. + */ + if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) + goto out; + + switch (cmd) { + case SIOCINQ: { + struct sk_buff *skb; + unsigned int amount = 0; + + skb = skb_peek(&sk->sk_receive_queue); + if (skb != NULL) { + /* + * We will only return the amount of this packet since + * that is all that will be read. + */ + amount = skb->len; + } + rc = put_user(amount, (int __user *)arg); + break; + } + default: + rc = -ENOIOCTLCMD; + break; + } +out: + sctp_release_sock(sk); + return rc; +} + +/* This is the function which gets called during socket creation to + * initialized the SCTP-specific portion of the sock. + * The sock structure should already be zero-filled memory. 
+ */ +SCTP_STATIC int sctp_init_sock(struct sock *sk) +{ + struct sctp_endpoint *ep; + struct sctp_sock *sp; + + SCTP_DEBUG_PRINTK("sctp_init_sock(sk: %p)\n", sk); + + sp = sctp_sk(sk); + + /* Initialize the SCTP per socket area. */ + switch (sk->sk_type) { + case SOCK_SEQPACKET: + sp->type = SCTP_SOCKET_UDP; + break; + case SOCK_STREAM: + sp->type = SCTP_SOCKET_TCP; + break; + default: + return -ESOCKTNOSUPPORT; + } + + /* Initialize default send parameters. These parameters can be + * modified with the SCTP_DEFAULT_SEND_PARAM socket option. + */ + sp->default_stream = 0; + sp->default_ppid = 0; + sp->default_flags = 0; + sp->default_context = 0; + sp->default_timetolive = 0; + + sp->default_rcv_context = 0; + sp->max_burst = sctp_max_burst; + + /* Initialize default setup parameters. These parameters + * can be modified with the SCTP_INITMSG socket option or + * overridden by the SCTP_INIT CMSG. + */ + sp->initmsg.sinit_num_ostreams = sctp_max_outstreams; + sp->initmsg.sinit_max_instreams = sctp_max_instreams; + sp->initmsg.sinit_max_attempts = sctp_max_retrans_init; + sp->initmsg.sinit_max_init_timeo = sctp_rto_max; + + /* Initialize default RTO related parameters. These parameters can + * be modified for with the SCTP_RTOINFO socket option. + */ + sp->rtoinfo.srto_initial = sctp_rto_initial; + sp->rtoinfo.srto_max = sctp_rto_max; + sp->rtoinfo.srto_min = sctp_rto_min; + + /* Initialize default association related parameters. These parameters + * can be modified with the SCTP_ASSOCINFO socket option. + */ + sp->assocparams.sasoc_asocmaxrxt = sctp_max_retrans_association; + sp->assocparams.sasoc_number_peer_destinations = 0; + sp->assocparams.sasoc_peer_rwnd = 0; + sp->assocparams.sasoc_local_rwnd = 0; + sp->assocparams.sasoc_cookie_life = sctp_valid_cookie_life; + + /* Initialize default event subscriptions. By default, all the + * options are off. + */ + memset(&sp->subscribe, 0, sizeof(struct sctp_event_subscribe)); + + /* Default Peer Address Parameters. These defaults can + * be modified via SCTP_PEER_ADDR_PARAMS + */ + sp->hbinterval = sctp_hb_interval; + sp->pathmaxrxt = sctp_max_retrans_path; + sp->pathmtu = 0; // allow default discovery + sp->sackdelay = sctp_sack_timeout; + sp->sackfreq = 2; + sp->param_flags = SPP_HB_ENABLE | + SPP_PMTUD_ENABLE | + SPP_SACKDELAY_ENABLE; + + /* If enabled no SCTP message fragmentation will be performed. + * Configure through SCTP_DISABLE_FRAGMENTS socket option. + */ + sp->disable_fragments = 0; + + /* Enable Nagle algorithm by default. */ + sp->nodelay = 0; + + /* Enable by default. */ + sp->v4mapped = 1; + + /* Auto-close idle associations after the configured + * number of seconds. A value of 0 disables this + * feature. Configure through the SCTP_AUTOCLOSE socket option, + * for UDP-style sockets only. + */ + sp->autoclose = 0; + + /* User specified fragmentation limit. */ + sp->user_frag = 0; + + sp->adaptation_ind = 0; + + sp->pf = sctp_get_pf_specific(sk->sk_family); + + /* Control variables for partial data delivery. */ + atomic_set(&sp->pd_mode, 0); + skb_queue_head_init(&sp->pd_lobby); + sp->frag_interleave = 0; + + /* Create a per socket endpoint structure. Even if we + * change the data structure relationships, this may still + * be useful for storing pre-connect address information. 
+ */ + ep = sctp_endpoint_new(sk, GFP_KERNEL); + if (!ep) + return -ENOMEM; + + sp->ep = ep; + sp->hmac = NULL; + + SCTP_DBG_OBJCNT_INC(sock); + + local_bh_disable(); + percpu_counter_inc(&sctp_sockets_allocated); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + local_bh_enable(); + + return 0; +} + +/* Cleanup any SCTP per socket resources. */ +SCTP_STATIC void sctp_destroy_sock(struct sock *sk) +{ + struct sctp_endpoint *ep; + + SCTP_DEBUG_PRINTK("sctp_destroy_sock(sk: %p)\n", sk); + + /* Release our hold on the endpoint. */ + ep = sctp_sk(sk)->ep; + sctp_endpoint_free(ep); + local_bh_disable(); + percpu_counter_dec(&sctp_sockets_allocated); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + local_bh_enable(); +} + +/* API 4.1.7 shutdown() - TCP Style Syntax + * int shutdown(int socket, int how); + * + * sd - the socket descriptor of the association to be closed. + * how - Specifies the type of shutdown. The values are + * as follows: + * SHUT_RD + * Disables further receive operations. No SCTP + * protocol action is taken. + * SHUT_WR + * Disables further send operations, and initiates + * the SCTP shutdown sequence. + * SHUT_RDWR + * Disables further send and receive operations + * and initiates the SCTP shutdown sequence. + */ +SCTP_STATIC void sctp_shutdown(struct sock *sk, int how) +{ + struct sctp_endpoint *ep; + struct sctp_association *asoc; + + if (!sctp_style(sk, TCP)) + return; + + if (how & SEND_SHUTDOWN) { + ep = sctp_sk(sk)->ep; + if (!list_empty(&ep->asocs)) { + asoc = list_entry(ep->asocs.next, + struct sctp_association, asocs); + sctp_primitive_SHUTDOWN(asoc, NULL); + } + } +} + +/* 7.2.1 Association Status (SCTP_STATUS) + + * Applications can retrieve current status information about an + * association, including association state, peer receiver window size, + * number of unacked data chunks, and number of data chunks pending + * receipt. This information is read-only. + */ +static int sctp_getsockopt_sctp_status(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_status status; + struct sctp_association *asoc = NULL; + struct sctp_transport *transport; + sctp_assoc_t associd; + int retval = 0; + + if (len < sizeof(status)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(status); + if (copy_from_user(&status, optval, len)) { + retval = -EFAULT; + goto out; + } + + associd = status.sstat_assoc_id; + asoc = sctp_id2assoc(sk, associd); + if (!asoc) { + retval = -EINVAL; + goto out; + } + + transport = asoc->peer.primary_path; + + status.sstat_assoc_id = sctp_assoc2id(asoc); + status.sstat_state = asoc->state; + status.sstat_rwnd = asoc->peer.rwnd; + status.sstat_unackdata = asoc->unack_data; + + status.sstat_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map); + status.sstat_instrms = asoc->c.sinit_max_instreams; + status.sstat_outstrms = asoc->c.sinit_num_ostreams; + status.sstat_fragmentation_point = asoc->frag_point; + status.sstat_primary.spinfo_assoc_id = sctp_assoc2id(transport->asoc); + memcpy(&status.sstat_primary.spinfo_address, &transport->ipaddr, + transport->af_specific->sockaddr_len); + /* Map ipv4 address into v4-mapped-on-v6 address. 
*/ + sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk), + (union sctp_addr *)&status.sstat_primary.spinfo_address); + status.sstat_primary.spinfo_state = transport->state; + status.sstat_primary.spinfo_cwnd = transport->cwnd; + status.sstat_primary.spinfo_srtt = transport->srtt; + status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto); + status.sstat_primary.spinfo_mtu = transport->pathmtu; + + if (status.sstat_primary.spinfo_state == SCTP_UNKNOWN) + status.sstat_primary.spinfo_state = SCTP_ACTIVE; + + if (put_user(len, optlen)) { + retval = -EFAULT; + goto out; + } + + SCTP_DEBUG_PRINTK("sctp_getsockopt_sctp_status(%d): %d %d %d\n", + len, status.sstat_state, status.sstat_rwnd, + status.sstat_assoc_id); + + if (copy_to_user(optval, &status, len)) { + retval = -EFAULT; + goto out; + } + +out: + return retval; +} + + +/* 7.2.2 Peer Address Information (SCTP_GET_PEER_ADDR_INFO) + * + * Applications can retrieve information about a specific peer address + * of an association, including its reachability state, congestion + * window, and retransmission timer values. This information is + * read-only. + */ +static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_paddrinfo pinfo; + struct sctp_transport *transport; + int retval = 0; + + if (len < sizeof(pinfo)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(pinfo); + if (copy_from_user(&pinfo, optval, len)) { + retval = -EFAULT; + goto out; + } + + transport = sctp_addr_id2transport(sk, &pinfo.spinfo_address, + pinfo.spinfo_assoc_id); + if (!transport) + return -EINVAL; + + pinfo.spinfo_assoc_id = sctp_assoc2id(transport->asoc); + pinfo.spinfo_state = transport->state; + pinfo.spinfo_cwnd = transport->cwnd; + pinfo.spinfo_srtt = transport->srtt; + pinfo.spinfo_rto = jiffies_to_msecs(transport->rto); + pinfo.spinfo_mtu = transport->pathmtu; + + if (pinfo.spinfo_state == SCTP_UNKNOWN) + pinfo.spinfo_state = SCTP_ACTIVE; + + if (put_user(len, optlen)) { + retval = -EFAULT; + goto out; + } + + if (copy_to_user(optval, &pinfo, len)) { + retval = -EFAULT; + goto out; + } + +out: + return retval; +} + +/* 7.1.12 Enable/Disable message fragmentation (SCTP_DISABLE_FRAGMENTS) + * + * This option is a on/off flag. If enabled no SCTP message + * fragmentation will be performed. Instead if a message being sent + * exceeds the current PMTU size, the message will NOT be sent and + * instead a error will be indicated to the user. + */ +static int sctp_getsockopt_disable_fragments(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + int val; + + if (len < sizeof(int)) + return -EINVAL; + + len = sizeof(int); + val = (sctp_sk(sk)->disable_fragments == 1); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + return 0; +} + +/* 7.1.15 Set notification and ancillary events (SCTP_EVENTS) + * + * This socket option is used to specify various notifications and + * ancillary data the user wishes to receive. 
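+ *
+ * Illustrative userspace sketch only (assumes an SCTP socket descriptor
+ * sd and the usual <netinet/sctp.h> definitions) reading back the
+ * current subscription set:
+ *
+ *   struct sctp_event_subscribe events;
+ *   socklen_t len = sizeof(events);
+ *   getsockopt(sd, IPPROTO_SCTP, SCTP_EVENTS, &events, &len);
+ *
+ * On success, events.sctp_data_io_event (for example) reports whether
+ * SCTP_SNDRCVINFO ancillary data is currently being delivered.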
+ */ +static int sctp_getsockopt_events(struct sock *sk, int len, char __user *optval, + int __user *optlen) +{ + if (len <= 0) + return -EINVAL; + if (len > sizeof(struct sctp_event_subscribe)) + len = sizeof(struct sctp_event_subscribe); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &sctp_sk(sk)->subscribe, len)) + return -EFAULT; + return 0; +} + +/* 7.1.8 Automatic Close of associations (SCTP_AUTOCLOSE) + * + * This socket option is applicable to the UDP-style socket only. When + * set it will cause associations that are idle for more than the + * specified number of seconds to automatically close. An association + * being idle is defined an association that has NOT sent or received + * user data. The special value of '0' indicates that no automatic + * close of any associations should be performed. The option expects an + * integer defining the number of seconds of idle time before an + * association is closed. + */ +static int sctp_getsockopt_autoclose(struct sock *sk, int len, char __user *optval, int __user *optlen) +{ + /* Applicable to UDP-style socket only */ + if (sctp_style(sk, TCP)) + return -EOPNOTSUPP; + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &sctp_sk(sk)->autoclose, sizeof(int))) + return -EFAULT; + return 0; +} + +/* Helper routine to branch off an association to a new socket. */ +SCTP_STATIC int sctp_do_peeloff(struct sctp_association *asoc, + struct socket **sockp) +{ + struct sock *sk = asoc->base.sk; + struct socket *sock; + struct sctp_af *af; + int err = 0; + + /* An association cannot be branched off from an already peeled-off + * socket, nor is this supported for tcp style sockets. + */ + if (!sctp_style(sk, UDP)) + return -EINVAL; + + /* Create a new socket. */ + err = sock_create(sk->sk_family, SOCK_SEQPACKET, IPPROTO_SCTP, &sock); + if (err < 0) + return err; + + sctp_copy_sock(sock->sk, sk, asoc); + + /* Make peeled-off sockets more like 1-1 accepted sockets. + * Set the daddr and initialize id to something more random + */ + af = sctp_get_af_specific(asoc->peer.primary_addr.sa.sa_family); + af->to_sk_daddr(&asoc->peer.primary_addr, sk); + + /* Populate the fields of the newsk from the oldsk and migrate the + * asoc to the newsk. + */ + sctp_sock_migrate(sk, sock->sk, asoc, SCTP_SOCKET_UDP_HIGH_BANDWIDTH); + + *sockp = sock; + + return err; +} + +static int sctp_getsockopt_peeloff(struct sock *sk, int len, char __user *optval, int __user *optlen) +{ + sctp_peeloff_arg_t peeloff; + struct socket *newsock; + int retval = 0; + struct sctp_association *asoc; + + if (len < sizeof(sctp_peeloff_arg_t)) + return -EINVAL; + len = sizeof(sctp_peeloff_arg_t); + if (copy_from_user(&peeloff, optval, len)) + return -EFAULT; + + asoc = sctp_id2assoc(sk, peeloff.associd); + if (!asoc) { + retval = -EINVAL; + goto out; + } + + SCTP_DEBUG_PRINTK("%s: sk: %p asoc: %p\n", __func__, sk, asoc); + + retval = sctp_do_peeloff(asoc, &newsock); + if (retval < 0) + goto out; + + /* Map the socket to an unused fd that can be returned to the user. */ + retval = sock_map_fd(newsock, 0); + if (retval < 0) { + sock_release(newsock); + goto out; + } + + SCTP_DEBUG_PRINTK("%s: sk: %p asoc: %p newsk: %p sd: %d\n", + __func__, sk, asoc, newsock->sk, retval); + + /* Return the fd mapped to the new socket. 
*/ + peeloff.sd = retval; + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &peeloff, len)) + retval = -EFAULT; + +out: + return retval; +} + +/* 7.1.13 Peer Address Parameters (SCTP_PEER_ADDR_PARAMS) + * + * Applications can enable or disable heartbeats for any peer address of + * an association, modify an address's heartbeat interval, force a + * heartbeat to be sent immediately, and adjust the address's maximum + * number of retransmissions sent before an address is considered + * unreachable. The following structure is used to access and modify an + * address's parameters: + * + * struct sctp_paddrparams { + * sctp_assoc_t spp_assoc_id; + * struct sockaddr_storage spp_address; + * uint32_t spp_hbinterval; + * uint16_t spp_pathmaxrxt; + * uint32_t spp_pathmtu; + * uint32_t spp_sackdelay; + * uint32_t spp_flags; + * }; + * + * spp_assoc_id - (one-to-many style socket) This is filled in the + * application, and identifies the association for + * this query. + * spp_address - This specifies which address is of interest. + * spp_hbinterval - This contains the value of the heartbeat interval, + * in milliseconds. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. + * spp_pathmaxrxt - This contains the maximum number of + * retransmissions before this address shall be + * considered unreachable. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. + * spp_pathmtu - When Path MTU discovery is disabled the value + * specified here will be the "fixed" path mtu. + * Note that if the spp_address field is empty + * then all associations on this address will + * have this fixed path mtu set upon them. + * + * spp_sackdelay - When delayed sack is enabled, this value specifies + * the number of milliseconds that sacks will be delayed + * for. This value will apply to all addresses of an + * association if the spp_address field is empty. Note + * also, that if delayed sack is enabled and this + * value is set to 0, no change is made to the last + * recorded delayed sack timer value. + * + * spp_flags - These flags are used to control various features + * on an association. The flag field may contain + * zero or more of the following options. + * + * SPP_HB_ENABLE - Enable heartbeats on the + * specified address. Note that if the address + * field is empty all addresses for the association + * have heartbeats enabled upon them. + * + * SPP_HB_DISABLE - Disable heartbeats on the + * speicifed address. Note that if the address + * field is empty all addresses for the association + * will have their heartbeats disabled. Note also + * that SPP_HB_ENABLE and SPP_HB_DISABLE are + * mutually exclusive, only one of these two should + * be specified. Enabling both fields will have + * undetermined results. + * + * SPP_HB_DEMAND - Request a user initiated heartbeat + * to be made immediately. + * + * SPP_PMTUD_ENABLE - This field will enable PMTU + * discovery upon the specified address. Note that + * if the address feild is empty then all addresses + * on the association are effected. + * + * SPP_PMTUD_DISABLE - This field will disable PMTU + * discovery upon the specified address. Note that + * if the address feild is empty then all addresses + * on the association are effected. Not also that + * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually + * exclusive. Enabling both will have undetermined + * results. + * + * SPP_SACKDELAY_ENABLE - Setting this flag turns + * on delayed sack. 
The time specified in spp_sackdelay + * is used to specify the sack delay for this address. Note + * that if spp_address is empty then all addresses will + * enable delayed sack and take on the sack delay + * value specified in spp_sackdelay. + * SPP_SACKDELAY_DISABLE - Setting this flag turns + * off delayed sack. If the spp_address field is blank then + * delayed sack is disabled for the entire association. Note + * also that this field is mutually exclusive to + * SPP_SACKDELAY_ENABLE, setting both will have undefined + * results. + */ +static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + struct sctp_paddrparams params; + struct sctp_transport *trans = NULL; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); + + if (len < sizeof(struct sctp_paddrparams)) + return -EINVAL; + len = sizeof(struct sctp_paddrparams); + if (copy_from_user(¶ms, optval, len)) + return -EFAULT; + + /* If an address other than INADDR_ANY is specified, and + * no transport is found, then the request is invalid. + */ + if (!sctp_is_any(sk, ( union sctp_addr *)¶ms.spp_address)) { + trans = sctp_addr_id2transport(sk, ¶ms.spp_address, + params.spp_assoc_id); + if (!trans) { + SCTP_DEBUG_PRINTK("Failed no transport\n"); + return -EINVAL; + } + } + + /* Get association, if assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid. + */ + asoc = sctp_id2assoc(sk, params.spp_assoc_id); + if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP)) { + SCTP_DEBUG_PRINTK("Failed no association\n"); + return -EINVAL; + } + + if (trans) { + /* Fetch transport values. */ + params.spp_hbinterval = jiffies_to_msecs(trans->hbinterval); + params.spp_pathmtu = trans->pathmtu; + params.spp_pathmaxrxt = trans->pathmaxrxt; + params.spp_sackdelay = jiffies_to_msecs(trans->sackdelay); + + /*draft-11 doesn't say what to return in spp_flags*/ + params.spp_flags = trans->param_flags; + } else if (asoc) { + /* Fetch association values. */ + params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval); + params.spp_pathmtu = asoc->pathmtu; + params.spp_pathmaxrxt = asoc->pathmaxrxt; + params.spp_sackdelay = jiffies_to_msecs(asoc->sackdelay); + + /*draft-11 doesn't say what to return in spp_flags*/ + params.spp_flags = asoc->param_flags; + } else { + /* Fetch socket values. */ + params.spp_hbinterval = sp->hbinterval; + params.spp_pathmtu = sp->pathmtu; + params.spp_sackdelay = sp->sackdelay; + params.spp_pathmaxrxt = sp->pathmaxrxt; + + /*draft-11 doesn't say what to return in spp_flags*/ + params.spp_flags = sp->param_flags; + } + + if (copy_to_user(optval, ¶ms, len)) + return -EFAULT; + + if (put_user(len, optlen)) + return -EFAULT; + + return 0; +} + +/* + * 7.1.23. Get or set delayed ack timer (SCTP_DELAYED_SACK) + * + * This option will effect the way delayed acks are performed. This + * option allows you to get or set the delayed ack time, in + * milliseconds. It also allows changing the delayed ack frequency. + * Changing the frequency to 1 disables the delayed sack algorithm. If + * the assoc_id is 0, then this sets or gets the endpoints default + * values. If the assoc_id field is non-zero, then the set or get + * effects the specified association for the one to many model (the + * assoc_id field is ignored by the one to one model). Note that if + * sack_delay or sack_freq are 0 when setting this option, then the + * current values will remain unchanged. 
+ * + * struct sctp_sack_info { + * sctp_assoc_t sack_assoc_id; + * uint32_t sack_delay; + * uint32_t sack_freq; + * }; + * + * sack_assoc_id - This parameter, indicates which association the user + * is performing an action upon. Note that if this field's value is + * zero then the endpoints default value is changed (effecting future + * associations only). + * + * sack_delay - This parameter contains the number of milliseconds that + * the user is requesting the delayed ACK timer be set to. Note that + * this value is defined in the standard to be between 200 and 500 + * milliseconds. + * + * sack_freq - This parameter contains the number of packets that must + * be received before a sack is sent without waiting for the delay + * timer to expire. The default value for this is 2, setting this + * value to 1 will disable the delayed sack algorithm. + */ +static int sctp_getsockopt_delayed_ack(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_sack_info params; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); + + if (len >= sizeof(struct sctp_sack_info)) { + len = sizeof(struct sctp_sack_info); + + if (copy_from_user(¶ms, optval, len)) + return -EFAULT; + } else if (len == sizeof(struct sctp_assoc_value)) { + pr_warn("Use of struct sctp_assoc_value in delayed_ack socket option deprecated\n"); + pr_warn("Use struct sctp_sack_info instead\n"); + if (copy_from_user(¶ms, optval, len)) + return -EFAULT; + } else + return - EINVAL; + + /* Get association, if sack_assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid. + */ + asoc = sctp_id2assoc(sk, params.sack_assoc_id); + if (!asoc && params.sack_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + if (asoc) { + /* Fetch association values. */ + if (asoc->param_flags & SPP_SACKDELAY_ENABLE) { + params.sack_delay = jiffies_to_msecs( + asoc->sackdelay); + params.sack_freq = asoc->sackfreq; + + } else { + params.sack_delay = 0; + params.sack_freq = 1; + } + } else { + /* Fetch socket values. */ + if (sp->param_flags & SPP_SACKDELAY_ENABLE) { + params.sack_delay = sp->sackdelay; + params.sack_freq = sp->sackfreq; + } else { + params.sack_delay = 0; + params.sack_freq = 1; + } + } + + if (copy_to_user(optval, ¶ms, len)) + return -EFAULT; + + if (put_user(len, optlen)) + return -EFAULT; + + return 0; +} + +/* 7.1.3 Initialization Parameters (SCTP_INITMSG) + * + * Applications can specify protocol parameters for the default association + * initialization. The option name argument to setsockopt() and getsockopt() + * is SCTP_INITMSG. + * + * Setting initialization parameters is effective only on an unconnected + * socket (for UDP-style sockets only future associations are effected + * by the change). With TCP-style sockets, this option is inherited by + * sockets derived from a listener socket. 
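+ *
+ * Illustration only (not part of this patch): a minimal user-space sketch
+ * of reading these defaults, assuming <netinet/sctp.h> from lksctp-tools
+ * and an already created SCTP socket descriptor 'fd':
+ *
+ *	struct sctp_initmsg im;
+ *	socklen_t len = sizeof(im);
+ *
+ *	if (getsockopt(fd, IPPROTO_SCTP, SCTP_INITMSG, &im, &len) == 0)
+ *		printf("ostreams=%u istreams=%u attempts=%u timeo=%u\n",
+ *		       im.sinit_num_ostreams, im.sinit_max_instreams,
+ *		       im.sinit_max_attempts, im.sinit_max_init_timeo);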
+ */ +static int sctp_getsockopt_initmsg(struct sock *sk, int len, char __user *optval, int __user *optlen) +{ + if (len < sizeof(struct sctp_initmsg)) + return -EINVAL; + len = sizeof(struct sctp_initmsg); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &sctp_sk(sk)->initmsg, len)) + return -EFAULT; + return 0; +} + + +static int sctp_getsockopt_peer_addrs(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + struct sctp_association *asoc; + int cnt = 0; + struct sctp_getaddrs getaddrs; + struct sctp_transport *from; + void __user *to; + union sctp_addr temp; + struct sctp_sock *sp = sctp_sk(sk); + int addrlen; + size_t space_left; + int bytes_copied; + + if (len < sizeof(struct sctp_getaddrs)) + return -EINVAL; + + if (copy_from_user(&getaddrs, optval, sizeof(struct sctp_getaddrs))) + return -EFAULT; + + /* For UDP-style sockets, id specifies the association to query. */ + asoc = sctp_id2assoc(sk, getaddrs.assoc_id); + if (!asoc) + return -EINVAL; + + to = optval + offsetof(struct sctp_getaddrs,addrs); + space_left = len - offsetof(struct sctp_getaddrs,addrs); + + list_for_each_entry(from, &asoc->peer.transport_addr_list, + transports) { + memcpy(&temp, &from->ipaddr, sizeof(temp)); + sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp); + addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len; + if (space_left < addrlen) + return -ENOMEM; + if (copy_to_user(to, &temp, addrlen)) + return -EFAULT; + to += addrlen; + cnt++; + space_left -= addrlen; + } + + if (put_user(cnt, &((struct sctp_getaddrs __user *)optval)->addr_num)) + return -EFAULT; + bytes_copied = ((char __user *)to) - optval; + if (put_user(bytes_copied, optlen)) + return -EFAULT; + + return 0; +} + +static int sctp_copy_laddrs(struct sock *sk, __u16 port, void *to, + size_t space_left, int *bytes_copied) +{ + struct sctp_sockaddr_entry *addr; + union sctp_addr temp; + int cnt = 0; + int addrlen; + + rcu_read_lock(); + list_for_each_entry_rcu(addr, &sctp_local_addr_list, list) { + if (!addr->valid) + continue; + + if ((PF_INET == sk->sk_family) && + (AF_INET6 == addr->a.sa.sa_family)) + continue; + if ((PF_INET6 == sk->sk_family) && + inet_v6_ipv6only(sk) && + (AF_INET == addr->a.sa.sa_family)) + continue; + memcpy(&temp, &addr->a, sizeof(temp)); + if (!temp.v4.sin_port) + temp.v4.sin_port = htons(port); + + sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk), + &temp); + addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len; + if (space_left < addrlen) { + cnt = -ENOMEM; + break; + } + memcpy(to, &temp, addrlen); + + to += addrlen; + cnt ++; + space_left -= addrlen; + *bytes_copied += addrlen; + } + rcu_read_unlock(); + + return cnt; +} + + +static int sctp_getsockopt_local_addrs(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + struct sctp_bind_addr *bp; + struct sctp_association *asoc; + int cnt = 0; + struct sctp_getaddrs getaddrs; + struct sctp_sockaddr_entry *addr; + void __user *to; + union sctp_addr temp; + struct sctp_sock *sp = sctp_sk(sk); + int addrlen; + int err = 0; + size_t space_left; + int bytes_copied = 0; + void *addrs; + void *buf; + + if (len < sizeof(struct sctp_getaddrs)) + return -EINVAL; + + if (copy_from_user(&getaddrs, optval, sizeof(struct sctp_getaddrs))) + return -EFAULT; + + /* + * For UDP-style sockets, id specifies the association to query. + * If the id field is set to the value '0' then the locally bound + * addresses are returned without regard to any particular + * association. 
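+ *
+ * Illustration only (not part of this patch): user space typically
+ * reaches this through the (assumed) lksctp-tools wrapper rather than
+ * raw getsockopt(), e.g.
+ *
+ *	struct sockaddr *addrs;
+ *	int n = sctp_getladdrs(fd, 0, &addrs);
+ *	if (n > 0)
+ *		sctp_freeladdrs(addrs);
+ *
+ * where an assoc_id of 0 requests the locally bound addresses exactly as
+ * described above.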
+ */ + if (0 == getaddrs.assoc_id) { + bp = &sctp_sk(sk)->ep->base.bind_addr; + } else { + asoc = sctp_id2assoc(sk, getaddrs.assoc_id); + if (!asoc) + return -EINVAL; + bp = &asoc->base.bind_addr; + } + + to = optval + offsetof(struct sctp_getaddrs,addrs); + space_left = len - offsetof(struct sctp_getaddrs,addrs); + + addrs = kmalloc(space_left, GFP_KERNEL); + if (!addrs) + return -ENOMEM; + + /* If the endpoint is bound to 0.0.0.0 or ::0, get the valid + * addresses from the global local address list. + */ + if (sctp_list_single_entry(&bp->address_list)) { + addr = list_entry(bp->address_list.next, + struct sctp_sockaddr_entry, list); + if (sctp_is_any(sk, &addr->a)) { + cnt = sctp_copy_laddrs(sk, bp->port, addrs, + space_left, &bytes_copied); + if (cnt < 0) { + err = cnt; + goto out; + } + goto copy_getaddrs; + } + } + + buf = addrs; + /* Protection on the bound address list is not needed since + * in the socket option context we hold a socket lock and + * thus the bound address list can't change. + */ + list_for_each_entry(addr, &bp->address_list, list) { + memcpy(&temp, &addr->a, sizeof(temp)); + sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp); + addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len; + if (space_left < addrlen) { + err = -ENOMEM; /*fixme: right error?*/ + goto out; + } + memcpy(buf, &temp, addrlen); + buf += addrlen; + bytes_copied += addrlen; + cnt ++; + space_left -= addrlen; + } + +copy_getaddrs: + if (copy_to_user(to, addrs, bytes_copied)) { + err = -EFAULT; + goto out; + } + if (put_user(cnt, &((struct sctp_getaddrs __user *)optval)->addr_num)) { + err = -EFAULT; + goto out; + } + if (put_user(bytes_copied, optlen)) + err = -EFAULT; +out: + kfree(addrs); + return err; +} + +/* 7.1.10 Set Primary Address (SCTP_PRIMARY_ADDR) + * + * Requests that the local SCTP stack use the enclosed peer address as + * the association primary. The enclosed address must be one of the + * association peer's addresses. + */ +static int sctp_getsockopt_primary_addr(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + struct sctp_prim prim; + struct sctp_association *asoc; + struct sctp_sock *sp = sctp_sk(sk); + + if (len < sizeof(struct sctp_prim)) + return -EINVAL; + + len = sizeof(struct sctp_prim); + + if (copy_from_user(&prim, optval, len)) + return -EFAULT; + + asoc = sctp_id2assoc(sk, prim.ssp_assoc_id); + if (!asoc) + return -EINVAL; + + if (!asoc->peer.primary_path) + return -ENOTCONN; + + memcpy(&prim.ssp_addr, &asoc->peer.primary_path->ipaddr, + asoc->peer.primary_path->af_specific->sockaddr_len); + + sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, + (union sctp_addr *)&prim.ssp_addr); + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &prim, len)) + return -EFAULT; + + return 0; +} + +/* + * 7.1.11 Set Adaptation Layer Indicator (SCTP_ADAPTATION_LAYER) + * + * Requests that the local endpoint set the specified Adaptation Layer + * Indication parameter for all future INIT and INIT-ACK exchanges. 
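+ *
+ * Illustration only (not part of this patch): a hedged user-space sketch,
+ * assuming <netinet/sctp.h> and an SCTP socket descriptor 'fd':
+ *
+ *	struct sctp_setadaptation ad;
+ *	socklen_t len = sizeof(ad);
+ *
+ *	if (getsockopt(fd, IPPROTO_SCTP, SCTP_ADAPTATION_LAYER,
+ *		       &ad, &len) == 0)
+ *		printf("adaptation indication: 0x%x\n",
+ *		       ad.ssb_adaptation_ind);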
+ */ +static int sctp_getsockopt_adaptation_layer(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + struct sctp_setadaptation adaptation; + + if (len < sizeof(struct sctp_setadaptation)) + return -EINVAL; + + len = sizeof(struct sctp_setadaptation); + + adaptation.ssb_adaptation_ind = sctp_sk(sk)->adaptation_ind; + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &adaptation, len)) + return -EFAULT; + + return 0; +} + +/* + * + * 7.1.14 Set default send parameters (SCTP_DEFAULT_SEND_PARAM) + * + * Applications that wish to use the sendto() system call may wish to + * specify a default set of parameters that would normally be supplied + * through the inclusion of ancillary data. This socket option allows + * such an application to set the default sctp_sndrcvinfo structure. + + + * The application that wishes to use this socket option simply passes + * in to this call the sctp_sndrcvinfo structure defined in Section + * 5.2.2) The input parameters accepted by this call include + * sinfo_stream, sinfo_flags, sinfo_ppid, sinfo_context, + * sinfo_timetolive. The user must provide the sinfo_assoc_id field in + * to this call if the caller is using the UDP model. + * + * For getsockopt, it get the default sctp_sndrcvinfo structure. + */ +static int sctp_getsockopt_default_send_param(struct sock *sk, + int len, char __user *optval, + int __user *optlen) +{ + struct sctp_sndrcvinfo info; + struct sctp_association *asoc; + struct sctp_sock *sp = sctp_sk(sk); + + if (len < sizeof(struct sctp_sndrcvinfo)) + return -EINVAL; + + len = sizeof(struct sctp_sndrcvinfo); + + if (copy_from_user(&info, optval, len)) + return -EFAULT; + + asoc = sctp_id2assoc(sk, info.sinfo_assoc_id); + if (!asoc && info.sinfo_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + if (asoc) { + info.sinfo_stream = asoc->default_stream; + info.sinfo_flags = asoc->default_flags; + info.sinfo_ppid = asoc->default_ppid; + info.sinfo_context = asoc->default_context; + info.sinfo_timetolive = asoc->default_timetolive; + } else { + info.sinfo_stream = sp->default_stream; + info.sinfo_flags = sp->default_flags; + info.sinfo_ppid = sp->default_ppid; + info.sinfo_context = sp->default_context; + info.sinfo_timetolive = sp->default_timetolive; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &info, len)) + return -EFAULT; + + return 0; +} + +/* + * + * 7.1.5 SCTP_NODELAY + * + * Turn on/off any Nagle-like algorithm. This means that packets are + * generally sent as soon as possible and no unnecessary delays are + * introduced, at the cost of more packets in the network. Expects an + * integer boolean flag. + */ + +static int sctp_getsockopt_nodelay(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + int val; + + if (len < sizeof(int)) + return -EINVAL; + + len = sizeof(int); + val = (sctp_sk(sk)->nodelay == 1); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + return 0; +} + +/* + * + * 7.1.1 SCTP_RTOINFO + * + * The protocol parameters used to initialize and bound retransmission + * timeout (RTO) are tunable. sctp_rtoinfo structure is used to access + * and modify these parameters. + * All parameters are time values, in milliseconds. A value of 0, when + * modifying the parameters, indicates that the current value should not + * be changed. 
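+ *
+ * Illustration only (not part of this patch): a sketch of querying the
+ * endpoint defaults, assuming <netinet/sctp.h> and an SCTP socket 'fd';
+ * an srto_assoc_id of 0 selects the endpoint values on a one-to-many
+ * socket:
+ *
+ *	struct sctp_rtoinfo rto = { .srto_assoc_id = 0 };
+ *	socklen_t len = sizeof(rto);
+ *
+ *	if (getsockopt(fd, IPPROTO_SCTP, SCTP_RTOINFO, &rto, &len) == 0)
+ *		printf("rto initial/min/max: %u/%u/%u ms\n",
+ *		       rto.srto_initial, rto.srto_min, rto.srto_max);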
+ * + */ +static int sctp_getsockopt_rtoinfo(struct sock *sk, int len, + char __user *optval, + int __user *optlen) { + struct sctp_rtoinfo rtoinfo; + struct sctp_association *asoc; + + if (len < sizeof (struct sctp_rtoinfo)) + return -EINVAL; + + len = sizeof(struct sctp_rtoinfo); + + if (copy_from_user(&rtoinfo, optval, len)) + return -EFAULT; + + asoc = sctp_id2assoc(sk, rtoinfo.srto_assoc_id); + + if (!asoc && rtoinfo.srto_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + /* Values corresponding to the specific association. */ + if (asoc) { + rtoinfo.srto_initial = jiffies_to_msecs(asoc->rto_initial); + rtoinfo.srto_max = jiffies_to_msecs(asoc->rto_max); + rtoinfo.srto_min = jiffies_to_msecs(asoc->rto_min); + } else { + /* Values corresponding to the endpoint. */ + struct sctp_sock *sp = sctp_sk(sk); + + rtoinfo.srto_initial = sp->rtoinfo.srto_initial; + rtoinfo.srto_max = sp->rtoinfo.srto_max; + rtoinfo.srto_min = sp->rtoinfo.srto_min; + } + + if (put_user(len, optlen)) + return -EFAULT; + + if (copy_to_user(optval, &rtoinfo, len)) + return -EFAULT; + + return 0; +} + +/* + * + * 7.1.2 SCTP_ASSOCINFO + * + * This option is used to tune the maximum retransmission attempts + * of the association. + * Returns an error if the new association retransmission value is + * greater than the sum of the retransmission value of the peer. + * See [SCTP] for more information. + * + */ +static int sctp_getsockopt_associnfo(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + + struct sctp_assocparams assocparams; + struct sctp_association *asoc; + struct list_head *pos; + int cnt = 0; + + if (len < sizeof (struct sctp_assocparams)) + return -EINVAL; + + len = sizeof(struct sctp_assocparams); + + if (copy_from_user(&assocparams, optval, len)) + return -EFAULT; + + asoc = sctp_id2assoc(sk, assocparams.sasoc_assoc_id); + + if (!asoc && assocparams.sasoc_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + /* Values correspoinding to the specific association */ + if (asoc) { + assocparams.sasoc_asocmaxrxt = asoc->max_retrans; + assocparams.sasoc_peer_rwnd = asoc->peer.rwnd; + assocparams.sasoc_local_rwnd = asoc->a_rwnd; + assocparams.sasoc_cookie_life = (asoc->cookie_life.tv_sec + * 1000) + + (asoc->cookie_life.tv_usec + / 1000); + + list_for_each(pos, &asoc->peer.transport_addr_list) { + cnt ++; + } + + assocparams.sasoc_number_peer_destinations = cnt; + } else { + /* Values corresponding to the endpoint */ + struct sctp_sock *sp = sctp_sk(sk); + + assocparams.sasoc_asocmaxrxt = sp->assocparams.sasoc_asocmaxrxt; + assocparams.sasoc_peer_rwnd = sp->assocparams.sasoc_peer_rwnd; + assocparams.sasoc_local_rwnd = sp->assocparams.sasoc_local_rwnd; + assocparams.sasoc_cookie_life = + sp->assocparams.sasoc_cookie_life; + assocparams.sasoc_number_peer_destinations = + sp->assocparams. + sasoc_number_peer_destinations; + } + + if (put_user(len, optlen)) + return -EFAULT; + + if (copy_to_user(optval, &assocparams, len)) + return -EFAULT; + + return 0; +} + +/* + * 7.1.16 Set/clear IPv4 mapped addresses (SCTP_I_WANT_MAPPED_V4_ADDR) + * + * This socket option is a boolean flag which turns on or off mapped V4 + * addresses. If this option is turned on and the socket is type + * PF_INET6, then IPv4 addresses will be mapped to V6 representation. + * If this option is turned off, then no mapping will be done of V4 + * addresses and a user will receive both PF_INET6 and PF_INET type + * addresses on the socket. 
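+ *
+ * Illustration only (not part of this patch): a sketch, assuming
+ * <netinet/sctp.h> and a PF_INET6 SCTP socket descriptor 'fd':
+ *
+ *	int mapped = 0;
+ *	socklen_t len = sizeof(mapped);
+ *
+ *	if (getsockopt(fd, IPPROTO_SCTP, SCTP_I_WANT_MAPPED_V4_ADDR,
+ *		       &mapped, &len) == 0)
+ *		printf("v4-mapped addresses %s\n", mapped ? "on" : "off");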
+ */ +static int sctp_getsockopt_mappedv4(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + int val; + struct sctp_sock *sp = sctp_sk(sk); + + if (len < sizeof(int)) + return -EINVAL; + + len = sizeof(int); + val = sp->v4mapped; + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + +/* + * 7.1.29. Set or Get the default context (SCTP_CONTEXT) + * (chapter and verse is quoted at sctp_setsockopt_context()) + */ +static int sctp_getsockopt_context(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_sock *sp; + struct sctp_association *asoc; + + if (len < sizeof(struct sctp_assoc_value)) + return -EINVAL; + + len = sizeof(struct sctp_assoc_value); + + if (copy_from_user(¶ms, optval, len)) + return -EFAULT; + + sp = sctp_sk(sk); + + if (params.assoc_id != 0) { + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc) + return -EINVAL; + params.assoc_value = asoc->default_rcv_context; + } else { + params.assoc_value = sp->default_rcv_context; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, ¶ms, len)) + return -EFAULT; + + return 0; +} + +/* + * 8.1.16. Get or Set the Maximum Fragmentation Size (SCTP_MAXSEG) + * This option will get or set the maximum size to put in any outgoing + * SCTP DATA chunk. If a message is larger than this size it will be + * fragmented by SCTP into the specified size. Note that the underlying + * SCTP implementation may fragment into smaller sized chunks when the + * PMTU of the underlying association is smaller than the value set by + * the user. The default value for this option is '0' which indicates + * the user is NOT limiting fragmentation and only the PMTU will effect + * SCTP's choice of DATA chunk size. Note also that values set larger + * than the maximum size of an IP datagram will effectively let SCTP + * control fragmentation (i.e. the same as setting this option to 0). + * + * The following structure is used to access and modify this parameter: + * + * struct sctp_assoc_value { + * sctp_assoc_t assoc_id; + * uint32_t assoc_value; + * }; + * + * assoc_id: This parameter is ignored for one-to-one style sockets. + * For one-to-many style sockets this parameter indicates which + * association the user is performing an action upon. Note that if + * this field's value is zero then the endpoints default value is + * changed (effecting future associations only). + * assoc_value: This parameter specifies the maximum size in bytes. 
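+ *
+ * Illustration only (not part of this patch): a sketch using the structure
+ * documented above, assuming <netinet/sctp.h> and an SCTP socket 'fd';
+ * an assoc_id of 0 reads the endpoint default:
+ *
+ *	struct sctp_assoc_value av = { .assoc_id = 0 };
+ *	socklen_t len = sizeof(av);
+ *
+ *	if (getsockopt(fd, IPPROTO_SCTP, SCTP_MAXSEG, &av, &len) == 0)
+ *		printf("maximum fragment size: %u bytes\n", av.assoc_value);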
+ */ +static int sctp_getsockopt_maxseg(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + + if (len == sizeof(int)) { + pr_warn("Use of int in maxseg socket option deprecated\n"); + pr_warn("Use struct sctp_assoc_value instead\n"); + params.assoc_id = 0; + } else if (len >= sizeof(struct sctp_assoc_value)) { + len = sizeof(struct sctp_assoc_value); + if (copy_from_user(¶ms, optval, sizeof(params))) + return -EFAULT; + } else + return -EINVAL; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + if (asoc) + params.assoc_value = asoc->frag_point; + else + params.assoc_value = sctp_sk(sk)->user_frag; + + if (put_user(len, optlen)) + return -EFAULT; + if (len == sizeof(int)) { + if (copy_to_user(optval, ¶ms.assoc_value, len)) + return -EFAULT; + } else { + if (copy_to_user(optval, ¶ms, len)) + return -EFAULT; + } + + return 0; +} + +/* + * 7.1.24. Get or set fragmented interleave (SCTP_FRAGMENT_INTERLEAVE) + * (chapter and verse is quoted at sctp_setsockopt_fragment_interleave()) + */ +static int sctp_getsockopt_fragment_interleave(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + int val; + + if (len < sizeof(int)) + return -EINVAL; + + len = sizeof(int); + + val = sctp_sk(sk)->frag_interleave; + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + +/* + * 7.1.25. Set or Get the sctp partial delivery point + * (chapter and verse is quoted at sctp_setsockopt_partial_delivery_point()) + */ +static int sctp_getsockopt_partial_delivery_point(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + u32 val; + + if (len < sizeof(u32)) + return -EINVAL; + + len = sizeof(u32); + + val = sctp_sk(sk)->pd_point; + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + +/* + * 7.1.28. 
Set or Get the maximum burst (SCTP_MAX_BURST) + * (chapter and verse is quoted at sctp_setsockopt_maxburst()) + */ +static int sctp_getsockopt_maxburst(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_sock *sp; + struct sctp_association *asoc; + + if (len == sizeof(int)) { + pr_warn("Use of int in max_burst socket option deprecated\n"); + pr_warn("Use struct sctp_assoc_value instead\n"); + params.assoc_id = 0; + } else if (len >= sizeof(struct sctp_assoc_value)) { + len = sizeof(struct sctp_assoc_value); + if (copy_from_user(¶ms, optval, len)) + return -EFAULT; + } else + return -EINVAL; + + sp = sctp_sk(sk); + + if (params.assoc_id != 0) { + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc) + return -EINVAL; + params.assoc_value = asoc->max_burst; + } else + params.assoc_value = sp->max_burst; + + if (len == sizeof(int)) { + if (copy_to_user(optval, ¶ms.assoc_value, len)) + return -EFAULT; + } else { + if (copy_to_user(optval, ¶ms, len)) + return -EFAULT; + } + + return 0; + +} + +static int sctp_getsockopt_hmac_ident(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + struct sctp_hmacalgo __user *p = (void __user *)optval; + struct sctp_hmac_algo_param *hmacs; + __u16 data_len = 0; + u32 num_idents; + + if (!sctp_auth_enable) + return -EACCES; + + hmacs = sctp_sk(sk)->ep->auth_hmacs_list; + data_len = ntohs(hmacs->param_hdr.length) - sizeof(sctp_paramhdr_t); + + if (len < sizeof(struct sctp_hmacalgo) + data_len) + return -EINVAL; + + len = sizeof(struct sctp_hmacalgo) + data_len; + num_idents = data_len / sizeof(u16); + + if (put_user(len, optlen)) + return -EFAULT; + if (put_user(num_idents, &p->shmac_num_idents)) + return -EFAULT; + if (copy_to_user(p->shmac_idents, hmacs->hmac_ids, data_len)) + return -EFAULT; + return 0; +} + +static int sctp_getsockopt_active_key(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + struct sctp_authkeyid val; + struct sctp_association *asoc; + + if (!sctp_auth_enable) + return -EACCES; + + if (len < sizeof(struct sctp_authkeyid)) + return -EINVAL; + if (copy_from_user(&val, optval, sizeof(struct sctp_authkeyid))) + return -EFAULT; + + asoc = sctp_id2assoc(sk, val.scact_assoc_id); + if (!asoc && val.scact_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + if (asoc) + val.scact_keynumber = asoc->active_key_id; + else + val.scact_keynumber = sctp_sk(sk)->ep->active_key_id; + + len = sizeof(struct sctp_authkeyid); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + +static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + struct sctp_authchunks __user *p = (void __user *)optval; + struct sctp_authchunks val; + struct sctp_association *asoc; + struct sctp_chunks_param *ch; + u32 num_chunks = 0; + char __user *to; + + if (!sctp_auth_enable) + return -EACCES; + + if (len < sizeof(struct sctp_authchunks)) + return -EINVAL; + + if (copy_from_user(&val, optval, sizeof(struct sctp_authchunks))) + return -EFAULT; + + to = p->gauth_chunks; + asoc = sctp_id2assoc(sk, val.gauth_assoc_id); + if (!asoc) + return -EINVAL; + + ch = asoc->peer.peer_chunks; + if (!ch) + goto num; + + /* See if the user provided enough room for all the data */ + num_chunks = ntohs(ch->param_hdr.length) - sizeof(sctp_paramhdr_t); + if (len < num_chunks) + return -EINVAL; + + if (copy_to_user(to, ch->chunks, num_chunks)) + return 
-EFAULT; +num: + len = sizeof(struct sctp_authchunks) + num_chunks; + if (put_user(len, optlen)) return -EFAULT; + if (put_user(num_chunks, &p->gauth_number_of_chunks)) + return -EFAULT; + return 0; +} + +static int sctp_getsockopt_local_auth_chunks(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + struct sctp_authchunks __user *p = (void __user *)optval; + struct sctp_authchunks val; + struct sctp_association *asoc; + struct sctp_chunks_param *ch; + u32 num_chunks = 0; + char __user *to; + + if (!sctp_auth_enable) + return -EACCES; + + if (len < sizeof(struct sctp_authchunks)) + return -EINVAL; + + if (copy_from_user(&val, optval, sizeof(struct sctp_authchunks))) + return -EFAULT; + + to = p->gauth_chunks; + asoc = sctp_id2assoc(sk, val.gauth_assoc_id); + if (!asoc && val.gauth_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + if (asoc) + ch = (struct sctp_chunks_param*)asoc->c.auth_chunks; + else + ch = sctp_sk(sk)->ep->auth_chunk_list; + + if (!ch) + goto num; + + num_chunks = ntohs(ch->param_hdr.length) - sizeof(sctp_paramhdr_t); + if (len < sizeof(struct sctp_authchunks) + num_chunks) + return -EINVAL; + + if (copy_to_user(to, ch->chunks, num_chunks)) + return -EFAULT; +num: + len = sizeof(struct sctp_authchunks) + num_chunks; + if (put_user(len, optlen)) + return -EFAULT; + if (put_user(num_chunks, &p->gauth_number_of_chunks)) + return -EFAULT; + + return 0; +} + +/* + * 8.2.5. Get the Current Number of Associations (SCTP_GET_ASSOC_NUMBER) + * This option gets the current number of associations that are attached + * to a one-to-many style socket. The option value is an uint32_t. + */ +static int sctp_getsockopt_assoc_number(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + struct sctp_sock *sp = sctp_sk(sk); + struct sctp_association *asoc; + u32 val = 0; + + if (sctp_style(sk, TCP)) + return -EOPNOTSUPP; + + if (len < sizeof(u32)) + return -EINVAL; + + len = sizeof(u32); + + list_for_each_entry(asoc, &(sp->ep->asocs), asocs) { + val++; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + +/* + * 8.2.6. Get the Current Identifiers of Associations + * (SCTP_GET_ASSOC_ID_LIST) + * + * This option gets the current list of SCTP association identifiers of + * the SCTP associations handled by a one-to-many style socket. + */ +static int sctp_getsockopt_assoc_ids(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + struct sctp_sock *sp = sctp_sk(sk); + struct sctp_association *asoc; + struct sctp_assoc_ids *ids; + u32 num = 0; + + if (sctp_style(sk, TCP)) + return -EOPNOTSUPP; + + if (len < sizeof(struct sctp_assoc_ids)) + return -EINVAL; + + list_for_each_entry(asoc, &(sp->ep->asocs), asocs) { + num++; + } + + if (len < sizeof(struct sctp_assoc_ids) + sizeof(sctp_assoc_t) * num) + return -EINVAL; + + len = sizeof(struct sctp_assoc_ids) + sizeof(sctp_assoc_t) * num; + + ids = kmalloc(len, GFP_KERNEL); + if (unlikely(!ids)) + return -ENOMEM; + + ids->gaids_number_of_ids = num; + num = 0; + list_for_each_entry(asoc, &(sp->ep->asocs), asocs) { + ids->gaids_assoc_id[num++] = asoc->assoc_id; + } + + if (put_user(len, optlen) || copy_to_user(optval, ids, len)) { + kfree(ids); + return -EFAULT; + } + + kfree(ids); + return 0; +} + +SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + int retval = 0; + int len; + + SCTP_DEBUG_PRINTK("sctp_getsockopt(sk: %p... 
optname: %d)\n", + sk, optname); + + /* I can hardly begin to describe how wrong this is. This is + * so broken as to be worse than useless. The API draft + * REALLY is NOT helpful here... I am not convinced that the + * semantics of getsockopt() with a level OTHER THAN SOL_SCTP + * are at all well-founded. + */ + if (level != SOL_SCTP) { + struct sctp_af *af = sctp_sk(sk)->pf->af; + + retval = af->getsockopt(sk, level, optname, optval, optlen); + return retval; + } + + if (get_user(len, optlen)) + return -EFAULT; + + sctp_lock_sock(sk); + + switch (optname) { + case SCTP_STATUS: + retval = sctp_getsockopt_sctp_status(sk, len, optval, optlen); + break; + case SCTP_DISABLE_FRAGMENTS: + retval = sctp_getsockopt_disable_fragments(sk, len, optval, + optlen); + break; + case SCTP_EVENTS: + retval = sctp_getsockopt_events(sk, len, optval, optlen); + break; + case SCTP_AUTOCLOSE: + retval = sctp_getsockopt_autoclose(sk, len, optval, optlen); + break; + case SCTP_SOCKOPT_PEELOFF: + retval = sctp_getsockopt_peeloff(sk, len, optval, optlen); + break; + case SCTP_PEER_ADDR_PARAMS: + retval = sctp_getsockopt_peer_addr_params(sk, len, optval, + optlen); + break; + case SCTP_DELAYED_SACK: + retval = sctp_getsockopt_delayed_ack(sk, len, optval, + optlen); + break; + case SCTP_INITMSG: + retval = sctp_getsockopt_initmsg(sk, len, optval, optlen); + break; + case SCTP_GET_PEER_ADDRS: + retval = sctp_getsockopt_peer_addrs(sk, len, optval, + optlen); + break; + case SCTP_GET_LOCAL_ADDRS: + retval = sctp_getsockopt_local_addrs(sk, len, optval, + optlen); + break; + case SCTP_SOCKOPT_CONNECTX3: + retval = sctp_getsockopt_connectx3(sk, len, optval, optlen); + break; + case SCTP_DEFAULT_SEND_PARAM: + retval = sctp_getsockopt_default_send_param(sk, len, + optval, optlen); + break; + case SCTP_PRIMARY_ADDR: + retval = sctp_getsockopt_primary_addr(sk, len, optval, optlen); + break; + case SCTP_NODELAY: + retval = sctp_getsockopt_nodelay(sk, len, optval, optlen); + break; + case SCTP_RTOINFO: + retval = sctp_getsockopt_rtoinfo(sk, len, optval, optlen); + break; + case SCTP_ASSOCINFO: + retval = sctp_getsockopt_associnfo(sk, len, optval, optlen); + break; + case SCTP_I_WANT_MAPPED_V4_ADDR: + retval = sctp_getsockopt_mappedv4(sk, len, optval, optlen); + break; + case SCTP_MAXSEG: + retval = sctp_getsockopt_maxseg(sk, len, optval, optlen); + break; + case SCTP_GET_PEER_ADDR_INFO: + retval = sctp_getsockopt_peer_addr_info(sk, len, optval, + optlen); + break; + case SCTP_ADAPTATION_LAYER: + retval = sctp_getsockopt_adaptation_layer(sk, len, optval, + optlen); + break; + case SCTP_CONTEXT: + retval = sctp_getsockopt_context(sk, len, optval, optlen); + break; + case SCTP_FRAGMENT_INTERLEAVE: + retval = sctp_getsockopt_fragment_interleave(sk, len, optval, + optlen); + break; + case SCTP_PARTIAL_DELIVERY_POINT: + retval = sctp_getsockopt_partial_delivery_point(sk, len, optval, + optlen); + break; + case SCTP_MAX_BURST: + retval = sctp_getsockopt_maxburst(sk, len, optval, optlen); + break; + case SCTP_AUTH_KEY: + case SCTP_AUTH_CHUNK: + case SCTP_AUTH_DELETE_KEY: + retval = -EOPNOTSUPP; + break; + case SCTP_HMAC_IDENT: + retval = sctp_getsockopt_hmac_ident(sk, len, optval, optlen); + break; + case SCTP_AUTH_ACTIVE_KEY: + retval = sctp_getsockopt_active_key(sk, len, optval, optlen); + break; + case SCTP_PEER_AUTH_CHUNKS: + retval = sctp_getsockopt_peer_auth_chunks(sk, len, optval, + optlen); + break; + case SCTP_LOCAL_AUTH_CHUNKS: + retval = sctp_getsockopt_local_auth_chunks(sk, len, optval, + optlen); + break; + case 
SCTP_GET_ASSOC_NUMBER: + retval = sctp_getsockopt_assoc_number(sk, len, optval, optlen); + break; + case SCTP_GET_ASSOC_ID_LIST: + retval = sctp_getsockopt_assoc_ids(sk, len, optval, optlen); + break; + default: + retval = -ENOPROTOOPT; + break; + } + + sctp_release_sock(sk); + return retval; +} + +static void sctp_hash(struct sock *sk) +{ + /* STUB */ +} + +static void sctp_unhash(struct sock *sk) +{ + /* STUB */ +} + +/* Check if port is acceptable. Possibly find first available port. + * + * The port hash table (contained in the 'global' SCTP protocol storage + * returned by struct sctp_protocol *sctp_get_protocol()). The hash + * table is an array of 4096 lists (sctp_bind_hashbucket). Each + * list (the list number is the port number hashed out, so as you + * would expect from a hash function, all the ports in a given list have + * such a number that hashes out to the same list number; you were + * expecting that, right?); so each list has a set of ports, with a + * link to the socket (struct sock) that uses it, the port number and + * a fastreuse flag (FIXME: NPI ipg). + */ +static struct sctp_bind_bucket *sctp_bucket_create( + struct sctp_bind_hashbucket *head, unsigned short snum); + +static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) +{ + struct sctp_bind_hashbucket *head; /* hash list */ + struct sctp_bind_bucket *pp; /* hash list port iterator */ + struct hlist_node *node; + unsigned short snum; + int ret; + + snum = ntohs(addr->v4.sin_port); + + SCTP_DEBUG_PRINTK("sctp_get_port() begins, snum=%d\n", snum); + sctp_local_bh_disable(); + + if (snum == 0) { + /* Search for an available port. */ + int low, high, remaining, index; + unsigned int rover; + + inet_get_local_port_range(&low, &high); + remaining = (high - low) + 1; + rover = net_random() % remaining + low; + + do { + rover++; + if ((rover < low) || (rover > high)) + rover = low; + if (inet_is_reserved_local_port(rover)) + continue; + index = sctp_phashfn(rover); + head = &sctp_port_hashtable[index]; + sctp_spin_lock(&head->lock); + sctp_for_each_hentry(pp, node, &head->chain) + if (pp->port == rover) + goto next; + break; + next: + sctp_spin_unlock(&head->lock); + } while (--remaining > 0); + + /* Exhausted local port range during search? */ + ret = 1; + if (remaining <= 0) + goto fail; + + /* OK, here is the one we will use. HEAD (the port + * hash table list entry) is non-NULL and we hold it's + * mutex. + */ + snum = rover; + } else { + /* We are given an specific port number; we verify + * that it is not being used. If it is used, we will + * exahust the search in the hash list corresponding + * to the port number (snum) - we detect that with the + * port iterator, pp being NULL. + */ + head = &sctp_port_hashtable[sctp_phashfn(snum)]; + sctp_spin_lock(&head->lock); + sctp_for_each_hentry(pp, node, &head->chain) { + if (pp->port == snum) + goto pp_found; + } + } + pp = NULL; + goto pp_not_found; +pp_found: + if (!hlist_empty(&pp->owner)) { + /* We had a port hash table hit - there is an + * available port (pp != NULL) and it is being + * used by other socket (pp->owner not empty); that other + * socket is going to be sk2. + */ + int reuse = sk->sk_reuse; + struct sock *sk2; + + SCTP_DEBUG_PRINTK("sctp_get_port() found a possible match\n"); + if (pp->fastreuse && sk->sk_reuse && + sk->sk_state != SCTP_SS_LISTENING) + goto success; + + /* Run through the list of sockets bound to the port + * (pp->port) [via the pointers bind_next and + * bind_pprev in the struct sock *sk2 (pp->sk)]. 
On each one, + * we get the endpoint they describe and run through + * the endpoint's list of IP (v4 or v6) addresses, + * comparing each of the addresses with the address of + * the socket sk. If we find a match, then that means + * that this port/socket (sk) combination are already + * in an endpoint. + */ + sk_for_each_bound(sk2, node, &pp->owner) { + struct sctp_endpoint *ep2; + ep2 = sctp_sk(sk2)->ep; + + if (sk == sk2 || + (reuse && sk2->sk_reuse && + sk2->sk_state != SCTP_SS_LISTENING)) + continue; + + if (sctp_bind_addr_conflict(&ep2->base.bind_addr, addr, + sctp_sk(sk2), sctp_sk(sk))) { + ret = (long)sk2; + goto fail_unlock; + } + } + SCTP_DEBUG_PRINTK("sctp_get_port(): Found a match\n"); + } +pp_not_found: + /* If there was a hash table miss, create a new port. */ + ret = 1; + if (!pp && !(pp = sctp_bucket_create(head, snum))) + goto fail_unlock; + + /* In either case (hit or miss), make sure fastreuse is 1 only + * if sk->sk_reuse is too (that is, if the caller requested + * SO_REUSEADDR on this socket -sk-). + */ + if (hlist_empty(&pp->owner)) { + if (sk->sk_reuse && sk->sk_state != SCTP_SS_LISTENING) + pp->fastreuse = 1; + else + pp->fastreuse = 0; + } else if (pp->fastreuse && + (!sk->sk_reuse || sk->sk_state == SCTP_SS_LISTENING)) + pp->fastreuse = 0; + + /* We are set, so fill up all the data in the hash table + * entry, tie the socket list information with the rest of the + * sockets FIXME: Blurry, NPI (ipg). + */ +success: + if (!sctp_sk(sk)->bind_hash) { + inet_sk(sk)->inet_num = snum; + sk_add_bind_node(sk, &pp->owner); + sctp_sk(sk)->bind_hash = pp; + } + ret = 0; + +fail_unlock: + sctp_spin_unlock(&head->lock); + +fail: + sctp_local_bh_enable(); + return ret; +} + +/* Assign a 'snum' port to the socket. If snum == 0, an ephemeral + * port is requested. + */ +static int sctp_get_port(struct sock *sk, unsigned short snum) +{ + long ret; + union sctp_addr addr; + struct sctp_af *af = sctp_sk(sk)->pf->af; + + /* Set up a dummy address struct from the sk. */ + af->from_sk(&addr, sk); + addr.v4.sin_port = htons(snum); + + /* Note: sk->sk_num gets filled in if ephemeral port request. */ + ret = sctp_get_port_local(sk, &addr); + + return ret ? 1 : 0; +} + +/* + * Move a socket to LISTENING state. + */ +SCTP_STATIC int sctp_listen_start(struct sock *sk, int backlog) +{ + struct sctp_sock *sp = sctp_sk(sk); + struct sctp_endpoint *ep = sp->ep; + struct crypto_hash *tfm = NULL; + + /* Allocate HMAC for generating cookie. */ + if (!sctp_sk(sk)->hmac && sctp_hmac_alg) { + tfm = crypto_alloc_hash(sctp_hmac_alg, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) { + if (net_ratelimit()) { + pr_info("failed to load transform for %s: %ld\n", + sctp_hmac_alg, PTR_ERR(tfm)); + } + return -ENOSYS; + } + sctp_sk(sk)->hmac = tfm; + } + + /* + * If a bind() or sctp_bindx() is not called prior to a listen() + * call that allows new associations to be accepted, the system + * picks an ephemeral port and will choose an address set equivalent + * to binding with a wildcard address. + * + * This is not currently spelled out in the SCTP sockets + * extensions draft, but follows the practice as seen in TCP + * sockets. 
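+ *
+ * Illustration only (not part of this patch): the behaviour described
+ * above means a user-space fragment like the following is valid even
+ * though bind() was never called (assuming <netinet/sctp.h>):
+ *
+ *	int fd = socket(AF_INET, SOCK_SEQPACKET, IPPROTO_SCTP);
+ *	listen(fd, 5);
+ *
+ * and the stack picks an ephemeral port and a wildcard address set, as
+ * noted above.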
+ * + */ + sk->sk_state = SCTP_SS_LISTENING; + if (!ep->base.bind_addr.port) { + if (sctp_autobind(sk)) + return -EAGAIN; + } else { + if (sctp_get_port(sk, inet_sk(sk)->inet_num)) { + sk->sk_state = SCTP_SS_CLOSED; + return -EADDRINUSE; + } + } + + sk->sk_max_ack_backlog = backlog; + sctp_hash_endpoint(ep); + return 0; +} + +/* + * 4.1.3 / 5.1.3 listen() + * + * By default, new associations are not accepted for UDP style sockets. + * An application uses listen() to mark a socket as being able to + * accept new associations. + * + * On TCP style sockets, applications use listen() to ready the SCTP + * endpoint for accepting inbound associations. + * + * On both types of endpoints a backlog of '0' disables listening. + * + * Move a socket to LISTENING state. + */ +int sctp_inet_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + struct sctp_endpoint *ep = sctp_sk(sk)->ep; + int err = -EINVAL; + + if (unlikely(backlog < 0)) + return err; + + sctp_lock_sock(sk); + + /* Peeled-off sockets are not allowed to listen(). */ + if (sctp_style(sk, UDP_HIGH_BANDWIDTH)) + goto out; + + if (sock->state != SS_UNCONNECTED) + goto out; + + /* If backlog is zero, disable listening. */ + if (!backlog) { + if (sctp_sstate(sk, CLOSED)) + goto out; + + err = 0; + sctp_unhash_endpoint(ep); + sk->sk_state = SCTP_SS_CLOSED; + if (sk->sk_reuse) + sctp_sk(sk)->bind_hash->fastreuse = 1; + goto out; + } + + /* If we are already listening, just update the backlog */ + if (sctp_sstate(sk, LISTENING)) + sk->sk_max_ack_backlog = backlog; + else { + err = sctp_listen_start(sk, backlog); + if (err) + goto out; + } + + err = 0; +out: + sctp_release_sock(sk); + return err; +} + +/* + * This function is done by modeling the current datagram_poll() and the + * tcp_poll(). Note that, based on these implementations, we don't + * lock the socket in this function, even though it seems that, + * ideally, locking or some other mechanisms can be used to ensure + * the integrity of the counters (sndbuf and wmem_alloc) used + * in this place. We assume that we don't need locks either until proven + * otherwise. + * + * Another thing to note is that we include the Async I/O support + * here, again, by modeling the current TCP/UDP code. We don't have + * a good way to test with it yet. + */ +unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait) +{ + struct sock *sk = sock->sk; + struct sctp_sock *sp = sctp_sk(sk); + unsigned int mask; + + poll_wait(file, sk_sleep(sk), wait); + + /* A TCP-style listening socket becomes readable when the accept queue + * is not empty. + */ + if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) + return (!list_empty(&sp->ep->asocs)) ? + (POLLIN | POLLRDNORM) : 0; + + mask = 0; + + /* Is there any exceptional events? */ + if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + mask |= POLLERR; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= POLLRDHUP | POLLIN | POLLRDNORM; + if (sk->sk_shutdown == SHUTDOWN_MASK) + mask |= POLLHUP; + + /* Is it readable? Reconsider this code with TCP-style support. */ + if (!skb_queue_empty(&sk->sk_receive_queue)) + mask |= POLLIN | POLLRDNORM; + + /* The association is either gone or not ready. */ + if (!sctp_style(sk, UDP) && sctp_sstate(sk, CLOSED)) + return mask; + + /* Is it writable? 
*/ + if (sctp_writeable(sk)) { + mask |= POLLOUT | POLLWRNORM; + } else { + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + /* + * Since the socket is not locked, the buffer + * might be made available after the writeable check and + * before the bit is set. This could cause a lost I/O + * signal. tcp_poll() has a race breaker for this race + * condition. Based on their implementation, we put + * in the following code to cover it as well. + */ + if (sctp_writeable(sk)) + mask |= POLLOUT | POLLWRNORM; + } + return mask; +} + +/******************************************************************** + * 2nd Level Abstractions + ********************************************************************/ + +static struct sctp_bind_bucket *sctp_bucket_create( + struct sctp_bind_hashbucket *head, unsigned short snum) +{ + struct sctp_bind_bucket *pp; + + pp = kmem_cache_alloc(sctp_bucket_cachep, GFP_ATOMIC); + if (pp) { + SCTP_DBG_OBJCNT_INC(bind_bucket); + pp->port = snum; + pp->fastreuse = 0; + INIT_HLIST_HEAD(&pp->owner); + hlist_add_head(&pp->node, &head->chain); + } + return pp; +} + +/* Caller must hold hashbucket lock for this tb with local BH disabled */ +static void sctp_bucket_destroy(struct sctp_bind_bucket *pp) +{ + if (pp && hlist_empty(&pp->owner)) { + __hlist_del(&pp->node); + kmem_cache_free(sctp_bucket_cachep, pp); + SCTP_DBG_OBJCNT_DEC(bind_bucket); + } +} + +/* Release this socket's reference to a local port. */ +static inline void __sctp_put_port(struct sock *sk) +{ + struct sctp_bind_hashbucket *head = + &sctp_port_hashtable[sctp_phashfn(inet_sk(sk)->inet_num)]; + struct sctp_bind_bucket *pp; + + sctp_spin_lock(&head->lock); + pp = sctp_sk(sk)->bind_hash; + __sk_del_bind_node(sk); + sctp_sk(sk)->bind_hash = NULL; + inet_sk(sk)->inet_num = 0; + sctp_bucket_destroy(pp); + sctp_spin_unlock(&head->lock); +} + +void sctp_put_port(struct sock *sk) +{ + sctp_local_bh_disable(); + __sctp_put_port(sk); + sctp_local_bh_enable(); +} + +/* + * The system picks an ephemeral port and choose an address set equivalent + * to binding with a wildcard address. + * One of those addresses will be the primary address for the association. + * This automatically enables the multihoming capability of SCTP. + */ +static int sctp_autobind(struct sock *sk) +{ + union sctp_addr autoaddr; + struct sctp_af *af; + __be16 port; + + /* Initialize a local sockaddr structure to INADDR_ANY. */ + af = sctp_sk(sk)->pf->af; + + port = htons(inet_sk(sk)->inet_num); + af->inaddr_any(&autoaddr, port); + + return sctp_do_bind(sk, &autoaddr, af->sockaddr_len); +} + +/* Parse out IPPROTO_SCTP CMSG headers. Perform only minimal validation. + * + * From RFC 2292 + * 4.2 The cmsghdr Structure * + * + * When ancillary data is sent or received, any number of ancillary data + * objects can be specified by the msg_control and msg_controllen members of + * the msghdr structure, because each object is preceded by + * a cmsghdr structure defining the object's length (the cmsg_len member). + * Historically Berkeley-derived implementations have passed only one object + * at a time, but this API allows multiple objects to be + * passed in a single call to sendmsg() or recvmsg(). The following example + * shows two ancillary data objects in a control buffer. 
+ * + * |<--------------------------- msg_controllen -------------------------->| + * | | + * + * |<----- ancillary data object ----->|<----- ancillary data object ----->| + * + * |<---------- CMSG_SPACE() --------->|<---------- CMSG_SPACE() --------->| + * | | | + * + * |<---------- cmsg_len ---------->| |<--------- cmsg_len ----------->| | + * + * |<--------- CMSG_LEN() --------->| |<-------- CMSG_LEN() ---------->| | + * | | | | | + * + * +-----+-----+-----+--+-----------+--+-----+-----+-----+--+-----------+--+ + * |cmsg_|cmsg_|cmsg_|XX| |XX|cmsg_|cmsg_|cmsg_|XX| |XX| + * + * |len |level|type |XX|cmsg_data[]|XX|len |level|type |XX|cmsg_data[]|XX| + * + * +-----+-----+-----+--+-----------+--+-----+-----+-----+--+-----------+--+ + * ^ + * | + * + * msg_control + * points here + */ +SCTP_STATIC int sctp_msghdr_parse(const struct msghdr *msg, + sctp_cmsgs_t *cmsgs) +{ + struct cmsghdr *cmsg; + struct msghdr *my_msg = (struct msghdr *)msg; + + for (cmsg = CMSG_FIRSTHDR(msg); + cmsg != NULL; + cmsg = CMSG_NXTHDR(my_msg, cmsg)) { + if (!CMSG_OK(my_msg, cmsg)) + return -EINVAL; + + /* Should we parse this header or ignore? */ + if (cmsg->cmsg_level != IPPROTO_SCTP) + continue; + + /* Strictly check lengths following example in SCM code. */ + switch (cmsg->cmsg_type) { + case SCTP_INIT: + /* SCTP Socket API Extension + * 5.2.1 SCTP Initiation Structure (SCTP_INIT) + * + * This cmsghdr structure provides information for + * initializing new SCTP associations with sendmsg(). + * The SCTP_INITMSG socket option uses this same data + * structure. This structure is not used for + * recvmsg(). + * + * cmsg_level cmsg_type cmsg_data[] + * ------------ ------------ ---------------------- + * IPPROTO_SCTP SCTP_INIT struct sctp_initmsg + */ + if (cmsg->cmsg_len != + CMSG_LEN(sizeof(struct sctp_initmsg))) + return -EINVAL; + cmsgs->init = (struct sctp_initmsg *)CMSG_DATA(cmsg); + break; + + case SCTP_SNDRCV: + /* SCTP Socket API Extension + * 5.2.2 SCTP Header Information Structure(SCTP_SNDRCV) + * + * This cmsghdr structure specifies SCTP options for + * sendmsg() and describes SCTP header information + * about a received message through recvmsg(). + * + * cmsg_level cmsg_type cmsg_data[] + * ------------ ------------ ---------------------- + * IPPROTO_SCTP SCTP_SNDRCV struct sctp_sndrcvinfo + */ + if (cmsg->cmsg_len != + CMSG_LEN(sizeof(struct sctp_sndrcvinfo))) + return -EINVAL; + + cmsgs->info = + (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); + + /* Minimally, validate the sinfo_flags. */ + if (cmsgs->info->sinfo_flags & + ~(SCTP_UNORDERED | SCTP_ADDR_OVER | + SCTP_ABORT | SCTP_EOF)) + return -EINVAL; + break; + + default: + return -EINVAL; + } + } + return 0; +} + +/* + * Wait for a packet.. + * Note: This function is the same function as in core/datagram.c + * with a few modifications to make lksctp work. + */ +static int sctp_wait_for_packet(struct sock * sk, int *err, long *timeo_p) +{ + int error; + DEFINE_WAIT(wait); + + prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + + /* Socket errors? */ + error = sock_error(sk); + if (error) + goto out; + + if (!skb_queue_empty(&sk->sk_receive_queue)) + goto ready; + + /* Socket shut down? */ + if (sk->sk_shutdown & RCV_SHUTDOWN) + goto out; + + /* Sequenced packets can come disconnected. If so we report the + * problem. + */ + error = -ENOTCONN; + + /* Is there a good reason to think that we may receive some data? */ + if (list_empty(&sctp_sk(sk)->ep->asocs) && !sctp_sstate(sk, LISTENING)) + goto out; + + /* Handle signals. 
*/ + if (signal_pending(current)) + goto interrupted; + + /* Let another process have a go. Since we are going to sleep + * anyway. Note: This may cause odd behaviors if the message + * does not fit in the user's buffer, but this seems to be the + * only way to honor MSG_DONTWAIT realistically. + */ + sctp_release_sock(sk); + *timeo_p = schedule_timeout(*timeo_p); + sctp_lock_sock(sk); + +ready: + finish_wait(sk_sleep(sk), &wait); + return 0; + +interrupted: + error = sock_intr_errno(*timeo_p); + +out: + finish_wait(sk_sleep(sk), &wait); + *err = error; + return error; +} + +/* Receive a datagram. + * Note: This is pretty much the same routine as in core/datagram.c + * with a few changes to make lksctp work. + */ +static struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags, + int noblock, int *err) +{ + int error; + struct sk_buff *skb; + long timeo; + + timeo = sock_rcvtimeo(sk, noblock); + + SCTP_DEBUG_PRINTK("Timeout: timeo: %ld, MAX: %ld.\n", + timeo, MAX_SCHEDULE_TIMEOUT); + + do { + /* Again only user level code calls this function, + * so nothing interrupt level + * will suddenly eat the receive_queue. + * + * Look at current nfs client by the way... + * However, this function was correct in any case. 8) + */ + if (flags & MSG_PEEK) { + spin_lock_bh(&sk->sk_receive_queue.lock); + skb = skb_peek(&sk->sk_receive_queue); + if (skb) + atomic_inc(&skb->users); + spin_unlock_bh(&sk->sk_receive_queue.lock); + } else { + skb = skb_dequeue(&sk->sk_receive_queue); + } + + if (skb) + return skb; + + /* Caller is allowed not to check sk->sk_err before calling. */ + error = sock_error(sk); + if (error) + goto no_packet; + + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + + /* User doesn't want to wait. */ + error = -EAGAIN; + if (!timeo) + goto no_packet; + } while (sctp_wait_for_packet(sk, err, &timeo) == 0); + + return NULL; + +no_packet: + *err = error; + return NULL; +} + +/* If sndbuf has changed, wake up per association sndbuf waiters. */ +static void __sctp_write_space(struct sctp_association *asoc) +{ + struct sock *sk = asoc->base.sk; + struct socket *sock = sk->sk_socket; + + if ((sctp_wspace(asoc) > 0) && sock) { + if (waitqueue_active(&asoc->wait)) + wake_up_interruptible(&asoc->wait); + + if (sctp_writeable(sk)) { + wait_queue_head_t *wq = sk_sleep(sk); + + if (wq && waitqueue_active(wq)) + wake_up_interruptible(wq); + + /* Note that we try to include the Async I/O support + * here by modeling from the current TCP/UDP code. + * We have not tested with it yet. + */ + if (!(sk->sk_shutdown & SEND_SHUTDOWN)) + sock_wake_async(sock, + SOCK_WAKE_SPACE, POLL_OUT); + } + } +} + +/* Do accounting for the sndbuf space. + * Decrement the used sndbuf space of the corresponding association by the + * data size which was just transmitted(freed). + */ +static void sctp_wfree(struct sk_buff *skb) +{ + struct sctp_association *asoc; + struct sctp_chunk *chunk; + struct sock *sk; + + /* Get the saved chunk pointer. */ + chunk = *((struct sctp_chunk **)(skb->cb)); + asoc = chunk->asoc; + sk = asoc->base.sk; + asoc->sndbuf_used -= SCTP_DATA_SNDSIZE(chunk) + + sizeof(struct sk_buff) + + sizeof(struct sctp_chunk); + + atomic_sub(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc); + + /* + * This undoes what is done via sctp_set_owner_w and sk_mem_charge + */ + sk->sk_wmem_queued -= skb->truesize; + sk_mem_uncharge(sk, skb->truesize); + + sock_wfree(skb); + __sctp_write_space(asoc); + + sctp_association_put(asoc); +} + +/* Do accounting for the receive space on the socket. 
+ * Accounting for the association is done in ulpevent.c + * We set this as a destructor for the cloned data skbs so that + * accounting is done at the correct time. + */ +void sctp_sock_rfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct sctp_ulpevent *event = sctp_skb2event(skb); + + atomic_sub(event->rmem_len, &sk->sk_rmem_alloc); + + /* + * Mimic the behavior of sock_rfree + */ + sk_mem_uncharge(sk, event->rmem_len); +} + + +/* Helper function to wait for space in the sndbuf. */ +static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, + size_t msg_len) +{ + struct sock *sk = asoc->base.sk; + int err = 0; + long current_timeo = *timeo_p; + DEFINE_WAIT(wait); + + SCTP_DEBUG_PRINTK("wait_for_sndbuf: asoc=%p, timeo=%ld, msg_len=%zu\n", + asoc, (long)(*timeo_p), msg_len); + + /* Increment the association's refcnt. */ + sctp_association_hold(asoc); + + /* Wait on the association specific sndbuf space. */ + for (;;) { + prepare_to_wait_exclusive(&asoc->wait, &wait, + TASK_INTERRUPTIBLE); + if (!*timeo_p) + goto do_nonblock; + if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING || + asoc->base.dead) + goto do_error; + if (signal_pending(current)) + goto do_interrupted; + if (msg_len <= sctp_wspace(asoc)) + break; + + /* Let another process have a go. Since we are going + * to sleep anyway. + */ + sctp_release_sock(sk); + current_timeo = schedule_timeout(current_timeo); + BUG_ON(sk != asoc->base.sk); + sctp_lock_sock(sk); + + *timeo_p = current_timeo; + } + +out: + finish_wait(&asoc->wait, &wait); + + /* Release the association's refcnt. */ + sctp_association_put(asoc); + + return err; + +do_error: + err = -EPIPE; + goto out; + +do_interrupted: + err = sock_intr_errno(*timeo_p); + goto out; + +do_nonblock: + err = -EAGAIN; + goto out; +} + +void sctp_data_ready(struct sock *sk, int len) +{ + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, POLLIN | + POLLRDNORM | POLLRDBAND); + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + rcu_read_unlock(); +} + +/* If socket sndbuf has changed, wake up all per association waiters. */ +void sctp_write_space(struct sock *sk) +{ + struct sctp_association *asoc; + + /* Wake up the tasks in each wait queue. */ + list_for_each_entry(asoc, &((sctp_sk(sk))->ep->asocs), asocs) { + __sctp_write_space(asoc); + } +} + +/* Is there any sndbuf space available on the socket? + * + * Note that sk_wmem_alloc is the sum of the send buffers on all of the + * associations on the same socket. For a UDP-style socket with + * multiple associations, it is possible for it to be "unwriteable" + * prematurely. I assume that this is acceptable because + * a premature "unwriteable" is better than an accidental "writeable" which + * would cause an unwanted block under certain circumstances. For the 1-1 + * UDP-style sockets or TCP-style sockets, this code should work. + * - Daisy + */ +static int sctp_writeable(struct sock *sk) +{ + int amt = 0; + + amt = sk->sk_sndbuf - sk_wmem_alloc_get(sk); + if (amt < 0) + amt = 0; + return amt; +} + +/* Wait for an association to go into ESTABLISHED state. If timeout is 0, + * returns immediately with EINPROGRESS. 
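+ *
+ * Illustration only (not part of this patch): the EINPROGRESS case is what
+ * a non-blocking user-space caller sees; one common (assumed) pattern is
+ * to wait for writability afterwards, with 'peer' prepared elsewhere:
+ *
+ *	if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0 &&
+ *	    errno == EINPROGRESS) {
+ *		struct pollfd pfd = { .fd = fd, .events = POLLOUT };
+ *		poll(&pfd, 1, -1);
+ *	}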
+ */ +static int sctp_wait_for_connect(struct sctp_association *asoc, long *timeo_p) +{ + struct sock *sk = asoc->base.sk; + int err = 0; + long current_timeo = *timeo_p; + DEFINE_WAIT(wait); + + SCTP_DEBUG_PRINTK("%s: asoc=%p, timeo=%ld\n", __func__, asoc, + (long)(*timeo_p)); + + /* Increment the association's refcnt. */ + sctp_association_hold(asoc); + + for (;;) { + prepare_to_wait_exclusive(&asoc->wait, &wait, + TASK_INTERRUPTIBLE); + if (!*timeo_p) + goto do_nonblock; + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING || + asoc->base.dead) + goto do_error; + if (signal_pending(current)) + goto do_interrupted; + + if (sctp_state(asoc, ESTABLISHED)) + break; + + /* Let another process have a go. Since we are going + * to sleep anyway. + */ + sctp_release_sock(sk); + current_timeo = schedule_timeout(current_timeo); + sctp_lock_sock(sk); + + *timeo_p = current_timeo; + } + +out: + finish_wait(&asoc->wait, &wait); + + /* Release the association's refcnt. */ + sctp_association_put(asoc); + + return err; + +do_error: + if (asoc->init_err_counter + 1 > asoc->max_init_attempts) + err = -ETIMEDOUT; + else + err = -ECONNREFUSED; + goto out; + +do_interrupted: + err = sock_intr_errno(*timeo_p); + goto out; + +do_nonblock: + err = -EINPROGRESS; + goto out; +} + +static int sctp_wait_for_accept(struct sock *sk, long timeo) +{ + struct sctp_endpoint *ep; + int err = 0; + DEFINE_WAIT(wait); + + ep = sctp_sk(sk)->ep; + + + for (;;) { + prepare_to_wait_exclusive(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + + if (list_empty(&ep->asocs)) { + sctp_release_sock(sk); + timeo = schedule_timeout(timeo); + sctp_lock_sock(sk); + } + + err = -EINVAL; + if (!sctp_sstate(sk, LISTENING)) + break; + + err = 0; + if (!list_empty(&ep->asocs)) + break; + + err = sock_intr_errno(timeo); + if (signal_pending(current)) + break; + + err = -EAGAIN; + if (!timeo) + break; + } + + finish_wait(sk_sleep(sk), &wait); + + return err; +} + +static void sctp_wait_for_close(struct sock *sk, long timeout) +{ + DEFINE_WAIT(wait); + + do { + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + if (list_empty(&sctp_sk(sk)->ep->asocs)) + break; + sctp_release_sock(sk); + timeout = schedule_timeout(timeout); + sctp_lock_sock(sk); + } while (!signal_pending(current) && timeout); + + finish_wait(sk_sleep(sk), &wait); +} + +static void sctp_skb_set_owner_r_frag(struct sk_buff *skb, struct sock *sk) +{ + struct sk_buff *frag; + + if (!skb->data_len) + goto done; + + /* Don't forget the fragments. 
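+ * The walk below recurses into each fragment so that any nested
+ * frag_list skbs are charged to the new owner as well.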
*/ + skb_walk_frags(skb, frag) + sctp_skb_set_owner_r_frag(frag, sk); + +done: + sctp_skb_set_owner_r(skb, sk); +} + +void sctp_copy_sock(struct sock *newsk, struct sock *sk, + struct sctp_association *asoc) +{ + struct inet_sock *inet = inet_sk(sk); + struct inet_sock *newinet; + + newsk->sk_type = sk->sk_type; + newsk->sk_bound_dev_if = sk->sk_bound_dev_if; + newsk->sk_flags = sk->sk_flags; + newsk->sk_no_check = sk->sk_no_check; + newsk->sk_reuse = sk->sk_reuse; + + newsk->sk_shutdown = sk->sk_shutdown; + newsk->sk_destruct = inet_sock_destruct; + newsk->sk_family = sk->sk_family; + newsk->sk_protocol = IPPROTO_SCTP; + newsk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; + newsk->sk_sndbuf = sk->sk_sndbuf; + newsk->sk_rcvbuf = sk->sk_rcvbuf; + newsk->sk_lingertime = sk->sk_lingertime; + newsk->sk_rcvtimeo = sk->sk_rcvtimeo; + newsk->sk_sndtimeo = sk->sk_sndtimeo; + + newinet = inet_sk(newsk); + + /* Initialize sk's sport, dport, rcv_saddr and daddr for + * getsockname() and getpeername() + */ + newinet->inet_sport = inet->inet_sport; + newinet->inet_saddr = inet->inet_saddr; + newinet->inet_rcv_saddr = inet->inet_rcv_saddr; + newinet->inet_dport = htons(asoc->peer.port); + newinet->pmtudisc = inet->pmtudisc; + newinet->inet_id = asoc->next_tsn ^ jiffies; + + newinet->uc_ttl = inet->uc_ttl; + newinet->mc_loop = 1; + newinet->mc_ttl = 1; + newinet->mc_index = 0; + newinet->mc_list = NULL; +} + +/* Populate the fields of the newsk from the oldsk and migrate the assoc + * and its messages to the newsk. + */ +static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, + struct sctp_association *assoc, + sctp_socket_type_t type) +{ + struct sctp_sock *oldsp = sctp_sk(oldsk); + struct sctp_sock *newsp = sctp_sk(newsk); + struct sctp_bind_bucket *pp; /* hash list port iterator */ + struct sctp_endpoint *newep = newsp->ep; + struct sk_buff *skb, *tmp; + struct sctp_ulpevent *event; + struct sctp_bind_hashbucket *head; + + /* Migrate socket buffer sizes and all the socket level options to the + * new socket. + */ + newsk->sk_sndbuf = oldsk->sk_sndbuf; + newsk->sk_rcvbuf = oldsk->sk_rcvbuf; + /* Brute force copy old sctp opt. */ + inet_sk_copy_descendant(newsk, oldsk); + + /* Restore the ep value that was overwritten with the above structure + * copy. + */ + newsp->ep = newep; + newsp->hmac = NULL; + + /* Hook this new socket in to the bind_hash list. */ + head = &sctp_port_hashtable[sctp_phashfn(inet_sk(oldsk)->inet_num)]; + sctp_local_bh_disable(); + sctp_spin_lock(&head->lock); + pp = sctp_sk(oldsk)->bind_hash; + sk_add_bind_node(newsk, &pp->owner); + sctp_sk(newsk)->bind_hash = pp; + inet_sk(newsk)->inet_num = inet_sk(oldsk)->inet_num; + sctp_spin_unlock(&head->lock); + sctp_local_bh_enable(); + + /* Copy the bind_addr list from the original endpoint to the new + * endpoint so that we can handle restarts properly + */ + sctp_bind_addr_dup(&newsp->ep->base.bind_addr, + &oldsp->ep->base.bind_addr, GFP_KERNEL); + + /* Move any messages in the old socket's receive queue that are for the + * peeled off association to the new socket's receive queue. + */ + sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) { + event = sctp_skb2event(skb); + if (event->asoc == assoc) { + __skb_unlink(skb, &oldsk->sk_receive_queue); + __skb_queue_tail(&newsk->sk_receive_queue, skb); + sctp_skb_set_owner_r_frag(skb, newsk); + } + } + + /* Clean up any messages pending delivery due to partial + * delivery. Three cases: + * 1) No partial deliver; no work. 
+ * 2) Peeling off partial delivery; keep pd_lobby in new pd_lobby. + * 3) Peeling off non-partial delivery; move pd_lobby to receive_queue. + */ + skb_queue_head_init(&newsp->pd_lobby); + atomic_set(&sctp_sk(newsk)->pd_mode, assoc->ulpq.pd_mode); + + if (atomic_read(&sctp_sk(oldsk)->pd_mode)) { + struct sk_buff_head *queue; + + /* Decide which queue to move pd_lobby skbs to. */ + if (assoc->ulpq.pd_mode) { + queue = &newsp->pd_lobby; + } else + queue = &newsk->sk_receive_queue; + + /* Walk through the pd_lobby, looking for skbs that + * need moved to the new socket. + */ + sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) { + event = sctp_skb2event(skb); + if (event->asoc == assoc) { + __skb_unlink(skb, &oldsp->pd_lobby); + __skb_queue_tail(queue, skb); + sctp_skb_set_owner_r_frag(skb, newsk); + } + } + + /* Clear up any skbs waiting for the partial + * delivery to finish. + */ + if (assoc->ulpq.pd_mode) + sctp_clear_pd(oldsk, NULL); + + } + + sctp_skb_for_each(skb, &assoc->ulpq.reasm, tmp) + sctp_skb_set_owner_r_frag(skb, newsk); + + sctp_skb_for_each(skb, &assoc->ulpq.lobby, tmp) + sctp_skb_set_owner_r_frag(skb, newsk); + + /* Set the type of socket to indicate that it is peeled off from the + * original UDP-style socket or created with the accept() call on a + * TCP-style socket.. + */ + newsp->type = type; + + /* Mark the new socket "in-use" by the user so that any packets + * that may arrive on the association after we've moved it are + * queued to the backlog. This prevents a potential race between + * backlog processing on the old socket and new-packet processing + * on the new socket. + * + * The caller has just allocated newsk so we can guarantee that other + * paths won't try to lock it and then oldsk. + */ + lock_sock_nested(newsk, SINGLE_DEPTH_NESTING); + sctp_assoc_migrate(assoc, newsk); + + /* If the association on the newsk is already closed before accept() + * is called, set RCV_SHUTDOWN flag. + */ + if (sctp_state(assoc, CLOSED) && sctp_style(newsk, TCP)) + newsk->sk_shutdown |= RCV_SHUTDOWN; + + newsk->sk_state = SCTP_SS_ESTABLISHED; + sctp_release_sock(newsk); +} + + +/* This proto struct describes the ULP interface for SCTP. 
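+ * (Illustrative note, not part of the original comment: a proto table
+ * like this is normally made known to the core from the protocol init
+ * path, roughly
+ *
+ *	err = proto_register(&sctp_prot, 1);
+ *
+ * before the matching inet protosw entries are added.)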
*/ +struct proto sctp_prot = { + .name = "SCTP", + .owner = THIS_MODULE, + .close = sctp_close, + .connect = sctp_connect, + .disconnect = sctp_disconnect, + .accept = sctp_accept, + .ioctl = sctp_ioctl, + .init = sctp_init_sock, + .destroy = sctp_destroy_sock, + .shutdown = sctp_shutdown, + .setsockopt = sctp_setsockopt, + .getsockopt = sctp_getsockopt, + .sendmsg = sctp_sendmsg, + .recvmsg = sctp_recvmsg, + .bind = sctp_bind, + .backlog_rcv = sctp_backlog_rcv, + .hash = sctp_hash, + .unhash = sctp_unhash, + .get_port = sctp_get_port, + .obj_size = sizeof(struct sctp_sock), + .sysctl_mem = sysctl_sctp_mem, + .sysctl_rmem = sysctl_sctp_rmem, + .sysctl_wmem = sysctl_sctp_wmem, + .memory_pressure = &sctp_memory_pressure, + .enter_memory_pressure = sctp_enter_memory_pressure, + .memory_allocated = &sctp_memory_allocated, + .sockets_allocated = &sctp_sockets_allocated, +}; + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + +struct proto sctpv6_prot = { + .name = "SCTPv6", + .owner = THIS_MODULE, + .close = sctp_close, + .connect = sctp_connect, + .disconnect = sctp_disconnect, + .accept = sctp_accept, + .ioctl = sctp_ioctl, + .init = sctp_init_sock, + .destroy = sctp_destroy_sock, + .shutdown = sctp_shutdown, + .setsockopt = sctp_setsockopt, + .getsockopt = sctp_getsockopt, + .sendmsg = sctp_sendmsg, + .recvmsg = sctp_recvmsg, + .bind = sctp_bind, + .backlog_rcv = sctp_backlog_rcv, + .hash = sctp_hash, + .unhash = sctp_unhash, + .get_port = sctp_get_port, + .obj_size = sizeof(struct sctp6_sock), + .sysctl_mem = sysctl_sctp_mem, + .sysctl_rmem = sysctl_sctp_rmem, + .sysctl_wmem = sysctl_sctp_wmem, + .memory_pressure = &sctp_memory_pressure, + .enter_memory_pressure = sctp_enter_memory_pressure, + .memory_allocated = &sctp_memory_allocated, + .sockets_allocated = &sctp_sockets_allocated, +}; +#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c new file mode 100644 index 00000000..442ad4ed --- /dev/null +++ b/net/sctp/ssnmap.c @@ -0,0 +1,133 @@ +/* SCTP kernel implementation + * Copyright (c) 2003 International Business Machines, Corp. + * + * This file is part of the SCTP kernel implementation + * + * These functions manipulate sctp SSN tracker. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * Jon Grimm + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. 
+ */ + +#include +#include +#include +#include + +#define MAX_KMALLOC_SIZE 131072 + +static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in, + __u16 out); + +/* Storage size needed for map includes 2 headers and then the + * specific needs of in or out streams. + */ +static inline size_t sctp_ssnmap_size(__u16 in, __u16 out) +{ + return sizeof(struct sctp_ssnmap) + (in + out) * sizeof(__u16); +} + + +/* Create a new sctp_ssnmap. + * Allocate room to store at least 'len' contiguous TSNs. + */ +struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out, + gfp_t gfp) +{ + struct sctp_ssnmap *retval; + int size; + + size = sctp_ssnmap_size(in, out); + if (size <= MAX_KMALLOC_SIZE) + retval = kmalloc(size, gfp); + else + retval = (struct sctp_ssnmap *) + __get_free_pages(gfp, get_order(size)); + if (!retval) + goto fail; + + if (!sctp_ssnmap_init(retval, in, out)) + goto fail_map; + + retval->malloced = 1; + SCTP_DBG_OBJCNT_INC(ssnmap); + + return retval; + +fail_map: + if (size <= MAX_KMALLOC_SIZE) + kfree(retval); + else + free_pages((unsigned long)retval, get_order(size)); +fail: + return NULL; +} + + +/* Initialize a block of memory as a ssnmap. */ +static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in, + __u16 out) +{ + memset(map, 0x00, sctp_ssnmap_size(in, out)); + + /* Start 'in' stream just after the map header. */ + map->in.ssn = (__u16 *)&map[1]; + map->in.len = in; + + /* Start 'out' stream just after 'in'. */ + map->out.ssn = &map->in.ssn[in]; + map->out.len = out; + + return map; +} + +/* Clear out the ssnmap streams. */ +void sctp_ssnmap_clear(struct sctp_ssnmap *map) +{ + size_t size; + + size = (map->in.len + map->out.len) * sizeof(__u16); + memset(map->in.ssn, 0x00, size); +} + +/* Dispose of a ssnmap. */ +void sctp_ssnmap_free(struct sctp_ssnmap *map) +{ + if (map && map->malloced) { + int size; + + size = sctp_ssnmap_size(map->in.len, map->out.len); + if (size <= MAX_KMALLOC_SIZE) + kfree(map); + else + free_pages((unsigned long)map, get_order(size)); + SCTP_DBG_OBJCNT_DEC(ssnmap); + } +} diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c new file mode 100644 index 00000000..6752f489 --- /dev/null +++ b/net/sctp/sysctl.c @@ -0,0 +1,289 @@ +/* SCTP kernel implementation + * (C) Copyright IBM Corp. 2002, 2004 + * Copyright (c) 2002 Intel Corp. + * + * This file is part of the SCTP kernel implementation + * + * Sysctl related interfaces for SCTP. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. 
+ * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * Mingqin Liu + * Jon Grimm + * Ardelle Fan + * Ryan Layer + * Sridhar Samudrala + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include + +static int zero = 0; +static int one = 1; +static int timer_max = 86400000; /* ms in one day */ +static int int_max = INT_MAX; +static int sack_timer_min = 1; +static int sack_timer_max = 500; +static int addr_scope_max = 3; /* check sctp_scope_policy_t in include/net/sctp/constants.h for max entries */ +static int rwnd_scale_max = 16; +static unsigned long max_autoclose_min = 0; +static unsigned long max_autoclose_max = + (MAX_SCHEDULE_TIMEOUT / HZ > UINT_MAX) + ? UINT_MAX : MAX_SCHEDULE_TIMEOUT / HZ; + +extern long sysctl_sctp_mem[3]; +extern int sysctl_sctp_rmem[3]; +extern int sysctl_sctp_wmem[3]; + +static ctl_table sctp_table[] = { + { + .procname = "rto_initial", + .data = &sctp_rto_initial, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &timer_max + }, + { + .procname = "rto_min", + .data = &sctp_rto_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &timer_max + }, + { + .procname = "rto_max", + .data = &sctp_rto_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &timer_max + }, + { + .procname = "valid_cookie_life", + .data = &sctp_valid_cookie_life, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &timer_max + }, + { + .procname = "max_burst", + .data = &sctp_max_burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &int_max + }, + { + .procname = "association_max_retrans", + .data = &sctp_max_retrans_association, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &int_max + }, + { + .procname = "sndbuf_policy", + .data = &sctp_sndbuf_policy, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "rcvbuf_policy", + .data = &sctp_rcvbuf_policy, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "path_max_retrans", + .data = &sctp_max_retrans_path, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &int_max + }, + { + .procname = "max_init_retransmits", + .data = &sctp_max_retrans_init, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &int_max + }, + { + .procname = "hb_interval", + .data = &sctp_hb_interval, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &timer_max + }, + { + .procname = "cookie_preserve_enable", + .data = &sctp_cookie_preserve_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "rto_alpha_exp_divisor", + .data = &sctp_rto_alpha, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = 
"rto_beta_exp_divisor", + .data = &sctp_rto_beta, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "addip_enable", + .data = &sctp_addip_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "prsctp_enable", + .data = &sctp_prsctp_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sack_timeout", + .data = &sctp_sack_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &sack_timer_min, + .extra2 = &sack_timer_max, + }, + { + .procname = "sctp_mem", + .data = &sysctl_sctp_mem, + .maxlen = sizeof(sysctl_sctp_mem), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax + }, + { + .procname = "sctp_rmem", + .data = &sysctl_sctp_rmem, + .maxlen = sizeof(sysctl_sctp_rmem), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sctp_wmem", + .data = &sysctl_sctp_wmem, + .maxlen = sizeof(sysctl_sctp_wmem), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "auth_enable", + .data = &sctp_auth_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "addip_noauth_enable", + .data = &sctp_addip_noauth, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "addr_scope_policy", + .data = &sctp_scope_policy, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &addr_scope_max, + }, + { + .procname = "rwnd_update_shift", + .data = &sctp_rwnd_upd_shift, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &rwnd_scale_max, + }, + { + .procname = "max_autoclose", + .data = &sctp_max_autoclose, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + .extra1 = &max_autoclose_min, + .extra2 = &max_autoclose_max, + }, + + { /* sentinel */ } +}; + +static struct ctl_path sctp_path[] = { + { .procname = "net", }, + { .procname = "sctp", }, + { } +}; + +static struct ctl_table_header * sctp_sysctl_header; + +/* Sysctl registration. */ +void sctp_sysctl_register(void) +{ + sctp_sysctl_header = register_sysctl_paths(sctp_path, sctp_table); +} + +/* Sysctl deregistration. */ +void sctp_sysctl_unregister(void) +{ + unregister_sysctl_table(sctp_sysctl_header); +} diff --git a/net/sctp/transport.c b/net/sctp/transport.c new file mode 100644 index 00000000..8da4481e --- /dev/null +++ b/net/sctp/transport.c @@ -0,0 +1,626 @@ +/* SCTP kernel implementation + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001-2003 International Business Machines Corp. + * Copyright (c) 2001 Intel Corp. + * Copyright (c) 2001 La Monte H.P. Yarroll + * + * This file is part of the SCTP kernel implementation + * + * This module provides the abstraction for an SCTP tranport representing + * a remote transport address. For local transport addresses, we just use + * union sctp_addr. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. 
+ * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Jon Grimm + * Xingang Guo + * Hui Huang + * Sridhar Samudrala + * Ardelle Fan + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include + +/* 1st Level Abstractions. */ + +/* Initialize a new transport from provided memory. */ +static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer, + const union sctp_addr *addr, + gfp_t gfp) +{ + /* Copy in the address. */ + peer->ipaddr = *addr; + peer->af_specific = sctp_get_af_specific(addr->sa.sa_family); + memset(&peer->saddr, 0, sizeof(union sctp_addr)); + + /* From 6.3.1 RTO Calculation: + * + * C1) Until an RTT measurement has been made for a packet sent to the + * given destination transport address, set RTO to the protocol + * parameter 'RTO.Initial'. + */ + peer->rto = msecs_to_jiffies(sctp_rto_initial); + + peer->last_time_heard = jiffies; + peer->last_time_ecne_reduced = jiffies; + + peer->param_flags = SPP_HB_DISABLE | + SPP_PMTUD_ENABLE | + SPP_SACKDELAY_ENABLE; + + /* Initialize the default path max_retrans. */ + peer->pathmaxrxt = sctp_max_retrans_path; + + INIT_LIST_HEAD(&peer->transmitted); + INIT_LIST_HEAD(&peer->send_ready); + INIT_LIST_HEAD(&peer->transports); + + setup_timer(&peer->T3_rtx_timer, sctp_generate_t3_rtx_event, + (unsigned long)peer); + setup_timer(&peer->hb_timer, sctp_generate_heartbeat_event, + (unsigned long)peer); + setup_timer(&peer->proto_unreach_timer, + sctp_generate_proto_unreach_event, (unsigned long)peer); + + /* Initialize the 64-bit random nonce sent with heartbeat. */ + get_random_bytes(&peer->hb_nonce, sizeof(peer->hb_nonce)); + + atomic_set(&peer->refcnt, 1); + + return peer; +} + +/* Allocate and initialize a new transport. */ +struct sctp_transport *sctp_transport_new(const union sctp_addr *addr, + gfp_t gfp) +{ + struct sctp_transport *transport; + + transport = t_new(struct sctp_transport, gfp); + if (!transport) + goto fail; + + if (!sctp_transport_init(transport, addr, gfp)) + goto fail_init; + + transport->malloced = 1; + SCTP_DBG_OBJCNT_INC(transport); + + return transport; + +fail_init: + kfree(transport); + +fail: + return NULL; +} + +/* This transport is no longer needed. Free up if possible, or + * delay until it last reference count. + */ +void sctp_transport_free(struct sctp_transport *transport) +{ + transport->dead = 1; + + /* Try to delete the heartbeat timer. */ + if (del_timer(&transport->hb_timer)) + sctp_transport_put(transport); + + /* Delete the T3_rtx timer if it's active. 
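+ * (A pending timer holds its own reference on the transport, which is
+ * why each successful del_timer() here is paired with sctp_transport_put().)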
+ * There is no point in not doing this now and letting + * structure hang around in memory since we know + * the tranport is going away. + */ + if (timer_pending(&transport->T3_rtx_timer) && + del_timer(&transport->T3_rtx_timer)) + sctp_transport_put(transport); + + /* Delete the ICMP proto unreachable timer if it's active. */ + if (timer_pending(&transport->proto_unreach_timer) && + del_timer(&transport->proto_unreach_timer)) + sctp_association_put(transport->asoc); + + sctp_transport_put(transport); +} + +/* Destroy the transport data structure. + * Assumes there are no more users of this structure. + */ +static void sctp_transport_destroy(struct sctp_transport *transport) +{ + SCTP_ASSERT(transport->dead, "Transport is not dead", return); + + if (transport->asoc) + sctp_association_put(transport->asoc); + + sctp_packet_free(&transport->packet); + + dst_release(transport->dst); + kfree(transport); + SCTP_DBG_OBJCNT_DEC(transport); +} + +/* Start T3_rtx timer if it is not already running and update the heartbeat + * timer. This routine is called every time a DATA chunk is sent. + */ +void sctp_transport_reset_timers(struct sctp_transport *transport) +{ + /* RFC 2960 6.3.2 Retransmission Timer Rules + * + * R1) Every time a DATA chunk is sent to any address(including a + * retransmission), if the T3-rtx timer of that address is not running + * start it running so that it will expire after the RTO of that + * address. + */ + + if (!timer_pending(&transport->T3_rtx_timer)) + if (!mod_timer(&transport->T3_rtx_timer, + jiffies + transport->rto)) + sctp_transport_hold(transport); + + /* When a data chunk is sent, reset the heartbeat interval. */ + if (!mod_timer(&transport->hb_timer, + sctp_transport_timeout(transport))) + sctp_transport_hold(transport); +} + +/* This transport has been assigned to an association. + * Initialize fields from the association or from the sock itself. + * Register the reference count in the association. + */ +void sctp_transport_set_owner(struct sctp_transport *transport, + struct sctp_association *asoc) +{ + transport->asoc = asoc; + sctp_association_hold(asoc); +} + +/* Initialize the pmtu of a transport. */ +void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) +{ + /* If we don't have a fresh route, look one up */ + if (!transport->dst || transport->dst->obsolete > 1) { + dst_release(transport->dst); + transport->af_specific->get_dst(transport, &transport->saddr, + &transport->fl, sk); + } + + if (transport->dst) { + transport->pathmtu = dst_mtu(transport->dst); + } else + transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; +} + +void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) +{ + struct dst_entry *dst; + + if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { + pr_warn("%s: Reported pmtu %d too low, using default minimum of %d\n", + __func__, pmtu, + SCTP_DEFAULT_MINSEGMENT); + /* Use default minimum segment size and disable + * pmtu discovery on this transport. + */ + t->pathmtu = SCTP_DEFAULT_MINSEGMENT; + } else { + t->pathmtu = pmtu; + } + + dst = sctp_transport_dst_check(t); + if (dst) + dst->ops->update_pmtu(dst, pmtu); +} + +/* Caches the dst entry and source address for a transport's destination + * address. 
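+ * If no explicit source address is passed in, one is chosen via the
+ * address family's get_saddr() callback, and pathmtu is then refreshed
+ * from the newly cached route.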
+ */ +void sctp_transport_route(struct sctp_transport *transport, + union sctp_addr *saddr, struct sctp_sock *opt) +{ + struct sctp_association *asoc = transport->asoc; + struct sctp_af *af = transport->af_specific; + + af->get_dst(transport, saddr, &transport->fl, sctp_opt2sk(opt)); + + if (saddr) + memcpy(&transport->saddr, saddr, sizeof(union sctp_addr)); + else + af->get_saddr(opt, transport, &transport->fl); + + if ((transport->param_flags & SPP_PMTUD_DISABLE) && transport->pathmtu) { + return; + } + if (transport->dst) { + transport->pathmtu = dst_mtu(transport->dst); + + /* Initialize sk->sk_rcv_saddr, if the transport is the + * association's active path for getsockname(). + */ + if (asoc && (!asoc->peer.primary_path || + (transport == asoc->peer.active_path))) + opt->pf->af->to_sk_saddr(&transport->saddr, + asoc->base.sk); + } else + transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; +} + +/* Hold a reference to a transport. */ +void sctp_transport_hold(struct sctp_transport *transport) +{ + atomic_inc(&transport->refcnt); +} + +/* Release a reference to a transport and clean up + * if there are no more references. + */ +void sctp_transport_put(struct sctp_transport *transport) +{ + if (atomic_dec_and_test(&transport->refcnt)) + sctp_transport_destroy(transport); +} + +/* Update transport's RTO based on the newly calculated RTT. */ +void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt) +{ + /* Check for valid transport. */ + SCTP_ASSERT(tp, "NULL transport", return); + + /* We should not be doing any RTO updates unless rto_pending is set. */ + SCTP_ASSERT(tp->rto_pending, "rto_pending not set", return); + + if (tp->rttvar || tp->srtt) { + /* 6.3.1 C3) When a new RTT measurement R' is made, set + * RTTVAR <- (1 - RTO.Beta) * RTTVAR + RTO.Beta * |SRTT - R'| + * SRTT <- (1 - RTO.Alpha) * SRTT + RTO.Alpha * R' + */ + + /* Note: The above algorithm has been rewritten to + * express rto_beta and rto_alpha as inverse powers + * of two. + * For example, assuming the default value of RTO.Alpha of + * 1/8, rto_alpha would be expressed as 3. + */ + tp->rttvar = tp->rttvar - (tp->rttvar >> sctp_rto_beta) + + ((abs(tp->srtt - rtt)) >> sctp_rto_beta); + tp->srtt = tp->srtt - (tp->srtt >> sctp_rto_alpha) + + (rtt >> sctp_rto_alpha); + } else { + /* 6.3.1 C2) When the first RTT measurement R is made, set + * SRTT <- R, RTTVAR <- R/2. + */ + tp->srtt = rtt; + tp->rttvar = rtt >> 1; + } + + /* 6.3.1 G1) Whenever RTTVAR is computed, if RTTVAR = 0, then + * adjust RTTVAR <- G, where G is the CLOCK GRANULARITY. + */ + if (tp->rttvar == 0) + tp->rttvar = SCTP_CLOCK_GRANULARITY; + + /* 6.3.1 C3) After the computation, update RTO <- SRTT + 4 * RTTVAR. */ + tp->rto = tp->srtt + (tp->rttvar << 2); + + /* 6.3.1 C6) Whenever RTO is computed, if it is less than RTO.Min + * seconds then it is rounded up to RTO.Min seconds. + */ + if (tp->rto < tp->asoc->rto_min) + tp->rto = tp->asoc->rto_min; + + /* 6.3.1 C7) A maximum value may be placed on RTO provided it is + * at least RTO.max seconds. + */ + if (tp->rto > tp->asoc->rto_max) + tp->rto = tp->asoc->rto_max; + + tp->rtt = rtt; + + /* Reset rto_pending so that a new RTT measurement is started when a + * new data chunk is sent. + */ + tp->rto_pending = 0; + + SCTP_DEBUG_PRINTK("%s: transport: %p, rtt: %d, srtt: %d " + "rttvar: %d, rto: %ld\n", __func__, + tp, rtt, tp->srtt, tp->rttvar, tp->rto); +} + +/* This routine updates the transport's cwnd and partial_bytes_acked + * parameters based on the bytes acked in the received SACK. 
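+ *
+ * Boiled down, and ignoring the fast-recovery and "cwnd fully utilized"
+ * guards enforced below, the update is (illustrative sketch only):
+ *
+ *	if (cwnd <= ssthresh)			(slow start)
+ *		cwnd += min(bytes_acked, pmtu);
+ *	else {					(congestion avoidance)
+ *		pba += bytes_acked;
+ *		if (pba >= cwnd) {
+ *			cwnd += pmtu;
+ *			pba = (pba > cwnd) ? pba - cwnd : 0;
+ *		}
+ *	}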
+ */ +void sctp_transport_raise_cwnd(struct sctp_transport *transport, + __u32 sack_ctsn, __u32 bytes_acked) +{ + struct sctp_association *asoc = transport->asoc; + __u32 cwnd, ssthresh, flight_size, pba, pmtu; + + cwnd = transport->cwnd; + flight_size = transport->flight_size; + + /* See if we need to exit Fast Recovery first */ + if (asoc->fast_recovery && + TSN_lte(asoc->fast_recovery_exit, sack_ctsn)) + asoc->fast_recovery = 0; + + /* The appropriate cwnd increase algorithm is performed if, and only + * if the cumulative TSN whould advanced and the congestion window is + * being fully utilized. + */ + if (TSN_lte(sack_ctsn, transport->asoc->ctsn_ack_point) || + (flight_size < cwnd)) + return; + + ssthresh = transport->ssthresh; + pba = transport->partial_bytes_acked; + pmtu = transport->asoc->pathmtu; + + if (cwnd <= ssthresh) { + /* RFC 4960 7.2.1 + * o When cwnd is less than or equal to ssthresh, an SCTP + * endpoint MUST use the slow-start algorithm to increase + * cwnd only if the current congestion window is being fully + * utilized, an incoming SACK advances the Cumulative TSN + * Ack Point, and the data sender is not in Fast Recovery. + * Only when these three conditions are met can the cwnd be + * increased; otherwise, the cwnd MUST not be increased. + * If these conditions are met, then cwnd MUST be increased + * by, at most, the lesser of 1) the total size of the + * previously outstanding DATA chunk(s) acknowledged, and + * 2) the destination's path MTU. This upper bound protects + * against the ACK-Splitting attack outlined in [SAVAGE99]. + */ + if (asoc->fast_recovery) + return; + + if (bytes_acked > pmtu) + cwnd += pmtu; + else + cwnd += bytes_acked; + SCTP_DEBUG_PRINTK("%s: SLOW START: transport: %p, " + "bytes_acked: %d, cwnd: %d, ssthresh: %d, " + "flight_size: %d, pba: %d\n", + __func__, + transport, bytes_acked, cwnd, + ssthresh, flight_size, pba); + } else { + /* RFC 2960 7.2.2 Whenever cwnd is greater than ssthresh, + * upon each SACK arrival that advances the Cumulative TSN Ack + * Point, increase partial_bytes_acked by the total number of + * bytes of all new chunks acknowledged in that SACK including + * chunks acknowledged by the new Cumulative TSN Ack and by + * Gap Ack Blocks. + * + * When partial_bytes_acked is equal to or greater than cwnd + * and before the arrival of the SACK the sender had cwnd or + * more bytes of data outstanding (i.e., before arrival of the + * SACK, flightsize was greater than or equal to cwnd), + * increase cwnd by MTU, and reset partial_bytes_acked to + * (partial_bytes_acked - cwnd). + */ + pba += bytes_acked; + if (pba >= cwnd) { + cwnd += pmtu; + pba = ((cwnd < pba) ? (pba - cwnd) : 0); + } + SCTP_DEBUG_PRINTK("%s: CONGESTION AVOIDANCE: " + "transport: %p, bytes_acked: %d, cwnd: %d, " + "ssthresh: %d, flight_size: %d, pba: %d\n", + __func__, + transport, bytes_acked, cwnd, + ssthresh, flight_size, pba); + } + + transport->cwnd = cwnd; + transport->partial_bytes_acked = pba; +} + +/* This routine is used to lower the transport's cwnd when congestion is + * detected. 
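+ *
+ * Summary of the adjustments made below, per reason (this restates the
+ * code that follows):
+ *
+ *	T3_RTX:    ssthresh = max(cwnd/2, 4*MTU); cwnd = 1*MTU
+ *	FAST_RTX:  ssthresh = max(cwnd/2, 4*MTU); cwnd = ssthresh
+ *	ECNE:      as FAST_RTX, but at most once per round-trip time
+ *	INACTIVE:  cwnd = max(cwnd/2, 4*MTU)
+ *
+ * partial_bytes_acked is reset to zero in every case.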
+ */ +void sctp_transport_lower_cwnd(struct sctp_transport *transport, + sctp_lower_cwnd_t reason) +{ + struct sctp_association *asoc = transport->asoc; + + switch (reason) { + case SCTP_LOWER_CWND_T3_RTX: + /* RFC 2960 Section 7.2.3, sctpimpguide + * When the T3-rtx timer expires on an address, SCTP should + * perform slow start by: + * ssthresh = max(cwnd/2, 4*MTU) + * cwnd = 1*MTU + * partial_bytes_acked = 0 + */ + transport->ssthresh = max(transport->cwnd/2, + 4*asoc->pathmtu); + transport->cwnd = asoc->pathmtu; + + /* T3-rtx also clears fast recovery */ + asoc->fast_recovery = 0; + break; + + case SCTP_LOWER_CWND_FAST_RTX: + /* RFC 2960 7.2.4 Adjust the ssthresh and cwnd of the + * destination address(es) to which the missing DATA chunks + * were last sent, according to the formula described in + * Section 7.2.3. + * + * RFC 2960 7.2.3, sctpimpguide Upon detection of packet + * losses from SACK (see Section 7.2.4), An endpoint + * should do the following: + * ssthresh = max(cwnd/2, 4*MTU) + * cwnd = ssthresh + * partial_bytes_acked = 0 + */ + if (asoc->fast_recovery) + return; + + /* Mark Fast recovery */ + asoc->fast_recovery = 1; + asoc->fast_recovery_exit = asoc->next_tsn - 1; + + transport->ssthresh = max(transport->cwnd/2, + 4*asoc->pathmtu); + transport->cwnd = transport->ssthresh; + break; + + case SCTP_LOWER_CWND_ECNE: + /* RFC 2481 Section 6.1.2. + * If the sender receives an ECN-Echo ACK packet + * then the sender knows that congestion was encountered in the + * network on the path from the sender to the receiver. The + * indication of congestion should be treated just as a + * congestion loss in non-ECN Capable TCP. That is, the TCP + * source halves the congestion window "cwnd" and reduces the + * slow start threshold "ssthresh". + * A critical condition is that TCP does not react to + * congestion indications more than once every window of + * data (or more loosely more than once every round-trip time). + */ + if (time_after(jiffies, transport->last_time_ecne_reduced + + transport->rtt)) { + transport->ssthresh = max(transport->cwnd/2, + 4*asoc->pathmtu); + transport->cwnd = transport->ssthresh; + transport->last_time_ecne_reduced = jiffies; + } + break; + + case SCTP_LOWER_CWND_INACTIVE: + /* RFC 2960 Section 7.2.1, sctpimpguide + * When the endpoint does not transmit data on a given + * transport address, the cwnd of the transport address + * should be adjusted to max(cwnd/2, 4*MTU) per RTO. + * NOTE: Although the draft recommends that this check needs + * to be done every RTO interval, we do it every hearbeat + * interval. + */ + transport->cwnd = max(transport->cwnd/2, + 4*asoc->pathmtu); + break; + } + + transport->partial_bytes_acked = 0; + SCTP_DEBUG_PRINTK("%s: transport: %p reason: %d cwnd: " + "%d ssthresh: %d\n", __func__, + transport, reason, + transport->cwnd, transport->ssthresh); +} + +/* Apply Max.Burst limit to the congestion window: + * sctpimpguide-05 2.14.2 + * D) When the time comes for the sender to + * transmit new DATA chunks, the protocol parameter Max.Burst MUST + * first be applied to limit how many new DATA chunks may be sent. 
+ * The limit is applied by adjusting cwnd as follows: + * if ((flightsize+ Max.Burst * MTU) < cwnd) + * cwnd = flightsize + Max.Burst * MTU + */ + +void sctp_transport_burst_limited(struct sctp_transport *t) +{ + struct sctp_association *asoc = t->asoc; + u32 old_cwnd = t->cwnd; + u32 max_burst_bytes; + + if (t->burst_limited) + return; + + max_burst_bytes = t->flight_size + (asoc->max_burst * asoc->pathmtu); + if (max_burst_bytes < old_cwnd) { + t->cwnd = max_burst_bytes; + t->burst_limited = old_cwnd; + } +} + +/* Restore the old cwnd congestion window, after the burst had it's + * desired effect. + */ +void sctp_transport_burst_reset(struct sctp_transport *t) +{ + if (t->burst_limited) { + t->cwnd = t->burst_limited; + t->burst_limited = 0; + } +} + +/* What is the next timeout value for this transport? */ +unsigned long sctp_transport_timeout(struct sctp_transport *t) +{ + unsigned long timeout; + timeout = t->rto + sctp_jitter(t->rto); + if (t->state != SCTP_UNCONFIRMED) + timeout += t->hbinterval; + timeout += jiffies; + return timeout; +} + +/* Reset transport variables to their initial values */ +void sctp_transport_reset(struct sctp_transport *t) +{ + struct sctp_association *asoc = t->asoc; + + /* RFC 2960 (bis), Section 5.2.4 + * All the congestion control parameters (e.g., cwnd, ssthresh) + * related to this peer MUST be reset to their initial values + * (see Section 6.2.1) + */ + t->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380)); + t->burst_limited = 0; + t->ssthresh = asoc->peer.i.a_rwnd; + t->rto = asoc->rto_initial; + t->rtt = 0; + t->srtt = 0; + t->rttvar = 0; + + /* Reset these additional varibles so that we have a clean + * slate. + */ + t->partial_bytes_acked = 0; + t->flight_size = 0; + t->error_count = 0; + t->rto_pending = 0; + t->hb_sent = 0; + + /* Initialize the state information for SFR-CACC */ + t->cacc.changeover_active = 0; + t->cacc.cycling_changeover = 0; + t->cacc.next_tsn_at_change = 0; + t->cacc.cacc_saw_newack = 0; +} diff --git a/net/sctp/tsnmap.c b/net/sctp/tsnmap.c new file mode 100644 index 00000000..f1e40ceb --- /dev/null +++ b/net/sctp/tsnmap.c @@ -0,0 +1,385 @@ +/* SCTP kernel implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001 Intel Corp. + * + * This file is part of the SCTP kernel implementation + * + * These functions manipulate sctp tsn mapping array. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. 
Yarroll + * Jon Grimm + * Karl Knutson + * Sridhar Samudrala + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include +#include +#include + +static void sctp_tsnmap_update(struct sctp_tsnmap *map); +static void sctp_tsnmap_find_gap_ack(unsigned long *map, __u16 off, + __u16 len, __u16 *start, __u16 *end); +static int sctp_tsnmap_grow(struct sctp_tsnmap *map, u16 gap); + +/* Initialize a block of memory as a tsnmap. */ +struct sctp_tsnmap *sctp_tsnmap_init(struct sctp_tsnmap *map, __u16 len, + __u32 initial_tsn, gfp_t gfp) +{ + if (!map->tsn_map) { + map->tsn_map = kzalloc(len>>3, gfp); + if (map->tsn_map == NULL) + return NULL; + + map->len = len; + } else { + bitmap_zero(map->tsn_map, map->len); + } + + /* Keep track of TSNs represented by tsn_map. */ + map->base_tsn = initial_tsn; + map->cumulative_tsn_ack_point = initial_tsn - 1; + map->max_tsn_seen = map->cumulative_tsn_ack_point; + map->num_dup_tsns = 0; + + return map; +} + +void sctp_tsnmap_free(struct sctp_tsnmap *map) +{ + map->len = 0; + kfree(map->tsn_map); +} + +/* Test the tracking state of this TSN. + * Returns: + * 0 if the TSN has not yet been seen + * >0 if the TSN has been seen (duplicate) + * <0 if the TSN is invalid (too large to track) + */ +int sctp_tsnmap_check(const struct sctp_tsnmap *map, __u32 tsn) +{ + u32 gap; + + /* Check to see if this is an old TSN */ + if (TSN_lte(tsn, map->cumulative_tsn_ack_point)) + return 1; + + /* Verify that we can hold this TSN and that it will not + * overlfow our map + */ + if (!TSN_lt(tsn, map->base_tsn + SCTP_TSN_MAP_SIZE)) + return -1; + + /* Calculate the index into the mapping arrays. */ + gap = tsn - map->base_tsn; + + /* Check to see if TSN has already been recorded. */ + if (gap < map->len && test_bit(gap, map->tsn_map)) + return 1; + else + return 0; +} + + +/* Mark this TSN as seen. */ +int sctp_tsnmap_mark(struct sctp_tsnmap *map, __u32 tsn) +{ + u16 gap; + + if (TSN_lt(tsn, map->base_tsn)) + return 0; + + gap = tsn - map->base_tsn; + + if (gap >= map->len && !sctp_tsnmap_grow(map, gap)) + return -ENOMEM; + + if (!sctp_tsnmap_has_gap(map) && gap == 0) { + /* In this case the map has no gaps and the tsn we are + * recording is the next expected tsn. We don't touch + * the map but simply bump the values. + */ + map->max_tsn_seen++; + map->cumulative_tsn_ack_point++; + map->base_tsn++; + } else { + /* Either we already have a gap, or about to record a gap, so + * have work to do. + * + * Bump the max. + */ + if (TSN_lt(map->max_tsn_seen, tsn)) + map->max_tsn_seen = tsn; + + /* Mark the TSN as received. */ + set_bit(gap, map->tsn_map); + + /* Go fixup any internal TSN mapping variables including + * cumulative_tsn_ack_point. + */ + sctp_tsnmap_update(map); + } + + return 0; +} + + +/* Initialize a Gap Ack Block iterator from memory being provided. */ +SCTP_STATIC void sctp_tsnmap_iter_init(const struct sctp_tsnmap *map, + struct sctp_tsnmap_iter *iter) +{ + /* Only start looking one past the Cumulative TSN Ack Point. */ + iter->start = map->cumulative_tsn_ack_point + 1; +} + +/* Get the next Gap Ack Blocks. Returns 0 if there was not another block + * to get. + */ +SCTP_STATIC int sctp_tsnmap_next_gap_ack(const struct sctp_tsnmap *map, + struct sctp_tsnmap_iter *iter, + __u16 *start, __u16 *end) +{ + int ended = 0; + __u16 start_ = 0, end_ = 0, offset; + + /* If there are no more gap acks possible, get out fast. 
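+ * The start/end values handed back are offsets relative to the
+ * Cumulative TSN Ack Point (one behind base_tsn), which is why they
+ * are bumped by one further down.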
*/ + if (TSN_lte(map->max_tsn_seen, iter->start)) + return 0; + + offset = iter->start - map->base_tsn; + sctp_tsnmap_find_gap_ack(map->tsn_map, offset, map->len, + &start_, &end_); + + /* The Gap Ack Block happens to end at the end of the map. */ + if (start_ && !end_) + end_ = map->len - 1; + + /* If we found a Gap Ack Block, return the start and end and + * bump the iterator forward. + */ + if (end_) { + /* Fix up the start and end based on the + * Cumulative TSN Ack which is always 1 behind base. + */ + *start = start_ + 1; + *end = end_ + 1; + + /* Move the iterator forward. */ + iter->start = map->cumulative_tsn_ack_point + *end + 1; + ended = 1; + } + + return ended; +} + +/* Mark this and any lower TSN as seen. */ +void sctp_tsnmap_skip(struct sctp_tsnmap *map, __u32 tsn) +{ + u32 gap; + + if (TSN_lt(tsn, map->base_tsn)) + return; + if (!TSN_lt(tsn, map->base_tsn + SCTP_TSN_MAP_SIZE)) + return; + + /* Bump the max. */ + if (TSN_lt(map->max_tsn_seen, tsn)) + map->max_tsn_seen = tsn; + + gap = tsn - map->base_tsn + 1; + + map->base_tsn += gap; + map->cumulative_tsn_ack_point += gap; + if (gap >= map->len) { + /* If our gap is larger then the map size, just + * zero out the map. + */ + bitmap_zero(map->tsn_map, map->len); + } else { + /* If the gap is smaller than the map size, + * shift the map by 'gap' bits and update further. + */ + bitmap_shift_right(map->tsn_map, map->tsn_map, gap, map->len); + sctp_tsnmap_update(map); + } +} + +/******************************************************************** + * 2nd Level Abstractions + ********************************************************************/ + +/* This private helper function updates the tsnmap buffers and + * the Cumulative TSN Ack Point. + */ +static void sctp_tsnmap_update(struct sctp_tsnmap *map) +{ + u16 len; + unsigned long zero_bit; + + + len = map->max_tsn_seen - map->cumulative_tsn_ack_point; + zero_bit = find_first_zero_bit(map->tsn_map, len); + if (!zero_bit) + return; /* The first 0-bit is bit 0. nothing to do */ + + map->base_tsn += zero_bit; + map->cumulative_tsn_ack_point += zero_bit; + + bitmap_shift_right(map->tsn_map, map->tsn_map, zero_bit, map->len); +} + +/* How many data chunks are we missing from our peer? + */ +__u16 sctp_tsnmap_pending(struct sctp_tsnmap *map) +{ + __u32 cum_tsn = map->cumulative_tsn_ack_point; + __u32 max_tsn = map->max_tsn_seen; + __u32 base_tsn = map->base_tsn; + __u16 pending_data; + u32 gap, i; + + pending_data = max_tsn - cum_tsn; + gap = max_tsn - base_tsn; + + if (gap == 0 || gap >= map->len) + goto out; + + for (i = 0; i < gap+1; i++) { + if (test_bit(i, map->tsn_map)) + pending_data--; + } + +out: + return pending_data; +} + +/* This is a private helper for finding Gap Ack Blocks. It searches a + * single array for the start and end of a Gap Ack Block. + * + * The flags "started" and "ended" tell is if we found the beginning + * or (respectively) the end of a Gap Ack Block. + */ +static void sctp_tsnmap_find_gap_ack(unsigned long *map, __u16 off, + __u16 len, __u16 *start, __u16 *end) +{ + int i = off; + + /* Look through the entire array, but break out + * early if we have found the end of the Gap Ack Block. + */ + + /* Also, stop looking past the maximum TSN seen. */ + + /* Look for the start. */ + i = find_next_bit(map, len, off); + if (i < len) + *start = i; + + /* Look for the end. */ + if (*start) { + /* We have found the start, let's find the + * end. If we find the end, break out. 
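+ * The end of the block is the last set bit before the zero bit found
+ * below, hence the "i - 1".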
+ */ + i = find_next_zero_bit(map, len, i); + if (i < len) + *end = i - 1; + } +} + +/* Renege that we have seen a TSN. */ +void sctp_tsnmap_renege(struct sctp_tsnmap *map, __u32 tsn) +{ + u32 gap; + + if (TSN_lt(tsn, map->base_tsn)) + return; + /* Assert: TSN is in range. */ + if (!TSN_lt(tsn, map->base_tsn + map->len)) + return; + + gap = tsn - map->base_tsn; + + /* Pretend we never saw the TSN. */ + clear_bit(gap, map->tsn_map); +} + +/* How many gap ack blocks do we have recorded? */ +__u16 sctp_tsnmap_num_gabs(struct sctp_tsnmap *map, + struct sctp_gap_ack_block *gabs) +{ + struct sctp_tsnmap_iter iter; + int ngaps = 0; + + /* Refresh the gap ack information. */ + if (sctp_tsnmap_has_gap(map)) { + __u16 start = 0, end = 0; + sctp_tsnmap_iter_init(map, &iter); + while (sctp_tsnmap_next_gap_ack(map, &iter, + &start, + &end)) { + + gabs[ngaps].start = htons(start); + gabs[ngaps].end = htons(end); + ngaps++; + if (ngaps >= SCTP_MAX_GABS) + break; + } + } + return ngaps; +} + +static int sctp_tsnmap_grow(struct sctp_tsnmap *map, u16 gap) +{ + unsigned long *new; + unsigned long inc; + u16 len; + + if (gap >= SCTP_TSN_MAP_SIZE) + return 0; + + inc = ALIGN((gap - map->len),BITS_PER_LONG) + SCTP_TSN_MAP_INCREMENT; + len = min_t(u16, map->len + inc, SCTP_TSN_MAP_SIZE); + + new = kzalloc(len>>3, GFP_ATOMIC); + if (!new) + return 0; + + bitmap_copy(new, map->tsn_map, map->max_tsn_seen - map->base_tsn); + kfree(map->tsn_map); + map->tsn_map = new; + map->len = len; + + return 1; +} diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c new file mode 100644 index 00000000..8a840178 --- /dev/null +++ b/net/sctp/ulpevent.c @@ -0,0 +1,1099 @@ +/* SCTP kernel implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001 Intel Corp. + * Copyright (c) 2001 Nokia, Inc. + * Copyright (c) 2001 La Monte H.P. Yarroll + * + * These functions manipulate an sctp event. The struct ulpevent is used + * to carry notifications and data to the ULP (sockets). + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * Jon Grimm + * La Monte H.P. Yarroll + * Ardelle Fan + * Sridhar Samudrala + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. 
+ */ + +#include +#include +#include +#include +#include +#include + +static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, + struct sctp_association *asoc); +static void sctp_ulpevent_release_data(struct sctp_ulpevent *event); +static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event); + + +/* Initialize an ULP event from an given skb. */ +SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, + int msg_flags, + unsigned int len) +{ + memset(event, 0, sizeof(struct sctp_ulpevent)); + event->msg_flags = msg_flags; + event->rmem_len = len; +} + +/* Create a new sctp_ulpevent. */ +SCTP_STATIC struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags, + gfp_t gfp) +{ + struct sctp_ulpevent *event; + struct sk_buff *skb; + + skb = alloc_skb(size, gfp); + if (!skb) + goto fail; + + event = sctp_skb2event(skb); + sctp_ulpevent_init(event, msg_flags, skb->truesize); + + return event; + +fail: + return NULL; +} + +/* Is this a MSG_NOTIFICATION? */ +int sctp_ulpevent_is_notification(const struct sctp_ulpevent *event) +{ + return MSG_NOTIFICATION == (event->msg_flags & MSG_NOTIFICATION); +} + +/* Hold the association in case the msg_name needs read out of + * the association. + */ +static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event, + const struct sctp_association *asoc) +{ + struct sk_buff *skb; + + /* Cast away the const, as we are just wanting to + * bump the reference count. + */ + sctp_association_hold((struct sctp_association *)asoc); + skb = sctp_event2skb(event); + event->asoc = (struct sctp_association *)asoc; + atomic_add(event->rmem_len, &event->asoc->rmem_alloc); + sctp_skb_set_owner_r(skb, asoc->base.sk); +} + +/* A simple destructor to give up the reference to the association. */ +static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event) +{ + struct sctp_association *asoc = event->asoc; + + atomic_sub(event->rmem_len, &asoc->rmem_alloc); + sctp_association_put(asoc); +} + +/* Create and initialize an SCTP_ASSOC_CHANGE event. + * + * 5.3.1.1 SCTP_ASSOC_CHANGE + * + * Communication notifications inform the ULP that an SCTP association + * has either begun or ended. The identifier for a new association is + * provided by this notification. + * + * Note: There is no field checking here. If a field is unused it will be + * zero'd out. + */ +struct sctp_ulpevent *sctp_ulpevent_make_assoc_change( + const struct sctp_association *asoc, + __u16 flags, __u16 state, __u16 error, __u16 outbound, + __u16 inbound, struct sctp_chunk *chunk, gfp_t gfp) +{ + struct sctp_ulpevent *event; + struct sctp_assoc_change *sac; + struct sk_buff *skb; + + /* If the lower layer passed in the chunk, it will be + * an ABORT, so we need to include it in the sac_info. + */ + if (chunk) { + /* Copy the chunk data to a new skb and reserve enough + * head room to use as notification. + */ + skb = skb_copy_expand(chunk->skb, + sizeof(struct sctp_assoc_change), 0, gfp); + + if (!skb) + goto fail; + + /* Embed the event fields inside the cloned skb. */ + event = sctp_skb2event(skb); + sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize); + + /* Include the notification structure */ + sac = (struct sctp_assoc_change *) + skb_push(skb, sizeof(struct sctp_assoc_change)); + + /* Trim the buffer to the right length. 
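+ * That is, the notification header plus the ABORT chunk's payload (its
+ * length minus the chunk header), so the error causes end up in sac_info.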
*/ + skb_trim(skb, sizeof(struct sctp_assoc_change) + + ntohs(chunk->chunk_hdr->length) - + sizeof(sctp_chunkhdr_t)); + } else { + event = sctp_ulpevent_new(sizeof(struct sctp_assoc_change), + MSG_NOTIFICATION, gfp); + if (!event) + goto fail; + + skb = sctp_event2skb(event); + sac = (struct sctp_assoc_change *) skb_put(skb, + sizeof(struct sctp_assoc_change)); + } + + /* Socket Extensions for SCTP + * 5.3.1.1 SCTP_ASSOC_CHANGE + * + * sac_type: + * It should be SCTP_ASSOC_CHANGE. + */ + sac->sac_type = SCTP_ASSOC_CHANGE; + + /* Socket Extensions for SCTP + * 5.3.1.1 SCTP_ASSOC_CHANGE + * + * sac_state: 32 bits (signed integer) + * This field holds one of a number of values that communicate the + * event that happened to the association. + */ + sac->sac_state = state; + + /* Socket Extensions for SCTP + * 5.3.1.1 SCTP_ASSOC_CHANGE + * + * sac_flags: 16 bits (unsigned integer) + * Currently unused. + */ + sac->sac_flags = 0; + + /* Socket Extensions for SCTP + * 5.3.1.1 SCTP_ASSOC_CHANGE + * + * sac_length: sizeof (__u32) + * This field is the total length of the notification data, including + * the notification header. + */ + sac->sac_length = skb->len; + + /* Socket Extensions for SCTP + * 5.3.1.1 SCTP_ASSOC_CHANGE + * + * sac_error: 32 bits (signed integer) + * + * If the state was reached due to a error condition (e.g. + * COMMUNICATION_LOST) any relevant error information is available in + * this field. This corresponds to the protocol error codes defined in + * [SCTP]. + */ + sac->sac_error = error; + + /* Socket Extensions for SCTP + * 5.3.1.1 SCTP_ASSOC_CHANGE + * + * sac_outbound_streams: 16 bits (unsigned integer) + * sac_inbound_streams: 16 bits (unsigned integer) + * + * The maximum number of streams allowed in each direction are + * available in sac_outbound_streams and sac_inbound streams. + */ + sac->sac_outbound_streams = outbound; + sac->sac_inbound_streams = inbound; + + /* Socket Extensions for SCTP + * 5.3.1.1 SCTP_ASSOC_CHANGE + * + * sac_assoc_id: sizeof (sctp_assoc_t) + * + * The association id field, holds the identifier for the association. + * All notifications for a given association have the same association + * identifier. For TCP style socket, this field is ignored. + */ + sctp_ulpevent_set_owner(event, asoc); + sac->sac_assoc_id = sctp_assoc2id(asoc); + + return event; + +fail: + return NULL; +} + +/* Create and initialize an SCTP_PEER_ADDR_CHANGE event. + * + * Socket Extensions for SCTP - draft-01 + * 5.3.1.2 SCTP_PEER_ADDR_CHANGE + * + * When a destination address on a multi-homed peer encounters a change + * an interface details event is sent. + */ +struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change( + const struct sctp_association *asoc, + const struct sockaddr_storage *aaddr, + int flags, int state, int error, gfp_t gfp) +{ + struct sctp_ulpevent *event; + struct sctp_paddr_change *spc; + struct sk_buff *skb; + + event = sctp_ulpevent_new(sizeof(struct sctp_paddr_change), + MSG_NOTIFICATION, gfp); + if (!event) + goto fail; + + skb = sctp_event2skb(event); + spc = (struct sctp_paddr_change *) + skb_put(skb, sizeof(struct sctp_paddr_change)); + + /* Sockets API Extensions for SCTP + * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE + * + * spc_type: + * + * It should be SCTP_PEER_ADDR_CHANGE. + */ + spc->spc_type = SCTP_PEER_ADDR_CHANGE; + + /* Sockets API Extensions for SCTP + * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE + * + * spc_length: sizeof (__u32) + * + * This field is the total length of the notification data, including + * the notification header. 
+ */ + spc->spc_length = sizeof(struct sctp_paddr_change); + + /* Sockets API Extensions for SCTP + * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE + * + * spc_flags: 16 bits (unsigned integer) + * Currently unused. + */ + spc->spc_flags = 0; + + /* Sockets API Extensions for SCTP + * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE + * + * spc_state: 32 bits (signed integer) + * + * This field holds one of a number of values that communicate the + * event that happened to the address. + */ + spc->spc_state = state; + + /* Sockets API Extensions for SCTP + * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE + * + * spc_error: 32 bits (signed integer) + * + * If the state was reached due to any error condition (e.g. + * ADDRESS_UNREACHABLE) any relevant error information is available in + * this field. + */ + spc->spc_error = error; + + /* Socket Extensions for SCTP + * 5.3.1.1 SCTP_ASSOC_CHANGE + * + * spc_assoc_id: sizeof (sctp_assoc_t) + * + * The association id field, holds the identifier for the association. + * All notifications for a given association have the same association + * identifier. For TCP style socket, this field is ignored. + */ + sctp_ulpevent_set_owner(event, asoc); + spc->spc_assoc_id = sctp_assoc2id(asoc); + + /* Sockets API Extensions for SCTP + * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE + * + * spc_aaddr: sizeof (struct sockaddr_storage) + * + * The affected address field, holds the remote peer's address that is + * encountering the change of state. + */ + memcpy(&spc->spc_aaddr, aaddr, sizeof(struct sockaddr_storage)); + + /* Map ipv4 address into v4-mapped-on-v6 address. */ + sctp_get_pf_specific(asoc->base.sk->sk_family)->addr_v4map( + sctp_sk(asoc->base.sk), + (union sctp_addr *)&spc->spc_aaddr); + + return event; + +fail: + return NULL; +} + +/* Create and initialize an SCTP_REMOTE_ERROR notification. + * + * Note: This assumes that the chunk->skb->data already points to the + * operation error payload. + * + * Socket Extensions for SCTP - draft-01 + * 5.3.1.3 SCTP_REMOTE_ERROR + * + * A remote peer may send an Operational Error message to its peer. + * This message indicates a variety of error conditions on an + * association. The entire error TLV as it appears on the wire is + * included in a SCTP_REMOTE_ERROR event. Please refer to the SCTP + * specification [SCTP] and any extensions for a list of possible + * error formats. + */ +struct sctp_ulpevent *sctp_ulpevent_make_remote_error( + const struct sctp_association *asoc, struct sctp_chunk *chunk, + __u16 flags, gfp_t gfp) +{ + struct sctp_ulpevent *event; + struct sctp_remote_error *sre; + struct sk_buff *skb; + sctp_errhdr_t *ch; + __be16 cause; + int elen; + + ch = (sctp_errhdr_t *)(chunk->skb->data); + cause = ch->cause; + elen = WORD_ROUND(ntohs(ch->length)) - sizeof(sctp_errhdr_t); + + /* Pull off the ERROR header. */ + skb_pull(chunk->skb, sizeof(sctp_errhdr_t)); + + /* Copy the skb to a new skb with room for us to prepend + * notification with. + */ + skb = skb_copy_expand(chunk->skb, sizeof(struct sctp_remote_error), + 0, gfp); + + /* Pull off the rest of the cause TLV from the chunk. */ + skb_pull(chunk->skb, elen); + if (!skb) + goto fail; + + /* Embed the event fields inside the cloned skb. */ + event = sctp_skb2event(skb); + sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize); + + sre = (struct sctp_remote_error *) + skb_push(skb, sizeof(struct sctp_remote_error)); + + /* Trim the buffer to the right length. 
*/ + skb_trim(skb, sizeof(struct sctp_remote_error) + elen); + + /* Socket Extensions for SCTP + * 5.3.1.3 SCTP_REMOTE_ERROR + * + * sre_type: + * It should be SCTP_REMOTE_ERROR. + */ + sre->sre_type = SCTP_REMOTE_ERROR; + + /* + * Socket Extensions for SCTP + * 5.3.1.3 SCTP_REMOTE_ERROR + * + * sre_flags: 16 bits (unsigned integer) + * Currently unused. + */ + sre->sre_flags = 0; + + /* Socket Extensions for SCTP + * 5.3.1.3 SCTP_REMOTE_ERROR + * + * sre_length: sizeof (__u32) + * + * This field is the total length of the notification data, + * including the notification header. + */ + sre->sre_length = skb->len; + + /* Socket Extensions for SCTP + * 5.3.1.3 SCTP_REMOTE_ERROR + * + * sre_error: 16 bits (unsigned integer) + * This value represents one of the Operational Error causes defined in + * the SCTP specification, in network byte order. + */ + sre->sre_error = cause; + + /* Socket Extensions for SCTP + * 5.3.1.3 SCTP_REMOTE_ERROR + * + * sre_assoc_id: sizeof (sctp_assoc_t) + * + * The association id field, holds the identifier for the association. + * All notifications for a given association have the same association + * identifier. For TCP style socket, this field is ignored. + */ + sctp_ulpevent_set_owner(event, asoc); + sre->sre_assoc_id = sctp_assoc2id(asoc); + + return event; + +fail: + return NULL; +} + +/* Create and initialize a SCTP_SEND_FAILED notification. + * + * Socket Extensions for SCTP - draft-01 + * 5.3.1.4 SCTP_SEND_FAILED + */ +struct sctp_ulpevent *sctp_ulpevent_make_send_failed( + const struct sctp_association *asoc, struct sctp_chunk *chunk, + __u16 flags, __u32 error, gfp_t gfp) +{ + struct sctp_ulpevent *event; + struct sctp_send_failed *ssf; + struct sk_buff *skb; + + /* Pull off any padding. */ + int len = ntohs(chunk->chunk_hdr->length); + + /* Make skb with more room so we can prepend notification. */ + skb = skb_copy_expand(chunk->skb, + sizeof(struct sctp_send_failed), /* headroom */ + 0, /* tailroom */ + gfp); + if (!skb) + goto fail; + + /* Pull off the common chunk header and DATA header. */ + skb_pull(skb, sizeof(struct sctp_data_chunk)); + len -= sizeof(struct sctp_data_chunk); + + /* Embed the event fields inside the cloned skb. */ + event = sctp_skb2event(skb); + sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize); + + ssf = (struct sctp_send_failed *) + skb_push(skb, sizeof(struct sctp_send_failed)); + + /* Socket Extensions for SCTP + * 5.3.1.4 SCTP_SEND_FAILED + * + * ssf_type: + * It should be SCTP_SEND_FAILED. + */ + ssf->ssf_type = SCTP_SEND_FAILED; + + /* Socket Extensions for SCTP + * 5.3.1.4 SCTP_SEND_FAILED + * + * ssf_flags: 16 bits (unsigned integer) + * The flag value will take one of the following values + * + * SCTP_DATA_UNSENT - Indicates that the data was never put on + * the wire. + * + * SCTP_DATA_SENT - Indicates that the data was put on the wire. + * Note that this does not necessarily mean that the + * data was (or was not) successfully delivered. + */ + ssf->ssf_flags = flags; + + /* Socket Extensions for SCTP + * 5.3.1.4 SCTP_SEND_FAILED + * + * ssf_length: sizeof (__u32) + * This field is the total length of the notification data, including + * the notification header. 
+ */ + ssf->ssf_length = sizeof(struct sctp_send_failed) + len; + skb_trim(skb, ssf->ssf_length); + + /* Socket Extensions for SCTP + * 5.3.1.4 SCTP_SEND_FAILED + * + * ssf_error: 16 bits (unsigned integer) + * This value represents the reason why the send failed, and if set, + * will be a SCTP protocol error code as defined in [SCTP] section + * 3.3.10. + */ + ssf->ssf_error = error; + + /* Socket Extensions for SCTP + * 5.3.1.4 SCTP_SEND_FAILED + * + * ssf_info: sizeof (struct sctp_sndrcvinfo) + * The original send information associated with the undelivered + * message. + */ + memcpy(&ssf->ssf_info, &chunk->sinfo, sizeof(struct sctp_sndrcvinfo)); + + /* Per TSVWG discussion with Randy. Allow the application to + * reassemble a fragmented message. + */ + ssf->ssf_info.sinfo_flags = chunk->chunk_hdr->flags; + + /* Socket Extensions for SCTP + * 5.3.1.4 SCTP_SEND_FAILED + * + * ssf_assoc_id: sizeof (sctp_assoc_t) + * The association id field, sf_assoc_id, holds the identifier for the + * association. All notifications for a given association have the + * same association identifier. For TCP style socket, this field is + * ignored. + */ + sctp_ulpevent_set_owner(event, asoc); + ssf->ssf_assoc_id = sctp_assoc2id(asoc); + return event; + +fail: + return NULL; +} + +/* Create and initialize a SCTP_SHUTDOWN_EVENT notification. + * + * Socket Extensions for SCTP - draft-01 + * 5.3.1.5 SCTP_SHUTDOWN_EVENT + */ +struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event( + const struct sctp_association *asoc, + __u16 flags, gfp_t gfp) +{ + struct sctp_ulpevent *event; + struct sctp_shutdown_event *sse; + struct sk_buff *skb; + + event = sctp_ulpevent_new(sizeof(struct sctp_shutdown_event), + MSG_NOTIFICATION, gfp); + if (!event) + goto fail; + + skb = sctp_event2skb(event); + sse = (struct sctp_shutdown_event *) + skb_put(skb, sizeof(struct sctp_shutdown_event)); + + /* Socket Extensions for SCTP + * 5.3.1.5 SCTP_SHUTDOWN_EVENT + * + * sse_type + * It should be SCTP_SHUTDOWN_EVENT + */ + sse->sse_type = SCTP_SHUTDOWN_EVENT; + + /* Socket Extensions for SCTP + * 5.3.1.5 SCTP_SHUTDOWN_EVENT + * + * sse_flags: 16 bits (unsigned integer) + * Currently unused. + */ + sse->sse_flags = 0; + + /* Socket Extensions for SCTP + * 5.3.1.5 SCTP_SHUTDOWN_EVENT + * + * sse_length: sizeof (__u32) + * This field is the total length of the notification data, including + * the notification header. + */ + sse->sse_length = sizeof(struct sctp_shutdown_event); + + /* Socket Extensions for SCTP + * 5.3.1.5 SCTP_SHUTDOWN_EVENT + * + * sse_assoc_id: sizeof (sctp_assoc_t) + * The association id field, holds the identifier for the association. + * All notifications for a given association have the same association + * identifier. For TCP style socket, this field is ignored. + */ + sctp_ulpevent_set_owner(event, asoc); + sse->sse_assoc_id = sctp_assoc2id(asoc); + + return event; + +fail: + return NULL; +} + +/* Create and initialize a SCTP_ADAPTATION_INDICATION notification. 
+ * + * Socket Extensions for SCTP + * 5.3.1.6 SCTP_ADAPTATION_INDICATION + */ +struct sctp_ulpevent *sctp_ulpevent_make_adaptation_indication( + const struct sctp_association *asoc, gfp_t gfp) +{ + struct sctp_ulpevent *event; + struct sctp_adaptation_event *sai; + struct sk_buff *skb; + + event = sctp_ulpevent_new(sizeof(struct sctp_adaptation_event), + MSG_NOTIFICATION, gfp); + if (!event) + goto fail; + + skb = sctp_event2skb(event); + sai = (struct sctp_adaptation_event *) + skb_put(skb, sizeof(struct sctp_adaptation_event)); + + sai->sai_type = SCTP_ADAPTATION_INDICATION; + sai->sai_flags = 0; + sai->sai_length = sizeof(struct sctp_adaptation_event); + sai->sai_adaptation_ind = asoc->peer.adaptation_ind; + sctp_ulpevent_set_owner(event, asoc); + sai->sai_assoc_id = sctp_assoc2id(asoc); + + return event; + +fail: + return NULL; +} + +/* A message has been received. Package this message as a notification + * to pass it to the upper layers. Go ahead and calculate the sndrcvinfo + * even if filtered out later. + * + * Socket Extensions for SCTP + * 5.2.2 SCTP Header Information Structure (SCTP_SNDRCV) + */ +struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, + struct sctp_chunk *chunk, + gfp_t gfp) +{ + struct sctp_ulpevent *event = NULL; + struct sk_buff *skb; + size_t padding, len; + int rx_count; + + /* + * check to see if we need to make space for this + * new skb, expand the rcvbuffer if needed, or drop + * the frame + */ + if (asoc->ep->rcvbuf_policy) + rx_count = atomic_read(&asoc->rmem_alloc); + else + rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc); + + if (rx_count >= asoc->base.sk->sk_rcvbuf) { + + if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) || + (!sk_rmem_schedule(asoc->base.sk, chunk->skb->truesize))) + goto fail; + } + + /* Clone the original skb, sharing the data. */ + skb = skb_clone(chunk->skb, gfp); + if (!skb) + goto fail; + + /* Now that all memory allocations for this chunk succeeded, we + * can mark it as received so the tsn_map is updated correctly. + */ + if (sctp_tsnmap_mark(&asoc->peer.tsn_map, + ntohl(chunk->subh.data_hdr->tsn))) + goto fail_mark; + + /* First calculate the padding, so we don't inadvertently + * pass up the wrong length to the user. + * + * RFC 2960 - Section 3.2 Chunk Field Descriptions + * + * The total length of a chunk(including Type, Length and Value fields) + * MUST be a multiple of 4 bytes. If the length of the chunk is not a + * multiple of 4 bytes, the sender MUST pad the chunk with all zero + * bytes and this padding is not included in the chunk length field. + * The sender should never pad with more than 3 bytes. The receiver + * MUST ignore the padding bytes. + */ + len = ntohs(chunk->chunk_hdr->length); + padding = WORD_ROUND(len) - len; + + /* Fixup cloned skb with just this chunks data. */ + skb_trim(skb, chunk->chunk_end - padding - skb->data); + + /* Embed the event fields inside the cloned skb. */ + event = sctp_skb2event(skb); + + /* Initialize event with flags 0 and correct length + * Since this is a clone of the original skb, only account for + * the data of this chunk as other chunks will be accounted separately. 
+ */ + sctp_ulpevent_init(event, 0, skb->len + sizeof(struct sk_buff)); + + sctp_ulpevent_receive_data(event, asoc); + + event->stream = ntohs(chunk->subh.data_hdr->stream); + event->ssn = ntohs(chunk->subh.data_hdr->ssn); + event->ppid = chunk->subh.data_hdr->ppid; + if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { + event->flags |= SCTP_UNORDERED; + event->cumtsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); + } + event->tsn = ntohl(chunk->subh.data_hdr->tsn); + event->msg_flags |= chunk->chunk_hdr->flags; + event->iif = sctp_chunk_iif(chunk); + + return event; + +fail_mark: + kfree_skb(skb); +fail: + return NULL; +} + +/* Create a partial delivery related event. + * + * 5.3.1.7 SCTP_PARTIAL_DELIVERY_EVENT + * + * When a receiver is engaged in a partial delivery of a + * message this notification will be used to indicate + * various events. + */ +struct sctp_ulpevent *sctp_ulpevent_make_pdapi( + const struct sctp_association *asoc, __u32 indication, + gfp_t gfp) +{ + struct sctp_ulpevent *event; + struct sctp_pdapi_event *pd; + struct sk_buff *skb; + + event = sctp_ulpevent_new(sizeof(struct sctp_pdapi_event), + MSG_NOTIFICATION, gfp); + if (!event) + goto fail; + + skb = sctp_event2skb(event); + pd = (struct sctp_pdapi_event *) + skb_put(skb, sizeof(struct sctp_pdapi_event)); + + /* pdapi_type + * It should be SCTP_PARTIAL_DELIVERY_EVENT + * + * pdapi_flags: 16 bits (unsigned integer) + * Currently unused. + */ + pd->pdapi_type = SCTP_PARTIAL_DELIVERY_EVENT; + pd->pdapi_flags = 0; + + /* pdapi_length: 32 bits (unsigned integer) + * + * This field is the total length of the notification data, including + * the notification header. It will generally be sizeof (struct + * sctp_pdapi_event). + */ + pd->pdapi_length = sizeof(struct sctp_pdapi_event); + + /* pdapi_indication: 32 bits (unsigned integer) + * + * This field holds the indication being sent to the application. + */ + pd->pdapi_indication = indication; + + /* pdapi_assoc_id: sizeof (sctp_assoc_t) + * + * The association id field, holds the identifier for the association. + */ + sctp_ulpevent_set_owner(event, asoc); + pd->pdapi_assoc_id = sctp_assoc2id(asoc); + + return event; +fail: + return NULL; +} + +struct sctp_ulpevent *sctp_ulpevent_make_authkey( + const struct sctp_association *asoc, __u16 key_id, + __u32 indication, gfp_t gfp) +{ + struct sctp_ulpevent *event; + struct sctp_authkey_event *ak; + struct sk_buff *skb; + + event = sctp_ulpevent_new(sizeof(struct sctp_authkey_event), + MSG_NOTIFICATION, gfp); + if (!event) + goto fail; + + skb = sctp_event2skb(event); + ak = (struct sctp_authkey_event *) + skb_put(skb, sizeof(struct sctp_authkey_event)); + + ak->auth_type = SCTP_AUTHENTICATION_EVENT; + ak->auth_flags = 0; + ak->auth_length = sizeof(struct sctp_authkey_event); + + ak->auth_keynumber = key_id; + ak->auth_altkeynumber = 0; + ak->auth_indication = indication; + + /* + * The association id field, holds the identifier for the association. + */ + sctp_ulpevent_set_owner(event, asoc); + ak->auth_assoc_id = sctp_assoc2id(asoc); + + return event; +fail: + return NULL; +} + +/* + * Socket Extensions for SCTP + * 6.3.10. 
SCTP_SENDER_DRY_EVENT + */ +struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event( + const struct sctp_association *asoc, gfp_t gfp) +{ + struct sctp_ulpevent *event; + struct sctp_sender_dry_event *sdry; + struct sk_buff *skb; + + event = sctp_ulpevent_new(sizeof(struct sctp_sender_dry_event), + MSG_NOTIFICATION, gfp); + if (!event) + return NULL; + + skb = sctp_event2skb(event); + sdry = (struct sctp_sender_dry_event *) + skb_put(skb, sizeof(struct sctp_sender_dry_event)); + + sdry->sender_dry_type = SCTP_SENDER_DRY_EVENT; + sdry->sender_dry_flags = 0; + sdry->sender_dry_length = sizeof(struct sctp_sender_dry_event); + sctp_ulpevent_set_owner(event, asoc); + sdry->sender_dry_assoc_id = sctp_assoc2id(asoc); + + return event; +} + +/* Return the notification type, assuming this is a notification + * event. + */ +__u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event) +{ + union sctp_notification *notification; + struct sk_buff *skb; + + skb = sctp_event2skb(event); + notification = (union sctp_notification *) skb->data; + return notification->sn_header.sn_type; +} + +/* Copy out the sndrcvinfo into a msghdr. */ +void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event, + struct msghdr *msghdr) +{ + struct sctp_sndrcvinfo sinfo; + + if (sctp_ulpevent_is_notification(event)) + return; + + /* Sockets API Extensions for SCTP + * Section 5.2.2 SCTP Header Information Structure (SCTP_SNDRCV) + * + * sinfo_stream: 16 bits (unsigned integer) + * + * For recvmsg() the SCTP stack places the message's stream number in + * this value. + */ + sinfo.sinfo_stream = event->stream; + /* sinfo_ssn: 16 bits (unsigned integer) + * + * For recvmsg() this value contains the stream sequence number that + * the remote endpoint placed in the DATA chunk. For fragmented + * messages this is the same number for all deliveries of the message + * (if more than one recvmsg() is needed to read the message). + */ + sinfo.sinfo_ssn = event->ssn; + /* sinfo_ppid: 32 bits (unsigned integer) + * + * In recvmsg() this value is + * the same information that was passed by the upper layer in the peer + * application. Please note that byte order issues are NOT accounted + * for and this information is passed opaquely by the SCTP stack from + * one end to the other. + */ + sinfo.sinfo_ppid = event->ppid; + /* sinfo_flags: 16 bits (unsigned integer) + * + * This field may contain any of the following flags and is composed of + * a bitwise OR of these values. + * + * recvmsg() flags: + * + * SCTP_UNORDERED - This flag is present when the message was sent + * non-ordered. + */ + sinfo.sinfo_flags = event->flags; + /* sinfo_tsn: 32 bit (unsigned integer) + * + * For the receiving side, this field holds a TSN that was + * assigned to one of the SCTP Data Chunks. + */ + sinfo.sinfo_tsn = event->tsn; + /* sinfo_cumtsn: 32 bit (unsigned integer) + * + * This field will hold the current cumulative TSN as + * known by the underlying SCTP layer. Note this field is + * ignored when sending and only valid for a receive + * operation when sinfo_flags are set to SCTP_UNORDERED. + */ + sinfo.sinfo_cumtsn = event->cumtsn; + /* sinfo_assoc_id: sizeof (sctp_assoc_t) + * + * The association handle field, sinfo_assoc_id, holds the identifier + * for the association announced in the COMMUNICATION_UP notification. + * All notifications for a given association have the same identifier. + * Ignored for one-to-one style sockets. 
+ */ + sinfo.sinfo_assoc_id = sctp_assoc2id(event->asoc); + + /* context value that is set via SCTP_CONTEXT socket option. */ + sinfo.sinfo_context = event->asoc->default_rcv_context; + + /* These fields are not used while receiving. */ + sinfo.sinfo_timetolive = 0; + + put_cmsg(msghdr, IPPROTO_SCTP, SCTP_SNDRCV, + sizeof(struct sctp_sndrcvinfo), (void *)&sinfo); +} + +/* Do accounting for bytes received and hold a reference to the association + * for each skb. + */ +static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, + struct sctp_association *asoc) +{ + struct sk_buff *skb, *frag; + + skb = sctp_event2skb(event); + /* Set the owner and charge rwnd for bytes received. */ + sctp_ulpevent_set_owner(event, asoc); + sctp_assoc_rwnd_decrease(asoc, skb_headlen(skb)); + + if (!skb->data_len) + return; + + /* Note: Not clearing the entire event struct as this is just a + * fragment of the real event. However, we still need to do rwnd + * accounting. + * In general, the skb passed from IP can have only 1 level of + * fragments. But we allow multiple levels of fragments. + */ + skb_walk_frags(skb, frag) + sctp_ulpevent_receive_data(sctp_skb2event(frag), asoc); +} + +/* Do accounting for bytes just read by user and release the references to + * the association. + */ +static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) +{ + struct sk_buff *skb, *frag; + unsigned int len; + + /* Current stack structures assume that the rcv buffer is + * per socket. For UDP style sockets this is not true as + * multiple associations may be on a single UDP-style socket. + * Use the local private area of the skb to track the owning + * association. + */ + + skb = sctp_event2skb(event); + len = skb->len; + + if (!skb->data_len) + goto done; + + /* Don't forget the fragments. */ + skb_walk_frags(skb, frag) { + /* NOTE: skb_shinfos are recursive. Although IP returns + * skb's with only 1 level of fragments, SCTP reassembly can + * increase the levels. + */ + sctp_ulpevent_release_frag_data(sctp_skb2event(frag)); + } + +done: + sctp_assoc_rwnd_increase(event->asoc, len); + sctp_ulpevent_release_owner(event); +} + +static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event) +{ + struct sk_buff *skb, *frag; + + skb = sctp_event2skb(event); + + if (!skb->data_len) + goto done; + + /* Don't forget the fragments. */ + skb_walk_frags(skb, frag) { + /* NOTE: skb_shinfos are recursive. Although IP returns + * skb's with only 1 level of fragments, SCTP reassembly can + * increase the levels. + */ + sctp_ulpevent_release_frag_data(sctp_skb2event(frag)); + } + +done: + sctp_ulpevent_release_owner(event); +} + +/* Free a ulpevent that has an owner. It includes releasing the reference + * to the owner, updating the rwnd in case of a DATA event and freeing the + * skb. + */ +void sctp_ulpevent_free(struct sctp_ulpevent *event) +{ + if (sctp_ulpevent_is_notification(event)) + sctp_ulpevent_release_owner(event); + else + sctp_ulpevent_release_data(event); + + kfree_skb(sctp_event2skb(event)); +} + +/* Purge the skb lists holding ulpevents. 
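For context, a hypothetical userspace sketch (illustrative only, not part of this patch) of how the events built by the sctp_ulpevent_make_*() constructors and the SCTP_SNDRCV ancillary data filled in by sctp_ulpevent_read_sndrcvinfo() typically reach an application. It assumes the usual lksctp <netinet/sctp.h> definitions; setup_events() and read_one_message() are made-up names and error handling is abbreviated.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <netinet/sctp.h>

/* Illustrative: subscribe to data-io info and association change events
 * (this is what sctp_ulpevent_is_enabled() checks against on the kernel
 * side before queueing an event to the socket).
 */
static void setup_events(int fd)
{
	struct sctp_event_subscribe ev;

	memset(&ev, 0, sizeof(ev));
	ev.sctp_data_io_event = 1;
	ev.sctp_association_event = 1;
	setsockopt(fd, IPPROTO_SCTP, SCTP_EVENTS, &ev, sizeof(ev));
}

/* Illustrative: a single recvmsg() returns either a notification
 * (MSG_NOTIFICATION set, payload is a union sctp_notification) or user
 * data with a struct sctp_sndrcvinfo in the SCTP_SNDRCV cmsg.
 */
static void read_one_message(int fd)
{
	char data[8192];
	char cbuf[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
	struct iovec iov = { data, sizeof(data) };
	struct msghdr msg;
	struct cmsghdr *c;
	ssize_t n;

	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	n = recvmsg(fd, &msg, 0);
	if (n <= 0)
		return;

	if (msg.msg_flags & MSG_NOTIFICATION) {
		union sctp_notification *sn = (union sctp_notification *)data;

		if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE)
			printf("assoc change: state %d error %d\n",
			       sn->sn_assoc_change.sac_state,
			       sn->sn_assoc_change.sac_error);
		return;
	}

	for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c)) {
		if (c->cmsg_level == IPPROTO_SCTP && c->cmsg_type == SCTP_SNDRCV) {
			struct sctp_sndrcvinfo si;

			memcpy(&si, CMSG_DATA(c), sizeof(si));
			printf("%zd bytes on stream %u, ssn %u%s\n", n,
			       si.sinfo_stream, si.sinfo_ssn,
			       (msg.msg_flags & MSG_EOR) ? " (end of message)" : "");
		}
	}
}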
*/ +unsigned int sctp_queue_purge_ulpevents(struct sk_buff_head *list) +{ + struct sk_buff *skb; + unsigned int data_unread = 0; + + while ((skb = skb_dequeue(list)) != NULL) { + struct sctp_ulpevent *event = sctp_skb2event(skb); + + if (!sctp_ulpevent_is_notification(event)) + data_unread += skb->len; + + sctp_ulpevent_free(event); + } + + return data_unread; +} diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c new file mode 100644 index 00000000..f2d1de7f --- /dev/null +++ b/net/sctp/ulpqueue.c @@ -0,0 +1,1088 @@ +/* SCTP kernel implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001 Intel Corp. + * Copyright (c) 2001 Nokia, Inc. + * Copyright (c) 2001 La Monte H.P. Yarroll + * + * This abstraction carries sctp events to the ULP (sockets). + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * Jon Grimm + * La Monte H.P. Yarroll + * Sridhar Samudrala + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* Forward declarations for internal helpers. */ +static struct sctp_ulpevent * sctp_ulpq_reasm(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *); +static struct sctp_ulpevent * sctp_ulpq_order(struct sctp_ulpq *, + struct sctp_ulpevent *); +static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq); + +/* 1st Level Abstractions */ + +/* Initialize a ULP queue from a block of memory. */ +struct sctp_ulpq *sctp_ulpq_init(struct sctp_ulpq *ulpq, + struct sctp_association *asoc) +{ + memset(ulpq, 0, sizeof(struct sctp_ulpq)); + + ulpq->asoc = asoc; + skb_queue_head_init(&ulpq->reasm); + skb_queue_head_init(&ulpq->lobby); + ulpq->pd_mode = 0; + ulpq->malloced = 0; + + return ulpq; +} + + +/* Flush the reassembly and ordering queues. */ +void sctp_ulpq_flush(struct sctp_ulpq *ulpq) +{ + struct sk_buff *skb; + struct sctp_ulpevent *event; + + while ((skb = __skb_dequeue(&ulpq->lobby)) != NULL) { + event = sctp_skb2event(skb); + sctp_ulpevent_free(event); + } + + while ((skb = __skb_dequeue(&ulpq->reasm)) != NULL) { + event = sctp_skb2event(skb); + sctp_ulpevent_free(event); + } + +} + +/* Dispose of a ulpqueue. */ +void sctp_ulpq_free(struct sctp_ulpq *ulpq) +{ + sctp_ulpq_flush(ulpq); + if (ulpq->malloced) + kfree(ulpq); +} + +/* Process an incoming DATA chunk. 
*/ +int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, + gfp_t gfp) +{ + struct sk_buff_head temp; + struct sctp_ulpevent *event; + + /* Create an event from the incoming chunk. */ + event = sctp_ulpevent_make_rcvmsg(chunk->asoc, chunk, gfp); + if (!event) + return -ENOMEM; + + /* Do reassembly if needed. */ + event = sctp_ulpq_reasm(ulpq, event); + + /* Do ordering if needed. */ + if ((event) && (event->msg_flags & MSG_EOR)){ + /* Create a temporary list to collect chunks on. */ + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + + event = sctp_ulpq_order(ulpq, event); + } + + /* Send event to the ULP. 'event' is the sctp_ulpevent for + * very first SKB on the 'temp' list. + */ + if (event) + sctp_ulpq_tail_event(ulpq, event); + + return 0; +} + +/* Add a new event for propagation to the ULP. */ +/* Clear the partial delivery mode for this socket. Note: This + * assumes that no association is currently in partial delivery mode. + */ +int sctp_clear_pd(struct sock *sk, struct sctp_association *asoc) +{ + struct sctp_sock *sp = sctp_sk(sk); + + if (atomic_dec_and_test(&sp->pd_mode)) { + /* This means there are no other associations in PD, so + * we can go ahead and clear out the lobby in one shot + */ + if (!skb_queue_empty(&sp->pd_lobby)) { + struct list_head *list; + sctp_skb_list_tail(&sp->pd_lobby, &sk->sk_receive_queue); + list = (struct list_head *)&sctp_sk(sk)->pd_lobby; + INIT_LIST_HEAD(list); + return 1; + } + } else { + /* There are other associations in PD, so we only need to + * pull stuff out of the lobby that belongs to the + * associations that is exiting PD (all of its notifications + * are posted here). + */ + if (!skb_queue_empty(&sp->pd_lobby) && asoc) { + struct sk_buff *skb, *tmp; + struct sctp_ulpevent *event; + + sctp_skb_for_each(skb, &sp->pd_lobby, tmp) { + event = sctp_skb2event(skb); + if (event->asoc == asoc) { + __skb_unlink(skb, &sp->pd_lobby); + __skb_queue_tail(&sk->sk_receive_queue, + skb); + } + } + } + } + + return 0; +} + +/* Set the pd_mode on the socket and ulpq */ +static void sctp_ulpq_set_pd(struct sctp_ulpq *ulpq) +{ + struct sctp_sock *sp = sctp_sk(ulpq->asoc->base.sk); + + atomic_inc(&sp->pd_mode); + ulpq->pd_mode = 1; +} + +/* Clear the pd_mode and restart any pending messages waiting for delivery. */ +static int sctp_ulpq_clear_pd(struct sctp_ulpq *ulpq) +{ + ulpq->pd_mode = 0; + sctp_ulpq_reasm_drain(ulpq); + return sctp_clear_pd(ulpq->asoc->base.sk, ulpq->asoc); +} + +/* If the SKB of 'event' is on a list, it is the first such member + * of that list. + */ +int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) +{ + struct sock *sk = ulpq->asoc->base.sk; + struct sk_buff_head *queue, *skb_list; + struct sk_buff *skb = sctp_event2skb(event); + int clear_pd = 0; + + skb_list = (struct sk_buff_head *) skb->prev; + + /* If the socket is just going to throw this away, do not + * even try to deliver it. + */ + if (sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN)) + goto out_free; + + /* Check if the user wishes to receive this event. */ + if (!sctp_ulpevent_is_enabled(event, &sctp_sk(sk)->subscribe)) + goto out_free; + + /* If we are in partial delivery mode, post to the lobby until + * partial delivery is cleared, unless, of course _this_ is + * the association the cause of the partial delivery. 
+ */ + + if (atomic_read(&sctp_sk(sk)->pd_mode) == 0) { + queue = &sk->sk_receive_queue; + } else { + if (ulpq->pd_mode) { + /* If the association is in partial delivery, we + * need to finish delivering the partially processed + * packet before passing any other data. This is + * because we don't truly support stream interleaving. + */ + if ((event->msg_flags & MSG_NOTIFICATION) || + (SCTP_DATA_NOT_FRAG == + (event->msg_flags & SCTP_DATA_FRAG_MASK))) + queue = &sctp_sk(sk)->pd_lobby; + else { + clear_pd = event->msg_flags & MSG_EOR; + queue = &sk->sk_receive_queue; + } + } else { + /* + * If fragment interleave is enabled, we + * can queue this to the receive queue instead + * of the lobby. + */ + if (sctp_sk(sk)->frag_interleave) + queue = &sk->sk_receive_queue; + else + queue = &sctp_sk(sk)->pd_lobby; + } + } + + /* If we are harvesting multiple skbs they will be + * collected on a list. + */ + if (skb_list) + sctp_skb_list_tail(skb_list, queue); + else + __skb_queue_tail(queue, skb); + + /* Did we just complete partial delivery and need to get + * rolling again? Move pending data to the receive + * queue. + */ + if (clear_pd) + sctp_ulpq_clear_pd(ulpq); + + if (queue == &sk->sk_receive_queue) + sk->sk_data_ready(sk, 0); + return 1; + +out_free: + if (skb_list) + sctp_queue_purge_ulpevents(skb_list); + else + sctp_ulpevent_free(event); + + return 0; +} + +/* 2nd Level Abstractions */ + +/* Helper function to store chunks that need to be reassembled. */ +static void sctp_ulpq_store_reasm(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sk_buff *pos; + struct sctp_ulpevent *cevent; + __u32 tsn, ctsn; + + tsn = event->tsn; + + /* See if it belongs at the end. */ + pos = skb_peek_tail(&ulpq->reasm); + if (!pos) { + __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); + return; + } + + /* Short circuit just dropping it at the end. */ + cevent = sctp_skb2event(pos); + ctsn = cevent->tsn; + if (TSN_lt(ctsn, tsn)) { + __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); + return; + } + + /* Find the right place in this list. We store them by TSN. */ + skb_queue_walk(&ulpq->reasm, pos) { + cevent = sctp_skb2event(pos); + ctsn = cevent->tsn; + + if (TSN_lt(tsn, ctsn)) + break; + } + + /* Insert before pos. */ + __skb_queue_before(&ulpq->reasm, pos, sctp_event2skb(event)); + +} + +/* Helper function to return an event corresponding to the reassembled + * datagram. + * This routine creates a re-assembled skb given the first and last skb's + * as stored in the reassembly queue. The skb's may be non-linear if the sctp + * payload was fragmented on the way and ip had to reassemble them. + * We add the rest of skb's to the first skb's fraglist. + */ +static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff_head *queue, struct sk_buff *f_frag, struct sk_buff *l_frag) +{ + struct sk_buff *pos; + struct sk_buff *new = NULL; + struct sctp_ulpevent *event; + struct sk_buff *pnext, *last; + struct sk_buff *list = skb_shinfo(f_frag)->frag_list; + + /* Store the pointer to the 2nd skb */ + if (f_frag == l_frag) + pos = NULL; + else + pos = f_frag->next; + + /* Get the last skb in the f_frag's frag_list if present. */ + for (last = list; list; last = list, list = list->next); + + /* Add the list of remaining fragments to the first fragments + * frag_list. + */ + if (last) + last->next = pos; + else { + if (skb_cloned(f_frag)) { + /* This is a cloned skb, we can't just modify + * the frag_list. We need a new skb to do that. 
+ * Instead of calling skb_unshare(), we'll do it + * ourselves since we need to delay the free. + */ + new = skb_copy(f_frag, GFP_ATOMIC); + if (!new) + return NULL; /* try again later */ + + sctp_skb_set_owner_r(new, f_frag->sk); + + skb_shinfo(new)->frag_list = pos; + } else + skb_shinfo(f_frag)->frag_list = pos; + } + + /* Remove the first fragment from the reassembly queue. */ + __skb_unlink(f_frag, queue); + + /* if we did unshare, then free the old skb and re-assign */ + if (new) { + kfree_skb(f_frag); + f_frag = new; + } + + while (pos) { + + pnext = pos->next; + + /* Update the len and data_len fields of the first fragment. */ + f_frag->len += pos->len; + f_frag->data_len += pos->len; + + /* Remove the fragment from the reassembly queue. */ + __skb_unlink(pos, queue); + + /* Break if we have reached the last fragment. */ + if (pos == l_frag) + break; + pos->next = pnext; + pos = pnext; + } + + event = sctp_skb2event(f_frag); + SCTP_INC_STATS(SCTP_MIB_REASMUSRMSGS); + + return event; +} + + +/* Helper function to check if an incoming chunk has filled up the last + * missing fragment in a SCTP datagram and return the corresponding event. + */ +static struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_ulpq *ulpq) +{ + struct sk_buff *pos; + struct sctp_ulpevent *cevent; + struct sk_buff *first_frag = NULL; + __u32 ctsn, next_tsn; + struct sctp_ulpevent *retval = NULL; + struct sk_buff *pd_first = NULL; + struct sk_buff *pd_last = NULL; + size_t pd_len = 0; + struct sctp_association *asoc; + u32 pd_point; + + /* Initialized to 0 just to avoid compiler warning message. Will + * never be used with this value. It is referenced only after it + * is set when we find the first fragment of a message. + */ + next_tsn = 0; + + /* The chunks are held in the reasm queue sorted by TSN. + * Walk through the queue sequentially and look for a sequence of + * fragmented chunks that complete a datagram. + * 'first_frag' and next_tsn are reset when we find a chunk which + * is the first fragment of a datagram. Once these 2 fields are set + * we expect to find the remaining middle fragments and the last + * fragment in order. If not, first_frag is reset to NULL and we + * start the next pass when we find another first fragment. + * + * There is a potential to do partial delivery if user sets + * SCTP_PARTIAL_DELIVERY_POINT option. Lets count some things here + * to see if can do PD. + */ + skb_queue_walk(&ulpq->reasm, pos) { + cevent = sctp_skb2event(pos); + ctsn = cevent->tsn; + + switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { + case SCTP_DATA_FIRST_FRAG: + /* If this "FIRST_FRAG" is the first + * element in the queue, then count it towards + * possible PD. + */ + if (pos == ulpq->reasm.next) { + pd_first = pos; + pd_last = pos; + pd_len = pos->len; + } else { + pd_first = NULL; + pd_last = NULL; + pd_len = 0; + } + + first_frag = pos; + next_tsn = ctsn + 1; + break; + + case SCTP_DATA_MIDDLE_FRAG: + if ((first_frag) && (ctsn == next_tsn)) { + next_tsn++; + if (pd_first) { + pd_last = pos; + pd_len += pos->len; + } + } else + first_frag = NULL; + break; + + case SCTP_DATA_LAST_FRAG: + if (first_frag && (ctsn == next_tsn)) + goto found; + else + first_frag = NULL; + break; + } + } + + asoc = ulpq->asoc; + if (pd_first) { + /* Make sure we can enter partial deliver. + * We can trigger partial delivery only if framgent + * interleave is set, or the socket is not already + * in partial delivery. 
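The pd_point and frag_interleave fields consulted just below are driven from userspace; a hypothetical sketch (not part of this patch) using the SCTP_PARTIAL_DELIVERY_POINT and SCTP_FRAGMENT_INTERLEAVE socket options. tune_partial_delivery() is a made-up name, the values are arbitrary, and error handling is omitted.

#include <stdint.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp.h>

static void tune_partial_delivery(int fd)
{
	uint32_t pd_point = 16 * 1024;	/* start partial delivery once this
					 * much of one message is queued */
	int interleave = 1;		/* let more than one association be
					 * in partial delivery at a time */

	setsockopt(fd, IPPROTO_SCTP, SCTP_PARTIAL_DELIVERY_POINT,
		   &pd_point, sizeof(pd_point));
	setsockopt(fd, IPPROTO_SCTP, SCTP_FRAGMENT_INTERLEAVE,
		   &interleave, sizeof(interleave));
}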
+ */ + if (!sctp_sk(asoc->base.sk)->frag_interleave && + atomic_read(&sctp_sk(asoc->base.sk)->pd_mode)) + goto done; + + cevent = sctp_skb2event(pd_first); + pd_point = sctp_sk(asoc->base.sk)->pd_point; + if (pd_point && pd_point <= pd_len) { + retval = sctp_make_reassembled_event(&ulpq->reasm, + pd_first, + pd_last); + if (retval) + sctp_ulpq_set_pd(ulpq); + } + } +done: + return retval; +found: + retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, pos); + if (retval) + retval->msg_flags |= MSG_EOR; + goto done; +} + +/* Retrieve the next set of fragments of a partial message. */ +static struct sctp_ulpevent *sctp_ulpq_retrieve_partial(struct sctp_ulpq *ulpq) +{ + struct sk_buff *pos, *last_frag, *first_frag; + struct sctp_ulpevent *cevent; + __u32 ctsn, next_tsn; + int is_last; + struct sctp_ulpevent *retval; + + /* The chunks are held in the reasm queue sorted by TSN. + * Walk through the queue sequentially and look for the first + * sequence of fragmented chunks. + */ + + if (skb_queue_empty(&ulpq->reasm)) + return NULL; + + last_frag = first_frag = NULL; + retval = NULL; + next_tsn = 0; + is_last = 0; + + skb_queue_walk(&ulpq->reasm, pos) { + cevent = sctp_skb2event(pos); + ctsn = cevent->tsn; + + switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { + case SCTP_DATA_MIDDLE_FRAG: + if (!first_frag) { + first_frag = pos; + next_tsn = ctsn + 1; + last_frag = pos; + } else if (next_tsn == ctsn) + next_tsn++; + else + goto done; + break; + case SCTP_DATA_LAST_FRAG: + if (!first_frag) + first_frag = pos; + else if (ctsn != next_tsn) + goto done; + last_frag = pos; + is_last = 1; + goto done; + default: + return NULL; + } + } + + /* We have the reassembled event. There is no need to look + * further. + */ +done: + retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, last_frag); + if (retval && is_last) + retval->msg_flags |= MSG_EOR; + + return retval; +} + + +/* Helper function to reassemble chunks. Hold chunks on the reasm queue that + * need reassembling. + */ +static struct sctp_ulpevent *sctp_ulpq_reasm(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sctp_ulpevent *retval = NULL; + + /* Check if this is part of a fragmented message. */ + if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) { + event->msg_flags |= MSG_EOR; + return event; + } + + sctp_ulpq_store_reasm(ulpq, event); + if (!ulpq->pd_mode) + retval = sctp_ulpq_retrieve_reassembled(ulpq); + else { + __u32 ctsn, ctsnap; + + /* Do not even bother unless this is the next tsn to + * be delivered. + */ + ctsn = event->tsn; + ctsnap = sctp_tsnmap_get_ctsn(&ulpq->asoc->peer.tsn_map); + if (TSN_lte(ctsn, ctsnap)) + retval = sctp_ulpq_retrieve_partial(ulpq); + } + + return retval; +} + +/* Retrieve the first part (sequential fragments) for partial delivery. */ +static struct sctp_ulpevent *sctp_ulpq_retrieve_first(struct sctp_ulpq *ulpq) +{ + struct sk_buff *pos, *last_frag, *first_frag; + struct sctp_ulpevent *cevent; + __u32 ctsn, next_tsn; + struct sctp_ulpevent *retval; + + /* The chunks are held in the reasm queue sorted by TSN. + * Walk through the queue sequentially and look for a sequence of + * fragmented chunks that start a datagram. 
+ */ + + if (skb_queue_empty(&ulpq->reasm)) + return NULL; + + last_frag = first_frag = NULL; + retval = NULL; + next_tsn = 0; + + skb_queue_walk(&ulpq->reasm, pos) { + cevent = sctp_skb2event(pos); + ctsn = cevent->tsn; + + switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { + case SCTP_DATA_FIRST_FRAG: + if (!first_frag) { + first_frag = pos; + next_tsn = ctsn + 1; + last_frag = pos; + } else + goto done; + break; + + case SCTP_DATA_MIDDLE_FRAG: + if (!first_frag) + return NULL; + if (ctsn == next_tsn) { + next_tsn++; + last_frag = pos; + } else + goto done; + break; + default: + return NULL; + } + } + + /* We have the reassembled event. There is no need to look + * further. + */ +done: + retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, last_frag); + return retval; +} + +/* + * Flush out stale fragments from the reassembly queue when processing + * a Forward TSN. + * + * RFC 3758, Section 3.6 + * + * After receiving and processing a FORWARD TSN, the data receiver MUST + * take cautions in updating its re-assembly queue. The receiver MUST + * remove any partially reassembled message, which is still missing one + * or more TSNs earlier than or equal to the new cumulative TSN point. + * In the event that the receiver has invoked the partial delivery API, + * a notification SHOULD also be generated to inform the upper layer API + * that the message being partially delivered will NOT be completed. + */ +void sctp_ulpq_reasm_flushtsn(struct sctp_ulpq *ulpq, __u32 fwd_tsn) +{ + struct sk_buff *pos, *tmp; + struct sctp_ulpevent *event; + __u32 tsn; + + if (skb_queue_empty(&ulpq->reasm)) + return; + + skb_queue_walk_safe(&ulpq->reasm, pos, tmp) { + event = sctp_skb2event(pos); + tsn = event->tsn; + + /* Since the entire message must be abandoned by the + * sender (item A3 in Section 3.5, RFC 3758), we can + * free all fragments on the list that are less then + * or equal to ctsn_point + */ + if (TSN_lte(tsn, fwd_tsn)) { + __skb_unlink(pos, &ulpq->reasm); + sctp_ulpevent_free(event); + } else + break; + } +} + +/* + * Drain the reassembly queue. If we just cleared parted delivery, it + * is possible that the reassembly queue will contain already reassembled + * messages. Retrieve any such messages and give them to the user. + */ +static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq) +{ + struct sctp_ulpevent *event = NULL; + struct sk_buff_head temp; + + if (skb_queue_empty(&ulpq->reasm)) + return; + + while ((event = sctp_ulpq_retrieve_reassembled(ulpq)) != NULL) { + /* Do ordering if needed. */ + if ((event) && (event->msg_flags & MSG_EOR)){ + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + + event = sctp_ulpq_order(ulpq, event); + } + + /* Send event to the ULP. 'event' is the + * sctp_ulpevent for very first SKB on the temp' list. + */ + if (event) + sctp_ulpq_tail_event(ulpq, event); + } +} + + +/* Helper function to gather skbs that have possibly become + * ordered by an an incoming chunk. + */ +static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sk_buff_head *event_list; + struct sk_buff *pos, *tmp; + struct sctp_ulpevent *cevent; + struct sctp_stream *in; + __u16 sid, csid, cssn; + + sid = event->stream; + in = &ulpq->asoc->ssnmap->in; + + event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev; + + /* We are holding the chunks by stream, by SSN. 
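As an aside, the TSN_lt()/SSN_lt()-style comparisons used throughout the reassembly queue and the lobby are wrap-safe serial-number comparisons. A minimal standalone illustration of the idea (not the kernel's exact macros):

#include <assert.h>
#include <stdint.h>

/* "a comes before b" in 32-bit serial-number arithmetic: the unsigned
 * subtraction wraps, so the sign of the difference gives the ordering.
 */
static int tsn_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

int main(void)
{
	assert(tsn_before(5, 6));		/* ordinary case */
	assert(tsn_before(0xfffffffeu, 1));	/* still "before" across the wrap */
	assert(!tsn_before(1, 0xfffffffeu));
	return 0;
}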
*/ + sctp_skb_for_each(pos, &ulpq->lobby, tmp) { + cevent = (struct sctp_ulpevent *) pos->cb; + csid = cevent->stream; + cssn = cevent->ssn; + + /* Have we gone too far? */ + if (csid > sid) + break; + + /* Have we not gone far enough? */ + if (csid < sid) + continue; + + if (cssn != sctp_ssn_peek(in, sid)) + break; + + /* Found it, so mark in the ssnmap. */ + sctp_ssn_next(in, sid); + + __skb_unlink(pos, &ulpq->lobby); + + /* Attach all gathered skbs to the event. */ + __skb_queue_tail(event_list, pos); + } +} + +/* Helper function to store chunks needing ordering. */ +static void sctp_ulpq_store_ordered(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sk_buff *pos; + struct sctp_ulpevent *cevent; + __u16 sid, csid; + __u16 ssn, cssn; + + pos = skb_peek_tail(&ulpq->lobby); + if (!pos) { + __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); + return; + } + + sid = event->stream; + ssn = event->ssn; + + cevent = (struct sctp_ulpevent *) pos->cb; + csid = cevent->stream; + cssn = cevent->ssn; + if (sid > csid) { + __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); + return; + } + + if ((sid == csid) && SSN_lt(cssn, ssn)) { + __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); + return; + } + + /* Find the right place in this list. We store them by + * stream ID and then by SSN. + */ + skb_queue_walk(&ulpq->lobby, pos) { + cevent = (struct sctp_ulpevent *) pos->cb; + csid = cevent->stream; + cssn = cevent->ssn; + + if (csid > sid) + break; + if (csid == sid && SSN_lt(ssn, cssn)) + break; + } + + + /* Insert before pos. */ + __skb_queue_before(&ulpq->lobby, pos, sctp_event2skb(event)); +} + +static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + __u16 sid, ssn; + struct sctp_stream *in; + + /* Check if this message needs ordering. */ + if (SCTP_DATA_UNORDERED & event->msg_flags) + return event; + + /* Note: The stream ID must be verified before this routine. */ + sid = event->stream; + ssn = event->ssn; + in = &ulpq->asoc->ssnmap->in; + + /* Is this the expected SSN for this stream ID? */ + if (ssn != sctp_ssn_peek(in, sid)) { + /* We've received something out of order, so find where it + * needs to be placed. We order by stream and then by SSN. + */ + sctp_ulpq_store_ordered(ulpq, event); + return NULL; + } + + /* Mark that the next chunk has been found. */ + sctp_ssn_next(in, sid); + + /* Go find any other chunks that were waiting for + * ordering. + */ + sctp_ulpq_retrieve_ordered(ulpq, event); + + return event; +} + +/* Helper function to gather skbs that have possibly become + * ordered by forward tsn skipping their dependencies. + */ +static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) +{ + struct sk_buff *pos, *tmp; + struct sctp_ulpevent *cevent; + struct sctp_ulpevent *event; + struct sctp_stream *in; + struct sk_buff_head temp; + struct sk_buff_head *lobby = &ulpq->lobby; + __u16 csid, cssn; + + in = &ulpq->asoc->ssnmap->in; + + /* We are holding the chunks by stream, by SSN. */ + skb_queue_head_init(&temp); + event = NULL; + sctp_skb_for_each(pos, lobby, tmp) { + cevent = (struct sctp_ulpevent *) pos->cb; + csid = cevent->stream; + cssn = cevent->ssn; + + /* Have we gone too far? */ + if (csid > sid) + break; + + /* Have we not gone far enough? */ + if (csid < sid) + continue; + + /* see if this ssn has been marked by skipping */ + if (!SSN_lt(cssn, sctp_ssn_peek(in, csid))) + break; + + __skb_unlink(pos, lobby); + if (!event) + /* Create a temporary list to collect chunks on. 
*/ + event = sctp_skb2event(pos); + + /* Attach all gathered skbs to the event. */ + __skb_queue_tail(&temp, pos); + } + + /* If we didn't reap any data, see if the next expected SSN + * is next on the queue and if so, use that. + */ + if (event == NULL && pos != (struct sk_buff *)lobby) { + cevent = (struct sctp_ulpevent *) pos->cb; + csid = cevent->stream; + cssn = cevent->ssn; + + if (csid == sid && cssn == sctp_ssn_peek(in, csid)) { + sctp_ssn_next(in, csid); + __skb_unlink(pos, lobby); + __skb_queue_tail(&temp, pos); + event = sctp_skb2event(pos); + } + } + + /* Send event to the ULP. 'event' is the sctp_ulpevent for + * very first SKB on the 'temp' list. + */ + if (event) { + /* see if we have more ordered that we can deliver */ + sctp_ulpq_retrieve_ordered(ulpq, event); + sctp_ulpq_tail_event(ulpq, event); + } +} + +/* Skip over an SSN. This is used during the processing of + * Forwared TSN chunk to skip over the abandoned ordered data + */ +void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn) +{ + struct sctp_stream *in; + + /* Note: The stream ID must be verified before this routine. */ + in = &ulpq->asoc->ssnmap->in; + + /* Is this an old SSN? If so ignore. */ + if (SSN_lt(ssn, sctp_ssn_peek(in, sid))) + return; + + /* Mark that we are no longer expecting this SSN or lower. */ + sctp_ssn_skip(in, sid, ssn); + + /* Go find any other chunks that were waiting for + * ordering and deliver them if needed. + */ + sctp_ulpq_reap_ordered(ulpq, sid); +} + +static __u16 sctp_ulpq_renege_list(struct sctp_ulpq *ulpq, + struct sk_buff_head *list, __u16 needed) +{ + __u16 freed = 0; + __u32 tsn; + struct sk_buff *skb; + struct sctp_ulpevent *event; + struct sctp_tsnmap *tsnmap; + + tsnmap = &ulpq->asoc->peer.tsn_map; + + while ((skb = __skb_dequeue_tail(list)) != NULL) { + freed += skb_headlen(skb); + event = sctp_skb2event(skb); + tsn = event->tsn; + + sctp_ulpevent_free(event); + sctp_tsnmap_renege(tsnmap, tsn); + if (freed >= needed) + return freed; + } + + return freed; +} + +/* Renege 'needed' bytes from the ordering queue. */ +static __u16 sctp_ulpq_renege_order(struct sctp_ulpq *ulpq, __u16 needed) +{ + return sctp_ulpq_renege_list(ulpq, &ulpq->lobby, needed); +} + +/* Renege 'needed' bytes from the reassembly queue. */ +static __u16 sctp_ulpq_renege_frags(struct sctp_ulpq *ulpq, __u16 needed) +{ + return sctp_ulpq_renege_list(ulpq, &ulpq->reasm, needed); +} + +/* Partial deliver the first message as there is pressure on rwnd. */ +void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq, + struct sctp_chunk *chunk, + gfp_t gfp) +{ + struct sctp_ulpevent *event; + struct sctp_association *asoc; + struct sctp_sock *sp; + + asoc = ulpq->asoc; + sp = sctp_sk(asoc->base.sk); + + /* If the association is already in Partial Delivery mode + * we have noting to do. + */ + if (ulpq->pd_mode) + return; + + /* If the user enabled fragment interleave socket option, + * multiple associations can enter partial delivery. + * Otherwise, we can only enter partial delivery if the + * socket is not in partial deliver mode. + */ + if (sp->frag_interleave || atomic_read(&sp->pd_mode) == 0) { + /* Is partial delivery possible? */ + event = sctp_ulpq_retrieve_first(ulpq); + /* Send event to the ULP. */ + if (event) { + sctp_ulpq_tail_event(ulpq, event); + sctp_ulpq_set_pd(ulpq); + return; + } + } +} + +/* Renege some packets to make room for an incoming chunk. 
*/ +void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, + gfp_t gfp) +{ + struct sctp_association *asoc; + __u16 needed, freed; + + asoc = ulpq->asoc; + + if (chunk) { + needed = ntohs(chunk->chunk_hdr->length); + needed -= sizeof(sctp_data_chunk_t); + } else + needed = SCTP_DEFAULT_MAXWINDOW; + + freed = 0; + + if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) { + freed = sctp_ulpq_renege_order(ulpq, needed); + if (freed < needed) { + freed += sctp_ulpq_renege_frags(ulpq, needed - freed); + } + } + /* If able to free enough room, accept this chunk. */ + if (chunk && (freed >= needed)) { + __u32 tsn; + tsn = ntohl(chunk->subh.data_hdr->tsn); + sctp_tsnmap_mark(&asoc->peer.tsn_map, tsn); + sctp_ulpq_tail_data(ulpq, chunk, gfp); + + sctp_ulpq_partial_delivery(ulpq, chunk, gfp); + } + + sk_mem_reclaim(asoc->base.sk); +} + + + +/* Notify the application if an association is aborted and in + * partial delivery mode. Send up any pending received messages. + */ +void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) +{ + struct sctp_ulpevent *ev = NULL; + struct sock *sk; + + if (!ulpq->pd_mode) + return; + + sk = ulpq->asoc->base.sk; + if (sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT, + &sctp_sk(sk)->subscribe)) + ev = sctp_ulpevent_make_pdapi(ulpq->asoc, + SCTP_PARTIAL_DELIVERY_ABORTED, + gfp); + if (ev) + __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev)); + + /* If there is data waiting, send it up the socket now. */ + if (sctp_ulpq_clear_pd(ulpq) || ev) + sk->sk_data_ready(sk, 0); +} diff --git a/net/socket.c b/net/socket.c new file mode 100644 index 00000000..cf41afcc --- /dev/null +++ b/net/socket.c @@ -0,0 +1,3384 @@ +/* + * NET An implementation of the SOCKET network access protocol. + * + * Version: @(#)socket.c 1.1.93 18/02/95 + * + * Authors: Orest Zborowski, + * Ross Biro + * Fred N. van Kempen, + * + * Fixes: + * Anonymous : NOTSOCK/BADF cleanup. Error fix in + * shutdown() + * Alan Cox : verify_area() fixes + * Alan Cox : Removed DDI + * Jonathan Kamens : SOCK_DGRAM reconnect bug + * Alan Cox : Moved a load of checks to the very + * top level. + * Alan Cox : Move address structures to/from user + * mode above the protocol layers. + * Rob Janssen : Allow 0 length sends. + * Alan Cox : Asynchronous I/O support (cribbed from the + * tty drivers). + * Niibe Yutaka : Asynchronous I/O for writes (4.4BSD style) + * Jeff Uphoff : Made max number of sockets command-line + * configurable. + * Matti Aarnio : Made the number of sockets dynamic, + * to be allocated when needed, and mr. + * Uphoff's max is used as max to be + * allowed to allocate. + * Linus : Argh. removed all the socket allocation + * altogether: it's in the inode now. + * Alan Cox : Made sock_alloc()/sock_release() public + * for NetROM and future kernel nfsd type + * stuff. + * Alan Cox : sendmsg/recvmsg basics. + * Tom Dyas : Export net symbols. + * Marcin Dalecki : Fixed problems with CONFIG_NET="n". + * Alan Cox : Added thread locking to sys_* calls + * for sockets. May have errors at the + * moment. + * Kevin Buhr : Fixed the dumb errors in the above. + * Andi Kleen : Some small cleanups, optimizations, + * and fixed a copy_from_user() bug. 
+ * Tigran Aivazian : sys_send(args) calls sys_sendto(args, NULL, 0) + * Tigran Aivazian : Made listen(2) backlog sanity checks + * protocol-independent + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * + * This module is effectively the top level interface to the BSD socket + * paradigm. + * + * Based upon Swansea University Computer Society NET3.039 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +static int sock_no_open(struct inode *irrelevant, struct file *dontcare); +static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); +static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); +static int sock_mmap(struct file *file, struct vm_area_struct *vma); + +static int sock_close(struct inode *inode, struct file *file); +static unsigned int sock_poll(struct file *file, + struct poll_table_struct *wait); +static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +#ifdef CONFIG_COMPAT +static long compat_sock_ioctl(struct file *file, + unsigned int cmd, unsigned long arg); +#endif +static int sock_fasync(int fd, struct file *filp, int on); +static ssize_t sock_sendpage(struct file *file, struct page *page, + int offset, size_t size, loff_t *ppos, int more); +static ssize_t sock_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); + +/* + * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear + * in the operation structures but are done directly via the socketcall() multiplexor. + */ + +static const struct file_operations socket_file_ops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .aio_read = sock_aio_read, + .aio_write = sock_aio_write, + .poll = sock_poll, + .unlocked_ioctl = sock_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_sock_ioctl, +#endif + .mmap = sock_mmap, + .open = sock_no_open, /* special open code to disallow open via /proc */ + .release = sock_close, + .fasync = sock_fasync, + .sendpage = sock_sendpage, + .splice_write = generic_splice_sendpage, + .splice_read = sock_splice_read, +}; + +/* + * The protocol list. Each protocol is registered in here. + */ + +static DEFINE_SPINLOCK(net_family_lock); +static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly; + +/* + * Statistics counters of the socket lists + */ + +static DEFINE_PER_CPU(int, sockets_in_use); + +/* + * Support routines. + * Move socket addresses back and forth across the kernel/user + * divide and look after the messy bits. + */ + +/** + * move_addr_to_kernel - copy a socket address into kernel space + * @uaddr: Address in user space + * @kaddr: Address in kernel space + * @ulen: Length in user space + * + * The address is copied into kernel space. If the provided address is + * too long an error code of -EINVAL is returned. 
If the copy gives + * invalid addresses -EFAULT is returned. On a success 0 is returned. + */ + +int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr) +{ + if (ulen < 0 || ulen > sizeof(struct sockaddr_storage)) + return -EINVAL; + if (ulen == 0) + return 0; + if (copy_from_user(kaddr, uaddr, ulen)) + return -EFAULT; + return audit_sockaddr(ulen, kaddr); +} + +/** + * move_addr_to_user - copy an address to user space + * @kaddr: kernel space address + * @klen: length of address in kernel + * @uaddr: user space address + * @ulen: pointer to user length field + * + * The value pointed to by ulen on entry is the buffer length available. + * This is overwritten with the buffer space used. -EINVAL is returned + * if an overlong buffer is specified or a negative buffer size. -EFAULT + * is returned if either the buffer or the length field are not + * accessible. + * After copying the data up to the limit the user specifies, the true + * length of the data is written over the length limit the user + * specified. Zero is returned for a success. + */ + +static int move_addr_to_user(struct sockaddr *kaddr, int klen, + void __user *uaddr, int __user *ulen) +{ + int err; + int len; + + err = get_user(len, ulen); + if (err) + return err; + if (len > klen) + len = klen; + if (len < 0 || len > sizeof(struct sockaddr_storage)) + return -EINVAL; + if (len) { + if (audit_sockaddr(klen, kaddr)) + return -ENOMEM; + if (copy_to_user(uaddr, kaddr, len)) + return -EFAULT; + } + /* + * "fromlen shall refer to the value before truncation.." + * 1003.1g + */ + return __put_user(klen, ulen); +} + +static struct kmem_cache *sock_inode_cachep __read_mostly; + +static struct inode *sock_alloc_inode(struct super_block *sb) +{ + struct socket_alloc *ei; + struct socket_wq *wq; + + ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + wq = kmalloc(sizeof(*wq), GFP_KERNEL); + if (!wq) { + kmem_cache_free(sock_inode_cachep, ei); + return NULL; + } + init_waitqueue_head(&wq->wait); + wq->fasync_list = NULL; + RCU_INIT_POINTER(ei->socket.wq, wq); + + ei->socket.state = SS_UNCONNECTED; + ei->socket.flags = 0; + ei->socket.ops = NULL; + ei->socket.sk = NULL; + ei->socket.file = NULL; + + return &ei->vfs_inode; +} + +static void sock_destroy_inode(struct inode *inode) +{ + struct socket_alloc *ei; + struct socket_wq *wq; + + ei = container_of(inode, struct socket_alloc, vfs_inode); + wq = rcu_dereference_protected(ei->socket.wq, 1); + kfree_rcu(wq, rcu); + kmem_cache_free(sock_inode_cachep, ei); +} + +static void init_once(void *foo) +{ + struct socket_alloc *ei = (struct socket_alloc *)foo; + + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + sock_inode_cachep = kmem_cache_create("sock_inode_cache", + sizeof(struct socket_alloc), + 0, + (SLAB_HWCACHE_ALIGN | + SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD), + init_once); + if (sock_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static const struct super_operations sockfs_ops = { + .alloc_inode = sock_alloc_inode, + .destroy_inode = sock_destroy_inode, + .statfs = simple_statfs, +}; + +/* + * sockfs_dname() is called from d_path(). 
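Looking back at move_addr_to_user() above, a hypothetical userspace sketch (not part of this patch) of the 1003.1g rule it implements: at most the caller's buffer length is copied out, but the length written back is the untruncated one. show_truncation() is a made-up name, fd is assumed to be a bound IPv6 socket, and error handling is omitted.

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>

static void show_truncation(int fd)
{
	char small[4];			/* deliberately undersized */
	socklen_t len = sizeof(small);

	/* Only 4 bytes of the address are copied, yet len comes back as
	 * the real size (sizeof(struct sockaddr_in6) here), so the caller
	 * can detect the truncation and retry with a bigger buffer.
	 */
	getsockname(fd, (struct sockaddr *)small, &len);
	printf("gave %zu bytes, address needs %u\n",
	       sizeof(small), (unsigned int)len);
}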
+ */ +static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen) +{ + return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]", + dentry->d_inode->i_ino); +} + +static const struct dentry_operations sockfs_dentry_operations = { + .d_dname = sockfs_dname, +}; + +static struct dentry *sockfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_pseudo(fs_type, "socket:", &sockfs_ops, + &sockfs_dentry_operations, SOCKFS_MAGIC); +} + +static struct vfsmount *sock_mnt __read_mostly; + +static struct file_system_type sock_fs_type = { + .name = "sockfs", + .mount = sockfs_mount, + .kill_sb = kill_anon_super, +}; + +/* + * Obtains the first available file descriptor and sets it up for use. + * + * These functions create file structures and maps them to fd space + * of the current process. On success it returns file descriptor + * and file struct implicitly stored in sock->file. + * Note that another thread may close file descriptor before we return + * from this function. We use the fact that now we do not refer + * to socket after mapping. If one day we will need it, this + * function will increment ref. count on file by 1. + * + * In any case returned fd MAY BE not valid! + * This race condition is unavoidable + * with shared fd spaces, we cannot solve it inside kernel, + * but we take care of internal coherence yet. + */ + +static int sock_alloc_file(struct socket *sock, struct file **f, int flags) +{ + struct qstr name = { .name = "" }; + struct path path; + struct file *file; + int fd; + + fd = get_unused_fd_flags(flags); + if (unlikely(fd < 0)) + return fd; + + path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name); + if (unlikely(!path.dentry)) { + put_unused_fd(fd); + return -ENOMEM; + } + path.mnt = mntget(sock_mnt); + + d_instantiate(path.dentry, SOCK_INODE(sock)); + SOCK_INODE(sock)->i_fop = &socket_file_ops; + + file = alloc_file(&path, FMODE_READ | FMODE_WRITE, + &socket_file_ops); + if (unlikely(!file)) { + /* drop dentry, keep inode */ + ihold(path.dentry->d_inode); + path_put(&path); + put_unused_fd(fd); + return -ENFILE; + } + + sock->file = file; + file->f_flags = O_RDWR | (flags & O_NONBLOCK); + file->f_pos = 0; + file->private_data = sock; + + *f = file; + return fd; +} + +int sock_map_fd(struct socket *sock, int flags) +{ + struct file *newfile; + int fd = sock_alloc_file(sock, &newfile, flags); + + if (likely(fd >= 0)) + fd_install(fd, newfile); + + return fd; +} +EXPORT_SYMBOL(sock_map_fd); + +static struct socket *sock_from_file(struct file *file, int *err) +{ + if (file->f_op == &socket_file_ops) + return file->private_data; /* set in sock_map_fd */ + + *err = -ENOTSOCK; + return NULL; +} + +/** + * sockfd_lookup - Go from a file number to its socket slot + * @fd: file handle + * @err: pointer to an error code return + * + * The file handle passed in is locked and the socket it is bound + * too is returned. If an error occurs the err pointer is overwritten + * with a negative errno code and NULL is returned. The function checks + * for both invalid handles and passing a handle which is not a socket. + * + * On a success the socket object pointer is returned. 
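+ *
+ * A minimal usage sketch (illustrative only); the reference taken on the
+ * file must be dropped by the caller, typically via sockfd_put():
+ *
+ *	int err;
+ *	struct socket *sock = sockfd_lookup(fd, &err);
+ *
+ *	if (!sock)
+ *		return err;
+ *	... use sock->ops ...
+ *	sockfd_put(sock);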
+ */ + +struct socket *sockfd_lookup(int fd, int *err) +{ + struct file *file; + struct socket *sock; + + file = fget(fd); + if (!file) { + *err = -EBADF; + return NULL; + } + + sock = sock_from_file(file, err); + if (!sock) + fput(file); + return sock; +} +EXPORT_SYMBOL(sockfd_lookup); + +static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed) +{ + struct file *file; + struct socket *sock; + + *err = -EBADF; + file = fget_light(fd, fput_needed); + if (file) { + sock = sock_from_file(file, err); + if (sock) + return sock; + fput_light(file, *fput_needed); + } + return NULL; +} + +/** + * sock_alloc - allocate a socket + * + * Allocate a new inode and socket object. The two are bound together + * and initialised. The socket is then returned. If we are out of inodes + * NULL is returned. + */ + +static struct socket *sock_alloc(void) +{ + struct inode *inode; + struct socket *sock; + + inode = new_inode(sock_mnt->mnt_sb); + if (!inode) + return NULL; + + sock = SOCKET_I(inode); + + kmemcheck_annotate_bitfield(sock, type); + inode->i_ino = get_next_ino(); + inode->i_mode = S_IFSOCK | S_IRWXUGO; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + + percpu_add(sockets_in_use, 1); + return sock; +} + +/* + * In theory you can't get an open on this inode, but /proc provides + * a back door. Remember to keep it shut otherwise you'll let the + * creepy crawlies in. + */ + +static int sock_no_open(struct inode *irrelevant, struct file *dontcare) +{ + return -ENXIO; +} + +const struct file_operations bad_sock_fops = { + .owner = THIS_MODULE, + .open = sock_no_open, + .llseek = noop_llseek, +}; + +/** + * sock_release - close a socket + * @sock: socket to close + * + * The socket is released from the protocol stack if it has a release + * callback, and the inode is then released if the socket is bound to + * an inode not a file. 
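+ *
+ * In-kernel users normally pair this with socket creation, e.g. a rough
+ * sketch (error handling omitted):
+ *
+ *	struct socket *sock;
+ *
+ *	if (!sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) {
+ *		... use the socket ...
+ *		sock_release(sock);
+ *	}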
+ */ + +void sock_release(struct socket *sock) +{ + if (sock->ops) { + struct module *owner = sock->ops->owner; + + sock->ops->release(sock); + sock->ops = NULL; + module_put(owner); + } + + if (rcu_dereference_protected(sock->wq, 1)->fasync_list) + printk(KERN_ERR "sock_release: fasync list not empty!\n"); + + percpu_sub(sockets_in_use, 1); + if (!sock->file) { + iput(SOCK_INODE(sock)); + return; + } + sock->file = NULL; +} +EXPORT_SYMBOL(sock_release); + +int sock_tx_timestamp(struct sock *sk, __u8 *tx_flags) +{ + *tx_flags = 0; + if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE)) + *tx_flags |= SKBTX_HW_TSTAMP; + if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE)) + *tx_flags |= SKBTX_SW_TSTAMP; + return 0; +} +EXPORT_SYMBOL(sock_tx_timestamp); + +static inline int __sock_sendmsg_nosec(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size) +{ + struct sock_iocb *si = kiocb_to_siocb(iocb); + + sock_update_classid(sock->sk); + + si->sock = sock; + si->scm = NULL; + si->msg = msg; + si->size = size; + + return sock->ops->sendmsg(iocb, sock, msg, size); +} + +static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size) +{ + int err = security_socket_sendmsg(sock, msg, size); + + return err ?: __sock_sendmsg_nosec(iocb, sock, msg, size); +} + +int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) +{ + struct kiocb iocb; + struct sock_iocb siocb; + int ret; + + init_sync_kiocb(&iocb, NULL); + iocb.private = &siocb; + ret = __sock_sendmsg(&iocb, sock, msg, size); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&iocb); + return ret; +} +EXPORT_SYMBOL(sock_sendmsg); + +int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg, size_t size) +{ + struct kiocb iocb; + struct sock_iocb siocb; + int ret; + + init_sync_kiocb(&iocb, NULL); + iocb.private = &siocb; + ret = __sock_sendmsg_nosec(&iocb, sock, msg, size); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&iocb); + return ret; +} + +int kernel_sendmsg(struct socket *sock, struct msghdr *msg, + struct kvec *vec, size_t num, size_t size) +{ + mm_segment_t oldfs = get_fs(); + int result; + + set_fs(KERNEL_DS); + /* + * the following is safe, since for compiler definitions of kvec and + * iovec are identical, yielding the same in-core layout and alignment + */ + msg->msg_iov = (struct iovec *)vec; + msg->msg_iovlen = num; + result = sock_sendmsg(sock, msg, size); + set_fs(oldfs); + return result; +} +EXPORT_SYMBOL(kernel_sendmsg); + +static int ktime2ts(ktime_t kt, struct timespec *ts) +{ + if (kt.tv64) { + *ts = ktime_to_timespec(kt); + return 1; + } else { + return 0; + } +} + +/* + * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP) + */ +void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb) +{ + int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP); + struct timespec ts[3]; + int empty = 1; + struct skb_shared_hwtstamps *shhwtstamps = + skb_hwtstamps(skb); + + /* Race occurred between timestamp enabling and packet + receiving. Fill in the current time for now. 
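+	   (need_software_tstamp above reflects SOCK_RCVTSTAMP, which the
+	   receiver will have requested with something like
+	   setsockopt(fd, SOL_SOCKET, SO_TIMESTAMP, &on, sizeof(on)),
+	   or SO_TIMESTAMPNS for the timespec variant handled below.)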
*/ + if (need_software_tstamp && skb->tstamp.tv64 == 0) + __net_timestamp(skb); + + if (need_software_tstamp) { + if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) { + struct timeval tv; + skb_get_timestamp(skb, &tv); + put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, + sizeof(tv), &tv); + } else { + skb_get_timestampns(skb, &ts[0]); + put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS, + sizeof(ts[0]), &ts[0]); + } + } + + + memset(ts, 0, sizeof(ts)); + if (skb->tstamp.tv64 && + sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) { + skb_get_timestampns(skb, ts + 0); + empty = 0; + } + if (shhwtstamps) { + if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE) && + ktime2ts(shhwtstamps->syststamp, ts + 1)) + empty = 0; + if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE) && + ktime2ts(shhwtstamps->hwtstamp, ts + 2)) + empty = 0; + } + if (!empty) + put_cmsg(msg, SOL_SOCKET, + SCM_TIMESTAMPING, sizeof(ts), &ts); +} +EXPORT_SYMBOL_GPL(__sock_recv_timestamp); + +static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb) +{ + if (sock_flag(sk, SOCK_RXQ_OVFL) && skb && skb->dropcount) + put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL, + sizeof(__u32), &skb->dropcount); +} + +void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb) +{ + sock_recv_timestamp(msg, sk, skb); + sock_recv_drops(msg, sk, skb); +} +EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops); + +static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock_iocb *si = kiocb_to_siocb(iocb); + + sock_update_classid(sock->sk); + + si->sock = sock; + si->scm = NULL; + si->msg = msg; + si->size = size; + si->flags = flags; + + return sock->ops->recvmsg(iocb, sock, msg, size, flags); +} + +static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + int err = security_socket_recvmsg(sock, msg, size, flags); + + return err ?: __sock_recvmsg_nosec(iocb, sock, msg, size, flags); +} + +int sock_recvmsg(struct socket *sock, struct msghdr *msg, + size_t size, int flags) +{ + struct kiocb iocb; + struct sock_iocb siocb; + int ret; + + init_sync_kiocb(&iocb, NULL); + iocb.private = &siocb; + ret = __sock_recvmsg(&iocb, sock, msg, size, flags); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&iocb); + return ret; +} +EXPORT_SYMBOL(sock_recvmsg); + +static int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, + size_t size, int flags) +{ + struct kiocb iocb; + struct sock_iocb siocb; + int ret; + + init_sync_kiocb(&iocb, NULL); + iocb.private = &siocb; + ret = __sock_recvmsg_nosec(&iocb, sock, msg, size, flags); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&iocb); + return ret; +} + +/** + * kernel_recvmsg - Receive a message from a socket (kernel space) + * @sock: The socket to receive the message from + * @msg: Received message + * @vec: Input s/g array for message data + * @num: Size of input s/g array + * @size: Number of bytes to read + * @flags: Message flags (MSG_DONTWAIT, etc...) + * + * On return the msg structure contains the scatter/gather array passed in the + * vec argument. The array is modified so that it consists of the unfilled + * portion of the original array. + * + * The returned value is the total number of bytes received, or an error. 
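+ *
+ * A minimal sketch of in-kernel use (illustrative; buf and buflen stand in
+ * for a caller-provided kernel buffer and its size):
+ *
+ *	struct kvec vec = { .iov_base = buf, .iov_len = buflen };
+ *	struct msghdr msg = { .msg_flags = 0 };
+ *	int n = kernel_recvmsg(sock, &msg, &vec, 1, buflen, MSG_DONTWAIT);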
+ */ +int kernel_recvmsg(struct socket *sock, struct msghdr *msg, + struct kvec *vec, size_t num, size_t size, int flags) +{ + mm_segment_t oldfs = get_fs(); + int result; + + set_fs(KERNEL_DS); + /* + * the following is safe, since for compiler definitions of kvec and + * iovec are identical, yielding the same in-core layout and alignment + */ + msg->msg_iov = (struct iovec *)vec, msg->msg_iovlen = num; + result = sock_recvmsg(sock, msg, size, flags); + set_fs(oldfs); + return result; +} +EXPORT_SYMBOL(kernel_recvmsg); + +static void sock_aio_dtor(struct kiocb *iocb) +{ + kfree(iocb->private); +} + +static ssize_t sock_sendpage(struct file *file, struct page *page, + int offset, size_t size, loff_t *ppos, int more) +{ + struct socket *sock; + int flags; + + sock = file->private_data; + + flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; + /* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */ + flags |= more; + + return kernel_sendpage(sock, page, offset, size, flags); +} + +static ssize_t sock_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct socket *sock = file->private_data; + + if (unlikely(!sock->ops->splice_read)) + return -EINVAL; + + sock_update_classid(sock->sk); + + return sock->ops->splice_read(sock, ppos, pipe, len, flags); +} + +static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb, + struct sock_iocb *siocb) +{ + if (!is_sync_kiocb(iocb)) { + siocb = kmalloc(sizeof(*siocb), GFP_KERNEL); + if (!siocb) + return NULL; + iocb->ki_dtor = sock_aio_dtor; + } + + siocb->kiocb = iocb; + iocb->private = siocb; + return siocb; +} + +static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb, + struct file *file, const struct iovec *iov, + unsigned long nr_segs) +{ + struct socket *sock = file->private_data; + size_t size = 0; + int i; + + for (i = 0; i < nr_segs; i++) + size += iov[i].iov_len; + + msg->msg_name = NULL; + msg->msg_namelen = 0; + msg->msg_control = NULL; + msg->msg_controllen = 0; + msg->msg_iov = (struct iovec *)iov; + msg->msg_iovlen = nr_segs; + msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; + + return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags); +} + +static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct sock_iocb siocb, *x; + + if (pos != 0) + return -ESPIPE; + + if (iocb->ki_left == 0) /* Match SYS5 behaviour */ + return 0; + + + x = alloc_sock_iocb(iocb, &siocb); + if (!x) + return -ENOMEM; + return do_sock_read(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs); +} + +static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb, + struct file *file, const struct iovec *iov, + unsigned long nr_segs) +{ + struct socket *sock = file->private_data; + size_t size = 0; + int i; + + for (i = 0; i < nr_segs; i++) + size += iov[i].iov_len; + + msg->msg_name = NULL; + msg->msg_namelen = 0; + msg->msg_control = NULL; + msg->msg_controllen = 0; + msg->msg_iov = (struct iovec *)iov; + msg->msg_iovlen = nr_segs; + msg->msg_flags = (file->f_flags & O_NONBLOCK) ? 
MSG_DONTWAIT : 0; + if (sock->type == SOCK_SEQPACKET) + msg->msg_flags |= MSG_EOR; + + return __sock_sendmsg(iocb, sock, msg, size); +} + +static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct sock_iocb siocb, *x; + + if (pos != 0) + return -ESPIPE; + + x = alloc_sock_iocb(iocb, &siocb); + if (!x) + return -ENOMEM; + + return do_sock_write(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs); +} + +/* + * Atomic setting of ioctl hooks to avoid race + * with module unload. + */ + +static DEFINE_MUTEX(br_ioctl_mutex); +static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg); + +void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *)) +{ + mutex_lock(&br_ioctl_mutex); + br_ioctl_hook = hook; + mutex_unlock(&br_ioctl_mutex); +} +EXPORT_SYMBOL(brioctl_set); + +static DEFINE_MUTEX(vlan_ioctl_mutex); +static int (*vlan_ioctl_hook) (struct net *, void __user *arg); + +void vlan_ioctl_set(int (*hook) (struct net *, void __user *)) +{ + mutex_lock(&vlan_ioctl_mutex); + vlan_ioctl_hook = hook; + mutex_unlock(&vlan_ioctl_mutex); +} +EXPORT_SYMBOL(vlan_ioctl_set); + +static DEFINE_MUTEX(dlci_ioctl_mutex); +static int (*dlci_ioctl_hook) (unsigned int, void __user *); + +void dlci_ioctl_set(int (*hook) (unsigned int, void __user *)) +{ + mutex_lock(&dlci_ioctl_mutex); + dlci_ioctl_hook = hook; + mutex_unlock(&dlci_ioctl_mutex); +} +EXPORT_SYMBOL(dlci_ioctl_set); + +static long sock_do_ioctl(struct net *net, struct socket *sock, + unsigned int cmd, unsigned long arg) +{ + int err; + void __user *argp = (void __user *)arg; + + err = sock->ops->ioctl(sock, cmd, arg); + + /* + * If this ioctl is unknown try to hand it down + * to the NIC driver. + */ + if (err == -ENOIOCTLCMD) + err = dev_ioctl(net, cmd, argp); + + return err; +} + +/* + * With an ioctl, arg may well be a user mode pointer, but we don't know + * what to do with it - that's up to the protocol still. 
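+ *
+ * The bridge/VLAN/DLCI cases below are routed through the hook pointers
+ * registered with brioctl_set() and friends above: the corresponding module
+ * installs its handler on load and clears it again (hook = NULL) on unload,
+ * which is why each hook is re-tested under its mutex after the
+ * request_module() attempt.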
+ */ + +static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + struct socket *sock; + struct sock *sk; + void __user *argp = (void __user *)arg; + int pid, err; + struct net *net; + + sock = file->private_data; + sk = sock->sk; + net = sock_net(sk); + if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) { + err = dev_ioctl(net, cmd, argp); + } else +#ifdef CONFIG_WEXT_CORE + if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { + err = dev_ioctl(net, cmd, argp); + } else +#endif + switch (cmd) { + case FIOSETOWN: + case SIOCSPGRP: + err = -EFAULT; + if (get_user(pid, (int __user *)argp)) + break; + err = f_setown(sock->file, pid, 1); + break; + case FIOGETOWN: + case SIOCGPGRP: + err = put_user(f_getown(sock->file), + (int __user *)argp); + break; + case SIOCGIFBR: + case SIOCSIFBR: + case SIOCBRADDBR: + case SIOCBRDELBR: + err = -ENOPKG; + if (!br_ioctl_hook) + request_module("bridge"); + + mutex_lock(&br_ioctl_mutex); + if (br_ioctl_hook) + err = br_ioctl_hook(net, cmd, argp); + mutex_unlock(&br_ioctl_mutex); + break; + case SIOCGIFVLAN: + case SIOCSIFVLAN: + err = -ENOPKG; + if (!vlan_ioctl_hook) + request_module("8021q"); + + mutex_lock(&vlan_ioctl_mutex); + if (vlan_ioctl_hook) + err = vlan_ioctl_hook(net, argp); + mutex_unlock(&vlan_ioctl_mutex); + break; + case SIOCADDDLCI: + case SIOCDELDLCI: + err = -ENOPKG; + if (!dlci_ioctl_hook) + request_module("dlci"); + + mutex_lock(&dlci_ioctl_mutex); + if (dlci_ioctl_hook) + err = dlci_ioctl_hook(cmd, argp); + mutex_unlock(&dlci_ioctl_mutex); + break; + default: + err = sock_do_ioctl(net, sock, cmd, arg); + break; + } + return err; +} + +int sock_create_lite(int family, int type, int protocol, struct socket **res) +{ + int err; + struct socket *sock = NULL; + + err = security_socket_create(family, type, protocol, 1); + if (err) + goto out; + + sock = sock_alloc(); + if (!sock) { + err = -ENOMEM; + goto out; + } + + sock->type = type; + err = security_socket_post_create(sock, family, type, protocol, 1); + if (err) + goto out_release; + +out: + *res = sock; + return err; +out_release: + sock_release(sock); + sock = NULL; + goto out; +} +EXPORT_SYMBOL(sock_create_lite); + +/* No kernel lock held - perfect */ +static unsigned int sock_poll(struct file *file, poll_table *wait) +{ + struct socket *sock; + + /* + * We can't return errors to poll, so it's either yes or no. + */ + sock = file->private_data; + return sock->ops->poll(file, sock, wait); +} + +static int sock_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct socket *sock = file->private_data; + + return sock->ops->mmap(file, sock, vma); +} + +static int sock_close(struct inode *inode, struct file *filp) +{ + /* + * It was possible the inode is NULL we were + * closing an unfinished socket. + */ + + if (!inode) { + printk(KERN_DEBUG "sock_close: NULL inode\n"); + return 0; + } + sock_release(SOCKET_I(inode)); + return 0; +} + +/* + * Update the socket async list + * + * Fasync_list locking strategy. + * + * 1. fasync_list is modified only under process context socket lock + * i.e. under semaphore. + * 2. 
fasync_list is used under read_lock(&sk->sk_callback_lock) + * or under socket lock + */ + +static int sock_fasync(int fd, struct file *filp, int on) +{ + struct socket *sock = filp->private_data; + struct sock *sk = sock->sk; + struct socket_wq *wq; + + if (sk == NULL) + return -EINVAL; + + lock_sock(sk); + wq = rcu_dereference_protected(sock->wq, sock_owned_by_user(sk)); + fasync_helper(fd, filp, on, &wq->fasync_list); + + if (!wq->fasync_list) + sock_reset_flag(sk, SOCK_FASYNC); + else + sock_set_flag(sk, SOCK_FASYNC); + + release_sock(sk); + return 0; +} + +/* This function may be called only under socket lock or callback_lock or rcu_lock */ + +int sock_wake_async(struct socket *sock, int how, int band) +{ + struct socket_wq *wq; + + if (!sock) + return -1; + rcu_read_lock(); + wq = rcu_dereference(sock->wq); + if (!wq || !wq->fasync_list) { + rcu_read_unlock(); + return -1; + } + switch (how) { + case SOCK_WAKE_WAITD: + if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags)) + break; + goto call_kill; + case SOCK_WAKE_SPACE: + if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags)) + break; + /* fall through */ + case SOCK_WAKE_IO: +call_kill: + kill_fasync(&wq->fasync_list, SIGIO, band); + break; + case SOCK_WAKE_URG: + kill_fasync(&wq->fasync_list, SIGURG, band); + } + rcu_read_unlock(); + return 0; +} +EXPORT_SYMBOL(sock_wake_async); + +int __sock_create(struct net *net, int family, int type, int protocol, + struct socket **res, int kern) +{ + int err; + struct socket *sock; + const struct net_proto_family *pf; + + /* + * Check protocol is in range + */ + if (family < 0 || family >= NPROTO) + return -EAFNOSUPPORT; + if (type < 0 || type >= SOCK_MAX) + return -EINVAL; + + /* Compatibility. + + This uglymoron is moved from INET layer to here to avoid + deadlock in module load. + */ + if (family == PF_INET && type == SOCK_PACKET) { + static int warned; + if (!warned) { + warned = 1; + printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n", + current->comm); + } + family = PF_PACKET; + } + + err = security_socket_create(family, type, protocol, kern); + if (err) + return err; + + /* + * Allocate the socket and allow the family to set things up. if + * the protocol is 0, the family is instructed to select an appropriate + * default. + */ + sock = sock_alloc(); + if (!sock) { + if (net_ratelimit()) + printk(KERN_WARNING "socket: no more sockets\n"); + return -ENFILE; /* Not exactly a match, but its the + closest posix thing */ + } + + sock->type = type; + +#ifdef CONFIG_MODULES + /* Attempt to load a protocol module if the find failed. + * + * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user + * requested real, full-featured networking support upon configuration. + * Otherwise module support will break! + */ + if (rcu_access_pointer(net_families[family]) == NULL) + request_module("net-pf-%d", family); +#endif + + rcu_read_lock(); + pf = rcu_dereference(net_families[family]); + err = -EAFNOSUPPORT; + if (!pf) + goto out_release; + + /* + * We will call the ->create function, that possibly is in a loadable + * module, so we have to bump that loadable module refcnt first. + */ + if (!try_module_get(pf->owner)) + goto out_release; + + /* Now protected by module ref count */ + rcu_read_unlock(); + + err = pf->create(net, sock, protocol, kern); + if (err < 0) + goto out_module_put; + + /* + * Now to bump the refcnt of the [loadable] module that owns this + * socket at sock_release time we decrement its refcnt. 
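+	 * In other words, pf->owner is pinned only for the duration of the
+	 * ->create() call above, while sock->ops->owner stays pinned until
+	 * sock_release() drops it with module_put().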
+ */ + if (!try_module_get(sock->ops->owner)) + goto out_module_busy; + + /* + * Now that we're done with the ->create function, the [loadable] + * module can have its refcnt decremented + */ + module_put(pf->owner); + err = security_socket_post_create(sock, family, type, protocol, kern); + if (err) + goto out_sock_release; + *res = sock; + + return 0; + +out_module_busy: + err = -EAFNOSUPPORT; +out_module_put: + sock->ops = NULL; + module_put(pf->owner); +out_sock_release: + sock_release(sock); + return err; + +out_release: + rcu_read_unlock(); + goto out_sock_release; +} +EXPORT_SYMBOL(__sock_create); + +int sock_create(int family, int type, int protocol, struct socket **res) +{ + return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0); +} +EXPORT_SYMBOL(sock_create); + +int sock_create_kern(int family, int type, int protocol, struct socket **res) +{ + return __sock_create(&init_net, family, type, protocol, res, 1); +} +EXPORT_SYMBOL(sock_create_kern); + +SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) +{ + int retval; + struct socket *sock; + int flags; + + /* Check the SOCK_* constants for consistency. */ + BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK); + BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK); + BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK); + + flags = type & ~SOCK_TYPE_MASK; + if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return -EINVAL; + type &= SOCK_TYPE_MASK; + + if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) + flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; + + retval = sock_create(family, type, protocol, &sock); + if (retval < 0) + goto out; + + retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); + if (retval < 0) + goto out_release; + +out: + /* It may be already another descriptor 8) Not kernel problem. */ + return retval; + +out_release: + sock_release(sock); + return retval; +} + +/* + * Create a pair of connected sockets. + */ + +SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol, + int __user *, usockvec) +{ + struct socket *sock1, *sock2; + int fd1, fd2, err; + struct file *newfile1, *newfile2; + int flags; + + flags = type & ~SOCK_TYPE_MASK; + if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return -EINVAL; + type &= SOCK_TYPE_MASK; + + if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) + flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; + + /* + * Obtain the first socket and check if the underlying protocol + * supports the socketpair call. + */ + + err = sock_create(family, type, protocol, &sock1); + if (err < 0) + goto out; + + err = sock_create(family, type, protocol, &sock2); + if (err < 0) + goto out_release_1; + + err = sock1->ops->socketpair(sock1, sock2); + if (err < 0) + goto out_release_both; + + fd1 = sock_alloc_file(sock1, &newfile1, flags); + if (unlikely(fd1 < 0)) { + err = fd1; + goto out_release_both; + } + + fd2 = sock_alloc_file(sock2, &newfile2, flags); + if (unlikely(fd2 < 0)) { + err = fd2; + fput(newfile1); + put_unused_fd(fd1); + sock_release(sock2); + goto out; + } + + audit_fd_pair(fd1, fd2); + fd_install(fd1, newfile1); + fd_install(fd2, newfile2); + /* fd1 and fd2 may be already another descriptors. + * Not kernel problem. 
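+	 * (Another thread sharing this file table may already have closed
+	 * and reused them; if the put_user() calls below fail we simply
+	 * close both descriptors again and return the error.)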
+ */ + + err = put_user(fd1, &usockvec[0]); + if (!err) + err = put_user(fd2, &usockvec[1]); + if (!err) + return 0; + + sys_close(fd2); + sys_close(fd1); + return err; + +out_release_both: + sock_release(sock2); +out_release_1: + sock_release(sock1); +out: + return err; +} + +/* + * Bind a name to a socket. Nothing much to do here since it's + * the protocol's responsibility to handle the local address. + * + * We move the socket address to kernel space before we call + * the protocol layer (having also checked the address is ok). + */ + +SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen) +{ + struct socket *sock; + struct sockaddr_storage address; + int err, fput_needed; + + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (sock) { + err = move_addr_to_kernel(umyaddr, addrlen, (struct sockaddr *)&address); + if (err >= 0) { + err = security_socket_bind(sock, + (struct sockaddr *)&address, + addrlen); + if (!err) + err = sock->ops->bind(sock, + (struct sockaddr *) + &address, addrlen); + } + fput_light(sock->file, fput_needed); + } + return err; +} + +/* + * Perform a listen. Basically, we allow the protocol to do anything + * necessary for a listen, and if that works, we mark the socket as + * ready for listening. + */ + +SYSCALL_DEFINE2(listen, int, fd, int, backlog) +{ + struct socket *sock; + int err, fput_needed; + int somaxconn; + + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (sock) { + somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn; + if ((unsigned)backlog > somaxconn) + backlog = somaxconn; + + err = security_socket_listen(sock, backlog); + if (!err) + err = sock->ops->listen(sock, backlog); + + fput_light(sock->file, fput_needed); + } + return err; +} + +/* + * For accept, we attempt to create a new socket, set up the link + * with the client, wake up the client, then return the new + * connected fd. We collect the address of the connector in kernel + * space and move it to user at the very end. This is unclean because + * we open the socket then return an error. + * + * 1003.1g adds the ability to recvmsg() to query connection pending + * status to recvmsg. We need to add that support in a way thats + * clean when we restucture accept also. + */ + +SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr, + int __user *, upeer_addrlen, int, flags) +{ + struct socket *sock, *newsock; + struct file *newfile; + int err, len, newfd, fput_needed; + struct sockaddr_storage address; + + if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return -EINVAL; + + if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) + flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; + + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (!sock) + goto out; + + err = -ENFILE; + newsock = sock_alloc(); + if (!newsock) + goto out_put; + + newsock->type = sock->type; + newsock->ops = sock->ops; + + /* + * We don't need try_module_get here, as the listening socket (sock) + * has the protocol module (sock->ops->owner) held. 
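+	 * That reference cannot disappear while we hold the listening
+	 * socket's file, so a plain __module_get() (which cannot fail) is
+	 * enough and there is no race with module unload.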
+ */ + __module_get(newsock->ops->owner); + + newfd = sock_alloc_file(newsock, &newfile, flags); + if (unlikely(newfd < 0)) { + err = newfd; + sock_release(newsock); + goto out_put; + } + + err = security_socket_accept(sock, newsock); + if (err) + goto out_fd; + + err = sock->ops->accept(sock, newsock, sock->file->f_flags); + if (err < 0) + goto out_fd; + + if (upeer_sockaddr) { + if (newsock->ops->getname(newsock, (struct sockaddr *)&address, + &len, 2) < 0) { + err = -ECONNABORTED; + goto out_fd; + } + err = move_addr_to_user((struct sockaddr *)&address, + len, upeer_sockaddr, upeer_addrlen); + if (err < 0) + goto out_fd; + } + + /* File flags are not inherited via accept() unlike another OSes. */ + + fd_install(newfd, newfile); + err = newfd; + +out_put: + fput_light(sock->file, fput_needed); +out: + return err; +out_fd: + fput(newfile); + put_unused_fd(newfd); + goto out_put; +} + +SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr, + int __user *, upeer_addrlen) +{ + return sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0); +} + +/* + * Attempt to connect to a socket with the server address. The address + * is in user space so we verify it is OK and move it to kernel space. + * + * For 1003.1g we need to add clean support for a bind to AF_UNSPEC to + * break bindings + * + * NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and + * other SEQPACKET protocols that take time to connect() as it doesn't + * include the -EINPROGRESS status for such sockets. + */ + +SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr, + int, addrlen) +{ + struct socket *sock; + struct sockaddr_storage address; + int err, fput_needed; + + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (!sock) + goto out; + err = move_addr_to_kernel(uservaddr, addrlen, (struct sockaddr *)&address); + if (err < 0) + goto out_put; + + err = + security_socket_connect(sock, (struct sockaddr *)&address, addrlen); + if (err) + goto out_put; + + err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen, + sock->file->f_flags); +out_put: + fput_light(sock->file, fput_needed); +out: + return err; +} + +/* + * Get the local address ('name') of a socket object. Move the obtained + * name to user space. + */ + +SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr, + int __user *, usockaddr_len) +{ + struct socket *sock; + struct sockaddr_storage address; + int len, err, fput_needed; + + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (!sock) + goto out; + + err = security_socket_getsockname(sock); + if (err) + goto out_put; + + err = sock->ops->getname(sock, (struct sockaddr *)&address, &len, 0); + if (err) + goto out_put; + err = move_addr_to_user((struct sockaddr *)&address, len, usockaddr, usockaddr_len); + +out_put: + fput_light(sock->file, fput_needed); +out: + return err; +} + +/* + * Get the remote address ('name') of a socket object. Move the obtained + * name to user space. 
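+ *
+ * Truncation semantics are the same as for getsockname() above: the full
+ * (untruncated) address length is written back through usockaddr_len even
+ * when the user buffer was smaller (see move_addr_to_user()).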
+ */ + +SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr, + int __user *, usockaddr_len) +{ + struct socket *sock; + struct sockaddr_storage address; + int len, err, fput_needed; + + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (sock != NULL) { + err = security_socket_getpeername(sock); + if (err) { + fput_light(sock->file, fput_needed); + return err; + } + + err = + sock->ops->getname(sock, (struct sockaddr *)&address, &len, + 1); + if (!err) + err = move_addr_to_user((struct sockaddr *)&address, len, usockaddr, + usockaddr_len); + fput_light(sock->file, fput_needed); + } + return err; +} + +/* + * Send a datagram to a given address. We move the address into kernel + * space and check the user space data area is readable before invoking + * the protocol. + */ + +SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, + unsigned, flags, struct sockaddr __user *, addr, + int, addr_len) +{ + struct socket *sock; + struct sockaddr_storage address; + int err; + struct msghdr msg; + struct iovec iov; + int fput_needed; + + if (len > INT_MAX) + len = INT_MAX; + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (!sock) + goto out; + + iov.iov_base = buff; + iov.iov_len = len; + msg.msg_name = NULL; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + if (addr) { + err = move_addr_to_kernel(addr, addr_len, (struct sockaddr *)&address); + if (err < 0) + goto out_put; + msg.msg_name = (struct sockaddr *)&address; + msg.msg_namelen = addr_len; + } + if (sock->file->f_flags & O_NONBLOCK) + flags |= MSG_DONTWAIT; + msg.msg_flags = flags; + err = sock_sendmsg(sock, &msg, len); + +out_put: + fput_light(sock->file, fput_needed); +out: + return err; +} + +/* + * Send a datagram down a socket. + */ + +SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len, + unsigned, flags) +{ + return sys_sendto(fd, buff, len, flags, NULL, 0); +} + +/* + * Receive a frame from the socket and optionally record the address of the + * sender. We verify the buffers are writable and if needed move the + * sender address from kernel to user space. + */ + +SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, + unsigned, flags, struct sockaddr __user *, addr, + int __user *, addr_len) +{ + struct socket *sock; + struct iovec iov; + struct msghdr msg; + struct sockaddr_storage address; + int err, err2; + int fput_needed; + + if (size > INT_MAX) + size = INT_MAX; + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (!sock) + goto out; + + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_iovlen = 1; + msg.msg_iov = &iov; + iov.iov_len = size; + iov.iov_base = ubuf; + msg.msg_name = (struct sockaddr *)&address; + msg.msg_namelen = sizeof(address); + if (sock->file->f_flags & O_NONBLOCK) + flags |= MSG_DONTWAIT; + err = sock_recvmsg(sock, &msg, size, flags); + + if (err >= 0 && addr != NULL) { + err2 = move_addr_to_user((struct sockaddr *)&address, + msg.msg_namelen, addr, addr_len); + if (err2 < 0) + err = err2; + } + + fput_light(sock->file, fput_needed); +out: + return err; +} + +/* + * Receive a datagram from a socket. + */ + +asmlinkage long sys_recv(int fd, void __user *ubuf, size_t size, + unsigned flags) +{ + return sys_recvfrom(fd, ubuf, size, flags, NULL, NULL); +} + +/* + * Set a socket option. Because we don't know the option lengths we have + * to pass the user mode parameter for the protocols to sort out. 
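+ *
+ * SOL_SOCKET options are handled generically by sock_setsockopt(); options
+ * at any other level (SOL_IP, SOL_TCP, ...) are passed straight through to
+ * the protocol's own ->setsockopt() with the user pointer untouched.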
+ */ + +SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, + char __user *, optval, int, optlen) +{ + int err, fput_needed; + struct socket *sock; + + if (optlen < 0) + return -EINVAL; + + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (sock != NULL) { + err = security_socket_setsockopt(sock, level, optname); + if (err) + goto out_put; + + if (level == SOL_SOCKET) + err = + sock_setsockopt(sock, level, optname, optval, + optlen); + else + err = + sock->ops->setsockopt(sock, level, optname, optval, + optlen); +out_put: + fput_light(sock->file, fput_needed); + } + return err; +} + +/* + * Get a socket option. Because we don't know the option lengths we have + * to pass a user mode parameter for the protocols to sort out. + */ + +SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, + char __user *, optval, int __user *, optlen) +{ + int err, fput_needed; + struct socket *sock; + + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (sock != NULL) { + err = security_socket_getsockopt(sock, level, optname); + if (err) + goto out_put; + + if (level == SOL_SOCKET) + err = + sock_getsockopt(sock, level, optname, optval, + optlen); + else + err = + sock->ops->getsockopt(sock, level, optname, optval, + optlen); +out_put: + fput_light(sock->file, fput_needed); + } + return err; +} + +/* + * Shutdown a socket. + */ + +SYSCALL_DEFINE2(shutdown, int, fd, int, how) +{ + int err, fput_needed; + struct socket *sock; + + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (sock != NULL) { + err = security_socket_shutdown(sock, how); + if (!err) + err = sock->ops->shutdown(sock, how); + fput_light(sock->file, fput_needed); + } + return err; +} + +/* A couple of helpful macros for getting the address of the 32/64 bit + * fields which are the same type (int / unsigned) on our platforms. + */ +#define COMPAT_MSG(msg, member) ((MSG_CMSG_COMPAT & flags) ? 
&msg##_compat->member : &msg->member) +#define COMPAT_NAMELEN(msg) COMPAT_MSG(msg, msg_namelen) +#define COMPAT_FLAGS(msg) COMPAT_MSG(msg, msg_flags) + +struct used_address { + struct sockaddr_storage name; + unsigned int name_len; +}; + +static int __sys_sendmsg(struct socket *sock, struct msghdr __user *msg, + struct msghdr *msg_sys, unsigned flags, + struct used_address *used_address) +{ + struct compat_msghdr __user *msg_compat = + (struct compat_msghdr __user *)msg; + struct sockaddr_storage address; + struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; + unsigned char ctl[sizeof(struct cmsghdr) + 20] + __attribute__ ((aligned(sizeof(__kernel_size_t)))); + /* 20 is size of ipv6_pktinfo */ + unsigned char *ctl_buf = ctl; + int err, ctl_len, iov_size, total_len; + + err = -EFAULT; + if (MSG_CMSG_COMPAT & flags) { + if (get_compat_msghdr(msg_sys, msg_compat)) + return -EFAULT; + } else if (copy_from_user(msg_sys, msg, sizeof(struct msghdr))) + return -EFAULT; + + /* do not move before msg_sys is valid */ + err = -EMSGSIZE; + if (msg_sys->msg_iovlen > UIO_MAXIOV) + goto out; + + /* Check whether to allocate the iovec area */ + err = -ENOMEM; + iov_size = msg_sys->msg_iovlen * sizeof(struct iovec); + if (msg_sys->msg_iovlen > UIO_FASTIOV) { + iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL); + if (!iov) + goto out; + } + + /* This will also move the address data into kernel space */ + if (MSG_CMSG_COMPAT & flags) { + err = verify_compat_iovec(msg_sys, iov, + (struct sockaddr *)&address, + VERIFY_READ); + } else + err = verify_iovec(msg_sys, iov, + (struct sockaddr *)&address, + VERIFY_READ); + if (err < 0) + goto out_freeiov; + total_len = err; + + err = -ENOBUFS; + + if (msg_sys->msg_controllen > INT_MAX) + goto out_freeiov; + ctl_len = msg_sys->msg_controllen; + if ((MSG_CMSG_COMPAT & flags) && ctl_len) { + err = + cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl, + sizeof(ctl)); + if (err) + goto out_freeiov; + ctl_buf = msg_sys->msg_control; + ctl_len = msg_sys->msg_controllen; + } else if (ctl_len) { + if (ctl_len > sizeof(ctl)) { + ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL); + if (ctl_buf == NULL) + goto out_freeiov; + } + err = -EFAULT; + /* + * Careful! Before this, msg_sys->msg_control contains a user pointer. + * Afterwards, it will be a kernel pointer. Thus the compiler-assisted + * checking falls down on this. + */ + if (copy_from_user(ctl_buf, + (void __user __force *)msg_sys->msg_control, + ctl_len)) + goto out_freectl; + msg_sys->msg_control = ctl_buf; + } + msg_sys->msg_flags = flags; + + if (sock->file->f_flags & O_NONBLOCK) + msg_sys->msg_flags |= MSG_DONTWAIT; + /* + * If this is sendmmsg() and current destination address is same as + * previously succeeded address, omit asking LSM's decision. + * used_address->name_len is initialized to UINT_MAX so that the first + * destination address never matches. + */ + if (used_address && msg_sys->msg_name && + used_address->name_len == msg_sys->msg_namelen && + !memcmp(&used_address->name, msg_sys->msg_name, + used_address->name_len)) { + err = sock_sendmsg_nosec(sock, msg_sys, total_len); + goto out_freectl; + } + err = sock_sendmsg(sock, msg_sys, total_len); + /* + * If this is sendmmsg() and sending to current destination address was + * successful, remember it. 
+ */ + if (used_address && err >= 0) { + used_address->name_len = msg_sys->msg_namelen; + if (msg_sys->msg_name) + memcpy(&used_address->name, msg_sys->msg_name, + used_address->name_len); + } + +out_freectl: + if (ctl_buf != ctl) + sock_kfree_s(sock->sk, ctl_buf, ctl_len); +out_freeiov: + if (iov != iovstack) + sock_kfree_s(sock->sk, iov, iov_size); +out: + return err; +} + +/* + * BSD sendmsg interface + */ + +SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned, flags) +{ + int fput_needed, err; + struct msghdr msg_sys; + struct socket *sock = sockfd_lookup_light(fd, &err, &fput_needed); + + if (!sock) + goto out; + + err = __sys_sendmsg(sock, msg, &msg_sys, flags, NULL); + + fput_light(sock->file, fput_needed); +out: + return err; +} + +/* + * Linux sendmmsg interface + */ + +int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, + unsigned int flags) +{ + int fput_needed, err, datagrams; + struct socket *sock; + struct mmsghdr __user *entry; + struct compat_mmsghdr __user *compat_entry; + struct msghdr msg_sys; + struct used_address used_address; + + if (vlen > UIO_MAXIOV) + vlen = UIO_MAXIOV; + + datagrams = 0; + + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (!sock) + return err; + + used_address.name_len = UINT_MAX; + entry = mmsg; + compat_entry = (struct compat_mmsghdr __user *)mmsg; + err = 0; + + while (datagrams < vlen) { + if (MSG_CMSG_COMPAT & flags) { + err = __sys_sendmsg(sock, (struct msghdr __user *)compat_entry, + &msg_sys, flags, &used_address); + if (err < 0) + break; + err = __put_user(err, &compat_entry->msg_len); + ++compat_entry; + } else { + err = __sys_sendmsg(sock, (struct msghdr __user *)entry, + &msg_sys, flags, &used_address); + if (err < 0) + break; + err = put_user(err, &entry->msg_len); + ++entry; + } + + if (err) + break; + ++datagrams; + } + + fput_light(sock->file, fput_needed); + + /* We only return an error if no datagrams were able to be sent */ + if (datagrams != 0) + return datagrams; + + return err; +} + +SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg, + unsigned int, vlen, unsigned int, flags) +{ + return __sys_sendmmsg(fd, mmsg, vlen, flags); +} + +static int __sys_recvmsg(struct socket *sock, struct msghdr __user *msg, + struct msghdr *msg_sys, unsigned flags, int nosec) +{ + struct compat_msghdr __user *msg_compat = + (struct compat_msghdr __user *)msg; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + unsigned long cmsg_ptr; + int err, iov_size, total_len, len; + + /* kernel mode address */ + struct sockaddr_storage addr; + + /* user mode address pointers */ + struct sockaddr __user *uaddr; + int __user *uaddr_len; + + if (MSG_CMSG_COMPAT & flags) { + if (get_compat_msghdr(msg_sys, msg_compat)) + return -EFAULT; + } else if (copy_from_user(msg_sys, msg, sizeof(struct msghdr))) + return -EFAULT; + + err = -EMSGSIZE; + if (msg_sys->msg_iovlen > UIO_MAXIOV) + goto out; + + /* Check whether to allocate the iovec area */ + err = -ENOMEM; + iov_size = msg_sys->msg_iovlen * sizeof(struct iovec); + if (msg_sys->msg_iovlen > UIO_FASTIOV) { + iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL); + if (!iov) + goto out; + } + + /* + * Save the user-mode address (verify_iovec will change the + * kernel msghdr to use the kernel address space) + */ + + uaddr = (__force void __user *)msg_sys->msg_name; + uaddr_len = COMPAT_NAMELEN(msg); + if (MSG_CMSG_COMPAT & flags) { + err = verify_compat_iovec(msg_sys, iov, + (struct sockaddr *)&addr, + VERIFY_WRITE); + } else + err 
= verify_iovec(msg_sys, iov, + (struct sockaddr *)&addr, + VERIFY_WRITE); + if (err < 0) + goto out_freeiov; + total_len = err; + + cmsg_ptr = (unsigned long)msg_sys->msg_control; + msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT); + + if (sock->file->f_flags & O_NONBLOCK) + flags |= MSG_DONTWAIT; + err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys, + total_len, flags); + if (err < 0) + goto out_freeiov; + len = err; + + if (uaddr != NULL) { + err = move_addr_to_user((struct sockaddr *)&addr, + msg_sys->msg_namelen, uaddr, + uaddr_len); + if (err < 0) + goto out_freeiov; + } + err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT), + COMPAT_FLAGS(msg)); + if (err) + goto out_freeiov; + if (MSG_CMSG_COMPAT & flags) + err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr, + &msg_compat->msg_controllen); + else + err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr, + &msg->msg_controllen); + if (err) + goto out_freeiov; + err = len; + +out_freeiov: + if (iov != iovstack) + sock_kfree_s(sock->sk, iov, iov_size); +out: + return err; +} + +/* + * BSD recvmsg interface + */ + +SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg, + unsigned int, flags) +{ + int fput_needed, err; + struct msghdr msg_sys; + struct socket *sock = sockfd_lookup_light(fd, &err, &fput_needed); + + if (!sock) + goto out; + + err = __sys_recvmsg(sock, msg, &msg_sys, flags, 0); + + fput_light(sock->file, fput_needed); +out: + return err; +} + +/* + * Linux recvmmsg interface + */ + +int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, + unsigned int flags, struct timespec *timeout) +{ + int fput_needed, err, datagrams; + struct socket *sock; + struct mmsghdr __user *entry; + struct compat_mmsghdr __user *compat_entry; + struct msghdr msg_sys; + struct timespec end_time; + + if (timeout && + poll_select_set_timeout(&end_time, timeout->tv_sec, + timeout->tv_nsec)) + return -EINVAL; + + datagrams = 0; + + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (!sock) + return err; + + err = sock_error(sock->sk); + if (err) + goto out_put; + + entry = mmsg; + compat_entry = (struct compat_mmsghdr __user *)mmsg; + + while (datagrams < vlen) { + /* + * No need to ask LSM for more than the first datagram. + */ + if (MSG_CMSG_COMPAT & flags) { + err = __sys_recvmsg(sock, (struct msghdr __user *)compat_entry, + &msg_sys, flags & ~MSG_WAITFORONE, + datagrams); + if (err < 0) + break; + err = __put_user(err, &compat_entry->msg_len); + ++compat_entry; + } else { + err = __sys_recvmsg(sock, (struct msghdr __user *)entry, + &msg_sys, flags & ~MSG_WAITFORONE, + datagrams); + if (err < 0) + break; + err = put_user(err, &entry->msg_len); + ++entry; + } + + if (err) + break; + ++datagrams; + + /* MSG_WAITFORONE turns on MSG_DONTWAIT after one packet */ + if (flags & MSG_WAITFORONE) + flags |= MSG_DONTWAIT; + + if (timeout) { + ktime_get_ts(timeout); + *timeout = timespec_sub(end_time, *timeout); + if (timeout->tv_sec < 0) { + timeout->tv_sec = timeout->tv_nsec = 0; + break; + } + + /* Timeout, return less than vlen datagrams */ + if (timeout->tv_nsec == 0 && timeout->tv_sec == 0) + break; + } + + /* Out of band data, return right away */ + if (msg_sys.msg_flags & MSG_OOB) + break; + } + +out_put: + fput_light(sock->file, fput_needed); + + if (err == 0) + return datagrams; + + if (datagrams != 0) { + /* + * We may return less entries than requested (vlen) if the + * sock is non block and there aren't enough datagrams... 
+ */ + if (err != -EAGAIN) { + /* + * ... or if recvmsg returns an error after we + * received some datagrams, where we record the + * error to return on the next call or if the + * app asks about it using getsockopt(SO_ERROR). + */ + sock->sk->sk_err = -err; + } + + return datagrams; + } + + return err; +} + +SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg, + unsigned int, vlen, unsigned int, flags, + struct timespec __user *, timeout) +{ + int datagrams; + struct timespec timeout_sys; + + if (!timeout) + return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL); + + if (copy_from_user(&timeout_sys, timeout, sizeof(timeout_sys))) + return -EFAULT; + + datagrams = __sys_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys); + + if (datagrams > 0 && + copy_to_user(timeout, &timeout_sys, sizeof(timeout_sys))) + datagrams = -EFAULT; + + return datagrams; +} + +#ifdef __ARCH_WANT_SYS_SOCKETCALL +/* Argument list sizes for sys_socketcall */ +#define AL(x) ((x) * sizeof(unsigned long)) +static const unsigned char nargs[21] = { + AL(0), AL(3), AL(3), AL(3), AL(2), AL(3), + AL(3), AL(3), AL(4), AL(4), AL(4), AL(6), + AL(6), AL(2), AL(5), AL(5), AL(3), AL(3), + AL(4), AL(5), AL(4) +}; + +#undef AL + +/* + * System call vectors. + * + * Argument checking cleaned up. Saved 20% in size. + * This function doesn't need to set the kernel lock because + * it is set by the callees. + */ + +SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) +{ + unsigned long a[6]; + unsigned long a0, a1; + int err; + unsigned int len; + + if (call < 1 || call > SYS_SENDMMSG) + return -EINVAL; + + len = nargs[call]; + if (len > sizeof(a)) + return -EINVAL; + + /* copy_from_user should be SMP safe. */ + if (copy_from_user(a, args, len)) + return -EFAULT; + + audit_socketcall(nargs[call] / sizeof(unsigned long), a); + + a0 = a[0]; + a1 = a[1]; + + switch (call) { + case SYS_SOCKET: + err = sys_socket(a0, a1, a[2]); + break; + case SYS_BIND: + err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]); + break; + case SYS_CONNECT: + err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]); + break; + case SYS_LISTEN: + err = sys_listen(a0, a1); + break; + case SYS_ACCEPT: + err = sys_accept4(a0, (struct sockaddr __user *)a1, + (int __user *)a[2], 0); + break; + case SYS_GETSOCKNAME: + err = + sys_getsockname(a0, (struct sockaddr __user *)a1, + (int __user *)a[2]); + break; + case SYS_GETPEERNAME: + err = + sys_getpeername(a0, (struct sockaddr __user *)a1, + (int __user *)a[2]); + break; + case SYS_SOCKETPAIR: + err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]); + break; + case SYS_SEND: + err = sys_send(a0, (void __user *)a1, a[2], a[3]); + break; + case SYS_SENDTO: + err = sys_sendto(a0, (void __user *)a1, a[2], a[3], + (struct sockaddr __user *)a[4], a[5]); + break; + case SYS_RECV: + err = sys_recv(a0, (void __user *)a1, a[2], a[3]); + break; + case SYS_RECVFROM: + err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3], + (struct sockaddr __user *)a[4], + (int __user *)a[5]); + break; + case SYS_SHUTDOWN: + err = sys_shutdown(a0, a1); + break; + case SYS_SETSOCKOPT: + err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]); + break; + case SYS_GETSOCKOPT: + err = + sys_getsockopt(a0, a1, a[2], (char __user *)a[3], + (int __user *)a[4]); + break; + case SYS_SENDMSG: + err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]); + break; + case SYS_SENDMMSG: + err = sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3]); + break; + case SYS_RECVMSG: + err = sys_recvmsg(a0, (struct msghdr __user *)a1, 
a[2]); + break; + case SYS_RECVMMSG: + err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], + (struct timespec __user *)a[4]); + break; + case SYS_ACCEPT4: + err = sys_accept4(a0, (struct sockaddr __user *)a1, + (int __user *)a[2], a[3]); + break; + default: + err = -EINVAL; + break; + } + return err; +} + +#endif /* __ARCH_WANT_SYS_SOCKETCALL */ + +/** + * sock_register - add a socket protocol handler + * @ops: description of protocol + * + * This function is called by a protocol handler that wants to + * advertise its address family, and have it linked into the + * socket interface. The value ops->family coresponds to the + * socket system call protocol family. + */ +int sock_register(const struct net_proto_family *ops) +{ + int err; + + if (ops->family >= NPROTO) { + printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family, + NPROTO); + return -ENOBUFS; + } + + spin_lock(&net_family_lock); + if (rcu_dereference_protected(net_families[ops->family], + lockdep_is_held(&net_family_lock))) + err = -EEXIST; + else { + rcu_assign_pointer(net_families[ops->family], ops); + err = 0; + } + spin_unlock(&net_family_lock); + + printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family); + return err; +} +EXPORT_SYMBOL(sock_register); + +/** + * sock_unregister - remove a protocol handler + * @family: protocol family to remove + * + * This function is called by a protocol handler that wants to + * remove its address family, and have it unlinked from the + * new socket creation. + * + * If protocol handler is a module, then it can use module reference + * counts to protect against new references. If protocol handler is not + * a module then it needs to provide its own protection in + * the ops->create routine. + */ +void sock_unregister(int family) +{ + BUG_ON(family < 0 || family >= NPROTO); + + spin_lock(&net_family_lock); + rcu_assign_pointer(net_families[family], NULL); + spin_unlock(&net_family_lock); + + synchronize_rcu(); + + printk(KERN_INFO "NET: Unregistered protocol family %d\n", family); +} +EXPORT_SYMBOL(sock_unregister); + +static int __init sock_init(void) +{ + int err; + + /* + * Initialize sock SLAB cache. + */ + + sk_init(); + + /* + * Initialize skbuff SLAB cache + */ + skb_init(); + + /* + * Initialize the protocols module. + */ + + init_inodecache(); + + err = register_filesystem(&sock_fs_type); + if (err) + goto out_fs; + sock_mnt = kern_mount(&sock_fs_type); + if (IS_ERR(sock_mnt)) { + err = PTR_ERR(sock_mnt); + goto out_mount; + } + + /* The real protocol initialization is performed in later initcalls. + */ + +#ifdef CONFIG_NETFILTER + netfilter_init(); +#endif + +#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING + skb_timestamping_init(); +#endif + +out: + return err; + +out_mount: + unregister_filesystem(&sock_fs_type); +out_fs: + goto out; +} + +core_initcall(sock_init); /* early initcall */ + +#ifdef CONFIG_PROC_FS +void socket_seq_show(struct seq_file *seq) +{ + int cpu; + int counter = 0; + + for_each_possible_cpu(cpu) + counter += per_cpu(sockets_in_use, cpu); + + /* It can be negative, by the way. 
8) */ + if (counter < 0) + counter = 0; + + seq_printf(seq, "sockets: used %d\n", counter); +} +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_COMPAT +static int do_siocgstamp(struct net *net, struct socket *sock, + unsigned int cmd, struct compat_timeval __user *up) +{ + mm_segment_t old_fs = get_fs(); + struct timeval ktv; + int err; + + set_fs(KERNEL_DS); + err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv); + set_fs(old_fs); + if (!err) { + err = put_user(ktv.tv_sec, &up->tv_sec); + err |= __put_user(ktv.tv_usec, &up->tv_usec); + } + return err; +} + +static int do_siocgstampns(struct net *net, struct socket *sock, + unsigned int cmd, struct compat_timespec __user *up) +{ + mm_segment_t old_fs = get_fs(); + struct timespec kts; + int err; + + set_fs(KERNEL_DS); + err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts); + set_fs(old_fs); + if (!err) { + err = put_user(kts.tv_sec, &up->tv_sec); + err |= __put_user(kts.tv_nsec, &up->tv_nsec); + } + return err; +} + +static int dev_ifname32(struct net *net, struct compat_ifreq __user *uifr32) +{ + struct ifreq __user *uifr; + int err; + + uifr = compat_alloc_user_space(sizeof(struct ifreq)); + if (copy_in_user(uifr, uifr32, sizeof(struct compat_ifreq))) + return -EFAULT; + + err = dev_ioctl(net, SIOCGIFNAME, uifr); + if (err) + return err; + + if (copy_in_user(uifr32, uifr, sizeof(struct compat_ifreq))) + return -EFAULT; + + return 0; +} + +static int dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32) +{ + struct compat_ifconf ifc32; + struct ifconf ifc; + struct ifconf __user *uifc; + struct compat_ifreq __user *ifr32; + struct ifreq __user *ifr; + unsigned int i, j; + int err; + + if (copy_from_user(&ifc32, uifc32, sizeof(struct compat_ifconf))) + return -EFAULT; + + if (ifc32.ifcbuf == 0) { + ifc32.ifc_len = 0; + ifc.ifc_len = 0; + ifc.ifc_req = NULL; + uifc = compat_alloc_user_space(sizeof(struct ifconf)); + } else { + size_t len = ((ifc32.ifc_len / sizeof(struct compat_ifreq)) + 1) * + sizeof(struct ifreq); + uifc = compat_alloc_user_space(sizeof(struct ifconf) + len); + ifc.ifc_len = len; + ifr = ifc.ifc_req = (void __user *)(uifc + 1); + ifr32 = compat_ptr(ifc32.ifcbuf); + for (i = 0; i < ifc32.ifc_len; i += sizeof(struct compat_ifreq)) { + if (copy_in_user(ifr, ifr32, sizeof(struct compat_ifreq))) + return -EFAULT; + ifr++; + ifr32++; + } + } + if (copy_to_user(uifc, &ifc, sizeof(struct ifconf))) + return -EFAULT; + + err = dev_ioctl(net, SIOCGIFCONF, uifc); + if (err) + return err; + + if (copy_from_user(&ifc, uifc, sizeof(struct ifconf))) + return -EFAULT; + + ifr = ifc.ifc_req; + ifr32 = compat_ptr(ifc32.ifcbuf); + for (i = 0, j = 0; + i + sizeof(struct compat_ifreq) <= ifc32.ifc_len && j < ifc.ifc_len; + i += sizeof(struct compat_ifreq), j += sizeof(struct ifreq)) { + if (copy_in_user(ifr32, ifr, sizeof(struct compat_ifreq))) + return -EFAULT; + ifr32++; + ifr++; + } + + if (ifc32.ifcbuf == 0) { + /* Translate from 64-bit structure multiple to + * a 32-bit one. 
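+		 * (The kernel reported ifc_len in units of sizeof(struct ifreq);
+		 * rescale it to the same number of entries in units of
+		 * sizeof(struct compat_ifreq) so that 32-bit userspace sizes
+		 * its buffer correctly.)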
+ */ + i = ifc.ifc_len; + i = ((i / sizeof(struct ifreq)) * sizeof(struct compat_ifreq)); + ifc32.ifc_len = i; + } else { + ifc32.ifc_len = i; + } + if (copy_to_user(uifc32, &ifc32, sizeof(struct compat_ifconf))) + return -EFAULT; + + return 0; +} + +static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32) +{ + struct compat_ethtool_rxnfc __user *compat_rxnfc; + bool convert_in = false, convert_out = false; + size_t buf_size = ALIGN(sizeof(struct ifreq), 8); + struct ethtool_rxnfc __user *rxnfc; + struct ifreq __user *ifr; + u32 rule_cnt = 0, actual_rule_cnt; + u32 ethcmd; + u32 data; + int ret; + + if (get_user(data, &ifr32->ifr_ifru.ifru_data)) + return -EFAULT; + + compat_rxnfc = compat_ptr(data); + + if (get_user(ethcmd, &compat_rxnfc->cmd)) + return -EFAULT; + + /* Most ethtool structures are defined without padding. + * Unfortunately struct ethtool_rxnfc is an exception. + */ + switch (ethcmd) { + default: + break; + case ETHTOOL_GRXCLSRLALL: + /* Buffer size is variable */ + if (get_user(rule_cnt, &compat_rxnfc->rule_cnt)) + return -EFAULT; + if (rule_cnt > KMALLOC_MAX_SIZE / sizeof(u32)) + return -ENOMEM; + buf_size += rule_cnt * sizeof(u32); + /* fall through */ + case ETHTOOL_GRXRINGS: + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + convert_out = true; + /* fall through */ + case ETHTOOL_SRXCLSRLDEL: + case ETHTOOL_SRXCLSRLINS: + buf_size += sizeof(struct ethtool_rxnfc); + convert_in = true; + break; + } + + ifr = compat_alloc_user_space(buf_size); + rxnfc = (void *)ifr + ALIGN(sizeof(struct ifreq), 8); + + if (copy_in_user(&ifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ)) + return -EFAULT; + + if (put_user(convert_in ? rxnfc : compat_ptr(data), + &ifr->ifr_ifru.ifru_data)) + return -EFAULT; + + if (convert_in) { + /* We expect there to be holes between fs.m_ext and + * fs.ring_cookie and at the end of fs, but nowhere else. + */ + BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) + + sizeof(compat_rxnfc->fs.m_ext) != + offsetof(struct ethtool_rxnfc, fs.m_ext) + + sizeof(rxnfc->fs.m_ext)); + BUILD_BUG_ON( + offsetof(struct compat_ethtool_rxnfc, fs.location) - + offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) != + offsetof(struct ethtool_rxnfc, fs.location) - + offsetof(struct ethtool_rxnfc, fs.ring_cookie)); + + if (copy_in_user(rxnfc, compat_rxnfc, + (void *)(&rxnfc->fs.m_ext + 1) - + (void *)rxnfc) || + copy_in_user(&rxnfc->fs.ring_cookie, + &compat_rxnfc->fs.ring_cookie, + (void *)(&rxnfc->fs.location + 1) - + (void *)&rxnfc->fs.ring_cookie) || + copy_in_user(&rxnfc->rule_cnt, &compat_rxnfc->rule_cnt, + sizeof(rxnfc->rule_cnt))) + return -EFAULT; + } + + ret = dev_ioctl(net, SIOCETHTOOL, ifr); + if (ret) + return ret; + + if (convert_out) { + if (copy_in_user(compat_rxnfc, rxnfc, + (const void *)(&rxnfc->fs.m_ext + 1) - + (const void *)rxnfc) || + copy_in_user(&compat_rxnfc->fs.ring_cookie, + &rxnfc->fs.ring_cookie, + (const void *)(&rxnfc->fs.location + 1) - + (const void *)&rxnfc->fs.ring_cookie) || + copy_in_user(&compat_rxnfc->rule_cnt, &rxnfc->rule_cnt, + sizeof(rxnfc->rule_cnt))) + return -EFAULT; + + if (ethcmd == ETHTOOL_GRXCLSRLALL) { + /* As an optimisation, we only copy the actual + * number of rules that the underlying + * function returned. Since Mallory might + * change the rule count in user memory, we + * check that it is less than the rule count + * originally given (as the user buffer size), + * which has been range-checked. 
+ */ + if (get_user(actual_rule_cnt, &rxnfc->rule_cnt)) + return -EFAULT; + if (actual_rule_cnt < rule_cnt) + rule_cnt = actual_rule_cnt; + if (copy_in_user(&compat_rxnfc->rule_locs[0], + &rxnfc->rule_locs[0], + rule_cnt * sizeof(u32))) + return -EFAULT; + } + } + + return 0; +} + +static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32) +{ + void __user *uptr; + compat_uptr_t uptr32; + struct ifreq __user *uifr; + + uifr = compat_alloc_user_space(sizeof(*uifr)); + if (copy_in_user(uifr, uifr32, sizeof(struct compat_ifreq))) + return -EFAULT; + + if (get_user(uptr32, &uifr32->ifr_settings.ifs_ifsu)) + return -EFAULT; + + uptr = compat_ptr(uptr32); + + if (put_user(uptr, &uifr->ifr_settings.ifs_ifsu.raw_hdlc)) + return -EFAULT; + + return dev_ioctl(net, SIOCWANDEV, uifr); +} + +static int bond_ioctl(struct net *net, unsigned int cmd, + struct compat_ifreq __user *ifr32) +{ + struct ifreq kifr; + struct ifreq __user *uifr; + mm_segment_t old_fs; + int err; + u32 data; + void __user *datap; + + switch (cmd) { + case SIOCBONDENSLAVE: + case SIOCBONDRELEASE: + case SIOCBONDSETHWADDR: + case SIOCBONDCHANGEACTIVE: + if (copy_from_user(&kifr, ifr32, sizeof(struct compat_ifreq))) + return -EFAULT; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = dev_ioctl(net, cmd, + (struct ifreq __user __force *) &kifr); + set_fs(old_fs); + + return err; + case SIOCBONDSLAVEINFOQUERY: + case SIOCBONDINFOQUERY: + uifr = compat_alloc_user_space(sizeof(*uifr)); + if (copy_in_user(&uifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ)) + return -EFAULT; + + if (get_user(data, &ifr32->ifr_ifru.ifru_data)) + return -EFAULT; + + datap = compat_ptr(data); + if (put_user(datap, &uifr->ifr_ifru.ifru_data)) + return -EFAULT; + + return dev_ioctl(net, cmd, uifr); + default: + return -EINVAL; + } +} + +static int siocdevprivate_ioctl(struct net *net, unsigned int cmd, + struct compat_ifreq __user *u_ifreq32) +{ + struct ifreq __user *u_ifreq64; + char tmp_buf[IFNAMSIZ]; + void __user *data64; + u32 data32; + + if (copy_from_user(&tmp_buf[0], &(u_ifreq32->ifr_ifrn.ifrn_name[0]), + IFNAMSIZ)) + return -EFAULT; + if (__get_user(data32, &u_ifreq32->ifr_ifru.ifru_data)) + return -EFAULT; + data64 = compat_ptr(data32); + + u_ifreq64 = compat_alloc_user_space(sizeof(*u_ifreq64)); + + /* Don't check these user accesses, just let that get trapped + * in the ioctl handler instead. 
+ */ + if (copy_to_user(&u_ifreq64->ifr_ifrn.ifrn_name[0], &tmp_buf[0], + IFNAMSIZ)) + return -EFAULT; + if (__put_user(data64, &u_ifreq64->ifr_ifru.ifru_data)) + return -EFAULT; + + return dev_ioctl(net, cmd, u_ifreq64); +} + +static int dev_ifsioc(struct net *net, struct socket *sock, + unsigned int cmd, struct compat_ifreq __user *uifr32) +{ + struct ifreq __user *uifr; + int err; + + uifr = compat_alloc_user_space(sizeof(*uifr)); + if (copy_in_user(uifr, uifr32, sizeof(*uifr32))) + return -EFAULT; + + err = sock_do_ioctl(net, sock, cmd, (unsigned long)uifr); + + if (!err) { + switch (cmd) { + case SIOCGIFFLAGS: + case SIOCGIFMETRIC: + case SIOCGIFMTU: + case SIOCGIFMEM: + case SIOCGIFHWADDR: + case SIOCGIFINDEX: + case SIOCGIFADDR: + case SIOCGIFBRDADDR: + case SIOCGIFDSTADDR: + case SIOCGIFNETMASK: + case SIOCGIFPFLAGS: + case SIOCGIFTXQLEN: + case SIOCGMIIPHY: + case SIOCGMIIREG: + if (copy_in_user(uifr32, uifr, sizeof(*uifr32))) + err = -EFAULT; + break; + } + } + return err; +} + +static int compat_sioc_ifmap(struct net *net, unsigned int cmd, + struct compat_ifreq __user *uifr32) +{ + struct ifreq ifr; + struct compat_ifmap __user *uifmap32; + mm_segment_t old_fs; + int err; + + uifmap32 = &uifr32->ifr_ifru.ifru_map; + err = copy_from_user(&ifr, uifr32, sizeof(ifr.ifr_name)); + err |= __get_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); + err |= __get_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); + err |= __get_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); + err |= __get_user(ifr.ifr_map.irq, &uifmap32->irq); + err |= __get_user(ifr.ifr_map.dma, &uifmap32->dma); + err |= __get_user(ifr.ifr_map.port, &uifmap32->port); + if (err) + return -EFAULT; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = dev_ioctl(net, cmd, (void __user __force *)&ifr); + set_fs(old_fs); + + if (cmd == SIOCGIFMAP && !err) { + err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name)); + err |= __put_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); + err |= __put_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); + err |= __put_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); + err |= __put_user(ifr.ifr_map.irq, &uifmap32->irq); + err |= __put_user(ifr.ifr_map.dma, &uifmap32->dma); + err |= __put_user(ifr.ifr_map.port, &uifmap32->port); + if (err) + err = -EFAULT; + } + return err; +} + +static int compat_siocshwtstamp(struct net *net, struct compat_ifreq __user *uifr32) +{ + void __user *uptr; + compat_uptr_t uptr32; + struct ifreq __user *uifr; + + uifr = compat_alloc_user_space(sizeof(*uifr)); + if (copy_in_user(uifr, uifr32, sizeof(struct compat_ifreq))) + return -EFAULT; + + if (get_user(uptr32, &uifr32->ifr_data)) + return -EFAULT; + + uptr = compat_ptr(uptr32); + + if (put_user(uptr, &uifr->ifr_data)) + return -EFAULT; + + return dev_ioctl(net, SIOCSHWTSTAMP, uifr); +} + +struct rtentry32 { + u32 rt_pad1; + struct sockaddr rt_dst; /* target address */ + struct sockaddr rt_gateway; /* gateway addr (RTF_GATEWAY) */ + struct sockaddr rt_genmask; /* target network mask (IP) */ + unsigned short rt_flags; + short rt_pad2; + u32 rt_pad3; + unsigned char rt_tos; + unsigned char rt_class; + short rt_pad4; + short rt_metric; /* +1 for binary compatibility! 
*/ + /* char * */ u32 rt_dev; /* forcing the device at add */ + u32 rt_mtu; /* per route MTU/Window */ + u32 rt_window; /* Window clamping */ + unsigned short rt_irtt; /* Initial RTT */ +}; + +struct in6_rtmsg32 { + struct in6_addr rtmsg_dst; + struct in6_addr rtmsg_src; + struct in6_addr rtmsg_gateway; + u32 rtmsg_type; + u16 rtmsg_dst_len; + u16 rtmsg_src_len; + u32 rtmsg_metric; + u32 rtmsg_info; + u32 rtmsg_flags; + s32 rtmsg_ifindex; +}; + +static int routing_ioctl(struct net *net, struct socket *sock, + unsigned int cmd, void __user *argp) +{ + int ret; + void *r = NULL; + struct in6_rtmsg r6; + struct rtentry r4; + char devname[16]; + u32 rtdev; + mm_segment_t old_fs = get_fs(); + + if (sock && sock->sk && sock->sk->sk_family == AF_INET6) { /* ipv6 */ + struct in6_rtmsg32 __user *ur6 = argp; + ret = copy_from_user(&r6.rtmsg_dst, &(ur6->rtmsg_dst), + 3 * sizeof(struct in6_addr)); + ret |= __get_user(r6.rtmsg_type, &(ur6->rtmsg_type)); + ret |= __get_user(r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len)); + ret |= __get_user(r6.rtmsg_src_len, &(ur6->rtmsg_src_len)); + ret |= __get_user(r6.rtmsg_metric, &(ur6->rtmsg_metric)); + ret |= __get_user(r6.rtmsg_info, &(ur6->rtmsg_info)); + ret |= __get_user(r6.rtmsg_flags, &(ur6->rtmsg_flags)); + ret |= __get_user(r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex)); + + r = (void *) &r6; + } else { /* ipv4 */ + struct rtentry32 __user *ur4 = argp; + ret = copy_from_user(&r4.rt_dst, &(ur4->rt_dst), + 3 * sizeof(struct sockaddr)); + ret |= __get_user(r4.rt_flags, &(ur4->rt_flags)); + ret |= __get_user(r4.rt_metric, &(ur4->rt_metric)); + ret |= __get_user(r4.rt_mtu, &(ur4->rt_mtu)); + ret |= __get_user(r4.rt_window, &(ur4->rt_window)); + ret |= __get_user(r4.rt_irtt, &(ur4->rt_irtt)); + ret |= __get_user(rtdev, &(ur4->rt_dev)); + if (rtdev) { + ret |= copy_from_user(devname, compat_ptr(rtdev), 15); + r4.rt_dev = (char __user __force *)devname; + devname[15] = 0; + } else + r4.rt_dev = NULL; + + r = (void *) &r4; + } + + if (ret) { + ret = -EFAULT; + goto out; + } + + set_fs(KERNEL_DS); + ret = sock_do_ioctl(net, sock, cmd, (unsigned long) r); + set_fs(old_fs); + +out: + return ret; +} + +/* Since old style bridge ioctl's endup using SIOCDEVPRIVATE + * for some operations; this forces use of the newer bridge-utils that + * use compatible ioctls + */ +static int old_bridge_ioctl(compat_ulong_t __user *argp) +{ + compat_ulong_t tmp; + + if (get_user(tmp, argp)) + return -EFAULT; + if (tmp == BRCTL_GET_VERSION) + return BRCTL_VERSION + 1; + return -EINVAL; +} + +static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, + unsigned int cmd, unsigned long arg) +{ + void __user *argp = compat_ptr(arg); + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + + if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) + return siocdevprivate_ioctl(net, cmd, argp); + + switch (cmd) { + case SIOCSIFBR: + case SIOCGIFBR: + return old_bridge_ioctl(argp); + case SIOCGIFNAME: + return dev_ifname32(net, argp); + case SIOCGIFCONF: + return dev_ifconf(net, argp); + case SIOCETHTOOL: + return ethtool_ioctl(net, argp); + case SIOCWANDEV: + return compat_siocwandev(net, argp); + case SIOCGIFMAP: + case SIOCSIFMAP: + return compat_sioc_ifmap(net, cmd, argp); + case SIOCBONDENSLAVE: + case SIOCBONDRELEASE: + case SIOCBONDSETHWADDR: + case SIOCBONDSLAVEINFOQUERY: + case SIOCBONDINFOQUERY: + case SIOCBONDCHANGEACTIVE: + return bond_ioctl(net, cmd, argp); + case SIOCADDRT: + case SIOCDELRT: + return routing_ioctl(net, sock, cmd, argp); + case SIOCGSTAMP: + return 
do_siocgstamp(net, sock, cmd, argp); + case SIOCGSTAMPNS: + return do_siocgstampns(net, sock, cmd, argp); + case SIOCSHWTSTAMP: + return compat_siocshwtstamp(net, argp); + + case FIOSETOWN: + case SIOCSPGRP: + case FIOGETOWN: + case SIOCGPGRP: + case SIOCBRADDBR: + case SIOCBRDELBR: + case SIOCGIFVLAN: + case SIOCSIFVLAN: + case SIOCADDDLCI: + case SIOCDELDLCI: + return sock_ioctl(file, cmd, arg); + + case SIOCGIFFLAGS: + case SIOCSIFFLAGS: + case SIOCGIFMETRIC: + case SIOCSIFMETRIC: + case SIOCGIFMTU: + case SIOCSIFMTU: + case SIOCGIFMEM: + case SIOCSIFMEM: + case SIOCGIFHWADDR: + case SIOCSIFHWADDR: + case SIOCADDMULTI: + case SIOCDELMULTI: + case SIOCGIFINDEX: + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCSIFHWBROADCAST: + case SIOCDIFADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCSIFPFLAGS: + case SIOCGIFPFLAGS: + case SIOCGIFTXQLEN: + case SIOCSIFTXQLEN: + case SIOCBRADDIF: + case SIOCBRDELIF: + case SIOCSIFNAME: + case SIOCGMIIPHY: + case SIOCGMIIREG: + case SIOCSMIIREG: + return dev_ifsioc(net, sock, cmd, argp); + + case SIOCSARP: + case SIOCGARP: + case SIOCDARP: + case SIOCATMARK: + return sock_do_ioctl(net, sock, cmd, arg); + } + + /* Prevent warning from compat_sys_ioctl, these always + * result in -EINVAL in the native case anyway. */ + switch (cmd) { + case SIOCRTMSG: + case SIOCGIFCOUNT: + case SIOCSRARP: + case SIOCGRARP: + case SIOCDRARP: + case SIOCSIFLINK: + case SIOCGIFSLAVE: + case SIOCSIFSLAVE: + return -EINVAL; + } + + return -ENOIOCTLCMD; +} + +static long compat_sock_ioctl(struct file *file, unsigned cmd, + unsigned long arg) +{ + struct socket *sock = file->private_data; + int ret = -ENOIOCTLCMD; + struct sock *sk; + struct net *net; + + sk = sock->sk; + net = sock_net(sk); + + if (sock->ops->compat_ioctl) + ret = sock->ops->compat_ioctl(sock, cmd, arg); + + if (ret == -ENOIOCTLCMD && + (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)) + ret = compat_wext_handle_ioctl(net, cmd, arg); + + if (ret == -ENOIOCTLCMD) + ret = compat_sock_ioctl_trans(file, sock, cmd, arg); + + return ret; +} +#endif + +int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen) +{ + return sock->ops->bind(sock, addr, addrlen); +} +EXPORT_SYMBOL(kernel_bind); + +int kernel_listen(struct socket *sock, int backlog) +{ + return sock->ops->listen(sock, backlog); +} +EXPORT_SYMBOL(kernel_listen); + +int kernel_accept(struct socket *sock, struct socket **newsock, int flags) +{ + struct sock *sk = sock->sk; + int err; + + err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, + newsock); + if (err < 0) + goto done; + + err = sock->ops->accept(sock, *newsock, flags); + if (err < 0) { + sock_release(*newsock); + *newsock = NULL; + goto done; + } + + (*newsock)->ops = sock->ops; + __module_get((*newsock)->ops->owner); + +done: + return err; +} +EXPORT_SYMBOL(kernel_accept); + +int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, + int flags) +{ + return sock->ops->connect(sock, addr, addrlen, flags); +} +EXPORT_SYMBOL(kernel_connect); + +int kernel_getsockname(struct socket *sock, struct sockaddr *addr, + int *addrlen) +{ + return sock->ops->getname(sock, addr, addrlen, 0); +} +EXPORT_SYMBOL(kernel_getsockname); + +int kernel_getpeername(struct socket *sock, struct sockaddr *addr, + int *addrlen) +{ + return sock->ops->getname(sock, addr, addrlen, 1); +} +EXPORT_SYMBOL(kernel_getpeername); + +int kernel_getsockopt(struct socket *sock, int level, 
int optname, + char *optval, int *optlen) +{ + mm_segment_t oldfs = get_fs(); + char __user *uoptval; + int __user *uoptlen; + int err; + + uoptval = (char __user __force *) optval; + uoptlen = (int __user __force *) optlen; + + set_fs(KERNEL_DS); + if (level == SOL_SOCKET) + err = sock_getsockopt(sock, level, optname, uoptval, uoptlen); + else + err = sock->ops->getsockopt(sock, level, optname, uoptval, + uoptlen); + set_fs(oldfs); + return err; +} +EXPORT_SYMBOL(kernel_getsockopt); + +int kernel_setsockopt(struct socket *sock, int level, int optname, + char *optval, unsigned int optlen) +{ + mm_segment_t oldfs = get_fs(); + char __user *uoptval; + int err; + + uoptval = (char __user __force *) optval; + + set_fs(KERNEL_DS); + if (level == SOL_SOCKET) + err = sock_setsockopt(sock, level, optname, uoptval, optlen); + else + err = sock->ops->setsockopt(sock, level, optname, uoptval, + optlen); + set_fs(oldfs); + return err; +} +EXPORT_SYMBOL(kernel_setsockopt); + +int kernel_sendpage(struct socket *sock, struct page *page, int offset, + size_t size, int flags) +{ + sock_update_classid(sock->sk); + + if (sock->ops->sendpage) + return sock->ops->sendpage(sock, page, offset, size, flags); + + return sock_no_sendpage(sock, page, offset, size, flags); +} +EXPORT_SYMBOL(kernel_sendpage); + +int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg) +{ + mm_segment_t oldfs = get_fs(); + int err; + + set_fs(KERNEL_DS); + err = sock->ops->ioctl(sock, cmd, arg); + set_fs(oldfs); + + return err; +} +EXPORT_SYMBOL(kernel_sock_ioctl); + +int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how) +{ + return sock->ops->shutdown(sock, how); +} +EXPORT_SYMBOL(kernel_sock_shutdown); diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig new file mode 100644 index 00000000..b2198e65 --- /dev/null +++ b/net/sunrpc/Kconfig @@ -0,0 +1,37 @@ +config SUNRPC + tristate + +config SUNRPC_GSS + tristate + +config SUNRPC_XPRT_RDMA + tristate + depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS && EXPERIMENTAL + default SUNRPC && INFINIBAND + help + This option allows the NFS client and server to support + an RDMA-enabled transport. + + To compile RPC client RDMA transport support as a module, + choose M here: the module will be called xprtrdma. + + If unsure, say N. + +config RPCSEC_GSS_KRB5 + tristate "Secure RPC: Kerberos V mechanism" + depends on SUNRPC && CRYPTO + depends on CRYPTO_MD5 && CRYPTO_DES && CRYPTO_CBC && CRYPTO_CTS + depends on CRYPTO_ECB && CRYPTO_HMAC && CRYPTO_SHA1 && CRYPTO_AES + depends on CRYPTO_ARC4 + default y + select SUNRPC_GSS + help + Choose Y here to enable Secure RPC using the Kerberos version 5 + GSS-API mechanism (RFC 1964). + + Secure RPC calls with Kerberos require an auxiliary user-space + daemon which may be found in the Linux nfs-utils package + available from http://linux-nfs.org/. In addition, user-space + Kerberos support should be installed. + + If unsure, say Y. 
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile new file mode 100644 index 00000000..9d2fca5a --- /dev/null +++ b/net/sunrpc/Makefile @@ -0,0 +1,18 @@ +# +# Makefile for Linux kernel SUN RPC +# + + +obj-$(CONFIG_SUNRPC) += sunrpc.o +obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ +obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/ + +sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ + auth.o auth_null.o auth_unix.o auth_generic.o \ + svc.o svcsock.o svcauth.o svcauth_unix.o \ + addr.o rpcb_clnt.o timer.o xdr.o \ + sunrpc_syms.o cache.o rpc_pipe.o \ + svc_xprt.o +sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o bc_svc.o +sunrpc-$(CONFIG_PROC_FS) += stats.o +sunrpc-$(CONFIG_SYSCTL) += sysctl.o diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c new file mode 100644 index 00000000..4195233c --- /dev/null +++ b/net/sunrpc/addr.c @@ -0,0 +1,355 @@ +/* + * Copyright 2009, Oracle. All rights reserved. + * + * Convert socket addresses to presentation addresses and universal + * addresses, and vice versa. + * + * Universal addresses are introduced by RFC 1833 and further refined by + * recent RFCs describing NFSv4. The universal address format is part + * of the external (network) interface provided by rpcbind version 3 + * and 4, and by NFSv4. Such an address is a string containing a + * presentation format IP address followed by a port number in + * "hibyte.lobyte" format. + * + * IPv6 addresses can also include a scope ID, typically denoted by + * a '%' followed by a device name or a non-negative integer. Refer to + * RFC 4291, Section 2.2 for details on IPv6 presentation formats. + */ + +#include +#include +#include + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + +static size_t rpc_ntop6_noscopeid(const struct sockaddr *sap, + char *buf, const int buflen) +{ + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + const struct in6_addr *addr = &sin6->sin6_addr; + + /* + * RFC 4291, Section 2.2.2 + * + * Shorthanded ANY address + */ + if (ipv6_addr_any(addr)) + return snprintf(buf, buflen, "::"); + + /* + * RFC 4291, Section 2.2.2 + * + * Shorthanded loopback address + */ + if (ipv6_addr_loopback(addr)) + return snprintf(buf, buflen, "::1"); + + /* + * RFC 4291, Section 2.2.3 + * + * Special presentation address format for mapped v4 + * addresses. 
+ */ + if (ipv6_addr_v4mapped(addr)) + return snprintf(buf, buflen, "::ffff:%pI4", + &addr->s6_addr32[3]); + + /* + * RFC 4291, Section 2.2.1 + */ + return snprintf(buf, buflen, "%pI6c", addr); +} + +static size_t rpc_ntop6(const struct sockaddr *sap, + char *buf, const size_t buflen) +{ + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + char scopebuf[IPV6_SCOPE_ID_LEN]; + size_t len; + int rc; + + len = rpc_ntop6_noscopeid(sap, buf, buflen); + if (unlikely(len == 0)) + return len; + + if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) + return len; + if (sin6->sin6_scope_id == 0) + return len; + + rc = snprintf(scopebuf, sizeof(scopebuf), "%c%u", + IPV6_SCOPE_DELIMITER, sin6->sin6_scope_id); + if (unlikely((size_t)rc > sizeof(scopebuf))) + return 0; + + len += rc; + if (unlikely(len > buflen)) + return 0; + + strcat(buf, scopebuf); + return len; +} + +#else /* !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) */ + +static size_t rpc_ntop6_noscopeid(const struct sockaddr *sap, + char *buf, const int buflen) +{ + return 0; +} + +static size_t rpc_ntop6(const struct sockaddr *sap, + char *buf, const size_t buflen) +{ + return 0; +} + +#endif /* !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) */ + +static int rpc_ntop4(const struct sockaddr *sap, + char *buf, const size_t buflen) +{ + const struct sockaddr_in *sin = (struct sockaddr_in *)sap; + + return snprintf(buf, buflen, "%pI4", &sin->sin_addr); +} + +/** + * rpc_ntop - construct a presentation address in @buf + * @sap: socket address + * @buf: construction area + * @buflen: size of @buf, in bytes + * + * Plants a %NUL-terminated string in @buf and returns the length + * of the string, excluding the %NUL. Otherwise zero is returned. + */ +size_t rpc_ntop(const struct sockaddr *sap, char *buf, const size_t buflen) +{ + switch (sap->sa_family) { + case AF_INET: + return rpc_ntop4(sap, buf, buflen); + case AF_INET6: + return rpc_ntop6(sap, buf, buflen); + } + + return 0; +} +EXPORT_SYMBOL_GPL(rpc_ntop); + +static size_t rpc_pton4(const char *buf, const size_t buflen, + struct sockaddr *sap, const size_t salen) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)sap; + u8 *addr = (u8 *)&sin->sin_addr.s_addr; + + if (buflen > INET_ADDRSTRLEN || salen < sizeof(struct sockaddr_in)) + return 0; + + memset(sap, 0, sizeof(struct sockaddr_in)); + + if (in4_pton(buf, buflen, addr, '\0', NULL) == 0) + return 0; + + sin->sin_family = AF_INET; + return sizeof(struct sockaddr_in); +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static int rpc_parse_scope_id(const char *buf, const size_t buflen, + const char *delim, struct sockaddr_in6 *sin6) +{ + char *p; + size_t len; + + if ((buf + buflen) == delim) + return 1; + + if (*delim != IPV6_SCOPE_DELIMITER) + return 0; + + if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) + return 0; + + len = (buf + buflen) - delim - 1; + p = kstrndup(delim + 1, len, GFP_KERNEL); + if (p) { + unsigned long scope_id = 0; + struct net_device *dev; + + dev = dev_get_by_name(&init_net, p); + if (dev != NULL) { + scope_id = dev->ifindex; + dev_put(dev); + } else { + if (strict_strtoul(p, 10, &scope_id) == 0) { + kfree(p); + return 0; + } + } + + kfree(p); + + sin6->sin6_scope_id = scope_id; + return 1; + } + + return 0; +} + +static size_t rpc_pton6(const char *buf, const size_t buflen, + struct sockaddr *sap, const size_t salen) +{ + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + u8 *addr = (u8 *)&sin6->sin6_addr.in6_u; + const char *delim; + + if 
(buflen > (INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN) || + salen < sizeof(struct sockaddr_in6)) + return 0; + + memset(sap, 0, sizeof(struct sockaddr_in6)); + + if (in6_pton(buf, buflen, addr, IPV6_SCOPE_DELIMITER, &delim) == 0) + return 0; + + if (!rpc_parse_scope_id(buf, buflen, delim, sin6)) + return 0; + + sin6->sin6_family = AF_INET6; + return sizeof(struct sockaddr_in6); +} +#else +static size_t rpc_pton6(const char *buf, const size_t buflen, + struct sockaddr *sap, const size_t salen) +{ + return 0; +} +#endif + +/** + * rpc_pton - Construct a sockaddr in @sap + * @buf: C string containing presentation format IP address + * @buflen: length of presentation address in bytes + * @sap: buffer into which to plant socket address + * @salen: size of buffer in bytes + * + * Returns the size of the socket address if successful; otherwise + * zero is returned. + * + * Plants a socket address in @sap and returns the size of the + * socket address, if successful. Returns zero if an error + * occurred. + */ +size_t rpc_pton(const char *buf, const size_t buflen, + struct sockaddr *sap, const size_t salen) +{ + unsigned int i; + + for (i = 0; i < buflen; i++) + if (buf[i] == ':') + return rpc_pton6(buf, buflen, sap, salen); + return rpc_pton4(buf, buflen, sap, salen); +} +EXPORT_SYMBOL_GPL(rpc_pton); + +/** + * rpc_sockaddr2uaddr - Construct a universal address string from @sap. + * @sap: socket address + * + * Returns a %NUL-terminated string in dynamically allocated memory; + * otherwise NULL is returned if an error occurred. Caller must + * free the returned string. + */ +char *rpc_sockaddr2uaddr(const struct sockaddr *sap) +{ + char portbuf[RPCBIND_MAXUADDRPLEN]; + char addrbuf[RPCBIND_MAXUADDRLEN]; + unsigned short port; + + switch (sap->sa_family) { + case AF_INET: + if (rpc_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0) + return NULL; + port = ntohs(((struct sockaddr_in *)sap)->sin_port); + break; + case AF_INET6: + if (rpc_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0) + return NULL; + port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port); + break; + default: + return NULL; + } + + if (snprintf(portbuf, sizeof(portbuf), + ".%u.%u", port >> 8, port & 0xff) > (int)sizeof(portbuf)) + return NULL; + + if (strlcat(addrbuf, portbuf, sizeof(addrbuf)) > sizeof(addrbuf)) + return NULL; + + return kstrdup(addrbuf, GFP_KERNEL); +} +EXPORT_SYMBOL_GPL(rpc_sockaddr2uaddr); + +/** + * rpc_uaddr2sockaddr - convert a universal address to a socket address. + * @uaddr: C string containing universal address to convert + * @uaddr_len: length of universal address string + * @sap: buffer into which to plant socket address + * @salen: size of buffer + * + * @uaddr does not have to be '\0'-terminated, but strict_strtoul() and + * rpc_pton() require proper string termination to be successful. + * + * Returns the size of the socket address if successful; otherwise + * zero is returned. 
+ */ +size_t rpc_uaddr2sockaddr(const char *uaddr, const size_t uaddr_len, + struct sockaddr *sap, const size_t salen) +{ + char *c, buf[RPCBIND_MAXUADDRLEN + sizeof('\0')]; + unsigned long portlo, porthi; + unsigned short port; + + if (uaddr_len > RPCBIND_MAXUADDRLEN) + return 0; + + memcpy(buf, uaddr, uaddr_len); + + buf[uaddr_len] = '\0'; + c = strrchr(buf, '.'); + if (unlikely(c == NULL)) + return 0; + if (unlikely(strict_strtoul(c + 1, 10, &portlo) != 0)) + return 0; + if (unlikely(portlo > 255)) + return 0; + + *c = '\0'; + c = strrchr(buf, '.'); + if (unlikely(c == NULL)) + return 0; + if (unlikely(strict_strtoul(c + 1, 10, &porthi) != 0)) + return 0; + if (unlikely(porthi > 255)) + return 0; + + port = (unsigned short)((porthi << 8) | portlo); + + *c = '\0'; + if (rpc_pton(buf, strlen(buf), sap, salen) == 0) + return 0; + + switch (sap->sa_family) { + case AF_INET: + ((struct sockaddr_in *)sap)->sin_port = htons(port); + return sizeof(struct sockaddr_in); + case AF_INET6: + ((struct sockaddr_in6 *)sap)->sin6_port = htons(port); + return sizeof(struct sockaddr_in6); + } + + return 0; +} +EXPORT_SYMBOL_GPL(rpc_uaddr2sockaddr); diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c new file mode 100644 index 00000000..cd6e4aa1 --- /dev/null +++ b/net/sunrpc/auth.c @@ -0,0 +1,688 @@ +/* + * linux/net/sunrpc/auth.c + * + * Generic RPC client authentication API. + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +#define RPC_CREDCACHE_DEFAULT_HASHBITS (4) +struct rpc_cred_cache { + struct hlist_head *hashtable; + unsigned int hashbits; + spinlock_t lock; +}; + +static unsigned int auth_hashbits = RPC_CREDCACHE_DEFAULT_HASHBITS; + +static DEFINE_SPINLOCK(rpc_authflavor_lock); +static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = { + &authnull_ops, /* AUTH_NULL */ + &authunix_ops, /* AUTH_UNIX */ + NULL, /* others can be loadable modules */ +}; + +static LIST_HEAD(cred_unused); +static unsigned long number_cred_unused; + +#define MAX_HASHTABLE_BITS (14) +static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp) +{ + unsigned long num; + unsigned int nbits; + int ret; + + if (!val) + goto out_inval; + ret = strict_strtoul(val, 0, &num); + if (ret == -EINVAL) + goto out_inval; + nbits = fls(num); + if (num > (1U << nbits)) + nbits++; + if (nbits > MAX_HASHTABLE_BITS || nbits < 2) + goto out_inval; + *(unsigned int *)kp->arg = nbits; + return 0; +out_inval: + return -EINVAL; +} + +static int param_get_hashtbl_sz(char *buffer, const struct kernel_param *kp) +{ + unsigned int nbits; + + nbits = *(unsigned int *)kp->arg; + return sprintf(buffer, "%u", 1U << nbits); +} + +#define param_check_hashtbl_sz(name, p) __param_check(name, p, unsigned int); + +static struct kernel_param_ops param_ops_hashtbl_sz = { + .set = param_set_hashtbl_sz, + .get = param_get_hashtbl_sz, +}; + +module_param_named(auth_hashtable_size, auth_hashbits, hashtbl_sz, 0644); +MODULE_PARM_DESC(auth_hashtable_size, "RPC credential cache hashtable size"); + +static u32 +pseudoflavor_to_flavor(u32 flavor) { + if (flavor >= RPC_AUTH_MAXFLAVOR) + return RPC_AUTH_GSS; + return flavor; +} + +int +rpcauth_register(const struct rpc_authops *ops) +{ + rpc_authflavor_t flavor; + int ret = -EPERM; + + if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) + return -EINVAL; + spin_lock(&rpc_authflavor_lock); + if (auth_flavors[flavor] == NULL) { + auth_flavors[flavor] 
= ops; + ret = 0; + } + spin_unlock(&rpc_authflavor_lock); + return ret; +} +EXPORT_SYMBOL_GPL(rpcauth_register); + +int +rpcauth_unregister(const struct rpc_authops *ops) +{ + rpc_authflavor_t flavor; + int ret = -EPERM; + + if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) + return -EINVAL; + spin_lock(&rpc_authflavor_lock); + if (auth_flavors[flavor] == ops) { + auth_flavors[flavor] = NULL; + ret = 0; + } + spin_unlock(&rpc_authflavor_lock); + return ret; +} +EXPORT_SYMBOL_GPL(rpcauth_unregister); + +struct rpc_auth * +rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt) +{ + struct rpc_auth *auth; + const struct rpc_authops *ops; + u32 flavor = pseudoflavor_to_flavor(pseudoflavor); + + auth = ERR_PTR(-EINVAL); + if (flavor >= RPC_AUTH_MAXFLAVOR) + goto out; + + if ((ops = auth_flavors[flavor]) == NULL) + request_module("rpc-auth-%u", flavor); + spin_lock(&rpc_authflavor_lock); + ops = auth_flavors[flavor]; + if (ops == NULL || !try_module_get(ops->owner)) { + spin_unlock(&rpc_authflavor_lock); + goto out; + } + spin_unlock(&rpc_authflavor_lock); + auth = ops->create(clnt, pseudoflavor); + module_put(ops->owner); + if (IS_ERR(auth)) + return auth; + if (clnt->cl_auth) + rpcauth_release(clnt->cl_auth); + clnt->cl_auth = auth; + +out: + return auth; +} +EXPORT_SYMBOL_GPL(rpcauth_create); + +void +rpcauth_release(struct rpc_auth *auth) +{ + if (!atomic_dec_and_test(&auth->au_count)) + return; + auth->au_ops->destroy(auth); +} + +static DEFINE_SPINLOCK(rpc_credcache_lock); + +static void +rpcauth_unhash_cred_locked(struct rpc_cred *cred) +{ + hlist_del_rcu(&cred->cr_hash); + smp_mb__before_clear_bit(); + clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags); +} + +static int +rpcauth_unhash_cred(struct rpc_cred *cred) +{ + spinlock_t *cache_lock; + int ret; + + cache_lock = &cred->cr_auth->au_credcache->lock; + spin_lock(cache_lock); + ret = atomic_read(&cred->cr_count) == 0; + if (ret) + rpcauth_unhash_cred_locked(cred); + spin_unlock(cache_lock); + return ret; +} + +/* + * Initialize RPC credential cache + */ +int +rpcauth_init_credcache(struct rpc_auth *auth) +{ + struct rpc_cred_cache *new; + unsigned int hashsize; + + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + goto out_nocache; + new->hashbits = auth_hashbits; + hashsize = 1U << new->hashbits; + new->hashtable = kcalloc(hashsize, sizeof(new->hashtable[0]), GFP_KERNEL); + if (!new->hashtable) + goto out_nohashtbl; + spin_lock_init(&new->lock); + auth->au_credcache = new; + return 0; +out_nohashtbl: + kfree(new); +out_nocache: + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(rpcauth_init_credcache); + +/* + * Destroy a list of credentials + */ +static inline +void rpcauth_destroy_credlist(struct list_head *head) +{ + struct rpc_cred *cred; + + while (!list_empty(head)) { + cred = list_entry(head->next, struct rpc_cred, cr_lru); + list_del_init(&cred->cr_lru); + put_rpccred(cred); + } +} + +/* + * Clear the RPC credential cache, and delete those credentials + * that are not referenced. 
+ */ +void +rpcauth_clear_credcache(struct rpc_cred_cache *cache) +{ + LIST_HEAD(free); + struct hlist_head *head; + struct rpc_cred *cred; + unsigned int hashsize = 1U << cache->hashbits; + int i; + + spin_lock(&rpc_credcache_lock); + spin_lock(&cache->lock); + for (i = 0; i < hashsize; i++) { + head = &cache->hashtable[i]; + while (!hlist_empty(head)) { + cred = hlist_entry(head->first, struct rpc_cred, cr_hash); + get_rpccred(cred); + if (!list_empty(&cred->cr_lru)) { + list_del(&cred->cr_lru); + number_cred_unused--; + } + list_add_tail(&cred->cr_lru, &free); + rpcauth_unhash_cred_locked(cred); + } + } + spin_unlock(&cache->lock); + spin_unlock(&rpc_credcache_lock); + rpcauth_destroy_credlist(&free); +} + +/* + * Destroy the RPC credential cache + */ +void +rpcauth_destroy_credcache(struct rpc_auth *auth) +{ + struct rpc_cred_cache *cache = auth->au_credcache; + + if (cache) { + auth->au_credcache = NULL; + rpcauth_clear_credcache(cache); + kfree(cache->hashtable); + kfree(cache); + } +} +EXPORT_SYMBOL_GPL(rpcauth_destroy_credcache); + + +#define RPC_AUTH_EXPIRY_MORATORIUM (60 * HZ) + +/* + * Remove stale credentials. Avoid sleeping inside the loop. + */ +static int +rpcauth_prune_expired(struct list_head *free, int nr_to_scan) +{ + spinlock_t *cache_lock; + struct rpc_cred *cred, *next; + unsigned long expired = jiffies - RPC_AUTH_EXPIRY_MORATORIUM; + + list_for_each_entry_safe(cred, next, &cred_unused, cr_lru) { + + if (nr_to_scan-- == 0) + break; + /* + * Enforce a 60 second garbage collection moratorium + * Note that the cred_unused list must be time-ordered. + */ + if (time_in_range(cred->cr_expire, expired, jiffies) && + test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) + return 0; + + list_del_init(&cred->cr_lru); + number_cred_unused--; + if (atomic_read(&cred->cr_count) != 0) + continue; + + cache_lock = &cred->cr_auth->au_credcache->lock; + spin_lock(cache_lock); + if (atomic_read(&cred->cr_count) == 0) { + get_rpccred(cred); + list_add_tail(&cred->cr_lru, free); + rpcauth_unhash_cred_locked(cred); + } + spin_unlock(cache_lock); + } + return (number_cred_unused / 100) * sysctl_vfs_cache_pressure; +} + +/* + * Run memory cache shrinker. + */ +static int +rpcauth_cache_shrinker(struct shrinker *shrink, struct shrink_control *sc) +{ + LIST_HEAD(free); + int res; + int nr_to_scan = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; + + if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) + return (nr_to_scan == 0) ? 
0 : -1; + if (list_empty(&cred_unused)) + return 0; + spin_lock(&rpc_credcache_lock); + res = rpcauth_prune_expired(&free, nr_to_scan); + spin_unlock(&rpc_credcache_lock); + rpcauth_destroy_credlist(&free); + return res; +} + +/* + * Look up a process' credentials in the authentication cache + */ +struct rpc_cred * +rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, + int flags) +{ + LIST_HEAD(free); + struct rpc_cred_cache *cache = auth->au_credcache; + struct hlist_node *pos; + struct rpc_cred *cred = NULL, + *entry, *new; + unsigned int nr; + + nr = hash_long(acred->uid, cache->hashbits); + + rcu_read_lock(); + hlist_for_each_entry_rcu(entry, pos, &cache->hashtable[nr], cr_hash) { + if (!entry->cr_ops->crmatch(acred, entry, flags)) + continue; + spin_lock(&cache->lock); + if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) == 0) { + spin_unlock(&cache->lock); + continue; + } + cred = get_rpccred(entry); + spin_unlock(&cache->lock); + break; + } + rcu_read_unlock(); + + if (cred != NULL) + goto found; + + new = auth->au_ops->crcreate(auth, acred, flags); + if (IS_ERR(new)) { + cred = new; + goto out; + } + + spin_lock(&cache->lock); + hlist_for_each_entry(entry, pos, &cache->hashtable[nr], cr_hash) { + if (!entry->cr_ops->crmatch(acred, entry, flags)) + continue; + cred = get_rpccred(entry); + break; + } + if (cred == NULL) { + cred = new; + set_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags); + hlist_add_head_rcu(&cred->cr_hash, &cache->hashtable[nr]); + } else + list_add_tail(&new->cr_lru, &free); + spin_unlock(&cache->lock); +found: + if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) && + cred->cr_ops->cr_init != NULL && + !(flags & RPCAUTH_LOOKUP_NEW)) { + int res = cred->cr_ops->cr_init(auth, cred); + if (res < 0) { + put_rpccred(cred); + cred = ERR_PTR(res); + } + } + rpcauth_destroy_credlist(&free); +out: + return cred; +} +EXPORT_SYMBOL_GPL(rpcauth_lookup_credcache); + +struct rpc_cred * +rpcauth_lookupcred(struct rpc_auth *auth, int flags) +{ + struct auth_cred acred; + struct rpc_cred *ret; + const struct cred *cred = current_cred(); + + dprintk("RPC: looking up %s cred\n", + auth->au_ops->au_name); + + memset(&acred, 0, sizeof(acred)); + acred.uid = cred->fsuid; + acred.gid = cred->fsgid; + acred.group_info = get_group_info(((struct cred *)cred)->group_info); + + ret = auth->au_ops->lookup_cred(auth, &acred, flags); + put_group_info(acred.group_info); + return ret; +} + +void +rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred, + struct rpc_auth *auth, const struct rpc_credops *ops) +{ + INIT_HLIST_NODE(&cred->cr_hash); + INIT_LIST_HEAD(&cred->cr_lru); + atomic_set(&cred->cr_count, 1); + cred->cr_auth = auth; + cred->cr_ops = ops; + cred->cr_expire = jiffies; +#ifdef RPC_DEBUG + cred->cr_magic = RPCAUTH_CRED_MAGIC; +#endif + cred->cr_uid = acred->uid; +} +EXPORT_SYMBOL_GPL(rpcauth_init_cred); + +struct rpc_cred * +rpcauth_generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred, int lookupflags) +{ + dprintk("RPC: %5u holding %s cred %p\n", task->tk_pid, + cred->cr_auth->au_ops->au_name, cred); + return get_rpccred(cred); +} +EXPORT_SYMBOL_GPL(rpcauth_generic_bind_cred); + +static struct rpc_cred * +rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags) +{ + struct rpc_auth *auth = task->tk_client->cl_auth; + struct auth_cred acred = { + .uid = 0, + .gid = 0, + }; + + dprintk("RPC: %5u looking up %s cred\n", + task->tk_pid, task->tk_client->cl_auth->au_ops->au_name); + return auth->au_ops->lookup_cred(auth, &acred, lookupflags); 
+} + +static struct rpc_cred * +rpcauth_bind_new_cred(struct rpc_task *task, int lookupflags) +{ + struct rpc_auth *auth = task->tk_client->cl_auth; + + dprintk("RPC: %5u looking up %s cred\n", + task->tk_pid, auth->au_ops->au_name); + return rpcauth_lookupcred(auth, lookupflags); +} + +static int +rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_cred *new; + int lookupflags = 0; + + if (flags & RPC_TASK_ASYNC) + lookupflags |= RPCAUTH_LOOKUP_NEW; + if (cred != NULL) + new = cred->cr_ops->crbind(task, cred, lookupflags); + else if (flags & RPC_TASK_ROOTCREDS) + new = rpcauth_bind_root_cred(task, lookupflags); + else + new = rpcauth_bind_new_cred(task, lookupflags); + if (IS_ERR(new)) + return PTR_ERR(new); + if (req->rq_cred != NULL) + put_rpccred(req->rq_cred); + req->rq_cred = new; + return 0; +} + +void +put_rpccred(struct rpc_cred *cred) +{ + /* Fast path for unhashed credentials */ + if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) == 0) { + if (atomic_dec_and_test(&cred->cr_count)) + cred->cr_ops->crdestroy(cred); + return; + } + + if (!atomic_dec_and_lock(&cred->cr_count, &rpc_credcache_lock)) + return; + if (!list_empty(&cred->cr_lru)) { + number_cred_unused--; + list_del_init(&cred->cr_lru); + } + if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) { + if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) { + cred->cr_expire = jiffies; + list_add_tail(&cred->cr_lru, &cred_unused); + number_cred_unused++; + goto out_nodestroy; + } + if (!rpcauth_unhash_cred(cred)) { + /* We were hashed and someone looked us up... */ + goto out_nodestroy; + } + } + spin_unlock(&rpc_credcache_lock); + cred->cr_ops->crdestroy(cred); + return; +out_nodestroy: + spin_unlock(&rpc_credcache_lock); +} +EXPORT_SYMBOL_GPL(put_rpccred); + +__be32 * +rpcauth_marshcred(struct rpc_task *task, __be32 *p) +{ + struct rpc_cred *cred = task->tk_rqstp->rq_cred; + + dprintk("RPC: %5u marshaling %s cred %p\n", + task->tk_pid, cred->cr_auth->au_ops->au_name, cred); + + return cred->cr_ops->crmarshal(task, p); +} + +__be32 * +rpcauth_checkverf(struct rpc_task *task, __be32 *p) +{ + struct rpc_cred *cred = task->tk_rqstp->rq_cred; + + dprintk("RPC: %5u validating %s cred %p\n", + task->tk_pid, cred->cr_auth->au_ops->au_name, cred); + + return cred->cr_ops->crvalidate(task, p); +} + +static void rpcauth_wrap_req_encode(kxdreproc_t encode, struct rpc_rqst *rqstp, + __be32 *data, void *obj) +{ + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &rqstp->rq_snd_buf, data); + encode(rqstp, &xdr, obj); +} + +int +rpcauth_wrap_req(struct rpc_task *task, kxdreproc_t encode, void *rqstp, + __be32 *data, void *obj) +{ + struct rpc_cred *cred = task->tk_rqstp->rq_cred; + + dprintk("RPC: %5u using %s cred %p to wrap rpc data\n", + task->tk_pid, cred->cr_ops->cr_name, cred); + if (cred->cr_ops->crwrap_req) + return cred->cr_ops->crwrap_req(task, encode, rqstp, data, obj); + /* By default, we encode the arguments normally. 
*/ + rpcauth_wrap_req_encode(encode, rqstp, data, obj); + return 0; +} + +static int +rpcauth_unwrap_req_decode(kxdrdproc_t decode, struct rpc_rqst *rqstp, + __be32 *data, void *obj) +{ + struct xdr_stream xdr; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, data); + return decode(rqstp, &xdr, obj); +} + +int +rpcauth_unwrap_resp(struct rpc_task *task, kxdrdproc_t decode, void *rqstp, + __be32 *data, void *obj) +{ + struct rpc_cred *cred = task->tk_rqstp->rq_cred; + + dprintk("RPC: %5u using %s cred %p to unwrap rpc data\n", + task->tk_pid, cred->cr_ops->cr_name, cred); + if (cred->cr_ops->crunwrap_resp) + return cred->cr_ops->crunwrap_resp(task, decode, rqstp, + data, obj); + /* By default, we decode the arguments normally. */ + return rpcauth_unwrap_req_decode(decode, rqstp, data, obj); +} + +int +rpcauth_refreshcred(struct rpc_task *task) +{ + struct rpc_cred *cred; + int err; + + cred = task->tk_rqstp->rq_cred; + if (cred == NULL) { + err = rpcauth_bindcred(task, task->tk_msg.rpc_cred, task->tk_flags); + if (err < 0) + goto out; + cred = task->tk_rqstp->rq_cred; + }; + dprintk("RPC: %5u refreshing %s cred %p\n", + task->tk_pid, cred->cr_auth->au_ops->au_name, cred); + + err = cred->cr_ops->crrefresh(task); +out: + if (err < 0) + task->tk_status = err; + return err; +} + +void +rpcauth_invalcred(struct rpc_task *task) +{ + struct rpc_cred *cred = task->tk_rqstp->rq_cred; + + dprintk("RPC: %5u invalidating %s cred %p\n", + task->tk_pid, cred->cr_auth->au_ops->au_name, cred); + if (cred) + clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); +} + +int +rpcauth_uptodatecred(struct rpc_task *task) +{ + struct rpc_cred *cred = task->tk_rqstp->rq_cred; + + return cred == NULL || + test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0; +} + +static struct shrinker rpc_cred_shrinker = { + .shrink = rpcauth_cache_shrinker, + .seeks = DEFAULT_SEEKS, +}; + +int __init rpcauth_init_module(void) +{ + int err; + + err = rpc_init_authunix(); + if (err < 0) + goto out1; + err = rpc_init_generic_auth(); + if (err < 0) + goto out2; + register_shrinker(&rpc_cred_shrinker); + return 0; +out2: + rpc_destroy_authunix(); +out1: + return err; +} + +void rpcauth_remove_module(void) +{ + rpc_destroy_authunix(); + rpc_destroy_generic_auth(); + unregister_shrinker(&rpc_cred_shrinker); +} diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c new file mode 100644 index 00000000..e010a015 --- /dev/null +++ b/net/sunrpc/auth_generic.c @@ -0,0 +1,183 @@ +/* + * Generic RPC credential + * + * Copyright (C) 2008, Trond Myklebust + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +#define RPC_MACHINE_CRED_USERID ((uid_t)0) +#define RPC_MACHINE_CRED_GROUPID ((gid_t)0) + +struct generic_cred { + struct rpc_cred gc_base; + struct auth_cred acred; +}; + +static struct rpc_auth generic_auth; +static const struct rpc_credops generic_credops; + +/* + * Public call interface + */ +struct rpc_cred *rpc_lookup_cred(void) +{ + return rpcauth_lookupcred(&generic_auth, 0); +} +EXPORT_SYMBOL_GPL(rpc_lookup_cred); + +/* + * Public call interface for looking up machine creds. 
+ */ +struct rpc_cred *rpc_lookup_machine_cred(void) +{ + struct auth_cred acred = { + .uid = RPC_MACHINE_CRED_USERID, + .gid = RPC_MACHINE_CRED_GROUPID, + .machine_cred = 1, + }; + + dprintk("RPC: looking up machine cred\n"); + return generic_auth.au_ops->lookup_cred(&generic_auth, &acred, 0); +} +EXPORT_SYMBOL_GPL(rpc_lookup_machine_cred); + +static struct rpc_cred *generic_bind_cred(struct rpc_task *task, + struct rpc_cred *cred, int lookupflags) +{ + struct rpc_auth *auth = task->tk_client->cl_auth; + struct auth_cred *acred = &container_of(cred, struct generic_cred, gc_base)->acred; + + return auth->au_ops->lookup_cred(auth, acred, lookupflags); +} + +/* + * Lookup generic creds for current process + */ +static struct rpc_cred * +generic_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +{ + return rpcauth_lookup_credcache(&generic_auth, acred, flags); +} + +static struct rpc_cred * +generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +{ + struct generic_cred *gcred; + + gcred = kmalloc(sizeof(*gcred), GFP_KERNEL); + if (gcred == NULL) + return ERR_PTR(-ENOMEM); + + rpcauth_init_cred(&gcred->gc_base, acred, &generic_auth, &generic_credops); + gcred->gc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE; + + gcred->acred.uid = acred->uid; + gcred->acred.gid = acred->gid; + gcred->acred.group_info = acred->group_info; + if (gcred->acred.group_info != NULL) + get_group_info(gcred->acred.group_info); + gcred->acred.machine_cred = acred->machine_cred; + + dprintk("RPC: allocated %s cred %p for uid %d gid %d\n", + gcred->acred.machine_cred ? "machine" : "generic", + gcred, acred->uid, acred->gid); + return &gcred->gc_base; +} + +static void +generic_free_cred(struct rpc_cred *cred) +{ + struct generic_cred *gcred = container_of(cred, struct generic_cred, gc_base); + + dprintk("RPC: generic_free_cred %p\n", gcred); + if (gcred->acred.group_info != NULL) + put_group_info(gcred->acred.group_info); + kfree(gcred); +} + +static void +generic_free_cred_callback(struct rcu_head *head) +{ + struct rpc_cred *cred = container_of(head, struct rpc_cred, cr_rcu); + generic_free_cred(cred); +} + +static void +generic_destroy_cred(struct rpc_cred *cred) +{ + call_rcu(&cred->cr_rcu, generic_free_cred_callback); +} + +/* + * Match credentials against current process creds. + */ +static int +generic_match(struct auth_cred *acred, struct rpc_cred *cred, int flags) +{ + struct generic_cred *gcred = container_of(cred, struct generic_cred, gc_base); + int i; + + if (gcred->acred.uid != acred->uid || + gcred->acred.gid != acred->gid || + gcred->acred.machine_cred != acred->machine_cred) + goto out_nomatch; + + /* Optimisation in the case where pointers are identical... */ + if (gcred->acred.group_info == acred->group_info) + goto out_match; + + /* Slow path... 
*/ + if (gcred->acred.group_info->ngroups != acred->group_info->ngroups) + goto out_nomatch; + for (i = 0; i < gcred->acred.group_info->ngroups; i++) { + if (GROUP_AT(gcred->acred.group_info, i) != + GROUP_AT(acred->group_info, i)) + goto out_nomatch; + } +out_match: + return 1; +out_nomatch: + return 0; +} + +int __init rpc_init_generic_auth(void) +{ + return rpcauth_init_credcache(&generic_auth); +} + +void rpc_destroy_generic_auth(void) +{ + rpcauth_destroy_credcache(&generic_auth); +} + +static const struct rpc_authops generic_auth_ops = { + .owner = THIS_MODULE, + .au_name = "Generic", + .lookup_cred = generic_lookup_cred, + .crcreate = generic_create_cred, +}; + +static struct rpc_auth generic_auth = { + .au_ops = &generic_auth_ops, + .au_count = ATOMIC_INIT(0), +}; + +static const struct rpc_credops generic_credops = { + .cr_name = "Generic cred", + .crdestroy = generic_destroy_cred, + .crbind = generic_bind_cred, + .crmatch = generic_match, +}; diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile new file mode 100644 index 00000000..9e4cb59e --- /dev/null +++ b/net/sunrpc/auth_gss/Makefile @@ -0,0 +1,13 @@ +# +# Makefile for Linux kernel rpcsec_gss implementation +# + +obj-$(CONFIG_SUNRPC_GSS) += auth_rpcgss.o + +auth_rpcgss-y := auth_gss.o gss_generic_token.o \ + gss_mech_switch.o svcauth_gss.o + +obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o + +rpcsec_gss_krb5-y := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \ + gss_krb5_seqnum.o gss_krb5_wrap.o gss_krb5_crypto.o gss_krb5_keys.o diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c new file mode 100644 index 00000000..5daf6cc4 --- /dev/null +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -0,0 +1,1646 @@ +/* + * linux/net/sunrpc/auth_gss/auth_gss.c + * + * RPCSEC_GSS client authentication. + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Dug Song + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct rpc_authops authgss_ops; + +static const struct rpc_credops gss_credops; +static const struct rpc_credops gss_nullops; + +#define GSS_RETRY_EXPIRED 5 +static unsigned int gss_expired_cred_retry_delay = GSS_RETRY_EXPIRED; + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +#define GSS_CRED_SLACK (RPC_MAX_AUTH_SIZE * 2) +/* length of a krb5 verifier (48), plus data added before arguments when + * using integrity (two 4-byte integers): */ +#define GSS_VERF_SLACK 100 + +struct gss_auth { + struct kref kref; + struct rpc_auth rpc_auth; + struct gss_api_mech *mech; + enum rpc_gss_svc service; + struct rpc_clnt *client; + /* + * There are two upcall pipes; dentry[1], named "gssd", is used + * for the new text-based upcall; dentry[0] is named after the + * mechanism (for example, "krb5") and exists for + * backwards-compatibility with older gssd's. + */ + struct dentry *dentry[2]; +}; + +/* pipe_version >= 0 if and only if someone has a pipe open. */ +static int pipe_version = -1; +static atomic_t pipe_users = ATOMIC_INIT(0); +static DEFINE_SPINLOCK(pipe_version_lock); +static struct rpc_wait_queue pipe_version_rpc_waitqueue; +static DECLARE_WAIT_QUEUE_HEAD(pipe_version_waitqueue); + +static void gss_free_ctx(struct gss_cl_ctx *); +static const struct rpc_pipe_ops gss_upcall_ops_v0; +static const struct rpc_pipe_ops gss_upcall_ops_v1; + +static inline struct gss_cl_ctx * +gss_get_ctx(struct gss_cl_ctx *ctx) +{ + atomic_inc(&ctx->count); + return ctx; +} + +static inline void +gss_put_ctx(struct gss_cl_ctx *ctx) +{ + if (atomic_dec_and_test(&ctx->count)) + gss_free_ctx(ctx); +} + +/* gss_cred_set_ctx: + * called by gss_upcall_callback and gss_create_upcall in order + * to set the gss context. The actual exchange of an old context + * and a new one is protected by the inode->i_lock. 
+ */ +static void +gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx) +{ + struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); + + if (!test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags)) + return; + gss_get_ctx(ctx); + rcu_assign_pointer(gss_cred->gc_ctx, ctx); + set_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); + smp_mb__before_clear_bit(); + clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags); +} + +static const void * +simple_get_bytes(const void *p, const void *end, void *res, size_t len) +{ + const void *q = (const void *)((const char *)p + len); + if (unlikely(q > end || q < p)) + return ERR_PTR(-EFAULT); + memcpy(res, p, len); + return q; +} + +static inline const void * +simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest) +{ + const void *q; + unsigned int len; + + p = simple_get_bytes(p, end, &len, sizeof(len)); + if (IS_ERR(p)) + return p; + q = (const void *)((const char *)p + len); + if (unlikely(q > end || q < p)) + return ERR_PTR(-EFAULT); + dest->data = kmemdup(p, len, GFP_NOFS); + if (unlikely(dest->data == NULL)) + return ERR_PTR(-ENOMEM); + dest->len = len; + return q; +} + +static struct gss_cl_ctx * +gss_cred_get_ctx(struct rpc_cred *cred) +{ + struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); + struct gss_cl_ctx *ctx = NULL; + + rcu_read_lock(); + if (gss_cred->gc_ctx) + ctx = gss_get_ctx(gss_cred->gc_ctx); + rcu_read_unlock(); + return ctx; +} + +static struct gss_cl_ctx * +gss_alloc_context(void) +{ + struct gss_cl_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_NOFS); + if (ctx != NULL) { + ctx->gc_proc = RPC_GSS_PROC_DATA; + ctx->gc_seq = 1; /* NetApp 6.4R1 doesn't accept seq. no. 0 */ + spin_lock_init(&ctx->gc_seq_lock); + atomic_set(&ctx->count,1); + } + return ctx; +} + +#define GSSD_MIN_TIMEOUT (60 * 60) +static const void * +gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct gss_api_mech *gm) +{ + const void *q; + unsigned int seclen; + unsigned int timeout; + u32 window_size; + int ret; + + /* First unsigned int gives the lifetime (in seconds) of the cred */ + p = simple_get_bytes(p, end, &timeout, sizeof(timeout)); + if (IS_ERR(p)) + goto err; + if (timeout == 0) + timeout = GSSD_MIN_TIMEOUT; + ctx->gc_expiry = jiffies + (unsigned long)timeout * HZ * 3 / 4; + /* Sequence number window. Determines the maximum number of simultaneous requests */ + p = simple_get_bytes(p, end, &window_size, sizeof(window_size)); + if (IS_ERR(p)) + goto err; + ctx->gc_win = window_size; + /* gssd signals an error by passing ctx->gc_win = 0: */ + if (ctx->gc_win == 0) { + /* + * in which case, p points to an error code. Anything other + * than -EKEYEXPIRED gets converted to -EACCES. + */ + p = simple_get_bytes(p, end, &ret, sizeof(ret)); + if (!IS_ERR(p)) + p = (ret == -EKEYEXPIRED) ? 
ERR_PTR(-EKEYEXPIRED) : + ERR_PTR(-EACCES); + goto err; + } + /* copy the opaque wire context */ + p = simple_get_netobj(p, end, &ctx->gc_wire_ctx); + if (IS_ERR(p)) + goto err; + /* import the opaque security context */ + p = simple_get_bytes(p, end, &seclen, sizeof(seclen)); + if (IS_ERR(p)) + goto err; + q = (const void *)((const char *)p + seclen); + if (unlikely(q > end || q < p)) { + p = ERR_PTR(-EFAULT); + goto err; + } + ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, GFP_NOFS); + if (ret < 0) { + p = ERR_PTR(ret); + goto err; + } + return q; +err: + dprintk("RPC: gss_fill_context returning %ld\n", -PTR_ERR(p)); + return p; +} + +#define UPCALL_BUF_LEN 128 + +struct gss_upcall_msg { + atomic_t count; + uid_t uid; + struct rpc_pipe_msg msg; + struct list_head list; + struct gss_auth *auth; + struct rpc_inode *inode; + struct rpc_wait_queue rpc_waitqueue; + wait_queue_head_t waitqueue; + struct gss_cl_ctx *ctx; + char databuf[UPCALL_BUF_LEN]; +}; + +static int get_pipe_version(void) +{ + int ret; + + spin_lock(&pipe_version_lock); + if (pipe_version >= 0) { + atomic_inc(&pipe_users); + ret = pipe_version; + } else + ret = -EAGAIN; + spin_unlock(&pipe_version_lock); + return ret; +} + +static void put_pipe_version(void) +{ + if (atomic_dec_and_lock(&pipe_users, &pipe_version_lock)) { + pipe_version = -1; + spin_unlock(&pipe_version_lock); + } +} + +static void +gss_release_msg(struct gss_upcall_msg *gss_msg) +{ + if (!atomic_dec_and_test(&gss_msg->count)) + return; + put_pipe_version(); + BUG_ON(!list_empty(&gss_msg->list)); + if (gss_msg->ctx != NULL) + gss_put_ctx(gss_msg->ctx); + rpc_destroy_wait_queue(&gss_msg->rpc_waitqueue); + kfree(gss_msg); +} + +static struct gss_upcall_msg * +__gss_find_upcall(struct rpc_inode *rpci, uid_t uid) +{ + struct gss_upcall_msg *pos; + list_for_each_entry(pos, &rpci->in_downcall, list) { + if (pos->uid != uid) + continue; + atomic_inc(&pos->count); + dprintk("RPC: gss_find_upcall found msg %p\n", pos); + return pos; + } + dprintk("RPC: gss_find_upcall found nothing\n"); + return NULL; +} + +/* Try to add an upcall to the pipefs queue. + * If an upcall owned by our uid already exists, then we return a reference + * to that upcall instead of adding the new upcall. 
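+ *
+ * A caller is expected to use this roughly the way gss_setup_upcall()
+ * below does, queueing the upcall only when its own message won:
+ *
+ *	gss_msg = gss_add_msg(gss_new);
+ *	if (gss_msg == gss_new)
+ *		rpc_queue_upcall(inode, &gss_new->msg);
+ *	else
+ *		gss_release_msg(gss_new);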
+ */ +static inline struct gss_upcall_msg * +gss_add_msg(struct gss_upcall_msg *gss_msg) +{ + struct rpc_inode *rpci = gss_msg->inode; + struct inode *inode = &rpci->vfs_inode; + struct gss_upcall_msg *old; + + spin_lock(&inode->i_lock); + old = __gss_find_upcall(rpci, gss_msg->uid); + if (old == NULL) { + atomic_inc(&gss_msg->count); + list_add(&gss_msg->list, &rpci->in_downcall); + } else + gss_msg = old; + spin_unlock(&inode->i_lock); + return gss_msg; +} + +static void +__gss_unhash_msg(struct gss_upcall_msg *gss_msg) +{ + list_del_init(&gss_msg->list); + rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); + wake_up_all(&gss_msg->waitqueue); + atomic_dec(&gss_msg->count); +} + +static void +gss_unhash_msg(struct gss_upcall_msg *gss_msg) +{ + struct inode *inode = &gss_msg->inode->vfs_inode; + + if (list_empty(&gss_msg->list)) + return; + spin_lock(&inode->i_lock); + if (!list_empty(&gss_msg->list)) + __gss_unhash_msg(gss_msg); + spin_unlock(&inode->i_lock); +} + +static void +gss_handle_downcall_result(struct gss_cred *gss_cred, struct gss_upcall_msg *gss_msg) +{ + switch (gss_msg->msg.errno) { + case 0: + if (gss_msg->ctx == NULL) + break; + clear_bit(RPCAUTH_CRED_NEGATIVE, &gss_cred->gc_base.cr_flags); + gss_cred_set_ctx(&gss_cred->gc_base, gss_msg->ctx); + break; + case -EKEYEXPIRED: + set_bit(RPCAUTH_CRED_NEGATIVE, &gss_cred->gc_base.cr_flags); + } + gss_cred->gc_upcall_timestamp = jiffies; + gss_cred->gc_upcall = NULL; + rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); +} + +static void +gss_upcall_callback(struct rpc_task *task) +{ + struct gss_cred *gss_cred = container_of(task->tk_rqstp->rq_cred, + struct gss_cred, gc_base); + struct gss_upcall_msg *gss_msg = gss_cred->gc_upcall; + struct inode *inode = &gss_msg->inode->vfs_inode; + + spin_lock(&inode->i_lock); + gss_handle_downcall_result(gss_cred, gss_msg); + spin_unlock(&inode->i_lock); + task->tk_status = gss_msg->msg.errno; + gss_release_msg(gss_msg); +} + +static void gss_encode_v0_msg(struct gss_upcall_msg *gss_msg) +{ + gss_msg->msg.data = &gss_msg->uid; + gss_msg->msg.len = sizeof(gss_msg->uid); +} + +static void gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, + struct rpc_clnt *clnt, int machine_cred) +{ + struct gss_api_mech *mech = gss_msg->auth->mech; + char *p = gss_msg->databuf; + int len = 0; + + gss_msg->msg.len = sprintf(gss_msg->databuf, "mech=%s uid=%d ", + mech->gm_name, + gss_msg->uid); + p += gss_msg->msg.len; + if (clnt->cl_principal) { + len = sprintf(p, "target=%s ", clnt->cl_principal); + p += len; + gss_msg->msg.len += len; + } + if (machine_cred) { + len = sprintf(p, "service=* "); + p += len; + gss_msg->msg.len += len; + } else if (!strcmp(clnt->cl_program->name, "nfs4_cb")) { + len = sprintf(p, "service=nfs "); + p += len; + gss_msg->msg.len += len; + } + if (mech->gm_upcall_enctypes) { + len = sprintf(p, "enctypes=%s ", mech->gm_upcall_enctypes); + p += len; + gss_msg->msg.len += len; + } + len = sprintf(p, "\n"); + gss_msg->msg.len += len; + + gss_msg->msg.data = gss_msg->databuf; + BUG_ON(gss_msg->msg.len > UPCALL_BUF_LEN); +} + +static void gss_encode_msg(struct gss_upcall_msg *gss_msg, + struct rpc_clnt *clnt, int machine_cred) +{ + if (pipe_version == 0) + gss_encode_v0_msg(gss_msg); + else /* pipe_version == 1 */ + gss_encode_v1_msg(gss_msg, clnt, machine_cred); +} + +static inline struct gss_upcall_msg * +gss_alloc_msg(struct gss_auth *gss_auth, uid_t uid, struct rpc_clnt *clnt, + int machine_cred) +{ + struct gss_upcall_msg *gss_msg; + int vers; + + gss_msg 
= kzalloc(sizeof(*gss_msg), GFP_NOFS); + if (gss_msg == NULL) + return ERR_PTR(-ENOMEM); + vers = get_pipe_version(); + if (vers < 0) { + kfree(gss_msg); + return ERR_PTR(vers); + } + gss_msg->inode = RPC_I(gss_auth->dentry[vers]->d_inode); + INIT_LIST_HEAD(&gss_msg->list); + rpc_init_wait_queue(&gss_msg->rpc_waitqueue, "RPCSEC_GSS upcall waitq"); + init_waitqueue_head(&gss_msg->waitqueue); + atomic_set(&gss_msg->count, 1); + gss_msg->uid = uid; + gss_msg->auth = gss_auth; + gss_encode_msg(gss_msg, clnt, machine_cred); + return gss_msg; +} + +static struct gss_upcall_msg * +gss_setup_upcall(struct rpc_clnt *clnt, struct gss_auth *gss_auth, struct rpc_cred *cred) +{ + struct gss_cred *gss_cred = container_of(cred, + struct gss_cred, gc_base); + struct gss_upcall_msg *gss_new, *gss_msg; + uid_t uid = cred->cr_uid; + + gss_new = gss_alloc_msg(gss_auth, uid, clnt, gss_cred->gc_machine_cred); + if (IS_ERR(gss_new)) + return gss_new; + gss_msg = gss_add_msg(gss_new); + if (gss_msg == gss_new) { + struct inode *inode = &gss_new->inode->vfs_inode; + int res = rpc_queue_upcall(inode, &gss_new->msg); + if (res) { + gss_unhash_msg(gss_new); + gss_msg = ERR_PTR(res); + } + } else + gss_release_msg(gss_new); + return gss_msg; +} + +static void warn_gssd(void) +{ + static unsigned long ratelimit; + unsigned long now = jiffies; + + if (time_after(now, ratelimit)) { + printk(KERN_WARNING "RPC: AUTH_GSS upcall timed out.\n" + "Please check user daemon is running.\n"); + ratelimit = now + 15*HZ; + } +} + +static inline int +gss_refresh_upcall(struct rpc_task *task) +{ + struct rpc_cred *cred = task->tk_rqstp->rq_cred; + struct gss_auth *gss_auth = container_of(cred->cr_auth, + struct gss_auth, rpc_auth); + struct gss_cred *gss_cred = container_of(cred, + struct gss_cred, gc_base); + struct gss_upcall_msg *gss_msg; + struct inode *inode; + int err = 0; + + dprintk("RPC: %5u gss_refresh_upcall for uid %u\n", task->tk_pid, + cred->cr_uid); + gss_msg = gss_setup_upcall(task->tk_client, gss_auth, cred); + if (PTR_ERR(gss_msg) == -EAGAIN) { + /* XXX: warning on the first, under the assumption we + * shouldn't normally hit this case on a refresh. 
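+		 *
+		 * -EAGAIN here means that no gssd has opened an upcall pipe
+		 * yet (pipe_version is still -1), so the task is put to sleep
+		 * on pipe_version_rpc_waitqueue with a 15 second timeout and
+		 * retried once gss_pipe_open() establishes a pipe version.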
*/ + warn_gssd(); + task->tk_timeout = 15*HZ; + rpc_sleep_on(&pipe_version_rpc_waitqueue, task, NULL); + return -EAGAIN; + } + if (IS_ERR(gss_msg)) { + err = PTR_ERR(gss_msg); + goto out; + } + inode = &gss_msg->inode->vfs_inode; + spin_lock(&inode->i_lock); + if (gss_cred->gc_upcall != NULL) + rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL); + else if (gss_msg->ctx == NULL && gss_msg->msg.errno >= 0) { + task->tk_timeout = 0; + gss_cred->gc_upcall = gss_msg; + /* gss_upcall_callback will release the reference to gss_upcall_msg */ + atomic_inc(&gss_msg->count); + rpc_sleep_on(&gss_msg->rpc_waitqueue, task, gss_upcall_callback); + } else { + gss_handle_downcall_result(gss_cred, gss_msg); + err = gss_msg->msg.errno; + } + spin_unlock(&inode->i_lock); + gss_release_msg(gss_msg); +out: + dprintk("RPC: %5u gss_refresh_upcall for uid %u result %d\n", + task->tk_pid, cred->cr_uid, err); + return err; +} + +static inline int +gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) +{ + struct inode *inode; + struct rpc_cred *cred = &gss_cred->gc_base; + struct gss_upcall_msg *gss_msg; + DEFINE_WAIT(wait); + int err = 0; + + dprintk("RPC: gss_upcall for uid %u\n", cred->cr_uid); +retry: + gss_msg = gss_setup_upcall(gss_auth->client, gss_auth, cred); + if (PTR_ERR(gss_msg) == -EAGAIN) { + err = wait_event_interruptible_timeout(pipe_version_waitqueue, + pipe_version >= 0, 15*HZ); + if (pipe_version < 0) { + warn_gssd(); + err = -EACCES; + } + if (err) + goto out; + goto retry; + } + if (IS_ERR(gss_msg)) { + err = PTR_ERR(gss_msg); + goto out; + } + inode = &gss_msg->inode->vfs_inode; + for (;;) { + prepare_to_wait(&gss_msg->waitqueue, &wait, TASK_KILLABLE); + spin_lock(&inode->i_lock); + if (gss_msg->ctx != NULL || gss_msg->msg.errno < 0) { + break; + } + spin_unlock(&inode->i_lock); + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto out_intr; + } + schedule(); + } + if (gss_msg->ctx) + gss_cred_set_ctx(cred, gss_msg->ctx); + else + err = gss_msg->msg.errno; + spin_unlock(&inode->i_lock); +out_intr: + finish_wait(&gss_msg->waitqueue, &wait); + gss_release_msg(gss_msg); +out: + dprintk("RPC: gss_create_upcall for uid %u result %d\n", + cred->cr_uid, err); + return err; +} + +static ssize_t +gss_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, + char __user *dst, size_t buflen) +{ + char *data = (char *)msg->data + msg->copied; + size_t mlen = min(msg->len, buflen); + unsigned long left; + + left = copy_to_user(dst, data, mlen); + if (left == mlen) { + msg->errno = -EFAULT; + return -EFAULT; + } + + mlen -= left; + msg->copied += mlen; + msg->errno = 0; + return mlen; +} + +#define MSG_BUF_MAXSIZE 1024 + +static ssize_t +gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) +{ + const void *p, *end; + void *buf; + struct gss_upcall_msg *gss_msg; + struct inode *inode = filp->f_path.dentry->d_inode; + struct gss_cl_ctx *ctx; + uid_t uid; + ssize_t err = -EFBIG; + + if (mlen > MSG_BUF_MAXSIZE) + goto out; + err = -ENOMEM; + buf = kmalloc(mlen, GFP_NOFS); + if (!buf) + goto out; + + err = -EFAULT; + if (copy_from_user(buf, src, mlen)) + goto err; + + end = (const void *)((char *)buf + mlen); + p = simple_get_bytes(buf, end, &uid, sizeof(uid)); + if (IS_ERR(p)) { + err = PTR_ERR(p); + goto err; + } + + err = -ENOMEM; + ctx = gss_alloc_context(); + if (ctx == NULL) + goto err; + + err = -ENOENT; + /* Find a matching upcall */ + spin_lock(&inode->i_lock); + gss_msg = __gss_find_upcall(RPC_I(inode), uid); + if (gss_msg == NULL) { + 
spin_unlock(&inode->i_lock); + goto err_put_ctx; + } + list_del_init(&gss_msg->list); + spin_unlock(&inode->i_lock); + + p = gss_fill_context(p, end, ctx, gss_msg->auth->mech); + if (IS_ERR(p)) { + err = PTR_ERR(p); + switch (err) { + case -EACCES: + case -EKEYEXPIRED: + gss_msg->msg.errno = err; + err = mlen; + break; + case -EFAULT: + case -ENOMEM: + case -EINVAL: + case -ENOSYS: + gss_msg->msg.errno = -EAGAIN; + break; + default: + printk(KERN_CRIT "%s: bad return from " + "gss_fill_context: %zd\n", __func__, err); + BUG(); + } + goto err_release_msg; + } + gss_msg->ctx = gss_get_ctx(ctx); + err = mlen; + +err_release_msg: + spin_lock(&inode->i_lock); + __gss_unhash_msg(gss_msg); + spin_unlock(&inode->i_lock); + gss_release_msg(gss_msg); +err_put_ctx: + gss_put_ctx(ctx); +err: + kfree(buf); +out: + dprintk("RPC: gss_pipe_downcall returning %Zd\n", err); + return err; +} + +static int gss_pipe_open(struct inode *inode, int new_version) +{ + int ret = 0; + + spin_lock(&pipe_version_lock); + if (pipe_version < 0) { + /* First open of any gss pipe determines the version: */ + pipe_version = new_version; + rpc_wake_up(&pipe_version_rpc_waitqueue); + wake_up(&pipe_version_waitqueue); + } else if (pipe_version != new_version) { + /* Trying to open a pipe of a different version */ + ret = -EBUSY; + goto out; + } + atomic_inc(&pipe_users); +out: + spin_unlock(&pipe_version_lock); + return ret; + +} + +static int gss_pipe_open_v0(struct inode *inode) +{ + return gss_pipe_open(inode, 0); +} + +static int gss_pipe_open_v1(struct inode *inode) +{ + return gss_pipe_open(inode, 1); +} + +static void +gss_pipe_release(struct inode *inode) +{ + struct rpc_inode *rpci = RPC_I(inode); + struct gss_upcall_msg *gss_msg; + +restart: + spin_lock(&inode->i_lock); + list_for_each_entry(gss_msg, &rpci->in_downcall, list) { + + if (!list_empty(&gss_msg->msg.list)) + continue; + gss_msg->msg.errno = -EPIPE; + atomic_inc(&gss_msg->count); + __gss_unhash_msg(gss_msg); + spin_unlock(&inode->i_lock); + gss_release_msg(gss_msg); + goto restart; + } + spin_unlock(&inode->i_lock); + + put_pipe_version(); +} + +static void +gss_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg); + + if (msg->errno < 0) { + dprintk("RPC: gss_pipe_destroy_msg releasing msg %p\n", + gss_msg); + atomic_inc(&gss_msg->count); + gss_unhash_msg(gss_msg); + if (msg->errno == -ETIMEDOUT) + warn_gssd(); + gss_release_msg(gss_msg); + } +} + +/* + * NOTE: we have the opportunity to use different + * parameters based on the input flavor (which must be a pseudoflavor) + */ +static struct rpc_auth * +gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor) +{ + struct gss_auth *gss_auth; + struct rpc_auth * auth; + int err = -ENOMEM; /* XXX? 
*/ + + dprintk("RPC: creating GSS authenticator for client %p\n", clnt); + + if (!try_module_get(THIS_MODULE)) + return ERR_PTR(err); + if (!(gss_auth = kmalloc(sizeof(*gss_auth), GFP_KERNEL))) + goto out_dec; + gss_auth->client = clnt; + err = -EINVAL; + gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor); + if (!gss_auth->mech) { + printk(KERN_WARNING "%s: Pseudoflavor %d not found!\n", + __func__, flavor); + goto err_free; + } + gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor); + if (gss_auth->service == 0) + goto err_put_mech; + auth = &gss_auth->rpc_auth; + auth->au_cslack = GSS_CRED_SLACK >> 2; + auth->au_rslack = GSS_VERF_SLACK >> 2; + auth->au_ops = &authgss_ops; + auth->au_flavor = flavor; + atomic_set(&auth->au_count, 1); + kref_init(&gss_auth->kref); + + /* + * Note: if we created the old pipe first, then someone who + * examined the directory at the right moment might conclude + * that we supported only the old pipe. So we instead create + * the new pipe first. + */ + gss_auth->dentry[1] = rpc_mkpipe(clnt->cl_path.dentry, + "gssd", + clnt, &gss_upcall_ops_v1, + RPC_PIPE_WAIT_FOR_OPEN); + if (IS_ERR(gss_auth->dentry[1])) { + err = PTR_ERR(gss_auth->dentry[1]); + goto err_put_mech; + } + + gss_auth->dentry[0] = rpc_mkpipe(clnt->cl_path.dentry, + gss_auth->mech->gm_name, + clnt, &gss_upcall_ops_v0, + RPC_PIPE_WAIT_FOR_OPEN); + if (IS_ERR(gss_auth->dentry[0])) { + err = PTR_ERR(gss_auth->dentry[0]); + goto err_unlink_pipe_1; + } + err = rpcauth_init_credcache(auth); + if (err) + goto err_unlink_pipe_0; + + return auth; +err_unlink_pipe_0: + rpc_unlink(gss_auth->dentry[0]); +err_unlink_pipe_1: + rpc_unlink(gss_auth->dentry[1]); +err_put_mech: + gss_mech_put(gss_auth->mech); +err_free: + kfree(gss_auth); +out_dec: + module_put(THIS_MODULE); + return ERR_PTR(err); +} + +static void +gss_free(struct gss_auth *gss_auth) +{ + rpc_unlink(gss_auth->dentry[1]); + rpc_unlink(gss_auth->dentry[0]); + gss_mech_put(gss_auth->mech); + + kfree(gss_auth); + module_put(THIS_MODULE); +} + +static void +gss_free_callback(struct kref *kref) +{ + struct gss_auth *gss_auth = container_of(kref, struct gss_auth, kref); + + gss_free(gss_auth); +} + +static void +gss_destroy(struct rpc_auth *auth) +{ + struct gss_auth *gss_auth; + + dprintk("RPC: destroying GSS authenticator %p flavor %d\n", + auth, auth->au_flavor); + + rpcauth_destroy_credcache(auth); + + gss_auth = container_of(auth, struct gss_auth, rpc_auth); + kref_put(&gss_auth->kref, gss_free_callback); +} + +/* + * gss_destroying_context will cause the RPCSEC_GSS to send a NULL RPC call + * to the server with the GSS control procedure field set to + * RPC_GSS_PROC_DESTROY. This should normally cause the server to release + * all RPCSEC_GSS state associated with that context. 
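+ *
+ * (RPC_GSS_PROC_DESTROY is one of the RPCSEC_GSS control procedures
+ * from RFC 2203, alongside RPC_GSS_PROC_DATA used for ordinary
+ * requests and the INIT/CONTINUE_INIT procedures used while a context
+ * is being established; the function simply switches gc_proc from
+ * DATA to DESTROY before issuing one final NULL call.)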
+ */ +static int +gss_destroying_context(struct rpc_cred *cred) +{ + struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); + struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); + struct rpc_task *task; + + if (gss_cred->gc_ctx == NULL || + test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0) + return 0; + + gss_cred->gc_ctx->gc_proc = RPC_GSS_PROC_DESTROY; + cred->cr_ops = &gss_nullops; + + /* Take a reference to ensure the cred will be destroyed either + * by the RPC call or by the put_rpccred() below */ + get_rpccred(cred); + + task = rpc_call_null(gss_auth->client, cred, RPC_TASK_ASYNC|RPC_TASK_SOFT); + if (!IS_ERR(task)) + rpc_put_task(task); + + put_rpccred(cred); + return 1; +} + +/* gss_destroy_cred (and gss_free_ctx) are used to clean up after failure + * to create a new cred or context, so they check that things have been + * allocated before freeing them. */ +static void +gss_do_free_ctx(struct gss_cl_ctx *ctx) +{ + dprintk("RPC: gss_free_ctx\n"); + + gss_delete_sec_context(&ctx->gc_gss_ctx); + kfree(ctx->gc_wire_ctx.data); + kfree(ctx); +} + +static void +gss_free_ctx_callback(struct rcu_head *head) +{ + struct gss_cl_ctx *ctx = container_of(head, struct gss_cl_ctx, gc_rcu); + gss_do_free_ctx(ctx); +} + +static void +gss_free_ctx(struct gss_cl_ctx *ctx) +{ + call_rcu(&ctx->gc_rcu, gss_free_ctx_callback); +} + +static void +gss_free_cred(struct gss_cred *gss_cred) +{ + dprintk("RPC: gss_free_cred %p\n", gss_cred); + kfree(gss_cred); +} + +static void +gss_free_cred_callback(struct rcu_head *head) +{ + struct gss_cred *gss_cred = container_of(head, struct gss_cred, gc_base.cr_rcu); + gss_free_cred(gss_cred); +} + +static void +gss_destroy_nullcred(struct rpc_cred *cred) +{ + struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); + struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); + struct gss_cl_ctx *ctx = gss_cred->gc_ctx; + + rcu_assign_pointer(gss_cred->gc_ctx, NULL); + call_rcu(&cred->cr_rcu, gss_free_cred_callback); + if (ctx) + gss_put_ctx(ctx); + kref_put(&gss_auth->kref, gss_free_callback); +} + +static void +gss_destroy_cred(struct rpc_cred *cred) +{ + + if (gss_destroying_context(cred)) + return; + gss_destroy_nullcred(cred); +} + +/* + * Lookup RPCSEC_GSS cred for the current process + */ +static struct rpc_cred * +gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +{ + return rpcauth_lookup_credcache(auth, acred, flags); +} + +static struct rpc_cred * +gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +{ + struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth); + struct gss_cred *cred = NULL; + int err = -ENOMEM; + + dprintk("RPC: gss_create_cred for uid %d, flavor %d\n", + acred->uid, auth->au_flavor); + + if (!(cred = kzalloc(sizeof(*cred), GFP_NOFS))) + goto out_err; + + rpcauth_init_cred(&cred->gc_base, acred, auth, &gss_credops); + /* + * Note: in order to force a call to call_refresh(), we deliberately + * fail to flag the credential as RPCAUTH_CRED_UPTODATE. 
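+	 *
+	 * The new cred therefore starts life with only RPCAUTH_CRED_NEW
+	 * set; gss_refresh() notices the missing UPTODATE bit and kicks
+	 * off the gssd upcall, and gss_cred_set_ctx() flips NEW to
+	 * UPTODATE once a context has been filled in.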
+ */ + cred->gc_base.cr_flags = 1UL << RPCAUTH_CRED_NEW; + cred->gc_service = gss_auth->service; + cred->gc_machine_cred = acred->machine_cred; + kref_get(&gss_auth->kref); + return &cred->gc_base; + +out_err: + dprintk("RPC: gss_create_cred failed with error %d\n", err); + return ERR_PTR(err); +} + +static int +gss_cred_init(struct rpc_auth *auth, struct rpc_cred *cred) +{ + struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth); + struct gss_cred *gss_cred = container_of(cred,struct gss_cred, gc_base); + int err; + + do { + err = gss_create_upcall(gss_auth, gss_cred); + } while (err == -EAGAIN); + return err; +} + +static int +gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags) +{ + struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base); + + if (test_bit(RPCAUTH_CRED_NEW, &rc->cr_flags)) + goto out; + /* Don't match with creds that have expired. */ + if (time_after(jiffies, gss_cred->gc_ctx->gc_expiry)) + return 0; + if (!test_bit(RPCAUTH_CRED_UPTODATE, &rc->cr_flags)) + return 0; +out: + if (acred->machine_cred != gss_cred->gc_machine_cred) + return 0; + return rc->cr_uid == acred->uid; +} + +/* +* Marshal credentials. +* Maybe we should keep a cached credential for performance reasons. +*/ +static __be32 * +gss_marshal(struct rpc_task *task, __be32 *p) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_cred *cred = req->rq_cred; + struct gss_cred *gss_cred = container_of(cred, struct gss_cred, + gc_base); + struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); + __be32 *cred_len; + u32 maj_stat = 0; + struct xdr_netobj mic; + struct kvec iov; + struct xdr_buf verf_buf; + + dprintk("RPC: %5u gss_marshal\n", task->tk_pid); + + *p++ = htonl(RPC_AUTH_GSS); + cred_len = p++; + + spin_lock(&ctx->gc_seq_lock); + req->rq_seqno = ctx->gc_seq++; + spin_unlock(&ctx->gc_seq_lock); + + *p++ = htonl((u32) RPC_GSS_VERSION); + *p++ = htonl((u32) ctx->gc_proc); + *p++ = htonl((u32) req->rq_seqno); + *p++ = htonl((u32) gss_cred->gc_service); + p = xdr_encode_netobj(p, &ctx->gc_wire_ctx); + *cred_len = htonl((p - (cred_len + 1)) << 2); + + /* We compute the checksum for the verifier over the xdr-encoded bytes + * starting with the xid and ending at the end of the credential: */ + iov.iov_base = xprt_skip_transport_header(task->tk_xprt, + req->rq_snd_buf.head[0].iov_base); + iov.iov_len = (u8 *)p - (u8 *)iov.iov_base; + xdr_buf_from_iov(&iov, &verf_buf); + + /* set verifier flavor*/ + *p++ = htonl(RPC_AUTH_GSS); + + mic.data = (u8 *)(p + 1); + maj_stat = gss_get_mic(ctx->gc_gss_ctx, &verf_buf, &mic); + if (maj_stat == GSS_S_CONTEXT_EXPIRED) { + clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); + } else if (maj_stat != 0) { + printk("gss_marshal: gss_get_mic FAILED (%d)\n", maj_stat); + goto out_put_ctx; + } + p = xdr_encode_opaque(p, NULL, mic.len); + gss_put_ctx(ctx); + return p; +out_put_ctx: + gss_put_ctx(ctx); + return NULL; +} + +static int gss_renew_cred(struct rpc_task *task) +{ + struct rpc_cred *oldcred = task->tk_rqstp->rq_cred; + struct gss_cred *gss_cred = container_of(oldcred, + struct gss_cred, + gc_base); + struct rpc_auth *auth = oldcred->cr_auth; + struct auth_cred acred = { + .uid = oldcred->cr_uid, + .machine_cred = gss_cred->gc_machine_cred, + }; + struct rpc_cred *new; + + new = gss_lookup_cred(auth, &acred, RPCAUTH_LOOKUP_NEW); + if (IS_ERR(new)) + return PTR_ERR(new); + task->tk_rqstp->rq_cred = new; + put_rpccred(oldcred); + return 0; +} + +static int gss_cred_is_negative_entry(struct rpc_cred *cred) +{ + if 
(test_bit(RPCAUTH_CRED_NEGATIVE, &cred->cr_flags)) { + unsigned long now = jiffies; + unsigned long begin, expire; + struct gss_cred *gss_cred; + + gss_cred = container_of(cred, struct gss_cred, gc_base); + begin = gss_cred->gc_upcall_timestamp; + expire = begin + gss_expired_cred_retry_delay * HZ; + + if (time_in_range_open(now, begin, expire)) + return 1; + } + return 0; +} + +/* +* Refresh credentials. XXX - finish +*/ +static int +gss_refresh(struct rpc_task *task) +{ + struct rpc_cred *cred = task->tk_rqstp->rq_cred; + int ret = 0; + + if (gss_cred_is_negative_entry(cred)) + return -EKEYEXPIRED; + + if (!test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) && + !test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags)) { + ret = gss_renew_cred(task); + if (ret < 0) + goto out; + cred = task->tk_rqstp->rq_cred; + } + + if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags)) + ret = gss_refresh_upcall(task); +out: + return ret; +} + +/* Dummy refresh routine: used only when destroying the context */ +static int +gss_refresh_null(struct rpc_task *task) +{ + return -EACCES; +} + +static __be32 * +gss_validate(struct rpc_task *task, __be32 *p) +{ + struct rpc_cred *cred = task->tk_rqstp->rq_cred; + struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); + __be32 seq; + struct kvec iov; + struct xdr_buf verf_buf; + struct xdr_netobj mic; + u32 flav,len; + u32 maj_stat; + + dprintk("RPC: %5u gss_validate\n", task->tk_pid); + + flav = ntohl(*p++); + if ((len = ntohl(*p++)) > RPC_MAX_AUTH_SIZE) + goto out_bad; + if (flav != RPC_AUTH_GSS) + goto out_bad; + seq = htonl(task->tk_rqstp->rq_seqno); + iov.iov_base = &seq; + iov.iov_len = sizeof(seq); + xdr_buf_from_iov(&iov, &verf_buf); + mic.data = (u8 *)p; + mic.len = len; + + maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic); + if (maj_stat == GSS_S_CONTEXT_EXPIRED) + clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); + if (maj_stat) { + dprintk("RPC: %5u gss_validate: gss_verify_mic returned " + "error 0x%08x\n", task->tk_pid, maj_stat); + goto out_bad; + } + /* We leave it to unwrap to calculate au_rslack. 
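+	 * (au_rslack is the per-flavor reply slack, counted in XDR words,
+	 * that call_allocate() reserves in the receive buffer; it is
+	 * recomputed in gss_unwrap_resp() once the real verifier and
+	 * unwrap overhead are known.)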
For now we just + * calculate the length of the verifier: */ + cred->cr_auth->au_verfsize = XDR_QUADLEN(len) + 2; + gss_put_ctx(ctx); + dprintk("RPC: %5u gss_validate: gss_verify_mic succeeded.\n", + task->tk_pid); + return p + XDR_QUADLEN(len); +out_bad: + gss_put_ctx(ctx); + dprintk("RPC: %5u gss_validate failed.\n", task->tk_pid); + return NULL; +} + +static void gss_wrap_req_encode(kxdreproc_t encode, struct rpc_rqst *rqstp, + __be32 *p, void *obj) +{ + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &rqstp->rq_snd_buf, p); + encode(rqstp, &xdr, obj); +} + +static inline int +gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, + kxdreproc_t encode, struct rpc_rqst *rqstp, + __be32 *p, void *obj) +{ + struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; + struct xdr_buf integ_buf; + __be32 *integ_len = NULL; + struct xdr_netobj mic; + u32 offset; + __be32 *q; + struct kvec *iov; + u32 maj_stat = 0; + int status = -EIO; + + integ_len = p++; + offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; + *p++ = htonl(rqstp->rq_seqno); + + gss_wrap_req_encode(encode, rqstp, p, obj); + + if (xdr_buf_subsegment(snd_buf, &integ_buf, + offset, snd_buf->len - offset)) + return status; + *integ_len = htonl(integ_buf.len); + + /* guess whether we're in the head or the tail: */ + if (snd_buf->page_len || snd_buf->tail[0].iov_len) + iov = snd_buf->tail; + else + iov = snd_buf->head; + p = iov->iov_base + iov->iov_len; + mic.data = (u8 *)(p + 1); + + maj_stat = gss_get_mic(ctx->gc_gss_ctx, &integ_buf, &mic); + status = -EIO; /* XXX? */ + if (maj_stat == GSS_S_CONTEXT_EXPIRED) + clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); + else if (maj_stat) + return status; + q = xdr_encode_opaque(p, NULL, mic.len); + + offset = (u8 *)q - (u8 *)p; + iov->iov_len += offset; + snd_buf->len += offset; + return 0; +} + +static void +priv_release_snd_buf(struct rpc_rqst *rqstp) +{ + int i; + + for (i=0; i < rqstp->rq_enc_pages_num; i++) + __free_page(rqstp->rq_enc_pages[i]); + kfree(rqstp->rq_enc_pages); +} + +static int +alloc_enc_pages(struct rpc_rqst *rqstp) +{ + struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; + int first, last, i; + + if (snd_buf->page_len == 0) { + rqstp->rq_enc_pages_num = 0; + return 0; + } + + first = snd_buf->page_base >> PAGE_CACHE_SHIFT; + last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_CACHE_SHIFT; + rqstp->rq_enc_pages_num = last - first + 1 + 1; + rqstp->rq_enc_pages + = kmalloc(rqstp->rq_enc_pages_num * sizeof(struct page *), + GFP_NOFS); + if (!rqstp->rq_enc_pages) + goto out; + for (i=0; i < rqstp->rq_enc_pages_num; i++) { + rqstp->rq_enc_pages[i] = alloc_page(GFP_NOFS); + if (rqstp->rq_enc_pages[i] == NULL) + goto out_free; + } + rqstp->rq_release_snd_buf = priv_release_snd_buf; + return 0; +out_free: + rqstp->rq_enc_pages_num = i; + priv_release_snd_buf(rqstp); +out: + return -EAGAIN; +} + +static inline int +gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, + kxdreproc_t encode, struct rpc_rqst *rqstp, + __be32 *p, void *obj) +{ + struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; + u32 offset; + u32 maj_stat; + int status; + __be32 *opaque_len; + struct page **inpages; + int first; + int pad; + struct kvec *iov; + char *tmp; + + opaque_len = p++; + offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; + *p++ = htonl(rqstp->rq_seqno); + + gss_wrap_req_encode(encode, rqstp, p, obj); + + status = alloc_enc_pages(rqstp); + if (status) + return status; + first = snd_buf->page_base >> PAGE_CACHE_SHIFT; + inpages = snd_buf->pages + first; + snd_buf->pages = 
rqstp->rq_enc_pages; + snd_buf->page_base -= first << PAGE_CACHE_SHIFT; + /* + * Give the tail its own page, in case we need extra space in the + * head when wrapping: + * + * call_allocate() allocates twice the slack space required + * by the authentication flavor to rq_callsize. + * For GSS, slack is GSS_CRED_SLACK. + */ + if (snd_buf->page_len || snd_buf->tail[0].iov_len) { + tmp = page_address(rqstp->rq_enc_pages[rqstp->rq_enc_pages_num - 1]); + memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len); + snd_buf->tail[0].iov_base = tmp; + } + maj_stat = gss_wrap(ctx->gc_gss_ctx, offset, snd_buf, inpages); + /* slack space should prevent this ever happening: */ + BUG_ON(snd_buf->len > snd_buf->buflen); + status = -EIO; + /* We're assuming that when GSS_S_CONTEXT_EXPIRED, the encryption was + * done anyway, so it's safe to put the request on the wire: */ + if (maj_stat == GSS_S_CONTEXT_EXPIRED) + clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); + else if (maj_stat) + return status; + + *opaque_len = htonl(snd_buf->len - offset); + /* guess whether we're in the head or the tail: */ + if (snd_buf->page_len || snd_buf->tail[0].iov_len) + iov = snd_buf->tail; + else + iov = snd_buf->head; + p = iov->iov_base + iov->iov_len; + pad = 3 - ((snd_buf->len - offset - 1) & 3); + memset(p, 0, pad); + iov->iov_len += pad; + snd_buf->len += pad; + + return 0; +} + +static int +gss_wrap_req(struct rpc_task *task, + kxdreproc_t encode, void *rqstp, __be32 *p, void *obj) +{ + struct rpc_cred *cred = task->tk_rqstp->rq_cred; + struct gss_cred *gss_cred = container_of(cred, struct gss_cred, + gc_base); + struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); + int status = -EIO; + + dprintk("RPC: %5u gss_wrap_req\n", task->tk_pid); + if (ctx->gc_proc != RPC_GSS_PROC_DATA) { + /* The spec seems a little ambiguous here, but I think that not + * wrapping context destruction requests makes the most sense. 
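+		 *
+		 * For ordinary data requests the three services handled in
+		 * the switch below differ only in how the argument bytes end
+		 * up on the wire (cf. RFC 2203):
+		 *
+		 *	none:      <args>
+		 *	integrity: <len> <seqno> <args> <MIC over seqno+args>
+		 *	privacy:   <len> <gss_wrap()ed seqno+args>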
+ */ + gss_wrap_req_encode(encode, rqstp, p, obj); + status = 0; + goto out; + } + switch (gss_cred->gc_service) { + case RPC_GSS_SVC_NONE: + gss_wrap_req_encode(encode, rqstp, p, obj); + status = 0; + break; + case RPC_GSS_SVC_INTEGRITY: + status = gss_wrap_req_integ(cred, ctx, encode, + rqstp, p, obj); + break; + case RPC_GSS_SVC_PRIVACY: + status = gss_wrap_req_priv(cred, ctx, encode, + rqstp, p, obj); + break; + } +out: + gss_put_ctx(ctx); + dprintk("RPC: %5u gss_wrap_req returning %d\n", task->tk_pid, status); + return status; +} + +static inline int +gss_unwrap_resp_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, + struct rpc_rqst *rqstp, __be32 **p) +{ + struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf; + struct xdr_buf integ_buf; + struct xdr_netobj mic; + u32 data_offset, mic_offset; + u32 integ_len; + u32 maj_stat; + int status = -EIO; + + integ_len = ntohl(*(*p)++); + if (integ_len & 3) + return status; + data_offset = (u8 *)(*p) - (u8 *)rcv_buf->head[0].iov_base; + mic_offset = integ_len + data_offset; + if (mic_offset > rcv_buf->len) + return status; + if (ntohl(*(*p)++) != rqstp->rq_seqno) + return status; + + if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, + mic_offset - data_offset)) + return status; + + if (xdr_buf_read_netobj(rcv_buf, &mic, mic_offset)) + return status; + + maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic); + if (maj_stat == GSS_S_CONTEXT_EXPIRED) + clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); + if (maj_stat != GSS_S_COMPLETE) + return status; + return 0; +} + +static inline int +gss_unwrap_resp_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, + struct rpc_rqst *rqstp, __be32 **p) +{ + struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf; + u32 offset; + u32 opaque_len; + u32 maj_stat; + int status = -EIO; + + opaque_len = ntohl(*(*p)++); + offset = (u8 *)(*p) - (u8 *)rcv_buf->head[0].iov_base; + if (offset + opaque_len > rcv_buf->len) + return status; + /* remove padding: */ + rcv_buf->len = offset + opaque_len; + + maj_stat = gss_unwrap(ctx->gc_gss_ctx, offset, rcv_buf); + if (maj_stat == GSS_S_CONTEXT_EXPIRED) + clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); + if (maj_stat != GSS_S_COMPLETE) + return status; + if (ntohl(*(*p)++) != rqstp->rq_seqno) + return status; + + return 0; +} + +static int +gss_unwrap_req_decode(kxdrdproc_t decode, struct rpc_rqst *rqstp, + __be32 *p, void *obj) +{ + struct xdr_stream xdr; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + return decode(rqstp, &xdr, obj); +} + +static int +gss_unwrap_resp(struct rpc_task *task, + kxdrdproc_t decode, void *rqstp, __be32 *p, void *obj) +{ + struct rpc_cred *cred = task->tk_rqstp->rq_cred; + struct gss_cred *gss_cred = container_of(cred, struct gss_cred, + gc_base); + struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); + __be32 *savedp = p; + struct kvec *head = ((struct rpc_rqst *)rqstp)->rq_rcv_buf.head; + int savedlen = head->iov_len; + int status = -EIO; + + if (ctx->gc_proc != RPC_GSS_PROC_DATA) + goto out_decode; + switch (gss_cred->gc_service) { + case RPC_GSS_SVC_NONE: + break; + case RPC_GSS_SVC_INTEGRITY: + status = gss_unwrap_resp_integ(cred, ctx, rqstp, &p); + if (status) + goto out; + break; + case RPC_GSS_SVC_PRIVACY: + status = gss_unwrap_resp_priv(cred, ctx, rqstp, &p); + if (status) + goto out; + break; + } + /* take into account extra slack for integrity and privacy cases: */ + cred->cr_auth->au_rslack = cred->cr_auth->au_verfsize + (p - savedp) + + (savedlen - head->iov_len); +out_decode: + status = gss_unwrap_req_decode(decode, rqstp, p, 
obj); +out: + gss_put_ctx(ctx); + dprintk("RPC: %5u gss_unwrap_resp returning %d\n", task->tk_pid, + status); + return status; +} + +static const struct rpc_authops authgss_ops = { + .owner = THIS_MODULE, + .au_flavor = RPC_AUTH_GSS, + .au_name = "RPCSEC_GSS", + .create = gss_create, + .destroy = gss_destroy, + .lookup_cred = gss_lookup_cred, + .crcreate = gss_create_cred +}; + +static const struct rpc_credops gss_credops = { + .cr_name = "AUTH_GSS", + .crdestroy = gss_destroy_cred, + .cr_init = gss_cred_init, + .crbind = rpcauth_generic_bind_cred, + .crmatch = gss_match, + .crmarshal = gss_marshal, + .crrefresh = gss_refresh, + .crvalidate = gss_validate, + .crwrap_req = gss_wrap_req, + .crunwrap_resp = gss_unwrap_resp, +}; + +static const struct rpc_credops gss_nullops = { + .cr_name = "AUTH_GSS", + .crdestroy = gss_destroy_nullcred, + .crbind = rpcauth_generic_bind_cred, + .crmatch = gss_match, + .crmarshal = gss_marshal, + .crrefresh = gss_refresh_null, + .crvalidate = gss_validate, + .crwrap_req = gss_wrap_req, + .crunwrap_resp = gss_unwrap_resp, +}; + +static const struct rpc_pipe_ops gss_upcall_ops_v0 = { + .upcall = gss_pipe_upcall, + .downcall = gss_pipe_downcall, + .destroy_msg = gss_pipe_destroy_msg, + .open_pipe = gss_pipe_open_v0, + .release_pipe = gss_pipe_release, +}; + +static const struct rpc_pipe_ops gss_upcall_ops_v1 = { + .upcall = gss_pipe_upcall, + .downcall = gss_pipe_downcall, + .destroy_msg = gss_pipe_destroy_msg, + .open_pipe = gss_pipe_open_v1, + .release_pipe = gss_pipe_release, +}; + +/* + * Initialize RPCSEC_GSS module + */ +static int __init init_rpcsec_gss(void) +{ + int err = 0; + + err = rpcauth_register(&authgss_ops); + if (err) + goto out; + err = gss_svc_init(); + if (err) + goto out_unregister; + rpc_init_wait_queue(&pipe_version_rpc_waitqueue, "gss pipe version"); + return 0; +out_unregister: + rpcauth_unregister(&authgss_ops); +out: + return err; +} + +static void __exit exit_rpcsec_gss(void) +{ + gss_svc_shutdown(); + rpcauth_unregister(&authgss_ops); + rcu_barrier(); /* Wait for completion of call_rcu()'s */ +} + +MODULE_LICENSE("GPL"); +module_param_named(expired_cred_retry_delay, + gss_expired_cred_retry_delay, + uint, 0644); +MODULE_PARM_DESC(expired_cred_retry_delay, "Timeout (in seconds) until " + "the RPC engine retries an expired credential"); + +module_init(init_rpcsec_gss) +module_exit(exit_rpcsec_gss) diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c new file mode 100644 index 00000000..c586e92b --- /dev/null +++ b/net/sunrpc/auth_gss/gss_generic_token.c @@ -0,0 +1,234 @@ +/* + * linux/net/sunrpc/gss_generic_token.c + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic/util_token.c + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. 
It is provided "as is" without express or implied warranty. + * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include + + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + + +/* TWRITE_STR from gssapiP_generic.h */ +#define TWRITE_STR(ptr, str, len) \ + memcpy((ptr), (char *) (str), (len)); \ + (ptr) += (len); + +/* XXXX this code currently makes the assumption that a mech oid will + never be longer than 127 bytes. This assumption is not inherent in + the interfaces, so the code can be fixed if the OSI namespace + balloons unexpectedly. */ + +/* Each token looks like this: + +0x60 tag for APPLICATION 0, SEQUENCE + (constructed, definite-length) + possible multiple bytes, need to parse/generate + 0x06 tag for OBJECT IDENTIFIER + compile-time constant string (assume 1 byte) + compile-time constant string + the ANY containing the application token + bytes 0,1 are the token type + bytes 2,n are the token data + +For the purposes of this abstraction, the token "header" consists of +the sequence tag and length octets, the mech OID DER encoding, and the +first two inner bytes, which indicate the token type. The token +"body" consists of everything else. + +*/ + +static int +der_length_size( int length) +{ + if (length < (1<<7)) + return 1; + else if (length < (1<<8)) + return 2; +#if (SIZEOF_INT == 2) + else + return 3; +#else + else if (length < (1<<16)) + return 3; + else if (length < (1<<24)) + return 4; + else + return 5; +#endif +} + +static void +der_write_length(unsigned char **buf, int length) +{ + if (length < (1<<7)) { + *(*buf)++ = (unsigned char) length; + } else { + *(*buf)++ = (unsigned char) (der_length_size(length)+127); +#if (SIZEOF_INT > 2) + if (length >= (1<<24)) + *(*buf)++ = (unsigned char) (length>>24); + if (length >= (1<<16)) + *(*buf)++ = (unsigned char) ((length>>16)&0xff); +#endif + if (length >= (1<<8)) + *(*buf)++ = (unsigned char) ((length>>8)&0xff); + *(*buf)++ = (unsigned char) (length&0xff); + } +} + +/* returns decoded length, or < 0 on failure. Advances buf and + decrements bufsize */ + +static int +der_read_length(unsigned char **buf, int *bufsize) +{ + unsigned char sf; + int ret; + + if (*bufsize < 1) + return -1; + sf = *(*buf)++; + (*bufsize)--; + if (sf & 0x80) { + if ((sf &= 0x7f) > ((*bufsize)-1)) + return -1; + if (sf > SIZEOF_INT) + return -1; + ret = 0; + for (; sf; sf--) { + ret = (ret<<8) + (*(*buf)++); + (*bufsize)--; + } + } else { + ret = sf; + } + + return ret; +} + +/* returns the length of a token, given the mech oid and the body size */ + +int +g_token_size(struct xdr_netobj *mech, unsigned int body_size) +{ + /* set body_size to sequence contents size */ + body_size += 2 + (int) mech->len; /* NEED overflow check */ + return 1 + der_length_size(body_size) + body_size; +} + +EXPORT_SYMBOL_GPL(g_token_size); + +/* fills in a buffer with the token header. The buffer is assumed to + be the right size. 
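+   (As an illustration: with a 9-byte mech OID such as the krb5 one and
+   body_size = 100, the inner sequence is 2 + 9 + 100 = 111 bytes, which
+   still fits in a single DER length octet, so this writes just
+   0x60 0x6f 0x06 0x09 followed by the nine OID bytes; the caller then
+   appends the two token-type bytes and the body.)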
buf is advanced past the token header */ + +void +g_make_token_header(struct xdr_netobj *mech, int body_size, unsigned char **buf) +{ + *(*buf)++ = 0x60; + der_write_length(buf, 2 + mech->len + body_size); + *(*buf)++ = 0x06; + *(*buf)++ = (unsigned char) mech->len; + TWRITE_STR(*buf, mech->data, ((int) mech->len)); +} + +EXPORT_SYMBOL_GPL(g_make_token_header); + +/* + * Given a buffer containing a token, reads and verifies the token, + * leaving buf advanced past the token header, and setting body_size + * to the number of remaining bytes. Returns 0 on success, + * G_BAD_TOK_HEADER for a variety of errors, and G_WRONG_MECH if the + * mechanism in the token does not match the mech argument. buf and + * *body_size are left unmodified on error. + */ +u32 +g_verify_token_header(struct xdr_netobj *mech, int *body_size, + unsigned char **buf_in, int toksize) +{ + unsigned char *buf = *buf_in; + int seqsize; + struct xdr_netobj toid; + int ret = 0; + + if ((toksize-=1) < 0) + return G_BAD_TOK_HEADER; + if (*buf++ != 0x60) + return G_BAD_TOK_HEADER; + + if ((seqsize = der_read_length(&buf, &toksize)) < 0) + return G_BAD_TOK_HEADER; + + if (seqsize != toksize) + return G_BAD_TOK_HEADER; + + if ((toksize-=1) < 0) + return G_BAD_TOK_HEADER; + if (*buf++ != 0x06) + return G_BAD_TOK_HEADER; + + if ((toksize-=1) < 0) + return G_BAD_TOK_HEADER; + toid.len = *buf++; + + if ((toksize-=toid.len) < 0) + return G_BAD_TOK_HEADER; + toid.data = buf; + buf+=toid.len; + + if (! g_OID_equal(&toid, mech)) + ret = G_WRONG_MECH; + + /* G_WRONG_MECH is not returned immediately because it's more important + to return G_BAD_TOK_HEADER if the token header is in fact bad */ + + if ((toksize-=2) < 0) + return G_BAD_TOK_HEADER; + + if (ret) + return ret; + + if (!ret) { + *buf_in = buf; + *body_size = toksize; + } + + return ret; +} + +EXPORT_SYMBOL_GPL(g_verify_token_header); + diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c new file mode 100644 index 00000000..9576f35a --- /dev/null +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -0,0 +1,990 @@ +/* + * linux/net/sunrpc/gss_krb5_crypto.c + * + * Copyright (c) 2000-2008 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * Bruce Fields + */ + +/* + * Copyright (C) 1998 by the FundsXpress, INC. + * + * All rights reserved. + * + * Export of this software from the United States of America may require + * a specific license from the United States Government. It is the + * responsibility of any person or organization contemplating export to + * obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of FundsXpress. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. FundsXpress makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +u32 +krb5_encrypt( + struct crypto_blkcipher *tfm, + void * iv, + void * in, + void * out, + int length) +{ + u32 ret = -EINVAL; + struct scatterlist sg[1]; + u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0}; + struct blkcipher_desc desc = { .tfm = tfm, .info = local_iv }; + + if (length % crypto_blkcipher_blocksize(tfm) != 0) + goto out; + + if (crypto_blkcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { + dprintk("RPC: gss_k5encrypt: tfm iv size too large %d\n", + crypto_blkcipher_ivsize(tfm)); + goto out; + } + + if (iv) + memcpy(local_iv, iv, crypto_blkcipher_ivsize(tfm)); + + memcpy(out, in, length); + sg_init_one(sg, out, length); + + ret = crypto_blkcipher_encrypt_iv(&desc, sg, sg, length); +out: + dprintk("RPC: krb5_encrypt returns %d\n", ret); + return ret; +} + +u32 +krb5_decrypt( + struct crypto_blkcipher *tfm, + void * iv, + void * in, + void * out, + int length) +{ + u32 ret = -EINVAL; + struct scatterlist sg[1]; + u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0}; + struct blkcipher_desc desc = { .tfm = tfm, .info = local_iv }; + + if (length % crypto_blkcipher_blocksize(tfm) != 0) + goto out; + + if (crypto_blkcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { + dprintk("RPC: gss_k5decrypt: tfm iv size too large %d\n", + crypto_blkcipher_ivsize(tfm)); + goto out; + } + if (iv) + memcpy(local_iv,iv, crypto_blkcipher_ivsize(tfm)); + + memcpy(out, in, length); + sg_init_one(sg, out, length); + + ret = crypto_blkcipher_decrypt_iv(&desc, sg, sg, length); +out: + dprintk("RPC: gss_k5decrypt returns %d\n",ret); + return ret; +} + +static int +checksummer(struct scatterlist *sg, void *data) +{ + struct hash_desc *desc = data; + + return crypto_hash_update(desc, sg, sg->length); +} + +static int +arcfour_hmac_md5_usage_to_salt(unsigned int usage, u8 salt[4]) +{ + unsigned int ms_usage; + + switch (usage) { + case KG_USAGE_SIGN: + ms_usage = 15; + break; + case KG_USAGE_SEAL: + ms_usage = 13; + break; + default: + return -EINVAL; + } + salt[0] = (ms_usage >> 0) & 0xff; + salt[1] = (ms_usage >> 8) & 0xff; + salt[2] = (ms_usage >> 16) & 0xff; + salt[3] = (ms_usage >> 24) & 0xff; + + return 0; +} + +static u32 +make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen, + struct xdr_buf *body, int body_offset, u8 *cksumkey, + unsigned int usage, struct xdr_netobj *cksumout) +{ + struct hash_desc desc; + struct scatterlist sg[1]; + int err; + u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN]; + u8 rc4salt[4]; + struct crypto_hash *md5; + struct crypto_hash *hmac_md5; + + if (cksumkey == NULL) + return GSS_S_FAILURE; + + if (cksumout->len < kctx->gk5e->cksumlength) { + dprintk("%s: checksum buffer length, %u, too small for %s\n", + __func__, cksumout->len, kctx->gk5e->name); + return GSS_S_FAILURE; + } + + if (arcfour_hmac_md5_usage_to_salt(usage, rc4salt)) { + dprintk("%s: invalid usage value %u\n", __func__, usage); + return GSS_S_FAILURE; + } + + md5 = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(md5)) + return GSS_S_FAILURE; + + hmac_md5 = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(hmac_md5)) { + crypto_free_hash(md5); + return GSS_S_FAILURE; + } + + desc.tfm = md5; + desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + err = crypto_hash_init(&desc); + if (err) + goto out; + sg_init_one(sg, rc4salt, 4); + err = crypto_hash_update(&desc, sg, 4); + if (err) + goto out; + + sg_init_one(sg, 
header, hdrlen); + err = crypto_hash_update(&desc, sg, hdrlen); + if (err) + goto out; + err = xdr_process_buf(body, body_offset, body->len - body_offset, + checksummer, &desc); + if (err) + goto out; + err = crypto_hash_final(&desc, checksumdata); + if (err) + goto out; + + desc.tfm = hmac_md5; + desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + err = crypto_hash_init(&desc); + if (err) + goto out; + err = crypto_hash_setkey(hmac_md5, cksumkey, kctx->gk5e->keylength); + if (err) + goto out; + + sg_init_one(sg, checksumdata, crypto_hash_digestsize(md5)); + err = crypto_hash_digest(&desc, sg, crypto_hash_digestsize(md5), + checksumdata); + if (err) + goto out; + + memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength); + cksumout->len = kctx->gk5e->cksumlength; +out: + crypto_free_hash(md5); + crypto_free_hash(hmac_md5); + return err ? GSS_S_FAILURE : 0; +} + +/* + * checksum the plaintext data and hdrlen bytes of the token header + * The checksum is performed over the first 8 bytes of the + * gss token header and then over the data body + */ +u32 +make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, + struct xdr_buf *body, int body_offset, u8 *cksumkey, + unsigned int usage, struct xdr_netobj *cksumout) +{ + struct hash_desc desc; + struct scatterlist sg[1]; + int err; + u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN]; + unsigned int checksumlen; + + if (kctx->gk5e->ctype == CKSUMTYPE_HMAC_MD5_ARCFOUR) + return make_checksum_hmac_md5(kctx, header, hdrlen, + body, body_offset, + cksumkey, usage, cksumout); + + if (cksumout->len < kctx->gk5e->cksumlength) { + dprintk("%s: checksum buffer length, %u, too small for %s\n", + __func__, cksumout->len, kctx->gk5e->name); + return GSS_S_FAILURE; + } + + desc.tfm = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(desc.tfm)) + return GSS_S_FAILURE; + desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + checksumlen = crypto_hash_digestsize(desc.tfm); + + if (cksumkey != NULL) { + err = crypto_hash_setkey(desc.tfm, cksumkey, + kctx->gk5e->keylength); + if (err) + goto out; + } + + err = crypto_hash_init(&desc); + if (err) + goto out; + sg_init_one(sg, header, hdrlen); + err = crypto_hash_update(&desc, sg, hdrlen); + if (err) + goto out; + err = xdr_process_buf(body, body_offset, body->len - body_offset, + checksummer, &desc); + if (err) + goto out; + err = crypto_hash_final(&desc, checksumdata); + if (err) + goto out; + + switch (kctx->gk5e->ctype) { + case CKSUMTYPE_RSA_MD5: + err = kctx->gk5e->encrypt(kctx->seq, NULL, checksumdata, + checksumdata, checksumlen); + if (err) + goto out; + memcpy(cksumout->data, + checksumdata + checksumlen - kctx->gk5e->cksumlength, + kctx->gk5e->cksumlength); + break; + case CKSUMTYPE_HMAC_SHA1_DES3: + memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength); + break; + default: + BUG(); + break; + } + cksumout->len = kctx->gk5e->cksumlength; +out: + crypto_free_hash(desc.tfm); + return err ? GSS_S_FAILURE : 0; +} + +/* + * checksum the plaintext data and hdrlen bytes of the token header + * Per rfc4121, sec. 4.2.4, the checksum is performed over the data + * body then over the first 16 octets of the MIC token + * Inclusion of the header data in the calculation of the + * checksum is optional. 
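+ *
+ * The result is a keyed checksum (HMAC-SHA1 for the AES enctypes
+ * handled below) truncated to gk5e->cksumlength bytes, 12 in the
+ * HMAC-SHA1-96 case, which is why the switch at the end just copies
+ * the leading cksumlength bytes of the digest.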
+ */ +u32 +make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen, + struct xdr_buf *body, int body_offset, u8 *cksumkey, + unsigned int usage, struct xdr_netobj *cksumout) +{ + struct hash_desc desc; + struct scatterlist sg[1]; + int err; + u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN]; + unsigned int checksumlen; + + if (kctx->gk5e->keyed_cksum == 0) { + dprintk("%s: expected keyed hash for %s\n", + __func__, kctx->gk5e->name); + return GSS_S_FAILURE; + } + if (cksumkey == NULL) { + dprintk("%s: no key supplied for %s\n", + __func__, kctx->gk5e->name); + return GSS_S_FAILURE; + } + + desc.tfm = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(desc.tfm)) + return GSS_S_FAILURE; + checksumlen = crypto_hash_digestsize(desc.tfm); + desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + err = crypto_hash_setkey(desc.tfm, cksumkey, kctx->gk5e->keylength); + if (err) + goto out; + + err = crypto_hash_init(&desc); + if (err) + goto out; + err = xdr_process_buf(body, body_offset, body->len - body_offset, + checksummer, &desc); + if (err) + goto out; + if (header != NULL) { + sg_init_one(sg, header, hdrlen); + err = crypto_hash_update(&desc, sg, hdrlen); + if (err) + goto out; + } + err = crypto_hash_final(&desc, checksumdata); + if (err) + goto out; + + cksumout->len = kctx->gk5e->cksumlength; + + switch (kctx->gk5e->ctype) { + case CKSUMTYPE_HMAC_SHA1_96_AES128: + case CKSUMTYPE_HMAC_SHA1_96_AES256: + /* note that this truncates the hash */ + memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength); + break; + default: + BUG(); + break; + } +out: + crypto_free_hash(desc.tfm); + return err ? GSS_S_FAILURE : 0; +} + +struct encryptor_desc { + u8 iv[GSS_KRB5_MAX_BLOCKSIZE]; + struct blkcipher_desc desc; + int pos; + struct xdr_buf *outbuf; + struct page **pages; + struct scatterlist infrags[4]; + struct scatterlist outfrags[4]; + int fragno; + int fraglen; +}; + +static int +encryptor(struct scatterlist *sg, void *data) +{ + struct encryptor_desc *desc = data; + struct xdr_buf *outbuf = desc->outbuf; + struct page *in_page; + int thislen = desc->fraglen + sg->length; + int fraglen, ret; + int page_pos; + + /* Worst case is 4 fragments: head, end of page 1, start + * of page 2, tail. Anything more is a bug. 
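+	 * (The infrags/outfrags scatterlists in struct encryptor_desc only
+	 * have four slots, and accumulated fragments are flushed to the
+	 * cipher every time a whole cipher-blocksize worth of data is
+	 * available, so more than four partial fragments between flushes
+	 * would overrun them.)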
*/ + BUG_ON(desc->fragno > 3); + + page_pos = desc->pos - outbuf->head[0].iov_len; + if (page_pos >= 0 && page_pos < outbuf->page_len) { + /* pages are not in place: */ + int i = (page_pos + outbuf->page_base) >> PAGE_CACHE_SHIFT; + in_page = desc->pages[i]; + } else { + in_page = sg_page(sg); + } + sg_set_page(&desc->infrags[desc->fragno], in_page, sg->length, + sg->offset); + sg_set_page(&desc->outfrags[desc->fragno], sg_page(sg), sg->length, + sg->offset); + desc->fragno++; + desc->fraglen += sg->length; + desc->pos += sg->length; + + fraglen = thislen & (crypto_blkcipher_blocksize(desc->desc.tfm) - 1); + thislen -= fraglen; + + if (thislen == 0) + return 0; + + sg_mark_end(&desc->infrags[desc->fragno - 1]); + sg_mark_end(&desc->outfrags[desc->fragno - 1]); + + ret = crypto_blkcipher_encrypt_iv(&desc->desc, desc->outfrags, + desc->infrags, thislen); + if (ret) + return ret; + + sg_init_table(desc->infrags, 4); + sg_init_table(desc->outfrags, 4); + + if (fraglen) { + sg_set_page(&desc->outfrags[0], sg_page(sg), fraglen, + sg->offset + sg->length - fraglen); + desc->infrags[0] = desc->outfrags[0]; + sg_assign_page(&desc->infrags[0], in_page); + desc->fragno = 1; + desc->fraglen = fraglen; + } else { + desc->fragno = 0; + desc->fraglen = 0; + } + return 0; +} + +int +gss_encrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf, + int offset, struct page **pages) +{ + int ret; + struct encryptor_desc desc; + + BUG_ON((buf->len - offset) % crypto_blkcipher_blocksize(tfm) != 0); + + memset(desc.iv, 0, sizeof(desc.iv)); + desc.desc.tfm = tfm; + desc.desc.info = desc.iv; + desc.desc.flags = 0; + desc.pos = offset; + desc.outbuf = buf; + desc.pages = pages; + desc.fragno = 0; + desc.fraglen = 0; + + sg_init_table(desc.infrags, 4); + sg_init_table(desc.outfrags, 4); + + ret = xdr_process_buf(buf, offset, buf->len - offset, encryptor, &desc); + return ret; +} + +struct decryptor_desc { + u8 iv[GSS_KRB5_MAX_BLOCKSIZE]; + struct blkcipher_desc desc; + struct scatterlist frags[4]; + int fragno; + int fraglen; +}; + +static int +decryptor(struct scatterlist *sg, void *data) +{ + struct decryptor_desc *desc = data; + int thislen = desc->fraglen + sg->length; + int fraglen, ret; + + /* Worst case is 4 fragments: head, end of page 1, start + * of page 2, tail. Anything more is a bug. 
*/ + BUG_ON(desc->fragno > 3); + sg_set_page(&desc->frags[desc->fragno], sg_page(sg), sg->length, + sg->offset); + desc->fragno++; + desc->fraglen += sg->length; + + fraglen = thislen & (crypto_blkcipher_blocksize(desc->desc.tfm) - 1); + thislen -= fraglen; + + if (thislen == 0) + return 0; + + sg_mark_end(&desc->frags[desc->fragno - 1]); + + ret = crypto_blkcipher_decrypt_iv(&desc->desc, desc->frags, + desc->frags, thislen); + if (ret) + return ret; + + sg_init_table(desc->frags, 4); + + if (fraglen) { + sg_set_page(&desc->frags[0], sg_page(sg), fraglen, + sg->offset + sg->length - fraglen); + desc->fragno = 1; + desc->fraglen = fraglen; + } else { + desc->fragno = 0; + desc->fraglen = 0; + } + return 0; +} + +int +gss_decrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf, + int offset) +{ + struct decryptor_desc desc; + + /* XXXJBF: */ + BUG_ON((buf->len - offset) % crypto_blkcipher_blocksize(tfm) != 0); + + memset(desc.iv, 0, sizeof(desc.iv)); + desc.desc.tfm = tfm; + desc.desc.info = desc.iv; + desc.desc.flags = 0; + desc.fragno = 0; + desc.fraglen = 0; + + sg_init_table(desc.frags, 4); + + return xdr_process_buf(buf, offset, buf->len - offset, decryptor, &desc); +} + +/* + * This function makes the assumption that it was ultimately called + * from gss_wrap(). + * + * The client auth_gss code moves any existing tail data into a + * separate page before calling gss_wrap. + * The server svcauth_gss code ensures that both the head and the + * tail have slack space of RPC_MAX_AUTH_SIZE before calling gss_wrap. + * + * Even with that guarantee, this function may be called more than + * once in the processing of gss_wrap(). The best we can do is + * verify at compile-time (see GSS_KRB5_SLACK_CHECK) that the + * largest expected shift will fit within RPC_MAX_AUTH_SIZE. + * At run-time we can verify that a single invocation of this + * function doesn't attempt to use more the RPC_MAX_AUTH_SIZE. + */ + +int +xdr_extend_head(struct xdr_buf *buf, unsigned int base, unsigned int shiftlen) +{ + u8 *p; + + if (shiftlen == 0) + return 0; + + BUILD_BUG_ON(GSS_KRB5_MAX_SLACK_NEEDED > RPC_MAX_AUTH_SIZE); + BUG_ON(shiftlen > RPC_MAX_AUTH_SIZE); + + p = buf->head[0].iov_base + base; + + memmove(p + shiftlen, p, buf->head[0].iov_len - base); + + buf->head[0].iov_len += shiftlen; + buf->len += shiftlen; + + return 0; +} + +static u32 +gss_krb5_cts_crypt(struct crypto_blkcipher *cipher, struct xdr_buf *buf, + u32 offset, u8 *iv, struct page **pages, int encrypt) +{ + u32 ret; + struct scatterlist sg[1]; + struct blkcipher_desc desc = { .tfm = cipher, .info = iv }; + u8 data[crypto_blkcipher_blocksize(cipher) * 2]; + struct page **save_pages; + u32 len = buf->len - offset; + + BUG_ON(len > crypto_blkcipher_blocksize(cipher) * 2); + + /* + * For encryption, we want to read from the cleartext + * page cache pages, and write the encrypted data to + * the supplied xdr_buf pages. 
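+	 *
+	 * The region handled here is the short chunk (at most two cipher
+	 * blocks, see the BUG_ON above) left over after the bulk pass in
+	 * gss_krb5_aes_encrypt()/decrypt(); since it may straddle the page
+	 * array and the tail kvec, it is copied into the contiguous stack
+	 * buffer "data" with read_bytes_from_xdr_buf(), transformed in
+	 * place, and written back with write_bytes_to_xdr_buf().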
+ */ + save_pages = buf->pages; + if (encrypt) + buf->pages = pages; + + ret = read_bytes_from_xdr_buf(buf, offset, data, len); + buf->pages = save_pages; + if (ret) + goto out; + + sg_init_one(sg, data, len); + + if (encrypt) + ret = crypto_blkcipher_encrypt_iv(&desc, sg, sg, len); + else + ret = crypto_blkcipher_decrypt_iv(&desc, sg, sg, len); + + if (ret) + goto out; + + ret = write_bytes_to_xdr_buf(buf, offset, data, len); + +out: + return ret; +} + +u32 +gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, + struct xdr_buf *buf, int ec, struct page **pages) +{ + u32 err; + struct xdr_netobj hmac; + u8 *cksumkey; + u8 *ecptr; + struct crypto_blkcipher *cipher, *aux_cipher; + int blocksize; + struct page **save_pages; + int nblocks, nbytes; + struct encryptor_desc desc; + u32 cbcbytes; + unsigned int usage; + + if (kctx->initiate) { + cipher = kctx->initiator_enc; + aux_cipher = kctx->initiator_enc_aux; + cksumkey = kctx->initiator_integ; + usage = KG_USAGE_INITIATOR_SEAL; + } else { + cipher = kctx->acceptor_enc; + aux_cipher = kctx->acceptor_enc_aux; + cksumkey = kctx->acceptor_integ; + usage = KG_USAGE_ACCEPTOR_SEAL; + } + blocksize = crypto_blkcipher_blocksize(cipher); + + /* hide the gss token header and insert the confounder */ + offset += GSS_KRB5_TOK_HDR_LEN; + if (xdr_extend_head(buf, offset, kctx->gk5e->conflen)) + return GSS_S_FAILURE; + gss_krb5_make_confounder(buf->head[0].iov_base + offset, kctx->gk5e->conflen); + offset -= GSS_KRB5_TOK_HDR_LEN; + + if (buf->tail[0].iov_base != NULL) { + ecptr = buf->tail[0].iov_base + buf->tail[0].iov_len; + } else { + buf->tail[0].iov_base = buf->head[0].iov_base + + buf->head[0].iov_len; + buf->tail[0].iov_len = 0; + ecptr = buf->tail[0].iov_base; + } + + memset(ecptr, 'X', ec); + buf->tail[0].iov_len += ec; + buf->len += ec; + + /* copy plaintext gss token header after filler (if any) */ + memcpy(ecptr + ec, buf->head[0].iov_base + offset, + GSS_KRB5_TOK_HDR_LEN); + buf->tail[0].iov_len += GSS_KRB5_TOK_HDR_LEN; + buf->len += GSS_KRB5_TOK_HDR_LEN; + + /* Do the HMAC */ + hmac.len = GSS_KRB5_MAX_CKSUM_LEN; + hmac.data = buf->tail[0].iov_base + buf->tail[0].iov_len; + + /* + * When we are called, pages points to the real page cache + * data -- which we can't go and encrypt! buf->pages points + * to scratch pages which we are going to send off to the + * client/server. Swap in the plaintext pages to calculate + * the hmac. + */ + save_pages = buf->pages; + buf->pages = pages; + + err = make_checksum_v2(kctx, NULL, 0, buf, + offset + GSS_KRB5_TOK_HDR_LEN, + cksumkey, usage, &hmac); + buf->pages = save_pages; + if (err) + return GSS_S_FAILURE; + + nbytes = buf->len - offset - GSS_KRB5_TOK_HDR_LEN; + nblocks = (nbytes + blocksize - 1) / blocksize; + cbcbytes = 0; + if (nblocks > 2) + cbcbytes = (nblocks - 2) * blocksize; + + memset(desc.iv, 0, sizeof(desc.iv)); + + if (cbcbytes) { + desc.pos = offset + GSS_KRB5_TOK_HDR_LEN; + desc.fragno = 0; + desc.fraglen = 0; + desc.pages = pages; + desc.outbuf = buf; + desc.desc.info = desc.iv; + desc.desc.flags = 0; + desc.desc.tfm = aux_cipher; + + sg_init_table(desc.infrags, 4); + sg_init_table(desc.outfrags, 4); + + err = xdr_process_buf(buf, offset + GSS_KRB5_TOK_HDR_LEN, + cbcbytes, encryptor, &desc); + if (err) + goto out_err; + } + + /* Make sure IV carries forward from any CBC results. 
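+	 * (The aux CBC pass above leaves its last ciphertext block in
+	 * desc.iv, so handing desc.iv to gss_krb5_cts_crypt() chains the
+	 * CTS tail onto the CBC-encrypted body as one continuous
+	 * encryption.)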
*/ + err = gss_krb5_cts_crypt(cipher, buf, + offset + GSS_KRB5_TOK_HDR_LEN + cbcbytes, + desc.iv, pages, 1); + if (err) { + err = GSS_S_FAILURE; + goto out_err; + } + + /* Now update buf to account for HMAC */ + buf->tail[0].iov_len += kctx->gk5e->cksumlength; + buf->len += kctx->gk5e->cksumlength; + +out_err: + if (err) + err = GSS_S_FAILURE; + return err; +} + +u32 +gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, + u32 *headskip, u32 *tailskip) +{ + struct xdr_buf subbuf; + u32 ret = 0; + u8 *cksum_key; + struct crypto_blkcipher *cipher, *aux_cipher; + struct xdr_netobj our_hmac_obj; + u8 our_hmac[GSS_KRB5_MAX_CKSUM_LEN]; + u8 pkt_hmac[GSS_KRB5_MAX_CKSUM_LEN]; + int nblocks, blocksize, cbcbytes; + struct decryptor_desc desc; + unsigned int usage; + + if (kctx->initiate) { + cipher = kctx->acceptor_enc; + aux_cipher = kctx->acceptor_enc_aux; + cksum_key = kctx->acceptor_integ; + usage = KG_USAGE_ACCEPTOR_SEAL; + } else { + cipher = kctx->initiator_enc; + aux_cipher = kctx->initiator_enc_aux; + cksum_key = kctx->initiator_integ; + usage = KG_USAGE_INITIATOR_SEAL; + } + blocksize = crypto_blkcipher_blocksize(cipher); + + + /* create a segment skipping the header and leaving out the checksum */ + xdr_buf_subsegment(buf, &subbuf, offset + GSS_KRB5_TOK_HDR_LEN, + (buf->len - offset - GSS_KRB5_TOK_HDR_LEN - + kctx->gk5e->cksumlength)); + + nblocks = (subbuf.len + blocksize - 1) / blocksize; + + cbcbytes = 0; + if (nblocks > 2) + cbcbytes = (nblocks - 2) * blocksize; + + memset(desc.iv, 0, sizeof(desc.iv)); + + if (cbcbytes) { + desc.fragno = 0; + desc.fraglen = 0; + desc.desc.info = desc.iv; + desc.desc.flags = 0; + desc.desc.tfm = aux_cipher; + + sg_init_table(desc.frags, 4); + + ret = xdr_process_buf(&subbuf, 0, cbcbytes, decryptor, &desc); + if (ret) + goto out_err; + } + + /* Make sure IV carries forward from any CBC results. */ + ret = gss_krb5_cts_crypt(cipher, &subbuf, cbcbytes, desc.iv, NULL, 0); + if (ret) + goto out_err; + + + /* Calculate our hmac over the plaintext data */ + our_hmac_obj.len = sizeof(our_hmac); + our_hmac_obj.data = our_hmac; + + ret = make_checksum_v2(kctx, NULL, 0, &subbuf, 0, + cksum_key, usage, &our_hmac_obj); + if (ret) + goto out_err; + + /* Get the packet's hmac value */ + ret = read_bytes_from_xdr_buf(buf, buf->len - kctx->gk5e->cksumlength, + pkt_hmac, kctx->gk5e->cksumlength); + if (ret) + goto out_err; + + if (memcmp(pkt_hmac, our_hmac, kctx->gk5e->cksumlength) != 0) { + ret = GSS_S_BAD_SIG; + goto out_err; + } + *headskip = kctx->gk5e->conflen; + *tailskip = kctx->gk5e->cksumlength; +out_err: + if (ret && ret != GSS_S_BAD_SIG) + ret = GSS_S_FAILURE; + return ret; +} + +/* + * Compute Kseq given the initial session key and the checksum. + * Set the key of the given cipher. 
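+ * (Two HMAC-MD5 passes, as implemented below: key the hash with the
+ * session key Ksess and digest a 4-byte zero constant to get an
+ * intermediate key, then re-key with that intermediate value and digest
+ * the 8-byte token checksum to produce Kseq.)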
+ */ +int +krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher, + unsigned char *cksum) +{ + struct crypto_hash *hmac; + struct hash_desc desc; + struct scatterlist sg[1]; + u8 Kseq[GSS_KRB5_MAX_KEYLEN]; + u32 zeroconstant = 0; + int err; + + dprintk("%s: entered\n", __func__); + + hmac = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(hmac)) { + dprintk("%s: error %ld, allocating hash '%s'\n", + __func__, PTR_ERR(hmac), kctx->gk5e->cksum_name); + return PTR_ERR(hmac); + } + + desc.tfm = hmac; + desc.flags = 0; + + err = crypto_hash_init(&desc); + if (err) + goto out_err; + + /* Compute intermediate Kseq from session key */ + err = crypto_hash_setkey(hmac, kctx->Ksess, kctx->gk5e->keylength); + if (err) + goto out_err; + + sg_init_table(sg, 1); + sg_set_buf(sg, &zeroconstant, 4); + + err = crypto_hash_digest(&desc, sg, 4, Kseq); + if (err) + goto out_err; + + /* Compute final Kseq from the checksum and intermediate Kseq */ + err = crypto_hash_setkey(hmac, Kseq, kctx->gk5e->keylength); + if (err) + goto out_err; + + sg_set_buf(sg, cksum, 8); + + err = crypto_hash_digest(&desc, sg, 8, Kseq); + if (err) + goto out_err; + + err = crypto_blkcipher_setkey(cipher, Kseq, kctx->gk5e->keylength); + if (err) + goto out_err; + + err = 0; + +out_err: + crypto_free_hash(hmac); + dprintk("%s: returning %d\n", __func__, err); + return err; +} + +/* + * Compute Kcrypt given the initial session key and the plaintext seqnum. + * Set the key of cipher kctx->enc. + */ +int +krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher, + s32 seqnum) +{ + struct crypto_hash *hmac; + struct hash_desc desc; + struct scatterlist sg[1]; + u8 Kcrypt[GSS_KRB5_MAX_KEYLEN]; + u8 zeroconstant[4] = {0}; + u8 seqnumarray[4]; + int err, i; + + dprintk("%s: entered, seqnum %u\n", __func__, seqnum); + + hmac = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(hmac)) { + dprintk("%s: error %ld, allocating hash '%s'\n", + __func__, PTR_ERR(hmac), kctx->gk5e->cksum_name); + return PTR_ERR(hmac); + } + + desc.tfm = hmac; + desc.flags = 0; + + err = crypto_hash_init(&desc); + if (err) + goto out_err; + + /* Compute intermediate Kcrypt from session key */ + for (i = 0; i < kctx->gk5e->keylength; i++) + Kcrypt[i] = kctx->Ksess[i] ^ 0xf0; + + err = crypto_hash_setkey(hmac, Kcrypt, kctx->gk5e->keylength); + if (err) + goto out_err; + + sg_init_table(sg, 1); + sg_set_buf(sg, zeroconstant, 4); + + err = crypto_hash_digest(&desc, sg, 4, Kcrypt); + if (err) + goto out_err; + + /* Compute final Kcrypt from the seqnum and intermediate Kcrypt */ + err = crypto_hash_setkey(hmac, Kcrypt, kctx->gk5e->keylength); + if (err) + goto out_err; + + seqnumarray[0] = (unsigned char) ((seqnum >> 24) & 0xff); + seqnumarray[1] = (unsigned char) ((seqnum >> 16) & 0xff); + seqnumarray[2] = (unsigned char) ((seqnum >> 8) & 0xff); + seqnumarray[3] = (unsigned char) ((seqnum >> 0) & 0xff); + + sg_set_buf(sg, seqnumarray, 4); + + err = crypto_hash_digest(&desc, sg, 4, Kcrypt); + if (err) + goto out_err; + + err = crypto_blkcipher_setkey(cipher, Kcrypt, kctx->gk5e->keylength); + if (err) + goto out_err; + + err = 0; + +out_err: + crypto_free_hash(hmac); + dprintk("%s: returning %d\n", __func__, err); + return err; +} + diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c new file mode 100644 index 00000000..76e42e6b --- /dev/null +++ b/net/sunrpc/auth_gss/gss_krb5_keys.c @@ -0,0 +1,336 @@ +/* + * COPYRIGHT (c) 2008 + * The Regents 
of the University of Michigan + * ALL RIGHTS RESERVED + * + * Permission is granted to use, copy, create derivative works + * and redistribute this software and such derivative works + * for any purpose, so long as the name of The University of + * Michigan is not used in any advertising or publicity + * pertaining to the use of distribution of this software + * without specific, written prior authorization. If the + * above copyright notice or any other identification of the + * University of Michigan is included in any copy of any + * portion of this software, then the disclaimer below must + * also be included. + * + * THIS SOFTWARE IS PROVIDED AS IS, WITHOUT REPRESENTATION + * FROM THE UNIVERSITY OF MICHIGAN AS TO ITS FITNESS FOR ANY + * PURPOSE, AND WITHOUT WARRANTY BY THE UNIVERSITY OF + * MICHIGAN OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING + * WITHOUT LIMITATION THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE + * REGENTS OF THE UNIVERSITY OF MICHIGAN SHALL NOT BE LIABLE + * FOR ANY DAMAGES, INCLUDING SPECIAL, INDIRECT, INCIDENTAL, OR + * CONSEQUENTIAL DAMAGES, WITH RESPECT TO ANY CLAIM ARISING + * OUT OF OR IN CONNECTION WITH THE USE OF THE SOFTWARE, EVEN + * IF IT HAS BEEN OR IS HEREAFTER ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGES. + */ + +/* + * Copyright (C) 1998 by the FundsXpress, INC. + * + * All rights reserved. + * + * Export of this software from the United States of America may require + * a specific license from the United States Government. It is the + * responsibility of any person or organization contemplating export to + * obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of FundsXpress. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. FundsXpress makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +/* + * This is the n-fold function as described in rfc3961, sec 5.1 + * Taken from MIT Kerberos and modified. 
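+ */
+
+/*
+ * Illustrative sketch only, not part of the original patch: how a caller
+ * might build the 5-byte RFC 3961 "well-known constant" (a 4-byte usage
+ * number plus a one-byte seed, mirroring set_cdata() in gss_krb5_mech.c)
+ * and feed it to krb5_derive_key() below.  It assumes the declarations
+ * from the sunrpc gss_krb5 header (struct gss_krb5_enctype, xdr_netobj,
+ * GSS_KRB5_K5CLENGTH, KEY_USAGE_SEED_ENCRYPTION); the caller is expected
+ * to have sized outkey->data and outkey->len for the enctype already.
+ */
+static u32 example_derive_seal_key(const struct gss_krb5_enctype *gk5e,
+				   const struct xdr_netobj *session_key,
+				   struct xdr_netobj *outkey, u32 usage)
+{
+	u8 cdata[GSS_KRB5_K5CLENGTH];
+	struct xdr_netobj c = { .len = sizeof(cdata), .data = cdata };
+
+	cdata[0] = (usage >> 24) & 0xff;
+	cdata[1] = (usage >> 16) & 0xff;
+	cdata[2] = (usage >> 8) & 0xff;
+	cdata[3] = usage & 0xff;
+	cdata[4] = KEY_USAGE_SEED_ENCRYPTION;
+
+	/* krb5_derive_key() n-folds the constant to the cipher blocksize,
+	 * runs the DK encryption loop, then calls gk5e->mk_key() to turn
+	 * the raw bytes (e.g. fixing DES3 parity) into the protocol key. */
+	return krb5_derive_key(gk5e, session_key, outkey, &c, GFP_NOFS);
+}
+
+/* The n-fold primitive used by krb5_derive_key():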
+ */ + +static void krb5_nfold(u32 inbits, const u8 *in, + u32 outbits, u8 *out) +{ + int a, b, c, lcm; + int byte, i, msbit; + + /* the code below is more readable if I make these bytes + instead of bits */ + + inbits >>= 3; + outbits >>= 3; + + /* first compute lcm(n,k) */ + + a = outbits; + b = inbits; + + while (b != 0) { + c = b; + b = a%b; + a = c; + } + + lcm = outbits*inbits/a; + + /* now do the real work */ + + memset(out, 0, outbits); + byte = 0; + + /* this will end up cycling through k lcm(k,n)/k times, which + is correct */ + for (i = lcm-1; i >= 0; i--) { + /* compute the msbit in k which gets added into this byte */ + msbit = ( + /* first, start with the msbit in the first, + * unrotated byte */ + ((inbits << 3) - 1) + /* then, for each byte, shift to the right + * for each repetition */ + + (((inbits << 3) + 13) * (i/inbits)) + /* last, pick out the correct byte within + * that shifted repetition */ + + ((inbits - (i % inbits)) << 3) + ) % (inbits << 3); + + /* pull out the byte value itself */ + byte += (((in[((inbits - 1) - (msbit >> 3)) % inbits] << 8)| + (in[((inbits) - (msbit >> 3)) % inbits])) + >> ((msbit & 7) + 1)) & 0xff; + + /* do the addition */ + byte += out[i % outbits]; + out[i % outbits] = byte & 0xff; + + /* keep around the carry bit, if any */ + byte >>= 8; + + } + + /* if there's a carry bit left over, add it back in */ + if (byte) { + for (i = outbits - 1; i >= 0; i--) { + /* do the addition */ + byte += out[i]; + out[i] = byte & 0xff; + + /* keep around the carry bit, if any */ + byte >>= 8; + } + } +} + +/* + * This is the DK (derive_key) function as described in rfc3961, sec 5.1 + * Taken from MIT Kerberos and modified. + */ + +u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e, + const struct xdr_netobj *inkey, + struct xdr_netobj *outkey, + const struct xdr_netobj *in_constant, + gfp_t gfp_mask) +{ + size_t blocksize, keybytes, keylength, n; + unsigned char *inblockdata, *outblockdata, *rawkey; + struct xdr_netobj inblock, outblock; + struct crypto_blkcipher *cipher; + u32 ret = EINVAL; + + blocksize = gk5e->blocksize; + keybytes = gk5e->keybytes; + keylength = gk5e->keylength; + + if ((inkey->len != keylength) || (outkey->len != keylength)) + goto err_return; + + cipher = crypto_alloc_blkcipher(gk5e->encrypt_name, 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(cipher)) + goto err_return; + if (crypto_blkcipher_setkey(cipher, inkey->data, inkey->len)) + goto err_return; + + /* allocate and set up buffers */ + + ret = ENOMEM; + inblockdata = kmalloc(blocksize, gfp_mask); + if (inblockdata == NULL) + goto err_free_cipher; + + outblockdata = kmalloc(blocksize, gfp_mask); + if (outblockdata == NULL) + goto err_free_in; + + rawkey = kmalloc(keybytes, gfp_mask); + if (rawkey == NULL) + goto err_free_out; + + inblock.data = (char *) inblockdata; + inblock.len = blocksize; + + outblock.data = (char *) outblockdata; + outblock.len = blocksize; + + /* initialize the input block */ + + if (in_constant->len == inblock.len) { + memcpy(inblock.data, in_constant->data, inblock.len); + } else { + krb5_nfold(in_constant->len * 8, in_constant->data, + inblock.len * 8, inblock.data); + } + + /* loop encrypting the blocks until enough key bytes are generated */ + + n = 0; + while (n < keybytes) { + (*(gk5e->encrypt))(cipher, NULL, inblock.data, + outblock.data, inblock.len); + + if ((keybytes - n) <= outblock.len) { + memcpy(rawkey + n, outblock.data, (keybytes - n)); + break; + } + + memcpy(rawkey + n, outblock.data, outblock.len); + memcpy(inblock.data, outblock.data, 
outblock.len); + n += outblock.len; + } + + /* postprocess the key */ + + inblock.data = (char *) rawkey; + inblock.len = keybytes; + + BUG_ON(gk5e->mk_key == NULL); + ret = (*(gk5e->mk_key))(gk5e, &inblock, outkey); + if (ret) { + dprintk("%s: got %d from mk_key function for '%s'\n", + __func__, ret, gk5e->encrypt_name); + goto err_free_raw; + } + + /* clean memory, free resources and exit */ + + ret = 0; + +err_free_raw: + memset(rawkey, 0, keybytes); + kfree(rawkey); +err_free_out: + memset(outblockdata, 0, blocksize); + kfree(outblockdata); +err_free_in: + memset(inblockdata, 0, blocksize); + kfree(inblockdata); +err_free_cipher: + crypto_free_blkcipher(cipher); +err_return: + return ret; +} + +#define smask(step) ((1<>step)&smask(step))) +#define parity_char(x) pstep(pstep(pstep((x), 4), 2), 1) + +static void mit_des_fixup_key_parity(u8 key[8]) +{ + int i; + for (i = 0; i < 8; i++) { + key[i] &= 0xfe; + key[i] |= 1^parity_char(key[i]); + } +} + +/* + * This is the des3 key derivation postprocess function + */ +u32 gss_krb5_des3_make_key(const struct gss_krb5_enctype *gk5e, + struct xdr_netobj *randombits, + struct xdr_netobj *key) +{ + int i; + u32 ret = EINVAL; + + if (key->len != 24) { + dprintk("%s: key->len is %d\n", __func__, key->len); + goto err_out; + } + if (randombits->len != 21) { + dprintk("%s: randombits->len is %d\n", + __func__, randombits->len); + goto err_out; + } + + /* take the seven bytes, move them around into the top 7 bits of the + 8 key bytes, then compute the parity bits. Do this three times. */ + + for (i = 0; i < 3; i++) { + memcpy(key->data + i*8, randombits->data + i*7, 7); + key->data[i*8+7] = (((key->data[i*8]&1)<<1) | + ((key->data[i*8+1]&1)<<2) | + ((key->data[i*8+2]&1)<<3) | + ((key->data[i*8+3]&1)<<4) | + ((key->data[i*8+4]&1)<<5) | + ((key->data[i*8+5]&1)<<6) | + ((key->data[i*8+6]&1)<<7)); + + mit_des_fixup_key_parity(key->data + i*8); + } + ret = 0; +err_out: + return ret; +} + +/* + * This is the aes key derivation postprocess function + */ +u32 gss_krb5_aes_make_key(const struct gss_krb5_enctype *gk5e, + struct xdr_netobj *randombits, + struct xdr_netobj *key) +{ + u32 ret = EINVAL; + + if (key->len != 16 && key->len != 32) { + dprintk("%s: key->len is %d\n", __func__, key->len); + goto err_out; + } + if (randombits->len != 16 && randombits->len != 32) { + dprintk("%s: randombits->len is %d\n", + __func__, randombits->len); + goto err_out; + } + if (randombits->len != key->len) { + dprintk("%s: randombits->len is %d, key->len is %d\n", + __func__, randombits->len, key->len); + goto err_out; + } + memcpy(key->data, randombits->data, key->len); + ret = 0; +err_out: + return ret; +} + diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c new file mode 100644 index 00000000..c3b75333 --- /dev/null +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -0,0 +1,774 @@ +/* + * linux/net/sunrpc/gss_krb5_mech.c + * + * Copyright (c) 2001-2008 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +static struct gss_api_mech gss_kerberos_mech; /* forward declaration */ + +static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = { + /* + * DES (All DES enctypes are mapped to the same gss functionality) + */ + { + .etype = ENCTYPE_DES_CBC_RAW, + .ctype = CKSUMTYPE_RSA_MD5, + .name = "des-cbc-crc", + .encrypt_name = "cbc(des)", + .cksum_name = "md5", + .encrypt = krb5_encrypt, + .decrypt = krb5_decrypt, + .mk_key = NULL, + .signalg = SGN_ALG_DES_MAC_MD5, + .sealalg = SEAL_ALG_DES, + .keybytes = 7, + .keylength = 8, + .blocksize = 8, + .conflen = 8, + .cksumlength = 8, + .keyed_cksum = 0, + }, + /* + * RC4-HMAC + */ + { + .etype = ENCTYPE_ARCFOUR_HMAC, + .ctype = CKSUMTYPE_HMAC_MD5_ARCFOUR, + .name = "rc4-hmac", + .encrypt_name = "ecb(arc4)", + .cksum_name = "hmac(md5)", + .encrypt = krb5_encrypt, + .decrypt = krb5_decrypt, + .mk_key = NULL, + .signalg = SGN_ALG_HMAC_MD5, + .sealalg = SEAL_ALG_MICROSOFT_RC4, + .keybytes = 16, + .keylength = 16, + .blocksize = 1, + .conflen = 8, + .cksumlength = 8, + .keyed_cksum = 1, + }, + /* + * 3DES + */ + { + .etype = ENCTYPE_DES3_CBC_RAW, + .ctype = CKSUMTYPE_HMAC_SHA1_DES3, + .name = "des3-hmac-sha1", + .encrypt_name = "cbc(des3_ede)", + .cksum_name = "hmac(sha1)", + .encrypt = krb5_encrypt, + .decrypt = krb5_decrypt, + .mk_key = gss_krb5_des3_make_key, + .signalg = SGN_ALG_HMAC_SHA1_DES3_KD, + .sealalg = SEAL_ALG_DES3KD, + .keybytes = 21, + .keylength = 24, + .blocksize = 8, + .conflen = 8, + .cksumlength = 20, + .keyed_cksum = 1, + }, + /* + * AES128 + */ + { + .etype = ENCTYPE_AES128_CTS_HMAC_SHA1_96, + .ctype = CKSUMTYPE_HMAC_SHA1_96_AES128, + .name = "aes128-cts", + .encrypt_name = "cts(cbc(aes))", + .cksum_name = "hmac(sha1)", + .encrypt = krb5_encrypt, + .decrypt = krb5_decrypt, + .mk_key = gss_krb5_aes_make_key, + .encrypt_v2 = gss_krb5_aes_encrypt, + .decrypt_v2 = gss_krb5_aes_decrypt, + .signalg = -1, + .sealalg = -1, + .keybytes = 16, + .keylength = 16, + .blocksize = 16, + .conflen = 16, + .cksumlength = 12, + .keyed_cksum = 1, + }, + /* + * AES256 + */ + { + .etype = ENCTYPE_AES256_CTS_HMAC_SHA1_96, + .ctype = CKSUMTYPE_HMAC_SHA1_96_AES256, + .name = "aes256-cts", + .encrypt_name = "cts(cbc(aes))", + 
.cksum_name = "hmac(sha1)", + .encrypt = krb5_encrypt, + .decrypt = krb5_decrypt, + .mk_key = gss_krb5_aes_make_key, + .encrypt_v2 = gss_krb5_aes_encrypt, + .decrypt_v2 = gss_krb5_aes_decrypt, + .signalg = -1, + .sealalg = -1, + .keybytes = 32, + .keylength = 32, + .blocksize = 16, + .conflen = 16, + .cksumlength = 12, + .keyed_cksum = 1, + }, +}; + +static const int num_supported_enctypes = + ARRAY_SIZE(supported_gss_krb5_enctypes); + +static int +supported_gss_krb5_enctype(int etype) +{ + int i; + for (i = 0; i < num_supported_enctypes; i++) + if (supported_gss_krb5_enctypes[i].etype == etype) + return 1; + return 0; +} + +static const struct gss_krb5_enctype * +get_gss_krb5_enctype(int etype) +{ + int i; + for (i = 0; i < num_supported_enctypes; i++) + if (supported_gss_krb5_enctypes[i].etype == etype) + return &supported_gss_krb5_enctypes[i]; + return NULL; +} + +static const void * +simple_get_bytes(const void *p, const void *end, void *res, int len) +{ + const void *q = (const void *)((const char *)p + len); + if (unlikely(q > end || q < p)) + return ERR_PTR(-EFAULT); + memcpy(res, p, len); + return q; +} + +static const void * +simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res) +{ + const void *q; + unsigned int len; + + p = simple_get_bytes(p, end, &len, sizeof(len)); + if (IS_ERR(p)) + return p; + q = (const void *)((const char *)p + len); + if (unlikely(q > end || q < p)) + return ERR_PTR(-EFAULT); + res->data = kmemdup(p, len, GFP_NOFS); + if (unlikely(res->data == NULL)) + return ERR_PTR(-ENOMEM); + res->len = len; + return q; +} + +static inline const void * +get_key(const void *p, const void *end, + struct krb5_ctx *ctx, struct crypto_blkcipher **res) +{ + struct xdr_netobj key; + int alg; + + p = simple_get_bytes(p, end, &alg, sizeof(alg)); + if (IS_ERR(p)) + goto out_err; + + switch (alg) { + case ENCTYPE_DES_CBC_CRC: + case ENCTYPE_DES_CBC_MD4: + case ENCTYPE_DES_CBC_MD5: + /* Map all these key types to ENCTYPE_DES_CBC_RAW */ + alg = ENCTYPE_DES_CBC_RAW; + break; + } + + if (!supported_gss_krb5_enctype(alg)) { + printk(KERN_WARNING "gss_kerberos_mech: unsupported " + "encryption key algorithm %d\n", alg); + p = ERR_PTR(-EINVAL); + goto out_err; + } + p = simple_get_netobj(p, end, &key); + if (IS_ERR(p)) + goto out_err; + + *res = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(*res)) { + printk(KERN_WARNING "gss_kerberos_mech: unable to initialize " + "crypto algorithm %s\n", ctx->gk5e->encrypt_name); + *res = NULL; + goto out_err_free_key; + } + if (crypto_blkcipher_setkey(*res, key.data, key.len)) { + printk(KERN_WARNING "gss_kerberos_mech: error setting key for " + "crypto algorithm %s\n", ctx->gk5e->encrypt_name); + goto out_err_free_tfm; + } + + kfree(key.data); + return p; + +out_err_free_tfm: + crypto_free_blkcipher(*res); +out_err_free_key: + kfree(key.data); + p = ERR_PTR(-EINVAL); +out_err: + return p; +} + +static int +gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx) +{ + int tmp; + + p = simple_get_bytes(p, end, &ctx->initiate, sizeof(ctx->initiate)); + if (IS_ERR(p)) + goto out_err; + + /* Old format supports only DES! 
Any other enctype uses new format */ + ctx->enctype = ENCTYPE_DES_CBC_RAW; + + ctx->gk5e = get_gss_krb5_enctype(ctx->enctype); + if (ctx->gk5e == NULL) { + p = ERR_PTR(-EINVAL); + goto out_err; + } + + /* The downcall format was designed before we completely understood + * the uses of the context fields; so it includes some stuff we + * just give some minimal sanity-checking, and some we ignore + * completely (like the next twenty bytes): */ + if (unlikely(p + 20 > end || p + 20 < p)) { + p = ERR_PTR(-EFAULT); + goto out_err; + } + p += 20; + p = simple_get_bytes(p, end, &tmp, sizeof(tmp)); + if (IS_ERR(p)) + goto out_err; + if (tmp != SGN_ALG_DES_MAC_MD5) { + p = ERR_PTR(-ENOSYS); + goto out_err; + } + p = simple_get_bytes(p, end, &tmp, sizeof(tmp)); + if (IS_ERR(p)) + goto out_err; + if (tmp != SEAL_ALG_DES) { + p = ERR_PTR(-ENOSYS); + goto out_err; + } + p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime)); + if (IS_ERR(p)) + goto out_err; + p = simple_get_bytes(p, end, &ctx->seq_send, sizeof(ctx->seq_send)); + if (IS_ERR(p)) + goto out_err; + p = simple_get_netobj(p, end, &ctx->mech_used); + if (IS_ERR(p)) + goto out_err; + p = get_key(p, end, ctx, &ctx->enc); + if (IS_ERR(p)) + goto out_err_free_mech; + p = get_key(p, end, ctx, &ctx->seq); + if (IS_ERR(p)) + goto out_err_free_key1; + if (p != end) { + p = ERR_PTR(-EFAULT); + goto out_err_free_key2; + } + + return 0; + +out_err_free_key2: + crypto_free_blkcipher(ctx->seq); +out_err_free_key1: + crypto_free_blkcipher(ctx->enc); +out_err_free_mech: + kfree(ctx->mech_used.data); +out_err: + return PTR_ERR(p); +} + +struct crypto_blkcipher * +context_v2_alloc_cipher(struct krb5_ctx *ctx, const char *cname, u8 *key) +{ + struct crypto_blkcipher *cp; + + cp = crypto_alloc_blkcipher(cname, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(cp)) { + dprintk("gss_kerberos_mech: unable to initialize " + "crypto algorithm %s\n", cname); + return NULL; + } + if (crypto_blkcipher_setkey(cp, key, ctx->gk5e->keylength)) { + dprintk("gss_kerberos_mech: error setting key for " + "crypto algorithm %s\n", cname); + crypto_free_blkcipher(cp); + return NULL; + } + return cp; +} + +static inline void +set_cdata(u8 cdata[GSS_KRB5_K5CLENGTH], u32 usage, u8 seed) +{ + cdata[0] = (usage>>24)&0xff; + cdata[1] = (usage>>16)&0xff; + cdata[2] = (usage>>8)&0xff; + cdata[3] = usage&0xff; + cdata[4] = seed; +} + +static int +context_derive_keys_des3(struct krb5_ctx *ctx, gfp_t gfp_mask) +{ + struct xdr_netobj c, keyin, keyout; + u8 cdata[GSS_KRB5_K5CLENGTH]; + u32 err; + + c.len = GSS_KRB5_K5CLENGTH; + c.data = cdata; + + keyin.data = ctx->Ksess; + keyin.len = ctx->gk5e->keylength; + keyout.len = ctx->gk5e->keylength; + + /* seq uses the raw key */ + ctx->seq = context_v2_alloc_cipher(ctx, ctx->gk5e->encrypt_name, + ctx->Ksess); + if (ctx->seq == NULL) + goto out_err; + + ctx->enc = context_v2_alloc_cipher(ctx, ctx->gk5e->encrypt_name, + ctx->Ksess); + if (ctx->enc == NULL) + goto out_free_seq; + + /* derive cksum */ + set_cdata(cdata, KG_USAGE_SIGN, KEY_USAGE_SEED_CHECKSUM); + keyout.data = ctx->cksum; + err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); + if (err) { + dprintk("%s: Error %d deriving cksum key\n", + __func__, err); + goto out_free_enc; + } + + return 0; + +out_free_enc: + crypto_free_blkcipher(ctx->enc); +out_free_seq: + crypto_free_blkcipher(ctx->seq); +out_err: + return -EINVAL; +} + +/* + * Note that RC4 depends on deriving keys using the sequence + * number or the checksum of a token. 
Therefore, the final keys + * cannot be calculated until the token is being constructed! + */ +static int +context_derive_keys_rc4(struct krb5_ctx *ctx) +{ + struct crypto_hash *hmac; + char sigkeyconstant[] = "signaturekey"; + int slen = strlen(sigkeyconstant) + 1; /* include null terminator */ + struct hash_desc desc; + struct scatterlist sg[1]; + int err; + + dprintk("RPC: %s: entered\n", __func__); + /* + * derive cksum (aka Ksign) key + */ + hmac = crypto_alloc_hash(ctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(hmac)) { + dprintk("%s: error %ld allocating hash '%s'\n", + __func__, PTR_ERR(hmac), ctx->gk5e->cksum_name); + err = PTR_ERR(hmac); + goto out_err; + } + + err = crypto_hash_setkey(hmac, ctx->Ksess, ctx->gk5e->keylength); + if (err) + goto out_err_free_hmac; + + sg_init_table(sg, 1); + sg_set_buf(sg, sigkeyconstant, slen); + + desc.tfm = hmac; + desc.flags = 0; + + err = crypto_hash_init(&desc); + if (err) + goto out_err_free_hmac; + + err = crypto_hash_digest(&desc, sg, slen, ctx->cksum); + if (err) + goto out_err_free_hmac; + /* + * allocate hash, and blkciphers for data and seqnum encryption + */ + ctx->enc = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(ctx->enc)) { + err = PTR_ERR(ctx->enc); + goto out_err_free_hmac; + } + + ctx->seq = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(ctx->seq)) { + crypto_free_blkcipher(ctx->enc); + err = PTR_ERR(ctx->seq); + goto out_err_free_hmac; + } + + dprintk("RPC: %s: returning success\n", __func__); + + err = 0; + +out_err_free_hmac: + crypto_free_hash(hmac); +out_err: + dprintk("RPC: %s: returning %d\n", __func__, err); + return err; +} + +static int +context_derive_keys_new(struct krb5_ctx *ctx, gfp_t gfp_mask) +{ + struct xdr_netobj c, keyin, keyout; + u8 cdata[GSS_KRB5_K5CLENGTH]; + u32 err; + + c.len = GSS_KRB5_K5CLENGTH; + c.data = cdata; + + keyin.data = ctx->Ksess; + keyin.len = ctx->gk5e->keylength; + keyout.len = ctx->gk5e->keylength; + + /* initiator seal encryption */ + set_cdata(cdata, KG_USAGE_INITIATOR_SEAL, KEY_USAGE_SEED_ENCRYPTION); + keyout.data = ctx->initiator_seal; + err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); + if (err) { + dprintk("%s: Error %d deriving initiator_seal key\n", + __func__, err); + goto out_err; + } + ctx->initiator_enc = context_v2_alloc_cipher(ctx, + ctx->gk5e->encrypt_name, + ctx->initiator_seal); + if (ctx->initiator_enc == NULL) + goto out_err; + + /* acceptor seal encryption */ + set_cdata(cdata, KG_USAGE_ACCEPTOR_SEAL, KEY_USAGE_SEED_ENCRYPTION); + keyout.data = ctx->acceptor_seal; + err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); + if (err) { + dprintk("%s: Error %d deriving acceptor_seal key\n", + __func__, err); + goto out_free_initiator_enc; + } + ctx->acceptor_enc = context_v2_alloc_cipher(ctx, + ctx->gk5e->encrypt_name, + ctx->acceptor_seal); + if (ctx->acceptor_enc == NULL) + goto out_free_initiator_enc; + + /* initiator sign checksum */ + set_cdata(cdata, KG_USAGE_INITIATOR_SIGN, KEY_USAGE_SEED_CHECKSUM); + keyout.data = ctx->initiator_sign; + err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); + if (err) { + dprintk("%s: Error %d deriving initiator_sign key\n", + __func__, err); + goto out_free_acceptor_enc; + } + + /* acceptor sign checksum */ + set_cdata(cdata, KG_USAGE_ACCEPTOR_SIGN, KEY_USAGE_SEED_CHECKSUM); + keyout.data = ctx->acceptor_sign; + err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); + if (err) { + dprintk("%s: 
Error %d deriving acceptor_sign key\n", + __func__, err); + goto out_free_acceptor_enc; + } + + /* initiator seal integrity */ + set_cdata(cdata, KG_USAGE_INITIATOR_SEAL, KEY_USAGE_SEED_INTEGRITY); + keyout.data = ctx->initiator_integ; + err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); + if (err) { + dprintk("%s: Error %d deriving initiator_integ key\n", + __func__, err); + goto out_free_acceptor_enc; + } + + /* acceptor seal integrity */ + set_cdata(cdata, KG_USAGE_ACCEPTOR_SEAL, KEY_USAGE_SEED_INTEGRITY); + keyout.data = ctx->acceptor_integ; + err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask); + if (err) { + dprintk("%s: Error %d deriving acceptor_integ key\n", + __func__, err); + goto out_free_acceptor_enc; + } + + switch (ctx->enctype) { + case ENCTYPE_AES128_CTS_HMAC_SHA1_96: + case ENCTYPE_AES256_CTS_HMAC_SHA1_96: + ctx->initiator_enc_aux = + context_v2_alloc_cipher(ctx, "cbc(aes)", + ctx->initiator_seal); + if (ctx->initiator_enc_aux == NULL) + goto out_free_acceptor_enc; + ctx->acceptor_enc_aux = + context_v2_alloc_cipher(ctx, "cbc(aes)", + ctx->acceptor_seal); + if (ctx->acceptor_enc_aux == NULL) { + crypto_free_blkcipher(ctx->initiator_enc_aux); + goto out_free_acceptor_enc; + } + } + + return 0; + +out_free_acceptor_enc: + crypto_free_blkcipher(ctx->acceptor_enc); +out_free_initiator_enc: + crypto_free_blkcipher(ctx->initiator_enc); +out_err: + return -EINVAL; +} + +static int +gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, + gfp_t gfp_mask) +{ + int keylen; + + p = simple_get_bytes(p, end, &ctx->flags, sizeof(ctx->flags)); + if (IS_ERR(p)) + goto out_err; + ctx->initiate = ctx->flags & KRB5_CTX_FLAG_INITIATOR; + + p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime)); + if (IS_ERR(p)) + goto out_err; + p = simple_get_bytes(p, end, &ctx->seq_send64, sizeof(ctx->seq_send64)); + if (IS_ERR(p)) + goto out_err; + /* set seq_send for use by "older" enctypes */ + ctx->seq_send = ctx->seq_send64; + if (ctx->seq_send64 != ctx->seq_send) { + dprintk("%s: seq_send64 %lx, seq_send %x overflow?\n", __func__, + (long unsigned)ctx->seq_send64, ctx->seq_send); + p = ERR_PTR(-EINVAL); + goto out_err; + } + p = simple_get_bytes(p, end, &ctx->enctype, sizeof(ctx->enctype)); + if (IS_ERR(p)) + goto out_err; + /* Map ENCTYPE_DES3_CBC_SHA1 to ENCTYPE_DES3_CBC_RAW */ + if (ctx->enctype == ENCTYPE_DES3_CBC_SHA1) + ctx->enctype = ENCTYPE_DES3_CBC_RAW; + ctx->gk5e = get_gss_krb5_enctype(ctx->enctype); + if (ctx->gk5e == NULL) { + dprintk("gss_kerberos_mech: unsupported krb5 enctype %u\n", + ctx->enctype); + p = ERR_PTR(-EINVAL); + goto out_err; + } + keylen = ctx->gk5e->keylength; + + p = simple_get_bytes(p, end, ctx->Ksess, keylen); + if (IS_ERR(p)) + goto out_err; + + if (p != end) { + p = ERR_PTR(-EINVAL); + goto out_err; + } + + ctx->mech_used.data = kmemdup(gss_kerberos_mech.gm_oid.data, + gss_kerberos_mech.gm_oid.len, gfp_mask); + if (unlikely(ctx->mech_used.data == NULL)) { + p = ERR_PTR(-ENOMEM); + goto out_err; + } + ctx->mech_used.len = gss_kerberos_mech.gm_oid.len; + + switch (ctx->enctype) { + case ENCTYPE_DES3_CBC_RAW: + return context_derive_keys_des3(ctx, gfp_mask); + case ENCTYPE_ARCFOUR_HMAC: + return context_derive_keys_rc4(ctx); + case ENCTYPE_AES128_CTS_HMAC_SHA1_96: + case ENCTYPE_AES256_CTS_HMAC_SHA1_96: + return context_derive_keys_new(ctx, gfp_mask); + default: + return -EINVAL; + } + +out_err: + return PTR_ERR(p); +} + +static int +gss_import_sec_context_kerberos(const void *p, size_t len, + struct gss_ctx 
*ctx_id, + gfp_t gfp_mask) +{ + const void *end = (const void *)((const char *)p + len); + struct krb5_ctx *ctx; + int ret; + + ctx = kzalloc(sizeof(*ctx), gfp_mask); + if (ctx == NULL) + return -ENOMEM; + + if (len == 85) + ret = gss_import_v1_context(p, end, ctx); + else + ret = gss_import_v2_context(p, end, ctx, gfp_mask); + + if (ret == 0) + ctx_id->internal_ctx_id = ctx; + else + kfree(ctx); + + dprintk("RPC: %s: returning %d\n", __func__, ret); + return ret; +} + +static void +gss_delete_sec_context_kerberos(void *internal_ctx) { + struct krb5_ctx *kctx = internal_ctx; + + crypto_free_blkcipher(kctx->seq); + crypto_free_blkcipher(kctx->enc); + crypto_free_blkcipher(kctx->acceptor_enc); + crypto_free_blkcipher(kctx->initiator_enc); + crypto_free_blkcipher(kctx->acceptor_enc_aux); + crypto_free_blkcipher(kctx->initiator_enc_aux); + kfree(kctx->mech_used.data); + kfree(kctx); +} + +static const struct gss_api_ops gss_kerberos_ops = { + .gss_import_sec_context = gss_import_sec_context_kerberos, + .gss_get_mic = gss_get_mic_kerberos, + .gss_verify_mic = gss_verify_mic_kerberos, + .gss_wrap = gss_wrap_kerberos, + .gss_unwrap = gss_unwrap_kerberos, + .gss_delete_sec_context = gss_delete_sec_context_kerberos, +}; + +static struct pf_desc gss_kerberos_pfs[] = { + [0] = { + .pseudoflavor = RPC_AUTH_GSS_KRB5, + .service = RPC_GSS_SVC_NONE, + .name = "krb5", + }, + [1] = { + .pseudoflavor = RPC_AUTH_GSS_KRB5I, + .service = RPC_GSS_SVC_INTEGRITY, + .name = "krb5i", + }, + [2] = { + .pseudoflavor = RPC_AUTH_GSS_KRB5P, + .service = RPC_GSS_SVC_PRIVACY, + .name = "krb5p", + }, +}; + +static struct gss_api_mech gss_kerberos_mech = { + .gm_name = "krb5", + .gm_owner = THIS_MODULE, + .gm_oid = {9, (void *)"\x2a\x86\x48\x86\xf7\x12\x01\x02\x02"}, + .gm_ops = &gss_kerberos_ops, + .gm_pf_num = ARRAY_SIZE(gss_kerberos_pfs), + .gm_pfs = gss_kerberos_pfs, + .gm_upcall_enctypes = KRB5_SUPPORTED_ENCTYPES, +}; + +static int __init init_kerberos_module(void) +{ + int status; + + status = gss_mech_register(&gss_kerberos_mech); + if (status) + printk("Failed to register kerberos gss mechanism!\n"); + return status; +} + +static void __exit cleanup_kerberos_module(void) +{ + gss_mech_unregister(&gss_kerberos_mech); +} + +MODULE_LICENSE("GPL"); +module_init(init_kerberos_module); +module_exit(cleanup_kerberos_module); diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c new file mode 100644 index 00000000..d7941eab --- /dev/null +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -0,0 +1,223 @@ +/* + * linux/net/sunrpc/gss_krb5_seal.c + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/k5seal.c + * + * Copyright (c) 2000-2008 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * J. Bruce Fields + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. 
+ * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * Copyright (C) 1998 by the FundsXpress, INC. + * + * All rights reserved. + * + * Export of this software from the United States of America may require + * a specific license from the United States Government. It is the + * responsibility of any person or organization contemplating export to + * obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of FundsXpress. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. FundsXpress makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +DEFINE_SPINLOCK(krb5_seq_lock); + +static char * +setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token) +{ + __be16 *ptr, *krb5_hdr; + int body_size = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength; + + token->len = g_token_size(&ctx->mech_used, body_size); + + ptr = (__be16 *)token->data; + g_make_token_header(&ctx->mech_used, body_size, (unsigned char **)&ptr); + + /* ptr now at start of header described in rfc 1964, section 1.2.1: */ + krb5_hdr = ptr; + *ptr++ = KG_TOK_MIC_MSG; + *ptr++ = cpu_to_le16(ctx->gk5e->signalg); + *ptr++ = SEAL_ALG_NONE; + *ptr++ = 0xffff; + + return (char *)krb5_hdr; +} + +static void * +setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token) +{ + __be16 *ptr, *krb5_hdr; + u8 *p, flags = 0x00; + + if ((ctx->flags & KRB5_CTX_FLAG_INITIATOR) == 0) + flags |= 0x01; + if (ctx->flags & KRB5_CTX_FLAG_ACCEPTOR_SUBKEY) + flags |= 0x04; + + /* Per rfc 4121, sec 4.2.6.1, there is no header, + * just start the token */ + krb5_hdr = ptr = (__be16 *)token->data; + + *ptr++ = KG2_TOK_MIC; + p = (u8 *)ptr; + *p++ = flags; + *p++ = 0xff; + ptr = (__be16 *)p; + *ptr++ = 0xffff; + *ptr++ = 0xffff; + + token->len = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength; + return krb5_hdr; +} + +static u32 +gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text, + struct xdr_netobj *token) +{ + char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; + struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), + .data = cksumdata}; + void *ptr; + s32 now; + u32 seq_send; + u8 *cksumkey; + + dprintk("RPC: %s\n", __func__); + BUG_ON(ctx == NULL); + + now = get_seconds(); + + ptr = setup_token(ctx, token); + + if (ctx->gk5e->keyed_cksum) + cksumkey = ctx->cksum; + else + cksumkey = NULL; + + if 
(make_checksum(ctx, ptr, 8, text, 0, cksumkey, + KG_USAGE_SIGN, &md5cksum)) + return GSS_S_FAILURE; + + memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); + + spin_lock(&krb5_seq_lock); + seq_send = ctx->seq_send++; + spin_unlock(&krb5_seq_lock); + + if (krb5_make_seq_num(ctx, ctx->seq, ctx->initiate ? 0 : 0xff, + seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8)) + return GSS_S_FAILURE; + + return (ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; +} + +u32 +gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, + struct xdr_netobj *token) +{ + char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; + struct xdr_netobj cksumobj = { .len = sizeof(cksumdata), + .data = cksumdata}; + void *krb5_hdr; + s32 now; + u64 seq_send; + u8 *cksumkey; + unsigned int cksum_usage; + + dprintk("RPC: %s\n", __func__); + + krb5_hdr = setup_token_v2(ctx, token); + + /* Set up the sequence number. Now 64-bits in clear + * text and w/o direction indicator */ + spin_lock(&krb5_seq_lock); + seq_send = ctx->seq_send64++; + spin_unlock(&krb5_seq_lock); + *((u64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send); + + if (ctx->initiate) { + cksumkey = ctx->initiator_sign; + cksum_usage = KG_USAGE_INITIATOR_SIGN; + } else { + cksumkey = ctx->acceptor_sign; + cksum_usage = KG_USAGE_ACCEPTOR_SIGN; + } + + if (make_checksum_v2(ctx, krb5_hdr, GSS_KRB5_TOK_HDR_LEN, + text, 0, cksumkey, cksum_usage, &cksumobj)) + return GSS_S_FAILURE; + + memcpy(krb5_hdr + GSS_KRB5_TOK_HDR_LEN, cksumobj.data, cksumobj.len); + + now = get_seconds(); + + return (ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; +} + +u32 +gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text, + struct xdr_netobj *token) +{ + struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; + + switch (ctx->enctype) { + default: + BUG(); + case ENCTYPE_DES_CBC_RAW: + case ENCTYPE_DES3_CBC_RAW: + case ENCTYPE_ARCFOUR_HMAC: + return gss_get_mic_v1(ctx, text, token); + case ENCTYPE_AES128_CTS_HMAC_SHA1_96: + case ENCTYPE_AES256_CTS_HMAC_SHA1_96: + return gss_get_mic_v2(ctx, text, token); + } +} + diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c new file mode 100644 index 00000000..62ac90c6 --- /dev/null +++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c @@ -0,0 +1,166 @@ +/* + * linux/net/sunrpc/gss_krb5_seqnum.c + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/util_seqnum.c + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. 
+ * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +static s32 +krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum, + unsigned char *cksum, unsigned char *buf) +{ + struct crypto_blkcipher *cipher; + unsigned char plain[8]; + s32 code; + + dprintk("RPC: %s:\n", __func__); + cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(cipher)) + return PTR_ERR(cipher); + + plain[0] = (unsigned char) ((seqnum >> 24) & 0xff); + plain[1] = (unsigned char) ((seqnum >> 16) & 0xff); + plain[2] = (unsigned char) ((seqnum >> 8) & 0xff); + plain[3] = (unsigned char) ((seqnum >> 0) & 0xff); + plain[4] = direction; + plain[5] = direction; + plain[6] = direction; + plain[7] = direction; + + code = krb5_rc4_setup_seq_key(kctx, cipher, cksum); + if (code) + goto out; + + code = krb5_encrypt(cipher, cksum, plain, buf, 8); +out: + crypto_free_blkcipher(cipher); + return code; +} +s32 +krb5_make_seq_num(struct krb5_ctx *kctx, + struct crypto_blkcipher *key, + int direction, + u32 seqnum, + unsigned char *cksum, unsigned char *buf) +{ + unsigned char plain[8]; + + if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) + return krb5_make_rc4_seq_num(kctx, direction, seqnum, + cksum, buf); + + plain[0] = (unsigned char) (seqnum & 0xff); + plain[1] = (unsigned char) ((seqnum >> 8) & 0xff); + plain[2] = (unsigned char) ((seqnum >> 16) & 0xff); + plain[3] = (unsigned char) ((seqnum >> 24) & 0xff); + + plain[4] = direction; + plain[5] = direction; + plain[6] = direction; + plain[7] = direction; + + return krb5_encrypt(key, cksum, plain, buf, 8); +} + +static s32 +krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum, + unsigned char *buf, int *direction, s32 *seqnum) +{ + struct crypto_blkcipher *cipher; + unsigned char plain[8]; + s32 code; + + dprintk("RPC: %s:\n", __func__); + cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(cipher)) + return PTR_ERR(cipher); + + code = krb5_rc4_setup_seq_key(kctx, cipher, cksum); + if (code) + goto out; + + code = krb5_decrypt(cipher, cksum, buf, plain, 8); + if (code) + goto out; + + if ((plain[4] != plain[5]) || (plain[4] != plain[6]) + || (plain[4] != plain[7])) { + code = (s32)KG_BAD_SEQ; + goto out; + } + + *direction = plain[4]; + + *seqnum = ((plain[0] << 24) | (plain[1] << 16) | + (plain[2] << 8) | (plain[3])); +out: + crypto_free_blkcipher(cipher); + return code; +} + +s32 +krb5_get_seq_num(struct krb5_ctx *kctx, + unsigned char *cksum, + unsigned char *buf, + int *direction, u32 *seqnum) +{ + s32 code; + unsigned char plain[8]; + struct crypto_blkcipher *key = kctx->seq; + + dprintk("RPC: krb5_get_seq_num:\n"); + + if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) + return krb5_get_rc4_seq_num(kctx, cksum, buf, + direction, seqnum); + + if ((code = krb5_decrypt(key, cksum, buf, plain, 8))) + return code; + + if ((plain[4] != plain[5]) || (plain[4] != plain[6]) || + (plain[4] != plain[7])) + return (s32)KG_BAD_SEQ; + + *direction = plain[4]; + + *seqnum = 
((plain[0]) | + (plain[1] << 8) | (plain[2] << 16) | (plain[3] << 24)); + + return 0; +} diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c new file mode 100644 index 00000000..6cd930f3 --- /dev/null +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -0,0 +1,226 @@ +/* + * linux/net/sunrpc/gss_krb5_unseal.c + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/k5unseal.c + * + * Copyright (c) 2000-2008 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. + * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * Copyright (C) 1998 by the FundsXpress, INC. + * + * All rights reserved. + * + * Export of this software from the United States of America may require + * a specific license from the United States Government. It is the + * responsibility of any person or organization contemplating export to + * obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of FundsXpress. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. FundsXpress makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + + +/* read_token is a mic token, and message_buffer is the data that the mic was + * supposedly taken over. 
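+ */
+
+/*
+ * For orientation, an editor's sketch (not defined by the original patch)
+ * of the token body that gss_verify_mic_v1() below picks apart, as emitted
+ * by setup_token() in gss_krb5_seal.c after the generic ASN.1 token header
+ * (rfc 1964, sec 1.2.1).  The two-byte algorithm fields are stored
+ * least-significant byte first by the seal side (cpu_to_le16()), matching
+ * the byte-wise checks below.
+ */
+struct krb5_v1_mic_token_body {
+	u8 tok_id[2];		/* 0x01 0x01, i.e. KG_TOK_MIC_MSG */
+	u8 signalg[2];		/* ctx->gk5e->signalg */
+	u8 sealalg[2];		/* 0xff 0xff, SEAL_ALG_NONE for a MIC */
+	u8 filler[2];		/* 0xff 0xff */
+	u8 snd_seq[8];		/* encrypted sequence number */
+	u8 cksum[];		/* ctx->gk5e->cksumlength checksum bytes */
+};
+
+/* The v1 (rfc 1964) and v2 (rfc 4121) verification paths follow.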
*/ + +static u32 +gss_verify_mic_v1(struct krb5_ctx *ctx, + struct xdr_buf *message_buffer, struct xdr_netobj *read_token) +{ + int signalg; + int sealalg; + char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; + struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), + .data = cksumdata}; + s32 now; + int direction; + u32 seqnum; + unsigned char *ptr = (unsigned char *)read_token->data; + int bodysize; + u8 *cksumkey; + + dprintk("RPC: krb5_read_token\n"); + + if (g_verify_token_header(&ctx->mech_used, &bodysize, &ptr, + read_token->len)) + return GSS_S_DEFECTIVE_TOKEN; + + if ((ptr[0] != ((KG_TOK_MIC_MSG >> 8) & 0xff)) || + (ptr[1] != (KG_TOK_MIC_MSG & 0xff))) + return GSS_S_DEFECTIVE_TOKEN; + + /* XXX sanity-check bodysize?? */ + + signalg = ptr[2] + (ptr[3] << 8); + if (signalg != ctx->gk5e->signalg) + return GSS_S_DEFECTIVE_TOKEN; + + sealalg = ptr[4] + (ptr[5] << 8); + if (sealalg != SEAL_ALG_NONE) + return GSS_S_DEFECTIVE_TOKEN; + + if ((ptr[6] != 0xff) || (ptr[7] != 0xff)) + return GSS_S_DEFECTIVE_TOKEN; + + if (ctx->gk5e->keyed_cksum) + cksumkey = ctx->cksum; + else + cksumkey = NULL; + + if (make_checksum(ctx, ptr, 8, message_buffer, 0, + cksumkey, KG_USAGE_SIGN, &md5cksum)) + return GSS_S_FAILURE; + + if (memcmp(md5cksum.data, ptr + GSS_KRB5_TOK_HDR_LEN, + ctx->gk5e->cksumlength)) + return GSS_S_BAD_SIG; + + /* it got through unscathed. Make sure the context is unexpired */ + + now = get_seconds(); + + if (now > ctx->endtime) + return GSS_S_CONTEXT_EXPIRED; + + /* do sequencing checks */ + + if (krb5_get_seq_num(ctx, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8, + &direction, &seqnum)) + return GSS_S_FAILURE; + + if ((ctx->initiate && direction != 0xff) || + (!ctx->initiate && direction != 0)) + return GSS_S_BAD_SIG; + + return GSS_S_COMPLETE; +} + +static u32 +gss_verify_mic_v2(struct krb5_ctx *ctx, + struct xdr_buf *message_buffer, struct xdr_netobj *read_token) +{ + char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; + struct xdr_netobj cksumobj = {.len = sizeof(cksumdata), + .data = cksumdata}; + s32 now; + u64 seqnum; + u8 *ptr = read_token->data; + u8 *cksumkey; + u8 flags; + int i; + unsigned int cksum_usage; + + dprintk("RPC: %s\n", __func__); + + if (be16_to_cpu(*((__be16 *)ptr)) != KG2_TOK_MIC) + return GSS_S_DEFECTIVE_TOKEN; + + flags = ptr[2]; + if ((!ctx->initiate && (flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR)) || + (ctx->initiate && !(flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR))) + return GSS_S_BAD_SIG; + + if (flags & KG2_TOKEN_FLAG_SEALED) { + dprintk("%s: token has unexpected sealed flag\n", __func__); + return GSS_S_FAILURE; + } + + for (i = 3; i < 8; i++) + if (ptr[i] != 0xff) + return GSS_S_DEFECTIVE_TOKEN; + + if (ctx->initiate) { + cksumkey = ctx->acceptor_sign; + cksum_usage = KG_USAGE_ACCEPTOR_SIGN; + } else { + cksumkey = ctx->initiator_sign; + cksum_usage = KG_USAGE_INITIATOR_SIGN; + } + + if (make_checksum_v2(ctx, ptr, GSS_KRB5_TOK_HDR_LEN, message_buffer, 0, + cksumkey, cksum_usage, &cksumobj)) + return GSS_S_FAILURE; + + if (memcmp(cksumobj.data, ptr + GSS_KRB5_TOK_HDR_LEN, + ctx->gk5e->cksumlength)) + return GSS_S_BAD_SIG; + + /* it got through unscathed. 
Make sure the context is unexpired */ + now = get_seconds(); + if (now > ctx->endtime) + return GSS_S_CONTEXT_EXPIRED; + + /* do sequencing checks */ + + seqnum = be64_to_cpup((__be64 *)ptr + 8); + + return GSS_S_COMPLETE; +} + +u32 +gss_verify_mic_kerberos(struct gss_ctx *gss_ctx, + struct xdr_buf *message_buffer, + struct xdr_netobj *read_token) +{ + struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; + + switch (ctx->enctype) { + default: + BUG(); + case ENCTYPE_DES_CBC_RAW: + case ENCTYPE_DES3_CBC_RAW: + case ENCTYPE_ARCFOUR_HMAC: + return gss_verify_mic_v1(ctx, message_buffer, read_token); + case ENCTYPE_AES128_CTS_HMAC_SHA1_96: + case ENCTYPE_AES256_CTS_HMAC_SHA1_96: + return gss_verify_mic_v2(ctx, message_buffer, read_token); + } +} + diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c new file mode 100644 index 00000000..2763e3e4 --- /dev/null +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -0,0 +1,587 @@ +/* + * COPYRIGHT (c) 2008 + * The Regents of the University of Michigan + * ALL RIGHTS RESERVED + * + * Permission is granted to use, copy, create derivative works + * and redistribute this software and such derivative works + * for any purpose, so long as the name of The University of + * Michigan is not used in any advertising or publicity + * pertaining to the use of distribution of this software + * without specific, written prior authorization. If the + * above copyright notice or any other identification of the + * University of Michigan is included in any copy of any + * portion of this software, then the disclaimer below must + * also be included. + * + * THIS SOFTWARE IS PROVIDED AS IS, WITHOUT REPRESENTATION + * FROM THE UNIVERSITY OF MICHIGAN AS TO ITS FITNESS FOR ANY + * PURPOSE, AND WITHOUT WARRANTY BY THE UNIVERSITY OF + * MICHIGAN OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING + * WITHOUT LIMITATION THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE + * REGENTS OF THE UNIVERSITY OF MICHIGAN SHALL NOT BE LIABLE + * FOR ANY DAMAGES, INCLUDING SPECIAL, INDIRECT, INCIDENTAL, OR + * CONSEQUENTIAL DAMAGES, WITH RESPECT TO ANY CLAIM ARISING + * OUT OF OR IN CONNECTION WITH THE USE OF THE SOFTWARE, EVEN + * IF IT HAS BEEN OR IS HEREAFTER ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGES. 
+ */ + +#include +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +static inline int +gss_krb5_padding(int blocksize, int length) +{ + return blocksize - (length % blocksize); +} + +static inline void +gss_krb5_add_padding(struct xdr_buf *buf, int offset, int blocksize) +{ + int padding = gss_krb5_padding(blocksize, buf->len - offset); + char *p; + struct kvec *iov; + + if (buf->page_len || buf->tail[0].iov_len) + iov = &buf->tail[0]; + else + iov = &buf->head[0]; + p = iov->iov_base + iov->iov_len; + iov->iov_len += padding; + buf->len += padding; + memset(p, padding, padding); +} + +static inline int +gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize) +{ + u8 *ptr; + u8 pad; + size_t len = buf->len; + + if (len <= buf->head[0].iov_len) { + pad = *(u8 *)(buf->head[0].iov_base + len - 1); + if (pad > buf->head[0].iov_len) + return -EINVAL; + buf->head[0].iov_len -= pad; + goto out; + } else + len -= buf->head[0].iov_len; + if (len <= buf->page_len) { + unsigned int last = (buf->page_base + len - 1) + >>PAGE_CACHE_SHIFT; + unsigned int offset = (buf->page_base + len - 1) + & (PAGE_CACHE_SIZE - 1); + ptr = kmap_atomic(buf->pages[last], KM_USER0); + pad = *(ptr + offset); + kunmap_atomic(ptr, KM_USER0); + goto out; + } else + len -= buf->page_len; + BUG_ON(len > buf->tail[0].iov_len); + pad = *(u8 *)(buf->tail[0].iov_base + len - 1); +out: + /* XXX: NOTE: we do not adjust the page lengths--they represent + * a range of data in the real filesystem page cache, and we need + * to know that range so the xdr code can properly place read data. + * However adjusting the head length, as we do above, is harmless. + * In the case of a request that fits into a single page, the server + * also uses length and head length together to determine the original + * start of the request to copy the request for deferal; so it's + * easier on the server if we adjust head and tail length in tandem. + * It's not really a problem that we don't fool with the page and + * tail lengths, though--at worst badly formed xdr might lead the + * server to attempt to parse the padding. + * XXX: Document all these weird requirements for gss mechanism + * wrap/unwrap functions. */ + if (pad > blocksize) + return -EINVAL; + if (buf->len > pad) + buf->len -= pad; + else + return -EINVAL; + return 0; +} + +void +gss_krb5_make_confounder(char *p, u32 conflen) +{ + static u64 i = 0; + u64 *q = (u64 *)p; + + /* rfc1964 claims this should be "random". But all that's really + * necessary is that it be unique. And not even that is necessary in + * our case since our "gssapi" implementation exists only to support + * rpcsec_gss, so we know that the only buffers we will ever encrypt + * already begin with a unique sequence number. Just to hedge my bets + * I'll make a half-hearted attempt at something unique, but ensuring + * uniqueness would mean worrying about atomicity and rollover, and I + * don't care enough. */ + + /* initialize to random value */ + if (i == 0) { + i = random32(); + i = (i << 32) | random32(); + } + + switch (conflen) { + case 16: + *q++ = i++; + /* fall through */ + case 8: + *q++ = i++; + break; + default: + BUG(); + } +} + +/* Assumptions: the head and tail of inbuf are ours to play with. + * The pages, however, may be real pages in the page cache and we replace + * them with scratch pages from **pages before writing to them. 
*/ +/* XXX: obviously the above should be documentation of wrap interface, + * and shouldn't be in this kerberos-specific file. */ + +/* XXX factor out common code with seal/unseal. */ + +static u32 +gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, + struct xdr_buf *buf, struct page **pages) +{ + char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; + struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), + .data = cksumdata}; + int blocksize = 0, plainlen; + unsigned char *ptr, *msg_start; + s32 now; + int headlen; + struct page **tmp_pages; + u32 seq_send; + u8 *cksumkey; + u32 conflen = kctx->gk5e->conflen; + + dprintk("RPC: %s\n", __func__); + + now = get_seconds(); + + blocksize = crypto_blkcipher_blocksize(kctx->enc); + gss_krb5_add_padding(buf, offset, blocksize); + BUG_ON((buf->len - offset) % blocksize); + plainlen = conflen + buf->len - offset; + + headlen = g_token_size(&kctx->mech_used, + GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength + plainlen) - + (buf->len - offset); + + ptr = buf->head[0].iov_base + offset; + /* shift data to make room for header. */ + xdr_extend_head(buf, offset, headlen); + + /* XXX Would be cleverer to encrypt while copying. */ + BUG_ON((buf->len - offset - headlen) % blocksize); + + g_make_token_header(&kctx->mech_used, + GSS_KRB5_TOK_HDR_LEN + + kctx->gk5e->cksumlength + plainlen, &ptr); + + + /* ptr now at header described in rfc 1964, section 1.2.1: */ + ptr[0] = (unsigned char) ((KG_TOK_WRAP_MSG >> 8) & 0xff); + ptr[1] = (unsigned char) (KG_TOK_WRAP_MSG & 0xff); + + msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength; + + *(__be16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg); + memset(ptr + 4, 0xff, 4); + *(__be16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg); + + gss_krb5_make_confounder(msg_start, conflen); + + if (kctx->gk5e->keyed_cksum) + cksumkey = kctx->cksum; + else + cksumkey = NULL; + + /* XXXJBF: UGH!: */ + tmp_pages = buf->pages; + buf->pages = pages; + if (make_checksum(kctx, ptr, 8, buf, offset + headlen - conflen, + cksumkey, KG_USAGE_SEAL, &md5cksum)) + return GSS_S_FAILURE; + buf->pages = tmp_pages; + + memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); + + spin_lock(&krb5_seq_lock); + seq_send = kctx->seq_send++; + spin_unlock(&krb5_seq_lock); + + /* XXX would probably be more efficient to compute checksum + * and encrypt at the same time: */ + if ((krb5_make_seq_num(kctx, kctx->seq, kctx->initiate ? 0 : 0xff, + seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8))) + return GSS_S_FAILURE; + + if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) { + struct crypto_blkcipher *cipher; + int err; + cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(cipher)) + return GSS_S_FAILURE; + + krb5_rc4_setup_enc_key(kctx, cipher, seq_send); + + err = gss_encrypt_xdr_buf(cipher, buf, + offset + headlen - conflen, pages); + crypto_free_blkcipher(cipher); + if (err) + return GSS_S_FAILURE; + } else { + if (gss_encrypt_xdr_buf(kctx->enc, buf, + offset + headlen - conflen, pages)) + return GSS_S_FAILURE; + } + + return (kctx->endtime < now) ? 
GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; +} + +static u32 +gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) +{ + int signalg; + int sealalg; + char cksumdata[GSS_KRB5_MAX_CKSUM_LEN]; + struct xdr_netobj md5cksum = {.len = sizeof(cksumdata), + .data = cksumdata}; + s32 now; + int direction; + s32 seqnum; + unsigned char *ptr; + int bodysize; + void *data_start, *orig_start; + int data_len; + int blocksize; + u32 conflen = kctx->gk5e->conflen; + int crypt_offset; + u8 *cksumkey; + + dprintk("RPC: gss_unwrap_kerberos\n"); + + ptr = (u8 *)buf->head[0].iov_base + offset; + if (g_verify_token_header(&kctx->mech_used, &bodysize, &ptr, + buf->len - offset)) + return GSS_S_DEFECTIVE_TOKEN; + + if ((ptr[0] != ((KG_TOK_WRAP_MSG >> 8) & 0xff)) || + (ptr[1] != (KG_TOK_WRAP_MSG & 0xff))) + return GSS_S_DEFECTIVE_TOKEN; + + /* XXX sanity-check bodysize?? */ + + /* get the sign and seal algorithms */ + + signalg = ptr[2] + (ptr[3] << 8); + if (signalg != kctx->gk5e->signalg) + return GSS_S_DEFECTIVE_TOKEN; + + sealalg = ptr[4] + (ptr[5] << 8); + if (sealalg != kctx->gk5e->sealalg) + return GSS_S_DEFECTIVE_TOKEN; + + if ((ptr[6] != 0xff) || (ptr[7] != 0xff)) + return GSS_S_DEFECTIVE_TOKEN; + + /* + * Data starts after token header and checksum. ptr points + * to the beginning of the token header + */ + crypt_offset = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) - + (unsigned char *)buf->head[0].iov_base; + + /* + * Need plaintext seqnum to derive encryption key for arcfour-hmac + */ + if (krb5_get_seq_num(kctx, ptr + GSS_KRB5_TOK_HDR_LEN, + ptr + 8, &direction, &seqnum)) + return GSS_S_BAD_SIG; + + if ((kctx->initiate && direction != 0xff) || + (!kctx->initiate && direction != 0)) + return GSS_S_BAD_SIG; + + if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) { + struct crypto_blkcipher *cipher; + int err; + + cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(cipher)) + return GSS_S_FAILURE; + + krb5_rc4_setup_enc_key(kctx, cipher, seqnum); + + err = gss_decrypt_xdr_buf(cipher, buf, crypt_offset); + crypto_free_blkcipher(cipher); + if (err) + return GSS_S_DEFECTIVE_TOKEN; + } else { + if (gss_decrypt_xdr_buf(kctx->enc, buf, crypt_offset)) + return GSS_S_DEFECTIVE_TOKEN; + } + + if (kctx->gk5e->keyed_cksum) + cksumkey = kctx->cksum; + else + cksumkey = NULL; + + if (make_checksum(kctx, ptr, 8, buf, crypt_offset, + cksumkey, KG_USAGE_SEAL, &md5cksum)) + return GSS_S_FAILURE; + + if (memcmp(md5cksum.data, ptr + GSS_KRB5_TOK_HDR_LEN, + kctx->gk5e->cksumlength)) + return GSS_S_BAD_SIG; + + /* it got through unscathed. Make sure the context is unexpired */ + + now = get_seconds(); + + if (now > kctx->endtime) + return GSS_S_CONTEXT_EXPIRED; + + /* do sequencing checks */ + + /* Copy the data back to the right position. XXX: Would probably be + * better to copy and encrypt at the same time. */ + + blocksize = crypto_blkcipher_blocksize(kctx->enc); + data_start = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) + + conflen; + orig_start = buf->head[0].iov_base + offset; + data_len = (buf->head[0].iov_base + buf->head[0].iov_len) - data_start; + memmove(orig_start, data_start, data_len); + buf->head[0].iov_len -= (data_start - orig_start); + buf->len -= (data_start - orig_start); + + if (gss_krb5_remove_padding(buf, blocksize)) + return GSS_S_DEFECTIVE_TOKEN; + + return GSS_S_COMPLETE; +} + +/* + * We cannot currently handle tokens with rotated data. We need a + * generalized routine to rotate the data in place. 
It is anticipated + * that we won't encounter rotated data in the general case. + */ +static u32 +rotate_left(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, u16 rrc) +{ + unsigned int realrrc = rrc % (buf->len - offset - GSS_KRB5_TOK_HDR_LEN); + + if (realrrc == 0) + return 0; + + dprintk("%s: cannot process token with rotated data: " + "rrc %u, realrrc %u\n", __func__, rrc, realrrc); + return 1; +} + +static u32 +gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, + struct xdr_buf *buf, struct page **pages) +{ + int blocksize; + u8 *ptr, *plainhdr; + s32 now; + u8 flags = 0x00; + __be16 *be16ptr, ec = 0; + __be64 *be64ptr; + u32 err; + + dprintk("RPC: %s\n", __func__); + + if (kctx->gk5e->encrypt_v2 == NULL) + return GSS_S_FAILURE; + + /* make room for gss token header */ + if (xdr_extend_head(buf, offset, GSS_KRB5_TOK_HDR_LEN)) + return GSS_S_FAILURE; + + /* construct gss token header */ + ptr = plainhdr = buf->head[0].iov_base + offset; + *ptr++ = (unsigned char) ((KG2_TOK_WRAP>>8) & 0xff); + *ptr++ = (unsigned char) (KG2_TOK_WRAP & 0xff); + + if ((kctx->flags & KRB5_CTX_FLAG_INITIATOR) == 0) + flags |= KG2_TOKEN_FLAG_SENTBYACCEPTOR; + if ((kctx->flags & KRB5_CTX_FLAG_ACCEPTOR_SUBKEY) != 0) + flags |= KG2_TOKEN_FLAG_ACCEPTORSUBKEY; + /* We always do confidentiality in wrap tokens */ + flags |= KG2_TOKEN_FLAG_SEALED; + + *ptr++ = flags; + *ptr++ = 0xff; + be16ptr = (__be16 *)ptr; + + blocksize = crypto_blkcipher_blocksize(kctx->acceptor_enc); + *be16ptr++ = cpu_to_be16(ec); + /* "inner" token header always uses 0 for RRC */ + *be16ptr++ = cpu_to_be16(0); + + be64ptr = (__be64 *)be16ptr; + spin_lock(&krb5_seq_lock); + *be64ptr = cpu_to_be64(kctx->seq_send64++); + spin_unlock(&krb5_seq_lock); + + err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, ec, pages); + if (err) + return err; + + now = get_seconds(); + return (kctx->endtime < now) ? 
GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; +} + +static u32 +gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) +{ + s32 now; + u64 seqnum; + u8 *ptr; + u8 flags = 0x00; + u16 ec, rrc; + int err; + u32 headskip, tailskip; + u8 decrypted_hdr[GSS_KRB5_TOK_HDR_LEN]; + unsigned int movelen; + + + dprintk("RPC: %s\n", __func__); + + if (kctx->gk5e->decrypt_v2 == NULL) + return GSS_S_FAILURE; + + ptr = buf->head[0].iov_base + offset; + + if (be16_to_cpu(*((__be16 *)ptr)) != KG2_TOK_WRAP) + return GSS_S_DEFECTIVE_TOKEN; + + flags = ptr[2]; + if ((!kctx->initiate && (flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR)) || + (kctx->initiate && !(flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR))) + return GSS_S_BAD_SIG; + + if ((flags & KG2_TOKEN_FLAG_SEALED) == 0) { + dprintk("%s: token missing expected sealed flag\n", __func__); + return GSS_S_DEFECTIVE_TOKEN; + } + + if (ptr[3] != 0xff) + return GSS_S_DEFECTIVE_TOKEN; + + ec = be16_to_cpup((__be16 *)(ptr + 4)); + rrc = be16_to_cpup((__be16 *)(ptr + 6)); + + seqnum = be64_to_cpup((__be64 *)(ptr + 8)); + + if (rrc != 0) { + err = rotate_left(kctx, offset, buf, rrc); + if (err) + return GSS_S_FAILURE; + } + + err = (*kctx->gk5e->decrypt_v2)(kctx, offset, buf, + &headskip, &tailskip); + if (err) + return GSS_S_FAILURE; + + /* + * Retrieve the decrypted gss token header and verify + * it against the original + */ + err = read_bytes_from_xdr_buf(buf, + buf->len - GSS_KRB5_TOK_HDR_LEN - tailskip, + decrypted_hdr, GSS_KRB5_TOK_HDR_LEN); + if (err) { + dprintk("%s: error %u getting decrypted_hdr\n", __func__, err); + return GSS_S_FAILURE; + } + if (memcmp(ptr, decrypted_hdr, 6) + || memcmp(ptr + 8, decrypted_hdr + 8, 8)) { + dprintk("%s: token hdr, plaintext hdr mismatch!\n", __func__); + return GSS_S_FAILURE; + } + + /* do sequencing checks */ + + /* it got through unscathed. Make sure the context is unexpired */ + now = get_seconds(); + if (now > kctx->endtime) + return GSS_S_CONTEXT_EXPIRED; + + /* + * Move the head data back to the right position in xdr_buf. + * We ignore any "ec" data since it might be in the head or + * the tail, and we really don't need to deal with it. + * Note that buf->head[0].iov_len may indicate the available + * head buffer space rather than that actually occupied. 
+ */ + movelen = min_t(unsigned int, buf->head[0].iov_len, buf->len); + movelen -= offset + GSS_KRB5_TOK_HDR_LEN + headskip; + BUG_ON(offset + GSS_KRB5_TOK_HDR_LEN + headskip + movelen > + buf->head[0].iov_len); + memmove(ptr, ptr + GSS_KRB5_TOK_HDR_LEN + headskip, movelen); + buf->head[0].iov_len -= GSS_KRB5_TOK_HDR_LEN + headskip; + buf->len -= GSS_KRB5_TOK_HDR_LEN + headskip; + + return GSS_S_COMPLETE; +} + +u32 +gss_wrap_kerberos(struct gss_ctx *gctx, int offset, + struct xdr_buf *buf, struct page **pages) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + + switch (kctx->enctype) { + default: + BUG(); + case ENCTYPE_DES_CBC_RAW: + case ENCTYPE_DES3_CBC_RAW: + case ENCTYPE_ARCFOUR_HMAC: + return gss_wrap_kerberos_v1(kctx, offset, buf, pages); + case ENCTYPE_AES128_CTS_HMAC_SHA1_96: + case ENCTYPE_AES256_CTS_HMAC_SHA1_96: + return gss_wrap_kerberos_v2(kctx, offset, buf, pages); + } +} + +u32 +gss_unwrap_kerberos(struct gss_ctx *gctx, int offset, struct xdr_buf *buf) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + + switch (kctx->enctype) { + default: + BUG(); + case ENCTYPE_DES_CBC_RAW: + case ENCTYPE_DES3_CBC_RAW: + case ENCTYPE_ARCFOUR_HMAC: + return gss_unwrap_kerberos_v1(kctx, offset, buf); + case ENCTYPE_AES128_CTS_HMAC_SHA1_96: + case ENCTYPE_AES256_CTS_HMAC_SHA1_96: + return gss_unwrap_kerberos_v2(kctx, offset, buf); + } +} + diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c new file mode 100644 index 00000000..e3c36a27 --- /dev/null +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -0,0 +1,381 @@ +/* + * linux/net/sunrpc/gss_mech_switch.c + * + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +static LIST_HEAD(registered_mechs); +static DEFINE_SPINLOCK(registered_mechs_lock); + +static void +gss_mech_free(struct gss_api_mech *gm) +{ + struct pf_desc *pf; + int i; + + for (i = 0; i < gm->gm_pf_num; i++) { + pf = &gm->gm_pfs[i]; + kfree(pf->auth_domain_name); + pf->auth_domain_name = NULL; + } +} + +static inline char * +make_auth_domain_name(char *name) +{ + static char *prefix = "gss/"; + char *new; + + new = kmalloc(strlen(name) + strlen(prefix) + 1, GFP_KERNEL); + if (new) { + strcpy(new, prefix); + strcat(new, name); + } + return new; +} + +static int +gss_mech_svc_setup(struct gss_api_mech *gm) +{ + struct pf_desc *pf; + int i, status; + + for (i = 0; i < gm->gm_pf_num; i++) { + pf = &gm->gm_pfs[i]; + pf->auth_domain_name = make_auth_domain_name(pf->name); + status = -ENOMEM; + if (pf->auth_domain_name == NULL) + goto out; + status = svcauth_gss_register_pseudoflavor(pf->pseudoflavor, + pf->auth_domain_name); + if (status) + goto out; + } + return 0; +out: + gss_mech_free(gm); + return status; +} + +int +gss_mech_register(struct gss_api_mech *gm) +{ + int status; + + status = gss_mech_svc_setup(gm); + if (status) + return status; + spin_lock(®istered_mechs_lock); + list_add(&gm->gm_list, ®istered_mechs); + spin_unlock(®istered_mechs_lock); + dprintk("RPC: registered gss mechanism %s\n", gm->gm_name); + return 0; +} + +EXPORT_SYMBOL_GPL(gss_mech_register); + +void +gss_mech_unregister(struct gss_api_mech *gm) +{ + spin_lock(®istered_mechs_lock); + list_del(&gm->gm_list); + spin_unlock(®istered_mechs_lock); + dprintk("RPC: unregistered gss mechanism %s\n", gm->gm_name); + gss_mech_free(gm); +} + +EXPORT_SYMBOL_GPL(gss_mech_unregister); + +struct gss_api_mech * +gss_mech_get(struct gss_api_mech *gm) +{ + __module_get(gm->gm_owner); + return gm; +} + +EXPORT_SYMBOL_GPL(gss_mech_get); + +struct gss_api_mech * +gss_mech_get_by_name(const char *name) +{ + struct gss_api_mech *pos, *gm = NULL; + + spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + if (0 == strcmp(name, pos->gm_name)) { + if (try_module_get(pos->gm_owner)) + gm = pos; + break; + } + } + spin_unlock(®istered_mechs_lock); + return gm; + +} + +EXPORT_SYMBOL_GPL(gss_mech_get_by_name); + +struct gss_api_mech * +gss_mech_get_by_OID(struct xdr_netobj *obj) +{ + struct gss_api_mech *pos, *gm = NULL; + + spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + if (obj->len == pos->gm_oid.len) { + if (0 == memcmp(obj->data, pos->gm_oid.data, obj->len)) { + if (try_module_get(pos->gm_owner)) + gm = pos; + break; + } + } + } + spin_unlock(®istered_mechs_lock); + return gm; + +} + +EXPORT_SYMBOL_GPL(gss_mech_get_by_OID); + +static inline int +mech_supports_pseudoflavor(struct gss_api_mech *gm, u32 pseudoflavor) +{ + int i; + + for (i = 0; i < gm->gm_pf_num; i++) { + if (gm->gm_pfs[i].pseudoflavor == pseudoflavor) + return 1; + } + return 0; +} + +struct gss_api_mech * +gss_mech_get_by_pseudoflavor(u32 pseudoflavor) +{ + struct gss_api_mech *pos, *gm = NULL; + + spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + if (!mech_supports_pseudoflavor(pos, pseudoflavor)) { + module_put(pos->gm_owner); + continue; + } + if (try_module_get(pos->gm_owner)) + gm = pos; + break; + } + spin_unlock(®istered_mechs_lock); + return gm; +} + 
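+/* + * Note: the gss_mech_get_by_*() lookups above return the mech with a + * module reference taken via try_module_get(); callers drop that + * reference with gss_mech_put() when they are finished with the mech. + */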
+EXPORT_SYMBOL_GPL(gss_mech_get_by_pseudoflavor); + +int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr) +{ + struct gss_api_mech *pos = NULL; + int i = 0; + + spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + array_ptr[i] = pos->gm_pfs->pseudoflavor; + i++; + } + spin_unlock(®istered_mechs_lock); + return i; +} + +EXPORT_SYMBOL_GPL(gss_mech_list_pseudoflavors); + +u32 +gss_svc_to_pseudoflavor(struct gss_api_mech *gm, u32 service) +{ + int i; + + for (i = 0; i < gm->gm_pf_num; i++) { + if (gm->gm_pfs[i].service == service) { + return gm->gm_pfs[i].pseudoflavor; + } + } + return RPC_AUTH_MAXFLAVOR; /* illegal value */ +} +EXPORT_SYMBOL_GPL(gss_svc_to_pseudoflavor); + +u32 +gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor) +{ + int i; + + for (i = 0; i < gm->gm_pf_num; i++) { + if (gm->gm_pfs[i].pseudoflavor == pseudoflavor) + return gm->gm_pfs[i].service; + } + return 0; +} + +EXPORT_SYMBOL_GPL(gss_pseudoflavor_to_service); + +char * +gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service) +{ + int i; + + for (i = 0; i < gm->gm_pf_num; i++) { + if (gm->gm_pfs[i].service == service) + return gm->gm_pfs[i].auth_domain_name; + } + return NULL; +} + +EXPORT_SYMBOL_GPL(gss_service_to_auth_domain_name); + +void +gss_mech_put(struct gss_api_mech * gm) +{ + if (gm) + module_put(gm->gm_owner); +} + +EXPORT_SYMBOL_GPL(gss_mech_put); + +/* The mech could probably be determined from the token instead, but it's just + * as easy for now to pass it in. */ +int +gss_import_sec_context(const void *input_token, size_t bufsize, + struct gss_api_mech *mech, + struct gss_ctx **ctx_id, + gfp_t gfp_mask) +{ + if (!(*ctx_id = kzalloc(sizeof(**ctx_id), gfp_mask))) + return -ENOMEM; + (*ctx_id)->mech_type = gss_mech_get(mech); + + return mech->gm_ops + ->gss_import_sec_context(input_token, bufsize, *ctx_id, gfp_mask); +} + +/* gss_get_mic: compute a mic over message and return mic_token. */ + +u32 +gss_get_mic(struct gss_ctx *context_handle, + struct xdr_buf *message, + struct xdr_netobj *mic_token) +{ + return context_handle->mech_type->gm_ops + ->gss_get_mic(context_handle, + message, + mic_token); +} + +/* gss_verify_mic: check whether the provided mic_token verifies message. */ + +u32 +gss_verify_mic(struct gss_ctx *context_handle, + struct xdr_buf *message, + struct xdr_netobj *mic_token) +{ + return context_handle->mech_type->gm_ops + ->gss_verify_mic(context_handle, + message, + mic_token); +} + +/* + * This function is called from both the client and server code. + * Each makes guarantees about how much "slack" space is available + * for the underlying function in "buf"'s head and tail while + * performing the wrap. + * + * The client and server code allocate RPC_MAX_AUTH_SIZE extra + * space in both the head and tail which is available for use by + * the wrap function. + * + * Underlying functions should verify they do not use more than + * RPC_MAX_AUTH_SIZE of extra space in either the head or tail + * when performing the wrap. + */ +u32 +gss_wrap(struct gss_ctx *ctx_id, + int offset, + struct xdr_buf *buf, + struct page **inpages) +{ + return ctx_id->mech_type->gm_ops + ->gss_wrap(ctx_id, offset, buf, inpages); +} + +u32 +gss_unwrap(struct gss_ctx *ctx_id, + int offset, + struct xdr_buf *buf) +{ + return ctx_id->mech_type->gm_ops + ->gss_unwrap(ctx_id, offset, buf); +} + + +/* gss_delete_sec_context: free all resources associated with context_handle. 
+ * Note this differs from the RFC 2744-specified prototype in that we don't + * bother returning an output token, since it would never be used anyway. */ + +u32 +gss_delete_sec_context(struct gss_ctx **context_handle) +{ + dprintk("RPC: gss_delete_sec_context deleting %p\n", + *context_handle); + + if (!*context_handle) + return GSS_S_NO_CONTEXT; + if ((*context_handle)->internal_ctx_id) + (*context_handle)->mech_type->gm_ops + ->gss_delete_sec_context((*context_handle) + ->internal_ctx_id); + gss_mech_put((*context_handle)->mech_type); + kfree(*context_handle); + *context_handle=NULL; + return GSS_S_COMPLETE; +} diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c new file mode 100644 index 00000000..8d0f7d3c --- /dev/null +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -0,0 +1,1458 @@ +/* + * Neil Brown + * J. Bruce Fields + * Andy Adamson + * Dug Song + * + * RPCSEC_GSS server authentication. + * This implements RPCSEC_GSS as defined in rfc2203 (rpcsec_gss) and rfc2078 + * (gssapi) + * + * The RPCSEC_GSS involves three stages: + * 1/ context creation + * 2/ data exchange + * 3/ context destruction + * + * Context creation is handled largely by upcalls to user-space. + * In particular, GSS_Accept_sec_context is handled by an upcall + * Data exchange is handled entirely within the kernel + * In particular, GSS_GetMIC, GSS_VerifyMIC, GSS_Seal, GSS_Unseal are in-kernel. + * Context destruction is handled in-kernel + * GSS_Delete_sec_context is in-kernel + * + * Context creation is initiated by a RPCSEC_GSS_INIT request arriving. + * The context handle and gss_token are used as a key into the rpcsec_init cache. + * The content of this cache includes some of the outputs of GSS_Accept_sec_context, + * being major_status, minor_status, context_handle, reply_token. + * These are sent back to the client. + * Sequence window management is handled by the kernel. The window size if currently + * a compile time constant. + * + * When user-space is happy that a context is established, it places an entry + * in the rpcsec_context cache. The key for this cache is the context_handle. + * The content includes: + * uid/gidlist - for determining access rights + * mechanism type + * mechanism specific information, such as a key + * + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +/* The rpcsec_init cache is used for mapping RPCSEC_GSS_{,CONT_}INIT requests + * into replies. + * + * Key is context handle (\x if empty) and gss_token. + * Content is major_status minor_status (integers) context_handle, reply_token. 
+ * + */ + +static int netobj_equal(struct xdr_netobj *a, struct xdr_netobj *b) +{ + return a->len == b->len && 0 == memcmp(a->data, b->data, a->len); +} + +#define RSI_HASHBITS 6 +#define RSI_HASHMAX (1<in_handle.data); + kfree(rsii->in_token.data); + kfree(rsii->out_handle.data); + kfree(rsii->out_token.data); +} + +static void rsi_put(struct kref *ref) +{ + struct rsi *rsii = container_of(ref, struct rsi, h.ref); + rsi_free(rsii); + kfree(rsii); +} + +static inline int rsi_hash(struct rsi *item) +{ + return hash_mem(item->in_handle.data, item->in_handle.len, RSI_HASHBITS) + ^ hash_mem(item->in_token.data, item->in_token.len, RSI_HASHBITS); +} + +static int rsi_match(struct cache_head *a, struct cache_head *b) +{ + struct rsi *item = container_of(a, struct rsi, h); + struct rsi *tmp = container_of(b, struct rsi, h); + return netobj_equal(&item->in_handle, &tmp->in_handle) && + netobj_equal(&item->in_token, &tmp->in_token); +} + +static int dup_to_netobj(struct xdr_netobj *dst, char *src, int len) +{ + dst->len = len; + dst->data = (len ? kmemdup(src, len, GFP_KERNEL) : NULL); + if (len && !dst->data) + return -ENOMEM; + return 0; +} + +static inline int dup_netobj(struct xdr_netobj *dst, struct xdr_netobj *src) +{ + return dup_to_netobj(dst, src->data, src->len); +} + +static void rsi_init(struct cache_head *cnew, struct cache_head *citem) +{ + struct rsi *new = container_of(cnew, struct rsi, h); + struct rsi *item = container_of(citem, struct rsi, h); + + new->out_handle.data = NULL; + new->out_handle.len = 0; + new->out_token.data = NULL; + new->out_token.len = 0; + new->in_handle.len = item->in_handle.len; + item->in_handle.len = 0; + new->in_token.len = item->in_token.len; + item->in_token.len = 0; + new->in_handle.data = item->in_handle.data; + item->in_handle.data = NULL; + new->in_token.data = item->in_token.data; + item->in_token.data = NULL; +} + +static void update_rsi(struct cache_head *cnew, struct cache_head *citem) +{ + struct rsi *new = container_of(cnew, struct rsi, h); + struct rsi *item = container_of(citem, struct rsi, h); + + BUG_ON(new->out_handle.data || new->out_token.data); + new->out_handle.len = item->out_handle.len; + item->out_handle.len = 0; + new->out_token.len = item->out_token.len; + item->out_token.len = 0; + new->out_handle.data = item->out_handle.data; + item->out_handle.data = NULL; + new->out_token.data = item->out_token.data; + item->out_token.data = NULL; + + new->major_status = item->major_status; + new->minor_status = item->minor_status; +} + +static struct cache_head *rsi_alloc(void) +{ + struct rsi *rsii = kmalloc(sizeof(*rsii), GFP_KERNEL); + if (rsii) + return &rsii->h; + else + return NULL; +} + +static void rsi_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +{ + struct rsi *rsii = container_of(h, struct rsi, h); + + qword_addhex(bpp, blen, rsii->in_handle.data, rsii->in_handle.len); + qword_addhex(bpp, blen, rsii->in_token.data, rsii->in_token.len); + (*bpp)[-1] = '\n'; +} + +static int rsi_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h, rsi_request); +} + + +static int rsi_parse(struct cache_detail *cd, + char *mesg, int mlen) +{ + /* context token expiry major minor context token */ + char *buf = mesg; + char *ep; + int len; + struct rsi rsii, *rsip = NULL; + time_t expiry; + int status = -EINVAL; + + memset(&rsii, 0, sizeof(rsii)); + /* handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + status = -ENOMEM; + if 
(dup_to_netobj(&rsii.in_handle, buf, len)) + goto out; + + /* token */ + len = qword_get(&mesg, buf, mlen); + status = -EINVAL; + if (len < 0) + goto out; + status = -ENOMEM; + if (dup_to_netobj(&rsii.in_token, buf, len)) + goto out; + + rsip = rsi_lookup(&rsii); + if (!rsip) + goto out; + + rsii.h.flags = 0; + /* expiry */ + expiry = get_expiry(&mesg); + status = -EINVAL; + if (expiry == 0) + goto out; + + /* major/minor */ + len = qword_get(&mesg, buf, mlen); + if (len <= 0) + goto out; + rsii.major_status = simple_strtoul(buf, &ep, 10); + if (*ep) + goto out; + len = qword_get(&mesg, buf, mlen); + if (len <= 0) + goto out; + rsii.minor_status = simple_strtoul(buf, &ep, 10); + if (*ep) + goto out; + + /* out_handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + status = -ENOMEM; + if (dup_to_netobj(&rsii.out_handle, buf, len)) + goto out; + + /* out_token */ + len = qword_get(&mesg, buf, mlen); + status = -EINVAL; + if (len < 0) + goto out; + status = -ENOMEM; + if (dup_to_netobj(&rsii.out_token, buf, len)) + goto out; + rsii.h.expiry_time = expiry; + rsip = rsi_update(&rsii, rsip); + status = 0; +out: + rsi_free(&rsii); + if (rsip) + cache_put(&rsip->h, &rsi_cache); + else + status = -ENOMEM; + return status; +} + +static struct cache_detail rsi_cache = { + .owner = THIS_MODULE, + .hash_size = RSI_HASHMAX, + .hash_table = rsi_table, + .name = "auth.rpcsec.init", + .cache_put = rsi_put, + .cache_upcall = rsi_upcall, + .cache_parse = rsi_parse, + .match = rsi_match, + .init = rsi_init, + .update = update_rsi, + .alloc = rsi_alloc, +}; + +static struct rsi *rsi_lookup(struct rsi *item) +{ + struct cache_head *ch; + int hash = rsi_hash(item); + + ch = sunrpc_cache_lookup(&rsi_cache, &item->h, hash); + if (ch) + return container_of(ch, struct rsi, h); + else + return NULL; +} + +static struct rsi *rsi_update(struct rsi *new, struct rsi *old) +{ + struct cache_head *ch; + int hash = rsi_hash(new); + + ch = sunrpc_cache_update(&rsi_cache, &new->h, + &old->h, hash); + if (ch) + return container_of(ch, struct rsi, h); + else + return NULL; +} + + +/* + * The rpcsec_context cache is used to store a context that is + * used in data exchange. + * The key is a context handle. 
The content is: + * uid, gidlist, mechanism, service-set, mech-specific-data + */ + +#define RSC_HASHBITS 10 +#define RSC_HASHMAX (1<handle.data); + if (rsci->mechctx) + gss_delete_sec_context(&rsci->mechctx); + if (rsci->cred.cr_group_info) + put_group_info(rsci->cred.cr_group_info); + kfree(rsci->client_name); +} + +static void rsc_put(struct kref *ref) +{ + struct rsc *rsci = container_of(ref, struct rsc, h.ref); + + rsc_free(rsci); + kfree(rsci); +} + +static inline int +rsc_hash(struct rsc *rsci) +{ + return hash_mem(rsci->handle.data, rsci->handle.len, RSC_HASHBITS); +} + +static int +rsc_match(struct cache_head *a, struct cache_head *b) +{ + struct rsc *new = container_of(a, struct rsc, h); + struct rsc *tmp = container_of(b, struct rsc, h); + + return netobj_equal(&new->handle, &tmp->handle); +} + +static void +rsc_init(struct cache_head *cnew, struct cache_head *ctmp) +{ + struct rsc *new = container_of(cnew, struct rsc, h); + struct rsc *tmp = container_of(ctmp, struct rsc, h); + + new->handle.len = tmp->handle.len; + tmp->handle.len = 0; + new->handle.data = tmp->handle.data; + tmp->handle.data = NULL; + new->mechctx = NULL; + new->cred.cr_group_info = NULL; + new->client_name = NULL; +} + +static void +update_rsc(struct cache_head *cnew, struct cache_head *ctmp) +{ + struct rsc *new = container_of(cnew, struct rsc, h); + struct rsc *tmp = container_of(ctmp, struct rsc, h); + + new->mechctx = tmp->mechctx; + tmp->mechctx = NULL; + memset(&new->seqdata, 0, sizeof(new->seqdata)); + spin_lock_init(&new->seqdata.sd_lock); + new->cred = tmp->cred; + tmp->cred.cr_group_info = NULL; + new->client_name = tmp->client_name; + tmp->client_name = NULL; +} + +static struct cache_head * +rsc_alloc(void) +{ + struct rsc *rsci = kmalloc(sizeof(*rsci), GFP_KERNEL); + if (rsci) + return &rsci->h; + else + return NULL; +} + +static int rsc_parse(struct cache_detail *cd, + char *mesg, int mlen) +{ + /* contexthandle expiry [ uid gid N mechname ...mechdata... 
] */ + char *buf = mesg; + int len, rv; + struct rsc rsci, *rscp = NULL; + time_t expiry; + int status = -EINVAL; + struct gss_api_mech *gm = NULL; + + memset(&rsci, 0, sizeof(rsci)); + /* context handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) goto out; + status = -ENOMEM; + if (dup_to_netobj(&rsci.handle, buf, len)) + goto out; + + rsci.h.flags = 0; + /* expiry */ + expiry = get_expiry(&mesg); + status = -EINVAL; + if (expiry == 0) + goto out; + + rscp = rsc_lookup(&rsci); + if (!rscp) + goto out; + + /* uid, or NEGATIVE */ + rv = get_int(&mesg, &rsci.cred.cr_uid); + if (rv == -EINVAL) + goto out; + if (rv == -ENOENT) + set_bit(CACHE_NEGATIVE, &rsci.h.flags); + else { + int N, i; + + /* gid */ + if (get_int(&mesg, &rsci.cred.cr_gid)) + goto out; + + /* number of additional gid's */ + if (get_int(&mesg, &N)) + goto out; + status = -ENOMEM; + rsci.cred.cr_group_info = groups_alloc(N); + if (rsci.cred.cr_group_info == NULL) + goto out; + + /* gid's */ + status = -EINVAL; + for (i=0; i 0) { + rsci.client_name = kstrdup(buf, GFP_KERNEL); + if (!rsci.client_name) + goto out; + } + + } + rsci.h.expiry_time = expiry; + rscp = rsc_update(&rsci, rscp); + status = 0; +out: + gss_mech_put(gm); + rsc_free(&rsci); + if (rscp) + cache_put(&rscp->h, &rsc_cache); + else + status = -ENOMEM; + return status; +} + +static struct cache_detail rsc_cache = { + .owner = THIS_MODULE, + .hash_size = RSC_HASHMAX, + .hash_table = rsc_table, + .name = "auth.rpcsec.context", + .cache_put = rsc_put, + .cache_parse = rsc_parse, + .match = rsc_match, + .init = rsc_init, + .update = update_rsc, + .alloc = rsc_alloc, +}; + +static struct rsc *rsc_lookup(struct rsc *item) +{ + struct cache_head *ch; + int hash = rsc_hash(item); + + ch = sunrpc_cache_lookup(&rsc_cache, &item->h, hash); + if (ch) + return container_of(ch, struct rsc, h); + else + return NULL; +} + +static struct rsc *rsc_update(struct rsc *new, struct rsc *old) +{ + struct cache_head *ch; + int hash = rsc_hash(new); + + ch = sunrpc_cache_update(&rsc_cache, &new->h, + &old->h, hash); + if (ch) + return container_of(ch, struct rsc, h); + else + return NULL; +} + + +static struct rsc * +gss_svc_searchbyctx(struct xdr_netobj *handle) +{ + struct rsc rsci; + struct rsc *found; + + memset(&rsci, 0, sizeof(rsci)); + if (dup_to_netobj(&rsci.handle, handle->data, handle->len)) + return NULL; + found = rsc_lookup(&rsci); + rsc_free(&rsci); + if (!found) + return NULL; + if (cache_check(&rsc_cache, &found->h, NULL)) + return NULL; + return found; +} + +/* Implements sequence number algorithm as specified in RFC 2203. 
*/ +static int +gss_check_seq_num(struct rsc *rsci, int seq_num) +{ + struct gss_svc_seq_data *sd = &rsci->seqdata; + + spin_lock(&sd->sd_lock); + if (seq_num > sd->sd_max) { + if (seq_num >= sd->sd_max + GSS_SEQ_WIN) { + memset(sd->sd_win,0,sizeof(sd->sd_win)); + sd->sd_max = seq_num; + } else while (sd->sd_max < seq_num) { + sd->sd_max++; + __clear_bit(sd->sd_max % GSS_SEQ_WIN, sd->sd_win); + } + __set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win); + goto ok; + } else if (seq_num <= sd->sd_max - GSS_SEQ_WIN) { + goto drop; + } + /* sd_max - GSS_SEQ_WIN < seq_num <= sd_max */ + if (__test_and_set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win)) + goto drop; +ok: + spin_unlock(&sd->sd_lock); + return 1; +drop: + spin_unlock(&sd->sd_lock); + return 0; +} + +static inline u32 round_up_to_quad(u32 i) +{ + return (i + 3 ) & ~3; +} + +static inline int +svc_safe_getnetobj(struct kvec *argv, struct xdr_netobj *o) +{ + int l; + + if (argv->iov_len < 4) + return -1; + o->len = svc_getnl(argv); + l = round_up_to_quad(o->len); + if (argv->iov_len < l) + return -1; + o->data = argv->iov_base; + argv->iov_base += l; + argv->iov_len -= l; + return 0; +} + +static inline int +svc_safe_putnetobj(struct kvec *resv, struct xdr_netobj *o) +{ + u8 *p; + + if (resv->iov_len + 4 > PAGE_SIZE) + return -1; + svc_putnl(resv, o->len); + p = resv->iov_base + resv->iov_len; + resv->iov_len += round_up_to_quad(o->len); + if (resv->iov_len > PAGE_SIZE) + return -1; + memcpy(p, o->data, o->len); + memset(p + o->len, 0, round_up_to_quad(o->len) - o->len); + return 0; +} + +/* + * Verify the checksum on the header and return SVC_OK on success. + * Otherwise, return SVC_DROP (in the case of a bad sequence number) + * or return SVC_DENIED and indicate error in authp. + */ +static int +gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, + __be32 *rpcstart, struct rpc_gss_wire_cred *gc, __be32 *authp) +{ + struct gss_ctx *ctx_id = rsci->mechctx; + struct xdr_buf rpchdr; + struct xdr_netobj checksum; + u32 flavor = 0; + struct kvec *argv = &rqstp->rq_arg.head[0]; + struct kvec iov; + + /* data to compute the checksum over: */ + iov.iov_base = rpcstart; + iov.iov_len = (u8 *)argv->iov_base - (u8 *)rpcstart; + xdr_buf_from_iov(&iov, &rpchdr); + + *authp = rpc_autherr_badverf; + if (argv->iov_len < 4) + return SVC_DENIED; + flavor = svc_getnl(argv); + if (flavor != RPC_AUTH_GSS) + return SVC_DENIED; + if (svc_safe_getnetobj(argv, &checksum)) + return SVC_DENIED; + + if (rqstp->rq_deferred) /* skip verification of revisited request */ + return SVC_OK; + if (gss_verify_mic(ctx_id, &rpchdr, &checksum) != GSS_S_COMPLETE) { + *authp = rpcsec_gsserr_credproblem; + return SVC_DENIED; + } + + if (gc->gc_seq > MAXSEQ) { + dprintk("RPC: svcauth_gss: discarding request with " + "large sequence number %d\n", gc->gc_seq); + *authp = rpcsec_gsserr_ctxproblem; + return SVC_DENIED; + } + if (!gss_check_seq_num(rsci, gc->gc_seq)) { + dprintk("RPC: svcauth_gss: discarding request with " + "old sequence number %d\n", gc->gc_seq); + return SVC_DROP; + } + return SVC_OK; +} + +static int +gss_write_null_verf(struct svc_rqst *rqstp) +{ + __be32 *p; + + svc_putnl(rqstp->rq_res.head, RPC_AUTH_NULL); + p = rqstp->rq_res.head->iov_base + rqstp->rq_res.head->iov_len; + /* don't really need to check if head->iov_len > PAGE_SIZE ... 
*/ + *p++ = 0; + if (!xdr_ressize_check(rqstp, p)) + return -1; + return 0; +} + +static int +gss_write_verf(struct svc_rqst *rqstp, struct gss_ctx *ctx_id, u32 seq) +{ + __be32 xdr_seq; + u32 maj_stat; + struct xdr_buf verf_data; + struct xdr_netobj mic; + __be32 *p; + struct kvec iov; + + svc_putnl(rqstp->rq_res.head, RPC_AUTH_GSS); + xdr_seq = htonl(seq); + + iov.iov_base = &xdr_seq; + iov.iov_len = sizeof(xdr_seq); + xdr_buf_from_iov(&iov, &verf_data); + p = rqstp->rq_res.head->iov_base + rqstp->rq_res.head->iov_len; + mic.data = (u8 *)(p + 1); + maj_stat = gss_get_mic(ctx_id, &verf_data, &mic); + if (maj_stat != GSS_S_COMPLETE) + return -1; + *p++ = htonl(mic.len); + memset((u8 *)p + mic.len, 0, round_up_to_quad(mic.len) - mic.len); + p += XDR_QUADLEN(mic.len); + if (!xdr_ressize_check(rqstp, p)) + return -1; + return 0; +} + +struct gss_domain { + struct auth_domain h; + u32 pseudoflavor; +}; + +static struct auth_domain * +find_gss_auth_domain(struct gss_ctx *ctx, u32 svc) +{ + char *name; + + name = gss_service_to_auth_domain_name(ctx->mech_type, svc); + if (!name) + return NULL; + return auth_domain_find(name); +} + +static struct auth_ops svcauthops_gss; + +u32 svcauth_gss_flavor(struct auth_domain *dom) +{ + struct gss_domain *gd = container_of(dom, struct gss_domain, h); + + return gd->pseudoflavor; +} + +EXPORT_SYMBOL_GPL(svcauth_gss_flavor); + +int +svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name) +{ + struct gss_domain *new; + struct auth_domain *test; + int stat = -ENOMEM; + + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + goto out; + kref_init(&new->h.ref); + new->h.name = kstrdup(name, GFP_KERNEL); + if (!new->h.name) + goto out_free_dom; + new->h.flavour = &svcauthops_gss; + new->pseudoflavor = pseudoflavor; + + stat = 0; + test = auth_domain_lookup(name, &new->h); + if (test != &new->h) { /* Duplicate registration */ + auth_domain_put(test); + kfree(new->h.name); + goto out_free_dom; + } + return 0; + +out_free_dom: + kfree(new); +out: + return stat; +} + +EXPORT_SYMBOL_GPL(svcauth_gss_register_pseudoflavor); + +static inline int +read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj) +{ + __be32 raw; + int status; + + status = read_bytes_from_xdr_buf(buf, base, &raw, sizeof(*obj)); + if (status) + return status; + *obj = ntohl(raw); + return 0; +} + +/* It would be nice if this bit of code could be shared with the client. + * Obstacles: + * The client shouldn't malloc(), would have to pass in own memory. + * The server uses base of head iovec as read pointer, while the + * client uses separate pointer. */ +static int +unwrap_integ_data(struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx) +{ + int stat = -EINVAL; + u32 integ_len, maj_stat; + struct xdr_netobj mic; + struct xdr_buf integ_buf; + + integ_len = svc_getnl(&buf->head[0]); + if (integ_len & 3) + return stat; + if (integ_len > buf->len) + return stat; + if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len)) + BUG(); + /* copy out mic... 
*/ + if (read_u32_from_xdr_buf(buf, integ_len, &mic.len)) + BUG(); + if (mic.len > RPC_MAX_AUTH_SIZE) + return stat; + mic.data = kmalloc(mic.len, GFP_KERNEL); + if (!mic.data) + return stat; + if (read_bytes_from_xdr_buf(buf, integ_len + 4, mic.data, mic.len)) + goto out; + maj_stat = gss_verify_mic(ctx, &integ_buf, &mic); + if (maj_stat != GSS_S_COMPLETE) + goto out; + if (svc_getnl(&buf->head[0]) != seq) + goto out; + stat = 0; +out: + kfree(mic.data); + return stat; +} + +static inline int +total_buf_len(struct xdr_buf *buf) +{ + return buf->head[0].iov_len + buf->page_len + buf->tail[0].iov_len; +} + +static void +fix_priv_head(struct xdr_buf *buf, int pad) +{ + if (buf->page_len == 0) { + /* We need to adjust head and buf->len in tandem in this + * case to make svc_defer() work--it finds the original + * buffer start using buf->len - buf->head[0].iov_len. */ + buf->head[0].iov_len -= pad; + } +} + +static int +unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx) +{ + u32 priv_len, maj_stat; + int pad, saved_len, remaining_len, offset; + + rqstp->rq_splice_ok = 0; + + priv_len = svc_getnl(&buf->head[0]); + if (rqstp->rq_deferred) { + /* Already decrypted last time through! The sequence number + * check at out_seq is unnecessary but harmless: */ + goto out_seq; + } + /* buf->len is the number of bytes from the original start of the + * request to the end, where head[0].iov_len is just the bytes + * not yet read from the head, so these two values are different: */ + remaining_len = total_buf_len(buf); + if (priv_len > remaining_len) + return -EINVAL; + pad = remaining_len - priv_len; + buf->len -= pad; + fix_priv_head(buf, pad); + + /* Maybe it would be better to give gss_unwrap a length parameter: */ + saved_len = buf->len; + buf->len = priv_len; + maj_stat = gss_unwrap(ctx, 0, buf); + pad = priv_len - buf->len; + buf->len = saved_len; + buf->len -= pad; + /* The upper layers assume the buffer is aligned on 4-byte boundaries. + * In the krb5p case, at least, the data ends up offset, so we need to + * move it around. */ + /* XXX: This is very inefficient. It would be better to either do + * this while we encrypt, or maybe in the receive code, if we can peak + * ahead and work out the service and mechanism there. 
*/ + offset = buf->head[0].iov_len % 4; + if (offset) { + buf->buflen = RPCSVC_MAXPAYLOAD; + xdr_shift_buf(buf, offset); + fix_priv_head(buf, pad); + } + if (maj_stat != GSS_S_COMPLETE) + return -EINVAL; +out_seq: + if (svc_getnl(&buf->head[0]) != seq) + return -EINVAL; + return 0; +} + +struct gss_svc_data { + /* decoded gss client cred: */ + struct rpc_gss_wire_cred clcred; + /* save a pointer to the beginning of the encoded verifier, + * for use in encryption/checksumming in svcauth_gss_release: */ + __be32 *verf_start; + struct rsc *rsci; +}; + +char *svc_gss_principal(struct svc_rqst *rqstp) +{ + struct gss_svc_data *gd = (struct gss_svc_data *)rqstp->rq_auth_data; + + if (gd && gd->rsci) + return gd->rsci->client_name; + return NULL; +} +EXPORT_SYMBOL_GPL(svc_gss_principal); + +static int +svcauth_gss_set_client(struct svc_rqst *rqstp) +{ + struct gss_svc_data *svcdata = rqstp->rq_auth_data; + struct rsc *rsci = svcdata->rsci; + struct rpc_gss_wire_cred *gc = &svcdata->clcred; + int stat; + + /* + * A gss export can be specified either by: + * export *(sec=krb5,rw) + * or by + * export gss/krb5(rw) + * The latter is deprecated; but for backwards compatibility reasons + * the nfsd code will still fall back on trying it if the former + * doesn't work; so we try to make both available to nfsd, below. + */ + rqstp->rq_gssclient = find_gss_auth_domain(rsci->mechctx, gc->gc_svc); + if (rqstp->rq_gssclient == NULL) + return SVC_DENIED; + stat = svcauth_unix_set_client(rqstp); + if (stat == SVC_DROP || stat == SVC_CLOSE) + return stat; + return SVC_OK; +} + +static inline int +gss_write_init_verf(struct svc_rqst *rqstp, struct rsi *rsip) +{ + struct rsc *rsci; + int rc; + + if (rsip->major_status != GSS_S_COMPLETE) + return gss_write_null_verf(rqstp); + rsci = gss_svc_searchbyctx(&rsip->out_handle); + if (rsci == NULL) { + rsip->major_status = GSS_S_NO_CONTEXT; + return gss_write_null_verf(rqstp); + } + rc = gss_write_verf(rqstp, rsci->mechctx, GSS_SEQ_WIN); + cache_put(&rsci->h, &rsc_cache); + return rc; +} + +/* + * Having read the cred already and found we're in the context + * initiation case, read the verifier and initiate (or check the results + * of) upcalls to userspace for help with context initiation. If + * the upcall results are available, write the verifier and result. + * Otherwise, drop the request pending an answer to the upcall. 
+ */ +static int svcauth_gss_handle_init(struct svc_rqst *rqstp, + struct rpc_gss_wire_cred *gc, __be32 *authp) +{ + struct kvec *argv = &rqstp->rq_arg.head[0]; + struct kvec *resv = &rqstp->rq_res.head[0]; + struct xdr_netobj tmpobj; + struct rsi *rsip, rsikey; + int ret; + + /* Read the verifier; should be NULL: */ + *authp = rpc_autherr_badverf; + if (argv->iov_len < 2 * 4) + return SVC_DENIED; + if (svc_getnl(argv) != RPC_AUTH_NULL) + return SVC_DENIED; + if (svc_getnl(argv) != 0) + return SVC_DENIED; + + /* Martial context handle and token for upcall: */ + *authp = rpc_autherr_badcred; + if (gc->gc_proc == RPC_GSS_PROC_INIT && gc->gc_ctx.len != 0) + return SVC_DENIED; + memset(&rsikey, 0, sizeof(rsikey)); + if (dup_netobj(&rsikey.in_handle, &gc->gc_ctx)) + return SVC_CLOSE; + *authp = rpc_autherr_badverf; + if (svc_safe_getnetobj(argv, &tmpobj)) { + kfree(rsikey.in_handle.data); + return SVC_DENIED; + } + if (dup_netobj(&rsikey.in_token, &tmpobj)) { + kfree(rsikey.in_handle.data); + return SVC_CLOSE; + } + + /* Perform upcall, or find upcall result: */ + rsip = rsi_lookup(&rsikey); + rsi_free(&rsikey); + if (!rsip) + return SVC_CLOSE; + if (cache_check(&rsi_cache, &rsip->h, &rqstp->rq_chandle) < 0) + /* No upcall result: */ + return SVC_CLOSE; + + ret = SVC_CLOSE; + /* Got an answer to the upcall; use it: */ + if (gss_write_init_verf(rqstp, rsip)) + goto out; + if (resv->iov_len + 4 > PAGE_SIZE) + goto out; + svc_putnl(resv, RPC_SUCCESS); + if (svc_safe_putnetobj(resv, &rsip->out_handle)) + goto out; + if (resv->iov_len + 3 * 4 > PAGE_SIZE) + goto out; + svc_putnl(resv, rsip->major_status); + svc_putnl(resv, rsip->minor_status); + svc_putnl(resv, GSS_SEQ_WIN); + if (svc_safe_putnetobj(resv, &rsip->out_token)) + goto out; + + ret = SVC_COMPLETE; +out: + cache_put(&rsip->h, &rsi_cache); + return ret; +} + +/* + * Accept an rpcsec packet. + * If context establishment, punt to user space + * If data exchange, verify/decrypt + * If context destruction, handle here + * In the context establishment and destruction case we encode + * response here and return SVC_COMPLETE. + */ +static int +svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) +{ + struct kvec *argv = &rqstp->rq_arg.head[0]; + struct kvec *resv = &rqstp->rq_res.head[0]; + u32 crlen; + struct gss_svc_data *svcdata = rqstp->rq_auth_data; + struct rpc_gss_wire_cred *gc; + struct rsc *rsci = NULL; + __be32 *rpcstart; + __be32 *reject_stat = resv->iov_base + resv->iov_len; + int ret; + + dprintk("RPC: svcauth_gss: argv->iov_len = %zd\n", + argv->iov_len); + + *authp = rpc_autherr_badcred; + if (!svcdata) + svcdata = kmalloc(sizeof(*svcdata), GFP_KERNEL); + if (!svcdata) + goto auth_err; + rqstp->rq_auth_data = svcdata; + svcdata->verf_start = NULL; + svcdata->rsci = NULL; + gc = &svcdata->clcred; + + /* start of rpc packet is 7 u32's back from here: + * xid direction rpcversion prog vers proc flavour + */ + rpcstart = argv->iov_base; + rpcstart -= 7; + + /* credential is: + * version(==1), proc(0,1,2,3), seq, service (1,2,3), handle + * at least 5 u32s, and is preceded by length, so that makes 6. 
+ */ + + if (argv->iov_len < 5 * 4) + goto auth_err; + crlen = svc_getnl(argv); + if (svc_getnl(argv) != RPC_GSS_VERSION) + goto auth_err; + gc->gc_proc = svc_getnl(argv); + gc->gc_seq = svc_getnl(argv); + gc->gc_svc = svc_getnl(argv); + if (svc_safe_getnetobj(argv, &gc->gc_ctx)) + goto auth_err; + if (crlen != round_up_to_quad(gc->gc_ctx.len) + 5 * 4) + goto auth_err; + + if ((gc->gc_proc != RPC_GSS_PROC_DATA) && (rqstp->rq_proc != 0)) + goto auth_err; + + *authp = rpc_autherr_badverf; + switch (gc->gc_proc) { + case RPC_GSS_PROC_INIT: + case RPC_GSS_PROC_CONTINUE_INIT: + return svcauth_gss_handle_init(rqstp, gc, authp); + case RPC_GSS_PROC_DATA: + case RPC_GSS_PROC_DESTROY: + /* Look up the context, and check the verifier: */ + *authp = rpcsec_gsserr_credproblem; + rsci = gss_svc_searchbyctx(&gc->gc_ctx); + if (!rsci) + goto auth_err; + switch (gss_verify_header(rqstp, rsci, rpcstart, gc, authp)) { + case SVC_OK: + break; + case SVC_DENIED: + goto auth_err; + case SVC_DROP: + goto drop; + } + break; + default: + *authp = rpc_autherr_rejectedcred; + goto auth_err; + } + + /* now act upon the command: */ + switch (gc->gc_proc) { + case RPC_GSS_PROC_DESTROY: + if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) + goto auth_err; + rsci->h.expiry_time = get_seconds(); + set_bit(CACHE_NEGATIVE, &rsci->h.flags); + if (resv->iov_len + 4 > PAGE_SIZE) + goto drop; + svc_putnl(resv, RPC_SUCCESS); + goto complete; + case RPC_GSS_PROC_DATA: + *authp = rpcsec_gsserr_ctxproblem; + svcdata->verf_start = resv->iov_base + resv->iov_len; + if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) + goto auth_err; + rqstp->rq_cred = rsci->cred; + get_group_info(rsci->cred.cr_group_info); + *authp = rpc_autherr_badcred; + switch (gc->gc_svc) { + case RPC_GSS_SVC_NONE: + break; + case RPC_GSS_SVC_INTEGRITY: + /* placeholders for length and seq. number: */ + svc_putnl(resv, 0); + svc_putnl(resv, 0); + if (unwrap_integ_data(&rqstp->rq_arg, + gc->gc_seq, rsci->mechctx)) + goto garbage_args; + break; + case RPC_GSS_SVC_PRIVACY: + /* placeholders for length and seq. 
number: */ + svc_putnl(resv, 0); + svc_putnl(resv, 0); + if (unwrap_priv_data(rqstp, &rqstp->rq_arg, + gc->gc_seq, rsci->mechctx)) + goto garbage_args; + break; + default: + goto auth_err; + } + svcdata->rsci = rsci; + cache_get(&rsci->h); + rqstp->rq_flavor = gss_svc_to_pseudoflavor( + rsci->mechctx->mech_type, gc->gc_svc); + ret = SVC_OK; + goto out; + } +garbage_args: + ret = SVC_GARBAGE; + goto out; +auth_err: + /* Restore write pointer to its original value: */ + xdr_ressize_check(rqstp, reject_stat); + ret = SVC_DENIED; + goto out; +complete: + ret = SVC_COMPLETE; + goto out; +drop: + ret = SVC_DROP; +out: + if (rsci) + cache_put(&rsci->h, &rsc_cache); + return ret; +} + +static __be32 * +svcauth_gss_prepare_to_wrap(struct xdr_buf *resbuf, struct gss_svc_data *gsd) +{ + __be32 *p; + u32 verf_len; + + p = gsd->verf_start; + gsd->verf_start = NULL; + + /* If the reply stat is nonzero, don't wrap: */ + if (*(p-1) != rpc_success) + return NULL; + /* Skip the verifier: */ + p += 1; + verf_len = ntohl(*p++); + p += XDR_QUADLEN(verf_len); + /* move accept_stat to right place: */ + memcpy(p, p + 2, 4); + /* Also don't wrap if the accept stat is nonzero: */ + if (*p != rpc_success) { + resbuf->head[0].iov_len -= 2 * 4; + return NULL; + } + p++; + return p; +} + +static inline int +svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp) +{ + struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data; + struct rpc_gss_wire_cred *gc = &gsd->clcred; + struct xdr_buf *resbuf = &rqstp->rq_res; + struct xdr_buf integ_buf; + struct xdr_netobj mic; + struct kvec *resv; + __be32 *p; + int integ_offset, integ_len; + int stat = -EINVAL; + + p = svcauth_gss_prepare_to_wrap(resbuf, gsd); + if (p == NULL) + goto out; + integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base; + integ_len = resbuf->len - integ_offset; + BUG_ON(integ_len % 4); + *p++ = htonl(integ_len); + *p++ = htonl(gc->gc_seq); + if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, + integ_len)) + BUG(); + if (resbuf->tail[0].iov_base == NULL) { + if (resbuf->head[0].iov_len + RPC_MAX_AUTH_SIZE > PAGE_SIZE) + goto out_err; + resbuf->tail[0].iov_base = resbuf->head[0].iov_base + + resbuf->head[0].iov_len; + resbuf->tail[0].iov_len = 0; + resv = &resbuf->tail[0]; + } else { + resv = &resbuf->tail[0]; + } + mic.data = (u8 *)resv->iov_base + resv->iov_len + 4; + if (gss_get_mic(gsd->rsci->mechctx, &integ_buf, &mic)) + goto out_err; + svc_putnl(resv, mic.len); + memset(mic.data + mic.len, 0, + round_up_to_quad(mic.len) - mic.len); + resv->iov_len += XDR_QUADLEN(mic.len) << 2; + /* not strictly required: */ + resbuf->len += XDR_QUADLEN(mic.len) << 2; + BUG_ON(resv->iov_len > PAGE_SIZE); +out: + stat = 0; +out_err: + return stat; +} + +static inline int +svcauth_gss_wrap_resp_priv(struct svc_rqst *rqstp) +{ + struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data; + struct rpc_gss_wire_cred *gc = &gsd->clcred; + struct xdr_buf *resbuf = &rqstp->rq_res; + struct page **inpages = NULL; + __be32 *p, *len; + int offset; + int pad; + + p = svcauth_gss_prepare_to_wrap(resbuf, gsd); + if (p == NULL) + return 0; + len = p++; + offset = (u8 *)p - (u8 *)resbuf->head[0].iov_base; + *p++ = htonl(gc->gc_seq); + inpages = resbuf->pages; + /* XXX: Would be better to write some xdr helper functions for + * nfs{2,3,4}xdr.c that place the data right, instead of copying: */ + + /* + * If there is currently tail data, make sure there is + * room for the head, tail, and 2 * RPC_MAX_AUTH_SIZE in + * the page, and move the current tail 
data such that + * there is RPC_MAX_AUTH_SIZE slack space available in + * both the head and tail. + */ + if (resbuf->tail[0].iov_base) { + BUG_ON(resbuf->tail[0].iov_base >= resbuf->head[0].iov_base + + PAGE_SIZE); + BUG_ON(resbuf->tail[0].iov_base < resbuf->head[0].iov_base); + if (resbuf->tail[0].iov_len + resbuf->head[0].iov_len + + 2 * RPC_MAX_AUTH_SIZE > PAGE_SIZE) + return -ENOMEM; + memmove(resbuf->tail[0].iov_base + RPC_MAX_AUTH_SIZE, + resbuf->tail[0].iov_base, + resbuf->tail[0].iov_len); + resbuf->tail[0].iov_base += RPC_MAX_AUTH_SIZE; + } + /* + * If there is no current tail data, make sure there is + * room for the head data, and 2 * RPC_MAX_AUTH_SIZE in the + * allotted page, and set up tail information such that there + * is RPC_MAX_AUTH_SIZE slack space available in both the + * head and tail. + */ + if (resbuf->tail[0].iov_base == NULL) { + if (resbuf->head[0].iov_len + 2*RPC_MAX_AUTH_SIZE > PAGE_SIZE) + return -ENOMEM; + resbuf->tail[0].iov_base = resbuf->head[0].iov_base + + resbuf->head[0].iov_len + RPC_MAX_AUTH_SIZE; + resbuf->tail[0].iov_len = 0; + } + if (gss_wrap(gsd->rsci->mechctx, offset, resbuf, inpages)) + return -ENOMEM; + *len = htonl(resbuf->len - offset); + pad = 3 - ((resbuf->len - offset - 1)&3); + p = (__be32 *)(resbuf->tail[0].iov_base + resbuf->tail[0].iov_len); + memset(p, 0, pad); + resbuf->tail[0].iov_len += pad; + resbuf->len += pad; + return 0; +} + +static int +svcauth_gss_release(struct svc_rqst *rqstp) +{ + struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data; + struct rpc_gss_wire_cred *gc = &gsd->clcred; + struct xdr_buf *resbuf = &rqstp->rq_res; + int stat = -EINVAL; + + if (gc->gc_proc != RPC_GSS_PROC_DATA) + goto out; + /* Release can be called twice, but we only wrap once. */ + if (gsd->verf_start == NULL) + goto out; + /* normally not set till svc_send, but we need it here: */ + /* XXX: what for? Do we mess it up the moment we call svc_putu32 + * or whatever? 
*/ + resbuf->len = total_buf_len(resbuf); + switch (gc->gc_svc) { + case RPC_GSS_SVC_NONE: + break; + case RPC_GSS_SVC_INTEGRITY: + stat = svcauth_gss_wrap_resp_integ(rqstp); + if (stat) + goto out_err; + break; + case RPC_GSS_SVC_PRIVACY: + stat = svcauth_gss_wrap_resp_priv(rqstp); + if (stat) + goto out_err; + break; + /* + * For any other gc_svc value, svcauth_gss_accept() already set + * the auth_error appropriately; just fall through: + */ + } + +out: + stat = 0; +out_err: + if (rqstp->rq_client) + auth_domain_put(rqstp->rq_client); + rqstp->rq_client = NULL; + if (rqstp->rq_gssclient) + auth_domain_put(rqstp->rq_gssclient); + rqstp->rq_gssclient = NULL; + if (rqstp->rq_cred.cr_group_info) + put_group_info(rqstp->rq_cred.cr_group_info); + rqstp->rq_cred.cr_group_info = NULL; + if (gsd->rsci) + cache_put(&gsd->rsci->h, &rsc_cache); + gsd->rsci = NULL; + + return stat; +} + +static void +svcauth_gss_domain_release(struct auth_domain *dom) +{ + struct gss_domain *gd = container_of(dom, struct gss_domain, h); + + kfree(dom->name); + kfree(gd); +} + +static struct auth_ops svcauthops_gss = { + .name = "rpcsec_gss", + .owner = THIS_MODULE, + .flavour = RPC_AUTH_GSS, + .accept = svcauth_gss_accept, + .release = svcauth_gss_release, + .domain_release = svcauth_gss_domain_release, + .set_client = svcauth_gss_set_client, +}; + +int +gss_svc_init(void) +{ + int rv = svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss); + if (rv) + return rv; + rv = cache_register(&rsc_cache); + if (rv) + goto out1; + rv = cache_register(&rsi_cache); + if (rv) + goto out2; + return 0; +out2: + cache_unregister(&rsc_cache); +out1: + svc_auth_unregister(RPC_AUTH_GSS); + return rv; +} + +void +gss_svc_shutdown(void) +{ + cache_unregister(&rsc_cache); + cache_unregister(&rsi_cache); + svc_auth_unregister(RPC_AUTH_GSS); +} diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c new file mode 100644 index 00000000..a5c36c01 --- /dev/null +++ b/net/sunrpc/auth_null.c @@ -0,0 +1,142 @@ +/* + * linux/net/sunrpc/auth_null.c + * + * AUTH_NULL authentication. Really :-) + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +static struct rpc_auth null_auth; +static struct rpc_cred null_cred; + +static struct rpc_auth * +nul_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor) +{ + atomic_inc(&null_auth.au_count); + return &null_auth; +} + +static void +nul_destroy(struct rpc_auth *auth) +{ +} + +/* + * Lookup NULL creds for current process + */ +static struct rpc_cred * +nul_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +{ + return get_rpccred(&null_cred); +} + +/* + * Destroy cred handle. + */ +static void +nul_destroy_cred(struct rpc_cred *cred) +{ +} + +/* + * Match cred handle against current process + */ +static int +nul_match(struct auth_cred *acred, struct rpc_cred *cred, int taskflags) +{ + return 1; +} + +/* + * Marshal credential. + */ +static __be32 * +nul_marshal(struct rpc_task *task, __be32 *p) +{ + *p++ = htonl(RPC_AUTH_NULL); + *p++ = 0; + *p++ = htonl(RPC_AUTH_NULL); + *p++ = 0; + + return p; +} + +/* + * Refresh credential. 
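/*
 * Minimal user-space sketch of the 16-byte credential-plus-verifier that
 * nul_marshal() above emits: {AUTH_NULL, length 0} for the credential and
 * {AUTH_NULL, length 0} for the verifier.  The constant and function name
 * here are invented for illustration; only the word layout mirrors the code.
 */
#include <arpa/inet.h>
#include <stdint.h>

#define AUTH_NULL_FLAVOR 0			/* RPC_AUTH_NULL on the wire */

static uint32_t *marshal_auth_null(uint32_t *p)
{
	*p++ = htonl(AUTH_NULL_FLAVOR);		/* credential flavor */
	*p++ = htonl(0);			/* credential body length */
	*p++ = htonl(AUTH_NULL_FLAVOR);		/* verifier flavor */
	*p++ = htonl(0);			/* verifier body length */
	return p;				/* 4 XDR words = 16 bytes of send slack */
}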
This is a no-op for AUTH_NULL + */ +static int +nul_refresh(struct rpc_task *task) +{ + set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_rqstp->rq_cred->cr_flags); + return 0; +} + +static __be32 * +nul_validate(struct rpc_task *task, __be32 *p) +{ + rpc_authflavor_t flavor; + u32 size; + + flavor = ntohl(*p++); + if (flavor != RPC_AUTH_NULL) { + printk("RPC: bad verf flavor: %u\n", flavor); + return NULL; + } + + size = ntohl(*p++); + if (size != 0) { + printk("RPC: bad verf size: %u\n", size); + return NULL; + } + + return p; +} + +const struct rpc_authops authnull_ops = { + .owner = THIS_MODULE, + .au_flavor = RPC_AUTH_NULL, + .au_name = "NULL", + .create = nul_create, + .destroy = nul_destroy, + .lookup_cred = nul_lookup_cred, +}; + +static +struct rpc_auth null_auth = { + .au_cslack = 4, + .au_rslack = 2, + .au_ops = &authnull_ops, + .au_flavor = RPC_AUTH_NULL, + .au_count = ATOMIC_INIT(0), +}; + +static +const struct rpc_credops null_credops = { + .cr_name = "AUTH_NULL", + .crdestroy = nul_destroy_cred, + .crbind = rpcauth_generic_bind_cred, + .crmatch = nul_match, + .crmarshal = nul_marshal, + .crrefresh = nul_refresh, + .crvalidate = nul_validate, +}; + +static +struct rpc_cred null_cred = { + .cr_lru = LIST_HEAD_INIT(null_cred.cr_lru), + .cr_auth = &null_auth, + .cr_ops = &null_credops, + .cr_count = ATOMIC_INIT(1), + .cr_flags = 1UL << RPCAUTH_CRED_UPTODATE, +#ifdef RPC_DEBUG + .cr_magic = RPCAUTH_CRED_MAGIC, +#endif +}; diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c new file mode 100644 index 00000000..e50502d8 --- /dev/null +++ b/net/sunrpc/auth_unix.c @@ -0,0 +1,246 @@ +/* + * linux/net/sunrpc/auth_unix.c + * + * UNIX-style authentication; no AUTH_SHORT support + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include + +#define NFS_NGROUPS 16 + +struct unx_cred { + struct rpc_cred uc_base; + gid_t uc_gid; + gid_t uc_gids[NFS_NGROUPS]; +}; +#define uc_uid uc_base.cr_uid + +#define UNX_WRITESLACK (21 + (UNX_MAXNODENAME >> 2)) + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +static struct rpc_auth unix_auth; +static const struct rpc_credops unix_credops; + +static struct rpc_auth * +unx_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor) +{ + dprintk("RPC: creating UNIX authenticator for client %p\n", + clnt); + atomic_inc(&unix_auth.au_count); + return &unix_auth; +} + +static void +unx_destroy(struct rpc_auth *auth) +{ + dprintk("RPC: destroying UNIX authenticator %p\n", auth); + rpcauth_clear_credcache(auth->au_credcache); +} + +/* + * Lookup AUTH_UNIX creds for current process + */ +static struct rpc_cred * +unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +{ + return rpcauth_lookup_credcache(auth, acred, flags); +} + +static struct rpc_cred * +unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +{ + struct unx_cred *cred; + unsigned int groups = 0; + unsigned int i; + + dprintk("RPC: allocating UNIX cred for uid %d gid %d\n", + acred->uid, acred->gid); + + if (!(cred = kmalloc(sizeof(*cred), GFP_NOFS))) + return ERR_PTR(-ENOMEM); + + rpcauth_init_cred(&cred->uc_base, acred, auth, &unix_credops); + cred->uc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE; + + if (acred->group_info != NULL) + groups = acred->group_info->ngroups; + if (groups > NFS_NGROUPS) + groups = NFS_NGROUPS; + + cred->uc_gid = acred->gid; + for (i = 0; i < groups; i++) + cred->uc_gids[i] = GROUP_AT(acred->group_info, i); + if (i < NFS_NGROUPS) + cred->uc_gids[i] = NOGROUP; + + return 
&cred->uc_base; +} + +static void +unx_free_cred(struct unx_cred *unx_cred) +{ + dprintk("RPC: unx_free_cred %p\n", unx_cred); + kfree(unx_cred); +} + +static void +unx_free_cred_callback(struct rcu_head *head) +{ + struct unx_cred *unx_cred = container_of(head, struct unx_cred, uc_base.cr_rcu); + unx_free_cred(unx_cred); +} + +static void +unx_destroy_cred(struct rpc_cred *cred) +{ + call_rcu(&cred->cr_rcu, unx_free_cred_callback); +} + +/* + * Match credentials against current process creds. + * The root_override argument takes care of cases where the caller may + * request root creds (e.g. for NFS swapping). + */ +static int +unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags) +{ + struct unx_cred *cred = container_of(rcred, struct unx_cred, uc_base); + unsigned int groups = 0; + unsigned int i; + + + if (cred->uc_uid != acred->uid || cred->uc_gid != acred->gid) + return 0; + + if (acred->group_info != NULL) + groups = acred->group_info->ngroups; + if (groups > NFS_NGROUPS) + groups = NFS_NGROUPS; + for (i = 0; i < groups ; i++) + if (cred->uc_gids[i] != GROUP_AT(acred->group_info, i)) + return 0; + if (groups < NFS_NGROUPS && + cred->uc_gids[groups] != NOGROUP) + return 0; + return 1; +} + +/* + * Marshal credentials. + * Maybe we should keep a cached credential for performance reasons. + */ +static __be32 * +unx_marshal(struct rpc_task *task, __be32 *p) +{ + struct rpc_clnt *clnt = task->tk_client; + struct unx_cred *cred = container_of(task->tk_rqstp->rq_cred, struct unx_cred, uc_base); + __be32 *base, *hold; + int i; + + *p++ = htonl(RPC_AUTH_UNIX); + base = p++; + *p++ = htonl(jiffies/HZ); + + /* + * Copy the UTS nodename captured when the client was created. + */ + p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen); + + *p++ = htonl((u32) cred->uc_uid); + *p++ = htonl((u32) cred->uc_gid); + hold = p++; + for (i = 0; i < 16 && cred->uc_gids[i] != (gid_t) NOGROUP; i++) + *p++ = htonl((u32) cred->uc_gids[i]); + *hold = htonl(p - hold - 1); /* gid array length */ + *base = htonl((p - base - 1) << 2); /* cred length */ + + *p++ = htonl(RPC_AUTH_NULL); + *p++ = htonl(0); + + return p; +} + +/* + * Refresh credentials. 
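/*
 * Sketch of the credential-body size that unx_marshal() above produces, and
 * the reason the length word is back-patched: the supplemental gid count is
 * only known once the NOGROUP terminator is hit.  The body is stamp +
 * nodename (length word plus quad-aligned bytes) + uid + gid + gid count +
 * one word per gid.  The helper below is purely illustrative.
 */
#include <stddef.h>

#define XDR_UNIT	4
#define QUADLEN(n)	(((n) + XDR_UNIT - 1) / XDR_UNIT)

static size_t authunix_cred_body_bytes(size_t nodename_len, unsigned int ngroups)
{
	size_t words = 1			/* stamp */
		     + 1 + QUADLEN(nodename_len)/* nodename length + padded data */
		     + 1 + 1			/* uid, gid */
		     + 1 + ngroups;		/* gid count + gids (capped at NFS_NGROUPS) */
	return words * XDR_UNIT;		/* the value back-patched into *base */
}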
This is a no-op for AUTH_UNIX + */ +static int +unx_refresh(struct rpc_task *task) +{ + set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_rqstp->rq_cred->cr_flags); + return 0; +} + +static __be32 * +unx_validate(struct rpc_task *task, __be32 *p) +{ + rpc_authflavor_t flavor; + u32 size; + + flavor = ntohl(*p++); + if (flavor != RPC_AUTH_NULL && + flavor != RPC_AUTH_UNIX && + flavor != RPC_AUTH_SHORT) { + printk("RPC: bad verf flavor: %u\n", flavor); + return NULL; + } + + size = ntohl(*p++); + if (size > RPC_MAX_AUTH_SIZE) { + printk("RPC: giant verf size: %u\n", size); + return NULL; + } + task->tk_rqstp->rq_cred->cr_auth->au_rslack = (size >> 2) + 2; + p += (size >> 2); + + return p; +} + +int __init rpc_init_authunix(void) +{ + return rpcauth_init_credcache(&unix_auth); +} + +void rpc_destroy_authunix(void) +{ + rpcauth_destroy_credcache(&unix_auth); +} + +const struct rpc_authops authunix_ops = { + .owner = THIS_MODULE, + .au_flavor = RPC_AUTH_UNIX, + .au_name = "UNIX", + .create = unx_create, + .destroy = unx_destroy, + .lookup_cred = unx_lookup_cred, + .crcreate = unx_create_cred, +}; + +static +struct rpc_auth unix_auth = { + .au_cslack = UNX_WRITESLACK, + .au_rslack = 2, /* assume AUTH_NULL verf */ + .au_ops = &authunix_ops, + .au_flavor = RPC_AUTH_UNIX, + .au_count = ATOMIC_INIT(0), +}; + +static +const struct rpc_credops unix_credops = { + .cr_name = "AUTH_UNIX", + .crdestroy = unx_destroy_cred, + .crbind = rpcauth_generic_bind_cred, + .crmatch = unx_match, + .crmarshal = unx_marshal, + .crrefresh = unx_refresh, + .crvalidate = unx_validate, +}; diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c new file mode 100644 index 00000000..cf06af3b --- /dev/null +++ b/net/sunrpc/backchannel_rqst.c @@ -0,0 +1,282 @@ +/****************************************************************************** + +(c) 2007 Network Appliance, Inc. All Rights Reserved. +(c) 2009 NetApp. All Rights Reserved. + +NetApp provides this source code under the GPL v2 License. +The GPL v2 license is available at +http://opensource.org/licenses/gpl-license.php. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +******************************************************************************/ + +#include +#include +#include + +#ifdef RPC_DEBUG +#define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +#if defined(CONFIG_NFS_V4_1) + +/* + * Helper routines that track the number of preallocation elements + * on the transport. 
+ */ +static inline int xprt_need_to_requeue(struct rpc_xprt *xprt) +{ + return xprt->bc_alloc_count > 0; +} + +static inline void xprt_inc_alloc_count(struct rpc_xprt *xprt, unsigned int n) +{ + xprt->bc_alloc_count += n; +} + +static inline int xprt_dec_alloc_count(struct rpc_xprt *xprt, unsigned int n) +{ + return xprt->bc_alloc_count -= n; +} + +/* + * Free the preallocated rpc_rqst structure and the memory + * buffers hanging off of it. + */ +static void xprt_free_allocation(struct rpc_rqst *req) +{ + struct xdr_buf *xbufp; + + dprintk("RPC: free allocations for req= %p\n", req); + BUG_ON(test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state)); + xbufp = &req->rq_private_buf; + free_page((unsigned long)xbufp->head[0].iov_base); + xbufp = &req->rq_snd_buf; + free_page((unsigned long)xbufp->head[0].iov_base); + list_del(&req->rq_bc_pa_list); + kfree(req); +} + +/* + * Preallocate up to min_reqs structures and related buffers for use + * by the backchannel. This function can be called multiple times + * when creating new sessions that use the same rpc_xprt. The + * preallocated buffers are added to the pool of resources used by + * the rpc_xprt. Anyone of these resources may be used used by an + * incoming callback request. It's up to the higher levels in the + * stack to enforce that the maximum number of session slots is not + * being exceeded. + * + * Some callback arguments can be large. For example, a pNFS server + * using multiple deviceids. The list can be unbound, but the client + * has the ability to tell the server the maximum size of the callback + * requests. Each deviceID is 16 bytes, so allocate one page + * for the arguments to have enough room to receive a number of these + * deviceIDs. The NFS client indicates to the pNFS server that its + * callback requests can be up to 4096 bytes in size. + */ +int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs) +{ + struct page *page_rcv = NULL, *page_snd = NULL; + struct xdr_buf *xbufp = NULL; + struct rpc_rqst *req, *tmp; + struct list_head tmp_list; + int i; + + dprintk("RPC: setup backchannel transport\n"); + + /* + * We use a temporary list to keep track of the preallocated + * buffers. Once we're done building the list we splice it + * into the backchannel preallocation list off of the rpc_xprt + * struct. This helps minimize the amount of time the list + * lock is held on the rpc_xprt struct. It also makes cleanup + * easier in case of memory allocation errors. 
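/*
 * Compact user-space sketch of the pattern the comment above describes:
 * perform all allocations on a private temporary list with no lock held,
 * then take the shared lock once to publish the finished list; the error
 * path only ever touches the private list.  All types and names here are
 * invented for illustration and are not the kernel API.
 */
#include <pthread.h>
#include <stdlib.h>

struct bc_req {
	struct bc_req *next;
};

struct bc_pool {
	pthread_mutex_t lock;
	struct bc_req *head;
	unsigned int count;
};

static int bc_pool_fill(struct bc_pool *pool, unsigned int n)
{
	struct bc_req *tmp = NULL, *req;
	unsigned int i;

	for (i = 0; i < n; i++) {		/* allocate with no lock held */
		req = calloc(1, sizeof(*req));
		if (!req)
			goto out_free;
		req->next = tmp;
		tmp = req;
	}

	pthread_mutex_lock(&pool->lock);	/* one short critical section */
	while (tmp) {
		req = tmp;
		tmp = tmp->next;
		req->next = pool->head;
		pool->head = req;
		pool->count++;
	}
	pthread_mutex_unlock(&pool->lock);
	return 0;

out_free:
	while (tmp) {				/* cleanup never needs the lock */
		req = tmp;
		tmp = tmp->next;
		free(req);
	}
	return -1;
}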
+ */ + INIT_LIST_HEAD(&tmp_list); + for (i = 0; i < min_reqs; i++) { + /* Pre-allocate one backchannel rpc_rqst */ + req = kzalloc(sizeof(struct rpc_rqst), GFP_KERNEL); + if (req == NULL) { + printk(KERN_ERR "Failed to create bc rpc_rqst\n"); + goto out_free; + } + + /* Add the allocated buffer to the tmp list */ + dprintk("RPC: adding req= %p\n", req); + list_add(&req->rq_bc_pa_list, &tmp_list); + + req->rq_xprt = xprt; + INIT_LIST_HEAD(&req->rq_list); + INIT_LIST_HEAD(&req->rq_bc_list); + + /* Preallocate one XDR receive buffer */ + page_rcv = alloc_page(GFP_KERNEL); + if (page_rcv == NULL) { + printk(KERN_ERR "Failed to create bc receive xbuf\n"); + goto out_free; + } + xbufp = &req->rq_rcv_buf; + xbufp->head[0].iov_base = page_address(page_rcv); + xbufp->head[0].iov_len = PAGE_SIZE; + xbufp->tail[0].iov_base = NULL; + xbufp->tail[0].iov_len = 0; + xbufp->page_len = 0; + xbufp->len = PAGE_SIZE; + xbufp->buflen = PAGE_SIZE; + + /* Preallocate one XDR send buffer */ + page_snd = alloc_page(GFP_KERNEL); + if (page_snd == NULL) { + printk(KERN_ERR "Failed to create bc snd xbuf\n"); + goto out_free; + } + + xbufp = &req->rq_snd_buf; + xbufp->head[0].iov_base = page_address(page_snd); + xbufp->head[0].iov_len = 0; + xbufp->tail[0].iov_base = NULL; + xbufp->tail[0].iov_len = 0; + xbufp->page_len = 0; + xbufp->len = 0; + xbufp->buflen = PAGE_SIZE; + } + + /* + * Add the temporary list to the backchannel preallocation list + */ + spin_lock_bh(&xprt->bc_pa_lock); + list_splice(&tmp_list, &xprt->bc_pa_list); + xprt_inc_alloc_count(xprt, min_reqs); + spin_unlock_bh(&xprt->bc_pa_lock); + + dprintk("RPC: setup backchannel transport done\n"); + return 0; + +out_free: + /* + * Memory allocation failed, free the temporary list + */ + list_for_each_entry_safe(req, tmp, &tmp_list, rq_bc_pa_list) + xprt_free_allocation(req); + + dprintk("RPC: setup backchannel transport failed\n"); + return -1; +} +EXPORT_SYMBOL(xprt_setup_backchannel); + +/* + * Destroys the backchannel preallocated structures. + * Since these structures may have been allocated by multiple calls + * to xprt_setup_backchannel, we only destroy up to the maximum number + * of reqs specified by the caller. + * @xprt: the transport holding the preallocated strucures + * @max_reqs the maximum number of preallocated structures to destroy + */ +void xprt_destroy_backchannel(struct rpc_xprt *xprt, unsigned int max_reqs) +{ + struct rpc_rqst *req = NULL, *tmp = NULL; + + dprintk("RPC: destroy backchannel transport\n"); + + BUG_ON(max_reqs == 0); + spin_lock_bh(&xprt->bc_pa_lock); + xprt_dec_alloc_count(xprt, max_reqs); + list_for_each_entry_safe(req, tmp, &xprt->bc_pa_list, rq_bc_pa_list) { + dprintk("RPC: req=%p\n", req); + xprt_free_allocation(req); + if (--max_reqs == 0) + break; + } + spin_unlock_bh(&xprt->bc_pa_lock); + + dprintk("RPC: backchannel list empty= %s\n", + list_empty(&xprt->bc_pa_list) ? "true" : "false"); +} +EXPORT_SYMBOL(xprt_destroy_backchannel); + +/* + * One or more rpc_rqst structure have been preallocated during the + * backchannel setup. Buffer space for the send and private XDR buffers + * has been preallocated as well. Use xprt_alloc_bc_request to allocate + * to this request. Use xprt_free_bc_request to return it. + * + * We know that we're called in soft interrupt context, grab the spin_lock + * since there is no need to grab the bottom half spin_lock. + * + * Return an available rpc_rqst, otherwise NULL if non are available. 
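/*
 * Sketch of the allocation side just described: pop one preallocated entry
 * under the lock, or return NULL so the caller can drop the request.  Any
 * per-entry state reset happens after the lock is released.  Names are
 * illustrative only.
 */
#include <pthread.h>

struct pa_entry {
	struct pa_entry *next;
	int in_use;
};

struct pa_list {
	pthread_mutex_t lock;
	struct pa_entry *head;
};

static struct pa_entry *pa_alloc(struct pa_list *pl)
{
	struct pa_entry *e;

	pthread_mutex_lock(&pl->lock);
	e = pl->head;
	if (e)
		pl->head = e->next;
	pthread_mutex_unlock(&pl->lock);

	if (e)
		e->in_use = 1;		/* marked outside the critical section */
	return e;
}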
+ */ +struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt) +{ + struct rpc_rqst *req; + + dprintk("RPC: allocate a backchannel request\n"); + spin_lock(&xprt->bc_pa_lock); + if (!list_empty(&xprt->bc_pa_list)) { + req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst, + rq_bc_pa_list); + list_del(&req->rq_bc_pa_list); + } else { + req = NULL; + } + spin_unlock(&xprt->bc_pa_lock); + + if (req != NULL) { + set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); + req->rq_reply_bytes_recvd = 0; + req->rq_bytes_sent = 0; + memcpy(&req->rq_private_buf, &req->rq_rcv_buf, + sizeof(req->rq_private_buf)); + } + dprintk("RPC: backchannel req=%p\n", req); + return req; +} + +/* + * Return the preallocated rpc_rqst structure and XDR buffers + * associated with this rpc_task. + */ +void xprt_free_bc_request(struct rpc_rqst *req) +{ + struct rpc_xprt *xprt = req->rq_xprt; + + dprintk("RPC: free backchannel req=%p\n", req); + + smp_mb__before_clear_bit(); + BUG_ON(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state)); + clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); + smp_mb__after_clear_bit(); + + if (!xprt_need_to_requeue(xprt)) { + /* + * The last remaining session was destroyed while this + * entry was in use. Free the entry and don't attempt + * to add back to the list because there is no need to + * have anymore preallocated entries. + */ + dprintk("RPC: Last session removed req=%p\n", req); + xprt_free_allocation(req); + return; + } + + /* + * Return it to the list of preallocations so that it + * may be reused by a new callback request. + */ + spin_lock_bh(&xprt->bc_pa_lock); + list_add(&req->rq_bc_pa_list, &xprt->bc_pa_list); + spin_unlock_bh(&xprt->bc_pa_lock); +} + +#endif /* CONFIG_NFS_V4_1 */ diff --git a/net/sunrpc/bc_svc.c b/net/sunrpc/bc_svc.c new file mode 100644 index 00000000..1dd1a689 --- /dev/null +++ b/net/sunrpc/bc_svc.c @@ -0,0 +1,66 @@ +/****************************************************************************** + +(c) 2007 Network Appliance, Inc. All Rights Reserved. +(c) 2009 NetApp. All Rights Reserved. + +NetApp provides this source code under the GPL v2 License. +The GPL v2 license is available at +http://opensource.org/licenses/gpl-license.php. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +******************************************************************************/ + +/* + * The NFSv4.1 callback service helper routines. + * They implement the transport level processing required to send the + * reply over an existing open connection previously established by the client. 
+ */ + +#if defined(CONFIG_NFS_V4_1) + +#include + +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCDSP + +/* Empty callback ops */ +static const struct rpc_call_ops nfs41_callback_ops = { +}; + + +/* + * Send the callback reply + */ +int bc_send(struct rpc_rqst *req) +{ + struct rpc_task *task; + int ret; + + dprintk("RPC: bc_send req= %p\n", req); + task = rpc_run_bc_task(req, &nfs41_callback_ops); + if (IS_ERR(task)) + ret = PTR_ERR(task); + else { + BUG_ON(atomic_read(&task->tk_count) != 1); + ret = task->tk_status; + rpc_put_task(task); + } + dprintk("RPC: bc_send ret= %d\n", ret); + return ret; +} + +#endif /* CONFIG_NFS_V4_1 */ diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c new file mode 100644 index 00000000..4530a912 --- /dev/null +++ b/net/sunrpc/cache.c @@ -0,0 +1,1812 @@ +/* + * net/sunrpc/cache.c + * + * Generic code for various authentication-related caches + * used by sunrpc clients and servers. + * + * Copyright (C) 2002 Neil Brown + * + * Released under terms in GPL version 2. See COPYING. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "netns.h" + +#define RPCDBG_FACILITY RPCDBG_CACHE + +static bool cache_defer_req(struct cache_req *req, struct cache_head *item); +static void cache_revisit_request(struct cache_head *item); + +static void cache_init(struct cache_head *h) +{ + time_t now = seconds_since_boot(); + h->next = NULL; + h->flags = 0; + kref_init(&h->ref); + h->expiry_time = now + CACHE_NEW_EXPIRY; + h->last_refresh = now; +} + +static inline int cache_is_expired(struct cache_detail *detail, struct cache_head *h) +{ + return (h->expiry_time < seconds_since_boot()) || + (detail->flush_time > h->last_refresh); +} + +struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, + struct cache_head *key, int hash) +{ + struct cache_head **head, **hp; + struct cache_head *new = NULL, *freeme = NULL; + + head = &detail->hash_table[hash]; + + read_lock(&detail->hash_lock); + + for (hp=head; *hp != NULL ; hp = &(*hp)->next) { + struct cache_head *tmp = *hp; + if (detail->match(tmp, key)) { + if (cache_is_expired(detail, tmp)) + /* This entry is expired, we will discard it. */ + break; + cache_get(tmp); + read_unlock(&detail->hash_lock); + return tmp; + } + } + read_unlock(&detail->hash_lock); + /* Didn't find anything, insert an empty entry */ + + new = detail->alloc(); + if (!new) + return NULL; + /* must fully initialise 'new', else + * we might get lose if we need to + * cache_put it soon. 
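/*
 * The lookup path here (continued just below) follows a common shape:
 * search under the read lock, allocate and fully initialise a candidate
 * with no lock held, then re-check under the write lock because another
 * thread may have inserted the same key in the meantime.  A compact
 * user-space sketch of that shape, with a single bucket and integer keys;
 * everything below is illustrative, not the cache API.
 */
#include <pthread.h>
#include <stdlib.h>

struct node { int key; struct node *next; };

static pthread_rwlock_t tbl_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct node *bucket;

static struct node *lookup_or_insert(int key)
{
	struct node *n, *new;

	pthread_rwlock_rdlock(&tbl_lock);
	for (n = bucket; n; n = n->next)
		if (n->key == key) {
			pthread_rwlock_unlock(&tbl_lock);
			return n;
		}
	pthread_rwlock_unlock(&tbl_lock);

	new = calloc(1, sizeof(*new));	/* fully initialise before publishing */
	if (!new)
		return NULL;
	new->key = key;

	pthread_rwlock_wrlock(&tbl_lock);
	for (n = bucket; n; n = n->next)
		if (n->key == key) {	/* raced: someone else inserted it */
			pthread_rwlock_unlock(&tbl_lock);
			free(new);
			return n;
		}
	new->next = bucket;
	bucket = new;
	pthread_rwlock_unlock(&tbl_lock);
	return new;
}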
+ */ + cache_init(new); + detail->init(new, key); + + write_lock(&detail->hash_lock); + + /* check if entry appeared while we slept */ + for (hp=head; *hp != NULL ; hp = &(*hp)->next) { + struct cache_head *tmp = *hp; + if (detail->match(tmp, key)) { + if (cache_is_expired(detail, tmp)) { + *hp = tmp->next; + tmp->next = NULL; + detail->entries --; + freeme = tmp; + break; + } + cache_get(tmp); + write_unlock(&detail->hash_lock); + cache_put(new, detail); + return tmp; + } + } + new->next = *head; + *head = new; + detail->entries++; + cache_get(new); + write_unlock(&detail->hash_lock); + + if (freeme) + cache_put(freeme, detail); + return new; +} +EXPORT_SYMBOL_GPL(sunrpc_cache_lookup); + + +static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch); + +static void cache_fresh_locked(struct cache_head *head, time_t expiry) +{ + head->expiry_time = expiry; + head->last_refresh = seconds_since_boot(); + smp_wmb(); /* paired with smp_rmb() in cache_is_valid() */ + set_bit(CACHE_VALID, &head->flags); +} + +static void cache_fresh_unlocked(struct cache_head *head, + struct cache_detail *detail) +{ + if (test_and_clear_bit(CACHE_PENDING, &head->flags)) { + cache_revisit_request(head); + cache_dequeue(detail, head); + } +} + +struct cache_head *sunrpc_cache_update(struct cache_detail *detail, + struct cache_head *new, struct cache_head *old, int hash) +{ + /* The 'old' entry is to be replaced by 'new'. + * If 'old' is not VALID, we update it directly, + * otherwise we need to replace it + */ + struct cache_head **head; + struct cache_head *tmp; + + if (!test_bit(CACHE_VALID, &old->flags)) { + write_lock(&detail->hash_lock); + if (!test_bit(CACHE_VALID, &old->flags)) { + if (test_bit(CACHE_NEGATIVE, &new->flags)) + set_bit(CACHE_NEGATIVE, &old->flags); + else + detail->update(old, new); + cache_fresh_locked(old, new->expiry_time); + write_unlock(&detail->hash_lock); + cache_fresh_unlocked(old, detail); + return old; + } + write_unlock(&detail->hash_lock); + } + /* We need to insert a new entry */ + tmp = detail->alloc(); + if (!tmp) { + cache_put(old, detail); + return NULL; + } + cache_init(tmp); + detail->init(tmp, old); + head = &detail->hash_table[hash]; + + write_lock(&detail->hash_lock); + if (test_bit(CACHE_NEGATIVE, &new->flags)) + set_bit(CACHE_NEGATIVE, &tmp->flags); + else + detail->update(tmp, new); + tmp->next = *head; + *head = tmp; + detail->entries++; + cache_get(tmp); + cache_fresh_locked(tmp, new->expiry_time); + cache_fresh_locked(old, 0); + write_unlock(&detail->hash_lock); + cache_fresh_unlocked(tmp, detail); + cache_fresh_unlocked(old, detail); + cache_put(old, detail); + return tmp; +} +EXPORT_SYMBOL_GPL(sunrpc_cache_update); + +static int cache_make_upcall(struct cache_detail *cd, struct cache_head *h) +{ + if (!cd->cache_upcall) + return -EINVAL; + return cd->cache_upcall(cd, h); +} + +static inline int cache_is_valid(struct cache_detail *detail, struct cache_head *h) +{ + if (!test_bit(CACHE_VALID, &h->flags)) + return -EAGAIN; + else { + /* entry is valid */ + if (test_bit(CACHE_NEGATIVE, &h->flags)) + return -ENOENT; + else { + /* + * In combination with write barrier in + * sunrpc_cache_update, ensures that anyone + * using the cache entry after this sees the + * updated contents: + */ + smp_rmb(); + return 0; + } + } +} + +static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h) +{ + int rv; + + write_lock(&detail->hash_lock); + rv = cache_is_valid(detail, h); + if (rv != -EAGAIN) { + write_unlock(&detail->hash_lock); + 
return rv; + } + set_bit(CACHE_NEGATIVE, &h->flags); + cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY); + write_unlock(&detail->hash_lock); + cache_fresh_unlocked(h, detail); + return -ENOENT; +} + +/* + * This is the generic cache management routine for all + * the authentication caches. + * It checks the currency of a cache item and will (later) + * initiate an upcall to fill it if needed. + * + * + * Returns 0 if the cache_head can be used, or cache_puts it and returns + * -EAGAIN if upcall is pending and request has been queued + * -ETIMEDOUT if upcall failed or request could not be queue or + * upcall completed but item is still invalid (implying that + * the cache item has been replaced with a newer one). + * -ENOENT if cache entry was negative + */ +int cache_check(struct cache_detail *detail, + struct cache_head *h, struct cache_req *rqstp) +{ + int rv; + long refresh_age, age; + + /* First decide return status as best we can */ + rv = cache_is_valid(detail, h); + + /* now see if we want to start an upcall */ + refresh_age = (h->expiry_time - h->last_refresh); + age = seconds_since_boot() - h->last_refresh; + + if (rqstp == NULL) { + if (rv == -EAGAIN) + rv = -ENOENT; + } else if (rv == -EAGAIN || age > refresh_age/2) { + dprintk("RPC: Want update, refage=%ld, age=%ld\n", + refresh_age, age); + if (!test_and_set_bit(CACHE_PENDING, &h->flags)) { + switch (cache_make_upcall(detail, h)) { + case -EINVAL: + clear_bit(CACHE_PENDING, &h->flags); + cache_revisit_request(h); + rv = try_to_negate_entry(detail, h); + break; + case -EAGAIN: + clear_bit(CACHE_PENDING, &h->flags); + cache_revisit_request(h); + break; + } + } + } + + if (rv == -EAGAIN) { + if (!cache_defer_req(rqstp, h)) { + /* + * Request was not deferred; handle it as best + * we can ourselves: + */ + rv = cache_is_valid(detail, h); + if (rv == -EAGAIN) + rv = -ETIMEDOUT; + } + } + if (rv) + cache_put(h, detail); + return rv; +} +EXPORT_SYMBOL_GPL(cache_check); + +/* + * caches need to be periodically cleaned. + * For this we maintain a list of cache_detail and + * a current pointer into that list and into the table + * for that entry. + * + * Each time clean_cache is called it finds the next non-empty entry + * in the current table and walks the list in that entry + * looking for entries that can be removed. + * + * An entry gets removed if: + * - The expiry is before current time + * - The last_refresh time is before the flush_time for that cache + * + * later we might drop old entries with non-NEVER expiry if that table + * is getting 'full' for some definition of 'full' + * + * The question of "how often to scan a table" is an interesting one + * and is answered in part by the use of the "nextcheck" field in the + * cache_detail. + * When a scan of a table begins, the nextcheck field is set to a time + * that is well into the future. + * While scanning, if an expiry time is found that is earlier than the + * current nextcheck time, nextcheck is set to that expiry time. + * If the flush_time is ever set to a time earlier than the nextcheck + * time, the nextcheck time is then set to that flush_time. + * + * A table is then only scanned if the current time is at least + * the nextcheck time. 
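/*
 * Stand-alone restatement of the refresh heuristic cache_check() above
 * applies: an upcall is wanted not only when the entry is invalid, but
 * also once more than half of its refresh interval has elapsed, so hot
 * entries are renewed before they actually expire.  Illustrative helper,
 * not kernel code.
 */
static int want_upcall(long expiry_time, long last_refresh, long now, int valid)
{
	long refresh_age = expiry_time - last_refresh;
	long age = now - last_refresh;

	return !valid || age > refresh_age / 2;
}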
+ * + */ + +static LIST_HEAD(cache_list); +static DEFINE_SPINLOCK(cache_list_lock); +static struct cache_detail *current_detail; +static int current_index; + +static void do_cache_clean(struct work_struct *work); +static struct delayed_work cache_cleaner; + +static void sunrpc_init_cache_detail(struct cache_detail *cd) +{ + rwlock_init(&cd->hash_lock); + INIT_LIST_HEAD(&cd->queue); + spin_lock(&cache_list_lock); + cd->nextcheck = 0; + cd->entries = 0; + atomic_set(&cd->readers, 0); + cd->last_close = 0; + cd->last_warn = -1; + list_add(&cd->others, &cache_list); + spin_unlock(&cache_list_lock); + + /* start the cleaning process */ + schedule_delayed_work(&cache_cleaner, 0); +} + +static void sunrpc_destroy_cache_detail(struct cache_detail *cd) +{ + cache_purge(cd); + spin_lock(&cache_list_lock); + write_lock(&cd->hash_lock); + if (cd->entries || atomic_read(&cd->inuse)) { + write_unlock(&cd->hash_lock); + spin_unlock(&cache_list_lock); + goto out; + } + if (current_detail == cd) + current_detail = NULL; + list_del_init(&cd->others); + write_unlock(&cd->hash_lock); + spin_unlock(&cache_list_lock); + if (list_empty(&cache_list)) { + /* module must be being unloaded so its safe to kill the worker */ + cancel_delayed_work_sync(&cache_cleaner); + } + return; +out: + printk(KERN_ERR "nfsd: failed to unregister %s cache\n", cd->name); +} + +/* clean cache tries to find something to clean + * and cleans it. + * It returns 1 if it cleaned something, + * 0 if it didn't find anything this time + * -1 if it fell off the end of the list. + */ +static int cache_clean(void) +{ + int rv = 0; + struct list_head *next; + + spin_lock(&cache_list_lock); + + /* find a suitable table if we don't already have one */ + while (current_detail == NULL || + current_index >= current_detail->hash_size) { + if (current_detail) + next = current_detail->others.next; + else + next = cache_list.next; + if (next == &cache_list) { + current_detail = NULL; + spin_unlock(&cache_list_lock); + return -1; + } + current_detail = list_entry(next, struct cache_detail, others); + if (current_detail->nextcheck > seconds_since_boot()) + current_index = current_detail->hash_size; + else { + current_index = 0; + current_detail->nextcheck = seconds_since_boot()+30*60; + } + } + + /* find a non-empty bucket in the table */ + while (current_detail && + current_index < current_detail->hash_size && + current_detail->hash_table[current_index] == NULL) + current_index++; + + /* find a cleanable entry in the bucket and clean it, or set to next bucket */ + + if (current_detail && current_index < current_detail->hash_size) { + struct cache_head *ch, **cp; + struct cache_detail *d; + + write_lock(¤t_detail->hash_lock); + + /* Ok, now to clean this strand */ + + cp = & current_detail->hash_table[current_index]; + for (ch = *cp ; ch ; cp = & ch->next, ch = *cp) { + if (current_detail->nextcheck > ch->expiry_time) + current_detail->nextcheck = ch->expiry_time+1; + if (!cache_is_expired(current_detail, ch)) + continue; + + *cp = ch->next; + ch->next = NULL; + current_detail->entries--; + rv = 1; + break; + } + + write_unlock(¤t_detail->hash_lock); + d = current_detail; + if (!ch) + current_index ++; + spin_unlock(&cache_list_lock); + if (ch) { + if (test_and_clear_bit(CACHE_PENDING, &ch->flags)) + cache_dequeue(current_detail, ch); + cache_revisit_request(ch); + cache_put(ch, d); + } + } else + spin_unlock(&cache_list_lock); + + return rv; +} + +/* + * We want to regularly clean the cache, so we need to schedule some work ... 
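/*
 * Sketch of the rescheduling policy of the cleaner work item defined just
 * below: come back quickly while single entries are being cleaned, back
 * off to roughly 30 seconds after a full pass over all tables, and stop
 * rescheduling once no cache is registered at all.  Units and the
 * TICKS_PER_SEC stand-in (HZ in the original) are illustrative.
 */
#define TICKS_PER_SEC 100	/* stand-in for HZ */

static long next_clean_delay(int clean_rv, int any_caches_registered)
{
	if (!any_caches_registered)
		return 0;			/* don't reschedule at all */
	if (clean_rv == -1)
		return 30 * TICKS_PER_SEC;	/* full pass done, back off */
	return 5;				/* keep chipping at the backlog */
}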
+ */ +static void do_cache_clean(struct work_struct *work) +{ + int delay = 5; + if (cache_clean() == -1) + delay = round_jiffies_relative(30*HZ); + + if (list_empty(&cache_list)) + delay = 0; + + if (delay) + schedule_delayed_work(&cache_cleaner, delay); +} + + +/* + * Clean all caches promptly. This just calls cache_clean + * repeatedly until we are sure that every cache has had a chance to + * be fully cleaned + */ +void cache_flush(void) +{ + while (cache_clean() != -1) + cond_resched(); + while (cache_clean() != -1) + cond_resched(); +} +EXPORT_SYMBOL_GPL(cache_flush); + +void cache_purge(struct cache_detail *detail) +{ + detail->flush_time = LONG_MAX; + detail->nextcheck = seconds_since_boot(); + cache_flush(); + detail->flush_time = 1; +} +EXPORT_SYMBOL_GPL(cache_purge); + + +/* + * Deferral and Revisiting of Requests. + * + * If a cache lookup finds a pending entry, we + * need to defer the request and revisit it later. + * All deferred requests are stored in a hash table, + * indexed by "struct cache_head *". + * As it may be wasteful to store a whole request + * structure, we allow the request to provide a + * deferred form, which must contain a + * 'struct cache_deferred_req' + * This cache_deferred_req contains a method to allow + * it to be revisited when cache info is available + */ + +#define DFR_HASHSIZE (PAGE_SIZE/sizeof(struct list_head)) +#define DFR_HASH(item) ((((long)item)>>4 ^ (((long)item)>>13)) % DFR_HASHSIZE) + +#define DFR_MAX 300 /* ??? */ + +static DEFINE_SPINLOCK(cache_defer_lock); +static LIST_HEAD(cache_defer_list); +static struct hlist_head cache_defer_hash[DFR_HASHSIZE]; +static int cache_defer_cnt; + +static void __unhash_deferred_req(struct cache_deferred_req *dreq) +{ + hlist_del_init(&dreq->hash); + if (!list_empty(&dreq->recent)) { + list_del_init(&dreq->recent); + cache_defer_cnt--; + } +} + +static void __hash_deferred_req(struct cache_deferred_req *dreq, struct cache_head *item) +{ + int hash = DFR_HASH(item); + + INIT_LIST_HEAD(&dreq->recent); + hlist_add_head(&dreq->hash, &cache_defer_hash[hash]); +} + +static void setup_deferral(struct cache_deferred_req *dreq, + struct cache_head *item, + int count_me) +{ + + dreq->item = item; + + spin_lock(&cache_defer_lock); + + __hash_deferred_req(dreq, item); + + if (count_me) { + cache_defer_cnt++; + list_add(&dreq->recent, &cache_defer_list); + } + + spin_unlock(&cache_defer_lock); + +} + +struct thread_deferred_req { + struct cache_deferred_req handle; + struct completion completion; +}; + +static void cache_restart_thread(struct cache_deferred_req *dreq, int too_many) +{ + struct thread_deferred_req *dr = + container_of(dreq, struct thread_deferred_req, handle); + complete(&dr->completion); +} + +static void cache_wait_req(struct cache_req *req, struct cache_head *item) +{ + struct thread_deferred_req sleeper; + struct cache_deferred_req *dreq = &sleeper.handle; + + sleeper.completion = COMPLETION_INITIALIZER_ONSTACK(sleeper.completion); + dreq->revisit = cache_restart_thread; + + setup_deferral(dreq, item, 0); + + if (!test_bit(CACHE_PENDING, &item->flags) || + wait_for_completion_interruptible_timeout( + &sleeper.completion, req->thread_wait) <= 0) { + /* The completion wasn't completed, so we need + * to clean up + */ + spin_lock(&cache_defer_lock); + if (!hlist_unhashed(&sleeper.handle.hash)) { + __unhash_deferred_req(&sleeper.handle); + spin_unlock(&cache_defer_lock); + } else { + /* cache_revisit_request already removed + * this from the hash table, but hasn't + * called ->revisit yet. 
It will very soon + * and we need to wait for it. + */ + spin_unlock(&cache_defer_lock); + wait_for_completion(&sleeper.completion); + } + } +} + +static void cache_limit_defers(void) +{ + /* Make sure we haven't exceed the limit of allowed deferred + * requests. + */ + struct cache_deferred_req *discard = NULL; + + if (cache_defer_cnt <= DFR_MAX) + return; + + spin_lock(&cache_defer_lock); + + /* Consider removing either the first or the last */ + if (cache_defer_cnt > DFR_MAX) { + if (net_random() & 1) + discard = list_entry(cache_defer_list.next, + struct cache_deferred_req, recent); + else + discard = list_entry(cache_defer_list.prev, + struct cache_deferred_req, recent); + __unhash_deferred_req(discard); + } + spin_unlock(&cache_defer_lock); + if (discard) + discard->revisit(discard, 1); +} + +/* Return true if and only if a deferred request is queued. */ +static bool cache_defer_req(struct cache_req *req, struct cache_head *item) +{ + struct cache_deferred_req *dreq; + + if (req->thread_wait) { + cache_wait_req(req, item); + if (!test_bit(CACHE_PENDING, &item->flags)) + return false; + } + dreq = req->defer(req); + if (dreq == NULL) + return false; + setup_deferral(dreq, item, 1); + if (!test_bit(CACHE_PENDING, &item->flags)) + /* Bit could have been cleared before we managed to + * set up the deferral, so need to revisit just in case + */ + cache_revisit_request(item); + + cache_limit_defers(); + return true; +} + +static void cache_revisit_request(struct cache_head *item) +{ + struct cache_deferred_req *dreq; + struct list_head pending; + struct hlist_node *lp, *tmp; + int hash = DFR_HASH(item); + + INIT_LIST_HEAD(&pending); + spin_lock(&cache_defer_lock); + + hlist_for_each_entry_safe(dreq, lp, tmp, &cache_defer_hash[hash], hash) + if (dreq->item == item) { + __unhash_deferred_req(dreq); + list_add(&dreq->recent, &pending); + } + + spin_unlock(&cache_defer_lock); + + while (!list_empty(&pending)) { + dreq = list_entry(pending.next, struct cache_deferred_req, recent); + list_del_init(&dreq->recent); + dreq->revisit(dreq, 0); + } +} + +void cache_clean_deferred(void *owner) +{ + struct cache_deferred_req *dreq, *tmp; + struct list_head pending; + + + INIT_LIST_HEAD(&pending); + spin_lock(&cache_defer_lock); + + list_for_each_entry_safe(dreq, tmp, &cache_defer_list, recent) { + if (dreq->owner == owner) { + __unhash_deferred_req(dreq); + list_add(&dreq->recent, &pending); + } + } + spin_unlock(&cache_defer_lock); + + while (!list_empty(&pending)) { + dreq = list_entry(pending.next, struct cache_deferred_req, recent); + list_del_init(&dreq->recent); + dreq->revisit(dreq, 1); + } +} + +/* + * communicate with user-space + * + * We have a magic /proc file - /proc/sunrpc//channel. + * On read, you get a full request, or block. + * On write, an update request is processed. + * Poll works if anything to read, and always allows write. + * + * Implemented by linked list of requests. Each open file has + * a ->private that also exists in this list. New requests are added + * to the end and may wakeup and preceding readers. + * New readers are added to the head. If, on read, an item is found with + * CACHE_UPCALLING clear, we free it from the list. 
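/*
 * Sketch of how a user-space daemon is expected to drive the channel file
 * described above: read one whole request (blocking until one is queued),
 * resolve it, then write back one whole reply record.  The path and the
 * reply text are cache-specific; the strings below are placeholders, not
 * any real cache's format.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static void serve_channel(const char *path)
{
	char req[8192];
	ssize_t n;
	int fd = open(path, O_RDWR);

	if (fd < 0)
		return;
	for (;;) {
		n = read(fd, req, sizeof(req) - 1);	/* one full request per read */
		if (n <= 0)
			break;
		req[n] = '\0';
		/* ... parse the request, look up the answer ... */
		const char *reply = "key 2147483647 value\n";	/* placeholder reply */
		if (write(fd, reply, strlen(reply)) < 0)	/* one write per reply */
			break;
	}
	close(fd);
}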
+ * + */ + +static DEFINE_SPINLOCK(queue_lock); +static DEFINE_MUTEX(queue_io_mutex); + +struct cache_queue { + struct list_head list; + int reader; /* if 0, then request */ +}; +struct cache_request { + struct cache_queue q; + struct cache_head *item; + char * buf; + int len; + int readers; +}; +struct cache_reader { + struct cache_queue q; + int offset; /* if non-0, we have a refcnt on next request */ +}; + +static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, + loff_t *ppos, struct cache_detail *cd) +{ + struct cache_reader *rp = filp->private_data; + struct cache_request *rq; + struct inode *inode = filp->f_path.dentry->d_inode; + int err; + + if (count == 0) + return 0; + + mutex_lock(&inode->i_mutex); /* protect against multiple concurrent + * readers on this file */ + again: + spin_lock(&queue_lock); + /* need to find next request */ + while (rp->q.list.next != &cd->queue && + list_entry(rp->q.list.next, struct cache_queue, list) + ->reader) { + struct list_head *next = rp->q.list.next; + list_move(&rp->q.list, next); + } + if (rp->q.list.next == &cd->queue) { + spin_unlock(&queue_lock); + mutex_unlock(&inode->i_mutex); + BUG_ON(rp->offset); + return 0; + } + rq = container_of(rp->q.list.next, struct cache_request, q.list); + BUG_ON(rq->q.reader); + if (rp->offset == 0) + rq->readers++; + spin_unlock(&queue_lock); + + if (rp->offset == 0 && !test_bit(CACHE_PENDING, &rq->item->flags)) { + err = -EAGAIN; + spin_lock(&queue_lock); + list_move(&rp->q.list, &rq->q.list); + spin_unlock(&queue_lock); + } else { + if (rp->offset + count > rq->len) + count = rq->len - rp->offset; + err = -EFAULT; + if (copy_to_user(buf, rq->buf + rp->offset, count)) + goto out; + rp->offset += count; + if (rp->offset >= rq->len) { + rp->offset = 0; + spin_lock(&queue_lock); + list_move(&rp->q.list, &rq->q.list); + spin_unlock(&queue_lock); + } + err = 0; + } + out: + if (rp->offset == 0) { + /* need to release rq */ + spin_lock(&queue_lock); + rq->readers--; + if (rq->readers == 0 && + !test_bit(CACHE_PENDING, &rq->item->flags)) { + list_del(&rq->q.list); + spin_unlock(&queue_lock); + cache_put(rq->item, cd); + kfree(rq->buf); + kfree(rq); + } else + spin_unlock(&queue_lock); + } + if (err == -EAGAIN) + goto again; + mutex_unlock(&inode->i_mutex); + return err ? 
err : count; +} + +static ssize_t cache_do_downcall(char *kaddr, const char __user *buf, + size_t count, struct cache_detail *cd) +{ + ssize_t ret; + + if (count == 0) + return -EINVAL; + if (copy_from_user(kaddr, buf, count)) + return -EFAULT; + kaddr[count] = '\0'; + ret = cd->cache_parse(cd, kaddr, count); + if (!ret) + ret = count; + return ret; +} + +static ssize_t cache_slow_downcall(const char __user *buf, + size_t count, struct cache_detail *cd) +{ + static char write_buf[8192]; /* protected by queue_io_mutex */ + ssize_t ret = -EINVAL; + + if (count >= sizeof(write_buf)) + goto out; + mutex_lock(&queue_io_mutex); + ret = cache_do_downcall(write_buf, buf, count, cd); + mutex_unlock(&queue_io_mutex); +out: + return ret; +} + +static ssize_t cache_downcall(struct address_space *mapping, + const char __user *buf, + size_t count, struct cache_detail *cd) +{ + struct page *page; + char *kaddr; + ssize_t ret = -ENOMEM; + + if (count >= PAGE_CACHE_SIZE) + goto out_slow; + + page = find_or_create_page(mapping, 0, GFP_KERNEL); + if (!page) + goto out_slow; + + kaddr = kmap(page); + ret = cache_do_downcall(kaddr, buf, count, cd); + kunmap(page); + unlock_page(page); + page_cache_release(page); + return ret; +out_slow: + return cache_slow_downcall(buf, count, cd); +} + +static ssize_t cache_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos, + struct cache_detail *cd) +{ + struct address_space *mapping = filp->f_mapping; + struct inode *inode = filp->f_path.dentry->d_inode; + ssize_t ret = -EINVAL; + + if (!cd->cache_parse) + goto out; + + mutex_lock(&inode->i_mutex); + ret = cache_downcall(mapping, buf, count, cd); + mutex_unlock(&inode->i_mutex); +out: + return ret; +} + +static DECLARE_WAIT_QUEUE_HEAD(queue_wait); + +static unsigned int cache_poll(struct file *filp, poll_table *wait, + struct cache_detail *cd) +{ + unsigned int mask; + struct cache_reader *rp = filp->private_data; + struct cache_queue *cq; + + poll_wait(filp, &queue_wait, wait); + + /* alway allow write */ + mask = POLL_OUT | POLLWRNORM; + + if (!rp) + return mask; + + spin_lock(&queue_lock); + + for (cq= &rp->q; &cq->list != &cd->queue; + cq = list_entry(cq->list.next, struct cache_queue, list)) + if (!cq->reader) { + mask |= POLLIN | POLLRDNORM; + break; + } + spin_unlock(&queue_lock); + return mask; +} + +static int cache_ioctl(struct inode *ino, struct file *filp, + unsigned int cmd, unsigned long arg, + struct cache_detail *cd) +{ + int len = 0; + struct cache_reader *rp = filp->private_data; + struct cache_queue *cq; + + if (cmd != FIONREAD || !rp) + return -EINVAL; + + spin_lock(&queue_lock); + + /* only find the length remaining in current request, + * or the length of the next request + */ + for (cq= &rp->q; &cq->list != &cd->queue; + cq = list_entry(cq->list.next, struct cache_queue, list)) + if (!cq->reader) { + struct cache_request *cr = + container_of(cq, struct cache_request, q); + len = cr->len - rp->offset; + break; + } + spin_unlock(&queue_lock); + + return put_user(len, (int __user *)arg); +} + +static int cache_open(struct inode *inode, struct file *filp, + struct cache_detail *cd) +{ + struct cache_reader *rp = NULL; + + if (!cd || !try_module_get(cd->owner)) + return -EACCES; + nonseekable_open(inode, filp); + if (filp->f_mode & FMODE_READ) { + rp = kmalloc(sizeof(*rp), GFP_KERNEL); + if (!rp) + return -ENOMEM; + rp->offset = 0; + rp->q.reader = 1; + atomic_inc(&cd->readers); + spin_lock(&queue_lock); + list_add(&rp->q.list, &cd->queue); + spin_unlock(&queue_lock); + } + 
filp->private_data = rp; + return 0; +} + +static int cache_release(struct inode *inode, struct file *filp, + struct cache_detail *cd) +{ + struct cache_reader *rp = filp->private_data; + + if (rp) { + spin_lock(&queue_lock); + if (rp->offset) { + struct cache_queue *cq; + for (cq= &rp->q; &cq->list != &cd->queue; + cq = list_entry(cq->list.next, struct cache_queue, list)) + if (!cq->reader) { + container_of(cq, struct cache_request, q) + ->readers--; + break; + } + rp->offset = 0; + } + list_del(&rp->q.list); + spin_unlock(&queue_lock); + + filp->private_data = NULL; + kfree(rp); + + cd->last_close = seconds_since_boot(); + atomic_dec(&cd->readers); + } + module_put(cd->owner); + return 0; +} + + + +static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch) +{ + struct cache_queue *cq; + spin_lock(&queue_lock); + list_for_each_entry(cq, &detail->queue, list) + if (!cq->reader) { + struct cache_request *cr = container_of(cq, struct cache_request, q); + if (cr->item != ch) + continue; + if (cr->readers != 0) + continue; + list_del(&cr->q.list); + spin_unlock(&queue_lock); + cache_put(cr->item, detail); + kfree(cr->buf); + kfree(cr); + return; + } + spin_unlock(&queue_lock); +} + +/* + * Support routines for text-based upcalls. + * Fields are separated by spaces. + * Fields are either mangled to quote space tab newline slosh with slosh + * or a hexified with a leading \x + * Record is terminated with newline. + * + */ + +void qword_add(char **bpp, int *lp, char *str) +{ + char *bp = *bpp; + int len = *lp; + char c; + + if (len < 0) return; + + while ((c=*str++) && len) + switch(c) { + case ' ': + case '\t': + case '\n': + case '\\': + if (len >= 4) { + *bp++ = '\\'; + *bp++ = '0' + ((c & 0300)>>6); + *bp++ = '0' + ((c & 0070)>>3); + *bp++ = '0' + ((c & 0007)>>0); + } + len -= 4; + break; + default: + *bp++ = c; + len--; + } + if (c || len <1) len = -1; + else { + *bp++ = ' '; + len--; + } + *bpp = bp; + *lp = len; +} +EXPORT_SYMBOL_GPL(qword_add); + +void qword_addhex(char **bpp, int *lp, char *buf, int blen) +{ + char *bp = *bpp; + int len = *lp; + + if (len < 0) return; + + if (len > 2) { + *bp++ = '\\'; + *bp++ = 'x'; + len -= 2; + while (blen && len >= 2) { + unsigned char c = *buf++; + *bp++ = '0' + ((c&0xf0)>>4) + (c>=0xa0)*('a'-'9'-1); + *bp++ = '0' + (c&0x0f) + ((c&0x0f)>=0x0a)*('a'-'9'-1); + len -= 2; + blen--; + } + } + if (blen || len<1) len = -1; + else { + *bp++ = ' '; + len--; + } + *bpp = bp; + *lp = len; +} +EXPORT_SYMBOL_GPL(qword_addhex); + +static void warn_no_listener(struct cache_detail *detail) +{ + if (detail->last_warn != detail->last_close) { + detail->last_warn = detail->last_close; + if (detail->warn_no_listener) + detail->warn_no_listener(detail, detail->last_close != 0); + } +} + +static bool cache_listeners_exist(struct cache_detail *detail) +{ + if (atomic_read(&detail->readers)) + return true; + if (detail->last_close == 0) + /* This cache was never opened */ + return false; + if (detail->last_close < seconds_since_boot() - 30) + /* + * We allow for the possibility that someone might + * restart a userspace daemon without restarting the + * server; but after 30 seconds, we give up. + */ + return false; + return true; +} + +/* + * register an upcall request to user-space and queue it up for read() by the + * upcall daemon. + * + * Each request is at most one page long. 
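/*
 * User-space mirror of the quoting rule qword_add() above implements:
 * space, tab, newline and backslash become a backslash followed by three
 * octal digits, everything else is copied through, and the field is closed
 * with a single space.  Length bookkeeping is omitted here; only the
 * escaping rule is the point.
 */
static char *quote_field(char *dst, const char *src)
{
	char c;

	while ((c = *src++) != '\0') {
		if (c == ' ' || c == '\t' || c == '\n' || c == '\\') {
			*dst++ = '\\';
			*dst++ = '0' + ((c & 0300) >> 6);
			*dst++ = '0' + ((c & 0070) >> 3);
			*dst++ = '0' + ((c & 0007) >> 0);
		} else {
			*dst++ = c;
		}
	}
	*dst++ = ' ';		/* field separator */
	return dst;
}
/* e.g. the field "a b" is emitted as "a\040b " */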
+ */ +int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h, + void (*cache_request)(struct cache_detail *, + struct cache_head *, + char **, + int *)) +{ + + char *buf; + struct cache_request *crq; + char *bp; + int len; + + if (!cache_listeners_exist(detail)) { + warn_no_listener(detail); + return -EINVAL; + } + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + return -EAGAIN; + + crq = kmalloc(sizeof (*crq), GFP_KERNEL); + if (!crq) { + kfree(buf); + return -EAGAIN; + } + + bp = buf; len = PAGE_SIZE; + + cache_request(detail, h, &bp, &len); + + if (len < 0) { + kfree(buf); + kfree(crq); + return -EAGAIN; + } + crq->q.reader = 0; + crq->item = cache_get(h); + crq->buf = buf; + crq->len = PAGE_SIZE - len; + crq->readers = 0; + spin_lock(&queue_lock); + list_add_tail(&crq->q.list, &detail->queue); + spin_unlock(&queue_lock); + wake_up(&queue_wait); + return 0; +} +EXPORT_SYMBOL_GPL(sunrpc_cache_pipe_upcall); + +/* + * parse a message from user-space and pass it + * to an appropriate cache + * Messages are, like requests, separated into fields by + * spaces and dequotes as \xHEXSTRING or embedded \nnn octal + * + * Message is + * reply cachename expiry key ... content.... + * + * key and content are both parsed by cache + */ + +#define isodigit(c) (isdigit(c) && c <= '7') +int qword_get(char **bpp, char *dest, int bufsize) +{ + /* return bytes copied, or -1 on error */ + char *bp = *bpp; + int len = 0; + + while (*bp == ' ') bp++; + + if (bp[0] == '\\' && bp[1] == 'x') { + /* HEX STRING */ + bp += 2; + while (len < bufsize) { + int h, l; + + h = hex_to_bin(bp[0]); + if (h < 0) + break; + + l = hex_to_bin(bp[1]); + if (l < 0) + break; + + *dest++ = (h << 4) | l; + bp += 2; + len++; + } + } else { + /* text with \nnn octal quoting */ + while (*bp != ' ' && *bp != '\n' && *bp && len < bufsize-1) { + if (*bp == '\\' && + isodigit(bp[1]) && (bp[1] <= '3') && + isodigit(bp[2]) && + isodigit(bp[3])) { + int byte = (*++bp -'0'); + bp++; + byte = (byte << 3) | (*bp++ - '0'); + byte = (byte << 3) | (*bp++ - '0'); + *dest++ = byte; + len++; + } else { + *dest++ = *bp++; + len++; + } + } + } + + if (*bp != ' ' && *bp != '\n' && *bp != '\0') + return -1; + while (*bp == ' ') bp++; + *bpp = bp; + *dest = '\0'; + return len; +} +EXPORT_SYMBOL_GPL(qword_get); + + +/* + * support /proc/sunrpc/cache/$CACHENAME/content + * as a seqfile. 
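/*
 * The seq_file iterator that follows packs its position into one 64-bit
 * offset: the hash bucket in the upper 32 bits and the entry index within
 * that bucket in the lower 32 bits.  (The kernel code additionally reserves
 * position zero for the header token and splits *pos - 1.)  A stand-alone
 * restatement of the packing, for illustration only:
 */
#include <stdint.h>

static uint64_t content_pos(uint32_t hash, uint32_t entry)
{
	return ((uint64_t)hash << 32) | entry;
}

static void content_pos_split(uint64_t pos, uint32_t *hash, uint32_t *entry)
{
	*hash = (uint32_t)(pos >> 32);
	*entry = (uint32_t)(pos & 0xffffffffULL);
}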
+ * We call ->cache_show passing NULL for the item to + * get a header, then pass each real item in the cache + */ + +struct handle { + struct cache_detail *cd; +}; + +static void *c_start(struct seq_file *m, loff_t *pos) + __acquires(cd->hash_lock) +{ + loff_t n = *pos; + unsigned hash, entry; + struct cache_head *ch; + struct cache_detail *cd = ((struct handle*)m->private)->cd; + + + read_lock(&cd->hash_lock); + if (!n--) + return SEQ_START_TOKEN; + hash = n >> 32; + entry = n & ((1LL<<32) - 1); + + for (ch=cd->hash_table[hash]; ch; ch=ch->next) + if (!entry--) + return ch; + n &= ~((1LL<<32) - 1); + do { + hash++; + n += 1LL<<32; + } while(hash < cd->hash_size && + cd->hash_table[hash]==NULL); + if (hash >= cd->hash_size) + return NULL; + *pos = n+1; + return cd->hash_table[hash]; +} + +static void *c_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct cache_head *ch = p; + int hash = (*pos >> 32); + struct cache_detail *cd = ((struct handle*)m->private)->cd; + + if (p == SEQ_START_TOKEN) + hash = 0; + else if (ch->next == NULL) { + hash++; + *pos += 1LL<<32; + } else { + ++*pos; + return ch->next; + } + *pos &= ~((1LL<<32) - 1); + while (hash < cd->hash_size && + cd->hash_table[hash] == NULL) { + hash++; + *pos += 1LL<<32; + } + if (hash >= cd->hash_size) + return NULL; + ++*pos; + return cd->hash_table[hash]; +} + +static void c_stop(struct seq_file *m, void *p) + __releases(cd->hash_lock) +{ + struct cache_detail *cd = ((struct handle*)m->private)->cd; + read_unlock(&cd->hash_lock); +} + +static int c_show(struct seq_file *m, void *p) +{ + struct cache_head *cp = p; + struct cache_detail *cd = ((struct handle*)m->private)->cd; + + if (p == SEQ_START_TOKEN) + return cd->cache_show(m, cd, NULL); + + ifdebug(CACHE) + seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n", + convert_to_wallclock(cp->expiry_time), + atomic_read(&cp->ref.refcount), cp->flags); + cache_get(cp); + if (cache_check(cd, cp, NULL)) + /* cache_check does a cache_put on failure */ + seq_printf(m, "# "); + else + cache_put(cp, cd); + + return cd->cache_show(m, cd, cp); +} + +static const struct seq_operations cache_content_op = { + .start = c_start, + .next = c_next, + .stop = c_stop, + .show = c_show, +}; + +static int content_open(struct inode *inode, struct file *file, + struct cache_detail *cd) +{ + struct handle *han; + + if (!cd || !try_module_get(cd->owner)) + return -EACCES; + han = __seq_open_private(file, &cache_content_op, sizeof(*han)); + if (han == NULL) { + module_put(cd->owner); + return -ENOMEM; + } + + han->cd = cd; + return 0; +} + +static int content_release(struct inode *inode, struct file *file, + struct cache_detail *cd) +{ + int ret = seq_release_private(inode, file); + module_put(cd->owner); + return ret; +} + +static int open_flush(struct inode *inode, struct file *file, + struct cache_detail *cd) +{ + if (!cd || !try_module_get(cd->owner)) + return -EACCES; + return nonseekable_open(inode, file); +} + +static int release_flush(struct inode *inode, struct file *file, + struct cache_detail *cd) +{ + module_put(cd->owner); + return 0; +} + +static ssize_t read_flush(struct file *file, char __user *buf, + size_t count, loff_t *ppos, + struct cache_detail *cd) +{ + char tbuf[20]; + unsigned long p = *ppos; + size_t len; + + sprintf(tbuf, "%lu\n", convert_to_wallclock(cd->flush_time)); + len = strlen(tbuf); + if (p >= len) + return 0; + len -= p; + if (len > count) + len = count; + if (copy_to_user(buf, (void*)(tbuf+p), len)) + return -EFAULT; + *ppos += len; + return len; +} + +static 
ssize_t write_flush(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, + struct cache_detail *cd) +{ + char tbuf[20]; + char *bp, *ep; + + if (*ppos || count > sizeof(tbuf)-1) + return -EINVAL; + if (copy_from_user(tbuf, buf, count)) + return -EFAULT; + tbuf[count] = 0; + simple_strtoul(tbuf, &ep, 0); + if (*ep && *ep != '\n') + return -EINVAL; + + bp = tbuf; + cd->flush_time = get_expiry(&bp); + cd->nextcheck = seconds_since_boot(); + cache_flush(); + + *ppos += count; + return count; +} + +static ssize_t cache_read_procfs(struct file *filp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct cache_detail *cd = PDE(filp->f_path.dentry->d_inode)->data; + + return cache_read(filp, buf, count, ppos, cd); +} + +static ssize_t cache_write_procfs(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct cache_detail *cd = PDE(filp->f_path.dentry->d_inode)->data; + + return cache_write(filp, buf, count, ppos, cd); +} + +static unsigned int cache_poll_procfs(struct file *filp, poll_table *wait) +{ + struct cache_detail *cd = PDE(filp->f_path.dentry->d_inode)->data; + + return cache_poll(filp, wait, cd); +} + +static long cache_ioctl_procfs(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct cache_detail *cd = PDE(inode)->data; + + return cache_ioctl(inode, filp, cmd, arg, cd); +} + +static int cache_open_procfs(struct inode *inode, struct file *filp) +{ + struct cache_detail *cd = PDE(inode)->data; + + return cache_open(inode, filp, cd); +} + +static int cache_release_procfs(struct inode *inode, struct file *filp) +{ + struct cache_detail *cd = PDE(inode)->data; + + return cache_release(inode, filp, cd); +} + +static const struct file_operations cache_file_operations_procfs = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = cache_read_procfs, + .write = cache_write_procfs, + .poll = cache_poll_procfs, + .unlocked_ioctl = cache_ioctl_procfs, /* for FIONREAD */ + .open = cache_open_procfs, + .release = cache_release_procfs, +}; + +static int content_open_procfs(struct inode *inode, struct file *filp) +{ + struct cache_detail *cd = PDE(inode)->data; + + return content_open(inode, filp, cd); +} + +static int content_release_procfs(struct inode *inode, struct file *filp) +{ + struct cache_detail *cd = PDE(inode)->data; + + return content_release(inode, filp, cd); +} + +static const struct file_operations content_file_operations_procfs = { + .open = content_open_procfs, + .read = seq_read, + .llseek = seq_lseek, + .release = content_release_procfs, +}; + +static int open_flush_procfs(struct inode *inode, struct file *filp) +{ + struct cache_detail *cd = PDE(inode)->data; + + return open_flush(inode, filp, cd); +} + +static int release_flush_procfs(struct inode *inode, struct file *filp) +{ + struct cache_detail *cd = PDE(inode)->data; + + return release_flush(inode, filp, cd); +} + +static ssize_t read_flush_procfs(struct file *filp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct cache_detail *cd = PDE(filp->f_path.dentry->d_inode)->data; + + return read_flush(filp, buf, count, ppos, cd); +} + +static ssize_t write_flush_procfs(struct file *filp, + const char __user *buf, + size_t count, loff_t *ppos) +{ + struct cache_detail *cd = PDE(filp->f_path.dentry->d_inode)->data; + + return write_flush(filp, buf, count, ppos, cd); +} + +static const struct file_operations cache_flush_operations_procfs = { + .open = open_flush_procfs, + .read = 
read_flush_procfs, + .write = write_flush_procfs, + .release = release_flush_procfs, + .llseek = no_llseek, +}; + +static void remove_cache_proc_entries(struct cache_detail *cd, struct net *net) +{ + struct sunrpc_net *sn; + + if (cd->u.procfs.proc_ent == NULL) + return; + if (cd->u.procfs.flush_ent) + remove_proc_entry("flush", cd->u.procfs.proc_ent); + if (cd->u.procfs.channel_ent) + remove_proc_entry("channel", cd->u.procfs.proc_ent); + if (cd->u.procfs.content_ent) + remove_proc_entry("content", cd->u.procfs.proc_ent); + cd->u.procfs.proc_ent = NULL; + sn = net_generic(net, sunrpc_net_id); + remove_proc_entry(cd->name, sn->proc_net_rpc); +} + +#ifdef CONFIG_PROC_FS +static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) +{ + struct proc_dir_entry *p; + struct sunrpc_net *sn; + + sn = net_generic(net, sunrpc_net_id); + cd->u.procfs.proc_ent = proc_mkdir(cd->name, sn->proc_net_rpc); + if (cd->u.procfs.proc_ent == NULL) + goto out_nomem; + cd->u.procfs.channel_ent = NULL; + cd->u.procfs.content_ent = NULL; + + p = proc_create_data("flush", S_IFREG|S_IRUSR|S_IWUSR, + cd->u.procfs.proc_ent, + &cache_flush_operations_procfs, cd); + cd->u.procfs.flush_ent = p; + if (p == NULL) + goto out_nomem; + + if (cd->cache_upcall || cd->cache_parse) { + p = proc_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR, + cd->u.procfs.proc_ent, + &cache_file_operations_procfs, cd); + cd->u.procfs.channel_ent = p; + if (p == NULL) + goto out_nomem; + } + if (cd->cache_show) { + p = proc_create_data("content", S_IFREG|S_IRUSR|S_IWUSR, + cd->u.procfs.proc_ent, + &content_file_operations_procfs, cd); + cd->u.procfs.content_ent = p; + if (p == NULL) + goto out_nomem; + } + return 0; +out_nomem: + remove_cache_proc_entries(cd, net); + return -ENOMEM; +} +#else /* CONFIG_PROC_FS */ +static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) +{ + return 0; +} +#endif + +void __init cache_initialize(void) +{ + INIT_DELAYED_WORK_DEFERRABLE(&cache_cleaner, do_cache_clean); +} + +int cache_register_net(struct cache_detail *cd, struct net *net) +{ + int ret; + + sunrpc_init_cache_detail(cd); + ret = create_cache_proc_entries(cd, net); + if (ret) + sunrpc_destroy_cache_detail(cd); + return ret; +} + +int cache_register(struct cache_detail *cd) +{ + return cache_register_net(cd, &init_net); +} +EXPORT_SYMBOL_GPL(cache_register); + +void cache_unregister_net(struct cache_detail *cd, struct net *net) +{ + remove_cache_proc_entries(cd, net); + sunrpc_destroy_cache_detail(cd); +} + +void cache_unregister(struct cache_detail *cd) +{ + cache_unregister_net(cd, &init_net); +} +EXPORT_SYMBOL_GPL(cache_unregister); + +static ssize_t cache_read_pipefs(struct file *filp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct cache_detail *cd = RPC_I(filp->f_path.dentry->d_inode)->private; + + return cache_read(filp, buf, count, ppos, cd); +} + +static ssize_t cache_write_pipefs(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct cache_detail *cd = RPC_I(filp->f_path.dentry->d_inode)->private; + + return cache_write(filp, buf, count, ppos, cd); +} + +static unsigned int cache_poll_pipefs(struct file *filp, poll_table *wait) +{ + struct cache_detail *cd = RPC_I(filp->f_path.dentry->d_inode)->private; + + return cache_poll(filp, wait, cd); +} + +static long cache_ioctl_pipefs(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct cache_detail *cd = RPC_I(inode)->private; + + return 
cache_ioctl(inode, filp, cmd, arg, cd); +} + +static int cache_open_pipefs(struct inode *inode, struct file *filp) +{ + struct cache_detail *cd = RPC_I(inode)->private; + + return cache_open(inode, filp, cd); +} + +static int cache_release_pipefs(struct inode *inode, struct file *filp) +{ + struct cache_detail *cd = RPC_I(inode)->private; + + return cache_release(inode, filp, cd); +} + +const struct file_operations cache_file_operations_pipefs = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = cache_read_pipefs, + .write = cache_write_pipefs, + .poll = cache_poll_pipefs, + .unlocked_ioctl = cache_ioctl_pipefs, /* for FIONREAD */ + .open = cache_open_pipefs, + .release = cache_release_pipefs, +}; + +static int content_open_pipefs(struct inode *inode, struct file *filp) +{ + struct cache_detail *cd = RPC_I(inode)->private; + + return content_open(inode, filp, cd); +} + +static int content_release_pipefs(struct inode *inode, struct file *filp) +{ + struct cache_detail *cd = RPC_I(inode)->private; + + return content_release(inode, filp, cd); +} + +const struct file_operations content_file_operations_pipefs = { + .open = content_open_pipefs, + .read = seq_read, + .llseek = seq_lseek, + .release = content_release_pipefs, +}; + +static int open_flush_pipefs(struct inode *inode, struct file *filp) +{ + struct cache_detail *cd = RPC_I(inode)->private; + + return open_flush(inode, filp, cd); +} + +static int release_flush_pipefs(struct inode *inode, struct file *filp) +{ + struct cache_detail *cd = RPC_I(inode)->private; + + return release_flush(inode, filp, cd); +} + +static ssize_t read_flush_pipefs(struct file *filp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct cache_detail *cd = RPC_I(filp->f_path.dentry->d_inode)->private; + + return read_flush(filp, buf, count, ppos, cd); +} + +static ssize_t write_flush_pipefs(struct file *filp, + const char __user *buf, + size_t count, loff_t *ppos) +{ + struct cache_detail *cd = RPC_I(filp->f_path.dentry->d_inode)->private; + + return write_flush(filp, buf, count, ppos, cd); +} + +const struct file_operations cache_flush_operations_pipefs = { + .open = open_flush_pipefs, + .read = read_flush_pipefs, + .write = write_flush_pipefs, + .release = release_flush_pipefs, + .llseek = no_llseek, +}; + +int sunrpc_cache_register_pipefs(struct dentry *parent, + const char *name, mode_t umode, + struct cache_detail *cd) +{ + struct qstr q; + struct dentry *dir; + int ret = 0; + + sunrpc_init_cache_detail(cd); + q.name = name; + q.len = strlen(name); + q.hash = full_name_hash(q.name, q.len); + dir = rpc_create_cache_dir(parent, &q, umode, cd); + if (!IS_ERR(dir)) + cd->u.pipefs.dir = dir; + else { + sunrpc_destroy_cache_detail(cd); + ret = PTR_ERR(dir); + } + return ret; +} +EXPORT_SYMBOL_GPL(sunrpc_cache_register_pipefs); + +void sunrpc_cache_unregister_pipefs(struct cache_detail *cd) +{ + rpc_remove_cache_dir(cd->u.pipefs.dir); + cd->u.pipefs.dir = NULL; + sunrpc_destroy_cache_detail(cd); +} +EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs); + diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c new file mode 100644 index 00000000..8c914158 --- /dev/null +++ b/net/sunrpc/clnt.c @@ -0,0 +1,1876 @@ +/* + * linux/net/sunrpc/clnt.c + * + * This file contains the high-level RPC interface. + * It is modeled as a finite state machine to support both synchronous + * and asynchronous requests. + * + * - RPC header generation and argument serialization. + * - Credential refresh. + * - TCP connect handling. 
+ * - Retry of operation when it is suspected the operation failed because + * of uid squashing on the server, or when the credentials were stale + * and need to be refreshed, or when a packet was damaged in transit. + * This may be have to be moved to the VFS layer. + * + * Copyright (C) 1992,1993 Rick Sladkey + * Copyright (C) 1995,1996 Olaf Kirch + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "sunrpc.h" + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_CALL +#endif + +#define dprint_status(t) \ + dprintk("RPC: %5u %s (status %d)\n", t->tk_pid, \ + __func__, t->tk_status) + +/* + * All RPC clients are linked into this list + */ +static LIST_HEAD(all_clients); +static DEFINE_SPINLOCK(rpc_client_lock); + +static DECLARE_WAIT_QUEUE_HEAD(destroy_wait); + + +static void call_start(struct rpc_task *task); +static void call_reserve(struct rpc_task *task); +static void call_reserveresult(struct rpc_task *task); +static void call_allocate(struct rpc_task *task); +static void call_decode(struct rpc_task *task); +static void call_bind(struct rpc_task *task); +static void call_bind_status(struct rpc_task *task); +static void call_transmit(struct rpc_task *task); +#if defined(CONFIG_NFS_V4_1) +static void call_bc_transmit(struct rpc_task *task); +#endif /* CONFIG_NFS_V4_1 */ +static void call_status(struct rpc_task *task); +static void call_transmit_status(struct rpc_task *task); +static void call_refresh(struct rpc_task *task); +static void call_refreshresult(struct rpc_task *task); +static void call_timeout(struct rpc_task *task); +static void call_connect(struct rpc_task *task); +static void call_connect_status(struct rpc_task *task); + +static __be32 *rpc_encode_header(struct rpc_task *task); +static __be32 *rpc_verify_header(struct rpc_task *task); +static int rpc_ping(struct rpc_clnt *clnt); + +static void rpc_register_client(struct rpc_clnt *clnt) +{ + spin_lock(&rpc_client_lock); + list_add(&clnt->cl_clients, &all_clients); + spin_unlock(&rpc_client_lock); +} + +static void rpc_unregister_client(struct rpc_clnt *clnt) +{ + spin_lock(&rpc_client_lock); + list_del(&clnt->cl_clients); + spin_unlock(&rpc_client_lock); +} + +static int +rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name) +{ + static uint32_t clntid; + struct nameidata nd; + struct path path; + char name[15]; + struct qstr q = { + .name = name, + }; + int error; + + clnt->cl_path.mnt = ERR_PTR(-ENOENT); + clnt->cl_path.dentry = ERR_PTR(-ENOENT); + if (dir_name == NULL) + return 0; + + path.mnt = rpc_get_mount(); + if (IS_ERR(path.mnt)) + return PTR_ERR(path.mnt); + error = vfs_path_lookup(path.mnt->mnt_root, path.mnt, dir_name, 0, &nd); + if (error) + goto err; + + for (;;) { + q.len = snprintf(name, sizeof(name), "clnt%x", (unsigned int)clntid++); + name[sizeof(name) - 1] = '\0'; + q.hash = full_name_hash(q.name, q.len); + path.dentry = rpc_create_client_dir(nd.path.dentry, &q, clnt); + if (!IS_ERR(path.dentry)) + break; + error = PTR_ERR(path.dentry); + if (error != -EEXIST) { + printk(KERN_INFO "RPC: Couldn't create pipefs entry" + " %s/%s, error %d\n", + dir_name, name, error); + goto err_path_put; + } + } + path_put(&nd.path); + clnt->cl_path = path; + return 0; +err_path_put: + path_put(&nd.path); +err: + rpc_put_mount(); + return error; +} + +static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, struct rpc_xprt *xprt) +{ + struct rpc_program *program = 
args->program; + struct rpc_version *version; + struct rpc_clnt *clnt = NULL; + struct rpc_auth *auth; + int err; + size_t len; + + /* sanity check the name before trying to print it */ + err = -EINVAL; + len = strlen(args->servername); + if (len > RPC_MAXNETNAMELEN) + goto out_no_rpciod; + len++; + + dprintk("RPC: creating %s client for %s (xprt %p)\n", + program->name, args->servername, xprt); + + err = rpciod_up(); + if (err) + goto out_no_rpciod; + err = -EINVAL; + if (!xprt) + goto out_no_xprt; + + if (args->version >= program->nrvers) + goto out_err; + version = program->version[args->version]; + if (version == NULL) + goto out_err; + + err = -ENOMEM; + clnt = kzalloc(sizeof(*clnt), GFP_KERNEL); + if (!clnt) + goto out_err; + clnt->cl_parent = clnt; + + clnt->cl_server = clnt->cl_inline_name; + if (len > sizeof(clnt->cl_inline_name)) { + char *buf = kmalloc(len, GFP_KERNEL); + if (buf != NULL) + clnt->cl_server = buf; + else + len = sizeof(clnt->cl_inline_name); + } + strlcpy(clnt->cl_server, args->servername, len); + + clnt->cl_xprt = xprt; + clnt->cl_procinfo = version->procs; + clnt->cl_maxproc = version->nrprocs; + clnt->cl_protname = program->name; + clnt->cl_prog = args->prognumber ? : program->number; + clnt->cl_vers = version->number; + clnt->cl_stats = program->stats; + clnt->cl_metrics = rpc_alloc_iostats(clnt); + err = -ENOMEM; + if (clnt->cl_metrics == NULL) + goto out_no_stats; + clnt->cl_program = program; + INIT_LIST_HEAD(&clnt->cl_tasks); + spin_lock_init(&clnt->cl_lock); + + if (!xprt_bound(clnt->cl_xprt)) + clnt->cl_autobind = 1; + + clnt->cl_timeout = xprt->timeout; + if (args->timeout != NULL) { + memcpy(&clnt->cl_timeout_default, args->timeout, + sizeof(clnt->cl_timeout_default)); + clnt->cl_timeout = &clnt->cl_timeout_default; + } + + clnt->cl_rtt = &clnt->cl_rtt_default; + rpc_init_rtt(&clnt->cl_rtt_default, clnt->cl_timeout->to_initval); + clnt->cl_principal = NULL; + if (args->client_name) { + clnt->cl_principal = kstrdup(args->client_name, GFP_KERNEL); + if (!clnt->cl_principal) + goto out_no_principal; + } + + atomic_set(&clnt->cl_count, 1); + + err = rpc_setup_pipedir(clnt, program->pipe_dir_name); + if (err < 0) + goto out_no_path; + + auth = rpcauth_create(args->authflavor, clnt); + if (IS_ERR(auth)) { + printk(KERN_INFO "RPC: Couldn't create auth handle (flavor %u)\n", + args->authflavor); + err = PTR_ERR(auth); + goto out_no_auth; + } + + /* save the nodename */ + clnt->cl_nodelen = strlen(init_utsname()->nodename); + if (clnt->cl_nodelen > UNX_MAXNODENAME) + clnt->cl_nodelen = UNX_MAXNODENAME; + memcpy(clnt->cl_nodename, init_utsname()->nodename, clnt->cl_nodelen); + rpc_register_client(clnt); + return clnt; + +out_no_auth: + if (!IS_ERR(clnt->cl_path.dentry)) { + rpc_remove_client_dir(clnt->cl_path.dentry); + rpc_put_mount(); + } +out_no_path: + kfree(clnt->cl_principal); +out_no_principal: + rpc_free_iostats(clnt->cl_metrics); +out_no_stats: + if (clnt->cl_server != clnt->cl_inline_name) + kfree(clnt->cl_server); + kfree(clnt); +out_err: + xprt_put(xprt); +out_no_xprt: + rpciod_down(); +out_no_rpciod: + return ERR_PTR(err); +} + +/* + * rpc_create - create an RPC client and transport with one call + * @args: rpc_clnt create argument structure + * + * Creates and initializes an RPC transport and an RPC client. + * + * It can ping the server in order to determine if it is up, and to see if + * it supports this program and version. RPC_CLNT_CREATE_NOPING disables + * this behavior so asynchronous tasks can also use rpc_create. 
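To make the kernel-doc above concrete, here is a minimal sketch of how a caller typically fills struct rpc_create_args before handing it to rpc_create(); the program pointer, version number, transport and flags are placeholders chosen purely for illustration.

static struct rpc_clnt *example_create_client(struct sockaddr_in *sin,
					      struct rpc_program *example_program)
{
	struct rpc_create_args args = {
		.net		= &init_net,
		.protocol	= XPRT_TRANSPORT_TCP,
		.address	= (struct sockaddr *)sin,
		.addrsize	= sizeof(*sin),
		.servername	= NULL,	/* let rpc_create() derive a "%pI4" name */
		.program	= example_program,
		.version	= 3,
		.authflavor	= RPC_AUTH_UNIX,
		.flags		= RPC_CLNT_CREATE_NOPING,
	};

	return rpc_create(&args);
}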
+ */ +struct rpc_clnt *rpc_create(struct rpc_create_args *args) +{ + struct rpc_xprt *xprt; + struct rpc_clnt *clnt; + struct xprt_create xprtargs = { + .net = args->net, + .ident = args->protocol, + .srcaddr = args->saddress, + .dstaddr = args->address, + .addrlen = args->addrsize, + .bc_xprt = args->bc_xprt, + }; + char servername[48]; + + /* + * If the caller chooses not to specify a hostname, whip + * up a string representation of the passed-in address. + */ + if (args->servername == NULL) { + struct sockaddr_un *sun = + (struct sockaddr_un *)args->address; + struct sockaddr_in *sin = + (struct sockaddr_in *)args->address; + struct sockaddr_in6 *sin6 = + (struct sockaddr_in6 *)args->address; + + servername[0] = '\0'; + switch (args->address->sa_family) { + case AF_LOCAL: + snprintf(servername, sizeof(servername), "%s", + sun->sun_path); + break; + case AF_INET: + snprintf(servername, sizeof(servername), "%pI4", + &sin->sin_addr.s_addr); + break; + case AF_INET6: + snprintf(servername, sizeof(servername), "%pI6", + &sin6->sin6_addr); + break; + default: + /* caller wants default server name, but + * address family isn't recognized. */ + return ERR_PTR(-EINVAL); + } + args->servername = servername; + } + + xprt = xprt_create_transport(&xprtargs); + if (IS_ERR(xprt)) + return (struct rpc_clnt *)xprt; + + /* + * By default, kernel RPC client connects from a reserved port. + * CAP_NET_BIND_SERVICE will not be set for unprivileged requesters, + * but it is always enabled for rpciod, which handles the connect + * operation. + */ + xprt->resvport = 1; + if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT) + xprt->resvport = 0; + + clnt = rpc_new_client(args, xprt); + if (IS_ERR(clnt)) + return clnt; + + if (!(args->flags & RPC_CLNT_CREATE_NOPING)) { + int err = rpc_ping(clnt); + if (err != 0) { + rpc_shutdown_client(clnt); + return ERR_PTR(err); + } + } + + clnt->cl_softrtry = 1; + if (args->flags & RPC_CLNT_CREATE_HARDRTRY) + clnt->cl_softrtry = 0; + + if (args->flags & RPC_CLNT_CREATE_AUTOBIND) + clnt->cl_autobind = 1; + if (args->flags & RPC_CLNT_CREATE_DISCRTRY) + clnt->cl_discrtry = 1; + if (!(args->flags & RPC_CLNT_CREATE_QUIET)) + clnt->cl_chatty = 1; + + return clnt; +} +EXPORT_SYMBOL_GPL(rpc_create); + +/* + * This function clones the RPC client structure. It allows us to share the + * same transport while varying parameters such as the authentication + * flavour. 
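A hedged sketch of the pattern this enables: clone the client so the transport is shared, then attach a different authentication flavour to the clone. The flavour argument and the minimal error handling are assumptions for illustration.

static struct rpc_clnt *example_clone_with_flavor(struct rpc_clnt *clnt,
						  rpc_authflavor_t flavor)
{
	struct rpc_clnt *new = rpc_clone_client(clnt);
	struct rpc_auth *auth;

	if (IS_ERR(new))
		return new;
	/* replaces the auth inherited from the parent client */
	auth = rpcauth_create(flavor, new);
	if (IS_ERR(auth)) {
		rpc_shutdown_client(new);
		return ERR_CAST(auth);
	}
	return new;
}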
+ */ +struct rpc_clnt * +rpc_clone_client(struct rpc_clnt *clnt) +{ + struct rpc_clnt *new; + int err = -ENOMEM; + + new = kmemdup(clnt, sizeof(*new), GFP_KERNEL); + if (!new) + goto out_no_clnt; + new->cl_parent = clnt; + /* Turn off autobind on clones */ + new->cl_autobind = 0; + INIT_LIST_HEAD(&new->cl_tasks); + spin_lock_init(&new->cl_lock); + rpc_init_rtt(&new->cl_rtt_default, clnt->cl_timeout->to_initval); + new->cl_metrics = rpc_alloc_iostats(clnt); + if (new->cl_metrics == NULL) + goto out_no_stats; + if (clnt->cl_principal) { + new->cl_principal = kstrdup(clnt->cl_principal, GFP_KERNEL); + if (new->cl_principal == NULL) + goto out_no_principal; + } + atomic_set(&new->cl_count, 1); + err = rpc_setup_pipedir(new, clnt->cl_program->pipe_dir_name); + if (err != 0) + goto out_no_path; + if (new->cl_auth) + atomic_inc(&new->cl_auth->au_count); + xprt_get(clnt->cl_xprt); + atomic_inc(&clnt->cl_count); + rpc_register_client(new); + rpciod_up(); + return new; +out_no_path: + kfree(new->cl_principal); +out_no_principal: + rpc_free_iostats(new->cl_metrics); +out_no_stats: + kfree(new); +out_no_clnt: + dprintk("RPC: %s: returned error %d\n", __func__, err); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(rpc_clone_client); + +/* + * Kill all tasks for the given client. + * XXX: kill their descendants as well? + */ +void rpc_killall_tasks(struct rpc_clnt *clnt) +{ + struct rpc_task *rovr; + + + if (list_empty(&clnt->cl_tasks)) + return; + dprintk("RPC: killing all tasks for client %p\n", clnt); + /* + * Spin lock all_tasks to prevent changes... + */ + spin_lock(&clnt->cl_lock); + list_for_each_entry(rovr, &clnt->cl_tasks, tk_task) { + if (!RPC_IS_ACTIVATED(rovr)) + continue; + if (!(rovr->tk_flags & RPC_TASK_KILLED)) { + rovr->tk_flags |= RPC_TASK_KILLED; + rpc_exit(rovr, -EIO); + if (RPC_IS_QUEUED(rovr)) + rpc_wake_up_queued_task(rovr->tk_waitqueue, + rovr); + } + } + spin_unlock(&clnt->cl_lock); +} +EXPORT_SYMBOL_GPL(rpc_killall_tasks); + +/* + * Properly shut down an RPC client, terminating all outstanding + * requests. + */ +void rpc_shutdown_client(struct rpc_clnt *clnt) +{ + dprintk("RPC: shutting down %s client for %s\n", + clnt->cl_protname, clnt->cl_server); + + while (!list_empty(&clnt->cl_tasks)) { + rpc_killall_tasks(clnt); + wait_event_timeout(destroy_wait, + list_empty(&clnt->cl_tasks), 1*HZ); + } + + rpc_release_client(clnt); +} +EXPORT_SYMBOL_GPL(rpc_shutdown_client); + +/* + * Free an RPC client + */ +static void +rpc_free_client(struct rpc_clnt *clnt) +{ + dprintk("RPC: destroying %s client for %s\n", + clnt->cl_protname, clnt->cl_server); + if (!IS_ERR(clnt->cl_path.dentry)) { + rpc_remove_client_dir(clnt->cl_path.dentry); + rpc_put_mount(); + } + if (clnt->cl_parent != clnt) { + rpc_release_client(clnt->cl_parent); + goto out_free; + } + if (clnt->cl_server != clnt->cl_inline_name) + kfree(clnt->cl_server); +out_free: + rpc_unregister_client(clnt); + rpc_free_iostats(clnt->cl_metrics); + kfree(clnt->cl_principal); + clnt->cl_metrics = NULL; + xprt_put(clnt->cl_xprt); + rpciod_down(); + kfree(clnt); +} + +/* + * Free an RPC client + */ +static void +rpc_free_auth(struct rpc_clnt *clnt) +{ + if (clnt->cl_auth == NULL) { + rpc_free_client(clnt); + return; + } + + /* + * Note: RPCSEC_GSS may need to send NULL RPC calls in order to + * release remaining GSS contexts. This mechanism ensures + * that it can do so safely. 
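Relating this to the shutdown path above: a typical user pairs rpc_create() with rpc_shutdown_client(), which keeps killing tasks still on cl_tasks until the list drains and then drops the caller's reference. A minimal lifecycle sketch, with the actual calls in the middle left as a placeholder:

static void example_client_lifecycle(struct rpc_create_args *args)
{
	struct rpc_clnt *clnt = rpc_create(args);

	if (IS_ERR(clnt))
		return;
	/* ... issue rpc_call_sync()/rpc_call_async() against clnt ... */
	rpc_shutdown_client(clnt);	/* terminates outstanding tasks, frees */
}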
+ */ + atomic_inc(&clnt->cl_count); + rpcauth_release(clnt->cl_auth); + clnt->cl_auth = NULL; + if (atomic_dec_and_test(&clnt->cl_count)) + rpc_free_client(clnt); +} + +/* + * Release reference to the RPC client + */ +void +rpc_release_client(struct rpc_clnt *clnt) +{ + dprintk("RPC: rpc_release_client(%p)\n", clnt); + + if (list_empty(&clnt->cl_tasks)) + wake_up(&destroy_wait); + if (atomic_dec_and_test(&clnt->cl_count)) + rpc_free_auth(clnt); +} + +/** + * rpc_bind_new_program - bind a new RPC program to an existing client + * @old: old rpc_client + * @program: rpc program to set + * @vers: rpc program version + * + * Clones the rpc client and sets up a new RPC program. This is mainly + * of use for enabling different RPC programs to share the same transport. + * The Sun NFSv2/v3 ACL protocol can do this. + */ +struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old, + struct rpc_program *program, + u32 vers) +{ + struct rpc_clnt *clnt; + struct rpc_version *version; + int err; + + BUG_ON(vers >= program->nrvers || !program->version[vers]); + version = program->version[vers]; + clnt = rpc_clone_client(old); + if (IS_ERR(clnt)) + goto out; + clnt->cl_procinfo = version->procs; + clnt->cl_maxproc = version->nrprocs; + clnt->cl_protname = program->name; + clnt->cl_prog = program->number; + clnt->cl_vers = version->number; + clnt->cl_stats = program->stats; + err = rpc_ping(clnt); + if (err != 0) { + rpc_shutdown_client(clnt); + clnt = ERR_PTR(err); + } +out: + return clnt; +} +EXPORT_SYMBOL_GPL(rpc_bind_new_program); + +void rpc_task_release_client(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + + if (clnt != NULL) { + /* Remove from client task list */ + spin_lock(&clnt->cl_lock); + list_del(&task->tk_task); + spin_unlock(&clnt->cl_lock); + task->tk_client = NULL; + + rpc_release_client(clnt); + } +} + +static +void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) +{ + if (clnt != NULL) { + rpc_task_release_client(task); + task->tk_client = clnt; + atomic_inc(&clnt->cl_count); + if (clnt->cl_softrtry) + task->tk_flags |= RPC_TASK_SOFT; + /* Add to the client's list of all tasks */ + spin_lock(&clnt->cl_lock); + list_add_tail(&task->tk_task, &clnt->cl_tasks); + spin_unlock(&clnt->cl_lock); + } +} + +void rpc_task_reset_client(struct rpc_task *task, struct rpc_clnt *clnt) +{ + rpc_task_release_client(task); + rpc_task_set_client(task, clnt); +} +EXPORT_SYMBOL_GPL(rpc_task_reset_client); + + +static void +rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg) +{ + if (msg != NULL) { + task->tk_msg.rpc_proc = msg->rpc_proc; + task->tk_msg.rpc_argp = msg->rpc_argp; + task->tk_msg.rpc_resp = msg->rpc_resp; + if (msg->rpc_cred != NULL) + task->tk_msg.rpc_cred = get_rpccred(msg->rpc_cred); + } +} + +/* + * Default callback for async RPC calls + */ +static void +rpc_default_callback(struct rpc_task *task, void *data) +{ +} + +static const struct rpc_call_ops rpc_default_ops = { + .rpc_call_done = rpc_default_callback, +}; + +/** + * rpc_run_task - Allocate a new RPC task, then run rpc_execute against it + * @task_setup_data: pointer to task initialisation data + */ +struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data) +{ + struct rpc_task *task; + + task = rpc_new_task(task_setup_data); + if (IS_ERR(task)) + goto out; + + rpc_task_set_client(task, task_setup_data->rpc_client); + rpc_task_set_rpc_message(task, task_setup_data->rpc_message); + + if (task->tk_action == NULL) + rpc_call_start(task); + + 
atomic_inc(&task->tk_count); + rpc_execute(task); +out: + return task; +} +EXPORT_SYMBOL_GPL(rpc_run_task); + +/** + * rpc_call_sync - Perform a synchronous RPC call + * @clnt: pointer to RPC client + * @msg: RPC call parameters + * @flags: RPC call flags + */ +int rpc_call_sync(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags) +{ + struct rpc_task *task; + struct rpc_task_setup task_setup_data = { + .rpc_client = clnt, + .rpc_message = msg, + .callback_ops = &rpc_default_ops, + .flags = flags, + }; + int status; + + BUG_ON(flags & RPC_TASK_ASYNC); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = task->tk_status; + rpc_put_task(task); + return status; +} +EXPORT_SYMBOL_GPL(rpc_call_sync); + +/** + * rpc_call_async - Perform an asynchronous RPC call + * @clnt: pointer to RPC client + * @msg: RPC call parameters + * @flags: RPC call flags + * @tk_ops: RPC call ops + * @data: user call data + */ +int +rpc_call_async(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags, + const struct rpc_call_ops *tk_ops, void *data) +{ + struct rpc_task *task; + struct rpc_task_setup task_setup_data = { + .rpc_client = clnt, + .rpc_message = msg, + .callback_ops = tk_ops, + .callback_data = data, + .flags = flags|RPC_TASK_ASYNC, + }; + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; +} +EXPORT_SYMBOL_GPL(rpc_call_async); + +#if defined(CONFIG_NFS_V4_1) +/** + * rpc_run_bc_task - Allocate a new RPC task for backchannel use, then run + * rpc_execute against it + * @req: RPC request + * @tk_ops: RPC call ops + */ +struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req, + const struct rpc_call_ops *tk_ops) +{ + struct rpc_task *task; + struct xdr_buf *xbufp = &req->rq_snd_buf; + struct rpc_task_setup task_setup_data = { + .callback_ops = tk_ops, + }; + + dprintk("RPC: rpc_run_bc_task req= %p\n", req); + /* + * Create an rpc_task to send the data + */ + task = rpc_new_task(&task_setup_data); + if (IS_ERR(task)) { + xprt_free_bc_request(req); + goto out; + } + task->tk_rqstp = req; + + /* + * Set up the xdr_buf length. + * This also indicates that the buffer is XDR encoded already. + */ + xbufp->len = xbufp->head[0].iov_len + xbufp->page_len + + xbufp->tail[0].iov_len; + + task->tk_action = call_bc_transmit; + atomic_inc(&task->tk_count); + BUG_ON(atomic_read(&task->tk_count) != 2); + rpc_execute(task); + +out: + dprintk("RPC: rpc_run_bc_task: task= %p\n", task); + return task; +} +#endif /* CONFIG_NFS_V4_1 */ + +void +rpc_call_start(struct rpc_task *task) +{ + task->tk_action = call_start; +} +EXPORT_SYMBOL_GPL(rpc_call_start); + +/** + * rpc_peeraddr - extract remote peer address from clnt's xprt + * @clnt: RPC client structure + * @buf: target buffer + * @bufsize: length of target buffer + * + * Returns the number of bytes that are actually in the stored address. 
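A small caller-side sketch for the address helpers documented here; the printk and its format are illustrative only.

static void example_log_peer(struct rpc_clnt *clnt)
{
	struct sockaddr_storage peer;
	size_t len;

	/* copies at most sizeof(peer) bytes and returns the transport's
	 * stored address length */
	len = rpc_peeraddr(clnt, (struct sockaddr *)&peer, sizeof(peer));
	printk(KERN_DEBUG "peer address is %zu bytes (%s)\n", len,
	       rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR));
}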
+ */ +size_t rpc_peeraddr(struct rpc_clnt *clnt, struct sockaddr *buf, size_t bufsize) +{ + size_t bytes; + struct rpc_xprt *xprt = clnt->cl_xprt; + + bytes = sizeof(xprt->addr); + if (bytes > bufsize) + bytes = bufsize; + memcpy(buf, &clnt->cl_xprt->addr, bytes); + return xprt->addrlen; +} +EXPORT_SYMBOL_GPL(rpc_peeraddr); + +/** + * rpc_peeraddr2str - return remote peer address in printable format + * @clnt: RPC client structure + * @format: address format + * + */ +const char *rpc_peeraddr2str(struct rpc_clnt *clnt, + enum rpc_display_format_t format) +{ + struct rpc_xprt *xprt = clnt->cl_xprt; + + if (xprt->address_strings[format] != NULL) + return xprt->address_strings[format]; + else + return "unprintable"; +} +EXPORT_SYMBOL_GPL(rpc_peeraddr2str); + +void +rpc_setbufsize(struct rpc_clnt *clnt, unsigned int sndsize, unsigned int rcvsize) +{ + struct rpc_xprt *xprt = clnt->cl_xprt; + if (xprt->ops->set_buffer_size) + xprt->ops->set_buffer_size(xprt, sndsize, rcvsize); +} +EXPORT_SYMBOL_GPL(rpc_setbufsize); + +/* + * Return size of largest payload RPC client can support, in bytes + * + * For stream transports, this is one RPC record fragment (see RFC + * 1831), as we don't support multi-record requests yet. For datagram + * transports, this is the size of an IP packet minus the IP, UDP, and + * RPC header sizes. + */ +size_t rpc_max_payload(struct rpc_clnt *clnt) +{ + return clnt->cl_xprt->max_payload; +} +EXPORT_SYMBOL_GPL(rpc_max_payload); + +/** + * rpc_force_rebind - force transport to check that remote port is unchanged + * @clnt: client to rebind + * + */ +void rpc_force_rebind(struct rpc_clnt *clnt) +{ + if (clnt->cl_autobind) + xprt_clear_bound(clnt->cl_xprt); +} +EXPORT_SYMBOL_GPL(rpc_force_rebind); + +/* + * Restart an (async) RPC call from the call_prepare state. + * Usually called from within the exit handler. + */ +int +rpc_restart_call_prepare(struct rpc_task *task) +{ + if (RPC_ASSASSINATED(task)) + return 0; + task->tk_action = rpc_prepare_task; + return 1; +} +EXPORT_SYMBOL_GPL(rpc_restart_call_prepare); + +/* + * Restart an (async) RPC call. Usually called from within the + * exit handler. + */ +int +rpc_restart_call(struct rpc_task *task) +{ + if (RPC_ASSASSINATED(task)) + return 0; + task->tk_action = call_start; + return 1; +} +EXPORT_SYMBOL_GPL(rpc_restart_call); + +#ifdef RPC_DEBUG +static const char *rpc_proc_name(const struct rpc_task *task) +{ + const struct rpc_procinfo *proc = task->tk_msg.rpc_proc; + + if (proc) { + if (proc->p_name) + return proc->p_name; + else + return "NULL"; + } else + return "no proc"; +} +#endif + +/* + * 0. Initial state + * + * Other FSM states can be visited zero or more times, but + * this state is visited exactly once for each RPC. + */ +static void +call_start(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + + dprintk("RPC: %5u call_start %s%d proc %s (%s)\n", task->tk_pid, + clnt->cl_protname, clnt->cl_vers, + rpc_proc_name(task), + (RPC_IS_ASYNC(task) ? "async" : "sync")); + + /* Increment call count */ + task->tk_msg.rpc_proc->p_count++; + clnt->cl_stats->rpccnt++; + task->tk_action = call_reserve; +} + +/* + * 1. Reserve an RPC call slot + */ +static void +call_reserve(struct rpc_task *task) +{ + dprint_status(task); + + task->tk_status = 0; + task->tk_action = call_reserveresult; + xprt_reserve(task); +} + +/* + * 1b. 
Grok the result of xprt_reserve() + */ +static void +call_reserveresult(struct rpc_task *task) +{ + int status = task->tk_status; + + dprint_status(task); + + /* + * After a call to xprt_reserve(), we must have either + * a request slot or else an error status. + */ + task->tk_status = 0; + if (status >= 0) { + if (task->tk_rqstp) { + task->tk_action = call_refresh; + return; + } + + printk(KERN_ERR "%s: status=%d, but no request slot, exiting\n", + __func__, status); + rpc_exit(task, -EIO); + return; + } + + /* + * Even though there was an error, we may have acquired + * a request slot somehow. Make sure not to leak it. + */ + if (task->tk_rqstp) { + printk(KERN_ERR "%s: status=%d, request allocated anyway\n", + __func__, status); + xprt_release(task); + } + + switch (status) { + case -EAGAIN: /* woken up; retry */ + task->tk_action = call_reserve; + return; + case -EIO: /* probably a shutdown */ + break; + default: + printk(KERN_ERR "%s: unrecognized error %d, exiting\n", + __func__, status); + break; + } + rpc_exit(task, status); +} + +/* + * 2. Bind and/or refresh the credentials + */ +static void +call_refresh(struct rpc_task *task) +{ + dprint_status(task); + + task->tk_action = call_refreshresult; + task->tk_status = 0; + task->tk_client->cl_stats->rpcauthrefresh++; + rpcauth_refreshcred(task); +} + +/* + * 2a. Process the results of a credential refresh + */ +static void +call_refreshresult(struct rpc_task *task) +{ + int status = task->tk_status; + + dprint_status(task); + + task->tk_status = 0; + task->tk_action = call_refresh; + switch (status) { + case 0: + if (rpcauth_uptodatecred(task)) + task->tk_action = call_allocate; + return; + case -ETIMEDOUT: + rpc_delay(task, 3*HZ); + case -EAGAIN: + status = -EACCES; + if (!task->tk_cred_retry) + break; + task->tk_cred_retry--; + dprintk("RPC: %5u %s: retry refresh creds\n", + task->tk_pid, __func__); + return; + } + dprintk("RPC: %5u %s: refresh creds failed with error %d\n", + task->tk_pid, __func__, status); + rpc_exit(task, status); +} + +/* + * 2b. Allocate the buffer. For details, see sched.c:rpc_malloc. + * (Note: buffer memory is freed in xprt_release). + */ +static void +call_allocate(struct rpc_task *task) +{ + unsigned int slack = task->tk_rqstp->rq_cred->cr_auth->au_cslack; + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_procinfo *proc = task->tk_msg.rpc_proc; + + dprint_status(task); + + task->tk_status = 0; + task->tk_action = call_bind; + + if (req->rq_buffer) + return; + + if (proc->p_proc != 0) { + BUG_ON(proc->p_arglen == 0); + if (proc->p_decode != NULL) + BUG_ON(proc->p_replen == 0); + } + + /* + * Calculate the size (in quads) of the RPC call + * and reply headers, and convert both values + * to byte sizes. 
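A worked example of the sizing performed just below, using assumed figures (an auth flavour whose au_cslack is 3 quads and a procedure with p_arglen of 20 quads; RPC_CALLHDRSIZE is taken to be 6 quads and RPC_REPHDRSIZE 4):

	rq_callsize = RPC_CALLHDRSIZE + 2*au_cslack + p_arglen
	            = 6 + 6 + 20 = 32 quads, then << 2  ->  128 bytes
	rq_rcvsize  = RPC_REPHDRSIZE + au_cslack + p_replen quads, likewise << 2

Both halves come from a single buf_alloc() of rq_callsize + rq_rcvsize bytes; the receive buffer starts at rq_buffer + rq_callsize, as rpc_xdr_encode() sets up further on.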
+ */ + req->rq_callsize = RPC_CALLHDRSIZE + (slack << 1) + proc->p_arglen; + req->rq_callsize <<= 2; + req->rq_rcvsize = RPC_REPHDRSIZE + slack + proc->p_replen; + req->rq_rcvsize <<= 2; + + req->rq_buffer = xprt->ops->buf_alloc(task, + req->rq_callsize + req->rq_rcvsize); + if (req->rq_buffer != NULL) + return; + + dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid); + + if (RPC_IS_ASYNC(task) || !fatal_signal_pending(current)) { + task->tk_action = call_allocate; + rpc_delay(task, HZ>>4); + return; + } + + rpc_exit(task, -ERESTARTSYS); +} + +static inline int +rpc_task_need_encode(struct rpc_task *task) +{ + return task->tk_rqstp->rq_snd_buf.len == 0; +} + +static inline void +rpc_task_force_reencode(struct rpc_task *task) +{ + task->tk_rqstp->rq_snd_buf.len = 0; + task->tk_rqstp->rq_bytes_sent = 0; +} + +static inline void +rpc_xdr_buf_init(struct xdr_buf *buf, void *start, size_t len) +{ + buf->head[0].iov_base = start; + buf->head[0].iov_len = len; + buf->tail[0].iov_len = 0; + buf->page_len = 0; + buf->flags = 0; + buf->len = 0; + buf->buflen = len; +} + +/* + * 3. Encode arguments of an RPC call + */ +static void +rpc_xdr_encode(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + kxdreproc_t encode; + __be32 *p; + + dprint_status(task); + + rpc_xdr_buf_init(&req->rq_snd_buf, + req->rq_buffer, + req->rq_callsize); + rpc_xdr_buf_init(&req->rq_rcv_buf, + (char *)req->rq_buffer + req->rq_callsize, + req->rq_rcvsize); + + p = rpc_encode_header(task); + if (p == NULL) { + printk(KERN_INFO "RPC: couldn't encode RPC header, exit EIO\n"); + rpc_exit(task, -EIO); + return; + } + + encode = task->tk_msg.rpc_proc->p_encode; + if (encode == NULL) + return; + + task->tk_status = rpcauth_wrap_req(task, encode, req, p, + task->tk_msg.rpc_argp); +} + +/* + * 4. Get the server port number if not yet set + */ +static void +call_bind(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + dprint_status(task); + + task->tk_action = call_connect; + if (!xprt_bound(xprt)) { + task->tk_action = call_bind_status; + task->tk_timeout = xprt->bind_timeout; + xprt->ops->rpcbind(task); + } +} + +/* + * 4a. 
Sort out bind result + */ +static void +call_bind_status(struct rpc_task *task) +{ + int status = -EIO; + + if (task->tk_status >= 0) { + dprint_status(task); + task->tk_status = 0; + task->tk_action = call_connect; + return; + } + + switch (task->tk_status) { + case -ENOMEM: + dprintk("RPC: %5u rpcbind out of memory\n", task->tk_pid); + rpc_delay(task, HZ >> 2); + goto retry_timeout; + case -EACCES: + dprintk("RPC: %5u remote rpcbind: RPC program/version " + "unavailable\n", task->tk_pid); + /* fail immediately if this is an RPC ping */ + if (task->tk_msg.rpc_proc->p_proc == 0) { + status = -EOPNOTSUPP; + break; + } + if (task->tk_rebind_retry == 0) + break; + task->tk_rebind_retry--; + rpc_delay(task, 3*HZ); + goto retry_timeout; + case -ETIMEDOUT: + dprintk("RPC: %5u rpcbind request timed out\n", + task->tk_pid); + goto retry_timeout; + case -EPFNOSUPPORT: + /* server doesn't support any rpcbind version we know of */ + dprintk("RPC: %5u unrecognized remote rpcbind service\n", + task->tk_pid); + break; + case -EPROTONOSUPPORT: + dprintk("RPC: %5u remote rpcbind version unavailable, retrying\n", + task->tk_pid); + task->tk_status = 0; + task->tk_action = call_bind; + return; + case -ECONNREFUSED: /* connection problems */ + case -ECONNRESET: + case -ENOTCONN: + case -EHOSTDOWN: + case -EHOSTUNREACH: + case -ENETUNREACH: + case -EPIPE: + dprintk("RPC: %5u remote rpcbind unreachable: %d\n", + task->tk_pid, task->tk_status); + if (!RPC_IS_SOFTCONN(task)) { + rpc_delay(task, 5*HZ); + goto retry_timeout; + } + status = task->tk_status; + break; + default: + dprintk("RPC: %5u unrecognized rpcbind error (%d)\n", + task->tk_pid, -task->tk_status); + } + + rpc_exit(task, status); + return; + +retry_timeout: + task->tk_action = call_timeout; +} + +/* + * 4b. Connect to the RPC server + */ +static void +call_connect(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + dprintk("RPC: %5u call_connect xprt %p %s connected\n", + task->tk_pid, xprt, + (xprt_connected(xprt) ? "is" : "is not")); + + task->tk_action = call_transmit; + if (!xprt_connected(xprt)) { + task->tk_action = call_connect_status; + if (task->tk_status < 0) + return; + xprt_connect(task); + } +} + +/* + * 4c. Sort out connect result + */ +static void +call_connect_status(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + int status = task->tk_status; + + dprint_status(task); + + task->tk_status = 0; + if (status >= 0 || status == -EAGAIN) { + clnt->cl_stats->netreconn++; + task->tk_action = call_transmit; + return; + } + + switch (status) { + /* if soft mounted, test if we've timed out */ + case -ETIMEDOUT: + task->tk_action = call_timeout; + break; + default: + rpc_exit(task, -EIO); + } +} + +/* + * 5. Transmit the RPC request, and wait for reply + */ +static void +call_transmit(struct rpc_task *task) +{ + dprint_status(task); + + task->tk_action = call_status; + if (task->tk_status < 0) + return; + task->tk_status = xprt_prepare_transmit(task); + if (task->tk_status != 0) + return; + task->tk_action = call_transmit_status; + /* Encode here so that rpcsec_gss can use correct sequence number. */ + if (rpc_task_need_encode(task)) { + BUG_ON(task->tk_rqstp->rq_bytes_sent != 0); + rpc_xdr_encode(task); + /* Did the encode result in an error condition? */ + if (task->tk_status != 0) { + /* Was the error nonfatal? 
*/ + if (task->tk_status == -EAGAIN) + rpc_delay(task, HZ >> 4); + else + rpc_exit(task, task->tk_status); + return; + } + } + xprt_transmit(task); + if (task->tk_status < 0) + return; + /* + * On success, ensure that we call xprt_end_transmit() before sleeping + * in order to allow access to the socket to other RPC requests. + */ + call_transmit_status(task); + if (rpc_reply_expected(task)) + return; + task->tk_action = rpc_exit_task; + rpc_wake_up_queued_task(&task->tk_xprt->pending, task); +} + +/* + * 5a. Handle cleanup after a transmission + */ +static void +call_transmit_status(struct rpc_task *task) +{ + task->tk_action = call_status; + + /* + * Common case: success. Force the compiler to put this + * test first. + */ + if (task->tk_status == 0) { + xprt_end_transmit(task); + rpc_task_force_reencode(task); + return; + } + + switch (task->tk_status) { + case -EAGAIN: + break; + default: + dprint_status(task); + xprt_end_transmit(task); + rpc_task_force_reencode(task); + break; + /* + * Special cases: if we've been waiting on the + * socket's write_space() callback, or if the + * socket just returned a connection error, + * then hold onto the transport lock. + */ + case -ECONNREFUSED: + case -EHOSTDOWN: + case -EHOSTUNREACH: + case -ENETUNREACH: + if (RPC_IS_SOFTCONN(task)) { + xprt_end_transmit(task); + rpc_exit(task, task->tk_status); + break; + } + case -ECONNRESET: + case -ENOTCONN: + case -EPIPE: + rpc_task_force_reencode(task); + } +} + +#if defined(CONFIG_NFS_V4_1) +/* + * 5b. Send the backchannel RPC reply. On error, drop the reply. In + * addition, disconnect on connectivity errors. + */ +static void +call_bc_transmit(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + + BUG_ON(task->tk_status != 0); + task->tk_status = xprt_prepare_transmit(task); + if (task->tk_status == -EAGAIN) { + /* + * Could not reserve the transport. Try again after the + * transport is released. + */ + task->tk_status = 0; + task->tk_action = call_bc_transmit; + return; + } + + task->tk_action = rpc_exit_task; + if (task->tk_status < 0) { + printk(KERN_NOTICE "RPC: Could not send backchannel reply " + "error: %d\n", task->tk_status); + return; + } + + xprt_transmit(task); + xprt_end_transmit(task); + dprint_status(task); + switch (task->tk_status) { + case 0: + /* Success */ + break; + case -EHOSTDOWN: + case -EHOSTUNREACH: + case -ENETUNREACH: + case -ETIMEDOUT: + /* + * Problem reaching the server. Disconnect and let the + * forechannel reestablish the connection. The server will + * have to retransmit the backchannel request and we'll + * reprocess it. Since these ops are idempotent, there's no + * need to cache our reply at this time. + */ + printk(KERN_NOTICE "RPC: Could not send backchannel reply " + "error: %d\n", task->tk_status); + xprt_conditional_disconnect(task->tk_xprt, + req->rq_connect_cookie); + break; + default: + /* + * We were unable to reply and will have to drop the + * request. The server should reconnect and retransmit. + */ + BUG_ON(task->tk_status == -EAGAIN); + printk(KERN_NOTICE "RPC: Could not send backchannel reply " + "error: %d\n", task->tk_status); + break; + } + rpc_wake_up_queued_task(&req->rq_xprt->pending, task); +} +#endif /* CONFIG_NFS_V4_1 */ + +/* + * 6. 
Sort out the RPC call status + */ +static void +call_status(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_rqst *req = task->tk_rqstp; + int status; + + if (req->rq_reply_bytes_recvd > 0 && !req->rq_bytes_sent) + task->tk_status = req->rq_reply_bytes_recvd; + + dprint_status(task); + + status = task->tk_status; + if (status >= 0) { + task->tk_action = call_decode; + return; + } + + task->tk_status = 0; + switch(status) { + case -EHOSTDOWN: + case -EHOSTUNREACH: + case -ENETUNREACH: + /* + * Delay any retries for 3 seconds, then handle as if it + * were a timeout. + */ + rpc_delay(task, 3*HZ); + case -ETIMEDOUT: + task->tk_action = call_timeout; + if (task->tk_client->cl_discrtry) + xprt_conditional_disconnect(task->tk_xprt, + req->rq_connect_cookie); + break; + case -ECONNRESET: + case -ECONNREFUSED: + rpc_force_rebind(clnt); + rpc_delay(task, 3*HZ); + case -EPIPE: + case -ENOTCONN: + task->tk_action = call_bind; + break; + case -EAGAIN: + task->tk_action = call_transmit; + break; + case -EIO: + /* shutdown or soft timeout */ + rpc_exit(task, status); + break; + default: + if (clnt->cl_chatty) + printk("%s: RPC call returned error %d\n", + clnt->cl_protname, -status); + rpc_exit(task, status); + } +} + +/* + * 6a. Handle RPC timeout + * We do not release the request slot, so we keep using the + * same XID for all retransmits. + */ +static void +call_timeout(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + + if (xprt_adjust_timeout(task->tk_rqstp) == 0) { + dprintk("RPC: %5u call_timeout (minor)\n", task->tk_pid); + goto retry; + } + + dprintk("RPC: %5u call_timeout (major)\n", task->tk_pid); + task->tk_timeouts++; + + if (RPC_IS_SOFTCONN(task)) { + rpc_exit(task, -ETIMEDOUT); + return; + } + if (RPC_IS_SOFT(task)) { + if (clnt->cl_chatty) + printk(KERN_NOTICE "%s: server %s not responding, timed out\n", + clnt->cl_protname, clnt->cl_server); + if (task->tk_flags & RPC_TASK_TIMEOUT) + rpc_exit(task, -ETIMEDOUT); + else + rpc_exit(task, -EIO); + return; + } + + if (!(task->tk_flags & RPC_CALL_MAJORSEEN)) { + task->tk_flags |= RPC_CALL_MAJORSEEN; + if (clnt->cl_chatty) + printk(KERN_NOTICE "%s: server %s not responding, still trying\n", + clnt->cl_protname, clnt->cl_server); + } + rpc_force_rebind(clnt); + /* + * Did our request time out due to an RPCSEC_GSS out-of-sequence + * event? RFC2203 requires the server to drop all such requests. + */ + rpcauth_invalcred(task); + +retry: + clnt->cl_stats->rpcretrans++; + task->tk_action = call_bind; + task->tk_status = 0; +} + +/* + * 7. Decode the RPC reply + */ +static void +call_decode(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_rqst *req = task->tk_rqstp; + kxdrdproc_t decode = task->tk_msg.rpc_proc->p_decode; + __be32 *p; + + dprintk("RPC: %5u call_decode (status %d)\n", + task->tk_pid, task->tk_status); + + if (task->tk_flags & RPC_CALL_MAJORSEEN) { + if (clnt->cl_chatty) + printk(KERN_NOTICE "%s: server %s OK\n", + clnt->cl_protname, clnt->cl_server); + task->tk_flags &= ~RPC_CALL_MAJORSEEN; + } + + /* + * Ensure that we see all writes made by xprt_complete_rqst() + * before it changed req->rq_reply_bytes_recvd. 
+ */ + smp_rmb(); + req->rq_rcv_buf.len = req->rq_private_buf.len; + + /* Check that the softirq receive buffer is valid */ + WARN_ON(memcmp(&req->rq_rcv_buf, &req->rq_private_buf, + sizeof(req->rq_rcv_buf)) != 0); + + if (req->rq_rcv_buf.len < 12) { + if (!RPC_IS_SOFT(task)) { + task->tk_action = call_bind; + clnt->cl_stats->rpcretrans++; + goto out_retry; + } + dprintk("RPC: %s: too small RPC reply size (%d bytes)\n", + clnt->cl_protname, task->tk_status); + task->tk_action = call_timeout; + goto out_retry; + } + + p = rpc_verify_header(task); + if (IS_ERR(p)) { + if (p == ERR_PTR(-EAGAIN)) + goto out_retry; + return; + } + + task->tk_action = rpc_exit_task; + + if (decode) { + task->tk_status = rpcauth_unwrap_resp(task, decode, req, p, + task->tk_msg.rpc_resp); + } + dprintk("RPC: %5u call_decode result %d\n", task->tk_pid, + task->tk_status); + return; +out_retry: + task->tk_status = 0; + /* Note: rpc_verify_header() may have freed the RPC slot */ + if (task->tk_rqstp == req) { + req->rq_reply_bytes_recvd = req->rq_rcv_buf.len = 0; + if (task->tk_client->cl_discrtry) + xprt_conditional_disconnect(task->tk_xprt, + req->rq_connect_cookie); + } +} + +static __be32 * +rpc_encode_header(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_rqst *req = task->tk_rqstp; + __be32 *p = req->rq_svec[0].iov_base; + + /* FIXME: check buffer size? */ + + p = xprt_skip_transport_header(task->tk_xprt, p); + *p++ = req->rq_xid; /* XID */ + *p++ = htonl(RPC_CALL); /* CALL */ + *p++ = htonl(RPC_VERSION); /* RPC version */ + *p++ = htonl(clnt->cl_prog); /* program number */ + *p++ = htonl(clnt->cl_vers); /* program version */ + *p++ = htonl(task->tk_msg.rpc_proc->p_proc); /* procedure */ + p = rpcauth_marshcred(task, p); + req->rq_slen = xdr_adjust_iovec(&req->rq_svec[0], p); + return p; +} + +static __be32 * +rpc_verify_header(struct rpc_task *task) +{ + struct kvec *iov = &task->tk_rqstp->rq_rcv_buf.head[0]; + int len = task->tk_rqstp->rq_rcv_buf.len >> 2; + __be32 *p = iov->iov_base; + u32 n; + int error = -EACCES; + + if ((task->tk_rqstp->rq_rcv_buf.len & 3) != 0) { + /* RFC-1014 says that the representation of XDR data must be a + * multiple of four bytes + * - if it isn't pointer subtraction in the NFS client may give + * undefined results + */ + dprintk("RPC: %5u %s: XDR representation not a multiple of" + " 4 bytes: 0x%x\n", task->tk_pid, __func__, + task->tk_rqstp->rq_rcv_buf.len); + goto out_eio; + } + if ((len -= 3) < 0) + goto out_overflow; + + p += 1; /* skip XID */ + if ((n = ntohl(*p++)) != RPC_REPLY) { + dprintk("RPC: %5u %s: not an RPC reply: %x\n", + task->tk_pid, __func__, n); + goto out_garbage; + } + + if ((n = ntohl(*p++)) != RPC_MSG_ACCEPTED) { + if (--len < 0) + goto out_overflow; + switch ((n = ntohl(*p++))) { + case RPC_AUTH_ERROR: + break; + case RPC_MISMATCH: + dprintk("RPC: %5u %s: RPC call version " + "mismatch!\n", + task->tk_pid, __func__); + error = -EPROTONOSUPPORT; + goto out_err; + default: + dprintk("RPC: %5u %s: RPC call rejected, " + "unknown error: %x\n", + task->tk_pid, __func__, n); + goto out_eio; + } + if (--len < 0) + goto out_overflow; + switch ((n = ntohl(*p++))) { + case RPC_AUTH_REJECTEDCRED: + case RPC_AUTH_REJECTEDVERF: + case RPCSEC_GSS_CREDPROBLEM: + case RPCSEC_GSS_CTXPROBLEM: + if (!task->tk_cred_retry) + break; + task->tk_cred_retry--; + dprintk("RPC: %5u %s: retry stale creds\n", + task->tk_pid, __func__); + rpcauth_invalcred(task); + /* Ensure we obtain a new XID! 
*/ + xprt_release(task); + task->tk_action = call_reserve; + goto out_retry; + case RPC_AUTH_BADCRED: + case RPC_AUTH_BADVERF: + /* possibly garbled cred/verf? */ + if (!task->tk_garb_retry) + break; + task->tk_garb_retry--; + dprintk("RPC: %5u %s: retry garbled creds\n", + task->tk_pid, __func__); + task->tk_action = call_bind; + goto out_retry; + case RPC_AUTH_TOOWEAK: + printk(KERN_NOTICE "RPC: server %s requires stronger " + "authentication.\n", task->tk_client->cl_server); + break; + default: + dprintk("RPC: %5u %s: unknown auth error: %x\n", + task->tk_pid, __func__, n); + error = -EIO; + } + dprintk("RPC: %5u %s: call rejected %d\n", + task->tk_pid, __func__, n); + goto out_err; + } + if (!(p = rpcauth_checkverf(task, p))) { + dprintk("RPC: %5u %s: auth check failed\n", + task->tk_pid, __func__); + goto out_garbage; /* bad verifier, retry */ + } + len = p - (__be32 *)iov->iov_base - 1; + if (len < 0) + goto out_overflow; + switch ((n = ntohl(*p++))) { + case RPC_SUCCESS: + return p; + case RPC_PROG_UNAVAIL: + dprintk("RPC: %5u %s: program %u is unsupported by server %s\n", + task->tk_pid, __func__, + (unsigned int)task->tk_client->cl_prog, + task->tk_client->cl_server); + error = -EPFNOSUPPORT; + goto out_err; + case RPC_PROG_MISMATCH: + dprintk("RPC: %5u %s: program %u, version %u unsupported by " + "server %s\n", task->tk_pid, __func__, + (unsigned int)task->tk_client->cl_prog, + (unsigned int)task->tk_client->cl_vers, + task->tk_client->cl_server); + error = -EPROTONOSUPPORT; + goto out_err; + case RPC_PROC_UNAVAIL: + dprintk("RPC: %5u %s: proc %s unsupported by program %u, " + "version %u on server %s\n", + task->tk_pid, __func__, + rpc_proc_name(task), + task->tk_client->cl_prog, + task->tk_client->cl_vers, + task->tk_client->cl_server); + error = -EOPNOTSUPP; + goto out_err; + case RPC_GARBAGE_ARGS: + dprintk("RPC: %5u %s: server saw garbage\n", + task->tk_pid, __func__); + break; /* retry */ + default: + dprintk("RPC: %5u %s: server accept status: %x\n", + task->tk_pid, __func__, n); + /* Also retry */ + } + +out_garbage: + task->tk_client->cl_stats->rpcgarbage++; + if (task->tk_garb_retry) { + task->tk_garb_retry--; + dprintk("RPC: %5u %s: retrying\n", + task->tk_pid, __func__); + task->tk_action = call_bind; +out_retry: + return ERR_PTR(-EAGAIN); + } +out_eio: + error = -EIO; +out_err: + rpc_exit(task, error); + dprintk("RPC: %5u %s: call failed with error %d\n", task->tk_pid, + __func__, error); + return ERR_PTR(error); +out_overflow: + dprintk("RPC: %5u %s: server reply was truncated.\n", task->tk_pid, + __func__); + goto out_garbage; +} + +static void rpcproc_encode_null(void *rqstp, struct xdr_stream *xdr, void *obj) +{ +} + +static int rpcproc_decode_null(void *rqstp, struct xdr_stream *xdr, void *obj) +{ + return 0; +} + +static struct rpc_procinfo rpcproc_null = { + .p_encode = rpcproc_encode_null, + .p_decode = rpcproc_decode_null, +}; + +static int rpc_ping(struct rpc_clnt *clnt) +{ + struct rpc_message msg = { + .rpc_proc = &rpcproc_null, + }; + int err; + msg.rpc_cred = authnull_ops.lookup_cred(NULL, NULL, 0); + err = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN); + put_rpccred(msg.rpc_cred); + return err; +} + +struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int flags) +{ + struct rpc_message msg = { + .rpc_proc = &rpcproc_null, + .rpc_cred = cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = &rpc_default_ops, + .flags = flags, + }; + return 
rpc_run_task(&task_setup_data); +} +EXPORT_SYMBOL_GPL(rpc_call_null); + +#ifdef RPC_DEBUG +static void rpc_show_header(void) +{ + printk(KERN_INFO "-pid- flgs status -client- --rqstp- " + "-timeout ---ops--\n"); +} + +static void rpc_show_task(const struct rpc_clnt *clnt, + const struct rpc_task *task) +{ + const char *rpc_waitq = "none"; + + if (RPC_IS_QUEUED(task)) + rpc_waitq = rpc_qname(task->tk_waitqueue); + + printk(KERN_INFO "%5u %04x %6d %8p %8p %8ld %8p %sv%u %s a:%ps q:%s\n", + task->tk_pid, task->tk_flags, task->tk_status, + clnt, task->tk_rqstp, task->tk_timeout, task->tk_ops, + clnt->cl_protname, clnt->cl_vers, rpc_proc_name(task), + task->tk_action, rpc_waitq); +} + +void rpc_show_tasks(void) +{ + struct rpc_clnt *clnt; + struct rpc_task *task; + int header = 0; + + spin_lock(&rpc_client_lock); + list_for_each_entry(clnt, &all_clients, cl_clients) { + spin_lock(&clnt->cl_lock); + list_for_each_entry(task, &clnt->cl_tasks, tk_task) { + if (!header) { + rpc_show_header(); + header++; + } + rpc_show_task(clnt, task); + } + spin_unlock(&clnt->cl_lock); + } + spin_unlock(&rpc_client_lock); +} +#endif diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h new file mode 100644 index 00000000..d013bf21 --- /dev/null +++ b/net/sunrpc/netns.h @@ -0,0 +1,19 @@ +#ifndef __SUNRPC_NETNS_H__ +#define __SUNRPC_NETNS_H__ + +#include +#include + +struct cache_detail; + +struct sunrpc_net { + struct proc_dir_entry *proc_net_rpc; + struct cache_detail *ip_map_cache; +}; + +extern int sunrpc_net_id; + +int ip_map_cache_create(struct net *); +void ip_map_cache_destroy(struct net *); + +#endif diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c new file mode 100644 index 00000000..72bc5368 --- /dev/null +++ b/net/sunrpc/rpc_pipe.c @@ -0,0 +1,1086 @@ +/* + * net/sunrpc/rpc_pipe.c + * + * Userland/kernel interface for rpcauth_gss. 
+ * Code shamelessly plagiarized from fs/nfsd/nfsctl.c + * and fs/sysfs/inode.c + * + * Copyright (c) 2002, Trond Myklebust + * + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static struct vfsmount *rpc_mnt __read_mostly; +static int rpc_mount_count; + +static struct file_system_type rpc_pipe_fs_type; + + +static struct kmem_cache *rpc_inode_cachep __read_mostly; + +#define RPC_UPCALL_TIMEOUT (30*HZ) + +static void rpc_purge_list(struct rpc_inode *rpci, struct list_head *head, + void (*destroy_msg)(struct rpc_pipe_msg *), int err) +{ + struct rpc_pipe_msg *msg; + + if (list_empty(head)) + return; + do { + msg = list_entry(head->next, struct rpc_pipe_msg, list); + list_del_init(&msg->list); + msg->errno = err; + destroy_msg(msg); + } while (!list_empty(head)); + wake_up(&rpci->waitq); +} + +static void +rpc_timeout_upcall_queue(struct work_struct *work) +{ + LIST_HEAD(free_list); + struct rpc_inode *rpci = + container_of(work, struct rpc_inode, queue_timeout.work); + struct inode *inode = &rpci->vfs_inode; + void (*destroy_msg)(struct rpc_pipe_msg *); + + spin_lock(&inode->i_lock); + if (rpci->ops == NULL) { + spin_unlock(&inode->i_lock); + return; + } + destroy_msg = rpci->ops->destroy_msg; + if (rpci->nreaders == 0) { + list_splice_init(&rpci->pipe, &free_list); + rpci->pipelen = 0; + } + spin_unlock(&inode->i_lock); + rpc_purge_list(rpci, &free_list, destroy_msg, -ETIMEDOUT); +} + +/** + * rpc_queue_upcall - queue an upcall message to userspace + * @inode: inode of upcall pipe on which to queue given message + * @msg: message to queue + * + * Call with an @inode created by rpc_mkpipe() to queue an upcall. + * A userspace process may then later read the upcall by performing a + * read on an open file for this inode. It is up to the caller to + * initialize the fields of @msg (other than @msg->list) appropriately. 
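+ *
+ * Returns zero if the message was queued, or -EPIPE if the pipe has
+ * been closed, or if it has no readers and was not created with
+ * RPC_PIPE_WAIT_FOR_OPEN.  With RPC_PIPE_WAIT_FOR_OPEN set and no
+ * reader present, the message is queued anyway and destroyed with
+ * -ETIMEDOUT if no reader opens the pipe within RPC_UPCALL_TIMEOUT.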
+ */ +int +rpc_queue_upcall(struct inode *inode, struct rpc_pipe_msg *msg) +{ + struct rpc_inode *rpci = RPC_I(inode); + int res = -EPIPE; + + spin_lock(&inode->i_lock); + if (rpci->ops == NULL) + goto out; + if (rpci->nreaders) { + list_add_tail(&msg->list, &rpci->pipe); + rpci->pipelen += msg->len; + res = 0; + } else if (rpci->flags & RPC_PIPE_WAIT_FOR_OPEN) { + if (list_empty(&rpci->pipe)) + queue_delayed_work(rpciod_workqueue, + &rpci->queue_timeout, + RPC_UPCALL_TIMEOUT); + list_add_tail(&msg->list, &rpci->pipe); + rpci->pipelen += msg->len; + res = 0; + } +out: + spin_unlock(&inode->i_lock); + wake_up(&rpci->waitq); + return res; +} +EXPORT_SYMBOL_GPL(rpc_queue_upcall); + +static inline void +rpc_inode_setowner(struct inode *inode, void *private) +{ + RPC_I(inode)->private = private; +} + +static void +rpc_close_pipes(struct inode *inode) +{ + struct rpc_inode *rpci = RPC_I(inode); + const struct rpc_pipe_ops *ops; + int need_release; + + mutex_lock(&inode->i_mutex); + ops = rpci->ops; + if (ops != NULL) { + LIST_HEAD(free_list); + spin_lock(&inode->i_lock); + need_release = rpci->nreaders != 0 || rpci->nwriters != 0; + rpci->nreaders = 0; + list_splice_init(&rpci->in_upcall, &free_list); + list_splice_init(&rpci->pipe, &free_list); + rpci->pipelen = 0; + rpci->ops = NULL; + spin_unlock(&inode->i_lock); + rpc_purge_list(rpci, &free_list, ops->destroy_msg, -EPIPE); + rpci->nwriters = 0; + if (need_release && ops->release_pipe) + ops->release_pipe(inode); + cancel_delayed_work_sync(&rpci->queue_timeout); + } + rpc_inode_setowner(inode, NULL); + mutex_unlock(&inode->i_mutex); +} + +static struct inode * +rpc_alloc_inode(struct super_block *sb) +{ + struct rpc_inode *rpci; + rpci = (struct rpc_inode *)kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL); + if (!rpci) + return NULL; + return &rpci->vfs_inode; +} + +static void +rpc_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(rpc_inode_cachep, RPC_I(inode)); +} + +static void +rpc_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, rpc_i_callback); +} + +static int +rpc_pipe_open(struct inode *inode, struct file *filp) +{ + struct rpc_inode *rpci = RPC_I(inode); + int first_open; + int res = -ENXIO; + + mutex_lock(&inode->i_mutex); + if (rpci->ops == NULL) + goto out; + first_open = rpci->nreaders == 0 && rpci->nwriters == 0; + if (first_open && rpci->ops->open_pipe) { + res = rpci->ops->open_pipe(inode); + if (res) + goto out; + } + if (filp->f_mode & FMODE_READ) + rpci->nreaders++; + if (filp->f_mode & FMODE_WRITE) + rpci->nwriters++; + res = 0; +out: + mutex_unlock(&inode->i_mutex); + return res; +} + +static int +rpc_pipe_release(struct inode *inode, struct file *filp) +{ + struct rpc_inode *rpci = RPC_I(inode); + struct rpc_pipe_msg *msg; + int last_close; + + mutex_lock(&inode->i_mutex); + if (rpci->ops == NULL) + goto out; + msg = filp->private_data; + if (msg != NULL) { + spin_lock(&inode->i_lock); + msg->errno = -EAGAIN; + list_del_init(&msg->list); + spin_unlock(&inode->i_lock); + rpci->ops->destroy_msg(msg); + } + if (filp->f_mode & FMODE_WRITE) + rpci->nwriters --; + if (filp->f_mode & FMODE_READ) { + rpci->nreaders --; + if (rpci->nreaders == 0) { + LIST_HEAD(free_list); + spin_lock(&inode->i_lock); + list_splice_init(&rpci->pipe, &free_list); + rpci->pipelen = 0; + spin_unlock(&inode->i_lock); + rpc_purge_list(rpci, &free_list, + rpci->ops->destroy_msg, -EAGAIN); + } + } + last_close = rpci->nwriters == 0 && 
rpci->nreaders == 0; + if (last_close && rpci->ops->release_pipe) + rpci->ops->release_pipe(inode); +out: + mutex_unlock(&inode->i_mutex); + return 0; +} + +static ssize_t +rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct rpc_inode *rpci = RPC_I(inode); + struct rpc_pipe_msg *msg; + int res = 0; + + mutex_lock(&inode->i_mutex); + if (rpci->ops == NULL) { + res = -EPIPE; + goto out_unlock; + } + msg = filp->private_data; + if (msg == NULL) { + spin_lock(&inode->i_lock); + if (!list_empty(&rpci->pipe)) { + msg = list_entry(rpci->pipe.next, + struct rpc_pipe_msg, + list); + list_move(&msg->list, &rpci->in_upcall); + rpci->pipelen -= msg->len; + filp->private_data = msg; + msg->copied = 0; + } + spin_unlock(&inode->i_lock); + if (msg == NULL) + goto out_unlock; + } + /* NOTE: it is up to the callback to update msg->copied */ + res = rpci->ops->upcall(filp, msg, buf, len); + if (res < 0 || msg->len == msg->copied) { + filp->private_data = NULL; + spin_lock(&inode->i_lock); + list_del_init(&msg->list); + spin_unlock(&inode->i_lock); + rpci->ops->destroy_msg(msg); + } +out_unlock: + mutex_unlock(&inode->i_mutex); + return res; +} + +static ssize_t +rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *offset) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct rpc_inode *rpci = RPC_I(inode); + int res; + + mutex_lock(&inode->i_mutex); + res = -EPIPE; + if (rpci->ops != NULL) + res = rpci->ops->downcall(filp, buf, len); + mutex_unlock(&inode->i_mutex); + return res; +} + +static unsigned int +rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait) +{ + struct rpc_inode *rpci; + unsigned int mask = 0; + + rpci = RPC_I(filp->f_path.dentry->d_inode); + poll_wait(filp, &rpci->waitq, wait); + + mask = POLLOUT | POLLWRNORM; + if (rpci->ops == NULL) + mask |= POLLERR | POLLHUP; + if (filp->private_data || !list_empty(&rpci->pipe)) + mask |= POLLIN | POLLRDNORM; + return mask; +} + +static long +rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct rpc_inode *rpci = RPC_I(inode); + int len; + + switch (cmd) { + case FIONREAD: + spin_lock(&inode->i_lock); + if (rpci->ops == NULL) { + spin_unlock(&inode->i_lock); + return -EPIPE; + } + len = rpci->pipelen; + if (filp->private_data) { + struct rpc_pipe_msg *msg; + msg = filp->private_data; + len += msg->len - msg->copied; + } + spin_unlock(&inode->i_lock); + return put_user(len, (int __user *)arg); + default: + return -EINVAL; + } +} + +static const struct file_operations rpc_pipe_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = rpc_pipe_read, + .write = rpc_pipe_write, + .poll = rpc_pipe_poll, + .unlocked_ioctl = rpc_pipe_ioctl, + .open = rpc_pipe_open, + .release = rpc_pipe_release, +}; + +static int +rpc_show_info(struct seq_file *m, void *v) +{ + struct rpc_clnt *clnt = m->private; + + seq_printf(m, "RPC server: %s\n", clnt->cl_server); + seq_printf(m, "service: %s (%d) version %d\n", clnt->cl_protname, + clnt->cl_prog, clnt->cl_vers); + seq_printf(m, "address: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR)); + seq_printf(m, "protocol: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_PROTO)); + seq_printf(m, "port: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_PORT)); + return 0; +} + +static int +rpc_info_open(struct inode *inode, struct file *file) +{ + struct rpc_clnt *clnt = NULL; + int ret = single_open(file, 
rpc_show_info, NULL); + + if (!ret) { + struct seq_file *m = file->private_data; + + spin_lock(&file->f_path.dentry->d_lock); + if (!d_unhashed(file->f_path.dentry)) + clnt = RPC_I(inode)->private; + if (clnt != NULL && atomic_inc_not_zero(&clnt->cl_count)) { + spin_unlock(&file->f_path.dentry->d_lock); + m->private = clnt; + } else { + spin_unlock(&file->f_path.dentry->d_lock); + single_release(inode, file); + ret = -EINVAL; + } + } + return ret; +} + +static int +rpc_info_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + struct rpc_clnt *clnt = (struct rpc_clnt *)m->private; + + if (clnt) + rpc_release_client(clnt); + return single_release(inode, file); +} + +static const struct file_operations rpc_info_operations = { + .owner = THIS_MODULE, + .open = rpc_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = rpc_info_release, +}; + + +/* + * Description of fs contents. + */ +struct rpc_filelist { + const char *name; + const struct file_operations *i_fop; + umode_t mode; +}; + +struct vfsmount *rpc_get_mount(void) +{ + int err; + + err = simple_pin_fs(&rpc_pipe_fs_type, &rpc_mnt, &rpc_mount_count); + if (err != 0) + return ERR_PTR(err); + return rpc_mnt; +} +EXPORT_SYMBOL_GPL(rpc_get_mount); + +void rpc_put_mount(void) +{ + simple_release_fs(&rpc_mnt, &rpc_mount_count); +} +EXPORT_SYMBOL_GPL(rpc_put_mount); + +static int rpc_delete_dentry(const struct dentry *dentry) +{ + return 1; +} + +static const struct dentry_operations rpc_dentry_operations = { + .d_delete = rpc_delete_dentry, +}; + +static struct inode * +rpc_get_inode(struct super_block *sb, umode_t mode) +{ + struct inode *inode = new_inode(sb); + if (!inode) + return NULL; + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + switch(mode & S_IFMT) { + case S_IFDIR: + inode->i_fop = &simple_dir_operations; + inode->i_op = &simple_dir_inode_operations; + inc_nlink(inode); + default: + break; + } + return inode; +} + +static int __rpc_create_common(struct inode *dir, struct dentry *dentry, + umode_t mode, + const struct file_operations *i_fop, + void *private) +{ + struct inode *inode; + + d_drop(dentry); + inode = rpc_get_inode(dir->i_sb, mode); + if (!inode) + goto out_err; + inode->i_ino = iunique(dir->i_sb, 100); + if (i_fop) + inode->i_fop = i_fop; + if (private) + rpc_inode_setowner(inode, private); + d_add(dentry, inode); + return 0; +out_err: + printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %s\n", + __FILE__, __func__, dentry->d_name.name); + dput(dentry); + return -ENOMEM; +} + +static int __rpc_create(struct inode *dir, struct dentry *dentry, + umode_t mode, + const struct file_operations *i_fop, + void *private) +{ + int err; + + err = __rpc_create_common(dir, dentry, S_IFREG | mode, i_fop, private); + if (err) + return err; + fsnotify_create(dir, dentry); + return 0; +} + +static int __rpc_mkdir(struct inode *dir, struct dentry *dentry, + umode_t mode, + const struct file_operations *i_fop, + void *private) +{ + int err; + + err = __rpc_create_common(dir, dentry, S_IFDIR | mode, i_fop, private); + if (err) + return err; + inc_nlink(dir); + fsnotify_mkdir(dir, dentry); + return 0; +} + +static int __rpc_mkpipe(struct inode *dir, struct dentry *dentry, + umode_t mode, + const struct file_operations *i_fop, + void *private, + const struct rpc_pipe_ops *ops, + int flags) +{ + struct rpc_inode *rpci; + int err; + + err = __rpc_create_common(dir, dentry, S_IFIFO | mode, i_fop, 
private); + if (err) + return err; + rpci = RPC_I(dentry->d_inode); + rpci->nkern_readwriters = 1; + rpci->private = private; + rpci->flags = flags; + rpci->ops = ops; + fsnotify_create(dir, dentry); + return 0; +} + +static int __rpc_rmdir(struct inode *dir, struct dentry *dentry) +{ + int ret; + + dget(dentry); + ret = simple_rmdir(dir, dentry); + d_delete(dentry); + dput(dentry); + return ret; +} + +static int __rpc_unlink(struct inode *dir, struct dentry *dentry) +{ + int ret; + + dget(dentry); + ret = simple_unlink(dir, dentry); + d_delete(dentry); + dput(dentry); + return ret; +} + +static int __rpc_rmpipe(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct rpc_inode *rpci = RPC_I(inode); + + rpci->nkern_readwriters--; + if (rpci->nkern_readwriters != 0) + return 0; + rpc_close_pipes(inode); + return __rpc_unlink(dir, dentry); +} + +static struct dentry *__rpc_lookup_create(struct dentry *parent, + struct qstr *name) +{ + struct dentry *dentry; + + dentry = d_lookup(parent, name); + if (!dentry) { + dentry = d_alloc(parent, name); + if (!dentry) { + dentry = ERR_PTR(-ENOMEM); + goto out_err; + } + } + if (!dentry->d_inode) + d_set_d_op(dentry, &rpc_dentry_operations); +out_err: + return dentry; +} + +static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent, + struct qstr *name) +{ + struct dentry *dentry; + + dentry = __rpc_lookup_create(parent, name); + if (IS_ERR(dentry)) + return dentry; + if (dentry->d_inode == NULL) + return dentry; + dput(dentry); + return ERR_PTR(-EEXIST); +} + +/* + * FIXME: This probably has races. + */ +static void __rpc_depopulate(struct dentry *parent, + const struct rpc_filelist *files, + int start, int eof) +{ + struct inode *dir = parent->d_inode; + struct dentry *dentry; + struct qstr name; + int i; + + for (i = start; i < eof; i++) { + name.name = files[i].name; + name.len = strlen(files[i].name); + name.hash = full_name_hash(name.name, name.len); + dentry = d_lookup(parent, &name); + + if (dentry == NULL) + continue; + if (dentry->d_inode == NULL) + goto next; + switch (dentry->d_inode->i_mode & S_IFMT) { + default: + BUG(); + case S_IFREG: + __rpc_unlink(dir, dentry); + break; + case S_IFDIR: + __rpc_rmdir(dir, dentry); + } +next: + dput(dentry); + } +} + +static void rpc_depopulate(struct dentry *parent, + const struct rpc_filelist *files, + int start, int eof) +{ + struct inode *dir = parent->d_inode; + + mutex_lock_nested(&dir->i_mutex, I_MUTEX_CHILD); + __rpc_depopulate(parent, files, start, eof); + mutex_unlock(&dir->i_mutex); +} + +static int rpc_populate(struct dentry *parent, + const struct rpc_filelist *files, + int start, int eof, + void *private) +{ + struct inode *dir = parent->d_inode; + struct dentry *dentry; + int i, err; + + mutex_lock(&dir->i_mutex); + for (i = start; i < eof; i++) { + struct qstr q; + + q.name = files[i].name; + q.len = strlen(files[i].name); + q.hash = full_name_hash(q.name, q.len); + dentry = __rpc_lookup_create_exclusive(parent, &q); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_bad; + switch (files[i].mode & S_IFMT) { + default: + BUG(); + case S_IFREG: + err = __rpc_create(dir, dentry, + files[i].mode, + files[i].i_fop, + private); + break; + case S_IFDIR: + err = __rpc_mkdir(dir, dentry, + files[i].mode, + NULL, + private); + } + if (err != 0) + goto out_bad; + } + mutex_unlock(&dir->i_mutex); + return 0; +out_bad: + __rpc_depopulate(parent, files, start, eof); + mutex_unlock(&dir->i_mutex); + printk(KERN_WARNING "%s: %s failed to 
populate directory %s\n", + __FILE__, __func__, parent->d_name.name); + return err; +} + +static struct dentry *rpc_mkdir_populate(struct dentry *parent, + struct qstr *name, umode_t mode, void *private, + int (*populate)(struct dentry *, void *), void *args_populate) +{ + struct dentry *dentry; + struct inode *dir = parent->d_inode; + int error; + + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + dentry = __rpc_lookup_create_exclusive(parent, name); + if (IS_ERR(dentry)) + goto out; + error = __rpc_mkdir(dir, dentry, mode, NULL, private); + if (error != 0) + goto out_err; + if (populate != NULL) { + error = populate(dentry, args_populate); + if (error) + goto err_rmdir; + } +out: + mutex_unlock(&dir->i_mutex); + return dentry; +err_rmdir: + __rpc_rmdir(dir, dentry); +out_err: + dentry = ERR_PTR(error); + goto out; +} + +static int rpc_rmdir_depopulate(struct dentry *dentry, + void (*depopulate)(struct dentry *)) +{ + struct dentry *parent; + struct inode *dir; + int error; + + parent = dget_parent(dentry); + dir = parent->d_inode; + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + if (depopulate != NULL) + depopulate(dentry); + error = __rpc_rmdir(dir, dentry); + mutex_unlock(&dir->i_mutex); + dput(parent); + return error; +} + +/** + * rpc_mkpipe - make an rpc_pipefs file for kernel<->userspace communication + * @parent: dentry of directory to create new "pipe" in + * @name: name of pipe + * @private: private data to associate with the pipe, for the caller's use + * @ops: operations defining the behavior of the pipe: upcall, downcall, + * release_pipe, open_pipe, and destroy_msg. + * @flags: rpc_inode flags + * + * Data is made available for userspace to read by calls to + * rpc_queue_upcall(). The actual reads will result in calls to + * @ops->upcall, which will be called with the file pointer, + * message, and userspace buffer to copy to. + * + * Writes can come at any time, and do not necessarily have to be + * responses to upcalls. They will result in calls to @msg->downcall. + * + * The @private argument passed here will be available to all these methods + * from the file pointer, via RPC_I(file->f_dentry->d_inode)->private. 
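+ *
+ * Returns the dentry of the new pipe on success, or an ERR_PTR on
+ * failure.  If a pipe with the same @name already exists under
+ * @parent with identical @private, @ops and @flags, that pipe is
+ * reused (its nkern_readwriters count is incremented); a name
+ * collision with different parameters fails with -EBUSY.
+ *
+ * Illustrative use only; "my_pipe", ctx and my_pipe_ops are
+ * hypothetical names, not part of this file:
+ *
+ *	dentry = rpc_mkpipe(parent, "my_pipe", ctx, &my_pipe_ops,
+ *			    RPC_PIPE_WAIT_FOR_OPEN);
+ *	if (!IS_ERR(dentry))
+ *		rpc_queue_upcall(dentry->d_inode, &ctx->msg);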
+ */ +struct dentry *rpc_mkpipe(struct dentry *parent, const char *name, + void *private, const struct rpc_pipe_ops *ops, + int flags) +{ + struct dentry *dentry; + struct inode *dir = parent->d_inode; + umode_t umode = S_IFIFO | S_IRUSR | S_IWUSR; + struct qstr q; + int err; + + if (ops->upcall == NULL) + umode &= ~S_IRUGO; + if (ops->downcall == NULL) + umode &= ~S_IWUGO; + + q.name = name; + q.len = strlen(name); + q.hash = full_name_hash(q.name, q.len), + + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + dentry = __rpc_lookup_create(parent, &q); + if (IS_ERR(dentry)) + goto out; + if (dentry->d_inode) { + struct rpc_inode *rpci = RPC_I(dentry->d_inode); + if (rpci->private != private || + rpci->ops != ops || + rpci->flags != flags) { + dput (dentry); + err = -EBUSY; + goto out_err; + } + rpci->nkern_readwriters++; + goto out; + } + + err = __rpc_mkpipe(dir, dentry, umode, &rpc_pipe_fops, + private, ops, flags); + if (err) + goto out_err; +out: + mutex_unlock(&dir->i_mutex); + return dentry; +out_err: + dentry = ERR_PTR(err); + printk(KERN_WARNING "%s: %s() failed to create pipe %s/%s (errno = %d)\n", + __FILE__, __func__, parent->d_name.name, name, + err); + goto out; +} +EXPORT_SYMBOL_GPL(rpc_mkpipe); + +/** + * rpc_unlink - remove a pipe + * @dentry: dentry for the pipe, as returned from rpc_mkpipe + * + * After this call, lookups will no longer find the pipe, and any + * attempts to read or write using preexisting opens of the pipe will + * return -EPIPE. + */ +int +rpc_unlink(struct dentry *dentry) +{ + struct dentry *parent; + struct inode *dir; + int error = 0; + + parent = dget_parent(dentry); + dir = parent->d_inode; + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + error = __rpc_rmpipe(dir, dentry); + mutex_unlock(&dir->i_mutex); + dput(parent); + return error; +} +EXPORT_SYMBOL_GPL(rpc_unlink); + +enum { + RPCAUTH_info, + RPCAUTH_EOF +}; + +static const struct rpc_filelist authfiles[] = { + [RPCAUTH_info] = { + .name = "info", + .i_fop = &rpc_info_operations, + .mode = S_IFREG | S_IRUSR, + }, +}; + +static int rpc_clntdir_populate(struct dentry *dentry, void *private) +{ + return rpc_populate(dentry, + authfiles, RPCAUTH_info, RPCAUTH_EOF, + private); +} + +static void rpc_clntdir_depopulate(struct dentry *dentry) +{ + rpc_depopulate(dentry, authfiles, RPCAUTH_info, RPCAUTH_EOF); +} + +/** + * rpc_create_client_dir - Create a new rpc_client directory in rpc_pipefs + * @dentry: dentry from the rpc_pipefs root to the new directory + * @name: &struct qstr for the name + * @rpc_client: rpc client to associate with this directory + * + * This creates a directory at the given @path associated with + * @rpc_clnt, which will contain a file named "info" with some basic + * information about the client, together with any "pipes" that may + * later be created using rpc_mkpipe(). 
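+ *
+ * Returns the dentry of the new directory, or an ERR_PTR on failure.
+ * The directory is removed again with rpc_remove_client_dir().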
+ */ +struct dentry *rpc_create_client_dir(struct dentry *dentry, + struct qstr *name, + struct rpc_clnt *rpc_client) +{ + return rpc_mkdir_populate(dentry, name, S_IRUGO | S_IXUGO, NULL, + rpc_clntdir_populate, rpc_client); +} + +/** + * rpc_remove_client_dir - Remove a directory created with rpc_create_client_dir() + * @dentry: directory to remove + */ +int rpc_remove_client_dir(struct dentry *dentry) +{ + return rpc_rmdir_depopulate(dentry, rpc_clntdir_depopulate); +} + +static const struct rpc_filelist cache_pipefs_files[3] = { + [0] = { + .name = "channel", + .i_fop = &cache_file_operations_pipefs, + .mode = S_IFREG|S_IRUSR|S_IWUSR, + }, + [1] = { + .name = "content", + .i_fop = &content_file_operations_pipefs, + .mode = S_IFREG|S_IRUSR, + }, + [2] = { + .name = "flush", + .i_fop = &cache_flush_operations_pipefs, + .mode = S_IFREG|S_IRUSR|S_IWUSR, + }, +}; + +static int rpc_cachedir_populate(struct dentry *dentry, void *private) +{ + return rpc_populate(dentry, + cache_pipefs_files, 0, 3, + private); +} + +static void rpc_cachedir_depopulate(struct dentry *dentry) +{ + rpc_depopulate(dentry, cache_pipefs_files, 0, 3); +} + +struct dentry *rpc_create_cache_dir(struct dentry *parent, struct qstr *name, + mode_t umode, struct cache_detail *cd) +{ + return rpc_mkdir_populate(parent, name, umode, NULL, + rpc_cachedir_populate, cd); +} + +void rpc_remove_cache_dir(struct dentry *dentry) +{ + rpc_rmdir_depopulate(dentry, rpc_cachedir_depopulate); +} + +/* + * populate the filesystem + */ +static const struct super_operations s_ops = { + .alloc_inode = rpc_alloc_inode, + .destroy_inode = rpc_destroy_inode, + .statfs = simple_statfs, +}; + +#define RPCAUTH_GSSMAGIC 0x67596969 + +/* + * We have a single directory with 1 node in it. + */ +enum { + RPCAUTH_lockd, + RPCAUTH_mount, + RPCAUTH_nfs, + RPCAUTH_portmap, + RPCAUTH_statd, + RPCAUTH_nfsd4_cb, + RPCAUTH_cache, + RPCAUTH_RootEOF +}; + +static const struct rpc_filelist files[] = { + [RPCAUTH_lockd] = { + .name = "lockd", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, + [RPCAUTH_mount] = { + .name = "mount", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, + [RPCAUTH_nfs] = { + .name = "nfs", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, + [RPCAUTH_portmap] = { + .name = "portmap", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, + [RPCAUTH_statd] = { + .name = "statd", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, + [RPCAUTH_nfsd4_cb] = { + .name = "nfsd4_cb", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, + [RPCAUTH_cache] = { + .name = "cache", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, +}; + +static int +rpc_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode; + struct dentry *root; + + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = RPCAUTH_GSSMAGIC; + sb->s_op = &s_ops; + sb->s_time_gran = 1; + + inode = rpc_get_inode(sb, S_IFDIR | 0755); + if (!inode) + return -ENOMEM; + sb->s_root = root = d_alloc_root(inode); + if (!root) { + iput(inode); + return -ENOMEM; + } + if (rpc_populate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF, NULL)) + return -ENOMEM; + return 0; +} + +static struct dentry * +rpc_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, rpc_fill_super); +} + +static struct file_system_type rpc_pipe_fs_type = { + .owner = THIS_MODULE, + .name = "rpc_pipefs", + .mount = rpc_mount, + .kill_sb = kill_litter_super, +}; + +static void +init_once(void *foo) +{ + struct rpc_inode *rpci = (struct 
rpc_inode *) foo; + + inode_init_once(&rpci->vfs_inode); + rpci->private = NULL; + rpci->nreaders = 0; + rpci->nwriters = 0; + INIT_LIST_HEAD(&rpci->in_upcall); + INIT_LIST_HEAD(&rpci->in_downcall); + INIT_LIST_HEAD(&rpci->pipe); + rpci->pipelen = 0; + init_waitqueue_head(&rpci->waitq); + INIT_DELAYED_WORK(&rpci->queue_timeout, + rpc_timeout_upcall_queue); + rpci->ops = NULL; +} + +int register_rpc_pipefs(void) +{ + int err; + + rpc_inode_cachep = kmem_cache_create("rpc_inode_cache", + sizeof(struct rpc_inode), + 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (!rpc_inode_cachep) + return -ENOMEM; + err = register_filesystem(&rpc_pipe_fs_type); + if (err) { + kmem_cache_destroy(rpc_inode_cachep); + return err; + } + + return 0; +} + +void unregister_rpc_pipefs(void) +{ + kmem_cache_destroy(rpc_inode_cachep); + unregister_filesystem(&rpc_pipe_fs_type); +} diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c new file mode 100644 index 00000000..e45d2fbb --- /dev/null +++ b/net/sunrpc/rpcb_clnt.c @@ -0,0 +1,1074 @@ +/* + * In-kernel rpcbind client supporting versions 2, 3, and 4 of the rpcbind + * protocol + * + * Based on RFC 1833: "Binding Protocols for ONC RPC Version 2" and + * RFC 3530: "Network File System (NFS) version 4 Protocol" + * + * Original: Gilles Quillard, Bull Open Source, 2005 + * Updated: Chuck Lever, Oracle Corporation, 2007 + * + * Descended from net/sunrpc/pmap_clnt.c, + * Copyright (C) 1996, Olaf Kirch + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_BIND +#endif + +#define RPCBIND_SOCK_PATHNAME "/var/run/rpcbind.sock" + +#define RPCBIND_PROGRAM (100000u) +#define RPCBIND_PORT (111u) + +#define RPCBVERS_2 (2u) +#define RPCBVERS_3 (3u) +#define RPCBVERS_4 (4u) + +enum { + RPCBPROC_NULL, + RPCBPROC_SET, + RPCBPROC_UNSET, + RPCBPROC_GETPORT, + RPCBPROC_GETADDR = 3, /* alias for GETPORT */ + RPCBPROC_DUMP, + RPCBPROC_CALLIT, + RPCBPROC_BCAST = 5, /* alias for CALLIT */ + RPCBPROC_GETTIME, + RPCBPROC_UADDR2TADDR, + RPCBPROC_TADDR2UADDR, + RPCBPROC_GETVERSADDR, + RPCBPROC_INDIRECT, + RPCBPROC_GETADDRLIST, + RPCBPROC_GETSTAT, +}; + +/* + * r_owner + * + * The "owner" is allowed to unset a service in the rpcbind database. + * + * For AF_LOCAL SET/UNSET requests, rpcbind treats this string as a + * UID which it maps to a local user name via a password lookup. + * In all other cases it is ignored. + * + * For SET/UNSET requests, user space provides a value, even for + * network requests, and GETADDR uses an empty string. We follow + * those precedents here. 
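+ *
+ * On the kernel side, SET and UNSET requests pass the fixed string
+ * defined by RPCB_OWNER_STRING below.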
+ */ +#define RPCB_OWNER_STRING "0" +#define RPCB_MAXOWNERLEN sizeof(RPCB_OWNER_STRING) + +/* + * XDR data type sizes + */ +#define RPCB_program_sz (1) +#define RPCB_version_sz (1) +#define RPCB_protocol_sz (1) +#define RPCB_port_sz (1) +#define RPCB_boolean_sz (1) + +#define RPCB_netid_sz (1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN)) +#define RPCB_addr_sz (1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN)) +#define RPCB_ownerstring_sz (1 + XDR_QUADLEN(RPCB_MAXOWNERLEN)) + +/* + * XDR argument and result sizes + */ +#define RPCB_mappingargs_sz (RPCB_program_sz + RPCB_version_sz + \ + RPCB_protocol_sz + RPCB_port_sz) +#define RPCB_getaddrargs_sz (RPCB_program_sz + RPCB_version_sz + \ + RPCB_netid_sz + RPCB_addr_sz + \ + RPCB_ownerstring_sz) + +#define RPCB_getportres_sz RPCB_port_sz +#define RPCB_setres_sz RPCB_boolean_sz + +/* + * Note that RFC 1833 does not put any size restrictions on the + * address string returned by the remote rpcbind database. + */ +#define RPCB_getaddrres_sz RPCB_addr_sz + +static void rpcb_getport_done(struct rpc_task *, void *); +static void rpcb_map_release(void *data); +static struct rpc_program rpcb_program; + +static struct rpc_clnt * rpcb_local_clnt; +static struct rpc_clnt * rpcb_local_clnt4; + +struct rpcbind_args { + struct rpc_xprt * r_xprt; + + u32 r_prog; + u32 r_vers; + u32 r_prot; + unsigned short r_port; + const char * r_netid; + const char * r_addr; + const char * r_owner; + + int r_status; +}; + +static struct rpc_procinfo rpcb_procedures2[]; +static struct rpc_procinfo rpcb_procedures3[]; +static struct rpc_procinfo rpcb_procedures4[]; + +struct rpcb_info { + u32 rpc_vers; + struct rpc_procinfo * rpc_proc; +}; + +static struct rpcb_info rpcb_next_version[]; +static struct rpcb_info rpcb_next_version6[]; + +static const struct rpc_call_ops rpcb_getport_ops = { + .rpc_call_done = rpcb_getport_done, + .rpc_release = rpcb_map_release, +}; + +static void rpcb_wake_rpcbind_waiters(struct rpc_xprt *xprt, int status) +{ + xprt_clear_binding(xprt); + rpc_wake_up_status(&xprt->binding, status); +} + +static void rpcb_map_release(void *data) +{ + struct rpcbind_args *map = data; + + rpcb_wake_rpcbind_waiters(map->r_xprt, map->r_status); + xprt_put(map->r_xprt); + kfree(map->r_addr); + kfree(map); +} + +/* + * Returns zero on success, otherwise a negative errno value + * is returned. + */ +static int rpcb_create_local_unix(void) +{ + static const struct sockaddr_un rpcb_localaddr_rpcbind = { + .sun_family = AF_LOCAL, + .sun_path = RPCBIND_SOCK_PATHNAME, + }; + struct rpc_create_args args = { + .net = &init_net, + .protocol = XPRT_TRANSPORT_LOCAL, + .address = (struct sockaddr *)&rpcb_localaddr_rpcbind, + .addrsize = sizeof(rpcb_localaddr_rpcbind), + .servername = "localhost", + .program = &rpcb_program, + .version = RPCBVERS_2, + .authflavor = RPC_AUTH_NULL, + }; + struct rpc_clnt *clnt, *clnt4; + int result = 0; + + /* + * Because we requested an RPC PING at transport creation time, + * this works only if the user space portmapper is rpcbind, and + * it's listening on AF_LOCAL on the named socket. 
+ */ + clnt = rpc_create(&args); + if (IS_ERR(clnt)) { + dprintk("RPC: failed to create AF_LOCAL rpcbind " + "client (errno %ld).\n", PTR_ERR(clnt)); + result = -PTR_ERR(clnt); + goto out; + } + + clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4); + if (IS_ERR(clnt4)) { + dprintk("RPC: failed to bind second program to " + "rpcbind v4 client (errno %ld).\n", + PTR_ERR(clnt4)); + clnt4 = NULL; + } + + /* Protected by rpcb_create_local_mutex */ + rpcb_local_clnt = clnt; + rpcb_local_clnt4 = clnt4; + +out: + return result; +} + +/* + * Returns zero on success, otherwise a negative errno value + * is returned. + */ +static int rpcb_create_local_net(void) +{ + static const struct sockaddr_in rpcb_inaddr_loopback = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_LOOPBACK), + .sin_port = htons(RPCBIND_PORT), + }; + struct rpc_create_args args = { + .net = &init_net, + .protocol = XPRT_TRANSPORT_TCP, + .address = (struct sockaddr *)&rpcb_inaddr_loopback, + .addrsize = sizeof(rpcb_inaddr_loopback), + .servername = "localhost", + .program = &rpcb_program, + .version = RPCBVERS_2, + .authflavor = RPC_AUTH_UNIX, + .flags = RPC_CLNT_CREATE_NOPING, + }; + struct rpc_clnt *clnt, *clnt4; + int result = 0; + + clnt = rpc_create(&args); + if (IS_ERR(clnt)) { + dprintk("RPC: failed to create local rpcbind " + "client (errno %ld).\n", PTR_ERR(clnt)); + result = -PTR_ERR(clnt); + goto out; + } + + /* + * This results in an RPC ping. On systems running portmapper, + * the v4 ping will fail. Proceed anyway, but disallow rpcb + * v4 upcalls. + */ + clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4); + if (IS_ERR(clnt4)) { + dprintk("RPC: failed to bind second program to " + "rpcbind v4 client (errno %ld).\n", + PTR_ERR(clnt4)); + clnt4 = NULL; + } + + /* Protected by rpcb_create_local_mutex */ + rpcb_local_clnt = clnt; + rpcb_local_clnt4 = clnt4; + +out: + return result; +} + +/* + * Returns zero on success, otherwise a negative errno value + * is returned. 
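+ *
+ * Creation is serialized by rpcb_create_local_mutex.  The AF_LOCAL
+ * rpcbind socket is tried first; if that fails, a TCP connection to
+ * the portmapper on the IPv4 loopback address is used instead.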
+ */ +static int rpcb_create_local(void) +{ + static DEFINE_MUTEX(rpcb_create_local_mutex); + int result = 0; + + if (rpcb_local_clnt) + return result; + + mutex_lock(&rpcb_create_local_mutex); + if (rpcb_local_clnt) + goto out; + + if (rpcb_create_local_unix() != 0) + result = rpcb_create_local_net(); + +out: + mutex_unlock(&rpcb_create_local_mutex); + return result; +} + +static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, + size_t salen, int proto, u32 version) +{ + struct rpc_create_args args = { + .net = &init_net, + .protocol = proto, + .address = srvaddr, + .addrsize = salen, + .servername = hostname, + .program = &rpcb_program, + .version = version, + .authflavor = RPC_AUTH_UNIX, + .flags = (RPC_CLNT_CREATE_NOPING | + RPC_CLNT_CREATE_NONPRIVPORT), + }; + + switch (srvaddr->sa_family) { + case AF_INET: + ((struct sockaddr_in *)srvaddr)->sin_port = htons(RPCBIND_PORT); + break; + case AF_INET6: + ((struct sockaddr_in6 *)srvaddr)->sin6_port = htons(RPCBIND_PORT); + break; + default: + return ERR_PTR(-EAFNOSUPPORT); + } + + return rpc_create(&args); +} + +static int rpcb_register_call(struct rpc_clnt *clnt, struct rpc_message *msg) +{ + int result, error = 0; + + msg->rpc_resp = &result; + + error = rpc_call_sync(clnt, msg, RPC_TASK_SOFTCONN); + if (error < 0) { + dprintk("RPC: failed to contact local rpcbind " + "server (errno %d).\n", -error); + return error; + } + + if (!result) + return -EACCES; + return 0; +} + +/** + * rpcb_register - set or unset a port registration with the local rpcbind svc + * @prog: RPC program number to bind + * @vers: RPC version number to bind + * @prot: transport protocol to register + * @port: port value to register + * + * Returns zero if the registration request was dispatched successfully + * and the rpcbind daemon returned success. Otherwise, returns an errno + * value that reflects the nature of the error (request could not be + * dispatched, timed out, or rpcbind returned an error). + * + * RPC services invoke this function to advertise their contact + * information via the system's rpcbind daemon. RPC services + * invoke this function once for each [program, version, transport] + * tuple they wish to advertise. + * + * Callers may also unregister RPC services that are no longer + * available by setting the passed-in port to zero. This removes + * all registered transports for [program, version] from the local + * rpcbind database. + * + * This function uses rpcbind protocol version 2 to contact the + * local rpcbind daemon. + * + * Registration works over both AF_INET and AF_INET6, and services + * registered via this function are advertised as available for any + * address. If the local rpcbind daemon is listening on AF_INET6, + * services registered via this function will be advertised on + * IN6ADDR_ANY (ie available for all AF_INET and AF_INET6 + * addresses). + */ +int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port) +{ + struct rpcbind_args map = { + .r_prog = prog, + .r_vers = vers, + .r_prot = prot, + .r_port = port, + }; + struct rpc_message msg = { + .rpc_argp = &map, + }; + int error; + + error = rpcb_create_local(); + if (error) + return error; + + dprintk("RPC: %sregistering (%u, %u, %d, %u) with local " + "rpcbind\n", (port ? 
"" : "un"), + prog, vers, prot, port); + + msg.rpc_proc = &rpcb_procedures2[RPCBPROC_UNSET]; + if (port) + msg.rpc_proc = &rpcb_procedures2[RPCBPROC_SET]; + + return rpcb_register_call(rpcb_local_clnt, &msg); +} + +/* + * Fill in AF_INET family-specific arguments to register + */ +static int rpcb_register_inet4(const struct sockaddr *sap, + struct rpc_message *msg) +{ + const struct sockaddr_in *sin = (const struct sockaddr_in *)sap; + struct rpcbind_args *map = msg->rpc_argp; + unsigned short port = ntohs(sin->sin_port); + int result; + + map->r_addr = rpc_sockaddr2uaddr(sap); + + dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with " + "local rpcbind\n", (port ? "" : "un"), + map->r_prog, map->r_vers, + map->r_addr, map->r_netid); + + msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET]; + if (port) + msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET]; + + result = rpcb_register_call(rpcb_local_clnt4, msg); + kfree(map->r_addr); + return result; +} + +/* + * Fill in AF_INET6 family-specific arguments to register + */ +static int rpcb_register_inet6(const struct sockaddr *sap, + struct rpc_message *msg) +{ + const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sap; + struct rpcbind_args *map = msg->rpc_argp; + unsigned short port = ntohs(sin6->sin6_port); + int result; + + map->r_addr = rpc_sockaddr2uaddr(sap); + + dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with " + "local rpcbind\n", (port ? "" : "un"), + map->r_prog, map->r_vers, + map->r_addr, map->r_netid); + + msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET]; + if (port) + msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET]; + + result = rpcb_register_call(rpcb_local_clnt4, msg); + kfree(map->r_addr); + return result; +} + +static int rpcb_unregister_all_protofamilies(struct rpc_message *msg) +{ + struct rpcbind_args *map = msg->rpc_argp; + + dprintk("RPC: unregistering [%u, %u, '%s'] with " + "local rpcbind\n", + map->r_prog, map->r_vers, map->r_netid); + + map->r_addr = ""; + msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET]; + + return rpcb_register_call(rpcb_local_clnt4, msg); +} + +/** + * rpcb_v4_register - set or unset a port registration with the local rpcbind + * @program: RPC program number of service to (un)register + * @version: RPC version number of service to (un)register + * @address: address family, IP address, and port to (un)register + * @netid: netid of transport protocol to (un)register + * + * Returns zero if the registration request was dispatched successfully + * and the rpcbind daemon returned success. Otherwise, returns an errno + * value that reflects the nature of the error (request could not be + * dispatched, timed out, or rpcbind returned an error). + * + * RPC services invoke this function to advertise their contact + * information via the system's rpcbind daemon. RPC services + * invoke this function once for each [program, version, address, + * netid] tuple they wish to advertise. + * + * Callers may also unregister RPC services that are registered at a + * specific address by setting the port number in @address to zero. + * They may unregister all registered protocol families at once for + * a service by passing a NULL @address argument. If @netid is "" + * then all netids for [program, version, address] are unregistered. + * + * This function uses rpcbind protocol version 4 to contact the + * local rpcbind daemon. The local rpcbind daemon must support + * version 4 of the rpcbind protocol in order for these functions + * to register a service successfully. 
+ * + * Supported netids include "udp" and "tcp" for UDP and TCP over + * IPv4, and "udp6" and "tcp6" for UDP and TCP over IPv6, + * respectively. + * + * The contents of @address determine the address family and the + * port to be registered. The usual practice is to pass INADDR_ANY + * as the raw address, but specifying a non-zero address is also + * supported by this API if the caller wishes to advertise an RPC + * service on a specific network interface. + * + * Note that passing in INADDR_ANY does not create the same service + * registration as IN6ADDR_ANY. The former advertises an RPC + * service on any IPv4 address, but not on IPv6. The latter + * advertises the service on all IPv4 and IPv6 addresses. + */ +int rpcb_v4_register(const u32 program, const u32 version, + const struct sockaddr *address, const char *netid) +{ + struct rpcbind_args map = { + .r_prog = program, + .r_vers = version, + .r_netid = netid, + .r_owner = RPCB_OWNER_STRING, + }; + struct rpc_message msg = { + .rpc_argp = &map, + }; + int error; + + error = rpcb_create_local(); + if (error) + return error; + if (rpcb_local_clnt4 == NULL) + return -EPROTONOSUPPORT; + + if (address == NULL) + return rpcb_unregister_all_protofamilies(&msg); + + switch (address->sa_family) { + case AF_INET: + return rpcb_register_inet4(address, &msg); + case AF_INET6: + return rpcb_register_inet6(address, &msg); + } + + return -EAFNOSUPPORT; +} + +static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbind_args *map, struct rpc_procinfo *proc) +{ + struct rpc_message msg = { + .rpc_proc = proc, + .rpc_argp = map, + .rpc_resp = map, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = rpcb_clnt, + .rpc_message = &msg, + .callback_ops = &rpcb_getport_ops, + .callback_data = map, + .flags = RPC_TASK_ASYNC | RPC_TASK_SOFTCONN, + }; + + return rpc_run_task(&task_setup_data); +} + +/* + * In the case where rpc clients have been cloned, we want to make + * sure that we use the program number/version etc of the actual + * owner of the xprt. To do so, we walk back up the tree of parents + * to find whoever created the transport and/or whoever has the + * autobind flag set. + */ +static struct rpc_clnt *rpcb_find_transport_owner(struct rpc_clnt *clnt) +{ + struct rpc_clnt *parent = clnt->cl_parent; + + while (parent != clnt) { + if (parent->cl_xprt != clnt->cl_xprt) + break; + if (clnt->cl_autobind) + break; + clnt = parent; + parent = parent->cl_parent; + } + return clnt; +} + +/** + * rpcb_getport_async - obtain the port for a given RPC service on a given host + * @task: task that is waiting for portmapper request + * + * This one can be called for an ongoing RPC request, and can be used in + * an async (rpciod) context. 
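+ *
+ * The task is put to sleep on the transport's binding wait queue while
+ * the query runs; on failure, task->tk_status is set and any other
+ * tasks waiting to bind the same transport are woken up.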
+ */ +void rpcb_getport_async(struct rpc_task *task) +{ + struct rpc_clnt *clnt; + struct rpc_procinfo *proc; + u32 bind_version; + struct rpc_xprt *xprt; + struct rpc_clnt *rpcb_clnt; + struct rpcbind_args *map; + struct rpc_task *child; + struct sockaddr_storage addr; + struct sockaddr *sap = (struct sockaddr *)&addr; + size_t salen; + int status; + + clnt = rpcb_find_transport_owner(task->tk_client); + xprt = clnt->cl_xprt; + + dprintk("RPC: %5u %s(%s, %u, %u, %d)\n", + task->tk_pid, __func__, + clnt->cl_server, clnt->cl_prog, clnt->cl_vers, xprt->prot); + + /* Put self on the wait queue to ensure we get notified if + * some other task is already attempting to bind the port */ + rpc_sleep_on(&xprt->binding, task, NULL); + + if (xprt_test_and_set_binding(xprt)) { + dprintk("RPC: %5u %s: waiting for another binder\n", + task->tk_pid, __func__); + return; + } + + /* Someone else may have bound if we slept */ + if (xprt_bound(xprt)) { + status = 0; + dprintk("RPC: %5u %s: already bound\n", + task->tk_pid, __func__); + goto bailout_nofree; + } + + /* Parent transport's destination address */ + salen = rpc_peeraddr(clnt, sap, sizeof(addr)); + + /* Don't ever use rpcbind v2 for AF_INET6 requests */ + switch (sap->sa_family) { + case AF_INET: + proc = rpcb_next_version[xprt->bind_index].rpc_proc; + bind_version = rpcb_next_version[xprt->bind_index].rpc_vers; + break; + case AF_INET6: + proc = rpcb_next_version6[xprt->bind_index].rpc_proc; + bind_version = rpcb_next_version6[xprt->bind_index].rpc_vers; + break; + default: + status = -EAFNOSUPPORT; + dprintk("RPC: %5u %s: bad address family\n", + task->tk_pid, __func__); + goto bailout_nofree; + } + if (proc == NULL) { + xprt->bind_index = 0; + status = -EPFNOSUPPORT; + dprintk("RPC: %5u %s: no more getport versions available\n", + task->tk_pid, __func__); + goto bailout_nofree; + } + + dprintk("RPC: %5u %s: trying rpcbind version %u\n", + task->tk_pid, __func__, bind_version); + + rpcb_clnt = rpcb_create(clnt->cl_server, sap, salen, xprt->prot, + bind_version); + if (IS_ERR(rpcb_clnt)) { + status = PTR_ERR(rpcb_clnt); + dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n", + task->tk_pid, __func__, PTR_ERR(rpcb_clnt)); + goto bailout_nofree; + } + + map = kzalloc(sizeof(struct rpcbind_args), GFP_ATOMIC); + if (!map) { + status = -ENOMEM; + dprintk("RPC: %5u %s: no memory available\n", + task->tk_pid, __func__); + goto bailout_release_client; + } + map->r_prog = clnt->cl_prog; + map->r_vers = clnt->cl_vers; + map->r_prot = xprt->prot; + map->r_port = 0; + map->r_xprt = xprt_get(xprt); + map->r_status = -EIO; + + switch (bind_version) { + case RPCBVERS_4: + case RPCBVERS_3: + map->r_netid = rpc_peeraddr2str(clnt, RPC_DISPLAY_NETID); + map->r_addr = rpc_sockaddr2uaddr(sap); + map->r_owner = ""; + break; + case RPCBVERS_2: + map->r_addr = NULL; + break; + default: + BUG(); + } + + child = rpcb_call_async(rpcb_clnt, map, proc); + rpc_release_client(rpcb_clnt); + if (IS_ERR(child)) { + /* rpcb_map_release() has freed the arguments */ + dprintk("RPC: %5u %s: rpc_run_task failed\n", + task->tk_pid, __func__); + return; + } + + xprt->stat.bind_count++; + rpc_put_task(child); + return; + +bailout_release_client: + rpc_release_client(rpcb_clnt); +bailout_nofree: + rpcb_wake_rpcbind_waiters(xprt, status); + task->tk_status = status; +} +EXPORT_SYMBOL_GPL(rpcb_getport_async); + +/* + * Rpcbind child task calls this callback via tk_exit. 
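+ *
+ * A garbled reply (-EIO) is treated as -EPROTONOSUPPORT, which bumps
+ * xprt->bind_index so that the next entry in the version fallback
+ * list is tried on a subsequent bind.  A reply carrying port zero
+ * means the service is not registered and is turned into -EACCES.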
+ */ +static void rpcb_getport_done(struct rpc_task *child, void *data) +{ + struct rpcbind_args *map = data; + struct rpc_xprt *xprt = map->r_xprt; + int status = child->tk_status; + + /* Garbage reply: retry with a lesser rpcbind version */ + if (status == -EIO) + status = -EPROTONOSUPPORT; + + /* rpcbind server doesn't support this rpcbind protocol version */ + if (status == -EPROTONOSUPPORT) + xprt->bind_index++; + + if (status < 0) { + /* rpcbind server not available on remote host? */ + xprt->ops->set_port(xprt, 0); + } else if (map->r_port == 0) { + /* Requested RPC service wasn't registered on remote host */ + xprt->ops->set_port(xprt, 0); + status = -EACCES; + } else { + /* Succeeded */ + xprt->ops->set_port(xprt, map->r_port); + xprt_set_bound(xprt); + status = 0; + } + + dprintk("RPC: %5u rpcb_getport_done(status %d, port %u)\n", + child->tk_pid, status, map->r_port); + + map->r_status = status; +} + +/* + * XDR functions for rpcbind + */ + +static void rpcb_enc_mapping(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct rpcbind_args *rpcb) +{ + struct rpc_task *task = req->rq_task; + __be32 *p; + + dprintk("RPC: %5u encoding PMAP_%s call (%u, %u, %d, %u)\n", + task->tk_pid, task->tk_msg.rpc_proc->p_name, + rpcb->r_prog, rpcb->r_vers, rpcb->r_prot, rpcb->r_port); + + p = xdr_reserve_space(xdr, RPCB_mappingargs_sz << 2); + *p++ = cpu_to_be32(rpcb->r_prog); + *p++ = cpu_to_be32(rpcb->r_vers); + *p++ = cpu_to_be32(rpcb->r_prot); + *p = cpu_to_be32(rpcb->r_port); +} + +static int rpcb_dec_getport(struct rpc_rqst *req, struct xdr_stream *xdr, + struct rpcbind_args *rpcb) +{ + struct rpc_task *task = req->rq_task; + unsigned long port; + __be32 *p; + + rpcb->r_port = 0; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + + port = be32_to_cpup(p); + dprintk("RPC: %5u PMAP_%s result: %lu\n", task->tk_pid, + task->tk_msg.rpc_proc->p_name, port); + if (unlikely(port > USHRT_MAX)) + return -EIO; + + rpcb->r_port = port; + return 0; +} + +static int rpcb_dec_set(struct rpc_rqst *req, struct xdr_stream *xdr, + unsigned int *boolp) +{ + struct rpc_task *task = req->rq_task; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + + *boolp = 0; + if (*p != xdr_zero) + *boolp = 1; + + dprintk("RPC: %5u RPCB_%s call %s\n", + task->tk_pid, task->tk_msg.rpc_proc->p_name, + (*boolp ? 
"succeeded" : "failed")); + return 0; +} + +static void encode_rpcb_string(struct xdr_stream *xdr, const char *string, + const u32 maxstrlen) +{ + __be32 *p; + u32 len; + + len = strlen(string); + BUG_ON(len > maxstrlen); + p = xdr_reserve_space(xdr, 4 + len); + xdr_encode_opaque(p, string, len); +} + +static void rpcb_enc_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct rpcbind_args *rpcb) +{ + struct rpc_task *task = req->rq_task; + __be32 *p; + + dprintk("RPC: %5u encoding RPCB_%s call (%u, %u, '%s', '%s')\n", + task->tk_pid, task->tk_msg.rpc_proc->p_name, + rpcb->r_prog, rpcb->r_vers, + rpcb->r_netid, rpcb->r_addr); + + p = xdr_reserve_space(xdr, (RPCB_program_sz + RPCB_version_sz) << 2); + *p++ = cpu_to_be32(rpcb->r_prog); + *p = cpu_to_be32(rpcb->r_vers); + + encode_rpcb_string(xdr, rpcb->r_netid, RPCBIND_MAXNETIDLEN); + encode_rpcb_string(xdr, rpcb->r_addr, RPCBIND_MAXUADDRLEN); + encode_rpcb_string(xdr, rpcb->r_owner, RPCB_MAXOWNERLEN); +} + +static int rpcb_dec_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr, + struct rpcbind_args *rpcb) +{ + struct sockaddr_storage address; + struct sockaddr *sap = (struct sockaddr *)&address; + struct rpc_task *task = req->rq_task; + __be32 *p; + u32 len; + + rpcb->r_port = 0; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_fail; + len = be32_to_cpup(p); + + /* + * If the returned universal address is a null string, + * the requested RPC service was not registered. + */ + if (len == 0) { + dprintk("RPC: %5u RPCB reply: program not registered\n", + task->tk_pid); + return 0; + } + + if (unlikely(len > RPCBIND_MAXUADDRLEN)) + goto out_fail; + + p = xdr_inline_decode(xdr, len); + if (unlikely(p == NULL)) + goto out_fail; + dprintk("RPC: %5u RPCB_%s reply: %s\n", task->tk_pid, + task->tk_msg.rpc_proc->p_name, (char *)p); + + if (rpc_uaddr2sockaddr((char *)p, len, sap, sizeof(address)) == 0) + goto out_fail; + rpcb->r_port = rpc_get_port(sap); + + return 0; + +out_fail: + dprintk("RPC: %5u malformed RPCB_%s reply\n", + task->tk_pid, task->tk_msg.rpc_proc->p_name); + return -EIO; +} + +/* + * Not all rpcbind procedures described in RFC 1833 are implemented + * since the Linux kernel RPC code requires only these. 
+ */ + +static struct rpc_procinfo rpcb_procedures2[] = { + [RPCBPROC_SET] = { + .p_proc = RPCBPROC_SET, + .p_encode = (kxdreproc_t)rpcb_enc_mapping, + .p_decode = (kxdrdproc_t)rpcb_dec_set, + .p_arglen = RPCB_mappingargs_sz, + .p_replen = RPCB_setres_sz, + .p_statidx = RPCBPROC_SET, + .p_timer = 0, + .p_name = "SET", + }, + [RPCBPROC_UNSET] = { + .p_proc = RPCBPROC_UNSET, + .p_encode = (kxdreproc_t)rpcb_enc_mapping, + .p_decode = (kxdrdproc_t)rpcb_dec_set, + .p_arglen = RPCB_mappingargs_sz, + .p_replen = RPCB_setres_sz, + .p_statidx = RPCBPROC_UNSET, + .p_timer = 0, + .p_name = "UNSET", + }, + [RPCBPROC_GETPORT] = { + .p_proc = RPCBPROC_GETPORT, + .p_encode = (kxdreproc_t)rpcb_enc_mapping, + .p_decode = (kxdrdproc_t)rpcb_dec_getport, + .p_arglen = RPCB_mappingargs_sz, + .p_replen = RPCB_getportres_sz, + .p_statidx = RPCBPROC_GETPORT, + .p_timer = 0, + .p_name = "GETPORT", + }, +}; + +static struct rpc_procinfo rpcb_procedures3[] = { + [RPCBPROC_SET] = { + .p_proc = RPCBPROC_SET, + .p_encode = (kxdreproc_t)rpcb_enc_getaddr, + .p_decode = (kxdrdproc_t)rpcb_dec_set, + .p_arglen = RPCB_getaddrargs_sz, + .p_replen = RPCB_setres_sz, + .p_statidx = RPCBPROC_SET, + .p_timer = 0, + .p_name = "SET", + }, + [RPCBPROC_UNSET] = { + .p_proc = RPCBPROC_UNSET, + .p_encode = (kxdreproc_t)rpcb_enc_getaddr, + .p_decode = (kxdrdproc_t)rpcb_dec_set, + .p_arglen = RPCB_getaddrargs_sz, + .p_replen = RPCB_setres_sz, + .p_statidx = RPCBPROC_UNSET, + .p_timer = 0, + .p_name = "UNSET", + }, + [RPCBPROC_GETADDR] = { + .p_proc = RPCBPROC_GETADDR, + .p_encode = (kxdreproc_t)rpcb_enc_getaddr, + .p_decode = (kxdrdproc_t)rpcb_dec_getaddr, + .p_arglen = RPCB_getaddrargs_sz, + .p_replen = RPCB_getaddrres_sz, + .p_statidx = RPCBPROC_GETADDR, + .p_timer = 0, + .p_name = "GETADDR", + }, +}; + +static struct rpc_procinfo rpcb_procedures4[] = { + [RPCBPROC_SET] = { + .p_proc = RPCBPROC_SET, + .p_encode = (kxdreproc_t)rpcb_enc_getaddr, + .p_decode = (kxdrdproc_t)rpcb_dec_set, + .p_arglen = RPCB_getaddrargs_sz, + .p_replen = RPCB_setres_sz, + .p_statidx = RPCBPROC_SET, + .p_timer = 0, + .p_name = "SET", + }, + [RPCBPROC_UNSET] = { + .p_proc = RPCBPROC_UNSET, + .p_encode = (kxdreproc_t)rpcb_enc_getaddr, + .p_decode = (kxdrdproc_t)rpcb_dec_set, + .p_arglen = RPCB_getaddrargs_sz, + .p_replen = RPCB_setres_sz, + .p_statidx = RPCBPROC_UNSET, + .p_timer = 0, + .p_name = "UNSET", + }, + [RPCBPROC_GETADDR] = { + .p_proc = RPCBPROC_GETADDR, + .p_encode = (kxdreproc_t)rpcb_enc_getaddr, + .p_decode = (kxdrdproc_t)rpcb_dec_getaddr, + .p_arglen = RPCB_getaddrargs_sz, + .p_replen = RPCB_getaddrres_sz, + .p_statidx = RPCBPROC_GETADDR, + .p_timer = 0, + .p_name = "GETADDR", + }, +}; + +static struct rpcb_info rpcb_next_version[] = { + { + .rpc_vers = RPCBVERS_2, + .rpc_proc = &rpcb_procedures2[RPCBPROC_GETPORT], + }, + { + .rpc_proc = NULL, + }, +}; + +static struct rpcb_info rpcb_next_version6[] = { + { + .rpc_vers = RPCBVERS_4, + .rpc_proc = &rpcb_procedures4[RPCBPROC_GETADDR], + }, + { + .rpc_vers = RPCBVERS_3, + .rpc_proc = &rpcb_procedures3[RPCBPROC_GETADDR], + }, + { + .rpc_proc = NULL, + }, +}; + +static struct rpc_version rpcb_version2 = { + .number = RPCBVERS_2, + .nrprocs = ARRAY_SIZE(rpcb_procedures2), + .procs = rpcb_procedures2 +}; + +static struct rpc_version rpcb_version3 = { + .number = RPCBVERS_3, + .nrprocs = ARRAY_SIZE(rpcb_procedures3), + .procs = rpcb_procedures3 +}; + +static struct rpc_version rpcb_version4 = { + .number = RPCBVERS_4, + .nrprocs = ARRAY_SIZE(rpcb_procedures4), + .procs = rpcb_procedures4 +}; + +static 
struct rpc_version *rpcb_version[] = { + NULL, + NULL, + &rpcb_version2, + &rpcb_version3, + &rpcb_version4 +}; + +static struct rpc_stat rpcb_stats; + +static struct rpc_program rpcb_program = { + .name = "rpcbind", + .number = RPCBIND_PROGRAM, + .nrvers = ARRAY_SIZE(rpcb_version), + .version = rpcb_version, + .stats = &rpcb_stats, +}; + +/** + * cleanup_rpcb_clnt - remove xprtsock's sysctls, unregister + * + */ +void cleanup_rpcb_clnt(void) +{ + if (rpcb_local_clnt4) + rpc_shutdown_client(rpcb_local_clnt4); + if (rpcb_local_clnt) + rpc_shutdown_client(rpcb_local_clnt); +} diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c new file mode 100644 index 00000000..b6bb2257 --- /dev/null +++ b/net/sunrpc/sched.c @@ -0,0 +1,1016 @@ +/* + * linux/net/sunrpc/sched.c + * + * Scheduling for synchronous and asynchronous RPC requests. + * + * Copyright (C) 1996 Olaf Kirch, + * + * TCP NFS related read + write fixes + * (C) 1999 Dave Airlie, University of Limerick, Ireland + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sunrpc.h" + +#ifdef RPC_DEBUG +#define RPCDBG_FACILITY RPCDBG_SCHED +#endif + +/* + * RPC slabs and memory pools + */ +#define RPC_BUFFER_MAXSIZE (2048) +#define RPC_BUFFER_POOLSIZE (8) +#define RPC_TASK_POOLSIZE (8) +static struct kmem_cache *rpc_task_slabp __read_mostly; +static struct kmem_cache *rpc_buffer_slabp __read_mostly; +static mempool_t *rpc_task_mempool __read_mostly; +static mempool_t *rpc_buffer_mempool __read_mostly; + +static void rpc_async_schedule(struct work_struct *); +static void rpc_release_task(struct rpc_task *task); +static void __rpc_queue_timer_fn(unsigned long ptr); + +/* + * RPC tasks sit here while waiting for conditions to improve. + */ +static struct rpc_wait_queue delay_queue; + +/* + * rpciod-related stuff + */ +struct workqueue_struct *rpciod_workqueue; + +/* + * Disable the timer for a given RPC task. Should be called with + * queue->lock and bh_disabled in order to avoid races within + * rpc_run_timer(). + */ +static void +__rpc_disable_timer(struct rpc_wait_queue *queue, struct rpc_task *task) +{ + if (task->tk_timeout == 0) + return; + dprintk("RPC: %5u disabling timer\n", task->tk_pid); + task->tk_timeout = 0; + list_del(&task->u.tk_wait.timer_list); + if (list_empty(&queue->timer_list.list)) + del_timer(&queue->timer_list.timer); +} + +static void +rpc_set_queue_timer(struct rpc_wait_queue *queue, unsigned long expires) +{ + queue->timer_list.expires = expires; + mod_timer(&queue->timer_list.timer, expires); +} + +/* + * Set up a timer for the current task. + */ +static void +__rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task) +{ + if (!task->tk_timeout) + return; + + dprintk("RPC: %5u setting alarm for %lu ms\n", + task->tk_pid, task->tk_timeout * 1000 / HZ); + + task->u.tk_wait.expires = jiffies + task->tk_timeout; + if (list_empty(&queue->timer_list.list) || time_before(task->u.tk_wait.expires, queue->timer_list.expires)) + rpc_set_queue_timer(queue, task->u.tk_wait.expires); + list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list); +} + +/* + * Add new request to a priority queue. 
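+ *
+ * Tasks sharing a tk_owner are chained onto the links list of the
+ * first queued task with that owner, so that a batch of requests from
+ * one owner can be serviced together.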
+ */ +static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue, struct rpc_task *task) +{ + struct list_head *q; + struct rpc_task *t; + + INIT_LIST_HEAD(&task->u.tk_wait.links); + q = &queue->tasks[task->tk_priority]; + if (unlikely(task->tk_priority > queue->maxpriority)) + q = &queue->tasks[queue->maxpriority]; + list_for_each_entry(t, q, u.tk_wait.list) { + if (t->tk_owner == task->tk_owner) { + list_add_tail(&task->u.tk_wait.list, &t->u.tk_wait.links); + return; + } + } + list_add_tail(&task->u.tk_wait.list, q); +} + +/* + * Add new request to wait queue. + * + * Swapper tasks always get inserted at the head of the queue. + * This should avoid many nasty memory deadlocks and hopefully + * improve overall performance. + * Everyone else gets appended to the queue to ensure proper FIFO behavior. + */ +static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task) +{ + BUG_ON (RPC_IS_QUEUED(task)); + + if (RPC_IS_PRIORITY(queue)) + __rpc_add_wait_queue_priority(queue, task); + else if (RPC_IS_SWAPPER(task)) + list_add(&task->u.tk_wait.list, &queue->tasks[0]); + else + list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]); + task->tk_waitqueue = queue; + queue->qlen++; + rpc_set_queued(task); + + dprintk("RPC: %5u added to queue %p \"%s\"\n", + task->tk_pid, queue, rpc_qname(queue)); +} + +/* + * Remove request from a priority queue. + */ +static void __rpc_remove_wait_queue_priority(struct rpc_task *task) +{ + struct rpc_task *t; + + if (!list_empty(&task->u.tk_wait.links)) { + t = list_entry(task->u.tk_wait.links.next, struct rpc_task, u.tk_wait.list); + list_move(&t->u.tk_wait.list, &task->u.tk_wait.list); + list_splice_init(&task->u.tk_wait.links, &t->u.tk_wait.links); + } +} + +/* + * Remove request from queue. + * Note: must be called with spin lock held. 
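+ * Any timer armed for the task is disabled as well.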
+ */ +static void __rpc_remove_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task) +{ + __rpc_disable_timer(queue, task); + if (RPC_IS_PRIORITY(queue)) + __rpc_remove_wait_queue_priority(task); + list_del(&task->u.tk_wait.list); + queue->qlen--; + dprintk("RPC: %5u removed from queue %p \"%s\"\n", + task->tk_pid, queue, rpc_qname(queue)); +} + +static inline void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority) +{ + queue->priority = priority; + queue->count = 1 << (priority * 2); +} + +static inline void rpc_set_waitqueue_owner(struct rpc_wait_queue *queue, pid_t pid) +{ + queue->owner = pid; + queue->nr = RPC_BATCH_COUNT; +} + +static inline void rpc_reset_waitqueue_priority(struct rpc_wait_queue *queue) +{ + rpc_set_waitqueue_priority(queue, queue->maxpriority); + rpc_set_waitqueue_owner(queue, 0); +} + +static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const char *qname, unsigned char nr_queues) +{ + int i; + + spin_lock_init(&queue->lock); + for (i = 0; i < ARRAY_SIZE(queue->tasks); i++) + INIT_LIST_HEAD(&queue->tasks[i]); + queue->maxpriority = nr_queues - 1; + rpc_reset_waitqueue_priority(queue); + queue->qlen = 0; + setup_timer(&queue->timer_list.timer, __rpc_queue_timer_fn, (unsigned long)queue); + INIT_LIST_HEAD(&queue->timer_list.list); +#ifdef RPC_DEBUG + queue->name = qname; +#endif +} + +void rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const char *qname) +{ + __rpc_init_priority_wait_queue(queue, qname, RPC_NR_PRIORITY); +} +EXPORT_SYMBOL_GPL(rpc_init_priority_wait_queue); + +void rpc_init_wait_queue(struct rpc_wait_queue *queue, const char *qname) +{ + __rpc_init_priority_wait_queue(queue, qname, 1); +} +EXPORT_SYMBOL_GPL(rpc_init_wait_queue); + +void rpc_destroy_wait_queue(struct rpc_wait_queue *queue) +{ + del_timer_sync(&queue->timer_list.timer); +} +EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); + +static int rpc_wait_bit_killable(void *word) +{ + if (fatal_signal_pending(current)) + return -ERESTARTSYS; + schedule(); + return 0; +} + +#ifdef RPC_DEBUG +static void rpc_task_set_debuginfo(struct rpc_task *task) +{ + static atomic_t rpc_pid; + + task->tk_pid = atomic_inc_return(&rpc_pid); +} +#else +static inline void rpc_task_set_debuginfo(struct rpc_task *task) +{ +} +#endif + +static void rpc_set_active(struct rpc_task *task) +{ + rpc_task_set_debuginfo(task); + set_bit(RPC_TASK_ACTIVE, &task->tk_runstate); +} + +/* + * Mark an RPC call as having completed by clearing the 'active' bit + * and then waking up all tasks that were sleeping. + */ +static int rpc_complete_task(struct rpc_task *task) +{ + void *m = &task->tk_runstate; + wait_queue_head_t *wq = bit_waitqueue(m, RPC_TASK_ACTIVE); + struct wait_bit_key k = __WAIT_BIT_KEY_INITIALIZER(m, RPC_TASK_ACTIVE); + unsigned long flags; + int ret; + + spin_lock_irqsave(&wq->lock, flags); + clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate); + ret = atomic_dec_and_test(&task->tk_count); + if (waitqueue_active(wq)) + __wake_up_locked_key(wq, TASK_NORMAL, &k); + spin_unlock_irqrestore(&wq->lock, flags); + return ret; +} + +/* + * Allow callers to wait for completion of an RPC call + * + * Note the use of out_of_line_wait_on_bit() rather than wait_on_bit() + * to enforce taking of the wq->lock and hence avoid races with + * rpc_complete_task(). 
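rpc_set_waitqueue_priority() above initialises queue->count to 1 << (priority * 2); as __rpc_wake_up_next_priority() further down shows, that count is roughly how many owner batches get served at the current priority level before the queue considers rotating to another level, so higher-numbered levels get geometrically more turns. A tiny stand-alone snippet just to make the weights for the first few levels visible (illustrative only):

#include <stdio.h>

int main(void)
{
	/* Weight given to each priority level, mirroring
	 * rpc_set_waitqueue_priority(): count = 1 << (priority * 2). */
	for (unsigned int priority = 0; priority < 4; priority++)
		printf("priority %u: up to %u owner batches before switching\n",
		       priority, 1u << (priority * 2));
	return 0;
}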
+ */ +int __rpc_wait_for_completion_task(struct rpc_task *task, int (*action)(void *)) +{ + if (action == NULL) + action = rpc_wait_bit_killable; + return out_of_line_wait_on_bit(&task->tk_runstate, RPC_TASK_ACTIVE, + action, TASK_KILLABLE); +} +EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task); + +/* + * Make an RPC task runnable. + * + * Note: If the task is ASYNC, this must be called with + * the spinlock held to protect the wait queue operation. + */ +static void rpc_make_runnable(struct rpc_task *task) +{ + rpc_clear_queued(task); + if (rpc_test_and_set_running(task)) + return; + if (RPC_IS_ASYNC(task)) { + INIT_WORK(&task->u.tk_work, rpc_async_schedule); + queue_work(rpciod_workqueue, &task->u.tk_work); + } else + wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED); +} + +/* + * Prepare for sleeping on a wait queue. + * By always appending tasks to the list we ensure FIFO behavior. + * NB: An RPC task will only receive interrupt-driven events as long + * as it's on a wait queue. + */ +static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, + rpc_action action) +{ + dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n", + task->tk_pid, rpc_qname(q), jiffies); + + __rpc_add_wait_queue(q, task); + + BUG_ON(task->tk_callback != NULL); + task->tk_callback = action; + __rpc_add_timer(q, task); +} + +void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, + rpc_action action) +{ + /* We shouldn't ever put an inactive task to sleep */ + BUG_ON(!RPC_IS_ACTIVATED(task)); + + /* + * Protect the queue operations. + */ + spin_lock_bh(&q->lock); + __rpc_sleep_on(q, task, action); + spin_unlock_bh(&q->lock); +} +EXPORT_SYMBOL_GPL(rpc_sleep_on); + +/** + * __rpc_do_wake_up_task - wake up a single rpc_task + * @queue: wait queue + * @task: task to be woken up + * + * Caller must hold queue->lock, and have cleared the task queued flag. + */ +static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task *task) +{ + dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n", + task->tk_pid, jiffies); + + /* Has the task been executed yet? If not, we cannot wake it up! */ + if (!RPC_IS_ACTIVATED(task)) { + printk(KERN_ERR "RPC: Inactive task (%p) being woken up!\n", task); + return; + } + + __rpc_remove_wait_queue(queue, task); + + rpc_make_runnable(task); + + dprintk("RPC: __rpc_wake_up_task done\n"); +} + +/* + * Wake up a queued task while the queue lock is being held + */ +static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) +{ + if (RPC_IS_QUEUED(task) && task->tk_waitqueue == queue) + __rpc_do_wake_up_task(queue, task); +} + +/* + * Tests whether rpc queue is empty + */ +int rpc_queue_empty(struct rpc_wait_queue *queue) +{ + int res; + + spin_lock_bh(&queue->lock); + res = queue->qlen; + spin_unlock_bh(&queue->lock); + return res == 0; +} +EXPORT_SYMBOL_GPL(rpc_queue_empty); + +/* + * Wake up a task on a specific queue + */ +void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task) +{ + spin_lock_bh(&queue->lock); + rpc_wake_up_task_queue_locked(queue, task); + spin_unlock_bh(&queue->lock); +} +EXPORT_SYMBOL_GPL(rpc_wake_up_queued_task); + +/* + * Wake up the next task on a priority queue. + */ +static struct rpc_task * __rpc_wake_up_next_priority(struct rpc_wait_queue *queue) +{ + struct list_head *q; + struct rpc_task *task; + + /* + * Service a batch of tasks from a single owner. 
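rpc_sleep_on() and rpc_wake_up_queued_task() are the basic blocking primitives here, and rpc_delay() later in this file is their simplest user. The following is a minimal sketch of the same pairing using only functions defined in this file; the demo_* names and the wake-up site are hypothetical, and the task is assumed to already be active and executing one of its tk_action steps:

/* Illustrative sketch only; demo_* names are hypothetical. */
static struct rpc_wait_queue demo_queue;

static void demo_setup(void)
{
	rpc_init_wait_queue(&demo_queue, "demoq");
}

/* Called from a task's tk_action callback when it must wait for an event. */
static void demo_wait_step(struct rpc_task *task)
{
	task->tk_timeout = 5 * HZ;		/* give up after roughly 5 seconds */
	rpc_sleep_on(&demo_queue, task, NULL);	/* NULL: no extra callback on wake-up */
}

/* Called from whatever code path makes the awaited event happen. */
static void demo_event_happened(struct rpc_task *task)
{
	rpc_wake_up_queued_task(&demo_queue, task);
}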
+ */ + q = &queue->tasks[queue->priority]; + if (!list_empty(q)) { + task = list_entry(q->next, struct rpc_task, u.tk_wait.list); + if (queue->owner == task->tk_owner) { + if (--queue->nr) + goto out; + list_move_tail(&task->u.tk_wait.list, q); + } + /* + * Check if we need to switch queues. + */ + if (--queue->count) + goto new_owner; + } + + /* + * Service the next queue. + */ + do { + if (q == &queue->tasks[0]) + q = &queue->tasks[queue->maxpriority]; + else + q = q - 1; + if (!list_empty(q)) { + task = list_entry(q->next, struct rpc_task, u.tk_wait.list); + goto new_queue; + } + } while (q != &queue->tasks[queue->priority]); + + rpc_reset_waitqueue_priority(queue); + return NULL; + +new_queue: + rpc_set_waitqueue_priority(queue, (unsigned int)(q - &queue->tasks[0])); +new_owner: + rpc_set_waitqueue_owner(queue, task->tk_owner); +out: + rpc_wake_up_task_queue_locked(queue, task); + return task; +} + +/* + * Wake up the next task on the wait queue. + */ +struct rpc_task * rpc_wake_up_next(struct rpc_wait_queue *queue) +{ + struct rpc_task *task = NULL; + + dprintk("RPC: wake_up_next(%p \"%s\")\n", + queue, rpc_qname(queue)); + spin_lock_bh(&queue->lock); + if (RPC_IS_PRIORITY(queue)) + task = __rpc_wake_up_next_priority(queue); + else { + task_for_first(task, &queue->tasks[0]) + rpc_wake_up_task_queue_locked(queue, task); + } + spin_unlock_bh(&queue->lock); + + return task; +} +EXPORT_SYMBOL_GPL(rpc_wake_up_next); + +/** + * rpc_wake_up - wake up all rpc_tasks + * @queue: rpc_wait_queue on which the tasks are sleeping + * + * Grabs queue->lock + */ +void rpc_wake_up(struct rpc_wait_queue *queue) +{ + struct list_head *head; + + spin_lock_bh(&queue->lock); + head = &queue->tasks[queue->maxpriority]; + for (;;) { + while (!list_empty(head)) { + struct rpc_task *task; + task = list_first_entry(head, + struct rpc_task, + u.tk_wait.list); + rpc_wake_up_task_queue_locked(queue, task); + } + if (head == &queue->tasks[0]) + break; + head--; + } + spin_unlock_bh(&queue->lock); +} +EXPORT_SYMBOL_GPL(rpc_wake_up); + +/** + * rpc_wake_up_status - wake up all rpc_tasks and set their status value. 
+ * @queue: rpc_wait_queue on which the tasks are sleeping + * @status: status value to set + * + * Grabs queue->lock + */ +void rpc_wake_up_status(struct rpc_wait_queue *queue, int status) +{ + struct list_head *head; + + spin_lock_bh(&queue->lock); + head = &queue->tasks[queue->maxpriority]; + for (;;) { + while (!list_empty(head)) { + struct rpc_task *task; + task = list_first_entry(head, + struct rpc_task, + u.tk_wait.list); + task->tk_status = status; + rpc_wake_up_task_queue_locked(queue, task); + } + if (head == &queue->tasks[0]) + break; + head--; + } + spin_unlock_bh(&queue->lock); +} +EXPORT_SYMBOL_GPL(rpc_wake_up_status); + +static void __rpc_queue_timer_fn(unsigned long ptr) +{ + struct rpc_wait_queue *queue = (struct rpc_wait_queue *)ptr; + struct rpc_task *task, *n; + unsigned long expires, now, timeo; + + spin_lock(&queue->lock); + expires = now = jiffies; + list_for_each_entry_safe(task, n, &queue->timer_list.list, u.tk_wait.timer_list) { + timeo = task->u.tk_wait.expires; + if (time_after_eq(now, timeo)) { + dprintk("RPC: %5u timeout\n", task->tk_pid); + task->tk_status = -ETIMEDOUT; + rpc_wake_up_task_queue_locked(queue, task); + continue; + } + if (expires == now || time_after(expires, timeo)) + expires = timeo; + } + if (!list_empty(&queue->timer_list.list)) + rpc_set_queue_timer(queue, expires); + spin_unlock(&queue->lock); +} + +static void __rpc_atrun(struct rpc_task *task) +{ + task->tk_status = 0; +} + +/* + * Run a task at a later time + */ +void rpc_delay(struct rpc_task *task, unsigned long delay) +{ + task->tk_timeout = delay; + rpc_sleep_on(&delay_queue, task, __rpc_atrun); +} +EXPORT_SYMBOL_GPL(rpc_delay); + +/* + * Helper to call task->tk_ops->rpc_call_prepare + */ +void rpc_prepare_task(struct rpc_task *task) +{ + task->tk_ops->rpc_call_prepare(task, task->tk_calldata); +} + +/* + * Helper that calls task->tk_ops->rpc_call_done if it exists + */ +void rpc_exit_task(struct rpc_task *task) +{ + task->tk_action = NULL; + if (task->tk_ops->rpc_call_done != NULL) { + task->tk_ops->rpc_call_done(task, task->tk_calldata); + if (task->tk_action != NULL) { + WARN_ON(RPC_ASSASSINATED(task)); + /* Always release the RPC slot and buffer memory */ + xprt_release(task); + } + } +} + +void rpc_exit(struct rpc_task *task, int status) +{ + task->tk_status = status; + task->tk_action = rpc_exit_task; + if (RPC_IS_QUEUED(task)) + rpc_wake_up_queued_task(task->tk_waitqueue, task); +} +EXPORT_SYMBOL_GPL(rpc_exit); + +void rpc_release_calldata(const struct rpc_call_ops *ops, void *calldata) +{ + if (ops->rpc_release != NULL) + ops->rpc_release(calldata); +} + +/* + * This is the RPC `scheduler' (or rather, the finite state machine). + */ +static void __rpc_execute(struct rpc_task *task) +{ + struct rpc_wait_queue *queue; + int task_is_async = RPC_IS_ASYNC(task); + int status = 0; + + dprintk("RPC: %5u __rpc_execute flags=0x%x\n", + task->tk_pid, task->tk_flags); + + BUG_ON(RPC_IS_QUEUED(task)); + + for (;;) { + void (*do_action)(struct rpc_task *); + + /* + * Execute any pending callback first. + */ + do_action = task->tk_callback; + task->tk_callback = NULL; + if (do_action == NULL) { + /* + * Perform the next FSM step. + * tk_action may be NULL if the task has been killed. + * In particular, note that rpc_killall_tasks may + * do this at any time, so beware when dereferencing. + */ + do_action = task->tk_action; + if (do_action == NULL) + break; + } + do_action(task); + + /* + * Lockless check for whether task is sleeping or not. 
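The heart of __rpc_execute() below is the selection of do_action: a pending one-shot tk_callback always runs before the next tk_action step, and a NULL tk_action ends the task. A small user-space analogue of that loop, with no queueing or locking, which compiles and runs on its own (illustrative only):

#include <stdio.h>
#include <stddef.h>

/* Minimal analogue of the __rpc_execute() loop: a task is a pair of
 * function pointers, and a one-shot callback always runs before the
 * next action step. */
struct demo_task {
	void (*action)(struct demo_task *);	/* next FSM step */
	void (*callback)(struct demo_task *);	/* pending one-shot callback */
	int step;
};

static void demo_done(struct demo_task *t)
{
	printf("step %d: done\n", t->step);
	t->action = NULL;			/* NULL action terminates the loop */
}

static void demo_start(struct demo_task *t)
{
	printf("step %d: start\n", t->step);
	t->action = demo_done;
}

static void demo_execute(struct demo_task *t)
{
	for (;;) {
		void (*do_action)(struct demo_task *) = t->callback;

		t->callback = NULL;
		if (do_action == NULL) {
			do_action = t->action;
			if (do_action == NULL)
				break;
		}
		t->step++;
		do_action(t);
	}
}

int main(void)
{
	struct demo_task t = { demo_start, NULL, 0 };

	demo_execute(&t);
	return 0;
}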
+ */ + if (!RPC_IS_QUEUED(task)) + continue; + /* + * The queue->lock protects against races with + * rpc_make_runnable(). + * + * Note that once we clear RPC_TASK_RUNNING on an asynchronous + * rpc_task, rpc_make_runnable() can assign it to a + * different workqueue. We therefore cannot assume that the + * rpc_task pointer may still be dereferenced. + */ + queue = task->tk_waitqueue; + spin_lock_bh(&queue->lock); + if (!RPC_IS_QUEUED(task)) { + spin_unlock_bh(&queue->lock); + continue; + } + rpc_clear_running(task); + spin_unlock_bh(&queue->lock); + if (task_is_async) + return; + + /* sync task: sleep here */ + dprintk("RPC: %5u sync task going to sleep\n", task->tk_pid); + status = out_of_line_wait_on_bit(&task->tk_runstate, + RPC_TASK_QUEUED, rpc_wait_bit_killable, + TASK_KILLABLE); + if (status == -ERESTARTSYS) { + /* + * When a sync task receives a signal, it exits with + * -ERESTARTSYS. In order to catch any callbacks that + * clean up after sleeping on some queue, we don't + * break the loop here, but go around once more. + */ + dprintk("RPC: %5u got signal\n", task->tk_pid); + task->tk_flags |= RPC_TASK_KILLED; + rpc_exit(task, -ERESTARTSYS); + } + rpc_set_running(task); + dprintk("RPC: %5u sync task resuming\n", task->tk_pid); + } + + dprintk("RPC: %5u return %d, status %d\n", task->tk_pid, status, + task->tk_status); + /* Release all resources associated with the task */ + rpc_release_task(task); +} + +/* + * User-visible entry point to the scheduler. + * + * This may be called recursively if e.g. an async NFS task updates + * the attributes and finds that dirty pages must be flushed. + * NOTE: Upon exit of this function the task is guaranteed to be + * released. In particular note that tk_release() will have + * been called, so your task memory may have been freed. + */ +void rpc_execute(struct rpc_task *task) +{ + rpc_set_active(task); + rpc_make_runnable(task); + if (!RPC_IS_ASYNC(task)) + __rpc_execute(task); +} + +static void rpc_async_schedule(struct work_struct *work) +{ + __rpc_execute(container_of(work, struct rpc_task, u.tk_work)); +} + +/** + * rpc_malloc - allocate an RPC buffer + * @task: RPC task that will use this buffer + * @size: requested byte size + * + * To prevent rpciod from hanging, this allocator never sleeps, + * returning NULL if the request cannot be serviced immediately. + * The caller can arrange to sleep in a way that is safe for rpciod. + * + * Most requests are 'small' (under 2KiB) and can be serviced from a + * mempool, ensuring that NFS reads and writes can always proceed, + * and that there is good locality of reference for these buffers. + * + * In order to avoid memory starvation triggering more writebacks of + * NFS requests, we avoid using GFP_KERNEL. + */ +void *rpc_malloc(struct rpc_task *task, size_t size) +{ + struct rpc_buffer *buf; + gfp_t gfp = RPC_IS_SWAPPER(task) ? 
GFP_ATOMIC : GFP_NOWAIT; + + size += sizeof(struct rpc_buffer); + if (size <= RPC_BUFFER_MAXSIZE) + buf = mempool_alloc(rpc_buffer_mempool, gfp); + else + buf = kmalloc(size, gfp); + + if (!buf) + return NULL; + + buf->len = size; + dprintk("RPC: %5u allocated buffer of size %zu at %p\n", + task->tk_pid, size, buf); + return &buf->data; +} +EXPORT_SYMBOL_GPL(rpc_malloc); + +/** + * rpc_free - free buffer allocated via rpc_malloc + * @buffer: buffer to free + * + */ +void rpc_free(void *buffer) +{ + size_t size; + struct rpc_buffer *buf; + + if (!buffer) + return; + + buf = container_of(buffer, struct rpc_buffer, data); + size = buf->len; + + dprintk("RPC: freeing buffer of size %zu at %p\n", + size, buf); + + if (size <= RPC_BUFFER_MAXSIZE) + mempool_free(buf, rpc_buffer_mempool); + else + kfree(buf); +} +EXPORT_SYMBOL_GPL(rpc_free); + +/* + * Creation and deletion of RPC task structures + */ +static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *task_setup_data) +{ + memset(task, 0, sizeof(*task)); + atomic_set(&task->tk_count, 1); + task->tk_flags = task_setup_data->flags; + task->tk_ops = task_setup_data->callback_ops; + task->tk_calldata = task_setup_data->callback_data; + INIT_LIST_HEAD(&task->tk_task); + + /* Initialize retry counters */ + task->tk_garb_retry = 2; + task->tk_cred_retry = 2; + task->tk_rebind_retry = 2; + + task->tk_priority = task_setup_data->priority - RPC_PRIORITY_LOW; + task->tk_owner = current->tgid; + + /* Initialize workqueue for async tasks */ + task->tk_workqueue = task_setup_data->workqueue; + + if (task->tk_ops->rpc_call_prepare != NULL) + task->tk_action = rpc_prepare_task; + + /* starting timestamp */ + task->tk_start = ktime_get(); + + dprintk("RPC: new task initialized, procpid %u\n", + task_pid_nr(current)); +} + +static struct rpc_task * +rpc_alloc_task(void) +{ + return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS); +} + +/* + * Create a new task for the specified client. 
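rpc_free() recovers the rpc_buffer header from the data pointer handed out by rpc_malloc() via container_of(), which is what lets the free path decide between the mempool and kfree() without being told the size. A stand-alone user-space analogue of that header trick, using offsetof() directly in place of container_of() (illustrative only):

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

/* Analogue of rpc_malloc()/rpc_free(): a length header sits in front of
 * the pointer handed to the caller and is recovered on free. */
struct demo_buffer {
	size_t len;	/* total allocation size, header included */
	char data[];	/* what the caller actually sees */
};

static void *demo_malloc(size_t size)
{
	struct demo_buffer *buf;

	size += sizeof(struct demo_buffer);
	buf = malloc(size);
	if (!buf)
		return NULL;
	buf->len = size;
	return &buf->data;
}

static void demo_free(void *buffer)
{
	/* Step back from the data[] member to the enclosing header. */
	struct demo_buffer *buf =
		(struct demo_buffer *)((char *)buffer - offsetof(struct demo_buffer, data));

	printf("freeing %zu bytes (header included)\n", buf->len);
	free(buf);
}

int main(void)
{
	void *p = demo_malloc(100);

	if (p)
		demo_free(p);
	return 0;
}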
+ */ +struct rpc_task *rpc_new_task(const struct rpc_task_setup *setup_data) +{ + struct rpc_task *task = setup_data->task; + unsigned short flags = 0; + + if (task == NULL) { + task = rpc_alloc_task(); + if (task == NULL) { + rpc_release_calldata(setup_data->callback_ops, + setup_data->callback_data); + return ERR_PTR(-ENOMEM); + } + flags = RPC_TASK_DYNAMIC; + } + + rpc_init_task(task, setup_data); + task->tk_flags |= flags; + dprintk("RPC: allocated task %p\n", task); + return task; +} + +static void rpc_free_task(struct rpc_task *task) +{ + const struct rpc_call_ops *tk_ops = task->tk_ops; + void *calldata = task->tk_calldata; + + if (task->tk_flags & RPC_TASK_DYNAMIC) { + dprintk("RPC: %5u freeing task\n", task->tk_pid); + mempool_free(task, rpc_task_mempool); + } + rpc_release_calldata(tk_ops, calldata); +} + +static void rpc_async_release(struct work_struct *work) +{ + rpc_free_task(container_of(work, struct rpc_task, u.tk_work)); +} + +static void rpc_release_resources_task(struct rpc_task *task) +{ + if (task->tk_rqstp) + xprt_release(task); + if (task->tk_msg.rpc_cred) { + put_rpccred(task->tk_msg.rpc_cred); + task->tk_msg.rpc_cred = NULL; + } + rpc_task_release_client(task); +} + +static void rpc_final_put_task(struct rpc_task *task, + struct workqueue_struct *q) +{ + if (q != NULL) { + INIT_WORK(&task->u.tk_work, rpc_async_release); + queue_work(q, &task->u.tk_work); + } else + rpc_free_task(task); +} + +static void rpc_do_put_task(struct rpc_task *task, struct workqueue_struct *q) +{ + if (atomic_dec_and_test(&task->tk_count)) { + rpc_release_resources_task(task); + rpc_final_put_task(task, q); + } +} + +void rpc_put_task(struct rpc_task *task) +{ + rpc_do_put_task(task, NULL); +} +EXPORT_SYMBOL_GPL(rpc_put_task); + +void rpc_put_task_async(struct rpc_task *task) +{ + rpc_do_put_task(task, task->tk_workqueue); +} +EXPORT_SYMBOL_GPL(rpc_put_task_async); + +static void rpc_release_task(struct rpc_task *task) +{ + dprintk("RPC: %5u release task\n", task->tk_pid); + + BUG_ON (RPC_IS_QUEUED(task)); + + rpc_release_resources_task(task); + + /* + * Note: at this point we have been removed from rpc_clnt->cl_tasks, + * so it should be safe to use task->tk_count as a test for whether + * or not any other processes still hold references to our rpc_task. + */ + if (atomic_read(&task->tk_count) != 1 + !RPC_IS_ASYNC(task)) { + /* Wake up anyone who may be waiting for task completion */ + if (!rpc_complete_task(task)) + return; + } else { + if (!atomic_dec_and_test(&task->tk_count)) + return; + } + rpc_final_put_task(task, task->tk_workqueue); +} + +int rpciod_up(void) +{ + return try_module_get(THIS_MODULE) ? 0 : -EINVAL; +} + +void rpciod_down(void) +{ + module_put(THIS_MODULE); +} + +/* + * Start up the rpciod workqueue. + */ +static int rpciod_start(void) +{ + struct workqueue_struct *wq; + + /* + * Create the rpciod thread and wait for it to start. 
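The helpers above define the task life cycle: rpc_new_task() hands back a task with tk_count == 1, rpc_execute() hands it to rpciod for async tasks, and the final reference is dropped through rpc_put_task()/rpc_release_task(). The sketch below is a hedged illustration of that cycle using only fields and functions visible in this file, plus an extra tk_count reference taken before rpc_execute() so that both the execution path and the caller can drop one; the demo_* names are hypothetical, and a real caller would additionally bind the task to a client and an rpc_message (normally via rpc_run_task() in clnt.c) rather than drive it bare like this:

/* Illustrative sketch only; demo_* names are hypothetical. */
static void demo_prepare(struct rpc_task *task, void *calldata)
{
	/* A real task would start its work here; this one finishes at once. */
	rpc_exit(task, 0);
}

static void demo_call_done(struct rpc_task *task, void *calldata)
{
	/* task->tk_status holds the final status at this point. */
}

static const struct rpc_call_ops demo_call_ops = {
	.rpc_call_prepare	= demo_prepare,
	.rpc_call_done		= demo_call_done,
};

static int demo_start_task(void)
{
	struct rpc_task_setup setup = {
		.callback_ops	= &demo_call_ops,
		.flags		= RPC_TASK_ASYNC,
	};
	struct rpc_task *task;

	task = rpc_new_task(&setup);		/* returns with tk_count == 1 */
	if (IS_ERR(task))
		return PTR_ERR(task);
	atomic_inc(&task->tk_count);		/* one ref for us, one for execution */
	rpc_execute(task);			/* async: queued on rpciod */
	rpc_put_task(task);			/* drop our reference */
	return 0;
}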
+ */ + dprintk("RPC: creating workqueue rpciod\n"); + wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 0); + rpciod_workqueue = wq; + return rpciod_workqueue != NULL; +} + +static void rpciod_stop(void) +{ + struct workqueue_struct *wq = NULL; + + if (rpciod_workqueue == NULL) + return; + dprintk("RPC: destroying workqueue rpciod\n"); + + wq = rpciod_workqueue; + rpciod_workqueue = NULL; + destroy_workqueue(wq); +} + +void +rpc_destroy_mempool(void) +{ + rpciod_stop(); + if (rpc_buffer_mempool) + mempool_destroy(rpc_buffer_mempool); + if (rpc_task_mempool) + mempool_destroy(rpc_task_mempool); + if (rpc_task_slabp) + kmem_cache_destroy(rpc_task_slabp); + if (rpc_buffer_slabp) + kmem_cache_destroy(rpc_buffer_slabp); + rpc_destroy_wait_queue(&delay_queue); +} + +int +rpc_init_mempool(void) +{ + /* + * The following is not strictly a mempool initialisation, + * but there is no harm in doing it here + */ + rpc_init_wait_queue(&delay_queue, "delayq"); + if (!rpciod_start()) + goto err_nomem; + + rpc_task_slabp = kmem_cache_create("rpc_tasks", + sizeof(struct rpc_task), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (!rpc_task_slabp) + goto err_nomem; + rpc_buffer_slabp = kmem_cache_create("rpc_buffers", + RPC_BUFFER_MAXSIZE, + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (!rpc_buffer_slabp) + goto err_nomem; + rpc_task_mempool = mempool_create_slab_pool(RPC_TASK_POOLSIZE, + rpc_task_slabp); + if (!rpc_task_mempool) + goto err_nomem; + rpc_buffer_mempool = mempool_create_slab_pool(RPC_BUFFER_POOLSIZE, + rpc_buffer_slabp); + if (!rpc_buffer_mempool) + goto err_nomem; + return 0; +err_nomem: + rpc_destroy_mempool(); + return -ENOMEM; +} diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c new file mode 100644 index 00000000..10b4319e --- /dev/null +++ b/net/sunrpc/socklib.c @@ -0,0 +1,185 @@ +/* + * linux/net/sunrpc/socklib.c + * + * Common socket helper routines for RPC client and server + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include + + +/** + * xdr_skb_read_bits - copy some data bits from skb to internal buffer + * @desc: sk_buff copy helper + * @to: copy destination + * @len: number of bytes to copy + * + * Possibly called several times to iterate over an sk_buff and copy + * data out of it. + */ +size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len) +{ + if (len > desc->count) + len = desc->count; + if (unlikely(skb_copy_bits(desc->skb, desc->offset, to, len))) + return 0; + desc->count -= len; + desc->offset += len; + return len; +} +EXPORT_SYMBOL_GPL(xdr_skb_read_bits); + +/** + * xdr_skb_read_and_csum_bits - copy and checksum from skb to buffer + * @desc: sk_buff copy helper + * @to: copy destination + * @len: number of bytes to copy + * + * Same as skb_read_bits, but calculate a checksum at the same time. 
+ */ +static size_t xdr_skb_read_and_csum_bits(struct xdr_skb_reader *desc, void *to, size_t len) +{ + unsigned int pos; + __wsum csum2; + + if (len > desc->count) + len = desc->count; + pos = desc->offset; + csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len, 0); + desc->csum = csum_block_add(desc->csum, csum2, pos); + desc->count -= len; + desc->offset += len; + return len; +} + +/** + * xdr_partial_copy_from_skb - copy data out of an skb + * @xdr: target XDR buffer + * @base: starting offset + * @desc: sk_buff copy helper + * @copy_actor: virtual method for copying data + * + */ +ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb_reader *desc, xdr_skb_read_actor copy_actor) +{ + struct page **ppage = xdr->pages; + unsigned int len, pglen = xdr->page_len; + ssize_t copied = 0; + size_t ret; + + len = xdr->head[0].iov_len; + if (base < len) { + len -= base; + ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len); + copied += ret; + if (ret != len || !desc->count) + goto out; + base = 0; + } else + base -= len; + + if (unlikely(pglen == 0)) + goto copy_tail; + if (unlikely(base >= pglen)) { + base -= pglen; + goto copy_tail; + } + if (base || xdr->page_base) { + pglen -= base; + base += xdr->page_base; + ppage += base >> PAGE_CACHE_SHIFT; + base &= ~PAGE_CACHE_MASK; + } + do { + char *kaddr; + + /* ACL likes to be lazy in allocating pages - ACLs + * are small by default but can get huge. */ + if (unlikely(*ppage == NULL)) { + *ppage = alloc_page(GFP_ATOMIC); + if (unlikely(*ppage == NULL)) { + if (copied == 0) + copied = -ENOMEM; + goto out; + } + } + + len = PAGE_CACHE_SIZE; + kaddr = kmap_atomic(*ppage, KM_SKB_SUNRPC_DATA); + if (base) { + len -= base; + if (pglen < len) + len = pglen; + ret = copy_actor(desc, kaddr + base, len); + base = 0; + } else { + if (pglen < len) + len = pglen; + ret = copy_actor(desc, kaddr, len); + } + flush_dcache_page(*ppage); + kunmap_atomic(kaddr, KM_SKB_SUNRPC_DATA); + copied += ret; + if (ret != len || !desc->count) + goto out; + ppage++; + } while ((pglen -= len) != 0); +copy_tail: + len = xdr->tail[0].iov_len; + if (base < len) + copied += copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base); +out: + return copied; +} +EXPORT_SYMBOL_GPL(xdr_partial_copy_from_skb); + +/** + * csum_partial_copy_to_xdr - checksum and copy data + * @xdr: target XDR buffer + * @skb: source skb + * + * We have set things up such that we perform the checksum of the UDP + * packet in parallel with the copies into the RPC client iovec. 
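xdr_partial_copy_from_skb() is parameterised by a copy_actor callback so that the same head/pages/tail walk can either just copy (xdr_skb_read_bits) or copy and fold a checksum in (xdr_skb_read_and_csum_bits), as csum_partial_copy_to_xdr() below chooses. A sketch of a third, purely hypothetical actor with the same shape, which wraps the plain copy and adds a byte counter for debugging; it assumes the xdr_skb_read_actor typedef matches the two actors defined above:

/* Illustrative sketch only: a hypothetical copy actor wrapping
 * xdr_skb_read_bits() to count the bytes it moves. */
static atomic_long_t demo_bytes_copied = ATOMIC_LONG_INIT(0);

static size_t demo_skb_read_and_count_bits(struct xdr_skb_reader *desc,
					    void *to, size_t len)
{
	size_t copied = xdr_skb_read_bits(desc, to, len);

	atomic_long_add(copied, &demo_bytes_copied);
	return copied;
}

/* It would be handed to the walker exactly like the built-in actors:
 *	xdr_partial_copy_from_skb(xdr, 0, &desc, demo_skb_read_and_count_bits);
 */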
-DaveM + */ +int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) +{ + struct xdr_skb_reader desc; + + desc.skb = skb; + desc.offset = sizeof(struct udphdr); + desc.count = skb->len - desc.offset; + + if (skb_csum_unnecessary(skb)) + goto no_checksum; + + desc.csum = csum_partial(skb->data, desc.offset, skb->csum); + if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_and_csum_bits) < 0) + return -1; + if (desc.offset != skb->len) { + __wsum csum2; + csum2 = skb_checksum(skb, desc.offset, skb->len - desc.offset, 0); + desc.csum = csum_block_add(desc.csum, csum2, desc.offset); + } + if (desc.count) + return -1; + if (csum_fold(desc.csum)) + return -1; + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) + netdev_rx_csum_fault(skb->dev); + return 0; +no_checksum: + if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0) + return -1; + if (desc.count) + return -1; + return 0; +} +EXPORT_SYMBOL_GPL(csum_partial_copy_to_xdr); diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c new file mode 100644 index 00000000..80df89d9 --- /dev/null +++ b/net/sunrpc/stats.c @@ -0,0 +1,277 @@ +/* + * linux/net/sunrpc/stats.c + * + * procfs-based user access to generic RPC statistics. The stats files + * reside in /proc/net/rpc. + * + * The read routines assume that the buffer passed in is just big enough. + * If you implement an RPC service that has its own stats routine which + * appends the generic RPC stats, make sure you don't exceed the PAGE_SIZE + * limit. + * + * Copyright (C) 1995, 1996, 1997 Olaf Kirch + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "netns.h" + +#define RPCDBG_FACILITY RPCDBG_MISC + +/* + * Get RPC client stats + */ +static int rpc_proc_show(struct seq_file *seq, void *v) { + const struct rpc_stat *statp = seq->private; + const struct rpc_program *prog = statp->program; + unsigned int i, j; + + seq_printf(seq, + "net %u %u %u %u\n", + statp->netcnt, + statp->netudpcnt, + statp->nettcpcnt, + statp->nettcpconn); + seq_printf(seq, + "rpc %u %u %u\n", + statp->rpccnt, + statp->rpcretrans, + statp->rpcauthrefresh); + + for (i = 0; i < prog->nrvers; i++) { + const struct rpc_version *vers = prog->version[i]; + if (!vers) + continue; + seq_printf(seq, "proc%u %u", + vers->number, vers->nrprocs); + for (j = 0; j < vers->nrprocs; j++) + seq_printf(seq, " %u", + vers->procs[j].p_count); + seq_putc(seq, '\n'); + } + return 0; +} + +static int rpc_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, rpc_proc_show, PDE(inode)->data); +} + +static const struct file_operations rpc_proc_fops = { + .owner = THIS_MODULE, + .open = rpc_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * Get RPC server stats + */ +void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) { + const struct svc_program *prog = statp->program; + const struct svc_procedure *proc; + const struct svc_version *vers; + unsigned int i, j; + + seq_printf(seq, + "net %u %u %u %u\n", + statp->netcnt, + statp->netudpcnt, + statp->nettcpcnt, + statp->nettcpconn); + seq_printf(seq, + "rpc %u %u %u %u %u\n", + statp->rpccnt, + statp->rpcbadfmt+statp->rpcbadauth+statp->rpcbadclnt, + statp->rpcbadfmt, + statp->rpcbadauth, + statp->rpcbadclnt); + + for (i = 0; i < prog->pg_nvers; i++) { + if (!(vers = prog->pg_vers[i]) || !(proc = vers->vs_proc)) + continue; + seq_printf(seq, "proc%d %u", i, vers->vs_nproc); + for (j = 0; j < vers->vs_nproc; j++, proc++) + 
seq_printf(seq, " %u", proc->pc_count); + seq_putc(seq, '\n'); + } +} +EXPORT_SYMBOL_GPL(svc_seq_show); + +/** + * rpc_alloc_iostats - allocate an rpc_iostats structure + * @clnt: RPC program, version, and xprt + * + */ +struct rpc_iostats *rpc_alloc_iostats(struct rpc_clnt *clnt) +{ + return kcalloc(clnt->cl_maxproc, sizeof(struct rpc_iostats), GFP_KERNEL); +} +EXPORT_SYMBOL_GPL(rpc_alloc_iostats); + +/** + * rpc_free_iostats - release an rpc_iostats structure + * @stats: doomed rpc_iostats structure + * + */ +void rpc_free_iostats(struct rpc_iostats *stats) +{ + kfree(stats); +} +EXPORT_SYMBOL_GPL(rpc_free_iostats); + +/** + * rpc_count_iostats - tally up per-task stats + * @task: completed rpc_task + * + * Relies on the caller for serialization. + */ +void rpc_count_iostats(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_iostats *stats; + struct rpc_iostats *op_metrics; + ktime_t delta; + + if (!task->tk_client || !task->tk_client->cl_metrics || !req) + return; + + stats = task->tk_client->cl_metrics; + op_metrics = &stats[task->tk_msg.rpc_proc->p_statidx]; + + op_metrics->om_ops++; + op_metrics->om_ntrans += req->rq_ntrans; + op_metrics->om_timeouts += task->tk_timeouts; + + op_metrics->om_bytes_sent += req->rq_xmit_bytes_sent; + op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd; + + delta = ktime_sub(req->rq_xtime, task->tk_start); + op_metrics->om_queue = ktime_add(op_metrics->om_queue, delta); + + op_metrics->om_rtt = ktime_add(op_metrics->om_rtt, req->rq_rtt); + + delta = ktime_sub(ktime_get(), task->tk_start); + op_metrics->om_execute = ktime_add(op_metrics->om_execute, delta); +} + +static void _print_name(struct seq_file *seq, unsigned int op, + struct rpc_procinfo *procs) +{ + if (procs[op].p_name) + seq_printf(seq, "\t%12s: ", procs[op].p_name); + else if (op == 0) + seq_printf(seq, "\t NULL: "); + else + seq_printf(seq, "\t%12u: ", op); +} + +void rpc_print_iostats(struct seq_file *seq, struct rpc_clnt *clnt) +{ + struct rpc_iostats *stats = clnt->cl_metrics; + struct rpc_xprt *xprt = clnt->cl_xprt; + unsigned int op, maxproc = clnt->cl_maxproc; + + if (!stats) + return; + + seq_printf(seq, "\tRPC iostats version: %s ", RPC_IOSTATS_VERS); + seq_printf(seq, "p/v: %u/%u (%s)\n", + clnt->cl_prog, clnt->cl_vers, clnt->cl_protname); + + if (xprt) + xprt->ops->print_stats(xprt, seq); + + seq_printf(seq, "\tper-op statistics\n"); + for (op = 0; op < maxproc; op++) { + struct rpc_iostats *metrics = &stats[op]; + _print_name(seq, op, clnt->cl_procinfo); + seq_printf(seq, "%lu %lu %lu %Lu %Lu %Lu %Lu %Lu\n", + metrics->om_ops, + metrics->om_ntrans, + metrics->om_timeouts, + metrics->om_bytes_sent, + metrics->om_bytes_recv, + ktime_to_ms(metrics->om_queue), + ktime_to_ms(metrics->om_rtt), + ktime_to_ms(metrics->om_execute)); + } +} +EXPORT_SYMBOL_GPL(rpc_print_iostats); + +/* + * Register/unregister RPC proc files + */ +static inline struct proc_dir_entry * +do_register(const char *name, void *data, const struct file_operations *fops) +{ + struct sunrpc_net *sn; + + dprintk("RPC: registering /proc/net/rpc/%s\n", name); + sn = net_generic(&init_net, sunrpc_net_id); + return proc_create_data(name, 0, sn->proc_net_rpc, fops, data); +} + +struct proc_dir_entry * +rpc_proc_register(struct rpc_stat *statp) +{ + return do_register(statp->program->name, statp, &rpc_proc_fops); +} +EXPORT_SYMBOL_GPL(rpc_proc_register); + +void +rpc_proc_unregister(const char *name) +{ + struct sunrpc_net *sn; + + sn = net_generic(&init_net, sunrpc_net_id); + 
remove_proc_entry(name, sn->proc_net_rpc); +} +EXPORT_SYMBOL_GPL(rpc_proc_unregister); + +struct proc_dir_entry * +svc_proc_register(struct svc_stat *statp, const struct file_operations *fops) +{ + return do_register(statp->program->pg_name, statp, fops); +} +EXPORT_SYMBOL_GPL(svc_proc_register); + +void +svc_proc_unregister(const char *name) +{ + struct sunrpc_net *sn; + + sn = net_generic(&init_net, sunrpc_net_id); + remove_proc_entry(name, sn->proc_net_rpc); +} +EXPORT_SYMBOL_GPL(svc_proc_unregister); + +int rpc_proc_init(struct net *net) +{ + struct sunrpc_net *sn; + + dprintk("RPC: registering /proc/net/rpc\n"); + sn = net_generic(net, sunrpc_net_id); + sn->proc_net_rpc = proc_mkdir("rpc", net->proc_net); + if (sn->proc_net_rpc == NULL) + return -ENOMEM; + + return 0; +} + +void rpc_proc_exit(struct net *net) +{ + dprintk("RPC: unregistering /proc/net/rpc\n"); + remove_proc_entry("rpc", net->proc_net); +} + diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h new file mode 100644 index 00000000..90c292e2 --- /dev/null +++ b/net/sunrpc/sunrpc.h @@ -0,0 +1,51 @@ +/****************************************************************************** + +(c) 2008 NetApp. All Rights Reserved. + +NetApp provides this source code under the GPL v2 License. +The GPL v2 license is available at +http://opensource.org/licenses/gpl-license.php. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +******************************************************************************/ + +/* + * Functions and macros used internally by RPC + */ + +#ifndef _NET_SUNRPC_SUNRPC_H +#define _NET_SUNRPC_SUNRPC_H + +#include + +/* + * Header for dynamically allocated rpc buffers. + */ +struct rpc_buffer { + size_t len; + char data[]; +}; + +static inline int rpc_reply_expected(struct rpc_task *task) +{ + return (task->tk_msg.rpc_proc != NULL) && + (task->tk_msg.rpc_proc->p_decode != NULL); +} + +int svc_send_common(struct socket *sock, struct xdr_buf *xdr, + struct page *headpage, unsigned long headoffset, + struct page *tailpage, unsigned long tailoffset); + +#endif /* _NET_SUNRPC_SUNRPC_H */ + diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c new file mode 100644 index 00000000..9d080916 --- /dev/null +++ b/net/sunrpc/sunrpc_syms.c @@ -0,0 +1,120 @@ +/* + * linux/net/sunrpc/sunrpc_syms.c + * + * Symbols exported by the sunrpc module. 
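rpc_proc_register() above expects a struct rpc_stat whose .program member points back at the owning struct rpc_program, which is exactly the shape the rpcbind code at the top of this hunk sets up with rpcb_stats and rpcb_program. The following is a hedged sketch of the same wiring for a hypothetical program; the demo_* names and the program number are placeholders, not real registrations:

/* Illustrative sketch only; all demo_* names are placeholders. */
#define DEMO_PROGRAM	0x20000001	/* arbitrary placeholder program number */

static struct rpc_version *demo_version[] = {
	NULL,				/* version 0 unused, as in rpcb_version[] above */
};

static struct rpc_stat demo_stats;

static struct rpc_program demo_program = {
	.name		= "demo",	/* would appear as /proc/net/rpc/demo */
	.number		= DEMO_PROGRAM,
	.nrvers		= ARRAY_SIZE(demo_version),
	.version	= demo_version,
	.stats		= &demo_stats,
};

static void demo_register_stats(void)
{
	demo_stats.program = &demo_program;
	rpc_proc_register(&demo_stats);
}

static void demo_unregister_stats(void)
{
	rpc_proc_unregister("demo");
}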
+ * + * Copyright (C) 1997 Olaf Kirch + */ + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "netns.h" + +int sunrpc_net_id; + +static __net_init int sunrpc_init_net(struct net *net) +{ + int err; + + err = rpc_proc_init(net); + if (err) + goto err_proc; + + err = ip_map_cache_create(net); + if (err) + goto err_ipmap; + + return 0; + +err_ipmap: + rpc_proc_exit(net); +err_proc: + return err; +} + +static __net_exit void sunrpc_exit_net(struct net *net) +{ + ip_map_cache_destroy(net); + rpc_proc_exit(net); +} + +static struct pernet_operations sunrpc_net_ops = { + .init = sunrpc_init_net, + .exit = sunrpc_exit_net, + .id = &sunrpc_net_id, + .size = sizeof(struct sunrpc_net), +}; + +extern struct cache_detail unix_gid_cache; + +extern void cleanup_rpcb_clnt(void); + +static int __init +init_sunrpc(void) +{ + int err = register_rpc_pipefs(); + if (err) + goto out; + err = rpc_init_mempool(); + if (err) + goto out2; + err = rpcauth_init_module(); + if (err) + goto out3; + + cache_initialize(); + + err = register_pernet_subsys(&sunrpc_net_ops); + if (err) + goto out4; +#ifdef RPC_DEBUG + rpc_register_sysctl(); +#endif + cache_register(&unix_gid_cache); + svc_init_xprt_sock(); /* svc sock transport */ + init_socket_xprt(); /* clnt sock transport */ + return 0; + +out4: + rpcauth_remove_module(); +out3: + rpc_destroy_mempool(); +out2: + unregister_rpc_pipefs(); +out: + return err; +} + +static void __exit +cleanup_sunrpc(void) +{ + cleanup_rpcb_clnt(); + rpcauth_remove_module(); + cleanup_socket_xprt(); + svc_cleanup_xprt_sock(); + unregister_rpc_pipefs(); + rpc_destroy_mempool(); + cache_unregister(&unix_gid_cache); + unregister_pernet_subsys(&sunrpc_net_ops); +#ifdef RPC_DEBUG + rpc_unregister_sysctl(); +#endif + rcu_barrier(); /* Wait for completion of call_rcu()'s */ +} +MODULE_LICENSE("GPL"); +fs_initcall(init_sunrpc); /* Ensure we're initialised before nfs */ +module_exit(cleanup_sunrpc); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c new file mode 100644 index 00000000..ce5f111f --- /dev/null +++ b/net/sunrpc/svc.c @@ -0,0 +1,1323 @@ +/* + * linux/net/sunrpc/svc.c + * + * High-level RPC service routines + * + * Copyright (C) 1995, 1996 Olaf Kirch + * + * Multiple threads pools and NUMAisation + * Copyright (c) 2006 Silicon Graphics, Inc. + * by Greg Banks + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCDSP + +static void svc_unregister(const struct svc_serv *serv); + +#define svc_serv_is_pooled(serv) ((serv)->sv_function) + +/* + * Mode for mapping cpus to pools. + */ +enum { + SVC_POOL_AUTO = -1, /* choose one of the others */ + SVC_POOL_GLOBAL, /* no mapping, just a single global pool + * (legacy & UP mode) */ + SVC_POOL_PERCPU, /* one pool per cpu */ + SVC_POOL_PERNODE /* one pool per numa node */ +}; +#define SVC_POOL_DEFAULT SVC_POOL_GLOBAL + +/* + * Structure for mapping cpus to pools and vice versa. + * Setup once during sunrpc initialisation. 
+ */ +static struct svc_pool_map { + int count; /* How many svc_servs use us */ + int mode; /* Note: int not enum to avoid + * warnings about "enumeration value + * not handled in switch" */ + unsigned int npools; + unsigned int *pool_to; /* maps pool id to cpu or node */ + unsigned int *to_pool; /* maps cpu or node to pool id */ +} svc_pool_map = { + .count = 0, + .mode = SVC_POOL_DEFAULT +}; +static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */ + +static int +param_set_pool_mode(const char *val, struct kernel_param *kp) +{ + int *ip = (int *)kp->arg; + struct svc_pool_map *m = &svc_pool_map; + int err; + + mutex_lock(&svc_pool_map_mutex); + + err = -EBUSY; + if (m->count) + goto out; + + err = 0; + if (!strncmp(val, "auto", 4)) + *ip = SVC_POOL_AUTO; + else if (!strncmp(val, "global", 6)) + *ip = SVC_POOL_GLOBAL; + else if (!strncmp(val, "percpu", 6)) + *ip = SVC_POOL_PERCPU; + else if (!strncmp(val, "pernode", 7)) + *ip = SVC_POOL_PERNODE; + else + err = -EINVAL; + +out: + mutex_unlock(&svc_pool_map_mutex); + return err; +} + +static int +param_get_pool_mode(char *buf, struct kernel_param *kp) +{ + int *ip = (int *)kp->arg; + + switch (*ip) + { + case SVC_POOL_AUTO: + return strlcpy(buf, "auto", 20); + case SVC_POOL_GLOBAL: + return strlcpy(buf, "global", 20); + case SVC_POOL_PERCPU: + return strlcpy(buf, "percpu", 20); + case SVC_POOL_PERNODE: + return strlcpy(buf, "pernode", 20); + default: + return sprintf(buf, "%d", *ip); + } +} + +module_param_call(pool_mode, param_set_pool_mode, param_get_pool_mode, + &svc_pool_map.mode, 0644); + +/* + * Detect best pool mapping mode heuristically, + * according to the machine's topology. + */ +static int +svc_pool_map_choose_mode(void) +{ + unsigned int node; + + if (nr_online_nodes > 1) { + /* + * Actually have multiple NUMA nodes, + * so split pools on NUMA node boundaries + */ + return SVC_POOL_PERNODE; + } + + node = first_online_node; + if (nr_cpus_node(node) > 2) { + /* + * Non-trivial SMP, or CONFIG_NUMA on + * non-NUMA hardware, e.g. with a generic + * x86_64 kernel on Xeons. In this case we + * want to divide the pools on cpu boundaries. + */ + return SVC_POOL_PERCPU; + } + + /* default: one global pool */ + return SVC_POOL_GLOBAL; +} + +/* + * Allocate the to_pool[] and pool_to[] arrays. + * Returns 0 on success or an errno. + */ +static int +svc_pool_map_alloc_arrays(struct svc_pool_map *m, unsigned int maxpools) +{ + m->to_pool = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL); + if (!m->to_pool) + goto fail; + m->pool_to = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL); + if (!m->pool_to) + goto fail_free; + + return 0; + +fail_free: + kfree(m->to_pool); + m->to_pool = NULL; +fail: + return -ENOMEM; +} + +/* + * Initialise the pool map for SVC_POOL_PERCPU mode. + * Returns number of pools or <0 on error. + */ +static int +svc_pool_map_init_percpu(struct svc_pool_map *m) +{ + unsigned int maxpools = nr_cpu_ids; + unsigned int pidx = 0; + unsigned int cpu; + int err; + + err = svc_pool_map_alloc_arrays(m, maxpools); + if (err) + return err; + + for_each_online_cpu(cpu) { + BUG_ON(pidx > maxpools); + m->to_pool[cpu] = pidx; + m->pool_to[pidx] = cpu; + pidx++; + } + /* cpus brought online later all get mapped to pool0, sorry */ + + return pidx; +}; + + +/* + * Initialise the pool map for SVC_POOL_PERNODE mode. + * Returns number of pools or <0 on error. 
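svc_pool_map_alloc_arrays() and svc_pool_map_init_percpu() above build two mirrored lookup tables: to_pool[] answers "which pool serves this cpu (or node)?" and pool_to[] answers "which cpu (or node) does this pool belong to?". In percpu mode the mapping is one to one, while the pernode variant below collapses every cpu of a node onto a single pool. A trivial user-space rendering of the percpu case (illustrative only):

#include <stdio.h>

#define DEMO_NR_CPUS 4

int main(void)
{
	unsigned int to_pool[DEMO_NR_CPUS], pool_to[DEMO_NR_CPUS];
	unsigned int pidx = 0;

	for (unsigned int cpu = 0; cpu < DEMO_NR_CPUS; cpu++) {
		to_pool[cpu] = pidx;	/* cpu -> pool id */
		pool_to[pidx] = cpu;	/* pool id -> cpu */
		pidx++;
	}

	for (unsigned int cpu = 0; cpu < DEMO_NR_CPUS; cpu++)
		printf("cpu %u -> pool %u -> cpu %u\n",
		       cpu, to_pool[cpu], pool_to[to_pool[cpu]]);
	return 0;
}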
+ */ +static int +svc_pool_map_init_pernode(struct svc_pool_map *m) +{ + unsigned int maxpools = nr_node_ids; + unsigned int pidx = 0; + unsigned int node; + int err; + + err = svc_pool_map_alloc_arrays(m, maxpools); + if (err) + return err; + + for_each_node_with_cpus(node) { + /* some architectures (e.g. SN2) have cpuless nodes */ + BUG_ON(pidx > maxpools); + m->to_pool[node] = pidx; + m->pool_to[pidx] = node; + pidx++; + } + /* nodes brought online later all get mapped to pool0, sorry */ + + return pidx; +} + + +/* + * Add a reference to the global map of cpus to pools (and + * vice versa). Initialise the map if we're the first user. + * Returns the number of pools. + */ +static unsigned int +svc_pool_map_get(void) +{ + struct svc_pool_map *m = &svc_pool_map; + int npools = -1; + + mutex_lock(&svc_pool_map_mutex); + + if (m->count++) { + mutex_unlock(&svc_pool_map_mutex); + return m->npools; + } + + if (m->mode == SVC_POOL_AUTO) + m->mode = svc_pool_map_choose_mode(); + + switch (m->mode) { + case SVC_POOL_PERCPU: + npools = svc_pool_map_init_percpu(m); + break; + case SVC_POOL_PERNODE: + npools = svc_pool_map_init_pernode(m); + break; + } + + if (npools < 0) { + /* default, or memory allocation failure */ + npools = 1; + m->mode = SVC_POOL_GLOBAL; + } + m->npools = npools; + + mutex_unlock(&svc_pool_map_mutex); + return m->npools; +} + + +/* + * Drop a reference to the global map of cpus to pools. + * When the last reference is dropped, the map data is + * freed; this allows the sysadmin to change the pool + * mode using the pool_mode module option without + * rebooting or re-loading sunrpc.ko. + */ +static void +svc_pool_map_put(void) +{ + struct svc_pool_map *m = &svc_pool_map; + + mutex_lock(&svc_pool_map_mutex); + + if (!--m->count) { + m->mode = SVC_POOL_DEFAULT; + kfree(m->to_pool); + m->to_pool = NULL; + kfree(m->pool_to); + m->pool_to = NULL; + m->npools = 0; + } + + mutex_unlock(&svc_pool_map_mutex); +} + + +/* + * Set the given thread's cpus_allowed mask so that it + * will only run on cpus in the given pool. + */ +static inline void +svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx) +{ + struct svc_pool_map *m = &svc_pool_map; + unsigned int node = m->pool_to[pidx]; + + /* + * The caller checks for sv_nrpools > 1, which + * implies that we've been initialized. + */ + BUG_ON(m->count == 0); + + switch (m->mode) { + case SVC_POOL_PERCPU: + { + set_cpus_allowed_ptr(task, cpumask_of(node)); + break; + } + case SVC_POOL_PERNODE: + { + set_cpus_allowed_ptr(task, cpumask_of_node(node)); + break; + } + } +} + +/* + * Use the mapping mode to choose a pool for a given CPU. + * Used when enqueueing an incoming RPC. Always returns + * a non-NULL pool pointer. + */ +struct svc_pool * +svc_pool_for_cpu(struct svc_serv *serv, int cpu) +{ + struct svc_pool_map *m = &svc_pool_map; + unsigned int pidx = 0; + + /* + * An uninitialised map happens in a pure client when + * lockd is brought up, so silently treat it the + * same as SVC_POOL_GLOBAL. 
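svc_pool_map_get() and svc_pool_map_put() above implement a refcounted, lazily initialised singleton: the first user probes the topology and builds the arrays, later users only bump the count, and the last one out frees everything so pool_mode can be changed without reloading the module. A compact user-space analogue of that pattern using a pthread mutex (illustrative only):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;
static int demo_count;
static int demo_npools;

static int demo_map_get(void)
{
	pthread_mutex_lock(&demo_lock);
	if (demo_count++ == 0)
		demo_npools = 2;	/* stands in for the real mode probing */
	pthread_mutex_unlock(&demo_lock);
	return demo_npools;
}

static void demo_map_put(void)
{
	pthread_mutex_lock(&demo_lock);
	if (--demo_count == 0)
		demo_npools = 0;	/* stands in for freeing to_pool[]/pool_to[] */
	pthread_mutex_unlock(&demo_lock);
}

int main(void)
{
	printf("first get: %d pools\n", demo_map_get());
	printf("second get: %d pools\n", demo_map_get());
	demo_map_put();
	demo_map_put();
	return 0;
}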
+ */ + if (svc_serv_is_pooled(serv)) { + switch (m->mode) { + case SVC_POOL_PERCPU: + pidx = m->to_pool[cpu]; + break; + case SVC_POOL_PERNODE: + pidx = m->to_pool[cpu_to_node(cpu)]; + break; + } + } + return &serv->sv_pools[pidx % serv->sv_nrpools]; +} + + +/* + * Create an RPC service + */ +static struct svc_serv * +__svc_create(struct svc_program *prog, unsigned int bufsize, int npools, + void (*shutdown)(struct svc_serv *serv)) +{ + struct svc_serv *serv; + unsigned int vers; + unsigned int xdrsize; + unsigned int i; + + if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL))) + return NULL; + serv->sv_name = prog->pg_name; + serv->sv_program = prog; + serv->sv_nrthreads = 1; + serv->sv_stats = prog->pg_stats; + if (bufsize > RPCSVC_MAXPAYLOAD) + bufsize = RPCSVC_MAXPAYLOAD; + serv->sv_max_payload = bufsize? bufsize : 4096; + serv->sv_max_mesg = roundup(serv->sv_max_payload + PAGE_SIZE, PAGE_SIZE); + serv->sv_shutdown = shutdown; + xdrsize = 0; + while (prog) { + prog->pg_lovers = prog->pg_nvers-1; + for (vers=0; vers<prog->pg_nvers ; vers++) + if (prog->pg_vers[vers]) { + prog->pg_hivers = vers; + if (prog->pg_lovers > vers) + prog->pg_lovers = vers; + if (prog->pg_vers[vers]->vs_xdrsize > xdrsize) + xdrsize = prog->pg_vers[vers]->vs_xdrsize; + } + prog = prog->pg_next; + } + serv->sv_xdrsize = xdrsize; + INIT_LIST_HEAD(&serv->sv_tempsocks); + INIT_LIST_HEAD(&serv->sv_permsocks); + init_timer(&serv->sv_temptimer); + spin_lock_init(&serv->sv_lock); + + serv->sv_nrpools = npools; + serv->sv_pools = + kcalloc(serv->sv_nrpools, sizeof(struct svc_pool), + GFP_KERNEL); + if (!serv->sv_pools) { + kfree(serv); + return NULL; + } + + for (i = 0; i < serv->sv_nrpools; i++) { + struct svc_pool *pool = &serv->sv_pools[i]; + + dprintk("svc: initialising pool %u for %s\n", + i, serv->sv_name); + + pool->sp_id = i; + INIT_LIST_HEAD(&pool->sp_threads); + INIT_LIST_HEAD(&pool->sp_sockets); + INIT_LIST_HEAD(&pool->sp_all_threads); + spin_lock_init(&pool->sp_lock); + } + + /* Remove any stale portmap registrations */ + svc_unregister(serv); + + return serv; +} + +struct svc_serv * +svc_create(struct svc_program *prog, unsigned int bufsize, + void (*shutdown)(struct svc_serv *serv)) +{ + return __svc_create(prog, bufsize, /*npools*/1, shutdown); +} +EXPORT_SYMBOL_GPL(svc_create); + +struct svc_serv * +svc_create_pooled(struct svc_program *prog, unsigned int bufsize, + void (*shutdown)(struct svc_serv *serv), + svc_thread_fn func, struct module *mod) +{ + struct svc_serv *serv; + unsigned int npools = svc_pool_map_get(); + + serv = __svc_create(prog, bufsize, npools, shutdown); + + if (serv != NULL) { + serv->sv_function = func; + serv->sv_module = mod; + } + + return serv; +} +EXPORT_SYMBOL_GPL(svc_create_pooled); + +/* + * Destroy an RPC service. Should be called with appropriate locking to + * protect the sv_nrthreads, sv_permsocks and sv_tempsocks.
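The buffer sizing in __svc_create() above first clamps bufsize to RPCSVC_MAXPAYLOAD, then adds one extra page for the reply and rounds the result up to whole pages for sv_max_mesg. A small stand-alone example of that arithmetic, assuming 4 KiB pages and omitting the clamp (illustrative only):

#include <stdio.h>

#define DEMO_PAGE_SIZE 4096UL
#define demo_roundup(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	unsigned long bufsizes[] = { 0, 6000, 1048576 };

	for (int i = 0; i < 3; i++) {
		unsigned long payload = bufsizes[i] ? bufsizes[i] : 4096;
		unsigned long mesg = demo_roundup(payload + DEMO_PAGE_SIZE,
						  DEMO_PAGE_SIZE);

		printf("bufsize %7lu -> sv_max_payload %7lu, sv_max_mesg %7lu\n",
		       bufsizes[i], payload, mesg);
	}
	return 0;
}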
+ */ +void +svc_destroy(struct svc_serv *serv) +{ + dprintk("svc: svc_destroy(%s, %d)\n", + serv->sv_program->pg_name, + serv->sv_nrthreads); + + if (serv->sv_nrthreads) { + if (--(serv->sv_nrthreads) != 0) { + svc_sock_update_bufs(serv); + return; + } + } else + printk("svc_destroy: no threads for serv=%p!\n", serv); + + del_timer_sync(&serv->sv_temptimer); + /* + * The set of xprts (contained in the sv_tempsocks and + * sv_permsocks lists) is now constant, since it is modified + * only by accepting new sockets (done by service threads in + * svc_recv) or aging old ones (done by sv_temptimer), or + * configuration changes (excluded by whatever locking the + * caller is using--nfsd_mutex in the case of nfsd). So it's + * safe to traverse those lists and shut everything down: + */ + svc_close_all(serv); + + if (serv->sv_shutdown) + serv->sv_shutdown(serv); + + cache_clean_deferred(serv); + + if (svc_serv_is_pooled(serv)) + svc_pool_map_put(); + + svc_unregister(serv); + kfree(serv->sv_pools); + kfree(serv); +} +EXPORT_SYMBOL_GPL(svc_destroy); + +/* + * Allocate an RPC server's buffer space. + * We allocate pages and place them in rq_argpages. + */ +static int +svc_init_buffer(struct svc_rqst *rqstp, unsigned int size) +{ + unsigned int pages, arghi; + + /* bc_xprt uses fore channel allocated buffers */ + if (svc_is_backchannel(rqstp)) + return 1; + + pages = size / PAGE_SIZE + 1; /* extra page as we hold both request and reply. + * We assume one is at most one page + */ + arghi = 0; + BUG_ON(pages > RPCSVC_MAXPAGES); + while (pages) { + struct page *p = alloc_page(GFP_KERNEL); + if (!p) + break; + rqstp->rq_pages[arghi++] = p; + pages--; + } + return pages == 0; +} + +/* + * Release an RPC server buffer + */ +static void +svc_release_buffer(struct svc_rqst *rqstp) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(rqstp->rq_pages); i++) + if (rqstp->rq_pages[i]) + put_page(rqstp->rq_pages[i]); +} + +struct svc_rqst * +svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool) +{ + struct svc_rqst *rqstp; + + rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL); + if (!rqstp) + goto out_enomem; + + init_waitqueue_head(&rqstp->rq_wait); + + serv->sv_nrthreads++; + spin_lock_bh(&pool->sp_lock); + pool->sp_nrthreads++; + list_add(&rqstp->rq_all, &pool->sp_all_threads); + spin_unlock_bh(&pool->sp_lock); + rqstp->rq_server = serv; + rqstp->rq_pool = pool; + + rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL); + if (!rqstp->rq_argp) + goto out_thread; + + rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL); + if (!rqstp->rq_resp) + goto out_thread; + + if (!svc_init_buffer(rqstp, serv->sv_max_mesg)) + goto out_thread; + + return rqstp; +out_thread: + svc_exit_thread(rqstp); +out_enomem: + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL_GPL(svc_prepare_thread); + +/* + * Choose a pool in which to create a new thread, for svc_set_num_threads + */ +static inline struct svc_pool * +choose_pool(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) +{ + if (pool != NULL) + return pool; + + return &serv->sv_pools[(*state)++ % serv->sv_nrpools]; +} + +/* + * Choose a thread to kill, for svc_set_num_threads + */ +static inline struct task_struct * +choose_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) +{ + unsigned int i; + struct task_struct *task = NULL; + + if (pool != NULL) { + spin_lock_bh(&pool->sp_lock); + } else { + /* choose a pool in round-robin fashion */ + for (i = 0; i < serv->sv_nrpools; i++) { + pool = &serv->sv_pools[--(*state) % serv->sv_nrpools]; + 
spin_lock_bh(&pool->sp_lock); + if (!list_empty(&pool->sp_all_threads)) + goto found_pool; + spin_unlock_bh(&pool->sp_lock); + } + return NULL; + } + +found_pool: + if (!list_empty(&pool->sp_all_threads)) { + struct svc_rqst *rqstp; + + /* + * Remove from the pool->sp_all_threads list + * so we don't try to kill it again. + */ + rqstp = list_entry(pool->sp_all_threads.next, struct svc_rqst, rq_all); + list_del_init(&rqstp->rq_all); + task = rqstp->rq_task; + } + spin_unlock_bh(&pool->sp_lock); + + return task; +} + +/* + * Create or destroy enough new threads to make the number + * of threads the given number. If `pool' is non-NULL, applies + * only to threads in that pool, otherwise round-robins between + * all pools. Must be called with a svc_get() reference and + * the BKL or another lock to protect access to svc_serv fields. + * + * Destroying threads relies on the service threads filling in + * rqstp->rq_task, which only the nfs ones do. Assumes the serv + * has been created using svc_create_pooled(). + * + * Based on code that used to be in nfsd_svc() but tweaked + * to be pool-aware. + */ +int +svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +{ + struct svc_rqst *rqstp; + struct task_struct *task; + struct svc_pool *chosen_pool; + int error = 0; + unsigned int state = serv->sv_nrthreads-1; + + if (pool == NULL) { + /* The -1 assumes caller has done a svc_get() */ + nrservs -= (serv->sv_nrthreads-1); + } else { + spin_lock_bh(&pool->sp_lock); + nrservs -= pool->sp_nrthreads; + spin_unlock_bh(&pool->sp_lock); + } + + /* create new threads */ + while (nrservs > 0) { + nrservs--; + chosen_pool = choose_pool(serv, pool, &state); + + rqstp = svc_prepare_thread(serv, chosen_pool); + if (IS_ERR(rqstp)) { + error = PTR_ERR(rqstp); + break; + } + + __module_get(serv->sv_module); + task = kthread_create(serv->sv_function, rqstp, serv->sv_name); + if (IS_ERR(task)) { + error = PTR_ERR(task); + module_put(serv->sv_module); + svc_exit_thread(rqstp); + break; + } + + rqstp->rq_task = task; + if (serv->sv_nrpools > 1) + svc_pool_map_set_cpumask(task, chosen_pool->sp_id); + + svc_sock_update_bufs(serv); + wake_up_process(task); + } + /* destroy old threads */ + while (nrservs < 0 && + (task = choose_victim(serv, pool, &state)) != NULL) { + send_sig(SIGINT, task, 1); + nrservs++; + } + + return error; +} +EXPORT_SYMBOL_GPL(svc_set_num_threads); + +/* + * Called from a server thread as it's exiting. Caller must hold the BKL or + * the "service mutex", whichever is appropriate for the service. + */ +void +svc_exit_thread(struct svc_rqst *rqstp) +{ + struct svc_serv *serv = rqstp->rq_server; + struct svc_pool *pool = rqstp->rq_pool; + + svc_release_buffer(rqstp); + kfree(rqstp->rq_resp); + kfree(rqstp->rq_argp); + kfree(rqstp->rq_auth_data); + + spin_lock_bh(&pool->sp_lock); + pool->sp_nrthreads--; + list_del(&rqstp->rq_all); + spin_unlock_bh(&pool->sp_lock); + + kfree(rqstp); + + /* Release the server */ + if (serv) + svc_destroy(serv); +} +EXPORT_SYMBOL_GPL(svc_exit_thread); + +/* + * Register an "inet" protocol family netid with the local + * rpcbind daemon via an rpcbind v4 SET request. + * + * No netconfig infrastructure is available in the kernel, so + * we map IP_ protocol numbers to netids by hand. + * + * Returns zero on success; a negative errno value is returned + * if any error occurs. 
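svc_set_num_threads() above is the single knob for both growing and shrinking a service: a positive difference from the current thread count spawns kthreads, while a negative one picks victims and sends them SIGINT. Below is a hedged sketch of how a caller might drive it for a service created with svc_create_pooled(); demo_resize_service() is hypothetical, and the caller is assumed to hold the reference and serialisation the comment above requires:

/* Illustrative sketch only: resize a hypothetical pooled service. */
static int demo_resize_service(struct svc_serv *serv, int nrservs)
{
	/* NULL pool: distribute threads round-robin over all pools. */
	return svc_set_num_threads(serv, NULL, nrservs);
}

/* e.g. demo_resize_service(serv, 8) to run eight threads in total,
 * and demo_resize_service(serv, 0) to ask all of them to exit. */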
+ */ +static int __svc_rpcb_register4(const u32 program, const u32 version, + const unsigned short protocol, + const unsigned short port) +{ + const struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_ANY), + .sin_port = htons(port), + }; + const char *netid; + int error; + + switch (protocol) { + case IPPROTO_UDP: + netid = RPCBIND_NETID_UDP; + break; + case IPPROTO_TCP: + netid = RPCBIND_NETID_TCP; + break; + default: + return -ENOPROTOOPT; + } + + error = rpcb_v4_register(program, version, + (const struct sockaddr *)&sin, netid); + + /* + * User space didn't support rpcbind v4, so retry this + * registration request with the legacy rpcbind v2 protocol. + */ + if (error == -EPROTONOSUPPORT) + error = rpcb_register(program, version, protocol, port); + + return error; +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +/* + * Register an "inet6" protocol family netid with the local + * rpcbind daemon via an rpcbind v4 SET request. + * + * No netconfig infrastructure is available in the kernel, so + * we map IP_ protocol numbers to netids by hand. + * + * Returns zero on success; a negative errno value is returned + * if any error occurs. + */ +static int __svc_rpcb_register6(const u32 program, const u32 version, + const unsigned short protocol, + const unsigned short port) +{ + const struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ANY_INIT, + .sin6_port = htons(port), + }; + const char *netid; + int error; + + switch (protocol) { + case IPPROTO_UDP: + netid = RPCBIND_NETID_UDP6; + break; + case IPPROTO_TCP: + netid = RPCBIND_NETID_TCP6; + break; + default: + return -ENOPROTOOPT; + } + + error = rpcb_v4_register(program, version, + (const struct sockaddr *)&sin6, netid); + + /* + * User space didn't support rpcbind version 4, so we won't + * use a PF_INET6 listener. + */ + if (error == -EPROTONOSUPPORT) + error = -EAFNOSUPPORT; + + return error; +} +#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ + +/* + * Register a kernel RPC service via rpcbind version 4. + * + * Returns zero on success; a negative errno value is returned + * if any error occurs. 
+ */ +static int __svc_register(const char *progname, + const u32 program, const u32 version, + const int family, + const unsigned short protocol, + const unsigned short port) +{ + int error = -EAFNOSUPPORT; + + switch (family) { + case PF_INET: + error = __svc_rpcb_register4(program, version, + protocol, port); + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case PF_INET6: + error = __svc_rpcb_register6(program, version, + protocol, port); +#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ + } + + if (error < 0) + printk(KERN_WARNING "svc: failed to register %sv%u RPC " + "service (errno %d).\n", progname, version, -error); + return error; +} + +/** + * svc_register - register an RPC service with the local portmapper + * @serv: svc_serv struct for the service to register + * @family: protocol family of service's listener socket + * @proto: transport protocol number to advertise + * @port: port to advertise + * + * Service is registered for any address in the passed-in protocol family + */ +int svc_register(const struct svc_serv *serv, const int family, + const unsigned short proto, const unsigned short port) +{ + struct svc_program *progp; + unsigned int i; + int error = 0; + + BUG_ON(proto == 0 && port == 0); + + for (progp = serv->sv_program; progp; progp = progp->pg_next) { + for (i = 0; i < progp->pg_nvers; i++) { + if (progp->pg_vers[i] == NULL) + continue; + + dprintk("svc: svc_register(%sv%d, %s, %u, %u)%s\n", + progp->pg_name, + i, + proto == IPPROTO_UDP? "udp" : "tcp", + port, + family, + progp->pg_vers[i]->vs_hidden? + " (but not telling portmap)" : ""); + + if (progp->pg_vers[i]->vs_hidden) + continue; + + error = __svc_register(progp->pg_name, progp->pg_prog, + i, family, proto, port); + if (error < 0) + break; + } + } + + return error; +} + +/* + * If user space is running rpcbind, it should take the v4 UNSET + * and clear everything for this [program, version]. If user space + * is running portmap, it will reject the v4 UNSET, but won't have + * any "inet6" entries anyway. So a PMAP_UNSET should be sufficient + * in this case to clear all existing entries for [program, version]. + */ +static void __svc_unregister(const u32 program, const u32 version, + const char *progname) +{ + int error; + + error = rpcb_v4_register(program, version, NULL, ""); + + /* + * User space didn't support rpcbind v4, so retry this + * request with the legacy rpcbind v2 protocol. + */ + if (error == -EPROTONOSUPPORT) + error = rpcb_register(program, version, 0, 0); + + dprintk("svc: %s(%sv%u), error %d\n", + __func__, progname, version, error); +} + +/* + * All netids, bind addresses and ports registered for [program, version] + * are removed from the local rpcbind database (if the service is not + * hidden) to make way for a new instance of the service. + * + * The result of unregistration is reported via dprintk for those who want + * verification of the result, but is otherwise not important. 
+ */
+static void svc_unregister(const struct svc_serv *serv)
+{
+	struct svc_program *progp;
+	unsigned long flags;
+	unsigned int i;
+
+	clear_thread_flag(TIF_SIGPENDING);
+
+	for (progp = serv->sv_program; progp; progp = progp->pg_next) {
+		for (i = 0; i < progp->pg_nvers; i++) {
+			if (progp->pg_vers[i] == NULL)
+				continue;
+			if (progp->pg_vers[i]->vs_hidden)
+				continue;
+
+			dprintk("svc: attempting to unregister %sv%u\n",
+				progp->pg_name, i);
+			__svc_unregister(progp->pg_prog, i, progp->pg_name);
+		}
+	}
+
+	spin_lock_irqsave(&current->sighand->siglock, flags);
+	recalc_sigpending();
+	spin_unlock_irqrestore(&current->sighand->siglock, flags);
+}
+
+/*
+ * Printk the given error with the address of the client that caused it.
+ */
+static int
+__attribute__ ((format (printf, 2, 3)))
+svc_printk(struct svc_rqst *rqstp, const char *fmt, ...)
+{
+	va_list args;
+	int r;
+	char buf[RPC_MAX_ADDRBUFLEN];
+
+	if (!net_ratelimit())
+		return 0;
+
+	printk(KERN_WARNING "svc: %s: ",
+		svc_print_addr(rqstp, buf, sizeof(buf)));
+
+	va_start(args, fmt);
+	r = vprintk(fmt, args);
+	va_end(args);
+
+	return r;
+}
+
+/*
+ * Common routine for processing the RPC request.
+ */
+static int
+svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
+{
+	struct svc_program *progp;
+	struct svc_version *versp = NULL;	/* compiler food */
+	struct svc_procedure *procp = NULL;
+	struct svc_serv *serv = rqstp->rq_server;
+	kxdrproc_t xdr;
+	__be32 *statp;
+	u32 prog, vers, proc;
+	__be32 auth_stat, rpc_stat;
+	int auth_res;
+	__be32 *reply_statp;
+
+	rpc_stat = rpc_success;
+
+	if (argv->iov_len < 6*4)
+		goto err_short_len;
+
+	/* Will be turned off only in gss privacy case: */
+	rqstp->rq_splice_ok = 1;
+	/* Will be turned off only when NFSv4 Sessions are used */
+	rqstp->rq_usedeferral = 1;
+	rqstp->rq_dropme = false;
+
+	/* Setup reply header */
+	rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp);
+
+	svc_putu32(resv, rqstp->rq_xid);
+
+	vers = svc_getnl(argv);
+
+	/* First words of reply: */
+	svc_putnl(resv, 1);		/* REPLY */
+
+	if (vers != 2)		/* RPC version number */
+		goto err_bad_rpc;
+
+	/* Save position in case we later decide to reject: */
+	reply_statp = resv->iov_base + resv->iov_len;
+
+	svc_putnl(resv, 0);		/* ACCEPT */
+
+	rqstp->rq_prog = prog = svc_getnl(argv);	/* program number */
+	rqstp->rq_vers = vers = svc_getnl(argv);	/* version number */
+	rqstp->rq_proc = proc = svc_getnl(argv);	/* procedure number */
+
+	progp = serv->sv_program;
+
+	for (progp = serv->sv_program; progp; progp = progp->pg_next)
+		if (prog == progp->pg_prog)
+			break;
+
+	/*
+	 * Decode auth data, and add verifier to reply buffer.
+	 * We do this before anything else in order to get a decent
+	 * auth verifier.
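At this point svc_process_common() has consumed the fixed words of the RPC call header (the caller already read the xid and direction; the RPC version, program, version and procedure are read here). The following standalone sketch shows that on-the-wire layout, assuming a hypothetical rpc_call_hdr struct and a flat in-memory buffer rather than the kernel's kvec.

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct rpc_call_hdr {
	uint32_t xid, dir, rpcvers, prog, vers, proc;
};

/* Parse the six leading big-endian words of an RPC call (RFC 5531),
 * with a short-length check akin to the 6*4 test above. */
static int parse_call_hdr(const unsigned char *buf, size_t len,
			  struct rpc_call_hdr *hdr)
{
	uint32_t w[6];

	if (len < sizeof(w))
		return -1;
	memcpy(w, buf, sizeof(w));
	hdr->xid     = ntohl(w[0]);
	hdr->dir     = ntohl(w[1]);  /* 0 = CALL, 1 = REPLY */
	hdr->rpcvers = ntohl(w[2]);  /* must be 2 */
	hdr->prog    = ntohl(w[3]);
	hdr->vers    = ntohl(w[4]);
	hdr->proc    = ntohl(w[5]);
	if (hdr->dir != 0 || hdr->rpcvers != 2)
		return -1;
	return 0;
}

int main(void)
{
	unsigned char msg[24] = {0};
	struct rpc_call_hdr h;

	msg[11] = 2;                 /* rpcvers = 2, everything else zero */
	printf("parse: %d\n", parse_call_hdr(msg, sizeof(msg), &h));
	return 0;
}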
+ */ + auth_res = svc_authenticate(rqstp, &auth_stat); + /* Also give the program a chance to reject this call: */ + if (auth_res == SVC_OK && progp) { + auth_stat = rpc_autherr_badcred; + auth_res = progp->pg_authenticate(rqstp); + } + switch (auth_res) { + case SVC_OK: + break; + case SVC_GARBAGE: + goto err_garbage; + case SVC_SYSERR: + rpc_stat = rpc_system_err; + goto err_bad; + case SVC_DENIED: + goto err_bad_auth; + case SVC_CLOSE: + if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags)) + svc_close_xprt(rqstp->rq_xprt); + case SVC_DROP: + goto dropit; + case SVC_COMPLETE: + goto sendit; + } + + if (progp == NULL) + goto err_bad_prog; + + if (vers >= progp->pg_nvers || + !(versp = progp->pg_vers[vers])) + goto err_bad_vers; + + procp = versp->vs_proc + proc; + if (proc >= versp->vs_nproc || !procp->pc_func) + goto err_bad_proc; + rqstp->rq_procinfo = procp; + + /* Syntactic check complete */ + serv->sv_stats->rpccnt++; + + /* Build the reply header. */ + statp = resv->iov_base +resv->iov_len; + svc_putnl(resv, RPC_SUCCESS); + + /* Bump per-procedure stats counter */ + procp->pc_count++; + + /* Initialize storage for argp and resp */ + memset(rqstp->rq_argp, 0, procp->pc_argsize); + memset(rqstp->rq_resp, 0, procp->pc_ressize); + + /* un-reserve some of the out-queue now that we have a + * better idea of reply size + */ + if (procp->pc_xdrressize) + svc_reserve_auth(rqstp, procp->pc_xdrressize<<2); + + /* Call the function that processes the request. */ + if (!versp->vs_dispatch) { + /* Decode arguments */ + xdr = procp->pc_decode; + if (xdr && !xdr(rqstp, argv->iov_base, rqstp->rq_argp)) + goto err_garbage; + + *statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); + + /* Encode reply */ + if (rqstp->rq_dropme) { + if (procp->pc_release) + procp->pc_release(rqstp, NULL, rqstp->rq_resp); + goto dropit; + } + if (*statp == rpc_success && + (xdr = procp->pc_encode) && + !xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) { + dprintk("svc: failed to encode reply\n"); + /* serv->sv_stats->rpcsystemerr++; */ + *statp = rpc_system_err; + } + } else { + dprintk("svc: calling dispatcher\n"); + if (!versp->vs_dispatch(rqstp, statp)) { + /* Release reply info */ + if (procp->pc_release) + procp->pc_release(rqstp, NULL, rqstp->rq_resp); + goto dropit; + } + } + + /* Check RPC status result */ + if (*statp != rpc_success) + resv->iov_len = ((void*)statp) - resv->iov_base + 4; + + /* Release reply info */ + if (procp->pc_release) + procp->pc_release(rqstp, NULL, rqstp->rq_resp); + + if (procp->pc_encode == NULL) + goto dropit; + + sendit: + if (svc_authorise(rqstp)) + goto dropit; + return 1; /* Caller can now send it */ + + dropit: + svc_authorise(rqstp); /* doesn't hurt to call this twice */ + dprintk("svc: svc_process dropit\n"); + return 0; + +err_short_len: + svc_printk(rqstp, "short len %Zd, dropping request\n", + argv->iov_len); + + goto dropit; /* drop request */ + +err_bad_rpc: + serv->sv_stats->rpcbadfmt++; + svc_putnl(resv, 1); /* REJECT */ + svc_putnl(resv, 0); /* RPC_MISMATCH */ + svc_putnl(resv, 2); /* Only RPCv2 supported */ + svc_putnl(resv, 2); + goto sendit; + +err_bad_auth: + dprintk("svc: authentication failed (%d)\n", ntohl(auth_stat)); + serv->sv_stats->rpcbadauth++; + /* Restore write pointer to location of accept status: */ + xdr_ressize_check(rqstp, reply_statp); + svc_putnl(resv, 1); /* REJECT */ + svc_putnl(resv, 1); /* AUTH_ERROR */ + svc_putnl(resv, ntohl(auth_stat)); /* status */ + goto sendit; + +err_bad_prog: + dprintk("svc: unknown program %d\n", 
prog); + serv->sv_stats->rpcbadfmt++; + svc_putnl(resv, RPC_PROG_UNAVAIL); + goto sendit; + +err_bad_vers: + svc_printk(rqstp, "unknown version (%d for prog %d, %s)\n", + vers, prog, progp->pg_name); + + serv->sv_stats->rpcbadfmt++; + svc_putnl(resv, RPC_PROG_MISMATCH); + svc_putnl(resv, progp->pg_lovers); + svc_putnl(resv, progp->pg_hivers); + goto sendit; + +err_bad_proc: + svc_printk(rqstp, "unknown procedure (%d)\n", proc); + + serv->sv_stats->rpcbadfmt++; + svc_putnl(resv, RPC_PROC_UNAVAIL); + goto sendit; + +err_garbage: + svc_printk(rqstp, "failed to decode args\n"); + + rpc_stat = rpc_garbage_args; +err_bad: + serv->sv_stats->rpcbadfmt++; + svc_putnl(resv, ntohl(rpc_stat)); + goto sendit; +} +EXPORT_SYMBOL_GPL(svc_process); + +/* + * Process the RPC request. + */ +int +svc_process(struct svc_rqst *rqstp) +{ + struct kvec *argv = &rqstp->rq_arg.head[0]; + struct kvec *resv = &rqstp->rq_res.head[0]; + struct svc_serv *serv = rqstp->rq_server; + u32 dir; + + /* + * Setup response xdr_buf. + * Initially it has just one page + */ + rqstp->rq_resused = 1; + resv->iov_base = page_address(rqstp->rq_respages[0]); + resv->iov_len = 0; + rqstp->rq_res.pages = rqstp->rq_respages + 1; + rqstp->rq_res.len = 0; + rqstp->rq_res.page_base = 0; + rqstp->rq_res.page_len = 0; + rqstp->rq_res.buflen = PAGE_SIZE; + rqstp->rq_res.tail[0].iov_base = NULL; + rqstp->rq_res.tail[0].iov_len = 0; + + rqstp->rq_xid = svc_getu32(argv); + + dir = svc_getnl(argv); + if (dir != 0) { + /* direction != CALL */ + svc_printk(rqstp, "bad direction %d, dropping request\n", dir); + serv->sv_stats->rpcbadfmt++; + svc_drop(rqstp); + return 0; + } + + /* Returns 1 for send, 0 for drop */ + if (svc_process_common(rqstp, argv, resv)) + return svc_send(rqstp); + else { + svc_drop(rqstp); + return 0; + } +} + +#if defined(CONFIG_NFS_V4_1) +/* + * Process a backchannel RPC request that arrived over an existing + * outbound connection + */ +int +bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req, + struct svc_rqst *rqstp) +{ + struct kvec *argv = &rqstp->rq_arg.head[0]; + struct kvec *resv = &rqstp->rq_res.head[0]; + + /* Build the svc_rqst used by the common processing routine */ + rqstp->rq_xprt = serv->sv_bc_xprt; + rqstp->rq_xid = req->rq_xid; + rqstp->rq_prot = req->rq_xprt->prot; + rqstp->rq_server = serv; + + rqstp->rq_addrlen = sizeof(req->rq_xprt->addr); + memcpy(&rqstp->rq_addr, &req->rq_xprt->addr, rqstp->rq_addrlen); + memcpy(&rqstp->rq_arg, &req->rq_rcv_buf, sizeof(rqstp->rq_arg)); + memcpy(&rqstp->rq_res, &req->rq_snd_buf, sizeof(rqstp->rq_res)); + + /* reset result send buffer "put" position */ + resv->iov_len = 0; + + if (rqstp->rq_prot != IPPROTO_TCP) { + printk(KERN_ERR "No support for Non-TCP transports!\n"); + BUG(); + } + + /* + * Skip the next two words because they've already been + * processed in the trasport + */ + svc_getu32(argv); /* XID */ + svc_getnl(argv); /* CALLDIR */ + + /* Returns 1 for send, 0 for drop */ + if (svc_process_common(rqstp, argv, resv)) { + memcpy(&req->rq_snd_buf, &rqstp->rq_res, + sizeof(req->rq_snd_buf)); + return bc_send(req); + } else { + /* Nothing to do to drop request */ + return 0; + } +} +EXPORT_SYMBOL(bc_svc_process); +#endif /* CONFIG_NFS_V4_1 */ + +/* + * Return (transport-specific) limit on the rpc payload. 
+ */ +u32 svc_max_payload(const struct svc_rqst *rqstp) +{ + u32 max = rqstp->rq_xprt->xpt_class->xcl_max_payload; + + if (rqstp->rq_server->sv_max_payload < max) + max = rqstp->rq_server->sv_max_payload; + return max; +} +EXPORT_SYMBOL_GPL(svc_max_payload); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c new file mode 100644 index 00000000..9d7ed0b4 --- /dev/null +++ b/net/sunrpc/svc_xprt.c @@ -0,0 +1,1270 @@ +/* + * linux/net/sunrpc/svc_xprt.c + * + * Author: Tom Tucker + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt); +static int svc_deferred_recv(struct svc_rqst *rqstp); +static struct cache_deferred_req *svc_defer(struct cache_req *req); +static void svc_age_temp_xprts(unsigned long closure); + +/* apparently the "standard" is that clients close + * idle connections after 5 minutes, servers after + * 6 minutes + * http://www.connectathon.org/talks96/nfstcp.pdf + */ +static int svc_conn_age_period = 6*60; + +/* List of registered transport classes */ +static DEFINE_SPINLOCK(svc_xprt_class_lock); +static LIST_HEAD(svc_xprt_class_list); + +/* SMP locking strategy: + * + * svc_pool->sp_lock protects most of the fields of that pool. + * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. + * when both need to be taken (rare), svc_serv->sv_lock is first. + * BKL protects svc_serv->sv_nrthread. + * svc_sock->sk_lock protects the svc_sock->sk_deferred list + * and the ->sk_info_authunix cache. + * + * The XPT_BUSY bit in xprt->xpt_flags prevents a transport being + * enqueued multiply. During normal transport processing this bit + * is set by svc_xprt_enqueue and cleared by svc_xprt_received. + * Providers should not manipulate this bit directly. + * + * Some flags can be set to certain values at any time + * providing that certain rules are followed: + * + * XPT_CONN, XPT_DATA: + * - Can be set or cleared at any time. + * - After a set, svc_xprt_enqueue must be called to enqueue + * the transport for processing. + * - After a clear, the transport must be read/accepted. + * If this succeeds, it must be set again. + * XPT_CLOSE: + * - Can set at any time. It is never cleared. + * XPT_DEAD: + * - Can only be set while XPT_BUSY is held which ensures + * that no other thread will be using the transport or will + * try to set XPT_DEAD. 
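The XPT_BUSY rule described above is a claim-before-enqueue protocol: whoever atomically sets the bit owns the transport until svc_xprt_received() clears it, and a concurrent enqueue attempt sees the bit already set and backs off. Here is a small userspace sketch of the same guard using C11 atomics; the xprt_like type and helper names are hypothetical.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum { XPT_BUSY_BIT = 0 };

struct xprt_like {
	atomic_ulong flags;
};

/* Returns true if the caller won the race and may queue the transport;
 * a concurrent enqueuer sees the bit already set and backs off, which is
 * exactly the role test_and_set_bit(XPT_BUSY, ...) plays above. */
static bool xprt_try_claim(struct xprt_like *x)
{
	unsigned long old = atomic_fetch_or(&x->flags, 1UL << XPT_BUSY_BIT);
	return !(old & (1UL << XPT_BUSY_BIT));
}

/* The "received" side: drop the claim, then the caller re-checks for work
 * in case more data arrived while the transport was busy. */
static void xprt_release_claim(struct xprt_like *x)
{
	atomic_fetch_and(&x->flags, ~(1UL << XPT_BUSY_BIT));
}

int main(void)
{
	struct xprt_like x = { .flags = 0 };

	printf("first claim: %d, second claim: %d\n",
	       xprt_try_claim(&x), xprt_try_claim(&x));
	xprt_release_claim(&x);
	return 0;
}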
+ */ + +int svc_reg_xprt_class(struct svc_xprt_class *xcl) +{ + struct svc_xprt_class *cl; + int res = -EEXIST; + + dprintk("svc: Adding svc transport class '%s'\n", xcl->xcl_name); + + INIT_LIST_HEAD(&xcl->xcl_list); + spin_lock(&svc_xprt_class_lock); + /* Make sure there isn't already a class with the same name */ + list_for_each_entry(cl, &svc_xprt_class_list, xcl_list) { + if (strcmp(xcl->xcl_name, cl->xcl_name) == 0) + goto out; + } + list_add_tail(&xcl->xcl_list, &svc_xprt_class_list); + res = 0; +out: + spin_unlock(&svc_xprt_class_lock); + return res; +} +EXPORT_SYMBOL_GPL(svc_reg_xprt_class); + +void svc_unreg_xprt_class(struct svc_xprt_class *xcl) +{ + dprintk("svc: Removing svc transport class '%s'\n", xcl->xcl_name); + spin_lock(&svc_xprt_class_lock); + list_del_init(&xcl->xcl_list); + spin_unlock(&svc_xprt_class_lock); +} +EXPORT_SYMBOL_GPL(svc_unreg_xprt_class); + +/* + * Format the transport list for printing + */ +int svc_print_xprts(char *buf, int maxlen) +{ + struct svc_xprt_class *xcl; + char tmpstr[80]; + int len = 0; + buf[0] = '\0'; + + spin_lock(&svc_xprt_class_lock); + list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) { + int slen; + + sprintf(tmpstr, "%s %d\n", xcl->xcl_name, xcl->xcl_max_payload); + slen = strlen(tmpstr); + if (len + slen > maxlen) + break; + len += slen; + strcat(buf, tmpstr); + } + spin_unlock(&svc_xprt_class_lock); + + return len; +} + +static void svc_xprt_free(struct kref *kref) +{ + struct svc_xprt *xprt = + container_of(kref, struct svc_xprt, xpt_ref); + struct module *owner = xprt->xpt_class->xcl_owner; + if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) + svcauth_unix_info_release(xprt); + put_net(xprt->xpt_net); + /* See comment on corresponding get in xs_setup_bc_tcp(): */ + if (xprt->xpt_bc_xprt) + xprt_put(xprt->xpt_bc_xprt); + xprt->xpt_ops->xpo_free(xprt); + module_put(owner); +} + +void svc_xprt_put(struct svc_xprt *xprt) +{ + kref_put(&xprt->xpt_ref, svc_xprt_free); +} +EXPORT_SYMBOL_GPL(svc_xprt_put); + +/* + * Called by transport drivers to initialize the transport independent + * portion of the transport instance. 
+ */ +void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt, + struct svc_serv *serv) +{ + memset(xprt, 0, sizeof(*xprt)); + xprt->xpt_class = xcl; + xprt->xpt_ops = xcl->xcl_ops; + kref_init(&xprt->xpt_ref); + xprt->xpt_server = serv; + INIT_LIST_HEAD(&xprt->xpt_list); + INIT_LIST_HEAD(&xprt->xpt_ready); + INIT_LIST_HEAD(&xprt->xpt_deferred); + INIT_LIST_HEAD(&xprt->xpt_users); + mutex_init(&xprt->xpt_mutex); + spin_lock_init(&xprt->xpt_lock); + set_bit(XPT_BUSY, &xprt->xpt_flags); + rpc_init_wait_queue(&xprt->xpt_bc_pending, "xpt_bc_pending"); + xprt->xpt_net = get_net(&init_net); +} +EXPORT_SYMBOL_GPL(svc_xprt_init); + +static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, + struct svc_serv *serv, + struct net *net, + const int family, + const unsigned short port, + int flags) +{ + struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_ANY), + .sin_port = htons(port), + }; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ANY_INIT, + .sin6_port = htons(port), + }; +#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ + struct sockaddr *sap; + size_t len; + + switch (family) { + case PF_INET: + sap = (struct sockaddr *)&sin; + len = sizeof(sin); + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case PF_INET6: + sap = (struct sockaddr *)&sin6; + len = sizeof(sin6); + break; +#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ + default: + return ERR_PTR(-EAFNOSUPPORT); + } + + return xcl->xcl_ops->xpo_create(serv, net, sap, len, flags); +} + +int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, + struct net *net, const int family, + const unsigned short port, int flags) +{ + struct svc_xprt_class *xcl; + + dprintk("svc: creating transport %s[%d]\n", xprt_name, port); + spin_lock(&svc_xprt_class_lock); + list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) { + struct svc_xprt *newxprt; + unsigned short newport; + + if (strcmp(xprt_name, xcl->xcl_name)) + continue; + + if (!try_module_get(xcl->xcl_owner)) + goto err; + + spin_unlock(&svc_xprt_class_lock); + newxprt = __svc_xpo_create(xcl, serv, net, family, port, flags); + if (IS_ERR(newxprt)) { + module_put(xcl->xcl_owner); + return PTR_ERR(newxprt); + } + + clear_bit(XPT_TEMP, &newxprt->xpt_flags); + spin_lock_bh(&serv->sv_lock); + list_add(&newxprt->xpt_list, &serv->sv_permsocks); + spin_unlock_bh(&serv->sv_lock); + newport = svc_xprt_local_port(newxprt); + clear_bit(XPT_BUSY, &newxprt->xpt_flags); + return newport; + } + err: + spin_unlock(&svc_xprt_class_lock); + dprintk("svc: transport %s not found\n", xprt_name); + + /* This errno is exposed to user space. Provide a reasonable + * perror msg for a bad transport. */ + return -EPROTONOSUPPORT; +} +EXPORT_SYMBOL_GPL(svc_create_xprt); + +/* + * Copy the local and remote xprt addresses to the rqstp structure + */ +void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt) +{ + struct sockaddr *sin; + + memcpy(&rqstp->rq_addr, &xprt->xpt_remote, xprt->xpt_remotelen); + rqstp->rq_addrlen = xprt->xpt_remotelen; + + /* + * Destination address in request is needed for binding the + * source address in RPC replies/callbacks later. 
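__svc_xpo_create() above only builds a wildcard (ANY-address) sockaddr for the requested family and hands it to the transport class. For comparison, this plain-sockets sketch makes the same PF_INET/PF_INET6 decision end to end; make_any_listener() is a hypothetical helper, not part of the sunrpc code.

#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Create a TCP listener bound to the wildcard address of the given family,
 * mirroring the sockaddr_in/sockaddr_in6 switch above. */
static int make_any_listener(int family, unsigned short port)
{
	struct sockaddr_storage ss;
	socklen_t len;
	int fd;

	memset(&ss, 0, sizeof(ss));
	switch (family) {
	case PF_INET: {
		struct sockaddr_in *sin = (struct sockaddr_in *)&ss;
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = htonl(INADDR_ANY);
		sin->sin_port = htons(port);
		len = sizeof(*sin);
		break;
	}
	case PF_INET6: {
		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ss;
		sin6->sin6_family = AF_INET6;
		sin6->sin6_addr = in6addr_any;
		sin6->sin6_port = htons(port);
		len = sizeof(*sin6);
		break;
	}
	default:
		return -1;              /* -EAFNOSUPPORT in the code above */
	}

	fd = socket(family, SOCK_STREAM, 0);
	if (fd < 0)
		return -1;
	if (bind(fd, (struct sockaddr *)&ss, len) < 0 || listen(fd, 16) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}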
+ */
+	sin = (struct sockaddr *)&xprt->xpt_local;
+	switch (sin->sa_family) {
+	case AF_INET:
+		rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr;
+		break;
+	case AF_INET6:
+		rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr;
+		break;
+	}
+}
+EXPORT_SYMBOL_GPL(svc_xprt_copy_addrs);
+
+/**
+ * svc_print_addr - Format rq_addr field for printing
+ * @rqstp: svc_rqst struct containing address to print
+ * @buf: target buffer for formatted address
+ * @len: length of target buffer
+ *
+ */
+char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len)
+{
+	return __svc_print_addr(svc_addr(rqstp), buf, len);
+}
+EXPORT_SYMBOL_GPL(svc_print_addr);
+
+/*
+ * Queue up an idle server thread.  Must have pool->sp_lock held.
+ * Note: this is really a stack rather than a queue, so that we only
+ * use as many different threads as we need, and the rest don't pollute
+ * the cache.
+ */
+static void svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp)
+{
+	list_add(&rqstp->rq_list, &pool->sp_threads);
+}
+
+/*
+ * Dequeue an nfsd thread.  Must have pool->sp_lock held.
+ */
+static void svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp)
+{
+	list_del(&rqstp->rq_list);
+}
+
+static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt)
+{
+	if (xprt->xpt_flags & ((1<<XPT_CONN)|(1<<XPT_CLOSE)))
+		return true;
+	if (xprt->xpt_flags & ((1<<XPT_DATA)|(1<<XPT_DEFERRED)))
+		return xprt->xpt_ops->xpo_has_wspace(xprt);
+	return false;
+}
+
+/*
+ * Queue up a transport with data pending. If there are idle nfsd
+ * processes, wake 'em up.
+ *
+ */
+void svc_xprt_enqueue(struct svc_xprt *xprt)
+{
+	struct svc_serv *serv = xprt->xpt_server;
+	struct svc_pool *pool;
+	struct svc_rqst *rqstp;
+	int cpu;
+
+	if (!svc_xprt_has_something_to_do(xprt))
+		return;
+
+	cpu = get_cpu();
+	pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
+	put_cpu();
+
+	spin_lock_bh(&pool->sp_lock);
+
+	if (!list_empty(&pool->sp_threads) &&
+	    !list_empty(&pool->sp_sockets))
+		printk(KERN_ERR
+		       "svc_xprt_enqueue: "
+		       "threads and transports both waiting??\n");
+
+	pool->sp_stats.packets++;
+
+	/* Mark transport as busy. It will remain in this state until
+	 * the provider calls svc_xprt_received. We update XPT_BUSY
+	 * atomically because it also guards against trying to enqueue
+	 * the transport twice.
+	 */
+	if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) {
+		/* Don't enqueue transport while already enqueued */
+		dprintk("svc: transport %p busy, not enqueued\n", xprt);
+		goto out_unlock;
+	}
+
+	if (!list_empty(&pool->sp_threads)) {
+		rqstp = list_entry(pool->sp_threads.next,
+				   struct svc_rqst,
+				   rq_list);
+		dprintk("svc: transport %p served by daemon %p\n",
+			xprt, rqstp);
+		svc_thread_dequeue(pool, rqstp);
+		if (rqstp->rq_xprt)
+			printk(KERN_ERR
+				"svc_xprt_enqueue: server %p, rq_xprt=%p!\n",
+				rqstp, rqstp->rq_xprt);
+		rqstp->rq_xprt = xprt;
+		svc_xprt_get(xprt);
+		rqstp->rq_reserved = serv->sv_max_mesg;
+		atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
+		pool->sp_stats.threads_woken++;
+		wake_up(&rqstp->rq_wait);
+	} else {
+		dprintk("svc: transport %p put into queue\n", xprt);
+		list_add_tail(&xprt->xpt_ready, &pool->sp_sockets);
+		pool->sp_stats.sockets_queued++;
+	}
+
+out_unlock:
+	spin_unlock_bh(&pool->sp_lock);
+}
+EXPORT_SYMBOL_GPL(svc_xprt_enqueue);
+
+/*
+ * Dequeue the first transport. Must be called with the pool->sp_lock held.
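svc_xprt_enqueue() above makes the classic dispatcher choice: hand the ready transport directly to a parked idle worker and wake it, or queue the transport for the next worker that becomes free. Below is a condensed userspace sketch of that decision; the pool, worker and xprt types are hypothetical and far simpler than the kernel's list_head and wait-queue plumbing.

#include <pthread.h>
#include <stddef.h>

struct xprt   { struct xprt *next; };
struct worker { struct worker *next; struct xprt *handed; pthread_cond_t cv; };

struct pool {
	pthread_mutex_t lock;
	struct worker *idle;     /* stack of parked workers (LIFO, cache-warm) */
	struct xprt   *pending;  /* ready transports waiting for a worker */
};

static void pool_enqueue(struct pool *p, struct xprt *x)
{
	pthread_mutex_lock(&p->lock);
	if (p->idle) {                        /* direct handoff, sp_threads path */
		struct worker *w = p->idle;
		p->idle = w->next;
		w->handed = x;
		pthread_cond_signal(&w->cv);  /* wake_up(&rqstp->rq_wait) */
	} else {                              /* no idle worker: park the transport */
		x->next = p->pending;
		p->pending = x;               /* the kernel appends at the tail to
						 keep FIFO order; this sketch pushes
						 at the head for brevity */
	}
	pthread_mutex_unlock(&p->lock);
}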
+ */ +static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool) +{ + struct svc_xprt *xprt; + + if (list_empty(&pool->sp_sockets)) + return NULL; + + xprt = list_entry(pool->sp_sockets.next, + struct svc_xprt, xpt_ready); + list_del_init(&xprt->xpt_ready); + + dprintk("svc: transport %p dequeued, inuse=%d\n", + xprt, atomic_read(&xprt->xpt_ref.refcount)); + + return xprt; +} + +/* + * svc_xprt_received conditionally queues the transport for processing + * by another thread. The caller must hold the XPT_BUSY bit and must + * not thereafter touch transport data. + * + * Note: XPT_DATA only gets cleared when a read-attempt finds no (or + * insufficient) data. + */ +void svc_xprt_received(struct svc_xprt *xprt) +{ + BUG_ON(!test_bit(XPT_BUSY, &xprt->xpt_flags)); + /* As soon as we clear busy, the xprt could be closed and + * 'put', so we need a reference to call svc_xprt_enqueue with: + */ + svc_xprt_get(xprt); + clear_bit(XPT_BUSY, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); +} +EXPORT_SYMBOL_GPL(svc_xprt_received); + +/** + * svc_reserve - change the space reserved for the reply to a request. + * @rqstp: The request in question + * @space: new max space to reserve + * + * Each request reserves some space on the output queue of the transport + * to make sure the reply fits. This function reduces that reserved + * space to be the amount of space used already, plus @space. + * + */ +void svc_reserve(struct svc_rqst *rqstp, int space) +{ + space += rqstp->rq_res.head[0].iov_len; + + if (space < rqstp->rq_reserved) { + struct svc_xprt *xprt = rqstp->rq_xprt; + atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved); + rqstp->rq_reserved = space; + + svc_xprt_enqueue(xprt); + } +} +EXPORT_SYMBOL_GPL(svc_reserve); + +static void svc_xprt_release(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt = rqstp->rq_xprt; + + rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp); + + kfree(rqstp->rq_deferred); + rqstp->rq_deferred = NULL; + + svc_free_res_pages(rqstp); + rqstp->rq_res.page_len = 0; + rqstp->rq_res.page_base = 0; + + /* Reset response buffer and release + * the reservation. + * But first, check that enough space was reserved + * for the reply, otherwise we have a bug! + */ + if ((rqstp->rq_res.len) > rqstp->rq_reserved) + printk(KERN_ERR "RPC request reserved %d but used %d\n", + rqstp->rq_reserved, + rqstp->rq_res.len); + + rqstp->rq_res.head[0].iov_len = 0; + svc_reserve(rqstp, 0); + rqstp->rq_xprt = NULL; + + svc_xprt_put(xprt); +} + +/* + * External function to wake up a server waiting for data + * This really only makes sense for services like lockd + * which have exactly one thread anyway. 
+ */ +void svc_wake_up(struct svc_serv *serv) +{ + struct svc_rqst *rqstp; + unsigned int i; + struct svc_pool *pool; + + for (i = 0; i < serv->sv_nrpools; i++) { + pool = &serv->sv_pools[i]; + + spin_lock_bh(&pool->sp_lock); + if (!list_empty(&pool->sp_threads)) { + rqstp = list_entry(pool->sp_threads.next, + struct svc_rqst, + rq_list); + dprintk("svc: daemon %p woken up.\n", rqstp); + /* + svc_thread_dequeue(pool, rqstp); + rqstp->rq_xprt = NULL; + */ + wake_up(&rqstp->rq_wait); + } + spin_unlock_bh(&pool->sp_lock); + } +} +EXPORT_SYMBOL_GPL(svc_wake_up); + +int svc_port_is_privileged(struct sockaddr *sin) +{ + switch (sin->sa_family) { + case AF_INET: + return ntohs(((struct sockaddr_in *)sin)->sin_port) + < PROT_SOCK; + case AF_INET6: + return ntohs(((struct sockaddr_in6 *)sin)->sin6_port) + < PROT_SOCK; + default: + return 0; + } +} + +/* + * Make sure that we don't have too many active connections. If we have, + * something must be dropped. It's not clear what will happen if we allow + * "too many" connections, but when dealing with network-facing software, + * we have to code defensively. Here we do that by imposing hard limits. + * + * There's no point in trying to do random drop here for DoS + * prevention. The NFS clients does 1 reconnect in 15 seconds. An + * attacker can easily beat that. + * + * The only somewhat efficient mechanism would be if drop old + * connections from the same IP first. But right now we don't even + * record the client IP in svc_sock. + * + * single-threaded services that expect a lot of clients will probably + * need to set sv_maxconn to override the default value which is based + * on the number of threads + */ +static void svc_check_conn_limits(struct svc_serv *serv) +{ + unsigned int limit = serv->sv_maxconn ? serv->sv_maxconn : + (serv->sv_nrthreads+3) * 20; + + if (serv->sv_tmpcnt > limit) { + struct svc_xprt *xprt = NULL; + spin_lock_bh(&serv->sv_lock); + if (!list_empty(&serv->sv_tempsocks)) { + if (net_ratelimit()) { + /* Try to help the admin */ + printk(KERN_NOTICE "%s: too many open " + "connections, consider increasing %s\n", + serv->sv_name, serv->sv_maxconn ? + "the max number of connections." : + "the number of threads."); + } + /* + * Always select the oldest connection. It's not fair, + * but so is life + */ + xprt = list_entry(serv->sv_tempsocks.prev, + struct svc_xprt, + xpt_list); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_get(xprt); + } + spin_unlock_bh(&serv->sv_lock); + + if (xprt) { + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); + } + } +} + +/* + * Receive the next request on any transport. This code is carefully + * organised not to touch any cachelines in the shared svc_serv + * structure, only cachelines in the local svc_pool. + */ +int svc_recv(struct svc_rqst *rqstp, long timeout) +{ + struct svc_xprt *xprt = NULL; + struct svc_serv *serv = rqstp->rq_server; + struct svc_pool *pool = rqstp->rq_pool; + int len, i; + int pages; + struct xdr_buf *arg; + DECLARE_WAITQUEUE(wait, current); + long time_left; + + dprintk("svc: server %p waiting for data (to = %ld)\n", + rqstp, timeout); + + if (rqstp->rq_xprt) + printk(KERN_ERR + "svc_recv: service %p, transport not NULL!\n", + rqstp); + if (waitqueue_active(&rqstp->rq_wait)) + printk(KERN_ERR + "svc_recv: service %p, wait queue active!\n", + rqstp); + + /* now allocate needed pages. 
If we get a failure, sleep briefly */ + pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE; + for (i = 0; i < pages ; i++) + while (rqstp->rq_pages[i] == NULL) { + struct page *p = alloc_page(GFP_KERNEL); + if (!p) { + set_current_state(TASK_INTERRUPTIBLE); + if (signalled() || kthread_should_stop()) { + set_current_state(TASK_RUNNING); + return -EINTR; + } + schedule_timeout(msecs_to_jiffies(500)); + } + rqstp->rq_pages[i] = p; + } + rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */ + BUG_ON(pages >= RPCSVC_MAXPAGES); + + /* Make arg->head point to first page and arg->pages point to rest */ + arg = &rqstp->rq_arg; + arg->head[0].iov_base = page_address(rqstp->rq_pages[0]); + arg->head[0].iov_len = PAGE_SIZE; + arg->pages = rqstp->rq_pages + 1; + arg->page_base = 0; + /* save at least one page for response */ + arg->page_len = (pages-2)*PAGE_SIZE; + arg->len = (pages-1)*PAGE_SIZE; + arg->tail[0].iov_len = 0; + + try_to_freeze(); + cond_resched(); + if (signalled() || kthread_should_stop()) + return -EINTR; + + /* Normally we will wait up to 5 seconds for any required + * cache information to be provided. + */ + rqstp->rq_chandle.thread_wait = 5*HZ; + + spin_lock_bh(&pool->sp_lock); + xprt = svc_xprt_dequeue(pool); + if (xprt) { + rqstp->rq_xprt = xprt; + svc_xprt_get(xprt); + rqstp->rq_reserved = serv->sv_max_mesg; + atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); + + /* As there is a shortage of threads and this request + * had to be queued, don't allow the thread to wait so + * long for cache updates. + */ + rqstp->rq_chandle.thread_wait = 1*HZ; + } else { + /* No data pending. Go to sleep */ + svc_thread_enqueue(pool, rqstp); + + /* + * We have to be able to interrupt this wait + * to bring down the daemons ... + */ + set_current_state(TASK_INTERRUPTIBLE); + + /* + * checking kthread_should_stop() here allows us to avoid + * locking and signalling when stopping kthreads that call + * svc_recv. If the thread has already been woken up, then + * we can exit here without sleeping. 
If not, then it + * it'll be woken up quickly during the schedule_timeout + */ + if (kthread_should_stop()) { + set_current_state(TASK_RUNNING); + spin_unlock_bh(&pool->sp_lock); + return -EINTR; + } + + add_wait_queue(&rqstp->rq_wait, &wait); + spin_unlock_bh(&pool->sp_lock); + + time_left = schedule_timeout(timeout); + + try_to_freeze(); + + spin_lock_bh(&pool->sp_lock); + remove_wait_queue(&rqstp->rq_wait, &wait); + if (!time_left) + pool->sp_stats.threads_timedout++; + + xprt = rqstp->rq_xprt; + if (!xprt) { + svc_thread_dequeue(pool, rqstp); + spin_unlock_bh(&pool->sp_lock); + dprintk("svc: server %p, no data yet\n", rqstp); + if (signalled() || kthread_should_stop()) + return -EINTR; + else + return -EAGAIN; + } + } + spin_unlock_bh(&pool->sp_lock); + + len = 0; + if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) { + dprintk("svc_recv: found XPT_CLOSE\n"); + svc_delete_xprt(xprt); + /* Leave XPT_BUSY set on the dead xprt: */ + goto out; + } + if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) { + struct svc_xprt *newxpt; + newxpt = xprt->xpt_ops->xpo_accept(xprt); + if (newxpt) { + /* + * We know this module_get will succeed because the + * listener holds a reference too + */ + __module_get(newxpt->xpt_class->xcl_owner); + svc_check_conn_limits(xprt->xpt_server); + spin_lock_bh(&serv->sv_lock); + set_bit(XPT_TEMP, &newxpt->xpt_flags); + list_add(&newxpt->xpt_list, &serv->sv_tempsocks); + serv->sv_tmpcnt++; + if (serv->sv_temptimer.function == NULL) { + /* setup timer to age temp transports */ + setup_timer(&serv->sv_temptimer, + svc_age_temp_xprts, + (unsigned long)serv); + mod_timer(&serv->sv_temptimer, + jiffies + svc_conn_age_period * HZ); + } + spin_unlock_bh(&serv->sv_lock); + svc_xprt_received(newxpt); + } + } else if (xprt->xpt_ops->xpo_has_wspace(xprt)) { + dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", + rqstp, pool->sp_id, xprt, + atomic_read(&xprt->xpt_ref.refcount)); + rqstp->rq_deferred = svc_deferred_dequeue(xprt); + if (rqstp->rq_deferred) + len = svc_deferred_recv(rqstp); + else + len = xprt->xpt_ops->xpo_recvfrom(rqstp); + dprintk("svc: got len=%d\n", len); + } + svc_xprt_received(xprt); + + /* No data, incomplete (TCP) read, or accept() */ + if (len == 0 || len == -EAGAIN) + goto out; + + clear_bit(XPT_OLD, &xprt->xpt_flags); + + rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp)); + rqstp->rq_chandle.defer = svc_defer; + + if (serv->sv_stats) + serv->sv_stats->netcnt++; + return len; +out: + rqstp->rq_res.len = 0; + svc_xprt_release(rqstp); + return -EAGAIN; +} +EXPORT_SYMBOL_GPL(svc_recv); + +/* + * Drop request + */ +void svc_drop(struct svc_rqst *rqstp) +{ + dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt); + svc_xprt_release(rqstp); +} +EXPORT_SYMBOL_GPL(svc_drop); + +/* + * Return reply to client. + */ +int svc_send(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt; + int len; + struct xdr_buf *xb; + + xprt = rqstp->rq_xprt; + if (!xprt) + return -EFAULT; + + /* release the receive skb before sending the reply */ + rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp); + + /* calculate over-all length */ + xb = &rqstp->rq_res; + xb->len = xb->head[0].iov_len + + xb->page_len + + xb->tail[0].iov_len; + + /* Grab mutex to serialize outgoing data. 
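The sleep path of svc_recv() above amounts to a timed wait: the worker parks itself on the idle list, sleeps until either a transport is handed over or the timeout expires, and tells the two outcomes apart by whether rq_xprt was filled in. A hedged pthread sketch of that shape follows, without the freezer, signal or kthread-stop handling; the rqst type is hypothetical.

#include <errno.h>
#include <pthread.h>
#include <time.h>

struct rqst {
	pthread_mutex_t lock;
	pthread_cond_t  cv;
	void *xprt;                /* set by the enqueue side on direct handoff */
};

/* Wait up to timeout_ms for a transport to be handed over; returns it, or
 * NULL on timeout, which corresponds to svc_recv() returning -EAGAIN. */
static void *wait_for_xprt(struct rqst *r, int timeout_ms)
{
	struct timespec ts;
	void *got;

	clock_gettime(CLOCK_REALTIME, &ts);
	ts.tv_sec  += timeout_ms / 1000;
	ts.tv_nsec += (timeout_ms % 1000) * 1000000L;
	if (ts.tv_nsec >= 1000000000L) {
		ts.tv_sec++;
		ts.tv_nsec -= 1000000000L;
	}

	pthread_mutex_lock(&r->lock);
	while (r->xprt == NULL) {
		if (pthread_cond_timedwait(&r->cv, &r->lock, &ts) == ETIMEDOUT)
			break;
	}
	got = r->xprt;
	r->xprt = NULL;
	pthread_mutex_unlock(&r->lock);
	return got;
}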
*/ + mutex_lock(&xprt->xpt_mutex); + if (test_bit(XPT_DEAD, &xprt->xpt_flags)) + len = -ENOTCONN; + else + len = xprt->xpt_ops->xpo_sendto(rqstp); + mutex_unlock(&xprt->xpt_mutex); + rpc_wake_up(&xprt->xpt_bc_pending); + svc_xprt_release(rqstp); + + if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) + return 0; + return len; +} + +/* + * Timer function to close old temporary transports, using + * a mark-and-sweep algorithm. + */ +static void svc_age_temp_xprts(unsigned long closure) +{ + struct svc_serv *serv = (struct svc_serv *)closure; + struct svc_xprt *xprt; + struct list_head *le, *next; + LIST_HEAD(to_be_aged); + + dprintk("svc_age_temp_xprts\n"); + + if (!spin_trylock_bh(&serv->sv_lock)) { + /* busy, try again 1 sec later */ + dprintk("svc_age_temp_xprts: busy\n"); + mod_timer(&serv->sv_temptimer, jiffies + HZ); + return; + } + + list_for_each_safe(le, next, &serv->sv_tempsocks) { + xprt = list_entry(le, struct svc_xprt, xpt_list); + + /* First time through, just mark it OLD. Second time + * through, close it. */ + if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags)) + continue; + if (atomic_read(&xprt->xpt_ref.refcount) > 1 || + test_bit(XPT_BUSY, &xprt->xpt_flags)) + continue; + svc_xprt_get(xprt); + list_move(le, &to_be_aged); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + set_bit(XPT_DETACHED, &xprt->xpt_flags); + } + spin_unlock_bh(&serv->sv_lock); + + while (!list_empty(&to_be_aged)) { + le = to_be_aged.next; + /* fiddling the xpt_list node is safe 'cos we're XPT_DETACHED */ + list_del_init(le); + xprt = list_entry(le, struct svc_xprt, xpt_list); + + dprintk("queuing xprt %p for closing\n", xprt); + + /* a thread will dequeue and close it soon */ + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); + } + + mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); +} + +static void call_xpt_users(struct svc_xprt *xprt) +{ + struct svc_xpt_user *u; + + spin_lock(&xprt->xpt_lock); + while (!list_empty(&xprt->xpt_users)) { + u = list_first_entry(&xprt->xpt_users, struct svc_xpt_user, list); + list_del(&u->list); + u->callback(u); + } + spin_unlock(&xprt->xpt_lock); +} + +/* + * Remove a dead transport + */ +void svc_delete_xprt(struct svc_xprt *xprt) +{ + struct svc_serv *serv = xprt->xpt_server; + struct svc_deferred_req *dr; + + /* Only do this once */ + if (test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) + BUG(); + + dprintk("svc: svc_delete_xprt(%p)\n", xprt); + xprt->xpt_ops->xpo_detach(xprt); + + spin_lock_bh(&serv->sv_lock); + if (!test_and_set_bit(XPT_DETACHED, &xprt->xpt_flags)) + list_del_init(&xprt->xpt_list); + BUG_ON(!list_empty(&xprt->xpt_ready)); + if (test_bit(XPT_TEMP, &xprt->xpt_flags)) + serv->sv_tmpcnt--; + spin_unlock_bh(&serv->sv_lock); + + while ((dr = svc_deferred_dequeue(xprt)) != NULL) + kfree(dr); + + call_xpt_users(xprt); + svc_xprt_put(xprt); +} + +void svc_close_xprt(struct svc_xprt *xprt) +{ + set_bit(XPT_CLOSE, &xprt->xpt_flags); + if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) + /* someone else will have to effect the close */ + return; + /* + * We expect svc_close_xprt() to work even when no threads are + * running (e.g., while configuring the server before starting + * any threads), so if the transport isn't busy, we delete + * it ourself: + */ + svc_delete_xprt(xprt); +} +EXPORT_SYMBOL_GPL(svc_close_xprt); + +static void svc_close_list(struct list_head *xprt_list) +{ + struct svc_xprt *xprt; + + list_for_each_entry(xprt, xprt_list, xpt_list) { + set_bit(XPT_CLOSE, &xprt->xpt_flags); + set_bit(XPT_BUSY, &xprt->xpt_flags); + } +} + 
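svc_age_temp_xprts() above ages temporary connections with two timer passes: the first pass only marks a quiet connection OLD, and a connection that is still marked OLD (and otherwise idle) on the next pass is closed. The following self-contained sketch shows that mark-and-sweep policy over a plain array; the conn struct is hypothetical and activity is expected to clear the old flag, just as the kernel clears XPT_OLD after a successful receive.

#include <stdbool.h>
#include <stddef.h>

struct conn {
	bool old;        /* survived one full aging period untouched */
	bool in_use;     /* refcount/busy stand-in: never close an active conn */
	bool closed;
};

/* Called from a periodic timer, like svc_age_temp_xprts(). */
static void age_connections(struct conn *conns, size_t n)
{
	for (size_t i = 0; i < n; i++) {
		struct conn *c = &conns[i];

		if (c->closed || c->in_use)
			continue;
		if (!c->old) {
			c->old = true;   /* first pass: just mark */
			continue;
		}
		c->closed = true;        /* second pass: still old, close it */
	}
}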
+void svc_close_all(struct svc_serv *serv) +{ + struct svc_pool *pool; + struct svc_xprt *xprt; + struct svc_xprt *tmp; + int i; + + svc_close_list(&serv->sv_tempsocks); + svc_close_list(&serv->sv_permsocks); + + for (i = 0; i < serv->sv_nrpools; i++) { + pool = &serv->sv_pools[i]; + + spin_lock_bh(&pool->sp_lock); + while (!list_empty(&pool->sp_sockets)) { + xprt = list_first_entry(&pool->sp_sockets, struct svc_xprt, xpt_ready); + list_del_init(&xprt->xpt_ready); + } + spin_unlock_bh(&pool->sp_lock); + } + /* + * At this point the sp_sockets lists will stay empty, since + * svc_enqueue will not add new entries without taking the + * sp_lock and checking XPT_BUSY. + */ + list_for_each_entry_safe(xprt, tmp, &serv->sv_tempsocks, xpt_list) + svc_delete_xprt(xprt); + list_for_each_entry_safe(xprt, tmp, &serv->sv_permsocks, xpt_list) + svc_delete_xprt(xprt); + + BUG_ON(!list_empty(&serv->sv_permsocks)); + BUG_ON(!list_empty(&serv->sv_tempsocks)); +} + +/* + * Handle defer and revisit of requests + */ + +static void svc_revisit(struct cache_deferred_req *dreq, int too_many) +{ + struct svc_deferred_req *dr = + container_of(dreq, struct svc_deferred_req, handle); + struct svc_xprt *xprt = dr->xprt; + + spin_lock(&xprt->xpt_lock); + set_bit(XPT_DEFERRED, &xprt->xpt_flags); + if (too_many || test_bit(XPT_DEAD, &xprt->xpt_flags)) { + spin_unlock(&xprt->xpt_lock); + dprintk("revisit canceled\n"); + svc_xprt_put(xprt); + kfree(dr); + return; + } + dprintk("revisit queued\n"); + dr->xprt = NULL; + list_add(&dr->handle.recent, &xprt->xpt_deferred); + spin_unlock(&xprt->xpt_lock); + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); +} + +/* + * Save the request off for later processing. The request buffer looks + * like this: + * + * + * + * This code can only handle requests that consist of an xprt-header + * and rpc-header. 
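The deferral code here implements park-and-replay: when handling would block on a cache upcall, a copy of the (small) request is saved, and once the cache answer arrives the copy is queued back on the transport so the normal receive path replays it. Below is a compact userspace sketch of that lifecycle with hypothetical types and a caller-chosen size limit standing in for the one-page rule.

#include <stdlib.h>
#include <string.h>

struct deferred_req {
	struct deferred_req *next;
	size_t len;
	unsigned char args[];            /* copied request bytes */
};

struct endpoint {
	struct deferred_req *deferred;   /* replayed before reading new data */
};

/* Park a copy of the request; returns NULL if it is too large to bother,
 * like the "more than a page, give up" bail-out above. */
static struct deferred_req *defer_request(const void *buf, size_t len,
					  size_t max)
{
	struct deferred_req *dr;

	if (len > max)
		return NULL;
	dr = malloc(sizeof(*dr) + len);
	if (!dr)
		return NULL;
	dr->len = len;
	memcpy(dr->args, buf, len);
	dr->next = NULL;
	return dr;
}

/* The "revisit" side: the cache answer arrived, queue the copy for replay. */
static void revisit_request(struct endpoint *ep, struct deferred_req *dr)
{
	dr->next = ep->deferred;
	ep->deferred = dr;
}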
+ */ +static struct cache_deferred_req *svc_defer(struct cache_req *req) +{ + struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); + struct svc_deferred_req *dr; + + if (rqstp->rq_arg.page_len || !rqstp->rq_usedeferral) + return NULL; /* if more than a page, give up FIXME */ + if (rqstp->rq_deferred) { + dr = rqstp->rq_deferred; + rqstp->rq_deferred = NULL; + } else { + size_t skip; + size_t size; + /* FIXME maybe discard if size too large */ + size = sizeof(struct svc_deferred_req) + rqstp->rq_arg.len; + dr = kmalloc(size, GFP_KERNEL); + if (dr == NULL) + return NULL; + + dr->handle.owner = rqstp->rq_server; + dr->prot = rqstp->rq_prot; + memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen); + dr->addrlen = rqstp->rq_addrlen; + dr->daddr = rqstp->rq_daddr; + dr->argslen = rqstp->rq_arg.len >> 2; + dr->xprt_hlen = rqstp->rq_xprt_hlen; + + /* back up head to the start of the buffer and copy */ + skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; + memcpy(dr->args, rqstp->rq_arg.head[0].iov_base - skip, + dr->argslen << 2); + } + svc_xprt_get(rqstp->rq_xprt); + dr->xprt = rqstp->rq_xprt; + rqstp->rq_dropme = true; + + dr->handle.revisit = svc_revisit; + return &dr->handle; +} + +/* + * recv data from a deferred request into an active one + */ +static int svc_deferred_recv(struct svc_rqst *rqstp) +{ + struct svc_deferred_req *dr = rqstp->rq_deferred; + + /* setup iov_base past transport header */ + rqstp->rq_arg.head[0].iov_base = dr->args + (dr->xprt_hlen>>2); + /* The iov_len does not include the transport header bytes */ + rqstp->rq_arg.head[0].iov_len = (dr->argslen<<2) - dr->xprt_hlen; + rqstp->rq_arg.page_len = 0; + /* The rq_arg.len includes the transport header bytes */ + rqstp->rq_arg.len = dr->argslen<<2; + rqstp->rq_prot = dr->prot; + memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen); + rqstp->rq_addrlen = dr->addrlen; + /* Save off transport header len in case we get deferred again */ + rqstp->rq_xprt_hlen = dr->xprt_hlen; + rqstp->rq_daddr = dr->daddr; + rqstp->rq_respages = rqstp->rq_pages; + return (dr->argslen<<2) - dr->xprt_hlen; +} + + +static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) +{ + struct svc_deferred_req *dr = NULL; + + if (!test_bit(XPT_DEFERRED, &xprt->xpt_flags)) + return NULL; + spin_lock(&xprt->xpt_lock); + if (!list_empty(&xprt->xpt_deferred)) { + dr = list_entry(xprt->xpt_deferred.next, + struct svc_deferred_req, + handle.recent); + list_del_init(&dr->handle.recent); + } else + clear_bit(XPT_DEFERRED, &xprt->xpt_flags); + spin_unlock(&xprt->xpt_lock); + return dr; +} + +/** + * svc_find_xprt - find an RPC transport instance + * @serv: pointer to svc_serv to search + * @xcl_name: C string containing transport's class name + * @af: Address family of transport's local address + * @port: transport's IP port number + * + * Return the transport instance pointer for the endpoint accepting + * connections/peer traffic from the specified transport class, + * address family and port. + * + * Specifying 0 for the address family or port is effectively a + * wild-card, and will result in matching the first transport in the + * service's list that has a matching class name. 
+ */ +struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name, + const sa_family_t af, const unsigned short port) +{ + struct svc_xprt *xprt; + struct svc_xprt *found = NULL; + + /* Sanity check the args */ + if (serv == NULL || xcl_name == NULL) + return found; + + spin_lock_bh(&serv->sv_lock); + list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) { + if (strcmp(xprt->xpt_class->xcl_name, xcl_name)) + continue; + if (af != AF_UNSPEC && af != xprt->xpt_local.ss_family) + continue; + if (port != 0 && port != svc_xprt_local_port(xprt)) + continue; + found = xprt; + svc_xprt_get(xprt); + break; + } + spin_unlock_bh(&serv->sv_lock); + return found; +} +EXPORT_SYMBOL_GPL(svc_find_xprt); + +static int svc_one_xprt_name(const struct svc_xprt *xprt, + char *pos, int remaining) +{ + int len; + + len = snprintf(pos, remaining, "%s %u\n", + xprt->xpt_class->xcl_name, + svc_xprt_local_port(xprt)); + if (len >= remaining) + return -ENAMETOOLONG; + return len; +} + +/** + * svc_xprt_names - format a buffer with a list of transport names + * @serv: pointer to an RPC service + * @buf: pointer to a buffer to be filled in + * @buflen: length of buffer to be filled in + * + * Fills in @buf with a string containing a list of transport names, + * each name terminated with '\n'. + * + * Returns positive length of the filled-in string on success; otherwise + * a negative errno value is returned if an error occurs. + */ +int svc_xprt_names(struct svc_serv *serv, char *buf, const int buflen) +{ + struct svc_xprt *xprt; + int len, totlen; + char *pos; + + /* Sanity check args */ + if (!serv) + return 0; + + spin_lock_bh(&serv->sv_lock); + + pos = buf; + totlen = 0; + list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) { + len = svc_one_xprt_name(xprt, pos, buflen - totlen); + if (len < 0) { + *buf = '\0'; + totlen = len; + } + if (len <= 0) + break; + + pos += len; + totlen += len; + } + + spin_unlock_bh(&serv->sv_lock); + return totlen; +} +EXPORT_SYMBOL_GPL(svc_xprt_names); + + +/*----------------------------------------------------------------------------*/ + +static void *svc_pool_stats_start(struct seq_file *m, loff_t *pos) +{ + unsigned int pidx = (unsigned int)*pos; + struct svc_serv *serv = m->private; + + dprintk("svc_pool_stats_start, *pidx=%u\n", pidx); + + if (!pidx) + return SEQ_START_TOKEN; + return (pidx > serv->sv_nrpools ? 
NULL : &serv->sv_pools[pidx-1]); +} + +static void *svc_pool_stats_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct svc_pool *pool = p; + struct svc_serv *serv = m->private; + + dprintk("svc_pool_stats_next, *pos=%llu\n", *pos); + + if (p == SEQ_START_TOKEN) { + pool = &serv->sv_pools[0]; + } else { + unsigned int pidx = (pool - &serv->sv_pools[0]); + if (pidx < serv->sv_nrpools-1) + pool = &serv->sv_pools[pidx+1]; + else + pool = NULL; + } + ++*pos; + return pool; +} + +static void svc_pool_stats_stop(struct seq_file *m, void *p) +{ +} + +static int svc_pool_stats_show(struct seq_file *m, void *p) +{ + struct svc_pool *pool = p; + + if (p == SEQ_START_TOKEN) { + seq_puts(m, "# pool packets-arrived sockets-enqueued threads-woken threads-timedout\n"); + return 0; + } + + seq_printf(m, "%u %lu %lu %lu %lu\n", + pool->sp_id, + pool->sp_stats.packets, + pool->sp_stats.sockets_queued, + pool->sp_stats.threads_woken, + pool->sp_stats.threads_timedout); + + return 0; +} + +static const struct seq_operations svc_pool_stats_seq_ops = { + .start = svc_pool_stats_start, + .next = svc_pool_stats_next, + .stop = svc_pool_stats_stop, + .show = svc_pool_stats_show, +}; + +int svc_pool_stats_open(struct svc_serv *serv, struct file *file) +{ + int err; + + err = seq_open(file, &svc_pool_stats_seq_ops); + if (!err) + ((struct seq_file *) file->private_data)->private = serv; + return err; +} +EXPORT_SYMBOL(svc_pool_stats_open); + +/*----------------------------------------------------------------------------*/ diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c new file mode 100644 index 00000000..7963569f --- /dev/null +++ b/net/sunrpc/svcauth.c @@ -0,0 +1,165 @@ +/* + * linux/net/sunrpc/svcauth.c + * + * The generic interface for RPC authentication on the server side. + * + * Copyright (C) 1995, 1996 Olaf Kirch + * + * CHANGES + * 19-Apr-2000 Chris Evans - Security fix + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_AUTH + + +/* + * Table of authenticators + */ +extern struct auth_ops svcauth_null; +extern struct auth_ops svcauth_unix; + +static DEFINE_SPINLOCK(authtab_lock); +static struct auth_ops *authtab[RPC_AUTH_MAXFLAVOR] = { + [0] = &svcauth_null, + [1] = &svcauth_unix, +}; + +int +svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) +{ + rpc_authflavor_t flavor; + struct auth_ops *aops; + + *authp = rpc_auth_ok; + + flavor = svc_getnl(&rqstp->rq_arg.head[0]); + + dprintk("svc: svc_authenticate (%d)\n", flavor); + + spin_lock(&authtab_lock); + if (flavor >= RPC_AUTH_MAXFLAVOR || !(aops = authtab[flavor]) || + !try_module_get(aops->owner)) { + spin_unlock(&authtab_lock); + *authp = rpc_autherr_badcred; + return SVC_DENIED; + } + spin_unlock(&authtab_lock); + + rqstp->rq_authop = aops; + return aops->accept(rqstp, authp); +} +EXPORT_SYMBOL_GPL(svc_authenticate); + +int svc_set_client(struct svc_rqst *rqstp) +{ + return rqstp->rq_authop->set_client(rqstp); +} +EXPORT_SYMBOL_GPL(svc_set_client); + +/* A request, which was authenticated, has now executed. 
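svc_authenticate() above is a straightforward table dispatch: the flavor number taken from the credential indexes authtab[], and the selected ops' accept() routine runs. Here is a minimal sketch of such a flavor-to-ops registry, with a mutex standing in for authtab_lock and no module reference counting; all names are hypothetical.

#include <pthread.h>
#include <stddef.h>

#define MAXFLAVOR 8

struct auth_ops_demo {
	const char *name;
	int (*accept)(void *rqst);
};

static struct auth_ops_demo *authtab_demo[MAXFLAVOR];
static pthread_mutex_t authtab_demo_lock = PTHREAD_MUTEX_INITIALIZER;

/* Register an authenticator for a flavor slot, failing if it is taken. */
static int auth_register_demo(unsigned flavor, struct auth_ops_demo *ops)
{
	int rv = -1;

	pthread_mutex_lock(&authtab_demo_lock);
	if (flavor < MAXFLAVOR && authtab_demo[flavor] == NULL) {
		authtab_demo[flavor] = ops;
		rv = 0;
	}
	pthread_mutex_unlock(&authtab_demo_lock);
	return rv;
}

/* Dispatch like svc_authenticate(): unknown flavor means reject the cred. */
static int authenticate_demo(unsigned flavor, void *rqst)
{
	struct auth_ops_demo *ops;

	pthread_mutex_lock(&authtab_demo_lock);
	ops = (flavor < MAXFLAVOR) ? authtab_demo[flavor] : NULL;
	pthread_mutex_unlock(&authtab_demo_lock);
	if (!ops)
		return -1;               /* SVC_DENIED / AUTH_BADCRED */
	return ops->accept(rqst);
}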
+ * Time to finalise the credentials and verifier + * and release and resources + */ +int svc_authorise(struct svc_rqst *rqstp) +{ + struct auth_ops *aops = rqstp->rq_authop; + int rv = 0; + + rqstp->rq_authop = NULL; + + if (aops) { + rv = aops->release(rqstp); + module_put(aops->owner); + } + return rv; +} + +int +svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops) +{ + int rv = -EINVAL; + spin_lock(&authtab_lock); + if (flavor < RPC_AUTH_MAXFLAVOR && authtab[flavor] == NULL) { + authtab[flavor] = aops; + rv = 0; + } + spin_unlock(&authtab_lock); + return rv; +} +EXPORT_SYMBOL_GPL(svc_auth_register); + +void +svc_auth_unregister(rpc_authflavor_t flavor) +{ + spin_lock(&authtab_lock); + if (flavor < RPC_AUTH_MAXFLAVOR) + authtab[flavor] = NULL; + spin_unlock(&authtab_lock); +} +EXPORT_SYMBOL_GPL(svc_auth_unregister); + +/************************************************** + * 'auth_domains' are stored in a hash table indexed by name. + * When the last reference to an 'auth_domain' is dropped, + * the object is unhashed and freed. + * If auth_domain_lookup fails to find an entry, it will return + * it's second argument 'new'. If this is non-null, it will + * have been atomically linked into the table. + */ + +#define DN_HASHBITS 6 +#define DN_HASHMAX (1<ref.refcount, &auth_domain_lock)) { + hlist_del(&dom->hash); + dom->flavour->domain_release(dom); + spin_unlock(&auth_domain_lock); + } +} +EXPORT_SYMBOL_GPL(auth_domain_put); + +struct auth_domain * +auth_domain_lookup(char *name, struct auth_domain *new) +{ + struct auth_domain *hp; + struct hlist_head *head; + struct hlist_node *np; + + head = &auth_domain_table[hash_str(name, DN_HASHBITS)]; + + spin_lock(&auth_domain_lock); + + hlist_for_each_entry(hp, np, head, hash) { + if (strcmp(hp->name, name)==0) { + kref_get(&hp->ref); + spin_unlock(&auth_domain_lock); + return hp; + } + } + if (new) + hlist_add_head(&new->hash, head); + spin_unlock(&auth_domain_lock); + return new; +} +EXPORT_SYMBOL_GPL(auth_domain_lookup); + +struct auth_domain *auth_domain_find(char *name) +{ + return auth_domain_lookup(name, NULL); +} +EXPORT_SYMBOL_GPL(auth_domain_find); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c new file mode 100644 index 00000000..c8e10216 --- /dev/null +++ b/net/sunrpc/svcauth_unix.c @@ -0,0 +1,981 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define RPCDBG_FACILITY RPCDBG_AUTH + +#include + +#include "netns.h" + +/* + * AUTHUNIX and AUTHNULL credentials are both handled here. + * AUTHNULL is treated just like AUTHUNIX except that the uid/gid + * are always nobody (-2). i.e. we do the same IP address checks for + * AUTHNULL as for AUTHUNIX, and that is done here. 
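auth_domain_lookup() above follows a lookup-or-insert pattern: search the hash chain under a lock, return an existing entry if found, otherwise atomically link in the candidate supplied by the caller. The sketch below shows the same pattern over a small chained hash table; the types and hash function are hypothetical and reference counting is omitted.

#include <pthread.h>
#include <stddef.h>
#include <string.h>

#define NBUCKETS 64

struct domain {
	struct domain *next;
	const char *name;
};

static struct domain *buckets_demo[NBUCKETS];
static pthread_mutex_t domains_lock = PTHREAD_MUTEX_INITIALIZER;

static unsigned hash_name(const char *s)
{
	unsigned h = 0;

	while (*s)
		h = h * 31 + (unsigned char)*s++;
	return h % NBUCKETS;
}

/* Return the existing entry for @name, or link in @cand (may be NULL for a
 * pure lookup) and return it, mirroring auth_domain_lookup(name, new). */
static struct domain *domain_lookup(const char *name, struct domain *cand)
{
	unsigned b = hash_name(name);
	struct domain *d;

	pthread_mutex_lock(&domains_lock);
	for (d = buckets_demo[b]; d; d = d->next) {
		if (strcmp(d->name, name) == 0) {
			pthread_mutex_unlock(&domains_lock);
			return d;
		}
	}
	if (cand) {
		cand->next = buckets_demo[b];
		buckets_demo[b] = cand;
	}
	pthread_mutex_unlock(&domains_lock);
	return cand;
}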
+ */ + + +struct unix_domain { + struct auth_domain h; +#ifdef CONFIG_NFSD_DEPRECATED + int addr_changes; +#endif /* CONFIG_NFSD_DEPRECATED */ + /* other stuff later */ +}; + +extern struct auth_ops svcauth_unix; + +static void svcauth_unix_domain_release(struct auth_domain *dom) +{ + struct unix_domain *ud = container_of(dom, struct unix_domain, h); + + kfree(dom->name); + kfree(ud); +} + +struct auth_domain *unix_domain_find(char *name) +{ + struct auth_domain *rv; + struct unix_domain *new = NULL; + + rv = auth_domain_lookup(name, NULL); + while(1) { + if (rv) { + if (new && rv != &new->h) + svcauth_unix_domain_release(&new->h); + + if (rv->flavour != &svcauth_unix) { + auth_domain_put(rv); + return NULL; + } + return rv; + } + + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (new == NULL) + return NULL; + kref_init(&new->h.ref); + new->h.name = kstrdup(name, GFP_KERNEL); + if (new->h.name == NULL) { + kfree(new); + return NULL; + } + new->h.flavour = &svcauth_unix; +#ifdef CONFIG_NFSD_DEPRECATED + new->addr_changes = 0; +#endif /* CONFIG_NFSD_DEPRECATED */ + rv = auth_domain_lookup(name, &new->h); + } +} +EXPORT_SYMBOL_GPL(unix_domain_find); + + +/************************************************** + * cache for IP address to unix_domain + * as needed by AUTH_UNIX + */ +#define IP_HASHBITS 8 +#define IP_HASHMAX (1<flags) && + !test_bit(CACHE_NEGATIVE, &item->flags)) + auth_domain_put(&im->m_client->h); + kfree(im); +} + +#if IP_HASHBITS == 8 +/* hash_long on a 64 bit machine is currently REALLY BAD for + * IP addresses in reverse-endian (i.e. on a little-endian machine). + * So use a trivial but reliable hash instead + */ +static inline int hash_ip(__be32 ip) +{ + int hash = (__force u32)ip ^ ((__force u32)ip>>16); + return (hash ^ (hash>>8)) & 0xff; +} +#endif +static inline int hash_ip6(struct in6_addr ip) +{ + return (hash_ip(ip.s6_addr32[0]) ^ + hash_ip(ip.s6_addr32[1]) ^ + hash_ip(ip.s6_addr32[2]) ^ + hash_ip(ip.s6_addr32[3])); +} +static int ip_map_match(struct cache_head *corig, struct cache_head *cnew) +{ + struct ip_map *orig = container_of(corig, struct ip_map, h); + struct ip_map *new = container_of(cnew, struct ip_map, h); + return strcmp(orig->m_class, new->m_class) == 0 && + ipv6_addr_equal(&orig->m_addr, &new->m_addr); +} +static void ip_map_init(struct cache_head *cnew, struct cache_head *citem) +{ + struct ip_map *new = container_of(cnew, struct ip_map, h); + struct ip_map *item = container_of(citem, struct ip_map, h); + + strcpy(new->m_class, item->m_class); + ipv6_addr_copy(&new->m_addr, &item->m_addr); +} +static void update(struct cache_head *cnew, struct cache_head *citem) +{ + struct ip_map *new = container_of(cnew, struct ip_map, h); + struct ip_map *item = container_of(citem, struct ip_map, h); + + kref_get(&item->m_client->h.ref); + new->m_client = item->m_client; +#ifdef CONFIG_NFSD_DEPRECATED + new->m_add_change = item->m_add_change; +#endif /* CONFIG_NFSD_DEPRECATED */ +} +static struct cache_head *ip_map_alloc(void) +{ + struct ip_map *i = kmalloc(sizeof(*i), GFP_KERNEL); + if (i) + return &i->h; + else + return NULL; +} + +static void ip_map_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +{ + char text_addr[40]; + struct ip_map *im = container_of(h, struct ip_map, h); + + if (ipv6_addr_v4mapped(&(im->m_addr))) { + snprintf(text_addr, 20, "%pI4", &im->m_addr.s6_addr32[3]); + } else { + snprintf(text_addr, 40, "%pI6", &im->m_addr); + } + qword_add(bpp, blen, im->m_class); + qword_add(bpp, blen, text_addr); + (*bpp)[-1] = 
'\n'; +} + +static int ip_map_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h, ip_map_request); +} + +static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class, struct in6_addr *addr); +static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, struct unix_domain *udom, time_t expiry); + +static int ip_map_parse(struct cache_detail *cd, + char *mesg, int mlen) +{ + /* class ipaddress [domainname] */ + /* should be safe just to use the start of the input buffer + * for scratch: */ + char *buf = mesg; + int len; + char class[8]; + union { + struct sockaddr sa; + struct sockaddr_in s4; + struct sockaddr_in6 s6; + } address; + struct sockaddr_in6 sin6; + int err; + + struct ip_map *ipmp; + struct auth_domain *dom; + time_t expiry; + + if (mesg[mlen-1] != '\n') + return -EINVAL; + mesg[mlen-1] = 0; + + /* class */ + len = qword_get(&mesg, class, sizeof(class)); + if (len <= 0) return -EINVAL; + + /* ip address */ + len = qword_get(&mesg, buf, mlen); + if (len <= 0) return -EINVAL; + + if (rpc_pton(buf, len, &address.sa, sizeof(address)) == 0) + return -EINVAL; + switch (address.sa.sa_family) { + case AF_INET: + /* Form a mapped IPv4 address in sin6 */ + sin6.sin6_family = AF_INET6; + ipv6_addr_set_v4mapped(address.s4.sin_addr.s_addr, + &sin6.sin6_addr); + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + memcpy(&sin6, &address.s6, sizeof(sin6)); + break; +#endif + default: + return -EINVAL; + } + + expiry = get_expiry(&mesg); + if (expiry ==0) + return -EINVAL; + + /* domainname, or empty for NEGATIVE */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) return -EINVAL; + + if (len) { + dom = unix_domain_find(buf); + if (dom == NULL) + return -ENOENT; + } else + dom = NULL; + + /* IPv6 scope IDs are ignored for now */ + ipmp = __ip_map_lookup(cd, class, &sin6.sin6_addr); + if (ipmp) { + err = __ip_map_update(cd, ipmp, + container_of(dom, struct unix_domain, h), + expiry); + } else + err = -ENOMEM; + + if (dom) + auth_domain_put(dom); + + cache_flush(); + return err; +} + +static int ip_map_show(struct seq_file *m, + struct cache_detail *cd, + struct cache_head *h) +{ + struct ip_map *im; + struct in6_addr addr; + char *dom = "-no-domain-"; + + if (h == NULL) { + seq_puts(m, "#class IP domain\n"); + return 0; + } + im = container_of(h, struct ip_map, h); + /* class addr domain */ + ipv6_addr_copy(&addr, &im->m_addr); + + if (test_bit(CACHE_VALID, &h->flags) && + !test_bit(CACHE_NEGATIVE, &h->flags)) + dom = im->m_client->h.name; + + if (ipv6_addr_v4mapped(&addr)) { + seq_printf(m, "%s %pI4 %s\n", + im->m_class, &addr.s6_addr32[3], dom); + } else { + seq_printf(m, "%s %pI6 %s\n", im->m_class, &addr, dom); + } + return 0; +} + + +static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class, + struct in6_addr *addr) +{ + struct ip_map ip; + struct cache_head *ch; + + strcpy(ip.m_class, class); + ipv6_addr_copy(&ip.m_addr, addr); + ch = sunrpc_cache_lookup(cd, &ip.h, + hash_str(class, IP_HASHBITS) ^ + hash_ip6(*addr)); + + if (ch) + return container_of(ch, struct ip_map, h); + else + return NULL; +} + +static inline struct ip_map *ip_map_lookup(struct net *net, char *class, + struct in6_addr *addr) +{ + struct sunrpc_net *sn; + + sn = net_generic(net, sunrpc_net_id); + return __ip_map_lookup(sn->ip_map_cache, class, addr); +} + +static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, + struct unix_domain *udom, time_t expiry) +{ + struct ip_map ip; + struct 
cache_head *ch; + + ip.m_client = udom; + ip.h.flags = 0; + if (!udom) + set_bit(CACHE_NEGATIVE, &ip.h.flags); +#ifdef CONFIG_NFSD_DEPRECATED + else { + ip.m_add_change = udom->addr_changes; + /* if this is from the legacy set_client system call, + * we need m_add_change to be one higher + */ + if (expiry == NEVER) + ip.m_add_change++; + } +#endif /* CONFIG_NFSD_DEPRECATED */ + ip.h.expiry_time = expiry; + ch = sunrpc_cache_update(cd, &ip.h, &ipm->h, + hash_str(ipm->m_class, IP_HASHBITS) ^ + hash_ip6(ipm->m_addr)); + if (!ch) + return -ENOMEM; + cache_put(ch, cd); + return 0; +} + +static inline int ip_map_update(struct net *net, struct ip_map *ipm, + struct unix_domain *udom, time_t expiry) +{ + struct sunrpc_net *sn; + + sn = net_generic(net, sunrpc_net_id); + return __ip_map_update(sn->ip_map_cache, ipm, udom, expiry); +} + +#ifdef CONFIG_NFSD_DEPRECATED +int auth_unix_add_addr(struct net *net, struct in6_addr *addr, struct auth_domain *dom) +{ + struct unix_domain *udom; + struct ip_map *ipmp; + + if (dom->flavour != &svcauth_unix) + return -EINVAL; + udom = container_of(dom, struct unix_domain, h); + ipmp = ip_map_lookup(net, "nfsd", addr); + + if (ipmp) + return ip_map_update(net, ipmp, udom, NEVER); + else + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(auth_unix_add_addr); + +int auth_unix_forget_old(struct auth_domain *dom) +{ + struct unix_domain *udom; + + if (dom->flavour != &svcauth_unix) + return -EINVAL; + udom = container_of(dom, struct unix_domain, h); + udom->addr_changes++; + return 0; +} +EXPORT_SYMBOL_GPL(auth_unix_forget_old); + +struct auth_domain *auth_unix_lookup(struct net *net, struct in6_addr *addr) +{ + struct ip_map *ipm; + struct auth_domain *rv; + struct sunrpc_net *sn; + + sn = net_generic(net, sunrpc_net_id); + ipm = ip_map_lookup(net, "nfsd", addr); + + if (!ipm) + return NULL; + if (cache_check(sn->ip_map_cache, &ipm->h, NULL)) + return NULL; + + if ((ipm->m_client->addr_changes - ipm->m_add_change) >0) { + sunrpc_invalidate(&ipm->h, sn->ip_map_cache); + rv = NULL; + } else { + rv = &ipm->m_client->h; + kref_get(&rv->ref); + } + cache_put(&ipm->h, sn->ip_map_cache); + return rv; +} +EXPORT_SYMBOL_GPL(auth_unix_lookup); +#endif /* CONFIG_NFSD_DEPRECATED */ + +void svcauth_unix_purge(void) +{ + struct net *net; + + for_each_net(net) { + struct sunrpc_net *sn; + + sn = net_generic(net, sunrpc_net_id); + cache_purge(sn->ip_map_cache); + } +} +EXPORT_SYMBOL_GPL(svcauth_unix_purge); + +static inline struct ip_map * +ip_map_cached_get(struct svc_xprt *xprt) +{ + struct ip_map *ipm = NULL; + struct sunrpc_net *sn; + + if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) { + spin_lock(&xprt->xpt_lock); + ipm = xprt->xpt_auth_cache; + if (ipm != NULL) { + if (!cache_valid(&ipm->h)) { + /* + * The entry has been invalidated since it was + * remembered, e.g. by a second mount from the + * same IP address. 
+ */ + sn = net_generic(xprt->xpt_net, sunrpc_net_id); + xprt->xpt_auth_cache = NULL; + spin_unlock(&xprt->xpt_lock); + cache_put(&ipm->h, sn->ip_map_cache); + return NULL; + } + cache_get(&ipm->h); + } + spin_unlock(&xprt->xpt_lock); + } + return ipm; +} + +static inline void +ip_map_cached_put(struct svc_xprt *xprt, struct ip_map *ipm) +{ + if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) { + spin_lock(&xprt->xpt_lock); + if (xprt->xpt_auth_cache == NULL) { + /* newly cached, keep the reference */ + xprt->xpt_auth_cache = ipm; + ipm = NULL; + } + spin_unlock(&xprt->xpt_lock); + } + if (ipm) { + struct sunrpc_net *sn; + + sn = net_generic(xprt->xpt_net, sunrpc_net_id); + cache_put(&ipm->h, sn->ip_map_cache); + } +} + +void +svcauth_unix_info_release(struct svc_xprt *xpt) +{ + struct ip_map *ipm; + + ipm = xpt->xpt_auth_cache; + if (ipm != NULL) { + struct sunrpc_net *sn; + + sn = net_generic(xpt->xpt_net, sunrpc_net_id); + cache_put(&ipm->h, sn->ip_map_cache); + } +} + +/**************************************************************************** + * auth.unix.gid cache + * simple cache to map a UID to a list of GIDs + * because AUTH_UNIX aka AUTH_SYS has a max of 16 + */ +#define GID_HASHBITS 8 +#define GID_HASHMAX (1<<GID_HASHBITS) + +struct unix_gid { + struct cache_head h; + uid_t uid; + struct group_info *gi; +}; +static struct cache_head *gid_table[GID_HASHMAX]; + +static void unix_gid_put(struct kref *kref) +{ + struct cache_head *item = container_of(kref, struct cache_head, ref); + struct unix_gid *ug = container_of(item, struct unix_gid, h); + if (test_bit(CACHE_VALID, &item->flags) && + !test_bit(CACHE_NEGATIVE, &item->flags)) + put_group_info(ug->gi); + kfree(ug); +} + +static int unix_gid_match(struct cache_head *corig, struct cache_head *cnew) +{ + struct unix_gid *orig = container_of(corig, struct unix_gid, h); + struct unix_gid *new = container_of(cnew, struct unix_gid, h); + return orig->uid == new->uid; +} +static void unix_gid_init(struct cache_head *cnew, struct cache_head *citem) +{ + struct unix_gid *new = container_of(cnew, struct unix_gid, h); + struct unix_gid *item = container_of(citem, struct unix_gid, h); + new->uid = item->uid; +} +static void unix_gid_update(struct cache_head *cnew, struct cache_head *citem) +{ + struct unix_gid *new = container_of(cnew, struct unix_gid, h); + struct unix_gid *item = container_of(citem, struct unix_gid, h); + + get_group_info(item->gi); + new->gi = item->gi; +} +static struct cache_head *unix_gid_alloc(void) +{ + struct unix_gid *g = kmalloc(sizeof(*g), GFP_KERNEL); + if (g) + return &g->h; + else + return NULL; +} + +static void unix_gid_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +{ + char tuid[20]; + struct unix_gid *ug = container_of(h, struct unix_gid, h); + + snprintf(tuid, 20, "%u", ug->uid); + qword_add(bpp, blen, tuid); + (*bpp)[-1] = '\n'; +} + +static int unix_gid_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h, unix_gid_request); +} + +static struct unix_gid *unix_gid_lookup(uid_t uid); +extern struct cache_detail unix_gid_cache; + +static int unix_gid_parse(struct cache_detail *cd, + char *mesg, int mlen) +{ + /* uid expiry Ngid gid0 gid1 ... 
gidN-1 */ + int uid; + int gids; + int rv; + int i; + int err; + time_t expiry; + struct unix_gid ug, *ugp; + + if (mlen <= 0 || mesg[mlen-1] != '\n') + return -EINVAL; + mesg[mlen-1] = 0; + + rv = get_int(&mesg, &uid); + if (rv) + return -EINVAL; + ug.uid = uid; + + expiry = get_expiry(&mesg); + if (expiry == 0) + return -EINVAL; + + rv = get_int(&mesg, &gids); + if (rv || gids < 0 || gids > 8192) + return -EINVAL; + + ug.gi = groups_alloc(gids); + if (!ug.gi) + return -ENOMEM; + + for (i = 0 ; i < gids ; i++) { + int gid; + rv = get_int(&mesg, &gid); + err = -EINVAL; + if (rv) + goto out; + GROUP_AT(ug.gi, i) = gid; + } + + ugp = unix_gid_lookup(uid); + if (ugp) { + struct cache_head *ch; + ug.h.flags = 0; + ug.h.expiry_time = expiry; + ch = sunrpc_cache_update(&unix_gid_cache, + &ug.h, &ugp->h, + hash_long(uid, GID_HASHBITS)); + if (!ch) + err = -ENOMEM; + else { + err = 0; + cache_put(ch, &unix_gid_cache); + } + } else + err = -ENOMEM; + out: + if (ug.gi) + put_group_info(ug.gi); + return err; +} + +static int unix_gid_show(struct seq_file *m, + struct cache_detail *cd, + struct cache_head *h) +{ + struct unix_gid *ug; + int i; + int glen; + + if (h == NULL) { + seq_puts(m, "#uid cnt: gids...\n"); + return 0; + } + ug = container_of(h, struct unix_gid, h); + if (test_bit(CACHE_VALID, &h->flags) && + !test_bit(CACHE_NEGATIVE, &h->flags)) + glen = ug->gi->ngroups; + else + glen = 0; + + seq_printf(m, "%u %d:", ug->uid, glen); + for (i = 0; i < glen; i++) + seq_printf(m, " %d", GROUP_AT(ug->gi, i)); + seq_printf(m, "\n"); + return 0; +} + +struct cache_detail unix_gid_cache = { + .owner = THIS_MODULE, + .hash_size = GID_HASHMAX, + .hash_table = gid_table, + .name = "auth.unix.gid", + .cache_put = unix_gid_put, + .cache_upcall = unix_gid_upcall, + .cache_parse = unix_gid_parse, + .cache_show = unix_gid_show, + .match = unix_gid_match, + .init = unix_gid_init, + .update = unix_gid_update, + .alloc = unix_gid_alloc, +}; + +static struct unix_gid *unix_gid_lookup(uid_t uid) +{ + struct unix_gid ug; + struct cache_head *ch; + + ug.uid = uid; + ch = sunrpc_cache_lookup(&unix_gid_cache, &ug.h, + hash_long(uid, GID_HASHBITS)); + if (ch) + return container_of(ch, struct unix_gid, h); + else + return NULL; +} + +static struct group_info *unix_gid_find(uid_t uid, struct svc_rqst *rqstp) +{ + struct unix_gid *ug; + struct group_info *gi; + int ret; + + ug = unix_gid_lookup(uid); + if (!ug) + return ERR_PTR(-EAGAIN); + ret = cache_check(&unix_gid_cache, &ug->h, &rqstp->rq_chandle); + switch (ret) { + case -ENOENT: + return ERR_PTR(-ENOENT); + case -ETIMEDOUT: + return ERR_PTR(-ESHUTDOWN); + case 0: + gi = get_group_info(ug->gi); + cache_put(&ug->h, &unix_gid_cache); + return gi; + default: + return ERR_PTR(-EAGAIN); + } +} + +int +svcauth_unix_set_client(struct svc_rqst *rqstp) +{ + struct sockaddr_in *sin; + struct sockaddr_in6 *sin6, sin6_storage; + struct ip_map *ipm; + struct group_info *gi; + struct svc_cred *cred = &rqstp->rq_cred; + struct svc_xprt *xprt = rqstp->rq_xprt; + struct net *net = xprt->xpt_net; + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + + switch (rqstp->rq_addr.ss_family) { + case AF_INET: + sin = svc_addr_in(rqstp); + sin6 = &sin6_storage; + ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &sin6->sin6_addr); + break; + case AF_INET6: + sin6 = svc_addr_in6(rqstp); + break; + default: + BUG(); + } + + rqstp->rq_client = NULL; + if (rqstp->rq_proc == 0) + return SVC_OK; + + ipm = ip_map_cached_get(xprt); + if (ipm == NULL) + ipm = __ip_map_lookup(sn->ip_map_cache, 
rqstp->rq_server->sv_program->pg_class, + &sin6->sin6_addr); + + if (ipm == NULL) + return SVC_DENIED; + + switch (cache_check(sn->ip_map_cache, &ipm->h, &rqstp->rq_chandle)) { + default: + BUG(); + case -ETIMEDOUT: + return SVC_CLOSE; + case -EAGAIN: + return SVC_DROP; + case -ENOENT: + return SVC_DENIED; + case 0: + rqstp->rq_client = &ipm->m_client->h; + kref_get(&rqstp->rq_client->ref); + ip_map_cached_put(xprt, ipm); + break; + } + + gi = unix_gid_find(cred->cr_uid, rqstp); + switch (PTR_ERR(gi)) { + case -EAGAIN: + return SVC_DROP; + case -ESHUTDOWN: + return SVC_CLOSE; + case -ENOENT: + break; + default: + put_group_info(cred->cr_group_info); + cred->cr_group_info = gi; + } + return SVC_OK; +} + +EXPORT_SYMBOL_GPL(svcauth_unix_set_client); + +static int +svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp) +{ + struct kvec *argv = &rqstp->rq_arg.head[0]; + struct kvec *resv = &rqstp->rq_res.head[0]; + struct svc_cred *cred = &rqstp->rq_cred; + + cred->cr_group_info = NULL; + rqstp->rq_client = NULL; + + if (argv->iov_len < 3*4) + return SVC_GARBAGE; + + if (svc_getu32(argv) != 0) { + dprintk("svc: bad null cred\n"); + *authp = rpc_autherr_badcred; + return SVC_DENIED; + } + if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { + dprintk("svc: bad null verf\n"); + *authp = rpc_autherr_badverf; + return SVC_DENIED; + } + + /* Signal that mapping to nobody uid/gid is required */ + cred->cr_uid = (uid_t) -1; + cred->cr_gid = (gid_t) -1; + cred->cr_group_info = groups_alloc(0); + if (cred->cr_group_info == NULL) + return SVC_CLOSE; /* kmalloc failure - client must retry */ + + /* Put NULL verifier */ + svc_putnl(resv, RPC_AUTH_NULL); + svc_putnl(resv, 0); + + rqstp->rq_flavor = RPC_AUTH_NULL; + return SVC_OK; +} + +static int +svcauth_null_release(struct svc_rqst *rqstp) +{ + if (rqstp->rq_client) + auth_domain_put(rqstp->rq_client); + rqstp->rq_client = NULL; + if (rqstp->rq_cred.cr_group_info) + put_group_info(rqstp->rq_cred.cr_group_info); + rqstp->rq_cred.cr_group_info = NULL; + + return 0; /* don't drop */ +} + + +struct auth_ops svcauth_null = { + .name = "null", + .owner = THIS_MODULE, + .flavour = RPC_AUTH_NULL, + .accept = svcauth_null_accept, + .release = svcauth_null_release, + .set_client = svcauth_unix_set_client, +}; + + +static int +svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) +{ + struct kvec *argv = &rqstp->rq_arg.head[0]; + struct kvec *resv = &rqstp->rq_res.head[0]; + struct svc_cred *cred = &rqstp->rq_cred; + u32 slen, i; + int len = argv->iov_len; + + cred->cr_group_info = NULL; + rqstp->rq_client = NULL; + + if ((len -= 3*4) < 0) + return SVC_GARBAGE; + + svc_getu32(argv); /* length */ + svc_getu32(argv); /* time stamp */ + slen = XDR_QUADLEN(svc_getnl(argv)); /* machname length */ + if (slen > 64 || (len -= (slen + 3)*4) < 0) + goto badcred; + argv->iov_base = (void*)((__be32*)argv->iov_base + slen); /* skip machname */ + argv->iov_len -= slen*4; + + cred->cr_uid = svc_getnl(argv); /* uid */ + cred->cr_gid = svc_getnl(argv); /* gid */ + slen = svc_getnl(argv); /* gids length */ + if (slen > 16 || (len -= (slen + 2)*4) < 0) + goto badcred; + cred->cr_group_info = groups_alloc(slen); + if (cred->cr_group_info == NULL) + return SVC_CLOSE; + for (i = 0; i < slen; i++) + GROUP_AT(cred->cr_group_info, i) = svc_getnl(argv); + if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { + *authp = rpc_autherr_badverf; + return SVC_DENIED; + } + + /* Put NULL verifier */ + svc_putnl(resv, RPC_AUTH_NULL); + svc_putnl(resv, 
0); + + rqstp->rq_flavor = RPC_AUTH_UNIX; + return SVC_OK; + +badcred: + *authp = rpc_autherr_badcred; + return SVC_DENIED; +} + +static int +svcauth_unix_release(struct svc_rqst *rqstp) +{ + /* Verifier (such as it is) is already in place. + */ + if (rqstp->rq_client) + auth_domain_put(rqstp->rq_client); + rqstp->rq_client = NULL; + if (rqstp->rq_cred.cr_group_info) + put_group_info(rqstp->rq_cred.cr_group_info); + rqstp->rq_cred.cr_group_info = NULL; + + return 0; +} + + +struct auth_ops svcauth_unix = { + .name = "unix", + .owner = THIS_MODULE, + .flavour = RPC_AUTH_UNIX, + .accept = svcauth_unix_accept, + .release = svcauth_unix_release, + .domain_release = svcauth_unix_domain_release, + .set_client = svcauth_unix_set_client, +}; + +int ip_map_cache_create(struct net *net) +{ + int err = -ENOMEM; + struct cache_detail *cd; + struct cache_head **tbl; + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + + cd = kzalloc(sizeof(struct cache_detail), GFP_KERNEL); + if (cd == NULL) + goto err_cd; + + tbl = kzalloc(IP_HASHMAX * sizeof(struct cache_head *), GFP_KERNEL); + if (tbl == NULL) + goto err_tbl; + + cd->owner = THIS_MODULE, + cd->hash_size = IP_HASHMAX, + cd->hash_table = tbl, + cd->name = "auth.unix.ip", + cd->cache_put = ip_map_put, + cd->cache_upcall = ip_map_upcall, + cd->cache_parse = ip_map_parse, + cd->cache_show = ip_map_show, + cd->match = ip_map_match, + cd->init = ip_map_init, + cd->update = update, + cd->alloc = ip_map_alloc, + + err = cache_register_net(cd, net); + if (err) + goto err_reg; + + sn->ip_map_cache = cd; + return 0; + +err_reg: + kfree(tbl); +err_tbl: + kfree(cd); +err_cd: + return err; +} + +void ip_map_cache_destroy(struct net *net) +{ + struct sunrpc_net *sn; + + sn = net_generic(net, sunrpc_net_id); + cache_purge(sn->ip_map_cache); + cache_unregister_net(sn->ip_map_cache, net); + kfree(sn->ip_map_cache->hash_table); + kfree(sn->ip_map_cache); +} diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c new file mode 100644 index 00000000..af04f779 --- /dev/null +++ b/net/sunrpc/svcsock.c @@ -0,0 +1,1663 @@ +/* + * linux/net/sunrpc/svcsock.c + * + * These are the RPC server socket internals. + * + * The server scheduling algorithm does not always distribute the load + * evenly when servicing a single client. May need to modify the + * svc_xprt_enqueue procedure... + * + * TCP support is largely untested and may be a little slow. The problem + * is that we currently do two separate recvfrom's, one for the 4-byte + * record length, and the second for the actual record. This could possibly + * be improved by always reading a minimum size of around 100 bytes and + * tucking any superfluous bytes away in a temporary store. Still, that + * leaves write requests out in the rain. An alternative may be to peek at + * the first skb in the queue, and if it matches the next TCP sequence + * number, to extract the record marker. Yuck. 
+ * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + + +static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, + int *errp, int flags); +static void svc_udp_data_ready(struct sock *, int); +static int svc_udp_recvfrom(struct svc_rqst *); +static int svc_udp_sendto(struct svc_rqst *); +static void svc_sock_detach(struct svc_xprt *); +static void svc_tcp_sock_detach(struct svc_xprt *); +static void svc_sock_free(struct svc_xprt *); + +static struct svc_xprt *svc_create_socket(struct svc_serv *, int, + struct net *, struct sockaddr *, + int, int); +#if defined(CONFIG_NFS_V4_1) +static struct svc_xprt *svc_bc_create_socket(struct svc_serv *, int, + struct net *, struct sockaddr *, + int, int); +static void svc_bc_sock_free(struct svc_xprt *xprt); +#endif /* CONFIG_NFS_V4_1 */ + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key svc_key[2]; +static struct lock_class_key svc_slock_key[2]; + +static void svc_reclassify_socket(struct socket *sock) +{ + struct sock *sk = sock->sk; + BUG_ON(sock_owned_by_user(sk)); + switch (sk->sk_family) { + case AF_INET: + sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD", + &svc_slock_key[0], + "sk_xprt.xpt_lock-AF_INET-NFSD", + &svc_key[0]); + break; + + case AF_INET6: + sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD", + &svc_slock_key[1], + "sk_xprt.xpt_lock-AF_INET6-NFSD", + &svc_key[1]); + break; + + default: + BUG(); + } +} +#else +static void svc_reclassify_socket(struct socket *sock) +{ +} +#endif + +/* + * Release an skbuff after use + */ +static void svc_release_skb(struct svc_rqst *rqstp) +{ + struct sk_buff *skb = rqstp->rq_xprt_ctxt; + + if (skb) { + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + rqstp->rq_xprt_ctxt = NULL; + + dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); + skb_free_datagram_locked(svsk->sk_sk, skb); + } +} + +union svc_pktinfo_u { + struct in_pktinfo pkti; + struct in6_pktinfo pkti6; +}; +#define SVC_PKTINFO_SPACE \ + CMSG_SPACE(sizeof(union svc_pktinfo_u)) + +static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) +{ + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + switch (svsk->sk_sk->sk_family) { + case AF_INET: { + struct in_pktinfo *pki = CMSG_DATA(cmh); + + cmh->cmsg_level = SOL_IP; + cmh->cmsg_type = IP_PKTINFO; + pki->ipi_ifindex = 0; + pki->ipi_spec_dst.s_addr = rqstp->rq_daddr.addr.s_addr; + cmh->cmsg_len = CMSG_LEN(sizeof(*pki)); + } + break; + + case AF_INET6: { + struct in6_pktinfo *pki = CMSG_DATA(cmh); + + cmh->cmsg_level = SOL_IPV6; + cmh->cmsg_type = IPV6_PKTINFO; + pki->ipi6_ifindex = 0; + ipv6_addr_copy(&pki->ipi6_addr, + &rqstp->rq_daddr.addr6); + cmh->cmsg_len = CMSG_LEN(sizeof(*pki)); + } + break; + } +} + +/* + * send routine intended to be shared by the fore- and back-channel + */ +int svc_send_common(struct socket *sock, struct xdr_buf *xdr, + struct page *headpage, unsigned long headoffset, + struct page *tailpage, unsigned long tailoffset) +{ + int result; + int size; + struct page **ppage = xdr->pages; + size_t base = xdr->page_base; + unsigned int pglen = xdr->page_len; + unsigned int flags = MSG_MORE; + int slen; + 
int len = 0; + + slen = xdr->len; + + /* send head */ + if (slen == xdr->head[0].iov_len) + flags = 0; + len = kernel_sendpage(sock, headpage, headoffset, + xdr->head[0].iov_len, flags); + if (len != xdr->head[0].iov_len) + goto out; + slen -= xdr->head[0].iov_len; + if (slen == 0) + goto out; + + /* send page data */ + size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen; + while (pglen > 0) { + if (slen == size) + flags = 0; + result = kernel_sendpage(sock, *ppage, base, size, flags); + if (result > 0) + len += result; + if (result != size) + goto out; + slen -= size; + pglen -= size; + size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen; + base = 0; + ppage++; + } + + /* send tail */ + if (xdr->tail[0].iov_len) { + result = kernel_sendpage(sock, tailpage, tailoffset, + xdr->tail[0].iov_len, 0); + if (result > 0) + len += result; + } + +out: + return len; +} + + +/* + * Generic sendto routine + */ +static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) +{ + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + struct socket *sock = svsk->sk_sock; + union { + struct cmsghdr hdr; + long all[SVC_PKTINFO_SPACE / sizeof(long)]; + } buffer; + struct cmsghdr *cmh = &buffer.hdr; + int len = 0; + unsigned long tailoff; + unsigned long headoff; + RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); + + if (rqstp->rq_prot == IPPROTO_UDP) { + struct msghdr msg = { + .msg_name = &rqstp->rq_addr, + .msg_namelen = rqstp->rq_addrlen, + .msg_control = cmh, + .msg_controllen = sizeof(buffer), + .msg_flags = MSG_MORE, + }; + + svc_set_cmsg_data(rqstp, cmh); + + if (sock_sendmsg(sock, &msg, 0) < 0) + goto out; + } + + tailoff = ((unsigned long)xdr->tail[0].iov_base) & (PAGE_SIZE-1); + headoff = 0; + len = svc_send_common(sock, xdr, rqstp->rq_respages[0], headoff, + rqstp->rq_respages[0], tailoff); + +out: + dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n", + svsk, xdr->head[0].iov_base, xdr->head[0].iov_len, + xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); + + return len; +} + +/* + * Report socket names for nfsdfs + */ +static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining) +{ + const struct sock *sk = svsk->sk_sk; + const char *proto_name = sk->sk_protocol == IPPROTO_UDP ? + "udp" : "tcp"; + int len; + + switch (sk->sk_family) { + case PF_INET: + len = snprintf(buf, remaining, "ipv4 %s %pI4 %d\n", + proto_name, + &inet_sk(sk)->inet_rcv_saddr, + inet_sk(sk)->inet_num); + break; + case PF_INET6: + len = snprintf(buf, remaining, "ipv6 %s %pI6 %d\n", + proto_name, + &inet6_sk(sk)->rcv_saddr, + inet_sk(sk)->inet_num); + break; + default: + len = snprintf(buf, remaining, "*unknown-%d*\n", + sk->sk_family); + } + + if (len >= remaining) { + *buf = '\0'; + return -ENAMETOOLONG; + } + return len; +} + +/** + * svc_sock_names - construct a list of listener names in a string + * @serv: pointer to RPC service + * @buf: pointer to a buffer to fill in with socket names + * @buflen: size of the buffer to be filled + * @toclose: pointer to '\0'-terminated C string containing the name + * of a listener to be closed + * + * Fills in @buf with a '\n'-separated list of names of listener + * sockets. If @toclose is not NULL, the socket named by @toclose + * is closed, and is not included in the output list. + * + * Returns positive length of the socket name string, or a negative + * errno value on error. 
+ */ +int svc_sock_names(struct svc_serv *serv, char *buf, const size_t buflen, + const char *toclose) +{ + struct svc_sock *svsk, *closesk = NULL; + int len = 0; + + if (!serv) + return 0; + + spin_lock_bh(&serv->sv_lock); + list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list) { + int onelen = svc_one_sock_name(svsk, buf + len, buflen - len); + if (onelen < 0) { + len = onelen; + break; + } + if (toclose && strcmp(toclose, buf + len) == 0) { + closesk = svsk; + svc_xprt_get(&closesk->sk_xprt); + } else + len += onelen; + } + spin_unlock_bh(&serv->sv_lock); + + if (closesk) { + /* Should unregister with portmap, but you cannot + * unregister just one protocol... + */ + svc_close_xprt(&closesk->sk_xprt); + svc_xprt_put(&closesk->sk_xprt); + } else if (toclose) + return -ENOENT; + return len; +} +EXPORT_SYMBOL_GPL(svc_sock_names); + +/* + * Check input queue length + */ +static int svc_recv_available(struct svc_sock *svsk) +{ + struct socket *sock = svsk->sk_sock; + int avail, err; + + err = kernel_sock_ioctl(sock, TIOCINQ, (unsigned long) &avail); + + return (err >= 0)? avail : err; +} + +/* + * Generic recvfrom routine. + */ +static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, + int buflen) +{ + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + struct msghdr msg = { + .msg_flags = MSG_DONTWAIT, + }; + int len; + + rqstp->rq_xprt_hlen = 0; + + len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen, + msg.msg_flags); + + dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", + svsk, iov[0].iov_base, iov[0].iov_len, len); + return len; +} + +static int svc_partial_recvfrom(struct svc_rqst *rqstp, + struct kvec *iov, int nr, + int buflen, unsigned int base) +{ + size_t save_iovlen; + void __user *save_iovbase; + unsigned int i; + int ret; + + if (base == 0) + return svc_recvfrom(rqstp, iov, nr, buflen); + + for (i = 0; i < nr; i++) { + if (iov[i].iov_len > base) + break; + base -= iov[i].iov_len; + } + save_iovlen = iov[i].iov_len; + save_iovbase = iov[i].iov_base; + iov[i].iov_len -= base; + iov[i].iov_base += base; + ret = svc_recvfrom(rqstp, &iov[i], nr - i, buflen); + iov[i].iov_len = save_iovlen; + iov[i].iov_base = save_iovbase; + return ret; +} + +/* + * Set socket snd and rcv buffer lengths + */ +static void svc_sock_setbufsize(struct socket *sock, unsigned int snd, + unsigned int rcv) +{ +#if 0 + mm_segment_t oldfs; + oldfs = get_fs(); set_fs(KERNEL_DS); + sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, + (char*)&snd, sizeof(snd)); + sock_setsockopt(sock, SOL_SOCKET, SO_RCVBUF, + (char*)&rcv, sizeof(rcv)); +#else + /* sock_setsockopt limits use to sysctl_?mem_max, + * which isn't acceptable. Until that is made conditional + * on not having CAP_SYS_RESOURCE or similar, we go direct... + * DaveM said I could! + */ + lock_sock(sock->sk); + sock->sk->sk_sndbuf = snd * 2; + sock->sk->sk_rcvbuf = rcv * 2; + sock->sk->sk_write_space(sock->sk); + release_sock(sock->sk); +#endif +} +/* + * INET callback when data has been received on the socket. 
+ */ +static void svc_udp_data_ready(struct sock *sk, int count) +{ + struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; + wait_queue_head_t *wq = sk_sleep(sk); + + if (svsk) { + dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", + svsk, sk, count, + test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); + } + if (wq && waitqueue_active(wq)) + wake_up_interruptible(wq); +} + +/* + * INET callback when space is newly available on the socket. + */ +static void svc_write_space(struct sock *sk) +{ + struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); + wait_queue_head_t *wq = sk_sleep(sk); + + if (svsk) { + dprintk("svc: socket %p(inet %p), write_space busy=%d\n", + svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); + svc_xprt_enqueue(&svsk->sk_xprt); + } + + if (wq && waitqueue_active(wq)) { + dprintk("RPC svc_write_space: someone sleeping on %p\n", + svsk); + wake_up_interruptible(wq); + } +} + +static void svc_tcp_write_space(struct sock *sk) +{ + struct socket *sock = sk->sk_socket; + + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) + clear_bit(SOCK_NOSPACE, &sock->flags); + svc_write_space(sk); +} + +/* + * See net/ipv6/ip_sockglue.c : ip_cmsg_recv_pktinfo + */ +static int svc_udp_get_dest_address4(struct svc_rqst *rqstp, + struct cmsghdr *cmh) +{ + struct in_pktinfo *pki = CMSG_DATA(cmh); + if (cmh->cmsg_type != IP_PKTINFO) + return 0; + rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr; + return 1; +} + +/* + * See net/ipv6/datagram.c : datagram_recv_ctl + */ +static int svc_udp_get_dest_address6(struct svc_rqst *rqstp, + struct cmsghdr *cmh) +{ + struct in6_pktinfo *pki = CMSG_DATA(cmh); + if (cmh->cmsg_type != IPV6_PKTINFO) + return 0; + ipv6_addr_copy(&rqstp->rq_daddr.addr6, &pki->ipi6_addr); + return 1; +} + +/* + * Copy the UDP datagram's destination address to the rqstp structure. + * The 'destination' address in this case is the address to which the + * peer sent the datagram, i.e. our local address. For multihomed + * hosts, this can change from msg to msg. Note that only the IP + * address changes, the port number should remain the same. + */ +static int svc_udp_get_dest_address(struct svc_rqst *rqstp, + struct cmsghdr *cmh) +{ + switch (cmh->cmsg_level) { + case SOL_IP: + return svc_udp_get_dest_address4(rqstp, cmh); + case SOL_IPV6: + return svc_udp_get_dest_address6(rqstp, cmh); + } + + return 0; +} + +/* + * Receive a datagram from a UDP socket. + */ +static int svc_udp_recvfrom(struct svc_rqst *rqstp) +{ + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + struct svc_serv *serv = svsk->sk_xprt.xpt_server; + struct sk_buff *skb; + union { + struct cmsghdr hdr; + long all[SVC_PKTINFO_SPACE / sizeof(long)]; + } buffer; + struct cmsghdr *cmh = &buffer.hdr; + struct msghdr msg = { + .msg_name = svc_addr(rqstp), + .msg_control = cmh, + .msg_controllen = sizeof(buffer), + .msg_flags = MSG_DONTWAIT, + }; + size_t len; + int err; + + if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) + /* udp sockets need large rcvbuf as all pending + * requests are still in that buffer. sndbuf must + * also be large enough that there is enough space + * for one reply per thread. We count all threads + * rather than threads in a particular pool, which + * provides an upper bound on the number of threads + * which will access the socket. 
+ */ + svc_sock_setbufsize(svsk->sk_sock, + (serv->sv_nrthreads+3) * serv->sv_max_mesg, + (serv->sv_nrthreads+3) * serv->sv_max_mesg); + + clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + skb = NULL; + err = kernel_recvmsg(svsk->sk_sock, &msg, NULL, + 0, 0, MSG_PEEK | MSG_DONTWAIT); + if (err >= 0) + skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err); + + if (skb == NULL) { + if (err != -EAGAIN) { + /* possibly an icmp error */ + dprintk("svc: recvfrom returned error %d\n", -err); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + } + return -EAGAIN; + } + len = svc_addr_len(svc_addr(rqstp)); + if (len == 0) + return -EAFNOSUPPORT; + rqstp->rq_addrlen = len; + if (skb->tstamp.tv64 == 0) { + skb->tstamp = ktime_get_real(); + /* Don't enable netstamp, sunrpc doesn't + need that much accuracy */ + } + svsk->sk_sk->sk_stamp = skb->tstamp; + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */ + + len = skb->len - sizeof(struct udphdr); + rqstp->rq_arg.len = len; + + rqstp->rq_prot = IPPROTO_UDP; + + if (!svc_udp_get_dest_address(rqstp, cmh)) { + if (net_ratelimit()) + printk(KERN_WARNING + "svc: received unknown control message %d/%d; " + "dropping RPC reply datagram\n", + cmh->cmsg_level, cmh->cmsg_type); + skb_free_datagram_locked(svsk->sk_sk, skb); + return 0; + } + + if (skb_is_nonlinear(skb)) { + /* we have to copy */ + local_bh_disable(); + if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) { + local_bh_enable(); + /* checksum error */ + skb_free_datagram_locked(svsk->sk_sk, skb); + return 0; + } + local_bh_enable(); + skb_free_datagram_locked(svsk->sk_sk, skb); + } else { + /* we can use it in-place */ + rqstp->rq_arg.head[0].iov_base = skb->data + + sizeof(struct udphdr); + rqstp->rq_arg.head[0].iov_len = len; + if (skb_checksum_complete(skb)) { + skb_free_datagram_locked(svsk->sk_sk, skb); + return 0; + } + rqstp->rq_xprt_ctxt = skb; + } + + rqstp->rq_arg.page_base = 0; + if (len <= rqstp->rq_arg.head[0].iov_len) { + rqstp->rq_arg.head[0].iov_len = len; + rqstp->rq_arg.page_len = 0; + rqstp->rq_respages = rqstp->rq_pages+1; + } else { + rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; + rqstp->rq_respages = rqstp->rq_pages + 1 + + DIV_ROUND_UP(rqstp->rq_arg.page_len, PAGE_SIZE); + } + + if (serv->sv_stats) + serv->sv_stats->netudpcnt++; + + return len; +} + +static int +svc_udp_sendto(struct svc_rqst *rqstp) +{ + int error; + + error = svc_sendto(rqstp, &rqstp->rq_res); + if (error == -ECONNREFUSED) + /* ICMP error on earlier request. */ + error = svc_sendto(rqstp, &rqstp->rq_res); + + return error; +} + +static void svc_udp_prep_reply_hdr(struct svc_rqst *rqstp) +{ +} + +static int svc_udp_has_wspace(struct svc_xprt *xprt) +{ + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct svc_serv *serv = xprt->xpt_server; + unsigned long required; + + /* + * Set the SOCK_NOSPACE flag before checking the available + * sock space. 
+ */ + set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg; + if (required*2 > sock_wspace(svsk->sk_sk)) + return 0; + clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + return 1; +} + +static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt) +{ + BUG(); + return NULL; +} + +static struct svc_xprt *svc_udp_create(struct svc_serv *serv, + struct net *net, + struct sockaddr *sa, int salen, + int flags) +{ + return svc_create_socket(serv, IPPROTO_UDP, net, sa, salen, flags); +} + +static struct svc_xprt_ops svc_udp_ops = { + .xpo_create = svc_udp_create, + .xpo_recvfrom = svc_udp_recvfrom, + .xpo_sendto = svc_udp_sendto, + .xpo_release_rqst = svc_release_skb, + .xpo_detach = svc_sock_detach, + .xpo_free = svc_sock_free, + .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr, + .xpo_has_wspace = svc_udp_has_wspace, + .xpo_accept = svc_udp_accept, +}; + +static struct svc_xprt_class svc_udp_class = { + .xcl_name = "udp", + .xcl_owner = THIS_MODULE, + .xcl_ops = &svc_udp_ops, + .xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP, +}; + +static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) +{ + int err, level, optname, one = 1; + + svc_xprt_init(&svc_udp_class, &svsk->sk_xprt, serv); + clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); + svsk->sk_sk->sk_data_ready = svc_udp_data_ready; + svsk->sk_sk->sk_write_space = svc_write_space; + + /* initialise setting must have enough space to + * receive and respond to one request. + * svc_udp_recvfrom will re-adjust if necessary + */ + svc_sock_setbufsize(svsk->sk_sock, + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg, + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg); + + /* data might have come in before data_ready set up */ + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); + + /* make sure we get destination address info */ + switch (svsk->sk_sk->sk_family) { + case AF_INET: + level = SOL_IP; + optname = IP_PKTINFO; + break; + case AF_INET6: + level = SOL_IPV6; + optname = IPV6_RECVPKTINFO; + break; + default: + BUG(); + } + err = kernel_setsockopt(svsk->sk_sock, level, optname, + (char *)&one, sizeof(one)); + dprintk("svc: kernel_setsockopt returned %d\n", err); +} + +/* + * A data_ready event on a listening socket means there's a connection + * pending. Do not use state_change as a substitute for it. + */ +static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused) +{ + struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; + wait_queue_head_t *wq; + + dprintk("svc: socket %p TCP (listen) state change %d\n", + sk, sk->sk_state); + + /* + * This callback may called twice when a new connection + * is established as a child socket inherits everything + * from a parent LISTEN socket. + * 1) data_ready method of the parent socket will be called + * when one of child sockets become ESTABLISHED. + * 2) data_ready method of the child socket may be called + * when it receives data before the socket is accepted. + * In case of 2, we should ignore it silently. + */ + if (sk->sk_state == TCP_LISTEN) { + if (svsk) { + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); + } else + printk("svc: socket %p: no user data\n", sk); + } + + wq = sk_sleep(sk); + if (wq && waitqueue_active(wq)) + wake_up_interruptible_all(wq); +} + +/* + * A state change on a connected socket means it's dying or dead. 
+ */ +static void svc_tcp_state_change(struct sock *sk) +{ + struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; + wait_queue_head_t *wq = sk_sleep(sk); + + dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n", + sk, sk->sk_state, sk->sk_user_data); + + if (!svsk) + printk("svc: socket %p: no user data\n", sk); + else { + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); + } + if (wq && waitqueue_active(wq)) + wake_up_interruptible_all(wq); +} + +static void svc_tcp_data_ready(struct sock *sk, int count) +{ + struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; + wait_queue_head_t *wq = sk_sleep(sk); + + dprintk("svc: socket %p TCP data ready (svsk %p)\n", + sk, sk->sk_user_data); + if (svsk) { + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); + } + if (wq && waitqueue_active(wq)) + wake_up_interruptible(wq); +} + +/* + * Accept a TCP connection + */ +static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) +{ + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct sockaddr_storage addr; + struct sockaddr *sin = (struct sockaddr *) &addr; + struct svc_serv *serv = svsk->sk_xprt.xpt_server; + struct socket *sock = svsk->sk_sock; + struct socket *newsock; + struct svc_sock *newsvsk; + int err, slen; + RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); + + dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); + if (!sock) + return NULL; + + clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); + err = kernel_accept(sock, &newsock, O_NONBLOCK); + if (err < 0) { + if (err == -ENOMEM) + printk(KERN_WARNING "%s: no more sockets!\n", + serv->sv_name); + else if (err != -EAGAIN && net_ratelimit()) + printk(KERN_WARNING "%s: accept failed (err %d)!\n", + serv->sv_name, -err); + return NULL; + } + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); + + err = kernel_getpeername(newsock, sin, &slen); + if (err < 0) { + if (net_ratelimit()) + printk(KERN_WARNING "%s: peername failed (err %d)!\n", + serv->sv_name, -err); + goto failed; /* aborted connection or whatever */ + } + + /* Ideally, we would want to reject connections from unauthorized + * hosts here, but when we get encryption, the IP of the host won't + * tell us anything. For now just warn about unpriv connections. 
+ */ + if (!svc_port_is_privileged(sin)) { + dprintk(KERN_WARNING + "%s: connect from unprivileged port: %s\n", + serv->sv_name, + __svc_print_addr(sin, buf, sizeof(buf))); + } + dprintk("%s: connect from %s\n", serv->sv_name, + __svc_print_addr(sin, buf, sizeof(buf))); + + /* make sure that a write doesn't block forever when + * low on memory + */ + newsock->sk->sk_sndtimeo = HZ*30; + + if (!(newsvsk = svc_setup_socket(serv, newsock, &err, + (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY)))) + goto failed; + svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen); + err = kernel_getsockname(newsock, sin, &slen); + if (unlikely(err < 0)) { + dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err); + slen = offsetof(struct sockaddr, sa_data); + } + svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen); + + if (serv->sv_stats) + serv->sv_stats->nettcpconn++; + + return &newsvsk->sk_xprt; + +failed: + sock_release(newsock); + return NULL; +} + +static unsigned int svc_tcp_restore_pages(struct svc_sock *svsk, struct svc_rqst *rqstp) +{ + unsigned int i, len, npages; + + if (svsk->sk_tcplen <= sizeof(rpc_fraghdr)) + return 0; + len = svsk->sk_tcplen - sizeof(rpc_fraghdr); + npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + if (rqstp->rq_pages[i] != NULL) + put_page(rqstp->rq_pages[i]); + BUG_ON(svsk->sk_pages[i] == NULL); + rqstp->rq_pages[i] = svsk->sk_pages[i]; + svsk->sk_pages[i] = NULL; + } + rqstp->rq_arg.head[0].iov_base = page_address(rqstp->rq_pages[0]); + return len; +} + +static void svc_tcp_save_pages(struct svc_sock *svsk, struct svc_rqst *rqstp) +{ + unsigned int i, len, npages; + + if (svsk->sk_tcplen <= sizeof(rpc_fraghdr)) + return; + len = svsk->sk_tcplen - sizeof(rpc_fraghdr); + npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + svsk->sk_pages[i] = rqstp->rq_pages[i]; + rqstp->rq_pages[i] = NULL; + } +} + +static void svc_tcp_clear_pages(struct svc_sock *svsk) +{ + unsigned int i, len, npages; + + if (svsk->sk_tcplen <= sizeof(rpc_fraghdr)) + goto out; + len = svsk->sk_tcplen - sizeof(rpc_fraghdr); + npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + BUG_ON(svsk->sk_pages[i] == NULL); + put_page(svsk->sk_pages[i]); + svsk->sk_pages[i] = NULL; + } +out: + svsk->sk_tcplen = 0; +} + +/* + * Receive data. + * If we haven't gotten the record length yet, get the next four bytes. + * Otherwise try to gobble up as much as possible up to the complete + * record length. + */ +static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp) +{ + struct svc_serv *serv = svsk->sk_xprt.xpt_server; + unsigned int want; + int len; + + clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + + if (svsk->sk_tcplen < sizeof(rpc_fraghdr)) { + struct kvec iov; + + want = sizeof(rpc_fraghdr) - svsk->sk_tcplen; + iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen; + iov.iov_len = want; + if ((len = svc_recvfrom(rqstp, &iov, 1, want)) < 0) + goto error; + svsk->sk_tcplen += len; + + if (len < want) { + dprintk("svc: short recvfrom while reading record " + "length (%d of %d)\n", len, want); + return -EAGAIN; + } + + svsk->sk_reclen = ntohl(svsk->sk_reclen); + if (!(svsk->sk_reclen & RPC_LAST_STREAM_FRAGMENT)) { + /* FIXME: technically, a record can be fragmented, + * and non-terminal fragments will not have the top + * bit set in the fragment length header. + * But apparently no known nfs clients send fragmented + * records. 
*/ + if (net_ratelimit()) + printk(KERN_NOTICE "RPC: multiple fragments " + "per record not supported\n"); + goto err_delete; + } + + svsk->sk_reclen &= RPC_FRAGMENT_SIZE_MASK; + dprintk("svc: TCP record, %d bytes\n", svsk->sk_reclen); + if (svsk->sk_reclen > serv->sv_max_mesg) { + if (net_ratelimit()) + printk(KERN_NOTICE "RPC: " + "fragment too large: 0x%08lx\n", + (unsigned long)svsk->sk_reclen); + goto err_delete; + } + } + + if (svsk->sk_reclen < 8) + goto err_delete; /* client is nuts. */ + + len = svsk->sk_reclen; + + return len; +error: + dprintk("RPC: TCP recv_record got %d\n", len); + return len; +err_delete: + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + return -EAGAIN; +} + +static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) +{ + struct rpc_xprt *bc_xprt = svsk->sk_xprt.xpt_bc_xprt; + struct rpc_rqst *req = NULL; + struct kvec *src, *dst; + __be32 *p = (__be32 *)rqstp->rq_arg.head[0].iov_base; + __be32 xid; + __be32 calldir; + + xid = *p++; + calldir = *p; + + if (bc_xprt) + req = xprt_lookup_rqst(bc_xprt, xid); + + if (!req) { + printk(KERN_NOTICE + "%s: Got unrecognized reply: " + "calldir 0x%x xpt_bc_xprt %p xid %08x\n", + __func__, ntohl(calldir), + bc_xprt, xid); + return -EAGAIN; + } + + memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf)); + /* + * XXX!: cheating for now! Only copying HEAD. + * But we know this is good enough for now (in fact, for any + * callback reply in the forseeable future). + */ + dst = &req->rq_private_buf.head[0]; + src = &rqstp->rq_arg.head[0]; + if (dst->iov_len < src->iov_len) + return -EAGAIN; /* whatever; just giving up. */ + memcpy(dst->iov_base, src->iov_base, src->iov_len); + xprt_complete_rqst(req->rq_task, svsk->sk_reclen); + rqstp->rq_arg.len = 0; + return 0; +} + +static int copy_pages_to_kvecs(struct kvec *vec, struct page **pages, int len) +{ + int i = 0; + int t = 0; + + while (t < len) { + vec[i].iov_base = page_address(pages[i]); + vec[i].iov_len = PAGE_SIZE; + i++; + t += PAGE_SIZE; + } + return i; +} + + +/* + * Receive data from a TCP socket. 
+ */ +static int svc_tcp_recvfrom(struct svc_rqst *rqstp) +{ + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + struct svc_serv *serv = svsk->sk_xprt.xpt_server; + int len; + struct kvec *vec; + unsigned int want, base; + __be32 *p; + __be32 calldir; + int pnum; + + dprintk("svc: tcp_recv %p data %d conn %d close %d\n", + svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags), + test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags), + test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); + + len = svc_tcp_recv_record(svsk, rqstp); + if (len < 0) + goto error; + + base = svc_tcp_restore_pages(svsk, rqstp); + want = svsk->sk_reclen - base; + + vec = rqstp->rq_vec; + + pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0], + svsk->sk_reclen); + + rqstp->rq_respages = &rqstp->rq_pages[pnum]; + + /* Now receive data */ + len = svc_partial_recvfrom(rqstp, vec, pnum, want, base); + if (len >= 0) + svsk->sk_tcplen += len; + if (len != want) { + if (len < 0 && len != -EAGAIN) + goto err_other; + svc_tcp_save_pages(svsk, rqstp); + dprintk("svc: incomplete TCP record (%d of %d)\n", + svsk->sk_tcplen, svsk->sk_reclen); + goto err_noclose; + } + + rqstp->rq_arg.len = svsk->sk_reclen; + rqstp->rq_arg.page_base = 0; + if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len) { + rqstp->rq_arg.head[0].iov_len = rqstp->rq_arg.len; + rqstp->rq_arg.page_len = 0; + } else + rqstp->rq_arg.page_len = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; + + rqstp->rq_xprt_ctxt = NULL; + rqstp->rq_prot = IPPROTO_TCP; + + p = (__be32 *)rqstp->rq_arg.head[0].iov_base; + calldir = p[1]; + if (calldir) + len = receive_cb_reply(svsk, rqstp); + + /* Reset TCP read info */ + svsk->sk_reclen = 0; + svsk->sk_tcplen = 0; + /* If we have more data, signal svc_xprt_enqueue() to try again */ + if (svc_recv_available(svsk) > sizeof(rpc_fraghdr)) + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + + if (len < 0) + goto error; + + svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt); + if (serv->sv_stats) + serv->sv_stats->nettcpcnt++; + + dprintk("svc: TCP complete record (%d bytes)\n", rqstp->rq_arg.len); + return rqstp->rq_arg.len; + +error: + if (len != -EAGAIN) + goto err_other; + dprintk("RPC: TCP recvfrom got EAGAIN\n"); + return -EAGAIN; +err_other: + printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", + svsk->sk_xprt.xpt_server->sv_name, -len); + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); +err_noclose: + return -EAGAIN; /* record not complete */ +} + +/* + * Send out data on TCP socket. + */ +static int svc_tcp_sendto(struct svc_rqst *rqstp) +{ + struct xdr_buf *xbufp = &rqstp->rq_res; + int sent; + __be32 reclen; + + /* Set up the first element of the reply kvec. + * Any other kvecs that may be in use have been taken + * care of by the server implementation itself. + */ + reclen = htonl(0x80000000|((xbufp->len ) - 4)); + memcpy(xbufp->head[0].iov_base, &reclen, 4); + + sent = svc_sendto(rqstp, &rqstp->rq_res); + if (sent != xbufp->len) { + printk(KERN_NOTICE + "rpc-srv/tcp: %s: %s %d when sending %d bytes " + "- shutting down socket\n", + rqstp->rq_xprt->xpt_server->sv_name, + (sent<0)?"got error":"sent only", + sent, xbufp->len); + set_bit(XPT_CLOSE, &rqstp->rq_xprt->xpt_flags); + svc_xprt_enqueue(rqstp->rq_xprt); + sent = -EAGAIN; + } + return sent; +} + +/* + * Setup response header. TCP has a 4B record length field. + */ +static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp) +{ + struct kvec *resv = &rqstp->rq_res.head[0]; + + /* tcp needs a space for the record length... 
*/ + svc_putnl(resv, 0); +} + +static int svc_tcp_has_wspace(struct svc_xprt *xprt) +{ + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct svc_serv *serv = svsk->sk_xprt.xpt_server; + int required; + + if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) + return 1; + required = atomic_read(&xprt->xpt_reserved) + serv->sv_max_mesg; + if (sk_stream_wspace(svsk->sk_sk) >= required) + return 1; + set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + return 0; +} + +static struct svc_xprt *svc_tcp_create(struct svc_serv *serv, + struct net *net, + struct sockaddr *sa, int salen, + int flags) +{ + return svc_create_socket(serv, IPPROTO_TCP, net, sa, salen, flags); +} + +#if defined(CONFIG_NFS_V4_1) +static struct svc_xprt *svc_bc_create_socket(struct svc_serv *, int, + struct net *, struct sockaddr *, + int, int); +static void svc_bc_sock_free(struct svc_xprt *xprt); + +static struct svc_xprt *svc_bc_tcp_create(struct svc_serv *serv, + struct net *net, + struct sockaddr *sa, int salen, + int flags) +{ + return svc_bc_create_socket(serv, IPPROTO_TCP, net, sa, salen, flags); +} + +static void svc_bc_tcp_sock_detach(struct svc_xprt *xprt) +{ +} + +static struct svc_xprt_ops svc_tcp_bc_ops = { + .xpo_create = svc_bc_tcp_create, + .xpo_detach = svc_bc_tcp_sock_detach, + .xpo_free = svc_bc_sock_free, + .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr, +}; + +static struct svc_xprt_class svc_tcp_bc_class = { + .xcl_name = "tcp-bc", + .xcl_owner = THIS_MODULE, + .xcl_ops = &svc_tcp_bc_ops, + .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, +}; + +static void svc_init_bc_xprt_sock(void) +{ + svc_reg_xprt_class(&svc_tcp_bc_class); +} + +static void svc_cleanup_bc_xprt_sock(void) +{ + svc_unreg_xprt_class(&svc_tcp_bc_class); +} +#else /* CONFIG_NFS_V4_1 */ +static void svc_init_bc_xprt_sock(void) +{ +} + +static void svc_cleanup_bc_xprt_sock(void) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +static struct svc_xprt_ops svc_tcp_ops = { + .xpo_create = svc_tcp_create, + .xpo_recvfrom = svc_tcp_recvfrom, + .xpo_sendto = svc_tcp_sendto, + .xpo_release_rqst = svc_release_skb, + .xpo_detach = svc_tcp_sock_detach, + .xpo_free = svc_sock_free, + .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr, + .xpo_has_wspace = svc_tcp_has_wspace, + .xpo_accept = svc_tcp_accept, +}; + +static struct svc_xprt_class svc_tcp_class = { + .xcl_name = "tcp", + .xcl_owner = THIS_MODULE, + .xcl_ops = &svc_tcp_ops, + .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, +}; + +void svc_init_xprt_sock(void) +{ + svc_reg_xprt_class(&svc_tcp_class); + svc_reg_xprt_class(&svc_udp_class); + svc_init_bc_xprt_sock(); +} + +void svc_cleanup_xprt_sock(void) +{ + svc_unreg_xprt_class(&svc_tcp_class); + svc_unreg_xprt_class(&svc_udp_class); + svc_cleanup_bc_xprt_sock(); +} + +static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) +{ + struct sock *sk = svsk->sk_sk; + + svc_xprt_init(&svc_tcp_class, &svsk->sk_xprt, serv); + set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); + if (sk->sk_state == TCP_LISTEN) { + dprintk("setting up TCP socket for listening\n"); + set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); + sk->sk_data_ready = svc_tcp_listen_data_ready; + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); + } else { + dprintk("setting up TCP socket for reading\n"); + sk->sk_state_change = svc_tcp_state_change; + sk->sk_data_ready = svc_tcp_data_ready; + sk->sk_write_space = svc_tcp_write_space; + + svsk->sk_reclen = 0; + svsk->sk_tcplen = 0; + memset(&svsk->sk_pages[0], 0, sizeof(svsk->sk_pages)); + + tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; + + 
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + if (sk->sk_state != TCP_ESTABLISHED) + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + } +} + +void svc_sock_update_bufs(struct svc_serv *serv) +{ + /* + * The number of server threads has changed. Update + * rcvbuf and sndbuf accordingly on all sockets + */ + struct svc_sock *svsk; + + spin_lock_bh(&serv->sv_lock); + list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list) + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); + list_for_each_entry(svsk, &serv->sv_tempsocks, sk_xprt.xpt_list) + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); + spin_unlock_bh(&serv->sv_lock); +} +EXPORT_SYMBOL_GPL(svc_sock_update_bufs); + +/* + * Initialize socket for RPC use and create svc_sock struct + * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF. + */ +static struct svc_sock *svc_setup_socket(struct svc_serv *serv, + struct socket *sock, + int *errp, int flags) +{ + struct svc_sock *svsk; + struct sock *inet; + int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); + + dprintk("svc: svc_setup_socket %p\n", sock); + if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { + *errp = -ENOMEM; + return NULL; + } + + inet = sock->sk; + + /* Register socket with portmapper */ + if (*errp >= 0 && pmap_register) + *errp = svc_register(serv, inet->sk_family, inet->sk_protocol, + ntohs(inet_sk(inet)->inet_sport)); + + if (*errp < 0) { + kfree(svsk); + return NULL; + } + + inet->sk_user_data = svsk; + svsk->sk_sock = sock; + svsk->sk_sk = inet; + svsk->sk_ostate = inet->sk_state_change; + svsk->sk_odata = inet->sk_data_ready; + svsk->sk_owspace = inet->sk_write_space; + + /* Initialize the socket */ + if (sock->type == SOCK_DGRAM) + svc_udp_init(svsk, serv); + else { + /* initialise setting must have enough space to + * receive and respond to one request. + */ + svc_sock_setbufsize(svsk->sk_sock, 4 * serv->sv_max_mesg, + 4 * serv->sv_max_mesg); + svc_tcp_init(svsk, serv); + } + + dprintk("svc: svc_setup_socket created %p (inet %p)\n", + svsk, svsk->sk_sk); + + return svsk; +} + +/** + * svc_addsock - add a listener socket to an RPC service + * @serv: pointer to RPC service to which to add a new listener + * @fd: file descriptor of the new listener + * @name_return: pointer to buffer to fill in with name of listener + * @len: size of the buffer + * + * Fills in socket name and returns positive length of name if successful. + * Name is terminated with '\n'. On error, returns a negative errno + * value. 
+ */ +int svc_addsock(struct svc_serv *serv, const int fd, char *name_return, + const size_t len) +{ + int err = 0; + struct socket *so = sockfd_lookup(fd, &err); + struct svc_sock *svsk = NULL; + + if (!so) + return err; + if ((so->sk->sk_family != PF_INET) && (so->sk->sk_family != PF_INET6)) + err = -EAFNOSUPPORT; + else if (so->sk->sk_protocol != IPPROTO_TCP && + so->sk->sk_protocol != IPPROTO_UDP) + err = -EPROTONOSUPPORT; + else if (so->state > SS_UNCONNECTED) + err = -EISCONN; + else { + if (!try_module_get(THIS_MODULE)) + err = -ENOENT; + else + svsk = svc_setup_socket(serv, so, &err, + SVC_SOCK_DEFAULTS); + if (svsk) { + struct sockaddr_storage addr; + struct sockaddr *sin = (struct sockaddr *)&addr; + int salen; + if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0) + svc_xprt_set_local(&svsk->sk_xprt, sin, salen); + clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags); + spin_lock_bh(&serv->sv_lock); + list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks); + spin_unlock_bh(&serv->sv_lock); + svc_xprt_received(&svsk->sk_xprt); + err = 0; + } else + module_put(THIS_MODULE); + } + if (err) { + sockfd_put(so); + return err; + } + return svc_one_sock_name(svsk, name_return, len); +} +EXPORT_SYMBOL_GPL(svc_addsock); + +/* + * Create socket for RPC service. + */ +static struct svc_xprt *svc_create_socket(struct svc_serv *serv, + int protocol, + struct net *net, + struct sockaddr *sin, int len, + int flags) +{ + struct svc_sock *svsk; + struct socket *sock; + int error; + int type; + struct sockaddr_storage addr; + struct sockaddr *newsin = (struct sockaddr *)&addr; + int newlen; + int family; + int val; + RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); + + dprintk("svc: svc_create_socket(%s, %d, %s)\n", + serv->sv_program->pg_name, protocol, + __svc_print_addr(sin, buf, sizeof(buf))); + + if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) { + printk(KERN_WARNING "svc: only UDP and TCP " + "sockets supported\n"); + return ERR_PTR(-EINVAL); + } + + type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; + switch (sin->sa_family) { + case AF_INET6: + family = PF_INET6; + break; + case AF_INET: + family = PF_INET; + break; + default: + return ERR_PTR(-EINVAL); + } + + error = __sock_create(net, family, type, protocol, &sock, 1); + if (error < 0) + return ERR_PTR(error); + + svc_reclassify_socket(sock); + + /* + * If this is an PF_INET6 listener, we want to avoid + * getting requests from IPv4 remotes. Those should + * be shunted to a PF_INET listener via rpcbind. + */ + val = 1; + if (family == PF_INET6) + kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, + (char *)&val, sizeof(val)); + + if (type == SOCK_STREAM) + sock->sk->sk_reuse = 1; /* allow address reuse */ + error = kernel_bind(sock, sin, len); + if (error < 0) + goto bummer; + + newlen = len; + error = kernel_getsockname(sock, newsin, &newlen); + if (error < 0) + goto bummer; + + if (protocol == IPPROTO_TCP) { + if ((error = kernel_listen(sock, 64)) < 0) + goto bummer; + } + + if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) { + svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen); + return (struct svc_xprt *)svsk; + } + +bummer: + dprintk("svc: svc_create_socket error = %d\n", -error); + sock_release(sock); + return ERR_PTR(error); +} + +/* + * Detach the svc_sock from the socket so that no + * more callbacks occur. 
+ */ +static void svc_sock_detach(struct svc_xprt *xprt) +{ + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct sock *sk = svsk->sk_sk; + wait_queue_head_t *wq; + + dprintk("svc: svc_sock_detach(%p)\n", svsk); + + /* put back the old socket callbacks */ + sk->sk_state_change = svsk->sk_ostate; + sk->sk_data_ready = svsk->sk_odata; + sk->sk_write_space = svsk->sk_owspace; + + wq = sk_sleep(sk); + if (wq && waitqueue_active(wq)) + wake_up_interruptible(wq); +} + +/* + * Disconnect the socket, and reset the callbacks + */ +static void svc_tcp_sock_detach(struct svc_xprt *xprt) +{ + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + + dprintk("svc: svc_tcp_sock_detach(%p)\n", svsk); + + svc_sock_detach(xprt); + + if (!test_bit(XPT_LISTENER, &xprt->xpt_flags)) { + svc_tcp_clear_pages(svsk); + kernel_sock_shutdown(svsk->sk_sock, SHUT_RDWR); + } +} + +/* + * Free the svc_sock's socket resources and the svc_sock itself. + */ +static void svc_sock_free(struct svc_xprt *xprt) +{ + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + dprintk("svc: svc_sock_free(%p)\n", svsk); + + if (svsk->sk_sock->file) + sockfd_put(svsk->sk_sock); + else + sock_release(svsk->sk_sock); + kfree(svsk); +} + +#if defined(CONFIG_NFS_V4_1) +/* + * Create a back channel svc_xprt which shares the fore channel socket. + */ +static struct svc_xprt *svc_bc_create_socket(struct svc_serv *serv, + int protocol, + struct net *net, + struct sockaddr *sin, int len, + int flags) +{ + struct svc_sock *svsk; + struct svc_xprt *xprt; + + if (protocol != IPPROTO_TCP) { + printk(KERN_WARNING "svc: only TCP sockets" + " supported on shared back channel\n"); + return ERR_PTR(-EINVAL); + } + + svsk = kzalloc(sizeof(*svsk), GFP_KERNEL); + if (!svsk) + return ERR_PTR(-ENOMEM); + + xprt = &svsk->sk_xprt; + svc_xprt_init(&svc_tcp_bc_class, xprt, serv); + + serv->sv_bc_xprt = xprt; + + return xprt; +} + +/* + * Free a back channel svc_sock. + */ +static void svc_bc_sock_free(struct svc_xprt *xprt) +{ + if (xprt) + kfree(container_of(xprt, struct svc_sock, sk_xprt)); +} +#endif /* CONFIG_NFS_V4_1 */ diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c new file mode 100644 index 00000000..e65dcc61 --- /dev/null +++ b/net/sunrpc/sysctl.c @@ -0,0 +1,183 @@ +/* + * linux/net/sunrpc/sysctl.c + * + * Sysctl interface to sunrpc module. + * + * I would prefer to register the sunrpc table below sys/net, but that's + * impossible at the moment. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* + * Declare the debug flags here + */ +unsigned int rpc_debug; +EXPORT_SYMBOL_GPL(rpc_debug); + +unsigned int nfs_debug; +EXPORT_SYMBOL_GPL(nfs_debug); + +unsigned int nfsd_debug; +EXPORT_SYMBOL_GPL(nfsd_debug); + +unsigned int nlm_debug; +EXPORT_SYMBOL_GPL(nlm_debug); + +#ifdef RPC_DEBUG + +static struct ctl_table_header *sunrpc_table_header; +static ctl_table sunrpc_table[]; + +void +rpc_register_sysctl(void) +{ + if (!sunrpc_table_header) + sunrpc_table_header = register_sysctl_table(sunrpc_table); +} + +void +rpc_unregister_sysctl(void) +{ + if (sunrpc_table_header) { + unregister_sysctl_table(sunrpc_table_header); + sunrpc_table_header = NULL; + } +} + +static int proc_do_xprt(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char tmpbuf[256]; + size_t len; + + if ((*ppos && !write) || !*lenp) { + *lenp = 0; + return 0; + } + len = svc_print_xprts(tmpbuf, sizeof(tmpbuf)); + return simple_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); +} + +static int +proc_dodebug(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char tmpbuf[20], c, *s; + char __user *p; + unsigned int value; + size_t left, len; + + if ((*ppos && !write) || !*lenp) { + *lenp = 0; + return 0; + } + + left = *lenp; + + if (write) { + if (!access_ok(VERIFY_READ, buffer, left)) + return -EFAULT; + p = buffer; + while (left && __get_user(c, p) >= 0 && isspace(c)) + left--, p++; + if (!left) + goto done; + + if (left > sizeof(tmpbuf) - 1) + return -EINVAL; + if (copy_from_user(tmpbuf, p, left)) + return -EFAULT; + tmpbuf[left] = '\0'; + + for (s = tmpbuf, value = 0; '0' <= *s && *s <= '9'; s++, left--) + value = 10 * value + (*s - '0'); + if (*s && !isspace(*s)) + return -EINVAL; + while (left && isspace(*s)) + left--, s++; + *(unsigned int *) table->data = value; + /* Display the RPC tasks on writing to rpc_debug */ + if (strcmp(table->procname, "rpc_debug") == 0) + rpc_show_tasks(); + } else { + if (!access_ok(VERIFY_WRITE, buffer, left)) + return -EFAULT; + len = sprintf(tmpbuf, "%d", *(unsigned int *) table->data); + if (len > left) + len = left; + if (__copy_to_user(buffer, tmpbuf, len)) + return -EFAULT; + if ((left -= len) > 0) { + if (put_user('\n', (char __user *)buffer + len)) + return -EFAULT; + left--; + } + } + +done: + *lenp -= left; + *ppos += *lenp; + return 0; +} + + +static ctl_table debug_table[] = { + { + .procname = "rpc_debug", + .data = &rpc_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dodebug + }, + { + .procname = "nfs_debug", + .data = &nfs_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dodebug + }, + { + .procname = "nfsd_debug", + .data = &nfsd_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dodebug + }, + { + .procname = "nlm_debug", + .data = &nlm_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dodebug + }, + { + .procname = "transports", + .maxlen = 256, + .mode = 0444, + .proc_handler = proc_do_xprt, + }, + { } +}; + +static ctl_table sunrpc_table[] = { + { + .procname = "sunrpc", + .mode = 0555, + .child = debug_table + }, + { } +}; + +#endif diff --git a/net/sunrpc/timer.c b/net/sunrpc/timer.c new file mode 100644 index 00000000..dd824341 --- /dev/null +++ b/net/sunrpc/timer.c @@ -0,0 +1,122 @@ +/* + * linux/net/sunrpc/timer.c + * + * Estimate RPC request round trip time. 
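+ *
+ * srtt is kept scaled by 8 and sdrtt (the mean deviation) by 4, so the
+ * value returned by rpc_calc_rto(), ((srtt + 7) >> 3) + sdrtt, works
+ * out to roughly SRTT + 4 * MDEV, capped at RPC_RTO_MAX.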
+ * + * Based on packet round-trip and variance estimator algorithms described + * in appendix A of "Congestion Avoidance and Control" by Van Jacobson + * and Michael J. Karels (ACM Computer Communication Review; Proceedings + * of the Sigcomm '88 Symposium in Stanford, CA, August, 1988). + * + * This RTT estimator is used only for RPC over datagram protocols. + * + * Copyright (C) 2002 Trond Myklebust + */ + +#include + +#include +#include +#include + +#include + +#define RPC_RTO_MAX (60*HZ) +#define RPC_RTO_INIT (HZ/5) +#define RPC_RTO_MIN (HZ/10) + +/** + * rpc_init_rtt - Initialize an RPC RTT estimator context + * @rt: context to initialize + * @timeo: initial timeout value, in jiffies + * + */ +void rpc_init_rtt(struct rpc_rtt *rt, unsigned long timeo) +{ + unsigned long init = 0; + unsigned i; + + rt->timeo = timeo; + + if (timeo > RPC_RTO_INIT) + init = (timeo - RPC_RTO_INIT) << 3; + for (i = 0; i < 5; i++) { + rt->srtt[i] = init; + rt->sdrtt[i] = RPC_RTO_INIT; + rt->ntimeouts[i] = 0; + } +} +EXPORT_SYMBOL_GPL(rpc_init_rtt); + +/** + * rpc_update_rtt - Update an RPC RTT estimator context + * @rt: context to update + * @timer: timer array index (request type) + * @m: recent actual RTT, in jiffies + * + * NB: When computing the smoothed RTT and standard deviation, + * be careful not to produce negative intermediate results. + */ +void rpc_update_rtt(struct rpc_rtt *rt, unsigned timer, long m) +{ + long *srtt, *sdrtt; + + if (timer-- == 0) + return; + + /* jiffies wrapped; ignore this one */ + if (m < 0) + return; + + if (m == 0) + m = 1L; + + srtt = (long *)&rt->srtt[timer]; + m -= *srtt >> 3; + *srtt += m; + + if (m < 0) + m = -m; + + sdrtt = (long *)&rt->sdrtt[timer]; + m -= *sdrtt >> 2; + *sdrtt += m; + + /* Set lower bound on the variance */ + if (*sdrtt < RPC_RTO_MIN) + *sdrtt = RPC_RTO_MIN; +} +EXPORT_SYMBOL_GPL(rpc_update_rtt); + +/** + * rpc_calc_rto - Provide an estimated timeout value + * @rt: context to use for calculation + * @timer: timer array index (request type) + * + * Estimate RTO for an NFS RPC sent via an unreliable datagram. Use + * the mean and mean deviation of RTT for the appropriate type of RPC + * for frequently issued RPCs, and a fixed default for the others. + * + * The justification for doing "other" this way is that these RPCs + * happen so infrequently that timer estimation would probably be + * stale. Also, since many of these RPCs are non-idempotent, a + * conservative timeout is desired. + * + * getattr, lookup, + * read, write, commit - A+4D + * other - timeo + */ +unsigned long rpc_calc_rto(struct rpc_rtt *rt, unsigned timer) +{ + unsigned long res; + + if (timer-- == 0) + return rt->timeo; + + res = ((rt->srtt[timer] + 7) >> 3) + rt->sdrtt[timer]; + if (res > RPC_RTO_MAX) + res = RPC_RTO_MAX; + + return res; +} +EXPORT_SYMBOL_GPL(rpc_calc_rto); diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c new file mode 100644 index 00000000..f008c14a --- /dev/null +++ b/net/sunrpc/xdr.c @@ -0,0 +1,1267 @@ +/* + * linux/net/sunrpc/xdr.c + * + * Generic XDR support. 
+ * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * XDR functions for basic NFS types + */ +__be32 * +xdr_encode_netobj(__be32 *p, const struct xdr_netobj *obj) +{ + unsigned int quadlen = XDR_QUADLEN(obj->len); + + p[quadlen] = 0; /* zero trailing bytes */ + *p++ = cpu_to_be32(obj->len); + memcpy(p, obj->data, obj->len); + return p + XDR_QUADLEN(obj->len); +} +EXPORT_SYMBOL_GPL(xdr_encode_netobj); + +__be32 * +xdr_decode_netobj(__be32 *p, struct xdr_netobj *obj) +{ + unsigned int len; + + if ((len = be32_to_cpu(*p++)) > XDR_MAX_NETOBJ) + return NULL; + obj->len = len; + obj->data = (u8 *) p; + return p + XDR_QUADLEN(len); +} +EXPORT_SYMBOL_GPL(xdr_decode_netobj); + +/** + * xdr_encode_opaque_fixed - Encode fixed length opaque data + * @p: pointer to current position in XDR buffer. + * @ptr: pointer to data to encode (or NULL) + * @nbytes: size of data. + * + * Copy the array of data of length nbytes at ptr to the XDR buffer + * at position p, then align to the next 32-bit boundary by padding + * with zero bytes (see RFC1832). + * Note: if ptr is NULL, only the padding is performed. + * + * Returns the updated current XDR buffer position + * + */ +__be32 *xdr_encode_opaque_fixed(__be32 *p, const void *ptr, unsigned int nbytes) +{ + if (likely(nbytes != 0)) { + unsigned int quadlen = XDR_QUADLEN(nbytes); + unsigned int padding = (quadlen << 2) - nbytes; + + if (ptr != NULL) + memcpy(p, ptr, nbytes); + if (padding != 0) + memset((char *)p + nbytes, 0, padding); + p += quadlen; + } + return p; +} +EXPORT_SYMBOL_GPL(xdr_encode_opaque_fixed); + +/** + * xdr_encode_opaque - Encode variable length opaque data + * @p: pointer to current position in XDR buffer. + * @ptr: pointer to data to encode (or NULL) + * @nbytes: size of data. 
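+ *
+ * The length word is written first and the data is zero-padded to a
+ * 32-bit boundary; a 5-byte object, for example, occupies 12 bytes on
+ * the wire: a 4-byte length, 5 data bytes and 3 bytes of padding.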
+ * + * Returns the updated current XDR buffer position + */ +__be32 *xdr_encode_opaque(__be32 *p, const void *ptr, unsigned int nbytes) +{ + *p++ = cpu_to_be32(nbytes); + return xdr_encode_opaque_fixed(p, ptr, nbytes); +} +EXPORT_SYMBOL_GPL(xdr_encode_opaque); + +__be32 * +xdr_encode_string(__be32 *p, const char *string) +{ + return xdr_encode_array(p, string, strlen(string)); +} +EXPORT_SYMBOL_GPL(xdr_encode_string); + +__be32 * +xdr_decode_string_inplace(__be32 *p, char **sp, + unsigned int *lenp, unsigned int maxlen) +{ + u32 len; + + len = be32_to_cpu(*p++); + if (len > maxlen) + return NULL; + *lenp = len; + *sp = (char *) p; + return p + XDR_QUADLEN(len); +} +EXPORT_SYMBOL_GPL(xdr_decode_string_inplace); + +/** + * xdr_terminate_string - '\0'-terminate a string residing in an xdr_buf + * @buf: XDR buffer where string resides + * @len: length of string, in bytes + * + */ +void +xdr_terminate_string(struct xdr_buf *buf, const u32 len) +{ + char *kaddr; + + kaddr = kmap_atomic(buf->pages[0], KM_USER0); + kaddr[buf->page_base + len] = '\0'; + kunmap_atomic(kaddr, KM_USER0); +} +EXPORT_SYMBOL(xdr_terminate_string); + +void +xdr_encode_pages(struct xdr_buf *xdr, struct page **pages, unsigned int base, + unsigned int len) +{ + struct kvec *tail = xdr->tail; + u32 *p; + + xdr->pages = pages; + xdr->page_base = base; + xdr->page_len = len; + + p = (u32 *)xdr->head[0].iov_base + XDR_QUADLEN(xdr->head[0].iov_len); + tail->iov_base = p; + tail->iov_len = 0; + + if (len & 3) { + unsigned int pad = 4 - (len & 3); + + *p = 0; + tail->iov_base = (char *)p + (len & 3); + tail->iov_len = pad; + len += pad; + } + xdr->buflen += len; + xdr->len += len; +} +EXPORT_SYMBOL_GPL(xdr_encode_pages); + +void +xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, + struct page **pages, unsigned int base, unsigned int len) +{ + struct kvec *head = xdr->head; + struct kvec *tail = xdr->tail; + char *buf = (char *)head->iov_base; + unsigned int buflen = head->iov_len; + + head->iov_len = offset; + + xdr->pages = pages; + xdr->page_base = base; + xdr->page_len = len; + + tail->iov_base = buf + offset; + tail->iov_len = buflen - offset; + + xdr->buflen += len; +} +EXPORT_SYMBOL_GPL(xdr_inline_pages); + +/* + * Helper routines for doing 'memmove' like operations on a struct xdr_buf + * + * _shift_data_right_pages + * @pages: vector of pages containing both the source and dest memory area. + * @pgto_base: page vector address of destination + * @pgfrom_base: page vector address of source + * @len: number of bytes to copy + * + * Note: the addresses pgto_base and pgfrom_base are both calculated in + * the same way: + * if a memory area starts at byte 'base' in page 'pages[i]', + * then its address is given as (i << PAGE_CACHE_SHIFT) + base + * Also note: pgfrom_base must be < pgto_base, but the memory areas + * they point to may overlap. + */ +static void +_shift_data_right_pages(struct page **pages, size_t pgto_base, + size_t pgfrom_base, size_t len) +{ + struct page **pgfrom, **pgto; + char *vfrom, *vto; + size_t copy; + + BUG_ON(pgto_base <= pgfrom_base); + + pgto_base += len; + pgfrom_base += len; + + pgto = pages + (pgto_base >> PAGE_CACHE_SHIFT); + pgfrom = pages + (pgfrom_base >> PAGE_CACHE_SHIFT); + + pgto_base &= ~PAGE_CACHE_MASK; + pgfrom_base &= ~PAGE_CACHE_MASK; + + do { + /* Are any pointers crossing a page boundary? 
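+ * (The copy runs backwards, from the ends of the two areas towards
+ * their starts, so when an offset reaches zero we step back one page
+ * and continue from the end of that page.)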
*/ + if (pgto_base == 0) { + pgto_base = PAGE_CACHE_SIZE; + pgto--; + } + if (pgfrom_base == 0) { + pgfrom_base = PAGE_CACHE_SIZE; + pgfrom--; + } + + copy = len; + if (copy > pgto_base) + copy = pgto_base; + if (copy > pgfrom_base) + copy = pgfrom_base; + pgto_base -= copy; + pgfrom_base -= copy; + + vto = kmap_atomic(*pgto, KM_USER0); + vfrom = kmap_atomic(*pgfrom, KM_USER1); + memmove(vto + pgto_base, vfrom + pgfrom_base, copy); + flush_dcache_page(*pgto); + kunmap_atomic(vfrom, KM_USER1); + kunmap_atomic(vto, KM_USER0); + + } while ((len -= copy) != 0); +} + +/* + * _copy_to_pages + * @pages: array of pages + * @pgbase: page vector address of destination + * @p: pointer to source data + * @len: length + * + * Copies data from an arbitrary memory location into an array of pages + * The copy is assumed to be non-overlapping. + */ +static void +_copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len) +{ + struct page **pgto; + char *vto; + size_t copy; + + pgto = pages + (pgbase >> PAGE_CACHE_SHIFT); + pgbase &= ~PAGE_CACHE_MASK; + + for (;;) { + copy = PAGE_CACHE_SIZE - pgbase; + if (copy > len) + copy = len; + + vto = kmap_atomic(*pgto, KM_USER0); + memcpy(vto + pgbase, p, copy); + kunmap_atomic(vto, KM_USER0); + + len -= copy; + if (len == 0) + break; + + pgbase += copy; + if (pgbase == PAGE_CACHE_SIZE) { + flush_dcache_page(*pgto); + pgbase = 0; + pgto++; + } + p += copy; + } + flush_dcache_page(*pgto); +} + +/* + * _copy_from_pages + * @p: pointer to destination + * @pages: array of pages + * @pgbase: offset of source data + * @len: length + * + * Copies data into an arbitrary memory location from an array of pages + * The copy is assumed to be non-overlapping. + */ +static void +_copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) +{ + struct page **pgfrom; + char *vfrom; + size_t copy; + + pgfrom = pages + (pgbase >> PAGE_CACHE_SHIFT); + pgbase &= ~PAGE_CACHE_MASK; + + do { + copy = PAGE_CACHE_SIZE - pgbase; + if (copy > len) + copy = len; + + vfrom = kmap_atomic(*pgfrom, KM_USER0); + memcpy(p, vfrom + pgbase, copy); + kunmap_atomic(vfrom, KM_USER0); + + pgbase += copy; + if (pgbase == PAGE_CACHE_SIZE) { + pgbase = 0; + pgfrom++; + } + p += copy; + + } while ((len -= copy) != 0); +} + +/* + * xdr_shrink_bufhead + * @buf: xdr_buf + * @len: bytes to remove from buf->head[0] + * + * Shrinks XDR buffer's header kvec buf->head[0] by + * 'len' bytes. The extra data is not lost, but is instead + * moved into the inlined pages and/or the tail. + */ +static void +xdr_shrink_bufhead(struct xdr_buf *buf, size_t len) +{ + struct kvec *head, *tail; + size_t copy, offs; + unsigned int pglen = buf->page_len; + + tail = buf->tail; + head = buf->head; + BUG_ON (len > head->iov_len); + + /* Shift the tail first */ + if (tail->iov_len != 0) { + if (tail->iov_len > len) { + copy = tail->iov_len - len; + memmove((char *)tail->iov_base + len, + tail->iov_base, copy); + } + /* Copy from the inlined pages into the tail */ + copy = len; + if (copy > pglen) + copy = pglen; + offs = len - copy; + if (offs >= tail->iov_len) + copy = 0; + else if (copy > tail->iov_len - offs) + copy = tail->iov_len - offs; + if (copy != 0) + _copy_from_pages((char *)tail->iov_base + offs, + buf->pages, + buf->page_base + pglen + offs - len, + copy); + /* Do we also need to copy data from the head into the tail ? 
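+ * (Yes, whenever len exceeds the page data: the last len - pglen bytes
+ * of the head spill over into the start of the tail.)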
*/ + if (len > pglen) { + offs = copy = len - pglen; + if (copy > tail->iov_len) + copy = tail->iov_len; + memcpy(tail->iov_base, + (char *)head->iov_base + + head->iov_len - offs, + copy); + } + } + /* Now handle pages */ + if (pglen != 0) { + if (pglen > len) + _shift_data_right_pages(buf->pages, + buf->page_base + len, + buf->page_base, + pglen - len); + copy = len; + if (len > pglen) + copy = pglen; + _copy_to_pages(buf->pages, buf->page_base, + (char *)head->iov_base + head->iov_len - len, + copy); + } + head->iov_len -= len; + buf->buflen -= len; + /* Have we truncated the message? */ + if (buf->len > buf->buflen) + buf->len = buf->buflen; +} + +/* + * xdr_shrink_pagelen + * @buf: xdr_buf + * @len: bytes to remove from buf->pages + * + * Shrinks XDR buffer's page array buf->pages by + * 'len' bytes. The extra data is not lost, but is instead + * moved into the tail. + */ +static void +xdr_shrink_pagelen(struct xdr_buf *buf, size_t len) +{ + struct kvec *tail; + size_t copy; + unsigned int pglen = buf->page_len; + unsigned int tailbuf_len; + + tail = buf->tail; + BUG_ON (len > pglen); + + tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len; + + /* Shift the tail first */ + if (tailbuf_len != 0) { + unsigned int free_space = tailbuf_len - tail->iov_len; + + if (len < free_space) + free_space = len; + tail->iov_len += free_space; + + copy = len; + if (tail->iov_len > len) { + char *p = (char *)tail->iov_base + len; + memmove(p, tail->iov_base, tail->iov_len - len); + } else + copy = tail->iov_len; + /* Copy from the inlined pages into the tail */ + _copy_from_pages((char *)tail->iov_base, + buf->pages, buf->page_base + pglen - len, + copy); + } + buf->page_len -= len; + buf->buflen -= len; + /* Have we truncated the message? */ + if (buf->len > buf->buflen) + buf->len = buf->buflen; +} + +void +xdr_shift_buf(struct xdr_buf *buf, size_t len) +{ + xdr_shrink_bufhead(buf, len); +} +EXPORT_SYMBOL_GPL(xdr_shift_buf); + +/** + * xdr_init_encode - Initialize a struct xdr_stream for sending data. + * @xdr: pointer to xdr_stream struct + * @buf: pointer to XDR buffer in which to encode data + * @p: current pointer inside XDR buffer + * + * Note: at the moment the RPC client only passes the length of our + * scratch buffer in the xdr_buf's header kvec. Previously this + * meant we needed to call xdr_adjust_iovec() after encoding the + * data. With the new scheme, the xdr_stream manages the details + * of the buffer length, and takes care of adjusting the kvec + * length for us. + */ +void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p) +{ + struct kvec *iov = buf->head; + int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len; + + BUG_ON(scratch_len < 0); + xdr->buf = buf; + xdr->iov = iov; + xdr->p = (__be32 *)((char *)iov->iov_base + iov->iov_len); + xdr->end = (__be32 *)((char *)iov->iov_base + scratch_len); + BUG_ON(iov->iov_len > scratch_len); + + if (p != xdr->p && p != NULL) { + size_t len; + + BUG_ON(p < xdr->p || p > xdr->end); + len = (char *)p - (char *)xdr->p; + xdr->p = p; + buf->len += len; + iov->iov_len += len; + } +} +EXPORT_SYMBOL_GPL(xdr_init_encode); + +/** + * xdr_reserve_space - Reserve buffer space for sending + * @xdr: pointer to xdr_stream + * @nbytes: number of bytes to reserve + * + * Checks that we have enough buffer space to encode 'nbytes' more + * bytes of data. If so, update the total xdr_buf length, and + * adjust the length of the current kvec. 
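+ *
+ * The reservation is rounded up to a multiple of four bytes, so asking
+ * for 6 bytes advances the stream by 8.  Returns a pointer to the start
+ * of the reserved area, or NULL if the current kvec would overflow.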
+ */ +__be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes) +{ + __be32 *p = xdr->p; + __be32 *q; + + /* align nbytes on the next 32-bit boundary */ + nbytes += 3; + nbytes &= ~3; + q = p + (nbytes >> 2); + if (unlikely(q > xdr->end || q < p)) + return NULL; + xdr->p = q; + xdr->iov->iov_len += nbytes; + xdr->buf->len += nbytes; + return p; +} +EXPORT_SYMBOL_GPL(xdr_reserve_space); + +/** + * xdr_write_pages - Insert a list of pages into an XDR buffer for sending + * @xdr: pointer to xdr_stream + * @pages: list of pages + * @base: offset of first byte + * @len: length of data in bytes + * + */ +void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base, + unsigned int len) +{ + struct xdr_buf *buf = xdr->buf; + struct kvec *iov = buf->tail; + buf->pages = pages; + buf->page_base = base; + buf->page_len = len; + + iov->iov_base = (char *)xdr->p; + iov->iov_len = 0; + xdr->iov = iov; + + if (len & 3) { + unsigned int pad = 4 - (len & 3); + + BUG_ON(xdr->p >= xdr->end); + iov->iov_base = (char *)xdr->p + (len & 3); + iov->iov_len += pad; + len += pad; + *xdr->p++ = 0; + } + buf->buflen += len; + buf->len += len; +} +EXPORT_SYMBOL_GPL(xdr_write_pages); + +static void xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov, + __be32 *p, unsigned int len) +{ + if (len > iov->iov_len) + len = iov->iov_len; + if (p == NULL) + p = (__be32*)iov->iov_base; + xdr->p = p; + xdr->end = (__be32*)(iov->iov_base + len); + xdr->iov = iov; + xdr->page_ptr = NULL; +} + +static int xdr_set_page_base(struct xdr_stream *xdr, + unsigned int base, unsigned int len) +{ + unsigned int pgnr; + unsigned int maxlen; + unsigned int pgoff; + unsigned int pgend; + void *kaddr; + + maxlen = xdr->buf->page_len; + if (base >= maxlen) + return -EINVAL; + maxlen -= base; + if (len > maxlen) + len = maxlen; + + base += xdr->buf->page_base; + + pgnr = base >> PAGE_SHIFT; + xdr->page_ptr = &xdr->buf->pages[pgnr]; + kaddr = page_address(*xdr->page_ptr); + + pgoff = base & ~PAGE_MASK; + xdr->p = (__be32*)(kaddr + pgoff); + + pgend = pgoff + len; + if (pgend > PAGE_SIZE) + pgend = PAGE_SIZE; + xdr->end = (__be32*)(kaddr + pgend); + xdr->iov = NULL; + return 0; +} + +static void xdr_set_next_page(struct xdr_stream *xdr) +{ + unsigned int newbase; + + newbase = (1 + xdr->page_ptr - xdr->buf->pages) << PAGE_SHIFT; + newbase -= xdr->buf->page_base; + + if (xdr_set_page_base(xdr, newbase, PAGE_SIZE) < 0) + xdr_set_iov(xdr, xdr->buf->tail, NULL, xdr->buf->len); +} + +static bool xdr_set_next_buffer(struct xdr_stream *xdr) +{ + if (xdr->page_ptr != NULL) + xdr_set_next_page(xdr); + else if (xdr->iov == xdr->buf->head) { + if (xdr_set_page_base(xdr, 0, PAGE_SIZE) < 0) + xdr_set_iov(xdr, xdr->buf->tail, NULL, xdr->buf->len); + } + return xdr->p != xdr->end; +} + +/** + * xdr_init_decode - Initialize an xdr_stream for decoding data. + * @xdr: pointer to xdr_stream struct + * @buf: pointer to XDR buffer from which to decode data + * @p: current pointer inside XDR buffer + */ +void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p) +{ + xdr->buf = buf; + xdr->scratch.iov_base = NULL; + xdr->scratch.iov_len = 0; + if (buf->head[0].iov_len != 0) + xdr_set_iov(xdr, buf->head, p, buf->len); + else if (buf->page_len != 0) + xdr_set_page_base(xdr, 0, buf->len); +} +EXPORT_SYMBOL_GPL(xdr_init_decode); + +/** + * xdr_init_decode - Initialize an xdr_stream for decoding data. 
+ * @xdr: pointer to xdr_stream struct + * @buf: pointer to XDR buffer from which to decode data + * @pages: list of pages to decode into + * @len: length in bytes of buffer in pages + */ +void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, + struct page **pages, unsigned int len) +{ + memset(buf, 0, sizeof(*buf)); + buf->pages = pages; + buf->page_len = len; + buf->buflen = len; + buf->len = len; + xdr_init_decode(xdr, buf, NULL); +} +EXPORT_SYMBOL_GPL(xdr_init_decode_pages); + +static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes) +{ + __be32 *p = xdr->p; + __be32 *q = p + XDR_QUADLEN(nbytes); + + if (unlikely(q > xdr->end || q < p)) + return NULL; + xdr->p = q; + return p; +} + +/** + * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data. + * @xdr: pointer to xdr_stream struct + * @buf: pointer to an empty buffer + * @buflen: size of 'buf' + * + * The scratch buffer is used when decoding from an array of pages. + * If an xdr_inline_decode() call spans across page boundaries, then + * we copy the data into the scratch buffer in order to allow linear + * access. + */ +void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen) +{ + xdr->scratch.iov_base = buf; + xdr->scratch.iov_len = buflen; +} +EXPORT_SYMBOL_GPL(xdr_set_scratch_buffer); + +static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes) +{ + __be32 *p; + void *cpdest = xdr->scratch.iov_base; + size_t cplen = (char *)xdr->end - (char *)xdr->p; + + if (nbytes > xdr->scratch.iov_len) + return NULL; + memcpy(cpdest, xdr->p, cplen); + cpdest += cplen; + nbytes -= cplen; + if (!xdr_set_next_buffer(xdr)) + return NULL; + p = __xdr_inline_decode(xdr, nbytes); + if (p == NULL) + return NULL; + memcpy(cpdest, p, nbytes); + return xdr->scratch.iov_base; +} + +/** + * xdr_inline_decode - Retrieve XDR data to decode + * @xdr: pointer to xdr_stream struct + * @nbytes: number of bytes of data to decode + * + * Check if the input buffer is long enough to enable us to decode + * 'nbytes' more bytes of data starting at the current position. + * If so return the current pointer, then update the current + * pointer position. + */ +__be32 * xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes) +{ + __be32 *p; + + if (nbytes == 0) + return xdr->p; + if (xdr->p == xdr->end && !xdr_set_next_buffer(xdr)) + return NULL; + p = __xdr_inline_decode(xdr, nbytes); + if (p != NULL) + return p; + return xdr_copy_to_scratch(xdr, nbytes); +} +EXPORT_SYMBOL_GPL(xdr_inline_decode); + +/** + * xdr_read_pages - Ensure page-based XDR data to decode is aligned at current pointer position + * @xdr: pointer to xdr_stream struct + * @len: number of bytes of page data + * + * Moves data beyond the current pointer position from the XDR head[] buffer + * into the page list. Any data that lies beyond current position + "len" + * bytes is moved into the XDR tail[]. + */ +void xdr_read_pages(struct xdr_stream *xdr, unsigned int len) +{ + struct xdr_buf *buf = xdr->buf; + struct kvec *iov; + ssize_t shift; + unsigned int end; + int padding; + + /* Realign pages to current pointer position */ + iov = buf->head; + shift = iov->iov_len + (char *)iov->iov_base - (char *)xdr->p; + if (shift > 0) + xdr_shrink_bufhead(buf, shift); + + /* Truncate page data and move it into the tail */ + if (buf->page_len > len) + xdr_shrink_pagelen(buf, buf->page_len - len); + padding = (XDR_QUADLEN(len) << 2) - len; + xdr->iov = iov = buf->tail; + /* Compute remaining message length. 
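+ * (buf->buflen - buf->len is buffer space that carries no message
+ * data, so it is subtracted from the usable tail length here.)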
*/ + end = iov->iov_len; + shift = buf->buflen - buf->len; + if (shift < end) + end -= shift; + else if (shift > 0) + end = 0; + /* + * Position current pointer at beginning of tail, and + * set remaining message length. + */ + xdr->p = (__be32 *)((char *)iov->iov_base + padding); + xdr->end = (__be32 *)((char *)iov->iov_base + end); +} +EXPORT_SYMBOL_GPL(xdr_read_pages); + +/** + * xdr_enter_page - decode data from the XDR page + * @xdr: pointer to xdr_stream struct + * @len: number of bytes of page data + * + * Moves data beyond the current pointer position from the XDR head[] buffer + * into the page list. Any data that lies beyond current position + "len" + * bytes is moved into the XDR tail[]. The current pointer is then + * repositioned at the beginning of the first XDR page. + */ +void xdr_enter_page(struct xdr_stream *xdr, unsigned int len) +{ + xdr_read_pages(xdr, len); + /* + * Position current pointer at beginning of tail, and + * set remaining message length. + */ + xdr_set_page_base(xdr, 0, len); +} +EXPORT_SYMBOL_GPL(xdr_enter_page); + +static struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0}; + +void +xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf) +{ + buf->head[0] = *iov; + buf->tail[0] = empty_iov; + buf->page_len = 0; + buf->buflen = buf->len = iov->iov_len; +} +EXPORT_SYMBOL_GPL(xdr_buf_from_iov); + +/* Sets subbuf to the portion of buf of length len beginning base bytes + * from the start of buf. Returns -1 if base of length are out of bounds. */ +int +xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, + unsigned int base, unsigned int len) +{ + subbuf->buflen = subbuf->len = len; + if (base < buf->head[0].iov_len) { + subbuf->head[0].iov_base = buf->head[0].iov_base + base; + subbuf->head[0].iov_len = min_t(unsigned int, len, + buf->head[0].iov_len - base); + len -= subbuf->head[0].iov_len; + base = 0; + } else { + subbuf->head[0].iov_base = NULL; + subbuf->head[0].iov_len = 0; + base -= buf->head[0].iov_len; + } + + if (base < buf->page_len) { + subbuf->page_len = min(buf->page_len - base, len); + base += buf->page_base; + subbuf->page_base = base & ~PAGE_CACHE_MASK; + subbuf->pages = &buf->pages[base >> PAGE_CACHE_SHIFT]; + len -= subbuf->page_len; + base = 0; + } else { + base -= buf->page_len; + subbuf->page_len = 0; + } + + if (base < buf->tail[0].iov_len) { + subbuf->tail[0].iov_base = buf->tail[0].iov_base + base; + subbuf->tail[0].iov_len = min_t(unsigned int, len, + buf->tail[0].iov_len - base); + len -= subbuf->tail[0].iov_len; + base = 0; + } else { + subbuf->tail[0].iov_base = NULL; + subbuf->tail[0].iov_len = 0; + base -= buf->tail[0].iov_len; + } + + if (base || len) + return -1; + return 0; +} +EXPORT_SYMBOL_GPL(xdr_buf_subsegment); + +static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len) +{ + unsigned int this_len; + + this_len = min_t(unsigned int, len, subbuf->head[0].iov_len); + memcpy(obj, subbuf->head[0].iov_base, this_len); + len -= this_len; + obj += this_len; + this_len = min_t(unsigned int, len, subbuf->page_len); + if (this_len) + _copy_from_pages(obj, subbuf->pages, subbuf->page_base, this_len); + len -= this_len; + obj += this_len; + this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len); + memcpy(obj, subbuf->tail[0].iov_base, this_len); +} + +/* obj is assumed to point to allocated memory of size at least len: */ +int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len) +{ + struct xdr_buf subbuf; + int status; + + status = 
xdr_buf_subsegment(buf, &subbuf, base, len); + if (status != 0) + return status; + __read_bytes_from_xdr_buf(&subbuf, obj, len); + return 0; +} +EXPORT_SYMBOL_GPL(read_bytes_from_xdr_buf); + +static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len) +{ + unsigned int this_len; + + this_len = min_t(unsigned int, len, subbuf->head[0].iov_len); + memcpy(subbuf->head[0].iov_base, obj, this_len); + len -= this_len; + obj += this_len; + this_len = min_t(unsigned int, len, subbuf->page_len); + if (this_len) + _copy_to_pages(subbuf->pages, subbuf->page_base, obj, this_len); + len -= this_len; + obj += this_len; + this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len); + memcpy(subbuf->tail[0].iov_base, obj, this_len); +} + +/* obj is assumed to point to allocated memory of size at least len: */ +int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len) +{ + struct xdr_buf subbuf; + int status; + + status = xdr_buf_subsegment(buf, &subbuf, base, len); + if (status != 0) + return status; + __write_bytes_to_xdr_buf(&subbuf, obj, len); + return 0; +} +EXPORT_SYMBOL_GPL(write_bytes_to_xdr_buf); + +int +xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj) +{ + __be32 raw; + int status; + + status = read_bytes_from_xdr_buf(buf, base, &raw, sizeof(*obj)); + if (status) + return status; + *obj = be32_to_cpu(raw); + return 0; +} +EXPORT_SYMBOL_GPL(xdr_decode_word); + +int +xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) +{ + __be32 raw = cpu_to_be32(obj); + + return write_bytes_to_xdr_buf(buf, base, &raw, sizeof(obj)); +} +EXPORT_SYMBOL_GPL(xdr_encode_word); + +/* If the netobj starting offset bytes from the start of xdr_buf is contained + * entirely in the head or the tail, set object to point to it; otherwise + * try to find space for it at the end of the tail, copy it there, and + * set obj to point to it. */ +int xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, unsigned int offset) +{ + struct xdr_buf subbuf; + + if (xdr_decode_word(buf, offset, &obj->len)) + return -EFAULT; + if (xdr_buf_subsegment(buf, &subbuf, offset + 4, obj->len)) + return -EFAULT; + + /* Is the obj contained entirely in the head? */ + obj->data = subbuf.head[0].iov_base; + if (subbuf.head[0].iov_len == obj->len) + return 0; + /* ..or is the obj contained entirely in the tail? */ + obj->data = subbuf.tail[0].iov_base; + if (subbuf.tail[0].iov_len == obj->len) + return 0; + + /* use end of tail as storage for obj: + * (We don't copy to the beginning because then we'd have + * to worry about doing a potentially overlapping copy. + * This assumes the object is at most half the length of the + * tail.) */ + if (obj->len > buf->buflen - buf->len) + return -ENOMEM; + if (buf->tail[0].iov_len != 0) + obj->data = buf->tail[0].iov_base + buf->tail[0].iov_len; + else + obj->data = buf->head[0].iov_base + buf->head[0].iov_len; + __read_bytes_from_xdr_buf(&subbuf, obj->data, obj->len); + return 0; +} +EXPORT_SYMBOL_GPL(xdr_buf_read_netobj); + +/* Returns 0 on success, or else a negative error code. 
*/ +static int +xdr_xcode_array2(struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc, int encode) +{ + char *elem = NULL, *c; + unsigned int copied = 0, todo, avail_here; + struct page **ppages = NULL; + int err; + + if (encode) { + if (xdr_encode_word(buf, base, desc->array_len) != 0) + return -EINVAL; + } else { + if (xdr_decode_word(buf, base, &desc->array_len) != 0 || + desc->array_len > desc->array_maxlen || + (unsigned long) base + 4 + desc->array_len * + desc->elem_size > buf->len) + return -EINVAL; + } + base += 4; + + if (!desc->xcode) + return 0; + + todo = desc->array_len * desc->elem_size; + + /* process head */ + if (todo && base < buf->head->iov_len) { + c = buf->head->iov_base + base; + avail_here = min_t(unsigned int, todo, + buf->head->iov_len - base); + todo -= avail_here; + + while (avail_here >= desc->elem_size) { + err = desc->xcode(desc, c); + if (err) + goto out; + c += desc->elem_size; + avail_here -= desc->elem_size; + } + if (avail_here) { + if (!elem) { + elem = kmalloc(desc->elem_size, GFP_KERNEL); + err = -ENOMEM; + if (!elem) + goto out; + } + if (encode) { + err = desc->xcode(desc, elem); + if (err) + goto out; + memcpy(c, elem, avail_here); + } else + memcpy(elem, c, avail_here); + copied = avail_here; + } + base = buf->head->iov_len; /* align to start of pages */ + } + + /* process pages array */ + base -= buf->head->iov_len; + if (todo && base < buf->page_len) { + unsigned int avail_page; + + avail_here = min(todo, buf->page_len - base); + todo -= avail_here; + + base += buf->page_base; + ppages = buf->pages + (base >> PAGE_CACHE_SHIFT); + base &= ~PAGE_CACHE_MASK; + avail_page = min_t(unsigned int, PAGE_CACHE_SIZE - base, + avail_here); + c = kmap(*ppages) + base; + + while (avail_here) { + avail_here -= avail_page; + if (copied || avail_page < desc->elem_size) { + unsigned int l = min(avail_page, + desc->elem_size - copied); + if (!elem) { + elem = kmalloc(desc->elem_size, + GFP_KERNEL); + err = -ENOMEM; + if (!elem) + goto out; + } + if (encode) { + if (!copied) { + err = desc->xcode(desc, elem); + if (err) + goto out; + } + memcpy(c, elem + copied, l); + copied += l; + if (copied == desc->elem_size) + copied = 0; + } else { + memcpy(elem + copied, c, l); + copied += l; + if (copied == desc->elem_size) { + err = desc->xcode(desc, elem); + if (err) + goto out; + copied = 0; + } + } + avail_page -= l; + c += l; + } + while (avail_page >= desc->elem_size) { + err = desc->xcode(desc, c); + if (err) + goto out; + c += desc->elem_size; + avail_page -= desc->elem_size; + } + if (avail_page) { + unsigned int l = min(avail_page, + desc->elem_size - copied); + if (!elem) { + elem = kmalloc(desc->elem_size, + GFP_KERNEL); + err = -ENOMEM; + if (!elem) + goto out; + } + if (encode) { + if (!copied) { + err = desc->xcode(desc, elem); + if (err) + goto out; + } + memcpy(c, elem + copied, l); + copied += l; + if (copied == desc->elem_size) + copied = 0; + } else { + memcpy(elem + copied, c, l); + copied += l; + if (copied == desc->elem_size) { + err = desc->xcode(desc, elem); + if (err) + goto out; + copied = 0; + } + } + } + if (avail_here) { + kunmap(*ppages); + ppages++; + c = kmap(*ppages); + } + + avail_page = min(avail_here, + (unsigned int) PAGE_CACHE_SIZE); + } + base = buf->page_len; /* align to start of tail */ + } + + /* process tail */ + base -= buf->page_len; + if (todo) { + c = buf->tail->iov_base + base; + if (copied) { + unsigned int l = desc->elem_size - copied; + + if (encode) + memcpy(c, elem + copied, l); + else { + memcpy(elem + 
copied, c, l); + err = desc->xcode(desc, elem); + if (err) + goto out; + } + todo -= l; + c += l; + } + while (todo) { + err = desc->xcode(desc, c); + if (err) + goto out; + c += desc->elem_size; + todo -= desc->elem_size; + } + } + err = 0; + +out: + kfree(elem); + if (ppages) + kunmap(*ppages); + return err; +} + +int +xdr_decode_array2(struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc) +{ + if (base >= buf->len) + return -EINVAL; + + return xdr_xcode_array2(buf, base, desc, 0); +} +EXPORT_SYMBOL_GPL(xdr_decode_array2); + +int +xdr_encode_array2(struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc) +{ + if ((unsigned long) base + 4 + desc->array_len * desc->elem_size > + buf->head->iov_len + buf->page_len + buf->tail->iov_len) + return -EINVAL; + + return xdr_xcode_array2(buf, base, desc, 1); +} +EXPORT_SYMBOL_GPL(xdr_encode_array2); + +int +xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, + int (*actor)(struct scatterlist *, void *), void *data) +{ + int i, ret = 0; + unsigned page_len, thislen, page_offset; + struct scatterlist sg[1]; + + sg_init_table(sg, 1); + + if (offset >= buf->head[0].iov_len) { + offset -= buf->head[0].iov_len; + } else { + thislen = buf->head[0].iov_len - offset; + if (thislen > len) + thislen = len; + sg_set_buf(sg, buf->head[0].iov_base + offset, thislen); + ret = actor(sg, data); + if (ret) + goto out; + offset = 0; + len -= thislen; + } + if (len == 0) + goto out; + + if (offset >= buf->page_len) { + offset -= buf->page_len; + } else { + page_len = buf->page_len - offset; + if (page_len > len) + page_len = len; + len -= page_len; + page_offset = (offset + buf->page_base) & (PAGE_CACHE_SIZE - 1); + i = (offset + buf->page_base) >> PAGE_CACHE_SHIFT; + thislen = PAGE_CACHE_SIZE - page_offset; + do { + if (thislen > page_len) + thislen = page_len; + sg_set_page(sg, buf->pages[i], thislen, page_offset); + ret = actor(sg, data); + if (ret) + goto out; + page_len -= thislen; + i++; + page_offset = 0; + thislen = PAGE_CACHE_SIZE; + } while (page_len != 0); + offset = 0; + } + if (len == 0) + goto out; + if (offset < buf->tail[0].iov_len) { + thislen = buf->tail[0].iov_len - offset; + if (thislen > len) + thislen = len; + sg_set_buf(sg, buf->tail[0].iov_base + offset, thislen); + ret = actor(sg, data); + len -= thislen; + } + if (len != 0) + ret = -EINVAL; +out: + return ret; +} +EXPORT_SYMBOL_GPL(xdr_process_buf); + diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c new file mode 100644 index 00000000..ce5eb68a --- /dev/null +++ b/net/sunrpc/xprt.c @@ -0,0 +1,1190 @@ +/* + * linux/net/sunrpc/xprt.c + * + * This is a generic RPC call interface supporting congestion avoidance, + * and asynchronous calls. + * + * The interface works like this: + * + * - When a process places a call, it allocates a request slot if + * one is available. Otherwise, it sleeps on the backlog queue + * (xprt_reserve). + * - Next, the caller puts together the RPC message, stuffs it into + * the request struct, and calls xprt_transmit(). + * - xprt_transmit sends the message and installs the caller on the + * transport's wait list. At the same time, if a reply is expected, + * it installs a timer that is run after the packet's timeout has + * expired. + * - When a packet arrives, the data_ready handler walks the list of + * pending requests for that transport. If a matching XID is found, the + * caller is woken up, and the timer removed. 
+ * - When no reply arrives within the timeout interval, the timer is + * fired by the kernel and runs xprt_timer(). It either adjusts the + * timeout values (minor timeout) or wakes up the caller with a status + * of -ETIMEDOUT. + * - When the caller receives a notification from RPC that a reply arrived, + * it should release the RPC slot, and process the reply. + * If the call timed out, it may choose to retry the operation by + * adjusting the initial timeout value, and simply calling rpc_call + * again. + * + * Support for async RPC is done through a set of RPC-specific scheduling + * primitives that `transparently' work for processes as well as async + * tasks that rely on callbacks. + * + * Copyright (C) 1995-1997, Olaf Kirch + * + * Transport switch API copyright (C) 2005, Chuck Lever + */ + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "sunrpc.h" + +/* + * Local variables + */ + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_XPRT +#endif + +/* + * Local functions + */ +static void xprt_request_init(struct rpc_task *, struct rpc_xprt *); +static void xprt_connect_status(struct rpc_task *task); +static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); + +static DEFINE_SPINLOCK(xprt_list_lock); +static LIST_HEAD(xprt_list); + +/* + * The transport code maintains an estimate on the maximum number of out- + * standing RPC requests, using a smoothed version of the congestion + * avoidance implemented in 44BSD. This is basically the Van Jacobson + * congestion algorithm: If a retransmit occurs, the congestion window is + * halved; otherwise, it is incremented by 1/cwnd when + * + * - a reply is received and + * - a full number of requests are outstanding and + * - the congestion window hasn't been updated recently. + */ +#define RPC_CWNDSHIFT (8U) +#define RPC_CWNDSCALE (1U << RPC_CWNDSHIFT) +#define RPC_INITCWND RPC_CWNDSCALE +#define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT) + +#define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd) + +/** + * xprt_register_transport - register a transport implementation + * @transport: transport to register + * + * If a transport implementation is loaded as a kernel module, it can + * call this interface to make itself known to the RPC client. 
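+ * Registration is keyed on the class's ->ident value, so a second class
+ * with the same ident is rejected.  A transport would typically call
+ * this from its module init routine and call xprt_unregister_transport()
+ * from its module exit routine.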
+ * + * Returns: + * 0: transport successfully registered + * -EEXIST: transport already registered + * -EINVAL: transport module being unloaded + */ +int xprt_register_transport(struct xprt_class *transport) +{ + struct xprt_class *t; + int result; + + result = -EEXIST; + spin_lock(&xprt_list_lock); + list_for_each_entry(t, &xprt_list, list) { + /* don't register the same transport class twice */ + if (t->ident == transport->ident) + goto out; + } + + list_add_tail(&transport->list, &xprt_list); + printk(KERN_INFO "RPC: Registered %s transport module.\n", + transport->name); + result = 0; + +out: + spin_unlock(&xprt_list_lock); + return result; +} +EXPORT_SYMBOL_GPL(xprt_register_transport); + +/** + * xprt_unregister_transport - unregister a transport implementation + * @transport: transport to unregister + * + * Returns: + * 0: transport successfully unregistered + * -ENOENT: transport never registered + */ +int xprt_unregister_transport(struct xprt_class *transport) +{ + struct xprt_class *t; + int result; + + result = 0; + spin_lock(&xprt_list_lock); + list_for_each_entry(t, &xprt_list, list) { + if (t == transport) { + printk(KERN_INFO + "RPC: Unregistered %s transport module.\n", + transport->name); + list_del_init(&transport->list); + goto out; + } + } + result = -ENOENT; + +out: + spin_unlock(&xprt_list_lock); + return result; +} +EXPORT_SYMBOL_GPL(xprt_unregister_transport); + +/** + * xprt_load_transport - load a transport implementation + * @transport_name: transport to load + * + * Returns: + * 0: transport successfully loaded + * -ENOENT: transport module not available + */ +int xprt_load_transport(const char *transport_name) +{ + struct xprt_class *t; + int result; + + result = 0; + spin_lock(&xprt_list_lock); + list_for_each_entry(t, &xprt_list, list) { + if (strcmp(t->name, transport_name) == 0) { + spin_unlock(&xprt_list_lock); + goto out; + } + } + spin_unlock(&xprt_list_lock); + result = request_module("xprt%s", transport_name); +out: + return result; +} +EXPORT_SYMBOL_GPL(xprt_load_transport); + +/** + * xprt_reserve_xprt - serialize write access to transports + * @task: task that is requesting access to the transport + * + * This prevents mixing the payload of separate requests, and prevents + * transport connects from colliding with writes. No congestion control + * is provided. 
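+ *
+ * Returns 1 if the caller now holds the transport lock, or 0 if the
+ * task has been put to sleep on the transport's sending (or resend)
+ * queue with tk_status set to -EAGAIN.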
+ */ +int xprt_reserve_xprt(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { + if (task == xprt->snd_task) + return 1; + goto out_sleep; + } + xprt->snd_task = task; + req->rq_bytes_sent = 0; + req->rq_ntrans++; + + return 1; + +out_sleep: + dprintk("RPC: %5u failed to lock transport %p\n", + task->tk_pid, xprt); + task->tk_timeout = 0; + task->tk_status = -EAGAIN; + if (req->rq_ntrans) + rpc_sleep_on(&xprt->resend, task, NULL); + else + rpc_sleep_on(&xprt->sending, task, NULL); + return 0; +} +EXPORT_SYMBOL_GPL(xprt_reserve_xprt); + +static void xprt_clear_locked(struct rpc_xprt *xprt) +{ + xprt->snd_task = NULL; + if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state) || xprt->shutdown) { + smp_mb__before_clear_bit(); + clear_bit(XPRT_LOCKED, &xprt->state); + smp_mb__after_clear_bit(); + } else + queue_work(rpciod_workqueue, &xprt->task_cleanup); +} + +/* + * xprt_reserve_xprt_cong - serialize write access to transports + * @task: task that is requesting access to the transport + * + * Same as xprt_reserve_xprt, but Van Jacobson congestion control is + * integrated into the decision of whether a request is allowed to be + * woken up and given access to the transport. + */ +int xprt_reserve_xprt_cong(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_rqst *req = task->tk_rqstp; + + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { + if (task == xprt->snd_task) + return 1; + goto out_sleep; + } + if (__xprt_get_cong(xprt, task)) { + xprt->snd_task = task; + if (req) { + req->rq_bytes_sent = 0; + req->rq_ntrans++; + } + return 1; + } + xprt_clear_locked(xprt); +out_sleep: + dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt); + task->tk_timeout = 0; + task->tk_status = -EAGAIN; + if (req && req->rq_ntrans) + rpc_sleep_on(&xprt->resend, task, NULL); + else + rpc_sleep_on(&xprt->sending, task, NULL); + return 0; +} +EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong); + +static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) +{ + int retval; + + spin_lock_bh(&xprt->transport_lock); + retval = xprt->ops->reserve_xprt(task); + spin_unlock_bh(&xprt->transport_lock); + return retval; +} + +static void __xprt_lock_write_next(struct rpc_xprt *xprt) +{ + struct rpc_task *task; + struct rpc_rqst *req; + + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) + return; + + task = rpc_wake_up_next(&xprt->resend); + if (!task) { + task = rpc_wake_up_next(&xprt->sending); + if (!task) + goto out_unlock; + } + + req = task->tk_rqstp; + xprt->snd_task = task; + if (req) { + req->rq_bytes_sent = 0; + req->rq_ntrans++; + } + return; + +out_unlock: + xprt_clear_locked(xprt); +} + +static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt) +{ + struct rpc_task *task; + + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) + return; + if (RPCXPRT_CONGESTED(xprt)) + goto out_unlock; + task = rpc_wake_up_next(&xprt->resend); + if (!task) { + task = rpc_wake_up_next(&xprt->sending); + if (!task) + goto out_unlock; + } + if (__xprt_get_cong(xprt, task)) { + struct rpc_rqst *req = task->tk_rqstp; + xprt->snd_task = task; + if (req) { + req->rq_bytes_sent = 0; + req->rq_ntrans++; + } + return; + } +out_unlock: + xprt_clear_locked(xprt); +} + +/** + * xprt_release_xprt - allow other requests to use a transport + * @xprt: transport with other tasks potentially waiting + * @task: task that is releasing access to the transport + * + * Note that "task" can be 
NULL. No congestion control is provided. + */ +void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) +{ + if (xprt->snd_task == task) { + xprt_clear_locked(xprt); + __xprt_lock_write_next(xprt); + } +} +EXPORT_SYMBOL_GPL(xprt_release_xprt); + +/** + * xprt_release_xprt_cong - allow other requests to use a transport + * @xprt: transport with other tasks potentially waiting + * @task: task that is releasing access to the transport + * + * Note that "task" can be NULL. Another task is awoken to use the + * transport if the transport's congestion window allows it. + */ +void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) +{ + if (xprt->snd_task == task) { + xprt_clear_locked(xprt); + __xprt_lock_write_next_cong(xprt); + } +} +EXPORT_SYMBOL_GPL(xprt_release_xprt_cong); + +static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) +{ + spin_lock_bh(&xprt->transport_lock); + xprt->ops->release_xprt(xprt, task); + spin_unlock_bh(&xprt->transport_lock); +} + +/* + * Van Jacobson congestion avoidance. Check if the congestion window + * overflowed. Put the task to sleep if this is the case. + */ +static int +__xprt_get_cong(struct rpc_xprt *xprt, struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + + if (req->rq_cong) + return 1; + dprintk("RPC: %5u xprt_cwnd_limited cong = %lu cwnd = %lu\n", + task->tk_pid, xprt->cong, xprt->cwnd); + if (RPCXPRT_CONGESTED(xprt)) + return 0; + req->rq_cong = 1; + xprt->cong += RPC_CWNDSCALE; + return 1; +} + +/* + * Adjust the congestion window, and wake up the next task + * that has been sleeping due to congestion + */ +static void +__xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) +{ + if (!req->rq_cong) + return; + req->rq_cong = 0; + xprt->cong -= RPC_CWNDSCALE; + __xprt_lock_write_next_cong(xprt); +} + +/** + * xprt_release_rqst_cong - housekeeping when request is complete + * @task: RPC request that recently completed + * + * Useful for transports that require congestion control. + */ +void xprt_release_rqst_cong(struct rpc_task *task) +{ + __xprt_put_cong(task->tk_xprt, task->tk_rqstp); +} +EXPORT_SYMBOL_GPL(xprt_release_rqst_cong); + +/** + * xprt_adjust_cwnd - adjust transport congestion window + * @task: recently completed RPC request used to adjust window + * @result: result code of completed RPC request + * + * We use a time-smoothed congestion estimator to avoid heavy oscillation. + */ +void xprt_adjust_cwnd(struct rpc_task *task, int result) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = task->tk_xprt; + unsigned long cwnd = xprt->cwnd; + + if (result >= 0 && cwnd <= xprt->cong) { + /* The (cwnd >> 1) term makes sure + * the result gets rounded properly. 
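+ * With cwnd scaled by RPC_CWNDSCALE (256), each reply grows the window
+ * by about one slot per window's worth of replies: at
+ * cwnd == 4 * RPC_CWNDSCALE, for instance, the increment below is
+ * (65536 + 512) / 1024 = 64, one quarter of a slot.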
*/ + cwnd += (RPC_CWNDSCALE * RPC_CWNDSCALE + (cwnd >> 1)) / cwnd; + if (cwnd > RPC_MAXCWND(xprt)) + cwnd = RPC_MAXCWND(xprt); + __xprt_lock_write_next_cong(xprt); + } else if (result == -ETIMEDOUT) { + cwnd >>= 1; + if (cwnd < RPC_CWNDSCALE) + cwnd = RPC_CWNDSCALE; + } + dprintk("RPC: cong %ld, cwnd was %ld, now %ld\n", + xprt->cong, xprt->cwnd, cwnd); + xprt->cwnd = cwnd; + __xprt_put_cong(xprt, req); +} +EXPORT_SYMBOL_GPL(xprt_adjust_cwnd); + +/** + * xprt_wake_pending_tasks - wake all tasks on a transport's pending queue + * @xprt: transport with waiting tasks + * @status: result code to plant in each task before waking it + * + */ +void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status) +{ + if (status < 0) + rpc_wake_up_status(&xprt->pending, status); + else + rpc_wake_up(&xprt->pending); +} +EXPORT_SYMBOL_GPL(xprt_wake_pending_tasks); + +/** + * xprt_wait_for_buffer_space - wait for transport output buffer to clear + * @task: task to be put to sleep + * @action: function pointer to be executed after wait + */ +void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + task->tk_timeout = req->rq_timeout; + rpc_sleep_on(&xprt->pending, task, action); +} +EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space); + +/** + * xprt_write_space - wake the task waiting for transport output buffer space + * @xprt: transport with waiting tasks + * + * Can be called in a soft IRQ context, so xprt_write_space never sleeps. + */ +void xprt_write_space(struct rpc_xprt *xprt) +{ + if (unlikely(xprt->shutdown)) + return; + + spin_lock_bh(&xprt->transport_lock); + if (xprt->snd_task) { + dprintk("RPC: write space: waking waiting task on " + "xprt %p\n", xprt); + rpc_wake_up_queued_task(&xprt->pending, xprt->snd_task); + } + spin_unlock_bh(&xprt->transport_lock); +} +EXPORT_SYMBOL_GPL(xprt_write_space); + +/** + * xprt_set_retrans_timeout_def - set a request's retransmit timeout + * @task: task whose timeout is to be set + * + * Set a request's retransmit timeout based on the transport's + * default timeout parameters. Used by transports that don't adjust + * the retransmit timeout based on round-trip time estimation. + */ +void xprt_set_retrans_timeout_def(struct rpc_task *task) +{ + task->tk_timeout = task->tk_rqstp->rq_timeout; +} +EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_def); + +/* + * xprt_set_retrans_timeout_rtt - set a request's retransmit timeout + * @task: task whose timeout is to be set + * + * Set a request's retransmit timeout using the RTT estimator. 
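+ * The estimate from rpc_calc_rto() is doubled once for every timeout
+ * already recorded for this call type (rpc_ntimeo) and once for every
+ * retransmission of this request, and is capped at the client's
+ * to_maxval.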
+ */ +void xprt_set_retrans_timeout_rtt(struct rpc_task *task) +{ + int timer = task->tk_msg.rpc_proc->p_timer; + struct rpc_clnt *clnt = task->tk_client; + struct rpc_rtt *rtt = clnt->cl_rtt; + struct rpc_rqst *req = task->tk_rqstp; + unsigned long max_timeout = clnt->cl_timeout->to_maxval; + + task->tk_timeout = rpc_calc_rto(rtt, timer); + task->tk_timeout <<= rpc_ntimeo(rtt, timer) + req->rq_retries; + if (task->tk_timeout > max_timeout || task->tk_timeout == 0) + task->tk_timeout = max_timeout; +} +EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_rtt); + +static void xprt_reset_majortimeo(struct rpc_rqst *req) +{ + const struct rpc_timeout *to = req->rq_task->tk_client->cl_timeout; + + req->rq_majortimeo = req->rq_timeout; + if (to->to_exponential) + req->rq_majortimeo <<= to->to_retries; + else + req->rq_majortimeo += to->to_increment * to->to_retries; + if (req->rq_majortimeo > to->to_maxval || req->rq_majortimeo == 0) + req->rq_majortimeo = to->to_maxval; + req->rq_majortimeo += jiffies; +} + +/** + * xprt_adjust_timeout - adjust timeout values for next retransmit + * @req: RPC request containing parameters to use for the adjustment + * + */ +int xprt_adjust_timeout(struct rpc_rqst *req) +{ + struct rpc_xprt *xprt = req->rq_xprt; + const struct rpc_timeout *to = req->rq_task->tk_client->cl_timeout; + int status = 0; + + if (time_before(jiffies, req->rq_majortimeo)) { + if (to->to_exponential) + req->rq_timeout <<= 1; + else + req->rq_timeout += to->to_increment; + if (to->to_maxval && req->rq_timeout >= to->to_maxval) + req->rq_timeout = to->to_maxval; + req->rq_retries++; + } else { + req->rq_timeout = to->to_initval; + req->rq_retries = 0; + xprt_reset_majortimeo(req); + /* Reset the RTT counters == "slow start" */ + spin_lock_bh(&xprt->transport_lock); + rpc_init_rtt(req->rq_task->tk_client->cl_rtt, to->to_initval); + spin_unlock_bh(&xprt->transport_lock); + status = -ETIMEDOUT; + } + + if (req->rq_timeout == 0) { + printk(KERN_WARNING "xprt_adjust_timeout: rq_timeout = 0!\n"); + req->rq_timeout = 5 * HZ; + } + return status; +} + +static void xprt_autoclose(struct work_struct *work) +{ + struct rpc_xprt *xprt = + container_of(work, struct rpc_xprt, task_cleanup); + + xprt->ops->close(xprt); + clear_bit(XPRT_CLOSE_WAIT, &xprt->state); + xprt_release_write(xprt, NULL); +} + +/** + * xprt_disconnect_done - mark a transport as disconnected + * @xprt: transport to flag for disconnect + * + */ +void xprt_disconnect_done(struct rpc_xprt *xprt) +{ + dprintk("RPC: disconnected transport %p\n", xprt); + spin_lock_bh(&xprt->transport_lock); + xprt_clear_connected(xprt); + xprt_wake_pending_tasks(xprt, -EAGAIN); + spin_unlock_bh(&xprt->transport_lock); +} +EXPORT_SYMBOL_GPL(xprt_disconnect_done); + +/** + * xprt_force_disconnect - force a transport to disconnect + * @xprt: transport to disconnect + * + */ +void xprt_force_disconnect(struct rpc_xprt *xprt) +{ + /* Don't race with the test_bit() in xprt_clear_locked() */ + spin_lock_bh(&xprt->transport_lock); + set_bit(XPRT_CLOSE_WAIT, &xprt->state); + /* Try to schedule an autoclose RPC call */ + if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) + queue_work(rpciod_workqueue, &xprt->task_cleanup); + xprt_wake_pending_tasks(xprt, -EAGAIN); + spin_unlock_bh(&xprt->transport_lock); +} + +/** + * xprt_conditional_disconnect - force a transport to disconnect + * @xprt: transport to disconnect + * @cookie: 'connection cookie' + * + * This attempts to break the connection if and only if 'cookie' matches + * the current transport 'connection cookie'. 
It ensures that we don't + * try to break the connection more than once when we need to retransmit + * a batch of RPC requests. + * + */ +void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie) +{ + /* Don't race with the test_bit() in xprt_clear_locked() */ + spin_lock_bh(&xprt->transport_lock); + if (cookie != xprt->connect_cookie) + goto out; + if (test_bit(XPRT_CLOSING, &xprt->state) || !xprt_connected(xprt)) + goto out; + set_bit(XPRT_CLOSE_WAIT, &xprt->state); + /* Try to schedule an autoclose RPC call */ + if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) + queue_work(rpciod_workqueue, &xprt->task_cleanup); + xprt_wake_pending_tasks(xprt, -EAGAIN); +out: + spin_unlock_bh(&xprt->transport_lock); +} + +static void +xprt_init_autodisconnect(unsigned long data) +{ + struct rpc_xprt *xprt = (struct rpc_xprt *)data; + + spin_lock(&xprt->transport_lock); + if (!list_empty(&xprt->recv) || xprt->shutdown) + goto out_abort; + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) + goto out_abort; + spin_unlock(&xprt->transport_lock); + set_bit(XPRT_CONNECTION_CLOSE, &xprt->state); + queue_work(rpciod_workqueue, &xprt->task_cleanup); + return; +out_abort: + spin_unlock(&xprt->transport_lock); +} + +/** + * xprt_connect - schedule a transport connect operation + * @task: RPC task that is requesting the connect + * + */ +void xprt_connect(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + dprintk("RPC: %5u xprt_connect xprt %p %s connected\n", task->tk_pid, + xprt, (xprt_connected(xprt) ? "is" : "is not")); + + if (!xprt_bound(xprt)) { + task->tk_status = -EAGAIN; + return; + } + if (!xprt_lock_write(xprt, task)) + return; + + if (test_and_clear_bit(XPRT_CLOSE_WAIT, &xprt->state)) + xprt->ops->close(xprt); + + if (xprt_connected(xprt)) + xprt_release_write(xprt, task); + else { + if (task->tk_rqstp) + task->tk_rqstp->rq_bytes_sent = 0; + + task->tk_timeout = task->tk_rqstp->rq_timeout; + rpc_sleep_on(&xprt->pending, task, xprt_connect_status); + + if (test_bit(XPRT_CLOSING, &xprt->state)) + return; + if (xprt_test_and_set_connecting(xprt)) + return; + xprt->stat.connect_start = jiffies; + xprt->ops->connect(task); + } +} + +static void xprt_connect_status(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + if (task->tk_status == 0) { + xprt->stat.connect_count++; + xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; + dprintk("RPC: %5u xprt_connect_status: connection established\n", + task->tk_pid); + return; + } + + switch (task->tk_status) { + case -EAGAIN: + dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid); + break; + case -ETIMEDOUT: + dprintk("RPC: %5u xprt_connect_status: connect attempt timed " + "out\n", task->tk_pid); + break; + default: + dprintk("RPC: %5u xprt_connect_status: error %d connecting to " + "server %s\n", task->tk_pid, -task->tk_status, + task->tk_client->cl_server); + xprt_release_write(xprt, task); + task->tk_status = -EIO; + } +} + +/** + * xprt_lookup_rqst - find an RPC request corresponding to an XID + * @xprt: transport on which the original request was transmitted + * @xid: RPC XID of incoming reply + * + */ +struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) +{ + struct rpc_rqst *entry; + + list_for_each_entry(entry, &xprt->recv, rq_list) + if (entry->rq_xid == xid) + return entry; + + dprintk("RPC: xprt_lookup_rqst did not find xid %08x\n", + ntohl(xid)); + xprt->stat.bad_xids++; + return NULL; +} +EXPORT_SYMBOL_GPL(xprt_lookup_rqst); + +static void 
xprt_update_rtt(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_rtt *rtt = task->tk_client->cl_rtt; + unsigned timer = task->tk_msg.rpc_proc->p_timer; + long m = usecs_to_jiffies(ktime_to_us(req->rq_rtt)); + + if (timer) { + if (req->rq_ntrans == 1) + rpc_update_rtt(rtt, timer, m); + rpc_set_timeo(rtt, timer, req->rq_ntrans - 1); + } +} + +/** + * xprt_complete_rqst - called when reply processing is complete + * @task: RPC request that recently completed + * @copied: actual number of bytes received from the transport + * + * Caller holds transport lock. + */ +void xprt_complete_rqst(struct rpc_task *task, int copied) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + dprintk("RPC: %5u xid %08x complete (%d bytes received)\n", + task->tk_pid, ntohl(req->rq_xid), copied); + + xprt->stat.recvs++; + req->rq_rtt = ktime_sub(ktime_get(), req->rq_xtime); + if (xprt->ops->timer != NULL) + xprt_update_rtt(task); + + list_del_init(&req->rq_list); + req->rq_private_buf.len = copied; + /* Ensure all writes are done before we update */ + /* req->rq_reply_bytes_recvd */ + smp_wmb(); + req->rq_reply_bytes_recvd = copied; + rpc_wake_up_queued_task(&xprt->pending, task); +} +EXPORT_SYMBOL_GPL(xprt_complete_rqst); + +static void xprt_timer(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + if (task->tk_status != -ETIMEDOUT) + return; + dprintk("RPC: %5u xprt_timer\n", task->tk_pid); + + spin_lock_bh(&xprt->transport_lock); + if (!req->rq_reply_bytes_recvd) { + if (xprt->ops->timer) + xprt->ops->timer(task); + } else + task->tk_status = 0; + spin_unlock_bh(&xprt->transport_lock); +} + +static inline int xprt_has_timer(struct rpc_xprt *xprt) +{ + return xprt->idle_timeout != 0; +} + +/** + * xprt_prepare_transmit - reserve the transport before sending a request + * @task: RPC task about to send a request + * + */ +int xprt_prepare_transmit(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + int err = 0; + + dprintk("RPC: %5u xprt_prepare_transmit\n", task->tk_pid); + + spin_lock_bh(&xprt->transport_lock); + if (req->rq_reply_bytes_recvd && !req->rq_bytes_sent) { + err = req->rq_reply_bytes_recvd; + goto out_unlock; + } + if (!xprt->ops->reserve_xprt(task)) + err = -EAGAIN; +out_unlock: + spin_unlock_bh(&xprt->transport_lock); + return err; +} + +void xprt_end_transmit(struct rpc_task *task) +{ + xprt_release_write(task->tk_rqstp->rq_xprt, task); +} + +/** + * xprt_transmit - send an RPC request on a transport + * @task: controlling RPC task + * + * We have to copy the iovec because sendmsg fiddles with its contents. 
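xprt_update_rtt() above only feeds a sample into the client's RTT estimator when the request was transmitted exactly once (rq_ntrans == 1), i.e. Karn's rule for discarding ambiguous samples, and records the retransmission count through rpc_set_timeo() either way. A minimal userspace sketch of a Jacobson-style smoothed-RTT estimator, purely illustrative and not the kernel's rpc_rtt implementation, could be:

#include <stdio.h>

/* Illustrative smoothed-RTT state; the field names are made up. */
struct rtt_est {
        long srtt;      /* smoothed RTT, scaled by 8            */
        long sdrtt;     /* smoothed mean deviation, scaled by 4 */
};

/* Jacobson update: err = m - srtt; srtt += err/8; dev += (|err| - dev)/4 */
static void rtt_sample(struct rtt_est *e, long m)
{
        long err = m - (e->srtt >> 3);

        e->srtt += err;                         /* srtt += err/8 (scaled) */
        if (err < 0)
                err = -err;
        e->sdrtt += err - (e->sdrtt >> 2);      /* dev += (|err|-dev)/4   */
}

/* Retransmit timeout estimate: srtt + 4 * mean deviation. */
static long rtt_rto(const struct rtt_est *e)
{
        return (e->srtt >> 3) + e->sdrtt;
}

int main(void)
{
        struct rtt_est e = { .srtt = 100 << 3, .sdrtt = 20 << 2 };
        long samples[] = { 90, 110, 300, 95 };
        size_t i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                rtt_sample(&e, samples[i]);
                printf("sample %ld -> rto %ld\n", samples[i], rtt_rto(&e));
        }
        return 0;
}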
+ */ +void xprt_transmit(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + int status; + + dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen); + + if (!req->rq_reply_bytes_recvd) { + if (list_empty(&req->rq_list) && rpc_reply_expected(task)) { + /* + * Add to the list only if we're expecting a reply + */ + spin_lock_bh(&xprt->transport_lock); + /* Update the softirq receive buffer */ + memcpy(&req->rq_private_buf, &req->rq_rcv_buf, + sizeof(req->rq_private_buf)); + /* Add request to the receive list */ + list_add_tail(&req->rq_list, &xprt->recv); + spin_unlock_bh(&xprt->transport_lock); + xprt_reset_majortimeo(req); + /* Turn off autodisconnect */ + del_singleshot_timer_sync(&xprt->timer); + } + } else if (!req->rq_bytes_sent) + return; + + req->rq_connect_cookie = xprt->connect_cookie; + req->rq_xtime = ktime_get(); + status = xprt->ops->send_request(task); + if (status != 0) { + task->tk_status = status; + return; + } + + dprintk("RPC: %5u xmit complete\n", task->tk_pid); + task->tk_flags |= RPC_TASK_SENT; + spin_lock_bh(&xprt->transport_lock); + + xprt->ops->set_retrans_timeout(task); + + xprt->stat.sends++; + xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs; + xprt->stat.bklog_u += xprt->backlog.qlen; + + /* Don't race with disconnect */ + if (!xprt_connected(xprt)) + task->tk_status = -ENOTCONN; + else if (!req->rq_reply_bytes_recvd && rpc_reply_expected(task)) { + /* + * Sleep on the pending queue since + * we're expecting a reply. + */ + rpc_sleep_on(&xprt->pending, task, xprt_timer); + } + spin_unlock_bh(&xprt->transport_lock); +} + +static void xprt_alloc_slot(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + task->tk_status = 0; + if (task->tk_rqstp) + return; + if (!list_empty(&xprt->free)) { + struct rpc_rqst *req = list_entry(xprt->free.next, struct rpc_rqst, rq_list); + list_del_init(&req->rq_list); + task->tk_rqstp = req; + xprt_request_init(task, xprt); + return; + } + dprintk("RPC: waiting for request slot\n"); + task->tk_status = -EAGAIN; + task->tk_timeout = 0; + rpc_sleep_on(&xprt->backlog, task, NULL); +} + +static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) +{ + memset(req, 0, sizeof(*req)); /* mark unused */ + + spin_lock(&xprt->reserve_lock); + list_add(&req->rq_list, &xprt->free); + rpc_wake_up_next(&xprt->backlog); + spin_unlock(&xprt->reserve_lock); +} + +struct rpc_xprt *xprt_alloc(struct net *net, int size, int max_req) +{ + struct rpc_xprt *xprt; + + xprt = kzalloc(size, GFP_KERNEL); + if (xprt == NULL) + goto out; + atomic_set(&xprt->count, 1); + + xprt->max_reqs = max_req; + xprt->slot = kcalloc(max_req, sizeof(struct rpc_rqst), GFP_KERNEL); + if (xprt->slot == NULL) + goto out_free; + + xprt->xprt_net = get_net(net); + return xprt; + +out_free: + kfree(xprt); +out: + return NULL; +} +EXPORT_SYMBOL_GPL(xprt_alloc); + +void xprt_free(struct rpc_xprt *xprt) +{ + put_net(xprt->xprt_net); + kfree(xprt->slot); + kfree(xprt); +} +EXPORT_SYMBOL_GPL(xprt_free); + +/** + * xprt_reserve - allocate an RPC request slot + * @task: RPC task requesting a slot allocation + * + * If no more slots are available, place the task on the transport's + * backlog queue. 
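xprt_alloc_slot() and xprt_free_slot() above manage the fixed pool of rpc_rqst slots as a LIFO free list and park callers on the backlog wait queue when the pool runs dry. A toy userspace sketch of the same pattern (made-up slot type, no locking or wait queue):

#include <stdio.h>

/* Toy request slot; only the free-list linkage matters here. */
struct slot {
        struct slot *next;
        int id;
};

/* LIFO free list standing in for xprt->free. */
static struct slot *free_list;

static struct slot *slot_get(void)
{
        struct slot *s = free_list;

        if (!s)                 /* empty: the real code sleeps on xprt->backlog */
                return NULL;
        free_list = s->next;
        return s;
}

static void slot_put(struct slot *s)
{
        s->next = free_list;    /* push back; the real code wakes the next waiter */
        free_list = s;
}

int main(void)
{
        struct slot slots[2] = { { NULL, 0 }, { NULL, 1 } };
        struct slot *a, *b, *c;
        int i;

        for (i = 0; i < 2; i++)         /* populate the pool up front */
                slot_put(&slots[i]);

        a = slot_get();
        b = slot_get();
        c = slot_get();                 /* NULL: caller would be backlogged */
        printf("a=%d b=%d c=%s\n", a->id, b->id, c ? "slot" : "none");
        slot_put(a);
        return 0;
}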
+ */ +void xprt_reserve(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + task->tk_status = -EIO; + spin_lock(&xprt->reserve_lock); + xprt_alloc_slot(task); + spin_unlock(&xprt->reserve_lock); +} + +static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt) +{ + return (__force __be32)xprt->xid++; +} + +static inline void xprt_init_xid(struct rpc_xprt *xprt) +{ + xprt->xid = net_random(); +} + +static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) +{ + struct rpc_rqst *req = task->tk_rqstp; + + req->rq_timeout = task->tk_client->cl_timeout->to_initval; + req->rq_task = task; + req->rq_xprt = xprt; + req->rq_buffer = NULL; + req->rq_xid = xprt_alloc_xid(xprt); + req->rq_release_snd_buf = NULL; + xprt_reset_majortimeo(req); + dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid, + req, ntohl(req->rq_xid)); +} + +/** + * xprt_release - release an RPC request slot + * @task: task which is finished with the slot + * + */ +void xprt_release(struct rpc_task *task) +{ + struct rpc_xprt *xprt; + struct rpc_rqst *req; + + if (!(req = task->tk_rqstp)) + return; + + xprt = req->rq_xprt; + rpc_count_iostats(task); + spin_lock_bh(&xprt->transport_lock); + xprt->ops->release_xprt(xprt, task); + if (xprt->ops->release_request) + xprt->ops->release_request(task); + if (!list_empty(&req->rq_list)) + list_del(&req->rq_list); + xprt->last_used = jiffies; + if (list_empty(&xprt->recv) && xprt_has_timer(xprt)) + mod_timer(&xprt->timer, + xprt->last_used + xprt->idle_timeout); + spin_unlock_bh(&xprt->transport_lock); + if (req->rq_buffer) + xprt->ops->buf_free(req->rq_buffer); + if (req->rq_cred != NULL) + put_rpccred(req->rq_cred); + task->tk_rqstp = NULL; + if (req->rq_release_snd_buf) + req->rq_release_snd_buf(req); + + dprintk("RPC: %5u release request %p\n", task->tk_pid, req); + if (likely(!bc_prealloc(req))) + xprt_free_slot(xprt, req); + else + xprt_free_bc_request(req); +} + +/** + * xprt_create_transport - create an RPC transport + * @args: rpc transport creation arguments + * + */ +struct rpc_xprt *xprt_create_transport(struct xprt_create *args) +{ + struct rpc_xprt *xprt; + struct rpc_rqst *req; + struct xprt_class *t; + + spin_lock(&xprt_list_lock); + list_for_each_entry(t, &xprt_list, list) { + if (t->ident == args->ident) { + spin_unlock(&xprt_list_lock); + goto found; + } + } + spin_unlock(&xprt_list_lock); + printk(KERN_ERR "RPC: transport (%d) not supported\n", args->ident); + return ERR_PTR(-EIO); + +found: + xprt = t->setup(args); + if (IS_ERR(xprt)) { + dprintk("RPC: xprt_create_transport: failed, %ld\n", + -PTR_ERR(xprt)); + return xprt; + } + if (test_and_set_bit(XPRT_INITIALIZED, &xprt->state)) + /* ->setup returned a pre-initialized xprt: */ + return xprt; + + spin_lock_init(&xprt->transport_lock); + spin_lock_init(&xprt->reserve_lock); + + INIT_LIST_HEAD(&xprt->free); + INIT_LIST_HEAD(&xprt->recv); +#if defined(CONFIG_NFS_V4_1) + spin_lock_init(&xprt->bc_pa_lock); + INIT_LIST_HEAD(&xprt->bc_pa_list); +#endif /* CONFIG_NFS_V4_1 */ + + INIT_WORK(&xprt->task_cleanup, xprt_autoclose); + if (xprt_has_timer(xprt)) + setup_timer(&xprt->timer, xprt_init_autodisconnect, + (unsigned long)xprt); + else + init_timer(&xprt->timer); + xprt->last_used = jiffies; + xprt->cwnd = RPC_INITCWND; + xprt->bind_index = 0; + + rpc_init_wait_queue(&xprt->binding, "xprt_binding"); + rpc_init_wait_queue(&xprt->pending, "xprt_pending"); + rpc_init_wait_queue(&xprt->sending, "xprt_sending"); + rpc_init_wait_queue(&xprt->resend, "xprt_resend"); + 
rpc_init_priority_wait_queue(&xprt->backlog, "xprt_backlog"); + + /* initialize free list */ + for (req = &xprt->slot[xprt->max_reqs-1]; req >= &xprt->slot[0]; req--) + list_add(&req->rq_list, &xprt->free); + + xprt_init_xid(xprt); + + dprintk("RPC: created transport %p with %u slots\n", xprt, + xprt->max_reqs); + return xprt; +} + +/** + * xprt_destroy - destroy an RPC transport, killing off all requests. + * @xprt: transport to destroy + * + */ +static void xprt_destroy(struct rpc_xprt *xprt) +{ + dprintk("RPC: destroying transport %p\n", xprt); + xprt->shutdown = 1; + del_timer_sync(&xprt->timer); + + rpc_destroy_wait_queue(&xprt->binding); + rpc_destroy_wait_queue(&xprt->pending); + rpc_destroy_wait_queue(&xprt->sending); + rpc_destroy_wait_queue(&xprt->resend); + rpc_destroy_wait_queue(&xprt->backlog); + cancel_work_sync(&xprt->task_cleanup); + /* + * Tear down transport state and free the rpc_xprt + */ + xprt->ops->destroy(xprt); +} + +/** + * xprt_put - release a reference to an RPC transport. + * @xprt: pointer to the transport + * + */ +void xprt_put(struct rpc_xprt *xprt) +{ + if (atomic_dec_and_test(&xprt->count)) + xprt_destroy(xprt); +} + +/** + * xprt_get - return a reference to an RPC transport. + * @xprt: pointer to the transport + * + */ +struct rpc_xprt *xprt_get(struct rpc_xprt *xprt) +{ + if (atomic_inc_not_zero(&xprt->count)) + return xprt; + return NULL; +} diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile new file mode 100644 index 00000000..5a8f268b --- /dev/null +++ b/net/sunrpc/xprtrdma/Makefile @@ -0,0 +1,8 @@ +obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o + +xprtrdma-y := transport.o rpc_rdma.o verbs.o + +obj-$(CONFIG_SUNRPC_XPRT_RDMA) += svcrdma.o + +svcrdma-y := svc_rdma.o svc_rdma_transport.o \ + svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c new file mode 100644 index 00000000..554d0814 --- /dev/null +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -0,0 +1,882 @@ +/* + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * rpc_rdma.c + * + * This file contains the guts of the RPC RDMA protocol, and + * does marshaling/unmarshaling, etc. It is also where interfacing + * to the Linux RPC framework lives. + */ + +#include "xprt_rdma.h" + +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +enum rpcrdma_chunktype { + rpcrdma_noch = 0, + rpcrdma_readch, + rpcrdma_areadch, + rpcrdma_writech, + rpcrdma_replych +}; + +#ifdef RPC_DEBUG +static const char transfertypes[][12] = { + "pure inline", /* no chunks */ + " read chunk", /* some argument via rdma read */ + "*read chunk", /* entire request via rdma read */ + "write chunk", /* some result via rdma write */ + "reply chunk" /* entire reply via rdma write */ +}; +#endif + +/* + * Chunk assembly from upper layer xdr_buf. + * + * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk + * elements. Segments are then coalesced when registered, if possible + * within the selected memreg mode. + * + * Note, this routine is never called if the connection's memory + * registration strategy is 0 (bounce buffers). + */ + +static int +rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, + enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs) +{ + int len, n = 0, p; + int page_base; + struct page **ppages; + + if (pos == 0 && xdrbuf->head[0].iov_len) { + seg[n].mr_page = NULL; + seg[n].mr_offset = xdrbuf->head[0].iov_base; + seg[n].mr_len = xdrbuf->head[0].iov_len; + ++n; + } + + len = xdrbuf->page_len; + ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); + page_base = xdrbuf->page_base & ~PAGE_MASK; + p = 0; + while (len && n < nsegs) { + seg[n].mr_page = ppages[p]; + seg[n].mr_offset = (void *)(unsigned long) page_base; + seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); + BUG_ON(seg[n].mr_len > PAGE_SIZE); + len -= seg[n].mr_len; + ++n; + ++p; + page_base = 0; /* page offset only applies to first page */ + } + + /* Message overflows the seg array */ + if (len && n == nsegs) + return 0; + + if (xdrbuf->tail[0].iov_len) { + /* the rpcrdma protocol allows us to omit any trailing + * xdr pad bytes, saving the server an RDMA operation. */ + if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) + return n; + if (n == nsegs) + /* Tail remains, but we're out of segments */ + return 0; + seg[n].mr_page = NULL; + seg[n].mr_offset = xdrbuf->tail[0].iov_base; + seg[n].mr_len = xdrbuf->tail[0].iov_len; + ++n; + } + + return n; +} + +/* + * Create read/write chunk lists, and reply chunks, for RDMA + * + * Assume check against THRESHOLD has been done, and chunks are required. + * Assume only encoding one list entry for read|write chunks. The NFSv3 + * protocol is simple enough to allow this as it only has a single "bulk + * result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The + * RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.) 
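rpcrdma_convert_iovs() above flattens the three-part xdr_buf (head iovec, page array, tail iovec) into an array of memory segments, splitting the page range at page boundaries and bailing out when the segment array is too small. A simplified userspace sketch with made-up xbuf/seg types:

#include <stdio.h>
#include <stddef.h>

#define PGSZ 4096u                      /* stand-in for PAGE_SIZE */

/* Simplified three-part buffer, loosely modelled on struct xdr_buf. */
struct xbuf {
        size_t head_len;
        size_t page_base;               /* offset into the first page */
        size_t page_len;
        size_t tail_len;
};

struct seg { size_t off, len; };        /* per-segment offset and length */

/* Returns the number of segments produced, or 0 if nseg is too small. */
static int convert(const struct xbuf *b, struct seg *seg, int nseg)
{
        size_t base = b->page_base % PGSZ;
        size_t left = b->page_len;
        int n = 0;

        if (b->head_len && n < nseg)
                seg[n++] = (struct seg){ 0, b->head_len };

        while (left && n < nseg) {      /* split the page range at page edges */
                size_t chunk = PGSZ - base;

                if (chunk > left)
                        chunk = left;
                seg[n++] = (struct seg){ base, chunk };
                left -= chunk;
                base = 0;               /* only the first page is offset */
        }
        if (left)
                return 0;               /* message overflows the seg array */

        if (b->tail_len) {
                if (n == nseg)
                        return 0;       /* tail remains but no segment left */
                seg[n++] = (struct seg){ 0, b->tail_len };
        }
        return n;
}

int main(void)
{
        struct xbuf b = { .head_len = 128, .page_base = 1000,
                          .page_len = 9000, .tail_len = 4 };
        struct seg s[8];
        int i, n = convert(&b, s, 8);

        for (i = 0; i < n; i++)
                printf("seg %d: off %zu len %zu\n", i, s[i].off, s[i].len);
        return 0;
}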
+ * + * When used for a single reply chunk (which is a special write + * chunk used for the entire reply, rather than just the data), it + * is used primarily for READDIR and READLINK which would otherwise + * be severely size-limited by a small rdma inline read max. The server + * response will come back as an RDMA Write, followed by a message + * of type RDMA_NOMSG carrying the xid and length. As a result, reply + * chunks do not provide data alignment, however they do not require + * "fixup" (moving the response to the upper layer buffer) either. + * + * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): + * + * Read chunklist (a linked list): + * N elements, position P (same P for all chunks of same arg!): + * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0 + * + * Write chunklist (a list of (one) counted array): + * N elements: + * 1 - N - HLOO - HLOO - ... - HLOO - 0 + * + * Reply chunk (a counted array): + * N elements: + * 1 - N - HLOO - HLOO - ... - HLOO + */ + +static unsigned int +rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, + struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type) +{ + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_task->tk_xprt); + int nsegs, nchunks = 0; + unsigned int pos; + struct rpcrdma_mr_seg *seg = req->rl_segments; + struct rpcrdma_read_chunk *cur_rchunk = NULL; + struct rpcrdma_write_array *warray = NULL; + struct rpcrdma_write_chunk *cur_wchunk = NULL; + __be32 *iptr = headerp->rm_body.rm_chunks; + + if (type == rpcrdma_readch || type == rpcrdma_areadch) { + /* a read chunk - server will RDMA Read our memory */ + cur_rchunk = (struct rpcrdma_read_chunk *) iptr; + } else { + /* a write or reply chunk - server will RDMA Write our memory */ + *iptr++ = xdr_zero; /* encode a NULL read chunk list */ + if (type == rpcrdma_replych) + *iptr++ = xdr_zero; /* a NULL write chunk list */ + warray = (struct rpcrdma_write_array *) iptr; + cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1); + } + + if (type == rpcrdma_replych || type == rpcrdma_areadch) + pos = 0; + else + pos = target->head[0].iov_len; + + nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS); + if (nsegs == 0) + return 0; + + do { + /* bind/register the memory, then build chunk from result. */ + int n = rpcrdma_register_external(seg, nsegs, + cur_wchunk != NULL, r_xprt); + if (n <= 0) + goto out; + if (cur_rchunk) { /* read */ + cur_rchunk->rc_discrim = xdr_one; + /* all read chunks have the same "position" */ + cur_rchunk->rc_position = htonl(pos); + cur_rchunk->rc_target.rs_handle = htonl(seg->mr_rkey); + cur_rchunk->rc_target.rs_length = htonl(seg->mr_len); + xdr_encode_hyper( + (__be32 *)&cur_rchunk->rc_target.rs_offset, + seg->mr_base); + dprintk("RPC: %s: read chunk " + "elem %d@0x%llx:0x%x pos %u (%s)\n", __func__, + seg->mr_len, (unsigned long long)seg->mr_base, + seg->mr_rkey, pos, n < nsegs ? "more" : "last"); + cur_rchunk++; + r_xprt->rx_stats.read_chunk_count++; + } else { /* write/reply */ + cur_wchunk->wc_target.rs_handle = htonl(seg->mr_rkey); + cur_wchunk->wc_target.rs_length = htonl(seg->mr_len); + xdr_encode_hyper( + (__be32 *)&cur_wchunk->wc_target.rs_offset, + seg->mr_base); + dprintk("RPC: %s: %s chunk " + "elem %d@0x%llx:0x%x (%s)\n", __func__, + (type == rpcrdma_replych) ? "reply" : "write", + seg->mr_len, (unsigned long long)seg->mr_base, + seg->mr_rkey, n < nsegs ? 
"more" : "last"); + cur_wchunk++; + if (type == rpcrdma_replych) + r_xprt->rx_stats.reply_chunk_count++; + else + r_xprt->rx_stats.write_chunk_count++; + r_xprt->rx_stats.total_rdma_request += seg->mr_len; + } + nchunks++; + seg += n; + nsegs -= n; + } while (nsegs); + + /* success. all failures return above */ + req->rl_nchunks = nchunks; + + BUG_ON(nchunks == 0); + BUG_ON((r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR) + && (nchunks > 3)); + + /* + * finish off header. If write, marshal discrim and nchunks. + */ + if (cur_rchunk) { + iptr = (__be32 *) cur_rchunk; + *iptr++ = xdr_zero; /* finish the read chunk list */ + *iptr++ = xdr_zero; /* encode a NULL write chunk list */ + *iptr++ = xdr_zero; /* encode a NULL reply chunk */ + } else { + warray->wc_discrim = xdr_one; + warray->wc_nchunks = htonl(nchunks); + iptr = (__be32 *) cur_wchunk; + if (type == rpcrdma_writech) { + *iptr++ = xdr_zero; /* finish the write chunk list */ + *iptr++ = xdr_zero; /* encode a NULL reply chunk */ + } + } + + /* + * Return header size. + */ + return (unsigned char *)iptr - (unsigned char *)headerp; + +out: + for (pos = 0; nchunks--;) + pos += rpcrdma_deregister_external( + &req->rl_segments[pos], r_xprt, NULL); + return 0; +} + +/* + * Copy write data inline. + * This function is used for "small" requests. Data which is passed + * to RPC via iovecs (or page list) is copied directly into the + * pre-registered memory buffer for this request. For small amounts + * of data, this is efficient. The cutoff value is tunable. + */ +static int +rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) +{ + int i, npages, curlen; + int copy_len; + unsigned char *srcp, *destp; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + int page_base; + struct page **ppages; + + destp = rqst->rq_svec[0].iov_base; + curlen = rqst->rq_svec[0].iov_len; + destp += curlen; + /* + * Do optional padding where it makes sense. Alignment of write + * payload can help the server, if our setting is accurate. 
+ */ + pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/); + if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH) + pad = 0; /* don't pad this request */ + + dprintk("RPC: %s: pad %d destp 0x%p len %d hdrlen %d\n", + __func__, pad, destp, rqst->rq_slen, curlen); + + copy_len = rqst->rq_snd_buf.page_len; + + if (rqst->rq_snd_buf.tail[0].iov_len) { + curlen = rqst->rq_snd_buf.tail[0].iov_len; + if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) { + memmove(destp + copy_len, + rqst->rq_snd_buf.tail[0].iov_base, curlen); + r_xprt->rx_stats.pullup_copy_count += curlen; + } + dprintk("RPC: %s: tail destp 0x%p len %d\n", + __func__, destp + copy_len, curlen); + rqst->rq_svec[0].iov_len += curlen; + } + r_xprt->rx_stats.pullup_copy_count += copy_len; + + page_base = rqst->rq_snd_buf.page_base; + ppages = rqst->rq_snd_buf.pages + (page_base >> PAGE_SHIFT); + page_base &= ~PAGE_MASK; + npages = PAGE_ALIGN(page_base+copy_len) >> PAGE_SHIFT; + for (i = 0; copy_len && i < npages; i++) { + curlen = PAGE_SIZE - page_base; + if (curlen > copy_len) + curlen = copy_len; + dprintk("RPC: %s: page %d destp 0x%p len %d curlen %d\n", + __func__, i, destp, copy_len, curlen); + srcp = kmap_atomic(ppages[i], KM_SKB_SUNRPC_DATA); + memcpy(destp, srcp+page_base, curlen); + kunmap_atomic(srcp, KM_SKB_SUNRPC_DATA); + rqst->rq_svec[0].iov_len += curlen; + destp += curlen; + copy_len -= curlen; + page_base = 0; + } + /* header now contains entire send message */ + return pad; +} + +/* + * Marshal a request: the primary job of this routine is to choose + * the transfer modes. See comments below. + * + * Uses multiple RDMA IOVs for a request: + * [0] -- RPC RDMA header, which uses memory from the *start* of the + * preregistered buffer that already holds the RPC data in + * its middle. + * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol. + * [2] -- optional padding. + * [3] -- if padded, header only in [1] and data here. + */ + +int +rpcrdma_marshal_req(struct rpc_rqst *rqst) +{ + struct rpc_xprt *xprt = rqst->rq_task->tk_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + char *base; + size_t hdrlen, rpclen, padlen; + enum rpcrdma_chunktype rtype, wtype; + struct rpcrdma_msg *headerp; + + /* + * rpclen gets amount of data in first buffer, which is the + * pre-registered buffer. + */ + base = rqst->rq_svec[0].iov_base; + rpclen = rqst->rq_svec[0].iov_len; + + /* build RDMA header in private area at front */ + headerp = (struct rpcrdma_msg *) req->rl_base; + /* don't htonl XID, it's already done in request */ + headerp->rm_xid = rqst->rq_xid; + headerp->rm_vers = xdr_one; + headerp->rm_credit = htonl(r_xprt->rx_buf.rb_max_requests); + headerp->rm_type = htonl(RDMA_MSG); + + /* + * Chunks needed for results? + * + * o If the expected result is under the inline threshold, all ops + * return as inline (but see later). + * o Large non-read ops return as a single reply chunk. + * o Large read ops return data as write chunk(s), header as inline. + * + * Note: the NFS code sending down multiple result segments implies + * the op is one of read, readdir[plus], readlink or NFSv4 getacl. + */ + + /* + * This code can handle read chunks, write chunks OR reply + * chunks -- only one type. If the request is too big to fit + * inline, then we will choose read chunks. If the request is + * a READ, then use write chunks to separate the file data + * into pages; otherwise use reply chunks. 
+ */ + if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst)) + wtype = rpcrdma_noch; + else if (rqst->rq_rcv_buf.page_len == 0) + wtype = rpcrdma_replych; + else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) + wtype = rpcrdma_writech; + else + wtype = rpcrdma_replych; + + /* + * Chunks needed for arguments? + * + * o If the total request is under the inline threshold, all ops + * are sent as inline. + * o Large non-write ops are sent with the entire message as a + * single read chunk (protocol 0-position special case). + * o Large write ops transmit data as read chunk(s), header as + * inline. + * + * Note: the NFS code sending down multiple argument segments + * implies the op is a write. + * TBD check NFSv4 setacl + */ + if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) + rtype = rpcrdma_noch; + else if (rqst->rq_snd_buf.page_len == 0) + rtype = rpcrdma_areadch; + else + rtype = rpcrdma_readch; + + /* The following simplification is not true forever */ + if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) + wtype = rpcrdma_noch; + BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch); + + if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS && + (rtype != rpcrdma_noch || wtype != rpcrdma_noch)) { + /* forced to "pure inline"? */ + dprintk("RPC: %s: too much data (%d/%d) for inline\n", + __func__, rqst->rq_rcv_buf.len, rqst->rq_snd_buf.len); + return -1; + } + + hdrlen = 28; /*sizeof *headerp;*/ + padlen = 0; + + /* + * Pull up any extra send data into the preregistered buffer. + * When padding is in use and applies to the transfer, insert + * it and change the message type. + */ + if (rtype == rpcrdma_noch) { + + padlen = rpcrdma_inline_pullup(rqst, + RPCRDMA_INLINE_PAD_VALUE(rqst)); + + if (padlen) { + headerp->rm_type = htonl(RDMA_MSGP); + headerp->rm_body.rm_padded.rm_align = + htonl(RPCRDMA_INLINE_PAD_VALUE(rqst)); + headerp->rm_body.rm_padded.rm_thresh = + htonl(RPCRDMA_INLINE_PAD_THRESH); + headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero; + headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; + headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; + hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ + BUG_ON(wtype != rpcrdma_noch); + + } else { + headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero; + headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero; + headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero; + /* new length after pullup */ + rpclen = rqst->rq_svec[0].iov_len; + /* + * Currently we try to not actually use read inline. + * Reply chunks have the desirable property that + * they land, packed, directly in the target buffers + * without headers, so they require no fixup. The + * additional RDMA Write op sends the same amount + * of data, streams on-the-wire and adds no overhead + * on receive. Therefore, we request a reply chunk + * for non-writes wherever feasible and efficient. + */ + if (wtype == rpcrdma_noch && + r_xprt->rx_ia.ri_memreg_strategy > RPCRDMA_REGISTER) + wtype = rpcrdma_replych; + } + } + + /* + * Marshal chunks. This routine will return the header length + * consumed by marshaling. 
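The transfer-mode selection above is a pure function of the send/receive buffer sizes, the page lengths and the XDRBUF_READ flag. Condensed into standalone form (made-up enum and threshold parameters mirroring the branches above):

#include <stdio.h>
#include <stddef.h>

enum chunk { CH_NONE, CH_READ, CH_AREAD, CH_WRITE, CH_REPLY };

/* Reply direction: how will the server return the result? */
static enum chunk pick_wtype(size_t rcv_buflen, size_t rcv_page_len,
                             int is_read_op, size_t inline_read_max)
{
        if (rcv_buflen <= inline_read_max)
                return CH_NONE;         /* small reply: pure inline       */
        if (rcv_page_len == 0)
                return CH_REPLY;        /* large, no pages: reply chunk   */
        return is_read_op ? CH_WRITE    /* READ data lands via RDMA Write */
                          : CH_REPLY;
}

/* Call direction: how will the server fetch the arguments? */
static enum chunk pick_rtype(size_t snd_len, size_t snd_page_len,
                             size_t inline_write_max)
{
        if (snd_len <= inline_write_max)
                return CH_NONE;         /* small call: pure inline           */
        if (snd_page_len == 0)
                return CH_AREAD;        /* whole message as one read chunk   */
        return CH_READ;                 /* page data via read chunk(s)       */
}

int main(void)
{
        printf("wtype=%d rtype=%d\n",
               pick_wtype(128 * 1024, 128 * 1024, 1, 1024),     /* large READ */
               pick_rtype(512, 0, 1024));                       /* small call */
        return 0;
}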
+ */ + if (rtype != rpcrdma_noch) { + hdrlen = rpcrdma_create_chunks(rqst, + &rqst->rq_snd_buf, headerp, rtype); + wtype = rtype; /* simplify dprintk */ + + } else if (wtype != rpcrdma_noch) { + hdrlen = rpcrdma_create_chunks(rqst, + &rqst->rq_rcv_buf, headerp, wtype); + } + + if (hdrlen == 0) + return -1; + + dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" + " headerp 0x%p base 0x%p lkey 0x%x\n", + __func__, transfertypes[wtype], hdrlen, rpclen, padlen, + headerp, base, req->rl_iov.lkey); + + /* + * initialize send_iov's - normally only two: rdma chunk header and + * single preregistered RPC header buffer, but if padding is present, + * then use a preregistered (and zeroed) pad buffer between the RPC + * header and any write data. In all non-rdma cases, any following + * data has been copied into the RPC header buffer. + */ + req->rl_send_iov[0].addr = req->rl_iov.addr; + req->rl_send_iov[0].length = hdrlen; + req->rl_send_iov[0].lkey = req->rl_iov.lkey; + + req->rl_send_iov[1].addr = req->rl_iov.addr + (base - req->rl_base); + req->rl_send_iov[1].length = rpclen; + req->rl_send_iov[1].lkey = req->rl_iov.lkey; + + req->rl_niovs = 2; + + if (padlen) { + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + + req->rl_send_iov[2].addr = ep->rep_pad.addr; + req->rl_send_iov[2].length = padlen; + req->rl_send_iov[2].lkey = ep->rep_pad.lkey; + + req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen; + req->rl_send_iov[3].length = rqst->rq_slen - rpclen; + req->rl_send_iov[3].lkey = req->rl_iov.lkey; + + req->rl_niovs = 4; + } + + return 0; +} + +/* + * Chase down a received write or reply chunklist to get length + * RDMA'd by server. See map at rpcrdma_create_chunks()! :-) + */ +static int +rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp) +{ + unsigned int i, total_len; + struct rpcrdma_write_chunk *cur_wchunk; + + i = ntohl(**iptrp); /* get array count */ + if (i > max) + return -1; + cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); + total_len = 0; + while (i--) { + struct rpcrdma_segment *seg = &cur_wchunk->wc_target; + ifdebug(FACILITY) { + u64 off; + xdr_decode_hyper((__be32 *)&seg->rs_offset, &off); + dprintk("RPC: %s: chunk %d@0x%llx:0x%x\n", + __func__, + ntohl(seg->rs_length), + (unsigned long long)off, + ntohl(seg->rs_handle)); + } + total_len += ntohl(seg->rs_length); + ++cur_wchunk; + } + /* check and adjust for properly terminated write chunk */ + if (wrchunk) { + __be32 *w = (__be32 *) cur_wchunk; + if (*w++ != xdr_zero) + return -1; + cur_wchunk = (struct rpcrdma_write_chunk *) w; + } + if ((char *) cur_wchunk > rep->rr_base + rep->rr_len) + return -1; + + *iptrp = (__be32 *) cur_wchunk; + return total_len; +} + +/* + * Scatter inline received data back into provided iov's. 
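rpcrdma_count_chunks() above walks a counted write array, summing each entry's 32-bit length and checking the element count and reply bounds. A stripped-down sketch of the same walk over raw big-endian words (hypothetical wentry layout):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

/* One write-array entry: handle, length, offset (two words) = HLOO. */
struct wentry { uint32_t handle, length, off_hi, off_lo; };

/* Returns total bytes described, or -1 on a malformed array. */
static long count_write_array(const uint32_t *p, const uint32_t *end,
                              unsigned int max_entries)
{
        uint32_t n = ntohl(*p++);               /* leading element count */
        const struct wentry *e = (const struct wentry *)p;
        long total = 0;

        if (n > max_entries)
                return -1;
        if ((const uint32_t *)(e + n) > end)    /* must fit in the reply */
                return -1;
        while (n--) {
                total += ntohl(e->length);
                e++;
        }
        return total;
}

int main(void)
{
        uint32_t buf[1 + 2 * 4] = {
                htonl(2),                                       /* two entries */
                htonl(0x1), htonl(4096), htonl(0), htonl(0),    /* 4 KiB       */
                htonl(0x2), htonl(512),  htonl(0), htonl(0),    /* 512 B       */
        };

        printf("%ld bytes\n",
               count_write_array(buf, buf + sizeof(buf) / sizeof(buf[0]), 8));
        return 0;
}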
+ */ +static void +rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) +{ + int i, npages, curlen, olen; + char *destp; + struct page **ppages; + int page_base; + + curlen = rqst->rq_rcv_buf.head[0].iov_len; + if (curlen > copy_len) { /* write chunk header fixup */ + curlen = copy_len; + rqst->rq_rcv_buf.head[0].iov_len = curlen; + } + + dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n", + __func__, srcp, copy_len, curlen); + + /* Shift pointer for first receive segment only */ + rqst->rq_rcv_buf.head[0].iov_base = srcp; + srcp += curlen; + copy_len -= curlen; + + olen = copy_len; + i = 0; + rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen; + page_base = rqst->rq_rcv_buf.page_base; + ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT); + page_base &= ~PAGE_MASK; + + if (copy_len && rqst->rq_rcv_buf.page_len) { + npages = PAGE_ALIGN(page_base + + rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT; + for (; i < npages; i++) { + curlen = PAGE_SIZE - page_base; + if (curlen > copy_len) + curlen = copy_len; + dprintk("RPC: %s: page %d" + " srcp 0x%p len %d curlen %d\n", + __func__, i, srcp, copy_len, curlen); + destp = kmap_atomic(ppages[i], KM_SKB_SUNRPC_DATA); + memcpy(destp + page_base, srcp, curlen); + flush_dcache_page(ppages[i]); + kunmap_atomic(destp, KM_SKB_SUNRPC_DATA); + srcp += curlen; + copy_len -= curlen; + if (copy_len == 0) + break; + page_base = 0; + } + rqst->rq_rcv_buf.page_len = olen - copy_len; + } else + rqst->rq_rcv_buf.page_len = 0; + + if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) { + curlen = copy_len; + if (curlen > rqst->rq_rcv_buf.tail[0].iov_len) + curlen = rqst->rq_rcv_buf.tail[0].iov_len; + if (rqst->rq_rcv_buf.tail[0].iov_base != srcp) + memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen); + dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n", + __func__, srcp, copy_len, curlen); + rqst->rq_rcv_buf.tail[0].iov_len = curlen; + copy_len -= curlen; ++i; + } else + rqst->rq_rcv_buf.tail[0].iov_len = 0; + + if (pad) { + /* implicit padding on terminal chunk */ + unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base; + while (pad--) + p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0; + } + + if (copy_len) + dprintk("RPC: %s: %d bytes in" + " %d extra segments (%d lost)\n", + __func__, olen, i, copy_len); + + /* TBD avoid a warning from call_decode() */ + rqst->rq_private_buf = rqst->rq_rcv_buf; +} + +/* + * This function is called when an async event is posted to + * the connection which changes the connection state. All it + * does at this point is mark the connection up/down, the rpc + * timers do the rest. + */ +void +rpcrdma_conn_func(struct rpcrdma_ep *ep) +{ + struct rpc_xprt *xprt = ep->rep_xprt; + + spin_lock_bh(&xprt->transport_lock); + if (++xprt->connect_cookie == 0) /* maintain a reserved value */ + ++xprt->connect_cookie; + if (ep->rep_connected > 0) { + if (!xprt_test_and_set_connected(xprt)) + xprt_wake_pending_tasks(xprt, 0); + } else { + if (xprt_test_and_clear_connected(xprt)) + xprt_wake_pending_tasks(xprt, -ENOTCONN); + } + spin_unlock_bh(&xprt->transport_lock); +} + +/* + * This function is called when memory window unbind which we are waiting + * for completes. Just use rr_func (zeroed by upcall) to signal completion. 
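rpcrdma_inline_fixup() above scatters the inline part of a received reply back into the reply xdr_buf: head iovec first, then page-at-a-time copies, then the tail. A compact sketch of that split, tracking only lengths (no kmap or cache flushing):

#include <stdio.h>
#include <stddef.h>

#define PGSZ 4096u

/* Split copy_len bytes of a contiguous reply across head, pages and tail. */
static void scatter(size_t copy_len, size_t head_len,
                    size_t page_len, size_t tail_len)
{
        size_t cur = copy_len < head_len ? copy_len : head_len;

        printf("head gets %zu\n", cur);
        copy_len -= cur;

        while (copy_len && page_len) {          /* page-at-a-time copies */
                cur = page_len < PGSZ ? page_len : PGSZ;
                if (cur > copy_len)
                        cur = copy_len;
                printf("page gets %zu\n", cur);
                copy_len -= cur;
                page_len -= cur;
        }

        cur = copy_len < tail_len ? copy_len : tail_len;
        printf("tail gets %zu\n", cur);
        copy_len -= cur;

        if (copy_len)                           /* anything left is lost */
                printf("%zu bytes had nowhere to go\n", copy_len);
}

int main(void)
{
        scatter(10000, 512, 8192, 2048);        /* 512 + 4096 + 4096 + 1296 */
        return 0;
}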
+ */ +static void +rpcrdma_unbind_func(struct rpcrdma_rep *rep) +{ + wake_up(&rep->rr_unbind); +} + +/* + * Called as a tasklet to do req/reply match and complete a request + * Errors must result in the RPC task either being awakened, or + * allowed to timeout, to discover the errors at that time. + */ +void +rpcrdma_reply_handler(struct rpcrdma_rep *rep) +{ + struct rpcrdma_msg *headerp; + struct rpcrdma_req *req; + struct rpc_rqst *rqst; + struct rpc_xprt *xprt = rep->rr_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + __be32 *iptr; + int i, rdmalen, status; + + /* Check status. If bad, signal disconnect and return rep to pool */ + if (rep->rr_len == ~0U) { + rpcrdma_recv_buffer_put(rep); + if (r_xprt->rx_ep.rep_connected == 1) { + r_xprt->rx_ep.rep_connected = -EIO; + rpcrdma_conn_func(&r_xprt->rx_ep); + } + return; + } + if (rep->rr_len < 28) { + dprintk("RPC: %s: short/invalid reply\n", __func__); + goto repost; + } + headerp = (struct rpcrdma_msg *) rep->rr_base; + if (headerp->rm_vers != xdr_one) { + dprintk("RPC: %s: invalid version %d\n", + __func__, ntohl(headerp->rm_vers)); + goto repost; + } + + /* Get XID and try for a match. */ + spin_lock(&xprt->transport_lock); + rqst = xprt_lookup_rqst(xprt, headerp->rm_xid); + if (rqst == NULL) { + spin_unlock(&xprt->transport_lock); + dprintk("RPC: %s: reply 0x%p failed " + "to match any request xid 0x%08x len %d\n", + __func__, rep, headerp->rm_xid, rep->rr_len); +repost: + r_xprt->rx_stats.bad_reply_count++; + rep->rr_func = rpcrdma_reply_handler; + if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep)) + rpcrdma_recv_buffer_put(rep); + + return; + } + + /* get request object */ + req = rpcr_to_rdmar(rqst); + + dprintk("RPC: %s: reply 0x%p completes request 0x%p\n" + " RPC request 0x%p xid 0x%08x\n", + __func__, rep, req, rqst, headerp->rm_xid); + + BUG_ON(!req || req->rl_reply); + + /* from here on, the reply is no longer an orphan */ + req->rl_reply = rep; + + /* check for expected message types */ + /* The order of some of these tests is important. 
*/ + switch (headerp->rm_type) { + case htonl(RDMA_MSG): + /* never expect read chunks */ + /* never expect reply chunks (two ways to check) */ + /* never expect write chunks without having offered RDMA */ + if (headerp->rm_body.rm_chunks[0] != xdr_zero || + (headerp->rm_body.rm_chunks[1] == xdr_zero && + headerp->rm_body.rm_chunks[2] != xdr_zero) || + (headerp->rm_body.rm_chunks[1] != xdr_zero && + req->rl_nchunks == 0)) + goto badheader; + if (headerp->rm_body.rm_chunks[1] != xdr_zero) { + /* count any expected write chunks in read reply */ + /* start at write chunk array count */ + iptr = &headerp->rm_body.rm_chunks[2]; + rdmalen = rpcrdma_count_chunks(rep, + req->rl_nchunks, 1, &iptr); + /* check for validity, and no reply chunk after */ + if (rdmalen < 0 || *iptr++ != xdr_zero) + goto badheader; + rep->rr_len -= + ((unsigned char *)iptr - (unsigned char *)headerp); + status = rep->rr_len + rdmalen; + r_xprt->rx_stats.total_rdma_reply += rdmalen; + /* special case - last chunk may omit padding */ + if (rdmalen &= 3) { + rdmalen = 4 - rdmalen; + status += rdmalen; + } + } else { + /* else ordinary inline */ + rdmalen = 0; + iptr = (__be32 *)((unsigned char *)headerp + 28); + rep->rr_len -= 28; /*sizeof *headerp;*/ + status = rep->rr_len; + } + /* Fix up the rpc results for upper layer */ + rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen); + break; + + case htonl(RDMA_NOMSG): + /* never expect read or write chunks, always reply chunks */ + if (headerp->rm_body.rm_chunks[0] != xdr_zero || + headerp->rm_body.rm_chunks[1] != xdr_zero || + headerp->rm_body.rm_chunks[2] != xdr_one || + req->rl_nchunks == 0) + goto badheader; + iptr = (__be32 *)((unsigned char *)headerp + 28); + rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr); + if (rdmalen < 0) + goto badheader; + r_xprt->rx_stats.total_rdma_reply += rdmalen; + /* Reply chunk buffer already is the reply vector - no fixup. */ + status = rdmalen; + break; + +badheader: + default: + dprintk("%s: invalid rpcrdma reply header (type %d):" + " chunks[012] == %d %d %d" + " expected chunks <= %d\n", + __func__, ntohl(headerp->rm_type), + headerp->rm_body.rm_chunks[0], + headerp->rm_body.rm_chunks[1], + headerp->rm_body.rm_chunks[2], + req->rl_nchunks); + status = -EIO; + r_xprt->rx_stats.bad_reply_count++; + break; + } + + /* If using mw bind, start the deregister process now. */ + /* (Note: if mr_free(), cannot perform it here, in tasklet context) */ + if (req->rl_nchunks) switch (r_xprt->rx_ia.ri_memreg_strategy) { + case RPCRDMA_MEMWINDOWS: + for (i = 0; req->rl_nchunks-- > 1;) + i += rpcrdma_deregister_external( + &req->rl_segments[i], r_xprt, NULL); + /* Optionally wait (not here) for unbinds to complete */ + rep->rr_func = rpcrdma_unbind_func; + (void) rpcrdma_deregister_external(&req->rl_segments[i], + r_xprt, rep); + break; + case RPCRDMA_MEMWINDOWS_ASYNC: + for (i = 0; req->rl_nchunks--;) + i += rpcrdma_deregister_external(&req->rl_segments[i], + r_xprt, NULL); + break; + default: + break; + } + + dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", + __func__, xprt, rqst, status); + xprt_complete_rqst(rqst->rq_task, status); + spin_unlock(&xprt->transport_lock); +} diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c new file mode 100644 index 00000000..09af4fab --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. 
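In the RDMA_MSG branch above, `rdmalen &= 3` isolates the residue modulo four of the RDMA'd length and `4 - rdmalen` is the XDR pad the sender was allowed to omit on the final chunk, which is then added back into status. The roundup in isolation:

#include <stdio.h>

/* Bytes of XDR padding needed to round len up to a 4-byte boundary. */
static unsigned int xdr_pad(unsigned int len)
{
        unsigned int resid = len & 3;

        return resid ? 4 - resid : 0;
}

int main(void)
{
        unsigned int lens[] = { 8192, 8193, 8194, 8195 };
        unsigned int i;

        for (i = 0; i < 4; i++)                 /* pads: 0, 3, 2, 1 */
                printf("len %u -> pad %u\n", lens[i], xdr_pad(lens[i]));
        return 0;
}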
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +/* RPC/RDMA parameters */ +unsigned int svcrdma_ord = RPCRDMA_ORD; +static unsigned int min_ord = 1; +static unsigned int max_ord = 4096; +unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS; +static unsigned int min_max_requests = 4; +static unsigned int max_max_requests = 16384; +unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE; +static unsigned int min_max_inline = 4096; +static unsigned int max_max_inline = 65536; + +atomic_t rdma_stat_recv; +atomic_t rdma_stat_read; +atomic_t rdma_stat_write; +atomic_t rdma_stat_sq_starve; +atomic_t rdma_stat_rq_starve; +atomic_t rdma_stat_rq_poll; +atomic_t rdma_stat_rq_prod; +atomic_t rdma_stat_sq_poll; +atomic_t rdma_stat_sq_prod; + +/* Temporary NFS request map and context caches */ +struct kmem_cache *svc_rdma_map_cachep; +struct kmem_cache *svc_rdma_ctxt_cachep; + +struct workqueue_struct *svc_rdma_wq; + +/* + * This function implements reading and resetting an atomic_t stat + * variable through read/write to a proc file. Any write to the file + * resets the associated statistic to zero. Any read returns it's + * current value. 
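The handler described above gives each statistic "write anything to reset, read to sample" semantics. The same contract over a C11 atomic, outside procfs and for illustration only:

#include <stdio.h>
#include <stdatomic.h>

static atomic_int demo_stat = 0;        /* stands in for one rdma_stat_* counter */

/* "read": report the current value without disturbing it. */
static int stat_read(atomic_int *stat)
{
        return atomic_load(stat);
}

/* "write": any write clears the statistic, as the proc handler does. */
static void stat_reset(atomic_int *stat)
{
        atomic_store(stat, 0);
}

int main(void)
{
        atomic_fetch_add(&demo_stat, 3);        /* fast-path increments */
        printf("before reset: %d\n", stat_read(&demo_stat));
        stat_reset(&demo_stat);
        printf("after  reset: %d\n", stat_read(&demo_stat));
        return 0;
}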
+ */ +static int read_reset_stat(ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + atomic_t *stat = (atomic_t *)table->data; + + if (!stat) + return -EINVAL; + + if (write) + atomic_set(stat, 0); + else { + char str_buf[32]; + char *data; + int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat)); + if (len >= 32) + return -EFAULT; + len = strlen(str_buf); + if (*ppos > len) { + *lenp = 0; + return 0; + } + data = &str_buf[*ppos]; + len -= *ppos; + if (len > *lenp) + len = *lenp; + if (len && copy_to_user(buffer, str_buf, len)) + return -EFAULT; + *lenp = len; + *ppos += len; + } + return 0; +} + +static struct ctl_table_header *svcrdma_table_header; +static ctl_table svcrdma_parm_table[] = { + { + .procname = "max_requests", + .data = &svcrdma_max_requests, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_max_requests, + .extra2 = &max_max_requests + }, + { + .procname = "max_req_size", + .data = &svcrdma_max_req_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_max_inline, + .extra2 = &max_max_inline + }, + { + .procname = "max_outbound_read_requests", + .data = &svcrdma_ord, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_ord, + .extra2 = &max_ord, + }, + + { + .procname = "rdma_stat_read", + .data = &rdma_stat_read, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_recv", + .data = &rdma_stat_recv, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_write", + .data = &rdma_stat_write, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_sq_starve", + .data = &rdma_stat_sq_starve, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_rq_starve", + .data = &rdma_stat_rq_starve, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_rq_poll", + .data = &rdma_stat_rq_poll, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_rq_prod", + .data = &rdma_stat_rq_prod, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_sq_poll", + .data = &rdma_stat_sq_poll, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_sq_prod", + .data = &rdma_stat_sq_prod, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { }, +}; + +static ctl_table svcrdma_table[] = { + { + .procname = "svc_rdma", + .mode = 0555, + .child = svcrdma_parm_table + }, + { }, +}; + +static ctl_table svcrdma_root_table[] = { + { + .procname = "sunrpc", + .mode = 0555, + .child = svcrdma_table + }, + { }, +}; + +void svc_rdma_cleanup(void) +{ + dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); + destroy_workqueue(svc_rdma_wq); + if (svcrdma_table_header) { + unregister_sysctl_table(svcrdma_table_header); + svcrdma_table_header = NULL; + } + svc_unreg_xprt_class(&svc_rdma_class); + kmem_cache_destroy(svc_rdma_map_cachep); + kmem_cache_destroy(svc_rdma_ctxt_cachep); +} + +int svc_rdma_init(void) +{ + dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); + 
dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); + dprintk("\tmax_requests : %d\n", svcrdma_max_requests); + dprintk("\tsq_depth : %d\n", + svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT); + dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); + + svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0); + if (!svc_rdma_wq) + return -ENOMEM; + + if (!svcrdma_table_header) + svcrdma_table_header = + register_sysctl_table(svcrdma_root_table); + + /* Create the temporary map cache */ + svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache", + sizeof(struct svc_rdma_req_map), + 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!svc_rdma_map_cachep) { + printk(KERN_INFO "Could not allocate map cache.\n"); + goto err0; + } + + /* Create the temporary context cache */ + svc_rdma_ctxt_cachep = + kmem_cache_create("svc_rdma_ctxt_cache", + sizeof(struct svc_rdma_op_ctxt), + 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!svc_rdma_ctxt_cachep) { + printk(KERN_INFO "Could not allocate WR ctxt cache.\n"); + goto err1; + } + + /* Register RDMA with the SVC transport switch */ + svc_reg_xprt_class(&svc_rdma_class); + return 0; + err1: + kmem_cache_destroy(svc_rdma_map_cachep); + err0: + unregister_sysctl_table(svcrdma_table_header); + destroy_workqueue(svc_rdma_wq); + return -ENOMEM; +} +MODULE_AUTHOR("Tom Tucker "); +MODULE_DESCRIPTION("SVC RDMA Transport"); +MODULE_LICENSE("Dual BSD/GPL"); +module_init(svc_rdma_init); +module_exit(svc_rdma_cleanup); diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c new file mode 100644 index 00000000..9530ef2d --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c @@ -0,0 +1,412 @@ +/* + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker + */ + +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +/* + * Decodes a read chunk list. The expected format is as follows: + * descrim : xdr_one + * position : u32 offset into XDR stream + * handle : u32 RKEY + * . . . + * end-of-list: xdr_zero + */ +static u32 *decode_read_list(u32 *va, u32 *vaend) +{ + struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va; + + while (ch->rc_discrim != xdr_zero) { + u64 ch_offset; + + if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) > + (unsigned long)vaend) { + dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch); + return NULL; + } + + ch->rc_discrim = ntohl(ch->rc_discrim); + ch->rc_position = ntohl(ch->rc_position); + ch->rc_target.rs_handle = ntohl(ch->rc_target.rs_handle); + ch->rc_target.rs_length = ntohl(ch->rc_target.rs_length); + va = (u32 *)&ch->rc_target.rs_offset; + xdr_decode_hyper(va, &ch_offset); + put_unaligned(ch_offset, (u64 *)va); + ch++; + } + return (u32 *)&ch->rc_position; +} + +/* + * Determine number of chunks and total bytes in chunk list. The chunk + * list has already been verified to fit within the RPCRDMA header. + */ +void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch, + int *ch_count, int *byte_count) +{ + /* compute the number of bytes represented by read chunks */ + *byte_count = 0; + *ch_count = 0; + for (; ch->rc_discrim != 0; ch++) { + *byte_count = *byte_count + ch->rc_target.rs_length; + *ch_count = *ch_count + 1; + } +} + +/* + * Decodes a write chunk list. The expected format is as follows: + * descrim : xdr_one + * nchunks : + * handle : u32 RKEY ---+ + * length : u32 | + * offset : remove va + + * . . . 
| + * ---+ + */ +static u32 *decode_write_list(u32 *va, u32 *vaend) +{ + int ch_no; + struct rpcrdma_write_array *ary = + (struct rpcrdma_write_array *)va; + + /* Check for not write-array */ + if (ary->wc_discrim == xdr_zero) + return (u32 *)&ary->wc_nchunks; + + if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > + (unsigned long)vaend) { + dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); + return NULL; + } + ary->wc_discrim = ntohl(ary->wc_discrim); + ary->wc_nchunks = ntohl(ary->wc_nchunks); + if (((unsigned long)&ary->wc_array[0] + + (sizeof(struct rpcrdma_write_chunk) * ary->wc_nchunks)) > + (unsigned long)vaend) { + dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n", + ary, ary->wc_nchunks, vaend); + return NULL; + } + for (ch_no = 0; ch_no < ary->wc_nchunks; ch_no++) { + u64 ch_offset; + + ary->wc_array[ch_no].wc_target.rs_handle = + ntohl(ary->wc_array[ch_no].wc_target.rs_handle); + ary->wc_array[ch_no].wc_target.rs_length = + ntohl(ary->wc_array[ch_no].wc_target.rs_length); + va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset; + xdr_decode_hyper(va, &ch_offset); + put_unaligned(ch_offset, (u64 *)va); + } + + /* + * rs_length is the 2nd 4B field in wc_target and taking its + * address skips the list terminator + */ + return (u32 *)&ary->wc_array[ch_no].wc_target.rs_length; +} + +static u32 *decode_reply_array(u32 *va, u32 *vaend) +{ + int ch_no; + struct rpcrdma_write_array *ary = + (struct rpcrdma_write_array *)va; + + /* Check for no reply-array */ + if (ary->wc_discrim == xdr_zero) + return (u32 *)&ary->wc_nchunks; + + if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > + (unsigned long)vaend) { + dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); + return NULL; + } + ary->wc_discrim = ntohl(ary->wc_discrim); + ary->wc_nchunks = ntohl(ary->wc_nchunks); + if (((unsigned long)&ary->wc_array[0] + + (sizeof(struct rpcrdma_write_chunk) * ary->wc_nchunks)) > + (unsigned long)vaend) { + dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n", + ary, ary->wc_nchunks, vaend); + return NULL; + } + for (ch_no = 0; ch_no < ary->wc_nchunks; ch_no++) { + u64 ch_offset; + + ary->wc_array[ch_no].wc_target.rs_handle = + ntohl(ary->wc_array[ch_no].wc_target.rs_handle); + ary->wc_array[ch_no].wc_target.rs_length = + ntohl(ary->wc_array[ch_no].wc_target.rs_length); + va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset; + xdr_decode_hyper(va, &ch_offset); + put_unaligned(ch_offset, (u64 *)va); + } + + return (u32 *)&ary->wc_array[ch_no]; +} + +int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req, + struct svc_rqst *rqstp) +{ + struct rpcrdma_msg *rmsgp = NULL; + u32 *va; + u32 *vaend; + u32 hdr_len; + + rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; + + /* Verify that there's enough bytes for header + something */ + if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_MIN) { + dprintk("svcrdma: header too short = %d\n", + rqstp->rq_arg.len); + return -EINVAL; + } + + /* Decode the header */ + rmsgp->rm_xid = ntohl(rmsgp->rm_xid); + rmsgp->rm_vers = ntohl(rmsgp->rm_vers); + rmsgp->rm_credit = ntohl(rmsgp->rm_credit); + rmsgp->rm_type = ntohl(rmsgp->rm_type); + + if (rmsgp->rm_vers != RPCRDMA_VERSION) + return -ENOSYS; + + /* Pull in the extra for the padded case and bump our pointer */ + if (rmsgp->rm_type == RDMA_MSGP) { + int hdrlen; + rmsgp->rm_body.rm_padded.rm_align = + ntohl(rmsgp->rm_body.rm_padded.rm_align); + rmsgp->rm_body.rm_padded.rm_thresh = + ntohl(rmsgp->rm_body.rm_padded.rm_thresh); + + va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; + 
rqstp->rq_arg.head[0].iov_base = va; + hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp); + rqstp->rq_arg.head[0].iov_len -= hdrlen; + if (hdrlen > rqstp->rq_arg.len) + return -EINVAL; + return hdrlen; + } + + /* The chunk list may contain either a read chunk list or a write + * chunk list and a reply chunk list. + */ + va = &rmsgp->rm_body.rm_chunks[0]; + vaend = (u32 *)((unsigned long)rmsgp + rqstp->rq_arg.len); + va = decode_read_list(va, vaend); + if (!va) + return -EINVAL; + va = decode_write_list(va, vaend); + if (!va) + return -EINVAL; + va = decode_reply_array(va, vaend); + if (!va) + return -EINVAL; + + rqstp->rq_arg.head[0].iov_base = va; + hdr_len = (unsigned long)va - (unsigned long)rmsgp; + rqstp->rq_arg.head[0].iov_len -= hdr_len; + + *rdma_req = rmsgp; + return hdr_len; +} + +int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *rqstp) +{ + struct rpcrdma_msg *rmsgp = NULL; + struct rpcrdma_read_chunk *ch; + struct rpcrdma_write_array *ary; + u32 *va; + u32 hdrlen; + + dprintk("svcrdma: processing deferred RDMA header on rqstp=%p\n", + rqstp); + rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; + + /* Pull in the extra for the padded case and bump our pointer */ + if (rmsgp->rm_type == RDMA_MSGP) { + va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; + rqstp->rq_arg.head[0].iov_base = va; + hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp); + rqstp->rq_arg.head[0].iov_len -= hdrlen; + return hdrlen; + } + + /* + * Skip all chunks to find RPC msg. These were previously processed + */ + va = &rmsgp->rm_body.rm_chunks[0]; + + /* Skip read-list */ + for (ch = (struct rpcrdma_read_chunk *)va; + ch->rc_discrim != xdr_zero; ch++); + va = (u32 *)&ch->rc_position; + + /* Skip write-list */ + ary = (struct rpcrdma_write_array *)va; + if (ary->wc_discrim == xdr_zero) + va = (u32 *)&ary->wc_nchunks; + else + /* + * rs_length is the 2nd 4B field in wc_target and taking its + * address skips the list terminator + */ + va = (u32 *)&ary->wc_array[ary->wc_nchunks].wc_target.rs_length; + + /* Skip reply-array */ + ary = (struct rpcrdma_write_array *)va; + if (ary->wc_discrim == xdr_zero) + va = (u32 *)&ary->wc_nchunks; + else + va = (u32 *)&ary->wc_array[ary->wc_nchunks]; + + rqstp->rq_arg.head[0].iov_base = va; + hdrlen = (unsigned long)va - (unsigned long)rmsgp; + rqstp->rq_arg.head[0].iov_len -= hdrlen; + + return hdrlen; +} + +int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rmsgp, + enum rpcrdma_errcode err, u32 *va) +{ + u32 *startp = va; + + *va++ = htonl(rmsgp->rm_xid); + *va++ = htonl(rmsgp->rm_vers); + *va++ = htonl(xprt->sc_max_requests); + *va++ = htonl(RDMA_ERROR); + *va++ = htonl(err); + if (err == ERR_VERS) { + *va++ = htonl(RPCRDMA_VERSION); + *va++ = htonl(RPCRDMA_VERSION); + } + + return (int)((unsigned long)va - (unsigned long)startp); +} + +int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp) +{ + struct rpcrdma_write_array *wr_ary; + + /* There is no read-list in a reply */ + + /* skip write list */ + wr_ary = (struct rpcrdma_write_array *) + &rmsgp->rm_body.rm_chunks[1]; + if (wr_ary->wc_discrim) + wr_ary = (struct rpcrdma_write_array *) + &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)]. 
+ wc_target.rs_length; + else + wr_ary = (struct rpcrdma_write_array *) + &wr_ary->wc_nchunks; + + /* skip reply array */ + if (wr_ary->wc_discrim) + wr_ary = (struct rpcrdma_write_array *) + &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)]; + else + wr_ary = (struct rpcrdma_write_array *) + &wr_ary->wc_nchunks; + + return (unsigned long) wr_ary - (unsigned long) rmsgp; +} + +void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks) +{ + struct rpcrdma_write_array *ary; + + /* no read-list */ + rmsgp->rm_body.rm_chunks[0] = xdr_zero; + + /* write-array discrim */ + ary = (struct rpcrdma_write_array *) + &rmsgp->rm_body.rm_chunks[1]; + ary->wc_discrim = xdr_one; + ary->wc_nchunks = htonl(chunks); + + /* write-list terminator */ + ary->wc_array[chunks].wc_target.rs_handle = xdr_zero; + + /* reply-array discriminator */ + ary->wc_array[chunks].wc_target.rs_length = xdr_zero; +} + +void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary, + int chunks) +{ + ary->wc_discrim = xdr_one; + ary->wc_nchunks = htonl(chunks); +} + +void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary, + int chunk_no, + u32 rs_handle, u64 rs_offset, + u32 write_len) +{ + struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target; + seg->rs_handle = htonl(rs_handle); + seg->rs_length = htonl(write_len); + xdr_encode_hyper((u32 *) &seg->rs_offset, rs_offset); +} + +void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rdma_argp, + struct rpcrdma_msg *rdma_resp, + enum rpcrdma_proc rdma_type) +{ + rdma_resp->rm_xid = htonl(rdma_argp->rm_xid); + rdma_resp->rm_vers = htonl(rdma_argp->rm_vers); + rdma_resp->rm_credit = htonl(xprt->sc_max_requests); + rdma_resp->rm_type = htonl(rdma_type); + + /* Encode chunks lists */ + rdma_resp->rm_body.rm_chunks[0] = xdr_zero; + rdma_resp->rm_body.rm_chunks[1] = xdr_zero; + rdma_resp->rm_body.rm_chunks[2] = xdr_zero; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c new file mode 100644 index 00000000..df67211c --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -0,0 +1,685 @@ +/* + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker + */ + +#include +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +/* + * Replace the pages in the rq_argpages array with the pages from the SGE in + * the RDMA_RECV completion. The SGL should contain full pages up until the + * last one. + */ +static void rdma_build_arg_xdr(struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *ctxt, + u32 byte_count) +{ + struct page *page; + u32 bc; + int sge_no; + + /* Swap the page in the SGE with the page in argpages */ + page = ctxt->pages[0]; + put_page(rqstp->rq_pages[0]); + rqstp->rq_pages[0] = page; + + /* Set up the XDR head */ + rqstp->rq_arg.head[0].iov_base = page_address(page); + rqstp->rq_arg.head[0].iov_len = min(byte_count, ctxt->sge[0].length); + rqstp->rq_arg.len = byte_count; + rqstp->rq_arg.buflen = byte_count; + + /* Compute bytes past head in the SGL */ + bc = byte_count - rqstp->rq_arg.head[0].iov_len; + + /* If data remains, store it in the pagelist */ + rqstp->rq_arg.page_len = bc; + rqstp->rq_arg.page_base = 0; + rqstp->rq_arg.pages = &rqstp->rq_pages[1]; + sge_no = 1; + while (bc && sge_no < ctxt->count) { + page = ctxt->pages[sge_no]; + put_page(rqstp->rq_pages[sge_no]); + rqstp->rq_pages[sge_no] = page; + bc -= min(bc, ctxt->sge[sge_no].length); + rqstp->rq_arg.buflen += ctxt->sge[sge_no].length; + sge_no++; + } + rqstp->rq_respages = &rqstp->rq_pages[sge_no]; + + /* We should never run out of SGE because the limit is defined to + * support the max allowed RPC data length + */ + BUG_ON(bc && (sge_no == ctxt->count)); + BUG_ON((rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len) + != byte_count); + BUG_ON(rqstp->rq_arg.len != byte_count); + + /* If not all pages were used from the SGL, free the remaining ones */ + bc = sge_no; + while (sge_no < ctxt->count) { + page = ctxt->pages[sge_no++]; + put_page(page); + } + ctxt->count = bc; + + /* Set up tail */ + rqstp->rq_arg.tail[0].iov_base = NULL; + rqstp->rq_arg.tail[0].iov_len = 0; +} + +/* Encode a read-chunk-list as an array of IB SGE + * + * Assumptions: + * - chunk[0]->position points to pages[0] at an offset of 0 + * - pages[] is not physically or virtually contiguous and consists of + * PAGE_SIZE elements. + * + * Output: + * - sge array pointing into pages[] array. 
+ * - chunk_sge array specifying sge index and count for each + * chunk in the read list + * + */ +static int map_read_chunks(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + struct rpcrdma_msg *rmsgp, + struct svc_rdma_req_map *rpl_map, + struct svc_rdma_req_map *chl_map, + int ch_count, + int byte_count) +{ + int sge_no; + int sge_bytes; + int page_off; + int page_no; + int ch_bytes; + int ch_no; + struct rpcrdma_read_chunk *ch; + + sge_no = 0; + page_no = 0; + page_off = 0; + ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; + ch_no = 0; + ch_bytes = ch->rc_target.rs_length; + head->arg.head[0] = rqstp->rq_arg.head[0]; + head->arg.tail[0] = rqstp->rq_arg.tail[0]; + head->arg.pages = &head->pages[head->count]; + head->hdr_count = head->count; /* save count of hdr pages */ + head->arg.page_base = 0; + head->arg.page_len = ch_bytes; + head->arg.len = rqstp->rq_arg.len + ch_bytes; + head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes; + head->count++; + chl_map->ch[0].start = 0; + while (byte_count) { + rpl_map->sge[sge_no].iov_base = + page_address(rqstp->rq_arg.pages[page_no]) + page_off; + sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes); + rpl_map->sge[sge_no].iov_len = sge_bytes; + /* + * Don't bump head->count here because the same page + * may be used by multiple SGE. + */ + head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; + rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1]; + + byte_count -= sge_bytes; + ch_bytes -= sge_bytes; + sge_no++; + /* + * If all bytes for this chunk have been mapped to an + * SGE, move to the next SGE + */ + if (ch_bytes == 0) { + chl_map->ch[ch_no].count = + sge_no - chl_map->ch[ch_no].start; + ch_no++; + ch++; + chl_map->ch[ch_no].start = sge_no; + ch_bytes = ch->rc_target.rs_length; + /* If bytes remaining account for next chunk */ + if (byte_count) { + head->arg.page_len += ch_bytes; + head->arg.len += ch_bytes; + head->arg.buflen += ch_bytes; + } + } + /* + * If this SGE consumed all of the page, move to the + * next page + */ + if ((sge_bytes + page_off) == PAGE_SIZE) { + page_no++; + page_off = 0; + /* + * If there are still bytes left to map, bump + * the page count + */ + if (byte_count) + head->count++; + } else + page_off += sge_bytes; + } + BUG_ON(byte_count != 0); + return sge_no; +} + +/* Map a read-chunk-list to an XDR and fast register the page-list. + * + * Assumptions: + * - chunk[0] position points to pages[0] at an offset of 0 + * - pages[] will be made physically contiguous by creating a one-off memory + * region using the fastreg verb. + * - byte_count is # of bytes in read-chunk-list + * - ch_count is # of chunks in read-chunk-list + * + * Output: + * - sge array pointing into pages[] array. 
+ * - chunk_sge array specifying sge index and count for each + * chunk in the read list + */ +static int fast_reg_read_chunks(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + struct rpcrdma_msg *rmsgp, + struct svc_rdma_req_map *rpl_map, + struct svc_rdma_req_map *chl_map, + int ch_count, + int byte_count) +{ + int page_no; + int ch_no; + u32 offset; + struct rpcrdma_read_chunk *ch; + struct svc_rdma_fastreg_mr *frmr; + int ret = 0; + + frmr = svc_rdma_get_frmr(xprt); + if (IS_ERR(frmr)) + return -ENOMEM; + + head->frmr = frmr; + head->arg.head[0] = rqstp->rq_arg.head[0]; + head->arg.tail[0] = rqstp->rq_arg.tail[0]; + head->arg.pages = &head->pages[head->count]; + head->hdr_count = head->count; /* save count of hdr pages */ + head->arg.page_base = 0; + head->arg.page_len = byte_count; + head->arg.len = rqstp->rq_arg.len + byte_count; + head->arg.buflen = rqstp->rq_arg.buflen + byte_count; + + /* Fast register the page list */ + frmr->kva = page_address(rqstp->rq_arg.pages[0]); + frmr->direction = DMA_FROM_DEVICE; + frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE); + frmr->map_len = byte_count; + frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT; + for (page_no = 0; page_no < frmr->page_list_len; page_no++) { + frmr->page_list->page_list[page_no] = + ib_dma_map_page(xprt->sc_cm_id->device, + rqstp->rq_arg.pages[page_no], 0, + PAGE_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + frmr->page_list->page_list[page_no])) + goto fatal_err; + atomic_inc(&xprt->sc_dma_used); + head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; + } + head->count += page_no; + + /* rq_respages points one past arg pages */ + rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; + + /* Create the reply and chunk maps */ + offset = 0; + ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; + for (ch_no = 0; ch_no < ch_count; ch_no++) { + rpl_map->sge[ch_no].iov_base = frmr->kva + offset; + rpl_map->sge[ch_no].iov_len = ch->rc_target.rs_length; + chl_map->ch[ch_no].count = 1; + chl_map->ch[ch_no].start = ch_no; + offset += ch->rc_target.rs_length; + ch++; + } + + ret = svc_rdma_fastreg(xprt, frmr); + if (ret) + goto fatal_err; + + return ch_no; + + fatal_err: + printk("svcrdma: error fast registering xdr for xprt %p", xprt); + svc_rdma_put_frmr(xprt, frmr); + return -EIO; +} + +static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt, + struct svc_rdma_op_ctxt *ctxt, + struct svc_rdma_fastreg_mr *frmr, + struct kvec *vec, + u64 *sgl_offset, + int count) +{ + int i; + unsigned long off; + + ctxt->count = count; + ctxt->direction = DMA_FROM_DEVICE; + for (i = 0; i < count; i++) { + ctxt->sge[i].length = 0; /* in case map fails */ + if (!frmr) { + BUG_ON(0 == virt_to_page(vec[i].iov_base)); + off = (unsigned long)vec[i].iov_base & ~PAGE_MASK; + ctxt->sge[i].addr = + ib_dma_map_page(xprt->sc_cm_id->device, + virt_to_page(vec[i].iov_base), + off, + vec[i].iov_len, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + ctxt->sge[i].addr)) + return -EINVAL; + ctxt->sge[i].lkey = xprt->sc_dma_lkey; + atomic_inc(&xprt->sc_dma_used); + } else { + ctxt->sge[i].addr = (unsigned long)vec[i].iov_base; + ctxt->sge[i].lkey = frmr->mr->lkey; + } + ctxt->sge[i].length = vec[i].iov_len; + *sgl_offset = *sgl_offset + vec[i].iov_len; + } + return 0; +} + +static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) +{ + if ((rdma_node_get_transport(xprt->sc_cm_id->device->node_type) == + 
RDMA_TRANSPORT_IWARP) && + sge_count > 1) + return 1; + else + return min_t(int, sge_count, xprt->sc_max_sge); +} + +/* + * Use RDMA_READ to read data from the advertised client buffer into the + * XDR stream starting at rq_arg.head[0].iov_base. + * Each chunk in the array + * contains the following fields: + * discrim - '1', This isn't used for data placement + * position - The xdr stream offset (the same for every chunk) + * handle - RMR for client memory region + * length - data transfer length + * offset - 64 bit tagged offset in remote memory region + * + * On our side, we need to read into a pagelist. The first page immediately + * follows the RPC header. + * + * This function returns: + * 0 - No error and no read-list found. + * + * 1 - Successful read-list processing. The data is not yet in + * the pagelist and therefore the RPC request must be deferred. The + * I/O completion will enqueue the transport again and + * svc_rdma_recvfrom will complete the request. + * + * <0 - Error processing/posting read-list. + * + * NOTE: The ctxt must not be touched after the last WR has been posted + * because the I/O completion processing may occur on another + * processor and free / modify the context. Ne touche pas! + */ +static int rdma_read_xdr(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rmsgp, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *hdr_ctxt) +{ + struct ib_send_wr read_wr; + struct ib_send_wr inv_wr; + int err = 0; + int ch_no; + int ch_count; + int byte_count; + int sge_count; + u64 sgl_offset; + struct rpcrdma_read_chunk *ch; + struct svc_rdma_op_ctxt *ctxt = NULL; + struct svc_rdma_req_map *rpl_map; + struct svc_rdma_req_map *chl_map; + + /* If no read list is present, return 0 */ + ch = svc_rdma_get_read_chunk(rmsgp); + if (!ch) + return 0; + + svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count); + if (ch_count > RPCSVC_MAXPAGES) + return -EINVAL; + + /* Allocate temporary reply and chunk maps */ + rpl_map = svc_rdma_get_req_map(); + chl_map = svc_rdma_get_req_map(); + + if (!xprt->sc_frmr_pg_list_len) + sge_count = map_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp, + rpl_map, chl_map, ch_count, + byte_count); + else + sge_count = fast_reg_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp, + rpl_map, chl_map, ch_count, + byte_count); + if (sge_count < 0) { + err = -EIO; + goto out; + } + + sgl_offset = 0; + ch_no = 0; + + for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; + ch->rc_discrim != 0; ch++, ch_no++) { +next_sge: + ctxt = svc_rdma_get_context(xprt); + ctxt->direction = DMA_FROM_DEVICE; + ctxt->frmr = hdr_ctxt->frmr; + ctxt->read_hdr = NULL; + clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); + + /* Prepare READ WR */ + memset(&read_wr, 0, sizeof read_wr); + read_wr.wr_id = (unsigned long)ctxt; + read_wr.opcode = IB_WR_RDMA_READ; + ctxt->wr_op = read_wr.opcode; + read_wr.send_flags = IB_SEND_SIGNALED; + read_wr.wr.rdma.rkey = ch->rc_target.rs_handle; + read_wr.wr.rdma.remote_addr = + get_unaligned(&(ch->rc_target.rs_offset)) + + sgl_offset; + read_wr.sg_list = ctxt->sge; + read_wr.num_sge = + rdma_read_max_sge(xprt, chl_map->ch[ch_no].count); + err = rdma_set_ctxt_sge(xprt, ctxt, hdr_ctxt->frmr, + &rpl_map->sge[chl_map->ch[ch_no].start], + &sgl_offset, + read_wr.num_sge); + if (err) { + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 0); + goto out; + } + if (((ch+1)->rc_discrim == 0) && + (read_wr.num_sge == chl_map->ch[ch_no].count)) { + /* + * Mark the last RDMA_READ with a bit to + * indicate all 
RPC data has been fetched from + * the client and the RPC needs to be enqueued. + */ + set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + if (hdr_ctxt->frmr) { + set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); + /* + * Invalidate the local MR used to map the data + * sink. + */ + if (xprt->sc_dev_caps & + SVCRDMA_DEVCAP_READ_W_INV) { + read_wr.opcode = + IB_WR_RDMA_READ_WITH_INV; + ctxt->wr_op = read_wr.opcode; + read_wr.ex.invalidate_rkey = + ctxt->frmr->mr->lkey; + } else { + /* Prepare INVALIDATE WR */ + memset(&inv_wr, 0, sizeof inv_wr); + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.send_flags = IB_SEND_SIGNALED; + inv_wr.ex.invalidate_rkey = + hdr_ctxt->frmr->mr->lkey; + read_wr.next = &inv_wr; + } + } + ctxt->read_hdr = hdr_ctxt; + } + /* Post the read */ + err = svc_rdma_send(xprt, &read_wr); + if (err) { + printk(KERN_ERR "svcrdma: Error %d posting RDMA_READ\n", + err); + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 0); + goto out; + } + atomic_inc(&rdma_stat_read); + + if (read_wr.num_sge < chl_map->ch[ch_no].count) { + chl_map->ch[ch_no].count -= read_wr.num_sge; + chl_map->ch[ch_no].start += read_wr.num_sge; + goto next_sge; + } + sgl_offset = 0; + err = 1; + } + + out: + svc_rdma_put_req_map(rpl_map); + svc_rdma_put_req_map(chl_map); + + /* Detach arg pages. svc_recv will replenish them */ + for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++) + rqstp->rq_pages[ch_no] = NULL; + + /* + * Detach res pages. svc_release must see a resused count of + * zero or it will attempt to put them. + */ + while (rqstp->rq_resused) + rqstp->rq_respages[--rqstp->rq_resused] = NULL; + + return err; +} + +static int rdma_read_complete(struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head) +{ + int page_no; + int ret; + + BUG_ON(!head); + + /* Copy RPC pages */ + for (page_no = 0; page_no < head->count; page_no++) { + put_page(rqstp->rq_pages[page_no]); + rqstp->rq_pages[page_no] = head->pages[page_no]; + } + /* Point rq_arg.pages past header */ + rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count]; + rqstp->rq_arg.page_len = head->arg.page_len; + rqstp->rq_arg.page_base = head->arg.page_base; + + /* rq_respages starts after the last arg page */ + rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; + rqstp->rq_resused = 0; + + /* Rebuild rq_arg head and tail. */ + rqstp->rq_arg.head[0] = head->arg.head[0]; + rqstp->rq_arg.tail[0] = head->arg.tail[0]; + rqstp->rq_arg.len = head->arg.len; + rqstp->rq_arg.buflen = head->arg.buflen; + + /* Free the context */ + svc_rdma_put_context(head, 0); + + /* XXX: What should this be? */ + rqstp->rq_prot = IPPROTO_MAX; + svc_xprt_copy_addrs(rqstp, rqstp->rq_xprt); + + ret = rqstp->rq_arg.head[0].iov_len + + rqstp->rq_arg.page_len + + rqstp->rq_arg.tail[0].iov_len; + dprintk("svcrdma: deferred read ret=%d, rq_arg.len =%d, " + "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n", + ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base, + rqstp->rq_arg.head[0].iov_len); + + return ret; +} + +/* + * Set up the rqstp thread context to point to the RQ buffer. If + * necessary, pull additional data from the client with an RDMA_READ + * request. 
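+ *
+ * In outline: a context found on sc_read_complete_q means a previously
+ * posted read-list has finished, so rdma_read_complete() rebuilds rq_arg
+ * from the saved header context and returns its length.  Otherwise a
+ * receive context is taken from sc_rq_dto_q, the RPC/RDMA header is
+ * decoded, and rdma_read_xdr() is called: a positive return means a
+ * read-list was posted and the request is deferred (this function then
+ * returns 0), while no read-list means the message arrived inline and
+ * the assembled rq_arg length is returned to the caller.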
+ */ +int svc_rdma_recvfrom(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt = rqstp->rq_xprt; + struct svcxprt_rdma *rdma_xprt = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + struct svc_rdma_op_ctxt *ctxt = NULL; + struct rpcrdma_msg *rmsgp; + int ret = 0; + int len; + + dprintk("svcrdma: rqstp=%p\n", rqstp); + + spin_lock_bh(&rdma_xprt->sc_rq_dto_lock); + if (!list_empty(&rdma_xprt->sc_read_complete_q)) { + ctxt = list_entry(rdma_xprt->sc_read_complete_q.next, + struct svc_rdma_op_ctxt, + dto_q); + list_del_init(&ctxt->dto_q); + } + if (ctxt) { + spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); + return rdma_read_complete(rqstp, ctxt); + } + + if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { + ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next, + struct svc_rdma_op_ctxt, + dto_q); + list_del_init(&ctxt->dto_q); + } else { + atomic_inc(&rdma_stat_rq_starve); + clear_bit(XPT_DATA, &xprt->xpt_flags); + ctxt = NULL; + } + spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); + if (!ctxt) { + /* This is the EAGAIN path. The svc_recv routine will + * return -EAGAIN, the nfsd thread will go to call into + * svc_recv again and we shouldn't be on the active + * transport list + */ + if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) + goto close_out; + + BUG_ON(ret); + goto out; + } + dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n", + ctxt, rdma_xprt, rqstp, ctxt->wc_status); + BUG_ON(ctxt->wc_status != IB_WC_SUCCESS); + atomic_inc(&rdma_stat_recv); + + /* Build up the XDR from the receive buffers. */ + rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len); + + /* Decode the RDMA header. */ + len = svc_rdma_xdr_decode_req(&rmsgp, rqstp); + rqstp->rq_xprt_hlen = len; + + /* If the request is invalid, reply with an error */ + if (len < 0) { + if (len == -ENOSYS) + svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS); + goto close_out; + } + + /* Read read-list data. */ + ret = rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt); + if (ret > 0) { + /* read-list posted, defer until data received from client. */ + goto defer; + } + if (ret < 0) { + /* Post of read-list failed, free context. */ + svc_rdma_put_context(ctxt, 1); + return 0; + } + + ret = rqstp->rq_arg.head[0].iov_len + + rqstp->rq_arg.page_len + + rqstp->rq_arg.tail[0].iov_len; + svc_rdma_put_context(ctxt, 0); + out: + dprintk("svcrdma: ret = %d, rq_arg.len =%d, " + "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n", + ret, rqstp->rq_arg.len, + rqstp->rq_arg.head[0].iov_base, + rqstp->rq_arg.head[0].iov_len); + rqstp->rq_prot = IPPROTO_MAX; + svc_xprt_copy_addrs(rqstp, xprt); + return ret; + + close_out: + if (ctxt) + svc_rdma_put_context(ctxt, 1); + dprintk("svcrdma: transport %p is closing\n", xprt); + /* + * Set the close bit and enqueue it. svc_recv will see the + * close bit and call svc_xprt_delete + */ + set_bit(XPT_CLOSE, &xprt->xpt_flags); +defer: + return 0; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c new file mode 100644 index 00000000..249a835b --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -0,0 +1,743 @@ +/* + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker + */ + +#include +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +/* Encode an XDR as an array of IB SGE + * + * Assumptions: + * - head[0] is physically contiguous. + * - tail[0] is physically contiguous. + * - pages[] is not physically or virtually contiguous and consists of + * PAGE_SIZE elements. + * + * Output: + * SGE[0] reserved for RCPRDMA header + * SGE[1] data from xdr->head[] + * SGE[2..sge_count-2] data from xdr->pages[] + * SGE[sge_count-1] data from xdr->tail. + * + * The max SGE we need is the length of the XDR / pagesize + one for + * head + one for tail + one for RPCRDMA header. Since RPCSVC_MAXPAGES + * reserves a page for both the request and the reply header, and this + * array is only concerned with the reply we are assured that we have + * on extra page for the RPCRMDA header. + */ +static int fast_reg_xdr(struct svcxprt_rdma *xprt, + struct xdr_buf *xdr, + struct svc_rdma_req_map *vec) +{ + int sge_no; + u32 sge_bytes; + u32 page_bytes; + u32 page_off; + int page_no = 0; + u8 *frva; + struct svc_rdma_fastreg_mr *frmr; + + frmr = svc_rdma_get_frmr(xprt); + if (IS_ERR(frmr)) + return -ENOMEM; + vec->frmr = frmr; + + /* Skip the RPCRDMA header */ + sge_no = 1; + + /* Map the head. 
*/ + frva = (void *)((unsigned long)(xdr->head[0].iov_base) & PAGE_MASK); + vec->sge[sge_no].iov_base = xdr->head[0].iov_base; + vec->sge[sge_no].iov_len = xdr->head[0].iov_len; + vec->count = 2; + sge_no++; + + /* Map the XDR head */ + frmr->kva = frva; + frmr->direction = DMA_TO_DEVICE; + frmr->access_flags = 0; + frmr->map_len = PAGE_SIZE; + frmr->page_list_len = 1; + page_off = (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK; + frmr->page_list->page_list[page_no] = + ib_dma_map_page(xprt->sc_cm_id->device, + virt_to_page(xdr->head[0].iov_base), + page_off, + PAGE_SIZE - page_off, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + frmr->page_list->page_list[page_no])) + goto fatal_err; + atomic_inc(&xprt->sc_dma_used); + + /* Map the XDR page list */ + page_off = xdr->page_base; + page_bytes = xdr->page_len + page_off; + if (!page_bytes) + goto encode_tail; + + /* Map the pages */ + vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off; + vec->sge[sge_no].iov_len = page_bytes; + sge_no++; + while (page_bytes) { + struct page *page; + + page = xdr->pages[page_no++]; + sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off)); + page_bytes -= sge_bytes; + + frmr->page_list->page_list[page_no] = + ib_dma_map_page(xprt->sc_cm_id->device, + page, page_off, + sge_bytes, DMA_TO_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + frmr->page_list->page_list[page_no])) + goto fatal_err; + + atomic_inc(&xprt->sc_dma_used); + page_off = 0; /* reset for next time through loop */ + frmr->map_len += PAGE_SIZE; + frmr->page_list_len++; + } + vec->count++; + + encode_tail: + /* Map tail */ + if (0 == xdr->tail[0].iov_len) + goto done; + + vec->count++; + vec->sge[sge_no].iov_len = xdr->tail[0].iov_len; + + if (((unsigned long)xdr->tail[0].iov_base & PAGE_MASK) == + ((unsigned long)xdr->head[0].iov_base & PAGE_MASK)) { + /* + * If head and tail use the same page, we don't need + * to map it again. 
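+ *
+ * For example (illustrative addresses, assuming 4 KiB pages): a head
+ * at 0x...2f00 and a tail at 0x...2f80 both mask to page 0x...2000
+ * under PAGE_MASK, so the page already registered for the head covers
+ * the tail bytes and only iov_base needs to be set here.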
+ */ + vec->sge[sge_no].iov_base = xdr->tail[0].iov_base; + } else { + void *va; + + /* Map another page for the tail */ + page_off = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK; + va = (void *)((unsigned long)xdr->tail[0].iov_base & PAGE_MASK); + vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off; + + frmr->page_list->page_list[page_no] = + ib_dma_map_page(xprt->sc_cm_id->device, virt_to_page(va), + page_off, + PAGE_SIZE, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + frmr->page_list->page_list[page_no])) + goto fatal_err; + atomic_inc(&xprt->sc_dma_used); + frmr->map_len += PAGE_SIZE; + frmr->page_list_len++; + } + + done: + if (svc_rdma_fastreg(xprt, frmr)) + goto fatal_err; + + return 0; + + fatal_err: + printk("svcrdma: Error fast registering memory for xprt %p\n", xprt); + vec->frmr = NULL; + svc_rdma_put_frmr(xprt, frmr); + return -EIO; +} + +static int map_xdr(struct svcxprt_rdma *xprt, + struct xdr_buf *xdr, + struct svc_rdma_req_map *vec) +{ + int sge_no; + u32 sge_bytes; + u32 page_bytes; + u32 page_off; + int page_no; + + BUG_ON(xdr->len != + (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)); + + if (xprt->sc_frmr_pg_list_len) + return fast_reg_xdr(xprt, xdr, vec); + + /* Skip the first sge, this is for the RPCRDMA header */ + sge_no = 1; + + /* Head SGE */ + vec->sge[sge_no].iov_base = xdr->head[0].iov_base; + vec->sge[sge_no].iov_len = xdr->head[0].iov_len; + sge_no++; + + /* pages SGE */ + page_no = 0; + page_bytes = xdr->page_len; + page_off = xdr->page_base; + while (page_bytes) { + vec->sge[sge_no].iov_base = + page_address(xdr->pages[page_no]) + page_off; + sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off)); + page_bytes -= sge_bytes; + vec->sge[sge_no].iov_len = sge_bytes; + + sge_no++; + page_no++; + page_off = 0; /* reset for next time through loop */ + } + + /* Tail SGE */ + if (xdr->tail[0].iov_len) { + vec->sge[sge_no].iov_base = xdr->tail[0].iov_base; + vec->sge[sge_no].iov_len = xdr->tail[0].iov_len; + sge_no++; + } + + dprintk("svcrdma: map_xdr: sge_no %d page_no %d " + "page_base %u page_len %u head_len %zu tail_len %zu\n", + sge_no, page_no, xdr->page_base, xdr->page_len, + xdr->head[0].iov_len, xdr->tail[0].iov_len); + + vec->count = sge_no; + return 0; +} + +static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt, + struct xdr_buf *xdr, + u32 xdr_off, size_t len, int dir) +{ + struct page *page; + dma_addr_t dma_addr; + if (xdr_off < xdr->head[0].iov_len) { + /* This offset is in the head */ + xdr_off += (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK; + page = virt_to_page(xdr->head[0].iov_base); + } else { + xdr_off -= xdr->head[0].iov_len; + if (xdr_off < xdr->page_len) { + /* This offset is in the page list */ + page = xdr->pages[xdr_off >> PAGE_SHIFT]; + xdr_off &= ~PAGE_MASK; + } else { + /* This offset is in the tail */ + xdr_off -= xdr->page_len; + xdr_off += (unsigned long) + xdr->tail[0].iov_base & ~PAGE_MASK; + page = virt_to_page(xdr->tail[0].iov_base); + } + } + dma_addr = ib_dma_map_page(xprt->sc_cm_id->device, page, xdr_off, + min_t(size_t, PAGE_SIZE, len), dir); + return dma_addr; +} + +/* Assumptions: + * - We are using FRMR + * - or - + * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE + */ +static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, + u32 rmr, u64 to, + u32 xdr_off, int write_len, + struct svc_rdma_req_map *vec) +{ + struct ib_send_wr write_wr; + struct ib_sge *sge; + int xdr_sge_no; + int sge_no; + int sge_bytes; + int 
sge_off; + int bc; + struct svc_rdma_op_ctxt *ctxt; + + BUG_ON(vec->count > RPCSVC_MAXPAGES); + dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, " + "write_len=%d, vec->sge=%p, vec->count=%lu\n", + rmr, (unsigned long long)to, xdr_off, + write_len, vec->sge, vec->count); + + ctxt = svc_rdma_get_context(xprt); + ctxt->direction = DMA_TO_DEVICE; + sge = ctxt->sge; + + /* Find the SGE associated with xdr_off */ + for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < vec->count; + xdr_sge_no++) { + if (vec->sge[xdr_sge_no].iov_len > bc) + break; + bc -= vec->sge[xdr_sge_no].iov_len; + } + + sge_off = bc; + bc = write_len; + sge_no = 0; + + /* Copy the remaining SGE */ + while (bc != 0) { + sge_bytes = min_t(size_t, + bc, vec->sge[xdr_sge_no].iov_len-sge_off); + sge[sge_no].length = sge_bytes; + if (!vec->frmr) { + sge[sge_no].addr = + dma_map_xdr(xprt, &rqstp->rq_res, xdr_off, + sge_bytes, DMA_TO_DEVICE); + xdr_off += sge_bytes; + if (ib_dma_mapping_error(xprt->sc_cm_id->device, + sge[sge_no].addr)) + goto err; + atomic_inc(&xprt->sc_dma_used); + sge[sge_no].lkey = xprt->sc_dma_lkey; + } else { + sge[sge_no].addr = (unsigned long) + vec->sge[xdr_sge_no].iov_base + sge_off; + sge[sge_no].lkey = vec->frmr->mr->lkey; + } + ctxt->count++; + ctxt->frmr = vec->frmr; + sge_off = 0; + sge_no++; + xdr_sge_no++; + BUG_ON(xdr_sge_no > vec->count); + bc -= sge_bytes; + } + + /* Prepare WRITE WR */ + memset(&write_wr, 0, sizeof write_wr); + ctxt->wr_op = IB_WR_RDMA_WRITE; + write_wr.wr_id = (unsigned long)ctxt; + write_wr.sg_list = &sge[0]; + write_wr.num_sge = sge_no; + write_wr.opcode = IB_WR_RDMA_WRITE; + write_wr.send_flags = IB_SEND_SIGNALED; + write_wr.wr.rdma.rkey = rmr; + write_wr.wr.rdma.remote_addr = to; + + /* Post It */ + atomic_inc(&rdma_stat_write); + if (svc_rdma_send(xprt, &write_wr)) + goto err; + return 0; + err: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_frmr(xprt, vec->frmr); + svc_rdma_put_context(ctxt, 0); + /* Fatal error, close transport */ + return -EIO; +} + +static int send_write_chunks(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rdma_argp, + struct rpcrdma_msg *rdma_resp, + struct svc_rqst *rqstp, + struct svc_rdma_req_map *vec) +{ + u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len; + int write_len; + int max_write; + u32 xdr_off; + int chunk_off; + int chunk_no; + struct rpcrdma_write_array *arg_ary; + struct rpcrdma_write_array *res_ary; + int ret; + + arg_ary = svc_rdma_get_write_array(rdma_argp); + if (!arg_ary) + return 0; + res_ary = (struct rpcrdma_write_array *) + &rdma_resp->rm_body.rm_chunks[1]; + + if (vec->frmr) + max_write = vec->frmr->map_len; + else + max_write = xprt->sc_max_sge * PAGE_SIZE; + + /* Write chunks start at the pagelist */ + for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; + xfer_len && chunk_no < arg_ary->wc_nchunks; + chunk_no++) { + struct rpcrdma_segment *arg_ch; + u64 rs_offset; + + arg_ch = &arg_ary->wc_array[chunk_no].wc_target; + write_len = min(xfer_len, arg_ch->rs_length); + + /* Prepare the response chunk given the length actually + * written */ + rs_offset = get_unaligned(&(arg_ch->rs_offset)); + svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no, + arg_ch->rs_handle, + rs_offset, + write_len); + chunk_off = 0; + while (write_len) { + int this_write; + this_write = min(write_len, max_write); + ret = send_write(xprt, rqstp, + arg_ch->rs_handle, + rs_offset + chunk_off, + xdr_off, + this_write, + vec); + if (ret) { + dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", + ret); + return -EIO; + } + 
chunk_off += this_write; + xdr_off += this_write; + xfer_len -= this_write; + write_len -= this_write; + } + } + /* Update the req with the number of chunks actually used */ + svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no); + + return rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len; +} + +static int send_reply_chunks(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rdma_argp, + struct rpcrdma_msg *rdma_resp, + struct svc_rqst *rqstp, + struct svc_rdma_req_map *vec) +{ + u32 xfer_len = rqstp->rq_res.len; + int write_len; + int max_write; + u32 xdr_off; + int chunk_no; + int chunk_off; + struct rpcrdma_segment *ch; + struct rpcrdma_write_array *arg_ary; + struct rpcrdma_write_array *res_ary; + int ret; + + arg_ary = svc_rdma_get_reply_array(rdma_argp); + if (!arg_ary) + return 0; + /* XXX: need to fix when reply lists occur with read-list and or + * write-list */ + res_ary = (struct rpcrdma_write_array *) + &rdma_resp->rm_body.rm_chunks[2]; + + if (vec->frmr) + max_write = vec->frmr->map_len; + else + max_write = xprt->sc_max_sge * PAGE_SIZE; + + /* xdr offset starts at RPC message */ + for (xdr_off = 0, chunk_no = 0; + xfer_len && chunk_no < arg_ary->wc_nchunks; + chunk_no++) { + u64 rs_offset; + ch = &arg_ary->wc_array[chunk_no].wc_target; + write_len = min(xfer_len, ch->rs_length); + + /* Prepare the reply chunk given the length actually + * written */ + rs_offset = get_unaligned(&(ch->rs_offset)); + svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no, + ch->rs_handle, rs_offset, + write_len); + chunk_off = 0; + while (write_len) { + int this_write; + + this_write = min(write_len, max_write); + ret = send_write(xprt, rqstp, + ch->rs_handle, + rs_offset + chunk_off, + xdr_off, + this_write, + vec); + if (ret) { + dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", + ret); + return -EIO; + } + chunk_off += this_write; + xdr_off += this_write; + xfer_len -= this_write; + write_len -= this_write; + } + } + /* Update the req with the number of chunks actually used */ + svc_rdma_xdr_encode_reply_array(res_ary, chunk_no); + + return rqstp->rq_res.len; +} + +/* This function prepares the portion of the RPCRDMA message to be + * sent in the RDMA_SEND. This function is called after data sent via + * RDMA has already been transmitted. There are three cases: + * - The RPCRDMA header, RPC header, and payload are all sent in a + * single RDMA_SEND. This is the "inline" case. + * - The RPCRDMA header and some portion of the RPC header and data + * are sent via this RDMA_SEND and another portion of the data is + * sent via RDMA. + * - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC + * header and data are all transmitted via RDMA. + * In all three cases, this function prepares the RPCRDMA header in + * sge[0], the 'type' parameter indicates the type to place in the + * RPCRDMA header, and the 'byte_count' field indicates how much of + * the XDR to include in this RDMA_SEND. NB: The offset of the payload + * to send is zero in the XDR. + */ +static int send_reply(struct svcxprt_rdma *rdma, + struct svc_rqst *rqstp, + struct page *page, + struct rpcrdma_msg *rdma_resp, + struct svc_rdma_op_ctxt *ctxt, + struct svc_rdma_req_map *vec, + int byte_count) +{ + struct ib_send_wr send_wr; + struct ib_send_wr inv_wr; + int sge_no; + int sge_bytes; + int page_no; + int ret; + + /* Post a recv buffer to handle another request. */ + ret = svc_rdma_post_recv(rdma); + if (ret) { + printk(KERN_INFO + "svcrdma: could not post a receive buffer, err=%d." 
+ "Closing transport %p.\n", ret, rdma); + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + svc_rdma_put_frmr(rdma, vec->frmr); + svc_rdma_put_context(ctxt, 0); + return -ENOTCONN; + } + + /* Prepare the context */ + ctxt->pages[0] = page; + ctxt->count = 1; + ctxt->frmr = vec->frmr; + if (vec->frmr) + set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); + else + clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); + + /* Prepare the SGE for the RPCRDMA Header */ + ctxt->sge[0].lkey = rdma->sc_dma_lkey; + ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); + ctxt->sge[0].addr = + ib_dma_map_page(rdma->sc_cm_id->device, page, 0, + ctxt->sge[0].length, DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) + goto err; + atomic_inc(&rdma->sc_dma_used); + + ctxt->direction = DMA_TO_DEVICE; + + /* Map the payload indicated by 'byte_count' */ + for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { + int xdr_off = 0; + sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); + byte_count -= sge_bytes; + if (!vec->frmr) { + ctxt->sge[sge_no].addr = + dma_map_xdr(rdma, &rqstp->rq_res, xdr_off, + sge_bytes, DMA_TO_DEVICE); + xdr_off += sge_bytes; + if (ib_dma_mapping_error(rdma->sc_cm_id->device, + ctxt->sge[sge_no].addr)) + goto err; + atomic_inc(&rdma->sc_dma_used); + ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; + } else { + ctxt->sge[sge_no].addr = (unsigned long) + vec->sge[sge_no].iov_base; + ctxt->sge[sge_no].lkey = vec->frmr->mr->lkey; + } + ctxt->sge[sge_no].length = sge_bytes; + } + BUG_ON(byte_count != 0); + + /* Save all respages in the ctxt and remove them from the + * respages array. They are our pages until the I/O + * completes. + */ + for (page_no = 0; page_no < rqstp->rq_resused; page_no++) { + ctxt->pages[page_no+1] = rqstp->rq_respages[page_no]; + ctxt->count++; + rqstp->rq_respages[page_no] = NULL; + /* + * If there are more pages than SGE, terminate SGE + * list so that svc_rdma_unmap_dma doesn't attempt to + * unmap garbage. + */ + if (page_no+1 >= sge_no) + ctxt->sge[page_no+1].length = 0; + } + BUG_ON(sge_no > rdma->sc_max_sge); + memset(&send_wr, 0, sizeof send_wr); + ctxt->wr_op = IB_WR_SEND; + send_wr.wr_id = (unsigned long)ctxt; + send_wr.sg_list = ctxt->sge; + send_wr.num_sge = sge_no; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; + if (vec->frmr) { + /* Prepare INVALIDATE WR */ + memset(&inv_wr, 0, sizeof inv_wr); + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.send_flags = IB_SEND_SIGNALED; + inv_wr.ex.invalidate_rkey = + vec->frmr->mr->lkey; + send_wr.next = &inv_wr; + } + + ret = svc_rdma_send(rdma, &send_wr); + if (ret) + goto err; + + return 0; + + err: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_frmr(rdma, vec->frmr); + svc_rdma_put_context(ctxt, 1); + return -EIO; +} + +void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) +{ +} + +/* + * Return the start of an xdr buffer. 
+ */ +static void *xdr_start(struct xdr_buf *xdr) +{ + return xdr->head[0].iov_base - + (xdr->len - + xdr->page_len - + xdr->tail[0].iov_len - + xdr->head[0].iov_len); +} + +int svc_rdma_sendto(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt = rqstp->rq_xprt; + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + struct rpcrdma_msg *rdma_argp; + struct rpcrdma_msg *rdma_resp; + struct rpcrdma_write_array *reply_ary; + enum rpcrdma_proc reply_type; + int ret; + int inline_bytes; + struct page *res_page; + struct svc_rdma_op_ctxt *ctxt; + struct svc_rdma_req_map *vec; + + dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); + + /* Get the RDMA request header. */ + rdma_argp = xdr_start(&rqstp->rq_arg); + + /* Build an req vec for the XDR */ + ctxt = svc_rdma_get_context(rdma); + ctxt->direction = DMA_TO_DEVICE; + vec = svc_rdma_get_req_map(); + ret = map_xdr(rdma, &rqstp->rq_res, vec); + if (ret) + goto err0; + inline_bytes = rqstp->rq_res.len; + + /* Create the RDMA response header */ + res_page = svc_rdma_get_page(); + rdma_resp = page_address(res_page); + reply_ary = svc_rdma_get_reply_array(rdma_argp); + if (reply_ary) + reply_type = RDMA_NOMSG; + else + reply_type = RDMA_MSG; + svc_rdma_xdr_encode_reply_header(rdma, rdma_argp, + rdma_resp, reply_type); + + /* Send any write-chunk data and build resp write-list */ + ret = send_write_chunks(rdma, rdma_argp, rdma_resp, + rqstp, vec); + if (ret < 0) { + printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n", + ret); + goto err1; + } + inline_bytes -= ret; + + /* Send any reply-list data and update resp reply-list */ + ret = send_reply_chunks(rdma, rdma_argp, rdma_resp, + rqstp, vec); + if (ret < 0) { + printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n", + ret); + goto err1; + } + inline_bytes -= ret; + + ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec, + inline_bytes); + svc_rdma_put_req_map(vec); + dprintk("svcrdma: send_reply returns %d\n", ret); + return ret; + + err1: + put_page(res_page); + err0: + svc_rdma_put_req_map(vec); + svc_rdma_put_context(ctxt, 0); + return ret; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c new file mode 100644 index 00000000..c3c232a8 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -0,0 +1,1363 @@ +/* + * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, + struct net *net, + struct sockaddr *sa, int salen, + int flags); +static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt); +static void svc_rdma_release_rqst(struct svc_rqst *); +static void dto_tasklet_func(unsigned long data); +static void svc_rdma_detach(struct svc_xprt *xprt); +static void svc_rdma_free(struct svc_xprt *xprt); +static int svc_rdma_has_wspace(struct svc_xprt *xprt); +static void rq_cq_reap(struct svcxprt_rdma *xprt); +static void sq_cq_reap(struct svcxprt_rdma *xprt); + +static DECLARE_TASKLET(dto_tasklet, dto_tasklet_func, 0UL); +static DEFINE_SPINLOCK(dto_lock); +static LIST_HEAD(dto_xprt_q); + +static struct svc_xprt_ops svc_rdma_ops = { + .xpo_create = svc_rdma_create, + .xpo_recvfrom = svc_rdma_recvfrom, + .xpo_sendto = svc_rdma_sendto, + .xpo_release_rqst = svc_rdma_release_rqst, + .xpo_detach = svc_rdma_detach, + .xpo_free = svc_rdma_free, + .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr, + .xpo_has_wspace = svc_rdma_has_wspace, + .xpo_accept = svc_rdma_accept, +}; + +struct svc_xprt_class svc_rdma_class = { + .xcl_name = "rdma", + .xcl_owner = THIS_MODULE, + .xcl_ops = &svc_rdma_ops, + .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, +}; + +/* WR context cache. Created in svc_rdma.c */ +extern struct kmem_cache *svc_rdma_ctxt_cachep; + +/* Workqueue created in svc_rdma.c */ +extern struct workqueue_struct *svc_rdma_wq; + +struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_op_ctxt *ctxt; + + while (1) { + ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, GFP_KERNEL); + if (ctxt) + break; + schedule_timeout_uninterruptible(msecs_to_jiffies(500)); + } + ctxt->xprt = xprt; + INIT_LIST_HEAD(&ctxt->dto_q); + ctxt->count = 0; + ctxt->frmr = NULL; + atomic_inc(&xprt->sc_ctxt_used); + return ctxt; +} + +void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) +{ + struct svcxprt_rdma *xprt = ctxt->xprt; + int i; + for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) { + /* + * Unmap the DMA addr in the SGE if the lkey matches + * the sc_dma_lkey, otherwise, ignore it since it is + * an FRMR lkey and will be unmapped later when the + * last WR that uses it completes. 
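+ *
+ * SGEs built against an FRMR (see rdma_set_ctxt_sge and send_write)
+ * carry frmr->mr->lkey rather than sc_dma_lkey; the pages behind them
+ * are instead unmapped page-by-page in frmr_unmap_dma().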
+ */ + if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) { + atomic_dec(&xprt->sc_dma_used); + ib_dma_unmap_page(xprt->sc_cm_id->device, + ctxt->sge[i].addr, + ctxt->sge[i].length, + ctxt->direction); + } + } +} + +void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) +{ + struct svcxprt_rdma *xprt; + int i; + + BUG_ON(!ctxt); + xprt = ctxt->xprt; + if (free_pages) + for (i = 0; i < ctxt->count; i++) + put_page(ctxt->pages[i]); + + kmem_cache_free(svc_rdma_ctxt_cachep, ctxt); + atomic_dec(&xprt->sc_ctxt_used); +} + +/* Temporary NFS request map cache. Created in svc_rdma.c */ +extern struct kmem_cache *svc_rdma_map_cachep; + +/* + * Temporary NFS req mappings are shared across all transport + * instances. These are short lived and should be bounded by the number + * of concurrent server threads * depth of the SQ. + */ +struct svc_rdma_req_map *svc_rdma_get_req_map(void) +{ + struct svc_rdma_req_map *map; + while (1) { + map = kmem_cache_alloc(svc_rdma_map_cachep, GFP_KERNEL); + if (map) + break; + schedule_timeout_uninterruptible(msecs_to_jiffies(500)); + } + map->count = 0; + map->frmr = NULL; + return map; +} + +void svc_rdma_put_req_map(struct svc_rdma_req_map *map) +{ + kmem_cache_free(svc_rdma_map_cachep, map); +} + +/* ib_cq event handler */ +static void cq_event_handler(struct ib_event *event, void *context) +{ + struct svc_xprt *xprt = context; + dprintk("svcrdma: received CQ event id=%d, context=%p\n", + event->event, context); + set_bit(XPT_CLOSE, &xprt->xpt_flags); +} + +/* QP event handler */ +static void qp_event_handler(struct ib_event *event, void *context) +{ + struct svc_xprt *xprt = context; + + switch (event->event) { + /* These are considered benign events */ + case IB_EVENT_PATH_MIG: + case IB_EVENT_COMM_EST: + case IB_EVENT_SQ_DRAINED: + case IB_EVENT_QP_LAST_WQE_REACHED: + dprintk("svcrdma: QP event %d received for QP=%p\n", + event->event, event->element.qp); + break; + /* These are considered fatal events */ + case IB_EVENT_PATH_MIG_ERR: + case IB_EVENT_QP_FATAL: + case IB_EVENT_QP_REQ_ERR: + case IB_EVENT_QP_ACCESS_ERR: + case IB_EVENT_DEVICE_FATAL: + default: + dprintk("svcrdma: QP ERROR event %d received for QP=%p, " + "closing transport\n", + event->event, event->element.qp); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + break; + } +} + +/* + * Data Transfer Operation Tasklet + * + * Walks a list of transports with I/O pending, removing entries as + * they are added to the server's I/O pending list. Two bits indicate + * if SQ, RQ, or both have I/O pending. The dto_lock is an irqsave + * spinlock that serializes access to the transport list with the RQ + * and SQ interrupt handlers. 
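+ *
+ * The interrupt-context completion handlers (rq_comp_handler and
+ * sq_comp_handler) only set the pending bit, take a transport
+ * reference and link the transport onto dto_xprt_q; this tasklet
+ * drops dto_lock again before reaping either CQ, so completion
+ * processing itself never runs under the irqsave lock.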
+ */ +static void dto_tasklet_func(unsigned long data) +{ + struct svcxprt_rdma *xprt; + unsigned long flags; + + spin_lock_irqsave(&dto_lock, flags); + while (!list_empty(&dto_xprt_q)) { + xprt = list_entry(dto_xprt_q.next, + struct svcxprt_rdma, sc_dto_q); + list_del_init(&xprt->sc_dto_q); + spin_unlock_irqrestore(&dto_lock, flags); + + rq_cq_reap(xprt); + sq_cq_reap(xprt); + + svc_xprt_put(&xprt->sc_xprt); + spin_lock_irqsave(&dto_lock, flags); + } + spin_unlock_irqrestore(&dto_lock, flags); +} + +/* + * Receive Queue Completion Handler + * + * Since an RQ completion handler is called on interrupt context, we + * need to defer the handling of the I/O to a tasklet + */ +static void rq_comp_handler(struct ib_cq *cq, void *cq_context) +{ + struct svcxprt_rdma *xprt = cq_context; + unsigned long flags; + + /* Guard against unconditional flush call for destroyed QP */ + if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0) + return; + + /* + * Set the bit regardless of whether or not it's on the list + * because it may be on the list already due to an SQ + * completion. + */ + set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags); + + /* + * If this transport is not already on the DTO transport queue, + * add it + */ + spin_lock_irqsave(&dto_lock, flags); + if (list_empty(&xprt->sc_dto_q)) { + svc_xprt_get(&xprt->sc_xprt); + list_add_tail(&xprt->sc_dto_q, &dto_xprt_q); + } + spin_unlock_irqrestore(&dto_lock, flags); + + /* Tasklet does all the work to avoid irqsave locks. */ + tasklet_schedule(&dto_tasklet); +} + +/* + * rq_cq_reap - Process the RQ CQ. + * + * Take all completing WC off the CQE and enqueue the associated DTO + * context on the dto_q for the transport. + * + * Note that caller must hold a transport reference. + */ +static void rq_cq_reap(struct svcxprt_rdma *xprt) +{ + int ret; + struct ib_wc wc; + struct svc_rdma_op_ctxt *ctxt = NULL; + + if (!test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags)) + return; + + ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP); + atomic_inc(&rdma_stat_rq_poll); + + while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) { + ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; + ctxt->wc_status = wc.status; + ctxt->byte_len = wc.byte_len; + svc_rdma_unmap_dma(ctxt); + if (wc.status != IB_WC_SUCCESS) { + /* Close the transport */ + dprintk("svcrdma: transport closing putting ctxt %p\n", ctxt); + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + svc_rdma_put_context(ctxt, 1); + svc_xprt_put(&xprt->sc_xprt); + continue; + } + spin_lock_bh(&xprt->sc_rq_dto_lock); + list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q); + spin_unlock_bh(&xprt->sc_rq_dto_lock); + svc_xprt_put(&xprt->sc_xprt); + } + + if (ctxt) + atomic_inc(&rdma_stat_rq_prod); + + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + /* + * If data arrived before established event, + * don't enqueue. This defers RPC I/O until the + * RDMA connection is complete. 
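+ *
+ * RDMAXPRT_CONN_PENDING is cleared by rdma_cma_handler() when
+ * RDMA_CM_EVENT_ESTABLISHED arrives, and that handler enqueues the
+ * transport itself, so data received during connection setup is
+ * picked up once the connection is established.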
+ */ + if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags)) + svc_xprt_enqueue(&xprt->sc_xprt); +} + +/* + * Process a completion context + */ +static void process_context(struct svcxprt_rdma *xprt, + struct svc_rdma_op_ctxt *ctxt) +{ + svc_rdma_unmap_dma(ctxt); + + switch (ctxt->wr_op) { + case IB_WR_SEND: + if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags)) + svc_rdma_put_frmr(xprt, ctxt->frmr); + svc_rdma_put_context(ctxt, 1); + break; + + case IB_WR_RDMA_WRITE: + svc_rdma_put_context(ctxt, 0); + break; + + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { + struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; + BUG_ON(!read_hdr); + if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags)) + svc_rdma_put_frmr(xprt, ctxt->frmr); + spin_lock_bh(&xprt->sc_rq_dto_lock); + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + list_add_tail(&read_hdr->dto_q, + &xprt->sc_read_complete_q); + spin_unlock_bh(&xprt->sc_rq_dto_lock); + svc_xprt_enqueue(&xprt->sc_xprt); + } + svc_rdma_put_context(ctxt, 0); + break; + + default: + printk(KERN_ERR "svcrdma: unexpected completion type, " + "opcode=%d\n", + ctxt->wr_op); + break; + } +} + +/* + * Send Queue Completion Handler - potentially called on interrupt context. + * + * Note that caller must hold a transport reference. + */ +static void sq_cq_reap(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_op_ctxt *ctxt = NULL; + struct ib_wc wc; + struct ib_cq *cq = xprt->sc_sq_cq; + int ret; + + if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) + return; + + ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); + atomic_inc(&rdma_stat_sq_poll); + while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { + if (wc.status != IB_WC_SUCCESS) + /* Close the transport */ + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + + /* Decrement used SQ WR count */ + atomic_dec(&xprt->sc_sq_count); + wake_up(&xprt->sc_send_wait); + + ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; + if (ctxt) + process_context(xprt, ctxt); + + svc_xprt_put(&xprt->sc_xprt); + } + + if (ctxt) + atomic_inc(&rdma_stat_sq_prod); +} + +static void sq_comp_handler(struct ib_cq *cq, void *cq_context) +{ + struct svcxprt_rdma *xprt = cq_context; + unsigned long flags; + + /* Guard against unconditional flush call for destroyed QP */ + if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0) + return; + + /* + * Set the bit regardless of whether or not it's on the list + * because it may be on the list already due to an RQ + * completion. + */ + set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags); + + /* + * If this transport is not already on the DTO transport queue, + * add it + */ + spin_lock_irqsave(&dto_lock, flags); + if (list_empty(&xprt->sc_dto_q)) { + svc_xprt_get(&xprt->sc_xprt); + list_add_tail(&xprt->sc_dto_q, &dto_xprt_q); + } + spin_unlock_irqrestore(&dto_lock, flags); + + /* Tasklet does all the work to avoid irqsave locks. 
*/ + tasklet_schedule(&dto_tasklet); +} + +static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, + int listener) +{ + struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL); + + if (!cma_xprt) + return NULL; + svc_xprt_init(&svc_rdma_class, &cma_xprt->sc_xprt, serv); + INIT_LIST_HEAD(&cma_xprt->sc_accept_q); + INIT_LIST_HEAD(&cma_xprt->sc_dto_q); + INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); + INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); + INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); + init_waitqueue_head(&cma_xprt->sc_send_wait); + + spin_lock_init(&cma_xprt->sc_lock); + spin_lock_init(&cma_xprt->sc_rq_dto_lock); + spin_lock_init(&cma_xprt->sc_frmr_q_lock); + + cma_xprt->sc_ord = svcrdma_ord; + + cma_xprt->sc_max_req_size = svcrdma_max_req_size; + cma_xprt->sc_max_requests = svcrdma_max_requests; + cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT; + atomic_set(&cma_xprt->sc_sq_count, 0); + atomic_set(&cma_xprt->sc_ctxt_used, 0); + + if (listener) + set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); + + return cma_xprt; +} + +struct page *svc_rdma_get_page(void) +{ + struct page *page; + + while ((page = alloc_page(GFP_KERNEL)) == NULL) { + /* If we can't get memory, wait a bit and try again */ + printk(KERN_INFO "svcrdma: out of memory...retrying in 1000 " + "jiffies.\n"); + schedule_timeout_uninterruptible(msecs_to_jiffies(1000)); + } + return page; +} + +int svc_rdma_post_recv(struct svcxprt_rdma *xprt) +{ + struct ib_recv_wr recv_wr, *bad_recv_wr; + struct svc_rdma_op_ctxt *ctxt; + struct page *page; + dma_addr_t pa; + int sge_no; + int buflen; + int ret; + + ctxt = svc_rdma_get_context(xprt); + buflen = 0; + ctxt->direction = DMA_FROM_DEVICE; + for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) { + BUG_ON(sge_no >= xprt->sc_max_sge); + page = svc_rdma_get_page(); + ctxt->pages[sge_no] = page; + pa = ib_dma_map_page(xprt->sc_cm_id->device, + page, 0, PAGE_SIZE, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa)) + goto err_put_ctxt; + atomic_inc(&xprt->sc_dma_used); + ctxt->sge[sge_no].addr = pa; + ctxt->sge[sge_no].length = PAGE_SIZE; + ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey; + ctxt->count = sge_no + 1; + buflen += PAGE_SIZE; + } + recv_wr.next = NULL; + recv_wr.sg_list = &ctxt->sge[0]; + recv_wr.num_sge = ctxt->count; + recv_wr.wr_id = (u64)(unsigned long)ctxt; + + svc_xprt_get(&xprt->sc_xprt); + ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr); + if (ret) { + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + svc_xprt_put(&xprt->sc_xprt); + } + return ret; + + err_put_ctxt: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + return -ENOMEM; +} + +/* + * This function handles the CONNECT_REQUEST event on a listening + * endpoint. It is passed the cma_id for the _new_ connection. The context in + * this cma_id is inherited from the listening cma_id and is the svc_xprt + * structure for the listening endpoint. + * + * This function creates a new xprt for the new connection and enqueues it on + * the accept queue for the listent xprt. When the listen thread is kicked, it + * will call the recvfrom method on the listen xprt which will accept the new + * connection. 
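+ *
+ * new_cma_id->context is repointed from the listening transport to the
+ * newly created one, and the client's advertised initiator depth is
+ * saved in sc_ord so the accept path can honour it later.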
+ */ +static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird) +{ + struct svcxprt_rdma *listen_xprt = new_cma_id->context; + struct svcxprt_rdma *newxprt; + struct sockaddr *sa; + + /* Create a new transport */ + newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0); + if (!newxprt) { + dprintk("svcrdma: failed to create new transport\n"); + return; + } + newxprt->sc_cm_id = new_cma_id; + new_cma_id->context = newxprt; + dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n", + newxprt, newxprt->sc_cm_id, listen_xprt); + + /* Save client advertised inbound read limit for use later in accept. */ + newxprt->sc_ord = client_ird; + + /* Set the local and remote addresses in the transport */ + sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; + svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa)); + sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; + svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa)); + + /* + * Enqueue the new transport on the accept queue of the listening + * transport + */ + spin_lock_bh(&listen_xprt->sc_lock); + list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q); + spin_unlock_bh(&listen_xprt->sc_lock); + + /* + * Can't use svc_xprt_received here because we are not on a + * rqstp thread + */ + set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags); + svc_xprt_enqueue(&listen_xprt->sc_xprt); +} + +/* + * Handles events generated on the listening endpoint. These events will be + * either be incoming connect requests or adapter removal events. + */ +static int rdma_listen_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event) +{ + struct svcxprt_rdma *xprt = cma_id->context; + int ret = 0; + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " + "event=%d\n", cma_id, cma_id->context, event->event); + handle_connect_req(cma_id, + event->param.conn.initiator_depth); + break; + + case RDMA_CM_EVENT_ESTABLISHED: + /* Accept complete */ + dprintk("svcrdma: Connection completed on LISTEN xprt=%p, " + "cm_id=%p\n", xprt, cma_id); + break; + + case RDMA_CM_EVENT_DEVICE_REMOVAL: + dprintk("svcrdma: Device removal xprt=%p, cm_id=%p\n", + xprt, cma_id); + if (xprt) + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + break; + + default: + dprintk("svcrdma: Unexpected event on listening endpoint %p, " + "event=%d\n", cma_id, event->event); + break; + } + + return ret; +} + +static int rdma_cma_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event) +{ + struct svc_xprt *xprt = cma_id->context; + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + switch (event->event) { + case RDMA_CM_EVENT_ESTABLISHED: + /* Accept complete */ + svc_xprt_get(xprt); + dprintk("svcrdma: Connection completed on DTO xprt=%p, " + "cm_id=%p\n", xprt, cma_id); + clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags); + svc_xprt_enqueue(xprt); + break; + case RDMA_CM_EVENT_DISCONNECTED: + dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n", + xprt, cma_id); + if (xprt) { + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); + } + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, " + "event=%d\n", cma_id, xprt, event->event); + if (xprt) { + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + } + break; + default: + dprintk("svcrdma: Unexpected event on DTO endpoint %p, " + "event=%d\n", cma_id, 
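/*
 * Illustration (a single-threaded user-space model, not code from this
 * patch): handle_connect_req() above only parks the new transport on the
 * listener's accept queue and flags the listener; svc_rdma_accept() further
 * down pops one entry per call and keeps the flag set while entries remain.
 * The struct and names below are invented; locking is omitted.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct xprt {
        struct xprt *next;
        int id;
};

static struct xprt *accept_head, *accept_tail;
static bool conn_pending;               /* stands in for XPT_CONN */

static void connect_request(int id)     /* like handle_connect_req() */
{
        struct xprt *x = calloc(1, sizeof(*x));

        if (!x)
                return;
        x->id = id;
        if (accept_tail)
                accept_tail->next = x;
        else
                accept_head = x;
        accept_tail = x;
        conn_pending = true;
}

static struct xprt *accept_one(void)    /* like svc_rdma_accept() */
{
        struct xprt *x = accept_head;

        if (!x) {
                conn_pending = false;
                return NULL;
        }
        accept_head = x->next;
        if (!accept_head)
                accept_tail = NULL;
        conn_pending = accept_head != NULL;  /* more queued? stay flagged */
        return x;
}

int main(void)
{
        connect_request(1);
        connect_request(2);
        while (conn_pending) {
                struct xprt *x = accept_one();
                if (x) {
                        printf("accepted xprt %d\n", x->id);
                        free(x);
                }
        }
        return 0;
}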
event->event); + break; + } + return 0; +} + +/* + * Create a listening RDMA service endpoint. + */ +static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, + struct net *net, + struct sockaddr *sa, int salen, + int flags) +{ + struct rdma_cm_id *listen_id; + struct svcxprt_rdma *cma_xprt; + struct svc_xprt *xprt; + int ret; + + dprintk("svcrdma: Creating RDMA socket\n"); + if (sa->sa_family != AF_INET) { + dprintk("svcrdma: Address family %d is not supported.\n", sa->sa_family); + return ERR_PTR(-EAFNOSUPPORT); + } + cma_xprt = rdma_create_xprt(serv, 1); + if (!cma_xprt) + return ERR_PTR(-ENOMEM); + xprt = &cma_xprt->sc_xprt; + + listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP, + IB_QPT_RC); + if (IS_ERR(listen_id)) { + ret = PTR_ERR(listen_id); + dprintk("svcrdma: rdma_create_id failed = %d\n", ret); + goto err0; + } + + ret = rdma_bind_addr(listen_id, sa); + if (ret) { + dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret); + goto err1; + } + cma_xprt->sc_cm_id = listen_id; + + ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG); + if (ret) { + dprintk("svcrdma: rdma_listen failed = %d\n", ret); + goto err1; + } + + /* + * We need to use the address from the cm_id in case the + * caller specified 0 for the port number. + */ + sa = (struct sockaddr *)&cma_xprt->sc_cm_id->route.addr.src_addr; + svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen); + + return &cma_xprt->sc_xprt; + + err1: + rdma_destroy_id(listen_id); + err0: + kfree(cma_xprt); + return ERR_PTR(ret); +} + +static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt) +{ + struct ib_mr *mr; + struct ib_fast_reg_page_list *pl; + struct svc_rdma_fastreg_mr *frmr; + + frmr = kmalloc(sizeof(*frmr), GFP_KERNEL); + if (!frmr) + goto err; + + mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES); + if (IS_ERR(mr)) + goto err_free_frmr; + + pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device, + RPCSVC_MAXPAGES); + if (IS_ERR(pl)) + goto err_free_mr; + + frmr->mr = mr; + frmr->page_list = pl; + INIT_LIST_HEAD(&frmr->frmr_list); + return frmr; + + err_free_mr: + ib_dereg_mr(mr); + err_free_frmr: + kfree(frmr); + err: + return ERR_PTR(-ENOMEM); +} + +static void rdma_dealloc_frmr_q(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_fastreg_mr *frmr; + + while (!list_empty(&xprt->sc_frmr_q)) { + frmr = list_entry(xprt->sc_frmr_q.next, + struct svc_rdma_fastreg_mr, frmr_list); + list_del_init(&frmr->frmr_list); + ib_dereg_mr(frmr->mr); + ib_free_fast_reg_page_list(frmr->page_list); + kfree(frmr); + } +} + +struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_fastreg_mr *frmr = NULL; + + spin_lock_bh(&rdma->sc_frmr_q_lock); + if (!list_empty(&rdma->sc_frmr_q)) { + frmr = list_entry(rdma->sc_frmr_q.next, + struct svc_rdma_fastreg_mr, frmr_list); + list_del_init(&frmr->frmr_list); + frmr->map_len = 0; + frmr->page_list_len = 0; + } + spin_unlock_bh(&rdma->sc_frmr_q_lock); + if (frmr) + return frmr; + + return rdma_alloc_frmr(rdma); +} + +static void frmr_unmap_dma(struct svcxprt_rdma *xprt, + struct svc_rdma_fastreg_mr *frmr) +{ + int page_no; + for (page_no = 0; page_no < frmr->page_list_len; page_no++) { + dma_addr_t addr = frmr->page_list->page_list[page_no]; + if (ib_dma_mapping_error(frmr->mr->device, addr)) + continue; + atomic_dec(&xprt->sc_dma_used); + ib_dma_unmap_page(frmr->mr->device, addr, PAGE_SIZE, + frmr->direction); + } +} + +void svc_rdma_put_frmr(struct svcxprt_rdma *rdma, + struct svc_rdma_fastreg_mr *frmr) +{ + if (frmr) { + 
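/*
 * Illustration (a single-threaded user-space model, not code from this
 * patch): svc_rdma_get_frmr()/svc_rdma_put_frmr() here implement a simple
 * free-list pool -- get() reuses an entry off sc_frmr_q when one is
 * available and otherwise allocates a fresh one; put() pushes the entry
 * back.  The struct below is a placeholder and locking is omitted.
 */
#include <stdio.h>
#include <stdlib.h>

struct frmr {
        struct frmr *next;
        size_t map_len;         /* reset on reuse, as in svc_rdma_get_frmr() */
};

static struct frmr *free_list;

static struct frmr *frmr_get(void)
{
        struct frmr *f = free_list;

        if (f) {                /* reuse a pooled entry */
                free_list = f->next;
                f->map_len = 0;
        } else {                /* pool empty: allocate a new one */
                f = calloc(1, sizeof(*f));
        }
        return f;
}

static void frmr_put(struct frmr *f)
{
        f->next = free_list;    /* push back for the next caller */
        free_list = f;
}

int main(void)
{
        struct frmr *f = frmr_get();

        if (!f)
                return 1;
        frmr_put(f);
        printf("pooled entry reused: %s\n", frmr_get() == f ? "yes" : "no");
        return 0;
}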
frmr_unmap_dma(rdma, frmr); + spin_lock_bh(&rdma->sc_frmr_q_lock); + BUG_ON(!list_empty(&frmr->frmr_list)); + list_add(&frmr->frmr_list, &rdma->sc_frmr_q); + spin_unlock_bh(&rdma->sc_frmr_q_lock); + } +} + +/* + * This is the xpo_recvfrom function for listening endpoints. Its + * purpose is to accept incoming connections. The CMA callback handler + * has already created a new transport and attached it to the new CMA + * ID. + * + * There is a queue of pending connections hung on the listening + * transport. This queue contains the new svc_xprt structure. This + * function takes svc_xprt structures off the accept_q and completes + * the connection. + */ +static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *listen_rdma; + struct svcxprt_rdma *newxprt = NULL; + struct rdma_conn_param conn_param; + struct ib_qp_init_attr qp_attr; + struct ib_device_attr devattr; + int uninitialized_var(dma_mr_acc); + int need_dma_mr; + int ret; + int i; + + listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); + clear_bit(XPT_CONN, &xprt->xpt_flags); + /* Get the next entry off the accept list */ + spin_lock_bh(&listen_rdma->sc_lock); + if (!list_empty(&listen_rdma->sc_accept_q)) { + newxprt = list_entry(listen_rdma->sc_accept_q.next, + struct svcxprt_rdma, sc_accept_q); + list_del_init(&newxprt->sc_accept_q); + } + if (!list_empty(&listen_rdma->sc_accept_q)) + set_bit(XPT_CONN, &listen_rdma->sc_xprt.xpt_flags); + spin_unlock_bh(&listen_rdma->sc_lock); + if (!newxprt) + return NULL; + + dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n", + newxprt, newxprt->sc_cm_id); + + ret = ib_query_device(newxprt->sc_cm_id->device, &devattr); + if (ret) { + dprintk("svcrdma: could not query device attributes on " + "device %p, rc=%d\n", newxprt->sc_cm_id->device, ret); + goto errout; + } + + /* Qualify the transport resource defaults with the + * capabilities of this particular device */ + newxprt->sc_max_sge = min((size_t)devattr.max_sge, + (size_t)RPCSVC_MAXPAGES); + newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr, + (size_t)svcrdma_max_requests); + newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests; + + /* + * Limit ORD based on client limit, local device limit, and + * configured svcrdma limit. 
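/*
 * Illustration (a user-space sketch, not code from this patch): the accept
 * path here never trusts the configured defaults directly -- each limit is
 * clamped to what ib_query_device() reports, and the ORD is clamped again
 * just below against the client's advertised value and the svcrdma tunable.
 * All numeric values below are invented.
 */
#include <stdio.h>

static unsigned int min_u(unsigned int a, unsigned int b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* invented values standing in for ib_query_device() results */
        unsigned int dev_max_sge = 32, dev_max_qp_wr = 16384, dev_max_rd_atom = 16;
        /* invented values standing in for the svcrdma module tunables */
        unsigned int cfg_max_pages = 257, cfg_max_requests = 32, cfg_ord = 16;
        unsigned int client_ord = 8;   /* from the CONNECT_REQUEST event */

        unsigned int max_sge      = min_u(dev_max_sge, cfg_max_pages);
        unsigned int max_requests = min_u(dev_max_qp_wr, cfg_max_requests);
        /* ORD is limited by client, device and configuration alike */
        unsigned int ord = min_u(min_u(client_ord, dev_max_rd_atom), cfg_ord);

        printf("max_sge=%u max_requests=%u ord=%u\n", max_sge, max_requests, ord);
        return 0;
}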
+ */ + newxprt->sc_ord = min_t(size_t, devattr.max_qp_rd_atom, newxprt->sc_ord); + newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord); + + newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device); + if (IS_ERR(newxprt->sc_pd)) { + dprintk("svcrdma: error creating PD for connect request\n"); + goto errout; + } + newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device, + sq_comp_handler, + cq_event_handler, + newxprt, + newxprt->sc_sq_depth, + 0); + if (IS_ERR(newxprt->sc_sq_cq)) { + dprintk("svcrdma: error creating SQ CQ for connect request\n"); + goto errout; + } + newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device, + rq_comp_handler, + cq_event_handler, + newxprt, + newxprt->sc_max_requests, + 0); + if (IS_ERR(newxprt->sc_rq_cq)) { + dprintk("svcrdma: error creating RQ CQ for connect request\n"); + goto errout; + } + + memset(&qp_attr, 0, sizeof qp_attr); + qp_attr.event_handler = qp_event_handler; + qp_attr.qp_context = &newxprt->sc_xprt; + qp_attr.cap.max_send_wr = newxprt->sc_sq_depth; + qp_attr.cap.max_recv_wr = newxprt->sc_max_requests; + qp_attr.cap.max_send_sge = newxprt->sc_max_sge; + qp_attr.cap.max_recv_sge = newxprt->sc_max_sge; + qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + qp_attr.qp_type = IB_QPT_RC; + qp_attr.send_cq = newxprt->sc_sq_cq; + qp_attr.recv_cq = newxprt->sc_rq_cq; + dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n" + " cm_id->device=%p, sc_pd->device=%p\n" + " cap.max_send_wr = %d\n" + " cap.max_recv_wr = %d\n" + " cap.max_send_sge = %d\n" + " cap.max_recv_sge = %d\n", + newxprt->sc_cm_id, newxprt->sc_pd, + newxprt->sc_cm_id->device, newxprt->sc_pd->device, + qp_attr.cap.max_send_wr, + qp_attr.cap.max_recv_wr, + qp_attr.cap.max_send_sge, + qp_attr.cap.max_recv_sge); + + ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr); + if (ret) { + /* + * XXX: This is a hack. We need a xx_request_qp interface + * that will adjust the qp_attr's with a best-effort + * number + */ + qp_attr.cap.max_send_sge -= 2; + qp_attr.cap.max_recv_sge -= 2; + ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, + &qp_attr); + if (ret) { + dprintk("svcrdma: failed to create QP, ret=%d\n", ret); + goto errout; + } + newxprt->sc_max_sge = qp_attr.cap.max_send_sge; + newxprt->sc_max_sge = qp_attr.cap.max_recv_sge; + newxprt->sc_sq_depth = qp_attr.cap.max_send_wr; + newxprt->sc_max_requests = qp_attr.cap.max_recv_wr; + } + newxprt->sc_qp = newxprt->sc_cm_id->qp; + + /* + * Use the most secure set of MR resources based on the + * transport type and available memory management features in + * the device. Here's the table implemented below: + * + * Fast Global DMA Remote WR + * Reg LKEY MR Access + * Sup'd Sup'd Needed Needed + * + * IWARP N N Y Y + * N Y Y Y + * Y N Y N + * Y Y N - + * + * IB N N Y N + * N Y N - + * Y N Y N + * Y Y N - + * + * NB: iWARP requires remote write access for the data sink + * of an RDMA_READ. IB does not. 
+ */ + if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + newxprt->sc_frmr_pg_list_len = + devattr.max_fast_reg_page_list_len; + newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; + } + + /* + * Determine if a DMA MR is required and if so, what privs are required + */ + switch (rdma_node_get_transport(newxprt->sc_cm_id->device->node_type)) { + case RDMA_TRANSPORT_IWARP: + newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV; + if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) { + need_dma_mr = 1; + dma_mr_acc = + (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE); + } else if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { + need_dma_mr = 1; + dma_mr_acc = IB_ACCESS_LOCAL_WRITE; + } else + need_dma_mr = 0; + break; + case RDMA_TRANSPORT_IB: + if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { + need_dma_mr = 1; + dma_mr_acc = IB_ACCESS_LOCAL_WRITE; + } else + need_dma_mr = 0; + break; + default: + goto errout; + } + + /* Create the DMA MR if needed, otherwise, use the DMA LKEY */ + if (need_dma_mr) { + /* Register all of physical memory */ + newxprt->sc_phys_mr = + ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc); + if (IS_ERR(newxprt->sc_phys_mr)) { + dprintk("svcrdma: Failed to create DMA MR ret=%d\n", + ret); + goto errout; + } + newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey; + } else + newxprt->sc_dma_lkey = + newxprt->sc_cm_id->device->local_dma_lkey; + + /* Post receive buffers */ + for (i = 0; i < newxprt->sc_max_requests; i++) { + ret = svc_rdma_post_recv(newxprt); + if (ret) { + dprintk("svcrdma: failure posting receive buffers\n"); + goto errout; + } + } + + /* Swap out the handler */ + newxprt->sc_cm_id->event_handler = rdma_cma_handler; + + /* + * Arm the CQs for the SQ and RQ before accepting so we can't + * miss the first message + */ + ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP); + ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP); + + /* Accept Connection */ + set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags); + memset(&conn_param, 0, sizeof conn_param); + conn_param.responder_resources = 0; + conn_param.initiator_depth = newxprt->sc_ord; + ret = rdma_accept(newxprt->sc_cm_id, &conn_param); + if (ret) { + dprintk("svcrdma: failed to accept new connection, ret=%d\n", + ret); + goto errout; + } + + dprintk("svcrdma: new connection %p accepted with the following " + "attributes:\n" + " local_ip : %pI4\n" + " local_port : %d\n" + " remote_ip : %pI4\n" + " remote_port : %d\n" + " max_sge : %d\n" + " sq_depth : %d\n" + " max_requests : %d\n" + " ord : %d\n", + newxprt, + &((struct sockaddr_in *)&newxprt->sc_cm_id-> + route.addr.src_addr)->sin_addr.s_addr, + ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id-> + route.addr.src_addr)->sin_port), + &((struct sockaddr_in *)&newxprt->sc_cm_id-> + route.addr.dst_addr)->sin_addr.s_addr, + ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id-> + route.addr.dst_addr)->sin_port), + newxprt->sc_max_sge, + newxprt->sc_sq_depth, + newxprt->sc_max_requests, + newxprt->sc_ord); + + return &newxprt->sc_xprt; + + errout: + dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret); + /* Take a reference in case the DTO handler runs */ + svc_xprt_get(&newxprt->sc_xprt); + if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp)) + ib_destroy_qp(newxprt->sc_qp); + rdma_destroy_id(newxprt->sc_cm_id); + /* This call to put will destroy the transport */ + svc_xprt_put(&newxprt->sc_xprt); + return NULL; +} + +static void svc_rdma_release_rqst(struct svc_rqst *rqstp) +{ +} + +/* + * When connected, an svc_xprt has at 
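/*
 * Illustration (a user-space sketch, not code from this patch): the table
 * in the comment above boils down to a small decision -- whether a DMA MR
 * must be registered at all, and with which access flags.  Plain flag
 * macros below stand in for the IB verbs constants.
 */
#include <stdbool.h>
#include <stdio.h>

#define ACC_LOCAL_WRITE   0x1
#define ACC_REMOTE_WRITE  0x2

/* Returns true if a DMA MR is needed and fills in *acc with its flags. */
static bool dma_mr_needed(bool is_iwarp, bool have_fast_reg,
                          bool have_local_dma_lkey, unsigned int *acc)
{
        if (is_iwarp && !have_fast_reg) {
                /* iWARP RDMA_READ sinks need remote write access */
                *acc = ACC_LOCAL_WRITE | ACC_REMOTE_WRITE;
                return true;
        }
        if (!have_local_dma_lkey) {
                *acc = ACC_LOCAL_WRITE;
                return true;
        }
        return false;           /* use the device's local DMA lkey */
}

int main(void)
{
        unsigned int acc = 0;
        bool need = dma_mr_needed(true, false, false, &acc);

        printf("need_dma_mr=%d acc=0x%x\n", need, acc);
        return 0;
}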
least two references: + * + * - A reference held by the cm_id between the ESTABLISHED and + * DISCONNECTED events. If the remote peer disconnected first, this + * reference could be gone. + * + * - A reference held by the svc_recv code that called this function + * as part of close processing. + * + * At a minimum one references should still be held. + */ +static void svc_rdma_detach(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + dprintk("svc: svc_rdma_detach(%p)\n", xprt); + + /* Disconnect and flush posted WQE */ + rdma_disconnect(rdma->sc_cm_id); +} + +static void __svc_rdma_free(struct work_struct *work) +{ + struct svcxprt_rdma *rdma = + container_of(work, struct svcxprt_rdma, sc_work); + dprintk("svcrdma: svc_rdma_free(%p)\n", rdma); + + /* We should only be called from kref_put */ + BUG_ON(atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0); + + /* + * Destroy queued, but not processed read completions. Note + * that this cleanup has to be done before destroying the + * cm_id because the device ptr is needed to unmap the dma in + * svc_rdma_put_context. + */ + while (!list_empty(&rdma->sc_read_complete_q)) { + struct svc_rdma_op_ctxt *ctxt; + ctxt = list_entry(rdma->sc_read_complete_q.next, + struct svc_rdma_op_ctxt, + dto_q); + list_del_init(&ctxt->dto_q); + svc_rdma_put_context(ctxt, 1); + } + + /* Destroy queued, but not processed recv completions */ + while (!list_empty(&rdma->sc_rq_dto_q)) { + struct svc_rdma_op_ctxt *ctxt; + ctxt = list_entry(rdma->sc_rq_dto_q.next, + struct svc_rdma_op_ctxt, + dto_q); + list_del_init(&ctxt->dto_q); + svc_rdma_put_context(ctxt, 1); + } + + /* Warn if we leaked a resource or under-referenced */ + WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0); + WARN_ON(atomic_read(&rdma->sc_dma_used) != 0); + + /* De-allocate fastreg mr */ + rdma_dealloc_frmr_q(rdma); + + /* Destroy the QP if present (not a listener) */ + if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) + ib_destroy_qp(rdma->sc_qp); + + if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq)) + ib_destroy_cq(rdma->sc_sq_cq); + + if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq)) + ib_destroy_cq(rdma->sc_rq_cq); + + if (rdma->sc_phys_mr && !IS_ERR(rdma->sc_phys_mr)) + ib_dereg_mr(rdma->sc_phys_mr); + + if (rdma->sc_pd && !IS_ERR(rdma->sc_pd)) + ib_dealloc_pd(rdma->sc_pd); + + /* Destroy the CM ID */ + rdma_destroy_id(rdma->sc_cm_id); + + kfree(rdma); +} + +static void svc_rdma_free(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + INIT_WORK(&rdma->sc_work, __svc_rdma_free); + queue_work(svc_rdma_wq, &rdma->sc_work); +} + +static int svc_rdma_has_wspace(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + + /* + * If there are fewer SQ WR available than required to send a + * simple response, return false. + */ + if ((rdma->sc_sq_depth - atomic_read(&rdma->sc_sq_count) < 3)) + return 0; + + /* + * ...or there are already waiters on the SQ, + * return false. + */ + if (waitqueue_active(&rdma->sc_send_wait)) + return 0; + + /* Otherwise return true. */ + return 1; +} + +/* + * Attempt to register the kvec representing the RPC memory with the + * device. + * + * Returns: + * NULL : The device does not support fastreg or there were no more + * fastreg mr. + * frmr : The kvec register request was successfully posted. + * <0 : An error was encountered attempting to register the kvec. 
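/*
 * Illustration (a user-space sketch, not code from this patch):
 * svc_rdma_has_wspace() above answers "can another reply be queued right
 * now?" -- it needs at least three free send-queue slots and no sender
 * already waiting.  Plain integers below model the atomic counter and the
 * waitqueue check.
 */
#include <stdbool.h>
#include <stdio.h>

static bool has_wspace(unsigned int sq_depth, unsigned int sq_in_use,
                       bool someone_waiting)
{
        if (sq_depth - sq_in_use < 3)   /* not enough room for a simple reply */
                return false;
        if (someone_waiting)            /* be fair to senders already queued  */
                return false;
        return true;
}

int main(void)
{
        printf("%d %d\n", has_wspace(128, 120, false), has_wspace(128, 126, false));
        return 0;
}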
+ */ +int svc_rdma_fastreg(struct svcxprt_rdma *xprt, + struct svc_rdma_fastreg_mr *frmr) +{ + struct ib_send_wr fastreg_wr; + u8 key; + + /* Bump the key */ + key = (u8)(frmr->mr->lkey & 0x000000FF); + ib_update_fast_reg_key(frmr->mr, ++key); + + /* Prepare FASTREG WR */ + memset(&fastreg_wr, 0, sizeof fastreg_wr); + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.send_flags = IB_SEND_SIGNALED; + fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva; + fastreg_wr.wr.fast_reg.page_list = frmr->page_list; + fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len; + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fastreg_wr.wr.fast_reg.length = frmr->map_len; + fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags; + fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey; + return svc_rdma_send(xprt, &fastreg_wr); +} + +int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) +{ + struct ib_send_wr *bad_wr, *n_wr; + int wr_count; + int i; + int ret; + + if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) + return -ENOTCONN; + + BUG_ON(wr->send_flags != IB_SEND_SIGNALED); + wr_count = 1; + for (n_wr = wr->next; n_wr; n_wr = n_wr->next) + wr_count++; + + /* If the SQ is full, wait until an SQ entry is available */ + while (1) { + spin_lock_bh(&xprt->sc_lock); + if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) { + spin_unlock_bh(&xprt->sc_lock); + atomic_inc(&rdma_stat_sq_starve); + + /* See if we can opportunistically reap SQ WR to make room */ + sq_cq_reap(xprt); + + /* Wait until SQ WR available if SQ still full */ + wait_event(xprt->sc_send_wait, + atomic_read(&xprt->sc_sq_count) < + xprt->sc_sq_depth); + if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) + return -ENOTCONN; + continue; + } + /* Take a transport ref for each WR posted */ + for (i = 0; i < wr_count; i++) + svc_xprt_get(&xprt->sc_xprt); + + /* Bump used SQ WR count and post */ + atomic_add(wr_count, &xprt->sc_sq_count); + ret = ib_post_send(xprt->sc_qp, wr, &bad_wr); + if (ret) { + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + atomic_sub(wr_count, &xprt->sc_sq_count); + for (i = 0; i < wr_count; i ++) + svc_xprt_put(&xprt->sc_xprt); + dprintk("svcrdma: failed to post SQ WR rc=%d, " + "sc_sq_count=%d, sc_sq_depth=%d\n", + ret, atomic_read(&xprt->sc_sq_count), + xprt->sc_sq_depth); + } + spin_unlock_bh(&xprt->sc_lock); + if (ret) + wake_up(&xprt->sc_send_wait); + break; + } + return ret; +} + +void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, + enum rpcrdma_errcode err) +{ + struct ib_send_wr err_wr; + struct page *p; + struct svc_rdma_op_ctxt *ctxt; + u32 *va; + int length; + int ret; + + p = svc_rdma_get_page(); + va = page_address(p); + + /* XDR encode error */ + length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); + + ctxt = svc_rdma_get_context(xprt); + ctxt->direction = DMA_FROM_DEVICE; + ctxt->count = 1; + ctxt->pages[0] = p; + + /* Prepare SGE for local address */ + ctxt->sge[0].addr = ib_dma_map_page(xprt->sc_cm_id->device, + p, 0, length, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) { + put_page(p); + svc_rdma_put_context(ctxt, 1); + return; + } + atomic_inc(&xprt->sc_dma_used); + ctxt->sge[0].lkey = xprt->sc_dma_lkey; + ctxt->sge[0].length = length; + + /* Prepare SEND WR */ + memset(&err_wr, 0, sizeof err_wr); + ctxt->wr_op = IB_WR_SEND; + err_wr.wr_id = (unsigned long)ctxt; + err_wr.sg_list = ctxt->sge; + err_wr.num_sge = 1; + err_wr.opcode = IB_WR_SEND; + err_wr.send_flags = IB_SEND_SIGNALED; + + 
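/*
 * Illustration (a single-threaded user-space sketch, not code from this
 * patch): svc_rdma_send() above is essentially "reserve N send-queue slots
 * or wait, post, and give the slots back if the post fails".  The sketch
 * returns an error where the real code sleeps on sc_send_wait; the post
 * function and depth below are invented.
 */
#include <stdio.h>

#define SQ_DEPTH 8

static unsigned int sq_count;           /* models xprt->sc_sq_count */

static int fake_post(unsigned int n)    /* stands in for ib_post_send() */
{
        (void)n;
        return 0;                       /* pretend the post succeeded */
}

static int send_wrs(unsigned int wr_count)
{
        int ret;

        if (sq_count + wr_count > SQ_DEPTH)
                return -1;              /* real code waits here instead */

        sq_count += wr_count;           /* reserve the slots before posting */
        ret = fake_post(wr_count);
        if (ret)
                sq_count -= wr_count;   /* roll the reservation back */
        return ret;
}

int main(void)
{
        printf("post 3 WRs: %d, in use: %u\n", send_wrs(3), sq_count);
        return 0;
}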
/* Post It */ + ret = svc_rdma_send(xprt, &err_wr); + if (ret) { + dprintk("svcrdma: Error %d posting send for protocol error\n", + ret); + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + } +} diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c new file mode 100644 index 00000000..0867070b --- /dev/null +++ b/net/sunrpc/xprtrdma/transport.c @@ -0,0 +1,780 @@ +/* + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * transport.c + * + * This file contains the top-level implementation of an RPC RDMA + * transport. + * + * Naming convention: functions beginning with xprt_ are part of the + * transport switch. All others are RPC RDMA internal. 
+ */ + +#include +#include +#include +#include + +#include "xprt_rdma.h" + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +MODULE_LICENSE("Dual BSD/GPL"); + +MODULE_DESCRIPTION("RPC/RDMA Transport for Linux kernel NFS"); +MODULE_AUTHOR("Network Appliance, Inc."); + +/* + * tunables + */ + +static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; +static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; +static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; +static unsigned int xprt_rdma_inline_write_padding; +static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; + int xprt_rdma_pad_optimize = 0; + +#ifdef RPC_DEBUG + +static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE; +static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE; +static unsigned int zero; +static unsigned int max_padding = PAGE_SIZE; +static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS; +static unsigned int max_memreg = RPCRDMA_LAST - 1; + +static struct ctl_table_header *sunrpc_table_header; + +static ctl_table xr_tunables_table[] = { + { + .procname = "rdma_slot_table_entries", + .data = &xprt_rdma_slot_table_entries, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_slot_table_size, + .extra2 = &max_slot_table_size + }, + { + .procname = "rdma_max_inline_read", + .data = &xprt_rdma_max_inline_read, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "rdma_max_inline_write", + .data = &xprt_rdma_max_inline_write, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "rdma_inline_write_padding", + .data = &xprt_rdma_inline_write_padding, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &max_padding, + }, + { + .procname = "rdma_memreg_strategy", + .data = &xprt_rdma_memreg_strategy, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_memreg, + .extra2 = &max_memreg, + }, + { + .procname = "rdma_pad_optimize", + .data = &xprt_rdma_pad_optimize, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { }, +}; + +static ctl_table sunrpc_table[] = { + { + .procname = "sunrpc", + .mode = 0555, + .child = xr_tunables_table + }, + { }, +}; + +#endif + +static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ + +static void +xprt_rdma_format_addresses(struct rpc_xprt *xprt) +{ + struct sockaddr *sap = (struct sockaddr *) + &rpcx_to_rdmad(xprt).addr; + struct sockaddr_in *sin = (struct sockaddr_in *)sap; + char buf[64]; + + (void)rpc_ntop(sap, buf, sizeof(buf)); + xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); + + snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap)); + xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL); + + xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; + + snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr)); + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); + + snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap)); + xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); + + /* netid */ + xprt->address_strings[RPC_DISPLAY_NETID] = "rdma"; +} + +static void +xprt_rdma_free_addresses(struct rpc_xprt *xprt) +{ + unsigned int i; + + for (i = 0; i < RPC_DISPLAY_MAX; i++) + switch (i) { + case 
RPC_DISPLAY_PROTO: + case RPC_DISPLAY_NETID: + continue; + default: + kfree(xprt->address_strings[i]); + } +} + +static void +xprt_rdma_connect_worker(struct work_struct *work) +{ + struct rpcrdma_xprt *r_xprt = + container_of(work, struct rpcrdma_xprt, rdma_connect.work); + struct rpc_xprt *xprt = &r_xprt->xprt; + int rc = 0; + + if (!xprt->shutdown) { + xprt_clear_connected(xprt); + + dprintk("RPC: %s: %sconnect\n", __func__, + r_xprt->rx_ep.rep_connected != 0 ? "re" : ""); + rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); + if (rc) + goto out; + } + goto out_clear; + +out: + xprt_wake_pending_tasks(xprt, rc); + +out_clear: + dprintk("RPC: %s: exit\n", __func__); + xprt_clear_connecting(xprt); +} + +/* + * xprt_rdma_destroy + * + * Destroy the xprt. + * Free all memory associated with the object, including its own. + * NOTE: none of the *destroy methods free memory for their top-level + * objects, even though they may have allocated it (they do free + * private memory). It's up to the caller to handle it. In this + * case (RDMA transport), all structure memory is inlined with the + * struct rpcrdma_xprt. + */ +static void +xprt_rdma_destroy(struct rpc_xprt *xprt) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + int rc; + + dprintk("RPC: %s: called\n", __func__); + + cancel_delayed_work_sync(&r_xprt->rdma_connect); + + xprt_clear_connected(xprt); + + rpcrdma_buffer_destroy(&r_xprt->rx_buf); + rc = rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); + if (rc) + dprintk("RPC: %s: rpcrdma_ep_destroy returned %i\n", + __func__, rc); + rpcrdma_ia_close(&r_xprt->rx_ia); + + xprt_rdma_free_addresses(xprt); + + xprt_free(xprt); + + dprintk("RPC: %s: returning\n", __func__); + + module_put(THIS_MODULE); +} + +static const struct rpc_timeout xprt_rdma_default_timeout = { + .to_initval = 60 * HZ, + .to_maxval = 60 * HZ, +}; + +/** + * xprt_setup_rdma - Set up transport to use RDMA + * + * @args: rpc transport arguments + */ +static struct rpc_xprt * +xprt_setup_rdma(struct xprt_create *args) +{ + struct rpcrdma_create_data_internal cdata; + struct rpc_xprt *xprt; + struct rpcrdma_xprt *new_xprt; + struct rpcrdma_ep *new_ep; + struct sockaddr_in *sin; + int rc; + + if (args->addrlen > sizeof(xprt->addr)) { + dprintk("RPC: %s: address too large\n", __func__); + return ERR_PTR(-EBADF); + } + + xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), + xprt_rdma_slot_table_entries); + if (xprt == NULL) { + dprintk("RPC: %s: couldn't allocate rpcrdma_xprt\n", + __func__); + return ERR_PTR(-ENOMEM); + } + + /* 60 second timeout, no retries */ + xprt->timeout = &xprt_rdma_default_timeout; + xprt->bind_timeout = (60U * HZ); + xprt->reestablish_timeout = (5U * HZ); + xprt->idle_timeout = (5U * 60 * HZ); + + xprt->resvport = 0; /* privileged port not needed */ + xprt->tsh_size = 0; /* RPC-RDMA handles framing */ + xprt->max_payload = RPCRDMA_MAX_DATA_SEGS * PAGE_SIZE; + xprt->ops = &xprt_rdma_procs; + + /* + * Set up RDMA-specific connect data. 
+ */ + + /* Put server RDMA address in local cdata */ + memcpy(&cdata.addr, args->dstaddr, args->addrlen); + + /* Ensure xprt->addr holds valid server TCP (not RDMA) + * address, for any side protocols which peek at it */ + xprt->prot = IPPROTO_TCP; + xprt->addrlen = args->addrlen; + memcpy(&xprt->addr, &cdata.addr, xprt->addrlen); + + sin = (struct sockaddr_in *)&cdata.addr; + if (ntohs(sin->sin_port) != 0) + xprt_set_bound(xprt); + + dprintk("RPC: %s: %pI4:%u\n", + __func__, &sin->sin_addr.s_addr, ntohs(sin->sin_port)); + + /* Set max requests */ + cdata.max_requests = xprt->max_reqs; + + /* Set some length limits */ + cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */ + cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */ + + cdata.inline_wsize = xprt_rdma_max_inline_write; + if (cdata.inline_wsize > cdata.wsize) + cdata.inline_wsize = cdata.wsize; + + cdata.inline_rsize = xprt_rdma_max_inline_read; + if (cdata.inline_rsize > cdata.rsize) + cdata.inline_rsize = cdata.rsize; + + cdata.padding = xprt_rdma_inline_write_padding; + + /* + * Create new transport instance, which includes initialized + * o ia + * o endpoint + * o buffers + */ + + new_xprt = rpcx_to_rdmax(xprt); + + rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr, + xprt_rdma_memreg_strategy); + if (rc) + goto out1; + + /* + * initialize and create ep + */ + new_xprt->rx_data = cdata; + new_ep = &new_xprt->rx_ep; + new_ep->rep_remote_addr = cdata.addr; + + rc = rpcrdma_ep_create(&new_xprt->rx_ep, + &new_xprt->rx_ia, &new_xprt->rx_data); + if (rc) + goto out2; + + /* + * Allocate pre-registered send and receive buffers for headers and + * any inline data. Also specify any padding which will be provided + * from a preregistered zero buffer. + */ + rc = rpcrdma_buffer_create(&new_xprt->rx_buf, new_ep, &new_xprt->rx_ia, + &new_xprt->rx_data); + if (rc) + goto out3; + + /* + * Register a callback for connection events. This is necessary because + * connection loss notification is async. We also catch connection loss + * when reaping receives. 
+ */ + INIT_DELAYED_WORK(&new_xprt->rdma_connect, xprt_rdma_connect_worker); + new_ep->rep_func = rpcrdma_conn_func; + new_ep->rep_xprt = xprt; + + xprt_rdma_format_addresses(xprt); + + if (!try_module_get(THIS_MODULE)) + goto out4; + + return xprt; + +out4: + xprt_rdma_free_addresses(xprt); + rc = -EINVAL; +out3: + (void) rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); +out2: + rpcrdma_ia_close(&new_xprt->rx_ia); +out1: + xprt_free(xprt); + return ERR_PTR(rc); +} + +/* + * Close a connection, during shutdown or timeout/reconnect + */ +static void +xprt_rdma_close(struct rpc_xprt *xprt) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + + dprintk("RPC: %s: closing\n", __func__); + if (r_xprt->rx_ep.rep_connected > 0) + xprt->reestablish_timeout = 0; + xprt_disconnect_done(xprt); + (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); +} + +static void +xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) +{ + struct sockaddr_in *sap; + + sap = (struct sockaddr_in *)&xprt->addr; + sap->sin_port = htons(port); + sap = (struct sockaddr_in *)&rpcx_to_rdmad(xprt).addr; + sap->sin_port = htons(port); + dprintk("RPC: %s: %u\n", __func__, port); +} + +static void +xprt_rdma_connect(struct rpc_task *task) +{ + struct rpc_xprt *xprt = (struct rpc_xprt *)task->tk_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + + if (r_xprt->rx_ep.rep_connected != 0) { + /* Reconnect */ + schedule_delayed_work(&r_xprt->rdma_connect, + xprt->reestablish_timeout); + xprt->reestablish_timeout <<= 1; + if (xprt->reestablish_timeout > (30 * HZ)) + xprt->reestablish_timeout = (30 * HZ); + else if (xprt->reestablish_timeout < (5 * HZ)) + xprt->reestablish_timeout = (5 * HZ); + } else { + schedule_delayed_work(&r_xprt->rdma_connect, 0); + if (!RPC_IS_ASYNC(task)) + flush_delayed_work(&r_xprt->rdma_connect); + } +} + +static int +xprt_rdma_reserve_xprt(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + int credits = atomic_read(&r_xprt->rx_buf.rb_credits); + + /* == RPC_CWNDSCALE @ init, but *after* setup */ + if (r_xprt->rx_buf.rb_cwndscale == 0UL) { + r_xprt->rx_buf.rb_cwndscale = xprt->cwnd; + dprintk("RPC: %s: cwndscale %lu\n", __func__, + r_xprt->rx_buf.rb_cwndscale); + BUG_ON(r_xprt->rx_buf.rb_cwndscale <= 0); + } + xprt->cwnd = credits * r_xprt->rx_buf.rb_cwndscale; + return xprt_reserve_xprt_cong(task); +} + +/* + * The RDMA allocate/free functions need the task structure as a place + * to hide the struct rpcrdma_req, which is necessary for the actual send/recv + * sequence. For this reason, the recv buffers are attached to send + * buffers for portions of the RPC. Note that the RPC layer allocates + * both send and receive buffers in the same call. We may register + * the receive buffer portion when using reply chunks. + */ +static void * +xprt_rdma_allocate(struct rpc_task *task, size_t size) +{ + struct rpc_xprt *xprt = task->tk_xprt; + struct rpcrdma_req *req, *nreq; + + req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf); + BUG_ON(NULL == req); + + if (size > req->rl_size) { + dprintk("RPC: %s: size %zd too large for buffer[%zd]: " + "prog %d vers %d proc %d\n", + __func__, size, req->rl_size, + task->tk_client->cl_prog, task->tk_client->cl_vers, + task->tk_msg.rpc_proc->p_proc); + /* + * Outgoing length shortage. Our inline write max must have + * been configured to perform direct i/o. + * + * This is therefore a large metadata operation, and the + * allocate call was made on the maximum possible message, + * e.g. 
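/*
 * Illustration (a user-space sketch, not code from this patch):
 * xprt_rdma_connect() above doubles the reconnect delay on every attempt
 * and clamps it to the 5..30 second window.  Plain seconds below stand in
 * for the jiffies arithmetic.
 */
#include <stdio.h>

static unsigned int next_backoff(unsigned int cur)
{
        cur <<= 1;               /* double the previous delay         */
        if (cur > 30)
                cur = 30;        /* never wait longer than 30 seconds */
        else if (cur < 5)
                cur = 5;         /* and never less than 5 seconds     */
        return cur;
}

int main(void)
{
        unsigned int d = 5;

        for (int i = 0; i < 5; i++) {
                d = next_backoff(d);
                printf("retry delay: %us\n", d);
        }
        return 0;
}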
containing long filename(s) or symlink data. In + * fact, while these metadata operations *might* carry + * large outgoing payloads, they rarely *do*. However, we + * have to commit to the request here, so reallocate and + * register it now. The data path will never require this + * reallocation. + * + * If the allocation or registration fails, the RPC framework + * will (doggedly) retry. + */ + if (rpcx_to_rdmax(xprt)->rx_ia.ri_memreg_strategy == + RPCRDMA_BOUNCEBUFFERS) { + /* forced to "pure inline" */ + dprintk("RPC: %s: too much data (%zd) for inline " + "(r/w max %d/%d)\n", __func__, size, + rpcx_to_rdmad(xprt).inline_rsize, + rpcx_to_rdmad(xprt).inline_wsize); + size = req->rl_size; + rpc_exit(task, -EIO); /* fail the operation */ + rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++; + goto out; + } + if (task->tk_flags & RPC_TASK_SWAPPER) + nreq = kmalloc(sizeof *req + size, GFP_ATOMIC); + else + nreq = kmalloc(sizeof *req + size, GFP_NOFS); + if (nreq == NULL) + goto outfail; + + if (rpcrdma_register_internal(&rpcx_to_rdmax(xprt)->rx_ia, + nreq->rl_base, size + sizeof(struct rpcrdma_req) + - offsetof(struct rpcrdma_req, rl_base), + &nreq->rl_handle, &nreq->rl_iov)) { + kfree(nreq); + goto outfail; + } + rpcx_to_rdmax(xprt)->rx_stats.hardway_register_count += size; + nreq->rl_size = size; + nreq->rl_niovs = 0; + nreq->rl_nchunks = 0; + nreq->rl_buffer = (struct rpcrdma_buffer *)req; + nreq->rl_reply = req->rl_reply; + memcpy(nreq->rl_segments, + req->rl_segments, sizeof nreq->rl_segments); + /* flag the swap with an unused field */ + nreq->rl_iov.length = 0; + req->rl_reply = NULL; + req = nreq; + } + dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); +out: + req->rl_connect_cookie = 0; /* our reserved value */ + return req->rl_xdr_buf; + +outfail: + rpcrdma_buffer_put(req); + rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++; + return NULL; +} + +/* + * This function returns all RDMA resources to the pool. + */ +static void +xprt_rdma_free(void *buffer) +{ + struct rpcrdma_req *req; + struct rpcrdma_xprt *r_xprt; + struct rpcrdma_rep *rep; + int i; + + if (buffer == NULL) + return; + + req = container_of(buffer, struct rpcrdma_req, rl_xdr_buf[0]); + if (req->rl_iov.length == 0) { /* see allocate above */ + r_xprt = container_of(((struct rpcrdma_req *) req->rl_buffer)->rl_buffer, + struct rpcrdma_xprt, rx_buf); + } else + r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf); + rep = req->rl_reply; + + dprintk("RPC: %s: called on 0x%p%s\n", + __func__, rep, (rep && rep->rr_func) ? " (with waiter)" : ""); + + /* + * Finish the deregistration. When using mw bind, this was + * begun in rpcrdma_reply_handler(). In all other modes, we + * do it here, in thread context. The process is considered + * complete when the rr_func vector becomes NULL - this + * was put in place during rpcrdma_reply_handler() - the wait + * call below will not block if the dereg is "done". If + * interrupted, our framework will clean up. 
+ */ + for (i = 0; req->rl_nchunks;) { + --req->rl_nchunks; + i += rpcrdma_deregister_external( + &req->rl_segments[i], r_xprt, NULL); + } + + if (rep && wait_event_interruptible(rep->rr_unbind, !rep->rr_func)) { + rep->rr_func = NULL; /* abandon the callback */ + req->rl_reply = NULL; + } + + if (req->rl_iov.length == 0) { /* see allocate above */ + struct rpcrdma_req *oreq = (struct rpcrdma_req *)req->rl_buffer; + oreq->rl_reply = req->rl_reply; + (void) rpcrdma_deregister_internal(&r_xprt->rx_ia, + req->rl_handle, + &req->rl_iov); + kfree(req); + req = oreq; + } + + /* Put back request+reply buffers */ + rpcrdma_buffer_put(req); +} + +/* + * send_request invokes the meat of RPC RDMA. It must do the following: + * 1. Marshal the RPC request into an RPC RDMA request, which means + * putting a header in front of data, and creating IOVs for RDMA + * from those in the request. + * 2. In marshaling, detect opportunities for RDMA, and use them. + * 3. Post a recv message to set up asynch completion, then send + * the request (rpcrdma_ep_post). + * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP). + */ + +static int +xprt_rdma_send_request(struct rpc_task *task) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct rpc_xprt *xprt = task->tk_xprt; + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + + /* marshal the send itself */ + if (req->rl_niovs == 0 && rpcrdma_marshal_req(rqst) != 0) { + r_xprt->rx_stats.failed_marshal_count++; + dprintk("RPC: %s: rpcrdma_marshal_req failed\n", + __func__); + return -EIO; + } + + if (req->rl_reply == NULL) /* e.g. reconnection */ + rpcrdma_recv_buffer_get(req); + + if (req->rl_reply) { + req->rl_reply->rr_func = rpcrdma_reply_handler; + /* this need only be done once, but... */ + req->rl_reply->rr_xprt = xprt; + } + + /* Must suppress retransmit to maintain credits */ + if (req->rl_connect_cookie == xprt->connect_cookie) + goto drop_connection; + req->rl_connect_cookie = xprt->connect_cookie; + + if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + goto drop_connection; + + rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len; + rqst->rq_bytes_sent = 0; + return 0; + +drop_connection: + xprt_disconnect_done(xprt); + return -ENOTCONN; /* implies disconnect */ +} + +static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + long idle_time = 0; + + if (xprt_connected(xprt)) + idle_time = (long)(jiffies - xprt->last_used) / HZ; + + seq_printf(seq, + "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu " + "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n", + + 0, /* need a local port? 
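/*
 * Illustration (a user-space sketch, not code from this patch): the
 * connect_cookie check in xprt_rdma_send_request() above suppresses
 * retransmits on the same connection -- if the request was already sent
 * under the current cookie, the transport forces a reconnect rather than
 * send it again and upset the credit accounting.  Cookie values below are
 * arbitrary.
 */
#include <stdbool.h>
#include <stdio.h>

/* Returns true if the request may be posted, false to force a reconnect. */
static bool may_send(unsigned long req_cookie, unsigned long conn_cookie)
{
        return req_cookie != conn_cookie;
}

int main(void)
{
        printf("first send: %d, retransmit on same connection: %d\n",
               may_send(0, 7), may_send(7, 7));
        return 0;
}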
*/ + xprt->stat.bind_count, + xprt->stat.connect_count, + xprt->stat.connect_time, + idle_time, + xprt->stat.sends, + xprt->stat.recvs, + xprt->stat.bad_xids, + xprt->stat.req_u, + xprt->stat.bklog_u, + + r_xprt->rx_stats.read_chunk_count, + r_xprt->rx_stats.write_chunk_count, + r_xprt->rx_stats.reply_chunk_count, + r_xprt->rx_stats.total_rdma_request, + r_xprt->rx_stats.total_rdma_reply, + r_xprt->rx_stats.pullup_copy_count, + r_xprt->rx_stats.fixup_copy_count, + r_xprt->rx_stats.hardway_register_count, + r_xprt->rx_stats.failed_marshal_count, + r_xprt->rx_stats.bad_reply_count); +} + +/* + * Plumbing for rpc transport switch and kernel module + */ + +static struct rpc_xprt_ops xprt_rdma_procs = { + .reserve_xprt = xprt_rdma_reserve_xprt, + .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ + .release_request = xprt_release_rqst_cong, /* ditto */ + .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */ + .rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */ + .set_port = xprt_rdma_set_port, + .connect = xprt_rdma_connect, + .buf_alloc = xprt_rdma_allocate, + .buf_free = xprt_rdma_free, + .send_request = xprt_rdma_send_request, + .close = xprt_rdma_close, + .destroy = xprt_rdma_destroy, + .print_stats = xprt_rdma_print_stats +}; + +static struct xprt_class xprt_rdma = { + .list = LIST_HEAD_INIT(xprt_rdma.list), + .name = "rdma", + .owner = THIS_MODULE, + .ident = XPRT_TRANSPORT_RDMA, + .setup = xprt_setup_rdma, +}; + +static void __exit xprt_rdma_cleanup(void) +{ + int rc; + + dprintk(KERN_INFO "RPCRDMA Module Removed, deregister RPC RDMA transport\n"); +#ifdef RPC_DEBUG + if (sunrpc_table_header) { + unregister_sysctl_table(sunrpc_table_header); + sunrpc_table_header = NULL; + } +#endif + rc = xprt_unregister_transport(&xprt_rdma); + if (rc) + dprintk("RPC: %s: xprt_unregister returned %i\n", + __func__, rc); +} + +static int __init xprt_rdma_init(void) +{ + int rc; + + rc = xprt_register_transport(&xprt_rdma); + + if (rc) + return rc; + + dprintk(KERN_INFO "RPCRDMA Module Init, register RPC RDMA transport\n"); + + dprintk(KERN_INFO "Defaults:\n"); + dprintk(KERN_INFO "\tSlots %d\n" + "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n", + xprt_rdma_slot_table_entries, + xprt_rdma_max_inline_read, xprt_rdma_max_inline_write); + dprintk(KERN_INFO "\tPadding %d\n\tMemreg %d\n", + xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy); + +#ifdef RPC_DEBUG + if (!sunrpc_table_header) + sunrpc_table_header = register_sysctl_table(sunrpc_table); +#endif + return 0; +} + +module_init(xprt_rdma_init); +module_exit(xprt_rdma_cleanup); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c new file mode 100644 index 00000000..80f8da34 --- /dev/null +++ b/net/sunrpc/xprtrdma/verbs.c @@ -0,0 +1,1952 @@ +/* + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * verbs.c + * + * Encapsulates the major functions managing: + * o adapters + * o endpoints + * o connections + * o buffer memory + */ + +#include /* for Tavor hack below */ +#include + +#include "xprt_rdma.h" + +/* + * Globals/Macros + */ + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +/* + * internal functions + */ + +/* + * handle replies in tasklet context, using a single, global list + * rdma tasklet function -- just turn around and call the func + * for all replies on the list + */ + +static DEFINE_SPINLOCK(rpcrdma_tk_lock_g); +static LIST_HEAD(rpcrdma_tasklets_g); + +static void +rpcrdma_run_tasklet(unsigned long data) +{ + struct rpcrdma_rep *rep; + void (*func)(struct rpcrdma_rep *); + unsigned long flags; + + data = data; + spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); + while (!list_empty(&rpcrdma_tasklets_g)) { + rep = list_entry(rpcrdma_tasklets_g.next, + struct rpcrdma_rep, rr_list); + list_del(&rep->rr_list); + func = rep->rr_func; + rep->rr_func = NULL; + spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); + + if (func) + func(rep); + else + rpcrdma_recv_buffer_put(rep); + + spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); + } + spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); +} + +static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL); + +static inline void +rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep) +{ + unsigned long flags; + + spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); + list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g); + spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); + tasklet_schedule(&rpcrdma_tasklet_g); +} + +static void +rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) +{ + struct rpcrdma_ep *ep = context; + + dprintk("RPC: %s: QP error %X on device %s ep %p\n", + __func__, event->event, event->device->name, context); + if (ep->rep_connected == 1) { + ep->rep_connected = -EIO; + ep->rep_func(ep); + wake_up_all(&ep->rep_connect_wait); + } +} + +static void +rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context) +{ + struct rpcrdma_ep *ep = context; + + dprintk("RPC: %s: CQ error %X on device %s ep %p\n", + __func__, event->event, event->device->name, context); + if (ep->rep_connected == 1) { + ep->rep_connected = -EIO; + ep->rep_func(ep); + 
wake_up_all(&ep->rep_connect_wait); + } +} + +static inline +void rpcrdma_event_process(struct ib_wc *wc) +{ + struct rpcrdma_mw *frmr; + struct rpcrdma_rep *rep = + (struct rpcrdma_rep *)(unsigned long) wc->wr_id; + + dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n", + __func__, rep, wc->status, wc->opcode, wc->byte_len); + + if (!rep) /* send or bind completion that we don't care about */ + return; + + if (IB_WC_SUCCESS != wc->status) { + dprintk("RPC: %s: WC opcode %d status %X, connection lost\n", + __func__, wc->opcode, wc->status); + rep->rr_len = ~0U; + if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV) + rpcrdma_schedule_tasklet(rep); + return; + } + + switch (wc->opcode) { + case IB_WC_FAST_REG_MR: + frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + frmr->r.frmr.state = FRMR_IS_VALID; + break; + case IB_WC_LOCAL_INV: + frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + frmr->r.frmr.state = FRMR_IS_INVALID; + break; + case IB_WC_RECV: + rep->rr_len = wc->byte_len; + ib_dma_sync_single_for_cpu( + rdmab_to_ia(rep->rr_buffer)->ri_id->device, + rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE); + /* Keep (only) the most recent credits, after check validity */ + if (rep->rr_len >= 16) { + struct rpcrdma_msg *p = + (struct rpcrdma_msg *) rep->rr_base; + unsigned int credits = ntohl(p->rm_credit); + if (credits == 0) { + dprintk("RPC: %s: server" + " dropped credits to 0!\n", __func__); + /* don't deadlock */ + credits = 1; + } else if (credits > rep->rr_buffer->rb_max_requests) { + dprintk("RPC: %s: server" + " over-crediting: %d (%d)\n", + __func__, credits, + rep->rr_buffer->rb_max_requests); + credits = rep->rr_buffer->rb_max_requests; + } + atomic_set(&rep->rr_buffer->rb_credits, credits); + } + /* fall through */ + case IB_WC_BIND_MW: + rpcrdma_schedule_tasklet(rep); + break; + default: + dprintk("RPC: %s: unexpected WC event %X\n", + __func__, wc->opcode); + break; + } +} + +static inline int +rpcrdma_cq_poll(struct ib_cq *cq) +{ + struct ib_wc wc; + int rc; + + for (;;) { + rc = ib_poll_cq(cq, 1, &wc); + if (rc < 0) { + dprintk("RPC: %s: ib_poll_cq failed %i\n", + __func__, rc); + return rc; + } + if (rc == 0) + break; + + rpcrdma_event_process(&wc); + } + + return 0; +} + +/* + * rpcrdma_cq_event_upcall + * + * This upcall handles recv, send, bind and unbind events. + * It is reentrant but processes single events in order to maintain + * ordering of receives to keep server credits. + * + * It is the responsibility of the scheduled tasklet to return + * recv buffers to the pool. NOTE: this affects synchronization of + * connection shutdown. That is, the structures required for + * the completion of the reply handler must remain intact until + * all memory has been reclaimed. + * + * Note that send events are suppressed and do not result in an upcall. 
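/*
 * Illustration (a user-space sketch, not code from this patch): the
 * IB_WC_RECV case above trusts the server's advertised credit value only
 * within limits -- zero would deadlock the client and anything above
 * rb_max_requests is over-crediting.  The values below are examples.
 */
#include <stdio.h>

static unsigned int clamp_credits(unsigned int advertised,
                                  unsigned int max_requests)
{
        if (advertised == 0)
                return 1;                /* don't deadlock on zero credits */
        if (advertised > max_requests)
                return max_requests;     /* ignore over-crediting          */
        return advertised;
}

int main(void)
{
        printf("%u %u %u\n", clamp_credits(0, 32),
               clamp_credits(100, 32), clamp_credits(16, 32));
        return 0;
}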
+ */ +static void +rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context) +{ + int rc; + + rc = rpcrdma_cq_poll(cq); + if (rc) + return; + + rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + if (rc) { + dprintk("RPC: %s: ib_req_notify_cq failed %i\n", + __func__, rc); + return; + } + + rpcrdma_cq_poll(cq); +} + +#ifdef RPC_DEBUG +static const char * const conn[] = { + "address resolved", + "address error", + "route resolved", + "route error", + "connect request", + "connect response", + "connect error", + "unreachable", + "rejected", + "established", + "disconnected", + "device removal" +}; +#endif + +static int +rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) +{ + struct rpcrdma_xprt *xprt = id->context; + struct rpcrdma_ia *ia = &xprt->rx_ia; + struct rpcrdma_ep *ep = &xprt->rx_ep; +#ifdef RPC_DEBUG + struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr; +#endif + struct ib_qp_attr attr; + struct ib_qp_init_attr iattr; + int connstate = 0; + + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + case RDMA_CM_EVENT_ROUTE_RESOLVED: + ia->ri_async_rc = 0; + complete(&ia->ri_done); + break; + case RDMA_CM_EVENT_ADDR_ERROR: + ia->ri_async_rc = -EHOSTUNREACH; + dprintk("RPC: %s: CM address resolution error, ep 0x%p\n", + __func__, ep); + complete(&ia->ri_done); + break; + case RDMA_CM_EVENT_ROUTE_ERROR: + ia->ri_async_rc = -ENETUNREACH; + dprintk("RPC: %s: CM route resolution error, ep 0x%p\n", + __func__, ep); + complete(&ia->ri_done); + break; + case RDMA_CM_EVENT_ESTABLISHED: + connstate = 1; + ib_query_qp(ia->ri_id->qp, &attr, + IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC, + &iattr); + dprintk("RPC: %s: %d responder resources" + " (%d initiator)\n", + __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic); + goto connected; + case RDMA_CM_EVENT_CONNECT_ERROR: + connstate = -ENOTCONN; + goto connected; + case RDMA_CM_EVENT_UNREACHABLE: + connstate = -ENETDOWN; + goto connected; + case RDMA_CM_EVENT_REJECTED: + connstate = -ECONNREFUSED; + goto connected; + case RDMA_CM_EVENT_DISCONNECTED: + connstate = -ECONNABORTED; + goto connected; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + connstate = -ENODEV; +connected: + dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n", + __func__, + (event->event <= 11) ? conn[event->event] : + "unknown connection error", + &addr->sin_addr.s_addr, + ntohs(addr->sin_port), + ep, event->event); + atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1); + dprintk("RPC: %s: %sconnected\n", + __func__, connstate > 0 ? "" : "dis"); + ep->rep_connected = connstate; + ep->rep_func(ep); + wake_up_all(&ep->rep_connect_wait); + break; + default: + dprintk("RPC: %s: unexpected CM event %d\n", + __func__, event->event); + break; + } + +#ifdef RPC_DEBUG + if (connstate == 1) { + int ird = attr.max_dest_rd_atomic; + int tird = ep->rep_remote_cma.responder_resources; + printk(KERN_INFO "rpcrdma: connection to %pI4:%u " + "on %s, memreg %d slots %d ird %d%s\n", + &addr->sin_addr.s_addr, + ntohs(addr->sin_port), + ia->ri_id->device->name, + ia->ri_memreg_strategy, + xprt->rx_buf.rb_max_requests, + ird, ird < 4 && ird < tird / 2 ? 
" (low!)" : ""); + } else if (connstate < 0) { + printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n", + &addr->sin_addr.s_addr, + ntohs(addr->sin_port), + connstate); + } +#endif + + return 0; +} + +static struct rdma_cm_id * +rpcrdma_create_id(struct rpcrdma_xprt *xprt, + struct rpcrdma_ia *ia, struct sockaddr *addr) +{ + struct rdma_cm_id *id; + int rc; + + init_completion(&ia->ri_done); + + id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(id)) { + rc = PTR_ERR(id); + dprintk("RPC: %s: rdma_create_id() failed %i\n", + __func__, rc); + return id; + } + + ia->ri_async_rc = -ETIMEDOUT; + rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT); + if (rc) { + dprintk("RPC: %s: rdma_resolve_addr() failed %i\n", + __func__, rc); + goto out; + } + wait_for_completion_interruptible_timeout(&ia->ri_done, + msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); + rc = ia->ri_async_rc; + if (rc) + goto out; + + ia->ri_async_rc = -ETIMEDOUT; + rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); + if (rc) { + dprintk("RPC: %s: rdma_resolve_route() failed %i\n", + __func__, rc); + goto out; + } + wait_for_completion_interruptible_timeout(&ia->ri_done, + msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); + rc = ia->ri_async_rc; + if (rc) + goto out; + + return id; + +out: + rdma_destroy_id(id); + return ERR_PTR(rc); +} + +/* + * Drain any cq, prior to teardown. + */ +static void +rpcrdma_clean_cq(struct ib_cq *cq) +{ + struct ib_wc wc; + int count = 0; + + while (1 == ib_poll_cq(cq, 1, &wc)) + ++count; + + if (count) + dprintk("RPC: %s: flushed %d events (last 0x%x)\n", + __func__, count, wc.opcode); +} + +/* + * Exported functions. + */ + +/* + * Open and initialize an Interface Adapter. + * o initializes fields of struct rpcrdma_ia, including + * interface and provider attributes and protection zone. + */ +int +rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) +{ + int rc, mem_priv; + struct ib_device_attr devattr; + struct rpcrdma_ia *ia = &xprt->rx_ia; + + ia->ri_id = rpcrdma_create_id(xprt, ia, addr); + if (IS_ERR(ia->ri_id)) { + rc = PTR_ERR(ia->ri_id); + goto out1; + } + + ia->ri_pd = ib_alloc_pd(ia->ri_id->device); + if (IS_ERR(ia->ri_pd)) { + rc = PTR_ERR(ia->ri_pd); + dprintk("RPC: %s: ib_alloc_pd() failed %i\n", + __func__, rc); + goto out2; + } + + /* + * Query the device to determine if the requested memory + * registration strategy is supported. If it isn't, set the + * strategy to a globally supported model. 
+ */ + rc = ib_query_device(ia->ri_id->device, &devattr); + if (rc) { + dprintk("RPC: %s: ib_query_device failed %d\n", + __func__, rc); + goto out2; + } + + if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) { + ia->ri_have_dma_lkey = 1; + ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey; + } + + switch (memreg) { + case RPCRDMA_MEMWINDOWS: + case RPCRDMA_MEMWINDOWS_ASYNC: + if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) { + dprintk("RPC: %s: MEMWINDOWS registration " + "specified but not supported by adapter, " + "using slower RPCRDMA_REGISTER\n", + __func__); + memreg = RPCRDMA_REGISTER; + } + break; + case RPCRDMA_MTHCAFMR: + if (!ia->ri_id->device->alloc_fmr) { +#if RPCRDMA_PERSISTENT_REGISTRATION + dprintk("RPC: %s: MTHCAFMR registration " + "specified but not supported by adapter, " + "using riskier RPCRDMA_ALLPHYSICAL\n", + __func__); + memreg = RPCRDMA_ALLPHYSICAL; +#else + dprintk("RPC: %s: MTHCAFMR registration " + "specified but not supported by adapter, " + "using slower RPCRDMA_REGISTER\n", + __func__); + memreg = RPCRDMA_REGISTER; +#endif + } + break; + case RPCRDMA_FRMR: + /* Requires both frmr reg and local dma lkey */ + if ((devattr.device_cap_flags & + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { +#if RPCRDMA_PERSISTENT_REGISTRATION + dprintk("RPC: %s: FRMR registration " + "specified but not supported by adapter, " + "using riskier RPCRDMA_ALLPHYSICAL\n", + __func__); + memreg = RPCRDMA_ALLPHYSICAL; +#else + dprintk("RPC: %s: FRMR registration " + "specified but not supported by adapter, " + "using slower RPCRDMA_REGISTER\n", + __func__); + memreg = RPCRDMA_REGISTER; +#endif + } + break; + } + + /* + * Optionally obtain an underlying physical identity mapping in + * order to do a memory window-based bind. This base registration + * is protected from remote access - that is enabled only by binding + * for the specific bytes targeted during each RPC operation, and + * revoked after the corresponding completion similar to a storage + * adapter. + */ + switch (memreg) { + case RPCRDMA_BOUNCEBUFFERS: + case RPCRDMA_REGISTER: + case RPCRDMA_FRMR: + break; +#if RPCRDMA_PERSISTENT_REGISTRATION + case RPCRDMA_ALLPHYSICAL: + mem_priv = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ; + goto register_setup; +#endif + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + mem_priv = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_MW_BIND; + goto register_setup; + case RPCRDMA_MTHCAFMR: + if (ia->ri_have_dma_lkey) + break; + mem_priv = IB_ACCESS_LOCAL_WRITE; + register_setup: + ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); + if (IS_ERR(ia->ri_bind_mem)) { + printk(KERN_ALERT "%s: ib_get_dma_mr for " + "phys register failed with %lX\n\t" + "Will continue with degraded performance\n", + __func__, PTR_ERR(ia->ri_bind_mem)); + memreg = RPCRDMA_REGISTER; + ia->ri_bind_mem = NULL; + } + break; + default: + printk(KERN_ERR "%s: invalid memory registration mode %d\n", + __func__, memreg); + rc = -EINVAL; + goto out2; + } + dprintk("RPC: %s: memory registration strategy is %d\n", + __func__, memreg); + + /* Else will do memory reg/dereg for each chunk */ + ia->ri_memreg_strategy = memreg; + + return 0; +out2: + rdma_destroy_id(ia->ri_id); + ia->ri_id = NULL; +out1: + return rc; +} + +/* + * Clean up/close an IA. + * o if event handles and PD have been initialized, free them. 
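/*
 * Illustrative sketch (not from the original code): the memreg switch above
 * reduces to "keep the requested strategy only if the adapter advertises the
 * capability bits it needs, otherwise fall back to per-chunk RPCRDMA_REGISTER".
 * All ex_* names and flag values below are invented for the example; the real
 * code tests devattr.device_cap_flags and the device's alloc_fmr method.
 */
enum ex_memreg { EX_REGISTER, EX_MEMWINDOWS, EX_FMR, EX_FRMR };

#define EX_CAP_MEM_WINDOW       (1u << 0)  /* stands in for IB_DEVICE_MEM_WINDOW */
#define EX_CAP_FMR              (1u << 1)  /* device provides an alloc_fmr method */
#define EX_CAP_FAST_REG         (1u << 2)  /* stands in for IB_DEVICE_MEM_MGT_EXTENSIONS */
#define EX_CAP_LOCAL_DMA_LKEY   (1u << 3)  /* stands in for IB_DEVICE_LOCAL_DMA_LKEY */

static enum ex_memreg ex_pick_strategy(enum ex_memreg wanted, unsigned int caps)
{
        switch (wanted) {
        case EX_MEMWINDOWS:
                return (caps & EX_CAP_MEM_WINDOW) ? wanted : EX_REGISTER;
        case EX_FMR:
                return (caps & EX_CAP_FMR) ? wanted : EX_REGISTER;
        case EX_FRMR:
                /* FRMR needs both fast-reg work requests and a local DMA lkey */
                if ((caps & (EX_CAP_FAST_REG | EX_CAP_LOCAL_DMA_LKEY)) ==
                    (EX_CAP_FAST_REG | EX_CAP_LOCAL_DMA_LKEY))
                        return wanted;
                return EX_REGISTER;
        default:
                return EX_REGISTER;
        }
}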
+ * o close the IA + */ +void +rpcrdma_ia_close(struct rpcrdma_ia *ia) +{ + int rc; + + dprintk("RPC: %s: entering\n", __func__); + if (ia->ri_bind_mem != NULL) { + rc = ib_dereg_mr(ia->ri_bind_mem); + dprintk("RPC: %s: ib_dereg_mr returned %i\n", + __func__, rc); + } + if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { + if (ia->ri_id->qp) + rdma_destroy_qp(ia->ri_id); + rdma_destroy_id(ia->ri_id); + ia->ri_id = NULL; + } + if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) { + rc = ib_dealloc_pd(ia->ri_pd); + dprintk("RPC: %s: ib_dealloc_pd returned %i\n", + __func__, rc); + } +} + +/* + * Create unconnected endpoint. + */ +int +rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, + struct rpcrdma_create_data_internal *cdata) +{ + struct ib_device_attr devattr; + int rc, err; + + rc = ib_query_device(ia->ri_id->device, &devattr); + if (rc) { + dprintk("RPC: %s: ib_query_device failed %d\n", + __func__, rc); + return rc; + } + + /* check provider's send/recv wr limits */ + if (cdata->max_requests > devattr.max_qp_wr) + cdata->max_requests = devattr.max_qp_wr; + + ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; + ep->rep_attr.qp_context = ep; + /* send_cq and recv_cq initialized below */ + ep->rep_attr.srq = NULL; + ep->rep_attr.cap.max_send_wr = cdata->max_requests; + switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + /* Add room for frmr register and invalidate WRs. + * 1. FRMR reg WR for head + * 2. FRMR invalidate WR for head + * 3. FRMR reg WR for pagelist + * 4. FRMR invalidate WR for pagelist + * 5. FRMR reg WR for tail + * 6. FRMR invalidate WR for tail + * 7. The RDMA_SEND WR + */ + ep->rep_attr.cap.max_send_wr *= 7; + if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) { + cdata->max_requests = devattr.max_qp_wr / 7; + if (!cdata->max_requests) + return -EINVAL; + ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7; + } + break; + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + /* Add room for mw_binds+unbinds - overkill! */ + ep->rep_attr.cap.max_send_wr++; + ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS); + if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) + return -EINVAL; + break; + default: + break; + } + ep->rep_attr.cap.max_recv_wr = cdata->max_requests; + ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2); + ep->rep_attr.cap.max_recv_sge = 1; + ep->rep_attr.cap.max_inline_data = 0; + ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + ep->rep_attr.qp_type = IB_QPT_RC; + ep->rep_attr.port_num = ~0; + + dprintk("RPC: %s: requested max: dtos: send %d recv %d; " + "iovs: send %d recv %d\n", + __func__, + ep->rep_attr.cap.max_send_wr, + ep->rep_attr.cap.max_recv_wr, + ep->rep_attr.cap.max_send_sge, + ep->rep_attr.cap.max_recv_sge); + + /* set trigger for requesting send completion */ + ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/; + switch (ia->ri_memreg_strategy) { + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + ep->rep_cqinit -= RPCRDMA_MAX_SEGS; + break; + default: + break; + } + if (ep->rep_cqinit <= 2) + ep->rep_cqinit = 0; + INIT_CQCOUNT(ep); + ep->rep_ia = ia; + init_waitqueue_head(&ep->rep_connect_wait); + + /* + * Create a single cq for receive dto and mw_bind (only ever + * care about unbind, really). Send completions are suppressed. + * Use single threaded tasklet upcalls to maintain ordering. 
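/*
 * Illustrative sketch (not from the original code): rep_cqinit, set just
 * above, is a "signal every Nth send" trigger.  Sends normally go out
 * unsignaled; when the countdown reaches zero one signaled send is issued
 * and the counter is re-armed.  The real code does this with atomic_t via
 * the INIT_CQCOUNT/DECR_CQCOUNT macros and uses it in rpcrdma_ep_post();
 * this is a single-threaded version of the same idea, ex_* names invented.
 */
struct ex_ep {
        int cqinit;     /* re-arm value, roughly max_send_wr / 2 */
        int cqcount;    /* counts down to the next signaled send */
};

static int ex_send_wants_completion(struct ex_ep *ep)
{
        if (--ep->cqcount > 0)
                return 0;               /* suppress this completion */
        ep->cqcount = ep->cqinit;       /* re-arm the countdown */
        return 1;                       /* request IB_SEND_SIGNALED */
}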
+ */ + ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall, + rpcrdma_cq_async_error_upcall, NULL, + ep->rep_attr.cap.max_recv_wr + + ep->rep_attr.cap.max_send_wr + 1, 0); + if (IS_ERR(ep->rep_cq)) { + rc = PTR_ERR(ep->rep_cq); + dprintk("RPC: %s: ib_create_cq failed: %i\n", + __func__, rc); + goto out1; + } + + rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP); + if (rc) { + dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", + __func__, rc); + goto out2; + } + + ep->rep_attr.send_cq = ep->rep_cq; + ep->rep_attr.recv_cq = ep->rep_cq; + + /* Initialize cma parameters */ + + /* RPC/RDMA does not use private data */ + ep->rep_remote_cma.private_data = NULL; + ep->rep_remote_cma.private_data_len = 0; + + /* Client offers RDMA Read but does not initiate */ + ep->rep_remote_cma.initiator_depth = 0; + if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS) + ep->rep_remote_cma.responder_resources = 0; + else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ + ep->rep_remote_cma.responder_resources = 32; + else + ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom; + + ep->rep_remote_cma.retry_count = 7; + ep->rep_remote_cma.flow_control = 0; + ep->rep_remote_cma.rnr_retry_count = 0; + + return 0; + +out2: + err = ib_destroy_cq(ep->rep_cq); + if (err) + dprintk("RPC: %s: ib_destroy_cq returned %i\n", + __func__, err); +out1: + return rc; +} + +/* + * rpcrdma_ep_destroy + * + * Disconnect and destroy endpoint. After this, the only + * valid operations on the ep are to free it (if dynamically + * allocated) or re-create it. + * + * The caller's error handling must be sure to not leak the endpoint + * if this function fails. + */ +int +rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +{ + int rc; + + dprintk("RPC: %s: entering, connected is %d\n", + __func__, ep->rep_connected); + + if (ia->ri_id->qp) { + rc = rpcrdma_ep_disconnect(ep, ia); + if (rc) + dprintk("RPC: %s: rpcrdma_ep_disconnect" + " returned %i\n", __func__, rc); + rdma_destroy_qp(ia->ri_id); + ia->ri_id->qp = NULL; + } + + /* padding - could be done in rpcrdma_buffer_destroy... */ + if (ep->rep_pad_mr) { + rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad); + ep->rep_pad_mr = NULL; + } + + rpcrdma_clean_cq(ep->rep_cq); + rc = ib_destroy_cq(ep->rep_cq); + if (rc) + dprintk("RPC: %s: ib_destroy_cq returned %i\n", + __func__, rc); + + return rc; +} + +/* + * Connect unconnected endpoint. + */ +int +rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +{ + struct rdma_cm_id *id; + int rc = 0; + int retry_count = 0; + + if (ep->rep_connected != 0) { + struct rpcrdma_xprt *xprt; +retry: + rc = rpcrdma_ep_disconnect(ep, ia); + if (rc && rc != -ENOTCONN) + dprintk("RPC: %s: rpcrdma_ep_disconnect" + " status %i\n", __func__, rc); + rpcrdma_clean_cq(ep->rep_cq); + + xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); + id = rpcrdma_create_id(xprt, ia, + (struct sockaddr *)&xprt->rx_data.addr); + if (IS_ERR(id)) { + rc = PTR_ERR(id); + goto out; + } + /* TEMP TEMP TEMP - fail if new device: + * Deregister/remarshal *all* requests! + * Close and recreate adapter, pd, etc! + * Re-determine all attributes still sane! + * More stuff I haven't thought of! + * Rrrgh! 
+ */ + if (ia->ri_id->device != id->device) { + printk("RPC: %s: can't reconnect on " + "different device!\n", __func__); + rdma_destroy_id(id); + rc = -ENETDOWN; + goto out; + } + /* END TEMP */ + rdma_destroy_qp(ia->ri_id); + rdma_destroy_id(ia->ri_id); + ia->ri_id = id; + } + + rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); + if (rc) { + dprintk("RPC: %s: rdma_create_qp failed %i\n", + __func__, rc); + goto out; + } + +/* XXX Tavor device performs badly with 2K MTU! */ +if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { + struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device); + if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR && + (pcid->vendor == PCI_VENDOR_ID_MELLANOX || + pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) { + struct ib_qp_attr attr = { + .path_mtu = IB_MTU_1024 + }; + rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU); + } +} + + ep->rep_connected = 0; + + rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); + if (rc) { + dprintk("RPC: %s: rdma_connect() failed with %i\n", + __func__, rc); + goto out; + } + + wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); + + /* + * Check state. A non-peer reject indicates no listener + * (ECONNREFUSED), which may be a transient state. All + * others indicate a transport condition which has already + * undergone a best-effort. + */ + if (ep->rep_connected == -ECONNREFUSED && + ++retry_count <= RDMA_CONNECT_RETRY_MAX) { + dprintk("RPC: %s: non-peer_reject, retry\n", __func__); + goto retry; + } + if (ep->rep_connected <= 0) { + /* Sometimes, the only way to reliably connect to remote + * CMs is to use same nonzero values for ORD and IRD. */ + if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 && + (ep->rep_remote_cma.responder_resources == 0 || + ep->rep_remote_cma.initiator_depth != + ep->rep_remote_cma.responder_resources)) { + if (ep->rep_remote_cma.responder_resources == 0) + ep->rep_remote_cma.responder_resources = 1; + ep->rep_remote_cma.initiator_depth = + ep->rep_remote_cma.responder_resources; + goto retry; + } + rc = ep->rep_connected; + } else { + dprintk("RPC: %s: connected\n", __func__); + } + +out: + if (rc) + ep->rep_connected = rc; + return rc; +} + +/* + * rpcrdma_ep_disconnect + * + * This is separate from destroy to facilitate the ability + * to reconnect without recreating the endpoint. + * + * This call is not reentrant, and must not be made in parallel + * on the same endpoint. + */ +int +rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +{ + int rc; + + rpcrdma_clean_cq(ep->rep_cq); + rc = rdma_disconnect(ia->ri_id); + if (!rc) { + /* returns without wait if not connected */ + wait_event_interruptible(ep->rep_connect_wait, + ep->rep_connected != 1); + dprintk("RPC: %s: after wait, %sconnected\n", __func__, + (ep->rep_connected == 1) ? "still " : "dis"); + } else { + dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc); + ep->rep_connected = rc; + } + return rc; +} + +/* + * Initialize buffer memory + */ +int +rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, + struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) +{ + char *p; + size_t len; + int i, rc; + struct rpcrdma_mw *r; + + buf->rb_max_requests = cdata->max_requests; + spin_lock_init(&buf->rb_lock); + atomic_set(&buf->rb_credits, 1); + + /* Need to allocate: + * 1. arrays for send and recv pointers + * 2. arrays of struct rpcrdma_req to fill in pointers + * 3. array of struct rpcrdma_rep for replies + * 4. padding, if any + * 5. 
mw's, fmr's or frmr's, if any + * Send/recv buffers in req/rep need to be registered + */ + + len = buf->rb_max_requests * + (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *)); + len += cdata->padding; + switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + len += buf->rb_max_requests * RPCRDMA_MAX_SEGS * + sizeof(struct rpcrdma_mw); + break; + case RPCRDMA_MTHCAFMR: + /* TBD we are perhaps overallocating here */ + len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * + sizeof(struct rpcrdma_mw); + break; + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * + sizeof(struct rpcrdma_mw); + break; + default: + break; + } + + /* allocate 1, 4 and 5 in one shot */ + p = kzalloc(len, GFP_KERNEL); + if (p == NULL) { + dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n", + __func__, len); + rc = -ENOMEM; + goto out; + } + buf->rb_pool = p; /* for freeing it later */ + + buf->rb_send_bufs = (struct rpcrdma_req **) p; + p = (char *) &buf->rb_send_bufs[buf->rb_max_requests]; + buf->rb_recv_bufs = (struct rpcrdma_rep **) p; + p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests]; + + /* + * Register the zeroed pad buffer, if any. + */ + if (cdata->padding) { + rc = rpcrdma_register_internal(ia, p, cdata->padding, + &ep->rep_pad_mr, &ep->rep_pad); + if (rc) + goto out; + } + p += cdata->padding; + + /* + * Allocate the fmr's, or mw's for mw_bind chunk registration. + * We "cycle" the mw's in order to minimize rkey reuse, + * and also reduce unbind-to-bind collision. + */ + INIT_LIST_HEAD(&buf->rb_mws); + r = (struct rpcrdma_mw *)p; + switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) { + r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, + RPCRDMA_MAX_SEGS); + if (IS_ERR(r->r.frmr.fr_mr)) { + rc = PTR_ERR(r->r.frmr.fr_mr); + dprintk("RPC: %s: ib_alloc_fast_reg_mr" + " failed %i\n", __func__, rc); + goto out; + } + r->r.frmr.fr_pgl = + ib_alloc_fast_reg_page_list(ia->ri_id->device, + RPCRDMA_MAX_SEGS); + if (IS_ERR(r->r.frmr.fr_pgl)) { + rc = PTR_ERR(r->r.frmr.fr_pgl); + dprintk("RPC: %s: " + "ib_alloc_fast_reg_page_list " + "failed %i\n", __func__, rc); + goto out; + } + list_add(&r->mw_list, &buf->rb_mws); + ++r; + } + break; + case RPCRDMA_MTHCAFMR: + /* TBD we are perhaps overallocating here */ + for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { + static struct ib_fmr_attr fa = + { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT }; + r->r.fmr = ib_alloc_fmr(ia->ri_pd, + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ, + &fa); + if (IS_ERR(r->r.fmr)) { + rc = PTR_ERR(r->r.fmr); + dprintk("RPC: %s: ib_alloc_fmr" + " failed %i\n", __func__, rc); + goto out; + } + list_add(&r->mw_list, &buf->rb_mws); + ++r; + } + break; + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + /* Allocate one extra request's worth, for full cycling */ + for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { + r->r.mw = ib_alloc_mw(ia->ri_pd); + if (IS_ERR(r->r.mw)) { + rc = PTR_ERR(r->r.mw); + dprintk("RPC: %s: ib_alloc_mw" + " failed %i\n", __func__, rc); + goto out; + } + list_add(&r->mw_list, &buf->rb_mws); + ++r; + } + break; + default: + break; + } + + /* + * Allocate/init the request/reply buffers. Doing this + * using kmalloc for now -- one for each buf. 
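/*
 * Illustrative sketch (not from the original code): the single kzalloc above
 * is carved up by walking a byte pointer through it -- first the two arrays
 * of request/reply pointers, then the zeroed pad area, then the rpcrdma_mw
 * entries.  A userspace version of the carving, with invented ex_* names:
 */
#include <stdlib.h>

struct ex_pool {
        void **send_bufs;
        void **recv_bufs;
        char  *pad;             /* zeroed padding region, padlen bytes */
        void  *pool;            /* base address, kept only for free() */
};

static int ex_pool_init(struct ex_pool *pl, int nreq, size_t padlen)
{
        char *p = calloc(1, 2 * nreq * sizeof(void *) + padlen);

        if (p == NULL)
                return -1;
        pl->pool = p;
        pl->send_bufs = (void **)p;
        p += nreq * sizeof(void *);
        pl->recv_bufs = (void **)p;
        p += nreq * sizeof(void *);
        pl->pad = p;
        return 0;
}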
+ */ + for (i = 0; i < buf->rb_max_requests; i++) { + struct rpcrdma_req *req; + struct rpcrdma_rep *rep; + + len = cdata->inline_wsize + sizeof(struct rpcrdma_req); + /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */ + /* Typical ~2400b, so rounding up saves work later */ + if (len < 4096) + len = 4096; + req = kmalloc(len, GFP_KERNEL); + if (req == NULL) { + dprintk("RPC: %s: request buffer %d alloc" + " failed\n", __func__, i); + rc = -ENOMEM; + goto out; + } + memset(req, 0, sizeof(struct rpcrdma_req)); + buf->rb_send_bufs[i] = req; + buf->rb_send_bufs[i]->rl_buffer = buf; + + rc = rpcrdma_register_internal(ia, req->rl_base, + len - offsetof(struct rpcrdma_req, rl_base), + &buf->rb_send_bufs[i]->rl_handle, + &buf->rb_send_bufs[i]->rl_iov); + if (rc) + goto out; + + buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req); + + len = cdata->inline_rsize + sizeof(struct rpcrdma_rep); + rep = kmalloc(len, GFP_KERNEL); + if (rep == NULL) { + dprintk("RPC: %s: reply buffer %d alloc failed\n", + __func__, i); + rc = -ENOMEM; + goto out; + } + memset(rep, 0, sizeof(struct rpcrdma_rep)); + buf->rb_recv_bufs[i] = rep; + buf->rb_recv_bufs[i]->rr_buffer = buf; + init_waitqueue_head(&rep->rr_unbind); + + rc = rpcrdma_register_internal(ia, rep->rr_base, + len - offsetof(struct rpcrdma_rep, rr_base), + &buf->rb_recv_bufs[i]->rr_handle, + &buf->rb_recv_bufs[i]->rr_iov); + if (rc) + goto out; + + } + dprintk("RPC: %s: max_requests %d\n", + __func__, buf->rb_max_requests); + /* done */ + return 0; +out: + rpcrdma_buffer_destroy(buf); + return rc; +} + +/* + * Unregister and destroy buffer memory. Need to deal with + * partial initialization, so it's callable from failed create. + * Must be called before destroying endpoint, as registrations + * reference it. + */ +void +rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) +{ + int rc, i; + struct rpcrdma_ia *ia = rdmab_to_ia(buf); + struct rpcrdma_mw *r; + + /* clean up in reverse order from create + * 1. recv mr memory (mr free, then kfree) + * 1a. bind mw memory + * 2. send mr memory (mr free, then kfree) + * 3. padding (if any) [moved to rpcrdma_ep_destroy] + * 4. arrays + */ + dprintk("RPC: %s: entering\n", __func__); + + for (i = 0; i < buf->rb_max_requests; i++) { + if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) { + rpcrdma_deregister_internal(ia, + buf->rb_recv_bufs[i]->rr_handle, + &buf->rb_recv_bufs[i]->rr_iov); + kfree(buf->rb_recv_bufs[i]); + } + if (buf->rb_send_bufs && buf->rb_send_bufs[i]) { + while (!list_empty(&buf->rb_mws)) { + r = list_entry(buf->rb_mws.next, + struct rpcrdma_mw, mw_list); + list_del(&r->mw_list); + switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s:" + " ib_dereg_mr" + " failed %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); + break; + case RPCRDMA_MTHCAFMR: + rc = ib_dealloc_fmr(r->r.fmr); + if (rc) + dprintk("RPC: %s:" + " ib_dealloc_fmr" + " failed %i\n", + __func__, rc); + break; + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + rc = ib_dealloc_mw(r->r.mw); + if (rc) + dprintk("RPC: %s:" + " ib_dealloc_mw" + " failed %i\n", + __func__, rc); + break; + default: + break; + } + } + rpcrdma_deregister_internal(ia, + buf->rb_send_bufs[i]->rl_handle, + &buf->rb_send_bufs[i]->rl_iov); + kfree(buf->rb_send_bufs[i]); + } + } + + kfree(buf->rb_pool); +} + +/* + * Get a set of request/reply buffers. + * + * Reply buffer (if needed) is attached to send buffer upon return. 
+ * Rule: + * rb_send_index and rb_recv_index MUST always be pointing to the + * *next* available buffer (non-NULL). They are incremented after + * removing buffers, and decremented *before* returning them. + */ +struct rpcrdma_req * +rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) +{ + struct rpcrdma_req *req; + unsigned long flags; + int i; + struct rpcrdma_mw *r; + + spin_lock_irqsave(&buffers->rb_lock, flags); + if (buffers->rb_send_index == buffers->rb_max_requests) { + spin_unlock_irqrestore(&buffers->rb_lock, flags); + dprintk("RPC: %s: out of request buffers\n", __func__); + return ((struct rpcrdma_req *)NULL); + } + + req = buffers->rb_send_bufs[buffers->rb_send_index]; + if (buffers->rb_send_index < buffers->rb_recv_index) { + dprintk("RPC: %s: %d extra receives outstanding (ok)\n", + __func__, + buffers->rb_recv_index - buffers->rb_send_index); + req->rl_reply = NULL; + } else { + req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index]; + buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL; + } + buffers->rb_send_bufs[buffers->rb_send_index++] = NULL; + if (!list_empty(&buffers->rb_mws)) { + i = RPCRDMA_MAX_SEGS - 1; + do { + r = list_entry(buffers->rb_mws.next, + struct rpcrdma_mw, mw_list); + list_del(&r->mw_list); + req->rl_segments[i].mr_chunk.rl_mw = r; + } while (--i >= 0); + } + spin_unlock_irqrestore(&buffers->rb_lock, flags); + return req; +} + +/* + * Put request/reply buffers back into pool. + * Pre-decrement counter/array index. + */ +void +rpcrdma_buffer_put(struct rpcrdma_req *req) +{ + struct rpcrdma_buffer *buffers = req->rl_buffer; + struct rpcrdma_ia *ia = rdmab_to_ia(buffers); + int i; + unsigned long flags; + + BUG_ON(req->rl_nchunks != 0); + spin_lock_irqsave(&buffers->rb_lock, flags); + buffers->rb_send_bufs[--buffers->rb_send_index] = req; + req->rl_niovs = 0; + if (req->rl_reply) { + buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply; + init_waitqueue_head(&req->rl_reply->rr_unbind); + req->rl_reply->rr_func = NULL; + req->rl_reply = NULL; + } + switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + case RPCRDMA_MTHCAFMR: + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + /* + * Cycle mw's back in reverse order, and "spin" them. + * This delays and scrambles reuse as much as possible. + */ + i = 1; + do { + struct rpcrdma_mw **mw; + mw = &req->rl_segments[i].mr_chunk.rl_mw; + list_add_tail(&(*mw)->mw_list, &buffers->rb_mws); + *mw = NULL; + } while (++i < RPCRDMA_MAX_SEGS); + list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list, + &buffers->rb_mws); + req->rl_segments[0].mr_chunk.rl_mw = NULL; + break; + default: + break; + } + spin_unlock_irqrestore(&buffers->rb_lock, flags); +} + +/* + * Recover reply buffers from pool. + * This happens when recovering from error conditions. + * Post-increment counter/array index. + */ +void +rpcrdma_recv_buffer_get(struct rpcrdma_req *req) +{ + struct rpcrdma_buffer *buffers = req->rl_buffer; + unsigned long flags; + + if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */ + buffers = ((struct rpcrdma_req *) buffers)->rl_buffer; + spin_lock_irqsave(&buffers->rb_lock, flags); + if (buffers->rb_recv_index < buffers->rb_max_requests) { + req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index]; + buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL; + } + spin_unlock_irqrestore(&buffers->rb_lock, flags); +} + +/* + * Put reply buffers back into pool when not attached to + * request. This happens in error conditions, and when + * aborting unbinds. 
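/*
 * Illustrative sketch (not from the original code): rb_send_index and
 * rb_recv_index behave like stack pointers over the pointer arrays -- take
 * at the current index and post-increment on get, pre-decrement and store
 * on put, so the index always names the next available slot.  The real
 * functions do this under rb_lock; locking is omitted here and the ex_*
 * names are invented.
 */
struct ex_bufpool {
        void **bufs;    /* bufs[index .. max-1] hold available buffers */
        int index;
        int max;
};

static void *ex_buf_get(struct ex_bufpool *bp)
{
        void *buf;

        if (bp->index == bp->max)
                return NULL;                    /* pool exhausted */
        buf = bp->bufs[bp->index];
        bp->bufs[bp->index++] = NULL;           /* take, then advance */
        return buf;
}

static void ex_buf_put(struct ex_bufpool *bp, void *buf)
{
        bp->bufs[--bp->index] = buf;            /* back up, then restore */
}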
Pre-decrement counter/array index. + */ +void +rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) +{ + struct rpcrdma_buffer *buffers = rep->rr_buffer; + unsigned long flags; + + rep->rr_func = NULL; + spin_lock_irqsave(&buffers->rb_lock, flags); + buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep; + spin_unlock_irqrestore(&buffers->rb_lock, flags); +} + +/* + * Wrappers for internal-use kmalloc memory registration, used by buffer code. + */ + +int +rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, + struct ib_mr **mrp, struct ib_sge *iov) +{ + struct ib_phys_buf ipb; + struct ib_mr *mr; + int rc; + + /* + * All memory passed here was kmalloc'ed, therefore phys-contiguous. + */ + iov->addr = ib_dma_map_single(ia->ri_id->device, + va, len, DMA_BIDIRECTIONAL); + iov->length = len; + + if (ia->ri_have_dma_lkey) { + *mrp = NULL; + iov->lkey = ia->ri_dma_lkey; + return 0; + } else if (ia->ri_bind_mem != NULL) { + *mrp = NULL; + iov->lkey = ia->ri_bind_mem->lkey; + return 0; + } + + ipb.addr = iov->addr; + ipb.size = iov->length; + mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1, + IB_ACCESS_LOCAL_WRITE, &iov->addr); + + dprintk("RPC: %s: phys convert: 0x%llx " + "registered 0x%llx length %d\n", + __func__, (unsigned long long)ipb.addr, + (unsigned long long)iov->addr, len); + + if (IS_ERR(mr)) { + *mrp = NULL; + rc = PTR_ERR(mr); + dprintk("RPC: %s: failed with %i\n", __func__, rc); + } else { + *mrp = mr; + iov->lkey = mr->lkey; + rc = 0; + } + + return rc; +} + +int +rpcrdma_deregister_internal(struct rpcrdma_ia *ia, + struct ib_mr *mr, struct ib_sge *iov) +{ + int rc; + + ib_dma_unmap_single(ia->ri_id->device, + iov->addr, iov->length, DMA_BIDIRECTIONAL); + + if (NULL == mr) + return 0; + + rc = ib_dereg_mr(mr); + if (rc) + dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc); + return rc; +} + +/* + * Wrappers for chunk registration, shared by read/write chunk code. + */ + +static void +rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing) +{ + seg->mr_dir = writing ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE; + seg->mr_dmalen = seg->mr_len; + if (seg->mr_page) + seg->mr_dma = ib_dma_map_page(ia->ri_id->device, + seg->mr_page, offset_in_page(seg->mr_offset), + seg->mr_dmalen, seg->mr_dir); + else + seg->mr_dma = ib_dma_map_single(ia->ri_id->device, + seg->mr_offset, + seg->mr_dmalen, seg->mr_dir); + if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) { + dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n", + __func__, + (unsigned long long)seg->mr_dma, + seg->mr_offset, seg->mr_dmalen); + } +} + +static void +rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg) +{ + if (seg->mr_page) + ib_dma_unmap_page(ia->ri_id->device, + seg->mr_dma, seg->mr_dmalen, seg->mr_dir); + else + ib_dma_unmap_single(ia->ri_id->device, + seg->mr_dma, seg->mr_dmalen, seg->mr_dir); +} + +static int +rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia, + struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_mr_seg *seg1 = seg; + struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr; + + u8 key; + int len, pageoff; + int i, rc; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (*nsegs > RPCRDMA_MAX_DATA_SEGS) + *nsegs = RPCRDMA_MAX_DATA_SEGS; + for (i = 0; i < *nsegs;) { + rpcrdma_map_one(ia, seg, writing); + seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma; + len += seg->mr_len; + BUG_ON(seg->mr_len > PAGE_SIZE); + ++seg; + ++i; + /* Check for holes */ + if ((i < *nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + dprintk("RPC: %s: Using frmr %p to map %d segments\n", + __func__, seg1->mr_chunk.rl_mw, i); + + if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) { + dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n", + __func__, + seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey); + /* Invalidate before using. */ + memset(&invalidate_wr, 0, sizeof invalidate_wr); + invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; + invalidate_wr.next = &frmr_wr; + invalidate_wr.opcode = IB_WR_LOCAL_INV; + invalidate_wr.send_flags = IB_SEND_SIGNALED; + invalidate_wr.ex.invalidate_rkey = + seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + post_wr = &invalidate_wr; + } else + post_wr = &frmr_wr; + + /* Bump the key */ + key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); + ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); + + /* Prepare FRMR WR */ + memset(&frmr_wr, 0, sizeof frmr_wr); + frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; + frmr_wr.opcode = IB_WR_FAST_REG_MR; + frmr_wr.send_flags = IB_SEND_SIGNALED; + frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma; + frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl; + frmr_wr.wr.fast_reg.page_list_len = i; + frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT; + BUG_ON(frmr_wr.wr.fast_reg.length < len); + frmr_wr.wr.fast_reg.access_flags = (writing ? 
+ IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_REMOTE_READ); + frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr); + + if (rc) { + dprintk("RPC: %s: failed ib_post_send for register," + " status %i\n", __func__, rc); + while (i--) + rpcrdma_unmap_one(ia, --seg); + } else { + seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + } + *nsegs = i; + return rc; +} + +static int +rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_mr_seg *seg1 = seg; + struct ib_send_wr invalidate_wr, *bad_wr; + int rc; + + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + + memset(&invalidate_wr, 0, sizeof invalidate_wr); + invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; + invalidate_wr.opcode = IB_WR_LOCAL_INV; + invalidate_wr.send_flags = IB_SEND_SIGNALED; + invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + if (rc) + dprintk("RPC: %s: failed ib_post_send for invalidate," + " status %i\n", __func__, rc); + return rc; +} + +static int +rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia) +{ + struct rpcrdma_mr_seg *seg1 = seg; + u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; + int len, pageoff, i, rc; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (*nsegs > RPCRDMA_MAX_DATA_SEGS) + *nsegs = RPCRDMA_MAX_DATA_SEGS; + for (i = 0; i < *nsegs;) { + rpcrdma_map_one(ia, seg, writing); + physaddrs[i] = seg->mr_dma; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < *nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr, + physaddrs, i, seg1->mr_dma); + if (rc) { + dprintk("RPC: %s: failed ib_map_phys_fmr " + "%u@0x%llx+%i (%d)... status %i\n", __func__, + len, (unsigned long long)seg1->mr_dma, + pageoff, i, rc); + while (i--) + rpcrdma_unmap_one(ia, --seg); + } else { + seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + } + *nsegs = i; + return rc; +} + +static int +rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia) +{ + struct rpcrdma_mr_seg *seg1 = seg; + LIST_HEAD(l); + int rc; + + list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l); + rc = ib_unmap_fmr(&l); + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + if (rc) + dprintk("RPC: %s: failed ib_unmap_fmr," + " status %i\n", __func__, rc); + return rc; +} + +static int +rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia, + struct rpcrdma_xprt *r_xprt) +{ + int mem_priv = (writing ? 
IB_ACCESS_REMOTE_WRITE : + IB_ACCESS_REMOTE_READ); + struct ib_mw_bind param; + int rc; + + *nsegs = 1; + rpcrdma_map_one(ia, seg, writing); + param.mr = ia->ri_bind_mem; + param.wr_id = 0ULL; /* no send cookie */ + param.addr = seg->mr_dma; + param.length = seg->mr_len; + param.send_flags = 0; + param.mw_access_flags = mem_priv; + + DECR_CQCOUNT(&r_xprt->rx_ep); + rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m); + if (rc) { + dprintk("RPC: %s: failed ib_bind_mw " + "%u@0x%llx status %i\n", + __func__, seg->mr_len, + (unsigned long long)seg->mr_dma, rc); + rpcrdma_unmap_one(ia, seg); + } else { + seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey; + seg->mr_base = param.addr; + seg->mr_nsegs = 1; + } + return rc; +} + +static int +rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia, + struct rpcrdma_xprt *r_xprt, void **r) +{ + struct ib_mw_bind param; + LIST_HEAD(l); + int rc; + + BUG_ON(seg->mr_nsegs != 1); + param.mr = ia->ri_bind_mem; + param.addr = 0ULL; /* unbind */ + param.length = 0; + param.mw_access_flags = 0; + if (*r) { + param.wr_id = (u64) (unsigned long) *r; + param.send_flags = IB_SEND_SIGNALED; + INIT_CQCOUNT(&r_xprt->rx_ep); + } else { + param.wr_id = 0ULL; + param.send_flags = 0; + DECR_CQCOUNT(&r_xprt->rx_ep); + } + rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m); + rpcrdma_unmap_one(ia, seg); + if (rc) + dprintk("RPC: %s: failed ib_(un)bind_mw," + " status %i\n", __func__, rc); + else + *r = NULL; /* will upcall on completion */ + return rc; +} + +static int +rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia) +{ + int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE : + IB_ACCESS_REMOTE_READ); + struct rpcrdma_mr_seg *seg1 = seg; + struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS]; + int len, i, rc = 0; + + if (*nsegs > RPCRDMA_MAX_DATA_SEGS) + *nsegs = RPCRDMA_MAX_DATA_SEGS; + for (len = 0, i = 0; i < *nsegs;) { + rpcrdma_map_one(ia, seg, writing); + ipb[i].addr = seg->mr_dma; + ipb[i].size = seg->mr_len; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < *nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) + break; + } + seg1->mr_base = seg1->mr_dma; + seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd, + ipb, i, mem_priv, &seg1->mr_base); + if (IS_ERR(seg1->mr_chunk.rl_mr)) { + rc = PTR_ERR(seg1->mr_chunk.rl_mr); + dprintk("RPC: %s: failed ib_reg_phys_mr " + "%u@0x%llx (%d)... 
status %i\n", + __func__, len, + (unsigned long long)seg1->mr_dma, i, rc); + while (i--) + rpcrdma_unmap_one(ia, --seg); + } else { + seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey; + seg1->mr_nsegs = i; + seg1->mr_len = len; + } + *nsegs = i; + return rc; +} + +static int +rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia) +{ + struct rpcrdma_mr_seg *seg1 = seg; + int rc; + + rc = ib_dereg_mr(seg1->mr_chunk.rl_mr); + seg1->mr_chunk.rl_mr = NULL; + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + if (rc) + dprintk("RPC: %s: failed ib_dereg_mr," + " status %i\n", __func__, rc); + return rc; +} + +int +rpcrdma_register_external(struct rpcrdma_mr_seg *seg, + int nsegs, int writing, struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + int rc = 0; + + switch (ia->ri_memreg_strategy) { + +#if RPCRDMA_PERSISTENT_REGISTRATION + case RPCRDMA_ALLPHYSICAL: + rpcrdma_map_one(ia, seg, writing); + seg->mr_rkey = ia->ri_bind_mem->rkey; + seg->mr_base = seg->mr_dma; + seg->mr_nsegs = 1; + nsegs = 1; + break; +#endif + + /* Registration using frmr registration */ + case RPCRDMA_FRMR: + rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt); + break; + + /* Registration using fmr memory registration */ + case RPCRDMA_MTHCAFMR: + rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); + break; + + /* Registration using memory windows */ + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt); + break; + + /* Default registration each time */ + default: + rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia); + break; + } + if (rc) + return -1; + + return nsegs; +} + +int +rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_xprt *r_xprt, void *r) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + int nsegs = seg->mr_nsegs, rc; + + switch (ia->ri_memreg_strategy) { + +#if RPCRDMA_PERSISTENT_REGISTRATION + case RPCRDMA_ALLPHYSICAL: + BUG_ON(nsegs != 1); + rpcrdma_unmap_one(ia, seg); + rc = 0; + break; +#endif + + case RPCRDMA_FRMR: + rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt); + break; + + case RPCRDMA_MTHCAFMR: + rc = rpcrdma_deregister_fmr_external(seg, ia); + break; + + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r); + break; + + default: + rc = rpcrdma_deregister_default_external(seg, ia); + break; + } + if (r) { + struct rpcrdma_rep *rep = r; + void (*func)(struct rpcrdma_rep *) = rep->rr_func; + rep->rr_func = NULL; + func(rep); /* dereg done, callback now */ + } + return nsegs; +} + +/* + * Prepost any receive buffer, then post send. + * + * Receive buffer is donated to hardware, reclaimed upon recv completion. 
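/*
 * Illustrative sketch (not from the original code): rpcrdma_ep_post() below
 * posts the reply buffer to the receive queue before the request is sent,
 * so a fast server cannot answer before a receive is available.  Stub
 * functions with invented ex_* names, shown only to make the ordering
 * explicit:
 */
static int ex_post_recv(void *rep) { (void)rep; return 0; }     /* stand-in */
static int ex_post_send(void *req) { (void)req; return 0; }     /* stand-in */

static int ex_post_request(void *req, void *rep)
{
        int rc;

        if (rep != NULL) {
                rc = ex_post_recv(rep);         /* donate reply buffer first */
                if (rc)
                        return rc;
        }
        return ex_post_send(req);               /* only then post the send */
}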
+ */ +int +rpcrdma_ep_post(struct rpcrdma_ia *ia, + struct rpcrdma_ep *ep, + struct rpcrdma_req *req) +{ + struct ib_send_wr send_wr, *send_wr_fail; + struct rpcrdma_rep *rep = req->rl_reply; + int rc; + + if (rep) { + rc = rpcrdma_ep_post_recv(ia, ep, rep); + if (rc) + goto out; + req->rl_reply = NULL; + } + + send_wr.next = NULL; + send_wr.wr_id = 0ULL; /* no send cookie */ + send_wr.sg_list = req->rl_send_iov; + send_wr.num_sge = req->rl_niovs; + send_wr.opcode = IB_WR_SEND; + if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */ + ib_dma_sync_single_for_device(ia->ri_id->device, + req->rl_send_iov[3].addr, req->rl_send_iov[3].length, + DMA_TO_DEVICE); + ib_dma_sync_single_for_device(ia->ri_id->device, + req->rl_send_iov[1].addr, req->rl_send_iov[1].length, + DMA_TO_DEVICE); + ib_dma_sync_single_for_device(ia->ri_id->device, + req->rl_send_iov[0].addr, req->rl_send_iov[0].length, + DMA_TO_DEVICE); + + if (DECR_CQCOUNT(ep) > 0) + send_wr.send_flags = 0; + else { /* Provider must take a send completion every now and then */ + INIT_CQCOUNT(ep); + send_wr.send_flags = IB_SEND_SIGNALED; + } + + rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); + if (rc) + dprintk("RPC: %s: ib_post_send returned %i\n", __func__, + rc); +out: + return rc; +} + +/* + * (Re)post a receive buffer. + */ +int +rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, + struct rpcrdma_ep *ep, + struct rpcrdma_rep *rep) +{ + struct ib_recv_wr recv_wr, *recv_wr_fail; + int rc; + + recv_wr.next = NULL; + recv_wr.wr_id = (u64) (unsigned long) rep; + recv_wr.sg_list = &rep->rr_iov; + recv_wr.num_sge = 1; + + ib_dma_sync_single_for_cpu(ia->ri_id->device, + rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); + + DECR_CQCOUNT(ep); + rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); + + if (rc) + dprintk("RPC: %s: ib_post_recv returned %i\n", __func__, + rc); + return rc; +} diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h new file mode 100644 index 00000000..cae761a8 --- /dev/null +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_SUNRPC_XPRT_RDMA_H +#define _LINUX_SUNRPC_XPRT_RDMA_H + +#include /* wait_queue_head_t, etc */ +#include /* spinlock_t, etc */ +#include /* atomic_t, etc */ + +#include /* RDMA connection api */ +#include /* RDMA verbs api */ + +#include /* rpc_xprt */ +#include /* RPC/RDMA protocol */ +#include /* xprt parameters */ + +#define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */ +#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ + +/* + * Interface Adapter -- one per transport instance + */ +struct rpcrdma_ia { + struct rdma_cm_id *ri_id; + struct ib_pd *ri_pd; + struct ib_mr *ri_bind_mem; + u32 ri_dma_lkey; + int ri_have_dma_lkey; + struct completion ri_done; + int ri_async_rc; + enum rpcrdma_memreg ri_memreg_strategy; +}; + +/* + * RDMA Endpoint -- one per transport instance + */ + +struct rpcrdma_ep { + atomic_t rep_cqcount; + int rep_cqinit; + int rep_connected; + struct rpcrdma_ia *rep_ia; + struct ib_cq *rep_cq; + struct ib_qp_init_attr rep_attr; + wait_queue_head_t rep_connect_wait; + struct ib_sge rep_pad; /* holds zeroed pad */ + struct ib_mr *rep_pad_mr; /* holds zeroed pad */ + void (*rep_func)(struct rpcrdma_ep *); + struct rpc_xprt *rep_xprt; /* for rep_func */ + struct rdma_conn_param rep_remote_cma; + struct sockaddr_storage rep_remote_addr; +}; + +#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) +#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) + +/* + * struct rpcrdma_rep -- this structure encapsulates state required to recv + * and complete a reply, asychronously. It needs several pieces of + * state: + * o recv buffer (posted to provider) + * o ib_sge (also donated to provider) + * o status of reply (length, success or not) + * o bookkeeping state to get run by tasklet (list, etc) + * + * These are allocated during initialization, per-transport instance; + * however, the tasklet execution list itself is global, as it should + * always be pretty short. + * + * N of these are associated with a transport instance, and stored in + * struct rpcrdma_buffer. N is the max number of outstanding requests. 
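/*
 * Illustrative sketch (not from the original code): completed replies are
 * queued on one global list and drained in order by a single tasklet.  The
 * real code uses a struct list_head protected by a spinlock; this is a
 * stripped-down FIFO version of the same producer/consumer split, with
 * invented ex_* names:
 */
struct ex_reply {
        struct ex_reply *next;
        unsigned int len;
};

static struct ex_reply *ex_head;
static struct ex_reply **ex_tail = &ex_head;

static void ex_queue_reply(struct ex_reply *rep)        /* completion upcall side */
{
        rep->next = NULL;
        *ex_tail = rep;
        ex_tail = &rep->next;
}

static void ex_drain_replies(void (*handler)(struct ex_reply *))  /* tasklet side */
{
        while (ex_head != NULL) {
                struct ex_reply *rep = ex_head;

                ex_head = rep->next;
                if (ex_head == NULL)
                        ex_tail = &ex_head;
                handler(rep);
        }
}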
+ */ + +/* temporary static scatter/gather max */ +#define RPCRDMA_MAX_DATA_SEGS (8) /* max scatter/gather */ +#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */ +#define MAX_RPCRDMAHDR (\ + /* max supported RPC/RDMA header */ \ + sizeof(struct rpcrdma_msg) + (2 * sizeof(u32)) + \ + (sizeof(struct rpcrdma_read_chunk) * RPCRDMA_MAX_SEGS) + sizeof(u32)) + +struct rpcrdma_buffer; + +struct rpcrdma_rep { + unsigned int rr_len; /* actual received reply length */ + struct rpcrdma_buffer *rr_buffer; /* home base for this structure */ + struct rpc_xprt *rr_xprt; /* needed for request/reply matching */ + void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */ + struct list_head rr_list; /* tasklet list */ + wait_queue_head_t rr_unbind; /* optional unbind wait */ + struct ib_sge rr_iov; /* for posting */ + struct ib_mr *rr_handle; /* handle for mem in rr_iov */ + char rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */ +}; + +/* + * struct rpcrdma_req -- structure central to the request/reply sequence. + * + * N of these are associated with a transport instance, and stored in + * struct rpcrdma_buffer. N is the max number of outstanding requests. + * + * It includes pre-registered buffer memory for send AND recv. + * The recv buffer, however, is not owned by this structure, and + * is "donated" to the hardware when a recv is posted. When a + * reply is handled, the recv buffer used is given back to the + * struct rpcrdma_req associated with the request. + * + * In addition to the basic memory, this structure includes an array + * of iovs for send operations. The reason is that the iovs passed to + * ib_post_{send,recv} must not be modified until the work request + * completes. + * + * NOTES: + * o RPCRDMA_MAX_SEGS is the max number of addressible chunk elements we + * marshal. The number needed varies depending on the iov lists that + * are passed to us, the memory registration mode we are in, and if + * physical addressing is used, the layout. 
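/*
 * Illustrative sketch (not from the original code): RPCRDMA_MAX_SEGS is
 * RPCRDMA_MAX_DATA_SEGS + 2 because, in the worst case, every payload page
 * needs its own data segment plus one segment each for the xdr head and
 * tail buffers.  Counting segments for a buffer shaped that way (ex_* name
 * invented):
 */
static int ex_count_segments(unsigned int head_len, unsigned int npages,
                             unsigned int tail_len)
{
        int n = npages;                 /* at most one page per data segment */

        if (head_len)
                n++;                    /* head kvec */
        if (tail_len)
                n++;                    /* tail kvec */
        return n;
}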
+ */ + +struct rpcrdma_mr_seg { /* chunk descriptors */ + union { /* chunk memory handles */ + struct ib_mr *rl_mr; /* if registered directly */ + struct rpcrdma_mw { /* if registered from region */ + union { + struct ib_mw *mw; + struct ib_fmr *fmr; + struct { + struct ib_fast_reg_page_list *fr_pgl; + struct ib_mr *fr_mr; + enum { FRMR_IS_INVALID, FRMR_IS_VALID } state; + } frmr; + } r; + struct list_head mw_list; + } *rl_mw; + } mr_chunk; + u64 mr_base; /* registration result */ + u32 mr_rkey; /* registration result */ + u32 mr_len; /* length of chunk or segment */ + int mr_nsegs; /* number of segments in chunk or 0 */ + enum dma_data_direction mr_dir; /* segment mapping direction */ + dma_addr_t mr_dma; /* segment mapping address */ + size_t mr_dmalen; /* segment mapping length */ + struct page *mr_page; /* owning page, if any */ + char *mr_offset; /* kva if no page, else offset */ +}; + +struct rpcrdma_req { + size_t rl_size; /* actual length of buffer */ + unsigned int rl_niovs; /* 0, 2 or 4 */ + unsigned int rl_nchunks; /* non-zero if chunks */ + unsigned int rl_connect_cookie; /* retry detection */ + struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ + struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ + struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */ + struct ib_sge rl_send_iov[4]; /* for active requests */ + struct ib_sge rl_iov; /* for posting */ + struct ib_mr *rl_handle; /* handle for mem in rl_iov */ + char rl_base[MAX_RPCRDMAHDR]; /* start of actual buffer */ + __u32 rl_xdr_buf[0]; /* start of returned rpc rq_buffer */ +}; +#define rpcr_to_rdmar(r) \ + container_of((r)->rq_buffer, struct rpcrdma_req, rl_xdr_buf[0]) + +/* + * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for + * inline requests/replies, and client/server credits. + * + * One of these is associated with a transport instance + */ +struct rpcrdma_buffer { + spinlock_t rb_lock; /* protects indexes */ + atomic_t rb_credits; /* most recent server credits */ + unsigned long rb_cwndscale; /* cached framework rpc_cwndscale */ + int rb_max_requests;/* client max requests */ + struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ + int rb_send_index; + struct rpcrdma_req **rb_send_bufs; + int rb_recv_index; + struct rpcrdma_rep **rb_recv_bufs; + char *rb_pool; +}; +#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) + +/* + * Internal structure for transport instance creation. This + * exists primarily for modularity. 
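/*
 * Illustrative sketch (not from the original code): rpcr_to_rdmar() and
 * rdmab_to_ia() above are container_of() conversions -- given a pointer to
 * an embedded member, subtract the member's offset to recover the enclosing
 * structure.  A freestanding version of the idiom, with invented ex_* names:
 */
#include <stddef.h>

#define ex_container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct ex_outer {
        int cookie;
        int member;             /* embedded field handed out to other code */
};

static int ex_cookie_of(int *m)
{
        return ex_container_of(m, struct ex_outer, member)->cookie;
}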
+ * + * This data should be set with mount options + */ +struct rpcrdma_create_data_internal { + struct sockaddr_storage addr; /* RDMA server address */ + unsigned int max_requests; /* max requests (slots) in flight */ + unsigned int rsize; /* mount rsize - max read hdr+data */ + unsigned int wsize; /* mount wsize - max write hdr+data */ + unsigned int inline_rsize; /* max non-rdma read data payload */ + unsigned int inline_wsize; /* max non-rdma write data payload */ + unsigned int padding; /* non-rdma write header padding */ +}; + +#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \ + (rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_rsize) + +#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\ + (rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_wsize) + +#define RPCRDMA_INLINE_PAD_VALUE(rq)\ + rpcx_to_rdmad(rq->rq_task->tk_xprt).padding + +/* + * Statistics for RPCRDMA + */ +struct rpcrdma_stats { + unsigned long read_chunk_count; + unsigned long write_chunk_count; + unsigned long reply_chunk_count; + + unsigned long long total_rdma_request; + unsigned long long total_rdma_reply; + + unsigned long long pullup_copy_count; + unsigned long long fixup_copy_count; + unsigned long hardway_register_count; + unsigned long failed_marshal_count; + unsigned long bad_reply_count; +}; + +/* + * RPCRDMA transport -- encapsulates the structures above for + * integration with RPC. + * + * The contained structures are embedded, not pointers, + * for convenience. This structure need not be visible externally. + * + * It is allocated and initialized during mount, and released + * during unmount. + */ +struct rpcrdma_xprt { + struct rpc_xprt xprt; + struct rpcrdma_ia rx_ia; + struct rpcrdma_ep rx_ep; + struct rpcrdma_buffer rx_buf; + struct rpcrdma_create_data_internal rx_data; + struct delayed_work rdma_connect; + struct rpcrdma_stats rx_stats; +}; + +#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt) +#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data) + +/* Setting this to 0 ensures interoperability with early servers. + * Setting this to 1 enhances certain unaligned read/write performance. 
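/*
 * Illustrative sketch (not from the original code): one plausible way the
 * verbs.c entry points declared just below chain together when a transport
 * is brought up -- open the IA, create the endpoint, create the buffer
 * pool, then connect -- with teardown in reverse (buffers before the
 * endpoint, per rpcrdma_buffer_destroy's comment).  This is an outline
 * under those assumptions, not a copy of the real setup path, and
 * ex_setup_transport is an invented name:
 */
static int ex_setup_transport(struct rpcrdma_xprt *r_xprt,
                              struct sockaddr *addr, int memreg)
{
        int rc;

        rc = rpcrdma_ia_open(r_xprt, addr, memreg);
        if (rc)
                return rc;
        rc = rpcrdma_ep_create(&r_xprt->rx_ep, &r_xprt->rx_ia, &r_xprt->rx_data);
        if (rc)
                goto out_ia;
        rc = rpcrdma_buffer_create(&r_xprt->rx_buf, &r_xprt->rx_ep,
                                   &r_xprt->rx_ia, &r_xprt->rx_data);
        if (rc)
                goto out_ep;
        rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
        if (rc)
                goto out_buf;
        return 0;

out_buf:
        rpcrdma_buffer_destroy(&r_xprt->rx_buf);        /* before the endpoint */
out_ep:
        rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
out_ia:
        rpcrdma_ia_close(&r_xprt->rx_ia);
        return rc;
}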
+ * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */ +extern int xprt_rdma_pad_optimize; + +/* + * Interface Adapter calls - xprtrdma/verbs.c + */ +int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int); +void rpcrdma_ia_close(struct rpcrdma_ia *); + +/* + * Endpoint calls - xprtrdma/verbs.c + */ +int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, + struct rpcrdma_create_data_internal *); +int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); +int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); +int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); + +int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, + struct rpcrdma_req *); +int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *, + struct rpcrdma_rep *); + +/* + * Buffer calls - xprtrdma/verbs.c + */ +int rpcrdma_buffer_create(struct rpcrdma_buffer *, struct rpcrdma_ep *, + struct rpcrdma_ia *, + struct rpcrdma_create_data_internal *); +void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); + +struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); +void rpcrdma_buffer_put(struct rpcrdma_req *); +void rpcrdma_recv_buffer_get(struct rpcrdma_req *); +void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); + +int rpcrdma_register_internal(struct rpcrdma_ia *, void *, int, + struct ib_mr **, struct ib_sge *); +int rpcrdma_deregister_internal(struct rpcrdma_ia *, + struct ib_mr *, struct ib_sge *); + +int rpcrdma_register_external(struct rpcrdma_mr_seg *, + int, int, struct rpcrdma_xprt *); +int rpcrdma_deregister_external(struct rpcrdma_mr_seg *, + struct rpcrdma_xprt *, void *); + +/* + * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c + */ +void rpcrdma_conn_func(struct rpcrdma_ep *); +void rpcrdma_reply_handler(struct rpcrdma_rep *); + +/* + * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c + */ +int rpcrdma_marshal_req(struct rpc_rqst *); + +#endif /* _LINUX_SUNRPC_XPRT_RDMA_H */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c new file mode 100644 index 00000000..ea750797 --- /dev/null +++ b/net/sunrpc/xprtsock.c @@ -0,0 +1,2953 @@ +/* + * linux/net/sunrpc/xprtsock.c + * + * Client-side transport implementation for sockets. + * + * TCP callback races fixes (C) 1998 Red Hat + * TCP send fixes (C) 1998 Red Hat + * TCP NFS related read + write fixes + * (C) 1999 Dave Airlie, University of Limerick, Ireland + * + * Rewrite of larges part of the code in order to stabilize TCP stuff. + * Fix behaviour when socket buffer is full. + * (C) 1999 Trond Myklebust + * + * IP socket transport implementation, (C) 2005 Chuck Lever + * + * IPv6 support contributed by Gilles Quillard, Bull Open Source, 2005. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_NFS_V4_1 +#include +#endif + +#include +#include +#include +#include + +#include "sunrpc.h" + +static void xs_close(struct rpc_xprt *xprt); + +/* + * xprtsock tunables + */ +unsigned int xprt_udp_slot_table_entries = RPC_DEF_SLOT_TABLE; +unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE; + +unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT; +unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT; + +#define XS_TCP_LINGER_TO (15U * HZ) +static unsigned int xs_tcp_fin_timeout __read_mostly = XS_TCP_LINGER_TO; + +/* + * We can register our own files under /proc/sys/sunrpc by + * calling register_sysctl_table() again. The files in that + * directory become the union of all files registered there. + * + * We simply need to make sure that we don't collide with + * someone else's file names! + */ + +#ifdef RPC_DEBUG + +static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE; +static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE; +static unsigned int xprt_min_resvport_limit = RPC_MIN_RESVPORT; +static unsigned int xprt_max_resvport_limit = RPC_MAX_RESVPORT; + +static struct ctl_table_header *sunrpc_table_header; + +/* + * FIXME: changing the UDP slot table size should also resize the UDP + * socket buffers for existing UDP transports + */ +static ctl_table xs_tunables_table[] = { + { + .procname = "udp_slot_table_entries", + .data = &xprt_udp_slot_table_entries, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_slot_table_size, + .extra2 = &max_slot_table_size + }, + { + .procname = "tcp_slot_table_entries", + .data = &xprt_tcp_slot_table_entries, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_slot_table_size, + .extra2 = &max_slot_table_size + }, + { + .procname = "min_resvport", + .data = &xprt_min_resvport, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xprt_min_resvport_limit, + .extra2 = &xprt_max_resvport_limit + }, + { + .procname = "max_resvport", + .data = &xprt_max_resvport, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xprt_min_resvport_limit, + .extra2 = &xprt_max_resvport_limit + }, + { + .procname = "tcp_fin_timeout", + .data = &xs_tcp_fin_timeout, + .maxlen = sizeof(xs_tcp_fin_timeout), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { }, +}; + +static ctl_table sunrpc_table[] = { + { + .procname = "sunrpc", + .mode = 0555, + .child = xs_tunables_table + }, + { }, +}; + +#endif + +/* + * Wait duration for a reply from the RPC portmapper. + */ +#define XS_BIND_TO (60U * HZ) + +/* + * Delay if a UDP socket connect error occurs. This is most likely some + * kind of resource problem on the local host. + */ +#define XS_UDP_REEST_TO (2U * HZ) + +/* + * The reestablish timeout allows clients to delay for a bit before attempting + * to reconnect to a server that just dropped our connection. + * + * We implement an exponential backoff when trying to reestablish a TCP + * transport connection with the server. Some servers like to drop a TCP + * connection when they are overworked, so we start with a short timeout and + * increase over time if the server is down or not responding. 
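/*
 * Illustrative sketch (not from the original code): "exponential backoff"
 * here means the reestablish delay doubles after each failed reconnect
 * attempt and is clamped at the maximum -- with the constants defined just
 * below, roughly 3s, 6s, 12s, ... capped at 5 minutes.  The ex_* name is
 * invented:
 */
static unsigned long ex_next_reestablish_timeout(unsigned long cur,
                                                 unsigned long init,
                                                 unsigned long max)
{
        if (cur == 0)
                return init;            /* first retry after a fresh connect */
        cur <<= 1;                      /* double the delay */
        return cur > max ? max : cur;
}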
+ */ +#define XS_TCP_INIT_REEST_TO (3U * HZ) +#define XS_TCP_MAX_REEST_TO (5U * 60 * HZ) + +/* + * TCP idle timeout; client drops the transport socket if it is idle + * for this long. Note that we also timeout UDP sockets to prevent + * holding port numbers when there is no RPC traffic. + */ +#define XS_IDLE_DISC_TO (5U * 60 * HZ) + +#ifdef RPC_DEBUG +# undef RPC_DEBUG_DATA +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +#ifdef RPC_DEBUG_DATA +static void xs_pktdump(char *msg, u32 *packet, unsigned int count) +{ + u8 *buf = (u8 *) packet; + int j; + + dprintk("RPC: %s\n", msg); + for (j = 0; j < count && j < 128; j += 4) { + if (!(j & 31)) { + if (j) + dprintk("\n"); + dprintk("0x%04x ", j); + } + dprintk("%02x%02x%02x%02x ", + buf[j], buf[j+1], buf[j+2], buf[j+3]); + } + dprintk("\n"); +} +#else +static inline void xs_pktdump(char *msg, u32 *packet, unsigned int count) +{ + /* NOP */ +} +#endif + +struct sock_xprt { + struct rpc_xprt xprt; + + /* + * Network layer + */ + struct socket * sock; + struct sock * inet; + + /* + * State of TCP reply receive + */ + __be32 tcp_fraghdr, + tcp_xid, + tcp_calldir; + + u32 tcp_offset, + tcp_reclen; + + unsigned long tcp_copied, + tcp_flags; + + /* + * Connection of transports + */ + struct delayed_work connect_worker; + struct sockaddr_storage srcaddr; + unsigned short srcport; + + /* + * UDP socket buffer size parameters + */ + size_t rcvsize, + sndsize; + + /* + * Saved socket callback addresses + */ + void (*old_data_ready)(struct sock *, int); + void (*old_state_change)(struct sock *); + void (*old_write_space)(struct sock *); + void (*old_error_report)(struct sock *); +}; + +/* + * TCP receive state flags + */ +#define TCP_RCV_LAST_FRAG (1UL << 0) +#define TCP_RCV_COPY_FRAGHDR (1UL << 1) +#define TCP_RCV_COPY_XID (1UL << 2) +#define TCP_RCV_COPY_DATA (1UL << 3) +#define TCP_RCV_READ_CALLDIR (1UL << 4) +#define TCP_RCV_COPY_CALLDIR (1UL << 5) + +/* + * TCP RPC flags + */ +#define TCP_RPC_REPLY (1UL << 6) + +static inline struct sockaddr *xs_addr(struct rpc_xprt *xprt) +{ + return (struct sockaddr *) &xprt->addr; +} + +static inline struct sockaddr_un *xs_addr_un(struct rpc_xprt *xprt) +{ + return (struct sockaddr_un *) &xprt->addr; +} + +static inline struct sockaddr_in *xs_addr_in(struct rpc_xprt *xprt) +{ + return (struct sockaddr_in *) &xprt->addr; +} + +static inline struct sockaddr_in6 *xs_addr_in6(struct rpc_xprt *xprt) +{ + return (struct sockaddr_in6 *) &xprt->addr; +} + +static void xs_format_common_peer_addresses(struct rpc_xprt *xprt) +{ + struct sockaddr *sap = xs_addr(xprt); + struct sockaddr_in6 *sin6; + struct sockaddr_in *sin; + struct sockaddr_un *sun; + char buf[128]; + + switch (sap->sa_family) { + case AF_LOCAL: + sun = xs_addr_un(xprt); + strlcpy(buf, sun->sun_path, sizeof(buf)); + xprt->address_strings[RPC_DISPLAY_ADDR] = + kstrdup(buf, GFP_KERNEL); + break; + case AF_INET: + (void)rpc_ntop(sap, buf, sizeof(buf)); + xprt->address_strings[RPC_DISPLAY_ADDR] = + kstrdup(buf, GFP_KERNEL); + sin = xs_addr_in(xprt); + snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr)); + break; + case AF_INET6: + (void)rpc_ntop(sap, buf, sizeof(buf)); + xprt->address_strings[RPC_DISPLAY_ADDR] = + kstrdup(buf, GFP_KERNEL); + sin6 = xs_addr_in6(xprt); + snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr); + break; + default: + BUG(); + } + + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); +} + +static void xs_format_common_peer_ports(struct rpc_xprt *xprt) +{ + struct sockaddr *sap = xs_addr(xprt); + char 
buf[128]; + + snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap)); + xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL); + + snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap)); + xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); +} + +static void xs_format_peer_addresses(struct rpc_xprt *xprt, + const char *protocol, + const char *netid) +{ + xprt->address_strings[RPC_DISPLAY_PROTO] = protocol; + xprt->address_strings[RPC_DISPLAY_NETID] = netid; + xs_format_common_peer_addresses(xprt); + xs_format_common_peer_ports(xprt); +} + +static void xs_update_peer_port(struct rpc_xprt *xprt) +{ + kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]); + kfree(xprt->address_strings[RPC_DISPLAY_PORT]); + + xs_format_common_peer_ports(xprt); +} + +static void xs_free_peer_addresses(struct rpc_xprt *xprt) +{ + unsigned int i; + + for (i = 0; i < RPC_DISPLAY_MAX; i++) + switch (i) { + case RPC_DISPLAY_PROTO: + case RPC_DISPLAY_NETID: + continue; + default: + kfree(xprt->address_strings[i]); + } +} + +#define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) + +static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen, struct kvec *vec, unsigned int base, int more) +{ + struct msghdr msg = { + .msg_name = addr, + .msg_namelen = addrlen, + .msg_flags = XS_SENDMSG_FLAGS | (more ? MSG_MORE : 0), + }; + struct kvec iov = { + .iov_base = vec->iov_base + base, + .iov_len = vec->iov_len - base, + }; + + if (iov.iov_len != 0) + return kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); + return kernel_sendmsg(sock, &msg, NULL, 0, 0); +} + +static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more) +{ + struct page **ppage; + unsigned int remainder; + int err, sent = 0; + + remainder = xdr->page_len - base; + base += xdr->page_base; + ppage = xdr->pages + (base >> PAGE_SHIFT); + base &= ~PAGE_MASK; + for(;;) { + unsigned int len = min_t(unsigned int, PAGE_SIZE - base, remainder); + int flags = XS_SENDMSG_FLAGS; + + remainder -= len; + if (remainder != 0 || more) + flags |= MSG_MORE; + err = sock->ops->sendpage(sock, *ppage, base, len, flags); + if (remainder == 0 || err != len) + break; + sent += err; + ppage++; + base = 0; + } + if (sent == 0) + return err; + if (err > 0) + sent += err; + return sent; +} + +/** + * xs_sendpages - write pages directly to a socket + * @sock: socket to send on + * @addr: UDP only -- address of destination + * @addrlen: UDP only -- length of destination address + * @xdr: buffer containing this request + * @base: starting position in the buffer + * + */ +static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base) +{ + unsigned int remainder = xdr->len - base; + int err, sent = 0; + + if (unlikely(!sock)) + return -ENOTSOCK; + + clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); + if (base != 0) { + addr = NULL; + addrlen = 0; + } + + if (base < xdr->head[0].iov_len || addr != NULL) { + unsigned int len = xdr->head[0].iov_len - base; + remainder -= len; + err = xs_send_kvec(sock, addr, addrlen, &xdr->head[0], base, remainder != 0); + if (remainder == 0 || err != len) + goto out; + sent += err; + base = 0; + } else + base -= xdr->head[0].iov_len; + + if (base < xdr->page_len) { + unsigned int len = xdr->page_len - base; + remainder -= len; + err = xs_send_pagedata(sock, xdr, base, remainder != 0); + if (remainder == 0 || err != len) + goto out; + sent += err; + base = 0; + } else + base -= xdr->page_len; + + if (base >= 
xdr->tail[0].iov_len) + return sent; + err = xs_send_kvec(sock, NULL, 0, &xdr->tail[0], base, 0); +out: + if (sent == 0) + return err; + if (err > 0) + sent += err; + return sent; +} + +static void xs_nospace_callback(struct rpc_task *task) +{ + struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt); + + transport->inet->sk_write_pending--; + clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); +} + +/** + * xs_nospace - place task on wait queue if transmit was incomplete + * @task: task to put to sleep + * + */ +static int xs_nospace(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + int ret = -EAGAIN; + + dprintk("RPC: %5u xmit incomplete (%u left of %u)\n", + task->tk_pid, req->rq_slen - req->rq_bytes_sent, + req->rq_slen); + + /* Protect against races with write_space */ + spin_lock_bh(&xprt->transport_lock); + + /* Don't race with disconnect */ + if (xprt_connected(xprt)) { + if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) { + /* + * Notify TCP that we're limited by the application + * window size + */ + set_bit(SOCK_NOSPACE, &transport->sock->flags); + transport->inet->sk_write_pending++; + /* ...and wait for more buffer space */ + xprt_wait_for_buffer_space(task, xs_nospace_callback); + } + } else { + clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + ret = -ENOTCONN; + } + + spin_unlock_bh(&xprt->transport_lock); + return ret; +} + +/* + * Construct a stream transport record marker in @buf. + */ +static inline void xs_encode_stream_record_marker(struct xdr_buf *buf) +{ + u32 reclen = buf->len - sizeof(rpc_fraghdr); + rpc_fraghdr *base = buf->head[0].iov_base; + *base = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | reclen); +} + +/** + * xs_local_send_request - write an RPC request to an AF_LOCAL socket + * @task: RPC task that manages the state of an RPC request + * + * Return values: + * 0: The request has been sent + * EAGAIN: The socket was blocked, please call again later to + * complete the request + * ENOTCONN: Caller needs to invoke connect logic then call again + * other: Some other error occured, the request was not sent + */ +static int xs_local_send_request(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + struct sock_xprt *transport = + container_of(xprt, struct sock_xprt, xprt); + struct xdr_buf *xdr = &req->rq_snd_buf; + int status; + + xs_encode_stream_record_marker(&req->rq_snd_buf); + + xs_pktdump("packet data:", + req->rq_svec->iov_base, req->rq_svec->iov_len); + + status = xs_sendpages(transport->sock, NULL, 0, + xdr, req->rq_bytes_sent); + dprintk("RPC: %s(%u) = %d\n", + __func__, xdr->len - req->rq_bytes_sent, status); + if (likely(status >= 0)) { + req->rq_bytes_sent += status; + req->rq_xmit_bytes_sent += status; + if (likely(req->rq_bytes_sent >= req->rq_slen)) { + req->rq_bytes_sent = 0; + return 0; + } + status = -EAGAIN; + } + + switch (status) { + case -EAGAIN: + status = xs_nospace(task); + break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); + case -EPIPE: + xs_close(xprt); + status = -ENOTCONN; + } + + return status; +} + +/** + * xs_udp_send_request - write an RPC request to a UDP socket + * @task: address of RPC task that manages the state of an RPC request + * + * Return values: + * 0: The request has been sent + * EAGAIN: The socket was blocked, please call again later to 
+ * complete the request + * ENOTCONN: Caller needs to invoke connect logic then call again + * other: Some other error occurred, the request was not sent + */ +static int xs_udp_send_request(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + struct xdr_buf *xdr = &req->rq_snd_buf; + int status; + + xs_pktdump("packet data:", + req->rq_svec->iov_base, + req->rq_svec->iov_len); + + if (!xprt_bound(xprt)) + return -ENOTCONN; + status = xs_sendpages(transport->sock, + xs_addr(xprt), + xprt->addrlen, xdr, + req->rq_bytes_sent); + + dprintk("RPC: xs_udp_send_request(%u) = %d\n", + xdr->len - req->rq_bytes_sent, status); + + if (status >= 0) { + req->rq_xmit_bytes_sent += status; + if (status >= req->rq_slen) + return 0; + /* Still some bytes left; set up for a retry later. */ + status = -EAGAIN; + } + + switch (status) { + case -ENOTSOCK: + status = -ENOTCONN; + /* Should we call xs_close() here? */ + break; + case -EAGAIN: + status = xs_nospace(task); + break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); + case -ENETUNREACH: + case -EPIPE: + case -ECONNREFUSED: + /* When the server has died, an ICMP port unreachable message + * prompts ECONNREFUSED. */ + clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + } + + return status; +} + +/** + * xs_tcp_shutdown - gracefully shut down a TCP socket + * @xprt: transport + * + * Initiates a graceful shutdown of the TCP socket by calling the + * equivalent of shutdown(SHUT_WR); + */ +static void xs_tcp_shutdown(struct rpc_xprt *xprt) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + struct socket *sock = transport->sock; + + if (sock != NULL) + kernel_sock_shutdown(sock, SHUT_WR); +} + +/** + * xs_tcp_send_request - write an RPC request to a TCP socket + * @task: address of RPC task that manages the state of an RPC request + * + * Return values: + * 0: The request has been sent + * EAGAIN: The socket was blocked, please call again later to + * complete the request + * ENOTCONN: Caller needs to invoke connect logic then call again + * other: Some other error occurred, the request was not sent + * + * XXX: In the case of soft timeouts, should we eventually give up + * if sendmsg is not able to make progress? + */ +static int xs_tcp_send_request(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + struct xdr_buf *xdr = &req->rq_snd_buf; + int status; + + xs_encode_stream_record_marker(&req->rq_snd_buf); + + xs_pktdump("packet data:", + req->rq_svec->iov_base, + req->rq_svec->iov_len); + + /* Continue transmitting the packet/record. We must be careful + * to cope with writespace callbacks arriving _after_ we have + * called sendmsg(). */ + while (1) { + status = xs_sendpages(transport->sock, + NULL, 0, xdr, req->rq_bytes_sent); + + dprintk("RPC: xs_tcp_send_request(%u) = %d\n", + xdr->len - req->rq_bytes_sent, status); + + if (unlikely(status < 0)) + break; + + /* If we've sent the entire packet, immediately + * reset the count of bytes sent. 
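+ * If sendpages() made partial progress we just loop and send the rest;
+ * a return of zero with data still queued is turned into -EAGAIN below
+ * so the task waits for write space.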
*/ + req->rq_bytes_sent += status; + req->rq_xmit_bytes_sent += status; + if (likely(req->rq_bytes_sent >= req->rq_slen)) { + req->rq_bytes_sent = 0; + return 0; + } + + if (status != 0) + continue; + status = -EAGAIN; + break; + } + + switch (status) { + case -ENOTSOCK: + status = -ENOTCONN; + /* Should we call xs_close() here? */ + break; + case -EAGAIN: + status = xs_nospace(task); + break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); + case -ECONNRESET: + case -EPIPE: + xs_tcp_shutdown(xprt); + case -ECONNREFUSED: + case -ENOTCONN: + clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + } + + return status; +} + +/** + * xs_tcp_release_xprt - clean up after a tcp transmission + * @xprt: transport + * @task: rpc task + * + * This cleans up if an error causes us to abort the transmission of a request. + * In this case, the socket may need to be reset in order to avoid confusing + * the server. + */ +static void xs_tcp_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) +{ + struct rpc_rqst *req; + + if (task != xprt->snd_task) + return; + if (task == NULL) + goto out_release; + req = task->tk_rqstp; + if (req->rq_bytes_sent == 0) + goto out_release; + if (req->rq_bytes_sent == req->rq_snd_buf.len) + goto out_release; + set_bit(XPRT_CLOSE_WAIT, &task->tk_xprt->state); +out_release: + xprt_release_xprt(xprt, task); +} + +static void xs_save_old_callbacks(struct sock_xprt *transport, struct sock *sk) +{ + transport->old_data_ready = sk->sk_data_ready; + transport->old_state_change = sk->sk_state_change; + transport->old_write_space = sk->sk_write_space; + transport->old_error_report = sk->sk_error_report; +} + +static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *sk) +{ + sk->sk_data_ready = transport->old_data_ready; + sk->sk_state_change = transport->old_state_change; + sk->sk_write_space = transport->old_write_space; + sk->sk_error_report = transport->old_error_report; +} + +static void xs_reset_transport(struct sock_xprt *transport) +{ + struct socket *sock = transport->sock; + struct sock *sk = transport->inet; + + if (sk == NULL) + return; + + transport->srcport = 0; + + write_lock_bh(&sk->sk_callback_lock); + transport->inet = NULL; + transport->sock = NULL; + + sk->sk_user_data = NULL; + + xs_restore_old_callbacks(transport, sk); + write_unlock_bh(&sk->sk_callback_lock); + + sk->sk_no_check = 0; + + sock_release(sock); +} + +/** + * xs_close - close a socket + * @xprt: transport + * + * This is used when all requests are complete; ie, no DRC state remains + * on the server we want to save. + * + * The caller _must_ be holding XPRT_LOCKED in order to avoid issues with + * xs_reset_transport() zeroing the socket from underneath a writer. 
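+ *
+ * Besides releasing the socket, this clears the ABORT, CLOSE_WAIT and
+ * CLOSING state bits and wakes any pending tasks via
+ * xprt_disconnect_done().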
+ */ +static void xs_close(struct rpc_xprt *xprt) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + + dprintk("RPC: xs_close xprt %p\n", xprt); + + xs_reset_transport(transport); + xprt->reestablish_timeout = 0; + + smp_mb__before_clear_bit(); + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); + clear_bit(XPRT_CLOSE_WAIT, &xprt->state); + clear_bit(XPRT_CLOSING, &xprt->state); + smp_mb__after_clear_bit(); + xprt_disconnect_done(xprt); +} + +static void xs_tcp_close(struct rpc_xprt *xprt) +{ + if (test_and_clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state)) + xs_close(xprt); + else + xs_tcp_shutdown(xprt); +} + +/** + * xs_destroy - prepare to shutdown a transport + * @xprt: doomed transport + * + */ +static void xs_destroy(struct rpc_xprt *xprt) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + + dprintk("RPC: xs_destroy xprt %p\n", xprt); + + cancel_delayed_work_sync(&transport->connect_worker); + + xs_close(xprt); + xs_free_peer_addresses(xprt); + xprt_free(xprt); + module_put(THIS_MODULE); +} + +static inline struct rpc_xprt *xprt_from_sock(struct sock *sk) +{ + return (struct rpc_xprt *) sk->sk_user_data; +} + +static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) +{ + struct xdr_skb_reader desc = { + .skb = skb, + .offset = sizeof(rpc_fraghdr), + .count = skb->len - sizeof(rpc_fraghdr), + }; + + if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0) + return -1; + if (desc.count) + return -1; + return 0; +} + +/** + * xs_local_data_ready - "data ready" callback for AF_LOCAL sockets + * @sk: socket with data to read + * @len: how much data to read + * + * Currently this assumes we can read the whole reply in a single gulp. + */ +static void xs_local_data_ready(struct sock *sk, int len) +{ + struct rpc_task *task; + struct rpc_xprt *xprt; + struct rpc_rqst *rovr; + struct sk_buff *skb; + int err, repsize, copied; + u32 _xid; + __be32 *xp; + + read_lock_bh(&sk->sk_callback_lock); + dprintk("RPC: %s...\n", __func__); + xprt = xprt_from_sock(sk); + if (xprt == NULL) + goto out; + + skb = skb_recv_datagram(sk, 0, 1, &err); + if (skb == NULL) + goto out; + + if (xprt->shutdown) + goto dropit; + + repsize = skb->len - sizeof(rpc_fraghdr); + if (repsize < 4) { + dprintk("RPC: impossible RPC reply size %d\n", repsize); + goto dropit; + } + + /* Copy the XID from the skb... 
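+ * (skb_header_pointer() hands back a pointer into the skb when the
+ * bytes are linear; _xid is only used as a bounce buffer otherwise)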
*/ + xp = skb_header_pointer(skb, sizeof(rpc_fraghdr), sizeof(_xid), &_xid); + if (xp == NULL) + goto dropit; + + /* Look up and lock the request corresponding to the given XID */ + spin_lock(&xprt->transport_lock); + rovr = xprt_lookup_rqst(xprt, *xp); + if (!rovr) + goto out_unlock; + task = rovr->rq_task; + + copied = rovr->rq_private_buf.buflen; + if (copied > repsize) + copied = repsize; + + if (xs_local_copy_to_xdr(&rovr->rq_private_buf, skb)) { + dprintk("RPC: sk_buff copy failed\n"); + goto out_unlock; + } + + xprt_complete_rqst(task, copied); + + out_unlock: + spin_unlock(&xprt->transport_lock); + dropit: + skb_free_datagram(sk, skb); + out: + read_unlock_bh(&sk->sk_callback_lock); +} + +/** + * xs_udp_data_ready - "data ready" callback for UDP sockets + * @sk: socket with data to read + * @len: how much data to read + * + */ +static void xs_udp_data_ready(struct sock *sk, int len) +{ + struct rpc_task *task; + struct rpc_xprt *xprt; + struct rpc_rqst *rovr; + struct sk_buff *skb; + int err, repsize, copied; + u32 _xid; + __be32 *xp; + + read_lock_bh(&sk->sk_callback_lock); + dprintk("RPC: xs_udp_data_ready...\n"); + if (!(xprt = xprt_from_sock(sk))) + goto out; + + if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL) + goto out; + + if (xprt->shutdown) + goto dropit; + + repsize = skb->len - sizeof(struct udphdr); + if (repsize < 4) { + dprintk("RPC: impossible RPC reply size %d!\n", repsize); + goto dropit; + } + + /* Copy the XID from the skb... */ + xp = skb_header_pointer(skb, sizeof(struct udphdr), + sizeof(_xid), &_xid); + if (xp == NULL) + goto dropit; + + /* Look up and lock the request corresponding to the given XID */ + spin_lock(&xprt->transport_lock); + rovr = xprt_lookup_rqst(xprt, *xp); + if (!rovr) + goto out_unlock; + task = rovr->rq_task; + + if ((copied = rovr->rq_private_buf.buflen) > repsize) + copied = repsize; + + /* Suck it into the iovec, verify checksum if not done by hw. */ + if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) { + UDPX_INC_STATS_BH(sk, UDP_MIB_INERRORS); + goto out_unlock; + } + + UDPX_INC_STATS_BH(sk, UDP_MIB_INDATAGRAMS); + + /* Something worked... 
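+ * so confirm the dst entry, adjust the RPC congestion window and
+ * complete the request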
*/ + dst_confirm(skb_dst(skb)); + + xprt_adjust_cwnd(task, copied); + xprt_complete_rqst(task, copied); + + out_unlock: + spin_unlock(&xprt->transport_lock); + dropit: + skb_free_datagram(sk, skb); + out: + read_unlock_bh(&sk->sk_callback_lock); +} + +static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + size_t len, used; + char *p; + + p = ((char *) &transport->tcp_fraghdr) + transport->tcp_offset; + len = sizeof(transport->tcp_fraghdr) - transport->tcp_offset; + used = xdr_skb_read_bits(desc, p, len); + transport->tcp_offset += used; + if (used != len) + return; + + transport->tcp_reclen = ntohl(transport->tcp_fraghdr); + if (transport->tcp_reclen & RPC_LAST_STREAM_FRAGMENT) + transport->tcp_flags |= TCP_RCV_LAST_FRAG; + else + transport->tcp_flags &= ~TCP_RCV_LAST_FRAG; + transport->tcp_reclen &= RPC_FRAGMENT_SIZE_MASK; + + transport->tcp_flags &= ~TCP_RCV_COPY_FRAGHDR; + transport->tcp_offset = 0; + + /* Sanity check of the record length */ + if (unlikely(transport->tcp_reclen < 8)) { + dprintk("RPC: invalid TCP record fragment length\n"); + xprt_force_disconnect(xprt); + return; + } + dprintk("RPC: reading TCP record fragment of length %d\n", + transport->tcp_reclen); +} + +static void xs_tcp_check_fraghdr(struct sock_xprt *transport) +{ + if (transport->tcp_offset == transport->tcp_reclen) { + transport->tcp_flags |= TCP_RCV_COPY_FRAGHDR; + transport->tcp_offset = 0; + if (transport->tcp_flags & TCP_RCV_LAST_FRAG) { + transport->tcp_flags &= ~TCP_RCV_COPY_DATA; + transport->tcp_flags |= TCP_RCV_COPY_XID; + transport->tcp_copied = 0; + } + } +} + +static inline void xs_tcp_read_xid(struct sock_xprt *transport, struct xdr_skb_reader *desc) +{ + size_t len, used; + char *p; + + len = sizeof(transport->tcp_xid) - transport->tcp_offset; + dprintk("RPC: reading XID (%Zu bytes)\n", len); + p = ((char *) &transport->tcp_xid) + transport->tcp_offset; + used = xdr_skb_read_bits(desc, p, len); + transport->tcp_offset += used; + if (used != len) + return; + transport->tcp_flags &= ~TCP_RCV_COPY_XID; + transport->tcp_flags |= TCP_RCV_READ_CALLDIR; + transport->tcp_copied = 4; + dprintk("RPC: reading %s XID %08x\n", + (transport->tcp_flags & TCP_RPC_REPLY) ? "reply for" + : "request with", + ntohl(transport->tcp_xid)); + xs_tcp_check_fraghdr(transport); +} + +static inline void xs_tcp_read_calldir(struct sock_xprt *transport, + struct xdr_skb_reader *desc) +{ + size_t len, used; + u32 offset; + char *p; + + /* + * We want transport->tcp_offset to be 8 at the end of this routine + * (4 bytes for the xid and 4 bytes for the call/reply flag). + * When this function is called for the first time, + * transport->tcp_offset is 4 (after having already read the xid). 
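+ *
+ * The calldir word is what lets the NFSv4.1 backchannel share this
+ * connection: RPC_REPLY frames are routed to xs_tcp_read_reply() and
+ * RPC_CALL frames to xs_tcp_read_callback().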
+ */ + offset = transport->tcp_offset - sizeof(transport->tcp_xid); + len = sizeof(transport->tcp_calldir) - offset; + dprintk("RPC: reading CALL/REPLY flag (%Zu bytes)\n", len); + p = ((char *) &transport->tcp_calldir) + offset; + used = xdr_skb_read_bits(desc, p, len); + transport->tcp_offset += used; + if (used != len) + return; + transport->tcp_flags &= ~TCP_RCV_READ_CALLDIR; + /* + * We don't yet have the XDR buffer, so we will write the calldir + * out after we get the buffer from the 'struct rpc_rqst' + */ + switch (ntohl(transport->tcp_calldir)) { + case RPC_REPLY: + transport->tcp_flags |= TCP_RCV_COPY_CALLDIR; + transport->tcp_flags |= TCP_RCV_COPY_DATA; + transport->tcp_flags |= TCP_RPC_REPLY; + break; + case RPC_CALL: + transport->tcp_flags |= TCP_RCV_COPY_CALLDIR; + transport->tcp_flags |= TCP_RCV_COPY_DATA; + transport->tcp_flags &= ~TCP_RPC_REPLY; + break; + default: + dprintk("RPC: invalid request message type\n"); + xprt_force_disconnect(&transport->xprt); + } + xs_tcp_check_fraghdr(transport); +} + +static inline void xs_tcp_read_common(struct rpc_xprt *xprt, + struct xdr_skb_reader *desc, + struct rpc_rqst *req) +{ + struct sock_xprt *transport = + container_of(xprt, struct sock_xprt, xprt); + struct xdr_buf *rcvbuf; + size_t len; + ssize_t r; + + rcvbuf = &req->rq_private_buf; + + if (transport->tcp_flags & TCP_RCV_COPY_CALLDIR) { + /* + * Save the RPC direction in the XDR buffer + */ + memcpy(rcvbuf->head[0].iov_base + transport->tcp_copied, + &transport->tcp_calldir, + sizeof(transport->tcp_calldir)); + transport->tcp_copied += sizeof(transport->tcp_calldir); + transport->tcp_flags &= ~TCP_RCV_COPY_CALLDIR; + } + + len = desc->count; + if (len > transport->tcp_reclen - transport->tcp_offset) { + struct xdr_skb_reader my_desc; + + len = transport->tcp_reclen - transport->tcp_offset; + memcpy(&my_desc, desc, sizeof(my_desc)); + my_desc.count = len; + r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied, + &my_desc, xdr_skb_read_bits); + desc->count -= r; + desc->offset += r; + } else + r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied, + desc, xdr_skb_read_bits); + + if (r > 0) { + transport->tcp_copied += r; + transport->tcp_offset += r; + } + if (r != len) { + /* Error when copying to the receive buffer, + * usually because we weren't able to allocate + * additional buffer pages. All we can do now + * is turn off TCP_RCV_COPY_DATA, so the request + * will not receive any additional updates, + * and time out. + * Any remaining data from this record will + * be discarded. + */ + transport->tcp_flags &= ~TCP_RCV_COPY_DATA; + dprintk("RPC: XID %08x truncated request\n", + ntohl(transport->tcp_xid)); + dprintk("RPC: xprt = %p, tcp_copied = %lu, " + "tcp_offset = %u, tcp_reclen = %u\n", + xprt, transport->tcp_copied, + transport->tcp_offset, transport->tcp_reclen); + return; + } + + dprintk("RPC: XID %08x read %Zd bytes\n", + ntohl(transport->tcp_xid), r); + dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, " + "tcp_reclen = %u\n", xprt, transport->tcp_copied, + transport->tcp_offset, transport->tcp_reclen); + + if (transport->tcp_copied == req->rq_private_buf.buflen) + transport->tcp_flags &= ~TCP_RCV_COPY_DATA; + else if (transport->tcp_offset == transport->tcp_reclen) { + if (transport->tcp_flags & TCP_RCV_LAST_FRAG) + transport->tcp_flags &= ~TCP_RCV_COPY_DATA; + } +} + +/* + * Finds the request corresponding to the RPC xid and invokes the common + * tcp read code to read the data. 
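+ * Returns -1 if no request with a matching XID is found, in which case
+ * the caller arranges for the rest of the record to be discarded.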
+ */ +static inline int xs_tcp_read_reply(struct rpc_xprt *xprt, + struct xdr_skb_reader *desc) +{ + struct sock_xprt *transport = + container_of(xprt, struct sock_xprt, xprt); + struct rpc_rqst *req; + + dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid)); + + /* Find and lock the request corresponding to this xid */ + spin_lock(&xprt->transport_lock); + req = xprt_lookup_rqst(xprt, transport->tcp_xid); + if (!req) { + dprintk("RPC: XID %08x request not found!\n", + ntohl(transport->tcp_xid)); + spin_unlock(&xprt->transport_lock); + return -1; + } + + xs_tcp_read_common(xprt, desc, req); + + if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) + xprt_complete_rqst(req->rq_task, transport->tcp_copied); + + spin_unlock(&xprt->transport_lock); + return 0; +} + +#if defined(CONFIG_NFS_V4_1) +/* + * Obtains an rpc_rqst previously allocated and invokes the common + * tcp read code to read the data. The result is placed in the callback + * queue. + * If we're unable to obtain the rpc_rqst we schedule the closing of the + * connection and return -1. + */ +static inline int xs_tcp_read_callback(struct rpc_xprt *xprt, + struct xdr_skb_reader *desc) +{ + struct sock_xprt *transport = + container_of(xprt, struct sock_xprt, xprt); + struct rpc_rqst *req; + + req = xprt_alloc_bc_request(xprt); + if (req == NULL) { + printk(KERN_WARNING "Callback slot table overflowed\n"); + xprt_force_disconnect(xprt); + return -1; + } + + req->rq_xid = transport->tcp_xid; + dprintk("RPC: read callback XID %08x\n", ntohl(req->rq_xid)); + xs_tcp_read_common(xprt, desc, req); + + if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) { + struct svc_serv *bc_serv = xprt->bc_serv; + + /* + * Add callback request to callback list. The callback + * service sleeps on the sv_cb_waitq waiting for new + * requests. Wake it up after adding enqueing the + * request. + */ + dprintk("RPC: add callback request to list\n"); + spin_lock(&bc_serv->sv_cb_lock); + list_add(&req->rq_bc_list, &bc_serv->sv_cb_list); + spin_unlock(&bc_serv->sv_cb_lock); + wake_up(&bc_serv->sv_cb_waitq); + } + + req->rq_private_buf.len = transport->tcp_copied; + + return 0; +} + +static inline int _xs_tcp_read_data(struct rpc_xprt *xprt, + struct xdr_skb_reader *desc) +{ + struct sock_xprt *transport = + container_of(xprt, struct sock_xprt, xprt); + + return (transport->tcp_flags & TCP_RPC_REPLY) ? + xs_tcp_read_reply(xprt, desc) : + xs_tcp_read_callback(xprt, desc); +} +#else +static inline int _xs_tcp_read_data(struct rpc_xprt *xprt, + struct xdr_skb_reader *desc) +{ + return xs_tcp_read_reply(xprt, desc); +} +#endif /* CONFIG_NFS_V4_1 */ + +/* + * Read data off the transport. This can be either an RPC_CALL or an + * RPC_REPLY. Relay the processing to helper functions. + */ +static void xs_tcp_read_data(struct rpc_xprt *xprt, + struct xdr_skb_reader *desc) +{ + struct sock_xprt *transport = + container_of(xprt, struct sock_xprt, xprt); + + if (_xs_tcp_read_data(xprt, desc) == 0) + xs_tcp_check_fraghdr(transport); + else { + /* + * The transport_lock protects the request handling. + * There's no need to hold it to update the tcp_flags. 
+ */ + transport->tcp_flags &= ~TCP_RCV_COPY_DATA; + } +} + +static inline void xs_tcp_read_discard(struct sock_xprt *transport, struct xdr_skb_reader *desc) +{ + size_t len; + + len = transport->tcp_reclen - transport->tcp_offset; + if (len > desc->count) + len = desc->count; + desc->count -= len; + desc->offset += len; + transport->tcp_offset += len; + dprintk("RPC: discarded %Zu bytes\n", len); + xs_tcp_check_fraghdr(transport); +} + +static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len) +{ + struct rpc_xprt *xprt = rd_desc->arg.data; + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + struct xdr_skb_reader desc = { + .skb = skb, + .offset = offset, + .count = len, + }; + + dprintk("RPC: xs_tcp_data_recv started\n"); + do { + /* Read in a new fragment marker if necessary */ + /* Can we ever really expect to get completely empty fragments? */ + if (transport->tcp_flags & TCP_RCV_COPY_FRAGHDR) { + xs_tcp_read_fraghdr(xprt, &desc); + continue; + } + /* Read in the xid if necessary */ + if (transport->tcp_flags & TCP_RCV_COPY_XID) { + xs_tcp_read_xid(transport, &desc); + continue; + } + /* Read in the call/reply flag */ + if (transport->tcp_flags & TCP_RCV_READ_CALLDIR) { + xs_tcp_read_calldir(transport, &desc); + continue; + } + /* Read in the request data */ + if (transport->tcp_flags & TCP_RCV_COPY_DATA) { + xs_tcp_read_data(xprt, &desc); + continue; + } + /* Skip over any trailing bytes on short reads */ + xs_tcp_read_discard(transport, &desc); + } while (desc.count); + dprintk("RPC: xs_tcp_data_recv done\n"); + return len - desc.count; +} + +/** + * xs_tcp_data_ready - "data ready" callback for TCP sockets + * @sk: socket with data to read + * @bytes: how much data to read + * + */ +static void xs_tcp_data_ready(struct sock *sk, int bytes) +{ + struct rpc_xprt *xprt; + read_descriptor_t rd_desc; + int read; + + dprintk("RPC: xs_tcp_data_ready...\n"); + + read_lock_bh(&sk->sk_callback_lock); + if (!(xprt = xprt_from_sock(sk))) + goto out; + if (xprt->shutdown) + goto out; + + /* Any data means we had a useful conversation, so + * the we don't need to delay the next reconnect + */ + if (xprt->reestablish_timeout) + xprt->reestablish_timeout = 0; + + /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */ + rd_desc.arg.data = xprt; + do { + rd_desc.count = 65536; + read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); + } while (read > 0); +out: + read_unlock_bh(&sk->sk_callback_lock); +} + +/* + * Do the equivalent of linger/linger2 handling for dealing with + * broken servers that don't close the socket in a timely + * fashion + */ +static void xs_tcp_schedule_linger_timeout(struct rpc_xprt *xprt, + unsigned long timeout) +{ + struct sock_xprt *transport; + + if (xprt_test_and_set_connecting(xprt)) + return; + set_bit(XPRT_CONNECTION_ABORT, &xprt->state); + transport = container_of(xprt, struct sock_xprt, xprt); + queue_delayed_work(rpciod_workqueue, &transport->connect_worker, + timeout); +} + +static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt) +{ + struct sock_xprt *transport; + + transport = container_of(xprt, struct sock_xprt, xprt); + + if (!test_bit(XPRT_CONNECTION_ABORT, &xprt->state) || + !cancel_delayed_work(&transport->connect_worker)) + return; + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); + xprt_clear_connecting(xprt); +} + +static void xs_sock_mark_closed(struct rpc_xprt *xprt) +{ + smp_mb__before_clear_bit(); + clear_bit(XPRT_CLOSE_WAIT, &xprt->state); + 
clear_bit(XPRT_CLOSING, &xprt->state); + smp_mb__after_clear_bit(); + /* Mark transport as closed and wake up all pending tasks */ + xprt_disconnect_done(xprt); +} + +/** + * xs_tcp_state_change - callback to handle TCP socket state changes + * @sk: socket whose state has changed + * + */ +static void xs_tcp_state_change(struct sock *sk) +{ + struct rpc_xprt *xprt; + + read_lock_bh(&sk->sk_callback_lock); + if (!(xprt = xprt_from_sock(sk))) + goto out; + dprintk("RPC: xs_tcp_state_change client %p...\n", xprt); + dprintk("RPC: state %x conn %d dead %d zapped %d sk_shutdown %d\n", + sk->sk_state, xprt_connected(xprt), + sock_flag(sk, SOCK_DEAD), + sock_flag(sk, SOCK_ZAPPED), + sk->sk_shutdown); + + switch (sk->sk_state) { + case TCP_ESTABLISHED: + spin_lock(&xprt->transport_lock); + if (!xprt_test_and_set_connected(xprt)) { + struct sock_xprt *transport = container_of(xprt, + struct sock_xprt, xprt); + + /* Reset TCP record info */ + transport->tcp_offset = 0; + transport->tcp_reclen = 0; + transport->tcp_copied = 0; + transport->tcp_flags = + TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID; + + xprt_wake_pending_tasks(xprt, -EAGAIN); + } + spin_unlock(&xprt->transport_lock); + break; + case TCP_FIN_WAIT1: + /* The client initiated a shutdown of the socket */ + xprt->connect_cookie++; + xprt->reestablish_timeout = 0; + set_bit(XPRT_CLOSING, &xprt->state); + smp_mb__before_clear_bit(); + clear_bit(XPRT_CONNECTED, &xprt->state); + clear_bit(XPRT_CLOSE_WAIT, &xprt->state); + smp_mb__after_clear_bit(); + xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout); + break; + case TCP_CLOSE_WAIT: + /* The server initiated a shutdown of the socket */ + xprt_force_disconnect(xprt); + xprt->connect_cookie++; + case TCP_CLOSING: + /* + * If the server closed down the connection, make sure that + * we back off before reconnecting + */ + if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; + break; + case TCP_LAST_ACK: + set_bit(XPRT_CLOSING, &xprt->state); + xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout); + smp_mb__before_clear_bit(); + clear_bit(XPRT_CONNECTED, &xprt->state); + smp_mb__after_clear_bit(); + break; + case TCP_CLOSE: + xs_tcp_cancel_linger_timeout(xprt); + xs_sock_mark_closed(xprt); + } + out: + read_unlock_bh(&sk->sk_callback_lock); +} + +/** + * xs_error_report - callback mainly for catching socket errors + * @sk: socket + */ +static void xs_error_report(struct sock *sk) +{ + struct rpc_xprt *xprt; + + read_lock_bh(&sk->sk_callback_lock); + if (!(xprt = xprt_from_sock(sk))) + goto out; + dprintk("RPC: %s client %p...\n" + "RPC: error %d\n", + __func__, xprt, sk->sk_err); + xprt_wake_pending_tasks(xprt, -EAGAIN); +out: + read_unlock_bh(&sk->sk_callback_lock); +} + +static void xs_write_space(struct sock *sk) +{ + struct socket *sock; + struct rpc_xprt *xprt; + + if (unlikely(!(sock = sk->sk_socket))) + return; + clear_bit(SOCK_NOSPACE, &sock->flags); + + if (unlikely(!(xprt = xprt_from_sock(sk)))) + return; + if (test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags) == 0) + return; + + xprt_write_space(xprt); +} + +/** + * xs_udp_write_space - callback invoked when socket buffer space + * becomes available + * @sk: socket whose state has changed + * + * Called when more output buffer space is available for this socket. + * We try not to wake our writers until they can make "significant" + * progress, otherwise we'll waste resources thrashing kernel_sendmsg + * with a bunch of small requests. 
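+ *
+ * The UDP and TCP variants differ only in the writeability test:
+ * sock_writeable() here versus the sk_stream_wspace() check below.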
+ */ +static void xs_udp_write_space(struct sock *sk) +{ + read_lock_bh(&sk->sk_callback_lock); + + /* from net/core/sock.c:sock_def_write_space */ + if (sock_writeable(sk)) + xs_write_space(sk); + + read_unlock_bh(&sk->sk_callback_lock); +} + +/** + * xs_tcp_write_space - callback invoked when socket buffer space + * becomes available + * @sk: socket whose state has changed + * + * Called when more output buffer space is available for this socket. + * We try not to wake our writers until they can make "significant" + * progress, otherwise we'll waste resources thrashing kernel_sendmsg + * with a bunch of small requests. + */ +static void xs_tcp_write_space(struct sock *sk) +{ + read_lock_bh(&sk->sk_callback_lock); + + /* from net/core/stream.c:sk_stream_write_space */ + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) + xs_write_space(sk); + + read_unlock_bh(&sk->sk_callback_lock); +} + +static void xs_udp_do_set_buffer_size(struct rpc_xprt *xprt) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + struct sock *sk = transport->inet; + + if (transport->rcvsize) { + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + sk->sk_rcvbuf = transport->rcvsize * xprt->max_reqs * 2; + } + if (transport->sndsize) { + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + sk->sk_sndbuf = transport->sndsize * xprt->max_reqs * 2; + sk->sk_write_space(sk); + } +} + +/** + * xs_udp_set_buffer_size - set send and receive limits + * @xprt: generic transport + * @sndsize: requested size of send buffer, in bytes + * @rcvsize: requested size of receive buffer, in bytes + * + * Set socket send and receive buffer size limits. + */ +static void xs_udp_set_buffer_size(struct rpc_xprt *xprt, size_t sndsize, size_t rcvsize) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + + transport->sndsize = 0; + if (sndsize) + transport->sndsize = sndsize + 1024; + transport->rcvsize = 0; + if (rcvsize) + transport->rcvsize = rcvsize + 1024; + + xs_udp_do_set_buffer_size(xprt); +} + +/** + * xs_udp_timer - called when a retransmit timeout occurs on a UDP transport + * @task: task that timed out + * + * Adjust the congestion window after a retransmit timeout has occurred. 
+ */ +static void xs_udp_timer(struct rpc_task *task) +{ + xprt_adjust_cwnd(task, -ETIMEDOUT); +} + +static unsigned short xs_get_random_port(void) +{ + unsigned short range = xprt_max_resvport - xprt_min_resvport; + unsigned short rand = (unsigned short) net_random() % range; + return rand + xprt_min_resvport; +} + +/** + * xs_set_port - reset the port number in the remote endpoint address + * @xprt: generic transport + * @port: new port number + * + */ +static void xs_set_port(struct rpc_xprt *xprt, unsigned short port) +{ + dprintk("RPC: setting port for xprt %p to %u\n", xprt, port); + + rpc_set_port(xs_addr(xprt), port); + xs_update_peer_port(xprt); +} + +static unsigned short xs_get_srcport(struct sock_xprt *transport) +{ + unsigned short port = transport->srcport; + + if (port == 0 && transport->xprt.resvport) + port = xs_get_random_port(); + return port; +} + +static unsigned short xs_next_srcport(struct sock_xprt *transport, unsigned short port) +{ + if (transport->srcport != 0) + transport->srcport = 0; + if (!transport->xprt.resvport) + return 0; + if (port <= xprt_min_resvport || port > xprt_max_resvport) + return xprt_max_resvport; + return --port; +} +static int xs_bind(struct sock_xprt *transport, struct socket *sock) +{ + struct sockaddr_storage myaddr; + int err, nloop = 0; + unsigned short port = xs_get_srcport(transport); + unsigned short last; + + memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen); + do { + rpc_set_port((struct sockaddr *)&myaddr, port); + err = kernel_bind(sock, (struct sockaddr *)&myaddr, + transport->xprt.addrlen); + if (port == 0) + break; + if (err == 0) { + transport->srcport = port; + break; + } + last = port; + port = xs_next_srcport(transport, port); + if (port > last) + nloop++; + } while (err == -EADDRINUSE && nloop != 2); + + if (myaddr.ss_family == AF_INET) + dprintk("RPC: %s %pI4:%u: %s (%d)\n", __func__, + &((struct sockaddr_in *)&myaddr)->sin_addr, + port, err ? "failed" : "ok", err); + else + dprintk("RPC: %s %pI6:%u: %s (%d)\n", __func__, + &((struct sockaddr_in6 *)&myaddr)->sin6_addr, + port, err ? 
"failed" : "ok", err); + return err; +} + +/* + * We don't support autobind on AF_LOCAL sockets + */ +static void xs_local_rpcbind(struct rpc_task *task) +{ + xprt_set_bound(task->tk_xprt); +} + +static void xs_local_set_port(struct rpc_xprt *xprt, unsigned short port) +{ +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key xs_key[2]; +static struct lock_class_key xs_slock_key[2]; + +static inline void xs_reclassify_socketu(struct socket *sock) +{ + struct sock *sk = sock->sk; + + BUG_ON(sock_owned_by_user(sk)); + sock_lock_init_class_and_name(sk, "slock-AF_LOCAL-RPC", + &xs_slock_key[1], "sk_lock-AF_LOCAL-RPC", &xs_key[1]); +} + +static inline void xs_reclassify_socket4(struct socket *sock) +{ + struct sock *sk = sock->sk; + + BUG_ON(sock_owned_by_user(sk)); + sock_lock_init_class_and_name(sk, "slock-AF_INET-RPC", + &xs_slock_key[0], "sk_lock-AF_INET-RPC", &xs_key[0]); +} + +static inline void xs_reclassify_socket6(struct socket *sock) +{ + struct sock *sk = sock->sk; + + BUG_ON(sock_owned_by_user(sk)); + sock_lock_init_class_and_name(sk, "slock-AF_INET6-RPC", + &xs_slock_key[1], "sk_lock-AF_INET6-RPC", &xs_key[1]); +} + +static inline void xs_reclassify_socket(int family, struct socket *sock) +{ + switch (family) { + case AF_LOCAL: + xs_reclassify_socketu(sock); + break; + case AF_INET: + xs_reclassify_socket4(sock); + break; + case AF_INET6: + xs_reclassify_socket6(sock); + break; + } +} +#else +static inline void xs_reclassify_socketu(struct socket *sock) +{ +} + +static inline void xs_reclassify_socket4(struct socket *sock) +{ +} + +static inline void xs_reclassify_socket6(struct socket *sock) +{ +} + +static inline void xs_reclassify_socket(int family, struct socket *sock) +{ +} +#endif + +static struct socket *xs_create_sock(struct rpc_xprt *xprt, + struct sock_xprt *transport, int family, int type, int protocol) +{ + struct socket *sock; + int err; + + err = __sock_create(xprt->xprt_net, family, type, protocol, &sock, 1); + if (err < 0) { + dprintk("RPC: can't create %d transport socket (%d).\n", + protocol, -err); + goto out; + } + xs_reclassify_socket(family, sock); + + err = xs_bind(transport, sock); + if (err) { + sock_release(sock); + goto out; + } + + return sock; +out: + return ERR_PTR(err); +} + +static int xs_local_finish_connecting(struct rpc_xprt *xprt, + struct socket *sock) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, + xprt); + + if (!transport->inet) { + struct sock *sk = sock->sk; + + write_lock_bh(&sk->sk_callback_lock); + + xs_save_old_callbacks(transport, sk); + + sk->sk_user_data = xprt; + sk->sk_data_ready = xs_local_data_ready; + sk->sk_write_space = xs_udp_write_space; + sk->sk_error_report = xs_error_report; + sk->sk_allocation = GFP_ATOMIC; + + xprt_clear_connected(xprt); + + /* Reset to new socket */ + transport->sock = sock; + transport->inet = sk; + + write_unlock_bh(&sk->sk_callback_lock); + } + + /* Tell the socket layer to start connecting... */ + xprt->stat.connect_count++; + xprt->stat.connect_start = jiffies; + return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0); +} + +/** + * xs_local_setup_socket - create AF_LOCAL socket, connect to a local endpoint + * @xprt: RPC transport to connect + * @transport: socket transport to connect + * @create_sock: function to create a socket of the correct type + * + * Invoked by a work queue tasklet. 
+ */ +static void xs_local_setup_socket(struct work_struct *work) +{ + struct sock_xprt *transport = + container_of(work, struct sock_xprt, connect_worker.work); + struct rpc_xprt *xprt = &transport->xprt; + struct socket *sock; + int status = -EIO; + + if (xprt->shutdown) + goto out; + + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); + status = __sock_create(xprt->xprt_net, AF_LOCAL, + SOCK_STREAM, 0, &sock, 1); + if (status < 0) { + dprintk("RPC: can't create AF_LOCAL " + "transport socket (%d).\n", -status); + goto out; + } + xs_reclassify_socketu(sock); + + dprintk("RPC: worker connecting xprt %p via AF_LOCAL to %s\n", + xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); + + status = xs_local_finish_connecting(xprt, sock); + switch (status) { + case 0: + dprintk("RPC: xprt %p connected to %s\n", + xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); + xprt_set_connected(xprt); + break; + case -ENOENT: + dprintk("RPC: xprt %p: socket %s does not exist\n", + xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); + break; + default: + printk(KERN_ERR "%s: unhandled error (%d) connecting to %s\n", + __func__, -status, + xprt->address_strings[RPC_DISPLAY_ADDR]); + } + +out: + xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, status); +} + +static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + + if (!transport->inet) { + struct sock *sk = sock->sk; + + write_lock_bh(&sk->sk_callback_lock); + + xs_save_old_callbacks(transport, sk); + + sk->sk_user_data = xprt; + sk->sk_data_ready = xs_udp_data_ready; + sk->sk_write_space = xs_udp_write_space; + sk->sk_error_report = xs_error_report; + sk->sk_no_check = UDP_CSUM_NORCV; + sk->sk_allocation = GFP_ATOMIC; + + xprt_set_connected(xprt); + + /* Reset to new socket */ + transport->sock = sock; + transport->inet = sk; + + write_unlock_bh(&sk->sk_callback_lock); + } + xs_udp_do_set_buffer_size(xprt); +} + +static void xs_udp_setup_socket(struct work_struct *work) +{ + struct sock_xprt *transport = + container_of(work, struct sock_xprt, connect_worker.work); + struct rpc_xprt *xprt = &transport->xprt; + struct socket *sock = transport->sock; + int status = -EIO; + + if (xprt->shutdown) + goto out; + + /* Start by resetting any existing state */ + xs_reset_transport(transport); + sock = xs_create_sock(xprt, transport, + xs_addr(xprt)->sa_family, SOCK_DGRAM, IPPROTO_UDP); + if (IS_ERR(sock)) + goto out; + + dprintk("RPC: worker connecting xprt %p via %s to " + "%s (port %s)\n", xprt, + xprt->address_strings[RPC_DISPLAY_PROTO], + xprt->address_strings[RPC_DISPLAY_ADDR], + xprt->address_strings[RPC_DISPLAY_PORT]); + + xs_udp_finish_connecting(xprt, sock); + status = 0; +out: + xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, status); +} + +/* + * We need to preserve the port number so the reply cache on the server can + * find our cached RPC replies when we get around to reconnecting. + */ +static void xs_abort_connection(struct sock_xprt *transport) +{ + int result; + struct sockaddr any; + + dprintk("RPC: disconnecting xprt %p to reuse port\n", transport); + + /* + * Disconnect the transport socket by doing a connect operation + * with AF_UNSPEC. This should return immediately... 
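+ * The AF_UNSPEC connect drops the association but leaves the local
+ * port bound, which is what lets the server's duplicate reply cache
+ * keep matching our retransmissions after we reconnect.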
+ */ + memset(&any, 0, sizeof(any)); + any.sa_family = AF_UNSPEC; + result = kernel_connect(transport->sock, &any, sizeof(any), 0); + if (!result) + xs_sock_mark_closed(&transport->xprt); + else + dprintk("RPC: AF_UNSPEC connect return code %d\n", + result); +} + +static void xs_tcp_reuse_connection(struct sock_xprt *transport) +{ + unsigned int state = transport->inet->sk_state; + + if (state == TCP_CLOSE && transport->sock->state == SS_UNCONNECTED) { + /* we don't need to abort the connection if the socket + * hasn't undergone a shutdown + */ + if (transport->inet->sk_shutdown == 0) + return; + dprintk("RPC: %s: TCP_CLOSEd and sk_shutdown set to %d\n", + __func__, transport->inet->sk_shutdown); + } + if ((1 << state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT)) { + /* we don't need to abort the connection if the socket + * hasn't undergone a shutdown + */ + if (transport->inet->sk_shutdown == 0) + return; + dprintk("RPC: %s: ESTABLISHED/SYN_SENT " + "sk_shutdown set to %d\n", + __func__, transport->inet->sk_shutdown); + } + xs_abort_connection(transport); +} + +static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + int ret = -ENOTCONN; + + if (!transport->inet) { + struct sock *sk = sock->sk; + + write_lock_bh(&sk->sk_callback_lock); + + xs_save_old_callbacks(transport, sk); + + sk->sk_user_data = xprt; + sk->sk_data_ready = xs_tcp_data_ready; + sk->sk_state_change = xs_tcp_state_change; + sk->sk_write_space = xs_tcp_write_space; + sk->sk_error_report = xs_error_report; + sk->sk_allocation = GFP_ATOMIC; + + /* socket options */ + sk->sk_userlocks |= SOCK_BINDPORT_LOCK; + sock_reset_flag(sk, SOCK_LINGER); + tcp_sk(sk)->linger2 = 0; + tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; + + xprt_clear_connected(xprt); + + /* Reset to new socket */ + transport->sock = sock; + transport->inet = sk; + + write_unlock_bh(&sk->sk_callback_lock); + } + + if (!xprt_bound(xprt)) + goto out; + + /* Tell the socket layer to start connecting... */ + xprt->stat.connect_count++; + xprt->stat.connect_start = jiffies; + ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); + switch (ret) { + case 0: + case -EINPROGRESS: + /* SYN_SENT! */ + xprt->connect_cookie++; + if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; + } +out: + return ret; +} + +/** + * xs_tcp_setup_socket - create a TCP socket and connect to a remote endpoint + * @xprt: RPC transport to connect + * @transport: socket transport to connect + * @create_sock: function to create a socket of the correct type + * + * Invoked by a work queue tasklet. 
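+ *
+ * Note that despite the parameter list above, the function actually
+ * receives a work_struct and recovers the sock_xprt via container_of().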
+ */ +static void xs_tcp_setup_socket(struct work_struct *work) +{ + struct sock_xprt *transport = + container_of(work, struct sock_xprt, connect_worker.work); + struct socket *sock = transport->sock; + struct rpc_xprt *xprt = &transport->xprt; + int status = -EIO; + + if (xprt->shutdown) + goto out; + + if (!sock) { + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); + sock = xs_create_sock(xprt, transport, + xs_addr(xprt)->sa_family, SOCK_STREAM, IPPROTO_TCP); + if (IS_ERR(sock)) { + status = PTR_ERR(sock); + goto out; + } + } else { + int abort_and_exit; + + abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT, + &xprt->state); + /* "close" the socket, preserving the local port */ + xs_tcp_reuse_connection(transport); + + if (abort_and_exit) + goto out_eagain; + } + + dprintk("RPC: worker connecting xprt %p via %s to " + "%s (port %s)\n", xprt, + xprt->address_strings[RPC_DISPLAY_PROTO], + xprt->address_strings[RPC_DISPLAY_ADDR], + xprt->address_strings[RPC_DISPLAY_PORT]); + + status = xs_tcp_finish_connecting(xprt, sock); + dprintk("RPC: %p connect status %d connected %d sock state %d\n", + xprt, -status, xprt_connected(xprt), + sock->sk->sk_state); + switch (status) { + default: + printk("%s: connect returned unhandled error %d\n", + __func__, status); + case -EADDRNOTAVAIL: + /* We're probably in TIME_WAIT. Get rid of existing socket, + * and retry + */ + set_bit(XPRT_CONNECTION_CLOSE, &xprt->state); + xprt_force_disconnect(xprt); + break; + case -ECONNREFUSED: + case -ECONNRESET: + case -ENETUNREACH: + /* retry with existing socket, after a delay */ + case 0: + case -EINPROGRESS: + case -EALREADY: + xprt_clear_connecting(xprt); + return; + case -EINVAL: + /* Happens, for instance, if the user specified a link + * local IPv6 address without a scope-id. + */ + goto out; + } +out_eagain: + status = -EAGAIN; +out: + xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, status); +} + +/** + * xs_connect - connect a socket to a remote endpoint + * @task: address of RPC task that manages state of connect request + * + * TCP: If the remote end dropped the connection, delay reconnecting. + * + * UDP socket connects are synchronous, but we use a work queue anyway + * to guarantee that even unprivileged user processes can set up a + * socket on a privileged port. + * + * If a UDP socket connect fails, the delay behavior here prevents + * retry floods (hard mounts). 
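+ *
+ * The delay is only applied when a socket already exists and the task
+ * is not a SOFTCONN task; otherwise the connect worker is queued
+ * immediately.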
+ */ +static void xs_connect(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + + if (transport->sock != NULL && !RPC_IS_SOFTCONN(task)) { + dprintk("RPC: xs_connect delayed xprt %p for %lu " + "seconds\n", + xprt, xprt->reestablish_timeout / HZ); + queue_delayed_work(rpciod_workqueue, + &transport->connect_worker, + xprt->reestablish_timeout); + xprt->reestablish_timeout <<= 1; + if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; + if (xprt->reestablish_timeout > XS_TCP_MAX_REEST_TO) + xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO; + } else { + dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); + queue_delayed_work(rpciod_workqueue, + &transport->connect_worker, 0); + } +} + +/** + * xs_local_print_stats - display AF_LOCAL socket-specifc stats + * @xprt: rpc_xprt struct containing statistics + * @seq: output file + * + */ +static void xs_local_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) +{ + long idle_time = 0; + + if (xprt_connected(xprt)) + idle_time = (long)(jiffies - xprt->last_used) / HZ; + + seq_printf(seq, "\txprt:\tlocal %lu %lu %lu %ld %lu %lu %lu " + "%llu %llu\n", + xprt->stat.bind_count, + xprt->stat.connect_count, + xprt->stat.connect_time, + idle_time, + xprt->stat.sends, + xprt->stat.recvs, + xprt->stat.bad_xids, + xprt->stat.req_u, + xprt->stat.bklog_u); +} + +/** + * xs_udp_print_stats - display UDP socket-specifc stats + * @xprt: rpc_xprt struct containing statistics + * @seq: output file + * + */ +static void xs_udp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + + seq_printf(seq, "\txprt:\tudp %u %lu %lu %lu %lu %Lu %Lu\n", + transport->srcport, + xprt->stat.bind_count, + xprt->stat.sends, + xprt->stat.recvs, + xprt->stat.bad_xids, + xprt->stat.req_u, + xprt->stat.bklog_u); +} + +/** + * xs_tcp_print_stats - display TCP socket-specifc stats + * @xprt: rpc_xprt struct containing statistics + * @seq: output file + * + */ +static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + long idle_time = 0; + + if (xprt_connected(xprt)) + idle_time = (long)(jiffies - xprt->last_used) / HZ; + + seq_printf(seq, "\txprt:\ttcp %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu\n", + transport->srcport, + xprt->stat.bind_count, + xprt->stat.connect_count, + xprt->stat.connect_time, + idle_time, + xprt->stat.sends, + xprt->stat.recvs, + xprt->stat.bad_xids, + xprt->stat.req_u, + xprt->stat.bklog_u); +} + +/* + * Allocate a bunch of pages for a scratch buffer for the rpc code. The reason + * we allocate pages instead doing a kmalloc like rpc_malloc is because we want + * to use the server side send routines. + */ +static void *bc_malloc(struct rpc_task *task, size_t size) +{ + struct page *page; + struct rpc_buffer *buf; + + BUG_ON(size > PAGE_SIZE - sizeof(struct rpc_buffer)); + page = alloc_page(GFP_KERNEL); + + if (!page) + return NULL; + + buf = page_address(page); + buf->len = PAGE_SIZE; + + return buf->data; +} + +/* + * Free the space allocated in the bc_alloc routine + */ +static void bc_free(void *buffer) +{ + struct rpc_buffer *buf; + + if (!buffer) + return; + + buf = container_of(buffer, struct rpc_buffer, data); + free_page((unsigned long)buf); +} + +/* + * Use the svc_sock to send the callback. 
Must be called with svsk->sk_mutex + * held. Borrows heavily from svc_tcp_sendto and xs_tcp_send_request. + */ +static int bc_sendto(struct rpc_rqst *req) +{ + int len; + struct xdr_buf *xbufp = &req->rq_snd_buf; + struct rpc_xprt *xprt = req->rq_xprt; + struct sock_xprt *transport = + container_of(xprt, struct sock_xprt, xprt); + struct socket *sock = transport->sock; + unsigned long headoff; + unsigned long tailoff; + + xs_encode_stream_record_marker(xbufp); + + tailoff = (unsigned long)xbufp->tail[0].iov_base & ~PAGE_MASK; + headoff = (unsigned long)xbufp->head[0].iov_base & ~PAGE_MASK; + len = svc_send_common(sock, xbufp, + virt_to_page(xbufp->head[0].iov_base), headoff, + xbufp->tail[0].iov_base, tailoff); + + if (len != xbufp->len) { + printk(KERN_NOTICE "Error sending entire callback!\n"); + len = -EAGAIN; + } + + return len; +} + +/* + * The send routine. Borrows from svc_send + */ +static int bc_send_request(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct svc_xprt *xprt; + struct svc_sock *svsk; + u32 len; + + dprintk("sending request with xid: %08x\n", ntohl(req->rq_xid)); + /* + * Get the server socket associated with this callback xprt + */ + xprt = req->rq_xprt->bc_xprt; + svsk = container_of(xprt, struct svc_sock, sk_xprt); + + /* + * Grab the mutex to serialize data as the connection is shared + * with the fore channel + */ + if (!mutex_trylock(&xprt->xpt_mutex)) { + rpc_sleep_on(&xprt->xpt_bc_pending, task, NULL); + if (!mutex_trylock(&xprt->xpt_mutex)) + return -EAGAIN; + rpc_wake_up_queued_task(&xprt->xpt_bc_pending, task); + } + if (test_bit(XPT_DEAD, &xprt->xpt_flags)) + len = -ENOTCONN; + else + len = bc_sendto(req); + mutex_unlock(&xprt->xpt_mutex); + + if (len > 0) + len = 0; + + return len; +} + +/* + * The close routine. Since this is client initiated, we do nothing + */ + +static void bc_close(struct rpc_xprt *xprt) +{ +} + +/* + * The xprt destroy routine. 
Again, because this connection is client + * initiated, we do nothing + */ + +static void bc_destroy(struct rpc_xprt *xprt) +{ +} + +static struct rpc_xprt_ops xs_local_ops = { + .reserve_xprt = xprt_reserve_xprt, + .release_xprt = xs_tcp_release_xprt, + .rpcbind = xs_local_rpcbind, + .set_port = xs_local_set_port, + .connect = xs_connect, + .buf_alloc = rpc_malloc, + .buf_free = rpc_free, + .send_request = xs_local_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_def, + .close = xs_close, + .destroy = xs_destroy, + .print_stats = xs_local_print_stats, +}; + +static struct rpc_xprt_ops xs_udp_ops = { + .set_buffer_size = xs_udp_set_buffer_size, + .reserve_xprt = xprt_reserve_xprt_cong, + .release_xprt = xprt_release_xprt_cong, + .rpcbind = rpcb_getport_async, + .set_port = xs_set_port, + .connect = xs_connect, + .buf_alloc = rpc_malloc, + .buf_free = rpc_free, + .send_request = xs_udp_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_rtt, + .timer = xs_udp_timer, + .release_request = xprt_release_rqst_cong, + .close = xs_close, + .destroy = xs_destroy, + .print_stats = xs_udp_print_stats, +}; + +static struct rpc_xprt_ops xs_tcp_ops = { + .reserve_xprt = xprt_reserve_xprt, + .release_xprt = xs_tcp_release_xprt, + .rpcbind = rpcb_getport_async, + .set_port = xs_set_port, + .connect = xs_connect, + .buf_alloc = rpc_malloc, + .buf_free = rpc_free, + .send_request = xs_tcp_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_def, + .close = xs_tcp_close, + .destroy = xs_destroy, + .print_stats = xs_tcp_print_stats, +}; + +/* + * The rpc_xprt_ops for the server backchannel + */ + +static struct rpc_xprt_ops bc_tcp_ops = { + .reserve_xprt = xprt_reserve_xprt, + .release_xprt = xprt_release_xprt, + .buf_alloc = bc_malloc, + .buf_free = bc_free, + .send_request = bc_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_def, + .close = bc_close, + .destroy = bc_destroy, + .print_stats = xs_tcp_print_stats, +}; + +static int xs_init_anyaddr(const int family, struct sockaddr *sap) +{ + static const struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_ANY), + }; + static const struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ANY_INIT, + }; + + switch (family) { + case AF_LOCAL: + break; + case AF_INET: + memcpy(sap, &sin, sizeof(sin)); + break; + case AF_INET6: + memcpy(sap, &sin6, sizeof(sin6)); + break; + default: + dprintk("RPC: %s: Bad address family\n", __func__); + return -EAFNOSUPPORT; + } + return 0; +} + +static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args, + unsigned int slot_table_size) +{ + struct rpc_xprt *xprt; + struct sock_xprt *new; + + if (args->addrlen > sizeof(xprt->addr)) { + dprintk("RPC: xs_setup_xprt: address too large\n"); + return ERR_PTR(-EBADF); + } + + xprt = xprt_alloc(args->net, sizeof(*new), slot_table_size); + if (xprt == NULL) { + dprintk("RPC: xs_setup_xprt: couldn't allocate " + "rpc_xprt\n"); + return ERR_PTR(-ENOMEM); + } + + new = container_of(xprt, struct sock_xprt, xprt); + memcpy(&xprt->addr, args->dstaddr, args->addrlen); + xprt->addrlen = args->addrlen; + if (args->srcaddr) + memcpy(&new->srcaddr, args->srcaddr, args->addrlen); + else { + int err; + err = xs_init_anyaddr(args->dstaddr->sa_family, + (struct sockaddr *)&new->srcaddr); + if (err != 0) + return ERR_PTR(err); + } + + return xprt; +} + +static const struct rpc_timeout xs_local_default_timeout = { + .to_initval = 10 * HZ, + .to_maxval = 10 * HZ, + .to_retries = 2, +}; + +/** + 
* xs_setup_local - Set up transport to use an AF_LOCAL socket + * @args: rpc transport creation arguments + * + * AF_LOCAL is a "tpi_cots_ord" transport, just like TCP + */ +static struct rpc_xprt *xs_setup_local(struct xprt_create *args) +{ + struct sockaddr_un *sun = (struct sockaddr_un *)args->dstaddr; + struct sock_xprt *transport; + struct rpc_xprt *xprt; + struct rpc_xprt *ret; + + xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries); + if (IS_ERR(xprt)) + return xprt; + transport = container_of(xprt, struct sock_xprt, xprt); + + xprt->prot = 0; + xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); + xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; + + xprt->bind_timeout = XS_BIND_TO; + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; + xprt->idle_timeout = XS_IDLE_DISC_TO; + + xprt->ops = &xs_local_ops; + xprt->timeout = &xs_local_default_timeout; + + switch (sun->sun_family) { + case AF_LOCAL: + if (sun->sun_path[0] != '/') { + dprintk("RPC: bad AF_LOCAL address: %s\n", + sun->sun_path); + ret = ERR_PTR(-EINVAL); + goto out_err; + } + xprt_set_bound(xprt); + INIT_DELAYED_WORK(&transport->connect_worker, + xs_local_setup_socket); + xs_format_peer_addresses(xprt, "local", RPCBIND_NETID_LOCAL); + break; + default: + ret = ERR_PTR(-EAFNOSUPPORT); + goto out_err; + } + + dprintk("RPC: set up xprt to %s via AF_LOCAL\n", + xprt->address_strings[RPC_DISPLAY_ADDR]); + + if (try_module_get(THIS_MODULE)) + return xprt; + ret = ERR_PTR(-EINVAL); +out_err: + xprt_free(xprt); + return ret; +} + +static const struct rpc_timeout xs_udp_default_timeout = { + .to_initval = 5 * HZ, + .to_maxval = 30 * HZ, + .to_increment = 5 * HZ, + .to_retries = 5, +}; + +/** + * xs_setup_udp - Set up transport to use a UDP socket + * @args: rpc transport creation arguments + * + */ +static struct rpc_xprt *xs_setup_udp(struct xprt_create *args) +{ + struct sockaddr *addr = args->dstaddr; + struct rpc_xprt *xprt; + struct sock_xprt *transport; + struct rpc_xprt *ret; + + xprt = xs_setup_xprt(args, xprt_udp_slot_table_entries); + if (IS_ERR(xprt)) + return xprt; + transport = container_of(xprt, struct sock_xprt, xprt); + + xprt->prot = IPPROTO_UDP; + xprt->tsh_size = 0; + /* XXX: header size can vary due to auth type, IPv6, etc. 
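+	 * The value below is therefore only an estimate of the usable payload, not an exact limit.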
*/ + xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); + + xprt->bind_timeout = XS_BIND_TO; + xprt->reestablish_timeout = XS_UDP_REEST_TO; + xprt->idle_timeout = XS_IDLE_DISC_TO; + + xprt->ops = &xs_udp_ops; + + xprt->timeout = &xs_udp_default_timeout; + + switch (addr->sa_family) { + case AF_INET: + if (((struct sockaddr_in *)addr)->sin_port != htons(0)) + xprt_set_bound(xprt); + + INIT_DELAYED_WORK(&transport->connect_worker, + xs_udp_setup_socket); + xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP); + break; + case AF_INET6: + if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) + xprt_set_bound(xprt); + + INIT_DELAYED_WORK(&transport->connect_worker, + xs_udp_setup_socket); + xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP6); + break; + default: + ret = ERR_PTR(-EAFNOSUPPORT); + goto out_err; + } + + if (xprt_bound(xprt)) + dprintk("RPC: set up xprt to %s (port %s) via %s\n", + xprt->address_strings[RPC_DISPLAY_ADDR], + xprt->address_strings[RPC_DISPLAY_PORT], + xprt->address_strings[RPC_DISPLAY_PROTO]); + else + dprintk("RPC: set up xprt to %s (autobind) via %s\n", + xprt->address_strings[RPC_DISPLAY_ADDR], + xprt->address_strings[RPC_DISPLAY_PROTO]); + + if (try_module_get(THIS_MODULE)) + return xprt; + ret = ERR_PTR(-EINVAL); +out_err: + xprt_free(xprt); + return ret; +} + +static const struct rpc_timeout xs_tcp_default_timeout = { + .to_initval = 60 * HZ, + .to_maxval = 60 * HZ, + .to_retries = 2, +}; + +/** + * xs_setup_tcp - Set up transport to use a TCP socket + * @args: rpc transport creation arguments + * + */ +static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) +{ + struct sockaddr *addr = args->dstaddr; + struct rpc_xprt *xprt; + struct sock_xprt *transport; + struct rpc_xprt *ret; + + xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries); + if (IS_ERR(xprt)) + return xprt; + transport = container_of(xprt, struct sock_xprt, xprt); + + xprt->prot = IPPROTO_TCP; + xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); + xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; + + xprt->bind_timeout = XS_BIND_TO; + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; + xprt->idle_timeout = XS_IDLE_DISC_TO; + + xprt->ops = &xs_tcp_ops; + xprt->timeout = &xs_tcp_default_timeout; + + switch (addr->sa_family) { + case AF_INET: + if (((struct sockaddr_in *)addr)->sin_port != htons(0)) + xprt_set_bound(xprt); + + INIT_DELAYED_WORK(&transport->connect_worker, + xs_tcp_setup_socket); + xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP); + break; + case AF_INET6: + if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) + xprt_set_bound(xprt); + + INIT_DELAYED_WORK(&transport->connect_worker, + xs_tcp_setup_socket); + xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6); + break; + default: + ret = ERR_PTR(-EAFNOSUPPORT); + goto out_err; + } + + if (xprt_bound(xprt)) + dprintk("RPC: set up xprt to %s (port %s) via %s\n", + xprt->address_strings[RPC_DISPLAY_ADDR], + xprt->address_strings[RPC_DISPLAY_PORT], + xprt->address_strings[RPC_DISPLAY_PROTO]); + else + dprintk("RPC: set up xprt to %s (autobind) via %s\n", + xprt->address_strings[RPC_DISPLAY_ADDR], + xprt->address_strings[RPC_DISPLAY_PROTO]); + + + if (try_module_get(THIS_MODULE)) + return xprt; + ret = ERR_PTR(-EINVAL); +out_err: + xprt_free(xprt); + return ret; +} + +/** + * xs_setup_bc_tcp - Set up transport to use a TCP backchannel socket + * @args: rpc transport creation arguments + * + */ +static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args) +{ + struct sockaddr *addr = args->dstaddr; 
+ struct rpc_xprt *xprt; + struct sock_xprt *transport; + struct svc_sock *bc_sock; + struct rpc_xprt *ret; + + if (args->bc_xprt->xpt_bc_xprt) { + /* + * This server connection already has a backchannel + * export; we can't create a new one, as we wouldn't be + * able to match replies based on xid any more. So, + * reuse the already-existing one: + */ + return args->bc_xprt->xpt_bc_xprt; + } + xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries); + if (IS_ERR(xprt)) + return xprt; + transport = container_of(xprt, struct sock_xprt, xprt); + + xprt->prot = IPPROTO_TCP; + xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); + xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; + xprt->timeout = &xs_tcp_default_timeout; + + /* backchannel */ + xprt_set_bound(xprt); + xprt->bind_timeout = 0; + xprt->reestablish_timeout = 0; + xprt->idle_timeout = 0; + + xprt->ops = &bc_tcp_ops; + + switch (addr->sa_family) { + case AF_INET: + xs_format_peer_addresses(xprt, "tcp", + RPCBIND_NETID_TCP); + break; + case AF_INET6: + xs_format_peer_addresses(xprt, "tcp", + RPCBIND_NETID_TCP6); + break; + default: + ret = ERR_PTR(-EAFNOSUPPORT); + goto out_err; + } + + dprintk("RPC: set up xprt to %s (port %s) via %s\n", + xprt->address_strings[RPC_DISPLAY_ADDR], + xprt->address_strings[RPC_DISPLAY_PORT], + xprt->address_strings[RPC_DISPLAY_PROTO]); + + /* + * Once we've associated a backchannel xprt with a connection, + * we want to keep it around as long as long as the connection + * lasts, in case we need to start using it for a backchannel + * again; this reference won't be dropped until bc_xprt is + * destroyed. + */ + xprt_get(xprt); + args->bc_xprt->xpt_bc_xprt = xprt; + xprt->bc_xprt = args->bc_xprt; + bc_sock = container_of(args->bc_xprt, struct svc_sock, sk_xprt); + transport->sock = bc_sock->sk_sock; + transport->inet = bc_sock->sk_sk; + + /* + * Since we don't want connections for the backchannel, we set + * the xprt status to connected + */ + xprt_set_connected(xprt); + + + if (try_module_get(THIS_MODULE)) + return xprt; + xprt_put(xprt); + ret = ERR_PTR(-EINVAL); +out_err: + xprt_free(xprt); + return ret; +} + +static struct xprt_class xs_local_transport = { + .list = LIST_HEAD_INIT(xs_local_transport.list), + .name = "named UNIX socket", + .owner = THIS_MODULE, + .ident = XPRT_TRANSPORT_LOCAL, + .setup = xs_setup_local, +}; + +static struct xprt_class xs_udp_transport = { + .list = LIST_HEAD_INIT(xs_udp_transport.list), + .name = "udp", + .owner = THIS_MODULE, + .ident = XPRT_TRANSPORT_UDP, + .setup = xs_setup_udp, +}; + +static struct xprt_class xs_tcp_transport = { + .list = LIST_HEAD_INIT(xs_tcp_transport.list), + .name = "tcp", + .owner = THIS_MODULE, + .ident = XPRT_TRANSPORT_TCP, + .setup = xs_setup_tcp, +}; + +static struct xprt_class xs_bc_tcp_transport = { + .list = LIST_HEAD_INIT(xs_bc_tcp_transport.list), + .name = "tcp NFSv4.1 backchannel", + .owner = THIS_MODULE, + .ident = XPRT_TRANSPORT_BC_TCP, + .setup = xs_setup_bc_tcp, +}; + +/** + * init_socket_xprt - set up xprtsock's sysctls, register with RPC client + * + */ +int init_socket_xprt(void) +{ +#ifdef RPC_DEBUG + if (!sunrpc_table_header) + sunrpc_table_header = register_sysctl_table(sunrpc_table); +#endif + + xprt_register_transport(&xs_local_transport); + xprt_register_transport(&xs_udp_transport); + xprt_register_transport(&xs_tcp_transport); + xprt_register_transport(&xs_bc_tcp_transport); + + return 0; +} + +/** + * cleanup_socket_xprt - remove xprtsock's sysctls, unregister + * + */ +void cleanup_socket_xprt(void) +{ +#ifdef RPC_DEBUG 
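+	/* Drop the sysctl table registered by init_socket_xprt(), if any */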
+ if (sunrpc_table_header) { + unregister_sysctl_table(sunrpc_table_header); + sunrpc_table_header = NULL; + } +#endif + + xprt_unregister_transport(&xs_local_transport); + xprt_unregister_transport(&xs_udp_transport); + xprt_unregister_transport(&xs_tcp_transport); + xprt_unregister_transport(&xs_bc_tcp_transport); +} + +static int param_set_uint_minmax(const char *val, + const struct kernel_param *kp, + unsigned int min, unsigned int max) +{ + unsigned long num; + int ret; + + if (!val) + return -EINVAL; + ret = strict_strtoul(val, 0, &num); + if (ret == -EINVAL || num < min || num > max) + return -EINVAL; + *((unsigned int *)kp->arg) = num; + return 0; +} + +static int param_set_portnr(const char *val, const struct kernel_param *kp) +{ + return param_set_uint_minmax(val, kp, + RPC_MIN_RESVPORT, + RPC_MAX_RESVPORT); +} + +static struct kernel_param_ops param_ops_portnr = { + .set = param_set_portnr, + .get = param_get_uint, +}; + +#define param_check_portnr(name, p) \ + __param_check(name, p, unsigned int); + +module_param_named(min_resvport, xprt_min_resvport, portnr, 0644); +module_param_named(max_resvport, xprt_max_resvport, portnr, 0644); + +static int param_set_slot_table_size(const char *val, + const struct kernel_param *kp) +{ + return param_set_uint_minmax(val, kp, + RPC_MIN_SLOT_TABLE, + RPC_MAX_SLOT_TABLE); +} + +static struct kernel_param_ops param_ops_slot_table_size = { + .set = param_set_slot_table_size, + .get = param_get_uint, +}; + +#define param_check_slot_table_size(name, p) \ + __param_check(name, p, unsigned int); + +module_param_named(tcp_slot_table_entries, xprt_tcp_slot_table_entries, + slot_table_size, 0644); +module_param_named(udp_slot_table_entries, xprt_udp_slot_table_entries, + slot_table_size, 0644); + diff --git a/net/sysctl_net.c b/net/sysctl_net.c new file mode 100644 index 00000000..ca84212c --- /dev/null +++ b/net/sysctl_net.c @@ -0,0 +1,129 @@ +/* -*- linux-c -*- + * sysctl_net.c: sysctl interface to net subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net directories for each protocol family. [MS] + * + * Revision 1.2 1996/05/08 20:24:40 shaver + * Added bits for NET_BRIDGE and the NET_IPV4_ARP stuff and + * NET_IPV4_IP_FORWARD. + * + * + */ + +#include +#include +#include + +#include + +#ifdef CONFIG_INET +#include +#endif + +#ifdef CONFIG_NET +#include +#endif + +#ifdef CONFIG_TR +#include +#endif + +static struct ctl_table_set * +net_ctl_header_lookup(struct ctl_table_root *root, struct nsproxy *namespaces) +{ + return &namespaces->net_ns->sysctls; +} + +static int is_seen(struct ctl_table_set *set) +{ + return ¤t->nsproxy->net_ns->sysctls == set; +} + +/* Return standard mode bits for table entry. */ +static int net_ctl_permissions(struct ctl_table_root *root, + struct nsproxy *nsproxy, + struct ctl_table *table) +{ + /* Allow network administrator to have same access as root. 
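+	 * The entry's owner permission bits are replicated into the
+	 * group and other positions for such a caller.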
*/ + if (capable(CAP_NET_ADMIN)) { + int mode = (table->mode >> 6) & 7; + return (mode << 6) | (mode << 3) | mode; + } + return table->mode; +} + +static struct ctl_table_root net_sysctl_root = { + .lookup = net_ctl_header_lookup, + .permissions = net_ctl_permissions, +}; + +static int net_ctl_ro_header_perms(struct ctl_table_root *root, + struct nsproxy *namespaces, struct ctl_table *table) +{ + if (net_eq(namespaces->net_ns, &init_net)) + return table->mode; + else + return table->mode & ~0222; +} + +static struct ctl_table_root net_sysctl_ro_root = { + .permissions = net_ctl_ro_header_perms, +}; + +static int __net_init sysctl_net_init(struct net *net) +{ + setup_sysctl_set(&net->sysctls, + &net_sysctl_ro_root.default_set, + is_seen); + return 0; +} + +static void __net_exit sysctl_net_exit(struct net *net) +{ + WARN_ON(!list_empty(&net->sysctls.list)); +} + +static struct pernet_operations sysctl_pernet_ops = { + .init = sysctl_net_init, + .exit = sysctl_net_exit, +}; + +static __init int sysctl_init(void) +{ + int ret; + ret = register_pernet_subsys(&sysctl_pernet_ops); + if (ret) + goto out; + register_sysctl_root(&net_sysctl_root); + setup_sysctl_set(&net_sysctl_ro_root.default_set, NULL, NULL); + register_sysctl_root(&net_sysctl_ro_root); +out: + return ret; +} +subsys_initcall(sysctl_init); + +struct ctl_table_header *register_net_sysctl_table(struct net *net, + const struct ctl_path *path, struct ctl_table *table) +{ + struct nsproxy namespaces; + namespaces = *current->nsproxy; + namespaces.net_ns = net; + return __register_sysctl_paths(&net_sysctl_root, + &namespaces, path, table); +} +EXPORT_SYMBOL_GPL(register_net_sysctl_table); + +struct ctl_table_header *register_net_sysctl_rotable(const + struct ctl_path *path, struct ctl_table *table) +{ + return __register_sysctl_paths(&net_sysctl_ro_root, + &init_nsproxy, path, table); +} +EXPORT_SYMBOL_GPL(register_net_sysctl_rotable); + +void unregister_net_sysctl_table(struct ctl_table_header *header) +{ + unregister_sysctl_table(header); +} +EXPORT_SYMBOL_GPL(unregister_net_sysctl_table); diff --git a/net/tipc/Kconfig b/net/tipc/Kconfig new file mode 100644 index 00000000..2c5954b8 --- /dev/null +++ b/net/tipc/Kconfig @@ -0,0 +1,69 @@ +# +# TIPC configuration +# + +menuconfig TIPC + tristate "The TIPC Protocol (EXPERIMENTAL)" + depends on INET && EXPERIMENTAL + ---help--- + The Transparent Inter Process Communication (TIPC) protocol is + specially designed for intra cluster communication. This protocol + originates from Ericsson where it has been used in carrier grade + cluster applications for many years. + + For more information about TIPC, see http://tipc.sourceforge.net. + + This protocol support is also available as a module ( = code which + can be inserted in and removed from the running kernel whenever you + want). The module will be called tipc. If you want to compile it + as a module, say M here and read . + + If in doubt, say N. + +if TIPC + +config TIPC_ADVANCED + bool "Advanced TIPC configuration" + default n + help + Saying Y here will open some advanced configuration for TIPC. + Most users do not need to bother; if unsure, just say N. + +config TIPC_PORTS + int "Maximum number of ports in a node" + depends on TIPC_ADVANCED + range 127 65535 + default "8191" + help + Specifies how many ports can be supported by a node. + Can range from 127 to 65535 ports; default is 8191. + + Setting this to a smaller value saves some memory, + setting it to higher allows for more ports. 
+ +config TIPC_LOG + int "Size of log buffer" + depends on TIPC_ADVANCED + range 0 32768 + default "0" + help + Size (in bytes) of TIPC's internal log buffer, which records the + occurrence of significant events. Can range from 0 to 32768 bytes; + default is 0. + + There is no need to enable the log buffer unless the node will be + managed remotely via TIPC. + +config TIPC_DEBUG + bool "Enable debugging support" + default n + help + Saying Y here enables TIPC debugging capabilities used by developers. + Most users do not need to bother; if unsure, just say N. + + Enabling debugging support causes TIPC to display data about its + internal state when certain abnormal conditions occur. It also + makes it easy for developers to capture additional information of + interest using the dbg() or msg_dbg() macros. + +endif # TIPC diff --git a/net/tipc/Makefile b/net/tipc/Makefile new file mode 100644 index 00000000..521d24d0 --- /dev/null +++ b/net/tipc/Makefile @@ -0,0 +1,13 @@ +# +# Makefile for the Linux TIPC layer +# + +obj-$(CONFIG_TIPC) := tipc.o + +tipc-y += addr.o bcast.o bearer.o config.o \ + core.o handler.o link.o discover.o msg.o \ + name_distr.o subscr.o name_table.o net.o \ + netlink.o node.o node_subscr.o port.o ref.o \ + socket.o log.o eth_media.o + +# End of file diff --git a/net/tipc/addr.c b/net/tipc/addr.c new file mode 100644 index 00000000..a6fdab33 --- /dev/null +++ b/net/tipc/addr.c @@ -0,0 +1,106 @@ +/* + * net/tipc/addr.c: TIPC address utility routines + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2004-2005, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "addr.h" + +/** + * tipc_addr_domain_valid - validates a network domain address + * + * Accepts , , , and <0.0.0>, + * where Z, C, and N are non-zero. 
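+ * For example, <1.1.10> (a specific node), <1.1.0> (a cluster), <1.0.0> (a zone)
+ * and <0.0.0> (the entire network) are all valid domains.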
+ * + * Returns 1 if domain address is valid, otherwise 0 + */ + +int tipc_addr_domain_valid(u32 addr) +{ + u32 n = tipc_node(addr); + u32 c = tipc_cluster(addr); + u32 z = tipc_zone(addr); + + if (n && (!z || !c)) + return 0; + if (c && !z) + return 0; + return 1; +} + +/** + * tipc_addr_node_valid - validates a proposed network address for this node + * + * Accepts , where Z, C, and N are non-zero. + * + * Returns 1 if address can be used, otherwise 0 + */ + +int tipc_addr_node_valid(u32 addr) +{ + return tipc_addr_domain_valid(addr) && tipc_node(addr); +} + +int tipc_in_scope(u32 domain, u32 addr) +{ + if (!domain || (domain == addr)) + return 1; + if (domain == tipc_cluster_mask(addr)) /* domain */ + return 1; + if (domain == tipc_zone_mask(addr)) /* domain */ + return 1; + return 0; +} + +/** + * tipc_addr_scope - convert message lookup domain to a 2-bit scope value + */ + +int tipc_addr_scope(u32 domain) +{ + if (likely(!domain)) + return TIPC_ZONE_SCOPE; + if (tipc_node(domain)) + return TIPC_NODE_SCOPE; + if (tipc_cluster(domain)) + return TIPC_CLUSTER_SCOPE; + return TIPC_ZONE_SCOPE; +} + +char *tipc_addr_string_fill(char *string, u32 addr) +{ + snprintf(string, 16, "<%u.%u.%u>", + tipc_zone(addr), tipc_cluster(addr), tipc_node(addr)); + return string; +} diff --git a/net/tipc/addr.h b/net/tipc/addr.h new file mode 100644 index 00000000..e4f35afe --- /dev/null +++ b/net/tipc/addr.h @@ -0,0 +1,79 @@ +/* + * net/tipc/addr.h: Include file for TIPC address utility routines + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2004-2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _TIPC_ADDR_H +#define _TIPC_ADDR_H + +#define TIPC_ZONE_MASK 0xff000000u +#define TIPC_CLUSTER_MASK 0xfffff000u + +static inline u32 tipc_zone_mask(u32 addr) +{ + return addr & TIPC_ZONE_MASK; +} + +static inline u32 tipc_cluster_mask(u32 addr) +{ + return addr & TIPC_CLUSTER_MASK; +} + +static inline int in_own_cluster(u32 addr) +{ + return !((addr ^ tipc_own_addr) >> 12); +} + +/** + * addr_domain - convert 2-bit scope value to equivalent message lookup domain + * + * Needed when address of a named message must be looked up a second time + * after a network hop. + */ + +static inline u32 addr_domain(u32 sc) +{ + if (likely(sc == TIPC_NODE_SCOPE)) + return tipc_own_addr; + if (sc == TIPC_CLUSTER_SCOPE) + return tipc_cluster_mask(tipc_own_addr); + return tipc_zone_mask(tipc_own_addr); +} + +int tipc_addr_domain_valid(u32); +int tipc_addr_node_valid(u32 addr); +int tipc_in_scope(u32 domain, u32 addr); +int tipc_addr_scope(u32 domain); +char *tipc_addr_string_fill(char *string, u32 addr); +#endif diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c new file mode 100644 index 00000000..fa68d1e9 --- /dev/null +++ b/net/tipc/bcast.c @@ -0,0 +1,914 @@ +/* + * net/tipc/bcast.c: TIPC broadcast code + * + * Copyright (c) 2004-2006, Ericsson AB + * Copyright (c) 2004, Intel Corporation. + * Copyright (c) 2005, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "core.h" +#include "link.h" +#include "port.h" +#include "bcast.h" + +#define MAX_PKT_DEFAULT_MCAST 1500 /* bcast link max packet size (fixed) */ + +#define BCLINK_WIN_DEFAULT 20 /* bcast link window size (default) */ + +/** + * struct bcbearer_pair - a pair of bearers used by broadcast link + * @primary: pointer to primary bearer + * @secondary: pointer to secondary bearer + * + * Bearers must have same priority and same set of reachable destinations + * to be paired. + */ + +struct bcbearer_pair { + struct tipc_bearer *primary; + struct tipc_bearer *secondary; +}; + +/** + * struct bcbearer - bearer used by broadcast link + * @bearer: (non-standard) broadcast bearer structure + * @media: (non-standard) broadcast media structure + * @bpairs: array of bearer pairs + * @bpairs_temp: temporary array of bearer pairs used by tipc_bcbearer_sort() + * @remains: temporary node map used by tipc_bcbearer_send() + * @remains_new: temporary node map used tipc_bcbearer_send() + * + * Note: The fields labelled "temporary" are incorporated into the bearer + * to avoid consuming potentially limited stack space through the use of + * large local variables within multicast routines. Concurrent access is + * prevented through use of the spinlock "bc_lock". + */ + +struct bcbearer { + struct tipc_bearer bearer; + struct media media; + struct bcbearer_pair bpairs[MAX_BEARERS]; + struct bcbearer_pair bpairs_temp[TIPC_MAX_LINK_PRI + 1]; + struct tipc_node_map remains; + struct tipc_node_map remains_new; +}; + +/** + * struct bclink - link used for broadcast messages + * @link: (non-standard) broadcast link structure + * @node: (non-standard) node structure representing b'cast link's peer node + * @retransmit_to: node that most recently requested a retransmit + * + * Handles sequence numbering, fragmentation, bundling, etc. 
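+ * A single instance of this structure backs the cluster-wide broadcast link;
+ * its embedded pseudo-node serves as the link's owner.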
+ */ + +struct bclink { + struct link link; + struct tipc_node node; + struct tipc_node *retransmit_to; +}; + + +static struct bcbearer *bcbearer; +static struct bclink *bclink; +static struct link *bcl; +static DEFINE_SPINLOCK(bc_lock); + +/* broadcast-capable node map */ +struct tipc_node_map tipc_bcast_nmap; + +const char tipc_bclink_name[] = "broadcast-link"; + +static void tipc_nmap_diff(struct tipc_node_map *nm_a, + struct tipc_node_map *nm_b, + struct tipc_node_map *nm_diff); + +static u32 buf_seqno(struct sk_buff *buf) +{ + return msg_seqno(buf_msg(buf)); +} + +static u32 bcbuf_acks(struct sk_buff *buf) +{ + return (u32)(unsigned long)TIPC_SKB_CB(buf)->handle; +} + +static void bcbuf_set_acks(struct sk_buff *buf, u32 acks) +{ + TIPC_SKB_CB(buf)->handle = (void *)(unsigned long)acks; +} + +static void bcbuf_decr_acks(struct sk_buff *buf) +{ + bcbuf_set_acks(buf, bcbuf_acks(buf) - 1); +} + + +static void bclink_set_last_sent(void) +{ + if (bcl->next_out) + bcl->fsm_msg_cnt = mod(buf_seqno(bcl->next_out) - 1); + else + bcl->fsm_msg_cnt = mod(bcl->next_out_no - 1); +} + +u32 tipc_bclink_get_last_sent(void) +{ + return bcl->fsm_msg_cnt; +} + +/** + * bclink_set_gap - set gap according to contents of current deferred pkt queue + * + * Called with 'node' locked, bc_lock unlocked + */ + +static void bclink_set_gap(struct tipc_node *n_ptr) +{ + struct sk_buff *buf = n_ptr->bclink.deferred_head; + + n_ptr->bclink.gap_after = n_ptr->bclink.gap_to = + mod(n_ptr->bclink.last_in); + if (unlikely(buf != NULL)) + n_ptr->bclink.gap_to = mod(buf_seqno(buf) - 1); +} + +/** + * bclink_ack_allowed - test if ACK or NACK message can be sent at this moment + * + * This mechanism endeavours to prevent all nodes in network from trying + * to ACK or NACK at the same time. + * + * Note: TIPC uses a different trigger to distribute ACKs than it does to + * distribute NACKs, but tries to use the same spacing (divide by 16). + */ + +static int bclink_ack_allowed(u32 n) +{ + return (n % TIPC_MIN_LINK_WIN) == tipc_own_tag; +} + + +/** + * tipc_bclink_retransmit_to - get most recent node to request retransmission + * + * Called with bc_lock locked + */ + +struct tipc_node *tipc_bclink_retransmit_to(void) +{ + return bclink->retransmit_to; +} + +/** + * bclink_retransmit_pkt - retransmit broadcast packets + * @after: sequence number of last packet to *not* retransmit + * @to: sequence number of last packet to retransmit + * + * Called with bc_lock locked + */ + +static void bclink_retransmit_pkt(u32 after, u32 to) +{ + struct sk_buff *buf; + + buf = bcl->first_out; + while (buf && less_eq(buf_seqno(buf), after)) + buf = buf->next; + tipc_link_retransmit(bcl, buf, mod(to - after)); +} + +/** + * tipc_bclink_acknowledge - handle acknowledgement of broadcast packets + * @n_ptr: node that sent acknowledgement info + * @acked: broadcast sequence # that has been acknowledged + * + * Node is locked, bc_lock unlocked. 
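+ * Each queued broadcast buffer carries a count of destinations that have not
+ * yet acknowledged it (see bcbuf_acks()); the buffer is released once that
+ * count drops to zero.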
+ */ + +void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked) +{ + struct sk_buff *crs; + struct sk_buff *next; + unsigned int released = 0; + + if (less_eq(acked, n_ptr->bclink.acked)) + return; + + spin_lock_bh(&bc_lock); + + /* Skip over packets that node has previously acknowledged */ + + crs = bcl->first_out; + while (crs && less_eq(buf_seqno(crs), n_ptr->bclink.acked)) + crs = crs->next; + + /* Update packets that node is now acknowledging */ + + while (crs && less_eq(buf_seqno(crs), acked)) { + next = crs->next; + bcbuf_decr_acks(crs); + if (bcbuf_acks(crs) == 0) { + bcl->first_out = next; + bcl->out_queue_size--; + buf_discard(crs); + released = 1; + } + crs = next; + } + n_ptr->bclink.acked = acked; + + /* Try resolving broadcast link congestion, if necessary */ + + if (unlikely(bcl->next_out)) { + tipc_link_push_queue(bcl); + bclink_set_last_sent(); + } + if (unlikely(released && !list_empty(&bcl->waiting_ports))) + tipc_link_wakeup_ports(bcl, 0); + spin_unlock_bh(&bc_lock); +} + +/** + * bclink_send_ack - unicast an ACK msg + * + * tipc_net_lock and node lock set + */ + +static void bclink_send_ack(struct tipc_node *n_ptr) +{ + struct link *l_ptr = n_ptr->active_links[n_ptr->addr & 1]; + + if (l_ptr != NULL) + tipc_link_send_proto_msg(l_ptr, STATE_MSG, 0, 0, 0, 0, 0); +} + +/** + * bclink_send_nack- broadcast a NACK msg + * + * tipc_net_lock and node lock set + */ + +static void bclink_send_nack(struct tipc_node *n_ptr) +{ + struct sk_buff *buf; + struct tipc_msg *msg; + + if (!less(n_ptr->bclink.gap_after, n_ptr->bclink.gap_to)) + return; + + buf = tipc_buf_acquire(INT_H_SIZE); + if (buf) { + msg = buf_msg(buf); + tipc_msg_init(msg, BCAST_PROTOCOL, STATE_MSG, + INT_H_SIZE, n_ptr->addr); + msg_set_non_seq(msg, 1); + msg_set_mc_netid(msg, tipc_net_id); + msg_set_bcast_ack(msg, mod(n_ptr->bclink.last_in)); + msg_set_bcgap_after(msg, n_ptr->bclink.gap_after); + msg_set_bcgap_to(msg, n_ptr->bclink.gap_to); + msg_set_bcast_tag(msg, tipc_own_tag); + + if (tipc_bearer_send(&bcbearer->bearer, buf, NULL)) { + bcl->stats.sent_nacks++; + buf_discard(buf); + } else { + tipc_bearer_schedule(bcl->b_ptr, bcl); + bcl->proto_msg_queue = buf; + bcl->stats.bearer_congs++; + } + + /* + * Ensure we doesn't send another NACK msg to the node + * until 16 more deferred messages arrive from it + * (i.e. helps prevent all nodes from NACK'ing at same time) + */ + + n_ptr->bclink.nack_sync = tipc_own_tag; + } +} + +/** + * tipc_bclink_check_gap - send a NACK if a sequence gap exists + * + * tipc_net_lock and node lock set + */ + +void tipc_bclink_check_gap(struct tipc_node *n_ptr, u32 last_sent) +{ + if (!n_ptr->bclink.supported || + less_eq(last_sent, mod(n_ptr->bclink.last_in))) + return; + + bclink_set_gap(n_ptr); + if (n_ptr->bclink.gap_after == n_ptr->bclink.gap_to) + n_ptr->bclink.gap_to = last_sent; + bclink_send_nack(n_ptr); +} + +/** + * tipc_bclink_peek_nack - process a NACK msg meant for another node + * + * Only tipc_net_lock set. 
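+ * Overhearing another node's NACK lets this node adjust its own gap record
+ * and so avoid sending a redundant NACK for the same packets.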
+ */ + +static void tipc_bclink_peek_nack(u32 dest, u32 sender_tag, u32 gap_after, u32 gap_to) +{ + struct tipc_node *n_ptr = tipc_node_find(dest); + u32 my_after, my_to; + + if (unlikely(!n_ptr || !tipc_node_is_up(n_ptr))) + return; + tipc_node_lock(n_ptr); + /* + * Modify gap to suppress unnecessary NACKs from this node + */ + my_after = n_ptr->bclink.gap_after; + my_to = n_ptr->bclink.gap_to; + + if (less_eq(gap_after, my_after)) { + if (less(my_after, gap_to) && less(gap_to, my_to)) + n_ptr->bclink.gap_after = gap_to; + else if (less_eq(my_to, gap_to)) + n_ptr->bclink.gap_to = n_ptr->bclink.gap_after; + } else if (less_eq(gap_after, my_to)) { + if (less_eq(my_to, gap_to)) + n_ptr->bclink.gap_to = gap_after; + } else { + /* + * Expand gap if missing bufs not in deferred queue: + */ + struct sk_buff *buf = n_ptr->bclink.deferred_head; + u32 prev = n_ptr->bclink.gap_to; + + for (; buf; buf = buf->next) { + u32 seqno = buf_seqno(buf); + + if (mod(seqno - prev) != 1) { + buf = NULL; + break; + } + if (seqno == gap_after) + break; + prev = seqno; + } + if (buf == NULL) + n_ptr->bclink.gap_to = gap_after; + } + /* + * Some nodes may send a complementary NACK now: + */ + if (bclink_ack_allowed(sender_tag + 1)) { + if (n_ptr->bclink.gap_to != n_ptr->bclink.gap_after) { + bclink_send_nack(n_ptr); + bclink_set_gap(n_ptr); + } + } + tipc_node_unlock(n_ptr); +} + +/** + * tipc_bclink_send_msg - broadcast a packet to all nodes in cluster + */ + +int tipc_bclink_send_msg(struct sk_buff *buf) +{ + int res; + + spin_lock_bh(&bc_lock); + + res = tipc_link_send_buf(bcl, buf); + if (likely(res > 0)) + bclink_set_last_sent(); + + bcl->stats.queue_sz_counts++; + bcl->stats.accu_queue_sz += bcl->out_queue_size; + + spin_unlock_bh(&bc_lock); + return res; +} + +/** + * tipc_bclink_recv_pkt - receive a broadcast packet, and deliver upwards + * + * tipc_net_lock is read_locked, no other locks set + */ + +void tipc_bclink_recv_pkt(struct sk_buff *buf) +{ + struct tipc_msg *msg = buf_msg(buf); + struct tipc_node *node = tipc_node_find(msg_prevnode(msg)); + u32 next_in; + u32 seqno; + struct sk_buff *deferred; + + if (unlikely(!node || !tipc_node_is_up(node) || !node->bclink.supported || + (msg_mc_netid(msg) != tipc_net_id))) { + buf_discard(buf); + return; + } + + if (unlikely(msg_user(msg) == BCAST_PROTOCOL)) { + if (msg_destnode(msg) == tipc_own_addr) { + tipc_node_lock(node); + tipc_bclink_acknowledge(node, msg_bcast_ack(msg)); + tipc_node_unlock(node); + spin_lock_bh(&bc_lock); + bcl->stats.recv_nacks++; + bclink->retransmit_to = node; + bclink_retransmit_pkt(msg_bcgap_after(msg), + msg_bcgap_to(msg)); + spin_unlock_bh(&bc_lock); + } else { + tipc_bclink_peek_nack(msg_destnode(msg), + msg_bcast_tag(msg), + msg_bcgap_after(msg), + msg_bcgap_to(msg)); + } + buf_discard(buf); + return; + } + + tipc_node_lock(node); +receive: + deferred = node->bclink.deferred_head; + next_in = mod(node->bclink.last_in + 1); + seqno = msg_seqno(msg); + + if (likely(seqno == next_in)) { + bcl->stats.recv_info++; + node->bclink.last_in++; + bclink_set_gap(node); + if (unlikely(bclink_ack_allowed(seqno))) { + bclink_send_ack(node); + bcl->stats.sent_acks++; + } + if (likely(msg_isdata(msg))) { + tipc_node_unlock(node); + tipc_port_recv_mcast(buf, NULL); + } else if (msg_user(msg) == MSG_BUNDLER) { + bcl->stats.recv_bundles++; + bcl->stats.recv_bundled += msg_msgcnt(msg); + tipc_node_unlock(node); + tipc_link_recv_bundle(buf); + } else if (msg_user(msg) == MSG_FRAGMENTER) { + bcl->stats.recv_fragments++; + if 
(tipc_link_recv_fragment(&node->bclink.defragm, + &buf, &msg)) + bcl->stats.recv_fragmented++; + tipc_node_unlock(node); + tipc_net_route_msg(buf); + } else { + tipc_node_unlock(node); + tipc_net_route_msg(buf); + } + if (deferred && (buf_seqno(deferred) == mod(next_in + 1))) { + tipc_node_lock(node); + buf = deferred; + msg = buf_msg(buf); + node->bclink.deferred_head = deferred->next; + goto receive; + } + return; + } else if (less(next_in, seqno)) { + u32 gap_after = node->bclink.gap_after; + u32 gap_to = node->bclink.gap_to; + + if (tipc_link_defer_pkt(&node->bclink.deferred_head, + &node->bclink.deferred_tail, + buf)) { + node->bclink.nack_sync++; + bcl->stats.deferred_recv++; + if (seqno == mod(gap_after + 1)) + node->bclink.gap_after = seqno; + else if (less(gap_after, seqno) && less(seqno, gap_to)) + node->bclink.gap_to = seqno; + } + if (bclink_ack_allowed(node->bclink.nack_sync)) { + if (gap_to != gap_after) + bclink_send_nack(node); + bclink_set_gap(node); + } + } else { + bcl->stats.duplicates++; + buf_discard(buf); + } + tipc_node_unlock(node); +} + +u32 tipc_bclink_acks_missing(struct tipc_node *n_ptr) +{ + return (n_ptr->bclink.supported && + (tipc_bclink_get_last_sent() != n_ptr->bclink.acked)); +} + + +/** + * tipc_bcbearer_send - send a packet through the broadcast pseudo-bearer + * + * Send through as many bearers as necessary to reach all nodes + * that support TIPC multicasting. + * + * Returns 0 if packet sent successfully, non-zero if not + */ + +static int tipc_bcbearer_send(struct sk_buff *buf, + struct tipc_bearer *unused1, + struct tipc_media_addr *unused2) +{ + int bp_index; + + /* Prepare buffer for broadcasting (if first time trying to send it) */ + + if (likely(!msg_non_seq(buf_msg(buf)))) { + struct tipc_msg *msg; + + assert(tipc_bcast_nmap.count != 0); + bcbuf_set_acks(buf, tipc_bcast_nmap.count); + msg = buf_msg(buf); + msg_set_non_seq(msg, 1); + msg_set_mc_netid(msg, tipc_net_id); + bcl->stats.sent_info++; + } + + /* Send buffer over bearers until all targets reached */ + + bcbearer->remains = tipc_bcast_nmap; + + for (bp_index = 0; bp_index < MAX_BEARERS; bp_index++) { + struct tipc_bearer *p = bcbearer->bpairs[bp_index].primary; + struct tipc_bearer *s = bcbearer->bpairs[bp_index].secondary; + + if (!p) + break; /* no more bearers to try */ + + tipc_nmap_diff(&bcbearer->remains, &p->nodes, &bcbearer->remains_new); + if (bcbearer->remains_new.count == bcbearer->remains.count) + continue; /* bearer pair doesn't add anything */ + + if (p->blocked || + p->media->send_msg(buf, p, &p->media->bcast_addr)) { + /* unable to send on primary bearer */ + if (!s || s->blocked || + s->media->send_msg(buf, s, + &s->media->bcast_addr)) { + /* unable to send on either bearer */ + continue; + } + } + + if (s) { + bcbearer->bpairs[bp_index].primary = s; + bcbearer->bpairs[bp_index].secondary = p; + } + + if (bcbearer->remains_new.count == 0) + return 0; + + bcbearer->remains = bcbearer->remains_new; + } + + /* + * Unable to reach all targets (indicate success, since currently + * there isn't code in place to properly block & unblock the + * pseudo-bearer used by the broadcast link) + */ + + return TIPC_OK; +} + +/** + * tipc_bcbearer_sort - create sets of bearer pairs used by broadcast bearer + */ + +void tipc_bcbearer_sort(void) +{ + struct bcbearer_pair *bp_temp = bcbearer->bpairs_temp; + struct bcbearer_pair *bp_curr; + int b_index; + int pri; + + spin_lock_bh(&bc_lock); + + /* Group bearers by priority (can assume max of two per priority) */ + + memset(bp_temp, 0, 
sizeof(bcbearer->bpairs_temp)); + + for (b_index = 0; b_index < MAX_BEARERS; b_index++) { + struct tipc_bearer *b = &tipc_bearers[b_index]; + + if (!b->active || !b->nodes.count) + continue; + + if (!bp_temp[b->priority].primary) + bp_temp[b->priority].primary = b; + else + bp_temp[b->priority].secondary = b; + } + + /* Create array of bearer pairs for broadcasting */ + + bp_curr = bcbearer->bpairs; + memset(bcbearer->bpairs, 0, sizeof(bcbearer->bpairs)); + + for (pri = TIPC_MAX_LINK_PRI; pri >= 0; pri--) { + + if (!bp_temp[pri].primary) + continue; + + bp_curr->primary = bp_temp[pri].primary; + + if (bp_temp[pri].secondary) { + if (tipc_nmap_equal(&bp_temp[pri].primary->nodes, + &bp_temp[pri].secondary->nodes)) { + bp_curr->secondary = bp_temp[pri].secondary; + } else { + bp_curr++; + bp_curr->primary = bp_temp[pri].secondary; + } + } + + bp_curr++; + } + + spin_unlock_bh(&bc_lock); +} + +/** + * tipc_bcbearer_push - resolve bearer congestion + * + * Forces bclink to push out any unsent packets, until all packets are gone + * or congestion reoccurs. + * No locks set when function called + */ + +void tipc_bcbearer_push(void) +{ + struct tipc_bearer *b_ptr; + + spin_lock_bh(&bc_lock); + b_ptr = &bcbearer->bearer; + if (b_ptr->blocked) { + b_ptr->blocked = 0; + tipc_bearer_lock_push(b_ptr); + } + spin_unlock_bh(&bc_lock); +} + + +int tipc_bclink_stats(char *buf, const u32 buf_size) +{ + struct print_buf pb; + + if (!bcl) + return 0; + + tipc_printbuf_init(&pb, buf, buf_size); + + spin_lock_bh(&bc_lock); + + tipc_printf(&pb, "Link <%s>\n" + " Window:%u packets\n", + bcl->name, bcl->queue_limit[0]); + tipc_printf(&pb, " RX packets:%u fragments:%u/%u bundles:%u/%u\n", + bcl->stats.recv_info, + bcl->stats.recv_fragments, + bcl->stats.recv_fragmented, + bcl->stats.recv_bundles, + bcl->stats.recv_bundled); + tipc_printf(&pb, " TX packets:%u fragments:%u/%u bundles:%u/%u\n", + bcl->stats.sent_info, + bcl->stats.sent_fragments, + bcl->stats.sent_fragmented, + bcl->stats.sent_bundles, + bcl->stats.sent_bundled); + tipc_printf(&pb, " RX naks:%u defs:%u dups:%u\n", + bcl->stats.recv_nacks, + bcl->stats.deferred_recv, + bcl->stats.duplicates); + tipc_printf(&pb, " TX naks:%u acks:%u dups:%u\n", + bcl->stats.sent_nacks, + bcl->stats.sent_acks, + bcl->stats.retransmitted); + tipc_printf(&pb, " Congestion bearer:%u link:%u Send queue max:%u avg:%u\n", + bcl->stats.bearer_congs, + bcl->stats.link_congs, + bcl->stats.max_queue_sz, + bcl->stats.queue_sz_counts + ? 
(bcl->stats.accu_queue_sz / bcl->stats.queue_sz_counts) + : 0); + + spin_unlock_bh(&bc_lock); + return tipc_printbuf_validate(&pb); +} + +int tipc_bclink_reset_stats(void) +{ + if (!bcl) + return -ENOPROTOOPT; + + spin_lock_bh(&bc_lock); + memset(&bcl->stats, 0, sizeof(bcl->stats)); + spin_unlock_bh(&bc_lock); + return 0; +} + +int tipc_bclink_set_queue_limits(u32 limit) +{ + if (!bcl) + return -ENOPROTOOPT; + if ((limit < TIPC_MIN_LINK_WIN) || (limit > TIPC_MAX_LINK_WIN)) + return -EINVAL; + + spin_lock_bh(&bc_lock); + tipc_link_set_queue_limits(bcl, limit); + spin_unlock_bh(&bc_lock); + return 0; +} + +int tipc_bclink_init(void) +{ + bcbearer = kzalloc(sizeof(*bcbearer), GFP_ATOMIC); + bclink = kzalloc(sizeof(*bclink), GFP_ATOMIC); + if (!bcbearer || !bclink) { + warn("Multicast link creation failed, no memory\n"); + kfree(bcbearer); + bcbearer = NULL; + kfree(bclink); + bclink = NULL; + return -ENOMEM; + } + + INIT_LIST_HEAD(&bcbearer->bearer.cong_links); + bcbearer->bearer.media = &bcbearer->media; + bcbearer->media.send_msg = tipc_bcbearer_send; + sprintf(bcbearer->media.name, "tipc-multicast"); + + bcl = &bclink->link; + INIT_LIST_HEAD(&bcl->waiting_ports); + bcl->next_out_no = 1; + spin_lock_init(&bclink->node.lock); + bcl->owner = &bclink->node; + bcl->max_pkt = MAX_PKT_DEFAULT_MCAST; + tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT); + bcl->b_ptr = &bcbearer->bearer; + bcl->state = WORKING_WORKING; + strlcpy(bcl->name, tipc_bclink_name, TIPC_MAX_LINK_NAME); + + return 0; +} + +void tipc_bclink_stop(void) +{ + spin_lock_bh(&bc_lock); + if (bcbearer) { + tipc_link_stop(bcl); + bcl = NULL; + kfree(bclink); + bclink = NULL; + kfree(bcbearer); + bcbearer = NULL; + } + spin_unlock_bh(&bc_lock); +} + + +/** + * tipc_nmap_add - add a node to a node map + */ + +void tipc_nmap_add(struct tipc_node_map *nm_ptr, u32 node) +{ + int n = tipc_node(node); + int w = n / WSIZE; + u32 mask = (1 << (n % WSIZE)); + + if ((nm_ptr->map[w] & mask) == 0) { + nm_ptr->count++; + nm_ptr->map[w] |= mask; + } +} + +/** + * tipc_nmap_remove - remove a node from a node map + */ + +void tipc_nmap_remove(struct tipc_node_map *nm_ptr, u32 node) +{ + int n = tipc_node(node); + int w = n / WSIZE; + u32 mask = (1 << (n % WSIZE)); + + if ((nm_ptr->map[w] & mask) != 0) { + nm_ptr->map[w] &= ~mask; + nm_ptr->count--; + } +} + +/** + * tipc_nmap_diff - find differences between node maps + * @nm_a: input node map A + * @nm_b: input node map B + * @nm_diff: output node map A-B (i.e. 
nodes of A that are not in B) + */ + +static void tipc_nmap_diff(struct tipc_node_map *nm_a, + struct tipc_node_map *nm_b, + struct tipc_node_map *nm_diff) +{ + int stop = ARRAY_SIZE(nm_a->map); + int w; + int b; + u32 map; + + memset(nm_diff, 0, sizeof(*nm_diff)); + for (w = 0; w < stop; w++) { + map = nm_a->map[w] ^ (nm_a->map[w] & nm_b->map[w]); + nm_diff->map[w] = map; + if (map != 0) { + for (b = 0 ; b < WSIZE; b++) { + if (map & (1 << b)) + nm_diff->count++; + } + } + } +} + +/** + * tipc_port_list_add - add a port to a port list, ensuring no duplicates + */ + +void tipc_port_list_add(struct port_list *pl_ptr, u32 port) +{ + struct port_list *item = pl_ptr; + int i; + int item_sz = PLSIZE; + int cnt = pl_ptr->count; + + for (; ; cnt -= item_sz, item = item->next) { + if (cnt < PLSIZE) + item_sz = cnt; + for (i = 0; i < item_sz; i++) + if (item->ports[i] == port) + return; + if (i < PLSIZE) { + item->ports[i] = port; + pl_ptr->count++; + return; + } + if (!item->next) { + item->next = kmalloc(sizeof(*item), GFP_ATOMIC); + if (!item->next) { + warn("Incomplete multicast delivery, no memory\n"); + return; + } + item->next->next = NULL; + } + } +} + +/** + * tipc_port_list_free - free dynamically created entries in port_list chain + * + */ + +void tipc_port_list_free(struct port_list *pl_ptr) +{ + struct port_list *item; + struct port_list *next; + + for (item = pl_ptr->next; item; item = next) { + next = item->next; + kfree(item); + } +} + diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h new file mode 100644 index 00000000..500c97f1 --- /dev/null +++ b/net/tipc/bcast.h @@ -0,0 +1,106 @@ +/* + * net/tipc/bcast.h: Include file for TIPC broadcast code + * + * Copyright (c) 2003-2006, Ericsson AB + * Copyright (c) 2005, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _TIPC_BCAST_H +#define _TIPC_BCAST_H + +#define MAX_NODES 4096 +#define WSIZE 32 + +/** + * struct tipc_node_map - set of node identifiers + * @count: # of nodes in set + * @map: bitmap of node identifiers that are in the set + */ + +struct tipc_node_map { + u32 count; + u32 map[MAX_NODES / WSIZE]; +}; + +extern struct tipc_node_map tipc_bcast_nmap; + +#define PLSIZE 32 + +/** + * struct port_list - set of node local destination ports + * @count: # of ports in set (only valid for first entry in list) + * @next: pointer to next entry in list + * @ports: array of port references + */ + +struct port_list { + int count; + struct port_list *next; + u32 ports[PLSIZE]; +}; + + +struct tipc_node; + +extern const char tipc_bclink_name[]; + +void tipc_nmap_add(struct tipc_node_map *nm_ptr, u32 node); +void tipc_nmap_remove(struct tipc_node_map *nm_ptr, u32 node); + +/** + * tipc_nmap_equal - test for equality of node maps + */ + +static inline int tipc_nmap_equal(struct tipc_node_map *nm_a, struct tipc_node_map *nm_b) +{ + return !memcmp(nm_a, nm_b, sizeof(*nm_a)); +} + +void tipc_port_list_add(struct port_list *pl_ptr, u32 port); +void tipc_port_list_free(struct port_list *pl_ptr); + +int tipc_bclink_init(void); +void tipc_bclink_stop(void); +struct tipc_node *tipc_bclink_retransmit_to(void); +void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked); +int tipc_bclink_send_msg(struct sk_buff *buf); +void tipc_bclink_recv_pkt(struct sk_buff *buf); +u32 tipc_bclink_get_last_sent(void); +u32 tipc_bclink_acks_missing(struct tipc_node *n_ptr); +void tipc_bclink_check_gap(struct tipc_node *n_ptr, u32 seqno); +int tipc_bclink_stats(char *stats_buf, const u32 buf_size); +int tipc_bclink_reset_stats(void); +int tipc_bclink_set_queue_limits(u32 limit); +void tipc_bcbearer_sort(void); +void tipc_bcbearer_push(void); + +#endif diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c new file mode 100644 index 00000000..85209ead --- /dev/null +++ b/net/tipc/bearer.c @@ -0,0 +1,677 @@ +/* + * net/tipc/bearer.c: TIPC bearer code + * + * Copyright (c) 1996-2006, Ericsson AB + * Copyright (c) 2004-2006, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "config.h" +#include "bearer.h" +#include "discover.h" + +#define MAX_ADDR_STR 32 + +static struct media media_list[MAX_MEDIA]; +static u32 media_count; + +struct tipc_bearer tipc_bearers[MAX_BEARERS]; + +static void bearer_disable(struct tipc_bearer *b_ptr); + +/** + * media_name_valid - validate media name + * + * Returns 1 if media name is valid, otherwise 0. + */ + +static int media_name_valid(const char *name) +{ + u32 len; + + len = strlen(name); + if ((len + 1) > TIPC_MAX_MEDIA_NAME) + return 0; + return strspn(name, tipc_alphabet) == len; +} + +/** + * media_find - locates specified media object by name + */ + +static struct media *media_find(const char *name) +{ + struct media *m_ptr; + u32 i; + + for (i = 0, m_ptr = media_list; i < media_count; i++, m_ptr++) { + if (!strcmp(m_ptr->name, name)) + return m_ptr; + } + return NULL; +} + +/** + * tipc_register_media - register a media type + * + * Bearers for this media type must be activated separately at a later stage. + */ + +int tipc_register_media(u32 media_type, + char *name, + int (*enable)(struct tipc_bearer *), + void (*disable)(struct tipc_bearer *), + int (*send_msg)(struct sk_buff *, + struct tipc_bearer *, + struct tipc_media_addr *), + char *(*addr2str)(struct tipc_media_addr *a, + char *str_buf, int str_size), + struct tipc_media_addr *bcast_addr, + const u32 bearer_priority, + const u32 link_tolerance, /* [ms] */ + const u32 send_window_limit) +{ + struct media *m_ptr; + u32 media_id; + u32 i; + int res = -EINVAL; + + write_lock_bh(&tipc_net_lock); + + if (tipc_mode != TIPC_NET_MODE) { + warn("Media <%s> rejected, not in networked mode yet\n", name); + goto exit; + } + if (!media_name_valid(name)) { + warn("Media <%s> rejected, illegal name\n", name); + goto exit; + } + if (!bcast_addr) { + warn("Media <%s> rejected, no broadcast address\n", name); + goto exit; + } + if ((bearer_priority < TIPC_MIN_LINK_PRI) || + (bearer_priority > TIPC_MAX_LINK_PRI)) { + warn("Media <%s> rejected, illegal priority (%u)\n", name, + bearer_priority); + goto exit; + } + if ((link_tolerance < TIPC_MIN_LINK_TOL) || + (link_tolerance > TIPC_MAX_LINK_TOL)) { + warn("Media <%s> rejected, illegal tolerance (%u)\n", name, + link_tolerance); + goto exit; + } + + media_id = media_count++; + if (media_id >= MAX_MEDIA) { + warn("Media <%s> rejected, media limit reached (%u)\n", name, + MAX_MEDIA); + media_count--; + goto exit; + } + for (i = 0; i < media_id; i++) { + if (media_list[i].type_id == media_type) { + warn("Media <%s> rejected, duplicate type (%u)\n", name, + media_type); + media_count--; + goto exit; + } + if (!strcmp(name, media_list[i].name)) { + warn("Media <%s> rejected, duplicate name\n", name); + media_count--; + goto exit; + } + } + + m_ptr = &media_list[media_id]; + m_ptr->type_id = media_type; + m_ptr->send_msg = send_msg; + m_ptr->enable_bearer = enable; + m_ptr->disable_bearer = disable; + m_ptr->addr2str = addr2str; + memcpy(&m_ptr->bcast_addr, bcast_addr, 
sizeof(*bcast_addr)); + strcpy(m_ptr->name, name); + m_ptr->priority = bearer_priority; + m_ptr->tolerance = link_tolerance; + m_ptr->window = send_window_limit; + res = 0; +exit: + write_unlock_bh(&tipc_net_lock); + return res; +} + +/** + * tipc_media_addr_printf - record media address in print buffer + */ + +void tipc_media_addr_printf(struct print_buf *pb, struct tipc_media_addr *a) +{ + struct media *m_ptr; + u32 media_type; + u32 i; + + media_type = ntohl(a->type); + for (i = 0, m_ptr = media_list; i < media_count; i++, m_ptr++) { + if (m_ptr->type_id == media_type) + break; + } + + if ((i < media_count) && (m_ptr->addr2str != NULL)) { + char addr_str[MAX_ADDR_STR]; + + tipc_printf(pb, "%s(%s)", m_ptr->name, + m_ptr->addr2str(a, addr_str, sizeof(addr_str))); + } else { + unchar *addr = (unchar *)&a->dev_addr; + + tipc_printf(pb, "UNKNOWN(%u)", media_type); + for (i = 0; i < (sizeof(*a) - sizeof(a->type)); i++) + tipc_printf(pb, "-%02x", addr[i]); + } +} + +/** + * tipc_media_get_names - record names of registered media in buffer + */ + +struct sk_buff *tipc_media_get_names(void) +{ + struct sk_buff *buf; + struct media *m_ptr; + int i; + + buf = tipc_cfg_reply_alloc(MAX_MEDIA * TLV_SPACE(TIPC_MAX_MEDIA_NAME)); + if (!buf) + return NULL; + + read_lock_bh(&tipc_net_lock); + for (i = 0, m_ptr = media_list; i < media_count; i++, m_ptr++) { + tipc_cfg_append_tlv(buf, TIPC_TLV_MEDIA_NAME, m_ptr->name, + strlen(m_ptr->name) + 1); + } + read_unlock_bh(&tipc_net_lock); + return buf; +} + +/** + * bearer_name_validate - validate & (optionally) deconstruct bearer name + * @name - ptr to bearer name string + * @name_parts - ptr to area for bearer name components (or NULL if not needed) + * + * Returns 1 if bearer name is valid, otherwise 0. + */ + +static int bearer_name_validate(const char *name, + struct bearer_name *name_parts) +{ + char name_copy[TIPC_MAX_BEARER_NAME]; + char *media_name; + char *if_name; + u32 media_len; + u32 if_len; + + /* copy bearer name & ensure length is OK */ + + name_copy[TIPC_MAX_BEARER_NAME - 1] = 0; + /* need above in case non-Posix strncpy() doesn't pad with nulls */ + strncpy(name_copy, name, TIPC_MAX_BEARER_NAME); + if (name_copy[TIPC_MAX_BEARER_NAME - 1] != 0) + return 0; + + /* ensure all component parts of bearer name are present */ + + media_name = name_copy; + if_name = strchr(media_name, ':'); + if (if_name == NULL) + return 0; + *(if_name++) = 0; + media_len = if_name - media_name; + if_len = strlen(if_name) + 1; + + /* validate component parts of bearer name */ + + if ((media_len <= 1) || (media_len > TIPC_MAX_MEDIA_NAME) || + (if_len <= 1) || (if_len > TIPC_MAX_IF_NAME) || + (strspn(media_name, tipc_alphabet) != (media_len - 1)) || + (strspn(if_name, tipc_alphabet) != (if_len - 1))) + return 0; + + /* return bearer name components, if necessary */ + + if (name_parts) { + strcpy(name_parts->media_name, media_name); + strcpy(name_parts->if_name, if_name); + } + return 1; +} + +/** + * bearer_find - locates bearer object with matching bearer name + */ + +static struct tipc_bearer *bearer_find(const char *name) +{ + struct tipc_bearer *b_ptr; + u32 i; + + for (i = 0, b_ptr = tipc_bearers; i < MAX_BEARERS; i++, b_ptr++) { + if (b_ptr->active && (!strcmp(b_ptr->name, name))) + return b_ptr; + } + return NULL; +} + +/** + * tipc_bearer_find_interface - locates bearer object with matching interface name + */ + +struct tipc_bearer *tipc_bearer_find_interface(const char *if_name) +{ + struct tipc_bearer *b_ptr; + char *b_if_name; + u32 i; + + for (i = 0, b_ptr 
= tipc_bearers; i < MAX_BEARERS; i++, b_ptr++) { + if (!b_ptr->active) + continue; + b_if_name = strchr(b_ptr->name, ':') + 1; + if (!strcmp(b_if_name, if_name)) + return b_ptr; + } + return NULL; +} + +/** + * tipc_bearer_get_names - record names of bearers in buffer + */ + +struct sk_buff *tipc_bearer_get_names(void) +{ + struct sk_buff *buf; + struct media *m_ptr; + struct tipc_bearer *b_ptr; + int i, j; + + buf = tipc_cfg_reply_alloc(MAX_BEARERS * TLV_SPACE(TIPC_MAX_BEARER_NAME)); + if (!buf) + return NULL; + + read_lock_bh(&tipc_net_lock); + for (i = 0, m_ptr = media_list; i < media_count; i++, m_ptr++) { + for (j = 0; j < MAX_BEARERS; j++) { + b_ptr = &tipc_bearers[j]; + if (b_ptr->active && (b_ptr->media == m_ptr)) { + tipc_cfg_append_tlv(buf, TIPC_TLV_BEARER_NAME, + b_ptr->name, + strlen(b_ptr->name) + 1); + } + } + } + read_unlock_bh(&tipc_net_lock); + return buf; +} + +void tipc_bearer_add_dest(struct tipc_bearer *b_ptr, u32 dest) +{ + tipc_nmap_add(&b_ptr->nodes, dest); + tipc_bcbearer_sort(); + tipc_disc_add_dest(b_ptr->link_req); +} + +void tipc_bearer_remove_dest(struct tipc_bearer *b_ptr, u32 dest) +{ + tipc_nmap_remove(&b_ptr->nodes, dest); + tipc_bcbearer_sort(); + tipc_disc_remove_dest(b_ptr->link_req); +} + +/* + * bearer_push(): Resolve bearer congestion. Force the waiting + * links to push out their unsent packets, one packet per link + * per iteration, until all packets are gone or congestion reoccurs. + * 'tipc_net_lock' is read_locked when this function is called + * bearer.lock must be taken before calling + * Returns binary true(1) ore false(0) + */ +static int bearer_push(struct tipc_bearer *b_ptr) +{ + u32 res = 0; + struct link *ln, *tln; + + if (b_ptr->blocked) + return 0; + + while (!list_empty(&b_ptr->cong_links) && (res != PUSH_FAILED)) { + list_for_each_entry_safe(ln, tln, &b_ptr->cong_links, link_list) { + res = tipc_link_push_packet(ln); + if (res == PUSH_FAILED) + break; + if (res == PUSH_FINISHED) + list_move_tail(&ln->link_list, &b_ptr->links); + } + } + return list_empty(&b_ptr->cong_links); +} + +void tipc_bearer_lock_push(struct tipc_bearer *b_ptr) +{ + int res; + + spin_lock_bh(&b_ptr->lock); + res = bearer_push(b_ptr); + spin_unlock_bh(&b_ptr->lock); + if (res) + tipc_bcbearer_push(); +} + + +/* + * Interrupt enabling new requests after bearer congestion or blocking: + * See bearer_send(). + */ +void tipc_continue(struct tipc_bearer *b_ptr) +{ + spin_lock_bh(&b_ptr->lock); + b_ptr->continue_count++; + if (!list_empty(&b_ptr->cong_links)) + tipc_k_signal((Handler)tipc_bearer_lock_push, (unsigned long)b_ptr); + b_ptr->blocked = 0; + spin_unlock_bh(&b_ptr->lock); +} + +/* + * Schedule link for sending of messages after the bearer + * has been deblocked by 'continue()'. This method is called + * when somebody tries to send a message via this link while + * the bearer is congested. 'tipc_net_lock' is in read_lock here + * bearer.lock is busy + */ + +static void tipc_bearer_schedule_unlocked(struct tipc_bearer *b_ptr, struct link *l_ptr) +{ + list_move_tail(&l_ptr->link_list, &b_ptr->cong_links); +} + +/* + * Schedule link for sending of messages after the bearer + * has been deblocked by 'continue()'. This method is called + * when somebody tries to send a message via this link while + * the bearer is congested. 
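/*
 * Illustrative sketch, not part of the original patch: bearer_push() above
 * drains congested links one packet per link per pass, stopping when every
 * backlog is empty or a push fails. A minimal user-space model of that
 * round-robin policy; link_t, push_one() and the capacity counter are
 * hypothetical stand-ins for the kernel's link list and media send path.
 */
#include <stdio.h>

#define PUSH_OK     0
#define PUSH_FAILED 1
#define NUM_LINKS   3

typedef struct { int backlog; } link_t;

/* pretend the bearer can accept only 'capacity' more packets */
static int push_one(link_t *l, int *capacity)
{
	if (*capacity == 0)
		return PUSH_FAILED;	/* bearer congested again */
	if (l->backlog > 0) {
		l->backlog--;
		(*capacity)--;
	}
	return PUSH_OK;
}

static int all_empty(const link_t *links, int n)
{
	for (int i = 0; i < n; i++)
		if (links[i].backlog)
			return 0;
	return 1;
}

int main(void)
{
	link_t links[NUM_LINKS] = { {2}, {5}, {1} };
	int capacity = 6;

	/* one packet per waiting link per pass, as in bearer_push() */
	while (!all_empty(links, NUM_LINKS)) {
		int failed = 0;

		for (int i = 0; i < NUM_LINKS && !failed; i++)
			if (links[i].backlog)
				failed = (push_one(&links[i], &capacity) == PUSH_FAILED);
		if (failed)
			break;
	}
	printf("leftover backlogs: %d %d %d\n",
	       links[0].backlog, links[1].backlog, links[2].backlog);
	return 0;
}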
'tipc_net_lock' is in read_lock here, + * bearer.lock is free + */ + +void tipc_bearer_schedule(struct tipc_bearer *b_ptr, struct link *l_ptr) +{ + spin_lock_bh(&b_ptr->lock); + tipc_bearer_schedule_unlocked(b_ptr, l_ptr); + spin_unlock_bh(&b_ptr->lock); +} + + +/* + * tipc_bearer_resolve_congestion(): Check if there is bearer congestion, + * and if there is, try to resolve it before returning. + * 'tipc_net_lock' is read_locked when this function is called + */ +int tipc_bearer_resolve_congestion(struct tipc_bearer *b_ptr, struct link *l_ptr) +{ + int res = 1; + + if (list_empty(&b_ptr->cong_links)) + return 1; + spin_lock_bh(&b_ptr->lock); + if (!bearer_push(b_ptr)) { + tipc_bearer_schedule_unlocked(b_ptr, l_ptr); + res = 0; + } + spin_unlock_bh(&b_ptr->lock); + return res; +} + +/** + * tipc_bearer_congested - determines if bearer is currently congested + */ + +int tipc_bearer_congested(struct tipc_bearer *b_ptr, struct link *l_ptr) +{ + if (unlikely(b_ptr->blocked)) + return 1; + if (likely(list_empty(&b_ptr->cong_links))) + return 0; + return !tipc_bearer_resolve_congestion(b_ptr, l_ptr); +} + +/** + * tipc_enable_bearer - enable bearer with the given name + */ + +int tipc_enable_bearer(const char *name, u32 disc_domain, u32 priority) +{ + struct tipc_bearer *b_ptr; + struct media *m_ptr; + struct bearer_name b_name; + char addr_string[16]; + u32 bearer_id; + u32 with_this_prio; + u32 i; + int res = -EINVAL; + + if (tipc_mode != TIPC_NET_MODE) { + warn("Bearer <%s> rejected, not supported in standalone mode\n", + name); + return -ENOPROTOOPT; + } + if (!bearer_name_validate(name, &b_name)) { + warn("Bearer <%s> rejected, illegal name\n", name); + return -EINVAL; + } + if (tipc_addr_domain_valid(disc_domain) && + (disc_domain != tipc_own_addr)) { + if (tipc_in_scope(disc_domain, tipc_own_addr)) { + disc_domain = tipc_own_addr & TIPC_CLUSTER_MASK; + res = 0; /* accept any node in own cluster */ + } else if (in_own_cluster(disc_domain)) + res = 0; /* accept specified node in own cluster */ + } + if (res) { + warn("Bearer <%s> rejected, illegal discovery domain\n", name); + return -EINVAL; + } + if ((priority < TIPC_MIN_LINK_PRI || + priority > TIPC_MAX_LINK_PRI) && + (priority != TIPC_MEDIA_LINK_PRI)) { + warn("Bearer <%s> rejected, illegal priority\n", name); + return -EINVAL; + } + + write_lock_bh(&tipc_net_lock); + + m_ptr = media_find(b_name.media_name); + if (!m_ptr) { + warn("Bearer <%s> rejected, media <%s> not registered\n", name, + b_name.media_name); + goto exit; + } + + if (priority == TIPC_MEDIA_LINK_PRI) + priority = m_ptr->priority; + +restart: + bearer_id = MAX_BEARERS; + with_this_prio = 1; + for (i = MAX_BEARERS; i-- != 0; ) { + if (!tipc_bearers[i].active) { + bearer_id = i; + continue; + } + if (!strcmp(name, tipc_bearers[i].name)) { + warn("Bearer <%s> rejected, already enabled\n", name); + goto exit; + } + if ((tipc_bearers[i].priority == priority) && + (++with_this_prio > 2)) { + if (priority-- == 0) { + warn("Bearer <%s> rejected, duplicate priority\n", + name); + goto exit; + } + warn("Bearer <%s> priority adjustment required %u->%u\n", + name, priority + 1, priority); + goto restart; + } + } + if (bearer_id >= MAX_BEARERS) { + warn("Bearer <%s> rejected, bearer limit reached (%u)\n", + name, MAX_BEARERS); + goto exit; + } + + b_ptr = &tipc_bearers[bearer_id]; + strcpy(b_ptr->name, name); + res = m_ptr->enable_bearer(b_ptr); + if (res) { + warn("Bearer <%s> rejected, enable failure (%d)\n", name, -res); + goto exit; + } + + b_ptr->identity = bearer_id; + 
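/*
 * Illustrative sketch, not part of the original patch: the enable loop in
 * tipc_enable_bearer() above allows at most two bearers per link priority;
 * a third request at the same priority is bumped down one level and the
 * scan restarts, until a free level is found or the priority bottoms out.
 * A small user-space model of that restart policy (names, values and the
 * bottom-out check are simplified and hypothetical).
 */
#include <stdio.h>

#define MIN_LINK_PRI 1	/* hypothetical lower bound for this model */

/* returns the priority actually granted, or -1 if none is available */
static int pick_priority(const int *existing, int n, int want)
{
restart:
	{
		int with_this_prio = 1;	/* counts the new bearer itself */

		for (int i = 0; i < n; i++) {
			if (existing[i] == want && ++with_this_prio > 2) {
				if (want-- <= MIN_LINK_PRI)
					return -1;	/* duplicate priority, no room left */
				goto restart;		/* retry one level lower */
			}
		}
	}
	return want;
}

int main(void)
{
	int used[] = { 10, 10, 9 };

	/* two bearers already at 10, so the request is adjusted downward */
	printf("granted priority: %d\n", pick_priority(used, 3, 10)); /* -> 9 */
	return 0;
}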
b_ptr->media = m_ptr; + b_ptr->net_plane = bearer_id + 'A'; + b_ptr->active = 1; + b_ptr->priority = priority; + INIT_LIST_HEAD(&b_ptr->cong_links); + INIT_LIST_HEAD(&b_ptr->links); + spin_lock_init(&b_ptr->lock); + + res = tipc_disc_create(b_ptr, &m_ptr->bcast_addr, disc_domain); + if (res) { + bearer_disable(b_ptr); + warn("Bearer <%s> rejected, discovery object creation failed\n", + name); + goto exit; + } + info("Enabled bearer <%s>, discovery domain %s, priority %u\n", + name, tipc_addr_string_fill(addr_string, disc_domain), priority); +exit: + write_unlock_bh(&tipc_net_lock); + return res; +} + +/** + * tipc_block_bearer(): Block the bearer with the given name, + * and reset all its links + */ + +int tipc_block_bearer(const char *name) +{ + struct tipc_bearer *b_ptr = NULL; + struct link *l_ptr; + struct link *temp_l_ptr; + + read_lock_bh(&tipc_net_lock); + b_ptr = bearer_find(name); + if (!b_ptr) { + warn("Attempt to block unknown bearer <%s>\n", name); + read_unlock_bh(&tipc_net_lock); + return -EINVAL; + } + + info("Blocking bearer <%s>\n", name); + spin_lock_bh(&b_ptr->lock); + b_ptr->blocked = 1; + list_for_each_entry_safe(l_ptr, temp_l_ptr, &b_ptr->links, link_list) { + struct tipc_node *n_ptr = l_ptr->owner; + + spin_lock_bh(&n_ptr->lock); + tipc_link_reset(l_ptr); + spin_unlock_bh(&n_ptr->lock); + } + spin_unlock_bh(&b_ptr->lock); + read_unlock_bh(&tipc_net_lock); + return 0; +} + +/** + * bearer_disable - + * + * Note: This routine assumes caller holds tipc_net_lock. + */ + +static void bearer_disable(struct tipc_bearer *b_ptr) +{ + struct link *l_ptr; + struct link *temp_l_ptr; + + info("Disabling bearer <%s>\n", b_ptr->name); + spin_lock_bh(&b_ptr->lock); + b_ptr->blocked = 1; + b_ptr->media->disable_bearer(b_ptr); + list_for_each_entry_safe(l_ptr, temp_l_ptr, &b_ptr->links, link_list) { + tipc_link_delete(l_ptr); + } + if (b_ptr->link_req) + tipc_disc_delete(b_ptr->link_req); + spin_unlock_bh(&b_ptr->lock); + memset(b_ptr, 0, sizeof(struct tipc_bearer)); +} + +int tipc_disable_bearer(const char *name) +{ + struct tipc_bearer *b_ptr; + int res; + + write_lock_bh(&tipc_net_lock); + b_ptr = bearer_find(name); + if (b_ptr == NULL) { + warn("Attempt to disable unknown bearer <%s>\n", name); + res = -EINVAL; + } else { + bearer_disable(b_ptr); + res = 0; + } + write_unlock_bh(&tipc_net_lock); + return res; +} + + + +void tipc_bearer_stop(void) +{ + u32 i; + + for (i = 0; i < MAX_BEARERS; i++) { + if (tipc_bearers[i].active) + bearer_disable(&tipc_bearers[i]); + } + media_count = 0; +} diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h new file mode 100644 index 00000000..31d6172b --- /dev/null +++ b/net/tipc/bearer.h @@ -0,0 +1,214 @@ +/* + * net/tipc/bearer.h: Include file for TIPC bearer code + * + * Copyright (c) 1996-2006, Ericsson AB + * Copyright (c) 2005, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_BEARER_H +#define _TIPC_BEARER_H + +#include "bcast.h" + +#define MAX_BEARERS 8 +#define MAX_MEDIA 4 + +/* + * Identifiers of supported TIPC media types + */ +#define TIPC_MEDIA_TYPE_ETH 1 + +/* + * Destination address structure used by TIPC bearers when sending messages + * + * IMPORTANT: The fields of this structure MUST be stored using the specified + * byte order indicated below, as the structure is exchanged between nodes + * as part of a link setup process. + */ +struct tipc_media_addr { + __be32 type; /* bearer type (network byte order) */ + union { + __u8 eth_addr[6]; /* 48 bit Ethernet addr (byte array) */ + } dev_addr; +}; + +struct tipc_bearer; + +/** + * struct media - TIPC media information available to internal users + * @send_msg: routine which handles buffer transmission + * @enable_bearer: routine which enables a bearer + * @disable_bearer: routine which disables a bearer + * @addr2str: routine which converts bearer's address to string form + * @bcast_addr: media address used in broadcasting + * @priority: default link (and bearer) priority + * @tolerance: default time (in ms) before declaring link failure + * @window: default window (in packets) before declaring link congestion + * @type_id: TIPC media identifier + * @name: media name + */ + +struct media { + int (*send_msg)(struct sk_buff *buf, + struct tipc_bearer *b_ptr, + struct tipc_media_addr *dest); + int (*enable_bearer)(struct tipc_bearer *b_ptr); + void (*disable_bearer)(struct tipc_bearer *b_ptr); + char *(*addr2str)(struct tipc_media_addr *a, + char *str_buf, int str_size); + struct tipc_media_addr bcast_addr; + u32 priority; + u32 tolerance; + u32 window; + u32 type_id; + char name[TIPC_MAX_MEDIA_NAME]; +}; + +/** + * struct tipc_bearer - TIPC bearer structure + * @usr_handle: pointer to additional media-specific information about bearer + * @mtu: max packet size bearer can support + * @blocked: non-zero if bearer is blocked + * @lock: spinlock for controlling access to bearer + * @addr: media-specific address associated with bearer + * @name: bearer name (format = media:interface) + * @media: ptr to media structure associated with bearer + * @priority: default link priority for bearer + * @identity: array index of this bearer within TIPC bearer array + * @link_req: ptr to (optional) structure making periodic link setup requests + 
* @links: list of non-congested links associated with bearer + * @cong_links: list of congested links associated with bearer + * @continue_count: # of times bearer has resumed after congestion or blocking + * @active: non-zero if bearer structure represents a bearer + * @net_plane: network plane ('A' through 'H') currently associated with bearer + * @nodes: indicates which nodes in cluster can be reached through bearer + * + * Note: media-specific code is responsible for initialization of the fields + * indicated below when a bearer is enabled; TIPC's generic bearer code takes + * care of initializing all other fields. + */ +struct tipc_bearer { + void *usr_handle; /* initialized by media */ + u32 mtu; /* initialized by media */ + int blocked; /* initialized by media */ + struct tipc_media_addr addr; /* initialized by media */ + char name[TIPC_MAX_BEARER_NAME]; + spinlock_t lock; + struct media *media; + u32 priority; + u32 identity; + struct link_req *link_req; + struct list_head links; + struct list_head cong_links; + u32 continue_count; + int active; + char net_plane; + struct tipc_node_map nodes; +}; + +struct bearer_name { + char media_name[TIPC_MAX_MEDIA_NAME]; + char if_name[TIPC_MAX_IF_NAME]; +}; + +struct link; + +extern struct tipc_bearer tipc_bearers[]; + +/* + * TIPC routines available to supported media types + */ +int tipc_register_media(u32 media_type, + char *media_name, int (*enable)(struct tipc_bearer *), + void (*disable)(struct tipc_bearer *), + int (*send_msg)(struct sk_buff *, + struct tipc_bearer *, struct tipc_media_addr *), + char *(*addr2str)(struct tipc_media_addr *a, + char *str_buf, int str_size), + struct tipc_media_addr *bcast_addr, const u32 bearer_priority, + const u32 link_tolerance, /* [ms] */ + const u32 send_window_limit); + +void tipc_recv_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr); + +int tipc_block_bearer(const char *name); +void tipc_continue(struct tipc_bearer *tb_ptr); + +int tipc_enable_bearer(const char *bearer_name, u32 disc_domain, u32 priority); +int tipc_disable_bearer(const char *name); + +/* + * Routines made available to TIPC by supported media types + */ +int tipc_eth_media_start(void); +void tipc_eth_media_stop(void); + +void tipc_media_addr_printf(struct print_buf *pb, struct tipc_media_addr *a); +struct sk_buff *tipc_media_get_names(void); + +struct sk_buff *tipc_bearer_get_names(void); +void tipc_bearer_add_dest(struct tipc_bearer *b_ptr, u32 dest); +void tipc_bearer_remove_dest(struct tipc_bearer *b_ptr, u32 dest); +void tipc_bearer_schedule(struct tipc_bearer *b_ptr, struct link *l_ptr); +struct tipc_bearer *tipc_bearer_find_interface(const char *if_name); +int tipc_bearer_resolve_congestion(struct tipc_bearer *b_ptr, struct link *l_ptr); +int tipc_bearer_congested(struct tipc_bearer *b_ptr, struct link *l_ptr); +void tipc_bearer_stop(void); +void tipc_bearer_lock_push(struct tipc_bearer *b_ptr); + + +/** + * tipc_bearer_send - sends buffer to destination over bearer + * + * Returns true (1) if successful, or false (0) if unable to send + * + * IMPORTANT: + * The media send routine must not alter the buffer being passed in + * as it may be needed for later retransmission! + * + * If the media send routine returns a non-zero value (indicating that + * it was unable to send the buffer), it must: + * 1) mark the bearer as blocked, + * 2) call tipc_continue() once the bearer is able to send again. 
+ * Media types that are unable to meet these two critera must ensure their + * send routine always returns success -- even if the buffer was not sent -- + * and let TIPC's link code deal with the undelivered message. + */ + +static inline int tipc_bearer_send(struct tipc_bearer *b_ptr, + struct sk_buff *buf, + struct tipc_media_addr *dest) +{ + return !b_ptr->media->send_msg(buf, b_ptr, dest); +} + +#endif /* _TIPC_BEARER_H */ diff --git a/net/tipc/config.c b/net/tipc/config.c new file mode 100644 index 00000000..b25a396b --- /dev/null +++ b/net/tipc/config.c @@ -0,0 +1,502 @@ +/* + * net/tipc/config.c: TIPC configuration management code + * + * Copyright (c) 2002-2006, Ericsson AB + * Copyright (c) 2004-2007, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "core.h" +#include "port.h" +#include "name_table.h" +#include "config.h" + +static u32 config_port_ref; + +static DEFINE_SPINLOCK(config_lock); + +static const void *req_tlv_area; /* request message TLV area */ +static int req_tlv_space; /* request message TLV area size */ +static int rep_headroom; /* reply message headroom to use */ + + +struct sk_buff *tipc_cfg_reply_alloc(int payload_size) +{ + struct sk_buff *buf; + + buf = alloc_skb(rep_headroom + payload_size, GFP_ATOMIC); + if (buf) + skb_reserve(buf, rep_headroom); + return buf; +} + +int tipc_cfg_append_tlv(struct sk_buff *buf, int tlv_type, + void *tlv_data, int tlv_data_size) +{ + struct tlv_desc *tlv = (struct tlv_desc *)skb_tail_pointer(buf); + int new_tlv_space = TLV_SPACE(tlv_data_size); + + if (skb_tailroom(buf) < new_tlv_space) + return 0; + skb_put(buf, new_tlv_space); + tlv->tlv_type = htons(tlv_type); + tlv->tlv_len = htons(TLV_LENGTH(tlv_data_size)); + if (tlv_data_size && tlv_data) + memcpy(TLV_DATA(tlv), tlv_data, tlv_data_size); + return 1; +} + +static struct sk_buff *tipc_cfg_reply_unsigned_type(u16 tlv_type, u32 value) +{ + struct sk_buff *buf; + __be32 value_net; + + buf = tipc_cfg_reply_alloc(TLV_SPACE(sizeof(value))); + if (buf) { + value_net = htonl(value); + tipc_cfg_append_tlv(buf, tlv_type, &value_net, + sizeof(value_net)); + } + return buf; +} + +static struct sk_buff *tipc_cfg_reply_unsigned(u32 value) +{ + return tipc_cfg_reply_unsigned_type(TIPC_TLV_UNSIGNED, value); +} + +struct sk_buff *tipc_cfg_reply_string_type(u16 tlv_type, char *string) +{ + struct sk_buff *buf; + int string_len = strlen(string) + 1; + + buf = tipc_cfg_reply_alloc(TLV_SPACE(string_len)); + if (buf) + tipc_cfg_append_tlv(buf, tlv_type, string, string_len); + return buf; +} + +#define MAX_STATS_INFO 2000 + +static struct sk_buff *tipc_show_stats(void) +{ + struct sk_buff *buf; + struct tlv_desc *rep_tlv; + struct print_buf pb; + int str_len; + u32 value; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED)) + return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + value = ntohl(*(u32 *)TLV_DATA(req_tlv_area)); + if (value != 0) + return tipc_cfg_reply_error_string("unsupported argument"); + + buf = tipc_cfg_reply_alloc(TLV_SPACE(MAX_STATS_INFO)); + if (buf == NULL) + return NULL; + + rep_tlv = (struct tlv_desc *)buf->data; + tipc_printbuf_init(&pb, (char *)TLV_DATA(rep_tlv), MAX_STATS_INFO); + + tipc_printf(&pb, "TIPC version " TIPC_MOD_VER "\n"); + + /* Use additional tipc_printf()'s to return more info ... 
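/*
 * Illustrative sketch, not part of the original patch: tipc_cfg_append_tlv()
 * above appends a (type, length, value) descriptor to a reply buffer,
 * rounding each element up to 4-byte alignment and storing the type and
 * length fields in network byte order. A self-contained user-space model of
 * that encoding; the descriptor layout mirrors TIPC's tlv_desc but is
 * re-declared here purely for illustration.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <arpa/inet.h>	/* htons() */

#define TLV_ALIGN(x)	(((x) + 3u) & ~3u)

struct tlv_hdr {
	uint16_t len;	/* header + value length, network byte order */
	uint16_t type;	/* TLV type, network byte order */
};

/* returns bytes consumed, or 0 if the value does not fit */
static size_t tlv_append(uint8_t *buf, size_t room, size_t used,
			 uint16_t type, const void *val, uint16_t val_len)
{
	size_t space = TLV_ALIGN(sizeof(struct tlv_hdr) + val_len);
	struct tlv_hdr hdr;

	if (used + space > room)
		return 0;		/* no tailroom left */
	hdr.len = htons((uint16_t)(sizeof(hdr) + val_len));
	hdr.type = htons(type);
	memcpy(buf + used, &hdr, sizeof(hdr));
	memcpy(buf + used + sizeof(hdr), val, val_len);
	return space;
}

int main(void)
{
	uint8_t reply[64];
	size_t used = 0;
	const char *name = "eth:eth0";

	/* type value 1 is a hypothetical "bearer name" tag for this model */
	used += tlv_append(reply, sizeof(reply), used, 1,
			   name, (uint16_t)(strlen(name) + 1));
	printf("reply now holds %zu bytes\n", used);
	return 0;
}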
*/ + + str_len = tipc_printbuf_validate(&pb); + skb_put(buf, TLV_SPACE(str_len)); + TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len); + + return buf; +} + +static struct sk_buff *cfg_enable_bearer(void) +{ + struct tipc_bearer_config *args; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_BEARER_CONFIG)) + return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + args = (struct tipc_bearer_config *)TLV_DATA(req_tlv_area); + if (tipc_enable_bearer(args->name, + ntohl(args->disc_domain), + ntohl(args->priority))) + return tipc_cfg_reply_error_string("unable to enable bearer"); + + return tipc_cfg_reply_none(); +} + +static struct sk_buff *cfg_disable_bearer(void) +{ + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_BEARER_NAME)) + return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + if (tipc_disable_bearer((char *)TLV_DATA(req_tlv_area))) + return tipc_cfg_reply_error_string("unable to disable bearer"); + + return tipc_cfg_reply_none(); +} + +static struct sk_buff *cfg_set_own_addr(void) +{ + u32 addr; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_NET_ADDR)) + return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + addr = ntohl(*(__be32 *)TLV_DATA(req_tlv_area)); + if (addr == tipc_own_addr) + return tipc_cfg_reply_none(); + if (!tipc_addr_node_valid(addr)) + return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE + " (node address)"); + if (tipc_mode == TIPC_NET_MODE) + return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED + " (cannot change node address once assigned)"); + + /* + * Must release all spinlocks before calling start_net() because + * Linux version of TIPC calls eth_media_start() which calls + * register_netdevice_notifier() which may block! + * + * Temporarily releasing the lock should be harmless for non-Linux TIPC, + * but Linux version of eth_media_start() should really be reworked + * so that it can be called with spinlocks held. 
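/*
 * Illustrative sketch, not part of the original patch: the cfg_set_*()
 * handlers that follow validate a setting by clamping it with delimit()
 * and rejecting the request when clamping changed the value, i.e. when it
 * was outside [min, max]. A user-space illustration of that idiom;
 * delimit() is reproduced from core.h later in this patch, the rest of the
 * names and values are hypothetical.
 */
#include <stdio.h>

static int delimit(int val, int min, int max)
{
	if (val > max)
		return max;
	if (val < min)
		return min;
	return val;
}

static int set_max_publications(int value, int *setting)
{
	if (value != delimit(value, 1, 65535)) {
		fprintf(stderr, "max publications must be 1-65535\n");
		return -1;
	}
	*setting = value;
	return 0;
}

int main(void)
{
	int max_publications = 10000;

	set_max_publications(70000, &max_publications);	/* rejected: out of range */
	set_max_publications(20000, &max_publications);	/* accepted */
	printf("max publications: %d\n", max_publications);
	return 0;
}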
+ */ + + spin_unlock_bh(&config_lock); + tipc_core_start_net(addr); + spin_lock_bh(&config_lock); + return tipc_cfg_reply_none(); +} + +static struct sk_buff *cfg_set_remote_mng(void) +{ + u32 value; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED)) + return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + value = ntohl(*(__be32 *)TLV_DATA(req_tlv_area)); + tipc_remote_management = (value != 0); + return tipc_cfg_reply_none(); +} + +static struct sk_buff *cfg_set_max_publications(void) +{ + u32 value; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED)) + return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + value = ntohl(*(__be32 *)TLV_DATA(req_tlv_area)); + if (value != delimit(value, 1, 65535)) + return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE + " (max publications must be 1-65535)"); + tipc_max_publications = value; + return tipc_cfg_reply_none(); +} + +static struct sk_buff *cfg_set_max_subscriptions(void) +{ + u32 value; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED)) + return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + value = ntohl(*(__be32 *)TLV_DATA(req_tlv_area)); + if (value != delimit(value, 1, 65535)) + return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE + " (max subscriptions must be 1-65535"); + tipc_max_subscriptions = value; + return tipc_cfg_reply_none(); +} + +static struct sk_buff *cfg_set_max_ports(void) +{ + u32 value; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED)) + return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + value = ntohl(*(__be32 *)TLV_DATA(req_tlv_area)); + if (value == tipc_max_ports) + return tipc_cfg_reply_none(); + if (value != delimit(value, 127, 65535)) + return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE + " (max ports must be 127-65535)"); + if (tipc_mode != TIPC_NOT_RUNNING) + return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED + " (cannot change max ports while TIPC is active)"); + tipc_max_ports = value; + return tipc_cfg_reply_none(); +} + +static struct sk_buff *cfg_set_netid(void) +{ + u32 value; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED)) + return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + value = ntohl(*(__be32 *)TLV_DATA(req_tlv_area)); + if (value == tipc_net_id) + return tipc_cfg_reply_none(); + if (value != delimit(value, 1, 9999)) + return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE + " (network id must be 1-9999)"); + if (tipc_mode == TIPC_NET_MODE) + return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED + " (cannot change network id once TIPC has joined a network)"); + tipc_net_id = value; + return tipc_cfg_reply_none(); +} + +struct sk_buff *tipc_cfg_do_cmd(u32 orig_node, u16 cmd, const void *request_area, + int request_space, int reply_headroom) +{ + struct sk_buff *rep_tlv_buf; + + spin_lock_bh(&config_lock); + + /* Save request and reply details in a well-known location */ + + req_tlv_area = request_area; + req_tlv_space = request_space; + rep_headroom = reply_headroom; + + /* Check command authorization */ + + if (likely(orig_node == tipc_own_addr)) { + /* command is permitted */ + } else if (cmd >= 0x8000) { + rep_tlv_buf = tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED + " (cannot be done remotely)"); + goto exit; + } else if (!tipc_remote_management) { + rep_tlv_buf = tipc_cfg_reply_error_string(TIPC_CFG_NO_REMOTE); + goto exit; + } else if (cmd >= 0x4000) { + u32 domain = 0; + + if ((tipc_nametbl_translate(TIPC_ZM_SRV, 0, &domain) == 0) || + (domain != 
orig_node)) { + rep_tlv_buf = tipc_cfg_reply_error_string(TIPC_CFG_NOT_ZONE_MSTR); + goto exit; + } + } + + /* Call appropriate processing routine */ + + switch (cmd) { + case TIPC_CMD_NOOP: + rep_tlv_buf = tipc_cfg_reply_none(); + break; + case TIPC_CMD_GET_NODES: + rep_tlv_buf = tipc_node_get_nodes(req_tlv_area, req_tlv_space); + break; + case TIPC_CMD_GET_LINKS: + rep_tlv_buf = tipc_node_get_links(req_tlv_area, req_tlv_space); + break; + case TIPC_CMD_SHOW_LINK_STATS: + rep_tlv_buf = tipc_link_cmd_show_stats(req_tlv_area, req_tlv_space); + break; + case TIPC_CMD_RESET_LINK_STATS: + rep_tlv_buf = tipc_link_cmd_reset_stats(req_tlv_area, req_tlv_space); + break; + case TIPC_CMD_SHOW_NAME_TABLE: + rep_tlv_buf = tipc_nametbl_get(req_tlv_area, req_tlv_space); + break; + case TIPC_CMD_GET_BEARER_NAMES: + rep_tlv_buf = tipc_bearer_get_names(); + break; + case TIPC_CMD_GET_MEDIA_NAMES: + rep_tlv_buf = tipc_media_get_names(); + break; + case TIPC_CMD_SHOW_PORTS: + rep_tlv_buf = tipc_port_get_ports(); + break; + case TIPC_CMD_SET_LOG_SIZE: + rep_tlv_buf = tipc_log_resize_cmd(req_tlv_area, req_tlv_space); + break; + case TIPC_CMD_DUMP_LOG: + rep_tlv_buf = tipc_log_dump(); + break; + case TIPC_CMD_SHOW_STATS: + rep_tlv_buf = tipc_show_stats(); + break; + case TIPC_CMD_SET_LINK_TOL: + case TIPC_CMD_SET_LINK_PRI: + case TIPC_CMD_SET_LINK_WINDOW: + rep_tlv_buf = tipc_link_cmd_config(req_tlv_area, req_tlv_space, cmd); + break; + case TIPC_CMD_ENABLE_BEARER: + rep_tlv_buf = cfg_enable_bearer(); + break; + case TIPC_CMD_DISABLE_BEARER: + rep_tlv_buf = cfg_disable_bearer(); + break; + case TIPC_CMD_SET_NODE_ADDR: + rep_tlv_buf = cfg_set_own_addr(); + break; + case TIPC_CMD_SET_REMOTE_MNG: + rep_tlv_buf = cfg_set_remote_mng(); + break; + case TIPC_CMD_SET_MAX_PORTS: + rep_tlv_buf = cfg_set_max_ports(); + break; + case TIPC_CMD_SET_MAX_PUBL: + rep_tlv_buf = cfg_set_max_publications(); + break; + case TIPC_CMD_SET_MAX_SUBSCR: + rep_tlv_buf = cfg_set_max_subscriptions(); + break; + case TIPC_CMD_SET_NETID: + rep_tlv_buf = cfg_set_netid(); + break; + case TIPC_CMD_GET_REMOTE_MNG: + rep_tlv_buf = tipc_cfg_reply_unsigned(tipc_remote_management); + break; + case TIPC_CMD_GET_MAX_PORTS: + rep_tlv_buf = tipc_cfg_reply_unsigned(tipc_max_ports); + break; + case TIPC_CMD_GET_MAX_PUBL: + rep_tlv_buf = tipc_cfg_reply_unsigned(tipc_max_publications); + break; + case TIPC_CMD_GET_MAX_SUBSCR: + rep_tlv_buf = tipc_cfg_reply_unsigned(tipc_max_subscriptions); + break; + case TIPC_CMD_GET_NETID: + rep_tlv_buf = tipc_cfg_reply_unsigned(tipc_net_id); + break; + case TIPC_CMD_NOT_NET_ADMIN: + rep_tlv_buf = + tipc_cfg_reply_error_string(TIPC_CFG_NOT_NET_ADMIN); + break; + case TIPC_CMD_SET_MAX_ZONES: + case TIPC_CMD_GET_MAX_ZONES: + case TIPC_CMD_SET_MAX_SLAVES: + case TIPC_CMD_GET_MAX_SLAVES: + case TIPC_CMD_SET_MAX_CLUSTERS: + case TIPC_CMD_GET_MAX_CLUSTERS: + case TIPC_CMD_SET_MAX_NODES: + case TIPC_CMD_GET_MAX_NODES: + rep_tlv_buf = tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED + " (obsolete command)"); + break; + default: + rep_tlv_buf = tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED + " (unknown command)"); + break; + } + + /* Return reply buffer */ +exit: + spin_unlock_bh(&config_lock); + return rep_tlv_buf; +} + +static void cfg_named_msg_event(void *userdata, + u32 port_ref, + struct sk_buff **buf, + const unchar *msg, + u32 size, + u32 importance, + struct tipc_portid const *orig, + struct tipc_name_seq const *dest) +{ + struct tipc_cfg_msg_hdr *req_hdr; + struct tipc_cfg_msg_hdr *rep_hdr; + struct sk_buff 
*rep_buf; + + /* Validate configuration message header (ignore invalid message) */ + + req_hdr = (struct tipc_cfg_msg_hdr *)msg; + if ((size < sizeof(*req_hdr)) || + (size != TCM_ALIGN(ntohl(req_hdr->tcm_len))) || + (ntohs(req_hdr->tcm_flags) != TCM_F_REQUEST)) { + warn("Invalid configuration message discarded\n"); + return; + } + + /* Generate reply for request (if can't, return request) */ + + rep_buf = tipc_cfg_do_cmd(orig->node, + ntohs(req_hdr->tcm_type), + msg + sizeof(*req_hdr), + size - sizeof(*req_hdr), + BUF_HEADROOM + MAX_H_SIZE + sizeof(*rep_hdr)); + if (rep_buf) { + skb_push(rep_buf, sizeof(*rep_hdr)); + rep_hdr = (struct tipc_cfg_msg_hdr *)rep_buf->data; + memcpy(rep_hdr, req_hdr, sizeof(*rep_hdr)); + rep_hdr->tcm_len = htonl(rep_buf->len); + rep_hdr->tcm_flags &= htons(~TCM_F_REQUEST); + } else { + rep_buf = *buf; + *buf = NULL; + } + + /* NEED TO ADD CODE TO HANDLE FAILED SEND (SUCH AS CONGESTION) */ + tipc_send_buf2port(port_ref, orig, rep_buf, rep_buf->len); +} + +int tipc_cfg_init(void) +{ + struct tipc_name_seq seq; + int res; + + res = tipc_createport(NULL, TIPC_CRITICAL_IMPORTANCE, + NULL, NULL, NULL, + NULL, cfg_named_msg_event, NULL, + NULL, &config_port_ref); + if (res) + goto failed; + + seq.type = TIPC_CFG_SRV; + seq.lower = seq.upper = tipc_own_addr; + res = tipc_nametbl_publish_rsv(config_port_ref, TIPC_ZONE_SCOPE, &seq); + if (res) + goto failed; + + return 0; + +failed: + err("Unable to create configuration service\n"); + return res; +} + +void tipc_cfg_stop(void) +{ + if (config_port_ref) { + tipc_deleteport(config_port_ref); + config_port_ref = 0; + } +} diff --git a/net/tipc/config.h b/net/tipc/config.h new file mode 100644 index 00000000..443159a1 --- /dev/null +++ b/net/tipc/config.h @@ -0,0 +1,72 @@ +/* + * net/tipc/config.h: Include file for TIPC configuration service code + * + * Copyright (c) 2003-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
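/*
 * Illustrative sketch, not part of the original patch: cfg_named_msg_event()
 * above accepts a configuration request only if the message is at least one
 * header long, the length field (padded to 4 bytes) matches the message
 * size, and the REQUEST flag is set; the reply then reuses the request
 * header with the length updated and the REQUEST flag cleared. A user-space
 * model of those checks; the header layout and flag value mirror the TIPC
 * TCM header but are re-declared here for illustration only.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

#define TCM_ALIGN(x)	(((x) + 3u) & ~3u)
#define TCM_F_REQUEST	0x1

struct tcm_hdr {
	uint32_t len;	/* message length, network byte order */
	uint16_t type;	/* command, network byte order */
	uint16_t flags;	/* TCM_F_REQUEST for requests */
};

static int tcm_valid(const uint8_t *msg, size_t size)
{
	struct tcm_hdr hdr;

	if (size < sizeof(hdr))
		return 0;
	memcpy(&hdr, msg, sizeof(hdr));
	if (size != TCM_ALIGN(ntohl(hdr.len)))
		return 0;
	return ntohs(hdr.flags) == TCM_F_REQUEST;
}

static void tcm_make_reply(struct tcm_hdr *rep, const struct tcm_hdr *req,
			   uint32_t reply_len)
{
	*rep = *req;			/* echo the command type back */
	rep->len = htonl(reply_len);
	rep->flags &= htons((uint16_t)~TCM_F_REQUEST);
}

int main(void)
{
	/* 0x1234 is a hypothetical command id for this model */
	struct tcm_hdr req = { htonl(sizeof(req)), htons(0x1234), htons(TCM_F_REQUEST) };
	struct tcm_hdr rep;

	printf("request valid: %d\n", tcm_valid((const uint8_t *)&req, sizeof(req)));
	tcm_make_reply(&rep, &req, sizeof(rep) + 12);
	printf("reply flags: 0x%x\n", ntohs(rep.flags));
	return 0;
}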
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_CONFIG_H +#define _TIPC_CONFIG_H + +/* ---------------------------------------------------------------------- */ + +#include "link.h" + +struct sk_buff *tipc_cfg_reply_alloc(int payload_size); +int tipc_cfg_append_tlv(struct sk_buff *buf, int tlv_type, + void *tlv_data, int tlv_data_size); +struct sk_buff *tipc_cfg_reply_string_type(u16 tlv_type, char *string); + +static inline struct sk_buff *tipc_cfg_reply_none(void) +{ + return tipc_cfg_reply_alloc(0); +} + +static inline struct sk_buff *tipc_cfg_reply_error_string(char *string) +{ + return tipc_cfg_reply_string_type(TIPC_TLV_ERROR_STRING, string); +} + +static inline struct sk_buff *tipc_cfg_reply_ultra_string(char *string) +{ + return tipc_cfg_reply_string_type(TIPC_TLV_ULTRA_STRING, string); +} + +struct sk_buff *tipc_cfg_do_cmd(u32 orig_node, u16 cmd, + const void *req_tlv_area, int req_tlv_space, + int headroom); + +void tipc_cfg_link_event(u32 addr, char *name, int up); +int tipc_cfg_init(void); +void tipc_cfg_stop(void); + +#endif diff --git a/net/tipc/core.c b/net/tipc/core.c new file mode 100644 index 00000000..943b6af8 --- /dev/null +++ b/net/tipc/core.c @@ -0,0 +1,211 @@ +/* + * net/tipc/core.c: TIPC module code + * + * Copyright (c) 2003-2006, Ericsson AB + * Copyright (c) 2005-2006, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "ref.h" +#include "name_table.h" +#include "subscr.h" +#include "config.h" + + +#ifndef CONFIG_TIPC_PORTS +#define CONFIG_TIPC_PORTS 8191 +#endif + +#ifndef CONFIG_TIPC_LOG +#define CONFIG_TIPC_LOG 0 +#endif + +/* global variables used by multiple sub-systems within TIPC */ + +int tipc_mode = TIPC_NOT_RUNNING; +int tipc_random; + +const char tipc_alphabet[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_."; + +/* configurable TIPC parameters */ + +u32 tipc_own_addr; +int tipc_max_ports; +int tipc_max_subscriptions; +int tipc_max_publications; +int tipc_net_id; +int tipc_remote_management; + + +/** + * tipc_buf_acquire - creates a TIPC message buffer + * @size: message size (including TIPC header) + * + * Returns a new buffer with data pointers set to the specified size. + * + * NOTE: Headroom is reserved to allow prepending of a data link header. + * There may also be unrequested tailroom present at the buffer's end. + */ + +struct sk_buff *tipc_buf_acquire(u32 size) +{ + struct sk_buff *skb; + unsigned int buf_size = (BUF_HEADROOM + size + 3) & ~3u; + + skb = alloc_skb_fclone(buf_size, GFP_ATOMIC); + if (skb) { + skb_reserve(skb, BUF_HEADROOM); + skb_put(skb, size); + skb->next = NULL; + } + return skb; +} + +/** + * tipc_core_stop_net - shut down TIPC networking sub-systems + */ + +static void tipc_core_stop_net(void) +{ + tipc_eth_media_stop(); + tipc_net_stop(); +} + +/** + * start_net - start TIPC networking sub-systems + */ + +int tipc_core_start_net(unsigned long addr) +{ + int res; + + res = tipc_net_start(addr); + if (!res) + res = tipc_eth_media_start(); + if (res) + tipc_core_stop_net(); + return res; +} + +/** + * tipc_core_stop - switch TIPC from SINGLE NODE to NOT RUNNING mode + */ + +static void tipc_core_stop(void) +{ + if (tipc_mode != TIPC_NODE_MODE) + return; + + tipc_mode = TIPC_NOT_RUNNING; + + tipc_netlink_stop(); + tipc_handler_stop(); + tipc_cfg_stop(); + tipc_subscr_stop(); + tipc_nametbl_stop(); + tipc_ref_table_stop(); + tipc_socket_stop(); + tipc_log_resize(0); +} + +/** + * tipc_core_start - switch TIPC from NOT RUNNING to SINGLE NODE mode + */ + +static int tipc_core_start(void) +{ + int res; + + if (tipc_mode != TIPC_NOT_RUNNING) + return -ENOPROTOOPT; + + get_random_bytes(&tipc_random, sizeof(tipc_random)); + tipc_mode = TIPC_NODE_MODE; + + res = tipc_handler_start(); + if (!res) + res = tipc_ref_table_init(tipc_max_ports, tipc_random); + if (!res) + res = tipc_nametbl_init(); + if (!res) + res = tipc_k_signal((Handler)tipc_subscr_start, 0); + if (!res) + res = tipc_k_signal((Handler)tipc_cfg_init, 0); + if (!res) + res = tipc_netlink_start(); + if (!res) + res = tipc_socket_init(); + if (res) + tipc_core_stop(); + + return res; +} + + +static int __init tipc_init(void) +{ + int res; + + if (tipc_log_resize(CONFIG_TIPC_LOG) != 0) + warn("Unable to create log buffer\n"); + + info("Activated (version " TIPC_MOD_VER ")\n"); + + tipc_own_addr = 0; + tipc_remote_management = 
1; + tipc_max_publications = 10000; + tipc_max_subscriptions = 2000; + tipc_max_ports = CONFIG_TIPC_PORTS; + tipc_net_id = 4711; + + res = tipc_core_start(); + if (res) + err("Unable to start in single node mode\n"); + else + info("Started in single node mode\n"); + return res; +} + +static void __exit tipc_exit(void) +{ + tipc_core_stop_net(); + tipc_core_stop(); + info("Deactivated\n"); +} + +module_init(tipc_init); +module_exit(tipc_exit); + +MODULE_DESCRIPTION("TIPC: Transparent Inter Process Communication"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(TIPC_MOD_VER); diff --git a/net/tipc/core.h b/net/tipc/core.h new file mode 100644 index 00000000..436dda11 --- /dev/null +++ b/net/tipc/core.h @@ -0,0 +1,312 @@ +/* + * net/tipc/core.h: Include file for TIPC global declarations + * + * Copyright (c) 2005-2006, Ericsson AB + * Copyright (c) 2005-2007, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_CORE_H +#define _TIPC_CORE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define TIPC_MOD_VER "2.0.0" + +struct tipc_msg; /* msg.h */ +struct print_buf; /* log.h */ + +/* + * TIPC sanity test macros + */ + +#define assert(i) BUG_ON(!(i)) + +/* + * TIPC system monitoring code + */ + +/* + * TIPC's print buffer subsystem supports the following print buffers: + * + * TIPC_NULL : null buffer (i.e. print nowhere) + * TIPC_CONS : system console + * TIPC_LOG : TIPC log buffer + * &buf : user-defined buffer (struct print_buf *) + * + * Note: TIPC_LOG is configured to echo its output to the system console; + * user-defined buffers can be configured to do the same thing. 
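/*
 * Illustrative sketch, not part of the original patch: the print-buffer
 * subsystem described above routes all output from the err()/warn()/info()
 * macros (defined just below) through tipc_printf() into a selectable,
 * bounded buffer. A minimal user-space model of such an appendable print
 * buffer; the struct and helper names are hypothetical, and the '##'
 * argument pasting follows the same GNU-style extension the kernel macros
 * use.
 */
#include <stdio.h>
#include <stdarg.h>

struct print_buf_model {
	char *buf;
	size_t size;
	size_t used;
};

static void pb_printf(struct print_buf_model *pb, const char *fmt, ...)
{
	va_list args;
	int n;

	if (pb->used >= pb->size)
		return;			/* buffer full, drop output */
	va_start(args, fmt);
	n = vsnprintf(pb->buf + pb->used, pb->size - pb->used, fmt, args);
	va_end(args);
	if (n > 0)
		pb->used += ((size_t)n < pb->size - pb->used) ?
			    (size_t)n : pb->size - pb->used - 1;
}

#define warn_to(pb, fmt, ...) pb_printf(pb, "TIPC: " fmt, ##__VA_ARGS__)

int main(void)
{
	char storage[128];
	struct print_buf_model log = { storage, sizeof(storage), 0 };

	warn_to(&log, "Bearer <%s> rejected, illegal name\n", "eth#0");
	fputs(log.buf, stdout);
	return 0;
}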
+ */ +extern struct print_buf *const TIPC_NULL; +extern struct print_buf *const TIPC_CONS; +extern struct print_buf *const TIPC_LOG; + +void tipc_printf(struct print_buf *, const char *fmt, ...); + +/* + * TIPC_OUTPUT is the destination print buffer for system messages. + */ + +#ifndef TIPC_OUTPUT +#define TIPC_OUTPUT TIPC_LOG +#endif + +#define err(fmt, arg...) tipc_printf(TIPC_OUTPUT, \ + KERN_ERR "TIPC: " fmt, ## arg) +#define warn(fmt, arg...) tipc_printf(TIPC_OUTPUT, \ + KERN_WARNING "TIPC: " fmt, ## arg) +#define info(fmt, arg...) tipc_printf(TIPC_OUTPUT, \ + KERN_NOTICE "TIPC: " fmt, ## arg) + +#ifdef CONFIG_TIPC_DEBUG + +/* + * DBG_OUTPUT is the destination print buffer for debug messages. + */ + +#ifndef DBG_OUTPUT +#define DBG_OUTPUT TIPC_LOG +#endif + +#define dbg(fmt, arg...) tipc_printf(DBG_OUTPUT, KERN_DEBUG fmt, ## arg); + +#define msg_dbg(msg, txt) tipc_msg_dbg(DBG_OUTPUT, msg, txt); + +void tipc_msg_dbg(struct print_buf *, struct tipc_msg *, const char *); + +#else + +#define dbg(fmt, arg...) do {} while (0) +#define msg_dbg(msg, txt) do {} while (0) + +#define tipc_msg_dbg(buf, msg, txt) do {} while (0) + +#endif + + +/* + * TIPC-specific error codes + */ + +#define ELINKCONG EAGAIN /* link congestion <=> resource unavailable */ + +/* + * TIPC operating mode routines + */ +#define TIPC_NOT_RUNNING 0 +#define TIPC_NODE_MODE 1 +#define TIPC_NET_MODE 2 + +/* + * Global configuration variables + */ + +extern u32 tipc_own_addr; +extern int tipc_max_ports; +extern int tipc_max_subscriptions; +extern int tipc_max_publications; +extern int tipc_net_id; +extern int tipc_remote_management; + +/* + * Other global variables + */ + +extern int tipc_mode; +extern int tipc_random; +extern const char tipc_alphabet[]; + + +/* + * Routines available to privileged subsystems + */ + +extern int tipc_core_start_net(unsigned long); +extern int tipc_handler_start(void); +extern void tipc_handler_stop(void); +extern int tipc_netlink_start(void); +extern void tipc_netlink_stop(void); +extern int tipc_socket_init(void); +extern void tipc_socket_stop(void); + +static inline int delimit(int val, int min, int max) +{ + if (val > max) + return max; + if (val < min) + return min; + return val; +} + + +/* + * TIPC timer and signal code + */ + +typedef void (*Handler) (unsigned long); + +u32 tipc_k_signal(Handler routine, unsigned long argument); + +/** + * k_init_timer - initialize a timer + * @timer: pointer to timer structure + * @routine: pointer to routine to invoke when timer expires + * @argument: value to pass to routine when timer expires + * + * Timer must be initialized before use (and terminated when no longer needed). + */ + +static inline void k_init_timer(struct timer_list *timer, Handler routine, + unsigned long argument) +{ + setup_timer(timer, routine, argument); +} + +/** + * k_start_timer - start a timer + * @timer: pointer to timer structure + * @msec: time to delay (in ms) + * + * Schedules a previously initialized timer for later execution. + * If timer is already running, the new timeout overrides the previous request. + * + * To ensure the timer doesn't expire before the specified delay elapses, + * the amount of delay is rounded up when converting to the jiffies + * then an additional jiffy is added to account for the fact that + * the starting time may be in the middle of the current jiffy. 
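/*
 * Illustrative sketch, not part of the original patch: as described above,
 * k_start_timer() (defined just below) adds one extra jiffy after a
 * round-up millisecond conversion so the timer can never fire before the
 * requested delay, even when it is started partway through the current
 * tick. A user-space model of that arithmetic; the HZ value is chosen
 * arbitrarily for the example.
 */
#include <stdio.h>

#define HZ 250	/* ticks per second, hypothetical */

static unsigned long msecs_to_ticks(unsigned long msec)
{
	return (msec * HZ + 999) / 1000;	/* round up, never down */
}

static unsigned long timer_expiry(unsigned long now, unsigned long msec)
{
	/* the +1 covers the partially elapsed current tick */
	return now + msecs_to_ticks(msec) + 1;
}

int main(void)
{
	printf("10 ms from tick 1000 expires at tick %lu\n",
	       timer_expiry(1000, 10));	/* 1000 + 3 + 1 = 1004 */
	return 0;
}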
+ */ + +static inline void k_start_timer(struct timer_list *timer, unsigned long msec) +{ + mod_timer(timer, jiffies + msecs_to_jiffies(msec) + 1); +} + +/** + * k_cancel_timer - cancel a timer + * @timer: pointer to timer structure + * + * Cancels a previously initialized timer. + * Can be called safely even if the timer is already inactive. + * + * WARNING: Must not be called when holding locks required by the timer's + * timeout routine, otherwise deadlock can occur on SMP systems! + */ + +static inline void k_cancel_timer(struct timer_list *timer) +{ + del_timer_sync(timer); +} + +/** + * k_term_timer - terminate a timer + * @timer: pointer to timer structure + * + * Prevents further use of a previously initialized timer. + * + * WARNING: Caller must ensure timer isn't currently running. + * + * (Do not "enhance" this routine to automatically cancel an active timer, + * otherwise deadlock can arise when a timeout routine calls k_term_timer.) + */ + +static inline void k_term_timer(struct timer_list *timer) +{ +} + + +/* + * TIPC message buffer code + * + * TIPC message buffer headroom reserves space for the worst-case + * link-level device header (in case the message is sent off-node). + * + * Note: Headroom should be a multiple of 4 to ensure the TIPC header fields + * are word aligned for quicker access + */ + +#define BUF_HEADROOM LL_MAX_HEADER + +struct tipc_skb_cb { + void *handle; +}; + +#define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0])) + + +static inline struct tipc_msg *buf_msg(struct sk_buff *skb) +{ + return (struct tipc_msg *)skb->data; +} + +extern struct sk_buff *tipc_buf_acquire(u32 size); + +/** + * buf_discard - frees a TIPC message buffer + * @skb: message buffer + * + * Frees a message buffer. If passed NULL, just returns. + */ + +static inline void buf_discard(struct sk_buff *skb) +{ + kfree_skb(skb); +} + +/** + * buf_linearize - convert a TIPC message buffer into a single contiguous piece + * @skb: message buffer + * + * Returns 0 on success. + */ + +static inline int buf_linearize(struct sk_buff *skb) +{ + return skb_linearize(skb); +} + +#endif diff --git a/net/tipc/discover.c b/net/tipc/discover.c new file mode 100644 index 00000000..09879331 --- /dev/null +++ b/net/tipc/discover.c @@ -0,0 +1,358 @@ +/* + * net/tipc/discover.c + * + * Copyright (c) 2003-2006, Ericsson AB + * Copyright (c) 2005-2006, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
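/*
 * Illustrative sketch, not part of the original patch: tipc_buf_acquire()
 * earlier in this patch reserves BUF_HEADROOM in front of every message
 * (via skb_reserve()) so a link-level header can later be prepended without
 * copying the payload. A user-space model of that headroom/prepend pattern;
 * the fixed headroom and helper names are hypothetical stand-ins for the
 * kernel's sk_buff handling.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define HEADROOM 32	/* stand-in for the worst-case link header */

struct msg_buf {
	unsigned char *head;	/* start of allocation */
	unsigned char *data;	/* start of valid data */
	size_t len;		/* bytes of valid data */
};

static int buf_acquire(struct msg_buf *b, size_t size)
{
	size_t total = (HEADROOM + size + 3) & ~(size_t)3;	/* keep 4-byte aligned */

	b->head = malloc(total);
	if (!b->head)
		return -1;
	b->data = b->head + HEADROOM;	/* reserve headroom */
	b->len = size;
	return 0;
}

static unsigned char *buf_push(struct msg_buf *b, size_t hdr_len)
{
	b->data -= hdr_len;		/* prepend without moving the payload */
	b->len += hdr_len;
	return b->data;
}

int main(void)
{
	struct msg_buf b;

	if (buf_acquire(&b, 40))
		return 1;
	memset(b.data, 0, 40);			/* the TIPC message itself */
	memset(buf_push(&b, 14), 0, 14);	/* e.g. an Ethernet header */
	printf("total length now %zu\n", b.len);
	free(b.head);
	return 0;
}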
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "link.h" +#include "discover.h" + +#define TIPC_LINK_REQ_INIT 125 /* min delay during bearer start up */ +#define TIPC_LINK_REQ_FAST 1000 /* max delay if bearer has no links */ +#define TIPC_LINK_REQ_SLOW 60000 /* max delay if bearer has links */ +#define TIPC_LINK_REQ_INACTIVE 0xffffffff /* indicates no timer in use */ + + +/** + * struct link_req - information about an ongoing link setup request + * @bearer: bearer issuing requests + * @dest: destination address for request messages + * @domain: network domain to which links can be established + * @num_nodes: number of nodes currently discovered (i.e. with an active link) + * @buf: request message to be (repeatedly) sent + * @timer: timer governing period between requests + * @timer_intv: current interval between requests (in ms) + */ +struct link_req { + struct tipc_bearer *bearer; + struct tipc_media_addr dest; + u32 domain; + int num_nodes; + struct sk_buff *buf; + struct timer_list timer; + unsigned int timer_intv; +}; + +/** + * tipc_disc_init_msg - initialize a link setup message + * @type: message type (request or response) + * @dest_domain: network domain of node(s) which should respond to message + * @b_ptr: ptr to bearer issuing message + */ + +static struct sk_buff *tipc_disc_init_msg(u32 type, + u32 dest_domain, + struct tipc_bearer *b_ptr) +{ + struct sk_buff *buf = tipc_buf_acquire(INT_H_SIZE); + struct tipc_msg *msg; + + if (buf) { + msg = buf_msg(buf); + tipc_msg_init(msg, LINK_CONFIG, type, INT_H_SIZE, dest_domain); + msg_set_non_seq(msg, 1); + msg_set_dest_domain(msg, dest_domain); + msg_set_bc_netid(msg, tipc_net_id); + msg_set_media_addr(msg, &b_ptr->addr); + } + return buf; +} + +/** + * disc_dupl_alert - issue node address duplication alert + * @b_ptr: pointer to bearer detecting duplication + * @node_addr: duplicated node address + * @media_addr: media address advertised by duplicated node + */ + +static void disc_dupl_alert(struct tipc_bearer *b_ptr, u32 node_addr, + struct tipc_media_addr *media_addr) +{ + char node_addr_str[16]; + char media_addr_str[64]; + struct print_buf pb; + + tipc_addr_string_fill(node_addr_str, node_addr); + tipc_printbuf_init(&pb, media_addr_str, sizeof(media_addr_str)); + tipc_media_addr_printf(&pb, media_addr); + tipc_printbuf_validate(&pb); + warn("Duplicate %s using %s seen on <%s>\n", + node_addr_str, media_addr_str, b_ptr->name); +} + +/** + * tipc_disc_recv_msg - handle incoming link setup message (request or response) + * @buf: buffer containing message + * @b_ptr: bearer that message arrived on + */ + +void tipc_disc_recv_msg(struct sk_buff *buf, struct tipc_bearer *b_ptr) +{ + struct tipc_node *n_ptr; + struct link *link; + struct tipc_media_addr 
media_addr, *addr; + struct sk_buff *rbuf; + struct tipc_msg *msg = buf_msg(buf); + u32 dest = msg_dest_domain(msg); + u32 orig = msg_prevnode(msg); + u32 net_id = msg_bc_netid(msg); + u32 type = msg_type(msg); + int link_fully_up; + + msg_get_media_addr(msg, &media_addr); + buf_discard(buf); + + /* Validate discovery message from requesting node */ + if (net_id != tipc_net_id) + return; + if (!tipc_addr_domain_valid(dest)) + return; + if (!tipc_addr_node_valid(orig)) + return; + if (orig == tipc_own_addr) { + if (memcmp(&media_addr, &b_ptr->addr, sizeof(media_addr))) + disc_dupl_alert(b_ptr, tipc_own_addr, &media_addr); + return; + } + if (!tipc_in_scope(dest, tipc_own_addr)) + return; + if (!tipc_in_scope(b_ptr->link_req->domain, orig)) + return; + + /* Locate structure corresponding to requesting node */ + n_ptr = tipc_node_find(orig); + if (!n_ptr) { + n_ptr = tipc_node_create(orig); + if (!n_ptr) + return; + } + tipc_node_lock(n_ptr); + + /* Don't talk to neighbor during cleanup after last session */ + if (n_ptr->cleanup_required) { + tipc_node_unlock(n_ptr); + return; + } + + link = n_ptr->links[b_ptr->identity]; + + /* Create a link endpoint for this bearer, if necessary */ + if (!link) { + link = tipc_link_create(n_ptr, b_ptr, &media_addr); + if (!link) { + tipc_node_unlock(n_ptr); + return; + } + } + + /* + * Ensure requesting node's media address is correct + * + * If media address doesn't match and the link is working, reject the + * request (must be from a duplicate node). + * + * If media address doesn't match and the link is not working, accept + * the new media address and reset the link to ensure it starts up + * cleanly. + */ + addr = &link->media_addr; + if (memcmp(addr, &media_addr, sizeof(*addr))) { + if (tipc_link_is_up(link) || (!link->started)) { + disc_dupl_alert(b_ptr, orig, &media_addr); + tipc_node_unlock(n_ptr); + return; + } + warn("Resetting link <%s>, peer interface address changed\n", + link->name); + memcpy(addr, &media_addr, sizeof(*addr)); + tipc_link_reset(link); + } + + /* Accept discovery message & send response, if necessary */ + link_fully_up = link_working_working(link); + + if ((type == DSC_REQ_MSG) && !link_fully_up && !b_ptr->blocked) { + rbuf = tipc_disc_init_msg(DSC_RESP_MSG, orig, b_ptr); + if (rbuf) { + b_ptr->media->send_msg(rbuf, b_ptr, &media_addr); + buf_discard(rbuf); + } + } + + tipc_node_unlock(n_ptr); +} + +/** + * disc_update - update frequency of periodic link setup requests + * @req: ptr to link request structure + * + * Reinitiates discovery process if discovery object has no associated nodes + * and is either not currently searching or is searching at a slow rate + */ + +static void disc_update(struct link_req *req) +{ + if (!req->num_nodes) { + if ((req->timer_intv == TIPC_LINK_REQ_INACTIVE) || + (req->timer_intv > TIPC_LINK_REQ_FAST)) { + req->timer_intv = TIPC_LINK_REQ_INIT; + k_start_timer(&req->timer, req->timer_intv); + } + } +} + +/** + * tipc_disc_add_dest - increment set of discovered nodes + * @req: ptr to link request structure + */ + +void tipc_disc_add_dest(struct link_req *req) +{ + req->num_nodes++; +} + +/** + * tipc_disc_remove_dest - decrement set of discovered nodes + * @req: ptr to link request structure + */ + +void tipc_disc_remove_dest(struct link_req *req) +{ + req->num_nodes--; + disc_update(req); +} + +/** + * disc_send_msg - send link setup request message + * @req: ptr to link request structure + */ + +static void disc_send_msg(struct link_req *req) +{ + if (!req->bearer->blocked) + 
tipc_bearer_send(req->bearer, req->buf, &req->dest); +} + +/** + * disc_timeout - send a periodic link setup request + * @req: ptr to link request structure + * + * Called whenever a link setup request timer associated with a bearer expires. + */ + +static void disc_timeout(struct link_req *req) +{ + int max_delay; + + spin_lock_bh(&req->bearer->lock); + + /* Stop searching if only desired node has been found */ + + if (tipc_node(req->domain) && req->num_nodes) { + req->timer_intv = TIPC_LINK_REQ_INACTIVE; + goto exit; + } + + /* + * Send discovery message, then update discovery timer + * + * Keep doubling time between requests until limit is reached; + * hold at fast polling rate if don't have any associated nodes, + * otherwise hold at slow polling rate + */ + + disc_send_msg(req); + + req->timer_intv *= 2; + if (req->num_nodes) + max_delay = TIPC_LINK_REQ_SLOW; + else + max_delay = TIPC_LINK_REQ_FAST; + if (req->timer_intv > max_delay) + req->timer_intv = max_delay; + + k_start_timer(&req->timer, req->timer_intv); +exit: + spin_unlock_bh(&req->bearer->lock); +} + +/** + * tipc_disc_create - create object to send periodic link setup requests + * @b_ptr: ptr to bearer issuing requests + * @dest: destination address for request messages + * @dest_domain: network domain to which links can be established + * + * Returns 0 if successful, otherwise -errno. + */ + +int tipc_disc_create(struct tipc_bearer *b_ptr, + struct tipc_media_addr *dest, u32 dest_domain) +{ + struct link_req *req; + + req = kmalloc(sizeof(*req), GFP_ATOMIC); + if (!req) + return -ENOMEM; + + req->buf = tipc_disc_init_msg(DSC_REQ_MSG, dest_domain, b_ptr); + if (!req->buf) { + kfree(req); + return -ENOMSG; + } + + memcpy(&req->dest, dest, sizeof(*dest)); + req->bearer = b_ptr; + req->domain = dest_domain; + req->num_nodes = 0; + req->timer_intv = TIPC_LINK_REQ_INIT; + k_init_timer(&req->timer, (Handler)disc_timeout, (unsigned long)req); + k_start_timer(&req->timer, req->timer_intv); + b_ptr->link_req = req; + disc_send_msg(req); + return 0; +} + +/** + * tipc_disc_delete - destroy object sending periodic link setup requests + * @req: ptr to link request structure + */ + +void tipc_disc_delete(struct link_req *req) +{ + k_cancel_timer(&req->timer); + k_term_timer(&req->timer); + buf_discard(req->buf); + kfree(req); +} + diff --git a/net/tipc/discover.h b/net/tipc/discover.h new file mode 100644 index 00000000..a3af595b --- /dev/null +++ b/net/tipc/discover.h @@ -0,0 +1,49 @@ +/* + * net/tipc/discover.h + * + * Copyright (c) 2003-2006, Ericsson AB + * Copyright (c) 2005, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_DISCOVER_H +#define _TIPC_DISCOVER_H + +struct link_req; + +int tipc_disc_create(struct tipc_bearer *b_ptr, struct tipc_media_addr *dest, + u32 dest_domain); +void tipc_disc_delete(struct link_req *req); +void tipc_disc_add_dest(struct link_req *req); +void tipc_disc_remove_dest(struct link_req *req); +void tipc_disc_recv_msg(struct sk_buff *buf, struct tipc_bearer *b_ptr); + +#endif diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c new file mode 100644 index 00000000..b69092eb --- /dev/null +++ b/net/tipc/eth_media.c @@ -0,0 +1,323 @@ +/* + * net/tipc/eth_media.c: Ethernet bearer support for TIPC + * + * Copyright (c) 2001-2007, Ericsson AB + * Copyright (c) 2005-2007, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "core.h" +#include "bearer.h" + +#define MAX_ETH_BEARERS 2 +#define ETH_LINK_PRIORITY TIPC_DEF_LINK_PRI +#define ETH_LINK_TOLERANCE TIPC_DEF_LINK_TOL +#define ETH_LINK_WINDOW TIPC_DEF_LINK_WIN + +/** + * struct eth_bearer - Ethernet bearer data structure + * @bearer: ptr to associated "generic" bearer structure + * @dev: ptr to associated Ethernet network device + * @tipc_packet_type: used in binding TIPC to Ethernet driver + */ + +struct eth_bearer { + struct tipc_bearer *bearer; + struct net_device *dev; + struct packet_type tipc_packet_type; +}; + +static struct eth_bearer eth_bearers[MAX_ETH_BEARERS]; +static int eth_started; +static struct notifier_block notifier; + +/** + * send_msg - send a TIPC message out over an Ethernet interface + */ + +static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr, + struct tipc_media_addr *dest) +{ + struct sk_buff *clone; + struct net_device *dev; + int delta; + + clone = skb_clone(buf, GFP_ATOMIC); + if (!clone) + return 0; + + dev = ((struct eth_bearer *)(tb_ptr->usr_handle))->dev; + delta = dev->hard_header_len - skb_headroom(buf); + + if ((delta > 0) && + pskb_expand_head(clone, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) { + kfree_skb(clone); + return 0; + } + + skb_reset_network_header(clone); + clone->dev = dev; + dev_hard_header(clone, dev, ETH_P_TIPC, &dest->dev_addr.eth_addr, + dev->dev_addr, clone->len); + dev_queue_xmit(clone); + return 0; +} + +/** + * recv_msg - handle incoming TIPC message from an Ethernet interface + * + * Accept only packets explicitly sent to this node, or broadcast packets; + * ignores packets sent using Ethernet multicast, and traffic sent to other + * nodes (which can happen if interface is running in promiscuous mode). + */ + +static int recv_msg(struct sk_buff *buf, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct eth_bearer *eb_ptr = (struct eth_bearer *)pt->af_packet_priv; + + if (!net_eq(dev_net(dev), &init_net)) { + kfree_skb(buf); + return 0; + } + + if (likely(eb_ptr->bearer)) { + if (likely(buf->pkt_type <= PACKET_BROADCAST)) { + buf->next = NULL; + tipc_recv_msg(buf, eb_ptr->bearer); + return 0; + } + } + kfree_skb(buf); + return 0; +} + +/** + * enable_bearer - attach TIPC bearer to an Ethernet interface + */ + +static int enable_bearer(struct tipc_bearer *tb_ptr) +{ + struct net_device *dev = NULL; + struct net_device *pdev = NULL; + struct eth_bearer *eb_ptr = ð_bearers[0]; + struct eth_bearer *stop = ð_bearers[MAX_ETH_BEARERS]; + char *driver_name = strchr((const char *)tb_ptr->name, ':') + 1; + int pending_dev = 0; + + /* Find unused Ethernet bearer structure */ + + while (eb_ptr->dev) { + if (!eb_ptr->bearer) + pending_dev++; + if (++eb_ptr == stop) + return pending_dev ? 
-EAGAIN : -EDQUOT; + } + + /* Find device with specified name */ + + for_each_netdev(&init_net, pdev) { + if (!strncmp(pdev->name, driver_name, IFNAMSIZ)) { + dev = pdev; + break; + } + } + if (!dev) + return -ENODEV; + + /* Find Ethernet bearer for device (or create one) */ + + while ((eb_ptr != stop) && eb_ptr->dev && (eb_ptr->dev != dev)) + eb_ptr++; + if (eb_ptr == stop) + return -EDQUOT; + if (!eb_ptr->dev) { + eb_ptr->dev = dev; + eb_ptr->tipc_packet_type.type = htons(ETH_P_TIPC); + eb_ptr->tipc_packet_type.dev = dev; + eb_ptr->tipc_packet_type.func = recv_msg; + eb_ptr->tipc_packet_type.af_packet_priv = eb_ptr; + INIT_LIST_HEAD(&(eb_ptr->tipc_packet_type.list)); + dev_hold(dev); + dev_add_pack(&eb_ptr->tipc_packet_type); + } + + /* Associate TIPC bearer with Ethernet bearer */ + + eb_ptr->bearer = tb_ptr; + tb_ptr->usr_handle = (void *)eb_ptr; + tb_ptr->mtu = dev->mtu; + tb_ptr->blocked = 0; + tb_ptr->addr.type = htonl(TIPC_MEDIA_TYPE_ETH); + memcpy(&tb_ptr->addr.dev_addr, dev->dev_addr, ETH_ALEN); + return 0; +} + +/** + * disable_bearer - detach TIPC bearer from an Ethernet interface + * + * We really should do dev_remove_pack() here, but this function can not be + * called at tasklet level. => Use eth_bearer->bearer as a flag to throw away + * incoming buffers, & postpone dev_remove_pack() to eth_media_stop() on exit. + */ + +static void disable_bearer(struct tipc_bearer *tb_ptr) +{ + ((struct eth_bearer *)tb_ptr->usr_handle)->bearer = NULL; +} + +/** + * recv_notification - handle device updates from OS + * + * Change the state of the Ethernet bearer (if any) associated with the + * specified device. + */ + +static int recv_notification(struct notifier_block *nb, unsigned long evt, + void *dv) +{ + struct net_device *dev = (struct net_device *)dv; + struct eth_bearer *eb_ptr = &eth_bearers[0]; + struct eth_bearer *stop = &eth_bearers[MAX_ETH_BEARERS]; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + while ((eb_ptr->dev != dev)) { + if (++eb_ptr == stop) + return NOTIFY_DONE; /* couldn't find device */ + } + if (!eb_ptr->bearer) + return NOTIFY_DONE; /* bearer had been disabled */ + + eb_ptr->bearer->mtu = dev->mtu; + + switch (evt) { + case NETDEV_CHANGE: + if (netif_carrier_ok(dev)) + tipc_continue(eb_ptr->bearer); + else + tipc_block_bearer(eb_ptr->bearer->name); + break; + case NETDEV_UP: + tipc_continue(eb_ptr->bearer); + break; + case NETDEV_DOWN: + tipc_block_bearer(eb_ptr->bearer->name); + break; + case NETDEV_CHANGEMTU: + case NETDEV_CHANGEADDR: + tipc_block_bearer(eb_ptr->bearer->name); + tipc_continue(eb_ptr->bearer); + break; + case NETDEV_UNREGISTER: + case NETDEV_CHANGENAME: + tipc_disable_bearer(eb_ptr->bearer->name); + break; + } + return NOTIFY_OK; +} + +/** + * eth_addr2str - convert Ethernet address to string + */ + +static char *eth_addr2str(struct tipc_media_addr *a, char *str_buf, int str_size) +{ + unchar *addr = (unchar *)&a->dev_addr; + + if (str_size < 18) + *str_buf = '\0'; + else + sprintf(str_buf, "%pM", addr); + return str_buf; +} + +/** + * tipc_eth_media_start - activate Ethernet bearer support + * + * Register Ethernet media type with TIPC bearer code. Also register + * with OS for notifications about device state changes. 
+ */ + +int tipc_eth_media_start(void) +{ + struct tipc_media_addr bcast_addr; + int res; + + if (eth_started) + return -EINVAL; + + bcast_addr.type = htonl(TIPC_MEDIA_TYPE_ETH); + memset(&bcast_addr.dev_addr, 0xff, ETH_ALEN); + + memset(eth_bearers, 0, sizeof(eth_bearers)); + + res = tipc_register_media(TIPC_MEDIA_TYPE_ETH, "eth", + enable_bearer, disable_bearer, send_msg, + eth_addr2str, &bcast_addr, ETH_LINK_PRIORITY, + ETH_LINK_TOLERANCE, ETH_LINK_WINDOW); + if (res) + return res; + + notifier.notifier_call = &recv_notification; + notifier.priority = 0; + res = register_netdevice_notifier(&notifier); + if (!res) + eth_started = 1; + return res; +} + +/** + * tipc_eth_media_stop - deactivate Ethernet bearer support + */ + +void tipc_eth_media_stop(void) +{ + int i; + + if (!eth_started) + return; + + unregister_netdevice_notifier(&notifier); + for (i = 0; i < MAX_ETH_BEARERS ; i++) { + if (eth_bearers[i].bearer) { + eth_bearers[i].bearer->blocked = 1; + eth_bearers[i].bearer = NULL; + } + if (eth_bearers[i].dev) { + dev_remove_pack(&eth_bearers[i].tipc_packet_type); + dev_put(eth_bearers[i].dev); + } + } + memset(&eth_bearers, 0, sizeof(eth_bearers)); + eth_started = 0; +} diff --git a/net/tipc/handler.c b/net/tipc/handler.c new file mode 100644 index 00000000..274c98e1 --- /dev/null +++ b/net/tipc/handler.c @@ -0,0 +1,132 @@ +/* + * net/tipc/handler.c: TIPC signal handling + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "core.h" + +struct queue_item { + struct list_head next_signal; + void (*handler) (unsigned long); + unsigned long data; +}; + +static struct kmem_cache *tipc_queue_item_cache; +static struct list_head signal_queue_head; +static DEFINE_SPINLOCK(qitem_lock); +static int handler_enabled; + +static void process_signal_queue(unsigned long dummy); + +static DECLARE_TASKLET_DISABLED(tipc_tasklet, process_signal_queue, 0); + + +unsigned int tipc_k_signal(Handler routine, unsigned long argument) +{ + struct queue_item *item; + + if (!handler_enabled) { + err("Signal request ignored by handler\n"); + return -ENOPROTOOPT; + } + + spin_lock_bh(&qitem_lock); + item = kmem_cache_alloc(tipc_queue_item_cache, GFP_ATOMIC); + if (!item) { + err("Signal queue out of memory\n"); + spin_unlock_bh(&qitem_lock); + return -ENOMEM; + } + item->handler = routine; + item->data = argument; + list_add_tail(&item->next_signal, &signal_queue_head); + spin_unlock_bh(&qitem_lock); + tasklet_schedule(&tipc_tasklet); + return 0; +} + +static void process_signal_queue(unsigned long dummy) +{ + struct queue_item *__volatile__ item; + struct list_head *l, *n; + + spin_lock_bh(&qitem_lock); + list_for_each_safe(l, n, &signal_queue_head) { + item = list_entry(l, struct queue_item, next_signal); + list_del(&item->next_signal); + spin_unlock_bh(&qitem_lock); + item->handler(item->data); + spin_lock_bh(&qitem_lock); + kmem_cache_free(tipc_queue_item_cache, item); + } + spin_unlock_bh(&qitem_lock); +} + +int tipc_handler_start(void) +{ + tipc_queue_item_cache = + kmem_cache_create("tipc_queue_items", sizeof(struct queue_item), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!tipc_queue_item_cache) + return -ENOMEM; + + INIT_LIST_HEAD(&signal_queue_head); + tasklet_enable(&tipc_tasklet); + handler_enabled = 1; + return 0; +} + +void tipc_handler_stop(void) +{ + struct list_head *l, *n; + struct queue_item *item; + + if (!handler_enabled) + return; + + handler_enabled = 0; + tasklet_disable(&tipc_tasklet); + tasklet_kill(&tipc_tasklet); + + spin_lock_bh(&qitem_lock); + list_for_each_safe(l, n, &signal_queue_head) { + item = list_entry(l, struct queue_item, next_signal); + list_del(&item->next_signal); + kmem_cache_free(tipc_queue_item_cache, item); + } + spin_unlock_bh(&qitem_lock); + + kmem_cache_destroy(tipc_queue_item_cache); +} + diff --git a/net/tipc/link.c b/net/tipc/link.c new file mode 100644 index 00000000..5ed4b4f7 --- /dev/null +++ b/net/tipc/link.c @@ -0,0 +1,3042 @@ +/* + * net/tipc/link.c: TIPC link code + * + * Copyright (c) 1996-2007, Ericsson AB + * Copyright (c) 2004-2007, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "link.h" +#include "port.h" +#include "name_distr.h" +#include "discover.h" +#include "config.h" + + +/* + * Out-of-range value for link session numbers + */ + +#define INVALID_SESSION 0x10000 + +/* + * Link state events: + */ + +#define STARTING_EVT 856384768 /* link processing trigger */ +#define TRAFFIC_MSG_EVT 560815u /* rx'd ??? */ +#define TIMEOUT_EVT 560817u /* link timer expired */ + +/* + * The following two 'message types' is really just implementation + * data conveniently stored in the message header. + * They must not be considered part of the protocol + */ +#define OPEN_MSG 0 +#define CLOSED_MSG 1 + +/* + * State value stored in 'exp_msg_count' + */ + +#define START_CHANGEOVER 100000u + +/** + * struct link_name - deconstructed link name + * @addr_local: network address of node at this end + * @if_local: name of interface at this end + * @addr_peer: network address of node at far end + * @if_peer: name of interface at far end + */ + +struct link_name { + u32 addr_local; + char if_local[TIPC_MAX_IF_NAME]; + u32 addr_peer; + char if_peer[TIPC_MAX_IF_NAME]; +}; + +static void link_handle_out_of_seq_msg(struct link *l_ptr, + struct sk_buff *buf); +static void link_recv_proto_msg(struct link *l_ptr, struct sk_buff *buf); +static int link_recv_changeover_msg(struct link **l_ptr, struct sk_buff **buf); +static void link_set_supervision_props(struct link *l_ptr, u32 tolerance); +static int link_send_sections_long(struct tipc_port *sender, + struct iovec const *msg_sect, + u32 num_sect, unsigned int total_len, + u32 destnode); +static void link_check_defragm_bufs(struct link *l_ptr); +static void link_state_event(struct link *l_ptr, u32 event); +static void link_reset_statistics(struct link *l_ptr); +static void link_print(struct link *l_ptr, const char *str); +static void link_start(struct link *l_ptr); +static int link_send_long_buf(struct link *l_ptr, struct sk_buff *buf); + +/* + * Simple link routines + */ + +static unsigned int align(unsigned int i) +{ + return (i + 3) & ~3u; +} + +static void link_init_max_pkt(struct link *l_ptr) +{ + u32 max_pkt; + + max_pkt = (l_ptr->b_ptr->mtu & ~3); + if (max_pkt > MAX_MSG_SIZE) + max_pkt = MAX_MSG_SIZE; + + l_ptr->max_pkt_target = max_pkt; + if (l_ptr->max_pkt_target < MAX_PKT_DEFAULT) + l_ptr->max_pkt = l_ptr->max_pkt_target; + else + l_ptr->max_pkt = MAX_PKT_DEFAULT; + + l_ptr->max_pkt_probes = 0; +} + +static u32 link_next_sent(struct link *l_ptr) +{ + if (l_ptr->next_out) + return msg_seqno(buf_msg(l_ptr->next_out)); + return mod(l_ptr->next_out_no); +} + +static u32 link_last_sent(struct link *l_ptr) +{ + return mod(link_next_sent(l_ptr) - 1); +} + +/* + * Simple non-static link routines (i.e. 
referenced outside this file) + */ + +int tipc_link_is_up(struct link *l_ptr) +{ + if (!l_ptr) + return 0; + return link_working_working(l_ptr) || link_working_unknown(l_ptr); +} + +int tipc_link_is_active(struct link *l_ptr) +{ + return (l_ptr->owner->active_links[0] == l_ptr) || + (l_ptr->owner->active_links[1] == l_ptr); +} + +/** + * link_name_validate - validate & (optionally) deconstruct link name + * @name - ptr to link name string + * @name_parts - ptr to area for link name components (or NULL if not needed) + * + * Returns 1 if link name is valid, otherwise 0. + */ + +static int link_name_validate(const char *name, struct link_name *name_parts) +{ + char name_copy[TIPC_MAX_LINK_NAME]; + char *addr_local; + char *if_local; + char *addr_peer; + char *if_peer; + char dummy; + u32 z_local, c_local, n_local; + u32 z_peer, c_peer, n_peer; + u32 if_local_len; + u32 if_peer_len; + + /* copy link name & ensure length is OK */ + + name_copy[TIPC_MAX_LINK_NAME - 1] = 0; + /* need above in case non-Posix strncpy() doesn't pad with nulls */ + strncpy(name_copy, name, TIPC_MAX_LINK_NAME); + if (name_copy[TIPC_MAX_LINK_NAME - 1] != 0) + return 0; + + /* ensure all component parts of link name are present */ + + addr_local = name_copy; + if_local = strchr(addr_local, ':'); + if (if_local == NULL) + return 0; + *(if_local++) = 0; + addr_peer = strchr(if_local, '-'); + if (addr_peer == NULL) + return 0; + *(addr_peer++) = 0; + if_local_len = addr_peer - if_local; + if_peer = strchr(addr_peer, ':'); + if (if_peer == NULL) + return 0; + *(if_peer++) = 0; + if_peer_len = strlen(if_peer) + 1; + + /* validate component parts of link name */ + + if ((sscanf(addr_local, "%u.%u.%u%c", + &z_local, &c_local, &n_local, &dummy) != 3) || + (sscanf(addr_peer, "%u.%u.%u%c", + &z_peer, &c_peer, &n_peer, &dummy) != 3) || + (z_local > 255) || (c_local > 4095) || (n_local > 4095) || + (z_peer > 255) || (c_peer > 4095) || (n_peer > 4095) || + (if_local_len <= 1) || (if_local_len > TIPC_MAX_IF_NAME) || + (if_peer_len <= 1) || (if_peer_len > TIPC_MAX_IF_NAME) || + (strspn(if_local, tipc_alphabet) != (if_local_len - 1)) || + (strspn(if_peer, tipc_alphabet) != (if_peer_len - 1))) + return 0; + + /* return link name components, if necessary */ + + if (name_parts) { + name_parts->addr_local = tipc_addr(z_local, c_local, n_local); + strcpy(name_parts->if_local, if_local); + name_parts->addr_peer = tipc_addr(z_peer, c_peer, n_peer); + strcpy(name_parts->if_peer, if_peer); + } + return 1; +} + +/** + * link_timeout - handle expiration of link timer + * @l_ptr: pointer to link + * + * This routine must not grab "tipc_net_lock" to avoid a potential deadlock conflict + * with tipc_link_delete(). (There is no risk that the node will be deleted by + * another thread because tipc_link_delete() always cancels the link timer before + * tipc_node_delete() is called.) 
+ */ + +static void link_timeout(struct link *l_ptr) +{ + tipc_node_lock(l_ptr->owner); + + /* update counters used in statistical profiling of send traffic */ + + l_ptr->stats.accu_queue_sz += l_ptr->out_queue_size; + l_ptr->stats.queue_sz_counts++; + + if (l_ptr->first_out) { + struct tipc_msg *msg = buf_msg(l_ptr->first_out); + u32 length = msg_size(msg); + + if ((msg_user(msg) == MSG_FRAGMENTER) && + (msg_type(msg) == FIRST_FRAGMENT)) { + length = msg_size(msg_get_wrapped(msg)); + } + if (length) { + l_ptr->stats.msg_lengths_total += length; + l_ptr->stats.msg_length_counts++; + if (length <= 64) + l_ptr->stats.msg_length_profile[0]++; + else if (length <= 256) + l_ptr->stats.msg_length_profile[1]++; + else if (length <= 1024) + l_ptr->stats.msg_length_profile[2]++; + else if (length <= 4096) + l_ptr->stats.msg_length_profile[3]++; + else if (length <= 16384) + l_ptr->stats.msg_length_profile[4]++; + else if (length <= 32768) + l_ptr->stats.msg_length_profile[5]++; + else + l_ptr->stats.msg_length_profile[6]++; + } + } + + /* do all other link processing performed on a periodic basis */ + + link_check_defragm_bufs(l_ptr); + + link_state_event(l_ptr, TIMEOUT_EVT); + + if (l_ptr->next_out) + tipc_link_push_queue(l_ptr); + + tipc_node_unlock(l_ptr->owner); +} + +static void link_set_timer(struct link *l_ptr, u32 time) +{ + k_start_timer(&l_ptr->timer, time); +} + +/** + * tipc_link_create - create a new link + * @n_ptr: pointer to associated node + * @b_ptr: pointer to associated bearer + * @media_addr: media address to use when sending messages over link + * + * Returns pointer to link. + */ + +struct link *tipc_link_create(struct tipc_node *n_ptr, + struct tipc_bearer *b_ptr, + const struct tipc_media_addr *media_addr) +{ + struct link *l_ptr; + struct tipc_msg *msg; + char *if_name; + char addr_string[16]; + u32 peer = n_ptr->addr; + + if (n_ptr->link_cnt >= 2) { + tipc_addr_string_fill(addr_string, n_ptr->addr); + err("Attempt to establish third link to %s\n", addr_string); + return NULL; + } + + if (n_ptr->links[b_ptr->identity]) { + tipc_addr_string_fill(addr_string, n_ptr->addr); + err("Attempt to establish second link on <%s> to %s\n", + b_ptr->name, addr_string); + return NULL; + } + + l_ptr = kzalloc(sizeof(*l_ptr), GFP_ATOMIC); + if (!l_ptr) { + warn("Link creation failed, no memory\n"); + return NULL; + } + + l_ptr->addr = peer; + if_name = strchr(b_ptr->name, ':') + 1; + sprintf(l_ptr->name, "%u.%u.%u:%s-%u.%u.%u:", + tipc_zone(tipc_own_addr), tipc_cluster(tipc_own_addr), + tipc_node(tipc_own_addr), + if_name, + tipc_zone(peer), tipc_cluster(peer), tipc_node(peer)); + /* note: peer i/f is appended to link name by reset/activate */ + memcpy(&l_ptr->media_addr, media_addr, sizeof(*media_addr)); + l_ptr->owner = n_ptr; + l_ptr->checkpoint = 1; + l_ptr->b_ptr = b_ptr; + link_set_supervision_props(l_ptr, b_ptr->media->tolerance); + l_ptr->state = RESET_UNKNOWN; + + l_ptr->pmsg = (struct tipc_msg *)&l_ptr->proto_msg; + msg = l_ptr->pmsg; + tipc_msg_init(msg, LINK_PROTOCOL, RESET_MSG, INT_H_SIZE, l_ptr->addr); + msg_set_size(msg, sizeof(l_ptr->proto_msg)); + msg_set_session(msg, (tipc_random & 0xffff)); + msg_set_bearer_id(msg, b_ptr->identity); + strcpy((char *)msg_data(msg), if_name); + + l_ptr->priority = b_ptr->priority; + tipc_link_set_queue_limits(l_ptr, b_ptr->media->window); + + link_init_max_pkt(l_ptr); + + l_ptr->next_out_no = 1; + INIT_LIST_HEAD(&l_ptr->waiting_ports); + + link_reset_statistics(l_ptr); + + tipc_node_attach_link(n_ptr, l_ptr); + + 
k_init_timer(&l_ptr->timer, (Handler)link_timeout, (unsigned long)l_ptr); + list_add_tail(&l_ptr->link_list, &b_ptr->links); + tipc_k_signal((Handler)link_start, (unsigned long)l_ptr); + + return l_ptr; +} + +/** + * tipc_link_delete - delete a link + * @l_ptr: pointer to link + * + * Note: 'tipc_net_lock' is write_locked, bearer is locked. + * This routine must not grab the node lock until after link timer cancellation + * to avoid a potential deadlock situation. + */ + +void tipc_link_delete(struct link *l_ptr) +{ + if (!l_ptr) { + err("Attempt to delete non-existent link\n"); + return; + } + + k_cancel_timer(&l_ptr->timer); + + tipc_node_lock(l_ptr->owner); + tipc_link_reset(l_ptr); + tipc_node_detach_link(l_ptr->owner, l_ptr); + tipc_link_stop(l_ptr); + list_del_init(&l_ptr->link_list); + tipc_node_unlock(l_ptr->owner); + k_term_timer(&l_ptr->timer); + kfree(l_ptr); +} + +static void link_start(struct link *l_ptr) +{ + tipc_node_lock(l_ptr->owner); + link_state_event(l_ptr, STARTING_EVT); + tipc_node_unlock(l_ptr->owner); +} + +/** + * link_schedule_port - schedule port for deferred sending + * @l_ptr: pointer to link + * @origport: reference to sending port + * @sz: amount of data to be sent + * + * Schedules port for renewed sending of messages after link congestion + * has abated. + */ + +static int link_schedule_port(struct link *l_ptr, u32 origport, u32 sz) +{ + struct tipc_port *p_ptr; + + spin_lock_bh(&tipc_port_list_lock); + p_ptr = tipc_port_lock(origport); + if (p_ptr) { + if (!p_ptr->wakeup) + goto exit; + if (!list_empty(&p_ptr->wait_list)) + goto exit; + p_ptr->congested = 1; + p_ptr->waiting_pkts = 1 + ((sz - 1) / l_ptr->max_pkt); + list_add_tail(&p_ptr->wait_list, &l_ptr->waiting_ports); + l_ptr->stats.link_congs++; +exit: + tipc_port_unlock(p_ptr); + } + spin_unlock_bh(&tipc_port_list_lock); + return -ELINKCONG; +} + +void tipc_link_wakeup_ports(struct link *l_ptr, int all) +{ + struct tipc_port *p_ptr; + struct tipc_port *temp_p_ptr; + int win = l_ptr->queue_limit[0] - l_ptr->out_queue_size; + + if (all) + win = 100000; + if (win <= 0) + return; + if (!spin_trylock_bh(&tipc_port_list_lock)) + return; + if (link_congested(l_ptr)) + goto exit; + list_for_each_entry_safe(p_ptr, temp_p_ptr, &l_ptr->waiting_ports, + wait_list) { + if (win <= 0) + break; + list_del_init(&p_ptr->wait_list); + spin_lock_bh(p_ptr->lock); + p_ptr->congested = 0; + p_ptr->wakeup(p_ptr); + win -= p_ptr->waiting_pkts; + spin_unlock_bh(p_ptr->lock); + } + +exit: + spin_unlock_bh(&tipc_port_list_lock); +} + +/** + * link_release_outqueue - purge link's outbound message queue + * @l_ptr: pointer to link + */ + +static void link_release_outqueue(struct link *l_ptr) +{ + struct sk_buff *buf = l_ptr->first_out; + struct sk_buff *next; + + while (buf) { + next = buf->next; + buf_discard(buf); + buf = next; + } + l_ptr->first_out = NULL; + l_ptr->out_queue_size = 0; +} + +/** + * tipc_link_reset_fragments - purge link's inbound message fragments queue + * @l_ptr: pointer to link + */ + +void tipc_link_reset_fragments(struct link *l_ptr) +{ + struct sk_buff *buf = l_ptr->defragm_buf; + struct sk_buff *next; + + while (buf) { + next = buf->next; + buf_discard(buf); + buf = next; + } + l_ptr->defragm_buf = NULL; +} + +/** + * tipc_link_stop - purge all inbound and outbound messages associated with link + * @l_ptr: pointer to link + */ + +void tipc_link_stop(struct link *l_ptr) +{ + struct sk_buff *buf; + struct sk_buff *next; + + buf = l_ptr->oldest_deferred_in; + while (buf) { + next = buf->next; + 
buf_discard(buf); + buf = next; + } + + buf = l_ptr->first_out; + while (buf) { + next = buf->next; + buf_discard(buf); + buf = next; + } + + tipc_link_reset_fragments(l_ptr); + + buf_discard(l_ptr->proto_msg_queue); + l_ptr->proto_msg_queue = NULL; +} + +/* LINK EVENT CODE IS NOT SUPPORTED AT PRESENT */ +#define link_send_event(fcn, l_ptr, up) do { } while (0) + +void tipc_link_reset(struct link *l_ptr) +{ + struct sk_buff *buf; + u32 prev_state = l_ptr->state; + u32 checkpoint = l_ptr->next_in_no; + int was_active_link = tipc_link_is_active(l_ptr); + + msg_set_session(l_ptr->pmsg, ((msg_session(l_ptr->pmsg) + 1) & 0xffff)); + + /* Link is down, accept any session */ + l_ptr->peer_session = INVALID_SESSION; + + /* Prepare for max packet size negotiation */ + link_init_max_pkt(l_ptr); + + l_ptr->state = RESET_UNKNOWN; + + if ((prev_state == RESET_UNKNOWN) || (prev_state == RESET_RESET)) + return; + + tipc_node_link_down(l_ptr->owner, l_ptr); + tipc_bearer_remove_dest(l_ptr->b_ptr, l_ptr->addr); + + if (was_active_link && tipc_node_active_links(l_ptr->owner) && + l_ptr->owner->permit_changeover) { + l_ptr->reset_checkpoint = checkpoint; + l_ptr->exp_msg_count = START_CHANGEOVER; + } + + /* Clean up all queues: */ + + link_release_outqueue(l_ptr); + buf_discard(l_ptr->proto_msg_queue); + l_ptr->proto_msg_queue = NULL; + buf = l_ptr->oldest_deferred_in; + while (buf) { + struct sk_buff *next = buf->next; + buf_discard(buf); + buf = next; + } + if (!list_empty(&l_ptr->waiting_ports)) + tipc_link_wakeup_ports(l_ptr, 1); + + l_ptr->retransm_queue_head = 0; + l_ptr->retransm_queue_size = 0; + l_ptr->last_out = NULL; + l_ptr->first_out = NULL; + l_ptr->next_out = NULL; + l_ptr->unacked_window = 0; + l_ptr->checkpoint = 1; + l_ptr->next_out_no = 1; + l_ptr->deferred_inqueue_sz = 0; + l_ptr->oldest_deferred_in = NULL; + l_ptr->newest_deferred_in = NULL; + l_ptr->fsm_msg_cnt = 0; + l_ptr->stale_count = 0; + link_reset_statistics(l_ptr); + + link_send_event(tipc_cfg_link_event, l_ptr, 0); + if (!in_own_cluster(l_ptr->addr)) + link_send_event(tipc_disc_link_event, l_ptr, 0); +} + + +static void link_activate(struct link *l_ptr) +{ + l_ptr->next_in_no = l_ptr->stats.recv_info = 1; + tipc_node_link_up(l_ptr->owner, l_ptr); + tipc_bearer_add_dest(l_ptr->b_ptr, l_ptr->addr); + link_send_event(tipc_cfg_link_event, l_ptr, 1); + if (!in_own_cluster(l_ptr->addr)) + link_send_event(tipc_disc_link_event, l_ptr, 1); +} + +/** + * link_state_event - link finite state machine + * @l_ptr: pointer to link + * @event: state machine event to process + */ + +static void link_state_event(struct link *l_ptr, unsigned event) +{ + struct link *other; + u32 cont_intv = l_ptr->continuity_interval; + + if (!l_ptr->started && (event != STARTING_EVT)) + return; /* Not yet. 
*/ + + if (link_blocked(l_ptr)) { + if (event == TIMEOUT_EVT) + link_set_timer(l_ptr, cont_intv); + return; /* Changeover going on */ + } + + switch (l_ptr->state) { + case WORKING_WORKING: + switch (event) { + case TRAFFIC_MSG_EVT: + case ACTIVATE_MSG: + break; + case TIMEOUT_EVT: + if (l_ptr->next_in_no != l_ptr->checkpoint) { + l_ptr->checkpoint = l_ptr->next_in_no; + if (tipc_bclink_acks_missing(l_ptr->owner)) { + tipc_link_send_proto_msg(l_ptr, STATE_MSG, + 0, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + } else if (l_ptr->max_pkt < l_ptr->max_pkt_target) { + tipc_link_send_proto_msg(l_ptr, STATE_MSG, + 1, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + } + link_set_timer(l_ptr, cont_intv); + break; + } + l_ptr->state = WORKING_UNKNOWN; + l_ptr->fsm_msg_cnt = 0; + tipc_link_send_proto_msg(l_ptr, STATE_MSG, 1, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + link_set_timer(l_ptr, cont_intv / 4); + break; + case RESET_MSG: + info("Resetting link <%s>, requested by peer\n", + l_ptr->name); + tipc_link_reset(l_ptr); + l_ptr->state = RESET_RESET; + l_ptr->fsm_msg_cnt = 0; + tipc_link_send_proto_msg(l_ptr, ACTIVATE_MSG, 0, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + link_set_timer(l_ptr, cont_intv); + break; + default: + err("Unknown link event %u in WW state\n", event); + } + break; + case WORKING_UNKNOWN: + switch (event) { + case TRAFFIC_MSG_EVT: + case ACTIVATE_MSG: + l_ptr->state = WORKING_WORKING; + l_ptr->fsm_msg_cnt = 0; + link_set_timer(l_ptr, cont_intv); + break; + case RESET_MSG: + info("Resetting link <%s>, requested by peer " + "while probing\n", l_ptr->name); + tipc_link_reset(l_ptr); + l_ptr->state = RESET_RESET; + l_ptr->fsm_msg_cnt = 0; + tipc_link_send_proto_msg(l_ptr, ACTIVATE_MSG, 0, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + link_set_timer(l_ptr, cont_intv); + break; + case TIMEOUT_EVT: + if (l_ptr->next_in_no != l_ptr->checkpoint) { + l_ptr->state = WORKING_WORKING; + l_ptr->fsm_msg_cnt = 0; + l_ptr->checkpoint = l_ptr->next_in_no; + if (tipc_bclink_acks_missing(l_ptr->owner)) { + tipc_link_send_proto_msg(l_ptr, STATE_MSG, + 0, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + } + link_set_timer(l_ptr, cont_intv); + } else if (l_ptr->fsm_msg_cnt < l_ptr->abort_limit) { + tipc_link_send_proto_msg(l_ptr, STATE_MSG, + 1, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + link_set_timer(l_ptr, cont_intv / 4); + } else { /* Link has failed */ + warn("Resetting link <%s>, peer not responding\n", + l_ptr->name); + tipc_link_reset(l_ptr); + l_ptr->state = RESET_UNKNOWN; + l_ptr->fsm_msg_cnt = 0; + tipc_link_send_proto_msg(l_ptr, RESET_MSG, + 0, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + link_set_timer(l_ptr, cont_intv); + } + break; + default: + err("Unknown link event %u in WU state\n", event); + } + break; + case RESET_UNKNOWN: + switch (event) { + case TRAFFIC_MSG_EVT: + break; + case ACTIVATE_MSG: + other = l_ptr->owner->active_links[0]; + if (other && link_working_unknown(other)) + break; + l_ptr->state = WORKING_WORKING; + l_ptr->fsm_msg_cnt = 0; + link_activate(l_ptr); + tipc_link_send_proto_msg(l_ptr, STATE_MSG, 1, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + link_set_timer(l_ptr, cont_intv); + break; + case RESET_MSG: + l_ptr->state = RESET_RESET; + l_ptr->fsm_msg_cnt = 0; + tipc_link_send_proto_msg(l_ptr, ACTIVATE_MSG, 1, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + link_set_timer(l_ptr, cont_intv); + break; + case STARTING_EVT: + l_ptr->started = 1; + /* fall through */ + case TIMEOUT_EVT: + tipc_link_send_proto_msg(l_ptr, RESET_MSG, 0, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + link_set_timer(l_ptr, cont_intv); + break; + default: + 
err("Unknown link event %u in RU state\n", event); + } + break; + case RESET_RESET: + switch (event) { + case TRAFFIC_MSG_EVT: + case ACTIVATE_MSG: + other = l_ptr->owner->active_links[0]; + if (other && link_working_unknown(other)) + break; + l_ptr->state = WORKING_WORKING; + l_ptr->fsm_msg_cnt = 0; + link_activate(l_ptr); + tipc_link_send_proto_msg(l_ptr, STATE_MSG, 1, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + link_set_timer(l_ptr, cont_intv); + break; + case RESET_MSG: + break; + case TIMEOUT_EVT: + tipc_link_send_proto_msg(l_ptr, ACTIVATE_MSG, 0, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + link_set_timer(l_ptr, cont_intv); + break; + default: + err("Unknown link event %u in RR state\n", event); + } + break; + default: + err("Unknown link state %u/%u\n", l_ptr->state, event); + } +} + +/* + * link_bundle_buf(): Append contents of a buffer to + * the tail of an existing one. + */ + +static int link_bundle_buf(struct link *l_ptr, + struct sk_buff *bundler, + struct sk_buff *buf) +{ + struct tipc_msg *bundler_msg = buf_msg(bundler); + struct tipc_msg *msg = buf_msg(buf); + u32 size = msg_size(msg); + u32 bundle_size = msg_size(bundler_msg); + u32 to_pos = align(bundle_size); + u32 pad = to_pos - bundle_size; + + if (msg_user(bundler_msg) != MSG_BUNDLER) + return 0; + if (msg_type(bundler_msg) != OPEN_MSG) + return 0; + if (skb_tailroom(bundler) < (pad + size)) + return 0; + if (l_ptr->max_pkt < (to_pos + size)) + return 0; + + skb_put(bundler, pad + size); + skb_copy_to_linear_data_offset(bundler, to_pos, buf->data, size); + msg_set_size(bundler_msg, to_pos + size); + msg_set_msgcnt(bundler_msg, msg_msgcnt(bundler_msg) + 1); + buf_discard(buf); + l_ptr->stats.sent_bundled++; + return 1; +} + +static void link_add_to_outqueue(struct link *l_ptr, + struct sk_buff *buf, + struct tipc_msg *msg) +{ + u32 ack = mod(l_ptr->next_in_no - 1); + u32 seqno = mod(l_ptr->next_out_no++); + + msg_set_word(msg, 2, ((ack << 16) | seqno)); + msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); + buf->next = NULL; + if (l_ptr->first_out) { + l_ptr->last_out->next = buf; + l_ptr->last_out = buf; + } else + l_ptr->first_out = l_ptr->last_out = buf; + + l_ptr->out_queue_size++; + if (l_ptr->out_queue_size > l_ptr->stats.max_queue_sz) + l_ptr->stats.max_queue_sz = l_ptr->out_queue_size; +} + +static void link_add_chain_to_outqueue(struct link *l_ptr, + struct sk_buff *buf_chain, + u32 long_msgno) +{ + struct sk_buff *buf; + struct tipc_msg *msg; + + if (!l_ptr->next_out) + l_ptr->next_out = buf_chain; + while (buf_chain) { + buf = buf_chain; + buf_chain = buf_chain->next; + + msg = buf_msg(buf); + msg_set_long_msgno(msg, long_msgno); + link_add_to_outqueue(l_ptr, buf, msg); + } +} + +/* + * tipc_link_send_buf() is the 'full path' for messages, called from + * inside TIPC when the 'fast path' in tipc_send_buf + * has failed, and from link_send() + */ + +int tipc_link_send_buf(struct link *l_ptr, struct sk_buff *buf) +{ + struct tipc_msg *msg = buf_msg(buf); + u32 size = msg_size(msg); + u32 dsz = msg_data_sz(msg); + u32 queue_size = l_ptr->out_queue_size; + u32 imp = tipc_msg_tot_importance(msg); + u32 queue_limit = l_ptr->queue_limit[imp]; + u32 max_packet = l_ptr->max_pkt; + + msg_set_prevnode(msg, tipc_own_addr); /* If routed message */ + + /* Match msg importance against queue limits: */ + + if (unlikely(queue_size >= queue_limit)) { + if (imp <= TIPC_CRITICAL_IMPORTANCE) { + link_schedule_port(l_ptr, msg_origport(msg), size); + buf_discard(buf); + return -ELINKCONG; + } + buf_discard(buf); + if (imp > CONN_MANAGER) 
{ + warn("Resetting link <%s>, send queue full", l_ptr->name); + tipc_link_reset(l_ptr); + } + return dsz; + } + + /* Fragmentation needed ? */ + + if (size > max_packet) + return link_send_long_buf(l_ptr, buf); + + /* Packet can be queued or sent: */ + + if (likely(!tipc_bearer_congested(l_ptr->b_ptr, l_ptr) && + !link_congested(l_ptr))) { + link_add_to_outqueue(l_ptr, buf, msg); + + if (likely(tipc_bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr))) { + l_ptr->unacked_window = 0; + } else { + tipc_bearer_schedule(l_ptr->b_ptr, l_ptr); + l_ptr->stats.bearer_congs++; + l_ptr->next_out = buf; + } + return dsz; + } + /* Congestion: can message be bundled ?: */ + + if ((msg_user(msg) != CHANGEOVER_PROTOCOL) && + (msg_user(msg) != MSG_FRAGMENTER)) { + + /* Try adding message to an existing bundle */ + + if (l_ptr->next_out && + link_bundle_buf(l_ptr, l_ptr->last_out, buf)) { + tipc_bearer_resolve_congestion(l_ptr->b_ptr, l_ptr); + return dsz; + } + + /* Try creating a new bundle */ + + if (size <= max_packet * 2 / 3) { + struct sk_buff *bundler = tipc_buf_acquire(max_packet); + struct tipc_msg bundler_hdr; + + if (bundler) { + tipc_msg_init(&bundler_hdr, MSG_BUNDLER, OPEN_MSG, + INT_H_SIZE, l_ptr->addr); + skb_copy_to_linear_data(bundler, &bundler_hdr, + INT_H_SIZE); + skb_trim(bundler, INT_H_SIZE); + link_bundle_buf(l_ptr, bundler, buf); + buf = bundler; + msg = buf_msg(buf); + l_ptr->stats.sent_bundles++; + } + } + } + if (!l_ptr->next_out) + l_ptr->next_out = buf; + link_add_to_outqueue(l_ptr, buf, msg); + tipc_bearer_resolve_congestion(l_ptr->b_ptr, l_ptr); + return dsz; +} + +/* + * tipc_link_send(): same as tipc_link_send_buf(), but the link to use has + * not been selected yet, and the the owner node is not locked + * Called by TIPC internal users, e.g. the name distributor + */ + +int tipc_link_send(struct sk_buff *buf, u32 dest, u32 selector) +{ + struct link *l_ptr; + struct tipc_node *n_ptr; + int res = -ELINKCONG; + + read_lock_bh(&tipc_net_lock); + n_ptr = tipc_node_find(dest); + if (n_ptr) { + tipc_node_lock(n_ptr); + l_ptr = n_ptr->active_links[selector & 1]; + if (l_ptr) + res = tipc_link_send_buf(l_ptr, buf); + else + buf_discard(buf); + tipc_node_unlock(n_ptr); + } else { + buf_discard(buf); + } + read_unlock_bh(&tipc_net_lock); + return res; +} + +/* + * link_send_buf_fast: Entry for data messages where the + * destination link is known and the header is complete, + * inclusive total message length. Very time critical. + * Link is locked. Returns user data length. + */ + +static int link_send_buf_fast(struct link *l_ptr, struct sk_buff *buf, + u32 *used_max_pkt) +{ + struct tipc_msg *msg = buf_msg(buf); + int res = msg_data_sz(msg); + + if (likely(!link_congested(l_ptr))) { + if (likely(msg_size(msg) <= l_ptr->max_pkt)) { + if (likely(list_empty(&l_ptr->b_ptr->cong_links))) { + link_add_to_outqueue(l_ptr, buf, msg); + if (likely(tipc_bearer_send(l_ptr->b_ptr, buf, + &l_ptr->media_addr))) { + l_ptr->unacked_window = 0; + return res; + } + tipc_bearer_schedule(l_ptr->b_ptr, l_ptr); + l_ptr->stats.bearer_congs++; + l_ptr->next_out = buf; + return res; + } + } else + *used_max_pkt = l_ptr->max_pkt; + } + return tipc_link_send_buf(l_ptr, buf); /* All other cases */ +} + +/* + * tipc_send_buf_fast: Entry for data messages where the + * destination node is known and the header is complete, + * inclusive total message length. + * Returns user data length. 
+ */ +int tipc_send_buf_fast(struct sk_buff *buf, u32 destnode) +{ + struct link *l_ptr; + struct tipc_node *n_ptr; + int res; + u32 selector = msg_origport(buf_msg(buf)) & 1; + u32 dummy; + + if (destnode == tipc_own_addr) + return tipc_port_recv_msg(buf); + + read_lock_bh(&tipc_net_lock); + n_ptr = tipc_node_find(destnode); + if (likely(n_ptr)) { + tipc_node_lock(n_ptr); + l_ptr = n_ptr->active_links[selector]; + if (likely(l_ptr)) { + res = link_send_buf_fast(l_ptr, buf, &dummy); + tipc_node_unlock(n_ptr); + read_unlock_bh(&tipc_net_lock); + return res; + } + tipc_node_unlock(n_ptr); + } + read_unlock_bh(&tipc_net_lock); + res = msg_data_sz(buf_msg(buf)); + tipc_reject_msg(buf, TIPC_ERR_NO_NODE); + return res; +} + + +/* + * tipc_link_send_sections_fast: Entry for messages where the + * destination processor is known and the header is complete, + * except for total message length. + * Returns user data length or errno. + */ +int tipc_link_send_sections_fast(struct tipc_port *sender, + struct iovec const *msg_sect, + const u32 num_sect, + unsigned int total_len, + u32 destaddr) +{ + struct tipc_msg *hdr = &sender->phdr; + struct link *l_ptr; + struct sk_buff *buf; + struct tipc_node *node; + int res; + u32 selector = msg_origport(hdr) & 1; + +again: + /* + * Try building message using port's max_pkt hint. + * (Must not hold any locks while building message.) + */ + + res = tipc_msg_build(hdr, msg_sect, num_sect, total_len, + sender->max_pkt, !sender->user_port, &buf); + + read_lock_bh(&tipc_net_lock); + node = tipc_node_find(destaddr); + if (likely(node)) { + tipc_node_lock(node); + l_ptr = node->active_links[selector]; + if (likely(l_ptr)) { + if (likely(buf)) { + res = link_send_buf_fast(l_ptr, buf, + &sender->max_pkt); +exit: + tipc_node_unlock(node); + read_unlock_bh(&tipc_net_lock); + return res; + } + + /* Exit if build request was invalid */ + + if (unlikely(res < 0)) + goto exit; + + /* Exit if link (or bearer) is congested */ + + if (link_congested(l_ptr) || + !list_empty(&l_ptr->b_ptr->cong_links)) { + res = link_schedule_port(l_ptr, + sender->ref, res); + goto exit; + } + + /* + * Message size exceeds max_pkt hint; update hint, + * then re-try fast path or fragment the message + */ + + sender->max_pkt = l_ptr->max_pkt; + tipc_node_unlock(node); + read_unlock_bh(&tipc_net_lock); + + + if ((msg_hdr_sz(hdr) + res) <= sender->max_pkt) + goto again; + + return link_send_sections_long(sender, msg_sect, + num_sect, total_len, + destaddr); + } + tipc_node_unlock(node); + } + read_unlock_bh(&tipc_net_lock); + + /* Couldn't find a link to the destination node */ + + if (buf) + return tipc_reject_msg(buf, TIPC_ERR_NO_NODE); + if (res >= 0) + return tipc_port_reject_sections(sender, hdr, msg_sect, num_sect, + total_len, TIPC_ERR_NO_NODE); + return res; +} + +/* + * link_send_sections_long(): Entry for long messages where the + * destination node is known and the header is complete, + * inclusive total message length. + * Link and bearer congestion status have been checked to be ok, + * and are ignored if they change. + * + * Note that fragments do not use the full link MTU so that they won't have + * to undergo refragmentation if link changeover causes them to be sent + * over another link with an additional tunnel header added as prefix. + * (Refragmentation will still occur if the other link has a smaller MTU.) + * + * Returns user data length or errno. 
+ */ +static int link_send_sections_long(struct tipc_port *sender, + struct iovec const *msg_sect, + u32 num_sect, + unsigned int total_len, + u32 destaddr) +{ + struct link *l_ptr; + struct tipc_node *node; + struct tipc_msg *hdr = &sender->phdr; + u32 dsz = total_len; + u32 max_pkt, fragm_sz, rest; + struct tipc_msg fragm_hdr; + struct sk_buff *buf, *buf_chain, *prev; + u32 fragm_crs, fragm_rest, hsz, sect_rest; + const unchar *sect_crs; + int curr_sect; + u32 fragm_no; + +again: + fragm_no = 1; + max_pkt = sender->max_pkt - INT_H_SIZE; + /* leave room for tunnel header in case of link changeover */ + fragm_sz = max_pkt - INT_H_SIZE; + /* leave room for fragmentation header in each fragment */ + rest = dsz; + fragm_crs = 0; + fragm_rest = 0; + sect_rest = 0; + sect_crs = NULL; + curr_sect = -1; + + /* Prepare reusable fragment header: */ + + tipc_msg_init(&fragm_hdr, MSG_FRAGMENTER, FIRST_FRAGMENT, + INT_H_SIZE, msg_destnode(hdr)); + msg_set_size(&fragm_hdr, max_pkt); + msg_set_fragm_no(&fragm_hdr, 1); + + /* Prepare header of first fragment: */ + + buf_chain = buf = tipc_buf_acquire(max_pkt); + if (!buf) + return -ENOMEM; + buf->next = NULL; + skb_copy_to_linear_data(buf, &fragm_hdr, INT_H_SIZE); + hsz = msg_hdr_sz(hdr); + skb_copy_to_linear_data_offset(buf, INT_H_SIZE, hdr, hsz); + + /* Chop up message: */ + + fragm_crs = INT_H_SIZE + hsz; + fragm_rest = fragm_sz - hsz; + + do { /* For all sections */ + u32 sz; + + if (!sect_rest) { + sect_rest = msg_sect[++curr_sect].iov_len; + sect_crs = (const unchar *)msg_sect[curr_sect].iov_base; + } + + if (sect_rest < fragm_rest) + sz = sect_rest; + else + sz = fragm_rest; + + if (likely(!sender->user_port)) { + if (copy_from_user(buf->data + fragm_crs, sect_crs, sz)) { +error: + for (; buf_chain; buf_chain = buf) { + buf = buf_chain->next; + buf_discard(buf_chain); + } + return -EFAULT; + } + } else + skb_copy_to_linear_data_offset(buf, fragm_crs, + sect_crs, sz); + sect_crs += sz; + sect_rest -= sz; + fragm_crs += sz; + fragm_rest -= sz; + rest -= sz; + + if (!fragm_rest && rest) { + + /* Initiate new fragment: */ + if (rest <= fragm_sz) { + fragm_sz = rest; + msg_set_type(&fragm_hdr, LAST_FRAGMENT); + } else { + msg_set_type(&fragm_hdr, FRAGMENT); + } + msg_set_size(&fragm_hdr, fragm_sz + INT_H_SIZE); + msg_set_fragm_no(&fragm_hdr, ++fragm_no); + prev = buf; + buf = tipc_buf_acquire(fragm_sz + INT_H_SIZE); + if (!buf) + goto error; + + buf->next = NULL; + prev->next = buf; + skb_copy_to_linear_data(buf, &fragm_hdr, INT_H_SIZE); + fragm_crs = INT_H_SIZE; + fragm_rest = fragm_sz; + } + } while (rest > 0); + + /* + * Now we have a buffer chain. 
Select a link and check + * that packet size is still OK + */ + node = tipc_node_find(destaddr); + if (likely(node)) { + tipc_node_lock(node); + l_ptr = node->active_links[sender->ref & 1]; + if (!l_ptr) { + tipc_node_unlock(node); + goto reject; + } + if (l_ptr->max_pkt < max_pkt) { + sender->max_pkt = l_ptr->max_pkt; + tipc_node_unlock(node); + for (; buf_chain; buf_chain = buf) { + buf = buf_chain->next; + buf_discard(buf_chain); + } + goto again; + } + } else { +reject: + for (; buf_chain; buf_chain = buf) { + buf = buf_chain->next; + buf_discard(buf_chain); + } + return tipc_port_reject_sections(sender, hdr, msg_sect, num_sect, + total_len, TIPC_ERR_NO_NODE); + } + + /* Append chain of fragments to send queue & send them */ + + l_ptr->long_msg_seq_no++; + link_add_chain_to_outqueue(l_ptr, buf_chain, l_ptr->long_msg_seq_no); + l_ptr->stats.sent_fragments += fragm_no; + l_ptr->stats.sent_fragmented++; + tipc_link_push_queue(l_ptr); + tipc_node_unlock(node); + return dsz; +} + +/* + * tipc_link_push_packet: Push one unsent packet to the media + */ +u32 tipc_link_push_packet(struct link *l_ptr) +{ + struct sk_buff *buf = l_ptr->first_out; + u32 r_q_size = l_ptr->retransm_queue_size; + u32 r_q_head = l_ptr->retransm_queue_head; + + /* Step to position where retransmission failed, if any, */ + /* consider that buffers may have been released in meantime */ + + if (r_q_size && buf) { + u32 last = lesser(mod(r_q_head + r_q_size), + link_last_sent(l_ptr)); + u32 first = msg_seqno(buf_msg(buf)); + + while (buf && less(first, r_q_head)) { + first = mod(first + 1); + buf = buf->next; + } + l_ptr->retransm_queue_head = r_q_head = first; + l_ptr->retransm_queue_size = r_q_size = mod(last - first); + } + + /* Continue retransmission now, if there is anything: */ + + if (r_q_size && buf) { + msg_set_ack(buf_msg(buf), mod(l_ptr->next_in_no - 1)); + msg_set_bcast_ack(buf_msg(buf), l_ptr->owner->bclink.last_in); + if (tipc_bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr)) { + l_ptr->retransm_queue_head = mod(++r_q_head); + l_ptr->retransm_queue_size = --r_q_size; + l_ptr->stats.retransmitted++; + return 0; + } else { + l_ptr->stats.bearer_congs++; + return PUSH_FAILED; + } + } + + /* Send deferred protocol message, if any: */ + + buf = l_ptr->proto_msg_queue; + if (buf) { + msg_set_ack(buf_msg(buf), mod(l_ptr->next_in_no - 1)); + msg_set_bcast_ack(buf_msg(buf), l_ptr->owner->bclink.last_in); + if (tipc_bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr)) { + l_ptr->unacked_window = 0; + buf_discard(buf); + l_ptr->proto_msg_queue = NULL; + return 0; + } else { + l_ptr->stats.bearer_congs++; + return PUSH_FAILED; + } + } + + /* Send one deferred data message, if send window not full: */ + + buf = l_ptr->next_out; + if (buf) { + struct tipc_msg *msg = buf_msg(buf); + u32 next = msg_seqno(msg); + u32 first = msg_seqno(buf_msg(l_ptr->first_out)); + + if (mod(next - first) < l_ptr->queue_limit[0]) { + msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); + msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); + if (tipc_bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr)) { + if (msg_user(msg) == MSG_BUNDLER) + msg_set_type(msg, CLOSED_MSG); + l_ptr->next_out = buf->next; + return 0; + } else { + l_ptr->stats.bearer_congs++; + return PUSH_FAILED; + } + } + } + return PUSH_FINISHED; +} + +/* + * push_queue(): push out the unsent messages of a link where + * congestion has abated. 
Node is locked + */ +void tipc_link_push_queue(struct link *l_ptr) +{ + u32 res; + + if (tipc_bearer_congested(l_ptr->b_ptr, l_ptr)) + return; + + do { + res = tipc_link_push_packet(l_ptr); + } while (!res); + + if (res == PUSH_FAILED) + tipc_bearer_schedule(l_ptr->b_ptr, l_ptr); +} + +static void link_reset_all(unsigned long addr) +{ + struct tipc_node *n_ptr; + char addr_string[16]; + u32 i; + + read_lock_bh(&tipc_net_lock); + n_ptr = tipc_node_find((u32)addr); + if (!n_ptr) { + read_unlock_bh(&tipc_net_lock); + return; /* node no longer exists */ + } + + tipc_node_lock(n_ptr); + + warn("Resetting all links to %s\n", + tipc_addr_string_fill(addr_string, n_ptr->addr)); + + for (i = 0; i < MAX_BEARERS; i++) { + if (n_ptr->links[i]) { + link_print(n_ptr->links[i], "Resetting link\n"); + tipc_link_reset(n_ptr->links[i]); + } + } + + tipc_node_unlock(n_ptr); + read_unlock_bh(&tipc_net_lock); +} + +static void link_retransmit_failure(struct link *l_ptr, struct sk_buff *buf) +{ + struct tipc_msg *msg = buf_msg(buf); + + warn("Retransmission failure on link <%s>\n", l_ptr->name); + + if (l_ptr->addr) { + + /* Handle failure on standard link */ + + link_print(l_ptr, "Resetting link\n"); + tipc_link_reset(l_ptr); + + } else { + + /* Handle failure on broadcast link */ + + struct tipc_node *n_ptr; + char addr_string[16]; + + info("Msg seq number: %u, ", msg_seqno(msg)); + info("Outstanding acks: %lu\n", + (unsigned long) TIPC_SKB_CB(buf)->handle); + + n_ptr = tipc_bclink_retransmit_to(); + tipc_node_lock(n_ptr); + + tipc_addr_string_fill(addr_string, n_ptr->addr); + info("Multicast link info for %s\n", addr_string); + info("Supported: %d, ", n_ptr->bclink.supported); + info("Acked: %u\n", n_ptr->bclink.acked); + info("Last in: %u, ", n_ptr->bclink.last_in); + info("Gap after: %u, ", n_ptr->bclink.gap_after); + info("Gap to: %u\n", n_ptr->bclink.gap_to); + info("Nack sync: %u\n\n", n_ptr->bclink.nack_sync); + + tipc_k_signal((Handler)link_reset_all, (unsigned long)n_ptr->addr); + + tipc_node_unlock(n_ptr); + + l_ptr->stale_count = 0; + } +} + +void tipc_link_retransmit(struct link *l_ptr, struct sk_buff *buf, + u32 retransmits) +{ + struct tipc_msg *msg; + + if (!buf) + return; + + msg = buf_msg(buf); + + if (tipc_bearer_congested(l_ptr->b_ptr, l_ptr)) { + if (l_ptr->retransm_queue_size == 0) { + l_ptr->retransm_queue_head = msg_seqno(msg); + l_ptr->retransm_queue_size = retransmits; + } else { + err("Unexpected retransmit on link %s (qsize=%d)\n", + l_ptr->name, l_ptr->retransm_queue_size); + } + return; + } else { + /* Detect repeated retransmit failures on uncongested bearer */ + + if (l_ptr->last_retransmitted == msg_seqno(msg)) { + if (++l_ptr->stale_count > 100) { + link_retransmit_failure(l_ptr, buf); + return; + } + } else { + l_ptr->last_retransmitted = msg_seqno(msg); + l_ptr->stale_count = 1; + } + } + + while (retransmits && (buf != l_ptr->next_out) && buf) { + msg = buf_msg(buf); + msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); + msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); + if (tipc_bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr)) { + buf = buf->next; + retransmits--; + l_ptr->stats.retransmitted++; + } else { + tipc_bearer_schedule(l_ptr->b_ptr, l_ptr); + l_ptr->stats.bearer_congs++; + l_ptr->retransm_queue_head = msg_seqno(buf_msg(buf)); + l_ptr->retransm_queue_size = retransmits; + return; + } + } + + l_ptr->retransm_queue_head = l_ptr->retransm_queue_size = 0; +} + +/** + * link_insert_deferred_queue - insert deferred messages back into receive chain + */ + +static 
struct sk_buff *link_insert_deferred_queue(struct link *l_ptr, + struct sk_buff *buf) +{ + u32 seq_no; + + if (l_ptr->oldest_deferred_in == NULL) + return buf; + + seq_no = msg_seqno(buf_msg(l_ptr->oldest_deferred_in)); + if (seq_no == mod(l_ptr->next_in_no)) { + l_ptr->newest_deferred_in->next = buf; + buf = l_ptr->oldest_deferred_in; + l_ptr->oldest_deferred_in = NULL; + l_ptr->deferred_inqueue_sz = 0; + } + return buf; +} + +/** + * link_recv_buf_validate - validate basic format of received message + * + * This routine ensures a TIPC message has an acceptable header, and at least + * as much data as the header indicates it should. The routine also ensures + * that the entire message header is stored in the main fragment of the message + * buffer, to simplify future access to message header fields. + * + * Note: Having extra info present in the message header or data areas is OK. + * TIPC will ignore the excess, under the assumption that it is optional info + * introduced by a later release of the protocol. + */ + +static int link_recv_buf_validate(struct sk_buff *buf) +{ + static u32 min_data_hdr_size[8] = { + SHORT_H_SIZE, MCAST_H_SIZE, LONG_H_SIZE, DIR_MSG_H_SIZE, + MAX_H_SIZE, MAX_H_SIZE, MAX_H_SIZE, MAX_H_SIZE + }; + + struct tipc_msg *msg; + u32 tipc_hdr[2]; + u32 size; + u32 hdr_size; + u32 min_hdr_size; + + if (unlikely(buf->len < MIN_H_SIZE)) + return 0; + + msg = skb_header_pointer(buf, 0, sizeof(tipc_hdr), tipc_hdr); + if (msg == NULL) + return 0; + + if (unlikely(msg_version(msg) != TIPC_VERSION)) + return 0; + + size = msg_size(msg); + hdr_size = msg_hdr_sz(msg); + min_hdr_size = msg_isdata(msg) ? + min_data_hdr_size[msg_type(msg)] : INT_H_SIZE; + + if (unlikely((hdr_size < min_hdr_size) || + (size < hdr_size) || + (buf->len < size) || + (size - hdr_size > TIPC_MAX_USER_MSG_SIZE))) + return 0; + + return pskb_may_pull(buf, hdr_size); +} + +/** + * tipc_recv_msg - process TIPC messages arriving from off-node + * @head: pointer to message buffer chain + * @tb_ptr: pointer to bearer message arrived on + * + * Invoked with no locks held. Bearer pointer must point to a valid bearer + * structure (i.e. cannot be NULL), but bearer can be inactive. 
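+ * + * Editorial summary (added in review, not part of the original comment): each buffer in the chain is validated and linearized first, non-sequenced traffic (neighbor discovery and broadcast) is diverted early, and for the remainder the sending node and matching link are looked up so that acknowledged packets can be released, pending output pushed, and the payload either delivered in sequence or placed in the deferred-reception queue.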
+ */ + +void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *b_ptr) +{ + read_lock_bh(&tipc_net_lock); + while (head) { + struct tipc_node *n_ptr; + struct link *l_ptr; + struct sk_buff *crs; + struct sk_buff *buf = head; + struct tipc_msg *msg; + u32 seq_no; + u32 ackd; + u32 released = 0; + int type; + + head = head->next; + + /* Ensure bearer is still enabled */ + + if (unlikely(!b_ptr->active)) + goto cont; + + /* Ensure message is well-formed */ + + if (unlikely(!link_recv_buf_validate(buf))) + goto cont; + + /* Ensure message data is a single contiguous unit */ + + if (unlikely(buf_linearize(buf))) + goto cont; + + /* Handle arrival of a non-unicast link message */ + + msg = buf_msg(buf); + + if (unlikely(msg_non_seq(msg))) { + if (msg_user(msg) == LINK_CONFIG) + tipc_disc_recv_msg(buf, b_ptr); + else + tipc_bclink_recv_pkt(buf); + continue; + } + + if (unlikely(!msg_short(msg) && + (msg_destnode(msg) != tipc_own_addr))) + goto cont; + + /* Discard non-routeable messages destined for another node */ + + if (unlikely(!msg_isdata(msg) && + (msg_destnode(msg) != tipc_own_addr))) { + if ((msg_user(msg) != CONN_MANAGER) && + (msg_user(msg) != MSG_FRAGMENTER)) + goto cont; + } + + /* Locate neighboring node that sent message */ + + n_ptr = tipc_node_find(msg_prevnode(msg)); + if (unlikely(!n_ptr)) + goto cont; + tipc_node_lock(n_ptr); + + /* Don't talk to neighbor during cleanup after last session */ + + if (n_ptr->cleanup_required) { + tipc_node_unlock(n_ptr); + goto cont; + } + + /* Locate unicast link endpoint that should handle message */ + + l_ptr = n_ptr->links[b_ptr->identity]; + if (unlikely(!l_ptr)) { + tipc_node_unlock(n_ptr); + goto cont; + } + + /* Validate message sequence number info */ + + seq_no = msg_seqno(msg); + ackd = msg_ack(msg); + + /* Release acked messages */ + + if (less(n_ptr->bclink.acked, msg_bcast_ack(msg))) { + if (tipc_node_is_up(n_ptr) && n_ptr->bclink.supported) + tipc_bclink_acknowledge(n_ptr, msg_bcast_ack(msg)); + } + + crs = l_ptr->first_out; + while ((crs != l_ptr->next_out) && + less_eq(msg_seqno(buf_msg(crs)), ackd)) { + struct sk_buff *next = crs->next; + + buf_discard(crs); + crs = next; + released++; + } + if (released) { + l_ptr->first_out = crs; + l_ptr->out_queue_size -= released; + } + + /* Try sending any messages link endpoint has pending */ + + if (unlikely(l_ptr->next_out)) + tipc_link_push_queue(l_ptr); + if (unlikely(!list_empty(&l_ptr->waiting_ports))) + tipc_link_wakeup_ports(l_ptr, 0); + if (unlikely(++l_ptr->unacked_window >= TIPC_MIN_LINK_WIN)) { + l_ptr->stats.sent_acks++; + tipc_link_send_proto_msg(l_ptr, STATE_MSG, 0, 0, 0, 0, 0); + } + + /* Now (finally!) 
process the incoming message */ + +protocol_check: + if (likely(link_working_working(l_ptr))) { + if (likely(seq_no == mod(l_ptr->next_in_no))) { + l_ptr->next_in_no++; + if (unlikely(l_ptr->oldest_deferred_in)) + head = link_insert_deferred_queue(l_ptr, + head); + if (likely(msg_is_dest(msg, tipc_own_addr))) { +deliver: + if (likely(msg_isdata(msg))) { + tipc_node_unlock(n_ptr); + tipc_port_recv_msg(buf); + continue; + } + switch (msg_user(msg)) { + case MSG_BUNDLER: + l_ptr->stats.recv_bundles++; + l_ptr->stats.recv_bundled += + msg_msgcnt(msg); + tipc_node_unlock(n_ptr); + tipc_link_recv_bundle(buf); + continue; + case NAME_DISTRIBUTOR: + tipc_node_unlock(n_ptr); + tipc_named_recv(buf); + continue; + case CONN_MANAGER: + tipc_node_unlock(n_ptr); + tipc_port_recv_proto_msg(buf); + continue; + case MSG_FRAGMENTER: + l_ptr->stats.recv_fragments++; + if (tipc_link_recv_fragment(&l_ptr->defragm_buf, + &buf, &msg)) { + l_ptr->stats.recv_fragmented++; + goto deliver; + } + break; + case CHANGEOVER_PROTOCOL: + type = msg_type(msg); + if (link_recv_changeover_msg(&l_ptr, &buf)) { + msg = buf_msg(buf); + seq_no = msg_seqno(msg); + if (type == ORIGINAL_MSG) + goto deliver; + goto protocol_check; + } + break; + default: + buf_discard(buf); + buf = NULL; + break; + } + } + tipc_node_unlock(n_ptr); + tipc_net_route_msg(buf); + continue; + } + link_handle_out_of_seq_msg(l_ptr, buf); + head = link_insert_deferred_queue(l_ptr, head); + tipc_node_unlock(n_ptr); + continue; + } + + if (msg_user(msg) == LINK_PROTOCOL) { + link_recv_proto_msg(l_ptr, buf); + head = link_insert_deferred_queue(l_ptr, head); + tipc_node_unlock(n_ptr); + continue; + } + link_state_event(l_ptr, TRAFFIC_MSG_EVT); + + if (link_working_working(l_ptr)) { + /* Re-insert in front of queue */ + buf->next = head; + head = buf; + tipc_node_unlock(n_ptr); + continue; + } + tipc_node_unlock(n_ptr); +cont: + buf_discard(buf); + } + read_unlock_bh(&tipc_net_lock); +} + +/* + * link_defer_buf(): Sort a received out-of-sequence packet + * into the deferred reception queue. + * Returns the increase of the queue length,i.e. 0 or 1 + */ + +u32 tipc_link_defer_pkt(struct sk_buff **head, + struct sk_buff **tail, + struct sk_buff *buf) +{ + struct sk_buff *prev = NULL; + struct sk_buff *crs = *head; + u32 seq_no = msg_seqno(buf_msg(buf)); + + buf->next = NULL; + + /* Empty queue ? */ + if (*head == NULL) { + *head = *tail = buf; + return 1; + } + + /* Last ? 
*/ + if (less(msg_seqno(buf_msg(*tail)), seq_no)) { + (*tail)->next = buf; + *tail = buf; + return 1; + } + + /* Scan through queue and sort it in */ + do { + struct tipc_msg *msg = buf_msg(crs); + + if (less(seq_no, msg_seqno(msg))) { + buf->next = crs; + if (prev) + prev->next = buf; + else + *head = buf; + return 1; + } + if (seq_no == msg_seqno(msg)) + break; + prev = crs; + crs = crs->next; + } while (crs); + + /* Message is a duplicate of an existing message */ + + buf_discard(buf); + return 0; +} + +/** + * link_handle_out_of_seq_msg - handle arrival of out-of-sequence packet + */ + +static void link_handle_out_of_seq_msg(struct link *l_ptr, + struct sk_buff *buf) +{ + u32 seq_no = msg_seqno(buf_msg(buf)); + + if (likely(msg_user(buf_msg(buf)) == LINK_PROTOCOL)) { + link_recv_proto_msg(l_ptr, buf); + return; + } + + /* Record OOS packet arrival (force mismatch on next timeout) */ + + l_ptr->checkpoint--; + + /* + * Discard packet if a duplicate; otherwise add it to deferred queue + * and notify peer of gap as per protocol specification + */ + + if (less(seq_no, mod(l_ptr->next_in_no))) { + l_ptr->stats.duplicates++; + buf_discard(buf); + return; + } + + if (tipc_link_defer_pkt(&l_ptr->oldest_deferred_in, + &l_ptr->newest_deferred_in, buf)) { + l_ptr->deferred_inqueue_sz++; + l_ptr->stats.deferred_recv++; + if ((l_ptr->deferred_inqueue_sz % 16) == 1) + tipc_link_send_proto_msg(l_ptr, STATE_MSG, 0, 0, 0, 0, 0); + } else + l_ptr->stats.duplicates++; +} + +/* + * Send protocol message to the other endpoint. + */ +void tipc_link_send_proto_msg(struct link *l_ptr, u32 msg_typ, int probe_msg, + u32 gap, u32 tolerance, u32 priority, u32 ack_mtu) +{ + struct sk_buff *buf = NULL; + struct tipc_msg *msg = l_ptr->pmsg; + u32 msg_size = sizeof(l_ptr->proto_msg); + int r_flag; + + if (link_blocked(l_ptr)) + return; + msg_set_type(msg, msg_typ); + msg_set_net_plane(msg, l_ptr->b_ptr->net_plane); + msg_set_bcast_ack(msg, mod(l_ptr->owner->bclink.last_in)); + msg_set_last_bcast(msg, tipc_bclink_get_last_sent()); + + if (msg_typ == STATE_MSG) { + u32 next_sent = mod(l_ptr->next_out_no); + + if (!tipc_link_is_up(l_ptr)) + return; + if (l_ptr->next_out) + next_sent = msg_seqno(buf_msg(l_ptr->next_out)); + msg_set_next_sent(msg, next_sent); + if (l_ptr->oldest_deferred_in) { + u32 rec = msg_seqno(buf_msg(l_ptr->oldest_deferred_in)); + gap = mod(rec - mod(l_ptr->next_in_no)); + } + msg_set_seq_gap(msg, gap); + if (gap) + l_ptr->stats.sent_nacks++; + msg_set_link_tolerance(msg, tolerance); + msg_set_linkprio(msg, priority); + msg_set_max_pkt(msg, ack_mtu); + msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); + msg_set_probe(msg, probe_msg != 0); + if (probe_msg) { + u32 mtu = l_ptr->max_pkt; + + if ((mtu < l_ptr->max_pkt_target) && + link_working_working(l_ptr) && + l_ptr->fsm_msg_cnt) { + msg_size = (mtu + (l_ptr->max_pkt_target - mtu)/2 + 2) & ~3; + if (l_ptr->max_pkt_probes == 10) { + l_ptr->max_pkt_target = (msg_size - 4); + l_ptr->max_pkt_probes = 0; + msg_size = (mtu + (l_ptr->max_pkt_target - mtu)/2 + 2) & ~3; + } + l_ptr->max_pkt_probes++; + } + + l_ptr->stats.sent_probes++; + } + l_ptr->stats.sent_states++; + } else { /* RESET_MSG or ACTIVATE_MSG */ + msg_set_ack(msg, mod(l_ptr->reset_checkpoint - 1)); + msg_set_seq_gap(msg, 0); + msg_set_next_sent(msg, 1); + msg_set_probe(msg, 0); + msg_set_link_tolerance(msg, l_ptr->tolerance); + msg_set_linkprio(msg, l_ptr->priority); + msg_set_max_pkt(msg, l_ptr->max_pkt_target); + } + + r_flag = (l_ptr->owner->working_links > tipc_link_is_up(l_ptr)); + 
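/* Editorial note (not in the original patch): r_flag tells the peer whether this node has at least one working link other than this one; the receiving side stores it as 'permit_changeover' and later uses it to decide whether tunnelling traffic over a remaining link is allowed. */ +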
msg_set_redundant_link(msg, r_flag); + msg_set_linkprio(msg, l_ptr->priority); + + /* Ensure sequence number will not fit : */ + + msg_set_seqno(msg, mod(l_ptr->next_out_no + (0xffff/2))); + + /* Congestion? */ + + if (tipc_bearer_congested(l_ptr->b_ptr, l_ptr)) { + if (!l_ptr->proto_msg_queue) { + l_ptr->proto_msg_queue = + tipc_buf_acquire(sizeof(l_ptr->proto_msg)); + } + buf = l_ptr->proto_msg_queue; + if (!buf) + return; + skb_copy_to_linear_data(buf, msg, sizeof(l_ptr->proto_msg)); + return; + } + + /* Message can be sent */ + + buf = tipc_buf_acquire(msg_size); + if (!buf) + return; + + skb_copy_to_linear_data(buf, msg, sizeof(l_ptr->proto_msg)); + msg_set_size(buf_msg(buf), msg_size); + + if (tipc_bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr)) { + l_ptr->unacked_window = 0; + buf_discard(buf); + return; + } + + /* New congestion */ + tipc_bearer_schedule(l_ptr->b_ptr, l_ptr); + l_ptr->proto_msg_queue = buf; + l_ptr->stats.bearer_congs++; +} + +/* + * Receive protocol message : + * Note that network plane id propagates through the network, and may + * change at any time. The node with lowest address rules + */ + +static void link_recv_proto_msg(struct link *l_ptr, struct sk_buff *buf) +{ + u32 rec_gap = 0; + u32 max_pkt_info; + u32 max_pkt_ack; + u32 msg_tol; + struct tipc_msg *msg = buf_msg(buf); + + if (link_blocked(l_ptr)) + goto exit; + + /* record unnumbered packet arrival (force mismatch on next timeout) */ + + l_ptr->checkpoint--; + + if (l_ptr->b_ptr->net_plane != msg_net_plane(msg)) + if (tipc_own_addr > msg_prevnode(msg)) + l_ptr->b_ptr->net_plane = msg_net_plane(msg); + + l_ptr->owner->permit_changeover = msg_redundant_link(msg); + + switch (msg_type(msg)) { + + case RESET_MSG: + if (!link_working_unknown(l_ptr) && + (l_ptr->peer_session != INVALID_SESSION)) { + if (msg_session(msg) == l_ptr->peer_session) + break; /* duplicate: ignore */ + } + /* fall thru' */ + case ACTIVATE_MSG: + /* Update link settings according other endpoint's values */ + + strcpy((strrchr(l_ptr->name, ':') + 1), (char *)msg_data(msg)); + + msg_tol = msg_link_tolerance(msg); + if (msg_tol > l_ptr->tolerance) + link_set_supervision_props(l_ptr, msg_tol); + + if (msg_linkprio(msg) > l_ptr->priority) + l_ptr->priority = msg_linkprio(msg); + + max_pkt_info = msg_max_pkt(msg); + if (max_pkt_info) { + if (max_pkt_info < l_ptr->max_pkt_target) + l_ptr->max_pkt_target = max_pkt_info; + if (l_ptr->max_pkt > l_ptr->max_pkt_target) + l_ptr->max_pkt = l_ptr->max_pkt_target; + } else { + l_ptr->max_pkt = l_ptr->max_pkt_target; + } + l_ptr->owner->bclink.supported = (max_pkt_info != 0); + + link_state_event(l_ptr, msg_type(msg)); + + l_ptr->peer_session = msg_session(msg); + l_ptr->peer_bearer_id = msg_bearer_id(msg); + + /* Synchronize broadcast sequence numbers */ + if (!tipc_node_redundant_links(l_ptr->owner)) + l_ptr->owner->bclink.last_in = mod(msg_last_bcast(msg)); + break; + case STATE_MSG: + + msg_tol = msg_link_tolerance(msg); + if (msg_tol) + link_set_supervision_props(l_ptr, msg_tol); + + if (msg_linkprio(msg) && + (msg_linkprio(msg) != l_ptr->priority)) { + warn("Resetting link <%s>, priority change %u->%u\n", + l_ptr->name, l_ptr->priority, msg_linkprio(msg)); + l_ptr->priority = msg_linkprio(msg); + tipc_link_reset(l_ptr); /* Enforce change to take effect */ + break; + } + link_state_event(l_ptr, TRAFFIC_MSG_EVT); + l_ptr->stats.recv_states++; + if (link_reset_unknown(l_ptr)) + break; + + if (less_eq(mod(l_ptr->next_in_no), msg_next_sent(msg))) { + rec_gap = mod(msg_next_sent(msg) - + 
mod(l_ptr->next_in_no)); + } + + max_pkt_ack = msg_max_pkt(msg); + if (max_pkt_ack > l_ptr->max_pkt) { + l_ptr->max_pkt = max_pkt_ack; + l_ptr->max_pkt_probes = 0; + } + + max_pkt_ack = 0; + if (msg_probe(msg)) { + l_ptr->stats.recv_probes++; + if (msg_size(msg) > sizeof(l_ptr->proto_msg)) + max_pkt_ack = msg_size(msg); + } + + /* Protocol message before retransmits, reduce loss risk */ + + tipc_bclink_check_gap(l_ptr->owner, msg_last_bcast(msg)); + + if (rec_gap || (msg_probe(msg))) { + tipc_link_send_proto_msg(l_ptr, STATE_MSG, + 0, rec_gap, 0, 0, max_pkt_ack); + } + if (msg_seq_gap(msg)) { + l_ptr->stats.recv_nacks++; + tipc_link_retransmit(l_ptr, l_ptr->first_out, + msg_seq_gap(msg)); + } + break; + } +exit: + buf_discard(buf); +} + + +/* + * tipc_link_tunnel(): Send one message via a link belonging to + * another bearer. Owner node is locked. + */ +static void tipc_link_tunnel(struct link *l_ptr, + struct tipc_msg *tunnel_hdr, + struct tipc_msg *msg, + u32 selector) +{ + struct link *tunnel; + struct sk_buff *buf; + u32 length = msg_size(msg); + + tunnel = l_ptr->owner->active_links[selector & 1]; + if (!tipc_link_is_up(tunnel)) { + warn("Link changeover error, " + "tunnel link no longer available\n"); + return; + } + msg_set_size(tunnel_hdr, length + INT_H_SIZE); + buf = tipc_buf_acquire(length + INT_H_SIZE); + if (!buf) { + warn("Link changeover error, " + "unable to send tunnel msg\n"); + return; + } + skb_copy_to_linear_data(buf, tunnel_hdr, INT_H_SIZE); + skb_copy_to_linear_data_offset(buf, INT_H_SIZE, msg, length); + tipc_link_send_buf(tunnel, buf); +} + + + +/* + * changeover(): Send whole message queue via the remaining link + * Owner node is locked. + */ + +void tipc_link_changeover(struct link *l_ptr) +{ + u32 msgcount = l_ptr->out_queue_size; + struct sk_buff *crs = l_ptr->first_out; + struct link *tunnel = l_ptr->owner->active_links[0]; + struct tipc_msg tunnel_hdr; + int split_bundles; + + if (!tunnel) + return; + + if (!l_ptr->owner->permit_changeover) { + warn("Link changeover error, " + "peer did not permit changeover\n"); + return; + } + + tipc_msg_init(&tunnel_hdr, CHANGEOVER_PROTOCOL, + ORIGINAL_MSG, INT_H_SIZE, l_ptr->addr); + msg_set_bearer_id(&tunnel_hdr, l_ptr->peer_bearer_id); + msg_set_msgcnt(&tunnel_hdr, msgcount); + + if (!l_ptr->first_out) { + struct sk_buff *buf; + + buf = tipc_buf_acquire(INT_H_SIZE); + if (buf) { + skb_copy_to_linear_data(buf, &tunnel_hdr, INT_H_SIZE); + msg_set_size(&tunnel_hdr, INT_H_SIZE); + tipc_link_send_buf(tunnel, buf); + } else { + warn("Link changeover error, " + "unable to send changeover msg\n"); + } + return; + } + + split_bundles = (l_ptr->owner->active_links[0] != + l_ptr->owner->active_links[1]); + + while (crs) { + struct tipc_msg *msg = buf_msg(crs); + + if ((msg_user(msg) == MSG_BUNDLER) && split_bundles) { + struct tipc_msg *m = msg_get_wrapped(msg); + unchar *pos = (unchar *)m; + + msgcount = msg_msgcnt(msg); + while (msgcount--) { + msg_set_seqno(m, msg_seqno(msg)); + tipc_link_tunnel(l_ptr, &tunnel_hdr, m, + msg_link_selector(m)); + pos += align(msg_size(m)); + m = (struct tipc_msg *)pos; + } + } else { + tipc_link_tunnel(l_ptr, &tunnel_hdr, msg, + msg_link_selector(msg)); + } + crs = crs->next; + } +} + +void tipc_link_send_duplicate(struct link *l_ptr, struct link *tunnel) +{ + struct sk_buff *iter; + struct tipc_msg tunnel_hdr; + + tipc_msg_init(&tunnel_hdr, CHANGEOVER_PROTOCOL, + DUPLICATE_MSG, INT_H_SIZE, l_ptr->addr); + msg_set_msgcnt(&tunnel_hdr, l_ptr->out_queue_size); + msg_set_bearer_id(&tunnel_hdr, 
l_ptr->peer_bearer_id); + iter = l_ptr->first_out; + while (iter) { + struct sk_buff *outbuf; + struct tipc_msg *msg = buf_msg(iter); + u32 length = msg_size(msg); + + if (msg_user(msg) == MSG_BUNDLER) + msg_set_type(msg, CLOSED_MSG); + msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); /* Update */ + msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); + msg_set_size(&tunnel_hdr, length + INT_H_SIZE); + outbuf = tipc_buf_acquire(length + INT_H_SIZE); + if (outbuf == NULL) { + warn("Link changeover error, " + "unable to send duplicate msg\n"); + return; + } + skb_copy_to_linear_data(outbuf, &tunnel_hdr, INT_H_SIZE); + skb_copy_to_linear_data_offset(outbuf, INT_H_SIZE, iter->data, + length); + tipc_link_send_buf(tunnel, outbuf); + if (!tipc_link_is_up(l_ptr)) + return; + iter = iter->next; + } +} + + + +/** + * buf_extract - extracts embedded TIPC message from another message + * @skb: encapsulating message buffer + * @from_pos: offset to extract from + * + * Returns a new message buffer containing an embedded message. The + * encapsulating message itself is left unchanged. + */ + +static struct sk_buff *buf_extract(struct sk_buff *skb, u32 from_pos) +{ + struct tipc_msg *msg = (struct tipc_msg *)(skb->data + from_pos); + u32 size = msg_size(msg); + struct sk_buff *eb; + + eb = tipc_buf_acquire(size); + if (eb) + skb_copy_to_linear_data(eb, msg, size); + return eb; +} + +/* + * link_recv_changeover_msg(): Receive tunneled packet sent + * via other link. Node is locked. Return extracted buffer. + */ + +static int link_recv_changeover_msg(struct link **l_ptr, + struct sk_buff **buf) +{ + struct sk_buff *tunnel_buf = *buf; + struct link *dest_link; + struct tipc_msg *msg; + struct tipc_msg *tunnel_msg = buf_msg(tunnel_buf); + u32 msg_typ = msg_type(tunnel_msg); + u32 msg_count = msg_msgcnt(tunnel_msg); + + dest_link = (*l_ptr)->owner->links[msg_bearer_id(tunnel_msg)]; + if (!dest_link) + goto exit; + if (dest_link == *l_ptr) { + err("Unexpected changeover message on link <%s>\n", + (*l_ptr)->name); + goto exit; + } + *l_ptr = dest_link; + msg = msg_get_wrapped(tunnel_msg); + + if (msg_typ == DUPLICATE_MSG) { + if (less(msg_seqno(msg), mod(dest_link->next_in_no))) + goto exit; + *buf = buf_extract(tunnel_buf, INT_H_SIZE); + if (*buf == NULL) { + warn("Link changeover error, duplicate msg dropped\n"); + goto exit; + } + buf_discard(tunnel_buf); + return 1; + } + + /* First original message ?: */ + + if (tipc_link_is_up(dest_link)) { + info("Resetting link <%s>, changeover initiated by peer\n", + dest_link->name); + tipc_link_reset(dest_link); + dest_link->exp_msg_count = msg_count; + if (!msg_count) + goto exit; + } else if (dest_link->exp_msg_count == START_CHANGEOVER) { + dest_link->exp_msg_count = msg_count; + if (!msg_count) + goto exit; + } + + /* Receive original message */ + + if (dest_link->exp_msg_count == 0) { + warn("Link switchover error, " + "got too many tunnelled messages\n"); + goto exit; + } + dest_link->exp_msg_count--; + if (less(msg_seqno(msg), dest_link->reset_checkpoint)) { + goto exit; + } else { + *buf = buf_extract(tunnel_buf, INT_H_SIZE); + if (*buf != NULL) { + buf_discard(tunnel_buf); + return 1; + } else { + warn("Link changeover error, original msg dropped\n"); + } + } +exit: + *buf = NULL; + buf_discard(tunnel_buf); + return 0; +} + +/* + * Bundler functionality: + */ +void tipc_link_recv_bundle(struct sk_buff *buf) +{ + u32 msgcount = msg_msgcnt(buf_msg(buf)); + u32 pos = INT_H_SIZE; + struct sk_buff *obuf; + + while (msgcount--) { + obuf = buf_extract(buf, pos); + if 
(obuf == NULL) { + warn("Link unable to unbundle message(s)\n"); + break; + } + pos += align(msg_size(buf_msg(obuf))); + tipc_net_route_msg(obuf); + } + buf_discard(buf); +} + +/* + * Fragmentation/defragmentation: + */ + + +/* + * link_send_long_buf: Entry for buffers needing fragmentation. + * The buffer is complete, inclusive total message length. + * Returns user data length. + */ +static int link_send_long_buf(struct link *l_ptr, struct sk_buff *buf) +{ + struct sk_buff *buf_chain = NULL; + struct sk_buff *buf_chain_tail = (struct sk_buff *)&buf_chain; + struct tipc_msg *inmsg = buf_msg(buf); + struct tipc_msg fragm_hdr; + u32 insize = msg_size(inmsg); + u32 dsz = msg_data_sz(inmsg); + unchar *crs = buf->data; + u32 rest = insize; + u32 pack_sz = l_ptr->max_pkt; + u32 fragm_sz = pack_sz - INT_H_SIZE; + u32 fragm_no = 0; + u32 destaddr; + + if (msg_short(inmsg)) + destaddr = l_ptr->addr; + else + destaddr = msg_destnode(inmsg); + + /* Prepare reusable fragment header: */ + + tipc_msg_init(&fragm_hdr, MSG_FRAGMENTER, FIRST_FRAGMENT, + INT_H_SIZE, destaddr); + + /* Chop up message: */ + + while (rest > 0) { + struct sk_buff *fragm; + + if (rest <= fragm_sz) { + fragm_sz = rest; + msg_set_type(&fragm_hdr, LAST_FRAGMENT); + } + fragm = tipc_buf_acquire(fragm_sz + INT_H_SIZE); + if (fragm == NULL) { + buf_discard(buf); + while (buf_chain) { + buf = buf_chain; + buf_chain = buf_chain->next; + buf_discard(buf); + } + return -ENOMEM; + } + msg_set_size(&fragm_hdr, fragm_sz + INT_H_SIZE); + fragm_no++; + msg_set_fragm_no(&fragm_hdr, fragm_no); + skb_copy_to_linear_data(fragm, &fragm_hdr, INT_H_SIZE); + skb_copy_to_linear_data_offset(fragm, INT_H_SIZE, crs, + fragm_sz); + buf_chain_tail->next = fragm; + buf_chain_tail = fragm; + + rest -= fragm_sz; + crs += fragm_sz; + msg_set_type(&fragm_hdr, FRAGMENT); + } + buf_discard(buf); + + /* Append chain of fragments to send queue & send them */ + + l_ptr->long_msg_seq_no++; + link_add_chain_to_outqueue(l_ptr, buf_chain, l_ptr->long_msg_seq_no); + l_ptr->stats.sent_fragments += fragm_no; + l_ptr->stats.sent_fragmented++; + tipc_link_push_queue(l_ptr); + + return dsz; +} + +/* + * A pending message being re-assembled must store certain values + * to handle subsequent fragments correctly. The following functions + * help storing these values in unused, available fields in the + * pending message. This makes dynamic memory allocation unnecessary. + */ + +static void set_long_msg_seqno(struct sk_buff *buf, u32 seqno) +{ + msg_set_seqno(buf_msg(buf), seqno); +} + +static u32 get_fragm_size(struct sk_buff *buf) +{ + return msg_ack(buf_msg(buf)); +} + +static void set_fragm_size(struct sk_buff *buf, u32 sz) +{ + msg_set_ack(buf_msg(buf), sz); +} + +static u32 get_expected_frags(struct sk_buff *buf) +{ + return msg_bcast_ack(buf_msg(buf)); +} + +static void set_expected_frags(struct sk_buff *buf, u32 exp) +{ + msg_set_bcast_ack(buf_msg(buf), exp); +} + +static u32 get_timer_cnt(struct sk_buff *buf) +{ + return msg_reroute_cnt(buf_msg(buf)); +} + +static void incr_timer_cnt(struct sk_buff *buf) +{ + msg_incr_reroute_cnt(buf_msg(buf)); +} + +/* + * tipc_link_recv_fragment(): Called with node lock on. Returns + * the reassembled buffer if message is complete. 
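+ * + * Editorial note (added in review): the expected fragment count computed below is a plain ceiling division, exp_fragm_cnt = msg_sz/fragm_sz + !!(msg_sz % fragm_sz); with purely illustrative numbers, a 3000-byte embedded message arriving in 1400-byte data fragments gives 2 + 1 = 3 expected fragments, and the count stored in the pending buffer is decremented as each subsequent fragment is copied in.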
+ */ +int tipc_link_recv_fragment(struct sk_buff **pending, struct sk_buff **fb, + struct tipc_msg **m) +{ + struct sk_buff *prev = NULL; + struct sk_buff *fbuf = *fb; + struct tipc_msg *fragm = buf_msg(fbuf); + struct sk_buff *pbuf = *pending; + u32 long_msg_seq_no = msg_long_msgno(fragm); + + *fb = NULL; + + /* Is there an incomplete message waiting for this fragment? */ + + while (pbuf && ((msg_seqno(buf_msg(pbuf)) != long_msg_seq_no) || + (msg_orignode(fragm) != msg_orignode(buf_msg(pbuf))))) { + prev = pbuf; + pbuf = pbuf->next; + } + + if (!pbuf && (msg_type(fragm) == FIRST_FRAGMENT)) { + struct tipc_msg *imsg = (struct tipc_msg *)msg_data(fragm); + u32 msg_sz = msg_size(imsg); + u32 fragm_sz = msg_data_sz(fragm); + u32 exp_fragm_cnt = msg_sz/fragm_sz + !!(msg_sz % fragm_sz); + u32 max = TIPC_MAX_USER_MSG_SIZE + LONG_H_SIZE; + if (msg_type(imsg) == TIPC_MCAST_MSG) + max = TIPC_MAX_USER_MSG_SIZE + MCAST_H_SIZE; + if (msg_size(imsg) > max) { + buf_discard(fbuf); + return 0; + } + pbuf = tipc_buf_acquire(msg_size(imsg)); + if (pbuf != NULL) { + pbuf->next = *pending; + *pending = pbuf; + skb_copy_to_linear_data(pbuf, imsg, + msg_data_sz(fragm)); + /* Prepare buffer for subsequent fragments. */ + + set_long_msg_seqno(pbuf, long_msg_seq_no); + set_fragm_size(pbuf, fragm_sz); + set_expected_frags(pbuf, exp_fragm_cnt - 1); + } else { + warn("Link unable to reassemble fragmented message\n"); + } + buf_discard(fbuf); + return 0; + } else if (pbuf && (msg_type(fragm) != FIRST_FRAGMENT)) { + u32 dsz = msg_data_sz(fragm); + u32 fsz = get_fragm_size(pbuf); + u32 crs = ((msg_fragm_no(fragm) - 1) * fsz); + u32 exp_frags = get_expected_frags(pbuf) - 1; + skb_copy_to_linear_data_offset(pbuf, crs, + msg_data(fragm), dsz); + buf_discard(fbuf); + + /* Is message complete? */ + + if (exp_frags == 0) { + if (prev) + prev->next = pbuf->next; + else + *pending = pbuf->next; + msg_reset_reroute_cnt(buf_msg(pbuf)); + *fb = pbuf; + *m = buf_msg(pbuf); + return 1; + } + set_expected_frags(pbuf, exp_frags); + return 0; + } + buf_discard(fbuf); + return 0; +} + +/** + * link_check_defragm_bufs - flush stale incoming message fragments + * @l_ptr: pointer to link + */ + +static void link_check_defragm_bufs(struct link *l_ptr) +{ + struct sk_buff *prev = NULL; + struct sk_buff *next = NULL; + struct sk_buff *buf = l_ptr->defragm_buf; + + if (!buf) + return; + if (!link_working_working(l_ptr)) + return; + while (buf) { + u32 cnt = get_timer_cnt(buf); + + next = buf->next; + if (cnt < 4) { + incr_timer_cnt(buf); + prev = buf; + } else { + if (prev) + prev->next = buf->next; + else + l_ptr->defragm_buf = buf->next; + buf_discard(buf); + } + buf = next; + } +} + + + +static void link_set_supervision_props(struct link *l_ptr, u32 tolerance) +{ + if ((tolerance < TIPC_MIN_LINK_TOL) || (tolerance > TIPC_MAX_LINK_TOL)) + return; + + l_ptr->tolerance = tolerance; + l_ptr->continuity_interval = + ((tolerance / 4) > 500) ? 
500 : tolerance / 4; + l_ptr->abort_limit = tolerance / (l_ptr->continuity_interval / 4); +} + + +void tipc_link_set_queue_limits(struct link *l_ptr, u32 window) +{ + /* Data messages from this node, inclusive FIRST_FRAGM */ + l_ptr->queue_limit[TIPC_LOW_IMPORTANCE] = window; + l_ptr->queue_limit[TIPC_MEDIUM_IMPORTANCE] = (window / 3) * 4; + l_ptr->queue_limit[TIPC_HIGH_IMPORTANCE] = (window / 3) * 5; + l_ptr->queue_limit[TIPC_CRITICAL_IMPORTANCE] = (window / 3) * 6; + /* Transiting data messages,inclusive FIRST_FRAGM */ + l_ptr->queue_limit[TIPC_LOW_IMPORTANCE + 4] = 300; + l_ptr->queue_limit[TIPC_MEDIUM_IMPORTANCE + 4] = 600; + l_ptr->queue_limit[TIPC_HIGH_IMPORTANCE + 4] = 900; + l_ptr->queue_limit[TIPC_CRITICAL_IMPORTANCE + 4] = 1200; + l_ptr->queue_limit[CONN_MANAGER] = 1200; + l_ptr->queue_limit[CHANGEOVER_PROTOCOL] = 2500; + l_ptr->queue_limit[NAME_DISTRIBUTOR] = 3000; + /* FRAGMENT and LAST_FRAGMENT packets */ + l_ptr->queue_limit[MSG_FRAGMENTER] = 4000; +} + +/** + * link_find_link - locate link by name + * @name - ptr to link name string + * @node - ptr to area to be filled with ptr to associated node + * + * Caller must hold 'tipc_net_lock' to ensure node and bearer are not deleted; + * this also prevents link deletion. + * + * Returns pointer to link (or 0 if invalid link name). + */ + +static struct link *link_find_link(const char *name, struct tipc_node **node) +{ + struct link_name link_name_parts; + struct tipc_bearer *b_ptr; + struct link *l_ptr; + + if (!link_name_validate(name, &link_name_parts)) + return NULL; + + b_ptr = tipc_bearer_find_interface(link_name_parts.if_local); + if (!b_ptr) + return NULL; + + *node = tipc_node_find(link_name_parts.addr_peer); + if (!*node) + return NULL; + + l_ptr = (*node)->links[b_ptr->identity]; + if (!l_ptr || strcmp(l_ptr->name, name)) + return NULL; + + return l_ptr; +} + +struct sk_buff *tipc_link_cmd_config(const void *req_tlv_area, int req_tlv_space, + u16 cmd) +{ + struct tipc_link_config *args; + u32 new_value; + struct link *l_ptr; + struct tipc_node *node; + int res; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_LINK_CONFIG)) + return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + args = (struct tipc_link_config *)TLV_DATA(req_tlv_area); + new_value = ntohl(args->value); + + if (!strcmp(args->name, tipc_bclink_name)) { + if ((cmd == TIPC_CMD_SET_LINK_WINDOW) && + (tipc_bclink_set_queue_limits(new_value) == 0)) + return tipc_cfg_reply_none(); + return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED + " (cannot change setting on broadcast link)"); + } + + read_lock_bh(&tipc_net_lock); + l_ptr = link_find_link(args->name, &node); + if (!l_ptr) { + read_unlock_bh(&tipc_net_lock); + return tipc_cfg_reply_error_string("link not found"); + } + + tipc_node_lock(node); + res = -EINVAL; + switch (cmd) { + case TIPC_CMD_SET_LINK_TOL: + if ((new_value >= TIPC_MIN_LINK_TOL) && + (new_value <= TIPC_MAX_LINK_TOL)) { + link_set_supervision_props(l_ptr, new_value); + tipc_link_send_proto_msg(l_ptr, STATE_MSG, + 0, 0, new_value, 0, 0); + res = 0; + } + break; + case TIPC_CMD_SET_LINK_PRI: + if ((new_value >= TIPC_MIN_LINK_PRI) && + (new_value <= TIPC_MAX_LINK_PRI)) { + l_ptr->priority = new_value; + tipc_link_send_proto_msg(l_ptr, STATE_MSG, + 0, 0, 0, new_value, 0); + res = 0; + } + break; + case TIPC_CMD_SET_LINK_WINDOW: + if ((new_value >= TIPC_MIN_LINK_WIN) && + (new_value <= TIPC_MAX_LINK_WIN)) { + tipc_link_set_queue_limits(l_ptr, new_value); + res = 0; + } + break; + } + tipc_node_unlock(node); + + 
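/* Editorial note (not in the original patch): of the settings handled above, tolerance and priority changes are advertised to the peer immediately via a STATE_MSG protocol message, whereas a window change only resizes the local queue limits. */ +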
read_unlock_bh(&tipc_net_lock); + if (res) + return tipc_cfg_reply_error_string("cannot change link setting"); + + return tipc_cfg_reply_none(); +} + +/** + * link_reset_statistics - reset link statistics + * @l_ptr: pointer to link + */ + +static void link_reset_statistics(struct link *l_ptr) +{ + memset(&l_ptr->stats, 0, sizeof(l_ptr->stats)); + l_ptr->stats.sent_info = l_ptr->next_out_no; + l_ptr->stats.recv_info = l_ptr->next_in_no; +} + +struct sk_buff *tipc_link_cmd_reset_stats(const void *req_tlv_area, int req_tlv_space) +{ + char *link_name; + struct link *l_ptr; + struct tipc_node *node; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_LINK_NAME)) + return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + link_name = (char *)TLV_DATA(req_tlv_area); + if (!strcmp(link_name, tipc_bclink_name)) { + if (tipc_bclink_reset_stats()) + return tipc_cfg_reply_error_string("link not found"); + return tipc_cfg_reply_none(); + } + + read_lock_bh(&tipc_net_lock); + l_ptr = link_find_link(link_name, &node); + if (!l_ptr) { + read_unlock_bh(&tipc_net_lock); + return tipc_cfg_reply_error_string("link not found"); + } + + tipc_node_lock(node); + link_reset_statistics(l_ptr); + tipc_node_unlock(node); + read_unlock_bh(&tipc_net_lock); + return tipc_cfg_reply_none(); +} + +/** + * percent - convert count to a percentage of total (rounding up or down) + */ + +static u32 percent(u32 count, u32 total) +{ + return (count * 100 + (total / 2)) / total; +} + +/** + * tipc_link_stats - print link statistics + * @name: link name + * @buf: print buffer area + * @buf_size: size of print buffer area + * + * Returns length of print buffer data string (or 0 if error) + */ + +static int tipc_link_stats(const char *name, char *buf, const u32 buf_size) +{ + struct print_buf pb; + struct link *l_ptr; + struct tipc_node *node; + char *status; + u32 profile_total = 0; + + if (!strcmp(name, tipc_bclink_name)) + return tipc_bclink_stats(buf, buf_size); + + tipc_printbuf_init(&pb, buf, buf_size); + + read_lock_bh(&tipc_net_lock); + l_ptr = link_find_link(name, &node); + if (!l_ptr) { + read_unlock_bh(&tipc_net_lock); + return 0; + } + tipc_node_lock(node); + + if (tipc_link_is_active(l_ptr)) + status = "ACTIVE"; + else if (tipc_link_is_up(l_ptr)) + status = "STANDBY"; + else + status = "DEFUNCT"; + tipc_printf(&pb, "Link <%s>\n" + " %s MTU:%u Priority:%u Tolerance:%u ms" + " Window:%u packets\n", + l_ptr->name, status, l_ptr->max_pkt, + l_ptr->priority, l_ptr->tolerance, l_ptr->queue_limit[0]); + tipc_printf(&pb, " RX packets:%u fragments:%u/%u bundles:%u/%u\n", + l_ptr->next_in_no - l_ptr->stats.recv_info, + l_ptr->stats.recv_fragments, + l_ptr->stats.recv_fragmented, + l_ptr->stats.recv_bundles, + l_ptr->stats.recv_bundled); + tipc_printf(&pb, " TX packets:%u fragments:%u/%u bundles:%u/%u\n", + l_ptr->next_out_no - l_ptr->stats.sent_info, + l_ptr->stats.sent_fragments, + l_ptr->stats.sent_fragmented, + l_ptr->stats.sent_bundles, + l_ptr->stats.sent_bundled); + profile_total = l_ptr->stats.msg_length_counts; + if (!profile_total) + profile_total = 1; + tipc_printf(&pb, " TX profile sample:%u packets average:%u octets\n" + " 0-64:%u%% -256:%u%% -1024:%u%% -4096:%u%% " + "-16354:%u%% -32768:%u%% -66000:%u%%\n", + l_ptr->stats.msg_length_counts, + l_ptr->stats.msg_lengths_total / profile_total, + percent(l_ptr->stats.msg_length_profile[0], profile_total), + percent(l_ptr->stats.msg_length_profile[1], profile_total), + percent(l_ptr->stats.msg_length_profile[2], profile_total), + 
percent(l_ptr->stats.msg_length_profile[3], profile_total), + percent(l_ptr->stats.msg_length_profile[4], profile_total), + percent(l_ptr->stats.msg_length_profile[5], profile_total), + percent(l_ptr->stats.msg_length_profile[6], profile_total)); + tipc_printf(&pb, " RX states:%u probes:%u naks:%u defs:%u dups:%u\n", + l_ptr->stats.recv_states, + l_ptr->stats.recv_probes, + l_ptr->stats.recv_nacks, + l_ptr->stats.deferred_recv, + l_ptr->stats.duplicates); + tipc_printf(&pb, " TX states:%u probes:%u naks:%u acks:%u dups:%u\n", + l_ptr->stats.sent_states, + l_ptr->stats.sent_probes, + l_ptr->stats.sent_nacks, + l_ptr->stats.sent_acks, + l_ptr->stats.retransmitted); + tipc_printf(&pb, " Congestion bearer:%u link:%u Send queue max:%u avg:%u\n", + l_ptr->stats.bearer_congs, + l_ptr->stats.link_congs, + l_ptr->stats.max_queue_sz, + l_ptr->stats.queue_sz_counts + ? (l_ptr->stats.accu_queue_sz / l_ptr->stats.queue_sz_counts) + : 0); + + tipc_node_unlock(node); + read_unlock_bh(&tipc_net_lock); + return tipc_printbuf_validate(&pb); +} + +#define MAX_LINK_STATS_INFO 2000 + +struct sk_buff *tipc_link_cmd_show_stats(const void *req_tlv_area, int req_tlv_space) +{ + struct sk_buff *buf; + struct tlv_desc *rep_tlv; + int str_len; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_LINK_NAME)) + return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + buf = tipc_cfg_reply_alloc(TLV_SPACE(MAX_LINK_STATS_INFO)); + if (!buf) + return NULL; + + rep_tlv = (struct tlv_desc *)buf->data; + + str_len = tipc_link_stats((char *)TLV_DATA(req_tlv_area), + (char *)TLV_DATA(rep_tlv), MAX_LINK_STATS_INFO); + if (!str_len) { + buf_discard(buf); + return tipc_cfg_reply_error_string("link not found"); + } + + skb_put(buf, TLV_SPACE(str_len)); + TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len); + + return buf; +} + +/** + * tipc_link_get_max_pkt - get maximum packet size to use when sending to destination + * @dest: network address of destination node + * @selector: used to select from set of active links + * + * If no active link can be found, uses default maximum packet size. 
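+ * + * Editorial note (added in review): the default used is MAX_PKT_DEFAULT, and a destination equal to tipc_own_addr short-circuits to MAX_MSG_SIZE, since no link is traversed in that case.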
+ */ + +u32 tipc_link_get_max_pkt(u32 dest, u32 selector) +{ + struct tipc_node *n_ptr; + struct link *l_ptr; + u32 res = MAX_PKT_DEFAULT; + + if (dest == tipc_own_addr) + return MAX_MSG_SIZE; + + read_lock_bh(&tipc_net_lock); + n_ptr = tipc_node_find(dest); + if (n_ptr) { + tipc_node_lock(n_ptr); + l_ptr = n_ptr->active_links[selector & 1]; + if (l_ptr) + res = l_ptr->max_pkt; + tipc_node_unlock(n_ptr); + } + read_unlock_bh(&tipc_net_lock); + return res; +} + +static void link_print(struct link *l_ptr, const char *str) +{ + char print_area[256]; + struct print_buf pb; + struct print_buf *buf = &pb; + + tipc_printbuf_init(buf, print_area, sizeof(print_area)); + + tipc_printf(buf, str); + tipc_printf(buf, "Link %x<%s>:", + l_ptr->addr, l_ptr->b_ptr->name); + +#ifdef CONFIG_TIPC_DEBUG + if (link_reset_reset(l_ptr) || link_reset_unknown(l_ptr)) + goto print_state; + + tipc_printf(buf, ": NXO(%u):", mod(l_ptr->next_out_no)); + tipc_printf(buf, "NXI(%u):", mod(l_ptr->next_in_no)); + tipc_printf(buf, "SQUE"); + if (l_ptr->first_out) { + tipc_printf(buf, "[%u..", msg_seqno(buf_msg(l_ptr->first_out))); + if (l_ptr->next_out) + tipc_printf(buf, "%u..", + msg_seqno(buf_msg(l_ptr->next_out))); + tipc_printf(buf, "%u]", msg_seqno(buf_msg(l_ptr->last_out))); + if ((mod(msg_seqno(buf_msg(l_ptr->last_out)) - + msg_seqno(buf_msg(l_ptr->first_out))) + != (l_ptr->out_queue_size - 1)) || + (l_ptr->last_out->next != NULL)) { + tipc_printf(buf, "\nSend queue inconsistency\n"); + tipc_printf(buf, "first_out= %p ", l_ptr->first_out); + tipc_printf(buf, "next_out= %p ", l_ptr->next_out); + tipc_printf(buf, "last_out= %p ", l_ptr->last_out); + } + } else + tipc_printf(buf, "[]"); + tipc_printf(buf, "SQSIZ(%u)", l_ptr->out_queue_size); + if (l_ptr->oldest_deferred_in) { + u32 o = msg_seqno(buf_msg(l_ptr->oldest_deferred_in)); + u32 n = msg_seqno(buf_msg(l_ptr->newest_deferred_in)); + tipc_printf(buf, ":RQUE[%u..%u]", o, n); + if (l_ptr->deferred_inqueue_sz != mod((n + 1) - o)) { + tipc_printf(buf, ":RQSIZ(%u)", + l_ptr->deferred_inqueue_sz); + } + } +print_state: +#endif + + if (link_working_unknown(l_ptr)) + tipc_printf(buf, ":WU"); + else if (link_reset_reset(l_ptr)) + tipc_printf(buf, ":RR"); + else if (link_reset_unknown(l_ptr)) + tipc_printf(buf, ":RU"); + else if (link_working_working(l_ptr)) + tipc_printf(buf, ":WW"); + tipc_printf(buf, "\n"); + + tipc_printbuf_validate(buf); + info("%s", print_area); +} + diff --git a/net/tipc/link.h b/net/tipc/link.h new file mode 100644 index 00000000..74fbecab --- /dev/null +++ b/net/tipc/link.h @@ -0,0 +1,314 @@ +/* + * net/tipc/link.h: Include file for TIPC link code + * + * Copyright (c) 1995-2006, Ericsson AB + * Copyright (c) 2004-2005, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_LINK_H +#define _TIPC_LINK_H + +#include "log.h" +#include "msg.h" +#include "node.h" + +#define PUSH_FAILED 1 +#define PUSH_FINISHED 2 + +/* + * Link states + */ + +#define WORKING_WORKING 560810u +#define WORKING_UNKNOWN 560811u +#define RESET_UNKNOWN 560812u +#define RESET_RESET 560813u + +/* + * Starting value for maximum packet size negotiation on unicast links + * (unless bearer MTU is less) + */ + +#define MAX_PKT_DEFAULT 1500 + +/** + * struct link - TIPC link data structure + * @addr: network address of link's peer node + * @name: link name character string + * @media_addr: media address to use when sending messages over link + * @timer: link timer + * @owner: pointer to peer node + * @link_list: adjacent links in bearer's list of links + * @started: indicates if link has been started + * @checkpoint: reference point for triggering link continuity checking + * @peer_session: link session # being used by peer end of link + * @peer_bearer_id: bearer id used by link's peer endpoint + * @b_ptr: pointer to bearer used by link + * @tolerance: minimum link continuity loss needed to reset link [in ms] + * @continuity_interval: link continuity testing interval [in ms] + * @abort_limit: # of unacknowledged continuity probes needed to reset link + * @state: current state of link FSM + * @blocked: indicates if link has been administratively blocked + * @fsm_msg_cnt: # of protocol messages link FSM has sent in current state + * @proto_msg: template for control messages generated by link + * @pmsg: convenience pointer to "proto_msg" field + * @priority: current link priority + * @queue_limit: outbound message queue congestion thresholds (indexed by user) + * @exp_msg_count: # of tunnelled messages expected during link changeover + * @reset_checkpoint: seq # of last acknowledged message at time of link reset + * @max_pkt: current maximum packet size for this link + * @max_pkt_target: desired maximum packet size for this link + * @max_pkt_probes: # of probes based on current (max_pkt, max_pkt_target) + * @out_queue_size: # of messages in outbound message queue + * @first_out: ptr to first outbound message in queue + * @last_out: ptr to last outbound message in queue + * @next_out_no: next sequence number to use for outbound messages + * @last_retransmitted: sequence number of most recently retransmitted message + * @stale_count: # of identical retransmit requests made by peer + * @next_in_no: next sequence number to expect for inbound messages + * @deferred_inqueue_sz: # of messages in inbound message queue + * 
@oldest_deferred_in: ptr to first inbound message in queue + * @newest_deferred_in: ptr to last inbound message in queue + * @unacked_window: # of inbound messages rx'd without ack'ing back to peer + * @proto_msg_queue: ptr to (single) outbound control message + * @retransm_queue_size: number of messages to retransmit + * @retransm_queue_head: sequence number of first message to retransmit + * @next_out: ptr to first unsent outbound message in queue + * @waiting_ports: linked list of ports waiting for link congestion to abate + * @long_msg_seq_no: next identifier to use for outbound fragmented messages + * @defragm_buf: list of partially reassembled inbound message fragments + * @stats: collects statistics regarding link activity + */ + +struct link { + u32 addr; + char name[TIPC_MAX_LINK_NAME]; + struct tipc_media_addr media_addr; + struct timer_list timer; + struct tipc_node *owner; + struct list_head link_list; + + /* Management and link supervision data */ + int started; + u32 checkpoint; + u32 peer_session; + u32 peer_bearer_id; + struct tipc_bearer *b_ptr; + u32 tolerance; + u32 continuity_interval; + u32 abort_limit; + int state; + int blocked; + u32 fsm_msg_cnt; + struct { + unchar hdr[INT_H_SIZE]; + unchar body[TIPC_MAX_IF_NAME]; + } proto_msg; + struct tipc_msg *pmsg; + u32 priority; + u32 queue_limit[15]; /* queue_limit[0]==window limit */ + + /* Changeover */ + u32 exp_msg_count; + u32 reset_checkpoint; + + /* Max packet negotiation */ + u32 max_pkt; + u32 max_pkt_target; + u32 max_pkt_probes; + + /* Sending */ + u32 out_queue_size; + struct sk_buff *first_out; + struct sk_buff *last_out; + u32 next_out_no; + u32 last_retransmitted; + u32 stale_count; + + /* Reception */ + u32 next_in_no; + u32 deferred_inqueue_sz; + struct sk_buff *oldest_deferred_in; + struct sk_buff *newest_deferred_in; + u32 unacked_window; + + /* Congestion handling */ + struct sk_buff *proto_msg_queue; + u32 retransm_queue_size; + u32 retransm_queue_head; + struct sk_buff *next_out; + struct list_head waiting_ports; + + /* Fragmentation/defragmentation */ + u32 long_msg_seq_no; + struct sk_buff *defragm_buf; + + /* Statistics */ + struct { + u32 sent_info; /* used in counting # sent packets */ + u32 recv_info; /* used in counting # recv'd packets */ + u32 sent_states; + u32 recv_states; + u32 sent_probes; + u32 recv_probes; + u32 sent_nacks; + u32 recv_nacks; + u32 sent_acks; + u32 sent_bundled; + u32 sent_bundles; + u32 recv_bundled; + u32 recv_bundles; + u32 retransmitted; + u32 sent_fragmented; + u32 sent_fragments; + u32 recv_fragmented; + u32 recv_fragments; + u32 link_congs; /* # port sends blocked by congestion */ + u32 bearer_congs; + u32 deferred_recv; + u32 duplicates; + u32 max_queue_sz; /* send queue size high water mark */ + u32 accu_queue_sz; /* used for send queue size profiling */ + u32 queue_sz_counts; /* used for send queue size profiling */ + u32 msg_length_counts; /* used for message length profiling */ + u32 msg_lengths_total; /* used for message length profiling */ + u32 msg_length_profile[7]; /* used for msg. 
length profiling */ + } stats; +}; + +struct tipc_port; + +struct link *tipc_link_create(struct tipc_node *n_ptr, + struct tipc_bearer *b_ptr, + const struct tipc_media_addr *media_addr); +void tipc_link_delete(struct link *l_ptr); +void tipc_link_changeover(struct link *l_ptr); +void tipc_link_send_duplicate(struct link *l_ptr, struct link *dest); +void tipc_link_reset_fragments(struct link *l_ptr); +int tipc_link_is_up(struct link *l_ptr); +int tipc_link_is_active(struct link *l_ptr); +u32 tipc_link_push_packet(struct link *l_ptr); +void tipc_link_stop(struct link *l_ptr); +struct sk_buff *tipc_link_cmd_config(const void *req_tlv_area, int req_tlv_space, u16 cmd); +struct sk_buff *tipc_link_cmd_show_stats(const void *req_tlv_area, int req_tlv_space); +struct sk_buff *tipc_link_cmd_reset_stats(const void *req_tlv_area, int req_tlv_space); +void tipc_link_reset(struct link *l_ptr); +int tipc_link_send(struct sk_buff *buf, u32 dest, u32 selector); +int tipc_link_send_buf(struct link *l_ptr, struct sk_buff *buf); +u32 tipc_link_get_max_pkt(u32 dest, u32 selector); +int tipc_link_send_sections_fast(struct tipc_port *sender, + struct iovec const *msg_sect, + const u32 num_sect, + unsigned int total_len, + u32 destnode); +void tipc_link_recv_bundle(struct sk_buff *buf); +int tipc_link_recv_fragment(struct sk_buff **pending, + struct sk_buff **fb, + struct tipc_msg **msg); +void tipc_link_send_proto_msg(struct link *l_ptr, u32 msg_typ, int prob, u32 gap, + u32 tolerance, u32 priority, u32 acked_mtu); +void tipc_link_push_queue(struct link *l_ptr); +u32 tipc_link_defer_pkt(struct sk_buff **head, struct sk_buff **tail, + struct sk_buff *buf); +void tipc_link_wakeup_ports(struct link *l_ptr, int all); +void tipc_link_set_queue_limits(struct link *l_ptr, u32 window); +void tipc_link_retransmit(struct link *l_ptr, struct sk_buff *start, u32 retransmits); + +/* + * Link sequence number manipulation routines (uses modulo 2**16 arithmetic) + */ + +static inline u32 mod(u32 x) +{ + return x & 0xffffu; +} + +static inline int between(u32 lower, u32 upper, u32 n) +{ + if ((lower < n) && (n < upper)) + return 1; + if ((upper < lower) && ((n > lower) || (n < upper))) + return 1; + return 0; +} + +static inline int less_eq(u32 left, u32 right) +{ + return mod(right - left) < 32768u; +} + +static inline int less(u32 left, u32 right) +{ + return less_eq(left, right) && (mod(right) != mod(left)); +} + +static inline u32 lesser(u32 left, u32 right) +{ + return less_eq(left, right) ? 
left : right; +} + + +/* + * Link status checking routines + */ + +static inline int link_working_working(struct link *l_ptr) +{ + return l_ptr->state == WORKING_WORKING; +} + +static inline int link_working_unknown(struct link *l_ptr) +{ + return l_ptr->state == WORKING_UNKNOWN; +} + +static inline int link_reset_unknown(struct link *l_ptr) +{ + return l_ptr->state == RESET_UNKNOWN; +} + +static inline int link_reset_reset(struct link *l_ptr) +{ + return l_ptr->state == RESET_RESET; +} + +static inline int link_blocked(struct link *l_ptr) +{ + return l_ptr->exp_msg_count || l_ptr->blocked; +} + +static inline int link_congested(struct link *l_ptr) +{ + return l_ptr->out_queue_size >= l_ptr->queue_limit[0]; +} + +#endif diff --git a/net/tipc/log.c b/net/tipc/log.c new file mode 100644 index 00000000..952c39f6 --- /dev/null +++ b/net/tipc/log.c @@ -0,0 +1,351 @@ +/* + * net/tipc/log.c: TIPC print buffer routines for debugging + * + * Copyright (c) 1996-2006, Ericsson AB + * Copyright (c) 2005-2007, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "config.h" +#include "log.h" + +/* + * TIPC pre-defines the following print buffers: + * + * TIPC_NULL : null buffer (i.e. print nowhere) + * TIPC_CONS : system console + * TIPC_LOG : TIPC log buffer + * + * Additional user-defined print buffers are also permitted. + */ + +static struct print_buf null_buf = { NULL, 0, NULL, 0 }; +struct print_buf *const TIPC_NULL = &null_buf; + +static struct print_buf cons_buf = { NULL, 0, NULL, 1 }; +struct print_buf *const TIPC_CONS = &cons_buf; + +static struct print_buf log_buf = { NULL, 0, NULL, 1 }; +struct print_buf *const TIPC_LOG = &log_buf; + +/* + * Locking policy when using print buffers. 
+ * + * 1) tipc_printf() uses 'print_lock' to protect against concurrent access to + * 'print_string' when writing to a print buffer. This also protects against + * concurrent writes to the print buffer being written to. + * + * 2) tipc_log_XXX() leverages the aforementioned use of 'print_lock' to + * protect against all types of concurrent operations on their associated + * print buffer (not just write operations). + * + * Note: All routines of the form tipc_printbuf_XXX() are lock-free, and rely + * on the caller to prevent simultaneous use of the print buffer(s) being + * manipulated. + */ + +static char print_string[TIPC_PB_MAX_STR]; +static DEFINE_SPINLOCK(print_lock); + +static void tipc_printbuf_move(struct print_buf *pb_to, + struct print_buf *pb_from); + +#define FORMAT(PTR, LEN, FMT) \ +{\ + va_list args;\ + va_start(args, FMT);\ + LEN = vsprintf(PTR, FMT, args);\ + va_end(args);\ + *(PTR + LEN) = '\0';\ +} + +/** + * tipc_printbuf_init - initialize print buffer to empty + * @pb: pointer to print buffer structure + * @raw: pointer to character array used by print buffer + * @size: size of character array + * + * Note: If the character array is too small (or absent), the print buffer + * becomes a null device that discards anything written to it. + */ + +void tipc_printbuf_init(struct print_buf *pb, char *raw, u32 size) +{ + pb->buf = raw; + pb->crs = raw; + pb->size = size; + pb->echo = 0; + + if (size < TIPC_PB_MIN_SIZE) { + pb->buf = NULL; + } else if (raw) { + pb->buf[0] = 0; + pb->buf[size - 1] = ~0; + } +} + +/** + * tipc_printbuf_reset - reinitialize print buffer to empty state + * @pb: pointer to print buffer structure + */ + +static void tipc_printbuf_reset(struct print_buf *pb) +{ + if (pb->buf) { + pb->crs = pb->buf; + pb->buf[0] = 0; + pb->buf[pb->size - 1] = ~0; + } +} + +/** + * tipc_printbuf_empty - test if print buffer is in empty state + * @pb: pointer to print buffer structure + * + * Returns non-zero if print buffer is empty. + */ + +static int tipc_printbuf_empty(struct print_buf *pb) +{ + return !pb->buf || (pb->crs == pb->buf); +} + +/** + * tipc_printbuf_validate - check for print buffer overflow + * @pb: pointer to print buffer structure + * + * Verifies that a print buffer has captured all data written to it. + * If data has been lost, linearize buffer and prepend an error message + * + * Returns length of print buffer data string (including trailing NUL) + */ + +int tipc_printbuf_validate(struct print_buf *pb) +{ + char *err = "\n\n*** PRINT BUFFER OVERFLOW ***\n\n"; + char *cp_buf; + struct print_buf cb; + + if (!pb->buf) + return 0; + + if (pb->buf[pb->size - 1] == 0) { + cp_buf = kmalloc(pb->size, GFP_ATOMIC); + if (cp_buf) { + tipc_printbuf_init(&cb, cp_buf, pb->size); + tipc_printbuf_move(&cb, pb); + tipc_printbuf_move(pb, &cb); + kfree(cp_buf); + memcpy(pb->buf, err, strlen(err)); + } else { + tipc_printbuf_reset(pb); + tipc_printf(pb, err); + } + } + return pb->crs - pb->buf + 1; +} + +/** + * tipc_printbuf_move - move print buffer contents to another print buffer + * @pb_to: pointer to destination print buffer structure + * @pb_from: pointer to source print buffer structure + * + * Current contents of destination print buffer (if any) are discarded. + * Source print buffer becomes empty if a successful move occurs. 
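+ * + * Editorial note (added in review): the print buffer is written circularly, with the last byte of the array acting as a sentinel (initialised non-zero, overwritten once output has reached the end of the array). When the sentinel shows the source buffer has wrapped, the move copies the portion after the cursor first and then the portion from the start of the array up to the cursor, leaving the destination in chronological order.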
+ */ + +static void tipc_printbuf_move(struct print_buf *pb_to, + struct print_buf *pb_from) +{ + int len; + + /* Handle the cases where contents can't be moved */ + + if (!pb_to->buf) + return; + + if (!pb_from->buf) { + tipc_printbuf_reset(pb_to); + return; + } + + if (pb_to->size < pb_from->size) { + strcpy(pb_to->buf, "*** PRINT BUFFER MOVE ERROR ***"); + pb_to->buf[pb_to->size - 1] = ~0; + pb_to->crs = strchr(pb_to->buf, 0); + return; + } + + /* Copy data from char after cursor to end (if used) */ + + len = pb_from->buf + pb_from->size - pb_from->crs - 2; + if ((pb_from->buf[pb_from->size - 1] == 0) && (len > 0)) { + strcpy(pb_to->buf, pb_from->crs + 1); + pb_to->crs = pb_to->buf + len; + } else + pb_to->crs = pb_to->buf; + + /* Copy data from start to cursor (always) */ + + len = pb_from->crs - pb_from->buf; + strcpy(pb_to->crs, pb_from->buf); + pb_to->crs += len; + + tipc_printbuf_reset(pb_from); +} + +/** + * tipc_printf - append formatted output to print buffer + * @pb: pointer to print buffer + * @fmt: formatted info to be printed + */ + +void tipc_printf(struct print_buf *pb, const char *fmt, ...) +{ + int chars_to_add; + int chars_left; + char save_char; + + spin_lock_bh(&print_lock); + + FORMAT(print_string, chars_to_add, fmt); + if (chars_to_add >= TIPC_PB_MAX_STR) + strcpy(print_string, "*** PRINT BUFFER STRING TOO LONG ***"); + + if (pb->buf) { + chars_left = pb->buf + pb->size - pb->crs - 1; + if (chars_to_add <= chars_left) { + strcpy(pb->crs, print_string); + pb->crs += chars_to_add; + } else if (chars_to_add >= (pb->size - 1)) { + strcpy(pb->buf, print_string + chars_to_add + 1 + - pb->size); + pb->crs = pb->buf + pb->size - 1; + } else { + strcpy(pb->buf, print_string + chars_left); + save_char = print_string[chars_left]; + print_string[chars_left] = 0; + strcpy(pb->crs, print_string); + print_string[chars_left] = save_char; + pb->crs = pb->buf + chars_to_add - chars_left; + } + } + + if (pb->echo) + printk("%s", print_string); + + spin_unlock_bh(&print_lock); +} + +/** + * tipc_log_resize - change the size of the TIPC log buffer + * @log_size: print buffer size to use + */ + +int tipc_log_resize(int log_size) +{ + int res = 0; + + spin_lock_bh(&print_lock); + kfree(TIPC_LOG->buf); + TIPC_LOG->buf = NULL; + if (log_size) { + if (log_size < TIPC_PB_MIN_SIZE) + log_size = TIPC_PB_MIN_SIZE; + res = TIPC_LOG->echo; + tipc_printbuf_init(TIPC_LOG, kmalloc(log_size, GFP_ATOMIC), + log_size); + TIPC_LOG->echo = res; + res = !TIPC_LOG->buf; + } + spin_unlock_bh(&print_lock); + + return res; +} + +/** + * tipc_log_resize_cmd - reconfigure size of TIPC log buffer + */ + +struct sk_buff *tipc_log_resize_cmd(const void *req_tlv_area, int req_tlv_space) +{ + u32 value; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED)) + return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + value = ntohl(*(__be32 *)TLV_DATA(req_tlv_area)); + if (value != delimit(value, 0, 32768)) + return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE + " (log size must be 0-32768)"); + if (tipc_log_resize(value)) + return tipc_cfg_reply_error_string( + "unable to create specified log (log size is now 0)"); + return tipc_cfg_reply_none(); +} + +/** + * tipc_log_dump - capture TIPC log buffer contents in configuration message + */ + +struct sk_buff *tipc_log_dump(void) +{ + struct sk_buff *reply; + + spin_lock_bh(&print_lock); + if (!TIPC_LOG->buf) { + spin_unlock_bh(&print_lock); + reply = tipc_cfg_reply_ultra_string("log not activated\n"); + } else if (tipc_printbuf_empty(TIPC_LOG)) { + 
spin_unlock_bh(&print_lock); + reply = tipc_cfg_reply_ultra_string("log is empty\n"); + } else { + struct tlv_desc *rep_tlv; + struct print_buf pb; + int str_len; + + str_len = min(TIPC_LOG->size, 32768u); + spin_unlock_bh(&print_lock); + reply = tipc_cfg_reply_alloc(TLV_SPACE(str_len)); + if (reply) { + rep_tlv = (struct tlv_desc *)reply->data; + tipc_printbuf_init(&pb, TLV_DATA(rep_tlv), str_len); + spin_lock_bh(&print_lock); + tipc_printbuf_move(&pb, TIPC_LOG); + spin_unlock_bh(&print_lock); + str_len = strlen(TLV_DATA(rep_tlv)) + 1; + skb_put(reply, TLV_SPACE(str_len)); + TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len); + } + } + return reply; +} diff --git a/net/tipc/log.h b/net/tipc/log.h new file mode 100644 index 00000000..2248d962 --- /dev/null +++ b/net/tipc/log.h @@ -0,0 +1,67 @@ +/* + * net/tipc/log.h: Include file for TIPC print buffer routines + * + * Copyright (c) 1997-2006, Ericsson AB + * Copyright (c) 2005-2007, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_LOG_H +#define _TIPC_LOG_H + +/** + * struct print_buf - TIPC print buffer structure + * @buf: pointer to character array containing print buffer contents + * @size: size of character array + * @crs: pointer to first unused space in character array (i.e. 
final NUL) + * @echo: echo output to system console if non-zero + */ + +struct print_buf { + char *buf; + u32 size; + char *crs; + int echo; +}; + +#define TIPC_PB_MIN_SIZE 64 /* minimum size for a print buffer's array */ +#define TIPC_PB_MAX_STR 512 /* max printable string (with trailing NUL) */ + +void tipc_printbuf_init(struct print_buf *pb, char *buf, u32 size); +int tipc_printbuf_validate(struct print_buf *pb); + +int tipc_log_resize(int log_size); + +struct sk_buff *tipc_log_resize_cmd(const void *req_tlv_area, + int req_tlv_space); +struct sk_buff *tipc_log_dump(void); + +#endif diff --git a/net/tipc/msg.c b/net/tipc/msg.c new file mode 100644 index 00000000..03e57bf9 --- /dev/null +++ b/net/tipc/msg.c @@ -0,0 +1,355 @@ +/* + * net/tipc/msg.c: TIPC message header routines + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2005, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "msg.h" + +u32 tipc_msg_tot_importance(struct tipc_msg *m) +{ + if (likely(msg_isdata(m))) { + if (likely(msg_orignode(m) == tipc_own_addr)) + return msg_importance(m); + return msg_importance(m) + 4; + } + if ((msg_user(m) == MSG_FRAGMENTER) && + (msg_type(m) == FIRST_FRAGMENT)) + return msg_importance(msg_get_wrapped(m)); + return msg_importance(m); +} + + +void tipc_msg_init(struct tipc_msg *m, u32 user, u32 type, + u32 hsize, u32 destnode) +{ + memset(m, 0, hsize); + msg_set_version(m); + msg_set_user(m, user); + msg_set_hdr_sz(m, hsize); + msg_set_size(m, hsize); + msg_set_prevnode(m, tipc_own_addr); + msg_set_type(m, type); + if (!msg_short(m)) { + msg_set_orignode(m, tipc_own_addr); + msg_set_destnode(m, destnode); + } +} + +/** + * tipc_msg_build - create message using specified header and data + * + * Note: Caller must not hold any locks in case copy_from_user() is interrupted! 
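One nuance of tipc_msg_build() below is its return contract: the routine always reports the user-data size, and an over-sized message is signalled by leaving *buf NULL rather than by returning an error code, presumably so the caller can fall back to sending the data another way (e.g. fragmented). A standalone sketch of just that contract, not part of the patch, with made-up sizes (40-byte header, 2000 bytes of data, 1500-byte limit):

#include <stdio.h>

static int build(unsigned int hdr_sz, unsigned int data_sz,
		 unsigned int max_size, int *got_buf)
{
	unsigned int sz = hdr_sz + data_sz;

	if (sz > max_size) {
		*got_buf = 0;		/* tipc_msg_build() leaves *buf NULL here */
		return (int)data_sz;	/* ...yet still reports the data size */
	}
	*got_buf = 1;			/* a buffer of 'sz' bytes would be allocated */
	return (int)data_sz;
}

int main(void)
{
	int got;
	int dsz = build(40, 2000, 1500, &got);

	printf("data size %d, buffer allocated: %s\n", dsz, got ? "yes" : "no");
	return 0;
}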
+ * + * Returns message data size or errno + */ + +int tipc_msg_build(struct tipc_msg *hdr, struct iovec const *msg_sect, + u32 num_sect, unsigned int total_len, + int max_size, int usrmem, struct sk_buff **buf) +{ + int dsz, sz, hsz, pos, res, cnt; + + dsz = total_len; + pos = hsz = msg_hdr_sz(hdr); + sz = hsz + dsz; + msg_set_size(hdr, sz); + if (unlikely(sz > max_size)) { + *buf = NULL; + return dsz; + } + + *buf = tipc_buf_acquire(sz); + if (!(*buf)) + return -ENOMEM; + skb_copy_to_linear_data(*buf, hdr, hsz); + for (res = 1, cnt = 0; res && (cnt < num_sect); cnt++) { + if (likely(usrmem)) + res = !copy_from_user((*buf)->data + pos, + msg_sect[cnt].iov_base, + msg_sect[cnt].iov_len); + else + skb_copy_to_linear_data_offset(*buf, pos, + msg_sect[cnt].iov_base, + msg_sect[cnt].iov_len); + pos += msg_sect[cnt].iov_len; + } + if (likely(res)) + return dsz; + + buf_discard(*buf); + *buf = NULL; + return -EFAULT; +} + +#ifdef CONFIG_TIPC_DEBUG + +void tipc_msg_dbg(struct print_buf *buf, struct tipc_msg *msg, const char *str) +{ + u32 usr = msg_user(msg); + tipc_printf(buf, KERN_DEBUG); + tipc_printf(buf, str); + + switch (usr) { + case MSG_BUNDLER: + tipc_printf(buf, "BNDL::"); + tipc_printf(buf, "MSGS(%u):", msg_msgcnt(msg)); + break; + case BCAST_PROTOCOL: + tipc_printf(buf, "BCASTP::"); + break; + case MSG_FRAGMENTER: + tipc_printf(buf, "FRAGM::"); + switch (msg_type(msg)) { + case FIRST_FRAGMENT: + tipc_printf(buf, "FIRST:"); + break; + case FRAGMENT: + tipc_printf(buf, "BODY:"); + break; + case LAST_FRAGMENT: + tipc_printf(buf, "LAST:"); + break; + default: + tipc_printf(buf, "UNKNOWN:%x", msg_type(msg)); + + } + tipc_printf(buf, "NO(%u/%u):", msg_long_msgno(msg), + msg_fragm_no(msg)); + break; + case TIPC_LOW_IMPORTANCE: + case TIPC_MEDIUM_IMPORTANCE: + case TIPC_HIGH_IMPORTANCE: + case TIPC_CRITICAL_IMPORTANCE: + tipc_printf(buf, "DAT%u:", msg_user(msg)); + if (msg_short(msg)) { + tipc_printf(buf, "CON:"); + break; + } + switch (msg_type(msg)) { + case TIPC_CONN_MSG: + tipc_printf(buf, "CON:"); + break; + case TIPC_MCAST_MSG: + tipc_printf(buf, "MCST:"); + break; + case TIPC_NAMED_MSG: + tipc_printf(buf, "NAM:"); + break; + case TIPC_DIRECT_MSG: + tipc_printf(buf, "DIR:"); + break; + default: + tipc_printf(buf, "UNKNOWN TYPE %u", msg_type(msg)); + } + if (msg_reroute_cnt(msg)) + tipc_printf(buf, "REROUTED(%u):", + msg_reroute_cnt(msg)); + break; + case NAME_DISTRIBUTOR: + tipc_printf(buf, "NMD::"); + switch (msg_type(msg)) { + case PUBLICATION: + tipc_printf(buf, "PUBL(%u):", (msg_size(msg) - msg_hdr_sz(msg)) / 20); /* Items */ + break; + case WITHDRAWAL: + tipc_printf(buf, "WDRW:"); + break; + default: + tipc_printf(buf, "UNKNOWN:%x", msg_type(msg)); + } + if (msg_reroute_cnt(msg)) + tipc_printf(buf, "REROUTED(%u):", + msg_reroute_cnt(msg)); + break; + case CONN_MANAGER: + tipc_printf(buf, "CONN_MNG:"); + switch (msg_type(msg)) { + case CONN_PROBE: + tipc_printf(buf, "PROBE:"); + break; + case CONN_PROBE_REPLY: + tipc_printf(buf, "PROBE_REPLY:"); + break; + case CONN_ACK: + tipc_printf(buf, "CONN_ACK:"); + tipc_printf(buf, "ACK(%u):", msg_msgcnt(msg)); + break; + default: + tipc_printf(buf, "UNKNOWN TYPE:%x", msg_type(msg)); + } + if (msg_reroute_cnt(msg)) + tipc_printf(buf, "REROUTED(%u):", msg_reroute_cnt(msg)); + break; + case LINK_PROTOCOL: + switch (msg_type(msg)) { + case STATE_MSG: + tipc_printf(buf, "STATE:"); + tipc_printf(buf, "%s:", msg_probe(msg) ? 
"PRB" : ""); + tipc_printf(buf, "NXS(%u):", msg_next_sent(msg)); + tipc_printf(buf, "GAP(%u):", msg_seq_gap(msg)); + tipc_printf(buf, "LSTBC(%u):", msg_last_bcast(msg)); + break; + case RESET_MSG: + tipc_printf(buf, "RESET:"); + if (msg_size(msg) != msg_hdr_sz(msg)) + tipc_printf(buf, "BEAR:%s:", msg_data(msg)); + break; + case ACTIVATE_MSG: + tipc_printf(buf, "ACTIVATE:"); + break; + default: + tipc_printf(buf, "UNKNOWN TYPE:%x", msg_type(msg)); + } + tipc_printf(buf, "PLANE(%c):", msg_net_plane(msg)); + tipc_printf(buf, "SESS(%u):", msg_session(msg)); + break; + case CHANGEOVER_PROTOCOL: + tipc_printf(buf, "TUNL:"); + switch (msg_type(msg)) { + case DUPLICATE_MSG: + tipc_printf(buf, "DUPL:"); + break; + case ORIGINAL_MSG: + tipc_printf(buf, "ORIG:"); + tipc_printf(buf, "EXP(%u)", msg_msgcnt(msg)); + break; + default: + tipc_printf(buf, "UNKNOWN TYPE:%x", msg_type(msg)); + } + break; + case LINK_CONFIG: + tipc_printf(buf, "CFG:"); + switch (msg_type(msg)) { + case DSC_REQ_MSG: + tipc_printf(buf, "DSC_REQ:"); + break; + case DSC_RESP_MSG: + tipc_printf(buf, "DSC_RESP:"); + break; + default: + tipc_printf(buf, "UNKNOWN TYPE:%x:", msg_type(msg)); + break; + } + break; + default: + tipc_printf(buf, "UNKNOWN USER:"); + } + + switch (usr) { + case CONN_MANAGER: + case TIPC_LOW_IMPORTANCE: + case TIPC_MEDIUM_IMPORTANCE: + case TIPC_HIGH_IMPORTANCE: + case TIPC_CRITICAL_IMPORTANCE: + switch (msg_errcode(msg)) { + case TIPC_OK: + break; + case TIPC_ERR_NO_NAME: + tipc_printf(buf, "NO_NAME:"); + break; + case TIPC_ERR_NO_PORT: + tipc_printf(buf, "NO_PORT:"); + break; + case TIPC_ERR_NO_NODE: + tipc_printf(buf, "NO_PROC:"); + break; + case TIPC_ERR_OVERLOAD: + tipc_printf(buf, "OVERLOAD:"); + break; + case TIPC_CONN_SHUTDOWN: + tipc_printf(buf, "SHUTDOWN:"); + break; + default: + tipc_printf(buf, "UNKNOWN ERROR(%x):", + msg_errcode(msg)); + } + default: + break; + } + + tipc_printf(buf, "HZ(%u):", msg_hdr_sz(msg)); + tipc_printf(buf, "SZ(%u):", msg_size(msg)); + tipc_printf(buf, "SQNO(%u):", msg_seqno(msg)); + + if (msg_non_seq(msg)) + tipc_printf(buf, "NOSEQ:"); + else + tipc_printf(buf, "ACK(%u):", msg_ack(msg)); + tipc_printf(buf, "BACK(%u):", msg_bcast_ack(msg)); + tipc_printf(buf, "PRND(%x)", msg_prevnode(msg)); + + if (msg_isdata(msg)) { + if (msg_named(msg)) { + tipc_printf(buf, "NTYP(%u):", msg_nametype(msg)); + tipc_printf(buf, "NINST(%u)", msg_nameinst(msg)); + } + } + + if ((usr != LINK_PROTOCOL) && (usr != LINK_CONFIG) && + (usr != MSG_BUNDLER)) { + if (!msg_short(msg)) { + tipc_printf(buf, ":ORIG(%x:%u):", + msg_orignode(msg), msg_origport(msg)); + tipc_printf(buf, ":DEST(%x:%u):", + msg_destnode(msg), msg_destport(msg)); + } else { + tipc_printf(buf, ":OPRT(%u):", msg_origport(msg)); + tipc_printf(buf, ":DPRT(%u):", msg_destport(msg)); + } + } + if (msg_user(msg) == NAME_DISTRIBUTOR) { + tipc_printf(buf, ":ONOD(%x):", msg_orignode(msg)); + tipc_printf(buf, ":DNOD(%x):", msg_destnode(msg)); + } + + if (msg_user(msg) == LINK_CONFIG) { + u32 *raw = (u32 *)msg; + struct tipc_media_addr *orig = (struct tipc_media_addr *)&raw[5]; + tipc_printf(buf, ":DDOM(%x):", msg_dest_domain(msg)); + tipc_printf(buf, ":NETID(%u):", msg_bc_netid(msg)); + tipc_media_addr_printf(buf, orig); + } + if (msg_user(msg) == BCAST_PROTOCOL) { + tipc_printf(buf, "BCNACK:AFTER(%u):", msg_bcgap_after(msg)); + tipc_printf(buf, "TO(%u):", msg_bcgap_to(msg)); + } + tipc_printf(buf, "\n"); + if ((usr == CHANGEOVER_PROTOCOL) && (msg_msgcnt(msg))) + tipc_msg_dbg(buf, msg_get_wrapped(msg), " /"); + if ((usr == MSG_FRAGMENTER) 
&& (msg_type(msg) == FIRST_FRAGMENT)) + tipc_msg_dbg(buf, msg_get_wrapped(msg), " /"); +} + +#endif diff --git a/net/tipc/msg.h b/net/tipc/msg.h new file mode 100644 index 00000000..84524547 --- /dev/null +++ b/net/tipc/msg.h @@ -0,0 +1,767 @@ +/* + * net/tipc/msg.h: Include file for TIPC message header routines + * + * Copyright (c) 2000-2007, Ericsson AB + * Copyright (c) 2005-2008, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _TIPC_MSG_H +#define _TIPC_MSG_H + +#include "bearer.h" + +/* + * Constants and routines used to read and write TIPC payload message headers + * + * Note: Some items are also used with TIPC internal message headers + */ + +#define TIPC_VERSION 2 + +/* + * Payload message users are defined in TIPC's public API: + * - TIPC_LOW_IMPORTANCE + * - TIPC_MEDIUM_IMPORTANCE + * - TIPC_HIGH_IMPORTANCE + * - TIPC_CRITICAL_IMPORTANCE + */ + +/* + * Payload message types + */ + +#define TIPC_CONN_MSG 0 +#define TIPC_MCAST_MSG 1 +#define TIPC_NAMED_MSG 2 +#define TIPC_DIRECT_MSG 3 + +/* + * Message header sizes + */ + +#define SHORT_H_SIZE 24 /* Connected, in-cluster messages */ +#define DIR_MSG_H_SIZE 32 /* Directly addressed messages */ +#define LONG_H_SIZE 40 /* Named messages */ +#define MCAST_H_SIZE 44 /* Multicast messages */ +#define INT_H_SIZE 40 /* Internal messages */ +#define MIN_H_SIZE 24 /* Smallest legal TIPC header size */ +#define MAX_H_SIZE 60 /* Largest possible TIPC header size */ + +#define MAX_MSG_SIZE (MAX_H_SIZE + TIPC_MAX_USER_MSG_SIZE) + + +struct tipc_msg { + __be32 hdr[15]; +}; + + +static inline u32 msg_word(struct tipc_msg *m, u32 pos) +{ + return ntohl(m->hdr[pos]); +} + +static inline void msg_set_word(struct tipc_msg *m, u32 w, u32 val) +{ + m->hdr[w] = htonl(val); +} + +static inline u32 msg_bits(struct tipc_msg *m, u32 w, u32 pos, u32 mask) +{ + return (msg_word(m, w) >> pos) & mask; +} + +static inline void msg_set_bits(struct tipc_msg *m, u32 w, + u32 pos, u32 mask, u32 val) +{ + val = (val & mask) << pos; + mask = mask << pos; + m->hdr[w] &= ~htonl(mask); + m->hdr[w] |= htonl(val); +} + +static inline void msg_swap_words(struct tipc_msg *msg, u32 a, u32 b) +{ + u32 temp = msg->hdr[a]; + + msg->hdr[a] = msg->hdr[b]; + msg->hdr[b] = temp; +} + +/* + * Word 0 + */ + +static inline u32 msg_version(struct tipc_msg *m) +{ + return msg_bits(m, 0, 29, 7); +} + +static inline void msg_set_version(struct tipc_msg *m) +{ + msg_set_bits(m, 0, 29, 7, TIPC_VERSION); +} + +static inline u32 msg_user(struct tipc_msg *m) +{ + return msg_bits(m, 0, 25, 0xf); +} + +static inline u32 msg_isdata(struct tipc_msg *m) +{ + return msg_user(m) <= TIPC_CRITICAL_IMPORTANCE; +} + +static inline void msg_set_user(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 0, 25, 0xf, n); +} + +static inline u32 msg_importance(struct tipc_msg *m) +{ + return msg_bits(m, 0, 25, 0xf); +} + +static inline void msg_set_importance(struct tipc_msg *m, u32 i) +{ + msg_set_user(m, i); +} + +static inline u32 msg_hdr_sz(struct tipc_msg *m) +{ + return msg_bits(m, 0, 21, 0xf) << 2; +} + +static inline void msg_set_hdr_sz(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 0, 21, 0xf, n>>2); +} + +static inline u32 msg_size(struct tipc_msg *m) +{ + return msg_bits(m, 0, 0, 0x1ffff); +} + +static inline u32 msg_data_sz(struct tipc_msg *m) +{ + return msg_size(m) - msg_hdr_sz(m); +} + +static inline int msg_non_seq(struct tipc_msg *m) +{ + return msg_bits(m, 0, 20, 1); +} + +static inline void msg_set_non_seq(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 0, 20, 1, n); +} + +static inline int msg_dest_droppable(struct tipc_msg *m) +{ + return msg_bits(m, 0, 19, 1); +} + +static inline void msg_set_dest_droppable(struct tipc_msg *m, u32 d) +{ + msg_set_bits(m, 0, 19, 1, d); +} + +static inline int msg_src_droppable(struct tipc_msg *m) +{ + return msg_bits(m, 0, 18, 1); +} + +static inline void msg_set_src_droppable(struct tipc_msg *m, u32 d) +{ + msg_set_bits(m, 0, 18, 1, d); +} + +static inline void 
msg_set_size(struct tipc_msg *m, u32 sz) +{ + m->hdr[0] = htonl((msg_word(m, 0) & ~0x1ffff) | sz); +} + + +/* + * Word 1 + */ + +static inline u32 msg_type(struct tipc_msg *m) +{ + return msg_bits(m, 1, 29, 0x7); +} + +static inline void msg_set_type(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 1, 29, 0x7, n); +} + +static inline u32 msg_named(struct tipc_msg *m) +{ + return msg_type(m) == TIPC_NAMED_MSG; +} + +static inline u32 msg_mcast(struct tipc_msg *m) +{ + return msg_type(m) == TIPC_MCAST_MSG; +} + +static inline u32 msg_connected(struct tipc_msg *m) +{ + return msg_type(m) == TIPC_CONN_MSG; +} + +static inline u32 msg_errcode(struct tipc_msg *m) +{ + return msg_bits(m, 1, 25, 0xf); +} + +static inline void msg_set_errcode(struct tipc_msg *m, u32 err) +{ + msg_set_bits(m, 1, 25, 0xf, err); +} + +static inline u32 msg_reroute_cnt(struct tipc_msg *m) +{ + return msg_bits(m, 1, 21, 0xf); +} + +static inline void msg_incr_reroute_cnt(struct tipc_msg *m) +{ + msg_set_bits(m, 1, 21, 0xf, msg_reroute_cnt(m) + 1); +} + +static inline void msg_reset_reroute_cnt(struct tipc_msg *m) +{ + msg_set_bits(m, 1, 21, 0xf, 0); +} + +static inline u32 msg_lookup_scope(struct tipc_msg *m) +{ + return msg_bits(m, 1, 19, 0x3); +} + +static inline void msg_set_lookup_scope(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 1, 19, 0x3, n); +} + +static inline u32 msg_bcast_ack(struct tipc_msg *m) +{ + return msg_bits(m, 1, 0, 0xffff); +} + +static inline void msg_set_bcast_ack(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 1, 0, 0xffff, n); +} + + +/* + * Word 2 + */ + +static inline u32 msg_ack(struct tipc_msg *m) +{ + return msg_bits(m, 2, 16, 0xffff); +} + +static inline void msg_set_ack(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 2, 16, 0xffff, n); +} + +static inline u32 msg_seqno(struct tipc_msg *m) +{ + return msg_bits(m, 2, 0, 0xffff); +} + +static inline void msg_set_seqno(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 2, 0, 0xffff, n); +} + +/* + * TIPC may utilize the "link ack #" and "link seq #" fields of a short + * message header to hold the destination node for the message, since the + * normal "dest node" field isn't present. This cache is only referenced + * when required, so populating the cache of a longer message header is + * harmless (as long as the header has the two link sequence fields present). + * + * Note: Host byte order is OK here, since the info never goes off-card. 
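All of the accessors above reduce to the same pack/unpack arithmetic on 32-bit header words. The standalone sketch below (not part of the patch) repeats that arithmetic on a host-order word for word 0 -- version, user/importance, header size in 4-byte units, and the 17-bit message size -- using made-up field values; the in-kernel msg_bits()/msg_set_bits() additionally convert with htonl()/ntohl() because the header words are kept in network byte order.

#include <stdio.h>

static unsigned int get_bits(unsigned int w, int pos, unsigned int mask)
{
	return (w >> pos) & mask;
}

static unsigned int set_bits(unsigned int w, int pos, unsigned int mask,
			     unsigned int val)
{
	return (w & ~(mask << pos)) | ((val & mask) << pos);
}

int main(void)
{
	unsigned int w0 = 0;

	w0 = set_bits(w0, 29, 0x7, 2);		/* version = TIPC_VERSION (2) */
	w0 = set_bits(w0, 25, 0xf, 0);		/* user = lowest payload importance */
	w0 = set_bits(w0, 21, 0xf, 40 >> 2);	/* 40-byte header, stored in units of 4 */
	w0 = set_bits(w0, 0, 0x1ffff, 140);	/* total message size in bytes */

	printf("version=%u hdr_sz=%u size=%u\n",
	       get_bits(w0, 29, 0x7),
	       get_bits(w0, 21, 0xf) << 2,
	       get_bits(w0, 0, 0x1ffff));
	return 0;
}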
+ */ + +static inline u32 msg_destnode_cache(struct tipc_msg *m) +{ + return m->hdr[2]; +} + +static inline void msg_set_destnode_cache(struct tipc_msg *m, u32 dnode) +{ + m->hdr[2] = dnode; +} + +/* + * Words 3-10 + */ + + +static inline u32 msg_prevnode(struct tipc_msg *m) +{ + return msg_word(m, 3); +} + +static inline void msg_set_prevnode(struct tipc_msg *m, u32 a) +{ + msg_set_word(m, 3, a); +} + +static inline u32 msg_origport(struct tipc_msg *m) +{ + return msg_word(m, 4); +} + +static inline void msg_set_origport(struct tipc_msg *m, u32 p) +{ + msg_set_word(m, 4, p); +} + +static inline u32 msg_destport(struct tipc_msg *m) +{ + return msg_word(m, 5); +} + +static inline void msg_set_destport(struct tipc_msg *m, u32 p) +{ + msg_set_word(m, 5, p); +} + +static inline u32 msg_mc_netid(struct tipc_msg *m) +{ + return msg_word(m, 5); +} + +static inline void msg_set_mc_netid(struct tipc_msg *m, u32 p) +{ + msg_set_word(m, 5, p); +} + +static inline int msg_short(struct tipc_msg *m) +{ + return msg_hdr_sz(m) == 24; +} + +static inline u32 msg_orignode(struct tipc_msg *m) +{ + if (likely(msg_short(m))) + return msg_prevnode(m); + return msg_word(m, 6); +} + +static inline void msg_set_orignode(struct tipc_msg *m, u32 a) +{ + msg_set_word(m, 6, a); +} + +static inline u32 msg_destnode(struct tipc_msg *m) +{ + return msg_word(m, 7); +} + +static inline void msg_set_destnode(struct tipc_msg *m, u32 a) +{ + msg_set_word(m, 7, a); +} + +static inline int msg_is_dest(struct tipc_msg *m, u32 d) +{ + return msg_short(m) || (msg_destnode(m) == d); +} + +static inline u32 msg_nametype(struct tipc_msg *m) +{ + return msg_word(m, 8); +} + +static inline void msg_set_nametype(struct tipc_msg *m, u32 n) +{ + msg_set_word(m, 8, n); +} + +static inline u32 msg_nameinst(struct tipc_msg *m) +{ + return msg_word(m, 9); +} + +static inline u32 msg_namelower(struct tipc_msg *m) +{ + return msg_nameinst(m); +} + +static inline void msg_set_namelower(struct tipc_msg *m, u32 n) +{ + msg_set_word(m, 9, n); +} + +static inline void msg_set_nameinst(struct tipc_msg *m, u32 n) +{ + msg_set_namelower(m, n); +} + +static inline u32 msg_nameupper(struct tipc_msg *m) +{ + return msg_word(m, 10); +} + +static inline void msg_set_nameupper(struct tipc_msg *m, u32 n) +{ + msg_set_word(m, 10, n); +} + +static inline unchar *msg_data(struct tipc_msg *m) +{ + return ((unchar *)m) + msg_hdr_sz(m); +} + +static inline struct tipc_msg *msg_get_wrapped(struct tipc_msg *m) +{ + return (struct tipc_msg *)msg_data(m); +} + + +/* + * Constants and routines used to read and write TIPC internal message headers + */ + +/* + * Internal message users + */ + +#define BCAST_PROTOCOL 5 +#define MSG_BUNDLER 6 +#define LINK_PROTOCOL 7 +#define CONN_MANAGER 8 +#define ROUTE_DISTRIBUTOR 9 /* obsoleted */ +#define CHANGEOVER_PROTOCOL 10 +#define NAME_DISTRIBUTOR 11 +#define MSG_FRAGMENTER 12 +#define LINK_CONFIG 13 + +/* + * Connection management protocol message types + */ + +#define CONN_PROBE 0 +#define CONN_PROBE_REPLY 1 +#define CONN_ACK 2 + +/* + * Name distributor message types + */ + +#define PUBLICATION 0 +#define WITHDRAWAL 1 + +/* + * Segmentation message types + */ + +#define FIRST_FRAGMENT 0 +#define FRAGMENT 1 +#define LAST_FRAGMENT 2 + +/* + * Link management protocol message types + */ + +#define STATE_MSG 0 +#define RESET_MSG 1 +#define ACTIVATE_MSG 2 + +/* + * Changeover tunnel message types + */ +#define DUPLICATE_MSG 0 +#define ORIGINAL_MSG 1 + +/* + * Config protocol message types + */ + +#define DSC_REQ_MSG 0 +#define 
DSC_RESP_MSG 1 + + +/* + * Word 1 + */ + +static inline u32 msg_seq_gap(struct tipc_msg *m) +{ + return msg_bits(m, 1, 16, 0x1fff); +} + +static inline void msg_set_seq_gap(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 1, 16, 0x1fff, n); +} + + +/* + * Word 2 + */ + +static inline u32 msg_dest_domain(struct tipc_msg *m) +{ + return msg_word(m, 2); +} + +static inline void msg_set_dest_domain(struct tipc_msg *m, u32 n) +{ + msg_set_word(m, 2, n); +} + +static inline u32 msg_bcgap_after(struct tipc_msg *m) +{ + return msg_bits(m, 2, 16, 0xffff); +} + +static inline void msg_set_bcgap_after(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 2, 16, 0xffff, n); +} + +static inline u32 msg_bcgap_to(struct tipc_msg *m) +{ + return msg_bits(m, 2, 0, 0xffff); +} + +static inline void msg_set_bcgap_to(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 2, 0, 0xffff, n); +} + + +/* + * Word 4 + */ + +static inline u32 msg_last_bcast(struct tipc_msg *m) +{ + return msg_bits(m, 4, 16, 0xffff); +} + +static inline void msg_set_last_bcast(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 4, 16, 0xffff, n); +} + + +static inline u32 msg_fragm_no(struct tipc_msg *m) +{ + return msg_bits(m, 4, 16, 0xffff); +} + +static inline void msg_set_fragm_no(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 4, 16, 0xffff, n); +} + + +static inline u32 msg_next_sent(struct tipc_msg *m) +{ + return msg_bits(m, 4, 0, 0xffff); +} + +static inline void msg_set_next_sent(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 4, 0, 0xffff, n); +} + + +static inline u32 msg_long_msgno(struct tipc_msg *m) +{ + return msg_bits(m, 4, 0, 0xffff); +} + +static inline void msg_set_long_msgno(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 4, 0, 0xffff, n); +} + +static inline u32 msg_bc_netid(struct tipc_msg *m) +{ + return msg_word(m, 4); +} + +static inline void msg_set_bc_netid(struct tipc_msg *m, u32 id) +{ + msg_set_word(m, 4, id); +} + +static inline u32 msg_link_selector(struct tipc_msg *m) +{ + return msg_bits(m, 4, 0, 1); +} + +static inline void msg_set_link_selector(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 4, 0, 1, (n & 1)); +} + +/* + * Word 5 + */ + +static inline u32 msg_session(struct tipc_msg *m) +{ + return msg_bits(m, 5, 16, 0xffff); +} + +static inline void msg_set_session(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 5, 16, 0xffff, n); +} + +static inline u32 msg_probe(struct tipc_msg *m) +{ + return msg_bits(m, 5, 0, 1); +} + +static inline void msg_set_probe(struct tipc_msg *m, u32 val) +{ + msg_set_bits(m, 5, 0, 1, (val & 1)); +} + +static inline char msg_net_plane(struct tipc_msg *m) +{ + return msg_bits(m, 5, 1, 7) + 'A'; +} + +static inline void msg_set_net_plane(struct tipc_msg *m, char n) +{ + msg_set_bits(m, 5, 1, 7, (n - 'A')); +} + +static inline u32 msg_linkprio(struct tipc_msg *m) +{ + return msg_bits(m, 5, 4, 0x1f); +} + +static inline void msg_set_linkprio(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 5, 4, 0x1f, n); +} + +static inline u32 msg_bearer_id(struct tipc_msg *m) +{ + return msg_bits(m, 5, 9, 0x7); +} + +static inline void msg_set_bearer_id(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 5, 9, 0x7, n); +} + +static inline u32 msg_redundant_link(struct tipc_msg *m) +{ + return msg_bits(m, 5, 12, 0x1); +} + +static inline void msg_set_redundant_link(struct tipc_msg *m, u32 r) +{ + msg_set_bits(m, 5, 12, 0x1, r); +} + + +/* + * Word 9 + */ + +static inline u32 msg_msgcnt(struct tipc_msg *m) +{ + return msg_bits(m, 9, 16, 0xffff); +} + +static inline void msg_set_msgcnt(struct tipc_msg *m, u32 
n) +{ + msg_set_bits(m, 9, 16, 0xffff, n); +} + +static inline u32 msg_bcast_tag(struct tipc_msg *m) +{ + return msg_bits(m, 9, 16, 0xffff); +} + +static inline void msg_set_bcast_tag(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 9, 16, 0xffff, n); +} + +static inline u32 msg_max_pkt(struct tipc_msg *m) +{ + return msg_bits(m, 9, 16, 0xffff) * 4; +} + +static inline void msg_set_max_pkt(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 9, 16, 0xffff, (n / 4)); +} + +static inline u32 msg_link_tolerance(struct tipc_msg *m) +{ + return msg_bits(m, 9, 0, 0xffff); +} + +static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 9, 0, 0xffff, n); +} + +u32 tipc_msg_tot_importance(struct tipc_msg *m); +void tipc_msg_init(struct tipc_msg *m, u32 user, u32 type, + u32 hsize, u32 destnode); +int tipc_msg_build(struct tipc_msg *hdr, struct iovec const *msg_sect, + u32 num_sect, unsigned int total_len, + int max_size, int usrmem, struct sk_buff **buf); + +static inline void msg_set_media_addr(struct tipc_msg *m, struct tipc_media_addr *a) +{ + memcpy(&((int *)m)[5], a, sizeof(*a)); +} + +static inline void msg_get_media_addr(struct tipc_msg *m, struct tipc_media_addr *a) +{ + memcpy(a, &((int *)m)[5], sizeof(*a)); +} + +#endif diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c new file mode 100644 index 00000000..80025a1b --- /dev/null +++ b/net/tipc/name_distr.c @@ -0,0 +1,319 @@ +/* + * net/tipc/name_distr.c: TIPC name distribution code + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2005, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
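Note that msg_max_pkt()/msg_set_max_pkt() above keep the value in 4-byte units inside a 16-bit field, so a stored maximum packet size is implicitly rounded down to a multiple of 4. A trivial standalone illustration, not part of the patch (1518 is just a sample value):

#include <stdio.h>

int main(void)
{
	unsigned int requested = 1518;		/* sample value only */
	unsigned int stored = requested / 4;	/* what the 16-bit field holds */
	unsigned int readback = stored * 4;	/* what msg_max_pkt() returns */

	printf("requested %u -> read back %u (field maximum %u bytes)\n",
	       requested, readback, 0xffffu * 4);
	return 0;
}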
+ */ + +#include "core.h" +#include "link.h" +#include "name_distr.h" + +#define ITEM_SIZE sizeof(struct distr_item) + +/** + * struct distr_item - publication info distributed to other nodes + * @type: name sequence type + * @lower: name sequence lower bound + * @upper: name sequence upper bound + * @ref: publishing port reference + * @key: publication key + * + * ===> All fields are stored in network byte order. <=== + * + * First 3 fields identify (name or) name sequence being published. + * Reference field uniquely identifies port that published name sequence. + * Key field uniquely identifies publication, in the event a port has + * multiple publications of the same name sequence. + * + * Note: There is no field that identifies the publishing node because it is + * the same for all items contained within a publication message. + */ + +struct distr_item { + __be32 type; + __be32 lower; + __be32 upper; + __be32 ref; + __be32 key; +}; + +/** + * List of externally visible publications by this node -- + * that is, all publications having scope > TIPC_NODE_SCOPE. + */ + +static LIST_HEAD(publ_root); +static u32 publ_cnt; + +/** + * publ_to_item - add publication info to a publication message + */ + +static void publ_to_item(struct distr_item *i, struct publication *p) +{ + i->type = htonl(p->type); + i->lower = htonl(p->lower); + i->upper = htonl(p->upper); + i->ref = htonl(p->ref); + i->key = htonl(p->key); +} + +/** + * named_prepare_buf - allocate & initialize a publication message + */ + +static struct sk_buff *named_prepare_buf(u32 type, u32 size, u32 dest) +{ + struct sk_buff *buf = tipc_buf_acquire(LONG_H_SIZE + size); + struct tipc_msg *msg; + + if (buf != NULL) { + msg = buf_msg(buf); + tipc_msg_init(msg, NAME_DISTRIBUTOR, type, LONG_H_SIZE, dest); + msg_set_size(msg, LONG_H_SIZE + size); + } + return buf; +} + +static void named_cluster_distribute(struct sk_buff *buf) +{ + struct sk_buff *buf_copy; + struct tipc_node *n_ptr; + + list_for_each_entry(n_ptr, &tipc_node_list, list) { + if (tipc_node_active_links(n_ptr)) { + buf_copy = skb_copy(buf, GFP_ATOMIC); + if (!buf_copy) + break; + msg_set_destnode(buf_msg(buf_copy), n_ptr->addr); + tipc_link_send(buf_copy, n_ptr->addr, n_ptr->addr); + } + } + + buf_discard(buf); +} + +/** + * tipc_named_publish - tell other nodes about a new publication by this node + */ + +void tipc_named_publish(struct publication *publ) +{ + struct sk_buff *buf; + struct distr_item *item; + + list_add_tail(&publ->local_list, &publ_root); + publ_cnt++; + + buf = named_prepare_buf(PUBLICATION, ITEM_SIZE, 0); + if (!buf) { + warn("Publication distribution failure\n"); + return; + } + + item = (struct distr_item *)msg_data(buf_msg(buf)); + publ_to_item(item, publ); + named_cluster_distribute(buf); +} + +/** + * tipc_named_withdraw - tell other nodes about a withdrawn publication by this node + */ + +void tipc_named_withdraw(struct publication *publ) +{ + struct sk_buff *buf; + struct distr_item *item; + + list_del(&publ->local_list); + publ_cnt--; + + buf = named_prepare_buf(WITHDRAWAL, ITEM_SIZE, 0); + if (!buf) { + warn("Withdrawal distribution failure\n"); + return; + } + + item = (struct distr_item *)msg_data(buf_msg(buf)); + publ_to_item(item, publ); + named_cluster_distribute(buf); +} + +/** + * tipc_named_node_up - tell specified node about all publications by this node + */ + +void tipc_named_node_up(unsigned long node) +{ + struct publication *publ; + struct distr_item *item = NULL; + struct sk_buff *buf = NULL; + u32 left = 0; + u32 rest; + u32 
max_item_buf; + + read_lock_bh(&tipc_nametbl_lock); + max_item_buf = TIPC_MAX_USER_MSG_SIZE / ITEM_SIZE; + max_item_buf *= ITEM_SIZE; + rest = publ_cnt * ITEM_SIZE; + + list_for_each_entry(publ, &publ_root, local_list) { + if (!buf) { + left = (rest <= max_item_buf) ? rest : max_item_buf; + rest -= left; + buf = named_prepare_buf(PUBLICATION, left, node); + if (!buf) { + warn("Bulk publication distribution failure\n"); + goto exit; + } + item = (struct distr_item *)msg_data(buf_msg(buf)); + } + publ_to_item(item, publ); + item++; + left -= ITEM_SIZE; + if (!left) { + msg_set_link_selector(buf_msg(buf), node); + tipc_link_send(buf, node, node); + buf = NULL; + } + } +exit: + read_unlock_bh(&tipc_nametbl_lock); +} + +/** + * named_purge_publ - remove publication associated with a failed node + * + * Invoked for each publication issued by a newly failed node. + * Removes publication structure from name table & deletes it. + * In rare cases the link may have come back up again when this + * function is called, and we have two items representing the same + * publication. Nudge this item's key to distinguish it from the other. + */ + +static void named_purge_publ(struct publication *publ) +{ + struct publication *p; + + write_lock_bh(&tipc_nametbl_lock); + publ->key += 1222345; + p = tipc_nametbl_remove_publ(publ->type, publ->lower, + publ->node, publ->ref, publ->key); + if (p) + tipc_nodesub_unsubscribe(&p->subscr); + write_unlock_bh(&tipc_nametbl_lock); + + if (p != publ) { + err("Unable to remove publication from failed node\n" + "(type=%u, lower=%u, node=0x%x, ref=%u, key=%u)\n", + publ->type, publ->lower, publ->node, publ->ref, publ->key); + } + + kfree(p); +} + +/** + * tipc_named_recv - process name table update message sent by another node + */ + +void tipc_named_recv(struct sk_buff *buf) +{ + struct publication *publ; + struct tipc_msg *msg = buf_msg(buf); + struct distr_item *item = (struct distr_item *)msg_data(msg); + u32 count = msg_data_sz(msg) / ITEM_SIZE; + + write_lock_bh(&tipc_nametbl_lock); + while (count--) { + if (msg_type(msg) == PUBLICATION) { + publ = tipc_nametbl_insert_publ(ntohl(item->type), + ntohl(item->lower), + ntohl(item->upper), + TIPC_CLUSTER_SCOPE, + msg_orignode(msg), + ntohl(item->ref), + ntohl(item->key)); + if (publ) { + tipc_nodesub_subscribe(&publ->subscr, + msg_orignode(msg), + publ, + (net_ev_handler) + named_purge_publ); + } + } else if (msg_type(msg) == WITHDRAWAL) { + publ = tipc_nametbl_remove_publ(ntohl(item->type), + ntohl(item->lower), + msg_orignode(msg), + ntohl(item->ref), + ntohl(item->key)); + + if (publ) { + tipc_nodesub_unsubscribe(&publ->subscr); + kfree(publ); + } else { + err("Unable to remove publication by node 0x%x\n" + "(type=%u, lower=%u, ref=%u, key=%u)\n", + msg_orignode(msg), + ntohl(item->type), ntohl(item->lower), + ntohl(item->ref), ntohl(item->key)); + } + } else { + warn("Unrecognized name table message received\n"); + } + item++; + } + write_unlock_bh(&tipc_nametbl_lock); + buf_discard(buf); +} + +/** + * tipc_named_reinit - re-initialize local publication list + * + * This routine is called whenever TIPC networking is (re)enabled. + * All existing publications by this node that have "cluster" or "zone" scope + * are updated to reflect the node's current network address. + * (If the node's address is unchanged, the update loop terminates immediately.) 
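The bulk update in tipc_named_node_up() above first rounds the per-message budget down to a whole number of distribution items and then peels off at most that many bytes of items per PUBLICATION message. A standalone sketch of that slicing arithmetic, not part of the patch (66000 is only a stand-in for TIPC_MAX_USER_MSG_SIZE, 20 bytes matches the five __be32 fields of struct distr_item, and 12345 publications is an arbitrary example):

#include <stdio.h>

int main(void)
{
	const unsigned int item_size = 20;		/* sizeof(struct distr_item) */
	const unsigned int max_user_msg = 66000;	/* stand-in message limit */
	unsigned int max_item_buf = (max_user_msg / item_size) * item_size;
	unsigned int rest = 12345 * item_size;		/* bytes of items to send */
	unsigned int msgs = 0;

	while (rest) {
		unsigned int left = rest <= max_item_buf ? rest : max_item_buf;

		rest -= left;
		msgs++;		/* one PUBLICATION message carrying 'left' bytes */
	}
	printf("bulk update needs %u messages of at most %u bytes of items\n",
	       msgs, max_item_buf);
	return 0;
}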
+ */ + +void tipc_named_reinit(void) +{ + struct publication *publ; + + write_lock_bh(&tipc_nametbl_lock); + list_for_each_entry(publ, &publ_root, local_list) { + if (publ->node == tipc_own_addr) + break; + publ->node = tipc_own_addr; + } + write_unlock_bh(&tipc_nametbl_lock); +} diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h new file mode 100644 index 00000000..1e41bdd4 --- /dev/null +++ b/net/tipc/name_distr.h @@ -0,0 +1,48 @@ +/* + * net/tipc/name_distr.h: Include file for TIPC name distribution code + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_NAME_DISTR_H +#define _TIPC_NAME_DISTR_H + +#include "name_table.h" + +void tipc_named_publish(struct publication *publ); +void tipc_named_withdraw(struct publication *publ); +void tipc_named_node_up(unsigned long node); +void tipc_named_recv(struct sk_buff *buf); +void tipc_named_reinit(void); + +#endif diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c new file mode 100644 index 00000000..205ed4a4 --- /dev/null +++ b/net/tipc/name_table.c @@ -0,0 +1,1037 @@ +/* + * net/tipc/name_table.c: TIPC name table code + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2004-2008, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "config.h" +#include "name_table.h" +#include "name_distr.h" +#include "subscr.h" +#include "port.h" + +static int tipc_nametbl_size = 1024; /* must be a power of 2 */ + +/** + * struct sub_seq - container for all published instances of a name sequence + * @lower: name sequence lower bound + * @upper: name sequence upper bound + * @node_list: circular list of publications made by own node + * @cluster_list: circular list of publications made by own cluster + * @zone_list: circular list of publications made by own zone + * @node_list_size: number of entries in "node_list" + * @cluster_list_size: number of entries in "cluster_list" + * @zone_list_size: number of entries in "zone_list" + * + * Note: The zone list always contains at least one entry, since all + * publications of the associated name sequence belong to it. + * (The cluster and node lists may be empty.) 
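The node/cluster/zone lists described below are kept as circular, singly linked lists: a lone publication points at itself, later publications are linked in directly after the current head, and the translation code can round-robin simply by advancing the head pointer. A standalone sketch of that list discipline, not part of the patch (struct publ and the ref values are invented for the example):

#include <stdio.h>

struct publ {
	int ref;
	struct publ *next;
};

static void insert(struct publ **head, struct publ *p)
{
	if (!*head) {
		*head = p->next = p;		/* single-entry ring */
	} else {
		p->next = (*head)->next;	/* link in after the head */
		(*head)->next = p;
	}
}

int main(void)
{
	struct publ a = { .ref = 1 }, b = { .ref = 2 }, c = { .ref = 3 };
	struct publ *ring = NULL;
	struct publ *p;

	insert(&ring, &a);
	insert(&ring, &b);
	insert(&ring, &c);

	p = ring;			/* walk the ring exactly once */
	do {
		printf("ref %d\n", p->ref);
		p = p->next;
	} while (p != ring);
	return 0;
}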
+ */ + +struct sub_seq { + u32 lower; + u32 upper; + struct publication *node_list; + struct publication *cluster_list; + struct publication *zone_list; + u32 node_list_size; + u32 cluster_list_size; + u32 zone_list_size; +}; + +/** + * struct name_seq - container for all published instances of a name type + * @type: 32 bit 'type' value for name sequence + * @sseq: pointer to dynamically-sized array of sub-sequences of this 'type'; + * sub-sequences are sorted in ascending order + * @alloc: number of sub-sequences currently in array + * @first_free: array index of first unused sub-sequence entry + * @ns_list: links to adjacent name sequences in hash chain + * @subscriptions: list of subscriptions for this 'type' + * @lock: spinlock controlling access to publication lists of all sub-sequences + */ + +struct name_seq { + u32 type; + struct sub_seq *sseqs; + u32 alloc; + u32 first_free; + struct hlist_node ns_list; + struct list_head subscriptions; + spinlock_t lock; +}; + +/** + * struct name_table - table containing all existing port name publications + * @types: pointer to fixed-sized array of name sequence lists, + * accessed via hashing on 'type'; name sequence lists are *not* sorted + * @local_publ_count: number of publications issued by this node + */ + +struct name_table { + struct hlist_head *types; + u32 local_publ_count; +}; + +static struct name_table table; +static atomic_t rsv_publ_ok = ATOMIC_INIT(0); +DEFINE_RWLOCK(tipc_nametbl_lock); + + +static int hash(int x) +{ + return x & (tipc_nametbl_size - 1); +} + +/** + * publ_create - create a publication structure + */ + +static struct publication *publ_create(u32 type, u32 lower, u32 upper, + u32 scope, u32 node, u32 port_ref, + u32 key) +{ + struct publication *publ = kzalloc(sizeof(*publ), GFP_ATOMIC); + if (publ == NULL) { + warn("Publication creation failure, no memory\n"); + return NULL; + } + + publ->type = type; + publ->lower = lower; + publ->upper = upper; + publ->scope = scope; + publ->node = node; + publ->ref = port_ref; + publ->key = key; + INIT_LIST_HEAD(&publ->local_list); + INIT_LIST_HEAD(&publ->pport_list); + INIT_LIST_HEAD(&publ->subscr.nodesub_list); + return publ; +} + +/** + * tipc_subseq_alloc - allocate a specified number of sub-sequence structures + */ + +static struct sub_seq *tipc_subseq_alloc(u32 cnt) +{ + struct sub_seq *sseq = kcalloc(cnt, sizeof(struct sub_seq), GFP_ATOMIC); + return sseq; +} + +/** + * tipc_nameseq_create - create a name sequence structure for the specified 'type' + * + * Allocates a single sub-sequence structure and sets it to all 0's. + */ + +static struct name_seq *tipc_nameseq_create(u32 type, struct hlist_head *seq_head) +{ + struct name_seq *nseq = kzalloc(sizeof(*nseq), GFP_ATOMIC); + struct sub_seq *sseq = tipc_subseq_alloc(1); + + if (!nseq || !sseq) { + warn("Name sequence creation failed, no memory\n"); + kfree(nseq); + kfree(sseq); + return NULL; + } + + spin_lock_init(&nseq->lock); + nseq->type = type; + nseq->sseqs = sseq; + nseq->alloc = 1; + INIT_HLIST_NODE(&nseq->ns_list); + INIT_LIST_HEAD(&nseq->subscriptions); + hlist_add_head(&nseq->ns_list, seq_head); + return nseq; +} + +/** + * nameseq_find_subseq - find sub-sequence (if any) matching a name instance + * + * Very time-critical, so binary searches through sub-sequence array. 
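Because the sub-sequences are kept sorted and non-overlapping, both lookups below are plain binary searches over [lower, upper] ranges; the only difference is what happens on a miss. A standalone sketch, not part of the patch (the sample ranges are made up): find() mirrors nameseq_find_subseq() and reports failure, while locate() mirrors nameseq_locate_subseq() and falls through to the index where a new range would be inserted.

#include <stdio.h>

struct range { unsigned int lower, upper; };

static int find(const struct range *r, int n, unsigned int inst)
{
	int low = 0, high = n - 1;

	while (low <= high) {
		int mid = (low + high) / 2;

		if (inst < r[mid].lower)
			high = mid - 1;
		else if (inst > r[mid].upper)
			low = mid + 1;
		else
			return mid;	/* inst falls inside this range */
	}
	return -1;			/* no range contains inst */
}

static int locate(const struct range *r, int n, unsigned int inst)
{
	int low = 0, high = n - 1;

	while (low <= high) {
		int mid = (low + high) / 2;

		if (inst < r[mid].lower)
			high = mid - 1;
		else if (inst > r[mid].upper)
			low = mid + 1;
		else
			return mid;
	}
	return low;	/* insertion point for a new, non-overlapping range */
}

int main(void)
{
	const struct range seq[] = { { 0, 9 }, { 100, 199 }, { 500, 500 } };

	printf("find(150)=%d find(42)=%d locate(42)=%d\n",
	       find(seq, 3, 150), find(seq, 3, 42), locate(seq, 3, 42));
	return 0;
}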
+ */ + +static struct sub_seq *nameseq_find_subseq(struct name_seq *nseq, + u32 instance) +{ + struct sub_seq *sseqs = nseq->sseqs; + int low = 0; + int high = nseq->first_free - 1; + int mid; + + while (low <= high) { + mid = (low + high) / 2; + if (instance < sseqs[mid].lower) + high = mid - 1; + else if (instance > sseqs[mid].upper) + low = mid + 1; + else + return &sseqs[mid]; + } + return NULL; +} + +/** + * nameseq_locate_subseq - determine position of name instance in sub-sequence + * + * Returns index in sub-sequence array of the entry that contains the specified + * instance value; if no entry contains that value, returns the position + * where a new entry for it would be inserted in the array. + * + * Note: Similar to binary search code for locating a sub-sequence. + */ + +static u32 nameseq_locate_subseq(struct name_seq *nseq, u32 instance) +{ + struct sub_seq *sseqs = nseq->sseqs; + int low = 0; + int high = nseq->first_free - 1; + int mid; + + while (low <= high) { + mid = (low + high) / 2; + if (instance < sseqs[mid].lower) + high = mid - 1; + else if (instance > sseqs[mid].upper) + low = mid + 1; + else + return mid; + } + return low; +} + +/** + * tipc_nameseq_insert_publ - + */ + +static struct publication *tipc_nameseq_insert_publ(struct name_seq *nseq, + u32 type, u32 lower, u32 upper, + u32 scope, u32 node, u32 port, u32 key) +{ + struct subscription *s; + struct subscription *st; + struct publication *publ; + struct sub_seq *sseq; + int created_subseq = 0; + + sseq = nameseq_find_subseq(nseq, lower); + if (sseq) { + + /* Lower end overlaps existing entry => need an exact match */ + + if ((sseq->lower != lower) || (sseq->upper != upper)) { + warn("Cannot publish {%u,%u,%u}, overlap error\n", + type, lower, upper); + return NULL; + } + } else { + u32 inspos; + struct sub_seq *freesseq; + + /* Find where lower end should be inserted */ + + inspos = nameseq_locate_subseq(nseq, lower); + + /* Fail if upper end overlaps into an existing entry */ + + if ((inspos < nseq->first_free) && + (upper >= nseq->sseqs[inspos].lower)) { + warn("Cannot publish {%u,%u,%u}, overlap error\n", + type, lower, upper); + return NULL; + } + + /* Ensure there is space for new sub-sequence */ + + if (nseq->first_free == nseq->alloc) { + struct sub_seq *sseqs = tipc_subseq_alloc(nseq->alloc * 2); + + if (!sseqs) { + warn("Cannot publish {%u,%u,%u}, no memory\n", + type, lower, upper); + return NULL; + } + memcpy(sseqs, nseq->sseqs, + nseq->alloc * sizeof(struct sub_seq)); + kfree(nseq->sseqs); + nseq->sseqs = sseqs; + nseq->alloc *= 2; + } + + /* Insert new sub-sequence */ + + sseq = &nseq->sseqs[inspos]; + freesseq = &nseq->sseqs[nseq->first_free]; + memmove(sseq + 1, sseq, (freesseq - sseq) * sizeof(*sseq)); + memset(sseq, 0, sizeof(*sseq)); + nseq->first_free++; + sseq->lower = lower; + sseq->upper = upper; + created_subseq = 1; + } + + /* Insert a publication: */ + + publ = publ_create(type, lower, upper, scope, node, port, key); + if (!publ) + return NULL; + + sseq->zone_list_size++; + if (!sseq->zone_list) + sseq->zone_list = publ->zone_list_next = publ; + else { + publ->zone_list_next = sseq->zone_list->zone_list_next; + sseq->zone_list->zone_list_next = publ; + } + + if (in_own_cluster(node)) { + sseq->cluster_list_size++; + if (!sseq->cluster_list) + sseq->cluster_list = publ->cluster_list_next = publ; + else { + publ->cluster_list_next = + sseq->cluster_list->cluster_list_next; + sseq->cluster_list->cluster_list_next = publ; + } + } + + if (node == tipc_own_addr) { + 
sseq->node_list_size++; + if (!sseq->node_list) + sseq->node_list = publ->node_list_next = publ; + else { + publ->node_list_next = sseq->node_list->node_list_next; + sseq->node_list->node_list_next = publ; + } + } + + /* + * Any subscriptions waiting for notification? + */ + list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { + tipc_subscr_report_overlap(s, + publ->lower, + publ->upper, + TIPC_PUBLISHED, + publ->ref, + publ->node, + created_subseq); + } + return publ; +} + +/** + * tipc_nameseq_remove_publ - + * + * NOTE: There may be cases where TIPC is asked to remove a publication + * that is not in the name table. For example, if another node issues a + * publication for a name sequence that overlaps an existing name sequence + * the publication will not be recorded, which means the publication won't + * be found when the name sequence is later withdrawn by that node. + * A failed withdraw request simply returns a failure indication and lets the + * caller issue any error or warning messages associated with such a problem. + */ + +static struct publication *tipc_nameseq_remove_publ(struct name_seq *nseq, u32 inst, + u32 node, u32 ref, u32 key) +{ + struct publication *publ; + struct publication *curr; + struct publication *prev; + struct sub_seq *sseq = nameseq_find_subseq(nseq, inst); + struct sub_seq *free; + struct subscription *s, *st; + int removed_subseq = 0; + + if (!sseq) + return NULL; + + /* Remove publication from zone scope list */ + + prev = sseq->zone_list; + publ = sseq->zone_list->zone_list_next; + while ((publ->key != key) || (publ->ref != ref) || + (publ->node && (publ->node != node))) { + prev = publ; + publ = publ->zone_list_next; + if (prev == sseq->zone_list) { + + /* Prevent endless loop if publication not found */ + + return NULL; + } + } + if (publ != sseq->zone_list) + prev->zone_list_next = publ->zone_list_next; + else if (publ->zone_list_next != publ) { + prev->zone_list_next = publ->zone_list_next; + sseq->zone_list = publ->zone_list_next; + } else { + sseq->zone_list = NULL; + } + sseq->zone_list_size--; + + /* Remove publication from cluster scope list, if present */ + + if (in_own_cluster(node)) { + prev = sseq->cluster_list; + curr = sseq->cluster_list->cluster_list_next; + while (curr != publ) { + prev = curr; + curr = curr->cluster_list_next; + if (prev == sseq->cluster_list) { + + /* Prevent endless loop for malformed list */ + + err("Unable to de-list cluster publication\n" + "{%u%u}, node=0x%x, ref=%u, key=%u)\n", + publ->type, publ->lower, publ->node, + publ->ref, publ->key); + goto end_cluster; + } + } + if (publ != sseq->cluster_list) + prev->cluster_list_next = publ->cluster_list_next; + else if (publ->cluster_list_next != publ) { + prev->cluster_list_next = publ->cluster_list_next; + sseq->cluster_list = publ->cluster_list_next; + } else { + sseq->cluster_list = NULL; + } + sseq->cluster_list_size--; + } +end_cluster: + + /* Remove publication from node scope list, if present */ + + if (node == tipc_own_addr) { + prev = sseq->node_list; + curr = sseq->node_list->node_list_next; + while (curr != publ) { + prev = curr; + curr = curr->node_list_next; + if (prev == sseq->node_list) { + + /* Prevent endless loop for malformed list */ + + err("Unable to de-list node publication\n" + "{%u%u}, node=0x%x, ref=%u, key=%u)\n", + publ->type, publ->lower, publ->node, + publ->ref, publ->key); + goto end_node; + } + } + if (publ != sseq->node_list) + prev->node_list_next = publ->node_list_next; + else if (publ->node_list_next != publ) { + 
prev->node_list_next = publ->node_list_next; + sseq->node_list = publ->node_list_next; + } else { + sseq->node_list = NULL; + } + sseq->node_list_size--; + } +end_node: + + /* Contract subseq list if no more publications for that subseq */ + + if (!sseq->zone_list) { + free = &nseq->sseqs[nseq->first_free--]; + memmove(sseq, sseq + 1, (free - (sseq + 1)) * sizeof(*sseq)); + removed_subseq = 1; + } + + /* Notify any waiting subscriptions */ + + list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { + tipc_subscr_report_overlap(s, + publ->lower, + publ->upper, + TIPC_WITHDRAWN, + publ->ref, + publ->node, + removed_subseq); + } + + return publ; +} + +/** + * tipc_nameseq_subscribe: attach a subscription, and issue + * the prescribed number of events if there is any sub- + * sequence overlapping with the requested sequence + */ + +static void tipc_nameseq_subscribe(struct name_seq *nseq, struct subscription *s) +{ + struct sub_seq *sseq = nseq->sseqs; + + list_add(&s->nameseq_list, &nseq->subscriptions); + + if (!sseq) + return; + + while (sseq != &nseq->sseqs[nseq->first_free]) { + struct publication *zl = sseq->zone_list; + if (zl && tipc_subscr_overlap(s, sseq->lower, sseq->upper)) { + struct publication *crs = zl; + int must_report = 1; + + do { + tipc_subscr_report_overlap(s, + sseq->lower, + sseq->upper, + TIPC_PUBLISHED, + crs->ref, + crs->node, + must_report); + must_report = 0; + crs = crs->zone_list_next; + } while (crs != zl); + } + sseq++; + } +} + +static struct name_seq *nametbl_find_seq(u32 type) +{ + struct hlist_head *seq_head; + struct hlist_node *seq_node; + struct name_seq *ns; + + seq_head = &table.types[hash(type)]; + hlist_for_each_entry(ns, seq_node, seq_head, ns_list) { + if (ns->type == type) + return ns; + } + + return NULL; +}; + +struct publication *tipc_nametbl_insert_publ(u32 type, u32 lower, u32 upper, + u32 scope, u32 node, u32 port, u32 key) +{ + struct name_seq *seq = nametbl_find_seq(type); + + if (lower > upper) { + warn("Failed to publish illegal {%u,%u,%u}\n", + type, lower, upper); + return NULL; + } + + if (!seq) + seq = tipc_nameseq_create(type, &table.types[hash(type)]); + if (!seq) + return NULL; + + return tipc_nameseq_insert_publ(seq, type, lower, upper, + scope, node, port, key); +} + +struct publication *tipc_nametbl_remove_publ(u32 type, u32 lower, + u32 node, u32 ref, u32 key) +{ + struct publication *publ; + struct name_seq *seq = nametbl_find_seq(type); + + if (!seq) + return NULL; + + publ = tipc_nameseq_remove_publ(seq, lower, node, ref, key); + + if (!seq->first_free && list_empty(&seq->subscriptions)) { + hlist_del_init(&seq->ns_list); + kfree(seq->sseqs); + kfree(seq); + } + return publ; +} + +/* + * tipc_nametbl_translate - translate name to port id + * + * Note: on entry 'destnode' is the search domain used during translation; + * on exit it passes back the node address of the matching port (if any) + */ + +u32 tipc_nametbl_translate(u32 type, u32 instance, u32 *destnode) +{ + struct sub_seq *sseq; + struct publication *publ = NULL; + struct name_seq *seq; + u32 ref; + + if (!tipc_in_scope(*destnode, tipc_own_addr)) + return 0; + + read_lock_bh(&tipc_nametbl_lock); + seq = nametbl_find_seq(type); + if (unlikely(!seq)) + goto not_found; + sseq = nameseq_find_subseq(seq, instance); + if (unlikely(!sseq)) + goto not_found; + spin_lock_bh(&seq->lock); + + /* Closest-First Algorithm: */ + if (likely(!*destnode)) { + publ = sseq->node_list; + if (publ) { + sseq->node_list = publ->node_list_next; +found: + ref = publ->ref; + 
*destnode = publ->node; + spin_unlock_bh(&seq->lock); + read_unlock_bh(&tipc_nametbl_lock); + return ref; + } + publ = sseq->cluster_list; + if (publ) { + sseq->cluster_list = publ->cluster_list_next; + goto found; + } + publ = sseq->zone_list; + if (publ) { + sseq->zone_list = publ->zone_list_next; + goto found; + } + } +
+ /* Round-Robin Algorithm: */ + else if (*destnode == tipc_own_addr) { + publ = sseq->node_list; + if (publ) { + sseq->node_list = publ->node_list_next; + goto found; + } + } else if (in_own_cluster(*destnode)) { + publ = sseq->cluster_list; + if (publ) { + sseq->cluster_list = publ->cluster_list_next; + goto found; + } + } else { + publ = sseq->zone_list; + if (publ) { + sseq->zone_list = publ->zone_list_next; + goto found; + } + } + spin_unlock_bh(&seq->lock); +not_found: + read_unlock_bh(&tipc_nametbl_lock); + return 0; +} +
+/** + * tipc_nametbl_mc_translate - find multicast destinations + * + * Creates list of all local ports that overlap the given multicast address; + * also determines if any off-node ports overlap. + * + * Note: Publications with a scope narrower than 'limit' are ignored. + * (i.e. local node-scope publications mustn't receive messages arriving + * from another node, even if the multicast link brought it here) + * + * Returns non-zero if any off-node ports overlap + */ +
+int tipc_nametbl_mc_translate(u32 type, u32 lower, u32 upper, u32 limit, + struct port_list *dports) +{ + struct name_seq *seq; + struct sub_seq *sseq; + struct sub_seq *sseq_stop; + int res = 0; + + read_lock_bh(&tipc_nametbl_lock); + seq = nametbl_find_seq(type); + if (!seq) + goto exit; + + spin_lock_bh(&seq->lock); + + sseq = seq->sseqs + nameseq_locate_subseq(seq, lower); + sseq_stop = seq->sseqs + seq->first_free; + for (; sseq != sseq_stop; sseq++) { + struct publication *publ; + + if (sseq->lower > upper) + break; + + publ = sseq->node_list; + if (publ) { + do { + if (publ->scope <= limit) + tipc_port_list_add(dports, publ->ref); + publ = publ->node_list_next; + } while (publ != sseq->node_list); + } + + if (sseq->cluster_list_size != sseq->node_list_size) + res = 1; + } + + spin_unlock_bh(&seq->lock); +exit: + read_unlock_bh(&tipc_nametbl_lock); + return res; +} +
+/** + * tipc_nametbl_publish_rsv - publish port name using a reserved name type + */ +
+int tipc_nametbl_publish_rsv(u32 ref, unsigned int scope, + struct tipc_name_seq const *seq) +{ + int res; + + atomic_inc(&rsv_publ_ok); + res = tipc_publish(ref, scope, seq); + atomic_dec(&rsv_publ_ok); + return res; +} +
+/** + * tipc_nametbl_publish - add name publication to network name tables + */ +
+struct publication *tipc_nametbl_publish(u32 type, u32 lower, u32 upper, + u32 scope, u32 port_ref, u32 key) +{ + struct publication *publ; + + if (table.local_publ_count >= tipc_max_publications) { + warn("Publication failed, local publication limit reached (%u)\n", + tipc_max_publications); + return NULL; + } + if ((type < TIPC_RESERVED_TYPES) && !atomic_read(&rsv_publ_ok)) { + warn("Publication failed, reserved name {%u,%u,%u}\n", + type, lower, upper); + return NULL; + } + + write_lock_bh(&tipc_nametbl_lock); + table.local_publ_count++; + publ = tipc_nametbl_insert_publ(type, lower, upper, scope, + tipc_own_addr, port_ref, key); + if (publ && (scope != TIPC_NODE_SCOPE)) + tipc_named_publish(publ); + write_unlock_bh(&tipc_nametbl_lock); + return publ; +} +
+/** + * tipc_nametbl_withdraw - withdraw name publication from network name tables + */ +
+int tipc_nametbl_withdraw(u32 type, u32 lower, u32 ref, u32 key) +{ +
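/* Remove the publication from the local name table; for names published beyond node scope, also have the name distributor withdraw it network-wide */ +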
struct publication *publ; + + write_lock_bh(&tipc_nametbl_lock); + publ = tipc_nametbl_remove_publ(type, lower, tipc_own_addr, ref, key); + if (likely(publ)) { + table.local_publ_count--; + if (publ->scope != TIPC_NODE_SCOPE) + tipc_named_withdraw(publ); + write_unlock_bh(&tipc_nametbl_lock); + list_del_init(&publ->pport_list); + kfree(publ); + return 1; + } + write_unlock_bh(&tipc_nametbl_lock); + err("Unable to remove local publication\n" + "(type=%u, lower=%u, ref=%u, key=%u)\n", + type, lower, ref, key); + return 0; +} + +/** + * tipc_nametbl_subscribe - add a subscription object to the name table + */ + +void tipc_nametbl_subscribe(struct subscription *s) +{ + u32 type = s->seq.type; + struct name_seq *seq; + + write_lock_bh(&tipc_nametbl_lock); + seq = nametbl_find_seq(type); + if (!seq) + seq = tipc_nameseq_create(type, &table.types[hash(type)]); + if (seq) { + spin_lock_bh(&seq->lock); + tipc_nameseq_subscribe(seq, s); + spin_unlock_bh(&seq->lock); + } else { + warn("Failed to create subscription for {%u,%u,%u}\n", + s->seq.type, s->seq.lower, s->seq.upper); + } + write_unlock_bh(&tipc_nametbl_lock); +} + +/** + * tipc_nametbl_unsubscribe - remove a subscription object from name table + */ + +void tipc_nametbl_unsubscribe(struct subscription *s) +{ + struct name_seq *seq; + + write_lock_bh(&tipc_nametbl_lock); + seq = nametbl_find_seq(s->seq.type); + if (seq != NULL) { + spin_lock_bh(&seq->lock); + list_del_init(&s->nameseq_list); + spin_unlock_bh(&seq->lock); + if ((seq->first_free == 0) && list_empty(&seq->subscriptions)) { + hlist_del_init(&seq->ns_list); + kfree(seq->sseqs); + kfree(seq); + } + } + write_unlock_bh(&tipc_nametbl_lock); +} + + +/** + * subseq_list: print specified sub-sequence contents into the given buffer + */ + +static void subseq_list(struct sub_seq *sseq, struct print_buf *buf, u32 depth, + u32 index) +{ + char portIdStr[27]; + const char *scope_str[] = {"", " zone", " cluster", " node"}; + struct publication *publ = sseq->zone_list; + + tipc_printf(buf, "%-10u %-10u ", sseq->lower, sseq->upper); + + if (depth == 2 || !publ) { + tipc_printf(buf, "\n"); + return; + } + + do { + sprintf(portIdStr, "<%u.%u.%u:%u>", + tipc_zone(publ->node), tipc_cluster(publ->node), + tipc_node(publ->node), publ->ref); + tipc_printf(buf, "%-26s ", portIdStr); + if (depth > 3) { + tipc_printf(buf, "%-10u %s", publ->key, + scope_str[publ->scope]); + } + + publ = publ->zone_list_next; + if (publ == sseq->zone_list) + break; + + tipc_printf(buf, "\n%33s", " "); + } while (1); + + tipc_printf(buf, "\n"); +} + +/** + * nameseq_list: print specified name sequence contents into the given buffer + */ + +static void nameseq_list(struct name_seq *seq, struct print_buf *buf, u32 depth, + u32 type, u32 lowbound, u32 upbound, u32 index) +{ + struct sub_seq *sseq; + char typearea[11]; + + if (seq->first_free == 0) + return; + + sprintf(typearea, "%-10u", seq->type); + + if (depth == 1) { + tipc_printf(buf, "%s\n", typearea); + return; + } + + for (sseq = seq->sseqs; sseq != &seq->sseqs[seq->first_free]; sseq++) { + if ((lowbound <= sseq->upper) && (upbound >= sseq->lower)) { + tipc_printf(buf, "%s ", typearea); + spin_lock_bh(&seq->lock); + subseq_list(sseq, buf, depth, index); + spin_unlock_bh(&seq->lock); + sprintf(typearea, "%10s", " "); + } + } +} + +/** + * nametbl_header - print name table header into the given buffer + */ + +static void nametbl_header(struct print_buf *buf, u32 depth) +{ + const char *header[] = { + "Type ", + "Lower Upper ", + "Port Identity ", + "Publication Scope" + 
}; + + int i; + + if (depth > 4) + depth = 4; + for (i = 0; i < depth; i++) + tipc_printf(buf, header[i]); + tipc_printf(buf, "\n"); +} + +/** + * nametbl_list - print specified name table contents into the given buffer + */ + +static void nametbl_list(struct print_buf *buf, u32 depth_info, + u32 type, u32 lowbound, u32 upbound) +{ + struct hlist_head *seq_head; + struct hlist_node *seq_node; + struct name_seq *seq; + int all_types; + u32 depth; + u32 i; + + all_types = (depth_info & TIPC_NTQ_ALLTYPES); + depth = (depth_info & ~TIPC_NTQ_ALLTYPES); + + if (depth == 0) + return; + + if (all_types) { + /* display all entries in name table to specified depth */ + nametbl_header(buf, depth); + lowbound = 0; + upbound = ~0; + for (i = 0; i < tipc_nametbl_size; i++) { + seq_head = &table.types[i]; + hlist_for_each_entry(seq, seq_node, seq_head, ns_list) { + nameseq_list(seq, buf, depth, seq->type, + lowbound, upbound, i); + } + } + } else { + /* display only the sequence that matches the specified type */ + if (upbound < lowbound) { + tipc_printf(buf, "invalid name sequence specified\n"); + return; + } + nametbl_header(buf, depth); + i = hash(type); + seq_head = &table.types[i]; + hlist_for_each_entry(seq, seq_node, seq_head, ns_list) { + if (seq->type == type) { + nameseq_list(seq, buf, depth, type, + lowbound, upbound, i); + break; + } + } + } +} + +#define MAX_NAME_TBL_QUERY 32768 + +struct sk_buff *tipc_nametbl_get(const void *req_tlv_area, int req_tlv_space) +{ + struct sk_buff *buf; + struct tipc_name_table_query *argv; + struct tlv_desc *rep_tlv; + struct print_buf b; + int str_len; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_NAME_TBL_QUERY)) + return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + buf = tipc_cfg_reply_alloc(TLV_SPACE(MAX_NAME_TBL_QUERY)); + if (!buf) + return NULL; + + rep_tlv = (struct tlv_desc *)buf->data; + tipc_printbuf_init(&b, TLV_DATA(rep_tlv), MAX_NAME_TBL_QUERY); + argv = (struct tipc_name_table_query *)TLV_DATA(req_tlv_area); + read_lock_bh(&tipc_nametbl_lock); + nametbl_list(&b, ntohl(argv->depth), ntohl(argv->type), + ntohl(argv->lowbound), ntohl(argv->upbound)); + read_unlock_bh(&tipc_nametbl_lock); + str_len = tipc_printbuf_validate(&b); + + skb_put(buf, TLV_SPACE(str_len)); + TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len); + + return buf; +} + +int tipc_nametbl_init(void) +{ + table.types = kcalloc(tipc_nametbl_size, sizeof(struct hlist_head), + GFP_ATOMIC); + if (!table.types) + return -ENOMEM; + + table.local_publ_count = 0; + return 0; +} + +void tipc_nametbl_stop(void) +{ + u32 i; + + if (!table.types) + return; + + /* Verify name table is empty, then release it */ + + write_lock_bh(&tipc_nametbl_lock); + for (i = 0; i < tipc_nametbl_size; i++) { + if (!hlist_empty(&table.types[i])) + err("tipc_nametbl_stop(): hash chain %u is non-null\n", i); + } + kfree(table.types); + table.types = NULL; + write_unlock_bh(&tipc_nametbl_lock); +} + diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h new file mode 100644 index 00000000..d228bd68 --- /dev/null +++ b/net/tipc/name_table.h @@ -0,0 +1,108 @@ +/* + * net/tipc/name_table.h: Include file for TIPC name table code + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2004-2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_NAME_TABLE_H +#define _TIPC_NAME_TABLE_H + +#include "node_subscr.h" + +struct subscription; +struct port_list; + +/* + * TIPC name types reserved for internal TIPC use (both current and planned) + */ + +#define TIPC_ZM_SRV 3 /* zone master service name type */ + + +/** + * struct publication - info about a published (name or) name sequence + * @type: name sequence type + * @lower: name sequence lower bound + * @upper: name sequence upper bound + * @scope: scope of publication + * @node: network address of publishing port's node + * @ref: publishing port + * @key: publication key + * @subscr: subscription to "node down" event (for off-node publications only) + * @local_list: adjacent entries in list of publications made by this node + * @pport_list: adjacent entries in list of publications made by this port + * @node_list: next matching name seq publication with >= node scope + * @cluster_list: next matching name seq publication with >= cluster scope + * @zone_list: next matching name seq publication with >= zone scope + * + * Note that the node list, cluster list, and zone list are circular lists. 
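+ * + * For example, every publication on one of these lists can be visited by + * starting from any entry ('head' below is just an illustrative name) and + * following the per-scope next pointer until it wraps around to the start: + * + * struct publication *p = head; + * u32 cnt = 0; + * + * do { + * cnt++; + * p = p->zone_list_next; + * } while (p != head); + * + * The node and cluster lists are walked the same way using node_list_next + * and cluster_list_next.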
+ */ + +struct publication { + u32 type; + u32 lower; + u32 upper; + u32 scope; + u32 node; + u32 ref; + u32 key; + struct tipc_node_subscr subscr; + struct list_head local_list; + struct list_head pport_list; + struct publication *node_list_next; + struct publication *cluster_list_next; + struct publication *zone_list_next; +}; + + +extern rwlock_t tipc_nametbl_lock; + +struct sk_buff *tipc_nametbl_get(const void *req_tlv_area, int req_tlv_space); +u32 tipc_nametbl_translate(u32 type, u32 instance, u32 *node); +int tipc_nametbl_mc_translate(u32 type, u32 lower, u32 upper, u32 limit, + struct port_list *dports); +int tipc_nametbl_publish_rsv(u32 ref, unsigned int scope, + struct tipc_name_seq const *seq); +struct publication *tipc_nametbl_publish(u32 type, u32 lower, u32 upper, + u32 scope, u32 port_ref, u32 key); +int tipc_nametbl_withdraw(u32 type, u32 lower, u32 ref, u32 key); +struct publication *tipc_nametbl_insert_publ(u32 type, u32 lower, u32 upper, + u32 scope, u32 node, u32 ref, u32 key); +struct publication *tipc_nametbl_remove_publ(u32 type, u32 lower, + u32 node, u32 ref, u32 key); +void tipc_nametbl_subscribe(struct subscription *s); +void tipc_nametbl_unsubscribe(struct subscription *s); +int tipc_nametbl_init(void); +void tipc_nametbl_stop(void); + +#endif diff --git a/net/tipc/net.c b/net/tipc/net.c new file mode 100644 index 00000000..68b3dd63 --- /dev/null +++ b/net/tipc/net.c @@ -0,0 +1,228 @@ +/* + * net/tipc/net.c: TIPC network routing code + * + * Copyright (c) 1995-2006, Ericsson AB + * Copyright (c) 2005, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "core.h" +#include "net.h" +#include "name_distr.h" +#include "subscr.h" +#include "port.h" +#include "node.h" +#include "config.h" +
+/* + * The TIPC locking policy is designed to ensure a very fine locking + * granularity, permitting complete parallel access to individual + * port and node/link instances. The code consists of three major + * locking domains, each protected by its own disjoint set of locks. + * + * 1: The routing hierarchy. + * Comprises the structures 'zone', 'cluster', 'node', 'link' + * and 'bearer'. The whole hierarchy is protected by a big + * read/write lock, tipc_net_lock, to ensure that nothing is added + * or removed while code is accessing any of these structures. + * This layer must not be called from the two others while they + * hold any of their own locks. + * Neither must it itself do any upcalls to the other two before + * it has released tipc_net_lock and other protective locks. + * + * Within the tipc_net_lock domain there are two sub-domains: 'node' and + * 'bearer', where local write operations are permitted, + * provided that those are protected by individual spin_locks + * per instance. Code holding tipc_net_lock(read) and a node spin_lock + * is permitted to poke around in both the node itself and its + * subordinate links. I.e., it can update link counters and queues, + * change link state, send protocol messages, and alter the + * "active_links" array in the node; but it can _not_ remove a link + * or a node from the overall structure. + * Correspondingly, individual bearers may change status within a + * tipc_net_lock(read), protected by an individual spin_lock per bearer + * instance, but it needs tipc_net_lock(write) to remove/add any bearers. + * + * + * 2: The transport level of the protocol. + * This consists of the structures port, (and its user level + * representations, such as user_port and tipc_sock), reference and + * tipc_user (port.c, reg.c, socket.c). + * + * This layer has four different locks: + * - The tipc_port spin_lock. This protects each port instance + * from parallel data access and removal. Since we cannot place + * this lock in the port itself, it has been placed in the + * corresponding reference table entry, which has the same life + * cycle as the module. This entry is difficult to access from + * outside the TIPC core, however, so a pointer to the lock has + * been added in the port instance, to be used for unlocking + * only. + * - A read/write lock to protect the reference table itself (ref.c). + * (Nobody is using read-only access to this, so it can just as + * well be changed to a spin_lock) + * - A spin lock to protect the registry of kernel/driver users (reg.c) + * - A global spin_lock (tipc_port_lock), whose only task is to ensure + * consistency where more than one port is involved in an operation, + * i.e., when a port is part of a linked list of ports. + * There are two such lists; 'port_list', which is used for management, + * and 'wait_list', which is used to queue ports during congestion. + * + * 3: The name table (name_table.c, name_distr.c, subscription.c) + * - There is one big read/write-lock (tipc_nametbl_lock) protecting the + * overall name table structure. Nothing must be added to or removed + * from this structure without holding write access to it. + * - There is one local spin_lock per sub_sequence, which can be seen + * as a sub-domain to the tipc_nametbl_lock domain.
It is used only + * for translation operations, and is needed because a translation + * steps the root of the 'publication' linked list between each lookup. + * This is always used within the scope of a tipc_nametbl_lock(read). + * - A local spin_lock protecting the queue of subscriber events. +*/ + +DEFINE_RWLOCK(tipc_net_lock); + +static void net_route_named_msg(struct sk_buff *buf) +{ + struct tipc_msg *msg = buf_msg(buf); + u32 dnode; + u32 dport; + + if (!msg_named(msg)) { + buf_discard(buf); + return; + } + + dnode = addr_domain(msg_lookup_scope(msg)); + dport = tipc_nametbl_translate(msg_nametype(msg), msg_nameinst(msg), &dnode); + if (dport) { + msg_set_destnode(msg, dnode); + msg_set_destport(msg, dport); + tipc_net_route_msg(buf); + return; + } + tipc_reject_msg(buf, TIPC_ERR_NO_NAME); +} + +void tipc_net_route_msg(struct sk_buff *buf) +{ + struct tipc_msg *msg; + u32 dnode; + + if (!buf) + return; + msg = buf_msg(buf); + + msg_incr_reroute_cnt(msg); + if (msg_reroute_cnt(msg) > 6) { + if (msg_errcode(msg)) { + buf_discard(buf); + } else { + tipc_reject_msg(buf, msg_destport(msg) ? + TIPC_ERR_NO_PORT : TIPC_ERR_NO_NAME); + } + return; + } + + /* Handle message for this node */ + dnode = msg_short(msg) ? tipc_own_addr : msg_destnode(msg); + if (tipc_in_scope(dnode, tipc_own_addr)) { + if (msg_isdata(msg)) { + if (msg_mcast(msg)) + tipc_port_recv_mcast(buf, NULL); + else if (msg_destport(msg)) + tipc_port_recv_msg(buf); + else + net_route_named_msg(buf); + return; + } + switch (msg_user(msg)) { + case NAME_DISTRIBUTOR: + tipc_named_recv(buf); + break; + case CONN_MANAGER: + tipc_port_recv_proto_msg(buf); + break; + default: + buf_discard(buf); + } + return; + } + + /* Handle message for another node */ + skb_trim(buf, msg_size(msg)); + tipc_link_send(buf, dnode, msg_link_selector(msg)); +} + +int tipc_net_start(u32 addr) +{ + char addr_string[16]; + int res; + + if (tipc_mode != TIPC_NODE_MODE) + return -ENOPROTOOPT; + + tipc_subscr_stop(); + tipc_cfg_stop(); + + tipc_own_addr = addr; + tipc_mode = TIPC_NET_MODE; + tipc_named_reinit(); + tipc_port_reinit(); + + res = tipc_bclink_init(); + if (res) + return res; + + tipc_k_signal((Handler)tipc_subscr_start, 0); + tipc_k_signal((Handler)tipc_cfg_init, 0); + + info("Started in network mode\n"); + info("Own node address %s, network identity %u\n", + tipc_addr_string_fill(addr_string, tipc_own_addr), tipc_net_id); + return 0; +} + +void tipc_net_stop(void) +{ + struct tipc_node *node, *t_node; + + if (tipc_mode != TIPC_NET_MODE) + return; + write_lock_bh(&tipc_net_lock); + tipc_bearer_stop(); + tipc_mode = TIPC_NODE_MODE; + tipc_bclink_stop(); + list_for_each_entry_safe(node, t_node, &tipc_node_list, list) + tipc_node_delete(node); + write_unlock_bh(&tipc_net_lock); + info("Left network mode\n"); +} diff --git a/net/tipc/net.h b/net/tipc/net.h new file mode 100644 index 00000000..9eb4b9e2 --- /dev/null +++ b/net/tipc/net.h @@ -0,0 +1,47 @@ +/* + * net/tipc/net.h: Include file for TIPC network routing code + * + * Copyright (c) 1995-2006, Ericsson AB + * Copyright (c) 2005, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_NET_H +#define _TIPC_NET_H + +extern rwlock_t tipc_net_lock; + +void tipc_net_route_msg(struct sk_buff *buf); + +int tipc_net_start(u32 addr); +void tipc_net_stop(void); + +#endif diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c new file mode 100644 index 00000000..7bda8e3d --- /dev/null +++ b/net/tipc/netlink.c @@ -0,0 +1,108 @@ +/* + * net/tipc/netlink.c: TIPC configuration handling + * + * Copyright (c) 2005-2006, Ericsson AB + * Copyright (c) 2005-2007, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +
+#include "core.h" +#include "config.h" +#include <net/genetlink.h> +
+static int handle_cmd(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *rep_buf; + struct nlmsghdr *rep_nlh; + struct nlmsghdr *req_nlh = info->nlhdr; + struct tipc_genlmsghdr *req_userhdr = info->userhdr; + int hdr_space = NLMSG_SPACE(GENL_HDRLEN + TIPC_GENL_HDRLEN); + u16 cmd; + + if ((req_userhdr->cmd & 0xC000) && (!capable(CAP_NET_ADMIN))) + cmd = TIPC_CMD_NOT_NET_ADMIN; + else + cmd = req_userhdr->cmd; + + rep_buf = tipc_cfg_do_cmd(req_userhdr->dest, cmd, + NLMSG_DATA(req_nlh) + GENL_HDRLEN + TIPC_GENL_HDRLEN, + NLMSG_PAYLOAD(req_nlh, GENL_HDRLEN + TIPC_GENL_HDRLEN), + hdr_space); + + if (rep_buf) { + skb_push(rep_buf, hdr_space); + rep_nlh = nlmsg_hdr(rep_buf); + memcpy(rep_nlh, req_nlh, hdr_space); + rep_nlh->nlmsg_len = rep_buf->len; + genlmsg_unicast(&init_net, rep_buf, NETLINK_CB(skb).pid); + } + + return 0; +} +
+static struct genl_family tipc_genl_family = { + .id = GENL_ID_GENERATE, + .name = TIPC_GENL_NAME, + .version = TIPC_GENL_VERSION, + .hdrsize = TIPC_GENL_HDRLEN, + .maxattr = 0, +}; +
+static struct genl_ops tipc_genl_ops = { + .cmd = TIPC_GENL_CMD, + .doit = handle_cmd, +}; +
+static int tipc_genl_family_registered; +
+int tipc_netlink_start(void) +{ + int res; + + res = genl_register_family_with_ops(&tipc_genl_family, + &tipc_genl_ops, 1); + if (res) { + err("Failed to register netlink interface\n"); + return res; + } + + tipc_genl_family_registered = 1; + return 0; +} +
+void tipc_netlink_stop(void) +{ + if (!tipc_genl_family_registered) + return; + + genl_unregister_family(&tipc_genl_family); + tipc_genl_family_registered = 0; +} diff --git a/net/tipc/node.c b/net/tipc/node.c new file mode 100644 index 00000000..2d106ef4 --- /dev/null +++ b/net/tipc/node.c @@ -0,0 +1,492 @@ +/* + * net/tipc/node.c: TIPC node management routines + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2005-2006, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "config.h" +#include "node.h" +#include "name_distr.h" + +static void node_lost_contact(struct tipc_node *n_ptr); +static void node_established_contact(struct tipc_node *n_ptr); + +static DEFINE_SPINLOCK(node_create_lock); + +static struct hlist_head node_htable[NODE_HTABLE_SIZE]; +LIST_HEAD(tipc_node_list); +static u32 tipc_num_nodes; + +static atomic_t tipc_num_links = ATOMIC_INIT(0); +u32 tipc_own_tag; + +/** + * tipc_node_find - locate specified node object, if it exists + */ + +struct tipc_node *tipc_node_find(u32 addr) +{ + struct tipc_node *node; + struct hlist_node *pos; + + if (unlikely(!in_own_cluster(addr))) + return NULL; + + hlist_for_each_entry(node, pos, &node_htable[tipc_hashfn(addr)], hash) { + if (node->addr == addr) + return node; + } + return NULL; +} + +/** + * tipc_node_create - create neighboring node + * + * Currently, this routine is called by neighbor discovery code, which holds + * net_lock for reading only. We must take node_create_lock to ensure a node + * isn't created twice if two different bearers discover the node at the same + * time. (It would be preferable to switch to holding net_lock in write mode, + * but this is a non-trivial change.) + */ + +struct tipc_node *tipc_node_create(u32 addr) +{ + struct tipc_node *n_ptr, *temp_node; + + spin_lock_bh(&node_create_lock); + + n_ptr = tipc_node_find(addr); + if (n_ptr) { + spin_unlock_bh(&node_create_lock); + return n_ptr; + } + + n_ptr = kzalloc(sizeof(*n_ptr), GFP_ATOMIC); + if (!n_ptr) { + spin_unlock_bh(&node_create_lock); + warn("Node creation failed, no memory\n"); + return NULL; + } + + n_ptr->addr = addr; + spin_lock_init(&n_ptr->lock); + INIT_HLIST_NODE(&n_ptr->hash); + INIT_LIST_HEAD(&n_ptr->list); + INIT_LIST_HEAD(&n_ptr->nsub); + + hlist_add_head(&n_ptr->hash, &node_htable[tipc_hashfn(addr)]); + + list_for_each_entry(temp_node, &tipc_node_list, list) { + if (n_ptr->addr < temp_node->addr) + break; + } + list_add_tail(&n_ptr->list, &temp_node->list); + + tipc_num_nodes++; + + spin_unlock_bh(&node_create_lock); + return n_ptr; +} + +void tipc_node_delete(struct tipc_node *n_ptr) +{ + list_del(&n_ptr->list); + hlist_del(&n_ptr->hash); + kfree(n_ptr); + + tipc_num_nodes--; +} + + +/** + * tipc_node_link_up - handle addition of link + * + * Link becomes active (alone or shared) or standby, depending on its priority. 
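+ * + * The first link that comes up fills both active slots. A later link of + * equal priority shares the active pair with the existing link, a higher + * priority link displaces both active slots, and a lower priority link is + * kept as standby.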
+ */ + +void tipc_node_link_up(struct tipc_node *n_ptr, struct link *l_ptr) +{ + struct link **active = &n_ptr->active_links[0]; + + n_ptr->working_links++; + + info("Established link <%s> on network plane %c\n", + l_ptr->name, l_ptr->b_ptr->net_plane); + + if (!active[0]) { + active[0] = active[1] = l_ptr; + node_established_contact(n_ptr); + return; + } + if (l_ptr->priority < active[0]->priority) { + info("New link <%s> becomes standby\n", l_ptr->name); + return; + } + tipc_link_send_duplicate(active[0], l_ptr); + if (l_ptr->priority == active[0]->priority) { + active[0] = l_ptr; + return; + } + info("Old link <%s> becomes standby\n", active[0]->name); + if (active[1] != active[0]) + info("Old link <%s> becomes standby\n", active[1]->name); + active[0] = active[1] = l_ptr; +} + +/** + * node_select_active_links - select active link + */ + +static void node_select_active_links(struct tipc_node *n_ptr) +{ + struct link **active = &n_ptr->active_links[0]; + u32 i; + u32 highest_prio = 0; + + active[0] = active[1] = NULL; + + for (i = 0; i < MAX_BEARERS; i++) { + struct link *l_ptr = n_ptr->links[i]; + + if (!l_ptr || !tipc_link_is_up(l_ptr) || + (l_ptr->priority < highest_prio)) + continue; + + if (l_ptr->priority > highest_prio) { + highest_prio = l_ptr->priority; + active[0] = active[1] = l_ptr; + } else { + active[1] = l_ptr; + } + } +} + +/** + * tipc_node_link_down - handle loss of link + */ + +void tipc_node_link_down(struct tipc_node *n_ptr, struct link *l_ptr) +{ + struct link **active; + + n_ptr->working_links--; + + if (!tipc_link_is_active(l_ptr)) { + info("Lost standby link <%s> on network plane %c\n", + l_ptr->name, l_ptr->b_ptr->net_plane); + return; + } + info("Lost link <%s> on network plane %c\n", + l_ptr->name, l_ptr->b_ptr->net_plane); + + active = &n_ptr->active_links[0]; + if (active[0] == l_ptr) + active[0] = active[1]; + if (active[1] == l_ptr) + active[1] = active[0]; + if (active[0] == l_ptr) + node_select_active_links(n_ptr); + if (tipc_node_is_up(n_ptr)) + tipc_link_changeover(l_ptr); + else + node_lost_contact(n_ptr); +} + +int tipc_node_active_links(struct tipc_node *n_ptr) +{ + return n_ptr->active_links[0] != NULL; +} + +int tipc_node_redundant_links(struct tipc_node *n_ptr) +{ + return n_ptr->working_links > 1; +} + +int tipc_node_is_up(struct tipc_node *n_ptr) +{ + return tipc_node_active_links(n_ptr); +} + +void tipc_node_attach_link(struct tipc_node *n_ptr, struct link *l_ptr) +{ + n_ptr->links[l_ptr->b_ptr->identity] = l_ptr; + atomic_inc(&tipc_num_links); + n_ptr->link_cnt++; +} + +void tipc_node_detach_link(struct tipc_node *n_ptr, struct link *l_ptr) +{ + n_ptr->links[l_ptr->b_ptr->identity] = NULL; + atomic_dec(&tipc_num_links); + n_ptr->link_cnt--; +} + +/* + * Routing table management - five cases to handle: + * + * 1: A link towards a zone/cluster external node comes up. + * => Send a multicast message updating routing tables of all + * system nodes within own cluster that the new destination + * can be reached via this node. + * (node.establishedContact()=>cluster.multicastNewRoute()) + * + * 2: A link towards a slave node comes up. + * => Send a multicast message updating routing tables of all + * system nodes within own cluster that the new destination + * can be reached via this node. 
+ * (node.establishedContact()=>cluster.multicastNewRoute()) + * => Send a message to the slave node about existence + * of all system nodes within cluster: + * (node.establishedContact()=>cluster.sendLocalRoutes()) + * + * 3: A new cluster local system node becomes available. + * => Send message(s) to this particular node containing + * information about all cluster external and slave + * nodes which can be reached via this node. + * (node.establishedContact()==>network.sendExternalRoutes()) + * (node.establishedContact()==>network.sendSlaveRoutes()) + * => Send messages to all directly connected slave nodes + * containing information about the existence of the new node + * (node.establishedContact()=>cluster.multicastNewRoute()) + * + * 4: The link towards a zone/cluster external node or slave + * node goes down. + * => Send a multicast message updating routing tables of all + * nodes within cluster that the destination can no + * longer be reached via this node. + * (node.lostAllLinks()=>cluster.bcastLostRoute()) + * + * 5: A cluster local system node becomes unavailable. + * => Remove all references to this node from the local + * routing tables. Note: This is a completely node + * local operation. + * (node.lostAllLinks()=>network.removeAsRouter()) + * => Send messages to all directly connected slave nodes + * containing information about loss of the node + * (node.establishedContact()=>cluster.multicastLostRoute()) + * + */ +
+static void node_established_contact(struct tipc_node *n_ptr) +{ + tipc_k_signal((Handler)tipc_named_node_up, n_ptr->addr); + + /* Synchronize broadcast acks */ + n_ptr->bclink.acked = tipc_bclink_get_last_sent(); + + if (n_ptr->bclink.supported) { + tipc_nmap_add(&tipc_bcast_nmap, n_ptr->addr); + if (n_ptr->addr < tipc_own_addr) + tipc_own_tag++; + } +} +
+static void node_cleanup_finished(unsigned long node_addr) +{ + struct tipc_node *n_ptr; + + read_lock_bh(&tipc_net_lock); + n_ptr = tipc_node_find(node_addr); + if (n_ptr) { + tipc_node_lock(n_ptr); + n_ptr->cleanup_required = 0; + tipc_node_unlock(n_ptr); + } + read_unlock_bh(&tipc_net_lock); +} +
+static void node_lost_contact(struct tipc_node *n_ptr) +{ + char addr_string[16]; + u32 i; + + /* Clean up broadcast reception remains */ + n_ptr->bclink.gap_after = n_ptr->bclink.gap_to = 0; + while (n_ptr->bclink.deferred_head) { + struct sk_buff *buf = n_ptr->bclink.deferred_head; + n_ptr->bclink.deferred_head = buf->next; + buf_discard(buf); + } + if (n_ptr->bclink.defragm) { + buf_discard(n_ptr->bclink.defragm); + n_ptr->bclink.defragm = NULL; + } + + if (n_ptr->bclink.supported) { + tipc_bclink_acknowledge(n_ptr, + mod(n_ptr->bclink.acked + 10000)); + tipc_nmap_remove(&tipc_bcast_nmap, n_ptr->addr); + if (n_ptr->addr < tipc_own_addr) + tipc_own_tag--; + } + + info("Lost contact with %s\n", + tipc_addr_string_fill(addr_string, n_ptr->addr)); + + /* Abort link changeover */ + for (i = 0; i < MAX_BEARERS; i++) { + struct link *l_ptr = n_ptr->links[i]; + if (!l_ptr) + continue; + l_ptr->reset_checkpoint = l_ptr->next_in_no; + l_ptr->exp_msg_count = 0; + tipc_link_reset_fragments(l_ptr); + } + + /* Notify subscribers */ + tipc_nodesub_notify(n_ptr); + + /* Prevent re-contact with node until all cleanup is done */ + + n_ptr->cleanup_required = 1; + tipc_k_signal((Handler)node_cleanup_finished, n_ptr->addr); +} +
+struct sk_buff *tipc_node_get_nodes(const void *req_tlv_area, int req_tlv_space) +{ + u32 domain; + struct sk_buff *buf; + struct tipc_node *n_ptr; + struct tipc_node_info node_info; + u32
payload_size; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_NET_ADDR)) + return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + domain = ntohl(*(__be32 *)TLV_DATA(req_tlv_area)); + if (!tipc_addr_domain_valid(domain)) + return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE + " (network address)"); + + read_lock_bh(&tipc_net_lock); + if (!tipc_num_nodes) { + read_unlock_bh(&tipc_net_lock); + return tipc_cfg_reply_none(); + } + + /* For now, get space for all other nodes */ + + payload_size = TLV_SPACE(sizeof(node_info)) * tipc_num_nodes; + if (payload_size > 32768u) { + read_unlock_bh(&tipc_net_lock); + return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED + " (too many nodes)"); + } + buf = tipc_cfg_reply_alloc(payload_size); + if (!buf) { + read_unlock_bh(&tipc_net_lock); + return NULL; + } + + /* Add TLVs for all nodes in scope */ + + list_for_each_entry(n_ptr, &tipc_node_list, list) { + if (!tipc_in_scope(domain, n_ptr->addr)) + continue; + node_info.addr = htonl(n_ptr->addr); + node_info.up = htonl(tipc_node_is_up(n_ptr)); + tipc_cfg_append_tlv(buf, TIPC_TLV_NODE_INFO, + &node_info, sizeof(node_info)); + } + + read_unlock_bh(&tipc_net_lock); + return buf; +} + +struct sk_buff *tipc_node_get_links(const void *req_tlv_area, int req_tlv_space) +{ + u32 domain; + struct sk_buff *buf; + struct tipc_node *n_ptr; + struct tipc_link_info link_info; + u32 payload_size; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_NET_ADDR)) + return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + domain = ntohl(*(__be32 *)TLV_DATA(req_tlv_area)); + if (!tipc_addr_domain_valid(domain)) + return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE + " (network address)"); + + if (tipc_mode != TIPC_NET_MODE) + return tipc_cfg_reply_none(); + + read_lock_bh(&tipc_net_lock); + + /* Get space for all unicast links + multicast link */ + + payload_size = TLV_SPACE(sizeof(link_info)) * + (atomic_read(&tipc_num_links) + 1); + if (payload_size > 32768u) { + read_unlock_bh(&tipc_net_lock); + return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED + " (too many links)"); + } + buf = tipc_cfg_reply_alloc(payload_size); + if (!buf) { + read_unlock_bh(&tipc_net_lock); + return NULL; + } + + /* Add TLV for broadcast link */ + + link_info.dest = htonl(tipc_cluster_mask(tipc_own_addr)); + link_info.up = htonl(1); + strlcpy(link_info.str, tipc_bclink_name, TIPC_MAX_LINK_NAME); + tipc_cfg_append_tlv(buf, TIPC_TLV_LINK_INFO, &link_info, sizeof(link_info)); + + /* Add TLVs for any other links in scope */ + + list_for_each_entry(n_ptr, &tipc_node_list, list) { + u32 i; + + if (!tipc_in_scope(domain, n_ptr->addr)) + continue; + tipc_node_lock(n_ptr); + for (i = 0; i < MAX_BEARERS; i++) { + if (!n_ptr->links[i]) + continue; + link_info.dest = htonl(n_ptr->addr); + link_info.up = htonl(tipc_link_is_up(n_ptr->links[i])); + strcpy(link_info.str, n_ptr->links[i]->name); + tipc_cfg_append_tlv(buf, TIPC_TLV_LINK_INFO, + &link_info, sizeof(link_info)); + } + tipc_node_unlock(n_ptr); + } + + read_unlock_bh(&tipc_net_lock); + return buf; +} diff --git a/net/tipc/node.h b/net/tipc/node.h new file mode 100644 index 00000000..5c61afc7 --- /dev/null +++ b/net/tipc/node.h @@ -0,0 +1,134 @@ +/* + * net/tipc/node.h: Include file for TIPC node management routines + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2005, 2010-2011, Wind River Systems + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_NODE_H +#define _TIPC_NODE_H + +#include "node_subscr.h" +#include "addr.h" +#include "net.h" +#include "bearer.h" + +/** + * struct tipc_node - TIPC node structure + * @addr: network address of node + * @lock: spinlock governing access to structure + * @hash: links to adjacent nodes in unsorted hash chain + * @list: links to adjacent nodes in sorted list of cluster's nodes + * @nsub: list of "node down" subscriptions monitoring node + * @active_links: pointers to active links to node + * @links: pointers to all links to node + * @working_links: number of working links to node (both active and standby) + * @cleanup_required: non-zero if cleaning up after a prior loss of contact + * @link_cnt: number of links to node + * @permit_changeover: non-zero if node has redundant links to this system + * @bclink: broadcast-related info + * @supported: non-zero if node supports TIPC b'cast capability + * @acked: sequence # of last outbound b'cast message acknowledged by node + * @last_in: sequence # of last in-sequence b'cast message received from node + * @gap_after: sequence # of last message not requiring a NAK request + * @gap_to: sequence # of last message requiring a NAK request + * @nack_sync: counter that determines when NAK requests should be sent + * @deferred_head: oldest OOS b'cast message received from node + * @deferred_tail: newest OOS b'cast message received from node + * @defragm: list of partially reassembled b'cast message fragments from node + */ + +struct tipc_node { + u32 addr; + spinlock_t lock; + struct hlist_node hash; + struct list_head list; + struct list_head nsub; + struct link *active_links[2]; + struct link *links[MAX_BEARERS]; + int link_cnt; + int working_links; + int cleanup_required; + int permit_changeover; + struct { + int supported; + u32 acked; + u32 last_in; + u32 
gap_after; + u32 gap_to; + u32 nack_sync; + struct sk_buff *deferred_head; + struct sk_buff *deferred_tail; + struct sk_buff *defragm; + } bclink; +}; + +#define NODE_HTABLE_SIZE 512 +extern struct list_head tipc_node_list; + +/* + * A trivial power-of-two bitmask technique is used for speed, since this + * operation is done for every incoming TIPC packet. The number of hash table + * entries has been chosen so that no hash chain exceeds 8 nodes and will + * usually be much smaller (typically only a single node). + */ +static inline unsigned int tipc_hashfn(u32 addr) +{ + return addr & (NODE_HTABLE_SIZE - 1); +} + +extern u32 tipc_own_tag; + +struct tipc_node *tipc_node_find(u32 addr); +struct tipc_node *tipc_node_create(u32 addr); +void tipc_node_delete(struct tipc_node *n_ptr); +void tipc_node_attach_link(struct tipc_node *n_ptr, struct link *l_ptr); +void tipc_node_detach_link(struct tipc_node *n_ptr, struct link *l_ptr); +void tipc_node_link_down(struct tipc_node *n_ptr, struct link *l_ptr); +void tipc_node_link_up(struct tipc_node *n_ptr, struct link *l_ptr); +int tipc_node_active_links(struct tipc_node *n_ptr); +int tipc_node_redundant_links(struct tipc_node *n_ptr); +int tipc_node_is_up(struct tipc_node *n_ptr); +struct sk_buff *tipc_node_get_links(const void *req_tlv_area, int req_tlv_space); +struct sk_buff *tipc_node_get_nodes(const void *req_tlv_area, int req_tlv_space); + +static inline void tipc_node_lock(struct tipc_node *n_ptr) +{ + spin_lock_bh(&n_ptr->lock); +} + +static inline void tipc_node_unlock(struct tipc_node *n_ptr) +{ + spin_unlock_bh(&n_ptr->lock); +} + +#endif diff --git a/net/tipc/node_subscr.c b/net/tipc/node_subscr.c new file mode 100644 index 00000000..c3c2815a --- /dev/null +++ b/net/tipc/node_subscr.c @@ -0,0 +1,97 @@ +/* + * net/tipc/node_subscr.c: TIPC "node down" subscription handling + * + * Copyright (c) 1995-2006, Ericsson AB + * Copyright (c) 2005, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "node_subscr.h" +#include "node.h" + +/** + * tipc_nodesub_subscribe - create "node down" subscription for specified node + */ + +void tipc_nodesub_subscribe(struct tipc_node_subscr *node_sub, u32 addr, + void *usr_handle, net_ev_handler handle_down) +{ + if (addr == tipc_own_addr) { + node_sub->node = NULL; + return; + } + + node_sub->node = tipc_node_find(addr); + if (!node_sub->node) { + warn("Node subscription rejected, unknown node 0x%x\n", addr); + return; + } + node_sub->handle_node_down = handle_down; + node_sub->usr_handle = usr_handle; + + tipc_node_lock(node_sub->node); + list_add_tail(&node_sub->nodesub_list, &node_sub->node->nsub); + tipc_node_unlock(node_sub->node); +} + +/** + * tipc_nodesub_unsubscribe - cancel "node down" subscription (if any) + */ + +void tipc_nodesub_unsubscribe(struct tipc_node_subscr *node_sub) +{ + if (!node_sub->node) + return; + + tipc_node_lock(node_sub->node); + list_del_init(&node_sub->nodesub_list); + tipc_node_unlock(node_sub->node); +} + +/** + * tipc_nodesub_notify - notify subscribers that a node is unreachable + * + * Note: node is locked by caller + */ + +void tipc_nodesub_notify(struct tipc_node *node) +{ + struct tipc_node_subscr *ns; + + list_for_each_entry(ns, &node->nsub, nodesub_list) { + if (ns->handle_node_down) { + tipc_k_signal((Handler)ns->handle_node_down, + (unsigned long)ns->usr_handle); + ns->handle_node_down = NULL; + } + } +} diff --git a/net/tipc/node_subscr.h b/net/tipc/node_subscr.h new file mode 100644 index 00000000..4bc2ca08 --- /dev/null +++ b/net/tipc/node_subscr.h @@ -0,0 +1,64 @@ +/* + * net/tipc/node_subscr.h: Include file for TIPC "node down" subscription handling + * + * Copyright (c) 1995-2006, Ericsson AB + * Copyright (c) 2005, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_NODE_SUBSCR_H +#define _TIPC_NODE_SUBSCR_H + +#include "addr.h" + +typedef void (*net_ev_handler) (void *usr_handle); + +/** + * struct tipc_node_subscr - "node down" subscription entry + * @node: ptr to node structure of interest (or NULL, if none) + * @handle_node_down: routine to invoke when node fails + * @usr_handle: argument to pass to routine when node fails + * @nodesub_list: adjacent entries in list of subscriptions for the node + */ + +struct tipc_node_subscr { + struct tipc_node *node; + net_ev_handler handle_node_down; + void *usr_handle; + struct list_head nodesub_list; +}; + +void tipc_nodesub_subscribe(struct tipc_node_subscr *node_sub, u32 addr, + void *usr_handle, net_ev_handler handle_down); +void tipc_nodesub_unsubscribe(struct tipc_node_subscr *node_sub); +void tipc_nodesub_notify(struct tipc_node *node); + +#endif diff --git a/net/tipc/port.c b/net/tipc/port.c new file mode 100644 index 00000000..c68dc956 --- /dev/null +++ b/net/tipc/port.c @@ -0,0 +1,1355 @@ +/* + * net/tipc/port.c: TIPC port code + * + * Copyright (c) 1992-2007, Ericsson AB + * Copyright (c) 2004-2008, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "core.h" +#include "config.h" +#include "port.h" +#include "name_table.h" + +/* Connection management: */ +#define PROBING_INTERVAL 3600000 /* [ms] => 1 h */ +#define CONFIRMED 0 +#define PROBING 1 + +#define MAX_REJECT_SIZE 1024 + +static struct sk_buff *msg_queue_head; +static struct sk_buff *msg_queue_tail; + +DEFINE_SPINLOCK(tipc_port_list_lock); +static DEFINE_SPINLOCK(queue_lock); + +static LIST_HEAD(ports); +static void port_handle_node_down(unsigned long ref); +static struct sk_buff *port_build_self_abort_msg(struct tipc_port *, u32 err); +static struct sk_buff *port_build_peer_abort_msg(struct tipc_port *, u32 err); +static void port_timeout(unsigned long ref); + + +static u32 port_peernode(struct tipc_port *p_ptr) +{ + return msg_destnode(&p_ptr->phdr); +} + +static u32 port_peerport(struct tipc_port *p_ptr) +{ + return msg_destport(&p_ptr->phdr); +} + +/** + * tipc_multicast - send a multicast message to local and remote destinations + */ + +int tipc_multicast(u32 ref, struct tipc_name_seq const *seq, + u32 num_sect, struct iovec const *msg_sect, + unsigned int total_len) +{ + struct tipc_msg *hdr; + struct sk_buff *buf; + struct sk_buff *ibuf = NULL; + struct port_list dports = {0, NULL, }; + struct tipc_port *oport = tipc_port_deref(ref); + int ext_targets; + int res; + + if (unlikely(!oport)) + return -EINVAL; + + /* Create multicast message */ + + hdr = &oport->phdr; + msg_set_type(hdr, TIPC_MCAST_MSG); + msg_set_lookup_scope(hdr, TIPC_CLUSTER_SCOPE); + msg_set_destport(hdr, 0); + msg_set_destnode(hdr, 0); + msg_set_nametype(hdr, seq->type); + msg_set_namelower(hdr, seq->lower); + msg_set_nameupper(hdr, seq->upper); + msg_set_hdr_sz(hdr, MCAST_H_SIZE); + res = tipc_msg_build(hdr, msg_sect, num_sect, total_len, MAX_MSG_SIZE, + !oport->user_port, &buf); + if (unlikely(!buf)) + return res; + + /* Figure out where to send multicast message */ + + ext_targets = tipc_nametbl_mc_translate(seq->type, seq->lower, seq->upper, + TIPC_NODE_SCOPE, &dports); + + /* Send message to destinations (duplicate it only if necessary) */ + + if (ext_targets) { + if (dports.count != 0) { + ibuf = skb_copy(buf, GFP_ATOMIC); + if (ibuf == NULL) { + tipc_port_list_free(&dports); + buf_discard(buf); + return -ENOMEM; + } + } + res = tipc_bclink_send_msg(buf); + if ((res < 0) && (dports.count != 0)) + buf_discard(ibuf); + } else { + ibuf = buf; + } + + if (res >= 0) { + if (ibuf) + tipc_port_recv_mcast(ibuf, &dports); + } else { + tipc_port_list_free(&dports); + } + return res; +} + +/** + * tipc_port_recv_mcast - deliver multicast message to all destination ports + * + * If there is no port list, perform a lookup to create one + */ + +void tipc_port_recv_mcast(struct sk_buff *buf, struct port_list *dp) +{ + struct tipc_msg *msg; + struct port_list dports = {0, NULL, }; + struct port_list *item = dp; + int cnt = 0; + + msg = buf_msg(buf); + + /* Create destination port list, if one wasn't supplied */ + + if (dp == NULL) { + tipc_nametbl_mc_translate(msg_nametype(msg), + msg_namelower(msg), + msg_nameupper(msg), + TIPC_CLUSTER_SCOPE, + &dports); + item = dp = &dports; + } + + /* Deliver a copy of message to each destination port */ + + if (dp->count != 0) { + msg_set_destnode(msg, tipc_own_addr); + if (dp->count == 1) { + msg_set_destport(msg, dp->ports[0]); + tipc_port_recv_msg(buf); + tipc_port_list_free(dp); + return; + } + for (; cnt < dp->count; cnt++) { + int index = cnt % PLSIZE; + struct sk_buff *b = skb_clone(buf, GFP_ATOMIC); + + if (b == NULL) { + warn("Unable to deliver 
multicast message(s)\n"); + goto exit; + } + if ((index == 0) && (cnt != 0)) + item = item->next; + msg_set_destport(buf_msg(b), item->ports[index]); + tipc_port_recv_msg(b); + } + } +exit: + buf_discard(buf); + tipc_port_list_free(dp); +} + +/** + * tipc_createport_raw - create a generic TIPC port + * + * Returns pointer to (locked) TIPC port, or NULL if unable to create it + */ + +struct tipc_port *tipc_createport_raw(void *usr_handle, + u32 (*dispatcher)(struct tipc_port *, struct sk_buff *), + void (*wakeup)(struct tipc_port *), + const u32 importance) +{ + struct tipc_port *p_ptr; + struct tipc_msg *msg; + u32 ref; + + p_ptr = kzalloc(sizeof(*p_ptr), GFP_ATOMIC); + if (!p_ptr) { + warn("Port creation failed, no memory\n"); + return NULL; + } + ref = tipc_ref_acquire(p_ptr, &p_ptr->lock); + if (!ref) { + warn("Port creation failed, reference table exhausted\n"); + kfree(p_ptr); + return NULL; + } + + p_ptr->usr_handle = usr_handle; + p_ptr->max_pkt = MAX_PKT_DEFAULT; + p_ptr->ref = ref; + msg = &p_ptr->phdr; + tipc_msg_init(msg, importance, TIPC_NAMED_MSG, LONG_H_SIZE, 0); + msg_set_origport(msg, ref); + INIT_LIST_HEAD(&p_ptr->wait_list); + INIT_LIST_HEAD(&p_ptr->subscription.nodesub_list); + p_ptr->dispatcher = dispatcher; + p_ptr->wakeup = wakeup; + p_ptr->user_port = NULL; + k_init_timer(&p_ptr->timer, (Handler)port_timeout, ref); + spin_lock_bh(&tipc_port_list_lock); + INIT_LIST_HEAD(&p_ptr->publications); + INIT_LIST_HEAD(&p_ptr->port_list); + list_add_tail(&p_ptr->port_list, &ports); + spin_unlock_bh(&tipc_port_list_lock); + return p_ptr; +} + +int tipc_deleteport(u32 ref) +{ + struct tipc_port *p_ptr; + struct sk_buff *buf = NULL; + + tipc_withdraw(ref, 0, NULL); + p_ptr = tipc_port_lock(ref); + if (!p_ptr) + return -EINVAL; + + tipc_ref_discard(ref); + tipc_port_unlock(p_ptr); + + k_cancel_timer(&p_ptr->timer); + if (p_ptr->connected) { + buf = port_build_peer_abort_msg(p_ptr, TIPC_ERR_NO_PORT); + tipc_nodesub_unsubscribe(&p_ptr->subscription); + } + kfree(p_ptr->user_port); + + spin_lock_bh(&tipc_port_list_lock); + list_del(&p_ptr->port_list); + list_del(&p_ptr->wait_list); + spin_unlock_bh(&tipc_port_list_lock); + k_term_timer(&p_ptr->timer); + kfree(p_ptr); + tipc_net_route_msg(buf); + return 0; +} + +static int port_unreliable(struct tipc_port *p_ptr) +{ + return msg_src_droppable(&p_ptr->phdr); +} + +int tipc_portunreliable(u32 ref, unsigned int *isunreliable) +{ + struct tipc_port *p_ptr; + + p_ptr = tipc_port_lock(ref); + if (!p_ptr) + return -EINVAL; + *isunreliable = port_unreliable(p_ptr); + tipc_port_unlock(p_ptr); + return 0; +} + +int tipc_set_portunreliable(u32 ref, unsigned int isunreliable) +{ + struct tipc_port *p_ptr; + + p_ptr = tipc_port_lock(ref); + if (!p_ptr) + return -EINVAL; + msg_set_src_droppable(&p_ptr->phdr, (isunreliable != 0)); + tipc_port_unlock(p_ptr); + return 0; +} + +static int port_unreturnable(struct tipc_port *p_ptr) +{ + return msg_dest_droppable(&p_ptr->phdr); +} + +int tipc_portunreturnable(u32 ref, unsigned int *isunrejectable) +{ + struct tipc_port *p_ptr; + + p_ptr = tipc_port_lock(ref); + if (!p_ptr) + return -EINVAL; + *isunrejectable = port_unreturnable(p_ptr); + tipc_port_unlock(p_ptr); + return 0; +} + +int tipc_set_portunreturnable(u32 ref, unsigned int isunrejectable) +{ + struct tipc_port *p_ptr; + + p_ptr = tipc_port_lock(ref); + if (!p_ptr) + return -EINVAL; + msg_set_dest_droppable(&p_ptr->phdr, (isunrejectable != 0)); + tipc_port_unlock(p_ptr); + return 0; +} + +/* + * port_build_proto_msg(): build a port level protocol + 
* or a connection abortion message. Called with + * tipc_port lock on. + */ +static struct sk_buff *port_build_proto_msg(u32 destport, u32 destnode, + u32 origport, u32 orignode, + u32 usr, u32 type, u32 err, + u32 ack) +{ + struct sk_buff *buf; + struct tipc_msg *msg; + + buf = tipc_buf_acquire(LONG_H_SIZE); + if (buf) { + msg = buf_msg(buf); + tipc_msg_init(msg, usr, type, LONG_H_SIZE, destnode); + msg_set_errcode(msg, err); + msg_set_destport(msg, destport); + msg_set_origport(msg, origport); + msg_set_orignode(msg, orignode); + msg_set_msgcnt(msg, ack); + } + return buf; +} + +int tipc_reject_msg(struct sk_buff *buf, u32 err) +{ + struct tipc_msg *msg = buf_msg(buf); + struct sk_buff *rbuf; + struct tipc_msg *rmsg; + int hdr_sz; + u32 imp = msg_importance(msg); + u32 data_sz = msg_data_sz(msg); + + if (data_sz > MAX_REJECT_SIZE) + data_sz = MAX_REJECT_SIZE; + if (msg_connected(msg) && (imp < TIPC_CRITICAL_IMPORTANCE)) + imp++; + + /* discard rejected message if it shouldn't be returned to sender */ + if (msg_errcode(msg) || msg_dest_droppable(msg)) { + buf_discard(buf); + return data_sz; + } + + /* construct rejected message */ + if (msg_mcast(msg)) + hdr_sz = MCAST_H_SIZE; + else + hdr_sz = LONG_H_SIZE; + rbuf = tipc_buf_acquire(data_sz + hdr_sz); + if (rbuf == NULL) { + buf_discard(buf); + return data_sz; + } + rmsg = buf_msg(rbuf); + tipc_msg_init(rmsg, imp, msg_type(msg), hdr_sz, msg_orignode(msg)); + msg_set_errcode(rmsg, err); + msg_set_destport(rmsg, msg_origport(msg)); + msg_set_origport(rmsg, msg_destport(msg)); + if (msg_short(msg)) { + msg_set_orignode(rmsg, tipc_own_addr); + /* leave name type & instance as zeroes */ + } else { + msg_set_orignode(rmsg, msg_destnode(msg)); + msg_set_nametype(rmsg, msg_nametype(msg)); + msg_set_nameinst(rmsg, msg_nameinst(msg)); + } + msg_set_size(rmsg, data_sz + hdr_sz); + skb_copy_to_linear_data_offset(rbuf, hdr_sz, msg_data(msg), data_sz); + + /* send self-abort message when rejecting on a connected port */ + if (msg_connected(msg)) { + struct sk_buff *abuf = NULL; + struct tipc_port *p_ptr = tipc_port_lock(msg_destport(msg)); + + if (p_ptr) { + if (p_ptr->connected) + abuf = port_build_self_abort_msg(p_ptr, err); + tipc_port_unlock(p_ptr); + } + tipc_net_route_msg(abuf); + } + + /* send rejected message */ + buf_discard(buf); + tipc_net_route_msg(rbuf); + return data_sz; +} + +int tipc_port_reject_sections(struct tipc_port *p_ptr, struct tipc_msg *hdr, + struct iovec const *msg_sect, u32 num_sect, + unsigned int total_len, int err) +{ + struct sk_buff *buf; + int res; + + res = tipc_msg_build(hdr, msg_sect, num_sect, total_len, MAX_MSG_SIZE, + !p_ptr->user_port, &buf); + if (!buf) + return res; + + return tipc_reject_msg(buf, err); +} + +static void port_timeout(unsigned long ref) +{ + struct tipc_port *p_ptr = tipc_port_lock(ref); + struct sk_buff *buf = NULL; + + if (!p_ptr) + return; + + if (!p_ptr->connected) { + tipc_port_unlock(p_ptr); + return; + } + + /* Last probe answered ? 
*/ + if (p_ptr->probing_state == PROBING) { + buf = port_build_self_abort_msg(p_ptr, TIPC_ERR_NO_PORT); + } else { + buf = port_build_proto_msg(port_peerport(p_ptr), + port_peernode(p_ptr), + p_ptr->ref, + tipc_own_addr, + CONN_MANAGER, + CONN_PROBE, + TIPC_OK, + 0); + p_ptr->probing_state = PROBING; + k_start_timer(&p_ptr->timer, p_ptr->probing_interval); + } + tipc_port_unlock(p_ptr); + tipc_net_route_msg(buf); +} + + +static void port_handle_node_down(unsigned long ref) +{ + struct tipc_port *p_ptr = tipc_port_lock(ref); + struct sk_buff *buf = NULL; + + if (!p_ptr) + return; + buf = port_build_self_abort_msg(p_ptr, TIPC_ERR_NO_NODE); + tipc_port_unlock(p_ptr); + tipc_net_route_msg(buf); +} + + +static struct sk_buff *port_build_self_abort_msg(struct tipc_port *p_ptr, u32 err) +{ + u32 imp = msg_importance(&p_ptr->phdr); + + if (!p_ptr->connected) + return NULL; + if (imp < TIPC_CRITICAL_IMPORTANCE) + imp++; + return port_build_proto_msg(p_ptr->ref, + tipc_own_addr, + port_peerport(p_ptr), + port_peernode(p_ptr), + imp, + TIPC_CONN_MSG, + err, + 0); +} + + +static struct sk_buff *port_build_peer_abort_msg(struct tipc_port *p_ptr, u32 err) +{ + u32 imp = msg_importance(&p_ptr->phdr); + + if (!p_ptr->connected) + return NULL; + if (imp < TIPC_CRITICAL_IMPORTANCE) + imp++; + return port_build_proto_msg(port_peerport(p_ptr), + port_peernode(p_ptr), + p_ptr->ref, + tipc_own_addr, + imp, + TIPC_CONN_MSG, + err, + 0); +} + +void tipc_port_recv_proto_msg(struct sk_buff *buf) +{ + struct tipc_msg *msg = buf_msg(buf); + struct tipc_port *p_ptr = tipc_port_lock(msg_destport(msg)); + u32 err = TIPC_OK; + struct sk_buff *r_buf = NULL; + struct sk_buff *abort_buf = NULL; + + if (!p_ptr) { + err = TIPC_ERR_NO_PORT; + } else if (p_ptr->connected) { + if ((port_peernode(p_ptr) != msg_orignode(msg)) || + (port_peerport(p_ptr) != msg_origport(msg))) { + err = TIPC_ERR_NO_PORT; + } else if (msg_type(msg) == CONN_ACK) { + int wakeup = tipc_port_congested(p_ptr) && + p_ptr->congested && + p_ptr->wakeup; + p_ptr->acked += msg_msgcnt(msg); + if (tipc_port_congested(p_ptr)) + goto exit; + p_ptr->congested = 0; + if (!wakeup) + goto exit; + p_ptr->wakeup(p_ptr); + goto exit; + } + } else if (p_ptr->published) { + err = TIPC_ERR_NO_PORT; + } + if (err) { + r_buf = port_build_proto_msg(msg_origport(msg), + msg_orignode(msg), + msg_destport(msg), + tipc_own_addr, + TIPC_HIGH_IMPORTANCE, + TIPC_CONN_MSG, + err, + 0); + goto exit; + } + + /* All is fine */ + if (msg_type(msg) == CONN_PROBE) { + r_buf = port_build_proto_msg(msg_origport(msg), + msg_orignode(msg), + msg_destport(msg), + tipc_own_addr, + CONN_MANAGER, + CONN_PROBE_REPLY, + TIPC_OK, + 0); + } + p_ptr->probing_state = CONFIRMED; +exit: + if (p_ptr) + tipc_port_unlock(p_ptr); + tipc_net_route_msg(r_buf); + tipc_net_route_msg(abort_buf); + buf_discard(buf); +} + +static void port_print(struct tipc_port *p_ptr, struct print_buf *buf, int full_id) +{ + struct publication *publ; + + if (full_id) + tipc_printf(buf, "<%u.%u.%u:%u>:", + tipc_zone(tipc_own_addr), tipc_cluster(tipc_own_addr), + tipc_node(tipc_own_addr), p_ptr->ref); + else + tipc_printf(buf, "%-10u:", p_ptr->ref); + + if (p_ptr->connected) { + u32 dport = port_peerport(p_ptr); + u32 destnode = port_peernode(p_ptr); + + tipc_printf(buf, " connected to <%u.%u.%u:%u>", + tipc_zone(destnode), tipc_cluster(destnode), + tipc_node(destnode), dport); + if (p_ptr->conn_type != 0) + tipc_printf(buf, " via {%u,%u}", + p_ptr->conn_type, + p_ptr->conn_instance); + } else if (p_ptr->published) { + 
tipc_printf(buf, " bound to"); + list_for_each_entry(publ, &p_ptr->publications, pport_list) { + if (publ->lower == publ->upper) + tipc_printf(buf, " {%u,%u}", publ->type, + publ->lower); + else + tipc_printf(buf, " {%u,%u,%u}", publ->type, + publ->lower, publ->upper); + } + } + tipc_printf(buf, "\n"); +} + +#define MAX_PORT_QUERY 32768 + +struct sk_buff *tipc_port_get_ports(void) +{ + struct sk_buff *buf; + struct tlv_desc *rep_tlv; + struct print_buf pb; + struct tipc_port *p_ptr; + int str_len; + + buf = tipc_cfg_reply_alloc(TLV_SPACE(MAX_PORT_QUERY)); + if (!buf) + return NULL; + rep_tlv = (struct tlv_desc *)buf->data; + + tipc_printbuf_init(&pb, TLV_DATA(rep_tlv), MAX_PORT_QUERY); + spin_lock_bh(&tipc_port_list_lock); + list_for_each_entry(p_ptr, &ports, port_list) { + spin_lock_bh(p_ptr->lock); + port_print(p_ptr, &pb, 0); + spin_unlock_bh(p_ptr->lock); + } + spin_unlock_bh(&tipc_port_list_lock); + str_len = tipc_printbuf_validate(&pb); + + skb_put(buf, TLV_SPACE(str_len)); + TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len); + + return buf; +} + +void tipc_port_reinit(void) +{ + struct tipc_port *p_ptr; + struct tipc_msg *msg; + + spin_lock_bh(&tipc_port_list_lock); + list_for_each_entry(p_ptr, &ports, port_list) { + msg = &p_ptr->phdr; + if (msg_orignode(msg) == tipc_own_addr) + break; + msg_set_prevnode(msg, tipc_own_addr); + msg_set_orignode(msg, tipc_own_addr); + } + spin_unlock_bh(&tipc_port_list_lock); +} + + +/* + * port_dispatcher_sigh(): Signal handler for messages destinated + * to the tipc_port interface. + */ + +static void port_dispatcher_sigh(void *dummy) +{ + struct sk_buff *buf; + + spin_lock_bh(&queue_lock); + buf = msg_queue_head; + msg_queue_head = NULL; + spin_unlock_bh(&queue_lock); + + while (buf) { + struct tipc_port *p_ptr; + struct user_port *up_ptr; + struct tipc_portid orig; + struct tipc_name_seq dseq; + void *usr_handle; + int connected; + int published; + u32 message_type; + + struct sk_buff *next = buf->next; + struct tipc_msg *msg = buf_msg(buf); + u32 dref = msg_destport(msg); + + message_type = msg_type(msg); + if (message_type > TIPC_DIRECT_MSG) + goto reject; /* Unsupported message type */ + + p_ptr = tipc_port_lock(dref); + if (!p_ptr) + goto reject; /* Port deleted while msg in queue */ + + orig.ref = msg_origport(msg); + orig.node = msg_orignode(msg); + up_ptr = p_ptr->user_port; + usr_handle = up_ptr->usr_handle; + connected = p_ptr->connected; + published = p_ptr->published; + + if (unlikely(msg_errcode(msg))) + goto err; + + switch (message_type) { + + case TIPC_CONN_MSG:{ + tipc_conn_msg_event cb = up_ptr->conn_msg_cb; + u32 peer_port = port_peerport(p_ptr); + u32 peer_node = port_peernode(p_ptr); + u32 dsz; + + tipc_port_unlock(p_ptr); + if (unlikely(!cb)) + goto reject; + if (unlikely(!connected)) { + if (tipc_connect2port(dref, &orig)) + goto reject; + } else if ((msg_origport(msg) != peer_port) || + (msg_orignode(msg) != peer_node)) + goto reject; + dsz = msg_data_sz(msg); + if (unlikely(dsz && + (++p_ptr->conn_unacked >= + TIPC_FLOW_CONTROL_WIN))) + tipc_acknowledge(dref, + p_ptr->conn_unacked); + skb_pull(buf, msg_hdr_sz(msg)); + cb(usr_handle, dref, &buf, msg_data(msg), dsz); + break; + } + case TIPC_DIRECT_MSG:{ + tipc_msg_event cb = up_ptr->msg_cb; + + tipc_port_unlock(p_ptr); + if (unlikely(!cb || connected)) + goto reject; + skb_pull(buf, msg_hdr_sz(msg)); + cb(usr_handle, dref, &buf, msg_data(msg), + msg_data_sz(msg), msg_importance(msg), + &orig); + break; + } + case TIPC_MCAST_MSG: + case TIPC_NAMED_MSG:{ + 
tipc_named_msg_event cb = up_ptr->named_msg_cb; + + tipc_port_unlock(p_ptr); + if (unlikely(!cb || connected || !published)) + goto reject; + dseq.type = msg_nametype(msg); + dseq.lower = msg_nameinst(msg); + dseq.upper = (message_type == TIPC_NAMED_MSG) + ? dseq.lower : msg_nameupper(msg); + skb_pull(buf, msg_hdr_sz(msg)); + cb(usr_handle, dref, &buf, msg_data(msg), + msg_data_sz(msg), msg_importance(msg), + &orig, &dseq); + break; + } + } + if (buf) + buf_discard(buf); + buf = next; + continue; +err: + switch (message_type) { + + case TIPC_CONN_MSG:{ + tipc_conn_shutdown_event cb = + up_ptr->conn_err_cb; + u32 peer_port = port_peerport(p_ptr); + u32 peer_node = port_peernode(p_ptr); + + tipc_port_unlock(p_ptr); + if (!cb || !connected) + break; + if ((msg_origport(msg) != peer_port) || + (msg_orignode(msg) != peer_node)) + break; + tipc_disconnect(dref); + skb_pull(buf, msg_hdr_sz(msg)); + cb(usr_handle, dref, &buf, msg_data(msg), + msg_data_sz(msg), msg_errcode(msg)); + break; + } + case TIPC_DIRECT_MSG:{ + tipc_msg_err_event cb = up_ptr->err_cb; + + tipc_port_unlock(p_ptr); + if (!cb || connected) + break; + skb_pull(buf, msg_hdr_sz(msg)); + cb(usr_handle, dref, &buf, msg_data(msg), + msg_data_sz(msg), msg_errcode(msg), &orig); + break; + } + case TIPC_MCAST_MSG: + case TIPC_NAMED_MSG:{ + tipc_named_msg_err_event cb = + up_ptr->named_err_cb; + + tipc_port_unlock(p_ptr); + if (!cb || connected) + break; + dseq.type = msg_nametype(msg); + dseq.lower = msg_nameinst(msg); + dseq.upper = (message_type == TIPC_NAMED_MSG) + ? dseq.lower : msg_nameupper(msg); + skb_pull(buf, msg_hdr_sz(msg)); + cb(usr_handle, dref, &buf, msg_data(msg), + msg_data_sz(msg), msg_errcode(msg), &dseq); + break; + } + } + if (buf) + buf_discard(buf); + buf = next; + continue; +reject: + tipc_reject_msg(buf, TIPC_ERR_NO_PORT); + buf = next; + } +} + +/* + * port_dispatcher(): Dispatcher for messages destinated + * to the tipc_port interface. Called with port locked. + */ + +static u32 port_dispatcher(struct tipc_port *dummy, struct sk_buff *buf) +{ + buf->next = NULL; + spin_lock_bh(&queue_lock); + if (msg_queue_head) { + msg_queue_tail->next = buf; + msg_queue_tail = buf; + } else { + msg_queue_tail = msg_queue_head = buf; + tipc_k_signal((Handler)port_dispatcher_sigh, 0); + } + spin_unlock_bh(&queue_lock); + return 0; +} + +/* + * Wake up port after congestion: Called with port locked, + * + */ + +static void port_wakeup_sh(unsigned long ref) +{ + struct tipc_port *p_ptr; + struct user_port *up_ptr; + tipc_continue_event cb = NULL; + void *uh = NULL; + + p_ptr = tipc_port_lock(ref); + if (p_ptr) { + up_ptr = p_ptr->user_port; + if (up_ptr) { + cb = up_ptr->continue_event_cb; + uh = up_ptr->usr_handle; + } + tipc_port_unlock(p_ptr); + } + if (cb) + cb(uh, ref); +} + + +static void port_wakeup(struct tipc_port *p_ptr) +{ + tipc_k_signal((Handler)port_wakeup_sh, p_ptr->ref); +} + +void tipc_acknowledge(u32 ref, u32 ack) +{ + struct tipc_port *p_ptr; + struct sk_buff *buf = NULL; + + p_ptr = tipc_port_lock(ref); + if (!p_ptr) + return; + if (p_ptr->connected) { + p_ptr->conn_unacked -= ack; + buf = port_build_proto_msg(port_peerport(p_ptr), + port_peernode(p_ptr), + ref, + tipc_own_addr, + CONN_MANAGER, + CONN_ACK, + TIPC_OK, + ack); + } + tipc_port_unlock(p_ptr); + tipc_net_route_msg(buf); +} + +/* + * tipc_createport(): user level call. 
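+ * Allocates a user_port to hold the supplied callbacks and attaches it to a
+ * raw port created via tipc_createport_raw(); on success the new port's
+ * reference is returned in *portref.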
+ */ + +int tipc_createport(void *usr_handle, + unsigned int importance, + tipc_msg_err_event error_cb, + tipc_named_msg_err_event named_error_cb, + tipc_conn_shutdown_event conn_error_cb, + tipc_msg_event msg_cb, + tipc_named_msg_event named_msg_cb, + tipc_conn_msg_event conn_msg_cb, + tipc_continue_event continue_event_cb,/* May be zero */ + u32 *portref) +{ + struct user_port *up_ptr; + struct tipc_port *p_ptr; + + up_ptr = kmalloc(sizeof(*up_ptr), GFP_ATOMIC); + if (!up_ptr) { + warn("Port creation failed, no memory\n"); + return -ENOMEM; + } + p_ptr = (struct tipc_port *)tipc_createport_raw(NULL, port_dispatcher, + port_wakeup, importance); + if (!p_ptr) { + kfree(up_ptr); + return -ENOMEM; + } + + p_ptr->user_port = up_ptr; + up_ptr->usr_handle = usr_handle; + up_ptr->ref = p_ptr->ref; + up_ptr->err_cb = error_cb; + up_ptr->named_err_cb = named_error_cb; + up_ptr->conn_err_cb = conn_error_cb; + up_ptr->msg_cb = msg_cb; + up_ptr->named_msg_cb = named_msg_cb; + up_ptr->conn_msg_cb = conn_msg_cb; + up_ptr->continue_event_cb = continue_event_cb; + *portref = p_ptr->ref; + tipc_port_unlock(p_ptr); + return 0; +} + +int tipc_portimportance(u32 ref, unsigned int *importance) +{ + struct tipc_port *p_ptr; + + p_ptr = tipc_port_lock(ref); + if (!p_ptr) + return -EINVAL; + *importance = (unsigned int)msg_importance(&p_ptr->phdr); + tipc_port_unlock(p_ptr); + return 0; +} + +int tipc_set_portimportance(u32 ref, unsigned int imp) +{ + struct tipc_port *p_ptr; + + if (imp > TIPC_CRITICAL_IMPORTANCE) + return -EINVAL; + + p_ptr = tipc_port_lock(ref); + if (!p_ptr) + return -EINVAL; + msg_set_importance(&p_ptr->phdr, (u32)imp); + tipc_port_unlock(p_ptr); + return 0; +} + + +int tipc_publish(u32 ref, unsigned int scope, struct tipc_name_seq const *seq) +{ + struct tipc_port *p_ptr; + struct publication *publ; + u32 key; + int res = -EINVAL; + + p_ptr = tipc_port_lock(ref); + if (!p_ptr) + return -EINVAL; + + if (p_ptr->connected) + goto exit; + if (seq->lower > seq->upper) + goto exit; + if ((scope < TIPC_ZONE_SCOPE) || (scope > TIPC_NODE_SCOPE)) + goto exit; + key = ref + p_ptr->pub_count + 1; + if (key == ref) { + res = -EADDRINUSE; + goto exit; + } + publ = tipc_nametbl_publish(seq->type, seq->lower, seq->upper, + scope, p_ptr->ref, key); + if (publ) { + list_add(&publ->pport_list, &p_ptr->publications); + p_ptr->pub_count++; + p_ptr->published = 1; + res = 0; + } +exit: + tipc_port_unlock(p_ptr); + return res; +} + +int tipc_withdraw(u32 ref, unsigned int scope, struct tipc_name_seq const *seq) +{ + struct tipc_port *p_ptr; + struct publication *publ; + struct publication *tpubl; + int res = -EINVAL; + + p_ptr = tipc_port_lock(ref); + if (!p_ptr) + return -EINVAL; + if (!seq) { + list_for_each_entry_safe(publ, tpubl, + &p_ptr->publications, pport_list) { + tipc_nametbl_withdraw(publ->type, publ->lower, + publ->ref, publ->key); + } + res = 0; + } else { + list_for_each_entry_safe(publ, tpubl, + &p_ptr->publications, pport_list) { + if (publ->scope != scope) + continue; + if (publ->type != seq->type) + continue; + if (publ->lower != seq->lower) + continue; + if (publ->upper != seq->upper) + break; + tipc_nametbl_withdraw(publ->type, publ->lower, + publ->ref, publ->key); + res = 0; + break; + } + } + if (list_empty(&p_ptr->publications)) + p_ptr->published = 0; + tipc_port_unlock(p_ptr); + return res; +} + +int tipc_connect2port(u32 ref, struct tipc_portid const *peer) +{ + struct tipc_port *p_ptr; + struct tipc_msg *msg; + int res = -EINVAL; + + p_ptr = tipc_port_lock(ref); + if (!p_ptr) + return 
-EINVAL; + if (p_ptr->published || p_ptr->connected) + goto exit; + if (!peer->ref) + goto exit; + + msg = &p_ptr->phdr; + msg_set_destnode(msg, peer->node); + msg_set_destport(msg, peer->ref); + msg_set_orignode(msg, tipc_own_addr); + msg_set_origport(msg, p_ptr->ref); + msg_set_type(msg, TIPC_CONN_MSG); + msg_set_lookup_scope(msg, 0); + msg_set_hdr_sz(msg, SHORT_H_SIZE); + + p_ptr->probing_interval = PROBING_INTERVAL; + p_ptr->probing_state = CONFIRMED; + p_ptr->connected = 1; + k_start_timer(&p_ptr->timer, p_ptr->probing_interval); + + tipc_nodesub_subscribe(&p_ptr->subscription, peer->node, + (void *)(unsigned long)ref, + (net_ev_handler)port_handle_node_down); + res = 0; +exit: + tipc_port_unlock(p_ptr); + p_ptr->max_pkt = tipc_link_get_max_pkt(peer->node, ref); + return res; +} + +/** + * tipc_disconnect_port - disconnect port from peer + * + * Port must be locked. + */ + +int tipc_disconnect_port(struct tipc_port *tp_ptr) +{ + int res; + + if (tp_ptr->connected) { + tp_ptr->connected = 0; + /* let timer expire on its own to avoid deadlock! */ + tipc_nodesub_unsubscribe( + &((struct tipc_port *)tp_ptr)->subscription); + res = 0; + } else { + res = -ENOTCONN; + } + return res; +} + +/* + * tipc_disconnect(): Disconnect port from peer. + * This is a node local operation. + */ + +int tipc_disconnect(u32 ref) +{ + struct tipc_port *p_ptr; + int res; + + p_ptr = tipc_port_lock(ref); + if (!p_ptr) + return -EINVAL; + res = tipc_disconnect_port((struct tipc_port *)p_ptr); + tipc_port_unlock(p_ptr); + return res; +} + +/* + * tipc_shutdown(): Send a SHUTDOWN msg to peer and disconnect + */ +int tipc_shutdown(u32 ref) +{ + struct tipc_port *p_ptr; + struct sk_buff *buf = NULL; + + p_ptr = tipc_port_lock(ref); + if (!p_ptr) + return -EINVAL; + + if (p_ptr->connected) { + u32 imp = msg_importance(&p_ptr->phdr); + if (imp < TIPC_CRITICAL_IMPORTANCE) + imp++; + buf = port_build_proto_msg(port_peerport(p_ptr), + port_peernode(p_ptr), + ref, + tipc_own_addr, + imp, + TIPC_CONN_MSG, + TIPC_CONN_SHUTDOWN, + 0); + } + tipc_port_unlock(p_ptr); + tipc_net_route_msg(buf); + return tipc_disconnect(ref); +} + +/* + * tipc_port_recv_sections(): Concatenate and deliver sectioned + * message for this node. 
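+ * Builds a single message from the iovec sections with tipc_msg_build() and
+ * loops it back through tipc_port_recv_msg(); used by the send routines when
+ * the destination is the local node.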
+ */ + +static int tipc_port_recv_sections(struct tipc_port *sender, unsigned int num_sect, + struct iovec const *msg_sect, + unsigned int total_len) +{ + struct sk_buff *buf; + int res; + + res = tipc_msg_build(&sender->phdr, msg_sect, num_sect, total_len, + MAX_MSG_SIZE, !sender->user_port, &buf); + if (likely(buf)) + tipc_port_recv_msg(buf); + return res; +} + +/** + * tipc_send - send message sections on connection + */ + +int tipc_send(u32 ref, unsigned int num_sect, struct iovec const *msg_sect, + unsigned int total_len) +{ + struct tipc_port *p_ptr; + u32 destnode; + int res; + + p_ptr = tipc_port_deref(ref); + if (!p_ptr || !p_ptr->connected) + return -EINVAL; + + p_ptr->congested = 1; + if (!tipc_port_congested(p_ptr)) { + destnode = port_peernode(p_ptr); + if (likely(destnode != tipc_own_addr)) + res = tipc_link_send_sections_fast(p_ptr, msg_sect, num_sect, + total_len, destnode); + else + res = tipc_port_recv_sections(p_ptr, num_sect, msg_sect, + total_len); + + if (likely(res != -ELINKCONG)) { + p_ptr->congested = 0; + if (res > 0) + p_ptr->sent++; + return res; + } + } + if (port_unreliable(p_ptr)) { + p_ptr->congested = 0; + return total_len; + } + return -ELINKCONG; +} + +/** + * tipc_send2name - send message sections to port name + */ + +int tipc_send2name(u32 ref, struct tipc_name const *name, unsigned int domain, + unsigned int num_sect, struct iovec const *msg_sect, + unsigned int total_len) +{ + struct tipc_port *p_ptr; + struct tipc_msg *msg; + u32 destnode = domain; + u32 destport; + int res; + + p_ptr = tipc_port_deref(ref); + if (!p_ptr || p_ptr->connected) + return -EINVAL; + + msg = &p_ptr->phdr; + msg_set_type(msg, TIPC_NAMED_MSG); + msg_set_orignode(msg, tipc_own_addr); + msg_set_origport(msg, ref); + msg_set_hdr_sz(msg, LONG_H_SIZE); + msg_set_nametype(msg, name->type); + msg_set_nameinst(msg, name->instance); + msg_set_lookup_scope(msg, tipc_addr_scope(domain)); + destport = tipc_nametbl_translate(name->type, name->instance, &destnode); + msg_set_destnode(msg, destnode); + msg_set_destport(msg, destport); + + if (likely(destport)) { + if (likely(destnode == tipc_own_addr)) + res = tipc_port_recv_sections(p_ptr, num_sect, + msg_sect, total_len); + else + res = tipc_link_send_sections_fast(p_ptr, msg_sect, + num_sect, total_len, + destnode); + if (likely(res != -ELINKCONG)) { + if (res > 0) + p_ptr->sent++; + return res; + } + if (port_unreliable(p_ptr)) { + return total_len; + } + return -ELINKCONG; + } + return tipc_port_reject_sections(p_ptr, msg, msg_sect, num_sect, + total_len, TIPC_ERR_NO_NAME); +} + +/** + * tipc_send2port - send message sections to port identity + */ + +int tipc_send2port(u32 ref, struct tipc_portid const *dest, + unsigned int num_sect, struct iovec const *msg_sect, + unsigned int total_len) +{ + struct tipc_port *p_ptr; + struct tipc_msg *msg; + int res; + + p_ptr = tipc_port_deref(ref); + if (!p_ptr || p_ptr->connected) + return -EINVAL; + + msg = &p_ptr->phdr; + msg_set_type(msg, TIPC_DIRECT_MSG); + msg_set_lookup_scope(msg, 0); + msg_set_orignode(msg, tipc_own_addr); + msg_set_origport(msg, ref); + msg_set_destnode(msg, dest->node); + msg_set_destport(msg, dest->ref); + msg_set_hdr_sz(msg, DIR_MSG_H_SIZE); + + if (dest->node == tipc_own_addr) + res = tipc_port_recv_sections(p_ptr, num_sect, msg_sect, + total_len); + else + res = tipc_link_send_sections_fast(p_ptr, msg_sect, num_sect, + total_len, dest->node); + if (likely(res != -ELINKCONG)) { + if (res > 0) + p_ptr->sent++; + return res; + } + if (port_unreliable(p_ptr)) { + return 
total_len; + } + return -ELINKCONG; +} + +/** + * tipc_send_buf2port - send message buffer to port identity + */ + +int tipc_send_buf2port(u32 ref, struct tipc_portid const *dest, + struct sk_buff *buf, unsigned int dsz) +{ + struct tipc_port *p_ptr; + struct tipc_msg *msg; + int res; + + p_ptr = (struct tipc_port *)tipc_ref_deref(ref); + if (!p_ptr || p_ptr->connected) + return -EINVAL; + + msg = &p_ptr->phdr; + msg_set_type(msg, TIPC_DIRECT_MSG); + msg_set_orignode(msg, tipc_own_addr); + msg_set_origport(msg, ref); + msg_set_destnode(msg, dest->node); + msg_set_destport(msg, dest->ref); + msg_set_hdr_sz(msg, DIR_MSG_H_SIZE); + msg_set_size(msg, DIR_MSG_H_SIZE + dsz); + if (skb_cow(buf, DIR_MSG_H_SIZE)) + return -ENOMEM; + + skb_push(buf, DIR_MSG_H_SIZE); + skb_copy_to_linear_data(buf, msg, DIR_MSG_H_SIZE); + + if (dest->node == tipc_own_addr) + res = tipc_port_recv_msg(buf); + else + res = tipc_send_buf_fast(buf, dest->node); + if (likely(res != -ELINKCONG)) { + if (res > 0) + p_ptr->sent++; + return res; + } + if (port_unreliable(p_ptr)) + return dsz; + return -ELINKCONG; +} + diff --git a/net/tipc/port.h b/net/tipc/port.h new file mode 100644 index 00000000..b9aa3419 --- /dev/null +++ b/net/tipc/port.h @@ -0,0 +1,315 @@ +/* + * net/tipc/port.h: Include file for TIPC port code + * + * Copyright (c) 1994-2007, Ericsson AB + * Copyright (c) 2004-2007, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _TIPC_PORT_H +#define _TIPC_PORT_H + +#include "ref.h" +#include "net.h" +#include "msg.h" +#include "node_subscr.h" + +#define TIPC_FLOW_CONTROL_WIN 512 + +typedef void (*tipc_msg_err_event) (void *usr_handle, u32 portref, + struct sk_buff **buf, unsigned char const *data, + unsigned int size, int reason, + struct tipc_portid const *attmpt_destid); + +typedef void (*tipc_named_msg_err_event) (void *usr_handle, u32 portref, + struct sk_buff **buf, unsigned char const *data, + unsigned int size, int reason, + struct tipc_name_seq const *attmpt_dest); + +typedef void (*tipc_conn_shutdown_event) (void *usr_handle, u32 portref, + struct sk_buff **buf, unsigned char const *data, + unsigned int size, int reason); + +typedef void (*tipc_msg_event) (void *usr_handle, u32 portref, + struct sk_buff **buf, unsigned char const *data, + unsigned int size, unsigned int importance, + struct tipc_portid const *origin); + +typedef void (*tipc_named_msg_event) (void *usr_handle, u32 portref, + struct sk_buff **buf, unsigned char const *data, + unsigned int size, unsigned int importance, + struct tipc_portid const *orig, + struct tipc_name_seq const *dest); + +typedef void (*tipc_conn_msg_event) (void *usr_handle, u32 portref, + struct sk_buff **buf, unsigned char const *data, + unsigned int size); + +typedef void (*tipc_continue_event) (void *usr_handle, u32 portref); + +/** + * struct user_port - TIPC user port (used with native API) + * @usr_handle: user-specified field + * @ref: object reference to associated TIPC port + * + */ + +struct user_port { + void *usr_handle; + u32 ref; + tipc_msg_err_event err_cb; + tipc_named_msg_err_event named_err_cb; + tipc_conn_shutdown_event conn_err_cb; + tipc_msg_event msg_cb; + tipc_named_msg_event named_msg_cb; + tipc_conn_msg_event conn_msg_cb; + tipc_continue_event continue_event_cb; +}; + +/** + * struct tipc_port - TIPC port structure + * @usr_handle: pointer to additional user-defined information about port + * @lock: pointer to spinlock for controlling access to port + * @connected: non-zero if port is currently connected to a peer port + * @conn_type: TIPC type used when connection was established + * @conn_instance: TIPC instance used when connection was established + * @conn_unacked: number of unacknowledged messages received from peer port + * @published: non-zero if port has one or more associated names + * @congested: non-zero if cannot send because of link or port congestion + * @max_pkt: maximum packet size "hint" used when building messages sent by port + * @ref: unique reference to port in TIPC object registry + * @phdr: preformatted message header used when sending messages + * @port_list: adjacent ports in TIPC's global list of ports + * @dispatcher: ptr to routine which handles received messages + * @wakeup: ptr to routine to call when port is no longer congested + * @user_port: ptr to user port associated with port (if any) + * @wait_list: adjacent ports in list of ports waiting on link congestion + * @waiting_pkts: + * @sent: # of non-empty messages sent by port + * @acked: # of non-empty message acknowledgements from connected port's peer + * @publications: list of publications for port + * @pub_count: total # of publications port has made during its lifetime + * @probing_state: + * @probing_interval: + * @timer_ref: + * @subscription: "node down" subscription used to terminate failed connections + */ +struct tipc_port { + void *usr_handle; + spinlock_t *lock; + int connected; + u32 conn_type; + u32 conn_instance; + u32 
conn_unacked; + int published; + u32 congested; + u32 max_pkt; + u32 ref; + struct tipc_msg phdr; + struct list_head port_list; + u32 (*dispatcher)(struct tipc_port *, struct sk_buff *); + void (*wakeup)(struct tipc_port *); + struct user_port *user_port; + struct list_head wait_list; + u32 waiting_pkts; + u32 sent; + u32 acked; + struct list_head publications; + u32 pub_count; + u32 probing_state; + u32 probing_interval; + struct timer_list timer; + struct tipc_node_subscr subscription; +}; + +extern spinlock_t tipc_port_list_lock; +struct port_list; + +/* + * TIPC port manipulation routines + */ +struct tipc_port *tipc_createport_raw(void *usr_handle, + u32 (*dispatcher)(struct tipc_port *, struct sk_buff *), + void (*wakeup)(struct tipc_port *), const u32 importance); + +int tipc_reject_msg(struct sk_buff *buf, u32 err); + +int tipc_send_buf_fast(struct sk_buff *buf, u32 destnode); + +void tipc_acknowledge(u32 port_ref, u32 ack); + +int tipc_createport(void *usr_handle, + unsigned int importance, tipc_msg_err_event error_cb, + tipc_named_msg_err_event named_error_cb, + tipc_conn_shutdown_event conn_error_cb, tipc_msg_event msg_cb, + tipc_named_msg_event named_msg_cb, + tipc_conn_msg_event conn_msg_cb, + tipc_continue_event continue_event_cb, u32 *portref); + +int tipc_deleteport(u32 portref); + +int tipc_portimportance(u32 portref, unsigned int *importance); +int tipc_set_portimportance(u32 portref, unsigned int importance); + +int tipc_portunreliable(u32 portref, unsigned int *isunreliable); +int tipc_set_portunreliable(u32 portref, unsigned int isunreliable); + +int tipc_portunreturnable(u32 portref, unsigned int *isunreturnable); +int tipc_set_portunreturnable(u32 portref, unsigned int isunreturnable); + +int tipc_publish(u32 portref, unsigned int scope, + struct tipc_name_seq const *name_seq); +int tipc_withdraw(u32 portref, unsigned int scope, + struct tipc_name_seq const *name_seq); + +int tipc_connect2port(u32 portref, struct tipc_portid const *port); + +int tipc_disconnect(u32 portref); + +int tipc_shutdown(u32 ref); + + +/* + * The following routines require that the port be locked on entry + */ +int tipc_disconnect_port(struct tipc_port *tp_ptr); + +/* + * TIPC messaging routines + */ +int tipc_send(u32 portref, unsigned int num_sect, struct iovec const *msg_sect, + unsigned int total_len); + +int tipc_send2name(u32 portref, struct tipc_name const *name, u32 domain, + unsigned int num_sect, struct iovec const *msg_sect, + unsigned int total_len); + +int tipc_send2port(u32 portref, struct tipc_portid const *dest, + unsigned int num_sect, struct iovec const *msg_sect, + unsigned int total_len); + +int tipc_send_buf2port(u32 portref, struct tipc_portid const *dest, + struct sk_buff *buf, unsigned int dsz); + +int tipc_multicast(u32 portref, struct tipc_name_seq const *seq, + unsigned int section_count, struct iovec const *msg, + unsigned int total_len); + +int tipc_port_reject_sections(struct tipc_port *p_ptr, struct tipc_msg *hdr, + struct iovec const *msg_sect, u32 num_sect, + unsigned int total_len, int err); +struct sk_buff *tipc_port_get_ports(void); +void tipc_port_recv_proto_msg(struct sk_buff *buf); +void tipc_port_recv_mcast(struct sk_buff *buf, struct port_list *dp); +void tipc_port_reinit(void); + +/** + * tipc_port_lock - lock port instance referred to and return its pointer + */ + +static inline struct tipc_port *tipc_port_lock(u32 ref) +{ + return (struct tipc_port *)tipc_ref_lock(ref); +} + +/** + * tipc_port_unlock - unlock a port instance + * + * Can use pointer 
instead of tipc_ref_unlock() since port is already locked. + */ + +static inline void tipc_port_unlock(struct tipc_port *p_ptr) +{ + spin_unlock_bh(p_ptr->lock); +} + +static inline struct tipc_port *tipc_port_deref(u32 ref) +{ + return (struct tipc_port *)tipc_ref_deref(ref); +} + +static inline u32 tipc_peer_port(struct tipc_port *p_ptr) +{ + return msg_destport(&p_ptr->phdr); +} + +static inline u32 tipc_peer_node(struct tipc_port *p_ptr) +{ + return msg_destnode(&p_ptr->phdr); +} + +static inline int tipc_port_congested(struct tipc_port *p_ptr) +{ + return (p_ptr->sent - p_ptr->acked) >= (TIPC_FLOW_CONTROL_WIN * 2); +} + +/** + * tipc_port_recv_msg - receive message from lower layer and deliver to port user + */ + +static inline int tipc_port_recv_msg(struct sk_buff *buf) +{ + struct tipc_port *p_ptr; + struct tipc_msg *msg = buf_msg(buf); + u32 destport = msg_destport(msg); + u32 dsz = msg_data_sz(msg); + u32 err; + + /* forward unresolved named message */ + if (unlikely(!destport)) { + tipc_net_route_msg(buf); + return dsz; + } + + /* validate destination & pass to port, otherwise reject message */ + p_ptr = tipc_port_lock(destport); + if (likely(p_ptr)) { + if (likely(p_ptr->connected)) { + if ((unlikely(msg_origport(msg) != tipc_peer_port(p_ptr))) || + (unlikely(msg_orignode(msg) != tipc_peer_node(p_ptr))) || + (unlikely(!msg_connected(msg)))) { + err = TIPC_ERR_NO_PORT; + tipc_port_unlock(p_ptr); + goto reject; + } + } + err = p_ptr->dispatcher(p_ptr, buf); + tipc_port_unlock(p_ptr); + if (likely(!err)) + return dsz; + } else { + err = TIPC_ERR_NO_PORT; + } +reject: + return tipc_reject_msg(buf, err); +} + +#endif diff --git a/net/tipc/ref.c b/net/tipc/ref.c new file mode 100644 index 00000000..83116892 --- /dev/null +++ b/net/tipc/ref.c @@ -0,0 +1,300 @@ +/* + * net/tipc/ref.c: TIPC object registry code + * + * Copyright (c) 1991-2006, Ericsson AB + * Copyright (c) 2004-2007, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "ref.h" + +/** + * struct reference - TIPC object reference entry + * @object: pointer to object associated with reference entry + * @lock: spinlock controlling access to object + * @ref: reference value for object (combines instance & array index info) + */ + +struct reference { + void *object; + spinlock_t lock; + u32 ref; +}; + +/** + * struct tipc_ref_table - table of TIPC object reference entries + * @entries: pointer to array of reference entries + * @capacity: array index of first unusable entry + * @init_point: array index of first uninitialized entry + * @first_free: array index of first unused object reference entry + * @last_free: array index of last unused object reference entry + * @index_mask: bitmask for array index portion of reference values + * @start_mask: initial value for instance value portion of reference values + */ + +struct ref_table { + struct reference *entries; + u32 capacity; + u32 init_point; + u32 first_free; + u32 last_free; + u32 index_mask; + u32 start_mask; +}; + +/* + * Object reference table consists of 2**N entries. + * + * State Object ptr Reference + * ----- ---------- --------- + * In use non-NULL XXXX|own index + * (XXXX changes each time entry is acquired) + * Free NULL YYYY|next free index + * (YYYY is one more than last used XXXX) + * Uninitialized NULL 0 + * + * Entry 0 is not used; this allows index 0 to denote the end of the free list. + * + * Note that a reference value of 0 does not necessarily indicate that an + * entry is uninitialized, since the last entry in the free list could also + * have a reference value of 0 (although this is unlikely). + */ + +static struct ref_table tipc_ref_table; + +static DEFINE_RWLOCK(ref_table_lock); + +/** + * tipc_ref_table_init - create reference table for objects + */ + +int tipc_ref_table_init(u32 requested_size, u32 start) +{ + struct reference *table; + u32 actual_size; + + /* account for unused entry, then round up size to a power of 2 */ + + requested_size++; + for (actual_size = 16; actual_size < requested_size; actual_size <<= 1) + /* do nothing */ ; + + /* allocate table & mark all entries as uninitialized */ + + table = __vmalloc(actual_size * sizeof(struct reference), + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); + if (table == NULL) + return -ENOMEM; + + tipc_ref_table.entries = table; + tipc_ref_table.capacity = requested_size; + tipc_ref_table.init_point = 1; + tipc_ref_table.first_free = 0; + tipc_ref_table.last_free = 0; + tipc_ref_table.index_mask = actual_size - 1; + tipc_ref_table.start_mask = start & ~tipc_ref_table.index_mask; + + return 0; +} + +/** + * tipc_ref_table_stop - destroy reference table for objects + */ + +void tipc_ref_table_stop(void) +{ + if (!tipc_ref_table.entries) + return; + + vfree(tipc_ref_table.entries); + tipc_ref_table.entries = NULL; +} + +/** + * tipc_ref_acquire - create reference to an object + * + * Register an object pointer in reference table and lock the object. 
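+ * A free entry is reused when one is available; otherwise the next
+ * uninitialized entry in the table is taken.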
+ * Returns a unique reference value that is used from then on to retrieve the + * object pointer, or to determine that the object has been deregistered. + * + * Note: The object is returned in the locked state so that the caller can + * register a partially initialized object, without running the risk that + * the object will be accessed before initialization is complete. + */ + +u32 tipc_ref_acquire(void *object, spinlock_t **lock) +{ + u32 index; + u32 index_mask; + u32 next_plus_upper; + u32 ref; + struct reference *entry = NULL; + + if (!object) { + err("Attempt to acquire reference to non-existent object\n"); + return 0; + } + if (!tipc_ref_table.entries) { + err("Reference table not found during acquisition attempt\n"); + return 0; + } + + /* take a free entry, if available; otherwise initialize a new entry */ + + write_lock_bh(&ref_table_lock); + if (tipc_ref_table.first_free) { + index = tipc_ref_table.first_free; + entry = &(tipc_ref_table.entries[index]); + index_mask = tipc_ref_table.index_mask; + next_plus_upper = entry->ref; + tipc_ref_table.first_free = next_plus_upper & index_mask; + ref = (next_plus_upper & ~index_mask) + index; + } else if (tipc_ref_table.init_point < tipc_ref_table.capacity) { + index = tipc_ref_table.init_point++; + entry = &(tipc_ref_table.entries[index]); + spin_lock_init(&entry->lock); + ref = tipc_ref_table.start_mask + index; + } else { + ref = 0; + } + write_unlock_bh(&ref_table_lock); + + /* + * Grab the lock so no one else can modify this entry + * While we assign its ref value & object pointer + */ + if (entry) { + spin_lock_bh(&entry->lock); + entry->ref = ref; + entry->object = object; + *lock = &entry->lock; + /* + * keep it locked, the caller is responsible + * for unlocking this when they're done with it + */ + } + + return ref; +} + +/** + * tipc_ref_discard - invalidate references to an object + * + * Disallow future references to an object and free up the entry for re-use. 
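+ * The instance portion of the entry's reference value is incremented so that
+ * stale references no longer match, and the entry is appended to the free list.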
+ * Note: The entry's spin_lock may still be busy after discard + */ + +void tipc_ref_discard(u32 ref) +{ + struct reference *entry; + u32 index; + u32 index_mask; + + if (!tipc_ref_table.entries) { + err("Reference table not found during discard attempt\n"); + return; + } + + index_mask = tipc_ref_table.index_mask; + index = ref & index_mask; + entry = &(tipc_ref_table.entries[index]); + + write_lock_bh(&ref_table_lock); + + if (!entry->object) { + err("Attempt to discard reference to non-existent object\n"); + goto exit; + } + if (entry->ref != ref) { + err("Attempt to discard non-existent reference\n"); + goto exit; + } + + /* + * mark entry as unused; increment instance part of entry's reference + * to invalidate any subsequent references + */ + + entry->object = NULL; + entry->ref = (ref & ~index_mask) + (index_mask + 1); + + /* append entry to free entry list */ + + if (tipc_ref_table.first_free == 0) + tipc_ref_table.first_free = index; + else + tipc_ref_table.entries[tipc_ref_table.last_free].ref |= index; + tipc_ref_table.last_free = index; + +exit: + write_unlock_bh(&ref_table_lock); +} + +/** + * tipc_ref_lock - lock referenced object and return pointer to it + */ + +void *tipc_ref_lock(u32 ref) +{ + if (likely(tipc_ref_table.entries)) { + struct reference *entry; + + entry = &tipc_ref_table.entries[ref & + tipc_ref_table.index_mask]; + if (likely(entry->ref != 0)) { + spin_lock_bh(&entry->lock); + if (likely((entry->ref == ref) && (entry->object))) + return entry->object; + spin_unlock_bh(&entry->lock); + } + } + return NULL; +} + + +/** + * tipc_ref_deref - return pointer referenced object (without locking it) + */ + +void *tipc_ref_deref(u32 ref) +{ + if (likely(tipc_ref_table.entries)) { + struct reference *entry; + + entry = &tipc_ref_table.entries[ref & + tipc_ref_table.index_mask]; + if (likely(entry->ref == ref)) + return entry->object; + } + return NULL; +} + diff --git a/net/tipc/ref.h b/net/tipc/ref.h new file mode 100644 index 00000000..5bc8e7ab --- /dev/null +++ b/net/tipc/ref.h @@ -0,0 +1,49 @@ +/* + * net/tipc/ref.h: Include file for TIPC object registry code + * + * Copyright (c) 1991-2006, Ericsson AB + * Copyright (c) 2005-2006, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_REF_H +#define _TIPC_REF_H + +int tipc_ref_table_init(u32 requested_size, u32 start); +void tipc_ref_table_stop(void); + +u32 tipc_ref_acquire(void *object, spinlock_t **lock); +void tipc_ref_discard(u32 ref); + +void *tipc_ref_lock(u32 ref); +void *tipc_ref_deref(u32 ref); + +#endif diff --git a/net/tipc/socket.c b/net/tipc/socket.c new file mode 100644 index 00000000..33883739 --- /dev/null +++ b/net/tipc/socket.c @@ -0,0 +1,1904 @@ +/* + * net/tipc/socket.c: TIPC socket API + * + * Copyright (c) 2001-2007, Ericsson AB + * Copyright (c) 2004-2008, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include + +#include +#include + +#include "core.h" +#include "port.h" + +#define SS_LISTENING -1 /* socket is listening */ +#define SS_READY -2 /* socket is connectionless */ + +#define OVERLOAD_LIMIT_BASE 5000 +#define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ + +struct tipc_sock { + struct sock sk; + struct tipc_port *p; + struct tipc_portid peer_name; + long conn_timeout; +}; + +#define tipc_sk(sk) ((struct tipc_sock *)(sk)) +#define tipc_sk_port(sk) ((struct tipc_port *)(tipc_sk(sk)->p)) + +#define tipc_rx_ready(sock) (!skb_queue_empty(&sock->sk->sk_receive_queue) || \ + (sock->state == SS_DISCONNECTING)) + +static int backlog_rcv(struct sock *sk, struct sk_buff *skb); +static u32 dispatch(struct tipc_port *tport, struct sk_buff *buf); +static void wakeupdispatch(struct tipc_port *tport); + +static const struct proto_ops packet_ops; +static const struct proto_ops stream_ops; +static const struct proto_ops msg_ops; + +static struct proto tipc_proto; + +static int sockets_enabled; + +static atomic_t tipc_queue_size = ATOMIC_INIT(0); + +/* + * Revised TIPC socket locking policy: + * + * Most socket operations take the standard socket lock when they start + * and hold it until they finish (or until they need to sleep). Acquiring + * this lock grants the owner exclusive access to the fields of the socket + * data structures, with the exception of the backlog queue. A few socket + * operations can be done without taking the socket lock because they only + * read socket information that never changes during the life of the socket. + * + * Socket operations may acquire the lock for the associated TIPC port if they + * need to perform an operation on the port. If any routine needs to acquire + * both the socket lock and the port lock it must take the socket lock first + * to avoid the risk of deadlock. + * + * The dispatcher handling incoming messages cannot grab the socket lock in + * the standard fashion, since invoked it runs at the BH level and cannot block. + * Instead, it checks to see if the socket lock is currently owned by someone, + * and either handles the message itself or adds it to the socket's backlog + * queue; in the latter case the queued message is processed once the process + * owning the socket lock releases it. + * + * NOTE: Releasing the socket lock while an operation is sleeping overcomes + * the problem of a blocked socket operation preventing any other operations + * from occurring. However, applications must be careful if they have + * multiple threads trying to send (or receive) on the same socket, as these + * operations might interfere with each other. For example, doing a connect + * and a receive at the same time might allow the receive to consume the + * ACK message meant for the connect. While additional work could be done + * to try and overcome this, it doesn't seem to be worthwhile at the present. + * + * NOTE: Releasing the socket lock while an operation is sleeping also ensures + * that another operation that must be performed in a non-blocking manner is + * not delayed for very long because the lock has already been taken. + * + * NOTE: This code assumes that certain fields of a port/socket pair are + * constant over its lifetime; such fields can be examined without taking + * the socket lock and/or port lock, and do not need to be re-read even + * after resuming processing after waiting. 
These fields include: + * - socket type + * - pointer to socket sk structure (aka tipc_sock structure) + * - pointer to port structure + * - port reference + */ + +/** + * advance_rx_queue - discard first buffer in socket receive queue + * + * Caller must hold socket lock + */ + +static void advance_rx_queue(struct sock *sk) +{ + buf_discard(__skb_dequeue(&sk->sk_receive_queue)); + atomic_dec(&tipc_queue_size); +} + +/** + * discard_rx_queue - discard all buffers in socket receive queue + * + * Caller must hold socket lock + */ + +static void discard_rx_queue(struct sock *sk) +{ + struct sk_buff *buf; + + while ((buf = __skb_dequeue(&sk->sk_receive_queue))) { + atomic_dec(&tipc_queue_size); + buf_discard(buf); + } +} + +/** + * reject_rx_queue - reject all buffers in socket receive queue + * + * Caller must hold socket lock + */ + +static void reject_rx_queue(struct sock *sk) +{ + struct sk_buff *buf; + + while ((buf = __skb_dequeue(&sk->sk_receive_queue))) { + tipc_reject_msg(buf, TIPC_ERR_NO_PORT); + atomic_dec(&tipc_queue_size); + } +} + +/** + * tipc_create - create a TIPC socket + * @net: network namespace (must be default network) + * @sock: pre-allocated socket structure + * @protocol: protocol indicator (must be 0) + * @kern: caused by kernel or by userspace? + * + * This routine creates additional data structures used by the TIPC socket, + * initializes them, and links them together. + * + * Returns 0 on success, errno otherwise + */ + +static int tipc_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + const struct proto_ops *ops; + socket_state state; + struct sock *sk; + struct tipc_port *tp_ptr; + + /* Validate arguments */ + + if (!net_eq(net, &init_net)) + return -EAFNOSUPPORT; + + if (unlikely(protocol != 0)) + return -EPROTONOSUPPORT; + + switch (sock->type) { + case SOCK_STREAM: + ops = &stream_ops; + state = SS_UNCONNECTED; + break; + case SOCK_SEQPACKET: + ops = &packet_ops; + state = SS_UNCONNECTED; + break; + case SOCK_DGRAM: + case SOCK_RDM: + ops = &msg_ops; + state = SS_READY; + break; + default: + return -EPROTOTYPE; + } + + /* Allocate socket's protocol area */ + + sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto); + if (sk == NULL) + return -ENOMEM; + + /* Allocate TIPC port for socket to use */ + + tp_ptr = tipc_createport_raw(sk, &dispatch, &wakeupdispatch, + TIPC_LOW_IMPORTANCE); + if (unlikely(!tp_ptr)) { + sk_free(sk); + return -ENOMEM; + } + + /* Finish initializing socket data structures */ + + sock->ops = ops; + sock->state = state; + + sock_init_data(sock, sk); + sk->sk_backlog_rcv = backlog_rcv; + tipc_sk(sk)->p = tp_ptr; + tipc_sk(sk)->conn_timeout = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT); + + spin_unlock_bh(tp_ptr->lock); + + if (sock->state == SS_READY) { + tipc_set_portunreturnable(tp_ptr->ref, 1); + if (sock->type == SOCK_DGRAM) + tipc_set_portunreliable(tp_ptr->ref, 1); + } + + return 0; +} + +/** + * release - destroy a TIPC socket + * @sock: socket to destroy + * + * This routine cleans up any messages that are still queued on the socket. + * For DGRAM and RDM socket types, all queued messages are rejected. + * For SEQPACKET and STREAM socket types, the first message is rejected + * and any others are discarded. (If the first message on a STREAM socket + * is partially-read, it is discarded and the next one is rejected instead.) + * + * NOTE: Rejected messages are not necessarily returned to the sender! 
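
To make the type handling in tipc_create() above concrete, here is a minimal userspace sketch (assuming the AF_TIPC constants exported by <sys/socket.h> and <linux/tipc.h>; illustrative only, not part of the patch) showing how the four supported socket types are opened and which proto_ops table and initial state each one receives.

#include <stdio.h>
#include <sys/socket.h>
#include <linux/tipc.h>

int main(void)
{
        /* Connectionless types start in SS_READY and use msg_ops */
        int rdm    = socket(AF_TIPC, SOCK_RDM, 0);
        int dgram  = socket(AF_TIPC, SOCK_DGRAM, 0);
        /* Connection-oriented types start in SS_UNCONNECTED */
        int seqpkt = socket(AF_TIPC, SOCK_SEQPACKET, 0); /* packet_ops */
        int stream = socket(AF_TIPC, SOCK_STREAM, 0);    /* stream_ops */

        if (rdm < 0 || dgram < 0 || seqpkt < 0 || stream < 0)
                perror("socket(AF_TIPC, ...)");
        return 0;
}
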
They + * are returned or discarded according to the "destination droppable" setting + * specified for the message by the sender. + * + * Returns 0 on success, errno otherwise + */ + +static int release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct tipc_port *tport; + struct sk_buff *buf; + int res; + + /* + * Exit if socket isn't fully initialized (occurs when a failed accept() + * releases a pre-allocated child socket that was never used) + */ + + if (sk == NULL) + return 0; + + tport = tipc_sk_port(sk); + lock_sock(sk); + + /* + * Reject all unreceived messages, except on an active connection + * (which disconnects locally & sends a 'FIN+' to peer) + */ + + while (sock->state != SS_DISCONNECTING) { + buf = __skb_dequeue(&sk->sk_receive_queue); + if (buf == NULL) + break; + atomic_dec(&tipc_queue_size); + if (TIPC_SKB_CB(buf)->handle != 0) + buf_discard(buf); + else { + if ((sock->state == SS_CONNECTING) || + (sock->state == SS_CONNECTED)) { + sock->state = SS_DISCONNECTING; + tipc_disconnect(tport->ref); + } + tipc_reject_msg(buf, TIPC_ERR_NO_PORT); + } + } + + /* + * Delete TIPC port; this ensures no more messages are queued + * (also disconnects an active connection & sends a 'FIN-' to peer) + */ + + res = tipc_deleteport(tport->ref); + + /* Discard any remaining (connection-based) messages in receive queue */ + + discard_rx_queue(sk); + + /* Reject any messages that accumulated in backlog queue */ + + sock->state = SS_DISCONNECTING; + release_sock(sk); + + sock_put(sk); + sock->sk = NULL; + + return res; +} + +/** + * bind - associate or disassocate TIPC name(s) with a socket + * @sock: socket structure + * @uaddr: socket address describing name(s) and desired operation + * @uaddr_len: size of socket address data structure + * + * Name and name sequence binding is indicated using a positive scope value; + * a negative scope value unbinds the specified name. Specifying no name + * (i.e. a socket address length of 0) unbinds all names from the socket. + * + * Returns 0 on success, errno otherwise + * + * NOTE: This routine doesn't need to take the socket lock since it doesn't + * access any non-constant socket information. + */ + +static int bind(struct socket *sock, struct sockaddr *uaddr, int uaddr_len) +{ + struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr; + u32 portref = tipc_sk_port(sock->sk)->ref; + + if (unlikely(!uaddr_len)) + return tipc_withdraw(portref, 0, NULL); + + if (uaddr_len < sizeof(struct sockaddr_tipc)) + return -EINVAL; + if (addr->family != AF_TIPC) + return -EAFNOSUPPORT; + + if (addr->addrtype == TIPC_ADDR_NAME) + addr->addr.nameseq.upper = addr->addr.nameseq.lower; + else if (addr->addrtype != TIPC_ADDR_NAMESEQ) + return -EAFNOSUPPORT; + + return (addr->scope > 0) ? + tipc_publish(portref, addr->scope, &addr->addr.nameseq) : + tipc_withdraw(portref, -addr->scope, &addr->addr.nameseq); +} + +/** + * get_name - get port ID of socket or peer socket + * @sock: socket structure + * @uaddr: area for returned socket address + * @uaddr_len: area for returned length of socket address + * @peer: 0 = own ID, 1 = current peer ID, 2 = current/former peer ID + * + * Returns 0 on success, errno otherwise + * + * NOTE: This routine doesn't need to take the socket lock since it only + * accesses socket information that is unchanging (or which changes in + * a completely predictable manner). 
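
A hedged userspace illustration of the bind() semantics described above, assuming the sockaddr_tipc layout from <linux/tipc.h>; the service type 18888 is an arbitrary example value.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/tipc.h>

int main(void)
{
        struct sockaddr_tipc addr;
        int sd = socket(AF_TIPC, SOCK_RDM, 0);

        memset(&addr, 0, sizeof(addr));
        addr.family = AF_TIPC;
        addr.addrtype = TIPC_ADDR_NAMESEQ;
        addr.scope = TIPC_CLUSTER_SCOPE;   /* positive scope: publish */
        addr.addr.nameseq.type = 18888;    /* arbitrary example type  */
        addr.addr.nameseq.lower = 0;
        addr.addr.nameseq.upper = 99;

        if (bind(sd, (struct sockaddr *)&addr, sizeof(addr)) != 0)
                perror("bind");

        /* Re-binding with addr.scope = -TIPC_CLUSTER_SCOPE withdraws the
         * same sequence; bind() with an address length of 0 unbinds all. */
        return 0;
}
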
+ */ + +static int get_name(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr; + struct tipc_sock *tsock = tipc_sk(sock->sk); + + memset(addr, 0, sizeof(*addr)); + if (peer) { + if ((sock->state != SS_CONNECTED) && + ((peer != 2) || (sock->state != SS_DISCONNECTING))) + return -ENOTCONN; + addr->addr.id.ref = tsock->peer_name.ref; + addr->addr.id.node = tsock->peer_name.node; + } else { + addr->addr.id.ref = tsock->p->ref; + addr->addr.id.node = tipc_own_addr; + } + + *uaddr_len = sizeof(*addr); + addr->addrtype = TIPC_ADDR_ID; + addr->family = AF_TIPC; + addr->scope = 0; + addr->addr.name.domain = 0; + + return 0; +} + +/** + * poll - read and possibly block on pollmask + * @file: file structure associated with the socket + * @sock: socket for which to calculate the poll bits + * @wait: ??? + * + * Returns pollmask value + * + * COMMENTARY: + * It appears that the usual socket locking mechanisms are not useful here + * since the pollmask info is potentially out-of-date the moment this routine + * exits. TCP and other protocols seem to rely on higher level poll routines + * to handle any preventable race conditions, so TIPC will do the same ... + * + * TIPC sets the returned events as follows: + * + * socket state flags set + * ------------ --------- + * unconnected no read flags + * no write flags + * + * connecting POLLIN/POLLRDNORM if ACK/NACK in rx queue + * no write flags + * + * connected POLLIN/POLLRDNORM if data in rx queue + * POLLOUT if port is not congested + * + * disconnecting POLLIN/POLLRDNORM/POLLHUP + * no write flags + * + * listening POLLIN if SYN in rx queue + * no write flags + * + * ready POLLIN/POLLRDNORM if data in rx queue + * [connectionless] POLLOUT (since port cannot be congested) + * + * IMPORTANT: The fact that a read or write operation is indicated does NOT + * imply that the operation will succeed, merely that it should be performed + * and will not block. + */ + +static unsigned int poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + u32 mask = 0; + + poll_wait(file, sk_sleep(sk), wait); + + switch ((int)sock->state) { + case SS_READY: + case SS_CONNECTED: + if (!tipc_sk_port(sk)->congested) + mask |= POLLOUT; + /* fall thru' */ + case SS_CONNECTING: + case SS_LISTENING: + if (!skb_queue_empty(&sk->sk_receive_queue)) + mask |= (POLLIN | POLLRDNORM); + break; + case SS_DISCONNECTING: + mask = (POLLIN | POLLRDNORM | POLLHUP); + break; + } + + return mask; +} + +/** + * dest_name_check - verify user is permitted to send to specified port name + * @dest: destination address + * @m: descriptor for message to be sent + * + * Prevents restricted configuration commands from being issued by + * unauthorized users. 
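
The poll() rules in the commentary above can be exercised from userspace as in the sketch below (illustrative only; it assumes a connectionless socket, i.e. the SS_READY row of the table).

#include <poll.h>
#include <stdio.h>
#include <sys/socket.h>
#include <linux/tipc.h>

int main(void)
{
        struct pollfd pfd;

        pfd.fd = socket(AF_TIPC, SOCK_RDM, 0);
        pfd.events = POLLIN | POLLOUT;

        /* Connectionless socket: POLLOUT while the port is uncongested,
         * POLLIN once a message is waiting in the receive queue. */
        if (poll(&pfd, 1, 1000) > 0)
                printf("revents:%s%s%s\n",
                       (pfd.revents & POLLIN)  ? " POLLIN"  : "",
                       (pfd.revents & POLLOUT) ? " POLLOUT" : "",
                       (pfd.revents & POLLHUP) ? " POLLHUP" : "");
        return 0;
}
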
+ * + * Returns 0 if permission is granted, otherwise errno + */ + +static int dest_name_check(struct sockaddr_tipc *dest, struct msghdr *m) +{ + struct tipc_cfg_msg_hdr hdr; + + if (likely(dest->addr.name.name.type >= TIPC_RESERVED_TYPES)) + return 0; + if (likely(dest->addr.name.name.type == TIPC_TOP_SRV)) + return 0; + if (likely(dest->addr.name.name.type != TIPC_CFG_SRV)) + return -EACCES; + + if (!m->msg_iovlen || (m->msg_iov[0].iov_len < sizeof(hdr))) + return -EMSGSIZE; + if (copy_from_user(&hdr, m->msg_iov[0].iov_base, sizeof(hdr))) + return -EFAULT; + if ((ntohs(hdr.tcm_type) & 0xC000) && (!capable(CAP_NET_ADMIN))) + return -EACCES; + + return 0; +} + +/** + * send_msg - send message in connectionless manner + * @iocb: if NULL, indicates that socket lock is already held + * @sock: socket structure + * @m: message to send + * @total_len: length of message + * + * Message must have an destination specified explicitly. + * Used for SOCK_RDM and SOCK_DGRAM messages, + * and for 'SYN' messages on SOCK_SEQPACKET and SOCK_STREAM connections. + * (Note: 'SYN+' is prohibited on SOCK_STREAM.) + * + * Returns the number of bytes sent on success, or errno otherwise + */ + +static int send_msg(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t total_len) +{ + struct sock *sk = sock->sk; + struct tipc_port *tport = tipc_sk_port(sk); + struct sockaddr_tipc *dest = (struct sockaddr_tipc *)m->msg_name; + int needs_conn; + int res = -EINVAL; + + if (unlikely(!dest)) + return -EDESTADDRREQ; + if (unlikely((m->msg_namelen < sizeof(*dest)) || + (dest->family != AF_TIPC))) + return -EINVAL; + if ((total_len > TIPC_MAX_USER_MSG_SIZE) || + (m->msg_iovlen > (unsigned)INT_MAX)) + return -EMSGSIZE; + + if (iocb) + lock_sock(sk); + + needs_conn = (sock->state != SS_READY); + if (unlikely(needs_conn)) { + if (sock->state == SS_LISTENING) { + res = -EPIPE; + goto exit; + } + if (sock->state != SS_UNCONNECTED) { + res = -EISCONN; + goto exit; + } + if ((tport->published) || + ((sock->type == SOCK_STREAM) && (total_len != 0))) { + res = -EOPNOTSUPP; + goto exit; + } + if (dest->addrtype == TIPC_ADDR_NAME) { + tport->conn_type = dest->addr.name.name.type; + tport->conn_instance = dest->addr.name.name.instance; + } + + /* Abort any pending connection attempts (very unlikely) */ + + reject_rx_queue(sk); + } + + do { + if (dest->addrtype == TIPC_ADDR_NAME) { + res = dest_name_check(dest, m); + if (res) + break; + res = tipc_send2name(tport->ref, + &dest->addr.name.name, + dest->addr.name.domain, + m->msg_iovlen, + m->msg_iov, + total_len); + } else if (dest->addrtype == TIPC_ADDR_ID) { + res = tipc_send2port(tport->ref, + &dest->addr.id, + m->msg_iovlen, + m->msg_iov, + total_len); + } else if (dest->addrtype == TIPC_ADDR_MCAST) { + if (needs_conn) { + res = -EOPNOTSUPP; + break; + } + res = dest_name_check(dest, m); + if (res) + break; + res = tipc_multicast(tport->ref, + &dest->addr.nameseq, + m->msg_iovlen, + m->msg_iov, + total_len); + } + if (likely(res != -ELINKCONG)) { + if (needs_conn && (res >= 0)) + sock->state = SS_CONNECTING; + break; + } + if (m->msg_flags & MSG_DONTWAIT) { + res = -EWOULDBLOCK; + break; + } + release_sock(sk); + res = wait_event_interruptible(*sk_sleep(sk), + !tport->congested); + lock_sock(sk); + if (res) + break; + } while (1); + +exit: + if (iocb) + release_sock(sk); + return res; +} + +/** + * send_packet - send a connection-oriented message + * @iocb: if NULL, indicates that socket lock is already held + * @sock: socket structure + * @m: message to send + * 
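
As a usage illustration of the connectionless send_msg() path above, the sketch below sends a datagram to a TIPC name from userspace (assuming <linux/tipc.h>); the name type and instance values are arbitrary examples.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/tipc.h>

int main(void)
{
        struct sockaddr_tipc dest;
        const char payload[] = "hello";
        int sd = socket(AF_TIPC, SOCK_RDM, 0);

        memset(&dest, 0, sizeof(dest));
        dest.family = AF_TIPC;
        dest.addrtype = TIPC_ADDR_NAME;     /* route by functional name       */
        dest.addr.name.name.type = 18888;   /* ordinary (non-reserved) type   */
        dest.addr.name.name.instance = 17;
        dest.addr.name.domain = 0;          /* 0: look up anywhere in cluster */

        if (sendto(sd, payload, sizeof(payload), 0,
                   (struct sockaddr *)&dest, sizeof(dest)) < 0)
                perror("sendto");
        return 0;
}
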
@total_len: length of message + * + * Used for SOCK_SEQPACKET messages and SOCK_STREAM data. + * + * Returns the number of bytes sent on success, or errno otherwise + */ + +static int send_packet(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t total_len) +{ + struct sock *sk = sock->sk; + struct tipc_port *tport = tipc_sk_port(sk); + struct sockaddr_tipc *dest = (struct sockaddr_tipc *)m->msg_name; + int res; + + /* Handle implied connection establishment */ + + if (unlikely(dest)) + return send_msg(iocb, sock, m, total_len); + + if ((total_len > TIPC_MAX_USER_MSG_SIZE) || + (m->msg_iovlen > (unsigned)INT_MAX)) + return -EMSGSIZE; + + if (iocb) + lock_sock(sk); + + do { + if (unlikely(sock->state != SS_CONNECTED)) { + if (sock->state == SS_DISCONNECTING) + res = -EPIPE; + else + res = -ENOTCONN; + break; + } + + res = tipc_send(tport->ref, m->msg_iovlen, m->msg_iov, + total_len); + if (likely(res != -ELINKCONG)) + break; + if (m->msg_flags & MSG_DONTWAIT) { + res = -EWOULDBLOCK; + break; + } + release_sock(sk); + res = wait_event_interruptible(*sk_sleep(sk), + (!tport->congested || !tport->connected)); + lock_sock(sk); + if (res) + break; + } while (1); + + if (iocb) + release_sock(sk); + return res; +} + +/** + * send_stream - send stream-oriented data + * @iocb: (unused) + * @sock: socket structure + * @m: data to send + * @total_len: total length of data to be sent + * + * Used for SOCK_STREAM data. + * + * Returns the number of bytes sent on success (or partial success), + * or errno if no data sent + */ + +static int send_stream(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t total_len) +{ + struct sock *sk = sock->sk; + struct tipc_port *tport = tipc_sk_port(sk); + struct msghdr my_msg; + struct iovec my_iov; + struct iovec *curr_iov; + int curr_iovlen; + char __user *curr_start; + u32 hdr_size; + int curr_left; + int bytes_to_send; + int bytes_sent; + int res; + + lock_sock(sk); + + /* Handle special cases where there is no connection */ + + if (unlikely(sock->state != SS_CONNECTED)) { + if (sock->state == SS_UNCONNECTED) { + res = send_packet(NULL, sock, m, total_len); + goto exit; + } else if (sock->state == SS_DISCONNECTING) { + res = -EPIPE; + goto exit; + } else { + res = -ENOTCONN; + goto exit; + } + } + + if (unlikely(m->msg_name)) { + res = -EISCONN; + goto exit; + } + + if ((total_len > (unsigned)INT_MAX) || + (m->msg_iovlen > (unsigned)INT_MAX)) { + res = -EMSGSIZE; + goto exit; + } + + /* + * Send each iovec entry using one or more messages + * + * Note: This algorithm is good for the most likely case + * (i.e. one large iovec entry), but could be improved to pass sets + * of small iovec entries into send_packet(). 
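
Because send_stream() may report partial success, a userspace writer typically loops until the whole buffer has been accepted. The helper below (hypothetical name send_all, assuming an already-connected SOCK_STREAM TIPC socket) sketches that pattern.

#include <errno.h>
#include <sys/socket.h>
#include <sys/types.h>

/* Loop until the whole buffer is accepted, since a stream send may
 * return a short count (partial success). */
static ssize_t send_all(int sd, const char *buf, size_t len)
{
        size_t done = 0;

        while (done < len) {
                ssize_t n = send(sd, buf + done, len - done, 0);

                if (n < 0) {
                        if (errno == EINTR)
                                continue;
                        return done ? (ssize_t)done : -1;
                }
                done += (size_t)n;
        }
        return (ssize_t)done;
}
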
+ */ + + curr_iov = m->msg_iov; + curr_iovlen = m->msg_iovlen; + my_msg.msg_iov = &my_iov; + my_msg.msg_iovlen = 1; + my_msg.msg_flags = m->msg_flags; + my_msg.msg_name = NULL; + bytes_sent = 0; + + hdr_size = msg_hdr_sz(&tport->phdr); + + while (curr_iovlen--) { + curr_start = curr_iov->iov_base; + curr_left = curr_iov->iov_len; + + while (curr_left) { + bytes_to_send = tport->max_pkt - hdr_size; + if (bytes_to_send > TIPC_MAX_USER_MSG_SIZE) + bytes_to_send = TIPC_MAX_USER_MSG_SIZE; + if (curr_left < bytes_to_send) + bytes_to_send = curr_left; + my_iov.iov_base = curr_start; + my_iov.iov_len = bytes_to_send; + res = send_packet(NULL, sock, &my_msg, bytes_to_send); + if (res < 0) { + if (bytes_sent) + res = bytes_sent; + goto exit; + } + curr_left -= bytes_to_send; + curr_start += bytes_to_send; + bytes_sent += bytes_to_send; + } + + curr_iov++; + } + res = bytes_sent; +exit: + release_sock(sk); + return res; +} + +/** + * auto_connect - complete connection setup to a remote port + * @sock: socket structure + * @msg: peer's response message + * + * Returns 0 on success, errno otherwise + */ + +static int auto_connect(struct socket *sock, struct tipc_msg *msg) +{ + struct tipc_sock *tsock = tipc_sk(sock->sk); + + if (msg_errcode(msg)) { + sock->state = SS_DISCONNECTING; + return -ECONNREFUSED; + } + + tsock->peer_name.ref = msg_origport(msg); + tsock->peer_name.node = msg_orignode(msg); + tipc_connect2port(tsock->p->ref, &tsock->peer_name); + tipc_set_portimportance(tsock->p->ref, msg_importance(msg)); + sock->state = SS_CONNECTED; + return 0; +} + +/** + * set_orig_addr - capture sender's address for received message + * @m: descriptor for message info + * @msg: received message header + * + * Note: Address is not captured if not requested by receiver. + */ + +static void set_orig_addr(struct msghdr *m, struct tipc_msg *msg) +{ + struct sockaddr_tipc *addr = (struct sockaddr_tipc *)m->msg_name; + + if (addr) { + addr->family = AF_TIPC; + addr->addrtype = TIPC_ADDR_ID; + addr->addr.id.ref = msg_origport(msg); + addr->addr.id.node = msg_orignode(msg); + addr->addr.name.domain = 0; /* could leave uninitialized */ + addr->scope = 0; /* could leave uninitialized */ + m->msg_namelen = sizeof(struct sockaddr_tipc); + } +} + +/** + * anc_data_recv - optionally capture ancillary data for received message + * @m: descriptor for message info + * @msg: received message header + * @tport: TIPC port associated with message + * + * Note: Ancillary data is not captured if not requested by receiver. + * + * Returns 0 if successful, otherwise errno + */ + +static int anc_data_recv(struct msghdr *m, struct tipc_msg *msg, + struct tipc_port *tport) +{ + u32 anc_data[3]; + u32 err; + u32 dest_type; + int has_name; + int res; + + if (likely(m->msg_controllen == 0)) + return 0; + + /* Optionally capture errored message object(s) */ + + err = msg ? msg_errcode(msg) : 0; + if (unlikely(err)) { + anc_data[0] = err; + anc_data[1] = msg_data_sz(msg); + res = put_cmsg(m, SOL_TIPC, TIPC_ERRINFO, 8, anc_data); + if (res) + return res; + if (anc_data[1]) { + res = put_cmsg(m, SOL_TIPC, TIPC_RETDATA, anc_data[1], + msg_data(msg)); + if (res) + return res; + } + } + + /* Optionally capture message destination object */ + + dest_type = msg ? 
msg_type(msg) : TIPC_DIRECT_MSG; + switch (dest_type) { + case TIPC_NAMED_MSG: + has_name = 1; + anc_data[0] = msg_nametype(msg); + anc_data[1] = msg_namelower(msg); + anc_data[2] = msg_namelower(msg); + break; + case TIPC_MCAST_MSG: + has_name = 1; + anc_data[0] = msg_nametype(msg); + anc_data[1] = msg_namelower(msg); + anc_data[2] = msg_nameupper(msg); + break; + case TIPC_CONN_MSG: + has_name = (tport->conn_type != 0); + anc_data[0] = tport->conn_type; + anc_data[1] = tport->conn_instance; + anc_data[2] = tport->conn_instance; + break; + default: + has_name = 0; + } + if (has_name) { + res = put_cmsg(m, SOL_TIPC, TIPC_DESTNAME, 12, anc_data); + if (res) + return res; + } + + return 0; +} + +/** + * recv_msg - receive packet-oriented message + * @iocb: (unused) + * @m: descriptor for message info + * @buf_len: total size of user buffer area + * @flags: receive flags + * + * Used for SOCK_DGRAM, SOCK_RDM, and SOCK_SEQPACKET messages. + * If the complete message doesn't fit in user area, truncate it. + * + * Returns size of returned message data, errno otherwise + */ + +static int recv_msg(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t buf_len, int flags) +{ + struct sock *sk = sock->sk; + struct tipc_port *tport = tipc_sk_port(sk); + struct sk_buff *buf; + struct tipc_msg *msg; + long timeout; + unsigned int sz; + u32 err; + int res; + + /* Catch invalid receive requests */ + + if (unlikely(!buf_len)) + return -EINVAL; + + lock_sock(sk); + + if (unlikely(sock->state == SS_UNCONNECTED)) { + res = -ENOTCONN; + goto exit; + } + + timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); +restart: + + /* Look for a message in receive queue; wait if necessary */ + + while (skb_queue_empty(&sk->sk_receive_queue)) { + if (sock->state == SS_DISCONNECTING) { + res = -ENOTCONN; + goto exit; + } + if (timeout <= 0L) { + res = timeout ? 
timeout : -EWOULDBLOCK; + goto exit; + } + release_sock(sk); + timeout = wait_event_interruptible_timeout(*sk_sleep(sk), + tipc_rx_ready(sock), + timeout); + lock_sock(sk); + } + + /* Look at first message in receive queue */ + + buf = skb_peek(&sk->sk_receive_queue); + msg = buf_msg(buf); + sz = msg_data_sz(msg); + err = msg_errcode(msg); + + /* Complete connection setup for an implied connect */ + + if (unlikely(sock->state == SS_CONNECTING)) { + res = auto_connect(sock, msg); + if (res) + goto exit; + } + + /* Discard an empty non-errored message & try again */ + + if ((!sz) && (!err)) { + advance_rx_queue(sk); + goto restart; + } + + /* Capture sender's address (optional) */ + + set_orig_addr(m, msg); + + /* Capture ancillary data (optional) */ + + res = anc_data_recv(m, msg, tport); + if (res) + goto exit; + + /* Capture message data (if valid) & compute return value (always) */ + + if (!err) { + if (unlikely(buf_len < sz)) { + sz = buf_len; + m->msg_flags |= MSG_TRUNC; + } + res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg), + m->msg_iov, sz); + if (res) + goto exit; + res = sz; + } else { + if ((sock->state == SS_READY) || + ((err == TIPC_CONN_SHUTDOWN) || m->msg_control)) + res = 0; + else + res = -ECONNRESET; + } + + /* Consume received message (optional) */ + + if (likely(!(flags & MSG_PEEK))) { + if ((sock->state != SS_READY) && + (++tport->conn_unacked >= TIPC_FLOW_CONTROL_WIN)) + tipc_acknowledge(tport->ref, tport->conn_unacked); + advance_rx_queue(sk); + } +exit: + release_sock(sk); + return res; +} + +/** + * recv_stream - receive stream-oriented data + * @iocb: (unused) + * @m: descriptor for message info + * @buf_len: total size of user buffer area + * @flags: receive flags + * + * Used for SOCK_STREAM messages only. If not enough data is available + * will optionally wait for more; never truncates data. + * + * Returns size of returned message data, errno otherwise + */ + +static int recv_stream(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t buf_len, int flags) +{ + struct sock *sk = sock->sk; + struct tipc_port *tport = tipc_sk_port(sk); + struct sk_buff *buf; + struct tipc_msg *msg; + long timeout; + unsigned int sz; + int sz_to_copy, target, needed; + int sz_copied = 0; + u32 err; + int res = 0; + + /* Catch invalid receive attempts */ + + if (unlikely(!buf_len)) + return -EINVAL; + + lock_sock(sk); + + if (unlikely((sock->state == SS_UNCONNECTED) || + (sock->state == SS_CONNECTING))) { + res = -ENOTCONN; + goto exit; + } + + target = sock_rcvlowat(sk, flags & MSG_WAITALL, buf_len); + timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); +restart: + + /* Look for a message in receive queue; wait if necessary */ + + while (skb_queue_empty(&sk->sk_receive_queue)) { + if (sock->state == SS_DISCONNECTING) { + res = -ENOTCONN; + goto exit; + } + if (timeout <= 0L) { + res = timeout ? 
timeout : -EWOULDBLOCK; + goto exit; + } + release_sock(sk); + timeout = wait_event_interruptible_timeout(*sk_sleep(sk), + tipc_rx_ready(sock), + timeout); + lock_sock(sk); + } + + /* Look at first message in receive queue */ + + buf = skb_peek(&sk->sk_receive_queue); + msg = buf_msg(buf); + sz = msg_data_sz(msg); + err = msg_errcode(msg); + + /* Discard an empty non-errored message & try again */ + + if ((!sz) && (!err)) { + advance_rx_queue(sk); + goto restart; + } + + /* Optionally capture sender's address & ancillary data of first msg */ + + if (sz_copied == 0) { + set_orig_addr(m, msg); + res = anc_data_recv(m, msg, tport); + if (res) + goto exit; + } + + /* Capture message data (if valid) & compute return value (always) */ + + if (!err) { + u32 offset = (u32)(unsigned long)(TIPC_SKB_CB(buf)->handle); + + sz -= offset; + needed = (buf_len - sz_copied); + sz_to_copy = (sz <= needed) ? sz : needed; + + res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg) + offset, + m->msg_iov, sz_to_copy); + if (res) + goto exit; + + sz_copied += sz_to_copy; + + if (sz_to_copy < sz) { + if (!(flags & MSG_PEEK)) + TIPC_SKB_CB(buf)->handle = + (void *)(unsigned long)(offset + sz_to_copy); + goto exit; + } + } else { + if (sz_copied != 0) + goto exit; /* can't add error msg to valid data */ + + if ((err == TIPC_CONN_SHUTDOWN) || m->msg_control) + res = 0; + else + res = -ECONNRESET; + } + + /* Consume received message (optional) */ + + if (likely(!(flags & MSG_PEEK))) { + if (unlikely(++tport->conn_unacked >= TIPC_FLOW_CONTROL_WIN)) + tipc_acknowledge(tport->ref, tport->conn_unacked); + advance_rx_queue(sk); + } + + /* Loop around if more data is required */ + + if ((sz_copied < buf_len) && /* didn't get all requested data */ + (!skb_queue_empty(&sk->sk_receive_queue) || + (sz_copied < target)) && /* and more is ready or required */ + (!(flags & MSG_PEEK)) && /* and aren't just peeking at data */ + (!err)) /* and haven't reached a FIN */ + goto restart; + +exit: + release_sock(sk); + return sz_copied ? sz_copied : res; +} + +/** + * rx_queue_full - determine if receive queue can accept another message + * @msg: message to be added to queue + * @queue_size: current size of queue + * @base: nominal maximum size of queue + * + * Returns 1 if queue is unable to accept message, 0 otherwise + */ + +static int rx_queue_full(struct tipc_msg *msg, u32 queue_size, u32 base) +{ + u32 threshold; + u32 imp = msg_importance(msg); + + if (imp == TIPC_LOW_IMPORTANCE) + threshold = base; + else if (imp == TIPC_MEDIUM_IMPORTANCE) + threshold = base * 2; + else if (imp == TIPC_HIGH_IMPORTANCE) + threshold = base * 100; + else + return 0; + + if (msg_connected(msg)) + threshold *= 4; + + return queue_size >= threshold; +} + +/** + * filter_rcv - validate incoming message + * @sk: socket + * @buf: message + * + * Enqueues message on receive queue if acceptable; optionally handles + * disconnect indication for a connected socket. + * + * Called with socket lock already taken; port lock may also be taken. + * + * Returns TIPC error status code (TIPC_OK if message is not to be rejected) + */ + +static u32 filter_rcv(struct sock *sk, struct sk_buff *buf) +{ + struct socket *sock = sk->sk_socket; + struct tipc_msg *msg = buf_msg(buf); + u32 recv_q_len; + + /* Reject message if it is wrong sort of message for socket */ + + /* + * WOULD IT BE BETTER TO JUST DISCARD THESE MESSAGES INSTEAD? 
+ * "NO PORT" ISN'T REALLY THE RIGHT ERROR CODE, AND THERE MAY + * BE SECURITY IMPLICATIONS INHERENT IN REJECTING INVALID TRAFFIC + */ + + if (sock->state == SS_READY) { + if (msg_connected(msg)) + return TIPC_ERR_NO_PORT; + } else { + if (msg_mcast(msg)) + return TIPC_ERR_NO_PORT; + if (sock->state == SS_CONNECTED) { + if (!msg_connected(msg)) + return TIPC_ERR_NO_PORT; + } else if (sock->state == SS_CONNECTING) { + if (!msg_connected(msg) && (msg_errcode(msg) == 0)) + return TIPC_ERR_NO_PORT; + } else if (sock->state == SS_LISTENING) { + if (msg_connected(msg) || msg_errcode(msg)) + return TIPC_ERR_NO_PORT; + } else if (sock->state == SS_DISCONNECTING) { + return TIPC_ERR_NO_PORT; + } else /* (sock->state == SS_UNCONNECTED) */ { + if (msg_connected(msg) || msg_errcode(msg)) + return TIPC_ERR_NO_PORT; + } + } + + /* Reject message if there isn't room to queue it */ + + recv_q_len = (u32)atomic_read(&tipc_queue_size); + if (unlikely(recv_q_len >= OVERLOAD_LIMIT_BASE)) { + if (rx_queue_full(msg, recv_q_len, OVERLOAD_LIMIT_BASE)) + return TIPC_ERR_OVERLOAD; + } + recv_q_len = skb_queue_len(&sk->sk_receive_queue); + if (unlikely(recv_q_len >= (OVERLOAD_LIMIT_BASE / 2))) { + if (rx_queue_full(msg, recv_q_len, OVERLOAD_LIMIT_BASE / 2)) + return TIPC_ERR_OVERLOAD; + } + + /* Enqueue message (finally!) */ + + TIPC_SKB_CB(buf)->handle = 0; + atomic_inc(&tipc_queue_size); + __skb_queue_tail(&sk->sk_receive_queue, buf); + + /* Initiate connection termination for an incoming 'FIN' */ + + if (unlikely(msg_errcode(msg) && (sock->state == SS_CONNECTED))) { + sock->state = SS_DISCONNECTING; + tipc_disconnect_port(tipc_sk_port(sk)); + } + + if (waitqueue_active(sk_sleep(sk))) + wake_up_interruptible(sk_sleep(sk)); + return TIPC_OK; +} + +/** + * backlog_rcv - handle incoming message from backlog queue + * @sk: socket + * @buf: message + * + * Caller must hold socket lock, but not port lock. + * + * Returns 0 + */ + +static int backlog_rcv(struct sock *sk, struct sk_buff *buf) +{ + u32 res; + + res = filter_rcv(sk, buf); + if (res) + tipc_reject_msg(buf, res); + return 0; +} + +/** + * dispatch - handle incoming message + * @tport: TIPC port that received message + * @buf: message + * + * Called with port lock already taken. + * + * Returns TIPC error status code (TIPC_OK if message is not to be rejected) + */ + +static u32 dispatch(struct tipc_port *tport, struct sk_buff *buf) +{ + struct sock *sk = (struct sock *)tport->usr_handle; + u32 res; + + /* + * Process message if socket is unlocked; otherwise add to backlog queue + * + * This code is based on sk_receive_skb(), but must be distinct from it + * since a TIPC-specific filter/reject mechanism is utilized + */ + + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { + res = filter_rcv(sk, buf); + } else { + if (sk_add_backlog(sk, buf)) + res = TIPC_ERR_OVERLOAD; + else + res = TIPC_OK; + } + bh_unlock_sock(sk); + + return res; +} + +/** + * wakeupdispatch - wake up port after congestion + * @tport: port to wakeup + * + * Called with port lock already taken. 
+ */ + +static void wakeupdispatch(struct tipc_port *tport) +{ + struct sock *sk = (struct sock *)tport->usr_handle; + + if (waitqueue_active(sk_sleep(sk))) + wake_up_interruptible(sk_sleep(sk)); +} + +/** + * connect - establish a connection to another TIPC port + * @sock: socket structure + * @dest: socket address for destination port + * @destlen: size of socket address data structure + * @flags: file-related flags associated with socket + * + * Returns 0 on success, errno otherwise + */ + +static int connect(struct socket *sock, struct sockaddr *dest, int destlen, + int flags) +{ + struct sock *sk = sock->sk; + struct sockaddr_tipc *dst = (struct sockaddr_tipc *)dest; + struct msghdr m = {NULL,}; + struct sk_buff *buf; + struct tipc_msg *msg; + long timeout; + int res; + + lock_sock(sk); + + /* For now, TIPC does not allow use of connect() with DGRAM/RDM types */ + + if (sock->state == SS_READY) { + res = -EOPNOTSUPP; + goto exit; + } + + /* For now, TIPC does not support the non-blocking form of connect() */ + + if (flags & O_NONBLOCK) { + res = -EOPNOTSUPP; + goto exit; + } + + /* Issue Posix-compliant error code if socket is in the wrong state */ + + if (sock->state == SS_LISTENING) { + res = -EOPNOTSUPP; + goto exit; + } + if (sock->state == SS_CONNECTING) { + res = -EALREADY; + goto exit; + } + if (sock->state != SS_UNCONNECTED) { + res = -EISCONN; + goto exit; + } + + /* + * Reject connection attempt using multicast address + * + * Note: send_msg() validates the rest of the address fields, + * so there's no need to do it here + */ + + if (dst->addrtype == TIPC_ADDR_MCAST) { + res = -EINVAL; + goto exit; + } + + /* Reject any messages already in receive queue (very unlikely) */ + + reject_rx_queue(sk); + + /* Send a 'SYN-' to destination */ + + m.msg_name = dest; + m.msg_namelen = destlen; + res = send_msg(NULL, sock, &m, 0); + if (res < 0) + goto exit; + + /* Wait until an 'ACK' or 'RST' arrives, or a timeout occurs */ + + timeout = tipc_sk(sk)->conn_timeout; + release_sock(sk); + res = wait_event_interruptible_timeout(*sk_sleep(sk), + (!skb_queue_empty(&sk->sk_receive_queue) || + (sock->state != SS_CONNECTING)), + timeout ? 
timeout : MAX_SCHEDULE_TIMEOUT); + lock_sock(sk); + + if (res > 0) { + buf = skb_peek(&sk->sk_receive_queue); + if (buf != NULL) { + msg = buf_msg(buf); + res = auto_connect(sock, msg); + if (!res) { + if (!msg_data_sz(msg)) + advance_rx_queue(sk); + } + } else { + if (sock->state == SS_CONNECTED) + res = -EISCONN; + else + res = -ECONNREFUSED; + } + } else { + if (res == 0) + res = -ETIMEDOUT; + else + ; /* leave "res" unchanged */ + sock->state = SS_DISCONNECTING; + } + +exit: + release_sock(sk); + return res; +} + +/** + * listen - allow socket to listen for incoming connections + * @sock: socket structure + * @len: (unused) + * + * Returns 0 on success, errno otherwise + */ + +static int listen(struct socket *sock, int len) +{ + struct sock *sk = sock->sk; + int res; + + lock_sock(sk); + + if (sock->state == SS_READY) + res = -EOPNOTSUPP; + else if (sock->state != SS_UNCONNECTED) + res = -EINVAL; + else { + sock->state = SS_LISTENING; + res = 0; + } + + release_sock(sk); + return res; +} + +/** + * accept - wait for connection request + * @sock: listening socket + * @newsock: new socket that is to be connected + * @flags: file-related flags associated with socket + * + * Returns 0 on success, errno otherwise + */ + +static int accept(struct socket *sock, struct socket *new_sock, int flags) +{ + struct sock *sk = sock->sk; + struct sk_buff *buf; + int res; + + lock_sock(sk); + + if (sock->state == SS_READY) { + res = -EOPNOTSUPP; + goto exit; + } + if (sock->state != SS_LISTENING) { + res = -EINVAL; + goto exit; + } + + while (skb_queue_empty(&sk->sk_receive_queue)) { + if (flags & O_NONBLOCK) { + res = -EWOULDBLOCK; + goto exit; + } + release_sock(sk); + res = wait_event_interruptible(*sk_sleep(sk), + (!skb_queue_empty(&sk->sk_receive_queue))); + lock_sock(sk); + if (res) + goto exit; + } + + buf = skb_peek(&sk->sk_receive_queue); + + res = tipc_create(sock_net(sock->sk), new_sock, 0, 0); + if (!res) { + struct sock *new_sk = new_sock->sk; + struct tipc_sock *new_tsock = tipc_sk(new_sk); + struct tipc_port *new_tport = new_tsock->p; + u32 new_ref = new_tport->ref; + struct tipc_msg *msg = buf_msg(buf); + + lock_sock(new_sk); + + /* + * Reject any stray messages received by new socket + * before the socket lock was taken (very, very unlikely) + */ + + reject_rx_queue(new_sk); + + /* Connect new socket to it's peer */ + + new_tsock->peer_name.ref = msg_origport(msg); + new_tsock->peer_name.node = msg_orignode(msg); + tipc_connect2port(new_ref, &new_tsock->peer_name); + new_sock->state = SS_CONNECTED; + + tipc_set_portimportance(new_ref, msg_importance(msg)); + if (msg_named(msg)) { + new_tport->conn_type = msg_nametype(msg); + new_tport->conn_instance = msg_nameinst(msg); + } + + /* + * Respond to 'SYN-' by discarding it & returning 'ACK'-. + * Respond to 'SYN+' by queuing it on new socket. + */ + + if (!msg_data_sz(msg)) { + struct msghdr m = {NULL,}; + + advance_rx_queue(sk); + send_packet(NULL, new_sock, &m, 0); + } else { + __skb_dequeue(&sk->sk_receive_queue); + __skb_queue_head(&new_sk->sk_receive_queue, buf); + } + release_sock(new_sk); + } +exit: + release_sock(sk); + return res; +} + +/** + * shutdown - shutdown socket connection + * @sock: socket structure + * @how: direction to close (must be SHUT_RDWR) + * + * Terminates connection (if necessary), then purges socket's receive queue. 
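
A minimal userspace server sketch tying together the bind(), listen() and accept() paths implemented above (illustrative only; name values are arbitrary, and the connecting client simply calls connect() with a TIPC_ADDR_NAME address, which the kernel turns into the 'SYN-' sent by send_msg()).

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>
#include <linux/tipc.h>

int main(void)
{
        struct sockaddr_tipc addr;
        char buf[256];
        int peer, sd = socket(AF_TIPC, SOCK_SEQPACKET, 0);

        memset(&addr, 0, sizeof(addr));
        addr.family = AF_TIPC;
        addr.addrtype = TIPC_ADDR_NAMESEQ;
        addr.scope = TIPC_NODE_SCOPE;
        addr.addr.nameseq.type = 18888;          /* arbitrary example */
        addr.addr.nameseq.lower = 42;
        addr.addr.nameseq.upper = 42;

        bind(sd, (struct sockaddr *)&addr, sizeof(addr));
        listen(sd, 0);                 /* backlog length is not used   */
        peer = accept(sd, NULL, NULL); /* blocks until a 'SYN' arrives */
        if (peer >= 0 && recv(peer, buf, sizeof(buf), 0) > 0)
                printf("received a message on the new connection\n");
        close(peer);
        close(sd);
        return 0;
}
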
+ * + * Returns 0 on success, errno otherwise + */ + +static int shutdown(struct socket *sock, int how) +{ + struct sock *sk = sock->sk; + struct tipc_port *tport = tipc_sk_port(sk); + struct sk_buff *buf; + int res; + + if (how != SHUT_RDWR) + return -EINVAL; + + lock_sock(sk); + + switch (sock->state) { + case SS_CONNECTING: + case SS_CONNECTED: + + /* Disconnect and send a 'FIN+' or 'FIN-' message to peer */ +restart: + buf = __skb_dequeue(&sk->sk_receive_queue); + if (buf) { + atomic_dec(&tipc_queue_size); + if (TIPC_SKB_CB(buf)->handle != 0) { + buf_discard(buf); + goto restart; + } + tipc_disconnect(tport->ref); + tipc_reject_msg(buf, TIPC_CONN_SHUTDOWN); + } else { + tipc_shutdown(tport->ref); + } + + sock->state = SS_DISCONNECTING; + + /* fall through */ + + case SS_DISCONNECTING: + + /* Discard any unreceived messages; wake up sleeping tasks */ + + discard_rx_queue(sk); + if (waitqueue_active(sk_sleep(sk))) + wake_up_interruptible(sk_sleep(sk)); + res = 0; + break; + + default: + res = -ENOTCONN; + } + + release_sock(sk); + return res; +} + +/** + * setsockopt - set socket option + * @sock: socket structure + * @lvl: option level + * @opt: option identifier + * @ov: pointer to new option value + * @ol: length of option value + * + * For stream sockets only, accepts and ignores all IPPROTO_TCP options + * (to ease compatibility). + * + * Returns 0 on success, errno otherwise + */ + +static int setsockopt(struct socket *sock, + int lvl, int opt, char __user *ov, unsigned int ol) +{ + struct sock *sk = sock->sk; + struct tipc_port *tport = tipc_sk_port(sk); + u32 value; + int res; + + if ((lvl == IPPROTO_TCP) && (sock->type == SOCK_STREAM)) + return 0; + if (lvl != SOL_TIPC) + return -ENOPROTOOPT; + if (ol < sizeof(value)) + return -EINVAL; + res = get_user(value, (u32 __user *)ov); + if (res) + return res; + + lock_sock(sk); + + switch (opt) { + case TIPC_IMPORTANCE: + res = tipc_set_portimportance(tport->ref, value); + break; + case TIPC_SRC_DROPPABLE: + if (sock->type != SOCK_STREAM) + res = tipc_set_portunreliable(tport->ref, value); + else + res = -ENOPROTOOPT; + break; + case TIPC_DEST_DROPPABLE: + res = tipc_set_portunreturnable(tport->ref, value); + break; + case TIPC_CONN_TIMEOUT: + tipc_sk(sk)->conn_timeout = msecs_to_jiffies(value); + /* no need to set "res", since already 0 at this point */ + break; + default: + res = -EINVAL; + } + + release_sock(sk); + + return res; +} + +/** + * getsockopt - get socket option + * @sock: socket structure + * @lvl: option level + * @opt: option identifier + * @ov: receptacle for option value + * @ol: receptacle for length of option value + * + * For stream sockets only, returns 0 length result for all IPPROTO_TCP options + * (to ease compatibility). 
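
The option handling above can be exercised as follows (a sketch assuming the SOL_TIPC option names from <linux/tipc.h>; all values are 32-bit, and the connect timeout is given in milliseconds).

#include <stdio.h>
#include <sys/socket.h>
#include <linux/tipc.h>

int main(void)
{
        int sd = socket(AF_TIPC, SOCK_STREAM, 0);
        unsigned int imp = TIPC_HIGH_IMPORTANCE;
        unsigned int tmo = 5000;            /* connect timeout, in ms  */
        unsigned int depth;
        socklen_t len = sizeof(depth);

        setsockopt(sd, SOL_TIPC, TIPC_IMPORTANCE, &imp, sizeof(imp));
        setsockopt(sd, SOL_TIPC, TIPC_CONN_TIMEOUT, &tmo, sizeof(tmo));

        if (getsockopt(sd, SOL_TIPC, TIPC_SOCK_RECVQ_DEPTH, &depth, &len) == 0)
                printf("messages queued on this socket: %u\n", depth);
        return 0;
}
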
+ * + * Returns 0 on success, errno otherwise + */ + +static int getsockopt(struct socket *sock, + int lvl, int opt, char __user *ov, int __user *ol) +{ + struct sock *sk = sock->sk; + struct tipc_port *tport = tipc_sk_port(sk); + int len; + u32 value; + int res; + + if ((lvl == IPPROTO_TCP) && (sock->type == SOCK_STREAM)) + return put_user(0, ol); + if (lvl != SOL_TIPC) + return -ENOPROTOOPT; + res = get_user(len, ol); + if (res) + return res; + + lock_sock(sk); + + switch (opt) { + case TIPC_IMPORTANCE: + res = tipc_portimportance(tport->ref, &value); + break; + case TIPC_SRC_DROPPABLE: + res = tipc_portunreliable(tport->ref, &value); + break; + case TIPC_DEST_DROPPABLE: + res = tipc_portunreturnable(tport->ref, &value); + break; + case TIPC_CONN_TIMEOUT: + value = jiffies_to_msecs(tipc_sk(sk)->conn_timeout); + /* no need to set "res", since already 0 at this point */ + break; + case TIPC_NODE_RECVQ_DEPTH: + value = (u32)atomic_read(&tipc_queue_size); + break; + case TIPC_SOCK_RECVQ_DEPTH: + value = skb_queue_len(&sk->sk_receive_queue); + break; + default: + res = -EINVAL; + } + + release_sock(sk); + + if (res) + return res; /* "get" failed */ + + if (len < sizeof(value)) + return -EINVAL; + + if (copy_to_user(ov, &value, sizeof(value))) + return -EFAULT; + + return put_user(sizeof(value), ol); +} + +/** + * Protocol switches for the various types of TIPC sockets + */ + +static const struct proto_ops msg_ops = { + .owner = THIS_MODULE, + .family = AF_TIPC, + .release = release, + .bind = bind, + .connect = connect, + .socketpair = sock_no_socketpair, + .accept = accept, + .getname = get_name, + .poll = poll, + .ioctl = sock_no_ioctl, + .listen = listen, + .shutdown = shutdown, + .setsockopt = setsockopt, + .getsockopt = getsockopt, + .sendmsg = send_msg, + .recvmsg = recv_msg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage +}; + +static const struct proto_ops packet_ops = { + .owner = THIS_MODULE, + .family = AF_TIPC, + .release = release, + .bind = bind, + .connect = connect, + .socketpair = sock_no_socketpair, + .accept = accept, + .getname = get_name, + .poll = poll, + .ioctl = sock_no_ioctl, + .listen = listen, + .shutdown = shutdown, + .setsockopt = setsockopt, + .getsockopt = getsockopt, + .sendmsg = send_packet, + .recvmsg = recv_msg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage +}; + +static const struct proto_ops stream_ops = { + .owner = THIS_MODULE, + .family = AF_TIPC, + .release = release, + .bind = bind, + .connect = connect, + .socketpair = sock_no_socketpair, + .accept = accept, + .getname = get_name, + .poll = poll, + .ioctl = sock_no_ioctl, + .listen = listen, + .shutdown = shutdown, + .setsockopt = setsockopt, + .getsockopt = getsockopt, + .sendmsg = send_stream, + .recvmsg = recv_stream, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage +}; + +static const struct net_proto_family tipc_family_ops = { + .owner = THIS_MODULE, + .family = AF_TIPC, + .create = tipc_create +}; + +static struct proto tipc_proto = { + .name = "TIPC", + .owner = THIS_MODULE, + .obj_size = sizeof(struct tipc_sock) +}; + +/** + * tipc_socket_init - initialize TIPC socket interface + * + * Returns 0 on success, errno otherwise + */ +int tipc_socket_init(void) +{ + int res; + + res = proto_register(&tipc_proto, 1); + if (res) { + err("Failed to register TIPC protocol type\n"); + goto out; + } + + res = sock_register(&tipc_family_ops); + if (res) { + err("Failed to register TIPC socket type\n"); + proto_unregister(&tipc_proto); + goto out; + } + + sockets_enabled = 1; + out: 
+ return res; +} + +/** + * tipc_socket_stop - stop TIPC socket interface + */ + +void tipc_socket_stop(void) +{ + if (!sockets_enabled) + return; + + sockets_enabled = 0; + sock_unregister(tipc_family_ops.family); + proto_unregister(&tipc_proto); +} + diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c new file mode 100644 index 00000000..6cf72686 --- /dev/null +++ b/net/tipc/subscr.c @@ -0,0 +1,589 @@ +/* + * net/tipc/subscr.c: TIPC network topology service + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2005-2007, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "name_table.h" +#include "port.h" +#include "subscr.h" + +/** + * struct subscriber - TIPC network topology subscriber + * @port_ref: object reference to server port connecting to subscriber + * @lock: pointer to spinlock controlling access to subscriber's server port + * @subscriber_list: adjacent subscribers in top. server's list of subscribers + * @subscription_list: list of subscription objects for this subscriber + */ + +struct subscriber { + u32 port_ref; + spinlock_t *lock; + struct list_head subscriber_list; + struct list_head subscription_list; +}; + +/** + * struct top_srv - TIPC network topology subscription service + * @user_ref: TIPC userid of subscription service + * @setup_port: reference to TIPC port that handles subscription requests + * @subscription_count: number of active subscriptions (not subscribers!) 
+ * @subscriber_list: list of ports subscribing to service + * @lock: spinlock govering access to subscriber list + */ + +struct top_srv { + u32 setup_port; + atomic_t subscription_count; + struct list_head subscriber_list; + spinlock_t lock; +}; + +static struct top_srv topsrv; + +/** + * htohl - convert value to endianness used by destination + * @in: value to convert + * @swap: non-zero if endianness must be reversed + * + * Returns converted value + */ + +static u32 htohl(u32 in, int swap) +{ + return swap ? swab32(in) : in; +} + +/** + * subscr_send_event - send a message containing a tipc_event to the subscriber + * + * Note: Must not hold subscriber's server port lock, since tipc_send() will + * try to take the lock if the message is rejected and returned! + */ + +static void subscr_send_event(struct subscription *sub, + u32 found_lower, + u32 found_upper, + u32 event, + u32 port_ref, + u32 node) +{ + struct iovec msg_sect; + + msg_sect.iov_base = (void *)&sub->evt; + msg_sect.iov_len = sizeof(struct tipc_event); + + sub->evt.event = htohl(event, sub->swap); + sub->evt.found_lower = htohl(found_lower, sub->swap); + sub->evt.found_upper = htohl(found_upper, sub->swap); + sub->evt.port.ref = htohl(port_ref, sub->swap); + sub->evt.port.node = htohl(node, sub->swap); + tipc_send(sub->server_ref, 1, &msg_sect, msg_sect.iov_len); +} + +/** + * tipc_subscr_overlap - test for subscription overlap with the given values + * + * Returns 1 if there is overlap, otherwise 0. + */ + +int tipc_subscr_overlap(struct subscription *sub, + u32 found_lower, + u32 found_upper) + +{ + if (found_lower < sub->seq.lower) + found_lower = sub->seq.lower; + if (found_upper > sub->seq.upper) + found_upper = sub->seq.upper; + if (found_lower > found_upper) + return 0; + return 1; +} + +/** + * tipc_subscr_report_overlap - issue event if there is subscription overlap + * + * Protected by nameseq.lock in name_table.c + */ + +void tipc_subscr_report_overlap(struct subscription *sub, + u32 found_lower, + u32 found_upper, + u32 event, + u32 port_ref, + u32 node, + int must) +{ + if (!tipc_subscr_overlap(sub, found_lower, found_upper)) + return; + if (!must && !(sub->filter & TIPC_SUB_PORTS)) + return; + + sub->event_cb(sub, found_lower, found_upper, event, port_ref, node); +} + +/** + * subscr_timeout - subscription timeout has occurred + */ + +static void subscr_timeout(struct subscription *sub) +{ + struct tipc_port *server_port; + + /* Validate server port reference (in case subscriber is terminating) */ + + server_port = tipc_port_lock(sub->server_ref); + if (server_port == NULL) + return; + + /* Validate timeout (in case subscription is being cancelled) */ + + if (sub->timeout == TIPC_WAIT_FOREVER) { + tipc_port_unlock(server_port); + return; + } + + /* Unlink subscription from name table */ + + tipc_nametbl_unsubscribe(sub); + + /* Unlink subscription from subscriber */ + + list_del(&sub->subscription_list); + + /* Release subscriber's server port */ + + tipc_port_unlock(server_port); + + /* Notify subscriber of timeout */ + + subscr_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper, + TIPC_SUBSCR_TIMEOUT, 0, 0); + + /* Now destroy subscription */ + + k_term_timer(&sub->timer); + kfree(sub); + atomic_dec(&topsrv.subscription_count); +} + +/** + * subscr_del - delete a subscription within a subscription list + * + * Called with subscriber port locked. 
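
The clamping arithmetic in tipc_subscr_overlap() can be restated as a standalone helper with a worked example (hypothetical name ranges_overlap; a sketch, not kernel code).

/* Hypothetical restatement of the clamping done by tipc_subscr_overlap(). */
static int ranges_overlap(unsigned int seq_lower, unsigned int seq_upper,
                          unsigned int found_lower, unsigned int found_upper)
{
        if (found_lower < seq_lower)
                found_lower = seq_lower;
        if (found_upper > seq_upper)
                found_upper = seq_upper;
        /* e.g. subscription [0, 99] vs. published range [50, 150]:
         * the clamped range is [50, 99], which is non-empty -> overlap. */
        return found_lower <= found_upper;
}
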
+ */ + +static void subscr_del(struct subscription *sub) +{ + tipc_nametbl_unsubscribe(sub); + list_del(&sub->subscription_list); + kfree(sub); + atomic_dec(&topsrv.subscription_count); +} + +/** + * subscr_terminate - terminate communication with a subscriber + * + * Called with subscriber port locked. Routine must temporarily release lock + * to enable subscription timeout routine(s) to finish without deadlocking; + * the lock is then reclaimed to allow caller to release it upon return. + * (This should work even in the unlikely event some other thread creates + * a new object reference in the interim that uses this lock; this routine will + * simply wait for it to be released, then claim it.) + */ + +static void subscr_terminate(struct subscriber *subscriber) +{ + u32 port_ref; + struct subscription *sub; + struct subscription *sub_temp; + + /* Invalidate subscriber reference */ + + port_ref = subscriber->port_ref; + subscriber->port_ref = 0; + spin_unlock_bh(subscriber->lock); + + /* Sever connection to subscriber */ + + tipc_shutdown(port_ref); + tipc_deleteport(port_ref); + + /* Destroy any existing subscriptions for subscriber */ + + list_for_each_entry_safe(sub, sub_temp, &subscriber->subscription_list, + subscription_list) { + if (sub->timeout != TIPC_WAIT_FOREVER) { + k_cancel_timer(&sub->timer); + k_term_timer(&sub->timer); + } + subscr_del(sub); + } + + /* Remove subscriber from topology server's subscriber list */ + + spin_lock_bh(&topsrv.lock); + list_del(&subscriber->subscriber_list); + spin_unlock_bh(&topsrv.lock); + + /* Reclaim subscriber lock */ + + spin_lock_bh(subscriber->lock); + + /* Now destroy subscriber */ + + kfree(subscriber); +} + +/** + * subscr_cancel - handle subscription cancellation request + * + * Called with subscriber port locked. Routine must temporarily release lock + * to enable the subscription timeout routine to finish without deadlocking; + * the lock is then reclaimed to allow caller to release it upon return. + * + * Note that fields of 's' use subscriber's endianness! + */ + +static void subscr_cancel(struct tipc_subscr *s, + struct subscriber *subscriber) +{ + struct subscription *sub; + struct subscription *sub_temp; + int found = 0; + + /* Find first matching subscription, exit if not found */ + + list_for_each_entry_safe(sub, sub_temp, &subscriber->subscription_list, + subscription_list) { + if (!memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) { + found = 1; + break; + } + } + if (!found) + return; + + /* Cancel subscription timer (if used), then delete subscription */ + + if (sub->timeout != TIPC_WAIT_FOREVER) { + sub->timeout = TIPC_WAIT_FOREVER; + spin_unlock_bh(subscriber->lock); + k_cancel_timer(&sub->timer); + k_term_timer(&sub->timer); + spin_lock_bh(subscriber->lock); + } + subscr_del(sub); +} + +/** + * subscr_subscribe - create subscription for subscriber + * + * Called with subscriber port locked. 
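
From the subscriber's side, the cancellation path above is triggered by resending the original request with TIPC_SUB_CANCEL set in the filter. A hedged sketch (hypothetical helper name cancel_subscription, assuming topsd is the connection to the topology service):

#include <string.h>
#include <sys/socket.h>
#include <linux/tipc.h>

/* Hypothetical helper: topsd is the SOCK_SEQPACKET connection to the
 * topology service and orig is the request sent earlier.  subscr_cancel()
 * matches the cancellation against the stored copy with memcmp(), so all
 * other fields must be byte-for-byte identical. */
static void cancel_subscription(int topsd, const struct tipc_subscr *orig)
{
        struct tipc_subscr cancel = *orig;

        cancel.filter |= TIPC_SUB_CANCEL;
        send(topsd, &cancel, sizeof(cancel), 0);
}
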
+ */ + +static struct subscription *subscr_subscribe(struct tipc_subscr *s, + struct subscriber *subscriber) +{ + struct subscription *sub; + int swap; + + /* Determine subscriber's endianness */ + + swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE)); + + /* Detect & process a subscription cancellation request */ + + if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) { + s->filter &= ~htohl(TIPC_SUB_CANCEL, swap); + subscr_cancel(s, subscriber); + return NULL; + } + + /* Refuse subscription if global limit exceeded */ + + if (atomic_read(&topsrv.subscription_count) >= tipc_max_subscriptions) { + warn("Subscription rejected, subscription limit reached (%u)\n", + tipc_max_subscriptions); + subscr_terminate(subscriber); + return NULL; + } + + /* Allocate subscription object */ + + sub = kmalloc(sizeof(*sub), GFP_ATOMIC); + if (!sub) { + warn("Subscription rejected, no memory\n"); + subscr_terminate(subscriber); + return NULL; + } + + /* Initialize subscription object */ + + sub->seq.type = htohl(s->seq.type, swap); + sub->seq.lower = htohl(s->seq.lower, swap); + sub->seq.upper = htohl(s->seq.upper, swap); + sub->timeout = htohl(s->timeout, swap); + sub->filter = htohl(s->filter, swap); + if ((!(sub->filter & TIPC_SUB_PORTS) == + !(sub->filter & TIPC_SUB_SERVICE)) || + (sub->seq.lower > sub->seq.upper)) { + warn("Subscription rejected, illegal request\n"); + kfree(sub); + subscr_terminate(subscriber); + return NULL; + } + sub->event_cb = subscr_send_event; + INIT_LIST_HEAD(&sub->nameseq_list); + list_add(&sub->subscription_list, &subscriber->subscription_list); + sub->server_ref = subscriber->port_ref; + sub->swap = swap; + memcpy(&sub->evt.s, s, sizeof(struct tipc_subscr)); + atomic_inc(&topsrv.subscription_count); + if (sub->timeout != TIPC_WAIT_FOREVER) { + k_init_timer(&sub->timer, + (Handler)subscr_timeout, (unsigned long)sub); + k_start_timer(&sub->timer, sub->timeout); + } + + return sub; +} + +/** + * subscr_conn_shutdown_event - handle termination request from subscriber + * + * Called with subscriber's server port unlocked. + */ + +static void subscr_conn_shutdown_event(void *usr_handle, + u32 port_ref, + struct sk_buff **buf, + unsigned char const *data, + unsigned int size, + int reason) +{ + struct subscriber *subscriber = usr_handle; + spinlock_t *subscriber_lock; + + if (tipc_port_lock(port_ref) == NULL) + return; + + subscriber_lock = subscriber->lock; + subscr_terminate(subscriber); + spin_unlock_bh(subscriber_lock); +} + +/** + * subscr_conn_msg_event - handle new subscription request from subscriber + * + * Called with subscriber's server port unlocked. 
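
Putting the subscription handling together, a userspace topology subscriber looks roughly like the sketch below (assuming the tipc_subscr and tipc_event definitions from <linux/tipc.h>; the watched name type 18888 is an arbitrary example). The request can be filled in native byte order, since subscr_subscribe() infers the subscriber's endianness from the filter bits and converts events accordingly.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/tipc.h>

int main(void)
{
        struct sockaddr_tipc topsrv;
        struct tipc_subscr sub;
        struct tipc_event evt;
        int sd = socket(AF_TIPC, SOCK_SEQPACKET, 0);

        /* The service listens on the reserved name {TIPC_TOP_SRV, TIPC_TOP_SRV}. */
        memset(&topsrv, 0, sizeof(topsrv));
        topsrv.family = AF_TIPC;
        topsrv.addrtype = TIPC_ADDR_NAME;
        topsrv.addr.name.name.type = TIPC_TOP_SRV;
        topsrv.addr.name.name.instance = TIPC_TOP_SRV;

        if (connect(sd, (struct sockaddr *)&topsrv, sizeof(topsrv)) != 0) {
                perror("connect");
                return 1;
        }

        /* Watch name type 18888 (arbitrary example), instances 0..99. */
        memset(&sub, 0, sizeof(sub));
        sub.seq.type = 18888;
        sub.seq.lower = 0;
        sub.seq.upper = 99;
        sub.timeout = TIPC_WAIT_FOREVER;
        sub.filter = TIPC_SUB_SERVICE;
        send(sd, &sub, sizeof(sub), 0);

        while (recv(sd, &evt, sizeof(evt), 0) == sizeof(evt)) {
                if (evt.event == TIPC_PUBLISHED)
                        printf("published: %u..%u\n", evt.found_lower, evt.found_upper);
                else if (evt.event == TIPC_WITHDRAWN)
                        printf("withdrawn: %u..%u\n", evt.found_lower, evt.found_upper);
        }
        return 0;
}
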
+ */ + +static void subscr_conn_msg_event(void *usr_handle, + u32 port_ref, + struct sk_buff **buf, + const unchar *data, + u32 size) +{ + struct subscriber *subscriber = usr_handle; + spinlock_t *subscriber_lock; + struct subscription *sub; + + /* + * Lock subscriber's server port (& make a local copy of lock pointer, + * in case subscriber is deleted while processing subscription request) + */ + + if (tipc_port_lock(port_ref) == NULL) + return; + + subscriber_lock = subscriber->lock; + + if (size != sizeof(struct tipc_subscr)) { + subscr_terminate(subscriber); + spin_unlock_bh(subscriber_lock); + } else { + sub = subscr_subscribe((struct tipc_subscr *)data, subscriber); + spin_unlock_bh(subscriber_lock); + if (sub != NULL) { + + /* + * We must release the server port lock before adding a + * subscription to the name table since TIPC needs to be + * able to (re)acquire the port lock if an event message + * issued by the subscription process is rejected and + * returned. The subscription cannot be deleted while + * it is being added to the name table because: + * a) the single-threading of the native API port code + * ensures the subscription cannot be cancelled and + * the subscriber connection cannot be broken, and + * b) the name table lock ensures the subscription + * timeout code cannot delete the subscription, + * so the subscription object is still protected. + */ + + tipc_nametbl_subscribe(sub); + } + } +} + +/** + * subscr_named_msg_event - handle request to establish a new subscriber + */ + +static void subscr_named_msg_event(void *usr_handle, + u32 port_ref, + struct sk_buff **buf, + const unchar *data, + u32 size, + u32 importance, + struct tipc_portid const *orig, + struct tipc_name_seq const *dest) +{ + struct subscriber *subscriber; + u32 server_port_ref; + + /* Create subscriber object */ + + subscriber = kzalloc(sizeof(struct subscriber), GFP_ATOMIC); + if (subscriber == NULL) { + warn("Subscriber rejected, no memory\n"); + return; + } + INIT_LIST_HEAD(&subscriber->subscription_list); + INIT_LIST_HEAD(&subscriber->subscriber_list); + + /* Create server port & establish connection to subscriber */ + + tipc_createport(subscriber, + importance, + NULL, + NULL, + subscr_conn_shutdown_event, + NULL, + NULL, + subscr_conn_msg_event, + NULL, + &subscriber->port_ref); + if (subscriber->port_ref == 0) { + warn("Subscriber rejected, unable to create port\n"); + kfree(subscriber); + return; + } + tipc_connect2port(subscriber->port_ref, orig); + + /* Lock server port (& save lock address for future use) */ + + subscriber->lock = tipc_port_lock(subscriber->port_ref)->lock; + + /* Add subscriber to topology server's subscriber list */ + + spin_lock_bh(&topsrv.lock); + list_add(&subscriber->subscriber_list, &topsrv.subscriber_list); + spin_unlock_bh(&topsrv.lock); + + /* Unlock server port */ + + server_port_ref = subscriber->port_ref; + spin_unlock_bh(subscriber->lock); + + /* Send an ACK- to complete connection handshaking */ + + tipc_send(server_port_ref, 0, NULL, 0); + + /* Handle optional subscription request */ + + if (size != 0) { + subscr_conn_msg_event(subscriber, server_port_ref, + buf, data, size); + } +} + +int tipc_subscr_start(void) +{ + struct tipc_name_seq seq = {TIPC_TOP_SRV, TIPC_TOP_SRV, TIPC_TOP_SRV}; + int res; + + memset(&topsrv, 0, sizeof(topsrv)); + spin_lock_init(&topsrv.lock); + INIT_LIST_HEAD(&topsrv.subscriber_list); + + res = tipc_createport(NULL, + TIPC_CRITICAL_IMPORTANCE, + NULL, + NULL, + NULL, + NULL, + subscr_named_msg_event, + NULL, + NULL, + 
&topsrv.setup_port); + if (res) + goto failed; + + res = tipc_nametbl_publish_rsv(topsrv.setup_port, TIPC_NODE_SCOPE, &seq); + if (res) { + tipc_deleteport(topsrv.setup_port); + topsrv.setup_port = 0; + goto failed; + } + + return 0; + +failed: + err("Failed to create subscription service\n"); + return res; +} + +void tipc_subscr_stop(void) +{ + struct subscriber *subscriber; + struct subscriber *subscriber_temp; + spinlock_t *subscriber_lock; + + if (topsrv.setup_port) { + tipc_deleteport(topsrv.setup_port); + topsrv.setup_port = 0; + + list_for_each_entry_safe(subscriber, subscriber_temp, + &topsrv.subscriber_list, + subscriber_list) { + subscriber_lock = subscriber->lock; + spin_lock_bh(subscriber_lock); + subscr_terminate(subscriber); + spin_unlock_bh(subscriber_lock); + } + } +} diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h new file mode 100644 index 00000000..45d89bf4 --- /dev/null +++ b/net/tipc/subscr.h @@ -0,0 +1,90 @@ +/* + * net/tipc/subscr.h: Include file for TIPC network topology service + * + * Copyright (c) 2003-2006, Ericsson AB + * Copyright (c) 2005-2007, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _TIPC_SUBSCR_H +#define _TIPC_SUBSCR_H + +struct subscription; + +typedef void (*tipc_subscr_event) (struct subscription *sub, + u32 found_lower, u32 found_upper, + u32 event, u32 port_ref, u32 node); + +/** + * struct subscription - TIPC network topology subscription object + * @seq: name sequence associated with subscription + * @timeout: duration of subscription (in ms) + * @filter: event filtering to be done for subscription + * @event_cb: routine invoked when a subscription event is detected + * @timer: timer governing subscription duration (optional) + * @nameseq_list: adjacent subscriptions in name sequence's subscription list + * @subscription_list: adjacent subscriptions in subscriber's subscription list + * @server_ref: object reference of server port associated with subscription + * @swap: indicates if subscriber uses opposite endianness in its messages + * @evt: template for events generated by subscription + */ + +struct subscription { + struct tipc_name_seq seq; + u32 timeout; + u32 filter; + tipc_subscr_event event_cb; + struct timer_list timer; + struct list_head nameseq_list; + struct list_head subscription_list; + u32 server_ref; + int swap; + struct tipc_event evt; +}; + +int tipc_subscr_overlap(struct subscription *sub, + u32 found_lower, + u32 found_upper); + +void tipc_subscr_report_overlap(struct subscription *sub, + u32 found_lower, + u32 found_upper, + u32 event, + u32 port_ref, + u32 node, + int must_report); + +int tipc_subscr_start(void); + +void tipc_subscr_stop(void); + + +#endif diff --git a/net/unix/Kconfig b/net/unix/Kconfig new file mode 100644 index 00000000..5a69733b --- /dev/null +++ b/net/unix/Kconfig @@ -0,0 +1,21 @@ +# +# Unix Domain Sockets +# + +config UNIX + tristate "Unix domain sockets" + ---help--- + If you say Y here, you will include support for Unix domain sockets; + sockets are the standard Unix mechanism for establishing and + accessing network connections. Many commonly used programs such as + the X Window system and syslog use these sockets even if your + machine is not connected to any network. Unless you are working on + an embedded system or something similar, you therefore definitely + want to say Y here. + + To compile this driver as a module, choose M here: the module will be + called unix. Note that several important services won't work + correctly if you say M here and then neglect to load the module. + + Say Y unless you know what you are doing. + diff --git a/net/unix/Makefile b/net/unix/Makefile new file mode 100644 index 00000000..b852a2bd --- /dev/null +++ b/net/unix/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the Linux unix domain socket layer. +# + +obj-$(CONFIG_UNIX) += unix.o + +unix-y := af_unix.o garbage.o +unix-$(CONFIG_SYSCTL) += sysctl_net_unix.o diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c new file mode 100644 index 00000000..0722a25a --- /dev/null +++ b/net/unix/af_unix.c @@ -0,0 +1,2385 @@ +/* + * NET4: Implementation of BSD Unix domain sockets. + * + * Authors: Alan Cox, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Linus Torvalds : Assorted bug cures. + * Niibe Yutaka : async I/O support. + * Carsten Paeth : PF_UNIX check, address fixes. + * Alan Cox : Limit size of allocated blocks. + * Alan Cox : Fixed the stupid socketpair bug. 
+ * Alan Cox : BSD compatibility fine tuning. + * Alan Cox : Fixed a bug in connect when interrupted. + * Alan Cox : Sorted out a proper draft version of + * file descriptor passing hacked up from + * Mike Shaver's work. + * Marty Leisner : Fixes to fd passing + * Nick Nevin : recvmsg bugfix. + * Alan Cox : Started proper garbage collector + * Heiko EiBfeldt : Missing verify_area check + * Alan Cox : Started POSIXisms + * Andreas Schwab : Replace inode by dentry for proper + * reference counting + * Kirk Petersen : Made this a module + * Christoph Rohland : Elegant non-blocking accept/connect algorithm. + * Lots of bug fixes. + * Alexey Kuznetosv : Repaired (I hope) bugs introduces + * by above two patches. + * Andrea Arcangeli : If possible we block in connect(2) + * if the max backlog of the listen socket + * is been reached. This won't break + * old apps and it will avoid huge amount + * of socks hashed (this for unix_gc() + * performances reasons). + * Security fix that limits the max + * number of socks to 2*max_files and + * the number of skb queueable in the + * dgram receiver. + * Artur Skawina : Hash function optimizations + * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8) + * Malcolm Beattie : Set peercred for socketpair + * Michal Ostrowski : Module initialization cleanup. + * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT, + * the core infrastructure is doing that + * for all net proto families now (2.5.69+) + * + * + * Known differences from reference BSD that was tested: + * + * [TO FIX] + * ECONNREFUSED is not returned from one end of a connected() socket to the + * other the moment one end closes. + * fstat() doesn't return st_dev=0, and give the blksize as high water mark + * and a fake inode identifier (nor the BSD first socket fstat twice bug). + * [NOT TO FIX] + * accept() returns a path name even if the connecting socket has closed + * in the meantime (BSD loses the path and gives up). + * accept() returns 0 length path for an unbound connector. BSD returns 16 + * and a null first byte in the path (but not for gethost/peername - BSD bug ??) + * socketpair(...SOCK_RAW..) doesn't panic the kernel. + * BSD af_unix apparently has connect forgetting to block properly. + * (need to check this with the POSIX spec in detail) + * + * Differences from 2.0.0-11-... (ANK) + * Bug fixes and improvements. + * - client shutdown killed server socket. + * - removed all useless cli/sti pairs. + * + * Semantic changes/extensions. + * - generic control message passing. + * - SCM_CREDENTIALS control message. + * - "Abstract" (not FS based) socket bindings. + * Abstract names are sequences of bytes (not zero terminated) + * started by 0, so that this name space does not intersect + * with BSD names. 
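+ *
+ * For illustration only (editor's note, not part of the original text),
+ * an abstract binding from user space looks roughly like this, assuming
+ * fd came from socket(AF_UNIX, SOCK_STREAM, 0):
+ *
+ *	struct sockaddr_un a = { .sun_family = AF_UNIX };
+ *	memcpy(a.sun_path, "\0example", 8);	/* leading NUL + 7 name bytes */
+ *	bind(fd, (struct sockaddr *)&a,
+ *	     offsetof(struct sockaddr_un, sun_path) + 8);
+ *
+ * No filesystem node is created; the name lives only in the socket hash.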
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; +static DEFINE_SPINLOCK(unix_table_lock); +static atomic_long_t unix_nr_socks; + +#define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) + +#define UNIX_ABSTRACT(sk) (unix_sk(sk)->addr->hash != UNIX_HASH_SIZE) + +#ifdef CONFIG_SECURITY_NETWORK +static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) +{ + memcpy(UNIXSID(skb), &scm->secid, sizeof(u32)); +} + +static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) +{ + scm->secid = *UNIXSID(skb); +} +#else +static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) +{ } + +static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) +{ } +#endif /* CONFIG_SECURITY_NETWORK */ + +/* + * SMP locking strategy: + * hash table is protected with spinlock unix_table_lock + * each socket state is protected by separate spin lock. + */ + +static inline unsigned unix_hash_fold(__wsum n) +{ + unsigned hash = (__force unsigned)n; + hash ^= hash>>16; + hash ^= hash>>8; + return hash&(UNIX_HASH_SIZE-1); +} + +#define unix_peer(sk) (unix_sk(sk)->peer) + +static inline int unix_our_peer(struct sock *sk, struct sock *osk) +{ + return unix_peer(osk) == sk; +} + +static inline int unix_may_send(struct sock *sk, struct sock *osk) +{ + return unix_peer(osk) == NULL || unix_our_peer(sk, osk); +} + +static inline int unix_recvq_full(struct sock const *sk) +{ + return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog; +} + +static struct sock *unix_peer_get(struct sock *s) +{ + struct sock *peer; + + unix_state_lock(s); + peer = unix_peer(s); + if (peer) + sock_hold(peer); + unix_state_unlock(s); + return peer; +} + +static inline void unix_release_addr(struct unix_address *addr) +{ + if (atomic_dec_and_test(&addr->refcnt)) + kfree(addr); +} + +/* + * Check unix socket name: + * - should be not zero length. + * - if started by not zero, should be NULL terminated (FS object) + * - if started by zero, it is abstract name. + */ + +static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned *hashp) +{ + if (len <= sizeof(short) || len > sizeof(*sunaddr)) + return -EINVAL; + if (!sunaddr || sunaddr->sun_family != AF_UNIX) + return -EINVAL; + if (sunaddr->sun_path[0]) { + /* + * This may look like an off by one error but it is a bit more + * subtle. 108 is the longest valid AF_UNIX path for a binding. + * sun_path[108] doesn't as such exist. However in kernel space + * we are guaranteed that it is a valid memory location in our + * kernel address buffer. 
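+ * Writing a NUL at sunaddr[len] below therefore bounds the strlen()
+ * on sun_path without risking an out-of-range access.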
+ */ + ((char *)sunaddr)[len] = 0; + len = strlen(sunaddr->sun_path)+1+sizeof(short); + return len; + } + + *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0)); + return len; +} + +static void __unix_remove_socket(struct sock *sk) +{ + sk_del_node_init(sk); +} + +static void __unix_insert_socket(struct hlist_head *list, struct sock *sk) +{ + WARN_ON(!sk_unhashed(sk)); + sk_add_node(sk, list); +} + +static inline void unix_remove_socket(struct sock *sk) +{ + spin_lock(&unix_table_lock); + __unix_remove_socket(sk); + spin_unlock(&unix_table_lock); +} + +static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk) +{ + spin_lock(&unix_table_lock); + __unix_insert_socket(list, sk); + spin_unlock(&unix_table_lock); +} + +static struct sock *__unix_find_socket_byname(struct net *net, + struct sockaddr_un *sunname, + int len, int type, unsigned hash) +{ + struct sock *s; + struct hlist_node *node; + + sk_for_each(s, node, &unix_socket_table[hash ^ type]) { + struct unix_sock *u = unix_sk(s); + + if (!net_eq(sock_net(s), net)) + continue; + + if (u->addr->len == len && + !memcmp(u->addr->name, sunname, len)) + goto found; + } + s = NULL; +found: + return s; +} + +static inline struct sock *unix_find_socket_byname(struct net *net, + struct sockaddr_un *sunname, + int len, int type, + unsigned hash) +{ + struct sock *s; + + spin_lock(&unix_table_lock); + s = __unix_find_socket_byname(net, sunname, len, type, hash); + if (s) + sock_hold(s); + spin_unlock(&unix_table_lock); + return s; +} + +static struct sock *unix_find_socket_byinode(struct inode *i) +{ + struct sock *s; + struct hlist_node *node; + + spin_lock(&unix_table_lock); + sk_for_each(s, node, + &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { + struct dentry *dentry = unix_sk(s)->dentry; + + if (dentry && dentry->d_inode == i) { + sock_hold(s); + goto found; + } + } + s = NULL; +found: + spin_unlock(&unix_table_lock); + return s; +} + +static inline int unix_writable(struct sock *sk) +{ + return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf; +} + +static void unix_write_space(struct sock *sk) +{ + struct socket_wq *wq; + + rcu_read_lock(); + if (unix_writable(sk)) { + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, + POLLOUT | POLLWRNORM | POLLWRBAND); + sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + } + rcu_read_unlock(); +} + +/* When dgram socket disconnects (or changes its peer), we clear its receive + * queue of packets arrived from previous peer. First, it allows to do + * flow control based only on wmem_alloc; second, sk connected to peer + * may receive messages only from that peer. */ +static void unix_dgram_disconnected(struct sock *sk, struct sock *other) +{ + if (!skb_queue_empty(&sk->sk_receive_queue)) { + skb_queue_purge(&sk->sk_receive_queue); + wake_up_interruptible_all(&unix_sk(sk)->peer_wait); + + /* If one link of bidirectional dgram pipe is disconnected, + * we signal error. Messages are lost. Do not make this, + * when peer was not connected to us. 
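+ * (ECONNRESET is raised on 'other' only if it is still alive and
+ * still lists sk as its peer.)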
+ */ + if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) { + other->sk_err = ECONNRESET; + other->sk_error_report(other); + } + } +} + +static void unix_sock_destructor(struct sock *sk) +{ + struct unix_sock *u = unix_sk(sk); + + skb_queue_purge(&sk->sk_receive_queue); + + WARN_ON(atomic_read(&sk->sk_wmem_alloc)); + WARN_ON(!sk_unhashed(sk)); + WARN_ON(sk->sk_socket); + if (!sock_flag(sk, SOCK_DEAD)) { + printk(KERN_INFO "Attempt to release alive unix socket: %p\n", sk); + return; + } + + if (u->addr) + unix_release_addr(u->addr); + + atomic_long_dec(&unix_nr_socks); + local_bh_disable(); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + local_bh_enable(); +#ifdef UNIX_REFCNT_DEBUG + printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk, + atomic_long_read(&unix_nr_socks)); +#endif +} + +static int unix_release_sock(struct sock *sk, int embrion) +{ + struct unix_sock *u = unix_sk(sk); + struct dentry *dentry; + struct vfsmount *mnt; + struct sock *skpair; + struct sk_buff *skb; + int state; + + unix_remove_socket(sk); + + /* Clear state */ + unix_state_lock(sk); + sock_orphan(sk); + sk->sk_shutdown = SHUTDOWN_MASK; + dentry = u->dentry; + u->dentry = NULL; + mnt = u->mnt; + u->mnt = NULL; + state = sk->sk_state; + sk->sk_state = TCP_CLOSE; + unix_state_unlock(sk); + + wake_up_interruptible_all(&u->peer_wait); + + skpair = unix_peer(sk); + + if (skpair != NULL) { + if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { + unix_state_lock(skpair); + /* No more writes */ + skpair->sk_shutdown = SHUTDOWN_MASK; + if (!skb_queue_empty(&sk->sk_receive_queue) || embrion) + skpair->sk_err = ECONNRESET; + unix_state_unlock(skpair); + skpair->sk_state_change(skpair); + sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP); + } + sock_put(skpair); /* It may now die */ + unix_peer(sk) = NULL; + } + + /* Try to flush out this socket. Throw out buffers at least */ + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + if (state == TCP_LISTEN) + unix_release_sock(skb->sk, 1); + /* passed fds are erased in the kfree_skb hook */ + kfree_skb(skb); + } + + if (dentry) { + dput(dentry); + mntput(mnt); + } + + sock_put(sk); + + /* ---- Socket is dead now and most probably destroyed ---- */ + + /* + * Fixme: BSD difference: In BSD all sockets connected to use get + * ECONNRESET and we die on the spot. In Linux we behave + * like files and pipes do and wait for the last + * dereference. + * + * Can't we simply set sock->err? + * + * What the above comment does talk about? 
--ANK(980817) + */ + + if (unix_tot_inflight) + unix_gc(); /* Garbage collect fds */ + + return 0; +} + +static void init_peercred(struct sock *sk) +{ + put_pid(sk->sk_peer_pid); + if (sk->sk_peer_cred) + put_cred(sk->sk_peer_cred); + sk->sk_peer_pid = get_pid(task_tgid(current)); + sk->sk_peer_cred = get_current_cred(); +} + +static void copy_peercred(struct sock *sk, struct sock *peersk) +{ + put_pid(sk->sk_peer_pid); + if (sk->sk_peer_cred) + put_cred(sk->sk_peer_cred); + sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); + sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); +} + +static int unix_listen(struct socket *sock, int backlog) +{ + int err; + struct sock *sk = sock->sk; + struct unix_sock *u = unix_sk(sk); + struct pid *old_pid = NULL; + const struct cred *old_cred = NULL; + + err = -EOPNOTSUPP; + if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) + goto out; /* Only stream/seqpacket sockets accept */ + err = -EINVAL; + if (!u->addr) + goto out; /* No listens on an unbound socket */ + unix_state_lock(sk); + if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) + goto out_unlock; + if (backlog > sk->sk_max_ack_backlog) + wake_up_interruptible_all(&u->peer_wait); + sk->sk_max_ack_backlog = backlog; + sk->sk_state = TCP_LISTEN; + /* set credentials so connect can copy them */ + init_peercred(sk); + err = 0; + +out_unlock: + unix_state_unlock(sk); + put_pid(old_pid); + if (old_cred) + put_cred(old_cred); +out: + return err; +} + +static int unix_release(struct socket *); +static int unix_bind(struct socket *, struct sockaddr *, int); +static int unix_stream_connect(struct socket *, struct sockaddr *, + int addr_len, int flags); +static int unix_socketpair(struct socket *, struct socket *); +static int unix_accept(struct socket *, struct socket *, int); +static int unix_getname(struct socket *, struct sockaddr *, int *, int); +static unsigned int unix_poll(struct file *, struct socket *, poll_table *); +static unsigned int unix_dgram_poll(struct file *, struct socket *, + poll_table *); +static int unix_ioctl(struct socket *, unsigned int, unsigned long); +static int unix_shutdown(struct socket *, int); +static int unix_stream_sendmsg(struct kiocb *, struct socket *, + struct msghdr *, size_t); +static int unix_stream_recvmsg(struct kiocb *, struct socket *, + struct msghdr *, size_t, int); +static int unix_dgram_sendmsg(struct kiocb *, struct socket *, + struct msghdr *, size_t); +static int unix_dgram_recvmsg(struct kiocb *, struct socket *, + struct msghdr *, size_t, int); +static int unix_dgram_connect(struct socket *, struct sockaddr *, + int, int); +static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *, + struct msghdr *, size_t); +static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *, + struct msghdr *, size_t, int); + +static const struct proto_ops unix_stream_ops = { + .family = PF_UNIX, + .owner = THIS_MODULE, + .release = unix_release, + .bind = unix_bind, + .connect = unix_stream_connect, + .socketpair = unix_socketpair, + .accept = unix_accept, + .getname = unix_getname, + .poll = unix_poll, + .ioctl = unix_ioctl, + .listen = unix_listen, + .shutdown = unix_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = unix_stream_sendmsg, + .recvmsg = unix_stream_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static const struct proto_ops unix_dgram_ops = { + .family = PF_UNIX, + .owner = THIS_MODULE, + .release = unix_release, + .bind = unix_bind, + .connect = 
unix_dgram_connect, + .socketpair = unix_socketpair, + .accept = sock_no_accept, + .getname = unix_getname, + .poll = unix_dgram_poll, + .ioctl = unix_ioctl, + .listen = sock_no_listen, + .shutdown = unix_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = unix_dgram_sendmsg, + .recvmsg = unix_dgram_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static const struct proto_ops unix_seqpacket_ops = { + .family = PF_UNIX, + .owner = THIS_MODULE, + .release = unix_release, + .bind = unix_bind, + .connect = unix_stream_connect, + .socketpair = unix_socketpair, + .accept = unix_accept, + .getname = unix_getname, + .poll = unix_dgram_poll, + .ioctl = unix_ioctl, + .listen = unix_listen, + .shutdown = unix_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = unix_seqpacket_sendmsg, + .recvmsg = unix_seqpacket_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct proto unix_proto = { + .name = "UNIX", + .owner = THIS_MODULE, + .obj_size = sizeof(struct unix_sock), +}; + +/* + * AF_UNIX sockets do not interact with hardware, hence they + * dont trigger interrupts - so it's safe for them to have + * bh-unsafe locking for their sk_receive_queue.lock. Split off + * this special lock-class by reinitializing the spinlock key: + */ +static struct lock_class_key af_unix_sk_receive_queue_lock_key; + +static struct sock *unix_create1(struct net *net, struct socket *sock) +{ + struct sock *sk = NULL; + struct unix_sock *u; + + atomic_long_inc(&unix_nr_socks); + if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) + goto out; + + sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); + if (!sk) + goto out; + + sock_init_data(sock, sk); + lockdep_set_class(&sk->sk_receive_queue.lock, + &af_unix_sk_receive_queue_lock_key); + + sk->sk_write_space = unix_write_space; + sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; + sk->sk_destruct = unix_sock_destructor; + u = unix_sk(sk); + u->dentry = NULL; + u->mnt = NULL; + spin_lock_init(&u->lock); + atomic_long_set(&u->inflight, 0); + INIT_LIST_HEAD(&u->link); + mutex_init(&u->readlock); /* single task reading lock */ + init_waitqueue_head(&u->peer_wait); + unix_insert_socket(unix_sockets_unbound, sk); +out: + if (sk == NULL) + atomic_long_dec(&unix_nr_socks); + else { + local_bh_disable(); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + local_bh_enable(); + } + return sk; +} + +static int unix_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + if (protocol && protocol != PF_UNIX) + return -EPROTONOSUPPORT; + + sock->state = SS_UNCONNECTED; + + switch (sock->type) { + case SOCK_STREAM: + sock->ops = &unix_stream_ops; + break; + /* + * Believe it or not BSD has AF_UNIX, SOCK_RAW though + * nothing uses it. + */ + case SOCK_RAW: + sock->type = SOCK_DGRAM; + case SOCK_DGRAM: + sock->ops = &unix_dgram_ops; + break; + case SOCK_SEQPACKET: + sock->ops = &unix_seqpacket_ops; + break; + default: + return -ESOCKTNOSUPPORT; + } + + return unix_create1(net, sock) ? 
0 : -ENOMEM; +} + +static int unix_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (!sk) + return 0; + + sock->sk = NULL; + + return unix_release_sock(sk, 0); +} + +static int unix_autobind(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct unix_sock *u = unix_sk(sk); + static u32 ordernum = 1; + struct unix_address *addr; + int err; + unsigned int retries = 0; + + mutex_lock(&u->readlock); + + err = 0; + if (u->addr) + goto out; + + err = -ENOMEM; + addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL); + if (!addr) + goto out; + + addr->name->sun_family = AF_UNIX; + atomic_set(&addr->refcnt, 1); + +retry: + addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short); + addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0)); + + spin_lock(&unix_table_lock); + ordernum = (ordernum+1)&0xFFFFF; + + if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type, + addr->hash)) { + spin_unlock(&unix_table_lock); + /* + * __unix_find_socket_byname() may take long time if many names + * are already in use. + */ + cond_resched(); + /* Give up if all names seems to be in use. */ + if (retries++ == 0xFFFFF) { + err = -ENOSPC; + kfree(addr); + goto out; + } + goto retry; + } + addr->hash ^= sk->sk_type; + + __unix_remove_socket(sk); + u->addr = addr; + __unix_insert_socket(&unix_socket_table[addr->hash], sk); + spin_unlock(&unix_table_lock); + err = 0; + +out: mutex_unlock(&u->readlock); + return err; +} + +static struct sock *unix_find_other(struct net *net, + struct sockaddr_un *sunname, int len, + int type, unsigned hash, int *error) +{ + struct sock *u; + struct path path; + int err = 0; + + if (sunname->sun_path[0]) { + struct inode *inode; + err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path); + if (err) + goto fail; + inode = path.dentry->d_inode; + err = inode_permission(inode, MAY_WRITE); + if (err) + goto put_fail; + + err = -ECONNREFUSED; + if (!S_ISSOCK(inode->i_mode)) + goto put_fail; + u = unix_find_socket_byinode(inode); + if (!u) + goto put_fail; + + if (u->sk_type == type) + touch_atime(path.mnt, path.dentry); + + path_put(&path); + + err = -EPROTOTYPE; + if (u->sk_type != type) { + sock_put(u); + goto fail; + } + } else { + err = -ECONNREFUSED; + u = unix_find_socket_byname(net, sunname, len, type, hash); + if (u) { + struct dentry *dentry; + dentry = unix_sk(u)->dentry; + if (dentry) + touch_atime(unix_sk(u)->mnt, dentry); + } else + goto fail; + } + return u; + +put_fail: + path_put(&path); +fail: + *error = err; + return NULL; +} + + +static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct unix_sock *u = unix_sk(sk); + struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; + struct dentry *dentry = NULL; + struct nameidata nd; + int err; + unsigned hash; + struct unix_address *addr; + struct hlist_head *list; + + err = -EINVAL; + if (sunaddr->sun_family != AF_UNIX) + goto out; + + if (addr_len == sizeof(short)) { + err = unix_autobind(sock); + goto out; + } + + err = unix_mkname(sunaddr, addr_len, &hash); + if (err < 0) + goto out; + addr_len = err; + + mutex_lock(&u->readlock); + + err = -EINVAL; + if (u->addr) + goto out_up; + + err = -ENOMEM; + addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); + if (!addr) + goto out_up; + + memcpy(addr->name, sunaddr, addr_len); + addr->len = addr_len; + addr->hash = hash ^ sk->sk_type; + atomic_set(&addr->refcnt, 1); 
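+
+	/*
+	 * Two naming schemes follow: a path starting with a non-NUL byte is
+	 * backed by a socket inode created via mknod and hashed by inode
+	 * number, while a leading NUL keeps the name in the abstract
+	 * namespace, hashed from the name bytes themselves.
+	 */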
+ + if (sunaddr->sun_path[0]) { + unsigned int mode; + err = 0; + /* + * Get the parent directory, calculate the hash for last + * component. + */ + err = kern_path_parent(sunaddr->sun_path, &nd); + if (err) + goto out_mknod_parent; + + dentry = lookup_create(&nd, 0); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_mknod_unlock; + + /* + * All right, let's create it. + */ + mode = S_IFSOCK | + (SOCK_INODE(sock)->i_mode & ~current_umask()); + err = mnt_want_write(nd.path.mnt); + if (err) + goto out_mknod_dput; + err = security_path_mknod(&nd.path, dentry, mode, 0); + if (err) + goto out_mknod_drop_write; + err = vfs_mknod(nd.path.dentry->d_inode, dentry, mode, 0); +out_mknod_drop_write: + mnt_drop_write(nd.path.mnt); + if (err) + goto out_mknod_dput; + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + dput(nd.path.dentry); + nd.path.dentry = dentry; + + addr->hash = UNIX_HASH_SIZE; + } + + spin_lock(&unix_table_lock); + + if (!sunaddr->sun_path[0]) { + err = -EADDRINUSE; + if (__unix_find_socket_byname(net, sunaddr, addr_len, + sk->sk_type, hash)) { + unix_release_addr(addr); + goto out_unlock; + } + + list = &unix_socket_table[addr->hash]; + } else { + list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)]; + u->dentry = nd.path.dentry; + u->mnt = nd.path.mnt; + } + + err = 0; + __unix_remove_socket(sk); + u->addr = addr; + __unix_insert_socket(list, sk); + +out_unlock: + spin_unlock(&unix_table_lock); +out_up: + mutex_unlock(&u->readlock); +out: + return err; + +out_mknod_dput: + dput(dentry); +out_mknod_unlock: + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + path_put(&nd.path); +out_mknod_parent: + if (err == -EEXIST) + err = -EADDRINUSE; + unix_release_addr(addr); + goto out_up; +} + +static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) +{ + if (unlikely(sk1 == sk2) || !sk2) { + unix_state_lock(sk1); + return; + } + if (sk1 < sk2) { + unix_state_lock(sk1); + unix_state_lock_nested(sk2); + } else { + unix_state_lock(sk2); + unix_state_lock_nested(sk1); + } +} + +static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) +{ + if (unlikely(sk1 == sk2) || !sk2) { + unix_state_unlock(sk1); + return; + } + unix_state_unlock(sk1); + unix_state_unlock(sk2); +} + +static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, + int alen, int flags) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; + struct sock *other; + unsigned hash; + int err; + + if (addr->sa_family != AF_UNSPEC) { + err = unix_mkname(sunaddr, alen, &hash); + if (err < 0) + goto out; + alen = err; + + if (test_bit(SOCK_PASSCRED, &sock->flags) && + !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0) + goto out; + +restart: + other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err); + if (!other) + goto out; + + unix_state_double_lock(sk, other); + + /* Apparently VFS overslept socket death. Retry. */ + if (sock_flag(other, SOCK_DEAD)) { + unix_state_double_unlock(sk, other); + sock_put(other); + goto restart; + } + + err = -EPERM; + if (!unix_may_send(sk, other)) + goto out_unlock; + + err = security_unix_may_send(sk->sk_socket, other->sk_socket); + if (err) + goto out_unlock; + + } else { + /* + * 1003.1g breaking connected state with AF_UNSPEC + */ + other = NULL; + unix_state_double_lock(sk, other); + } + + /* + * If it was connected, reconnect. 
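+ * If the peer actually changes, datagrams queued from the previous
+ * peer are purged via unix_dgram_disconnected().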
+ */ + if (unix_peer(sk)) { + struct sock *old_peer = unix_peer(sk); + unix_peer(sk) = other; + unix_state_double_unlock(sk, other); + + if (other != old_peer) + unix_dgram_disconnected(sk, old_peer); + sock_put(old_peer); + } else { + unix_peer(sk) = other; + unix_state_double_unlock(sk, other); + } + return 0; + +out_unlock: + unix_state_double_unlock(sk, other); + sock_put(other); +out: + return err; +} + +static long unix_wait_for_peer(struct sock *other, long timeo) +{ + struct unix_sock *u = unix_sk(other); + int sched; + DEFINE_WAIT(wait); + + prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE); + + sched = !sock_flag(other, SOCK_DEAD) && + !(other->sk_shutdown & RCV_SHUTDOWN) && + unix_recvq_full(other); + + unix_state_unlock(other); + + if (sched) + timeo = schedule_timeout(timeo); + + finish_wait(&u->peer_wait, &wait); + return timeo; +} + +static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct unix_sock *u = unix_sk(sk), *newu, *otheru; + struct sock *newsk = NULL; + struct sock *other = NULL; + struct sk_buff *skb = NULL; + unsigned hash; + int st; + int err; + long timeo; + + err = unix_mkname(sunaddr, addr_len, &hash); + if (err < 0) + goto out; + addr_len = err; + + if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr && + (err = unix_autobind(sock)) != 0) + goto out; + + timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); + + /* First of all allocate resources. + If we will make it after state is locked, + we will have to recheck all again in any case. + */ + + err = -ENOMEM; + + /* create new sock for complete connection */ + newsk = unix_create1(sock_net(sk), NULL); + if (newsk == NULL) + goto out; + + /* Allocate skb for sending to listening sock */ + skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); + if (skb == NULL) + goto out; + +restart: + /* Find listening sock. */ + other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err); + if (!other) + goto out; + + /* Latch state of peer */ + unix_state_lock(other); + + /* Apparently VFS overslept socket death. Retry. */ + if (sock_flag(other, SOCK_DEAD)) { + unix_state_unlock(other); + sock_put(other); + goto restart; + } + + err = -ECONNREFUSED; + if (other->sk_state != TCP_LISTEN) + goto out_unlock; + if (other->sk_shutdown & RCV_SHUTDOWN) + goto out_unlock; + + if (unix_recvq_full(other)) { + err = -EAGAIN; + if (!timeo) + goto out_unlock; + + timeo = unix_wait_for_peer(other, timeo); + + err = sock_intr_errno(timeo); + if (signal_pending(current)) + goto out; + sock_put(other); + goto restart; + } + + /* Latch our state. + + It is tricky place. We need to grab our state lock and cannot + drop lock on peer. It is dangerous because deadlock is + possible. Connect to self case and simultaneous + attempt to connect are eliminated by checking socket + state. other is TCP_LISTEN, if sk is TCP_LISTEN we + check this before attempt to grab lock. + + Well, and we have to recheck the state after socket locked. + */ + st = sk->sk_state; + + switch (st) { + case TCP_CLOSE: + /* This is ok... 
continue with connect */ + break; + case TCP_ESTABLISHED: + /* Socket is already connected */ + err = -EISCONN; + goto out_unlock; + default: + err = -EINVAL; + goto out_unlock; + } + + unix_state_lock_nested(sk); + + if (sk->sk_state != st) { + unix_state_unlock(sk); + unix_state_unlock(other); + sock_put(other); + goto restart; + } + + err = security_unix_stream_connect(sk, other, newsk); + if (err) { + unix_state_unlock(sk); + goto out_unlock; + } + + /* The way is open! Fastly set all the necessary fields... */ + + sock_hold(sk); + unix_peer(newsk) = sk; + newsk->sk_state = TCP_ESTABLISHED; + newsk->sk_type = sk->sk_type; + init_peercred(newsk); + newu = unix_sk(newsk); + RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); + otheru = unix_sk(other); + + /* copy address information from listening to new sock*/ + if (otheru->addr) { + atomic_inc(&otheru->addr->refcnt); + newu->addr = otheru->addr; + } + if (otheru->dentry) { + newu->dentry = dget(otheru->dentry); + newu->mnt = mntget(otheru->mnt); + } + + /* Set credentials */ + copy_peercred(sk, other); + + sock->state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; + sock_hold(newsk); + + smp_mb__after_atomic_inc(); /* sock_hold() does an atomic_inc() */ + unix_peer(sk) = newsk; + + unix_state_unlock(sk); + + /* take ten and and send info to listening sock */ + spin_lock(&other->sk_receive_queue.lock); + __skb_queue_tail(&other->sk_receive_queue, skb); + spin_unlock(&other->sk_receive_queue.lock); + unix_state_unlock(other); + other->sk_data_ready(other, 0); + sock_put(other); + return 0; + +out_unlock: + if (other) + unix_state_unlock(other); + +out: + kfree_skb(skb); + if (newsk) + unix_release_sock(newsk, 0); + if (other) + sock_put(other); + return err; +} + +static int unix_socketpair(struct socket *socka, struct socket *sockb) +{ + struct sock *ska = socka->sk, *skb = sockb->sk; + + /* Join our sockets back to back */ + sock_hold(ska); + sock_hold(skb); + unix_peer(ska) = skb; + unix_peer(skb) = ska; + init_peercred(ska); + init_peercred(skb); + + if (ska->sk_type != SOCK_DGRAM) { + ska->sk_state = TCP_ESTABLISHED; + skb->sk_state = TCP_ESTABLISHED; + socka->state = SS_CONNECTED; + sockb->state = SS_CONNECTED; + } + return 0; +} + +static int unix_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct sock *sk = sock->sk; + struct sock *tsk; + struct sk_buff *skb; + int err; + + err = -EOPNOTSUPP; + if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) + goto out; + + err = -EINVAL; + if (sk->sk_state != TCP_LISTEN) + goto out; + + /* If socket state is TCP_LISTEN it cannot change (for now...), + * so that no locks are necessary. + */ + + skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err); + if (!skb) { + /* This means receive shutdown. 
*/ + if (err == 0) + err = -EINVAL; + goto out; + } + + tsk = skb->sk; + skb_free_datagram(sk, skb); + wake_up_interruptible(&unix_sk(sk)->peer_wait); + + /* attach accepted sock to socket */ + unix_state_lock(tsk); + newsock->state = SS_CONNECTED; + sock_graft(tsk, newsock); + unix_state_unlock(tsk); + return 0; + +out: + return err; +} + + +static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer) +{ + struct sock *sk = sock->sk; + struct unix_sock *u; + DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr); + int err = 0; + + if (peer) { + sk = unix_peer_get(sk); + + err = -ENOTCONN; + if (!sk) + goto out; + err = 0; + } else { + sock_hold(sk); + } + + u = unix_sk(sk); + unix_state_lock(sk); + if (!u->addr) { + sunaddr->sun_family = AF_UNIX; + sunaddr->sun_path[0] = 0; + *uaddr_len = sizeof(short); + } else { + struct unix_address *addr = u->addr; + + *uaddr_len = addr->len; + memcpy(sunaddr, addr->name, *uaddr_len); + } + unix_state_unlock(sk); + sock_put(sk); +out: + return err; +} + +static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + int i; + + scm->fp = UNIXCB(skb).fp; + UNIXCB(skb).fp = NULL; + + for (i = scm->fp->count-1; i >= 0; i--) + unix_notinflight(scm->fp->fp[i]); +} + +static void unix_destruct_scm(struct sk_buff *skb) +{ + struct scm_cookie scm; + memset(&scm, 0, sizeof(scm)); + scm.pid = UNIXCB(skb).pid; + scm.cred = UNIXCB(skb).cred; + if (UNIXCB(skb).fp) + unix_detach_fds(&scm, skb); + + /* Alas, it calls VFS */ + /* So fscking what? fput() had been SMP-safe since the last Summer */ + scm_destroy(&scm); + sock_wfree(skb); +} + +#define MAX_RECURSION_LEVEL 4 + +static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + int i; + unsigned char max_level = 0; + int unix_sock_count = 0; + + for (i = scm->fp->count - 1; i >= 0; i--) { + struct sock *sk = unix_get_socket(scm->fp->fp[i]); + + if (sk) { + unix_sock_count++; + max_level = max(max_level, + unix_sk(sk)->recursion_level); + } + } + if (unlikely(max_level > MAX_RECURSION_LEVEL)) + return -ETOOMANYREFS; + + /* + * Need to duplicate file references for the sake of garbage + * collection. Otherwise a socket in the fps might become a + * candidate for GC while the skb is not yet queued. + */ + UNIXCB(skb).fp = scm_fp_dup(scm->fp); + if (!UNIXCB(skb).fp) + return -ENOMEM; + + if (unix_sock_count) { + for (i = scm->fp->count - 1; i >= 0; i--) + unix_inflight(scm->fp->fp[i]); + } + return max_level; +} + +static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) +{ + int err = 0; + UNIXCB(skb).pid = get_pid(scm->pid); + UNIXCB(skb).cred = get_cred(scm->cred); + UNIXCB(skb).fp = NULL; + if (scm->fp && send_fds) + err = unix_attach_fds(scm, skb); + + skb->destructor = unix_destruct_scm; + return err; +} + +/* + * Send AF_UNIX data. 
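+ *
+ * For SOCK_DGRAM the destination is looked up per message when
+ * msg_name is supplied, otherwise the connected peer is used.
+ * SOCK_SEQPACKET sends also end up here via unix_seqpacket_sendmsg().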
+ */ + +static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock_iocb *siocb = kiocb_to_siocb(kiocb); + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct unix_sock *u = unix_sk(sk); + struct sockaddr_un *sunaddr = msg->msg_name; + struct sock *other = NULL; + int namelen = 0; /* fake GCC */ + int err; + unsigned hash; + struct sk_buff *skb; + long timeo; + struct scm_cookie tmp_scm; + int max_level; + + if (NULL == siocb->scm) + siocb->scm = &tmp_scm; + wait_for_unix_gc(); + err = scm_send(sock, msg, siocb->scm); + if (err < 0) + return err; + + err = -EOPNOTSUPP; + if (msg->msg_flags&MSG_OOB) + goto out; + + if (msg->msg_namelen) { + err = unix_mkname(sunaddr, msg->msg_namelen, &hash); + if (err < 0) + goto out; + namelen = err; + } else { + sunaddr = NULL; + err = -ENOTCONN; + other = unix_peer_get(sk); + if (!other) + goto out; + } + + if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr + && (err = unix_autobind(sock)) != 0) + goto out; + + err = -EMSGSIZE; + if (len > sk->sk_sndbuf - 32) + goto out; + + skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err); + if (skb == NULL) + goto out; + + err = unix_scm_to_skb(siocb->scm, skb, true); + if (err < 0) + goto out_free; + max_level = err + 1; + unix_get_secdata(siocb->scm, skb); + + skb_reset_transport_header(skb); + err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); + if (err) + goto out_free; + + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + +restart: + if (!other) { + err = -ECONNRESET; + if (sunaddr == NULL) + goto out_free; + + other = unix_find_other(net, sunaddr, namelen, sk->sk_type, + hash, &err); + if (other == NULL) + goto out_free; + } + + if (sk_filter(other, skb) < 0) { + /* Toss the packet but do not return any error to the sender */ + err = len; + goto out_free; + } + + unix_state_lock(other); + err = -EPERM; + if (!unix_may_send(sk, other)) + goto out_unlock; + + if (sock_flag(other, SOCK_DEAD)) { + /* + * Check with 1003.1g - what should + * datagram error + */ + unix_state_unlock(other); + sock_put(other); + + err = 0; + unix_state_lock(sk); + if (unix_peer(sk) == other) { + unix_peer(sk) = NULL; + unix_state_unlock(sk); + + unix_dgram_disconnected(sk, other); + sock_put(other); + err = -ECONNREFUSED; + } else { + unix_state_unlock(sk); + } + + other = NULL; + if (err) + goto out_free; + goto restart; + } + + err = -EPIPE; + if (other->sk_shutdown & RCV_SHUTDOWN) + goto out_unlock; + + if (sk->sk_type != SOCK_SEQPACKET) { + err = security_unix_may_send(sk->sk_socket, other->sk_socket); + if (err) + goto out_unlock; + } + + if (unix_peer(other) != sk && unix_recvq_full(other)) { + if (!timeo) { + err = -EAGAIN; + goto out_unlock; + } + + timeo = unix_wait_for_peer(other, timeo); + + err = sock_intr_errno(timeo); + if (signal_pending(current)) + goto out_free; + + goto restart; + } + + if (sock_flag(other, SOCK_RCVTSTAMP)) + __net_timestamp(skb); + skb_queue_tail(&other->sk_receive_queue, skb); + if (max_level > unix_sk(other)->recursion_level) + unix_sk(other)->recursion_level = max_level; + unix_state_unlock(other); + other->sk_data_ready(other, len); + sock_put(other); + scm_destroy(siocb->scm); + return len; + +out_unlock: + unix_state_unlock(other); +out_free: + kfree_skb(skb); +out: + if (other) + sock_put(other); + scm_destroy(siocb->scm); + return err; +} + + +static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct 
sock_iocb *siocb = kiocb_to_siocb(kiocb); + struct sock *sk = sock->sk; + struct sock *other = NULL; + int err, size; + struct sk_buff *skb; + int sent = 0; + struct scm_cookie tmp_scm; + bool fds_sent = false; + int max_level; + + if (NULL == siocb->scm) + siocb->scm = &tmp_scm; + wait_for_unix_gc(); + err = scm_send(sock, msg, siocb->scm); + if (err < 0) + return err; + + err = -EOPNOTSUPP; + if (msg->msg_flags&MSG_OOB) + goto out_err; + + if (msg->msg_namelen) { + err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; + goto out_err; + } else { + err = -ENOTCONN; + other = unix_peer(sk); + if (!other) + goto out_err; + } + + if (sk->sk_shutdown & SEND_SHUTDOWN) + goto pipe_err; + + while (sent < len) { + /* + * Optimisation for the fact that under 0.01% of X + * messages typically need breaking up. + */ + + size = len-sent; + + /* Keep two messages in the pipe so it schedules better */ + if (size > ((sk->sk_sndbuf >> 1) - 64)) + size = (sk->sk_sndbuf >> 1) - 64; + + if (size > SKB_MAX_ALLOC) + size = SKB_MAX_ALLOC; + + /* + * Grab a buffer + */ + + skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT, + &err); + + if (skb == NULL) + goto out_err; + + /* + * If you pass two values to the sock_alloc_send_skb + * it tries to grab the large buffer with GFP_NOFS + * (which can fail easily), and if it fails grab the + * fallback size buffer which is under a page and will + * succeed. [Alan] + */ + size = min_t(int, size, skb_tailroom(skb)); + + + /* Only send the fds in the first buffer */ + err = unix_scm_to_skb(siocb->scm, skb, !fds_sent); + if (err < 0) { + kfree_skb(skb); + goto out_err; + } + max_level = err + 1; + fds_sent = true; + + err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); + if (err) { + kfree_skb(skb); + goto out_err; + } + + unix_state_lock(other); + + if (sock_flag(other, SOCK_DEAD) || + (other->sk_shutdown & RCV_SHUTDOWN)) + goto pipe_err_free; + + skb_queue_tail(&other->sk_receive_queue, skb); + if (max_level > unix_sk(other)->recursion_level) + unix_sk(other)->recursion_level = max_level; + unix_state_unlock(other); + other->sk_data_ready(other, size); + sent += size; + } + + scm_destroy(siocb->scm); + siocb->scm = NULL; + + return sent; + +pipe_err_free: + unix_state_unlock(other); + kfree_skb(skb); +pipe_err: + if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); + err = -EPIPE; +out_err: + scm_destroy(siocb->scm); + siocb->scm = NULL; + return sent ? 
: err; +} + +static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + int err; + struct sock *sk = sock->sk; + + err = sock_error(sk); + if (err) + return err; + + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + + if (msg->msg_namelen) + msg->msg_namelen = 0; + + return unix_dgram_sendmsg(kiocb, sock, msg, len); +} + +static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, + int flags) +{ + struct sock *sk = sock->sk; + + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + + return unix_dgram_recvmsg(iocb, sock, msg, size, flags); +} + +static void unix_copy_addr(struct msghdr *msg, struct sock *sk) +{ + struct unix_sock *u = unix_sk(sk); + + msg->msg_namelen = 0; + if (u->addr) { + msg->msg_namelen = u->addr->len; + memcpy(msg->msg_name, u->addr->name, u->addr->len); + } +} + +static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, + int flags) +{ + struct sock_iocb *siocb = kiocb_to_siocb(iocb); + struct scm_cookie tmp_scm; + struct sock *sk = sock->sk; + struct unix_sock *u = unix_sk(sk); + int noblock = flags & MSG_DONTWAIT; + struct sk_buff *skb; + int err; + + err = -EOPNOTSUPP; + if (flags&MSG_OOB) + goto out; + + msg->msg_namelen = 0; + + err = mutex_lock_interruptible(&u->readlock); + if (err) { + err = sock_intr_errno(sock_rcvtimeo(sk, noblock)); + goto out; + } + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) { + unix_state_lock(sk); + /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ + if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && + (sk->sk_shutdown & RCV_SHUTDOWN)) + err = 0; + unix_state_unlock(sk); + goto out_unlock; + } + + wake_up_interruptible_sync_poll(&u->peer_wait, + POLLOUT | POLLWRNORM | POLLWRBAND); + + if (msg->msg_name) + unix_copy_addr(msg, skb->sk); + + if (size > skb->len) + size = skb->len; + else if (size < skb->len) + msg->msg_flags |= MSG_TRUNC; + + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size); + if (err) + goto out_free; + + if (sock_flag(sk, SOCK_RCVTSTAMP)) + __sock_recv_timestamp(msg, sk, skb); + + if (!siocb->scm) { + siocb->scm = &tmp_scm; + memset(&tmp_scm, 0, sizeof(tmp_scm)); + } + scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred); + unix_set_secdata(siocb->scm, skb); + + if (!(flags & MSG_PEEK)) { + if (UNIXCB(skb).fp) + unix_detach_fds(siocb->scm, skb); + } else { + /* It is questionable: on PEEK we could: + - do not return fds - good, but too simple 8) + - return fds, and do not return them on read (old strategy, + apparently wrong) + - clone fds (I chose it for now, it is the most universal + solution) + + POSIX 1003.1g does not actually define this clearly + at all. POSIX 1003.1g doesn't define a lot of things + clearly however! + + */ + if (UNIXCB(skb).fp) + siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp); + } + err = size; + + scm_recv(sock, msg, siocb->scm, flags); + +out_free: + skb_free_datagram(sk, skb); +out_unlock: + mutex_unlock(&u->readlock); +out: + return err; +} + +/* + * Sleep until data has arrive. But check for races.. 
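+ * (unix_stream_data_wait() re-checks the receive queue, errors and
+ * shutdown state under unix_state_lock() after every wakeup.)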
+ */ + +static long unix_stream_data_wait(struct sock *sk, long timeo) +{ + DEFINE_WAIT(wait); + + unix_state_lock(sk); + + for (;;) { + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + + if (!skb_queue_empty(&sk->sk_receive_queue) || + sk->sk_err || + (sk->sk_shutdown & RCV_SHUTDOWN) || + signal_pending(current) || + !timeo) + break; + + set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + unix_state_unlock(sk); + timeo = schedule_timeout(timeo); + unix_state_lock(sk); + clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + } + + finish_wait(sk_sleep(sk), &wait); + unix_state_unlock(sk); + return timeo; +} + + + +static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, + int flags) +{ + struct sock_iocb *siocb = kiocb_to_siocb(iocb); + struct scm_cookie tmp_scm; + struct sock *sk = sock->sk; + struct unix_sock *u = unix_sk(sk); + struct sockaddr_un *sunaddr = msg->msg_name; + int copied = 0; + int check_creds = 0; + int target; + int err = 0; + long timeo; + + err = -EINVAL; + if (sk->sk_state != TCP_ESTABLISHED) + goto out; + + err = -EOPNOTSUPP; + if (flags&MSG_OOB) + goto out; + + target = sock_rcvlowat(sk, flags&MSG_WAITALL, size); + timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT); + + msg->msg_namelen = 0; + + /* Lock the socket to prevent queue disordering + * while sleeps in memcpy_tomsg + */ + + if (!siocb->scm) { + siocb->scm = &tmp_scm; + memset(&tmp_scm, 0, sizeof(tmp_scm)); + } + + err = mutex_lock_interruptible(&u->readlock); + if (err) { + err = sock_intr_errno(timeo); + goto out; + } + + do { + int chunk; + struct sk_buff *skb; + + unix_state_lock(sk); + skb = skb_dequeue(&sk->sk_receive_queue); + if (skb == NULL) { + unix_sk(sk)->recursion_level = 0; + if (copied >= target) + goto unlock; + + /* + * POSIX 1003.1g mandates this order. + */ + + err = sock_error(sk); + if (err) + goto unlock; + if (sk->sk_shutdown & RCV_SHUTDOWN) + goto unlock; + + unix_state_unlock(sk); + err = -EAGAIN; + if (!timeo) + break; + mutex_unlock(&u->readlock); + + timeo = unix_stream_data_wait(sk, timeo); + + if (signal_pending(current) + || mutex_lock_interruptible(&u->readlock)) { + err = sock_intr_errno(timeo); + goto out; + } + + continue; + unlock: + unix_state_unlock(sk); + break; + } + unix_state_unlock(sk); + + if (check_creds) { + /* Never glue messages from different writers */ + if ((UNIXCB(skb).pid != siocb->scm->pid) || + (UNIXCB(skb).cred != siocb->scm->cred)) { + skb_queue_head(&sk->sk_receive_queue, skb); + break; + } + } else { + /* Copy credentials */ + scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred); + check_creds = 1; + } + + /* Copy address just once */ + if (sunaddr) { + unix_copy_addr(msg, skb->sk); + sunaddr = NULL; + } + + chunk = min_t(unsigned int, skb->len, size); + if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) { + skb_queue_head(&sk->sk_receive_queue, skb); + if (copied == 0) + copied = -EFAULT; + break; + } + copied += chunk; + size -= chunk; + + /* Mark read part of skb as used */ + if (!(flags & MSG_PEEK)) { + skb_pull(skb, chunk); + + if (UNIXCB(skb).fp) + unix_detach_fds(siocb->scm, skb); + + /* put the skb back if we didn't use it up.. */ + if (skb->len) { + skb_queue_head(&sk->sk_receive_queue, skb); + break; + } + + consume_skb(skb); + + if (siocb->scm->fp) + break; + } else { + /* It is questionable, see note in unix_dgram_recvmsg. 
+ */ + if (UNIXCB(skb).fp) + siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp); + + /* put message back and return */ + skb_queue_head(&sk->sk_receive_queue, skb); + break; + } + } while (size); + + mutex_unlock(&u->readlock); + scm_recv(sock, msg, siocb->scm, flags); +out: + return copied ? : err; +} + +static int unix_shutdown(struct socket *sock, int mode) +{ + struct sock *sk = sock->sk; + struct sock *other; + + mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN); + + if (!mode) + return 0; + + unix_state_lock(sk); + sk->sk_shutdown |= mode; + other = unix_peer(sk); + if (other) + sock_hold(other); + unix_state_unlock(sk); + sk->sk_state_change(sk); + + if (other && + (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { + + int peer_mode = 0; + + if (mode&RCV_SHUTDOWN) + peer_mode |= SEND_SHUTDOWN; + if (mode&SEND_SHUTDOWN) + peer_mode |= RCV_SHUTDOWN; + unix_state_lock(other); + other->sk_shutdown |= peer_mode; + unix_state_unlock(other); + other->sk_state_change(other); + if (peer_mode == SHUTDOWN_MASK) + sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); + else if (peer_mode & RCV_SHUTDOWN) + sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); + } + if (other) + sock_put(other); + + return 0; +} + +static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + long amount = 0; + int err; + + switch (cmd) { + case SIOCOUTQ: + amount = sk_wmem_alloc_get(sk); + err = put_user(amount, (int __user *)arg); + break; + case SIOCINQ: + { + struct sk_buff *skb; + + if (sk->sk_state == TCP_LISTEN) { + err = -EINVAL; + break; + } + + spin_lock(&sk->sk_receive_queue.lock); + if (sk->sk_type == SOCK_STREAM || + sk->sk_type == SOCK_SEQPACKET) { + skb_queue_walk(&sk->sk_receive_queue, skb) + amount += skb->len; + } else { + skb = skb_peek(&sk->sk_receive_queue); + if (skb) + amount = skb->len; + } + spin_unlock(&sk->sk_receive_queue.lock); + err = put_user(amount, (int __user *)arg); + break; + } + + default: + err = -ENOIOCTLCMD; + break; + } + return err; +} + +static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait) +{ + struct sock *sk = sock->sk; + unsigned int mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; + + /* exceptional events? */ + if (sk->sk_err) + mask |= POLLERR; + if (sk->sk_shutdown == SHUTDOWN_MASK) + mask |= POLLHUP; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= POLLRDHUP | POLLIN | POLLRDNORM; + + /* readable? */ + if (!skb_queue_empty(&sk->sk_receive_queue)) + mask |= POLLIN | POLLRDNORM; + + /* Connection-based need to check for termination and startup */ + if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && + sk->sk_state == TCP_CLOSE) + mask |= POLLHUP; + + /* + * we set writable also when the other side has shut down the + * connection. This prevents stuck sockets. + */ + if (unix_writable(sk)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + + return mask; +} + +static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk, *other; + unsigned int mask, writable; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; + + /* exceptional events? */ + if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + mask |= POLLERR; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= POLLRDHUP | POLLIN | POLLRDNORM; + if (sk->sk_shutdown == SHUTDOWN_MASK) + mask |= POLLHUP; + + /* readable? 
*/ + if (!skb_queue_empty(&sk->sk_receive_queue)) + mask |= POLLIN | POLLRDNORM; + + /* Connection-based need to check for termination and startup */ + if (sk->sk_type == SOCK_SEQPACKET) { + if (sk->sk_state == TCP_CLOSE) + mask |= POLLHUP; + /* connection hasn't started yet? */ + if (sk->sk_state == TCP_SYN_SENT) + return mask; + } + + /* No write status requested, avoid expensive OUT tests. */ + if (wait && !(wait->key & (POLLWRBAND | POLLWRNORM | POLLOUT))) + return mask; + + writable = unix_writable(sk); + other = unix_peer_get(sk); + if (other) { + if (unix_peer(other) != sk) { + sock_poll_wait(file, &unix_sk(other)->peer_wait, wait); + if (unix_recvq_full(other)) + writable = 0; + } + sock_put(other); + } + + if (writable) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + else + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + return mask; +} + +#ifdef CONFIG_PROC_FS +static struct sock *first_unix_socket(int *i) +{ + for (*i = 0; *i <= UNIX_HASH_SIZE; (*i)++) { + if (!hlist_empty(&unix_socket_table[*i])) + return __sk_head(&unix_socket_table[*i]); + } + return NULL; +} + +static struct sock *next_unix_socket(int *i, struct sock *s) +{ + struct sock *next = sk_next(s); + /* More in this chain? */ + if (next) + return next; + /* Look for next non-empty chain. */ + for ((*i)++; *i <= UNIX_HASH_SIZE; (*i)++) { + if (!hlist_empty(&unix_socket_table[*i])) + return __sk_head(&unix_socket_table[*i]); + } + return NULL; +} + +struct unix_iter_state { + struct seq_net_private p; + int i; +}; + +static struct sock *unix_seq_idx(struct seq_file *seq, loff_t pos) +{ + struct unix_iter_state *iter = seq->private; + loff_t off = 0; + struct sock *s; + + for (s = first_unix_socket(&iter->i); s; s = next_unix_socket(&iter->i, s)) { + if (sock_net(s) != seq_file_net(seq)) + continue; + if (off == pos) + return s; + ++off; + } + return NULL; +} + +static void *unix_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(unix_table_lock) +{ + spin_lock(&unix_table_lock); + return *pos ? unix_seq_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct unix_iter_state *iter = seq->private; + struct sock *sk = v; + ++*pos; + + if (v == SEQ_START_TOKEN) + sk = first_unix_socket(&iter->i); + else + sk = next_unix_socket(&iter->i, sk); + while (sk && (sock_net(sk) != seq_file_net(seq))) + sk = next_unix_socket(&iter->i, sk); + return sk; +} + +static void unix_seq_stop(struct seq_file *seq, void *v) + __releases(unix_table_lock) +{ + spin_unlock(&unix_table_lock); +} + +static int unix_seq_show(struct seq_file *seq, void *v) +{ + + if (v == SEQ_START_TOKEN) + seq_puts(seq, "Num RefCount Protocol Flags Type St " + "Inode Path\n"); + else { + struct sock *s = v; + struct unix_sock *u = unix_sk(s); + unix_state_lock(s); + + seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", + s, + atomic_read(&s->sk_refcnt), + 0, + s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, + s->sk_type, + s->sk_socket ? + (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : + (s->sk_state == TCP_ESTABLISHED ? 
SS_CONNECTING : SS_DISCONNECTING), + sock_i_ino(s)); + + if (u->addr) { + int i, len; + seq_putc(seq, ' '); + + i = 0; + len = u->addr->len - sizeof(short); + if (!UNIX_ABSTRACT(s)) + len--; + else { + seq_putc(seq, '@'); + i++; + } + for ( ; i < len; i++) + seq_putc(seq, u->addr->name->sun_path[i]); + } + unix_state_unlock(s); + seq_putc(seq, '\n'); + } + + return 0; +} + +static const struct seq_operations unix_seq_ops = { + .start = unix_seq_start, + .next = unix_seq_next, + .stop = unix_seq_stop, + .show = unix_seq_show, +}; + +static int unix_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &unix_seq_ops, + sizeof(struct unix_iter_state)); +} + +static const struct file_operations unix_seq_fops = { + .owner = THIS_MODULE, + .open = unix_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +#endif + +static const struct net_proto_family unix_family_ops = { + .family = PF_UNIX, + .create = unix_create, + .owner = THIS_MODULE, +}; + + +static int __net_init unix_net_init(struct net *net) +{ + int error = -ENOMEM; + + net->unx.sysctl_max_dgram_qlen = 10; + if (unix_sysctl_register(net)) + goto out; + +#ifdef CONFIG_PROC_FS + if (!proc_net_fops_create(net, "unix", 0, &unix_seq_fops)) { + unix_sysctl_unregister(net); + goto out; + } +#endif + error = 0; +out: + return error; +} + +static void __net_exit unix_net_exit(struct net *net) +{ + unix_sysctl_unregister(net); + proc_net_remove(net, "unix"); +} + +static struct pernet_operations unix_net_ops = { + .init = unix_net_init, + .exit = unix_net_exit, +}; + +static int __init af_unix_init(void) +{ + int rc = -1; + struct sk_buff *dummy_skb; + + BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)); + + rc = proto_register(&unix_proto, 1); + if (rc != 0) { + printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n", + __func__); + goto out; + } + + sock_register(&unix_family_ops); + register_pernet_subsys(&unix_net_ops); +out: + return rc; +} + +static void __exit af_unix_exit(void) +{ + sock_unregister(PF_UNIX); + proto_unregister(&unix_proto); + unregister_pernet_subsys(&unix_net_ops); +} + +/* Earlier than device_initcall() so that other drivers invoking + request_module() don't end up in a loop when modprobe tries + to use a UNIX socket. But later than subsys_initcall() because + we depend on stuff initialised there */ +fs_initcall(af_unix_init); +module_exit(af_unix_exit); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_UNIX); diff --git a/net/unix/garbage.c b/net/unix/garbage.c new file mode 100644 index 00000000..b6f4b994 --- /dev/null +++ b/net/unix/garbage.c @@ -0,0 +1,386 @@ +/* + * NET3: Garbage Collector For AF_UNIX sockets + * + * Garbage Collector: + * Copyright (C) Barak A. Pearlmutter. + * Released under the GPL version 2 or later. + * + * Chopped about by Alan Cox 22/3/96 to make it fit the AF_UNIX socket problem. + * If it doesn't work blame me, it worked when Barak sent it. + * + * Assumptions: + * + * - object w/ a bit + * - free list + * + * Current optimizations: + * + * - explicit stack instead of recursion + * - tail recurse on first born instead of immediate push/pop + * - we gather the stuff that should not be killed into tree + * and stack is just a path from root to the current pointer. 
+ * + * Future optimizations: + * + * - don't just push entire root set; process in place + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Alan Cox 07 Sept 1997 Vmalloc internal stack as needed. + * Cope with changing max_files. + * Al Viro 11 Oct 1998 + * Graph may have cycles. That is, we can send the descriptor + * of foo to bar and vice versa. Current code chokes on that. + * Fix: move SCM_RIGHTS ones into the separate list and then + * skb_free() them all instead of doing explicit fput's. + * Another problem: since fput() may block somebody may + * create a new unix_socket when we are in the middle of sweep + * phase. Fix: revert the logic wrt MARKED. Mark everything + * upon the beginning and unmark non-junk ones. + * + * [12 Oct 1998] AAARGH! New code purges all SCM_RIGHTS + * sent to connect()'ed but still not accept()'ed sockets. + * Fixed. Old code had slightly different problem here: + * extra fput() in situation when we passed the descriptor via + * such socket and closed it (descriptor). That would happen on + * each unix_gc() until the accept(). Since the struct file in + * question would go to the free list and might be reused... + * That might be the reason of random oopses on filp_close() + * in unrelated processes. + * + * AV 28 Feb 1999 + * Kill the explicit allocation of stack. Now we keep the tree + * with root in dummy + pointer (gc_current) to one of the nodes. + * Stack is represented as path from gc_current to dummy. Unmark + * now means "add to tree". Push == "make it a son of gc_current". + * Pop == "move gc_current to parent". We keep only pointers to + * parents (->gc_tree). + * AV 1 Mar 1999 + * Damn. Added missing check for ->dead in listen queues scanning. + * + * Miklos Szeredi 25 Jun 2007 + * Reimplement with a cycle collecting algorithm. This should + * solve several problems with the previous code, like being racy + * wrt receive and holding up unrelated socket operations. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* Internal data structures and random procedures: */ + +static LIST_HEAD(gc_inflight_list); +static LIST_HEAD(gc_candidates); +static DEFINE_SPINLOCK(unix_gc_lock); +static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); + +unsigned int unix_tot_inflight; + + +struct sock *unix_get_socket(struct file *filp) +{ + struct sock *u_sock = NULL; + struct inode *inode = filp->f_path.dentry->d_inode; + + /* + * Socket ? + */ + if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { + struct socket *sock = SOCKET_I(inode); + struct sock *s = sock->sk; + + /* + * PF_UNIX ? + */ + if (s && sock->ops && sock->ops->family == PF_UNIX) + u_sock = s; + } + return u_sock; +} + +/* + * Keep the number of times in flight count for the file + * descriptor if it is for an AF_UNIX socket. 
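/*
 * Editor's note: illustrative userspace sketch, not part of this patch.
 * It shows the kind of SCM_RIGHTS reference cycle that the collector in
 * this file exists to reclaim: after the two sends below, each socket of
 * the pair is referenced only by an skb sitting in the other's receive
 * queue, so plain fput() can never free either file.  Error handling is
 * omitted for brevity.
 */
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>

static void send_fd(int via, int fd)
{
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} ctrl;
	char dummy = 'x';
	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = ctrl.buf, .msg_controllen = sizeof(ctrl.buf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
	sendmsg(via, &msg, 0);
}

int main(void)
{
	int sv[2];

	socketpair(AF_UNIX, SOCK_DGRAM, 0, sv);
	send_fd(sv[0], sv[0]);	/* ref to sv[0] now queued on sv[1] */
	send_fd(sv[1], sv[1]);	/* ref to sv[1] now queued on sv[0] */
	close(sv[0]);		/* each file survives: held only in flight */
	close(sv[1]);		/* only the cycle collector can free them */
	return 0;
}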
+ */ + +void unix_inflight(struct file *fp) +{ + struct sock *s = unix_get_socket(fp); + if (s) { + struct unix_sock *u = unix_sk(s); + spin_lock(&unix_gc_lock); + if (atomic_long_inc_return(&u->inflight) == 1) { + BUG_ON(!list_empty(&u->link)); + list_add_tail(&u->link, &gc_inflight_list); + } else { + BUG_ON(list_empty(&u->link)); + } + unix_tot_inflight++; + spin_unlock(&unix_gc_lock); + } +} + +void unix_notinflight(struct file *fp) +{ + struct sock *s = unix_get_socket(fp); + if (s) { + struct unix_sock *u = unix_sk(s); + spin_lock(&unix_gc_lock); + BUG_ON(list_empty(&u->link)); + if (atomic_long_dec_and_test(&u->inflight)) + list_del_init(&u->link); + unix_tot_inflight--; + spin_unlock(&unix_gc_lock); + } +} + +static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), + struct sk_buff_head *hitlist) +{ + struct sk_buff *skb; + struct sk_buff *next; + + spin_lock(&x->sk_receive_queue.lock); + skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { + /* + * Do we have file descriptors ? + */ + if (UNIXCB(skb).fp) { + bool hit = false; + /* + * Process the descriptors of this socket + */ + int nfd = UNIXCB(skb).fp->count; + struct file **fp = UNIXCB(skb).fp->fp; + while (nfd--) { + /* + * Get the socket the fd matches + * if it indeed does so + */ + struct sock *sk = unix_get_socket(*fp++); + if (sk) { + struct unix_sock *u = unix_sk(sk); + + /* + * Ignore non-candidates, they could + * have been added to the queues after + * starting the garbage collection + */ + if (u->gc_candidate) { + hit = true; + func(u); + } + } + } + if (hit && hitlist != NULL) { + __skb_unlink(skb, &x->sk_receive_queue); + __skb_queue_tail(hitlist, skb); + } + } + } + spin_unlock(&x->sk_receive_queue.lock); +} + +static void scan_children(struct sock *x, void (*func)(struct unix_sock *), + struct sk_buff_head *hitlist) +{ + if (x->sk_state != TCP_LISTEN) + scan_inflight(x, func, hitlist); + else { + struct sk_buff *skb; + struct sk_buff *next; + struct unix_sock *u; + LIST_HEAD(embryos); + + /* + * For a listening socket collect the queued embryos + * and perform a scan on them as well. + */ + spin_lock(&x->sk_receive_queue.lock); + skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { + u = unix_sk(skb->sk); + + /* + * An embryo cannot be in-flight, so it's safe + * to use the list link. + */ + BUG_ON(!list_empty(&u->link)); + list_add_tail(&u->link, &embryos); + } + spin_unlock(&x->sk_receive_queue.lock); + + while (!list_empty(&embryos)) { + u = list_entry(embryos.next, struct unix_sock, link); + scan_inflight(&u->sk, func, hitlist); + list_del_init(&u->link); + } + } +} + +static void dec_inflight(struct unix_sock *usk) +{ + atomic_long_dec(&usk->inflight); +} + +static void inc_inflight(struct unix_sock *usk) +{ + atomic_long_inc(&usk->inflight); +} + +static void inc_inflight_move_tail(struct unix_sock *u) +{ + atomic_long_inc(&u->inflight); + /* + * If this still might be part of a cycle, move it to the end + * of the list, so that it's checked even if it was already + * passed over + */ + if (u->gc_maybe_cycle) + list_move_tail(&u->link, &gc_candidates); +} + +static bool gc_in_progress = false; +#define UNIX_INFLIGHT_TRIGGER_GC 16000 + +void wait_for_unix_gc(void) +{ + /* + * If number of inflight sockets is insane, + * force a garbage collect right now. 
+ */ + if (unix_tot_inflight > UNIX_INFLIGHT_TRIGGER_GC && !gc_in_progress) + unix_gc(); + wait_event(unix_gc_wait, gc_in_progress == false); +} + +/* The external entry point: unix_gc() */ +void unix_gc(void) +{ + struct unix_sock *u; + struct unix_sock *next; + struct sk_buff_head hitlist; + struct list_head cursor; + LIST_HEAD(not_cycle_list); + + spin_lock(&unix_gc_lock); + + /* Avoid a recursive GC. */ + if (gc_in_progress) + goto out; + + gc_in_progress = true; + /* + * First, select candidates for garbage collection. Only + * in-flight sockets are considered, and from those only ones + * which don't have any external reference. + * + * Holding unix_gc_lock will protect these candidates from + * being detached, and hence from gaining an external + * reference. Since there are no possible receivers, all + * buffers currently on the candidates' queues stay there + * during the garbage collection. + * + * We also know that no new candidate can be added onto the + * receive queues. Other, non candidate sockets _can_ be + * added to queue, so we must make sure only to touch + * candidates. + */ + list_for_each_entry_safe(u, next, &gc_inflight_list, link) { + long total_refs; + long inflight_refs; + + total_refs = file_count(u->sk.sk_socket->file); + inflight_refs = atomic_long_read(&u->inflight); + + BUG_ON(inflight_refs < 1); + BUG_ON(total_refs < inflight_refs); + if (total_refs == inflight_refs) { + list_move_tail(&u->link, &gc_candidates); + u->gc_candidate = 1; + u->gc_maybe_cycle = 1; + } + } + + /* + * Now remove all internal in-flight reference to children of + * the candidates. + */ + list_for_each_entry(u, &gc_candidates, link) + scan_children(&u->sk, dec_inflight, NULL); + + /* + * Restore the references for children of all candidates, + * which have remaining references. Do this recursively, so + * only those remain, which form cyclic references. + * + * Use a "cursor" link, to make the list traversal safe, even + * though elements might be moved about. + */ + list_add(&cursor, &gc_candidates); + while (cursor.next != &gc_candidates) { + u = list_entry(cursor.next, struct unix_sock, link); + + /* Move cursor to after the current position. */ + list_move(&cursor, &u->link); + + if (atomic_long_read(&u->inflight) > 0) { + list_move_tail(&u->link, ¬_cycle_list); + u->gc_maybe_cycle = 0; + scan_children(&u->sk, inc_inflight_move_tail, NULL); + } + } + list_del(&cursor); + + /* + * not_cycle_list contains those sockets which do not make up a + * cycle. Restore these to the inflight list. + */ + while (!list_empty(¬_cycle_list)) { + u = list_entry(not_cycle_list.next, struct unix_sock, link); + u->gc_candidate = 0; + list_move_tail(&u->link, &gc_inflight_list); + } + + /* + * Now gc_candidates contains only garbage. Restore original + * inflight counters for these as well, and remove the skbuffs + * which are creating the cycle(s). + */ + skb_queue_head_init(&hitlist); + list_for_each_entry(u, &gc_candidates, link) + scan_children(&u->sk, inc_inflight, &hitlist); + + spin_unlock(&unix_gc_lock); + + /* Here we are. Hitlist is filled. Die. */ + __skb_queue_purge(&hitlist); + + spin_lock(&unix_gc_lock); + + /* All candidates should have been detached by now. 
*/ + BUG_ON(!list_empty(&gc_candidates)); + gc_in_progress = false; + wake_up(&unix_gc_wait); + + out: + spin_unlock(&unix_gc_lock); +} diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c new file mode 100644 index 00000000..397cffeb --- /dev/null +++ b/net/unix/sysctl_net_unix.c @@ -0,0 +1,63 @@ +/* + * NET4: Sysctl interface to net af_unix subsystem. + * + * Authors: Mike Shaver. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include + +static ctl_table unix_table[] = { + { + .procname = "max_dgram_qlen", + .data = &init_net.unx.sysctl_max_dgram_qlen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { } +}; + +static struct ctl_path unix_path[] = { + { .procname = "net", }, + { .procname = "unix", }, + { }, +}; + +int __net_init unix_sysctl_register(struct net *net) +{ + struct ctl_table *table; + + table = kmemdup(unix_table, sizeof(unix_table), GFP_KERNEL); + if (table == NULL) + goto err_alloc; + + table[0].data = &net->unx.sysctl_max_dgram_qlen; + net->unx.ctl = register_net_sysctl_table(net, unix_path, table); + if (net->unx.ctl == NULL) + goto err_reg; + + return 0; + +err_reg: + kfree(table); +err_alloc: + return -ENOMEM; +} + +void unix_sysctl_unregister(struct net *net) +{ + struct ctl_table *table; + + table = net->unx.ctl->ctl_table_arg; + unregister_sysctl_table(net->unx.ctl); + kfree(table); +} diff --git a/net/wanrouter/Kconfig b/net/wanrouter/Kconfig new file mode 100644 index 00000000..61ceae0b --- /dev/null +++ b/net/wanrouter/Kconfig @@ -0,0 +1,27 @@ +# +# Configuration for WAN router +# + +config WAN_ROUTER + tristate "WAN router" + depends on EXPERIMENTAL + ---help--- + Wide Area Networks (WANs), such as X.25, frame relay and leased + lines, are used to interconnect Local Area Networks (LANs) over vast + distances with data transfer rates significantly higher than those + achievable with commonly used asynchronous modem connections. + Usually, a quite expensive external device called a `WAN router' is + needed to connect to a WAN. + + As an alternative, WAN routing can be built into the Linux kernel. + With relatively inexpensive WAN interface cards available on the + market, a perfectly usable router can be built for less than half + the price of an external router. If you have one of those cards and + wish to use your Linux box as a WAN router, say Y here and also to + the WAN driver for your card, below. You will then need the + wan-tools package which is available from . + + To compile WAN routing support as a module, choose M here: the + module will be called wanrouter. + + If unsure, say N. diff --git a/net/wanrouter/Makefile b/net/wanrouter/Makefile new file mode 100644 index 00000000..4da14bc4 --- /dev/null +++ b/net/wanrouter/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux WAN router layer. 
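/*
 * Editor's note: small userspace sketch, not part of this patch.  It shows
 * where the table registered by sysctl_net_unix.c above ends up: each
 * network namespace gets its own /proc/sys/net/unix/max_dgram_qlen,
 * initialised to 10 by unix_net_init() earlier in this patch.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/unix/max_dgram_qlen", "r");
	int qlen;

	if (f && fscanf(f, "%d", &qlen) == 1)
		printf("max_dgram_qlen = %d\n", qlen);
	if (f)
		fclose(f);
	return 0;
}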
+# + +obj-$(CONFIG_WAN_ROUTER) += wanrouter.o + +wanrouter-y := wanproc.o wanmain.o diff --git a/net/wanrouter/patchlevel b/net/wanrouter/patchlevel new file mode 100644 index 00000000..c043eea7 --- /dev/null +++ b/net/wanrouter/patchlevel @@ -0,0 +1 @@ +2.2.1 diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c new file mode 100644 index 00000000..788a12c1 --- /dev/null +++ b/net/wanrouter/wanmain.c @@ -0,0 +1,787 @@ +/***************************************************************************** +* wanmain.c WAN Multiprotocol Router Module. Main code. +* +* This module is completely hardware-independent and provides +* the following common services for the WAN Link Drivers: +* o WAN device management (registering, unregistering) +* o Network interface management +* o Physical connection management (dial-up, incoming calls) +* o Logical connection management (switched virtual circuits) +* o Protocol encapsulation/decapsulation +* +* Author: Gideon Hack +* +* Copyright: (c) 1995-1999 Sangoma Technologies Inc. +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License +* as published by the Free Software Foundation; either version +* 2 of the License, or (at your option) any later version. +* ============================================================================ +* Nov 24, 2000 Nenad Corbic Updated for 2.4.X kernels +* Nov 07, 2000 Nenad Corbic Fixed the Mulit-Port PPP for kernels 2.2.16 and +* greater. +* Aug 2, 2000 Nenad Corbic Block the Multi-Port PPP from running on +* kernels 2.2.16 or greater. The SyncPPP +* has changed. +* Jul 13, 2000 Nenad Corbic Added SyncPPP support +* Added extra debugging in device_setup(). +* Oct 01, 1999 Gideon Hack Update for s514 PCI card +* Dec 27, 1996 Gene Kozin Initial version (based on Sangoma's WANPIPE) +* Jan 16, 1997 Gene Kozin router_devlist made public +* Jan 31, 1997 Alan Cox Hacked it about a bit for 2.1 +* Jun 27, 1997 Alan Cox realigned with vendor code +* Oct 15, 1997 Farhan Thawar changed wan_encapsulate to add a pad byte of 0 +* Apr 20, 1998 Alan Cox Fixed 2.1 symbols +* May 17, 1998 K. Baranowski Fixed SNAP encapsulation in wan_encapsulate +* Dec 15, 1998 Arnaldo Melo support for firmwares of up to 128000 bytes +* check wandev->setup return value +* Dec 22, 1998 Arnaldo Melo vmalloc/vfree used in device_setup to allocate +* kernel memory and copy configuration data to +* kernel space (for big firmwares) +* Jun 02, 1999 Gideon Hack Updates for Linux 2.0.X and 2.2.X kernels. +*****************************************************************************/ + +#include /* offsetof(), etc. */ +#include +#include /* return codes */ +#include +#include /* support for loadable modules */ +#include /* kmalloc(), kfree() */ +#include +#include +#include /* inline mem*, str* functions */ + +#include /* htons(), etc. */ +#include /* WAN router API definitions */ + +#include /* vmalloc, vfree */ +#include /* copy_to/from_user */ +#include /* __initfunc et al. 
*/ + +#define DEV_TO_SLAVE(dev) (*((struct net_device **)netdev_priv(dev))) + +/* + * Function Prototypes + */ + +/* + * WAN device IOCTL handlers + */ + +static DEFINE_MUTEX(wanrouter_mutex); +static int wanrouter_device_setup(struct wan_device *wandev, + wandev_conf_t __user *u_conf); +static int wanrouter_device_stat(struct wan_device *wandev, + wandev_stat_t __user *u_stat); +static int wanrouter_device_shutdown(struct wan_device *wandev); +static int wanrouter_device_new_if(struct wan_device *wandev, + wanif_conf_t __user *u_conf); +static int wanrouter_device_del_if(struct wan_device *wandev, + char __user *u_name); + +/* + * Miscellaneous + */ + +static struct wan_device *wanrouter_find_device(char *name); +static int wanrouter_delete_interface(struct wan_device *wandev, char *name); +static void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags) + __acquires(lock); +static void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags) + __releases(lock); + + + +/* + * Global Data + */ + +static char wanrouter_fullname[] = "Sangoma WANPIPE Router"; +static char wanrouter_copyright[] = "(c) 1995-2000 Sangoma Technologies Inc."; +static char wanrouter_modname[] = ROUTER_NAME; /* short module name */ +struct wan_device* wanrouter_router_devlist; /* list of registered devices */ + +/* + * Organize Unique Identifiers for encapsulation/decapsulation + */ + +#if 0 +static unsigned char wanrouter_oui_ether[] = { 0x00, 0x00, 0x00 }; +static unsigned char wanrouter_oui_802_2[] = { 0x00, 0x80, 0xC2 }; +#endif + +static int __init wanrouter_init(void) +{ + int err; + + printk(KERN_INFO "%s v%u.%u %s\n", + wanrouter_fullname, ROUTER_VERSION, ROUTER_RELEASE, + wanrouter_copyright); + + err = wanrouter_proc_init(); + if (err) + printk(KERN_INFO "%s: can't create entry in proc filesystem!\n", + wanrouter_modname); + + return err; +} + +static void __exit wanrouter_cleanup (void) +{ + wanrouter_proc_cleanup(); +} + +/* + * This is just plain dumb. We should move the bugger to drivers/net/wan, + * slap it first in directory and make it module_init(). The only reason + * for subsys_initcall() here is that net goes after drivers (why, BTW?) + */ +subsys_initcall(wanrouter_init); +module_exit(wanrouter_cleanup); + +/* + * Kernel APIs + */ + +/* + * Register WAN device. + * o verify device credentials + * o create an entry for the device in the /proc/net/router directory + * o initialize internally maintained fields of the wan_device structure + * o link device data space to a singly-linked list + * o if it's the first device, then start kernel 'thread' + * o increment module use count + * + * Return: + * 0 Ok + * < 0 error. + * + * Context: process + */ + + +int register_wan_device(struct wan_device *wandev) +{ + int err, namelen; + + if ((wandev == NULL) || (wandev->magic != ROUTER_MAGIC) || + (wandev->name == NULL)) + return -EINVAL; + + namelen = strlen(wandev->name); + if (!namelen || (namelen > WAN_DRVNAME_SZ)) + return -EINVAL; + + if (wanrouter_find_device(wandev->name)) + return -EEXIST; + +#ifdef WANDEBUG + printk(KERN_INFO "%s: registering WAN device %s\n", + wanrouter_modname, wandev->name); +#endif + + /* + * Register /proc directory entry + */ + err = wanrouter_proc_add(wandev); + if (err) { + printk(KERN_INFO + "%s: can't create /proc/net/router/%s entry!\n", + wanrouter_modname, wandev->name); + return err; + } + + /* + * Initialize fields of the wan_device structure maintained by the + * router and update local data. 
+ */ + + wandev->ndev = 0; + wandev->dev = NULL; + wandev->next = wanrouter_router_devlist; + wanrouter_router_devlist = wandev; + return 0; +} + +/* + * Unregister WAN device. + * o shut down device + * o unlink device data space from the linked list + * o delete device entry in the /proc/net/router directory + * o decrement module use count + * + * Return: 0 Ok + * <0 error. + * Context: process + */ + + +int unregister_wan_device(char *name) +{ + struct wan_device *wandev, *prev; + + if (name == NULL) + return -EINVAL; + + for (wandev = wanrouter_router_devlist, prev = NULL; + wandev && strcmp(wandev->name, name); + prev = wandev, wandev = wandev->next) + ; + if (wandev == NULL) + return -ENODEV; + +#ifdef WANDEBUG + printk(KERN_INFO "%s: unregistering WAN device %s\n", + wanrouter_modname, name); +#endif + + if (wandev->state != WAN_UNCONFIGURED) + wanrouter_device_shutdown(wandev); + + if (prev) + prev->next = wandev->next; + else + wanrouter_router_devlist = wandev->next; + + wanrouter_proc_delete(wandev); + return 0; +} + +#if 0 + +/* + * Encapsulate packet. + * + * Return: encapsulation header size + * < 0 - unsupported Ethertype + * + * Notes: + * 1. This function may be called on interrupt context. + */ + + +int wanrouter_encapsulate(struct sk_buff *skb, struct net_device *dev, + unsigned short type) +{ + int hdr_len = 0; + + switch (type) { + case ETH_P_IP: /* IP datagram encapsulation */ + hdr_len += 1; + skb_push(skb, 1); + skb->data[0] = NLPID_IP; + break; + + case ETH_P_IPX: /* SNAP encapsulation */ + case ETH_P_ARP: + hdr_len += 7; + skb_push(skb, 7); + skb->data[0] = 0; + skb->data[1] = NLPID_SNAP; + skb_copy_to_linear_data_offset(skb, 2, wanrouter_oui_ether, + sizeof(wanrouter_oui_ether)); + *((unsigned short*)&skb->data[5]) = htons(type); + break; + + default: /* Unknown packet type */ + printk(KERN_INFO + "%s: unsupported Ethertype 0x%04X on interface %s!\n", + wanrouter_modname, type, dev->name); + hdr_len = -EINVAL; + } + return hdr_len; +} + + +/* + * Decapsulate packet. + * + * Return: Ethertype (in network order) + * 0 unknown encapsulation + * + * Notes: + * 1. This function may be called on interrupt context. + */ + + +__be16 wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev) +{ + int cnt = skb->data[0] ? 0 : 1; /* there may be a pad present */ + __be16 ethertype; + + switch (skb->data[cnt]) { + case NLPID_IP: /* IP datagramm */ + ethertype = htons(ETH_P_IP); + cnt += 1; + break; + + case NLPID_SNAP: /* SNAP encapsulation */ + if (memcmp(&skb->data[cnt + 1], wanrouter_oui_ether, + sizeof(wanrouter_oui_ether))){ + printk(KERN_INFO + "%s: unsupported SNAP OUI %02X-%02X-%02X " + "on interface %s!\n", wanrouter_modname, + skb->data[cnt+1], skb->data[cnt+2], + skb->data[cnt+3], dev->name); + return 0; + } + ethertype = *((__be16*)&skb->data[cnt+4]); + cnt += 6; + break; + + /* add other protocols, e.g. CLNP, ESIS, ISIS, if needed */ + + default: + printk(KERN_INFO + "%s: unsupported NLPID 0x%02X on interface %s!\n", + wanrouter_modname, skb->data[cnt], dev->name); + return 0; + } + skb->protocol = ethertype; + skb->pkt_type = PACKET_HOST; /* Physically point to point */ + skb_pull(skb, cnt); + skb_reset_mac_header(skb); + return ethertype; +} + +#endif /* 0 */ + +/* + * WAN device IOCTL. 
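/*
 * Editor's note: hypothetical driver-side sketch, not part of this patch.
 * It illustrates the registration API defined above: a WAN card driver
 * fills in a struct wan_device (declared in the WAN router API header,
 * not shown here), points the callbacks at its own routines and hands the
 * structure to register_wan_device().  Field and callback names are taken
 * from the code above; everything prefixed "mydrv_" is invented.
 */
static int mydrv_setup(struct wan_device *wandev, wandev_conf_t *conf)
{
	/* program the hardware from *conf, then report success */
	wandev->state = WAN_DISCONNECTED;
	return 0;
}

static int mydrv_shutdown(struct wan_device *wandev)
{
	wandev->state = WAN_UNCONFIGURED;
	return 0;
}

static struct wan_device mydrv_wandev = {
	.magic    = ROUTER_MAGIC,	/* checked by register_wan_device() */
	.name     = "mydrv",		/* at most WAN_DRVNAME_SZ characters */
	.setup    = mydrv_setup,	/* reached via the ROUTER_SETUP ioctl */
	.shutdown = mydrv_shutdown,	/* reached via ROUTER_DOWN */
};

static int __init mydrv_init(void)
{
	/* also creates the /proc/net/router/mydrv entry */
	return register_wan_device(&mydrv_wandev);
}

static void __exit mydrv_exit(void)
{
	unregister_wan_device(mydrv_wandev.name);
}

module_init(mydrv_init);
module_exit(mydrv_exit);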
+ * o find WAN device associated with this node + * o execute requested action or pass command to the device driver + */ + +long wanrouter_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file->f_path.dentry->d_inode; + int err = 0; + struct proc_dir_entry *dent; + struct wan_device *wandev; + void __user *data = (void __user *)arg; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if ((cmd >> 8) != ROUTER_IOCTL) + return -EINVAL; + + dent = PDE(inode); + if ((dent == NULL) || (dent->data == NULL)) + return -EINVAL; + + wandev = dent->data; + if (wandev->magic != ROUTER_MAGIC) + return -EINVAL; + + mutex_lock(&wanrouter_mutex); + switch (cmd) { + case ROUTER_SETUP: + err = wanrouter_device_setup(wandev, data); + break; + + case ROUTER_DOWN: + err = wanrouter_device_shutdown(wandev); + break; + + case ROUTER_STAT: + err = wanrouter_device_stat(wandev, data); + break; + + case ROUTER_IFNEW: + err = wanrouter_device_new_if(wandev, data); + break; + + case ROUTER_IFDEL: + err = wanrouter_device_del_if(wandev, data); + break; + + case ROUTER_IFSTAT: + break; + + default: + if ((cmd >= ROUTER_USER) && + (cmd <= ROUTER_USER_MAX) && + wandev->ioctl) + err = wandev->ioctl(wandev, cmd, arg); + else err = -EINVAL; + } + mutex_unlock(&wanrouter_mutex); + return err; +} + +/* + * WAN Driver IOCTL Handlers + */ + +/* + * Setup WAN link device. + * o verify user address space + * o allocate kernel memory and copy configuration data to kernel space + * o if configuration data includes extension, copy it to kernel space too + * o call driver's setup() entry point + */ + +static int wanrouter_device_setup(struct wan_device *wandev, + wandev_conf_t __user *u_conf) +{ + void *data = NULL; + wandev_conf_t *conf; + int err = -EINVAL; + + if (wandev->setup == NULL) { /* Nothing to do ? */ + printk(KERN_INFO "%s: ERROR, No setup script: wandev->setup()\n", + wandev->name); + return 0; + } + + conf = kmalloc(sizeof(wandev_conf_t), GFP_KERNEL); + if (conf == NULL){ + printk(KERN_INFO "%s: ERROR, Failed to allocate kernel memory !\n", + wandev->name); + return -ENOBUFS; + } + + if (copy_from_user(conf, u_conf, sizeof(wandev_conf_t))) { + printk(KERN_INFO "%s: Failed to copy user config data to kernel space!\n", + wandev->name); + kfree(conf); + return -EFAULT; + } + + if (conf->magic != ROUTER_MAGIC) { + kfree(conf); + printk(KERN_INFO "%s: ERROR, Invalid MAGIC Number\n", + wandev->name); + return -EINVAL; + } + + if (conf->data_size && conf->data) { + if (conf->data_size > 128000) { + printk(KERN_INFO + "%s: ERROR, Invalid firmware data size %i !\n", + wandev->name, conf->data_size); + kfree(conf); + return -EINVAL; + } + + data = vmalloc(conf->data_size); + if (!data) { + printk(KERN_INFO + "%s: ERROR, Failed allocate kernel memory !\n", + wandev->name); + kfree(conf); + return -ENOBUFS; + } + if (!copy_from_user(data, conf->data, conf->data_size)) { + conf->data = data; + err = wandev->setup(wandev, conf); + } else { + printk(KERN_INFO + "%s: ERROR, Failed to copy from user data !\n", + wandev->name); + err = -EFAULT; + } + vfree(data); + } else { + printk(KERN_INFO + "%s: ERROR, No firmware found ! Firmware size = %i !\n", + wandev->name, conf->data_size); + } + + kfree(conf); + return err; +} + +/* + * Shutdown WAN device. 
+ * o delete all not opened logical channels for this device + * o call driver's shutdown() entry point + */ + +static int wanrouter_device_shutdown(struct wan_device *wandev) +{ + struct net_device *dev; + int err=0; + + if (wandev->state == WAN_UNCONFIGURED) + return 0; + + printk(KERN_INFO "\n%s: Shutting Down!\n",wandev->name); + + for (dev = wandev->dev; dev;) { + err = wanrouter_delete_interface(wandev, dev->name); + if (err) + return err; + /* The above function deallocates the current dev + * structure. Therefore, we cannot use netdev_priv(dev) + * as the next element: wandev->dev points to the + * next element */ + dev = wandev->dev; + } + + if (wandev->ndev) + return -EBUSY; /* there are opened interfaces */ + + if (wandev->shutdown) + err=wandev->shutdown(wandev); + + return err; +} + +/* + * Get WAN device status & statistics. + */ + +static int wanrouter_device_stat(struct wan_device *wandev, + wandev_stat_t __user *u_stat) +{ + wandev_stat_t stat; + + memset(&stat, 0, sizeof(stat)); + + /* Ask device driver to update device statistics */ + if ((wandev->state != WAN_UNCONFIGURED) && wandev->update) + wandev->update(wandev); + + /* Fill out structure */ + stat.ndev = wandev->ndev; + stat.state = wandev->state; + + if (copy_to_user(u_stat, &stat, sizeof(stat))) + return -EFAULT; + + return 0; +} + +/* + * Create new WAN interface. + * o verify user address space + * o copy configuration data to kernel address space + * o allocate network interface data space + * o call driver's new_if() entry point + * o make sure there is no interface name conflict + * o register network interface + */ + +static int wanrouter_device_new_if(struct wan_device *wandev, + wanif_conf_t __user *u_conf) +{ + wanif_conf_t *cnf; + struct net_device *dev = NULL; + int err; + + if ((wandev->state == WAN_UNCONFIGURED) || (wandev->new_if == NULL)) + return -ENODEV; + + cnf = kmalloc(sizeof(wanif_conf_t), GFP_KERNEL); + if (!cnf) + return -ENOBUFS; + + err = -EFAULT; + if (copy_from_user(cnf, u_conf, sizeof(wanif_conf_t))) + goto out; + + err = -EINVAL; + if (cnf->magic != ROUTER_MAGIC) + goto out; + + if (cnf->config_id == WANCONFIG_MPPP) { + printk(KERN_INFO "%s: Wanpipe Mulit-Port PPP support has not been compiled in!\n", + wandev->name); + err = -EPROTONOSUPPORT; + goto out; + } else { + err = wandev->new_if(wandev, dev, cnf); + } + + if (!err) { + /* Register network interface. This will invoke init() + * function supplied by the driver. If device registered + * successfully, add it to the interface list. + */ + + if (dev->name == NULL) { + err = -EINVAL; + } else { + + #ifdef WANDEBUG + printk(KERN_INFO "%s: registering interface %s...\n", + wanrouter_modname, dev->name); + #endif + + err = register_netdev(dev); + if (!err) { + struct net_device *slave = NULL; + unsigned long smp_flags=0; + + lock_adapter_irq(&wandev->lock, &smp_flags); + + if (wandev->dev == NULL) { + wandev->dev = dev; + } else { + for (slave=wandev->dev; + DEV_TO_SLAVE(slave); + slave = DEV_TO_SLAVE(slave)) + DEV_TO_SLAVE(slave) = dev; + } + ++wandev->ndev; + + unlock_adapter_irq(&wandev->lock, &smp_flags); + err = 0; /* done !!! */ + goto out; + } + } + if (wandev->del_if) + wandev->del_if(wandev, dev); + free_netdev(dev); + } + +out: + kfree(cnf); + return err; +} + + +/* + * Delete WAN logical channel. 
+ * o verify user address space + * o copy configuration data to kernel address space + */ + +static int wanrouter_device_del_if(struct wan_device *wandev, char __user *u_name) +{ + char name[WAN_IFNAME_SZ + 1]; + int err = 0; + + if (wandev->state == WAN_UNCONFIGURED) + return -ENODEV; + + memset(name, 0, sizeof(name)); + + if (copy_from_user(name, u_name, WAN_IFNAME_SZ)) + return -EFAULT; + + err = wanrouter_delete_interface(wandev, name); + if (err) + return err; + + /* If last interface being deleted, shutdown card + * This helps with administration at leaf nodes + * (You can tell if the person at the other end of the phone + * has an interface configured) and avoids DoS vulnerabilities + * in binary driver files - this fixes a problem with the current + * Sangoma driver going into strange states when all the network + * interfaces are deleted and the link irrecoverably disconnected. + */ + + if (!wandev->ndev && wandev->shutdown) + err = wandev->shutdown(wandev); + + return err; +} + +/* + * Miscellaneous Functions + */ + +/* + * Find WAN device by name. + * Return pointer to the WAN device data space or NULL if device not found. + */ + +static struct wan_device *wanrouter_find_device(char *name) +{ + struct wan_device *wandev; + + for (wandev = wanrouter_router_devlist; + wandev && strcmp(wandev->name, name); + wandev = wandev->next); + return wandev; +} + +/* + * Delete WAN logical channel identified by its name. + * o find logical channel by its name + * o call driver's del_if() entry point + * o unregister network interface + * o unlink channel data space from linked list of channels + * o release channel data space + * + * Return: 0 success + * -ENODEV channel not found. + * -EBUSY interface is open + * + * Note: If (force != 0), then device will be destroyed even if interface + * associated with it is open. It's caller's responsibility to make + * sure that opened interfaces are not removed! 
+ */ + +static int wanrouter_delete_interface(struct wan_device *wandev, char *name) +{ + struct net_device *dev = NULL, *prev = NULL; + unsigned long smp_flags=0; + + lock_adapter_irq(&wandev->lock, &smp_flags); + dev = wandev->dev; + prev = NULL; + while (dev && strcmp(name, dev->name)) { + struct net_device **slave = netdev_priv(dev); + prev = dev; + dev = *slave; + } + unlock_adapter_irq(&wandev->lock, &smp_flags); + + if (dev == NULL) + return -ENODEV; /* interface not found */ + + if (netif_running(dev)) + return -EBUSY; /* interface in use */ + + if (wandev->del_if) + wandev->del_if(wandev, dev); + + lock_adapter_irq(&wandev->lock, &smp_flags); + if (prev) { + struct net_device **prev_slave = netdev_priv(prev); + struct net_device **slave = netdev_priv(dev); + + *prev_slave = *slave; + } else { + struct net_device **slave = netdev_priv(dev); + wandev->dev = *slave; + } + --wandev->ndev; + unlock_adapter_irq(&wandev->lock, &smp_flags); + + printk(KERN_INFO "%s: unregistering '%s'\n", wandev->name, dev->name); + + unregister_netdev(dev); + + free_netdev(dev); + + return 0; +} + +static void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags) + __acquires(lock) +{ + spin_lock_irqsave(lock, *smp_flags); +} + + +static void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags) + __releases(lock) +{ + spin_unlock_irqrestore(lock, *smp_flags); +} + +EXPORT_SYMBOL(register_wan_device); +EXPORT_SYMBOL(unregister_wan_device); + +MODULE_LICENSE("GPL"); + +/* + * End + */ diff --git a/net/wanrouter/wanproc.c b/net/wanrouter/wanproc.c new file mode 100644 index 00000000..f3463953 --- /dev/null +++ b/net/wanrouter/wanproc.c @@ -0,0 +1,382 @@ +/***************************************************************************** +* wanproc.c WAN Router Module. /proc filesystem interface. +* +* This module is completely hardware-independent and provides +* access to the router using Linux /proc filesystem. +* +* Author: Gideon Hack +* +* Copyright: (c) 1995-1999 Sangoma Technologies Inc. +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License +* as published by the Free Software Foundation; either version +* 2 of the License, or (at your option) any later version. +* ============================================================================ +* Jun 02, 1999 Gideon Hack Updates for Linux 2.2.X kernels. +* Jun 29, 1997 Alan Cox Merged with 1.0.3 vendor code +* Jan 29, 1997 Gene Kozin v1.0.1. Implemented /proc read routines +* Jan 30, 1997 Alan Cox Hacked around for 2.1 +* Dec 13, 1996 Gene Kozin Initial version (based on Sangoma's WANPIPE) +*****************************************************************************/ + +#include /* __initfunc et al. */ +#include /* offsetof(), etc. */ +#include /* return codes */ +#include +#include +#include /* WAN router API definitions */ +#include +#include + +#include +#include + +#define PROC_STATS_FORMAT "%30s: %12lu\n" + +/****** Defines and Macros **************************************************/ + +#define PROT_DECODE(prot) ((prot == WANCONFIG_FR) ? " FR" :\ + (prot == WANCONFIG_X25) ? " X25" : \ + (prot == WANCONFIG_PPP) ? " PPP" : \ + (prot == WANCONFIG_CHDLC) ? " CHDLC": \ + (prot == WANCONFIG_MPPP) ? " MPPP" : \ + " Unknown" ) + +/****** Function Prototypes *************************************************/ + +#ifdef CONFIG_PROC_FS + +/* Miscellaneous */ + +/* + * Structures for interfacing with the /proc filesystem. 
+ * Router creates its own directory /proc/net/router with the following + * entries: + * config device configuration + * status global device statistics + * entry for each WAN device + */ + +/* + * Generic /proc/net/router/ file and inode operations + */ + +/* + * /proc/net/router + */ + +static DEFINE_MUTEX(config_mutex); +static struct proc_dir_entry *proc_router; + +/* Strings */ + +/* + * Interface functions + */ + +/****** Proc filesystem entry points ****************************************/ + +/* + * Iterator + */ +static void *r_start(struct seq_file *m, loff_t *pos) + __acquires(kernel_lock) +{ + struct wan_device *wandev; + loff_t l = *pos; + + mutex_lock(&config_mutex); + if (!l--) + return SEQ_START_TOKEN; + for (wandev = wanrouter_router_devlist; l-- && wandev; + wandev = wandev->next) + ; + return wandev; +} + +static void *r_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct wan_device *wandev = v; + (*pos)++; + return (v == SEQ_START_TOKEN) ? wanrouter_router_devlist : wandev->next; +} + +static void r_stop(struct seq_file *m, void *v) + __releases(kernel_lock) +{ + mutex_unlock(&config_mutex); +} + +static int config_show(struct seq_file *m, void *v) +{ + struct wan_device *p = v; + if (v == SEQ_START_TOKEN) { + seq_puts(m, "Device name | port |IRQ|DMA| mem.addr |" + "mem.size|option1|option2|option3|option4\n"); + return 0; + } + if (!p->state) + return 0; + seq_printf(m, "%-15s|0x%-4X|%3u|%3u| 0x%-8lX |0x%-6X|%7u|%7u|%7u|%7u\n", + p->name, p->ioport, p->irq, p->dma, p->maddr, p->msize, + p->hw_opt[0], p->hw_opt[1], p->hw_opt[2], p->hw_opt[3]); + return 0; +} + +static int status_show(struct seq_file *m, void *v) +{ + struct wan_device *p = v; + if (v == SEQ_START_TOKEN) { + seq_puts(m, "Device name |protocol|station|interface|" + "clocking|baud rate| MTU |ndev|link state\n"); + return 0; + } + if (!p->state) + return 0; + seq_printf(m, "%-15s|%-8s| %-7s| %-9s|%-8s|%9u|%5u|%3u |", + p->name, + PROT_DECODE(p->config_id), + p->config_id == WANCONFIG_FR ? + (p->station ? "Node" : "CPE") : + (p->config_id == WANCONFIG_X25 ? + (p->station ? "DCE" : "DTE") : + ("N/A")), + p->interface ? "V.35" : "RS-232", + p->clocking ? 
"internal" : "external", + p->bps, + p->mtu, + p->ndev); + + switch (p->state) { + case WAN_UNCONFIGURED: + seq_printf(m, "%-12s\n", "unconfigured"); + break; + case WAN_DISCONNECTED: + seq_printf(m, "%-12s\n", "disconnected"); + break; + case WAN_CONNECTING: + seq_printf(m, "%-12s\n", "connecting"); + break; + case WAN_CONNECTED: + seq_printf(m, "%-12s\n", "connected"); + break; + default: + seq_printf(m, "%-12s\n", "invalid"); + break; + } + return 0; +} + +static const struct seq_operations config_op = { + .start = r_start, + .next = r_next, + .stop = r_stop, + .show = config_show, +}; + +static const struct seq_operations status_op = { + .start = r_start, + .next = r_next, + .stop = r_stop, + .show = status_show, +}; + +static int config_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &config_op); +} + +static int status_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &status_op); +} + +static const struct file_operations config_fops = { + .owner = THIS_MODULE, + .open = config_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations status_fops = { + .owner = THIS_MODULE, + .open = status_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int wandev_show(struct seq_file *m, void *v) +{ + struct wan_device *wandev = m->private; + + if (wandev->magic != ROUTER_MAGIC) + return 0; + + if (!wandev->state) { + seq_puts(m, "device is not configured!\n"); + return 0; + } + + /* Update device statistics */ + if (wandev->update) { + int err = wandev->update(wandev); + if (err == -EAGAIN) { + seq_puts(m, "Device is busy!\n"); + return 0; + } + if (err) { + seq_puts(m, "Device is not configured!\n"); + return 0; + } + } + + seq_printf(m, PROC_STATS_FORMAT, + "total packets received", wandev->stats.rx_packets); + seq_printf(m, PROC_STATS_FORMAT, + "total packets transmitted", wandev->stats.tx_packets); + seq_printf(m, PROC_STATS_FORMAT, + "total bytes received", wandev->stats.rx_bytes); + seq_printf(m, PROC_STATS_FORMAT, + "total bytes transmitted", wandev->stats.tx_bytes); + seq_printf(m, PROC_STATS_FORMAT, + "bad packets received", wandev->stats.rx_errors); + seq_printf(m, PROC_STATS_FORMAT, + "packet transmit problems", wandev->stats.tx_errors); + seq_printf(m, PROC_STATS_FORMAT, + "received frames dropped", wandev->stats.rx_dropped); + seq_printf(m, PROC_STATS_FORMAT, + "transmit frames dropped", wandev->stats.tx_dropped); + seq_printf(m, PROC_STATS_FORMAT, + "multicast packets received", wandev->stats.multicast); + seq_printf(m, PROC_STATS_FORMAT, + "transmit collisions", wandev->stats.collisions); + seq_printf(m, PROC_STATS_FORMAT, + "receive length errors", wandev->stats.rx_length_errors); + seq_printf(m, PROC_STATS_FORMAT, + "receiver overrun errors", wandev->stats.rx_over_errors); + seq_printf(m, PROC_STATS_FORMAT, + "CRC errors", wandev->stats.rx_crc_errors); + seq_printf(m, PROC_STATS_FORMAT, + "frame format errors (aborts)", wandev->stats.rx_frame_errors); + seq_printf(m, PROC_STATS_FORMAT, + "receiver fifo overrun", wandev->stats.rx_fifo_errors); + seq_printf(m, PROC_STATS_FORMAT, + "receiver missed packet", wandev->stats.rx_missed_errors); + seq_printf(m, PROC_STATS_FORMAT, + "aborted frames transmitted", wandev->stats.tx_aborted_errors); + return 0; +} + +static int wandev_open(struct inode *inode, struct file *file) +{ + return single_open(file, wandev_show, PDE(inode)->data); +} + +static const struct file_operations wandev_fops = { + 
.owner = THIS_MODULE, + .open = wandev_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .unlocked_ioctl = wanrouter_ioctl, +}; + +/* + * Initialize router proc interface. + */ + +int __init wanrouter_proc_init(void) +{ + struct proc_dir_entry *p; + proc_router = proc_mkdir(ROUTER_NAME, init_net.proc_net); + if (!proc_router) + goto fail; + + p = proc_create("config", S_IRUGO, proc_router, &config_fops); + if (!p) + goto fail_config; + p = proc_create("status", S_IRUGO, proc_router, &status_fops); + if (!p) + goto fail_stat; + return 0; +fail_stat: + remove_proc_entry("config", proc_router); +fail_config: + remove_proc_entry(ROUTER_NAME, init_net.proc_net); +fail: + return -ENOMEM; +} + +/* + * Clean up router proc interface. + */ + +void wanrouter_proc_cleanup(void) +{ + remove_proc_entry("config", proc_router); + remove_proc_entry("status", proc_router); + remove_proc_entry(ROUTER_NAME, init_net.proc_net); +} + +/* + * Add directory entry for WAN device. + */ + +int wanrouter_proc_add(struct wan_device* wandev) +{ + if (wandev->magic != ROUTER_MAGIC) + return -EINVAL; + + wandev->dent = proc_create(wandev->name, S_IRUGO, + proc_router, &wandev_fops); + if (!wandev->dent) + return -ENOMEM; + wandev->dent->data = wandev; + return 0; +} + +/* + * Delete directory entry for WAN device. + */ +int wanrouter_proc_delete(struct wan_device* wandev) +{ + if (wandev->magic != ROUTER_MAGIC) + return -EINVAL; + remove_proc_entry(wandev->name, proc_router); + return 0; +} + +#else + +/* + * No /proc - output stubs + */ + +int __init wanrouter_proc_init(void) +{ + return 0; +} + +void wanrouter_proc_cleanup(void) +{ +} + +int wanrouter_proc_add(struct wan_device *wandev) +{ + return 0; +} + +int wanrouter_proc_delete(struct wan_device *wandev) +{ + return 0; +} + +#endif + +/* + * End + */ + diff --git a/net/wimax/Kconfig b/net/wimax/Kconfig new file mode 100644 index 00000000..e4d97ab4 --- /dev/null +++ b/net/wimax/Kconfig @@ -0,0 +1,39 @@ +# +# WiMAX LAN device configuration +# + +menuconfig WIMAX + tristate "WiMAX Wireless Broadband support" + depends on RFKILL || !RFKILL + help + + Select to configure support for devices that provide + wireless broadband connectivity using the WiMAX protocol + (IEEE 802.16). + + Please note that most of these devices require signing up + for a service plan with a provider. + + The different WiMAX drivers can be enabled in the menu entry + + Device Drivers > Network device support > WiMAX Wireless + Broadband devices + + If unsure, it is safe to select M (module). + +config WIMAX_DEBUG_LEVEL + int "WiMAX debug level" + depends on WIMAX + default 8 + help + + Select the maximum debug verbosity level to be compiled into + the WiMAX stack code. + + By default, debug messages are disabled at runtime and can + be selectively enabled for different parts of the code using + the sysfs debug-levels file. + + If set at zero, this will compile out all the debug code. + + It is recommended that it is left at 8. 
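/*
 * Editor's note: userspace sketch, not part of this patch.  The wandev_fops
 * shown above attaches wanrouter_ioctl() to each /proc/net/router/<device>
 * entry, so a management tool running with CAP_NET_ADMIN configures and
 * queries a card by issuing the ROUTER_* ioctls on that file.  The device
 * name "mydrv" and the userspace header path are illustrative assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/wanrouter.h>	/* ROUTER_STAT, ROUTER_DOWN, wandev_stat_t */

int main(void)
{
	wandev_stat_t stat;
	int fd = open("/proc/net/router/mydrv", O_RDONLY);

	if (fd < 0)
		return 1;
	/* handled by wanrouter_device_stat() above */
	if (ioctl(fd, ROUTER_STAT, &stat) == 0)
		printf("state=%u ndev=%u\n",
		       (unsigned)stat.state, (unsigned)stat.ndev);
	/* shut the card down via wanrouter_device_shutdown() */
	ioctl(fd, ROUTER_DOWN, 0);
	close(fd);
	return 0;
}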
diff --git a/net/wimax/Makefile b/net/wimax/Makefile new file mode 100644 index 00000000..8f1510d0 --- /dev/null +++ b/net/wimax/Makefile @@ -0,0 +1,14 @@ + +obj-$(CONFIG_WIMAX) += wimax.o + +wimax-y := \ + id-table.o \ + op-msg.o \ + op-reset.o \ + op-rfkill.o \ + op-state-get.o \ + stack.o + +wimax-$(CONFIG_DEBUG_FS) += debugfs.o + + diff --git a/net/wimax/debug-levels.h b/net/wimax/debug-levels.h new file mode 100644 index 00000000..0975adba --- /dev/null +++ b/net/wimax/debug-levels.h @@ -0,0 +1,43 @@ +/* + * Linux WiMAX Stack + * Debug levels control file for the wimax module + * + * + * Copyright (C) 2007-2008 Intel Corporation + * Inaky Perez-Gonzalez + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +#ifndef __debug_levels__h__ +#define __debug_levels__h__ + +/* Maximum compile and run time debug level for all submodules */ +#define D_MODULENAME wimax +#define D_MASTER CONFIG_WIMAX_DEBUG_LEVEL + +#include + +/* List of all the enabled modules */ +enum d_module { + D_SUBMODULE_DECLARE(debugfs), + D_SUBMODULE_DECLARE(id_table), + D_SUBMODULE_DECLARE(op_msg), + D_SUBMODULE_DECLARE(op_reset), + D_SUBMODULE_DECLARE(op_rfkill), + D_SUBMODULE_DECLARE(op_state_get), + D_SUBMODULE_DECLARE(stack), +}; + +#endif /* #ifndef __debug_levels__h__ */ diff --git a/net/wimax/debugfs.c b/net/wimax/debugfs.c new file mode 100644 index 00000000..6c9bedb7 --- /dev/null +++ b/net/wimax/debugfs.c @@ -0,0 +1,80 @@ +/* + * Linux WiMAX + * Debugfs support + * + * + * Copyright (C) 2005-2006 Intel Corporation + * Inaky Perez-Gonzalez + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ +#include +#include +#include "wimax-internal.h" + +#define D_SUBMODULE debugfs +#include "debug-levels.h" + + +#define __debugfs_register(prefix, name, parent) \ +do { \ + result = d_level_register_debugfs(prefix, name, parent); \ + if (result < 0) \ + goto error; \ +} while (0) + + +int wimax_debugfs_add(struct wimax_dev *wimax_dev) +{ + int result; + struct net_device *net_dev = wimax_dev->net_dev; + struct device *dev = net_dev->dev.parent; + struct dentry *dentry; + char buf[128]; + + snprintf(buf, sizeof(buf), "wimax:%s", net_dev->name); + dentry = debugfs_create_dir(buf, NULL); + result = PTR_ERR(dentry); + if (IS_ERR(dentry)) { + if (result == -ENODEV) + result = 0; /* No debugfs support */ + else + dev_err(dev, "Can't create debugfs dentry: %d\n", + result); + goto out; + } + wimax_dev->debugfs_dentry = dentry; + __debugfs_register("wimax_dl_", debugfs, dentry); + __debugfs_register("wimax_dl_", id_table, dentry); + __debugfs_register("wimax_dl_", op_msg, dentry); + __debugfs_register("wimax_dl_", op_reset, dentry); + __debugfs_register("wimax_dl_", op_rfkill, dentry); + __debugfs_register("wimax_dl_", op_state_get, dentry); + __debugfs_register("wimax_dl_", stack, dentry); + result = 0; +out: + return result; + +error: + debugfs_remove_recursive(wimax_dev->debugfs_dentry); + return result; +} + +void wimax_debugfs_rm(struct wimax_dev *wimax_dev) +{ + debugfs_remove_recursive(wimax_dev->debugfs_dentry); +} + + diff --git a/net/wimax/id-table.c b/net/wimax/id-table.c new file mode 100644 index 00000000..72273abf --- /dev/null +++ b/net/wimax/id-table.c @@ -0,0 +1,145 @@ +/* + * Linux WiMAX + * Mappping of generic netlink family IDs to net devices + * + * + * Copyright (C) 2005-2006 Intel Corporation + * Inaky Perez-Gonzalez + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * + * We assign a single generic netlink family ID to each device (to + * simplify lookup). + * + * We need a way to map family ID to a wimax_dev pointer. + * + * The idea is to use a very simple lookup. Using a netlink attribute + * with (for example) the interface name implies a heavier search over + * all the network devices; seemed kind of a waste given that we know + * we are looking for a WiMAX device and that most systems will have + * just a single WiMAX adapter. + * + * We put all the WiMAX devices in the system in a linked list and + * match the generic link family ID against the list. + * + * By using a linked list, the case of a single adapter in the system + * becomes (almost) no overhead, while still working for many more. If + * it ever goes beyond two, I'll be surprised. 
+ */ +#include +#include +#include +#include +#include +#include "wimax-internal.h" + + +#define D_SUBMODULE id_table +#include "debug-levels.h" + + +static DEFINE_SPINLOCK(wimax_id_table_lock); +static struct list_head wimax_id_table = LIST_HEAD_INIT(wimax_id_table); + + +/* + * wimax_id_table_add - add a gennetlink familiy ID / wimax_dev mapping + * + * @wimax_dev: WiMAX device descriptor to associate to the Generic + * Netlink family ID. + * + * Look for an empty spot in the ID table; if none found, double the + * table's size and get the first spot. + */ +void wimax_id_table_add(struct wimax_dev *wimax_dev) +{ + d_fnstart(3, NULL, "(wimax_dev %p)\n", wimax_dev); + spin_lock(&wimax_id_table_lock); + list_add(&wimax_dev->id_table_node, &wimax_id_table); + spin_unlock(&wimax_id_table_lock); + d_fnend(3, NULL, "(wimax_dev %p)\n", wimax_dev); +} + + +/* + * wimax_get_netdev_by_info - lookup a wimax_dev from the gennetlink info + * + * The generic netlink family ID has been filled out in the + * nlmsghdr->nlmsg_type field, so we pull it from there, look it up in + * the mapping table and reference the wimax_dev. + * + * When done, the reference should be dropped with + * 'dev_put(wimax_dev->net_dev)'. + */ +struct wimax_dev *wimax_dev_get_by_genl_info( + struct genl_info *info, int ifindex) +{ + struct wimax_dev *wimax_dev = NULL; + + d_fnstart(3, NULL, "(info %p ifindex %d)\n", info, ifindex); + spin_lock(&wimax_id_table_lock); + list_for_each_entry(wimax_dev, &wimax_id_table, id_table_node) { + if (wimax_dev->net_dev->ifindex == ifindex) { + dev_hold(wimax_dev->net_dev); + goto found; + } + } + wimax_dev = NULL; + d_printf(1, NULL, "wimax: no devices found with ifindex %d\n", + ifindex); +found: + spin_unlock(&wimax_id_table_lock); + d_fnend(3, NULL, "(info %p ifindex %d) = %p\n", + info, ifindex, wimax_dev); + return wimax_dev; +} + + +/* + * wimax_id_table_rm - Remove a gennetlink familiy ID / wimax_dev mapping + * + * @id: family ID to remove from the table + */ +void wimax_id_table_rm(struct wimax_dev *wimax_dev) +{ + spin_lock(&wimax_id_table_lock); + list_del_init(&wimax_dev->id_table_node); + spin_unlock(&wimax_id_table_lock); +} + + +/* + * Release the gennetlink family id / mapping table + * + * On debug, verify that the table is empty upon removal. We want the + * code always compiled, to ensure it doesn't bit rot. It will be + * compiled out if CONFIG_BUG is disabled. + */ +void wimax_id_table_release(void) +{ + struct wimax_dev *wimax_dev; + +#ifndef CONFIG_BUG + return; +#endif + spin_lock(&wimax_id_table_lock); + list_for_each_entry(wimax_dev, &wimax_id_table, id_table_node) { + printk(KERN_ERR "BUG: %s wimax_dev %p ifindex %d not cleared\n", + __func__, wimax_dev, wimax_dev->net_dev->ifindex); + WARN_ON(1); + } + spin_unlock(&wimax_id_table_lock); +} diff --git a/net/wimax/op-msg.c b/net/wimax/op-msg.c new file mode 100644 index 00000000..d5b7c377 --- /dev/null +++ b/net/wimax/op-msg.c @@ -0,0 +1,432 @@ +/* + * Linux WiMAX + * Generic messaging interface between userspace and driver/device + * + * + * Copyright (C) 2007-2008 Intel Corporation + * Inaky Perez-Gonzalez + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * + * This implements a direct communication channel between user space and + * the driver/device, by which free form messages can be sent back and + * forth. + * + * This is intended for device-specific features, vendor quirks, etc. + * + * See include/net/wimax.h + * + * GENERIC NETLINK ENCODING AND CAPACITY + * + * A destination "pipe name" is added to each message; it is up to the + * drivers to assign or use those names (if using them at all). + * + * Messages are encoded as a binary netlink attribute using nla_put() + * using type NLA_UNSPEC (as some versions of libnl still in + * deployment don't yet understand NLA_BINARY). + * + * The maximum capacity of this transport is PAGESIZE per message (so + * the actual payload will be bit smaller depending on the + * netlink/generic netlink attributes and headers). + * + * RECEPTION OF MESSAGES + * + * When a message is received from user space, it is passed verbatim + * to the driver calling wimax_dev->op_msg_from_user(). The return + * value from this function is passed back to user space as an ack + * over the generic netlink protocol. + * + * The stack doesn't do any processing or interpretation of these + * messages. + * + * SENDING MESSAGES + * + * Messages can be sent with wimax_msg(). + * + * If the message delivery needs to happen on a different context to + * that of its creation, wimax_msg_alloc() can be used to get a + * pointer to the message that can be delivered later on with + * wimax_msg_send(). + * + * ROADMAP + * + * wimax_gnl_doit_msg_from_user() Process a message from user space + * wimax_dev_get_by_genl_info() + * wimax_dev->op_msg_from_user() Delivery of message to the driver + * + * wimax_msg() Send a message to user space + * wimax_msg_alloc() + * wimax_msg_send() + */ +#include +#include +#include +#include +#include +#include +#include "wimax-internal.h" + + +#define D_SUBMODULE op_msg +#include "debug-levels.h" + + +/** + * wimax_msg_alloc - Create a new skb for sending a message to userspace + * + * @wimax_dev: WiMAX device descriptor + * @pipe_name: "named pipe" the message will be sent to + * @msg: pointer to the message data to send + * @size: size of the message to send (in bytes), including the header. + * @gfp_flags: flags for memory allocation. + * + * Returns: %0 if ok, negative errno code on error + * + * Description: + * + * Allocates an skb that will contain the message to send to user + * space over the messaging pipe and initializes it, copying the + * payload. + * + * Once this call is done, you can deliver it with + * wimax_msg_send(). + * + * IMPORTANT: + * + * Don't use skb_push()/skb_pull()/skb_reserve() on the skb, as + * wimax_msg_send() depends on skb->data being placed at the + * beginning of the user message. + * + * Unlike other WiMAX stack calls, this call can be used way early, + * even before wimax_dev_add() is called, as long as the + * wimax_dev->net_dev pointer is set to point to a proper + * net_dev. This is so that drivers can use it early in case they need + * to send stuff around or communicate with user space. 
+ */ +struct sk_buff *wimax_msg_alloc(struct wimax_dev *wimax_dev, + const char *pipe_name, + const void *msg, size_t size, + gfp_t gfp_flags) +{ + int result; + struct device *dev = wimax_dev_to_dev(wimax_dev); + size_t msg_size; + void *genl_msg; + struct sk_buff *skb; + + msg_size = nla_total_size(size) + + nla_total_size(sizeof(u32)) + + (pipe_name ? nla_total_size(strlen(pipe_name)) : 0); + result = -ENOMEM; + skb = genlmsg_new(msg_size, gfp_flags); + if (skb == NULL) + goto error_new; + genl_msg = genlmsg_put(skb, 0, 0, &wimax_gnl_family, + 0, WIMAX_GNL_OP_MSG_TO_USER); + if (genl_msg == NULL) { + dev_err(dev, "no memory to create generic netlink message\n"); + goto error_genlmsg_put; + } + result = nla_put_u32(skb, WIMAX_GNL_MSG_IFIDX, + wimax_dev->net_dev->ifindex); + if (result < 0) { + dev_err(dev, "no memory to add ifindex attribute\n"); + goto error_nla_put; + } + if (pipe_name) { + result = nla_put_string(skb, WIMAX_GNL_MSG_PIPE_NAME, + pipe_name); + if (result < 0) { + dev_err(dev, "no memory to add pipe_name attribute\n"); + goto error_nla_put; + } + } + result = nla_put(skb, WIMAX_GNL_MSG_DATA, size, msg); + if (result < 0) { + dev_err(dev, "no memory to add payload (msg %p size %zu) in " + "attribute: %d\n", msg, size, result); + goto error_nla_put; + } + genlmsg_end(skb, genl_msg); + return skb; + +error_nla_put: +error_genlmsg_put: +error_new: + nlmsg_free(skb); + return ERR_PTR(result); +} +EXPORT_SYMBOL_GPL(wimax_msg_alloc); + + +/** + * wimax_msg_data_len - Return a pointer and size of a message's payload + * + * @msg: Pointer to a message created with wimax_msg_alloc() + * @size: Pointer to where to store the message's size + * + * Returns the pointer to the message data. + */ +const void *wimax_msg_data_len(struct sk_buff *msg, size_t *size) +{ + struct nlmsghdr *nlh = (void *) msg->head; + struct nlattr *nla; + + nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr), + WIMAX_GNL_MSG_DATA); + if (nla == NULL) { + printk(KERN_ERR "Cannot find attribute WIMAX_GNL_MSG_DATA\n"); + return NULL; + } + *size = nla_len(nla); + return nla_data(nla); +} +EXPORT_SYMBOL_GPL(wimax_msg_data_len); + + +/** + * wimax_msg_data - Return a pointer to a message's payload + * + * @msg: Pointer to a message created with wimax_msg_alloc() + */ +const void *wimax_msg_data(struct sk_buff *msg) +{ + struct nlmsghdr *nlh = (void *) msg->head; + struct nlattr *nla; + + nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr), + WIMAX_GNL_MSG_DATA); + if (nla == NULL) { + printk(KERN_ERR "Cannot find attribute WIMAX_GNL_MSG_DATA\n"); + return NULL; + } + return nla_data(nla); +} +EXPORT_SYMBOL_GPL(wimax_msg_data); + + +/** + * wimax_msg_len - Return a message's payload length + * + * @msg: Pointer to a message created with wimax_msg_alloc() + */ +ssize_t wimax_msg_len(struct sk_buff *msg) +{ + struct nlmsghdr *nlh = (void *) msg->head; + struct nlattr *nla; + + nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr), + WIMAX_GNL_MSG_DATA); + if (nla == NULL) { + printk(KERN_ERR "Cannot find attribute WIMAX_GNL_MSG_DATA\n"); + return -EINVAL; + } + return nla_len(nla); +} +EXPORT_SYMBOL_GPL(wimax_msg_len); + + +/** + * wimax_msg_send - Send a pre-allocated message to user space + * + * @wimax_dev: WiMAX device descriptor + * + * @skb: &struct sk_buff returned by wimax_msg_alloc(). Note the + * ownership of @skb is transferred to this function. 
+ * + * Returns: 0 if ok, < 0 errno code on error + * + * Description: + * + * Sends a free-form message that was preallocated with + * wimax_msg_alloc() and filled up. + * + * Assumes that once you pass an skb to this function for sending, it + * owns it and will release it when done (on success). + * + * IMPORTANT: + * + * Don't use skb_push()/skb_pull()/skb_reserve() on the skb, as + * wimax_msg_send() depends on skb->data being placed at the + * beginning of the user message. + * + * Unlike other WiMAX stack calls, this call can be used way early, + * even before wimax_dev_add() is called, as long as the + * wimax_dev->net_dev pointer is set to point to a proper + * net_dev. This is so that drivers can use it early in case they need + * to send stuff around or communicate with user space. + */ +int wimax_msg_send(struct wimax_dev *wimax_dev, struct sk_buff *skb) +{ + struct device *dev = wimax_dev_to_dev(wimax_dev); + void *msg = skb->data; + size_t size = skb->len; + might_sleep(); + + d_printf(1, dev, "CTX: wimax msg, %zu bytes\n", size); + d_dump(2, dev, msg, size); + genlmsg_multicast(skb, 0, wimax_gnl_mcg.id, GFP_KERNEL); + d_printf(1, dev, "CTX: genl multicast done\n"); + return 0; +} +EXPORT_SYMBOL_GPL(wimax_msg_send); + + +/** + * wimax_msg - Send a message to user space + * + * @wimax_dev: WiMAX device descriptor (properly referenced) + * @pipe_name: "named pipe" the message will be sent to + * @buf: pointer to the message to send. + * @size: size of the buffer pointed to by @buf (in bytes). + * @gfp_flags: flags for memory allocation. + * + * Returns: %0 if ok, negative errno code on error. + * + * Description: + * + * Sends a free-form message to user space on the device @wimax_dev. + * + * NOTES: + * + * Once the @skb is given to this function, who will own it and will + * release it when done (unless it returns error). + */ +int wimax_msg(struct wimax_dev *wimax_dev, const char *pipe_name, + const void *buf, size_t size, gfp_t gfp_flags) +{ + int result = -ENOMEM; + struct sk_buff *skb; + + skb = wimax_msg_alloc(wimax_dev, pipe_name, buf, size, gfp_flags); + if (IS_ERR(skb)) + result = PTR_ERR(skb); + else + result = wimax_msg_send(wimax_dev, skb); + return result; +} +EXPORT_SYMBOL_GPL(wimax_msg); + + +static const struct nla_policy wimax_gnl_msg_policy[WIMAX_GNL_ATTR_MAX + 1] = { + [WIMAX_GNL_MSG_IFIDX] = { + .type = NLA_U32, + }, + [WIMAX_GNL_MSG_DATA] = { + .type = NLA_UNSPEC, /* libnl doesn't grok BINARY yet */ + }, +}; + + +/* + * Relays a message from user space to the driver + * + * The skb is passed to the driver-specific function with the netlink + * and generic netlink headers already stripped. + * + * This call will block while handling/relaying the message. 
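+ *
+ * On the driver side the payload ends up in
+ * wimax_dev->op_msg_from_user(). A minimal handler could look like
+ * this sketch (hypothetical driver code, not part of this patch; the
+ * authoritative callback prototype is the one declared in
+ * include/net/wimax.h):
+ *
+ *   static int mydrv_op_msg_from_user(struct wimax_dev *wimax_dev,
+ *                                     const char *pipe_name,
+ *                                     const void *buf, size_t size,
+ *                                     const struct genl_info *info)
+ *   {
+ *           ... validate buf/size and hand the data to the device ...
+ *           return 0;
+ *   }
+ *
+ * Whatever the handler returns is what user space gets back as the
+ * ack to its message.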
+ */ +static +int wimax_gnl_doit_msg_from_user(struct sk_buff *skb, struct genl_info *info) +{ + int result, ifindex; + struct wimax_dev *wimax_dev; + struct device *dev; + struct nlmsghdr *nlh = info->nlhdr; + char *pipe_name; + void *msg_buf; + size_t msg_len; + + might_sleep(); + d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); + result = -ENODEV; + if (info->attrs[WIMAX_GNL_MSG_IFIDX] == NULL) { + printk(KERN_ERR "WIMAX_GNL_MSG_FROM_USER: can't find IFIDX " + "attribute\n"); + goto error_no_wimax_dev; + } + ifindex = nla_get_u32(info->attrs[WIMAX_GNL_MSG_IFIDX]); + wimax_dev = wimax_dev_get_by_genl_info(info, ifindex); + if (wimax_dev == NULL) + goto error_no_wimax_dev; + dev = wimax_dev_to_dev(wimax_dev); + + /* Unpack arguments */ + result = -EINVAL; + if (info->attrs[WIMAX_GNL_MSG_DATA] == NULL) { + dev_err(dev, "WIMAX_GNL_MSG_FROM_USER: can't find MSG_DATA " + "attribute\n"); + goto error_no_data; + } + msg_buf = nla_data(info->attrs[WIMAX_GNL_MSG_DATA]); + msg_len = nla_len(info->attrs[WIMAX_GNL_MSG_DATA]); + + if (info->attrs[WIMAX_GNL_MSG_PIPE_NAME] == NULL) + pipe_name = NULL; + else { + struct nlattr *attr = info->attrs[WIMAX_GNL_MSG_PIPE_NAME]; + size_t attr_len = nla_len(attr); + /* libnl-1.1 does not yet support NLA_NUL_STRING */ + result = -ENOMEM; + pipe_name = kstrndup(nla_data(attr), attr_len + 1, GFP_KERNEL); + if (pipe_name == NULL) + goto error_alloc; + pipe_name[attr_len] = 0; + } + mutex_lock(&wimax_dev->mutex); + result = wimax_dev_is_ready(wimax_dev); + if (result == -ENOMEDIUM) + result = 0; + if (result < 0) + goto error_not_ready; + result = -ENOSYS; + if (wimax_dev->op_msg_from_user == NULL) + goto error_noop; + + d_printf(1, dev, + "CRX: nlmsghdr len %u type %u flags 0x%04x seq 0x%x pid %u\n", + nlh->nlmsg_len, nlh->nlmsg_type, nlh->nlmsg_flags, + nlh->nlmsg_seq, nlh->nlmsg_pid); + d_printf(1, dev, "CRX: wimax message %zu bytes\n", msg_len); + d_dump(2, dev, msg_buf, msg_len); + + result = wimax_dev->op_msg_from_user(wimax_dev, pipe_name, + msg_buf, msg_len, info); +error_noop: +error_not_ready: + mutex_unlock(&wimax_dev->mutex); +error_alloc: + kfree(pipe_name); +error_no_data: + dev_put(wimax_dev->net_dev); +error_no_wimax_dev: + d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result); + return result; +} + + +/* + * Generic Netlink glue + */ + +struct genl_ops wimax_gnl_msg_from_user = { + .cmd = WIMAX_GNL_OP_MSG_FROM_USER, + .flags = GENL_ADMIN_PERM, + .policy = wimax_gnl_msg_policy, + .doit = wimax_gnl_doit_msg_from_user, + .dumpit = NULL, +}; + diff --git a/net/wimax/op-reset.c b/net/wimax/op-reset.c new file mode 100644 index 00000000..68bedf3e --- /dev/null +++ b/net/wimax/op-reset.c @@ -0,0 +1,140 @@ +/* + * Linux WiMAX + * Implement and export a method for resetting a WiMAX device + * + * + * Copyright (C) 2008 Intel Corporation + * Inaky Perez-Gonzalez + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * + * This implements a simple synchronous call to reset a WiMAX device. + * + * Resets aim at being warm, keeping the device handles active; + * however, when that fails, it falls back to a cold reset (that will + * disconnect and reconnect the device). + */ + +#include +#include +#include +#include +#include "wimax-internal.h" + +#define D_SUBMODULE op_reset +#include "debug-levels.h" + + +/** + * wimax_reset - Reset a WiMAX device + * + * @wimax_dev: WiMAX device descriptor + * + * Returns: + * + * %0 if ok and a warm reset was done (the device still exists in + * the system). + * + * -%ENODEV if a cold/bus reset had to be done (device has + * disconnected and reconnected, so current handle is not valid + * any more). + * + * -%EINVAL if the device is not even registered. + * + * Any other negative error code shall be considered as + * non-recoverable. + * + * Description: + * + * Called when wanting to reset the device for any reason. Device is + * taken back to power on status. + * + * This call blocks; on successful return, the device has completed the + * reset process and is ready to operate. + */ +int wimax_reset(struct wimax_dev *wimax_dev) +{ + int result = -EINVAL; + struct device *dev = wimax_dev_to_dev(wimax_dev); + enum wimax_st state; + + might_sleep(); + d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev); + mutex_lock(&wimax_dev->mutex); + dev_hold(wimax_dev->net_dev); + state = wimax_dev->state; + mutex_unlock(&wimax_dev->mutex); + + if (state >= WIMAX_ST_DOWN) { + mutex_lock(&wimax_dev->mutex_reset); + result = wimax_dev->op_reset(wimax_dev); + mutex_unlock(&wimax_dev->mutex_reset); + } + dev_put(wimax_dev->net_dev); + + d_fnend(3, dev, "(wimax_dev %p) = %d\n", wimax_dev, result); + return result; +} +EXPORT_SYMBOL(wimax_reset); + + +static const struct nla_policy wimax_gnl_reset_policy[WIMAX_GNL_ATTR_MAX + 1] = { + [WIMAX_GNL_RESET_IFIDX] = { + .type = NLA_U32, + }, +}; + + +/* + * Exporting to user space over generic netlink + * + * Parse the reset command from user space, return error code. + * + * No attributes. + */ +static +int wimax_gnl_doit_reset(struct sk_buff *skb, struct genl_info *info) +{ + int result, ifindex; + struct wimax_dev *wimax_dev; + + d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); + result = -ENODEV; + if (info->attrs[WIMAX_GNL_RESET_IFIDX] == NULL) { + printk(KERN_ERR "WIMAX_GNL_OP_RFKILL: can't find IFIDX " + "attribute\n"); + goto error_no_wimax_dev; + } + ifindex = nla_get_u32(info->attrs[WIMAX_GNL_RESET_IFIDX]); + wimax_dev = wimax_dev_get_by_genl_info(info, ifindex); + if (wimax_dev == NULL) + goto error_no_wimax_dev; + /* Execute the operation and send the result back to user space */ + result = wimax_reset(wimax_dev); + dev_put(wimax_dev->net_dev); +error_no_wimax_dev: + d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result); + return result; +} + + +struct genl_ops wimax_gnl_reset = { + .cmd = WIMAX_GNL_OP_RESET, + .flags = GENL_ADMIN_PERM, + .policy = wimax_gnl_reset_policy, + .doit = wimax_gnl_doit_reset, + .dumpit = NULL, +}; diff --git a/net/wimax/op-rfkill.c b/net/wimax/op-rfkill.c new file mode 100644 index 00000000..2609e445 --- /dev/null +++ b/net/wimax/op-rfkill.c @@ -0,0 +1,468 @@ +/* + * Linux WiMAX + * RF-kill framework integration + * + * + * Copyright (C) 2008 Intel Corporation + * Inaky Perez-Gonzalez + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * + * This integrates into the Linux Kernel rfkill susbystem so that the + * drivers just have to do the bare minimal work, which is providing a + * method to set the software RF-Kill switch and to report changes in + * the software and hardware switch status. + * + * A non-polled generic rfkill device is embedded into the WiMAX + * subsystem's representation of a device. + * + * FIXME: Need polled support? Let drivers provide a poll routine + * and hand it to rfkill ops then? + * + * All device drivers have to do is after wimax_dev_init(), call + * wimax_report_rfkill_hw() and wimax_report_rfkill_sw() to update + * initial state and then every time it changes. See wimax.h:struct + * wimax_dev for more information. + * + * ROADMAP + * + * wimax_gnl_doit_rfkill() User space calling wimax_rfkill() + * wimax_rfkill() Kernel calling wimax_rfkill() + * __wimax_rf_toggle_radio() + * + * wimax_rfkill_set_radio_block() RF-Kill subsystem calling + * __wimax_rf_toggle_radio() + * + * __wimax_rf_toggle_radio() + * wimax_dev->op_rfkill_sw_toggle() Driver backend + * __wimax_state_change() + * + * wimax_report_rfkill_sw() Driver reports state change + * __wimax_state_change() + * + * wimax_report_rfkill_hw() Driver reports state change + * __wimax_state_change() + * + * wimax_rfkill_add() Initialize/shutdown rfkill support + * wimax_rfkill_rm() [called by wimax_dev_add/rm()] + */ + +#include +#include +#include +#include +#include +#include "wimax-internal.h" + +#define D_SUBMODULE op_rfkill +#include "debug-levels.h" + +/** + * wimax_report_rfkill_hw - Reports changes in the hardware RF switch + * + * @wimax_dev: WiMAX device descriptor + * + * @state: New state of the RF Kill switch. %WIMAX_RF_ON radio on, + * %WIMAX_RF_OFF radio off. + * + * When the device detects a change in the state of thehardware RF + * switch, it must call this function to let the WiMAX kernel stack + * know that the state has changed so it can be properly propagated. + * + * The WiMAX stack caches the state (the driver doesn't need to). As + * well, as the change is propagated it will come back as a request to + * change the software state to mirror the hardware state. + * + * If the device doesn't have a hardware kill switch, just report + * it on initialization as always on (%WIMAX_RF_ON, radio on). 
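+ *
+ * For example (hypothetical driver code, not part of this file), a
+ * status or interrupt handler that just read the switch position
+ * could simply do:
+ *
+ *   wimax_report_rfkill_hw(&mydrv->wimax_dev,
+ *                          hw_rf_off ? WIMAX_RF_OFF : WIMAX_RF_ON);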
+ */ +void wimax_report_rfkill_hw(struct wimax_dev *wimax_dev, + enum wimax_rf_state state) +{ + int result; + struct device *dev = wimax_dev_to_dev(wimax_dev); + enum wimax_st wimax_state; + + d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state); + BUG_ON(state == WIMAX_RF_QUERY); + BUG_ON(state != WIMAX_RF_ON && state != WIMAX_RF_OFF); + + mutex_lock(&wimax_dev->mutex); + result = wimax_dev_is_ready(wimax_dev); + if (result < 0) + goto error_not_ready; + + if (state != wimax_dev->rf_hw) { + wimax_dev->rf_hw = state; + if (wimax_dev->rf_hw == WIMAX_RF_ON && + wimax_dev->rf_sw == WIMAX_RF_ON) + wimax_state = WIMAX_ST_READY; + else + wimax_state = WIMAX_ST_RADIO_OFF; + + result = rfkill_set_hw_state(wimax_dev->rfkill, + state == WIMAX_RF_OFF); + + __wimax_state_change(wimax_dev, wimax_state); + } +error_not_ready: + mutex_unlock(&wimax_dev->mutex); + d_fnend(3, dev, "(wimax_dev %p state %u) = void [%d]\n", + wimax_dev, state, result); +} +EXPORT_SYMBOL_GPL(wimax_report_rfkill_hw); + + +/** + * wimax_report_rfkill_sw - Reports changes in the software RF switch + * + * @wimax_dev: WiMAX device descriptor + * + * @state: New state of the RF kill switch. %WIMAX_RF_ON radio on, + * %WIMAX_RF_OFF radio off. + * + * Reports changes in the software RF switch state to the the WiMAX + * stack. + * + * The main use is during initialization, so the driver can query the + * device for its current software radio kill switch state and feed it + * to the system. + * + * On the side, the device does not change the software state by + * itself. In practice, this can happen, as the device might decide to + * switch (in software) the radio off for different reasons. + */ +void wimax_report_rfkill_sw(struct wimax_dev *wimax_dev, + enum wimax_rf_state state) +{ + int result; + struct device *dev = wimax_dev_to_dev(wimax_dev); + enum wimax_st wimax_state; + + d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state); + BUG_ON(state == WIMAX_RF_QUERY); + BUG_ON(state != WIMAX_RF_ON && state != WIMAX_RF_OFF); + + mutex_lock(&wimax_dev->mutex); + result = wimax_dev_is_ready(wimax_dev); + if (result < 0) + goto error_not_ready; + + if (state != wimax_dev->rf_sw) { + wimax_dev->rf_sw = state; + if (wimax_dev->rf_hw == WIMAX_RF_ON && + wimax_dev->rf_sw == WIMAX_RF_ON) + wimax_state = WIMAX_ST_READY; + else + wimax_state = WIMAX_ST_RADIO_OFF; + __wimax_state_change(wimax_dev, wimax_state); + rfkill_set_sw_state(wimax_dev->rfkill, state == WIMAX_RF_OFF); + } +error_not_ready: + mutex_unlock(&wimax_dev->mutex); + d_fnend(3, dev, "(wimax_dev %p state %u) = void [%d]\n", + wimax_dev, state, result); +} +EXPORT_SYMBOL_GPL(wimax_report_rfkill_sw); + + +/* + * Callback for the RF Kill toggle operation + * + * This function is called by: + * + * - The rfkill subsystem when the RF-Kill key is pressed in the + * hardware and the driver notifies through + * wimax_report_rfkill_hw(). The rfkill subsystem ends up calling back + * here so the software RF Kill switch state is changed to reflect + * the hardware switch state. + * + * - When the user sets the state through sysfs' rfkill/state file + * + * - When the user calls wimax_rfkill(). + * + * This call blocks! + * + * WARNING! When we call rfkill_unregister(), this will be called with + * state 0! 
+ * + * WARNING: wimax_dev must be locked + */ +static +int __wimax_rf_toggle_radio(struct wimax_dev *wimax_dev, + enum wimax_rf_state state) +{ + int result = 0; + struct device *dev = wimax_dev_to_dev(wimax_dev); + enum wimax_st wimax_state; + + might_sleep(); + d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state); + if (wimax_dev->rf_sw == state) + goto out_no_change; + if (wimax_dev->op_rfkill_sw_toggle != NULL) + result = wimax_dev->op_rfkill_sw_toggle(wimax_dev, state); + else if (state == WIMAX_RF_OFF) /* No op? can't turn off */ + result = -ENXIO; + else /* No op? can turn on */ + result = 0; /* should never happen tho */ + if (result >= 0) { + result = 0; + wimax_dev->rf_sw = state; + wimax_state = state == WIMAX_RF_ON ? + WIMAX_ST_READY : WIMAX_ST_RADIO_OFF; + __wimax_state_change(wimax_dev, wimax_state); + } +out_no_change: + d_fnend(3, dev, "(wimax_dev %p state %u) = %d\n", + wimax_dev, state, result); + return result; +} + + +/* + * Translate from rfkill state to wimax state + * + * NOTE: Special state handling rules here + * + * Just pretend the call didn't happen if we are in a state where + * we know for sure it cannot be handled (WIMAX_ST_DOWN or + * __WIMAX_ST_QUIESCING). rfkill() needs it to register and + * unregister, as it will run this path. + * + * NOTE: This call will block until the operation is completed. + */ +static int wimax_rfkill_set_radio_block(void *data, bool blocked) +{ + int result; + struct wimax_dev *wimax_dev = data; + struct device *dev = wimax_dev_to_dev(wimax_dev); + enum wimax_rf_state rf_state; + + d_fnstart(3, dev, "(wimax_dev %p blocked %u)\n", wimax_dev, blocked); + rf_state = WIMAX_RF_ON; + if (blocked) + rf_state = WIMAX_RF_OFF; + mutex_lock(&wimax_dev->mutex); + if (wimax_dev->state <= __WIMAX_ST_QUIESCING) + result = 0; + else + result = __wimax_rf_toggle_radio(wimax_dev, rf_state); + mutex_unlock(&wimax_dev->mutex); + d_fnend(3, dev, "(wimax_dev %p blocked %u) = %d\n", + wimax_dev, blocked, result); + return result; +} + +static const struct rfkill_ops wimax_rfkill_ops = { + .set_block = wimax_rfkill_set_radio_block, +}; + +/** + * wimax_rfkill - Set the software RF switch state for a WiMAX device + * + * @wimax_dev: WiMAX device descriptor + * + * @state: New RF state. + * + * Returns: + * + * >= 0 toggle state if ok, < 0 errno code on error. The toggle state + * is returned as a bitmap, bit 0 being the hardware RF state, bit 1 + * the software RF state. + * + * 0 means disabled (%WIMAX_RF_ON, radio on), 1 means enabled radio + * off (%WIMAX_RF_OFF). + * + * Description: + * + * Called by the user when he wants to request the WiMAX radio to be + * switched on (%WIMAX_RF_ON) or off (%WIMAX_RF_OFF). With + * %WIMAX_RF_QUERY, just the current state is returned. + * + * NOTE: + * + * This call will block until the operation is complete. + */ +int wimax_rfkill(struct wimax_dev *wimax_dev, enum wimax_rf_state state) +{ + int result; + struct device *dev = wimax_dev_to_dev(wimax_dev); + + d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state); + mutex_lock(&wimax_dev->mutex); + result = wimax_dev_is_ready(wimax_dev); + if (result < 0) { + /* While initializing, < 1.4.3 wimax-tools versions use + * this call to check if the device is a valid WiMAX + * device; so we allow it to proceed always, + * considering the radios are all off. 
*/ + if (result == -ENOMEDIUM && state == WIMAX_RF_QUERY) + result = WIMAX_RF_OFF << 1 | WIMAX_RF_OFF; + goto error_not_ready; + } + switch (state) { + case WIMAX_RF_ON: + case WIMAX_RF_OFF: + result = __wimax_rf_toggle_radio(wimax_dev, state); + if (result < 0) + goto error; + rfkill_set_sw_state(wimax_dev->rfkill, state == WIMAX_RF_OFF); + break; + case WIMAX_RF_QUERY: + break; + default: + result = -EINVAL; + goto error; + } + result = wimax_dev->rf_sw << 1 | wimax_dev->rf_hw; +error: +error_not_ready: + mutex_unlock(&wimax_dev->mutex); + d_fnend(3, dev, "(wimax_dev %p state %u) = %d\n", + wimax_dev, state, result); + return result; +} +EXPORT_SYMBOL(wimax_rfkill); + + +/* + * Register a new WiMAX device's RF Kill support + * + * WARNING: wimax_dev->mutex must be unlocked + */ +int wimax_rfkill_add(struct wimax_dev *wimax_dev) +{ + int result; + struct rfkill *rfkill; + struct device *dev = wimax_dev_to_dev(wimax_dev); + + d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev); + /* Initialize RF Kill */ + result = -ENOMEM; + rfkill = rfkill_alloc(wimax_dev->name, dev, RFKILL_TYPE_WIMAX, + &wimax_rfkill_ops, wimax_dev); + if (rfkill == NULL) + goto error_rfkill_allocate; + + d_printf(1, dev, "rfkill %p\n", rfkill); + + wimax_dev->rfkill = rfkill; + + rfkill_init_sw_state(rfkill, 1); + result = rfkill_register(wimax_dev->rfkill); + if (result < 0) + goto error_rfkill_register; + + /* If there is no SW toggle op, SW RFKill is always on */ + if (wimax_dev->op_rfkill_sw_toggle == NULL) + wimax_dev->rf_sw = WIMAX_RF_ON; + + d_fnend(3, dev, "(wimax_dev %p) = 0\n", wimax_dev); + return 0; + +error_rfkill_register: + rfkill_destroy(wimax_dev->rfkill); +error_rfkill_allocate: + d_fnend(3, dev, "(wimax_dev %p) = %d\n", wimax_dev, result); + return result; +} + + +/* + * Deregister a WiMAX device's RF Kill support + * + * Ick, we can't call rfkill_free() after rfkill_unregister()...oh + * well. + * + * WARNING: wimax_dev->mutex must be unlocked + */ +void wimax_rfkill_rm(struct wimax_dev *wimax_dev) +{ + struct device *dev = wimax_dev_to_dev(wimax_dev); + d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev); + rfkill_unregister(wimax_dev->rfkill); + rfkill_destroy(wimax_dev->rfkill); + d_fnend(3, dev, "(wimax_dev %p)\n", wimax_dev); +} + + +/* + * Exporting to user space over generic netlink + * + * Parse the rfkill command from user space, return a combination + * value that describe the states of the different toggles. + * + * Only one attribute: the new state requested (on, off or no change, + * just query). 
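+ *
+ * The value handed back is the same combination wimax_rfkill()
+ * returns: bit 0 is the hardware switch, bit 1 the software switch,
+ * with 1 meaning %WIMAX_RF_OFF. A kernel-side caller (sketch) could
+ * thus check the hardware switch with:
+ *
+ *   int rf = wimax_rfkill(wimax_dev, WIMAX_RF_QUERY);
+ *   if (rf >= 0 && (rf & 0x1))
+ *           ... the hardware switch is off ...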
+ */ + +static const struct nla_policy wimax_gnl_rfkill_policy[WIMAX_GNL_ATTR_MAX + 1] = { + [WIMAX_GNL_RFKILL_IFIDX] = { + .type = NLA_U32, + }, + [WIMAX_GNL_RFKILL_STATE] = { + .type = NLA_U32 /* enum wimax_rf_state */ + }, +}; + + +static +int wimax_gnl_doit_rfkill(struct sk_buff *skb, struct genl_info *info) +{ + int result, ifindex; + struct wimax_dev *wimax_dev; + struct device *dev; + enum wimax_rf_state new_state; + + d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); + result = -ENODEV; + if (info->attrs[WIMAX_GNL_RFKILL_IFIDX] == NULL) { + printk(KERN_ERR "WIMAX_GNL_OP_RFKILL: can't find IFIDX " + "attribute\n"); + goto error_no_wimax_dev; + } + ifindex = nla_get_u32(info->attrs[WIMAX_GNL_RFKILL_IFIDX]); + wimax_dev = wimax_dev_get_by_genl_info(info, ifindex); + if (wimax_dev == NULL) + goto error_no_wimax_dev; + dev = wimax_dev_to_dev(wimax_dev); + result = -EINVAL; + if (info->attrs[WIMAX_GNL_RFKILL_STATE] == NULL) { + dev_err(dev, "WIMAX_GNL_RFKILL: can't find RFKILL_STATE " + "attribute\n"); + goto error_no_pid; + } + new_state = nla_get_u32(info->attrs[WIMAX_GNL_RFKILL_STATE]); + + /* Execute the operation and send the result back to user space */ + result = wimax_rfkill(wimax_dev, new_state); +error_no_pid: + dev_put(wimax_dev->net_dev); +error_no_wimax_dev: + d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result); + return result; +} + + +struct genl_ops wimax_gnl_rfkill = { + .cmd = WIMAX_GNL_OP_RFKILL, + .flags = GENL_ADMIN_PERM, + .policy = wimax_gnl_rfkill_policy, + .doit = wimax_gnl_doit_rfkill, + .dumpit = NULL, +}; + diff --git a/net/wimax/op-state-get.c b/net/wimax/op-state-get.c new file mode 100644 index 00000000..aff8776e --- /dev/null +++ b/net/wimax/op-state-get.c @@ -0,0 +1,83 @@ +/* + * Linux WiMAX + * Implement and export a method for getting a WiMAX device current state + * + * Copyright (C) 2009 Paulius Zaleckas + * + * Based on previous WiMAX core work by: + * Copyright (C) 2008 Intel Corporation + * Inaky Perez-Gonzalez + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include +#include +#include +#include +#include "wimax-internal.h" + +#define D_SUBMODULE op_state_get +#include "debug-levels.h" + + +static const struct nla_policy wimax_gnl_state_get_policy[WIMAX_GNL_ATTR_MAX + 1] = { + [WIMAX_GNL_STGET_IFIDX] = { + .type = NLA_U32, + }, +}; + + +/* + * Exporting to user space over generic netlink + * + * Parse the state get command from user space, return a combination + * value that describe the current state. + * + * No attributes. 
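+ *
+ * The (positive) state value returned by the handler below travels
+ * back to user space in the netlink ack/error message, the same way
+ * the rfkill toggle combination does in op-rfkill.c.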
+ */ +static +int wimax_gnl_doit_state_get(struct sk_buff *skb, struct genl_info *info) +{ + int result, ifindex; + struct wimax_dev *wimax_dev; + + d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); + result = -ENODEV; + if (info->attrs[WIMAX_GNL_STGET_IFIDX] == NULL) { + printk(KERN_ERR "WIMAX_GNL_OP_STATE_GET: can't find IFIDX " + "attribute\n"); + goto error_no_wimax_dev; + } + ifindex = nla_get_u32(info->attrs[WIMAX_GNL_STGET_IFIDX]); + wimax_dev = wimax_dev_get_by_genl_info(info, ifindex); + if (wimax_dev == NULL) + goto error_no_wimax_dev; + /* Execute the operation and send the result back to user space */ + result = wimax_state_get(wimax_dev); + dev_put(wimax_dev->net_dev); +error_no_wimax_dev: + d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result); + return result; +} + + +struct genl_ops wimax_gnl_state_get = { + .cmd = WIMAX_GNL_OP_STATE_GET, + .flags = GENL_ADMIN_PERM, + .policy = wimax_gnl_state_get_policy, + .doit = wimax_gnl_doit_state_get, + .dumpit = NULL, +}; diff --git a/net/wimax/stack.c b/net/wimax/stack.c new file mode 100644 index 00000000..ee99e7df --- /dev/null +++ b/net/wimax/stack.c @@ -0,0 +1,633 @@ +/* + * Linux WiMAX + * Initialization, addition and removal of wimax devices + * + * + * Copyright (C) 2005-2006 Intel Corporation + * Inaky Perez-Gonzalez + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * + * This implements: + * + * - basic life cycle of 'struct wimax_dev' [wimax_dev_*()]; on + * addition/registration initialize all subfields and allocate + * generic netlink resources for user space communication. On + * removal/unregistration, undo all that. + * + * - device state machine [wimax_state_change()] and support to send + * reports to user space when the state changes + * [wimax_gnl_re_state_change*()]. + * + * See include/net/wimax.h for rationales and design. + * + * ROADMAP + * + * [__]wimax_state_change() Called by drivers to update device's state + * wimax_gnl_re_state_change_alloc() + * wimax_gnl_re_state_change_send() + * + * wimax_dev_init() Init a device + * wimax_dev_add() Register + * wimax_rfkill_add() + * wimax_gnl_add() Register all the generic netlink resources. + * wimax_id_table_add() + * wimax_dev_rm() Unregister + * wimax_id_table_rm() + * wimax_gnl_rm() + * wimax_rfkill_rm() + */ +#include +#include +#include +#include +#include +#include "wimax-internal.h" + + +#define D_SUBMODULE stack +#include "debug-levels.h" + +static char wimax_debug_params[128]; +module_param_string(debug, wimax_debug_params, sizeof(wimax_debug_params), + 0644); +MODULE_PARM_DESC(debug, + "String of space-separated NAME:VALUE pairs, where NAMEs " + "are the different debug submodules and VALUE are the " + "initial debug value to set."); + +/* + * Authoritative source for the RE_STATE_CHANGE attribute policy + * + * We don't really use it here, but /me likes to keep the definition + * close to where the data is generated. 
+ */ +/* +static const struct nla_policy wimax_gnl_re_status_change[WIMAX_GNL_ATTR_MAX + 1] = { + [WIMAX_GNL_STCH_STATE_OLD] = { .type = NLA_U8 }, + [WIMAX_GNL_STCH_STATE_NEW] = { .type = NLA_U8 }, +}; +*/ + + +/* + * Allocate a Report State Change message + * + * @header: save it, you need it for _send() + * + * Creates and fills a basic state change message; different code + * paths can then add more attributes to the message as needed. + * + * Use wimax_gnl_re_state_change_send() to send the returned skb. + * + * Returns: skb with the genl message if ok, IS_ERR() ptr on error + * with an errno code. + */ +static +struct sk_buff *wimax_gnl_re_state_change_alloc( + struct wimax_dev *wimax_dev, + enum wimax_st new_state, enum wimax_st old_state, + void **header) +{ + int result; + struct device *dev = wimax_dev_to_dev(wimax_dev); + void *data; + struct sk_buff *report_skb; + + d_fnstart(3, dev, "(wimax_dev %p new_state %u old_state %u)\n", + wimax_dev, new_state, old_state); + result = -ENOMEM; + report_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (report_skb == NULL) { + dev_err(dev, "RE_STCH: can't create message\n"); + goto error_new; + } + data = genlmsg_put(report_skb, 0, wimax_gnl_mcg.id, &wimax_gnl_family, + 0, WIMAX_GNL_RE_STATE_CHANGE); + if (data == NULL) { + dev_err(dev, "RE_STCH: can't put data into message\n"); + goto error_put; + } + *header = data; + + result = nla_put_u8(report_skb, WIMAX_GNL_STCH_STATE_OLD, old_state); + if (result < 0) { + dev_err(dev, "RE_STCH: Error adding OLD attr: %d\n", result); + goto error_put; + } + result = nla_put_u8(report_skb, WIMAX_GNL_STCH_STATE_NEW, new_state); + if (result < 0) { + dev_err(dev, "RE_STCH: Error adding NEW attr: %d\n", result); + goto error_put; + } + result = nla_put_u32(report_skb, WIMAX_GNL_STCH_IFIDX, + wimax_dev->net_dev->ifindex); + if (result < 0) { + dev_err(dev, "RE_STCH: Error adding IFINDEX attribute\n"); + goto error_put; + } + d_fnend(3, dev, "(wimax_dev %p new_state %u old_state %u) = %p\n", + wimax_dev, new_state, old_state, report_skb); + return report_skb; + +error_put: + nlmsg_free(report_skb); +error_new: + d_fnend(3, dev, "(wimax_dev %p new_state %u old_state %u) = %d\n", + wimax_dev, new_state, old_state, result); + return ERR_PTR(result); +} + + +/* + * Send a Report State Change message (as created with _alloc). + * + * @report_skb: as returned by wimax_gnl_re_state_change_alloc() + * @header: as returned by wimax_gnl_re_state_change_alloc() + * + * Returns: 0 if ok, < 0 errno code on error. + * + * If the message is NULL, pretend it didn't happen. + */ +static +int wimax_gnl_re_state_change_send( + struct wimax_dev *wimax_dev, struct sk_buff *report_skb, + void *header) +{ + int result = 0; + struct device *dev = wimax_dev_to_dev(wimax_dev); + d_fnstart(3, dev, "(wimax_dev %p report_skb %p)\n", + wimax_dev, report_skb); + if (report_skb == NULL) { + result = -ENOMEM; + goto out; + } + genlmsg_end(report_skb, header); + genlmsg_multicast(report_skb, 0, wimax_gnl_mcg.id, GFP_KERNEL); +out: + d_fnend(3, dev, "(wimax_dev %p report_skb %p) = %d\n", + wimax_dev, report_skb, result); + return result; +} + + +static +void __check_new_state(enum wimax_st old_state, enum wimax_st new_state, + unsigned allowed_states_bm) +{ + if (WARN_ON(((1 << new_state) & allowed_states_bm) == 0)) { + printk(KERN_ERR "SW BUG! Forbidden state change %u -> %u\n", + old_state, new_state); + } +} + + +/* + * Set the current state of a WiMAX device [unlocking version of + * wimax_state_change(). 
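+ *
+ * The caller must already hold wimax_dev->mutex; drivers normally go
+ * through the locking wrapper wimax_state_change() below instead of
+ * calling this directly.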
+ */ +void __wimax_state_change(struct wimax_dev *wimax_dev, enum wimax_st new_state) +{ + struct device *dev = wimax_dev_to_dev(wimax_dev); + enum wimax_st old_state = wimax_dev->state; + struct sk_buff *stch_skb; + void *header; + + d_fnstart(3, dev, "(wimax_dev %p new_state %u [old %u])\n", + wimax_dev, new_state, old_state); + + if (WARN_ON(new_state >= __WIMAX_ST_INVALID)) { + dev_err(dev, "SW BUG: requesting invalid state %u\n", + new_state); + goto out; + } + if (old_state == new_state) + goto out; + header = NULL; /* gcc complains? can't grok why */ + stch_skb = wimax_gnl_re_state_change_alloc( + wimax_dev, new_state, old_state, &header); + + /* Verify the state transition and do exit-from-state actions */ + switch (old_state) { + case __WIMAX_ST_NULL: + __check_new_state(old_state, new_state, + 1 << WIMAX_ST_DOWN); + break; + case WIMAX_ST_DOWN: + __check_new_state(old_state, new_state, + 1 << __WIMAX_ST_QUIESCING + | 1 << WIMAX_ST_UNINITIALIZED + | 1 << WIMAX_ST_RADIO_OFF); + break; + case __WIMAX_ST_QUIESCING: + __check_new_state(old_state, new_state, 1 << WIMAX_ST_DOWN); + break; + case WIMAX_ST_UNINITIALIZED: + __check_new_state(old_state, new_state, + 1 << __WIMAX_ST_QUIESCING + | 1 << WIMAX_ST_RADIO_OFF); + break; + case WIMAX_ST_RADIO_OFF: + __check_new_state(old_state, new_state, + 1 << __WIMAX_ST_QUIESCING + | 1 << WIMAX_ST_READY); + break; + case WIMAX_ST_READY: + __check_new_state(old_state, new_state, + 1 << __WIMAX_ST_QUIESCING + | 1 << WIMAX_ST_RADIO_OFF + | 1 << WIMAX_ST_SCANNING + | 1 << WIMAX_ST_CONNECTING + | 1 << WIMAX_ST_CONNECTED); + break; + case WIMAX_ST_SCANNING: + __check_new_state(old_state, new_state, + 1 << __WIMAX_ST_QUIESCING + | 1 << WIMAX_ST_RADIO_OFF + | 1 << WIMAX_ST_READY + | 1 << WIMAX_ST_CONNECTING + | 1 << WIMAX_ST_CONNECTED); + break; + case WIMAX_ST_CONNECTING: + __check_new_state(old_state, new_state, + 1 << __WIMAX_ST_QUIESCING + | 1 << WIMAX_ST_RADIO_OFF + | 1 << WIMAX_ST_READY + | 1 << WIMAX_ST_SCANNING + | 1 << WIMAX_ST_CONNECTED); + break; + case WIMAX_ST_CONNECTED: + __check_new_state(old_state, new_state, + 1 << __WIMAX_ST_QUIESCING + | 1 << WIMAX_ST_RADIO_OFF + | 1 << WIMAX_ST_READY); + netif_tx_disable(wimax_dev->net_dev); + netif_carrier_off(wimax_dev->net_dev); + break; + case __WIMAX_ST_INVALID: + default: + dev_err(dev, "SW BUG: wimax_dev %p is in unknown state %u\n", + wimax_dev, wimax_dev->state); + WARN_ON(1); + goto out; + } + + /* Execute the actions of entry to the new state */ + switch (new_state) { + case __WIMAX_ST_NULL: + dev_err(dev, "SW BUG: wimax_dev %p entering NULL state " + "from %u\n", wimax_dev, wimax_dev->state); + WARN_ON(1); /* Nobody can enter this state */ + break; + case WIMAX_ST_DOWN: + break; + case __WIMAX_ST_QUIESCING: + break; + case WIMAX_ST_UNINITIALIZED: + break; + case WIMAX_ST_RADIO_OFF: + break; + case WIMAX_ST_READY: + break; + case WIMAX_ST_SCANNING: + break; + case WIMAX_ST_CONNECTING: + break; + case WIMAX_ST_CONNECTED: + netif_carrier_on(wimax_dev->net_dev); + netif_wake_queue(wimax_dev->net_dev); + break; + case __WIMAX_ST_INVALID: + default: + BUG(); + } + __wimax_state_set(wimax_dev, new_state); + if (!IS_ERR(stch_skb)) + wimax_gnl_re_state_change_send(wimax_dev, stch_skb, header); +out: + d_fnend(3, dev, "(wimax_dev %p new_state %u [old %u]) = void\n", + wimax_dev, new_state, old_state); +} + + +/** + * wimax_state_change - Set the current state of a WiMAX device + * + * @wimax_dev: WiMAX device descriptor (properly referenced) + * @new_state: New state to switch to + * + * This 
implements the state changes for the wimax devices. It will + * + * - verify that the state transition is legal (for now it'll just + * print a warning if not) according to the table in + * linux/wimax.h's documentation for 'enum wimax_st'. + * + * - perform the actions needed for leaving the current state and + * whichever are needed for entering the new state. + * + * - issue a report to user space indicating the new state (and an + * optional payload with information about the new state). + * + * NOTE: @wimax_dev must be locked + */ +void wimax_state_change(struct wimax_dev *wimax_dev, enum wimax_st new_state) +{ + /* + * A driver cannot take the wimax_dev out of the + * __WIMAX_ST_NULL state unless by calling wimax_dev_add(). If + * the wimax_dev's state is still NULL, we ignore any request + * to change its state because it means it hasn't been yet + * registered. + * + * There is no need to complain about it, as routines that + * call this might be shared from different code paths that + * are called before or after wimax_dev_add() has done its + * job. + */ + mutex_lock(&wimax_dev->mutex); + if (wimax_dev->state > __WIMAX_ST_NULL) + __wimax_state_change(wimax_dev, new_state); + mutex_unlock(&wimax_dev->mutex); +} +EXPORT_SYMBOL_GPL(wimax_state_change); + + +/** + * wimax_state_get() - Return the current state of a WiMAX device + * + * @wimax_dev: WiMAX device descriptor + * + * Returns: Current state of the device according to its driver. + */ +enum wimax_st wimax_state_get(struct wimax_dev *wimax_dev) +{ + enum wimax_st state; + mutex_lock(&wimax_dev->mutex); + state = wimax_dev->state; + mutex_unlock(&wimax_dev->mutex); + return state; +} +EXPORT_SYMBOL_GPL(wimax_state_get); + + +/** + * wimax_dev_init - initialize a newly allocated instance + * + * @wimax_dev: WiMAX device descriptor to initialize. + * + * Initializes fields of a freshly allocated @wimax_dev instance. This + * function assumes that after allocation, the memory occupied by + * @wimax_dev was zeroed. + */ +void wimax_dev_init(struct wimax_dev *wimax_dev) +{ + INIT_LIST_HEAD(&wimax_dev->id_table_node); + __wimax_state_set(wimax_dev, __WIMAX_ST_NULL); + mutex_init(&wimax_dev->mutex); + mutex_init(&wimax_dev->mutex_reset); +} +EXPORT_SYMBOL_GPL(wimax_dev_init); + +/* + * This extern is declared here because it's easier to keep track -- + * both declarations are a list of the same + */ +extern struct genl_ops + wimax_gnl_msg_from_user, + wimax_gnl_reset, + wimax_gnl_rfkill, + wimax_gnl_state_get; + +static +struct genl_ops *wimax_gnl_ops[] = { + &wimax_gnl_msg_from_user, + &wimax_gnl_reset, + &wimax_gnl_rfkill, + &wimax_gnl_state_get, +}; + + +static +size_t wimax_addr_scnprint(char *addr_str, size_t addr_str_size, + unsigned char *addr, size_t addr_len) +{ + unsigned cnt, total; + for (total = cnt = 0; cnt < addr_len; cnt++) + total += scnprintf(addr_str + total, addr_str_size - total, + "%02x%c", addr[cnt], + cnt == addr_len - 1 ? '\0' : ':'); + return total; +} + + +/** + * wimax_dev_add - Register a new WiMAX device + * + * @wimax_dev: WiMAX device descriptor (as embedded in your @net_dev's + * priv data). You must have called wimax_dev_init() on it before. + * + * @net_dev: net device the @wimax_dev is associated with. The + * function expects SET_NETDEV_DEV() and register_netdev() were + * already called on it. + * + * Registers the new WiMAX device, sets up the user-kernel control + * interface (generic netlink) and common WiMAX infrastructure. 
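+ *
+ * A typical probe path (illustrative sketch only; the mydrv_*
+ * identifiers are hypothetical) keeps the ordering required above:
+ *
+ *   wimax_dev_init(&mydrv->wimax_dev);
+ *   mydrv->wimax_dev.op_msg_from_user = mydrv_op_msg_from_user;
+ *   mydrv->wimax_dev.op_rfkill_sw_toggle = mydrv_op_rfkill_sw_toggle;
+ *   mydrv->wimax_dev.op_reset = mydrv_op_reset;
+ *   SET_NETDEV_DEV(net_dev, parent_device);
+ *   result = register_netdev(net_dev);
+ *   ...
+ *   result = wimax_dev_add(&mydrv->wimax_dev, net_dev);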
+ * + * Note that the parts that will allow interaction with user space are + * setup at the very end, when the rest is in place, as once that + * happens, the driver might get user space control requests via + * netlink or from debugfs that might translate into calls into + * wimax_dev->op_*(). + */ +int wimax_dev_add(struct wimax_dev *wimax_dev, struct net_device *net_dev) +{ + int result; + struct device *dev = net_dev->dev.parent; + char addr_str[32]; + + d_fnstart(3, dev, "(wimax_dev %p net_dev %p)\n", wimax_dev, net_dev); + + /* Do the RFKILL setup before locking, as RFKILL will call + * into our functions. */ + wimax_dev->net_dev = net_dev; + result = wimax_rfkill_add(wimax_dev); + if (result < 0) + goto error_rfkill_add; + + /* Set up user-space interaction */ + mutex_lock(&wimax_dev->mutex); + wimax_id_table_add(wimax_dev); + result = wimax_debugfs_add(wimax_dev); + if (result < 0) { + dev_err(dev, "cannot initialize debugfs: %d\n", + result); + goto error_debugfs_add; + } + + __wimax_state_set(wimax_dev, WIMAX_ST_DOWN); + mutex_unlock(&wimax_dev->mutex); + + wimax_addr_scnprint(addr_str, sizeof(addr_str), + net_dev->dev_addr, net_dev->addr_len); + dev_err(dev, "WiMAX interface %s (%s) ready\n", + net_dev->name, addr_str); + d_fnend(3, dev, "(wimax_dev %p net_dev %p) = 0\n", wimax_dev, net_dev); + return 0; + +error_debugfs_add: + wimax_id_table_rm(wimax_dev); + mutex_unlock(&wimax_dev->mutex); + wimax_rfkill_rm(wimax_dev); +error_rfkill_add: + d_fnend(3, dev, "(wimax_dev %p net_dev %p) = %d\n", + wimax_dev, net_dev, result); + return result; +} +EXPORT_SYMBOL_GPL(wimax_dev_add); + + +/** + * wimax_dev_rm - Unregister an existing WiMAX device + * + * @wimax_dev: WiMAX device descriptor + * + * Unregisters a WiMAX device previously registered for use with + * wimax_add_rm(). + * + * IMPORTANT! Must call before calling unregister_netdev(). + * + * After this function returns, you will not get any more user space + * control requests (via netlink or debugfs) and thus to wimax_dev->ops. + * + * Reentrancy control is ensured by setting the state to + * %__WIMAX_ST_QUIESCING. rfkill operations coming through + * wimax_*rfkill*() will be stopped by the quiescing state; ops coming + * from the rfkill subsystem will be stopped by the support being + * removed by wimax_rfkill_rm(). 
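+ *
+ * For example (hypothetical driver code), the matching teardown path
+ * keeps the ordering stated above:
+ *
+ *   wimax_dev_rm(&mydrv->wimax_dev);
+ *   unregister_netdev(net_dev);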
+ */ +void wimax_dev_rm(struct wimax_dev *wimax_dev) +{ + d_fnstart(3, NULL, "(wimax_dev %p)\n", wimax_dev); + + mutex_lock(&wimax_dev->mutex); + __wimax_state_change(wimax_dev, __WIMAX_ST_QUIESCING); + wimax_debugfs_rm(wimax_dev); + wimax_id_table_rm(wimax_dev); + __wimax_state_change(wimax_dev, WIMAX_ST_DOWN); + mutex_unlock(&wimax_dev->mutex); + wimax_rfkill_rm(wimax_dev); + d_fnend(3, NULL, "(wimax_dev %p) = void\n", wimax_dev); +} +EXPORT_SYMBOL_GPL(wimax_dev_rm); + + +/* Debug framework control of debug levels */ +struct d_level D_LEVEL[] = { + D_SUBMODULE_DEFINE(debugfs), + D_SUBMODULE_DEFINE(id_table), + D_SUBMODULE_DEFINE(op_msg), + D_SUBMODULE_DEFINE(op_reset), + D_SUBMODULE_DEFINE(op_rfkill), + D_SUBMODULE_DEFINE(op_state_get), + D_SUBMODULE_DEFINE(stack), +}; +size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL); + + +struct genl_family wimax_gnl_family = { + .id = GENL_ID_GENERATE, + .name = "WiMAX", + .version = WIMAX_GNL_VERSION, + .hdrsize = 0, + .maxattr = WIMAX_GNL_ATTR_MAX, +}; + +struct genl_multicast_group wimax_gnl_mcg = { + .name = "msg", +}; + + + +/* Shutdown the wimax stack */ +static +int __init wimax_subsys_init(void) +{ + int result, cnt; + + d_fnstart(4, NULL, "()\n"); + d_parse_params(D_LEVEL, D_LEVEL_SIZE, wimax_debug_params, + "wimax.debug"); + + snprintf(wimax_gnl_family.name, sizeof(wimax_gnl_family.name), + "WiMAX"); + result = genl_register_family(&wimax_gnl_family); + if (unlikely(result < 0)) { + printk(KERN_ERR "cannot register generic netlink family: %d\n", + result); + goto error_register_family; + } + + for (cnt = 0; cnt < ARRAY_SIZE(wimax_gnl_ops); cnt++) { + result = genl_register_ops(&wimax_gnl_family, + wimax_gnl_ops[cnt]); + d_printf(4, NULL, "registering generic netlink op code " + "%u: %d\n", wimax_gnl_ops[cnt]->cmd, result); + if (unlikely(result < 0)) { + printk(KERN_ERR "cannot register generic netlink op " + "code %u: %d\n", + wimax_gnl_ops[cnt]->cmd, result); + goto error_register_ops; + } + } + + result = genl_register_mc_group(&wimax_gnl_family, &wimax_gnl_mcg); + if (result < 0) + goto error_mc_group; + d_fnend(4, NULL, "() = 0\n"); + return 0; + +error_mc_group: +error_register_ops: + for (cnt--; cnt >= 0; cnt--) + genl_unregister_ops(&wimax_gnl_family, + wimax_gnl_ops[cnt]); + genl_unregister_family(&wimax_gnl_family); +error_register_family: + d_fnend(4, NULL, "() = %d\n", result); + return result; + +} +module_init(wimax_subsys_init); + + +/* Shutdown the wimax stack */ +static +void __exit wimax_subsys_exit(void) +{ + int cnt; + wimax_id_table_release(); + genl_unregister_mc_group(&wimax_gnl_family, &wimax_gnl_mcg); + for (cnt = ARRAY_SIZE(wimax_gnl_ops) - 1; cnt >= 0; cnt--) + genl_unregister_ops(&wimax_gnl_family, + wimax_gnl_ops[cnt]); + genl_unregister_family(&wimax_gnl_family); +} +module_exit(wimax_subsys_exit); + +MODULE_AUTHOR("Intel Corporation "); +MODULE_DESCRIPTION("Linux WiMAX stack"); +MODULE_LICENSE("GPL"); + diff --git a/net/wimax/wimax-internal.h b/net/wimax/wimax-internal.h new file mode 100644 index 00000000..1e743d21 --- /dev/null +++ b/net/wimax/wimax-internal.h @@ -0,0 +1,91 @@ +/* + * Linux WiMAX + * Internal API for kernel space WiMAX stack + * + * + * Copyright (C) 2007 Intel Corporation + * Inaky Perez-Gonzalez + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * + * This header file is for declarations and definitions internal to + * the WiMAX stack. For public APIs and documentation, see + * include/net/wimax.h and include/linux/wimax.h. + */ + +#ifndef __WIMAX_INTERNAL_H__ +#define __WIMAX_INTERNAL_H__ +#ifdef __KERNEL__ + +#include +#include + + +/* + * Decide if a (locked) device is ready for use + * + * Before using the device structure, it must be locked + * (wimax_dev->mutex). As well, most operations need to call this + * function to check if the state is the right one. + * + * An error value will be returned if the state is not the right + * one. In that case, the caller should not attempt to use the device + * and just unlock it. + */ +static inline __must_check +int wimax_dev_is_ready(struct wimax_dev *wimax_dev) +{ + if (wimax_dev->state == __WIMAX_ST_NULL) + return -EINVAL; /* Device is not even registered! */ + if (wimax_dev->state == WIMAX_ST_DOWN) + return -ENOMEDIUM; + if (wimax_dev->state == __WIMAX_ST_QUIESCING) + return -ESHUTDOWN; + return 0; +} + + +static inline +void __wimax_state_set(struct wimax_dev *wimax_dev, enum wimax_st state) +{ + wimax_dev->state = state; +} +extern void __wimax_state_change(struct wimax_dev *, enum wimax_st); + +#ifdef CONFIG_DEBUG_FS +extern int wimax_debugfs_add(struct wimax_dev *); +extern void wimax_debugfs_rm(struct wimax_dev *); +#else +static inline int wimax_debugfs_add(struct wimax_dev *wimax_dev) +{ + return 0; +} +static inline void wimax_debugfs_rm(struct wimax_dev *wimax_dev) {} +#endif + +extern void wimax_id_table_add(struct wimax_dev *); +extern struct wimax_dev *wimax_dev_get_by_genl_info(struct genl_info *, int); +extern void wimax_id_table_rm(struct wimax_dev *); +extern void wimax_id_table_release(void); + +extern int wimax_rfkill_add(struct wimax_dev *); +extern void wimax_rfkill_rm(struct wimax_dev *); + +extern struct genl_family wimax_gnl_family; +extern struct genl_multicast_group wimax_gnl_mcg; + +#endif /* #ifdef __KERNEL__ */ +#endif /* #ifndef __WIMAX_INTERNAL_H__ */ diff --git a/net/wireless/.gitignore b/net/wireless/.gitignore new file mode 100644 index 00000000..c33451b8 --- /dev/null +++ b/net/wireless/.gitignore @@ -0,0 +1 @@ +regdb.c diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig new file mode 100644 index 00000000..39caddcd --- /dev/null +++ b/net/wireless/Kconfig @@ -0,0 +1,184 @@ +config WIRELESS_EXT + bool "WIRELESS_EXT - Wireless extention interface " + depends on WIRELESS + help + if your wifi module is build outside this kernel source tree . + please choose this option by your self . + +config WEXT_CORE + def_bool y + depends on CFG80211_WEXT || WIRELESS_EXT + +config WEXT_PROC + def_bool y + depends on PROC_FS + depends on WEXT_CORE + +config WEXT_SPY + bool "WEXT_SPY - " + depends on WIRELESS_EXT + help + if your wifi module is build outside this kernel source tree . + please choose this option by your self . + +config WEXT_PRIV + bool "WEXT_PRIV - " + depends on WIRELESS_EXT + help + if your wifi module is build outside this kernel source tree . 
+ please choose this option by your self . + +config CFG80211 + tristate "cfg80211 - wireless configuration API" + depends on RFKILL || !RFKILL + ---help--- + cfg80211 is the Linux wireless LAN (802.11) configuration API. + Enable this if you have a wireless device. + + For more information refer to documentation on the wireless wiki: + + http://wireless.kernel.org/en/developers/Documentation/cfg80211 + + When built as a module it will be called cfg80211. + +config NL80211_TESTMODE + bool "nl80211 testmode command" + depends on CFG80211 + help + The nl80211 testmode command helps implementing things like + factory calibration or validation tools for wireless chips. + + Select this option ONLY for kernels that are specifically + built for such purposes. + + Debugging tools that are supposed to end up in the hands of + users should better be implemented with debugfs. + + Say N. + +config CFG80211_DEVELOPER_WARNINGS + bool "enable developer warnings" + depends on CFG80211 + default n + help + This option enables some additional warnings that help + cfg80211 developers and driver developers, but that can + trigger due to races with userspace. + + For example, when a driver reports that it was disconnected + from the AP, but the user disconnects manually at the same + time, the warning might trigger spuriously due to races. + + Say Y only if you are developing cfg80211 or a driver based + on it (or mac80211). + + +config CFG80211_REG_DEBUG + bool "cfg80211 regulatory debugging" + depends on CFG80211 + default n + ---help--- + You can enable this if you want to debug regulatory changes. + For more information on cfg80211 regulatory refer to the wireless + wiki: + + http://wireless.kernel.org/en/developers/Regulatory + + If unsure, say N. + +config CFG80211_DEFAULT_PS + bool "enable powersave by default" + depends on CFG80211 + default y + help + This option enables powersave mode by default. + + If this causes your applications to misbehave you should fix your + applications instead -- they need to register their network + latency requirement, see Documentation/power/pm_qos_interface.txt. + +config CFG80211_DEBUGFS + bool "cfg80211 DebugFS entries" + depends on CFG80211 + depends on DEBUG_FS + ---help--- + You can enable this if you want to debugfs entries for cfg80211. + + If unsure, say N. + +config CFG80211_INTERNAL_REGDB + bool "use statically compiled regulatory rules database" if EXPERT + default n + depends on CFG80211 + ---help--- + This option generates an internal data structure representing + the wireless regulatory rules described in net/wireless/db.txt + and includes code to query that database. This is an alternative + to using CRDA for defining regulatory rules for the kernel. + + For details see: + + http://wireless.kernel.org/en/developers/Regulatory + + Most distributions have a CRDA package. So if unsure, say N. + +config CFG80211_WEXT + bool "cfg80211 wireless extensions compatibility" + depends on CFG80211 + select WEXT_CORE + default y + help + Enable this option if you need old userspace for wireless + extensions with cfg80211-based drivers. + +config WIRELESS_EXT_SYSFS + bool "Wireless extensions sysfs files" + default y + depends on WEXT_CORE && SYSFS + help + This option enables the deprecated wireless statistics + files in /sys/class/net/*/wireless/. The same information + is available via the ioctls as well. + + Say Y if you have programs using it, like old versions of + hal. 
+ +config LIB80211 + tristate "Common routines for IEEE802.11 drivers" + default n + help + This options enables a library of common routines used + by IEEE802.11 wireless LAN drivers. + + Drivers should select this themselves if needed. Say Y if + you want this built into your kernel. + +config LIB80211_CRYPT_WEP + tristate + +config LIB80211_CRYPT_CCMP + tristate + +config LIB80211_CRYPT_TKIP + tristate + +config LIB80211_DEBUG + bool "lib80211 debugging messages" + depends on LIB80211 + default n + ---help--- + You can enable this if you want verbose debugging messages + from lib80211. + + If unsure, say N. + +config CFG80211_ALLOW_RECONNECT + bool "Allow reconnect while already connected" + depends on CFG80211 + default n + help + cfg80211 stack doesn't allow to connect if you are already + connected. This option allows to make a connection in this case. + + Select this option ONLY for wlan drivers that are specifically + built for such purposes. diff --git a/net/wireless/Makefile b/net/wireless/Makefile new file mode 100644 index 00000000..55a28ab2 --- /dev/null +++ b/net/wireless/Makefile @@ -0,0 +1,23 @@ +obj-$(CONFIG_CFG80211) += cfg80211.o +obj-$(CONFIG_LIB80211) += lib80211.o +obj-$(CONFIG_LIB80211_CRYPT_WEP) += lib80211_crypt_wep.o +obj-$(CONFIG_LIB80211_CRYPT_CCMP) += lib80211_crypt_ccmp.o +obj-$(CONFIG_LIB80211_CRYPT_TKIP) += lib80211_crypt_tkip.o + +obj-$(CONFIG_WEXT_CORE) += wext-core.o +obj-$(CONFIG_WEXT_PROC) += wext-proc.o +obj-$(CONFIG_WEXT_SPY) += wext-spy.o +obj-$(CONFIG_WEXT_PRIV) += wext-priv.o + +cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o nl80211.o +cfg80211-y += mlme.o ibss.o sme.o chan.o ethtool.o mesh.o +cfg80211-$(CONFIG_CFG80211_DEBUGFS) += debugfs.o +cfg80211-$(CONFIG_CFG80211_WEXT) += wext-compat.o wext-sme.o +cfg80211-$(CONFIG_CFG80211_INTERNAL_REGDB) += regdb.o + +ccflags-y += -D__CHECK_ENDIAN__ + +$(obj)/regdb.c: $(src)/db.txt $(src)/genregdb.awk + @$(AWK) -f $(srctree)/$(src)/genregdb.awk < $< > $@ + +clean-files := regdb.c diff --git a/net/wireless/chan.c b/net/wireless/chan.c new file mode 100644 index 00000000..17cd0c04 --- /dev/null +++ b/net/wireless/chan.c @@ -0,0 +1,135 @@ +/* + * This file contains helper code to handle channel + * settings and keeping track of what is possible at + * any point in time. 
+ * + * Copyright 2009 Johannes Berg + */ + +#include +#include "core.h" + +struct ieee80211_channel * +rdev_freq_to_chan(struct cfg80211_registered_device *rdev, + int freq, enum nl80211_channel_type channel_type) +{ + struct ieee80211_channel *chan; + struct ieee80211_sta_ht_cap *ht_cap; + + chan = ieee80211_get_channel(&rdev->wiphy, freq); + + /* Primary channel not allowed */ + if (!chan || chan->flags & IEEE80211_CHAN_DISABLED) + return NULL; + + if (channel_type == NL80211_CHAN_HT40MINUS && + chan->flags & IEEE80211_CHAN_NO_HT40MINUS) + return NULL; + else if (channel_type == NL80211_CHAN_HT40PLUS && + chan->flags & IEEE80211_CHAN_NO_HT40PLUS) + return NULL; + + ht_cap = &rdev->wiphy.bands[chan->band]->ht_cap; + + if (channel_type != NL80211_CHAN_NO_HT) { + if (!ht_cap->ht_supported) + return NULL; + + if (channel_type != NL80211_CHAN_HT20 && + (!(ht_cap->cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) || + ht_cap->cap & IEEE80211_HT_CAP_40MHZ_INTOLERANT)) + return NULL; + } + + return chan; +} + +static bool can_beacon_sec_chan(struct wiphy *wiphy, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type) +{ + struct ieee80211_channel *sec_chan; + int diff; + + switch (channel_type) { + case NL80211_CHAN_HT40PLUS: + diff = 20; + break; + case NL80211_CHAN_HT40MINUS: + diff = -20; + break; + default: + return false; + } + + sec_chan = ieee80211_get_channel(wiphy, chan->center_freq + diff); + if (!sec_chan) + return false; + + /* we'll need a DFS capability later */ + if (sec_chan->flags & (IEEE80211_CHAN_DISABLED | + IEEE80211_CHAN_PASSIVE_SCAN | + IEEE80211_CHAN_NO_IBSS | + IEEE80211_CHAN_RADAR)) + return false; + + return true; +} + +int cfg80211_set_freq(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, int freq, + enum nl80211_channel_type channel_type) +{ + struct ieee80211_channel *chan; + int result; + + if (wdev && wdev->iftype == NL80211_IFTYPE_MONITOR) + wdev = NULL; + + if (wdev) { + ASSERT_WDEV_LOCK(wdev); + + if (!netif_running(wdev->netdev)) + return -ENETDOWN; + } + + if (!rdev->ops->set_channel) + return -EOPNOTSUPP; + + chan = rdev_freq_to_chan(rdev, freq, channel_type); + if (!chan) + return -EINVAL; + + /* Both channels should be able to initiate communication */ + if (wdev && (wdev->iftype == NL80211_IFTYPE_ADHOC || + wdev->iftype == NL80211_IFTYPE_AP || + wdev->iftype == NL80211_IFTYPE_AP_VLAN || + wdev->iftype == NL80211_IFTYPE_MESH_POINT || + wdev->iftype == NL80211_IFTYPE_P2P_GO)) { + switch (channel_type) { + case NL80211_CHAN_HT40PLUS: + case NL80211_CHAN_HT40MINUS: + if (!can_beacon_sec_chan(&rdev->wiphy, chan, + channel_type)) { + printk(KERN_DEBUG + "cfg80211: Secondary channel not " + "allowed to initiate communication\n"); + return -EINVAL; + } + break; + default: + break; + } + } + + result = rdev->ops->set_channel(&rdev->wiphy, + wdev ? wdev->netdev : NULL, + chan, channel_type); + if (result) + return result; + + if (wdev) + wdev->channel = chan; + + return 0; +} diff --git a/net/wireless/core.c b/net/wireless/core.c new file mode 100644 index 00000000..880dbe2e --- /dev/null +++ b/net/wireless/core.c @@ -0,0 +1,1056 @@ +/* + * This is the linux wireless configuration interface. 
+ * + * Copyright 2006-2010 Johannes Berg + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nl80211.h" +#include "core.h" +#include "sysfs.h" +#include "debugfs.h" +#include "wext-compat.h" +#include "ethtool.h" + +/* name for sysfs, %d is appended */ +#define PHY_NAME "phy" + +MODULE_AUTHOR("Johannes Berg"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("wireless configuration support"); + +/* RCU-protected (and cfg80211_mutex for writers) */ +LIST_HEAD(cfg80211_rdev_list); +int cfg80211_rdev_list_generation; + +DEFINE_MUTEX(cfg80211_mutex); + +/* for debugfs */ +static struct dentry *ieee80211_debugfs_dir; + +/* for the cleanup, scan and event works */ +struct workqueue_struct *cfg80211_wq; + +static bool cfg80211_disable_40mhz_24ghz; +module_param(cfg80211_disable_40mhz_24ghz, bool, 0644); +MODULE_PARM_DESC(cfg80211_disable_40mhz_24ghz, + "Disable 40MHz support in the 2.4GHz band"); + +/* requires cfg80211_mutex to be held! */ +struct cfg80211_registered_device *cfg80211_rdev_by_wiphy_idx(int wiphy_idx) +{ + struct cfg80211_registered_device *result = NULL, *rdev; + + if (!wiphy_idx_valid(wiphy_idx)) + return NULL; + + assert_cfg80211_lock(); + + list_for_each_entry(rdev, &cfg80211_rdev_list, list) { + if (rdev->wiphy_idx == wiphy_idx) { + result = rdev; + break; + } + } + + return result; +} + +int get_wiphy_idx(struct wiphy *wiphy) +{ + struct cfg80211_registered_device *rdev; + if (!wiphy) + return WIPHY_IDX_STALE; + rdev = wiphy_to_dev(wiphy); + return rdev->wiphy_idx; +} + +/* requires cfg80211_rdev_mutex to be held! */ +struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx) +{ + struct cfg80211_registered_device *rdev; + + if (!wiphy_idx_valid(wiphy_idx)) + return NULL; + + assert_cfg80211_lock(); + + rdev = cfg80211_rdev_by_wiphy_idx(wiphy_idx); + if (!rdev) + return NULL; + return &rdev->wiphy; +} + +/* requires cfg80211_mutex to be held! 
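+ * The result can be an ERR_PTR() and must be checked with IS_ERR().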
*/ +struct cfg80211_registered_device * +__cfg80211_rdev_from_info(struct genl_info *info) +{ + int ifindex; + struct cfg80211_registered_device *bywiphyidx = NULL, *byifidx = NULL; + struct net_device *dev; + int err = -EINVAL; + + assert_cfg80211_lock(); + + if (info->attrs[NL80211_ATTR_WIPHY]) { + bywiphyidx = cfg80211_rdev_by_wiphy_idx( + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY])); + err = -ENODEV; + } + + if (info->attrs[NL80211_ATTR_IFINDEX]) { + ifindex = nla_get_u32(info->attrs[NL80211_ATTR_IFINDEX]); + dev = dev_get_by_index(genl_info_net(info), ifindex); + if (dev) { + if (dev->ieee80211_ptr) + byifidx = + wiphy_to_dev(dev->ieee80211_ptr->wiphy); + dev_put(dev); + } + err = -ENODEV; + } + + if (bywiphyidx && byifidx) { + if (bywiphyidx != byifidx) + return ERR_PTR(-EINVAL); + else + return bywiphyidx; /* == byifidx */ + } + if (bywiphyidx) + return bywiphyidx; + + if (byifidx) + return byifidx; + + return ERR_PTR(err); +} + +struct cfg80211_registered_device * +cfg80211_get_dev_from_info(struct genl_info *info) +{ + struct cfg80211_registered_device *rdev; + + mutex_lock(&cfg80211_mutex); + rdev = __cfg80211_rdev_from_info(info); + + /* if it is not an error we grab the lock on + * it to assure it won't be going away while + * we operate on it */ + if (!IS_ERR(rdev)) + mutex_lock(&rdev->mtx); + + mutex_unlock(&cfg80211_mutex); + + return rdev; +} + +struct cfg80211_registered_device * +cfg80211_get_dev_from_ifindex(struct net *net, int ifindex) +{ + struct cfg80211_registered_device *rdev = ERR_PTR(-ENODEV); + struct net_device *dev; + + mutex_lock(&cfg80211_mutex); + dev = dev_get_by_index(net, ifindex); + if (!dev) + goto out; + if (dev->ieee80211_ptr) { + rdev = wiphy_to_dev(dev->ieee80211_ptr->wiphy); + mutex_lock(&rdev->mtx); + } else + rdev = ERR_PTR(-ENODEV); + dev_put(dev); + out: + mutex_unlock(&cfg80211_mutex); + return rdev; +} + +/* requires cfg80211_mutex to be held */ +int cfg80211_dev_rename(struct cfg80211_registered_device *rdev, + char *newname) +{ + struct cfg80211_registered_device *rdev2; + int wiphy_idx, taken = -1, result, digits; + + assert_cfg80211_lock(); + + /* prohibit calling the thing phy%d when %d is not its number */ + sscanf(newname, PHY_NAME "%d%n", &wiphy_idx, &taken); + if (taken == strlen(newname) && wiphy_idx != rdev->wiphy_idx) { + /* count number of places needed to print wiphy_idx */ + digits = 1; + while (wiphy_idx /= 10) + digits++; + /* + * deny the name if it is phy where is printed + * without leading zeroes. taken == strlen(newname) here + */ + if (taken == strlen(PHY_NAME) + digits) + return -EINVAL; + } + + + /* Ignore nop renames */ + if (strcmp(newname, dev_name(&rdev->wiphy.dev)) == 0) + return 0; + + /* Ensure another device does not already have this name. 
*/ + list_for_each_entry(rdev2, &cfg80211_rdev_list, list) + if (strcmp(newname, dev_name(&rdev2->wiphy.dev)) == 0) + return -EINVAL; + + result = device_rename(&rdev->wiphy.dev, newname); + if (result) + return result; + + if (rdev->wiphy.debugfsdir && + !debugfs_rename(rdev->wiphy.debugfsdir->d_parent, + rdev->wiphy.debugfsdir, + rdev->wiphy.debugfsdir->d_parent, + newname)) + pr_err("failed to rename debugfs dir to %s!\n", newname); + + nl80211_notify_dev_rename(rdev); + + return 0; +} + +int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, + struct net *net) +{ + struct wireless_dev *wdev; + int err = 0; + + if (!(rdev->wiphy.flags & WIPHY_FLAG_NETNS_OK)) + return -EOPNOTSUPP; + + list_for_each_entry(wdev, &rdev->netdev_list, list) { + wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL; + err = dev_change_net_namespace(wdev->netdev, net, "wlan%d"); + if (err) + break; + wdev->netdev->features |= NETIF_F_NETNS_LOCAL; + } + + if (err) { + /* failed -- clean up to old netns */ + net = wiphy_net(&rdev->wiphy); + + list_for_each_entry_continue_reverse(wdev, &rdev->netdev_list, + list) { + wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL; + err = dev_change_net_namespace(wdev->netdev, net, + "wlan%d"); + WARN_ON(err); + wdev->netdev->features |= NETIF_F_NETNS_LOCAL; + } + + return err; + } + + wiphy_net_set(&rdev->wiphy, net); + + err = device_rename(&rdev->wiphy.dev, dev_name(&rdev->wiphy.dev)); + WARN_ON(err); + + return 0; +} + +static void cfg80211_rfkill_poll(struct rfkill *rfkill, void *data) +{ + struct cfg80211_registered_device *rdev = data; + + rdev->ops->rfkill_poll(&rdev->wiphy); +} + +static int cfg80211_rfkill_set_block(void *data, bool blocked) +{ + struct cfg80211_registered_device *rdev = data; + struct wireless_dev *wdev; + + if (!blocked) + return 0; + + rtnl_lock(); + mutex_lock(&rdev->devlist_mtx); + + list_for_each_entry(wdev, &rdev->netdev_list, list) + dev_close(wdev->netdev); + + mutex_unlock(&rdev->devlist_mtx); + rtnl_unlock(); + + return 0; +} + +static void cfg80211_rfkill_sync_work(struct work_struct *work) +{ + struct cfg80211_registered_device *rdev; + + rdev = container_of(work, struct cfg80211_registered_device, rfkill_sync); + cfg80211_rfkill_set_block(rdev, rfkill_blocked(rdev->rfkill)); +} + +static void cfg80211_event_work(struct work_struct *work) +{ + struct cfg80211_registered_device *rdev; + + rdev = container_of(work, struct cfg80211_registered_device, + event_work); + + rtnl_lock(); + cfg80211_lock_rdev(rdev); + + cfg80211_process_rdev_events(rdev); + cfg80211_unlock_rdev(rdev); + rtnl_unlock(); +} + +/* exported functions */ + +struct wiphy *wiphy_new(const struct cfg80211_ops *ops, int sizeof_priv) +{ + static int wiphy_counter; + + struct cfg80211_registered_device *rdev; + int alloc_size; + + WARN_ON(ops->add_key && (!ops->del_key || !ops->set_default_key)); + WARN_ON(ops->auth && (!ops->assoc || !ops->deauth || !ops->disassoc)); + WARN_ON(ops->connect && !ops->disconnect); + WARN_ON(ops->join_ibss && !ops->leave_ibss); + WARN_ON(ops->add_virtual_intf && !ops->del_virtual_intf); + WARN_ON(ops->add_station && !ops->del_station); + WARN_ON(ops->add_mpath && !ops->del_mpath); + WARN_ON(ops->join_mesh && !ops->leave_mesh); + + alloc_size = sizeof(*rdev) + sizeof_priv; + + rdev = kzalloc(alloc_size, GFP_KERNEL); + if (!rdev) + return NULL; + + rdev->ops = ops; + + mutex_lock(&cfg80211_mutex); + + rdev->wiphy_idx = wiphy_counter++; + + if (unlikely(!wiphy_idx_valid(rdev->wiphy_idx))) { + wiphy_counter--; + mutex_unlock(&cfg80211_mutex); + /* 
ugh, wrapped! */ + kfree(rdev); + return NULL; + } + + mutex_unlock(&cfg80211_mutex); + + /* give it a proper name */ + dev_set_name(&rdev->wiphy.dev, PHY_NAME "%d", rdev->wiphy_idx); + + mutex_init(&rdev->mtx); + mutex_init(&rdev->devlist_mtx); + mutex_init(&rdev->sched_scan_mtx); + INIT_LIST_HEAD(&rdev->netdev_list); + spin_lock_init(&rdev->bss_lock); + INIT_LIST_HEAD(&rdev->bss_list); + INIT_WORK(&rdev->scan_done_wk, __cfg80211_scan_done); + INIT_WORK(&rdev->sched_scan_results_wk, __cfg80211_sched_scan_results); +#ifdef CONFIG_CFG80211_WEXT + rdev->wiphy.wext = &cfg80211_wext_handler; +#endif + + device_initialize(&rdev->wiphy.dev); + rdev->wiphy.dev.class = &ieee80211_class; + rdev->wiphy.dev.platform_data = rdev; + +#ifdef CONFIG_CFG80211_DEFAULT_PS + rdev->wiphy.flags |= WIPHY_FLAG_PS_ON_BY_DEFAULT; +#endif + + wiphy_net_set(&rdev->wiphy, &init_net); + + rdev->rfkill_ops.set_block = cfg80211_rfkill_set_block; + rdev->rfkill = rfkill_alloc(dev_name(&rdev->wiphy.dev), + &rdev->wiphy.dev, RFKILL_TYPE_WLAN, + &rdev->rfkill_ops, rdev); + + if (!rdev->rfkill) { + kfree(rdev); + return NULL; + } + + INIT_WORK(&rdev->rfkill_sync, cfg80211_rfkill_sync_work); + INIT_WORK(&rdev->conn_work, cfg80211_conn_work); + INIT_WORK(&rdev->event_work, cfg80211_event_work); + + init_waitqueue_head(&rdev->dev_wait); + + /* + * Initialize wiphy parameters to IEEE 802.11 MIB default values. + * Fragmentation and RTS threshold are disabled by default with the + * special -1 value. + */ + rdev->wiphy.retry_short = 7; + rdev->wiphy.retry_long = 4; + rdev->wiphy.frag_threshold = (u32) -1; + rdev->wiphy.rts_threshold = (u32) -1; + rdev->wiphy.coverage_class = 0; + + return &rdev->wiphy; +} +EXPORT_SYMBOL(wiphy_new); + +static int wiphy_verify_combinations(struct wiphy *wiphy) +{ + const struct ieee80211_iface_combination *c; + int i, j; + + /* If we have combinations enforce them */ + if (wiphy->n_iface_combinations) + wiphy->flags |= WIPHY_FLAG_ENFORCE_COMBINATIONS; + + for (i = 0; i < wiphy->n_iface_combinations; i++) { + u32 cnt = 0; + u16 all_iftypes = 0; + + c = &wiphy->iface_combinations[i]; + + /* Combinations with just one interface aren't real */ + if (WARN_ON(c->max_interfaces < 2)) + return -EINVAL; + + /* Need at least one channel */ + if (WARN_ON(!c->num_different_channels)) + return -EINVAL; + + if (WARN_ON(!c->n_limits)) + return -EINVAL; + + for (j = 0; j < c->n_limits; j++) { + u16 types = c->limits[j].types; + + /* + * interface types shouldn't overlap, this is + * used in cfg80211_can_change_interface() + */ + if (WARN_ON(types & all_iftypes)) + return -EINVAL; + all_iftypes |= types; + + if (WARN_ON(!c->limits[j].max)) + return -EINVAL; + + /* Shouldn't list software iftypes in combinations! */ + if (WARN_ON(wiphy->software_iftypes & types)) + return -EINVAL; + + cnt += c->limits[j].max; + /* + * Don't advertise an unsupported type + * in a combination. + */ + if (WARN_ON((wiphy->interface_modes & types) != types)) + return -EINVAL; + } + + /* You can't even choose that many! 
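+ * (the per-limit maxima must sum to at least max_interfaces)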
*/ + if (WARN_ON(cnt < c->max_interfaces)) + return -EINVAL; + } + + return 0; +} + +int wiphy_register(struct wiphy *wiphy) +{ + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + int res; + enum ieee80211_band band; + struct ieee80211_supported_band *sband; + bool have_band = false; + int i; + u16 ifmodes = wiphy->interface_modes; + + if (WARN_ON(wiphy->addresses && !wiphy->n_addresses)) + return -EINVAL; + + if (WARN_ON(wiphy->addresses && + !is_zero_ether_addr(wiphy->perm_addr) && + memcmp(wiphy->perm_addr, wiphy->addresses[0].addr, + ETH_ALEN))) + return -EINVAL; + + if (wiphy->addresses) + memcpy(wiphy->perm_addr, wiphy->addresses[0].addr, ETH_ALEN); + + /* sanity check ifmodes */ + WARN_ON(!ifmodes); + ifmodes &= ((1 << NUM_NL80211_IFTYPES) - 1) & ~1; + if (WARN_ON(ifmodes != wiphy->interface_modes)) + wiphy->interface_modes = ifmodes; + + res = wiphy_verify_combinations(wiphy); + if (res) + return res; + + /* sanity check supported bands/channels */ + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + sband = wiphy->bands[band]; + if (!sband) + continue; + + sband->band = band; + + if (WARN_ON(!sband->n_channels || !sband->n_bitrates)) + return -EINVAL; + + /* + * Since cfg80211_disable_40mhz_24ghz is global, we can + * modify the sband's ht data even if the driver uses a + * global structure for that. + */ + if (cfg80211_disable_40mhz_24ghz && + band == IEEE80211_BAND_2GHZ && + sband->ht_cap.ht_supported) { + sband->ht_cap.cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; + sband->ht_cap.cap &= ~IEEE80211_HT_CAP_SGI_40; + } + + /* + * Since we use a u32 for rate bitmaps in + * ieee80211_get_response_rate, we cannot + * have more than 32 legacy rates. + */ + if (WARN_ON(sband->n_bitrates > 32)) + return -EINVAL; + + for (i = 0; i < sband->n_channels; i++) { + sband->channels[i].orig_flags = + sband->channels[i].flags; + sband->channels[i].orig_mag = + sband->channels[i].max_antenna_gain; + sband->channels[i].orig_mpwr = + sband->channels[i].max_power; + sband->channels[i].band = band; + } + + have_band = true; + } + + if (!have_band) { + WARN_ON(1); + return -EINVAL; + } + + if (rdev->wiphy.wowlan.n_patterns) { + if (WARN_ON(!rdev->wiphy.wowlan.pattern_min_len || + rdev->wiphy.wowlan.pattern_min_len > + rdev->wiphy.wowlan.pattern_max_len)) + return -EINVAL; + } + + /* check and set up bitrates */ + ieee80211_set_bitrate_flags(wiphy); + + mutex_lock(&cfg80211_mutex); + + res = device_add(&rdev->wiphy.dev); + if (res) { + mutex_unlock(&cfg80211_mutex); + return res; + } + + /* set up regulatory info */ + wiphy_update_regulatory(wiphy, NL80211_REGDOM_SET_BY_CORE); + + list_add_rcu(&rdev->list, &cfg80211_rdev_list); + cfg80211_rdev_list_generation++; + + /* add to debugfs */ + rdev->wiphy.debugfsdir = + debugfs_create_dir(wiphy_name(&rdev->wiphy), + ieee80211_debugfs_dir); + if (IS_ERR(rdev->wiphy.debugfsdir)) + rdev->wiphy.debugfsdir = NULL; + + if (wiphy->flags & WIPHY_FLAG_CUSTOM_REGULATORY) { + struct regulatory_request request; + + request.wiphy_idx = get_wiphy_idx(wiphy); + request.initiator = NL80211_REGDOM_SET_BY_DRIVER; + request.alpha2[0] = '9'; + request.alpha2[1] = '9'; + + nl80211_send_reg_change_event(&request); + } + + cfg80211_debugfs_rdev_add(rdev); + mutex_unlock(&cfg80211_mutex); + + /* + * due to a locking dependency this has to be outside of the + * cfg80211_mutex lock + */ + res = rfkill_register(rdev->rfkill); + if (res) + goto out_rm_dev; + + return 0; + +out_rm_dev: + device_del(&rdev->wiphy.dev); + return res; +} +EXPORT_SYMBOL(wiphy_register); + 
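The allocate/register pair above and the unregister/free pair below form the driver-facing lifecycle. A minimal sketch of how a hypothetical driver might drive it; my_cfg80211_ops, my_band_2ghz and struct my_priv are placeholders invented here, while wiphy_new(), wiphy_priv(), wiphy_register(), wiphy_unregister() and wiphy_free() are the real entry points:

#include <net/cfg80211.h>

struct my_priv { int dummy; };				/* hypothetical driver state */
static const struct cfg80211_ops my_cfg80211_ops;	/* a real driver fills this in */
static struct ieee80211_supported_band my_band_2ghz;	/* a real driver must populate
							 * channels and bitrates;
							 * wiphy_register() rejects
							 * empty bands */

static int my_driver_probe(void)
{
	struct wiphy *wiphy;
	struct my_priv *priv;
	int err;

	/* one allocation covers the wiphy and the driver private area */
	wiphy = wiphy_new(&my_cfg80211_ops, sizeof(*priv));
	if (!wiphy)
		return -ENOMEM;
	priv = wiphy_priv(wiphy);
	priv->dummy = 0;	/* driver would initialise its state here */

	/* advertise capabilities before registering; wiphy_register()
	 * sanity-checks bands, interface modes and combinations */
	wiphy->interface_modes = BIT(NL80211_IFTYPE_STATION);
	wiphy->bands[IEEE80211_BAND_2GHZ] = &my_band_2ghz;

	err = wiphy_register(wiphy);
	if (err) {
		wiphy_free(wiphy);
		return err;
	}
	return 0;
}

static void my_driver_remove(struct wiphy *wiphy)
{
	wiphy_unregister(wiphy);	/* waits until no interfaces remain open */
	wiphy_free(wiphy);		/* drops the final device reference */
}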
+void wiphy_rfkill_start_polling(struct wiphy *wiphy) +{ + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + if (!rdev->ops->rfkill_poll) + return; + rdev->rfkill_ops.poll = cfg80211_rfkill_poll; + rfkill_resume_polling(rdev->rfkill); +} +EXPORT_SYMBOL(wiphy_rfkill_start_polling); + +void wiphy_rfkill_stop_polling(struct wiphy *wiphy) +{ + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + rfkill_pause_polling(rdev->rfkill); +} +EXPORT_SYMBOL(wiphy_rfkill_stop_polling); + +void wiphy_unregister(struct wiphy *wiphy) +{ + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + rfkill_unregister(rdev->rfkill); + + /* protect the device list */ + mutex_lock(&cfg80211_mutex); + + wait_event(rdev->dev_wait, ({ + int __count; + mutex_lock(&rdev->devlist_mtx); + __count = rdev->opencount; + mutex_unlock(&rdev->devlist_mtx); + __count == 0;})); + + mutex_lock(&rdev->devlist_mtx); + BUG_ON(!list_empty(&rdev->netdev_list)); + mutex_unlock(&rdev->devlist_mtx); + + /* + * First remove the hardware from everywhere, this makes + * it impossible to find from userspace. + */ + debugfs_remove_recursive(rdev->wiphy.debugfsdir); + list_del_rcu(&rdev->list); + synchronize_rcu(); + + /* + * Try to grab rdev->mtx. If a command is still in progress, + * hopefully the driver will refuse it since it's tearing + * down the device already. We wait for this command to complete + * before unlinking the item from the list. + * Note: as codified by the BUG_ON above we cannot get here if + * a virtual interface is still present. Hence, we can only get + * to lock contention here if userspace issues a command that + * identified the hardware by wiphy index. + */ + cfg80211_lock_rdev(rdev); + /* nothing */ + cfg80211_unlock_rdev(rdev); + + /* If this device got a regulatory hint tell core its + * free to listen now to a new shiny device regulatory hint */ + reg_device_remove(wiphy); + + cfg80211_rdev_list_generation++; + device_del(&rdev->wiphy.dev); + + mutex_unlock(&cfg80211_mutex); + + flush_work(&rdev->scan_done_wk); + cancel_work_sync(&rdev->conn_work); + flush_work(&rdev->event_work); +} +EXPORT_SYMBOL(wiphy_unregister); + +void cfg80211_dev_free(struct cfg80211_registered_device *rdev) +{ + struct cfg80211_internal_bss *scan, *tmp; + rfkill_destroy(rdev->rfkill); + mutex_destroy(&rdev->mtx); + mutex_destroy(&rdev->devlist_mtx); + mutex_destroy(&rdev->sched_scan_mtx); + list_for_each_entry_safe(scan, tmp, &rdev->bss_list, list) + cfg80211_put_bss(&scan->pub); + cfg80211_rdev_free_wowlan(rdev); + kfree(rdev); +} + +void wiphy_free(struct wiphy *wiphy) +{ + put_device(&wiphy->dev); +} +EXPORT_SYMBOL(wiphy_free); + +void wiphy_rfkill_set_hw_state(struct wiphy *wiphy, bool blocked) +{ + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + if (rfkill_set_hw_state(rdev->rfkill, blocked)) + schedule_work(&rdev->rfkill_sync); +} +EXPORT_SYMBOL(wiphy_rfkill_set_hw_state); + +static void wdev_cleanup_work(struct work_struct *work) +{ + struct wireless_dev *wdev; + struct cfg80211_registered_device *rdev; + + wdev = container_of(work, struct wireless_dev, cleanup_work); + rdev = wiphy_to_dev(wdev->wiphy); + + cfg80211_lock_rdev(rdev); + + if (WARN_ON(rdev->scan_req && rdev->scan_req->dev == wdev->netdev)) { + rdev->scan_req->aborted = true; + ___cfg80211_scan_done(rdev, true); + } + + cfg80211_unlock_rdev(rdev); + + mutex_lock(&rdev->sched_scan_mtx); + + if (WARN_ON(rdev->sched_scan_req && + rdev->sched_scan_req->dev == wdev->netdev)) { + __cfg80211_stop_sched_scan(rdev, 
false); + } + + mutex_unlock(&rdev->sched_scan_mtx); + + mutex_lock(&rdev->devlist_mtx); + rdev->opencount--; + mutex_unlock(&rdev->devlist_mtx); + wake_up(&rdev->dev_wait); + + dev_put(wdev->netdev); +} + +static struct device_type wiphy_type = { + .name = "wlan", +}; + +static int cfg80211_netdev_notifier_call(struct notifier_block * nb, + unsigned long state, + void *ndev) +{ + struct net_device *dev = ndev; + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev; + int ret; + + if (!wdev) + return NOTIFY_DONE; + + rdev = wiphy_to_dev(wdev->wiphy); + + WARN_ON(wdev->iftype == NL80211_IFTYPE_UNSPECIFIED); + + switch (state) { + case NETDEV_POST_INIT: + SET_NETDEV_DEVTYPE(dev, &wiphy_type); + break; + case NETDEV_REGISTER: + /* + * NB: cannot take rdev->mtx here because this may be + * called within code protected by it when interfaces + * are added with nl80211. + */ + mutex_init(&wdev->mtx); + INIT_WORK(&wdev->cleanup_work, wdev_cleanup_work); + INIT_LIST_HEAD(&wdev->event_list); + spin_lock_init(&wdev->event_lock); + INIT_LIST_HEAD(&wdev->mgmt_registrations); + spin_lock_init(&wdev->mgmt_registrations_lock); + + mutex_lock(&rdev->devlist_mtx); + list_add_rcu(&wdev->list, &rdev->netdev_list); + rdev->devlist_generation++; + /* can only change netns with wiphy */ + dev->features |= NETIF_F_NETNS_LOCAL; + + if (sysfs_create_link(&dev->dev.kobj, &rdev->wiphy.dev.kobj, + "phy80211")) { + pr_err("failed to add phy80211 symlink to netdev!\n"); + } + wdev->netdev = dev; + wdev->sme_state = CFG80211_SME_IDLE; + mutex_unlock(&rdev->devlist_mtx); +#ifdef CONFIG_CFG80211_WEXT + wdev->wext.default_key = -1; + wdev->wext.default_mgmt_key = -1; + wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC; +#endif + + if (wdev->wiphy->flags & WIPHY_FLAG_PS_ON_BY_DEFAULT) + wdev->ps = true; + else + wdev->ps = false; + /* allow mac80211 to determine the timeout */ + wdev->ps_timeout = -1; + + if (!dev->ethtool_ops) + dev->ethtool_ops = &cfg80211_ethtool_ops; + + if ((wdev->iftype == NL80211_IFTYPE_STATION || + wdev->iftype == NL80211_IFTYPE_P2P_CLIENT || + wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr) + dev->priv_flags |= IFF_DONT_BRIDGE; + break; + case NETDEV_GOING_DOWN: + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + cfg80211_leave_ibss(rdev, dev, true); + break; + case NL80211_IFTYPE_P2P_CLIENT: + case NL80211_IFTYPE_STATION: + mutex_lock(&rdev->sched_scan_mtx); + __cfg80211_stop_sched_scan(rdev, false); + mutex_unlock(&rdev->sched_scan_mtx); + + wdev_lock(wdev); +#ifdef CONFIG_CFG80211_WEXT + kfree(wdev->wext.ie); + wdev->wext.ie = NULL; + wdev->wext.ie_len = 0; + wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC; +#endif + __cfg80211_disconnect(rdev, dev, + WLAN_REASON_DEAUTH_LEAVING, true); + cfg80211_mlme_down(rdev, dev); + wdev_unlock(wdev); + break; + case NL80211_IFTYPE_MESH_POINT: + cfg80211_leave_mesh(rdev, dev); + break; + default: + break; + } + wdev->beacon_interval = 0; + break; + case NETDEV_DOWN: + dev_hold(dev); + queue_work(cfg80211_wq, &wdev->cleanup_work); + break; + case NETDEV_UP: + /* + * If we have a really quick DOWN/UP succession we may + * have this work still pending ... cancel it and see + * if it was pending, in which case we need to account + * for some of the work it would have done. 
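+ * (namely: drop the opencount reference and the netdev hold taken at
+ * NETDEV_DOWN, which is exactly what the branch below does)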
+ */ + if (cancel_work_sync(&wdev->cleanup_work)) { + mutex_lock(&rdev->devlist_mtx); + rdev->opencount--; + mutex_unlock(&rdev->devlist_mtx); + dev_put(dev); + } + cfg80211_lock_rdev(rdev); + mutex_lock(&rdev->devlist_mtx); + wdev_lock(wdev); + switch (wdev->iftype) { +#ifdef CONFIG_CFG80211_WEXT + case NL80211_IFTYPE_ADHOC: + cfg80211_ibss_wext_join(rdev, wdev); + break; + case NL80211_IFTYPE_STATION: + cfg80211_mgd_wext_connect(rdev, wdev); + break; +#endif +#ifdef CONFIG_MAC80211_MESH + case NL80211_IFTYPE_MESH_POINT: + { + /* backward compat code... */ + struct mesh_setup setup; + memcpy(&setup, &default_mesh_setup, + sizeof(setup)); + /* back compat only needed for mesh_id */ + setup.mesh_id = wdev->ssid; + setup.mesh_id_len = wdev->mesh_id_up_len; + if (wdev->mesh_id_up_len) + __cfg80211_join_mesh(rdev, dev, + &setup, + &default_mesh_config); + break; + } +#endif + default: + break; + } + wdev_unlock(wdev); + rdev->opencount++; + mutex_unlock(&rdev->devlist_mtx); + cfg80211_unlock_rdev(rdev); + + /* + * Configure power management to the driver here so that its + * correctly set also after interface type changes etc. + */ + if (wdev->iftype == NL80211_IFTYPE_STATION && + rdev->ops->set_power_mgmt) + if (rdev->ops->set_power_mgmt(wdev->wiphy, dev, + wdev->ps, + wdev->ps_timeout)) { + /* assume this means it's off */ + wdev->ps = false; + } + break; + case NETDEV_UNREGISTER: + /* + * NB: cannot take rdev->mtx here because this may be + * called within code protected by it when interfaces + * are removed with nl80211. + */ + mutex_lock(&rdev->devlist_mtx); + /* + * It is possible to get NETDEV_UNREGISTER + * multiple times. To detect that, check + * that the interface is still on the list + * of registered interfaces, and only then + * remove and clean it up. + */ + if (!list_empty(&wdev->list)) { + sysfs_remove_link(&dev->dev.kobj, "phy80211"); + list_del_rcu(&wdev->list); + rdev->devlist_generation++; + cfg80211_mlme_purge_registrations(wdev); +#ifdef CONFIG_CFG80211_WEXT + kfree(wdev->wext.keys); +#endif + } + mutex_unlock(&rdev->devlist_mtx); + /* + * synchronise (so that we won't find this netdev + * from other code any more) and then clear the list + * head so that the above code can safely check for + * !list_empty() to avoid double-cleanup. 
+ */ + synchronize_rcu(); + INIT_LIST_HEAD(&wdev->list); + break; + case NETDEV_PRE_UP: + if (!(wdev->wiphy->interface_modes & BIT(wdev->iftype))) + return notifier_from_errno(-EOPNOTSUPP); + if (rfkill_blocked(rdev->rfkill)) + return notifier_from_errno(-ERFKILL); + ret = cfg80211_can_add_interface(rdev, wdev->iftype); + if (ret) + return notifier_from_errno(ret); + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block cfg80211_netdev_notifier = { + .notifier_call = cfg80211_netdev_notifier_call, +}; + +static void __net_exit cfg80211_pernet_exit(struct net *net) +{ + struct cfg80211_registered_device *rdev; + + rtnl_lock(); + mutex_lock(&cfg80211_mutex); + list_for_each_entry(rdev, &cfg80211_rdev_list, list) { + if (net_eq(wiphy_net(&rdev->wiphy), net)) + WARN_ON(cfg80211_switch_netns(rdev, &init_net)); + } + mutex_unlock(&cfg80211_mutex); + rtnl_unlock(); +} + +static struct pernet_operations cfg80211_pernet_ops = { + .exit = cfg80211_pernet_exit, +}; + +static int __init cfg80211_init(void) +{ + int err; + + err = register_pernet_device(&cfg80211_pernet_ops); + if (err) + goto out_fail_pernet; + + err = wiphy_sysfs_init(); + if (err) + goto out_fail_sysfs; + + err = register_netdevice_notifier(&cfg80211_netdev_notifier); + if (err) + goto out_fail_notifier; + + err = nl80211_init(); + if (err) + goto out_fail_nl80211; + + ieee80211_debugfs_dir = debugfs_create_dir("ieee80211", NULL); + + err = regulatory_init(); + if (err) + goto out_fail_reg; + + cfg80211_wq = create_singlethread_workqueue("cfg80211"); + if (!cfg80211_wq) + goto out_fail_wq; + + return 0; + +out_fail_wq: + regulatory_exit(); +out_fail_reg: + debugfs_remove(ieee80211_debugfs_dir); +out_fail_nl80211: + unregister_netdevice_notifier(&cfg80211_netdev_notifier); +out_fail_notifier: + wiphy_sysfs_exit(); +out_fail_sysfs: + unregister_pernet_device(&cfg80211_pernet_ops); +out_fail_pernet: + return err; +} +subsys_initcall(cfg80211_init); + +static void __exit cfg80211_exit(void) +{ + debugfs_remove(ieee80211_debugfs_dir); + nl80211_exit(); + unregister_netdevice_notifier(&cfg80211_netdev_notifier); + wiphy_sysfs_exit(); + regulatory_exit(); + unregister_pernet_device(&cfg80211_pernet_ops); + destroy_workqueue(cfg80211_wq); +} +module_exit(cfg80211_exit); diff --git a/net/wireless/core.h b/net/wireless/core.h new file mode 100644 index 00000000..a570ff92 --- /dev/null +++ b/net/wireless/core.h @@ -0,0 +1,464 @@ +/* + * Wireless configuration interface internals. + * + * Copyright 2006-2010 Johannes Berg + */ +#ifndef __NET_WIRELESS_CORE_H +#define __NET_WIRELESS_CORE_H +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "reg.h" + +struct cfg80211_registered_device { + const struct cfg80211_ops *ops; + struct list_head list; + /* we hold this mutex during any call so that + * we cannot do multiple calls at once, and also + * to avoid the deregister call to proceed while + * any call is in progress */ + struct mutex mtx; + + /* rfkill support */ + struct rfkill_ops rfkill_ops; + struct rfkill *rfkill; + struct work_struct rfkill_sync; + + /* ISO / IEC 3166 alpha2 for which this device is receiving + * country IEs on, this can help disregard country IEs from APs + * on the same alpha2 quickly. The alpha2 may differ from + * cfg80211_regdomain's alpha2 when an intersection has occurred. + * If the AP is reconfigured this can also be used to tell us if + * the country on the country IE changed. 
*/ + char country_ie_alpha2[2]; + + /* If a Country IE has been received this tells us the environment + * which its telling us its in. This defaults to ENVIRON_ANY */ + enum environment_cap env; + + /* wiphy index, internal only */ + int wiphy_idx; + + /* associate netdev list */ + struct mutex devlist_mtx; + /* protected by devlist_mtx or RCU */ + struct list_head netdev_list; + int devlist_generation; + int opencount; /* also protected by devlist_mtx */ + wait_queue_head_t dev_wait; + + /* BSSes/scanning */ + spinlock_t bss_lock; + struct list_head bss_list; + struct rb_root bss_tree; + u32 bss_generation; + struct cfg80211_scan_request *scan_req; /* protected by RTNL */ + struct cfg80211_sched_scan_request *sched_scan_req; + unsigned long suspend_at; + struct work_struct scan_done_wk; + struct work_struct sched_scan_results_wk; + + struct mutex sched_scan_mtx; + +#ifdef CONFIG_NL80211_TESTMODE + struct genl_info *testmode_info; +#endif + + struct work_struct conn_work; + struct work_struct event_work; + + struct cfg80211_wowlan *wowlan; + + /* must be last because of the way we do wiphy_priv(), + * and it should at least be aligned to NETDEV_ALIGN */ + struct wiphy wiphy __attribute__((__aligned__(NETDEV_ALIGN))); +}; + +static inline +struct cfg80211_registered_device *wiphy_to_dev(struct wiphy *wiphy) +{ + BUG_ON(!wiphy); + return container_of(wiphy, struct cfg80211_registered_device, wiphy); +} + +/* Note 0 is valid, hence phy0 */ +static inline +bool wiphy_idx_valid(int wiphy_idx) +{ + return wiphy_idx >= 0; +} + +static inline void +cfg80211_rdev_free_wowlan(struct cfg80211_registered_device *rdev) +{ + int i; + + if (!rdev->wowlan) + return; + for (i = 0; i < rdev->wowlan->n_patterns; i++) + kfree(rdev->wowlan->patterns[i].mask); + kfree(rdev->wowlan->patterns); + kfree(rdev->wowlan); +} + +extern struct workqueue_struct *cfg80211_wq; +extern struct mutex cfg80211_mutex; +extern struct list_head cfg80211_rdev_list; +extern int cfg80211_rdev_list_generation; + +static inline void assert_cfg80211_lock(void) +{ + lockdep_assert_held(&cfg80211_mutex); +} + +/* + * You can use this to mark a wiphy_idx as not having an associated wiphy. + * It guarantees cfg80211_rdev_by_wiphy_idx(wiphy_idx) will return NULL + */ +#define WIPHY_IDX_STALE -1 + +struct cfg80211_internal_bss { + struct list_head list; + struct rb_node rbn; + unsigned long ts; + struct kref ref; + atomic_t hold; + bool beacon_ies_allocated; + bool proberesp_ies_allocated; + + /* must be last because of priv member */ + struct cfg80211_bss pub; +}; + +static inline struct cfg80211_internal_bss *bss_from_pub(struct cfg80211_bss *pub) +{ + return container_of(pub, struct cfg80211_internal_bss, pub); +} + +static inline void cfg80211_ref_bss(struct cfg80211_internal_bss *bss) +{ + kref_get(&bss->ref); +} + +static inline void cfg80211_hold_bss(struct cfg80211_internal_bss *bss) +{ + atomic_inc(&bss->hold); +} + +static inline void cfg80211_unhold_bss(struct cfg80211_internal_bss *bss) +{ + int r = atomic_dec_return(&bss->hold); + WARN_ON(r < 0); +} + + +struct cfg80211_registered_device *cfg80211_rdev_by_wiphy_idx(int wiphy_idx); +int get_wiphy_idx(struct wiphy *wiphy); + +struct cfg80211_registered_device * +__cfg80211_rdev_from_info(struct genl_info *info); + +/* + * This function returns a pointer to the driver + * that the genl_info item that is passed refers to. + * If successful, it returns non-NULL and also locks + * the driver's mutex! 
+ * + * This means that you need to call cfg80211_unlock_rdev() + * before being allowed to acquire &cfg80211_mutex! + * + * This is necessary because we need to lock the global + * mutex to get an item off the list safely, and then + * we lock the rdev mutex so it doesn't go away under us. + * + * We don't want to keep cfg80211_mutex locked + * for all the time in order to allow requests on + * other interfaces to go through at the same time. + * + * The result of this can be a PTR_ERR and hence must + * be checked with IS_ERR() for errors. + */ +extern struct cfg80211_registered_device * +cfg80211_get_dev_from_info(struct genl_info *info); + +/* requires cfg80211_rdev_mutex to be held! */ +struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx); + +/* identical to cfg80211_get_dev_from_info but only operate on ifindex */ +extern struct cfg80211_registered_device * +cfg80211_get_dev_from_ifindex(struct net *net, int ifindex); + +int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, + struct net *net); + +static inline void cfg80211_lock_rdev(struct cfg80211_registered_device *rdev) +{ + mutex_lock(&rdev->mtx); +} + +static inline void cfg80211_unlock_rdev(struct cfg80211_registered_device *rdev) +{ + BUG_ON(IS_ERR(rdev) || !rdev); + mutex_unlock(&rdev->mtx); +} + +static inline void wdev_lock(struct wireless_dev *wdev) + __acquires(wdev) +{ + mutex_lock(&wdev->mtx); + __acquire(wdev->mtx); +} + +static inline void wdev_unlock(struct wireless_dev *wdev) + __releases(wdev) +{ + __release(wdev->mtx); + mutex_unlock(&wdev->mtx); +} + +#define ASSERT_RDEV_LOCK(rdev) lockdep_assert_held(&(rdev)->mtx) +#define ASSERT_WDEV_LOCK(wdev) lockdep_assert_held(&(wdev)->mtx) + +enum cfg80211_event_type { + EVENT_CONNECT_RESULT, + EVENT_ROAMED, + EVENT_DISCONNECTED, + EVENT_IBSS_JOINED, +}; + +struct cfg80211_event { + struct list_head list; + enum cfg80211_event_type type; + + union { + struct { + u8 bssid[ETH_ALEN]; + const u8 *req_ie; + const u8 *resp_ie; + size_t req_ie_len; + size_t resp_ie_len; + u16 status; + } cr; + struct { + struct ieee80211_channel *channel; + u8 bssid[ETH_ALEN]; + const u8 *req_ie; + const u8 *resp_ie; + size_t req_ie_len; + size_t resp_ie_len; + } rm; + struct { + const u8 *ie; + size_t ie_len; + u16 reason; + } dc; + struct { + u8 bssid[ETH_ALEN]; + } ij; + }; +}; + +struct cfg80211_cached_keys { + struct key_params params[6]; + u8 data[6][WLAN_MAX_KEY_LEN]; + int def, defmgmt; +}; + + +/* free object */ +extern void cfg80211_dev_free(struct cfg80211_registered_device *rdev); + +extern int cfg80211_dev_rename(struct cfg80211_registered_device *rdev, + char *newname); + +void ieee80211_set_bitrate_flags(struct wiphy *wiphy); +void wiphy_update_regulatory(struct wiphy *wiphy, + enum nl80211_reg_initiator setby); + +void cfg80211_bss_expire(struct cfg80211_registered_device *dev); +void cfg80211_bss_age(struct cfg80211_registered_device *dev, + unsigned long age_secs); + +/* IBSS */ +int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_ibss_params *params, + struct cfg80211_cached_keys *connkeys); +int cfg80211_join_ibss(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_ibss_params *params, + struct cfg80211_cached_keys *connkeys); +void cfg80211_clear_ibss(struct net_device *dev, bool nowext); +int __cfg80211_leave_ibss(struct cfg80211_registered_device *rdev, + struct net_device *dev, bool nowext); +int cfg80211_leave_ibss(struct cfg80211_registered_device *rdev, + struct 
net_device *dev, bool nowext); +void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid); +int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev); + +/* mesh */ +extern const struct mesh_config default_mesh_config; +extern const struct mesh_setup default_mesh_setup; +int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, + struct net_device *dev, + const struct mesh_setup *setup, + const struct mesh_config *conf); +int cfg80211_join_mesh(struct cfg80211_registered_device *rdev, + struct net_device *dev, + const struct mesh_setup *setup, + const struct mesh_config *conf); +int cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, + struct net_device *dev); + +/* MLME */ +int __cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct ieee80211_channel *chan, + enum nl80211_auth_type auth_type, + const u8 *bssid, + const u8 *ssid, int ssid_len, + const u8 *ie, int ie_len, + const u8 *key, int key_len, int key_idx, + bool local_state_change); +int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, + struct net_device *dev, struct ieee80211_channel *chan, + enum nl80211_auth_type auth_type, const u8 *bssid, + const u8 *ssid, int ssid_len, + const u8 *ie, int ie_len, + const u8 *key, int key_len, int key_idx, + bool local_state_change); +int __cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct ieee80211_channel *chan, + const u8 *bssid, const u8 *prev_bssid, + const u8 *ssid, int ssid_len, + const u8 *ie, int ie_len, bool use_mfp, + struct cfg80211_crypto_settings *crypt); +int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, + struct net_device *dev, struct ieee80211_channel *chan, + const u8 *bssid, const u8 *prev_bssid, + const u8 *ssid, int ssid_len, + const u8 *ie, int ie_len, bool use_mfp, + struct cfg80211_crypto_settings *crypt); +int __cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev, + struct net_device *dev, const u8 *bssid, + const u8 *ie, int ie_len, u16 reason, + bool local_state_change); +int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev, + struct net_device *dev, const u8 *bssid, + const u8 *ie, int ie_len, u16 reason, + bool local_state_change); +int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev, + struct net_device *dev, const u8 *bssid, + const u8 *ie, int ie_len, u16 reason, + bool local_state_change); +void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, + struct net_device *dev); +void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, + const u8 *req_ie, size_t req_ie_len, + const u8 *resp_ie, size_t resp_ie_len, + u16 status, bool wextev, + struct cfg80211_bss *bss); +int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid, + u16 frame_type, const u8 *match_data, + int match_len); +void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid); +void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev); +int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct ieee80211_channel *chan, bool offchan, + enum nl80211_channel_type channel_type, + bool channel_type_valid, unsigned int wait, + const u8 *buf, size_t len, u64 *cookie); + +/* SME */ +int __cfg80211_connect(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_connect_params *connect, + struct cfg80211_cached_keys *connkeys, + const u8 *prev_bssid); +int 
cfg80211_connect(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_connect_params *connect, + struct cfg80211_cached_keys *connkeys); +int __cfg80211_disconnect(struct cfg80211_registered_device *rdev, + struct net_device *dev, u16 reason, + bool wextev); +int cfg80211_disconnect(struct cfg80211_registered_device *rdev, + struct net_device *dev, u16 reason, + bool wextev); +void __cfg80211_roamed(struct wireless_dev *wdev, + struct ieee80211_channel *channel, + const u8 *bssid, + const u8 *req_ie, size_t req_ie_len, + const u8 *resp_ie, size_t resp_ie_len); +int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev); + +void cfg80211_conn_work(struct work_struct *work); +void cfg80211_sme_failed_assoc(struct wireless_dev *wdev); +bool cfg80211_sme_failed_reassoc(struct wireless_dev *wdev); + +/* internal helpers */ +int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, + struct key_params *params, int key_idx, + bool pairwise, const u8 *mac_addr); +void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, + size_t ie_len, u16 reason, bool from_ap); +void cfg80211_sme_scan_done(struct net_device *dev); +void cfg80211_sme_rx_auth(struct net_device *dev, const u8 *buf, size_t len); +void cfg80211_sme_disassoc(struct net_device *dev, int idx); +void __cfg80211_scan_done(struct work_struct *wk); +void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, bool leak); +void __cfg80211_sched_scan_results(struct work_struct *wk); +int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev, + bool driver_initiated); +void cfg80211_upload_connect_keys(struct wireless_dev *wdev); +int cfg80211_change_iface(struct cfg80211_registered_device *rdev, + struct net_device *dev, enum nl80211_iftype ntype, + u32 *flags, struct vif_params *params); +void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev); + +int cfg80211_can_change_interface(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + enum nl80211_iftype iftype); + +static inline int +cfg80211_can_add_interface(struct cfg80211_registered_device *rdev, + enum nl80211_iftype iftype) +{ + return cfg80211_can_change_interface(rdev, NULL, iftype); +} + +struct ieee80211_channel * +rdev_freq_to_chan(struct cfg80211_registered_device *rdev, + int freq, enum nl80211_channel_type channel_type); +int cfg80211_set_freq(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, int freq, + enum nl80211_channel_type channel_type); + +u16 cfg80211_calculate_bitrate(struct rate_info *rate); + +int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev, + u32 beacon_int); + +#ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS +#define CFG80211_DEV_WARN_ON(cond) WARN_ON(cond) +#else +/* + * Trick to enable using it as a condition, + * and also not give a warning when it's + * not used that way. + */ +#define CFG80211_DEV_WARN_ON(cond) ({bool __r = (cond); __r; }) +#endif + +#endif /* __NET_WIRELESS_CORE_H */ diff --git a/net/wireless/db.txt b/net/wireless/db.txt new file mode 100644 index 00000000..a2fc3a09 --- /dev/null +++ b/net/wireless/db.txt @@ -0,0 +1,17 @@ +# +# This file is a placeholder to prevent accidental build breakage if someone +# enables CONFIG_CFG80211_INTERNAL_REGDB. Almost no one actually needs to +# enable that build option. +# +# You should be using CRDA instead. 
It is even better if you use the CRDA +# package provided by your distribution, since they will probably keep it +# up-to-date on your behalf. +# +# If you _really_ intend to use CONFIG_CFG80211_INTERNAL_REGDB then you will +# need to replace this file with one containing appropriately formatted +# regulatory rules that cover the regulatory domains you will be using. Your +# best option is to extract the db.txt file from the wireless-regdb git +# repository: +# +# git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-regdb.git +# diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c new file mode 100644 index 00000000..39765bcf --- /dev/null +++ b/net/wireless/debugfs.c @@ -0,0 +1,121 @@ +/* + * cfg80211 debugfs + * + * Copyright 2009 Luis R. Rodriguez + * Copyright 2007 Johannes Berg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include "core.h" +#include "debugfs.h" + +static int cfg80211_open_file_generic(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return 0; +} + +#define DEBUGFS_READONLY_FILE(name, buflen, fmt, value...) \ +static ssize_t name## _read(struct file *file, char __user *userbuf, \ + size_t count, loff_t *ppos) \ +{ \ + struct wiphy *wiphy= file->private_data; \ + char buf[buflen]; \ + int res; \ + \ + res = scnprintf(buf, buflen, fmt "\n", ##value); \ + return simple_read_from_buffer(userbuf, count, ppos, buf, res); \ +} \ + \ +static const struct file_operations name## _ops = { \ + .read = name## _read, \ + .open = cfg80211_open_file_generic, \ + .llseek = generic_file_llseek, \ +}; + +DEBUGFS_READONLY_FILE(rts_threshold, 20, "%d", + wiphy->rts_threshold) +DEBUGFS_READONLY_FILE(fragmentation_threshold, 20, "%d", + wiphy->frag_threshold); +DEBUGFS_READONLY_FILE(short_retry_limit, 20, "%d", + wiphy->retry_short) +DEBUGFS_READONLY_FILE(long_retry_limit, 20, "%d", + wiphy->retry_long); + +static int ht_print_chan(struct ieee80211_channel *chan, + char *buf, int buf_size, int offset) +{ + if (WARN_ON(offset > buf_size)) + return 0; + + if (chan->flags & IEEE80211_CHAN_DISABLED) + return snprintf(buf + offset, + buf_size - offset, + "%d Disabled\n", + chan->center_freq); + + return snprintf(buf + offset, + buf_size - offset, + "%d HT40 %c%c\n", + chan->center_freq, + (chan->flags & IEEE80211_CHAN_NO_HT40MINUS) ? ' ' : '-', + (chan->flags & IEEE80211_CHAN_NO_HT40PLUS) ? 
' ' : '+'); +} + +static ssize_t ht40allow_map_read(struct file *file, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct wiphy *wiphy = file->private_data; + char *buf; + unsigned int offset = 0, buf_size = PAGE_SIZE, i, r; + enum ieee80211_band band; + struct ieee80211_supported_band *sband; + + buf = kzalloc(buf_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + mutex_lock(&cfg80211_mutex); + + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + sband = wiphy->bands[band]; + if (!sband) + continue; + for (i = 0; i < sband->n_channels; i++) + offset += ht_print_chan(&sband->channels[i], + buf, buf_size, offset); + } + + mutex_unlock(&cfg80211_mutex); + + r = simple_read_from_buffer(user_buf, count, ppos, buf, offset); + + kfree(buf); + + return r; +} + +static const struct file_operations ht40allow_map_ops = { + .read = ht40allow_map_read, + .open = cfg80211_open_file_generic, + .llseek = default_llseek, +}; + +#define DEBUGFS_ADD(name) \ + debugfs_create_file(#name, S_IRUGO, phyd, &rdev->wiphy, &name## _ops); + +void cfg80211_debugfs_rdev_add(struct cfg80211_registered_device *rdev) +{ + struct dentry *phyd = rdev->wiphy.debugfsdir; + + DEBUGFS_ADD(rts_threshold); + DEBUGFS_ADD(fragmentation_threshold); + DEBUGFS_ADD(short_retry_limit); + DEBUGFS_ADD(long_retry_limit); + DEBUGFS_ADD(ht40allow_map); +} diff --git a/net/wireless/debugfs.h b/net/wireless/debugfs.h new file mode 100644 index 00000000..74fdd381 --- /dev/null +++ b/net/wireless/debugfs.h @@ -0,0 +1,11 @@ +#ifndef __CFG80211_DEBUGFS_H +#define __CFG80211_DEBUGFS_H + +#ifdef CONFIG_CFG80211_DEBUGFS +void cfg80211_debugfs_rdev_add(struct cfg80211_registered_device *rdev); +#else +static inline +void cfg80211_debugfs_rdev_add(struct cfg80211_registered_device *rdev) {} +#endif + +#endif /* __CFG80211_DEBUGFS_H */ diff --git a/net/wireless/ethtool.c b/net/wireless/ethtool.c new file mode 100644 index 00000000..9bde4d1d --- /dev/null +++ b/net/wireless/ethtool.c @@ -0,0 +1,78 @@ +#include +#include +#include "core.h" +#include "ethtool.h" + +static void cfg80211_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + strlcpy(info->driver, wiphy_dev(wdev->wiphy)->driver->name, + sizeof(info->driver)); + + strlcpy(info->version, init_utsname()->release, sizeof(info->version)); + + if (wdev->wiphy->fw_version[0]) + strncpy(info->fw_version, wdev->wiphy->fw_version, + sizeof(info->fw_version)); + else + strncpy(info->fw_version, "N/A", sizeof(info->fw_version)); + + strlcpy(info->bus_info, dev_name(wiphy_dev(wdev->wiphy)), + sizeof(info->bus_info)); +} + +static int cfg80211_get_regs_len(struct net_device *dev) +{ + /* For now, return 0... 
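+ * (cfg80211_get_regs() below likewise reports a zero-length dump)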
*/ + return 0; +} + +static void cfg80211_get_regs(struct net_device *dev, struct ethtool_regs *regs, + void *data) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + regs->version = wdev->wiphy->hw_version; + regs->len = 0; +} + +static void cfg80211_get_ringparam(struct net_device *dev, + struct ethtool_ringparam *rp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + + memset(rp, 0, sizeof(*rp)); + + if (rdev->ops->get_ringparam) + rdev->ops->get_ringparam(wdev->wiphy, + &rp->tx_pending, &rp->tx_max_pending, + &rp->rx_pending, &rp->rx_max_pending); +} + +static int cfg80211_set_ringparam(struct net_device *dev, + struct ethtool_ringparam *rp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + + if (rp->rx_mini_pending != 0 || rp->rx_jumbo_pending != 0) + return -EINVAL; + + if (rdev->ops->set_ringparam) + return rdev->ops->set_ringparam(wdev->wiphy, + rp->tx_pending, rp->rx_pending); + + return -ENOTSUPP; +} + +const struct ethtool_ops cfg80211_ethtool_ops = { + .get_drvinfo = cfg80211_get_drvinfo, + .get_regs_len = cfg80211_get_regs_len, + .get_regs = cfg80211_get_regs, + .get_link = ethtool_op_get_link, + .get_ringparam = cfg80211_get_ringparam, + .set_ringparam = cfg80211_set_ringparam, +}; diff --git a/net/wireless/ethtool.h b/net/wireless/ethtool.h new file mode 100644 index 00000000..695ecad2 --- /dev/null +++ b/net/wireless/ethtool.h @@ -0,0 +1,6 @@ +#ifndef __CFG80211_ETHTOOL__ +#define __CFG80211_ETHTOOL__ + +extern const struct ethtool_ops cfg80211_ethtool_ops; + +#endif /* __CFG80211_ETHTOOL__ */ diff --git a/net/wireless/genregdb.awk b/net/wireless/genregdb.awk new file mode 100644 index 00000000..53c143f5 --- /dev/null +++ b/net/wireless/genregdb.awk @@ -0,0 +1,119 @@ +#!/usr/bin/awk -f +# +# genregdb.awk -- generate regdb.c from db.txt +# +# Actually, it reads from stdin (presumed to be db.txt) and writes +# to stdout (presumed to be regdb.c), but close enough... +# +# Copyright 2009 John W. Linville +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. +# + +BEGIN { + active = 0 + rules = 0; + print "/*" + print " * DO NOT EDIT -- file generated from data in db.txt" + print " */" + print "" + print "#include " + print "#include " + print "#include \"regdb.h\"" + print "" + regdb = "const struct ieee80211_regdomain *reg_regdb[] = {\n" +} + +/^[ \t]*#/ { + # Ignore +} + +!active && /^[ \t]*$/ { + # Ignore +} + +!active && /country/ { + country=$2 + sub(/:/, "", country) + printf "static const struct ieee80211_regdomain regdom_%s = {\n", country + printf "\t.alpha2 = \"%s\",\n", country + printf "\t.reg_rules = {\n" + active = 1 + regdb = regdb "\t®dom_" country ",\n" +} + +active && /^[ \t]*\(/ { + start = $1 + sub(/\(/, "", start) + end = $3 + bw = $5 + sub(/\),/, "", bw) + gain = $6 + sub(/\(/, "", gain) + sub(/,/, "", gain) + power = $7 + sub(/\)/, "", power) + sub(/,/, "", power) + # power might be in mW... + units = $8 + sub(/\)/, "", units) + sub(/,/, "", units) + if (units == "mW") { + if (power == 100) { + power = 20 + } else if (power == 200) { + power = 23 + } else if (power == 500) { + power = 27 + } else if (power == 1000) { + power = 30 + } else { + print "Unknown power value in database!" 
+ } + } + flagstr = "" + for (i=8; i<=NF; i++) + flagstr = flagstr $i + split(flagstr, flagarray, ",") + flags = "" + for (arg in flagarray) { + if (flagarray[arg] == "NO-OFDM") { + flags = flags "\n\t\t\tNL80211_RRF_NO_OFDM | " + } else if (flagarray[arg] == "NO-CCK") { + flags = flags "\n\t\t\tNL80211_RRF_NO_CCK | " + } else if (flagarray[arg] == "NO-INDOOR") { + flags = flags "\n\t\t\tNL80211_RRF_NO_INDOOR | " + } else if (flagarray[arg] == "NO-OUTDOOR") { + flags = flags "\n\t\t\tNL80211_RRF_NO_OUTDOOR | " + } else if (flagarray[arg] == "DFS") { + flags = flags "\n\t\t\tNL80211_RRF_DFS | " + } else if (flagarray[arg] == "PTP-ONLY") { + flags = flags "\n\t\t\tNL80211_RRF_PTP_ONLY | " + } else if (flagarray[arg] == "PTMP-ONLY") { + flags = flags "\n\t\t\tNL80211_RRF_PTMP_ONLY | " + } else if (flagarray[arg] == "PASSIVE-SCAN") { + flags = flags "\n\t\t\tNL80211_RRF_PASSIVE_SCAN | " + } else if (flagarray[arg] == "NO-IBSS") { + flags = flags "\n\t\t\tNL80211_RRF_NO_IBSS | " + } + } + flags = flags "0" + printf "\t\tREG_RULE(%d, %d, %d, %d, %d, %s),\n", start, end, bw, gain, power, flags + rules++ +} + +active && /^[ \t]*$/ { + active = 0 + printf "\t},\n" + printf "\t.n_reg_rules = %d\n", rules + printf "};\n\n" + rules = 0; +} + +END { + print regdb "};" + print "" + print "int reg_regdb_size = ARRAY_SIZE(reg_regdb);" +} diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c new file mode 100644 index 00000000..f33fbb79 --- /dev/null +++ b/net/wireless/ibss.c @@ -0,0 +1,526 @@ +/* + * Some IBSS support code for cfg80211. + * + * Copyright 2009 Johannes Berg + */ + +#include +#include +#include +#include +#include "wext-compat.h" +#include "nl80211.h" + + +void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_bss *bss; +#ifdef CONFIG_CFG80211_WEXT + union iwreq_data wrqu; +#endif + + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) + return; + + if (!wdev->ssid_len) + return; + + bss = cfg80211_get_bss(wdev->wiphy, NULL, bssid, + wdev->ssid, wdev->ssid_len, + WLAN_CAPABILITY_IBSS, WLAN_CAPABILITY_IBSS); + + if (WARN_ON(!bss)) + return; + + if (wdev->current_bss) { + cfg80211_unhold_bss(wdev->current_bss); + cfg80211_put_bss(&wdev->current_bss->pub); + } + + cfg80211_hold_bss(bss_from_pub(bss)); + wdev->current_bss = bss_from_pub(bss); + + cfg80211_upload_connect_keys(wdev); + + nl80211_send_ibss_bssid(wiphy_to_dev(wdev->wiphy), dev, bssid, + GFP_KERNEL); +#ifdef CONFIG_CFG80211_WEXT + memset(&wrqu, 0, sizeof(wrqu)); + memcpy(wrqu.ap_addr.sa_data, bssid, ETH_ALEN); + wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL); +#endif +} + +void cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, gfp_t gfp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_event *ev; + unsigned long flags; + + CFG80211_DEV_WARN_ON(!wdev->ssid_len); + + ev = kzalloc(sizeof(*ev), gfp); + if (!ev) + return; + + ev->type = EVENT_IBSS_JOINED; + memcpy(ev->cr.bssid, bssid, ETH_ALEN); + + spin_lock_irqsave(&wdev->event_lock, flags); + list_add_tail(&ev->list, &wdev->event_list); + spin_unlock_irqrestore(&wdev->event_lock, flags); + queue_work(cfg80211_wq, &rdev->event_work); +} +EXPORT_SYMBOL(cfg80211_ibss_joined); + +int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_ibss_params *params, + struct cfg80211_cached_keys *connkeys) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; 
+ int err; + + ASSERT_WDEV_LOCK(wdev); + + if (wdev->ssid_len) + return -EALREADY; + + if (!params->basic_rates) { + /* + * If no rates were explicitly configured, + * use the mandatory rate set for 11b or + * 11a for maximum compatibility. + */ + struct ieee80211_supported_band *sband = + rdev->wiphy.bands[params->channel->band]; + int j; + u32 flag = params->channel->band == IEEE80211_BAND_5GHZ ? + IEEE80211_RATE_MANDATORY_A : + IEEE80211_RATE_MANDATORY_B; + + for (j = 0; j < sband->n_bitrates; j++) { + if (sband->bitrates[j].flags & flag) + params->basic_rates |= BIT(j); + } + } + + if (WARN_ON(wdev->connect_keys)) + kfree(wdev->connect_keys); + wdev->connect_keys = connkeys; + +#ifdef CONFIG_CFG80211_WEXT + wdev->wext.ibss.channel = params->channel; +#endif + err = rdev->ops->join_ibss(&rdev->wiphy, dev, params); + if (err) { + wdev->connect_keys = NULL; + return err; + } + + memcpy(wdev->ssid, params->ssid, params->ssid_len); + wdev->ssid_len = params->ssid_len; + + return 0; +} + +int cfg80211_join_ibss(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_ibss_params *params, + struct cfg80211_cached_keys *connkeys) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + int err; + + mutex_lock(&rdev->devlist_mtx); + wdev_lock(wdev); + err = __cfg80211_join_ibss(rdev, dev, params, connkeys); + wdev_unlock(wdev); + mutex_unlock(&rdev->devlist_mtx); + + return err; +} + +static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + int i; + + ASSERT_WDEV_LOCK(wdev); + + kfree(wdev->connect_keys); + wdev->connect_keys = NULL; + + /* + * Delete all the keys ... pairwise keys can't really + * exist any more anyway, but default keys might. + */ + if (rdev->ops->del_key) + for (i = 0; i < 6; i++) + rdev->ops->del_key(wdev->wiphy, dev, i, false, NULL); + + if (wdev->current_bss) { + cfg80211_unhold_bss(wdev->current_bss); + cfg80211_put_bss(&wdev->current_bss->pub); + } + + wdev->current_bss = NULL; + wdev->ssid_len = 0; +#ifdef CONFIG_CFG80211_WEXT + if (!nowext) + wdev->wext.ibss.ssid_len = 0; +#endif +} + +void cfg80211_clear_ibss(struct net_device *dev, bool nowext) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + wdev_lock(wdev); + __cfg80211_clear_ibss(dev, nowext); + wdev_unlock(wdev); +} + +int __cfg80211_leave_ibss(struct cfg80211_registered_device *rdev, + struct net_device *dev, bool nowext) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + int err; + + ASSERT_WDEV_LOCK(wdev); + + if (!wdev->ssid_len) + return -ENOLINK; + + err = rdev->ops->leave_ibss(&rdev->wiphy, dev); + + if (err) + return err; + + __cfg80211_clear_ibss(dev, nowext); + + return 0; +} + +int cfg80211_leave_ibss(struct cfg80211_registered_device *rdev, + struct net_device *dev, bool nowext) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + int err; + + wdev_lock(wdev); + err = __cfg80211_leave_ibss(rdev, dev, nowext); + wdev_unlock(wdev); + + return err; +} + +#ifdef CONFIG_CFG80211_WEXT +int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev) +{ + struct cfg80211_cached_keys *ck = NULL; + enum ieee80211_band band; + int i, err; + + ASSERT_WDEV_LOCK(wdev); + + if (!wdev->wext.ibss.beacon_interval) + wdev->wext.ibss.beacon_interval = 100; + + /* try to find an IBSS channel if none requested ... 
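+ * (walk every supported band and settle on the first channel that is
+ * neither disabled nor flagged IEEE80211_CHAN_NO_IBSS)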
*/ + if (!wdev->wext.ibss.channel) { + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + struct ieee80211_supported_band *sband; + struct ieee80211_channel *chan; + + sband = rdev->wiphy.bands[band]; + if (!sband) + continue; + + for (i = 0; i < sband->n_channels; i++) { + chan = &sband->channels[i]; + if (chan->flags & IEEE80211_CHAN_NO_IBSS) + continue; + if (chan->flags & IEEE80211_CHAN_DISABLED) + continue; + wdev->wext.ibss.channel = chan; + break; + } + + if (wdev->wext.ibss.channel) + break; + } + + if (!wdev->wext.ibss.channel) + return -EINVAL; + } + + /* don't join -- SSID is not there */ + if (!wdev->wext.ibss.ssid_len) + return 0; + + if (!netif_running(wdev->netdev)) + return 0; + + if (wdev->wext.keys) { + wdev->wext.keys->def = wdev->wext.default_key; + wdev->wext.keys->defmgmt = wdev->wext.default_mgmt_key; + } + + wdev->wext.ibss.privacy = wdev->wext.default_key != -1; + + if (wdev->wext.keys) { + ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); + if (!ck) + return -ENOMEM; + for (i = 0; i < 6; i++) + ck->params[i].key = ck->data[i]; + } + err = __cfg80211_join_ibss(rdev, wdev->netdev, + &wdev->wext.ibss, ck); + if (err) + kfree(ck); + + return err; +} + +int cfg80211_ibss_wext_siwfreq(struct net_device *dev, + struct iw_request_info *info, + struct iw_freq *wextfreq, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct ieee80211_channel *chan = NULL; + int err, freq; + + /* call only for ibss! */ + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) + return -EINVAL; + + if (!rdev->ops->join_ibss) + return -EOPNOTSUPP; + + freq = cfg80211_wext_freq(wdev->wiphy, wextfreq); + if (freq < 0) + return freq; + + if (freq) { + chan = ieee80211_get_channel(wdev->wiphy, freq); + if (!chan) + return -EINVAL; + if (chan->flags & IEEE80211_CHAN_NO_IBSS || + chan->flags & IEEE80211_CHAN_DISABLED) + return -EINVAL; + } + + if (wdev->wext.ibss.channel == chan) + return 0; + + wdev_lock(wdev); + err = 0; + if (wdev->ssid_len) + err = __cfg80211_leave_ibss(rdev, dev, true); + wdev_unlock(wdev); + + if (err) + return err; + + if (chan) { + wdev->wext.ibss.channel = chan; + wdev->wext.ibss.channel_fixed = true; + } else { + /* cfg80211_ibss_wext_join will pick one if needed */ + wdev->wext.ibss.channel_fixed = false; + } + + mutex_lock(&rdev->devlist_mtx); + wdev_lock(wdev); + err = cfg80211_ibss_wext_join(rdev, wdev); + wdev_unlock(wdev); + mutex_unlock(&rdev->devlist_mtx); + + return err; +} + +int cfg80211_ibss_wext_giwfreq(struct net_device *dev, + struct iw_request_info *info, + struct iw_freq *freq, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct ieee80211_channel *chan = NULL; + + /* call only for ibss! */ + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) + return -EINVAL; + + wdev_lock(wdev); + if (wdev->current_bss) + chan = wdev->current_bss->pub.channel; + else if (wdev->wext.ibss.channel) + chan = wdev->wext.ibss.channel; + wdev_unlock(wdev); + + if (chan) { + freq->m = chan->center_freq; + freq->e = 6; + return 0; + } + + /* no channel if not joining */ + return -EINVAL; +} + +int cfg80211_ibss_wext_siwessid(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *ssid) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + size_t len = data->length; + int err; + + /* call only for ibss! 
*/ + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) + return -EINVAL; + + if (!rdev->ops->join_ibss) + return -EOPNOTSUPP; + + wdev_lock(wdev); + err = 0; + if (wdev->ssid_len) + err = __cfg80211_leave_ibss(rdev, dev, true); + wdev_unlock(wdev); + + if (err) + return err; + + /* iwconfig uses nul termination in SSID.. */ + if (len > 0 && ssid[len - 1] == '\0') + len--; + + wdev->wext.ibss.ssid = wdev->ssid; + memcpy(wdev->wext.ibss.ssid, ssid, len); + wdev->wext.ibss.ssid_len = len; + + mutex_lock(&rdev->devlist_mtx); + wdev_lock(wdev); + err = cfg80211_ibss_wext_join(rdev, wdev); + wdev_unlock(wdev); + mutex_unlock(&rdev->devlist_mtx); + + return err; +} + +int cfg80211_ibss_wext_giwessid(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *ssid) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + /* call only for ibss! */ + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) + return -EINVAL; + + data->flags = 0; + + wdev_lock(wdev); + if (wdev->ssid_len) { + data->flags = 1; + data->length = wdev->ssid_len; + memcpy(ssid, wdev->ssid, data->length); + } else if (wdev->wext.ibss.ssid && wdev->wext.ibss.ssid_len) { + data->flags = 1; + data->length = wdev->wext.ibss.ssid_len; + memcpy(ssid, wdev->wext.ibss.ssid, data->length); + } + wdev_unlock(wdev); + + return 0; +} + +int cfg80211_ibss_wext_siwap(struct net_device *dev, + struct iw_request_info *info, + struct sockaddr *ap_addr, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + u8 *bssid = ap_addr->sa_data; + int err; + + /* call only for ibss! */ + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) + return -EINVAL; + + if (!rdev->ops->join_ibss) + return -EOPNOTSUPP; + + if (ap_addr->sa_family != ARPHRD_ETHER) + return -EINVAL; + + /* automatic mode */ + if (is_zero_ether_addr(bssid) || is_broadcast_ether_addr(bssid)) + bssid = NULL; + + /* both automatic */ + if (!bssid && !wdev->wext.ibss.bssid) + return 0; + + /* fixed already - and no change */ + if (wdev->wext.ibss.bssid && bssid && + compare_ether_addr(bssid, wdev->wext.ibss.bssid) == 0) + return 0; + + wdev_lock(wdev); + err = 0; + if (wdev->ssid_len) + err = __cfg80211_leave_ibss(rdev, dev, true); + wdev_unlock(wdev); + + if (err) + return err; + + if (bssid) { + memcpy(wdev->wext.bssid, bssid, ETH_ALEN); + wdev->wext.ibss.bssid = wdev->wext.bssid; + } else + wdev->wext.ibss.bssid = NULL; + + mutex_lock(&rdev->devlist_mtx); + wdev_lock(wdev); + err = cfg80211_ibss_wext_join(rdev, wdev); + wdev_unlock(wdev); + mutex_unlock(&rdev->devlist_mtx); + + return err; +} + +int cfg80211_ibss_wext_giwap(struct net_device *dev, + struct iw_request_info *info, + struct sockaddr *ap_addr, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + /* call only for ibss! */ + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) + return -EINVAL; + + ap_addr->sa_family = ARPHRD_ETHER; + + wdev_lock(wdev); + if (wdev->current_bss) + memcpy(ap_addr->sa_data, wdev->current_bss->pub.bssid, ETH_ALEN); + else if (wdev->wext.ibss.bssid) + memcpy(ap_addr->sa_data, wdev->wext.ibss.bssid, ETH_ALEN); + else + memset(ap_addr->sa_data, 0, ETH_ALEN); + + wdev_unlock(wdev); + + return 0; +} +#endif diff --git a/net/wireless/lib80211.c b/net/wireless/lib80211.c new file mode 100644 index 00000000..3268fac5 --- /dev/null +++ b/net/wireless/lib80211.c @@ -0,0 +1,286 @@ +/* + * lib80211 -- common bits for IEEE802.11 drivers + * + * Copyright(c) 2008 John W. 
Linville + * + * Portions copied from old ieee80211 component, w/ original copyright + * notices below: + * + * Host AP crypto routines + * + * Copyright (c) 2002-2003, Jouni Malinen + * Portions Copyright (C) 2004, Intel Corporation + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include + +#define DRV_NAME "lib80211" + +#define DRV_DESCRIPTION "common routines for IEEE802.11 drivers" + +MODULE_DESCRIPTION(DRV_DESCRIPTION); +MODULE_AUTHOR("John W. Linville "); +MODULE_LICENSE("GPL"); + +struct lib80211_crypto_alg { + struct list_head list; + struct lib80211_crypto_ops *ops; +}; + +static LIST_HEAD(lib80211_crypto_algs); +static DEFINE_SPINLOCK(lib80211_crypto_lock); + +const char *print_ssid(char *buf, const char *ssid, u8 ssid_len) +{ + const char *s = ssid; + char *d = buf; + + ssid_len = min_t(u8, ssid_len, IEEE80211_MAX_SSID_LEN); + while (ssid_len--) { + if (isprint(*s)) { + *d++ = *s++; + continue; + } + + *d++ = '\\'; + if (*s == '\0') + *d++ = '0'; + else if (*s == '\n') + *d++ = 'n'; + else if (*s == '\r') + *d++ = 'r'; + else if (*s == '\t') + *d++ = 't'; + else if (*s == '\\') + *d++ = '\\'; + else + d += snprintf(d, 3, "%03o", *s); + s++; + } + *d = '\0'; + return buf; +} +EXPORT_SYMBOL(print_ssid); + +int lib80211_crypt_info_init(struct lib80211_crypt_info *info, char *name, + spinlock_t *lock) +{ + memset(info, 0, sizeof(*info)); + + info->name = name; + info->lock = lock; + + INIT_LIST_HEAD(&info->crypt_deinit_list); + setup_timer(&info->crypt_deinit_timer, lib80211_crypt_deinit_handler, + (unsigned long)info); + + return 0; +} +EXPORT_SYMBOL(lib80211_crypt_info_init); + +void lib80211_crypt_info_free(struct lib80211_crypt_info *info) +{ + int i; + + lib80211_crypt_quiescing(info); + del_timer_sync(&info->crypt_deinit_timer); + lib80211_crypt_deinit_entries(info, 1); + + for (i = 0; i < NUM_WEP_KEYS; i++) { + struct lib80211_crypt_data *crypt = info->crypt[i]; + if (crypt) { + if (crypt->ops) { + crypt->ops->deinit(crypt->priv); + module_put(crypt->ops->owner); + } + kfree(crypt); + info->crypt[i] = NULL; + } + } +} +EXPORT_SYMBOL(lib80211_crypt_info_free); + +void lib80211_crypt_deinit_entries(struct lib80211_crypt_info *info, int force) +{ + struct lib80211_crypt_data *entry, *next; + unsigned long flags; + + spin_lock_irqsave(info->lock, flags); + list_for_each_entry_safe(entry, next, &info->crypt_deinit_list, list) { + if (atomic_read(&entry->refcnt) != 0 && !force) + continue; + + list_del(&entry->list); + + if (entry->ops) { + entry->ops->deinit(entry->priv); + module_put(entry->ops->owner); + } + kfree(entry); + } + spin_unlock_irqrestore(info->lock, flags); +} +EXPORT_SYMBOL(lib80211_crypt_deinit_entries); + +/* After this, crypt_deinit_list won't accept new members */ +void lib80211_crypt_quiescing(struct lib80211_crypt_info *info) +{ + unsigned long flags; + + spin_lock_irqsave(info->lock, flags); + info->crypt_quiesced = 1; + spin_unlock_irqrestore(info->lock, flags); +} +EXPORT_SYMBOL(lib80211_crypt_quiescing); + +void lib80211_crypt_deinit_handler(unsigned long data) +{ + struct lib80211_crypt_info *info = (struct lib80211_crypt_info *)data; + unsigned long flags; + + lib80211_crypt_deinit_entries(info, 0); + + spin_lock_irqsave(info->lock, flags); + if (!list_empty(&info->crypt_deinit_list) && !info->crypt_quiesced) { + printk(KERN_DEBUG "%s: entries remaining in delayed crypt " + "deletion list\n", info->name); + info->crypt_deinit_timer.expires = jiffies + HZ; + 
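+ /* re-arm and poll again in about a second; entries whose
+ * refcount has not yet dropped to zero stay queued until then */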
add_timer(&info->crypt_deinit_timer); + } + spin_unlock_irqrestore(info->lock, flags); +} +EXPORT_SYMBOL(lib80211_crypt_deinit_handler); + +void lib80211_crypt_delayed_deinit(struct lib80211_crypt_info *info, + struct lib80211_crypt_data **crypt) +{ + struct lib80211_crypt_data *tmp; + unsigned long flags; + + if (*crypt == NULL) + return; + + tmp = *crypt; + *crypt = NULL; + + /* must not run ops->deinit() while there may be pending encrypt or + * decrypt operations. Use a list of delayed deinits to avoid needing + * locking. */ + + spin_lock_irqsave(info->lock, flags); + if (!info->crypt_quiesced) { + list_add(&tmp->list, &info->crypt_deinit_list); + if (!timer_pending(&info->crypt_deinit_timer)) { + info->crypt_deinit_timer.expires = jiffies + HZ; + add_timer(&info->crypt_deinit_timer); + } + } + spin_unlock_irqrestore(info->lock, flags); +} +EXPORT_SYMBOL(lib80211_crypt_delayed_deinit); + +int lib80211_register_crypto_ops(struct lib80211_crypto_ops *ops) +{ + unsigned long flags; + struct lib80211_crypto_alg *alg; + + alg = kzalloc(sizeof(*alg), GFP_KERNEL); + if (alg == NULL) + return -ENOMEM; + + alg->ops = ops; + + spin_lock_irqsave(&lib80211_crypto_lock, flags); + list_add(&alg->list, &lib80211_crypto_algs); + spin_unlock_irqrestore(&lib80211_crypto_lock, flags); + + printk(KERN_DEBUG "lib80211_crypt: registered algorithm '%s'\n", + ops->name); + + return 0; +} +EXPORT_SYMBOL(lib80211_register_crypto_ops); + +int lib80211_unregister_crypto_ops(struct lib80211_crypto_ops *ops) +{ + struct lib80211_crypto_alg *alg; + unsigned long flags; + + spin_lock_irqsave(&lib80211_crypto_lock, flags); + list_for_each_entry(alg, &lib80211_crypto_algs, list) { + if (alg->ops == ops) + goto found; + } + spin_unlock_irqrestore(&lib80211_crypto_lock, flags); + return -EINVAL; + + found: + printk(KERN_DEBUG "lib80211_crypt: unregistered algorithm '%s'\n", + ops->name); + list_del(&alg->list); + spin_unlock_irqrestore(&lib80211_crypto_lock, flags); + kfree(alg); + return 0; +} +EXPORT_SYMBOL(lib80211_unregister_crypto_ops); + +struct lib80211_crypto_ops *lib80211_get_crypto_ops(const char *name) +{ + struct lib80211_crypto_alg *alg; + unsigned long flags; + + spin_lock_irqsave(&lib80211_crypto_lock, flags); + list_for_each_entry(alg, &lib80211_crypto_algs, list) { + if (strcmp(alg->ops->name, name) == 0) + goto found; + } + spin_unlock_irqrestore(&lib80211_crypto_lock, flags); + return NULL; + + found: + spin_unlock_irqrestore(&lib80211_crypto_lock, flags); + return alg->ops; +} +EXPORT_SYMBOL(lib80211_get_crypto_ops); + +static void *lib80211_crypt_null_init(int keyidx) +{ + return (void *)1; +} + +static void lib80211_crypt_null_deinit(void *priv) +{ +} + +static struct lib80211_crypto_ops lib80211_crypt_null = { + .name = "NULL", + .init = lib80211_crypt_null_init, + .deinit = lib80211_crypt_null_deinit, + .owner = THIS_MODULE, +}; + +static int __init lib80211_init(void) +{ + pr_info(DRV_DESCRIPTION "\n"); + return lib80211_register_crypto_ops(&lib80211_crypt_null); +} + +static void __exit lib80211_exit(void) +{ + lib80211_unregister_crypto_ops(&lib80211_crypt_null); + BUG_ON(!list_empty(&lib80211_crypto_algs)); +} + +module_init(lib80211_init); +module_exit(lib80211_exit); diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c new file mode 100644 index 00000000..dacb3b4b --- /dev/null +++ b/net/wireless/lib80211_crypt_ccmp.c @@ -0,0 +1,493 @@ +/* + * lib80211 crypt: host-based CCMP encryption implementation for lib80211 + * + * Copyright (c) 2003-2004, Jouni Malinen 
+ * Copyright (c) 2008, John W. Linville + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. See README and COPYING for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +MODULE_AUTHOR("Jouni Malinen"); +MODULE_DESCRIPTION("Host AP crypt: CCMP"); +MODULE_LICENSE("GPL"); + +#define AES_BLOCK_LEN 16 +#define CCMP_HDR_LEN 8 +#define CCMP_MIC_LEN 8 +#define CCMP_TK_LEN 16 +#define CCMP_PN_LEN 6 + +struct lib80211_ccmp_data { + u8 key[CCMP_TK_LEN]; + int key_set; + + u8 tx_pn[CCMP_PN_LEN]; + u8 rx_pn[CCMP_PN_LEN]; + + u32 dot11RSNAStatsCCMPFormatErrors; + u32 dot11RSNAStatsCCMPReplays; + u32 dot11RSNAStatsCCMPDecryptErrors; + + int key_idx; + + struct crypto_cipher *tfm; + + /* scratch buffers for virt_to_page() (crypto API) */ + u8 tx_b0[AES_BLOCK_LEN], tx_b[AES_BLOCK_LEN], + tx_e[AES_BLOCK_LEN], tx_s0[AES_BLOCK_LEN]; + u8 rx_b0[AES_BLOCK_LEN], rx_b[AES_BLOCK_LEN], rx_a[AES_BLOCK_LEN]; +}; + +static inline void lib80211_ccmp_aes_encrypt(struct crypto_cipher *tfm, + const u8 pt[16], u8 ct[16]) +{ + crypto_cipher_encrypt_one(tfm, ct, pt); +} + +static void *lib80211_ccmp_init(int key_idx) +{ + struct lib80211_ccmp_data *priv; + + priv = kzalloc(sizeof(*priv), GFP_ATOMIC); + if (priv == NULL) + goto fail; + priv->key_idx = key_idx; + + priv->tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(priv->tfm)) { + printk(KERN_DEBUG "lib80211_crypt_ccmp: could not allocate " + "crypto API aes\n"); + priv->tfm = NULL; + goto fail; + } + + return priv; + + fail: + if (priv) { + if (priv->tfm) + crypto_free_cipher(priv->tfm); + kfree(priv); + } + + return NULL; +} + +static void lib80211_ccmp_deinit(void *priv) +{ + struct lib80211_ccmp_data *_priv = priv; + if (_priv && _priv->tfm) + crypto_free_cipher(_priv->tfm); + kfree(priv); +} + +static inline void xor_block(u8 * b, u8 * a, size_t len) +{ + int i; + for (i = 0; i < len; i++) + b[i] ^= a[i]; +} + +static void ccmp_init_blocks(struct crypto_cipher *tfm, + struct ieee80211_hdr *hdr, + u8 * pn, size_t dlen, u8 * b0, u8 * auth, u8 * s0) +{ + u8 *pos, qc = 0; + size_t aad_len; + int a4_included, qc_included; + u8 aad[2 * AES_BLOCK_LEN]; + + a4_included = ieee80211_has_a4(hdr->frame_control); + qc_included = ieee80211_is_data_qos(hdr->frame_control); + + aad_len = 22; + if (a4_included) + aad_len += 6; + if (qc_included) { + pos = (u8 *) & hdr->addr4; + if (a4_included) + pos += 6; + qc = *pos & 0x0f; + aad_len += 2; + } + + /* CCM Initial Block: + * Flag (Include authentication header, M=3 (8-octet MIC), + * L=1 (2-octet Dlen)) + * Nonce: 0x00 | A2 | PN + * Dlen */ + b0[0] = 0x59; + b0[1] = qc; + memcpy(b0 + 2, hdr->addr2, ETH_ALEN); + memcpy(b0 + 8, pn, CCMP_PN_LEN); + b0[14] = (dlen >> 8) & 0xff; + b0[15] = dlen & 0xff; + + /* AAD: + * FC with bits 4..6 and 11..13 masked to zero; 14 is always one + * A1 | A2 | A3 + * SC with bits 4..15 (seq#) masked to zero + * A4 (if present) + * QC (if present) + */ + pos = (u8 *) hdr; + aad[0] = 0; /* aad_len >> 8 */ + aad[1] = aad_len & 0xff; + aad[2] = pos[0] & 0x8f; + aad[3] = pos[1] & 0xc7; + memcpy(aad + 4, hdr->addr1, 3 * ETH_ALEN); + pos = (u8 *) & hdr->seq_ctrl; + aad[22] = pos[0] & 0x0f; + aad[23] = 0; /* all bits masked */ + memset(aad + 24, 0, 8); + if (a4_included) + memcpy(aad + 24, hdr->addr4, ETH_ALEN); + if 
(qc_included) { + aad[a4_included ? 30 : 24] = qc; + /* rest of QC masked */ + } + + /* Start with the first block and AAD */ + lib80211_ccmp_aes_encrypt(tfm, b0, auth); + xor_block(auth, aad, AES_BLOCK_LEN); + lib80211_ccmp_aes_encrypt(tfm, auth, auth); + xor_block(auth, &aad[AES_BLOCK_LEN], AES_BLOCK_LEN); + lib80211_ccmp_aes_encrypt(tfm, auth, auth); + b0[0] &= 0x07; + b0[14] = b0[15] = 0; + lib80211_ccmp_aes_encrypt(tfm, b0, s0); +} + +static int lib80211_ccmp_hdr(struct sk_buff *skb, int hdr_len, + u8 *aeskey, int keylen, void *priv) +{ + struct lib80211_ccmp_data *key = priv; + int i; + u8 *pos; + + if (skb_headroom(skb) < CCMP_HDR_LEN || skb->len < hdr_len) + return -1; + + if (aeskey != NULL && keylen >= CCMP_TK_LEN) + memcpy(aeskey, key->key, CCMP_TK_LEN); + + pos = skb_push(skb, CCMP_HDR_LEN); + memmove(pos, pos + CCMP_HDR_LEN, hdr_len); + pos += hdr_len; + + i = CCMP_PN_LEN - 1; + while (i >= 0) { + key->tx_pn[i]++; + if (key->tx_pn[i] != 0) + break; + i--; + } + + *pos++ = key->tx_pn[5]; + *pos++ = key->tx_pn[4]; + *pos++ = 0; + *pos++ = (key->key_idx << 6) | (1 << 5) /* Ext IV included */ ; + *pos++ = key->tx_pn[3]; + *pos++ = key->tx_pn[2]; + *pos++ = key->tx_pn[1]; + *pos++ = key->tx_pn[0]; + + return CCMP_HDR_LEN; +} + +static int lib80211_ccmp_encrypt(struct sk_buff *skb, int hdr_len, void *priv) +{ + struct lib80211_ccmp_data *key = priv; + int data_len, i, blocks, last, len; + u8 *pos, *mic; + struct ieee80211_hdr *hdr; + u8 *b0 = key->tx_b0; + u8 *b = key->tx_b; + u8 *e = key->tx_e; + u8 *s0 = key->tx_s0; + + if (skb_tailroom(skb) < CCMP_MIC_LEN || skb->len < hdr_len) + return -1; + + data_len = skb->len - hdr_len; + len = lib80211_ccmp_hdr(skb, hdr_len, NULL, 0, priv); + if (len < 0) + return -1; + + pos = skb->data + hdr_len + CCMP_HDR_LEN; + hdr = (struct ieee80211_hdr *)skb->data; + ccmp_init_blocks(key->tfm, hdr, key->tx_pn, data_len, b0, b, s0); + + blocks = DIV_ROUND_UP(data_len, AES_BLOCK_LEN); + last = data_len % AES_BLOCK_LEN; + + for (i = 1; i <= blocks; i++) { + len = (i == blocks && last) ? last : AES_BLOCK_LEN; + /* Authentication */ + xor_block(b, pos, len); + lib80211_ccmp_aes_encrypt(key->tfm, b, b); + /* Encryption, with counter */ + b0[14] = (i >> 8) & 0xff; + b0[15] = i & 0xff; + lib80211_ccmp_aes_encrypt(key->tfm, b0, e); + xor_block(pos, e, len); + pos += len; + } + + mic = skb_put(skb, CCMP_MIC_LEN); + for (i = 0; i < CCMP_MIC_LEN; i++) + mic[i] = b[i] ^ s0[i]; + + return 0; +} + +/* + * deal with seq counter wrapping correctly. 
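+ * The 48-bit CCMP packet number is compared as a 32-bit high part plus
+ * a 16-bit low part; the signed subtraction below treats a wrapped
+ * counter the same way time_after() treats wrapped jiffies.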
+ * refer to timer_after() for jiffies wrapping handling + */ +static inline int ccmp_replay_check(u8 *pn_n, u8 *pn_o) +{ + u32 iv32_n, iv16_n; + u32 iv32_o, iv16_o; + + iv32_n = (pn_n[0] << 24) | (pn_n[1] << 16) | (pn_n[2] << 8) | pn_n[3]; + iv16_n = (pn_n[4] << 8) | pn_n[5]; + + iv32_o = (pn_o[0] << 24) | (pn_o[1] << 16) | (pn_o[2] << 8) | pn_o[3]; + iv16_o = (pn_o[4] << 8) | pn_o[5]; + + if ((s32)iv32_n - (s32)iv32_o < 0 || + (iv32_n == iv32_o && iv16_n <= iv16_o)) + return 1; + return 0; +} + +static int lib80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv) +{ + struct lib80211_ccmp_data *key = priv; + u8 keyidx, *pos; + struct ieee80211_hdr *hdr; + u8 *b0 = key->rx_b0; + u8 *b = key->rx_b; + u8 *a = key->rx_a; + u8 pn[6]; + int i, blocks, last, len; + size_t data_len = skb->len - hdr_len - CCMP_HDR_LEN - CCMP_MIC_LEN; + u8 *mic = skb->data + skb->len - CCMP_MIC_LEN; + + if (skb->len < hdr_len + CCMP_HDR_LEN + CCMP_MIC_LEN) { + key->dot11RSNAStatsCCMPFormatErrors++; + return -1; + } + + hdr = (struct ieee80211_hdr *)skb->data; + pos = skb->data + hdr_len; + keyidx = pos[3]; + if (!(keyidx & (1 << 5))) { + if (net_ratelimit()) { + printk(KERN_DEBUG "CCMP: received packet without ExtIV" + " flag from %pM\n", hdr->addr2); + } + key->dot11RSNAStatsCCMPFormatErrors++; + return -2; + } + keyidx >>= 6; + if (key->key_idx != keyidx) { + printk(KERN_DEBUG "CCMP: RX tkey->key_idx=%d frame " + "keyidx=%d priv=%p\n", key->key_idx, keyidx, priv); + return -6; + } + if (!key->key_set) { + if (net_ratelimit()) { + printk(KERN_DEBUG "CCMP: received packet from %pM" + " with keyid=%d that does not have a configured" + " key\n", hdr->addr2, keyidx); + } + return -3; + } + + pn[0] = pos[7]; + pn[1] = pos[6]; + pn[2] = pos[5]; + pn[3] = pos[4]; + pn[4] = pos[1]; + pn[5] = pos[0]; + pos += 8; + + if (ccmp_replay_check(pn, key->rx_pn)) { +#ifdef CONFIG_LIB80211_DEBUG + if (net_ratelimit()) { + printk(KERN_DEBUG "CCMP: replay detected: STA=%pM " + "previous PN %02x%02x%02x%02x%02x%02x " + "received PN %02x%02x%02x%02x%02x%02x\n", + hdr->addr2, + key->rx_pn[0], key->rx_pn[1], key->rx_pn[2], + key->rx_pn[3], key->rx_pn[4], key->rx_pn[5], + pn[0], pn[1], pn[2], pn[3], pn[4], pn[5]); + } +#endif + key->dot11RSNAStatsCCMPReplays++; + return -4; + } + + ccmp_init_blocks(key->tfm, hdr, pn, data_len, b0, a, b); + xor_block(mic, b, CCMP_MIC_LEN); + + blocks = DIV_ROUND_UP(data_len, AES_BLOCK_LEN); + last = data_len % AES_BLOCK_LEN; + + for (i = 1; i <= blocks; i++) { + len = (i == blocks && last) ? 
last : AES_BLOCK_LEN; + /* Decrypt, with counter */ + b0[14] = (i >> 8) & 0xff; + b0[15] = i & 0xff; + lib80211_ccmp_aes_encrypt(key->tfm, b0, b); + xor_block(pos, b, len); + /* Authentication */ + xor_block(a, pos, len); + lib80211_ccmp_aes_encrypt(key->tfm, a, a); + pos += len; + } + + if (memcmp(mic, a, CCMP_MIC_LEN) != 0) { + if (net_ratelimit()) { + printk(KERN_DEBUG "CCMP: decrypt failed: STA=" + "%pM\n", hdr->addr2); + } + key->dot11RSNAStatsCCMPDecryptErrors++; + return -5; + } + + memcpy(key->rx_pn, pn, CCMP_PN_LEN); + + /* Remove hdr and MIC */ + memmove(skb->data + CCMP_HDR_LEN, skb->data, hdr_len); + skb_pull(skb, CCMP_HDR_LEN); + skb_trim(skb, skb->len - CCMP_MIC_LEN); + + return keyidx; +} + +static int lib80211_ccmp_set_key(void *key, int len, u8 * seq, void *priv) +{ + struct lib80211_ccmp_data *data = priv; + int keyidx; + struct crypto_cipher *tfm = data->tfm; + + keyidx = data->key_idx; + memset(data, 0, sizeof(*data)); + data->key_idx = keyidx; + data->tfm = tfm; + if (len == CCMP_TK_LEN) { + memcpy(data->key, key, CCMP_TK_LEN); + data->key_set = 1; + if (seq) { + data->rx_pn[0] = seq[5]; + data->rx_pn[1] = seq[4]; + data->rx_pn[2] = seq[3]; + data->rx_pn[3] = seq[2]; + data->rx_pn[4] = seq[1]; + data->rx_pn[5] = seq[0]; + } + crypto_cipher_setkey(data->tfm, data->key, CCMP_TK_LEN); + } else if (len == 0) + data->key_set = 0; + else + return -1; + + return 0; +} + +static int lib80211_ccmp_get_key(void *key, int len, u8 * seq, void *priv) +{ + struct lib80211_ccmp_data *data = priv; + + if (len < CCMP_TK_LEN) + return -1; + + if (!data->key_set) + return 0; + memcpy(key, data->key, CCMP_TK_LEN); + + if (seq) { + seq[0] = data->tx_pn[5]; + seq[1] = data->tx_pn[4]; + seq[2] = data->tx_pn[3]; + seq[3] = data->tx_pn[2]; + seq[4] = data->tx_pn[1]; + seq[5] = data->tx_pn[0]; + } + + return CCMP_TK_LEN; +} + +static char *lib80211_ccmp_print_stats(char *p, void *priv) +{ + struct lib80211_ccmp_data *ccmp = priv; + + p += sprintf(p, "key[%d] alg=CCMP key_set=%d " + "tx_pn=%02x%02x%02x%02x%02x%02x " + "rx_pn=%02x%02x%02x%02x%02x%02x " + "format_errors=%d replays=%d decrypt_errors=%d\n", + ccmp->key_idx, ccmp->key_set, + ccmp->tx_pn[0], ccmp->tx_pn[1], ccmp->tx_pn[2], + ccmp->tx_pn[3], ccmp->tx_pn[4], ccmp->tx_pn[5], + ccmp->rx_pn[0], ccmp->rx_pn[1], ccmp->rx_pn[2], + ccmp->rx_pn[3], ccmp->rx_pn[4], ccmp->rx_pn[5], + ccmp->dot11RSNAStatsCCMPFormatErrors, + ccmp->dot11RSNAStatsCCMPReplays, + ccmp->dot11RSNAStatsCCMPDecryptErrors); + + return p; +} + +static struct lib80211_crypto_ops lib80211_crypt_ccmp = { + .name = "CCMP", + .init = lib80211_ccmp_init, + .deinit = lib80211_ccmp_deinit, + .encrypt_mpdu = lib80211_ccmp_encrypt, + .decrypt_mpdu = lib80211_ccmp_decrypt, + .encrypt_msdu = NULL, + .decrypt_msdu = NULL, + .set_key = lib80211_ccmp_set_key, + .get_key = lib80211_ccmp_get_key, + .print_stats = lib80211_ccmp_print_stats, + .extra_mpdu_prefix_len = CCMP_HDR_LEN, + .extra_mpdu_postfix_len = CCMP_MIC_LEN, + .owner = THIS_MODULE, +}; + +static int __init lib80211_crypto_ccmp_init(void) +{ + return lib80211_register_crypto_ops(&lib80211_crypt_ccmp); +} + +static void __exit lib80211_crypto_ccmp_exit(void) +{ + lib80211_unregister_crypto_ops(&lib80211_crypt_ccmp); +} + +module_init(lib80211_crypto_ccmp_init); +module_exit(lib80211_crypto_ccmp_exit); diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c new file mode 100644 index 00000000..7ea4f2b0 --- /dev/null +++ b/net/wireless/lib80211_crypt_tkip.c @@ -0,0 +1,784 @@ +/* + * lib80211 crypt: 
host-based TKIP encryption implementation for lib80211 + * + * Copyright (c) 2003-2004, Jouni Malinen + * Copyright (c) 2008, John W. Linville + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. See README and COPYING for + * more details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include + +MODULE_AUTHOR("Jouni Malinen"); +MODULE_DESCRIPTION("lib80211 crypt: TKIP"); +MODULE_LICENSE("GPL"); + +#define TKIP_HDR_LEN 8 + +struct lib80211_tkip_data { +#define TKIP_KEY_LEN 32 + u8 key[TKIP_KEY_LEN]; + int key_set; + + u32 tx_iv32; + u16 tx_iv16; + u16 tx_ttak[5]; + int tx_phase1_done; + + u32 rx_iv32; + u16 rx_iv16; + u16 rx_ttak[5]; + int rx_phase1_done; + u32 rx_iv32_new; + u16 rx_iv16_new; + + u32 dot11RSNAStatsTKIPReplays; + u32 dot11RSNAStatsTKIPICVErrors; + u32 dot11RSNAStatsTKIPLocalMICFailures; + + int key_idx; + + struct crypto_blkcipher *rx_tfm_arc4; + struct crypto_hash *rx_tfm_michael; + struct crypto_blkcipher *tx_tfm_arc4; + struct crypto_hash *tx_tfm_michael; + + /* scratch buffers for virt_to_page() (crypto API) */ + u8 rx_hdr[16], tx_hdr[16]; + + unsigned long flags; +}; + +static unsigned long lib80211_tkip_set_flags(unsigned long flags, void *priv) +{ + struct lib80211_tkip_data *_priv = priv; + unsigned long old_flags = _priv->flags; + _priv->flags = flags; + return old_flags; +} + +static unsigned long lib80211_tkip_get_flags(void *priv) +{ + struct lib80211_tkip_data *_priv = priv; + return _priv->flags; +} + +static void *lib80211_tkip_init(int key_idx) +{ + struct lib80211_tkip_data *priv; + + priv = kzalloc(sizeof(*priv), GFP_ATOMIC); + if (priv == NULL) + goto fail; + + priv->key_idx = key_idx; + + priv->tx_tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(priv->tx_tfm_arc4)) { + printk(KERN_DEBUG pr_fmt("could not allocate crypto API arc4\n")); + priv->tx_tfm_arc4 = NULL; + goto fail; + } + + priv->tx_tfm_michael = crypto_alloc_hash("michael_mic", 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(priv->tx_tfm_michael)) { + printk(KERN_DEBUG pr_fmt("could not allocate crypto API michael_mic\n")); + priv->tx_tfm_michael = NULL; + goto fail; + } + + priv->rx_tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(priv->rx_tfm_arc4)) { + printk(KERN_DEBUG pr_fmt("could not allocate crypto API arc4\n")); + priv->rx_tfm_arc4 = NULL; + goto fail; + } + + priv->rx_tfm_michael = crypto_alloc_hash("michael_mic", 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(priv->rx_tfm_michael)) { + printk(KERN_DEBUG pr_fmt("could not allocate crypto API michael_mic\n")); + priv->rx_tfm_michael = NULL; + goto fail; + } + + return priv; + + fail: + if (priv) { + if (priv->tx_tfm_michael) + crypto_free_hash(priv->tx_tfm_michael); + if (priv->tx_tfm_arc4) + crypto_free_blkcipher(priv->tx_tfm_arc4); + if (priv->rx_tfm_michael) + crypto_free_hash(priv->rx_tfm_michael); + if (priv->rx_tfm_arc4) + crypto_free_blkcipher(priv->rx_tfm_arc4); + kfree(priv); + } + + return NULL; +} + +static void lib80211_tkip_deinit(void *priv) +{ + struct lib80211_tkip_data *_priv = priv; + if (_priv) { + if (_priv->tx_tfm_michael) + crypto_free_hash(_priv->tx_tfm_michael); + if (_priv->tx_tfm_arc4) + crypto_free_blkcipher(_priv->tx_tfm_arc4); + if 
(_priv->rx_tfm_michael) + crypto_free_hash(_priv->rx_tfm_michael); + if (_priv->rx_tfm_arc4) + crypto_free_blkcipher(_priv->rx_tfm_arc4); + } + kfree(priv); +} + +static inline u16 RotR1(u16 val) +{ + return (val >> 1) | (val << 15); +} + +static inline u8 Lo8(u16 val) +{ + return val & 0xff; +} + +static inline u8 Hi8(u16 val) +{ + return val >> 8; +} + +static inline u16 Lo16(u32 val) +{ + return val & 0xffff; +} + +static inline u16 Hi16(u32 val) +{ + return val >> 16; +} + +static inline u16 Mk16(u8 hi, u8 lo) +{ + return lo | (((u16) hi) << 8); +} + +static inline u16 Mk16_le(__le16 * v) +{ + return le16_to_cpu(*v); +} + +static const u16 Sbox[256] = { + 0xC6A5, 0xF884, 0xEE99, 0xF68D, 0xFF0D, 0xD6BD, 0xDEB1, 0x9154, + 0x6050, 0x0203, 0xCEA9, 0x567D, 0xE719, 0xB562, 0x4DE6, 0xEC9A, + 0x8F45, 0x1F9D, 0x8940, 0xFA87, 0xEF15, 0xB2EB, 0x8EC9, 0xFB0B, + 0x41EC, 0xB367, 0x5FFD, 0x45EA, 0x23BF, 0x53F7, 0xE496, 0x9B5B, + 0x75C2, 0xE11C, 0x3DAE, 0x4C6A, 0x6C5A, 0x7E41, 0xF502, 0x834F, + 0x685C, 0x51F4, 0xD134, 0xF908, 0xE293, 0xAB73, 0x6253, 0x2A3F, + 0x080C, 0x9552, 0x4665, 0x9D5E, 0x3028, 0x37A1, 0x0A0F, 0x2FB5, + 0x0E09, 0x2436, 0x1B9B, 0xDF3D, 0xCD26, 0x4E69, 0x7FCD, 0xEA9F, + 0x121B, 0x1D9E, 0x5874, 0x342E, 0x362D, 0xDCB2, 0xB4EE, 0x5BFB, + 0xA4F6, 0x764D, 0xB761, 0x7DCE, 0x527B, 0xDD3E, 0x5E71, 0x1397, + 0xA6F5, 0xB968, 0x0000, 0xC12C, 0x4060, 0xE31F, 0x79C8, 0xB6ED, + 0xD4BE, 0x8D46, 0x67D9, 0x724B, 0x94DE, 0x98D4, 0xB0E8, 0x854A, + 0xBB6B, 0xC52A, 0x4FE5, 0xED16, 0x86C5, 0x9AD7, 0x6655, 0x1194, + 0x8ACF, 0xE910, 0x0406, 0xFE81, 0xA0F0, 0x7844, 0x25BA, 0x4BE3, + 0xA2F3, 0x5DFE, 0x80C0, 0x058A, 0x3FAD, 0x21BC, 0x7048, 0xF104, + 0x63DF, 0x77C1, 0xAF75, 0x4263, 0x2030, 0xE51A, 0xFD0E, 0xBF6D, + 0x814C, 0x1814, 0x2635, 0xC32F, 0xBEE1, 0x35A2, 0x88CC, 0x2E39, + 0x9357, 0x55F2, 0xFC82, 0x7A47, 0xC8AC, 0xBAE7, 0x322B, 0xE695, + 0xC0A0, 0x1998, 0x9ED1, 0xA37F, 0x4466, 0x547E, 0x3BAB, 0x0B83, + 0x8CCA, 0xC729, 0x6BD3, 0x283C, 0xA779, 0xBCE2, 0x161D, 0xAD76, + 0xDB3B, 0x6456, 0x744E, 0x141E, 0x92DB, 0x0C0A, 0x486C, 0xB8E4, + 0x9F5D, 0xBD6E, 0x43EF, 0xC4A6, 0x39A8, 0x31A4, 0xD337, 0xF28B, + 0xD532, 0x8B43, 0x6E59, 0xDAB7, 0x018C, 0xB164, 0x9CD2, 0x49E0, + 0xD8B4, 0xACFA, 0xF307, 0xCF25, 0xCAAF, 0xF48E, 0x47E9, 0x1018, + 0x6FD5, 0xF088, 0x4A6F, 0x5C72, 0x3824, 0x57F1, 0x73C7, 0x9751, + 0xCB23, 0xA17C, 0xE89C, 0x3E21, 0x96DD, 0x61DC, 0x0D86, 0x0F85, + 0xE090, 0x7C42, 0x71C4, 0xCCAA, 0x90D8, 0x0605, 0xF701, 0x1C12, + 0xC2A3, 0x6A5F, 0xAEF9, 0x69D0, 0x1791, 0x9958, 0x3A27, 0x27B9, + 0xD938, 0xEB13, 0x2BB3, 0x2233, 0xD2BB, 0xA970, 0x0789, 0x33A7, + 0x2DB6, 0x3C22, 0x1592, 0xC920, 0x8749, 0xAAFF, 0x5078, 0xA57A, + 0x038F, 0x59F8, 0x0980, 0x1A17, 0x65DA, 0xD731, 0x84C6, 0xD0B8, + 0x82C3, 0x29B0, 0x5A77, 0x1E11, 0x7BCB, 0xA8FC, 0x6DD6, 0x2C3A, +}; + +static inline u16 _S_(u16 v) +{ + u16 t = Sbox[Hi8(v)]; + return Sbox[Lo8(v)] ^ ((t << 8) | (t >> 8)); +} + +#define PHASE1_LOOP_COUNT 8 + +static void tkip_mixing_phase1(u16 * TTAK, const u8 * TK, const u8 * TA, + u32 IV32) +{ + int i, j; + + /* Initialize the 80-bit TTAK from TSC (IV32) and TA[0..5] */ + TTAK[0] = Lo16(IV32); + TTAK[1] = Hi16(IV32); + TTAK[2] = Mk16(TA[1], TA[0]); + TTAK[3] = Mk16(TA[3], TA[2]); + TTAK[4] = Mk16(TA[5], TA[4]); + + for (i = 0; i < PHASE1_LOOP_COUNT; i++) { + j = 2 * (i & 1); + TTAK[0] += _S_(TTAK[4] ^ Mk16(TK[1 + j], TK[0 + j])); + TTAK[1] += _S_(TTAK[0] ^ Mk16(TK[5 + j], TK[4 + j])); + TTAK[2] += _S_(TTAK[1] ^ Mk16(TK[9 + j], TK[8 + j])); + TTAK[3] += _S_(TTAK[2] ^ Mk16(TK[13 + j], TK[12 + j])); + TTAK[4] += _S_(TTAK[3] ^ 
Mk16(TK[1 + j], TK[0 + j])) + i; + } +} + +static void tkip_mixing_phase2(u8 * WEPSeed, const u8 * TK, const u16 * TTAK, + u16 IV16) +{ + /* Make temporary area overlap WEP seed so that the final copy can be + * avoided on little endian hosts. */ + u16 *PPK = (u16 *) & WEPSeed[4]; + + /* Step 1 - make copy of TTAK and bring in TSC */ + PPK[0] = TTAK[0]; + PPK[1] = TTAK[1]; + PPK[2] = TTAK[2]; + PPK[3] = TTAK[3]; + PPK[4] = TTAK[4]; + PPK[5] = TTAK[4] + IV16; + + /* Step 2 - 96-bit bijective mixing using S-box */ + PPK[0] += _S_(PPK[5] ^ Mk16_le((__le16 *) & TK[0])); + PPK[1] += _S_(PPK[0] ^ Mk16_le((__le16 *) & TK[2])); + PPK[2] += _S_(PPK[1] ^ Mk16_le((__le16 *) & TK[4])); + PPK[3] += _S_(PPK[2] ^ Mk16_le((__le16 *) & TK[6])); + PPK[4] += _S_(PPK[3] ^ Mk16_le((__le16 *) & TK[8])); + PPK[5] += _S_(PPK[4] ^ Mk16_le((__le16 *) & TK[10])); + + PPK[0] += RotR1(PPK[5] ^ Mk16_le((__le16 *) & TK[12])); + PPK[1] += RotR1(PPK[0] ^ Mk16_le((__le16 *) & TK[14])); + PPK[2] += RotR1(PPK[1]); + PPK[3] += RotR1(PPK[2]); + PPK[4] += RotR1(PPK[3]); + PPK[5] += RotR1(PPK[4]); + + /* Step 3 - bring in last of TK bits, assign 24-bit WEP IV value + * WEPSeed[0..2] is transmitted as WEP IV */ + WEPSeed[0] = Hi8(IV16); + WEPSeed[1] = (Hi8(IV16) | 0x20) & 0x7F; + WEPSeed[2] = Lo8(IV16); + WEPSeed[3] = Lo8((PPK[5] ^ Mk16_le((__le16 *) & TK[0])) >> 1); + +#ifdef __BIG_ENDIAN + { + int i; + for (i = 0; i < 6; i++) + PPK[i] = (PPK[i] << 8) | (PPK[i] >> 8); + } +#endif +} + +static int lib80211_tkip_hdr(struct sk_buff *skb, int hdr_len, + u8 * rc4key, int keylen, void *priv) +{ + struct lib80211_tkip_data *tkey = priv; + u8 *pos; + struct ieee80211_hdr *hdr; + + hdr = (struct ieee80211_hdr *)skb->data; + + if (skb_headroom(skb) < TKIP_HDR_LEN || skb->len < hdr_len) + return -1; + + if (rc4key == NULL || keylen < 16) + return -1; + + if (!tkey->tx_phase1_done) { + tkip_mixing_phase1(tkey->tx_ttak, tkey->key, hdr->addr2, + tkey->tx_iv32); + tkey->tx_phase1_done = 1; + } + tkip_mixing_phase2(rc4key, tkey->key, tkey->tx_ttak, tkey->tx_iv16); + + pos = skb_push(skb, TKIP_HDR_LEN); + memmove(pos, pos + TKIP_HDR_LEN, hdr_len); + pos += hdr_len; + + *pos++ = *rc4key; + *pos++ = *(rc4key + 1); + *pos++ = *(rc4key + 2); + *pos++ = (tkey->key_idx << 6) | (1 << 5) /* Ext IV included */ ; + *pos++ = tkey->tx_iv32 & 0xff; + *pos++ = (tkey->tx_iv32 >> 8) & 0xff; + *pos++ = (tkey->tx_iv32 >> 16) & 0xff; + *pos++ = (tkey->tx_iv32 >> 24) & 0xff; + + tkey->tx_iv16++; + if (tkey->tx_iv16 == 0) { + tkey->tx_phase1_done = 0; + tkey->tx_iv32++; + } + + return TKIP_HDR_LEN; +} + +static int lib80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv) +{ + struct lib80211_tkip_data *tkey = priv; + struct blkcipher_desc desc = { .tfm = tkey->tx_tfm_arc4 }; + int len; + u8 rc4key[16], *pos, *icv; + u32 crc; + struct scatterlist sg; + + if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) { + if (net_ratelimit()) { + struct ieee80211_hdr *hdr = + (struct ieee80211_hdr *)skb->data; + printk(KERN_DEBUG ": TKIP countermeasures: dropped " + "TX packet to %pM\n", hdr->addr1); + } + return -1; + } + + if (skb_tailroom(skb) < 4 || skb->len < hdr_len) + return -1; + + len = skb->len - hdr_len; + pos = skb->data + hdr_len; + + if ((lib80211_tkip_hdr(skb, hdr_len, rc4key, 16, priv)) < 0) + return -1; + + crc = ~crc32_le(~0, pos, len); + icv = skb_put(skb, 4); + icv[0] = crc; + icv[1] = crc >> 8; + icv[2] = crc >> 16; + icv[3] = crc >> 24; + + crypto_blkcipher_setkey(tkey->tx_tfm_arc4, rc4key, 16); + sg_init_one(&sg, pos, len + 4); + return 
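+ /* encrypt the payload plus the 4-byte ICV in place with the
+ * per-packet RC4 key derived above; arc4 run through the ecb()
+ * template behaves as a plain stream cipher here */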
crypto_blkcipher_encrypt(&desc, &sg, &sg, len + 4); +} + +/* + * deal with seq counter wrapping correctly. + * refer to timer_after() for jiffies wrapping handling + */ +static inline int tkip_replay_check(u32 iv32_n, u16 iv16_n, + u32 iv32_o, u16 iv16_o) +{ + if ((s32)iv32_n - (s32)iv32_o < 0 || + (iv32_n == iv32_o && iv16_n <= iv16_o)) + return 1; + return 0; +} + +static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv) +{ + struct lib80211_tkip_data *tkey = priv; + struct blkcipher_desc desc = { .tfm = tkey->rx_tfm_arc4 }; + u8 rc4key[16]; + u8 keyidx, *pos; + u32 iv32; + u16 iv16; + struct ieee80211_hdr *hdr; + u8 icv[4]; + u32 crc; + struct scatterlist sg; + int plen; + + hdr = (struct ieee80211_hdr *)skb->data; + + if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) { + if (net_ratelimit()) { + printk(KERN_DEBUG ": TKIP countermeasures: dropped " + "received packet from %pM\n", hdr->addr2); + } + return -1; + } + + if (skb->len < hdr_len + TKIP_HDR_LEN + 4) + return -1; + + pos = skb->data + hdr_len; + keyidx = pos[3]; + if (!(keyidx & (1 << 5))) { + if (net_ratelimit()) { + printk(KERN_DEBUG "TKIP: received packet without ExtIV" + " flag from %pM\n", hdr->addr2); + } + return -2; + } + keyidx >>= 6; + if (tkey->key_idx != keyidx) { + printk(KERN_DEBUG "TKIP: RX tkey->key_idx=%d frame " + "keyidx=%d priv=%p\n", tkey->key_idx, keyidx, priv); + return -6; + } + if (!tkey->key_set) { + if (net_ratelimit()) { + printk(KERN_DEBUG "TKIP: received packet from %pM" + " with keyid=%d that does not have a configured" + " key\n", hdr->addr2, keyidx); + } + return -3; + } + iv16 = (pos[0] << 8) | pos[2]; + iv32 = pos[4] | (pos[5] << 8) | (pos[6] << 16) | (pos[7] << 24); + pos += TKIP_HDR_LEN; + + if (tkip_replay_check(iv32, iv16, tkey->rx_iv32, tkey->rx_iv16)) { +#ifdef CONFIG_LIB80211_DEBUG + if (net_ratelimit()) { + printk(KERN_DEBUG "TKIP: replay detected: STA=%pM" + " previous TSC %08x%04x received TSC " + "%08x%04x\n", hdr->addr2, + tkey->rx_iv32, tkey->rx_iv16, iv32, iv16); + } +#endif + tkey->dot11RSNAStatsTKIPReplays++; + return -4; + } + + if (iv32 != tkey->rx_iv32 || !tkey->rx_phase1_done) { + tkip_mixing_phase1(tkey->rx_ttak, tkey->key, hdr->addr2, iv32); + tkey->rx_phase1_done = 1; + } + tkip_mixing_phase2(rc4key, tkey->key, tkey->rx_ttak, iv16); + + plen = skb->len - hdr_len - 12; + + crypto_blkcipher_setkey(tkey->rx_tfm_arc4, rc4key, 16); + sg_init_one(&sg, pos, plen + 4); + if (crypto_blkcipher_decrypt(&desc, &sg, &sg, plen + 4)) { + if (net_ratelimit()) { + printk(KERN_DEBUG ": TKIP: failed to decrypt " + "received packet from %pM\n", + hdr->addr2); + } + return -7; + } + + crc = ~crc32_le(~0, pos, plen); + icv[0] = crc; + icv[1] = crc >> 8; + icv[2] = crc >> 16; + icv[3] = crc >> 24; + if (memcmp(icv, pos + plen, 4) != 0) { + if (iv32 != tkey->rx_iv32) { + /* Previously cached Phase1 result was already lost, so + * it needs to be recalculated for the next packet. 
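+ * (rx_ttak was overwritten above with phase1 output for an IV32
+ * that has now failed the ICV check, so it cannot be reused)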
*/ + tkey->rx_phase1_done = 0; + } +#ifdef CONFIG_LIB80211_DEBUG + if (net_ratelimit()) { + printk(KERN_DEBUG "TKIP: ICV error detected: STA=" + "%pM\n", hdr->addr2); + } +#endif + tkey->dot11RSNAStatsTKIPICVErrors++; + return -5; + } + + /* Update real counters only after Michael MIC verification has + * completed */ + tkey->rx_iv32_new = iv32; + tkey->rx_iv16_new = iv16; + + /* Remove IV and ICV */ + memmove(skb->data + TKIP_HDR_LEN, skb->data, hdr_len); + skb_pull(skb, TKIP_HDR_LEN); + skb_trim(skb, skb->len - 4); + + return keyidx; +} + +static int michael_mic(struct crypto_hash *tfm_michael, u8 * key, u8 * hdr, + u8 * data, size_t data_len, u8 * mic) +{ + struct hash_desc desc; + struct scatterlist sg[2]; + + if (tfm_michael == NULL) { + pr_warn("%s(): tfm_michael == NULL\n", __func__); + return -1; + } + sg_init_table(sg, 2); + sg_set_buf(&sg[0], hdr, 16); + sg_set_buf(&sg[1], data, data_len); + + if (crypto_hash_setkey(tfm_michael, key, 8)) + return -1; + + desc.tfm = tfm_michael; + desc.flags = 0; + return crypto_hash_digest(&desc, sg, data_len + 16, mic); +} + +static void michael_mic_hdr(struct sk_buff *skb, u8 * hdr) +{ + struct ieee80211_hdr *hdr11; + + hdr11 = (struct ieee80211_hdr *)skb->data; + + switch (le16_to_cpu(hdr11->frame_control) & + (IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS)) { + case IEEE80211_FCTL_TODS: + memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */ + memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */ + break; + case IEEE80211_FCTL_FROMDS: + memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */ + memcpy(hdr + ETH_ALEN, hdr11->addr3, ETH_ALEN); /* SA */ + break; + case IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS: + memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */ + memcpy(hdr + ETH_ALEN, hdr11->addr4, ETH_ALEN); /* SA */ + break; + case 0: + memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */ + memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */ + break; + } + + if (ieee80211_is_data_qos(hdr11->frame_control)) { + hdr[12] = le16_to_cpu(*((__le16 *)ieee80211_get_qos_ctl(hdr11))) + & IEEE80211_QOS_CTL_TID_MASK; + } else + hdr[12] = 0; /* priority */ + + hdr[13] = hdr[14] = hdr[15] = 0; /* reserved */ +} + +static int lib80211_michael_mic_add(struct sk_buff *skb, int hdr_len, + void *priv) +{ + struct lib80211_tkip_data *tkey = priv; + u8 *pos; + + if (skb_tailroom(skb) < 8 || skb->len < hdr_len) { + printk(KERN_DEBUG "Invalid packet for Michael MIC add " + "(tailroom=%d hdr_len=%d skb->len=%d)\n", + skb_tailroom(skb), hdr_len, skb->len); + return -1; + } + + michael_mic_hdr(skb, tkey->tx_hdr); + pos = skb_put(skb, 8); + if (michael_mic(tkey->tx_tfm_michael, &tkey->key[16], tkey->tx_hdr, + skb->data + hdr_len, skb->len - 8 - hdr_len, pos)) + return -1; + + return 0; +} + +static void lib80211_michael_mic_failure(struct net_device *dev, + struct ieee80211_hdr *hdr, + int keyidx) +{ + union iwreq_data wrqu; + struct iw_michaelmicfailure ev; + + /* TODO: needed parameters: count, keyid, key type, TSC */ + memset(&ev, 0, sizeof(ev)); + ev.flags = keyidx & IW_MICFAILURE_KEY_ID; + if (hdr->addr1[0] & 0x01) + ev.flags |= IW_MICFAILURE_GROUP; + else + ev.flags |= IW_MICFAILURE_PAIRWISE; + ev.src_addr.sa_family = ARPHRD_ETHER; + memcpy(ev.src_addr.sa_data, hdr->addr2, ETH_ALEN); + memset(&wrqu, 0, sizeof(wrqu)); + wrqu.data.length = sizeof(ev); + wireless_send_event(dev, IWEVMICHAELMICFAILURE, &wrqu, (char *)&ev); +} + +static int lib80211_michael_mic_verify(struct sk_buff *skb, int keyidx, + int hdr_len, void *priv) +{ + struct lib80211_tkip_data *tkey = priv; + u8 mic[8]; + + 
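+ /*
+ * Recompute the Michael MIC over the pseudo-header (DA, SA,
+ * priority) and the MSDU payload, compare it against the 8 bytes
+ * trailing the frame, and on mismatch raise a Michael MIC failure
+ * event so that TKIP countermeasures can be triggered.
+ */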
if (!tkey->key_set) + return -1; + + michael_mic_hdr(skb, tkey->rx_hdr); + if (michael_mic(tkey->rx_tfm_michael, &tkey->key[24], tkey->rx_hdr, + skb->data + hdr_len, skb->len - 8 - hdr_len, mic)) + return -1; + if (memcmp(mic, skb->data + skb->len - 8, 8) != 0) { + struct ieee80211_hdr *hdr; + hdr = (struct ieee80211_hdr *)skb->data; + printk(KERN_DEBUG "%s: Michael MIC verification failed for " + "MSDU from %pM keyidx=%d\n", + skb->dev ? skb->dev->name : "N/A", hdr->addr2, + keyidx); + if (skb->dev) + lib80211_michael_mic_failure(skb->dev, hdr, keyidx); + tkey->dot11RSNAStatsTKIPLocalMICFailures++; + return -1; + } + + /* Update TSC counters for RX now that the packet verification has + * completed. */ + tkey->rx_iv32 = tkey->rx_iv32_new; + tkey->rx_iv16 = tkey->rx_iv16_new; + + skb_trim(skb, skb->len - 8); + + return 0; +} + +static int lib80211_tkip_set_key(void *key, int len, u8 * seq, void *priv) +{ + struct lib80211_tkip_data *tkey = priv; + int keyidx; + struct crypto_hash *tfm = tkey->tx_tfm_michael; + struct crypto_blkcipher *tfm2 = tkey->tx_tfm_arc4; + struct crypto_hash *tfm3 = tkey->rx_tfm_michael; + struct crypto_blkcipher *tfm4 = tkey->rx_tfm_arc4; + + keyidx = tkey->key_idx; + memset(tkey, 0, sizeof(*tkey)); + tkey->key_idx = keyidx; + tkey->tx_tfm_michael = tfm; + tkey->tx_tfm_arc4 = tfm2; + tkey->rx_tfm_michael = tfm3; + tkey->rx_tfm_arc4 = tfm4; + if (len == TKIP_KEY_LEN) { + memcpy(tkey->key, key, TKIP_KEY_LEN); + tkey->key_set = 1; + tkey->tx_iv16 = 1; /* TSC is initialized to 1 */ + if (seq) { + tkey->rx_iv32 = (seq[5] << 24) | (seq[4] << 16) | + (seq[3] << 8) | seq[2]; + tkey->rx_iv16 = (seq[1] << 8) | seq[0]; + } + } else if (len == 0) + tkey->key_set = 0; + else + return -1; + + return 0; +} + +static int lib80211_tkip_get_key(void *key, int len, u8 * seq, void *priv) +{ + struct lib80211_tkip_data *tkey = priv; + + if (len < TKIP_KEY_LEN) + return -1; + + if (!tkey->key_set) + return 0; + memcpy(key, tkey->key, TKIP_KEY_LEN); + + if (seq) { + /* Return the sequence number of the last transmitted frame. 
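+ * tx_iv16 has already been advanced past the last frame sent, hence
+ * the decremented iv16/iv32 copies below; note that the seq[] bytes
+ * are still filled from the raw tx_iv16/tx_iv32 counters.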
*/ + u16 iv16 = tkey->tx_iv16; + u32 iv32 = tkey->tx_iv32; + if (iv16 == 0) + iv32--; + iv16--; + seq[0] = tkey->tx_iv16; + seq[1] = tkey->tx_iv16 >> 8; + seq[2] = tkey->tx_iv32; + seq[3] = tkey->tx_iv32 >> 8; + seq[4] = tkey->tx_iv32 >> 16; + seq[5] = tkey->tx_iv32 >> 24; + } + + return TKIP_KEY_LEN; +} + +static char *lib80211_tkip_print_stats(char *p, void *priv) +{ + struct lib80211_tkip_data *tkip = priv; + p += sprintf(p, "key[%d] alg=TKIP key_set=%d " + "tx_pn=%02x%02x%02x%02x%02x%02x " + "rx_pn=%02x%02x%02x%02x%02x%02x " + "replays=%d icv_errors=%d local_mic_failures=%d\n", + tkip->key_idx, tkip->key_set, + (tkip->tx_iv32 >> 24) & 0xff, + (tkip->tx_iv32 >> 16) & 0xff, + (tkip->tx_iv32 >> 8) & 0xff, + tkip->tx_iv32 & 0xff, + (tkip->tx_iv16 >> 8) & 0xff, + tkip->tx_iv16 & 0xff, + (tkip->rx_iv32 >> 24) & 0xff, + (tkip->rx_iv32 >> 16) & 0xff, + (tkip->rx_iv32 >> 8) & 0xff, + tkip->rx_iv32 & 0xff, + (tkip->rx_iv16 >> 8) & 0xff, + tkip->rx_iv16 & 0xff, + tkip->dot11RSNAStatsTKIPReplays, + tkip->dot11RSNAStatsTKIPICVErrors, + tkip->dot11RSNAStatsTKIPLocalMICFailures); + return p; +} + +static struct lib80211_crypto_ops lib80211_crypt_tkip = { + .name = "TKIP", + .init = lib80211_tkip_init, + .deinit = lib80211_tkip_deinit, + .encrypt_mpdu = lib80211_tkip_encrypt, + .decrypt_mpdu = lib80211_tkip_decrypt, + .encrypt_msdu = lib80211_michael_mic_add, + .decrypt_msdu = lib80211_michael_mic_verify, + .set_key = lib80211_tkip_set_key, + .get_key = lib80211_tkip_get_key, + .print_stats = lib80211_tkip_print_stats, + .extra_mpdu_prefix_len = 4 + 4, /* IV + ExtIV */ + .extra_mpdu_postfix_len = 4, /* ICV */ + .extra_msdu_postfix_len = 8, /* MIC */ + .get_flags = lib80211_tkip_get_flags, + .set_flags = lib80211_tkip_set_flags, + .owner = THIS_MODULE, +}; + +static int __init lib80211_crypto_tkip_init(void) +{ + return lib80211_register_crypto_ops(&lib80211_crypt_tkip); +} + +static void __exit lib80211_crypto_tkip_exit(void) +{ + lib80211_unregister_crypto_ops(&lib80211_crypt_tkip); +} + +module_init(lib80211_crypto_tkip_init); +module_exit(lib80211_crypto_tkip_exit); diff --git a/net/wireless/lib80211_crypt_wep.c b/net/wireless/lib80211_crypt_wep.c new file mode 100644 index 00000000..2f265e03 --- /dev/null +++ b/net/wireless/lib80211_crypt_wep.c @@ -0,0 +1,294 @@ +/* + * lib80211 crypt: host-based WEP encryption implementation for lib80211 + * + * Copyright (c) 2002-2004, Jouni Malinen + * Copyright (c) 2008, John W. Linville + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. See README and COPYING for + * more details. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +MODULE_AUTHOR("Jouni Malinen"); +MODULE_DESCRIPTION("lib80211 crypt: WEP"); +MODULE_LICENSE("GPL"); + +struct lib80211_wep_data { + u32 iv; +#define WEP_KEY_LEN 13 + u8 key[WEP_KEY_LEN + 1]; + u8 key_len; + u8 key_idx; + struct crypto_blkcipher *tx_tfm; + struct crypto_blkcipher *rx_tfm; +}; + +static void *lib80211_wep_init(int keyidx) +{ + struct lib80211_wep_data *priv; + + priv = kzalloc(sizeof(*priv), GFP_ATOMIC); + if (priv == NULL) + goto fail; + priv->key_idx = keyidx; + + priv->tx_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(priv->tx_tfm)) { + printk(KERN_DEBUG "lib80211_crypt_wep: could not allocate " + "crypto API arc4\n"); + priv->tx_tfm = NULL; + goto fail; + } + + priv->rx_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(priv->rx_tfm)) { + printk(KERN_DEBUG "lib80211_crypt_wep: could not allocate " + "crypto API arc4\n"); + priv->rx_tfm = NULL; + goto fail; + } + /* start WEP IV from a random value */ + get_random_bytes(&priv->iv, 4); + + return priv; + + fail: + if (priv) { + if (priv->tx_tfm) + crypto_free_blkcipher(priv->tx_tfm); + if (priv->rx_tfm) + crypto_free_blkcipher(priv->rx_tfm); + kfree(priv); + } + return NULL; +} + +static void lib80211_wep_deinit(void *priv) +{ + struct lib80211_wep_data *_priv = priv; + if (_priv) { + if (_priv->tx_tfm) + crypto_free_blkcipher(_priv->tx_tfm); + if (_priv->rx_tfm) + crypto_free_blkcipher(_priv->rx_tfm); + } + kfree(priv); +} + +/* Add WEP IV/key info to a frame that has at least 4 bytes of headroom */ +static int lib80211_wep_build_iv(struct sk_buff *skb, int hdr_len, + u8 *key, int keylen, void *priv) +{ + struct lib80211_wep_data *wep = priv; + u32 klen; + u8 *pos; + + if (skb_headroom(skb) < 4 || skb->len < hdr_len) + return -1; + + pos = skb_push(skb, 4); + memmove(pos, pos + 4, hdr_len); + pos += hdr_len; + + klen = 3 + wep->key_len; + + wep->iv++; + + /* Fluhrer, Mantin, and Shamir have reported weaknesses in the key + * scheduling algorithm of RC4. At least IVs (KeyByte + 3, 0xff, N) + * can be used to speedup attacks, so avoid using them. */ + if ((wep->iv & 0xff00) == 0xff00) { + u8 B = (wep->iv >> 16) & 0xff; + if (B >= 3 && B < klen) + wep->iv += 0x0100; + } + + /* Prepend 24-bit IV to RC4 key and TX frame */ + *pos++ = (wep->iv >> 16) & 0xff; + *pos++ = (wep->iv >> 8) & 0xff; + *pos++ = wep->iv & 0xff; + *pos++ = wep->key_idx << 6; + + return 0; +} + +/* Perform WEP encryption on given skb that has at least 4 bytes of headroom + * for IV and 4 bytes of tailroom for ICV. Both IV and ICV will be transmitted, + * so the payload length increases with 8 bytes. 
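+ * (a 4-byte IV/key-index header is prepended in the clear and a
+ * 4-byte CRC-32 ICV is appended and encrypted together with the data)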
+ * + * WEP frame payload: IV + TX key idx, RC4(data), ICV = RC4(CRC32(data)) + */ +static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv) +{ + struct lib80211_wep_data *wep = priv; + struct blkcipher_desc desc = { .tfm = wep->tx_tfm }; + u32 crc, klen, len; + u8 *pos, *icv; + struct scatterlist sg; + u8 key[WEP_KEY_LEN + 3]; + + /* other checks are in lib80211_wep_build_iv */ + if (skb_tailroom(skb) < 4) + return -1; + + /* add the IV to the frame */ + if (lib80211_wep_build_iv(skb, hdr_len, NULL, 0, priv)) + return -1; + + /* Copy the IV into the first 3 bytes of the key */ + skb_copy_from_linear_data_offset(skb, hdr_len, key, 3); + + /* Copy rest of the WEP key (the secret part) */ + memcpy(key + 3, wep->key, wep->key_len); + + len = skb->len - hdr_len - 4; + pos = skb->data + hdr_len + 4; + klen = 3 + wep->key_len; + + /* Append little-endian CRC32 over only the data and encrypt it to produce ICV */ + crc = ~crc32_le(~0, pos, len); + icv = skb_put(skb, 4); + icv[0] = crc; + icv[1] = crc >> 8; + icv[2] = crc >> 16; + icv[3] = crc >> 24; + + crypto_blkcipher_setkey(wep->tx_tfm, key, klen); + sg_init_one(&sg, pos, len + 4); + return crypto_blkcipher_encrypt(&desc, &sg, &sg, len + 4); +} + +/* Perform WEP decryption on given buffer. Buffer includes whole WEP part of + * the frame: IV (4 bytes), encrypted payload (including SNAP header), + * ICV (4 bytes). len includes both IV and ICV. + * + * Returns 0 if frame was decrypted successfully and ICV was correct and -1 on + * failure. If frame is OK, IV and ICV will be removed. + */ +static int lib80211_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv) +{ + struct lib80211_wep_data *wep = priv; + struct blkcipher_desc desc = { .tfm = wep->rx_tfm }; + u32 crc, klen, plen; + u8 key[WEP_KEY_LEN + 3]; + u8 keyidx, *pos, icv[4]; + struct scatterlist sg; + + if (skb->len < hdr_len + 8) + return -1; + + pos = skb->data + hdr_len; + key[0] = *pos++; + key[1] = *pos++; + key[2] = *pos++; + keyidx = *pos++ >> 6; + if (keyidx != wep->key_idx) + return -1; + + klen = 3 + wep->key_len; + + /* Copy rest of the WEP key (the secret part) */ + memcpy(key + 3, wep->key, wep->key_len); + + /* Apply RC4 to data and compute CRC32 over decrypted data */ + plen = skb->len - hdr_len - 8; + + crypto_blkcipher_setkey(wep->rx_tfm, key, klen); + sg_init_one(&sg, pos, plen + 4); + if (crypto_blkcipher_decrypt(&desc, &sg, &sg, plen + 4)) + return -7; + + crc = ~crc32_le(~0, pos, plen); + icv[0] = crc; + icv[1] = crc >> 8; + icv[2] = crc >> 16; + icv[3] = crc >> 24; + if (memcmp(icv, pos + plen, 4) != 0) { + /* ICV mismatch - drop frame */ + return -2; + } + + /* Remove IV and ICV */ + memmove(skb->data + 4, skb->data, hdr_len); + skb_pull(skb, 4); + skb_trim(skb, skb->len - 4); + + return 0; +} + +static int lib80211_wep_set_key(void *key, int len, u8 * seq, void *priv) +{ + struct lib80211_wep_data *wep = priv; + + if (len < 0 || len > WEP_KEY_LEN) + return -1; + + memcpy(wep->key, key, len); + wep->key_len = len; + + return 0; +} + +static int lib80211_wep_get_key(void *key, int len, u8 * seq, void *priv) +{ + struct lib80211_wep_data *wep = priv; + + if (len < wep->key_len) + return -1; + + memcpy(key, wep->key, wep->key_len); + + return wep->key_len; +} + +static char *lib80211_wep_print_stats(char *p, void *priv) +{ + struct lib80211_wep_data *wep = priv; + p += sprintf(p, "key[%d] alg=WEP len=%d\n", wep->key_idx, wep->key_len); + return p; +} + +static struct lib80211_crypto_ops lib80211_crypt_wep = { + .name = "WEP", + .init = 
lib80211_wep_init, + .deinit = lib80211_wep_deinit, + .encrypt_mpdu = lib80211_wep_encrypt, + .decrypt_mpdu = lib80211_wep_decrypt, + .encrypt_msdu = NULL, + .decrypt_msdu = NULL, + .set_key = lib80211_wep_set_key, + .get_key = lib80211_wep_get_key, + .print_stats = lib80211_wep_print_stats, + .extra_mpdu_prefix_len = 4, /* IV */ + .extra_mpdu_postfix_len = 4, /* ICV */ + .owner = THIS_MODULE, +}; + +static int __init lib80211_crypto_wep_init(void) +{ + return lib80211_register_crypto_ops(&lib80211_crypt_wep); +} + +static void __exit lib80211_crypto_wep_exit(void) +{ + lib80211_unregister_crypto_ops(&lib80211_crypt_wep); +} + +module_init(lib80211_crypto_wep_init); +module_exit(lib80211_crypto_wep_exit); diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c new file mode 100644 index 00000000..5c116083 --- /dev/null +++ b/net/wireless/mesh.c @@ -0,0 +1,161 @@ +#include +#include +#include "nl80211.h" +#include "core.h" + +/* Default values, timeouts in ms */ +#define MESH_TTL 31 +#define MESH_DEFAULT_ELEMENT_TTL 31 +#define MESH_MAX_RETR 3 +#define MESH_RET_T 100 +#define MESH_CONF_T 100 +#define MESH_HOLD_T 100 + +#define MESH_PATH_TIMEOUT 5000 + +/* + * Minimum interval between two consecutive PREQs originated by the same + * interface + */ +#define MESH_PREQ_MIN_INT 10 +#define MESH_DIAM_TRAVERSAL_TIME 50 + +/* + * A path will be refreshed if it is used PATH_REFRESH_TIME milliseconds + * before timing out. This way it will remain ACTIVE and no data frames + * will be unnecessarily held in the pending queue. + */ +#define MESH_PATH_REFRESH_TIME 1000 +#define MESH_MIN_DISCOVERY_TIMEOUT (2 * MESH_DIAM_TRAVERSAL_TIME) + +/* Default maximum number of established plinks per interface */ +#define MESH_MAX_ESTAB_PLINKS 32 + +#define MESH_MAX_PREQ_RETRIES 4 + + +const struct mesh_config default_mesh_config = { + .dot11MeshRetryTimeout = MESH_RET_T, + .dot11MeshConfirmTimeout = MESH_CONF_T, + .dot11MeshHoldingTimeout = MESH_HOLD_T, + .dot11MeshMaxRetries = MESH_MAX_RETR, + .dot11MeshTTL = MESH_TTL, + .element_ttl = MESH_DEFAULT_ELEMENT_TTL, + .auto_open_plinks = true, + .dot11MeshMaxPeerLinks = MESH_MAX_ESTAB_PLINKS, + .dot11MeshHWMPactivePathTimeout = MESH_PATH_TIMEOUT, + .dot11MeshHWMPpreqMinInterval = MESH_PREQ_MIN_INT, + .dot11MeshHWMPnetDiameterTraversalTime = MESH_DIAM_TRAVERSAL_TIME, + .dot11MeshHWMPmaxPREQretries = MESH_MAX_PREQ_RETRIES, + .path_refresh_time = MESH_PATH_REFRESH_TIME, + .min_discovery_timeout = MESH_MIN_DISCOVERY_TIMEOUT, +}; + +const struct mesh_setup default_mesh_setup = { + .path_sel_proto = IEEE80211_PATH_PROTOCOL_HWMP, + .path_metric = IEEE80211_PATH_METRIC_AIRTIME, + .ie = NULL, + .ie_len = 0, + .is_secure = false, +}; + +int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, + struct net_device *dev, + const struct mesh_setup *setup, + const struct mesh_config *conf) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + int err; + + BUILD_BUG_ON(IEEE80211_MAX_SSID_LEN != IEEE80211_MAX_MESH_ID_LEN); + + ASSERT_WDEV_LOCK(wdev); + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) + return -EOPNOTSUPP; + + if (!(rdev->wiphy.flags & WIPHY_FLAG_MESH_AUTH) && + setup->is_secure) + return -EOPNOTSUPP; + + if (wdev->mesh_id_len) + return -EALREADY; + + if (!setup->mesh_id_len) + return -EINVAL; + + if (!rdev->ops->join_mesh) + return -EOPNOTSUPP; + + err = rdev->ops->join_mesh(&rdev->wiphy, dev, conf, setup); + if (!err) { + memcpy(wdev->ssid, setup->mesh_id, setup->mesh_id_len); + wdev->mesh_id_len = setup->mesh_id_len; + } + + return err; +} + 
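Editor's note: the WEP routines in lib80211_crypt_wep.c earlier in this patch build the ICV as a little-endian CRC-32 over the plaintext payload and then run RC4 over payload + ICV. The short userspace sketch below illustrates only that ICV step and is not kernel code: crc32_le() here is a local stand-in for the kernel helper of the same name, and wep_build_icv() plus the example payload are made-up names used purely for illustration.

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

/* Stand-in for the kernel's crc32_le(): reflected CRC-32, poly 0xEDB88320. */
static uint32_t crc32_le(uint32_t crc, const uint8_t *p, size_t len)
{
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0xEDB88320u & -(crc & 1u));
	}
	return crc;
}

/* Mirrors the ICV construction in lib80211_wep_encrypt(): the CRC is
 * computed over the plaintext only and stored least-significant byte
 * first; the RC4 pass over payload + ICV is omitted here. */
static void wep_build_icv(const uint8_t *data, size_t len, uint8_t icv[4])
{
	uint32_t crc = ~crc32_le(~0u, data, len);

	icv[0] = crc;
	icv[1] = crc >> 8;
	icv[2] = crc >> 16;
	icv[3] = crc >> 24;
}

int main(void)
{
	const uint8_t payload[] = "123456789";	/* arbitrary example payload */
	uint8_t icv[4];

	wep_build_icv(payload, sizeof(payload) - 1, icv);
	/* CRC-32("123456789") is 0xCBF43926, so this prints: ICV: 26 39 f4 cb */
	printf("ICV: %02x %02x %02x %02x\n", icv[0], icv[1], icv[2], icv[3]);
	return 0;
}

Running this prints the well-known CRC-32 check value 0xCBF43926 serialized LSB-first, which is exactly the byte order lib80211_wep_decrypt() compares against with memcmp() after the RC4 pass.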
+int cfg80211_join_mesh(struct cfg80211_registered_device *rdev, + struct net_device *dev, + const struct mesh_setup *setup, + const struct mesh_config *conf) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + int err; + + wdev_lock(wdev); + err = __cfg80211_join_mesh(rdev, dev, setup, conf); + wdev_unlock(wdev); + + return err; +} + +void cfg80211_notify_new_peer_candidate(struct net_device *dev, + const u8 *macaddr, const u8* ie, u8 ie_len, gfp_t gfp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_MESH_POINT)) + return; + + nl80211_send_new_peer_candidate(wiphy_to_dev(wdev->wiphy), dev, + macaddr, ie, ie_len, gfp); +} +EXPORT_SYMBOL(cfg80211_notify_new_peer_candidate); + +static int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, + struct net_device *dev) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + int err; + + ASSERT_WDEV_LOCK(wdev); + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) + return -EOPNOTSUPP; + + if (!rdev->ops->leave_mesh) + return -EOPNOTSUPP; + + if (!wdev->mesh_id_len) + return -ENOTCONN; + + err = rdev->ops->leave_mesh(&rdev->wiphy, dev); + if (!err) + wdev->mesh_id_len = 0; + return err; +} + +int cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, + struct net_device *dev) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + int err; + + wdev_lock(wdev); + err = __cfg80211_leave_mesh(rdev, dev); + wdev_unlock(wdev); + + return err; +} diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c new file mode 100644 index 00000000..493b9399 --- /dev/null +++ b/net/wireless/mlme.c @@ -0,0 +1,1084 @@ +/* + * cfg80211 MLME SAP interface + * + * Copyright (c) 2009, Jouni Malinen + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "core.h" +#include "nl80211.h" + +void cfg80211_send_rx_auth(struct net_device *dev, const u8 *buf, size_t len) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; + u8 *bssid = mgmt->bssid; + int i; + u16 status = le16_to_cpu(mgmt->u.auth.status_code); + bool done = false; + + wdev_lock(wdev); + + for (i = 0; i < MAX_AUTH_BSSES; i++) { + if (wdev->authtry_bsses[i] && + memcmp(wdev->authtry_bsses[i]->pub.bssid, bssid, + ETH_ALEN) == 0) { + if (status == WLAN_STATUS_SUCCESS) { + wdev->auth_bsses[i] = wdev->authtry_bsses[i]; + } else { + cfg80211_unhold_bss(wdev->authtry_bsses[i]); + cfg80211_put_bss(&wdev->authtry_bsses[i]->pub); + } + wdev->authtry_bsses[i] = NULL; + done = true; + break; + } + } + + if (done) { + nl80211_send_rx_auth(rdev, dev, buf, len, GFP_KERNEL); + cfg80211_sme_rx_auth(dev, buf, len); + } + + wdev_unlock(wdev); +} +EXPORT_SYMBOL(cfg80211_send_rx_auth); + +void cfg80211_send_rx_assoc(struct net_device *dev, const u8 *buf, size_t len) +{ + u16 status_code; + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; + u8 *ie = mgmt->u.assoc_resp.variable; + int i, ieoffs = offsetof(struct ieee80211_mgmt, u.assoc_resp.variable); + struct cfg80211_internal_bss *bss = NULL; + + wdev_lock(wdev); + + status_code = le16_to_cpu(mgmt->u.assoc_resp.status_code); + + /* + * This is a bit of a hack, we don't notify userspace of + * a (re-)association reply if we tried to send a 
reassoc + * and got a reject -- we only try again with an assoc + * frame instead of reassoc. + */ + if (status_code != WLAN_STATUS_SUCCESS && wdev->conn && + cfg80211_sme_failed_reassoc(wdev)) + goto out; + + nl80211_send_rx_assoc(rdev, dev, buf, len, GFP_KERNEL); + + if (status_code == WLAN_STATUS_SUCCESS) { + for (i = 0; i < MAX_AUTH_BSSES; i++) { + if (!wdev->auth_bsses[i]) + continue; + if (memcmp(wdev->auth_bsses[i]->pub.bssid, mgmt->bssid, + ETH_ALEN) == 0) { + bss = wdev->auth_bsses[i]; + wdev->auth_bsses[i] = NULL; + /* additional reference to drop hold */ + cfg80211_ref_bss(bss); + break; + } + } + + /* + * We might be coming here because the driver reported + * a successful association at the same time as the + * user requested a deauth. In that case, we will have + * removed the BSS from the auth_bsses list due to the + * deauth request when the assoc response makes it. If + * the two code paths acquire the lock the other way + * around, that's just the standard situation of a + * deauth being requested while connected. + */ + if (!bss) + goto out; + } else if (wdev->conn) { + cfg80211_sme_failed_assoc(wdev); + /* + * do not call connect_result() now because the + * sme will schedule work that does it later. + */ + goto out; + } + + if (!wdev->conn && wdev->sme_state == CFG80211_SME_IDLE) { + /* + * This is for the userspace SME, the CONNECTING + * state will be changed to CONNECTED by + * __cfg80211_connect_result() below. + */ + wdev->sme_state = CFG80211_SME_CONNECTING; + } + + /* this consumes one bss reference (unless bss is NULL) */ + __cfg80211_connect_result(dev, mgmt->bssid, NULL, 0, ie, len - ieoffs, + status_code, + status_code == WLAN_STATUS_SUCCESS, + bss ? &bss->pub : NULL); + /* drop hold now, and also reference acquired above */ + if (bss) { + cfg80211_unhold_bss(bss); + cfg80211_put_bss(&bss->pub); + } + + out: + wdev_unlock(wdev); +} +EXPORT_SYMBOL(cfg80211_send_rx_assoc); + +void __cfg80211_send_deauth(struct net_device *dev, + const u8 *buf, size_t len) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; + const u8 *bssid = mgmt->bssid; + int i; + bool found = false, was_current = false; + + ASSERT_WDEV_LOCK(wdev); + + if (wdev->current_bss && + memcmp(wdev->current_bss->pub.bssid, bssid, ETH_ALEN) == 0) { + cfg80211_unhold_bss(wdev->current_bss); + cfg80211_put_bss(&wdev->current_bss->pub); + wdev->current_bss = NULL; + found = true; + was_current = true; + } else for (i = 0; i < MAX_AUTH_BSSES; i++) { + if (wdev->auth_bsses[i] && + memcmp(wdev->auth_bsses[i]->pub.bssid, bssid, ETH_ALEN) == 0) { + cfg80211_unhold_bss(wdev->auth_bsses[i]); + cfg80211_put_bss(&wdev->auth_bsses[i]->pub); + wdev->auth_bsses[i] = NULL; + found = true; + break; + } + if (wdev->authtry_bsses[i] && + memcmp(wdev->authtry_bsses[i]->pub.bssid, bssid, ETH_ALEN) == 0) { + cfg80211_unhold_bss(wdev->authtry_bsses[i]); + cfg80211_put_bss(&wdev->authtry_bsses[i]->pub); + wdev->authtry_bsses[i] = NULL; + found = true; + break; + } + } + + if (!found) + return; + + nl80211_send_deauth(rdev, dev, buf, len, GFP_KERNEL); + + if (wdev->sme_state == CFG80211_SME_CONNECTED && was_current) { + u16 reason_code; + bool from_ap; + + reason_code = le16_to_cpu(mgmt->u.deauth.reason_code); + + from_ap = memcmp(mgmt->sa, dev->dev_addr, ETH_ALEN) != 0; + __cfg80211_disconnected(dev, NULL, 0, reason_code, from_ap); + } else if (wdev->sme_state 
== CFG80211_SME_CONNECTING) { + __cfg80211_connect_result(dev, mgmt->bssid, NULL, 0, NULL, 0, + WLAN_STATUS_UNSPECIFIED_FAILURE, + false, NULL); + } +} +EXPORT_SYMBOL(__cfg80211_send_deauth); + +void cfg80211_send_deauth(struct net_device *dev, const u8 *buf, size_t len) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + wdev_lock(wdev); + __cfg80211_send_deauth(dev, buf, len); + wdev_unlock(wdev); +} +EXPORT_SYMBOL(cfg80211_send_deauth); + +void __cfg80211_send_disassoc(struct net_device *dev, + const u8 *buf, size_t len) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; + const u8 *bssid = mgmt->bssid; + int i; + u16 reason_code; + bool from_ap; + bool done = false; + + ASSERT_WDEV_LOCK(wdev); + + nl80211_send_disassoc(rdev, dev, buf, len, GFP_KERNEL); + + if (wdev->sme_state != CFG80211_SME_CONNECTED) + return; + + if (wdev->current_bss && + memcmp(wdev->current_bss->pub.bssid, bssid, ETH_ALEN) == 0) { + for (i = 0; i < MAX_AUTH_BSSES; i++) { + if (wdev->authtry_bsses[i] || wdev->auth_bsses[i]) + continue; + wdev->auth_bsses[i] = wdev->current_bss; + wdev->current_bss = NULL; + done = true; + cfg80211_sme_disassoc(dev, i); + break; + } + WARN_ON(!done); + } else + WARN_ON(1); + + + reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code); + + from_ap = memcmp(mgmt->sa, dev->dev_addr, ETH_ALEN) != 0; + __cfg80211_disconnected(dev, NULL, 0, reason_code, from_ap); +} +EXPORT_SYMBOL(__cfg80211_send_disassoc); + +void cfg80211_send_disassoc(struct net_device *dev, const u8 *buf, size_t len) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + wdev_lock(wdev); + __cfg80211_send_disassoc(dev, buf, len); + wdev_unlock(wdev); +} +EXPORT_SYMBOL(cfg80211_send_disassoc); + +void cfg80211_send_unprot_deauth(struct net_device *dev, const u8 *buf, + size_t len) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + nl80211_send_unprot_deauth(rdev, dev, buf, len, GFP_ATOMIC); +} +EXPORT_SYMBOL(cfg80211_send_unprot_deauth); + +void cfg80211_send_unprot_disassoc(struct net_device *dev, const u8 *buf, + size_t len) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + nl80211_send_unprot_disassoc(rdev, dev, buf, len, GFP_ATOMIC); +} +EXPORT_SYMBOL(cfg80211_send_unprot_disassoc); + +static void __cfg80211_auth_remove(struct wireless_dev *wdev, const u8 *addr) +{ + int i; + bool done = false; + + ASSERT_WDEV_LOCK(wdev); + + for (i = 0; addr && i < MAX_AUTH_BSSES; i++) { + if (wdev->authtry_bsses[i] && + memcmp(wdev->authtry_bsses[i]->pub.bssid, + addr, ETH_ALEN) == 0) { + cfg80211_unhold_bss(wdev->authtry_bsses[i]); + cfg80211_put_bss(&wdev->authtry_bsses[i]->pub); + wdev->authtry_bsses[i] = NULL; + done = true; + break; + } + } + + WARN_ON(!done); +} + +void __cfg80211_auth_canceled(struct net_device *dev, const u8 *addr) +{ + __cfg80211_auth_remove(dev->ieee80211_ptr, addr); +} +EXPORT_SYMBOL(__cfg80211_auth_canceled); + +void cfg80211_send_auth_timeout(struct net_device *dev, const u8 *addr) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + wdev_lock(wdev); + + nl80211_send_auth_timeout(rdev, dev, addr, GFP_KERNEL); + 
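+	/* If a connect attempt was still in progress, report the failure
+	 * to it before the stale authtry entry is dropped below. */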
if (wdev->sme_state == CFG80211_SME_CONNECTING) + __cfg80211_connect_result(dev, addr, NULL, 0, NULL, 0, + WLAN_STATUS_UNSPECIFIED_FAILURE, + false, NULL); + + __cfg80211_auth_remove(wdev, addr); + + wdev_unlock(wdev); +} +EXPORT_SYMBOL(cfg80211_send_auth_timeout); + +void cfg80211_send_assoc_timeout(struct net_device *dev, const u8 *addr) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + int i; + bool done = false; + + wdev_lock(wdev); + + nl80211_send_assoc_timeout(rdev, dev, addr, GFP_KERNEL); + if (wdev->sme_state == CFG80211_SME_CONNECTING) + __cfg80211_connect_result(dev, addr, NULL, 0, NULL, 0, + WLAN_STATUS_UNSPECIFIED_FAILURE, + false, NULL); + + for (i = 0; addr && i < MAX_AUTH_BSSES; i++) { + if (wdev->auth_bsses[i] && + memcmp(wdev->auth_bsses[i]->pub.bssid, + addr, ETH_ALEN) == 0) { + cfg80211_unhold_bss(wdev->auth_bsses[i]); + cfg80211_put_bss(&wdev->auth_bsses[i]->pub); + wdev->auth_bsses[i] = NULL; + done = true; + break; + } + } + + WARN_ON(!done); + + wdev_unlock(wdev); +} +EXPORT_SYMBOL(cfg80211_send_assoc_timeout); + +void cfg80211_michael_mic_failure(struct net_device *dev, const u8 *addr, + enum nl80211_key_type key_type, int key_id, + const u8 *tsc, gfp_t gfp) +{ + struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); +#ifdef CONFIG_CFG80211_WEXT + union iwreq_data wrqu; + char *buf = kmalloc(128, gfp); + + if (buf) { + sprintf(buf, "MLME-MICHAELMICFAILURE.indication(" + "keyid=%d %scast addr=%pM)", key_id, + key_type == NL80211_KEYTYPE_GROUP ? "broad" : "uni", + addr); + memset(&wrqu, 0, sizeof(wrqu)); + wrqu.data.length = strlen(buf); + wireless_send_event(dev, IWEVCUSTOM, &wrqu, buf); + kfree(buf); + } +#endif + + nl80211_michael_mic_failure(rdev, dev, addr, key_type, key_id, tsc, gfp); +} +EXPORT_SYMBOL(cfg80211_michael_mic_failure); + +/* some MLME handling for userspace SME */ +int __cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct ieee80211_channel *chan, + enum nl80211_auth_type auth_type, + const u8 *bssid, + const u8 *ssid, int ssid_len, + const u8 *ie, int ie_len, + const u8 *key, int key_len, int key_idx, + bool local_state_change) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_auth_request req; + struct cfg80211_internal_bss *bss; + int i, err, slot = -1, nfree = 0; + + ASSERT_WDEV_LOCK(wdev); + + if (auth_type == NL80211_AUTHTYPE_SHARED_KEY) + if (!key || !key_len || key_idx < 0 || key_idx > 4) + return -EINVAL; + + if (wdev->current_bss && + memcmp(bssid, wdev->current_bss->pub.bssid, ETH_ALEN) == 0) + return -EALREADY; + + for (i = 0; i < MAX_AUTH_BSSES; i++) { + if (wdev->authtry_bsses[i] && + memcmp(bssid, wdev->authtry_bsses[i]->pub.bssid, + ETH_ALEN) == 0) + return -EALREADY; + if (wdev->auth_bsses[i] && + memcmp(bssid, wdev->auth_bsses[i]->pub.bssid, + ETH_ALEN) == 0) + return -EALREADY; + } + + memset(&req, 0, sizeof(req)); + + req.local_state_change = local_state_change; + req.ie = ie; + req.ie_len = ie_len; + req.auth_type = auth_type; + req.bss = cfg80211_get_bss(&rdev->wiphy, chan, bssid, ssid, ssid_len, + WLAN_CAPABILITY_ESS, WLAN_CAPABILITY_ESS); + req.key = key; + req.key_len = key_len; + req.key_idx = key_idx; + if (!req.bss) + return -ENOENT; + + bss = bss_from_pub(req.bss); + + for (i = 0; i < MAX_AUTH_BSSES; i++) { + if (!wdev->auth_bsses[i] && !wdev->authtry_bsses[i]) { + slot = i; + nfree++; + } + } 
+ + /* we need one free slot for disassoc and one for this auth */ + if (nfree < 2) { + err = -ENOSPC; + goto out; + } + + if (local_state_change) + wdev->auth_bsses[slot] = bss; + else + wdev->authtry_bsses[slot] = bss; + cfg80211_hold_bss(bss); + + err = rdev->ops->auth(&rdev->wiphy, dev, &req); + if (err) { + if (local_state_change) + wdev->auth_bsses[slot] = NULL; + else + wdev->authtry_bsses[slot] = NULL; + cfg80211_unhold_bss(bss); + } + + out: + if (err) + cfg80211_put_bss(req.bss); + return err; +} + +int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, + struct net_device *dev, struct ieee80211_channel *chan, + enum nl80211_auth_type auth_type, const u8 *bssid, + const u8 *ssid, int ssid_len, + const u8 *ie, int ie_len, + const u8 *key, int key_len, int key_idx, + bool local_state_change) +{ + int err; + + wdev_lock(dev->ieee80211_ptr); + err = __cfg80211_mlme_auth(rdev, dev, chan, auth_type, bssid, + ssid, ssid_len, ie, ie_len, + key, key_len, key_idx, local_state_change); + wdev_unlock(dev->ieee80211_ptr); + + return err; +} + +int __cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct ieee80211_channel *chan, + const u8 *bssid, const u8 *prev_bssid, + const u8 *ssid, int ssid_len, + const u8 *ie, int ie_len, bool use_mfp, + struct cfg80211_crypto_settings *crypt) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_assoc_request req; + struct cfg80211_internal_bss *bss; + int i, err, slot = -1; + bool was_connected = false; + + ASSERT_WDEV_LOCK(wdev); + + memset(&req, 0, sizeof(req)); + + if (wdev->current_bss && prev_bssid && + memcmp(wdev->current_bss->pub.bssid, prev_bssid, ETH_ALEN) == 0) { + /* + * Trying to reassociate: Allow this to proceed and let the old + * association to be dropped when the new one is completed. 
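+	 * The SME state is temporarily set to CONNECTING here and is
+	 * restored to CONNECTED below if the reassociation attempt fails.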
+ */ + if (wdev->sme_state == CFG80211_SME_CONNECTED) { + was_connected = true; + wdev->sme_state = CFG80211_SME_CONNECTING; + } + } else if (wdev->current_bss) + return -EALREADY; + + req.ie = ie; + req.ie_len = ie_len; + memcpy(&req.crypto, crypt, sizeof(req.crypto)); + req.use_mfp = use_mfp; + req.prev_bssid = prev_bssid; + req.bss = cfg80211_get_bss(&rdev->wiphy, chan, bssid, ssid, ssid_len, + WLAN_CAPABILITY_ESS, WLAN_CAPABILITY_ESS); + if (!req.bss) { + if (was_connected) + wdev->sme_state = CFG80211_SME_CONNECTED; + return -ENOENT; + } + + bss = bss_from_pub(req.bss); + + for (i = 0; i < MAX_AUTH_BSSES; i++) { + if (bss == wdev->auth_bsses[i]) { + slot = i; + break; + } + } + + if (slot < 0) { + err = -ENOTCONN; + goto out; + } + + err = rdev->ops->assoc(&rdev->wiphy, dev, &req); + out: + if (err && was_connected) + wdev->sme_state = CFG80211_SME_CONNECTED; + /* still a reference in wdev->auth_bsses[slot] */ + cfg80211_put_bss(req.bss); + return err; +} + +int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct ieee80211_channel *chan, + const u8 *bssid, const u8 *prev_bssid, + const u8 *ssid, int ssid_len, + const u8 *ie, int ie_len, bool use_mfp, + struct cfg80211_crypto_settings *crypt) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + int err; + + wdev_lock(wdev); + err = __cfg80211_mlme_assoc(rdev, dev, chan, bssid, prev_bssid, + ssid, ssid_len, ie, ie_len, use_mfp, crypt); + wdev_unlock(wdev); + + return err; +} + +int __cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev, + struct net_device *dev, const u8 *bssid, + const u8 *ie, int ie_len, u16 reason, + bool local_state_change) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_deauth_request req; + int i; + + ASSERT_WDEV_LOCK(wdev); + + memset(&req, 0, sizeof(req)); + req.reason_code = reason; + req.local_state_change = local_state_change; + req.ie = ie; + req.ie_len = ie_len; + if (wdev->current_bss && + memcmp(wdev->current_bss->pub.bssid, bssid, ETH_ALEN) == 0) { + req.bss = &wdev->current_bss->pub; + } else for (i = 0; i < MAX_AUTH_BSSES; i++) { + if (wdev->auth_bsses[i] && + memcmp(bssid, wdev->auth_bsses[i]->pub.bssid, ETH_ALEN) == 0) { + req.bss = &wdev->auth_bsses[i]->pub; + break; + } + if (wdev->authtry_bsses[i] && + memcmp(bssid, wdev->authtry_bsses[i]->pub.bssid, ETH_ALEN) == 0) { + req.bss = &wdev->authtry_bsses[i]->pub; + break; + } + } + + if (!req.bss) + return -ENOTCONN; + + return rdev->ops->deauth(&rdev->wiphy, dev, &req, wdev); +} + +int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev, + struct net_device *dev, const u8 *bssid, + const u8 *ie, int ie_len, u16 reason, + bool local_state_change) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + int err; + + wdev_lock(wdev); + err = __cfg80211_mlme_deauth(rdev, dev, bssid, ie, ie_len, reason, + local_state_change); + wdev_unlock(wdev); + + return err; +} + +static int __cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev, + struct net_device *dev, const u8 *bssid, + const u8 *ie, int ie_len, u16 reason, + bool local_state_change) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_disassoc_request req; + + ASSERT_WDEV_LOCK(wdev); + + if (wdev->sme_state != CFG80211_SME_CONNECTED) + return -ENOTCONN; + + if (WARN_ON(!wdev->current_bss)) + return -ENOTCONN; + + memset(&req, 0, sizeof(req)); + req.reason_code = reason; + req.local_state_change = local_state_change; + req.ie = ie; + req.ie_len = ie_len; + if 
(memcmp(wdev->current_bss->pub.bssid, bssid, ETH_ALEN) == 0) + req.bss = &wdev->current_bss->pub; + else + return -ENOTCONN; + + return rdev->ops->disassoc(&rdev->wiphy, dev, &req, wdev); +} + +int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev, + struct net_device *dev, const u8 *bssid, + const u8 *ie, int ie_len, u16 reason, + bool local_state_change) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + int err; + + wdev_lock(wdev); + err = __cfg80211_mlme_disassoc(rdev, dev, bssid, ie, ie_len, reason, + local_state_change); + wdev_unlock(wdev); + + return err; +} + +void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, + struct net_device *dev) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_deauth_request req; + int i; + + ASSERT_WDEV_LOCK(wdev); + + if (!rdev->ops->deauth) + return; + + memset(&req, 0, sizeof(req)); + req.reason_code = WLAN_REASON_DEAUTH_LEAVING; + req.ie = NULL; + req.ie_len = 0; + + if (wdev->current_bss) { + req.bss = &wdev->current_bss->pub; + rdev->ops->deauth(&rdev->wiphy, dev, &req, wdev); + if (wdev->current_bss) { + cfg80211_unhold_bss(wdev->current_bss); + cfg80211_put_bss(&wdev->current_bss->pub); + wdev->current_bss = NULL; + } + } + + for (i = 0; i < MAX_AUTH_BSSES; i++) { + if (wdev->auth_bsses[i]) { + req.bss = &wdev->auth_bsses[i]->pub; + rdev->ops->deauth(&rdev->wiphy, dev, &req, wdev); + if (wdev->auth_bsses[i]) { + cfg80211_unhold_bss(wdev->auth_bsses[i]); + cfg80211_put_bss(&wdev->auth_bsses[i]->pub); + wdev->auth_bsses[i] = NULL; + } + } + if (wdev->authtry_bsses[i]) { + req.bss = &wdev->authtry_bsses[i]->pub; + rdev->ops->deauth(&rdev->wiphy, dev, &req, wdev); + if (wdev->authtry_bsses[i]) { + cfg80211_unhold_bss(wdev->authtry_bsses[i]); + cfg80211_put_bss(&wdev->authtry_bsses[i]->pub); + wdev->authtry_bsses[i] = NULL; + } + } + } +} + +void cfg80211_ready_on_channel(struct net_device *dev, u64 cookie, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + unsigned int duration, gfp_t gfp) +{ + struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + nl80211_send_remain_on_channel(rdev, dev, cookie, chan, channel_type, + duration, gfp); +} +EXPORT_SYMBOL(cfg80211_ready_on_channel); + +void cfg80211_remain_on_channel_expired(struct net_device *dev, + u64 cookie, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + gfp_t gfp) +{ + struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + nl80211_send_remain_on_channel_cancel(rdev, dev, cookie, chan, + channel_type, gfp); +} +EXPORT_SYMBOL(cfg80211_remain_on_channel_expired); + +void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr, + struct station_info *sinfo, gfp_t gfp) +{ + struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + nl80211_send_sta_event(rdev, dev, mac_addr, sinfo, gfp); +} +EXPORT_SYMBOL(cfg80211_new_sta); + +void cfg80211_del_sta(struct net_device *dev, const u8 *mac_addr, gfp_t gfp) +{ + struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + nl80211_send_sta_del_event(rdev, dev, mac_addr, gfp); +} +EXPORT_SYMBOL(cfg80211_del_sta); + +struct cfg80211_mgmt_registration { + struct list_head list; + + u32 nlpid; + + int match_len; + + __le16 frame_type; + + u8 match[]; +}; + +int cfg80211_mlme_register_mgmt(struct wireless_dev 
*wdev, u32 snd_pid, + u16 frame_type, const u8 *match_data, + int match_len) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_mgmt_registration *reg, *nreg; + int err = 0; + u16 mgmt_type; + + if (!wdev->wiphy->mgmt_stypes) + return -EOPNOTSUPP; + + if ((frame_type & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_MGMT) + return -EINVAL; + + if (frame_type & ~(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE)) + return -EINVAL; + + mgmt_type = (frame_type & IEEE80211_FCTL_STYPE) >> 4; + if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].rx & BIT(mgmt_type))) + return -EINVAL; + + nreg = kzalloc(sizeof(*reg) + match_len, GFP_KERNEL); + if (!nreg) + return -ENOMEM; + + spin_lock_bh(&wdev->mgmt_registrations_lock); + + list_for_each_entry(reg, &wdev->mgmt_registrations, list) { + int mlen = min(match_len, reg->match_len); + + if (frame_type != le16_to_cpu(reg->frame_type)) + continue; + + if (memcmp(reg->match, match_data, mlen) == 0) { + err = -EALREADY; + break; + } + } + + if (err) { + kfree(nreg); + goto out; + } + + memcpy(nreg->match, match_data, match_len); + nreg->match_len = match_len; + nreg->nlpid = snd_pid; + nreg->frame_type = cpu_to_le16(frame_type); + list_add(&nreg->list, &wdev->mgmt_registrations); + + if (rdev->ops->mgmt_frame_register) + rdev->ops->mgmt_frame_register(wiphy, wdev->netdev, + frame_type, true); + + out: + spin_unlock_bh(&wdev->mgmt_registrations_lock); + + return err; +} + +void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_mgmt_registration *reg, *tmp; + + spin_lock_bh(&wdev->mgmt_registrations_lock); + + list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { + if (reg->nlpid != nlpid) + continue; + + if (rdev->ops->mgmt_frame_register) { + u16 frame_type = le16_to_cpu(reg->frame_type); + + rdev->ops->mgmt_frame_register(wiphy, wdev->netdev, + frame_type, false); + } + + list_del(®->list); + kfree(reg); + } + + spin_unlock_bh(&wdev->mgmt_registrations_lock); +} + +void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev) +{ + struct cfg80211_mgmt_registration *reg, *tmp; + + spin_lock_bh(&wdev->mgmt_registrations_lock); + + list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { + list_del(®->list); + kfree(reg); + } + + spin_unlock_bh(&wdev->mgmt_registrations_lock); +} + +int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct ieee80211_channel *chan, bool offchan, + enum nl80211_channel_type channel_type, + bool channel_type_valid, unsigned int wait, + const u8 *buf, size_t len, u64 *cookie) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + const struct ieee80211_mgmt *mgmt; + u16 stype; + + if (!wdev->wiphy->mgmt_stypes) + return -EOPNOTSUPP; + + if (!rdev->ops->mgmt_tx) + return -EOPNOTSUPP; + + if (len < 24 + 1) + return -EINVAL; + + mgmt = (const struct ieee80211_mgmt *) buf; + + if (!ieee80211_is_mgmt(mgmt->frame_control)) + return -EINVAL; + + stype = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE; + if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].tx & BIT(stype >> 4))) + return -EINVAL; + + if (ieee80211_is_action(mgmt->frame_control) && + mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) { + int err = 0; + + wdev_lock(wdev); + + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + if 
(!wdev->current_bss) { + err = -ENOTCONN; + break; + } + + if (memcmp(wdev->current_bss->pub.bssid, + mgmt->bssid, ETH_ALEN)) { + err = -ENOTCONN; + break; + } + + /* + * check for IBSS DA must be done by driver as + * cfg80211 doesn't track the stations + */ + if (wdev->iftype == NL80211_IFTYPE_ADHOC) + break; + + /* for station, check that DA is the AP */ + if (memcmp(wdev->current_bss->pub.bssid, + mgmt->da, ETH_ALEN)) { + err = -ENOTCONN; + break; + } + break; + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_AP_VLAN: + if (memcmp(mgmt->bssid, dev->dev_addr, ETH_ALEN)) + err = -EINVAL; + break; + case NL80211_IFTYPE_MESH_POINT: + if (memcmp(mgmt->sa, mgmt->bssid, ETH_ALEN)) { + err = -EINVAL; + break; + } + /* + * check for mesh DA must be done by driver as + * cfg80211 doesn't track the stations + */ + break; + default: + err = -EOPNOTSUPP; + break; + } + wdev_unlock(wdev); + + if (err) + return err; + } + + if (memcmp(mgmt->sa, dev->dev_addr, ETH_ALEN) != 0) + return -EINVAL; + + /* Transmit the Action frame as requested by user space */ + return rdev->ops->mgmt_tx(&rdev->wiphy, dev, chan, offchan, + channel_type, channel_type_valid, + wait, buf, len, cookie); +} + +bool cfg80211_rx_mgmt(struct net_device *dev, int freq, const u8 *buf, + size_t len, gfp_t gfp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct cfg80211_mgmt_registration *reg; + const struct ieee80211_txrx_stypes *stypes = + &wiphy->mgmt_stypes[wdev->iftype]; + struct ieee80211_mgmt *mgmt = (void *)buf; + const u8 *data; + int data_len; + bool result = false; + __le16 ftype = mgmt->frame_control & + cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE); + u16 stype; + + stype = (le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE) >> 4; + + if (!(stypes->rx & BIT(stype))) + return false; + + data = buf + ieee80211_hdrlen(mgmt->frame_control); + data_len = len - ieee80211_hdrlen(mgmt->frame_control); + + spin_lock_bh(&wdev->mgmt_registrations_lock); + + list_for_each_entry(reg, &wdev->mgmt_registrations, list) { + if (reg->frame_type != ftype) + continue; + + if (reg->match_len > data_len) + continue; + + if (memcmp(reg->match, data, reg->match_len)) + continue; + + /* found match! 
*/ + + /* Indicate the received Action frame to user space */ + if (nl80211_send_mgmt(rdev, dev, reg->nlpid, freq, + buf, len, gfp)) + continue; + + result = true; + break; + } + + spin_unlock_bh(&wdev->mgmt_registrations_lock); + + return result; +} +EXPORT_SYMBOL(cfg80211_rx_mgmt); + +void cfg80211_mgmt_tx_status(struct net_device *dev, u64 cookie, + const u8 *buf, size_t len, bool ack, gfp_t gfp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + /* Indicate TX status of the Action frame to user space */ + nl80211_send_mgmt_tx_status(rdev, dev, cookie, buf, len, ack, gfp); +} +EXPORT_SYMBOL(cfg80211_mgmt_tx_status); + +void cfg80211_cqm_rssi_notify(struct net_device *dev, + enum nl80211_cqm_rssi_threshold_event rssi_event, + gfp_t gfp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + /* Indicate roaming trigger event to user space */ + nl80211_send_cqm_rssi_notify(rdev, dev, rssi_event, gfp); +} +EXPORT_SYMBOL(cfg80211_cqm_rssi_notify); + +void cfg80211_cqm_pktloss_notify(struct net_device *dev, + const u8 *peer, u32 num_packets, gfp_t gfp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + /* Indicate roaming trigger event to user space */ + nl80211_send_cqm_pktloss_notify(rdev, dev, peer, num_packets, gfp); +} +EXPORT_SYMBOL(cfg80211_cqm_pktloss_notify); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c new file mode 100644 index 00000000..4e84e222 --- /dev/null +++ b/net/wireless/nl80211.c @@ -0,0 +1,6994 @@ +/* + * This is the new netlink-based wireless configuration interface. 
+ * + * Copyright 2006-2010 Johannes Berg + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "core.h" +#include "nl80211.h" +#include "reg.h" + +static int nl80211_pre_doit(struct genl_ops *ops, struct sk_buff *skb, + struct genl_info *info); +static void nl80211_post_doit(struct genl_ops *ops, struct sk_buff *skb, + struct genl_info *info); + +/* the netlink family */ +static struct genl_family nl80211_fam = { + .id = GENL_ID_GENERATE, /* don't bother with a hardcoded ID */ + .name = "nl80211", /* have users key off the name instead */ + .hdrsize = 0, /* no private header */ + .version = 1, /* no particular meaning now */ + .maxattr = NL80211_ATTR_MAX, + .netnsok = true, + .pre_doit = nl80211_pre_doit, + .post_doit = nl80211_post_doit, +}; + +/* internal helper: get rdev and dev */ +static int get_rdev_dev_by_info_ifindex(struct genl_info *info, + struct cfg80211_registered_device **rdev, + struct net_device **dev) +{ + struct nlattr **attrs = info->attrs; + int ifindex; + + if (!attrs[NL80211_ATTR_IFINDEX]) + return -EINVAL; + + ifindex = nla_get_u32(attrs[NL80211_ATTR_IFINDEX]); + *dev = dev_get_by_index(genl_info_net(info), ifindex); + if (!*dev) + return -ENODEV; + + *rdev = cfg80211_get_dev_from_ifindex(genl_info_net(info), ifindex); + if (IS_ERR(*rdev)) { + dev_put(*dev); + return PTR_ERR(*rdev); + } + + return 0; +} + +/* policy for the attributes */ +static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { + [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, + [NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING, + .len = 20-1 }, + [NL80211_ATTR_WIPHY_TXQ_PARAMS] = { .type = NLA_NESTED }, + [NL80211_ATTR_WIPHY_FREQ] = { .type = NLA_U32 }, + [NL80211_ATTR_WIPHY_CHANNEL_TYPE] = { .type = NLA_U32 }, + [NL80211_ATTR_WIPHY_RETRY_SHORT] = { .type = NLA_U8 }, + [NL80211_ATTR_WIPHY_RETRY_LONG] = { .type = NLA_U8 }, + [NL80211_ATTR_WIPHY_FRAG_THRESHOLD] = { .type = NLA_U32 }, + [NL80211_ATTR_WIPHY_RTS_THRESHOLD] = { .type = NLA_U32 }, + [NL80211_ATTR_WIPHY_COVERAGE_CLASS] = { .type = NLA_U8 }, + + [NL80211_ATTR_IFTYPE] = { .type = NLA_U32 }, + [NL80211_ATTR_IFINDEX] = { .type = NLA_U32 }, + [NL80211_ATTR_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ-1 }, + + [NL80211_ATTR_MAC] = { .len = ETH_ALEN }, + [NL80211_ATTR_PREV_BSSID] = { .len = ETH_ALEN }, + + [NL80211_ATTR_KEY] = { .type = NLA_NESTED, }, + [NL80211_ATTR_KEY_DATA] = { .type = NLA_BINARY, + .len = WLAN_MAX_KEY_LEN }, + [NL80211_ATTR_KEY_IDX] = { .type = NLA_U8 }, + [NL80211_ATTR_KEY_CIPHER] = { .type = NLA_U32 }, + [NL80211_ATTR_KEY_DEFAULT] = { .type = NLA_FLAG }, + [NL80211_ATTR_KEY_SEQ] = { .type = NLA_BINARY, .len = 8 }, + [NL80211_ATTR_KEY_TYPE] = { .type = NLA_U32 }, + + [NL80211_ATTR_BEACON_INTERVAL] = { .type = NLA_U32 }, + [NL80211_ATTR_DTIM_PERIOD] = { .type = NLA_U32 }, + [NL80211_ATTR_BEACON_HEAD] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_DATA_LEN }, + [NL80211_ATTR_BEACON_TAIL] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_DATA_LEN }, + [NL80211_ATTR_STA_AID] = { .type = NLA_U16 }, + [NL80211_ATTR_STA_FLAGS] = { .type = NLA_NESTED }, + [NL80211_ATTR_STA_LISTEN_INTERVAL] = { .type = NLA_U16 }, + [NL80211_ATTR_STA_SUPPORTED_RATES] = { .type = NLA_BINARY, + .len = NL80211_MAX_SUPP_RATES }, + [NL80211_ATTR_STA_PLINK_ACTION] = { .type = NLA_U8 }, + [NL80211_ATTR_STA_VLAN] = { .type = NLA_U32 }, + [NL80211_ATTR_MNTR_FLAGS] = { /* NLA_NESTED can't be empty */ }, + [NL80211_ATTR_MESH_ID] = { 
.type = NLA_BINARY, + .len = IEEE80211_MAX_MESH_ID_LEN }, + [NL80211_ATTR_MPATH_NEXT_HOP] = { .type = NLA_U32 }, + + [NL80211_ATTR_REG_ALPHA2] = { .type = NLA_STRING, .len = 2 }, + [NL80211_ATTR_REG_RULES] = { .type = NLA_NESTED }, + + [NL80211_ATTR_BSS_CTS_PROT] = { .type = NLA_U8 }, + [NL80211_ATTR_BSS_SHORT_PREAMBLE] = { .type = NLA_U8 }, + [NL80211_ATTR_BSS_SHORT_SLOT_TIME] = { .type = NLA_U8 }, + [NL80211_ATTR_BSS_BASIC_RATES] = { .type = NLA_BINARY, + .len = NL80211_MAX_SUPP_RATES }, + [NL80211_ATTR_BSS_HT_OPMODE] = { .type = NLA_U16 }, + + [NL80211_ATTR_MESH_CONFIG] = { .type = NLA_NESTED }, + [NL80211_ATTR_SUPPORT_MESH_AUTH] = { .type = NLA_FLAG }, + + [NL80211_ATTR_HT_CAPABILITY] = { .len = NL80211_HT_CAPABILITY_LEN }, + + [NL80211_ATTR_MGMT_SUBTYPE] = { .type = NLA_U8 }, + [NL80211_ATTR_IE] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_DATA_LEN }, + [NL80211_ATTR_SCAN_FREQUENCIES] = { .type = NLA_NESTED }, + [NL80211_ATTR_SCAN_SSIDS] = { .type = NLA_NESTED }, + + [NL80211_ATTR_SSID] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_SSID_LEN }, + [NL80211_ATTR_AUTH_TYPE] = { .type = NLA_U32 }, + [NL80211_ATTR_REASON_CODE] = { .type = NLA_U16 }, + [NL80211_ATTR_FREQ_FIXED] = { .type = NLA_FLAG }, + [NL80211_ATTR_TIMED_OUT] = { .type = NLA_FLAG }, + [NL80211_ATTR_USE_MFP] = { .type = NLA_U32 }, + [NL80211_ATTR_STA_FLAGS2] = { + .len = sizeof(struct nl80211_sta_flag_update), + }, + [NL80211_ATTR_CONTROL_PORT] = { .type = NLA_FLAG }, + [NL80211_ATTR_CONTROL_PORT_ETHERTYPE] = { .type = NLA_U16 }, + [NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT] = { .type = NLA_FLAG }, + [NL80211_ATTR_PRIVACY] = { .type = NLA_FLAG }, + [NL80211_ATTR_CIPHER_SUITE_GROUP] = { .type = NLA_U32 }, + [NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 }, + [NL80211_ATTR_PID] = { .type = NLA_U32 }, + [NL80211_ATTR_4ADDR] = { .type = NLA_U8 }, + [NL80211_ATTR_PMKID] = { .type = NLA_BINARY, + .len = WLAN_PMKID_LEN }, + [NL80211_ATTR_DURATION] = { .type = NLA_U32 }, + [NL80211_ATTR_COOKIE] = { .type = NLA_U64 }, + [NL80211_ATTR_TX_RATES] = { .type = NLA_NESTED }, + [NL80211_ATTR_FRAME] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_DATA_LEN }, + [NL80211_ATTR_FRAME_MATCH] = { .type = NLA_BINARY, }, + [NL80211_ATTR_PS_STATE] = { .type = NLA_U32 }, + [NL80211_ATTR_CQM] = { .type = NLA_NESTED, }, + [NL80211_ATTR_LOCAL_STATE_CHANGE] = { .type = NLA_FLAG }, + [NL80211_ATTR_AP_ISOLATE] = { .type = NLA_U8 }, + [NL80211_ATTR_WIPHY_TX_POWER_SETTING] = { .type = NLA_U32 }, + [NL80211_ATTR_WIPHY_TX_POWER_LEVEL] = { .type = NLA_U32 }, + [NL80211_ATTR_FRAME_TYPE] = { .type = NLA_U16 }, + [NL80211_ATTR_WIPHY_ANTENNA_TX] = { .type = NLA_U32 }, + [NL80211_ATTR_WIPHY_ANTENNA_RX] = { .type = NLA_U32 }, + [NL80211_ATTR_MCAST_RATE] = { .type = NLA_U32 }, + [NL80211_ATTR_OFFCHANNEL_TX_OK] = { .type = NLA_FLAG }, + [NL80211_ATTR_KEY_DEFAULT_TYPES] = { .type = NLA_NESTED }, + [NL80211_ATTR_WOWLAN_TRIGGERS] = { .type = NLA_NESTED }, + [NL80211_ATTR_STA_PLINK_STATE] = { .type = NLA_U8 }, + [NL80211_ATTR_SCHED_SCAN_INTERVAL] = { .type = NLA_U32 }, + [NL80211_ATTR_REKEY_DATA] = { .type = NLA_NESTED }, + [NL80211_ATTR_SCAN_SUPP_RATES] = { .type = NLA_NESTED }, + [NL80211_ATTR_HIDDEN_SSID] = { .type = NLA_U32 }, + [NL80211_ATTR_IE_PROBE_RESP] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_DATA_LEN }, + [NL80211_ATTR_IE_ASSOC_RESP] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_DATA_LEN }, + [NL80211_ATTR_ROAM_SUPPORT] = { .type = NLA_FLAG }, + [NL80211_ATTR_SCHED_SCAN_MATCH] = { .type = NLA_NESTED }, +}; + +/* policy for the key attributes */ +static const 
struct nla_policy nl80211_key_policy[NL80211_KEY_MAX + 1] = { + [NL80211_KEY_DATA] = { .type = NLA_BINARY, .len = WLAN_MAX_KEY_LEN }, + [NL80211_KEY_IDX] = { .type = NLA_U8 }, + [NL80211_KEY_CIPHER] = { .type = NLA_U32 }, + [NL80211_KEY_SEQ] = { .type = NLA_BINARY, .len = 8 }, + [NL80211_KEY_DEFAULT] = { .type = NLA_FLAG }, + [NL80211_KEY_DEFAULT_MGMT] = { .type = NLA_FLAG }, + [NL80211_KEY_TYPE] = { .type = NLA_U32 }, + [NL80211_KEY_DEFAULT_TYPES] = { .type = NLA_NESTED }, +}; + +/* policy for the key default flags */ +static const struct nla_policy +nl80211_key_default_policy[NUM_NL80211_KEY_DEFAULT_TYPES] = { + [NL80211_KEY_DEFAULT_TYPE_UNICAST] = { .type = NLA_FLAG }, + [NL80211_KEY_DEFAULT_TYPE_MULTICAST] = { .type = NLA_FLAG }, +}; + +/* policy for WoWLAN attributes */ +static const struct nla_policy +nl80211_wowlan_policy[NUM_NL80211_WOWLAN_TRIG] = { + [NL80211_WOWLAN_TRIG_ANY] = { .type = NLA_FLAG }, + [NL80211_WOWLAN_TRIG_DISCONNECT] = { .type = NLA_FLAG }, + [NL80211_WOWLAN_TRIG_MAGIC_PKT] = { .type = NLA_FLAG }, + [NL80211_WOWLAN_TRIG_PKT_PATTERN] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy +nl80211_match_policy[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1] = { + [NL80211_ATTR_SCHED_SCAN_MATCH_SSID] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_SSID_LEN }, +}; + +/* ifidx get helper */ +static int nl80211_get_ifidx(struct netlink_callback *cb) +{ + int res; + + res = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, + nl80211_fam.attrbuf, nl80211_fam.maxattr, + nl80211_policy); + if (res) + return res; + + if (!nl80211_fam.attrbuf[NL80211_ATTR_IFINDEX]) + return -EINVAL; + + res = nla_get_u32(nl80211_fam.attrbuf[NL80211_ATTR_IFINDEX]); + if (!res) + return -EINVAL; + return res; +} + +static int nl80211_prepare_netdev_dump(struct sk_buff *skb, + struct netlink_callback *cb, + struct cfg80211_registered_device **rdev, + struct net_device **dev) +{ + int ifidx = cb->args[0]; + int err; + + if (!ifidx) + ifidx = nl80211_get_ifidx(cb); + if (ifidx < 0) + return ifidx; + + cb->args[0] = ifidx; + + rtnl_lock(); + + *dev = __dev_get_by_index(sock_net(skb->sk), ifidx); + if (!*dev) { + err = -ENODEV; + goto out_rtnl; + } + + *rdev = cfg80211_get_dev_from_ifindex(sock_net(skb->sk), ifidx); + if (IS_ERR(*rdev)) { + err = PTR_ERR(*rdev); + goto out_rtnl; + } + + return 0; + out_rtnl: + rtnl_unlock(); + return err; +} + +static void nl80211_finish_netdev_dump(struct cfg80211_registered_device *rdev) +{ + cfg80211_unlock_rdev(rdev); + rtnl_unlock(); +} + +/* IE validation */ +static bool is_valid_ie_attr(const struct nlattr *attr) +{ + const u8 *pos; + int len; + + if (!attr) + return true; + + pos = nla_data(attr); + len = nla_len(attr); + + while (len) { + u8 elemlen; + + if (len < 2) + return false; + len -= 2; + + elemlen = pos[1]; + if (elemlen > len) + return false; + + len -= elemlen; + pos += 2 + elemlen; + } + + return true; +} + +/* message building helper */ +static inline void *nl80211hdr_put(struct sk_buff *skb, u32 pid, u32 seq, + int flags, u8 cmd) +{ + /* since there is no private header just add the generic one */ + return genlmsg_put(skb, pid, seq, &nl80211_fam, flags, cmd); +} + +static int nl80211_msg_put_channel(struct sk_buff *msg, + struct ieee80211_channel *chan) +{ + NLA_PUT_U32(msg, NL80211_FREQUENCY_ATTR_FREQ, + chan->center_freq); + + if (chan->flags & IEEE80211_CHAN_DISABLED) + NLA_PUT_FLAG(msg, NL80211_FREQUENCY_ATTR_DISABLED); + if (chan->flags & IEEE80211_CHAN_PASSIVE_SCAN) + NLA_PUT_FLAG(msg, NL80211_FREQUENCY_ATTR_PASSIVE_SCAN); + if 
(chan->flags & IEEE80211_CHAN_NO_IBSS) + NLA_PUT_FLAG(msg, NL80211_FREQUENCY_ATTR_NO_IBSS); + if (chan->flags & IEEE80211_CHAN_RADAR) + NLA_PUT_FLAG(msg, NL80211_FREQUENCY_ATTR_RADAR); + + NLA_PUT_U32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, + DBM_TO_MBM(chan->max_power)); + + return 0; + + nla_put_failure: + return -ENOBUFS; +} + +/* netlink command implementations */ + +struct key_parse { + struct key_params p; + int idx; + int type; + bool def, defmgmt; + bool def_uni, def_multi; +}; + +static int nl80211_parse_key_new(struct nlattr *key, struct key_parse *k) +{ + struct nlattr *tb[NL80211_KEY_MAX + 1]; + int err = nla_parse_nested(tb, NL80211_KEY_MAX, key, + nl80211_key_policy); + if (err) + return err; + + k->def = !!tb[NL80211_KEY_DEFAULT]; + k->defmgmt = !!tb[NL80211_KEY_DEFAULT_MGMT]; + + if (k->def) { + k->def_uni = true; + k->def_multi = true; + } + if (k->defmgmt) + k->def_multi = true; + + if (tb[NL80211_KEY_IDX]) + k->idx = nla_get_u8(tb[NL80211_KEY_IDX]); + + if (tb[NL80211_KEY_DATA]) { + k->p.key = nla_data(tb[NL80211_KEY_DATA]); + k->p.key_len = nla_len(tb[NL80211_KEY_DATA]); + } + + if (tb[NL80211_KEY_SEQ]) { + k->p.seq = nla_data(tb[NL80211_KEY_SEQ]); + k->p.seq_len = nla_len(tb[NL80211_KEY_SEQ]); + } + + if (tb[NL80211_KEY_CIPHER]) + k->p.cipher = nla_get_u32(tb[NL80211_KEY_CIPHER]); + + if (tb[NL80211_KEY_TYPE]) { + k->type = nla_get_u32(tb[NL80211_KEY_TYPE]); + if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES) + return -EINVAL; + } + + if (tb[NL80211_KEY_DEFAULT_TYPES]) { + struct nlattr *kdt[NUM_NL80211_KEY_DEFAULT_TYPES]; + int err = nla_parse_nested(kdt, + NUM_NL80211_KEY_DEFAULT_TYPES - 1, + tb[NL80211_KEY_DEFAULT_TYPES], + nl80211_key_default_policy); + if (err) + return err; + + k->def_uni = kdt[NL80211_KEY_DEFAULT_TYPE_UNICAST]; + k->def_multi = kdt[NL80211_KEY_DEFAULT_TYPE_MULTICAST]; + } + + return 0; +} + +static int nl80211_parse_key_old(struct genl_info *info, struct key_parse *k) +{ + if (info->attrs[NL80211_ATTR_KEY_DATA]) { + k->p.key = nla_data(info->attrs[NL80211_ATTR_KEY_DATA]); + k->p.key_len = nla_len(info->attrs[NL80211_ATTR_KEY_DATA]); + } + + if (info->attrs[NL80211_ATTR_KEY_SEQ]) { + k->p.seq = nla_data(info->attrs[NL80211_ATTR_KEY_SEQ]); + k->p.seq_len = nla_len(info->attrs[NL80211_ATTR_KEY_SEQ]); + } + + if (info->attrs[NL80211_ATTR_KEY_IDX]) + k->idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]); + + if (info->attrs[NL80211_ATTR_KEY_CIPHER]) + k->p.cipher = nla_get_u32(info->attrs[NL80211_ATTR_KEY_CIPHER]); + + k->def = !!info->attrs[NL80211_ATTR_KEY_DEFAULT]; + k->defmgmt = !!info->attrs[NL80211_ATTR_KEY_DEFAULT_MGMT]; + + if (k->def) { + k->def_uni = true; + k->def_multi = true; + } + if (k->defmgmt) + k->def_multi = true; + + if (info->attrs[NL80211_ATTR_KEY_TYPE]) { + k->type = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]); + if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES) + return -EINVAL; + } + + if (info->attrs[NL80211_ATTR_KEY_DEFAULT_TYPES]) { + struct nlattr *kdt[NUM_NL80211_KEY_DEFAULT_TYPES]; + int err = nla_parse_nested( + kdt, NUM_NL80211_KEY_DEFAULT_TYPES - 1, + info->attrs[NL80211_ATTR_KEY_DEFAULT_TYPES], + nl80211_key_default_policy); + if (err) + return err; + + k->def_uni = kdt[NL80211_KEY_DEFAULT_TYPE_UNICAST]; + k->def_multi = kdt[NL80211_KEY_DEFAULT_TYPE_MULTICAST]; + } + + return 0; +} + +static int nl80211_parse_key(struct genl_info *info, struct key_parse *k) +{ + int err; + + memset(k, 0, sizeof(*k)); + k->idx = -1; + k->type = -1; + + if (info->attrs[NL80211_ATTR_KEY]) + err = 
nl80211_parse_key_new(info->attrs[NL80211_ATTR_KEY], k); + else + err = nl80211_parse_key_old(info, k); + + if (err) + return err; + + if (k->def && k->defmgmt) + return -EINVAL; + + if (k->defmgmt) { + if (k->def_uni || !k->def_multi) + return -EINVAL; + } + + if (k->idx != -1) { + if (k->defmgmt) { + if (k->idx < 4 || k->idx > 5) + return -EINVAL; + } else if (k->def) { + if (k->idx < 0 || k->idx > 3) + return -EINVAL; + } else { + if (k->idx < 0 || k->idx > 5) + return -EINVAL; + } + } + + return 0; +} + +static struct cfg80211_cached_keys * +nl80211_parse_connkeys(struct cfg80211_registered_device *rdev, + struct nlattr *keys) +{ + struct key_parse parse; + struct nlattr *key; + struct cfg80211_cached_keys *result; + int rem, err, def = 0; + + result = kzalloc(sizeof(*result), GFP_KERNEL); + if (!result) + return ERR_PTR(-ENOMEM); + + result->def = -1; + result->defmgmt = -1; + + nla_for_each_nested(key, keys, rem) { + memset(&parse, 0, sizeof(parse)); + parse.idx = -1; + + err = nl80211_parse_key_new(key, &parse); + if (err) + goto error; + err = -EINVAL; + if (!parse.p.key) + goto error; + if (parse.idx < 0 || parse.idx > 4) + goto error; + if (parse.def) { + if (def) + goto error; + def = 1; + result->def = parse.idx; + if (!parse.def_uni || !parse.def_multi) + goto error; + } else if (parse.defmgmt) + goto error; + err = cfg80211_validate_key_settings(rdev, &parse.p, + parse.idx, false, NULL); + if (err) + goto error; + result->params[parse.idx].cipher = parse.p.cipher; + result->params[parse.idx].key_len = parse.p.key_len; + result->params[parse.idx].key = result->data[parse.idx]; + memcpy(result->data[parse.idx], parse.p.key, parse.p.key_len); + } + + return result; + error: + kfree(result); + return ERR_PTR(err); +} + +static int nl80211_key_allowed(struct wireless_dev *wdev) +{ + ASSERT_WDEV_LOCK(wdev); + + switch (wdev->iftype) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_MESH_POINT: + break; + case NL80211_IFTYPE_ADHOC: + if (!wdev->current_bss) + return -ENOLINK; + break; + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + if (wdev->sme_state != CFG80211_SME_CONNECTED) + return -ENOLINK; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int nl80211_put_iftypes(struct sk_buff *msg, u32 attr, u16 ifmodes) +{ + struct nlattr *nl_modes = nla_nest_start(msg, attr); + int i; + + if (!nl_modes) + goto nla_put_failure; + + i = 0; + while (ifmodes) { + if (ifmodes & 1) + NLA_PUT_FLAG(msg, i); + ifmodes >>= 1; + i++; + } + + nla_nest_end(msg, nl_modes); + return 0; + +nla_put_failure: + return -ENOBUFS; +} + +static int nl80211_put_iface_combinations(struct wiphy *wiphy, + struct sk_buff *msg) +{ + struct nlattr *nl_combis; + int i, j; + + nl_combis = nla_nest_start(msg, + NL80211_ATTR_INTERFACE_COMBINATIONS); + if (!nl_combis) + goto nla_put_failure; + + for (i = 0; i < wiphy->n_iface_combinations; i++) { + const struct ieee80211_iface_combination *c; + struct nlattr *nl_combi, *nl_limits; + + c = &wiphy->iface_combinations[i]; + + nl_combi = nla_nest_start(msg, i + 1); + if (!nl_combi) + goto nla_put_failure; + + nl_limits = nla_nest_start(msg, NL80211_IFACE_COMB_LIMITS); + if (!nl_limits) + goto nla_put_failure; + + for (j = 0; j < c->n_limits; j++) { + struct nlattr *nl_limit; + + nl_limit = nla_nest_start(msg, j + 1); + if (!nl_limit) + goto nla_put_failure; + NLA_PUT_U32(msg, NL80211_IFACE_LIMIT_MAX, + c->limits[j].max); + if (nl80211_put_iftypes(msg, NL80211_IFACE_LIMIT_TYPES, + 
c->limits[j].types)) + goto nla_put_failure; + nla_nest_end(msg, nl_limit); + } + + nla_nest_end(msg, nl_limits); + + if (c->beacon_int_infra_match) + NLA_PUT_FLAG(msg, + NL80211_IFACE_COMB_STA_AP_BI_MATCH); + NLA_PUT_U32(msg, NL80211_IFACE_COMB_NUM_CHANNELS, + c->num_different_channels); + NLA_PUT_U32(msg, NL80211_IFACE_COMB_MAXNUM, + c->max_interfaces); + + nla_nest_end(msg, nl_combi); + } + + nla_nest_end(msg, nl_combis); + + return 0; +nla_put_failure: + return -ENOBUFS; +} + +static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, + struct cfg80211_registered_device *dev) +{ + void *hdr; + struct nlattr *nl_bands, *nl_band; + struct nlattr *nl_freqs, *nl_freq; + struct nlattr *nl_rates, *nl_rate; + struct nlattr *nl_cmds; + enum ieee80211_band band; + struct ieee80211_channel *chan; + struct ieee80211_rate *rate; + int i; + const struct ieee80211_txrx_stypes *mgmt_stypes = + dev->wiphy.mgmt_stypes; + + hdr = nl80211hdr_put(msg, pid, seq, flags, NL80211_CMD_NEW_WIPHY); + if (!hdr) + return -1; + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, dev->wiphy_idx); + NLA_PUT_STRING(msg, NL80211_ATTR_WIPHY_NAME, wiphy_name(&dev->wiphy)); + + NLA_PUT_U32(msg, NL80211_ATTR_GENERATION, + cfg80211_rdev_list_generation); + + NLA_PUT_U8(msg, NL80211_ATTR_WIPHY_RETRY_SHORT, + dev->wiphy.retry_short); + NLA_PUT_U8(msg, NL80211_ATTR_WIPHY_RETRY_LONG, + dev->wiphy.retry_long); + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_FRAG_THRESHOLD, + dev->wiphy.frag_threshold); + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_RTS_THRESHOLD, + dev->wiphy.rts_threshold); + NLA_PUT_U8(msg, NL80211_ATTR_WIPHY_COVERAGE_CLASS, + dev->wiphy.coverage_class); + NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_SCAN_SSIDS, + dev->wiphy.max_scan_ssids); + NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_SCHED_SCAN_SSIDS, + dev->wiphy.max_sched_scan_ssids); + NLA_PUT_U16(msg, NL80211_ATTR_MAX_SCAN_IE_LEN, + dev->wiphy.max_scan_ie_len); + NLA_PUT_U16(msg, NL80211_ATTR_MAX_SCHED_SCAN_IE_LEN, + dev->wiphy.max_sched_scan_ie_len); + NLA_PUT_U8(msg, NL80211_ATTR_MAX_MATCH_SETS, + dev->wiphy.max_match_sets); + + if (dev->wiphy.flags & WIPHY_FLAG_IBSS_RSN) + NLA_PUT_FLAG(msg, NL80211_ATTR_SUPPORT_IBSS_RSN); + if (dev->wiphy.flags & WIPHY_FLAG_MESH_AUTH) + NLA_PUT_FLAG(msg, NL80211_ATTR_SUPPORT_MESH_AUTH); + + NLA_PUT(msg, NL80211_ATTR_CIPHER_SUITES, + sizeof(u32) * dev->wiphy.n_cipher_suites, + dev->wiphy.cipher_suites); + + NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_PMKIDS, + dev->wiphy.max_num_pmkids); + + if (dev->wiphy.flags & WIPHY_FLAG_CONTROL_PORT_PROTOCOL) + NLA_PUT_FLAG(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE); + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_ANTENNA_AVAIL_TX, + dev->wiphy.available_antennas_tx); + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_ANTENNA_AVAIL_RX, + dev->wiphy.available_antennas_rx); + + if ((dev->wiphy.available_antennas_tx || + dev->wiphy.available_antennas_rx) && dev->ops->get_antenna) { + u32 tx_ant = 0, rx_ant = 0; + int res; + res = dev->ops->get_antenna(&dev->wiphy, &tx_ant, &rx_ant); + if (!res) { + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_ANTENNA_TX, tx_ant); + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_ANTENNA_RX, rx_ant); + } + } + + if (nl80211_put_iftypes(msg, NL80211_ATTR_SUPPORTED_IFTYPES, + dev->wiphy.interface_modes)) + goto nla_put_failure; + + nl_bands = nla_nest_start(msg, NL80211_ATTR_WIPHY_BANDS); + if (!nl_bands) + goto nla_put_failure; + + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + if (!dev->wiphy.bands[band]) + continue; + + nl_band = nla_nest_start(msg, band); + if (!nl_band) + goto nla_put_failure; + + /* add HT info 
*/ + if (dev->wiphy.bands[band]->ht_cap.ht_supported) { + NLA_PUT(msg, NL80211_BAND_ATTR_HT_MCS_SET, + sizeof(dev->wiphy.bands[band]->ht_cap.mcs), + &dev->wiphy.bands[band]->ht_cap.mcs); + NLA_PUT_U16(msg, NL80211_BAND_ATTR_HT_CAPA, + dev->wiphy.bands[band]->ht_cap.cap); + NLA_PUT_U8(msg, NL80211_BAND_ATTR_HT_AMPDU_FACTOR, + dev->wiphy.bands[band]->ht_cap.ampdu_factor); + NLA_PUT_U8(msg, NL80211_BAND_ATTR_HT_AMPDU_DENSITY, + dev->wiphy.bands[band]->ht_cap.ampdu_density); + } + + /* add frequencies */ + nl_freqs = nla_nest_start(msg, NL80211_BAND_ATTR_FREQS); + if (!nl_freqs) + goto nla_put_failure; + + for (i = 0; i < dev->wiphy.bands[band]->n_channels; i++) { + nl_freq = nla_nest_start(msg, i); + if (!nl_freq) + goto nla_put_failure; + + chan = &dev->wiphy.bands[band]->channels[i]; + + if (nl80211_msg_put_channel(msg, chan)) + goto nla_put_failure; + + nla_nest_end(msg, nl_freq); + } + + nla_nest_end(msg, nl_freqs); + + /* add bitrates */ + nl_rates = nla_nest_start(msg, NL80211_BAND_ATTR_RATES); + if (!nl_rates) + goto nla_put_failure; + + for (i = 0; i < dev->wiphy.bands[band]->n_bitrates; i++) { + nl_rate = nla_nest_start(msg, i); + if (!nl_rate) + goto nla_put_failure; + + rate = &dev->wiphy.bands[band]->bitrates[i]; + NLA_PUT_U32(msg, NL80211_BITRATE_ATTR_RATE, + rate->bitrate); + if (rate->flags & IEEE80211_RATE_SHORT_PREAMBLE) + NLA_PUT_FLAG(msg, + NL80211_BITRATE_ATTR_2GHZ_SHORTPREAMBLE); + + nla_nest_end(msg, nl_rate); + } + + nla_nest_end(msg, nl_rates); + + nla_nest_end(msg, nl_band); + } + nla_nest_end(msg, nl_bands); + + nl_cmds = nla_nest_start(msg, NL80211_ATTR_SUPPORTED_COMMANDS); + if (!nl_cmds) + goto nla_put_failure; + + i = 0; +#define CMD(op, n) \ + do { \ + if (dev->ops->op) { \ + i++; \ + NLA_PUT_U32(msg, i, NL80211_CMD_ ## n); \ + } \ + } while (0) + + CMD(add_virtual_intf, NEW_INTERFACE); + CMD(change_virtual_intf, SET_INTERFACE); + CMD(add_key, NEW_KEY); + CMD(add_beacon, NEW_BEACON); + CMD(add_station, NEW_STATION); + CMD(add_mpath, NEW_MPATH); + CMD(update_mesh_config, SET_MESH_CONFIG); + CMD(change_bss, SET_BSS); + CMD(auth, AUTHENTICATE); + CMD(assoc, ASSOCIATE); + CMD(deauth, DEAUTHENTICATE); + CMD(disassoc, DISASSOCIATE); + CMD(join_ibss, JOIN_IBSS); + CMD(join_mesh, JOIN_MESH); + CMD(set_pmksa, SET_PMKSA); + CMD(del_pmksa, DEL_PMKSA); + CMD(flush_pmksa, FLUSH_PMKSA); + CMD(remain_on_channel, REMAIN_ON_CHANNEL); + CMD(set_bitrate_mask, SET_TX_BITRATE_MASK); + CMD(mgmt_tx, FRAME); + CMD(mgmt_tx_cancel_wait, FRAME_WAIT_CANCEL); + if (dev->wiphy.flags & WIPHY_FLAG_NETNS_OK) { + i++; + NLA_PUT_U32(msg, i, NL80211_CMD_SET_WIPHY_NETNS); + } + CMD(set_channel, SET_CHANNEL); + CMD(set_wds_peer, SET_WDS_PEER); + if (dev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) + CMD(sched_scan_start, START_SCHED_SCAN); + +#undef CMD + + if (dev->ops->connect || dev->ops->auth) { + i++; + NLA_PUT_U32(msg, i, NL80211_CMD_CONNECT); + } + + if (dev->ops->disconnect || dev->ops->deauth) { + i++; + NLA_PUT_U32(msg, i, NL80211_CMD_DISCONNECT); + } + + nla_nest_end(msg, nl_cmds); + + if (dev->ops->remain_on_channel) + NLA_PUT_U32(msg, NL80211_ATTR_MAX_REMAIN_ON_CHANNEL_DURATION, + dev->wiphy.max_remain_on_channel_duration); + + /* for now at least assume all drivers have it */ + if (dev->ops->mgmt_tx) + NLA_PUT_FLAG(msg, NL80211_ATTR_OFFCHANNEL_TX_OK); + + if (mgmt_stypes) { + u16 stypes; + struct nlattr *nl_ftypes, *nl_ifs; + enum nl80211_iftype ift; + + nl_ifs = nla_nest_start(msg, NL80211_ATTR_TX_FRAME_TYPES); + if (!nl_ifs) + goto nla_put_failure; + + for (ift = 0; ift < 
NUM_NL80211_IFTYPES; ift++) { + nl_ftypes = nla_nest_start(msg, ift); + if (!nl_ftypes) + goto nla_put_failure; + i = 0; + stypes = mgmt_stypes[ift].tx; + while (stypes) { + if (stypes & 1) + NLA_PUT_U16(msg, NL80211_ATTR_FRAME_TYPE, + (i << 4) | IEEE80211_FTYPE_MGMT); + stypes >>= 1; + i++; + } + nla_nest_end(msg, nl_ftypes); + } + + nla_nest_end(msg, nl_ifs); + + nl_ifs = nla_nest_start(msg, NL80211_ATTR_RX_FRAME_TYPES); + if (!nl_ifs) + goto nla_put_failure; + + for (ift = 0; ift < NUM_NL80211_IFTYPES; ift++) { + nl_ftypes = nla_nest_start(msg, ift); + if (!nl_ftypes) + goto nla_put_failure; + i = 0; + stypes = mgmt_stypes[ift].rx; + while (stypes) { + if (stypes & 1) + NLA_PUT_U16(msg, NL80211_ATTR_FRAME_TYPE, + (i << 4) | IEEE80211_FTYPE_MGMT); + stypes >>= 1; + i++; + } + nla_nest_end(msg, nl_ftypes); + } + nla_nest_end(msg, nl_ifs); + } + + if (dev->wiphy.wowlan.flags || dev->wiphy.wowlan.n_patterns) { + struct nlattr *nl_wowlan; + + nl_wowlan = nla_nest_start(msg, + NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED); + if (!nl_wowlan) + goto nla_put_failure; + + if (dev->wiphy.wowlan.flags & WIPHY_WOWLAN_ANY) + NLA_PUT_FLAG(msg, NL80211_WOWLAN_TRIG_ANY); + if (dev->wiphy.wowlan.flags & WIPHY_WOWLAN_DISCONNECT) + NLA_PUT_FLAG(msg, NL80211_WOWLAN_TRIG_DISCONNECT); + if (dev->wiphy.wowlan.flags & WIPHY_WOWLAN_MAGIC_PKT) + NLA_PUT_FLAG(msg, NL80211_WOWLAN_TRIG_MAGIC_PKT); + if (dev->wiphy.wowlan.n_patterns) { + struct nl80211_wowlan_pattern_support pat = { + .max_patterns = dev->wiphy.wowlan.n_patterns, + .min_pattern_len = + dev->wiphy.wowlan.pattern_min_len, + .max_pattern_len = + dev->wiphy.wowlan.pattern_max_len, + }; + NLA_PUT(msg, NL80211_WOWLAN_TRIG_PKT_PATTERN, + sizeof(pat), &pat); + } + + nla_nest_end(msg, nl_wowlan); + } + + if (nl80211_put_iftypes(msg, NL80211_ATTR_SOFTWARE_IFTYPES, + dev->wiphy.software_iftypes)) + goto nla_put_failure; + + if (nl80211_put_iface_combinations(&dev->wiphy, msg)) + goto nla_put_failure; + + return genlmsg_end(msg, hdr); + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx = 0; + int start = cb->args[0]; + struct cfg80211_registered_device *dev; + + mutex_lock(&cfg80211_mutex); + list_for_each_entry(dev, &cfg80211_rdev_list, list) { + if (!net_eq(wiphy_net(&dev->wiphy), sock_net(skb->sk))) + continue; + if (++idx <= start) + continue; + if (nl80211_send_wiphy(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + dev) < 0) { + idx--; + break; + } + } + mutex_unlock(&cfg80211_mutex); + + cb->args[0] = idx; + + return skb->len; +} + +static int nl80211_get_wiphy(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + struct cfg80211_registered_device *dev = info->user_ptr[0]; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + if (nl80211_send_wiphy(msg, info->snd_pid, info->snd_seq, 0, dev) < 0) { + nlmsg_free(msg); + return -ENOBUFS; + } + + return genlmsg_reply(msg, info); +} + +static const struct nla_policy txq_params_policy[NL80211_TXQ_ATTR_MAX + 1] = { + [NL80211_TXQ_ATTR_QUEUE] = { .type = NLA_U8 }, + [NL80211_TXQ_ATTR_TXOP] = { .type = NLA_U16 }, + [NL80211_TXQ_ATTR_CWMIN] = { .type = NLA_U16 }, + [NL80211_TXQ_ATTR_CWMAX] = { .type = NLA_U16 }, + [NL80211_TXQ_ATTR_AIFS] = { .type = NLA_U8 }, +}; + +static int parse_txq_params(struct nlattr *tb[], + struct ieee80211_txq_params *txq_params) +{ + if (!tb[NL80211_TXQ_ATTR_QUEUE] || !tb[NL80211_TXQ_ATTR_TXOP] || + 
!tb[NL80211_TXQ_ATTR_CWMIN] || !tb[NL80211_TXQ_ATTR_CWMAX] || + !tb[NL80211_TXQ_ATTR_AIFS]) + return -EINVAL; + + txq_params->queue = nla_get_u8(tb[NL80211_TXQ_ATTR_QUEUE]); + txq_params->txop = nla_get_u16(tb[NL80211_TXQ_ATTR_TXOP]); + txq_params->cwmin = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMIN]); + txq_params->cwmax = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMAX]); + txq_params->aifs = nla_get_u8(tb[NL80211_TXQ_ATTR_AIFS]); + + return 0; +} + +static bool nl80211_can_set_dev_channel(struct wireless_dev *wdev) +{ + /* + * You can only set the channel explicitly for AP, mesh + * and WDS type interfaces; all others have their channel + * managed via their respective "establish a connection" + * command (connect, join, ...) + * + * Monitors are special as they are normally slaved to + * whatever else is going on, so they behave as though + * you tried setting the wiphy channel itself. + */ + return !wdev || + wdev->iftype == NL80211_IFTYPE_AP || + wdev->iftype == NL80211_IFTYPE_WDS || + wdev->iftype == NL80211_IFTYPE_MESH_POINT || + wdev->iftype == NL80211_IFTYPE_MONITOR || + wdev->iftype == NL80211_IFTYPE_P2P_GO; +} + +static int __nl80211_set_channel(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + struct genl_info *info) +{ + enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT; + u32 freq; + int result; + + if (!info->attrs[NL80211_ATTR_WIPHY_FREQ]) + return -EINVAL; + + if (!nl80211_can_set_dev_channel(wdev)) + return -EOPNOTSUPP; + + if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) { + channel_type = nla_get_u32(info->attrs[ + NL80211_ATTR_WIPHY_CHANNEL_TYPE]); + if (channel_type != NL80211_CHAN_NO_HT && + channel_type != NL80211_CHAN_HT20 && + channel_type != NL80211_CHAN_HT40PLUS && + channel_type != NL80211_CHAN_HT40MINUS) + return -EINVAL; + } + + freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]); + + mutex_lock(&rdev->devlist_mtx); + if (wdev) { + wdev_lock(wdev); + result = cfg80211_set_freq(rdev, wdev, freq, channel_type); + wdev_unlock(wdev); + } else { + result = cfg80211_set_freq(rdev, NULL, freq, channel_type); + } + mutex_unlock(&rdev->devlist_mtx); + + return result; +} + +static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *netdev = info->user_ptr[1]; + + return __nl80211_set_channel(rdev, netdev->ieee80211_ptr, info); +} + +static int nl80211_set_wds_peer(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + const u8 *bssid; + + if (!info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + if (netif_running(dev)) + return -EBUSY; + + if (!rdev->ops->set_wds_peer) + return -EOPNOTSUPP; + + if (wdev->iftype != NL80211_IFTYPE_WDS) + return -EOPNOTSUPP; + + bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); + return rdev->ops->set_wds_peer(wdev->wiphy, dev, bssid); +} + + +static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev; + struct net_device *netdev = NULL; + struct wireless_dev *wdev; + int result = 0, rem_txq_params = 0; + struct nlattr *nl_txq_params; + u32 changed; + u8 retry_short = 0, retry_long = 0; + u32 frag_threshold = 0, rts_threshold = 0; + u8 coverage_class = 0; + + /* + * Try to find the wiphy and netdev. 
Normally this + * function shouldn't need the netdev, but this is + * done for backward compatibility -- previously + * setting the channel was done per wiphy, but now + * it is per netdev. Previous userland like hostapd + * also passed a netdev to set_wiphy, so that it is + * possible to let that go to the right netdev! + */ + mutex_lock(&cfg80211_mutex); + + if (info->attrs[NL80211_ATTR_IFINDEX]) { + int ifindex = nla_get_u32(info->attrs[NL80211_ATTR_IFINDEX]); + + netdev = dev_get_by_index(genl_info_net(info), ifindex); + if (netdev && netdev->ieee80211_ptr) { + rdev = wiphy_to_dev(netdev->ieee80211_ptr->wiphy); + mutex_lock(&rdev->mtx); + } else + netdev = NULL; + } + + if (!netdev) { + rdev = __cfg80211_rdev_from_info(info); + if (IS_ERR(rdev)) { + mutex_unlock(&cfg80211_mutex); + return PTR_ERR(rdev); + } + wdev = NULL; + netdev = NULL; + result = 0; + + mutex_lock(&rdev->mtx); + } else if (netif_running(netdev) && + nl80211_can_set_dev_channel(netdev->ieee80211_ptr)) + wdev = netdev->ieee80211_ptr; + else + wdev = NULL; + + /* + * end workaround code, by now the rdev is available + * and locked, and wdev may or may not be NULL. + */ + + if (info->attrs[NL80211_ATTR_WIPHY_NAME]) + result = cfg80211_dev_rename( + rdev, nla_data(info->attrs[NL80211_ATTR_WIPHY_NAME])); + + mutex_unlock(&cfg80211_mutex); + + if (result) + goto bad_res; + + if (info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS]) { + struct ieee80211_txq_params txq_params; + struct nlattr *tb[NL80211_TXQ_ATTR_MAX + 1]; + + if (!rdev->ops->set_txq_params) { + result = -EOPNOTSUPP; + goto bad_res; + } + + if (!netif_running(netdev)) { + result = -ENETDOWN; + goto bad_res; + } + + nla_for_each_nested(nl_txq_params, + info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS], + rem_txq_params) { + nla_parse(tb, NL80211_TXQ_ATTR_MAX, + nla_data(nl_txq_params), + nla_len(nl_txq_params), + txq_params_policy); + result = parse_txq_params(tb, &txq_params); + if (result) + goto bad_res; + + result = rdev->ops->set_txq_params(&rdev->wiphy, + &txq_params); + if (result) + goto bad_res; + } + } + + if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) { + result = __nl80211_set_channel(rdev, wdev, info); + if (result) + goto bad_res; + } + + if (info->attrs[NL80211_ATTR_WIPHY_TX_POWER_SETTING]) { + enum nl80211_tx_power_setting type; + int idx, mbm = 0; + + if (!rdev->ops->set_tx_power) { + result = -EOPNOTSUPP; + goto bad_res; + } + + idx = NL80211_ATTR_WIPHY_TX_POWER_SETTING; + type = nla_get_u32(info->attrs[idx]); + + if (!info->attrs[NL80211_ATTR_WIPHY_TX_POWER_LEVEL] && + (type != NL80211_TX_POWER_AUTOMATIC)) { + result = -EINVAL; + goto bad_res; + } + + if (type != NL80211_TX_POWER_AUTOMATIC) { + idx = NL80211_ATTR_WIPHY_TX_POWER_LEVEL; + mbm = nla_get_u32(info->attrs[idx]); + } + + result = rdev->ops->set_tx_power(&rdev->wiphy, type, mbm); + if (result) + goto bad_res; + } + + if (info->attrs[NL80211_ATTR_WIPHY_ANTENNA_TX] && + info->attrs[NL80211_ATTR_WIPHY_ANTENNA_RX]) { + u32 tx_ant, rx_ant; + if ((!rdev->wiphy.available_antennas_tx && + !rdev->wiphy.available_antennas_rx) || + !rdev->ops->set_antenna) { + result = -EOPNOTSUPP; + goto bad_res; + } + + tx_ant = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_ANTENNA_TX]); + rx_ant = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_ANTENNA_RX]); + + /* reject antenna configurations which don't match the + * available antenna masks, except for the "all" mask */ + if ((~tx_ant && (tx_ant & ~rdev->wiphy.available_antennas_tx)) || + (~rx_ant && (rx_ant & ~rdev->wiphy.available_antennas_rx))) { + result = -EINVAL; + goto 
bad_res; + } + + tx_ant = tx_ant & rdev->wiphy.available_antennas_tx; + rx_ant = rx_ant & rdev->wiphy.available_antennas_rx; + + result = rdev->ops->set_antenna(&rdev->wiphy, tx_ant, rx_ant); + if (result) + goto bad_res; + } + + changed = 0; + + if (info->attrs[NL80211_ATTR_WIPHY_RETRY_SHORT]) { + retry_short = nla_get_u8( + info->attrs[NL80211_ATTR_WIPHY_RETRY_SHORT]); + if (retry_short == 0) { + result = -EINVAL; + goto bad_res; + } + changed |= WIPHY_PARAM_RETRY_SHORT; + } + + if (info->attrs[NL80211_ATTR_WIPHY_RETRY_LONG]) { + retry_long = nla_get_u8( + info->attrs[NL80211_ATTR_WIPHY_RETRY_LONG]); + if (retry_long == 0) { + result = -EINVAL; + goto bad_res; + } + changed |= WIPHY_PARAM_RETRY_LONG; + } + + if (info->attrs[NL80211_ATTR_WIPHY_FRAG_THRESHOLD]) { + frag_threshold = nla_get_u32( + info->attrs[NL80211_ATTR_WIPHY_FRAG_THRESHOLD]); + if (frag_threshold < 256) { + result = -EINVAL; + goto bad_res; + } + if (frag_threshold != (u32) -1) { + /* + * Fragments (apart from the last one) are required to + * have even length. Make the fragmentation code + * simpler by stripping LSB should someone try to use + * odd threshold value. + */ + frag_threshold &= ~0x1; + } + changed |= WIPHY_PARAM_FRAG_THRESHOLD; + } + + if (info->attrs[NL80211_ATTR_WIPHY_RTS_THRESHOLD]) { + rts_threshold = nla_get_u32( + info->attrs[NL80211_ATTR_WIPHY_RTS_THRESHOLD]); + changed |= WIPHY_PARAM_RTS_THRESHOLD; + } + + if (info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]) { + coverage_class = nla_get_u8( + info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]); + changed |= WIPHY_PARAM_COVERAGE_CLASS; + } + + if (changed) { + u8 old_retry_short, old_retry_long; + u32 old_frag_threshold, old_rts_threshold; + u8 old_coverage_class; + + if (!rdev->ops->set_wiphy_params) { + result = -EOPNOTSUPP; + goto bad_res; + } + + old_retry_short = rdev->wiphy.retry_short; + old_retry_long = rdev->wiphy.retry_long; + old_frag_threshold = rdev->wiphy.frag_threshold; + old_rts_threshold = rdev->wiphy.rts_threshold; + old_coverage_class = rdev->wiphy.coverage_class; + + if (changed & WIPHY_PARAM_RETRY_SHORT) + rdev->wiphy.retry_short = retry_short; + if (changed & WIPHY_PARAM_RETRY_LONG) + rdev->wiphy.retry_long = retry_long; + if (changed & WIPHY_PARAM_FRAG_THRESHOLD) + rdev->wiphy.frag_threshold = frag_threshold; + if (changed & WIPHY_PARAM_RTS_THRESHOLD) + rdev->wiphy.rts_threshold = rts_threshold; + if (changed & WIPHY_PARAM_COVERAGE_CLASS) + rdev->wiphy.coverage_class = coverage_class; + + result = rdev->ops->set_wiphy_params(&rdev->wiphy, changed); + if (result) { + rdev->wiphy.retry_short = old_retry_short; + rdev->wiphy.retry_long = old_retry_long; + rdev->wiphy.frag_threshold = old_frag_threshold; + rdev->wiphy.rts_threshold = old_rts_threshold; + rdev->wiphy.coverage_class = old_coverage_class; + } + } + + bad_res: + mutex_unlock(&rdev->mtx); + if (netdev) + dev_put(netdev); + return result; +} + + +static int nl80211_send_iface(struct sk_buff *msg, u32 pid, u32 seq, int flags, + struct cfg80211_registered_device *rdev, + struct net_device *dev) +{ + void *hdr; + + hdr = nl80211hdr_put(msg, pid, seq, flags, NL80211_CMD_NEW_INTERFACE); + if (!hdr) + return -1; + + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, dev->ifindex); + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + NLA_PUT_STRING(msg, NL80211_ATTR_IFNAME, dev->name); + NLA_PUT_U32(msg, NL80211_ATTR_IFTYPE, dev->ieee80211_ptr->iftype); + + NLA_PUT_U32(msg, NL80211_ATTR_GENERATION, + rdev->devlist_generation ^ + (cfg80211_rdev_list_generation << 2)); + + return 
genlmsg_end(msg, hdr); + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *cb) +{ + int wp_idx = 0; + int if_idx = 0; + int wp_start = cb->args[0]; + int if_start = cb->args[1]; + struct cfg80211_registered_device *rdev; + struct wireless_dev *wdev; + + mutex_lock(&cfg80211_mutex); + list_for_each_entry(rdev, &cfg80211_rdev_list, list) { + if (!net_eq(wiphy_net(&rdev->wiphy), sock_net(skb->sk))) + continue; + if (wp_idx < wp_start) { + wp_idx++; + continue; + } + if_idx = 0; + + mutex_lock(&rdev->devlist_mtx); + list_for_each_entry(wdev, &rdev->netdev_list, list) { + if (if_idx < if_start) { + if_idx++; + continue; + } + if (nl80211_send_iface(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + rdev, wdev->netdev) < 0) { + mutex_unlock(&rdev->devlist_mtx); + goto out; + } + if_idx++; + } + mutex_unlock(&rdev->devlist_mtx); + + wp_idx++; + } + out: + mutex_unlock(&cfg80211_mutex); + + cb->args[0] = wp_idx; + cb->args[1] = if_idx; + + return skb->len; +} + +static int nl80211_get_interface(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + struct cfg80211_registered_device *dev = info->user_ptr[0]; + struct net_device *netdev = info->user_ptr[1]; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + if (nl80211_send_iface(msg, info->snd_pid, info->snd_seq, 0, + dev, netdev) < 0) { + nlmsg_free(msg); + return -ENOBUFS; + } + + return genlmsg_reply(msg, info); +} + +static const struct nla_policy mntr_flags_policy[NL80211_MNTR_FLAG_MAX + 1] = { + [NL80211_MNTR_FLAG_FCSFAIL] = { .type = NLA_FLAG }, + [NL80211_MNTR_FLAG_PLCPFAIL] = { .type = NLA_FLAG }, + [NL80211_MNTR_FLAG_CONTROL] = { .type = NLA_FLAG }, + [NL80211_MNTR_FLAG_OTHER_BSS] = { .type = NLA_FLAG }, + [NL80211_MNTR_FLAG_COOK_FRAMES] = { .type = NLA_FLAG }, +}; + +static int parse_monitor_flags(struct nlattr *nla, u32 *mntrflags) +{ + struct nlattr *flags[NL80211_MNTR_FLAG_MAX + 1]; + int flag; + + *mntrflags = 0; + + if (!nla) + return -EINVAL; + + if (nla_parse_nested(flags, NL80211_MNTR_FLAG_MAX, + nla, mntr_flags_policy)) + return -EINVAL; + + for (flag = 1; flag <= NL80211_MNTR_FLAG_MAX; flag++) + if (flags[flag]) + *mntrflags |= (1<<flag); + + return 0; +} + +static int nl80211_valid_4addr(struct cfg80211_registered_device *rdev, + struct net_device *netdev, u8 use_4addr, + enum nl80211_iftype iftype) +{ + if (!use_4addr) { + if (netdev && (netdev->priv_flags & IFF_BRIDGE_PORT)) + return -EBUSY; + return 0; + } + + switch (iftype) { + case NL80211_IFTYPE_AP_VLAN: + if (rdev->wiphy.flags & WIPHY_FLAG_4ADDR_AP) + return 0; + break; + case NL80211_IFTYPE_STATION: + if (rdev->wiphy.flags & WIPHY_FLAG_4ADDR_STATION) + return 0; + break; + default: + break; + } + + return -EOPNOTSUPP; +} + +static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct vif_params params; + int err; + enum nl80211_iftype otype, ntype; + struct net_device *dev = info->user_ptr[1]; + u32 _flags, *flags = NULL; + bool change = false; + + memset(&params, 0, sizeof(params)); + + otype = ntype = dev->ieee80211_ptr->iftype; + + if (info->attrs[NL80211_ATTR_IFTYPE]) { + ntype = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]); + if (otype != ntype) + change = true; + if (ntype > NL80211_IFTYPE_MAX) + return -EINVAL; + } + + if (info->attrs[NL80211_ATTR_MESH_ID]) { + struct wireless_dev *wdev = dev->ieee80211_ptr; + + if (ntype != NL80211_IFTYPE_MESH_POINT) + return -EINVAL; + if (netif_running(dev)) + return -EBUSY; + + wdev_lock(wdev); + BUILD_BUG_ON(IEEE80211_MAX_SSID_LEN != + IEEE80211_MAX_MESH_ID_LEN); +
wdev->mesh_id_up_len = + nla_len(info->attrs[NL80211_ATTR_MESH_ID]); + memcpy(wdev->ssid, nla_data(info->attrs[NL80211_ATTR_MESH_ID]), + wdev->mesh_id_up_len); + wdev_unlock(wdev); + } + + if (info->attrs[NL80211_ATTR_4ADDR]) { + params.use_4addr = !!nla_get_u8(info->attrs[NL80211_ATTR_4ADDR]); + change = true; + err = nl80211_valid_4addr(rdev, dev, params.use_4addr, ntype); + if (err) + return err; + } else { + params.use_4addr = -1; + } + + if (info->attrs[NL80211_ATTR_MNTR_FLAGS]) { + if (ntype != NL80211_IFTYPE_MONITOR) + return -EINVAL; + err = parse_monitor_flags(info->attrs[NL80211_ATTR_MNTR_FLAGS], + &_flags); + if (err) + return err; + + flags = &_flags; + change = true; + } + + if (change) + err = cfg80211_change_iface(rdev, dev, ntype, flags, &params); + else + err = 0; + + if (!err && params.use_4addr != -1) + dev->ieee80211_ptr->use_4addr = params.use_4addr; + + return err; +} + +static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct vif_params params; + struct net_device *dev; + int err; + enum nl80211_iftype type = NL80211_IFTYPE_UNSPECIFIED; + u32 flags; + + memset(&params, 0, sizeof(params)); + + if (!info->attrs[NL80211_ATTR_IFNAME]) + return -EINVAL; + + if (info->attrs[NL80211_ATTR_IFTYPE]) { + type = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]); + if (type > NL80211_IFTYPE_MAX) + return -EINVAL; + } + + if (!rdev->ops->add_virtual_intf || + !(rdev->wiphy.interface_modes & (1 << type))) + return -EOPNOTSUPP; + + if (info->attrs[NL80211_ATTR_4ADDR]) { + params.use_4addr = !!nla_get_u8(info->attrs[NL80211_ATTR_4ADDR]); + err = nl80211_valid_4addr(rdev, NULL, params.use_4addr, type); + if (err) + return err; + } + + err = parse_monitor_flags(type == NL80211_IFTYPE_MONITOR ? + info->attrs[NL80211_ATTR_MNTR_FLAGS] : NULL, + &flags); + dev = rdev->ops->add_virtual_intf(&rdev->wiphy, + nla_data(info->attrs[NL80211_ATTR_IFNAME]), + type, err ?
NULL : &flags, ¶ms); + if (IS_ERR(dev)) + return PTR_ERR(dev); + + if (type == NL80211_IFTYPE_MESH_POINT && + info->attrs[NL80211_ATTR_MESH_ID]) { + struct wireless_dev *wdev = dev->ieee80211_ptr; + + wdev_lock(wdev); + BUILD_BUG_ON(IEEE80211_MAX_SSID_LEN != + IEEE80211_MAX_MESH_ID_LEN); + wdev->mesh_id_up_len = + nla_len(info->attrs[NL80211_ATTR_MESH_ID]); + memcpy(wdev->ssid, nla_data(info->attrs[NL80211_ATTR_MESH_ID]), + wdev->mesh_id_up_len); + wdev_unlock(wdev); + } + + return 0; +} + +static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + + if (!rdev->ops->del_virtual_intf) + return -EOPNOTSUPP; + + return rdev->ops->del_virtual_intf(&rdev->wiphy, dev); +} + +struct get_key_cookie { + struct sk_buff *msg; + int error; + int idx; +}; + +static void get_key_callback(void *c, struct key_params *params) +{ + struct nlattr *key; + struct get_key_cookie *cookie = c; + + if (params->key) + NLA_PUT(cookie->msg, NL80211_ATTR_KEY_DATA, + params->key_len, params->key); + + if (params->seq) + NLA_PUT(cookie->msg, NL80211_ATTR_KEY_SEQ, + params->seq_len, params->seq); + + if (params->cipher) + NLA_PUT_U32(cookie->msg, NL80211_ATTR_KEY_CIPHER, + params->cipher); + + key = nla_nest_start(cookie->msg, NL80211_ATTR_KEY); + if (!key) + goto nla_put_failure; + + if (params->key) + NLA_PUT(cookie->msg, NL80211_KEY_DATA, + params->key_len, params->key); + + if (params->seq) + NLA_PUT(cookie->msg, NL80211_KEY_SEQ, + params->seq_len, params->seq); + + if (params->cipher) + NLA_PUT_U32(cookie->msg, NL80211_KEY_CIPHER, + params->cipher); + + NLA_PUT_U8(cookie->msg, NL80211_ATTR_KEY_IDX, cookie->idx); + + nla_nest_end(cookie->msg, key); + + return; + nla_put_failure: + cookie->error = 1; +} + +static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + int err; + struct net_device *dev = info->user_ptr[1]; + u8 key_idx = 0; + const u8 *mac_addr = NULL; + bool pairwise; + struct get_key_cookie cookie = { + .error = 0, + }; + void *hdr; + struct sk_buff *msg; + + if (info->attrs[NL80211_ATTR_KEY_IDX]) + key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]); + + if (key_idx > 5) + return -EINVAL; + + if (info->attrs[NL80211_ATTR_MAC]) + mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + + pairwise = !!mac_addr; + if (info->attrs[NL80211_ATTR_KEY_TYPE]) { + u32 kt = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]); + if (kt >= NUM_NL80211_KEYTYPES) + return -EINVAL; + if (kt != NL80211_KEYTYPE_GROUP && + kt != NL80211_KEYTYPE_PAIRWISE) + return -EINVAL; + pairwise = kt == NL80211_KEYTYPE_PAIRWISE; + } + + if (!rdev->ops->get_key) + return -EOPNOTSUPP; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, + NL80211_CMD_NEW_KEY); + if (IS_ERR(hdr)) + return PTR_ERR(hdr); + + cookie.msg = msg; + cookie.idx = key_idx; + + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, dev->ifindex); + NLA_PUT_U8(msg, NL80211_ATTR_KEY_IDX, key_idx); + if (mac_addr) + NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, mac_addr); + + if (pairwise && mac_addr && + !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) + return -ENOENT; + + err = rdev->ops->get_key(&rdev->wiphy, dev, key_idx, pairwise, + mac_addr, &cookie, get_key_callback); + + if (err) + goto free_msg; + + if (cookie.error) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 
genlmsg_reply(msg, info); + + nla_put_failure: + err = -ENOBUFS; + free_msg: + nlmsg_free(msg); + return err; +} + +static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct key_parse key; + int err; + struct net_device *dev = info->user_ptr[1]; + + err = nl80211_parse_key(info, &key); + if (err) + return err; + + if (key.idx < 0) + return -EINVAL; + + /* only support setting default key */ + if (!key.def && !key.defmgmt) + return -EINVAL; + + wdev_lock(dev->ieee80211_ptr); + + if (key.def) { + if (!rdev->ops->set_default_key) { + err = -EOPNOTSUPP; + goto out; + } + + err = nl80211_key_allowed(dev->ieee80211_ptr); + if (err) + goto out; + + err = rdev->ops->set_default_key(&rdev->wiphy, dev, key.idx, + key.def_uni, key.def_multi); + + if (err) + goto out; + +#ifdef CONFIG_CFG80211_WEXT + dev->ieee80211_ptr->wext.default_key = key.idx; +#endif + } else { + if (key.def_uni || !key.def_multi) { + err = -EINVAL; + goto out; + } + + if (!rdev->ops->set_default_mgmt_key) { + err = -EOPNOTSUPP; + goto out; + } + + err = nl80211_key_allowed(dev->ieee80211_ptr); + if (err) + goto out; + + err = rdev->ops->set_default_mgmt_key(&rdev->wiphy, + dev, key.idx); + if (err) + goto out; + +#ifdef CONFIG_CFG80211_WEXT + dev->ieee80211_ptr->wext.default_mgmt_key = key.idx; +#endif + } + + out: + wdev_unlock(dev->ieee80211_ptr); + + return err; +} + +static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + int err; + struct net_device *dev = info->user_ptr[1]; + struct key_parse key; + const u8 *mac_addr = NULL; + + err = nl80211_parse_key(info, &key); + if (err) + return err; + + if (!key.p.key) + return -EINVAL; + + if (info->attrs[NL80211_ATTR_MAC]) + mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (key.type == -1) { + if (mac_addr) + key.type = NL80211_KEYTYPE_PAIRWISE; + else + key.type = NL80211_KEYTYPE_GROUP; + } + + /* for now */ + if (key.type != NL80211_KEYTYPE_PAIRWISE && + key.type != NL80211_KEYTYPE_GROUP) + return -EINVAL; + + if (!rdev->ops->add_key) + return -EOPNOTSUPP; + + if (cfg80211_validate_key_settings(rdev, &key.p, key.idx, + key.type == NL80211_KEYTYPE_PAIRWISE, + mac_addr)) + return -EINVAL; + + wdev_lock(dev->ieee80211_ptr); + err = nl80211_key_allowed(dev->ieee80211_ptr); + if (!err) + err = rdev->ops->add_key(&rdev->wiphy, dev, key.idx, + key.type == NL80211_KEYTYPE_PAIRWISE, + mac_addr, &key.p); + wdev_unlock(dev->ieee80211_ptr); + + return err; +} + +static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + int err; + struct net_device *dev = info->user_ptr[1]; + u8 *mac_addr = NULL; + struct key_parse key; + + err = nl80211_parse_key(info, &key); + if (err) + return err; + + if (info->attrs[NL80211_ATTR_MAC]) + mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (key.type == -1) { + if (mac_addr) + key.type = NL80211_KEYTYPE_PAIRWISE; + else + key.type = NL80211_KEYTYPE_GROUP; + } + + /* for now */ + if (key.type != NL80211_KEYTYPE_PAIRWISE && + key.type != NL80211_KEYTYPE_GROUP) + return -EINVAL; + + if (!rdev->ops->del_key) + return -EOPNOTSUPP; + + wdev_lock(dev->ieee80211_ptr); + err = nl80211_key_allowed(dev->ieee80211_ptr); + + if (key.type == NL80211_KEYTYPE_PAIRWISE && mac_addr && + !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) + err = -ENOENT; + + if (!err) + err = rdev->ops->del_key(&rdev->wiphy, dev, 
key.idx, + key.type == NL80211_KEYTYPE_PAIRWISE, + mac_addr); + +#ifdef CONFIG_CFG80211_WEXT + if (!err) { + if (key.idx == dev->ieee80211_ptr->wext.default_key) + dev->ieee80211_ptr->wext.default_key = -1; + else if (key.idx == dev->ieee80211_ptr->wext.default_mgmt_key) + dev->ieee80211_ptr->wext.default_mgmt_key = -1; + } +#endif + wdev_unlock(dev->ieee80211_ptr); + + return err; +} + +static int nl80211_addset_beacon(struct sk_buff *skb, struct genl_info *info) +{ + int (*call)(struct wiphy *wiphy, struct net_device *dev, + struct beacon_parameters *info); + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct beacon_parameters params; + int haveinfo = 0, err; + + if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_BEACON_TAIL])) + return -EINVAL; + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) + return -EOPNOTSUPP; + + memset(¶ms, 0, sizeof(params)); + + switch (info->genlhdr->cmd) { + case NL80211_CMD_NEW_BEACON: + /* these are required for NEW_BEACON */ + if (!info->attrs[NL80211_ATTR_BEACON_INTERVAL] || + !info->attrs[NL80211_ATTR_DTIM_PERIOD] || + !info->attrs[NL80211_ATTR_BEACON_HEAD]) + return -EINVAL; + + params.interval = + nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]); + params.dtim_period = + nla_get_u32(info->attrs[NL80211_ATTR_DTIM_PERIOD]); + + err = cfg80211_validate_beacon_int(rdev, params.interval); + if (err) + return err; + + call = rdev->ops->add_beacon; + break; + case NL80211_CMD_SET_BEACON: + call = rdev->ops->set_beacon; + break; + default: + WARN_ON(1); + return -EOPNOTSUPP; + } + + if (!call) + return -EOPNOTSUPP; + + if (info->attrs[NL80211_ATTR_BEACON_HEAD]) { + params.head = nla_data(info->attrs[NL80211_ATTR_BEACON_HEAD]); + params.head_len = + nla_len(info->attrs[NL80211_ATTR_BEACON_HEAD]); + haveinfo = 1; + } + + if (info->attrs[NL80211_ATTR_BEACON_TAIL]) { + params.tail = nla_data(info->attrs[NL80211_ATTR_BEACON_TAIL]); + params.tail_len = + nla_len(info->attrs[NL80211_ATTR_BEACON_TAIL]); + haveinfo = 1; + } + + if (!haveinfo) + return -EINVAL; + + err = call(&rdev->wiphy, dev, ¶ms); + if (!err && params.interval) + wdev->beacon_interval = params.interval; + return err; +} + +static int nl80211_del_beacon(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + int err; + + if (!rdev->ops->del_beacon) + return -EOPNOTSUPP; + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) + return -EOPNOTSUPP; + + err = rdev->ops->del_beacon(&rdev->wiphy, dev); + if (!err) + wdev->beacon_interval = 0; + return err; +} + +static const struct nla_policy sta_flags_policy[NL80211_STA_FLAG_MAX + 1] = { + [NL80211_STA_FLAG_AUTHORIZED] = { .type = NLA_FLAG }, + [NL80211_STA_FLAG_SHORT_PREAMBLE] = { .type = NLA_FLAG }, + [NL80211_STA_FLAG_WME] = { .type = NLA_FLAG }, + [NL80211_STA_FLAG_MFP] = { .type = NLA_FLAG }, + [NL80211_STA_FLAG_AUTHENTICATED] = { .type = NLA_FLAG }, +}; + +static int parse_station_flags(struct genl_info *info, + struct station_parameters *params) +{ + struct nlattr *flags[NL80211_STA_FLAG_MAX + 1]; + struct nlattr *nla; + int flag; + + /* + * Try parsing the new attribute first so userspace + * can specify both for older kernels. 
+ */ + nla = info->attrs[NL80211_ATTR_STA_FLAGS2]; + if (nla) { + struct nl80211_sta_flag_update *sta_flags; + + sta_flags = nla_data(nla); + params->sta_flags_mask = sta_flags->mask; + params->sta_flags_set = sta_flags->set; + if ((params->sta_flags_mask | + params->sta_flags_set) & BIT(__NL80211_STA_FLAG_INVALID)) + return -EINVAL; + return 0; + } + + /* if present, parse the old attribute */ + + nla = info->attrs[NL80211_ATTR_STA_FLAGS]; + if (!nla) + return 0; + + if (nla_parse_nested(flags, NL80211_STA_FLAG_MAX, + nla, sta_flags_policy)) + return -EINVAL; + + params->sta_flags_mask = (1 << __NL80211_STA_FLAG_AFTER_LAST) - 1; + params->sta_flags_mask &= ~1; + + for (flag = 1; flag <= NL80211_STA_FLAG_MAX; flag++) + if (flags[flag]) + params->sta_flags_set |= (1<<flag); + + return 0; +} + +static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, + int attr) +{ + struct nlattr *rate; + u16 bitrate; + + rate = nla_nest_start(msg, attr); + if (!rate) + goto nla_put_failure; + + /* cfg80211_calculate_bitrate will return 0 for mcs >= 32 */ + bitrate = cfg80211_calculate_bitrate(info); + if (bitrate > 0) + NLA_PUT_U16(msg, NL80211_RATE_INFO_BITRATE, bitrate); + + if (info->flags & RATE_INFO_FLAGS_MCS) + NLA_PUT_U8(msg, NL80211_RATE_INFO_MCS, info->mcs); + if (info->flags & RATE_INFO_FLAGS_40_MHZ_WIDTH) + NLA_PUT_FLAG(msg, NL80211_RATE_INFO_40_MHZ_WIDTH); + if (info->flags & RATE_INFO_FLAGS_SHORT_GI) + NLA_PUT_FLAG(msg, NL80211_RATE_INFO_SHORT_GI); + + nla_nest_end(msg, rate); + return true; + +nla_put_failure: + return false; +} + +static int nl80211_send_station(struct sk_buff *msg, u32 pid, u32 seq, + int flags, struct net_device *dev, + const u8 *mac_addr, struct station_info *sinfo) +{ + void *hdr; + struct nlattr *sinfoattr, *bss_param; + + hdr = nl80211hdr_put(msg, pid, seq, flags, NL80211_CMD_NEW_STATION); + if (!hdr) + return -1; + + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, dev->ifindex); + NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, mac_addr); + + NLA_PUT_U32(msg, NL80211_ATTR_GENERATION, sinfo->generation); + + sinfoattr = nla_nest_start(msg, NL80211_ATTR_STA_INFO); + if (!sinfoattr) + goto nla_put_failure; + if (sinfo->filled & STATION_INFO_CONNECTED_TIME) + NLA_PUT_U32(msg, NL80211_STA_INFO_CONNECTED_TIME, + sinfo->connected_time); + if (sinfo->filled & STATION_INFO_INACTIVE_TIME) + NLA_PUT_U32(msg, NL80211_STA_INFO_INACTIVE_TIME, + sinfo->inactive_time); + if (sinfo->filled & STATION_INFO_RX_BYTES) + NLA_PUT_U32(msg, NL80211_STA_INFO_RX_BYTES, + sinfo->rx_bytes); + if (sinfo->filled & STATION_INFO_TX_BYTES) + NLA_PUT_U32(msg, NL80211_STA_INFO_TX_BYTES, + sinfo->tx_bytes); + if (sinfo->filled & STATION_INFO_LLID) + NLA_PUT_U16(msg, NL80211_STA_INFO_LLID, + sinfo->llid); + if (sinfo->filled & STATION_INFO_PLID) + NLA_PUT_U16(msg, NL80211_STA_INFO_PLID, + sinfo->plid); + if (sinfo->filled & STATION_INFO_PLINK_STATE) + NLA_PUT_U8(msg, NL80211_STA_INFO_PLINK_STATE, + sinfo->plink_state); + if (sinfo->filled & STATION_INFO_SIGNAL) + NLA_PUT_U8(msg, NL80211_STA_INFO_SIGNAL, + sinfo->signal); + if (sinfo->filled & STATION_INFO_SIGNAL_AVG) + NLA_PUT_U8(msg, NL80211_STA_INFO_SIGNAL_AVG, + sinfo->signal_avg); + if (sinfo->filled & STATION_INFO_TX_BITRATE) { + if (!nl80211_put_sta_rate(msg, &sinfo->txrate, + NL80211_STA_INFO_TX_BITRATE)) + goto nla_put_failure; + } + if (sinfo->filled & STATION_INFO_RX_BITRATE) { + if (!nl80211_put_sta_rate(msg, &sinfo->rxrate, + NL80211_STA_INFO_RX_BITRATE)) + goto nla_put_failure; + } + if (sinfo->filled & STATION_INFO_RX_PACKETS) + NLA_PUT_U32(msg, NL80211_STA_INFO_RX_PACKETS, + sinfo->rx_packets); + if (sinfo->filled & STATION_INFO_TX_PACKETS) + NLA_PUT_U32(msg, NL80211_STA_INFO_TX_PACKETS, + sinfo->tx_packets); + if (sinfo->filled & STATION_INFO_TX_RETRIES) + NLA_PUT_U32(msg, NL80211_STA_INFO_TX_RETRIES, +
sinfo->tx_retries); + if (sinfo->filled & STATION_INFO_TX_FAILED) + NLA_PUT_U32(msg, NL80211_STA_INFO_TX_FAILED, + sinfo->tx_failed); + if (sinfo->filled & STATION_INFO_BSS_PARAM) { + bss_param = nla_nest_start(msg, NL80211_STA_INFO_BSS_PARAM); + if (!bss_param) + goto nla_put_failure; + + if (sinfo->bss_param.flags & BSS_PARAM_FLAGS_CTS_PROT) + NLA_PUT_FLAG(msg, NL80211_STA_BSS_PARAM_CTS_PROT); + if (sinfo->bss_param.flags & BSS_PARAM_FLAGS_SHORT_PREAMBLE) + NLA_PUT_FLAG(msg, NL80211_STA_BSS_PARAM_SHORT_PREAMBLE); + if (sinfo->bss_param.flags & BSS_PARAM_FLAGS_SHORT_SLOT_TIME) + NLA_PUT_FLAG(msg, + NL80211_STA_BSS_PARAM_SHORT_SLOT_TIME); + NLA_PUT_U8(msg, NL80211_STA_BSS_PARAM_DTIM_PERIOD, + sinfo->bss_param.dtim_period); + NLA_PUT_U16(msg, NL80211_STA_BSS_PARAM_BEACON_INTERVAL, + sinfo->bss_param.beacon_interval); + + nla_nest_end(msg, bss_param); + } + nla_nest_end(msg, sinfoattr); + + if (sinfo->filled & STATION_INFO_ASSOC_REQ_IES) + NLA_PUT(msg, NL80211_ATTR_IE, sinfo->assoc_req_ies_len, + sinfo->assoc_req_ies); + + return genlmsg_end(msg, hdr); + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int nl80211_dump_station(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct station_info sinfo; + struct cfg80211_registered_device *dev; + struct net_device *netdev; + u8 mac_addr[ETH_ALEN]; + int sta_idx = cb->args[1]; + int err; + + err = nl80211_prepare_netdev_dump(skb, cb, &dev, &netdev); + if (err) + return err; + + if (!dev->ops->dump_station) { + err = -EOPNOTSUPP; + goto out_err; + } + + while (1) { + memset(&sinfo, 0, sizeof(sinfo)); + err = dev->ops->dump_station(&dev->wiphy, netdev, sta_idx, + mac_addr, &sinfo); + if (err == -ENOENT) + break; + if (err) + goto out_err; + + if (nl80211_send_station(skb, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + netdev, mac_addr, + &sinfo) < 0) + goto out; + + sta_idx++; + } + + + out: + cb->args[1] = sta_idx; + err = skb->len; + out_err: + nl80211_finish_netdev_dump(dev); + + return err; +} + +static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct station_info sinfo; + struct sk_buff *msg; + u8 *mac_addr = NULL; + int err; + + memset(&sinfo, 0, sizeof(sinfo)); + + if (!info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (!rdev->ops->get_station) + return -EOPNOTSUPP; + + err = rdev->ops->get_station(&rdev->wiphy, dev, mac_addr, &sinfo); + if (err) + return err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + if (nl80211_send_station(msg, info->snd_pid, info->snd_seq, 0, + dev, mac_addr, &sinfo) < 0) { + nlmsg_free(msg); + return -ENOBUFS; + } + + return genlmsg_reply(msg, info); +} + +/* + * Get vlan interface making sure it is running and on the right wiphy. 
+ */ +static int get_vlan(struct genl_info *info, + struct cfg80211_registered_device *rdev, + struct net_device **vlan) +{ + struct nlattr *vlanattr = info->attrs[NL80211_ATTR_STA_VLAN]; + *vlan = NULL; + + if (vlanattr) { + *vlan = dev_get_by_index(genl_info_net(info), + nla_get_u32(vlanattr)); + if (!*vlan) + return -ENODEV; + if (!(*vlan)->ieee80211_ptr) + return -EINVAL; + if ((*vlan)->ieee80211_ptr->wiphy != &rdev->wiphy) + return -EINVAL; + if (!netif_running(*vlan)) + return -ENETDOWN; + } + return 0; +} + +static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + int err; + struct net_device *dev = info->user_ptr[1]; + struct station_parameters params; + u8 *mac_addr = NULL; + + memset(¶ms, 0, sizeof(params)); + + params.listen_interval = -1; + params.plink_state = -1; + + if (info->attrs[NL80211_ATTR_STA_AID]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]) { + params.supported_rates = + nla_data(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]); + params.supported_rates_len = + nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]); + } + + if (info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]) + params.listen_interval = + nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]); + + if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) + params.ht_capa = + nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]); + + if (parse_station_flags(info, ¶ms)) + return -EINVAL; + + if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION]) + params.plink_action = + nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]); + + if (info->attrs[NL80211_ATTR_STA_PLINK_STATE]) + params.plink_state = + nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_STATE]); + + err = get_vlan(info, rdev, ¶ms.vlan); + if (err) + goto out; + + /* validate settings */ + err = 0; + + switch (dev->ieee80211_ptr->iftype) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_P2P_GO: + /* disallow mesh-specific things */ + if (params.plink_action) + err = -EINVAL; + break; + case NL80211_IFTYPE_P2P_CLIENT: + case NL80211_IFTYPE_STATION: + /* disallow everything but AUTHORIZED flag */ + if (params.plink_action) + err = -EINVAL; + if (params.vlan) + err = -EINVAL; + if (params.supported_rates) + err = -EINVAL; + if (params.ht_capa) + err = -EINVAL; + if (params.listen_interval >= 0) + err = -EINVAL; + if (params.sta_flags_mask & ~BIT(NL80211_STA_FLAG_AUTHORIZED)) + err = -EINVAL; + break; + case NL80211_IFTYPE_MESH_POINT: + /* disallow things mesh doesn't support */ + if (params.vlan) + err = -EINVAL; + if (params.ht_capa) + err = -EINVAL; + if (params.listen_interval >= 0) + err = -EINVAL; + if (params.sta_flags_mask & + ~(BIT(NL80211_STA_FLAG_AUTHENTICATED) | + BIT(NL80211_STA_FLAG_MFP) | + BIT(NL80211_STA_FLAG_AUTHORIZED))) + err = -EINVAL; + break; + default: + err = -EINVAL; + } + + if (err) + goto out; + + if (!rdev->ops->change_station) { + err = -EOPNOTSUPP; + goto out; + } + + err = rdev->ops->change_station(&rdev->wiphy, dev, mac_addr, ¶ms); + + out: + if (params.vlan) + dev_put(params.vlan); + + return err; +} + +static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + int err; + struct net_device *dev = info->user_ptr[1]; + struct station_parameters params; + u8 *mac_addr = NULL; + + memset(¶ms, 0, 
sizeof(params)); + + if (!info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_STA_AID]) + return -EINVAL; + + mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + params.supported_rates = + nla_data(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]); + params.supported_rates_len = + nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]); + params.listen_interval = + nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]); + + params.aid = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]); + if (!params.aid || params.aid > IEEE80211_MAX_AID) + return -EINVAL; + + if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) + params.ht_capa = + nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]); + + if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION]) + params.plink_action = + nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]); + + if (parse_station_flags(info, ¶ms)) + return -EINVAL; + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) + return -EINVAL; + + err = get_vlan(info, rdev, ¶ms.vlan); + if (err) + goto out; + + /* validate settings */ + err = 0; + + if (!rdev->ops->add_station) { + err = -EOPNOTSUPP; + goto out; + } + + err = rdev->ops->add_station(&rdev->wiphy, dev, mac_addr, ¶ms); + + out: + if (params.vlan) + dev_put(params.vlan); + return err; +} + +static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + u8 *mac_addr = NULL; + + if (info->attrs[NL80211_ATTR_MAC]) + mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) + return -EINVAL; + + if (!rdev->ops->del_station) + return -EOPNOTSUPP; + + return rdev->ops->del_station(&rdev->wiphy, dev, mac_addr); +} + +static int nl80211_send_mpath(struct sk_buff *msg, u32 pid, u32 seq, + int flags, struct net_device *dev, + u8 *dst, u8 *next_hop, + struct mpath_info *pinfo) +{ + void *hdr; + struct nlattr *pinfoattr; + + hdr = nl80211hdr_put(msg, pid, seq, flags, NL80211_CMD_NEW_STATION); + if (!hdr) + return -1; + + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, dev->ifindex); + NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, dst); + NLA_PUT(msg, NL80211_ATTR_MPATH_NEXT_HOP, ETH_ALEN, next_hop); + + NLA_PUT_U32(msg, NL80211_ATTR_GENERATION, pinfo->generation); + + pinfoattr = nla_nest_start(msg, NL80211_ATTR_MPATH_INFO); + if (!pinfoattr) + goto nla_put_failure; + if (pinfo->filled & MPATH_INFO_FRAME_QLEN) + NLA_PUT_U32(msg, NL80211_MPATH_INFO_FRAME_QLEN, + pinfo->frame_qlen); + if (pinfo->filled & MPATH_INFO_SN) + NLA_PUT_U32(msg, NL80211_MPATH_INFO_SN, + pinfo->sn); + if (pinfo->filled & MPATH_INFO_METRIC) + NLA_PUT_U32(msg, NL80211_MPATH_INFO_METRIC, + pinfo->metric); + if (pinfo->filled & MPATH_INFO_EXPTIME) + NLA_PUT_U32(msg, NL80211_MPATH_INFO_EXPTIME, + pinfo->exptime); + if (pinfo->filled & MPATH_INFO_FLAGS) + NLA_PUT_U8(msg, NL80211_MPATH_INFO_FLAGS, + pinfo->flags); + if (pinfo->filled & MPATH_INFO_DISCOVERY_TIMEOUT) + NLA_PUT_U32(msg, 
NL80211_MPATH_INFO_DISCOVERY_TIMEOUT, + pinfo->discovery_timeout); + if (pinfo->filled & MPATH_INFO_DISCOVERY_RETRIES) + NLA_PUT_U8(msg, NL80211_MPATH_INFO_DISCOVERY_RETRIES, + pinfo->discovery_retries); + + nla_nest_end(msg, pinfoattr); + + return genlmsg_end(msg, hdr); + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int nl80211_dump_mpath(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct mpath_info pinfo; + struct cfg80211_registered_device *dev; + struct net_device *netdev; + u8 dst[ETH_ALEN]; + u8 next_hop[ETH_ALEN]; + int path_idx = cb->args[1]; + int err; + + err = nl80211_prepare_netdev_dump(skb, cb, &dev, &netdev); + if (err) + return err; + + if (!dev->ops->dump_mpath) { + err = -EOPNOTSUPP; + goto out_err; + } + + if (netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) { + err = -EOPNOTSUPP; + goto out_err; + } + + while (1) { + err = dev->ops->dump_mpath(&dev->wiphy, netdev, path_idx, + dst, next_hop, &pinfo); + if (err == -ENOENT) + break; + if (err) + goto out_err; + + if (nl80211_send_mpath(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + netdev, dst, next_hop, + &pinfo) < 0) + goto out; + + path_idx++; + } + + + out: + cb->args[1] = path_idx; + err = skb->len; + out_err: + nl80211_finish_netdev_dump(dev); + return err; +} + +static int nl80211_get_mpath(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + int err; + struct net_device *dev = info->user_ptr[1]; + struct mpath_info pinfo; + struct sk_buff *msg; + u8 *dst = NULL; + u8 next_hop[ETH_ALEN]; + + memset(&pinfo, 0, sizeof(pinfo)); + + if (!info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + dst = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (!rdev->ops->get_mpath) + return -EOPNOTSUPP; + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) + return -EOPNOTSUPP; + + err = rdev->ops->get_mpath(&rdev->wiphy, dev, dst, next_hop, &pinfo); + if (err) + return err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + if (nl80211_send_mpath(msg, info->snd_pid, info->snd_seq, 0, + dev, dst, next_hop, &pinfo) < 0) { + nlmsg_free(msg); + return -ENOBUFS; + } + + return genlmsg_reply(msg, info); +} + +static int nl80211_set_mpath(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + u8 *dst = NULL; + u8 *next_hop = NULL; + + if (!info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_MPATH_NEXT_HOP]) + return -EINVAL; + + dst = nla_data(info->attrs[NL80211_ATTR_MAC]); + next_hop = nla_data(info->attrs[NL80211_ATTR_MPATH_NEXT_HOP]); + + if (!rdev->ops->change_mpath) + return -EOPNOTSUPP; + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) + return -EOPNOTSUPP; + + return rdev->ops->change_mpath(&rdev->wiphy, dev, dst, next_hop); +} + +static int nl80211_new_mpath(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + u8 *dst = NULL; + u8 *next_hop = NULL; + + if (!info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_MPATH_NEXT_HOP]) + return -EINVAL; + + dst = nla_data(info->attrs[NL80211_ATTR_MAC]); + next_hop = nla_data(info->attrs[NL80211_ATTR_MPATH_NEXT_HOP]); + + if (!rdev->ops->add_mpath) + return -EOPNOTSUPP; + + if (dev->ieee80211_ptr->iftype != 
NL80211_IFTYPE_MESH_POINT) + return -EOPNOTSUPP; + + return rdev->ops->add_mpath(&rdev->wiphy, dev, dst, next_hop); +} + +static int nl80211_del_mpath(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + u8 *dst = NULL; + + if (info->attrs[NL80211_ATTR_MAC]) + dst = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (!rdev->ops->del_mpath) + return -EOPNOTSUPP; + + return rdev->ops->del_mpath(&rdev->wiphy, dev, dst); +} + +static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct bss_parameters params; + + memset(&params, 0, sizeof(params)); + /* default to not changing parameters */ + params.use_cts_prot = -1; + params.use_short_preamble = -1; + params.use_short_slot_time = -1; + params.ap_isolate = -1; + params.ht_opmode = -1; + + if (info->attrs[NL80211_ATTR_BSS_CTS_PROT]) + params.use_cts_prot = + nla_get_u8(info->attrs[NL80211_ATTR_BSS_CTS_PROT]); + if (info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]) + params.use_short_preamble = + nla_get_u8(info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]); + if (info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]) + params.use_short_slot_time = + nla_get_u8(info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]); + if (info->attrs[NL80211_ATTR_BSS_BASIC_RATES]) { + params.basic_rates = + nla_data(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]); + params.basic_rates_len = + nla_len(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]); + } + if (info->attrs[NL80211_ATTR_AP_ISOLATE]) + params.ap_isolate = !!nla_get_u8(info->attrs[NL80211_ATTR_AP_ISOLATE]); + if (info->attrs[NL80211_ATTR_BSS_HT_OPMODE]) + params.ht_opmode = + nla_get_u16(info->attrs[NL80211_ATTR_BSS_HT_OPMODE]); + + if (!rdev->ops->change_bss) + return -EOPNOTSUPP; + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) + return -EOPNOTSUPP; + + return rdev->ops->change_bss(&rdev->wiphy, dev, &params); +} + +static const struct nla_policy reg_rule_policy[NL80211_REG_RULE_ATTR_MAX + 1] = { + [NL80211_ATTR_REG_RULE_FLAGS] = { .type = NLA_U32 }, + [NL80211_ATTR_FREQ_RANGE_START] = { .type = NLA_U32 }, + [NL80211_ATTR_FREQ_RANGE_END] = { .type = NLA_U32 }, + [NL80211_ATTR_FREQ_RANGE_MAX_BW] = { .type = NLA_U32 }, + [NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN] = { .type = NLA_U32 }, + [NL80211_ATTR_POWER_RULE_MAX_EIRP] = { .type = NLA_U32 }, +}; + +static int parse_reg_rule(struct nlattr *tb[], + struct ieee80211_reg_rule *reg_rule) +{ + struct ieee80211_freq_range *freq_range = &reg_rule->freq_range; + struct ieee80211_power_rule *power_rule = &reg_rule->power_rule; + + if (!tb[NL80211_ATTR_REG_RULE_FLAGS]) + return -EINVAL; + if (!tb[NL80211_ATTR_FREQ_RANGE_START]) + return -EINVAL; + if (!tb[NL80211_ATTR_FREQ_RANGE_END]) + return -EINVAL; + if (!tb[NL80211_ATTR_FREQ_RANGE_MAX_BW]) + return -EINVAL; + if (!tb[NL80211_ATTR_POWER_RULE_MAX_EIRP]) + return -EINVAL; + + reg_rule->flags = nla_get_u32(tb[NL80211_ATTR_REG_RULE_FLAGS]); + + freq_range->start_freq_khz = + nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_START]); + freq_range->end_freq_khz = + nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_END]); + freq_range->max_bandwidth_khz = + nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_MAX_BW]); + + power_rule->max_eirp = + nla_get_u32(tb[NL80211_ATTR_POWER_RULE_MAX_EIRP]); + + if (tb[NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN]) + power_rule->max_antenna_gain = +
nla_get_u32(tb[NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN]); + + return 0; +} + +static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) +{ + int r; + char *data = NULL; + + /* + * You should only get this when cfg80211 hasn't yet initialized + * completely when built-in to the kernel right between the time + * window between nl80211_init() and regulatory_init(), if that is + * even possible. + */ + mutex_lock(&cfg80211_mutex); + if (unlikely(!cfg80211_regdomain)) { + mutex_unlock(&cfg80211_mutex); + return -EINPROGRESS; + } + mutex_unlock(&cfg80211_mutex); + + if (!info->attrs[NL80211_ATTR_REG_ALPHA2]) + return -EINVAL; + + data = nla_data(info->attrs[NL80211_ATTR_REG_ALPHA2]); + + r = regulatory_hint_user(data); + + return r; +} + +static int nl80211_get_mesh_config(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct mesh_config cur_params; + int err = 0; + void *hdr; + struct nlattr *pinfoattr; + struct sk_buff *msg; + + if (wdev->iftype != NL80211_IFTYPE_MESH_POINT) + return -EOPNOTSUPP; + + if (!rdev->ops->get_mesh_config) + return -EOPNOTSUPP; + + wdev_lock(wdev); + /* If not connected, get default parameters */ + if (!wdev->mesh_id_len) + memcpy(&cur_params, &default_mesh_config, sizeof(cur_params)); + else + err = rdev->ops->get_mesh_config(&rdev->wiphy, dev, + &cur_params); + wdev_unlock(wdev); + + if (err) + return err; + + /* Draw up a netlink message to send back */ + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, + NL80211_CMD_GET_MESH_CONFIG); + if (!hdr) + goto out; + pinfoattr = nla_nest_start(msg, NL80211_ATTR_MESH_CONFIG); + if (!pinfoattr) + goto nla_put_failure; + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, dev->ifindex); + NLA_PUT_U16(msg, NL80211_MESHCONF_RETRY_TIMEOUT, + cur_params.dot11MeshRetryTimeout); + NLA_PUT_U16(msg, NL80211_MESHCONF_CONFIRM_TIMEOUT, + cur_params.dot11MeshConfirmTimeout); + NLA_PUT_U16(msg, NL80211_MESHCONF_HOLDING_TIMEOUT, + cur_params.dot11MeshHoldingTimeout); + NLA_PUT_U16(msg, NL80211_MESHCONF_MAX_PEER_LINKS, + cur_params.dot11MeshMaxPeerLinks); + NLA_PUT_U8(msg, NL80211_MESHCONF_MAX_RETRIES, + cur_params.dot11MeshMaxRetries); + NLA_PUT_U8(msg, NL80211_MESHCONF_TTL, + cur_params.dot11MeshTTL); + NLA_PUT_U8(msg, NL80211_MESHCONF_ELEMENT_TTL, + cur_params.element_ttl); + NLA_PUT_U8(msg, NL80211_MESHCONF_AUTO_OPEN_PLINKS, + cur_params.auto_open_plinks); + NLA_PUT_U8(msg, NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES, + cur_params.dot11MeshHWMPmaxPREQretries); + NLA_PUT_U32(msg, NL80211_MESHCONF_PATH_REFRESH_TIME, + cur_params.path_refresh_time); + NLA_PUT_U16(msg, NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT, + cur_params.min_discovery_timeout); + NLA_PUT_U32(msg, NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT, + cur_params.dot11MeshHWMPactivePathTimeout); + NLA_PUT_U16(msg, NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL, + cur_params.dot11MeshHWMPpreqMinInterval); + NLA_PUT_U16(msg, NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME, + cur_params.dot11MeshHWMPnetDiameterTraversalTime); + NLA_PUT_U8(msg, NL80211_MESHCONF_HWMP_ROOTMODE, + cur_params.dot11MeshHWMPRootMode); + nla_nest_end(msg, pinfoattr); + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + + nla_put_failure: + genlmsg_cancel(msg, hdr); + out: + nlmsg_free(msg); + return -ENOBUFS; +} + +static const struct nla_policy 
nl80211_meshconf_params_policy[NL80211_MESHCONF_ATTR_MAX+1] = { + [NL80211_MESHCONF_RETRY_TIMEOUT] = { .type = NLA_U16 }, + [NL80211_MESHCONF_CONFIRM_TIMEOUT] = { .type = NLA_U16 }, + [NL80211_MESHCONF_HOLDING_TIMEOUT] = { .type = NLA_U16 }, + [NL80211_MESHCONF_MAX_PEER_LINKS] = { .type = NLA_U16 }, + [NL80211_MESHCONF_MAX_RETRIES] = { .type = NLA_U8 }, + [NL80211_MESHCONF_TTL] = { .type = NLA_U8 }, + [NL80211_MESHCONF_ELEMENT_TTL] = { .type = NLA_U8 }, + [NL80211_MESHCONF_AUTO_OPEN_PLINKS] = { .type = NLA_U8 }, + + [NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES] = { .type = NLA_U8 }, + [NL80211_MESHCONF_PATH_REFRESH_TIME] = { .type = NLA_U32 }, + [NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT] = { .type = NLA_U16 }, + [NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT] = { .type = NLA_U32 }, + [NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL] = { .type = NLA_U16 }, + [NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME] = { .type = NLA_U16 }, +}; + +static const struct nla_policy + nl80211_mesh_setup_params_policy[NL80211_MESH_SETUP_ATTR_MAX+1] = { + [NL80211_MESH_SETUP_ENABLE_VENDOR_PATH_SEL] = { .type = NLA_U8 }, + [NL80211_MESH_SETUP_ENABLE_VENDOR_METRIC] = { .type = NLA_U8 }, + [NL80211_MESH_SETUP_USERSPACE_AUTH] = { .type = NLA_FLAG }, + [NL80211_MESH_SETUP_IE] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_DATA_LEN }, + [NL80211_MESH_SETUP_USERSPACE_AMPE] = { .type = NLA_FLAG }, +}; + +static int nl80211_parse_mesh_config(struct genl_info *info, + struct mesh_config *cfg, + u32 *mask_out) +{ + struct nlattr *tb[NL80211_MESHCONF_ATTR_MAX + 1]; + u32 mask = 0; + +#define FILL_IN_MESH_PARAM_IF_SET(table, cfg, param, mask, attr_num, nla_fn) \ +do {\ + if (table[attr_num]) {\ + cfg->param = nla_fn(table[attr_num]); \ + mask |= (1 << (attr_num - 1)); \ + } \ +} while (0);\ + + + if (!info->attrs[NL80211_ATTR_MESH_CONFIG]) + return -EINVAL; + if (nla_parse_nested(tb, NL80211_MESHCONF_ATTR_MAX, + info->attrs[NL80211_ATTR_MESH_CONFIG], + nl80211_meshconf_params_policy)) + return -EINVAL; + + /* This makes sure that there aren't more than 32 mesh config + * parameters (otherwise our bitfield scheme would not work.) 
*/ + BUILD_BUG_ON(NL80211_MESHCONF_ATTR_MAX > 32); + + /* Fill in the params struct */ + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshRetryTimeout, + mask, NL80211_MESHCONF_RETRY_TIMEOUT, nla_get_u16); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConfirmTimeout, + mask, NL80211_MESHCONF_CONFIRM_TIMEOUT, nla_get_u16); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHoldingTimeout, + mask, NL80211_MESHCONF_HOLDING_TIMEOUT, nla_get_u16); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxPeerLinks, + mask, NL80211_MESHCONF_MAX_PEER_LINKS, nla_get_u16); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxRetries, + mask, NL80211_MESHCONF_MAX_RETRIES, nla_get_u8); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshTTL, + mask, NL80211_MESHCONF_TTL, nla_get_u8); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, element_ttl, + mask, NL80211_MESHCONF_ELEMENT_TTL, nla_get_u8); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, auto_open_plinks, + mask, NL80211_MESHCONF_AUTO_OPEN_PLINKS, nla_get_u8); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPmaxPREQretries, + mask, NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES, + nla_get_u8); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, path_refresh_time, + mask, NL80211_MESHCONF_PATH_REFRESH_TIME, nla_get_u32); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, min_discovery_timeout, + mask, NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT, + nla_get_u16); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathTimeout, + mask, NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT, + nla_get_u32); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPpreqMinInterval, + mask, NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL, + nla_get_u16); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, + dot11MeshHWMPnetDiameterTraversalTime, + mask, NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME, + nla_get_u16); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, + dot11MeshHWMPRootMode, mask, + NL80211_MESHCONF_HWMP_ROOTMODE, + nla_get_u8); + if (mask_out) + *mask_out = mask; + + return 0; + +#undef FILL_IN_MESH_PARAM_IF_SET +} + +static int nl80211_parse_mesh_setup(struct genl_info *info, + struct mesh_setup *setup) +{ + struct nlattr *tb[NL80211_MESH_SETUP_ATTR_MAX + 1]; + + if (!info->attrs[NL80211_ATTR_MESH_SETUP]) + return -EINVAL; + if (nla_parse_nested(tb, NL80211_MESH_SETUP_ATTR_MAX, + info->attrs[NL80211_ATTR_MESH_SETUP], + nl80211_mesh_setup_params_policy)) + return -EINVAL; + + if (tb[NL80211_MESH_SETUP_ENABLE_VENDOR_PATH_SEL]) + setup->path_sel_proto = + (nla_get_u8(tb[NL80211_MESH_SETUP_ENABLE_VENDOR_PATH_SEL])) ? + IEEE80211_PATH_PROTOCOL_VENDOR : + IEEE80211_PATH_PROTOCOL_HWMP; + + if (tb[NL80211_MESH_SETUP_ENABLE_VENDOR_METRIC]) + setup->path_metric = + (nla_get_u8(tb[NL80211_MESH_SETUP_ENABLE_VENDOR_METRIC])) ? 
+ IEEE80211_PATH_METRIC_VENDOR : + IEEE80211_PATH_METRIC_AIRTIME; + + + if (tb[NL80211_MESH_SETUP_IE]) { + struct nlattr *ieattr = + tb[NL80211_MESH_SETUP_IE]; + if (!is_valid_ie_attr(ieattr)) + return -EINVAL; + setup->ie = nla_data(ieattr); + setup->ie_len = nla_len(ieattr); + } + setup->is_authenticated = nla_get_flag(tb[NL80211_MESH_SETUP_USERSPACE_AUTH]); + setup->is_secure = nla_get_flag(tb[NL80211_MESH_SETUP_USERSPACE_AMPE]); + + return 0; +} + +static int nl80211_update_mesh_config(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct mesh_config cfg; + u32 mask; + int err; + + if (wdev->iftype != NL80211_IFTYPE_MESH_POINT) + return -EOPNOTSUPP; + + if (!rdev->ops->update_mesh_config) + return -EOPNOTSUPP; + + err = nl80211_parse_mesh_config(info, &cfg, &mask); + if (err) + return err; + + wdev_lock(wdev); + if (!wdev->mesh_id_len) + err = -ENOLINK; + + if (!err) + err = rdev->ops->update_mesh_config(&rdev->wiphy, dev, + mask, &cfg); + + wdev_unlock(wdev); + + return err; +} + +static int nl80211_get_reg(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + void *hdr = NULL; + struct nlattr *nl_reg_rules; + unsigned int i; + int err = -EINVAL; + + mutex_lock(&cfg80211_mutex); + + if (!cfg80211_regdomain) + goto out; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + err = -ENOBUFS; + goto out; + } + + hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, + NL80211_CMD_GET_REG); + if (!hdr) + goto put_failure; + + NLA_PUT_STRING(msg, NL80211_ATTR_REG_ALPHA2, + cfg80211_regdomain->alpha2); + + nl_reg_rules = nla_nest_start(msg, NL80211_ATTR_REG_RULES); + if (!nl_reg_rules) + goto nla_put_failure; + + for (i = 0; i < cfg80211_regdomain->n_reg_rules; i++) { + struct nlattr *nl_reg_rule; + const struct ieee80211_reg_rule *reg_rule; + const struct ieee80211_freq_range *freq_range; + const struct ieee80211_power_rule *power_rule; + + reg_rule = &cfg80211_regdomain->reg_rules[i]; + freq_range = &reg_rule->freq_range; + power_rule = &reg_rule->power_rule; + + nl_reg_rule = nla_nest_start(msg, i); + if (!nl_reg_rule) + goto nla_put_failure; + + NLA_PUT_U32(msg, NL80211_ATTR_REG_RULE_FLAGS, + reg_rule->flags); + NLA_PUT_U32(msg, NL80211_ATTR_FREQ_RANGE_START, + freq_range->start_freq_khz); + NLA_PUT_U32(msg, NL80211_ATTR_FREQ_RANGE_END, + freq_range->end_freq_khz); + NLA_PUT_U32(msg, NL80211_ATTR_FREQ_RANGE_MAX_BW, + freq_range->max_bandwidth_khz); + NLA_PUT_U32(msg, NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN, + power_rule->max_antenna_gain); + NLA_PUT_U32(msg, NL80211_ATTR_POWER_RULE_MAX_EIRP, + power_rule->max_eirp); + + nla_nest_end(msg, nl_reg_rule); + } + + nla_nest_end(msg, nl_reg_rules); + + genlmsg_end(msg, hdr); + err = genlmsg_reply(msg, info); + goto out; + +nla_put_failure: + genlmsg_cancel(msg, hdr); +put_failure: + nlmsg_free(msg); + err = -EMSGSIZE; +out: + mutex_unlock(&cfg80211_mutex); + return err; +} + +static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[NL80211_REG_RULE_ATTR_MAX + 1]; + struct nlattr *nl_reg_rule; + char *alpha2 = NULL; + int rem_reg_rules = 0, r = 0; + u32 num_rules = 0, rule_idx = 0, size_of_regd; + struct ieee80211_regdomain *rd = NULL; + + if (!info->attrs[NL80211_ATTR_REG_ALPHA2]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_REG_RULES]) + return -EINVAL; + + alpha2 =
nla_data(info->attrs[NL80211_ATTR_REG_ALPHA2]); + + nla_for_each_nested(nl_reg_rule, info->attrs[NL80211_ATTR_REG_RULES], + rem_reg_rules) { + num_rules++; + if (num_rules > NL80211_MAX_SUPP_REG_RULES) + return -EINVAL; + } + + mutex_lock(&cfg80211_mutex); + + if (!reg_is_valid_request(alpha2)) { + r = -EINVAL; + goto bad_reg; + } + + size_of_regd = sizeof(struct ieee80211_regdomain) + + (num_rules * sizeof(struct ieee80211_reg_rule)); + + rd = kzalloc(size_of_regd, GFP_KERNEL); + if (!rd) { + r = -ENOMEM; + goto bad_reg; + } + + rd->n_reg_rules = num_rules; + rd->alpha2[0] = alpha2[0]; + rd->alpha2[1] = alpha2[1]; + + nla_for_each_nested(nl_reg_rule, info->attrs[NL80211_ATTR_REG_RULES], + rem_reg_rules) { + nla_parse(tb, NL80211_REG_RULE_ATTR_MAX, + nla_data(nl_reg_rule), nla_len(nl_reg_rule), + reg_rule_policy); + r = parse_reg_rule(tb, &rd->reg_rules[rule_idx]); + if (r) + goto bad_reg; + + rule_idx++; + + if (rule_idx > NL80211_MAX_SUPP_REG_RULES) { + r = -EINVAL; + goto bad_reg; + } + } + + BUG_ON(rule_idx != num_rules); + + r = set_regdom(rd); + + mutex_unlock(&cfg80211_mutex); + + return r; + + bad_reg: + mutex_unlock(&cfg80211_mutex); + kfree(rd); + return r; +} + +static int validate_scan_freqs(struct nlattr *freqs) +{ + struct nlattr *attr1, *attr2; + int n_channels = 0, tmp1, tmp2; + + nla_for_each_nested(attr1, freqs, tmp1) { + n_channels++; + /* + * Some hardware has a limited channel list for + * scanning, and it is pretty much nonsensical + * to scan for a channel twice, so disallow that + * and don't require drivers to check that the + * channel list they get isn't longer than what + * they can scan, as long as they can scan all + * the channels they registered at once. + */ + nla_for_each_nested(attr2, freqs, tmp2) + if (attr1 != attr2 && + nla_get_u32(attr1) == nla_get_u32(attr2)) + return 0; + } + + return n_channels; +} + +static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct cfg80211_scan_request *request; + struct nlattr *attr; + struct wiphy *wiphy; + int err, tmp, n_ssids = 0, n_channels, i; + enum ieee80211_band band; + size_t ie_len; + + if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) + return -EINVAL; + + wiphy = &rdev->wiphy; + + if (!rdev->ops->scan) + return -EOPNOTSUPP; + + if (rdev->scan_req) + return -EBUSY; + + if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { + n_channels = validate_scan_freqs( + info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]); + if (!n_channels) + return -EINVAL; + } else { + n_channels = 0; + + for (band = 0; band < IEEE80211_NUM_BANDS; band++) + if (wiphy->bands[band]) + n_channels += wiphy->bands[band]->n_channels; + } + + if (info->attrs[NL80211_ATTR_SCAN_SSIDS]) + nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp) + n_ssids++; + + if (n_ssids > wiphy->max_scan_ssids) + return -EINVAL; + + if (info->attrs[NL80211_ATTR_IE]) + ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + else + ie_len = 0; + + if (ie_len > wiphy->max_scan_ie_len) + return -EINVAL; + + request = kzalloc(sizeof(*request) + + sizeof(*request->ssids) * n_ssids + + sizeof(*request->channels) * n_channels + + ie_len, GFP_KERNEL); + if (!request) + return -ENOMEM; + + if (n_ssids) + request->ssids = (void *)&request->channels[n_channels]; + request->n_ssids = n_ssids; + if (ie_len) { + if (request->ssids) + request->ie = (void *)(request->ssids + n_ssids); + else + request->ie = (void *)(request->channels + 
n_channels); + } + + i = 0; + if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { + /* user specified, bail out if channel not found */ + nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_FREQUENCIES], tmp) { + struct ieee80211_channel *chan; + + chan = ieee80211_get_channel(wiphy, nla_get_u32(attr)); + + if (!chan) { + err = -EINVAL; + goto out_free; + } + + /* ignore disabled channels */ + if (chan->flags & IEEE80211_CHAN_DISABLED) + continue; + + request->channels[i] = chan; + i++; + } + } else { + /* all channels */ + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + int j; + if (!wiphy->bands[band]) + continue; + for (j = 0; j < wiphy->bands[band]->n_channels; j++) { + struct ieee80211_channel *chan; + + chan = &wiphy->bands[band]->channels[j]; + + if (chan->flags & IEEE80211_CHAN_DISABLED) + continue; + + request->channels[i] = chan; + i++; + } + } + } + + if (!i) { + err = -EINVAL; + goto out_free; + } + + request->n_channels = i; + + i = 0; + if (info->attrs[NL80211_ATTR_SCAN_SSIDS]) { + nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp) { + if (nla_len(attr) > IEEE80211_MAX_SSID_LEN) { + err = -EINVAL; + goto out_free; + } + request->ssids[i].ssid_len = nla_len(attr); + memcpy(request->ssids[i].ssid, nla_data(attr), nla_len(attr)); + i++; + } + } + + if (info->attrs[NL80211_ATTR_IE]) { + request->ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + memcpy((void *)request->ie, + nla_data(info->attrs[NL80211_ATTR_IE]), + request->ie_len); + } + + request->dev = dev; + request->wiphy = &rdev->wiphy; + + rdev->scan_req = request; + err = rdev->ops->scan(&rdev->wiphy, dev, request); + + if (!err) { + nl80211_send_scan_start(rdev, dev); + dev_hold(dev); + } else { + out_free: + rdev->scan_req = NULL; + kfree(request); + } + + return err; +} + +static int nl80211_start_sched_scan(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_sched_scan_request *request; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct nlattr *attr; + struct wiphy *wiphy; + int err, tmp, n_ssids = 0, n_match_sets = 0, n_channels, i; + u32 interval; + enum ieee80211_band band; + size_t ie_len; + struct nlattr *tb[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1]; + + if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) || + !rdev->ops->sched_scan_start) + return -EOPNOTSUPP; + + if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]) + return -EINVAL; + + interval = nla_get_u32(info->attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]); + if (interval == 0) + return -EINVAL; + + wiphy = &rdev->wiphy; + + if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { + n_channels = validate_scan_freqs( + info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]); + if (!n_channels) + return -EINVAL; + } else { + n_channels = 0; + + for (band = 0; band < IEEE80211_NUM_BANDS; band++) + if (wiphy->bands[band]) + n_channels += wiphy->bands[band]->n_channels; + } + + if (info->attrs[NL80211_ATTR_SCAN_SSIDS]) + nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], + tmp) + n_ssids++; + + if (n_ssids > wiphy->max_sched_scan_ssids) + return -EINVAL; + + if (info->attrs[NL80211_ATTR_SCHED_SCAN_MATCH]) + nla_for_each_nested(attr, + info->attrs[NL80211_ATTR_SCHED_SCAN_MATCH], + tmp) + n_match_sets++; + + if (n_match_sets > wiphy->max_match_sets) + return -EINVAL; + + if (info->attrs[NL80211_ATTR_IE]) + ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + else + ie_len = 0; + + if (ie_len > 
wiphy->max_sched_scan_ie_len) + return -EINVAL; + + mutex_lock(&rdev->sched_scan_mtx); + + if (rdev->sched_scan_req) { + err = -EINPROGRESS; + goto out; + } + + request = kzalloc(sizeof(*request) + + sizeof(*request->ssids) * n_ssids + + sizeof(*request->match_sets) * n_match_sets + + sizeof(*request->channels) * n_channels + + ie_len, GFP_KERNEL); + if (!request) { + err = -ENOMEM; + goto out; + } + + if (n_ssids) + request->ssids = (void *)&request->channels[n_channels]; + request->n_ssids = n_ssids; + if (ie_len) { + if (request->ssids) + request->ie = (void *)(request->ssids + n_ssids); + else + request->ie = (void *)(request->channels + n_channels); + } + + if (n_match_sets) { + if (request->ie) + request->match_sets = (void *)(request->ie + ie_len); + else if (request->ssids) + request->match_sets = + (void *)(request->ssids + n_ssids); + else + request->match_sets = + (void *)(request->channels + n_channels); + } + request->n_match_sets = n_match_sets; + + i = 0; + if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { + /* user specified, bail out if channel not found */ + nla_for_each_nested(attr, + info->attrs[NL80211_ATTR_SCAN_FREQUENCIES], + tmp) { + struct ieee80211_channel *chan; + + chan = ieee80211_get_channel(wiphy, nla_get_u32(attr)); + + if (!chan) { + err = -EINVAL; + goto out_free; + } + + /* ignore disabled channels */ + if (chan->flags & IEEE80211_CHAN_DISABLED) + continue; + + request->channels[i] = chan; + i++; + } + } else { + /* all channels */ + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + int j; + if (!wiphy->bands[band]) + continue; + for (j = 0; j < wiphy->bands[band]->n_channels; j++) { + struct ieee80211_channel *chan; + + chan = &wiphy->bands[band]->channels[j]; + + if (chan->flags & IEEE80211_CHAN_DISABLED) + continue; + + request->channels[i] = chan; + i++; + } + } + } + + if (!i) { + err = -EINVAL; + goto out_free; + } + + request->n_channels = i; + + i = 0; + if (info->attrs[NL80211_ATTR_SCAN_SSIDS]) { + nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], + tmp) { + if (nla_len(attr) > IEEE80211_MAX_SSID_LEN) { + err = -EINVAL; + goto out_free; + } + request->ssids[i].ssid_len = nla_len(attr); + memcpy(request->ssids[i].ssid, nla_data(attr), + nla_len(attr)); + i++; + } + } + + i = 0; + if (info->attrs[NL80211_ATTR_SCHED_SCAN_MATCH]) { + nla_for_each_nested(attr, + info->attrs[NL80211_ATTR_SCHED_SCAN_MATCH], + tmp) { + struct nlattr *ssid; + + nla_parse(tb, NL80211_SCHED_SCAN_MATCH_ATTR_MAX, + nla_data(attr), nla_len(attr), + nl80211_match_policy); + ssid = tb[NL80211_ATTR_SCHED_SCAN_MATCH_SSID]; + if (ssid) { + if (nla_len(ssid) > IEEE80211_MAX_SSID_LEN) { + err = -EINVAL; + goto out_free; + } + memcpy(request->match_sets[i].ssid.ssid, + nla_data(ssid), nla_len(ssid)); + request->match_sets[i].ssid.ssid_len = + nla_len(ssid); + } + i++; + } + } + + if (info->attrs[NL80211_ATTR_IE]) { + request->ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + memcpy((void *)request->ie, + nla_data(info->attrs[NL80211_ATTR_IE]), + request->ie_len); + } + + request->dev = dev; + request->wiphy = &rdev->wiphy; + request->interval = interval; + + err = rdev->ops->sched_scan_start(&rdev->wiphy, dev, request); + if (!err) { + rdev->sched_scan_req = request; + nl80211_send_sched_scan(rdev, dev, + NL80211_CMD_START_SCHED_SCAN); + goto out; + } + +out_free: + kfree(request); +out: + mutex_unlock(&rdev->sched_scan_mtx); + return err; +} + +static int nl80211_stop_sched_scan(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device 
*rdev = info->user_ptr[0]; + int err; + + if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) || + !rdev->ops->sched_scan_stop) + return -EOPNOTSUPP; + + mutex_lock(&rdev->sched_scan_mtx); + err = __cfg80211_stop_sched_scan(rdev, false); + mutex_unlock(&rdev->sched_scan_mtx); + + return err; +} + +static int nl80211_send_bss(struct sk_buff *msg, u32 pid, u32 seq, int flags, + struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + struct cfg80211_internal_bss *intbss) +{ + struct cfg80211_bss *res = &intbss->pub; + void *hdr; + struct nlattr *bss; + int i; + + ASSERT_WDEV_LOCK(wdev); + + hdr = nl80211hdr_put(msg, pid, seq, flags, + NL80211_CMD_NEW_SCAN_RESULTS); + if (!hdr) + return -1; + + NLA_PUT_U32(msg, NL80211_ATTR_GENERATION, rdev->bss_generation); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex); + + bss = nla_nest_start(msg, NL80211_ATTR_BSS); + if (!bss) + goto nla_put_failure; + if (!is_zero_ether_addr(res->bssid)) + NLA_PUT(msg, NL80211_BSS_BSSID, ETH_ALEN, res->bssid); + if (res->information_elements && res->len_information_elements) + NLA_PUT(msg, NL80211_BSS_INFORMATION_ELEMENTS, + res->len_information_elements, + res->information_elements); + if (res->beacon_ies && res->len_beacon_ies && + res->beacon_ies != res->information_elements) + NLA_PUT(msg, NL80211_BSS_BEACON_IES, + res->len_beacon_ies, res->beacon_ies); + if (res->tsf) + NLA_PUT_U64(msg, NL80211_BSS_TSF, res->tsf); + if (res->beacon_interval) + NLA_PUT_U16(msg, NL80211_BSS_BEACON_INTERVAL, res->beacon_interval); + NLA_PUT_U16(msg, NL80211_BSS_CAPABILITY, res->capability); + NLA_PUT_U32(msg, NL80211_BSS_FREQUENCY, res->channel->center_freq); + NLA_PUT_U32(msg, NL80211_BSS_SEEN_MS_AGO, + jiffies_to_msecs(jiffies - intbss->ts)); + + switch (rdev->wiphy.signal_type) { + case CFG80211_SIGNAL_TYPE_MBM: + NLA_PUT_U32(msg, NL80211_BSS_SIGNAL_MBM, res->signal); + break; + case CFG80211_SIGNAL_TYPE_UNSPEC: + NLA_PUT_U8(msg, NL80211_BSS_SIGNAL_UNSPEC, res->signal); + break; + default: + break; + } + + switch (wdev->iftype) { + case NL80211_IFTYPE_P2P_CLIENT: + case NL80211_IFTYPE_STATION: + if (intbss == wdev->current_bss) + NLA_PUT_U32(msg, NL80211_BSS_STATUS, + NL80211_BSS_STATUS_ASSOCIATED); + else for (i = 0; i < MAX_AUTH_BSSES; i++) { + if (intbss != wdev->auth_bsses[i]) + continue; + NLA_PUT_U32(msg, NL80211_BSS_STATUS, + NL80211_BSS_STATUS_AUTHENTICATED); + break; + } + break; + case NL80211_IFTYPE_ADHOC: + if (intbss == wdev->current_bss) + NLA_PUT_U32(msg, NL80211_BSS_STATUS, + NL80211_BSS_STATUS_IBSS_JOINED); + break; + default: + break; + } + + nla_nest_end(msg, bss); + + return genlmsg_end(msg, hdr); + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int nl80211_dump_scan(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct cfg80211_registered_device *rdev; + struct net_device *dev; + struct cfg80211_internal_bss *scan; + struct wireless_dev *wdev; + int start = cb->args[1], idx = 0; + int err; + + err = nl80211_prepare_netdev_dump(skb, cb, &rdev, &dev); + if (err) + return err; + + wdev = dev->ieee80211_ptr; + + wdev_lock(wdev); + spin_lock_bh(&rdev->bss_lock); + cfg80211_bss_expire(rdev); + + list_for_each_entry(scan, &rdev->bss_list, list) { + if (++idx <= start) + continue; + if (nl80211_send_bss(skb, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + rdev, wdev, scan) < 0) { + idx--; + break; + } + } + + spin_unlock_bh(&rdev->bss_lock); + wdev_unlock(wdev); + + cb->args[1] = idx; + nl80211_finish_netdev_dump(rdev); + + 
return skb->len; +} + +static int nl80211_send_survey(struct sk_buff *msg, u32 pid, u32 seq, + int flags, struct net_device *dev, + struct survey_info *survey) +{ + void *hdr; + struct nlattr *infoattr; + + /* Survey without a channel doesn't make sense */ + if (!survey->channel) + return -EINVAL; + + hdr = nl80211hdr_put(msg, pid, seq, flags, + NL80211_CMD_NEW_SURVEY_RESULTS); + if (!hdr) + return -ENOMEM; + + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, dev->ifindex); + + infoattr = nla_nest_start(msg, NL80211_ATTR_SURVEY_INFO); + if (!infoattr) + goto nla_put_failure; + + NLA_PUT_U32(msg, NL80211_SURVEY_INFO_FREQUENCY, + survey->channel->center_freq); + if (survey->filled & SURVEY_INFO_NOISE_DBM) + NLA_PUT_U8(msg, NL80211_SURVEY_INFO_NOISE, + survey->noise); + if (survey->filled & SURVEY_INFO_IN_USE) + NLA_PUT_FLAG(msg, NL80211_SURVEY_INFO_IN_USE); + if (survey->filled & SURVEY_INFO_CHANNEL_TIME) + NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME, + survey->channel_time); + if (survey->filled & SURVEY_INFO_CHANNEL_TIME_BUSY) + NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_BUSY, + survey->channel_time_busy); + if (survey->filled & SURVEY_INFO_CHANNEL_TIME_EXT_BUSY) + NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_EXT_BUSY, + survey->channel_time_ext_busy); + if (survey->filled & SURVEY_INFO_CHANNEL_TIME_RX) + NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_RX, + survey->channel_time_rx); + if (survey->filled & SURVEY_INFO_CHANNEL_TIME_TX) + NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_TX, + survey->channel_time_tx); + + nla_nest_end(msg, infoattr); + + return genlmsg_end(msg, hdr); + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int nl80211_dump_survey(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct survey_info survey; + struct cfg80211_registered_device *dev; + struct net_device *netdev; + int survey_idx = cb->args[1]; + int res; + + res = nl80211_prepare_netdev_dump(skb, cb, &dev, &netdev); + if (res) + return res; + + if (!dev->ops->dump_survey) { + res = -EOPNOTSUPP; + goto out_err; + } + + while (1) { + res = dev->ops->dump_survey(&dev->wiphy, netdev, survey_idx, + &survey); + if (res == -ENOENT) + break; + if (res) + goto out_err; + + if (nl80211_send_survey(skb, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + netdev, + &survey) < 0) + goto out; + survey_idx++; + } + + out: + cb->args[1] = survey_idx; + res = skb->len; + out_err: + nl80211_finish_netdev_dump(dev); + return res; +} + +static bool nl80211_valid_auth_type(enum nl80211_auth_type auth_type) +{ + return auth_type <= NL80211_AUTHTYPE_MAX; +} + +static bool nl80211_valid_wpa_versions(u32 wpa_versions) +{ + return !(wpa_versions & ~(NL80211_WPA_VERSION_1 | + NL80211_WPA_VERSION_2)); +} + +static bool nl80211_valid_akm_suite(u32 akm) +{ + return akm == WLAN_AKM_SUITE_8021X || + akm == WLAN_AKM_SUITE_PSK; +} + +static bool nl80211_valid_cipher_suite(u32 cipher) +{ + return cipher == WLAN_CIPHER_SUITE_WEP40 || + cipher == WLAN_CIPHER_SUITE_WEP104 || + cipher == WLAN_CIPHER_SUITE_TKIP || + cipher == WLAN_CIPHER_SUITE_CCMP || + cipher == WLAN_CIPHER_SUITE_AES_CMAC; +} + + +static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct ieee80211_channel *chan; + const u8 *bssid, *ssid, *ie = NULL; + int err, ssid_len, ie_len = 0; + enum nl80211_auth_type auth_type; + struct key_parse key; + bool local_state_change; + + if 
(!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_AUTH_TYPE]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_SSID]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_WIPHY_FREQ]) + return -EINVAL; + + err = nl80211_parse_key(info, &key); + if (err) + return err; + + if (key.idx >= 0) { + if (key.type != -1 && key.type != NL80211_KEYTYPE_GROUP) + return -EINVAL; + if (!key.p.key || !key.p.key_len) + return -EINVAL; + if ((key.p.cipher != WLAN_CIPHER_SUITE_WEP40 || + key.p.key_len != WLAN_KEY_LEN_WEP40) && + (key.p.cipher != WLAN_CIPHER_SUITE_WEP104 || + key.p.key_len != WLAN_KEY_LEN_WEP104)) + return -EINVAL; + if (key.idx > 4) + return -EINVAL; + } else { + key.p.key_len = 0; + key.p.key = NULL; + } + + if (key.idx >= 0) { + int i; + bool ok = false; + for (i = 0; i < rdev->wiphy.n_cipher_suites; i++) { + if (key.p.cipher == rdev->wiphy.cipher_suites[i]) { + ok = true; + break; + } + } + if (!ok) + return -EINVAL; + } + + if (!rdev->ops->auth) + return -EOPNOTSUPP; + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) + return -EOPNOTSUPP; + + bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); + chan = ieee80211_get_channel(&rdev->wiphy, + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED)) + return -EINVAL; + + ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); + ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); + + if (info->attrs[NL80211_ATTR_IE]) { + ie = nla_data(info->attrs[NL80211_ATTR_IE]); + ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + } + + auth_type = nla_get_u32(info->attrs[NL80211_ATTR_AUTH_TYPE]); + if (!nl80211_valid_auth_type(auth_type)) + return -EINVAL; + + local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE]; + + return cfg80211_mlme_auth(rdev, dev, chan, auth_type, bssid, + ssid, ssid_len, ie, ie_len, + key.p.key, key.p.key_len, key.idx, + local_state_change); +} + +static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, + struct genl_info *info, + struct cfg80211_crypto_settings *settings, + int cipher_limit) +{ + memset(settings, 0, sizeof(*settings)); + + settings->control_port = info->attrs[NL80211_ATTR_CONTROL_PORT]; + + if (info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]) { + u16 proto; + proto = nla_get_u16( + info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]); + settings->control_port_ethertype = cpu_to_be16(proto); + if (!(rdev->wiphy.flags & WIPHY_FLAG_CONTROL_PORT_PROTOCOL) && + proto != ETH_P_PAE) + return -EINVAL; + if (info->attrs[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT]) + settings->control_port_no_encrypt = true; + } else + settings->control_port_ethertype = cpu_to_be16(ETH_P_PAE); + + if (info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]) { + void *data; + int len, i; + + data = nla_data(info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]); + len = nla_len(info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]); + settings->n_ciphers_pairwise = len / sizeof(u32); + + if (len % sizeof(u32)) + return -EINVAL; + + if (settings->n_ciphers_pairwise > cipher_limit) + return -EINVAL; + + memcpy(settings->ciphers_pairwise, data, len); + + for (i = 0; i < settings->n_ciphers_pairwise; i++) + if (!nl80211_valid_cipher_suite( + settings->ciphers_pairwise[i])) + return -EINVAL; + } + + if (info->attrs[NL80211_ATTR_CIPHER_SUITE_GROUP]) { + settings->cipher_group = + 
nla_get_u32(info->attrs[NL80211_ATTR_CIPHER_SUITE_GROUP]); + if (!nl80211_valid_cipher_suite(settings->cipher_group)) + return -EINVAL; + } + + if (info->attrs[NL80211_ATTR_WPA_VERSIONS]) { + settings->wpa_versions = + nla_get_u32(info->attrs[NL80211_ATTR_WPA_VERSIONS]); + if (!nl80211_valid_wpa_versions(settings->wpa_versions)) + return -EINVAL; + } + + if (info->attrs[NL80211_ATTR_AKM_SUITES]) { + void *data; + int len, i; + + data = nla_data(info->attrs[NL80211_ATTR_AKM_SUITES]); + len = nla_len(info->attrs[NL80211_ATTR_AKM_SUITES]); + settings->n_akm_suites = len / sizeof(u32); + + if (len % sizeof(u32)) + return -EINVAL; + + if (settings->n_akm_suites > NL80211_MAX_NR_AKM_SUITES) + return -EINVAL; + + memcpy(settings->akm_suites, data, len); + + for (i = 0; i < settings->n_akm_suites; i++) + if (!nl80211_valid_akm_suite(settings->akm_suites[i])) + return -EINVAL; + } + + return 0; +} + +static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct cfg80211_crypto_settings crypto; + struct ieee80211_channel *chan; + const u8 *bssid, *ssid, *ie = NULL, *prev_bssid = NULL; + int err, ssid_len, ie_len = 0; + bool use_mfp = false; + + if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_MAC] || + !info->attrs[NL80211_ATTR_SSID] || + !info->attrs[NL80211_ATTR_WIPHY_FREQ]) + return -EINVAL; + + if (!rdev->ops->assoc) + return -EOPNOTSUPP; + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) + return -EOPNOTSUPP; + + bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); + + chan = ieee80211_get_channel(&rdev->wiphy, + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED)) + return -EINVAL; + + ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); + ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); + + if (info->attrs[NL80211_ATTR_IE]) { + ie = nla_data(info->attrs[NL80211_ATTR_IE]); + ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + } + + if (info->attrs[NL80211_ATTR_USE_MFP]) { + enum nl80211_mfp mfp = + nla_get_u32(info->attrs[NL80211_ATTR_USE_MFP]); + if (mfp == NL80211_MFP_REQUIRED) + use_mfp = true; + else if (mfp != NL80211_MFP_NO) + return -EINVAL; + } + + if (info->attrs[NL80211_ATTR_PREV_BSSID]) + prev_bssid = nla_data(info->attrs[NL80211_ATTR_PREV_BSSID]); + + err = nl80211_crypto_settings(rdev, info, &crypto, 1); + if (!err) + err = cfg80211_mlme_assoc(rdev, dev, chan, bssid, prev_bssid, + ssid, ssid_len, ie, ie_len, use_mfp, + &crypto); + + return err; +} + +static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + const u8 *ie = NULL, *bssid; + int ie_len = 0; + u16 reason_code; + bool local_state_change; + + if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_REASON_CODE]) + return -EINVAL; + + if (!rdev->ops->deauth) + return -EOPNOTSUPP; + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) + return -EOPNOTSUPP; + + bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); + + reason_code = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]); + if (reason_code == 0) { + /* Reason 
Code 0 is reserved */ + return -EINVAL; + } + + if (info->attrs[NL80211_ATTR_IE]) { + ie = nla_data(info->attrs[NL80211_ATTR_IE]); + ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + } + + local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE]; + + return cfg80211_mlme_deauth(rdev, dev, bssid, ie, ie_len, reason_code, + local_state_change); +} + +static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + const u8 *ie = NULL, *bssid; + int ie_len = 0; + u16 reason_code; + bool local_state_change; + + if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_REASON_CODE]) + return -EINVAL; + + if (!rdev->ops->disassoc) + return -EOPNOTSUPP; + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) + return -EOPNOTSUPP; + + bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); + + reason_code = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]); + if (reason_code == 0) { + /* Reason Code 0 is reserved */ + return -EINVAL; + } + + if (info->attrs[NL80211_ATTR_IE]) { + ie = nla_data(info->attrs[NL80211_ATTR_IE]); + ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + } + + local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE]; + + return cfg80211_mlme_disassoc(rdev, dev, bssid, ie, ie_len, reason_code, + local_state_change); +} + +static bool +nl80211_parse_mcast_rate(struct cfg80211_registered_device *rdev, + int mcast_rate[IEEE80211_NUM_BANDS], + int rateval) +{ + struct wiphy *wiphy = &rdev->wiphy; + bool found = false; + int band, i; + + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + struct ieee80211_supported_band *sband; + + sband = wiphy->bands[band]; + if (!sband) + continue; + + for (i = 0; i < sband->n_bitrates; i++) { + if (sband->bitrates[i].bitrate == rateval) { + mcast_rate[band] = i + 1; + found = true; + break; + } + } + } + + return found; +} + +static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct cfg80211_ibss_params ibss; + struct wiphy *wiphy; + struct cfg80211_cached_keys *connkeys = NULL; + int err; + + memset(&ibss, 0, sizeof(ibss)); + + if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_WIPHY_FREQ] || + !info->attrs[NL80211_ATTR_SSID] || + !nla_len(info->attrs[NL80211_ATTR_SSID])) + return -EINVAL; + + ibss.beacon_interval = 100; + + if (info->attrs[NL80211_ATTR_BEACON_INTERVAL]) { + ibss.beacon_interval = + nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]); + if (ibss.beacon_interval < 1 || ibss.beacon_interval > 10000) + return -EINVAL; + } + + if (!rdev->ops->join_ibss) + return -EOPNOTSUPP; + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) + return -EOPNOTSUPP; + + wiphy = &rdev->wiphy; + + if (info->attrs[NL80211_ATTR_MAC]) + ibss.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); + ibss.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); + ibss.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); + + if (info->attrs[NL80211_ATTR_IE]) { + ibss.ie = nla_data(info->attrs[NL80211_ATTR_IE]); + ibss.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + } + + ibss.channel = ieee80211_get_channel(wiphy, + 
nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (!ibss.channel || + ibss.channel->flags & IEEE80211_CHAN_NO_IBSS || + ibss.channel->flags & IEEE80211_CHAN_DISABLED) + return -EINVAL; + + ibss.channel_fixed = !!info->attrs[NL80211_ATTR_FREQ_FIXED]; + ibss.privacy = !!info->attrs[NL80211_ATTR_PRIVACY]; + + if (info->attrs[NL80211_ATTR_BSS_BASIC_RATES]) { + u8 *rates = + nla_data(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]); + int n_rates = + nla_len(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]); + struct ieee80211_supported_band *sband = + wiphy->bands[ibss.channel->band]; + int i, j; + + if (n_rates == 0) + return -EINVAL; + + for (i = 0; i < n_rates; i++) { + int rate = (rates[i] & 0x7f) * 5; + bool found = false; + + for (j = 0; j < sband->n_bitrates; j++) { + if (sband->bitrates[j].bitrate == rate) { + found = true; + ibss.basic_rates |= BIT(j); + break; + } + } + if (!found) + return -EINVAL; + } + } + + if (info->attrs[NL80211_ATTR_MCAST_RATE] && + !nl80211_parse_mcast_rate(rdev, ibss.mcast_rate, + nla_get_u32(info->attrs[NL80211_ATTR_MCAST_RATE]))) + return -EINVAL; + + if (ibss.privacy && info->attrs[NL80211_ATTR_KEYS]) { + connkeys = nl80211_parse_connkeys(rdev, + info->attrs[NL80211_ATTR_KEYS]); + if (IS_ERR(connkeys)) + return PTR_ERR(connkeys); + } + + err = cfg80211_join_ibss(rdev, dev, &ibss, connkeys); + if (err) + kfree(connkeys); + return err; +} + +static int nl80211_leave_ibss(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + + if (!rdev->ops->leave_ibss) + return -EOPNOTSUPP; + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) + return -EOPNOTSUPP; + + return cfg80211_leave_ibss(rdev, dev, false); +} + +#ifdef CONFIG_NL80211_TESTMODE +static struct genl_multicast_group nl80211_testmode_mcgrp = { + .name = "testmode", +}; + +static int nl80211_testmode_do(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + int err; + + if (!info->attrs[NL80211_ATTR_TESTDATA]) + return -EINVAL; + + err = -EOPNOTSUPP; + if (rdev->ops->testmode_cmd) { + rdev->testmode_info = info; + err = rdev->ops->testmode_cmd(&rdev->wiphy, + nla_data(info->attrs[NL80211_ATTR_TESTDATA]), + nla_len(info->attrs[NL80211_ATTR_TESTDATA])); + rdev->testmode_info = NULL; + } + + return err; +} + +static struct sk_buff * +__cfg80211_testmode_alloc_skb(struct cfg80211_registered_device *rdev, + int approxlen, u32 pid, u32 seq, gfp_t gfp) +{ + struct sk_buff *skb; + void *hdr; + struct nlattr *data; + + skb = nlmsg_new(approxlen + 100, gfp); + if (!skb) + return NULL; + + hdr = nl80211hdr_put(skb, pid, seq, 0, NL80211_CMD_TESTMODE); + if (!hdr) { + kfree_skb(skb); + return NULL; + } + + NLA_PUT_U32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + data = nla_nest_start(skb, NL80211_ATTR_TESTDATA); + + ((void **)skb->cb)[0] = rdev; + ((void **)skb->cb)[1] = hdr; + ((void **)skb->cb)[2] = data; + + return skb; + + nla_put_failure: + kfree_skb(skb); + return NULL; +} + +struct sk_buff *cfg80211_testmode_alloc_reply_skb(struct wiphy *wiphy, + int approxlen) +{ + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + if (WARN_ON(!rdev->testmode_info)) + return NULL; + + return __cfg80211_testmode_alloc_skb(rdev, approxlen, + rdev->testmode_info->snd_pid, + rdev->testmode_info->snd_seq, + GFP_KERNEL); +} +EXPORT_SYMBOL(cfg80211_testmode_alloc_reply_skb); + +int cfg80211_testmode_reply(struct sk_buff *skb) +{ + struct 
cfg80211_registered_device *rdev = ((void **)skb->cb)[0]; + void *hdr = ((void **)skb->cb)[1]; + struct nlattr *data = ((void **)skb->cb)[2]; + + if (WARN_ON(!rdev->testmode_info)) { + kfree_skb(skb); + return -EINVAL; + } + + nla_nest_end(skb, data); + genlmsg_end(skb, hdr); + return genlmsg_reply(skb, rdev->testmode_info); +} +EXPORT_SYMBOL(cfg80211_testmode_reply); + +struct sk_buff *cfg80211_testmode_alloc_event_skb(struct wiphy *wiphy, + int approxlen, gfp_t gfp) +{ + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + return __cfg80211_testmode_alloc_skb(rdev, approxlen, 0, 0, gfp); +} +EXPORT_SYMBOL(cfg80211_testmode_alloc_event_skb); + +void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp) +{ + void *hdr = ((void **)skb->cb)[1]; + struct nlattr *data = ((void **)skb->cb)[2]; + + nla_nest_end(skb, data); + genlmsg_end(skb, hdr); + genlmsg_multicast(skb, 0, nl80211_testmode_mcgrp.id, gfp); +} +EXPORT_SYMBOL(cfg80211_testmode_event); +#endif + +static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct cfg80211_connect_params connect; + struct wiphy *wiphy; + struct cfg80211_cached_keys *connkeys = NULL; + int err; + + memset(&connect, 0, sizeof(connect)); + + if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_SSID] || + !nla_len(info->attrs[NL80211_ATTR_SSID])) + return -EINVAL; + + if (info->attrs[NL80211_ATTR_AUTH_TYPE]) { + connect.auth_type = + nla_get_u32(info->attrs[NL80211_ATTR_AUTH_TYPE]); + if (!nl80211_valid_auth_type(connect.auth_type)) + return -EINVAL; + } else + connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC; + + connect.privacy = info->attrs[NL80211_ATTR_PRIVACY]; + + err = nl80211_crypto_settings(rdev, info, &connect.crypto, + NL80211_MAX_NR_CIPHER_SUITES); + if (err) + return err; + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) + return -EOPNOTSUPP; + + wiphy = &rdev->wiphy; + + if (info->attrs[NL80211_ATTR_MAC]) + connect.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); + connect.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); + connect.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); + + if (info->attrs[NL80211_ATTR_IE]) { + connect.ie = nla_data(info->attrs[NL80211_ATTR_IE]); + connect.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + } + + if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) { + connect.channel = + ieee80211_get_channel(wiphy, + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (!connect.channel || + connect.channel->flags & IEEE80211_CHAN_DISABLED) + return -EINVAL; + } + + if (connect.privacy && info->attrs[NL80211_ATTR_KEYS]) { + connkeys = nl80211_parse_connkeys(rdev, + info->attrs[NL80211_ATTR_KEYS]); + if (IS_ERR(connkeys)) + return PTR_ERR(connkeys); + } + + err = cfg80211_connect(rdev, dev, &connect, connkeys); + if (err) + kfree(connkeys); + return err; +} + +static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + u16 reason; + + if (!info->attrs[NL80211_ATTR_REASON_CODE]) + reason = WLAN_REASON_DEAUTH_LEAVING; + else + reason = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]); + + if (reason == 0) + return -EINVAL; + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && + dev->ieee80211_ptr->iftype != 
NL80211_IFTYPE_P2P_CLIENT) + return -EOPNOTSUPP; + + return cfg80211_disconnect(rdev, dev, reason, true); +} + +static int nl80211_wiphy_netns(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net *net; + int err; + u32 pid; + + if (!info->attrs[NL80211_ATTR_PID]) + return -EINVAL; + + pid = nla_get_u32(info->attrs[NL80211_ATTR_PID]); + + net = get_net_ns_by_pid(pid); + if (IS_ERR(net)) + return PTR_ERR(net); + + err = 0; + + /* check if anything to do */ + if (!net_eq(wiphy_net(&rdev->wiphy), net)) + err = cfg80211_switch_netns(rdev, net); + + put_net(net); + return err; +} + +static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + int (*rdev_ops)(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_pmksa *pmksa) = NULL; + struct net_device *dev = info->user_ptr[1]; + struct cfg80211_pmksa pmksa; + + memset(&pmksa, 0, sizeof(struct cfg80211_pmksa)); + + if (!info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_PMKID]) + return -EINVAL; + + pmksa.pmkid = nla_data(info->attrs[NL80211_ATTR_PMKID]); + pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) + return -EOPNOTSUPP; + + switch (info->genlhdr->cmd) { + case NL80211_CMD_SET_PMKSA: + rdev_ops = rdev->ops->set_pmksa; + break; + case NL80211_CMD_DEL_PMKSA: + rdev_ops = rdev->ops->del_pmksa; + break; + default: + WARN_ON(1); + break; + } + + if (!rdev_ops) + return -EOPNOTSUPP; + + return rdev_ops(&rdev->wiphy, dev, &pmksa); +} + +static int nl80211_flush_pmksa(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) + return -EOPNOTSUPP; + + if (!rdev->ops->flush_pmksa) + return -EOPNOTSUPP; + + return rdev->ops->flush_pmksa(&rdev->wiphy, dev); +} + +static int nl80211_remain_on_channel(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct ieee80211_channel *chan; + struct sk_buff *msg; + void *hdr; + u64 cookie; + enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT; + u32 freq, duration; + int err; + + if (!info->attrs[NL80211_ATTR_WIPHY_FREQ] || + !info->attrs[NL80211_ATTR_DURATION]) + return -EINVAL; + + duration = nla_get_u32(info->attrs[NL80211_ATTR_DURATION]); + + /* + * We should be on that channel for at least one jiffie, + * and more than 5 seconds seems excessive. 
+ */ + if (!duration || !msecs_to_jiffies(duration) || + duration > rdev->wiphy.max_remain_on_channel_duration) + return -EINVAL; + + if (!rdev->ops->remain_on_channel) + return -EOPNOTSUPP; + + if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) { + channel_type = nla_get_u32( + info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]); + if (channel_type != NL80211_CHAN_NO_HT && + channel_type != NL80211_CHAN_HT20 && + channel_type != NL80211_CHAN_HT40PLUS && + channel_type != NL80211_CHAN_HT40MINUS) + return -EINVAL; + } + + freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]); + chan = rdev_freq_to_chan(rdev, freq, channel_type); + if (chan == NULL) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, + NL80211_CMD_REMAIN_ON_CHANNEL); + + if (IS_ERR(hdr)) { + err = PTR_ERR(hdr); + goto free_msg; + } + + err = rdev->ops->remain_on_channel(&rdev->wiphy, dev, chan, + channel_type, duration, &cookie); + + if (err) + goto free_msg; + + NLA_PUT_U64(msg, NL80211_ATTR_COOKIE, cookie); + + genlmsg_end(msg, hdr); + + return genlmsg_reply(msg, info); + + nla_put_failure: + err = -ENOBUFS; + free_msg: + nlmsg_free(msg); + return err; +} + +static int nl80211_cancel_remain_on_channel(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + u64 cookie; + + if (!info->attrs[NL80211_ATTR_COOKIE]) + return -EINVAL; + + if (!rdev->ops->cancel_remain_on_channel) + return -EOPNOTSUPP; + + cookie = nla_get_u64(info->attrs[NL80211_ATTR_COOKIE]); + + return rdev->ops->cancel_remain_on_channel(&rdev->wiphy, dev, cookie); +} + +static u32 rateset_to_mask(struct ieee80211_supported_band *sband, + u8 *rates, u8 rates_len) +{ + u8 i; + u32 mask = 0; + + for (i = 0; i < rates_len; i++) { + int rate = (rates[i] & 0x7f) * 5; + int ridx; + for (ridx = 0; ridx < sband->n_bitrates; ridx++) { + struct ieee80211_rate *srate = + &sband->bitrates[ridx]; + if (rate == srate->bitrate) { + mask |= 1 << ridx; + break; + } + } + if (ridx == sband->n_bitrates) + return 0; /* rate not found */ + } + + return mask; +} + +static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { + [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY, + .len = NL80211_MAX_SUPP_RATES }, +}; + +static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, + struct genl_info *info) +{ + struct nlattr *tb[NL80211_TXRATE_MAX + 1]; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct cfg80211_bitrate_mask mask; + int rem, i; + struct net_device *dev = info->user_ptr[1]; + struct nlattr *tx_rates; + struct ieee80211_supported_band *sband; + + if (info->attrs[NL80211_ATTR_TX_RATES] == NULL) + return -EINVAL; + + if (!rdev->ops->set_bitrate_mask) + return -EOPNOTSUPP; + + memset(&mask, 0, sizeof(mask)); + /* Default to all rates enabled */ + for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + sband = rdev->wiphy.bands[i]; + mask.control[i].legacy = + sband ? (1 << sband->n_bitrates) - 1 : 0; + } + + /* + * The nested attribute uses enum nl80211_band as the index. This maps + * directly to the enum ieee80211_band values used in cfg80211. 
+ */ + nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) + { + enum ieee80211_band band = nla_type(tx_rates); + if (band < 0 || band >= IEEE80211_NUM_BANDS) + return -EINVAL; + sband = rdev->wiphy.bands[band]; + if (sband == NULL) + return -EINVAL; + nla_parse(tb, NL80211_TXRATE_MAX, nla_data(tx_rates), + nla_len(tx_rates), nl80211_txattr_policy); + if (tb[NL80211_TXRATE_LEGACY]) { + mask.control[band].legacy = rateset_to_mask( + sband, + nla_data(tb[NL80211_TXRATE_LEGACY]), + nla_len(tb[NL80211_TXRATE_LEGACY])); + if (mask.control[band].legacy == 0) + return -EINVAL; + } + } + + return rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, NULL, &mask); +} + +static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + u16 frame_type = IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION; + + if (!info->attrs[NL80211_ATTR_FRAME_MATCH]) + return -EINVAL; + + if (info->attrs[NL80211_ATTR_FRAME_TYPE]) + frame_type = nla_get_u16(info->attrs[NL80211_ATTR_FRAME_TYPE]); + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) + return -EOPNOTSUPP; + + /* not much point in registering if we can't reply */ + if (!rdev->ops->mgmt_tx) + return -EOPNOTSUPP; + + return cfg80211_mlme_register_mgmt(dev->ieee80211_ptr, info->snd_pid, + frame_type, + nla_data(info->attrs[NL80211_ATTR_FRAME_MATCH]), + nla_len(info->attrs[NL80211_ATTR_FRAME_MATCH])); +} + +static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct ieee80211_channel *chan; + enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT; + bool channel_type_valid = false; + u32 freq; + int err; + void *hdr; + u64 cookie; + struct sk_buff *msg; + unsigned int wait = 0; + bool offchan; + + if (!info->attrs[NL80211_ATTR_FRAME] || + !info->attrs[NL80211_ATTR_WIPHY_FREQ]) + return -EINVAL; + + if (!rdev->ops->mgmt_tx) + return -EOPNOTSUPP; + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) + return -EOPNOTSUPP; + + if (info->attrs[NL80211_ATTR_DURATION]) { + if (!rdev->ops->mgmt_tx_cancel_wait) + return -EINVAL; + wait = nla_get_u32(info->attrs[NL80211_ATTR_DURATION]); + } + + if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) { + channel_type = nla_get_u32( + info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]); + if (channel_type != NL80211_CHAN_NO_HT && + channel_type != NL80211_CHAN_HT20 && + channel_type != NL80211_CHAN_HT40PLUS && + channel_type != NL80211_CHAN_HT40MINUS) + return -EINVAL; + channel_type_valid = true; + } + + offchan = info->attrs[NL80211_ATTR_OFFCHANNEL_TX_OK]; + + freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]); + chan = rdev_freq_to_chan(rdev, freq, 
channel_type); + if (chan == NULL) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, + NL80211_CMD_FRAME); + + if (IS_ERR(hdr)) { + err = PTR_ERR(hdr); + goto free_msg; + } + err = cfg80211_mlme_mgmt_tx(rdev, dev, chan, offchan, channel_type, + channel_type_valid, wait, + nla_data(info->attrs[NL80211_ATTR_FRAME]), + nla_len(info->attrs[NL80211_ATTR_FRAME]), + &cookie); + if (err) + goto free_msg; + + NLA_PUT_U64(msg, NL80211_ATTR_COOKIE, cookie); + + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + + nla_put_failure: + err = -ENOBUFS; + free_msg: + nlmsg_free(msg); + return err; +} + +static int nl80211_tx_mgmt_cancel_wait(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + u64 cookie; + + if (!info->attrs[NL80211_ATTR_COOKIE]) + return -EINVAL; + + if (!rdev->ops->mgmt_tx_cancel_wait) + return -EOPNOTSUPP; + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) + return -EOPNOTSUPP; + + cookie = nla_get_u64(info->attrs[NL80211_ATTR_COOKIE]); + + return rdev->ops->mgmt_tx_cancel_wait(&rdev->wiphy, dev, cookie); +} + +static int nl80211_set_power_save(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct wireless_dev *wdev; + struct net_device *dev = info->user_ptr[1]; + u8 ps_state; + bool state; + int err; + + if (!info->attrs[NL80211_ATTR_PS_STATE]) + return -EINVAL; + + ps_state = nla_get_u32(info->attrs[NL80211_ATTR_PS_STATE]); + + if (ps_state != NL80211_PS_DISABLED && ps_state != NL80211_PS_ENABLED) + return -EINVAL; + + wdev = dev->ieee80211_ptr; + + if (!rdev->ops->set_power_mgmt) + return -EOPNOTSUPP; + + state = (ps_state == NL80211_PS_ENABLED) ? 
true : false; + + if (state == wdev->ps) + return 0; + + err = rdev->ops->set_power_mgmt(wdev->wiphy, dev, state, + wdev->ps_timeout); + if (!err) + wdev->ps = state; + return err; +} + +static int nl80211_get_power_save(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + enum nl80211_ps_state ps_state; + struct wireless_dev *wdev; + struct net_device *dev = info->user_ptr[1]; + struct sk_buff *msg; + void *hdr; + int err; + + wdev = dev->ieee80211_ptr; + + if (!rdev->ops->set_power_mgmt) + return -EOPNOTSUPP; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, + NL80211_CMD_GET_POWER_SAVE); + if (!hdr) { + err = -ENOBUFS; + goto free_msg; + } + + if (wdev->ps) + ps_state = NL80211_PS_ENABLED; + else + ps_state = NL80211_PS_DISABLED; + + NLA_PUT_U32(msg, NL80211_ATTR_PS_STATE, ps_state); + + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + + nla_put_failure: + err = -ENOBUFS; + free_msg: + nlmsg_free(msg); + return err; +} + +static struct nla_policy +nl80211_attr_cqm_policy[NL80211_ATTR_CQM_MAX + 1] __read_mostly = { + [NL80211_ATTR_CQM_RSSI_THOLD] = { .type = NLA_U32 }, + [NL80211_ATTR_CQM_RSSI_HYST] = { .type = NLA_U32 }, + [NL80211_ATTR_CQM_RSSI_THRESHOLD_EVENT] = { .type = NLA_U32 }, +}; + +static int nl80211_set_cqm_rssi(struct genl_info *info, + s32 threshold, u32 hysteresis) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct wireless_dev *wdev; + struct net_device *dev = info->user_ptr[1]; + + if (threshold > 0) + return -EINVAL; + + wdev = dev->ieee80211_ptr; + + if (!rdev->ops->set_cqm_rssi_config) + return -EOPNOTSUPP; + + if (wdev->iftype != NL80211_IFTYPE_STATION && + wdev->iftype != NL80211_IFTYPE_P2P_CLIENT) + return -EOPNOTSUPP; + + return rdev->ops->set_cqm_rssi_config(wdev->wiphy, dev, + threshold, hysteresis); +} + +static int nl80211_set_cqm(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attrs[NL80211_ATTR_CQM_MAX + 1]; + struct nlattr *cqm; + int err; + + cqm = info->attrs[NL80211_ATTR_CQM]; + if (!cqm) { + err = -EINVAL; + goto out; + } + + err = nla_parse_nested(attrs, NL80211_ATTR_CQM_MAX, cqm, + nl80211_attr_cqm_policy); + if (err) + goto out; + + if (attrs[NL80211_ATTR_CQM_RSSI_THOLD] && + attrs[NL80211_ATTR_CQM_RSSI_HYST]) { + s32 threshold; + u32 hysteresis; + threshold = nla_get_u32(attrs[NL80211_ATTR_CQM_RSSI_THOLD]); + hysteresis = nla_get_u32(attrs[NL80211_ATTR_CQM_RSSI_HYST]); + err = nl80211_set_cqm_rssi(info, threshold, hysteresis); + } else + err = -EINVAL; + +out: + return err; +} + +static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct mesh_config cfg; + struct mesh_setup setup; + int err; + + /* start with default */ + memcpy(&cfg, &default_mesh_config, sizeof(cfg)); + memcpy(&setup, &default_mesh_setup, sizeof(setup)); + + if (info->attrs[NL80211_ATTR_MESH_CONFIG]) { + /* and parse parameters if given */ + err = nl80211_parse_mesh_config(info, &cfg, NULL); + if (err) + return err; + } + + if (!info->attrs[NL80211_ATTR_MESH_ID] || + !nla_len(info->attrs[NL80211_ATTR_MESH_ID])) + return -EINVAL; + + setup.mesh_id = nla_data(info->attrs[NL80211_ATTR_MESH_ID]); + setup.mesh_id_len = nla_len(info->attrs[NL80211_ATTR_MESH_ID]); + + if (info->attrs[NL80211_ATTR_MESH_SETUP]) { + /* parse additional setup parameters 
if given */ + err = nl80211_parse_mesh_setup(info, &setup); + if (err) + return err; + } + + return cfg80211_join_mesh(rdev, dev, &setup, &cfg); +} + +static int nl80211_leave_mesh(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + + return cfg80211_leave_mesh(rdev, dev); +} + +static int nl80211_get_wowlan(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct sk_buff *msg; + void *hdr; + + if (!rdev->wiphy.wowlan.flags && !rdev->wiphy.wowlan.n_patterns) + return -EOPNOTSUPP; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, + NL80211_CMD_GET_WOWLAN); + if (!hdr) + goto nla_put_failure; + + if (rdev->wowlan) { + struct nlattr *nl_wowlan; + + nl_wowlan = nla_nest_start(msg, NL80211_ATTR_WOWLAN_TRIGGERS); + if (!nl_wowlan) + goto nla_put_failure; + + if (rdev->wowlan->any) + NLA_PUT_FLAG(msg, NL80211_WOWLAN_TRIG_ANY); + if (rdev->wowlan->disconnect) + NLA_PUT_FLAG(msg, NL80211_WOWLAN_TRIG_DISCONNECT); + if (rdev->wowlan->magic_pkt) + NLA_PUT_FLAG(msg, NL80211_WOWLAN_TRIG_MAGIC_PKT); + if (rdev->wowlan->n_patterns) { + struct nlattr *nl_pats, *nl_pat; + int i, pat_len; + + nl_pats = nla_nest_start(msg, + NL80211_WOWLAN_TRIG_PKT_PATTERN); + if (!nl_pats) + goto nla_put_failure; + + for (i = 0; i < rdev->wowlan->n_patterns; i++) { + nl_pat = nla_nest_start(msg, i + 1); + if (!nl_pat) + goto nla_put_failure; + pat_len = rdev->wowlan->patterns[i].pattern_len; + NLA_PUT(msg, NL80211_WOWLAN_PKTPAT_MASK, + DIV_ROUND_UP(pat_len, 8), + rdev->wowlan->patterns[i].mask); + NLA_PUT(msg, NL80211_WOWLAN_PKTPAT_PATTERN, + pat_len, + rdev->wowlan->patterns[i].pattern); + nla_nest_end(msg, nl_pat); + } + nla_nest_end(msg, nl_pats); + } + + nla_nest_end(msg, nl_wowlan); + } + + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + +nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} + +static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct nlattr *tb[NUM_NL80211_WOWLAN_TRIG]; + struct cfg80211_wowlan no_triggers = {}; + struct cfg80211_wowlan new_triggers = {}; + struct wiphy_wowlan_support *wowlan = &rdev->wiphy.wowlan; + int err, i; + + if (!rdev->wiphy.wowlan.flags && !rdev->wiphy.wowlan.n_patterns) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS]) + goto no_triggers; + + err = nla_parse(tb, MAX_NL80211_WOWLAN_TRIG, + nla_data(info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS]), + nla_len(info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS]), + nl80211_wowlan_policy); + if (err) + return err; + + if (tb[NL80211_WOWLAN_TRIG_ANY]) { + if (!(wowlan->flags & WIPHY_WOWLAN_ANY)) + return -EINVAL; + new_triggers.any = true; + } + + if (tb[NL80211_WOWLAN_TRIG_DISCONNECT]) { + if (!(wowlan->flags & WIPHY_WOWLAN_DISCONNECT)) + return -EINVAL; + new_triggers.disconnect = true; + } + + if (tb[NL80211_WOWLAN_TRIG_MAGIC_PKT]) { + if (!(wowlan->flags & WIPHY_WOWLAN_MAGIC_PKT)) + return -EINVAL; + new_triggers.magic_pkt = true; + } + + if (tb[NL80211_WOWLAN_TRIG_PKT_PATTERN]) { + struct nlattr *pat; + int n_patterns = 0; + int rem, pat_len, mask_len; + struct nlattr *pat_tb[NUM_NL80211_WOWLAN_PKTPAT]; + + nla_for_each_nested(pat, tb[NL80211_WOWLAN_TRIG_PKT_PATTERN], + rem) + n_patterns++; + if (n_patterns > wowlan->n_patterns) + return -EINVAL; + + 
new_triggers.patterns = kcalloc(n_patterns, + sizeof(new_triggers.patterns[0]), + GFP_KERNEL); + if (!new_triggers.patterns) + return -ENOMEM; + + new_triggers.n_patterns = n_patterns; + i = 0; + + nla_for_each_nested(pat, tb[NL80211_WOWLAN_TRIG_PKT_PATTERN], + rem) { + nla_parse(pat_tb, MAX_NL80211_WOWLAN_PKTPAT, + nla_data(pat), nla_len(pat), NULL); + err = -EINVAL; + if (!pat_tb[NL80211_WOWLAN_PKTPAT_MASK] || + !pat_tb[NL80211_WOWLAN_PKTPAT_PATTERN]) + goto error; + pat_len = nla_len(pat_tb[NL80211_WOWLAN_PKTPAT_PATTERN]); + mask_len = DIV_ROUND_UP(pat_len, 8); + if (nla_len(pat_tb[NL80211_WOWLAN_PKTPAT_MASK]) != + mask_len) + goto error; + if (pat_len > wowlan->pattern_max_len || + pat_len < wowlan->pattern_min_len) + goto error; + + new_triggers.patterns[i].mask = + kmalloc(mask_len + pat_len, GFP_KERNEL); + if (!new_triggers.patterns[i].mask) { + err = -ENOMEM; + goto error; + } + new_triggers.patterns[i].pattern = + new_triggers.patterns[i].mask + mask_len; + memcpy(new_triggers.patterns[i].mask, + nla_data(pat_tb[NL80211_WOWLAN_PKTPAT_MASK]), + mask_len); + new_triggers.patterns[i].pattern_len = pat_len; + memcpy(new_triggers.patterns[i].pattern, + nla_data(pat_tb[NL80211_WOWLAN_PKTPAT_PATTERN]), + pat_len); + i++; + } + } + + if (memcmp(&new_triggers, &no_triggers, sizeof(new_triggers))) { + struct cfg80211_wowlan *ntrig; + ntrig = kmemdup(&new_triggers, sizeof(new_triggers), + GFP_KERNEL); + if (!ntrig) { + err = -ENOMEM; + goto error; + } + cfg80211_rdev_free_wowlan(rdev); + rdev->wowlan = ntrig; + } else { + no_triggers: + cfg80211_rdev_free_wowlan(rdev); + rdev->wowlan = NULL; + } + + return 0; + error: + for (i = 0; i < new_triggers.n_patterns; i++) + kfree(new_triggers.patterns[i].mask); + kfree(new_triggers.patterns); + return err; +} + +#define NL80211_FLAG_NEED_WIPHY 0x01 +#define NL80211_FLAG_NEED_NETDEV 0x02 +#define NL80211_FLAG_NEED_RTNL 0x04 +#define NL80211_FLAG_CHECK_NETDEV_UP 0x08 +#define NL80211_FLAG_NEED_NETDEV_UP (NL80211_FLAG_NEED_NETDEV |\ + NL80211_FLAG_CHECK_NETDEV_UP) + +static int nl80211_pre_doit(struct genl_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev; + struct net_device *dev; + int err; + bool rtnl = ops->internal_flags & NL80211_FLAG_NEED_RTNL; + + if (rtnl) + rtnl_lock(); + + if (ops->internal_flags & NL80211_FLAG_NEED_WIPHY) { + rdev = cfg80211_get_dev_from_info(info); + if (IS_ERR(rdev)) { + if (rtnl) + rtnl_unlock(); + return PTR_ERR(rdev); + } + info->user_ptr[0] = rdev; + } else if (ops->internal_flags & NL80211_FLAG_NEED_NETDEV) { + err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); + if (err) { + if (rtnl) + rtnl_unlock(); + return err; + } + if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP && + !netif_running(dev)) { + cfg80211_unlock_rdev(rdev); + dev_put(dev); + if (rtnl) + rtnl_unlock(); + return -ENETDOWN; + } + info->user_ptr[0] = rdev; + info->user_ptr[1] = dev; + } + + return 0; +} + +static void nl80211_post_doit(struct genl_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + if (info->user_ptr[0]) + cfg80211_unlock_rdev(info->user_ptr[0]); + if (info->user_ptr[1]) + dev_put(info->user_ptr[1]); + if (ops->internal_flags & NL80211_FLAG_NEED_RTNL) + rtnl_unlock(); +} + +static struct genl_ops nl80211_ops[] = { + { + .cmd = NL80211_CMD_GET_WIPHY, + .doit = nl80211_get_wiphy, + .dumpit = nl80211_dump_wiphy, + .policy = nl80211_policy, + /* can be retrieved by unprivileged users */ + .internal_flags = NL80211_FLAG_NEED_WIPHY, + }, + { + .cmd = 
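
Editorial aside: the NL80211_FLAG_* bits above together with nl80211_pre_doit()/nl80211_post_doit() form a small contract for every entry in the nl80211_ops table that begins here. The sketch below (hypothetical handler name, assuming the nl80211.c context) only illustrates what a ->doit callback may take for granted when its entry sets NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL.

/*
 * Editorial sketch, not part of the patch. With NL80211_FLAG_NEED_NETDEV_UP |
 * NL80211_FLAG_NEED_RTNL set, nl80211_pre_doit() has already locked the rdev,
 * taken a reference on the netdev, checked netif_running() and taken the RTNL
 * before this runs; nl80211_post_doit() releases all of it afterwards.
 */
static int nl80211_example_doit(struct sk_buff *skb, struct genl_info *info)
{
	struct cfg80211_registered_device *rdev = info->user_ptr[0];
	struct net_device *dev = info->user_ptr[1];

	if (!rdev->ops->set_power_mgmt)		/* example capability check */
		return -EOPNOTSUPP;

	/* rdev is locked, dev is held and running, RTNL is held: just work */
	return rdev->ops->set_power_mgmt(&rdev->wiphy, dev, false, 0);
}
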
NL80211_CMD_SET_WIPHY, + .doit = nl80211_set_wiphy, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_GET_INTERFACE, + .doit = nl80211_get_interface, + .dumpit = nl80211_dump_interface, + .policy = nl80211_policy, + /* can be retrieved by unprivileged users */ + .internal_flags = NL80211_FLAG_NEED_NETDEV, + }, + { + .cmd = NL80211_CMD_SET_INTERFACE, + .doit = nl80211_set_interface, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_NEW_INTERFACE, + .doit = nl80211_new_interface, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WIPHY | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_DEL_INTERFACE, + .doit = nl80211_del_interface, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_GET_KEY, + .doit = nl80211_get_key, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_SET_KEY, + .doit = nl80211_set_key, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_NEW_KEY, + .doit = nl80211_new_key, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_DEL_KEY, + .doit = nl80211_del_key, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_SET_BEACON, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .doit = nl80211_addset_beacon, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_NEW_BEACON, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .doit = nl80211_addset_beacon, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_DEL_BEACON, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .doit = nl80211_del_beacon, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_GET_STATION, + .doit = nl80211_get_station, + .dumpit = nl80211_dump_station, + .policy = nl80211_policy, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_SET_STATION, + .doit = nl80211_set_station, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_NEW_STATION, + .doit = nl80211_new_station, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_DEL_STATION, + .doit = nl80211_del_station, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_GET_MPATH, + .doit = nl80211_get_mpath, + .dumpit = nl80211_dump_mpath, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_SET_MPATH, + .doit = nl80211_set_mpath, + .policy 
= nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_NEW_MPATH, + .doit = nl80211_new_mpath, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_DEL_MPATH, + .doit = nl80211_del_mpath, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_SET_BSS, + .doit = nl80211_set_bss, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_GET_REG, + .doit = nl80211_get_reg, + .policy = nl80211_policy, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = NL80211_CMD_SET_REG, + .doit = nl80211_set_reg, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NL80211_CMD_REQ_SET_REG, + .doit = nl80211_req_set_reg, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NL80211_CMD_GET_MESH_CONFIG, + .doit = nl80211_get_mesh_config, + .policy = nl80211_policy, + /* can be retrieved by unprivileged users */ + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_SET_MESH_CONFIG, + .doit = nl80211_update_mesh_config, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_TRIGGER_SCAN, + .doit = nl80211_trigger_scan, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_GET_SCAN, + .policy = nl80211_policy, + .dumpit = nl80211_dump_scan, + }, + { + .cmd = NL80211_CMD_START_SCHED_SCAN, + .doit = nl80211_start_sched_scan, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_STOP_SCHED_SCAN, + .doit = nl80211_stop_sched_scan, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_AUTHENTICATE, + .doit = nl80211_authenticate, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_ASSOCIATE, + .doit = nl80211_associate, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_DEAUTHENTICATE, + .doit = nl80211_deauthenticate, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_DISASSOCIATE, + .doit = nl80211_disassociate, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_JOIN_IBSS, + .doit = nl80211_join_ibss, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_LEAVE_IBSS, + .doit = nl80211_leave_ibss, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, +#ifdef 
CONFIG_NL80211_TESTMODE + { + .cmd = NL80211_CMD_TESTMODE, + .doit = nl80211_testmode_do, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WIPHY | + NL80211_FLAG_NEED_RTNL, + }, +#endif + { + .cmd = NL80211_CMD_CONNECT, + .doit = nl80211_connect, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_DISCONNECT, + .doit = nl80211_disconnect, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_SET_WIPHY_NETNS, + .doit = nl80211_wiphy_netns, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WIPHY | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_GET_SURVEY, + .policy = nl80211_policy, + .dumpit = nl80211_dump_survey, + }, + { + .cmd = NL80211_CMD_SET_PMKSA, + .doit = nl80211_setdel_pmksa, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_DEL_PMKSA, + .doit = nl80211_setdel_pmksa, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_FLUSH_PMKSA, + .doit = nl80211_flush_pmksa, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_REMAIN_ON_CHANNEL, + .doit = nl80211_remain_on_channel, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL, + .doit = nl80211_cancel_remain_on_channel, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_SET_TX_BITRATE_MASK, + .doit = nl80211_set_tx_bitrate_mask, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_REGISTER_FRAME, + .doit = nl80211_register_mgmt, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_FRAME, + .doit = nl80211_tx_mgmt, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_FRAME_WAIT_CANCEL, + .doit = nl80211_tx_mgmt_cancel_wait, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_SET_POWER_SAVE, + .doit = nl80211_set_power_save, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_GET_POWER_SAVE, + .doit = nl80211_get_power_save, + .policy = nl80211_policy, + /* can be retrieved by unprivileged users */ + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_SET_CQM, + .doit = nl80211_set_cqm, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_SET_CHANNEL, + .doit = nl80211_set_channel, + 
.policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_SET_WDS_PEER, + .doit = nl80211_set_wds_peer, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_JOIN_MESH, + .doit = nl80211_join_mesh, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_LEAVE_MESH, + .doit = nl80211_leave_mesh, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_GET_WOWLAN, + .doit = nl80211_get_wowlan, + .policy = nl80211_policy, + /* can be retrieved by unprivileged users */ + .internal_flags = NL80211_FLAG_NEED_WIPHY | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_SET_WOWLAN, + .doit = nl80211_set_wowlan, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WIPHY | + NL80211_FLAG_NEED_RTNL, + }, +}; + +static struct genl_multicast_group nl80211_mlme_mcgrp = { + .name = "mlme", +}; + +/* multicast groups */ +static struct genl_multicast_group nl80211_config_mcgrp = { + .name = "config", +}; +static struct genl_multicast_group nl80211_scan_mcgrp = { + .name = "scan", +}; +static struct genl_multicast_group nl80211_regulatory_mcgrp = { + .name = "regulatory", +}; + +/* notification functions */ + +void nl80211_notify_dev_rename(struct cfg80211_registered_device *rdev) +{ + struct sk_buff *msg; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + if (nl80211_send_wiphy(msg, 0, 0, 0, rdev) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_config_mcgrp.id, GFP_KERNEL); +} + +static int nl80211_add_scan_req(struct sk_buff *msg, + struct cfg80211_registered_device *rdev) +{ + struct cfg80211_scan_request *req = rdev->scan_req; + struct nlattr *nest; + int i; + + ASSERT_RDEV_LOCK(rdev); + + if (WARN_ON(!req)) + return 0; + + nest = nla_nest_start(msg, NL80211_ATTR_SCAN_SSIDS); + if (!nest) + goto nla_put_failure; + for (i = 0; i < req->n_ssids; i++) + NLA_PUT(msg, i, req->ssids[i].ssid_len, req->ssids[i].ssid); + nla_nest_end(msg, nest); + + nest = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQUENCIES); + if (!nest) + goto nla_put_failure; + for (i = 0; i < req->n_channels; i++) + NLA_PUT_U32(msg, i, req->channels[i]->center_freq); + nla_nest_end(msg, nest); + + if (req->ie) + NLA_PUT(msg, NL80211_ATTR_IE, req->ie_len, req->ie); + + return 0; + nla_put_failure: + return -ENOBUFS; +} + +static int nl80211_send_scan_msg(struct sk_buff *msg, + struct cfg80211_registered_device *rdev, + struct net_device *netdev, + u32 pid, u32 seq, int flags, + u32 cmd) +{ + void *hdr; + + hdr = nl80211hdr_put(msg, pid, seq, flags, cmd); + if (!hdr) + return -1; + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + + /* ignore errors and send incomplete event anyway */ + nl80211_add_scan_req(msg, rdev); + + return genlmsg_end(msg, hdr); + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int +nl80211_send_sched_scan_msg(struct sk_buff *msg, + struct cfg80211_registered_device *rdev, + struct net_device *netdev, + u32 pid, u32 seq, int flags, u32 cmd) +{ + void *hdr; + + hdr = 
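
Editorial aside: the four multicast groups registered above ("config", "scan", "regulatory", "mlme") are how the notification functions that follow reach userspace. Below is a rough userspace sketch of joining the "scan" group with libnl-3; the library calls and the "nl80211" family name are assumptions of this note (the family is defined elsewhere in this file), not something shown in the excerpt.

/*
 * Editorial sketch (userspace, libnl-3): join the "scan" multicast group so
 * NL80211_CMD_TRIGGER_SCAN / NEW_SCAN_RESULTS / SCAN_ABORTED notifications
 * are delivered to this socket. Error handling trimmed.
 */
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>

static int subscribe_scan_events(struct nl_sock *sock)
{
	int grp;

	if (genl_connect(sock))
		return -1;
	grp = genl_ctrl_resolve_grp(sock, "nl80211", "scan");
	if (grp < 0)
		return grp;
	return nl_socket_add_membership(sock, grp);
}
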
nl80211hdr_put(msg, pid, seq, flags, cmd); + if (!hdr) + return -1; + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + + return genlmsg_end(msg, hdr); + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +void nl80211_send_scan_start(struct cfg80211_registered_device *rdev, + struct net_device *netdev) +{ + struct sk_buff *msg; + + msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) + return; + + if (nl80211_send_scan_msg(msg, rdev, netdev, 0, 0, 0, + NL80211_CMD_TRIGGER_SCAN) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_scan_mcgrp.id, GFP_KERNEL); +} + +void nl80211_send_scan_done(struct cfg80211_registered_device *rdev, + struct net_device *netdev) +{ + struct sk_buff *msg; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + if (nl80211_send_scan_msg(msg, rdev, netdev, 0, 0, 0, + NL80211_CMD_NEW_SCAN_RESULTS) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_scan_mcgrp.id, GFP_KERNEL); +} + +void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev, + struct net_device *netdev) +{ + struct sk_buff *msg; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + if (nl80211_send_scan_msg(msg, rdev, netdev, 0, 0, 0, + NL80211_CMD_SCAN_ABORTED) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_scan_mcgrp.id, GFP_KERNEL); +} + +void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev, + struct net_device *netdev) +{ + struct sk_buff *msg; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + if (nl80211_send_sched_scan_msg(msg, rdev, netdev, 0, 0, 0, + NL80211_CMD_SCHED_SCAN_RESULTS) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_scan_mcgrp.id, GFP_KERNEL); +} + +void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev, + struct net_device *netdev, u32 cmd) +{ + struct sk_buff *msg; + + msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) + return; + + if (nl80211_send_sched_scan_msg(msg, rdev, netdev, 0, 0, 0, cmd) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_scan_mcgrp.id, GFP_KERNEL); +} + +/* + * This can happen on global regulatory changes or device specific settings + * based on custom world regulatory domains. 
+ */ +void nl80211_send_reg_change_event(struct regulatory_request *request) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_REG_CHANGE); + if (!hdr) { + nlmsg_free(msg); + return; + } + + /* Userspace can always count this one always being set */ + NLA_PUT_U8(msg, NL80211_ATTR_REG_INITIATOR, request->initiator); + + if (request->alpha2[0] == '0' && request->alpha2[1] == '0') + NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE, + NL80211_REGDOM_TYPE_WORLD); + else if (request->alpha2[0] == '9' && request->alpha2[1] == '9') + NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE, + NL80211_REGDOM_TYPE_CUSTOM_WORLD); + else if ((request->alpha2[0] == '9' && request->alpha2[1] == '8') || + request->intersect) + NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE, + NL80211_REGDOM_TYPE_INTERSECTION); + else { + NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE, + NL80211_REGDOM_TYPE_COUNTRY); + NLA_PUT_STRING(msg, NL80211_ATTR_REG_ALPHA2, request->alpha2); + } + + if (wiphy_idx_valid(request->wiphy_idx)) + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, request->wiphy_idx); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + rcu_read_lock(); + genlmsg_multicast_allns(msg, 0, nl80211_regulatory_mcgrp.id, + GFP_ATOMIC); + rcu_read_unlock(); + + return; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + +static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + const u8 *buf, size_t len, + enum nl80211_commands cmd, gfp_t gfp) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, cmd); + if (!hdr) { + nlmsg_free(msg); + return; + } + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + NLA_PUT(msg, NL80211_ATTR_FRAME, len, buf); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_mlme_mcgrp.id, gfp); + return; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + +void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *buf, + size_t len, gfp_t gfp) +{ + nl80211_send_mlme_event(rdev, netdev, buf, len, + NL80211_CMD_AUTHENTICATE, gfp); +} + +void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *buf, + size_t len, gfp_t gfp) +{ + nl80211_send_mlme_event(rdev, netdev, buf, len, + NL80211_CMD_ASSOCIATE, gfp); +} + +void nl80211_send_deauth(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *buf, + size_t len, gfp_t gfp) +{ + nl80211_send_mlme_event(rdev, netdev, buf, len, + NL80211_CMD_DEAUTHENTICATE, gfp); +} + +void nl80211_send_disassoc(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *buf, + size_t len, gfp_t gfp) +{ + nl80211_send_mlme_event(rdev, netdev, buf, len, + NL80211_CMD_DISASSOCIATE, gfp); +} + +void nl80211_send_unprot_deauth(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *buf, + size_t len, gfp_t gfp) +{ + nl80211_send_mlme_event(rdev, netdev, buf, len, + NL80211_CMD_UNPROT_DEAUTHENTICATE, gfp); +} + +void nl80211_send_unprot_disassoc(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *buf, + size_t len, gfp_t gfp) +{ + nl80211_send_mlme_event(rdev, netdev, buf, len, 
+ NL80211_CMD_UNPROT_DISASSOCIATE, gfp); +} + +static void nl80211_send_mlme_timeout(struct cfg80211_registered_device *rdev, + struct net_device *netdev, int cmd, + const u8 *addr, gfp_t gfp) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, cmd); + if (!hdr) { + nlmsg_free(msg); + return; + } + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + NLA_PUT_FLAG(msg, NL80211_ATTR_TIMED_OUT); + NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, addr); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_mlme_mcgrp.id, gfp); + return; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + +void nl80211_send_auth_timeout(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *addr, + gfp_t gfp) +{ + nl80211_send_mlme_timeout(rdev, netdev, NL80211_CMD_AUTHENTICATE, + addr, gfp); +} + +void nl80211_send_assoc_timeout(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *addr, + gfp_t gfp) +{ + nl80211_send_mlme_timeout(rdev, netdev, NL80211_CMD_ASSOCIATE, + addr, gfp); +} + +void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *bssid, + const u8 *req_ie, size_t req_ie_len, + const u8 *resp_ie, size_t resp_ie_len, + u16 status, gfp_t gfp) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_GOODSIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_CONNECT); + if (!hdr) { + nlmsg_free(msg); + return; + } + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + if (bssid) + NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid); + NLA_PUT_U16(msg, NL80211_ATTR_STATUS_CODE, status); + if (req_ie) + NLA_PUT(msg, NL80211_ATTR_REQ_IE, req_ie_len, req_ie); + if (resp_ie) + NLA_PUT(msg, NL80211_ATTR_RESP_IE, resp_ie_len, resp_ie); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_mlme_mcgrp.id, gfp); + return; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); + +} + +void nl80211_send_roamed(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *bssid, + const u8 *req_ie, size_t req_ie_len, + const u8 *resp_ie, size_t resp_ie_len, gfp_t gfp) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_GOODSIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_ROAM); + if (!hdr) { + nlmsg_free(msg); + return; + } + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid); + if (req_ie) + NLA_PUT(msg, NL80211_ATTR_REQ_IE, req_ie_len, req_ie); + if (resp_ie) + NLA_PUT(msg, NL80211_ATTR_RESP_IE, resp_ie_len, resp_ie); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_mlme_mcgrp.id, gfp); + return; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); + +} + +void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, + struct net_device *netdev, u16 reason, + const u8 *ie, size_t ie_len, bool from_ap) +{ + struct sk_buff *msg; + void *hdr; + + msg = 
nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_DISCONNECT); + if (!hdr) { + nlmsg_free(msg); + return; + } + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + if (from_ap && reason) + NLA_PUT_U16(msg, NL80211_ATTR_REASON_CODE, reason); + if (from_ap) + NLA_PUT_FLAG(msg, NL80211_ATTR_DISCONNECTED_BY_AP); + if (ie) + NLA_PUT(msg, NL80211_ATTR_IE, ie_len, ie); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_mlme_mcgrp.id, GFP_KERNEL); + return; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); + +} + +void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *bssid, + gfp_t gfp) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_JOIN_IBSS); + if (!hdr) { + nlmsg_free(msg); + return; + } + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_mlme_mcgrp.id, gfp); + return; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + +void nl80211_send_new_peer_candidate(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + const u8 *macaddr, const u8* ie, u8 ie_len, + gfp_t gfp) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_NEW_PEER_CANDIDATE); + if (!hdr) { + nlmsg_free(msg); + return; + } + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, macaddr); + if (ie_len && ie) + NLA_PUT(msg, NL80211_ATTR_IE, ie_len , ie); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_mlme_mcgrp.id, gfp); + return; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + +void nl80211_michael_mic_failure(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *addr, + enum nl80211_key_type key_type, int key_id, + const u8 *tsc, gfp_t gfp) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_MICHAEL_MIC_FAILURE); + if (!hdr) { + nlmsg_free(msg); + return; + } + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + if (addr) + NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, addr); + NLA_PUT_U32(msg, NL80211_ATTR_KEY_TYPE, key_type); + if (key_id != -1) + NLA_PUT_U8(msg, NL80211_ATTR_KEY_IDX, key_id); + if (tsc) + NLA_PUT(msg, NL80211_ATTR_KEY_SEQ, 6, tsc); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_mlme_mcgrp.id, gfp); + return; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + +void nl80211_send_beacon_hint_event(struct wiphy *wiphy, + struct ieee80211_channel *channel_before, + struct ieee80211_channel *channel_after) 
+{ + struct sk_buff *msg; + void *hdr; + struct nlattr *nl_freq; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_REG_BEACON_HINT); + if (!hdr) { + nlmsg_free(msg); + return; + } + + /* + * Since we are applying the beacon hint to a wiphy we know its + * wiphy_idx is valid + */ + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, get_wiphy_idx(wiphy)); + + /* Before */ + nl_freq = nla_nest_start(msg, NL80211_ATTR_FREQ_BEFORE); + if (!nl_freq) + goto nla_put_failure; + if (nl80211_msg_put_channel(msg, channel_before)) + goto nla_put_failure; + nla_nest_end(msg, nl_freq); + + /* After */ + nl_freq = nla_nest_start(msg, NL80211_ATTR_FREQ_AFTER); + if (!nl_freq) + goto nla_put_failure; + if (nl80211_msg_put_channel(msg, channel_after)) + goto nla_put_failure; + nla_nest_end(msg, nl_freq); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + rcu_read_lock(); + genlmsg_multicast_allns(msg, 0, nl80211_regulatory_mcgrp.id, + GFP_ATOMIC); + rcu_read_unlock(); + + return; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + +static void nl80211_send_remain_on_chan_event( + int cmd, struct cfg80211_registered_device *rdev, + struct net_device *netdev, u64 cookie, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + unsigned int duration, gfp_t gfp) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, cmd); + if (!hdr) { + nlmsg_free(msg); + return; + } + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_FREQ, chan->center_freq); + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_CHANNEL_TYPE, channel_type); + NLA_PUT_U64(msg, NL80211_ATTR_COOKIE, cookie); + + if (cmd == NL80211_CMD_REMAIN_ON_CHANNEL) + NLA_PUT_U32(msg, NL80211_ATTR_DURATION, duration); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_mlme_mcgrp.id, gfp); + return; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + +void nl80211_send_remain_on_channel(struct cfg80211_registered_device *rdev, + struct net_device *netdev, u64 cookie, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + unsigned int duration, gfp_t gfp) +{ + nl80211_send_remain_on_chan_event(NL80211_CMD_REMAIN_ON_CHANNEL, + rdev, netdev, cookie, chan, + channel_type, duration, gfp); +} + +void nl80211_send_remain_on_channel_cancel( + struct cfg80211_registered_device *rdev, struct net_device *netdev, + u64 cookie, struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, gfp_t gfp) +{ + nl80211_send_remain_on_chan_event(NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL, + rdev, netdev, cookie, chan, + channel_type, 0, gfp); +} + +void nl80211_send_sta_event(struct cfg80211_registered_device *rdev, + struct net_device *dev, const u8 *mac_addr, + struct station_info *sinfo, gfp_t gfp) +{ + struct sk_buff *msg; + + msg = nlmsg_new(NLMSG_GOODSIZE, gfp); + if (!msg) + return; + + if (nl80211_send_station(msg, 0, 0, 0, dev, mac_addr, sinfo) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_mlme_mcgrp.id, gfp); +} + +void nl80211_send_sta_del_event(struct cfg80211_registered_device *rdev, + struct net_device *dev, const u8 *mac_addr, + gfp_t gfp) +{ + struct sk_buff *msg; 
+ void *hdr; + + msg = nlmsg_new(NLMSG_GOODSIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_DEL_STATION); + if (!hdr) { + nlmsg_free(msg); + return; + } + + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, dev->ifindex); + NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, mac_addr); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_mlme_mcgrp.id, gfp); + return; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + +int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, + struct net_device *netdev, u32 nlpid, + int freq, const u8 *buf, size_t len, gfp_t gfp) +{ + struct sk_buff *msg; + void *hdr; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return -ENOMEM; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FRAME); + if (!hdr) { + nlmsg_free(msg); + return -ENOMEM; + } + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_FREQ, freq); + NLA_PUT(msg, NL80211_ATTR_FRAME, len, buf); + + err = genlmsg_end(msg, hdr); + if (err < 0) { + nlmsg_free(msg); + return err; + } + + err = genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, nlpid); + if (err < 0) + return err; + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); + return -ENOBUFS; +} + +void nl80211_send_mgmt_tx_status(struct cfg80211_registered_device *rdev, + struct net_device *netdev, u64 cookie, + const u8 *buf, size_t len, bool ack, + gfp_t gfp) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FRAME_TX_STATUS); + if (!hdr) { + nlmsg_free(msg); + return; + } + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + NLA_PUT(msg, NL80211_ATTR_FRAME, len, buf); + NLA_PUT_U64(msg, NL80211_ATTR_COOKIE, cookie); + if (ack) + NLA_PUT_FLAG(msg, NL80211_ATTR_ACK); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast(msg, 0, nl80211_mlme_mcgrp.id, gfp); + return; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + +void +nl80211_send_cqm_rssi_notify(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + enum nl80211_cqm_rssi_threshold_event rssi_event, + gfp_t gfp) +{ + struct sk_buff *msg; + struct nlattr *pinfoattr; + void *hdr; + + msg = nlmsg_new(NLMSG_GOODSIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_NOTIFY_CQM); + if (!hdr) { + nlmsg_free(msg); + return; + } + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + + pinfoattr = nla_nest_start(msg, NL80211_ATTR_CQM); + if (!pinfoattr) + goto nla_put_failure; + + NLA_PUT_U32(msg, NL80211_ATTR_CQM_RSSI_THRESHOLD_EVENT, + rssi_event); + + nla_nest_end(msg, pinfoattr); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_mlme_mcgrp.id, gfp); + return; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + +void +nl80211_send_cqm_pktloss_notify(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *peer, + u32 num_packets, gfp_t gfp) +{ + struct sk_buff *msg; + struct nlattr *pinfoattr; + void *hdr; + + msg = nlmsg_new(NLMSG_GOODSIZE, gfp); + 
if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_NOTIFY_CQM); + if (!hdr) { + nlmsg_free(msg); + return; + } + + NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx); + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); + NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, peer); + + pinfoattr = nla_nest_start(msg, NL80211_ATTR_CQM); + if (!pinfoattr) + goto nla_put_failure; + + NLA_PUT_U32(msg, NL80211_ATTR_CQM_PKT_LOSS_EVENT, num_packets); + + nla_nest_end(msg, pinfoattr); + + if (genlmsg_end(msg, hdr) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0, + nl80211_mlme_mcgrp.id, gfp); + return; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + +static int nl80211_netlink_notify(struct notifier_block * nb, + unsigned long state, + void *_notify) +{ + struct netlink_notify *notify = _notify; + struct cfg80211_registered_device *rdev; + struct wireless_dev *wdev; + + if (state != NETLINK_URELEASE) + return NOTIFY_DONE; + + rcu_read_lock(); + + list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list) + list_for_each_entry_rcu(wdev, &rdev->netdev_list, list) + cfg80211_mlme_unregister_socket(wdev, notify->pid); + + rcu_read_unlock(); + + return NOTIFY_DONE; +} + +static struct notifier_block nl80211_netlink_notifier = { + .notifier_call = nl80211_netlink_notify, +}; + +/* initialisation/exit functions */ + +int nl80211_init(void) +{ + int err; + + err = genl_register_family_with_ops(&nl80211_fam, + nl80211_ops, ARRAY_SIZE(nl80211_ops)); + if (err) + return err; + + err = genl_register_mc_group(&nl80211_fam, &nl80211_config_mcgrp); + if (err) + goto err_out; + + err = genl_register_mc_group(&nl80211_fam, &nl80211_scan_mcgrp); + if (err) + goto err_out; + + err = genl_register_mc_group(&nl80211_fam, &nl80211_regulatory_mcgrp); + if (err) + goto err_out; + + err = genl_register_mc_group(&nl80211_fam, &nl80211_mlme_mcgrp); + if (err) + goto err_out; + +#ifdef CONFIG_NL80211_TESTMODE + err = genl_register_mc_group(&nl80211_fam, &nl80211_testmode_mcgrp); + if (err) + goto err_out; +#endif + + err = netlink_register_notifier(&nl80211_netlink_notifier); + if (err) + goto err_out; + + return 0; + err_out: + genl_unregister_family(&nl80211_fam); + return err; +} + +void nl80211_exit(void) +{ + netlink_unregister_notifier(&nl80211_netlink_notifier); + genl_unregister_family(&nl80211_fam); +} diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h new file mode 100644 index 00000000..2f1bfb87 --- /dev/null +++ b/net/wireless/nl80211.h @@ -0,0 +1,112 @@ +#ifndef __NET_WIRELESS_NL80211_H +#define __NET_WIRELESS_NL80211_H + +#include "core.h" + +int nl80211_init(void); +void nl80211_exit(void); +void nl80211_notify_dev_rename(struct cfg80211_registered_device *rdev); +void nl80211_send_scan_start(struct cfg80211_registered_device *rdev, + struct net_device *netdev); +void nl80211_send_scan_done(struct cfg80211_registered_device *rdev, + struct net_device *netdev); +void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev, + struct net_device *netdev); +void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev, + struct net_device *netdev, u32 cmd); +void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev, + struct net_device *netdev); +void nl80211_send_reg_change_event(struct regulatory_request *request); +void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + const u8 *buf, size_t len, gfp_t gfp); +void 
nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + const u8 *buf, size_t len, gfp_t gfp); +void nl80211_send_deauth(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + const u8 *buf, size_t len, gfp_t gfp); +void nl80211_send_disassoc(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + const u8 *buf, size_t len, gfp_t gfp); +void nl80211_send_unprot_deauth(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + const u8 *buf, size_t len, gfp_t gfp); +void nl80211_send_unprot_disassoc(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + const u8 *buf, size_t len, gfp_t gfp); +void nl80211_send_auth_timeout(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + const u8 *addr, gfp_t gfp); +void nl80211_send_assoc_timeout(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + const u8 *addr, gfp_t gfp); +void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *bssid, + const u8 *req_ie, size_t req_ie_len, + const u8 *resp_ie, size_t resp_ie_len, + u16 status, gfp_t gfp); +void nl80211_send_roamed(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *bssid, + const u8 *req_ie, size_t req_ie_len, + const u8 *resp_ie, size_t resp_ie_len, gfp_t gfp); +void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, + struct net_device *netdev, u16 reason, + const u8 *ie, size_t ie_len, bool from_ap); + +void nl80211_send_new_peer_candidate(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + const u8 *macaddr, const u8* ie, u8 ie_len, + gfp_t gfp); +void +nl80211_michael_mic_failure(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *addr, + enum nl80211_key_type key_type, + int key_id, const u8 *tsc, gfp_t gfp); + +void +nl80211_send_beacon_hint_event(struct wiphy *wiphy, + struct ieee80211_channel *channel_before, + struct ieee80211_channel *channel_after); + +void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *bssid, + gfp_t gfp); + +void nl80211_send_remain_on_channel(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + u64 cookie, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + unsigned int duration, gfp_t gfp); +void nl80211_send_remain_on_channel_cancel( + struct cfg80211_registered_device *rdev, struct net_device *netdev, + u64 cookie, struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, gfp_t gfp); + +void nl80211_send_sta_event(struct cfg80211_registered_device *rdev, + struct net_device *dev, const u8 *mac_addr, + struct station_info *sinfo, gfp_t gfp); +void nl80211_send_sta_del_event(struct cfg80211_registered_device *rdev, + struct net_device *dev, const u8 *mac_addr, + gfp_t gfp); + +int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, + struct net_device *netdev, u32 nlpid, int freq, + const u8 *buf, size_t len, gfp_t gfp); +void nl80211_send_mgmt_tx_status(struct cfg80211_registered_device *rdev, + struct net_device *netdev, u64 cookie, + const u8 *buf, size_t len, bool ack, + gfp_t gfp); + +void +nl80211_send_cqm_rssi_notify(struct cfg80211_registered_device *rdev, + struct net_device *netdev, + enum nl80211_cqm_rssi_threshold_event rssi_event, + gfp_t gfp); +void +nl80211_send_cqm_pktloss_notify(struct cfg80211_registered_device *rdev, + struct net_device 
*netdev, const u8 *peer, + u32 num_packets, gfp_t gfp); + +#endif /* __NET_WIRELESS_NL80211_H */ diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c new file mode 100644 index 00000000..dbe35e13 --- /dev/null +++ b/net/wireless/radiotap.c @@ -0,0 +1,357 @@ +/* + * Radiotap parser + * + * Copyright 2007 Andy Green + * Copyright 2009 Johannes Berg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Alternatively, this software may be distributed under the terms of BSD + * license. + * + * See COPYING for more details. + */ + +#include +#include +#include +#include + +/* function prototypes and related defs are in include/net/cfg80211.h */ + +static const struct radiotap_align_size rtap_namespace_sizes[] = { + [IEEE80211_RADIOTAP_TSFT] = { .align = 8, .size = 8, }, + [IEEE80211_RADIOTAP_FLAGS] = { .align = 1, .size = 1, }, + [IEEE80211_RADIOTAP_RATE] = { .align = 1, .size = 1, }, + [IEEE80211_RADIOTAP_CHANNEL] = { .align = 2, .size = 4, }, + [IEEE80211_RADIOTAP_FHSS] = { .align = 2, .size = 2, }, + [IEEE80211_RADIOTAP_DBM_ANTSIGNAL] = { .align = 1, .size = 1, }, + [IEEE80211_RADIOTAP_DBM_ANTNOISE] = { .align = 1, .size = 1, }, + [IEEE80211_RADIOTAP_LOCK_QUALITY] = { .align = 2, .size = 2, }, + [IEEE80211_RADIOTAP_TX_ATTENUATION] = { .align = 2, .size = 2, }, + [IEEE80211_RADIOTAP_DB_TX_ATTENUATION] = { .align = 2, .size = 2, }, + [IEEE80211_RADIOTAP_DBM_TX_POWER] = { .align = 1, .size = 1, }, + [IEEE80211_RADIOTAP_ANTENNA] = { .align = 1, .size = 1, }, + [IEEE80211_RADIOTAP_DB_ANTSIGNAL] = { .align = 1, .size = 1, }, + [IEEE80211_RADIOTAP_DB_ANTNOISE] = { .align = 1, .size = 1, }, + [IEEE80211_RADIOTAP_RX_FLAGS] = { .align = 2, .size = 2, }, + [IEEE80211_RADIOTAP_TX_FLAGS] = { .align = 2, .size = 2, }, + [IEEE80211_RADIOTAP_RTS_RETRIES] = { .align = 1, .size = 1, }, + [IEEE80211_RADIOTAP_DATA_RETRIES] = { .align = 1, .size = 1, }, + /* + * add more here as they are defined in radiotap.h + */ +}; + +static const struct ieee80211_radiotap_namespace radiotap_ns = { + .n_bits = ARRAY_SIZE(rtap_namespace_sizes), + .align_size = rtap_namespace_sizes, +}; + +/** + * ieee80211_radiotap_iterator_init - radiotap parser iterator initialization + * @iterator: radiotap_iterator to initialize + * @radiotap_header: radiotap header to parse + * @max_length: total length we can parse into (eg, whole packet length) + * + * Returns: 0 or a negative error code if there is a problem. + * + * This function initializes an opaque iterator struct which can then + * be passed to ieee80211_radiotap_iterator_next() to visit every radiotap + * argument which is present in the header. It knows about extended + * present headers and handles them. + * + * How to use: + * call __ieee80211_radiotap_iterator_init() to init a semi-opaque iterator + * struct ieee80211_radiotap_iterator (no need to init the struct beforehand) + * checking for a good 0 return code. Then loop calling + * __ieee80211_radiotap_iterator_next()... it returns either 0, + * -ENOENT if there are no more args to parse, or -EINVAL if there is a problem. + * The iterator's @this_arg member points to the start of the argument + * associated with the current argument index that is present, which can be + * found in the iterator's @this_arg_index member. This arg index corresponds + * to the IEEE80211_RADIOTAP_... defines. 
+ * + * Radiotap header length: + * You can find the CPU-endian total radiotap header length in + * iterator->max_length after executing ieee80211_radiotap_iterator_init() + * successfully. + * + * Alignment Gotcha: + * You must take care when dereferencing iterator.this_arg + * for multibyte types... the pointer is not aligned. Use + * get_unaligned((type *)iterator.this_arg) to dereference + * iterator.this_arg for type "type" safely on all arches. + * + * Example code: + * See Documentation/networking/radiotap-headers.txt + */ + +int ieee80211_radiotap_iterator_init( + struct ieee80211_radiotap_iterator *iterator, + struct ieee80211_radiotap_header *radiotap_header, + int max_length, const struct ieee80211_radiotap_vendor_namespaces *vns) +{ + /* Linux only supports version 0 radiotap format */ + if (radiotap_header->it_version) + return -EINVAL; + + /* sanity check for allowed length and radiotap length field */ + if (max_length < get_unaligned_le16(&radiotap_header->it_len)) + return -EINVAL; + + iterator->_rtheader = radiotap_header; + iterator->_max_length = get_unaligned_le16(&radiotap_header->it_len); + iterator->_arg_index = 0; + iterator->_bitmap_shifter = get_unaligned_le32(&radiotap_header->it_present); + iterator->_arg = (uint8_t *)radiotap_header + sizeof(*radiotap_header); + iterator->_reset_on_ext = 0; + iterator->_next_bitmap = &radiotap_header->it_present; + iterator->_next_bitmap++; + iterator->_vns = vns; + iterator->current_namespace = &radiotap_ns; + iterator->is_radiotap_ns = 1; + + /* find payload start allowing for extended bitmap(s) */ + + if (iterator->_bitmap_shifter & (1<_arg) & + (1 << IEEE80211_RADIOTAP_EXT)) { + iterator->_arg += sizeof(uint32_t); + + /* + * check for insanity where the present bitmaps + * keep claiming to extend up to or even beyond the + * stated radiotap header length + */ + + if ((unsigned long)iterator->_arg - + (unsigned long)iterator->_rtheader > + (unsigned long)iterator->_max_length) + return -EINVAL; + } + + iterator->_arg += sizeof(uint32_t); + + /* + * no need to check again for blowing past stated radiotap + * header length, because ieee80211_radiotap_iterator_next + * checks it before it is dereferenced + */ + } + + iterator->this_arg = iterator->_arg; + + /* we are all initialized happily */ + + return 0; +} +EXPORT_SYMBOL(ieee80211_radiotap_iterator_init); + +static void find_ns(struct ieee80211_radiotap_iterator *iterator, + uint32_t oui, uint8_t subns) +{ + int i; + + iterator->current_namespace = NULL; + + if (!iterator->_vns) + return; + + for (i = 0; i < iterator->_vns->n_ns; i++) { + if (iterator->_vns->ns[i].oui != oui) + continue; + if (iterator->_vns->ns[i].subns != subns) + continue; + + iterator->current_namespace = &iterator->_vns->ns[i]; + break; + } +} + + + +/** + * ieee80211_radiotap_iterator_next - return next radiotap parser iterator arg + * @iterator: radiotap_iterator to move to next arg (if any) + * + * Returns: 0 if there is an argument to handle, + * -ENOENT if there are no more args or -EINVAL + * if there is something else wrong. + * + * This function provides the next radiotap arg index (IEEE80211_RADIOTAP_*) + * in @this_arg_index and sets @this_arg to point to the + * payload for the field. It takes care of alignment handling and extended + * present fields. @this_arg can be changed by the caller (eg, + * incremented to move inside a compound argument like + * IEEE80211_RADIOTAP_CHANNEL). The args pointed to are in + * little-endian format whatever the endianess of your CPU. 
+ * + * Alignment Gotcha: + * You must take care when dereferencing iterator.this_arg + * for multibyte types... the pointer is not aligned. Use + * get_unaligned((type *)iterator.this_arg) to dereference + * iterator.this_arg for type "type" safely on all arches. + */ + +int ieee80211_radiotap_iterator_next( + struct ieee80211_radiotap_iterator *iterator) +{ + while (1) { + int hit = 0; + int pad, align, size, subns; + uint32_t oui; + + /* if no more EXT bits, that's it */ + if ((iterator->_arg_index % 32) == IEEE80211_RADIOTAP_EXT && + !(iterator->_bitmap_shifter & 1)) + return -ENOENT; + + if (!(iterator->_bitmap_shifter & 1)) + goto next_entry; /* arg not present */ + + /* get alignment/size of data */ + switch (iterator->_arg_index % 32) { + case IEEE80211_RADIOTAP_RADIOTAP_NAMESPACE: + case IEEE80211_RADIOTAP_EXT: + align = 1; + size = 0; + break; + case IEEE80211_RADIOTAP_VENDOR_NAMESPACE: + align = 2; + size = 6; + break; + default: + if (!iterator->current_namespace || + iterator->_arg_index >= iterator->current_namespace->n_bits) { + if (iterator->current_namespace == &radiotap_ns) + return -ENOENT; + align = 0; + } else { + align = iterator->current_namespace->align_size[iterator->_arg_index].align; + size = iterator->current_namespace->align_size[iterator->_arg_index].size; + } + if (!align) { + /* skip all subsequent data */ + iterator->_arg = iterator->_next_ns_data; + /* give up on this namespace */ + iterator->current_namespace = NULL; + goto next_entry; + } + break; + } + + /* + * arg is present, account for alignment padding + * + * Note that these alignments are relative to the start + * of the radiotap header. There is no guarantee + * that the radiotap header itself is aligned on any + * kind of boundary. + * + * The above is why get_unaligned() is used to dereference + * multibyte elements from the radiotap area. + */ + + pad = ((unsigned long)iterator->_arg - + (unsigned long)iterator->_rtheader) & (align - 1); + + if (pad) + iterator->_arg += align - pad; + + if (iterator->_arg_index % 32 == IEEE80211_RADIOTAP_VENDOR_NAMESPACE) { + int vnslen; + + if ((unsigned long)iterator->_arg + size - + (unsigned long)iterator->_rtheader > + (unsigned long)iterator->_max_length) + return -EINVAL; + + oui = (*iterator->_arg << 16) | + (*(iterator->_arg + 1) << 8) | + *(iterator->_arg + 2); + subns = *(iterator->_arg + 3); + + find_ns(iterator, oui, subns); + + vnslen = get_unaligned_le16(iterator->_arg + 4); + iterator->_next_ns_data = iterator->_arg + size + vnslen; + if (!iterator->current_namespace) + size += vnslen; + } + + /* + * this is what we will return to user, but we need to + * move on first so next call has something fresh to test + */ + iterator->this_arg_index = iterator->_arg_index; + iterator->this_arg = iterator->_arg; + iterator->this_arg_size = size; + + /* internally move on the size of this arg */ + iterator->_arg += size; + + /* + * check for insanity where we are given a bitmap that + * claims to have more arg content than the length of the + * radiotap section. We will normally end up equalling this + * max_length on the last arg, never exceeding it. 
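
Editorial aside: the kernel-doc above spells out the intended calling pattern for the two iterator functions. The sketch below condenses it (hypothetical caller, loosely following Documentation/networking/radiotap-headers.txt) and shows the unaligned access that the "Alignment Gotcha" warns about.

/*
 * Editorial sketch, not part of the patch: typical use of the radiotap
 * iterator defined in this file.
 */
static int example_parse_radiotap(struct ieee80211_radiotap_header *hdr,
				  int len)
{
	struct ieee80211_radiotap_iterator it;
	int ret = ieee80211_radiotap_iterator_init(&it, hdr, len, NULL);

	if (ret)
		return ret;		/* bad version or truncated header */

	while ((ret = ieee80211_radiotap_iterator_next(&it)) == 0) {
		switch (it.this_arg_index) {
		case IEEE80211_RADIOTAP_RATE:
			/* one byte, no alignment concern */
			pr_debug("rate: %u x 500 kbps\n", *it.this_arg);
			break;
		case IEEE80211_RADIOTAP_CHANNEL:
			/* multibyte and possibly unaligned */
			pr_debug("freq: %u MHz\n",
				 get_unaligned_le16(it.this_arg));
			break;
		default:
			break;
		}
	}

	return ret == -ENOENT ? 0 : ret;	/* -ENOENT: no more fields */
}
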
+ */ + + if ((unsigned long)iterator->_arg - + (unsigned long)iterator->_rtheader > + (unsigned long)iterator->_max_length) + return -EINVAL; + + /* these special ones are valid in each bitmap word */ + switch (iterator->_arg_index % 32) { + case IEEE80211_RADIOTAP_VENDOR_NAMESPACE: + iterator->_reset_on_ext = 1; + + iterator->is_radiotap_ns = 0; + /* + * If parser didn't register this vendor + * namespace with us, allow it to show it + * as 'raw. Do do that, set argument index + * to vendor namespace. + */ + iterator->this_arg_index = + IEEE80211_RADIOTAP_VENDOR_NAMESPACE; + if (!iterator->current_namespace) + hit = 1; + goto next_entry; + case IEEE80211_RADIOTAP_RADIOTAP_NAMESPACE: + iterator->_reset_on_ext = 1; + iterator->current_namespace = &radiotap_ns; + iterator->is_radiotap_ns = 1; + goto next_entry; + case IEEE80211_RADIOTAP_EXT: + /* + * bit 31 was set, there is more + * -- move to next u32 bitmap + */ + iterator->_bitmap_shifter = + get_unaligned_le32(iterator->_next_bitmap); + iterator->_next_bitmap++; + if (iterator->_reset_on_ext) + iterator->_arg_index = 0; + else + iterator->_arg_index++; + iterator->_reset_on_ext = 0; + break; + default: + /* we've got a hit! */ + hit = 1; + next_entry: + iterator->_bitmap_shifter >>= 1; + iterator->_arg_index++; + } + + /* if we found a valid arg earlier, return it now */ + if (hit) + return 0; + } +} +EXPORT_SYMBOL(ieee80211_radiotap_iterator_next); diff --git a/net/wireless/reg.c b/net/wireless/reg.c new file mode 100644 index 00000000..9bcb6bc2 --- /dev/null +++ b/net/wireless/reg.c @@ -0,0 +1,2322 @@ +/* + * Copyright 2002-2005, Instant802 Networks, Inc. + * Copyright 2005-2006, Devicescape Software, Inc. + * Copyright 2007 Johannes Berg + * Copyright 2008 Luis R. Rodriguez + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/** + * DOC: Wireless regulatory infrastructure + * + * The usual implementation is for a driver to read a device EEPROM to + * determine which regulatory domain it should be operating under, then + * looking up the allowable channels in a driver-local table and finally + * registering those channels in the wiphy structure. + * + * Another set of compliance enforcement is for drivers to use their + * own compliance limits which can be stored on the EEPROM. The host + * driver or firmware may ensure these are used. + * + * In addition to all this we provide an extra layer of regulatory + * conformance. For drivers which do not have any regulatory + * information CRDA provides the complete regulatory solution. + * For others it provides a community effort on further restrictions + * to enhance compliance. + * + * Note: When number of rules --> infinity we will not be able to + * index on alpha2 any more, instead we'll probably have to + * rely on some SHA1 checksum of the regdomain for example. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include "core.h" +#include "reg.h" +#include "regdb.h" +#include "nl80211.h" + +#ifdef CONFIG_CFG80211_REG_DEBUG +#define REG_DBG_PRINT(format, args...) \ + do { \ + printk(KERN_DEBUG pr_fmt(format), ##args); \ + } while (0) +#else +#define REG_DBG_PRINT(args...) 
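
Editorial aside: the DOC block above describes the driver side in prose only. For orientation, a driver that learns its regulatory domain from EEPROM would typically hand it to this infrastructure with regulatory_hint(); that helper belongs to cfg80211 but is declared outside this excerpt, so treat the sketch below as an assumption rather than something shown in the patch.

/*
 * Editorial sketch: feeding an EEPROM-derived ISO 3166-1 alpha2 into the
 * regulatory core. regulatory_hint() is a cfg80211 export not visible here.
 */
static int example_driver_apply_eeprom_regdom(struct wiphy *wiphy,
					      const char *eeprom_alpha2)
{
	/* e.g. eeprom_alpha2 = "US"; "00" would request world roaming */
	return regulatory_hint(wiphy, eeprom_alpha2);
}
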
+#endif + +static struct regulatory_request core_request_world = { + .initiator = NL80211_REGDOM_SET_BY_CORE, + .alpha2[0] = '0', + .alpha2[1] = '0', + .intersect = false, + .processed = true, + .country_ie_env = ENVIRON_ANY, +}; + +/* Receipt of information from last regulatory request */ +static struct regulatory_request *last_request = &core_request_world; + +/* To trigger userspace events */ +static struct platform_device *reg_pdev; + +static struct device_type reg_device_type = { + .uevent = reg_device_uevent, +}; + +/* + * Central wireless core regulatory domains, we only need two, + * the current one and a world regulatory domain in case we have no + * information to give us an alpha2 + */ +const struct ieee80211_regdomain *cfg80211_regdomain; + +/* + * Protects static reg.c components: + * - cfg80211_world_regdom + * - cfg80211_regdom + * - last_request + */ +static DEFINE_MUTEX(reg_mutex); + +static inline void assert_reg_lock(void) +{ + lockdep_assert_held(®_mutex); +} + +/* Used to queue up regulatory hints */ +static LIST_HEAD(reg_requests_list); +static spinlock_t reg_requests_lock; + +/* Used to queue up beacon hints for review */ +static LIST_HEAD(reg_pending_beacons); +static spinlock_t reg_pending_beacons_lock; + +/* Used to keep track of processed beacon hints */ +static LIST_HEAD(reg_beacon_list); + +struct reg_beacon { + struct list_head list; + struct ieee80211_channel chan; +}; + +static void reg_todo(struct work_struct *work); +static DECLARE_WORK(reg_work, reg_todo); + +static void reg_timeout_work(struct work_struct *work); +static DECLARE_DELAYED_WORK(reg_timeout, reg_timeout_work); + +/* We keep a static world regulatory domain in case of the absence of CRDA */ +static const struct ieee80211_regdomain world_regdom = { + .n_reg_rules = 5, + .alpha2 = "00", + .reg_rules = { + /* IEEE 802.11b/g, channels 1..11 */ + REG_RULE(2412-10, 2462+10, 40, 6, 20, 0), + /* IEEE 802.11b/g, channels 12..13. No HT40 + * channel fits here. 
*/ + REG_RULE(2467-10, 2472+10, 20, 6, 20, + NL80211_RRF_PASSIVE_SCAN | + NL80211_RRF_NO_IBSS), + /* IEEE 802.11 channel 14 - Only JP enables + * this and for 802.11b only */ + REG_RULE(2484-10, 2484+10, 20, 6, 20, + NL80211_RRF_PASSIVE_SCAN | + NL80211_RRF_NO_IBSS | + NL80211_RRF_NO_OFDM), + /* IEEE 802.11a, channel 36..48 */ + REG_RULE(5180-10, 5240+10, 40, 6, 20, + NL80211_RRF_PASSIVE_SCAN | + NL80211_RRF_NO_IBSS), + + /* NB: 5260 MHz - 5700 MHz requies DFS */ + + /* IEEE 802.11a, channel 149..165 */ + REG_RULE(5745-10, 5825+10, 40, 6, 20, + NL80211_RRF_PASSIVE_SCAN | + NL80211_RRF_NO_IBSS), + } +}; + +static const struct ieee80211_regdomain *cfg80211_world_regdom = + &world_regdom; + +static char *ieee80211_regdom = "00"; +static char user_alpha2[2]; + +module_param(ieee80211_regdom, charp, 0444); +MODULE_PARM_DESC(ieee80211_regdom, "IEEE 802.11 regulatory domain code"); + +static void reset_regdomains(bool full_reset) +{ + /* avoid freeing static information or freeing something twice */ + if (cfg80211_regdomain == cfg80211_world_regdom) + cfg80211_regdomain = NULL; + if (cfg80211_world_regdom == &world_regdom) + cfg80211_world_regdom = NULL; + if (cfg80211_regdomain == &world_regdom) + cfg80211_regdomain = NULL; + + kfree(cfg80211_regdomain); + kfree(cfg80211_world_regdom); + + cfg80211_world_regdom = &world_regdom; + cfg80211_regdomain = NULL; + + if (!full_reset) + return; + + if (last_request != &core_request_world) + kfree(last_request); + last_request = &core_request_world; +} + +/* + * Dynamic world regulatory domain requested by the wireless + * core upon initialization + */ +static void update_world_regdomain(const struct ieee80211_regdomain *rd) +{ + BUG_ON(!last_request); + + reset_regdomains(false); + + cfg80211_world_regdom = rd; + cfg80211_regdomain = rd; +} + +bool is_world_regdom(const char *alpha2) +{ + if (!alpha2) + return false; + if (alpha2[0] == '0' && alpha2[1] == '0') + return true; + return false; +} + +static bool is_alpha2_set(const char *alpha2) +{ + if (!alpha2) + return false; + if (alpha2[0] != 0 && alpha2[1] != 0) + return true; + return false; +} + +static bool is_unknown_alpha2(const char *alpha2) +{ + if (!alpha2) + return false; + /* + * Special case where regulatory domain was built by driver + * but a specific alpha2 cannot be determined + */ + if (alpha2[0] == '9' && alpha2[1] == '9') + return true; + return false; +} + +static bool is_intersected_alpha2(const char *alpha2) +{ + if (!alpha2) + return false; + /* + * Special case where regulatory domain is the + * result of an intersection between two regulatory domain + * structures + */ + if (alpha2[0] == '9' && alpha2[1] == '8') + return true; + return false; +} + +static bool is_an_alpha2(const char *alpha2) +{ + if (!alpha2) + return false; + if (isalpha(alpha2[0]) && isalpha(alpha2[1])) + return true; + return false; +} + +static bool alpha2_equal(const char *alpha2_x, const char *alpha2_y) +{ + if (!alpha2_x || !alpha2_y) + return false; + if (alpha2_x[0] == alpha2_y[0] && + alpha2_x[1] == alpha2_y[1]) + return true; + return false; +} + +static bool regdom_changes(const char *alpha2) +{ + assert_cfg80211_lock(); + + if (!cfg80211_regdomain) + return true; + if (alpha2_equal(cfg80211_regdomain->alpha2, alpha2)) + return false; + return true; +} + +/* + * The NL80211_REGDOM_SET_BY_USER regdom alpha2 is cached, this lets + * you know if a valid regulatory hint with NL80211_REGDOM_SET_BY_USER + * has ever been issued. 
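The regulatory code reserves a few non-country alpha2 values as sentinels: "00" for the world domain, "99" for a driver-built domain with unknown country, "98" for the result of an intersection, and "97" (used below) for "no user setting saved yet". The standalone sketch that follows only restates that convention; the enum and function names are illustrative and not part of this patch.

#include <ctype.h>
#include <stdio.h>

enum alpha2_kind {
	ALPHA2_WORLD,		/* "00": world regulatory domain */
	ALPHA2_UNKNOWN,		/* "99": driver-built, country unknown */
	ALPHA2_INTERSECTED,	/* "98": intersection of two regdomains */
	ALPHA2_UNSET,		/* "97": no user setting saved yet */
	ALPHA2_COUNTRY,		/* two letters: an ISO-3166 style code */
	ALPHA2_INVALID,
};

static enum alpha2_kind classify_alpha2(const char a[2])
{
	if (a[0] == '0' && a[1] == '0')
		return ALPHA2_WORLD;
	if (a[0] == '9' && a[1] == '9')
		return ALPHA2_UNKNOWN;
	if (a[0] == '9' && a[1] == '8')
		return ALPHA2_INTERSECTED;
	if (a[0] == '9' && a[1] == '7')
		return ALPHA2_UNSET;
	if (isalpha((unsigned char)a[0]) && isalpha((unsigned char)a[1]))
		return ALPHA2_COUNTRY;
	return ALPHA2_INVALID;
}

int main(void)
{
	printf("%d %d\n", classify_alpha2("US"), classify_alpha2("98"));
	return 0;
}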
+ */ +static bool is_user_regdom_saved(void) +{ + if (user_alpha2[0] == '9' && user_alpha2[1] == '7') + return false; + + /* This would indicate a mistake on the design */ + if (WARN((!is_world_regdom(user_alpha2) && + !is_an_alpha2(user_alpha2)), + "Unexpected user alpha2: %c%c\n", + user_alpha2[0], + user_alpha2[1])) + return false; + + return true; +} + +static int reg_copy_regd(const struct ieee80211_regdomain **dst_regd, + const struct ieee80211_regdomain *src_regd) +{ + struct ieee80211_regdomain *regd; + int size_of_regd = 0; + unsigned int i; + + size_of_regd = sizeof(struct ieee80211_regdomain) + + ((src_regd->n_reg_rules + 1) * sizeof(struct ieee80211_reg_rule)); + + regd = kzalloc(size_of_regd, GFP_KERNEL); + if (!regd) + return -ENOMEM; + + memcpy(regd, src_regd, sizeof(struct ieee80211_regdomain)); + + for (i = 0; i < src_regd->n_reg_rules; i++) + memcpy(®d->reg_rules[i], &src_regd->reg_rules[i], + sizeof(struct ieee80211_reg_rule)); + + *dst_regd = regd; + return 0; +} + +#ifdef CONFIG_CFG80211_INTERNAL_REGDB +struct reg_regdb_search_request { + char alpha2[2]; + struct list_head list; +}; + +static LIST_HEAD(reg_regdb_search_list); +static DEFINE_MUTEX(reg_regdb_search_mutex); + +static void reg_regdb_search(struct work_struct *work) +{ + struct reg_regdb_search_request *request; + const struct ieee80211_regdomain *curdom, *regdom; + int i, r; + + mutex_lock(®_regdb_search_mutex); + while (!list_empty(®_regdb_search_list)) { + request = list_first_entry(®_regdb_search_list, + struct reg_regdb_search_request, + list); + list_del(&request->list); + + for (i=0; ialpha2, curdom->alpha2, 2)) { + r = reg_copy_regd(®dom, curdom); + if (r) + break; + mutex_lock(&cfg80211_mutex); + set_regdom(regdom); + mutex_unlock(&cfg80211_mutex); + break; + } + } + + kfree(request); + } + mutex_unlock(®_regdb_search_mutex); +} + +static DECLARE_WORK(reg_regdb_work, reg_regdb_search); + +static void reg_regdb_query(const char *alpha2) +{ + struct reg_regdb_search_request *request; + + if (!alpha2) + return; + + request = kzalloc(sizeof(struct reg_regdb_search_request), GFP_KERNEL); + if (!request) + return; + + memcpy(request->alpha2, alpha2, 2); + + mutex_lock(®_regdb_search_mutex); + list_add_tail(&request->list, ®_regdb_search_list); + mutex_unlock(®_regdb_search_mutex); + + schedule_work(®_regdb_work); +} + +/* Feel free to add any other sanity checks here */ +static void reg_regdb_size_check(void) +{ + /* We should ideally BUILD_BUG_ON() but then random builds would fail */ + WARN_ONCE(!reg_regdb_size, "db.txt is empty, you should update it..."); +} +#else +static inline void reg_regdb_size_check(void) {} +static inline void reg_regdb_query(const char *alpha2) {} +#endif /* CONFIG_CFG80211_INTERNAL_REGDB */ + +/* + * This lets us keep regulatory code which is updated on a regulatory + * basis in userspace. 
Country information is filled in by + * reg_device_uevent + */ +static int call_crda(const char *alpha2) +{ + if (!is_world_regdom((char *) alpha2)) + pr_info("Calling CRDA for country: %c%c\n", + alpha2[0], alpha2[1]); + else + pr_info("Calling CRDA to update world regulatory domain\n"); + + /* query internal regulatory database (if it exists) */ + reg_regdb_query(alpha2); + + return kobject_uevent(®_pdev->dev.kobj, KOBJ_CHANGE); +} + +/* Used by nl80211 before kmalloc'ing our regulatory domain */ +bool reg_is_valid_request(const char *alpha2) +{ + assert_cfg80211_lock(); + + if (!last_request) + return false; + + return alpha2_equal(last_request->alpha2, alpha2); +} + +/* Sanity check on a regulatory rule */ +static bool is_valid_reg_rule(const struct ieee80211_reg_rule *rule) +{ + const struct ieee80211_freq_range *freq_range = &rule->freq_range; + u32 freq_diff; + + if (freq_range->start_freq_khz <= 0 || freq_range->end_freq_khz <= 0) + return false; + + if (freq_range->start_freq_khz > freq_range->end_freq_khz) + return false; + + freq_diff = freq_range->end_freq_khz - freq_range->start_freq_khz; + + if (freq_range->end_freq_khz <= freq_range->start_freq_khz || + freq_range->max_bandwidth_khz > freq_diff) + return false; + + return true; +} + +static bool is_valid_rd(const struct ieee80211_regdomain *rd) +{ + const struct ieee80211_reg_rule *reg_rule = NULL; + unsigned int i; + + if (!rd->n_reg_rules) + return false; + + if (WARN_ON(rd->n_reg_rules > NL80211_MAX_SUPP_REG_RULES)) + return false; + + for (i = 0; i < rd->n_reg_rules; i++) { + reg_rule = &rd->reg_rules[i]; + if (!is_valid_reg_rule(reg_rule)) + return false; + } + + return true; +} + +static bool reg_does_bw_fit(const struct ieee80211_freq_range *freq_range, + u32 center_freq_khz, + u32 bw_khz) +{ + u32 start_freq_khz, end_freq_khz; + + start_freq_khz = center_freq_khz - (bw_khz/2); + end_freq_khz = center_freq_khz + (bw_khz/2); + + if (start_freq_khz >= freq_range->start_freq_khz && + end_freq_khz <= freq_range->end_freq_khz) + return true; + + return false; +} + +/** + * freq_in_rule_band - tells us if a frequency is in a frequency band + * @freq_range: frequency rule we want to query + * @freq_khz: frequency we are inquiring about + * + * This lets us know if a specific frequency rule is or is not relevant to + * a specific frequency's band. Bands are device specific and artificial + * definitions (the "2.4 GHz band" and the "5 GHz band"), however it is + * safe for now to assume that a frequency rule should not be part of a + * frequency's band if the start freq or end freq are off by more than 2 GHz. + * This resolution can be lowered and should be considered as we add + * regulatory rule support for other "bands". 
+ **/ +static bool freq_in_rule_band(const struct ieee80211_freq_range *freq_range, + u32 freq_khz) +{ +#define ONE_GHZ_IN_KHZ 1000000 + if (abs(freq_khz - freq_range->start_freq_khz) <= (2 * ONE_GHZ_IN_KHZ)) + return true; + if (abs(freq_khz - freq_range->end_freq_khz) <= (2 * ONE_GHZ_IN_KHZ)) + return true; + return false; +#undef ONE_GHZ_IN_KHZ +} + +/* + * Helper for regdom_intersect(), this does the real + * mathematical intersection fun + */ +static int reg_rules_intersect( + const struct ieee80211_reg_rule *rule1, + const struct ieee80211_reg_rule *rule2, + struct ieee80211_reg_rule *intersected_rule) +{ + const struct ieee80211_freq_range *freq_range1, *freq_range2; + struct ieee80211_freq_range *freq_range; + const struct ieee80211_power_rule *power_rule1, *power_rule2; + struct ieee80211_power_rule *power_rule; + u32 freq_diff; + + freq_range1 = &rule1->freq_range; + freq_range2 = &rule2->freq_range; + freq_range = &intersected_rule->freq_range; + + power_rule1 = &rule1->power_rule; + power_rule2 = &rule2->power_rule; + power_rule = &intersected_rule->power_rule; + + freq_range->start_freq_khz = max(freq_range1->start_freq_khz, + freq_range2->start_freq_khz); + freq_range->end_freq_khz = min(freq_range1->end_freq_khz, + freq_range2->end_freq_khz); + freq_range->max_bandwidth_khz = min(freq_range1->max_bandwidth_khz, + freq_range2->max_bandwidth_khz); + + freq_diff = freq_range->end_freq_khz - freq_range->start_freq_khz; + if (freq_range->max_bandwidth_khz > freq_diff) + freq_range->max_bandwidth_khz = freq_diff; + + power_rule->max_eirp = min(power_rule1->max_eirp, + power_rule2->max_eirp); + power_rule->max_antenna_gain = min(power_rule1->max_antenna_gain, + power_rule2->max_antenna_gain); + + intersected_rule->flags = (rule1->flags | rule2->flags); + + if (!is_valid_reg_rule(intersected_rule)) + return -EINVAL; + + return 0; +} + +/** + * regdom_intersect - do the intersection between two regulatory domains + * @rd1: first regulatory domain + * @rd2: second regulatory domain + * + * Use this function to get the intersection between two regulatory domains. + * Once completed we will mark the alpha2 for the rd as intersected, "98", + * as no one single alpha2 can represent this regulatory domain. + * + * Returns a pointer to the regulatory domain structure which will hold the + * resulting intersection of rules between rd1 and rd2. We will + * kzalloc() this structure for you. + */ +static struct ieee80211_regdomain *regdom_intersect( + const struct ieee80211_regdomain *rd1, + const struct ieee80211_regdomain *rd2) +{ + int r, size_of_regd; + unsigned int x, y; + unsigned int num_rules = 0, rule_idx = 0; + const struct ieee80211_reg_rule *rule1, *rule2; + struct ieee80211_reg_rule *intersected_rule; + struct ieee80211_regdomain *rd; + /* This is just a dummy holder to help us count */ + struct ieee80211_reg_rule irule; + + /* Uses the stack temporarily for counter arithmetic */ + intersected_rule = &irule; + + memset(intersected_rule, 0, sizeof(struct ieee80211_reg_rule)); + + if (!rd1 || !rd2) + return NULL; + + /* + * First we get a count of the rules we'll need, then we actually + * build them. This is to so we can malloc() and free() a + * regdomain once. The reason we use reg_rules_intersect() here + * is it will return -EINVAL if the rule computed makes no sense. + * All rules that do check out OK are valid. 
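The count-then-fill pattern used by the intersection code, together with the frequency-range math (maximum of the start frequencies, minimum of the end frequencies, bandwidth clamped to the resulting width), can be shown with a plain interval type. This is a simplified standalone sketch, not the kernel structures; all names are illustrative.

#include <stdlib.h>
#include <stdio.h>

struct range { unsigned int start_khz, end_khz, max_bw_khz; };

/* Intersect two ranges the way reg_rules_intersect() treats the
 * frequency part; returns -1 when the result makes no sense. */
static int range_intersect(const struct range *a, const struct range *b,
			   struct range *out)
{
	out->start_khz = a->start_khz > b->start_khz ? a->start_khz : b->start_khz;
	out->end_khz = a->end_khz < b->end_khz ? a->end_khz : b->end_khz;
	if (out->start_khz >= out->end_khz)
		return -1;
	out->max_bw_khz = a->max_bw_khz < b->max_bw_khz ? a->max_bw_khz : b->max_bw_khz;
	if (out->max_bw_khz > out->end_khz - out->start_khz)
		out->max_bw_khz = out->end_khz - out->start_khz;
	return 0;
}

/* Two passes: first count the valid intersections, then allocate once
 * and fill the result directly, exactly the strategy described above. */
static struct range *intersect_all(const struct range *r1, size_t n1,
				   const struct range *r2, size_t n2,
				   size_t *n_out)
{
	struct range tmp, *out;
	size_t x, y, n = 0, idx = 0;

	for (x = 0; x < n1; x++)
		for (y = 0; y < n2; y++)
			if (!range_intersect(&r1[x], &r2[y], &tmp))
				n++;
	if (!n)
		return NULL;

	out = calloc(n, sizeof(*out));
	if (!out)
		return NULL;
	for (x = 0; x < n1; x++)
		for (y = 0; y < n2; y++)
			if (!range_intersect(&r1[x], &r2[y], &out[idx]))
				idx++;
	*n_out = n;
	return out;
}

int main(void)
{
	struct range a[] = { { 2402000, 2482000, 40000 } };
	struct range b[] = { { 2457000, 2482000, 20000 } };
	size_t n;
	struct range *r = intersect_all(a, 1, b, 1, &n);

	if (r)
		printf("%zu rule(s): %u-%u kHz @ %u kHz\n",
		       n, r[0].start_khz, r[0].end_khz, r[0].max_bw_khz);
	free(r);
	return 0;
}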
+ */ + + for (x = 0; x < rd1->n_reg_rules; x++) { + rule1 = &rd1->reg_rules[x]; + for (y = 0; y < rd2->n_reg_rules; y++) { + rule2 = &rd2->reg_rules[y]; + if (!reg_rules_intersect(rule1, rule2, + intersected_rule)) + num_rules++; + memset(intersected_rule, 0, + sizeof(struct ieee80211_reg_rule)); + } + } + + if (!num_rules) + return NULL; + + size_of_regd = sizeof(struct ieee80211_regdomain) + + ((num_rules + 1) * sizeof(struct ieee80211_reg_rule)); + + rd = kzalloc(size_of_regd, GFP_KERNEL); + if (!rd) + return NULL; + + for (x = 0; x < rd1->n_reg_rules; x++) { + rule1 = &rd1->reg_rules[x]; + for (y = 0; y < rd2->n_reg_rules; y++) { + rule2 = &rd2->reg_rules[y]; + /* + * This time around instead of using the stack lets + * write to the target rule directly saving ourselves + * a memcpy() + */ + intersected_rule = &rd->reg_rules[rule_idx]; + r = reg_rules_intersect(rule1, rule2, + intersected_rule); + /* + * No need to memset here the intersected rule here as + * we're not using the stack anymore + */ + if (r) + continue; + rule_idx++; + } + } + + if (rule_idx != num_rules) { + kfree(rd); + return NULL; + } + + rd->n_reg_rules = num_rules; + rd->alpha2[0] = '9'; + rd->alpha2[1] = '8'; + + return rd; +} + +/* + * XXX: add support for the rest of enum nl80211_reg_rule_flags, we may + * want to just have the channel structure use these + */ +static u32 map_regdom_flags(u32 rd_flags) +{ + u32 channel_flags = 0; + if (rd_flags & NL80211_RRF_PASSIVE_SCAN) + channel_flags |= IEEE80211_CHAN_PASSIVE_SCAN; + if (rd_flags & NL80211_RRF_NO_IBSS) + channel_flags |= IEEE80211_CHAN_NO_IBSS; + if (rd_flags & NL80211_RRF_DFS) + channel_flags |= IEEE80211_CHAN_RADAR; + return channel_flags; +} + +static int freq_reg_info_regd(struct wiphy *wiphy, + u32 center_freq, + u32 desired_bw_khz, + const struct ieee80211_reg_rule **reg_rule, + const struct ieee80211_regdomain *custom_regd) +{ + int i; + bool band_rule_found = false; + const struct ieee80211_regdomain *regd; + bool bw_fits = false; + + if (!desired_bw_khz) + desired_bw_khz = MHZ_TO_KHZ(20); + + regd = custom_regd ? 
custom_regd : cfg80211_regdomain; + + /* + * Follow the driver's regulatory domain, if present, unless a country + * IE has been processed or a user wants to help complaince further + */ + if (!custom_regd && + last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && + last_request->initiator != NL80211_REGDOM_SET_BY_USER && + wiphy->regd) + regd = wiphy->regd; + + if (!regd) + return -EINVAL; + + for (i = 0; i < regd->n_reg_rules; i++) { + const struct ieee80211_reg_rule *rr; + const struct ieee80211_freq_range *fr = NULL; + + rr = ®d->reg_rules[i]; + fr = &rr->freq_range; + + /* + * We only need to know if one frequency rule was + * was in center_freq's band, that's enough, so lets + * not overwrite it once found + */ + if (!band_rule_found) + band_rule_found = freq_in_rule_band(fr, center_freq); + + bw_fits = reg_does_bw_fit(fr, + center_freq, + desired_bw_khz); + + if (band_rule_found && bw_fits) { + *reg_rule = rr; + return 0; + } + } + + if (!band_rule_found) + return -ERANGE; + + return -EINVAL; +} + +int freq_reg_info(struct wiphy *wiphy, + u32 center_freq, + u32 desired_bw_khz, + const struct ieee80211_reg_rule **reg_rule) +{ + assert_cfg80211_lock(); + return freq_reg_info_regd(wiphy, + center_freq, + desired_bw_khz, + reg_rule, + NULL); +} +EXPORT_SYMBOL(freq_reg_info); + +#ifdef CONFIG_CFG80211_REG_DEBUG +static const char *reg_initiator_name(enum nl80211_reg_initiator initiator) +{ + switch (initiator) { + case NL80211_REGDOM_SET_BY_CORE: + return "Set by core"; + case NL80211_REGDOM_SET_BY_USER: + return "Set by user"; + case NL80211_REGDOM_SET_BY_DRIVER: + return "Set by driver"; + case NL80211_REGDOM_SET_BY_COUNTRY_IE: + return "Set by country IE"; + default: + WARN_ON(1); + return "Set by bug"; + } +} + +static void chan_reg_rule_print_dbg(struct ieee80211_channel *chan, + u32 desired_bw_khz, + const struct ieee80211_reg_rule *reg_rule) +{ + const struct ieee80211_power_rule *power_rule; + const struct ieee80211_freq_range *freq_range; + char max_antenna_gain[32]; + + power_rule = ®_rule->power_rule; + freq_range = ®_rule->freq_range; + + if (!power_rule->max_antenna_gain) + snprintf(max_antenna_gain, 32, "N/A"); + else + snprintf(max_antenna_gain, 32, "%d", power_rule->max_antenna_gain); + + REG_DBG_PRINT("Updating information on frequency %d MHz " + "for a %d MHz width channel with regulatory rule:\n", + chan->center_freq, + KHZ_TO_MHZ(desired_bw_khz)); + + REG_DBG_PRINT("%d KHz - %d KHz @ KHz), (%s mBi, %d mBm)\n", + freq_range->start_freq_khz, + freq_range->end_freq_khz, + max_antenna_gain, + power_rule->max_eirp); +} +#else +static void chan_reg_rule_print_dbg(struct ieee80211_channel *chan, + u32 desired_bw_khz, + const struct ieee80211_reg_rule *reg_rule) +{ + return; +} +#endif + +/* + * Note that right now we assume the desired channel bandwidth + * is always 20 MHz for each individual channel (HT40 uses 20 MHz + * per channel, the primary and the extension channel). To support + * smaller custom bandwidths such as 5 MHz or 10 MHz we'll need a + * new ieee80211_channel.target_bw and re run the regulatory check + * on the wiphy with the target_bw specified. Then we can simply use + * that below for the desired_bw_khz below. 
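Assuming the 20 MHz target bandwidth discussed above, the question reg_does_bw_fit() answers is simply whether center ± bw/2 stays inside the rule's frequency range, with everything carried in kHz. The sketch below is a standalone restatement of that check; the macro names mirror the ones used in the patch but the rest is illustrative.

#include <stdbool.h>
#include <stdio.h>

#define MHZ_TO_KHZ(f)	((f) * 1000)
#define KHZ_TO_MHZ(f)	((f) / 1000)

struct freq_range { unsigned int start_khz, end_khz; };

/* True when a channel of width bw_khz centred on center_khz lies
 * entirely inside the rule's frequency range. */
static bool bw_fits(const struct freq_range *r,
		    unsigned int center_khz, unsigned int bw_khz)
{
	unsigned int start = center_khz - bw_khz / 2;
	unsigned int end = center_khz + bw_khz / 2;

	return start >= r->start_khz && end <= r->end_khz;
}

int main(void)
{
	/* e.g. the 5745-5825 MHz rule from the world regdomain above */
	struct freq_range rule = { MHZ_TO_KHZ(5735), MHZ_TO_KHZ(5835) };

	printf("5745 MHz, 20 MHz wide: %s\n",
	       bw_fits(&rule, MHZ_TO_KHZ(5745), MHZ_TO_KHZ(20)) ? "fits" : "no");
	printf("5745 MHz, 40 MHz wide: %s\n",
	       bw_fits(&rule, MHZ_TO_KHZ(5745), MHZ_TO_KHZ(40)) ? "fits" : "no");
	return 0;
}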
+ */ +static void handle_channel(struct wiphy *wiphy, + enum nl80211_reg_initiator initiator, + enum ieee80211_band band, + unsigned int chan_idx) +{ + int r; + u32 flags, bw_flags = 0; + u32 desired_bw_khz = MHZ_TO_KHZ(20); + const struct ieee80211_reg_rule *reg_rule = NULL; + const struct ieee80211_power_rule *power_rule = NULL; + const struct ieee80211_freq_range *freq_range = NULL; + struct ieee80211_supported_band *sband; + struct ieee80211_channel *chan; + struct wiphy *request_wiphy = NULL; + + assert_cfg80211_lock(); + + request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx); + + sband = wiphy->bands[band]; + BUG_ON(chan_idx >= sband->n_channels); + chan = &sband->channels[chan_idx]; + + flags = chan->orig_flags; + + r = freq_reg_info(wiphy, + MHZ_TO_KHZ(chan->center_freq), + desired_bw_khz, + ®_rule); + + if (r) { + /* + * We will disable all channels that do not match our + * received regulatory rule unless the hint is coming + * from a Country IE and the Country IE had no information + * about a band. The IEEE 802.11 spec allows for an AP + * to send only a subset of the regulatory rules allowed, + * so an AP in the US that only supports 2.4 GHz may only send + * a country IE with information for the 2.4 GHz band + * while 5 GHz is still supported. + */ + if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE && + r == -ERANGE) + return; + + REG_DBG_PRINT("Disabling freq %d MHz\n", chan->center_freq); + chan->flags = IEEE80211_CHAN_DISABLED; + return; + } + + chan_reg_rule_print_dbg(chan, desired_bw_khz, reg_rule); + + power_rule = ®_rule->power_rule; + freq_range = ®_rule->freq_range; + + if (freq_range->max_bandwidth_khz < MHZ_TO_KHZ(40)) + bw_flags = IEEE80211_CHAN_NO_HT40; + + if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER && + request_wiphy && request_wiphy == wiphy && + request_wiphy->flags & WIPHY_FLAG_STRICT_REGULATORY) { + /* + * This guarantees the driver's requested regulatory domain + * will always be used as a base for further regulatory + * settings + */ + chan->flags = chan->orig_flags = + map_regdom_flags(reg_rule->flags) | bw_flags; + chan->max_antenna_gain = chan->orig_mag = + (int) MBI_TO_DBI(power_rule->max_antenna_gain); + chan->max_power = chan->orig_mpwr = + (int) MBM_TO_DBM(power_rule->max_eirp); + return; + } + + chan->beacon_found = false; + chan->flags = flags | bw_flags | map_regdom_flags(reg_rule->flags); + chan->max_antenna_gain = min(chan->orig_mag, + (int) MBI_TO_DBI(power_rule->max_antenna_gain)); + if (chan->orig_mpwr) + chan->max_power = min(chan->orig_mpwr, + (int) MBM_TO_DBM(power_rule->max_eirp)); + else + chan->max_power = (int) MBM_TO_DBM(power_rule->max_eirp); +} + +static void handle_band(struct wiphy *wiphy, + enum ieee80211_band band, + enum nl80211_reg_initiator initiator) +{ + unsigned int i; + struct ieee80211_supported_band *sband; + + BUG_ON(!wiphy->bands[band]); + sband = wiphy->bands[band]; + + for (i = 0; i < sband->n_channels; i++) + handle_channel(wiphy, initiator, band, i); +} + +static bool ignore_reg_update(struct wiphy *wiphy, + enum nl80211_reg_initiator initiator) +{ + if (!last_request) { + REG_DBG_PRINT("Ignoring regulatory request %s since " + "last_request is not set\n", + reg_initiator_name(initiator)); + return true; + } + + if (initiator == NL80211_REGDOM_SET_BY_CORE && + wiphy->flags & WIPHY_FLAG_CUSTOM_REGULATORY) { + REG_DBG_PRINT("Ignoring regulatory request %s " + "since the driver uses its own custom " + "regulatory domain ", + reg_initiator_name(initiator)); + return true; + } + + /* + * 
wiphy->regd will be set once the device has its own + * desired regulatory domain set + */ + if (wiphy->flags & WIPHY_FLAG_STRICT_REGULATORY && !wiphy->regd && + initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && + !is_world_regdom(last_request->alpha2)) { + REG_DBG_PRINT("Ignoring regulatory request %s " + "since the driver requires its own regulaotry " + "domain to be set first", + reg_initiator_name(initiator)); + return true; + } + + return false; +} + +static void update_all_wiphy_regulatory(enum nl80211_reg_initiator initiator) +{ + struct cfg80211_registered_device *rdev; + + list_for_each_entry(rdev, &cfg80211_rdev_list, list) + wiphy_update_regulatory(&rdev->wiphy, initiator); +} + +static void handle_reg_beacon(struct wiphy *wiphy, + unsigned int chan_idx, + struct reg_beacon *reg_beacon) +{ + struct ieee80211_supported_band *sband; + struct ieee80211_channel *chan; + bool channel_changed = false; + struct ieee80211_channel chan_before; + + assert_cfg80211_lock(); + + sband = wiphy->bands[reg_beacon->chan.band]; + chan = &sband->channels[chan_idx]; + + if (likely(chan->center_freq != reg_beacon->chan.center_freq)) + return; + + if (chan->beacon_found) + return; + + chan->beacon_found = true; + + if (wiphy->flags & WIPHY_FLAG_DISABLE_BEACON_HINTS) + return; + + chan_before.center_freq = chan->center_freq; + chan_before.flags = chan->flags; + + if (chan->flags & IEEE80211_CHAN_PASSIVE_SCAN) { + chan->flags &= ~IEEE80211_CHAN_PASSIVE_SCAN; + channel_changed = true; + } + + if (chan->flags & IEEE80211_CHAN_NO_IBSS) { + chan->flags &= ~IEEE80211_CHAN_NO_IBSS; + channel_changed = true; + } + + if (channel_changed) + nl80211_send_beacon_hint_event(wiphy, &chan_before, chan); +} + +/* + * Called when a scan on a wiphy finds a beacon on + * new channel + */ +static void wiphy_update_new_beacon(struct wiphy *wiphy, + struct reg_beacon *reg_beacon) +{ + unsigned int i; + struct ieee80211_supported_band *sband; + + assert_cfg80211_lock(); + + if (!wiphy->bands[reg_beacon->chan.band]) + return; + + sband = wiphy->bands[reg_beacon->chan.band]; + + for (i = 0; i < sband->n_channels; i++) + handle_reg_beacon(wiphy, i, reg_beacon); +} + +/* + * Called upon reg changes or a new wiphy is added + */ +static void wiphy_update_beacon_reg(struct wiphy *wiphy) +{ + unsigned int i; + struct ieee80211_supported_band *sband; + struct reg_beacon *reg_beacon; + + assert_cfg80211_lock(); + + if (list_empty(®_beacon_list)) + return; + + list_for_each_entry(reg_beacon, ®_beacon_list, list) { + if (!wiphy->bands[reg_beacon->chan.band]) + continue; + sband = wiphy->bands[reg_beacon->chan.band]; + for (i = 0; i < sband->n_channels; i++) + handle_reg_beacon(wiphy, i, reg_beacon); + } +} + +static bool reg_is_world_roaming(struct wiphy *wiphy) +{ + if (is_world_regdom(cfg80211_regdomain->alpha2) || + (wiphy->regd && is_world_regdom(wiphy->regd->alpha2))) + return true; + if (last_request && + last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && + wiphy->flags & WIPHY_FLAG_CUSTOM_REGULATORY) + return true; + return false; +} + +/* Reap the advantages of previously found beacons */ +static void reg_process_beacons(struct wiphy *wiphy) +{ + /* + * Means we are just firing up cfg80211, so no beacons would + * have been processed yet. 
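What a beacon hint actually does to a channel is small: once a beacon from an AP has been seen there, the passive-scan and no-IBSS restrictions are lifted (unless the wiphy opted out of beacon hints) and an event is sent only if something actually changed. A standalone model of that flag manipulation follows; the flag values are illustrative, not the real cfg80211 constants.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative flag bits; the real values live in the cfg80211 headers. */
#define CHAN_PASSIVE_SCAN	0x1
#define CHAN_NO_IBSS		0x2
#define CHAN_RADAR		0x4

/* Apply a "beacon seen here" hint; returns true when the flags changed
 * and a notification event would therefore be sent. */
static bool apply_beacon_hint(unsigned int *flags)
{
	unsigned int before = *flags;

	*flags &= ~(unsigned int)(CHAN_PASSIVE_SCAN | CHAN_NO_IBSS);
	return *flags != before;
}

int main(void)
{
	unsigned int ch12 = CHAN_PASSIVE_SCAN | CHAN_NO_IBSS;
	unsigned int ch36 = CHAN_RADAR;

	printf("ch12 changed=%d flags=0x%x\n", apply_beacon_hint(&ch12), ch12);
	printf("ch36 changed=%d flags=0x%x\n", apply_beacon_hint(&ch36), ch36);
	return 0;
}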
+ */ + if (!last_request) + return; + if (!reg_is_world_roaming(wiphy)) + return; + wiphy_update_beacon_reg(wiphy); +} + +static bool is_ht40_not_allowed(struct ieee80211_channel *chan) +{ + if (!chan) + return true; + if (chan->flags & IEEE80211_CHAN_DISABLED) + return true; + /* This would happen when regulatory rules disallow HT40 completely */ + if (IEEE80211_CHAN_NO_HT40 == (chan->flags & (IEEE80211_CHAN_NO_HT40))) + return true; + return false; +} + +static void reg_process_ht_flags_channel(struct wiphy *wiphy, + enum ieee80211_band band, + unsigned int chan_idx) +{ + struct ieee80211_supported_band *sband; + struct ieee80211_channel *channel; + struct ieee80211_channel *channel_before = NULL, *channel_after = NULL; + unsigned int i; + + assert_cfg80211_lock(); + + sband = wiphy->bands[band]; + BUG_ON(chan_idx >= sband->n_channels); + channel = &sband->channels[chan_idx]; + + if (is_ht40_not_allowed(channel)) { + channel->flags |= IEEE80211_CHAN_NO_HT40; + return; + } + + /* + * We need to ensure the extension channels exist to + * be able to use HT40- or HT40+, this finds them (or not) + */ + for (i = 0; i < sband->n_channels; i++) { + struct ieee80211_channel *c = &sband->channels[i]; + if (c->center_freq == (channel->center_freq - 20)) + channel_before = c; + if (c->center_freq == (channel->center_freq + 20)) + channel_after = c; + } + + /* + * Please note that this assumes target bandwidth is 20 MHz, + * if that ever changes we also need to change the below logic + * to include that as well. + */ + if (is_ht40_not_allowed(channel_before)) + channel->flags |= IEEE80211_CHAN_NO_HT40MINUS; + else + channel->flags &= ~IEEE80211_CHAN_NO_HT40MINUS; + + if (is_ht40_not_allowed(channel_after)) + channel->flags |= IEEE80211_CHAN_NO_HT40PLUS; + else + channel->flags &= ~IEEE80211_CHAN_NO_HT40PLUS; +} + +static void reg_process_ht_flags_band(struct wiphy *wiphy, + enum ieee80211_band band) +{ + unsigned int i; + struct ieee80211_supported_band *sband; + + BUG_ON(!wiphy->bands[band]); + sband = wiphy->bands[band]; + + for (i = 0; i < sband->n_channels; i++) + reg_process_ht_flags_channel(wiphy, band, i); +} + +static void reg_process_ht_flags(struct wiphy *wiphy) +{ + enum ieee80211_band band; + + if (!wiphy) + return; + + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + if (wiphy->bands[band]) + reg_process_ht_flags_band(wiphy, band); + } + +} + +void wiphy_update_regulatory(struct wiphy *wiphy, + enum nl80211_reg_initiator initiator) +{ + enum ieee80211_band band; + + if (ignore_reg_update(wiphy, initiator)) + return; + + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + if (wiphy->bands[band]) + handle_band(wiphy, band, initiator); + } + + reg_process_beacons(wiphy); + reg_process_ht_flags(wiphy); + if (wiphy->reg_notifier) + wiphy->reg_notifier(wiphy, last_request); +} + +static void handle_channel_custom(struct wiphy *wiphy, + enum ieee80211_band band, + unsigned int chan_idx, + const struct ieee80211_regdomain *regd) +{ + int r; + u32 desired_bw_khz = MHZ_TO_KHZ(20); + u32 bw_flags = 0; + const struct ieee80211_reg_rule *reg_rule = NULL; + const struct ieee80211_power_rule *power_rule = NULL; + const struct ieee80211_freq_range *freq_range = NULL; + struct ieee80211_supported_band *sband; + struct ieee80211_channel *chan; + + assert_reg_lock(); + + sband = wiphy->bands[band]; + BUG_ON(chan_idx >= sband->n_channels); + chan = &sband->channels[chan_idx]; + + r = freq_reg_info_regd(wiphy, + MHZ_TO_KHZ(chan->center_freq), + desired_bw_khz, + ®_rule, + regd); + + if (r) { + 
REG_DBG_PRINT("Disabling freq %d MHz as custom " + "regd has no rule that fits a %d MHz " + "wide channel\n", + chan->center_freq, + KHZ_TO_MHZ(desired_bw_khz)); + chan->flags = IEEE80211_CHAN_DISABLED; + return; + } + + chan_reg_rule_print_dbg(chan, desired_bw_khz, reg_rule); + + power_rule = ®_rule->power_rule; + freq_range = ®_rule->freq_range; + + if (freq_range->max_bandwidth_khz < MHZ_TO_KHZ(40)) + bw_flags = IEEE80211_CHAN_NO_HT40; + + chan->flags |= map_regdom_flags(reg_rule->flags) | bw_flags; + chan->max_antenna_gain = (int) MBI_TO_DBI(power_rule->max_antenna_gain); + chan->max_power = (int) MBM_TO_DBM(power_rule->max_eirp); +} + +static void handle_band_custom(struct wiphy *wiphy, enum ieee80211_band band, + const struct ieee80211_regdomain *regd) +{ + unsigned int i; + struct ieee80211_supported_band *sband; + + BUG_ON(!wiphy->bands[band]); + sband = wiphy->bands[band]; + + for (i = 0; i < sband->n_channels; i++) + handle_channel_custom(wiphy, band, i, regd); +} + +/* Used by drivers prior to wiphy registration */ +void wiphy_apply_custom_regulatory(struct wiphy *wiphy, + const struct ieee80211_regdomain *regd) +{ + enum ieee80211_band band; + unsigned int bands_set = 0; + + mutex_lock(®_mutex); + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + if (!wiphy->bands[band]) + continue; + handle_band_custom(wiphy, band, regd); + bands_set++; + } + mutex_unlock(®_mutex); + + /* + * no point in calling this if it won't have any effect + * on your device's supportd bands. + */ + WARN_ON(!bands_set); +} +EXPORT_SYMBOL(wiphy_apply_custom_regulatory); + +/* + * Return value which can be used by ignore_request() to indicate + * it has been determined we should intersect two regulatory domains + */ +#define REG_INTERSECT 1 + +/* This has the logic which determines when a new request + * should be ignored. */ +static int ignore_request(struct wiphy *wiphy, + struct regulatory_request *pending_request) +{ + struct wiphy *last_wiphy = NULL; + + assert_cfg80211_lock(); + + /* All initial requests are respected */ + if (!last_request) + return 0; + + switch (pending_request->initiator) { + case NL80211_REGDOM_SET_BY_CORE: + return 0; + case NL80211_REGDOM_SET_BY_COUNTRY_IE: + + last_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx); + + if (unlikely(!is_an_alpha2(pending_request->alpha2))) + return -EINVAL; + if (last_request->initiator == + NL80211_REGDOM_SET_BY_COUNTRY_IE) { + if (last_wiphy != wiphy) { + /* + * Two cards with two APs claiming different + * Country IE alpha2s. We could + * intersect them, but that seems unlikely + * to be correct. Reject second one for now. + */ + if (regdom_changes(pending_request->alpha2)) + return -EOPNOTSUPP; + return -EALREADY; + } + /* + * Two consecutive Country IE hints on the same wiphy. + * This should be picked up early by the driver/stack + */ + if (WARN_ON(regdom_changes(pending_request->alpha2))) + return 0; + return -EALREADY; + } + return 0; + case NL80211_REGDOM_SET_BY_DRIVER: + if (last_request->initiator == NL80211_REGDOM_SET_BY_CORE) { + if (regdom_changes(pending_request->alpha2)) + return 0; + return -EALREADY; + } + + /* + * This would happen if you unplug and plug your card + * back in or if you add a new device for which the previously + * loaded card also agrees on the regulatory domain. 
+ */ + if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER && + !regdom_changes(pending_request->alpha2)) + return -EALREADY; + + return REG_INTERSECT; + case NL80211_REGDOM_SET_BY_USER: + if (last_request->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE) + return REG_INTERSECT; + /* + * If the user knows better the user should set the regdom + * to their country before the IE is picked up + */ + if (last_request->initiator == NL80211_REGDOM_SET_BY_USER && + last_request->intersect) + return -EOPNOTSUPP; + /* + * Process user requests only after previous user/driver/core + * requests have been processed + */ + if (last_request->initiator == NL80211_REGDOM_SET_BY_CORE || + last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER || + last_request->initiator == NL80211_REGDOM_SET_BY_USER) { + if (regdom_changes(last_request->alpha2)) + return -EAGAIN; + } + + if (!regdom_changes(pending_request->alpha2)) + return -EALREADY; + + return 0; + } + + return -EINVAL; +} + +static void reg_set_request_processed(void) +{ + bool need_more_processing = false; + + last_request->processed = true; + + spin_lock(®_requests_lock); + if (!list_empty(®_requests_list)) + need_more_processing = true; + spin_unlock(®_requests_lock); + + if (last_request->initiator == NL80211_REGDOM_SET_BY_USER) + cancel_delayed_work_sync(®_timeout); + + if (need_more_processing) + schedule_work(®_work); +} + +/** + * __regulatory_hint - hint to the wireless core a regulatory domain + * @wiphy: if the hint comes from country information from an AP, this + * is required to be set to the wiphy that received the information + * @pending_request: the regulatory request currently being processed + * + * The Wireless subsystem can use this function to hint to the wireless core + * what it believes should be the current regulatory domain. + * + * Returns zero if all went fine, %-EALREADY if a regulatory domain had + * already been set or other standard error codes. 
+ * + * Caller must hold &cfg80211_mutex and ®_mutex + */ +static int __regulatory_hint(struct wiphy *wiphy, + struct regulatory_request *pending_request) +{ + bool intersect = false; + int r = 0; + + assert_cfg80211_lock(); + + r = ignore_request(wiphy, pending_request); + + if (r == REG_INTERSECT) { + if (pending_request->initiator == + NL80211_REGDOM_SET_BY_DRIVER) { + r = reg_copy_regd(&wiphy->regd, cfg80211_regdomain); + if (r) { + kfree(pending_request); + return r; + } + } + intersect = true; + } else if (r) { + /* + * If the regulatory domain being requested by the + * driver has already been set just copy it to the + * wiphy + */ + if (r == -EALREADY && + pending_request->initiator == + NL80211_REGDOM_SET_BY_DRIVER) { + r = reg_copy_regd(&wiphy->regd, cfg80211_regdomain); + if (r) { + kfree(pending_request); + return r; + } + r = -EALREADY; + goto new_request; + } + kfree(pending_request); + return r; + } + +new_request: + if (last_request != &core_request_world) + kfree(last_request); + + last_request = pending_request; + last_request->intersect = intersect; + + pending_request = NULL; + + if (last_request->initiator == NL80211_REGDOM_SET_BY_USER) { + user_alpha2[0] = last_request->alpha2[0]; + user_alpha2[1] = last_request->alpha2[1]; + } + + /* When r == REG_INTERSECT we do need to call CRDA */ + if (r < 0) { + /* + * Since CRDA will not be called in this case as we already + * have applied the requested regulatory domain before we just + * inform userspace we have processed the request + */ + if (r == -EALREADY) { + nl80211_send_reg_change_event(last_request); + reg_set_request_processed(); + } + return r; + } + + return call_crda(last_request->alpha2); +} + +/* This processes *all* regulatory hints */ +static void reg_process_hint(struct regulatory_request *reg_request) +{ + int r = 0; + struct wiphy *wiphy = NULL; + enum nl80211_reg_initiator initiator = reg_request->initiator; + + BUG_ON(!reg_request->alpha2); + + if (wiphy_idx_valid(reg_request->wiphy_idx)) + wiphy = wiphy_idx_to_wiphy(reg_request->wiphy_idx); + + if (reg_request->initiator == NL80211_REGDOM_SET_BY_DRIVER && + !wiphy) { + kfree(reg_request); + return; + } + + r = __regulatory_hint(wiphy, reg_request); + /* This is required so that the orig_* parameters are saved */ + if (r == -EALREADY && wiphy && + wiphy->flags & WIPHY_FLAG_STRICT_REGULATORY) { + wiphy_update_regulatory(wiphy, initiator); + return; + } + + /* + * We only time out user hints, given that they should be the only + * source of bogus requests. + */ + if (r != -EALREADY && + reg_request->initiator == NL80211_REGDOM_SET_BY_USER) + schedule_delayed_work(®_timeout, msecs_to_jiffies(3142)); +} + +/* + * Processes regulatory hints, this is all the NL80211_REGDOM_SET_BY_* + * Regulatory hints come on a first come first serve basis and we + * must process each one atomically. 
+ */ +static void reg_process_pending_hints(void) +{ + struct regulatory_request *reg_request; + + mutex_lock(&cfg80211_mutex); + mutex_lock(®_mutex); + + /* When last_request->processed becomes true this will be rescheduled */ + if (last_request && !last_request->processed) { + REG_DBG_PRINT("Pending regulatory request, waiting " + "for it to be processed..."); + goto out; + } + + spin_lock(®_requests_lock); + + if (list_empty(®_requests_list)) { + spin_unlock(®_requests_lock); + goto out; + } + + reg_request = list_first_entry(®_requests_list, + struct regulatory_request, + list); + list_del_init(®_request->list); + + spin_unlock(®_requests_lock); + + reg_process_hint(reg_request); + +out: + mutex_unlock(®_mutex); + mutex_unlock(&cfg80211_mutex); +} + +/* Processes beacon hints -- this has nothing to do with country IEs */ +static void reg_process_pending_beacon_hints(void) +{ + struct cfg80211_registered_device *rdev; + struct reg_beacon *pending_beacon, *tmp; + + /* + * No need to hold the reg_mutex here as we just touch wiphys + * and do not read or access regulatory variables. + */ + mutex_lock(&cfg80211_mutex); + + /* This goes through the _pending_ beacon list */ + spin_lock_bh(®_pending_beacons_lock); + + if (list_empty(®_pending_beacons)) { + spin_unlock_bh(®_pending_beacons_lock); + goto out; + } + + list_for_each_entry_safe(pending_beacon, tmp, + ®_pending_beacons, list) { + + list_del_init(&pending_beacon->list); + + /* Applies the beacon hint to current wiphys */ + list_for_each_entry(rdev, &cfg80211_rdev_list, list) + wiphy_update_new_beacon(&rdev->wiphy, pending_beacon); + + /* Remembers the beacon hint for new wiphys or reg changes */ + list_add_tail(&pending_beacon->list, ®_beacon_list); + } + + spin_unlock_bh(®_pending_beacons_lock); +out: + mutex_unlock(&cfg80211_mutex); +} + +static void reg_todo(struct work_struct *work) +{ + reg_process_pending_hints(); + reg_process_pending_beacon_hints(); +} + +static void queue_regulatory_request(struct regulatory_request *request) +{ + if (isalpha(request->alpha2[0])) + request->alpha2[0] = toupper(request->alpha2[0]); + if (isalpha(request->alpha2[1])) + request->alpha2[1] = toupper(request->alpha2[1]); + + spin_lock(®_requests_lock); + list_add_tail(&request->list, ®_requests_list); + spin_unlock(®_requests_lock); + + schedule_work(®_work); +} + +/* + * Core regulatory hint -- happens during cfg80211_init() + * and when we restore regulatory settings. 
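Before a hint is queued, the alpha2 is normalized to upper case so that "us" and "US" are treated the same; the comparisons elsewhere are plain two-byte compares. A minimal standalone version of that normalization, for illustration only:

#include <ctype.h>
#include <stdio.h>

/* Upper-case both bytes of an alpha2 in place, as queue_regulatory_request()
 * does before the hint is put on the request list. */
static void normalize_alpha2(char a[2])
{
	if (isalpha((unsigned char)a[0]))
		a[0] = (char)toupper((unsigned char)a[0]);
	if (isalpha((unsigned char)a[1]))
		a[1] = (char)toupper((unsigned char)a[1]);
}

int main(void)
{
	char a[2] = { 'u', 's' };

	normalize_alpha2(a);
	printf("%c%c\n", a[0], a[1]);	/* US */
	return 0;
}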
+ */ +static int regulatory_hint_core(const char *alpha2) +{ + struct regulatory_request *request; + + request = kzalloc(sizeof(struct regulatory_request), + GFP_KERNEL); + if (!request) + return -ENOMEM; + + request->alpha2[0] = alpha2[0]; + request->alpha2[1] = alpha2[1]; + request->initiator = NL80211_REGDOM_SET_BY_CORE; + + queue_regulatory_request(request); + + return 0; +} + +/* User hints */ +int regulatory_hint_user(const char *alpha2) +{ + struct regulatory_request *request; + + BUG_ON(!alpha2); + + request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); + if (!request) + return -ENOMEM; + + request->wiphy_idx = WIPHY_IDX_STALE; + request->alpha2[0] = alpha2[0]; + request->alpha2[1] = alpha2[1]; + request->initiator = NL80211_REGDOM_SET_BY_USER; + + queue_regulatory_request(request); + + return 0; +} + +/* Driver hints */ +int regulatory_hint(struct wiphy *wiphy, const char *alpha2) +{ + struct regulatory_request *request; + + BUG_ON(!alpha2); + BUG_ON(!wiphy); + + request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); + if (!request) + return -ENOMEM; + + request->wiphy_idx = get_wiphy_idx(wiphy); + + /* Must have registered wiphy first */ + BUG_ON(!wiphy_idx_valid(request->wiphy_idx)); + + request->alpha2[0] = alpha2[0]; + request->alpha2[1] = alpha2[1]; + request->initiator = NL80211_REGDOM_SET_BY_DRIVER; + + queue_regulatory_request(request); + + return 0; +} +EXPORT_SYMBOL(regulatory_hint); + +/* + * We hold wdev_lock() here so we cannot hold cfg80211_mutex() and + * therefore cannot iterate over the rdev list here. + */ +void regulatory_hint_11d(struct wiphy *wiphy, + enum ieee80211_band band, + u8 *country_ie, + u8 country_ie_len) +{ + char alpha2[2]; + enum environment_cap env = ENVIRON_ANY; + struct regulatory_request *request; + + mutex_lock(®_mutex); + + if (unlikely(!last_request)) + goto out; + + /* IE len must be evenly divisible by 2 */ + if (country_ie_len & 0x01) + goto out; + + if (country_ie_len < IEEE80211_COUNTRY_IE_MIN_LEN) + goto out; + + alpha2[0] = country_ie[0]; + alpha2[1] = country_ie[1]; + + if (country_ie[2] == 'I') + env = ENVIRON_INDOOR; + else if (country_ie[2] == 'O') + env = ENVIRON_OUTDOOR; + + /* + * We will run this only upon a successful connection on cfg80211. + * We leave conflict resolution to the workqueue, where can hold + * cfg80211_mutex. + */ + if (likely(last_request->initiator == + NL80211_REGDOM_SET_BY_COUNTRY_IE && + wiphy_idx_valid(last_request->wiphy_idx))) + goto out; + + request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); + if (!request) + goto out; + + request->wiphy_idx = get_wiphy_idx(wiphy); + request->alpha2[0] = alpha2[0]; + request->alpha2[1] = alpha2[1]; + request->initiator = NL80211_REGDOM_SET_BY_COUNTRY_IE; + request->country_ie_env = env; + + mutex_unlock(®_mutex); + + queue_regulatory_request(request); + + return; + +out: + mutex_unlock(®_mutex); +} + +static void restore_alpha2(char *alpha2, bool reset_user) +{ + /* indicates there is no alpha2 to consider for restoration */ + alpha2[0] = '9'; + alpha2[1] = '7'; + + /* The user setting has precedence over the module parameter */ + if (is_user_regdom_saved()) { + /* Unless we're asked to ignore it and reset it */ + if (reset_user) { + REG_DBG_PRINT("Restoring regulatory settings " + "including user preference\n"); + user_alpha2[0] = '9'; + user_alpha2[1] = '7'; + + /* + * If we're ignoring user settings, we still need to + * check the module parameter to ensure we put things + * back as they were for a full restore. 
+ */ + if (!is_world_regdom(ieee80211_regdom)) { + REG_DBG_PRINT("Keeping preference on " + "module parameter ieee80211_regdom: %c%c\n", + ieee80211_regdom[0], + ieee80211_regdom[1]); + alpha2[0] = ieee80211_regdom[0]; + alpha2[1] = ieee80211_regdom[1]; + } + } else { + REG_DBG_PRINT("Restoring regulatory settings " + "while preserving user preference for: %c%c\n", + user_alpha2[0], + user_alpha2[1]); + alpha2[0] = user_alpha2[0]; + alpha2[1] = user_alpha2[1]; + } + } else if (!is_world_regdom(ieee80211_regdom)) { + REG_DBG_PRINT("Keeping preference on " + "module parameter ieee80211_regdom: %c%c\n", + ieee80211_regdom[0], + ieee80211_regdom[1]); + alpha2[0] = ieee80211_regdom[0]; + alpha2[1] = ieee80211_regdom[1]; + } else + REG_DBG_PRINT("Restoring regulatory settings\n"); +} + +/* + * Restoring regulatory settings involves ingoring any + * possibly stale country IE information and user regulatory + * settings if so desired, this includes any beacon hints + * learned as we could have traveled outside to another country + * after disconnection. To restore regulatory settings we do + * exactly what we did at bootup: + * + * - send a core regulatory hint + * - send a user regulatory hint if applicable + * + * Device drivers that send a regulatory hint for a specific country + * keep their own regulatory domain on wiphy->regd so that does does + * not need to be remembered. + */ +static void restore_regulatory_settings(bool reset_user) +{ + char alpha2[2]; + char world_alpha2[2]; + struct reg_beacon *reg_beacon, *btmp; + struct regulatory_request *reg_request, *tmp; + LIST_HEAD(tmp_reg_req_list); + + mutex_lock(&cfg80211_mutex); + mutex_lock(®_mutex); + + reset_regdomains(true); + restore_alpha2(alpha2, reset_user); + + /* + * If there's any pending requests we simply + * stash them to a temporary pending queue and + * add then after we've restored regulatory + * settings. + */ + spin_lock(®_requests_lock); + if (!list_empty(®_requests_list)) { + list_for_each_entry_safe(reg_request, tmp, + ®_requests_list, list) { + if (reg_request->initiator != + NL80211_REGDOM_SET_BY_USER) + continue; + list_del(®_request->list); + list_add_tail(®_request->list, &tmp_reg_req_list); + } + } + spin_unlock(®_requests_lock); + + /* Clear beacon hints */ + spin_lock_bh(®_pending_beacons_lock); + if (!list_empty(®_pending_beacons)) { + list_for_each_entry_safe(reg_beacon, btmp, + ®_pending_beacons, list) { + list_del(®_beacon->list); + kfree(reg_beacon); + } + } + spin_unlock_bh(®_pending_beacons_lock); + + if (!list_empty(®_beacon_list)) { + list_for_each_entry_safe(reg_beacon, btmp, + ®_beacon_list, list) { + list_del(®_beacon->list); + kfree(reg_beacon); + } + } + + /* First restore to the basic regulatory settings */ + cfg80211_regdomain = cfg80211_world_regdom; + world_alpha2[0] = cfg80211_regdomain->alpha2[0]; + world_alpha2[1] = cfg80211_regdomain->alpha2[1]; + + mutex_unlock(®_mutex); + mutex_unlock(&cfg80211_mutex); + + regulatory_hint_core(world_alpha2); + + /* + * This restores the ieee80211_regdom module parameter + * preference or the last user requested regulatory + * settings, user regulatory settings takes precedence. 
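The restore path boils down to a three-way precedence: a saved user alpha2 wins unless we were told to reset it, then a non-world ieee80211_regdom module parameter, then nothing (the "97" sentinel, meaning only the core world hint will be sent). The standalone sketch below restates that selection under simplified assumptions; the function names are illustrative.

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool is_world(const char *a) { return a[0] == '0' && a[1] == '0'; }
static bool is_country(const char *a)
{
	return isalpha((unsigned char)a[0]) && isalpha((unsigned char)a[1]);
}

/* Pick the alpha2 to re-hint after a restore: user setting first (unless
 * reset_user), then the module parameter, else the "97" = none sentinel. */
static void pick_restore_alpha2(char out[2], const char *user,
				const char *modparam, bool reset_user)
{
	out[0] = '9';
	out[1] = '7';

	if (!reset_user && is_country(user)) {
		memcpy(out, user, 2);
		return;
	}
	if (!is_world(modparam))
		memcpy(out, modparam, 2);
}

int main(void)
{
	char out[2];

	pick_restore_alpha2(out, "DE", "00", false);
	printf("%c%c\n", out[0], out[1]);	/* DE: user setting preserved */
	pick_restore_alpha2(out, "DE", "US", true);
	printf("%c%c\n", out[0], out[1]);	/* US: user reset, modparam used */
	return 0;
}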
+ */ + if (is_an_alpha2(alpha2)) + regulatory_hint_user(user_alpha2); + + if (list_empty(&tmp_reg_req_list)) + return; + + mutex_lock(&cfg80211_mutex); + mutex_lock(®_mutex); + + spin_lock(®_requests_lock); + list_for_each_entry_safe(reg_request, tmp, &tmp_reg_req_list, list) { + REG_DBG_PRINT("Adding request for country %c%c back " + "into the queue\n", + reg_request->alpha2[0], + reg_request->alpha2[1]); + list_del(®_request->list); + list_add_tail(®_request->list, ®_requests_list); + } + spin_unlock(®_requests_lock); + + mutex_unlock(®_mutex); + mutex_unlock(&cfg80211_mutex); + + REG_DBG_PRINT("Kicking the queue\n"); + + schedule_work(®_work); +} + +void regulatory_hint_disconnect(void) +{ + REG_DBG_PRINT("All devices are disconnected, going to " + "restore regulatory settings\n"); + restore_regulatory_settings(false); +} + +static bool freq_is_chan_12_13_14(u16 freq) +{ + if (freq == ieee80211_channel_to_frequency(12, IEEE80211_BAND_2GHZ) || + freq == ieee80211_channel_to_frequency(13, IEEE80211_BAND_2GHZ) || + freq == ieee80211_channel_to_frequency(14, IEEE80211_BAND_2GHZ)) + return true; + return false; +} + +int regulatory_hint_found_beacon(struct wiphy *wiphy, + struct ieee80211_channel *beacon_chan, + gfp_t gfp) +{ + struct reg_beacon *reg_beacon; + + if (likely((beacon_chan->beacon_found || + (beacon_chan->flags & IEEE80211_CHAN_RADAR) || + (beacon_chan->band == IEEE80211_BAND_2GHZ && + !freq_is_chan_12_13_14(beacon_chan->center_freq))))) + return 0; + + reg_beacon = kzalloc(sizeof(struct reg_beacon), gfp); + if (!reg_beacon) + return -ENOMEM; + + REG_DBG_PRINT("Found new beacon on " + "frequency: %d MHz (Ch %d) on %s\n", + beacon_chan->center_freq, + ieee80211_frequency_to_channel(beacon_chan->center_freq), + wiphy_name(wiphy)); + + memcpy(®_beacon->chan, beacon_chan, + sizeof(struct ieee80211_channel)); + + + /* + * Since we can be called from BH or and non-BH context + * we must use spin_lock_bh() + */ + spin_lock_bh(®_pending_beacons_lock); + list_add_tail(®_beacon->list, ®_pending_beacons); + spin_unlock_bh(®_pending_beacons_lock); + + schedule_work(®_work); + + return 0; +} + +static void print_rd_rules(const struct ieee80211_regdomain *rd) +{ + unsigned int i; + const struct ieee80211_reg_rule *reg_rule = NULL; + const struct ieee80211_freq_range *freq_range = NULL; + const struct ieee80211_power_rule *power_rule = NULL; + + pr_info(" (start_freq - end_freq @ bandwidth), (max_antenna_gain, max_eirp)\n"); + + for (i = 0; i < rd->n_reg_rules; i++) { + reg_rule = &rd->reg_rules[i]; + freq_range = ®_rule->freq_range; + power_rule = ®_rule->power_rule; + + /* + * There may not be documentation for max antenna gain + * in certain regions + */ + if (power_rule->max_antenna_gain) + pr_info(" (%d KHz - %d KHz @ %d KHz), (%d mBi, %d mBm)\n", + freq_range->start_freq_khz, + freq_range->end_freq_khz, + freq_range->max_bandwidth_khz, + power_rule->max_antenna_gain, + power_rule->max_eirp); + else + pr_info(" (%d KHz - %d KHz @ %d KHz), (N/A, %d mBm)\n", + freq_range->start_freq_khz, + freq_range->end_freq_khz, + freq_range->max_bandwidth_khz, + power_rule->max_eirp); + } +} + +static void print_regdomain(const struct ieee80211_regdomain *rd) +{ + + if (is_intersected_alpha2(rd->alpha2)) { + + if (last_request->initiator == + NL80211_REGDOM_SET_BY_COUNTRY_IE) { + struct cfg80211_registered_device *rdev; + rdev = cfg80211_rdev_by_wiphy_idx( + last_request->wiphy_idx); + if (rdev) { + pr_info("Current regulatory domain updated by AP to: %c%c\n", + rdev->country_ie_alpha2[0], + 
rdev->country_ie_alpha2[1]); + } else + pr_info("Current regulatory domain intersected:\n"); + } else + pr_info("Current regulatory domain intersected:\n"); + } else if (is_world_regdom(rd->alpha2)) + pr_info("World regulatory domain updated:\n"); + else { + if (is_unknown_alpha2(rd->alpha2)) + pr_info("Regulatory domain changed to driver built-in settings (unknown country)\n"); + else + pr_info("Regulatory domain changed to country: %c%c\n", + rd->alpha2[0], rd->alpha2[1]); + } + print_rd_rules(rd); +} + +static void print_regdomain_info(const struct ieee80211_regdomain *rd) +{ + pr_info("Regulatory domain: %c%c\n", rd->alpha2[0], rd->alpha2[1]); + print_rd_rules(rd); +} + +/* Takes ownership of rd only if it doesn't fail */ +static int __set_regdom(const struct ieee80211_regdomain *rd) +{ + const struct ieee80211_regdomain *intersected_rd = NULL; + struct cfg80211_registered_device *rdev = NULL; + struct wiphy *request_wiphy; + /* Some basic sanity checks first */ + + if (is_world_regdom(rd->alpha2)) { + if (WARN_ON(!reg_is_valid_request(rd->alpha2))) + return -EINVAL; + update_world_regdomain(rd); + return 0; + } + + if (!is_alpha2_set(rd->alpha2) && !is_an_alpha2(rd->alpha2) && + !is_unknown_alpha2(rd->alpha2)) + return -EINVAL; + + if (!last_request) + return -EINVAL; + + /* + * Lets only bother proceeding on the same alpha2 if the current + * rd is non static (it means CRDA was present and was used last) + * and the pending request came in from a country IE + */ + if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE) { + /* + * If someone else asked us to change the rd lets only bother + * checking if the alpha2 changes if CRDA was already called + */ + if (!regdom_changes(rd->alpha2)) + return -EINVAL; + } + + /* + * Now lets set the regulatory domain, update all driver channels + * and finally inform them of what we have done, in case they want + * to review or adjust their own settings based on their own + * internal EEPROM data + */ + + if (WARN_ON(!reg_is_valid_request(rd->alpha2))) + return -EINVAL; + + if (!is_valid_rd(rd)) { + pr_err("Invalid regulatory domain detected:\n"); + print_regdomain_info(rd); + return -EINVAL; + } + + request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx); + if (!request_wiphy && + (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER || + last_request->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE)) { + schedule_delayed_work(®_timeout, 0); + return -ENODEV; + } + + if (!last_request->intersect) { + int r; + + if (last_request->initiator != NL80211_REGDOM_SET_BY_DRIVER) { + reset_regdomains(false); + cfg80211_regdomain = rd; + return 0; + } + + /* + * For a driver hint, lets copy the regulatory domain the + * driver wanted to the wiphy to deal with conflicts + */ + + /* + * Userspace could have sent two replies with only + * one kernel request. + */ + if (request_wiphy->regd) + return -EALREADY; + + r = reg_copy_regd(&request_wiphy->regd, rd); + if (r) + return r; + + reset_regdomains(false); + cfg80211_regdomain = rd; + return 0; + } + + /* Intersection requires a bit more work */ + + if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE) { + + intersected_rd = regdom_intersect(rd, cfg80211_regdomain); + if (!intersected_rd) + return -EINVAL; + + /* + * We can trash what CRDA provided now. 
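The power numbers printed above are in centi-units: regulatory rules carry EIRP in mBm (1 dBm = 100 mBm) and antenna gain in mBi (1 dBi = 100 mBi), while channels store whole dBm/dBi, hence the MBM_TO_DBM/MBI_TO_DBI conversions used when channels are updated. A tiny standalone illustration of that conversion (the 20 dBm value is just an example):

#include <stdio.h>

/* Regulatory rules use centi-units; channels use whole dB units. */
#define MBM_TO_DBM(x)	((x) / 100)	/* 100 mBm = 1 dBm */
#define MBI_TO_DBI(x)	((x) / 100)	/* 100 mBi = 1 dBi */
#define DBM_TO_MBM(x)	((x) * 100)

int main(void)
{
	int max_eirp_mbm = DBM_TO_MBM(20);	/* an example 20 dBm limit */

	printf("%d mBm = %d dBm\n", max_eirp_mbm, MBM_TO_DBM(max_eirp_mbm));
	return 0;
}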
+ * However if a driver requested this specific regulatory + * domain we keep it for its private use + */ + if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER) + request_wiphy->regd = rd; + else + kfree(rd); + + rd = NULL; + + reset_regdomains(false); + cfg80211_regdomain = intersected_rd; + + return 0; + } + + if (!intersected_rd) + return -EINVAL; + + rdev = wiphy_to_dev(request_wiphy); + + rdev->country_ie_alpha2[0] = rd->alpha2[0]; + rdev->country_ie_alpha2[1] = rd->alpha2[1]; + rdev->env = last_request->country_ie_env; + + BUG_ON(intersected_rd == rd); + + kfree(rd); + rd = NULL; + + reset_regdomains(false); + cfg80211_regdomain = intersected_rd; + + return 0; +} + + +/* + * Use this call to set the current regulatory domain. Conflicts with + * multiple drivers can be ironed out later. Caller must've already + * kmalloc'd the rd structure. Caller must hold cfg80211_mutex + */ +int set_regdom(const struct ieee80211_regdomain *rd) +{ + int r; + + assert_cfg80211_lock(); + + mutex_lock(®_mutex); + + /* Note that this doesn't update the wiphys, this is done below */ + r = __set_regdom(rd); + if (r) { + kfree(rd); + mutex_unlock(®_mutex); + return r; + } + + /* This would make this whole thing pointless */ + if (!last_request->intersect) + BUG_ON(rd != cfg80211_regdomain); + + /* update all wiphys now with the new established regulatory domain */ + update_all_wiphy_regulatory(last_request->initiator); + + print_regdomain(cfg80211_regdomain); + + nl80211_send_reg_change_event(last_request); + + reg_set_request_processed(); + + mutex_unlock(®_mutex); + + return r; +} + +#ifdef CONFIG_HOTPLUG +int reg_device_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + if (last_request && !last_request->processed) { + if (add_uevent_var(env, "COUNTRY=%c%c", + last_request->alpha2[0], + last_request->alpha2[1])) + return -ENOMEM; + } + + return 0; +} +#else +int reg_device_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + return -ENODEV; +} +#endif /* CONFIG_HOTPLUG */ + +/* Caller must hold cfg80211_mutex */ +void reg_device_remove(struct wiphy *wiphy) +{ + struct wiphy *request_wiphy = NULL; + + assert_cfg80211_lock(); + + mutex_lock(®_mutex); + + kfree(wiphy->regd); + + if (last_request) + request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx); + + if (!request_wiphy || request_wiphy != wiphy) + goto out; + + last_request->wiphy_idx = WIPHY_IDX_STALE; + last_request->country_ie_env = ENVIRON_ANY; +out: + mutex_unlock(®_mutex); +} + +static void reg_timeout_work(struct work_struct *work) +{ + REG_DBG_PRINT("Timeout while waiting for CRDA to reply, " + "restoring regulatory settings"); + restore_regulatory_settings(true); +} + +int __init regulatory_init(void) +{ + int err = 0; + + reg_pdev = platform_device_register_simple("regulatory", 0, NULL, 0); + if (IS_ERR(reg_pdev)) + return PTR_ERR(reg_pdev); + + reg_pdev->dev.type = ®_device_type; + + spin_lock_init(®_requests_lock); + spin_lock_init(®_pending_beacons_lock); + + reg_regdb_size_check(); + + cfg80211_regdomain = cfg80211_world_regdom; + + user_alpha2[0] = '9'; + user_alpha2[1] = '7'; + + /* We always try to get an update for the static regdomain */ + err = regulatory_hint_core(cfg80211_regdomain->alpha2); + if (err) { + if (err == -ENOMEM) + return err; + /* + * N.B. kobject_uevent_env() can fail mainly for when we're out + * memory which is handled and propagated appropriately above + * but it can also fail during a netlink_broadcast() or during + * early boot for call_usermodehelper(). 
For now treat these + * errors as non-fatal. + */ + pr_err("kobject_uevent_env() was unable to call CRDA during init\n"); +#ifdef CONFIG_CFG80211_REG_DEBUG + /* We want to find out exactly why when debugging */ + WARN_ON(err); +#endif + } + + /* + * Finally, if the user set the module parameter treat it + * as a user hint. + */ + if (!is_world_regdom(ieee80211_regdom)) + regulatory_hint_user(ieee80211_regdom); + + return 0; +} + +void /* __init_or_exit */ regulatory_exit(void) +{ + struct regulatory_request *reg_request, *tmp; + struct reg_beacon *reg_beacon, *btmp; + + cancel_work_sync(®_work); + cancel_delayed_work_sync(®_timeout); + + mutex_lock(&cfg80211_mutex); + mutex_lock(®_mutex); + + reset_regdomains(true); + + dev_set_uevent_suppress(®_pdev->dev, true); + + platform_device_unregister(reg_pdev); + + spin_lock_bh(®_pending_beacons_lock); + if (!list_empty(®_pending_beacons)) { + list_for_each_entry_safe(reg_beacon, btmp, + ®_pending_beacons, list) { + list_del(®_beacon->list); + kfree(reg_beacon); + } + } + spin_unlock_bh(®_pending_beacons_lock); + + if (!list_empty(®_beacon_list)) { + list_for_each_entry_safe(reg_beacon, btmp, + ®_beacon_list, list) { + list_del(®_beacon->list); + kfree(reg_beacon); + } + } + + spin_lock(®_requests_lock); + if (!list_empty(®_requests_list)) { + list_for_each_entry_safe(reg_request, tmp, + ®_requests_list, list) { + list_del(®_request->list); + kfree(reg_request); + } + } + spin_unlock(®_requests_lock); + + mutex_unlock(®_mutex); + mutex_unlock(&cfg80211_mutex); +} diff --git a/net/wireless/reg.h b/net/wireless/reg.h new file mode 100644 index 00000000..b67d1c3a --- /dev/null +++ b/net/wireless/reg.h @@ -0,0 +1,85 @@ +#ifndef __NET_WIRELESS_REG_H +#define __NET_WIRELESS_REG_H + +extern const struct ieee80211_regdomain *cfg80211_regdomain; + +bool is_world_regdom(const char *alpha2); +bool reg_is_valid_request(const char *alpha2); + +int regulatory_hint_user(const char *alpha2); + +int reg_device_uevent(struct device *dev, struct kobj_uevent_env *env); +void reg_device_remove(struct wiphy *wiphy); + +int __init regulatory_init(void); +void regulatory_exit(void); + +int set_regdom(const struct ieee80211_regdomain *rd); + +/** + * regulatory_hint_found_beacon - hints a beacon was found on a channel + * @wiphy: the wireless device where the beacon was found on + * @beacon_chan: the channel on which the beacon was found on + * @gfp: context flags + * + * This informs the wireless core that a beacon from an AP was found on + * the channel provided. This allows the wireless core to make educated + * guesses on regulatory to help with world roaming. This is only used for + * world roaming -- when we do not know our current location. This is + * only useful on channels 12, 13 and 14 on the 2 GHz band as channels + * 1-11 are already enabled by the world regulatory domain; and on + * non-radar 5 GHz channels. + * + * Drivers do not need to call this, cfg80211 will do it for after a scan + * on a newly found BSS. If you cannot make use of this feature you can + * set the wiphy->disable_beacon_hints to true. + */ +int regulatory_hint_found_beacon(struct wiphy *wiphy, + struct ieee80211_channel *beacon_chan, + gfp_t gfp); + +/** + * regulatory_hint_11d - hints a country IE as a regulatory domain + * @wiphy: the wireless device giving the hint (used only for reporting + * conflicts) + * @band: the band on which the country IE was received on. This determines + * the band we'll process the country IE channel triplets for. 
+ * @country_ie: pointer to the country IE + * @country_ie_len: length of the country IE + * + * We will intersect the rd with the what CRDA tells us should apply + * for the alpha2 this country IE belongs to, this prevents APs from + * sending us incorrect or outdated information against a country. + * + * The AP is expected to provide Country IE channel triplets for the + * band it is on. It is technically possible for APs to send channel + * country IE triplets even for channels outside of the band they are + * in but for that they would have to use the regulatory extension + * in combination with a triplet but this behaviour is currently + * not observed. For this reason if a triplet is seen with channel + * information for a band the BSS is not present in it will be ignored. + */ +void regulatory_hint_11d(struct wiphy *wiphy, + enum ieee80211_band band, + u8 *country_ie, + u8 country_ie_len); + +/** + * regulatory_hint_disconnect - informs all devices have been disconneted + * + * Regulotory rules can be enhanced further upon scanning and upon + * connection to an AP. These rules become stale if we disconnect + * and go to another country, whether or not we suspend and resume. + * If we suspend, go to another country and resume we'll automatically + * get disconnected shortly after resuming and things will be reset as well. + * This routine is a helper to restore regulatory settings to how they were + * prior to our first connect attempt. This includes ignoring country IE and + * beacon regulatory hints. The ieee80211_regdom module parameter will always + * be respected but if a user had set the regulatory domain that will take + * precedence. + * + * Must be called from process context. + */ +void regulatory_hint_disconnect(void); + +#endif /* __NET_WIRELESS_REG_H */ diff --git a/net/wireless/regdb.h b/net/wireless/regdb.h new file mode 100644 index 00000000..818222c9 --- /dev/null +++ b/net/wireless/regdb.h @@ -0,0 +1,7 @@ +#ifndef __REGDB_H__ +#define __REGDB_H__ + +extern const struct ieee80211_regdomain *reg_regdb[]; +extern int reg_regdb_size; + +#endif /* __REGDB_H__ */ diff --git a/net/wireless/scan.c b/net/wireless/scan.c new file mode 100644 index 00000000..cbbc9273 --- /dev/null +++ b/net/wireless/scan.c @@ -0,0 +1,1197 @@ +/* + * cfg80211 scan result handling + * + * Copyright 2008 Johannes Berg + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "core.h" +#include "nl80211.h" +#include "wext-compat.h" + +#define IEEE80211_SCAN_RESULT_EXPIRE (3 * HZ) + +void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, bool leak) +{ + struct cfg80211_scan_request *request; + struct net_device *dev; +#ifdef CONFIG_CFG80211_WEXT + union iwreq_data wrqu; +#endif + + ASSERT_RDEV_LOCK(rdev); + + request = rdev->scan_req; + + if (!request) + return; + + dev = request->dev; + + /* + * This must be before sending the other events! + * Otherwise, wpa_supplicant gets completely confused with + * wext events. + */ + cfg80211_sme_scan_done(dev); + + if (request->aborted) + nl80211_send_scan_aborted(rdev, dev); + else + nl80211_send_scan_done(rdev, dev); + +#ifdef CONFIG_CFG80211_WEXT + if (!request->aborted) { + memset(&wrqu, 0, sizeof(wrqu)); + + wireless_send_event(dev, SIOCGIWSCAN, &wrqu, NULL); + } +#endif + + dev_put(dev); + + rdev->scan_req = NULL; + + /* + * OK. If this is invoked with "leak" then we can't + * free this ... but we've cleaned it up anyway. 
The + * driver failed to call the scan_done callback, so + * all bets are off, it might still be trying to use + * the scan request or not ... if it accesses the dev + * in there (it shouldn't anyway) then it may crash. + */ + if (!leak) + kfree(request); +} + +void __cfg80211_scan_done(struct work_struct *wk) +{ + struct cfg80211_registered_device *rdev; + + rdev = container_of(wk, struct cfg80211_registered_device, + scan_done_wk); + + cfg80211_lock_rdev(rdev); + ___cfg80211_scan_done(rdev, false); + cfg80211_unlock_rdev(rdev); +} + +void cfg80211_scan_done(struct cfg80211_scan_request *request, bool aborted) +{ + WARN_ON(request != wiphy_to_dev(request->wiphy)->scan_req); + + request->aborted = aborted; + queue_work(cfg80211_wq, &wiphy_to_dev(request->wiphy)->scan_done_wk); +} +EXPORT_SYMBOL(cfg80211_scan_done); + +void __cfg80211_sched_scan_results(struct work_struct *wk) +{ + struct cfg80211_registered_device *rdev; + + rdev = container_of(wk, struct cfg80211_registered_device, + sched_scan_results_wk); + + mutex_lock(&rdev->sched_scan_mtx); + + /* we don't have sched_scan_req anymore if the scan is stopping */ + if (rdev->sched_scan_req) + nl80211_send_sched_scan_results(rdev, + rdev->sched_scan_req->dev); + + mutex_unlock(&rdev->sched_scan_mtx); +} + +void cfg80211_sched_scan_results(struct wiphy *wiphy) +{ + /* ignore if we're not scanning */ + if (wiphy_to_dev(wiphy)->sched_scan_req) + queue_work(cfg80211_wq, + &wiphy_to_dev(wiphy)->sched_scan_results_wk); +} +EXPORT_SYMBOL(cfg80211_sched_scan_results); + +void cfg80211_sched_scan_stopped(struct wiphy *wiphy) +{ + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + mutex_lock(&rdev->sched_scan_mtx); + __cfg80211_stop_sched_scan(rdev, true); + mutex_unlock(&rdev->sched_scan_mtx); +} +EXPORT_SYMBOL(cfg80211_sched_scan_stopped); + +int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev, + bool driver_initiated) +{ + int err; + struct net_device *dev; + + lockdep_assert_held(&rdev->sched_scan_mtx); + + if (!rdev->sched_scan_req) + return 0; + + dev = rdev->sched_scan_req->dev; + + if (!driver_initiated) { + err = rdev->ops->sched_scan_stop(&rdev->wiphy, dev); + if (err) + return err; + } + + nl80211_send_sched_scan(rdev, dev, NL80211_CMD_SCHED_SCAN_STOPPED); + + kfree(rdev->sched_scan_req); + rdev->sched_scan_req = NULL; + + return err; +} + +static void bss_release(struct kref *ref) +{ + struct cfg80211_internal_bss *bss; + + bss = container_of(ref, struct cfg80211_internal_bss, ref); + if (bss->pub.free_priv) + bss->pub.free_priv(&bss->pub); + + if (bss->beacon_ies_allocated) + kfree(bss->pub.beacon_ies); + if (bss->proberesp_ies_allocated) + kfree(bss->pub.proberesp_ies); + + BUG_ON(atomic_read(&bss->hold)); + + kfree(bss); +} + +/* must hold dev->bss_lock! */ +void cfg80211_bss_age(struct cfg80211_registered_device *dev, + unsigned long age_secs) +{ + struct cfg80211_internal_bss *bss; + unsigned long age_jiffies = msecs_to_jiffies(age_secs * MSEC_PER_SEC); + + list_for_each_entry(bss, &dev->bss_list, list) { + bss->ts -= age_jiffies; + } +} + +/* must hold dev->bss_lock! */ +static void __cfg80211_unlink_bss(struct cfg80211_registered_device *dev, + struct cfg80211_internal_bss *bss) +{ + list_del_init(&bss->list); + rb_erase(&bss->rbn, &dev->bss_tree); + kref_put(&bss->ref, bss_release); +} + +/* must hold dev->bss_lock! 
*/ +void cfg80211_bss_expire(struct cfg80211_registered_device *dev) +{ + struct cfg80211_internal_bss *bss, *tmp; + bool expired = false; + + list_for_each_entry_safe(bss, tmp, &dev->bss_list, list) { + if (atomic_read(&bss->hold)) + continue; + if (!time_after(jiffies, bss->ts + IEEE80211_SCAN_RESULT_EXPIRE)) + continue; + __cfg80211_unlink_bss(dev, bss); + expired = true; + } + + if (expired) + dev->bss_generation++; +} + +const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len) +{ + while (len > 2 && ies[0] != eid) { + len -= ies[1] + 2; + ies += ies[1] + 2; + } + if (len < 2) + return NULL; + if (len < 2 + ies[1]) + return NULL; + return ies; +} +EXPORT_SYMBOL(cfg80211_find_ie); + +static int cmp_ies(u8 num, u8 *ies1, size_t len1, u8 *ies2, size_t len2) +{ + const u8 *ie1 = cfg80211_find_ie(num, ies1, len1); + const u8 *ie2 = cfg80211_find_ie(num, ies2, len2); + int r; + + if (!ie1 && !ie2) + return 0; + if (!ie1 || !ie2) + return -1; + + r = memcmp(ie1 + 2, ie2 + 2, min(ie1[1], ie2[1])); + if (r == 0 && ie1[1] != ie2[1]) + return ie2[1] - ie1[1]; + return r; +} + +static bool is_bss(struct cfg80211_bss *a, + const u8 *bssid, + const u8 *ssid, size_t ssid_len) +{ + const u8 *ssidie; + + if (bssid && compare_ether_addr(a->bssid, bssid)) + return false; + + if (!ssid) + return true; + + ssidie = cfg80211_find_ie(WLAN_EID_SSID, + a->information_elements, + a->len_information_elements); + if (!ssidie) + return false; + if (ssidie[1] != ssid_len) + return false; + return memcmp(ssidie + 2, ssid, ssid_len) == 0; +} + +static bool is_mesh_bss(struct cfg80211_bss *a) +{ + const u8 *ie; + + if (!WLAN_CAPABILITY_IS_STA_BSS(a->capability)) + return false; + + ie = cfg80211_find_ie(WLAN_EID_MESH_ID, + a->information_elements, + a->len_information_elements); + if (!ie) + return false; + + ie = cfg80211_find_ie(WLAN_EID_MESH_CONFIG, + a->information_elements, + a->len_information_elements); + if (!ie) + return false; + + return true; +} + +static bool is_mesh(struct cfg80211_bss *a, + const u8 *meshid, size_t meshidlen, + const u8 *meshcfg) +{ + const u8 *ie; + + if (!WLAN_CAPABILITY_IS_STA_BSS(a->capability)) + return false; + + ie = cfg80211_find_ie(WLAN_EID_MESH_ID, + a->information_elements, + a->len_information_elements); + if (!ie) + return false; + if (ie[1] != meshidlen) + return false; + if (memcmp(ie + 2, meshid, meshidlen)) + return false; + + ie = cfg80211_find_ie(WLAN_EID_MESH_CONFIG, + a->information_elements, + a->len_information_elements); + if (!ie) + return false; + if (ie[1] != sizeof(struct ieee80211_meshconf_ie)) + return false; + + /* + * Ignore mesh capability (last two bytes of the IE) when + * comparing since that may differ between stations taking + * part in the same mesh. 
+ */ + return memcmp(ie + 2, meshcfg, + sizeof(struct ieee80211_meshconf_ie) - 2) == 0; +} + +static int cmp_bss(struct cfg80211_bss *a, + struct cfg80211_bss *b) +{ + int r; + + if (a->channel != b->channel) + return b->channel->center_freq - a->channel->center_freq; + + if (is_mesh_bss(a) && is_mesh_bss(b)) { + r = cmp_ies(WLAN_EID_MESH_ID, + a->information_elements, + a->len_information_elements, + b->information_elements, + b->len_information_elements); + if (r) + return r; + return cmp_ies(WLAN_EID_MESH_CONFIG, + a->information_elements, + a->len_information_elements, + b->information_elements, + b->len_information_elements); + } + + r = memcmp(a->bssid, b->bssid, ETH_ALEN); + if (r) + return r; + + return cmp_ies(WLAN_EID_SSID, + a->information_elements, + a->len_information_elements, + b->information_elements, + b->len_information_elements); +} + +struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy, + struct ieee80211_channel *channel, + const u8 *bssid, + const u8 *ssid, size_t ssid_len, + u16 capa_mask, u16 capa_val) +{ + struct cfg80211_registered_device *dev = wiphy_to_dev(wiphy); + struct cfg80211_internal_bss *bss, *res = NULL; + unsigned long now = jiffies; + + spin_lock_bh(&dev->bss_lock); + + list_for_each_entry(bss, &dev->bss_list, list) { + if ((bss->pub.capability & capa_mask) != capa_val) + continue; + if (channel && bss->pub.channel != channel) + continue; + /* Don't get expired BSS structs */ + if (time_after(now, bss->ts + IEEE80211_SCAN_RESULT_EXPIRE) && + !atomic_read(&bss->hold)) + continue; + if (is_bss(&bss->pub, bssid, ssid, ssid_len)) { + res = bss; + kref_get(&res->ref); + break; + } + } + + spin_unlock_bh(&dev->bss_lock); + if (!res) + return NULL; + return &res->pub; +} +EXPORT_SYMBOL(cfg80211_get_bss); + +struct cfg80211_bss *cfg80211_get_mesh(struct wiphy *wiphy, + struct ieee80211_channel *channel, + const u8 *meshid, size_t meshidlen, + const u8 *meshcfg) +{ + struct cfg80211_registered_device *dev = wiphy_to_dev(wiphy); + struct cfg80211_internal_bss *bss, *res = NULL; + + spin_lock_bh(&dev->bss_lock); + + list_for_each_entry(bss, &dev->bss_list, list) { + if (channel && bss->pub.channel != channel) + continue; + if (is_mesh(&bss->pub, meshid, meshidlen, meshcfg)) { + res = bss; + kref_get(&res->ref); + break; + } + } + + spin_unlock_bh(&dev->bss_lock); + if (!res) + return NULL; + return &res->pub; +} +EXPORT_SYMBOL(cfg80211_get_mesh); + + +static void rb_insert_bss(struct cfg80211_registered_device *dev, + struct cfg80211_internal_bss *bss) +{ + struct rb_node **p = &dev->bss_tree.rb_node; + struct rb_node *parent = NULL; + struct cfg80211_internal_bss *tbss; + int cmp; + + while (*p) { + parent = *p; + tbss = rb_entry(parent, struct cfg80211_internal_bss, rbn); + + cmp = cmp_bss(&bss->pub, &tbss->pub); + + if (WARN_ON(!cmp)) { + /* will sort of leak this BSS */ + return; + } + + if (cmp < 0) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(&bss->rbn, parent, p); + rb_insert_color(&bss->rbn, &dev->bss_tree); +} + +static struct cfg80211_internal_bss * +rb_find_bss(struct cfg80211_registered_device *dev, + struct cfg80211_internal_bss *res) +{ + struct rb_node *n = dev->bss_tree.rb_node; + struct cfg80211_internal_bss *bss; + int r; + + while (n) { + bss = rb_entry(n, struct cfg80211_internal_bss, rbn); + r = cmp_bss(&res->pub, &bss->pub); + + if (r == 0) + return bss; + else if (r < 0) + n = n->rb_left; + else + n = n->rb_right; + } + + return NULL; +} + +static struct cfg80211_internal_bss * +cfg80211_bss_update(struct 
cfg80211_registered_device *dev, + struct cfg80211_internal_bss *res) +{ + struct cfg80211_internal_bss *found = NULL; + + /* + * The reference to "res" is donated to this function. + */ + + if (WARN_ON(!res->pub.channel)) { + kref_put(&res->ref, bss_release); + return NULL; + } + + res->ts = jiffies; + + spin_lock_bh(&dev->bss_lock); + + found = rb_find_bss(dev, res); + + if (found) { + found->pub.beacon_interval = res->pub.beacon_interval; + found->pub.tsf = res->pub.tsf; + found->pub.signal = res->pub.signal; + found->pub.capability = res->pub.capability; + found->ts = res->ts; + + /* Update IEs */ + if (res->pub.proberesp_ies) { + size_t used = dev->wiphy.bss_priv_size + sizeof(*res); + size_t ielen = res->pub.len_proberesp_ies; + + if (found->pub.proberesp_ies && + !found->proberesp_ies_allocated && + ksize(found) >= used + ielen) { + memcpy(found->pub.proberesp_ies, + res->pub.proberesp_ies, ielen); + found->pub.len_proberesp_ies = ielen; + } else { + u8 *ies = found->pub.proberesp_ies; + + if (found->proberesp_ies_allocated) + ies = krealloc(ies, ielen, GFP_ATOMIC); + else + ies = kmalloc(ielen, GFP_ATOMIC); + + if (ies) { + memcpy(ies, res->pub.proberesp_ies, + ielen); + found->proberesp_ies_allocated = true; + found->pub.proberesp_ies = ies; + found->pub.len_proberesp_ies = ielen; + } + } + + /* Override possible earlier Beacon frame IEs */ + found->pub.information_elements = + found->pub.proberesp_ies; + found->pub.len_information_elements = + found->pub.len_proberesp_ies; + } + if (res->pub.beacon_ies) { + size_t used = dev->wiphy.bss_priv_size + sizeof(*res); + size_t ielen = res->pub.len_beacon_ies; + bool information_elements_is_beacon_ies = + (found->pub.information_elements == + found->pub.beacon_ies); + + if (found->pub.beacon_ies && + !found->beacon_ies_allocated && + ksize(found) >= used + ielen) { + memcpy(found->pub.beacon_ies, + res->pub.beacon_ies, ielen); + found->pub.len_beacon_ies = ielen; + } else { + u8 *ies = found->pub.beacon_ies; + + if (found->beacon_ies_allocated) + ies = krealloc(ies, ielen, GFP_ATOMIC); + else + ies = kmalloc(ielen, GFP_ATOMIC); + + if (ies) { + memcpy(ies, res->pub.beacon_ies, + ielen); + found->beacon_ies_allocated = true; + found->pub.beacon_ies = ies; + found->pub.len_beacon_ies = ielen; + } + } + + /* Override IEs if they were from a beacon before */ + if (information_elements_is_beacon_ies) { + found->pub.information_elements = + found->pub.beacon_ies; + found->pub.len_information_elements = + found->pub.len_beacon_ies; + } + } + + kref_put(&res->ref, bss_release); + } else { + /* this "consumes" the reference */ + list_add_tail(&res->list, &dev->bss_list); + rb_insert_bss(dev, res); + found = res; + } + + dev->bss_generation++; + spin_unlock_bh(&dev->bss_lock); + + kref_get(&found->ref); + return found; +} + +struct cfg80211_bss* +cfg80211_inform_bss(struct wiphy *wiphy, + struct ieee80211_channel *channel, + const u8 *bssid, + u64 timestamp, u16 capability, u16 beacon_interval, + const u8 *ie, size_t ielen, + s32 signal, gfp_t gfp) +{ + struct cfg80211_internal_bss *res; + size_t privsz; + + if (WARN_ON(!wiphy)) + return NULL; + + privsz = wiphy->bss_priv_size; + + if (WARN_ON(wiphy->signal_type == CFG80211_SIGNAL_TYPE_UNSPEC && + (signal < 0 || signal > 100))) + return NULL; + + res = kzalloc(sizeof(*res) + privsz + ielen, gfp); + if (!res) + return NULL; + + memcpy(res->pub.bssid, bssid, ETH_ALEN); + res->pub.channel = channel; + res->pub.signal = signal; + res->pub.tsf = timestamp; + res->pub.beacon_interval = beacon_interval; 
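+ /*
+  * The remaining public BSS fields are copied straight from the
+  * caller's arguments; the IE buffer allocated after the private
+  * area is attached just below.
+  */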
+ res->pub.capability = capability; + /* + * Since we do not know here whether the IEs are from a Beacon or Probe + * Response frame, we need to pick one of the options and only use it + * with the driver that does not provide the full Beacon/Probe Response + * frame. Use Beacon frame pointer to avoid indicating that this should + * override the information_elements pointer should we have received an + * earlier indication of Probe Response data. + * + * The initial buffer for the IEs is allocated with the BSS entry and + * is located after the private area. + */ + res->pub.beacon_ies = (u8 *)res + sizeof(*res) + privsz; + memcpy(res->pub.beacon_ies, ie, ielen); + res->pub.len_beacon_ies = ielen; + res->pub.information_elements = res->pub.beacon_ies; + res->pub.len_information_elements = res->pub.len_beacon_ies; + + kref_init(&res->ref); + + res = cfg80211_bss_update(wiphy_to_dev(wiphy), res); + if (!res) + return NULL; + + if (res->pub.capability & WLAN_CAPABILITY_ESS) + regulatory_hint_found_beacon(wiphy, channel, gfp); + + /* cfg80211_bss_update gives us a referenced result */ + return &res->pub; +} +EXPORT_SYMBOL(cfg80211_inform_bss); + +struct cfg80211_bss * +cfg80211_inform_bss_frame(struct wiphy *wiphy, + struct ieee80211_channel *channel, + struct ieee80211_mgmt *mgmt, size_t len, + s32 signal, gfp_t gfp) +{ + struct cfg80211_internal_bss *res; + size_t ielen = len - offsetof(struct ieee80211_mgmt, + u.probe_resp.variable); + size_t privsz; + + if (WARN_ON(!mgmt)) + return NULL; + + if (WARN_ON(!wiphy)) + return NULL; + + if (WARN_ON(wiphy->signal_type == CFG80211_SIGNAL_TYPE_UNSPEC && + (signal < 0 || signal > 100))) + return NULL; + + if (WARN_ON(len < offsetof(struct ieee80211_mgmt, u.probe_resp.variable))) + return NULL; + + privsz = wiphy->bss_priv_size; + + res = kzalloc(sizeof(*res) + privsz + ielen, gfp); + if (!res) + return NULL; + + memcpy(res->pub.bssid, mgmt->bssid, ETH_ALEN); + res->pub.channel = channel; + res->pub.signal = signal; + res->pub.tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp); + res->pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int); + res->pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info); + /* + * The initial buffer for the IEs is allocated with the BSS entry and + * is located after the private area. 
+ */ + if (ieee80211_is_probe_resp(mgmt->frame_control)) { + res->pub.proberesp_ies = (u8 *) res + sizeof(*res) + privsz; + memcpy(res->pub.proberesp_ies, mgmt->u.probe_resp.variable, + ielen); + res->pub.len_proberesp_ies = ielen; + res->pub.information_elements = res->pub.proberesp_ies; + res->pub.len_information_elements = res->pub.len_proberesp_ies; + } else { + res->pub.beacon_ies = (u8 *) res + sizeof(*res) + privsz; + memcpy(res->pub.beacon_ies, mgmt->u.beacon.variable, ielen); + res->pub.len_beacon_ies = ielen; + res->pub.information_elements = res->pub.beacon_ies; + res->pub.len_information_elements = res->pub.len_beacon_ies; + } + + kref_init(&res->ref); + + res = cfg80211_bss_update(wiphy_to_dev(wiphy), res); + if (!res) + return NULL; + + if (res->pub.capability & WLAN_CAPABILITY_ESS) + regulatory_hint_found_beacon(wiphy, channel, gfp); + + /* cfg80211_bss_update gives us a referenced result */ + return &res->pub; +} +EXPORT_SYMBOL(cfg80211_inform_bss_frame); + +void cfg80211_put_bss(struct cfg80211_bss *pub) +{ + struct cfg80211_internal_bss *bss; + + if (!pub) + return; + + bss = container_of(pub, struct cfg80211_internal_bss, pub); + kref_put(&bss->ref, bss_release); +} +EXPORT_SYMBOL(cfg80211_put_bss); + +void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) +{ + struct cfg80211_registered_device *dev = wiphy_to_dev(wiphy); + struct cfg80211_internal_bss *bss; + + if (WARN_ON(!pub)) + return; + + bss = container_of(pub, struct cfg80211_internal_bss, pub); + + spin_lock_bh(&dev->bss_lock); + if (!list_empty(&bss->list)) { + __cfg80211_unlink_bss(dev, bss); + dev->bss_generation++; + } + spin_unlock_bh(&dev->bss_lock); +} +EXPORT_SYMBOL(cfg80211_unlink_bss); + +#ifdef CONFIG_CFG80211_WEXT +int cfg80211_wext_siwscan(struct net_device *dev, + struct iw_request_info *info, + union iwreq_data *wrqu, char *extra) +{ + struct cfg80211_registered_device *rdev; + struct wiphy *wiphy; + struct iw_scan_req *wreq = NULL; + struct cfg80211_scan_request *creq = NULL; + int i, err, n_channels = 0; + enum ieee80211_band band; + + if (!netif_running(dev)) + return -ENETDOWN; + + if (wrqu->data.length == sizeof(struct iw_scan_req)) + wreq = (struct iw_scan_req *)extra; + + rdev = cfg80211_get_dev_from_ifindex(dev_net(dev), dev->ifindex); + + if (IS_ERR(rdev)) + return PTR_ERR(rdev); + + if (rdev->scan_req) { + err = -EBUSY; + goto out; + } + + wiphy = &rdev->wiphy; + + /* Determine number of channels, needed to allocate creq */ + if (wreq && wreq->num_channels) + n_channels = wreq->num_channels; + else { + for (band = 0; band < IEEE80211_NUM_BANDS; band++) + if (wiphy->bands[band]) + n_channels += wiphy->bands[band]->n_channels; + } + + creq = kzalloc(sizeof(*creq) + sizeof(struct cfg80211_ssid) + + n_channels * sizeof(void *), + GFP_ATOMIC); + if (!creq) { + err = -ENOMEM; + goto out; + } + + creq->wiphy = wiphy; + creq->dev = dev; + /* SSIDs come after channels */ + creq->ssids = (void *)&creq->channels[n_channels]; + creq->n_channels = n_channels; + creq->n_ssids = 1; + + /* translate "Scan on frequencies" request */ + i = 0; + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + int j; + + if (!wiphy->bands[band]) + continue; + + for (j = 0; j < wiphy->bands[band]->n_channels; j++) { + /* ignore disabled channels */ + if (wiphy->bands[band]->channels[j].flags & + IEEE80211_CHAN_DISABLED) + continue; + + /* If we have a wireless request structure and the + * wireless request specifies frequencies, then search + * for the matching hardware channel. 
+ */ + if (wreq && wreq->num_channels) { + int k; + int wiphy_freq = wiphy->bands[band]->channels[j].center_freq; + for (k = 0; k < wreq->num_channels; k++) { + int wext_freq = cfg80211_wext_freq(wiphy, &wreq->channel_list[k]); + if (wext_freq == wiphy_freq) + goto wext_freq_found; + } + goto wext_freq_not_found; + } + + wext_freq_found: + creq->channels[i] = &wiphy->bands[band]->channels[j]; + i++; + wext_freq_not_found: ; + } + } + /* No channels found? */ + if (!i) { + err = -EINVAL; + goto out; + } + + /* Set real number of channels specified in creq->channels[] */ + creq->n_channels = i; + + /* translate "Scan for SSID" request */ + if (wreq) { + if (wrqu->data.flags & IW_SCAN_THIS_ESSID) { + if (wreq->essid_len > IEEE80211_MAX_SSID_LEN) { + err = -EINVAL; + goto out; + } + memcpy(creq->ssids[0].ssid, wreq->essid, wreq->essid_len); + creq->ssids[0].ssid_len = wreq->essid_len; + } + if (wreq->scan_type == IW_SCAN_TYPE_PASSIVE) + creq->n_ssids = 0; + } + + rdev->scan_req = creq; + err = rdev->ops->scan(wiphy, dev, creq); + if (err) { + rdev->scan_req = NULL; + /* creq will be freed below */ + } else { + nl80211_send_scan_start(rdev, dev); + /* creq now owned by driver */ + creq = NULL; + dev_hold(dev); + } + out: + kfree(creq); + cfg80211_unlock_rdev(rdev); + return err; +} +EXPORT_SYMBOL_GPL(cfg80211_wext_siwscan); + +static void ieee80211_scan_add_ies(struct iw_request_info *info, + struct cfg80211_bss *bss, + char **current_ev, char *end_buf) +{ + u8 *pos, *end, *next; + struct iw_event iwe; + + if (!bss->information_elements || + !bss->len_information_elements) + return; + + /* + * If needed, fragment the IEs buffer (at IE boundaries) into short + * enough fragments to fit into IW_GENERIC_IE_MAX octet messages. + */ + pos = bss->information_elements; + end = pos + bss->len_information_elements; + + while (end - pos > IW_GENERIC_IE_MAX) { + next = pos + 2 + pos[1]; + while (next + 2 + next[1] - pos < IW_GENERIC_IE_MAX) + next = next + 2 + next[1]; + + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = IWEVGENIE; + iwe.u.data.length = next - pos; + *current_ev = iwe_stream_add_point(info, *current_ev, + end_buf, &iwe, pos); + + pos = next; + } + + if (end > pos) { + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = IWEVGENIE; + iwe.u.data.length = end - pos; + *current_ev = iwe_stream_add_point(info, *current_ev, + end_buf, &iwe, pos); + } +} + +static inline unsigned int elapsed_jiffies_msecs(unsigned long start) +{ + unsigned long end = jiffies; + + if (end >= start) + return jiffies_to_msecs(end - start); + + return jiffies_to_msecs(end + (MAX_JIFFY_OFFSET - start) + 1); +} + +static char * +ieee80211_bss(struct wiphy *wiphy, struct iw_request_info *info, + struct cfg80211_internal_bss *bss, char *current_ev, + char *end_buf) +{ + struct iw_event iwe; + u8 *buf, *cfg, *p; + u8 *ie = bss->pub.information_elements; + int rem = bss->pub.len_information_elements, i, sig; + bool ismesh = false; + + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWAP; + iwe.u.ap_addr.sa_family = ARPHRD_ETHER; + memcpy(iwe.u.ap_addr.sa_data, bss->pub.bssid, ETH_ALEN); + current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, + IW_EV_ADDR_LEN); + + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWFREQ; + iwe.u.freq.m = ieee80211_frequency_to_channel(bss->pub.channel->center_freq); + iwe.u.freq.e = 0; + current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, + IW_EV_FREQ_LEN); + + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWFREQ; + iwe.u.freq.m = bss->pub.channel->center_freq; + iwe.u.freq.e = 6; 
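+ /*
+  * The frequency is reported twice for wireless extensions: first
+  * as a channel number (exponent 0) and here as the raw frequency,
+  * where center_freq is in MHz and the exponent of 6 scales it to Hz.
+  */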
+ current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, + IW_EV_FREQ_LEN); + + if (wiphy->signal_type != CFG80211_SIGNAL_TYPE_NONE) { + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = IWEVQUAL; + iwe.u.qual.updated = IW_QUAL_LEVEL_UPDATED | + IW_QUAL_NOISE_INVALID | + IW_QUAL_QUAL_UPDATED; + switch (wiphy->signal_type) { + case CFG80211_SIGNAL_TYPE_MBM: + sig = bss->pub.signal / 100; + iwe.u.qual.level = sig; + iwe.u.qual.updated |= IW_QUAL_DBM; + if (sig < -110) /* rather bad */ + sig = -110; + else if (sig > -40) /* perfect */ + sig = -40; + /* will give a range of 0 .. 70 */ + iwe.u.qual.qual = sig + 110; + break; + case CFG80211_SIGNAL_TYPE_UNSPEC: + iwe.u.qual.level = bss->pub.signal; + /* will give range 0 .. 100 */ + iwe.u.qual.qual = bss->pub.signal; + break; + default: + /* not reached */ + break; + } + current_ev = iwe_stream_add_event(info, current_ev, end_buf, + &iwe, IW_EV_QUAL_LEN); + } + + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWENCODE; + if (bss->pub.capability & WLAN_CAPABILITY_PRIVACY) + iwe.u.data.flags = IW_ENCODE_ENABLED | IW_ENCODE_NOKEY; + else + iwe.u.data.flags = IW_ENCODE_DISABLED; + iwe.u.data.length = 0; + current_ev = iwe_stream_add_point(info, current_ev, end_buf, + &iwe, ""); + + while (rem >= 2) { + /* invalid data */ + if (ie[1] > rem - 2) + break; + + switch (ie[0]) { + case WLAN_EID_SSID: + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWESSID; + iwe.u.data.length = ie[1]; + iwe.u.data.flags = 1; + current_ev = iwe_stream_add_point(info, current_ev, end_buf, + &iwe, ie + 2); + break; + case WLAN_EID_MESH_ID: + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWESSID; + iwe.u.data.length = ie[1]; + iwe.u.data.flags = 1; + current_ev = iwe_stream_add_point(info, current_ev, end_buf, + &iwe, ie + 2); + break; + case WLAN_EID_MESH_CONFIG: + ismesh = true; + if (ie[1] != sizeof(struct ieee80211_meshconf_ie)) + break; + buf = kmalloc(50, GFP_ATOMIC); + if (!buf) + break; + cfg = ie + 2; + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = IWEVCUSTOM; + sprintf(buf, "Mesh Network Path Selection Protocol ID: " + "0x%02X", cfg[0]); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point(info, current_ev, + end_buf, + &iwe, buf); + sprintf(buf, "Path Selection Metric ID: 0x%02X", + cfg[1]); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point(info, current_ev, + end_buf, + &iwe, buf); + sprintf(buf, "Congestion Control Mode ID: 0x%02X", + cfg[2]); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point(info, current_ev, + end_buf, + &iwe, buf); + sprintf(buf, "Synchronization ID: 0x%02X", cfg[3]); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point(info, current_ev, + end_buf, + &iwe, buf); + sprintf(buf, "Authentication ID: 0x%02X", cfg[4]); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point(info, current_ev, + end_buf, + &iwe, buf); + sprintf(buf, "Formation Info: 0x%02X", cfg[5]); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point(info, current_ev, + end_buf, + &iwe, buf); + sprintf(buf, "Capabilities: 0x%02X", cfg[6]); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point(info, current_ev, + end_buf, + &iwe, buf); + kfree(buf); + break; + case WLAN_EID_SUPP_RATES: + case WLAN_EID_EXT_SUPP_RATES: + /* display all supported rates in readable format */ + p = current_ev + iwe_stream_lcp_len(info); + + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWRATE; + /* Those two flags are ignored... 
*/ + iwe.u.bitrate.fixed = iwe.u.bitrate.disabled = 0; + + for (i = 0; i < ie[1]; i++) { + iwe.u.bitrate.value = + ((ie[i + 2] & 0x7f) * 500000); + p = iwe_stream_add_value(info, current_ev, p, + end_buf, &iwe, IW_EV_PARAM_LEN); + } + current_ev = p; + break; + } + rem -= ie[1] + 2; + ie += ie[1] + 2; + } + + if (bss->pub.capability & (WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS) || + ismesh) { + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = SIOCGIWMODE; + if (ismesh) + iwe.u.mode = IW_MODE_MESH; + else if (bss->pub.capability & WLAN_CAPABILITY_ESS) + iwe.u.mode = IW_MODE_MASTER; + else + iwe.u.mode = IW_MODE_ADHOC; + current_ev = iwe_stream_add_event(info, current_ev, end_buf, + &iwe, IW_EV_UINT_LEN); + } + + buf = kmalloc(30, GFP_ATOMIC); + if (buf) { + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = IWEVCUSTOM; + sprintf(buf, "tsf=%016llx", (unsigned long long)(bss->pub.tsf)); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point(info, current_ev, end_buf, + &iwe, buf); + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = IWEVCUSTOM; + sprintf(buf, " Last beacon: %ums ago", + elapsed_jiffies_msecs(bss->ts)); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point(info, current_ev, + end_buf, &iwe, buf); + kfree(buf); + } + + ieee80211_scan_add_ies(info, &bss->pub, ¤t_ev, end_buf); + + return current_ev; +} + + +static int ieee80211_scan_results(struct cfg80211_registered_device *dev, + struct iw_request_info *info, + char *buf, size_t len) +{ + char *current_ev = buf; + char *end_buf = buf + len; + struct cfg80211_internal_bss *bss; + + spin_lock_bh(&dev->bss_lock); + cfg80211_bss_expire(dev); + + list_for_each_entry(bss, &dev->bss_list, list) { + if (buf + len - current_ev <= IW_EV_ADDR_LEN) { + spin_unlock_bh(&dev->bss_lock); + return -E2BIG; + } + current_ev = ieee80211_bss(&dev->wiphy, info, bss, + current_ev, end_buf); + } + spin_unlock_bh(&dev->bss_lock); + return current_ev - buf; +} + + +int cfg80211_wext_giwscan(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *extra) +{ + struct cfg80211_registered_device *rdev; + int res; + + if (!netif_running(dev)) + return -ENETDOWN; + + rdev = cfg80211_get_dev_from_ifindex(dev_net(dev), dev->ifindex); + + if (IS_ERR(rdev)) + return PTR_ERR(rdev); + + if (rdev->scan_req) { + res = -EAGAIN; + goto out; + } + + res = ieee80211_scan_results(rdev, info, extra, data->length); + data->length = 0; + if (res >= 0) { + data->length = res; + res = 0; + } + + out: + cfg80211_unlock_rdev(rdev); + return res; +} +EXPORT_SYMBOL_GPL(cfg80211_wext_giwscan); +#endif diff --git a/net/wireless/sme.c b/net/wireless/sme.c new file mode 100644 index 00000000..cf4be212 --- /dev/null +++ b/net/wireless/sme.c @@ -0,0 +1,1020 @@ +/* + * SME code for cfg80211's connect emulation. + * + * Copyright 2009 Johannes Berg + * Copyright (C) 2009 Intel Corporation. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "nl80211.h" +#include "reg.h" + +struct cfg80211_conn { + struct cfg80211_connect_params params; + /* these are sub-states of the _CONNECTING sme_state */ + enum { + CFG80211_CONN_IDLE, + CFG80211_CONN_SCANNING, + CFG80211_CONN_SCAN_AGAIN, + CFG80211_CONN_AUTHENTICATE_NEXT, + CFG80211_CONN_AUTHENTICATING, + CFG80211_CONN_ASSOCIATE_NEXT, + CFG80211_CONN_ASSOCIATING, + CFG80211_CONN_DEAUTH_ASSOC_FAIL, + } state; + u8 bssid[ETH_ALEN], prev_bssid[ETH_ALEN]; + u8 *ie; + size_t ie_len; + bool auto_auth, prev_bssid_valid; +}; + +static bool cfg80211_is_all_idle(void) +{ + struct cfg80211_registered_device *rdev; + struct wireless_dev *wdev; + bool is_all_idle = true; + + mutex_lock(&cfg80211_mutex); + + /* + * All devices must be idle as otherwise if you are actively + * scanning some new beacon hints could be learned and would + * count as new regulatory hints. + */ + list_for_each_entry(rdev, &cfg80211_rdev_list, list) { + cfg80211_lock_rdev(rdev); + list_for_each_entry(wdev, &rdev->netdev_list, list) { + wdev_lock(wdev); + if (wdev->sme_state != CFG80211_SME_IDLE) + is_all_idle = false; + wdev_unlock(wdev); + } + cfg80211_unlock_rdev(rdev); + } + + mutex_unlock(&cfg80211_mutex); + + return is_all_idle; +} + +static void disconnect_work(struct work_struct *work) +{ + if (!cfg80211_is_all_idle()) + return; + + regulatory_hint_disconnect(); +} + +static DECLARE_WORK(cfg80211_disconnect_work, disconnect_work); + +static int cfg80211_conn_scan(struct wireless_dev *wdev) +{ + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_scan_request *request; + int n_channels, err; + + ASSERT_RTNL(); + ASSERT_RDEV_LOCK(rdev); + ASSERT_WDEV_LOCK(wdev); + + if (rdev->scan_req) + return -EBUSY; + + if (wdev->conn->params.channel) { + n_channels = 1; + } else { + enum ieee80211_band band; + n_channels = 0; + + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + if (!wdev->wiphy->bands[band]) + continue; + n_channels += wdev->wiphy->bands[band]->n_channels; + } + } + request = kzalloc(sizeof(*request) + sizeof(request->ssids[0]) + + sizeof(request->channels[0]) * n_channels, + GFP_KERNEL); + if (!request) + return -ENOMEM; + + if (wdev->conn->params.channel) + request->channels[0] = wdev->conn->params.channel; + else { + int i = 0, j; + enum ieee80211_band band; + + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + if (!wdev->wiphy->bands[band]) + continue; + for (j = 0; j < wdev->wiphy->bands[band]->n_channels; + i++, j++) + request->channels[i] = + &wdev->wiphy->bands[band]->channels[j]; + } + } + request->n_channels = n_channels; + request->ssids = (void *)&request->channels[n_channels]; + request->n_ssids = 1; + + memcpy(request->ssids[0].ssid, wdev->conn->params.ssid, + wdev->conn->params.ssid_len); + request->ssids[0].ssid_len = wdev->conn->params.ssid_len; + + request->dev = wdev->netdev; + request->wiphy = &rdev->wiphy; + + rdev->scan_req = request; + + err = rdev->ops->scan(wdev->wiphy, wdev->netdev, request); + if (!err) { + wdev->conn->state = CFG80211_CONN_SCANNING; + nl80211_send_scan_start(rdev, wdev->netdev); + dev_hold(wdev->netdev); + } else { + rdev->scan_req = NULL; + kfree(request); + } + return err; +} + +static int cfg80211_conn_do_work(struct wireless_dev *wdev) +{ + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_connect_params *params; + const u8 *prev_bssid = NULL; + int err; + + ASSERT_WDEV_LOCK(wdev); + + if 
(!wdev->conn) + return 0; + + params = &wdev->conn->params; + + switch (wdev->conn->state) { + case CFG80211_CONN_SCAN_AGAIN: + return cfg80211_conn_scan(wdev); + case CFG80211_CONN_AUTHENTICATE_NEXT: + BUG_ON(!rdev->ops->auth); + wdev->conn->state = CFG80211_CONN_AUTHENTICATING; + return __cfg80211_mlme_auth(rdev, wdev->netdev, + params->channel, params->auth_type, + params->bssid, + params->ssid, params->ssid_len, + NULL, 0, + params->key, params->key_len, + params->key_idx, false); + case CFG80211_CONN_ASSOCIATE_NEXT: + BUG_ON(!rdev->ops->assoc); + wdev->conn->state = CFG80211_CONN_ASSOCIATING; + if (wdev->conn->prev_bssid_valid) + prev_bssid = wdev->conn->prev_bssid; + err = __cfg80211_mlme_assoc(rdev, wdev->netdev, + params->channel, params->bssid, + prev_bssid, + params->ssid, params->ssid_len, + params->ie, params->ie_len, + false, ¶ms->crypto); + if (err) + __cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid, + NULL, 0, + WLAN_REASON_DEAUTH_LEAVING, + false); + return err; + case CFG80211_CONN_DEAUTH_ASSOC_FAIL: + __cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid, + NULL, 0, + WLAN_REASON_DEAUTH_LEAVING, false); + /* return an error so that we call __cfg80211_connect_result() */ + return -EINVAL; + default: + return 0; + } +} + +void cfg80211_conn_work(struct work_struct *work) +{ + struct cfg80211_registered_device *rdev = + container_of(work, struct cfg80211_registered_device, conn_work); + struct wireless_dev *wdev; + u8 bssid_buf[ETH_ALEN], *bssid = NULL; + + rtnl_lock(); + cfg80211_lock_rdev(rdev); + mutex_lock(&rdev->devlist_mtx); + + list_for_each_entry(wdev, &rdev->netdev_list, list) { + wdev_lock(wdev); + if (!netif_running(wdev->netdev)) { + wdev_unlock(wdev); + continue; + } + if (wdev->sme_state != CFG80211_SME_CONNECTING) { + wdev_unlock(wdev); + continue; + } + if (wdev->conn->params.bssid) { + memcpy(bssid_buf, wdev->conn->params.bssid, ETH_ALEN); + bssid = bssid_buf; + } + if (cfg80211_conn_do_work(wdev)) + __cfg80211_connect_result( + wdev->netdev, bssid, + NULL, 0, NULL, 0, + WLAN_STATUS_UNSPECIFIED_FAILURE, + false, NULL); + wdev_unlock(wdev); + } + + mutex_unlock(&rdev->devlist_mtx); + cfg80211_unlock_rdev(rdev); + rtnl_unlock(); +} + +static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev) +{ + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_bss *bss; + u16 capa = WLAN_CAPABILITY_ESS; + + ASSERT_WDEV_LOCK(wdev); + + if (wdev->conn->params.privacy) + capa |= WLAN_CAPABILITY_PRIVACY; + + bss = cfg80211_get_bss(wdev->wiphy, wdev->conn->params.channel, + wdev->conn->params.bssid, + wdev->conn->params.ssid, + wdev->conn->params.ssid_len, + WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_PRIVACY, + capa); + if (!bss) + return NULL; + + memcpy(wdev->conn->bssid, bss->bssid, ETH_ALEN); + wdev->conn->params.bssid = wdev->conn->bssid; + wdev->conn->params.channel = bss->channel; + wdev->conn->state = CFG80211_CONN_AUTHENTICATE_NEXT; + schedule_work(&rdev->conn_work); + + return bss; +} + +static void __cfg80211_sme_scan_done(struct net_device *dev) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_bss *bss; + + ASSERT_WDEV_LOCK(wdev); + + if (wdev->sme_state != CFG80211_SME_CONNECTING) + return; + + if (!wdev->conn) + return; + + if (wdev->conn->state != CFG80211_CONN_SCANNING && + wdev->conn->state != CFG80211_CONN_SCAN_AGAIN) + return; + + bss = cfg80211_get_conn_bss(wdev); + if (bss) { + cfg80211_put_bss(bss); + } 
else { + /* not found */ + if (wdev->conn->state == CFG80211_CONN_SCAN_AGAIN) + schedule_work(&rdev->conn_work); + else + __cfg80211_connect_result( + wdev->netdev, + wdev->conn->params.bssid, + NULL, 0, NULL, 0, + WLAN_STATUS_UNSPECIFIED_FAILURE, + false, NULL); + } +} + +void cfg80211_sme_scan_done(struct net_device *dev) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + mutex_lock(&wiphy_to_dev(wdev->wiphy)->devlist_mtx); + wdev_lock(wdev); + __cfg80211_sme_scan_done(dev); + wdev_unlock(wdev); + mutex_unlock(&wiphy_to_dev(wdev->wiphy)->devlist_mtx); +} + +void cfg80211_sme_rx_auth(struct net_device *dev, + const u8 *buf, size_t len) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; + u16 status_code = le16_to_cpu(mgmt->u.auth.status_code); + + ASSERT_WDEV_LOCK(wdev); + + /* should only RX auth frames when connecting */ + if (wdev->sme_state != CFG80211_SME_CONNECTING) + return; + + if (WARN_ON(!wdev->conn)) + return; + + if (status_code == WLAN_STATUS_NOT_SUPPORTED_AUTH_ALG && + wdev->conn->auto_auth && + wdev->conn->params.auth_type != NL80211_AUTHTYPE_NETWORK_EAP) { + /* select automatically between only open, shared, leap */ + switch (wdev->conn->params.auth_type) { + case NL80211_AUTHTYPE_OPEN_SYSTEM: + if (wdev->connect_keys) + wdev->conn->params.auth_type = + NL80211_AUTHTYPE_SHARED_KEY; + else + wdev->conn->params.auth_type = + NL80211_AUTHTYPE_NETWORK_EAP; + break; + case NL80211_AUTHTYPE_SHARED_KEY: + wdev->conn->params.auth_type = + NL80211_AUTHTYPE_NETWORK_EAP; + break; + default: + /* huh? */ + wdev->conn->params.auth_type = + NL80211_AUTHTYPE_OPEN_SYSTEM; + break; + } + wdev->conn->state = CFG80211_CONN_AUTHENTICATE_NEXT; + schedule_work(&rdev->conn_work); + } else if (status_code != WLAN_STATUS_SUCCESS) { + __cfg80211_connect_result(dev, mgmt->bssid, NULL, 0, NULL, 0, + status_code, false, NULL); + } else if (wdev->sme_state == CFG80211_SME_CONNECTING && + wdev->conn->state == CFG80211_CONN_AUTHENTICATING) { + wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT; + schedule_work(&rdev->conn_work); + } +} + +bool cfg80211_sme_failed_reassoc(struct wireless_dev *wdev) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + if (WARN_ON(!wdev->conn)) + return false; + + if (!wdev->conn->prev_bssid_valid) + return false; + + /* + * Some stupid APs don't accept reassoc, so we + * need to fall back to trying regular assoc. 
+ */ + wdev->conn->prev_bssid_valid = false; + wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT; + schedule_work(&rdev->conn_work); + + return true; +} + +void cfg80211_sme_failed_assoc(struct wireless_dev *wdev) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + wdev->conn->state = CFG80211_CONN_DEAUTH_ASSOC_FAIL; + schedule_work(&rdev->conn_work); +} + +void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, + const u8 *req_ie, size_t req_ie_len, + const u8 *resp_ie, size_t resp_ie_len, + u16 status, bool wextev, + struct cfg80211_bss *bss) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + u8 *country_ie; +#ifdef CONFIG_CFG80211_WEXT + union iwreq_data wrqu; +#endif + + ASSERT_WDEV_LOCK(wdev); + + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && + wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) + return; + + if (wdev->sme_state != CFG80211_SME_CONNECTING) + return; + + nl80211_send_connect_result(wiphy_to_dev(wdev->wiphy), dev, + bssid, req_ie, req_ie_len, + resp_ie, resp_ie_len, + status, GFP_KERNEL); + +#ifdef CONFIG_CFG80211_WEXT + if (wextev) { + if (req_ie && status == WLAN_STATUS_SUCCESS) { + memset(&wrqu, 0, sizeof(wrqu)); + wrqu.data.length = req_ie_len; + wireless_send_event(dev, IWEVASSOCREQIE, &wrqu, req_ie); + } + + if (resp_ie && status == WLAN_STATUS_SUCCESS) { + memset(&wrqu, 0, sizeof(wrqu)); + wrqu.data.length = resp_ie_len; + wireless_send_event(dev, IWEVASSOCRESPIE, &wrqu, resp_ie); + } + + memset(&wrqu, 0, sizeof(wrqu)); + wrqu.ap_addr.sa_family = ARPHRD_ETHER; + if (bssid && status == WLAN_STATUS_SUCCESS) { + memcpy(wrqu.ap_addr.sa_data, bssid, ETH_ALEN); + memcpy(wdev->wext.prev_bssid, bssid, ETH_ALEN); + wdev->wext.prev_bssid_valid = true; + } + wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL); + } +#endif + + if (wdev->current_bss) { + cfg80211_unhold_bss(wdev->current_bss); + cfg80211_put_bss(&wdev->current_bss->pub); + wdev->current_bss = NULL; + } + + if (wdev->conn) + wdev->conn->state = CFG80211_CONN_IDLE; + + if (status != WLAN_STATUS_SUCCESS) { + wdev->sme_state = CFG80211_SME_IDLE; + if (wdev->conn) + kfree(wdev->conn->ie); + kfree(wdev->conn); + wdev->conn = NULL; + kfree(wdev->connect_keys); + wdev->connect_keys = NULL; + wdev->ssid_len = 0; + return; + } + + if (!bss) + bss = cfg80211_get_bss(wdev->wiphy, + wdev->conn ? 
wdev->conn->params.channel : + NULL, + bssid, + wdev->ssid, wdev->ssid_len, + WLAN_CAPABILITY_ESS, + WLAN_CAPABILITY_ESS); + + if (WARN_ON(!bss)) + return; + + cfg80211_hold_bss(bss_from_pub(bss)); + wdev->current_bss = bss_from_pub(bss); + + wdev->sme_state = CFG80211_SME_CONNECTED; + cfg80211_upload_connect_keys(wdev); + + country_ie = (u8 *) ieee80211_bss_get_ie(bss, WLAN_EID_COUNTRY); + + if (!country_ie) + return; + + /* + * ieee80211_bss_get_ie() ensures we can access: + * - country_ie + 2, the start of the country ie data, and + * - and country_ie[1] which is the IE length + */ + regulatory_hint_11d(wdev->wiphy, + bss->channel->band, + country_ie + 2, + country_ie[1]); +} + +void cfg80211_connect_result(struct net_device *dev, const u8 *bssid, + const u8 *req_ie, size_t req_ie_len, + const u8 *resp_ie, size_t resp_ie_len, + u16 status, gfp_t gfp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_event *ev; + unsigned long flags; + + CFG80211_DEV_WARN_ON(wdev->sme_state != CFG80211_SME_CONNECTING); + + ev = kzalloc(sizeof(*ev) + req_ie_len + resp_ie_len, gfp); + if (!ev) + return; + + ev->type = EVENT_CONNECT_RESULT; + if (bssid) + memcpy(ev->cr.bssid, bssid, ETH_ALEN); + if (req_ie_len) { + ev->cr.req_ie = ((u8 *)ev) + sizeof(*ev); + ev->cr.req_ie_len = req_ie_len; + memcpy((void *)ev->cr.req_ie, req_ie, req_ie_len); + } + if (resp_ie_len) { + ev->cr.resp_ie = ((u8 *)ev) + sizeof(*ev) + req_ie_len; + ev->cr.resp_ie_len = resp_ie_len; + memcpy((void *)ev->cr.resp_ie, resp_ie, resp_ie_len); + } + ev->cr.status = status; + + spin_lock_irqsave(&wdev->event_lock, flags); + list_add_tail(&ev->list, &wdev->event_list); + spin_unlock_irqrestore(&wdev->event_lock, flags); + queue_work(cfg80211_wq, &rdev->event_work); +} +EXPORT_SYMBOL(cfg80211_connect_result); + +void __cfg80211_roamed(struct wireless_dev *wdev, + struct ieee80211_channel *channel, + const u8 *bssid, + const u8 *req_ie, size_t req_ie_len, + const u8 *resp_ie, size_t resp_ie_len) +{ + struct cfg80211_bss *bss; +#ifdef CONFIG_CFG80211_WEXT + union iwreq_data wrqu; +#endif + + ASSERT_WDEV_LOCK(wdev); + + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && + wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) + return; + + if (wdev->sme_state != CFG80211_SME_CONNECTED) + return; + + /* internal error -- how did we get to CONNECTED w/o BSS? 
*/ + if (WARN_ON(!wdev->current_bss)) { + return; + } + + cfg80211_unhold_bss(wdev->current_bss); + cfg80211_put_bss(&wdev->current_bss->pub); + wdev->current_bss = NULL; + + bss = cfg80211_get_bss(wdev->wiphy, channel, bssid, + wdev->ssid, wdev->ssid_len, + WLAN_CAPABILITY_ESS, WLAN_CAPABILITY_ESS); + + if (WARN_ON(!bss)) + return; + + cfg80211_hold_bss(bss_from_pub(bss)); + wdev->current_bss = bss_from_pub(bss); + + nl80211_send_roamed(wiphy_to_dev(wdev->wiphy), wdev->netdev, bssid, + req_ie, req_ie_len, resp_ie, resp_ie_len, + GFP_KERNEL); + +#ifdef CONFIG_CFG80211_WEXT + if (req_ie) { + memset(&wrqu, 0, sizeof(wrqu)); + wrqu.data.length = req_ie_len; + wireless_send_event(wdev->netdev, IWEVASSOCREQIE, + &wrqu, req_ie); + } + + if (resp_ie) { + memset(&wrqu, 0, sizeof(wrqu)); + wrqu.data.length = resp_ie_len; + wireless_send_event(wdev->netdev, IWEVASSOCRESPIE, + &wrqu, resp_ie); + } + + memset(&wrqu, 0, sizeof(wrqu)); + wrqu.ap_addr.sa_family = ARPHRD_ETHER; + memcpy(wrqu.ap_addr.sa_data, bssid, ETH_ALEN); + memcpy(wdev->wext.prev_bssid, bssid, ETH_ALEN); + wdev->wext.prev_bssid_valid = true; + wireless_send_event(wdev->netdev, SIOCGIWAP, &wrqu, NULL); +#endif +} + +void cfg80211_roamed(struct net_device *dev, + struct ieee80211_channel *channel, + const u8 *bssid, + const u8 *req_ie, size_t req_ie_len, + const u8 *resp_ie, size_t resp_ie_len, gfp_t gfp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_event *ev; + unsigned long flags; + + CFG80211_DEV_WARN_ON(wdev->sme_state != CFG80211_SME_CONNECTED); + + ev = kzalloc(sizeof(*ev) + req_ie_len + resp_ie_len, gfp); + if (!ev) + return; + + ev->type = EVENT_ROAMED; + ev->rm.channel = channel; + memcpy(ev->rm.bssid, bssid, ETH_ALEN); + ev->rm.req_ie = ((u8 *)ev) + sizeof(*ev); + ev->rm.req_ie_len = req_ie_len; + memcpy((void *)ev->rm.req_ie, req_ie, req_ie_len); + ev->rm.resp_ie = ((u8 *)ev) + sizeof(*ev) + req_ie_len; + ev->rm.resp_ie_len = resp_ie_len; + memcpy((void *)ev->rm.resp_ie, resp_ie, resp_ie_len); + + spin_lock_irqsave(&wdev->event_lock, flags); + list_add_tail(&ev->list, &wdev->event_list); + spin_unlock_irqrestore(&wdev->event_lock, flags); + queue_work(cfg80211_wq, &rdev->event_work); +} +EXPORT_SYMBOL(cfg80211_roamed); + +void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, + size_t ie_len, u16 reason, bool from_ap) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + int i; +#ifdef CONFIG_CFG80211_WEXT + union iwreq_data wrqu; +#endif + + ASSERT_WDEV_LOCK(wdev); + + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && + wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) + return; + +#ifndef CONFIG_CFG80211_ALLOW_RECONNECT + if (wdev->sme_state != CFG80211_SME_CONNECTED) + return; +#endif + + if (wdev->current_bss) { + cfg80211_unhold_bss(wdev->current_bss); + cfg80211_put_bss(&wdev->current_bss->pub); + } + + wdev->current_bss = NULL; + wdev->sme_state = CFG80211_SME_IDLE; + wdev->ssid_len = 0; + + if (wdev->conn) { + const u8 *bssid; + int ret; + + kfree(wdev->conn->ie); + wdev->conn->ie = NULL; + kfree(wdev->conn); + wdev->conn = NULL; + + /* + * If this disconnect was due to a disassoc, we + * we might still have an auth BSS around. For + * the userspace SME that's currently expected, + * but for the kernel SME (nl80211 CONNECT or + * wireless extensions) we want to clear up all + * state. 
+ */ + for (i = 0; i < MAX_AUTH_BSSES; i++) { + if (!wdev->auth_bsses[i]) + continue; + bssid = wdev->auth_bsses[i]->pub.bssid; + ret = __cfg80211_mlme_deauth(rdev, dev, bssid, NULL, 0, + WLAN_REASON_DEAUTH_LEAVING, + false); + WARN(ret, "deauth failed: %d\n", ret); + } + } + + nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap); + + /* + * Delete all the keys ... pairwise keys can't really + * exist any more anyway, but default keys might. + */ + if (rdev->ops->del_key) + for (i = 0; i < 6; i++) + rdev->ops->del_key(wdev->wiphy, dev, i, false, NULL); + +#ifdef CONFIG_CFG80211_WEXT + memset(&wrqu, 0, sizeof(wrqu)); + wrqu.ap_addr.sa_family = ARPHRD_ETHER; + wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL); + wdev->wext.connect.ssid_len = 0; +#endif + + schedule_work(&cfg80211_disconnect_work); +} + +void cfg80211_disconnected(struct net_device *dev, u16 reason, + u8 *ie, size_t ie_len, gfp_t gfp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_event *ev; + unsigned long flags; + + CFG80211_DEV_WARN_ON(wdev->sme_state != CFG80211_SME_CONNECTED); + + ev = kzalloc(sizeof(*ev) + ie_len, gfp); + if (!ev) + return; + + ev->type = EVENT_DISCONNECTED; + ev->dc.ie = ((u8 *)ev) + sizeof(*ev); + ev->dc.ie_len = ie_len; + memcpy((void *)ev->dc.ie, ie, ie_len); + ev->dc.reason = reason; + + spin_lock_irqsave(&wdev->event_lock, flags); + list_add_tail(&ev->list, &wdev->event_list); + spin_unlock_irqrestore(&wdev->event_lock, flags); + queue_work(cfg80211_wq, &rdev->event_work); +} +EXPORT_SYMBOL(cfg80211_disconnected); + +int __cfg80211_connect(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_connect_params *connect, + struct cfg80211_cached_keys *connkeys, + const u8 *prev_bssid) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_bss *bss = NULL; + int err; + + ASSERT_WDEV_LOCK(wdev); + +#ifndef CONFIG_CFG80211_ALLOW_RECONNECT + if (wdev->sme_state != CFG80211_SME_IDLE) + return -EALREADY; + + if (WARN_ON(wdev->connect_keys)) { +#else + if (wdev->connect_keys) { +#endif + kfree(wdev->connect_keys); + wdev->connect_keys = NULL; + } + + if (connkeys && connkeys->def >= 0) { + int idx; + u32 cipher; + + idx = connkeys->def; + cipher = connkeys->params[idx].cipher; + /* If given a WEP key we may need it for shared key auth */ + if (cipher == WLAN_CIPHER_SUITE_WEP40 || + cipher == WLAN_CIPHER_SUITE_WEP104) { + connect->key_idx = idx; + connect->key = connkeys->params[idx].key; + connect->key_len = connkeys->params[idx].key_len; + + /* + * If ciphers are not set (e.g. when going through + * iwconfig), we have to set them appropriately here. + */ + if (connect->crypto.cipher_group == 0) + connect->crypto.cipher_group = cipher; + + if (connect->crypto.n_ciphers_pairwise == 0) { + connect->crypto.n_ciphers_pairwise = 1; + connect->crypto.ciphers_pairwise[0] = cipher; + } + } + } + + if (!rdev->ops->connect) { + if (!rdev->ops->auth || !rdev->ops->assoc) + return -EOPNOTSUPP; + + if (WARN_ON(wdev->conn)) + return -EINPROGRESS; + + wdev->conn = kzalloc(sizeof(*wdev->conn), GFP_KERNEL); + if (!wdev->conn) + return -ENOMEM; + + /* + * Copy all parameters, and treat explicitly IEs, BSSID, SSID. 
+ */ + memcpy(&wdev->conn->params, connect, sizeof(*connect)); + if (connect->bssid) { + wdev->conn->params.bssid = wdev->conn->bssid; + memcpy(wdev->conn->bssid, connect->bssid, ETH_ALEN); + } + + if (connect->ie) { + wdev->conn->ie = kmemdup(connect->ie, connect->ie_len, + GFP_KERNEL); + wdev->conn->params.ie = wdev->conn->ie; + if (!wdev->conn->ie) { + kfree(wdev->conn); + wdev->conn = NULL; + return -ENOMEM; + } + } + + if (connect->auth_type == NL80211_AUTHTYPE_AUTOMATIC) { + wdev->conn->auto_auth = true; + /* start with open system ... should mostly work */ + wdev->conn->params.auth_type = + NL80211_AUTHTYPE_OPEN_SYSTEM; + } else { + wdev->conn->auto_auth = false; + } + + memcpy(wdev->ssid, connect->ssid, connect->ssid_len); + wdev->ssid_len = connect->ssid_len; + wdev->conn->params.ssid = wdev->ssid; + wdev->conn->params.ssid_len = connect->ssid_len; + + /* see if we have the bss already */ + bss = cfg80211_get_conn_bss(wdev); + + wdev->sme_state = CFG80211_SME_CONNECTING; + wdev->connect_keys = connkeys; + + if (prev_bssid) { + memcpy(wdev->conn->prev_bssid, prev_bssid, ETH_ALEN); + wdev->conn->prev_bssid_valid = true; + } + + /* we're good if we have a matching bss struct */ + if (bss) { + wdev->conn->state = CFG80211_CONN_AUTHENTICATE_NEXT; + err = cfg80211_conn_do_work(wdev); + cfg80211_put_bss(bss); + } else { + /* otherwise we'll need to scan for the AP first */ + err = cfg80211_conn_scan(wdev); + /* + * If we can't scan right now, then we need to scan again + * after the current scan finished, since the parameters + * changed (unless we find a good AP anyway). + */ + if (err == -EBUSY) { + err = 0; + wdev->conn->state = CFG80211_CONN_SCAN_AGAIN; + } + } + if (err) { + kfree(wdev->conn->ie); + kfree(wdev->conn); + wdev->conn = NULL; + wdev->sme_state = CFG80211_SME_IDLE; + wdev->connect_keys = NULL; + wdev->ssid_len = 0; + } + + return err; + } else { + wdev->sme_state = CFG80211_SME_CONNECTING; + wdev->connect_keys = connkeys; + err = rdev->ops->connect(&rdev->wiphy, dev, connect); + if (err) { + wdev->connect_keys = NULL; + wdev->sme_state = CFG80211_SME_IDLE; + return err; + } + + memcpy(wdev->ssid, connect->ssid, connect->ssid_len); + wdev->ssid_len = connect->ssid_len; + + return 0; + } +} + +int cfg80211_connect(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_connect_params *connect, + struct cfg80211_cached_keys *connkeys) +{ + int err; + + mutex_lock(&rdev->devlist_mtx); + wdev_lock(dev->ieee80211_ptr); + err = __cfg80211_connect(rdev, dev, connect, connkeys, NULL); + wdev_unlock(dev->ieee80211_ptr); + mutex_unlock(&rdev->devlist_mtx); + + return err; +} + +int __cfg80211_disconnect(struct cfg80211_registered_device *rdev, + struct net_device *dev, u16 reason, bool wextev) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + int err; + + ASSERT_WDEV_LOCK(wdev); + + if (wdev->sme_state == CFG80211_SME_IDLE) + return -EINVAL; + + kfree(wdev->connect_keys); + wdev->connect_keys = NULL; + + if (!rdev->ops->disconnect) { + if (!rdev->ops->deauth) + return -EOPNOTSUPP; + + /* was it connected by userspace SME? 
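+ * If so, wdev->conn was never allocated and the only cleanup
+ * needed is cfg80211_mlme_down().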
*/ + if (!wdev->conn) { + cfg80211_mlme_down(rdev, dev); + return 0; + } + + if (wdev->sme_state == CFG80211_SME_CONNECTING && + (wdev->conn->state == CFG80211_CONN_SCANNING || + wdev->conn->state == CFG80211_CONN_SCAN_AGAIN)) { + wdev->sme_state = CFG80211_SME_IDLE; + kfree(wdev->conn->ie); + kfree(wdev->conn); + wdev->conn = NULL; + wdev->ssid_len = 0; + return 0; + } + + /* wdev->conn->params.bssid must be set if > SCANNING */ + err = __cfg80211_mlme_deauth(rdev, dev, + wdev->conn->params.bssid, + NULL, 0, reason, false); + if (err) + return err; + } else { + err = rdev->ops->disconnect(&rdev->wiphy, dev, reason); + if (err) + return err; + } + + if (wdev->sme_state == CFG80211_SME_CONNECTED) + __cfg80211_disconnected(dev, NULL, 0, 0, false); + else if (wdev->sme_state == CFG80211_SME_CONNECTING) + __cfg80211_connect_result(dev, NULL, NULL, 0, NULL, 0, + WLAN_STATUS_UNSPECIFIED_FAILURE, + wextev, NULL); + + return 0; +} + +int cfg80211_disconnect(struct cfg80211_registered_device *rdev, + struct net_device *dev, + u16 reason, bool wextev) +{ + int err; + + wdev_lock(dev->ieee80211_ptr); + err = __cfg80211_disconnect(rdev, dev, reason, wextev); + wdev_unlock(dev->ieee80211_ptr); + + return err; +} + +void cfg80211_sme_disassoc(struct net_device *dev, int idx) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + u8 bssid[ETH_ALEN]; + + ASSERT_WDEV_LOCK(wdev); + + if (!wdev->conn) + return; + + if (wdev->conn->state == CFG80211_CONN_IDLE) + return; + + /* + * Ok, so the association was made by this SME -- we don't + * want it any more so deauthenticate too. + */ + + if (!wdev->auth_bsses[idx]) + return; + + memcpy(bssid, wdev->auth_bsses[idx]->pub.bssid, ETH_ALEN); + if (__cfg80211_mlme_deauth(rdev, dev, bssid, + NULL, 0, WLAN_REASON_DEAUTH_LEAVING, + false)) { + /* whatever -- assume gone anyway */ + cfg80211_unhold_bss(wdev->auth_bsses[idx]); + cfg80211_put_bss(&wdev->auth_bsses[idx]->pub); + wdev->auth_bsses[idx] = NULL; + } +} diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c new file mode 100644 index 00000000..c6e4ca6a --- /dev/null +++ b/net/wireless/sysfs.c @@ -0,0 +1,151 @@ +/* + * This file provides /sys/class/ieee80211// + * and some default attributes. + * + * Copyright 2005-2006 Jiri Benc + * Copyright 2006 Johannes Berg + * + * This file is GPLv2 as found in COPYING. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "sysfs.h" +#include "core.h" + +static inline struct cfg80211_registered_device *dev_to_rdev( + struct device *dev) +{ + return container_of(dev, struct cfg80211_registered_device, wiphy.dev); +} + +#define SHOW_FMT(name, fmt, member) \ +static ssize_t name ## _show(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + return sprintf(buf, fmt "\n", dev_to_rdev(dev)->member); \ +} + +SHOW_FMT(index, "%d", wiphy_idx); +SHOW_FMT(macaddress, "%pM", wiphy.perm_addr); +SHOW_FMT(address_mask, "%pM", wiphy.addr_mask); + +static ssize_t name_show(struct device *dev, + struct device_attribute *attr, + char *buf) { + struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy; + return sprintf(buf, "%s\n", dev_name(&wiphy->dev)); +} + + +static ssize_t addresses_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy; + char *start = buf; + int i; + + if (!wiphy->addresses) + return sprintf(buf, "%pM\n", wiphy->perm_addr); + + for (i = 0; i < wiphy->n_addresses; i++) + buf += sprintf(buf, "%pM\n", &wiphy->addresses[i].addr); + + return buf - start; +} + +static struct device_attribute ieee80211_dev_attrs[] = { + __ATTR_RO(index), + __ATTR_RO(macaddress), + __ATTR_RO(address_mask), + __ATTR_RO(addresses), + __ATTR_RO(name), + {} +}; + +static void wiphy_dev_release(struct device *dev) +{ + struct cfg80211_registered_device *rdev = dev_to_rdev(dev); + + cfg80211_dev_free(rdev); +} + +#ifdef CONFIG_HOTPLUG +static int wiphy_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + /* TODO, we probably need stuff here */ + return 0; +} +#endif + +static int wiphy_suspend(struct device *dev, pm_message_t state) +{ + struct cfg80211_registered_device *rdev = dev_to_rdev(dev); + int ret = 0; + + rdev->suspend_at = get_seconds(); + + if (rdev->ops->suspend) { + rtnl_lock(); + ret = rdev->ops->suspend(&rdev->wiphy, rdev->wowlan); + rtnl_unlock(); + } + + return ret; +} + +static int wiphy_resume(struct device *dev) +{ + struct cfg80211_registered_device *rdev = dev_to_rdev(dev); + int ret = 0; + + /* Age scan results with time spent in suspend */ + spin_lock_bh(&rdev->bss_lock); + cfg80211_bss_age(rdev, get_seconds() - rdev->suspend_at); + spin_unlock_bh(&rdev->bss_lock); + + if (rdev->ops->resume) { + rtnl_lock(); + ret = rdev->ops->resume(&rdev->wiphy); + rtnl_unlock(); + } + + return ret; +} + +static const void *wiphy_namespace(struct device *d) +{ + struct wiphy *wiphy = container_of(d, struct wiphy, dev); + + return wiphy_net(wiphy); +} + +struct class ieee80211_class = { + .name = "ieee80211", + .owner = THIS_MODULE, + .dev_release = wiphy_dev_release, + .dev_attrs = ieee80211_dev_attrs, +#ifdef CONFIG_HOTPLUG + .dev_uevent = wiphy_uevent, +#endif + .suspend = wiphy_suspend, + .resume = wiphy_resume, + .ns_type = &net_ns_type_operations, + .namespace = wiphy_namespace, +}; + +int wiphy_sysfs_init(void) +{ + return class_register(&ieee80211_class); +} + +void wiphy_sysfs_exit(void) +{ + class_unregister(&ieee80211_class); +} diff --git a/net/wireless/sysfs.h b/net/wireless/sysfs.h new file mode 100644 index 00000000..65acbebd --- /dev/null +++ b/net/wireless/sysfs.h @@ -0,0 +1,9 @@ +#ifndef __WIRELESS_SYSFS_H +#define __WIRELESS_SYSFS_H + +extern int wiphy_sysfs_init(void); +extern void wiphy_sysfs_exit(void); + +extern struct class ieee80211_class; + +#endif /* __WIRELESS_SYSFS_H */ diff --git a/net/wireless/util.c b/net/wireless/util.c new file 
mode 100644 index 00000000..30f68dc7 --- /dev/null +++ b/net/wireless/util.c @@ -0,0 +1,1025 @@ +/* + * Wireless utility functions + * + * Copyright 2007-2009 Johannes Berg + */ +#include +#include +#include +#include +#include +#include "core.h" + +struct ieee80211_rate * +ieee80211_get_response_rate(struct ieee80211_supported_band *sband, + u32 basic_rates, int bitrate) +{ + struct ieee80211_rate *result = &sband->bitrates[0]; + int i; + + for (i = 0; i < sband->n_bitrates; i++) { + if (!(basic_rates & BIT(i))) + continue; + if (sband->bitrates[i].bitrate > bitrate) + continue; + result = &sband->bitrates[i]; + } + + return result; +} +EXPORT_SYMBOL(ieee80211_get_response_rate); + +int ieee80211_channel_to_frequency(int chan, enum ieee80211_band band) +{ + /* see 802.11 17.3.8.3.2 and Annex J + * there are overlapping channel numbers in 5GHz and 2GHz bands */ + if (band == IEEE80211_BAND_5GHZ) { + if (chan >= 182 && chan <= 196) + return 4000 + chan * 5; + else + return 5000 + chan * 5; + } else { /* IEEE80211_BAND_2GHZ */ + if (chan == 14) + return 2484; + else if (chan < 14) + return 2407 + chan * 5; + else + return 0; /* not supported */ + } +} +EXPORT_SYMBOL(ieee80211_channel_to_frequency); + +int ieee80211_frequency_to_channel(int freq) +{ + /* see 802.11 17.3.8.3.2 and Annex J */ + if (freq == 2484) + return 14; + else if (freq < 2484) + return (freq - 2407) / 5; + else if (freq >= 4910 && freq <= 4980) + return (freq - 4000) / 5; + else + return (freq - 5000) / 5; +} +EXPORT_SYMBOL(ieee80211_frequency_to_channel); + +struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy, + int freq) +{ + enum ieee80211_band band; + struct ieee80211_supported_band *sband; + int i; + + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + sband = wiphy->bands[band]; + + if (!sband) + continue; + + for (i = 0; i < sband->n_channels; i++) { + if (sband->channels[i].center_freq == freq) + return &sband->channels[i]; + } + } + + return NULL; +} +EXPORT_SYMBOL(__ieee80211_get_channel); + +static void set_mandatory_flags_band(struct ieee80211_supported_band *sband, + enum ieee80211_band band) +{ + int i, want; + + switch (band) { + case IEEE80211_BAND_5GHZ: + want = 3; + for (i = 0; i < sband->n_bitrates; i++) { + if (sband->bitrates[i].bitrate == 60 || + sband->bitrates[i].bitrate == 120 || + sband->bitrates[i].bitrate == 240) { + sband->bitrates[i].flags |= + IEEE80211_RATE_MANDATORY_A; + want--; + } + } + WARN_ON(want); + break; + case IEEE80211_BAND_2GHZ: + want = 7; + for (i = 0; i < sband->n_bitrates; i++) { + if (sband->bitrates[i].bitrate == 10) { + sband->bitrates[i].flags |= + IEEE80211_RATE_MANDATORY_B | + IEEE80211_RATE_MANDATORY_G; + want--; + } + + if (sband->bitrates[i].bitrate == 20 || + sband->bitrates[i].bitrate == 55 || + sband->bitrates[i].bitrate == 110 || + sband->bitrates[i].bitrate == 60 || + sband->bitrates[i].bitrate == 120 || + sband->bitrates[i].bitrate == 240) { + sband->bitrates[i].flags |= + IEEE80211_RATE_MANDATORY_G; + want--; + } + + if (sband->bitrates[i].bitrate != 10 && + sband->bitrates[i].bitrate != 20 && + sband->bitrates[i].bitrate != 55 && + sband->bitrates[i].bitrate != 110) + sband->bitrates[i].flags |= + IEEE80211_RATE_ERP_G; + } + WARN_ON(want != 0 && want != 3 && want != 6); + break; + case IEEE80211_NUM_BANDS: + WARN_ON(1); + break; + } +} + +void ieee80211_set_bitrate_flags(struct wiphy *wiphy) +{ + enum ieee80211_band band; + + for (band = 0; band < IEEE80211_NUM_BANDS; band++) + if (wiphy->bands[band]) + 
set_mandatory_flags_band(wiphy->bands[band], band); +} + +int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, + struct key_params *params, int key_idx, + bool pairwise, const u8 *mac_addr) +{ + int i; + + if (key_idx > 5) + return -EINVAL; + + if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) + return -EINVAL; + + if (pairwise && !mac_addr) + return -EINVAL; + + /* + * Disallow pairwise keys with non-zero index unless it's WEP + * or a vendor specific cipher (because current deployments use + * pairwise WEP keys with non-zero indices and for vendor specific + * ciphers this should be validated in the driver or hardware level + * - but 802.11i clearly specifies to use zero) + */ + if (pairwise && key_idx && + ((params->cipher == WLAN_CIPHER_SUITE_TKIP) || + (params->cipher == WLAN_CIPHER_SUITE_CCMP) || + (params->cipher == WLAN_CIPHER_SUITE_AES_CMAC))) + return -EINVAL; + + switch (params->cipher) { + case WLAN_CIPHER_SUITE_WEP40: + if (params->key_len != WLAN_KEY_LEN_WEP40) + return -EINVAL; + break; + case WLAN_CIPHER_SUITE_TKIP: + if (params->key_len != WLAN_KEY_LEN_TKIP) + return -EINVAL; + break; + case WLAN_CIPHER_SUITE_CCMP: + if (params->key_len != WLAN_KEY_LEN_CCMP) + return -EINVAL; + break; + case WLAN_CIPHER_SUITE_WEP104: + if (params->key_len != WLAN_KEY_LEN_WEP104) + return -EINVAL; + break; + case WLAN_CIPHER_SUITE_AES_CMAC: + if (params->key_len != WLAN_KEY_LEN_AES_CMAC) + return -EINVAL; + break; + default: + /* + * We don't know anything about this algorithm, + * allow using it -- but the driver must check + * all parameters! We still check below whether + * or not the driver supports this algorithm, + * of course. + */ + break; + } + + if (params->seq) { + switch (params->cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: + /* These ciphers do not use key sequence */ + return -EINVAL; + case WLAN_CIPHER_SUITE_TKIP: + case WLAN_CIPHER_SUITE_CCMP: + case WLAN_CIPHER_SUITE_AES_CMAC: + if (params->seq_len != 6) + return -EINVAL; + break; + } + } + + for (i = 0; i < rdev->wiphy.n_cipher_suites; i++) + if (params->cipher == rdev->wiphy.cipher_suites[i]) + break; + if (i == rdev->wiphy.n_cipher_suites) + return -EINVAL; + + return 0; +} + +/* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */ +/* Ethernet-II snap header (RFC1042 for most EtherTypes) */ +const unsigned char rfc1042_header[] __aligned(2) = + { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00 }; +EXPORT_SYMBOL(rfc1042_header); + +/* Bridge-Tunnel header (for EtherTypes ETH_P_AARP and ETH_P_IPX) */ +const unsigned char bridge_tunnel_header[] __aligned(2) = + { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 }; +EXPORT_SYMBOL(bridge_tunnel_header); + +unsigned int __attribute_const__ ieee80211_hdrlen(__le16 fc) +{ + unsigned int hdrlen = 24; + + if (ieee80211_is_data(fc)) { + if (ieee80211_has_a4(fc)) + hdrlen = 30; + if (ieee80211_is_data_qos(fc)) { + hdrlen += IEEE80211_QOS_CTL_LEN; + if (ieee80211_has_order(fc)) + hdrlen += IEEE80211_HT_CTL_LEN; + } + goto out; + } + + if (ieee80211_is_ctl(fc)) { + /* + * ACK and CTS are 10 bytes, all others 16. 
To see how + * to get this condition consider + * subtype mask: 0b0000000011110000 (0x00F0) + * ACK subtype: 0b0000000011010000 (0x00D0) + * CTS subtype: 0b0000000011000000 (0x00C0) + * bits that matter: ^^^ (0x00E0) + * value of those: 0b0000000011000000 (0x00C0) + */ + if ((fc & cpu_to_le16(0x00E0)) == cpu_to_le16(0x00C0)) + hdrlen = 10; + else + hdrlen = 16; + } +out: + return hdrlen; +} +EXPORT_SYMBOL(ieee80211_hdrlen); + +unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb) +{ + const struct ieee80211_hdr *hdr = + (const struct ieee80211_hdr *)skb->data; + unsigned int hdrlen; + + if (unlikely(skb->len < 10)) + return 0; + hdrlen = ieee80211_hdrlen(hdr->frame_control); + if (unlikely(hdrlen > skb->len)) + return 0; + return hdrlen; +} +EXPORT_SYMBOL(ieee80211_get_hdrlen_from_skb); + +static int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) +{ + int ae = meshhdr->flags & MESH_FLAGS_AE; + /* 7.1.3.5a.2 */ + switch (ae) { + case 0: + return 6; + case MESH_FLAGS_AE_A4: + return 12; + case MESH_FLAGS_AE_A5_A6: + return 18; + case (MESH_FLAGS_AE_A4 | MESH_FLAGS_AE_A5_A6): + return 24; + default: + return 6; + } +} + +int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, + enum nl80211_iftype iftype) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + u16 hdrlen, ethertype; + u8 *payload; + u8 dst[ETH_ALEN]; + u8 src[ETH_ALEN] __aligned(2); + + if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) + return -1; + + hdrlen = ieee80211_hdrlen(hdr->frame_control); + + /* convert IEEE 802.11 header + possible LLC headers into Ethernet + * header + * IEEE 802.11 address fields: + * ToDS FromDS Addr1 Addr2 Addr3 Addr4 + * 0 0 DA SA BSSID n/a + * 0 1 DA BSSID SA n/a + * 1 0 BSSID SA DA n/a + * 1 1 RA TA DA SA + */ + memcpy(dst, ieee80211_get_DA(hdr), ETH_ALEN); + memcpy(src, ieee80211_get_SA(hdr), ETH_ALEN); + + switch (hdr->frame_control & + cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) { + case cpu_to_le16(IEEE80211_FCTL_TODS): + if (unlikely(iftype != NL80211_IFTYPE_AP && + iftype != NL80211_IFTYPE_AP_VLAN && + iftype != NL80211_IFTYPE_P2P_GO)) + return -1; + break; + case cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS): + if (unlikely(iftype != NL80211_IFTYPE_WDS && + iftype != NL80211_IFTYPE_MESH_POINT && + iftype != NL80211_IFTYPE_AP_VLAN && + iftype != NL80211_IFTYPE_STATION)) + return -1; + if (iftype == NL80211_IFTYPE_MESH_POINT) { + struct ieee80211s_hdr *meshdr = + (struct ieee80211s_hdr *) (skb->data + hdrlen); + /* make sure meshdr->flags is on the linear part */ + if (!pskb_may_pull(skb, hdrlen + 1)) + return -1; + if (meshdr->flags & MESH_FLAGS_AE_A5_A6) { + skb_copy_bits(skb, hdrlen + + offsetof(struct ieee80211s_hdr, eaddr1), + dst, ETH_ALEN); + skb_copy_bits(skb, hdrlen + + offsetof(struct ieee80211s_hdr, eaddr2), + src, ETH_ALEN); + } + hdrlen += ieee80211_get_mesh_hdrlen(meshdr); + } + break; + case cpu_to_le16(IEEE80211_FCTL_FROMDS): + if ((iftype != NL80211_IFTYPE_STATION && + iftype != NL80211_IFTYPE_P2P_CLIENT && + iftype != NL80211_IFTYPE_MESH_POINT) || + (is_multicast_ether_addr(dst) && + !compare_ether_addr(src, addr))) + return -1; + if (iftype == NL80211_IFTYPE_MESH_POINT) { + struct ieee80211s_hdr *meshdr = + (struct ieee80211s_hdr *) (skb->data + hdrlen); + /* make sure meshdr->flags is on the linear part */ + if (!pskb_may_pull(skb, hdrlen + 1)) + return -1; + if (meshdr->flags & MESH_FLAGS_AE_A4) + skb_copy_bits(skb, hdrlen + + offsetof(struct ieee80211s_hdr, eaddr1), + src, 
ETH_ALEN); + hdrlen += ieee80211_get_mesh_hdrlen(meshdr); + } + break; + case cpu_to_le16(0): + if (iftype != NL80211_IFTYPE_ADHOC) + return -1; + break; + } + + if (!pskb_may_pull(skb, hdrlen + 8)) + return -1; + + payload = skb->data + hdrlen; + ethertype = (payload[6] << 8) | payload[7]; + + if (likely((compare_ether_addr(payload, rfc1042_header) == 0 && + ethertype != ETH_P_AARP && ethertype != ETH_P_IPX) || + compare_ether_addr(payload, bridge_tunnel_header) == 0)) { + /* remove RFC1042 or Bridge-Tunnel encapsulation and + * replace EtherType */ + skb_pull(skb, hdrlen + 6); + memcpy(skb_push(skb, ETH_ALEN), src, ETH_ALEN); + memcpy(skb_push(skb, ETH_ALEN), dst, ETH_ALEN); + } else { + struct ethhdr *ehdr; + __be16 len; + + skb_pull(skb, hdrlen); + len = htons(skb->len); + ehdr = (struct ethhdr *) skb_push(skb, sizeof(struct ethhdr)); + memcpy(ehdr->h_dest, dst, ETH_ALEN); + memcpy(ehdr->h_source, src, ETH_ALEN); + ehdr->h_proto = len; + } + return 0; +} +EXPORT_SYMBOL(ieee80211_data_to_8023); + +int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, + enum nl80211_iftype iftype, u8 *bssid, bool qos) +{ + struct ieee80211_hdr hdr; + u16 hdrlen, ethertype; + __le16 fc; + const u8 *encaps_data; + int encaps_len, skip_header_bytes; + int nh_pos, h_pos; + int head_need; + + if (unlikely(skb->len < ETH_HLEN)) + return -EINVAL; + + nh_pos = skb_network_header(skb) - skb->data; + h_pos = skb_transport_header(skb) - skb->data; + + /* convert Ethernet header to proper 802.11 header (based on + * operation mode) */ + ethertype = (skb->data[12] << 8) | skb->data[13]; + fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA); + + switch (iftype) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_P2P_GO: + fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS); + /* DA BSSID SA */ + memcpy(hdr.addr1, skb->data, ETH_ALEN); + memcpy(hdr.addr2, addr, ETH_ALEN); + memcpy(hdr.addr3, skb->data + ETH_ALEN, ETH_ALEN); + hdrlen = 24; + break; + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + fc |= cpu_to_le16(IEEE80211_FCTL_TODS); + /* BSSID SA DA */ + memcpy(hdr.addr1, bssid, ETH_ALEN); + memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); + memcpy(hdr.addr3, skb->data, ETH_ALEN); + hdrlen = 24; + break; + case NL80211_IFTYPE_ADHOC: + /* DA SA BSSID */ + memcpy(hdr.addr1, skb->data, ETH_ALEN); + memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); + memcpy(hdr.addr3, bssid, ETH_ALEN); + hdrlen = 24; + break; + default: + return -EOPNOTSUPP; + } + + if (qos) { + fc |= cpu_to_le16(IEEE80211_STYPE_QOS_DATA); + hdrlen += 2; + } + + hdr.frame_control = fc; + hdr.duration_id = 0; + hdr.seq_ctrl = 0; + + skip_header_bytes = ETH_HLEN; + if (ethertype == ETH_P_AARP || ethertype == ETH_P_IPX) { + encaps_data = bridge_tunnel_header; + encaps_len = sizeof(bridge_tunnel_header); + skip_header_bytes -= 2; + } else if (ethertype > 0x600) { + encaps_data = rfc1042_header; + encaps_len = sizeof(rfc1042_header); + skip_header_bytes -= 2; + } else { + encaps_data = NULL; + encaps_len = 0; + } + + skb_pull(skb, skip_header_bytes); + nh_pos -= skip_header_bytes; + h_pos -= skip_header_bytes; + + head_need = hdrlen + encaps_len - skb_headroom(skb); + + if (head_need > 0 || skb_cloned(skb)) { + head_need = max(head_need, 0); + if (head_need) + skb_orphan(skb); + + if (pskb_expand_head(skb, head_need, 0, GFP_ATOMIC)) { + pr_err("failed to reallocate Tx buffer\n"); + return -ENOMEM; + } + skb->truesize += head_need; + } + + if (encaps_data) { + memcpy(skb_push(skb, encaps_len), 
encaps_data, encaps_len); + nh_pos += encaps_len; + h_pos += encaps_len; + } + + memcpy(skb_push(skb, hdrlen), &hdr, hdrlen); + + nh_pos += hdrlen; + h_pos += hdrlen; + + /* Update skb pointers to various headers since this modified frame + * is going to go through Linux networking code that may potentially + * need things like pointer to IP header. */ + skb_set_mac_header(skb, 0); + skb_set_network_header(skb, nh_pos); + skb_set_transport_header(skb, h_pos); + + return 0; +} +EXPORT_SYMBOL(ieee80211_data_from_8023); + + +void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, + const u8 *addr, enum nl80211_iftype iftype, + const unsigned int extra_headroom, + bool has_80211_header) +{ + struct sk_buff *frame = NULL; + u16 ethertype; + u8 *payload; + const struct ethhdr *eth; + int remaining, err; + u8 dst[ETH_ALEN], src[ETH_ALEN]; + + if (has_80211_header) { + err = ieee80211_data_to_8023(skb, addr, iftype); + if (err) + goto out; + + /* skip the wrapping header */ + eth = (struct ethhdr *) skb_pull(skb, sizeof(struct ethhdr)); + if (!eth) + goto out; + } else { + eth = (struct ethhdr *) skb->data; + } + + while (skb != frame) { + u8 padding; + __be16 len = eth->h_proto; + unsigned int subframe_len = sizeof(struct ethhdr) + ntohs(len); + + remaining = skb->len; + memcpy(dst, eth->h_dest, ETH_ALEN); + memcpy(src, eth->h_source, ETH_ALEN); + + padding = (4 - subframe_len) & 0x3; + /* the last MSDU has no padding */ + if (subframe_len > remaining) + goto purge; + + skb_pull(skb, sizeof(struct ethhdr)); + /* reuse skb for the last subframe */ + if (remaining <= subframe_len + padding) + frame = skb; + else { + unsigned int hlen = ALIGN(extra_headroom, 4); + /* + * Allocate and reserve two bytes more for payload + * alignment since sizeof(struct ethhdr) is 14. + */ + frame = dev_alloc_skb(hlen + subframe_len + 2); + if (!frame) + goto purge; + + skb_reserve(frame, hlen + sizeof(struct ethhdr) + 2); + memcpy(skb_put(frame, ntohs(len)), skb->data, + ntohs(len)); + + eth = (struct ethhdr *)skb_pull(skb, ntohs(len) + + padding); + if (!eth) { + dev_kfree_skb(frame); + goto purge; + } + } + + skb_reset_network_header(frame); + frame->dev = skb->dev; + frame->priority = skb->priority; + + payload = frame->data; + ethertype = (payload[6] << 8) | payload[7]; + + if (likely((compare_ether_addr(payload, rfc1042_header) == 0 && + ethertype != ETH_P_AARP && ethertype != ETH_P_IPX) || + compare_ether_addr(payload, + bridge_tunnel_header) == 0)) { + /* remove RFC1042 or Bridge-Tunnel + * encapsulation and replace EtherType */ + skb_pull(frame, 6); + memcpy(skb_push(frame, ETH_ALEN), src, ETH_ALEN); + memcpy(skb_push(frame, ETH_ALEN), dst, ETH_ALEN); + } else { + memcpy(skb_push(frame, sizeof(__be16)), &len, + sizeof(__be16)); + memcpy(skb_push(frame, ETH_ALEN), src, ETH_ALEN); + memcpy(skb_push(frame, ETH_ALEN), dst, ETH_ALEN); + } + __skb_queue_tail(list, frame); + } + + return; + + purge: + __skb_queue_purge(list); + out: + dev_kfree_skb(skb); +} +EXPORT_SYMBOL(ieee80211_amsdu_to_8023s); + +/* Given a data frame determine the 802.1p/1d tag to use. */ +unsigned int cfg80211_classify8021d(struct sk_buff *skb) +{ + unsigned int dscp; + + /* skb->priority values from 256->263 are magic values to + * directly indicate a specific 802.1d priority. This is used + * to allow 802.1d priority to be passed directly in from VLAN + * tags, etc. 
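+ * For example, skb->priority 261 maps directly to user priority 5,
+ * and an IPv4 packet with TOS 0xb8 (DSCP EF) yields (0xb8 & 0xfc) >> 5 = 5.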
+ */ + if (skb->priority >= 256 && skb->priority <= 263) + return skb->priority - 256; + + switch (skb->protocol) { + case htons(ETH_P_IP): + dscp = ip_hdr(skb)->tos & 0xfc; + break; + default: + return 0; + } + + return dscp >> 5; +} +EXPORT_SYMBOL(cfg80211_classify8021d); + +const u8 *ieee80211_bss_get_ie(struct cfg80211_bss *bss, u8 ie) +{ + u8 *end, *pos; + + pos = bss->information_elements; + if (pos == NULL) + return NULL; + end = pos + bss->len_information_elements; + + while (pos + 1 < end) { + if (pos + 2 + pos[1] > end) + break; + if (pos[0] == ie) + return pos; + pos += 2 + pos[1]; + } + + return NULL; +} +EXPORT_SYMBOL(ieee80211_bss_get_ie); + +void cfg80211_upload_connect_keys(struct wireless_dev *wdev) +{ + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct net_device *dev = wdev->netdev; + int i; + + if (!wdev->connect_keys) + return; + + for (i = 0; i < 6; i++) { + if (!wdev->connect_keys->params[i].cipher) + continue; + if (rdev->ops->add_key(wdev->wiphy, dev, i, false, NULL, + &wdev->connect_keys->params[i])) { + netdev_err(dev, "failed to set key %d\n", i); + continue; + } + if (wdev->connect_keys->def == i) + if (rdev->ops->set_default_key(wdev->wiphy, dev, + i, true, true)) { + netdev_err(dev, "failed to set defkey %d\n", i); + continue; + } + if (wdev->connect_keys->defmgmt == i) + if (rdev->ops->set_default_mgmt_key(wdev->wiphy, dev, i)) + netdev_err(dev, "failed to set mgtdef %d\n", i); + } + + kfree(wdev->connect_keys); + wdev->connect_keys = NULL; +} + +static void cfg80211_process_wdev_events(struct wireless_dev *wdev) +{ + struct cfg80211_event *ev; + unsigned long flags; + const u8 *bssid = NULL; + + spin_lock_irqsave(&wdev->event_lock, flags); + while (!list_empty(&wdev->event_list)) { + ev = list_first_entry(&wdev->event_list, + struct cfg80211_event, list); + list_del(&ev->list); + spin_unlock_irqrestore(&wdev->event_lock, flags); + + wdev_lock(wdev); + switch (ev->type) { + case EVENT_CONNECT_RESULT: + if (!is_zero_ether_addr(ev->cr.bssid)) + bssid = ev->cr.bssid; + __cfg80211_connect_result( + wdev->netdev, bssid, + ev->cr.req_ie, ev->cr.req_ie_len, + ev->cr.resp_ie, ev->cr.resp_ie_len, + ev->cr.status, + ev->cr.status == WLAN_STATUS_SUCCESS, + NULL); + break; + case EVENT_ROAMED: + __cfg80211_roamed(wdev, ev->rm.channel, ev->rm.bssid, + ev->rm.req_ie, ev->rm.req_ie_len, + ev->rm.resp_ie, ev->rm.resp_ie_len); + break; + case EVENT_DISCONNECTED: + __cfg80211_disconnected(wdev->netdev, + ev->dc.ie, ev->dc.ie_len, + ev->dc.reason, true); + break; + case EVENT_IBSS_JOINED: + __cfg80211_ibss_joined(wdev->netdev, ev->ij.bssid); + break; + } + wdev_unlock(wdev); + + kfree(ev); + + spin_lock_irqsave(&wdev->event_lock, flags); + } + spin_unlock_irqrestore(&wdev->event_lock, flags); +} + +void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev) +{ + struct wireless_dev *wdev; + + ASSERT_RTNL(); + ASSERT_RDEV_LOCK(rdev); + + mutex_lock(&rdev->devlist_mtx); + + list_for_each_entry(wdev, &rdev->netdev_list, list) + cfg80211_process_wdev_events(wdev); + + mutex_unlock(&rdev->devlist_mtx); +} + +int cfg80211_change_iface(struct cfg80211_registered_device *rdev, + struct net_device *dev, enum nl80211_iftype ntype, + u32 *flags, struct vif_params *params) +{ + int err; + enum nl80211_iftype otype = dev->ieee80211_ptr->iftype; + + ASSERT_RDEV_LOCK(rdev); + + /* don't support changing VLANs, you just re-create them */ + if (otype == NL80211_IFTYPE_AP_VLAN) + return -EOPNOTSUPP; + + if (!rdev->ops->change_virtual_intf || + 
!(rdev->wiphy.interface_modes & (1 << ntype))) + return -EOPNOTSUPP; + + /* if it's part of a bridge, reject changing type to station/ibss */ + if ((dev->priv_flags & IFF_BRIDGE_PORT) && + (ntype == NL80211_IFTYPE_ADHOC || + ntype == NL80211_IFTYPE_STATION || + ntype == NL80211_IFTYPE_P2P_CLIENT)) + return -EBUSY; + + if (ntype != otype) { + err = cfg80211_can_change_interface(rdev, dev->ieee80211_ptr, + ntype); + if (err) + return err; + + dev->ieee80211_ptr->use_4addr = false; + dev->ieee80211_ptr->mesh_id_up_len = 0; + + switch (otype) { + case NL80211_IFTYPE_ADHOC: + cfg80211_leave_ibss(rdev, dev, false); + break; + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + cfg80211_disconnect(rdev, dev, + WLAN_REASON_DEAUTH_LEAVING, true); + break; + case NL80211_IFTYPE_MESH_POINT: + /* mesh should be handled? */ + break; + default: + break; + } + + cfg80211_process_rdev_events(rdev); + } + + err = rdev->ops->change_virtual_intf(&rdev->wiphy, dev, + ntype, flags, params); + + WARN_ON(!err && dev->ieee80211_ptr->iftype != ntype); + + if (!err && params && params->use_4addr != -1) + dev->ieee80211_ptr->use_4addr = params->use_4addr; + + if (!err) { + dev->priv_flags &= ~IFF_DONT_BRIDGE; + switch (ntype) { + case NL80211_IFTYPE_STATION: + if (dev->ieee80211_ptr->use_4addr) + break; + /* fall through */ + case NL80211_IFTYPE_P2P_CLIENT: + case NL80211_IFTYPE_ADHOC: + dev->priv_flags |= IFF_DONT_BRIDGE; + break; + case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_WDS: + case NL80211_IFTYPE_MESH_POINT: + /* bridging OK */ + break; + case NL80211_IFTYPE_MONITOR: + /* monitor can't bridge anyway */ + break; + case NL80211_IFTYPE_UNSPECIFIED: + case NUM_NL80211_IFTYPES: + /* not happening */ + break; + } + } + + return err; +} + +u16 cfg80211_calculate_bitrate(struct rate_info *rate) +{ + int modulation, streams, bitrate; + + if (!(rate->flags & RATE_INFO_FLAGS_MCS)) + return rate->legacy; + + /* the formula below does only work for MCS values smaller than 32 */ + if (rate->mcs >= 32) + return 0; + + modulation = rate->mcs & 7; + streams = (rate->mcs >> 3) + 1; + + bitrate = (rate->flags & RATE_INFO_FLAGS_40_MHZ_WIDTH) ? + 13500000 : 6500000; + + if (modulation < 4) + bitrate *= (modulation + 1); + else if (modulation == 4) + bitrate *= (modulation + 2); + else + bitrate *= (modulation + 3); + + bitrate *= streams; + + if (rate->flags & RATE_INFO_FLAGS_SHORT_GI) + bitrate = (bitrate / 9) * 10; + + /* do NOT round down here */ + return (bitrate + 50000) / 100000; +} + +int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev, + u32 beacon_int) +{ + struct wireless_dev *wdev; + int res = 0; + + if (!beacon_int) + return -EINVAL; + + mutex_lock(&rdev->devlist_mtx); + + list_for_each_entry(wdev, &rdev->netdev_list, list) { + if (!wdev->beacon_interval) + continue; + if (wdev->beacon_interval != beacon_int) { + res = -EINVAL; + break; + } + } + + mutex_unlock(&rdev->devlist_mtx); + + return res; +} + +int cfg80211_can_change_interface(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + enum nl80211_iftype iftype) +{ + struct wireless_dev *wdev_iter; + u32 used_iftypes = BIT(iftype); + int num[NUM_NL80211_IFTYPES]; + int total = 1; + int i, j; + + ASSERT_RTNL(); + + /* Always allow software iftypes */ + if (rdev->wiphy.software_iftypes & BIT(iftype)) + return 0; + + /* + * Drivers will gradually all set this flag, until all + * have it we only enforce for those that set it. 
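+ * (Drivers that do not set WIPHY_FLAG_ENFORCE_COMBINATIONS are
+ * accepted unconditionally by the early return below.)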
+ */ + if (!(rdev->wiphy.flags & WIPHY_FLAG_ENFORCE_COMBINATIONS)) + return 0; + + memset(num, 0, sizeof(num)); + + num[iftype] = 1; + + mutex_lock(&rdev->devlist_mtx); + list_for_each_entry(wdev_iter, &rdev->netdev_list, list) { + if (wdev_iter == wdev) + continue; + if (!netif_running(wdev_iter->netdev)) + continue; + + if (rdev->wiphy.software_iftypes & BIT(wdev_iter->iftype)) + continue; + + num[wdev_iter->iftype]++; + total++; + used_iftypes |= BIT(wdev_iter->iftype); + } + mutex_unlock(&rdev->devlist_mtx); + + for (i = 0; i < rdev->wiphy.n_iface_combinations; i++) { + const struct ieee80211_iface_combination *c; + struct ieee80211_iface_limit *limits; + u32 all_iftypes = 0; + + c = &rdev->wiphy.iface_combinations[i]; + + limits = kmemdup(c->limits, sizeof(limits[0]) * c->n_limits, + GFP_KERNEL); + if (!limits) + return -ENOMEM; + if (total > c->max_interfaces) + goto cont; + + for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) { + if (rdev->wiphy.software_iftypes & BIT(iftype)) + continue; + for (j = 0; j < c->n_limits; j++) { + all_iftypes |= limits[j].types; + if (!(limits[j].types & BIT(iftype))) + continue; + if (limits[j].max < num[iftype]) + goto cont; + limits[j].max -= num[iftype]; + } + } + + /* + * Finally check that all iftypes that we're currently + * using are actually part of this combination. If they + * aren't then we can't use this combination and have + * to continue to the next. + */ + if ((all_iftypes & used_iftypes) != used_iftypes) + goto cont; + + /* + * This combination covered all interface types and + * supported the requested numbers, so we're good. + */ + kfree(limits); + return 0; + cont: + kfree(limits); + } + + return -EBUSY; +} diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c new file mode 100644 index 00000000..0bf169bb --- /dev/null +++ b/net/wireless/wext-compat.c @@ -0,0 +1,1537 @@ +/* + * cfg80211 - wext compat code + * + * This is temporary code until all wireless functionality is migrated + * into cfg80211, when that happens all the exports here go away and + * we directly assign the wireless handlers of wireless interfaces. 
+ * + * Copyright 2008-2009 Johannes Berg + */ + +#include +#include +#include +#include +#include +#include +#include +#include "wext-compat.h" +#include "core.h" + +int cfg80211_wext_giwname(struct net_device *dev, + struct iw_request_info *info, + char *name, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct ieee80211_supported_band *sband; + bool is_ht = false, is_a = false, is_b = false, is_g = false; + + if (!wdev) + return -EOPNOTSUPP; + + sband = wdev->wiphy->bands[IEEE80211_BAND_5GHZ]; + if (sband) { + is_a = true; + is_ht |= sband->ht_cap.ht_supported; + } + + sband = wdev->wiphy->bands[IEEE80211_BAND_2GHZ]; + if (sband) { + int i; + /* Check for mandatory rates */ + for (i = 0; i < sband->n_bitrates; i++) { + if (sband->bitrates[i].bitrate == 10) + is_b = true; + if (sband->bitrates[i].bitrate == 60) + is_g = true; + } + is_ht |= sband->ht_cap.ht_supported; + } + + strcpy(name, "IEEE 802.11"); + if (is_a) + strcat(name, "a"); + if (is_b) + strcat(name, "b"); + if (is_g) + strcat(name, "g"); + if (is_ht) + strcat(name, "n"); + + return 0; +} +EXPORT_SYMBOL_GPL(cfg80211_wext_giwname); + +int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, + u32 *mode, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev; + struct vif_params vifparams; + enum nl80211_iftype type; + int ret; + + rdev = wiphy_to_dev(wdev->wiphy); + + switch (*mode) { + case IW_MODE_INFRA: + type = NL80211_IFTYPE_STATION; + break; + case IW_MODE_ADHOC: + type = NL80211_IFTYPE_ADHOC; + break; + case IW_MODE_REPEAT: + type = NL80211_IFTYPE_WDS; + break; + case IW_MODE_MONITOR: + type = NL80211_IFTYPE_MONITOR; + break; + default: + return -EINVAL; + } + + if (type == wdev->iftype) + return 0; + + memset(&vifparams, 0, sizeof(vifparams)); + + cfg80211_lock_rdev(rdev); + ret = cfg80211_change_iface(rdev, dev, type, NULL, &vifparams); + cfg80211_unlock_rdev(rdev); + + return ret; +} +EXPORT_SYMBOL_GPL(cfg80211_wext_siwmode); + +int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info, + u32 *mode, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + if (!wdev) + return -EOPNOTSUPP; + + switch (wdev->iftype) { + case NL80211_IFTYPE_AP: + *mode = IW_MODE_MASTER; + break; + case NL80211_IFTYPE_STATION: + *mode = IW_MODE_INFRA; + break; + case NL80211_IFTYPE_ADHOC: + *mode = IW_MODE_ADHOC; + break; + case NL80211_IFTYPE_MONITOR: + *mode = IW_MODE_MONITOR; + break; + case NL80211_IFTYPE_WDS: + *mode = IW_MODE_REPEAT; + break; + case NL80211_IFTYPE_AP_VLAN: + *mode = IW_MODE_SECOND; /* FIXME */ + break; + default: + *mode = IW_MODE_AUTO; + break; + } + return 0; +} +EXPORT_SYMBOL_GPL(cfg80211_wext_giwmode); + + +int cfg80211_wext_giwrange(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct iw_range *range = (struct iw_range *) extra; + enum ieee80211_band band; + int i, c = 0; + + if (!wdev) + return -EOPNOTSUPP; + + data->length = sizeof(struct iw_range); + memset(range, 0, sizeof(struct iw_range)); + + range->we_version_compiled = WIRELESS_EXT; + range->we_version_source = 21; + range->retry_capa = IW_RETRY_LIMIT; + range->retry_flags = IW_RETRY_LIMIT; + range->min_retry = 0; + range->max_retry = 255; + range->min_rts = 0; + range->max_rts = 2347; + range->min_frag = 256; + range->max_frag = 2346; + + range->max_encoding_tokens = 4; + + range->max_qual.updated = 
IW_QUAL_NOISE_INVALID; + + switch (wdev->wiphy->signal_type) { + case CFG80211_SIGNAL_TYPE_NONE: + break; + case CFG80211_SIGNAL_TYPE_MBM: + range->max_qual.level = -110; + range->max_qual.qual = 70; + range->avg_qual.qual = 35; + range->max_qual.updated |= IW_QUAL_DBM; + range->max_qual.updated |= IW_QUAL_QUAL_UPDATED; + range->max_qual.updated |= IW_QUAL_LEVEL_UPDATED; + break; + case CFG80211_SIGNAL_TYPE_UNSPEC: + range->max_qual.level = 100; + range->max_qual.qual = 100; + range->avg_qual.qual = 50; + range->max_qual.updated |= IW_QUAL_QUAL_UPDATED; + range->max_qual.updated |= IW_QUAL_LEVEL_UPDATED; + break; + } + + range->avg_qual.level = range->max_qual.level / 2; + range->avg_qual.noise = range->max_qual.noise / 2; + range->avg_qual.updated = range->max_qual.updated; + + for (i = 0; i < wdev->wiphy->n_cipher_suites; i++) { + switch (wdev->wiphy->cipher_suites[i]) { + case WLAN_CIPHER_SUITE_TKIP: + range->enc_capa |= (IW_ENC_CAPA_CIPHER_TKIP | + IW_ENC_CAPA_WPA); + break; + + case WLAN_CIPHER_SUITE_CCMP: + range->enc_capa |= (IW_ENC_CAPA_CIPHER_CCMP | + IW_ENC_CAPA_WPA2); + break; + + case WLAN_CIPHER_SUITE_WEP40: + range->encoding_size[range->num_encoding_sizes++] = + WLAN_KEY_LEN_WEP40; + break; + + case WLAN_CIPHER_SUITE_WEP104: + range->encoding_size[range->num_encoding_sizes++] = + WLAN_KEY_LEN_WEP104; + break; + } + } + + for (band = 0; band < IEEE80211_NUM_BANDS; band ++) { + struct ieee80211_supported_band *sband; + + sband = wdev->wiphy->bands[band]; + + if (!sband) + continue; + + for (i = 0; i < sband->n_channels && c < IW_MAX_FREQUENCIES; i++) { + struct ieee80211_channel *chan = &sband->channels[i]; + + if (!(chan->flags & IEEE80211_CHAN_DISABLED)) { + range->freq[c].i = + ieee80211_frequency_to_channel( + chan->center_freq); + range->freq[c].m = chan->center_freq; + range->freq[c].e = 6; + c++; + } + } + } + range->num_channels = c; + range->num_frequency = c; + + IW_EVENT_CAPA_SET_KERNEL(range->event_capa); + IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWAP); + IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWSCAN); + + if (wdev->wiphy->max_scan_ssids > 0) + range->scan_capa |= IW_SCAN_CAPA_ESSID; + + return 0; +} +EXPORT_SYMBOL_GPL(cfg80211_wext_giwrange); + + +/** + * cfg80211_wext_freq - get wext frequency for non-"auto" + * @wiphy: the wiphy + * @freq: the wext freq encoding + * + * Returns a frequency, or a negative error code, or 0 for auto. + */ +int cfg80211_wext_freq(struct wiphy *wiphy, struct iw_freq *freq) +{ + /* + * Parse frequency - return 0 for auto and + * -EINVAL for impossible things. 
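+ * For example, { e = 0, m = 6 } is taken as channel 6 on 2.4 GHz and
+ * converted to 2437 MHz, while { e = 6, m = 2437 } already encodes an
+ * absolute frequency and is returned as 2437 (MHz).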
+ */ + if (freq->e == 0) { + enum ieee80211_band band = IEEE80211_BAND_2GHZ; + if (freq->m < 0) + return 0; + if (freq->m > 14) + band = IEEE80211_BAND_5GHZ; + return ieee80211_channel_to_frequency(freq->m, band); + } else { + int i, div = 1000000; + for (i = 0; i < freq->e; i++) + div /= 10; + if (div <= 0) + return -EINVAL; + return freq->m / div; + } +} + +int cfg80211_wext_siwrts(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *rts, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + u32 orts = wdev->wiphy->rts_threshold; + int err; + + if (rts->disabled || !rts->fixed) + wdev->wiphy->rts_threshold = (u32) -1; + else if (rts->value < 0) + return -EINVAL; + else + wdev->wiphy->rts_threshold = rts->value; + + err = rdev->ops->set_wiphy_params(wdev->wiphy, + WIPHY_PARAM_RTS_THRESHOLD); + if (err) + wdev->wiphy->rts_threshold = orts; + + return err; +} +EXPORT_SYMBOL_GPL(cfg80211_wext_siwrts); + +int cfg80211_wext_giwrts(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *rts, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + rts->value = wdev->wiphy->rts_threshold; + rts->disabled = rts->value == (u32) -1; + rts->fixed = 1; + + return 0; +} +EXPORT_SYMBOL_GPL(cfg80211_wext_giwrts); + +int cfg80211_wext_siwfrag(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *frag, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + u32 ofrag = wdev->wiphy->frag_threshold; + int err; + + if (frag->disabled || !frag->fixed) + wdev->wiphy->frag_threshold = (u32) -1; + else if (frag->value < 256) + return -EINVAL; + else { + /* Fragment length must be even, so strip LSB. 
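+ * (e.g. a requested threshold of 2345 is stored as 2344)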
*/ + wdev->wiphy->frag_threshold = frag->value & ~0x1; + } + + err = rdev->ops->set_wiphy_params(wdev->wiphy, + WIPHY_PARAM_FRAG_THRESHOLD); + if (err) + wdev->wiphy->frag_threshold = ofrag; + + return err; +} +EXPORT_SYMBOL_GPL(cfg80211_wext_siwfrag); + +int cfg80211_wext_giwfrag(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *frag, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + frag->value = wdev->wiphy->frag_threshold; + frag->disabled = frag->value == (u32) -1; + frag->fixed = 1; + + return 0; +} +EXPORT_SYMBOL_GPL(cfg80211_wext_giwfrag); + +int cfg80211_wext_siwretry(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *retry, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + u32 changed = 0; + u8 olong = wdev->wiphy->retry_long; + u8 oshort = wdev->wiphy->retry_short; + int err; + + if (retry->disabled || + (retry->flags & IW_RETRY_TYPE) != IW_RETRY_LIMIT) + return -EINVAL; + + if (retry->flags & IW_RETRY_LONG) { + wdev->wiphy->retry_long = retry->value; + changed |= WIPHY_PARAM_RETRY_LONG; + } else if (retry->flags & IW_RETRY_SHORT) { + wdev->wiphy->retry_short = retry->value; + changed |= WIPHY_PARAM_RETRY_SHORT; + } else { + wdev->wiphy->retry_short = retry->value; + wdev->wiphy->retry_long = retry->value; + changed |= WIPHY_PARAM_RETRY_LONG; + changed |= WIPHY_PARAM_RETRY_SHORT; + } + + if (!changed) + return 0; + + err = rdev->ops->set_wiphy_params(wdev->wiphy, changed); + if (err) { + wdev->wiphy->retry_short = oshort; + wdev->wiphy->retry_long = olong; + } + + return err; +} +EXPORT_SYMBOL_GPL(cfg80211_wext_siwretry); + +int cfg80211_wext_giwretry(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *retry, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + retry->disabled = 0; + + if (retry->flags == 0 || (retry->flags & IW_RETRY_SHORT)) { + /* + * First return short value, iwconfig will ask long value + * later if needed + */ + retry->flags |= IW_RETRY_LIMIT; + retry->value = wdev->wiphy->retry_short; + if (wdev->wiphy->retry_long != wdev->wiphy->retry_short) + retry->flags |= IW_RETRY_LONG; + + return 0; + } + + if (retry->flags & IW_RETRY_LONG) { + retry->flags = IW_RETRY_LIMIT | IW_RETRY_LONG; + retry->value = wdev->wiphy->retry_long; + } + + return 0; +} +EXPORT_SYMBOL_GPL(cfg80211_wext_giwretry); + +static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, + struct net_device *dev, bool pairwise, + const u8 *addr, bool remove, bool tx_key, + int idx, struct key_params *params) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + int err, i; + bool rejoin = false; + + if (pairwise && !addr) + return -EINVAL; + + if (!wdev->wext.keys) { + wdev->wext.keys = kzalloc(sizeof(*wdev->wext.keys), + GFP_KERNEL); + if (!wdev->wext.keys) + return -ENOMEM; + for (i = 0; i < 6; i++) + wdev->wext.keys->params[i].key = + wdev->wext.keys->data[i]; + } + + if (wdev->iftype != NL80211_IFTYPE_ADHOC && + wdev->iftype != NL80211_IFTYPE_STATION) + return -EOPNOTSUPP; + + if (params->cipher == WLAN_CIPHER_SUITE_AES_CMAC) { + if (!wdev->current_bss) + return -ENOLINK; + + if (!rdev->ops->set_default_mgmt_key) + return -EOPNOTSUPP; + + if (idx < 4 || idx > 5) + return -EINVAL; + } else if (idx < 0 || idx > 3) + return -EINVAL; + + if (remove) { + err = 0; + if (wdev->current_bss) { + /* + * If removing the current TX key, we will need to + * join a new IBSS without the privacy 
bit clear. + */ + if (idx == wdev->wext.default_key && + wdev->iftype == NL80211_IFTYPE_ADHOC) { + __cfg80211_leave_ibss(rdev, wdev->netdev, true); + rejoin = true; + } + + if (!pairwise && addr && + !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) + err = -ENOENT; + else + err = rdev->ops->del_key(&rdev->wiphy, dev, idx, + pairwise, addr); + } + wdev->wext.connect.privacy = false; + /* + * Applications using wireless extensions expect to be + * able to delete keys that don't exist, so allow that. + */ + if (err == -ENOENT) + err = 0; + if (!err) { + if (!addr) { + wdev->wext.keys->params[idx].key_len = 0; + wdev->wext.keys->params[idx].cipher = 0; + } + if (idx == wdev->wext.default_key) + wdev->wext.default_key = -1; + else if (idx == wdev->wext.default_mgmt_key) + wdev->wext.default_mgmt_key = -1; + } + + if (!err && rejoin) + err = cfg80211_ibss_wext_join(rdev, wdev); + + return err; + } + + if (addr) + tx_key = false; + + if (cfg80211_validate_key_settings(rdev, params, idx, pairwise, addr)) + return -EINVAL; + + err = 0; + if (wdev->current_bss) + err = rdev->ops->add_key(&rdev->wiphy, dev, idx, + pairwise, addr, params); + if (err) + return err; + + if (!addr) { + wdev->wext.keys->params[idx] = *params; + memcpy(wdev->wext.keys->data[idx], + params->key, params->key_len); + wdev->wext.keys->params[idx].key = + wdev->wext.keys->data[idx]; + } + + if ((params->cipher == WLAN_CIPHER_SUITE_WEP40 || + params->cipher == WLAN_CIPHER_SUITE_WEP104) && + (tx_key || (!addr && wdev->wext.default_key == -1))) { + if (wdev->current_bss) { + /* + * If we are getting a new TX key from not having + * had one before we need to join a new IBSS with + * the privacy bit set. + */ + if (wdev->iftype == NL80211_IFTYPE_ADHOC && + wdev->wext.default_key == -1) { + __cfg80211_leave_ibss(rdev, wdev->netdev, true); + rejoin = true; + } + err = rdev->ops->set_default_key(&rdev->wiphy, dev, + idx, true, true); + } + if (!err) { + wdev->wext.default_key = idx; + if (rejoin) + err = cfg80211_ibss_wext_join(rdev, wdev); + } + return err; + } + + if (params->cipher == WLAN_CIPHER_SUITE_AES_CMAC && + (tx_key || (!addr && wdev->wext.default_mgmt_key == -1))) { + if (wdev->current_bss) + err = rdev->ops->set_default_mgmt_key(&rdev->wiphy, + dev, idx); + if (!err) + wdev->wext.default_mgmt_key = idx; + return err; + } + + return 0; +} + +static int cfg80211_set_encryption(struct cfg80211_registered_device *rdev, + struct net_device *dev, bool pairwise, + const u8 *addr, bool remove, bool tx_key, + int idx, struct key_params *params) +{ + int err; + + /* devlist mutex needed for possible IBSS re-join */ + mutex_lock(&rdev->devlist_mtx); + wdev_lock(dev->ieee80211_ptr); + err = __cfg80211_set_encryption(rdev, dev, pairwise, addr, + remove, tx_key, idx, params); + wdev_unlock(dev->ieee80211_ptr); + mutex_unlock(&rdev->devlist_mtx); + + return err; +} + +int cfg80211_wext_siwencode(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *erq, char *keybuf) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + int idx, err; + bool remove = false; + struct key_params params; + + if (wdev->iftype != NL80211_IFTYPE_STATION && + wdev->iftype != NL80211_IFTYPE_ADHOC) + return -EOPNOTSUPP; + + /* no use -- only MFP (set_default_mgmt_key) is optional */ + if (!rdev->ops->del_key || + !rdev->ops->add_key || + !rdev->ops->set_default_key) + return -EOPNOTSUPP; + + idx = erq->flags & IW_ENCODE_INDEX; + if (idx == 0) { + idx = wdev->wext.default_key; + 
if (idx < 0) + idx = 0; + } else if (idx < 1 || idx > 4) + return -EINVAL; + else + idx--; + + if (erq->flags & IW_ENCODE_DISABLED) + remove = true; + else if (erq->length == 0) { + /* No key data - just set the default TX key index */ + err = 0; + wdev_lock(wdev); + if (wdev->current_bss) + err = rdev->ops->set_default_key(&rdev->wiphy, dev, + idx, true, true); + if (!err) + wdev->wext.default_key = idx; + wdev_unlock(wdev); + return err; + } + + memset(¶ms, 0, sizeof(params)); + params.key = keybuf; + params.key_len = erq->length; + if (erq->length == 5) + params.cipher = WLAN_CIPHER_SUITE_WEP40; + else if (erq->length == 13) + params.cipher = WLAN_CIPHER_SUITE_WEP104; + else if (!remove) + return -EINVAL; + + return cfg80211_set_encryption(rdev, dev, false, NULL, remove, + wdev->wext.default_key == -1, + idx, ¶ms); +} +EXPORT_SYMBOL_GPL(cfg80211_wext_siwencode); + +int cfg80211_wext_siwencodeext(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *erq, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct iw_encode_ext *ext = (struct iw_encode_ext *) extra; + const u8 *addr; + int idx; + bool remove = false; + struct key_params params; + u32 cipher; + + if (wdev->iftype != NL80211_IFTYPE_STATION && + wdev->iftype != NL80211_IFTYPE_ADHOC) + return -EOPNOTSUPP; + + /* no use -- only MFP (set_default_mgmt_key) is optional */ + if (!rdev->ops->del_key || + !rdev->ops->add_key || + !rdev->ops->set_default_key) + return -EOPNOTSUPP; + + switch (ext->alg) { + case IW_ENCODE_ALG_NONE: + remove = true; + cipher = 0; + break; + case IW_ENCODE_ALG_WEP: + if (ext->key_len == 5) + cipher = WLAN_CIPHER_SUITE_WEP40; + else if (ext->key_len == 13) + cipher = WLAN_CIPHER_SUITE_WEP104; + else + return -EINVAL; + break; + case IW_ENCODE_ALG_TKIP: + cipher = WLAN_CIPHER_SUITE_TKIP; + break; + case IW_ENCODE_ALG_CCMP: + cipher = WLAN_CIPHER_SUITE_CCMP; + break; + case IW_ENCODE_ALG_AES_CMAC: + cipher = WLAN_CIPHER_SUITE_AES_CMAC; + break; + default: + return -EOPNOTSUPP; + } + + if (erq->flags & IW_ENCODE_DISABLED) + remove = true; + + idx = erq->flags & IW_ENCODE_INDEX; + if (cipher == WLAN_CIPHER_SUITE_AES_CMAC) { + if (idx < 4 || idx > 5) { + idx = wdev->wext.default_mgmt_key; + if (idx < 0) + return -EINVAL; + } else + idx--; + } else { + if (idx < 1 || idx > 4) { + idx = wdev->wext.default_key; + if (idx < 0) + return -EINVAL; + } else + idx--; + } + + addr = ext->addr.sa_data; + if (is_broadcast_ether_addr(addr)) + addr = NULL; + + memset(¶ms, 0, sizeof(params)); + params.key = ext->key; + params.key_len = ext->key_len; + params.cipher = cipher; + + if (ext->ext_flags & IW_ENCODE_EXT_RX_SEQ_VALID) { + params.seq = ext->rx_seq; + params.seq_len = 6; + } + + return cfg80211_set_encryption( + rdev, dev, + !(ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY), + addr, remove, + ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY, + idx, ¶ms); +} +EXPORT_SYMBOL_GPL(cfg80211_wext_siwencodeext); + +int cfg80211_wext_giwencode(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *erq, char *keybuf) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + int idx; + + if (wdev->iftype != NL80211_IFTYPE_STATION && + wdev->iftype != NL80211_IFTYPE_ADHOC) + return -EOPNOTSUPP; + + idx = erq->flags & IW_ENCODE_INDEX; + if (idx == 0) { + idx = wdev->wext.default_key; + if (idx < 0) + idx = 0; + } else if (idx < 1 || idx > 4) + return -EINVAL; + else + idx--; + + erq->flags = idx + 1; + + if 
(!wdev->wext.keys || !wdev->wext.keys->params[idx].cipher) { + erq->flags |= IW_ENCODE_DISABLED; + erq->length = 0; + return 0; + } + + erq->length = min_t(size_t, erq->length, + wdev->wext.keys->params[idx].key_len); + memcpy(keybuf, wdev->wext.keys->params[idx].key, erq->length); + erq->flags |= IW_ENCODE_ENABLED; + + return 0; +} +EXPORT_SYMBOL_GPL(cfg80211_wext_giwencode); + +int cfg80211_wext_siwfreq(struct net_device *dev, + struct iw_request_info *info, + struct iw_freq *wextfreq, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + int freq, err; + + switch (wdev->iftype) { + case NL80211_IFTYPE_STATION: + return cfg80211_mgd_wext_siwfreq(dev, info, wextfreq, extra); + case NL80211_IFTYPE_ADHOC: + return cfg80211_ibss_wext_siwfreq(dev, info, wextfreq, extra); + case NL80211_IFTYPE_MONITOR: + case NL80211_IFTYPE_WDS: + case NL80211_IFTYPE_MESH_POINT: + freq = cfg80211_wext_freq(wdev->wiphy, wextfreq); + if (freq < 0) + return freq; + if (freq == 0) + return -EINVAL; + mutex_lock(&rdev->devlist_mtx); + wdev_lock(wdev); + err = cfg80211_set_freq(rdev, wdev, freq, NL80211_CHAN_NO_HT); + wdev_unlock(wdev); + mutex_unlock(&rdev->devlist_mtx); + return err; + default: + return -EOPNOTSUPP; + } +} +EXPORT_SYMBOL_GPL(cfg80211_wext_siwfreq); + +int cfg80211_wext_giwfreq(struct net_device *dev, + struct iw_request_info *info, + struct iw_freq *freq, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + switch (wdev->iftype) { + case NL80211_IFTYPE_STATION: + return cfg80211_mgd_wext_giwfreq(dev, info, freq, extra); + case NL80211_IFTYPE_ADHOC: + return cfg80211_ibss_wext_giwfreq(dev, info, freq, extra); + default: + if (!wdev->channel) + return -EINVAL; + freq->m = wdev->channel->center_freq; + freq->e = 6; + return 0; + } +} +EXPORT_SYMBOL_GPL(cfg80211_wext_giwfreq); + +int cfg80211_wext_siwtxpower(struct net_device *dev, + struct iw_request_info *info, + union iwreq_data *data, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + enum nl80211_tx_power_setting type; + int dbm = 0; + + if ((data->txpower.flags & IW_TXPOW_TYPE) != IW_TXPOW_DBM) + return -EINVAL; + if (data->txpower.flags & IW_TXPOW_RANGE) + return -EINVAL; + + if (!rdev->ops->set_tx_power) + return -EOPNOTSUPP; + + /* only change when not disabling */ + if (!data->txpower.disabled) { + rfkill_set_sw_state(rdev->rfkill, false); + + if (data->txpower.fixed) { + /* + * wext doesn't support negative values, see + * below where it's for automatic + */ + if (data->txpower.value < 0) + return -EINVAL; + dbm = data->txpower.value; + type = NL80211_TX_POWER_FIXED; + /* TODO: do regulatory check! */ + } else { + /* + * Automatic power level setting, max being the value + * passed in from userland. 
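+ * A negative value therefore selects NL80211_TX_POWER_AUTOMATIC with
+ * no cap, while a non-negative value becomes the limit passed as
+ * NL80211_TX_POWER_LIMITED.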
+ */ + if (data->txpower.value < 0) { + type = NL80211_TX_POWER_AUTOMATIC; + } else { + dbm = data->txpower.value; + type = NL80211_TX_POWER_LIMITED; + } + } + } else { + rfkill_set_sw_state(rdev->rfkill, true); + schedule_work(&rdev->rfkill_sync); + return 0; + } + + return rdev->ops->set_tx_power(wdev->wiphy, type, DBM_TO_MBM(dbm)); +} +EXPORT_SYMBOL_GPL(cfg80211_wext_siwtxpower); + +int cfg80211_wext_giwtxpower(struct net_device *dev, + struct iw_request_info *info, + union iwreq_data *data, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + int err, val; + + if ((data->txpower.flags & IW_TXPOW_TYPE) != IW_TXPOW_DBM) + return -EINVAL; + if (data->txpower.flags & IW_TXPOW_RANGE) + return -EINVAL; + + if (!rdev->ops->get_tx_power) + return -EOPNOTSUPP; + + err = rdev->ops->get_tx_power(wdev->wiphy, &val); + if (err) + return err; + + /* well... oh well */ + data->txpower.fixed = 1; + data->txpower.disabled = rfkill_blocked(rdev->rfkill); + data->txpower.value = val; + data->txpower.flags = IW_TXPOW_DBM; + + return 0; +} +EXPORT_SYMBOL_GPL(cfg80211_wext_giwtxpower); + +static int cfg80211_set_auth_alg(struct wireless_dev *wdev, + s32 auth_alg) +{ + int nr_alg = 0; + + if (!auth_alg) + return -EINVAL; + + if (auth_alg & ~(IW_AUTH_ALG_OPEN_SYSTEM | + IW_AUTH_ALG_SHARED_KEY | + IW_AUTH_ALG_LEAP)) + return -EINVAL; + + if (auth_alg & IW_AUTH_ALG_OPEN_SYSTEM) { + nr_alg++; + wdev->wext.connect.auth_type = NL80211_AUTHTYPE_OPEN_SYSTEM; + } + + if (auth_alg & IW_AUTH_ALG_SHARED_KEY) { + nr_alg++; + wdev->wext.connect.auth_type = NL80211_AUTHTYPE_SHARED_KEY; + } + + if (auth_alg & IW_AUTH_ALG_LEAP) { + nr_alg++; + wdev->wext.connect.auth_type = NL80211_AUTHTYPE_NETWORK_EAP; + } + + if (nr_alg > 1) + wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC; + + return 0; +} + +static int cfg80211_set_wpa_version(struct wireless_dev *wdev, u32 wpa_versions) +{ + if (wpa_versions & ~(IW_AUTH_WPA_VERSION_WPA | + IW_AUTH_WPA_VERSION_WPA2| + IW_AUTH_WPA_VERSION_DISABLED)) + return -EINVAL; + + if ((wpa_versions & IW_AUTH_WPA_VERSION_DISABLED) && + (wpa_versions & (IW_AUTH_WPA_VERSION_WPA| + IW_AUTH_WPA_VERSION_WPA2))) + return -EINVAL; + + if (wpa_versions & IW_AUTH_WPA_VERSION_DISABLED) + wdev->wext.connect.crypto.wpa_versions &= + ~(NL80211_WPA_VERSION_1|NL80211_WPA_VERSION_2); + + if (wpa_versions & IW_AUTH_WPA_VERSION_WPA) + wdev->wext.connect.crypto.wpa_versions |= + NL80211_WPA_VERSION_1; + + if (wpa_versions & IW_AUTH_WPA_VERSION_WPA2) + wdev->wext.connect.crypto.wpa_versions |= + NL80211_WPA_VERSION_2; + + return 0; +} + +static int cfg80211_set_cipher_group(struct wireless_dev *wdev, u32 cipher) +{ + if (cipher & IW_AUTH_CIPHER_WEP40) + wdev->wext.connect.crypto.cipher_group = + WLAN_CIPHER_SUITE_WEP40; + else if (cipher & IW_AUTH_CIPHER_WEP104) + wdev->wext.connect.crypto.cipher_group = + WLAN_CIPHER_SUITE_WEP104; + else if (cipher & IW_AUTH_CIPHER_TKIP) + wdev->wext.connect.crypto.cipher_group = + WLAN_CIPHER_SUITE_TKIP; + else if (cipher & IW_AUTH_CIPHER_CCMP) + wdev->wext.connect.crypto.cipher_group = + WLAN_CIPHER_SUITE_CCMP; + else if (cipher & IW_AUTH_CIPHER_AES_CMAC) + wdev->wext.connect.crypto.cipher_group = + WLAN_CIPHER_SUITE_AES_CMAC; + else if (cipher & IW_AUTH_CIPHER_NONE) + wdev->wext.connect.crypto.cipher_group = 0; + else + return -EINVAL; + + return 0; +} + +static int cfg80211_set_cipher_pairwise(struct wireless_dev *wdev, u32 cipher) +{ + int nr_ciphers = 0; + u32 *ciphers_pairwise = 
wdev->wext.connect.crypto.ciphers_pairwise; + + if (cipher & IW_AUTH_CIPHER_WEP40) { + ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_WEP40; + nr_ciphers++; + } + + if (cipher & IW_AUTH_CIPHER_WEP104) { + ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_WEP104; + nr_ciphers++; + } + + if (cipher & IW_AUTH_CIPHER_TKIP) { + ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_TKIP; + nr_ciphers++; + } + + if (cipher & IW_AUTH_CIPHER_CCMP) { + ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_CCMP; + nr_ciphers++; + } + + if (cipher & IW_AUTH_CIPHER_AES_CMAC) { + ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_AES_CMAC; + nr_ciphers++; + } + + BUILD_BUG_ON(NL80211_MAX_NR_CIPHER_SUITES < 5); + + wdev->wext.connect.crypto.n_ciphers_pairwise = nr_ciphers; + + return 0; +} + + +static int cfg80211_set_key_mgt(struct wireless_dev *wdev, u32 key_mgt) +{ + int nr_akm_suites = 0; + + if (key_mgt & ~(IW_AUTH_KEY_MGMT_802_1X | + IW_AUTH_KEY_MGMT_PSK)) + return -EINVAL; + + if (key_mgt & IW_AUTH_KEY_MGMT_802_1X) { + wdev->wext.connect.crypto.akm_suites[nr_akm_suites] = + WLAN_AKM_SUITE_8021X; + nr_akm_suites++; + } + + if (key_mgt & IW_AUTH_KEY_MGMT_PSK) { + wdev->wext.connect.crypto.akm_suites[nr_akm_suites] = + WLAN_AKM_SUITE_PSK; + nr_akm_suites++; + } + + wdev->wext.connect.crypto.n_akm_suites = nr_akm_suites; + + return 0; +} + +int cfg80211_wext_siwauth(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *data, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + if (wdev->iftype != NL80211_IFTYPE_STATION) + return -EOPNOTSUPP; + + switch (data->flags & IW_AUTH_INDEX) { + case IW_AUTH_PRIVACY_INVOKED: + wdev->wext.connect.privacy = data->value; + return 0; + case IW_AUTH_WPA_VERSION: + return cfg80211_set_wpa_version(wdev, data->value); + case IW_AUTH_CIPHER_GROUP: + return cfg80211_set_cipher_group(wdev, data->value); + case IW_AUTH_KEY_MGMT: + return cfg80211_set_key_mgt(wdev, data->value); + case IW_AUTH_CIPHER_PAIRWISE: + return cfg80211_set_cipher_pairwise(wdev, data->value); + case IW_AUTH_80211_AUTH_ALG: + return cfg80211_set_auth_alg(wdev, data->value); + case IW_AUTH_WPA_ENABLED: + case IW_AUTH_RX_UNENCRYPTED_EAPOL: + case IW_AUTH_DROP_UNENCRYPTED: + case IW_AUTH_MFP: + return 0; + default: + return -EOPNOTSUPP; + } +} +EXPORT_SYMBOL_GPL(cfg80211_wext_siwauth); + +int cfg80211_wext_giwauth(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *data, char *extra) +{ + /* XXX: what do we need? 
*/ + + return -EOPNOTSUPP; +} +EXPORT_SYMBOL_GPL(cfg80211_wext_giwauth); + +int cfg80211_wext_siwpower(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *wrq, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + bool ps = wdev->ps; + int timeout = wdev->ps_timeout; + int err; + + if (wdev->iftype != NL80211_IFTYPE_STATION) + return -EINVAL; + + if (!rdev->ops->set_power_mgmt) + return -EOPNOTSUPP; + + if (wrq->disabled) { + ps = false; + } else { + switch (wrq->flags & IW_POWER_MODE) { + case IW_POWER_ON: /* If not specified */ + case IW_POWER_MODE: /* If set all mask */ + case IW_POWER_ALL_R: /* If explicitely state all */ + ps = true; + break; + default: /* Otherwise we ignore */ + return -EINVAL; + } + + if (wrq->flags & ~(IW_POWER_MODE | IW_POWER_TIMEOUT)) + return -EINVAL; + + if (wrq->flags & IW_POWER_TIMEOUT) + timeout = wrq->value / 1000; + } + + err = rdev->ops->set_power_mgmt(wdev->wiphy, dev, ps, timeout); + if (err) + return err; + + wdev->ps = ps; + wdev->ps_timeout = timeout; + + return 0; + +} +EXPORT_SYMBOL_GPL(cfg80211_wext_siwpower); + +int cfg80211_wext_giwpower(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *wrq, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + wrq->disabled = !wdev->ps; + + return 0; +} +EXPORT_SYMBOL_GPL(cfg80211_wext_giwpower); + +static int cfg80211_wds_wext_siwap(struct net_device *dev, + struct iw_request_info *info, + struct sockaddr *addr, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + int err; + + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_WDS)) + return -EINVAL; + + if (addr->sa_family != ARPHRD_ETHER) + return -EINVAL; + + if (netif_running(dev)) + return -EBUSY; + + if (!rdev->ops->set_wds_peer) + return -EOPNOTSUPP; + + err = rdev->ops->set_wds_peer(wdev->wiphy, dev, (u8 *) &addr->sa_data); + if (err) + return err; + + memcpy(&wdev->wext.bssid, (u8 *) &addr->sa_data, ETH_ALEN); + + return 0; +} + +static int cfg80211_wds_wext_giwap(struct net_device *dev, + struct iw_request_info *info, + struct sockaddr *addr, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_WDS)) + return -EINVAL; + + addr->sa_family = ARPHRD_ETHER; + memcpy(&addr->sa_data, wdev->wext.bssid, ETH_ALEN); + + return 0; +} + +int cfg80211_wext_siwrate(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *rate, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_bitrate_mask mask; + u32 fixed, maxrate; + struct ieee80211_supported_band *sband; + int band, ridx; + bool match = false; + + if (!rdev->ops->set_bitrate_mask) + return -EOPNOTSUPP; + + memset(&mask, 0, sizeof(mask)); + fixed = 0; + maxrate = (u32)-1; + + if (rate->value < 0) { + /* nothing */ + } else if (rate->fixed) { + fixed = rate->value / 100000; + } else { + maxrate = rate->value / 100000; + } + + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + sband = wdev->wiphy->bands[band]; + if (sband == NULL) + continue; + for (ridx = 0; ridx < sband->n_bitrates; ridx++) { + struct ieee80211_rate *srate = &sband->bitrates[ridx]; + if (fixed == srate->bitrate) { + mask.control[band].legacy = 1 << ridx; + match = true; + break; + } + if (srate->bitrate <= maxrate) { + 
mask.control[band].legacy |= 1 << ridx; + match = true; + } + } + } + + if (!match) + return -EINVAL; + + return rdev->ops->set_bitrate_mask(wdev->wiphy, dev, NULL, &mask); +} +EXPORT_SYMBOL_GPL(cfg80211_wext_siwrate); + +int cfg80211_wext_giwrate(struct net_device *dev, + struct iw_request_info *info, + struct iw_param *rate, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + /* we are under RTNL - globally locked - so can use a static struct */ + static struct station_info sinfo; + u8 addr[ETH_ALEN]; + int err; + + if (wdev->iftype != NL80211_IFTYPE_STATION) + return -EOPNOTSUPP; + + if (!rdev->ops->get_station) + return -EOPNOTSUPP; + + err = 0; + wdev_lock(wdev); + if (wdev->current_bss) + memcpy(addr, wdev->current_bss->pub.bssid, ETH_ALEN); + else + err = -EOPNOTSUPP; + wdev_unlock(wdev); + if (err) + return err; + + err = rdev->ops->get_station(&rdev->wiphy, dev, addr, &sinfo); + if (err) + return err; + + if (!(sinfo.filled & STATION_INFO_TX_BITRATE)) + return -EOPNOTSUPP; + + rate->value = 100000 * cfg80211_calculate_bitrate(&sinfo.txrate); + + return 0; +} +EXPORT_SYMBOL_GPL(cfg80211_wext_giwrate); + +/* Get wireless statistics. Called by /proc/net/wireless and by SIOCGIWSTATS */ +struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + /* we are under RTNL - globally locked - so can use static structs */ + static struct iw_statistics wstats; + static struct station_info sinfo; + u8 bssid[ETH_ALEN]; + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) + return NULL; + + if (!rdev->ops->get_station) + return NULL; + + /* Grab BSSID of current BSS, if any */ + wdev_lock(wdev); + if (!wdev->current_bss) { + wdev_unlock(wdev); + return NULL; + } + memcpy(bssid, wdev->current_bss->pub.bssid, ETH_ALEN); + wdev_unlock(wdev); + + if (rdev->ops->get_station(&rdev->wiphy, dev, bssid, &sinfo)) + return NULL; + + memset(&wstats, 0, sizeof(wstats)); + + switch (rdev->wiphy.signal_type) { + case CFG80211_SIGNAL_TYPE_MBM: + if (sinfo.filled & STATION_INFO_SIGNAL) { + int sig = sinfo.signal; + wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED; + wstats.qual.updated |= IW_QUAL_QUAL_UPDATED; + wstats.qual.updated |= IW_QUAL_DBM; + wstats.qual.level = sig; + if (sig < -110) + sig = -110; + else if (sig > -40) + sig = -40; + wstats.qual.qual = sig + 110; + break; + } + case CFG80211_SIGNAL_TYPE_UNSPEC: + if (sinfo.filled & STATION_INFO_SIGNAL) { + wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED; + wstats.qual.updated |= IW_QUAL_QUAL_UPDATED; + wstats.qual.level = sinfo.signal; + wstats.qual.qual = sinfo.signal; + break; + } + default: + wstats.qual.updated |= IW_QUAL_LEVEL_INVALID; + wstats.qual.updated |= IW_QUAL_QUAL_INVALID; + } + + wstats.qual.updated |= IW_QUAL_NOISE_INVALID; + if (sinfo.filled & STATION_INFO_RX_DROP_MISC) + wstats.discard.misc = sinfo.rx_dropped_misc; + if (sinfo.filled & STATION_INFO_TX_FAILED) + wstats.discard.retries = sinfo.tx_failed; + + return &wstats; +} +EXPORT_SYMBOL_GPL(cfg80211_wireless_stats); + +int cfg80211_wext_siwap(struct net_device *dev, + struct iw_request_info *info, + struct sockaddr *ap_addr, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + return cfg80211_ibss_wext_siwap(dev, info, ap_addr, extra); + case NL80211_IFTYPE_STATION: + return 
cfg80211_mgd_wext_siwap(dev, info, ap_addr, extra); + case NL80211_IFTYPE_WDS: + return cfg80211_wds_wext_siwap(dev, info, ap_addr, extra); + default: + return -EOPNOTSUPP; + } +} +EXPORT_SYMBOL_GPL(cfg80211_wext_siwap); + +int cfg80211_wext_giwap(struct net_device *dev, + struct iw_request_info *info, + struct sockaddr *ap_addr, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + return cfg80211_ibss_wext_giwap(dev, info, ap_addr, extra); + case NL80211_IFTYPE_STATION: + return cfg80211_mgd_wext_giwap(dev, info, ap_addr, extra); + case NL80211_IFTYPE_WDS: + return cfg80211_wds_wext_giwap(dev, info, ap_addr, extra); + default: + return -EOPNOTSUPP; + } +} +EXPORT_SYMBOL_GPL(cfg80211_wext_giwap); + +int cfg80211_wext_siwessid(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *ssid) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + return cfg80211_ibss_wext_siwessid(dev, info, data, ssid); + case NL80211_IFTYPE_STATION: + return cfg80211_mgd_wext_siwessid(dev, info, data, ssid); + default: + return -EOPNOTSUPP; + } +} +EXPORT_SYMBOL_GPL(cfg80211_wext_siwessid); + +int cfg80211_wext_giwessid(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *ssid) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + data->flags = 0; + data->length = 0; + + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + return cfg80211_ibss_wext_giwessid(dev, info, data, ssid); + case NL80211_IFTYPE_STATION: + return cfg80211_mgd_wext_giwessid(dev, info, data, ssid); + default: + return -EOPNOTSUPP; + } +} +EXPORT_SYMBOL_GPL(cfg80211_wext_giwessid); + +int cfg80211_wext_siwpmksa(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct cfg80211_pmksa cfg_pmksa; + struct iw_pmksa *pmksa = (struct iw_pmksa *)extra; + + memset(&cfg_pmksa, 0, sizeof(struct cfg80211_pmksa)); + + if (wdev->iftype != NL80211_IFTYPE_STATION) + return -EINVAL; + + cfg_pmksa.bssid = pmksa->bssid.sa_data; + cfg_pmksa.pmkid = pmksa->pmkid; + + switch (pmksa->cmd) { + case IW_PMKSA_ADD: + if (!rdev->ops->set_pmksa) + return -EOPNOTSUPP; + + return rdev->ops->set_pmksa(&rdev->wiphy, dev, &cfg_pmksa); + + case IW_PMKSA_REMOVE: + if (!rdev->ops->del_pmksa) + return -EOPNOTSUPP; + + return rdev->ops->del_pmksa(&rdev->wiphy, dev, &cfg_pmksa); + + case IW_PMKSA_FLUSH: + if (!rdev->ops->flush_pmksa) + return -EOPNOTSUPP; + + return rdev->ops->flush_pmksa(&rdev->wiphy, dev); + + default: + return -EOPNOTSUPP; + } +} +EXPORT_SYMBOL_GPL(cfg80211_wext_siwpmksa); + +static const iw_handler cfg80211_handlers[] = { + [IW_IOCTL_IDX(SIOCGIWNAME)] = (iw_handler) cfg80211_wext_giwname, + [IW_IOCTL_IDX(SIOCSIWFREQ)] = (iw_handler) cfg80211_wext_siwfreq, + [IW_IOCTL_IDX(SIOCGIWFREQ)] = (iw_handler) cfg80211_wext_giwfreq, + [IW_IOCTL_IDX(SIOCSIWMODE)] = (iw_handler) cfg80211_wext_siwmode, + [IW_IOCTL_IDX(SIOCGIWMODE)] = (iw_handler) cfg80211_wext_giwmode, + [IW_IOCTL_IDX(SIOCGIWRANGE)] = (iw_handler) cfg80211_wext_giwrange, + [IW_IOCTL_IDX(SIOCSIWAP)] = (iw_handler) cfg80211_wext_siwap, + [IW_IOCTL_IDX(SIOCGIWAP)] = (iw_handler) cfg80211_wext_giwap, + [IW_IOCTL_IDX(SIOCSIWMLME)] = (iw_handler) cfg80211_wext_siwmlme, + [IW_IOCTL_IDX(SIOCSIWSCAN)] = (iw_handler) cfg80211_wext_siwscan, + 
[IW_IOCTL_IDX(SIOCGIWSCAN)] = (iw_handler) cfg80211_wext_giwscan, + [IW_IOCTL_IDX(SIOCSIWESSID)] = (iw_handler) cfg80211_wext_siwessid, + [IW_IOCTL_IDX(SIOCGIWESSID)] = (iw_handler) cfg80211_wext_giwessid, + [IW_IOCTL_IDX(SIOCSIWRATE)] = (iw_handler) cfg80211_wext_siwrate, + [IW_IOCTL_IDX(SIOCGIWRATE)] = (iw_handler) cfg80211_wext_giwrate, + [IW_IOCTL_IDX(SIOCSIWRTS)] = (iw_handler) cfg80211_wext_siwrts, + [IW_IOCTL_IDX(SIOCGIWRTS)] = (iw_handler) cfg80211_wext_giwrts, + [IW_IOCTL_IDX(SIOCSIWFRAG)] = (iw_handler) cfg80211_wext_siwfrag, + [IW_IOCTL_IDX(SIOCGIWFRAG)] = (iw_handler) cfg80211_wext_giwfrag, + [IW_IOCTL_IDX(SIOCSIWTXPOW)] = (iw_handler) cfg80211_wext_siwtxpower, + [IW_IOCTL_IDX(SIOCGIWTXPOW)] = (iw_handler) cfg80211_wext_giwtxpower, + [IW_IOCTL_IDX(SIOCSIWRETRY)] = (iw_handler) cfg80211_wext_siwretry, + [IW_IOCTL_IDX(SIOCGIWRETRY)] = (iw_handler) cfg80211_wext_giwretry, + [IW_IOCTL_IDX(SIOCSIWENCODE)] = (iw_handler) cfg80211_wext_siwencode, + [IW_IOCTL_IDX(SIOCGIWENCODE)] = (iw_handler) cfg80211_wext_giwencode, + [IW_IOCTL_IDX(SIOCSIWPOWER)] = (iw_handler) cfg80211_wext_siwpower, + [IW_IOCTL_IDX(SIOCGIWPOWER)] = (iw_handler) cfg80211_wext_giwpower, + [IW_IOCTL_IDX(SIOCSIWGENIE)] = (iw_handler) cfg80211_wext_siwgenie, + [IW_IOCTL_IDX(SIOCSIWAUTH)] = (iw_handler) cfg80211_wext_siwauth, + [IW_IOCTL_IDX(SIOCGIWAUTH)] = (iw_handler) cfg80211_wext_giwauth, + [IW_IOCTL_IDX(SIOCSIWENCODEEXT)]= (iw_handler) cfg80211_wext_siwencodeext, + [IW_IOCTL_IDX(SIOCSIWPMKSA)] = (iw_handler) cfg80211_wext_siwpmksa, +}; + +const struct iw_handler_def cfg80211_wext_handler = { + .num_standard = ARRAY_SIZE(cfg80211_handlers), + .standard = cfg80211_handlers, + .get_wireless_stats = cfg80211_wireless_stats, +}; diff --git a/net/wireless/wext-compat.h b/net/wireless/wext-compat.h new file mode 100644 index 00000000..20b3daef --- /dev/null +++ b/net/wireless/wext-compat.h @@ -0,0 +1,49 @@ +#ifndef __WEXT_COMPAT +#define __WEXT_COMPAT + +#include +#include + +int cfg80211_ibss_wext_siwfreq(struct net_device *dev, + struct iw_request_info *info, + struct iw_freq *freq, char *extra); +int cfg80211_ibss_wext_giwfreq(struct net_device *dev, + struct iw_request_info *info, + struct iw_freq *freq, char *extra); +int cfg80211_ibss_wext_siwap(struct net_device *dev, + struct iw_request_info *info, + struct sockaddr *ap_addr, char *extra); +int cfg80211_ibss_wext_giwap(struct net_device *dev, + struct iw_request_info *info, + struct sockaddr *ap_addr, char *extra); +int cfg80211_ibss_wext_siwessid(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *ssid); +int cfg80211_ibss_wext_giwessid(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *ssid); + +int cfg80211_mgd_wext_siwfreq(struct net_device *dev, + struct iw_request_info *info, + struct iw_freq *freq, char *extra); +int cfg80211_mgd_wext_giwfreq(struct net_device *dev, + struct iw_request_info *info, + struct iw_freq *freq, char *extra); +int cfg80211_mgd_wext_siwap(struct net_device *dev, + struct iw_request_info *info, + struct sockaddr *ap_addr, char *extra); +int cfg80211_mgd_wext_giwap(struct net_device *dev, + struct iw_request_info *info, + struct sockaddr *ap_addr, char *extra); +int cfg80211_mgd_wext_siwessid(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *ssid); +int cfg80211_mgd_wext_giwessid(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *ssid); + +int cfg80211_wext_freq(struct wiphy *wiphy, struct 
iw_freq *freq); + + +extern const struct iw_handler_def cfg80211_wext_handler; +#endif /* __WEXT_COMPAT */ diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c new file mode 100644 index 00000000..fdbc23c1 --- /dev/null +++ b/net/wireless/wext-core.c @@ -0,0 +1,1084 @@ +/* + * This file implement the Wireless Extensions core API. + * + * Authors : Jean Tourrilhes - HPL - + * Copyright (c) 1997-2007 Jean Tourrilhes, All Rights Reserved. + * Copyright 2009 Johannes Berg + * + * (As all part of the Linux kernel, this file is GPL) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef int (*wext_ioctl_func)(struct net_device *, struct iwreq *, + unsigned int, struct iw_request_info *, + iw_handler); + + +/* + * Meta-data about all the standard Wireless Extension request we + * know about. + */ +static const struct iw_ioctl_description standard_ioctl[] = { + [IW_IOCTL_IDX(SIOCSIWCOMMIT)] = { + .header_type = IW_HEADER_TYPE_NULL, + }, + [IW_IOCTL_IDX(SIOCGIWNAME)] = { + .header_type = IW_HEADER_TYPE_CHAR, + .flags = IW_DESCR_FLAG_DUMP, + }, + [IW_IOCTL_IDX(SIOCSIWNWID)] = { + .header_type = IW_HEADER_TYPE_PARAM, + .flags = IW_DESCR_FLAG_EVENT, + }, + [IW_IOCTL_IDX(SIOCGIWNWID)] = { + .header_type = IW_HEADER_TYPE_PARAM, + .flags = IW_DESCR_FLAG_DUMP, + }, + [IW_IOCTL_IDX(SIOCSIWFREQ)] = { + .header_type = IW_HEADER_TYPE_FREQ, + .flags = IW_DESCR_FLAG_EVENT, + }, + [IW_IOCTL_IDX(SIOCGIWFREQ)] = { + .header_type = IW_HEADER_TYPE_FREQ, + .flags = IW_DESCR_FLAG_DUMP, + }, + [IW_IOCTL_IDX(SIOCSIWMODE)] = { + .header_type = IW_HEADER_TYPE_UINT, + .flags = IW_DESCR_FLAG_EVENT, + }, + [IW_IOCTL_IDX(SIOCGIWMODE)] = { + .header_type = IW_HEADER_TYPE_UINT, + .flags = IW_DESCR_FLAG_DUMP, + }, + [IW_IOCTL_IDX(SIOCSIWSENS)] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [IW_IOCTL_IDX(SIOCGIWSENS)] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [IW_IOCTL_IDX(SIOCSIWRANGE)] = { + .header_type = IW_HEADER_TYPE_NULL, + }, + [IW_IOCTL_IDX(SIOCGIWRANGE)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = sizeof(struct iw_range), + .flags = IW_DESCR_FLAG_DUMP, + }, + [IW_IOCTL_IDX(SIOCSIWPRIV)] = { + .header_type = IW_HEADER_TYPE_NULL, + }, + [IW_IOCTL_IDX(SIOCGIWPRIV)] = { /* (handled directly by us) */ + .header_type = IW_HEADER_TYPE_POINT, + .token_size = sizeof(struct iw_priv_args), + .max_tokens = 16, + .flags = IW_DESCR_FLAG_NOMAX, + }, + [IW_IOCTL_IDX(SIOCSIWSTATS)] = { + .header_type = IW_HEADER_TYPE_NULL, + }, + [IW_IOCTL_IDX(SIOCGIWSTATS)] = { /* (handled directly by us) */ + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = sizeof(struct iw_statistics), + .flags = IW_DESCR_FLAG_DUMP, + }, + [IW_IOCTL_IDX(SIOCSIWSPY)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = sizeof(struct sockaddr), + .max_tokens = IW_MAX_SPY, + }, + [IW_IOCTL_IDX(SIOCGIWSPY)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = sizeof(struct sockaddr) + + sizeof(struct iw_quality), + .max_tokens = IW_MAX_SPY, + }, + [IW_IOCTL_IDX(SIOCSIWTHRSPY)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = sizeof(struct iw_thrspy), + .min_tokens = 1, + .max_tokens = 1, + }, + [IW_IOCTL_IDX(SIOCGIWTHRSPY)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = sizeof(struct iw_thrspy), + .min_tokens = 1, + .max_tokens = 1, + }, + [IW_IOCTL_IDX(SIOCSIWAP)] = { + .header_type = IW_HEADER_TYPE_ADDR, + }, + [IW_IOCTL_IDX(SIOCGIWAP)] = { + .header_type = IW_HEADER_TYPE_ADDR, + .flags 
= IW_DESCR_FLAG_DUMP, + }, + [IW_IOCTL_IDX(SIOCSIWMLME)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .min_tokens = sizeof(struct iw_mlme), + .max_tokens = sizeof(struct iw_mlme), + }, + [IW_IOCTL_IDX(SIOCGIWAPLIST)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = sizeof(struct sockaddr) + + sizeof(struct iw_quality), + .max_tokens = IW_MAX_AP, + .flags = IW_DESCR_FLAG_NOMAX, + }, + [IW_IOCTL_IDX(SIOCSIWSCAN)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .min_tokens = 0, + .max_tokens = sizeof(struct iw_scan_req), + }, + [IW_IOCTL_IDX(SIOCGIWSCAN)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_SCAN_MAX_DATA, + .flags = IW_DESCR_FLAG_NOMAX, + }, + [IW_IOCTL_IDX(SIOCSIWESSID)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_ESSID_MAX_SIZE, + .flags = IW_DESCR_FLAG_EVENT, + }, + [IW_IOCTL_IDX(SIOCGIWESSID)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_ESSID_MAX_SIZE, + .flags = IW_DESCR_FLAG_DUMP, + }, + [IW_IOCTL_IDX(SIOCSIWNICKN)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_ESSID_MAX_SIZE, + }, + [IW_IOCTL_IDX(SIOCGIWNICKN)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_ESSID_MAX_SIZE, + }, + [IW_IOCTL_IDX(SIOCSIWRATE)] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [IW_IOCTL_IDX(SIOCGIWRATE)] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [IW_IOCTL_IDX(SIOCSIWRTS)] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [IW_IOCTL_IDX(SIOCGIWRTS)] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [IW_IOCTL_IDX(SIOCSIWFRAG)] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [IW_IOCTL_IDX(SIOCGIWFRAG)] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [IW_IOCTL_IDX(SIOCSIWTXPOW)] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [IW_IOCTL_IDX(SIOCGIWTXPOW)] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [IW_IOCTL_IDX(SIOCSIWRETRY)] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [IW_IOCTL_IDX(SIOCGIWRETRY)] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [IW_IOCTL_IDX(SIOCSIWENCODE)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_ENCODING_TOKEN_MAX, + .flags = IW_DESCR_FLAG_EVENT | IW_DESCR_FLAG_RESTRICT, + }, + [IW_IOCTL_IDX(SIOCGIWENCODE)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_ENCODING_TOKEN_MAX, + .flags = IW_DESCR_FLAG_DUMP | IW_DESCR_FLAG_RESTRICT, + }, + [IW_IOCTL_IDX(SIOCSIWPOWER)] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [IW_IOCTL_IDX(SIOCGIWPOWER)] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [IW_IOCTL_IDX(SIOCSIWGENIE)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_GENERIC_IE_MAX, + }, + [IW_IOCTL_IDX(SIOCGIWGENIE)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_GENERIC_IE_MAX, + }, + [IW_IOCTL_IDX(SIOCSIWAUTH)] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [IW_IOCTL_IDX(SIOCGIWAUTH)] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [IW_IOCTL_IDX(SIOCSIWENCODEEXT)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .min_tokens = sizeof(struct iw_encode_ext), + .max_tokens = sizeof(struct iw_encode_ext) + + IW_ENCODING_TOKEN_MAX, + }, + [IW_IOCTL_IDX(SIOCGIWENCODEEXT)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .min_tokens = sizeof(struct iw_encode_ext), + .max_tokens = sizeof(struct iw_encode_ext) + + IW_ENCODING_TOKEN_MAX, + }, + [IW_IOCTL_IDX(SIOCSIWPMKSA)] 
= { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .min_tokens = sizeof(struct iw_pmksa), + .max_tokens = sizeof(struct iw_pmksa), + }, +}; +static const unsigned standard_ioctl_num = ARRAY_SIZE(standard_ioctl); + +/* + * Meta-data about all the additional standard Wireless Extension events + * we know about. + */ +static const struct iw_ioctl_description standard_event[] = { + [IW_EVENT_IDX(IWEVTXDROP)] = { + .header_type = IW_HEADER_TYPE_ADDR, + }, + [IW_EVENT_IDX(IWEVQUAL)] = { + .header_type = IW_HEADER_TYPE_QUAL, + }, + [IW_EVENT_IDX(IWEVCUSTOM)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_CUSTOM_MAX, + }, + [IW_EVENT_IDX(IWEVREGISTERED)] = { + .header_type = IW_HEADER_TYPE_ADDR, + }, + [IW_EVENT_IDX(IWEVEXPIRED)] = { + .header_type = IW_HEADER_TYPE_ADDR, + }, + [IW_EVENT_IDX(IWEVGENIE)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_GENERIC_IE_MAX, + }, + [IW_EVENT_IDX(IWEVMICHAELMICFAILURE)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = sizeof(struct iw_michaelmicfailure), + }, + [IW_EVENT_IDX(IWEVASSOCREQIE)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_GENERIC_IE_MAX, + }, + [IW_EVENT_IDX(IWEVASSOCRESPIE)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_GENERIC_IE_MAX, + }, + [IW_EVENT_IDX(IWEVPMKIDCAND)] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = sizeof(struct iw_pmkid_cand), + }, +}; +static const unsigned standard_event_num = ARRAY_SIZE(standard_event); + +/* Size (in bytes) of various events */ +static const int event_type_size[] = { + IW_EV_LCP_LEN, /* IW_HEADER_TYPE_NULL */ + 0, + IW_EV_CHAR_LEN, /* IW_HEADER_TYPE_CHAR */ + 0, + IW_EV_UINT_LEN, /* IW_HEADER_TYPE_UINT */ + IW_EV_FREQ_LEN, /* IW_HEADER_TYPE_FREQ */ + IW_EV_ADDR_LEN, /* IW_HEADER_TYPE_ADDR */ + 0, + IW_EV_POINT_LEN, /* Without variable payload */ + IW_EV_PARAM_LEN, /* IW_HEADER_TYPE_PARAM */ + IW_EV_QUAL_LEN, /* IW_HEADER_TYPE_QUAL */ +}; + +#ifdef CONFIG_COMPAT +static const int compat_event_type_size[] = { + IW_EV_COMPAT_LCP_LEN, /* IW_HEADER_TYPE_NULL */ + 0, + IW_EV_COMPAT_CHAR_LEN, /* IW_HEADER_TYPE_CHAR */ + 0, + IW_EV_COMPAT_UINT_LEN, /* IW_HEADER_TYPE_UINT */ + IW_EV_COMPAT_FREQ_LEN, /* IW_HEADER_TYPE_FREQ */ + IW_EV_COMPAT_ADDR_LEN, /* IW_HEADER_TYPE_ADDR */ + 0, + IW_EV_COMPAT_POINT_LEN, /* Without variable payload */ + IW_EV_COMPAT_PARAM_LEN, /* IW_HEADER_TYPE_PARAM */ + IW_EV_COMPAT_QUAL_LEN, /* IW_HEADER_TYPE_QUAL */ +}; +#endif + + +/* IW event code */ + +static int __net_init wext_pernet_init(struct net *net) +{ + skb_queue_head_init(&net->wext_nlevents); + return 0; +} + +static void __net_exit wext_pernet_exit(struct net *net) +{ + skb_queue_purge(&net->wext_nlevents); +} + +static struct pernet_operations wext_pernet_ops = { + .init = wext_pernet_init, + .exit = wext_pernet_exit, +}; + +static int __init wireless_nlevent_init(void) +{ + return register_pernet_subsys(&wext_pernet_ops); +} + +subsys_initcall(wireless_nlevent_init); + +/* Process events generated by the wireless layer or the driver. 
*/ +static void wireless_nlevent_process(struct work_struct *work) +{ + struct sk_buff *skb; + struct net *net; + + rtnl_lock(); + + for_each_net(net) { + while ((skb = skb_dequeue(&net->wext_nlevents))) + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, + GFP_KERNEL); + } + + rtnl_unlock(); +} + +static DECLARE_WORK(wireless_nlevent_work, wireless_nlevent_process); + +static struct nlmsghdr *rtnetlink_ifinfo_prep(struct net_device *dev, + struct sk_buff *skb) +{ + struct ifinfomsg *r; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, 0, 0, RTM_NEWLINK, sizeof(*r), 0); + if (!nlh) + return NULL; + + r = nlmsg_data(nlh); + r->ifi_family = AF_UNSPEC; + r->__ifi_pad = 0; + r->ifi_type = dev->type; + r->ifi_index = dev->ifindex; + r->ifi_flags = dev_get_flags(dev); + r->ifi_change = 0; /* Wireless changes don't affect those flags */ + + NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name); + + return nlh; + nla_put_failure: + nlmsg_cancel(skb, nlh); + return NULL; +} + + +/* + * Main event dispatcher. Called from other parts and drivers. + * Send the event on the appropriate channels. + * May be called from interrupt context. + */ +void wireless_send_event(struct net_device * dev, + unsigned int cmd, + union iwreq_data * wrqu, + const char * extra) +{ + const struct iw_ioctl_description * descr = NULL; + int extra_len = 0; + struct iw_event *event; /* Mallocated whole event */ + int event_len; /* Its size */ + int hdr_len; /* Size of the event header */ + int wrqu_off = 0; /* Offset in wrqu */ + /* Don't "optimise" the following variable, it will crash */ + unsigned cmd_index; /* *MUST* be unsigned */ + struct sk_buff *skb; + struct nlmsghdr *nlh; + struct nlattr *nla; +#ifdef CONFIG_COMPAT + struct __compat_iw_event *compat_event; + struct compat_iw_point compat_wrqu; + struct sk_buff *compskb; +#endif + + /* + * Nothing in the kernel sends scan events with data, be safe. + * This is necessary because we cannot fix up scan event data + * for compat, due to being contained in 'extra', but normally + * applications are required to retrieve the scan data anyway + * and no data is included in the event, this codifies that + * practice. + */ + if (WARN_ON(cmd == SIOCGIWSCAN && extra)) + extra = NULL; + + /* Get the description of the Event */ + if (cmd <= SIOCIWLAST) { + cmd_index = IW_IOCTL_IDX(cmd); + if (cmd_index < standard_ioctl_num) + descr = &(standard_ioctl[cmd_index]); + } else { + cmd_index = IW_EVENT_IDX(cmd); + if (cmd_index < standard_event_num) + descr = &(standard_event[cmd_index]); + } + /* Don't accept unknown events */ + if (descr == NULL) { + /* Note : we don't return an error to the driver, because + * the driver would not know what to do about it. It can't + * return an error to the user, because the event is not + * initiated by a user request. + * The best the driver could do is to log an error message. + * We will do it ourselves instead... 
+ */ + netdev_err(dev, "(WE) : Invalid/Unknown Wireless Event (0x%04X)\n", + cmd); + return; + } + + /* Check extra parameters and set extra_len */ + if (descr->header_type == IW_HEADER_TYPE_POINT) { + /* Check if number of token fits within bounds */ + if (wrqu->data.length > descr->max_tokens) { + netdev_err(dev, "(WE) : Wireless Event too big (%d)\n", + wrqu->data.length); + return; + } + if (wrqu->data.length < descr->min_tokens) { + netdev_err(dev, "(WE) : Wireless Event too small (%d)\n", + wrqu->data.length); + return; + } + /* Calculate extra_len - extra is NULL for restricted events */ + if (extra != NULL) + extra_len = wrqu->data.length * descr->token_size; + /* Always at an offset in wrqu */ + wrqu_off = IW_EV_POINT_OFF; + } + + /* Total length of the event */ + hdr_len = event_type_size[descr->header_type]; + event_len = hdr_len + extra_len; + + /* + * The problem for 64/32 bit. + * + * On 64-bit, a regular event is laid out as follows: + * | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | + * | event.len | event.cmd | p a d d i n g | + * | wrqu data ... (with the correct size) | + * + * This padding exists because we manipulate event->u, + * and 'event' is not packed. + * + * An iw_point event is laid out like this instead: + * | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | + * | event.len | event.cmd | p a d d i n g | + * | iwpnt.len | iwpnt.flg | p a d d i n g | + * | extra data ... + * + * The second padding exists because struct iw_point is extended, + * but this depends on the platform... + * + * On 32-bit, all the padding shouldn't be there. + */ + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!skb) + return; + + /* Send via the RtNetlink event channel */ + nlh = rtnetlink_ifinfo_prep(dev, skb); + if (WARN_ON(!nlh)) { + kfree_skb(skb); + return; + } + + /* Add the wireless events in the netlink packet */ + nla = nla_reserve(skb, IFLA_WIRELESS, event_len); + if (!nla) { + kfree_skb(skb); + return; + } + event = nla_data(nla); + + /* Fill event - first clear to avoid data leaking */ + memset(event, 0, hdr_len); + event->len = event_len; + event->cmd = cmd; + memcpy(&event->u, ((char *) wrqu) + wrqu_off, hdr_len - IW_EV_LCP_LEN); + if (extra_len) + memcpy(((char *) event) + hdr_len, extra, extra_len); + + nlmsg_end(skb, nlh); +#ifdef CONFIG_COMPAT + hdr_len = compat_event_type_size[descr->header_type]; + event_len = hdr_len + extra_len; + + compskb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!compskb) { + kfree_skb(skb); + return; + } + + /* Send via the RtNetlink event channel */ + nlh = rtnetlink_ifinfo_prep(dev, compskb); + if (WARN_ON(!nlh)) { + kfree_skb(skb); + kfree_skb(compskb); + return; + } + + /* Add the wireless events in the netlink packet */ + nla = nla_reserve(compskb, IFLA_WIRELESS, event_len); + if (!nla) { + kfree_skb(skb); + kfree_skb(compskb); + return; + } + compat_event = nla_data(nla); + + compat_event->len = event_len; + compat_event->cmd = cmd; + if (descr->header_type == IW_HEADER_TYPE_POINT) { + compat_wrqu.length = wrqu->data.length; + compat_wrqu.flags = wrqu->data.flags; + memcpy(&compat_event->pointer, + ((char *) &compat_wrqu) + IW_EV_COMPAT_POINT_OFF, + hdr_len - IW_EV_COMPAT_LCP_LEN); + if (extra_len) + memcpy(((char *) compat_event) + hdr_len, + extra, extra_len); + } else { + /* extra_len must be zero, so no if (extra) needed */ + memcpy(&compat_event->pointer, wrqu, + hdr_len - IW_EV_COMPAT_LCP_LEN); + } + + nlmsg_end(compskb, nlh); + + skb_shinfo(skb)->frag_list = compskb; +#endif + skb_queue_tail(&dev_net(dev)->wext_nlevents, skb); + 
schedule_work(&wireless_nlevent_work); +} +EXPORT_SYMBOL(wireless_send_event); + + + +/* IW handlers */ + +struct iw_statistics *get_wireless_stats(struct net_device *dev) +{ +#ifdef CONFIG_WIRELESS_EXT + if ((dev->wireless_handlers != NULL) && + (dev->wireless_handlers->get_wireless_stats != NULL)) + return dev->wireless_handlers->get_wireless_stats(dev); +#endif + +#ifdef CONFIG_CFG80211_WEXT + if (dev->ieee80211_ptr && + dev->ieee80211_ptr->wiphy && + dev->ieee80211_ptr->wiphy->wext && + dev->ieee80211_ptr->wiphy->wext->get_wireless_stats) + return dev->ieee80211_ptr->wiphy->wext->get_wireless_stats(dev); +#endif + + /* not found */ + return NULL; +} + +static int iw_handler_get_iwstats(struct net_device * dev, + struct iw_request_info * info, + union iwreq_data * wrqu, + char * extra) +{ + /* Get stats from the driver */ + struct iw_statistics *stats; + + stats = get_wireless_stats(dev); + if (stats) { + /* Copy statistics to extra */ + memcpy(extra, stats, sizeof(struct iw_statistics)); + wrqu->data.length = sizeof(struct iw_statistics); + + /* Check if we need to clear the updated flag */ + if (wrqu->data.flags != 0) + stats->qual.updated &= ~IW_QUAL_ALL_UPDATED; + return 0; + } else + return -EOPNOTSUPP; +} + +static iw_handler get_handler(struct net_device *dev, unsigned int cmd) +{ + /* Don't "optimise" the following variable, it will crash */ + unsigned int index; /* *MUST* be unsigned */ + const struct iw_handler_def *handlers = NULL; + +#ifdef CONFIG_CFG80211_WEXT + if (dev->ieee80211_ptr && dev->ieee80211_ptr->wiphy) + handlers = dev->ieee80211_ptr->wiphy->wext; +#endif +#ifdef CONFIG_WIRELESS_EXT + if (dev->wireless_handlers) + handlers = dev->wireless_handlers; +#endif + + if (!handlers) + return NULL; + + /* Try as a standard command */ + index = IW_IOCTL_IDX(cmd); + if (index < handlers->num_standard) + return handlers->standard[index]; + +#ifdef CONFIG_WEXT_PRIV + /* Try as a private command */ + index = cmd - SIOCIWFIRSTPRIV; + if (index < handlers->num_private) + return handlers->private[index]; +#endif + + /* Not found */ + return NULL; +} + +static int ioctl_standard_iw_point(struct iw_point *iwp, unsigned int cmd, + const struct iw_ioctl_description *descr, + iw_handler handler, struct net_device *dev, + struct iw_request_info *info) +{ + int err, extra_size, user_length = 0, essid_compat = 0; + char *extra; + + /* Calculate space needed by arguments. Always allocate + * for max space. 
+ */ + extra_size = descr->max_tokens * descr->token_size; + + /* Check need for ESSID compatibility for WE < 21 */ + switch (cmd) { + case SIOCSIWESSID: + case SIOCGIWESSID: + case SIOCSIWNICKN: + case SIOCGIWNICKN: + if (iwp->length == descr->max_tokens + 1) + essid_compat = 1; + else if (IW_IS_SET(cmd) && (iwp->length != 0)) { + char essid[IW_ESSID_MAX_SIZE + 1]; + unsigned int len; + len = iwp->length * descr->token_size; + + if (len > IW_ESSID_MAX_SIZE) + return -EFAULT; + + err = copy_from_user(essid, iwp->pointer, len); + if (err) + return -EFAULT; + + if (essid[iwp->length - 1] == '\0') + essid_compat = 1; + } + break; + default: + break; + } + + iwp->length -= essid_compat; + + /* Check what user space is giving us */ + if (IW_IS_SET(cmd)) { + /* Check NULL pointer */ + if (!iwp->pointer && iwp->length != 0) + return -EFAULT; + /* Check if number of token fits within bounds */ + if (iwp->length > descr->max_tokens) + return -E2BIG; + if (iwp->length < descr->min_tokens) + return -EINVAL; + } else { + /* Check NULL pointer */ + if (!iwp->pointer) + return -EFAULT; + /* Save user space buffer size for checking */ + user_length = iwp->length; + + /* Don't check if user_length > max to allow forward + * compatibility. The test user_length < min is + * implied by the test at the end. + */ + + /* Support for very large requests */ + if ((descr->flags & IW_DESCR_FLAG_NOMAX) && + (user_length > descr->max_tokens)) { + /* Allow userspace to GET more than max so + * we can support any size GET requests. + * There is still a limit : -ENOMEM. + */ + extra_size = user_length * descr->token_size; + + /* Note : user_length is originally a __u16, + * and token_size is controlled by us, + * so extra_size won't get negative and + * won't overflow... + */ + } + } + + /* kzalloc() ensures NULL-termination for essid_compat. */ + extra = kzalloc(extra_size, GFP_KERNEL); + if (!extra) + return -ENOMEM; + + /* If it is a SET, get all the extra data in here */ + if (IW_IS_SET(cmd) && (iwp->length != 0)) { + if (copy_from_user(extra, iwp->pointer, + iwp->length * + descr->token_size)) { + err = -EFAULT; + goto out; + } + + if (cmd == SIOCSIWENCODEEXT) { + struct iw_encode_ext *ee = (void *) extra; + + if (iwp->length < sizeof(*ee) + ee->key_len) + return -EFAULT; + } + } + + if (IW_IS_GET(cmd) && !(descr->flags & IW_DESCR_FLAG_NOMAX)) { + /* + * If this is a GET, but not NOMAX, it means that the extra + * data is not bounded by userspace, but by max_tokens. Thus + * set the length to max_tokens. This matches the extra data + * allocation. + * The driver should fill it with the number of tokens it + * provided, and it may check iwp->length rather than having + * knowledge of max_tokens. If the driver doesn't change the + * iwp->length, this ioctl just copies back max_token tokens + * filled with zeroes. Hopefully the driver isn't claiming + * them to be valid data. 
+ */ + iwp->length = descr->max_tokens; + } + + err = handler(dev, info, (union iwreq_data *) iwp, extra); + + iwp->length += essid_compat; + + /* If we have something to return to the user */ + if (!err && IW_IS_GET(cmd)) { + /* Check if there is enough buffer up there */ + if (user_length < iwp->length) { + err = -E2BIG; + goto out; + } + + if (copy_to_user(iwp->pointer, extra, + iwp->length * + descr->token_size)) { + err = -EFAULT; + goto out; + } + } + + /* Generate an event to notify listeners of the change */ + if ((descr->flags & IW_DESCR_FLAG_EVENT) && + ((err == 0) || (err == -EIWCOMMIT))) { + union iwreq_data *data = (union iwreq_data *) iwp; + + if (descr->flags & IW_DESCR_FLAG_RESTRICT) + /* If the event is restricted, don't + * export the payload. + */ + wireless_send_event(dev, cmd, data, NULL); + else + wireless_send_event(dev, cmd, data, extra); + } + +out: + kfree(extra); + return err; +} + +/* + * Call the commit handler in the driver + * (if exist and if conditions are right) + * + * Note : our current commit strategy is currently pretty dumb, + * but we will be able to improve on that... + * The goal is to try to agreagate as many changes as possible + * before doing the commit. Drivers that will define a commit handler + * are usually those that need a reset after changing parameters, so + * we want to minimise the number of reset. + * A cool idea is to use a timer : at each "set" command, we re-set the + * timer, when the timer eventually fires, we call the driver. + * Hopefully, more on that later. + * + * Also, I'm waiting to see how many people will complain about the + * netif_running(dev) test. I'm open on that one... + * Hopefully, the driver will remember to do a commit in "open()" ;-) + */ +int call_commit_handler(struct net_device *dev) +{ +#ifdef CONFIG_WIRELESS_EXT + if ((netif_running(dev)) && + (dev->wireless_handlers->standard[0] != NULL)) + /* Call the commit handler on the driver */ + return dev->wireless_handlers->standard[0](dev, NULL, + NULL, NULL); + else + return 0; /* Command completed successfully */ +#else + /* cfg80211 has no commit */ + return 0; +#endif +} + +/* + * Main IOCTl dispatcher. + * Check the type of IOCTL and call the appropriate wrapper... + */ +static int wireless_process_ioctl(struct net *net, struct ifreq *ifr, + unsigned int cmd, + struct iw_request_info *info, + wext_ioctl_func standard, + wext_ioctl_func private) +{ + struct iwreq *iwr = (struct iwreq *) ifr; + struct net_device *dev; + iw_handler handler; + + /* Permissions are already checked in dev_ioctl() before calling us. + * The copy_to/from_user() of ifr is also dealt with in there */ + + /* Make sure the device exist */ + if ((dev = __dev_get_by_name(net, ifr->ifr_name)) == NULL) + return -ENODEV; + + /* A bunch of special cases, then the generic case... 
+ * Note that 'cmd' is already filtered in dev_ioctl() with + * (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) */ + if (cmd == SIOCGIWSTATS) + return standard(dev, iwr, cmd, info, + &iw_handler_get_iwstats); + +#ifdef CONFIG_WEXT_PRIV + if (cmd == SIOCGIWPRIV && dev->wireless_handlers) + return standard(dev, iwr, cmd, info, + iw_handler_get_private); +#endif + + /* Basic check */ + if (!netif_device_present(dev)) + return -ENODEV; + + /* New driver API : try to find the handler */ + handler = get_handler(dev, cmd); + if (handler) { + /* Standard and private are not the same */ + if (cmd < SIOCIWFIRSTPRIV) + return standard(dev, iwr, cmd, info, handler); + else if (private) + return private(dev, iwr, cmd, info, handler); + } + /* Old driver API : call driver ioctl handler */ + if (dev->netdev_ops->ndo_do_ioctl) + return dev->netdev_ops->ndo_do_ioctl(dev, ifr, cmd); + return -EOPNOTSUPP; +} + +/* If command is `set a parameter', or `get the encoding parameters', + * check if the user has the right to do it. + */ +static int wext_permission_check(unsigned int cmd) +{ + if ((IW_IS_SET(cmd) || cmd == SIOCGIWENCODE || + cmd == SIOCGIWENCODEEXT) && + !capable(CAP_NET_ADMIN)) + return -EPERM; + + return 0; +} + +/* entry point from dev ioctl */ +static int wext_ioctl_dispatch(struct net *net, struct ifreq *ifr, + unsigned int cmd, struct iw_request_info *info, + wext_ioctl_func standard, + wext_ioctl_func private) +{ + int ret = wext_permission_check(cmd); + + if (ret) + return ret; + + dev_load(net, ifr->ifr_name); + rtnl_lock(); + ret = wireless_process_ioctl(net, ifr, cmd, info, standard, private); + rtnl_unlock(); + + return ret; +} + +/* + * Wrapper to call a standard Wireless Extension handler. + * We do various checks and also take care of moving data between + * user space and kernel space. + */ +static int ioctl_standard_call(struct net_device * dev, + struct iwreq *iwr, + unsigned int cmd, + struct iw_request_info *info, + iw_handler handler) +{ + const struct iw_ioctl_description * descr; + int ret = -EINVAL; + + /* Get the description of the IOCTL */ + if (IW_IOCTL_IDX(cmd) >= standard_ioctl_num) + return -EOPNOTSUPP; + descr = &(standard_ioctl[IW_IOCTL_IDX(cmd)]); + + /* Check if we have a pointer to user space data or not */ + if (descr->header_type != IW_HEADER_TYPE_POINT) { + + /* No extra arguments. 
Trivial to handle */ + ret = handler(dev, info, &(iwr->u), NULL); + + /* Generate an event to notify listeners of the change */ + if ((descr->flags & IW_DESCR_FLAG_EVENT) && + ((ret == 0) || (ret == -EIWCOMMIT))) + wireless_send_event(dev, cmd, &(iwr->u), NULL); + } else { + ret = ioctl_standard_iw_point(&iwr->u.data, cmd, descr, + handler, dev, info); + } + + /* Call commit handler if needed and defined */ + if (ret == -EIWCOMMIT) + ret = call_commit_handler(dev); + + /* Here, we will generate the appropriate event if needed */ + + return ret; +} + + +int wext_handle_ioctl(struct net *net, struct ifreq *ifr, unsigned int cmd, + void __user *arg) +{ + struct iw_request_info info = { .cmd = cmd, .flags = 0 }; + int ret; + + ret = wext_ioctl_dispatch(net, ifr, cmd, &info, + ioctl_standard_call, + ioctl_private_call); + if (ret >= 0 && + IW_IS_GET(cmd) && + copy_to_user(arg, ifr, sizeof(struct iwreq))) + return -EFAULT; + + return ret; +} + +#ifdef CONFIG_COMPAT +static int compat_standard_call(struct net_device *dev, + struct iwreq *iwr, + unsigned int cmd, + struct iw_request_info *info, + iw_handler handler) +{ + const struct iw_ioctl_description *descr; + struct compat_iw_point *iwp_compat; + struct iw_point iwp; + int err; + + descr = standard_ioctl + IW_IOCTL_IDX(cmd); + + if (descr->header_type != IW_HEADER_TYPE_POINT) + return ioctl_standard_call(dev, iwr, cmd, info, handler); + + iwp_compat = (struct compat_iw_point *) &iwr->u.data; + iwp.pointer = compat_ptr(iwp_compat->pointer); + iwp.length = iwp_compat->length; + iwp.flags = iwp_compat->flags; + + err = ioctl_standard_iw_point(&iwp, cmd, descr, handler, dev, info); + + iwp_compat->pointer = ptr_to_compat(iwp.pointer); + iwp_compat->length = iwp.length; + iwp_compat->flags = iwp.flags; + + return err; +} + +int compat_wext_handle_ioctl(struct net *net, unsigned int cmd, + unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct iw_request_info info; + struct iwreq iwr; + char *colon; + int ret; + + if (copy_from_user(&iwr, argp, sizeof(struct iwreq))) + return -EFAULT; + + iwr.ifr_name[IFNAMSIZ-1] = 0; + colon = strchr(iwr.ifr_name, ':'); + if (colon) + *colon = 0; + + info.cmd = cmd; + info.flags = IW_REQUEST_FLAG_COMPAT; + + ret = wext_ioctl_dispatch(net, (struct ifreq *) &iwr, cmd, &info, + compat_standard_call, + compat_private_call); + + if (ret >= 0 && + IW_IS_GET(cmd) && + copy_to_user(argp, &iwr, sizeof(struct iwreq))) + return -EFAULT; + + return ret; +} +#endif diff --git a/net/wireless/wext-priv.c b/net/wireless/wext-priv.c new file mode 100644 index 00000000..674d426a --- /dev/null +++ b/net/wireless/wext-priv.c @@ -0,0 +1,249 @@ +/* + * This file implement the Wireless Extensions priv API. + * + * Authors : Jean Tourrilhes - HPL - + * Copyright (c) 1997-2007 Jean Tourrilhes, All Rights Reserved. + * Copyright 2009 Johannes Berg + * + * (As all part of the Linux kernel, this file is GPL) + */ +#include +#include +#include +#include +#include + +int iw_handler_get_private(struct net_device * dev, + struct iw_request_info * info, + union iwreq_data * wrqu, + char * extra) +{ + /* Check if the driver has something to export */ + if ((dev->wireless_handlers->num_private_args == 0) || + (dev->wireless_handlers->private_args == NULL)) + return -EOPNOTSUPP; + + /* Check if there is enough buffer up there */ + if (wrqu->data.length < dev->wireless_handlers->num_private_args) { + /* User space can't know in advance how large the buffer + * needs to be. 
Give it a hint, so that we can support + * any size buffer we want somewhat efficiently... */ + wrqu->data.length = dev->wireless_handlers->num_private_args; + return -E2BIG; + } + + /* Set the number of available ioctls. */ + wrqu->data.length = dev->wireless_handlers->num_private_args; + + /* Copy structure to the user buffer. */ + memcpy(extra, dev->wireless_handlers->private_args, + sizeof(struct iw_priv_args) * wrqu->data.length); + + return 0; +} + +/* Size (in bytes) of the various private data types */ +static const char iw_priv_type_size[] = { + 0, /* IW_PRIV_TYPE_NONE */ + 1, /* IW_PRIV_TYPE_BYTE */ + 1, /* IW_PRIV_TYPE_CHAR */ + 0, /* Not defined */ + sizeof(__u32), /* IW_PRIV_TYPE_INT */ + sizeof(struct iw_freq), /* IW_PRIV_TYPE_FLOAT */ + sizeof(struct sockaddr), /* IW_PRIV_TYPE_ADDR */ + 0, /* Not defined */ +}; + +static int get_priv_size(__u16 args) +{ + int num = args & IW_PRIV_SIZE_MASK; + int type = (args & IW_PRIV_TYPE_MASK) >> 12; + + return num * iw_priv_type_size[type]; +} + +static int adjust_priv_size(__u16 args, struct iw_point *iwp) +{ + int num = iwp->length; + int max = args & IW_PRIV_SIZE_MASK; + int type = (args & IW_PRIV_TYPE_MASK) >> 12; + + /* Make sure the driver doesn't goof up */ + if (max < num) + num = max; + + return num * iw_priv_type_size[type]; +} + +/* + * Wrapper to call a private Wireless Extension handler. + * We do various checks and also take care of moving data between + * user space and kernel space. + * It's not as nice and slimline as the standard wrapper. The cause + * is struct iw_priv_args, which was not really designed for the + * job we are going here. + * + * IMPORTANT : This function prevent to set and get data on the same + * IOCTL and enforce the SET/GET convention. Not doing it would be + * far too hairy... + * If you need to set and get data at the same time, please don't use + * a iw_handler but process it in your ioctl handler (i.e. use the + * old driver API). + */ +static int get_priv_descr_and_size(struct net_device *dev, unsigned int cmd, + const struct iw_priv_args **descrp) +{ + const struct iw_priv_args *descr; + int i, extra_size; + + descr = NULL; + for (i = 0; i < dev->wireless_handlers->num_private_args; i++) { + if (cmd == dev->wireless_handlers->private_args[i].cmd) { + descr = &dev->wireless_handlers->private_args[i]; + break; + } + } + + extra_size = 0; + if (descr) { + if (IW_IS_SET(cmd)) { + int offset = 0; /* For sub-ioctls */ + /* Check for sub-ioctl handler */ + if (descr->name[0] == '\0') + /* Reserve one int for sub-ioctl index */ + offset = sizeof(__u32); + + /* Size of set arguments */ + extra_size = get_priv_size(descr->set_args); + + /* Does it fits in iwr ? */ + if ((descr->set_args & IW_PRIV_SIZE_FIXED) && + ((extra_size + offset) <= IFNAMSIZ)) + extra_size = 0; + } else { + /* Size of get arguments */ + extra_size = get_priv_size(descr->get_args); + + /* Does it fits in iwr ? 
*/ + if ((descr->get_args & IW_PRIV_SIZE_FIXED) && + (extra_size <= IFNAMSIZ)) + extra_size = 0; + } + } + *descrp = descr; + return extra_size; +} + +static int ioctl_private_iw_point(struct iw_point *iwp, unsigned int cmd, + const struct iw_priv_args *descr, + iw_handler handler, struct net_device *dev, + struct iw_request_info *info, int extra_size) +{ + char *extra; + int err; + + /* Check what user space is giving us */ + if (IW_IS_SET(cmd)) { + if (!iwp->pointer && iwp->length != 0) + return -EFAULT; + + if (iwp->length > (descr->set_args & IW_PRIV_SIZE_MASK)) + return -E2BIG; + } else if (!iwp->pointer) + return -EFAULT; + + extra = kzalloc(extra_size, GFP_KERNEL); + if (!extra) + return -ENOMEM; + + /* If it is a SET, get all the extra data in here */ + if (IW_IS_SET(cmd) && (iwp->length != 0)) { + if (copy_from_user(extra, iwp->pointer, extra_size)) { + err = -EFAULT; + goto out; + } + } + + /* Call the handler */ + err = handler(dev, info, (union iwreq_data *) iwp, extra); + + /* If we have something to return to the user */ + if (!err && IW_IS_GET(cmd)) { + /* Adjust for the actual length if it's variable, + * avoid leaking kernel bits outside. + */ + if (!(descr->get_args & IW_PRIV_SIZE_FIXED)) + extra_size = adjust_priv_size(descr->get_args, iwp); + + if (copy_to_user(iwp->pointer, extra, extra_size)) + err = -EFAULT; + } + +out: + kfree(extra); + return err; +} + +int ioctl_private_call(struct net_device *dev, struct iwreq *iwr, + unsigned int cmd, struct iw_request_info *info, + iw_handler handler) +{ + int extra_size = 0, ret = -EINVAL; + const struct iw_priv_args *descr; + + extra_size = get_priv_descr_and_size(dev, cmd, &descr); + + /* Check if we have a pointer to user space data or not. */ + if (extra_size == 0) { + /* No extra arguments. Trivial to handle */ + ret = handler(dev, info, &(iwr->u), (char *) &(iwr->u)); + } else { + ret = ioctl_private_iw_point(&iwr->u.data, cmd, descr, + handler, dev, info, extra_size); + } + + /* Call commit handler if needed and defined */ + if (ret == -EIWCOMMIT) + ret = call_commit_handler(dev); + + return ret; +} + +#ifdef CONFIG_COMPAT +int compat_private_call(struct net_device *dev, struct iwreq *iwr, + unsigned int cmd, struct iw_request_info *info, + iw_handler handler) +{ + const struct iw_priv_args *descr; + int ret, extra_size; + + extra_size = get_priv_descr_and_size(dev, cmd, &descr); + + /* Check if we have a pointer to user space data or not. */ + if (extra_size == 0) { + /* No extra arguments. Trivial to handle */ + ret = handler(dev, info, &(iwr->u), (char *) &(iwr->u)); + } else { + struct compat_iw_point *iwp_compat; + struct iw_point iwp; + + iwp_compat = (struct compat_iw_point *) &iwr->u.data; + iwp.pointer = compat_ptr(iwp_compat->pointer); + iwp.length = iwp_compat->length; + iwp.flags = iwp_compat->flags; + + ret = ioctl_private_iw_point(&iwp, cmd, descr, + handler, dev, info, extra_size); + + iwp_compat->pointer = ptr_to_compat(iwp.pointer); + iwp_compat->length = iwp.length; + iwp_compat->flags = iwp.flags; + } + + /* Call commit handler if needed and defined */ + if (ret == -EIWCOMMIT) + ret = call_commit_handler(dev); + + return ret; +} +#endif diff --git a/net/wireless/wext-proc.c b/net/wireless/wext-proc.c new file mode 100644 index 00000000..8bafa31f --- /dev/null +++ b/net/wireless/wext-proc.c @@ -0,0 +1,155 @@ +/* + * This file implement the Wireless Extensions proc API. + * + * Authors : Jean Tourrilhes - HPL - + * Copyright (c) 1997-2007 Jean Tourrilhes, All Rights Reserved. 
+ * + * (As all part of the Linux kernel, this file is GPL) + */ + +/* + * The /proc/net/wireless file is a human readable user-space interface + * exporting various wireless specific statistics from the wireless devices. + * This is the most popular part of the Wireless Extensions ;-) + * + * This interface is a pure clone of /proc/net/dev (in net/core/dev.c). + * The content of the file is basically the content of "struct iw_statistics". + */ + +#include +#include +#include +#include +#include +#include +#include +#include + + +static void wireless_seq_printf_stats(struct seq_file *seq, + struct net_device *dev) +{ + /* Get stats from the driver */ + struct iw_statistics *stats = get_wireless_stats(dev); + static struct iw_statistics nullstats = {}; + + /* show device if it's wireless regardless of current stats */ + if (!stats) { +#ifdef CONFIG_WIRELESS_EXT + if (dev->wireless_handlers) + stats = &nullstats; +#endif +#ifdef CONFIG_CFG80211 + if (dev->ieee80211_ptr) + stats = &nullstats; +#endif + } + + if (stats) { + seq_printf(seq, "%6s: %04x %3d%c %3d%c %3d%c %6d %6d %6d " + "%6d %6d %6d\n", + dev->name, stats->status, stats->qual.qual, + stats->qual.updated & IW_QUAL_QUAL_UPDATED + ? '.' : ' ', + ((__s32) stats->qual.level) - + ((stats->qual.updated & IW_QUAL_DBM) ? 0x100 : 0), + stats->qual.updated & IW_QUAL_LEVEL_UPDATED + ? '.' : ' ', + ((__s32) stats->qual.noise) - + ((stats->qual.updated & IW_QUAL_DBM) ? 0x100 : 0), + stats->qual.updated & IW_QUAL_NOISE_UPDATED + ? '.' : ' ', + stats->discard.nwid, stats->discard.code, + stats->discard.fragment, stats->discard.retries, + stats->discard.misc, stats->miss.beacon); + + if (stats != &nullstats) + stats->qual.updated &= ~IW_QUAL_ALL_UPDATED; + } +} + +/* ---------------------------------------------------------------- */ +/* + * Print info for /proc/net/wireless (print all entries) + */ +static int wireless_dev_seq_show(struct seq_file *seq, void *v) +{ + might_sleep(); + + if (v == SEQ_START_TOKEN) + seq_printf(seq, "Inter-| sta-| Quality | Discarded " + "packets | Missed | WE\n" + " face | tus | link level noise | nwid " + "crypt frag retry misc | beacon | %d\n", + WIRELESS_EXT); + else + wireless_seq_printf_stats(seq, v); + return 0; +} + +static void *wireless_dev_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct net *net = seq_file_net(seq); + loff_t off; + struct net_device *dev; + + rtnl_lock(); + if (!*pos) + return SEQ_START_TOKEN; + + off = 1; + for_each_netdev(net, dev) + if (off++ == *pos) + return dev; + return NULL; +} + +static void *wireless_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct net *net = seq_file_net(seq); + + ++*pos; + + return v == SEQ_START_TOKEN ? 
+ first_net_device(net) : next_net_device(v); +} + +static void wireless_dev_seq_stop(struct seq_file *seq, void *v) +{ + rtnl_unlock(); +} + +static const struct seq_operations wireless_seq_ops = { + .start = wireless_dev_seq_start, + .next = wireless_dev_seq_next, + .stop = wireless_dev_seq_stop, + .show = wireless_dev_seq_show, +}; + +static int seq_open_wireless(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &wireless_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations wireless_seq_fops = { + .owner = THIS_MODULE, + .open = seq_open_wireless, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +int __net_init wext_proc_init(struct net *net) +{ + /* Create /proc/net/wireless entry */ + if (!proc_net_fops_create(net, "wireless", S_IRUGO, &wireless_seq_fops)) + return -ENOMEM; + + return 0; +} + +void __net_exit wext_proc_exit(struct net *net) +{ + proc_net_remove(net, "wireless"); +} diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c new file mode 100644 index 00000000..6fffe62d --- /dev/null +++ b/net/wireless/wext-sme.c @@ -0,0 +1,405 @@ +/* + * cfg80211 wext compat for managed mode. + * + * Copyright 2009 Johannes Berg + * Copyright (C) 2009 Intel Corporation. All rights reserved. + */ + +#include +#include +#include +#include +#include "wext-compat.h" +#include "nl80211.h" + +int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev) +{ + struct cfg80211_cached_keys *ck = NULL; + const u8 *prev_bssid = NULL; + int err, i; + + ASSERT_RDEV_LOCK(rdev); + ASSERT_WDEV_LOCK(wdev); + + if (!netif_running(wdev->netdev)) + return 0; + + wdev->wext.connect.ie = wdev->wext.ie; + wdev->wext.connect.ie_len = wdev->wext.ie_len; + + if (wdev->wext.keys) { + wdev->wext.keys->def = wdev->wext.default_key; + wdev->wext.keys->defmgmt = wdev->wext.default_mgmt_key; + if (wdev->wext.default_key != -1) + wdev->wext.connect.privacy = true; + } + + if (!wdev->wext.connect.ssid_len) + return 0; + + if (wdev->wext.keys) { + ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); + if (!ck) + return -ENOMEM; + for (i = 0; i < 6; i++) + ck->params[i].key = ck->data[i]; + } + + if (wdev->wext.prev_bssid_valid) + prev_bssid = wdev->wext.prev_bssid; + + err = __cfg80211_connect(rdev, wdev->netdev, + &wdev->wext.connect, ck, prev_bssid); + if (err) + kfree(ck); + + return err; +} + +int cfg80211_mgd_wext_siwfreq(struct net_device *dev, + struct iw_request_info *info, + struct iw_freq *wextfreq, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + struct ieee80211_channel *chan = NULL; + int err, freq; + + /* call only for station! 
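+ * In managed mode SIWFREQ records the requested channel; if an SSID is already configured the connection is torn down and re-established on it, otherwise the channel is simply switched.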
*/ + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) + return -EINVAL; + + freq = cfg80211_wext_freq(wdev->wiphy, wextfreq); + if (freq < 0) + return freq; + + if (freq) { + chan = ieee80211_get_channel(wdev->wiphy, freq); + if (!chan) + return -EINVAL; + if (chan->flags & IEEE80211_CHAN_DISABLED) + return -EINVAL; + } + + cfg80211_lock_rdev(rdev); + mutex_lock(&rdev->devlist_mtx); + wdev_lock(wdev); + + if (wdev->sme_state != CFG80211_SME_IDLE) { + bool event = true; + + if (wdev->wext.connect.channel == chan) { + err = 0; + goto out; + } + + /* if SSID set, we'll try right again, avoid event */ + if (wdev->wext.connect.ssid_len) + event = false; + err = __cfg80211_disconnect(rdev, dev, + WLAN_REASON_DEAUTH_LEAVING, event); + if (err) + goto out; + } + + + wdev->wext.connect.channel = chan; + + /* SSID is not set, we just want to switch channel */ + if (chan && !wdev->wext.connect.ssid_len) { + err = cfg80211_set_freq(rdev, wdev, freq, NL80211_CHAN_NO_HT); + goto out; + } + + err = cfg80211_mgd_wext_connect(rdev, wdev); + out: + wdev_unlock(wdev); + mutex_unlock(&rdev->devlist_mtx); + cfg80211_unlock_rdev(rdev); + return err; +} + +int cfg80211_mgd_wext_giwfreq(struct net_device *dev, + struct iw_request_info *info, + struct iw_freq *freq, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct ieee80211_channel *chan = NULL; + + /* call only for station! */ + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) + return -EINVAL; + + wdev_lock(wdev); + if (wdev->current_bss) + chan = wdev->current_bss->pub.channel; + else if (wdev->wext.connect.channel) + chan = wdev->wext.connect.channel; + wdev_unlock(wdev); + + if (chan) { + freq->m = chan->center_freq; + freq->e = 6; + return 0; + } + + /* no channel if not joining */ + return -EINVAL; +} + +int cfg80211_mgd_wext_siwessid(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *ssid) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + size_t len = data->length; + int err; + + /* call only for station! */ + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) + return -EINVAL; + + if (!data->flags) + len = 0; + + /* iwconfig uses nul termination in SSID.. */ + if (len > 0 && ssid[len - 1] == '\0') + len--; + + cfg80211_lock_rdev(rdev); + mutex_lock(&rdev->devlist_mtx); + wdev_lock(wdev); + + err = 0; + + if (wdev->sme_state != CFG80211_SME_IDLE) { + bool event = true; + + if (wdev->wext.connect.ssid && len && + len == wdev->wext.connect.ssid_len && + memcmp(wdev->wext.connect.ssid, ssid, len) == 0) + goto out; + + /* if SSID set now, we'll try to connect, avoid event */ + if (len) + event = false; + err = __cfg80211_disconnect(rdev, dev, + WLAN_REASON_DEAUTH_LEAVING, event); + if (err) + goto out; + } + + wdev->wext.prev_bssid_valid = false; + wdev->wext.connect.ssid = wdev->wext.ssid; + memcpy(wdev->wext.ssid, ssid, len); + wdev->wext.connect.ssid_len = len; + + wdev->wext.connect.crypto.control_port = false; + wdev->wext.connect.crypto.control_port_ethertype = + cpu_to_be16(ETH_P_PAE); + + err = cfg80211_mgd_wext_connect(rdev, wdev); + out: + wdev_unlock(wdev); + mutex_unlock(&rdev->devlist_mtx); + cfg80211_unlock_rdev(rdev); + return err; +} + +int cfg80211_mgd_wext_giwessid(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *ssid) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + /* call only for station! 
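+ * Report the SSID of the current association if there is one, otherwise the SSID we are trying to connect with (if any).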
*/ + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) + return -EINVAL; + + data->flags = 0; + + wdev_lock(wdev); + if (wdev->current_bss) { + const u8 *ie = ieee80211_bss_get_ie(&wdev->current_bss->pub, + WLAN_EID_SSID); + if (ie) { + data->flags = 1; + data->length = ie[1]; + memcpy(ssid, ie + 2, data->length); + } + } else if (wdev->wext.connect.ssid && wdev->wext.connect.ssid_len) { + data->flags = 1; + data->length = wdev->wext.connect.ssid_len; + memcpy(ssid, wdev->wext.connect.ssid, data->length); + } + wdev_unlock(wdev); + + return 0; +} + +int cfg80211_mgd_wext_siwap(struct net_device *dev, + struct iw_request_info *info, + struct sockaddr *ap_addr, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + u8 *bssid = ap_addr->sa_data; + int err; + + /* call only for station! */ + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) + return -EINVAL; + + if (ap_addr->sa_family != ARPHRD_ETHER) + return -EINVAL; + + /* automatic mode */ + if (is_zero_ether_addr(bssid) || is_broadcast_ether_addr(bssid)) + bssid = NULL; + + cfg80211_lock_rdev(rdev); + mutex_lock(&rdev->devlist_mtx); + wdev_lock(wdev); + + if (wdev->sme_state != CFG80211_SME_IDLE) { + err = 0; + /* both automatic */ + if (!bssid && !wdev->wext.connect.bssid) + goto out; + + /* fixed already - and no change */ + if (wdev->wext.connect.bssid && bssid && + compare_ether_addr(bssid, wdev->wext.connect.bssid) == 0) + goto out; + + err = __cfg80211_disconnect(rdev, dev, + WLAN_REASON_DEAUTH_LEAVING, false); + if (err) + goto out; + } + + if (bssid) { + memcpy(wdev->wext.bssid, bssid, ETH_ALEN); + wdev->wext.connect.bssid = wdev->wext.bssid; + } else + wdev->wext.connect.bssid = NULL; + + err = cfg80211_mgd_wext_connect(rdev, wdev); + out: + wdev_unlock(wdev); + mutex_unlock(&rdev->devlist_mtx); + cfg80211_unlock_rdev(rdev); + return err; +} + +int cfg80211_mgd_wext_giwap(struct net_device *dev, + struct iw_request_info *info, + struct sockaddr *ap_addr, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + /* call only for station! 
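+ * Return the BSSID of the current association, or an all-zero address when not associated.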
*/ + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) + return -EINVAL; + + ap_addr->sa_family = ARPHRD_ETHER; + + wdev_lock(wdev); + if (wdev->current_bss) + memcpy(ap_addr->sa_data, wdev->current_bss->pub.bssid, ETH_ALEN); + else + memset(ap_addr->sa_data, 0, ETH_ALEN); + wdev_unlock(wdev); + + return 0; +} + +int cfg80211_wext_siwgenie(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy); + u8 *ie = extra; + int ie_len = data->length, err; + + if (wdev->iftype != NL80211_IFTYPE_STATION) + return -EOPNOTSUPP; + + if (!ie_len) + ie = NULL; + + wdev_lock(wdev); + + /* no change */ + err = 0; + if (wdev->wext.ie_len == ie_len && + memcmp(wdev->wext.ie, ie, ie_len) == 0) + goto out; + + if (ie_len) { + ie = kmemdup(extra, ie_len, GFP_KERNEL); + if (!ie) { + err = -ENOMEM; + goto out; + } + } else + ie = NULL; + + kfree(wdev->wext.ie); + wdev->wext.ie = ie; + wdev->wext.ie_len = ie_len; + + if (wdev->sme_state != CFG80211_SME_IDLE) { + err = __cfg80211_disconnect(rdev, dev, + WLAN_REASON_DEAUTH_LEAVING, false); + if (err) + goto out; + } + + /* userspace better not think we'll reconnect */ + err = 0; + out: + wdev_unlock(wdev); + return err; +} +EXPORT_SYMBOL_GPL(cfg80211_wext_siwgenie); + +int cfg80211_wext_siwmlme(struct net_device *dev, + struct iw_request_info *info, + struct iw_point *data, char *extra) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct iw_mlme *mlme = (struct iw_mlme *)extra; + struct cfg80211_registered_device *rdev; + int err; + + if (!wdev) + return -EOPNOTSUPP; + + rdev = wiphy_to_dev(wdev->wiphy); + + if (wdev->iftype != NL80211_IFTYPE_STATION) + return -EINVAL; + + if (mlme->addr.sa_family != ARPHRD_ETHER) + return -EINVAL; + + wdev_lock(wdev); + switch (mlme->cmd) { + case IW_MLME_DEAUTH: + case IW_MLME_DISASSOC: + err = __cfg80211_disconnect(rdev, dev, mlme->reason_code, + true); + break; + default: + err = -EOPNOTSUPP; + break; + } + wdev_unlock(wdev); + + return err; +} +EXPORT_SYMBOL_GPL(cfg80211_wext_siwmlme); diff --git a/net/wireless/wext-spy.c b/net/wireless/wext-spy.c new file mode 100644 index 00000000..6dcfe65a --- /dev/null +++ b/net/wireless/wext-spy.c @@ -0,0 +1,231 @@ +/* + * This file implement the Wireless Extensions spy API. + * + * Authors : Jean Tourrilhes - HPL - + * Copyright (c) 1997-2007 Jean Tourrilhes, All Rights Reserved. + * + * (As all part of the Linux kernel, this file is GPL) + */ + +#include +#include +#include +#include +#include +#include + +static inline struct iw_spy_data *get_spydata(struct net_device *dev) +{ + /* This is the new way */ + if (dev->wireless_data) + return dev->wireless_data->spy_data; + return NULL; +} + +int iw_handler_set_spy(struct net_device * dev, + struct iw_request_info * info, + union iwreq_data * wrqu, + char * extra) +{ + struct iw_spy_data * spydata = get_spydata(dev); + struct sockaddr * address = (struct sockaddr *) extra; + + /* Make sure driver is not buggy or using the old API */ + if (!spydata) + return -EOPNOTSUPP; + + /* Disable spy collection while we copy the addresses. + * While we copy addresses, any call to wireless_spy_update() + * will NOP. This is OK, as anyway the addresses are changing. 
*/ + spydata->spy_number = 0; + + /* We want to operate without locking, because wireless_spy_update() + * most likely will happen in the interrupt handler, and therefore + * have its own locking constraints and needs performance. + * The rtnl_lock() make sure we don't race with the other iw_handlers. + * This make sure wireless_spy_update() "see" that the spy list + * is temporarily disabled. */ + smp_wmb(); + + /* Are there are addresses to copy? */ + if (wrqu->data.length > 0) { + int i; + + /* Copy addresses */ + for (i = 0; i < wrqu->data.length; i++) + memcpy(spydata->spy_address[i], address[i].sa_data, + ETH_ALEN); + /* Reset stats */ + memset(spydata->spy_stat, 0, + sizeof(struct iw_quality) * IW_MAX_SPY); + } + + /* Make sure above is updated before re-enabling */ + smp_wmb(); + + /* Enable addresses */ + spydata->spy_number = wrqu->data.length; + + return 0; +} +EXPORT_SYMBOL(iw_handler_set_spy); + +int iw_handler_get_spy(struct net_device * dev, + struct iw_request_info * info, + union iwreq_data * wrqu, + char * extra) +{ + struct iw_spy_data * spydata = get_spydata(dev); + struct sockaddr * address = (struct sockaddr *) extra; + int i; + + /* Make sure driver is not buggy or using the old API */ + if (!spydata) + return -EOPNOTSUPP; + + wrqu->data.length = spydata->spy_number; + + /* Copy addresses. */ + for (i = 0; i < spydata->spy_number; i++) { + memcpy(address[i].sa_data, spydata->spy_address[i], ETH_ALEN); + address[i].sa_family = AF_UNIX; + } + /* Copy stats to the user buffer (just after). */ + if (spydata->spy_number > 0) + memcpy(extra + (sizeof(struct sockaddr) *spydata->spy_number), + spydata->spy_stat, + sizeof(struct iw_quality) * spydata->spy_number); + /* Reset updated flags. */ + for (i = 0; i < spydata->spy_number; i++) + spydata->spy_stat[i].updated &= ~IW_QUAL_ALL_UPDATED; + return 0; +} +EXPORT_SYMBOL(iw_handler_get_spy); + +/*------------------------------------------------------------------*/ +/* + * Standard Wireless Handler : set spy threshold + */ +int iw_handler_set_thrspy(struct net_device * dev, + struct iw_request_info *info, + union iwreq_data * wrqu, + char * extra) +{ + struct iw_spy_data * spydata = get_spydata(dev); + struct iw_thrspy * threshold = (struct iw_thrspy *) extra; + + /* Make sure driver is not buggy or using the old API */ + if (!spydata) + return -EOPNOTSUPP; + + /* Just do it */ + memcpy(&(spydata->spy_thr_low), &(threshold->low), + 2 * sizeof(struct iw_quality)); + + /* Clear flag */ + memset(spydata->spy_thr_under, '\0', sizeof(spydata->spy_thr_under)); + + return 0; +} +EXPORT_SYMBOL(iw_handler_set_thrspy); + +/*------------------------------------------------------------------*/ +/* + * Standard Wireless Handler : get spy threshold + */ +int iw_handler_get_thrspy(struct net_device * dev, + struct iw_request_info *info, + union iwreq_data * wrqu, + char * extra) +{ + struct iw_spy_data * spydata = get_spydata(dev); + struct iw_thrspy * threshold = (struct iw_thrspy *) extra; + + /* Make sure driver is not buggy or using the old API */ + if (!spydata) + return -EOPNOTSUPP; + + /* Just do it */ + memcpy(&(threshold->low), &(spydata->spy_thr_low), + 2 * sizeof(struct iw_quality)); + + return 0; +} +EXPORT_SYMBOL(iw_handler_get_thrspy); + +/*------------------------------------------------------------------*/ +/* + * Prepare and send a Spy Threshold event + */ +static void iw_send_thrspy_event(struct net_device * dev, + struct iw_spy_data * spydata, + unsigned char * address, + struct iw_quality * wstats) +{ + union iwreq_data 
wrqu; + struct iw_thrspy threshold; + + /* Init */ + wrqu.data.length = 1; + wrqu.data.flags = 0; + /* Copy address */ + memcpy(threshold.addr.sa_data, address, ETH_ALEN); + threshold.addr.sa_family = ARPHRD_ETHER; + /* Copy stats */ + memcpy(&(threshold.qual), wstats, sizeof(struct iw_quality)); + /* Copy also thresholds */ + memcpy(&(threshold.low), &(spydata->spy_thr_low), + 2 * sizeof(struct iw_quality)); + + /* Send event to user space */ + wireless_send_event(dev, SIOCGIWTHRSPY, &wrqu, (char *) &threshold); +} + +/* ---------------------------------------------------------------- */ +/* + * Call for the driver to update the spy data. + * For now, the spy data is a simple array. As the size of the array is + * small, this is good enough. If we wanted to support larger number of + * spy addresses, we should use something more efficient... + */ +void wireless_spy_update(struct net_device * dev, + unsigned char * address, + struct iw_quality * wstats) +{ + struct iw_spy_data * spydata = get_spydata(dev); + int i; + int match = -1; + + /* Make sure driver is not buggy or using the old API */ + if (!spydata) + return; + + /* Update all records that match */ + for (i = 0; i < spydata->spy_number; i++) + if (!compare_ether_addr(address, spydata->spy_address[i])) { + memcpy(&(spydata->spy_stat[i]), wstats, + sizeof(struct iw_quality)); + match = i; + } + + /* Generate an event if we cross the spy threshold. + * To avoid event storms, we have a simple hysteresis : we generate + * event only when we go under the low threshold or above the + * high threshold. */ + if (match >= 0) { + if (spydata->spy_thr_under[match]) { + if (wstats->level > spydata->spy_thr_high.level) { + spydata->spy_thr_under[match] = 0; + iw_send_thrspy_event(dev, spydata, + address, wstats); + } + } else { + if (wstats->level < spydata->spy_thr_low.level) { + spydata->spy_thr_under[match] = 1; + iw_send_thrspy_event(dev, spydata, + address, wstats); + } + } + } +} +EXPORT_SYMBOL(wireless_spy_update); diff --git a/net/x25/Kconfig b/net/x25/Kconfig new file mode 100644 index 00000000..e6759c96 --- /dev/null +++ b/net/x25/Kconfig @@ -0,0 +1,36 @@ +# +# CCITT X.25 Packet Layer +# + +config X25 + tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)" + depends on EXPERIMENTAL + ---help--- + X.25 is a set of standardized network protocols, similar in scope to + frame relay; the one physical line from your box to the X.25 network + entry point can carry several logical point-to-point connections + (called "virtual circuits") to other computers connected to the X.25 + network. Governments, banks, and other organizations tend to use it + to connect to each other or to form Wide Area Networks (WANs). Many + countries have public X.25 networks. X.25 consists of two + protocols: the higher level Packet Layer Protocol (PLP) (say Y here + if you want that) and the lower level data link layer protocol LAPB + (say Y to "LAPB Data Link Driver" below if you want that). + + You can read more about X.25 at and + . + Information about X.25 for Linux is contained in the files + and + . + + One connects to an X.25 network either with a dedicated network card + using the X.21 protocol (not yet supported by Linux) or one can do + X.25 over a standard telephone line using an ordinary modem (say Y + to "X.25 async driver" below) or over Ethernet using an ordinary + Ethernet card and the LAPB over Ethernet (say Y to "LAPB Data Link + Driver" and "LAPB over Ethernet driver" below). 
+ + To compile this driver as a module, choose M here: the module + will be called x25. If unsure, say N. + + diff --git a/net/x25/Makefile b/net/x25/Makefile new file mode 100644 index 00000000..a2c34ab6 --- /dev/null +++ b/net/x25/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for the Linux X.25 Packet layer. +# + +obj-$(CONFIG_X25) += x25.o + +x25-y := af_x25.o x25_dev.o x25_facilities.o x25_in.o \ + x25_link.o x25_out.o x25_route.o x25_subr.o \ + x25_timer.o x25_proc.o x25_forward.o +x25-$(CONFIG_SYSCTL) += sysctl_net_x25.o diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c new file mode 100644 index 00000000..373e14f2 --- /dev/null +++ b/net/x25/af_x25.c @@ -0,0 +1,1829 @@ +/* + * X.25 Packet Layer release 002 + * + * This is ALPHA test software. This code may break your machine, + * randomly fail to work with new releases, misbehave and/or generally + * screw up. It might even work. + * + * This code REQUIRES 2.1.15 or higher + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * X.25 001 Jonathan Naylor Started coding. + * X.25 002 Jonathan Naylor Centralised disconnect handling. + * New timer architecture. + * 2000-03-11 Henner Eisen MSG_EOR handling more POSIX compliant. + * 2000-03-22 Daniela Squassoni Allowed disabling/enabling of + * facilities negotiation and increased + * the throughput upper limit. + * 2000-08-27 Arnaldo C. Melo s/suser/capable/ + micro cleanups + * 2000-09-04 Henner Eisen Set sock->state in x25_accept(). + * Fixed x25_output() related skb leakage. + * 2000-10-02 Henner Eisen Made x25_kick() single threaded per socket. + * 2000-10-27 Henner Eisen MSG_DONTWAIT for fragment allocation. + * 2000-11-14 Henner Eisen Closing datalink from NETDEV_GOING_DOWN + * 2002-10-06 Arnaldo C. 
Melo Get rid of cli/sti, move proc stuff to + * x25_proc.c, using seq_file + * 2005-04-02 Shaun Pereira Selective sub address matching + * with call user data + * 2005-04-15 Shaun Pereira Fast select with no restriction on + * response + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For TIOCINQ/OUTQ */ +#include +#include +#include +#include + +#include +#include + +int sysctl_x25_restart_request_timeout = X25_DEFAULT_T20; +int sysctl_x25_call_request_timeout = X25_DEFAULT_T21; +int sysctl_x25_reset_request_timeout = X25_DEFAULT_T22; +int sysctl_x25_clear_request_timeout = X25_DEFAULT_T23; +int sysctl_x25_ack_holdback_timeout = X25_DEFAULT_T2; +int sysctl_x25_forward = 0; + +HLIST_HEAD(x25_list); +DEFINE_RWLOCK(x25_list_lock); + +static const struct proto_ops x25_proto_ops; + +static struct x25_address null_x25_address = {" "}; + +#ifdef CONFIG_COMPAT +struct compat_x25_subscrip_struct { + char device[200-sizeof(compat_ulong_t)]; + compat_ulong_t global_facil_mask; + compat_uint_t extended; +}; +#endif + + +int x25_parse_address_block(struct sk_buff *skb, + struct x25_address *called_addr, + struct x25_address *calling_addr) +{ + unsigned char len; + int needed; + int rc; + + if (skb->len < 1) { + /* packet has no address block */ + rc = 0; + goto empty; + } + + len = *skb->data; + needed = 1 + (len >> 4) + (len & 0x0f); + + if (skb->len < needed) { + /* packet is too short to hold the addresses it claims + to hold */ + rc = -1; + goto empty; + } + + return x25_addr_ntoa(skb->data, called_addr, calling_addr); + +empty: + *called_addr->x25_addr = 0; + *calling_addr->x25_addr = 0; + + return rc; +} + + +int x25_addr_ntoa(unsigned char *p, struct x25_address *called_addr, + struct x25_address *calling_addr) +{ + unsigned int called_len, calling_len; + char *called, *calling; + unsigned int i; + + called_len = (*p >> 0) & 0x0F; + calling_len = (*p >> 4) & 0x0F; + + called = called_addr->x25_addr; + calling = calling_addr->x25_addr; + p++; + + for (i = 0; i < (called_len + calling_len); i++) { + if (i < called_len) { + if (i % 2 != 0) { + *called++ = ((*p >> 0) & 0x0F) + '0'; + p++; + } else { + *called++ = ((*p >> 4) & 0x0F) + '0'; + } + } else { + if (i % 2 != 0) { + *calling++ = ((*p >> 0) & 0x0F) + '0'; + p++; + } else { + *calling++ = ((*p >> 4) & 0x0F) + '0'; + } + } + } + + *called = *calling = '\0'; + + return 1 + (called_len + calling_len + 1) / 2; +} + +int x25_addr_aton(unsigned char *p, struct x25_address *called_addr, + struct x25_address *calling_addr) +{ + unsigned int called_len, calling_len; + char *called, *calling; + int i; + + called = called_addr->x25_addr; + calling = calling_addr->x25_addr; + + called_len = strlen(called); + calling_len = strlen(calling); + + *p++ = (calling_len << 4) | (called_len << 0); + + for (i = 0; i < (called_len + calling_len); i++) { + if (i < called_len) { + if (i % 2 != 0) { + *p |= (*called++ - '0') << 0; + p++; + } else { + *p = 0x00; + *p |= (*called++ - '0') << 4; + } + } else { + if (i % 2 != 0) { + *p |= (*calling++ - '0') << 0; + p++; + } else { + *p = 0x00; + *p |= (*calling++ - '0') << 4; + } + } + } + + return 1 + (called_len + calling_len + 1) / 2; +} + +/* + * Socket removal during an interrupt is now safe. 
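+ * The bound-socket list is protected by x25_list_lock taken with bottom halves disabled.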
+ */ +static void x25_remove_socket(struct sock *sk) +{ + write_lock_bh(&x25_list_lock); + sk_del_node_init(sk); + write_unlock_bh(&x25_list_lock); +} + +/* + * Kill all bound sockets on a dropped device. + */ +static void x25_kill_by_device(struct net_device *dev) +{ + struct sock *s; + struct hlist_node *node; + + write_lock_bh(&x25_list_lock); + + sk_for_each(s, node, &x25_list) + if (x25_sk(s)->neighbour && x25_sk(s)->neighbour->dev == dev) + x25_disconnect(s, ENETUNREACH, 0, 0); + + write_unlock_bh(&x25_list_lock); +} + +/* + * Handle device status changes. + */ +static int x25_device_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + struct x25_neigh *nb; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + if (dev->type == ARPHRD_X25 +#if defined(CONFIG_LLC) || defined(CONFIG_LLC_MODULE) + || dev->type == ARPHRD_ETHER +#endif + ) { + switch (event) { + case NETDEV_UP: + x25_link_device_up(dev); + break; + case NETDEV_GOING_DOWN: + nb = x25_get_neigh(dev); + if (nb) { + x25_terminate_link(nb); + x25_neigh_put(nb); + } + break; + case NETDEV_DOWN: + x25_kill_by_device(dev); + x25_route_device_down(dev); + x25_link_device_down(dev); + break; + } + } + + return NOTIFY_DONE; +} + +/* + * Add a socket to the bound sockets list. + */ +static void x25_insert_socket(struct sock *sk) +{ + write_lock_bh(&x25_list_lock); + sk_add_node(sk, &x25_list); + write_unlock_bh(&x25_list_lock); +} + +/* + * Find a socket that wants to accept the Call Request we just + * received. Check the full list for an address/cud match. + * If no cuds match return the next_best thing, an address match. + * Note: if a listening socket has cud set it must only get calls + * with matching cud. + */ +static struct sock *x25_find_listener(struct x25_address *addr, + struct sk_buff *skb) +{ + struct sock *s; + struct sock *next_best; + struct hlist_node *node; + + read_lock_bh(&x25_list_lock); + next_best = NULL; + + sk_for_each(s, node, &x25_list) + if ((!strcmp(addr->x25_addr, + x25_sk(s)->source_addr.x25_addr) || + !strcmp(addr->x25_addr, + null_x25_address.x25_addr)) && + s->sk_state == TCP_LISTEN) { + /* + * Found a listening socket, now check the incoming + * call user data vs this sockets call user data + */ + if (x25_sk(s)->cudmatchlength > 0 && + skb->len >= x25_sk(s)->cudmatchlength) { + if((memcmp(x25_sk(s)->calluserdata.cuddata, + skb->data, + x25_sk(s)->cudmatchlength)) == 0) { + sock_hold(s); + goto found; + } + } else + next_best = s; + } + if (next_best) { + s = next_best; + sock_hold(s); + goto found; + } + s = NULL; +found: + read_unlock_bh(&x25_list_lock); + return s; +} + +/* + * Find a connected X.25 socket given my LCI and neighbour. + */ +static struct sock *__x25_find_socket(unsigned int lci, struct x25_neigh *nb) +{ + struct sock *s; + struct hlist_node *node; + + sk_for_each(s, node, &x25_list) + if (x25_sk(s)->lci == lci && x25_sk(s)->neighbour == nb) { + sock_hold(s); + goto found; + } + s = NULL; +found: + return s; +} + +struct sock *x25_find_socket(unsigned int lci, struct x25_neigh *nb) +{ + struct sock *s; + + read_lock_bh(&x25_list_lock); + s = __x25_find_socket(lci, nb); + read_unlock_bh(&x25_list_lock); + return s; +} + +/* + * Find a unique LCI for a given device. 
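+ * Logical channel identifiers 1..4095 are probed in order; 0 is returned when the neighbour has no free LCI.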
+ */ +static unsigned int x25_new_lci(struct x25_neigh *nb) +{ + unsigned int lci = 1; + struct sock *sk; + + read_lock_bh(&x25_list_lock); + + while ((sk = __x25_find_socket(lci, nb)) != NULL) { + sock_put(sk); + if (++lci == 4096) { + lci = 0; + break; + } + } + + read_unlock_bh(&x25_list_lock); + return lci; +} + +/* + * Deferred destroy. + */ +static void __x25_destroy_socket(struct sock *); + +/* + * handler for deferred kills. + */ +static void x25_destroy_timer(unsigned long data) +{ + x25_destroy_socket_from_timer((struct sock *)data); +} + +/* + * This is called from user mode and the timers. Thus it protects itself + * against interrupt users but doesn't worry about being called during + * work. Once it is removed from the queue no interrupt or bottom half + * will touch it and we are (fairly 8-) ) safe. + * Not static as it's used by the timer + */ +static void __x25_destroy_socket(struct sock *sk) +{ + struct sk_buff *skb; + + x25_stop_heartbeat(sk); + x25_stop_timer(sk); + + x25_remove_socket(sk); + x25_clear_queues(sk); /* Flush the queues */ + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + if (skb->sk != sk) { /* A pending connection */ + /* + * Queue the unaccepted socket for death + */ + skb->sk->sk_state = TCP_LISTEN; + sock_set_flag(skb->sk, SOCK_DEAD); + x25_start_heartbeat(skb->sk); + x25_sk(skb->sk)->state = X25_STATE_0; + } + + kfree_skb(skb); + } + + if (sk_has_allocations(sk)) { + /* Defer: outstanding buffers */ + sk->sk_timer.expires = jiffies + 10 * HZ; + sk->sk_timer.function = x25_destroy_timer; + sk->sk_timer.data = (unsigned long)sk; + add_timer(&sk->sk_timer); + } else { + /* drop last reference so sock_put will free */ + __sock_put(sk); + } +} + +void x25_destroy_socket_from_timer(struct sock *sk) +{ + sock_hold(sk); + bh_lock_sock(sk); + __x25_destroy_socket(sk); + bh_unlock_sock(sk); + sock_put(sk); +} + +/* + * Handling for system calls applied via the various interfaces to a + * X.25 socket object. + */ + +static int x25_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + int opt; + struct sock *sk = sock->sk; + int rc = -ENOPROTOOPT; + + if (level != SOL_X25 || optname != X25_QBITINCL) + goto out; + + rc = -EINVAL; + if (optlen < sizeof(int)) + goto out; + + rc = -EFAULT; + if (get_user(opt, (int __user *)optval)) + goto out; + + if (opt) + set_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); + else + clear_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); + rc = 0; +out: + return rc; +} + +static int x25_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + int val, len, rc = -ENOPROTOOPT; + + if (level != SOL_X25 || optname != X25_QBITINCL) + goto out; + + rc = -EFAULT; + if (get_user(len, optlen)) + goto out; + + len = min_t(unsigned int, len, sizeof(int)); + + rc = -EINVAL; + if (len < 0) + goto out; + + rc = -EFAULT; + if (put_user(len, optlen)) + goto out; + + val = test_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); + rc = copy_to_user(optval, &val, len) ? 
-EFAULT : 0; +out: + return rc; +} + +static int x25_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + int rc = -EOPNOTSUPP; + + lock_sock(sk); + if (sk->sk_state != TCP_LISTEN) { + memset(&x25_sk(sk)->dest_addr, 0, X25_ADDR_LEN); + sk->sk_max_ack_backlog = backlog; + sk->sk_state = TCP_LISTEN; + rc = 0; + } + release_sock(sk); + + return rc; +} + +static struct proto x25_proto = { + .name = "X25", + .owner = THIS_MODULE, + .obj_size = sizeof(struct x25_sock), +}; + +static struct sock *x25_alloc_socket(struct net *net) +{ + struct x25_sock *x25; + struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto); + + if (!sk) + goto out; + + sock_init_data(NULL, sk); + + x25 = x25_sk(sk); + skb_queue_head_init(&x25->ack_queue); + skb_queue_head_init(&x25->fragment_queue); + skb_queue_head_init(&x25->interrupt_in_queue); + skb_queue_head_init(&x25->interrupt_out_queue); +out: + return sk; +} + +static int x25_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + struct x25_sock *x25; + int rc = -EAFNOSUPPORT; + + if (!net_eq(net, &init_net)) + goto out; + + rc = -ESOCKTNOSUPPORT; + if (sock->type != SOCK_SEQPACKET) + goto out; + + rc = -EINVAL; + if (protocol) + goto out; + + rc = -ENOBUFS; + if ((sk = x25_alloc_socket(net)) == NULL) + goto out; + + x25 = x25_sk(sk); + + sock_init_data(sock, sk); + + x25_init_timers(sk); + + sock->ops = &x25_proto_ops; + sk->sk_protocol = protocol; + sk->sk_backlog_rcv = x25_backlog_rcv; + + x25->t21 = sysctl_x25_call_request_timeout; + x25->t22 = sysctl_x25_reset_request_timeout; + x25->t23 = sysctl_x25_clear_request_timeout; + x25->t2 = sysctl_x25_ack_holdback_timeout; + x25->state = X25_STATE_0; + x25->cudmatchlength = 0; + set_bit(X25_ACCPT_APPRV_FLAG, &x25->flags); /* normally no cud */ + /* on call accept */ + + x25->facilities.winsize_in = X25_DEFAULT_WINDOW_SIZE; + x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE; + x25->facilities.pacsize_in = X25_DEFAULT_PACKET_SIZE; + x25->facilities.pacsize_out = X25_DEFAULT_PACKET_SIZE; + x25->facilities.throughput = 0; /* by default don't negotiate + throughput */ + x25->facilities.reverse = X25_DEFAULT_REVERSE; + x25->dte_facilities.calling_len = 0; + x25->dte_facilities.called_len = 0; + memset(x25->dte_facilities.called_ae, '\0', + sizeof(x25->dte_facilities.called_ae)); + memset(x25->dte_facilities.calling_ae, '\0', + sizeof(x25->dte_facilities.calling_ae)); + + rc = 0; +out: + return rc; +} + +static struct sock *x25_make_new(struct sock *osk) +{ + struct sock *sk = NULL; + struct x25_sock *x25, *ox25; + + if (osk->sk_type != SOCK_SEQPACKET) + goto out; + + if ((sk = x25_alloc_socket(sock_net(osk))) == NULL) + goto out; + + x25 = x25_sk(sk); + + sk->sk_type = osk->sk_type; + sk->sk_priority = osk->sk_priority; + sk->sk_protocol = osk->sk_protocol; + sk->sk_rcvbuf = osk->sk_rcvbuf; + sk->sk_sndbuf = osk->sk_sndbuf; + sk->sk_state = TCP_ESTABLISHED; + sk->sk_backlog_rcv = osk->sk_backlog_rcv; + sock_copy_flags(sk, osk); + + ox25 = x25_sk(osk); + x25->t21 = ox25->t21; + x25->t22 = ox25->t22; + x25->t23 = ox25->t23; + x25->t2 = ox25->t2; + x25->flags = ox25->flags; + x25->facilities = ox25->facilities; + x25->dte_facilities = ox25->dte_facilities; + x25->cudmatchlength = ox25->cudmatchlength; + + clear_bit(X25_INTERRUPT_FLAG, &x25->flags); + x25_init_timers(sk); +out: + return sk; +} + +static int x25_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct x25_sock *x25; + + if (!sk) + return 0; + + x25 = x25_sk(sk); + + 
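+ /* Take an extra reference for the duration of the teardown; sock_put() below drops it. */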
sock_hold(sk); + lock_sock(sk); + switch (x25->state) { + + case X25_STATE_0: + case X25_STATE_2: + x25_disconnect(sk, 0, 0, 0); + __x25_destroy_socket(sk); + goto out; + + case X25_STATE_1: + case X25_STATE_3: + case X25_STATE_4: + x25_clear_queues(sk); + x25_write_internal(sk, X25_CLEAR_REQUEST); + x25_start_t23timer(sk); + x25->state = X25_STATE_2; + sk->sk_state = TCP_CLOSE; + sk->sk_shutdown |= SEND_SHUTDOWN; + sk->sk_state_change(sk); + sock_set_flag(sk, SOCK_DEAD); + sock_set_flag(sk, SOCK_DESTROY); + break; + } + + sock_orphan(sk); +out: + release_sock(sk); + sock_put(sk); + return 0; +} + +static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; + int len, i, rc = 0; + + if (!sock_flag(sk, SOCK_ZAPPED) || + addr_len != sizeof(struct sockaddr_x25) || + addr->sx25_family != AF_X25) { + rc = -EINVAL; + goto out; + } + + len = strlen(addr->sx25_addr.x25_addr); + for (i = 0; i < len; i++) { + if (!isdigit(addr->sx25_addr.x25_addr[i])) { + rc = -EINVAL; + goto out; + } + } + + lock_sock(sk); + x25_sk(sk)->source_addr = addr->sx25_addr; + x25_insert_socket(sk); + sock_reset_flag(sk, SOCK_ZAPPED); + release_sock(sk); + SOCK_DEBUG(sk, "x25_bind: socket is bound\n"); +out: + return rc; +} + +static int x25_wait_for_connection_establishment(struct sock *sk) +{ + DECLARE_WAITQUEUE(wait, current); + int rc; + + add_wait_queue_exclusive(sk_sleep(sk), &wait); + for (;;) { + __set_current_state(TASK_INTERRUPTIBLE); + rc = -ERESTARTSYS; + if (signal_pending(current)) + break; + rc = sock_error(sk); + if (rc) { + sk->sk_socket->state = SS_UNCONNECTED; + break; + } + rc = 0; + if (sk->sk_state != TCP_ESTABLISHED) { + release_sock(sk); + schedule(); + lock_sock(sk); + } else + break; + } + __set_current_state(TASK_RUNNING); + remove_wait_queue(sk_sleep(sk), &wait); + return rc; +} + +static int x25_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + struct x25_sock *x25 = x25_sk(sk); + struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; + struct x25_route *rt; + int rc = 0; + + lock_sock(sk); + if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { + sock->state = SS_CONNECTED; + goto out; /* Connect completed during a ERESTARTSYS event */ + } + + rc = -ECONNREFUSED; + if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) { + sock->state = SS_UNCONNECTED; + goto out; + } + + rc = -EISCONN; /* No reconnect on a seqpacket socket */ + if (sk->sk_state == TCP_ESTABLISHED) + goto out; + + sk->sk_state = TCP_CLOSE; + sock->state = SS_UNCONNECTED; + + rc = -EINVAL; + if (addr_len != sizeof(struct sockaddr_x25) || + addr->sx25_family != AF_X25) + goto out; + + rc = -ENETUNREACH; + rt = x25_get_route(&addr->sx25_addr); + if (!rt) + goto out; + + x25->neighbour = x25_get_neigh(rt->dev); + if (!x25->neighbour) + goto out_put_route; + + x25_limit_facilities(&x25->facilities, x25->neighbour); + + x25->lci = x25_new_lci(x25->neighbour); + if (!x25->lci) + goto out_put_neigh; + + rc = -EINVAL; + if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */ + goto out_put_neigh; + + if (!strcmp(x25->source_addr.x25_addr, null_x25_address.x25_addr)) + memset(&x25->source_addr, '\0', X25_ADDR_LEN); + + x25->dest_addr = addr->sx25_addr; + + /* Move to connecting socket, start sending Connect Requests */ + sock->state = SS_CONNECTING; + sk->sk_state = TCP_SYN_SENT; + + x25->state = X25_STATE_1; 
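+ /* Transmit the Call Request, start T21 and, unless O_NONBLOCK is set, wait for the call to be accepted or cleared. */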
+ + x25_write_internal(sk, X25_CALL_REQUEST); + + x25_start_heartbeat(sk); + x25_start_t21timer(sk); + + /* Now the loop */ + rc = -EINPROGRESS; + if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) + goto out_put_neigh; + + rc = x25_wait_for_connection_establishment(sk); + if (rc) + goto out_put_neigh; + + sock->state = SS_CONNECTED; + rc = 0; +out_put_neigh: + if (rc) + x25_neigh_put(x25->neighbour); +out_put_route: + x25_route_put(rt); +out: + release_sock(sk); + return rc; +} + +static int x25_wait_for_data(struct sock *sk, long timeout) +{ + DECLARE_WAITQUEUE(wait, current); + int rc = 0; + + add_wait_queue_exclusive(sk_sleep(sk), &wait); + for (;;) { + __set_current_state(TASK_INTERRUPTIBLE); + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + rc = -ERESTARTSYS; + if (signal_pending(current)) + break; + rc = -EAGAIN; + if (!timeout) + break; + rc = 0; + if (skb_queue_empty(&sk->sk_receive_queue)) { + release_sock(sk); + timeout = schedule_timeout(timeout); + lock_sock(sk); + } else + break; + } + __set_current_state(TASK_RUNNING); + remove_wait_queue(sk_sleep(sk), &wait); + return rc; +} + +static int x25_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct sock *sk = sock->sk; + struct sock *newsk; + struct sk_buff *skb; + int rc = -EINVAL; + + if (!sk) + goto out; + + rc = -EOPNOTSUPP; + if (sk->sk_type != SOCK_SEQPACKET) + goto out; + + lock_sock(sk); + rc = -EINVAL; + if (sk->sk_state != TCP_LISTEN) + goto out2; + + rc = x25_wait_for_data(sk, sk->sk_rcvtimeo); + if (rc) + goto out2; + skb = skb_dequeue(&sk->sk_receive_queue); + rc = -EINVAL; + if (!skb->sk) + goto out2; + newsk = skb->sk; + sock_graft(newsk, newsock); + + /* Now attach up the new socket */ + skb->sk = NULL; + kfree_skb(skb); + sk->sk_ack_backlog--; + newsock->state = SS_CONNECTED; + rc = 0; +out2: + release_sock(sk); +out: + return rc; +} + +static int x25_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sockaddr_x25 *sx25 = (struct sockaddr_x25 *)uaddr; + struct sock *sk = sock->sk; + struct x25_sock *x25 = x25_sk(sk); + int rc = 0; + + if (peer) { + if (sk->sk_state != TCP_ESTABLISHED) { + rc = -ENOTCONN; + goto out; + } + sx25->sx25_addr = x25->dest_addr; + } else + sx25->sx25_addr = x25->source_addr; + + sx25->sx25_family = AF_X25; + *uaddr_len = sizeof(*sx25); + +out: + return rc; +} + +int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, + unsigned int lci) +{ + struct sock *sk; + struct sock *make; + struct x25_sock *makex25; + struct x25_address source_addr, dest_addr; + struct x25_facilities facilities; + struct x25_dte_facilities dte_facilities; + int len, addr_len, rc; + + /* + * Remove the LCI and frame type. + */ + skb_pull(skb, X25_STD_MIN_LEN); + + /* + * Extract the X.25 addresses and convert them to ASCII strings, + * and remove them. + * + * Address block is mandatory in call request packets + */ + addr_len = x25_parse_address_block(skb, &source_addr, &dest_addr); + if (addr_len <= 0) + goto out_clear_request; + skb_pull(skb, addr_len); + + /* + * Get the length of the facilities, skip past them for the moment + * get the call user data because this is needed to determine + * the correct listener + * + * Facilities length is mandatory in call request packets + */ + if (skb->len < 1) + goto out_clear_request; + len = skb->data[0] + 1; + if (skb->len < len) + goto out_clear_request; + skb_pull(skb,len); + + /* + * Find a listener for the particular address/cud pair. 
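+ * The facilities block was skipped above so that the call user data lines up for matching; it is pushed back right after the lookup.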
+ */ + sk = x25_find_listener(&source_addr,skb); + skb_push(skb,len); + + if (sk != NULL && sk_acceptq_is_full(sk)) { + goto out_sock_put; + } + + /* + * We dont have any listeners for this incoming call. + * Try forwarding it. + */ + if (sk == NULL) { + skb_push(skb, addr_len + X25_STD_MIN_LEN); + if (sysctl_x25_forward && + x25_forward_call(&dest_addr, nb, skb, lci) > 0) + { + /* Call was forwarded, dont process it any more */ + kfree_skb(skb); + rc = 1; + goto out; + } else { + /* No listeners, can't forward, clear the call */ + goto out_clear_request; + } + } + + /* + * Try to reach a compromise on the requested facilities. + */ + len = x25_negotiate_facilities(skb, sk, &facilities, &dte_facilities); + if (len == -1) + goto out_sock_put; + + /* + * current neighbour/link might impose additional limits + * on certain facilties + */ + + x25_limit_facilities(&facilities, nb); + + /* + * Try to create a new socket. + */ + make = x25_make_new(sk); + if (!make) + goto out_sock_put; + + /* + * Remove the facilities + */ + skb_pull(skb, len); + + skb->sk = make; + make->sk_state = TCP_ESTABLISHED; + + makex25 = x25_sk(make); + makex25->lci = lci; + makex25->dest_addr = dest_addr; + makex25->source_addr = source_addr; + makex25->neighbour = nb; + makex25->facilities = facilities; + makex25->dte_facilities= dte_facilities; + makex25->vc_facil_mask = x25_sk(sk)->vc_facil_mask; + /* ensure no reverse facil on accept */ + makex25->vc_facil_mask &= ~X25_MASK_REVERSE; + /* ensure no calling address extension on accept */ + makex25->vc_facil_mask &= ~X25_MASK_CALLING_AE; + makex25->cudmatchlength = x25_sk(sk)->cudmatchlength; + + /* Normally all calls are accepted immediately */ + if (test_bit(X25_ACCPT_APPRV_FLAG, &makex25->flags)) { + x25_write_internal(make, X25_CALL_ACCEPTED); + makex25->state = X25_STATE_3; + } + + /* + * Incoming Call User Data. 
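+ * Whatever remains of the packet is the call user data; copy it into the new socket so the accepting application can read it.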
+ */ + skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len); + makex25->calluserdata.cudlength = skb->len; + + sk->sk_ack_backlog++; + + x25_insert_socket(make); + + skb_queue_head(&sk->sk_receive_queue, skb); + + x25_start_heartbeat(make); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, skb->len); + rc = 1; + sock_put(sk); +out: + return rc; +out_sock_put: + sock_put(sk); +out_clear_request: + rc = 0; + x25_transmit_clear_request(nb, lci, 0x01); + goto out; +} + +static int x25_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct x25_sock *x25 = x25_sk(sk); + struct sockaddr_x25 *usx25 = (struct sockaddr_x25 *)msg->msg_name; + struct sockaddr_x25 sx25; + struct sk_buff *skb; + unsigned char *asmptr; + int noblock = msg->msg_flags & MSG_DONTWAIT; + size_t size; + int qbit = 0, rc = -EINVAL; + + lock_sock(sk); + if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_OOB|MSG_EOR|MSG_CMSG_COMPAT)) + goto out; + + /* we currently don't support segmented records at the user interface */ + if (!(msg->msg_flags & (MSG_EOR|MSG_OOB))) + goto out; + + rc = -EADDRNOTAVAIL; + if (sock_flag(sk, SOCK_ZAPPED)) + goto out; + + rc = -EPIPE; + if (sk->sk_shutdown & SEND_SHUTDOWN) { + send_sig(SIGPIPE, current, 0); + goto out; + } + + rc = -ENETUNREACH; + if (!x25->neighbour) + goto out; + + if (usx25) { + rc = -EINVAL; + if (msg->msg_namelen < sizeof(sx25)) + goto out; + memcpy(&sx25, usx25, sizeof(sx25)); + rc = -EISCONN; + if (strcmp(x25->dest_addr.x25_addr, sx25.sx25_addr.x25_addr)) + goto out; + rc = -EINVAL; + if (sx25.sx25_family != AF_X25) + goto out; + } else { + /* + * FIXME 1003.1g - if the socket is like this because + * it has become closed (not started closed) we ought + * to SIGPIPE, EPIPE; + */ + rc = -ENOTCONN; + if (sk->sk_state != TCP_ESTABLISHED) + goto out; + + sx25.sx25_family = AF_X25; + sx25.sx25_addr = x25->dest_addr; + } + + /* Sanity check the packet size */ + if (len > 65535) { + rc = -EMSGSIZE; + goto out; + } + + SOCK_DEBUG(sk, "x25_sendmsg: sendto: Addresses built.\n"); + + /* Build a packet */ + SOCK_DEBUG(sk, "x25_sendmsg: sendto: building packet.\n"); + + if ((msg->msg_flags & MSG_OOB) && len > 32) + len = 32; + + size = len + X25_MAX_L2_LEN + X25_EXT_MIN_LEN; + + release_sock(sk); + skb = sock_alloc_send_skb(sk, size, noblock, &rc); + lock_sock(sk); + if (!skb) + goto out; + X25_SKB_CB(skb)->flags = msg->msg_flags; + + skb_reserve(skb, X25_MAX_L2_LEN + X25_EXT_MIN_LEN); + + /* + * Put the data on the end + */ + SOCK_DEBUG(sk, "x25_sendmsg: Copying user data\n"); + + skb_reset_transport_header(skb); + skb_put(skb, len); + + rc = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len); + if (rc) + goto out_kfree_skb; + + /* + * If the Q BIT Include socket option is in force, the first + * byte of the user data is the logical value of the Q Bit. 
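+ * That byte is stripped here and later OR-ed into the first octet of the X.25 header as X25_Q_BIT.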
+ */ + if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { + qbit = skb->data[0]; + skb_pull(skb, 1); + } + + /* + * Push down the X.25 header + */ + SOCK_DEBUG(sk, "x25_sendmsg: Building X.25 Header.\n"); + + if (msg->msg_flags & MSG_OOB) { + if (x25->neighbour->extended) { + asmptr = skb_push(skb, X25_STD_MIN_LEN); + *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ; + *asmptr++ = (x25->lci >> 0) & 0xFF; + *asmptr++ = X25_INTERRUPT; + } else { + asmptr = skb_push(skb, X25_STD_MIN_LEN); + *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ; + *asmptr++ = (x25->lci >> 0) & 0xFF; + *asmptr++ = X25_INTERRUPT; + } + } else { + if (x25->neighbour->extended) { + /* Build an Extended X.25 header */ + asmptr = skb_push(skb, X25_EXT_MIN_LEN); + *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ; + *asmptr++ = (x25->lci >> 0) & 0xFF; + *asmptr++ = X25_DATA; + *asmptr++ = X25_DATA; + } else { + /* Build an Standard X.25 header */ + asmptr = skb_push(skb, X25_STD_MIN_LEN); + *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ; + *asmptr++ = (x25->lci >> 0) & 0xFF; + *asmptr++ = X25_DATA; + } + + if (qbit) + skb->data[0] |= X25_Q_BIT; + } + + SOCK_DEBUG(sk, "x25_sendmsg: Built header.\n"); + SOCK_DEBUG(sk, "x25_sendmsg: Transmitting buffer\n"); + + rc = -ENOTCONN; + if (sk->sk_state != TCP_ESTABLISHED) + goto out_kfree_skb; + + if (msg->msg_flags & MSG_OOB) + skb_queue_tail(&x25->interrupt_out_queue, skb); + else { + rc = x25_output(sk, skb); + len = rc; + if (rc < 0) + kfree_skb(skb); + else if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) + len++; + } + + x25_kick(sk); + rc = len; +out: + release_sock(sk); + return rc; +out_kfree_skb: + kfree_skb(skb); + goto out; +} + + +static int x25_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, + int flags) +{ + struct sock *sk = sock->sk; + struct x25_sock *x25 = x25_sk(sk); + struct sockaddr_x25 *sx25 = (struct sockaddr_x25 *)msg->msg_name; + size_t copied; + int qbit; + struct sk_buff *skb; + unsigned char *asmptr; + int rc = -ENOTCONN; + + lock_sock(sk); + /* + * This works for seqpacket too. The receiver has ordered the queue for + * us! We do one quick check first though + */ + if (sk->sk_state != TCP_ESTABLISHED) + goto out; + + if (flags & MSG_OOB) { + rc = -EINVAL; + if (sock_flag(sk, SOCK_URGINLINE) || + !skb_peek(&x25->interrupt_in_queue)) + goto out; + + skb = skb_dequeue(&x25->interrupt_in_queue); + + skb_pull(skb, X25_STD_MIN_LEN); + + /* + * No Q bit information on Interrupt data. + */ + if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { + asmptr = skb_push(skb, 1); + *asmptr = 0x00; + } + + msg->msg_flags |= MSG_OOB; + } else { + /* Now we can treat all alike */ + release_sock(sk); + skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, + flags & MSG_DONTWAIT, &rc); + lock_sock(sk); + if (!skb) + goto out; + + qbit = (skb->data[0] & X25_Q_BIT) == X25_Q_BIT; + + skb_pull(skb, x25->neighbour->extended ? 
+ X25_EXT_MIN_LEN : X25_STD_MIN_LEN); + + if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { + asmptr = skb_push(skb, 1); + *asmptr = qbit; + } + } + + skb_reset_transport_header(skb); + copied = skb->len; + + if (copied > size) { + copied = size; + msg->msg_flags |= MSG_TRUNC; + } + + /* Currently, each datagram always contains a complete record */ + msg->msg_flags |= MSG_EOR; + + rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (rc) + goto out_free_dgram; + + if (sx25) { + sx25->sx25_family = AF_X25; + sx25->sx25_addr = x25->dest_addr; + } + + msg->msg_namelen = sizeof(struct sockaddr_x25); + + x25_check_rbuf(sk); + rc = copied; +out_free_dgram: + skb_free_datagram(sk, skb); +out: + release_sock(sk); + return rc; +} + + +static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + struct x25_sock *x25 = x25_sk(sk); + void __user *argp = (void __user *)arg; + int rc; + + switch (cmd) { + case TIOCOUTQ: { + int amount; + + amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); + if (amount < 0) + amount = 0; + rc = put_user(amount, (unsigned int __user *)argp); + break; + } + + case TIOCINQ: { + struct sk_buff *skb; + int amount = 0; + /* + * These two are safe on a single CPU system as + * only user tasks fiddle here + */ + lock_sock(sk); + if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) + amount = skb->len; + release_sock(sk); + rc = put_user(amount, (unsigned int __user *)argp); + break; + } + + case SIOCGSTAMP: + rc = -EINVAL; + if (sk) + rc = sock_get_timestamp(sk, + (struct timeval __user *)argp); + break; + case SIOCGSTAMPNS: + rc = -EINVAL; + if (sk) + rc = sock_get_timestampns(sk, + (struct timespec __user *)argp); + break; + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFMETRIC: + case SIOCSIFMETRIC: + rc = -EINVAL; + break; + case SIOCADDRT: + case SIOCDELRT: + rc = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + rc = x25_route_ioctl(cmd, argp); + break; + case SIOCX25GSUBSCRIP: + rc = x25_subscr_ioctl(cmd, argp); + break; + case SIOCX25SSUBSCRIP: + rc = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + rc = x25_subscr_ioctl(cmd, argp); + break; + case SIOCX25GFACILITIES: { + lock_sock(sk); + rc = copy_to_user(argp, &x25->facilities, + sizeof(x25->facilities)) + ? 
-EFAULT : 0; + release_sock(sk); + break; + } + + case SIOCX25SFACILITIES: { + struct x25_facilities facilities; + rc = -EFAULT; + if (copy_from_user(&facilities, argp, + sizeof(facilities))) + break; + rc = -EINVAL; + lock_sock(sk); + if (sk->sk_state != TCP_LISTEN && + sk->sk_state != TCP_CLOSE) + goto out_fac_release; + if (facilities.pacsize_in < X25_PS16 || + facilities.pacsize_in > X25_PS4096) + goto out_fac_release; + if (facilities.pacsize_out < X25_PS16 || + facilities.pacsize_out > X25_PS4096) + goto out_fac_release; + if (facilities.winsize_in < 1 || + facilities.winsize_in > 127) + goto out_fac_release; + if (facilities.throughput) { + int out = facilities.throughput & 0xf0; + int in = facilities.throughput & 0x0f; + if (!out) + facilities.throughput |= + X25_DEFAULT_THROUGHPUT << 4; + else if (out < 0x30 || out > 0xD0) + goto out_fac_release; + if (!in) + facilities.throughput |= + X25_DEFAULT_THROUGHPUT; + else if (in < 0x03 || in > 0x0D) + goto out_fac_release; + } + if (facilities.reverse && + (facilities.reverse & 0x81) != 0x81) + goto out_fac_release; + x25->facilities = facilities; + rc = 0; +out_fac_release: + release_sock(sk); + break; + } + + case SIOCX25GDTEFACILITIES: { + lock_sock(sk); + rc = copy_to_user(argp, &x25->dte_facilities, + sizeof(x25->dte_facilities)); + release_sock(sk); + if (rc) + rc = -EFAULT; + break; + } + + case SIOCX25SDTEFACILITIES: { + struct x25_dte_facilities dtefacs; + rc = -EFAULT; + if (copy_from_user(&dtefacs, argp, sizeof(dtefacs))) + break; + rc = -EINVAL; + lock_sock(sk); + if (sk->sk_state != TCP_LISTEN && + sk->sk_state != TCP_CLOSE) + goto out_dtefac_release; + if (dtefacs.calling_len > X25_MAX_AE_LEN) + goto out_dtefac_release; + if (dtefacs.calling_ae == NULL) + goto out_dtefac_release; + if (dtefacs.called_len > X25_MAX_AE_LEN) + goto out_dtefac_release; + if (dtefacs.called_ae == NULL) + goto out_dtefac_release; + x25->dte_facilities = dtefacs; + rc = 0; +out_dtefac_release: + release_sock(sk); + break; + } + + case SIOCX25GCALLUSERDATA: { + lock_sock(sk); + rc = copy_to_user(argp, &x25->calluserdata, + sizeof(x25->calluserdata)) + ? -EFAULT : 0; + release_sock(sk); + break; + } + + case SIOCX25SCALLUSERDATA: { + struct x25_calluserdata calluserdata; + + rc = -EFAULT; + if (copy_from_user(&calluserdata, argp, + sizeof(calluserdata))) + break; + rc = -EINVAL; + if (calluserdata.cudlength > X25_MAX_CUD_LEN) + break; + lock_sock(sk); + x25->calluserdata = calluserdata; + release_sock(sk); + rc = 0; + break; + } + + case SIOCX25GCAUSEDIAG: { + lock_sock(sk); + rc = copy_to_user(argp, &x25->causediag, + sizeof(x25->causediag)) + ? 
-EFAULT : 0; + release_sock(sk); + break; + } + + case SIOCX25SCAUSEDIAG: { + struct x25_causediag causediag; + rc = -EFAULT; + if (copy_from_user(&causediag, argp, sizeof(causediag))) + break; + lock_sock(sk); + x25->causediag = causediag; + release_sock(sk); + rc = 0; + break; + + } + + case SIOCX25SCUDMATCHLEN: { + struct x25_subaddr sub_addr; + rc = -EINVAL; + lock_sock(sk); + if(sk->sk_state != TCP_CLOSE) + goto out_cud_release; + rc = -EFAULT; + if (copy_from_user(&sub_addr, argp, + sizeof(sub_addr))) + goto out_cud_release; + rc = -EINVAL; + if(sub_addr.cudmatchlength > X25_MAX_CUD_LEN) + goto out_cud_release; + x25->cudmatchlength = sub_addr.cudmatchlength; + rc = 0; +out_cud_release: + release_sock(sk); + break; + } + + case SIOCX25CALLACCPTAPPRV: { + rc = -EINVAL; + lock_sock(sk); + if (sk->sk_state != TCP_CLOSE) + break; + clear_bit(X25_ACCPT_APPRV_FLAG, &x25->flags); + release_sock(sk); + rc = 0; + break; + } + + case SIOCX25SENDCALLACCPT: { + rc = -EINVAL; + lock_sock(sk); + if (sk->sk_state != TCP_ESTABLISHED) + break; + /* must call accptapprv above */ + if (test_bit(X25_ACCPT_APPRV_FLAG, &x25->flags)) + break; + x25_write_internal(sk, X25_CALL_ACCEPTED); + x25->state = X25_STATE_3; + release_sock(sk); + rc = 0; + break; + } + + default: + rc = -ENOIOCTLCMD; + break; + } + + return rc; +} + +static const struct net_proto_family x25_family_ops = { + .family = AF_X25, + .create = x25_create, + .owner = THIS_MODULE, +}; + +#ifdef CONFIG_COMPAT +static int compat_x25_subscr_ioctl(unsigned int cmd, + struct compat_x25_subscrip_struct __user *x25_subscr32) +{ + struct compat_x25_subscrip_struct x25_subscr; + struct x25_neigh *nb; + struct net_device *dev; + int rc = -EINVAL; + + rc = -EFAULT; + if (copy_from_user(&x25_subscr, x25_subscr32, sizeof(*x25_subscr32))) + goto out; + + rc = -EINVAL; + dev = x25_dev_get(x25_subscr.device); + if (dev == NULL) + goto out; + + nb = x25_get_neigh(dev); + if (nb == NULL) + goto out_dev_put; + + dev_put(dev); + + if (cmd == SIOCX25GSUBSCRIP) { + read_lock_bh(&x25_neigh_list_lock); + x25_subscr.extended = nb->extended; + x25_subscr.global_facil_mask = nb->global_facil_mask; + read_unlock_bh(&x25_neigh_list_lock); + rc = copy_to_user(x25_subscr32, &x25_subscr, + sizeof(*x25_subscr32)) ? 
-EFAULT : 0; + } else { + rc = -EINVAL; + if (x25_subscr.extended == 0 || x25_subscr.extended == 1) { + rc = 0; + write_lock_bh(&x25_neigh_list_lock); + nb->extended = x25_subscr.extended; + nb->global_facil_mask = x25_subscr.global_facil_mask; + write_unlock_bh(&x25_neigh_list_lock); + } + } + x25_neigh_put(nb); +out: + return rc; +out_dev_put: + dev_put(dev); + goto out; +} + +static int compat_x25_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + void __user *argp = compat_ptr(arg); + struct sock *sk = sock->sk; + + int rc = -ENOIOCTLCMD; + + switch(cmd) { + case TIOCOUTQ: + case TIOCINQ: + rc = x25_ioctl(sock, cmd, (unsigned long)argp); + break; + case SIOCGSTAMP: + rc = -EINVAL; + if (sk) + rc = compat_sock_get_timestamp(sk, + (struct timeval __user*)argp); + break; + case SIOCGSTAMPNS: + rc = -EINVAL; + if (sk) + rc = compat_sock_get_timestampns(sk, + (struct timespec __user*)argp); + break; + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFMETRIC: + case SIOCSIFMETRIC: + rc = -EINVAL; + break; + case SIOCADDRT: + case SIOCDELRT: + rc = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + rc = x25_route_ioctl(cmd, argp); + break; + case SIOCX25GSUBSCRIP: + rc = compat_x25_subscr_ioctl(cmd, argp); + break; + case SIOCX25SSUBSCRIP: + rc = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + rc = compat_x25_subscr_ioctl(cmd, argp); + break; + case SIOCX25GFACILITIES: + case SIOCX25SFACILITIES: + case SIOCX25GDTEFACILITIES: + case SIOCX25SDTEFACILITIES: + case SIOCX25GCALLUSERDATA: + case SIOCX25SCALLUSERDATA: + case SIOCX25GCAUSEDIAG: + case SIOCX25SCAUSEDIAG: + case SIOCX25SCUDMATCHLEN: + case SIOCX25CALLACCPTAPPRV: + case SIOCX25SENDCALLACCPT: + rc = x25_ioctl(sock, cmd, (unsigned long)argp); + break; + default: + rc = -ENOIOCTLCMD; + break; + } + return rc; +} +#endif + +static const struct proto_ops x25_proto_ops = { + .family = AF_X25, + .owner = THIS_MODULE, + .release = x25_release, + .bind = x25_bind, + .connect = x25_connect, + .socketpair = sock_no_socketpair, + .accept = x25_accept, + .getname = x25_getname, + .poll = datagram_poll, + .ioctl = x25_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_x25_ioctl, +#endif + .listen = x25_listen, + .shutdown = sock_no_shutdown, + .setsockopt = x25_setsockopt, + .getsockopt = x25_getsockopt, + .sendmsg = x25_sendmsg, + .recvmsg = x25_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct packet_type x25_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_X25), + .func = x25_lapb_receive_frame, +}; + +static struct notifier_block x25_dev_notifier = { + .notifier_call = x25_device_event, +}; + +void x25_kill_by_neigh(struct x25_neigh *nb) +{ + struct sock *s; + struct hlist_node *node; + + write_lock_bh(&x25_list_lock); + + sk_for_each(s, node, &x25_list) + if (x25_sk(s)->neighbour == nb) + x25_disconnect(s, ENETUNREACH, 0, 0); + + write_unlock_bh(&x25_list_lock); + + /* Remove any related forwards */ + x25_clear_forward_by_dev(nb->dev); +} + +static int __init x25_init(void) +{ + int rc = proto_register(&x25_proto, 0); + + if (rc != 0) + goto out; + + rc = sock_register(&x25_family_ops); + if (rc != 0) + goto out_proto; + + dev_add_pack(&x25_packet_type); + + rc = register_netdevice_notifier(&x25_dev_notifier); + if (rc != 0) + goto out_sock; + + printk(KERN_INFO "X.25 for Linux Version 0.2\n"); + + x25_register_sysctl(); + rc = 
x25_proc_init(); + if (rc != 0) + goto out_dev; +out: + return rc; +out_dev: + unregister_netdevice_notifier(&x25_dev_notifier); +out_sock: + sock_unregister(AF_X25); +out_proto: + proto_unregister(&x25_proto); + goto out; +} +module_init(x25_init); + +static void __exit x25_exit(void) +{ + x25_proc_exit(); + x25_link_free(); + x25_route_free(); + + x25_unregister_sysctl(); + + unregister_netdevice_notifier(&x25_dev_notifier); + + dev_remove_pack(&x25_packet_type); + + sock_unregister(AF_X25); + proto_unregister(&x25_proto); +} +module_exit(x25_exit); + +MODULE_AUTHOR("Jonathan Naylor "); +MODULE_DESCRIPTION("The X.25 Packet Layer network layer protocol"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_X25); diff --git a/net/x25/sysctl_net_x25.c b/net/x25/sysctl_net_x25.c new file mode 100644 index 00000000..d2efd29f --- /dev/null +++ b/net/x25/sysctl_net_x25.c @@ -0,0 +1,90 @@ +/* -*- linux-c -*- + * sysctl_net_x25.c: sysctl interface to net X.25 subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/x25 directory entry (empty =) ). [MS] + */ + +#include +#include +#include +#include +#include +#include + +static int min_timer[] = { 1 * HZ }; +static int max_timer[] = { 300 * HZ }; + +static struct ctl_table_header *x25_table_header; + +static struct ctl_table x25_table[] = { + { + .procname = "restart_request_timeout", + .data = &sysctl_x25_restart_request_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_timer, + .extra2 = &max_timer, + }, + { + .procname = "call_request_timeout", + .data = &sysctl_x25_call_request_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_timer, + .extra2 = &max_timer, + }, + { + .procname = "reset_request_timeout", + .data = &sysctl_x25_reset_request_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_timer, + .extra2 = &max_timer, + }, + { + .procname = "clear_request_timeout", + .data = &sysctl_x25_clear_request_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_timer, + .extra2 = &max_timer, + }, + { + .procname = "acknowledgement_hold_back_timeout", + .data = &sysctl_x25_ack_holdback_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_timer, + .extra2 = &max_timer, + }, + { + .procname = "x25_forward", + .data = &sysctl_x25_forward, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { 0, }, +}; + +static struct ctl_path x25_path[] = { + { .procname = "net", }, + { .procname = "x25", }, + { } +}; + +void __init x25_register_sysctl(void) +{ + x25_table_header = register_sysctl_paths(x25_path, x25_table); +} + +void x25_unregister_sysctl(void) +{ + unregister_sysctl_table(x25_table_header); +} diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c new file mode 100644 index 00000000..9005f6da --- /dev/null +++ b/net/x25/x25_dev.c @@ -0,0 +1,224 @@ +/* + * X.25 Packet Layer release 002 + * + * This is ALPHA test software. This code may break your machine, randomly fail to work with new + * releases, misbehave and/or generally screw up. It might even work. 
+ * + * This code REQUIRES 2.1.15 or higher + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * X.25 001 Jonathan Naylor Started coding. + * 2000-09-04 Henner Eisen Prevent freeing a dangling skb. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static int x25_receive_data(struct sk_buff *skb, struct x25_neigh *nb) +{ + struct sock *sk; + unsigned short frametype; + unsigned int lci; + + frametype = skb->data[2]; + lci = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF); + + /* + * LCI of zero is always for us, and its always a link control + * frame. + */ + if (lci == 0) { + x25_link_control(skb, nb, frametype); + return 0; + } + + /* + * Find an existing socket. + */ + if ((sk = x25_find_socket(lci, nb)) != NULL) { + int queued = 1; + + skb_reset_transport_header(skb); + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { + queued = x25_process_rx_frame(sk, skb); + } else { + queued = !sk_add_backlog(sk, skb); + } + bh_unlock_sock(sk); + sock_put(sk); + return queued; + } + + /* + * Is is a Call Request ? if so process it. + */ + if (frametype == X25_CALL_REQUEST) + return x25_rx_call_request(skb, nb, lci); + + /* + * Its not a Call Request, nor is it a control frame. + * Can we forward it? + */ + + if (x25_forward_data(lci, nb, skb)) { + if (frametype == X25_CLEAR_CONFIRMATION) { + x25_clear_forward_by_lci(lci); + } + kfree_skb(skb); + return 1; + } + +/* + x25_transmit_clear_request(nb, lci, 0x0D); +*/ + + if (frametype != X25_CLEAR_CONFIRMATION) + printk(KERN_DEBUG "x25_receive_data(): unknown frame type %2x\n",frametype); + + return 0; +} + +int x25_lapb_receive_frame(struct sk_buff *skb, struct net_device *dev, + struct packet_type *ptype, struct net_device *orig_dev) +{ + struct sk_buff *nskb; + struct x25_neigh *nb; + + if (!net_eq(dev_net(dev), &init_net)) + goto drop; + + nskb = skb_copy(skb, GFP_ATOMIC); + if (!nskb) + goto drop; + kfree_skb(skb); + skb = nskb; + + /* + * Packet received from unrecognised device, throw it away. 
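+ *  (only interfaces that went through x25_link_device_up() have an
+ *  x25_neigh entry, so a NULL return from x25_get_neigh() below is
+ *  what identifies an unrecognised device)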
+ */ + nb = x25_get_neigh(dev); + if (!nb) { + printk(KERN_DEBUG "X.25: unknown neighbour - %s\n", dev->name); + goto drop; + } + + switch (skb->data[0]) { + + case X25_IFACE_DATA: + skb_pull(skb, 1); + if (x25_receive_data(skb, nb)) { + x25_neigh_put(nb); + goto out; + } + break; + + case X25_IFACE_CONNECT: + x25_link_established(nb); + break; + + case X25_IFACE_DISCONNECT: + x25_link_terminated(nb); + break; + } + x25_neigh_put(nb); +drop: + kfree_skb(skb); +out: + return 0; +} + +void x25_establish_link(struct x25_neigh *nb) +{ + struct sk_buff *skb; + unsigned char *ptr; + + switch (nb->dev->type) { + case ARPHRD_X25: + if ((skb = alloc_skb(1, GFP_ATOMIC)) == NULL) { + printk(KERN_ERR "x25_dev: out of memory\n"); + return; + } + ptr = skb_put(skb, 1); + *ptr = X25_IFACE_CONNECT; + break; + +#if defined(CONFIG_LLC) || defined(CONFIG_LLC_MODULE) + case ARPHRD_ETHER: + return; +#endif + default: + return; + } + + skb->protocol = htons(ETH_P_X25); + skb->dev = nb->dev; + + dev_queue_xmit(skb); +} + +void x25_terminate_link(struct x25_neigh *nb) +{ + struct sk_buff *skb; + unsigned char *ptr; + +#if defined(CONFIG_LLC) || defined(CONFIG_LLC_MODULE) + if (nb->dev->type == ARPHRD_ETHER) + return; +#endif + if (nb->dev->type != ARPHRD_X25) + return; + + skb = alloc_skb(1, GFP_ATOMIC); + if (!skb) { + printk(KERN_ERR "x25_dev: out of memory\n"); + return; + } + + ptr = skb_put(skb, 1); + *ptr = X25_IFACE_DISCONNECT; + + skb->protocol = htons(ETH_P_X25); + skb->dev = nb->dev; + dev_queue_xmit(skb); +} + +void x25_send_frame(struct sk_buff *skb, struct x25_neigh *nb) +{ + unsigned char *dptr; + + skb_reset_network_header(skb); + + switch (nb->dev->type) { + case ARPHRD_X25: + dptr = skb_push(skb, 1); + *dptr = X25_IFACE_DATA; + break; + +#if defined(CONFIG_LLC) || defined(CONFIG_LLC_MODULE) + case ARPHRD_ETHER: + kfree_skb(skb); + return; +#endif + default: + kfree_skb(skb); + return; + } + + skb->protocol = htons(ETH_P_X25); + skb->dev = nb->dev; + + dev_queue_xmit(skb); +} diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c new file mode 100644 index 00000000..f77e4e75 --- /dev/null +++ b/net/x25/x25_facilities.c @@ -0,0 +1,346 @@ +/* + * X.25 Packet Layer release 002 + * + * This is ALPHA test software. This code may break your machine, + * randomly fail to work with new releases, misbehave and/or generally + * screw up. It might even work. + * + * This code REQUIRES 2.1.15 or higher + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * X.25 001 Split from x25_subr.c + * mar/20/00 Daniela Squassoni Disabling/enabling of facilities + * negotiation. + * apr/14/05 Shaun Pereira - Allow fast select with no restriction + * on response. 
+ */ + +#include +#include +#include +#include +#include + +/** + * x25_parse_facilities - Parse facilities from skb into the facilities structs + * + * @skb: sk_buff to parse + * @facilities: Regular facilities, updated as facilities are found + * @dte_facs: ITU DTE facilities, updated as DTE facilities are found + * @vc_fac_mask: mask is updated with all facilities found + * + * Return codes: + * -1 - Parsing error, caller should drop call and clean up + * 0 - Parse OK, this skb has no facilities + * >0 - Parse OK, returns the length of the facilities header + * + */ +int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities, + struct x25_dte_facilities *dte_facs, unsigned long *vc_fac_mask) +{ + unsigned char *p = skb->data; + unsigned int len; + + *vc_fac_mask = 0; + + /* + * The kernel knows which facilities were set on an incoming call but + * currently this information is not available to userspace. Here we + * give userspace who read incoming call facilities 0 length to indicate + * it wasn't set. + */ + dte_facs->calling_len = 0; + dte_facs->called_len = 0; + memset(dte_facs->called_ae, '\0', sizeof(dte_facs->called_ae)); + memset(dte_facs->calling_ae, '\0', sizeof(dte_facs->calling_ae)); + + if (skb->len < 1) + return 0; + + len = *p++; + + if (len >= skb->len) + return -1; + + while (len > 0) { + switch (*p & X25_FAC_CLASS_MASK) { + case X25_FAC_CLASS_A: + if (len < 2) + return -1; + switch (*p) { + case X25_FAC_REVERSE: + if((p[1] & 0x81) == 0x81) { + facilities->reverse = p[1] & 0x81; + *vc_fac_mask |= X25_MASK_REVERSE; + break; + } + + if((p[1] & 0x01) == 0x01) { + facilities->reverse = p[1] & 0x01; + *vc_fac_mask |= X25_MASK_REVERSE; + break; + } + + if((p[1] & 0x80) == 0x80) { + facilities->reverse = p[1] & 0x80; + *vc_fac_mask |= X25_MASK_REVERSE; + break; + } + + if(p[1] == 0x00) { + facilities->reverse + = X25_DEFAULT_REVERSE; + *vc_fac_mask |= X25_MASK_REVERSE; + break; + } + + case X25_FAC_THROUGHPUT: + facilities->throughput = p[1]; + *vc_fac_mask |= X25_MASK_THROUGHPUT; + break; + case X25_MARKER: + break; + default: + printk(KERN_DEBUG "X.25: unknown facility " + "%02X, value %02X\n", + p[0], p[1]); + break; + } + p += 2; + len -= 2; + break; + case X25_FAC_CLASS_B: + if (len < 3) + return -1; + switch (*p) { + case X25_FAC_PACKET_SIZE: + facilities->pacsize_in = p[1]; + facilities->pacsize_out = p[2]; + *vc_fac_mask |= X25_MASK_PACKET_SIZE; + break; + case X25_FAC_WINDOW_SIZE: + facilities->winsize_in = p[1]; + facilities->winsize_out = p[2]; + *vc_fac_mask |= X25_MASK_WINDOW_SIZE; + break; + default: + printk(KERN_DEBUG "X.25: unknown facility " + "%02X, values %02X, %02X\n", + p[0], p[1], p[2]); + break; + } + p += 3; + len -= 3; + break; + case X25_FAC_CLASS_C: + if (len < 4) + return -1; + printk(KERN_DEBUG "X.25: unknown facility %02X, " + "values %02X, %02X, %02X\n", + p[0], p[1], p[2], p[3]); + p += 4; + len -= 4; + break; + case X25_FAC_CLASS_D: + if (len < p[1] + 2) + return -1; + switch (*p) { + case X25_FAC_CALLING_AE: + if (p[1] > X25_MAX_DTE_FACIL_LEN || p[1] <= 1) + return -1; + dte_facs->calling_len = p[2]; + memcpy(dte_facs->calling_ae, &p[3], p[1] - 1); + *vc_fac_mask |= X25_MASK_CALLING_AE; + break; + case X25_FAC_CALLED_AE: + if (p[1] > X25_MAX_DTE_FACIL_LEN || p[1] <= 1) + return -1; + dte_facs->called_len = p[2]; + memcpy(dte_facs->called_ae, &p[3], p[1] - 1); + *vc_fac_mask |= X25_MASK_CALLED_AE; + break; + default: + printk(KERN_DEBUG "X.25: unknown facility %02X," + "length %d\n", p[0], p[1]); + break; + } + len -= p[1] 
+ 2; + p += p[1] + 2; + break; + } + } + + return p - skb->data; +} + +/* + * Create a set of facilities. + */ +int x25_create_facilities(unsigned char *buffer, + struct x25_facilities *facilities, + struct x25_dte_facilities *dte_facs, unsigned long facil_mask) +{ + unsigned char *p = buffer + 1; + int len; + + if (!facil_mask) { + /* + * Length of the facilities field in call_req or + * call_accept packets + */ + buffer[0] = 0; + len = 1; /* 1 byte for the length field */ + return len; + } + + if (facilities->reverse && (facil_mask & X25_MASK_REVERSE)) { + *p++ = X25_FAC_REVERSE; + *p++ = facilities->reverse; + } + + if (facilities->throughput && (facil_mask & X25_MASK_THROUGHPUT)) { + *p++ = X25_FAC_THROUGHPUT; + *p++ = facilities->throughput; + } + + if ((facilities->pacsize_in || facilities->pacsize_out) && + (facil_mask & X25_MASK_PACKET_SIZE)) { + *p++ = X25_FAC_PACKET_SIZE; + *p++ = facilities->pacsize_in ? : facilities->pacsize_out; + *p++ = facilities->pacsize_out ? : facilities->pacsize_in; + } + + if ((facilities->winsize_in || facilities->winsize_out) && + (facil_mask & X25_MASK_WINDOW_SIZE)) { + *p++ = X25_FAC_WINDOW_SIZE; + *p++ = facilities->winsize_in ? : facilities->winsize_out; + *p++ = facilities->winsize_out ? : facilities->winsize_in; + } + + if (facil_mask & (X25_MASK_CALLING_AE|X25_MASK_CALLED_AE)) { + *p++ = X25_MARKER; + *p++ = X25_DTE_SERVICES; + } + + if (dte_facs->calling_len && (facil_mask & X25_MASK_CALLING_AE)) { + unsigned bytecount = (dte_facs->calling_len + 1) >> 1; + *p++ = X25_FAC_CALLING_AE; + *p++ = 1 + bytecount; + *p++ = dte_facs->calling_len; + memcpy(p, dte_facs->calling_ae, bytecount); + p += bytecount; + } + + if (dte_facs->called_len && (facil_mask & X25_MASK_CALLED_AE)) { + unsigned bytecount = (dte_facs->called_len % 2) ? + dte_facs->called_len / 2 + 1 : + dte_facs->called_len / 2; + *p++ = X25_FAC_CALLED_AE; + *p++ = 1 + bytecount; + *p++ = dte_facs->called_len; + memcpy(p, dte_facs->called_ae, bytecount); + p+=bytecount; + } + + len = p - buffer; + buffer[0] = len - 1; + + return len; +} + +/* + * Try to reach a compromise on a set of facilities. + * + * The only real problem is with reverse charging. + */ +int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk, + struct x25_facilities *new, struct x25_dte_facilities *dte) +{ + struct x25_sock *x25 = x25_sk(sk); + struct x25_facilities *ours = &x25->facilities; + struct x25_facilities theirs; + int len; + + memset(&theirs, 0, sizeof(theirs)); + memcpy(new, ours, sizeof(*new)); + + len = x25_parse_facilities(skb, &theirs, dte, &x25->vc_facil_mask); + if (len < 0) + return len; + + /* + * They want reverse charging, we won't accept it. 
+ */ + if ((theirs.reverse & 0x01 ) && (ours->reverse & 0x01)) { + SOCK_DEBUG(sk, "X.25: rejecting reverse charging request\n"); + return -1; + } + + new->reverse = theirs.reverse; + + if (theirs.throughput) { + int theirs_in = theirs.throughput & 0x0f; + int theirs_out = theirs.throughput & 0xf0; + int ours_in = ours->throughput & 0x0f; + int ours_out = ours->throughput & 0xf0; + if (!ours_in || theirs_in < ours_in) { + SOCK_DEBUG(sk, "X.25: inbound throughput negotiated\n"); + new->throughput = (new->throughput & 0xf0) | theirs_in; + } + if (!ours_out || theirs_out < ours_out) { + SOCK_DEBUG(sk, + "X.25: outbound throughput negotiated\n"); + new->throughput = (new->throughput & 0x0f) | theirs_out; + } + } + + if (theirs.pacsize_in && theirs.pacsize_out) { + if (theirs.pacsize_in < ours->pacsize_in) { + SOCK_DEBUG(sk, "X.25: packet size inwards negotiated down\n"); + new->pacsize_in = theirs.pacsize_in; + } + if (theirs.pacsize_out < ours->pacsize_out) { + SOCK_DEBUG(sk, "X.25: packet size outwards negotiated down\n"); + new->pacsize_out = theirs.pacsize_out; + } + } + + if (theirs.winsize_in && theirs.winsize_out) { + if (theirs.winsize_in < ours->winsize_in) { + SOCK_DEBUG(sk, "X.25: window size inwards negotiated down\n"); + new->winsize_in = theirs.winsize_in; + } + if (theirs.winsize_out < ours->winsize_out) { + SOCK_DEBUG(sk, "X.25: window size outwards negotiated down\n"); + new->winsize_out = theirs.winsize_out; + } + } + + return len; +} + +/* + * Limit values of certain facilities according to the capability of the + * currently attached x25 link. + */ +void x25_limit_facilities(struct x25_facilities *facilities, + struct x25_neigh *nb) +{ + + if (!nb->extended) { + if (facilities->winsize_in > 7) { + printk(KERN_DEBUG "X.25: incoming winsize limited to 7\n"); + facilities->winsize_in = 7; + } + if (facilities->winsize_out > 7) { + facilities->winsize_out = 7; + printk( KERN_DEBUG "X.25: outgoing winsize limited to 7\n"); + } + } +} diff --git a/net/x25/x25_forward.c b/net/x25/x25_forward.c new file mode 100644 index 00000000..c541b622 --- /dev/null +++ b/net/x25/x25_forward.c @@ -0,0 +1,167 @@ +/* + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * 03-01-2007 Added forwarding for x.25 Andrew Hendry + */ +#include +#include +#include +#include + +LIST_HEAD(x25_forward_list); +DEFINE_RWLOCK(x25_forward_list_lock); + +int x25_forward_call(struct x25_address *dest_addr, struct x25_neigh *from, + struct sk_buff *skb, int lci) +{ + struct x25_route *rt; + struct x25_neigh *neigh_new = NULL; + struct list_head *entry; + struct x25_forward *x25_frwd, *new_frwd; + struct sk_buff *skbn; + short same_lci = 0; + int rc = 0; + + if ((rt = x25_get_route(dest_addr)) == NULL) + goto out_no_route; + + if ((neigh_new = x25_get_neigh(rt->dev)) == NULL) { + /* This shouldn't happen, if it occurs somehow + * do something sensible + */ + goto out_put_route; + } + + /* Avoid a loop. This is the normal exit path for a + * system with only one x.25 iface and default route + */ + if (rt->dev == from->dev) { + goto out_put_nb; + } + + /* Remote end sending a call request on an already + * established LCI? It shouldn't happen, just in case.. 
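+ * If it does happen, the call is still forwarded below, but no second
+ * forwarding entry is registered for the same LCI.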
+ */ + read_lock_bh(&x25_forward_list_lock); + list_for_each(entry, &x25_forward_list) { + x25_frwd = list_entry(entry, struct x25_forward, node); + if (x25_frwd->lci == lci) { + printk(KERN_WARNING "X.25: call request for lci which is already registered!, transmitting but not registering new pair\n"); + same_lci = 1; + } + } + read_unlock_bh(&x25_forward_list_lock); + + /* Save the forwarding details for future traffic */ + if (!same_lci){ + if ((new_frwd = kmalloc(sizeof(struct x25_forward), + GFP_ATOMIC)) == NULL){ + rc = -ENOMEM; + goto out_put_nb; + } + new_frwd->lci = lci; + new_frwd->dev1 = rt->dev; + new_frwd->dev2 = from->dev; + write_lock_bh(&x25_forward_list_lock); + list_add(&new_frwd->node, &x25_forward_list); + write_unlock_bh(&x25_forward_list_lock); + } + + /* Forward the call request */ + if ( (skbn = skb_clone(skb, GFP_ATOMIC)) == NULL){ + goto out_put_nb; + } + x25_transmit_link(skbn, neigh_new); + rc = 1; + + +out_put_nb: + x25_neigh_put(neigh_new); + +out_put_route: + x25_route_put(rt); + +out_no_route: + return rc; +} + + +int x25_forward_data(int lci, struct x25_neigh *from, struct sk_buff *skb) { + + struct x25_forward *frwd; + struct list_head *entry; + struct net_device *peer = NULL; + struct x25_neigh *nb; + struct sk_buff *skbn; + int rc = 0; + + read_lock_bh(&x25_forward_list_lock); + list_for_each(entry, &x25_forward_list) { + frwd = list_entry(entry, struct x25_forward, node); + if (frwd->lci == lci) { + /* The call is established, either side can send */ + if (from->dev == frwd->dev1) { + peer = frwd->dev2; + } else { + peer = frwd->dev1; + } + break; + } + } + read_unlock_bh(&x25_forward_list_lock); + + if ( (nb = x25_get_neigh(peer)) == NULL) + goto out; + + if ( (skbn = pskb_copy(skb, GFP_ATOMIC)) == NULL){ + goto output; + + } + x25_transmit_link(skbn, nb); + + rc = 1; +output: + x25_neigh_put(nb); +out: + return rc; +} + +void x25_clear_forward_by_lci(unsigned int lci) +{ + struct x25_forward *fwd; + struct list_head *entry, *tmp; + + write_lock_bh(&x25_forward_list_lock); + + list_for_each_safe(entry, tmp, &x25_forward_list) { + fwd = list_entry(entry, struct x25_forward, node); + if (fwd->lci == lci) { + list_del(&fwd->node); + kfree(fwd); + } + } + write_unlock_bh(&x25_forward_list_lock); +} + + +void x25_clear_forward_by_dev(struct net_device *dev) +{ + struct x25_forward *fwd; + struct list_head *entry, *tmp; + + write_lock_bh(&x25_forward_list_lock); + + list_for_each_safe(entry, tmp, &x25_forward_list) { + fwd = list_entry(entry, struct x25_forward, node); + if ((fwd->dev1 == dev) || (fwd->dev2 == dev)){ + list_del(&fwd->node); + kfree(fwd); + } + } + write_unlock_bh(&x25_forward_list_lock); +} diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c new file mode 100644 index 00000000..15de65f0 --- /dev/null +++ b/net/x25/x25_in.c @@ -0,0 +1,384 @@ +/* + * X.25 Packet Layer release 002 + * + * This is ALPHA test software. This code may break your machine, + * randomly fail to work with new releases, misbehave and/or generally + * screw up. It might even work. + * + * This code REQUIRES 2.1.15 or higher + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * X.25 001 Jonathan Naylor Started coding. + * X.25 002 Jonathan Naylor Centralised disconnection code. + * New timer architecture. 
+ * 2000-03-20 Daniela Squassoni Disabling/enabling of facilities + * negotiation. + * 2000-11-10 Henner Eisen Check and reset for out-of-sequence + * i-frames. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static int x25_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more) +{ + struct sk_buff *skbo, *skbn = skb; + struct x25_sock *x25 = x25_sk(sk); + + if (more) { + x25->fraglen += skb->len; + skb_queue_tail(&x25->fragment_queue, skb); + skb_set_owner_r(skb, sk); + return 0; + } + + if (!more && x25->fraglen > 0) { /* End of fragment */ + int len = x25->fraglen + skb->len; + + if ((skbn = alloc_skb(len, GFP_ATOMIC)) == NULL){ + kfree_skb(skb); + return 1; + } + + skb_queue_tail(&x25->fragment_queue, skb); + + skb_reset_transport_header(skbn); + + skbo = skb_dequeue(&x25->fragment_queue); + skb_copy_from_linear_data(skbo, skb_put(skbn, skbo->len), + skbo->len); + kfree_skb(skbo); + + while ((skbo = + skb_dequeue(&x25->fragment_queue)) != NULL) { + skb_pull(skbo, (x25->neighbour->extended) ? + X25_EXT_MIN_LEN : X25_STD_MIN_LEN); + skb_copy_from_linear_data(skbo, + skb_put(skbn, skbo->len), + skbo->len); + kfree_skb(skbo); + } + + x25->fraglen = 0; + } + + skb_set_owner_r(skbn, sk); + skb_queue_tail(&sk->sk_receive_queue, skbn); + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, skbn->len); + + return 0; +} + +/* + * State machine for state 1, Awaiting Call Accepted State. + * The handling of the timer(s) is in file x25_timer.c. + * Handling of state 0 and connection release is in af_x25.c. + */ +static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametype) +{ + struct x25_address source_addr, dest_addr; + int len; + struct x25_sock *x25 = x25_sk(sk); + + switch (frametype) { + case X25_CALL_ACCEPTED: { + + x25_stop_timer(sk); + x25->condition = 0x00; + x25->vs = 0; + x25->va = 0; + x25->vr = 0; + x25->vl = 0; + x25->state = X25_STATE_3; + sk->sk_state = TCP_ESTABLISHED; + /* + * Parse the data in the frame. + */ + skb_pull(skb, X25_STD_MIN_LEN); + + len = x25_parse_address_block(skb, &source_addr, + &dest_addr); + if (len > 0) + skb_pull(skb, len); + else if (len < 0) + goto out_clear; + + len = x25_parse_facilities(skb, &x25->facilities, + &x25->dte_facilities, + &x25->vc_facil_mask); + if (len > 0) + skb_pull(skb, len); + else if (len < 0) + goto out_clear; + /* + * Copy any Call User Data. + */ + if (skb->len > 0) { + skb_copy_from_linear_data(skb, + x25->calluserdata.cuddata, + skb->len); + x25->calluserdata.cudlength = skb->len; + } + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + break; + } + case X25_CLEAR_REQUEST: + x25_write_internal(sk, X25_CLEAR_CONFIRMATION); + x25_disconnect(sk, ECONNREFUSED, skb->data[3], skb->data[4]); + break; + + default: + break; + } + + return 0; + +out_clear: + x25_write_internal(sk, X25_CLEAR_REQUEST); + x25->state = X25_STATE_2; + x25_start_t23timer(sk); + return 0; +} + +/* + * State machine for state 2, Awaiting Clear Confirmation State. + * The handling of the timer(s) is in file x25_timer.c + * Handling of state 0 and connection release is in af_x25.c. 
+ */ +static int x25_state2_machine(struct sock *sk, struct sk_buff *skb, int frametype) +{ + switch (frametype) { + + case X25_CLEAR_REQUEST: + x25_write_internal(sk, X25_CLEAR_CONFIRMATION); + x25_disconnect(sk, 0, skb->data[3], skb->data[4]); + break; + + case X25_CLEAR_CONFIRMATION: + x25_disconnect(sk, 0, 0, 0); + break; + + default: + break; + } + + return 0; +} + +/* + * State machine for state 3, Connected State. + * The handling of the timer(s) is in file x25_timer.c + * Handling of state 0 and connection release is in af_x25.c. + */ +static int x25_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype, int ns, int nr, int q, int d, int m) +{ + int queued = 0; + int modulus; + struct x25_sock *x25 = x25_sk(sk); + + modulus = (x25->neighbour->extended) ? X25_EMODULUS : X25_SMODULUS; + + switch (frametype) { + + case X25_RESET_REQUEST: + x25_write_internal(sk, X25_RESET_CONFIRMATION); + x25_stop_timer(sk); + x25->condition = 0x00; + x25->vs = 0; + x25->vr = 0; + x25->va = 0; + x25->vl = 0; + x25_requeue_frames(sk); + break; + + case X25_CLEAR_REQUEST: + x25_write_internal(sk, X25_CLEAR_CONFIRMATION); + x25_disconnect(sk, 0, skb->data[3], skb->data[4]); + break; + + case X25_RR: + case X25_RNR: + if (!x25_validate_nr(sk, nr)) { + x25_clear_queues(sk); + x25_write_internal(sk, X25_RESET_REQUEST); + x25_start_t22timer(sk); + x25->condition = 0x00; + x25->vs = 0; + x25->vr = 0; + x25->va = 0; + x25->vl = 0; + x25->state = X25_STATE_4; + } else { + x25_frames_acked(sk, nr); + if (frametype == X25_RNR) { + x25->condition |= X25_COND_PEER_RX_BUSY; + } else { + x25->condition &= ~X25_COND_PEER_RX_BUSY; + } + } + break; + + case X25_DATA: /* XXX */ + x25->condition &= ~X25_COND_PEER_RX_BUSY; + if ((ns != x25->vr) || !x25_validate_nr(sk, nr)) { + x25_clear_queues(sk); + x25_write_internal(sk, X25_RESET_REQUEST); + x25_start_t22timer(sk); + x25->condition = 0x00; + x25->vs = 0; + x25->vr = 0; + x25->va = 0; + x25->vl = 0; + x25->state = X25_STATE_4; + break; + } + x25_frames_acked(sk, nr); + if (ns == x25->vr) { + if (x25_queue_rx_frame(sk, skb, m) == 0) { + x25->vr = (x25->vr + 1) % modulus; + queued = 1; + } else { + /* Should never happen */ + x25_clear_queues(sk); + x25_write_internal(sk, X25_RESET_REQUEST); + x25_start_t22timer(sk); + x25->condition = 0x00; + x25->vs = 0; + x25->vr = 0; + x25->va = 0; + x25->vl = 0; + x25->state = X25_STATE_4; + break; + } + if (atomic_read(&sk->sk_rmem_alloc) > + (sk->sk_rcvbuf >> 1)) + x25->condition |= X25_COND_OWN_RX_BUSY; + } + /* + * If the window is full Ack it immediately, else + * start the holdback timer. + */ + if (((x25->vl + x25->facilities.winsize_in) % modulus) == x25->vr) { + x25->condition &= ~X25_COND_ACK_PENDING; + x25_stop_timer(sk); + x25_enquiry_response(sk); + } else { + x25->condition |= X25_COND_ACK_PENDING; + x25_start_t2timer(sk); + } + break; + + case X25_INTERRUPT_CONFIRMATION: + clear_bit(X25_INTERRUPT_FLAG, &x25->flags); + break; + + case X25_INTERRUPT: + if (sock_flag(sk, SOCK_URGINLINE)) + queued = !sock_queue_rcv_skb(sk, skb); + else { + skb_set_owner_r(skb, sk); + skb_queue_tail(&x25->interrupt_in_queue, skb); + queued = 1; + } + sk_send_sigurg(sk); + x25_write_internal(sk, X25_INTERRUPT_CONFIRMATION); + break; + + default: + printk(KERN_WARNING "x25: unknown %02X in state 3\n", frametype); + break; + } + + return queued; +} + +/* + * State machine for state 4, Awaiting Reset Confirmation State. 
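+ * A Reset Request arriving in this state is answered with a Reset
+ * Confirmation and then handled as the awaited confirmation itself
+ * (note the fall-through in the switch below).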
+ * The handling of the timer(s) is in file x25_timer.c + * Handling of state 0 and connection release is in af_x25.c. + */ +static int x25_state4_machine(struct sock *sk, struct sk_buff *skb, int frametype) +{ + switch (frametype) { + + case X25_RESET_REQUEST: + x25_write_internal(sk, X25_RESET_CONFIRMATION); + case X25_RESET_CONFIRMATION: { + struct x25_sock *x25 = x25_sk(sk); + + x25_stop_timer(sk); + x25->condition = 0x00; + x25->va = 0; + x25->vr = 0; + x25->vs = 0; + x25->vl = 0; + x25->state = X25_STATE_3; + x25_requeue_frames(sk); + break; + } + case X25_CLEAR_REQUEST: + x25_write_internal(sk, X25_CLEAR_CONFIRMATION); + x25_disconnect(sk, 0, skb->data[3], skb->data[4]); + break; + + default: + break; + } + + return 0; +} + +/* Higher level upcall for a LAPB frame */ +int x25_process_rx_frame(struct sock *sk, struct sk_buff *skb) +{ + struct x25_sock *x25 = x25_sk(sk); + int queued = 0, frametype, ns, nr, q, d, m; + + if (x25->state == X25_STATE_0) + return 0; + + frametype = x25_decode(sk, skb, &ns, &nr, &q, &d, &m); + + switch (x25->state) { + case X25_STATE_1: + queued = x25_state1_machine(sk, skb, frametype); + break; + case X25_STATE_2: + queued = x25_state2_machine(sk, skb, frametype); + break; + case X25_STATE_3: + queued = x25_state3_machine(sk, skb, frametype, ns, nr, q, d, m); + break; + case X25_STATE_4: + queued = x25_state4_machine(sk, skb, frametype); + break; + } + + x25_kick(sk); + + return queued; +} + +int x25_backlog_rcv(struct sock *sk, struct sk_buff *skb) +{ + int queued = x25_process_rx_frame(sk, skb); + + if (!queued) + kfree_skb(skb); + + return 0; +} diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c new file mode 100644 index 00000000..21306928 --- /dev/null +++ b/net/x25/x25_link.c @@ -0,0 +1,407 @@ +/* + * X.25 Packet Layer release 002 + * + * This is ALPHA test software. This code may break your machine, + * randomly fail to work with new releases, misbehave and/or generally + * screw up. It might even work. + * + * This code REQUIRES 2.1.15 or higher + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * X.25 001 Jonathan Naylor Started coding. + * X.25 002 Jonathan Naylor New timer architecture. + * mar/20/00 Daniela Squassoni Disabling/enabling of facilities + * negotiation. + * 2000-09-04 Henner Eisen dev_hold() / dev_put() for x25_neigh. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +LIST_HEAD(x25_neigh_list); +DEFINE_RWLOCK(x25_neigh_list_lock); + +static void x25_t20timer_expiry(unsigned long); + +static void x25_transmit_restart_confirmation(struct x25_neigh *nb); +static void x25_transmit_restart_request(struct x25_neigh *nb); + +/* + * Linux set/reset timer routines + */ +static inline void x25_start_t20timer(struct x25_neigh *nb) +{ + mod_timer(&nb->t20timer, jiffies + nb->t20); +} + +static void x25_t20timer_expiry(unsigned long param) +{ + struct x25_neigh *nb = (struct x25_neigh *)param; + + x25_transmit_restart_request(nb); + + x25_start_t20timer(nb); +} + +static inline void x25_stop_t20timer(struct x25_neigh *nb) +{ + del_timer(&nb->t20timer); +} + +static inline int x25_t20timer_pending(struct x25_neigh *nb) +{ + return timer_pending(&nb->t20timer); +} + +/* + * This handles all restart and diagnostic frames. 
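+ * Restart Request/Confirmation frames drive the neighbour's link state;
+ * Diagnostic frames are only logged. Once state 3 is reached, anything
+ * queued while the link was restarting is flushed out.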
+ */ +void x25_link_control(struct sk_buff *skb, struct x25_neigh *nb, + unsigned short frametype) +{ + struct sk_buff *skbn; + int confirm; + + switch (frametype) { + case X25_RESTART_REQUEST: + confirm = !x25_t20timer_pending(nb); + x25_stop_t20timer(nb); + nb->state = X25_LINK_STATE_3; + if (confirm) + x25_transmit_restart_confirmation(nb); + break; + + case X25_RESTART_CONFIRMATION: + x25_stop_t20timer(nb); + nb->state = X25_LINK_STATE_3; + break; + + case X25_DIAGNOSTIC: + printk(KERN_WARNING "x25: diagnostic #%d - " + "%02X %02X %02X\n", + skb->data[3], skb->data[4], + skb->data[5], skb->data[6]); + break; + + default: + printk(KERN_WARNING "x25: received unknown %02X " + "with LCI 000\n", frametype); + break; + } + + if (nb->state == X25_LINK_STATE_3) + while ((skbn = skb_dequeue(&nb->queue)) != NULL) + x25_send_frame(skbn, nb); +} + +/* + * This routine is called when a Restart Request is needed + */ +static void x25_transmit_restart_request(struct x25_neigh *nb) +{ + unsigned char *dptr; + int len = X25_MAX_L2_LEN + X25_STD_MIN_LEN + 2; + struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC); + + if (!skb) + return; + + skb_reserve(skb, X25_MAX_L2_LEN); + + dptr = skb_put(skb, X25_STD_MIN_LEN + 2); + + *dptr++ = nb->extended ? X25_GFI_EXTSEQ : X25_GFI_STDSEQ; + *dptr++ = 0x00; + *dptr++ = X25_RESTART_REQUEST; + *dptr++ = 0x00; + *dptr++ = 0; + + skb->sk = NULL; + + x25_send_frame(skb, nb); +} + +/* + * This routine is called when a Restart Confirmation is needed + */ +static void x25_transmit_restart_confirmation(struct x25_neigh *nb) +{ + unsigned char *dptr; + int len = X25_MAX_L2_LEN + X25_STD_MIN_LEN; + struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC); + + if (!skb) + return; + + skb_reserve(skb, X25_MAX_L2_LEN); + + dptr = skb_put(skb, X25_STD_MIN_LEN); + + *dptr++ = nb->extended ? X25_GFI_EXTSEQ : X25_GFI_STDSEQ; + *dptr++ = 0x00; + *dptr++ = X25_RESTART_CONFIRMATION; + + skb->sk = NULL; + + x25_send_frame(skb, nb); +} + +/* + * This routine is called when a Clear Request is needed outside of the context + * of a connected socket. + */ +void x25_transmit_clear_request(struct x25_neigh *nb, unsigned int lci, + unsigned char cause) +{ + unsigned char *dptr; + int len = X25_MAX_L2_LEN + X25_STD_MIN_LEN + 2; + struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC); + + if (!skb) + return; + + skb_reserve(skb, X25_MAX_L2_LEN); + + dptr = skb_put(skb, X25_STD_MIN_LEN + 2); + + *dptr++ = ((lci >> 8) & 0x0F) | (nb->extended ? + X25_GFI_EXTSEQ : + X25_GFI_STDSEQ); + *dptr++ = (lci >> 0) & 0xFF; + *dptr++ = X25_CLEAR_REQUEST; + *dptr++ = cause; + *dptr++ = 0x00; + + skb->sk = NULL; + + x25_send_frame(skb, nb); +} + +void x25_transmit_link(struct sk_buff *skb, struct x25_neigh *nb) +{ + switch (nb->state) { + case X25_LINK_STATE_0: + skb_queue_tail(&nb->queue, skb); + nb->state = X25_LINK_STATE_1; + x25_establish_link(nb); + break; + case X25_LINK_STATE_1: + case X25_LINK_STATE_2: + skb_queue_tail(&nb->queue, skb); + break; + case X25_LINK_STATE_3: + x25_send_frame(skb, nb); + break; + } +} + +/* + * Called when the link layer has become established. + */ +void x25_link_established(struct x25_neigh *nb) +{ + switch (nb->state) { + case X25_LINK_STATE_0: + nb->state = X25_LINK_STATE_2; + break; + case X25_LINK_STATE_1: + x25_transmit_restart_request(nb); + nb->state = X25_LINK_STATE_2; + x25_start_t20timer(nb); + break; + } +} + +/* + * Called when the link layer has terminated, or an establishment + * request has failed. 
+ */ + +void x25_link_terminated(struct x25_neigh *nb) +{ + nb->state = X25_LINK_STATE_0; + /* Out of order: clear existing virtual calls (X.25 03/93 4.6.3) */ + x25_kill_by_neigh(nb); +} + +/* + * Add a new device. + */ +void x25_link_device_up(struct net_device *dev) +{ + struct x25_neigh *nb = kmalloc(sizeof(*nb), GFP_ATOMIC); + + if (!nb) + return; + + skb_queue_head_init(&nb->queue); + setup_timer(&nb->t20timer, x25_t20timer_expiry, (unsigned long)nb); + + dev_hold(dev); + nb->dev = dev; + nb->state = X25_LINK_STATE_0; + nb->extended = 0; + /* + * Enables negotiation + */ + nb->global_facil_mask = X25_MASK_REVERSE | + X25_MASK_THROUGHPUT | + X25_MASK_PACKET_SIZE | + X25_MASK_WINDOW_SIZE; + nb->t20 = sysctl_x25_restart_request_timeout; + atomic_set(&nb->refcnt, 1); + + write_lock_bh(&x25_neigh_list_lock); + list_add(&nb->node, &x25_neigh_list); + write_unlock_bh(&x25_neigh_list_lock); +} + +/** + * __x25_remove_neigh - remove neighbour from x25_neigh_list + * @nb - neigh to remove + * + * Remove neighbour from x25_neigh_list. If it was there. + * Caller must hold x25_neigh_list_lock. + */ +static void __x25_remove_neigh(struct x25_neigh *nb) +{ + skb_queue_purge(&nb->queue); + x25_stop_t20timer(nb); + + if (nb->node.next) { + list_del(&nb->node); + x25_neigh_put(nb); + } +} + +/* + * A device has been removed, remove its links. + */ +void x25_link_device_down(struct net_device *dev) +{ + struct x25_neigh *nb; + struct list_head *entry, *tmp; + + write_lock_bh(&x25_neigh_list_lock); + + list_for_each_safe(entry, tmp, &x25_neigh_list) { + nb = list_entry(entry, struct x25_neigh, node); + + if (nb->dev == dev) { + __x25_remove_neigh(nb); + dev_put(dev); + } + } + + write_unlock_bh(&x25_neigh_list_lock); +} + +/* + * Given a device, return the neighbour address. + */ +struct x25_neigh *x25_get_neigh(struct net_device *dev) +{ + struct x25_neigh *nb, *use = NULL; + struct list_head *entry; + + read_lock_bh(&x25_neigh_list_lock); + list_for_each(entry, &x25_neigh_list) { + nb = list_entry(entry, struct x25_neigh, node); + + if (nb->dev == dev) { + use = nb; + break; + } + } + + if (use) + x25_neigh_hold(use); + read_unlock_bh(&x25_neigh_list_lock); + return use; +} + +/* + * Handle the ioctls that control the subscription functions. + */ +int x25_subscr_ioctl(unsigned int cmd, void __user *arg) +{ + struct x25_subscrip_struct x25_subscr; + struct x25_neigh *nb; + struct net_device *dev; + int rc = -EINVAL; + + if (cmd != SIOCX25GSUBSCRIP && cmd != SIOCX25SSUBSCRIP) + goto out; + + rc = -EFAULT; + if (copy_from_user(&x25_subscr, arg, sizeof(x25_subscr))) + goto out; + + rc = -EINVAL; + if ((dev = x25_dev_get(x25_subscr.device)) == NULL) + goto out; + + if ((nb = x25_get_neigh(dev)) == NULL) + goto out_dev_put; + + dev_put(dev); + + if (cmd == SIOCX25GSUBSCRIP) { + read_lock_bh(&x25_neigh_list_lock); + x25_subscr.extended = nb->extended; + x25_subscr.global_facil_mask = nb->global_facil_mask; + read_unlock_bh(&x25_neigh_list_lock); + rc = copy_to_user(arg, &x25_subscr, + sizeof(x25_subscr)) ? -EFAULT : 0; + } else { + rc = -EINVAL; + if (!(x25_subscr.extended && x25_subscr.extended != 1)) { + rc = 0; + write_lock_bh(&x25_neigh_list_lock); + nb->extended = x25_subscr.extended; + nb->global_facil_mask = x25_subscr.global_facil_mask; + write_unlock_bh(&x25_neigh_list_lock); + } + } + x25_neigh_put(nb); +out: + return rc; +out_dev_put: + dev_put(dev); + goto out; +} + + +/* + * Release all memory associated with X.25 neighbour structures. 
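+ * Called from x25_exit() on module unload; each neighbour is unlinked
+ * and the reference on its device is dropped.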
+ */ +void __exit x25_link_free(void) +{ + struct x25_neigh *nb; + struct list_head *entry, *tmp; + + write_lock_bh(&x25_neigh_list_lock); + + list_for_each_safe(entry, tmp, &x25_neigh_list) { + struct net_device *dev; + + nb = list_entry(entry, struct x25_neigh, node); + dev = nb->dev; + __x25_remove_neigh(nb); + dev_put(dev); + } + write_unlock_bh(&x25_neigh_list_lock); +} diff --git a/net/x25/x25_out.c b/net/x25/x25_out.c new file mode 100644 index 00000000..0144271d --- /dev/null +++ b/net/x25/x25_out.c @@ -0,0 +1,231 @@ +/* + * X.25 Packet Layer release 002 + * + * This is ALPHA test software. This code may break your machine, + * randomly fail to work with new releases, misbehave and/or generally + * screw up. It might even work. + * + * This code REQUIRES 2.1.15 or higher + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * X.25 001 Jonathan Naylor Started coding. + * X.25 002 Jonathan Naylor New timer architecture. + * 2000-09-04 Henner Eisen Prevented x25_output() skb leakage. + * 2000-10-27 Henner Eisen MSG_DONTWAIT for fragment allocation. + * 2000-11-10 Henner Eisen x25_send_iframe(): re-queued frames + * needed cleaned seq-number fields. + */ + +#include +#include +#include +#include +#include +#include +#include + +static int x25_pacsize_to_bytes(unsigned int pacsize) +{ + int bytes = 1; + + if (!pacsize) + return 128; + + while (pacsize-- > 0) + bytes *= 2; + + return bytes; +} + +/* + * This is where all X.25 information frames pass. + * + * Returns the amount of user data bytes sent on success + * or a negative error code on failure. + */ +int x25_output(struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff *skbn; + unsigned char header[X25_EXT_MIN_LEN]; + int err, frontlen, len; + int sent=0, noblock = X25_SKB_CB(skb)->flags & MSG_DONTWAIT; + struct x25_sock *x25 = x25_sk(sk); + int header_len = x25->neighbour->extended ? X25_EXT_MIN_LEN : + X25_STD_MIN_LEN; + int max_len = x25_pacsize_to_bytes(x25->facilities.pacsize_out); + + if (skb->len - header_len > max_len) { + /* Save a copy of the Header */ + skb_copy_from_linear_data(skb, header, header_len); + skb_pull(skb, header_len); + + frontlen = skb_headroom(skb); + + while (skb->len > 0) { + release_sock(sk); + skbn = sock_alloc_send_skb(sk, frontlen + max_len, + noblock, &err); + lock_sock(sk); + if (!skbn) { + if (err == -EWOULDBLOCK && noblock){ + kfree_skb(skb); + return sent; + } + SOCK_DEBUG(sk, "x25_output: fragment alloc" + " failed, err=%d, %d bytes " + "sent\n", err, sent); + return err; + } + + skb_reserve(skbn, frontlen); + + len = max_len > skb->len ? skb->len : max_len; + + /* Copy the user data */ + skb_copy_from_linear_data(skb, skb_put(skbn, len), len); + skb_pull(skb, len); + + /* Duplicate the Header */ + skb_push(skbn, header_len); + skb_copy_to_linear_data(skbn, header, header_len); + + if (skb->len > 0) { + if (x25->neighbour->extended) + skbn->data[3] |= X25_EXT_M_BIT; + else + skbn->data[2] |= X25_STD_M_BIT; + } + + skb_queue_tail(&sk->sk_write_queue, skbn); + sent += len; + } + + kfree_skb(skb); + } else { + skb_queue_tail(&sk->sk_write_queue, skb); + sent = skb->len - header_len; + } + return sent; +} + +/* + * This procedure is passed a buffer descriptor for an iframe. It builds + * the rest of the control part of the frame and then writes it out. 
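+ * The current V(S) and V(R) are written into the sequence field(s) here,
+ * so frames put back by x25_requeue_frames() get fresh sequence numbers
+ * when they are retransmitted.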
+ */ +static void x25_send_iframe(struct sock *sk, struct sk_buff *skb) +{ + struct x25_sock *x25 = x25_sk(sk); + + if (!skb) + return; + + if (x25->neighbour->extended) { + skb->data[2] = (x25->vs << 1) & 0xFE; + skb->data[3] &= X25_EXT_M_BIT; + skb->data[3] |= (x25->vr << 1) & 0xFE; + } else { + skb->data[2] &= X25_STD_M_BIT; + skb->data[2] |= (x25->vs << 1) & 0x0E; + skb->data[2] |= (x25->vr << 5) & 0xE0; + } + + x25_transmit_link(skb, x25->neighbour); +} + +void x25_kick(struct sock *sk) +{ + struct sk_buff *skb, *skbn; + unsigned short start, end; + int modulus; + struct x25_sock *x25 = x25_sk(sk); + + if (x25->state != X25_STATE_3) + return; + + /* + * Transmit interrupt data. + */ + if (skb_peek(&x25->interrupt_out_queue) != NULL && + !test_and_set_bit(X25_INTERRUPT_FLAG, &x25->flags)) { + + skb = skb_dequeue(&x25->interrupt_out_queue); + x25_transmit_link(skb, x25->neighbour); + } + + if (x25->condition & X25_COND_PEER_RX_BUSY) + return; + + if (!skb_peek(&sk->sk_write_queue)) + return; + + modulus = x25->neighbour->extended ? X25_EMODULUS : X25_SMODULUS; + + start = skb_peek(&x25->ack_queue) ? x25->vs : x25->va; + end = (x25->va + x25->facilities.winsize_out) % modulus; + + if (start == end) + return; + + x25->vs = start; + + /* + * Transmit data until either we're out of data to send or + * the window is full. + */ + + skb = skb_dequeue(&sk->sk_write_queue); + + do { + if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { + skb_queue_head(&sk->sk_write_queue, skb); + break; + } + + skb_set_owner_w(skbn, sk); + + /* + * Transmit the frame copy. + */ + x25_send_iframe(sk, skbn); + + x25->vs = (x25->vs + 1) % modulus; + + /* + * Requeue the original data frame. + */ + skb_queue_tail(&x25->ack_queue, skb); + + } while (x25->vs != end && + (skb = skb_dequeue(&sk->sk_write_queue)) != NULL); + + x25->vl = x25->vr; + x25->condition &= ~X25_COND_ACK_PENDING; + + x25_stop_timer(sk); +} + +/* + * The following routines are taken from page 170 of the 7th ARRL Computer + * Networking Conference paper, as is the whole state machine. + */ + +void x25_enquiry_response(struct sock *sk) +{ + struct x25_sock *x25 = x25_sk(sk); + + if (x25->condition & X25_COND_OWN_RX_BUSY) + x25_write_internal(sk, X25_RNR); + else + x25_write_internal(sk, X25_RR); + + x25->vl = x25->vr; + x25->condition &= ~X25_COND_ACK_PENDING; + + x25_stop_timer(sk); +} diff --git a/net/x25/x25_proc.c b/net/x25/x25_proc.c new file mode 100644 index 00000000..7ff37379 --- /dev/null +++ b/net/x25/x25_proc.c @@ -0,0 +1,266 @@ +/* + * X.25 Packet Layer release 002 + * + * This is ALPHA test software. This code may break your machine, + * randomly fail to work with new releases, misbehave and/or generally + * screw up. It might even work. + * + * This code REQUIRES 2.4 with seq_file support + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * History + * 2002/10/06 Arnaldo Carvalho de Melo seq_file support + */ + +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PROC_FS + +static void *x25_seq_route_start(struct seq_file *seq, loff_t *pos) + __acquires(x25_route_list_lock) +{ + read_lock_bh(&x25_route_list_lock); + return seq_list_start_head(&x25_route_list, *pos); +} + +static void *x25_seq_route_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return seq_list_next(v, &x25_route_list, pos); +} + +static void x25_seq_route_stop(struct seq_file *seq, void *v) + __releases(x25_route_list_lock) +{ + read_unlock_bh(&x25_route_list_lock); +} + +static int x25_seq_route_show(struct seq_file *seq, void *v) +{ + struct x25_route *rt = list_entry(v, struct x25_route, node); + + if (v == &x25_route_list) { + seq_puts(seq, "Address Digits Device\n"); + goto out; + } + + rt = v; + seq_printf(seq, "%-15s %-6d %-5s\n", + rt->address.x25_addr, rt->sigdigits, + rt->dev ? rt->dev->name : "???"); +out: + return 0; +} + +static void *x25_seq_socket_start(struct seq_file *seq, loff_t *pos) + __acquires(x25_list_lock) +{ + read_lock_bh(&x25_list_lock); + return seq_hlist_start_head(&x25_list, *pos); +} + +static void *x25_seq_socket_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return seq_hlist_next(v, &x25_list, pos); +} + +static void x25_seq_socket_stop(struct seq_file *seq, void *v) + __releases(x25_list_lock) +{ + read_unlock_bh(&x25_list_lock); +} + +static int x25_seq_socket_show(struct seq_file *seq, void *v) +{ + struct sock *s; + struct x25_sock *x25; + struct net_device *dev; + const char *devname; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "dest_addr src_addr dev lci st vs vr " + "va t t2 t21 t22 t23 Snd-Q Rcv-Q inode\n"); + goto out; + } + + s = sk_entry(v); + x25 = x25_sk(s); + + if (!x25->neighbour || (dev = x25->neighbour->dev) == NULL) + devname = "???"; + else + devname = x25->neighbour->dev->name; + + seq_printf(seq, "%-10s %-10s %-5s %3.3X %d %d %d %d %3lu %3lu " + "%3lu %3lu %3lu %5d %5d %ld\n", + !x25->dest_addr.x25_addr[0] ? "*" : x25->dest_addr.x25_addr, + !x25->source_addr.x25_addr[0] ? "*" : x25->source_addr.x25_addr, + devname, x25->lci & 0x0FFF, x25->state, x25->vs, x25->vr, + x25->va, x25_display_timer(s) / HZ, x25->t2 / HZ, + x25->t21 / HZ, x25->t22 / HZ, x25->t23 / HZ, + sk_wmem_alloc_get(s), + sk_rmem_alloc_get(s), + s->sk_socket ? 
SOCK_INODE(s->sk_socket)->i_ino : 0L); +out: + return 0; +} + +static void *x25_seq_forward_start(struct seq_file *seq, loff_t *pos) + __acquires(x25_forward_list_lock) +{ + read_lock_bh(&x25_forward_list_lock); + return seq_list_start_head(&x25_forward_list, *pos); +} + +static void *x25_seq_forward_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return seq_list_next(v, &x25_forward_list, pos); +} + +static void x25_seq_forward_stop(struct seq_file *seq, void *v) + __releases(x25_forward_list_lock) +{ + read_unlock_bh(&x25_forward_list_lock); +} + +static int x25_seq_forward_show(struct seq_file *seq, void *v) +{ + struct x25_forward *f = list_entry(v, struct x25_forward, node); + + if (v == &x25_forward_list) { + seq_printf(seq, "lci dev1 dev2\n"); + goto out; + } + + f = v; + + seq_printf(seq, "%d %-10s %-10s\n", + f->lci, f->dev1->name, f->dev2->name); +out: + return 0; +} + +static const struct seq_operations x25_seq_route_ops = { + .start = x25_seq_route_start, + .next = x25_seq_route_next, + .stop = x25_seq_route_stop, + .show = x25_seq_route_show, +}; + +static const struct seq_operations x25_seq_socket_ops = { + .start = x25_seq_socket_start, + .next = x25_seq_socket_next, + .stop = x25_seq_socket_stop, + .show = x25_seq_socket_show, +}; + +static const struct seq_operations x25_seq_forward_ops = { + .start = x25_seq_forward_start, + .next = x25_seq_forward_next, + .stop = x25_seq_forward_stop, + .show = x25_seq_forward_show, +}; + +static int x25_seq_socket_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &x25_seq_socket_ops); +} + +static int x25_seq_route_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &x25_seq_route_ops); +} + +static int x25_seq_forward_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &x25_seq_forward_ops); +} + +static const struct file_operations x25_seq_socket_fops = { + .owner = THIS_MODULE, + .open = x25_seq_socket_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations x25_seq_route_fops = { + .owner = THIS_MODULE, + .open = x25_seq_route_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations x25_seq_forward_fops = { + .owner = THIS_MODULE, + .open = x25_seq_forward_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct proc_dir_entry *x25_proc_dir; + +int __init x25_proc_init(void) +{ + struct proc_dir_entry *p; + int rc = -ENOMEM; + + x25_proc_dir = proc_mkdir("x25", init_net.proc_net); + if (!x25_proc_dir) + goto out; + + p = proc_create("route", S_IRUGO, x25_proc_dir, &x25_seq_route_fops); + if (!p) + goto out_route; + + p = proc_create("socket", S_IRUGO, x25_proc_dir, &x25_seq_socket_fops); + if (!p) + goto out_socket; + + p = proc_create("forward", S_IRUGO, x25_proc_dir, + &x25_seq_forward_fops); + if (!p) + goto out_forward; + rc = 0; + +out: + return rc; +out_forward: + remove_proc_entry("socket", x25_proc_dir); +out_socket: + remove_proc_entry("route", x25_proc_dir); +out_route: + remove_proc_entry("x25", init_net.proc_net); + goto out; +} + +void __exit x25_proc_exit(void) +{ + remove_proc_entry("forward", x25_proc_dir); + remove_proc_entry("route", x25_proc_dir); + remove_proc_entry("socket", x25_proc_dir); + remove_proc_entry("x25", init_net.proc_net); +} + +#else /* CONFIG_PROC_FS */ + +int __init x25_proc_init(void) +{ + return 0; +} + +void __exit x25_proc_exit(void) +{ +} +#endif /* CONFIG_PROC_FS */ 
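The proc interface registered just above exposes three read-only tables under /proc/net/x25 (route, socket and forward), printed by the *_show() handlers in x25_proc.c. The following user-space reader is only a sketch to illustrate how those files are consumed; it is not part of the patch, and it assumes the x25 module is loaded so that the directory exists.

#include <stdio.h>

/* Dump one of the /proc/net/x25 tables created by x25_proc_init(). */
static void dump(const char *path)
{
	char line[512];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);	/* e.g. the x25 module is not loaded */
		return;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
}

int main(void)
{
	dump("/proc/net/x25/route");	/* Address  Digits  Device       */
	dump("/proc/net/x25/socket");	/* one line per X.25 socket      */
	dump("/proc/net/x25/forward");	/* lci  dev1  dev2 forwarding map */
	return 0;
}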
diff --git a/net/x25/x25_route.c b/net/x25/x25_route.c new file mode 100644 index 00000000..97d77c53 --- /dev/null +++ b/net/x25/x25_route.c @@ -0,0 +1,226 @@ +/* + * X.25 Packet Layer release 002 + * + * This is ALPHA test software. This code may break your machine, + * randomly fail to work with new releases, misbehave and/or generally + * screw up. It might even work. + * + * This code REQUIRES 2.1.15 or higher + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * X.25 001 Jonathan Naylor Started coding. + */ + +#include +#include +#include +#include + +LIST_HEAD(x25_route_list); +DEFINE_RWLOCK(x25_route_list_lock); + +/* + * Add a new route. + */ +static int x25_add_route(struct x25_address *address, unsigned int sigdigits, + struct net_device *dev) +{ + struct x25_route *rt; + struct list_head *entry; + int rc = -EINVAL; + + write_lock_bh(&x25_route_list_lock); + + list_for_each(entry, &x25_route_list) { + rt = list_entry(entry, struct x25_route, node); + + if (!memcmp(&rt->address, address, sigdigits) && + rt->sigdigits == sigdigits) + goto out; + } + + rt = kmalloc(sizeof(*rt), GFP_ATOMIC); + rc = -ENOMEM; + if (!rt) + goto out; + + strcpy(rt->address.x25_addr, "000000000000000"); + memcpy(rt->address.x25_addr, address->x25_addr, sigdigits); + + rt->sigdigits = sigdigits; + rt->dev = dev; + atomic_set(&rt->refcnt, 1); + + list_add(&rt->node, &x25_route_list); + rc = 0; +out: + write_unlock_bh(&x25_route_list_lock); + return rc; +} + +/** + * __x25_remove_route - remove route from x25_route_list + * @rt - route to remove + * + * Remove route from x25_route_list. If it was there. + * Caller must hold x25_route_list_lock. + */ +static void __x25_remove_route(struct x25_route *rt) +{ + if (rt->node.next) { + list_del(&rt->node); + x25_route_put(rt); + } +} + +static int x25_del_route(struct x25_address *address, unsigned int sigdigits, + struct net_device *dev) +{ + struct x25_route *rt; + struct list_head *entry; + int rc = -EINVAL; + + write_lock_bh(&x25_route_list_lock); + + list_for_each(entry, &x25_route_list) { + rt = list_entry(entry, struct x25_route, node); + + if (!memcmp(&rt->address, address, sigdigits) && + rt->sigdigits == sigdigits && rt->dev == dev) { + __x25_remove_route(rt); + rc = 0; + break; + } + } + + write_unlock_bh(&x25_route_list_lock); + return rc; +} + +/* + * A device has been removed, remove its routes. + */ +void x25_route_device_down(struct net_device *dev) +{ + struct x25_route *rt; + struct list_head *entry, *tmp; + + write_lock_bh(&x25_route_list_lock); + + list_for_each_safe(entry, tmp, &x25_route_list) { + rt = list_entry(entry, struct x25_route, node); + + if (rt->dev == dev) + __x25_remove_route(rt); + } + write_unlock_bh(&x25_route_list_lock); + + /* Remove any related forwarding */ + x25_clear_forward_by_dev(dev); +} + +/* + * Check that the device given is a valid X.25 interface that is "up". + */ +struct net_device *x25_dev_get(char *devname) +{ + struct net_device *dev = dev_get_by_name(&init_net, devname); + + if (dev && + (!(dev->flags & IFF_UP) || (dev->type != ARPHRD_X25 +#if defined(CONFIG_LLC) || defined(CONFIG_LLC_MODULE) + && dev->type != ARPHRD_ETHER +#endif + ))){ + dev_put(dev); + dev = NULL; + } + + return dev; +} + +/** + * x25_get_route - Find a route given an X.25 address. 
+ * @addr - address to find a route for + * + * Find a route given an X.25 address. + */ +struct x25_route *x25_get_route(struct x25_address *addr) +{ + struct x25_route *rt, *use = NULL; + struct list_head *entry; + + read_lock_bh(&x25_route_list_lock); + + list_for_each(entry, &x25_route_list) { + rt = list_entry(entry, struct x25_route, node); + + if (!memcmp(&rt->address, addr, rt->sigdigits)) { + if (!use) + use = rt; + else if (rt->sigdigits > use->sigdigits) + use = rt; + } + } + + if (use) + x25_route_hold(use); + + read_unlock_bh(&x25_route_list_lock); + return use; +} + +/* + * Handle the ioctls that control the routing functions. + */ +int x25_route_ioctl(unsigned int cmd, void __user *arg) +{ + struct x25_route_struct rt; + struct net_device *dev; + int rc = -EINVAL; + + if (cmd != SIOCADDRT && cmd != SIOCDELRT) + goto out; + + rc = -EFAULT; + if (copy_from_user(&rt, arg, sizeof(rt))) + goto out; + + rc = -EINVAL; + if (rt.sigdigits > 15) + goto out; + + dev = x25_dev_get(rt.device); + if (!dev) + goto out; + + if (cmd == SIOCADDRT) + rc = x25_add_route(&rt.address, rt.sigdigits, dev); + else + rc = x25_del_route(&rt.address, rt.sigdigits, dev); + dev_put(dev); +out: + return rc; +} + +/* + * Release all memory associated with X.25 routing structures. + */ +void __exit x25_route_free(void) +{ + struct x25_route *rt; + struct list_head *entry, *tmp; + + write_lock_bh(&x25_route_list_lock); + list_for_each_safe(entry, tmp, &x25_route_list) { + rt = list_entry(entry, struct x25_route, node); + __x25_remove_route(rt); + } + write_unlock_bh(&x25_route_list_lock); +} diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c new file mode 100644 index 00000000..dc20cf12 --- /dev/null +++ b/net/x25/x25_subr.c @@ -0,0 +1,378 @@ +/* + * X.25 Packet Layer release 002 + * + * This is ALPHA test software. This code may break your machine, + * randomly fail to work with new releases, misbehave and/or generally + * screw up. It might even work. + * + * This code REQUIRES 2.1.15 or higher + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * X.25 001 Jonathan Naylor Started coding. + * X.25 002 Jonathan Naylor Centralised disconnection processing. + * mar/20/00 Daniela Squassoni Disabling/enabling of facilities + * negotiation. + * jun/24/01 Arnaldo C. Melo use skb_queue_purge, cleanups + * apr/04/15 Shaun Pereira Fast select with no + * restriction on response. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * This routine purges all of the queues of frames. + */ +void x25_clear_queues(struct sock *sk) +{ + struct x25_sock *x25 = x25_sk(sk); + + skb_queue_purge(&sk->sk_write_queue); + skb_queue_purge(&x25->ack_queue); + skb_queue_purge(&x25->interrupt_in_queue); + skb_queue_purge(&x25->interrupt_out_queue); + skb_queue_purge(&x25->fragment_queue); +} + + +/* + * This routine purges the input queue of those frames that have been + * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the + * SDL diagram. +*/ +void x25_frames_acked(struct sock *sk, unsigned short nr) +{ + struct sk_buff *skb; + struct x25_sock *x25 = x25_sk(sk); + int modulus = x25->neighbour->extended ? X25_EMODULUS : X25_SMODULUS; + + /* + * Remove all the ack-ed frames from the ack queue. 
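+ * V(A) is advanced (modulo 8, or 128 in extended mode) for each freed frame until it reaches N(R).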
+ */ + if (x25->va != nr) + while (skb_peek(&x25->ack_queue) && x25->va != nr) { + skb = skb_dequeue(&x25->ack_queue); + kfree_skb(skb); + x25->va = (x25->va + 1) % modulus; + } +} + +void x25_requeue_frames(struct sock *sk) +{ + struct sk_buff *skb, *skb_prev = NULL; + + /* + * Requeue all the un-ack-ed frames on the output queue to be picked + * up by x25_kick. This arrangement handles the possibility of an empty + * output queue. + */ + while ((skb = skb_dequeue(&x25_sk(sk)->ack_queue)) != NULL) { + if (!skb_prev) + skb_queue_head(&sk->sk_write_queue, skb); + else + skb_append(skb_prev, skb, &sk->sk_write_queue); + skb_prev = skb; + } +} + +/* + * Validate that the value of nr is between va and vs. Return true or + * false for testing. + */ +int x25_validate_nr(struct sock *sk, unsigned short nr) +{ + struct x25_sock *x25 = x25_sk(sk); + unsigned short vc = x25->va; + int modulus = x25->neighbour->extended ? X25_EMODULUS : X25_SMODULUS; + + while (vc != x25->vs) { + if (nr == vc) + return 1; + vc = (vc + 1) % modulus; + } + + return nr == x25->vs ? 1 : 0; +} + +/* + * This routine is called when the packet layer internally generates a + * control frame. + */ +void x25_write_internal(struct sock *sk, int frametype) +{ + struct x25_sock *x25 = x25_sk(sk); + struct sk_buff *skb; + unsigned char *dptr; + unsigned char facilities[X25_MAX_FAC_LEN]; + unsigned char addresses[1 + X25_ADDR_LEN]; + unsigned char lci1, lci2; + /* + * Default safe frame size. + */ + int len = X25_MAX_L2_LEN + X25_EXT_MIN_LEN; + + /* + * Adjust frame size. + */ + switch (frametype) { + case X25_CALL_REQUEST: + len += 1 + X25_ADDR_LEN + X25_MAX_FAC_LEN + + X25_MAX_CUD_LEN; + break; + case X25_CALL_ACCEPTED: /* fast sel with no restr on resp */ + if(x25->facilities.reverse & 0x80) { + len += 1 + X25_MAX_FAC_LEN + X25_MAX_CUD_LEN; + } else { + len += 1 + X25_MAX_FAC_LEN; + } + break; + case X25_CLEAR_REQUEST: + case X25_RESET_REQUEST: + len += 2; + break; + case X25_RR: + case X25_RNR: + case X25_REJ: + case X25_CLEAR_CONFIRMATION: + case X25_INTERRUPT_CONFIRMATION: + case X25_RESET_CONFIRMATION: + break; + default: + printk(KERN_ERR "X.25: invalid frame type %02X\n", + frametype); + return; + } + + if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) + return; + + /* + * Space for Ethernet and 802.2 LLC headers. + */ + skb_reserve(skb, X25_MAX_L2_LEN); + + /* + * Make space for the GFI and LCI, and fill them in. + */ + dptr = skb_put(skb, 2); + + lci1 = (x25->lci >> 8) & 0x0F; + lci2 = (x25->lci >> 0) & 0xFF; + + if (x25->neighbour->extended) { + *dptr++ = lci1 | X25_GFI_EXTSEQ; + *dptr++ = lci2; + } else { + *dptr++ = lci1 | X25_GFI_STDSEQ; + *dptr++ = lci2; + } + + /* + * Now fill in the frame type specific information. 
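+ * (The GFI and LCI octets written above are common to every frame type.)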
+ */ + switch (frametype) { + + case X25_CALL_REQUEST: + dptr = skb_put(skb, 1); + *dptr++ = X25_CALL_REQUEST; + len = x25_addr_aton(addresses, &x25->dest_addr, + &x25->source_addr); + dptr = skb_put(skb, len); + memcpy(dptr, addresses, len); + len = x25_create_facilities(facilities, + &x25->facilities, + &x25->dte_facilities, + x25->neighbour->global_facil_mask); + dptr = skb_put(skb, len); + memcpy(dptr, facilities, len); + dptr = skb_put(skb, x25->calluserdata.cudlength); + memcpy(dptr, x25->calluserdata.cuddata, + x25->calluserdata.cudlength); + x25->calluserdata.cudlength = 0; + break; + + case X25_CALL_ACCEPTED: + dptr = skb_put(skb, 2); + *dptr++ = X25_CALL_ACCEPTED; + *dptr++ = 0x00; /* Address lengths */ + len = x25_create_facilities(facilities, + &x25->facilities, + &x25->dte_facilities, + x25->vc_facil_mask); + dptr = skb_put(skb, len); + memcpy(dptr, facilities, len); + + /* fast select with no restriction on response + allows call user data. Userland must + ensure it is ours and not theirs */ + if(x25->facilities.reverse & 0x80) { + dptr = skb_put(skb, + x25->calluserdata.cudlength); + memcpy(dptr, x25->calluserdata.cuddata, + x25->calluserdata.cudlength); + } + x25->calluserdata.cudlength = 0; + break; + + case X25_CLEAR_REQUEST: + dptr = skb_put(skb, 3); + *dptr++ = frametype; + *dptr++ = x25->causediag.cause; + *dptr++ = x25->causediag.diagnostic; + break; + + case X25_RESET_REQUEST: + dptr = skb_put(skb, 3); + *dptr++ = frametype; + *dptr++ = 0x00; /* XXX */ + *dptr++ = 0x00; /* XXX */ + break; + + case X25_RR: + case X25_RNR: + case X25_REJ: + if (x25->neighbour->extended) { + dptr = skb_put(skb, 2); + *dptr++ = frametype; + *dptr++ = (x25->vr << 1) & 0xFE; + } else { + dptr = skb_put(skb, 1); + *dptr = frametype; + *dptr++ |= (x25->vr << 5) & 0xE0; + } + break; + + case X25_CLEAR_CONFIRMATION: + case X25_INTERRUPT_CONFIRMATION: + case X25_RESET_CONFIRMATION: + dptr = skb_put(skb, 1); + *dptr = frametype; + break; + } + + x25_transmit_link(skb, x25->neighbour); +} + +/* + * Unpick the contents of the passed X.25 Packet Layer frame. 
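+ * Returns the frame type; N(R) is filled in for RR/RNR/REJ frames, N(S), N(R) and the Q/D/M bits for data packets, and X25_ILLEGAL is returned for unparsable frames.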
+ */ +int x25_decode(struct sock *sk, struct sk_buff *skb, int *ns, int *nr, int *q, + int *d, int *m) +{ + struct x25_sock *x25 = x25_sk(sk); + unsigned char *frame = skb->data; + + *ns = *nr = *q = *d = *m = 0; + + switch (frame[2]) { + case X25_CALL_REQUEST: + case X25_CALL_ACCEPTED: + case X25_CLEAR_REQUEST: + case X25_CLEAR_CONFIRMATION: + case X25_INTERRUPT: + case X25_INTERRUPT_CONFIRMATION: + case X25_RESET_REQUEST: + case X25_RESET_CONFIRMATION: + case X25_RESTART_REQUEST: + case X25_RESTART_CONFIRMATION: + case X25_REGISTRATION_REQUEST: + case X25_REGISTRATION_CONFIRMATION: + case X25_DIAGNOSTIC: + return frame[2]; + } + + if (x25->neighbour->extended) { + if (frame[2] == X25_RR || + frame[2] == X25_RNR || + frame[2] == X25_REJ) { + *nr = (frame[3] >> 1) & 0x7F; + return frame[2]; + } + } else { + if ((frame[2] & 0x1F) == X25_RR || + (frame[2] & 0x1F) == X25_RNR || + (frame[2] & 0x1F) == X25_REJ) { + *nr = (frame[2] >> 5) & 0x07; + return frame[2] & 0x1F; + } + } + + if (x25->neighbour->extended) { + if ((frame[2] & 0x01) == X25_DATA) { + *q = (frame[0] & X25_Q_BIT) == X25_Q_BIT; + *d = (frame[0] & X25_D_BIT) == X25_D_BIT; + *m = (frame[3] & X25_EXT_M_BIT) == X25_EXT_M_BIT; + *nr = (frame[3] >> 1) & 0x7F; + *ns = (frame[2] >> 1) & 0x7F; + return X25_DATA; + } + } else { + if ((frame[2] & 0x01) == X25_DATA) { + *q = (frame[0] & X25_Q_BIT) == X25_Q_BIT; + *d = (frame[0] & X25_D_BIT) == X25_D_BIT; + *m = (frame[2] & X25_STD_M_BIT) == X25_STD_M_BIT; + *nr = (frame[2] >> 5) & 0x07; + *ns = (frame[2] >> 1) & 0x07; + return X25_DATA; + } + } + + printk(KERN_DEBUG "X.25: invalid PLP frame %02X %02X %02X\n", + frame[0], frame[1], frame[2]); + + return X25_ILLEGAL; +} + +void x25_disconnect(struct sock *sk, int reason, unsigned char cause, + unsigned char diagnostic) +{ + struct x25_sock *x25 = x25_sk(sk); + + x25_clear_queues(sk); + x25_stop_timer(sk); + + x25->lci = 0; + x25->state = X25_STATE_0; + + x25->causediag.cause = cause; + x25->causediag.diagnostic = diagnostic; + + sk->sk_state = TCP_CLOSE; + sk->sk_err = reason; + sk->sk_shutdown |= SEND_SHUTDOWN; + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sock_set_flag(sk, SOCK_DEAD); + } +} + +/* + * Clear an own-rx-busy condition and tell the peer about this, provided + * that there is a significant amount of free receive buffer space available. + */ +void x25_check_rbuf(struct sock *sk) +{ + struct x25_sock *x25 = x25_sk(sk); + + if (atomic_read(&sk->sk_rmem_alloc) < (sk->sk_rcvbuf >> 1) && + (x25->condition & X25_COND_OWN_RX_BUSY)) { + x25->condition &= ~X25_COND_OWN_RX_BUSY; + x25->condition &= ~X25_COND_ACK_PENDING; + x25->vl = x25->vr; + x25_write_internal(sk, X25_RR); + x25_stop_timer(sk); + } +} + diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c new file mode 100644 index 00000000..5c5db1a3 --- /dev/null +++ b/net/x25/x25_timer.c @@ -0,0 +1,174 @@ +/* + * X.25 Packet Layer release 002 + * + * This is ALPHA test software. This code may break your machine, + * randomly fail to work with new releases, misbehave and/or generally + * screw up. It might even work. + * + * This code REQUIRES 2.1.15 or higher + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * X.25 001 Jonathan Naylor Started coding. + * X.25 002 Jonathan Naylor New timer architecture. 
+ * Centralised disconnection processing. + */ + +#include +#include +#include +#include +#include +#include + +static void x25_heartbeat_expiry(unsigned long); +static void x25_timer_expiry(unsigned long); + +void x25_init_timers(struct sock *sk) +{ + struct x25_sock *x25 = x25_sk(sk); + + setup_timer(&x25->timer, x25_timer_expiry, (unsigned long)sk); + + /* initialized by sock_init_data */ + sk->sk_timer.data = (unsigned long)sk; + sk->sk_timer.function = &x25_heartbeat_expiry; +} + +void x25_start_heartbeat(struct sock *sk) +{ + mod_timer(&sk->sk_timer, jiffies + 5 * HZ); +} + +void x25_stop_heartbeat(struct sock *sk) +{ + del_timer(&sk->sk_timer); +} + +void x25_start_t2timer(struct sock *sk) +{ + struct x25_sock *x25 = x25_sk(sk); + + mod_timer(&x25->timer, jiffies + x25->t2); +} + +void x25_start_t21timer(struct sock *sk) +{ + struct x25_sock *x25 = x25_sk(sk); + + mod_timer(&x25->timer, jiffies + x25->t21); +} + +void x25_start_t22timer(struct sock *sk) +{ + struct x25_sock *x25 = x25_sk(sk); + + mod_timer(&x25->timer, jiffies + x25->t22); +} + +void x25_start_t23timer(struct sock *sk) +{ + struct x25_sock *x25 = x25_sk(sk); + + mod_timer(&x25->timer, jiffies + x25->t23); +} + +void x25_stop_timer(struct sock *sk) +{ + del_timer(&x25_sk(sk)->timer); +} + +unsigned long x25_display_timer(struct sock *sk) +{ + struct x25_sock *x25 = x25_sk(sk); + + if (!timer_pending(&x25->timer)) + return 0; + + return x25->timer.expires - jiffies; +} + +static void x25_heartbeat_expiry(unsigned long param) +{ + struct sock *sk = (struct sock *)param; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) /* can currently only occur in state 3 */ + goto restart_heartbeat; + + switch (x25_sk(sk)->state) { + + case X25_STATE_0: + /* + * Magic here: If we listen() and a new link dies + * before it is accepted() it isn't 'dead' so doesn't + * get removed. + */ + if (sock_flag(sk, SOCK_DESTROY) || + (sk->sk_state == TCP_LISTEN && + sock_flag(sk, SOCK_DEAD))) { + bh_unlock_sock(sk); + x25_destroy_socket_from_timer(sk); + return; + } + break; + + case X25_STATE_3: + /* + * Check for the state of the receive buffer. + */ + x25_check_rbuf(sk); + break; + } +restart_heartbeat: + x25_start_heartbeat(sk); + bh_unlock_sock(sk); +} + +/* + * Timer has expired, it may have been T2, T21, T22, or T23. We can tell + * by the state machine state. 
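+ * (T21 runs in state 1, T23 in state 2, T2 in state 3 and T22 in state 4.)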
+ */ +static inline void x25_do_timer_expiry(struct sock * sk) +{ + struct x25_sock *x25 = x25_sk(sk); + + switch (x25->state) { + + case X25_STATE_3: /* T2 */ + if (x25->condition & X25_COND_ACK_PENDING) { + x25->condition &= ~X25_COND_ACK_PENDING; + x25_enquiry_response(sk); + } + break; + + case X25_STATE_1: /* T21 */ + case X25_STATE_4: /* T22 */ + x25_write_internal(sk, X25_CLEAR_REQUEST); + x25->state = X25_STATE_2; + x25_start_t23timer(sk); + break; + + case X25_STATE_2: /* T23 */ + x25_disconnect(sk, ETIMEDOUT, 0, 0); + break; + } +} + +static void x25_timer_expiry(unsigned long param) +{ + struct sock *sk = (struct sock *)param; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { /* can currently only occur in state 3 */ + if (x25_sk(sk)->state == X25_STATE_3) + x25_start_t2timer(sk); + } else + x25_do_timer_expiry(sk); + bh_unlock_sock(sk); +} diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig new file mode 100644 index 00000000..6d081674 --- /dev/null +++ b/net/xfrm/Kconfig @@ -0,0 +1,80 @@ +# +# XFRM configuration +# +config XFRM + bool + select CRYPTO + depends on NET + +config XFRM_USER + tristate "Transformation user configuration interface" + depends on INET && XFRM + ---help--- + Support for Transformation(XFRM) user configuration interface + like IPsec used by native Linux tools. + + If unsure, say Y. + +config XFRM_SUB_POLICY + bool "Transformation sub policy support (EXPERIMENTAL)" + depends on XFRM && EXPERIMENTAL + ---help--- + Support sub policy for developers. By using sub policy with main + one, two policies can be applied to the same packet at once. + Policy which lives shorter time in kernel should be a sub. + + If unsure, say N. + +config XFRM_MIGRATE + bool "Transformation migrate database (EXPERIMENTAL)" + depends on XFRM && EXPERIMENTAL + ---help--- + A feature to update locator(s) of a given IPsec security + association dynamically. This feature is required, for + instance, in a Mobile IPv6 environment with IPsec configuration + where mobile nodes change their attachment point to the Internet. + + If unsure, say N. + +config XFRM_STATISTICS + bool "Transformation statistics (EXPERIMENTAL)" + depends on INET && XFRM && PROC_FS && EXPERIMENTAL + ---help--- + This statistics is not a SNMP/MIB specification but shows + statistics about transformation error (or almost error) factor + at packet processing for developer. + + If unsure, say N. + +config XFRM_IPCOMP + tristate + select XFRM + select CRYPTO + select CRYPTO_DEFLATE + +config NET_KEY + tristate "PF_KEY sockets" + select XFRM + ---help--- + PF_KEYv2 socket family, compatible to KAME ones. + They are required if you are going to use IPsec tools ported + from KAME. + + Say Y unless you know what you are doing. + +config NET_KEY_MIGRATE + bool "PF_KEY MIGRATE (EXPERIMENTAL)" + depends on NET_KEY && EXPERIMENTAL + select XFRM_MIGRATE + ---help--- + Add a PF_KEY MIGRATE message to PF_KEYv2 socket family. + The PF_KEY MIGRATE message is used to dynamically update + locator(s) of a given IPsec security association. + This feature is required, for instance, in a Mobile IPv6 + environment with IPsec configuration where mobile nodes + change their attachment point to the Internet. Detail + information can be found in the internet-draft + . + + If unsure, say N. + diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile new file mode 100644 index 00000000..aa429eef --- /dev/null +++ b/net/xfrm/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for the XFRM subsystem. 
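+# Each object below is built only when the corresponding Kconfig option above is enabled.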
+# + +obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \ + xfrm_input.o xfrm_output.o xfrm_algo.o \ + xfrm_sysctl.o xfrm_replay.o +obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o +obj-$(CONFIG_XFRM_USER) += xfrm_user.o +obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c new file mode 100644 index 00000000..791ab2e7 --- /dev/null +++ b/net/xfrm/xfrm_algo.c @@ -0,0 +1,754 @@ +/* + * xfrm algorithm interface + * + * Copyright (c) 2002 James Morris + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include +#include +#include +#include +#include +#include +#if defined(CONFIG_INET_AH) || defined(CONFIG_INET_AH_MODULE) || defined(CONFIG_INET6_AH) || defined(CONFIG_INET6_AH_MODULE) +#include +#endif +#if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE) +#include +#endif + +/* + * Algorithms supported by IPsec. These entries contain properties which + * are used in key negotiation and xfrm processing, and are used to verify + * that instantiated crypto transforms have correct parameters for IPsec + * purposes. + */ +static struct xfrm_algo_desc aead_list[] = { +{ + .name = "rfc4106(gcm(aes))", + + .uinfo = { + .aead = { + .icv_truncbits = 64, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_AES_GCM_ICV8, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 256 + } +}, +{ + .name = "rfc4106(gcm(aes))", + + .uinfo = { + .aead = { + .icv_truncbits = 96, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_AES_GCM_ICV12, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 256 + } +}, +{ + .name = "rfc4106(gcm(aes))", + + .uinfo = { + .aead = { + .icv_truncbits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_AES_GCM_ICV16, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 256 + } +}, +{ + .name = "rfc4309(ccm(aes))", + + .uinfo = { + .aead = { + .icv_truncbits = 64, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_AES_CCM_ICV8, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 256 + } +}, +{ + .name = "rfc4309(ccm(aes))", + + .uinfo = { + .aead = { + .icv_truncbits = 96, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_AES_CCM_ICV12, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 256 + } +}, +{ + .name = "rfc4309(ccm(aes))", + + .uinfo = { + .aead = { + .icv_truncbits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_AES_CCM_ICV16, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 256 + } +}, +{ + .name = "rfc4543(gcm(aes))", + + .uinfo = { + .aead = { + .icv_truncbits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_NULL_AES_GMAC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 256 + } +}, +}; + +static struct xfrm_algo_desc aalg_list[] = { +{ + .name = "digest_null", + + .uinfo = { + .auth = { + .icv_truncbits = 0, + .icv_fullbits = 0, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_AALG_NULL, + .sadb_alg_ivlen = 0, + .sadb_alg_minbits = 0, + .sadb_alg_maxbits = 0 + } +}, +{ + .name = "hmac(md5)", + .compat = "md5", + + .uinfo = { + .auth = { + .icv_truncbits = 96, + .icv_fullbits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_AALG_MD5HMAC, + .sadb_alg_ivlen = 0, 
+ .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 128 + } +}, +{ + .name = "hmac(sha1)", + .compat = "sha1", + + .uinfo = { + .auth = { + .icv_truncbits = 96, + .icv_fullbits = 160, + } + }, + + .desc = { + .sadb_alg_id = SADB_AALG_SHA1HMAC, + .sadb_alg_ivlen = 0, + .sadb_alg_minbits = 160, + .sadb_alg_maxbits = 160 + } +}, +{ + .name = "hmac(sha256)", + .compat = "sha256", + + .uinfo = { + .auth = { + .icv_truncbits = 96, + .icv_fullbits = 256, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_AALG_SHA2_256HMAC, + .sadb_alg_ivlen = 0, + .sadb_alg_minbits = 256, + .sadb_alg_maxbits = 256 + } +}, +{ + .name = "hmac(sha384)", + + .uinfo = { + .auth = { + .icv_truncbits = 192, + .icv_fullbits = 384, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_AALG_SHA2_384HMAC, + .sadb_alg_ivlen = 0, + .sadb_alg_minbits = 384, + .sadb_alg_maxbits = 384 + } +}, +{ + .name = "hmac(sha512)", + + .uinfo = { + .auth = { + .icv_truncbits = 256, + .icv_fullbits = 512, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_AALG_SHA2_512HMAC, + .sadb_alg_ivlen = 0, + .sadb_alg_minbits = 512, + .sadb_alg_maxbits = 512 + } +}, +{ + .name = "hmac(rmd160)", + .compat = "rmd160", + + .uinfo = { + .auth = { + .icv_truncbits = 96, + .icv_fullbits = 160, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_AALG_RIPEMD160HMAC, + .sadb_alg_ivlen = 0, + .sadb_alg_minbits = 160, + .sadb_alg_maxbits = 160 + } +}, +{ + .name = "xcbc(aes)", + + .uinfo = { + .auth = { + .icv_truncbits = 96, + .icv_fullbits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_AALG_AES_XCBC_MAC, + .sadb_alg_ivlen = 0, + .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 128 + } +}, +}; + +static struct xfrm_algo_desc ealg_list[] = { +{ + .name = "ecb(cipher_null)", + .compat = "cipher_null", + + .uinfo = { + .encr = { + .blockbits = 8, + .defkeybits = 0, + } + }, + + .desc = { + .sadb_alg_id = SADB_EALG_NULL, + .sadb_alg_ivlen = 0, + .sadb_alg_minbits = 0, + .sadb_alg_maxbits = 0 + } +}, +{ + .name = "cbc(des)", + .compat = "des", + + .uinfo = { + .encr = { + .blockbits = 64, + .defkeybits = 64, + } + }, + + .desc = { + .sadb_alg_id = SADB_EALG_DESCBC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 64, + .sadb_alg_maxbits = 64 + } +}, +{ + .name = "cbc(des3_ede)", + .compat = "des3_ede", + + .uinfo = { + .encr = { + .blockbits = 64, + .defkeybits = 192, + } + }, + + .desc = { + .sadb_alg_id = SADB_EALG_3DESCBC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 192, + .sadb_alg_maxbits = 192 + } +}, +{ + .name = "cbc(cast5)", + .compat = "cast5", + + .uinfo = { + .encr = { + .blockbits = 64, + .defkeybits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_CASTCBC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 40, + .sadb_alg_maxbits = 128 + } +}, +{ + .name = "cbc(blowfish)", + .compat = "blowfish", + + .uinfo = { + .encr = { + .blockbits = 64, + .defkeybits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_BLOWFISHCBC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 40, + .sadb_alg_maxbits = 448 + } +}, +{ + .name = "cbc(aes)", + .compat = "aes", + + .uinfo = { + .encr = { + .blockbits = 128, + .defkeybits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_AESCBC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 256 + } +}, +{ + .name = "cbc(serpent)", + .compat = "serpent", + + .uinfo = { + .encr = { + .blockbits = 128, + .defkeybits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_SERPENTCBC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 256, + } +}, +{ + .name = "cbc(camellia)", + .compat = 
"camellia", + + .uinfo = { + .encr = { + .blockbits = 128, + .defkeybits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_CAMELLIACBC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 256 + } +}, +{ + .name = "cbc(twofish)", + .compat = "twofish", + + .uinfo = { + .encr = { + .blockbits = 128, + .defkeybits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_TWOFISHCBC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 256 + } +}, +{ + .name = "rfc3686(ctr(aes))", + + .uinfo = { + .encr = { + .blockbits = 128, + .defkeybits = 160, /* 128-bit key + 32-bit nonce */ + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_AESCTR, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 160, + .sadb_alg_maxbits = 288 + } +}, +}; + +static struct xfrm_algo_desc calg_list[] = { +{ + .name = "deflate", + .uinfo = { + .comp = { + .threshold = 90, + } + }, + .desc = { .sadb_alg_id = SADB_X_CALG_DEFLATE } +}, +{ + .name = "lzs", + .uinfo = { + .comp = { + .threshold = 90, + } + }, + .desc = { .sadb_alg_id = SADB_X_CALG_LZS } +}, +{ + .name = "lzjh", + .uinfo = { + .comp = { + .threshold = 50, + } + }, + .desc = { .sadb_alg_id = SADB_X_CALG_LZJH } +}, +}; + +static inline int aead_entries(void) +{ + return ARRAY_SIZE(aead_list); +} + +static inline int aalg_entries(void) +{ + return ARRAY_SIZE(aalg_list); +} + +static inline int ealg_entries(void) +{ + return ARRAY_SIZE(ealg_list); +} + +static inline int calg_entries(void) +{ + return ARRAY_SIZE(calg_list); +} + +struct xfrm_algo_list { + struct xfrm_algo_desc *algs; + int entries; + u32 type; + u32 mask; +}; + +static const struct xfrm_algo_list xfrm_aead_list = { + .algs = aead_list, + .entries = ARRAY_SIZE(aead_list), + .type = CRYPTO_ALG_TYPE_AEAD, + .mask = CRYPTO_ALG_TYPE_MASK, +}; + +static const struct xfrm_algo_list xfrm_aalg_list = { + .algs = aalg_list, + .entries = ARRAY_SIZE(aalg_list), + .type = CRYPTO_ALG_TYPE_HASH, + .mask = CRYPTO_ALG_TYPE_HASH_MASK, +}; + +static const struct xfrm_algo_list xfrm_ealg_list = { + .algs = ealg_list, + .entries = ARRAY_SIZE(ealg_list), + .type = CRYPTO_ALG_TYPE_BLKCIPHER, + .mask = CRYPTO_ALG_TYPE_BLKCIPHER_MASK, +}; + +static const struct xfrm_algo_list xfrm_calg_list = { + .algs = calg_list, + .entries = ARRAY_SIZE(calg_list), + .type = CRYPTO_ALG_TYPE_COMPRESS, + .mask = CRYPTO_ALG_TYPE_MASK, +}; + +static struct xfrm_algo_desc *xfrm_find_algo( + const struct xfrm_algo_list *algo_list, + int match(const struct xfrm_algo_desc *entry, const void *data), + const void *data, int probe) +{ + struct xfrm_algo_desc *list = algo_list->algs; + int i, status; + + for (i = 0; i < algo_list->entries; i++) { + if (!match(list + i, data)) + continue; + + if (list[i].available) + return &list[i]; + + if (!probe) + break; + + status = crypto_has_alg(list[i].name, algo_list->type, + algo_list->mask); + if (!status) + break; + + list[i].available = status; + return &list[i]; + } + return NULL; +} + +static int xfrm_alg_id_match(const struct xfrm_algo_desc *entry, + const void *data) +{ + return entry->desc.sadb_alg_id == (unsigned long)data; +} + +struct xfrm_algo_desc *xfrm_aalg_get_byid(int alg_id) +{ + return xfrm_find_algo(&xfrm_aalg_list, xfrm_alg_id_match, + (void *)(unsigned long)alg_id, 1); +} +EXPORT_SYMBOL_GPL(xfrm_aalg_get_byid); + +struct xfrm_algo_desc *xfrm_ealg_get_byid(int alg_id) +{ + return xfrm_find_algo(&xfrm_ealg_list, xfrm_alg_id_match, + (void *)(unsigned long)alg_id, 1); +} +EXPORT_SYMBOL_GPL(xfrm_ealg_get_byid); + +struct xfrm_algo_desc 
*xfrm_calg_get_byid(int alg_id) +{ + return xfrm_find_algo(&xfrm_calg_list, xfrm_alg_id_match, + (void *)(unsigned long)alg_id, 1); +} +EXPORT_SYMBOL_GPL(xfrm_calg_get_byid); + +static int xfrm_alg_name_match(const struct xfrm_algo_desc *entry, + const void *data) +{ + const char *name = data; + + return name && (!strcmp(name, entry->name) || + (entry->compat && !strcmp(name, entry->compat))); +} + +struct xfrm_algo_desc *xfrm_aalg_get_byname(const char *name, int probe) +{ + return xfrm_find_algo(&xfrm_aalg_list, xfrm_alg_name_match, name, + probe); +} +EXPORT_SYMBOL_GPL(xfrm_aalg_get_byname); + +struct xfrm_algo_desc *xfrm_ealg_get_byname(const char *name, int probe) +{ + return xfrm_find_algo(&xfrm_ealg_list, xfrm_alg_name_match, name, + probe); +} +EXPORT_SYMBOL_GPL(xfrm_ealg_get_byname); + +struct xfrm_algo_desc *xfrm_calg_get_byname(const char *name, int probe) +{ + return xfrm_find_algo(&xfrm_calg_list, xfrm_alg_name_match, name, + probe); +} +EXPORT_SYMBOL_GPL(xfrm_calg_get_byname); + +struct xfrm_aead_name { + const char *name; + int icvbits; +}; + +static int xfrm_aead_name_match(const struct xfrm_algo_desc *entry, + const void *data) +{ + const struct xfrm_aead_name *aead = data; + const char *name = aead->name; + + return aead->icvbits == entry->uinfo.aead.icv_truncbits && name && + !strcmp(name, entry->name); +} + +struct xfrm_algo_desc *xfrm_aead_get_byname(const char *name, int icv_len, int probe) +{ + struct xfrm_aead_name data = { + .name = name, + .icvbits = icv_len, + }; + + return xfrm_find_algo(&xfrm_aead_list, xfrm_aead_name_match, &data, + probe); +} +EXPORT_SYMBOL_GPL(xfrm_aead_get_byname); + +struct xfrm_algo_desc *xfrm_aalg_get_byidx(unsigned int idx) +{ + if (idx >= aalg_entries()) + return NULL; + + return &aalg_list[idx]; +} +EXPORT_SYMBOL_GPL(xfrm_aalg_get_byidx); + +struct xfrm_algo_desc *xfrm_ealg_get_byidx(unsigned int idx) +{ + if (idx >= ealg_entries()) + return NULL; + + return &ealg_list[idx]; +} +EXPORT_SYMBOL_GPL(xfrm_ealg_get_byidx); + +/* + * Probe for the availability of crypto algorithms, and set the available + * flag for any algorithms found on the system. This is typically called by + * pfkey during userspace SA add, update or register. 
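+ * Must not be called from softirq context (enforced by the BUG_ON() below).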
+ */ +void xfrm_probe_algs(void) +{ + int i, status; + + BUG_ON(in_softirq()); + + for (i = 0; i < aalg_entries(); i++) { + status = crypto_has_hash(aalg_list[i].name, 0, + CRYPTO_ALG_ASYNC); + if (aalg_list[i].available != status) + aalg_list[i].available = status; + } + + for (i = 0; i < ealg_entries(); i++) { + status = crypto_has_blkcipher(ealg_list[i].name, 0, + CRYPTO_ALG_ASYNC); + if (ealg_list[i].available != status) + ealg_list[i].available = status; + } + + for (i = 0; i < calg_entries(); i++) { + status = crypto_has_comp(calg_list[i].name, 0, + CRYPTO_ALG_ASYNC); + if (calg_list[i].available != status) + calg_list[i].available = status; + } +} +EXPORT_SYMBOL_GPL(xfrm_probe_algs); + +int xfrm_count_auth_supported(void) +{ + int i, n; + + for (i = 0, n = 0; i < aalg_entries(); i++) + if (aalg_list[i].available) + n++; + return n; +} +EXPORT_SYMBOL_GPL(xfrm_count_auth_supported); + +int xfrm_count_enc_supported(void) +{ + int i, n; + + for (i = 0, n = 0; i < ealg_entries(); i++) + if (ealg_list[i].available) + n++; + return n; +} +EXPORT_SYMBOL_GPL(xfrm_count_enc_supported); + +#if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE) + +void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) +{ + if (tail != skb) { + skb->data_len += len; + skb->len += len; + } + return skb_put(tail, len); +} +EXPORT_SYMBOL_GPL(pskb_put); +#endif diff --git a/net/xfrm/xfrm_hash.c b/net/xfrm/xfrm_hash.c new file mode 100644 index 00000000..1e98bc0f --- /dev/null +++ b/net/xfrm/xfrm_hash.c @@ -0,0 +1,39 @@ +/* xfrm_hash.c: Common hash table code. + * + * Copyright (C) 2006 David S. Miller (davem@davemloft.net) + */ + +#include +#include +#include +#include +#include +#include + +#include "xfrm_hash.h" + +struct hlist_head *xfrm_hash_alloc(unsigned int sz) +{ + struct hlist_head *n; + + if (sz <= PAGE_SIZE) + n = kzalloc(sz, GFP_KERNEL); + else if (hashdist) + n = vzalloc(sz); + else + n = (struct hlist_head *) + __get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, + get_order(sz)); + + return n; +} + +void xfrm_hash_free(struct hlist_head *n, unsigned int sz) +{ + if (sz <= PAGE_SIZE) + kfree(n); + else if (hashdist) + vfree(n); + else + free_pages((unsigned long)n, get_order(sz)); +} diff --git a/net/xfrm/xfrm_hash.h b/net/xfrm/xfrm_hash.h new file mode 100644 index 00000000..7199d78b --- /dev/null +++ b/net/xfrm/xfrm_hash.h @@ -0,0 +1,136 @@ +#ifndef _XFRM_HASH_H +#define _XFRM_HASH_H + +#include +#include + +static inline unsigned int __xfrm4_addr_hash(const xfrm_address_t *addr) +{ + return ntohl(addr->a4); +} + +static inline unsigned int __xfrm6_addr_hash(const xfrm_address_t *addr) +{ + return ntohl(addr->a6[2] ^ addr->a6[3]); +} + +static inline unsigned int __xfrm4_daddr_saddr_hash(const xfrm_address_t *daddr, + const xfrm_address_t *saddr) +{ + u32 sum = (__force u32)daddr->a4 + (__force u32)saddr->a4; + return ntohl((__force __be32)sum); +} + +static inline unsigned int __xfrm6_daddr_saddr_hash(const xfrm_address_t *daddr, + const xfrm_address_t *saddr) +{ + return ntohl(daddr->a6[2] ^ daddr->a6[3] ^ + saddr->a6[2] ^ saddr->a6[3]); +} + +static inline unsigned int __xfrm_dst_hash(const xfrm_address_t *daddr, + const xfrm_address_t *saddr, + u32 reqid, unsigned short family, + unsigned int hmask) +{ + unsigned int h = family ^ reqid; + switch (family) { + case AF_INET: + h ^= __xfrm4_daddr_saddr_hash(daddr, saddr); + break; + case AF_INET6: + h ^= __xfrm6_daddr_saddr_hash(daddr, saddr); + break; + } + 
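+	/* Fold the upper 16 bits into the lower 16 and mask down to the hash table size. */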
return (h ^ (h >> 16)) & hmask; +} + +static inline unsigned __xfrm_src_hash(const xfrm_address_t *daddr, + const xfrm_address_t *saddr, + unsigned short family, + unsigned int hmask) +{ + unsigned int h = family; + switch (family) { + case AF_INET: + h ^= __xfrm4_daddr_saddr_hash(daddr, saddr); + break; + case AF_INET6: + h ^= __xfrm6_daddr_saddr_hash(daddr, saddr); + break; + } + return (h ^ (h >> 16)) & hmask; +} + +static inline unsigned int +__xfrm_spi_hash(const xfrm_address_t *daddr, __be32 spi, u8 proto, + unsigned short family, unsigned int hmask) +{ + unsigned int h = (__force u32)spi ^ proto; + switch (family) { + case AF_INET: + h ^= __xfrm4_addr_hash(daddr); + break; + case AF_INET6: + h ^= __xfrm6_addr_hash(daddr); + break; + } + return (h ^ (h >> 10) ^ (h >> 20)) & hmask; +} + +static inline unsigned int __idx_hash(u32 index, unsigned int hmask) +{ + return (index ^ (index >> 8)) & hmask; +} + +static inline unsigned int __sel_hash(const struct xfrm_selector *sel, + unsigned short family, unsigned int hmask) +{ + const xfrm_address_t *daddr = &sel->daddr; + const xfrm_address_t *saddr = &sel->saddr; + unsigned int h = 0; + + switch (family) { + case AF_INET: + if (sel->prefixlen_d != 32 || + sel->prefixlen_s != 32) + return hmask + 1; + + h = __xfrm4_daddr_saddr_hash(daddr, saddr); + break; + + case AF_INET6: + if (sel->prefixlen_d != 128 || + sel->prefixlen_s != 128) + return hmask + 1; + + h = __xfrm6_daddr_saddr_hash(daddr, saddr); + break; + } + h ^= (h >> 16); + return h & hmask; +} + +static inline unsigned int __addr_hash(const xfrm_address_t *daddr, + const xfrm_address_t *saddr, + unsigned short family, unsigned int hmask) +{ + unsigned int h = 0; + + switch (family) { + case AF_INET: + h = __xfrm4_daddr_saddr_hash(daddr, saddr); + break; + + case AF_INET6: + h = __xfrm6_daddr_saddr_hash(daddr, saddr); + break; + } + h ^= (h >> 16); + return h & hmask; +} + +extern struct hlist_head *xfrm_hash_alloc(unsigned int sz); +extern void xfrm_hash_free(struct hlist_head *n, unsigned int sz); + +#endif /* _XFRM_HASH_H */ diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c new file mode 100644 index 00000000..54a0dc2e --- /dev/null +++ b/net/xfrm/xfrm_input.c @@ -0,0 +1,291 @@ +/* + * xfrm_input.c + * + * Changes: + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific portion + * + */ + +#include +#include +#include +#include +#include +#include + +static struct kmem_cache *secpath_cachep __read_mostly; + +void __secpath_destroy(struct sec_path *sp) +{ + int i; + for (i = 0; i < sp->len; i++) + xfrm_state_put(sp->xvec[i]); + kmem_cache_free(secpath_cachep, sp); +} +EXPORT_SYMBOL(__secpath_destroy); + +struct sec_path *secpath_dup(struct sec_path *src) +{ + struct sec_path *sp; + + sp = kmem_cache_alloc(secpath_cachep, GFP_ATOMIC); + if (!sp) + return NULL; + + sp->len = 0; + if (src) { + int i; + + memcpy(sp, src, sizeof(*sp)); + for (i = 0; i < sp->len; i++) + xfrm_state_hold(sp->xvec[i]); + } + atomic_set(&sp->refcnt, 1); + return sp; +} +EXPORT_SYMBOL(secpath_dup); + +/* Fetch spi and seq from ipsec header */ + +int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq) +{ + int offset, offset_seq; + int hlen; + + switch (nexthdr) { + case IPPROTO_AH: + hlen = sizeof(struct ip_auth_hdr); + offset = offsetof(struct ip_auth_hdr, spi); + offset_seq = offsetof(struct ip_auth_hdr, seq_no); + break; + case IPPROTO_ESP: + hlen = sizeof(struct ip_esp_hdr); + offset = offsetof(struct ip_esp_hdr, spi); + offset_seq = offsetof(struct ip_esp_hdr, seq_no); + 
break; + case IPPROTO_COMP: + if (!pskb_may_pull(skb, sizeof(struct ip_comp_hdr))) + return -EINVAL; + *spi = htonl(ntohs(*(__be16*)(skb_transport_header(skb) + 2))); + *seq = 0; + return 0; + default: + return 1; + } + + if (!pskb_may_pull(skb, hlen)) + return -EINVAL; + + *spi = *(__be32*)(skb_transport_header(skb) + offset); + *seq = *(__be32*)(skb_transport_header(skb) + offset_seq); + return 0; +} + +int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb) +{ + struct xfrm_mode *inner_mode = x->inner_mode; + int err; + + err = x->outer_mode->afinfo->extract_input(x, skb); + if (err) + return err; + + if (x->sel.family == AF_UNSPEC) { + inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); + if (inner_mode == NULL) + return -EAFNOSUPPORT; + } + + skb->protocol = inner_mode->afinfo->eth_proto; + return inner_mode->input2(x, skb); +} +EXPORT_SYMBOL(xfrm_prepare_input); + +int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) +{ + struct net *net = dev_net(skb->dev); + int err; + __be32 seq; + __be32 seq_hi; + struct xfrm_state *x; + xfrm_address_t *daddr; + struct xfrm_mode *inner_mode; + unsigned int family; + int decaps = 0; + int async = 0; + + /* A negative encap_type indicates async resumption. */ + if (encap_type < 0) { + async = 1; + x = xfrm_input_state(skb); + seq = XFRM_SKB_CB(skb)->seq.input.low; + goto resume; + } + + /* Allocate new secpath or COW existing one. */ + if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { + struct sec_path *sp; + + sp = secpath_dup(skb->sp); + if (!sp) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); + goto drop; + } + if (skb->sp) + secpath_put(skb->sp); + skb->sp = sp; + } + + daddr = (xfrm_address_t *)(skb_network_header(skb) + + XFRM_SPI_SKB_CB(skb)->daddroff); + family = XFRM_SPI_SKB_CB(skb)->family; + + seq = 0; + if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); + goto drop; + } + + do { + if (skb->sp->len == XFRM_MAX_DEPTH) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); + goto drop; + } + + x = xfrm_state_lookup(net, skb->mark, daddr, spi, nexthdr, family); + if (x == NULL) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); + xfrm_audit_state_notfound(skb, family, spi, seq); + goto drop; + } + + skb->sp->xvec[skb->sp->len++] = x; + + spin_lock(&x->lock); + if (unlikely(x->km.state != XFRM_STATE_VALID)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEINVALID); + goto drop_unlock; + } + + if ((x->encap ? 
x->encap->encap_type : 0) != encap_type) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH); + goto drop_unlock; + } + + if (x->repl->check(x, skb, seq)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); + goto drop_unlock; + } + + if (xfrm_state_check_expire(x)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEEXPIRED); + goto drop_unlock; + } + + spin_unlock(&x->lock); + + seq_hi = htonl(xfrm_replay_seqhi(x, seq)); + + XFRM_SKB_CB(skb)->seq.input.low = seq; + XFRM_SKB_CB(skb)->seq.input.hi = seq_hi; + + skb_dst_force(skb); + + nexthdr = x->type->input(x, skb); + + if (nexthdr == -EINPROGRESS) + return 0; + +resume: + spin_lock(&x->lock); + if (nexthdr <= 0) { + if (nexthdr == -EBADMSG) { + xfrm_audit_state_icvfail(x, skb, + x->type->proto); + x->stats.integrity_failed++; + } + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEPROTOERROR); + goto drop_unlock; + } + + /* only the first xfrm gets the encap type */ + encap_type = 0; + + if (async && x->repl->check(x, skb, seq)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); + goto drop_unlock; + } + + x->repl->advance(x, seq); + + x->curlft.bytes += skb->len; + x->curlft.packets++; + + spin_unlock(&x->lock); + + XFRM_MODE_SKB_CB(skb)->protocol = nexthdr; + + inner_mode = x->inner_mode; + + if (x->sel.family == AF_UNSPEC) { + inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); + if (inner_mode == NULL) + goto drop; + } + + if (inner_mode->input(x, skb)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); + goto drop; + } + + if (x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL) { + decaps = 1; + break; + } + + /* + * We need the inner address. However, we only get here for + * transport mode so the outer address is identical. + */ + daddr = &x->id.daddr; + family = x->outer_mode->afinfo->family; + + err = xfrm_parse_spi(skb, nexthdr, &spi, &seq); + if (err < 0) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); + goto drop; + } + } while (!err); + + nf_reset(skb); + + if (decaps) { + skb_dst_drop(skb); + netif_rx(skb); + return 0; + } else { + return x->inner_mode->afinfo->transport_finish(skb, async); + } + +drop_unlock: + spin_unlock(&x->lock); +drop: + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL(xfrm_input); + +int xfrm_input_resume(struct sk_buff *skb, int nexthdr) +{ + return xfrm_input(skb, nexthdr, 0, -1); +} +EXPORT_SYMBOL(xfrm_input_resume); + +void __init xfrm_input_init(void) +{ + secpath_cachep = kmem_cache_create("secpath_cache", + sizeof(struct sec_path), + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, + NULL); +} diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c new file mode 100644 index 00000000..fc91ad7e --- /dev/null +++ b/net/xfrm/xfrm_ipcomp.c @@ -0,0 +1,383 @@ +/* + * IP Payload Compression Protocol (IPComp) - RFC3173. + * + * Copyright (c) 2003 James Morris + * Copyright (c) 2003-2008 Herbert Xu + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * Todo: + * - Tunable compression parameters. + * - Compression stats. + * - Adaptive compression. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ipcomp_tfms { + struct list_head list; + struct crypto_comp * __percpu *tfms; + int users; +}; + +static DEFINE_MUTEX(ipcomp_resource_mutex); +static void * __percpu *ipcomp_scratches; +static int ipcomp_scratch_users; +static LIST_HEAD(ipcomp_tfms_list); + +static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ipcomp_data *ipcd = x->data; + const int plen = skb->len; + int dlen = IPCOMP_SCRATCH_SIZE; + const u8 *start = skb->data; + const int cpu = get_cpu(); + u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu); + struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu); + int err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen); + int len; + + if (err) + goto out; + + if (dlen < (plen + sizeof(struct ip_comp_hdr))) { + err = -EINVAL; + goto out; + } + + len = dlen - plen; + if (len > skb_tailroom(skb)) + len = skb_tailroom(skb); + + __skb_put(skb, len); + + len += plen; + skb_copy_to_linear_data(skb, scratch, len); + + while ((scratch += len, dlen -= len) > 0) { + skb_frag_t *frag; + + err = -EMSGSIZE; + if (WARN_ON(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)) + goto out; + + frag = skb_shinfo(skb)->frags + skb_shinfo(skb)->nr_frags; + frag->page = alloc_page(GFP_ATOMIC); + + err = -ENOMEM; + if (!frag->page) + goto out; + + len = PAGE_SIZE; + if (dlen < len) + len = dlen; + + memcpy(page_address(frag->page), scratch, len); + + frag->page_offset = 0; + frag->size = len; + skb->truesize += len; + skb->data_len += len; + skb->len += len; + + skb_shinfo(skb)->nr_frags++; + } + + err = 0; + +out: + put_cpu(); + return err; +} + +int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) +{ + int nexthdr; + int err = -ENOMEM; + struct ip_comp_hdr *ipch; + + if (skb_linearize_cow(skb)) + goto out; + + skb->ip_summed = CHECKSUM_NONE; + + /* Remove ipcomp header and decompress original payload */ + ipch = (void *)skb->data; + nexthdr = ipch->nexthdr; + + skb->transport_header = skb->network_header + sizeof(*ipch); + __skb_pull(skb, sizeof(*ipch)); + err = ipcomp_decompress(x, skb); + if (err) + goto out; + + err = nexthdr; + +out: + return err; +} +EXPORT_SYMBOL_GPL(ipcomp_input); + +static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ipcomp_data *ipcd = x->data; + const int plen = skb->len; + int dlen = IPCOMP_SCRATCH_SIZE; + u8 *start = skb->data; + const int cpu = get_cpu(); + u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu); + struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu); + int err; + + local_bh_disable(); + err = crypto_comp_compress(tfm, start, plen, scratch, &dlen); + local_bh_enable(); + if (err) + goto out; + + if ((dlen + sizeof(struct ip_comp_hdr)) >= plen) { + err = -EMSGSIZE; + goto out; + } + + memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen); + put_cpu(); + + pskb_trim(skb, dlen + sizeof(struct ip_comp_hdr)); + return 0; + +out: + put_cpu(); + return err; +} + +int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + struct ip_comp_hdr *ipch; + struct ipcomp_data *ipcd = x->data; + + if (skb->len < ipcd->threshold) { + /* Don't bother compressing */ + goto out_ok; + } + + if (skb_linearize_cow(skb)) + goto out_ok; + + err = ipcomp_compress(x, skb); + + if (err) { + goto out_ok; + } + + /* Install ipcomp header, convert into ipcomp datagram. 
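+	 * The CPI field is taken from the low-order 16 bits of the SPI.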
*/ + ipch = ip_comp_hdr(skb); + ipch->nexthdr = *skb_mac_header(skb); + ipch->flags = 0; + ipch->cpi = htons((u16 )ntohl(x->id.spi)); + *skb_mac_header(skb) = IPPROTO_COMP; +out_ok: + skb_push(skb, -skb_network_offset(skb)); + return 0; +} +EXPORT_SYMBOL_GPL(ipcomp_output); + +static void ipcomp_free_scratches(void) +{ + int i; + void * __percpu *scratches; + + if (--ipcomp_scratch_users) + return; + + scratches = ipcomp_scratches; + if (!scratches) + return; + + for_each_possible_cpu(i) + vfree(*per_cpu_ptr(scratches, i)); + + free_percpu(scratches); +} + +static void * __percpu *ipcomp_alloc_scratches(void) +{ + int i; + void * __percpu *scratches; + + if (ipcomp_scratch_users++) + return ipcomp_scratches; + + scratches = alloc_percpu(void *); + if (!scratches) + return NULL; + + ipcomp_scratches = scratches; + + for_each_possible_cpu(i) { + void *scratch = vmalloc(IPCOMP_SCRATCH_SIZE); + if (!scratch) + return NULL; + *per_cpu_ptr(scratches, i) = scratch; + } + + return scratches; +} + +static void ipcomp_free_tfms(struct crypto_comp * __percpu *tfms) +{ + struct ipcomp_tfms *pos; + int cpu; + + list_for_each_entry(pos, &ipcomp_tfms_list, list) { + if (pos->tfms == tfms) + break; + } + + WARN_ON(!pos); + + if (--pos->users) + return; + + list_del(&pos->list); + kfree(pos); + + if (!tfms) + return; + + for_each_possible_cpu(cpu) { + struct crypto_comp *tfm = *per_cpu_ptr(tfms, cpu); + crypto_free_comp(tfm); + } + free_percpu(tfms); +} + +static struct crypto_comp * __percpu *ipcomp_alloc_tfms(const char *alg_name) +{ + struct ipcomp_tfms *pos; + struct crypto_comp * __percpu *tfms; + int cpu; + + /* This can be any valid CPU ID so we don't need locking. */ + cpu = raw_smp_processor_id(); + + list_for_each_entry(pos, &ipcomp_tfms_list, list) { + struct crypto_comp *tfm; + + tfms = pos->tfms; + tfm = *per_cpu_ptr(tfms, cpu); + + if (!strcmp(crypto_comp_name(tfm), alg_name)) { + pos->users++; + return tfms; + } + } + + pos = kmalloc(sizeof(*pos), GFP_KERNEL); + if (!pos) + return NULL; + + pos->users = 1; + INIT_LIST_HEAD(&pos->list); + list_add(&pos->list, &ipcomp_tfms_list); + + pos->tfms = tfms = alloc_percpu(struct crypto_comp *); + if (!tfms) + goto error; + + for_each_possible_cpu(cpu) { + struct crypto_comp *tfm = crypto_alloc_comp(alg_name, 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + goto error; + *per_cpu_ptr(tfms, cpu) = tfm; + } + + return tfms; + +error: + ipcomp_free_tfms(tfms); + return NULL; +} + +static void ipcomp_free_data(struct ipcomp_data *ipcd) +{ + if (ipcd->tfms) + ipcomp_free_tfms(ipcd->tfms); + ipcomp_free_scratches(); +} + +void ipcomp_destroy(struct xfrm_state *x) +{ + struct ipcomp_data *ipcd = x->data; + if (!ipcd) + return; + xfrm_state_delete_tunnel(x); + mutex_lock(&ipcomp_resource_mutex); + ipcomp_free_data(ipcd); + mutex_unlock(&ipcomp_resource_mutex); + kfree(ipcd); +} +EXPORT_SYMBOL_GPL(ipcomp_destroy); + +int ipcomp_init_state(struct xfrm_state *x) +{ + int err; + struct ipcomp_data *ipcd; + struct xfrm_algo_desc *calg_desc; + + err = -EINVAL; + if (!x->calg) + goto out; + + if (x->encap) + goto out; + + err = -ENOMEM; + ipcd = kzalloc(sizeof(*ipcd), GFP_KERNEL); + if (!ipcd) + goto out; + + mutex_lock(&ipcomp_resource_mutex); + if (!ipcomp_alloc_scratches()) + goto error; + + ipcd->tfms = ipcomp_alloc_tfms(x->calg->alg_name); + if (!ipcd->tfms) + goto error; + mutex_unlock(&ipcomp_resource_mutex); + + calg_desc = xfrm_calg_get_byname(x->calg->alg_name, 0); + BUG_ON(!calg_desc); + ipcd->threshold = calg_desc->uinfo.comp.threshold; + x->data = ipcd; + 
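+	/* Success: per-cpu compression transforms and scratch buffers are in place. */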
err = 0; +out: + return err; + +error: + ipcomp_free_data(ipcd); + mutex_unlock(&ipcomp_resource_mutex); + kfree(ipcd); + goto out; +} +EXPORT_SYMBOL_GPL(ipcomp_init_state); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) - RFC3173"); +MODULE_AUTHOR("James Morris "); diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c new file mode 100644 index 00000000..47bacd8c --- /dev/null +++ b/net/xfrm/xfrm_output.c @@ -0,0 +1,212 @@ +/* + * xfrm_output.c - Common IPsec encapsulation code. + * + * Copyright (c) 2007 Herbert Xu + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int xfrm_output2(struct sk_buff *skb); + +static int xfrm_state_check_space(struct xfrm_state *x, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + int nhead = dst->header_len + LL_RESERVED_SPACE(dst->dev) + - skb_headroom(skb); + int ntail = dst->dev->needed_tailroom - skb_tailroom(skb); + + if (nhead <= 0) { + if (ntail <= 0) + return 0; + nhead = 0; + } else if (ntail < 0) + ntail = 0; + + return pskb_expand_head(skb, nhead, ntail, GFP_ATOMIC); +} + +static int xfrm_output_one(struct sk_buff *skb, int err) +{ + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; + struct net *net = xs_net(x); + + if (err <= 0) + goto resume; + + do { + err = xfrm_state_check_space(x, skb); + if (err) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); + goto error_nolock; + } + + err = x->outer_mode->output(x, skb); + if (err) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR); + goto error_nolock; + } + + spin_lock_bh(&x->lock); + err = xfrm_state_check_expire(x); + if (err) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEEXPIRED); + goto error; + } + + err = x->repl->overflow(x, skb); + if (err) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATESEQERROR); + goto error; + } + + x->curlft.bytes += skb->len; + x->curlft.packets++; + + spin_unlock_bh(&x->lock); + + skb_dst_force(skb); + + err = x->type->output(x, skb); + if (err == -EINPROGRESS) + goto out_exit; + +resume: + if (err) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEPROTOERROR); + goto error_nolock; + } + + dst = skb_dst_pop(skb); + if (!dst) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); + err = -EHOSTUNREACH; + goto error_nolock; + } + skb_dst_set(skb, dst); + x = dst->xfrm; + } while (x && !(x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL)); + + err = 0; + +out_exit: + return err; +error: + spin_unlock_bh(&x->lock); +error_nolock: + kfree_skb(skb); + goto out_exit; +} + +int xfrm_output_resume(struct sk_buff *skb, int err) +{ + while (likely((err = xfrm_output_one(skb, err)) == 0)) { + nf_reset(skb); + + err = skb_dst(skb)->ops->local_out(skb); + if (unlikely(err != 1)) + goto out; + + if (!skb_dst(skb)->xfrm) + return dst_output(skb); + + err = nf_hook(skb_dst(skb)->ops->family, + NF_INET_POST_ROUTING, skb, + NULL, skb_dst(skb)->dev, xfrm_output2); + if (unlikely(err != 1)) + goto out; + } + + if (err == -EINPROGRESS) + err = 0; + +out: + return err; +} +EXPORT_SYMBOL_GPL(xfrm_output_resume); + +static int xfrm_output2(struct sk_buff *skb) +{ + return xfrm_output_resume(skb, 1); +} + +static int xfrm_output_gso(struct sk_buff *skb) +{ + struct sk_buff *segs; + + segs = 
skb_gso_segment(skb, 0); + kfree_skb(skb); + if (IS_ERR(segs)) + return PTR_ERR(segs); + + do { + struct sk_buff *nskb = segs->next; + int err; + + segs->next = NULL; + err = xfrm_output2(segs); + + if (unlikely(err)) { + while ((segs = nskb)) { + nskb = segs->next; + segs->next = NULL; + kfree_skb(segs); + } + return err; + } + + segs = nskb; + } while (segs); + + return 0; +} + +int xfrm_output(struct sk_buff *skb) +{ + struct net *net = dev_net(skb_dst(skb)->dev); + int err; + + if (skb_is_gso(skb)) + return xfrm_output_gso(skb); + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + err = skb_checksum_help(skb); + if (err) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); + kfree_skb(skb); + return err; + } + } + + return xfrm_output2(skb); +} + +int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb) +{ + struct xfrm_mode *inner_mode; + if (x->sel.family == AF_UNSPEC) + inner_mode = xfrm_ip2inner_mode(x, + xfrm_af2proto(skb_dst(skb)->ops->family)); + else + inner_mode = x->inner_mode; + + if (inner_mode == NULL) + return -EAFNOSUPPORT; + return inner_mode->afinfo->extract_output(x, skb); +} + +EXPORT_SYMBOL_GPL(xfrm_output); +EXPORT_SYMBOL_GPL(xfrm_inner_extract_output); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c new file mode 100644 index 00000000..0c0e40e9 --- /dev/null +++ b/net/xfrm/xfrm_policy.c @@ -0,0 +1,2978 @@ +/* + * xfrm_policy.c + * + * Changes: + * Mitsuru KANDA @USAGI + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro + * IPv6 support + * Kazunori MIYAZAWA @USAGI + * YOSHIFUJI Hideaki + * Split up af-specific portion + * Derek Atkins Add the post_input processor + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_XFRM_STATISTICS +#include +#endif + +#include "xfrm_hash.h" + +DEFINE_MUTEX(xfrm_cfg_mutex); +EXPORT_SYMBOL(xfrm_cfg_mutex); + +static DEFINE_SPINLOCK(xfrm_policy_sk_bundle_lock); +static struct dst_entry *xfrm_policy_sk_bundles; +static DEFINE_RWLOCK(xfrm_policy_lock); + +static DEFINE_RWLOCK(xfrm_policy_afinfo_lock); +static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO]; + +static struct kmem_cache *xfrm_dst_cache __read_mostly; + +static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family); +static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo); +static void xfrm_init_pmtu(struct dst_entry *dst); +static int stale_bundle(struct dst_entry *dst); +static int xfrm_bundle_ok(struct xfrm_dst *xdst); + + +static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, + int dir); + +static inline int +__xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl) +{ + const struct flowi4 *fl4 = &fl->u.ip4; + + return addr_match(&fl4->daddr, &sel->daddr, sel->prefixlen_d) && + addr_match(&fl4->saddr, &sel->saddr, sel->prefixlen_s) && + !((xfrm_flowi_dport(fl, &fl4->uli) ^ sel->dport) & sel->dport_mask) && + !((xfrm_flowi_sport(fl, &fl4->uli) ^ sel->sport) & sel->sport_mask) && + (fl4->flowi4_proto == sel->proto || !sel->proto) && + (fl4->flowi4_oif == sel->ifindex || !sel->ifindex); +} + +static inline int +__xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl) +{ + const struct flowi6 *fl6 = &fl->u.ip6; + + return addr_match(&fl6->daddr, &sel->daddr, sel->prefixlen_d) && + addr_match(&fl6->saddr, &sel->saddr, sel->prefixlen_s) && + !((xfrm_flowi_dport(fl, &fl6->uli) ^ sel->dport) & sel->dport_mask) && + 
!((xfrm_flowi_sport(fl, &fl6->uli) ^ sel->sport) & sel->sport_mask) && + (fl6->flowi6_proto == sel->proto || !sel->proto) && + (fl6->flowi6_oif == sel->ifindex || !sel->ifindex); +} + +int xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl, + unsigned short family) +{ + switch (family) { + case AF_INET: + return __xfrm4_selector_match(sel, fl); + case AF_INET6: + return __xfrm6_selector_match(sel, fl); + } + return 0; +} + +static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, + const xfrm_address_t *saddr, + const xfrm_address_t *daddr, + int family) +{ + struct xfrm_policy_afinfo *afinfo; + struct dst_entry *dst; + + afinfo = xfrm_policy_get_afinfo(family); + if (unlikely(afinfo == NULL)) + return ERR_PTR(-EAFNOSUPPORT); + + dst = afinfo->dst_lookup(net, tos, saddr, daddr); + + xfrm_policy_put_afinfo(afinfo); + + return dst; +} + +static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos, + xfrm_address_t *prev_saddr, + xfrm_address_t *prev_daddr, + int family) +{ + struct net *net = xs_net(x); + xfrm_address_t *saddr = &x->props.saddr; + xfrm_address_t *daddr = &x->id.daddr; + struct dst_entry *dst; + + if (x->type->flags & XFRM_TYPE_LOCAL_COADDR) { + saddr = x->coaddr; + daddr = prev_daddr; + } + if (x->type->flags & XFRM_TYPE_REMOTE_COADDR) { + saddr = prev_saddr; + daddr = x->coaddr; + } + + dst = __xfrm_dst_lookup(net, tos, saddr, daddr, family); + + if (!IS_ERR(dst)) { + if (prev_saddr != saddr) + memcpy(prev_saddr, saddr, sizeof(*prev_saddr)); + if (prev_daddr != daddr) + memcpy(prev_daddr, daddr, sizeof(*prev_daddr)); + } + + return dst; +} + +static inline unsigned long make_jiffies(long secs) +{ + if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ) + return MAX_SCHEDULE_TIMEOUT-1; + else + return secs*HZ; +} + +static void xfrm_policy_timer(unsigned long data) +{ + struct xfrm_policy *xp = (struct xfrm_policy*)data; + unsigned long now = get_seconds(); + long next = LONG_MAX; + int warn = 0; + int dir; + + read_lock(&xp->lock); + + if (unlikely(xp->walk.dead)) + goto out; + + dir = xfrm_policy_id2dir(xp->index); + + if (xp->lft.hard_add_expires_seconds) { + long tmo = xp->lft.hard_add_expires_seconds + + xp->curlft.add_time - now; + if (tmo <= 0) + goto expired; + if (tmo < next) + next = tmo; + } + if (xp->lft.hard_use_expires_seconds) { + long tmo = xp->lft.hard_use_expires_seconds + + (xp->curlft.use_time ? : xp->curlft.add_time) - now; + if (tmo <= 0) + goto expired; + if (tmo < next) + next = tmo; + } + if (xp->lft.soft_add_expires_seconds) { + long tmo = xp->lft.soft_add_expires_seconds + + xp->curlft.add_time - now; + if (tmo <= 0) { + warn = 1; + tmo = XFRM_KM_TIMEOUT; + } + if (tmo < next) + next = tmo; + } + if (xp->lft.soft_use_expires_seconds) { + long tmo = xp->lft.soft_use_expires_seconds + + (xp->curlft.use_time ? 
: xp->curlft.add_time) - now; + if (tmo <= 0) { + warn = 1; + tmo = XFRM_KM_TIMEOUT; + } + if (tmo < next) + next = tmo; + } + + if (warn) + km_policy_expired(xp, dir, 0, 0); + if (next != LONG_MAX && + !mod_timer(&xp->timer, jiffies + make_jiffies(next))) + xfrm_pol_hold(xp); + +out: + read_unlock(&xp->lock); + xfrm_pol_put(xp); + return; + +expired: + read_unlock(&xp->lock); + if (!xfrm_policy_delete(xp, dir)) + km_policy_expired(xp, dir, 1, 0); + xfrm_pol_put(xp); +} + +static struct flow_cache_object *xfrm_policy_flo_get(struct flow_cache_object *flo) +{ + struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo); + + if (unlikely(pol->walk.dead)) + flo = NULL; + else + xfrm_pol_hold(pol); + + return flo; +} + +static int xfrm_policy_flo_check(struct flow_cache_object *flo) +{ + struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo); + + return !pol->walk.dead; +} + +static void xfrm_policy_flo_delete(struct flow_cache_object *flo) +{ + xfrm_pol_put(container_of(flo, struct xfrm_policy, flo)); +} + +static const struct flow_cache_ops xfrm_policy_fc_ops = { + .get = xfrm_policy_flo_get, + .check = xfrm_policy_flo_check, + .delete = xfrm_policy_flo_delete, +}; + +/* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2 + * SPD calls. + */ + +struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) +{ + struct xfrm_policy *policy; + + policy = kzalloc(sizeof(struct xfrm_policy), gfp); + + if (policy) { + write_pnet(&policy->xp_net, net); + INIT_LIST_HEAD(&policy->walk.all); + INIT_HLIST_NODE(&policy->bydst); + INIT_HLIST_NODE(&policy->byidx); + rwlock_init(&policy->lock); + atomic_set(&policy->refcnt, 1); + setup_timer(&policy->timer, xfrm_policy_timer, + (unsigned long)policy); + policy->flo.ops = &xfrm_policy_fc_ops; + } + return policy; +} +EXPORT_SYMBOL(xfrm_policy_alloc); + +/* Destroy xfrm_policy: descendant resources must be released to this moment. */ + +void xfrm_policy_destroy(struct xfrm_policy *policy) +{ + BUG_ON(!policy->walk.dead); + + if (del_timer(&policy->timer)) + BUG(); + + security_xfrm_policy_free(policy->security); + kfree(policy); +} +EXPORT_SYMBOL(xfrm_policy_destroy); + +/* Rule must be locked. Release descentant resources, announce + * entry dead. The rule must be unlinked from lists to the moment. + */ + +static void xfrm_policy_kill(struct xfrm_policy *policy) +{ + policy->walk.dead = 1; + + atomic_inc(&policy->genid); + + if (del_timer(&policy->timer)) + xfrm_pol_put(policy); + + xfrm_pol_put(policy); +} + +static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024; + +static inline unsigned int idx_hash(struct net *net, u32 index) +{ + return __idx_hash(index, net->xfrm.policy_idx_hmask); +} + +static struct hlist_head *policy_hash_bysel(struct net *net, + const struct xfrm_selector *sel, + unsigned short family, int dir) +{ + unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; + unsigned int hash = __sel_hash(sel, family, hmask); + + return (hash == hmask + 1 ? 
+ &net->xfrm.policy_inexact[dir] : + net->xfrm.policy_bydst[dir].table + hash); +} + +static struct hlist_head *policy_hash_direct(struct net *net, + const xfrm_address_t *daddr, + const xfrm_address_t *saddr, + unsigned short family, int dir) +{ + unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; + unsigned int hash = __addr_hash(daddr, saddr, family, hmask); + + return net->xfrm.policy_bydst[dir].table + hash; +} + +static void xfrm_dst_hash_transfer(struct hlist_head *list, + struct hlist_head *ndsttable, + unsigned int nhashmask) +{ + struct hlist_node *entry, *tmp, *entry0 = NULL; + struct xfrm_policy *pol; + unsigned int h0 = 0; + +redo: + hlist_for_each_entry_safe(pol, entry, tmp, list, bydst) { + unsigned int h; + + h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr, + pol->family, nhashmask); + if (!entry0) { + hlist_del(entry); + hlist_add_head(&pol->bydst, ndsttable+h); + h0 = h; + } else { + if (h != h0) + continue; + hlist_del(entry); + hlist_add_after(entry0, &pol->bydst); + } + entry0 = entry; + } + if (!hlist_empty(list)) { + entry0 = NULL; + goto redo; + } +} + +static void xfrm_idx_hash_transfer(struct hlist_head *list, + struct hlist_head *nidxtable, + unsigned int nhashmask) +{ + struct hlist_node *entry, *tmp; + struct xfrm_policy *pol; + + hlist_for_each_entry_safe(pol, entry, tmp, list, byidx) { + unsigned int h; + + h = __idx_hash(pol->index, nhashmask); + hlist_add_head(&pol->byidx, nidxtable+h); + } +} + +static unsigned long xfrm_new_hash_mask(unsigned int old_hmask) +{ + return ((old_hmask + 1) << 1) - 1; +} + +static void xfrm_bydst_resize(struct net *net, int dir) +{ + unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; + unsigned int nhashmask = xfrm_new_hash_mask(hmask); + unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head); + struct hlist_head *odst = net->xfrm.policy_bydst[dir].table; + struct hlist_head *ndst = xfrm_hash_alloc(nsize); + int i; + + if (!ndst) + return; + + write_lock_bh(&xfrm_policy_lock); + + for (i = hmask; i >= 0; i--) + xfrm_dst_hash_transfer(odst + i, ndst, nhashmask); + + net->xfrm.policy_bydst[dir].table = ndst; + net->xfrm.policy_bydst[dir].hmask = nhashmask; + + write_unlock_bh(&xfrm_policy_lock); + + xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head)); +} + +static void xfrm_byidx_resize(struct net *net, int total) +{ + unsigned int hmask = net->xfrm.policy_idx_hmask; + unsigned int nhashmask = xfrm_new_hash_mask(hmask); + unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head); + struct hlist_head *oidx = net->xfrm.policy_byidx; + struct hlist_head *nidx = xfrm_hash_alloc(nsize); + int i; + + if (!nidx) + return; + + write_lock_bh(&xfrm_policy_lock); + + for (i = hmask; i >= 0; i--) + xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask); + + net->xfrm.policy_byidx = nidx; + net->xfrm.policy_idx_hmask = nhashmask; + + write_unlock_bh(&xfrm_policy_lock); + + xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head)); +} + +static inline int xfrm_bydst_should_resize(struct net *net, int dir, int *total) +{ + unsigned int cnt = net->xfrm.policy_count[dir]; + unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; + + if (total) + *total += cnt; + + if ((hmask + 1) < xfrm_policy_hashmax && + cnt > hmask) + return 1; + + return 0; +} + +static inline int xfrm_byidx_should_resize(struct net *net, int total) +{ + unsigned int hmask = net->xfrm.policy_idx_hmask; + + if ((hmask + 1) < xfrm_policy_hashmax && + total > hmask) + return 1; + + return 0; +} + +void xfrm_spd_getinfo(struct net 
*net, struct xfrmk_spdinfo *si) +{ + read_lock_bh(&xfrm_policy_lock); + si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN]; + si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT]; + si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD]; + si->inscnt = net->xfrm.policy_count[XFRM_POLICY_IN+XFRM_POLICY_MAX]; + si->outscnt = net->xfrm.policy_count[XFRM_POLICY_OUT+XFRM_POLICY_MAX]; + si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX]; + si->spdhcnt = net->xfrm.policy_idx_hmask; + si->spdhmcnt = xfrm_policy_hashmax; + read_unlock_bh(&xfrm_policy_lock); +} +EXPORT_SYMBOL(xfrm_spd_getinfo); + +static DEFINE_MUTEX(hash_resize_mutex); +static void xfrm_hash_resize(struct work_struct *work) +{ + struct net *net = container_of(work, struct net, xfrm.policy_hash_work); + int dir, total; + + mutex_lock(&hash_resize_mutex); + + total = 0; + for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) { + if (xfrm_bydst_should_resize(net, dir, &total)) + xfrm_bydst_resize(net, dir); + } + if (xfrm_byidx_should_resize(net, total)) + xfrm_byidx_resize(net, total); + + mutex_unlock(&hash_resize_mutex); +} + +/* Generate new index... KAME seems to generate them ordered by cost + * of an absolute inpredictability of ordering of rules. This will not pass. */ +static u32 xfrm_gen_index(struct net *net, int dir) +{ + static u32 idx_generator; + + for (;;) { + struct hlist_node *entry; + struct hlist_head *list; + struct xfrm_policy *p; + u32 idx; + int found; + + idx = (idx_generator | dir); + idx_generator += 8; + if (idx == 0) + idx = 8; + list = net->xfrm.policy_byidx + idx_hash(net, idx); + found = 0; + hlist_for_each_entry(p, entry, list, byidx) { + if (p->index == idx) { + found = 1; + break; + } + } + if (!found) + return idx; + } +} + +static inline int selector_cmp(struct xfrm_selector *s1, struct xfrm_selector *s2) +{ + u32 *p1 = (u32 *) s1; + u32 *p2 = (u32 *) s2; + int len = sizeof(struct xfrm_selector) / sizeof(u32); + int i; + + for (i = 0; i < len; i++) { + if (p1[i] != p2[i]) + return 1; + } + + return 0; +} + +int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) +{ + struct net *net = xp_net(policy); + struct xfrm_policy *pol; + struct xfrm_policy *delpol; + struct hlist_head *chain; + struct hlist_node *entry, *newpos; + u32 mark = policy->mark.v & policy->mark.m; + + write_lock_bh(&xfrm_policy_lock); + chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); + delpol = NULL; + newpos = NULL; + hlist_for_each_entry(pol, entry, chain, bydst) { + if (pol->type == policy->type && + !selector_cmp(&pol->selector, &policy->selector) && + (mark & pol->mark.m) == pol->mark.v && + xfrm_sec_ctx_match(pol->security, policy->security) && + !WARN_ON(delpol)) { + if (excl) { + write_unlock_bh(&xfrm_policy_lock); + return -EEXIST; + } + delpol = pol; + if (policy->priority > pol->priority) + continue; + } else if (policy->priority >= pol->priority) { + newpos = &pol->bydst; + continue; + } + if (delpol) + break; + } + if (newpos) + hlist_add_after(newpos, &policy->bydst); + else + hlist_add_head(&policy->bydst, chain); + xfrm_pol_hold(policy); + net->xfrm.policy_count[dir]++; + atomic_inc(&flow_cache_genid); + if (delpol) + __xfrm_policy_unlink(delpol, dir); + policy->index = delpol ? 
delpol->index : xfrm_gen_index(net, dir); + hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index)); + policy->curlft.add_time = get_seconds(); + policy->curlft.use_time = 0; + if (!mod_timer(&policy->timer, jiffies + HZ)) + xfrm_pol_hold(policy); + list_add(&policy->walk.all, &net->xfrm.policy_all); + write_unlock_bh(&xfrm_policy_lock); + + if (delpol) + xfrm_policy_kill(delpol); + else if (xfrm_bydst_should_resize(net, dir, NULL)) + schedule_work(&net->xfrm.policy_hash_work); + + return 0; +} +EXPORT_SYMBOL(xfrm_policy_insert); + +struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, + int dir, struct xfrm_selector *sel, + struct xfrm_sec_ctx *ctx, int delete, + int *err) +{ + struct xfrm_policy *pol, *ret; + struct hlist_head *chain; + struct hlist_node *entry; + + *err = 0; + write_lock_bh(&xfrm_policy_lock); + chain = policy_hash_bysel(net, sel, sel->family, dir); + ret = NULL; + hlist_for_each_entry(pol, entry, chain, bydst) { + if (pol->type == type && + (mark & pol->mark.m) == pol->mark.v && + !selector_cmp(sel, &pol->selector) && + xfrm_sec_ctx_match(ctx, pol->security)) { + xfrm_pol_hold(pol); + if (delete) { + *err = security_xfrm_policy_delete( + pol->security); + if (*err) { + write_unlock_bh(&xfrm_policy_lock); + return pol; + } + __xfrm_policy_unlink(pol, dir); + } + ret = pol; + break; + } + } + write_unlock_bh(&xfrm_policy_lock); + + if (ret && delete) + xfrm_policy_kill(ret); + return ret; +} +EXPORT_SYMBOL(xfrm_policy_bysel_ctx); + +struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, + int dir, u32 id, int delete, int *err) +{ + struct xfrm_policy *pol, *ret; + struct hlist_head *chain; + struct hlist_node *entry; + + *err = -ENOENT; + if (xfrm_policy_id2dir(id) != dir) + return NULL; + + *err = 0; + write_lock_bh(&xfrm_policy_lock); + chain = net->xfrm.policy_byidx + idx_hash(net, id); + ret = NULL; + hlist_for_each_entry(pol, entry, chain, byidx) { + if (pol->type == type && pol->index == id && + (mark & pol->mark.m) == pol->mark.v) { + xfrm_pol_hold(pol); + if (delete) { + *err = security_xfrm_policy_delete( + pol->security); + if (*err) { + write_unlock_bh(&xfrm_policy_lock); + return pol; + } + __xfrm_policy_unlink(pol, dir); + } + ret = pol; + break; + } + } + write_unlock_bh(&xfrm_policy_lock); + + if (ret && delete) + xfrm_policy_kill(ret); + return ret; +} +EXPORT_SYMBOL(xfrm_policy_byid); + +#ifdef CONFIG_SECURITY_NETWORK_XFRM +static inline int +xfrm_policy_flush_secctx_check(struct net *net, u8 type, struct xfrm_audit *audit_info) +{ + int dir, err = 0; + + for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { + struct xfrm_policy *pol; + struct hlist_node *entry; + int i; + + hlist_for_each_entry(pol, entry, + &net->xfrm.policy_inexact[dir], bydst) { + if (pol->type != type) + continue; + err = security_xfrm_policy_delete(pol->security); + if (err) { + xfrm_audit_policy_delete(pol, 0, + audit_info->loginuid, + audit_info->sessionid, + audit_info->secid); + return err; + } + } + for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) { + hlist_for_each_entry(pol, entry, + net->xfrm.policy_bydst[dir].table + i, + bydst) { + if (pol->type != type) + continue; + err = security_xfrm_policy_delete( + pol->security); + if (err) { + xfrm_audit_policy_delete(pol, 0, + audit_info->loginuid, + audit_info->sessionid, + audit_info->secid); + return err; + } + } + } + } + return err; +} +#else +static inline int +xfrm_policy_flush_secctx_check(struct net *net, u8 type, struct xfrm_audit *audit_info) +{ + 
return 0; +} +#endif + +int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info) +{ + int dir, err = 0, cnt = 0; + + write_lock_bh(&xfrm_policy_lock); + + err = xfrm_policy_flush_secctx_check(net, type, audit_info); + if (err) + goto out; + + for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { + struct xfrm_policy *pol; + struct hlist_node *entry; + int i; + + again1: + hlist_for_each_entry(pol, entry, + &net->xfrm.policy_inexact[dir], bydst) { + if (pol->type != type) + continue; + __xfrm_policy_unlink(pol, dir); + write_unlock_bh(&xfrm_policy_lock); + cnt++; + + xfrm_audit_policy_delete(pol, 1, audit_info->loginuid, + audit_info->sessionid, + audit_info->secid); + + xfrm_policy_kill(pol); + + write_lock_bh(&xfrm_policy_lock); + goto again1; + } + + for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) { + again2: + hlist_for_each_entry(pol, entry, + net->xfrm.policy_bydst[dir].table + i, + bydst) { + if (pol->type != type) + continue; + __xfrm_policy_unlink(pol, dir); + write_unlock_bh(&xfrm_policy_lock); + cnt++; + + xfrm_audit_policy_delete(pol, 1, + audit_info->loginuid, + audit_info->sessionid, + audit_info->secid); + xfrm_policy_kill(pol); + + write_lock_bh(&xfrm_policy_lock); + goto again2; + } + } + + } + if (!cnt) + err = -ESRCH; +out: + write_unlock_bh(&xfrm_policy_lock); + return err; +} +EXPORT_SYMBOL(xfrm_policy_flush); + +int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, + int (*func)(struct xfrm_policy *, int, int, void*), + void *data) +{ + struct xfrm_policy *pol; + struct xfrm_policy_walk_entry *x; + int error = 0; + + if (walk->type >= XFRM_POLICY_TYPE_MAX && + walk->type != XFRM_POLICY_TYPE_ANY) + return -EINVAL; + + if (list_empty(&walk->walk.all) && walk->seq != 0) + return 0; + + write_lock_bh(&xfrm_policy_lock); + if (list_empty(&walk->walk.all)) + x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all); + else + x = list_entry(&walk->walk.all, struct xfrm_policy_walk_entry, all); + list_for_each_entry_from(x, &net->xfrm.policy_all, all) { + if (x->dead) + continue; + pol = container_of(x, struct xfrm_policy, walk); + if (walk->type != XFRM_POLICY_TYPE_ANY && + walk->type != pol->type) + continue; + error = func(pol, xfrm_policy_id2dir(pol->index), + walk->seq, data); + if (error) { + list_move_tail(&walk->walk.all, &x->all); + goto out; + } + walk->seq++; + } + if (walk->seq == 0) { + error = -ENOENT; + goto out; + } + list_del_init(&walk->walk.all); +out: + write_unlock_bh(&xfrm_policy_lock); + return error; +} +EXPORT_SYMBOL(xfrm_policy_walk); + +void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type) +{ + INIT_LIST_HEAD(&walk->walk.all); + walk->walk.dead = 1; + walk->type = type; + walk->seq = 0; +} +EXPORT_SYMBOL(xfrm_policy_walk_init); + +void xfrm_policy_walk_done(struct xfrm_policy_walk *walk) +{ + if (list_empty(&walk->walk.all)) + return; + + write_lock_bh(&xfrm_policy_lock); + list_del(&walk->walk.all); + write_unlock_bh(&xfrm_policy_lock); +} +EXPORT_SYMBOL(xfrm_policy_walk_done); + +/* + * Find policy to apply to this flow. + * + * Returns 0 if policy found, else an -errno. 
+ */ +static int xfrm_policy_match(const struct xfrm_policy *pol, + const struct flowi *fl, + u8 type, u16 family, int dir) +{ + const struct xfrm_selector *sel = &pol->selector; + int match, ret = -ESRCH; + + if (pol->family != family || + (fl->flowi_mark & pol->mark.m) != pol->mark.v || + pol->type != type) + return ret; + + match = xfrm_selector_match(sel, fl, family); + if (match) + ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid, + dir); + + return ret; +} + +static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, + const struct flowi *fl, + u16 family, u8 dir) +{ + int err; + struct xfrm_policy *pol, *ret; + const xfrm_address_t *daddr, *saddr; + struct hlist_node *entry; + struct hlist_head *chain; + u32 priority = ~0U; + + daddr = xfrm_flowi_daddr(fl, family); + saddr = xfrm_flowi_saddr(fl, family); + if (unlikely(!daddr || !saddr)) + return NULL; + + read_lock_bh(&xfrm_policy_lock); + chain = policy_hash_direct(net, daddr, saddr, family, dir); + ret = NULL; + hlist_for_each_entry(pol, entry, chain, bydst) { + err = xfrm_policy_match(pol, fl, type, family, dir); + if (err) { + if (err == -ESRCH) + continue; + else { + ret = ERR_PTR(err); + goto fail; + } + } else { + ret = pol; + priority = ret->priority; + break; + } + } + chain = &net->xfrm.policy_inexact[dir]; + hlist_for_each_entry(pol, entry, chain, bydst) { + err = xfrm_policy_match(pol, fl, type, family, dir); + if (err) { + if (err == -ESRCH) + continue; + else { + ret = ERR_PTR(err); + goto fail; + } + } else if (pol->priority < priority) { + ret = pol; + break; + } + } + if (ret) + xfrm_pol_hold(ret); +fail: + read_unlock_bh(&xfrm_policy_lock); + + return ret; +} + +static struct xfrm_policy * +__xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir) +{ +#ifdef CONFIG_XFRM_SUB_POLICY + struct xfrm_policy *pol; + + pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir); + if (pol != NULL) + return pol; +#endif + return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir); +} + +static struct flow_cache_object * +xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, + u8 dir, struct flow_cache_object *old_obj, void *ctx) +{ + struct xfrm_policy *pol; + + if (old_obj) + xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo)); + + pol = __xfrm_policy_lookup(net, fl, family, dir); + if (IS_ERR_OR_NULL(pol)) + return ERR_CAST(pol); + + /* Resolver returns two references: + * one for cache and one for caller of flow_cache_lookup() */ + xfrm_pol_hold(pol); + + return &pol->flo; +} + +static inline int policy_to_flow_dir(int dir) +{ + if (XFRM_POLICY_IN == FLOW_DIR_IN && + XFRM_POLICY_OUT == FLOW_DIR_OUT && + XFRM_POLICY_FWD == FLOW_DIR_FWD) + return dir; + switch (dir) { + default: + case XFRM_POLICY_IN: + return FLOW_DIR_IN; + case XFRM_POLICY_OUT: + return FLOW_DIR_OUT; + case XFRM_POLICY_FWD: + return FLOW_DIR_FWD; + } +} + +static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, + const struct flowi *fl) +{ + struct xfrm_policy *pol; + + read_lock_bh(&xfrm_policy_lock); + if ((pol = sk->sk_policy[dir]) != NULL) { + int match = xfrm_selector_match(&pol->selector, fl, + sk->sk_family); + int err = 0; + + if (match) { + if ((sk->sk_mark & pol->mark.m) != pol->mark.v) { + pol = NULL; + goto out; + } + err = security_xfrm_policy_lookup(pol->security, + fl->flowi_secid, + policy_to_flow_dir(dir)); + if (!err) + xfrm_pol_hold(pol); + else if (err == -ESRCH) + pol = NULL; + else + pol = 
ERR_PTR(err); + } else + pol = NULL; + } +out: + read_unlock_bh(&xfrm_policy_lock); + return pol; +} + +static void __xfrm_policy_link(struct xfrm_policy *pol, int dir) +{ + struct net *net = xp_net(pol); + struct hlist_head *chain = policy_hash_bysel(net, &pol->selector, + pol->family, dir); + + list_add(&pol->walk.all, &net->xfrm.policy_all); + hlist_add_head(&pol->bydst, chain); + hlist_add_head(&pol->byidx, net->xfrm.policy_byidx+idx_hash(net, pol->index)); + net->xfrm.policy_count[dir]++; + xfrm_pol_hold(pol); + + if (xfrm_bydst_should_resize(net, dir, NULL)) + schedule_work(&net->xfrm.policy_hash_work); +} + +static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, + int dir) +{ + struct net *net = xp_net(pol); + + if (hlist_unhashed(&pol->bydst)) + return NULL; + + hlist_del(&pol->bydst); + hlist_del(&pol->byidx); + list_del(&pol->walk.all); + net->xfrm.policy_count[dir]--; + + return pol; +} + +int xfrm_policy_delete(struct xfrm_policy *pol, int dir) +{ + write_lock_bh(&xfrm_policy_lock); + pol = __xfrm_policy_unlink(pol, dir); + write_unlock_bh(&xfrm_policy_lock); + if (pol) { + xfrm_policy_kill(pol); + return 0; + } + return -ENOENT; +} +EXPORT_SYMBOL(xfrm_policy_delete); + +int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) +{ + struct net *net = xp_net(pol); + struct xfrm_policy *old_pol; + +#ifdef CONFIG_XFRM_SUB_POLICY + if (pol && pol->type != XFRM_POLICY_TYPE_MAIN) + return -EINVAL; +#endif + + write_lock_bh(&xfrm_policy_lock); + old_pol = sk->sk_policy[dir]; + sk->sk_policy[dir] = pol; + if (pol) { + pol->curlft.add_time = get_seconds(); + pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir); + __xfrm_policy_link(pol, XFRM_POLICY_MAX+dir); + } + if (old_pol) + /* Unlinking succeeds always. This is the only function + * allowed to delete or replace socket policy. 
+ */ + __xfrm_policy_unlink(old_pol, XFRM_POLICY_MAX+dir); + write_unlock_bh(&xfrm_policy_lock); + + if (old_pol) { + xfrm_policy_kill(old_pol); + } + return 0; +} + +static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir) +{ + struct xfrm_policy *newp = xfrm_policy_alloc(xp_net(old), GFP_ATOMIC); + + if (newp) { + newp->selector = old->selector; + if (security_xfrm_policy_clone(old->security, + &newp->security)) { + kfree(newp); + return NULL; /* ENOMEM */ + } + newp->lft = old->lft; + newp->curlft = old->curlft; + newp->mark = old->mark; + newp->action = old->action; + newp->flags = old->flags; + newp->xfrm_nr = old->xfrm_nr; + newp->index = old->index; + newp->type = old->type; + memcpy(newp->xfrm_vec, old->xfrm_vec, + newp->xfrm_nr*sizeof(struct xfrm_tmpl)); + write_lock_bh(&xfrm_policy_lock); + __xfrm_policy_link(newp, XFRM_POLICY_MAX+dir); + write_unlock_bh(&xfrm_policy_lock); + xfrm_pol_put(newp); + } + return newp; +} + +int __xfrm_sk_clone_policy(struct sock *sk) +{ + struct xfrm_policy *p0 = sk->sk_policy[0], + *p1 = sk->sk_policy[1]; + + sk->sk_policy[0] = sk->sk_policy[1] = NULL; + if (p0 && (sk->sk_policy[0] = clone_policy(p0, 0)) == NULL) + return -ENOMEM; + if (p1 && (sk->sk_policy[1] = clone_policy(p1, 1)) == NULL) + return -ENOMEM; + return 0; +} + +static int +xfrm_get_saddr(struct net *net, xfrm_address_t *local, xfrm_address_t *remote, + unsigned short family) +{ + int err; + struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + + if (unlikely(afinfo == NULL)) + return -EINVAL; + err = afinfo->get_saddr(net, local, remote); + xfrm_policy_put_afinfo(afinfo); + return err; +} + +/* Resolve list of templates for the flow, given policy. */ + +static int +xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, + struct xfrm_state **xfrm, unsigned short family) +{ + struct net *net = xp_net(policy); + int nx; + int i, error; + xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family); + xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family); + xfrm_address_t tmp; + + for (nx=0, i = 0; i < policy->xfrm_nr; i++) { + struct xfrm_state *x; + xfrm_address_t *remote = daddr; + xfrm_address_t *local = saddr; + struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i]; + + if (tmpl->mode == XFRM_MODE_TUNNEL || + tmpl->mode == XFRM_MODE_BEET) { + remote = &tmpl->id.daddr; + local = &tmpl->saddr; + if (xfrm_addr_any(local, tmpl->encap_family)) { + error = xfrm_get_saddr(net, &tmp, remote, tmpl->encap_family); + if (error) + goto fail; + local = &tmp; + } + } + + x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family); + + if (x && x->km.state == XFRM_STATE_VALID) { + xfrm[nx++] = x; + daddr = remote; + saddr = local; + continue; + } + if (x) { + error = (x->km.state == XFRM_STATE_ERROR ? + -EINVAL : -EAGAIN); + xfrm_state_put(x); + } + else if (error == -ESRCH) + error = -EAGAIN; + + if (!tmpl->optional) + goto fail; + } + return nx; + +fail: + for (nx--; nx>=0; nx--) + xfrm_state_put(xfrm[nx]); + return error; +} + +static int +xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl, + struct xfrm_state **xfrm, unsigned short family) +{ + struct xfrm_state *tp[XFRM_MAX_DEPTH]; + struct xfrm_state **tpp = (npols > 1) ? 
tp : xfrm; + int cnx = 0; + int error; + int ret; + int i; + + for (i = 0; i < npols; i++) { + if (cnx + pols[i]->xfrm_nr >= XFRM_MAX_DEPTH) { + error = -ENOBUFS; + goto fail; + } + + ret = xfrm_tmpl_resolve_one(pols[i], fl, &tpp[cnx], family); + if (ret < 0) { + error = ret; + goto fail; + } else + cnx += ret; + } + + /* found states are sorted for outbound processing */ + if (npols > 1) + xfrm_state_sort(xfrm, tpp, cnx, family); + + return cnx; + + fail: + for (cnx--; cnx>=0; cnx--) + xfrm_state_put(tpp[cnx]); + return error; + +} + +/* Check that the bundle accepts the flow and its components are + * still valid. + */ + +static inline int xfrm_get_tos(const struct flowi *fl, int family) +{ + struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + int tos; + + if (!afinfo) + return -EINVAL; + + tos = afinfo->get_tos(fl); + + xfrm_policy_put_afinfo(afinfo); + + return tos; +} + +static struct flow_cache_object *xfrm_bundle_flo_get(struct flow_cache_object *flo) +{ + struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo); + struct dst_entry *dst = &xdst->u.dst; + + if (xdst->route == NULL) { + /* Dummy bundle - if it has xfrms we were not + * able to build bundle as template resolution failed. + * It means we need to try again resolving. */ + if (xdst->num_xfrms > 0) + return NULL; + } else { + /* Real bundle */ + if (stale_bundle(dst)) + return NULL; + } + + dst_hold(dst); + return flo; +} + +static int xfrm_bundle_flo_check(struct flow_cache_object *flo) +{ + struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo); + struct dst_entry *dst = &xdst->u.dst; + + if (!xdst->route) + return 0; + if (stale_bundle(dst)) + return 0; + + return 1; +} + +static void xfrm_bundle_flo_delete(struct flow_cache_object *flo) +{ + struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo); + struct dst_entry *dst = &xdst->u.dst; + + dst_free(dst); +} + +static const struct flow_cache_ops xfrm_bundle_fc_ops = { + .get = xfrm_bundle_flo_get, + .check = xfrm_bundle_flo_check, + .delete = xfrm_bundle_flo_delete, +}; + +static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) +{ + struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + struct dst_ops *dst_ops; + struct xfrm_dst *xdst; + + if (!afinfo) + return ERR_PTR(-EINVAL); + + switch (family) { + case AF_INET: + dst_ops = &net->xfrm.xfrm4_dst_ops; + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + dst_ops = &net->xfrm.xfrm6_dst_ops; + break; +#endif + default: + BUG(); + } + xdst = dst_alloc(dst_ops, NULL, 0, 0, 0); + memset(&xdst->u.rt6.rt6i_table, 0, sizeof(*xdst) - sizeof(struct dst_entry)); + xfrm_policy_put_afinfo(afinfo); + + if (likely(xdst)) + xdst->flo.ops = &xfrm_bundle_fc_ops; + else + xdst = ERR_PTR(-ENOBUFS); + + return xdst; +} + +static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst, + int nfheader_len) +{ + struct xfrm_policy_afinfo *afinfo = + xfrm_policy_get_afinfo(dst->ops->family); + int err; + + if (!afinfo) + return -EINVAL; + + err = afinfo->init_path(path, dst, nfheader_len); + + xfrm_policy_put_afinfo(afinfo); + + return err; +} + +static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, + const struct flowi *fl) +{ + struct xfrm_policy_afinfo *afinfo = + xfrm_policy_get_afinfo(xdst->u.dst.ops->family); + int err; + + if (!afinfo) + return -EINVAL; + + err = afinfo->fill_dst(xdst, dev, fl); + + xfrm_policy_put_afinfo(afinfo); + + return err; +} + + +/* Allocate chain of dst_entry's, 
attach known xfrm's, calculate + * all the metrics... Shortly, bundle a bundle. + */ + +static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, + struct xfrm_state **xfrm, int nx, + const struct flowi *fl, + struct dst_entry *dst) +{ + struct net *net = xp_net(policy); + unsigned long now = jiffies; + struct net_device *dev; + struct xfrm_mode *inner_mode; + struct dst_entry *dst_prev = NULL; + struct dst_entry *dst0 = NULL; + int i = 0; + int err; + int header_len = 0; + int nfheader_len = 0; + int trailer_len = 0; + int tos; + int family = policy->selector.family; + xfrm_address_t saddr, daddr; + + xfrm_flowi_addr_get(fl, &saddr, &daddr, family); + + tos = xfrm_get_tos(fl, family); + err = tos; + if (tos < 0) + goto put_states; + + dst_hold(dst); + + for (; i < nx; i++) { + struct xfrm_dst *xdst = xfrm_alloc_dst(net, family); + struct dst_entry *dst1 = &xdst->u.dst; + + err = PTR_ERR(xdst); + if (IS_ERR(xdst)) { + dst_release(dst); + goto put_states; + } + + if (xfrm[i]->sel.family == AF_UNSPEC) { + inner_mode = xfrm_ip2inner_mode(xfrm[i], + xfrm_af2proto(family)); + if (!inner_mode) { + err = -EAFNOSUPPORT; + dst_release(dst); + goto put_states; + } + } else + inner_mode = xfrm[i]->inner_mode; + + if (!dst_prev) + dst0 = dst1; + else { + dst_prev->child = dst_clone(dst1); + dst1->flags |= DST_NOHASH; + } + + xdst->route = dst; + dst_copy_metrics(dst1, dst); + + if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) { + family = xfrm[i]->props.family; + dst = xfrm_dst_lookup(xfrm[i], tos, &saddr, &daddr, + family); + err = PTR_ERR(dst); + if (IS_ERR(dst)) + goto put_states; + } else + dst_hold(dst); + + dst1->xfrm = xfrm[i]; + xdst->xfrm_genid = xfrm[i]->genid; + + dst1->obsolete = -1; + dst1->flags |= DST_HOST; + dst1->lastuse = now; + + dst1->input = dst_discard; + dst1->output = inner_mode->afinfo->output; + + dst1->next = dst_prev; + dst_prev = dst1; + + header_len += xfrm[i]->props.header_len; + if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT) + nfheader_len += xfrm[i]->props.header_len; + trailer_len += xfrm[i]->props.trailer_len; + } + + dst_prev->child = dst; + dst0->path = dst; + + err = -ENODEV; + dev = dst->dev; + if (!dev) + goto free_dst; + + /* Copy neighbour for reachability confirmation */ + dst_set_neighbour(dst0, neigh_clone(dst_get_neighbour(dst))); + + xfrm_init_path((struct xfrm_dst *)dst0, dst, nfheader_len); + xfrm_init_pmtu(dst_prev); + + for (dst_prev = dst0; dst_prev != dst; dst_prev = dst_prev->child) { + struct xfrm_dst *xdst = (struct xfrm_dst *)dst_prev; + + err = xfrm_fill_dst(xdst, dev, fl); + if (err) + goto free_dst; + + dst_prev->header_len = header_len; + dst_prev->trailer_len = trailer_len; + header_len -= xdst->u.dst.xfrm->props.header_len; + trailer_len -= xdst->u.dst.xfrm->props.trailer_len; + } + +out: + return dst0; + +put_states: + for (; i < nx; i++) + xfrm_state_put(xfrm[i]); +free_dst: + if (dst0) + dst_free(dst0); + dst0 = ERR_PTR(err); + goto out; +} + +static int inline +xfrm_dst_alloc_copy(void **target, const void *src, int size) +{ + if (!*target) { + *target = kmalloc(size, GFP_ATOMIC); + if (!*target) + return -ENOMEM; + } + memcpy(*target, src, size); + return 0; +} + +static int inline +xfrm_dst_update_parent(struct dst_entry *dst, const struct xfrm_selector *sel) +{ +#ifdef CONFIG_XFRM_SUB_POLICY + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + return xfrm_dst_alloc_copy((void **)&(xdst->partner), + sel, sizeof(*sel)); +#else + return 0; +#endif +} + +static int inline +xfrm_dst_update_origin(struct dst_entry *dst, 
const struct flowi *fl) +{ +#ifdef CONFIG_XFRM_SUB_POLICY + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + return xfrm_dst_alloc_copy((void **)&(xdst->origin), fl, sizeof(*fl)); +#else + return 0; +#endif +} + +static int xfrm_expand_policies(const struct flowi *fl, u16 family, + struct xfrm_policy **pols, + int *num_pols, int *num_xfrms) +{ + int i; + + if (*num_pols == 0 || !pols[0]) { + *num_pols = 0; + *num_xfrms = 0; + return 0; + } + if (IS_ERR(pols[0])) + return PTR_ERR(pols[0]); + + *num_xfrms = pols[0]->xfrm_nr; + +#ifdef CONFIG_XFRM_SUB_POLICY + if (pols[0] && pols[0]->action == XFRM_POLICY_ALLOW && + pols[0]->type != XFRM_POLICY_TYPE_MAIN) { + pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]), + XFRM_POLICY_TYPE_MAIN, + fl, family, + XFRM_POLICY_OUT); + if (pols[1]) { + if (IS_ERR(pols[1])) { + xfrm_pols_put(pols, *num_pols); + return PTR_ERR(pols[1]); + } + (*num_pols) ++; + (*num_xfrms) += pols[1]->xfrm_nr; + } + } +#endif + for (i = 0; i < *num_pols; i++) { + if (pols[i]->action != XFRM_POLICY_ALLOW) { + *num_xfrms = -1; + break; + } + } + + return 0; + +} + +static struct xfrm_dst * +xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, + const struct flowi *fl, u16 family, + struct dst_entry *dst_orig) +{ + struct net *net = xp_net(pols[0]); + struct xfrm_state *xfrm[XFRM_MAX_DEPTH]; + struct dst_entry *dst; + struct xfrm_dst *xdst; + int err; + + /* Try to instantiate a bundle */ + err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family); + if (err <= 0) { + if (err != 0 && err != -EAGAIN) + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); + return ERR_PTR(err); + } + + dst = xfrm_bundle_create(pols[0], xfrm, err, fl, dst_orig); + if (IS_ERR(dst)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR); + return ERR_CAST(dst); + } + + xdst = (struct xfrm_dst *)dst; + xdst->num_xfrms = err; + if (num_pols > 1) + err = xfrm_dst_update_parent(dst, &pols[1]->selector); + else + err = xfrm_dst_update_origin(dst, fl); + if (unlikely(err)) { + dst_free(dst); + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR); + return ERR_PTR(err); + } + + xdst->num_pols = num_pols; + memcpy(xdst->pols, pols, sizeof(struct xfrm_policy*) * num_pols); + xdst->policy_genid = atomic_read(&pols[0]->genid); + + return xdst; +} + +static struct flow_cache_object * +xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, + struct flow_cache_object *oldflo, void *ctx) +{ + struct dst_entry *dst_orig = (struct dst_entry *)ctx; + struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; + struct xfrm_dst *xdst, *new_xdst; + int num_pols = 0, num_xfrms = 0, i, err, pol_dead; + + /* Check if the policies from old bundle are usable */ + xdst = NULL; + if (oldflo) { + xdst = container_of(oldflo, struct xfrm_dst, flo); + num_pols = xdst->num_pols; + num_xfrms = xdst->num_xfrms; + pol_dead = 0; + for (i = 0; i < num_pols; i++) { + pols[i] = xdst->pols[i]; + pol_dead |= pols[i]->walk.dead; + } + if (pol_dead) { + dst_free(&xdst->u.dst); + xdst = NULL; + num_pols = 0; + num_xfrms = 0; + oldflo = NULL; + } + } + + /* Resolve policies to use if we couldn't get them from + * previous cache entry */ + if (xdst == NULL) { + num_pols = 1; + pols[0] = __xfrm_policy_lookup(net, fl, family, dir); + err = xfrm_expand_policies(fl, family, pols, + &num_pols, &num_xfrms); + if (err < 0) + goto inc_error; + if (num_pols == 0) + return NULL; + if (num_xfrms <= 0) + goto make_dummy_bundle; + } + + new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, dst_orig); + if 
(IS_ERR(new_xdst)) { + err = PTR_ERR(new_xdst); + if (err != -EAGAIN) + goto error; + if (oldflo == NULL) + goto make_dummy_bundle; + dst_hold(&xdst->u.dst); + return oldflo; + } else if (new_xdst == NULL) { + num_xfrms = 0; + if (oldflo == NULL) + goto make_dummy_bundle; + xdst->num_xfrms = 0; + dst_hold(&xdst->u.dst); + return oldflo; + } + + /* Kill the previous bundle */ + if (xdst) { + /* The policies were stolen for newly generated bundle */ + xdst->num_pols = 0; + dst_free(&xdst->u.dst); + } + + /* Flow cache does not have reference, it dst_free()'s, + * but we do need to return one reference for original caller */ + dst_hold(&new_xdst->u.dst); + return &new_xdst->flo; + +make_dummy_bundle: + /* We found policies, but there's no bundles to instantiate: + * either because the policy blocks, has no transformations or + * we could not build template (no xfrm_states).*/ + xdst = xfrm_alloc_dst(net, family); + if (IS_ERR(xdst)) { + xfrm_pols_put(pols, num_pols); + return ERR_CAST(xdst); + } + xdst->num_pols = num_pols; + xdst->num_xfrms = num_xfrms; + memcpy(xdst->pols, pols, sizeof(struct xfrm_policy*) * num_pols); + + dst_hold(&xdst->u.dst); + return &xdst->flo; + +inc_error: + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); +error: + if (xdst != NULL) + dst_free(&xdst->u.dst); + else + xfrm_pols_put(pols, num_pols); + return ERR_PTR(err); +} + +static struct dst_entry *make_blackhole(struct net *net, u16 family, + struct dst_entry *dst_orig) +{ + struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + struct dst_entry *ret; + + if (!afinfo) { + dst_release(dst_orig); + ret = ERR_PTR(-EINVAL); + } else { + ret = afinfo->blackhole_route(net, dst_orig); + } + xfrm_policy_put_afinfo(afinfo); + + return ret; +} + +/* Main function: finds/creates a bundle for given flow. + * + * At the moment we eat a raw IP route. Mostly to speed up lookups + * on interfaces with disabled IPsec. + */ +struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, + const struct flowi *fl, + struct sock *sk, int flags) +{ + struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; + struct flow_cache_object *flo; + struct xfrm_dst *xdst; + struct dst_entry *dst, *route; + u16 family = dst_orig->ops->family; + u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT); + int i, err, num_pols, num_xfrms = 0, drop_pols = 0; + +restart: + dst = NULL; + xdst = NULL; + route = NULL; + + if (sk && sk->sk_policy[XFRM_POLICY_OUT]) { + num_pols = 1; + pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl); + err = xfrm_expand_policies(fl, family, pols, + &num_pols, &num_xfrms); + if (err < 0) + goto dropdst; + + if (num_pols) { + if (num_xfrms <= 0) { + drop_pols = num_pols; + goto no_transform; + } + + xdst = xfrm_resolve_and_create_bundle( + pols, num_pols, fl, + family, dst_orig); + if (IS_ERR(xdst)) { + xfrm_pols_put(pols, num_pols); + err = PTR_ERR(xdst); + goto dropdst; + } else if (xdst == NULL) { + num_xfrms = 0; + drop_pols = num_pols; + goto no_transform; + } + + dst_hold(&xdst->u.dst); + + spin_lock_bh(&xfrm_policy_sk_bundle_lock); + xdst->u.dst.next = xfrm_policy_sk_bundles; + xfrm_policy_sk_bundles = &xdst->u.dst; + spin_unlock_bh(&xfrm_policy_sk_bundle_lock); + + route = xdst->route; + } + } + + if (xdst == NULL) { + /* To accelerate a bit... 
*/ + if ((dst_orig->flags & DST_NOXFRM) || + !net->xfrm.policy_count[XFRM_POLICY_OUT]) + goto nopol; + + flo = flow_cache_lookup(net, fl, family, dir, + xfrm_bundle_lookup, dst_orig); + if (flo == NULL) + goto nopol; + if (IS_ERR(flo)) { + err = PTR_ERR(flo); + goto dropdst; + } + xdst = container_of(flo, struct xfrm_dst, flo); + + num_pols = xdst->num_pols; + num_xfrms = xdst->num_xfrms; + memcpy(pols, xdst->pols, sizeof(struct xfrm_policy*) * num_pols); + route = xdst->route; + } + + dst = &xdst->u.dst; + if (route == NULL && num_xfrms > 0) { + /* The only case when xfrm_bundle_lookup() returns a + * bundle with null route, is when the template could + * not be resolved. It means policies are there, but + * bundle could not be created, since we don't yet + * have the xfrm_state's. We need to wait for KM to + * negotiate new SA's or bail out with error.*/ + if (net->xfrm.sysctl_larval_drop) { + /* EREMOTE tells the caller to generate + * a one-shot blackhole route. */ + dst_release(dst); + xfrm_pols_put(pols, drop_pols); + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); + + return make_blackhole(net, family, dst_orig); + } + if (fl->flowi_flags & FLOWI_FLAG_CAN_SLEEP) { + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&net->xfrm.km_waitq, &wait); + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + set_current_state(TASK_RUNNING); + remove_wait_queue(&net->xfrm.km_waitq, &wait); + + if (!signal_pending(current)) { + dst_release(dst); + goto restart; + } + + err = -ERESTART; + } else + err = -EAGAIN; + + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); + goto error; + } + +no_transform: + if (num_pols == 0) + goto nopol; + + if ((flags & XFRM_LOOKUP_ICMP) && + !(pols[0]->flags & XFRM_POLICY_ICMP)) { + err = -ENOENT; + goto error; + } + + for (i = 0; i < num_pols; i++) + pols[i]->curlft.use_time = get_seconds(); + + if (num_xfrms < 0) { + /* Prohibit the flow */ + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK); + err = -EPERM; + goto error; + } else if (num_xfrms > 0) { + /* Flow transformed */ + dst_release(dst_orig); + } else { + /* Flow passes untransformed */ + dst_release(dst); + dst = dst_orig; + } +ok: + xfrm_pols_put(pols, drop_pols); + if (dst && dst->xfrm && + dst->xfrm->props.mode == XFRM_MODE_TUNNEL) + dst->flags |= DST_XFRM_TUNNEL; + return dst; + +nopol: + if (!(flags & XFRM_LOOKUP_ICMP)) { + dst = dst_orig; + goto ok; + } + err = -ENOENT; +error: + dst_release(dst); +dropdst: + dst_release(dst_orig); + xfrm_pols_put(pols, drop_pols); + return ERR_PTR(err); +} +EXPORT_SYMBOL(xfrm_lookup); + +static inline int +xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl) +{ + struct xfrm_state *x; + + if (!skb->sp || idx < 0 || idx >= skb->sp->len) + return 0; + x = skb->sp->xvec[idx]; + if (!x->type->reject) + return 0; + return x->type->reject(x, skb, fl); +} + +/* When skb is transformed back to its "native" form, we have to + * check policy restrictions. At the moment we make this in maximally + * stupid way. Shame on me. :-) Of course, connected sockets must + * have policy cached at them. 
+ */ + +static inline int +xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, + unsigned short family) +{ + if (xfrm_state_kern(x)) + return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, tmpl->encap_family); + return x->id.proto == tmpl->id.proto && + (x->id.spi == tmpl->id.spi || !tmpl->id.spi) && + (x->props.reqid == tmpl->reqid || !tmpl->reqid) && + x->props.mode == tmpl->mode && + (tmpl->allalgs || (tmpl->aalgos & (1<props.aalgo)) || + !(xfrm_id_proto_match(tmpl->id.proto, IPSEC_PROTO_ANY))) && + !(x->props.mode != XFRM_MODE_TRANSPORT && + xfrm_state_addr_cmp(tmpl, x, family)); +} + +/* + * 0 or more than 0 is returned when validation is succeeded (either bypass + * because of optional transport mode, or next index of the mathced secpath + * state with the template. + * -1 is returned when no matching template is found. + * Otherwise "-2 - errored_index" is returned. + */ +static inline int +xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int start, + unsigned short family) +{ + int idx = start; + + if (tmpl->optional) { + if (tmpl->mode == XFRM_MODE_TRANSPORT) + return start; + } else + start = -1; + for (; idx < sp->len; idx++) { + if (xfrm_state_ok(tmpl, sp->xvec[idx], family)) + return ++idx; + if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) { + if (start == -1) + start = -2-idx; + break; + } + } + return start; +} + +int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, + unsigned int family, int reverse) +{ + struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + int err; + + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + + afinfo->decode_session(skb, fl, reverse); + err = security_xfrm_decode_session(skb, &fl->flowi_secid); + xfrm_policy_put_afinfo(afinfo); + return err; +} +EXPORT_SYMBOL(__xfrm_decode_session); + +static inline int secpath_has_nontransport(const struct sec_path *sp, int k, int *idxp) +{ + for (; k < sp->len; k++) { + if (sp->xvec[k]->props.mode != XFRM_MODE_TRANSPORT) { + *idxp = k; + return 1; + } + } + + return 0; +} + +int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, + unsigned short family) +{ + struct net *net = dev_net(skb->dev); + struct xfrm_policy *pol; + struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; + int npols = 0; + int xfrm_nr; + int pi; + int reverse; + struct flowi fl; + u8 fl_dir; + int xerr_idx = -1; + + reverse = dir & ~XFRM_POLICY_MASK; + dir &= XFRM_POLICY_MASK; + fl_dir = policy_to_flow_dir(dir); + + if (__xfrm_decode_session(skb, &fl, family, reverse) < 0) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); + return 0; + } + + nf_nat_decode_session(skb, &fl, family); + + /* First, check used SA against their selectors. 
*/ + if (skb->sp) { + int i; + + for (i=skb->sp->len-1; i>=0; i--) { + struct xfrm_state *x = skb->sp->xvec[i]; + if (!xfrm_selector_match(&x->sel, &fl, family)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH); + return 0; + } + } + } + + pol = NULL; + if (sk && sk->sk_policy[dir]) { + pol = xfrm_sk_policy_lookup(sk, dir, &fl); + if (IS_ERR(pol)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); + return 0; + } + } + + if (!pol) { + struct flow_cache_object *flo; + + flo = flow_cache_lookup(net, &fl, family, fl_dir, + xfrm_policy_lookup, NULL); + if (IS_ERR_OR_NULL(flo)) + pol = ERR_CAST(flo); + else + pol = container_of(flo, struct xfrm_policy, flo); + } + + if (IS_ERR(pol)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); + return 0; + } + + if (!pol) { + if (skb->sp && secpath_has_nontransport(skb->sp, 0, &xerr_idx)) { + xfrm_secpath_reject(xerr_idx, skb, &fl); + XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS); + return 0; + } + return 1; + } + + pol->curlft.use_time = get_seconds(); + + pols[0] = pol; + npols ++; +#ifdef CONFIG_XFRM_SUB_POLICY + if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) { + pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, + &fl, family, + XFRM_POLICY_IN); + if (pols[1]) { + if (IS_ERR(pols[1])) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); + return 0; + } + pols[1]->curlft.use_time = get_seconds(); + npols ++; + } + } +#endif + + if (pol->action == XFRM_POLICY_ALLOW) { + struct sec_path *sp; + static struct sec_path dummy; + struct xfrm_tmpl *tp[XFRM_MAX_DEPTH]; + struct xfrm_tmpl *stp[XFRM_MAX_DEPTH]; + struct xfrm_tmpl **tpp = tp; + int ti = 0; + int i, k; + + if ((sp = skb->sp) == NULL) + sp = &dummy; + + for (pi = 0; pi < npols; pi++) { + if (pols[pi] != pol && + pols[pi]->action != XFRM_POLICY_ALLOW) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK); + goto reject; + } + if (ti + pols[pi]->xfrm_nr >= XFRM_MAX_DEPTH) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); + goto reject_error; + } + for (i = 0; i < pols[pi]->xfrm_nr; i++) + tpp[ti++] = &pols[pi]->xfrm_vec[i]; + } + xfrm_nr = ti; + if (npols > 1) { + xfrm_tmpl_sort(stp, tpp, xfrm_nr, family); + tpp = stp; + } + + /* For each tunnel xfrm, find the first matching tmpl. + * For each tmpl before that, find corresponding xfrm. + * Order is _important_. Later we will implement + * some barriers, but at the moment barriers + * are implied between each two transformations. 
+ */ + for (i = xfrm_nr-1, k = 0; i >= 0; i--) { + k = xfrm_policy_ok(tpp[i], sp, k, family); + if (k < 0) { + if (k < -1) + /* "-2 - errored_index" returned */ + xerr_idx = -(2+k); + XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH); + goto reject; + } + } + + if (secpath_has_nontransport(sp, k, &xerr_idx)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH); + goto reject; + } + + xfrm_pols_put(pols, npols); + return 1; + } + XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK); + +reject: + xfrm_secpath_reject(xerr_idx, skb, &fl); +reject_error: + xfrm_pols_put(pols, npols); + return 0; +} +EXPORT_SYMBOL(__xfrm_policy_check); + +int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) +{ + struct net *net = dev_net(skb->dev); + struct flowi fl; + struct dst_entry *dst; + int res = 1; + + if (xfrm_decode_session(skb, &fl, family) < 0) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR); + return 0; + } + + skb_dst_force(skb); + + dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, 0); + if (IS_ERR(dst)) { + res = 0; + dst = NULL; + } + skb_dst_set(skb, dst); + return res; +} +EXPORT_SYMBOL(__xfrm_route_forward); + +/* Optimize later using cookies and generation ids. */ + +static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) +{ + /* Code (such as __xfrm4_bundle_create()) sets dst->obsolete + * to "-1" to force all XFRM destinations to get validated by + * dst_ops->check on every use. We do this because when a + * normal route referenced by an XFRM dst is obsoleted we do + * not go looking around for all parent referencing XFRM dsts + * so that we can invalidate them. It is just too much work. + * Instead we make the checks here on every use. For example: + * + * XFRM dst A --> IPv4 dst X + * + * X is the "xdst->route" of A (X is also the "dst->path" of A + * in this example). If X is marked obsolete, "A" will not + * notice. That's what we are validating here via the + * stale_bundle() check. + * + * When a policy's bundle is pruned, we dst_free() the XFRM + * dst which causes it's ->obsolete field to be set to a + * positive non-zero integer. If an XFRM dst has been pruned + * like this, we want to force a new route lookup. + */ + if (dst->obsolete < 0 && !stale_bundle(dst)) + return dst; + + return NULL; +} + +static int stale_bundle(struct dst_entry *dst) +{ + return !xfrm_bundle_ok((struct xfrm_dst *)dst); +} + +void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev) +{ + while ((dst = dst->child) && dst->xfrm && dst->dev == dev) { + dst->dev = dev_net(dev)->loopback_dev; + dev_hold(dst->dev); + dev_put(dev); + } +} +EXPORT_SYMBOL(xfrm_dst_ifdown); + +static void xfrm_link_failure(struct sk_buff *skb) +{ + /* Impossible. Such dst must be popped before reaches point of failure. 
*/ +} + +static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst) +{ + if (dst) { + if (dst->obsolete) { + dst_release(dst); + dst = NULL; + } + } + return dst; +} + +static void __xfrm_garbage_collect(struct net *net) +{ + struct dst_entry *head, *next; + + flow_cache_flush(); + + spin_lock_bh(&xfrm_policy_sk_bundle_lock); + head = xfrm_policy_sk_bundles; + xfrm_policy_sk_bundles = NULL; + spin_unlock_bh(&xfrm_policy_sk_bundle_lock); + + while (head) { + next = head->next; + dst_free(head); + head = next; + } +} + +static void xfrm_init_pmtu(struct dst_entry *dst) +{ + do { + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + u32 pmtu, route_mtu_cached; + + pmtu = dst_mtu(dst->child); + xdst->child_mtu_cached = pmtu; + + pmtu = xfrm_state_mtu(dst->xfrm, pmtu); + + route_mtu_cached = dst_mtu(xdst->route); + xdst->route_mtu_cached = route_mtu_cached; + + if (pmtu > route_mtu_cached) + pmtu = route_mtu_cached; + + dst_metric_set(dst, RTAX_MTU, pmtu); + } while ((dst = dst->next)); +} + +/* Check that the bundle accepts the flow and its components are + * still valid. + */ + +static int xfrm_bundle_ok(struct xfrm_dst *first) +{ + struct dst_entry *dst = &first->u.dst; + struct xfrm_dst *last; + u32 mtu; + + if (!dst_check(dst->path, ((struct xfrm_dst *)dst)->path_cookie) || + (dst->dev && !netif_running(dst->dev))) + return 0; + + last = NULL; + + do { + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + + if (dst->xfrm->km.state != XFRM_STATE_VALID) + return 0; + if (xdst->xfrm_genid != dst->xfrm->genid) + return 0; + if (xdst->num_pols > 0 && + xdst->policy_genid != atomic_read(&xdst->pols[0]->genid)) + return 0; + + mtu = dst_mtu(dst->child); + if (xdst->child_mtu_cached != mtu) { + last = xdst; + xdst->child_mtu_cached = mtu; + } + + if (!dst_check(xdst->route, xdst->route_cookie)) + return 0; + mtu = dst_mtu(xdst->route); + if (xdst->route_mtu_cached != mtu) { + last = xdst; + xdst->route_mtu_cached = mtu; + } + + dst = dst->child; + } while (dst->xfrm); + + if (likely(!last)) + return 1; + + mtu = last->child_mtu_cached; + for (;;) { + dst = &last->u.dst; + + mtu = xfrm_state_mtu(dst->xfrm, mtu); + if (mtu > last->route_mtu_cached) + mtu = last->route_mtu_cached; + dst_metric_set(dst, RTAX_MTU, mtu); + + if (last == first) + break; + + last = (struct xfrm_dst *)last->u.dst.next; + last->child_mtu_cached = mtu; + } + + return 1; +} + +static unsigned int xfrm_default_advmss(const struct dst_entry *dst) +{ + return dst_metric_advmss(dst->path); +} + +static unsigned int xfrm_default_mtu(const struct dst_entry *dst) +{ + return dst_mtu(dst->path); +} + +int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) +{ + struct net *net; + int err = 0; + if (unlikely(afinfo == NULL)) + return -EINVAL; + if (unlikely(afinfo->family >= NPROTO)) + return -EAFNOSUPPORT; + write_lock_bh(&xfrm_policy_afinfo_lock); + if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL)) + err = -ENOBUFS; + else { + struct dst_ops *dst_ops = afinfo->dst_ops; + if (likely(dst_ops->kmem_cachep == NULL)) + dst_ops->kmem_cachep = xfrm_dst_cache; + if (likely(dst_ops->check == NULL)) + dst_ops->check = xfrm_dst_check; + if (likely(dst_ops->default_advmss == NULL)) + dst_ops->default_advmss = xfrm_default_advmss; + if (likely(dst_ops->default_mtu == NULL)) + dst_ops->default_mtu = xfrm_default_mtu; + if (likely(dst_ops->negative_advice == NULL)) + dst_ops->negative_advice = xfrm_negative_advice; + if (likely(dst_ops->link_failure == NULL)) + dst_ops->link_failure = xfrm_link_failure; + if 
(likely(afinfo->garbage_collect == NULL)) + afinfo->garbage_collect = __xfrm_garbage_collect; + xfrm_policy_afinfo[afinfo->family] = afinfo; + } + write_unlock_bh(&xfrm_policy_afinfo_lock); + + rtnl_lock(); + for_each_net(net) { + struct dst_ops *xfrm_dst_ops; + + switch (afinfo->family) { + case AF_INET: + xfrm_dst_ops = &net->xfrm.xfrm4_dst_ops; + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + xfrm_dst_ops = &net->xfrm.xfrm6_dst_ops; + break; +#endif + default: + BUG(); + } + *xfrm_dst_ops = *afinfo->dst_ops; + } + rtnl_unlock(); + + return err; +} +EXPORT_SYMBOL(xfrm_policy_register_afinfo); + +int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo) +{ + int err = 0; + if (unlikely(afinfo == NULL)) + return -EINVAL; + if (unlikely(afinfo->family >= NPROTO)) + return -EAFNOSUPPORT; + write_lock_bh(&xfrm_policy_afinfo_lock); + if (likely(xfrm_policy_afinfo[afinfo->family] != NULL)) { + if (unlikely(xfrm_policy_afinfo[afinfo->family] != afinfo)) + err = -EINVAL; + else { + struct dst_ops *dst_ops = afinfo->dst_ops; + xfrm_policy_afinfo[afinfo->family] = NULL; + dst_ops->kmem_cachep = NULL; + dst_ops->check = NULL; + dst_ops->negative_advice = NULL; + dst_ops->link_failure = NULL; + afinfo->garbage_collect = NULL; + } + } + write_unlock_bh(&xfrm_policy_afinfo_lock); + return err; +} +EXPORT_SYMBOL(xfrm_policy_unregister_afinfo); + +static void __net_init xfrm_dst_ops_init(struct net *net) +{ + struct xfrm_policy_afinfo *afinfo; + + read_lock_bh(&xfrm_policy_afinfo_lock); + afinfo = xfrm_policy_afinfo[AF_INET]; + if (afinfo) + net->xfrm.xfrm4_dst_ops = *afinfo->dst_ops; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + afinfo = xfrm_policy_afinfo[AF_INET6]; + if (afinfo) + net->xfrm.xfrm6_dst_ops = *afinfo->dst_ops; +#endif + read_unlock_bh(&xfrm_policy_afinfo_lock); +} + +static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family) +{ + struct xfrm_policy_afinfo *afinfo; + if (unlikely(family >= NPROTO)) + return NULL; + read_lock(&xfrm_policy_afinfo_lock); + afinfo = xfrm_policy_afinfo[family]; + if (unlikely(!afinfo)) + read_unlock(&xfrm_policy_afinfo_lock); + return afinfo; +} + +static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo) +{ + read_unlock(&xfrm_policy_afinfo_lock); +} + +static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + switch (event) { + case NETDEV_DOWN: + __xfrm_garbage_collect(dev_net(dev)); + } + return NOTIFY_DONE; +} + +static struct notifier_block xfrm_dev_notifier = { + .notifier_call = xfrm_dev_event, +}; + +#ifdef CONFIG_XFRM_STATISTICS +static int __net_init xfrm_statistics_init(struct net *net) +{ + int rv; + + if (snmp_mib_init((void __percpu **)net->mib.xfrm_statistics, + sizeof(struct linux_xfrm_mib), + __alignof__(struct linux_xfrm_mib)) < 0) + return -ENOMEM; + rv = xfrm_proc_init(net); + if (rv < 0) + snmp_mib_free((void __percpu **)net->mib.xfrm_statistics); + return rv; +} + +static void xfrm_statistics_fini(struct net *net) +{ + xfrm_proc_fini(net); + snmp_mib_free((void __percpu **)net->mib.xfrm_statistics); +} +#else +static int __net_init xfrm_statistics_init(struct net *net) +{ + return 0; +} + +static void xfrm_statistics_fini(struct net *net) +{ +} +#endif + +static int __net_init xfrm_policy_init(struct net *net) +{ + unsigned int hmask, sz; + int dir; + + if (net_eq(net, &init_net)) + xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache", + sizeof(struct xfrm_dst), + 0, 
SLAB_HWCACHE_ALIGN|SLAB_PANIC, + NULL); + + hmask = 8 - 1; + sz = (hmask+1) * sizeof(struct hlist_head); + + net->xfrm.policy_byidx = xfrm_hash_alloc(sz); + if (!net->xfrm.policy_byidx) + goto out_byidx; + net->xfrm.policy_idx_hmask = hmask; + + for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) { + struct xfrm_policy_hash *htab; + + net->xfrm.policy_count[dir] = 0; + INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]); + + htab = &net->xfrm.policy_bydst[dir]; + htab->table = xfrm_hash_alloc(sz); + if (!htab->table) + goto out_bydst; + htab->hmask = hmask; + } + + INIT_LIST_HEAD(&net->xfrm.policy_all); + INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize); + if (net_eq(net, &init_net)) + register_netdevice_notifier(&xfrm_dev_notifier); + return 0; + +out_bydst: + for (dir--; dir >= 0; dir--) { + struct xfrm_policy_hash *htab; + + htab = &net->xfrm.policy_bydst[dir]; + xfrm_hash_free(htab->table, sz); + } + xfrm_hash_free(net->xfrm.policy_byidx, sz); +out_byidx: + return -ENOMEM; +} + +static void xfrm_policy_fini(struct net *net) +{ + struct xfrm_audit audit_info; + unsigned int sz; + int dir; + + flush_work(&net->xfrm.policy_hash_work); +#ifdef CONFIG_XFRM_SUB_POLICY + audit_info.loginuid = -1; + audit_info.sessionid = -1; + audit_info.secid = 0; + xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, &audit_info); +#endif + audit_info.loginuid = -1; + audit_info.sessionid = -1; + audit_info.secid = 0; + xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, &audit_info); + + WARN_ON(!list_empty(&net->xfrm.policy_all)); + + for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) { + struct xfrm_policy_hash *htab; + + WARN_ON(!hlist_empty(&net->xfrm.policy_inexact[dir])); + + htab = &net->xfrm.policy_bydst[dir]; + sz = (htab->hmask + 1); + WARN_ON(!hlist_empty(htab->table)); + xfrm_hash_free(htab->table, sz); + } + + sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head); + WARN_ON(!hlist_empty(net->xfrm.policy_byidx)); + xfrm_hash_free(net->xfrm.policy_byidx, sz); +} + +static int __net_init xfrm_net_init(struct net *net) +{ + int rv; + + rv = xfrm_statistics_init(net); + if (rv < 0) + goto out_statistics; + rv = xfrm_state_init(net); + if (rv < 0) + goto out_state; + rv = xfrm_policy_init(net); + if (rv < 0) + goto out_policy; + xfrm_dst_ops_init(net); + rv = xfrm_sysctl_init(net); + if (rv < 0) + goto out_sysctl; + return 0; + +out_sysctl: + xfrm_policy_fini(net); +out_policy: + xfrm_state_fini(net); +out_state: + xfrm_statistics_fini(net); +out_statistics: + return rv; +} + +static void __net_exit xfrm_net_exit(struct net *net) +{ + xfrm_sysctl_fini(net); + xfrm_policy_fini(net); + xfrm_state_fini(net); + xfrm_statistics_fini(net); +} + +static struct pernet_operations __net_initdata xfrm_net_ops = { + .init = xfrm_net_init, + .exit = xfrm_net_exit, +}; + +void __init xfrm_init(void) +{ + register_pernet_subsys(&xfrm_net_ops); + xfrm_input_init(); +} + +#ifdef CONFIG_AUDITSYSCALL +static void xfrm_audit_common_policyinfo(struct xfrm_policy *xp, + struct audit_buffer *audit_buf) +{ + struct xfrm_sec_ctx *ctx = xp->security; + struct xfrm_selector *sel = &xp->selector; + + if (ctx) + audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s", + ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str); + + switch(sel->family) { + case AF_INET: + audit_log_format(audit_buf, " src=%pI4", &sel->saddr.a4); + if (sel->prefixlen_s != 32) + audit_log_format(audit_buf, " src_prefixlen=%d", + sel->prefixlen_s); + audit_log_format(audit_buf, " dst=%pI4", &sel->daddr.a4); + if (sel->prefixlen_d != 32) + 
audit_log_format(audit_buf, " dst_prefixlen=%d", + sel->prefixlen_d); + break; + case AF_INET6: + audit_log_format(audit_buf, " src=%pI6", sel->saddr.a6); + if (sel->prefixlen_s != 128) + audit_log_format(audit_buf, " src_prefixlen=%d", + sel->prefixlen_s); + audit_log_format(audit_buf, " dst=%pI6", sel->daddr.a6); + if (sel->prefixlen_d != 128) + audit_log_format(audit_buf, " dst_prefixlen=%d", + sel->prefixlen_d); + break; + } +} + +void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, + uid_t auid, u32 sessionid, u32 secid) +{ + struct audit_buffer *audit_buf; + + audit_buf = xfrm_audit_start("SPD-add"); + if (audit_buf == NULL) + return; + xfrm_audit_helper_usrinfo(auid, sessionid, secid, audit_buf); + audit_log_format(audit_buf, " res=%u", result); + xfrm_audit_common_policyinfo(xp, audit_buf); + audit_log_end(audit_buf); +} +EXPORT_SYMBOL_GPL(xfrm_audit_policy_add); + +void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result, + uid_t auid, u32 sessionid, u32 secid) +{ + struct audit_buffer *audit_buf; + + audit_buf = xfrm_audit_start("SPD-delete"); + if (audit_buf == NULL) + return; + xfrm_audit_helper_usrinfo(auid, sessionid, secid, audit_buf); + audit_log_format(audit_buf, " res=%u", result); + xfrm_audit_common_policyinfo(xp, audit_buf); + audit_log_end(audit_buf); +} +EXPORT_SYMBOL_GPL(xfrm_audit_policy_delete); +#endif + +#ifdef CONFIG_XFRM_MIGRATE +static int xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp, + const struct xfrm_selector *sel_tgt) +{ + if (sel_cmp->proto == IPSEC_ULPROTO_ANY) { + if (sel_tgt->family == sel_cmp->family && + xfrm_addr_cmp(&sel_tgt->daddr, &sel_cmp->daddr, + sel_cmp->family) == 0 && + xfrm_addr_cmp(&sel_tgt->saddr, &sel_cmp->saddr, + sel_cmp->family) == 0 && + sel_tgt->prefixlen_d == sel_cmp->prefixlen_d && + sel_tgt->prefixlen_s == sel_cmp->prefixlen_s) { + return 1; + } + } else { + if (memcmp(sel_tgt, sel_cmp, sizeof(*sel_tgt)) == 0) { + return 1; + } + } + return 0; +} + +static struct xfrm_policy * xfrm_migrate_policy_find(const struct xfrm_selector *sel, + u8 dir, u8 type) +{ + struct xfrm_policy *pol, *ret = NULL; + struct hlist_node *entry; + struct hlist_head *chain; + u32 priority = ~0U; + + read_lock_bh(&xfrm_policy_lock); + chain = policy_hash_direct(&init_net, &sel->daddr, &sel->saddr, sel->family, dir); + hlist_for_each_entry(pol, entry, chain, bydst) { + if (xfrm_migrate_selector_match(sel, &pol->selector) && + pol->type == type) { + ret = pol; + priority = ret->priority; + break; + } + } + chain = &init_net.xfrm.policy_inexact[dir]; + hlist_for_each_entry(pol, entry, chain, bydst) { + if (xfrm_migrate_selector_match(sel, &pol->selector) && + pol->type == type && + pol->priority < priority) { + ret = pol; + break; + } + } + + if (ret) + xfrm_pol_hold(ret); + + read_unlock_bh(&xfrm_policy_lock); + + return ret; +} + +static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tmpl *t) +{ + int match = 0; + + if (t->mode == m->mode && t->id.proto == m->proto && + (m->reqid == 0 || t->reqid == m->reqid)) { + switch (t->mode) { + case XFRM_MODE_TUNNEL: + case XFRM_MODE_BEET: + if (xfrm_addr_cmp(&t->id.daddr, &m->old_daddr, + m->old_family) == 0 && + xfrm_addr_cmp(&t->saddr, &m->old_saddr, + m->old_family) == 0) { + match = 1; + } + break; + case XFRM_MODE_TRANSPORT: + /* in case of transport mode, template does not store + any IP addresses, hence we just compare mode and + protocol */ + match = 1; + break; + default: + break; + } + } + return match; +} + +/* update endpoint address(es) of 
template(s) */ +static int xfrm_policy_migrate(struct xfrm_policy *pol, + struct xfrm_migrate *m, int num_migrate) +{ + struct xfrm_migrate *mp; + int i, j, n = 0; + + write_lock_bh(&pol->lock); + if (unlikely(pol->walk.dead)) { + /* target policy has been deleted */ + write_unlock_bh(&pol->lock); + return -ENOENT; + } + + for (i = 0; i < pol->xfrm_nr; i++) { + for (j = 0, mp = m; j < num_migrate; j++, mp++) { + if (!migrate_tmpl_match(mp, &pol->xfrm_vec[i])) + continue; + n++; + if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL && + pol->xfrm_vec[i].mode != XFRM_MODE_BEET) + continue; + /* update endpoints */ + memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr, + sizeof(pol->xfrm_vec[i].id.daddr)); + memcpy(&pol->xfrm_vec[i].saddr, &mp->new_saddr, + sizeof(pol->xfrm_vec[i].saddr)); + pol->xfrm_vec[i].encap_family = mp->new_family; + /* flush bundles */ + atomic_inc(&pol->genid); + } + } + + write_unlock_bh(&pol->lock); + + if (!n) + return -ENODATA; + + return 0; +} + +static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate) +{ + int i, j; + + if (num_migrate < 1 || num_migrate > XFRM_MAX_DEPTH) + return -EINVAL; + + for (i = 0; i < num_migrate; i++) { + if ((xfrm_addr_cmp(&m[i].old_daddr, &m[i].new_daddr, + m[i].old_family) == 0) && + (xfrm_addr_cmp(&m[i].old_saddr, &m[i].new_saddr, + m[i].old_family) == 0)) + return -EINVAL; + if (xfrm_addr_any(&m[i].new_daddr, m[i].new_family) || + xfrm_addr_any(&m[i].new_saddr, m[i].new_family)) + return -EINVAL; + + /* check if there is any duplicated entry */ + for (j = i + 1; j < num_migrate; j++) { + if (!memcmp(&m[i].old_daddr, &m[j].old_daddr, + sizeof(m[i].old_daddr)) && + !memcmp(&m[i].old_saddr, &m[j].old_saddr, + sizeof(m[i].old_saddr)) && + m[i].proto == m[j].proto && + m[i].mode == m[j].mode && + m[i].reqid == m[j].reqid && + m[i].old_family == m[j].old_family) + return -EINVAL; + } + } + + return 0; +} + +int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, + struct xfrm_migrate *m, int num_migrate, + struct xfrm_kmaddress *k) +{ + int i, err, nx_cur = 0, nx_new = 0; + struct xfrm_policy *pol = NULL; + struct xfrm_state *x, *xc; + struct xfrm_state *x_cur[XFRM_MAX_DEPTH]; + struct xfrm_state *x_new[XFRM_MAX_DEPTH]; + struct xfrm_migrate *mp; + + if ((err = xfrm_migrate_check(m, num_migrate)) < 0) + goto out; + + /* Stage 1 - find policy */ + if ((pol = xfrm_migrate_policy_find(sel, dir, type)) == NULL) { + err = -ENOENT; + goto out; + } + + /* Stage 2 - find and update state(s) */ + for (i = 0, mp = m; i < num_migrate; i++, mp++) { + if ((x = xfrm_migrate_state_find(mp))) { + x_cur[nx_cur] = x; + nx_cur++; + if ((xc = xfrm_state_migrate(x, mp))) { + x_new[nx_new] = xc; + nx_new++; + } else { + err = -ENODATA; + goto restore_state; + } + } + } + + /* Stage 3 - update policy */ + if ((err = xfrm_policy_migrate(pol, m, num_migrate)) < 0) + goto restore_state; + + /* Stage 4 - delete old state(s) */ + if (nx_cur) { + xfrm_states_put(x_cur, nx_cur); + xfrm_states_delete(x_cur, nx_cur); + } + + /* Stage 5 - announce */ + km_migrate(sel, dir, type, m, num_migrate, k); + + xfrm_pol_put(pol); + + return 0; +out: + return err; + +restore_state: + if (pol) + xfrm_pol_put(pol); + if (nx_cur) + xfrm_states_put(x_cur, nx_cur); + if (nx_new) + xfrm_states_delete(x_new, nx_new); + + return err; +} +EXPORT_SYMBOL(xfrm_migrate); +#endif diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c new file mode 100644 index 00000000..58d9ae00 --- /dev/null +++ b/net/xfrm/xfrm_proc.c @@ -0,0 +1,84 @@ +/* + * xfrm_proc.c + * + * 
Copyright (C)2006-2007 USAGI/WIDE Project + * + * Authors: Masahide NAKAMURA + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include + +static const struct snmp_mib xfrm_mib_list[] = { + SNMP_MIB_ITEM("XfrmInError", LINUX_MIB_XFRMINERROR), + SNMP_MIB_ITEM("XfrmInBufferError", LINUX_MIB_XFRMINBUFFERERROR), + SNMP_MIB_ITEM("XfrmInHdrError", LINUX_MIB_XFRMINHDRERROR), + SNMP_MIB_ITEM("XfrmInNoStates", LINUX_MIB_XFRMINNOSTATES), + SNMP_MIB_ITEM("XfrmInStateProtoError", LINUX_MIB_XFRMINSTATEPROTOERROR), + SNMP_MIB_ITEM("XfrmInStateModeError", LINUX_MIB_XFRMINSTATEMODEERROR), + SNMP_MIB_ITEM("XfrmInStateSeqError", LINUX_MIB_XFRMINSTATESEQERROR), + SNMP_MIB_ITEM("XfrmInStateExpired", LINUX_MIB_XFRMINSTATEEXPIRED), + SNMP_MIB_ITEM("XfrmInStateMismatch", LINUX_MIB_XFRMINSTATEMISMATCH), + SNMP_MIB_ITEM("XfrmInStateInvalid", LINUX_MIB_XFRMINSTATEINVALID), + SNMP_MIB_ITEM("XfrmInTmplMismatch", LINUX_MIB_XFRMINTMPLMISMATCH), + SNMP_MIB_ITEM("XfrmInNoPols", LINUX_MIB_XFRMINNOPOLS), + SNMP_MIB_ITEM("XfrmInPolBlock", LINUX_MIB_XFRMINPOLBLOCK), + SNMP_MIB_ITEM("XfrmInPolError", LINUX_MIB_XFRMINPOLERROR), + SNMP_MIB_ITEM("XfrmOutError", LINUX_MIB_XFRMOUTERROR), + SNMP_MIB_ITEM("XfrmOutBundleGenError", LINUX_MIB_XFRMOUTBUNDLEGENERROR), + SNMP_MIB_ITEM("XfrmOutBundleCheckError", LINUX_MIB_XFRMOUTBUNDLECHECKERROR), + SNMP_MIB_ITEM("XfrmOutNoStates", LINUX_MIB_XFRMOUTNOSTATES), + SNMP_MIB_ITEM("XfrmOutStateProtoError", LINUX_MIB_XFRMOUTSTATEPROTOERROR), + SNMP_MIB_ITEM("XfrmOutStateModeError", LINUX_MIB_XFRMOUTSTATEMODEERROR), + SNMP_MIB_ITEM("XfrmOutStateSeqError", LINUX_MIB_XFRMOUTSTATESEQERROR), + SNMP_MIB_ITEM("XfrmOutStateExpired", LINUX_MIB_XFRMOUTSTATEEXPIRED), + SNMP_MIB_ITEM("XfrmOutPolBlock", LINUX_MIB_XFRMOUTPOLBLOCK), + SNMP_MIB_ITEM("XfrmOutPolDead", LINUX_MIB_XFRMOUTPOLDEAD), + SNMP_MIB_ITEM("XfrmOutPolError", LINUX_MIB_XFRMOUTPOLERROR), + SNMP_MIB_ITEM("XfrmFwdHdrError", LINUX_MIB_XFRMFWDHDRERROR), + SNMP_MIB_SENTINEL +}; + +static int xfrm_statistics_seq_show(struct seq_file *seq, void *v) +{ + struct net *net = seq->private; + int i; + for (i=0; xfrm_mib_list[i].name; i++) + seq_printf(seq, "%-24s\t%lu\n", xfrm_mib_list[i].name, + snmp_fold_field((void __percpu **) + net->mib.xfrm_statistics, + xfrm_mib_list[i].entry)); + return 0; +} + +static int xfrm_statistics_seq_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, xfrm_statistics_seq_show); +} + +static const struct file_operations xfrm_statistics_seq_fops = { + .owner = THIS_MODULE, + .open = xfrm_statistics_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; + +int __net_init xfrm_proc_init(struct net *net) +{ + if (!proc_net_fops_create(net, "xfrm_stat", S_IRUGO, + &xfrm_statistics_seq_fops)) + return -ENOMEM; + return 0; +} + +void xfrm_proc_fini(struct net *net) +{ + proc_net_remove(net, "xfrm_stat"); +} diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c new file mode 100644 index 00000000..3235023e --- /dev/null +++ b/net/xfrm/xfrm_replay.c @@ -0,0 +1,550 @@ +/* + * xfrm_replay.c - xfrm replay detection, derived from xfrm_state.c. 
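+ *
+ * Three struct xfrm_replay implementations live here - the legacy
+ * 32-bit window (struct xfrm_replay_state), a plain bitmap window and
+ * an ESN-capable one - and xfrm_init_replay() picks one per state.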
+ * + * Copyright (C) 2010 secunet Security Networks AG + * Copyright (C) 2010 Steffen Klassert + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include + +u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq) +{ + u32 seq, seq_hi, bottom; + struct xfrm_replay_state_esn *replay_esn = x->replay_esn; + + if (!(x->props.flags & XFRM_STATE_ESN)) + return 0; + + seq = ntohl(net_seq); + seq_hi = replay_esn->seq_hi; + bottom = replay_esn->seq - replay_esn->replay_window + 1; + + if (likely(replay_esn->seq >= replay_esn->replay_window - 1)) { + /* A. same subspace */ + if (unlikely(seq < bottom)) + seq_hi++; + } else { + /* B. window spans two subspaces */ + if (unlikely(seq >= bottom)) + seq_hi--; + } + + return seq_hi; +} + +static void xfrm_replay_notify(struct xfrm_state *x, int event) +{ + struct km_event c; + /* we send notify messages in case + * 1. we updated on of the sequence numbers, and the seqno difference + * is at least x->replay_maxdiff, in this case we also update the + * timeout of our timer function + * 2. if x->replay_maxage has elapsed since last update, + * and there were changes + * + * The state structure must be locked! 
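+ *
+ * In particular: with replay_maxdiff == 0 the threshold test below never
+ * defers an XFRM_REPLAY_UPDATE, so every update is reported to the key
+ * manager, and with replay_maxage == 0 the timer at the end of this
+ * function is never re-armed.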
+ */ + + switch (event) { + case XFRM_REPLAY_UPDATE: + if (x->replay_maxdiff && + (x->replay.seq - x->preplay.seq < x->replay_maxdiff) && + (x->replay.oseq - x->preplay.oseq < x->replay_maxdiff)) { + if (x->xflags & XFRM_TIME_DEFER) + event = XFRM_REPLAY_TIMEOUT; + else + return; + } + + break; + + case XFRM_REPLAY_TIMEOUT: + if (memcmp(&x->replay, &x->preplay, + sizeof(struct xfrm_replay_state)) == 0) { + x->xflags |= XFRM_TIME_DEFER; + return; + } + + break; + } + + memcpy(&x->preplay, &x->replay, sizeof(struct xfrm_replay_state)); + c.event = XFRM_MSG_NEWAE; + c.data.aevent = event; + km_state_notify(x, &c); + + if (x->replay_maxage && + !mod_timer(&x->rtimer, jiffies + x->replay_maxage)) + x->xflags &= ~XFRM_TIME_DEFER; +} + +static int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) +{ + int err = 0; + struct net *net = xs_net(x); + + if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { + XFRM_SKB_CB(skb)->seq.output.low = ++x->replay.oseq; + if (unlikely(x->replay.oseq == 0)) { + x->replay.oseq--; + xfrm_audit_state_replay_overflow(x, skb); + err = -EOVERFLOW; + + return err; + } + if (xfrm_aevent_is_on(net)) + x->repl->notify(x, XFRM_REPLAY_UPDATE); + } + + return err; +} + +static int xfrm_replay_check(struct xfrm_state *x, + struct sk_buff *skb, __be32 net_seq) +{ + u32 diff; + u32 seq = ntohl(net_seq); + + if (!x->props.replay_window) + return 0; + + if (unlikely(seq == 0)) + goto err; + + if (likely(seq > x->replay.seq)) + return 0; + + diff = x->replay.seq - seq; + if (diff >= min_t(unsigned int, x->props.replay_window, + sizeof(x->replay.bitmap) * 8)) { + x->stats.replay_window++; + goto err; + } + + if (x->replay.bitmap & (1U << diff)) { + x->stats.replay++; + goto err; + } + return 0; + +err: + xfrm_audit_state_replay(x, skb, net_seq); + return -EINVAL; +} + +static void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq) +{ + u32 diff; + u32 seq = ntohl(net_seq); + + if (!x->props.replay_window) + return; + + if (seq > x->replay.seq) { + diff = seq - x->replay.seq; + if (diff < x->props.replay_window) + x->replay.bitmap = ((x->replay.bitmap) << diff) | 1; + else + x->replay.bitmap = 1; + x->replay.seq = seq; + } else { + diff = x->replay.seq - seq; + x->replay.bitmap |= (1U << diff); + } + + if (xfrm_aevent_is_on(xs_net(x))) + x->repl->notify(x, XFRM_REPLAY_UPDATE); +} + +static int xfrm_replay_overflow_bmp(struct xfrm_state *x, struct sk_buff *skb) +{ + int err = 0; + struct xfrm_replay_state_esn *replay_esn = x->replay_esn; + struct net *net = xs_net(x); + + if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { + XFRM_SKB_CB(skb)->seq.output.low = ++replay_esn->oseq; + if (unlikely(replay_esn->oseq == 0)) { + replay_esn->oseq--; + xfrm_audit_state_replay_overflow(x, skb); + err = -EOVERFLOW; + + return err; + } + if (xfrm_aevent_is_on(net)) + x->repl->notify(x, XFRM_REPLAY_UPDATE); + } + + return err; +} + +static int xfrm_replay_check_bmp(struct xfrm_state *x, + struct sk_buff *skb, __be32 net_seq) +{ + unsigned int bitnr, nr; + struct xfrm_replay_state_esn *replay_esn = x->replay_esn; + u32 pos; + u32 seq = ntohl(net_seq); + u32 diff = replay_esn->seq - seq; + + if (!replay_esn->replay_window) + return 0; + + pos = (replay_esn->seq - 1) % replay_esn->replay_window; + + if (unlikely(seq == 0)) + goto err; + + if (likely(seq > replay_esn->seq)) + return 0; + + if (diff >= replay_esn->replay_window) { + x->stats.replay_window++; + goto err; + } + + if (pos >= diff) { + bitnr = (pos - diff) % replay_esn->replay_window; + nr = bitnr >> 5; + bitnr = bitnr & 0x1F; + if 
(replay_esn->bmp[nr] & (1U << bitnr)) + goto err_replay; + } else { + bitnr = replay_esn->replay_window - (diff - pos); + nr = bitnr >> 5; + bitnr = bitnr & 0x1F; + if (replay_esn->bmp[nr] & (1U << bitnr)) + goto err_replay; + } + return 0; + +err_replay: + x->stats.replay++; +err: + xfrm_audit_state_replay(x, skb, net_seq); + return -EINVAL; +} + +static void xfrm_replay_advance_bmp(struct xfrm_state *x, __be32 net_seq) +{ + unsigned int bitnr, nr, i; + u32 diff; + struct xfrm_replay_state_esn *replay_esn = x->replay_esn; + u32 seq = ntohl(net_seq); + u32 pos = (replay_esn->seq - 1) % replay_esn->replay_window; + + if (!replay_esn->replay_window) + return; + + if (seq > replay_esn->seq) { + diff = seq - replay_esn->seq; + + if (diff < replay_esn->replay_window) { + for (i = 1; i < diff; i++) { + bitnr = (pos + i) % replay_esn->replay_window; + nr = bitnr >> 5; + bitnr = bitnr & 0x1F; + replay_esn->bmp[nr] &= ~(1U << bitnr); + } + + bitnr = (pos + diff) % replay_esn->replay_window; + nr = bitnr >> 5; + bitnr = bitnr & 0x1F; + replay_esn->bmp[nr] |= (1U << bitnr); + } else { + nr = (replay_esn->replay_window - 1) >> 5; + for (i = 0; i <= nr; i++) + replay_esn->bmp[i] = 0; + + bitnr = (pos + diff) % replay_esn->replay_window; + nr = bitnr >> 5; + bitnr = bitnr & 0x1F; + replay_esn->bmp[nr] |= (1U << bitnr); + } + + replay_esn->seq = seq; + } else { + diff = replay_esn->seq - seq; + + if (pos >= diff) { + bitnr = (pos - diff) % replay_esn->replay_window; + nr = bitnr >> 5; + bitnr = bitnr & 0x1F; + replay_esn->bmp[nr] |= (1U << bitnr); + } else { + bitnr = replay_esn->replay_window - (diff - pos); + nr = bitnr >> 5; + bitnr = bitnr & 0x1F; + replay_esn->bmp[nr] |= (1U << bitnr); + } + } + + if (xfrm_aevent_is_on(xs_net(x))) + x->repl->notify(x, XFRM_REPLAY_UPDATE); +} + +static void xfrm_replay_notify_bmp(struct xfrm_state *x, int event) +{ + struct km_event c; + struct xfrm_replay_state_esn *replay_esn = x->replay_esn; + struct xfrm_replay_state_esn *preplay_esn = x->preplay_esn; + + /* we send notify messages in case + * 1. we updated on of the sequence numbers, and the seqno difference + * is at least x->replay_maxdiff, in this case we also update the + * timeout of our timer function + * 2. if x->replay_maxage has elapsed since last update, + * and there were changes + * + * The state structure must be locked! 
+ */ + + switch (event) { + case XFRM_REPLAY_UPDATE: + if (x->replay_maxdiff && + (replay_esn->seq - preplay_esn->seq < x->replay_maxdiff) && + (replay_esn->oseq - preplay_esn->oseq < x->replay_maxdiff)) { + if (x->xflags & XFRM_TIME_DEFER) + event = XFRM_REPLAY_TIMEOUT; + else + return; + } + + break; + + case XFRM_REPLAY_TIMEOUT: + if (memcmp(x->replay_esn, x->preplay_esn, + xfrm_replay_state_esn_len(replay_esn)) == 0) { + x->xflags |= XFRM_TIME_DEFER; + return; + } + + break; + } + + memcpy(x->preplay_esn, x->replay_esn, + xfrm_replay_state_esn_len(replay_esn)); + c.event = XFRM_MSG_NEWAE; + c.data.aevent = event; + km_state_notify(x, &c); + + if (x->replay_maxage && + !mod_timer(&x->rtimer, jiffies + x->replay_maxage)) + x->xflags &= ~XFRM_TIME_DEFER; +} + +static int xfrm_replay_overflow_esn(struct xfrm_state *x, struct sk_buff *skb) +{ + int err = 0; + struct xfrm_replay_state_esn *replay_esn = x->replay_esn; + struct net *net = xs_net(x); + + if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { + XFRM_SKB_CB(skb)->seq.output.low = ++replay_esn->oseq; + XFRM_SKB_CB(skb)->seq.output.hi = replay_esn->oseq_hi; + + if (unlikely(replay_esn->oseq == 0)) { + XFRM_SKB_CB(skb)->seq.output.hi = ++replay_esn->oseq_hi; + + if (replay_esn->oseq_hi == 0) { + replay_esn->oseq--; + replay_esn->oseq_hi--; + xfrm_audit_state_replay_overflow(x, skb); + err = -EOVERFLOW; + + return err; + } + } + if (xfrm_aevent_is_on(net)) + x->repl->notify(x, XFRM_REPLAY_UPDATE); + } + + return err; +} + +static int xfrm_replay_check_esn(struct xfrm_state *x, + struct sk_buff *skb, __be32 net_seq) +{ + unsigned int bitnr, nr; + u32 diff; + struct xfrm_replay_state_esn *replay_esn = x->replay_esn; + u32 pos; + u32 seq = ntohl(net_seq); + u32 wsize = replay_esn->replay_window; + u32 top = replay_esn->seq; + u32 bottom = top - wsize + 1; + + if (!wsize) + return 0; + + pos = (replay_esn->seq - 1) % replay_esn->replay_window; + + if (unlikely(seq == 0 && replay_esn->seq_hi == 0 && + (replay_esn->seq < replay_esn->replay_window - 1))) + goto err; + + diff = top - seq; + + if (likely(top >= wsize - 1)) { + /* A. same subspace */ + if (likely(seq > top) || seq < bottom) + return 0; + } else { + /* B. 
window spans two subspaces */ + if (likely(seq > top && seq < bottom)) + return 0; + if (seq >= bottom) + diff = ~seq + top + 1; + } + + if (diff >= replay_esn->replay_window) { + x->stats.replay_window++; + goto err; + } + + if (pos >= diff) { + bitnr = (pos - diff) % replay_esn->replay_window; + nr = bitnr >> 5; + bitnr = bitnr & 0x1F; + if (replay_esn->bmp[nr] & (1U << bitnr)) + goto err_replay; + } else { + bitnr = replay_esn->replay_window - (diff - pos); + nr = bitnr >> 5; + bitnr = bitnr & 0x1F; + if (replay_esn->bmp[nr] & (1U << bitnr)) + goto err_replay; + } + return 0; + +err_replay: + x->stats.replay++; +err: + xfrm_audit_state_replay(x, skb, net_seq); + return -EINVAL; +} + +static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) +{ + unsigned int bitnr, nr, i; + int wrap; + u32 diff, pos, seq, seq_hi; + struct xfrm_replay_state_esn *replay_esn = x->replay_esn; + + if (!replay_esn->replay_window) + return; + + seq = ntohl(net_seq); + pos = (replay_esn->seq - 1) % replay_esn->replay_window; + seq_hi = xfrm_replay_seqhi(x, net_seq); + wrap = seq_hi - replay_esn->seq_hi; + + if ((!wrap && seq > replay_esn->seq) || wrap > 0) { + if (likely(!wrap)) + diff = seq - replay_esn->seq; + else + diff = ~replay_esn->seq + seq + 1; + + if (diff < replay_esn->replay_window) { + for (i = 1; i < diff; i++) { + bitnr = (pos + i) % replay_esn->replay_window; + nr = bitnr >> 5; + bitnr = bitnr & 0x1F; + replay_esn->bmp[nr] &= ~(1U << bitnr); + } + + bitnr = (pos + diff) % replay_esn->replay_window; + nr = bitnr >> 5; + bitnr = bitnr & 0x1F; + replay_esn->bmp[nr] |= (1U << bitnr); + } else { + nr = (replay_esn->replay_window - 1) >> 5; + for (i = 0; i <= nr; i++) + replay_esn->bmp[i] = 0; + + bitnr = (pos + diff) % replay_esn->replay_window; + nr = bitnr >> 5; + bitnr = bitnr & 0x1F; + replay_esn->bmp[nr] |= (1U << bitnr); + } + + replay_esn->seq = seq; + + if (unlikely(wrap > 0)) + replay_esn->seq_hi++; + } else { + diff = replay_esn->seq - seq; + + if (pos >= diff) { + bitnr = (pos - diff) % replay_esn->replay_window; + nr = bitnr >> 5; + bitnr = bitnr & 0x1F; + replay_esn->bmp[nr] |= (1U << bitnr); + } else { + bitnr = replay_esn->replay_window - (diff - pos); + nr = bitnr >> 5; + bitnr = bitnr & 0x1F; + replay_esn->bmp[nr] |= (1U << bitnr); + } + } + + if (xfrm_aevent_is_on(xs_net(x))) + x->repl->notify(x, XFRM_REPLAY_UPDATE); +} + +static struct xfrm_replay xfrm_replay_legacy = { + .advance = xfrm_replay_advance, + .check = xfrm_replay_check, + .notify = xfrm_replay_notify, + .overflow = xfrm_replay_overflow, +}; + +static struct xfrm_replay xfrm_replay_bmp = { + .advance = xfrm_replay_advance_bmp, + .check = xfrm_replay_check_bmp, + .notify = xfrm_replay_notify_bmp, + .overflow = xfrm_replay_overflow_bmp, +}; + +static struct xfrm_replay xfrm_replay_esn = { + .advance = xfrm_replay_advance_esn, + .check = xfrm_replay_check_esn, + .notify = xfrm_replay_notify_bmp, + .overflow = xfrm_replay_overflow_esn, +}; + +int xfrm_init_replay(struct xfrm_state *x) +{ + struct xfrm_replay_state_esn *replay_esn = x->replay_esn; + + if (replay_esn) { + if (replay_esn->replay_window > + replay_esn->bmp_len * sizeof(__u32) * 8) + return -EINVAL; + + if ((x->props.flags & XFRM_STATE_ESN) && replay_esn->replay_window == 0) + return -EINVAL; + + if ((x->props.flags & XFRM_STATE_ESN) && x->replay_esn) + x->repl = &xfrm_replay_esn; + else + x->repl = &xfrm_replay_bmp; + } else + x->repl = &xfrm_replay_legacy; + + return 0; +} +EXPORT_SYMBOL(xfrm_init_replay); diff --git a/net/xfrm/xfrm_state.c 
b/net/xfrm/xfrm_state.c new file mode 100644 index 00000000..9414b9c5 --- /dev/null +++ b/net/xfrm/xfrm_state.c @@ -0,0 +1,2231 @@ +/* + * xfrm_state.c + * + * Changes: + * Mitsuru KANDA @USAGI + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro + * IPv6 support + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific functions + * Derek Atkins + * Add UDP Encapsulation + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xfrm_hash.h" + +/* Each xfrm_state may be linked to two tables: + + 1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. (input,ctl) + 2. Hash table by (daddr,family,reqid) to find what SAs exist for given + destination/tunnel endpoint. (output) + */ + +static DEFINE_SPINLOCK(xfrm_state_lock); + +static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024; + +static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family); +static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo); + +static inline unsigned int xfrm_dst_hash(struct net *net, + const xfrm_address_t *daddr, + const xfrm_address_t *saddr, + u32 reqid, + unsigned short family) +{ + return __xfrm_dst_hash(daddr, saddr, reqid, family, net->xfrm.state_hmask); +} + +static inline unsigned int xfrm_src_hash(struct net *net, + const xfrm_address_t *daddr, + const xfrm_address_t *saddr, + unsigned short family) +{ + return __xfrm_src_hash(daddr, saddr, family, net->xfrm.state_hmask); +} + +static inline unsigned int +xfrm_spi_hash(struct net *net, const xfrm_address_t *daddr, + __be32 spi, u8 proto, unsigned short family) +{ + return __xfrm_spi_hash(daddr, spi, proto, family, net->xfrm.state_hmask); +} + +static void xfrm_hash_transfer(struct hlist_head *list, + struct hlist_head *ndsttable, + struct hlist_head *nsrctable, + struct hlist_head *nspitable, + unsigned int nhashmask) +{ + struct hlist_node *entry, *tmp; + struct xfrm_state *x; + + hlist_for_each_entry_safe(x, entry, tmp, list, bydst) { + unsigned int h; + + h = __xfrm_dst_hash(&x->id.daddr, &x->props.saddr, + x->props.reqid, x->props.family, + nhashmask); + hlist_add_head(&x->bydst, ndsttable+h); + + h = __xfrm_src_hash(&x->id.daddr, &x->props.saddr, + x->props.family, + nhashmask); + hlist_add_head(&x->bysrc, nsrctable+h); + + if (x->id.spi) { + h = __xfrm_spi_hash(&x->id.daddr, x->id.spi, + x->id.proto, x->props.family, + nhashmask); + hlist_add_head(&x->byspi, nspitable+h); + } + } +} + +static unsigned long xfrm_hash_new_size(unsigned int state_hmask) +{ + return ((state_hmask + 1) << 1) * sizeof(struct hlist_head); +} + +static DEFINE_MUTEX(hash_resize_mutex); + +static void xfrm_hash_resize(struct work_struct *work) +{ + struct net *net = container_of(work, struct net, xfrm.state_hash_work); + struct hlist_head *ndst, *nsrc, *nspi, *odst, *osrc, *ospi; + unsigned long nsize, osize; + unsigned int nhashmask, ohashmask; + int i; + + mutex_lock(&hash_resize_mutex); + + nsize = xfrm_hash_new_size(net->xfrm.state_hmask); + ndst = xfrm_hash_alloc(nsize); + if (!ndst) + goto out_unlock; + nsrc = xfrm_hash_alloc(nsize); + if (!nsrc) { + xfrm_hash_free(ndst, nsize); + goto out_unlock; + } + nspi = xfrm_hash_alloc(nsize); + if (!nspi) { + xfrm_hash_free(ndst, nsize); + xfrm_hash_free(nsrc, nsize); + goto out_unlock; + } + + spin_lock_bh(&xfrm_state_lock); + + nhashmask = (nsize / sizeof(struct hlist_head)) - 1U; + for (i = net->xfrm.state_hmask; i >= 0; i--) + xfrm_hash_transfer(net->xfrm.state_bydst+i, ndst, nsrc, nspi, + nhashmask); + 
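+ /* Everything has been rehashed into the doubled tables at this point;
+  * the pointer swap below still runs under xfrm_state_lock and the old
+  * tables are freed only after the lock is dropped.  As a worked
+  * example, growing from state_hmask 7 (8 buckets) gives
+  * nsize = 16 * sizeof(struct hlist_head) and hence nhashmask = 15.
+  */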
+ odst = net->xfrm.state_bydst; + osrc = net->xfrm.state_bysrc; + ospi = net->xfrm.state_byspi; + ohashmask = net->xfrm.state_hmask; + + net->xfrm.state_bydst = ndst; + net->xfrm.state_bysrc = nsrc; + net->xfrm.state_byspi = nspi; + net->xfrm.state_hmask = nhashmask; + + spin_unlock_bh(&xfrm_state_lock); + + osize = (ohashmask + 1) * sizeof(struct hlist_head); + xfrm_hash_free(odst, osize); + xfrm_hash_free(osrc, osize); + xfrm_hash_free(ospi, osize); + +out_unlock: + mutex_unlock(&hash_resize_mutex); +} + +static DEFINE_RWLOCK(xfrm_state_afinfo_lock); +static struct xfrm_state_afinfo *xfrm_state_afinfo[NPROTO]; + +static DEFINE_SPINLOCK(xfrm_state_gc_lock); + +int __xfrm_state_delete(struct xfrm_state *x); + +int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol); +void km_state_expired(struct xfrm_state *x, int hard, u32 pid); + +static struct xfrm_state_afinfo *xfrm_state_lock_afinfo(unsigned int family) +{ + struct xfrm_state_afinfo *afinfo; + if (unlikely(family >= NPROTO)) + return NULL; + write_lock_bh(&xfrm_state_afinfo_lock); + afinfo = xfrm_state_afinfo[family]; + if (unlikely(!afinfo)) + write_unlock_bh(&xfrm_state_afinfo_lock); + return afinfo; +} + +static void xfrm_state_unlock_afinfo(struct xfrm_state_afinfo *afinfo) + __releases(xfrm_state_afinfo_lock) +{ + write_unlock_bh(&xfrm_state_afinfo_lock); +} + +int xfrm_register_type(const struct xfrm_type *type, unsigned short family) +{ + struct xfrm_state_afinfo *afinfo = xfrm_state_lock_afinfo(family); + const struct xfrm_type **typemap; + int err = 0; + + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + typemap = afinfo->type_map; + + if (likely(typemap[type->proto] == NULL)) + typemap[type->proto] = type; + else + err = -EEXIST; + xfrm_state_unlock_afinfo(afinfo); + return err; +} +EXPORT_SYMBOL(xfrm_register_type); + +int xfrm_unregister_type(const struct xfrm_type *type, unsigned short family) +{ + struct xfrm_state_afinfo *afinfo = xfrm_state_lock_afinfo(family); + const struct xfrm_type **typemap; + int err = 0; + + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + typemap = afinfo->type_map; + + if (unlikely(typemap[type->proto] != type)) + err = -ENOENT; + else + typemap[type->proto] = NULL; + xfrm_state_unlock_afinfo(afinfo); + return err; +} +EXPORT_SYMBOL(xfrm_unregister_type); + +static const struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family) +{ + struct xfrm_state_afinfo *afinfo; + const struct xfrm_type **typemap; + const struct xfrm_type *type; + int modload_attempted = 0; + +retry: + afinfo = xfrm_state_get_afinfo(family); + if (unlikely(afinfo == NULL)) + return NULL; + typemap = afinfo->type_map; + + type = typemap[proto]; + if (unlikely(type && !try_module_get(type->owner))) + type = NULL; + if (!type && !modload_attempted) { + xfrm_state_put_afinfo(afinfo); + request_module("xfrm-type-%d-%d", family, proto); + modload_attempted = 1; + goto retry; + } + + xfrm_state_put_afinfo(afinfo); + return type; +} + +static void xfrm_put_type(const struct xfrm_type *type) +{ + module_put(type->owner); +} + +int xfrm_register_mode(struct xfrm_mode *mode, int family) +{ + struct xfrm_state_afinfo *afinfo; + struct xfrm_mode **modemap; + int err; + + if (unlikely(mode->encap >= XFRM_MODE_MAX)) + return -EINVAL; + + afinfo = xfrm_state_lock_afinfo(family); + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + + err = -EEXIST; + modemap = afinfo->mode_map; + if (modemap[mode->encap]) + goto out; + + err = -ENOENT; + if (!try_module_get(afinfo->owner)) + goto out; + + 
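+ /* The mode pins the af module for as long as it stays registered;
+  * xfrm_unregister_mode() below releases the reference again with
+  * module_put(mode->afinfo->owner).
+  */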
mode->afinfo = afinfo; + modemap[mode->encap] = mode; + err = 0; + +out: + xfrm_state_unlock_afinfo(afinfo); + return err; +} +EXPORT_SYMBOL(xfrm_register_mode); + +int xfrm_unregister_mode(struct xfrm_mode *mode, int family) +{ + struct xfrm_state_afinfo *afinfo; + struct xfrm_mode **modemap; + int err; + + if (unlikely(mode->encap >= XFRM_MODE_MAX)) + return -EINVAL; + + afinfo = xfrm_state_lock_afinfo(family); + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + + err = -ENOENT; + modemap = afinfo->mode_map; + if (likely(modemap[mode->encap] == mode)) { + modemap[mode->encap] = NULL; + module_put(mode->afinfo->owner); + err = 0; + } + + xfrm_state_unlock_afinfo(afinfo); + return err; +} +EXPORT_SYMBOL(xfrm_unregister_mode); + +static struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family) +{ + struct xfrm_state_afinfo *afinfo; + struct xfrm_mode *mode; + int modload_attempted = 0; + + if (unlikely(encap >= XFRM_MODE_MAX)) + return NULL; + +retry: + afinfo = xfrm_state_get_afinfo(family); + if (unlikely(afinfo == NULL)) + return NULL; + + mode = afinfo->mode_map[encap]; + if (unlikely(mode && !try_module_get(mode->owner))) + mode = NULL; + if (!mode && !modload_attempted) { + xfrm_state_put_afinfo(afinfo); + request_module("xfrm-mode-%d-%d", family, encap); + modload_attempted = 1; + goto retry; + } + + xfrm_state_put_afinfo(afinfo); + return mode; +} + +static void xfrm_put_mode(struct xfrm_mode *mode) +{ + module_put(mode->owner); +} + +static void xfrm_state_gc_destroy(struct xfrm_state *x) +{ + tasklet_hrtimer_cancel(&x->mtimer); + del_timer_sync(&x->rtimer); + kfree(x->aalg); + kfree(x->ealg); + kfree(x->calg); + kfree(x->encap); + kfree(x->coaddr); + kfree(x->replay_esn); + kfree(x->preplay_esn); + if (x->inner_mode) + xfrm_put_mode(x->inner_mode); + if (x->inner_mode_iaf) + xfrm_put_mode(x->inner_mode_iaf); + if (x->outer_mode) + xfrm_put_mode(x->outer_mode); + if (x->type) { + x->type->destructor(x); + xfrm_put_type(x->type); + } + security_xfrm_state_free(x); + kfree(x); +} + +static void xfrm_state_gc_task(struct work_struct *work) +{ + struct net *net = container_of(work, struct net, xfrm.state_gc_work); + struct xfrm_state *x; + struct hlist_node *entry, *tmp; + struct hlist_head gc_list; + + spin_lock_bh(&xfrm_state_gc_lock); + hlist_move_list(&net->xfrm.state_gc_list, &gc_list); + spin_unlock_bh(&xfrm_state_gc_lock); + + hlist_for_each_entry_safe(x, entry, tmp, &gc_list, gclist) + xfrm_state_gc_destroy(x); + + wake_up(&net->xfrm.km_waitq); +} + +static inline unsigned long make_jiffies(long secs) +{ + if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ) + return MAX_SCHEDULE_TIMEOUT-1; + else + return secs*HZ; +} + +static enum hrtimer_restart xfrm_timer_handler(struct hrtimer * me) +{ + struct tasklet_hrtimer *thr = container_of(me, struct tasklet_hrtimer, timer); + struct xfrm_state *x = container_of(thr, struct xfrm_state, mtimer); + struct net *net = xs_net(x); + unsigned long now = get_seconds(); + long next = LONG_MAX; + int warn = 0; + int err = 0; + + spin_lock(&x->lock); + if (x->km.state == XFRM_STATE_DEAD) + goto out; + if (x->km.state == XFRM_STATE_EXPIRED) + goto expired; + if (x->lft.hard_add_expires_seconds) { + long tmo = x->lft.hard_add_expires_seconds + + x->curlft.add_time - now; + if (tmo <= 0) + goto expired; + if (tmo < next) + next = tmo; + } + if (x->lft.hard_use_expires_seconds) { + long tmo = x->lft.hard_use_expires_seconds + + (x->curlft.use_time ? 
: now) - now; + if (tmo <= 0) + goto expired; + if (tmo < next) + next = tmo; + } + if (x->km.dying) + goto resched; + if (x->lft.soft_add_expires_seconds) { + long tmo = x->lft.soft_add_expires_seconds + + x->curlft.add_time - now; + if (tmo <= 0) + warn = 1; + else if (tmo < next) + next = tmo; + } + if (x->lft.soft_use_expires_seconds) { + long tmo = x->lft.soft_use_expires_seconds + + (x->curlft.use_time ? : now) - now; + if (tmo <= 0) + warn = 1; + else if (tmo < next) + next = tmo; + } + + x->km.dying = warn; + if (warn) + km_state_expired(x, 0, 0); +resched: + if (next != LONG_MAX){ + tasklet_hrtimer_start(&x->mtimer, ktime_set(next, 0), HRTIMER_MODE_REL); + } + + goto out; + +expired: + if (x->km.state == XFRM_STATE_ACQ && x->id.spi == 0) { + x->km.state = XFRM_STATE_EXPIRED; + wake_up(&net->xfrm.km_waitq); + next = 2; + goto resched; + } + + err = __xfrm_state_delete(x); + if (!err && x->id.spi) + km_state_expired(x, 1, 0); + + xfrm_audit_state_delete(x, err ? 0 : 1, + audit_get_loginuid(current), + audit_get_sessionid(current), 0); + +out: + spin_unlock(&x->lock); + return HRTIMER_NORESTART; +} + +static void xfrm_replay_timer_handler(unsigned long data); + +struct xfrm_state *xfrm_state_alloc(struct net *net) +{ + struct xfrm_state *x; + + x = kzalloc(sizeof(struct xfrm_state), GFP_ATOMIC); + + if (x) { + write_pnet(&x->xs_net, net); + atomic_set(&x->refcnt, 1); + atomic_set(&x->tunnel_users, 0); + INIT_LIST_HEAD(&x->km.all); + INIT_HLIST_NODE(&x->bydst); + INIT_HLIST_NODE(&x->bysrc); + INIT_HLIST_NODE(&x->byspi); + tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler, CLOCK_REALTIME, HRTIMER_MODE_ABS); + setup_timer(&x->rtimer, xfrm_replay_timer_handler, + (unsigned long)x); + x->curlft.add_time = get_seconds(); + x->lft.soft_byte_limit = XFRM_INF; + x->lft.soft_packet_limit = XFRM_INF; + x->lft.hard_byte_limit = XFRM_INF; + x->lft.hard_packet_limit = XFRM_INF; + x->replay_maxage = 0; + x->replay_maxdiff = 0; + x->inner_mode = NULL; + x->inner_mode_iaf = NULL; + spin_lock_init(&x->lock); + } + return x; +} +EXPORT_SYMBOL(xfrm_state_alloc); + +void __xfrm_state_destroy(struct xfrm_state *x) +{ + struct net *net = xs_net(x); + + WARN_ON(x->km.state != XFRM_STATE_DEAD); + + spin_lock_bh(&xfrm_state_gc_lock); + hlist_add_head(&x->gclist, &net->xfrm.state_gc_list); + spin_unlock_bh(&xfrm_state_gc_lock); + schedule_work(&net->xfrm.state_gc_work); +} +EXPORT_SYMBOL(__xfrm_state_destroy); + +int __xfrm_state_delete(struct xfrm_state *x) +{ + struct net *net = xs_net(x); + int err = -ESRCH; + + if (x->km.state != XFRM_STATE_DEAD) { + x->km.state = XFRM_STATE_DEAD; + spin_lock(&xfrm_state_lock); + list_del(&x->km.all); + hlist_del(&x->bydst); + hlist_del(&x->bysrc); + if (x->id.spi) + hlist_del(&x->byspi); + net->xfrm.state_num--; + spin_unlock(&xfrm_state_lock); + + /* All xfrm_state objects are created by xfrm_state_alloc. + * The xfrm_state_alloc call gives a reference, and that + * is what we are dropping here. 
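+ *
+ * Note that dropping the last reference does not free the state
+ * synchronously: __xfrm_state_destroy() above only queues it on the
+ * per-net gc list, and xfrm_state_gc_task() tears it down from a
+ * workqueue.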
+ */ + xfrm_state_put(x); + err = 0; + } + + return err; +} +EXPORT_SYMBOL(__xfrm_state_delete); + +int xfrm_state_delete(struct xfrm_state *x) +{ + int err; + + spin_lock_bh(&x->lock); + err = __xfrm_state_delete(x); + spin_unlock_bh(&x->lock); + + return err; +} +EXPORT_SYMBOL(xfrm_state_delete); + +#ifdef CONFIG_SECURITY_NETWORK_XFRM +static inline int +xfrm_state_flush_secctx_check(struct net *net, u8 proto, struct xfrm_audit *audit_info) +{ + int i, err = 0; + + for (i = 0; i <= net->xfrm.state_hmask; i++) { + struct hlist_node *entry; + struct xfrm_state *x; + + hlist_for_each_entry(x, entry, net->xfrm.state_bydst+i, bydst) { + if (xfrm_id_proto_match(x->id.proto, proto) && + (err = security_xfrm_state_delete(x)) != 0) { + xfrm_audit_state_delete(x, 0, + audit_info->loginuid, + audit_info->sessionid, + audit_info->secid); + return err; + } + } + } + + return err; +} +#else +static inline int +xfrm_state_flush_secctx_check(struct net *net, u8 proto, struct xfrm_audit *audit_info) +{ + return 0; +} +#endif + +int xfrm_state_flush(struct net *net, u8 proto, struct xfrm_audit *audit_info) +{ + int i, err = 0, cnt = 0; + + spin_lock_bh(&xfrm_state_lock); + err = xfrm_state_flush_secctx_check(net, proto, audit_info); + if (err) + goto out; + + err = -ESRCH; + for (i = 0; i <= net->xfrm.state_hmask; i++) { + struct hlist_node *entry; + struct xfrm_state *x; +restart: + hlist_for_each_entry(x, entry, net->xfrm.state_bydst+i, bydst) { + if (!xfrm_state_kern(x) && + xfrm_id_proto_match(x->id.proto, proto)) { + xfrm_state_hold(x); + spin_unlock_bh(&xfrm_state_lock); + + err = xfrm_state_delete(x); + xfrm_audit_state_delete(x, err ? 0 : 1, + audit_info->loginuid, + audit_info->sessionid, + audit_info->secid); + xfrm_state_put(x); + if (!err) + cnt++; + + spin_lock_bh(&xfrm_state_lock); + goto restart; + } + } + } + if (cnt) + err = 0; + +out: + spin_unlock_bh(&xfrm_state_lock); + wake_up(&net->xfrm.km_waitq); + return err; +} +EXPORT_SYMBOL(xfrm_state_flush); + +void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si) +{ + spin_lock_bh(&xfrm_state_lock); + si->sadcnt = net->xfrm.state_num; + si->sadhcnt = net->xfrm.state_hmask; + si->sadhmcnt = xfrm_state_hashmax; + spin_unlock_bh(&xfrm_state_lock); +} +EXPORT_SYMBOL(xfrm_sad_getinfo); + +static int +xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl, + const struct xfrm_tmpl *tmpl, + const xfrm_address_t *daddr, const xfrm_address_t *saddr, + unsigned short family) +{ + struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); + if (!afinfo) + return -1; + afinfo->init_tempsel(&x->sel, fl); + + if (family != tmpl->encap_family) { + xfrm_state_put_afinfo(afinfo); + afinfo = xfrm_state_get_afinfo(tmpl->encap_family); + if (!afinfo) + return -1; + } + afinfo->init_temprop(x, tmpl, daddr, saddr); + xfrm_state_put_afinfo(afinfo); + return 0; +} + +static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, + const xfrm_address_t *daddr, + __be32 spi, u8 proto, + unsigned short family) +{ + unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family); + struct xfrm_state *x; + struct hlist_node *entry; + + hlist_for_each_entry(x, entry, net->xfrm.state_byspi+h, byspi) { + if (x->props.family != family || + x->id.spi != spi || + x->id.proto != proto || + xfrm_addr_cmp(&x->id.daddr, daddr, family)) + continue; + + if ((mark & x->mark.m) != x->mark.v) + continue; + xfrm_state_hold(x); + return x; + } + + return NULL; +} + +static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark, + const 
xfrm_address_t *daddr, + const xfrm_address_t *saddr, + u8 proto, unsigned short family) +{ + unsigned int h = xfrm_src_hash(net, daddr, saddr, family); + struct xfrm_state *x; + struct hlist_node *entry; + + hlist_for_each_entry(x, entry, net->xfrm.state_bysrc+h, bysrc) { + if (x->props.family != family || + x->id.proto != proto || + xfrm_addr_cmp(&x->id.daddr, daddr, family) || + xfrm_addr_cmp(&x->props.saddr, saddr, family)) + continue; + + if ((mark & x->mark.m) != x->mark.v) + continue; + xfrm_state_hold(x); + return x; + } + + return NULL; +} + +static inline struct xfrm_state * +__xfrm_state_locate(struct xfrm_state *x, int use_spi, int family) +{ + struct net *net = xs_net(x); + u32 mark = x->mark.v & x->mark.m; + + if (use_spi) + return __xfrm_state_lookup(net, mark, &x->id.daddr, + x->id.spi, x->id.proto, family); + else + return __xfrm_state_lookup_byaddr(net, mark, + &x->id.daddr, + &x->props.saddr, + x->id.proto, family); +} + +static void xfrm_hash_grow_check(struct net *net, int have_hash_collision) +{ + if (have_hash_collision && + (net->xfrm.state_hmask + 1) < xfrm_state_hashmax && + net->xfrm.state_num > net->xfrm.state_hmask) + schedule_work(&net->xfrm.state_hash_work); +} + +static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, + const struct flowi *fl, unsigned short family, + struct xfrm_state **best, int *acq_in_progress, + int *error) +{ + /* Resolution logic: + * 1. There is a valid state with matching selector. Done. + * 2. Valid state with inappropriate selector. Skip. + * + * Entering area of "sysdeps". + * + * 3. If state is not valid, selector is temporary, it selects + * only session which triggered previous resolution. Key + * manager will do something to install a state with proper + * selector. 
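+ *
+ * When several states are valid, the code below prefers one that is
+ * not dying and, as a tie-break, the one with the largest
+ * curlft.add_time, i.e. the most recently added state.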
+ */ + if (x->km.state == XFRM_STATE_VALID) { + if ((x->sel.family && + !xfrm_selector_match(&x->sel, fl, x->sel.family)) || + !security_xfrm_state_pol_flow_match(x, pol, fl)) + return; + + if (!*best || + (*best)->km.dying > x->km.dying || + ((*best)->km.dying == x->km.dying && + (*best)->curlft.add_time < x->curlft.add_time)) + *best = x; + } else if (x->km.state == XFRM_STATE_ACQ) { + *acq_in_progress = 1; + } else if (x->km.state == XFRM_STATE_ERROR || + x->km.state == XFRM_STATE_EXPIRED) { + if (xfrm_selector_match(&x->sel, fl, x->sel.family) && + security_xfrm_state_pol_flow_match(x, pol, fl)) + *error = -ESRCH; + } +} + +struct xfrm_state * +xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, + const struct flowi *fl, struct xfrm_tmpl *tmpl, + struct xfrm_policy *pol, int *err, + unsigned short family) +{ + static xfrm_address_t saddr_wildcard = { }; + struct net *net = xp_net(pol); + unsigned int h, h_wildcard; + struct hlist_node *entry; + struct xfrm_state *x, *x0, *to_put; + int acquire_in_progress = 0; + int error = 0; + struct xfrm_state *best = NULL; + u32 mark = pol->mark.v & pol->mark.m; + unsigned short encap_family = tmpl->encap_family; + + to_put = NULL; + + spin_lock_bh(&xfrm_state_lock); + h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); + hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) { + if (x->props.family == encap_family && + x->props.reqid == tmpl->reqid && + (mark & x->mark.m) == x->mark.v && + !(x->props.flags & XFRM_STATE_WILDRECV) && + xfrm_state_addr_check(x, daddr, saddr, encap_family) && + tmpl->mode == x->props.mode && + tmpl->id.proto == x->id.proto && + (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) + xfrm_state_look_at(pol, x, fl, encap_family, + &best, &acquire_in_progress, &error); + } + if (best) + goto found; + + h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family); + hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h_wildcard, bydst) { + if (x->props.family == encap_family && + x->props.reqid == tmpl->reqid && + (mark & x->mark.m) == x->mark.v && + !(x->props.flags & XFRM_STATE_WILDRECV) && + xfrm_state_addr_check(x, daddr, saddr, encap_family) && + tmpl->mode == x->props.mode && + tmpl->id.proto == x->id.proto && + (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) + xfrm_state_look_at(pol, x, fl, encap_family, + &best, &acquire_in_progress, &error); + } + +found: + x = best; + if (!x && !error && !acquire_in_progress) { + if (tmpl->id.spi && + (x0 = __xfrm_state_lookup(net, mark, daddr, tmpl->id.spi, + tmpl->id.proto, encap_family)) != NULL) { + to_put = x0; + error = -EEXIST; + goto out; + } + x = xfrm_state_alloc(net); + if (x == NULL) { + error = -ENOMEM; + goto out; + } + /* Initialize temporary state matching only + * to current session. 
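+ *
+ * If the key manager accepts the ACQUIRE (km_query() returns 0), the
+ * embryonic state is hashed in below as XFRM_STATE_ACQ with a hard add
+ * lifetime of net->xfrm.sysctl_acq_expires seconds, so unanswered
+ * acquires expire on their own.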
*/ + xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family); + memcpy(&x->mark, &pol->mark, sizeof(x->mark)); + + error = security_xfrm_state_alloc_acquire(x, pol->security, fl->flowi_secid); + if (error) { + x->km.state = XFRM_STATE_DEAD; + to_put = x; + x = NULL; + goto out; + } + + if (km_query(x, tmpl, pol) == 0) { + x->km.state = XFRM_STATE_ACQ; + list_add(&x->km.all, &net->xfrm.state_all); + hlist_add_head(&x->bydst, net->xfrm.state_bydst+h); + h = xfrm_src_hash(net, daddr, saddr, encap_family); + hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h); + if (x->id.spi) { + h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family); + hlist_add_head(&x->byspi, net->xfrm.state_byspi+h); + } + x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; + tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL); + net->xfrm.state_num++; + xfrm_hash_grow_check(net, x->bydst.next != NULL); + } else { + x->km.state = XFRM_STATE_DEAD; + to_put = x; + x = NULL; + error = -ESRCH; + } + } +out: + if (x) + xfrm_state_hold(x); + else + *err = acquire_in_progress ? -EAGAIN : error; + spin_unlock_bh(&xfrm_state_lock); + if (to_put) + xfrm_state_put(to_put); + return x; +} + +struct xfrm_state * +xfrm_stateonly_find(struct net *net, u32 mark, + xfrm_address_t *daddr, xfrm_address_t *saddr, + unsigned short family, u8 mode, u8 proto, u32 reqid) +{ + unsigned int h; + struct xfrm_state *rx = NULL, *x = NULL; + struct hlist_node *entry; + + spin_lock(&xfrm_state_lock); + h = xfrm_dst_hash(net, daddr, saddr, reqid, family); + hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) { + if (x->props.family == family && + x->props.reqid == reqid && + (mark & x->mark.m) == x->mark.v && + !(x->props.flags & XFRM_STATE_WILDRECV) && + xfrm_state_addr_check(x, daddr, saddr, family) && + mode == x->props.mode && + proto == x->id.proto && + x->km.state == XFRM_STATE_VALID) { + rx = x; + break; + } + } + + if (rx) + xfrm_state_hold(rx); + spin_unlock(&xfrm_state_lock); + + + return rx; +} +EXPORT_SYMBOL(xfrm_stateonly_find); + +static void __xfrm_state_insert(struct xfrm_state *x) +{ + struct net *net = xs_net(x); + unsigned int h; + + list_add(&x->km.all, &net->xfrm.state_all); + + h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr, + x->props.reqid, x->props.family); + hlist_add_head(&x->bydst, net->xfrm.state_bydst+h); + + h = xfrm_src_hash(net, &x->id.daddr, &x->props.saddr, x->props.family); + hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h); + + if (x->id.spi) { + h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, + x->props.family); + + hlist_add_head(&x->byspi, net->xfrm.state_byspi+h); + } + + tasklet_hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL); + if (x->replay_maxage) + mod_timer(&x->rtimer, jiffies + x->replay_maxage); + + wake_up(&net->xfrm.km_waitq); + + net->xfrm.state_num++; + + xfrm_hash_grow_check(net, x->bydst.next != NULL); +} + +/* xfrm_state_lock is held */ +static void __xfrm_state_bump_genids(struct xfrm_state *xnew) +{ + struct net *net = xs_net(xnew); + unsigned short family = xnew->props.family; + u32 reqid = xnew->props.reqid; + struct xfrm_state *x; + struct hlist_node *entry; + unsigned int h; + u32 mark = xnew->mark.v & xnew->mark.m; + + h = xfrm_dst_hash(net, &xnew->id.daddr, &xnew->props.saddr, reqid, family); + hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) { + if (x->props.family == family && + x->props.reqid == reqid && + (mark & x->mark.m) == x->mark.v && + 
!xfrm_addr_cmp(&x->id.daddr, &xnew->id.daddr, family) && + !xfrm_addr_cmp(&x->props.saddr, &xnew->props.saddr, family)) + x->genid++; + } +} + +void xfrm_state_insert(struct xfrm_state *x) +{ + spin_lock_bh(&xfrm_state_lock); + __xfrm_state_bump_genids(x); + __xfrm_state_insert(x); + spin_unlock_bh(&xfrm_state_lock); +} +EXPORT_SYMBOL(xfrm_state_insert); + +/* xfrm_state_lock is held */ +static struct xfrm_state *__find_acq_core(struct net *net, struct xfrm_mark *m, + unsigned short family, u8 mode, + u32 reqid, u8 proto, + const xfrm_address_t *daddr, + const xfrm_address_t *saddr, int create) +{ + unsigned int h = xfrm_dst_hash(net, daddr, saddr, reqid, family); + struct hlist_node *entry; + struct xfrm_state *x; + u32 mark = m->v & m->m; + + hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) { + if (x->props.reqid != reqid || + x->props.mode != mode || + x->props.family != family || + x->km.state != XFRM_STATE_ACQ || + x->id.spi != 0 || + x->id.proto != proto || + (mark & x->mark.m) != x->mark.v || + xfrm_addr_cmp(&x->id.daddr, daddr, family) || + xfrm_addr_cmp(&x->props.saddr, saddr, family)) + continue; + + xfrm_state_hold(x); + return x; + } + + if (!create) + return NULL; + + x = xfrm_state_alloc(net); + if (likely(x)) { + switch (family) { + case AF_INET: + x->sel.daddr.a4 = daddr->a4; + x->sel.saddr.a4 = saddr->a4; + x->sel.prefixlen_d = 32; + x->sel.prefixlen_s = 32; + x->props.saddr.a4 = saddr->a4; + x->id.daddr.a4 = daddr->a4; + break; + + case AF_INET6: + ipv6_addr_copy((struct in6_addr *)x->sel.daddr.a6, + (const struct in6_addr *)daddr); + ipv6_addr_copy((struct in6_addr *)x->sel.saddr.a6, + (const struct in6_addr *)saddr); + x->sel.prefixlen_d = 128; + x->sel.prefixlen_s = 128; + ipv6_addr_copy((struct in6_addr *)x->props.saddr.a6, + (const struct in6_addr *)saddr); + ipv6_addr_copy((struct in6_addr *)x->id.daddr.a6, + (const struct in6_addr *)daddr); + break; + } + + x->km.state = XFRM_STATE_ACQ; + x->id.proto = proto; + x->props.family = family; + x->props.mode = mode; + x->props.reqid = reqid; + x->mark.v = m->v; + x->mark.m = m->m; + x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; + xfrm_state_hold(x); + tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL); + list_add(&x->km.all, &net->xfrm.state_all); + hlist_add_head(&x->bydst, net->xfrm.state_bydst+h); + h = xfrm_src_hash(net, daddr, saddr, family); + hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h); + + net->xfrm.state_num++; + + xfrm_hash_grow_check(net, x->bydst.next != NULL); + } + + return x; +} + +static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); + +int xfrm_state_add(struct xfrm_state *x) +{ + struct net *net = xs_net(x); + struct xfrm_state *x1, *to_put; + int family; + int err; + u32 mark = x->mark.v & x->mark.m; + int use_spi = xfrm_id_proto_match(x->id.proto, IPSEC_PROTO_ANY); + + family = x->props.family; + + to_put = NULL; + + spin_lock_bh(&xfrm_state_lock); + + x1 = __xfrm_state_locate(x, use_spi, family); + if (x1) { + to_put = x1; + x1 = NULL; + err = -EEXIST; + goto out; + } + + if (use_spi && x->km.seq) { + x1 = __xfrm_find_acq_byseq(net, mark, x->km.seq); + if (x1 && ((x1->id.proto != x->id.proto) || + xfrm_addr_cmp(&x1->id.daddr, &x->id.daddr, family))) { + to_put = x1; + x1 = NULL; + } + } + + if (use_spi && !x1) + x1 = __find_acq_core(net, &x->mark, family, x->props.mode, + x->props.reqid, x->id.proto, + &x->id.daddr, &x->props.saddr, 0); + + __xfrm_state_bump_genids(x); + 
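+ /* Bumping the genid of clashing states makes xfrm_bundle_ok() reject
+  * any cached bundle that still carries the old xfrm_genid, so those
+  * bundles get rebuilt against the newly inserted state.
+  */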
__xfrm_state_insert(x); + err = 0; + +out: + spin_unlock_bh(&xfrm_state_lock); + + if (x1) { + xfrm_state_delete(x1); + xfrm_state_put(x1); + } + + if (to_put) + xfrm_state_put(to_put); + + return err; +} +EXPORT_SYMBOL(xfrm_state_add); + +#ifdef CONFIG_XFRM_MIGRATE +static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, int *errp) +{ + struct net *net = xs_net(orig); + int err = -ENOMEM; + struct xfrm_state *x = xfrm_state_alloc(net); + if (!x) + goto out; + + memcpy(&x->id, &orig->id, sizeof(x->id)); + memcpy(&x->sel, &orig->sel, sizeof(x->sel)); + memcpy(&x->lft, &orig->lft, sizeof(x->lft)); + x->props.mode = orig->props.mode; + x->props.replay_window = orig->props.replay_window; + x->props.reqid = orig->props.reqid; + x->props.family = orig->props.family; + x->props.saddr = orig->props.saddr; + + if (orig->aalg) { + x->aalg = xfrm_algo_auth_clone(orig->aalg); + if (!x->aalg) + goto error; + } + x->props.aalgo = orig->props.aalgo; + + if (orig->ealg) { + x->ealg = xfrm_algo_clone(orig->ealg); + if (!x->ealg) + goto error; + } + x->props.ealgo = orig->props.ealgo; + + if (orig->calg) { + x->calg = xfrm_algo_clone(orig->calg); + if (!x->calg) + goto error; + } + x->props.calgo = orig->props.calgo; + + if (orig->encap) { + x->encap = kmemdup(orig->encap, sizeof(*x->encap), GFP_KERNEL); + if (!x->encap) + goto error; + } + + if (orig->coaddr) { + x->coaddr = kmemdup(orig->coaddr, sizeof(*x->coaddr), + GFP_KERNEL); + if (!x->coaddr) + goto error; + } + + if (orig->replay_esn) { + err = xfrm_replay_clone(x, orig); + if (err) + goto error; + } + + memcpy(&x->mark, &orig->mark, sizeof(x->mark)); + + err = xfrm_init_state(x); + if (err) + goto error; + + x->props.flags = orig->props.flags; + + x->curlft.add_time = orig->curlft.add_time; + x->km.state = orig->km.state; + x->km.seq = orig->km.seq; + + return x; + + error: + xfrm_state_put(x); +out: + if (errp) + *errp = err; + return NULL; +} + +/* xfrm_state_lock is held */ +struct xfrm_state * xfrm_migrate_state_find(struct xfrm_migrate *m) +{ + unsigned int h; + struct xfrm_state *x; + struct hlist_node *entry; + + if (m->reqid) { + h = xfrm_dst_hash(&init_net, &m->old_daddr, &m->old_saddr, + m->reqid, m->old_family); + hlist_for_each_entry(x, entry, init_net.xfrm.state_bydst+h, bydst) { + if (x->props.mode != m->mode || + x->id.proto != m->proto) + continue; + if (m->reqid && x->props.reqid != m->reqid) + continue; + if (xfrm_addr_cmp(&x->id.daddr, &m->old_daddr, + m->old_family) || + xfrm_addr_cmp(&x->props.saddr, &m->old_saddr, + m->old_family)) + continue; + xfrm_state_hold(x); + return x; + } + } else { + h = xfrm_src_hash(&init_net, &m->old_daddr, &m->old_saddr, + m->old_family); + hlist_for_each_entry(x, entry, init_net.xfrm.state_bysrc+h, bysrc) { + if (x->props.mode != m->mode || + x->id.proto != m->proto) + continue; + if (xfrm_addr_cmp(&x->id.daddr, &m->old_daddr, + m->old_family) || + xfrm_addr_cmp(&x->props.saddr, &m->old_saddr, + m->old_family)) + continue; + xfrm_state_hold(x); + return x; + } + } + + return NULL; +} +EXPORT_SYMBOL(xfrm_migrate_state_find); + +struct xfrm_state * xfrm_state_migrate(struct xfrm_state *x, + struct xfrm_migrate *m) +{ + struct xfrm_state *xc; + int err; + + xc = xfrm_state_clone(x, &err); + if (!xc) + return NULL; + + memcpy(&xc->id.daddr, &m->new_daddr, sizeof(xc->id.daddr)); + memcpy(&xc->props.saddr, &m->new_saddr, sizeof(xc->props.saddr)); + + /* add state */ + if (!xfrm_addr_cmp(&x->id.daddr, &m->new_daddr, m->new_family)) { + /* a care is needed when the destination address of the 
+ state is to be updated as it is a part of triplet */ + xfrm_state_insert(xc); + } else { + if ((err = xfrm_state_add(xc)) < 0) + goto error; + } + + return xc; +error: + xfrm_state_put(xc); + return NULL; +} +EXPORT_SYMBOL(xfrm_state_migrate); +#endif + +int xfrm_state_update(struct xfrm_state *x) +{ + struct xfrm_state *x1, *to_put; + int err; + int use_spi = xfrm_id_proto_match(x->id.proto, IPSEC_PROTO_ANY); + + to_put = NULL; + + spin_lock_bh(&xfrm_state_lock); + x1 = __xfrm_state_locate(x, use_spi, x->props.family); + + err = -ESRCH; + if (!x1) + goto out; + + if (xfrm_state_kern(x1)) { + to_put = x1; + err = -EEXIST; + goto out; + } + + if (x1->km.state == XFRM_STATE_ACQ) { + __xfrm_state_insert(x); + x = NULL; + } + err = 0; + +out: + spin_unlock_bh(&xfrm_state_lock); + + if (to_put) + xfrm_state_put(to_put); + + if (err) + return err; + + if (!x) { + xfrm_state_delete(x1); + xfrm_state_put(x1); + return 0; + } + + err = -EINVAL; + spin_lock_bh(&x1->lock); + if (likely(x1->km.state == XFRM_STATE_VALID)) { + if (x->encap && x1->encap) + memcpy(x1->encap, x->encap, sizeof(*x1->encap)); + if (x->coaddr && x1->coaddr) { + memcpy(x1->coaddr, x->coaddr, sizeof(*x1->coaddr)); + } + if (!use_spi && memcmp(&x1->sel, &x->sel, sizeof(x1->sel))) + memcpy(&x1->sel, &x->sel, sizeof(x1->sel)); + memcpy(&x1->lft, &x->lft, sizeof(x1->lft)); + x1->km.dying = 0; + + tasklet_hrtimer_start(&x1->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL); + if (x1->curlft.use_time) + xfrm_state_check_expire(x1); + + err = 0; + x->km.state = XFRM_STATE_DEAD; + __xfrm_state_put(x); + } + spin_unlock_bh(&x1->lock); + + xfrm_state_put(x1); + + return err; +} +EXPORT_SYMBOL(xfrm_state_update); + +int xfrm_state_check_expire(struct xfrm_state *x) +{ + if (!x->curlft.use_time) + x->curlft.use_time = get_seconds(); + + if (x->km.state != XFRM_STATE_VALID) + return -EINVAL; + + if (x->curlft.bytes >= x->lft.hard_byte_limit || + x->curlft.packets >= x->lft.hard_packet_limit) { + x->km.state = XFRM_STATE_EXPIRED; + tasklet_hrtimer_start(&x->mtimer, ktime_set(0,0), HRTIMER_MODE_REL); + return -EINVAL; + } + + if (!x->km.dying && + (x->curlft.bytes >= x->lft.soft_byte_limit || + x->curlft.packets >= x->lft.soft_packet_limit)) { + x->km.dying = 1; + km_state_expired(x, 0, 0); + } + return 0; +} +EXPORT_SYMBOL(xfrm_state_check_expire); + +struct xfrm_state * +xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, + u8 proto, unsigned short family) +{ + struct xfrm_state *x; + + spin_lock_bh(&xfrm_state_lock); + x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family); + spin_unlock_bh(&xfrm_state_lock); + return x; +} +EXPORT_SYMBOL(xfrm_state_lookup); + +struct xfrm_state * +xfrm_state_lookup_byaddr(struct net *net, u32 mark, + const xfrm_address_t *daddr, const xfrm_address_t *saddr, + u8 proto, unsigned short family) +{ + struct xfrm_state *x; + + spin_lock_bh(&xfrm_state_lock); + x = __xfrm_state_lookup_byaddr(net, mark, daddr, saddr, proto, family); + spin_unlock_bh(&xfrm_state_lock); + return x; +} +EXPORT_SYMBOL(xfrm_state_lookup_byaddr); + +struct xfrm_state * +xfrm_find_acq(struct net *net, struct xfrm_mark *mark, u8 mode, u32 reqid, u8 proto, + const xfrm_address_t *daddr, const xfrm_address_t *saddr, + int create, unsigned short family) +{ + struct xfrm_state *x; + + spin_lock_bh(&xfrm_state_lock); + x = __find_acq_core(net, mark, family, mode, reqid, proto, daddr, saddr, create); + spin_unlock_bh(&xfrm_state_lock); + + return x; +} +EXPORT_SYMBOL(xfrm_find_acq); + +#ifdef 
CONFIG_XFRM_SUB_POLICY +int +xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n, + unsigned short family) +{ + int err = 0; + struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); + if (!afinfo) + return -EAFNOSUPPORT; + + spin_lock_bh(&xfrm_state_lock); + if (afinfo->tmpl_sort) + err = afinfo->tmpl_sort(dst, src, n); + spin_unlock_bh(&xfrm_state_lock); + xfrm_state_put_afinfo(afinfo); + return err; +} +EXPORT_SYMBOL(xfrm_tmpl_sort); + +int +xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n, + unsigned short family) +{ + int err = 0; + struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); + if (!afinfo) + return -EAFNOSUPPORT; + + spin_lock_bh(&xfrm_state_lock); + if (afinfo->state_sort) + err = afinfo->state_sort(dst, src, n); + spin_unlock_bh(&xfrm_state_lock); + xfrm_state_put_afinfo(afinfo); + return err; +} +EXPORT_SYMBOL(xfrm_state_sort); +#endif + +/* Silly enough, but I'm lazy to build resolution list */ + +static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq) +{ + int i; + + for (i = 0; i <= net->xfrm.state_hmask; i++) { + struct hlist_node *entry; + struct xfrm_state *x; + + hlist_for_each_entry(x, entry, net->xfrm.state_bydst+i, bydst) { + if (x->km.seq == seq && + (mark & x->mark.m) == x->mark.v && + x->km.state == XFRM_STATE_ACQ) { + xfrm_state_hold(x); + return x; + } + } + } + return NULL; +} + +struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq) +{ + struct xfrm_state *x; + + spin_lock_bh(&xfrm_state_lock); + x = __xfrm_find_acq_byseq(net, mark, seq); + spin_unlock_bh(&xfrm_state_lock); + return x; +} +EXPORT_SYMBOL(xfrm_find_acq_byseq); + +u32 xfrm_get_acqseq(void) +{ + u32 res; + static atomic_t acqseq; + + do { + res = atomic_inc_return(&acqseq); + } while (!res); + + return res; +} +EXPORT_SYMBOL(xfrm_get_acqseq); + +int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high) +{ + struct net *net = xs_net(x); + unsigned int h; + struct xfrm_state *x0; + int err = -ENOENT; + __be32 minspi = htonl(low); + __be32 maxspi = htonl(high); + u32 mark = x->mark.v & x->mark.m; + + spin_lock_bh(&x->lock); + if (x->km.state == XFRM_STATE_DEAD) + goto unlock; + + err = 0; + if (x->id.spi) + goto unlock; + + err = -ENOENT; + + if (minspi == maxspi) { + x0 = xfrm_state_lookup(net, mark, &x->id.daddr, minspi, x->id.proto, x->props.family); + if (x0) { + xfrm_state_put(x0); + goto unlock; + } + x->id.spi = minspi; + } else { + u32 spi = 0; + for (h=0; hid.daddr, htonl(spi), x->id.proto, x->props.family); + if (x0 == NULL) { + x->id.spi = htonl(spi); + break; + } + xfrm_state_put(x0); + } + } + if (x->id.spi) { + spin_lock_bh(&xfrm_state_lock); + h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family); + hlist_add_head(&x->byspi, net->xfrm.state_byspi+h); + spin_unlock_bh(&xfrm_state_lock); + + err = 0; + } + +unlock: + spin_unlock_bh(&x->lock); + + return err; +} +EXPORT_SYMBOL(xfrm_alloc_spi); + +int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk, + int (*func)(struct xfrm_state *, int, void*), + void *data) +{ + struct xfrm_state *state; + struct xfrm_state_walk *x; + int err = 0; + + if (walk->seq != 0 && list_empty(&walk->all)) + return 0; + + spin_lock_bh(&xfrm_state_lock); + if (list_empty(&walk->all)) + x = list_first_entry(&net->xfrm.state_all, struct xfrm_state_walk, all); + else + x = list_entry(&walk->all, struct xfrm_state_walk, all); + list_for_each_entry_from(x, &net->xfrm.state_all, all) { + if (x->state == 
XFRM_STATE_DEAD) + continue; + state = container_of(x, struct xfrm_state, km); + if (!xfrm_id_proto_match(state->id.proto, walk->proto)) + continue; + err = func(state, walk->seq, data); + if (err) { + list_move_tail(&walk->all, &x->all); + goto out; + } + walk->seq++; + } + if (walk->seq == 0) { + err = -ENOENT; + goto out; + } + list_del_init(&walk->all); +out: + spin_unlock_bh(&xfrm_state_lock); + return err; +} +EXPORT_SYMBOL(xfrm_state_walk); + +void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto) +{ + INIT_LIST_HEAD(&walk->all); + walk->proto = proto; + walk->state = XFRM_STATE_DEAD; + walk->seq = 0; +} +EXPORT_SYMBOL(xfrm_state_walk_init); + +void xfrm_state_walk_done(struct xfrm_state_walk *walk) +{ + if (list_empty(&walk->all)) + return; + + spin_lock_bh(&xfrm_state_lock); + list_del(&walk->all); + spin_unlock_bh(&xfrm_state_lock); +} +EXPORT_SYMBOL(xfrm_state_walk_done); + +static void xfrm_replay_timer_handler(unsigned long data) +{ + struct xfrm_state *x = (struct xfrm_state*)data; + + spin_lock(&x->lock); + + if (x->km.state == XFRM_STATE_VALID) { + if (xfrm_aevent_is_on(xs_net(x))) + x->repl->notify(x, XFRM_REPLAY_TIMEOUT); + else + x->xflags |= XFRM_TIME_DEFER; + } + + spin_unlock(&x->lock); +} + +static LIST_HEAD(xfrm_km_list); +static DEFINE_RWLOCK(xfrm_km_lock); + +void km_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c) +{ + struct xfrm_mgr *km; + + read_lock(&xfrm_km_lock); + list_for_each_entry(km, &xfrm_km_list, list) + if (km->notify_policy) + km->notify_policy(xp, dir, c); + read_unlock(&xfrm_km_lock); +} + +void km_state_notify(struct xfrm_state *x, const struct km_event *c) +{ + struct xfrm_mgr *km; + read_lock(&xfrm_km_lock); + list_for_each_entry(km, &xfrm_km_list, list) + if (km->notify) + km->notify(x, c); + read_unlock(&xfrm_km_lock); +} + +EXPORT_SYMBOL(km_policy_notify); +EXPORT_SYMBOL(km_state_notify); + +void km_state_expired(struct xfrm_state *x, int hard, u32 pid) +{ + struct net *net = xs_net(x); + struct km_event c; + + c.data.hard = hard; + c.pid = pid; + c.event = XFRM_MSG_EXPIRE; + km_state_notify(x, &c); + + if (hard) + wake_up(&net->xfrm.km_waitq); +} + +EXPORT_SYMBOL(km_state_expired); +/* + * We send to all registered managers regardless of failure + * We are happy with one success +*/ +int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol) +{ + int err = -EINVAL, acqret; + struct xfrm_mgr *km; + + read_lock(&xfrm_km_lock); + list_for_each_entry(km, &xfrm_km_list, list) { + acqret = km->acquire(x, t, pol, XFRM_POLICY_OUT); + if (!acqret) + err = acqret; + } + read_unlock(&xfrm_km_lock); + return err; +} +EXPORT_SYMBOL(km_query); + +int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport) +{ + int err = -EINVAL; + struct xfrm_mgr *km; + + read_lock(&xfrm_km_lock); + list_for_each_entry(km, &xfrm_km_list, list) { + if (km->new_mapping) + err = km->new_mapping(x, ipaddr, sport); + if (!err) + break; + } + read_unlock(&xfrm_km_lock); + return err; +} +EXPORT_SYMBOL(km_new_mapping); + +void km_policy_expired(struct xfrm_policy *pol, int dir, int hard, u32 pid) +{ + struct net *net = xp_net(pol); + struct km_event c; + + c.data.hard = hard; + c.pid = pid; + c.event = XFRM_MSG_POLEXPIRE; + km_policy_notify(pol, dir, &c); + + if (hard) + wake_up(&net->xfrm.km_waitq); +} +EXPORT_SYMBOL(km_policy_expired); + +#ifdef CONFIG_XFRM_MIGRATE +int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, + const struct xfrm_migrate *m, int num_migrate, + const struct 
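
/*
 * km_query() and km_new_mapping() above walk the global list of registered
 * key managers under a read lock and treat the operation as successful if
 * any single manager accepts it ("we are happy with one success").  The
 * following is only a minimal userspace analogue of that register-and-
 * broadcast pattern; all names are invented and a pthread rwlock stands in
 * for the kernel lock.
 */
#include <pthread.h>
#include <stdio.h>

struct toy_mgr {
        int (*acquire)(int id);          /* returns 0 on success */
        struct toy_mgr *next;
};

static struct toy_mgr *toy_mgr_list;
static pthread_rwlock_t toy_mgr_lock = PTHREAD_RWLOCK_INITIALIZER;

static void toy_register_mgr(struct toy_mgr *m)
{
        pthread_rwlock_wrlock(&toy_mgr_lock);
        m->next = toy_mgr_list;
        toy_mgr_list = m;
        pthread_rwlock_unlock(&toy_mgr_lock);
}

/* Broadcast to every manager; report success if at least one succeeded. */
static int toy_query(int id)
{
        struct toy_mgr *m;
        int err = -1;

        pthread_rwlock_rdlock(&toy_mgr_lock);
        for (m = toy_mgr_list; m; m = m->next)
                if (m->acquire && m->acquire(id) == 0)
                        err = 0;
        pthread_rwlock_unlock(&toy_mgr_lock);
        return err;
}

static int noisy_acquire(int id)
{
        printf("manager saw request %d\n", id);
        return 0;
}

int main(void)
{
        struct toy_mgr m = { .acquire = noisy_acquire };

        toy_register_mgr(&m);
        return toy_query(7);
}
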
xfrm_kmaddress *k) +{ + int err = -EINVAL; + int ret; + struct xfrm_mgr *km; + + read_lock(&xfrm_km_lock); + list_for_each_entry(km, &xfrm_km_list, list) { + if (km->migrate) { + ret = km->migrate(sel, dir, type, m, num_migrate, k); + if (!ret) + err = ret; + } + } + read_unlock(&xfrm_km_lock); + return err; +} +EXPORT_SYMBOL(km_migrate); +#endif + +int km_report(struct net *net, u8 proto, struct xfrm_selector *sel, xfrm_address_t *addr) +{ + int err = -EINVAL; + int ret; + struct xfrm_mgr *km; + + read_lock(&xfrm_km_lock); + list_for_each_entry(km, &xfrm_km_list, list) { + if (km->report) { + ret = km->report(net, proto, sel, addr); + if (!ret) + err = ret; + } + } + read_unlock(&xfrm_km_lock); + return err; +} +EXPORT_SYMBOL(km_report); + +int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen) +{ + int err; + u8 *data; + struct xfrm_mgr *km; + struct xfrm_policy *pol = NULL; + + if (optlen <= 0 || optlen > PAGE_SIZE) + return -EMSGSIZE; + + data = kmalloc(optlen, GFP_KERNEL); + if (!data) + return -ENOMEM; + + err = -EFAULT; + if (copy_from_user(data, optval, optlen)) + goto out; + + err = -EINVAL; + read_lock(&xfrm_km_lock); + list_for_each_entry(km, &xfrm_km_list, list) { + pol = km->compile_policy(sk, optname, data, + optlen, &err); + if (err >= 0) + break; + } + read_unlock(&xfrm_km_lock); + + if (err >= 0) { + xfrm_sk_policy_insert(sk, err, pol); + xfrm_pol_put(pol); + err = 0; + } + +out: + kfree(data); + return err; +} +EXPORT_SYMBOL(xfrm_user_policy); + +int xfrm_register_km(struct xfrm_mgr *km) +{ + write_lock_bh(&xfrm_km_lock); + list_add_tail(&km->list, &xfrm_km_list); + write_unlock_bh(&xfrm_km_lock); + return 0; +} +EXPORT_SYMBOL(xfrm_register_km); + +int xfrm_unregister_km(struct xfrm_mgr *km) +{ + write_lock_bh(&xfrm_km_lock); + list_del(&km->list); + write_unlock_bh(&xfrm_km_lock); + return 0; +} +EXPORT_SYMBOL(xfrm_unregister_km); + +int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo) +{ + int err = 0; + if (unlikely(afinfo == NULL)) + return -EINVAL; + if (unlikely(afinfo->family >= NPROTO)) + return -EAFNOSUPPORT; + write_lock_bh(&xfrm_state_afinfo_lock); + if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL)) + err = -ENOBUFS; + else + xfrm_state_afinfo[afinfo->family] = afinfo; + write_unlock_bh(&xfrm_state_afinfo_lock); + return err; +} +EXPORT_SYMBOL(xfrm_state_register_afinfo); + +int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo) +{ + int err = 0; + if (unlikely(afinfo == NULL)) + return -EINVAL; + if (unlikely(afinfo->family >= NPROTO)) + return -EAFNOSUPPORT; + write_lock_bh(&xfrm_state_afinfo_lock); + if (likely(xfrm_state_afinfo[afinfo->family] != NULL)) { + if (unlikely(xfrm_state_afinfo[afinfo->family] != afinfo)) + err = -EINVAL; + else + xfrm_state_afinfo[afinfo->family] = NULL; + } + write_unlock_bh(&xfrm_state_afinfo_lock); + return err; +} +EXPORT_SYMBOL(xfrm_state_unregister_afinfo); + +static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family) +{ + struct xfrm_state_afinfo *afinfo; + if (unlikely(family >= NPROTO)) + return NULL; + read_lock(&xfrm_state_afinfo_lock); + afinfo = xfrm_state_afinfo[family]; + if (unlikely(!afinfo)) + read_unlock(&xfrm_state_afinfo_lock); + return afinfo; +} + +static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo) + __releases(xfrm_state_afinfo_lock) +{ + read_unlock(&xfrm_state_afinfo_lock); +} + +/* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */ +void xfrm_state_delete_tunnel(struct xfrm_state *x) +{ + if 
(x->tunnel) { + struct xfrm_state *t = x->tunnel; + + if (atomic_read(&t->tunnel_users) == 2) + xfrm_state_delete(t); + atomic_dec(&t->tunnel_users); + xfrm_state_put(t); + x->tunnel = NULL; + } +} +EXPORT_SYMBOL(xfrm_state_delete_tunnel); + +int xfrm_state_mtu(struct xfrm_state *x, int mtu) +{ + int res; + + spin_lock_bh(&x->lock); + if (x->km.state == XFRM_STATE_VALID && + x->type && x->type->get_mtu) + res = x->type->get_mtu(x, mtu); + else + res = mtu - x->props.header_len; + spin_unlock_bh(&x->lock); + return res; +} + +int __xfrm_init_state(struct xfrm_state *x, bool init_replay) +{ + struct xfrm_state_afinfo *afinfo; + struct xfrm_mode *inner_mode; + int family = x->props.family; + int err; + + err = -EAFNOSUPPORT; + afinfo = xfrm_state_get_afinfo(family); + if (!afinfo) + goto error; + + err = 0; + if (afinfo->init_flags) + err = afinfo->init_flags(x); + + xfrm_state_put_afinfo(afinfo); + + if (err) + goto error; + + err = -EPROTONOSUPPORT; + + if (x->sel.family != AF_UNSPEC) { + inner_mode = xfrm_get_mode(x->props.mode, x->sel.family); + if (inner_mode == NULL) + goto error; + + if (!(inner_mode->flags & XFRM_MODE_FLAG_TUNNEL) && + family != x->sel.family) { + xfrm_put_mode(inner_mode); + goto error; + } + + x->inner_mode = inner_mode; + } else { + struct xfrm_mode *inner_mode_iaf; + int iafamily = AF_INET; + + inner_mode = xfrm_get_mode(x->props.mode, x->props.family); + if (inner_mode == NULL) + goto error; + + if (!(inner_mode->flags & XFRM_MODE_FLAG_TUNNEL)) { + xfrm_put_mode(inner_mode); + goto error; + } + x->inner_mode = inner_mode; + + if (x->props.family == AF_INET) + iafamily = AF_INET6; + + inner_mode_iaf = xfrm_get_mode(x->props.mode, iafamily); + if (inner_mode_iaf) { + if (inner_mode_iaf->flags & XFRM_MODE_FLAG_TUNNEL) + x->inner_mode_iaf = inner_mode_iaf; + else + xfrm_put_mode(inner_mode_iaf); + } + } + + x->type = xfrm_get_type(x->id.proto, family); + if (x->type == NULL) + goto error; + + err = x->type->init_state(x); + if (err) + goto error; + + x->outer_mode = xfrm_get_mode(x->props.mode, family); + if (x->outer_mode == NULL) + goto error; + + if (init_replay) { + err = xfrm_init_replay(x); + if (err) + goto error; + } + + x->km.state = XFRM_STATE_VALID; + +error: + return err; +} + +EXPORT_SYMBOL(__xfrm_init_state); + +int xfrm_init_state(struct xfrm_state *x) +{ + return __xfrm_init_state(x, true); +} + +EXPORT_SYMBOL(xfrm_init_state); + +int __net_init xfrm_state_init(struct net *net) +{ + unsigned int sz; + + INIT_LIST_HEAD(&net->xfrm.state_all); + + sz = sizeof(struct hlist_head) * 8; + + net->xfrm.state_bydst = xfrm_hash_alloc(sz); + if (!net->xfrm.state_bydst) + goto out_bydst; + net->xfrm.state_bysrc = xfrm_hash_alloc(sz); + if (!net->xfrm.state_bysrc) + goto out_bysrc; + net->xfrm.state_byspi = xfrm_hash_alloc(sz); + if (!net->xfrm.state_byspi) + goto out_byspi; + net->xfrm.state_hmask = ((sz / sizeof(struct hlist_head)) - 1); + + net->xfrm.state_num = 0; + INIT_WORK(&net->xfrm.state_hash_work, xfrm_hash_resize); + INIT_HLIST_HEAD(&net->xfrm.state_gc_list); + INIT_WORK(&net->xfrm.state_gc_work, xfrm_state_gc_task); + init_waitqueue_head(&net->xfrm.km_waitq); + return 0; + +out_byspi: + xfrm_hash_free(net->xfrm.state_bysrc, sz); +out_bysrc: + xfrm_hash_free(net->xfrm.state_bydst, sz); +out_bydst: + return -ENOMEM; +} + +void xfrm_state_fini(struct net *net) +{ + struct xfrm_audit audit_info; + unsigned int sz; + + flush_work(&net->xfrm.state_hash_work); + audit_info.loginuid = -1; + audit_info.sessionid = -1; + audit_info.secid = 0; + 
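
/*
 * xfrm_state_init() above sizes the per-net bydst/bysrc/byspi tables to a
 * power of two (eight hlist heads initially) and stores
 * state_hmask = nbuckets - 1, so a bucket index can presumably be derived
 * by masking a hash value instead of dividing.  A minimal userspace sketch
 * of that sizing convention, with invented names and a toy hash, might
 * look like this:
 */
#include <stdio.h>
#include <stdlib.h>

struct bucket_node {
        unsigned int key;
        struct bucket_node *next;
};

struct toy_table {
        struct bucket_node **buckets;
        unsigned int hmask;              /* nbuckets - 1, nbuckets a power of two */
};

static int toy_table_init(struct toy_table *t, unsigned int nbuckets)
{
        t->buckets = calloc(nbuckets, sizeof(*t->buckets));
        if (!t->buckets)
                return -1;
        t->hmask = nbuckets - 1;
        return 0;
}

static void toy_table_insert(struct toy_table *t, struct bucket_node *n)
{
        /* cheap multiplicative hash; the mask replaces a modulo */
        unsigned int h = (n->key * 2654435761u) & t->hmask;

        n->next = t->buckets[h];
        t->buckets[h] = n;
}

int main(void)
{
        struct toy_table t;
        struct bucket_node n = { .key = 256 };

        if (toy_table_init(&t, 8))       /* 8 buckets -> hmask == 7 */
                return 1;
        toy_table_insert(&t, &n);
        printf("hmask = %u\n", t.hmask);
        free(t.buckets);
        return 0;
}
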
xfrm_state_flush(net, IPSEC_PROTO_ANY, &audit_info); + flush_work(&net->xfrm.state_gc_work); + + WARN_ON(!list_empty(&net->xfrm.state_all)); + + sz = (net->xfrm.state_hmask + 1) * sizeof(struct hlist_head); + WARN_ON(!hlist_empty(net->xfrm.state_byspi)); + xfrm_hash_free(net->xfrm.state_byspi, sz); + WARN_ON(!hlist_empty(net->xfrm.state_bysrc)); + xfrm_hash_free(net->xfrm.state_bysrc, sz); + WARN_ON(!hlist_empty(net->xfrm.state_bydst)); + xfrm_hash_free(net->xfrm.state_bydst, sz); +} + +#ifdef CONFIG_AUDITSYSCALL +static void xfrm_audit_helper_sainfo(struct xfrm_state *x, + struct audit_buffer *audit_buf) +{ + struct xfrm_sec_ctx *ctx = x->security; + u32 spi = ntohl(x->id.spi); + + if (ctx) + audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s", + ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str); + + switch(x->props.family) { + case AF_INET: + audit_log_format(audit_buf, " src=%pI4 dst=%pI4", + &x->props.saddr.a4, &x->id.daddr.a4); + break; + case AF_INET6: + audit_log_format(audit_buf, " src=%pI6 dst=%pI6", + x->props.saddr.a6, x->id.daddr.a6); + break; + } + + audit_log_format(audit_buf, " spi=%u(0x%x)", spi, spi); +} + +static void xfrm_audit_helper_pktinfo(struct sk_buff *skb, u16 family, + struct audit_buffer *audit_buf) +{ + const struct iphdr *iph4; + const struct ipv6hdr *iph6; + + switch (family) { + case AF_INET: + iph4 = ip_hdr(skb); + audit_log_format(audit_buf, " src=%pI4 dst=%pI4", + &iph4->saddr, &iph4->daddr); + break; + case AF_INET6: + iph6 = ipv6_hdr(skb); + audit_log_format(audit_buf, + " src=%pI6 dst=%pI6 flowlbl=0x%x%02x%02x", + &iph6->saddr,&iph6->daddr, + iph6->flow_lbl[0] & 0x0f, + iph6->flow_lbl[1], + iph6->flow_lbl[2]); + break; + } +} + +void xfrm_audit_state_add(struct xfrm_state *x, int result, + uid_t auid, u32 sessionid, u32 secid) +{ + struct audit_buffer *audit_buf; + + audit_buf = xfrm_audit_start("SAD-add"); + if (audit_buf == NULL) + return; + xfrm_audit_helper_usrinfo(auid, sessionid, secid, audit_buf); + xfrm_audit_helper_sainfo(x, audit_buf); + audit_log_format(audit_buf, " res=%u", result); + audit_log_end(audit_buf); +} +EXPORT_SYMBOL_GPL(xfrm_audit_state_add); + +void xfrm_audit_state_delete(struct xfrm_state *x, int result, + uid_t auid, u32 sessionid, u32 secid) +{ + struct audit_buffer *audit_buf; + + audit_buf = xfrm_audit_start("SAD-delete"); + if (audit_buf == NULL) + return; + xfrm_audit_helper_usrinfo(auid, sessionid, secid, audit_buf); + xfrm_audit_helper_sainfo(x, audit_buf); + audit_log_format(audit_buf, " res=%u", result); + audit_log_end(audit_buf); +} +EXPORT_SYMBOL_GPL(xfrm_audit_state_delete); + +void xfrm_audit_state_replay_overflow(struct xfrm_state *x, + struct sk_buff *skb) +{ + struct audit_buffer *audit_buf; + u32 spi; + + audit_buf = xfrm_audit_start("SA-replay-overflow"); + if (audit_buf == NULL) + return; + xfrm_audit_helper_pktinfo(skb, x->props.family, audit_buf); + /* don't record the sequence number because it's inherent in this kind + * of audit message */ + spi = ntohl(x->id.spi); + audit_log_format(audit_buf, " spi=%u(0x%x)", spi, spi); + audit_log_end(audit_buf); +} +EXPORT_SYMBOL_GPL(xfrm_audit_state_replay_overflow); + +void xfrm_audit_state_replay(struct xfrm_state *x, + struct sk_buff *skb, __be32 net_seq) +{ + struct audit_buffer *audit_buf; + u32 spi; + + audit_buf = xfrm_audit_start("SA-replayed-pkt"); + if (audit_buf == NULL) + return; + xfrm_audit_helper_pktinfo(skb, x->props.family, audit_buf); + spi = ntohl(x->id.spi); + audit_log_format(audit_buf, " spi=%u(0x%x) seqno=%u", + spi, spi, 
ntohl(net_seq)); + audit_log_end(audit_buf); +} +EXPORT_SYMBOL_GPL(xfrm_audit_state_replay); + +void xfrm_audit_state_notfound_simple(struct sk_buff *skb, u16 family) +{ + struct audit_buffer *audit_buf; + + audit_buf = xfrm_audit_start("SA-notfound"); + if (audit_buf == NULL) + return; + xfrm_audit_helper_pktinfo(skb, family, audit_buf); + audit_log_end(audit_buf); +} +EXPORT_SYMBOL_GPL(xfrm_audit_state_notfound_simple); + +void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family, + __be32 net_spi, __be32 net_seq) +{ + struct audit_buffer *audit_buf; + u32 spi; + + audit_buf = xfrm_audit_start("SA-notfound"); + if (audit_buf == NULL) + return; + xfrm_audit_helper_pktinfo(skb, family, audit_buf); + spi = ntohl(net_spi); + audit_log_format(audit_buf, " spi=%u(0x%x) seqno=%u", + spi, spi, ntohl(net_seq)); + audit_log_end(audit_buf); +} +EXPORT_SYMBOL_GPL(xfrm_audit_state_notfound); + +void xfrm_audit_state_icvfail(struct xfrm_state *x, + struct sk_buff *skb, u8 proto) +{ + struct audit_buffer *audit_buf; + __be32 net_spi; + __be32 net_seq; + + audit_buf = xfrm_audit_start("SA-icv-failure"); + if (audit_buf == NULL) + return; + xfrm_audit_helper_pktinfo(skb, x->props.family, audit_buf); + if (xfrm_parse_spi(skb, proto, &net_spi, &net_seq) == 0) { + u32 spi = ntohl(net_spi); + audit_log_format(audit_buf, " spi=%u(0x%x) seqno=%u", + spi, spi, ntohl(net_seq)); + } + audit_log_end(audit_buf); +} +EXPORT_SYMBOL_GPL(xfrm_audit_state_icvfail); +#endif /* CONFIG_AUDITSYSCALL */ diff --git a/net/xfrm/xfrm_sysctl.c b/net/xfrm/xfrm_sysctl.c new file mode 100644 index 00000000..05640bc9 --- /dev/null +++ b/net/xfrm/xfrm_sysctl.c @@ -0,0 +1,82 @@ +#include +#include +#include +#include + +static void __net_init __xfrm_sysctl_init(struct net *net) +{ + net->xfrm.sysctl_aevent_etime = XFRM_AE_ETIME; + net->xfrm.sysctl_aevent_rseqth = XFRM_AE_SEQT_SIZE; + net->xfrm.sysctl_larval_drop = 1; + net->xfrm.sysctl_acq_expires = 30; +} + +#ifdef CONFIG_SYSCTL +static struct ctl_table xfrm_table[] = { + { + .procname = "xfrm_aevent_etime", + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "xfrm_aevent_rseqth", + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "xfrm_larval_drop", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "xfrm_acq_expires", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + {} +}; + +int __net_init xfrm_sysctl_init(struct net *net) +{ + struct ctl_table *table; + + __xfrm_sysctl_init(net); + + table = kmemdup(xfrm_table, sizeof(xfrm_table), GFP_KERNEL); + if (!table) + goto out_kmemdup; + table[0].data = &net->xfrm.sysctl_aevent_etime; + table[1].data = &net->xfrm.sysctl_aevent_rseqth; + table[2].data = &net->xfrm.sysctl_larval_drop; + table[3].data = &net->xfrm.sysctl_acq_expires; + + net->xfrm.sysctl_hdr = register_net_sysctl_table(net, net_core_path, table); + if (!net->xfrm.sysctl_hdr) + goto out_register; + return 0; + +out_register: + kfree(table); +out_kmemdup: + return -ENOMEM; +} + +void __net_exit xfrm_sysctl_fini(struct net *net) +{ + struct ctl_table *table; + + table = net->xfrm.sysctl_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->xfrm.sysctl_hdr); + kfree(table); +} +#else +int __net_init xfrm_sysctl_init(struct net *net) +{ + __xfrm_sysctl_init(net); + return 0; +} +#endif diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c new file mode 100644 index 00000000..c658cb3b --- /dev/null +++ 
b/net/xfrm/xfrm_user.c @@ -0,0 +1,2973 @@ +/* xfrm_user.c: User interface to configure xfrm engine. + * + * Copyright (C) 2002 David S. Miller (davem@redhat.com) + * + * Changes: + * Mitsuru KANDA @USAGI + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro + * IPv6 support + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#include +#endif + +static inline int aead_len(struct xfrm_algo_aead *alg) +{ + return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); +} + +static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type) +{ + struct nlattr *rt = attrs[type]; + struct xfrm_algo *algp; + + if (!rt) + return 0; + + algp = nla_data(rt); + if (nla_len(rt) < xfrm_alg_len(algp)) + return -EINVAL; + + switch (type) { + case XFRMA_ALG_AUTH: + case XFRMA_ALG_CRYPT: + case XFRMA_ALG_COMP: + break; + + default: + return -EINVAL; + } + + algp->alg_name[CRYPTO_MAX_ALG_NAME - 1] = '\0'; + return 0; +} + +static int verify_auth_trunc(struct nlattr **attrs) +{ + struct nlattr *rt = attrs[XFRMA_ALG_AUTH_TRUNC]; + struct xfrm_algo_auth *algp; + + if (!rt) + return 0; + + algp = nla_data(rt); + if (nla_len(rt) < xfrm_alg_auth_len(algp)) + return -EINVAL; + + algp->alg_name[CRYPTO_MAX_ALG_NAME - 1] = '\0'; + return 0; +} + +static int verify_aead(struct nlattr **attrs) +{ + struct nlattr *rt = attrs[XFRMA_ALG_AEAD]; + struct xfrm_algo_aead *algp; + + if (!rt) + return 0; + + algp = nla_data(rt); + if (nla_len(rt) < aead_len(algp)) + return -EINVAL; + + algp->alg_name[CRYPTO_MAX_ALG_NAME - 1] = '\0'; + return 0; +} + +static void verify_one_addr(struct nlattr **attrs, enum xfrm_attr_type_t type, + xfrm_address_t **addrp) +{ + struct nlattr *rt = attrs[type]; + + if (rt && addrp) + *addrp = nla_data(rt); +} + +static inline int verify_sec_ctx_len(struct nlattr **attrs) +{ + struct nlattr *rt = attrs[XFRMA_SEC_CTX]; + struct xfrm_user_sec_ctx *uctx; + + if (!rt) + return 0; + + uctx = nla_data(rt); + if (uctx->len != (sizeof(struct xfrm_user_sec_ctx) + uctx->ctx_len)) + return -EINVAL; + + return 0; +} + +static inline int verify_replay(struct xfrm_usersa_info *p, + struct nlattr **attrs) +{ + struct nlattr *rt = attrs[XFRMA_REPLAY_ESN_VAL]; + + if ((p->flags & XFRM_STATE_ESN) && !rt) + return -EINVAL; + + if (!rt) + return 0; + + if (p->id.proto != IPPROTO_ESP) + return -EINVAL; + + if (p->replay_window != 0) + return -EINVAL; + + return 0; +} + +static int verify_newsa_info(struct xfrm_usersa_info *p, + struct nlattr **attrs) +{ + int err; + + err = -EINVAL; + switch (p->family) { + case AF_INET: + break; + + case AF_INET6: +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + break; +#else + err = -EAFNOSUPPORT; + goto out; +#endif + + default: + goto out; + } + + err = -EINVAL; + switch (p->id.proto) { + case IPPROTO_AH: + if ((!attrs[XFRMA_ALG_AUTH] && + !attrs[XFRMA_ALG_AUTH_TRUNC]) || + attrs[XFRMA_ALG_AEAD] || + attrs[XFRMA_ALG_CRYPT] || + attrs[XFRMA_ALG_COMP] || + attrs[XFRMA_TFCPAD]) + goto out; + break; + + case IPPROTO_ESP: + if (attrs[XFRMA_ALG_COMP]) + goto out; + if (!attrs[XFRMA_ALG_AUTH] && + !attrs[XFRMA_ALG_AUTH_TRUNC] && + !attrs[XFRMA_ALG_CRYPT] && + !attrs[XFRMA_ALG_AEAD]) + goto out; + if ((attrs[XFRMA_ALG_AUTH] || + attrs[XFRMA_ALG_AUTH_TRUNC] || + attrs[XFRMA_ALG_CRYPT]) && + attrs[XFRMA_ALG_AEAD]) + goto out; + if (attrs[XFRMA_TFCPAD] && + p->mode != XFRM_MODE_TUNNEL) + goto 
out; + break; + + case IPPROTO_COMP: + if (!attrs[XFRMA_ALG_COMP] || + attrs[XFRMA_ALG_AEAD] || + attrs[XFRMA_ALG_AUTH] || + attrs[XFRMA_ALG_AUTH_TRUNC] || + attrs[XFRMA_ALG_CRYPT] || + attrs[XFRMA_TFCPAD]) + goto out; + break; + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case IPPROTO_DSTOPTS: + case IPPROTO_ROUTING: + if (attrs[XFRMA_ALG_COMP] || + attrs[XFRMA_ALG_AUTH] || + attrs[XFRMA_ALG_AUTH_TRUNC] || + attrs[XFRMA_ALG_AEAD] || + attrs[XFRMA_ALG_CRYPT] || + attrs[XFRMA_ENCAP] || + attrs[XFRMA_SEC_CTX] || + attrs[XFRMA_TFCPAD] || + !attrs[XFRMA_COADDR]) + goto out; + break; +#endif + + default: + goto out; + } + + if ((err = verify_aead(attrs))) + goto out; + if ((err = verify_auth_trunc(attrs))) + goto out; + if ((err = verify_one_alg(attrs, XFRMA_ALG_AUTH))) + goto out; + if ((err = verify_one_alg(attrs, XFRMA_ALG_CRYPT))) + goto out; + if ((err = verify_one_alg(attrs, XFRMA_ALG_COMP))) + goto out; + if ((err = verify_sec_ctx_len(attrs))) + goto out; + if ((err = verify_replay(p, attrs))) + goto out; + + err = -EINVAL; + switch (p->mode) { + case XFRM_MODE_TRANSPORT: + case XFRM_MODE_TUNNEL: + case XFRM_MODE_ROUTEOPTIMIZATION: + case XFRM_MODE_BEET: + break; + + default: + goto out; + } + + err = 0; + +out: + return err; +} + +static int attach_one_algo(struct xfrm_algo **algpp, u8 *props, + struct xfrm_algo_desc *(*get_byname)(const char *, int), + struct nlattr *rta) +{ + struct xfrm_algo *p, *ualg; + struct xfrm_algo_desc *algo; + + if (!rta) + return 0; + + ualg = nla_data(rta); + + algo = get_byname(ualg->alg_name, 1); + if (!algo) + return -ENOSYS; + *props = algo->desc.sadb_alg_id; + + p = kmemdup(ualg, xfrm_alg_len(ualg), GFP_KERNEL); + if (!p) + return -ENOMEM; + + strcpy(p->alg_name, algo->name); + *algpp = p; + return 0; +} + +static int attach_auth(struct xfrm_algo_auth **algpp, u8 *props, + struct nlattr *rta) +{ + struct xfrm_algo *ualg; + struct xfrm_algo_auth *p; + struct xfrm_algo_desc *algo; + + if (!rta) + return 0; + + ualg = nla_data(rta); + + algo = xfrm_aalg_get_byname(ualg->alg_name, 1); + if (!algo) + return -ENOSYS; + *props = algo->desc.sadb_alg_id; + + p = kmalloc(sizeof(*p) + (ualg->alg_key_len + 7) / 8, GFP_KERNEL); + if (!p) + return -ENOMEM; + + strcpy(p->alg_name, algo->name); + p->alg_key_len = ualg->alg_key_len; + p->alg_trunc_len = algo->uinfo.auth.icv_truncbits; + memcpy(p->alg_key, ualg->alg_key, (ualg->alg_key_len + 7) / 8); + + *algpp = p; + return 0; +} + +static int attach_auth_trunc(struct xfrm_algo_auth **algpp, u8 *props, + struct nlattr *rta) +{ + struct xfrm_algo_auth *p, *ualg; + struct xfrm_algo_desc *algo; + + if (!rta) + return 0; + + ualg = nla_data(rta); + + algo = xfrm_aalg_get_byname(ualg->alg_name, 1); + if (!algo) + return -ENOSYS; + if ((ualg->alg_trunc_len / 8) > MAX_AH_AUTH_LEN || + ualg->alg_trunc_len > algo->uinfo.auth.icv_fullbits) + return -EINVAL; + *props = algo->desc.sadb_alg_id; + + p = kmemdup(ualg, xfrm_alg_auth_len(ualg), GFP_KERNEL); + if (!p) + return -ENOMEM; + + strcpy(p->alg_name, algo->name); + if (!p->alg_trunc_len) + p->alg_trunc_len = algo->uinfo.auth.icv_truncbits; + + *algpp = p; + return 0; +} + +static int attach_aead(struct xfrm_algo_aead **algpp, u8 *props, + struct nlattr *rta) +{ + struct xfrm_algo_aead *p, *ualg; + struct xfrm_algo_desc *algo; + + if (!rta) + return 0; + + ualg = nla_data(rta); + + algo = xfrm_aead_get_byname(ualg->alg_name, ualg->alg_icv_len, 1); + if (!algo) + return -ENOSYS; + *props = algo->desc.sadb_alg_id; + + p = kmemdup(ualg, aead_len(ualg), GFP_KERNEL); + 
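
/*
 * verify_one_alg()/verify_auth_trunc()/attach_*() above share one pattern:
 * check that the netlink attribute is at least as long as the algorithm
 * header plus the key it claims to carry, force NUL-termination of the
 * name, and only then duplicate it into the state.  Below is a rough,
 * stand-alone sketch of that "validate the length before trusting the
 * payload" step; the types and names are invented, not the kernel ones.
 */
#include <stdlib.h>
#include <string.h>

#define TOY_NAME_MAX 64

struct toy_algo {                        /* stand-in for a keyed algorithm blob */
        char name[TOY_NAME_MAX];
        unsigned int key_bits;
        unsigned char key[];             /* (key_bits + 7) / 8 bytes follow */
};

static size_t toy_algo_len(const struct toy_algo *a)
{
        return sizeof(*a) + (a->key_bits + 7) / 8;
}

/* Returns a private copy, or NULL if the attribute lies about its size. */
static struct toy_algo *toy_attach_algo(const void *attr, size_t attr_len)
{
        const struct toy_algo *ua = attr;
        struct toy_algo *p;

        if (attr_len < sizeof(*ua) || attr_len < toy_algo_len(ua))
                return NULL;             /* truncated or inconsistent */

        p = malloc(toy_algo_len(ua));
        if (!p)
                return NULL;
        memcpy(p, ua, toy_algo_len(ua));
        p->name[TOY_NAME_MAX - 1] = '\0'; /* never trust user-supplied strings */
        return p;
}

int main(void)
{
        size_t len = sizeof(struct toy_algo) + 16;   /* room for a 128-bit key */
        struct toy_algo *ua = calloc(1, len);
        struct toy_algo *copy;
        int ok;

        if (!ua)
                return 1;
        strcpy(ua->name, "hmac(sha1)");
        ua->key_bits = 128;
        copy = toy_attach_algo(ua, len);
        ok = copy != NULL;
        free(copy);
        free(ua);
        return ok ? 0 : 1;
}
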
if (!p) + return -ENOMEM; + + strcpy(p->alg_name, algo->name); + *algpp = p; + return 0; +} + +static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_esn, + struct nlattr *rp) +{ + struct xfrm_replay_state_esn *up; + + if (!replay_esn || !rp) + return 0; + + up = nla_data(rp); + + if (xfrm_replay_state_esn_len(replay_esn) != + xfrm_replay_state_esn_len(up)) + return -EINVAL; + + return 0; +} + +static int xfrm_alloc_replay_state_esn(struct xfrm_replay_state_esn **replay_esn, + struct xfrm_replay_state_esn **preplay_esn, + struct nlattr *rta) +{ + struct xfrm_replay_state_esn *p, *pp, *up; + + if (!rta) + return 0; + + up = nla_data(rta); + + p = kmemdup(up, xfrm_replay_state_esn_len(up), GFP_KERNEL); + if (!p) + return -ENOMEM; + + pp = kmemdup(up, xfrm_replay_state_esn_len(up), GFP_KERNEL); + if (!pp) { + kfree(p); + return -ENOMEM; + } + + *replay_esn = p; + *preplay_esn = pp; + + return 0; +} + +static inline int xfrm_user_sec_ctx_size(struct xfrm_sec_ctx *xfrm_ctx) +{ + int len = 0; + + if (xfrm_ctx) { + len += sizeof(struct xfrm_user_sec_ctx); + len += xfrm_ctx->ctx_len; + } + return len; +} + +static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p) +{ + memcpy(&x->id, &p->id, sizeof(x->id)); + memcpy(&x->sel, &p->sel, sizeof(x->sel)); + memcpy(&x->lft, &p->lft, sizeof(x->lft)); + x->props.mode = p->mode; + x->props.replay_window = p->replay_window; + x->props.reqid = p->reqid; + x->props.family = p->family; + memcpy(&x->props.saddr, &p->saddr, sizeof(x->props.saddr)); + x->props.flags = p->flags; + + if (!x->sel.family && !(p->flags & XFRM_STATE_AF_UNSPEC)) + x->sel.family = p->family; +} + +/* + * someday when pfkey also has support, we could have the code + * somehow made shareable and move it to xfrm_state.c - JHS + * +*/ +static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs) +{ + struct nlattr *rp = attrs[XFRMA_REPLAY_VAL]; + struct nlattr *re = attrs[XFRMA_REPLAY_ESN_VAL]; + struct nlattr *lt = attrs[XFRMA_LTIME_VAL]; + struct nlattr *et = attrs[XFRMA_ETIMER_THRESH]; + struct nlattr *rt = attrs[XFRMA_REPLAY_THRESH]; + + if (re) { + struct xfrm_replay_state_esn *replay_esn; + replay_esn = nla_data(re); + memcpy(x->replay_esn, replay_esn, + xfrm_replay_state_esn_len(replay_esn)); + memcpy(x->preplay_esn, replay_esn, + xfrm_replay_state_esn_len(replay_esn)); + } + + if (rp) { + struct xfrm_replay_state *replay; + replay = nla_data(rp); + memcpy(&x->replay, replay, sizeof(*replay)); + memcpy(&x->preplay, replay, sizeof(*replay)); + } + + if (lt) { + struct xfrm_lifetime_cur *ltime; + ltime = nla_data(lt); + x->curlft.bytes = ltime->bytes; + x->curlft.packets = ltime->packets; + x->curlft.add_time = ltime->add_time; + x->curlft.use_time = ltime->use_time; + } + + if (et) + x->replay_maxage = nla_get_u32(et); + + if (rt) + x->replay_maxdiff = nla_get_u32(rt); +} + +static struct xfrm_state *xfrm_state_construct(struct net *net, + struct xfrm_usersa_info *p, + struct nlattr **attrs, + int *errp) +{ + struct xfrm_state *x = xfrm_state_alloc(net); + int err = -ENOMEM; + + if (!x) + goto error_no_put; + + copy_from_user_state(x, p); + + if ((err = attach_aead(&x->aead, &x->props.ealgo, + attrs[XFRMA_ALG_AEAD]))) + goto error; + if ((err = attach_auth_trunc(&x->aalg, &x->props.aalgo, + attrs[XFRMA_ALG_AUTH_TRUNC]))) + goto error; + if (!x->props.aalgo) { + if ((err = attach_auth(&x->aalg, &x->props.aalgo, + attrs[XFRMA_ALG_AUTH]))) + goto error; + } + if ((err = attach_one_algo(&x->ealg, &x->props.ealgo, + 
xfrm_ealg_get_byname, + attrs[XFRMA_ALG_CRYPT]))) + goto error; + if ((err = attach_one_algo(&x->calg, &x->props.calgo, + xfrm_calg_get_byname, + attrs[XFRMA_ALG_COMP]))) + goto error; + + if (attrs[XFRMA_ENCAP]) { + x->encap = kmemdup(nla_data(attrs[XFRMA_ENCAP]), + sizeof(*x->encap), GFP_KERNEL); + if (x->encap == NULL) + goto error; + } + + if (attrs[XFRMA_TFCPAD]) + x->tfcpad = nla_get_u32(attrs[XFRMA_TFCPAD]); + + if (attrs[XFRMA_COADDR]) { + x->coaddr = kmemdup(nla_data(attrs[XFRMA_COADDR]), + sizeof(*x->coaddr), GFP_KERNEL); + if (x->coaddr == NULL) + goto error; + } + + xfrm_mark_get(attrs, &x->mark); + + err = __xfrm_init_state(x, false); + if (err) + goto error; + + if (attrs[XFRMA_SEC_CTX] && + security_xfrm_state_alloc(x, nla_data(attrs[XFRMA_SEC_CTX]))) + goto error; + + if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn, + attrs[XFRMA_REPLAY_ESN_VAL]))) + goto error; + + x->km.seq = p->seq; + x->replay_maxdiff = net->xfrm.sysctl_aevent_rseqth; + /* sysctl_xfrm_aevent_etime is in 100ms units */ + x->replay_maxage = (net->xfrm.sysctl_aevent_etime*HZ)/XFRM_AE_ETH_M; + + if ((err = xfrm_init_replay(x))) + goto error; + + /* override default values from above */ + xfrm_update_ae_params(x, attrs); + + return x; + +error: + x->km.state = XFRM_STATE_DEAD; + xfrm_state_put(x); +error_no_put: + *errp = err; + return NULL; +} + +static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct net *net = sock_net(skb->sk); + struct xfrm_usersa_info *p = nlmsg_data(nlh); + struct xfrm_state *x; + int err; + struct km_event c; + uid_t loginuid = audit_get_loginuid(current); + u32 sessionid = audit_get_sessionid(current); + u32 sid; + + err = verify_newsa_info(p, attrs); + if (err) + return err; + + x = xfrm_state_construct(net, p, attrs, &err); + if (!x) + return err; + + xfrm_state_hold(x); + if (nlh->nlmsg_type == XFRM_MSG_NEWSA) + err = xfrm_state_add(x); + else + err = xfrm_state_update(x); + + security_task_getsecid(current, &sid); + xfrm_audit_state_add(x, err ? 
0 : 1, loginuid, sessionid, sid); + + if (err < 0) { + x->km.state = XFRM_STATE_DEAD; + __xfrm_state_put(x); + goto out; + } + + c.seq = nlh->nlmsg_seq; + c.pid = nlh->nlmsg_pid; + c.event = nlh->nlmsg_type; + + km_state_notify(x, &c); +out: + xfrm_state_put(x); + return err; +} + +static struct xfrm_state *xfrm_user_state_lookup(struct net *net, + struct xfrm_usersa_id *p, + struct nlattr **attrs, + int *errp) +{ + struct xfrm_state *x = NULL; + struct xfrm_mark m; + int err; + u32 mark = xfrm_mark_get(attrs, &m); + + if (xfrm_id_proto_match(p->proto, IPSEC_PROTO_ANY)) { + err = -ESRCH; + x = xfrm_state_lookup(net, mark, &p->daddr, p->spi, p->proto, p->family); + } else { + xfrm_address_t *saddr = NULL; + + verify_one_addr(attrs, XFRMA_SRCADDR, &saddr); + if (!saddr) { + err = -EINVAL; + goto out; + } + + err = -ESRCH; + x = xfrm_state_lookup_byaddr(net, mark, + &p->daddr, saddr, + p->proto, p->family); + } + + out: + if (!x && errp) + *errp = err; + return x; +} + +static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct net *net = sock_net(skb->sk); + struct xfrm_state *x; + int err = -ESRCH; + struct km_event c; + struct xfrm_usersa_id *p = nlmsg_data(nlh); + uid_t loginuid = audit_get_loginuid(current); + u32 sessionid = audit_get_sessionid(current); + u32 sid; + + x = xfrm_user_state_lookup(net, p, attrs, &err); + if (x == NULL) + return err; + + if ((err = security_xfrm_state_delete(x)) != 0) + goto out; + + if (xfrm_state_kern(x)) { + err = -EPERM; + goto out; + } + + err = xfrm_state_delete(x); + + if (err < 0) + goto out; + + c.seq = nlh->nlmsg_seq; + c.pid = nlh->nlmsg_pid; + c.event = nlh->nlmsg_type; + km_state_notify(x, &c); + +out: + security_task_getsecid(current, &sid); + xfrm_audit_state_delete(x, err ? 
0 : 1, loginuid, sessionid, sid); + xfrm_state_put(x); + return err; +} + +static void copy_to_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p) +{ + memcpy(&p->id, &x->id, sizeof(p->id)); + memcpy(&p->sel, &x->sel, sizeof(p->sel)); + memcpy(&p->lft, &x->lft, sizeof(p->lft)); + memcpy(&p->curlft, &x->curlft, sizeof(p->curlft)); + memcpy(&p->stats, &x->stats, sizeof(p->stats)); + memcpy(&p->saddr, &x->props.saddr, sizeof(p->saddr)); + p->mode = x->props.mode; + p->replay_window = x->props.replay_window; + p->reqid = x->props.reqid; + p->family = x->props.family; + p->flags = x->props.flags; + p->seq = x->km.seq; +} + +struct xfrm_dump_info { + struct sk_buff *in_skb; + struct sk_buff *out_skb; + u32 nlmsg_seq; + u16 nlmsg_flags; +}; + +static int copy_sec_ctx(struct xfrm_sec_ctx *s, struct sk_buff *skb) +{ + struct xfrm_user_sec_ctx *uctx; + struct nlattr *attr; + int ctx_size = sizeof(*uctx) + s->ctx_len; + + attr = nla_reserve(skb, XFRMA_SEC_CTX, ctx_size); + if (attr == NULL) + return -EMSGSIZE; + + uctx = nla_data(attr); + uctx->exttype = XFRMA_SEC_CTX; + uctx->len = ctx_size; + uctx->ctx_doi = s->ctx_doi; + uctx->ctx_alg = s->ctx_alg; + uctx->ctx_len = s->ctx_len; + memcpy(uctx + 1, s->ctx_str, s->ctx_len); + + return 0; +} + +static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb) +{ + struct xfrm_algo *algo; + struct nlattr *nla; + + nla = nla_reserve(skb, XFRMA_ALG_AUTH, + sizeof(*algo) + (auth->alg_key_len + 7) / 8); + if (!nla) + return -EMSGSIZE; + + algo = nla_data(nla); + strcpy(algo->alg_name, auth->alg_name); + memcpy(algo->alg_key, auth->alg_key, (auth->alg_key_len + 7) / 8); + algo->alg_key_len = auth->alg_key_len; + + return 0; +} + +/* Don't change this without updating xfrm_sa_len! */ +static int copy_to_user_state_extra(struct xfrm_state *x, + struct xfrm_usersa_info *p, + struct sk_buff *skb) +{ + copy_to_user_state(x, p); + + if (x->coaddr) + NLA_PUT(skb, XFRMA_COADDR, sizeof(*x->coaddr), x->coaddr); + + if (x->lastused) + NLA_PUT_U64(skb, XFRMA_LASTUSED, x->lastused); + + if (x->aead) + NLA_PUT(skb, XFRMA_ALG_AEAD, aead_len(x->aead), x->aead); + if (x->aalg) { + if (copy_to_user_auth(x->aalg, skb)) + goto nla_put_failure; + + NLA_PUT(skb, XFRMA_ALG_AUTH_TRUNC, + xfrm_alg_auth_len(x->aalg), x->aalg); + } + if (x->ealg) + NLA_PUT(skb, XFRMA_ALG_CRYPT, xfrm_alg_len(x->ealg), x->ealg); + if (x->calg) + NLA_PUT(skb, XFRMA_ALG_COMP, sizeof(*(x->calg)), x->calg); + + if (x->encap) + NLA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap); + + if (x->tfcpad) + NLA_PUT_U32(skb, XFRMA_TFCPAD, x->tfcpad); + + if (xfrm_mark_put(skb, &x->mark)) + goto nla_put_failure; + + if (x->replay_esn) + NLA_PUT(skb, XFRMA_REPLAY_ESN_VAL, + xfrm_replay_state_esn_len(x->replay_esn), x->replay_esn); + + if (x->security && copy_sec_ctx(x->security, skb) < 0) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int dump_one_state(struct xfrm_state *x, int count, void *ptr) +{ + struct xfrm_dump_info *sp = ptr; + struct sk_buff *in_skb = sp->in_skb; + struct sk_buff *skb = sp->out_skb; + struct xfrm_usersa_info *p; + struct nlmsghdr *nlh; + int err; + + nlh = nlmsg_put(skb, NETLINK_CB(in_skb).pid, sp->nlmsg_seq, + XFRM_MSG_NEWSA, sizeof(*p), sp->nlmsg_flags); + if (nlh == NULL) + return -EMSGSIZE; + + p = nlmsg_data(nlh); + + err = copy_to_user_state_extra(x, p, skb); + if (err) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return err; +} + +static int 
xfrm_dump_sa_done(struct netlink_callback *cb) +{ + struct xfrm_state_walk *walk = (struct xfrm_state_walk *) &cb->args[1]; + xfrm_state_walk_done(walk); + return 0; +} + +static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct xfrm_state_walk *walk = (struct xfrm_state_walk *) &cb->args[1]; + struct xfrm_dump_info info; + + BUILD_BUG_ON(sizeof(struct xfrm_state_walk) > + sizeof(cb->args) - sizeof(cb->args[0])); + + info.in_skb = cb->skb; + info.out_skb = skb; + info.nlmsg_seq = cb->nlh->nlmsg_seq; + info.nlmsg_flags = NLM_F_MULTI; + + if (!cb->args[0]) { + cb->args[0] = 1; + xfrm_state_walk_init(walk, 0); + } + + (void) xfrm_state_walk(net, walk, dump_one_state, &info); + + return skb->len; +} + +static struct sk_buff *xfrm_state_netlink(struct sk_buff *in_skb, + struct xfrm_state *x, u32 seq) +{ + struct xfrm_dump_info info; + struct sk_buff *skb; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!skb) + return ERR_PTR(-ENOMEM); + + info.in_skb = in_skb; + info.out_skb = skb; + info.nlmsg_seq = seq; + info.nlmsg_flags = 0; + + if (dump_one_state(x, 0, &info)) { + kfree_skb(skb); + return NULL; + } + + return skb; +} + +static inline size_t xfrm_spdinfo_msgsize(void) +{ + return NLMSG_ALIGN(4) + + nla_total_size(sizeof(struct xfrmu_spdinfo)) + + nla_total_size(sizeof(struct xfrmu_spdhinfo)); +} + +static int build_spdinfo(struct sk_buff *skb, struct net *net, + u32 pid, u32 seq, u32 flags) +{ + struct xfrmk_spdinfo si; + struct xfrmu_spdinfo spc; + struct xfrmu_spdhinfo sph; + struct nlmsghdr *nlh; + u32 *f; + + nlh = nlmsg_put(skb, pid, seq, XFRM_MSG_NEWSPDINFO, sizeof(u32), 0); + if (nlh == NULL) /* shouldn't really happen ... */ + return -EMSGSIZE; + + f = nlmsg_data(nlh); + *f = flags; + xfrm_spd_getinfo(net, &si); + spc.incnt = si.incnt; + spc.outcnt = si.outcnt; + spc.fwdcnt = si.fwdcnt; + spc.inscnt = si.inscnt; + spc.outscnt = si.outscnt; + spc.fwdscnt = si.fwdscnt; + sph.spdhcnt = si.spdhcnt; + sph.spdhmcnt = si.spdhmcnt; + + NLA_PUT(skb, XFRMA_SPD_INFO, sizeof(spc), &spc); + NLA_PUT(skb, XFRMA_SPD_HINFO, sizeof(sph), &sph); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct net *net = sock_net(skb->sk); + struct sk_buff *r_skb; + u32 *flags = nlmsg_data(nlh); + u32 spid = NETLINK_CB(skb).pid; + u32 seq = nlh->nlmsg_seq; + + r_skb = nlmsg_new(xfrm_spdinfo_msgsize(), GFP_ATOMIC); + if (r_skb == NULL) + return -ENOMEM; + + if (build_spdinfo(r_skb, net, spid, seq, *flags) < 0) + BUG(); + + return nlmsg_unicast(net->xfrm.nlsk, r_skb, spid); +} + +static inline size_t xfrm_sadinfo_msgsize(void) +{ + return NLMSG_ALIGN(4) + + nla_total_size(sizeof(struct xfrmu_sadhinfo)) + + nla_total_size(4); /* XFRMA_SAD_CNT */ +} + +static int build_sadinfo(struct sk_buff *skb, struct net *net, + u32 pid, u32 seq, u32 flags) +{ + struct xfrmk_sadinfo si; + struct xfrmu_sadhinfo sh; + struct nlmsghdr *nlh; + u32 *f; + + nlh = nlmsg_put(skb, pid, seq, XFRM_MSG_NEWSADINFO, sizeof(u32), 0); + if (nlh == NULL) /* shouldn't really happen ... 
*/ + return -EMSGSIZE; + + f = nlmsg_data(nlh); + *f = flags; + xfrm_sad_getinfo(net, &si); + + sh.sadhmcnt = si.sadhmcnt; + sh.sadhcnt = si.sadhcnt; + + NLA_PUT_U32(skb, XFRMA_SAD_CNT, si.sadcnt); + NLA_PUT(skb, XFRMA_SAD_HINFO, sizeof(sh), &sh); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int xfrm_get_sadinfo(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct net *net = sock_net(skb->sk); + struct sk_buff *r_skb; + u32 *flags = nlmsg_data(nlh); + u32 spid = NETLINK_CB(skb).pid; + u32 seq = nlh->nlmsg_seq; + + r_skb = nlmsg_new(xfrm_sadinfo_msgsize(), GFP_ATOMIC); + if (r_skb == NULL) + return -ENOMEM; + + if (build_sadinfo(r_skb, net, spid, seq, *flags) < 0) + BUG(); + + return nlmsg_unicast(net->xfrm.nlsk, r_skb, spid); +} + +static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct net *net = sock_net(skb->sk); + struct xfrm_usersa_id *p = nlmsg_data(nlh); + struct xfrm_state *x; + struct sk_buff *resp_skb; + int err = -ESRCH; + + x = xfrm_user_state_lookup(net, p, attrs, &err); + if (x == NULL) + goto out_noput; + + resp_skb = xfrm_state_netlink(skb, x, nlh->nlmsg_seq); + if (IS_ERR(resp_skb)) { + err = PTR_ERR(resp_skb); + } else { + err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).pid); + } + xfrm_state_put(x); +out_noput: + return err; +} + +static int verify_userspi_info(struct xfrm_userspi_info *p) +{ + switch (p->info.id.proto) { + case IPPROTO_AH: + case IPPROTO_ESP: + break; + + case IPPROTO_COMP: + /* IPCOMP spi is 16-bits. */ + if (p->max >= 0x10000) + return -EINVAL; + break; + + default: + return -EINVAL; + } + + if (p->min > p->max) + return -EINVAL; + + return 0; +} + +static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct net *net = sock_net(skb->sk); + struct xfrm_state *x; + struct xfrm_userspi_info *p; + struct sk_buff *resp_skb; + xfrm_address_t *daddr; + int family; + int err; + u32 mark; + struct xfrm_mark m; + + p = nlmsg_data(nlh); + err = verify_userspi_info(p); + if (err) + goto out_noput; + + family = p->info.family; + daddr = &p->info.id.daddr; + + x = NULL; + + mark = xfrm_mark_get(attrs, &m); + if (p->info.seq) { + x = xfrm_find_acq_byseq(net, mark, p->info.seq); + if (x && xfrm_addr_cmp(&x->id.daddr, daddr, family)) { + xfrm_state_put(x); + x = NULL; + } + } + + if (!x) + x = xfrm_find_acq(net, &m, p->info.mode, p->info.reqid, + p->info.id.proto, daddr, + &p->info.saddr, 1, + family); + err = -ENOENT; + if (x == NULL) + goto out_noput; + + err = xfrm_alloc_spi(x, p->min, p->max); + if (err) + goto out; + + resp_skb = xfrm_state_netlink(skb, x, nlh->nlmsg_seq); + if (IS_ERR(resp_skb)) { + err = PTR_ERR(resp_skb); + goto out; + } + + err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).pid); + +out: + xfrm_state_put(x); +out_noput: + return err; +} + +static int verify_policy_dir(u8 dir) +{ + switch (dir) { + case XFRM_POLICY_IN: + case XFRM_POLICY_OUT: + case XFRM_POLICY_FWD: + break; + + default: + return -EINVAL; + } + + return 0; +} + +static int verify_policy_type(u8 type) +{ + switch (type) { + case XFRM_POLICY_TYPE_MAIN: +#ifdef CONFIG_XFRM_SUB_POLICY + case XFRM_POLICY_TYPE_SUB: +#endif + break; + + default: + return -EINVAL; + } + + return 0; +} + +static int verify_newpolicy_info(struct xfrm_userpolicy_info *p) +{ + switch (p->share) { + case XFRM_SHARE_ANY: + case XFRM_SHARE_SESSION: + case XFRM_SHARE_USER: + case XFRM_SHARE_UNIQUE: 
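
/*
 * build_spdinfo()/build_sadinfo() above rely on the *_msgsize() helpers:
 * the reply buffer is sized up front from the attributes that will be
 * emitted, so running out of room while filling it in is treated as a bug
 * (BUG()) rather than a runtime error.  The toy below shows that "compute
 * the size first, then append" TLV pattern in userspace form; the names
 * are invented, and unlike real netlink attributes no 4-byte alignment is
 * applied.
 */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct toy_msg {
        unsigned char *buf;
        size_t len, cap;
};

static size_t toy_attr_size(size_t payload)
{
        return 4 + payload;              /* 16-bit type + 16-bit len + payload */
}

static void toy_put_attr(struct toy_msg *m, uint16_t type,
                         const void *data, uint16_t payload)
{
        /* overflow here means the size estimate was wrong, like the BUG() above */
        assert(m->len + toy_attr_size(payload) <= m->cap);
        memcpy(m->buf + m->len, &type, 2);
        memcpy(m->buf + m->len + 2, &payload, 2);
        memcpy(m->buf + m->len + 4, data, payload);
        m->len += toy_attr_size(payload);
}

int main(void)
{
        uint32_t sadcnt = 42;
        struct toy_msg m = { .len = 0, .cap = toy_attr_size(sizeof(sadcnt)) };

        m.buf = malloc(m.cap);
        if (!m.buf)
                return 1;
        toy_put_attr(&m, 1 /* e.g. an "SA count" attribute */, &sadcnt, sizeof(sadcnt));
        free(m.buf);
        return 0;
}
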
+ break; + + default: + return -EINVAL; + } + + switch (p->action) { + case XFRM_POLICY_ALLOW: + case XFRM_POLICY_BLOCK: + break; + + default: + return -EINVAL; + } + + switch (p->sel.family) { + case AF_INET: + break; + + case AF_INET6: +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + break; +#else + return -EAFNOSUPPORT; +#endif + + default: + return -EINVAL; + } + + return verify_policy_dir(p->dir); +} + +static int copy_from_user_sec_ctx(struct xfrm_policy *pol, struct nlattr **attrs) +{ + struct nlattr *rt = attrs[XFRMA_SEC_CTX]; + struct xfrm_user_sec_ctx *uctx; + + if (!rt) + return 0; + + uctx = nla_data(rt); + return security_xfrm_policy_alloc(&pol->security, uctx); +} + +static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, + int nr) +{ + int i; + + xp->xfrm_nr = nr; + for (i = 0; i < nr; i++, ut++) { + struct xfrm_tmpl *t = &xp->xfrm_vec[i]; + + memcpy(&t->id, &ut->id, sizeof(struct xfrm_id)); + memcpy(&t->saddr, &ut->saddr, + sizeof(xfrm_address_t)); + t->reqid = ut->reqid; + t->mode = ut->mode; + t->share = ut->share; + t->optional = ut->optional; + t->aalgos = ut->aalgos; + t->ealgos = ut->ealgos; + t->calgos = ut->calgos; + /* If all masks are ~0, then we allow all algorithms. */ + t->allalgs = !~(t->aalgos & t->ealgos & t->calgos); + t->encap_family = ut->family; + } +} + +static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family) +{ + int i; + + if (nr > XFRM_MAX_DEPTH) + return -EINVAL; + + for (i = 0; i < nr; i++) { + /* We never validated the ut->family value, so many + * applications simply leave it at zero. The check was + * never made and ut->family was ignored because all + * templates could be assumed to have the same family as + * the policy itself. Now that we will have ipv4-in-ipv6 + * and ipv6-in-ipv4 tunnels, this is no longer true. 
+ */ + if (!ut[i].family) + ut[i].family = family; + + switch (ut[i].family) { + case AF_INET: + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + break; +#endif + default: + return -EINVAL; + } + } + + return 0; +} + +static int copy_from_user_tmpl(struct xfrm_policy *pol, struct nlattr **attrs) +{ + struct nlattr *rt = attrs[XFRMA_TMPL]; + + if (!rt) { + pol->xfrm_nr = 0; + } else { + struct xfrm_user_tmpl *utmpl = nla_data(rt); + int nr = nla_len(rt) / sizeof(*utmpl); + int err; + + err = validate_tmpl(nr, utmpl, pol->family); + if (err) + return err; + + copy_templates(pol, utmpl, nr); + } + return 0; +} + +static int copy_from_user_policy_type(u8 *tp, struct nlattr **attrs) +{ + struct nlattr *rt = attrs[XFRMA_POLICY_TYPE]; + struct xfrm_userpolicy_type *upt; + u8 type = XFRM_POLICY_TYPE_MAIN; + int err; + + if (rt) { + upt = nla_data(rt); + type = upt->type; + } + + err = verify_policy_type(type); + if (err) + return err; + + *tp = type; + return 0; +} + +static void copy_from_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_info *p) +{ + xp->priority = p->priority; + xp->index = p->index; + memcpy(&xp->selector, &p->sel, sizeof(xp->selector)); + memcpy(&xp->lft, &p->lft, sizeof(xp->lft)); + xp->action = p->action; + xp->flags = p->flags; + xp->family = p->sel.family; + /* XXX xp->share = p->share; */ +} + +static void copy_to_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_info *p, int dir) +{ + memcpy(&p->sel, &xp->selector, sizeof(p->sel)); + memcpy(&p->lft, &xp->lft, sizeof(p->lft)); + memcpy(&p->curlft, &xp->curlft, sizeof(p->curlft)); + p->priority = xp->priority; + p->index = xp->index; + p->sel.family = xp->family; + p->dir = dir; + p->action = xp->action; + p->flags = xp->flags; + p->share = XFRM_SHARE_ANY; /* XXX xp->share */ +} + +static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_userpolicy_info *p, struct nlattr **attrs, int *errp) +{ + struct xfrm_policy *xp = xfrm_policy_alloc(net, GFP_KERNEL); + int err; + + if (!xp) { + *errp = -ENOMEM; + return NULL; + } + + copy_from_user_policy(xp, p); + + err = copy_from_user_policy_type(&xp->type, attrs); + if (err) + goto error; + + if (!(err = copy_from_user_tmpl(xp, attrs))) + err = copy_from_user_sec_ctx(xp, attrs); + if (err) + goto error; + + xfrm_mark_get(attrs, &xp->mark); + + return xp; + error: + *errp = err; + xp->walk.dead = 1; + xfrm_policy_destroy(xp); + return NULL; +} + +static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct net *net = sock_net(skb->sk); + struct xfrm_userpolicy_info *p = nlmsg_data(nlh); + struct xfrm_policy *xp; + struct km_event c; + int err; + int excl; + uid_t loginuid = audit_get_loginuid(current); + u32 sessionid = audit_get_sessionid(current); + u32 sid; + + err = verify_newpolicy_info(p); + if (err) + return err; + err = verify_sec_ctx_len(attrs); + if (err) + return err; + + xp = xfrm_policy_construct(net, p, attrs, &err); + if (!xp) + return err; + + /* shouldn't excl be based on nlh flags?? + * Aha! this is anti-netlink really i.e more pfkey derived + * in netlink excl is a flag and you wouldnt need + * a type XFRM_MSG_UPDPOLICY - JHS */ + excl = nlh->nlmsg_type == XFRM_MSG_NEWPOLICY; + err = xfrm_policy_insert(p->dir, xp, excl); + security_task_getsecid(current, &sid); + xfrm_audit_policy_add(xp, err ? 
0 : 1, loginuid, sessionid, sid); + + if (err) { + security_xfrm_policy_free(xp->security); + kfree(xp); + return err; + } + + c.event = nlh->nlmsg_type; + c.seq = nlh->nlmsg_seq; + c.pid = nlh->nlmsg_pid; + km_policy_notify(xp, p->dir, &c); + + xfrm_pol_put(xp); + + return 0; +} + +static int copy_to_user_tmpl(struct xfrm_policy *xp, struct sk_buff *skb) +{ + struct xfrm_user_tmpl vec[XFRM_MAX_DEPTH]; + int i; + + if (xp->xfrm_nr == 0) + return 0; + + for (i = 0; i < xp->xfrm_nr; i++) { + struct xfrm_user_tmpl *up = &vec[i]; + struct xfrm_tmpl *kp = &xp->xfrm_vec[i]; + + memcpy(&up->id, &kp->id, sizeof(up->id)); + up->family = kp->encap_family; + memcpy(&up->saddr, &kp->saddr, sizeof(up->saddr)); + up->reqid = kp->reqid; + up->mode = kp->mode; + up->share = kp->share; + up->optional = kp->optional; + up->aalgos = kp->aalgos; + up->ealgos = kp->ealgos; + up->calgos = kp->calgos; + } + + return nla_put(skb, XFRMA_TMPL, + sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr, vec); +} + +static inline int copy_to_user_state_sec_ctx(struct xfrm_state *x, struct sk_buff *skb) +{ + if (x->security) { + return copy_sec_ctx(x->security, skb); + } + return 0; +} + +static inline int copy_to_user_sec_ctx(struct xfrm_policy *xp, struct sk_buff *skb) +{ + if (xp->security) { + return copy_sec_ctx(xp->security, skb); + } + return 0; +} +static inline size_t userpolicy_type_attrsize(void) +{ +#ifdef CONFIG_XFRM_SUB_POLICY + return nla_total_size(sizeof(struct xfrm_userpolicy_type)); +#else + return 0; +#endif +} + +#ifdef CONFIG_XFRM_SUB_POLICY +static int copy_to_user_policy_type(u8 type, struct sk_buff *skb) +{ + struct xfrm_userpolicy_type upt = { + .type = type, + }; + + return nla_put(skb, XFRMA_POLICY_TYPE, sizeof(upt), &upt); +} + +#else +static inline int copy_to_user_policy_type(u8 type, struct sk_buff *skb) +{ + return 0; +} +#endif + +static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr) +{ + struct xfrm_dump_info *sp = ptr; + struct xfrm_userpolicy_info *p; + struct sk_buff *in_skb = sp->in_skb; + struct sk_buff *skb = sp->out_skb; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, NETLINK_CB(in_skb).pid, sp->nlmsg_seq, + XFRM_MSG_NEWPOLICY, sizeof(*p), sp->nlmsg_flags); + if (nlh == NULL) + return -EMSGSIZE; + + p = nlmsg_data(nlh); + copy_to_user_policy(xp, p, dir); + if (copy_to_user_tmpl(xp, skb) < 0) + goto nlmsg_failure; + if (copy_to_user_sec_ctx(xp, skb)) + goto nlmsg_failure; + if (copy_to_user_policy_type(xp->type, skb) < 0) + goto nlmsg_failure; + if (xfrm_mark_put(skb, &xp->mark)) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: +nlmsg_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int xfrm_dump_policy_done(struct netlink_callback *cb) +{ + struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *) &cb->args[1]; + + xfrm_policy_walk_done(walk); + return 0; +} + +static int xfrm_dump_policy(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *) &cb->args[1]; + struct xfrm_dump_info info; + + BUILD_BUG_ON(sizeof(struct xfrm_policy_walk) > + sizeof(cb->args) - sizeof(cb->args[0])); + + info.in_skb = cb->skb; + info.out_skb = skb; + info.nlmsg_seq = cb->nlh->nlmsg_seq; + info.nlmsg_flags = NLM_F_MULTI; + + if (!cb->args[0]) { + cb->args[0] = 1; + xfrm_policy_walk_init(walk, XFRM_POLICY_TYPE_ANY); + } + + (void) xfrm_policy_walk(net, walk, dump_one_policy, &info); + + return skb->len; +} + +static struct sk_buff 
*xfrm_policy_netlink(struct sk_buff *in_skb, + struct xfrm_policy *xp, + int dir, u32 seq) +{ + struct xfrm_dump_info info; + struct sk_buff *skb; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return ERR_PTR(-ENOMEM); + + info.in_skb = in_skb; + info.out_skb = skb; + info.nlmsg_seq = seq; + info.nlmsg_flags = 0; + + if (dump_one_policy(xp, dir, 0, &info) < 0) { + kfree_skb(skb); + return NULL; + } + + return skb; +} + +static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct net *net = sock_net(skb->sk); + struct xfrm_policy *xp; + struct xfrm_userpolicy_id *p; + u8 type = XFRM_POLICY_TYPE_MAIN; + int err; + struct km_event c; + int delete; + struct xfrm_mark m; + u32 mark = xfrm_mark_get(attrs, &m); + + p = nlmsg_data(nlh); + delete = nlh->nlmsg_type == XFRM_MSG_DELPOLICY; + + err = copy_from_user_policy_type(&type, attrs); + if (err) + return err; + + err = verify_policy_dir(p->dir); + if (err) + return err; + + if (p->index) + xp = xfrm_policy_byid(net, mark, type, p->dir, p->index, delete, &err); + else { + struct nlattr *rt = attrs[XFRMA_SEC_CTX]; + struct xfrm_sec_ctx *ctx; + + err = verify_sec_ctx_len(attrs); + if (err) + return err; + + ctx = NULL; + if (rt) { + struct xfrm_user_sec_ctx *uctx = nla_data(rt); + + err = security_xfrm_policy_alloc(&ctx, uctx); + if (err) + return err; + } + xp = xfrm_policy_bysel_ctx(net, mark, type, p->dir, &p->sel, + ctx, delete, &err); + security_xfrm_policy_free(ctx); + } + if (xp == NULL) + return -ENOENT; + + if (!delete) { + struct sk_buff *resp_skb; + + resp_skb = xfrm_policy_netlink(skb, xp, p->dir, nlh->nlmsg_seq); + if (IS_ERR(resp_skb)) { + err = PTR_ERR(resp_skb); + } else { + err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, + NETLINK_CB(skb).pid); + } + } else { + uid_t loginuid = audit_get_loginuid(current); + u32 sessionid = audit_get_sessionid(current); + u32 sid; + + security_task_getsecid(current, &sid); + xfrm_audit_policy_delete(xp, err ? 0 : 1, loginuid, sessionid, + sid); + + if (err != 0) + goto out; + + c.data.byid = p->index; + c.event = nlh->nlmsg_type; + c.seq = nlh->nlmsg_seq; + c.pid = nlh->nlmsg_pid; + km_policy_notify(xp, p->dir, &c); + } + +out: + xfrm_pol_put(xp); + return err; +} + +static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct net *net = sock_net(skb->sk); + struct km_event c; + struct xfrm_usersa_flush *p = nlmsg_data(nlh); + struct xfrm_audit audit_info; + int err; + + audit_info.loginuid = audit_get_loginuid(current); + audit_info.sessionid = audit_get_sessionid(current); + security_task_getsecid(current, &audit_info.secid); + err = xfrm_state_flush(net, p->proto, &audit_info); + if (err) { + if (err == -ESRCH) /* empty table */ + return 0; + return err; + } + c.data.proto = p->proto; + c.event = nlh->nlmsg_type; + c.seq = nlh->nlmsg_seq; + c.pid = nlh->nlmsg_pid; + c.net = net; + km_state_notify(NULL, &c); + + return 0; +} + +static inline size_t xfrm_aevent_msgsize(struct xfrm_state *x) +{ + size_t replay_size = x->replay_esn ? 
+ xfrm_replay_state_esn_len(x->replay_esn) : + sizeof(struct xfrm_replay_state); + + return NLMSG_ALIGN(sizeof(struct xfrm_aevent_id)) + + nla_total_size(replay_size) + + nla_total_size(sizeof(struct xfrm_lifetime_cur)) + + nla_total_size(sizeof(struct xfrm_mark)) + + nla_total_size(4) /* XFRM_AE_RTHR */ + + nla_total_size(4); /* XFRM_AE_ETHR */ +} + +static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c) +{ + struct xfrm_aevent_id *id; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, c->pid, c->seq, XFRM_MSG_NEWAE, sizeof(*id), 0); + if (nlh == NULL) + return -EMSGSIZE; + + id = nlmsg_data(nlh); + memcpy(&id->sa_id.daddr, &x->id.daddr,sizeof(x->id.daddr)); + id->sa_id.spi = x->id.spi; + id->sa_id.family = x->props.family; + id->sa_id.proto = x->id.proto; + memcpy(&id->saddr, &x->props.saddr,sizeof(x->props.saddr)); + id->reqid = x->props.reqid; + id->flags = c->data.aevent; + + if (x->replay_esn) + NLA_PUT(skb, XFRMA_REPLAY_ESN_VAL, + xfrm_replay_state_esn_len(x->replay_esn), + x->replay_esn); + else + NLA_PUT(skb, XFRMA_REPLAY_VAL, sizeof(x->replay), &x->replay); + + NLA_PUT(skb, XFRMA_LTIME_VAL, sizeof(x->curlft), &x->curlft); + + if (id->flags & XFRM_AE_RTHR) + NLA_PUT_U32(skb, XFRMA_REPLAY_THRESH, x->replay_maxdiff); + + if (id->flags & XFRM_AE_ETHR) + NLA_PUT_U32(skb, XFRMA_ETIMER_THRESH, + x->replay_maxage * 10 / HZ); + + if (xfrm_mark_put(skb, &x->mark)) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int xfrm_get_ae(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct net *net = sock_net(skb->sk); + struct xfrm_state *x; + struct sk_buff *r_skb; + int err; + struct km_event c; + u32 mark; + struct xfrm_mark m; + struct xfrm_aevent_id *p = nlmsg_data(nlh); + struct xfrm_usersa_id *id = &p->sa_id; + + mark = xfrm_mark_get(attrs, &m); + + x = xfrm_state_lookup(net, mark, &id->daddr, id->spi, id->proto, id->family); + if (x == NULL) + return -ESRCH; + + r_skb = nlmsg_new(xfrm_aevent_msgsize(x), GFP_ATOMIC); + if (r_skb == NULL) { + xfrm_state_put(x); + return -ENOMEM; + } + + /* + * XXX: is this lock really needed - none of the other + * gets lock (the concern is things getting updated + * while we are still reading) - jhs + */ + spin_lock_bh(&x->lock); + c.data.aevent = p->flags; + c.seq = nlh->nlmsg_seq; + c.pid = nlh->nlmsg_pid; + + if (build_aevent(r_skb, x, &c) < 0) + BUG(); + err = nlmsg_unicast(net->xfrm.nlsk, r_skb, NETLINK_CB(skb).pid); + spin_unlock_bh(&x->lock); + xfrm_state_put(x); + return err; +} + +static int xfrm_new_ae(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct net *net = sock_net(skb->sk); + struct xfrm_state *x; + struct km_event c; + int err = - EINVAL; + u32 mark = 0; + struct xfrm_mark m; + struct xfrm_aevent_id *p = nlmsg_data(nlh); + struct nlattr *rp = attrs[XFRMA_REPLAY_VAL]; + struct nlattr *re = attrs[XFRMA_REPLAY_ESN_VAL]; + struct nlattr *lt = attrs[XFRMA_LTIME_VAL]; + + if (!lt && !rp && !re) + return err; + + /* pedantic mode - thou shalt sayeth replaceth */ + if (!(nlh->nlmsg_flags&NLM_F_REPLACE)) + return err; + + mark = xfrm_mark_get(attrs, &m); + + x = xfrm_state_lookup(net, mark, &p->sa_id.daddr, p->sa_id.spi, p->sa_id.proto, p->sa_id.family); + if (x == NULL) + return -ESRCH; + + if (x->km.state != XFRM_STATE_VALID) + goto out; + + err = xfrm_replay_verify_len(x->replay_esn, rp); + if (err) + goto out; + + spin_lock_bh(&x->lock); + xfrm_update_ae_params(x, 
attrs); + spin_unlock_bh(&x->lock); + + c.event = nlh->nlmsg_type; + c.seq = nlh->nlmsg_seq; + c.pid = nlh->nlmsg_pid; + c.data.aevent = XFRM_AE_CU; + km_state_notify(x, &c); + err = 0; +out: + xfrm_state_put(x); + return err; +} + +static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct net *net = sock_net(skb->sk); + struct km_event c; + u8 type = XFRM_POLICY_TYPE_MAIN; + int err; + struct xfrm_audit audit_info; + + err = copy_from_user_policy_type(&type, attrs); + if (err) + return err; + + audit_info.loginuid = audit_get_loginuid(current); + audit_info.sessionid = audit_get_sessionid(current); + security_task_getsecid(current, &audit_info.secid); + err = xfrm_policy_flush(net, type, &audit_info); + if (err) { + if (err == -ESRCH) /* empty table */ + return 0; + return err; + } + + c.data.type = type; + c.event = nlh->nlmsg_type; + c.seq = nlh->nlmsg_seq; + c.pid = nlh->nlmsg_pid; + c.net = net; + km_policy_notify(NULL, 0, &c); + return 0; +} + +static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct net *net = sock_net(skb->sk); + struct xfrm_policy *xp; + struct xfrm_user_polexpire *up = nlmsg_data(nlh); + struct xfrm_userpolicy_info *p = &up->pol; + u8 type = XFRM_POLICY_TYPE_MAIN; + int err = -ENOENT; + struct xfrm_mark m; + u32 mark = xfrm_mark_get(attrs, &m); + + err = copy_from_user_policy_type(&type, attrs); + if (err) + return err; + + err = verify_policy_dir(p->dir); + if (err) + return err; + + if (p->index) + xp = xfrm_policy_byid(net, mark, type, p->dir, p->index, 0, &err); + else { + struct nlattr *rt = attrs[XFRMA_SEC_CTX]; + struct xfrm_sec_ctx *ctx; + + err = verify_sec_ctx_len(attrs); + if (err) + return err; + + ctx = NULL; + if (rt) { + struct xfrm_user_sec_ctx *uctx = nla_data(rt); + + err = security_xfrm_policy_alloc(&ctx, uctx); + if (err) + return err; + } + xp = xfrm_policy_bysel_ctx(net, mark, type, p->dir, + &p->sel, ctx, 0, &err); + security_xfrm_policy_free(ctx); + } + if (xp == NULL) + return -ENOENT; + + if (unlikely(xp->walk.dead)) + goto out; + + err = 0; + if (up->hard) { + uid_t loginuid = audit_get_loginuid(current); + u32 sessionid = audit_get_sessionid(current); + u32 sid; + + security_task_getsecid(current, &sid); + xfrm_policy_delete(xp, p->dir); + xfrm_audit_policy_delete(xp, 1, loginuid, sessionid, sid); + + } else { + // reset the timers here? 
+ WARN(1, "Dont know what to do with soft policy expire\n"); + } + km_policy_expired(xp, p->dir, up->hard, current->pid); + +out: + xfrm_pol_put(xp); + return err; +} + +static int xfrm_add_sa_expire(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct net *net = sock_net(skb->sk); + struct xfrm_state *x; + int err; + struct xfrm_user_expire *ue = nlmsg_data(nlh); + struct xfrm_usersa_info *p = &ue->state; + struct xfrm_mark m; + u32 mark = xfrm_mark_get(attrs, &m); + + x = xfrm_state_lookup(net, mark, &p->id.daddr, p->id.spi, p->id.proto, p->family); + + err = -ENOENT; + if (x == NULL) + return err; + + spin_lock_bh(&x->lock); + err = -EINVAL; + if (x->km.state != XFRM_STATE_VALID) + goto out; + km_state_expired(x, ue->hard, current->pid); + + if (ue->hard) { + uid_t loginuid = audit_get_loginuid(current); + u32 sessionid = audit_get_sessionid(current); + u32 sid; + + security_task_getsecid(current, &sid); + __xfrm_state_delete(x); + xfrm_audit_state_delete(x, 1, loginuid, sessionid, sid); + } + err = 0; +out: + spin_unlock_bh(&x->lock); + xfrm_state_put(x); + return err; +} + +static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct net *net = sock_net(skb->sk); + struct xfrm_policy *xp; + struct xfrm_user_tmpl *ut; + int i; + struct nlattr *rt = attrs[XFRMA_TMPL]; + struct xfrm_mark mark; + + struct xfrm_user_acquire *ua = nlmsg_data(nlh); + struct xfrm_state *x = xfrm_state_alloc(net); + int err = -ENOMEM; + + if (!x) + goto nomem; + + xfrm_mark_get(attrs, &mark); + + err = verify_newpolicy_info(&ua->policy); + if (err) + goto bad_policy; + + /* build an XP */ + xp = xfrm_policy_construct(net, &ua->policy, attrs, &err); + if (!xp) + goto free_state; + + memcpy(&x->id, &ua->id, sizeof(ua->id)); + memcpy(&x->props.saddr, &ua->saddr, sizeof(ua->saddr)); + memcpy(&x->sel, &ua->sel, sizeof(ua->sel)); + xp->mark.m = x->mark.m = mark.m; + xp->mark.v = x->mark.v = mark.v; + ut = nla_data(rt); + /* extract the templates and for each call km_key */ + for (i = 0; i < xp->xfrm_nr; i++, ut++) { + struct xfrm_tmpl *t = &xp->xfrm_vec[i]; + memcpy(&x->id, &t->id, sizeof(x->id)); + x->props.mode = t->mode; + x->props.reqid = t->reqid; + x->props.family = ut->family; + t->aalgos = ua->aalgos; + t->ealgos = ua->ealgos; + t->calgos = ua->calgos; + err = km_query(x, t, xp); + + } + + kfree(x); + kfree(xp); + + return 0; + +bad_policy: + WARN(1, "BAD policy passed\n"); +free_state: + kfree(x); +nomem: + return err; +} + +#ifdef CONFIG_XFRM_MIGRATE +static int copy_from_user_migrate(struct xfrm_migrate *ma, + struct xfrm_kmaddress *k, + struct nlattr **attrs, int *num) +{ + struct nlattr *rt = attrs[XFRMA_MIGRATE]; + struct xfrm_user_migrate *um; + int i, num_migrate; + + if (k != NULL) { + struct xfrm_user_kmaddress *uk; + + uk = nla_data(attrs[XFRMA_KMADDRESS]); + memcpy(&k->local, &uk->local, sizeof(k->local)); + memcpy(&k->remote, &uk->remote, sizeof(k->remote)); + k->family = uk->family; + k->reserved = uk->reserved; + } + + um = nla_data(rt); + num_migrate = nla_len(rt) / sizeof(*um); + + if (num_migrate <= 0 || num_migrate > XFRM_MAX_DEPTH) + return -EINVAL; + + for (i = 0; i < num_migrate; i++, um++, ma++) { + memcpy(&ma->old_daddr, &um->old_daddr, sizeof(ma->old_daddr)); + memcpy(&ma->old_saddr, &um->old_saddr, sizeof(ma->old_saddr)); + memcpy(&ma->new_daddr, &um->new_daddr, sizeof(ma->new_daddr)); + memcpy(&ma->new_saddr, &um->new_saddr, sizeof(ma->new_saddr)); + + ma->proto = um->proto; + ma->mode = um->mode; + ma->reqid 
= um->reqid; + + ma->old_family = um->old_family; + ma->new_family = um->new_family; + } + + *num = i; + return 0; +} + +static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct xfrm_userpolicy_id *pi = nlmsg_data(nlh); + struct xfrm_migrate m[XFRM_MAX_DEPTH]; + struct xfrm_kmaddress km, *kmp; + u8 type; + int err; + int n = 0; + + if (attrs[XFRMA_MIGRATE] == NULL) + return -EINVAL; + + kmp = attrs[XFRMA_KMADDRESS] ? &km : NULL; + + err = copy_from_user_policy_type(&type, attrs); + if (err) + return err; + + err = copy_from_user_migrate((struct xfrm_migrate *)m, kmp, attrs, &n); + if (err) + return err; + + if (!n) + return 0; + + xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp); + + return 0; +} +#else +static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + return -ENOPROTOOPT; +} +#endif + +#ifdef CONFIG_XFRM_MIGRATE +static int copy_to_user_migrate(const struct xfrm_migrate *m, struct sk_buff *skb) +{ + struct xfrm_user_migrate um; + + memset(&um, 0, sizeof(um)); + um.proto = m->proto; + um.mode = m->mode; + um.reqid = m->reqid; + um.old_family = m->old_family; + memcpy(&um.old_daddr, &m->old_daddr, sizeof(um.old_daddr)); + memcpy(&um.old_saddr, &m->old_saddr, sizeof(um.old_saddr)); + um.new_family = m->new_family; + memcpy(&um.new_daddr, &m->new_daddr, sizeof(um.new_daddr)); + memcpy(&um.new_saddr, &m->new_saddr, sizeof(um.new_saddr)); + + return nla_put(skb, XFRMA_MIGRATE, sizeof(um), &um); +} + +static int copy_to_user_kmaddress(const struct xfrm_kmaddress *k, struct sk_buff *skb) +{ + struct xfrm_user_kmaddress uk; + + memset(&uk, 0, sizeof(uk)); + uk.family = k->family; + uk.reserved = k->reserved; + memcpy(&uk.local, &k->local, sizeof(uk.local)); + memcpy(&uk.remote, &k->remote, sizeof(uk.remote)); + + return nla_put(skb, XFRMA_KMADDRESS, sizeof(uk), &uk); +} + +static inline size_t xfrm_migrate_msgsize(int num_migrate, int with_kma) +{ + return NLMSG_ALIGN(sizeof(struct xfrm_userpolicy_id)) + + (with_kma ? 
nla_total_size(sizeof(struct xfrm_kmaddress)) : 0) + + nla_total_size(sizeof(struct xfrm_user_migrate) * num_migrate) + + userpolicy_type_attrsize(); +} + +static int build_migrate(struct sk_buff *skb, const struct xfrm_migrate *m, + int num_migrate, const struct xfrm_kmaddress *k, + const struct xfrm_selector *sel, u8 dir, u8 type) +{ + const struct xfrm_migrate *mp; + struct xfrm_userpolicy_id *pol_id; + struct nlmsghdr *nlh; + int i; + + nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_MIGRATE, sizeof(*pol_id), 0); + if (nlh == NULL) + return -EMSGSIZE; + + pol_id = nlmsg_data(nlh); + /* copy data from selector, dir, and type to the pol_id */ + memset(pol_id, 0, sizeof(*pol_id)); + memcpy(&pol_id->sel, sel, sizeof(pol_id->sel)); + pol_id->dir = dir; + + if (k != NULL && (copy_to_user_kmaddress(k, skb) < 0)) + goto nlmsg_failure; + + if (copy_to_user_policy_type(type, skb) < 0) + goto nlmsg_failure; + + for (i = 0, mp = m ; i < num_migrate; i++, mp++) { + if (copy_to_user_migrate(mp, skb) < 0) + goto nlmsg_failure; + } + + return nlmsg_end(skb, nlh); +nlmsg_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, + const struct xfrm_migrate *m, int num_migrate, + const struct xfrm_kmaddress *k) +{ + struct net *net = &init_net; + struct sk_buff *skb; + + skb = nlmsg_new(xfrm_migrate_msgsize(num_migrate, !!k), GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + /* build migrate */ + if (build_migrate(skb, m, num_migrate, k, sel, dir, type) < 0) + BUG(); + + return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_MIGRATE, GFP_ATOMIC); +} +#else +static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, + const struct xfrm_migrate *m, int num_migrate, + const struct xfrm_kmaddress *k) +{ + return -ENOPROTOOPT; +} +#endif + +#define XMSGSIZE(type) sizeof(struct type) + +static const int xfrm_msg_min[XFRM_NR_MSGTYPES] = { + [XFRM_MSG_NEWSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_info), + [XFRM_MSG_DELSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id), + [XFRM_MSG_GETSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id), + [XFRM_MSG_NEWPOLICY - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_info), + [XFRM_MSG_DELPOLICY - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id), + [XFRM_MSG_GETPOLICY - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id), + [XFRM_MSG_ALLOCSPI - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userspi_info), + [XFRM_MSG_ACQUIRE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_acquire), + [XFRM_MSG_EXPIRE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_expire), + [XFRM_MSG_UPDPOLICY - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_info), + [XFRM_MSG_UPDSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_info), + [XFRM_MSG_POLEXPIRE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_polexpire), + [XFRM_MSG_FLUSHSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_flush), + [XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = 0, + [XFRM_MSG_NEWAE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id), + [XFRM_MSG_GETAE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id), + [XFRM_MSG_REPORT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_report), + [XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id), + [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = sizeof(u32), + [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = sizeof(u32), +}; + +#undef XMSGSIZE + +static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { + [XFRMA_SA] = { .len = sizeof(struct xfrm_usersa_info)}, + [XFRMA_POLICY] = { .len = sizeof(struct xfrm_userpolicy_info)}, + [XFRMA_LASTUSED] = { .type = NLA_U64}, + [XFRMA_ALG_AUTH_TRUNC] = { .len = sizeof(struct 
xfrm_algo_auth)}, + [XFRMA_ALG_AEAD] = { .len = sizeof(struct xfrm_algo_aead) }, + [XFRMA_ALG_AUTH] = { .len = sizeof(struct xfrm_algo) }, + [XFRMA_ALG_CRYPT] = { .len = sizeof(struct xfrm_algo) }, + [XFRMA_ALG_COMP] = { .len = sizeof(struct xfrm_algo) }, + [XFRMA_ENCAP] = { .len = sizeof(struct xfrm_encap_tmpl) }, + [XFRMA_TMPL] = { .len = sizeof(struct xfrm_user_tmpl) }, + [XFRMA_SEC_CTX] = { .len = sizeof(struct xfrm_sec_ctx) }, + [XFRMA_LTIME_VAL] = { .len = sizeof(struct xfrm_lifetime_cur) }, + [XFRMA_REPLAY_VAL] = { .len = sizeof(struct xfrm_replay_state) }, + [XFRMA_REPLAY_THRESH] = { .type = NLA_U32 }, + [XFRMA_ETIMER_THRESH] = { .type = NLA_U32 }, + [XFRMA_SRCADDR] = { .len = sizeof(xfrm_address_t) }, + [XFRMA_COADDR] = { .len = sizeof(xfrm_address_t) }, + [XFRMA_POLICY_TYPE] = { .len = sizeof(struct xfrm_userpolicy_type)}, + [XFRMA_MIGRATE] = { .len = sizeof(struct xfrm_user_migrate) }, + [XFRMA_KMADDRESS] = { .len = sizeof(struct xfrm_user_kmaddress) }, + [XFRMA_MARK] = { .len = sizeof(struct xfrm_mark) }, + [XFRMA_TFCPAD] = { .type = NLA_U32 }, + [XFRMA_REPLAY_ESN_VAL] = { .len = sizeof(struct xfrm_replay_state_esn) }, +}; + +static struct xfrm_link { + int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **); + int (*dump)(struct sk_buff *, struct netlink_callback *); + int (*done)(struct netlink_callback *); +} xfrm_dispatch[XFRM_NR_MSGTYPES] = { + [XFRM_MSG_NEWSA - XFRM_MSG_BASE] = { .doit = xfrm_add_sa }, + [XFRM_MSG_DELSA - XFRM_MSG_BASE] = { .doit = xfrm_del_sa }, + [XFRM_MSG_GETSA - XFRM_MSG_BASE] = { .doit = xfrm_get_sa, + .dump = xfrm_dump_sa, + .done = xfrm_dump_sa_done }, + [XFRM_MSG_NEWPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_add_policy }, + [XFRM_MSG_DELPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_get_policy }, + [XFRM_MSG_GETPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_get_policy, + .dump = xfrm_dump_policy, + .done = xfrm_dump_policy_done }, + [XFRM_MSG_ALLOCSPI - XFRM_MSG_BASE] = { .doit = xfrm_alloc_userspi }, + [XFRM_MSG_ACQUIRE - XFRM_MSG_BASE] = { .doit = xfrm_add_acquire }, + [XFRM_MSG_EXPIRE - XFRM_MSG_BASE] = { .doit = xfrm_add_sa_expire }, + [XFRM_MSG_UPDPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_add_policy }, + [XFRM_MSG_UPDSA - XFRM_MSG_BASE] = { .doit = xfrm_add_sa }, + [XFRM_MSG_POLEXPIRE - XFRM_MSG_BASE] = { .doit = xfrm_add_pol_expire}, + [XFRM_MSG_FLUSHSA - XFRM_MSG_BASE] = { .doit = xfrm_flush_sa }, + [XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_flush_policy }, + [XFRM_MSG_NEWAE - XFRM_MSG_BASE] = { .doit = xfrm_new_ae }, + [XFRM_MSG_GETAE - XFRM_MSG_BASE] = { .doit = xfrm_get_ae }, + [XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = { .doit = xfrm_do_migrate }, + [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_sadinfo }, + [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_spdinfo }, +}; + +static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *attrs[XFRMA_MAX+1]; + struct xfrm_link *link; + int type, err; + + type = nlh->nlmsg_type; + if (type > XFRM_MSG_MAX) + return -EINVAL; + + type -= XFRM_MSG_BASE; + link = &xfrm_dispatch[type]; + + /* All operations require privileges, even GET */ + if (security_netlink_recv(skb, CAP_NET_ADMIN)) + return -EPERM; + + if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) || + type == (XFRM_MSG_GETPOLICY - XFRM_MSG_BASE)) && + (nlh->nlmsg_flags & NLM_F_DUMP)) { + if (link->dump == NULL) + return -EINVAL; + + return netlink_dump_start(net->xfrm.nlsk, skb, nlh, link->dump, link->done); + } + + err = nlmsg_parse(nlh, 
xfrm_msg_min[type], attrs, XFRMA_MAX, + xfrma_policy); + if (err < 0) + return err; + + if (link->doit == NULL) + return -EINVAL; + + return link->doit(skb, nlh, attrs); +} + +static void xfrm_netlink_rcv(struct sk_buff *skb) +{ + mutex_lock(&xfrm_cfg_mutex); + netlink_rcv_skb(skb, &xfrm_user_rcv_msg); + mutex_unlock(&xfrm_cfg_mutex); +} + +static inline size_t xfrm_expire_msgsize(void) +{ + return NLMSG_ALIGN(sizeof(struct xfrm_user_expire)) + + nla_total_size(sizeof(struct xfrm_mark)); +} + +static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c) +{ + struct xfrm_user_expire *ue; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, c->pid, 0, XFRM_MSG_EXPIRE, sizeof(*ue), 0); + if (nlh == NULL) + return -EMSGSIZE; + + ue = nlmsg_data(nlh); + copy_to_user_state(x, &ue->state); + ue->hard = (c->data.hard != 0) ? 1 : 0; + + if (xfrm_mark_put(skb, &x->mark)) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + return -EMSGSIZE; +} + +static int xfrm_exp_state_notify(struct xfrm_state *x, const struct km_event *c) +{ + struct net *net = xs_net(x); + struct sk_buff *skb; + + skb = nlmsg_new(xfrm_expire_msgsize(), GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + if (build_expire(skb, x, c) < 0) { + kfree_skb(skb); + return -EMSGSIZE; + } + + return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_EXPIRE, GFP_ATOMIC); +} + +static int xfrm_aevent_state_notify(struct xfrm_state *x, const struct km_event *c) +{ + struct net *net = xs_net(x); + struct sk_buff *skb; + + skb = nlmsg_new(xfrm_aevent_msgsize(x), GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + if (build_aevent(skb, x, c) < 0) + BUG(); + + return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_AEVENTS, GFP_ATOMIC); +} + +static int xfrm_notify_sa_flush(const struct km_event *c) +{ + struct net *net = c->net; + struct xfrm_usersa_flush *p; + struct nlmsghdr *nlh; + struct sk_buff *skb; + int len = NLMSG_ALIGN(sizeof(struct xfrm_usersa_flush)); + + skb = nlmsg_new(len, GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + nlh = nlmsg_put(skb, c->pid, c->seq, XFRM_MSG_FLUSHSA, sizeof(*p), 0); + if (nlh == NULL) { + kfree_skb(skb); + return -EMSGSIZE; + } + + p = nlmsg_data(nlh); + p->proto = c->data.proto; + + nlmsg_end(skb, nlh); + + return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_SA, GFP_ATOMIC); +} + +static inline size_t xfrm_sa_len(struct xfrm_state *x) +{ + size_t l = 0; + if (x->aead) + l += nla_total_size(aead_len(x->aead)); + if (x->aalg) { + l += nla_total_size(sizeof(struct xfrm_algo) + + (x->aalg->alg_key_len + 7) / 8); + l += nla_total_size(xfrm_alg_auth_len(x->aalg)); + } + if (x->ealg) + l += nla_total_size(xfrm_alg_len(x->ealg)); + if (x->calg) + l += nla_total_size(sizeof(*x->calg)); + if (x->encap) + l += nla_total_size(sizeof(*x->encap)); + if (x->tfcpad) + l += nla_total_size(sizeof(x->tfcpad)); + if (x->replay_esn) + l += nla_total_size(xfrm_replay_state_esn_len(x->replay_esn)); + if (x->security) + l += nla_total_size(sizeof(struct xfrm_user_sec_ctx) + + x->security->ctx_len); + if (x->coaddr) + l += nla_total_size(sizeof(*x->coaddr)); + + /* Must count x->lastused as it may become non-zero behind our back. 
*/ + l += nla_total_size(sizeof(u64)); + + return l; +} + +static int xfrm_notify_sa(struct xfrm_state *x, const struct km_event *c) +{ + struct net *net = xs_net(x); + struct xfrm_usersa_info *p; + struct xfrm_usersa_id *id; + struct nlmsghdr *nlh; + struct sk_buff *skb; + int len = xfrm_sa_len(x); + int headlen; + + headlen = sizeof(*p); + if (c->event == XFRM_MSG_DELSA) { + len += nla_total_size(headlen); + headlen = sizeof(*id); + len += nla_total_size(sizeof(struct xfrm_mark)); + } + len += NLMSG_ALIGN(headlen); + + skb = nlmsg_new(len, GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + nlh = nlmsg_put(skb, c->pid, c->seq, c->event, headlen, 0); + if (nlh == NULL) + goto nla_put_failure; + + p = nlmsg_data(nlh); + if (c->event == XFRM_MSG_DELSA) { + struct nlattr *attr; + + id = nlmsg_data(nlh); + memcpy(&id->daddr, &x->id.daddr, sizeof(id->daddr)); + id->spi = x->id.spi; + id->family = x->props.family; + id->proto = x->id.proto; + + attr = nla_reserve(skb, XFRMA_SA, sizeof(*p)); + if (attr == NULL) + goto nla_put_failure; + + p = nla_data(attr); + } + + if (copy_to_user_state_extra(x, p, skb)) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + + return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_SA, GFP_ATOMIC); + +nla_put_failure: + /* Somebody screwed up with xfrm_sa_len! */ + WARN_ON(1); + kfree_skb(skb); + return -1; +} + +static int xfrm_send_state_notify(struct xfrm_state *x, const struct km_event *c) +{ + + switch (c->event) { + case XFRM_MSG_EXPIRE: + return xfrm_exp_state_notify(x, c); + case XFRM_MSG_NEWAE: + return xfrm_aevent_state_notify(x, c); + case XFRM_MSG_DELSA: + case XFRM_MSG_UPDSA: + case XFRM_MSG_NEWSA: + return xfrm_notify_sa(x, c); + case XFRM_MSG_FLUSHSA: + return xfrm_notify_sa_flush(c); + default: + printk(KERN_NOTICE "xfrm_user: Unknown SA event %d\n", + c->event); + break; + } + + return 0; + +} + +static inline size_t xfrm_acquire_msgsize(struct xfrm_state *x, + struct xfrm_policy *xp) +{ + return NLMSG_ALIGN(sizeof(struct xfrm_user_acquire)) + + nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr) + + nla_total_size(sizeof(struct xfrm_mark)) + + nla_total_size(xfrm_user_sec_ctx_size(x->security)) + + userpolicy_type_attrsize(); +} + +static int build_acquire(struct sk_buff *skb, struct xfrm_state *x, + struct xfrm_tmpl *xt, struct xfrm_policy *xp, + int dir) +{ + struct xfrm_user_acquire *ua; + struct nlmsghdr *nlh; + __u32 seq = xfrm_get_acqseq(); + + nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_ACQUIRE, sizeof(*ua), 0); + if (nlh == NULL) + return -EMSGSIZE; + + ua = nlmsg_data(nlh); + memcpy(&ua->id, &x->id, sizeof(ua->id)); + memcpy(&ua->saddr, &x->props.saddr, sizeof(ua->saddr)); + memcpy(&ua->sel, &x->sel, sizeof(ua->sel)); + copy_to_user_policy(xp, &ua->policy, dir); + ua->aalgos = xt->aalgos; + ua->ealgos = xt->ealgos; + ua->calgos = xt->calgos; + ua->seq = x->km.seq = seq; + + if (copy_to_user_tmpl(xp, skb) < 0) + goto nlmsg_failure; + if (copy_to_user_state_sec_ctx(x, skb)) + goto nlmsg_failure; + if (copy_to_user_policy_type(xp->type, skb) < 0) + goto nlmsg_failure; + if (xfrm_mark_put(skb, &xp->mark)) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: +nlmsg_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt, + struct xfrm_policy *xp, int dir) +{ + struct net *net = xs_net(x); + struct sk_buff *skb; + + skb = nlmsg_new(xfrm_acquire_msgsize(x, xp), GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + if (build_acquire(skb, x, xt, 
xp, dir) < 0) + BUG(); + + return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_ACQUIRE, GFP_ATOMIC); +} + +/* User gives us xfrm_user_policy_info followed by an array of 0 + * or more templates. + */ +static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt, + u8 *data, int len, int *dir) +{ + struct net *net = sock_net(sk); + struct xfrm_userpolicy_info *p = (struct xfrm_userpolicy_info *)data; + struct xfrm_user_tmpl *ut = (struct xfrm_user_tmpl *) (p + 1); + struct xfrm_policy *xp; + int nr; + + switch (sk->sk_family) { + case AF_INET: + if (opt != IP_XFRM_POLICY) { + *dir = -EOPNOTSUPP; + return NULL; + } + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + if (opt != IPV6_XFRM_POLICY) { + *dir = -EOPNOTSUPP; + return NULL; + } + break; +#endif + default: + *dir = -EINVAL; + return NULL; + } + + *dir = -EINVAL; + + if (len < sizeof(*p) || + verify_newpolicy_info(p)) + return NULL; + + nr = ((len - sizeof(*p)) / sizeof(*ut)); + if (validate_tmpl(nr, ut, p->sel.family)) + return NULL; + + if (p->dir > XFRM_POLICY_OUT) + return NULL; + + xp = xfrm_policy_alloc(net, GFP_ATOMIC); + if (xp == NULL) { + *dir = -ENOBUFS; + return NULL; + } + + copy_from_user_policy(xp, p); + xp->type = XFRM_POLICY_TYPE_MAIN; + copy_templates(xp, ut, nr); + + *dir = p->dir; + + return xp; +} + +static inline size_t xfrm_polexpire_msgsize(struct xfrm_policy *xp) +{ + return NLMSG_ALIGN(sizeof(struct xfrm_user_polexpire)) + + nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr) + + nla_total_size(xfrm_user_sec_ctx_size(xp->security)) + + nla_total_size(sizeof(struct xfrm_mark)) + + userpolicy_type_attrsize(); +} + +static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp, + int dir, const struct km_event *c) +{ + struct xfrm_user_polexpire *upe; + struct nlmsghdr *nlh; + int hard = c->data.hard; + + nlh = nlmsg_put(skb, c->pid, 0, XFRM_MSG_POLEXPIRE, sizeof(*upe), 0); + if (nlh == NULL) + return -EMSGSIZE; + + upe = nlmsg_data(nlh); + copy_to_user_policy(xp, &upe->pol, dir); + if (copy_to_user_tmpl(xp, skb) < 0) + goto nlmsg_failure; + if (copy_to_user_sec_ctx(xp, skb)) + goto nlmsg_failure; + if (copy_to_user_policy_type(xp->type, skb) < 0) + goto nlmsg_failure; + if (xfrm_mark_put(skb, &xp->mark)) + goto nla_put_failure; + upe->hard = !!hard; + + return nlmsg_end(skb, nlh); + +nla_put_failure: +nlmsg_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c) +{ + struct net *net = xp_net(xp); + struct sk_buff *skb; + + skb = nlmsg_new(xfrm_polexpire_msgsize(xp), GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + if (build_polexpire(skb, xp, dir, c) < 0) + BUG(); + + return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_EXPIRE, GFP_ATOMIC); +} + +static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, const struct km_event *c) +{ + struct net *net = xp_net(xp); + struct xfrm_userpolicy_info *p; + struct xfrm_userpolicy_id *id; + struct nlmsghdr *nlh; + struct sk_buff *skb; + int len = nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr); + int headlen; + + headlen = sizeof(*p); + if (c->event == XFRM_MSG_DELPOLICY) { + len += nla_total_size(headlen); + headlen = sizeof(*id); + } + len += userpolicy_type_attrsize(); + len += nla_total_size(sizeof(struct xfrm_mark)); + len += NLMSG_ALIGN(headlen); + + skb = nlmsg_new(len, GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + nlh = nlmsg_put(skb, c->pid, c->seq, c->event, headlen, 
0); + if (nlh == NULL) + goto nlmsg_failure; + + p = nlmsg_data(nlh); + if (c->event == XFRM_MSG_DELPOLICY) { + struct nlattr *attr; + + id = nlmsg_data(nlh); + memset(id, 0, sizeof(*id)); + id->dir = dir; + if (c->data.byid) + id->index = xp->index; + else + memcpy(&id->sel, &xp->selector, sizeof(id->sel)); + + attr = nla_reserve(skb, XFRMA_POLICY, sizeof(*p)); + if (attr == NULL) + goto nlmsg_failure; + + p = nla_data(attr); + } + + copy_to_user_policy(xp, p, dir); + if (copy_to_user_tmpl(xp, skb) < 0) + goto nlmsg_failure; + if (copy_to_user_policy_type(xp->type, skb) < 0) + goto nlmsg_failure; + + if (xfrm_mark_put(skb, &xp->mark)) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + + return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_POLICY, GFP_ATOMIC); + +nla_put_failure: +nlmsg_failure: + kfree_skb(skb); + return -1; +} + +static int xfrm_notify_policy_flush(const struct km_event *c) +{ + struct net *net = c->net; + struct nlmsghdr *nlh; + struct sk_buff *skb; + + skb = nlmsg_new(userpolicy_type_attrsize(), GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + nlh = nlmsg_put(skb, c->pid, c->seq, XFRM_MSG_FLUSHPOLICY, 0, 0); + if (nlh == NULL) + goto nlmsg_failure; + if (copy_to_user_policy_type(c->data.type, skb) < 0) + goto nlmsg_failure; + + nlmsg_end(skb, nlh); + + return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_POLICY, GFP_ATOMIC); + +nlmsg_failure: + kfree_skb(skb); + return -1; +} + +static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c) +{ + + switch (c->event) { + case XFRM_MSG_NEWPOLICY: + case XFRM_MSG_UPDPOLICY: + case XFRM_MSG_DELPOLICY: + return xfrm_notify_policy(xp, dir, c); + case XFRM_MSG_FLUSHPOLICY: + return xfrm_notify_policy_flush(c); + case XFRM_MSG_POLEXPIRE: + return xfrm_exp_policy_notify(xp, dir, c); + default: + printk(KERN_NOTICE "xfrm_user: Unknown Policy event %d\n", + c->event); + } + + return 0; + +} + +static inline size_t xfrm_report_msgsize(void) +{ + return NLMSG_ALIGN(sizeof(struct xfrm_user_report)); +} + +static int build_report(struct sk_buff *skb, u8 proto, + struct xfrm_selector *sel, xfrm_address_t *addr) +{ + struct xfrm_user_report *ur; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_REPORT, sizeof(*ur), 0); + if (nlh == NULL) + return -EMSGSIZE; + + ur = nlmsg_data(nlh); + ur->proto = proto; + memcpy(&ur->sel, sel, sizeof(ur->sel)); + + if (addr) + NLA_PUT(skb, XFRMA_COADDR, sizeof(*addr), addr); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int xfrm_send_report(struct net *net, u8 proto, + struct xfrm_selector *sel, xfrm_address_t *addr) +{ + struct sk_buff *skb; + + skb = nlmsg_new(xfrm_report_msgsize(), GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + if (build_report(skb, proto, sel, addr) < 0) + BUG(); + + return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_REPORT, GFP_ATOMIC); +} + +static inline size_t xfrm_mapping_msgsize(void) +{ + return NLMSG_ALIGN(sizeof(struct xfrm_user_mapping)); +} + +static int build_mapping(struct sk_buff *skb, struct xfrm_state *x, + xfrm_address_t *new_saddr, __be16 new_sport) +{ + struct xfrm_user_mapping *um; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_MAPPING, sizeof(*um), 0); + if (nlh == NULL) + return -EMSGSIZE; + + um = nlmsg_data(nlh); + + memcpy(&um->id.daddr, &x->id.daddr, sizeof(um->id.daddr)); + um->id.spi = x->id.spi; + um->id.family = x->props.family; + um->id.proto = x->id.proto; + memcpy(&um->new_saddr, new_saddr, 
sizeof(um->new_saddr)); + memcpy(&um->old_saddr, &x->props.saddr, sizeof(um->old_saddr)); + um->new_sport = new_sport; + um->old_sport = x->encap->encap_sport; + um->reqid = x->props.reqid; + + return nlmsg_end(skb, nlh); +} + +static int xfrm_send_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, + __be16 sport) +{ + struct net *net = xs_net(x); + struct sk_buff *skb; + + if (x->id.proto != IPPROTO_ESP) + return -EINVAL; + + if (!x->encap) + return -EINVAL; + + skb = nlmsg_new(xfrm_mapping_msgsize(), GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + if (build_mapping(skb, x, ipaddr, sport) < 0) + BUG(); + + return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_MAPPING, GFP_ATOMIC); +} + +static struct xfrm_mgr netlink_mgr = { + .id = "netlink", + .notify = xfrm_send_state_notify, + .acquire = xfrm_send_acquire, + .compile_policy = xfrm_compile_policy, + .notify_policy = xfrm_send_policy_notify, + .report = xfrm_send_report, + .migrate = xfrm_send_migrate, + .new_mapping = xfrm_send_mapping, +}; + +static int __net_init xfrm_user_net_init(struct net *net) +{ + struct sock *nlsk; + + nlsk = netlink_kernel_create(net, NETLINK_XFRM, XFRMNLGRP_MAX, + xfrm_netlink_rcv, NULL, THIS_MODULE); + if (nlsk == NULL) + return -ENOMEM; + net->xfrm.nlsk_stash = nlsk; /* Don't set to NULL */ + rcu_assign_pointer(net->xfrm.nlsk, nlsk); + return 0; +} + +static void __net_exit xfrm_user_net_exit(struct list_head *net_exit_list) +{ + struct net *net; + list_for_each_entry(net, net_exit_list, exit_list) + rcu_assign_pointer(net->xfrm.nlsk, NULL); + synchronize_net(); + list_for_each_entry(net, net_exit_list, exit_list) + netlink_kernel_release(net->xfrm.nlsk_stash); +} + +static struct pernet_operations xfrm_user_net_ops = { + .init = xfrm_user_net_init, + .exit_batch = xfrm_user_net_exit, +}; + +static int __init xfrm_user_init(void) +{ + int rv; + + printk(KERN_INFO "Initializing XFRM netlink socket\n"); + + rv = register_pernet_subsys(&xfrm_user_net_ops); + if (rv < 0) + return rv; + rv = xfrm_register_km(&netlink_mgr); + if (rv < 0) + unregister_pernet_subsys(&xfrm_user_net_ops); + return rv; +} + +static void __exit xfrm_user_exit(void) +{ + xfrm_unregister_km(&netlink_mgr); + unregister_pernet_subsys(&xfrm_user_net_ops); +} + +module_init(xfrm_user_init); +module_exit(xfrm_user_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM); + -- cgit v1.2.3
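For reference, below is a minimal userspace sketch of how the netlink interface registered by xfrm_user_init() in this file can be exercised. It opens a NETLINK_XFRM socket and sends a bare XFRM_MSG_FLUSHSA request, which xfrm_user_rcv_msg() dispatches to xfrm_flush_sa() above. This example is illustrative only and not part of the commit; it assumes the standard linux/xfrm.h and linux/pfkeyv2.h UAPI headers, the caller needs CAP_NET_ADMIN (xfrm_user_rcv_msg() rejects unprivileged senders), and error handling is deliberately minimal.

/*
 * Illustrative userspace sketch (not part of this patch): flush all
 * IPsec SAs by sending XFRM_MSG_FLUSHSA on a NETLINK_XFRM socket.
 * Must run with CAP_NET_ADMIN.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/xfrm.h>
#include <linux/pfkeyv2.h>	/* IPSEC_PROTO_ANY */

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct xfrm_usersa_flush flush;
	} req;
	struct sockaddr_nl addr = { .nl_family = AF_NETLINK };
	int fd;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_XFRM);
	if (fd < 0) {
		perror("socket(NETLINK_XFRM)");
		return 1;
	}

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(req.flush));
	req.nlh.nlmsg_type = XFRM_MSG_FLUSHSA;	/* dispatched to xfrm_flush_sa() */
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.flush.proto = IPSEC_PROTO_ANY;	/* match AH, ESP and IPCOMP states */

	/* Send to the kernel (nl_pid 0); no reply is expected without NLM_F_ACK. */
	if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
		   (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("sendto");
		close(fd);
		return 1;
	}

	close(fd);
	return 0;
}

Adding NLM_F_ACK to nlmsg_flags would make the kernel return an NLMSG_ERROR message carrying the xfrm_state_flush() result, which is roughly how iproute2's "ip xfrm state flush" operates; processes subscribed to XFRMNLGRP_SA additionally receive the flush notification emitted by xfrm_notify_sa_flush() above.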